From e5f08bae8c8d10cdf1b7c134f119b4eec00ce11a Mon Sep 17 00:00:00 2001 From: Bernhard Manfred Gruber Date: Wed, 29 Jan 2025 22:52:12 +0100 Subject: [PATCH] PTX: Update generated files with Blackwell instructions (#3568) * ptx: Update existing instructions * ptx: Add new instructions * Fix returning error out values See: - https://gitlab-master.nvidia.com/CCCL/libcuda-ptx/-/merge_requests/74 - https://gitlab-master.nvidia.com/CCCL/libcuda-ptx/-/merge_requests/73 * ptx: Fix out var declaration See https://gitlab-master.nvidia.com/CCCL/libcuda-ptx/-/merge_requests/75 * mbarrier.{test,try}_wait: Fix test. Wrong files were included. * docs: Fix special registers include * Allow non-included documentation pages * Workaround NVRTC Co-authored-by: Allard Hendriksen --- .../generated/barrier_cluster_aligned.rst | 63 + .../generated/clusterlaunchcontrol.rst | 68 + .../instructions/generated/cp_async_bulk.rst | 38 +- .../generated/cp_async_bulk_multicast.rst | 2 +- .../generated/cp_async_bulk_tensor.rst | 280 +- .../cp_async_bulk_tensor_gather_scatter.rst | 124 + .../cp_async_bulk_tensor_multicast.rst | 200 +- .../generated/cp_async_mbarrier_arrive.rst | 11 + .../cp_async_mbarrier_arrive_noinc.rst | 11 + .../generated/cp_reduce_async_bulk.rst | 6 +- .../ptx/instructions/generated/elect_sync.rst | 11 + .../ptx/instructions/generated/fence.rst | 170 +- .../generated/fence_proxy_async.rst | 6 +- ...ence_proxy_async_generic_sync_restrict.rst | 30 + .../generated/fence_sync_restrict.rst | 30 + .../{special_registers.rst => get_sreg.rst} | 99 +- .../ptx/instructions/generated/getctarank.rst | 2 +- .../ptx/instructions/generated/mapa.rst | 14 + .../generated/mbarrier_arrive.rst | 105 +- .../generated/mbarrier_arrive_expect_tx.rst | 54 +- .../generated/mbarrier_test_wait.rst | 34 +- .../generated/mbarrier_test_wait_parity.rst | 34 +- .../generated/mbarrier_try_wait.rst | 70 +- .../generated/mbarrier_try_wait_parity.rst | 70 +- .../generated/multimem_ld_reduce.rst | 2396 ++++++ .../instructions/generated/multimem_red.rst | 2306 ++++++ .../instructions/generated/multimem_st.rst | 250 + .../ptx/instructions/generated/red_async.rst | 32 +- .../ptx/instructions/generated/st_async.rst | 10 +- .../ptx/instructions/generated/st_bulk.rst | 13 + .../instructions/generated/tcgen05_alloc.rst | 70 + .../instructions/generated/tcgen05_commit.rst | 48 + .../ptx/instructions/generated/tcgen05_cp.rst | 434 ++ .../instructions/generated/tcgen05_fence.rst | 18 + .../ptx/instructions/generated/tcgen05_ld.rst | 758 ++ .../instructions/generated/tcgen05_mma.rst | 2378 ++++++ .../instructions/generated/tcgen05_mma_ws.rst | 4482 ++++++++++++ .../instructions/generated/tcgen05_shift.rst | 24 + .../ptx/instructions/generated/tcgen05_st.rst | 758 ++ .../instructions/generated/tcgen05_wait.rst | 18 + .../generated/tensormap_replace.rst | 114 +- .../ptx/instructions/special_registers.rst | 2 +- docs/repo.toml | 2 +- .../instructions/generated/barrier_cluster.h | 81 +- .../generated/barrier_cluster_aligned.h | 130 + .../generated/clusterlaunchcontrol.h | 240 + .../instructions/generated/cp_async_bulk.h | 155 +- .../generated/cp_async_bulk_commit_group.h | 15 +- .../generated/cp_async_bulk_multicast.h | 38 +- .../generated/cp_async_bulk_tensor.h | 868 ++- .../cp_async_bulk_tensor_gather_scatter.h | 288 + .../cp_async_bulk_tensor_multicast.h | 530 +- .../generated/cp_async_bulk_wait_group.h | 30 +- .../generated/cp_async_mbarrier_arrive.h | 26 + .../cp_async_mbarrier_arrive_noinc.h | 26 + .../generated/cp_reduce_async_bulk.h | 983 +-- 
.../generated/cp_reduce_async_bulk_bf16.h | 78 +- .../generated/cp_reduce_async_bulk_f16.h | 78 +- .../generated/cp_reduce_async_bulk_tensor.h | 788 +- .../__ptx/instructions/generated/elect_sync.h | 36 + .../cuda/__ptx/instructions/generated/fence.h | 224 +- .../generated/fence_mbarrier_init.h | 19 +- .../generated/fence_proxy_alias.h | 15 +- .../generated/fence_proxy_async.h | 47 +- .../fence_proxy_async_generic_sync_restrict.h | 62 + .../generated/fence_proxy_tensormap_generic.h | 96 +- .../generated/fence_sync_restrict.h | 62 + .../__ptx/instructions/generated/get_sreg.h | 629 +- .../__ptx/instructions/generated/getctarank.h | 24 +- .../cuda/__ptx/instructions/generated/mapa.h | 33 + .../instructions/generated/mbarrier_arrive.h | 345 +- .../generated/mbarrier_arrive_expect_tx.h | 154 +- .../generated/mbarrier_arrive_no_complete.h | 23 +- .../generated/mbarrier_expect_tx.h | 94 + .../instructions/generated/mbarrier_init.h | 15 +- .../generated/mbarrier_test_wait.h | 135 +- .../generated/mbarrier_test_wait_parity.h | 134 +- .../generated/mbarrier_try_wait.h | 281 +- .../generated/mbarrier_try_wait_parity.h | 281 +- .../generated/multimem_ld_reduce.h | 2148 ++++++ .../instructions/generated/multimem_red.h | 1272 ++++ .../instructions/generated/multimem_st.h | 186 + .../__ptx/instructions/generated/red_async.h | 336 +- .../__ptx/instructions/generated/st_async.h | 120 +- .../__ptx/instructions/generated/st_bulk.h | 31 + .../instructions/generated/tcgen05_alloc.h | 105 + .../instructions/generated/tcgen05_commit.h | 81 + .../__ptx/instructions/generated/tcgen05_cp.h | 612 ++ .../instructions/generated/tcgen05_fence.h | 44 + .../__ptx/instructions/generated/tcgen05_ld.h | 4446 ++++++++++++ .../instructions/generated/tcgen05_mma.h | 3842 ++++++++++ .../instructions/generated/tcgen05_mma_ws.h | 6438 +++++++++++++++++ .../instructions/generated/tcgen05_shift.h | 36 + .../__ptx/instructions/generated/tcgen05_st.h | 4554 ++++++++++++ .../instructions/generated/tcgen05_wait.h | 44 + .../generated/tensormap_cp_fenceproxy.h | 68 +- .../generated/tensormap_replace.h | 658 +- .../ptx/generated/barrier_cluster_aligned.h | 61 + .../cuda/ptx/generated/clusterlaunchcontrol.h | 84 + .../cuda/ptx/generated/cp_async_bulk.h | 29 +- .../ptx/generated/cp_async_bulk_multicast.h | 28 +- .../cuda/ptx/generated/cp_async_bulk_tensor.h | 325 +- .../cp_async_bulk_tensor_gather_scatter.h | 180 + .../cp_async_bulk_tensor_multicast.h | 405 +- .../ptx/generated/cp_async_mbarrier_arrive.h | 26 + .../cp_async_mbarrier_arrive_noinc.h | 26 + .../cuda/ptx/generated/elect_sync.h | 26 + .../libcudacxx/cuda/ptx/generated/fence.h | 64 +- .../fence_proxy_async_generic_sync_restrict.h | 38 + .../cuda/ptx/generated/fence_sync_restrict.h | 38 + .../test/libcudacxx/cuda/ptx/generated/mapa.h | 27 + .../cuda/ptx/generated/mbarrier_arrive.h | 56 + .../ptx/generated/mbarrier_arrive_expect_tx.h | 29 + .../cuda/ptx/generated/mbarrier_expect_tx.h | 50 + .../cuda/ptx/generated/mbarrier_test_wait.h | 55 + .../ptx/generated/mbarrier_test_wait_parity.h | 55 + .../cuda/ptx/generated/mbarrier_try_wait.h | 31 + .../ptx/generated/mbarrier_try_wait_parity.h | 32 + .../cuda/ptx/generated/mbarrier_wait.h | 24 - .../cuda/ptx/generated/mbarrier_wait_parity.h | 24 - .../cuda/ptx/generated/multimem_ld_reduce.h | 1020 +++ .../cuda/ptx/generated/multimem_red.h | 840 +++ .../cuda/ptx/generated/multimem_st.h | 110 + .../libcudacxx/cuda/ptx/generated/st_bulk.h | 26 + .../cuda/ptx/generated/tcgen05_alloc.h | 81 + .../cuda/ptx/generated/tcgen05_commit.h | 62 + 
.../cuda/ptx/generated/tcgen05_cp.h | 396 + .../cuda/ptx/generated/tcgen05_fence.h | 44 + .../cuda/ptx/generated/tcgen05_ld.h | 1012 +++ .../cuda/ptx/generated/tcgen05_mma.h | 2928 ++++++++ .../cuda/ptx/generated/tcgen05_mma_ws.h | 3570 +++++++++ .../cuda/ptx/generated/tcgen05_shift.h | 39 + .../cuda/ptx/generated/tcgen05_st.h | 1012 +++ .../cuda/ptx/generated/tcgen05_wait.h | 40 + .../cuda/ptx/generated/tensormap_replace.h | 390 +- .../libcudacxx/cuda/ptx/nvrtc_workaround.h | 34 + .../ptx/ptx.barrier.cluster.compile.pass.cpp | 2 + ...p.async.bulk.commit_group.compile.pass.cpp | 2 + .../ptx/ptx.cp.async.bulk.compile.pass.cpp | 2 + ...x.cp.async.bulk.multicast.compile.pass.cpp | 2 + .../ptx.cp.async.bulk.tensor.compile.pass.cpp | 2 + ...ync.bulk.tensor.multicast.compile.pass.cpp | 2 + ....cp.async.bulk.wait_group.compile.pass.cpp | 2 + .../ptx.cp.reduce.async.bulk.compile.pass.cpp | 2 + ....reduce.async.bulk.tensor.compile.pass.cpp | 2 + .../cuda/ptx/ptx.fence.compile.pass.cpp | 2 + .../cuda/ptx/ptx.get_sreg.compile.pass.cpp | 2 + .../cuda/ptx/ptx.getctarank.compile.pass.cpp | 2 + .../ptx/ptx.mbarrier.arrive.compile.pass.cpp | 2 + .../ptx/ptx.mbarrier.init.compile.pass.cpp | 2 + .../ptx/ptx.mbarrier.wait.compile.pass.cpp | 6 +- .../cuda/ptx/ptx.red.async.compile.pass.cpp | 2 + .../cuda/ptx/ptx.st.async.compile.pass.cpp | 2 + ...x.tensormap.cp_fenceproxy.compile.pass.cpp | 2 + .../ptx.tensormap.replace.compile.pass.cpp | 2 + 155 files changed, 58115 insertions(+), 2973 deletions(-) create mode 100644 docs/libcudacxx/ptx/instructions/generated/barrier_cluster_aligned.rst create mode 100644 docs/libcudacxx/ptx/instructions/generated/clusterlaunchcontrol.rst create mode 100644 docs/libcudacxx/ptx/instructions/generated/cp_async_bulk_tensor_gather_scatter.rst create mode 100644 docs/libcudacxx/ptx/instructions/generated/cp_async_mbarrier_arrive.rst create mode 100644 docs/libcudacxx/ptx/instructions/generated/cp_async_mbarrier_arrive_noinc.rst create mode 100644 docs/libcudacxx/ptx/instructions/generated/elect_sync.rst create mode 100644 docs/libcudacxx/ptx/instructions/generated/fence_proxy_async_generic_sync_restrict.rst create mode 100644 docs/libcudacxx/ptx/instructions/generated/fence_sync_restrict.rst rename docs/libcudacxx/ptx/instructions/generated/{special_registers.rst => get_sreg.rst} (83%) create mode 100644 docs/libcudacxx/ptx/instructions/generated/mapa.rst create mode 100644 docs/libcudacxx/ptx/instructions/generated/multimem_ld_reduce.rst create mode 100644 docs/libcudacxx/ptx/instructions/generated/multimem_red.rst create mode 100644 docs/libcudacxx/ptx/instructions/generated/multimem_st.rst create mode 100644 docs/libcudacxx/ptx/instructions/generated/st_bulk.rst create mode 100644 docs/libcudacxx/ptx/instructions/generated/tcgen05_alloc.rst create mode 100644 docs/libcudacxx/ptx/instructions/generated/tcgen05_commit.rst create mode 100644 docs/libcudacxx/ptx/instructions/generated/tcgen05_cp.rst create mode 100644 docs/libcudacxx/ptx/instructions/generated/tcgen05_fence.rst create mode 100644 docs/libcudacxx/ptx/instructions/generated/tcgen05_ld.rst create mode 100644 docs/libcudacxx/ptx/instructions/generated/tcgen05_mma.rst create mode 100644 docs/libcudacxx/ptx/instructions/generated/tcgen05_mma_ws.rst create mode 100644 docs/libcudacxx/ptx/instructions/generated/tcgen05_shift.rst create mode 100644 docs/libcudacxx/ptx/instructions/generated/tcgen05_st.rst create mode 100644 docs/libcudacxx/ptx/instructions/generated/tcgen05_wait.rst create mode 100644 
libcudacxx/include/cuda/__ptx/instructions/generated/barrier_cluster_aligned.h create mode 100644 libcudacxx/include/cuda/__ptx/instructions/generated/clusterlaunchcontrol.h create mode 100644 libcudacxx/include/cuda/__ptx/instructions/generated/cp_async_bulk_tensor_gather_scatter.h create mode 100644 libcudacxx/include/cuda/__ptx/instructions/generated/cp_async_mbarrier_arrive.h create mode 100644 libcudacxx/include/cuda/__ptx/instructions/generated/cp_async_mbarrier_arrive_noinc.h create mode 100644 libcudacxx/include/cuda/__ptx/instructions/generated/elect_sync.h create mode 100644 libcudacxx/include/cuda/__ptx/instructions/generated/fence_proxy_async_generic_sync_restrict.h create mode 100644 libcudacxx/include/cuda/__ptx/instructions/generated/fence_sync_restrict.h create mode 100644 libcudacxx/include/cuda/__ptx/instructions/generated/mapa.h create mode 100644 libcudacxx/include/cuda/__ptx/instructions/generated/mbarrier_expect_tx.h create mode 100644 libcudacxx/include/cuda/__ptx/instructions/generated/multimem_ld_reduce.h create mode 100644 libcudacxx/include/cuda/__ptx/instructions/generated/multimem_red.h create mode 100644 libcudacxx/include/cuda/__ptx/instructions/generated/multimem_st.h create mode 100644 libcudacxx/include/cuda/__ptx/instructions/generated/st_bulk.h create mode 100644 libcudacxx/include/cuda/__ptx/instructions/generated/tcgen05_alloc.h create mode 100644 libcudacxx/include/cuda/__ptx/instructions/generated/tcgen05_commit.h create mode 100644 libcudacxx/include/cuda/__ptx/instructions/generated/tcgen05_cp.h create mode 100644 libcudacxx/include/cuda/__ptx/instructions/generated/tcgen05_fence.h create mode 100644 libcudacxx/include/cuda/__ptx/instructions/generated/tcgen05_ld.h create mode 100644 libcudacxx/include/cuda/__ptx/instructions/generated/tcgen05_mma.h create mode 100644 libcudacxx/include/cuda/__ptx/instructions/generated/tcgen05_mma_ws.h create mode 100644 libcudacxx/include/cuda/__ptx/instructions/generated/tcgen05_shift.h create mode 100644 libcudacxx/include/cuda/__ptx/instructions/generated/tcgen05_st.h create mode 100644 libcudacxx/include/cuda/__ptx/instructions/generated/tcgen05_wait.h create mode 100644 libcudacxx/test/libcudacxx/cuda/ptx/generated/barrier_cluster_aligned.h create mode 100644 libcudacxx/test/libcudacxx/cuda/ptx/generated/clusterlaunchcontrol.h create mode 100644 libcudacxx/test/libcudacxx/cuda/ptx/generated/cp_async_bulk_tensor_gather_scatter.h create mode 100644 libcudacxx/test/libcudacxx/cuda/ptx/generated/cp_async_mbarrier_arrive.h create mode 100644 libcudacxx/test/libcudacxx/cuda/ptx/generated/cp_async_mbarrier_arrive_noinc.h create mode 100644 libcudacxx/test/libcudacxx/cuda/ptx/generated/elect_sync.h create mode 100644 libcudacxx/test/libcudacxx/cuda/ptx/generated/fence_proxy_async_generic_sync_restrict.h create mode 100644 libcudacxx/test/libcudacxx/cuda/ptx/generated/fence_sync_restrict.h create mode 100644 libcudacxx/test/libcudacxx/cuda/ptx/generated/mapa.h create mode 100644 libcudacxx/test/libcudacxx/cuda/ptx/generated/mbarrier_expect_tx.h create mode 100644 libcudacxx/test/libcudacxx/cuda/ptx/generated/mbarrier_test_wait.h create mode 100644 libcudacxx/test/libcudacxx/cuda/ptx/generated/mbarrier_test_wait_parity.h delete mode 100644 libcudacxx/test/libcudacxx/cuda/ptx/generated/mbarrier_wait.h delete mode 100644 libcudacxx/test/libcudacxx/cuda/ptx/generated/mbarrier_wait_parity.h create mode 100644 libcudacxx/test/libcudacxx/cuda/ptx/generated/multimem_ld_reduce.h create mode 100644 
libcudacxx/test/libcudacxx/cuda/ptx/generated/multimem_red.h create mode 100644 libcudacxx/test/libcudacxx/cuda/ptx/generated/multimem_st.h create mode 100644 libcudacxx/test/libcudacxx/cuda/ptx/generated/st_bulk.h create mode 100644 libcudacxx/test/libcudacxx/cuda/ptx/generated/tcgen05_alloc.h create mode 100644 libcudacxx/test/libcudacxx/cuda/ptx/generated/tcgen05_commit.h create mode 100644 libcudacxx/test/libcudacxx/cuda/ptx/generated/tcgen05_cp.h create mode 100644 libcudacxx/test/libcudacxx/cuda/ptx/generated/tcgen05_fence.h create mode 100644 libcudacxx/test/libcudacxx/cuda/ptx/generated/tcgen05_ld.h create mode 100644 libcudacxx/test/libcudacxx/cuda/ptx/generated/tcgen05_mma.h create mode 100644 libcudacxx/test/libcudacxx/cuda/ptx/generated/tcgen05_mma_ws.h create mode 100644 libcudacxx/test/libcudacxx/cuda/ptx/generated/tcgen05_shift.h create mode 100644 libcudacxx/test/libcudacxx/cuda/ptx/generated/tcgen05_st.h create mode 100644 libcudacxx/test/libcudacxx/cuda/ptx/generated/tcgen05_wait.h create mode 100644 libcudacxx/test/libcudacxx/cuda/ptx/nvrtc_workaround.h diff --git a/docs/libcudacxx/ptx/instructions/generated/barrier_cluster_aligned.rst b/docs/libcudacxx/ptx/instructions/generated/barrier_cluster_aligned.rst new file mode 100644 index 00000000000..a24093ac7b6 --- /dev/null +++ b/docs/libcudacxx/ptx/instructions/generated/barrier_cluster_aligned.rst @@ -0,0 +1,63 @@ +.. + This file was automatically generated. Do not edit. + +barrier.cluster.arrive.aligned +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +.. code:: cuda + + // barrier.cluster.arrive.aligned; // PTX ISA 78, SM_90 + // .aligned = { .aligned } + // Marked volatile and as clobbering memory + template + __device__ static inline void barrier_cluster_arrive( + cuda::ptx::dot_aligned_t); + +barrier.cluster.wait.aligned +^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +.. code:: cuda + + // barrier.cluster.wait.aligned; // PTX ISA 78, SM_90 + // .aligned = { .aligned } + // Marked volatile and as clobbering memory + template + __device__ static inline void barrier_cluster_wait( + cuda::ptx::dot_aligned_t); + +barrier.cluster.arrive.release.aligned +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +.. code:: cuda + + // barrier.cluster.arrive.sem.aligned; // PTX ISA 80, SM_90 + // .sem = { .release } + // .aligned = { .aligned } + // Marked volatile and as clobbering memory + template + __device__ static inline void barrier_cluster_arrive( + cuda::ptx::sem_release_t, + cuda::ptx::dot_aligned_t); + +barrier.cluster.arrive.relaxed.aligned +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +.. code:: cuda + + // barrier.cluster.arrive.sem.aligned; // PTX ISA 80, SM_90 + // .sem = { .relaxed } + // .aligned = { .aligned } + // Marked volatile + template + __device__ static inline void barrier_cluster_arrive( + cuda::ptx::sem_relaxed_t, + cuda::ptx::dot_aligned_t); + +barrier.cluster.wait.acquire.aligned +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +.. code:: cuda + + // barrier.cluster.wait.sem.aligned; // PTX ISA 80, SM_90 + // .sem = { .acquire } + // .aligned = { .aligned } + // Marked volatile and as clobbering memory + template + __device__ static inline void barrier_cluster_wait( + cuda::ptx::sem_acquire_t, + cuda::ptx::dot_aligned_t); diff --git a/docs/libcudacxx/ptx/instructions/generated/clusterlaunchcontrol.rst b/docs/libcudacxx/ptx/instructions/generated/clusterlaunchcontrol.rst new file mode 100644 index 00000000000..b372c5bf33e --- /dev/null +++ b/docs/libcudacxx/ptx/instructions/generated/clusterlaunchcontrol.rst @@ -0,0 +1,68 @@ +.. 
+ This file was automatically generated. Do not edit. + +clusterlaunchcontrol.try_cancel.async.shared::cta.mbarrier::complete_tx::bytes.b128 +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +.. code:: cuda + + // clusterlaunchcontrol.try_cancel.async.shared::cta.mbarrier::complete_tx::bytes.b128 [addr], [smem_bar]; // PTX ISA 86, SM_100 + template + __device__ static inline void clusterlaunchcontrol_try_cancel( + void* addr, + uint64_t* smem_bar); + +clusterlaunchcontrol.try_cancel.async.shared::cta.mbarrier::complete_tx::bytes.multicast::cluster::all.b128 +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +.. code:: cuda + + // clusterlaunchcontrol.try_cancel.async.shared::cta.mbarrier::complete_tx::bytes.multicast::cluster::all.b128 [addr], [smem_bar]; // PTX ISA 86, SM_100a, SM_101a + template + __device__ static inline void clusterlaunchcontrol_try_cancel_multicast( + void* addr, + uint64_t* smem_bar); + +clusterlaunchcontrol.query_cancel.is_canceled.pred.b128 +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +.. code:: cuda + + // clusterlaunchcontrol.query_cancel.is_canceled.pred.b128 pred_is_canceled, try_cancel_response; // PTX ISA 86, SM_100 + template = true> + __device__ static inline bool clusterlaunchcontrol_query_cancel_is_canceled( + B128 try_cancel_response); + +clusterlaunchcontrol.query_cancel.get_first_ctaid::x.b32.b128 +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +.. code:: cuda + + // clusterlaunchcontrol.query_cancel.get_first_ctaid::x.b32.b128 ret_dim, try_cancel_response; // PTX ISA 86, SM_100 + template = true, typename B128, enable_if_t = true> + __device__ static inline B32 clusterlaunchcontrol_query_cancel_get_first_ctaid_x( + B128 try_cancel_response); + +clusterlaunchcontrol.query_cancel.get_first_ctaid::y.b32.b128 +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +.. code:: cuda + + // clusterlaunchcontrol.query_cancel.get_first_ctaid::y.b32.b128 ret_dim, try_cancel_response; // PTX ISA 86, SM_100 + template = true, typename B128, enable_if_t = true> + __device__ static inline B32 clusterlaunchcontrol_query_cancel_get_first_ctaid_y( + B128 try_cancel_response); + +clusterlaunchcontrol.query_cancel.get_first_ctaid::z.b32.b128 +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +.. code:: cuda + + // clusterlaunchcontrol.query_cancel.get_first_ctaid::z.b32.b128 ret_dim, try_cancel_response; // PTX ISA 86, SM_100 + template = true, typename B128, enable_if_t = true> + __device__ static inline B32 clusterlaunchcontrol_query_cancel_get_first_ctaid_z( + B128 try_cancel_response); + +clusterlaunchcontrol.query_cancel.get_first_ctaid.v4.b32.b128 +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +.. 
code:: cuda + + // clusterlaunchcontrol.query_cancel.get_first_ctaid.v4.b32.b128 block_dim, try_cancel_response; // PTX ISA 86, SM_100 + template = true, typename B128, enable_if_t = true> + __device__ static inline void clusterlaunchcontrol_query_cancel_get_first_ctaid( + B32 (&block_dim)[4], + B128 try_cancel_response); diff --git a/docs/libcudacxx/ptx/instructions/generated/cp_async_bulk.rst b/docs/libcudacxx/ptx/instructions/generated/cp_async_bulk.rst index 4883d8495eb..2bb334f1971 100644 --- a/docs/libcudacxx/ptx/instructions/generated/cp_async_bulk.rst +++ b/docs/libcudacxx/ptx/instructions/generated/cp_async_bulk.rst @@ -5,7 +5,7 @@ cp.async.bulk.shared::cluster.global.mbarrier::complete_tx::bytes ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ .. code:: cuda - // cp.async.bulk.dst.src.mbarrier::complete_tx::bytes [dstMem], [srcMem], size, [smem_bar]; // 1a. unicast PTX ISA 80, SM_90 + // cp.async.bulk.dst.src.mbarrier::complete_tx::bytes [dstMem], [srcMem], size, [smem_bar]; // PTX ISA 80, SM_90 // .dst = { .shared::cluster } // .src = { .global } template @@ -17,11 +17,27 @@ cp.async.bulk.shared::cluster.global.mbarrier::complete_tx::bytes const uint32_t& size, uint64_t* smem_bar); +cp.async.bulk.shared::cta.global.mbarrier::complete_tx::bytes +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +.. code:: cuda + + // cp.async.bulk.dst.src.mbarrier::complete_tx::bytes [dstMem], [srcMem], size, [smem_bar]; // PTX ISA 86, SM_90 + // .dst = { .shared::cta } + // .src = { .global } + template + __device__ static inline void cp_async_bulk( + cuda::ptx::space_shared_t, + cuda::ptx::space_global_t, + void* dstMem, + const void* srcMem, + const uint32_t& size, + uint64_t* smem_bar); + cp.async.bulk.shared::cluster.shared::cta.mbarrier::complete_tx::bytes ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ .. code:: cuda - // cp.async.bulk.dst.src.mbarrier::complete_tx::bytes [dstMem], [srcMem], size, [rdsmem_bar]; // 2. PTX ISA 80, SM_90 + // cp.async.bulk.dst.src.mbarrier::complete_tx::bytes [dstMem], [srcMem], size, [rdsmem_bar]; // PTX ISA 80, SM_90 // .dst = { .shared::cluster } // .src = { .shared::cta } template @@ -37,7 +53,7 @@ cp.async.bulk.global.shared::cta.bulk_group ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ .. code:: cuda - // cp.async.bulk.dst.src.bulk_group [dstMem], [srcMem], size; // 3. PTX ISA 80, SM_90 + // cp.async.bulk.dst.src.bulk_group [dstMem], [srcMem], size; // PTX ISA 80, SM_90 // .dst = { .global } // .src = { .shared::cta } template @@ -47,3 +63,19 @@ cp.async.bulk.global.shared::cta.bulk_group void* dstMem, const void* srcMem, const uint32_t& size); + +cp.async.bulk.global.shared::cta.bulk_group.cp_mask +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +.. 
code:: cuda + + // cp.async.bulk.dst.src.bulk_group.cp_mask [dstMem], [srcMem], size, byteMask; // PTX ISA 86, SM_100 + // .dst = { .global } + // .src = { .shared::cta } + template + __device__ static inline void cp_async_bulk_cp_mask( + cuda::ptx::space_global_t, + cuda::ptx::space_shared_t, + void* dstMem, + const void* srcMem, + const uint32_t& size, + const uint16_t& byteMask); diff --git a/docs/libcudacxx/ptx/instructions/generated/cp_async_bulk_multicast.rst b/docs/libcudacxx/ptx/instructions/generated/cp_async_bulk_multicast.rst index af027c0b623..396a04e468b 100644 --- a/docs/libcudacxx/ptx/instructions/generated/cp_async_bulk_multicast.rst +++ b/docs/libcudacxx/ptx/instructions/generated/cp_async_bulk_multicast.rst @@ -5,7 +5,7 @@ cp.async.bulk.shared::cluster.global.mbarrier::complete_tx::bytes.multicast::clu ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ .. code:: cuda - // cp.async.bulk{.dst}{.src}.mbarrier::complete_tx::bytes.multicast::cluster [dstMem], [srcMem], size, [smem_bar], ctaMask; // 1. PTX ISA 80, SM_90a + // cp.async.bulk.dst.src.mbarrier::complete_tx::bytes.multicast::cluster [dstMem], [srcMem], size, [smem_bar], ctaMask; // PTX ISA 80, SM_90a, SM_100a, SM_101a // .dst = { .shared::cluster } // .src = { .global } template diff --git a/docs/libcudacxx/ptx/instructions/generated/cp_async_bulk_tensor.rst b/docs/libcudacxx/ptx/instructions/generated/cp_async_bulk_tensor.rst index 1c21efdd0a3..9d44a10800b 100644 --- a/docs/libcudacxx/ptx/instructions/generated/cp_async_bulk_tensor.rst +++ b/docs/libcudacxx/ptx/instructions/generated/cp_async_bulk_tensor.rst @@ -5,7 +5,7 @@ cp.async.bulk.tensor.1d.shared::cluster.global.tile.mbarrier::complete_tx::bytes ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ .. code:: cuda - // cp.async.bulk.tensor.1d.dst.src.tile.mbarrier::complete_tx::bytes [dstMem], [tensorMap, tensorCoords], [smem_bar];// 1a. PTX ISA 80, SM_90 + // cp.async.bulk.tensor.1d.dst.src.tile.mbarrier::complete_tx::bytes [dstMem], [tensorMap, tensorCoords], [smem_bar]; // PTX ISA 80, SM_90 // .dst = { .shared::cluster } // .src = { .global } template @@ -17,11 +17,63 @@ cp.async.bulk.tensor.1d.shared::cluster.global.tile.mbarrier::complete_tx::bytes const int32_t (&tensorCoords)[1], uint64_t* smem_bar); +cp.async.bulk.tensor.1d.shared::cta.global.tile.mbarrier::complete_tx::bytes +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +.. code:: cuda + + // cp.async.bulk.tensor.1d.dst.src.tile.mbarrier::complete_tx::bytes [dstMem], [tensorMap, tensorCoords], [smem_bar]; // PTX ISA 86, SM_90 + // .dst = { .shared::cta } + // .src = { .global } + template + __device__ static inline void cp_async_bulk_tensor( + cuda::ptx::space_shared_t, + cuda::ptx::space_global_t, + void* dstMem, + const void* tensorMap, + const int32_t (&tensorCoords)[1], + uint64_t* smem_bar); + +cp.async.bulk.tensor.1d.shared::cta.global.tile.mbarrier::complete_tx::bytes.cta_group::1 +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +.. 
code:: cuda + + // cp.async.bulk.tensor.1d.dst.src.tile.mbarrier::complete_tx::bytes.cta_group [dstMem], [tensorMap, tensorCoords], [smem_bar]; // PTX ISA 86, SM_100a, SM_101a + // .dst = { .shared::cta } + // .src = { .global } + // .cta_group = { .cta_group::1, .cta_group::2 } + template + __device__ static inline void cp_async_bulk_tensor( + cuda::ptx::space_shared_t, + cuda::ptx::space_global_t, + cuda::ptx::cta_group_t cta_group, + void* dstMem, + const void* tensorMap, + const int32_t (&tensorCoords)[1], + uint64_t* smem_bar); + +cp.async.bulk.tensor.1d.shared::cta.global.tile.mbarrier::complete_tx::bytes.cta_group::2 +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +.. code:: cuda + + // cp.async.bulk.tensor.1d.dst.src.tile.mbarrier::complete_tx::bytes.cta_group [dstMem], [tensorMap, tensorCoords], [smem_bar]; // PTX ISA 86, SM_100a, SM_101a + // .dst = { .shared::cta } + // .src = { .global } + // .cta_group = { .cta_group::1, .cta_group::2 } + template + __device__ static inline void cp_async_bulk_tensor( + cuda::ptx::space_shared_t, + cuda::ptx::space_global_t, + cuda::ptx::cta_group_t cta_group, + void* dstMem, + const void* tensorMap, + const int32_t (&tensorCoords)[1], + uint64_t* smem_bar); + cp.async.bulk.tensor.1d.global.shared::cta.tile.bulk_group ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ .. code:: cuda - // cp.async.bulk.tensor.1d.dst.src.tile.bulk_group [tensorMap, tensorCoords], [srcMem]; // 3a. PTX ISA 80, SM_90 + // cp.async.bulk.tensor.1d.dst.src.tile.bulk_group [tensorMap, tensorCoords], [srcMem]; // PTX ISA 80, SM_90 // .dst = { .global } // .src = { .shared::cta } template @@ -36,7 +88,7 @@ cp.async.bulk.tensor.2d.shared::cluster.global.tile.mbarrier::complete_tx::bytes ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ .. code:: cuda - // cp.async.bulk.tensor.2d.dst.src.tile.mbarrier::complete_tx::bytes [dstMem], [tensorMap, tensorCoords], [smem_bar];// 1b. PTX ISA 80, SM_90 + // cp.async.bulk.tensor.2d.dst.src.tile.mbarrier::complete_tx::bytes [dstMem], [tensorMap, tensorCoords], [smem_bar]; // PTX ISA 80, SM_90 // .dst = { .shared::cluster } // .src = { .global } template @@ -48,11 +100,63 @@ cp.async.bulk.tensor.2d.shared::cluster.global.tile.mbarrier::complete_tx::bytes const int32_t (&tensorCoords)[2], uint64_t* smem_bar); +cp.async.bulk.tensor.2d.shared::cta.global.tile.mbarrier::complete_tx::bytes +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +.. code:: cuda + + // cp.async.bulk.tensor.2d.dst.src.tile.mbarrier::complete_tx::bytes [dstMem], [tensorMap, tensorCoords], [smem_bar]; // PTX ISA 86, SM_90 + // .dst = { .shared::cta } + // .src = { .global } + template + __device__ static inline void cp_async_bulk_tensor( + cuda::ptx::space_shared_t, + cuda::ptx::space_global_t, + void* dstMem, + const void* tensorMap, + const int32_t (&tensorCoords)[2], + uint64_t* smem_bar); + +cp.async.bulk.tensor.2d.shared::cta.global.tile.mbarrier::complete_tx::bytes.cta_group::1 +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +.. 
code:: cuda + + // cp.async.bulk.tensor.2d.dst.src.tile.mbarrier::complete_tx::bytes.cta_group [dstMem], [tensorMap, tensorCoords], [smem_bar]; // PTX ISA 86, SM_100a, SM_101a + // .dst = { .shared::cta } + // .src = { .global } + // .cta_group = { .cta_group::1, .cta_group::2 } + template + __device__ static inline void cp_async_bulk_tensor( + cuda::ptx::space_shared_t, + cuda::ptx::space_global_t, + cuda::ptx::cta_group_t cta_group, + void* dstMem, + const void* tensorMap, + const int32_t (&tensorCoords)[2], + uint64_t* smem_bar); + +cp.async.bulk.tensor.2d.shared::cta.global.tile.mbarrier::complete_tx::bytes.cta_group::2 +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +.. code:: cuda + + // cp.async.bulk.tensor.2d.dst.src.tile.mbarrier::complete_tx::bytes.cta_group [dstMem], [tensorMap, tensorCoords], [smem_bar]; // PTX ISA 86, SM_100a, SM_101a + // .dst = { .shared::cta } + // .src = { .global } + // .cta_group = { .cta_group::1, .cta_group::2 } + template + __device__ static inline void cp_async_bulk_tensor( + cuda::ptx::space_shared_t, + cuda::ptx::space_global_t, + cuda::ptx::cta_group_t cta_group, + void* dstMem, + const void* tensorMap, + const int32_t (&tensorCoords)[2], + uint64_t* smem_bar); + cp.async.bulk.tensor.2d.global.shared::cta.tile.bulk_group ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ .. code:: cuda - // cp.async.bulk.tensor.2d.dst.src.tile.bulk_group [tensorMap, tensorCoords], [srcMem]; // 3b. PTX ISA 80, SM_90 + // cp.async.bulk.tensor.2d.dst.src.tile.bulk_group [tensorMap, tensorCoords], [srcMem]; // PTX ISA 80, SM_90 // .dst = { .global } // .src = { .shared::cta } template @@ -67,7 +171,7 @@ cp.async.bulk.tensor.3d.shared::cluster.global.tile.mbarrier::complete_tx::bytes ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ .. code:: cuda - // cp.async.bulk.tensor.3d.dst.src.tile.mbarrier::complete_tx::bytes [dstMem], [tensorMap, tensorCoords], [smem_bar];// 1c. PTX ISA 80, SM_90 + // cp.async.bulk.tensor.3d.dst.src.tile.mbarrier::complete_tx::bytes [dstMem], [tensorMap, tensorCoords], [smem_bar]; // PTX ISA 80, SM_90 // .dst = { .shared::cluster } // .src = { .global } template @@ -79,11 +183,63 @@ cp.async.bulk.tensor.3d.shared::cluster.global.tile.mbarrier::complete_tx::bytes const int32_t (&tensorCoords)[3], uint64_t* smem_bar); +cp.async.bulk.tensor.3d.shared::cta.global.tile.mbarrier::complete_tx::bytes +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +.. code:: cuda + + // cp.async.bulk.tensor.3d.dst.src.tile.mbarrier::complete_tx::bytes [dstMem], [tensorMap, tensorCoords], [smem_bar]; // PTX ISA 86, SM_90 + // .dst = { .shared::cta } + // .src = { .global } + template + __device__ static inline void cp_async_bulk_tensor( + cuda::ptx::space_shared_t, + cuda::ptx::space_global_t, + void* dstMem, + const void* tensorMap, + const int32_t (&tensorCoords)[3], + uint64_t* smem_bar); + +cp.async.bulk.tensor.3d.shared::cta.global.tile.mbarrier::complete_tx::bytes.cta_group::1 +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +.. 
code:: cuda + + // cp.async.bulk.tensor.3d.dst.src.tile.mbarrier::complete_tx::bytes.cta_group [dstMem], [tensorMap, tensorCoords], [smem_bar]; // PTX ISA 86, SM_100a, SM_101a + // .dst = { .shared::cta } + // .src = { .global } + // .cta_group = { .cta_group::1, .cta_group::2 } + template + __device__ static inline void cp_async_bulk_tensor( + cuda::ptx::space_shared_t, + cuda::ptx::space_global_t, + cuda::ptx::cta_group_t cta_group, + void* dstMem, + const void* tensorMap, + const int32_t (&tensorCoords)[3], + uint64_t* smem_bar); + +cp.async.bulk.tensor.3d.shared::cta.global.tile.mbarrier::complete_tx::bytes.cta_group::2 +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +.. code:: cuda + + // cp.async.bulk.tensor.3d.dst.src.tile.mbarrier::complete_tx::bytes.cta_group [dstMem], [tensorMap, tensorCoords], [smem_bar]; // PTX ISA 86, SM_100a, SM_101a + // .dst = { .shared::cta } + // .src = { .global } + // .cta_group = { .cta_group::1, .cta_group::2 } + template + __device__ static inline void cp_async_bulk_tensor( + cuda::ptx::space_shared_t, + cuda::ptx::space_global_t, + cuda::ptx::cta_group_t cta_group, + void* dstMem, + const void* tensorMap, + const int32_t (&tensorCoords)[3], + uint64_t* smem_bar); + cp.async.bulk.tensor.3d.global.shared::cta.tile.bulk_group ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ .. code:: cuda - // cp.async.bulk.tensor.3d.dst.src.tile.bulk_group [tensorMap, tensorCoords], [srcMem]; // 3c. PTX ISA 80, SM_90 + // cp.async.bulk.tensor.3d.dst.src.tile.bulk_group [tensorMap, tensorCoords], [srcMem]; // PTX ISA 80, SM_90 // .dst = { .global } // .src = { .shared::cta } template @@ -98,7 +254,7 @@ cp.async.bulk.tensor.4d.shared::cluster.global.tile.mbarrier::complete_tx::bytes ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ .. code:: cuda - // cp.async.bulk.tensor.4d.dst.src.tile.mbarrier::complete_tx::bytes [dstMem], [tensorMap, tensorCoords], [smem_bar];// 1d. PTX ISA 80, SM_90 + // cp.async.bulk.tensor.4d.dst.src.tile.mbarrier::complete_tx::bytes [dstMem], [tensorMap, tensorCoords], [smem_bar]; // PTX ISA 80, SM_90 // .dst = { .shared::cluster } // .src = { .global } template @@ -110,11 +266,63 @@ cp.async.bulk.tensor.4d.shared::cluster.global.tile.mbarrier::complete_tx::bytes const int32_t (&tensorCoords)[4], uint64_t* smem_bar); +cp.async.bulk.tensor.4d.shared::cta.global.tile.mbarrier::complete_tx::bytes +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +.. code:: cuda + + // cp.async.bulk.tensor.4d.dst.src.tile.mbarrier::complete_tx::bytes [dstMem], [tensorMap, tensorCoords], [smem_bar]; // PTX ISA 86, SM_90 + // .dst = { .shared::cta } + // .src = { .global } + template + __device__ static inline void cp_async_bulk_tensor( + cuda::ptx::space_shared_t, + cuda::ptx::space_global_t, + void* dstMem, + const void* tensorMap, + const int32_t (&tensorCoords)[4], + uint64_t* smem_bar); + +cp.async.bulk.tensor.4d.shared::cta.global.tile.mbarrier::complete_tx::bytes.cta_group::1 +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +.. 
code:: cuda + + // cp.async.bulk.tensor.4d.dst.src.tile.mbarrier::complete_tx::bytes.cta_group [dstMem], [tensorMap, tensorCoords], [smem_bar]; // PTX ISA 86, SM_100a, SM_101a + // .dst = { .shared::cta } + // .src = { .global } + // .cta_group = { .cta_group::1, .cta_group::2 } + template + __device__ static inline void cp_async_bulk_tensor( + cuda::ptx::space_shared_t, + cuda::ptx::space_global_t, + cuda::ptx::cta_group_t cta_group, + void* dstMem, + const void* tensorMap, + const int32_t (&tensorCoords)[4], + uint64_t* smem_bar); + +cp.async.bulk.tensor.4d.shared::cta.global.tile.mbarrier::complete_tx::bytes.cta_group::2 +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +.. code:: cuda + + // cp.async.bulk.tensor.4d.dst.src.tile.mbarrier::complete_tx::bytes.cta_group [dstMem], [tensorMap, tensorCoords], [smem_bar]; // PTX ISA 86, SM_100a, SM_101a + // .dst = { .shared::cta } + // .src = { .global } + // .cta_group = { .cta_group::1, .cta_group::2 } + template + __device__ static inline void cp_async_bulk_tensor( + cuda::ptx::space_shared_t, + cuda::ptx::space_global_t, + cuda::ptx::cta_group_t cta_group, + void* dstMem, + const void* tensorMap, + const int32_t (&tensorCoords)[4], + uint64_t* smem_bar); + cp.async.bulk.tensor.4d.global.shared::cta.tile.bulk_group ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ .. code:: cuda - // cp.async.bulk.tensor.4d.dst.src.tile.bulk_group [tensorMap, tensorCoords], [srcMem]; // 3d. PTX ISA 80, SM_90 + // cp.async.bulk.tensor.4d.dst.src.tile.bulk_group [tensorMap, tensorCoords], [srcMem]; // PTX ISA 80, SM_90 // .dst = { .global } // .src = { .shared::cta } template @@ -129,7 +337,7 @@ cp.async.bulk.tensor.5d.shared::cluster.global.tile.mbarrier::complete_tx::bytes ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ .. code:: cuda - // cp.async.bulk.tensor.5d.dst.src.tile.mbarrier::complete_tx::bytes [dstMem], [tensorMap, tensorCoords], [smem_bar];// 1e. PTX ISA 80, SM_90 + // cp.async.bulk.tensor.5d.dst.src.tile.mbarrier::complete_tx::bytes [dstMem], [tensorMap, tensorCoords], [smem_bar]; // PTX ISA 80, SM_90 // .dst = { .shared::cluster } // .src = { .global } template @@ -141,11 +349,63 @@ cp.async.bulk.tensor.5d.shared::cluster.global.tile.mbarrier::complete_tx::bytes const int32_t (&tensorCoords)[5], uint64_t* smem_bar); +cp.async.bulk.tensor.5d.shared::cta.global.tile.mbarrier::complete_tx::bytes +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +.. code:: cuda + + // cp.async.bulk.tensor.5d.dst.src.tile.mbarrier::complete_tx::bytes [dstMem], [tensorMap, tensorCoords], [smem_bar]; // PTX ISA 86, SM_90 + // .dst = { .shared::cta } + // .src = { .global } + template + __device__ static inline void cp_async_bulk_tensor( + cuda::ptx::space_shared_t, + cuda::ptx::space_global_t, + void* dstMem, + const void* tensorMap, + const int32_t (&tensorCoords)[5], + uint64_t* smem_bar); + +cp.async.bulk.tensor.5d.shared::cta.global.tile.mbarrier::complete_tx::bytes.cta_group::1 +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +.. 
code:: cuda + + // cp.async.bulk.tensor.5d.dst.src.tile.mbarrier::complete_tx::bytes.cta_group [dstMem], [tensorMap, tensorCoords], [smem_bar]; // PTX ISA 86, SM_100a, SM_101a + // .dst = { .shared::cta } + // .src = { .global } + // .cta_group = { .cta_group::1, .cta_group::2 } + template + __device__ static inline void cp_async_bulk_tensor( + cuda::ptx::space_shared_t, + cuda::ptx::space_global_t, + cuda::ptx::cta_group_t cta_group, + void* dstMem, + const void* tensorMap, + const int32_t (&tensorCoords)[5], + uint64_t* smem_bar); + +cp.async.bulk.tensor.5d.shared::cta.global.tile.mbarrier::complete_tx::bytes.cta_group::2 +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +.. code:: cuda + + // cp.async.bulk.tensor.5d.dst.src.tile.mbarrier::complete_tx::bytes.cta_group [dstMem], [tensorMap, tensorCoords], [smem_bar]; // PTX ISA 86, SM_100a, SM_101a + // .dst = { .shared::cta } + // .src = { .global } + // .cta_group = { .cta_group::1, .cta_group::2 } + template + __device__ static inline void cp_async_bulk_tensor( + cuda::ptx::space_shared_t, + cuda::ptx::space_global_t, + cuda::ptx::cta_group_t cta_group, + void* dstMem, + const void* tensorMap, + const int32_t (&tensorCoords)[5], + uint64_t* smem_bar); + cp.async.bulk.tensor.5d.global.shared::cta.tile.bulk_group ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ .. code:: cuda - // cp.async.bulk.tensor.5d.dst.src.tile.bulk_group [tensorMap, tensorCoords], [srcMem]; // 3e. PTX ISA 80, SM_90 + // cp.async.bulk.tensor.5d.dst.src.tile.bulk_group [tensorMap, tensorCoords], [srcMem]; // PTX ISA 80, SM_90 // .dst = { .global } // .src = { .shared::cta } template diff --git a/docs/libcudacxx/ptx/instructions/generated/cp_async_bulk_tensor_gather_scatter.rst b/docs/libcudacxx/ptx/instructions/generated/cp_async_bulk_tensor_gather_scatter.rst new file mode 100644 index 00000000000..971f0213cb0 --- /dev/null +++ b/docs/libcudacxx/ptx/instructions/generated/cp_async_bulk_tensor_gather_scatter.rst @@ -0,0 +1,124 @@ +.. + This file was automatically generated. Do not edit. + +cp.async.bulk.tensor.2d.shared::cta.global.tile::gather4.mbarrier::complete_tx::bytes +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +.. code:: cuda + + // cp.async.bulk.tensor.2d.dst.src.tile::gather4.mbarrier::complete_tx::bytes [dstMem], [tensorMap, tensorCoords], [smem_bar]; // PTX ISA 86, SM_100 + // .dst = { .shared::cta } + // .src = { .global } + template + __device__ static inline void cp_async_bulk_tensor_tile_gather4( + cuda::ptx::space_shared_t, + cuda::ptx::space_global_t, + void* dstMem, + const void* tensorMap, + const int32_t (&tensorCoords)[5], + uint64_t* smem_bar); + +cp.async.bulk.tensor.2d.shared::cta.global.tile::gather4.mbarrier::complete_tx::bytes.cta_group::1 +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +.. 
code:: cuda + + // cp.async.bulk.tensor.2d.dst.src.tile::gather4.mbarrier::complete_tx::bytes.cta_group [dstMem], [tensorMap, tensorCoords], [smem_bar]; // PTX ISA 86, SM_100a, SM_101a + // .dst = { .shared::cta } + // .src = { .global } + // .cta_group = { .cta_group::1, .cta_group::2 } + template + __device__ static inline void cp_async_bulk_tensor_tile_gather4( + cuda::ptx::space_shared_t, + cuda::ptx::space_global_t, + cuda::ptx::cta_group_t cta_group, + void* dstMem, + const void* tensorMap, + const int32_t (&tensorCoords)[5], + uint64_t* smem_bar); + +cp.async.bulk.tensor.2d.shared::cta.global.tile::gather4.mbarrier::complete_tx::bytes.cta_group::2 +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +.. code:: cuda + + // cp.async.bulk.tensor.2d.dst.src.tile::gather4.mbarrier::complete_tx::bytes.cta_group [dstMem], [tensorMap, tensorCoords], [smem_bar]; // PTX ISA 86, SM_100a, SM_101a + // .dst = { .shared::cta } + // .src = { .global } + // .cta_group = { .cta_group::1, .cta_group::2 } + template + __device__ static inline void cp_async_bulk_tensor_tile_gather4( + cuda::ptx::space_shared_t, + cuda::ptx::space_global_t, + cuda::ptx::cta_group_t cta_group, + void* dstMem, + const void* tensorMap, + const int32_t (&tensorCoords)[5], + uint64_t* smem_bar); + +cp.async.bulk.tensor.2d.shared::cluster.global.tile::gather4.mbarrier::complete_tx::bytes.multicast::cluster +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +.. code:: cuda + + // cp.async.bulk.tensor.2d.dst.src.tile::gather4.mbarrier::complete_tx::bytes.multicast::cluster [dstMem], [tensorMap, tensorCoords], [smem_bar], ctaMask; // PTX ISA 86, SM_100a, SM_101a + // .dst = { .shared::cluster } + // .src = { .global } + template + __device__ static inline void cp_async_bulk_tensor_tile_gather4( + cuda::ptx::space_cluster_t, + cuda::ptx::space_global_t, + void* dstMem, + const void* tensorMap, + const int32_t (&tensorCoords)[5], + uint64_t* smem_bar, + const uint16_t& ctaMask); + +cp.async.bulk.tensor.2d.shared::cluster.global.tile::gather4.mbarrier::complete_tx::bytes.multicast::cluster.cta_group::1 +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +.. code:: cuda + + // cp.async.bulk.tensor.2d.dst.src.tile::gather4.mbarrier::complete_tx::bytes.multicast::cluster.cta_group [dstMem], [tensorMap, tensorCoords], [smem_bar], ctaMask; // PTX ISA 86, SM_100a, SM_101a + // .dst = { .shared::cluster } + // .src = { .global } + // .cta_group = { .cta_group::1, .cta_group::2 } + template + __device__ static inline void cp_async_bulk_tensor_tile_gather4( + cuda::ptx::space_cluster_t, + cuda::ptx::space_global_t, + cuda::ptx::cta_group_t cta_group, + void* dstMem, + const void* tensorMap, + const int32_t (&tensorCoords)[5], + uint64_t* smem_bar, + const uint16_t& ctaMask); + +cp.async.bulk.tensor.2d.shared::cluster.global.tile::gather4.mbarrier::complete_tx::bytes.multicast::cluster.cta_group::2 +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +.. 
code:: cuda + + // cp.async.bulk.tensor.2d.dst.src.tile::gather4.mbarrier::complete_tx::bytes.multicast::cluster.cta_group [dstMem], [tensorMap, tensorCoords], [smem_bar], ctaMask; // PTX ISA 86, SM_100a, SM_101a + // .dst = { .shared::cluster } + // .src = { .global } + // .cta_group = { .cta_group::1, .cta_group::2 } + template + __device__ static inline void cp_async_bulk_tensor_tile_gather4( + cuda::ptx::space_cluster_t, + cuda::ptx::space_global_t, + cuda::ptx::cta_group_t cta_group, + void* dstMem, + const void* tensorMap, + const int32_t (&tensorCoords)[5], + uint64_t* smem_bar, + const uint16_t& ctaMask); + +cp.async.bulk.tensor.2d.global.shared::cta.tile::scatter4.bulk_group +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +.. code:: cuda + + // cp.async.bulk.tensor.2d.dst.src.tile::scatter4.bulk_group [tensorMap, tensorCoords], [srcMem]; // PTX ISA 80, SM_100a, SM_101a + // .dst = { .global } + // .src = { .shared::cta } + template + __device__ static inline void cp_async_bulk_tensor_tile_scatter4( + cuda::ptx::space_global_t, + cuda::ptx::space_shared_t, + const void* tensorMap, + const int32_t (&tensorCoords)[5], + const void* srcMem); diff --git a/docs/libcudacxx/ptx/instructions/generated/cp_async_bulk_tensor_multicast.rst b/docs/libcudacxx/ptx/instructions/generated/cp_async_bulk_tensor_multicast.rst index ac33a05b69f..8ea38a2e0ad 100644 --- a/docs/libcudacxx/ptx/instructions/generated/cp_async_bulk_tensor_multicast.rst +++ b/docs/libcudacxx/ptx/instructions/generated/cp_async_bulk_tensor_multicast.rst @@ -5,7 +5,7 @@ cp.async.bulk.tensor.1d.shared::cluster.global.tile.mbarrier::complete_tx::bytes ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ .. code:: cuda - // cp.async.bulk.tensor.1d.dst.src.tile.mbarrier::complete_tx::bytes.multicast::cluster [dstMem], [tensorMap, tensorCoords], [smem_bar], ctaMask; // 2a. PTX ISA 80, SM_90a + // cp.async.bulk.tensor.1d.dst.src.tile.mbarrier::complete_tx::bytes.multicast::cluster [dstMem], [tensorMap, tensorCoords], [smem_bar], ctaMask; // PTX ISA 80, SM_90a, SM_100a, SM_101a // .dst = { .shared::cluster } // .src = { .global } template @@ -18,11 +18,49 @@ cp.async.bulk.tensor.1d.shared::cluster.global.tile.mbarrier::complete_tx::bytes uint64_t* smem_bar, const uint16_t& ctaMask); +cp.async.bulk.tensor.1d.shared::cluster.global.tile.mbarrier::complete_tx::bytes.multicast::cluster.cta_group::1 +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +.. code:: cuda + + // cp.async.bulk.tensor.1d.dst.src.tile.mbarrier::complete_tx::bytes.multicast::cluster.cta_group [dstMem], [tensorMap, tensorCoords], [smem_bar], ctaMask; // PTX ISA 80, SM_100a, SM_101a + // .dst = { .shared::cluster } + // .src = { .global } + // .cta_group = { .cta_group::1, .cta_group::2 } + template + __device__ static inline void cp_async_bulk_tensor( + cuda::ptx::space_cluster_t, + cuda::ptx::space_global_t, + cuda::ptx::cta_group_t cta_group, + void* dstMem, + const void* tensorMap, + const int32_t (&tensorCoords)[1], + uint64_t* smem_bar, + const uint16_t& ctaMask); + +cp.async.bulk.tensor.1d.shared::cluster.global.tile.mbarrier::complete_tx::bytes.multicast::cluster.cta_group::2 +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +.. 
code:: cuda + + // cp.async.bulk.tensor.1d.dst.src.tile.mbarrier::complete_tx::bytes.multicast::cluster.cta_group [dstMem], [tensorMap, tensorCoords], [smem_bar], ctaMask; // PTX ISA 80, SM_100a, SM_101a + // .dst = { .shared::cluster } + // .src = { .global } + // .cta_group = { .cta_group::1, .cta_group::2 } + template + __device__ static inline void cp_async_bulk_tensor( + cuda::ptx::space_cluster_t, + cuda::ptx::space_global_t, + cuda::ptx::cta_group_t cta_group, + void* dstMem, + const void* tensorMap, + const int32_t (&tensorCoords)[1], + uint64_t* smem_bar, + const uint16_t& ctaMask); + cp.async.bulk.tensor.2d.shared::cluster.global.tile.mbarrier::complete_tx::bytes.multicast::cluster ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ .. code:: cuda - // cp.async.bulk.tensor.2d.dst.src.tile.mbarrier::complete_tx::bytes.multicast::cluster [dstMem], [tensorMap, tensorCoords], [smem_bar], ctaMask; // 2b. PTX ISA 80, SM_90a + // cp.async.bulk.tensor.2d.dst.src.tile.mbarrier::complete_tx::bytes.multicast::cluster [dstMem], [tensorMap, tensorCoords], [smem_bar], ctaMask; // PTX ISA 80, SM_90a, SM_100a, SM_101a // .dst = { .shared::cluster } // .src = { .global } template @@ -35,11 +73,49 @@ cp.async.bulk.tensor.2d.shared::cluster.global.tile.mbarrier::complete_tx::bytes uint64_t* smem_bar, const uint16_t& ctaMask); +cp.async.bulk.tensor.2d.shared::cluster.global.tile.mbarrier::complete_tx::bytes.multicast::cluster.cta_group::1 +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +.. code:: cuda + + // cp.async.bulk.tensor.2d.dst.src.tile.mbarrier::complete_tx::bytes.multicast::cluster.cta_group [dstMem], [tensorMap, tensorCoords], [smem_bar], ctaMask; // PTX ISA 80, SM_100a, SM_101a + // .dst = { .shared::cluster } + // .src = { .global } + // .cta_group = { .cta_group::1, .cta_group::2 } + template + __device__ static inline void cp_async_bulk_tensor( + cuda::ptx::space_cluster_t, + cuda::ptx::space_global_t, + cuda::ptx::cta_group_t cta_group, + void* dstMem, + const void* tensorMap, + const int32_t (&tensorCoords)[2], + uint64_t* smem_bar, + const uint16_t& ctaMask); + +cp.async.bulk.tensor.2d.shared::cluster.global.tile.mbarrier::complete_tx::bytes.multicast::cluster.cta_group::2 +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +.. code:: cuda + + // cp.async.bulk.tensor.2d.dst.src.tile.mbarrier::complete_tx::bytes.multicast::cluster.cta_group [dstMem], [tensorMap, tensorCoords], [smem_bar], ctaMask; // PTX ISA 80, SM_100a, SM_101a + // .dst = { .shared::cluster } + // .src = { .global } + // .cta_group = { .cta_group::1, .cta_group::2 } + template + __device__ static inline void cp_async_bulk_tensor( + cuda::ptx::space_cluster_t, + cuda::ptx::space_global_t, + cuda::ptx::cta_group_t cta_group, + void* dstMem, + const void* tensorMap, + const int32_t (&tensorCoords)[2], + uint64_t* smem_bar, + const uint16_t& ctaMask); + cp.async.bulk.tensor.3d.shared::cluster.global.tile.mbarrier::complete_tx::bytes.multicast::cluster ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ .. code:: cuda - // cp.async.bulk.tensor.3d.dst.src.tile.mbarrier::complete_tx::bytes.multicast::cluster [dstMem], [tensorMap, tensorCoords], [smem_bar], ctaMask; // 2c. 
PTX ISA 80, SM_90a + // cp.async.bulk.tensor.3d.dst.src.tile.mbarrier::complete_tx::bytes.multicast::cluster [dstMem], [tensorMap, tensorCoords], [smem_bar], ctaMask; // PTX ISA 80, SM_90a, SM_100a, SM_101a // .dst = { .shared::cluster } // .src = { .global } template @@ -52,11 +128,49 @@ cp.async.bulk.tensor.3d.shared::cluster.global.tile.mbarrier::complete_tx::bytes uint64_t* smem_bar, const uint16_t& ctaMask); +cp.async.bulk.tensor.3d.shared::cluster.global.tile.mbarrier::complete_tx::bytes.multicast::cluster.cta_group::1 +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +.. code:: cuda + + // cp.async.bulk.tensor.3d.dst.src.tile.mbarrier::complete_tx::bytes.multicast::cluster.cta_group [dstMem], [tensorMap, tensorCoords], [smem_bar], ctaMask; // PTX ISA 80, SM_100a, SM_101a + // .dst = { .shared::cluster } + // .src = { .global } + // .cta_group = { .cta_group::1, .cta_group::2 } + template + __device__ static inline void cp_async_bulk_tensor( + cuda::ptx::space_cluster_t, + cuda::ptx::space_global_t, + cuda::ptx::cta_group_t cta_group, + void* dstMem, + const void* tensorMap, + const int32_t (&tensorCoords)[3], + uint64_t* smem_bar, + const uint16_t& ctaMask); + +cp.async.bulk.tensor.3d.shared::cluster.global.tile.mbarrier::complete_tx::bytes.multicast::cluster.cta_group::2 +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +.. code:: cuda + + // cp.async.bulk.tensor.3d.dst.src.tile.mbarrier::complete_tx::bytes.multicast::cluster.cta_group [dstMem], [tensorMap, tensorCoords], [smem_bar], ctaMask; // PTX ISA 80, SM_100a, SM_101a + // .dst = { .shared::cluster } + // .src = { .global } + // .cta_group = { .cta_group::1, .cta_group::2 } + template + __device__ static inline void cp_async_bulk_tensor( + cuda::ptx::space_cluster_t, + cuda::ptx::space_global_t, + cuda::ptx::cta_group_t cta_group, + void* dstMem, + const void* tensorMap, + const int32_t (&tensorCoords)[3], + uint64_t* smem_bar, + const uint16_t& ctaMask); + cp.async.bulk.tensor.4d.shared::cluster.global.tile.mbarrier::complete_tx::bytes.multicast::cluster ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ .. code:: cuda - // cp.async.bulk.tensor.4d.dst.src.tile.mbarrier::complete_tx::bytes.multicast::cluster [dstMem], [tensorMap, tensorCoords], [smem_bar], ctaMask; // 2d. PTX ISA 80, SM_90a + // cp.async.bulk.tensor.4d.dst.src.tile.mbarrier::complete_tx::bytes.multicast::cluster [dstMem], [tensorMap, tensorCoords], [smem_bar], ctaMask; // PTX ISA 80, SM_90a, SM_100a, SM_101a // .dst = { .shared::cluster } // .src = { .global } template @@ -69,11 +183,49 @@ cp.async.bulk.tensor.4d.shared::cluster.global.tile.mbarrier::complete_tx::bytes uint64_t* smem_bar, const uint16_t& ctaMask); +cp.async.bulk.tensor.4d.shared::cluster.global.tile.mbarrier::complete_tx::bytes.multicast::cluster.cta_group::1 +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +.. 
code:: cuda + + // cp.async.bulk.tensor.4d.dst.src.tile.mbarrier::complete_tx::bytes.multicast::cluster.cta_group [dstMem], [tensorMap, tensorCoords], [smem_bar], ctaMask; // PTX ISA 80, SM_100a, SM_101a + // .dst = { .shared::cluster } + // .src = { .global } + // .cta_group = { .cta_group::1, .cta_group::2 } + template + __device__ static inline void cp_async_bulk_tensor( + cuda::ptx::space_cluster_t, + cuda::ptx::space_global_t, + cuda::ptx::cta_group_t cta_group, + void* dstMem, + const void* tensorMap, + const int32_t (&tensorCoords)[4], + uint64_t* smem_bar, + const uint16_t& ctaMask); + +cp.async.bulk.tensor.4d.shared::cluster.global.tile.mbarrier::complete_tx::bytes.multicast::cluster.cta_group::2 +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +.. code:: cuda + + // cp.async.bulk.tensor.4d.dst.src.tile.mbarrier::complete_tx::bytes.multicast::cluster.cta_group [dstMem], [tensorMap, tensorCoords], [smem_bar], ctaMask; // PTX ISA 80, SM_100a, SM_101a + // .dst = { .shared::cluster } + // .src = { .global } + // .cta_group = { .cta_group::1, .cta_group::2 } + template + __device__ static inline void cp_async_bulk_tensor( + cuda::ptx::space_cluster_t, + cuda::ptx::space_global_t, + cuda::ptx::cta_group_t cta_group, + void* dstMem, + const void* tensorMap, + const int32_t (&tensorCoords)[4], + uint64_t* smem_bar, + const uint16_t& ctaMask); + cp.async.bulk.tensor.5d.shared::cluster.global.tile.mbarrier::complete_tx::bytes.multicast::cluster ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ .. code:: cuda - // cp.async.bulk.tensor.5d.dst.src.tile.mbarrier::complete_tx::bytes.multicast::cluster [dstMem], [tensorMap, tensorCoords], [smem_bar], ctaMask; // 2e. PTX ISA 80, SM_90a + // cp.async.bulk.tensor.5d.dst.src.tile.mbarrier::complete_tx::bytes.multicast::cluster [dstMem], [tensorMap, tensorCoords], [smem_bar], ctaMask; // PTX ISA 80, SM_90a, SM_100a, SM_101a // .dst = { .shared::cluster } // .src = { .global } template @@ -85,3 +237,41 @@ cp.async.bulk.tensor.5d.shared::cluster.global.tile.mbarrier::complete_tx::bytes const int32_t (&tensorCoords)[5], uint64_t* smem_bar, const uint16_t& ctaMask); + +cp.async.bulk.tensor.5d.shared::cluster.global.tile.mbarrier::complete_tx::bytes.multicast::cluster.cta_group::1 +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +.. code:: cuda + + // cp.async.bulk.tensor.5d.dst.src.tile.mbarrier::complete_tx::bytes.multicast::cluster.cta_group [dstMem], [tensorMap, tensorCoords], [smem_bar], ctaMask; // PTX ISA 80, SM_100a, SM_101a + // .dst = { .shared::cluster } + // .src = { .global } + // .cta_group = { .cta_group::1, .cta_group::2 } + template + __device__ static inline void cp_async_bulk_tensor( + cuda::ptx::space_cluster_t, + cuda::ptx::space_global_t, + cuda::ptx::cta_group_t cta_group, + void* dstMem, + const void* tensorMap, + const int32_t (&tensorCoords)[5], + uint64_t* smem_bar, + const uint16_t& ctaMask); + +cp.async.bulk.tensor.5d.shared::cluster.global.tile.mbarrier::complete_tx::bytes.multicast::cluster.cta_group::2 +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +.. 
code:: cuda + + // cp.async.bulk.tensor.5d.dst.src.tile.mbarrier::complete_tx::bytes.multicast::cluster.cta_group [dstMem], [tensorMap, tensorCoords], [smem_bar], ctaMask; // PTX ISA 80, SM_100a, SM_101a + // .dst = { .shared::cluster } + // .src = { .global } + // .cta_group = { .cta_group::1, .cta_group::2 } + template + __device__ static inline void cp_async_bulk_tensor( + cuda::ptx::space_cluster_t, + cuda::ptx::space_global_t, + cuda::ptx::cta_group_t cta_group, + void* dstMem, + const void* tensorMap, + const int32_t (&tensorCoords)[5], + uint64_t* smem_bar, + const uint16_t& ctaMask); diff --git a/docs/libcudacxx/ptx/instructions/generated/cp_async_mbarrier_arrive.rst b/docs/libcudacxx/ptx/instructions/generated/cp_async_mbarrier_arrive.rst new file mode 100644 index 00000000000..73ce222a9ec --- /dev/null +++ b/docs/libcudacxx/ptx/instructions/generated/cp_async_mbarrier_arrive.rst @@ -0,0 +1,11 @@ +.. + This file was automatically generated. Do not edit. + +cp.async.mbarrier.arrive.b64 +^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +.. code:: cuda + + // cp.async.mbarrier.arrive.b64 [addr]; // PTX ISA 70, SM_80 + template + __device__ static inline void cp_async_mbarrier_arrive( + uint64_t* addr); diff --git a/docs/libcudacxx/ptx/instructions/generated/cp_async_mbarrier_arrive_noinc.rst b/docs/libcudacxx/ptx/instructions/generated/cp_async_mbarrier_arrive_noinc.rst new file mode 100644 index 00000000000..31b7a2e5a2b --- /dev/null +++ b/docs/libcudacxx/ptx/instructions/generated/cp_async_mbarrier_arrive_noinc.rst @@ -0,0 +1,11 @@ +.. + This file was automatically generated. Do not edit. + +cp.async.mbarrier.arrive.noinc.b64 +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +.. code:: cuda + + // cp.async.mbarrier.arrive.noinc.b64 [addr]; // PTX ISA 70, SM_80 + template + __device__ static inline void cp_async_mbarrier_arrive_noinc( + uint64_t* addr); diff --git a/docs/libcudacxx/ptx/instructions/generated/cp_reduce_async_bulk.rst b/docs/libcudacxx/ptx/instructions/generated/cp_reduce_async_bulk.rst index b043eb9f456..8228b69ed41 100644 --- a/docs/libcudacxx/ptx/instructions/generated/cp_reduce_async_bulk.rst +++ b/docs/libcudacxx/ptx/instructions/generated/cp_reduce_async_bulk.rst @@ -10,7 +10,7 @@ cp.reduce.async.bulk.shared::cluster.shared::cta.mbarrier::complete_tx::bytes.an // .src = { .shared::cta } // .type = { .b32 } // .op = { .and } - template + template = true> __device__ static inline void cp_reduce_async_bulk( cuda::ptx::space_cluster_t, cuda::ptx::space_shared_t, @@ -29,7 +29,7 @@ cp.reduce.async.bulk.shared::cluster.shared::cta.mbarrier::complete_tx::bytes.or // .src = { .shared::cta } // .type = { .b32 } // .op = { .or } - template + template = true> __device__ static inline void cp_reduce_async_bulk( cuda::ptx::space_cluster_t, cuda::ptx::space_shared_t, @@ -48,7 +48,7 @@ cp.reduce.async.bulk.shared::cluster.shared::cta.mbarrier::complete_tx::bytes.xo // .src = { .shared::cta } // .type = { .b32 } // .op = { .xor } - template + template = true> __device__ static inline void cp_reduce_async_bulk( cuda::ptx::space_cluster_t, cuda::ptx::space_shared_t, diff --git a/docs/libcudacxx/ptx/instructions/generated/elect_sync.rst b/docs/libcudacxx/ptx/instructions/generated/elect_sync.rst new file mode 100644 index 00000000000..bc909c54319 --- /dev/null +++ b/docs/libcudacxx/ptx/instructions/generated/elect_sync.rst @@ -0,0 +1,11 @@ +.. + This file was automatically generated. Do not edit. + +elect.sync +^^^^^^^^^^ +.. 
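// --- Illustrative usage sketch (not part of the generated documentation) ---
// The cp.async.mbarrier.arrive wrappers documented above make the calling thread's
// prior cp.async operations tracked by an mbarrier. A minimal sketch, assuming the
// barrier was initialized elsewhere with cuda::ptx::mbarrier_init:
#include <cuda/ptx>

__global__ void cp_async_mbarrier_arrive_sketch()
{
  __shared__ uint64_t bar; // assumed already initialized elsewhere
  // ... cp.async copies issued by this thread would go here ...
  cuda::ptx::cp_async_mbarrier_arrive(&bar);          // arrive, incrementing the pending count
  // cuda::ptx::cp_async_mbarrier_arrive_noinc(&bar); // .noinc variant: no count increment
}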
code:: cuda + + // elect.sync _|is_elected, membermask; // PTX ISA 80, SM_90 + template + __device__ static inline bool elect_sync( + const uint32_t& membermask); diff --git a/docs/libcudacxx/ptx/instructions/generated/fence.rst b/docs/libcudacxx/ptx/instructions/generated/fence.rst index ed21fa80b6e..50137394587 100644 --- a/docs/libcudacxx/ptx/instructions/generated/fence.rst +++ b/docs/libcudacxx/ptx/instructions/generated/fence.rst @@ -5,94 +5,190 @@ fence.sc.cta ^^^^^^^^^^^^ .. code:: cuda - // fence{.sem}.scope; // 1. PTX ISA 60, SM_70 - // .sem = { .sc, .acq_rel } + // fence.sem.scope; // 1. PTX ISA 60, SM_70 + // .sem = { .sc } // .scope = { .cta, .gpu, .sys } - template + template __device__ static inline void fence( - cuda::ptx::sem_t sem, + cuda::ptx::sem_sc_t, cuda::ptx::scope_t scope); fence.sc.gpu ^^^^^^^^^^^^ .. code:: cuda - // fence{.sem}.scope; // 1. PTX ISA 60, SM_70 - // .sem = { .sc, .acq_rel } + // fence.sem.scope; // 1. PTX ISA 60, SM_70 + // .sem = { .sc } // .scope = { .cta, .gpu, .sys } - template + template __device__ static inline void fence( - cuda::ptx::sem_t sem, + cuda::ptx::sem_sc_t, cuda::ptx::scope_t scope); fence.sc.sys ^^^^^^^^^^^^ .. code:: cuda - // fence{.sem}.scope; // 1. PTX ISA 60, SM_70 - // .sem = { .sc, .acq_rel } + // fence.sem.scope; // 1. PTX ISA 60, SM_70 + // .sem = { .sc } // .scope = { .cta, .gpu, .sys } - template + template __device__ static inline void fence( - cuda::ptx::sem_t sem, + cuda::ptx::sem_sc_t, cuda::ptx::scope_t scope); +fence.sc.cluster +^^^^^^^^^^^^^^^^ +.. code:: cuda + + // fence.sem.scope; // 2. PTX ISA 78, SM_90 + // .sem = { .sc } + // .scope = { .cluster } + template + __device__ static inline void fence( + cuda::ptx::sem_sc_t, + cuda::ptx::scope_cluster_t); + fence.acq_rel.cta ^^^^^^^^^^^^^^^^^ .. code:: cuda - // fence{.sem}.scope; // 1. PTX ISA 60, SM_70 - // .sem = { .sc, .acq_rel } + // fence.sem.scope; // 1. PTX ISA 60, SM_70 + // .sem = { .acq_rel } // .scope = { .cta, .gpu, .sys } - template + template __device__ static inline void fence( - cuda::ptx::sem_t sem, + cuda::ptx::sem_acq_rel_t, cuda::ptx::scope_t scope); fence.acq_rel.gpu ^^^^^^^^^^^^^^^^^ .. code:: cuda - // fence{.sem}.scope; // 1. PTX ISA 60, SM_70 - // .sem = { .sc, .acq_rel } + // fence.sem.scope; // 1. PTX ISA 60, SM_70 + // .sem = { .acq_rel } // .scope = { .cta, .gpu, .sys } - template + template __device__ static inline void fence( - cuda::ptx::sem_t sem, + cuda::ptx::sem_acq_rel_t, cuda::ptx::scope_t scope); fence.acq_rel.sys ^^^^^^^^^^^^^^^^^ .. code:: cuda - // fence{.sem}.scope; // 1. PTX ISA 60, SM_70 - // .sem = { .sc, .acq_rel } + // fence.sem.scope; // 1. PTX ISA 60, SM_70 + // .sem = { .acq_rel } // .scope = { .cta, .gpu, .sys } - template + template __device__ static inline void fence( - cuda::ptx::sem_t sem, + cuda::ptx::sem_acq_rel_t, cuda::ptx::scope_t scope); -fence.sc.cluster -^^^^^^^^^^^^^^^^ +fence.acq_rel.cluster +^^^^^^^^^^^^^^^^^^^^^ .. code:: cuda - // fence{.sem}.scope; // 2. PTX ISA 78, SM_90 - // .sem = { .sc, .acq_rel } + // fence.sem.scope; // 2. PTX ISA 78, SM_90 + // .sem = { .acq_rel } // .scope = { .cluster } - template + template __device__ static inline void fence( - cuda::ptx::sem_t sem, + cuda::ptx::sem_acq_rel_t, cuda::ptx::scope_cluster_t); -fence.acq_rel.cluster +fence.acquire.cta +^^^^^^^^^^^^^^^^^ +.. 
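// --- Illustrative usage sketch (not part of the generated documentation) ---
// elect.sync picks exactly one lane out of membermask and returns true only in that
// lane; combined with the fence wrappers above this gives a single-leader fence.
// Requires SM_90 for elect.sync; fence.acq_rel is available from SM_70.
#include <cuda/ptx>

__global__ void leader_fence_sketch()
{
  if (cuda::ptx::elect_sync(0xFFFFFFFFu)) // full-warp membermask
  {
    cuda::ptx::fence(cuda::ptx::sem_acq_rel, cuda::ptx::scope_gpu); // fence.acq_rel.gpu
  }
}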
code:: cuda + + // fence.sem.scope; // PTX ISA 86, SM_90 + // .sem = { .acquire } + // .scope = { .cta, .cluster, .gpu, .sys } + template + __device__ static inline void fence( + cuda::ptx::sem_acquire_t, + cuda::ptx::scope_t scope); + +fence.acquire.cluster ^^^^^^^^^^^^^^^^^^^^^ .. code:: cuda - // fence{.sem}.scope; // 2. PTX ISA 78, SM_90 - // .sem = { .sc, .acq_rel } - // .scope = { .cluster } - template + // fence.sem.scope; // PTX ISA 86, SM_90 + // .sem = { .acquire } + // .scope = { .cta, .cluster, .gpu, .sys } + template __device__ static inline void fence( - cuda::ptx::sem_t sem, - cuda::ptx::scope_cluster_t); + cuda::ptx::sem_acquire_t, + cuda::ptx::scope_t scope); + +fence.acquire.gpu +^^^^^^^^^^^^^^^^^ +.. code:: cuda + + // fence.sem.scope; // PTX ISA 86, SM_90 + // .sem = { .acquire } + // .scope = { .cta, .cluster, .gpu, .sys } + template + __device__ static inline void fence( + cuda::ptx::sem_acquire_t, + cuda::ptx::scope_t scope); + +fence.acquire.sys +^^^^^^^^^^^^^^^^^ +.. code:: cuda + + // fence.sem.scope; // PTX ISA 86, SM_90 + // .sem = { .acquire } + // .scope = { .cta, .cluster, .gpu, .sys } + template + __device__ static inline void fence( + cuda::ptx::sem_acquire_t, + cuda::ptx::scope_t scope); + +fence.release.cta +^^^^^^^^^^^^^^^^^ +.. code:: cuda + + // fence.sem.scope; // PTX ISA 86, SM_90 + // .sem = { .release } + // .scope = { .cta, .cluster, .gpu, .sys } + template + __device__ static inline void fence( + cuda::ptx::sem_release_t, + cuda::ptx::scope_t scope); + +fence.release.cluster +^^^^^^^^^^^^^^^^^^^^^ +.. code:: cuda + + // fence.sem.scope; // PTX ISA 86, SM_90 + // .sem = { .release } + // .scope = { .cta, .cluster, .gpu, .sys } + template + __device__ static inline void fence( + cuda::ptx::sem_release_t, + cuda::ptx::scope_t scope); + +fence.release.gpu +^^^^^^^^^^^^^^^^^ +.. code:: cuda + + // fence.sem.scope; // PTX ISA 86, SM_90 + // .sem = { .release } + // .scope = { .cta, .cluster, .gpu, .sys } + template + __device__ static inline void fence( + cuda::ptx::sem_release_t, + cuda::ptx::scope_t scope); + +fence.release.sys +^^^^^^^^^^^^^^^^^ +.. code:: cuda + + // fence.sem.scope; // PTX ISA 86, SM_90 + // .sem = { .release } + // .scope = { .cta, .cluster, .gpu, .sys } + template + __device__ static inline void fence( + cuda::ptx::sem_release_t, + cuda::ptx::scope_t scope); diff --git a/docs/libcudacxx/ptx/instructions/generated/fence_proxy_async.rst b/docs/libcudacxx/ptx/instructions/generated/fence_proxy_async.rst index 8376e96ce6b..9f4000b675e 100644 --- a/docs/libcudacxx/ptx/instructions/generated/fence_proxy_async.rst +++ b/docs/libcudacxx/ptx/instructions/generated/fence_proxy_async.rst @@ -13,7 +13,7 @@ fence.proxy.async.global ^^^^^^^^^^^^^^^^^^^^^^^^ .. code:: cuda - // fence.proxy.async{.space}; // 6. PTX ISA 80, SM_90 + // fence.proxy.async.space; // 6. PTX ISA 80, SM_90 // .space = { .global, .shared::cluster, .shared::cta } template __device__ static inline void fence_proxy_async( @@ -23,7 +23,7 @@ fence.proxy.async.shared::cluster ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ .. code:: cuda - // fence.proxy.async{.space}; // 6. PTX ISA 80, SM_90 + // fence.proxy.async.space; // 6. PTX ISA 80, SM_90 // .space = { .global, .shared::cluster, .shared::cta } template __device__ static inline void fence_proxy_async( @@ -33,7 +33,7 @@ fence.proxy.async.shared::cta ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ .. code:: cuda - // fence.proxy.async{.space}; // 6. PTX ISA 80, SM_90 + // fence.proxy.async.space; // 6. 
PTX ISA 80, SM_90 // .space = { .global, .shared::cluster, .shared::cta } template __device__ static inline void fence_proxy_async( diff --git a/docs/libcudacxx/ptx/instructions/generated/fence_proxy_async_generic_sync_restrict.rst b/docs/libcudacxx/ptx/instructions/generated/fence_proxy_async_generic_sync_restrict.rst new file mode 100644 index 00000000000..e67c4852355 --- /dev/null +++ b/docs/libcudacxx/ptx/instructions/generated/fence_proxy_async_generic_sync_restrict.rst @@ -0,0 +1,30 @@ +.. + This file was automatically generated. Do not edit. + +fence.proxy.async::generic.acquire.sync_restrict::shared::cluster.cluster +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +.. code:: cuda + + // fence.proxy.async::generic.sem.sync_restrict::space.scope; // PTX ISA 86, SM_90 + // .sem = { .acquire } + // .space = { .shared::cluster } + // .scope = { .cluster } + template + __device__ static inline void fence_proxy_async_generic_sync_restrict( + cuda::ptx::sem_acquire_t, + cuda::ptx::space_cluster_t, + cuda::ptx::scope_cluster_t); + +fence.proxy.async::generic.release.sync_restrict::shared::cta.cluster +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +.. code:: cuda + + // fence.proxy.async::generic.sem.sync_restrict::space.scope; // PTX ISA 86, SM_90 + // .sem = { .release } + // .space = { .shared::cta } + // .scope = { .cluster } + template + __device__ static inline void fence_proxy_async_generic_sync_restrict( + cuda::ptx::sem_release_t, + cuda::ptx::space_shared_t, + cuda::ptx::scope_cluster_t); diff --git a/docs/libcudacxx/ptx/instructions/generated/fence_sync_restrict.rst b/docs/libcudacxx/ptx/instructions/generated/fence_sync_restrict.rst new file mode 100644 index 00000000000..bae82190e25 --- /dev/null +++ b/docs/libcudacxx/ptx/instructions/generated/fence_sync_restrict.rst @@ -0,0 +1,30 @@ +.. + This file was automatically generated. Do not edit. + +fence.acquire.sync_restrict::shared::cluster.cluster +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +.. code:: cuda + + // fence.sem.sync_restrict::space.scope; // PTX ISA 86, SM_90 + // .sem = { .acquire } + // .space = { .shared::cluster } + // .scope = { .cluster } + template + __device__ static inline void fence_sync_restrict( + cuda::ptx::sem_acquire_t, + cuda::ptx::space_cluster_t, + cuda::ptx::scope_cluster_t); + +fence.release.sync_restrict::shared::cta.cluster +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +.. code:: cuda + + // fence.sem.sync_restrict::space.scope; // PTX ISA 86, SM_90 + // .sem = { .release } + // .space = { .shared::cta } + // .scope = { .cluster } + template + __device__ static inline void fence_sync_restrict( + cuda::ptx::sem_release_t, + cuda::ptx::space_shared_t, + cuda::ptx::scope_cluster_t); diff --git a/docs/libcudacxx/ptx/instructions/generated/special_registers.rst b/docs/libcudacxx/ptx/instructions/generated/get_sreg.rst similarity index 83% rename from docs/libcudacxx/ptx/instructions/generated/special_registers.rst rename to docs/libcudacxx/ptx/instructions/generated/get_sreg.rst index aa1add84781..9582c4384ff 100644 --- a/docs/libcudacxx/ptx/instructions/generated/special_registers.rst +++ b/docs/libcudacxx/ptx/instructions/generated/get_sreg.rst @@ -1,9 +1,12 @@ +.. + This file was automatically generated. Do not edit. + tid.x ^^^^^ .. code:: cuda // mov.u32 sreg_value, %%tid.x; // PTX ISA 20 - template + template __device__ static inline uint32_t get_sreg_tid_x(); tid.y @@ -11,7 +14,7 @@ tid.y .. 
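// --- Illustrative usage sketch (not part of the generated documentation) ---
// The sync_restrict fences documented above (PTX ISA 86) pair a release on the local
// CTA's shared memory with an acquire on cluster shared memory, both at cluster scope.
// A minimal sketch; the fence.proxy.async::generic variants take the same tag arguments.
#include <cuda/ptx>

__global__ void sync_restrict_fence_sketch()
{
  // fence.release.sync_restrict::shared::cta.cluster
  cuda::ptx::fence_sync_restrict(
    cuda::ptx::sem_release, cuda::ptx::space_shared, cuda::ptx::scope_cluster);
  // fence.acquire.sync_restrict::shared::cluster.cluster
  cuda::ptx::fence_sync_restrict(
    cuda::ptx::sem_acquire, cuda::ptx::space_cluster, cuda::ptx::scope_cluster);
}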
code:: cuda // mov.u32 sreg_value, %%tid.y; // PTX ISA 20 - template + template __device__ static inline uint32_t get_sreg_tid_y(); tid.z @@ -19,7 +22,7 @@ tid.z .. code:: cuda // mov.u32 sreg_value, %%tid.z; // PTX ISA 20 - template + template __device__ static inline uint32_t get_sreg_tid_z(); ntid.x @@ -27,7 +30,7 @@ ntid.x .. code:: cuda // mov.u32 sreg_value, %%ntid.x; // PTX ISA 20 - template + template __device__ static inline uint32_t get_sreg_ntid_x(); ntid.y @@ -35,7 +38,7 @@ ntid.y .. code:: cuda // mov.u32 sreg_value, %%ntid.y; // PTX ISA 20 - template + template __device__ static inline uint32_t get_sreg_ntid_y(); ntid.z @@ -43,7 +46,7 @@ ntid.z .. code:: cuda // mov.u32 sreg_value, %%ntid.z; // PTX ISA 20 - template + template __device__ static inline uint32_t get_sreg_ntid_z(); laneid @@ -51,7 +54,7 @@ laneid .. code:: cuda // mov.u32 sreg_value, %%laneid; // PTX ISA 13 - template + template __device__ static inline uint32_t get_sreg_laneid(); warpid @@ -59,7 +62,7 @@ warpid .. code:: cuda // mov.u32 sreg_value, %%warpid; // PTX ISA 13 - template + template __device__ static inline uint32_t get_sreg_warpid(); nwarpid @@ -67,7 +70,7 @@ nwarpid .. code:: cuda // mov.u32 sreg_value, %%nwarpid; // PTX ISA 20, SM_35 - template + template __device__ static inline uint32_t get_sreg_nwarpid(); ctaid.x @@ -75,7 +78,7 @@ ctaid.x .. code:: cuda // mov.u32 sreg_value, %%ctaid.x; // PTX ISA 20 - template + template __device__ static inline uint32_t get_sreg_ctaid_x(); ctaid.y @@ -83,7 +86,7 @@ ctaid.y .. code:: cuda // mov.u32 sreg_value, %%ctaid.y; // PTX ISA 20 - template + template __device__ static inline uint32_t get_sreg_ctaid_y(); ctaid.z @@ -91,7 +94,7 @@ ctaid.z .. code:: cuda // mov.u32 sreg_value, %%ctaid.z; // PTX ISA 20 - template + template __device__ static inline uint32_t get_sreg_ctaid_z(); nctaid.x @@ -99,7 +102,7 @@ nctaid.x .. code:: cuda // mov.u32 sreg_value, %%nctaid.x; // PTX ISA 20 - template + template __device__ static inline uint32_t get_sreg_nctaid_x(); nctaid.y @@ -107,7 +110,7 @@ nctaid.y .. code:: cuda // mov.u32 sreg_value, %%nctaid.y; // PTX ISA 20 - template + template __device__ static inline uint32_t get_sreg_nctaid_y(); nctaid.z @@ -115,7 +118,7 @@ nctaid.z .. code:: cuda // mov.u32 sreg_value, %%nctaid.z; // PTX ISA 20 - template + template __device__ static inline uint32_t get_sreg_nctaid_z(); smid @@ -123,7 +126,7 @@ smid .. code:: cuda // mov.u32 sreg_value, %%smid; // PTX ISA 13 - template + template __device__ static inline uint32_t get_sreg_smid(); nsmid @@ -131,7 +134,7 @@ nsmid .. code:: cuda // mov.u32 sreg_value, %%nsmid; // PTX ISA 20, SM_35 - template + template __device__ static inline uint32_t get_sreg_nsmid(); gridid @@ -139,7 +142,7 @@ gridid .. code:: cuda // mov.u64 sreg_value, %%gridid; // PTX ISA 30 - template + template __device__ static inline uint64_t get_sreg_gridid(); is_explicit_cluster @@ -147,7 +150,7 @@ is_explicit_cluster .. code:: cuda // mov.pred sreg_value, %%is_explicit_cluster; // PTX ISA 78, SM_90 - template + template __device__ static inline bool get_sreg_is_explicit_cluster(); clusterid.x @@ -155,7 +158,7 @@ clusterid.x .. code:: cuda // mov.u32 sreg_value, %%clusterid.x; // PTX ISA 78, SM_90 - template + template __device__ static inline uint32_t get_sreg_clusterid_x(); clusterid.y @@ -163,7 +166,7 @@ clusterid.y .. code:: cuda // mov.u32 sreg_value, %%clusterid.y; // PTX ISA 78, SM_90 - template + template __device__ static inline uint32_t get_sreg_clusterid_y(); clusterid.z @@ -171,7 +174,7 @@ clusterid.z .. 
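// --- Illustrative usage sketch (not part of the generated documentation) ---
// The get_sreg_* wrappers above are thin reads of PTX special registers; for example,
// a flat global x-index can be reassembled from %ctaid.x, %ntid.x and %tid.x.
#include <cuda/ptx>

__global__ void special_register_sketch(uint32_t* out)
{
  const uint32_t global_x =
    cuda::ptx::get_sreg_ctaid_x() * cuda::ptx::get_sreg_ntid_x() + cuda::ptx::get_sreg_tid_x();
  if (cuda::ptx::get_sreg_laneid() == 0)
  {
    out[cuda::ptx::get_sreg_ctaid_x()] = global_x; // one store per warp leader, sketch only
  }
}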
code:: cuda // mov.u32 sreg_value, %%clusterid.z; // PTX ISA 78, SM_90 - template + template __device__ static inline uint32_t get_sreg_clusterid_z(); nclusterid.x @@ -179,7 +182,7 @@ nclusterid.x .. code:: cuda // mov.u32 sreg_value, %%nclusterid.x; // PTX ISA 78, SM_90 - template + template __device__ static inline uint32_t get_sreg_nclusterid_x(); nclusterid.y @@ -187,7 +190,7 @@ nclusterid.y .. code:: cuda // mov.u32 sreg_value, %%nclusterid.y; // PTX ISA 78, SM_90 - template + template __device__ static inline uint32_t get_sreg_nclusterid_y(); nclusterid.z @@ -195,7 +198,7 @@ nclusterid.z .. code:: cuda // mov.u32 sreg_value, %%nclusterid.z; // PTX ISA 78, SM_90 - template + template __device__ static inline uint32_t get_sreg_nclusterid_z(); cluster_ctaid.x @@ -203,7 +206,7 @@ cluster_ctaid.x .. code:: cuda // mov.u32 sreg_value, %%cluster_ctaid.x; // PTX ISA 78, SM_90 - template + template __device__ static inline uint32_t get_sreg_cluster_ctaid_x(); cluster_ctaid.y @@ -211,7 +214,7 @@ cluster_ctaid.y .. code:: cuda // mov.u32 sreg_value, %%cluster_ctaid.y; // PTX ISA 78, SM_90 - template + template __device__ static inline uint32_t get_sreg_cluster_ctaid_y(); cluster_ctaid.z @@ -219,7 +222,7 @@ cluster_ctaid.z .. code:: cuda // mov.u32 sreg_value, %%cluster_ctaid.z; // PTX ISA 78, SM_90 - template + template __device__ static inline uint32_t get_sreg_cluster_ctaid_z(); cluster_nctaid.x @@ -227,7 +230,7 @@ cluster_nctaid.x .. code:: cuda // mov.u32 sreg_value, %%cluster_nctaid.x; // PTX ISA 78, SM_90 - template + template __device__ static inline uint32_t get_sreg_cluster_nctaid_x(); cluster_nctaid.y @@ -235,7 +238,7 @@ cluster_nctaid.y .. code:: cuda // mov.u32 sreg_value, %%cluster_nctaid.y; // PTX ISA 78, SM_90 - template + template __device__ static inline uint32_t get_sreg_cluster_nctaid_y(); cluster_nctaid.z @@ -243,7 +246,7 @@ cluster_nctaid.z .. code:: cuda // mov.u32 sreg_value, %%cluster_nctaid.z; // PTX ISA 78, SM_90 - template + template __device__ static inline uint32_t get_sreg_cluster_nctaid_z(); cluster_ctarank @@ -251,7 +254,7 @@ cluster_ctarank .. code:: cuda // mov.u32 sreg_value, %%cluster_ctarank; // PTX ISA 78, SM_90 - template + template __device__ static inline uint32_t get_sreg_cluster_ctarank(); cluster_nctarank @@ -259,7 +262,7 @@ cluster_nctarank .. code:: cuda // mov.u32 sreg_value, %%cluster_nctarank; // PTX ISA 78, SM_90 - template + template __device__ static inline uint32_t get_sreg_cluster_nctarank(); lanemask_eq @@ -267,7 +270,7 @@ lanemask_eq .. code:: cuda // mov.u32 sreg_value, %%lanemask_eq; // PTX ISA 20, SM_35 - template + template __device__ static inline uint32_t get_sreg_lanemask_eq(); lanemask_le @@ -275,7 +278,7 @@ lanemask_le .. code:: cuda // mov.u32 sreg_value, %%lanemask_le; // PTX ISA 20, SM_35 - template + template __device__ static inline uint32_t get_sreg_lanemask_le(); lanemask_lt @@ -283,7 +286,7 @@ lanemask_lt .. code:: cuda // mov.u32 sreg_value, %%lanemask_lt; // PTX ISA 20, SM_35 - template + template __device__ static inline uint32_t get_sreg_lanemask_lt(); lanemask_ge @@ -291,7 +294,7 @@ lanemask_ge .. code:: cuda // mov.u32 sreg_value, %%lanemask_ge; // PTX ISA 20, SM_35 - template + template __device__ static inline uint32_t get_sreg_lanemask_ge(); lanemask_gt @@ -299,7 +302,7 @@ lanemask_gt .. code:: cuda // mov.u32 sreg_value, %%lanemask_gt; // PTX ISA 20, SM_35 - template + template __device__ static inline uint32_t get_sreg_lanemask_gt(); clock @@ -307,7 +310,7 @@ clock .. 
code:: cuda // mov.u32 sreg_value, %%clock; // PTX ISA 10 - template + template __device__ static inline uint32_t get_sreg_clock(); clock_hi @@ -315,7 +318,7 @@ clock_hi .. code:: cuda // mov.u32 sreg_value, %%clock_hi; // PTX ISA 50, SM_35 - template + template __device__ static inline uint32_t get_sreg_clock_hi(); clock64 @@ -323,7 +326,7 @@ clock64 .. code:: cuda // mov.u64 sreg_value, %%clock64; // PTX ISA 20, SM_35 - template + template __device__ static inline uint64_t get_sreg_clock64(); globaltimer @@ -331,7 +334,7 @@ globaltimer .. code:: cuda // mov.u64 sreg_value, %%globaltimer; // PTX ISA 31, SM_35 - template + template __device__ static inline uint64_t get_sreg_globaltimer(); globaltimer_lo @@ -339,7 +342,7 @@ globaltimer_lo .. code:: cuda // mov.u32 sreg_value, %%globaltimer_lo; // PTX ISA 31, SM_35 - template + template __device__ static inline uint32_t get_sreg_globaltimer_lo(); globaltimer_hi @@ -347,7 +350,7 @@ globaltimer_hi .. code:: cuda // mov.u32 sreg_value, %%globaltimer_hi; // PTX ISA 31, SM_35 - template + template __device__ static inline uint32_t get_sreg_globaltimer_hi(); total_smem_size @@ -355,7 +358,7 @@ total_smem_size .. code:: cuda // mov.u32 sreg_value, %%total_smem_size; // PTX ISA 41, SM_35 - template + template __device__ static inline uint32_t get_sreg_total_smem_size(); aggr_smem_size @@ -363,7 +366,7 @@ aggr_smem_size .. code:: cuda // mov.u32 sreg_value, %%aggr_smem_size; // PTX ISA 81, SM_90 - template + template __device__ static inline uint32_t get_sreg_aggr_smem_size(); dynamic_smem_size @@ -371,7 +374,7 @@ dynamic_smem_size .. code:: cuda // mov.u32 sreg_value, %%dynamic_smem_size; // PTX ISA 41, SM_35 - template + template __device__ static inline uint32_t get_sreg_dynamic_smem_size(); current_graph_exec @@ -379,5 +382,5 @@ current_graph_exec .. code:: cuda // mov.u64 sreg_value, %%current_graph_exec; // PTX ISA 80, SM_50 - template + template __device__ static inline uint64_t get_sreg_current_graph_exec(); diff --git a/docs/libcudacxx/ptx/instructions/generated/getctarank.rst b/docs/libcudacxx/ptx/instructions/generated/getctarank.rst index 374c182576f..19b3783086c 100644 --- a/docs/libcudacxx/ptx/instructions/generated/getctarank.rst +++ b/docs/libcudacxx/ptx/instructions/generated/getctarank.rst @@ -5,7 +5,7 @@ getctarank.shared::cluster.u32 ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ .. code:: cuda - // getctarank{.space}.u32 dest, addr; // PTX ISA 78, SM_90 + // getctarank.space.u32 dest, addr; // PTX ISA 78, SM_90 // .space = { .shared::cluster } template __device__ static inline uint32_t getctarank( diff --git a/docs/libcudacxx/ptx/instructions/generated/mapa.rst b/docs/libcudacxx/ptx/instructions/generated/mapa.rst new file mode 100644 index 00000000000..4ffc70d85d9 --- /dev/null +++ b/docs/libcudacxx/ptx/instructions/generated/mapa.rst @@ -0,0 +1,14 @@ +.. + This file was automatically generated. Do not edit. + +mapa.shared::cluster.u32 +^^^^^^^^^^^^^^^^^^^^^^^^ +.. 
code:: cuda + + // mapa.space.u32 dest, addr, target_cta; // PTX ISA 78, SM_90 + // .space = { .shared::cluster } + template + __device__ static inline Tp* mapa( + cuda::ptx::space_cluster_t, + const Tp* addr, + uint32_t target_cta); diff --git a/docs/libcudacxx/ptx/instructions/generated/mbarrier_arrive.rst b/docs/libcudacxx/ptx/instructions/generated/mbarrier_arrive.rst index 21436e2b3ca..fea199e4747 100644 --- a/docs/libcudacxx/ptx/instructions/generated/mbarrier_arrive.rst +++ b/docs/libcudacxx/ptx/instructions/generated/mbarrier_arrive.rst @@ -24,7 +24,7 @@ mbarrier.arrive.release.cta.shared::cta.b64 ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ .. code:: cuda - // mbarrier.arrive{.sem}{.scope}{.space}.b64 state, [addr]; // 3a. PTX ISA 80, SM_90 + // mbarrier.arrive.sem.scope.space.b64 state, [addr]; // 3a. PTX ISA 80, SM_90 // .sem = { .release } // .scope = { .cta, .cluster } // .space = { .shared::cta } @@ -39,7 +39,7 @@ mbarrier.arrive.release.cluster.shared::cta.b64 ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ .. code:: cuda - // mbarrier.arrive{.sem}{.scope}{.space}.b64 state, [addr]; // 3a. PTX ISA 80, SM_90 + // mbarrier.arrive.sem.scope.space.b64 state, [addr]; // 3a. PTX ISA 80, SM_90 // .sem = { .release } // .scope = { .cta, .cluster } // .space = { .shared::cta } @@ -54,7 +54,7 @@ mbarrier.arrive.release.cta.shared::cta.b64 ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ .. code:: cuda - // mbarrier.arrive{.sem}{.scope}{.space}.b64 state, [addr], count; // 3b. PTX ISA 80, SM_90 + // mbarrier.arrive.sem.scope.space.b64 state, [addr], count; // 3b. PTX ISA 80, SM_90 // .sem = { .release } // .scope = { .cta, .cluster } // .space = { .shared::cta } @@ -70,7 +70,7 @@ mbarrier.arrive.release.cluster.shared::cta.b64 ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ .. code:: cuda - // mbarrier.arrive{.sem}{.scope}{.space}.b64 state, [addr], count; // 3b. PTX ISA 80, SM_90 + // mbarrier.arrive.sem.scope.space.b64 state, [addr], count; // 3b. PTX ISA 80, SM_90 // .sem = { .release } // .scope = { .cta, .cluster } // .space = { .shared::cta } @@ -86,7 +86,7 @@ mbarrier.arrive.release.cluster.shared::cluster.b64 ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ .. code:: cuda - // mbarrier.arrive{.sem}{.scope}{.space}.b64 _, [addr]; // 4a. PTX ISA 80, SM_90 + // mbarrier.arrive.sem.scope.space.b64 _, [addr]; // 4a. PTX ISA 80, SM_90 // .sem = { .release } // .scope = { .cluster } // .space = { .shared::cluster } @@ -101,7 +101,7 @@ mbarrier.arrive.release.cluster.shared::cluster.b64 ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ .. code:: cuda - // mbarrier.arrive{.sem}{.scope}{.space}.b64 _, [addr], count; // 4b. PTX ISA 80, SM_90 + // mbarrier.arrive.sem.scope.space.b64 _, [addr], count; // 4b. PTX ISA 80, SM_90 // .sem = { .release } // .scope = { .cluster } // .space = { .shared::cluster } @@ -112,3 +112,96 @@ mbarrier.arrive.release.cluster.shared::cluster.b64 cuda::ptx::space_cluster_t, uint64_t* addr, const uint32_t& count); + +mbarrier.arrive.relaxed.cta.shared::cta.b64 +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +.. 
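// --- Illustrative usage sketch (not part of the generated documentation) ---
// mapa maps the address of a shared-memory variable of this CTA to the address of the
// corresponding variable in another CTA of the cluster; getctarank goes the other way
// and reports which CTA a shared::cluster address belongs to. Sketch only: real code
// needs cluster-wide synchronization before dereferencing the mapped pointer.
#include <cuda/ptx>

__global__ void mapa_sketch()
{
  __shared__ int counter;
  const uint32_t peer_cta = 0; // CTA rank within the cluster to map to
  int* peer_counter = cuda::ptx::mapa(cuda::ptx::space_cluster, &counter, peer_cta);
  const uint32_t owner = cuda::ptx::getctarank(cuda::ptx::space_cluster, peer_counter);
  (void) owner; // equals peer_cta
}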
code:: cuda + + // mbarrier.arrive.sem.scope.space.b64 state, [addr], count; // PTX ISA 86, SM_90 + // .space = { .shared::cta } + // .sem = { .relaxed } + // .scope = { .cta, .cluster } + template + __device__ static inline uint64_t mbarrier_arrive( + cuda::ptx::space_shared_t, + cuda::ptx::sem_relaxed_t, + cuda::ptx::scope_t scope, + uint64_t* addr, + const uint32_t& count); + +mbarrier.arrive.relaxed.cluster.shared::cta.b64 +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +.. code:: cuda + + // mbarrier.arrive.sem.scope.space.b64 state, [addr], count; // PTX ISA 86, SM_90 + // .space = { .shared::cta } + // .sem = { .relaxed } + // .scope = { .cta, .cluster } + template + __device__ static inline uint64_t mbarrier_arrive( + cuda::ptx::space_shared_t, + cuda::ptx::sem_relaxed_t, + cuda::ptx::scope_t scope, + uint64_t* addr, + const uint32_t& count); + +mbarrier.arrive.relaxed.cta.shared::cta.b64 +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +.. code:: cuda + + // mbarrier.arrive.sem.scope.space.b64 state, [addr]; // PTX ISA 86, SM_90 + // .space = { .shared::cta } + // .sem = { .relaxed } + // .scope = { .cta, .cluster } + template + __device__ static inline uint64_t mbarrier_arrive( + cuda::ptx::space_shared_t, + cuda::ptx::sem_relaxed_t, + cuda::ptx::scope_t scope, + uint64_t* addr); + +mbarrier.arrive.relaxed.cluster.shared::cta.b64 +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +.. code:: cuda + + // mbarrier.arrive.sem.scope.space.b64 state, [addr]; // PTX ISA 86, SM_90 + // .space = { .shared::cta } + // .sem = { .relaxed } + // .scope = { .cta, .cluster } + template + __device__ static inline uint64_t mbarrier_arrive( + cuda::ptx::space_shared_t, + cuda::ptx::sem_relaxed_t, + cuda::ptx::scope_t scope, + uint64_t* addr); + +mbarrier.arrive.relaxed.cluster.shared::cluster.b64 +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +.. code:: cuda + + // mbarrier.arrive.sem.scope.space.b64 _, [addr], count; // PTX ISA 86, SM_90 + // .space = { .shared::cluster } + // .sem = { .relaxed } + // .scope = { .cluster } + template + __device__ static inline void mbarrier_arrive( + cuda::ptx::space_cluster_t, + cuda::ptx::sem_relaxed_t, + cuda::ptx::scope_cluster_t, + uint64_t* addr, + const uint32_t& count); + +mbarrier.arrive.relaxed.cluster.shared::cluster.b64 +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +.. code:: cuda + + // mbarrier.arrive.sem.scope.space.b64 _, [addr]; // PTX ISA 86, SM_90 + // .space = { .shared::cluster } + // .sem = { .relaxed } + // .scope = { .cluster } + template + __device__ static inline void mbarrier_arrive( + cuda::ptx::space_cluster_t, + cuda::ptx::sem_relaxed_t, + cuda::ptx::scope_cluster_t, + uint64_t* addr); diff --git a/docs/libcudacxx/ptx/instructions/generated/mbarrier_arrive_expect_tx.rst b/docs/libcudacxx/ptx/instructions/generated/mbarrier_arrive_expect_tx.rst index 47c56eca31a..318a7eb5b98 100644 --- a/docs/libcudacxx/ptx/instructions/generated/mbarrier_arrive_expect_tx.rst +++ b/docs/libcudacxx/ptx/instructions/generated/mbarrier_arrive_expect_tx.rst @@ -5,7 +5,7 @@ mbarrier.arrive.expect_tx.release.cta.shared::cta.b64 ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ .. code:: cuda - // mbarrier.arrive.expect_tx{.sem}{.scope}{.space}.b64 state, [addr], tx_count; // 8. PTX ISA 80, SM_90 + // mbarrier.arrive.expect_tx.sem.scope.space.b64 state, [addr], tx_count; // 8. 
PTX ISA 80, SM_90 // .sem = { .release } // .scope = { .cta, .cluster } // .space = { .shared::cta } @@ -21,7 +21,7 @@ mbarrier.arrive.expect_tx.release.cluster.shared::cta.b64 ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ .. code:: cuda - // mbarrier.arrive.expect_tx{.sem}{.scope}{.space}.b64 state, [addr], tx_count; // 8. PTX ISA 80, SM_90 + // mbarrier.arrive.expect_tx.sem.scope.space.b64 state, [addr], tx_count; // 8. PTX ISA 80, SM_90 // .sem = { .release } // .scope = { .cta, .cluster } // .space = { .shared::cta } @@ -37,7 +37,7 @@ mbarrier.arrive.expect_tx.release.cluster.shared::cluster.b64 ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ .. code:: cuda - // mbarrier.arrive.expect_tx{.sem}{.scope}{.space}.b64 _, [addr], tx_count; // 9. PTX ISA 80, SM_90 + // mbarrier.arrive.expect_tx.sem.scope.space.b64 _, [addr], tx_count; // 9. PTX ISA 80, SM_90 // .sem = { .release } // .scope = { .cluster } // .space = { .shared::cluster } @@ -48,3 +48,51 @@ mbarrier.arrive.expect_tx.release.cluster.shared::cluster.b64 cuda::ptx::space_cluster_t, uint64_t* addr, const uint32_t& tx_count); + +mbarrier.arrive.expect_tx.relaxed.cta.shared::cta.b64 +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +.. code:: cuda + + // mbarrier.arrive.expect_tx.sem.scope.space.b64 state, [addr], txCount; // PTX ISA 86, SM_90 + // .space = { .shared::cta } + // .sem = { .relaxed } + // .scope = { .cta, .cluster } + template + __device__ static inline uint64_t mbarrier_arrive_expect_tx( + cuda::ptx::space_shared_t, + cuda::ptx::sem_relaxed_t, + cuda::ptx::scope_t scope, + uint64_t* addr, + const uint32_t& txCount); + +mbarrier.arrive.expect_tx.relaxed.cluster.shared::cta.b64 +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +.. code:: cuda + + // mbarrier.arrive.expect_tx.sem.scope.space.b64 state, [addr], txCount; // PTX ISA 86, SM_90 + // .space = { .shared::cta } + // .sem = { .relaxed } + // .scope = { .cta, .cluster } + template + __device__ static inline uint64_t mbarrier_arrive_expect_tx( + cuda::ptx::space_shared_t, + cuda::ptx::sem_relaxed_t, + cuda::ptx::scope_t scope, + uint64_t* addr, + const uint32_t& txCount); + +mbarrier.arrive.expect_tx.relaxed.cluster.shared::cluster.b64 +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +.. code:: cuda + + // mbarrier.arrive.expect_tx.sem.scope.space.b64 _, [addr], txCount; // PTX ISA 86, SM_90 + // .space = { .shared::cluster } + // .sem = { .relaxed } + // .scope = { .cluster } + template + __device__ static inline void mbarrier_arrive_expect_tx( + cuda::ptx::space_cluster_t, + cuda::ptx::sem_relaxed_t, + cuda::ptx::scope_cluster_t, + uint64_t* addr, + const uint32_t& txCount); diff --git a/docs/libcudacxx/ptx/instructions/generated/mbarrier_test_wait.rst b/docs/libcudacxx/ptx/instructions/generated/mbarrier_test_wait.rst index d16b2ac07ac..88ec36b43ac 100644 --- a/docs/libcudacxx/ptx/instructions/generated/mbarrier_test_wait.rst +++ b/docs/libcudacxx/ptx/instructions/generated/mbarrier_test_wait.rst @@ -15,7 +15,7 @@ mbarrier.test_wait.acquire.cta.shared::cta.b64 ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ .. code:: cuda - // mbarrier.test_wait{.sem}{.scope}.shared::cta.b64 waitComplete, [addr], state; // 2. PTX ISA 80, SM_90 + // mbarrier.test_wait.sem.scope.shared::cta.b64 waitComplete, [addr], state; // 2. 
PTX ISA 80, SM_90 // .sem = { .acquire } // .scope = { .cta, .cluster } template @@ -29,7 +29,7 @@ mbarrier.test_wait.acquire.cluster.shared::cta.b64 ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ .. code:: cuda - // mbarrier.test_wait{.sem}{.scope}.shared::cta.b64 waitComplete, [addr], state; // 2. PTX ISA 80, SM_90 + // mbarrier.test_wait.sem.scope.shared::cta.b64 waitComplete, [addr], state; // 2. PTX ISA 80, SM_90 // .sem = { .acquire } // .scope = { .cta, .cluster } template @@ -38,3 +38,33 @@ mbarrier.test_wait.acquire.cluster.shared::cta.b64 cuda::ptx::scope_t scope, uint64_t* addr, const uint64_t& state); + +mbarrier.test_wait.relaxed.cta.shared::cta.b64 +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +.. code:: cuda + + // mbarrier.test_wait.sem.scope.shared::cta.b64 waitComplete, [addr], state; // PTX ISA 86, SM_90 + // .sem = { .relaxed } + // .scope = { .cta, .cluster } + template + __device__ static inline void mbarrier_test_wait( + cuda::ptx::sem_relaxed_t, + cuda::ptx::scope_t scope, + bool waitComplete, + uint64_t* addr, + const uint64_t& state); + +mbarrier.test_wait.relaxed.cluster.shared::cta.b64 +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +.. code:: cuda + + // mbarrier.test_wait.sem.scope.shared::cta.b64 waitComplete, [addr], state; // PTX ISA 86, SM_90 + // .sem = { .relaxed } + // .scope = { .cta, .cluster } + template + __device__ static inline void mbarrier_test_wait( + cuda::ptx::sem_relaxed_t, + cuda::ptx::scope_t scope, + bool waitComplete, + uint64_t* addr, + const uint64_t& state); diff --git a/docs/libcudacxx/ptx/instructions/generated/mbarrier_test_wait_parity.rst b/docs/libcudacxx/ptx/instructions/generated/mbarrier_test_wait_parity.rst index ec464b3398b..1496d6cbccb 100644 --- a/docs/libcudacxx/ptx/instructions/generated/mbarrier_test_wait_parity.rst +++ b/docs/libcudacxx/ptx/instructions/generated/mbarrier_test_wait_parity.rst @@ -15,7 +15,7 @@ mbarrier.test_wait.parity.acquire.cta.shared::cta.b64 ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ .. code:: cuda - // mbarrier.test_wait.parity{.sem}{.scope}.shared::cta.b64 waitComplete, [addr], phaseParity; // 4. PTX ISA 80, SM_90 + // mbarrier.test_wait.parity.sem.scope.shared::cta.b64 waitComplete, [addr], phaseParity; // 4. PTX ISA 80, SM_90 // .sem = { .acquire } // .scope = { .cta, .cluster } template @@ -29,7 +29,7 @@ mbarrier.test_wait.parity.acquire.cluster.shared::cta.b64 ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ .. code:: cuda - // mbarrier.test_wait.parity{.sem}{.scope}.shared::cta.b64 waitComplete, [addr], phaseParity; // 4. PTX ISA 80, SM_90 + // mbarrier.test_wait.parity.sem.scope.shared::cta.b64 waitComplete, [addr], phaseParity; // 4. PTX ISA 80, SM_90 // .sem = { .acquire } // .scope = { .cta, .cluster } template @@ -38,3 +38,33 @@ mbarrier.test_wait.parity.acquire.cluster.shared::cta.b64 cuda::ptx::scope_t scope, uint64_t* addr, const uint32_t& phaseParity); + +mbarrier.test_wait.parity.relaxed.cta.shared::cta.b64 +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +.. 
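// --- Illustrative usage sketch (not part of the generated documentation) ---
// The new .relaxed mbarrier.arrive / mbarrier.arrive.expect_tx overloads documented
// above (PTX ISA 86) take the .space tag first, unlike the .release overloads.
// A minimal sketch, assuming the barrier was initialized elsewhere:
#include <cuda/ptx>

__global__ void relaxed_arrive_sketch()
{
  __shared__ uint64_t bar;       // assumed initialized with cuda::ptx::mbarrier_init
  const uint32_t tx_bytes = 128; // bytes expected to be transferred for this phase

  uint64_t state = cuda::ptx::mbarrier_arrive_expect_tx(
    cuda::ptx::space_shared, cuda::ptx::sem_relaxed, cuda::ptx::scope_cta, &bar, tx_bytes);
  (void) state; // can later be passed to the test_wait / try_wait wrappers
}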
code:: cuda + + // mbarrier.test_wait.parity.sem.scope.shared::cta.b64 waitComplete, [addr], phaseParity; // PTX ISA 86, SM_90 + // .sem = { .relaxed } + // .scope = { .cta, .cluster } + template + __device__ static inline void mbarrier_test_wait_parity( + cuda::ptx::sem_relaxed_t, + cuda::ptx::scope_t scope, + bool waitComplete, + uint64_t* addr, + const uint32_t& phaseParity); + +mbarrier.test_wait.parity.relaxed.cluster.shared::cta.b64 +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +.. code:: cuda + + // mbarrier.test_wait.parity.sem.scope.shared::cta.b64 waitComplete, [addr], phaseParity; // PTX ISA 86, SM_90 + // .sem = { .relaxed } + // .scope = { .cta, .cluster } + template + __device__ static inline void mbarrier_test_wait_parity( + cuda::ptx::sem_relaxed_t, + cuda::ptx::scope_t scope, + bool waitComplete, + uint64_t* addr, + const uint32_t& phaseParity); diff --git a/docs/libcudacxx/ptx/instructions/generated/mbarrier_try_wait.rst b/docs/libcudacxx/ptx/instructions/generated/mbarrier_try_wait.rst index 3dfdba46861..4d319a5b1e3 100644 --- a/docs/libcudacxx/ptx/instructions/generated/mbarrier_try_wait.rst +++ b/docs/libcudacxx/ptx/instructions/generated/mbarrier_try_wait.rst @@ -26,7 +26,7 @@ mbarrier.try_wait.acquire.cta.shared::cta.b64 ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ .. code:: cuda - // mbarrier.try_wait{.sem}{.scope}.shared::cta.b64 waitComplete, [addr], state; // 6a. PTX ISA 80, SM_90 + // mbarrier.try_wait.sem.scope.shared::cta.b64 waitComplete, [addr], state; // 6a. PTX ISA 80, SM_90 // .sem = { .acquire } // .scope = { .cta, .cluster } template @@ -40,7 +40,7 @@ mbarrier.try_wait.acquire.cluster.shared::cta.b64 ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ .. code:: cuda - // mbarrier.try_wait{.sem}{.scope}.shared::cta.b64 waitComplete, [addr], state; // 6a. PTX ISA 80, SM_90 + // mbarrier.try_wait.sem.scope.shared::cta.b64 waitComplete, [addr], state; // 6a. PTX ISA 80, SM_90 // .sem = { .acquire } // .scope = { .cta, .cluster } template @@ -54,7 +54,7 @@ mbarrier.try_wait.acquire.cta.shared::cta.b64 ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ .. code:: cuda - // mbarrier.try_wait{.sem}{.scope}.shared::cta.b64 waitComplete, [addr], state , suspendTimeHint; // 6b. PTX ISA 80, SM_90 + // mbarrier.try_wait.sem.scope.shared::cta.b64 waitComplete, [addr], state , suspendTimeHint; // 6b. PTX ISA 80, SM_90 // .sem = { .acquire } // .scope = { .cta, .cluster } template @@ -69,7 +69,7 @@ mbarrier.try_wait.acquire.cluster.shared::cta.b64 ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ .. code:: cuda - // mbarrier.try_wait{.sem}{.scope}.shared::cta.b64 waitComplete, [addr], state , suspendTimeHint; // 6b. PTX ISA 80, SM_90 + // mbarrier.try_wait.sem.scope.shared::cta.b64 waitComplete, [addr], state , suspendTimeHint; // 6b. PTX ISA 80, SM_90 // .sem = { .acquire } // .scope = { .cta, .cluster } template @@ -79,3 +79,65 @@ mbarrier.try_wait.acquire.cluster.shared::cta.b64 uint64_t* addr, const uint64_t& state, const uint32_t& suspendTimeHint); + +mbarrier.try_wait.relaxed.cta.shared::cta.b64 +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +.. 
code:: cuda + + // mbarrier.try_wait.sem.scope.shared::cta.b64 waitComplete, [addr], state, suspendTimeHint; // PTX ISA 86, SM_90 + // .sem = { .relaxed } + // .scope = { .cta, .cluster } + template + __device__ static inline void mbarrier_try_wait( + cuda::ptx::sem_relaxed_t, + cuda::ptx::scope_t scope, + bool waitComplete, + uint64_t* addr, + const uint64_t& state, + const uint32_t& suspendTimeHint); + +mbarrier.try_wait.relaxed.cluster.shared::cta.b64 +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +.. code:: cuda + + // mbarrier.try_wait.sem.scope.shared::cta.b64 waitComplete, [addr], state, suspendTimeHint; // PTX ISA 86, SM_90 + // .sem = { .relaxed } + // .scope = { .cta, .cluster } + template + __device__ static inline void mbarrier_try_wait( + cuda::ptx::sem_relaxed_t, + cuda::ptx::scope_t scope, + bool waitComplete, + uint64_t* addr, + const uint64_t& state, + const uint32_t& suspendTimeHint); + +mbarrier.try_wait.relaxed.cta.shared::cta.b64 +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +.. code:: cuda + + // mbarrier.try_wait.sem.scope.shared::cta.b64 waitComplete, [addr], state; // PTX ISA 86, SM_90 + // .sem = { .relaxed } + // .scope = { .cta, .cluster } + template + __device__ static inline void mbarrier_try_wait( + cuda::ptx::sem_relaxed_t, + cuda::ptx::scope_t scope, + bool waitComplete, + uint64_t* addr, + const uint64_t& state); + +mbarrier.try_wait.relaxed.cluster.shared::cta.b64 +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +.. code:: cuda + + // mbarrier.try_wait.sem.scope.shared::cta.b64 waitComplete, [addr], state; // PTX ISA 86, SM_90 + // .sem = { .relaxed } + // .scope = { .cta, .cluster } + template + __device__ static inline void mbarrier_try_wait( + cuda::ptx::sem_relaxed_t, + cuda::ptx::scope_t scope, + bool waitComplete, + uint64_t* addr, + const uint64_t& state); diff --git a/docs/libcudacxx/ptx/instructions/generated/mbarrier_try_wait_parity.rst b/docs/libcudacxx/ptx/instructions/generated/mbarrier_try_wait_parity.rst index 4e7af4bace5..6a51704cab4 100644 --- a/docs/libcudacxx/ptx/instructions/generated/mbarrier_try_wait_parity.rst +++ b/docs/libcudacxx/ptx/instructions/generated/mbarrier_try_wait_parity.rst @@ -26,7 +26,7 @@ mbarrier.try_wait.parity.acquire.cta.shared::cta.b64 ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ .. code:: cuda - // mbarrier.try_wait.parity{.sem}{.scope}.shared::cta.b64 waitComplete, [addr], phaseParity; // 8a. PTX ISA 80, SM_90 + // mbarrier.try_wait.parity.sem.scope.shared::cta.b64 waitComplete, [addr], phaseParity; // 8a. PTX ISA 80, SM_90 // .sem = { .acquire } // .scope = { .cta, .cluster } template @@ -40,7 +40,7 @@ mbarrier.try_wait.parity.acquire.cluster.shared::cta.b64 ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ .. code:: cuda - // mbarrier.try_wait.parity{.sem}{.scope}.shared::cta.b64 waitComplete, [addr], phaseParity; // 8a. PTX ISA 80, SM_90 + // mbarrier.try_wait.parity.sem.scope.shared::cta.b64 waitComplete, [addr], phaseParity; // 8a. PTX ISA 80, SM_90 // .sem = { .acquire } // .scope = { .cta, .cluster } template @@ -54,7 +54,7 @@ mbarrier.try_wait.parity.acquire.cta.shared::cta.b64 ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ .. code:: cuda - // mbarrier.try_wait.parity{.sem}{.scope}.shared::cta.b64 waitComplete, [addr], phaseParity, suspendTimeHint; // 8b. PTX ISA 80, SM_90 + // mbarrier.try_wait.parity.sem.scope.shared::cta.b64 waitComplete, [addr], phaseParity, suspendTimeHint; // 8b. 
PTX ISA 80, SM_90 // .sem = { .acquire } // .scope = { .cta, .cluster } template @@ -69,7 +69,7 @@ mbarrier.try_wait.parity.acquire.cluster.shared::cta.b64 ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ .. code:: cuda - // mbarrier.try_wait.parity{.sem}{.scope}.shared::cta.b64 waitComplete, [addr], phaseParity, suspendTimeHint; // 8b. PTX ISA 80, SM_90 + // mbarrier.try_wait.parity.sem.scope.shared::cta.b64 waitComplete, [addr], phaseParity, suspendTimeHint; // 8b. PTX ISA 80, SM_90 // .sem = { .acquire } // .scope = { .cta, .cluster } template @@ -79,3 +79,65 @@ mbarrier.try_wait.parity.acquire.cluster.shared::cta.b64 uint64_t* addr, const uint32_t& phaseParity, const uint32_t& suspendTimeHint); + +mbarrier.try_wait.parity.relaxed.cta.shared::cta.b64 +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +.. code:: cuda + + // mbarrier.try_wait.parity.sem.scope.shared::cta.b64 waitComplete, [addr], phaseParity, suspendTimeHint; // PTX ISA 86, SM_90 + // .sem = { .relaxed } + // .scope = { .cta, .cluster } + template + __device__ static inline void mbarrier_try_wait_parity( + cuda::ptx::sem_relaxed_t, + cuda::ptx::scope_t scope, + bool waitComplete, + uint64_t* addr, + const uint32_t& phaseParity, + const uint32_t& suspendTimeHint); + +mbarrier.try_wait.parity.relaxed.cluster.shared::cta.b64 +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +.. code:: cuda + + // mbarrier.try_wait.parity.sem.scope.shared::cta.b64 waitComplete, [addr], phaseParity, suspendTimeHint; // PTX ISA 86, SM_90 + // .sem = { .relaxed } + // .scope = { .cta, .cluster } + template + __device__ static inline void mbarrier_try_wait_parity( + cuda::ptx::sem_relaxed_t, + cuda::ptx::scope_t scope, + bool waitComplete, + uint64_t* addr, + const uint32_t& phaseParity, + const uint32_t& suspendTimeHint); + +mbarrier.try_wait.parity.relaxed.cta.shared::cta.b64 +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +.. code:: cuda + + // mbarrier.try_wait.parity.sem.scope.shared::cta.b64 waitComplete, [addr], phaseParity; // PTX ISA 86, SM_90 + // .sem = { .relaxed } + // .scope = { .cta, .cluster } + template + __device__ static inline void mbarrier_try_wait_parity( + cuda::ptx::sem_relaxed_t, + cuda::ptx::scope_t scope, + bool waitComplete, + uint64_t* addr, + const uint32_t& phaseParity); + +mbarrier.try_wait.parity.relaxed.cluster.shared::cta.b64 +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +.. code:: cuda + + // mbarrier.try_wait.parity.sem.scope.shared::cta.b64 waitComplete, [addr], phaseParity; // PTX ISA 86, SM_90 + // .sem = { .relaxed } + // .scope = { .cta, .cluster } + template + __device__ static inline void mbarrier_try_wait_parity( + cuda::ptx::sem_relaxed_t, + cuda::ptx::scope_t scope, + bool waitComplete, + uint64_t* addr, + const uint32_t& phaseParity); diff --git a/docs/libcudacxx/ptx/instructions/generated/multimem_ld_reduce.rst b/docs/libcudacxx/ptx/instructions/generated/multimem_ld_reduce.rst new file mode 100644 index 00000000000..cd9f32bf5f0 --- /dev/null +++ b/docs/libcudacxx/ptx/instructions/generated/multimem_ld_reduce.rst @@ -0,0 +1,2396 @@ +.. + This file was automatically generated. Do not edit. + +multimem.ld_reduce.weak.global.min.u32 +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +.. 
code:: cuda + + // multimem.ld_reduce.sem.global.op.u32 dest, [addr]; // PTX ISA 81, SM_90 + // .sem = { .weak } + // .op = { .min } + template + __device__ static inline uint32_t multimem_ld_reduce( + cuda::ptx::sem_weak_t, + cuda::ptx::op_min_t, + const uint32_t* addr); + +multimem.ld_reduce.relaxed.cta.global.min.u32 +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +.. code:: cuda + + // multimem.ld_reduce.sem.scope.global.op.u32 dest, [addr]; // PTX ISA 81, SM_90 + // .sem = { .relaxed, .acquire } + // .scope = { .cta, .cluster, .gpu, .sys } + // .op = { .min } + template + __device__ static inline uint32_t multimem_ld_reduce( + cuda::ptx::sem_t sem, + cuda::ptx::scope_t scope, + cuda::ptx::op_min_t, + const uint32_t* addr); + +multimem.ld_reduce.relaxed.cluster.global.min.u32 +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +.. code:: cuda + + // multimem.ld_reduce.sem.scope.global.op.u32 dest, [addr]; // PTX ISA 81, SM_90 + // .sem = { .relaxed, .acquire } + // .scope = { .cta, .cluster, .gpu, .sys } + // .op = { .min } + template + __device__ static inline uint32_t multimem_ld_reduce( + cuda::ptx::sem_t sem, + cuda::ptx::scope_t scope, + cuda::ptx::op_min_t, + const uint32_t* addr); + +multimem.ld_reduce.relaxed.gpu.global.min.u32 +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +.. code:: cuda + + // multimem.ld_reduce.sem.scope.global.op.u32 dest, [addr]; // PTX ISA 81, SM_90 + // .sem = { .relaxed, .acquire } + // .scope = { .cta, .cluster, .gpu, .sys } + // .op = { .min } + template + __device__ static inline uint32_t multimem_ld_reduce( + cuda::ptx::sem_t sem, + cuda::ptx::scope_t scope, + cuda::ptx::op_min_t, + const uint32_t* addr); + +multimem.ld_reduce.relaxed.sys.global.min.u32 +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +.. code:: cuda + + // multimem.ld_reduce.sem.scope.global.op.u32 dest, [addr]; // PTX ISA 81, SM_90 + // .sem = { .relaxed, .acquire } + // .scope = { .cta, .cluster, .gpu, .sys } + // .op = { .min } + template + __device__ static inline uint32_t multimem_ld_reduce( + cuda::ptx::sem_t sem, + cuda::ptx::scope_t scope, + cuda::ptx::op_min_t, + const uint32_t* addr); + +multimem.ld_reduce.acquire.cta.global.min.u32 +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +.. code:: cuda + + // multimem.ld_reduce.sem.scope.global.op.u32 dest, [addr]; // PTX ISA 81, SM_90 + // .sem = { .relaxed, .acquire } + // .scope = { .cta, .cluster, .gpu, .sys } + // .op = { .min } + template + __device__ static inline uint32_t multimem_ld_reduce( + cuda::ptx::sem_t sem, + cuda::ptx::scope_t scope, + cuda::ptx::op_min_t, + const uint32_t* addr); + +multimem.ld_reduce.acquire.cluster.global.min.u32 +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +.. code:: cuda + + // multimem.ld_reduce.sem.scope.global.op.u32 dest, [addr]; // PTX ISA 81, SM_90 + // .sem = { .relaxed, .acquire } + // .scope = { .cta, .cluster, .gpu, .sys } + // .op = { .min } + template + __device__ static inline uint32_t multimem_ld_reduce( + cuda::ptx::sem_t sem, + cuda::ptx::scope_t scope, + cuda::ptx::op_min_t, + const uint32_t* addr); + +multimem.ld_reduce.acquire.gpu.global.min.u32 +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +.. 
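// --- Illustrative usage sketch (not part of the generated documentation) ---
// multimem.ld_reduce loads from a multimem address and reduces across all copies.
// `mc_ptr` is assumed to be a multicast (multimem) address set up through the CUDA
// driver's multicast object APIs; only the PTX wrapper calls are shown here.
#include <cuda/ptx>

__global__ void multimem_min_sketch(const uint32_t* mc_ptr, uint32_t* out)
{
  // Weak load-reduce, no ordering: multimem.ld_reduce.weak.global.min.u32
  const uint32_t weak_min = cuda::ptx::multimem_ld_reduce(
    cuda::ptx::sem_weak, cuda::ptx::op_min, mc_ptr);

  // Acquire load-reduce at GPU scope: multimem.ld_reduce.acquire.gpu.global.min.u32
  const uint32_t acq_min = cuda::ptx::multimem_ld_reduce(
    cuda::ptx::sem_acquire, cuda::ptx::scope_gpu, cuda::ptx::op_min, mc_ptr);

  *out = weak_min < acq_min ? weak_min : acq_min;
}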
code:: cuda + + // multimem.ld_reduce.sem.scope.global.op.u32 dest, [addr]; // PTX ISA 81, SM_90 + // .sem = { .relaxed, .acquire } + // .scope = { .cta, .cluster, .gpu, .sys } + // .op = { .min } + template + __device__ static inline uint32_t multimem_ld_reduce( + cuda::ptx::sem_t sem, + cuda::ptx::scope_t scope, + cuda::ptx::op_min_t, + const uint32_t* addr); + +multimem.ld_reduce.acquire.sys.global.min.u32 +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +.. code:: cuda + + // multimem.ld_reduce.sem.scope.global.op.u32 dest, [addr]; // PTX ISA 81, SM_90 + // .sem = { .relaxed, .acquire } + // .scope = { .cta, .cluster, .gpu, .sys } + // .op = { .min } + template + __device__ static inline uint32_t multimem_ld_reduce( + cuda::ptx::sem_t sem, + cuda::ptx::scope_t scope, + cuda::ptx::op_min_t, + const uint32_t* addr); + +multimem.ld_reduce.weak.global.min.u64 +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +.. code:: cuda + + // multimem.ld_reduce.sem.global.op.u64 dest, [addr]; // PTX ISA 81, SM_90 + // .sem = { .weak } + // .op = { .min } + template + __device__ static inline uint64_t multimem_ld_reduce( + cuda::ptx::sem_weak_t, + cuda::ptx::op_min_t, + const uint64_t* addr); + +multimem.ld_reduce.relaxed.cta.global.min.u64 +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +.. code:: cuda + + // multimem.ld_reduce.sem.scope.global.op.u64 dest, [addr]; // PTX ISA 81, SM_90 + // .sem = { .relaxed, .acquire } + // .scope = { .cta, .cluster, .gpu, .sys } + // .op = { .min } + template + __device__ static inline uint64_t multimem_ld_reduce( + cuda::ptx::sem_t sem, + cuda::ptx::scope_t scope, + cuda::ptx::op_min_t, + const uint64_t* addr); + +multimem.ld_reduce.relaxed.cluster.global.min.u64 +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +.. code:: cuda + + // multimem.ld_reduce.sem.scope.global.op.u64 dest, [addr]; // PTX ISA 81, SM_90 + // .sem = { .relaxed, .acquire } + // .scope = { .cta, .cluster, .gpu, .sys } + // .op = { .min } + template + __device__ static inline uint64_t multimem_ld_reduce( + cuda::ptx::sem_t sem, + cuda::ptx::scope_t scope, + cuda::ptx::op_min_t, + const uint64_t* addr); + +multimem.ld_reduce.relaxed.gpu.global.min.u64 +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +.. code:: cuda + + // multimem.ld_reduce.sem.scope.global.op.u64 dest, [addr]; // PTX ISA 81, SM_90 + // .sem = { .relaxed, .acquire } + // .scope = { .cta, .cluster, .gpu, .sys } + // .op = { .min } + template + __device__ static inline uint64_t multimem_ld_reduce( + cuda::ptx::sem_t sem, + cuda::ptx::scope_t scope, + cuda::ptx::op_min_t, + const uint64_t* addr); + +multimem.ld_reduce.relaxed.sys.global.min.u64 +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +.. code:: cuda + + // multimem.ld_reduce.sem.scope.global.op.u64 dest, [addr]; // PTX ISA 81, SM_90 + // .sem = { .relaxed, .acquire } + // .scope = { .cta, .cluster, .gpu, .sys } + // .op = { .min } + template + __device__ static inline uint64_t multimem_ld_reduce( + cuda::ptx::sem_t sem, + cuda::ptx::scope_t scope, + cuda::ptx::op_min_t, + const uint64_t* addr); + +multimem.ld_reduce.acquire.cta.global.min.u64 +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +.. 
code:: cuda + + // multimem.ld_reduce.sem.scope.global.op.u64 dest, [addr]; // PTX ISA 81, SM_90 + // .sem = { .relaxed, .acquire } + // .scope = { .cta, .cluster, .gpu, .sys } + // .op = { .min } + template + __device__ static inline uint64_t multimem_ld_reduce( + cuda::ptx::sem_t sem, + cuda::ptx::scope_t scope, + cuda::ptx::op_min_t, + const uint64_t* addr); + +multimem.ld_reduce.acquire.cluster.global.min.u64 +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +.. code:: cuda + + // multimem.ld_reduce.sem.scope.global.op.u64 dest, [addr]; // PTX ISA 81, SM_90 + // .sem = { .relaxed, .acquire } + // .scope = { .cta, .cluster, .gpu, .sys } + // .op = { .min } + template + __device__ static inline uint64_t multimem_ld_reduce( + cuda::ptx::sem_t sem, + cuda::ptx::scope_t scope, + cuda::ptx::op_min_t, + const uint64_t* addr); + +multimem.ld_reduce.acquire.gpu.global.min.u64 +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +.. code:: cuda + + // multimem.ld_reduce.sem.scope.global.op.u64 dest, [addr]; // PTX ISA 81, SM_90 + // .sem = { .relaxed, .acquire } + // .scope = { .cta, .cluster, .gpu, .sys } + // .op = { .min } + template + __device__ static inline uint64_t multimem_ld_reduce( + cuda::ptx::sem_t sem, + cuda::ptx::scope_t scope, + cuda::ptx::op_min_t, + const uint64_t* addr); + +multimem.ld_reduce.acquire.sys.global.min.u64 +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +.. code:: cuda + + // multimem.ld_reduce.sem.scope.global.op.u64 dest, [addr]; // PTX ISA 81, SM_90 + // .sem = { .relaxed, .acquire } + // .scope = { .cta, .cluster, .gpu, .sys } + // .op = { .min } + template + __device__ static inline uint64_t multimem_ld_reduce( + cuda::ptx::sem_t sem, + cuda::ptx::scope_t scope, + cuda::ptx::op_min_t, + const uint64_t* addr); + +multimem.ld_reduce.weak.global.min.s32 +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +.. code:: cuda + + // multimem.ld_reduce.sem.global.op.s32 dest, [addr]; // PTX ISA 81, SM_90 + // .sem = { .weak } + // .op = { .min } + template + __device__ static inline int32_t multimem_ld_reduce( + cuda::ptx::sem_weak_t, + cuda::ptx::op_min_t, + const int32_t* addr); + +multimem.ld_reduce.relaxed.cta.global.min.s32 +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +.. code:: cuda + + // multimem.ld_reduce.sem.scope.global.op.s32 dest, [addr]; // PTX ISA 81, SM_90 + // .sem = { .relaxed, .acquire } + // .scope = { .cta, .cluster, .gpu, .sys } + // .op = { .min } + template + __device__ static inline int32_t multimem_ld_reduce( + cuda::ptx::sem_t sem, + cuda::ptx::scope_t scope, + cuda::ptx::op_min_t, + const int32_t* addr); + +multimem.ld_reduce.relaxed.cluster.global.min.s32 +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +.. code:: cuda + + // multimem.ld_reduce.sem.scope.global.op.s32 dest, [addr]; // PTX ISA 81, SM_90 + // .sem = { .relaxed, .acquire } + // .scope = { .cta, .cluster, .gpu, .sys } + // .op = { .min } + template + __device__ static inline int32_t multimem_ld_reduce( + cuda::ptx::sem_t sem, + cuda::ptx::scope_t scope, + cuda::ptx::op_min_t, + const int32_t* addr); + +multimem.ld_reduce.relaxed.gpu.global.min.s32 +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +.. 
code:: cuda + + // multimem.ld_reduce.sem.scope.global.op.s32 dest, [addr]; // PTX ISA 81, SM_90 + // .sem = { .relaxed, .acquire } + // .scope = { .cta, .cluster, .gpu, .sys } + // .op = { .min } + template + __device__ static inline int32_t multimem_ld_reduce( + cuda::ptx::sem_t sem, + cuda::ptx::scope_t scope, + cuda::ptx::op_min_t, + const int32_t* addr); + +multimem.ld_reduce.relaxed.sys.global.min.s32 +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +.. code:: cuda + + // multimem.ld_reduce.sem.scope.global.op.s32 dest, [addr]; // PTX ISA 81, SM_90 + // .sem = { .relaxed, .acquire } + // .scope = { .cta, .cluster, .gpu, .sys } + // .op = { .min } + template + __device__ static inline int32_t multimem_ld_reduce( + cuda::ptx::sem_t sem, + cuda::ptx::scope_t scope, + cuda::ptx::op_min_t, + const int32_t* addr); + +multimem.ld_reduce.acquire.cta.global.min.s32 +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +.. code:: cuda + + // multimem.ld_reduce.sem.scope.global.op.s32 dest, [addr]; // PTX ISA 81, SM_90 + // .sem = { .relaxed, .acquire } + // .scope = { .cta, .cluster, .gpu, .sys } + // .op = { .min } + template + __device__ static inline int32_t multimem_ld_reduce( + cuda::ptx::sem_t sem, + cuda::ptx::scope_t scope, + cuda::ptx::op_min_t, + const int32_t* addr); + +multimem.ld_reduce.acquire.cluster.global.min.s32 +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +.. code:: cuda + + // multimem.ld_reduce.sem.scope.global.op.s32 dest, [addr]; // PTX ISA 81, SM_90 + // .sem = { .relaxed, .acquire } + // .scope = { .cta, .cluster, .gpu, .sys } + // .op = { .min } + template + __device__ static inline int32_t multimem_ld_reduce( + cuda::ptx::sem_t sem, + cuda::ptx::scope_t scope, + cuda::ptx::op_min_t, + const int32_t* addr); + +multimem.ld_reduce.acquire.gpu.global.min.s32 +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +.. code:: cuda + + // multimem.ld_reduce.sem.scope.global.op.s32 dest, [addr]; // PTX ISA 81, SM_90 + // .sem = { .relaxed, .acquire } + // .scope = { .cta, .cluster, .gpu, .sys } + // .op = { .min } + template + __device__ static inline int32_t multimem_ld_reduce( + cuda::ptx::sem_t sem, + cuda::ptx::scope_t scope, + cuda::ptx::op_min_t, + const int32_t* addr); + +multimem.ld_reduce.acquire.sys.global.min.s32 +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +.. code:: cuda + + // multimem.ld_reduce.sem.scope.global.op.s32 dest, [addr]; // PTX ISA 81, SM_90 + // .sem = { .relaxed, .acquire } + // .scope = { .cta, .cluster, .gpu, .sys } + // .op = { .min } + template + __device__ static inline int32_t multimem_ld_reduce( + cuda::ptx::sem_t sem, + cuda::ptx::scope_t scope, + cuda::ptx::op_min_t, + const int32_t* addr); + +multimem.ld_reduce.weak.global.min.s64 +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +.. code:: cuda + + // multimem.ld_reduce.sem.global.op.s64 dest, [addr]; // PTX ISA 81, SM_90 + // .sem = { .weak } + // .op = { .min } + template + __device__ static inline int64_t multimem_ld_reduce( + cuda::ptx::sem_weak_t, + cuda::ptx::op_min_t, + const int64_t* addr); + +multimem.ld_reduce.relaxed.cta.global.min.s64 +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +.. 
code:: cuda + + // multimem.ld_reduce.sem.scope.global.op.s64 dest, [addr]; // PTX ISA 81, SM_90 + // .sem = { .relaxed, .acquire } + // .scope = { .cta, .cluster, .gpu, .sys } + // .op = { .min } + template + __device__ static inline int64_t multimem_ld_reduce( + cuda::ptx::sem_t sem, + cuda::ptx::scope_t scope, + cuda::ptx::op_min_t, + const int64_t* addr); + +multimem.ld_reduce.relaxed.cluster.global.min.s64 +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +.. code:: cuda + + // multimem.ld_reduce.sem.scope.global.op.s64 dest, [addr]; // PTX ISA 81, SM_90 + // .sem = { .relaxed, .acquire } + // .scope = { .cta, .cluster, .gpu, .sys } + // .op = { .min } + template + __device__ static inline int64_t multimem_ld_reduce( + cuda::ptx::sem_t sem, + cuda::ptx::scope_t scope, + cuda::ptx::op_min_t, + const int64_t* addr); + +multimem.ld_reduce.relaxed.gpu.global.min.s64 +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +.. code:: cuda + + // multimem.ld_reduce.sem.scope.global.op.s64 dest, [addr]; // PTX ISA 81, SM_90 + // .sem = { .relaxed, .acquire } + // .scope = { .cta, .cluster, .gpu, .sys } + // .op = { .min } + template + __device__ static inline int64_t multimem_ld_reduce( + cuda::ptx::sem_t sem, + cuda::ptx::scope_t scope, + cuda::ptx::op_min_t, + const int64_t* addr); + +multimem.ld_reduce.relaxed.sys.global.min.s64 +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +.. code:: cuda + + // multimem.ld_reduce.sem.scope.global.op.s64 dest, [addr]; // PTX ISA 81, SM_90 + // .sem = { .relaxed, .acquire } + // .scope = { .cta, .cluster, .gpu, .sys } + // .op = { .min } + template + __device__ static inline int64_t multimem_ld_reduce( + cuda::ptx::sem_t sem, + cuda::ptx::scope_t scope, + cuda::ptx::op_min_t, + const int64_t* addr); + +multimem.ld_reduce.acquire.cta.global.min.s64 +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +.. code:: cuda + + // multimem.ld_reduce.sem.scope.global.op.s64 dest, [addr]; // PTX ISA 81, SM_90 + // .sem = { .relaxed, .acquire } + // .scope = { .cta, .cluster, .gpu, .sys } + // .op = { .min } + template + __device__ static inline int64_t multimem_ld_reduce( + cuda::ptx::sem_t sem, + cuda::ptx::scope_t scope, + cuda::ptx::op_min_t, + const int64_t* addr); + +multimem.ld_reduce.acquire.cluster.global.min.s64 +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +.. code:: cuda + + // multimem.ld_reduce.sem.scope.global.op.s64 dest, [addr]; // PTX ISA 81, SM_90 + // .sem = { .relaxed, .acquire } + // .scope = { .cta, .cluster, .gpu, .sys } + // .op = { .min } + template + __device__ static inline int64_t multimem_ld_reduce( + cuda::ptx::sem_t sem, + cuda::ptx::scope_t scope, + cuda::ptx::op_min_t, + const int64_t* addr); + +multimem.ld_reduce.acquire.gpu.global.min.s64 +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +.. code:: cuda + + // multimem.ld_reduce.sem.scope.global.op.s64 dest, [addr]; // PTX ISA 81, SM_90 + // .sem = { .relaxed, .acquire } + // .scope = { .cta, .cluster, .gpu, .sys } + // .op = { .min } + template + __device__ static inline int64_t multimem_ld_reduce( + cuda::ptx::sem_t sem, + cuda::ptx::scope_t scope, + cuda::ptx::op_min_t, + const int64_t* addr); + +multimem.ld_reduce.acquire.sys.global.min.s64 +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +.. 
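+The signatures above are auto-generated. As a hand-written usage sketch (not part of
+the generated listing), the ``.min`` overloads can be called from device code roughly
+as follows; ``mc_ptr`` is assumed to be a valid multimem (multicast) address and the
+code is assumed to be compiled for SM_90 or newer:
+
+.. code:: cuda
+
+   #include <cuda/ptx>
+
+   // Hypothetical helper: returns the minimum of the int64_t values held by all
+   // replicas reachable through the multimem address mc_ptr.
+   __device__ int64_t multicast_min(const int64_t* mc_ptr)
+   {
+     namespace ptx = cuda::ptx;
+     // Emits multimem.ld_reduce.relaxed.gpu.global.min.s64
+     return ptx::multimem_ld_reduce(ptx::sem_relaxed, ptx::scope_gpu, ptx::op_min, mc_ptr);
+   }
+
+..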
code:: cuda + + // multimem.ld_reduce.sem.scope.global.op.s64 dest, [addr]; // PTX ISA 81, SM_90 + // .sem = { .relaxed, .acquire } + // .scope = { .cta, .cluster, .gpu, .sys } + // .op = { .min } + template + __device__ static inline int64_t multimem_ld_reduce( + cuda::ptx::sem_t sem, + cuda::ptx::scope_t scope, + cuda::ptx::op_min_t, + const int64_t* addr); + +multimem.ld_reduce.weak.global.max.u32 +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +.. code:: cuda + + // multimem.ld_reduce.sem.global.op.u32 dest, [addr]; // PTX ISA 81, SM_90 + // .sem = { .weak } + // .op = { .max } + template + __device__ static inline uint32_t multimem_ld_reduce( + cuda::ptx::sem_weak_t, + cuda::ptx::op_max_t, + const uint32_t* addr); + +multimem.ld_reduce.relaxed.cta.global.max.u32 +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +.. code:: cuda + + // multimem.ld_reduce.sem.scope.global.op.u32 dest, [addr]; // PTX ISA 81, SM_90 + // .sem = { .relaxed, .acquire } + // .scope = { .cta, .cluster, .gpu, .sys } + // .op = { .max } + template + __device__ static inline uint32_t multimem_ld_reduce( + cuda::ptx::sem_t sem, + cuda::ptx::scope_t scope, + cuda::ptx::op_max_t, + const uint32_t* addr); + +multimem.ld_reduce.relaxed.cluster.global.max.u32 +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +.. code:: cuda + + // multimem.ld_reduce.sem.scope.global.op.u32 dest, [addr]; // PTX ISA 81, SM_90 + // .sem = { .relaxed, .acquire } + // .scope = { .cta, .cluster, .gpu, .sys } + // .op = { .max } + template + __device__ static inline uint32_t multimem_ld_reduce( + cuda::ptx::sem_t sem, + cuda::ptx::scope_t scope, + cuda::ptx::op_max_t, + const uint32_t* addr); + +multimem.ld_reduce.relaxed.gpu.global.max.u32 +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +.. code:: cuda + + // multimem.ld_reduce.sem.scope.global.op.u32 dest, [addr]; // PTX ISA 81, SM_90 + // .sem = { .relaxed, .acquire } + // .scope = { .cta, .cluster, .gpu, .sys } + // .op = { .max } + template + __device__ static inline uint32_t multimem_ld_reduce( + cuda::ptx::sem_t sem, + cuda::ptx::scope_t scope, + cuda::ptx::op_max_t, + const uint32_t* addr); + +multimem.ld_reduce.relaxed.sys.global.max.u32 +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +.. code:: cuda + + // multimem.ld_reduce.sem.scope.global.op.u32 dest, [addr]; // PTX ISA 81, SM_90 + // .sem = { .relaxed, .acquire } + // .scope = { .cta, .cluster, .gpu, .sys } + // .op = { .max } + template + __device__ static inline uint32_t multimem_ld_reduce( + cuda::ptx::sem_t sem, + cuda::ptx::scope_t scope, + cuda::ptx::op_max_t, + const uint32_t* addr); + +multimem.ld_reduce.acquire.cta.global.max.u32 +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +.. code:: cuda + + // multimem.ld_reduce.sem.scope.global.op.u32 dest, [addr]; // PTX ISA 81, SM_90 + // .sem = { .relaxed, .acquire } + // .scope = { .cta, .cluster, .gpu, .sys } + // .op = { .max } + template + __device__ static inline uint32_t multimem_ld_reduce( + cuda::ptx::sem_t sem, + cuda::ptx::scope_t scope, + cuda::ptx::op_max_t, + const uint32_t* addr); + +multimem.ld_reduce.acquire.cluster.global.max.u32 +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +.. 
code:: cuda + + // multimem.ld_reduce.sem.scope.global.op.u32 dest, [addr]; // PTX ISA 81, SM_90 + // .sem = { .relaxed, .acquire } + // .scope = { .cta, .cluster, .gpu, .sys } + // .op = { .max } + template + __device__ static inline uint32_t multimem_ld_reduce( + cuda::ptx::sem_t sem, + cuda::ptx::scope_t scope, + cuda::ptx::op_max_t, + const uint32_t* addr); + +multimem.ld_reduce.acquire.gpu.global.max.u32 +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +.. code:: cuda + + // multimem.ld_reduce.sem.scope.global.op.u32 dest, [addr]; // PTX ISA 81, SM_90 + // .sem = { .relaxed, .acquire } + // .scope = { .cta, .cluster, .gpu, .sys } + // .op = { .max } + template + __device__ static inline uint32_t multimem_ld_reduce( + cuda::ptx::sem_t sem, + cuda::ptx::scope_t scope, + cuda::ptx::op_max_t, + const uint32_t* addr); + +multimem.ld_reduce.acquire.sys.global.max.u32 +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +.. code:: cuda + + // multimem.ld_reduce.sem.scope.global.op.u32 dest, [addr]; // PTX ISA 81, SM_90 + // .sem = { .relaxed, .acquire } + // .scope = { .cta, .cluster, .gpu, .sys } + // .op = { .max } + template + __device__ static inline uint32_t multimem_ld_reduce( + cuda::ptx::sem_t sem, + cuda::ptx::scope_t scope, + cuda::ptx::op_max_t, + const uint32_t* addr); + +multimem.ld_reduce.weak.global.max.u64 +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +.. code:: cuda + + // multimem.ld_reduce.sem.global.op.u64 dest, [addr]; // PTX ISA 81, SM_90 + // .sem = { .weak } + // .op = { .max } + template + __device__ static inline uint64_t multimem_ld_reduce( + cuda::ptx::sem_weak_t, + cuda::ptx::op_max_t, + const uint64_t* addr); + +multimem.ld_reduce.relaxed.cta.global.max.u64 +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +.. code:: cuda + + // multimem.ld_reduce.sem.scope.global.op.u64 dest, [addr]; // PTX ISA 81, SM_90 + // .sem = { .relaxed, .acquire } + // .scope = { .cta, .cluster, .gpu, .sys } + // .op = { .max } + template + __device__ static inline uint64_t multimem_ld_reduce( + cuda::ptx::sem_t sem, + cuda::ptx::scope_t scope, + cuda::ptx::op_max_t, + const uint64_t* addr); + +multimem.ld_reduce.relaxed.cluster.global.max.u64 +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +.. code:: cuda + + // multimem.ld_reduce.sem.scope.global.op.u64 dest, [addr]; // PTX ISA 81, SM_90 + // .sem = { .relaxed, .acquire } + // .scope = { .cta, .cluster, .gpu, .sys } + // .op = { .max } + template + __device__ static inline uint64_t multimem_ld_reduce( + cuda::ptx::sem_t sem, + cuda::ptx::scope_t scope, + cuda::ptx::op_max_t, + const uint64_t* addr); + +multimem.ld_reduce.relaxed.gpu.global.max.u64 +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +.. code:: cuda + + // multimem.ld_reduce.sem.scope.global.op.u64 dest, [addr]; // PTX ISA 81, SM_90 + // .sem = { .relaxed, .acquire } + // .scope = { .cta, .cluster, .gpu, .sys } + // .op = { .max } + template + __device__ static inline uint64_t multimem_ld_reduce( + cuda::ptx::sem_t sem, + cuda::ptx::scope_t scope, + cuda::ptx::op_max_t, + const uint64_t* addr); + +multimem.ld_reduce.relaxed.sys.global.max.u64 +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +.. 
code:: cuda + + // multimem.ld_reduce.sem.scope.global.op.u64 dest, [addr]; // PTX ISA 81, SM_90 + // .sem = { .relaxed, .acquire } + // .scope = { .cta, .cluster, .gpu, .sys } + // .op = { .max } + template + __device__ static inline uint64_t multimem_ld_reduce( + cuda::ptx::sem_t sem, + cuda::ptx::scope_t scope, + cuda::ptx::op_max_t, + const uint64_t* addr); + +multimem.ld_reduce.acquire.cta.global.max.u64 +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +.. code:: cuda + + // multimem.ld_reduce.sem.scope.global.op.u64 dest, [addr]; // PTX ISA 81, SM_90 + // .sem = { .relaxed, .acquire } + // .scope = { .cta, .cluster, .gpu, .sys } + // .op = { .max } + template + __device__ static inline uint64_t multimem_ld_reduce( + cuda::ptx::sem_t sem, + cuda::ptx::scope_t scope, + cuda::ptx::op_max_t, + const uint64_t* addr); + +multimem.ld_reduce.acquire.cluster.global.max.u64 +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +.. code:: cuda + + // multimem.ld_reduce.sem.scope.global.op.u64 dest, [addr]; // PTX ISA 81, SM_90 + // .sem = { .relaxed, .acquire } + // .scope = { .cta, .cluster, .gpu, .sys } + // .op = { .max } + template + __device__ static inline uint64_t multimem_ld_reduce( + cuda::ptx::sem_t sem, + cuda::ptx::scope_t scope, + cuda::ptx::op_max_t, + const uint64_t* addr); + +multimem.ld_reduce.acquire.gpu.global.max.u64 +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +.. code:: cuda + + // multimem.ld_reduce.sem.scope.global.op.u64 dest, [addr]; // PTX ISA 81, SM_90 + // .sem = { .relaxed, .acquire } + // .scope = { .cta, .cluster, .gpu, .sys } + // .op = { .max } + template + __device__ static inline uint64_t multimem_ld_reduce( + cuda::ptx::sem_t sem, + cuda::ptx::scope_t scope, + cuda::ptx::op_max_t, + const uint64_t* addr); + +multimem.ld_reduce.acquire.sys.global.max.u64 +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +.. code:: cuda + + // multimem.ld_reduce.sem.scope.global.op.u64 dest, [addr]; // PTX ISA 81, SM_90 + // .sem = { .relaxed, .acquire } + // .scope = { .cta, .cluster, .gpu, .sys } + // .op = { .max } + template + __device__ static inline uint64_t multimem_ld_reduce( + cuda::ptx::sem_t sem, + cuda::ptx::scope_t scope, + cuda::ptx::op_max_t, + const uint64_t* addr); + +multimem.ld_reduce.weak.global.max.s32 +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +.. code:: cuda + + // multimem.ld_reduce.sem.global.op.s32 dest, [addr]; // PTX ISA 81, SM_90 + // .sem = { .weak } + // .op = { .max } + template + __device__ static inline int32_t multimem_ld_reduce( + cuda::ptx::sem_weak_t, + cuda::ptx::op_max_t, + const int32_t* addr); + +multimem.ld_reduce.relaxed.cta.global.max.s32 +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +.. code:: cuda + + // multimem.ld_reduce.sem.scope.global.op.s32 dest, [addr]; // PTX ISA 81, SM_90 + // .sem = { .relaxed, .acquire } + // .scope = { .cta, .cluster, .gpu, .sys } + // .op = { .max } + template + __device__ static inline int32_t multimem_ld_reduce( + cuda::ptx::sem_t sem, + cuda::ptx::scope_t scope, + cuda::ptx::op_max_t, + const int32_t* addr); + +multimem.ld_reduce.relaxed.cluster.global.max.s32 +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +.. 
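+For the ``.weak`` overloads there is no scope parameter; a hand-written sketch
+(assuming, as above, that ``mc_ptr`` is a multimem address and SM_90 or newer):
+
+.. code:: cuda
+
+   #include <cuda/ptx>
+
+   // Hypothetical helper: weak (unordered) maximum across all replicas of mc_ptr.
+   __device__ int32_t multicast_max_weak(const int32_t* mc_ptr)
+   {
+     namespace ptx = cuda::ptx;
+     // Emits multimem.ld_reduce.weak.global.max.s32
+     return ptx::multimem_ld_reduce(ptx::sem_weak, ptx::op_max, mc_ptr);
+   }
+
+..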
code:: cuda + + // multimem.ld_reduce.sem.scope.global.op.s32 dest, [addr]; // PTX ISA 81, SM_90 + // .sem = { .relaxed, .acquire } + // .scope = { .cta, .cluster, .gpu, .sys } + // .op = { .max } + template + __device__ static inline int32_t multimem_ld_reduce( + cuda::ptx::sem_t sem, + cuda::ptx::scope_t scope, + cuda::ptx::op_max_t, + const int32_t* addr); + +multimem.ld_reduce.relaxed.gpu.global.max.s32 +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +.. code:: cuda + + // multimem.ld_reduce.sem.scope.global.op.s32 dest, [addr]; // PTX ISA 81, SM_90 + // .sem = { .relaxed, .acquire } + // .scope = { .cta, .cluster, .gpu, .sys } + // .op = { .max } + template + __device__ static inline int32_t multimem_ld_reduce( + cuda::ptx::sem_t sem, + cuda::ptx::scope_t scope, + cuda::ptx::op_max_t, + const int32_t* addr); + +multimem.ld_reduce.relaxed.sys.global.max.s32 +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +.. code:: cuda + + // multimem.ld_reduce.sem.scope.global.op.s32 dest, [addr]; // PTX ISA 81, SM_90 + // .sem = { .relaxed, .acquire } + // .scope = { .cta, .cluster, .gpu, .sys } + // .op = { .max } + template + __device__ static inline int32_t multimem_ld_reduce( + cuda::ptx::sem_t sem, + cuda::ptx::scope_t scope, + cuda::ptx::op_max_t, + const int32_t* addr); + +multimem.ld_reduce.acquire.cta.global.max.s32 +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +.. code:: cuda + + // multimem.ld_reduce.sem.scope.global.op.s32 dest, [addr]; // PTX ISA 81, SM_90 + // .sem = { .relaxed, .acquire } + // .scope = { .cta, .cluster, .gpu, .sys } + // .op = { .max } + template + __device__ static inline int32_t multimem_ld_reduce( + cuda::ptx::sem_t sem, + cuda::ptx::scope_t scope, + cuda::ptx::op_max_t, + const int32_t* addr); + +multimem.ld_reduce.acquire.cluster.global.max.s32 +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +.. code:: cuda + + // multimem.ld_reduce.sem.scope.global.op.s32 dest, [addr]; // PTX ISA 81, SM_90 + // .sem = { .relaxed, .acquire } + // .scope = { .cta, .cluster, .gpu, .sys } + // .op = { .max } + template + __device__ static inline int32_t multimem_ld_reduce( + cuda::ptx::sem_t sem, + cuda::ptx::scope_t scope, + cuda::ptx::op_max_t, + const int32_t* addr); + +multimem.ld_reduce.acquire.gpu.global.max.s32 +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +.. code:: cuda + + // multimem.ld_reduce.sem.scope.global.op.s32 dest, [addr]; // PTX ISA 81, SM_90 + // .sem = { .relaxed, .acquire } + // .scope = { .cta, .cluster, .gpu, .sys } + // .op = { .max } + template + __device__ static inline int32_t multimem_ld_reduce( + cuda::ptx::sem_t sem, + cuda::ptx::scope_t scope, + cuda::ptx::op_max_t, + const int32_t* addr); + +multimem.ld_reduce.acquire.sys.global.max.s32 +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +.. code:: cuda + + // multimem.ld_reduce.sem.scope.global.op.s32 dest, [addr]; // PTX ISA 81, SM_90 + // .sem = { .relaxed, .acquire } + // .scope = { .cta, .cluster, .gpu, .sys } + // .op = { .max } + template + __device__ static inline int32_t multimem_ld_reduce( + cuda::ptx::sem_t sem, + cuda::ptx::scope_t scope, + cuda::ptx::op_max_t, + const int32_t* addr); + +multimem.ld_reduce.weak.global.max.s64 +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +.. 
code:: cuda + + // multimem.ld_reduce.sem.global.op.s64 dest, [addr]; // PTX ISA 81, SM_90 + // .sem = { .weak } + // .op = { .max } + template + __device__ static inline int64_t multimem_ld_reduce( + cuda::ptx::sem_weak_t, + cuda::ptx::op_max_t, + const int64_t* addr); + +multimem.ld_reduce.relaxed.cta.global.max.s64 +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +.. code:: cuda + + // multimem.ld_reduce.sem.scope.global.op.s64 dest, [addr]; // PTX ISA 81, SM_90 + // .sem = { .relaxed, .acquire } + // .scope = { .cta, .cluster, .gpu, .sys } + // .op = { .max } + template + __device__ static inline int64_t multimem_ld_reduce( + cuda::ptx::sem_t sem, + cuda::ptx::scope_t scope, + cuda::ptx::op_max_t, + const int64_t* addr); + +multimem.ld_reduce.relaxed.cluster.global.max.s64 +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +.. code:: cuda + + // multimem.ld_reduce.sem.scope.global.op.s64 dest, [addr]; // PTX ISA 81, SM_90 + // .sem = { .relaxed, .acquire } + // .scope = { .cta, .cluster, .gpu, .sys } + // .op = { .max } + template + __device__ static inline int64_t multimem_ld_reduce( + cuda::ptx::sem_t sem, + cuda::ptx::scope_t scope, + cuda::ptx::op_max_t, + const int64_t* addr); + +multimem.ld_reduce.relaxed.gpu.global.max.s64 +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +.. code:: cuda + + // multimem.ld_reduce.sem.scope.global.op.s64 dest, [addr]; // PTX ISA 81, SM_90 + // .sem = { .relaxed, .acquire } + // .scope = { .cta, .cluster, .gpu, .sys } + // .op = { .max } + template + __device__ static inline int64_t multimem_ld_reduce( + cuda::ptx::sem_t sem, + cuda::ptx::scope_t scope, + cuda::ptx::op_max_t, + const int64_t* addr); + +multimem.ld_reduce.relaxed.sys.global.max.s64 +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +.. code:: cuda + + // multimem.ld_reduce.sem.scope.global.op.s64 dest, [addr]; // PTX ISA 81, SM_90 + // .sem = { .relaxed, .acquire } + // .scope = { .cta, .cluster, .gpu, .sys } + // .op = { .max } + template + __device__ static inline int64_t multimem_ld_reduce( + cuda::ptx::sem_t sem, + cuda::ptx::scope_t scope, + cuda::ptx::op_max_t, + const int64_t* addr); + +multimem.ld_reduce.acquire.cta.global.max.s64 +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +.. code:: cuda + + // multimem.ld_reduce.sem.scope.global.op.s64 dest, [addr]; // PTX ISA 81, SM_90 + // .sem = { .relaxed, .acquire } + // .scope = { .cta, .cluster, .gpu, .sys } + // .op = { .max } + template + __device__ static inline int64_t multimem_ld_reduce( + cuda::ptx::sem_t sem, + cuda::ptx::scope_t scope, + cuda::ptx::op_max_t, + const int64_t* addr); + +multimem.ld_reduce.acquire.cluster.global.max.s64 +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +.. code:: cuda + + // multimem.ld_reduce.sem.scope.global.op.s64 dest, [addr]; // PTX ISA 81, SM_90 + // .sem = { .relaxed, .acquire } + // .scope = { .cta, .cluster, .gpu, .sys } + // .op = { .max } + template + __device__ static inline int64_t multimem_ld_reduce( + cuda::ptx::sem_t sem, + cuda::ptx::scope_t scope, + cuda::ptx::op_max_t, + const int64_t* addr); + +multimem.ld_reduce.acquire.gpu.global.max.s64 +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +.. 
code:: cuda + + // multimem.ld_reduce.sem.scope.global.op.s64 dest, [addr]; // PTX ISA 81, SM_90 + // .sem = { .relaxed, .acquire } + // .scope = { .cta, .cluster, .gpu, .sys } + // .op = { .max } + template + __device__ static inline int64_t multimem_ld_reduce( + cuda::ptx::sem_t sem, + cuda::ptx::scope_t scope, + cuda::ptx::op_max_t, + const int64_t* addr); + +multimem.ld_reduce.acquire.sys.global.max.s64 +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +.. code:: cuda + + // multimem.ld_reduce.sem.scope.global.op.s64 dest, [addr]; // PTX ISA 81, SM_90 + // .sem = { .relaxed, .acquire } + // .scope = { .cta, .cluster, .gpu, .sys } + // .op = { .max } + template + __device__ static inline int64_t multimem_ld_reduce( + cuda::ptx::sem_t sem, + cuda::ptx::scope_t scope, + cuda::ptx::op_max_t, + const int64_t* addr); + +multimem.ld_reduce.weak.global.add.u32 +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +.. code:: cuda + + // multimem.ld_reduce.sem.global.op.u32 dest, [addr]; // PTX ISA 81, SM_90 + // .sem = { .weak } + // .op = { .add } + template + __device__ static inline uint32_t multimem_ld_reduce( + cuda::ptx::sem_weak_t, + cuda::ptx::op_add_t, + const uint32_t* addr); + +multimem.ld_reduce.relaxed.cta.global.add.u32 +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +.. code:: cuda + + // multimem.ld_reduce.sem.scope.global.op.u32 dest, [addr]; // PTX ISA 81, SM_90 + // .sem = { .relaxed, .acquire } + // .scope = { .cta, .cluster, .gpu, .sys } + // .op = { .add } + template + __device__ static inline uint32_t multimem_ld_reduce( + cuda::ptx::sem_t sem, + cuda::ptx::scope_t scope, + cuda::ptx::op_add_t, + const uint32_t* addr); + +multimem.ld_reduce.relaxed.cluster.global.add.u32 +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +.. code:: cuda + + // multimem.ld_reduce.sem.scope.global.op.u32 dest, [addr]; // PTX ISA 81, SM_90 + // .sem = { .relaxed, .acquire } + // .scope = { .cta, .cluster, .gpu, .sys } + // .op = { .add } + template + __device__ static inline uint32_t multimem_ld_reduce( + cuda::ptx::sem_t sem, + cuda::ptx::scope_t scope, + cuda::ptx::op_add_t, + const uint32_t* addr); + +multimem.ld_reduce.relaxed.gpu.global.add.u32 +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +.. code:: cuda + + // multimem.ld_reduce.sem.scope.global.op.u32 dest, [addr]; // PTX ISA 81, SM_90 + // .sem = { .relaxed, .acquire } + // .scope = { .cta, .cluster, .gpu, .sys } + // .op = { .add } + template + __device__ static inline uint32_t multimem_ld_reduce( + cuda::ptx::sem_t sem, + cuda::ptx::scope_t scope, + cuda::ptx::op_add_t, + const uint32_t* addr); + +multimem.ld_reduce.relaxed.sys.global.add.u32 +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +.. code:: cuda + + // multimem.ld_reduce.sem.scope.global.op.u32 dest, [addr]; // PTX ISA 81, SM_90 + // .sem = { .relaxed, .acquire } + // .scope = { .cta, .cluster, .gpu, .sys } + // .op = { .add } + template + __device__ static inline uint32_t multimem_ld_reduce( + cuda::ptx::sem_t sem, + cuda::ptx::scope_t scope, + cuda::ptx::op_add_t, + const uint32_t* addr); + +multimem.ld_reduce.acquire.cta.global.add.u32 +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +.. 
code:: cuda + + // multimem.ld_reduce.sem.scope.global.op.u32 dest, [addr]; // PTX ISA 81, SM_90 + // .sem = { .relaxed, .acquire } + // .scope = { .cta, .cluster, .gpu, .sys } + // .op = { .add } + template + __device__ static inline uint32_t multimem_ld_reduce( + cuda::ptx::sem_t sem, + cuda::ptx::scope_t scope, + cuda::ptx::op_add_t, + const uint32_t* addr); + +multimem.ld_reduce.acquire.cluster.global.add.u32 +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +.. code:: cuda + + // multimem.ld_reduce.sem.scope.global.op.u32 dest, [addr]; // PTX ISA 81, SM_90 + // .sem = { .relaxed, .acquire } + // .scope = { .cta, .cluster, .gpu, .sys } + // .op = { .add } + template + __device__ static inline uint32_t multimem_ld_reduce( + cuda::ptx::sem_t sem, + cuda::ptx::scope_t scope, + cuda::ptx::op_add_t, + const uint32_t* addr); + +multimem.ld_reduce.acquire.gpu.global.add.u32 +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +.. code:: cuda + + // multimem.ld_reduce.sem.scope.global.op.u32 dest, [addr]; // PTX ISA 81, SM_90 + // .sem = { .relaxed, .acquire } + // .scope = { .cta, .cluster, .gpu, .sys } + // .op = { .add } + template + __device__ static inline uint32_t multimem_ld_reduce( + cuda::ptx::sem_t sem, + cuda::ptx::scope_t scope, + cuda::ptx::op_add_t, + const uint32_t* addr); + +multimem.ld_reduce.acquire.sys.global.add.u32 +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +.. code:: cuda + + // multimem.ld_reduce.sem.scope.global.op.u32 dest, [addr]; // PTX ISA 81, SM_90 + // .sem = { .relaxed, .acquire } + // .scope = { .cta, .cluster, .gpu, .sys } + // .op = { .add } + template + __device__ static inline uint32_t multimem_ld_reduce( + cuda::ptx::sem_t sem, + cuda::ptx::scope_t scope, + cuda::ptx::op_add_t, + const uint32_t* addr); + +multimem.ld_reduce.weak.global.add.u64 +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +.. code:: cuda + + // multimem.ld_reduce.sem.global.op.u64 dest, [addr]; // PTX ISA 81, SM_90 + // .sem = { .weak } + // .op = { .add } + template + __device__ static inline uint64_t multimem_ld_reduce( + cuda::ptx::sem_weak_t, + cuda::ptx::op_add_t, + const uint64_t* addr); + +multimem.ld_reduce.relaxed.cta.global.add.u64 +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +.. code:: cuda + + // multimem.ld_reduce.sem.scope.global.op.u64 dest, [addr]; // PTX ISA 81, SM_90 + // .sem = { .relaxed, .acquire } + // .scope = { .cta, .cluster, .gpu, .sys } + // .op = { .add } + template + __device__ static inline uint64_t multimem_ld_reduce( + cuda::ptx::sem_t sem, + cuda::ptx::scope_t scope, + cuda::ptx::op_add_t, + const uint64_t* addr); + +multimem.ld_reduce.relaxed.cluster.global.add.u64 +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +.. code:: cuda + + // multimem.ld_reduce.sem.scope.global.op.u64 dest, [addr]; // PTX ISA 81, SM_90 + // .sem = { .relaxed, .acquire } + // .scope = { .cta, .cluster, .gpu, .sys } + // .op = { .add } + template + __device__ static inline uint64_t multimem_ld_reduce( + cuda::ptx::sem_t sem, + cuda::ptx::scope_t scope, + cuda::ptx::op_add_t, + const uint64_t* addr); + +multimem.ld_reduce.relaxed.gpu.global.add.u64 +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +.. 
code:: cuda + + // multimem.ld_reduce.sem.scope.global.op.u64 dest, [addr]; // PTX ISA 81, SM_90 + // .sem = { .relaxed, .acquire } + // .scope = { .cta, .cluster, .gpu, .sys } + // .op = { .add } + template + __device__ static inline uint64_t multimem_ld_reduce( + cuda::ptx::sem_t sem, + cuda::ptx::scope_t scope, + cuda::ptx::op_add_t, + const uint64_t* addr); + +multimem.ld_reduce.relaxed.sys.global.add.u64 +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +.. code:: cuda + + // multimem.ld_reduce.sem.scope.global.op.u64 dest, [addr]; // PTX ISA 81, SM_90 + // .sem = { .relaxed, .acquire } + // .scope = { .cta, .cluster, .gpu, .sys } + // .op = { .add } + template + __device__ static inline uint64_t multimem_ld_reduce( + cuda::ptx::sem_t sem, + cuda::ptx::scope_t scope, + cuda::ptx::op_add_t, + const uint64_t* addr); + +multimem.ld_reduce.acquire.cta.global.add.u64 +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +.. code:: cuda + + // multimem.ld_reduce.sem.scope.global.op.u64 dest, [addr]; // PTX ISA 81, SM_90 + // .sem = { .relaxed, .acquire } + // .scope = { .cta, .cluster, .gpu, .sys } + // .op = { .add } + template + __device__ static inline uint64_t multimem_ld_reduce( + cuda::ptx::sem_t sem, + cuda::ptx::scope_t scope, + cuda::ptx::op_add_t, + const uint64_t* addr); + +multimem.ld_reduce.acquire.cluster.global.add.u64 +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +.. code:: cuda + + // multimem.ld_reduce.sem.scope.global.op.u64 dest, [addr]; // PTX ISA 81, SM_90 + // .sem = { .relaxed, .acquire } + // .scope = { .cta, .cluster, .gpu, .sys } + // .op = { .add } + template + __device__ static inline uint64_t multimem_ld_reduce( + cuda::ptx::sem_t sem, + cuda::ptx::scope_t scope, + cuda::ptx::op_add_t, + const uint64_t* addr); + +multimem.ld_reduce.acquire.gpu.global.add.u64 +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +.. code:: cuda + + // multimem.ld_reduce.sem.scope.global.op.u64 dest, [addr]; // PTX ISA 81, SM_90 + // .sem = { .relaxed, .acquire } + // .scope = { .cta, .cluster, .gpu, .sys } + // .op = { .add } + template + __device__ static inline uint64_t multimem_ld_reduce( + cuda::ptx::sem_t sem, + cuda::ptx::scope_t scope, + cuda::ptx::op_add_t, + const uint64_t* addr); + +multimem.ld_reduce.acquire.sys.global.add.u64 +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +.. code:: cuda + + // multimem.ld_reduce.sem.scope.global.op.u64 dest, [addr]; // PTX ISA 81, SM_90 + // .sem = { .relaxed, .acquire } + // .scope = { .cta, .cluster, .gpu, .sys } + // .op = { .add } + template + __device__ static inline uint64_t multimem_ld_reduce( + cuda::ptx::sem_t sem, + cuda::ptx::scope_t scope, + cuda::ptx::op_add_t, + const uint64_t* addr); + +multimem.ld_reduce.weak.global.add.s32 +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +.. code:: cuda + + // multimem.ld_reduce.sem.global.op.s32 dest, [addr]; // PTX ISA 81, SM_90 + // .sem = { .weak } + // .op = { .add } + template + __device__ static inline int32_t multimem_ld_reduce( + cuda::ptx::sem_weak_t, + cuda::ptx::op_add_t, + const int32_t* addr); + +multimem.ld_reduce.relaxed.cta.global.add.s32 +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +.. 
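+A hand-written sketch for the ``.add`` overloads (same assumptions as above):
+
+.. code:: cuda
+
+   #include <cuda/ptx>
+
+   // Hypothetical helper: sum of the uint64_t counters held by all replicas of mc_ptr.
+   __device__ uint64_t multicast_sum(const uint64_t* mc_ptr)
+   {
+     namespace ptx = cuda::ptx;
+     // Emits multimem.ld_reduce.relaxed.cluster.global.add.u64
+     return ptx::multimem_ld_reduce(ptx::sem_relaxed, ptx::scope_cluster, ptx::op_add, mc_ptr);
+   }
+
+..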
code:: cuda + + // multimem.ld_reduce.sem.scope.global.op.s32 dest, [addr]; // PTX ISA 81, SM_90 + // .sem = { .relaxed, .acquire } + // .scope = { .cta, .cluster, .gpu, .sys } + // .op = { .add } + template + __device__ static inline int32_t multimem_ld_reduce( + cuda::ptx::sem_t sem, + cuda::ptx::scope_t scope, + cuda::ptx::op_add_t, + const int32_t* addr); + +multimem.ld_reduce.relaxed.cluster.global.add.s32 +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +.. code:: cuda + + // multimem.ld_reduce.sem.scope.global.op.s32 dest, [addr]; // PTX ISA 81, SM_90 + // .sem = { .relaxed, .acquire } + // .scope = { .cta, .cluster, .gpu, .sys } + // .op = { .add } + template + __device__ static inline int32_t multimem_ld_reduce( + cuda::ptx::sem_t sem, + cuda::ptx::scope_t scope, + cuda::ptx::op_add_t, + const int32_t* addr); + +multimem.ld_reduce.relaxed.gpu.global.add.s32 +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +.. code:: cuda + + // multimem.ld_reduce.sem.scope.global.op.s32 dest, [addr]; // PTX ISA 81, SM_90 + // .sem = { .relaxed, .acquire } + // .scope = { .cta, .cluster, .gpu, .sys } + // .op = { .add } + template + __device__ static inline int32_t multimem_ld_reduce( + cuda::ptx::sem_t sem, + cuda::ptx::scope_t scope, + cuda::ptx::op_add_t, + const int32_t* addr); + +multimem.ld_reduce.relaxed.sys.global.add.s32 +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +.. code:: cuda + + // multimem.ld_reduce.sem.scope.global.op.s32 dest, [addr]; // PTX ISA 81, SM_90 + // .sem = { .relaxed, .acquire } + // .scope = { .cta, .cluster, .gpu, .sys } + // .op = { .add } + template + __device__ static inline int32_t multimem_ld_reduce( + cuda::ptx::sem_t sem, + cuda::ptx::scope_t scope, + cuda::ptx::op_add_t, + const int32_t* addr); + +multimem.ld_reduce.acquire.cta.global.add.s32 +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +.. code:: cuda + + // multimem.ld_reduce.sem.scope.global.op.s32 dest, [addr]; // PTX ISA 81, SM_90 + // .sem = { .relaxed, .acquire } + // .scope = { .cta, .cluster, .gpu, .sys } + // .op = { .add } + template + __device__ static inline int32_t multimem_ld_reduce( + cuda::ptx::sem_t sem, + cuda::ptx::scope_t scope, + cuda::ptx::op_add_t, + const int32_t* addr); + +multimem.ld_reduce.acquire.cluster.global.add.s32 +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +.. code:: cuda + + // multimem.ld_reduce.sem.scope.global.op.s32 dest, [addr]; // PTX ISA 81, SM_90 + // .sem = { .relaxed, .acquire } + // .scope = { .cta, .cluster, .gpu, .sys } + // .op = { .add } + template + __device__ static inline int32_t multimem_ld_reduce( + cuda::ptx::sem_t sem, + cuda::ptx::scope_t scope, + cuda::ptx::op_add_t, + const int32_t* addr); + +multimem.ld_reduce.acquire.gpu.global.add.s32 +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +.. code:: cuda + + // multimem.ld_reduce.sem.scope.global.op.s32 dest, [addr]; // PTX ISA 81, SM_90 + // .sem = { .relaxed, .acquire } + // .scope = { .cta, .cluster, .gpu, .sys } + // .op = { .add } + template + __device__ static inline int32_t multimem_ld_reduce( + cuda::ptx::sem_t sem, + cuda::ptx::scope_t scope, + cuda::ptx::op_add_t, + const int32_t* addr); + +multimem.ld_reduce.acquire.sys.global.add.s32 +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +.. 
code:: cuda + + // multimem.ld_reduce.sem.scope.global.op.s32 dest, [addr]; // PTX ISA 81, SM_90 + // .sem = { .relaxed, .acquire } + // .scope = { .cta, .cluster, .gpu, .sys } + // .op = { .add } + template + __device__ static inline int32_t multimem_ld_reduce( + cuda::ptx::sem_t sem, + cuda::ptx::scope_t scope, + cuda::ptx::op_add_t, + const int32_t* addr); + +multimem.ld_reduce.weak.global.add.u64 +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +.. code:: cuda + + // multimem.ld_reduce.sem.global.op.u64 dest, [addr]; // PTX ISA 81, SM_90 + // .sem = { .weak } + // .op = { .add } + template + __device__ static inline int64_t multimem_ld_reduce( + cuda::ptx::sem_weak_t, + cuda::ptx::op_add_t, + const int64_t* addr); + +multimem.ld_reduce.relaxed.cta.global.add.u64 +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +.. code:: cuda + + // multimem.ld_reduce.sem.scope.global.op.u64 dest, [addr]; // PTX ISA 81, SM_90 + // .sem = { .relaxed, .acquire } + // .scope = { .cta, .cluster, .gpu, .sys } + // .op = { .add } + template + __device__ static inline int64_t multimem_ld_reduce( + cuda::ptx::sem_t sem, + cuda::ptx::scope_t scope, + cuda::ptx::op_add_t, + const int64_t* addr); + +multimem.ld_reduce.relaxed.cluster.global.add.u64 +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +.. code:: cuda + + // multimem.ld_reduce.sem.scope.global.op.u64 dest, [addr]; // PTX ISA 81, SM_90 + // .sem = { .relaxed, .acquire } + // .scope = { .cta, .cluster, .gpu, .sys } + // .op = { .add } + template + __device__ static inline int64_t multimem_ld_reduce( + cuda::ptx::sem_t sem, + cuda::ptx::scope_t scope, + cuda::ptx::op_add_t, + const int64_t* addr); + +multimem.ld_reduce.relaxed.gpu.global.add.u64 +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +.. code:: cuda + + // multimem.ld_reduce.sem.scope.global.op.u64 dest, [addr]; // PTX ISA 81, SM_90 + // .sem = { .relaxed, .acquire } + // .scope = { .cta, .cluster, .gpu, .sys } + // .op = { .add } + template + __device__ static inline int64_t multimem_ld_reduce( + cuda::ptx::sem_t sem, + cuda::ptx::scope_t scope, + cuda::ptx::op_add_t, + const int64_t* addr); + +multimem.ld_reduce.relaxed.sys.global.add.u64 +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +.. code:: cuda + + // multimem.ld_reduce.sem.scope.global.op.u64 dest, [addr]; // PTX ISA 81, SM_90 + // .sem = { .relaxed, .acquire } + // .scope = { .cta, .cluster, .gpu, .sys } + // .op = { .add } + template + __device__ static inline int64_t multimem_ld_reduce( + cuda::ptx::sem_t sem, + cuda::ptx::scope_t scope, + cuda::ptx::op_add_t, + const int64_t* addr); + +multimem.ld_reduce.acquire.cta.global.add.u64 +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +.. code:: cuda + + // multimem.ld_reduce.sem.scope.global.op.u64 dest, [addr]; // PTX ISA 81, SM_90 + // .sem = { .relaxed, .acquire } + // .scope = { .cta, .cluster, .gpu, .sys } + // .op = { .add } + template + __device__ static inline int64_t multimem_ld_reduce( + cuda::ptx::sem_t sem, + cuda::ptx::scope_t scope, + cuda::ptx::op_add_t, + const int64_t* addr); + +multimem.ld_reduce.acquire.cluster.global.add.u64 +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +.. 
code:: cuda + + // multimem.ld_reduce.sem.scope.global.op.u64 dest, [addr]; // PTX ISA 81, SM_90 + // .sem = { .relaxed, .acquire } + // .scope = { .cta, .cluster, .gpu, .sys } + // .op = { .add } + template + __device__ static inline int64_t multimem_ld_reduce( + cuda::ptx::sem_t sem, + cuda::ptx::scope_t scope, + cuda::ptx::op_add_t, + const int64_t* addr); + +multimem.ld_reduce.acquire.gpu.global.add.u64 +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +.. code:: cuda + + // multimem.ld_reduce.sem.scope.global.op.u64 dest, [addr]; // PTX ISA 81, SM_90 + // .sem = { .relaxed, .acquire } + // .scope = { .cta, .cluster, .gpu, .sys } + // .op = { .add } + template + __device__ static inline int64_t multimem_ld_reduce( + cuda::ptx::sem_t sem, + cuda::ptx::scope_t scope, + cuda::ptx::op_add_t, + const int64_t* addr); + +multimem.ld_reduce.acquire.sys.global.add.u64 +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +.. code:: cuda + + // multimem.ld_reduce.sem.scope.global.op.u64 dest, [addr]; // PTX ISA 81, SM_90 + // .sem = { .relaxed, .acquire } + // .scope = { .cta, .cluster, .gpu, .sys } + // .op = { .add } + template + __device__ static inline int64_t multimem_ld_reduce( + cuda::ptx::sem_t sem, + cuda::ptx::scope_t scope, + cuda::ptx::op_add_t, + const int64_t* addr); + +multimem.ld_reduce.weak.global.and.b32 +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +.. code:: cuda + + // multimem.ld_reduce.sem.global.op.b32 dest, [addr]; // PTX ISA 81, SM_90 + // .sem = { .weak } + // .op = { .and } + template = true> + __device__ static inline B32 multimem_ld_reduce( + cuda::ptx::sem_weak_t, + cuda::ptx::op_and_op_t, + const B32* addr); + +multimem.ld_reduce.relaxed.cta.global.and.b32 +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +.. code:: cuda + + // multimem.ld_reduce.sem.scope.global.op.b32 dest, [addr]; // PTX ISA 81, SM_90 + // .sem = { .relaxed, .acquire } + // .scope = { .cta, .cluster, .gpu, .sys } + // .op = { .and } + template = true, cuda::ptx::dot_sem Sem, cuda::ptx::dot_scope Scope> + __device__ static inline B32 multimem_ld_reduce( + cuda::ptx::sem_t sem, + cuda::ptx::scope_t scope, + cuda::ptx::op_and_op_t, + const B32* addr); + +multimem.ld_reduce.relaxed.cluster.global.and.b32 +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +.. code:: cuda + + // multimem.ld_reduce.sem.scope.global.op.b32 dest, [addr]; // PTX ISA 81, SM_90 + // .sem = { .relaxed, .acquire } + // .scope = { .cta, .cluster, .gpu, .sys } + // .op = { .and } + template = true, cuda::ptx::dot_sem Sem, cuda::ptx::dot_scope Scope> + __device__ static inline B32 multimem_ld_reduce( + cuda::ptx::sem_t sem, + cuda::ptx::scope_t scope, + cuda::ptx::op_and_op_t, + const B32* addr); + +multimem.ld_reduce.relaxed.gpu.global.and.b32 +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +.. code:: cuda + + // multimem.ld_reduce.sem.scope.global.op.b32 dest, [addr]; // PTX ISA 81, SM_90 + // .sem = { .relaxed, .acquire } + // .scope = { .cta, .cluster, .gpu, .sys } + // .op = { .and } + template = true, cuda::ptx::dot_sem Sem, cuda::ptx::dot_scope Scope> + __device__ static inline B32 multimem_ld_reduce( + cuda::ptx::sem_t sem, + cuda::ptx::scope_t scope, + cuda::ptx::op_and_op_t, + const B32* addr); + +multimem.ld_reduce.relaxed.sys.global.and.b32 +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +.. 
code:: cuda + + // multimem.ld_reduce.sem.scope.global.op.b32 dest, [addr]; // PTX ISA 81, SM_90 + // .sem = { .relaxed, .acquire } + // .scope = { .cta, .cluster, .gpu, .sys } + // .op = { .and } + template = true, cuda::ptx::dot_sem Sem, cuda::ptx::dot_scope Scope> + __device__ static inline B32 multimem_ld_reduce( + cuda::ptx::sem_t sem, + cuda::ptx::scope_t scope, + cuda::ptx::op_and_op_t, + const B32* addr); + +multimem.ld_reduce.acquire.cta.global.and.b32 +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +.. code:: cuda + + // multimem.ld_reduce.sem.scope.global.op.b32 dest, [addr]; // PTX ISA 81, SM_90 + // .sem = { .relaxed, .acquire } + // .scope = { .cta, .cluster, .gpu, .sys } + // .op = { .and } + template = true, cuda::ptx::dot_sem Sem, cuda::ptx::dot_scope Scope> + __device__ static inline B32 multimem_ld_reduce( + cuda::ptx::sem_t sem, + cuda::ptx::scope_t scope, + cuda::ptx::op_and_op_t, + const B32* addr); + +multimem.ld_reduce.acquire.cluster.global.and.b32 +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +.. code:: cuda + + // multimem.ld_reduce.sem.scope.global.op.b32 dest, [addr]; // PTX ISA 81, SM_90 + // .sem = { .relaxed, .acquire } + // .scope = { .cta, .cluster, .gpu, .sys } + // .op = { .and } + template = true, cuda::ptx::dot_sem Sem, cuda::ptx::dot_scope Scope> + __device__ static inline B32 multimem_ld_reduce( + cuda::ptx::sem_t sem, + cuda::ptx::scope_t scope, + cuda::ptx::op_and_op_t, + const B32* addr); + +multimem.ld_reduce.acquire.gpu.global.and.b32 +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +.. code:: cuda + + // multimem.ld_reduce.sem.scope.global.op.b32 dest, [addr]; // PTX ISA 81, SM_90 + // .sem = { .relaxed, .acquire } + // .scope = { .cta, .cluster, .gpu, .sys } + // .op = { .and } + template = true, cuda::ptx::dot_sem Sem, cuda::ptx::dot_scope Scope> + __device__ static inline B32 multimem_ld_reduce( + cuda::ptx::sem_t sem, + cuda::ptx::scope_t scope, + cuda::ptx::op_and_op_t, + const B32* addr); + +multimem.ld_reduce.acquire.sys.global.and.b32 +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +.. code:: cuda + + // multimem.ld_reduce.sem.scope.global.op.b32 dest, [addr]; // PTX ISA 81, SM_90 + // .sem = { .relaxed, .acquire } + // .scope = { .cta, .cluster, .gpu, .sys } + // .op = { .and } + template = true, cuda::ptx::dot_sem Sem, cuda::ptx::dot_scope Scope> + __device__ static inline B32 multimem_ld_reduce( + cuda::ptx::sem_t sem, + cuda::ptx::scope_t scope, + cuda::ptx::op_and_op_t, + const B32* addr); + +multimem.ld_reduce.weak.global.or.b32 +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +.. code:: cuda + + // multimem.ld_reduce.sem.global.op.b32 dest, [addr]; // PTX ISA 81, SM_90 + // .sem = { .weak } + // .op = { .or } + template = true> + __device__ static inline B32 multimem_ld_reduce( + cuda::ptx::sem_weak_t, + cuda::ptx::op_or_op_t, + const B32* addr); + +multimem.ld_reduce.relaxed.cta.global.or.b32 +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +.. code:: cuda + + // multimem.ld_reduce.sem.scope.global.op.b32 dest, [addr]; // PTX ISA 81, SM_90 + // .sem = { .relaxed, .acquire } + // .scope = { .cta, .cluster, .gpu, .sys } + // .op = { .or } + template = true, cuda::ptx::dot_sem Sem, cuda::ptx::dot_scope Scope> + __device__ static inline B32 multimem_ld_reduce( + cuda::ptx::sem_t sem, + cuda::ptx::scope_t scope, + cuda::ptx::op_or_op_t, + const B32* addr); + +multimem.ld_reduce.relaxed.cluster.global.or.b32 +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +.. 
code:: cuda + + // multimem.ld_reduce.sem.scope.global.op.b32 dest, [addr]; // PTX ISA 81, SM_90 + // .sem = { .relaxed, .acquire } + // .scope = { .cta, .cluster, .gpu, .sys } + // .op = { .or } + template = true, cuda::ptx::dot_sem Sem, cuda::ptx::dot_scope Scope> + __device__ static inline B32 multimem_ld_reduce( + cuda::ptx::sem_t sem, + cuda::ptx::scope_t scope, + cuda::ptx::op_or_op_t, + const B32* addr); + +multimem.ld_reduce.relaxed.gpu.global.or.b32 +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +.. code:: cuda + + // multimem.ld_reduce.sem.scope.global.op.b32 dest, [addr]; // PTX ISA 81, SM_90 + // .sem = { .relaxed, .acquire } + // .scope = { .cta, .cluster, .gpu, .sys } + // .op = { .or } + template = true, cuda::ptx::dot_sem Sem, cuda::ptx::dot_scope Scope> + __device__ static inline B32 multimem_ld_reduce( + cuda::ptx::sem_t sem, + cuda::ptx::scope_t scope, + cuda::ptx::op_or_op_t, + const B32* addr); + +multimem.ld_reduce.relaxed.sys.global.or.b32 +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +.. code:: cuda + + // multimem.ld_reduce.sem.scope.global.op.b32 dest, [addr]; // PTX ISA 81, SM_90 + // .sem = { .relaxed, .acquire } + // .scope = { .cta, .cluster, .gpu, .sys } + // .op = { .or } + template = true, cuda::ptx::dot_sem Sem, cuda::ptx::dot_scope Scope> + __device__ static inline B32 multimem_ld_reduce( + cuda::ptx::sem_t sem, + cuda::ptx::scope_t scope, + cuda::ptx::op_or_op_t, + const B32* addr); + +multimem.ld_reduce.acquire.cta.global.or.b32 +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +.. code:: cuda + + // multimem.ld_reduce.sem.scope.global.op.b32 dest, [addr]; // PTX ISA 81, SM_90 + // .sem = { .relaxed, .acquire } + // .scope = { .cta, .cluster, .gpu, .sys } + // .op = { .or } + template = true, cuda::ptx::dot_sem Sem, cuda::ptx::dot_scope Scope> + __device__ static inline B32 multimem_ld_reduce( + cuda::ptx::sem_t sem, + cuda::ptx::scope_t scope, + cuda::ptx::op_or_op_t, + const B32* addr); + +multimem.ld_reduce.acquire.cluster.global.or.b32 +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +.. code:: cuda + + // multimem.ld_reduce.sem.scope.global.op.b32 dest, [addr]; // PTX ISA 81, SM_90 + // .sem = { .relaxed, .acquire } + // .scope = { .cta, .cluster, .gpu, .sys } + // .op = { .or } + template = true, cuda::ptx::dot_sem Sem, cuda::ptx::dot_scope Scope> + __device__ static inline B32 multimem_ld_reduce( + cuda::ptx::sem_t sem, + cuda::ptx::scope_t scope, + cuda::ptx::op_or_op_t, + const B32* addr); + +multimem.ld_reduce.acquire.gpu.global.or.b32 +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +.. code:: cuda + + // multimem.ld_reduce.sem.scope.global.op.b32 dest, [addr]; // PTX ISA 81, SM_90 + // .sem = { .relaxed, .acquire } + // .scope = { .cta, .cluster, .gpu, .sys } + // .op = { .or } + template = true, cuda::ptx::dot_sem Sem, cuda::ptx::dot_scope Scope> + __device__ static inline B32 multimem_ld_reduce( + cuda::ptx::sem_t sem, + cuda::ptx::scope_t scope, + cuda::ptx::op_or_op_t, + const B32* addr); + +multimem.ld_reduce.acquire.sys.global.or.b32 +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +.. 
code:: cuda + + // multimem.ld_reduce.sem.scope.global.op.b32 dest, [addr]; // PTX ISA 81, SM_90 + // .sem = { .relaxed, .acquire } + // .scope = { .cta, .cluster, .gpu, .sys } + // .op = { .or } + template = true, cuda::ptx::dot_sem Sem, cuda::ptx::dot_scope Scope> + __device__ static inline B32 multimem_ld_reduce( + cuda::ptx::sem_t sem, + cuda::ptx::scope_t scope, + cuda::ptx::op_or_op_t, + const B32* addr); + +multimem.ld_reduce.weak.global.xor.b32 +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +.. code:: cuda + + // multimem.ld_reduce.sem.global.op.b32 dest, [addr]; // PTX ISA 81, SM_90 + // .sem = { .weak } + // .op = { .xor } + template = true> + __device__ static inline B32 multimem_ld_reduce( + cuda::ptx::sem_weak_t, + cuda::ptx::op_xor_op_t, + const B32* addr); + +multimem.ld_reduce.relaxed.cta.global.xor.b32 +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +.. code:: cuda + + // multimem.ld_reduce.sem.scope.global.op.b32 dest, [addr]; // PTX ISA 81, SM_90 + // .sem = { .relaxed, .acquire } + // .scope = { .cta, .cluster, .gpu, .sys } + // .op = { .xor } + template = true, cuda::ptx::dot_sem Sem, cuda::ptx::dot_scope Scope> + __device__ static inline B32 multimem_ld_reduce( + cuda::ptx::sem_t sem, + cuda::ptx::scope_t scope, + cuda::ptx::op_xor_op_t, + const B32* addr); + +multimem.ld_reduce.relaxed.cluster.global.xor.b32 +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +.. code:: cuda + + // multimem.ld_reduce.sem.scope.global.op.b32 dest, [addr]; // PTX ISA 81, SM_90 + // .sem = { .relaxed, .acquire } + // .scope = { .cta, .cluster, .gpu, .sys } + // .op = { .xor } + template = true, cuda::ptx::dot_sem Sem, cuda::ptx::dot_scope Scope> + __device__ static inline B32 multimem_ld_reduce( + cuda::ptx::sem_t sem, + cuda::ptx::scope_t scope, + cuda::ptx::op_xor_op_t, + const B32* addr); + +multimem.ld_reduce.relaxed.gpu.global.xor.b32 +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +.. code:: cuda + + // multimem.ld_reduce.sem.scope.global.op.b32 dest, [addr]; // PTX ISA 81, SM_90 + // .sem = { .relaxed, .acquire } + // .scope = { .cta, .cluster, .gpu, .sys } + // .op = { .xor } + template = true, cuda::ptx::dot_sem Sem, cuda::ptx::dot_scope Scope> + __device__ static inline B32 multimem_ld_reduce( + cuda::ptx::sem_t sem, + cuda::ptx::scope_t scope, + cuda::ptx::op_xor_op_t, + const B32* addr); + +multimem.ld_reduce.relaxed.sys.global.xor.b32 +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +.. code:: cuda + + // multimem.ld_reduce.sem.scope.global.op.b32 dest, [addr]; // PTX ISA 81, SM_90 + // .sem = { .relaxed, .acquire } + // .scope = { .cta, .cluster, .gpu, .sys } + // .op = { .xor } + template = true, cuda::ptx::dot_sem Sem, cuda::ptx::dot_scope Scope> + __device__ static inline B32 multimem_ld_reduce( + cuda::ptx::sem_t sem, + cuda::ptx::scope_t scope, + cuda::ptx::op_xor_op_t, + const B32* addr); + +multimem.ld_reduce.acquire.cta.global.xor.b32 +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +.. code:: cuda + + // multimem.ld_reduce.sem.scope.global.op.b32 dest, [addr]; // PTX ISA 81, SM_90 + // .sem = { .relaxed, .acquire } + // .scope = { .cta, .cluster, .gpu, .sys } + // .op = { .xor } + template = true, cuda::ptx::dot_sem Sem, cuda::ptx::dot_scope Scope> + __device__ static inline B32 multimem_ld_reduce( + cuda::ptx::sem_t sem, + cuda::ptx::scope_t scope, + cuda::ptx::op_xor_op_t, + const B32* addr); + +multimem.ld_reduce.acquire.cluster.global.xor.b32 +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +.. 
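+The ``.and``/``.or``/``.xor`` overloads are templated on any 32-bit (or 64-bit) type;
+a hand-written sketch using ``uint32_t`` (same assumptions as above):
+
+.. code:: cuda
+
+   #include <cuda/ptx>
+
+   // Hypothetical helper: bitwise AND of the 32-bit flag words in all replicas of mc_ptr.
+   __device__ uint32_t multicast_and(const uint32_t* mc_ptr)
+   {
+     namespace ptx = cuda::ptx;
+     // Emits multimem.ld_reduce.relaxed.gpu.global.and.b32
+     return ptx::multimem_ld_reduce(ptx::sem_relaxed, ptx::scope_gpu, ptx::op_and_op, mc_ptr);
+   }
+
+..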
code:: cuda + + // multimem.ld_reduce.sem.scope.global.op.b32 dest, [addr]; // PTX ISA 81, SM_90 + // .sem = { .relaxed, .acquire } + // .scope = { .cta, .cluster, .gpu, .sys } + // .op = { .xor } + template = true, cuda::ptx::dot_sem Sem, cuda::ptx::dot_scope Scope> + __device__ static inline B32 multimem_ld_reduce( + cuda::ptx::sem_t sem, + cuda::ptx::scope_t scope, + cuda::ptx::op_xor_op_t, + const B32* addr); + +multimem.ld_reduce.acquire.gpu.global.xor.b32 +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +.. code:: cuda + + // multimem.ld_reduce.sem.scope.global.op.b32 dest, [addr]; // PTX ISA 81, SM_90 + // .sem = { .relaxed, .acquire } + // .scope = { .cta, .cluster, .gpu, .sys } + // .op = { .xor } + template = true, cuda::ptx::dot_sem Sem, cuda::ptx::dot_scope Scope> + __device__ static inline B32 multimem_ld_reduce( + cuda::ptx::sem_t sem, + cuda::ptx::scope_t scope, + cuda::ptx::op_xor_op_t, + const B32* addr); + +multimem.ld_reduce.acquire.sys.global.xor.b32 +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +.. code:: cuda + + // multimem.ld_reduce.sem.scope.global.op.b32 dest, [addr]; // PTX ISA 81, SM_90 + // .sem = { .relaxed, .acquire } + // .scope = { .cta, .cluster, .gpu, .sys } + // .op = { .xor } + template = true, cuda::ptx::dot_sem Sem, cuda::ptx::dot_scope Scope> + __device__ static inline B32 multimem_ld_reduce( + cuda::ptx::sem_t sem, + cuda::ptx::scope_t scope, + cuda::ptx::op_xor_op_t, + const B32* addr); + +multimem.ld_reduce.weak.global.and.b64 +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +.. code:: cuda + + // multimem.ld_reduce.sem.global.op.b64 dest, [addr]; // PTX ISA 81, SM_90 + // .sem = { .weak } + // .op = { .and } + template = true> + __device__ static inline B64 multimem_ld_reduce( + cuda::ptx::sem_weak_t, + cuda::ptx::op_and_op_t, + const B64* addr); + +multimem.ld_reduce.relaxed.cta.global.and.b64 +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +.. code:: cuda + + // multimem.ld_reduce.sem.scope.global.op.b64 dest, [addr]; // PTX ISA 81, SM_90 + // .sem = { .relaxed, .acquire } + // .scope = { .cta, .cluster, .gpu, .sys } + // .op = { .and } + template = true, cuda::ptx::dot_sem Sem, cuda::ptx::dot_scope Scope> + __device__ static inline B64 multimem_ld_reduce( + cuda::ptx::sem_t sem, + cuda::ptx::scope_t scope, + cuda::ptx::op_and_op_t, + const B64* addr); + +multimem.ld_reduce.relaxed.cluster.global.and.b64 +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +.. code:: cuda + + // multimem.ld_reduce.sem.scope.global.op.b64 dest, [addr]; // PTX ISA 81, SM_90 + // .sem = { .relaxed, .acquire } + // .scope = { .cta, .cluster, .gpu, .sys } + // .op = { .and } + template = true, cuda::ptx::dot_sem Sem, cuda::ptx::dot_scope Scope> + __device__ static inline B64 multimem_ld_reduce( + cuda::ptx::sem_t sem, + cuda::ptx::scope_t scope, + cuda::ptx::op_and_op_t, + const B64* addr); + +multimem.ld_reduce.relaxed.gpu.global.and.b64 +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +.. code:: cuda + + // multimem.ld_reduce.sem.scope.global.op.b64 dest, [addr]; // PTX ISA 81, SM_90 + // .sem = { .relaxed, .acquire } + // .scope = { .cta, .cluster, .gpu, .sys } + // .op = { .and } + template = true, cuda::ptx::dot_sem Sem, cuda::ptx::dot_scope Scope> + __device__ static inline B64 multimem_ld_reduce( + cuda::ptx::sem_t sem, + cuda::ptx::scope_t scope, + cuda::ptx::op_and_op_t, + const B64* addr); + +multimem.ld_reduce.relaxed.sys.global.and.b64 +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +.. 
code:: cuda + + // multimem.ld_reduce.sem.scope.global.op.b64 dest, [addr]; // PTX ISA 81, SM_90 + // .sem = { .relaxed, .acquire } + // .scope = { .cta, .cluster, .gpu, .sys } + // .op = { .and } + template = true, cuda::ptx::dot_sem Sem, cuda::ptx::dot_scope Scope> + __device__ static inline B64 multimem_ld_reduce( + cuda::ptx::sem_t sem, + cuda::ptx::scope_t scope, + cuda::ptx::op_and_op_t, + const B64* addr); + +multimem.ld_reduce.acquire.cta.global.and.b64 +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +.. code:: cuda + + // multimem.ld_reduce.sem.scope.global.op.b64 dest, [addr]; // PTX ISA 81, SM_90 + // .sem = { .relaxed, .acquire } + // .scope = { .cta, .cluster, .gpu, .sys } + // .op = { .and } + template = true, cuda::ptx::dot_sem Sem, cuda::ptx::dot_scope Scope> + __device__ static inline B64 multimem_ld_reduce( + cuda::ptx::sem_t sem, + cuda::ptx::scope_t scope, + cuda::ptx::op_and_op_t, + const B64* addr); + +multimem.ld_reduce.acquire.cluster.global.and.b64 +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +.. code:: cuda + + // multimem.ld_reduce.sem.scope.global.op.b64 dest, [addr]; // PTX ISA 81, SM_90 + // .sem = { .relaxed, .acquire } + // .scope = { .cta, .cluster, .gpu, .sys } + // .op = { .and } + template = true, cuda::ptx::dot_sem Sem, cuda::ptx::dot_scope Scope> + __device__ static inline B64 multimem_ld_reduce( + cuda::ptx::sem_t sem, + cuda::ptx::scope_t scope, + cuda::ptx::op_and_op_t, + const B64* addr); + +multimem.ld_reduce.acquire.gpu.global.and.b64 +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +.. code:: cuda + + // multimem.ld_reduce.sem.scope.global.op.b64 dest, [addr]; // PTX ISA 81, SM_90 + // .sem = { .relaxed, .acquire } + // .scope = { .cta, .cluster, .gpu, .sys } + // .op = { .and } + template = true, cuda::ptx::dot_sem Sem, cuda::ptx::dot_scope Scope> + __device__ static inline B64 multimem_ld_reduce( + cuda::ptx::sem_t sem, + cuda::ptx::scope_t scope, + cuda::ptx::op_and_op_t, + const B64* addr); + +multimem.ld_reduce.acquire.sys.global.and.b64 +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +.. code:: cuda + + // multimem.ld_reduce.sem.scope.global.op.b64 dest, [addr]; // PTX ISA 81, SM_90 + // .sem = { .relaxed, .acquire } + // .scope = { .cta, .cluster, .gpu, .sys } + // .op = { .and } + template = true, cuda::ptx::dot_sem Sem, cuda::ptx::dot_scope Scope> + __device__ static inline B64 multimem_ld_reduce( + cuda::ptx::sem_t sem, + cuda::ptx::scope_t scope, + cuda::ptx::op_and_op_t, + const B64* addr); + +multimem.ld_reduce.weak.global.or.b64 +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +.. code:: cuda + + // multimem.ld_reduce.sem.global.op.b64 dest, [addr]; // PTX ISA 81, SM_90 + // .sem = { .weak } + // .op = { .or } + template = true> + __device__ static inline B64 multimem_ld_reduce( + cuda::ptx::sem_weak_t, + cuda::ptx::op_or_op_t, + const B64* addr); + +multimem.ld_reduce.relaxed.cta.global.or.b64 +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +.. code:: cuda + + // multimem.ld_reduce.sem.scope.global.op.b64 dest, [addr]; // PTX ISA 81, SM_90 + // .sem = { .relaxed, .acquire } + // .scope = { .cta, .cluster, .gpu, .sys } + // .op = { .or } + template = true, cuda::ptx::dot_sem Sem, cuda::ptx::dot_scope Scope> + __device__ static inline B64 multimem_ld_reduce( + cuda::ptx::sem_t sem, + cuda::ptx::scope_t scope, + cuda::ptx::op_or_op_t, + const B64* addr); + +multimem.ld_reduce.relaxed.cluster.global.or.b64 +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +.. 
code:: cuda + + // multimem.ld_reduce.sem.scope.global.op.b64 dest, [addr]; // PTX ISA 81, SM_90 + // .sem = { .relaxed, .acquire } + // .scope = { .cta, .cluster, .gpu, .sys } + // .op = { .or } + template = true, cuda::ptx::dot_sem Sem, cuda::ptx::dot_scope Scope> + __device__ static inline B64 multimem_ld_reduce( + cuda::ptx::sem_t sem, + cuda::ptx::scope_t scope, + cuda::ptx::op_or_op_t, + const B64* addr); + +multimem.ld_reduce.relaxed.gpu.global.or.b64 +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +.. code:: cuda + + // multimem.ld_reduce.sem.scope.global.op.b64 dest, [addr]; // PTX ISA 81, SM_90 + // .sem = { .relaxed, .acquire } + // .scope = { .cta, .cluster, .gpu, .sys } + // .op = { .or } + template = true, cuda::ptx::dot_sem Sem, cuda::ptx::dot_scope Scope> + __device__ static inline B64 multimem_ld_reduce( + cuda::ptx::sem_t sem, + cuda::ptx::scope_t scope, + cuda::ptx::op_or_op_t, + const B64* addr); + +multimem.ld_reduce.relaxed.sys.global.or.b64 +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +.. code:: cuda + + // multimem.ld_reduce.sem.scope.global.op.b64 dest, [addr]; // PTX ISA 81, SM_90 + // .sem = { .relaxed, .acquire } + // .scope = { .cta, .cluster, .gpu, .sys } + // .op = { .or } + template = true, cuda::ptx::dot_sem Sem, cuda::ptx::dot_scope Scope> + __device__ static inline B64 multimem_ld_reduce( + cuda::ptx::sem_t sem, + cuda::ptx::scope_t scope, + cuda::ptx::op_or_op_t, + const B64* addr); + +multimem.ld_reduce.acquire.cta.global.or.b64 +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +.. code:: cuda + + // multimem.ld_reduce.sem.scope.global.op.b64 dest, [addr]; // PTX ISA 81, SM_90 + // .sem = { .relaxed, .acquire } + // .scope = { .cta, .cluster, .gpu, .sys } + // .op = { .or } + template = true, cuda::ptx::dot_sem Sem, cuda::ptx::dot_scope Scope> + __device__ static inline B64 multimem_ld_reduce( + cuda::ptx::sem_t sem, + cuda::ptx::scope_t scope, + cuda::ptx::op_or_op_t, + const B64* addr); + +multimem.ld_reduce.acquire.cluster.global.or.b64 +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +.. code:: cuda + + // multimem.ld_reduce.sem.scope.global.op.b64 dest, [addr]; // PTX ISA 81, SM_90 + // .sem = { .relaxed, .acquire } + // .scope = { .cta, .cluster, .gpu, .sys } + // .op = { .or } + template = true, cuda::ptx::dot_sem Sem, cuda::ptx::dot_scope Scope> + __device__ static inline B64 multimem_ld_reduce( + cuda::ptx::sem_t sem, + cuda::ptx::scope_t scope, + cuda::ptx::op_or_op_t, + const B64* addr); + +multimem.ld_reduce.acquire.gpu.global.or.b64 +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +.. code:: cuda + + // multimem.ld_reduce.sem.scope.global.op.b64 dest, [addr]; // PTX ISA 81, SM_90 + // .sem = { .relaxed, .acquire } + // .scope = { .cta, .cluster, .gpu, .sys } + // .op = { .or } + template = true, cuda::ptx::dot_sem Sem, cuda::ptx::dot_scope Scope> + __device__ static inline B64 multimem_ld_reduce( + cuda::ptx::sem_t sem, + cuda::ptx::scope_t scope, + cuda::ptx::op_or_op_t, + const B64* addr); + +multimem.ld_reduce.acquire.sys.global.or.b64 +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +.. 
code:: cuda + + // multimem.ld_reduce.sem.scope.global.op.b64 dest, [addr]; // PTX ISA 81, SM_90 + // .sem = { .relaxed, .acquire } + // .scope = { .cta, .cluster, .gpu, .sys } + // .op = { .or } + template = true, cuda::ptx::dot_sem Sem, cuda::ptx::dot_scope Scope> + __device__ static inline B64 multimem_ld_reduce( + cuda::ptx::sem_t sem, + cuda::ptx::scope_t scope, + cuda::ptx::op_or_op_t, + const B64* addr); + +multimem.ld_reduce.weak.global.xor.b64 +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +.. code:: cuda + + // multimem.ld_reduce.sem.global.op.b64 dest, [addr]; // PTX ISA 81, SM_90 + // .sem = { .weak } + // .op = { .xor } + template = true> + __device__ static inline B64 multimem_ld_reduce( + cuda::ptx::sem_weak_t, + cuda::ptx::op_xor_op_t, + const B64* addr); + +multimem.ld_reduce.relaxed.cta.global.xor.b64 +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +.. code:: cuda + + // multimem.ld_reduce.sem.scope.global.op.b64 dest, [addr]; // PTX ISA 81, SM_90 + // .sem = { .relaxed, .acquire } + // .scope = { .cta, .cluster, .gpu, .sys } + // .op = { .xor } + template = true, cuda::ptx::dot_sem Sem, cuda::ptx::dot_scope Scope> + __device__ static inline B64 multimem_ld_reduce( + cuda::ptx::sem_t sem, + cuda::ptx::scope_t scope, + cuda::ptx::op_xor_op_t, + const B64* addr); + +multimem.ld_reduce.relaxed.cluster.global.xor.b64 +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +.. code:: cuda + + // multimem.ld_reduce.sem.scope.global.op.b64 dest, [addr]; // PTX ISA 81, SM_90 + // .sem = { .relaxed, .acquire } + // .scope = { .cta, .cluster, .gpu, .sys } + // .op = { .xor } + template = true, cuda::ptx::dot_sem Sem, cuda::ptx::dot_scope Scope> + __device__ static inline B64 multimem_ld_reduce( + cuda::ptx::sem_t sem, + cuda::ptx::scope_t scope, + cuda::ptx::op_xor_op_t, + const B64* addr); + +multimem.ld_reduce.relaxed.gpu.global.xor.b64 +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +.. code:: cuda + + // multimem.ld_reduce.sem.scope.global.op.b64 dest, [addr]; // PTX ISA 81, SM_90 + // .sem = { .relaxed, .acquire } + // .scope = { .cta, .cluster, .gpu, .sys } + // .op = { .xor } + template = true, cuda::ptx::dot_sem Sem, cuda::ptx::dot_scope Scope> + __device__ static inline B64 multimem_ld_reduce( + cuda::ptx::sem_t sem, + cuda::ptx::scope_t scope, + cuda::ptx::op_xor_op_t, + const B64* addr); + +multimem.ld_reduce.relaxed.sys.global.xor.b64 +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +.. code:: cuda + + // multimem.ld_reduce.sem.scope.global.op.b64 dest, [addr]; // PTX ISA 81, SM_90 + // .sem = { .relaxed, .acquire } + // .scope = { .cta, .cluster, .gpu, .sys } + // .op = { .xor } + template = true, cuda::ptx::dot_sem Sem, cuda::ptx::dot_scope Scope> + __device__ static inline B64 multimem_ld_reduce( + cuda::ptx::sem_t sem, + cuda::ptx::scope_t scope, + cuda::ptx::op_xor_op_t, + const B64* addr); + +multimem.ld_reduce.acquire.cta.global.xor.b64 +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +.. code:: cuda + + // multimem.ld_reduce.sem.scope.global.op.b64 dest, [addr]; // PTX ISA 81, SM_90 + // .sem = { .relaxed, .acquire } + // .scope = { .cta, .cluster, .gpu, .sys } + // .op = { .xor } + template = true, cuda::ptx::dot_sem Sem, cuda::ptx::dot_scope Scope> + __device__ static inline B64 multimem_ld_reduce( + cuda::ptx::sem_t sem, + cuda::ptx::scope_t scope, + cuda::ptx::op_xor_op_t, + const B64* addr); + +multimem.ld_reduce.acquire.cluster.global.xor.b64 +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +.. 
code:: cuda + + // multimem.ld_reduce.sem.scope.global.op.b64 dest, [addr]; // PTX ISA 81, SM_90 + // .sem = { .relaxed, .acquire } + // .scope = { .cta, .cluster, .gpu, .sys } + // .op = { .xor } + template = true, cuda::ptx::dot_sem Sem, cuda::ptx::dot_scope Scope> + __device__ static inline B64 multimem_ld_reduce( + cuda::ptx::sem_t sem, + cuda::ptx::scope_t scope, + cuda::ptx::op_xor_op_t, + const B64* addr); + +multimem.ld_reduce.acquire.gpu.global.xor.b64 +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +.. code:: cuda + + // multimem.ld_reduce.sem.scope.global.op.b64 dest, [addr]; // PTX ISA 81, SM_90 + // .sem = { .relaxed, .acquire } + // .scope = { .cta, .cluster, .gpu, .sys } + // .op = { .xor } + template = true, cuda::ptx::dot_sem Sem, cuda::ptx::dot_scope Scope> + __device__ static inline B64 multimem_ld_reduce( + cuda::ptx::sem_t sem, + cuda::ptx::scope_t scope, + cuda::ptx::op_xor_op_t, + const B64* addr); + +multimem.ld_reduce.acquire.sys.global.xor.b64 +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +.. code:: cuda + + // multimem.ld_reduce.sem.scope.global.op.b64 dest, [addr]; // PTX ISA 81, SM_90 + // .sem = { .relaxed, .acquire } + // .scope = { .cta, .cluster, .gpu, .sys } + // .op = { .xor } + template = true, cuda::ptx::dot_sem Sem, cuda::ptx::dot_scope Scope> + __device__ static inline B64 multimem_ld_reduce( + cuda::ptx::sem_t sem, + cuda::ptx::scope_t scope, + cuda::ptx::op_xor_op_t, + const B64* addr); diff --git a/docs/libcudacxx/ptx/instructions/generated/multimem_red.rst b/docs/libcudacxx/ptx/instructions/generated/multimem_red.rst new file mode 100644 index 00000000000..095efaef45c --- /dev/null +++ b/docs/libcudacxx/ptx/instructions/generated/multimem_red.rst @@ -0,0 +1,2306 @@ +.. + This file was automatically generated. Do not edit. + +multimem.red.relaxed.cta.global.min.u32 +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +.. code:: cuda + + // multimem.red.sem.scope.global.op.u32 [addr], val; // PTX ISA 81, SM_90 + // .sem = { .relaxed, .release } + // .scope = { .cta, .cluster, .gpu, .sys } + // .op = { .min } + template + __device__ static inline void multimem_red( + cuda::ptx::sem_t sem, + cuda::ptx::scope_t scope, + cuda::ptx::op_min_t, + uint32_t* addr, + uint32_t val); + +multimem.red.relaxed.cluster.global.min.u32 +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +.. code:: cuda + + // multimem.red.sem.scope.global.op.u32 [addr], val; // PTX ISA 81, SM_90 + // .sem = { .relaxed, .release } + // .scope = { .cta, .cluster, .gpu, .sys } + // .op = { .min } + template + __device__ static inline void multimem_red( + cuda::ptx::sem_t sem, + cuda::ptx::scope_t scope, + cuda::ptx::op_min_t, + uint32_t* addr, + uint32_t val); + +multimem.red.relaxed.gpu.global.min.u32 +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +.. code:: cuda + + // multimem.red.sem.scope.global.op.u32 [addr], val; // PTX ISA 81, SM_90 + // .sem = { .relaxed, .release } + // .scope = { .cta, .cluster, .gpu, .sys } + // .op = { .min } + template + __device__ static inline void multimem_red( + cuda::ptx::sem_t sem, + cuda::ptx::scope_t scope, + cuda::ptx::op_min_t, + uint32_t* addr, + uint32_t val); + +multimem.red.relaxed.sys.global.min.u32 +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +.. 
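The ``multimem.ld_reduce`` wrappers listed above all share the same call shape: a semantics tag, a scope tag, an operation tag, and a pointer that must be a multimem (multicast) address. A minimal usage sketch follows, assuming such an address has already been created elsewhere (for example through the CUDA multicast/VMM driver APIs, not shown here) and assuming the usual ``cuda::ptx`` tag objects (``sem_acquire``, ``scope_gpu``, ``op_and_op``); the kernel and variable names are illustrative only.

.. code:: cuda

   #include <cuda/ptx>
   #include <cstdint>

   // 'mc_flags' is assumed to be a multimem (multicast) address mapping to one
   // 64-bit flag word on each participating GPU; a plain device pointer is not
   // a valid operand for multimem instructions.
   __global__ void poll_ready_flags(const std::uint64_t* mc_flags, std::uint64_t* out)
   {
   #if __CUDA_ARCH__ >= 900 // SM_90, PTX ISA 8.1, as in the listings above
     // Load and bitwise-AND the flag word from every GPU reachable through the
     // multimem address; .acquire orders subsequent accesses after the load.
     *out = cuda::ptx::multimem_ld_reduce(
       cuda::ptx::sem_acquire, cuda::ptx::scope_gpu, cuda::ptx::op_and_op, mc_flags);
   #endif
   }

The ``.weak`` overloads shown earlier take no scope tag; they are the same call with ``cuda::ptx::sem_weak`` and the scope argument dropped.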
code:: cuda + + // multimem.red.sem.scope.global.op.u32 [addr], val; // PTX ISA 81, SM_90 + // .sem = { .relaxed, .release } + // .scope = { .cta, .cluster, .gpu, .sys } + // .op = { .min } + template + __device__ static inline void multimem_red( + cuda::ptx::sem_t sem, + cuda::ptx::scope_t scope, + cuda::ptx::op_min_t, + uint32_t* addr, + uint32_t val); + +multimem.red.release.cta.global.min.u32 +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +.. code:: cuda + + // multimem.red.sem.scope.global.op.u32 [addr], val; // PTX ISA 81, SM_90 + // .sem = { .relaxed, .release } + // .scope = { .cta, .cluster, .gpu, .sys } + // .op = { .min } + template + __device__ static inline void multimem_red( + cuda::ptx::sem_t sem, + cuda::ptx::scope_t scope, + cuda::ptx::op_min_t, + uint32_t* addr, + uint32_t val); + +multimem.red.release.cluster.global.min.u32 +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +.. code:: cuda + + // multimem.red.sem.scope.global.op.u32 [addr], val; // PTX ISA 81, SM_90 + // .sem = { .relaxed, .release } + // .scope = { .cta, .cluster, .gpu, .sys } + // .op = { .min } + template + __device__ static inline void multimem_red( + cuda::ptx::sem_t sem, + cuda::ptx::scope_t scope, + cuda::ptx::op_min_t, + uint32_t* addr, + uint32_t val); + +multimem.red.release.gpu.global.min.u32 +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +.. code:: cuda + + // multimem.red.sem.scope.global.op.u32 [addr], val; // PTX ISA 81, SM_90 + // .sem = { .relaxed, .release } + // .scope = { .cta, .cluster, .gpu, .sys } + // .op = { .min } + template + __device__ static inline void multimem_red( + cuda::ptx::sem_t sem, + cuda::ptx::scope_t scope, + cuda::ptx::op_min_t, + uint32_t* addr, + uint32_t val); + +multimem.red.release.sys.global.min.u32 +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +.. code:: cuda + + // multimem.red.sem.scope.global.op.u32 [addr], val; // PTX ISA 81, SM_90 + // .sem = { .relaxed, .release } + // .scope = { .cta, .cluster, .gpu, .sys } + // .op = { .min } + template + __device__ static inline void multimem_red( + cuda::ptx::sem_t sem, + cuda::ptx::scope_t scope, + cuda::ptx::op_min_t, + uint32_t* addr, + uint32_t val); + +multimem.red.relaxed.cta.global.min.u64 +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +.. code:: cuda + + // multimem.red.sem.scope.global.op.u64 [addr], val; // PTX ISA 81, SM_90 + // .sem = { .relaxed, .release } + // .scope = { .cta, .cluster, .gpu, .sys } + // .op = { .min } + template + __device__ static inline void multimem_red( + cuda::ptx::sem_t sem, + cuda::ptx::scope_t scope, + cuda::ptx::op_min_t, + uint64_t* addr, + uint64_t val); + +multimem.red.relaxed.cluster.global.min.u64 +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +.. code:: cuda + + // multimem.red.sem.scope.global.op.u64 [addr], val; // PTX ISA 81, SM_90 + // .sem = { .relaxed, .release } + // .scope = { .cta, .cluster, .gpu, .sys } + // .op = { .min } + template + __device__ static inline void multimem_red( + cuda::ptx::sem_t sem, + cuda::ptx::scope_t scope, + cuda::ptx::op_min_t, + uint64_t* addr, + uint64_t val); + +multimem.red.relaxed.gpu.global.min.u64 +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +.. 
code:: cuda + + // multimem.red.sem.scope.global.op.u64 [addr], val; // PTX ISA 81, SM_90 + // .sem = { .relaxed, .release } + // .scope = { .cta, .cluster, .gpu, .sys } + // .op = { .min } + template + __device__ static inline void multimem_red( + cuda::ptx::sem_t sem, + cuda::ptx::scope_t scope, + cuda::ptx::op_min_t, + uint64_t* addr, + uint64_t val); + +multimem.red.relaxed.sys.global.min.u64 +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +.. code:: cuda + + // multimem.red.sem.scope.global.op.u64 [addr], val; // PTX ISA 81, SM_90 + // .sem = { .relaxed, .release } + // .scope = { .cta, .cluster, .gpu, .sys } + // .op = { .min } + template + __device__ static inline void multimem_red( + cuda::ptx::sem_t sem, + cuda::ptx::scope_t scope, + cuda::ptx::op_min_t, + uint64_t* addr, + uint64_t val); + +multimem.red.release.cta.global.min.u64 +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +.. code:: cuda + + // multimem.red.sem.scope.global.op.u64 [addr], val; // PTX ISA 81, SM_90 + // .sem = { .relaxed, .release } + // .scope = { .cta, .cluster, .gpu, .sys } + // .op = { .min } + template + __device__ static inline void multimem_red( + cuda::ptx::sem_t sem, + cuda::ptx::scope_t scope, + cuda::ptx::op_min_t, + uint64_t* addr, + uint64_t val); + +multimem.red.release.cluster.global.min.u64 +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +.. code:: cuda + + // multimem.red.sem.scope.global.op.u64 [addr], val; // PTX ISA 81, SM_90 + // .sem = { .relaxed, .release } + // .scope = { .cta, .cluster, .gpu, .sys } + // .op = { .min } + template + __device__ static inline void multimem_red( + cuda::ptx::sem_t sem, + cuda::ptx::scope_t scope, + cuda::ptx::op_min_t, + uint64_t* addr, + uint64_t val); + +multimem.red.release.gpu.global.min.u64 +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +.. code:: cuda + + // multimem.red.sem.scope.global.op.u64 [addr], val; // PTX ISA 81, SM_90 + // .sem = { .relaxed, .release } + // .scope = { .cta, .cluster, .gpu, .sys } + // .op = { .min } + template + __device__ static inline void multimem_red( + cuda::ptx::sem_t sem, + cuda::ptx::scope_t scope, + cuda::ptx::op_min_t, + uint64_t* addr, + uint64_t val); + +multimem.red.release.sys.global.min.u64 +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +.. code:: cuda + + // multimem.red.sem.scope.global.op.u64 [addr], val; // PTX ISA 81, SM_90 + // .sem = { .relaxed, .release } + // .scope = { .cta, .cluster, .gpu, .sys } + // .op = { .min } + template + __device__ static inline void multimem_red( + cuda::ptx::sem_t sem, + cuda::ptx::scope_t scope, + cuda::ptx::op_min_t, + uint64_t* addr, + uint64_t val); + +multimem.red.relaxed.cta.global.min.s32 +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +.. code:: cuda + + // multimem.red.sem.scope.global.op.s32 [addr], val; // PTX ISA 81, SM_90 + // .sem = { .relaxed, .release } + // .scope = { .cta, .cluster, .gpu, .sys } + // .op = { .min } + template + __device__ static inline void multimem_red( + cuda::ptx::sem_t sem, + cuda::ptx::scope_t scope, + cuda::ptx::op_min_t, + int32_t* addr, + int32_t val); + +multimem.red.relaxed.cluster.global.min.s32 +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +.. 
code:: cuda + + // multimem.red.sem.scope.global.op.s32 [addr], val; // PTX ISA 81, SM_90 + // .sem = { .relaxed, .release } + // .scope = { .cta, .cluster, .gpu, .sys } + // .op = { .min } + template + __device__ static inline void multimem_red( + cuda::ptx::sem_t sem, + cuda::ptx::scope_t scope, + cuda::ptx::op_min_t, + int32_t* addr, + int32_t val); + +multimem.red.relaxed.gpu.global.min.s32 +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +.. code:: cuda + + // multimem.red.sem.scope.global.op.s32 [addr], val; // PTX ISA 81, SM_90 + // .sem = { .relaxed, .release } + // .scope = { .cta, .cluster, .gpu, .sys } + // .op = { .min } + template + __device__ static inline void multimem_red( + cuda::ptx::sem_t sem, + cuda::ptx::scope_t scope, + cuda::ptx::op_min_t, + int32_t* addr, + int32_t val); + +multimem.red.relaxed.sys.global.min.s32 +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +.. code:: cuda + + // multimem.red.sem.scope.global.op.s32 [addr], val; // PTX ISA 81, SM_90 + // .sem = { .relaxed, .release } + // .scope = { .cta, .cluster, .gpu, .sys } + // .op = { .min } + template + __device__ static inline void multimem_red( + cuda::ptx::sem_t sem, + cuda::ptx::scope_t scope, + cuda::ptx::op_min_t, + int32_t* addr, + int32_t val); + +multimem.red.release.cta.global.min.s32 +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +.. code:: cuda + + // multimem.red.sem.scope.global.op.s32 [addr], val; // PTX ISA 81, SM_90 + // .sem = { .relaxed, .release } + // .scope = { .cta, .cluster, .gpu, .sys } + // .op = { .min } + template + __device__ static inline void multimem_red( + cuda::ptx::sem_t sem, + cuda::ptx::scope_t scope, + cuda::ptx::op_min_t, + int32_t* addr, + int32_t val); + +multimem.red.release.cluster.global.min.s32 +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +.. code:: cuda + + // multimem.red.sem.scope.global.op.s32 [addr], val; // PTX ISA 81, SM_90 + // .sem = { .relaxed, .release } + // .scope = { .cta, .cluster, .gpu, .sys } + // .op = { .min } + template + __device__ static inline void multimem_red( + cuda::ptx::sem_t sem, + cuda::ptx::scope_t scope, + cuda::ptx::op_min_t, + int32_t* addr, + int32_t val); + +multimem.red.release.gpu.global.min.s32 +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +.. code:: cuda + + // multimem.red.sem.scope.global.op.s32 [addr], val; // PTX ISA 81, SM_90 + // .sem = { .relaxed, .release } + // .scope = { .cta, .cluster, .gpu, .sys } + // .op = { .min } + template + __device__ static inline void multimem_red( + cuda::ptx::sem_t sem, + cuda::ptx::scope_t scope, + cuda::ptx::op_min_t, + int32_t* addr, + int32_t val); + +multimem.red.release.sys.global.min.s32 +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +.. code:: cuda + + // multimem.red.sem.scope.global.op.s32 [addr], val; // PTX ISA 81, SM_90 + // .sem = { .relaxed, .release } + // .scope = { .cta, .cluster, .gpu, .sys } + // .op = { .min } + template + __device__ static inline void multimem_red( + cuda::ptx::sem_t sem, + cuda::ptx::scope_t scope, + cuda::ptx::op_min_t, + int32_t* addr, + int32_t val); + +multimem.red.relaxed.cta.global.min.s64 +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +.. 
code:: cuda + + // multimem.red.sem.scope.global.op.s64 [addr], val; // PTX ISA 81, SM_90 + // .sem = { .relaxed, .release } + // .scope = { .cta, .cluster, .gpu, .sys } + // .op = { .min } + template + __device__ static inline void multimem_red( + cuda::ptx::sem_t sem, + cuda::ptx::scope_t scope, + cuda::ptx::op_min_t, + int64_t* addr, + int64_t val); + +multimem.red.relaxed.cluster.global.min.s64 +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +.. code:: cuda + + // multimem.red.sem.scope.global.op.s64 [addr], val; // PTX ISA 81, SM_90 + // .sem = { .relaxed, .release } + // .scope = { .cta, .cluster, .gpu, .sys } + // .op = { .min } + template + __device__ static inline void multimem_red( + cuda::ptx::sem_t sem, + cuda::ptx::scope_t scope, + cuda::ptx::op_min_t, + int64_t* addr, + int64_t val); + +multimem.red.relaxed.gpu.global.min.s64 +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +.. code:: cuda + + // multimem.red.sem.scope.global.op.s64 [addr], val; // PTX ISA 81, SM_90 + // .sem = { .relaxed, .release } + // .scope = { .cta, .cluster, .gpu, .sys } + // .op = { .min } + template + __device__ static inline void multimem_red( + cuda::ptx::sem_t sem, + cuda::ptx::scope_t scope, + cuda::ptx::op_min_t, + int64_t* addr, + int64_t val); + +multimem.red.relaxed.sys.global.min.s64 +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +.. code:: cuda + + // multimem.red.sem.scope.global.op.s64 [addr], val; // PTX ISA 81, SM_90 + // .sem = { .relaxed, .release } + // .scope = { .cta, .cluster, .gpu, .sys } + // .op = { .min } + template + __device__ static inline void multimem_red( + cuda::ptx::sem_t sem, + cuda::ptx::scope_t scope, + cuda::ptx::op_min_t, + int64_t* addr, + int64_t val); + +multimem.red.release.cta.global.min.s64 +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +.. code:: cuda + + // multimem.red.sem.scope.global.op.s64 [addr], val; // PTX ISA 81, SM_90 + // .sem = { .relaxed, .release } + // .scope = { .cta, .cluster, .gpu, .sys } + // .op = { .min } + template + __device__ static inline void multimem_red( + cuda::ptx::sem_t sem, + cuda::ptx::scope_t scope, + cuda::ptx::op_min_t, + int64_t* addr, + int64_t val); + +multimem.red.release.cluster.global.min.s64 +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +.. code:: cuda + + // multimem.red.sem.scope.global.op.s64 [addr], val; // PTX ISA 81, SM_90 + // .sem = { .relaxed, .release } + // .scope = { .cta, .cluster, .gpu, .sys } + // .op = { .min } + template + __device__ static inline void multimem_red( + cuda::ptx::sem_t sem, + cuda::ptx::scope_t scope, + cuda::ptx::op_min_t, + int64_t* addr, + int64_t val); + +multimem.red.release.gpu.global.min.s64 +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +.. code:: cuda + + // multimem.red.sem.scope.global.op.s64 [addr], val; // PTX ISA 81, SM_90 + // .sem = { .relaxed, .release } + // .scope = { .cta, .cluster, .gpu, .sys } + // .op = { .min } + template + __device__ static inline void multimem_red( + cuda::ptx::sem_t sem, + cuda::ptx::scope_t scope, + cuda::ptx::op_min_t, + int64_t* addr, + int64_t val); + +multimem.red.release.sys.global.min.s64 +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +.. 
code:: cuda + + // multimem.red.sem.scope.global.op.s64 [addr], val; // PTX ISA 81, SM_90 + // .sem = { .relaxed, .release } + // .scope = { .cta, .cluster, .gpu, .sys } + // .op = { .min } + template + __device__ static inline void multimem_red( + cuda::ptx::sem_t sem, + cuda::ptx::scope_t scope, + cuda::ptx::op_min_t, + int64_t* addr, + int64_t val); + +multimem.red.relaxed.cta.global.max.u32 +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +.. code:: cuda + + // multimem.red.sem.scope.global.op.u32 [addr], val; // PTX ISA 81, SM_90 + // .sem = { .relaxed, .release } + // .scope = { .cta, .cluster, .gpu, .sys } + // .op = { .max } + template + __device__ static inline void multimem_red( + cuda::ptx::sem_t sem, + cuda::ptx::scope_t scope, + cuda::ptx::op_max_t, + uint32_t* addr, + uint32_t val); + +multimem.red.relaxed.cluster.global.max.u32 +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +.. code:: cuda + + // multimem.red.sem.scope.global.op.u32 [addr], val; // PTX ISA 81, SM_90 + // .sem = { .relaxed, .release } + // .scope = { .cta, .cluster, .gpu, .sys } + // .op = { .max } + template + __device__ static inline void multimem_red( + cuda::ptx::sem_t sem, + cuda::ptx::scope_t scope, + cuda::ptx::op_max_t, + uint32_t* addr, + uint32_t val); + +multimem.red.relaxed.gpu.global.max.u32 +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +.. code:: cuda + + // multimem.red.sem.scope.global.op.u32 [addr], val; // PTX ISA 81, SM_90 + // .sem = { .relaxed, .release } + // .scope = { .cta, .cluster, .gpu, .sys } + // .op = { .max } + template + __device__ static inline void multimem_red( + cuda::ptx::sem_t sem, + cuda::ptx::scope_t scope, + cuda::ptx::op_max_t, + uint32_t* addr, + uint32_t val); + +multimem.red.relaxed.sys.global.max.u32 +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +.. code:: cuda + + // multimem.red.sem.scope.global.op.u32 [addr], val; // PTX ISA 81, SM_90 + // .sem = { .relaxed, .release } + // .scope = { .cta, .cluster, .gpu, .sys } + // .op = { .max } + template + __device__ static inline void multimem_red( + cuda::ptx::sem_t sem, + cuda::ptx::scope_t scope, + cuda::ptx::op_max_t, + uint32_t* addr, + uint32_t val); + +multimem.red.release.cta.global.max.u32 +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +.. code:: cuda + + // multimem.red.sem.scope.global.op.u32 [addr], val; // PTX ISA 81, SM_90 + // .sem = { .relaxed, .release } + // .scope = { .cta, .cluster, .gpu, .sys } + // .op = { .max } + template + __device__ static inline void multimem_red( + cuda::ptx::sem_t sem, + cuda::ptx::scope_t scope, + cuda::ptx::op_max_t, + uint32_t* addr, + uint32_t val); + +multimem.red.release.cluster.global.max.u32 +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +.. code:: cuda + + // multimem.red.sem.scope.global.op.u32 [addr], val; // PTX ISA 81, SM_90 + // .sem = { .relaxed, .release } + // .scope = { .cta, .cluster, .gpu, .sys } + // .op = { .max } + template + __device__ static inline void multimem_red( + cuda::ptx::sem_t sem, + cuda::ptx::scope_t scope, + cuda::ptx::op_max_t, + uint32_t* addr, + uint32_t val); + +multimem.red.release.gpu.global.max.u32 +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +.. 
code:: cuda + + // multimem.red.sem.scope.global.op.u32 [addr], val; // PTX ISA 81, SM_90 + // .sem = { .relaxed, .release } + // .scope = { .cta, .cluster, .gpu, .sys } + // .op = { .max } + template + __device__ static inline void multimem_red( + cuda::ptx::sem_t sem, + cuda::ptx::scope_t scope, + cuda::ptx::op_max_t, + uint32_t* addr, + uint32_t val); + +multimem.red.release.sys.global.max.u32 +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +.. code:: cuda + + // multimem.red.sem.scope.global.op.u32 [addr], val; // PTX ISA 81, SM_90 + // .sem = { .relaxed, .release } + // .scope = { .cta, .cluster, .gpu, .sys } + // .op = { .max } + template + __device__ static inline void multimem_red( + cuda::ptx::sem_t sem, + cuda::ptx::scope_t scope, + cuda::ptx::op_max_t, + uint32_t* addr, + uint32_t val); + +multimem.red.relaxed.cta.global.max.u64 +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +.. code:: cuda + + // multimem.red.sem.scope.global.op.u64 [addr], val; // PTX ISA 81, SM_90 + // .sem = { .relaxed, .release } + // .scope = { .cta, .cluster, .gpu, .sys } + // .op = { .max } + template + __device__ static inline void multimem_red( + cuda::ptx::sem_t sem, + cuda::ptx::scope_t scope, + cuda::ptx::op_max_t, + uint64_t* addr, + uint64_t val); + +multimem.red.relaxed.cluster.global.max.u64 +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +.. code:: cuda + + // multimem.red.sem.scope.global.op.u64 [addr], val; // PTX ISA 81, SM_90 + // .sem = { .relaxed, .release } + // .scope = { .cta, .cluster, .gpu, .sys } + // .op = { .max } + template + __device__ static inline void multimem_red( + cuda::ptx::sem_t sem, + cuda::ptx::scope_t scope, + cuda::ptx::op_max_t, + uint64_t* addr, + uint64_t val); + +multimem.red.relaxed.gpu.global.max.u64 +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +.. code:: cuda + + // multimem.red.sem.scope.global.op.u64 [addr], val; // PTX ISA 81, SM_90 + // .sem = { .relaxed, .release } + // .scope = { .cta, .cluster, .gpu, .sys } + // .op = { .max } + template + __device__ static inline void multimem_red( + cuda::ptx::sem_t sem, + cuda::ptx::scope_t scope, + cuda::ptx::op_max_t, + uint64_t* addr, + uint64_t val); + +multimem.red.relaxed.sys.global.max.u64 +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +.. code:: cuda + + // multimem.red.sem.scope.global.op.u64 [addr], val; // PTX ISA 81, SM_90 + // .sem = { .relaxed, .release } + // .scope = { .cta, .cluster, .gpu, .sys } + // .op = { .max } + template + __device__ static inline void multimem_red( + cuda::ptx::sem_t sem, + cuda::ptx::scope_t scope, + cuda::ptx::op_max_t, + uint64_t* addr, + uint64_t val); + +multimem.red.release.cta.global.max.u64 +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +.. code:: cuda + + // multimem.red.sem.scope.global.op.u64 [addr], val; // PTX ISA 81, SM_90 + // .sem = { .relaxed, .release } + // .scope = { .cta, .cluster, .gpu, .sys } + // .op = { .max } + template + __device__ static inline void multimem_red( + cuda::ptx::sem_t sem, + cuda::ptx::scope_t scope, + cuda::ptx::op_max_t, + uint64_t* addr, + uint64_t val); + +multimem.red.release.cluster.global.max.u64 +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +.. 
code:: cuda + + // multimem.red.sem.scope.global.op.u64 [addr], val; // PTX ISA 81, SM_90 + // .sem = { .relaxed, .release } + // .scope = { .cta, .cluster, .gpu, .sys } + // .op = { .max } + template + __device__ static inline void multimem_red( + cuda::ptx::sem_t sem, + cuda::ptx::scope_t scope, + cuda::ptx::op_max_t, + uint64_t* addr, + uint64_t val); + +multimem.red.release.gpu.global.max.u64 +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +.. code:: cuda + + // multimem.red.sem.scope.global.op.u64 [addr], val; // PTX ISA 81, SM_90 + // .sem = { .relaxed, .release } + // .scope = { .cta, .cluster, .gpu, .sys } + // .op = { .max } + template + __device__ static inline void multimem_red( + cuda::ptx::sem_t sem, + cuda::ptx::scope_t scope, + cuda::ptx::op_max_t, + uint64_t* addr, + uint64_t val); + +multimem.red.release.sys.global.max.u64 +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +.. code:: cuda + + // multimem.red.sem.scope.global.op.u64 [addr], val; // PTX ISA 81, SM_90 + // .sem = { .relaxed, .release } + // .scope = { .cta, .cluster, .gpu, .sys } + // .op = { .max } + template + __device__ static inline void multimem_red( + cuda::ptx::sem_t sem, + cuda::ptx::scope_t scope, + cuda::ptx::op_max_t, + uint64_t* addr, + uint64_t val); + +multimem.red.relaxed.cta.global.max.s32 +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +.. code:: cuda + + // multimem.red.sem.scope.global.op.s32 [addr], val; // PTX ISA 81, SM_90 + // .sem = { .relaxed, .release } + // .scope = { .cta, .cluster, .gpu, .sys } + // .op = { .max } + template + __device__ static inline void multimem_red( + cuda::ptx::sem_t sem, + cuda::ptx::scope_t scope, + cuda::ptx::op_max_t, + int32_t* addr, + int32_t val); + +multimem.red.relaxed.cluster.global.max.s32 +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +.. code:: cuda + + // multimem.red.sem.scope.global.op.s32 [addr], val; // PTX ISA 81, SM_90 + // .sem = { .relaxed, .release } + // .scope = { .cta, .cluster, .gpu, .sys } + // .op = { .max } + template + __device__ static inline void multimem_red( + cuda::ptx::sem_t sem, + cuda::ptx::scope_t scope, + cuda::ptx::op_max_t, + int32_t* addr, + int32_t val); + +multimem.red.relaxed.gpu.global.max.s32 +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +.. code:: cuda + + // multimem.red.sem.scope.global.op.s32 [addr], val; // PTX ISA 81, SM_90 + // .sem = { .relaxed, .release } + // .scope = { .cta, .cluster, .gpu, .sys } + // .op = { .max } + template + __device__ static inline void multimem_red( + cuda::ptx::sem_t sem, + cuda::ptx::scope_t scope, + cuda::ptx::op_max_t, + int32_t* addr, + int32_t val); + +multimem.red.relaxed.sys.global.max.s32 +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +.. code:: cuda + + // multimem.red.sem.scope.global.op.s32 [addr], val; // PTX ISA 81, SM_90 + // .sem = { .relaxed, .release } + // .scope = { .cta, .cluster, .gpu, .sys } + // .op = { .max } + template + __device__ static inline void multimem_red( + cuda::ptx::sem_t sem, + cuda::ptx::scope_t scope, + cuda::ptx::op_max_t, + int32_t* addr, + int32_t val); + +multimem.red.release.cta.global.max.s32 +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +.. 
code:: cuda + + // multimem.red.sem.scope.global.op.s32 [addr], val; // PTX ISA 81, SM_90 + // .sem = { .relaxed, .release } + // .scope = { .cta, .cluster, .gpu, .sys } + // .op = { .max } + template + __device__ static inline void multimem_red( + cuda::ptx::sem_t sem, + cuda::ptx::scope_t scope, + cuda::ptx::op_max_t, + int32_t* addr, + int32_t val); + +multimem.red.release.cluster.global.max.s32 +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +.. code:: cuda + + // multimem.red.sem.scope.global.op.s32 [addr], val; // PTX ISA 81, SM_90 + // .sem = { .relaxed, .release } + // .scope = { .cta, .cluster, .gpu, .sys } + // .op = { .max } + template + __device__ static inline void multimem_red( + cuda::ptx::sem_t sem, + cuda::ptx::scope_t scope, + cuda::ptx::op_max_t, + int32_t* addr, + int32_t val); + +multimem.red.release.gpu.global.max.s32 +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +.. code:: cuda + + // multimem.red.sem.scope.global.op.s32 [addr], val; // PTX ISA 81, SM_90 + // .sem = { .relaxed, .release } + // .scope = { .cta, .cluster, .gpu, .sys } + // .op = { .max } + template + __device__ static inline void multimem_red( + cuda::ptx::sem_t sem, + cuda::ptx::scope_t scope, + cuda::ptx::op_max_t, + int32_t* addr, + int32_t val); + +multimem.red.release.sys.global.max.s32 +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +.. code:: cuda + + // multimem.red.sem.scope.global.op.s32 [addr], val; // PTX ISA 81, SM_90 + // .sem = { .relaxed, .release } + // .scope = { .cta, .cluster, .gpu, .sys } + // .op = { .max } + template + __device__ static inline void multimem_red( + cuda::ptx::sem_t sem, + cuda::ptx::scope_t scope, + cuda::ptx::op_max_t, + int32_t* addr, + int32_t val); + +multimem.red.relaxed.cta.global.max.s64 +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +.. code:: cuda + + // multimem.red.sem.scope.global.op.s64 [addr], val; // PTX ISA 81, SM_90 + // .sem = { .relaxed, .release } + // .scope = { .cta, .cluster, .gpu, .sys } + // .op = { .max } + template + __device__ static inline void multimem_red( + cuda::ptx::sem_t sem, + cuda::ptx::scope_t scope, + cuda::ptx::op_max_t, + int64_t* addr, + int64_t val); + +multimem.red.relaxed.cluster.global.max.s64 +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +.. code:: cuda + + // multimem.red.sem.scope.global.op.s64 [addr], val; // PTX ISA 81, SM_90 + // .sem = { .relaxed, .release } + // .scope = { .cta, .cluster, .gpu, .sys } + // .op = { .max } + template + __device__ static inline void multimem_red( + cuda::ptx::sem_t sem, + cuda::ptx::scope_t scope, + cuda::ptx::op_max_t, + int64_t* addr, + int64_t val); + +multimem.red.relaxed.gpu.global.max.s64 +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +.. code:: cuda + + // multimem.red.sem.scope.global.op.s64 [addr], val; // PTX ISA 81, SM_90 + // .sem = { .relaxed, .release } + // .scope = { .cta, .cluster, .gpu, .sys } + // .op = { .max } + template + __device__ static inline void multimem_red( + cuda::ptx::sem_t sem, + cuda::ptx::scope_t scope, + cuda::ptx::op_max_t, + int64_t* addr, + int64_t val); + +multimem.red.relaxed.sys.global.max.s64 +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +.. 
code:: cuda + + // multimem.red.sem.scope.global.op.s64 [addr], val; // PTX ISA 81, SM_90 + // .sem = { .relaxed, .release } + // .scope = { .cta, .cluster, .gpu, .sys } + // .op = { .max } + template + __device__ static inline void multimem_red( + cuda::ptx::sem_t sem, + cuda::ptx::scope_t scope, + cuda::ptx::op_max_t, + int64_t* addr, + int64_t val); + +multimem.red.release.cta.global.max.s64 +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +.. code:: cuda + + // multimem.red.sem.scope.global.op.s64 [addr], val; // PTX ISA 81, SM_90 + // .sem = { .relaxed, .release } + // .scope = { .cta, .cluster, .gpu, .sys } + // .op = { .max } + template + __device__ static inline void multimem_red( + cuda::ptx::sem_t sem, + cuda::ptx::scope_t scope, + cuda::ptx::op_max_t, + int64_t* addr, + int64_t val); + +multimem.red.release.cluster.global.max.s64 +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +.. code:: cuda + + // multimem.red.sem.scope.global.op.s64 [addr], val; // PTX ISA 81, SM_90 + // .sem = { .relaxed, .release } + // .scope = { .cta, .cluster, .gpu, .sys } + // .op = { .max } + template + __device__ static inline void multimem_red( + cuda::ptx::sem_t sem, + cuda::ptx::scope_t scope, + cuda::ptx::op_max_t, + int64_t* addr, + int64_t val); + +multimem.red.release.gpu.global.max.s64 +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +.. code:: cuda + + // multimem.red.sem.scope.global.op.s64 [addr], val; // PTX ISA 81, SM_90 + // .sem = { .relaxed, .release } + // .scope = { .cta, .cluster, .gpu, .sys } + // .op = { .max } + template + __device__ static inline void multimem_red( + cuda::ptx::sem_t sem, + cuda::ptx::scope_t scope, + cuda::ptx::op_max_t, + int64_t* addr, + int64_t val); + +multimem.red.release.sys.global.max.s64 +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +.. code:: cuda + + // multimem.red.sem.scope.global.op.s64 [addr], val; // PTX ISA 81, SM_90 + // .sem = { .relaxed, .release } + // .scope = { .cta, .cluster, .gpu, .sys } + // .op = { .max } + template + __device__ static inline void multimem_red( + cuda::ptx::sem_t sem, + cuda::ptx::scope_t scope, + cuda::ptx::op_max_t, + int64_t* addr, + int64_t val); + +multimem.red.relaxed.cta.global.add.u32 +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +.. code:: cuda + + // multimem.red.sem.scope.global.op.u32 [addr], val; // PTX ISA 81, SM_90 + // .sem = { .relaxed, .release } + // .scope = { .cta, .cluster, .gpu, .sys } + // .op = { .add } + template + __device__ static inline void multimem_red( + cuda::ptx::sem_t sem, + cuda::ptx::scope_t scope, + cuda::ptx::op_add_t, + uint32_t* addr, + uint32_t val); + +multimem.red.relaxed.cluster.global.add.u32 +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +.. code:: cuda + + // multimem.red.sem.scope.global.op.u32 [addr], val; // PTX ISA 81, SM_90 + // .sem = { .relaxed, .release } + // .scope = { .cta, .cluster, .gpu, .sys } + // .op = { .add } + template + __device__ static inline void multimem_red( + cuda::ptx::sem_t sem, + cuda::ptx::scope_t scope, + cuda::ptx::op_add_t, + uint32_t* addr, + uint32_t val); + +multimem.red.relaxed.gpu.global.add.u32 +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +.. 
code:: cuda + + // multimem.red.sem.scope.global.op.u32 [addr], val; // PTX ISA 81, SM_90 + // .sem = { .relaxed, .release } + // .scope = { .cta, .cluster, .gpu, .sys } + // .op = { .add } + template + __device__ static inline void multimem_red( + cuda::ptx::sem_t sem, + cuda::ptx::scope_t scope, + cuda::ptx::op_add_t, + uint32_t* addr, + uint32_t val); + +multimem.red.relaxed.sys.global.add.u32 +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +.. code:: cuda + + // multimem.red.sem.scope.global.op.u32 [addr], val; // PTX ISA 81, SM_90 + // .sem = { .relaxed, .release } + // .scope = { .cta, .cluster, .gpu, .sys } + // .op = { .add } + template + __device__ static inline void multimem_red( + cuda::ptx::sem_t sem, + cuda::ptx::scope_t scope, + cuda::ptx::op_add_t, + uint32_t* addr, + uint32_t val); + +multimem.red.release.cta.global.add.u32 +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +.. code:: cuda + + // multimem.red.sem.scope.global.op.u32 [addr], val; // PTX ISA 81, SM_90 + // .sem = { .relaxed, .release } + // .scope = { .cta, .cluster, .gpu, .sys } + // .op = { .add } + template + __device__ static inline void multimem_red( + cuda::ptx::sem_t sem, + cuda::ptx::scope_t scope, + cuda::ptx::op_add_t, + uint32_t* addr, + uint32_t val); + +multimem.red.release.cluster.global.add.u32 +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +.. code:: cuda + + // multimem.red.sem.scope.global.op.u32 [addr], val; // PTX ISA 81, SM_90 + // .sem = { .relaxed, .release } + // .scope = { .cta, .cluster, .gpu, .sys } + // .op = { .add } + template + __device__ static inline void multimem_red( + cuda::ptx::sem_t sem, + cuda::ptx::scope_t scope, + cuda::ptx::op_add_t, + uint32_t* addr, + uint32_t val); + +multimem.red.release.gpu.global.add.u32 +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +.. code:: cuda + + // multimem.red.sem.scope.global.op.u32 [addr], val; // PTX ISA 81, SM_90 + // .sem = { .relaxed, .release } + // .scope = { .cta, .cluster, .gpu, .sys } + // .op = { .add } + template + __device__ static inline void multimem_red( + cuda::ptx::sem_t sem, + cuda::ptx::scope_t scope, + cuda::ptx::op_add_t, + uint32_t* addr, + uint32_t val); + +multimem.red.release.sys.global.add.u32 +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +.. code:: cuda + + // multimem.red.sem.scope.global.op.u32 [addr], val; // PTX ISA 81, SM_90 + // .sem = { .relaxed, .release } + // .scope = { .cta, .cluster, .gpu, .sys } + // .op = { .add } + template + __device__ static inline void multimem_red( + cuda::ptx::sem_t sem, + cuda::ptx::scope_t scope, + cuda::ptx::op_add_t, + uint32_t* addr, + uint32_t val); + +multimem.red.relaxed.cta.global.add.u64 +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +.. code:: cuda + + // multimem.red.sem.scope.global.op.u64 [addr], val; // PTX ISA 81, SM_90 + // .sem = { .relaxed, .release } + // .scope = { .cta, .cluster, .gpu, .sys } + // .op = { .add } + template + __device__ static inline void multimem_red( + cuda::ptx::sem_t sem, + cuda::ptx::scope_t scope, + cuda::ptx::op_add_t, + uint64_t* addr, + uint64_t val); + +multimem.red.relaxed.cluster.global.add.u64 +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +.. 
code:: cuda + + // multimem.red.sem.scope.global.op.u64 [addr], val; // PTX ISA 81, SM_90 + // .sem = { .relaxed, .release } + // .scope = { .cta, .cluster, .gpu, .sys } + // .op = { .add } + template + __device__ static inline void multimem_red( + cuda::ptx::sem_t sem, + cuda::ptx::scope_t scope, + cuda::ptx::op_add_t, + uint64_t* addr, + uint64_t val); + +multimem.red.relaxed.gpu.global.add.u64 +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +.. code:: cuda + + // multimem.red.sem.scope.global.op.u64 [addr], val; // PTX ISA 81, SM_90 + // .sem = { .relaxed, .release } + // .scope = { .cta, .cluster, .gpu, .sys } + // .op = { .add } + template + __device__ static inline void multimem_red( + cuda::ptx::sem_t sem, + cuda::ptx::scope_t scope, + cuda::ptx::op_add_t, + uint64_t* addr, + uint64_t val); + +multimem.red.relaxed.sys.global.add.u64 +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +.. code:: cuda + + // multimem.red.sem.scope.global.op.u64 [addr], val; // PTX ISA 81, SM_90 + // .sem = { .relaxed, .release } + // .scope = { .cta, .cluster, .gpu, .sys } + // .op = { .add } + template + __device__ static inline void multimem_red( + cuda::ptx::sem_t sem, + cuda::ptx::scope_t scope, + cuda::ptx::op_add_t, + uint64_t* addr, + uint64_t val); + +multimem.red.release.cta.global.add.u64 +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +.. code:: cuda + + // multimem.red.sem.scope.global.op.u64 [addr], val; // PTX ISA 81, SM_90 + // .sem = { .relaxed, .release } + // .scope = { .cta, .cluster, .gpu, .sys } + // .op = { .add } + template + __device__ static inline void multimem_red( + cuda::ptx::sem_t sem, + cuda::ptx::scope_t scope, + cuda::ptx::op_add_t, + uint64_t* addr, + uint64_t val); + +multimem.red.release.cluster.global.add.u64 +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +.. code:: cuda + + // multimem.red.sem.scope.global.op.u64 [addr], val; // PTX ISA 81, SM_90 + // .sem = { .relaxed, .release } + // .scope = { .cta, .cluster, .gpu, .sys } + // .op = { .add } + template + __device__ static inline void multimem_red( + cuda::ptx::sem_t sem, + cuda::ptx::scope_t scope, + cuda::ptx::op_add_t, + uint64_t* addr, + uint64_t val); + +multimem.red.release.gpu.global.add.u64 +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +.. code:: cuda + + // multimem.red.sem.scope.global.op.u64 [addr], val; // PTX ISA 81, SM_90 + // .sem = { .relaxed, .release } + // .scope = { .cta, .cluster, .gpu, .sys } + // .op = { .add } + template + __device__ static inline void multimem_red( + cuda::ptx::sem_t sem, + cuda::ptx::scope_t scope, + cuda::ptx::op_add_t, + uint64_t* addr, + uint64_t val); + +multimem.red.release.sys.global.add.u64 +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +.. code:: cuda + + // multimem.red.sem.scope.global.op.u64 [addr], val; // PTX ISA 81, SM_90 + // .sem = { .relaxed, .release } + // .scope = { .cta, .cluster, .gpu, .sys } + // .op = { .add } + template + __device__ static inline void multimem_red( + cuda::ptx::sem_t sem, + cuda::ptx::scope_t scope, + cuda::ptx::op_add_t, + uint64_t* addr, + uint64_t val); + +multimem.red.relaxed.cta.global.add.s32 +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +.. 
code:: cuda + + // multimem.red.sem.scope.global.op.s32 [addr], val; // PTX ISA 81, SM_90 + // .sem = { .relaxed, .release } + // .scope = { .cta, .cluster, .gpu, .sys } + // .op = { .add } + template + __device__ static inline void multimem_red( + cuda::ptx::sem_t sem, + cuda::ptx::scope_t scope, + cuda::ptx::op_add_t, + int32_t* addr, + int32_t val); + +multimem.red.relaxed.cluster.global.add.s32 +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +.. code:: cuda + + // multimem.red.sem.scope.global.op.s32 [addr], val; // PTX ISA 81, SM_90 + // .sem = { .relaxed, .release } + // .scope = { .cta, .cluster, .gpu, .sys } + // .op = { .add } + template + __device__ static inline void multimem_red( + cuda::ptx::sem_t sem, + cuda::ptx::scope_t scope, + cuda::ptx::op_add_t, + int32_t* addr, + int32_t val); + +multimem.red.relaxed.gpu.global.add.s32 +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +.. code:: cuda + + // multimem.red.sem.scope.global.op.s32 [addr], val; // PTX ISA 81, SM_90 + // .sem = { .relaxed, .release } + // .scope = { .cta, .cluster, .gpu, .sys } + // .op = { .add } + template + __device__ static inline void multimem_red( + cuda::ptx::sem_t sem, + cuda::ptx::scope_t scope, + cuda::ptx::op_add_t, + int32_t* addr, + int32_t val); + +multimem.red.relaxed.sys.global.add.s32 +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +.. code:: cuda + + // multimem.red.sem.scope.global.op.s32 [addr], val; // PTX ISA 81, SM_90 + // .sem = { .relaxed, .release } + // .scope = { .cta, .cluster, .gpu, .sys } + // .op = { .add } + template + __device__ static inline void multimem_red( + cuda::ptx::sem_t sem, + cuda::ptx::scope_t scope, + cuda::ptx::op_add_t, + int32_t* addr, + int32_t val); + +multimem.red.release.cta.global.add.s32 +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +.. code:: cuda + + // multimem.red.sem.scope.global.op.s32 [addr], val; // PTX ISA 81, SM_90 + // .sem = { .relaxed, .release } + // .scope = { .cta, .cluster, .gpu, .sys } + // .op = { .add } + template + __device__ static inline void multimem_red( + cuda::ptx::sem_t sem, + cuda::ptx::scope_t scope, + cuda::ptx::op_add_t, + int32_t* addr, + int32_t val); + +multimem.red.release.cluster.global.add.s32 +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +.. code:: cuda + + // multimem.red.sem.scope.global.op.s32 [addr], val; // PTX ISA 81, SM_90 + // .sem = { .relaxed, .release } + // .scope = { .cta, .cluster, .gpu, .sys } + // .op = { .add } + template + __device__ static inline void multimem_red( + cuda::ptx::sem_t sem, + cuda::ptx::scope_t scope, + cuda::ptx::op_add_t, + int32_t* addr, + int32_t val); + +multimem.red.release.gpu.global.add.s32 +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +.. code:: cuda + + // multimem.red.sem.scope.global.op.s32 [addr], val; // PTX ISA 81, SM_90 + // .sem = { .relaxed, .release } + // .scope = { .cta, .cluster, .gpu, .sys } + // .op = { .add } + template + __device__ static inline void multimem_red( + cuda::ptx::sem_t sem, + cuda::ptx::scope_t scope, + cuda::ptx::op_add_t, + int32_t* addr, + int32_t val); + +multimem.red.release.sys.global.add.s32 +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +.. 
code:: cuda + + // multimem.red.sem.scope.global.op.s32 [addr], val; // PTX ISA 81, SM_90 + // .sem = { .relaxed, .release } + // .scope = { .cta, .cluster, .gpu, .sys } + // .op = { .add } + template + __device__ static inline void multimem_red( + cuda::ptx::sem_t sem, + cuda::ptx::scope_t scope, + cuda::ptx::op_add_t, + int32_t* addr, + int32_t val); + +multimem.red.relaxed.cta.global.add.u64 +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +.. code:: cuda + + // multimem.red.sem.scope.global.op.u64 [addr], val; // PTX ISA 81, SM_90 + // .sem = { .relaxed, .release } + // .scope = { .cta, .cluster, .gpu, .sys } + // .op = { .add } + template + __device__ static inline void multimem_red( + cuda::ptx::sem_t sem, + cuda::ptx::scope_t scope, + cuda::ptx::op_add_t, + int64_t* addr, + int64_t val); + +multimem.red.relaxed.cluster.global.add.u64 +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +.. code:: cuda + + // multimem.red.sem.scope.global.op.u64 [addr], val; // PTX ISA 81, SM_90 + // .sem = { .relaxed, .release } + // .scope = { .cta, .cluster, .gpu, .sys } + // .op = { .add } + template + __device__ static inline void multimem_red( + cuda::ptx::sem_t sem, + cuda::ptx::scope_t scope, + cuda::ptx::op_add_t, + int64_t* addr, + int64_t val); + +multimem.red.relaxed.gpu.global.add.u64 +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +.. code:: cuda + + // multimem.red.sem.scope.global.op.u64 [addr], val; // PTX ISA 81, SM_90 + // .sem = { .relaxed, .release } + // .scope = { .cta, .cluster, .gpu, .sys } + // .op = { .add } + template + __device__ static inline void multimem_red( + cuda::ptx::sem_t sem, + cuda::ptx::scope_t scope, + cuda::ptx::op_add_t, + int64_t* addr, + int64_t val); + +multimem.red.relaxed.sys.global.add.u64 +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +.. code:: cuda + + // multimem.red.sem.scope.global.op.u64 [addr], val; // PTX ISA 81, SM_90 + // .sem = { .relaxed, .release } + // .scope = { .cta, .cluster, .gpu, .sys } + // .op = { .add } + template + __device__ static inline void multimem_red( + cuda::ptx::sem_t sem, + cuda::ptx::scope_t scope, + cuda::ptx::op_add_t, + int64_t* addr, + int64_t val); + +multimem.red.release.cta.global.add.u64 +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +.. code:: cuda + + // multimem.red.sem.scope.global.op.u64 [addr], val; // PTX ISA 81, SM_90 + // .sem = { .relaxed, .release } + // .scope = { .cta, .cluster, .gpu, .sys } + // .op = { .add } + template + __device__ static inline void multimem_red( + cuda::ptx::sem_t sem, + cuda::ptx::scope_t scope, + cuda::ptx::op_add_t, + int64_t* addr, + int64_t val); + +multimem.red.release.cluster.global.add.u64 +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +.. code:: cuda + + // multimem.red.sem.scope.global.op.u64 [addr], val; // PTX ISA 81, SM_90 + // .sem = { .relaxed, .release } + // .scope = { .cta, .cluster, .gpu, .sys } + // .op = { .add } + template + __device__ static inline void multimem_red( + cuda::ptx::sem_t sem, + cuda::ptx::scope_t scope, + cuda::ptx::op_add_t, + int64_t* addr, + int64_t val); + +multimem.red.release.gpu.global.add.u64 +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +.. 
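The integer ``.add`` form of ``multimem.red`` shown above issues a read-modify-write to every memory location behind the multimem address without returning a value. A small sketch of bumping a per-GPU counter, under the same assumptions as the previous example (multicast address setup not shown, names illustrative):

.. code:: cuda

   #include <cuda/ptx>
   #include <cstdint>

   // 'mc_counter' is assumed to be a multimem (multicast) address backed by one
   // uint32_t counter on each participating GPU.
   __global__ void signal_block_done(std::uint32_t* mc_counter)
   {
   #if __CUDA_ARCH__ >= 900
     if (threadIdx.x == 0)
     {
       // Add 1 to the counter on every GPU; .release orders this block's prior
       // writes before the update becomes visible at .sys scope.
       cuda::ptx::multimem_red(
         cuda::ptx::sem_release, cuda::ptx::scope_sys, cuda::ptx::op_add, mc_counter, 1u);
     }
   #endif
   }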
code:: cuda + + // multimem.red.sem.scope.global.op.u64 [addr], val; // PTX ISA 81, SM_90 + // .sem = { .relaxed, .release } + // .scope = { .cta, .cluster, .gpu, .sys } + // .op = { .add } + template + __device__ static inline void multimem_red( + cuda::ptx::sem_t sem, + cuda::ptx::scope_t scope, + cuda::ptx::op_add_t, + int64_t* addr, + int64_t val); + +multimem.red.release.sys.global.add.u64 +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +.. code:: cuda + + // multimem.red.sem.scope.global.op.u64 [addr], val; // PTX ISA 81, SM_90 + // .sem = { .relaxed, .release } + // .scope = { .cta, .cluster, .gpu, .sys } + // .op = { .add } + template + __device__ static inline void multimem_red( + cuda::ptx::sem_t sem, + cuda::ptx::scope_t scope, + cuda::ptx::op_add_t, + int64_t* addr, + int64_t val); + +multimem.red.relaxed.cta.global.and.b32 +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +.. code:: cuda + + // multimem.red.sem.scope.global.op.b32 [addr], val; // PTX ISA 81, SM_90 + // .sem = { .relaxed, .release } + // .scope = { .cta, .cluster, .gpu, .sys } + // .op = { .and } + template = true, cuda::ptx::dot_sem Sem, cuda::ptx::dot_scope Scope> + __device__ static inline void multimem_red( + cuda::ptx::sem_t sem, + cuda::ptx::scope_t scope, + cuda::ptx::op_and_op_t, + B32* addr, + B32 val); + +multimem.red.relaxed.cluster.global.and.b32 +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +.. code:: cuda + + // multimem.red.sem.scope.global.op.b32 [addr], val; // PTX ISA 81, SM_90 + // .sem = { .relaxed, .release } + // .scope = { .cta, .cluster, .gpu, .sys } + // .op = { .and } + template = true, cuda::ptx::dot_sem Sem, cuda::ptx::dot_scope Scope> + __device__ static inline void multimem_red( + cuda::ptx::sem_t sem, + cuda::ptx::scope_t scope, + cuda::ptx::op_and_op_t, + B32* addr, + B32 val); + +multimem.red.relaxed.gpu.global.and.b32 +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +.. code:: cuda + + // multimem.red.sem.scope.global.op.b32 [addr], val; // PTX ISA 81, SM_90 + // .sem = { .relaxed, .release } + // .scope = { .cta, .cluster, .gpu, .sys } + // .op = { .and } + template = true, cuda::ptx::dot_sem Sem, cuda::ptx::dot_scope Scope> + __device__ static inline void multimem_red( + cuda::ptx::sem_t sem, + cuda::ptx::scope_t scope, + cuda::ptx::op_and_op_t, + B32* addr, + B32 val); + +multimem.red.relaxed.sys.global.and.b32 +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +.. code:: cuda + + // multimem.red.sem.scope.global.op.b32 [addr], val; // PTX ISA 81, SM_90 + // .sem = { .relaxed, .release } + // .scope = { .cta, .cluster, .gpu, .sys } + // .op = { .and } + template = true, cuda::ptx::dot_sem Sem, cuda::ptx::dot_scope Scope> + __device__ static inline void multimem_red( + cuda::ptx::sem_t sem, + cuda::ptx::scope_t scope, + cuda::ptx::op_and_op_t, + B32* addr, + B32 val); + +multimem.red.release.cta.global.and.b32 +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +.. code:: cuda + + // multimem.red.sem.scope.global.op.b32 [addr], val; // PTX ISA 81, SM_90 + // .sem = { .relaxed, .release } + // .scope = { .cta, .cluster, .gpu, .sys } + // .op = { .and } + template = true, cuda::ptx::dot_sem Sem, cuda::ptx::dot_scope Scope> + __device__ static inline void multimem_red( + cuda::ptx::sem_t sem, + cuda::ptx::scope_t scope, + cuda::ptx::op_and_op_t, + B32* addr, + B32 val); + +multimem.red.release.cluster.global.and.b32 +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +.. 
code:: cuda + + // multimem.red.sem.scope.global.op.b32 [addr], val; // PTX ISA 81, SM_90 + // .sem = { .relaxed, .release } + // .scope = { .cta, .cluster, .gpu, .sys } + // .op = { .and } + template = true, cuda::ptx::dot_sem Sem, cuda::ptx::dot_scope Scope> + __device__ static inline void multimem_red( + cuda::ptx::sem_t sem, + cuda::ptx::scope_t scope, + cuda::ptx::op_and_op_t, + B32* addr, + B32 val); + +multimem.red.release.gpu.global.and.b32 +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +.. code:: cuda + + // multimem.red.sem.scope.global.op.b32 [addr], val; // PTX ISA 81, SM_90 + // .sem = { .relaxed, .release } + // .scope = { .cta, .cluster, .gpu, .sys } + // .op = { .and } + template = true, cuda::ptx::dot_sem Sem, cuda::ptx::dot_scope Scope> + __device__ static inline void multimem_red( + cuda::ptx::sem_t sem, + cuda::ptx::scope_t scope, + cuda::ptx::op_and_op_t, + B32* addr, + B32 val); + +multimem.red.release.sys.global.and.b32 +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +.. code:: cuda + + // multimem.red.sem.scope.global.op.b32 [addr], val; // PTX ISA 81, SM_90 + // .sem = { .relaxed, .release } + // .scope = { .cta, .cluster, .gpu, .sys } + // .op = { .and } + template = true, cuda::ptx::dot_sem Sem, cuda::ptx::dot_scope Scope> + __device__ static inline void multimem_red( + cuda::ptx::sem_t sem, + cuda::ptx::scope_t scope, + cuda::ptx::op_and_op_t, + B32* addr, + B32 val); + +multimem.red.relaxed.cta.global.or.b32 +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +.. code:: cuda + + // multimem.red.sem.scope.global.op.b32 [addr], val; // PTX ISA 81, SM_90 + // .sem = { .relaxed, .release } + // .scope = { .cta, .cluster, .gpu, .sys } + // .op = { .or } + template = true, cuda::ptx::dot_sem Sem, cuda::ptx::dot_scope Scope> + __device__ static inline void multimem_red( + cuda::ptx::sem_t sem, + cuda::ptx::scope_t scope, + cuda::ptx::op_or_op_t, + B32* addr, + B32 val); + +multimem.red.relaxed.cluster.global.or.b32 +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +.. code:: cuda + + // multimem.red.sem.scope.global.op.b32 [addr], val; // PTX ISA 81, SM_90 + // .sem = { .relaxed, .release } + // .scope = { .cta, .cluster, .gpu, .sys } + // .op = { .or } + template = true, cuda::ptx::dot_sem Sem, cuda::ptx::dot_scope Scope> + __device__ static inline void multimem_red( + cuda::ptx::sem_t sem, + cuda::ptx::scope_t scope, + cuda::ptx::op_or_op_t, + B32* addr, + B32 val); + +multimem.red.relaxed.gpu.global.or.b32 +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +.. code:: cuda + + // multimem.red.sem.scope.global.op.b32 [addr], val; // PTX ISA 81, SM_90 + // .sem = { .relaxed, .release } + // .scope = { .cta, .cluster, .gpu, .sys } + // .op = { .or } + template = true, cuda::ptx::dot_sem Sem, cuda::ptx::dot_scope Scope> + __device__ static inline void multimem_red( + cuda::ptx::sem_t sem, + cuda::ptx::scope_t scope, + cuda::ptx::op_or_op_t, + B32* addr, + B32 val); + +multimem.red.relaxed.sys.global.or.b32 +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +.. code:: cuda + + // multimem.red.sem.scope.global.op.b32 [addr], val; // PTX ISA 81, SM_90 + // .sem = { .relaxed, .release } + // .scope = { .cta, .cluster, .gpu, .sys } + // .op = { .or } + template = true, cuda::ptx::dot_sem Sem, cuda::ptx::dot_scope Scope> + __device__ static inline void multimem_red( + cuda::ptx::sem_t sem, + cuda::ptx::scope_t scope, + cuda::ptx::op_or_op_t, + B32* addr, + B32 val); + +multimem.red.release.cta.global.or.b32 +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +.. 
code:: cuda + + // multimem.red.sem.scope.global.op.b32 [addr], val; // PTX ISA 81, SM_90 + // .sem = { .relaxed, .release } + // .scope = { .cta, .cluster, .gpu, .sys } + // .op = { .or } + template = true, cuda::ptx::dot_sem Sem, cuda::ptx::dot_scope Scope> + __device__ static inline void multimem_red( + cuda::ptx::sem_t sem, + cuda::ptx::scope_t scope, + cuda::ptx::op_or_op_t, + B32* addr, + B32 val); + +multimem.red.release.cluster.global.or.b32 +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +.. code:: cuda + + // multimem.red.sem.scope.global.op.b32 [addr], val; // PTX ISA 81, SM_90 + // .sem = { .relaxed, .release } + // .scope = { .cta, .cluster, .gpu, .sys } + // .op = { .or } + template = true, cuda::ptx::dot_sem Sem, cuda::ptx::dot_scope Scope> + __device__ static inline void multimem_red( + cuda::ptx::sem_t sem, + cuda::ptx::scope_t scope, + cuda::ptx::op_or_op_t, + B32* addr, + B32 val); + +multimem.red.release.gpu.global.or.b32 +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +.. code:: cuda + + // multimem.red.sem.scope.global.op.b32 [addr], val; // PTX ISA 81, SM_90 + // .sem = { .relaxed, .release } + // .scope = { .cta, .cluster, .gpu, .sys } + // .op = { .or } + template = true, cuda::ptx::dot_sem Sem, cuda::ptx::dot_scope Scope> + __device__ static inline void multimem_red( + cuda::ptx::sem_t sem, + cuda::ptx::scope_t scope, + cuda::ptx::op_or_op_t, + B32* addr, + B32 val); + +multimem.red.release.sys.global.or.b32 +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +.. code:: cuda + + // multimem.red.sem.scope.global.op.b32 [addr], val; // PTX ISA 81, SM_90 + // .sem = { .relaxed, .release } + // .scope = { .cta, .cluster, .gpu, .sys } + // .op = { .or } + template = true, cuda::ptx::dot_sem Sem, cuda::ptx::dot_scope Scope> + __device__ static inline void multimem_red( + cuda::ptx::sem_t sem, + cuda::ptx::scope_t scope, + cuda::ptx::op_or_op_t, + B32* addr, + B32 val); + +multimem.red.relaxed.cta.global.xor.b32 +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +.. code:: cuda + + // multimem.red.sem.scope.global.op.b32 [addr], val; // PTX ISA 81, SM_90 + // .sem = { .relaxed, .release } + // .scope = { .cta, .cluster, .gpu, .sys } + // .op = { .xor } + template = true, cuda::ptx::dot_sem Sem, cuda::ptx::dot_scope Scope> + __device__ static inline void multimem_red( + cuda::ptx::sem_t sem, + cuda::ptx::scope_t scope, + cuda::ptx::op_xor_op_t, + B32* addr, + B32 val); + +multimem.red.relaxed.cluster.global.xor.b32 +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +.. code:: cuda + + // multimem.red.sem.scope.global.op.b32 [addr], val; // PTX ISA 81, SM_90 + // .sem = { .relaxed, .release } + // .scope = { .cta, .cluster, .gpu, .sys } + // .op = { .xor } + template = true, cuda::ptx::dot_sem Sem, cuda::ptx::dot_scope Scope> + __device__ static inline void multimem_red( + cuda::ptx::sem_t sem, + cuda::ptx::scope_t scope, + cuda::ptx::op_xor_op_t, + B32* addr, + B32 val); + +multimem.red.relaxed.gpu.global.xor.b32 +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +.. code:: cuda + + // multimem.red.sem.scope.global.op.b32 [addr], val; // PTX ISA 81, SM_90 + // .sem = { .relaxed, .release } + // .scope = { .cta, .cluster, .gpu, .sys } + // .op = { .xor } + template = true, cuda::ptx::dot_sem Sem, cuda::ptx::dot_scope Scope> + __device__ static inline void multimem_red( + cuda::ptx::sem_t sem, + cuda::ptx::scope_t scope, + cuda::ptx::op_xor_op_t, + B32* addr, + B32 val); + +multimem.red.relaxed.sys.global.xor.b32 +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +.. 
code:: cuda + + // multimem.red.sem.scope.global.op.b32 [addr], val; // PTX ISA 81, SM_90 + // .sem = { .relaxed, .release } + // .scope = { .cta, .cluster, .gpu, .sys } + // .op = { .xor } + template = true, cuda::ptx::dot_sem Sem, cuda::ptx::dot_scope Scope> + __device__ static inline void multimem_red( + cuda::ptx::sem_t sem, + cuda::ptx::scope_t scope, + cuda::ptx::op_xor_op_t, + B32* addr, + B32 val); + +multimem.red.release.cta.global.xor.b32 +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +.. code:: cuda + + // multimem.red.sem.scope.global.op.b32 [addr], val; // PTX ISA 81, SM_90 + // .sem = { .relaxed, .release } + // .scope = { .cta, .cluster, .gpu, .sys } + // .op = { .xor } + template = true, cuda::ptx::dot_sem Sem, cuda::ptx::dot_scope Scope> + __device__ static inline void multimem_red( + cuda::ptx::sem_t sem, + cuda::ptx::scope_t scope, + cuda::ptx::op_xor_op_t, + B32* addr, + B32 val); + +multimem.red.release.cluster.global.xor.b32 +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +.. code:: cuda + + // multimem.red.sem.scope.global.op.b32 [addr], val; // PTX ISA 81, SM_90 + // .sem = { .relaxed, .release } + // .scope = { .cta, .cluster, .gpu, .sys } + // .op = { .xor } + template = true, cuda::ptx::dot_sem Sem, cuda::ptx::dot_scope Scope> + __device__ static inline void multimem_red( + cuda::ptx::sem_t sem, + cuda::ptx::scope_t scope, + cuda::ptx::op_xor_op_t, + B32* addr, + B32 val); + +multimem.red.release.gpu.global.xor.b32 +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +.. code:: cuda + + // multimem.red.sem.scope.global.op.b32 [addr], val; // PTX ISA 81, SM_90 + // .sem = { .relaxed, .release } + // .scope = { .cta, .cluster, .gpu, .sys } + // .op = { .xor } + template = true, cuda::ptx::dot_sem Sem, cuda::ptx::dot_scope Scope> + __device__ static inline void multimem_red( + cuda::ptx::sem_t sem, + cuda::ptx::scope_t scope, + cuda::ptx::op_xor_op_t, + B32* addr, + B32 val); + +multimem.red.release.sys.global.xor.b32 +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +.. code:: cuda + + // multimem.red.sem.scope.global.op.b32 [addr], val; // PTX ISA 81, SM_90 + // .sem = { .relaxed, .release } + // .scope = { .cta, .cluster, .gpu, .sys } + // .op = { .xor } + template = true, cuda::ptx::dot_sem Sem, cuda::ptx::dot_scope Scope> + __device__ static inline void multimem_red( + cuda::ptx::sem_t sem, + cuda::ptx::scope_t scope, + cuda::ptx::op_xor_op_t, + B32* addr, + B32 val); + +multimem.red.relaxed.cta.global.and.b64 +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +.. code:: cuda + + // multimem.red.sem.scope.global.op.b64 [addr], val; // PTX ISA 81, SM_90 + // .sem = { .relaxed, .release } + // .scope = { .cta, .cluster, .gpu, .sys } + // .op = { .and } + template = true, cuda::ptx::dot_sem Sem, cuda::ptx::dot_scope Scope> + __device__ static inline void multimem_red( + cuda::ptx::sem_t sem, + cuda::ptx::scope_t scope, + cuda::ptx::op_and_op_t, + B64* addr, + B64 val); + +multimem.red.relaxed.cluster.global.and.b64 +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +.. code:: cuda + + // multimem.red.sem.scope.global.op.b64 [addr], val; // PTX ISA 81, SM_90 + // .sem = { .relaxed, .release } + // .scope = { .cta, .cluster, .gpu, .sys } + // .op = { .and } + template = true, cuda::ptx::dot_sem Sem, cuda::ptx::dot_scope Scope> + __device__ static inline void multimem_red( + cuda::ptx::sem_t sem, + cuda::ptx::scope_t scope, + cuda::ptx::op_and_op_t, + B64* addr, + B64 val); + +multimem.red.relaxed.gpu.global.and.b64 +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +.. 
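The ``.b64`` variants that follow take exactly the same form as the ``.b32`` overloads above. As a quick orientation aid (not part of the generated reference), here is a minimal, hedged usage sketch: ``mc_ptr`` and ``mc_ptr64`` are assumed to be valid multimem (multicast) addresses set up elsewhere, and the ``sem_*``, ``scope_*`` and ``op_*_op`` tag constants are assumed to accompany the tag types shown in these signatures.

.. code:: cuda

   #include <cuda/ptx>
   #include <cstdint>

   // Sketch only: mc_ptr and mc_ptr64 are assumed to be multimem (multicast)
   // addresses prepared by the host (not shown). Requires SM_90 / PTX ISA 8.1.
   __global__ void multimem_red_example(uint32_t* mc_ptr, uint64_t* mc_ptr64)
   {
     // multimem.red.relaxed.gpu.global.and.b32 [mc_ptr], 0xffff0000;
     cuda::ptx::multimem_red(
       cuda::ptx::sem_relaxed, cuda::ptx::scope_gpu, cuda::ptx::op_and_op, mc_ptr, 0xffff0000u);

     // multimem.red.release.sys.global.or.b64 [mc_ptr64], 1;
     cuda::ptx::multimem_red(
       cuda::ptx::sem_release, cuda::ptx::scope_sys, cuda::ptx::op_or_op, mc_ptr64, uint64_t{1});
   }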
code:: cuda + + // multimem.red.sem.scope.global.op.b64 [addr], val; // PTX ISA 81, SM_90 + // .sem = { .relaxed, .release } + // .scope = { .cta, .cluster, .gpu, .sys } + // .op = { .and } + template = true, cuda::ptx::dot_sem Sem, cuda::ptx::dot_scope Scope> + __device__ static inline void multimem_red( + cuda::ptx::sem_t sem, + cuda::ptx::scope_t scope, + cuda::ptx::op_and_op_t, + B64* addr, + B64 val); + +multimem.red.relaxed.sys.global.and.b64 +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +.. code:: cuda + + // multimem.red.sem.scope.global.op.b64 [addr], val; // PTX ISA 81, SM_90 + // .sem = { .relaxed, .release } + // .scope = { .cta, .cluster, .gpu, .sys } + // .op = { .and } + template = true, cuda::ptx::dot_sem Sem, cuda::ptx::dot_scope Scope> + __device__ static inline void multimem_red( + cuda::ptx::sem_t sem, + cuda::ptx::scope_t scope, + cuda::ptx::op_and_op_t, + B64* addr, + B64 val); + +multimem.red.release.cta.global.and.b64 +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +.. code:: cuda + + // multimem.red.sem.scope.global.op.b64 [addr], val; // PTX ISA 81, SM_90 + // .sem = { .relaxed, .release } + // .scope = { .cta, .cluster, .gpu, .sys } + // .op = { .and } + template = true, cuda::ptx::dot_sem Sem, cuda::ptx::dot_scope Scope> + __device__ static inline void multimem_red( + cuda::ptx::sem_t sem, + cuda::ptx::scope_t scope, + cuda::ptx::op_and_op_t, + B64* addr, + B64 val); + +multimem.red.release.cluster.global.and.b64 +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +.. code:: cuda + + // multimem.red.sem.scope.global.op.b64 [addr], val; // PTX ISA 81, SM_90 + // .sem = { .relaxed, .release } + // .scope = { .cta, .cluster, .gpu, .sys } + // .op = { .and } + template = true, cuda::ptx::dot_sem Sem, cuda::ptx::dot_scope Scope> + __device__ static inline void multimem_red( + cuda::ptx::sem_t sem, + cuda::ptx::scope_t scope, + cuda::ptx::op_and_op_t, + B64* addr, + B64 val); + +multimem.red.release.gpu.global.and.b64 +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +.. code:: cuda + + // multimem.red.sem.scope.global.op.b64 [addr], val; // PTX ISA 81, SM_90 + // .sem = { .relaxed, .release } + // .scope = { .cta, .cluster, .gpu, .sys } + // .op = { .and } + template = true, cuda::ptx::dot_sem Sem, cuda::ptx::dot_scope Scope> + __device__ static inline void multimem_red( + cuda::ptx::sem_t sem, + cuda::ptx::scope_t scope, + cuda::ptx::op_and_op_t, + B64* addr, + B64 val); + +multimem.red.release.sys.global.and.b64 +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +.. code:: cuda + + // multimem.red.sem.scope.global.op.b64 [addr], val; // PTX ISA 81, SM_90 + // .sem = { .relaxed, .release } + // .scope = { .cta, .cluster, .gpu, .sys } + // .op = { .and } + template = true, cuda::ptx::dot_sem Sem, cuda::ptx::dot_scope Scope> + __device__ static inline void multimem_red( + cuda::ptx::sem_t sem, + cuda::ptx::scope_t scope, + cuda::ptx::op_and_op_t, + B64* addr, + B64 val); + +multimem.red.relaxed.cta.global.or.b64 +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +.. code:: cuda + + // multimem.red.sem.scope.global.op.b64 [addr], val; // PTX ISA 81, SM_90 + // .sem = { .relaxed, .release } + // .scope = { .cta, .cluster, .gpu, .sys } + // .op = { .or } + template = true, cuda::ptx::dot_sem Sem, cuda::ptx::dot_scope Scope> + __device__ static inline void multimem_red( + cuda::ptx::sem_t sem, + cuda::ptx::scope_t scope, + cuda::ptx::op_or_op_t, + B64* addr, + B64 val); + +multimem.red.relaxed.cluster.global.or.b64 +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +.. 
code:: cuda + + // multimem.red.sem.scope.global.op.b64 [addr], val; // PTX ISA 81, SM_90 + // .sem = { .relaxed, .release } + // .scope = { .cta, .cluster, .gpu, .sys } + // .op = { .or } + template = true, cuda::ptx::dot_sem Sem, cuda::ptx::dot_scope Scope> + __device__ static inline void multimem_red( + cuda::ptx::sem_t sem, + cuda::ptx::scope_t scope, + cuda::ptx::op_or_op_t, + B64* addr, + B64 val); + +multimem.red.relaxed.gpu.global.or.b64 +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +.. code:: cuda + + // multimem.red.sem.scope.global.op.b64 [addr], val; // PTX ISA 81, SM_90 + // .sem = { .relaxed, .release } + // .scope = { .cta, .cluster, .gpu, .sys } + // .op = { .or } + template = true, cuda::ptx::dot_sem Sem, cuda::ptx::dot_scope Scope> + __device__ static inline void multimem_red( + cuda::ptx::sem_t sem, + cuda::ptx::scope_t scope, + cuda::ptx::op_or_op_t, + B64* addr, + B64 val); + +multimem.red.relaxed.sys.global.or.b64 +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +.. code:: cuda + + // multimem.red.sem.scope.global.op.b64 [addr], val; // PTX ISA 81, SM_90 + // .sem = { .relaxed, .release } + // .scope = { .cta, .cluster, .gpu, .sys } + // .op = { .or } + template = true, cuda::ptx::dot_sem Sem, cuda::ptx::dot_scope Scope> + __device__ static inline void multimem_red( + cuda::ptx::sem_t sem, + cuda::ptx::scope_t scope, + cuda::ptx::op_or_op_t, + B64* addr, + B64 val); + +multimem.red.release.cta.global.or.b64 +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +.. code:: cuda + + // multimem.red.sem.scope.global.op.b64 [addr], val; // PTX ISA 81, SM_90 + // .sem = { .relaxed, .release } + // .scope = { .cta, .cluster, .gpu, .sys } + // .op = { .or } + template = true, cuda::ptx::dot_sem Sem, cuda::ptx::dot_scope Scope> + __device__ static inline void multimem_red( + cuda::ptx::sem_t sem, + cuda::ptx::scope_t scope, + cuda::ptx::op_or_op_t, + B64* addr, + B64 val); + +multimem.red.release.cluster.global.or.b64 +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +.. code:: cuda + + // multimem.red.sem.scope.global.op.b64 [addr], val; // PTX ISA 81, SM_90 + // .sem = { .relaxed, .release } + // .scope = { .cta, .cluster, .gpu, .sys } + // .op = { .or } + template = true, cuda::ptx::dot_sem Sem, cuda::ptx::dot_scope Scope> + __device__ static inline void multimem_red( + cuda::ptx::sem_t sem, + cuda::ptx::scope_t scope, + cuda::ptx::op_or_op_t, + B64* addr, + B64 val); + +multimem.red.release.gpu.global.or.b64 +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +.. code:: cuda + + // multimem.red.sem.scope.global.op.b64 [addr], val; // PTX ISA 81, SM_90 + // .sem = { .relaxed, .release } + // .scope = { .cta, .cluster, .gpu, .sys } + // .op = { .or } + template = true, cuda::ptx::dot_sem Sem, cuda::ptx::dot_scope Scope> + __device__ static inline void multimem_red( + cuda::ptx::sem_t sem, + cuda::ptx::scope_t scope, + cuda::ptx::op_or_op_t, + B64* addr, + B64 val); + +multimem.red.release.sys.global.or.b64 +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +.. code:: cuda + + // multimem.red.sem.scope.global.op.b64 [addr], val; // PTX ISA 81, SM_90 + // .sem = { .relaxed, .release } + // .scope = { .cta, .cluster, .gpu, .sys } + // .op = { .or } + template = true, cuda::ptx::dot_sem Sem, cuda::ptx::dot_scope Scope> + __device__ static inline void multimem_red( + cuda::ptx::sem_t sem, + cuda::ptx::scope_t scope, + cuda::ptx::op_or_op_t, + B64* addr, + B64 val); + +multimem.red.relaxed.cta.global.xor.b64 +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +.. 
code:: cuda + + // multimem.red.sem.scope.global.op.b64 [addr], val; // PTX ISA 81, SM_90 + // .sem = { .relaxed, .release } + // .scope = { .cta, .cluster, .gpu, .sys } + // .op = { .xor } + template = true, cuda::ptx::dot_sem Sem, cuda::ptx::dot_scope Scope> + __device__ static inline void multimem_red( + cuda::ptx::sem_t sem, + cuda::ptx::scope_t scope, + cuda::ptx::op_xor_op_t, + B64* addr, + B64 val); + +multimem.red.relaxed.cluster.global.xor.b64 +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +.. code:: cuda + + // multimem.red.sem.scope.global.op.b64 [addr], val; // PTX ISA 81, SM_90 + // .sem = { .relaxed, .release } + // .scope = { .cta, .cluster, .gpu, .sys } + // .op = { .xor } + template = true, cuda::ptx::dot_sem Sem, cuda::ptx::dot_scope Scope> + __device__ static inline void multimem_red( + cuda::ptx::sem_t sem, + cuda::ptx::scope_t scope, + cuda::ptx::op_xor_op_t, + B64* addr, + B64 val); + +multimem.red.relaxed.gpu.global.xor.b64 +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +.. code:: cuda + + // multimem.red.sem.scope.global.op.b64 [addr], val; // PTX ISA 81, SM_90 + // .sem = { .relaxed, .release } + // .scope = { .cta, .cluster, .gpu, .sys } + // .op = { .xor } + template = true, cuda::ptx::dot_sem Sem, cuda::ptx::dot_scope Scope> + __device__ static inline void multimem_red( + cuda::ptx::sem_t sem, + cuda::ptx::scope_t scope, + cuda::ptx::op_xor_op_t, + B64* addr, + B64 val); + +multimem.red.relaxed.sys.global.xor.b64 +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +.. code:: cuda + + // multimem.red.sem.scope.global.op.b64 [addr], val; // PTX ISA 81, SM_90 + // .sem = { .relaxed, .release } + // .scope = { .cta, .cluster, .gpu, .sys } + // .op = { .xor } + template = true, cuda::ptx::dot_sem Sem, cuda::ptx::dot_scope Scope> + __device__ static inline void multimem_red( + cuda::ptx::sem_t sem, + cuda::ptx::scope_t scope, + cuda::ptx::op_xor_op_t, + B64* addr, + B64 val); + +multimem.red.release.cta.global.xor.b64 +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +.. code:: cuda + + // multimem.red.sem.scope.global.op.b64 [addr], val; // PTX ISA 81, SM_90 + // .sem = { .relaxed, .release } + // .scope = { .cta, .cluster, .gpu, .sys } + // .op = { .xor } + template = true, cuda::ptx::dot_sem Sem, cuda::ptx::dot_scope Scope> + __device__ static inline void multimem_red( + cuda::ptx::sem_t sem, + cuda::ptx::scope_t scope, + cuda::ptx::op_xor_op_t, + B64* addr, + B64 val); + +multimem.red.release.cluster.global.xor.b64 +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +.. code:: cuda + + // multimem.red.sem.scope.global.op.b64 [addr], val; // PTX ISA 81, SM_90 + // .sem = { .relaxed, .release } + // .scope = { .cta, .cluster, .gpu, .sys } + // .op = { .xor } + template = true, cuda::ptx::dot_sem Sem, cuda::ptx::dot_scope Scope> + __device__ static inline void multimem_red( + cuda::ptx::sem_t sem, + cuda::ptx::scope_t scope, + cuda::ptx::op_xor_op_t, + B64* addr, + B64 val); + +multimem.red.release.gpu.global.xor.b64 +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +.. code:: cuda + + // multimem.red.sem.scope.global.op.b64 [addr], val; // PTX ISA 81, SM_90 + // .sem = { .relaxed, .release } + // .scope = { .cta, .cluster, .gpu, .sys } + // .op = { .xor } + template = true, cuda::ptx::dot_sem Sem, cuda::ptx::dot_scope Scope> + __device__ static inline void multimem_red( + cuda::ptx::sem_t sem, + cuda::ptx::scope_t scope, + cuda::ptx::op_xor_op_t, + B64* addr, + B64 val); + +multimem.red.release.sys.global.xor.b64 +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +.. 
code:: cuda + + // multimem.red.sem.scope.global.op.b64 [addr], val; // PTX ISA 81, SM_90 + // .sem = { .relaxed, .release } + // .scope = { .cta, .cluster, .gpu, .sys } + // .op = { .xor } + template = true, cuda::ptx::dot_sem Sem, cuda::ptx::dot_scope Scope> + __device__ static inline void multimem_red( + cuda::ptx::sem_t sem, + cuda::ptx::scope_t scope, + cuda::ptx::op_xor_op_t, + B64* addr, + B64 val); diff --git a/docs/libcudacxx/ptx/instructions/generated/multimem_st.rst b/docs/libcudacxx/ptx/instructions/generated/multimem_st.rst new file mode 100644 index 00000000000..00695328b76 --- /dev/null +++ b/docs/libcudacxx/ptx/instructions/generated/multimem_st.rst @@ -0,0 +1,250 @@ +.. + This file was automatically generated. Do not edit. + +multimem.st.weak.global.b32 +^^^^^^^^^^^^^^^^^^^^^^^^^^^ +.. code:: cuda + + // multimem.st.sem.global.b32 [addr], val; // PTX ISA 81, SM_90 + // .sem = { .weak } + template = true> + __device__ static inline void multimem_st( + cuda::ptx::sem_weak_t, + B32* addr, + B32 val); + +multimem.st.relaxed.cta.global.b32 +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +.. code:: cuda + + // multimem.st.sem.scope.global.b32 [addr], val; // PTX ISA 81, SM_90 + // .sem = { .relaxed, .release } + // .scope = { .cta, .cluster, .gpu, .sys } + template = true, cuda::ptx::dot_sem Sem, cuda::ptx::dot_scope Scope> + __device__ static inline void multimem_st( + cuda::ptx::sem_t sem, + cuda::ptx::scope_t scope, + B32* addr, + B32 val); + +multimem.st.relaxed.cluster.global.b32 +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +.. code:: cuda + + // multimem.st.sem.scope.global.b32 [addr], val; // PTX ISA 81, SM_90 + // .sem = { .relaxed, .release } + // .scope = { .cta, .cluster, .gpu, .sys } + template = true, cuda::ptx::dot_sem Sem, cuda::ptx::dot_scope Scope> + __device__ static inline void multimem_st( + cuda::ptx::sem_t sem, + cuda::ptx::scope_t scope, + B32* addr, + B32 val); + +multimem.st.relaxed.gpu.global.b32 +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +.. code:: cuda + + // multimem.st.sem.scope.global.b32 [addr], val; // PTX ISA 81, SM_90 + // .sem = { .relaxed, .release } + // .scope = { .cta, .cluster, .gpu, .sys } + template = true, cuda::ptx::dot_sem Sem, cuda::ptx::dot_scope Scope> + __device__ static inline void multimem_st( + cuda::ptx::sem_t sem, + cuda::ptx::scope_t scope, + B32* addr, + B32 val); + +multimem.st.relaxed.sys.global.b32 +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +.. code:: cuda + + // multimem.st.sem.scope.global.b32 [addr], val; // PTX ISA 81, SM_90 + // .sem = { .relaxed, .release } + // .scope = { .cta, .cluster, .gpu, .sys } + template = true, cuda::ptx::dot_sem Sem, cuda::ptx::dot_scope Scope> + __device__ static inline void multimem_st( + cuda::ptx::sem_t sem, + cuda::ptx::scope_t scope, + B32* addr, + B32 val); + +multimem.st.release.cta.global.b32 +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +.. code:: cuda + + // multimem.st.sem.scope.global.b32 [addr], val; // PTX ISA 81, SM_90 + // .sem = { .relaxed, .release } + // .scope = { .cta, .cluster, .gpu, .sys } + template = true, cuda::ptx::dot_sem Sem, cuda::ptx::dot_scope Scope> + __device__ static inline void multimem_st( + cuda::ptx::sem_t sem, + cuda::ptx::scope_t scope, + B32* addr, + B32 val); + +multimem.st.release.cluster.global.b32 +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +.. 
code:: cuda + + // multimem.st.sem.scope.global.b32 [addr], val; // PTX ISA 81, SM_90 + // .sem = { .relaxed, .release } + // .scope = { .cta, .cluster, .gpu, .sys } + template = true, cuda::ptx::dot_sem Sem, cuda::ptx::dot_scope Scope> + __device__ static inline void multimem_st( + cuda::ptx::sem_t sem, + cuda::ptx::scope_t scope, + B32* addr, + B32 val); + +multimem.st.release.gpu.global.b32 +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +.. code:: cuda + + // multimem.st.sem.scope.global.b32 [addr], val; // PTX ISA 81, SM_90 + // .sem = { .relaxed, .release } + // .scope = { .cta, .cluster, .gpu, .sys } + template = true, cuda::ptx::dot_sem Sem, cuda::ptx::dot_scope Scope> + __device__ static inline void multimem_st( + cuda::ptx::sem_t sem, + cuda::ptx::scope_t scope, + B32* addr, + B32 val); + +multimem.st.release.sys.global.b32 +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +.. code:: cuda + + // multimem.st.sem.scope.global.b32 [addr], val; // PTX ISA 81, SM_90 + // .sem = { .relaxed, .release } + // .scope = { .cta, .cluster, .gpu, .sys } + template = true, cuda::ptx::dot_sem Sem, cuda::ptx::dot_scope Scope> + __device__ static inline void multimem_st( + cuda::ptx::sem_t sem, + cuda::ptx::scope_t scope, + B32* addr, + B32 val); + +multimem.st.weak.global.b64 +^^^^^^^^^^^^^^^^^^^^^^^^^^^ +.. code:: cuda + + // multimem.st.sem.global.b64 [addr], val; // PTX ISA 81, SM_90 + // .sem = { .weak } + template = true> + __device__ static inline void multimem_st( + cuda::ptx::sem_weak_t, + B64* addr, + B64 val); + +multimem.st.relaxed.cta.global.b64 +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +.. code:: cuda + + // multimem.st.sem.scope.global.b64 [addr], val; // PTX ISA 81, SM_90 + // .sem = { .relaxed, .release } + // .scope = { .cta, .cluster, .gpu, .sys } + template = true, cuda::ptx::dot_sem Sem, cuda::ptx::dot_scope Scope> + __device__ static inline void multimem_st( + cuda::ptx::sem_t sem, + cuda::ptx::scope_t scope, + B64* addr, + B64 val); + +multimem.st.relaxed.cluster.global.b64 +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +.. code:: cuda + + // multimem.st.sem.scope.global.b64 [addr], val; // PTX ISA 81, SM_90 + // .sem = { .relaxed, .release } + // .scope = { .cta, .cluster, .gpu, .sys } + template = true, cuda::ptx::dot_sem Sem, cuda::ptx::dot_scope Scope> + __device__ static inline void multimem_st( + cuda::ptx::sem_t sem, + cuda::ptx::scope_t scope, + B64* addr, + B64 val); + +multimem.st.relaxed.gpu.global.b64 +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +.. code:: cuda + + // multimem.st.sem.scope.global.b64 [addr], val; // PTX ISA 81, SM_90 + // .sem = { .relaxed, .release } + // .scope = { .cta, .cluster, .gpu, .sys } + template = true, cuda::ptx::dot_sem Sem, cuda::ptx::dot_scope Scope> + __device__ static inline void multimem_st( + cuda::ptx::sem_t sem, + cuda::ptx::scope_t scope, + B64* addr, + B64 val); + +multimem.st.relaxed.sys.global.b64 +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +.. code:: cuda + + // multimem.st.sem.scope.global.b64 [addr], val; // PTX ISA 81, SM_90 + // .sem = { .relaxed, .release } + // .scope = { .cta, .cluster, .gpu, .sys } + template = true, cuda::ptx::dot_sem Sem, cuda::ptx::dot_scope Scope> + __device__ static inline void multimem_st( + cuda::ptx::sem_t sem, + cuda::ptx::scope_t scope, + B64* addr, + B64 val); + +multimem.st.release.cta.global.b64 +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +.. 
code:: cuda + + // multimem.st.sem.scope.global.b64 [addr], val; // PTX ISA 81, SM_90 + // .sem = { .relaxed, .release } + // .scope = { .cta, .cluster, .gpu, .sys } + template = true, cuda::ptx::dot_sem Sem, cuda::ptx::dot_scope Scope> + __device__ static inline void multimem_st( + cuda::ptx::sem_t sem, + cuda::ptx::scope_t scope, + B64* addr, + B64 val); + +multimem.st.release.cluster.global.b64 +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +.. code:: cuda + + // multimem.st.sem.scope.global.b64 [addr], val; // PTX ISA 81, SM_90 + // .sem = { .relaxed, .release } + // .scope = { .cta, .cluster, .gpu, .sys } + template = true, cuda::ptx::dot_sem Sem, cuda::ptx::dot_scope Scope> + __device__ static inline void multimem_st( + cuda::ptx::sem_t sem, + cuda::ptx::scope_t scope, + B64* addr, + B64 val); + +multimem.st.release.gpu.global.b64 +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +.. code:: cuda + + // multimem.st.sem.scope.global.b64 [addr], val; // PTX ISA 81, SM_90 + // .sem = { .relaxed, .release } + // .scope = { .cta, .cluster, .gpu, .sys } + template = true, cuda::ptx::dot_sem Sem, cuda::ptx::dot_scope Scope> + __device__ static inline void multimem_st( + cuda::ptx::sem_t sem, + cuda::ptx::scope_t scope, + B64* addr, + B64 val); + +multimem.st.release.sys.global.b64 +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +.. code:: cuda + + // multimem.st.sem.scope.global.b64 [addr], val; // PTX ISA 81, SM_90 + // .sem = { .relaxed, .release } + // .scope = { .cta, .cluster, .gpu, .sys } + template = true, cuda::ptx::dot_sem Sem, cuda::ptx::dot_scope Scope> + __device__ static inline void multimem_st( + cuda::ptx::sem_t sem, + cuda::ptx::scope_t scope, + B64* addr, + B64 val); diff --git a/docs/libcudacxx/ptx/instructions/generated/red_async.rst b/docs/libcudacxx/ptx/instructions/generated/red_async.rst index 658fe0a8f44..c575b808401 100644 --- a/docs/libcudacxx/ptx/instructions/generated/red_async.rst +++ b/docs/libcudacxx/ptx/instructions/generated/red_async.rst @@ -5,7 +5,7 @@ red.async.relaxed.cluster.shared::cluster.mbarrier::complete_tx::bytes.inc.u32 ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ .. code:: cuda - // red.async.relaxed.cluster.shared::cluster.mbarrier::complete_tx::bytes{.op}{.type} [dest], value, [remote_bar]; // PTX ISA 81, SM_90 + // red.async.relaxed.cluster.shared::cluster.mbarrier::complete_tx::bytes.op.type [dest], value, [remote_bar]; // PTX ISA 81, SM_90 // .type = { .u32 } // .op = { .inc } template @@ -19,7 +19,7 @@ red.async.relaxed.cluster.shared::cluster.mbarrier::complete_tx::bytes.dec.u32 ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ .. code:: cuda - // red.async.relaxed.cluster.shared::cluster.mbarrier::complete_tx::bytes{.op}{.type} [dest], value, [remote_bar]; // PTX ISA 81, SM_90 + // red.async.relaxed.cluster.shared::cluster.mbarrier::complete_tx::bytes.op.type [dest], value, [remote_bar]; // PTX ISA 81, SM_90 // .type = { .u32 } // .op = { .dec } template @@ -33,7 +33,7 @@ red.async.relaxed.cluster.shared::cluster.mbarrier::complete_tx::bytes.min.u32 ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ .. 
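As with ``multimem_red``, a short hedged sketch of the ``multimem_st`` overloads may help orientation; ``mc_ptr`` is again an assumed multimem address, and the tag constants (``sem_weak``, ``sem_release``, ``scope_cluster``) mirror the tag types in the signatures above.

.. code:: cuda

   #include <cuda/ptx>
   #include <cstdint>

   // Sketch only: mc_ptr is an assumed multimem (multicast) address. SM_90+.
   __global__ void multimem_st_example(uint32_t* mc_ptr, uint32_t v)
   {
     // multimem.st.weak.global.b32 [mc_ptr], v;
     cuda::ptx::multimem_st(cuda::ptx::sem_weak, mc_ptr, v);

     // multimem.st.release.cluster.global.b32 [mc_ptr], v;
     cuda::ptx::multimem_st(cuda::ptx::sem_release, cuda::ptx::scope_cluster, mc_ptr, v);
   }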
code:: cuda - // red.async.relaxed.cluster.shared::cluster.mbarrier::complete_tx::bytes{.op}{.type} [dest], value, [remote_bar]; // PTX ISA 81, SM_90 + // red.async.relaxed.cluster.shared::cluster.mbarrier::complete_tx::bytes.op.type [dest], value, [remote_bar]; // PTX ISA 81, SM_90 // .type = { .u32 } // .op = { .min } template @@ -47,7 +47,7 @@ red.async.relaxed.cluster.shared::cluster.mbarrier::complete_tx::bytes.max.u32 ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ .. code:: cuda - // red.async.relaxed.cluster.shared::cluster.mbarrier::complete_tx::bytes{.op}{.type} [dest], value, [remote_bar]; // PTX ISA 81, SM_90 + // red.async.relaxed.cluster.shared::cluster.mbarrier::complete_tx::bytes.op.type [dest], value, [remote_bar]; // PTX ISA 81, SM_90 // .type = { .u32 } // .op = { .max } template @@ -61,7 +61,7 @@ red.async.relaxed.cluster.shared::cluster.mbarrier::complete_tx::bytes.add.u32 ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ .. code:: cuda - // red.async.relaxed.cluster.shared::cluster.mbarrier::complete_tx::bytes{.op}{.type} [dest], value, [remote_bar]; // PTX ISA 81, SM_90 + // red.async.relaxed.cluster.shared::cluster.mbarrier::complete_tx::bytes.op.type [dest], value, [remote_bar]; // PTX ISA 81, SM_90 // .type = { .u32 } // .op = { .add } template @@ -75,7 +75,7 @@ red.async.relaxed.cluster.shared::cluster.mbarrier::complete_tx::bytes.min.s32 ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ .. code:: cuda - // red.async.relaxed.cluster.shared::cluster.mbarrier::complete_tx::bytes{.op}{.type} [dest], value, [remote_bar]; // PTX ISA 81, SM_90 + // red.async.relaxed.cluster.shared::cluster.mbarrier::complete_tx::bytes.op.type [dest], value, [remote_bar]; // PTX ISA 81, SM_90 // .type = { .s32 } // .op = { .min } template @@ -89,7 +89,7 @@ red.async.relaxed.cluster.shared::cluster.mbarrier::complete_tx::bytes.max.s32 ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ .. code:: cuda - // red.async.relaxed.cluster.shared::cluster.mbarrier::complete_tx::bytes{.op}{.type} [dest], value, [remote_bar]; // PTX ISA 81, SM_90 + // red.async.relaxed.cluster.shared::cluster.mbarrier::complete_tx::bytes.op.type [dest], value, [remote_bar]; // PTX ISA 81, SM_90 // .type = { .s32 } // .op = { .max } template @@ -103,7 +103,7 @@ red.async.relaxed.cluster.shared::cluster.mbarrier::complete_tx::bytes.add.s32 ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ .. code:: cuda - // red.async.relaxed.cluster.shared::cluster.mbarrier::complete_tx::bytes{.op}{.type} [dest], value, [remote_bar]; // PTX ISA 81, SM_90 + // red.async.relaxed.cluster.shared::cluster.mbarrier::complete_tx::bytes.op.type [dest], value, [remote_bar]; // PTX ISA 81, SM_90 // .type = { .s32 } // .op = { .add } template @@ -117,10 +117,10 @@ red.async.relaxed.cluster.shared::cluster.mbarrier::complete_tx::bytes.and.b32 ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ .. 
code:: cuda - // red.async.relaxed.cluster.shared::cluster.mbarrier::complete_tx::bytes{.op}{.type} [dest], value, [remote_bar]; // PTX ISA 81, SM_90 + // red.async.relaxed.cluster.shared::cluster.mbarrier::complete_tx::bytes.op.type [dest], value, [remote_bar]; // PTX ISA 81, SM_90 // .type = { .b32 } // .op = { .and } - template + template = true> __device__ static inline void red_async( cuda::ptx::op_and_op_t, B32* dest, @@ -131,10 +131,10 @@ red.async.relaxed.cluster.shared::cluster.mbarrier::complete_tx::bytes.or.b32 ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ .. code:: cuda - // red.async.relaxed.cluster.shared::cluster.mbarrier::complete_tx::bytes{.op}{.type} [dest], value, [remote_bar]; // PTX ISA 81, SM_90 + // red.async.relaxed.cluster.shared::cluster.mbarrier::complete_tx::bytes.op.type [dest], value, [remote_bar]; // PTX ISA 81, SM_90 // .type = { .b32 } // .op = { .or } - template + template = true> __device__ static inline void red_async( cuda::ptx::op_or_op_t, B32* dest, @@ -145,10 +145,10 @@ red.async.relaxed.cluster.shared::cluster.mbarrier::complete_tx::bytes.xor.b32 ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ .. code:: cuda - // red.async.relaxed.cluster.shared::cluster.mbarrier::complete_tx::bytes{.op}{.type} [dest], value, [remote_bar]; // PTX ISA 81, SM_90 + // red.async.relaxed.cluster.shared::cluster.mbarrier::complete_tx::bytes.op.type [dest], value, [remote_bar]; // PTX ISA 81, SM_90 // .type = { .b32 } // .op = { .xor } - template + template = true> __device__ static inline void red_async( cuda::ptx::op_xor_op_t, B32* dest, @@ -159,7 +159,7 @@ red.async.relaxed.cluster.shared::cluster.mbarrier::complete_tx::bytes.add.u64 ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ .. code:: cuda - // red.async.relaxed.cluster.shared::cluster.mbarrier::complete_tx::bytes{.op}{.type} [dest], value, [remote_bar]; // PTX ISA 81, SM_90 + // red.async.relaxed.cluster.shared::cluster.mbarrier::complete_tx::bytes.op.type [dest], value, [remote_bar]; // PTX ISA 81, SM_90 // .type = { .u64 } // .op = { .add } template @@ -173,7 +173,7 @@ red.async.relaxed.cluster.shared::cluster.mbarrier::complete_tx::bytes.add.u64 ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ .. code:: cuda - // red.async.relaxed.cluster.shared::cluster.mbarrier::complete_tx::bytes{.op}.u64 [dest], value, [remote_bar]; // .u64 intentional PTX ISA 81, SM_90 + // red.async.relaxed.cluster.shared::cluster.mbarrier::complete_tx::bytes.op.u64 [dest], value, [remote_bar]; // .u64 intentional PTX ISA 81, SM_90 // .op = { .add } template __device__ static inline void red_async( diff --git a/docs/libcudacxx/ptx/instructions/generated/st_async.rst b/docs/libcudacxx/ptx/instructions/generated/st_async.rst index d00a152cf29..8cfc21ba0b5 100644 --- a/docs/libcudacxx/ptx/instructions/generated/st_async.rst +++ b/docs/libcudacxx/ptx/instructions/generated/st_async.rst @@ -5,7 +5,7 @@ st.async.weak.shared::cluster.mbarrier::complete_tx::bytes.b32 ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ .. code:: cuda - // st.async.weak.shared::cluster.mbarrier::complete_tx::bytes{.type} [addr], value, [remote_bar]; // 1. PTX ISA 81, SM_90 + // st.async.weak.shared::cluster.mbarrier::complete_tx::bytes.type [addr], value, [remote_bar]; // 1. 
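For context, a minimal sketch of a ``red_async`` call: it assumes ``remote_counter`` and ``remote_bar`` point into a peer CTA's shared memory (obtained via cluster rank mapping, not shown) and that the mbarrier's transaction count is managed by the caller.

.. code:: cuda

   #include <cuda/ptx>
   #include <cstdint>

   // Sketch only: remote_counter and remote_bar are assumed to be mapped into a
   // peer CTA's shared memory (shared::cluster); the mbarrier tracks completion.
   __device__ void red_async_example(uint32_t* remote_counter, uint64_t* remote_bar)
   {
     // red.async.relaxed.cluster.shared::cluster.mbarrier::complete_tx::bytes.add.u32
     cuda::ptx::red_async(cuda::ptx::op_add, remote_counter, 1u, remote_bar);
   }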
PTX ISA 81, SM_90 // .type = { .b32, .b64 } template __device__ static inline void st_async( @@ -17,7 +17,7 @@ st.async.weak.shared::cluster.mbarrier::complete_tx::bytes.b64 ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ .. code:: cuda - // st.async.weak.shared::cluster.mbarrier::complete_tx::bytes{.type} [addr], value, [remote_bar]; // 1. PTX ISA 81, SM_90 + // st.async.weak.shared::cluster.mbarrier::complete_tx::bytes.type [addr], value, [remote_bar]; // 1. PTX ISA 81, SM_90 // .type = { .b32, .b64 } template __device__ static inline void st_async( @@ -29,7 +29,7 @@ st.async.weak.shared::cluster.mbarrier::complete_tx::bytes.v2.b32 ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ .. code:: cuda - // st.async.weak.shared::cluster.mbarrier::complete_tx::bytes.v2{.type} [addr], value, [remote_bar]; // 2. PTX ISA 81, SM_90 + // st.async.weak.shared::cluster.mbarrier::complete_tx::bytes.v2.type [addr], value, [remote_bar]; // 2. PTX ISA 81, SM_90 // .type = { .b32, .b64 } template __device__ static inline void st_async( @@ -41,7 +41,7 @@ st.async.weak.shared::cluster.mbarrier::complete_tx::bytes.v2.b64 ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ .. code:: cuda - // st.async.weak.shared::cluster.mbarrier::complete_tx::bytes.v2{.type} [addr], value, [remote_bar]; // 2. PTX ISA 81, SM_90 + // st.async.weak.shared::cluster.mbarrier::complete_tx::bytes.v2.type [addr], value, [remote_bar]; // 2. PTX ISA 81, SM_90 // .type = { .b32, .b64 } template __device__ static inline void st_async( @@ -54,7 +54,7 @@ st.async.weak.shared::cluster.mbarrier::complete_tx::bytes.v4.b32 .. code:: cuda // st.async.weak.shared::cluster.mbarrier::complete_tx::bytes.v4.b32 [addr], value, [remote_bar]; // 3. PTX ISA 81, SM_90 - template + template = true> __device__ static inline void st_async( B32* addr, const B32 (&value)[4], diff --git a/docs/libcudacxx/ptx/instructions/generated/st_bulk.rst b/docs/libcudacxx/ptx/instructions/generated/st_bulk.rst new file mode 100644 index 00000000000..817d3875fdc --- /dev/null +++ b/docs/libcudacxx/ptx/instructions/generated/st_bulk.rst @@ -0,0 +1,13 @@ +.. + This file was automatically generated. Do not edit. + +st.bulk.weak.shared::cta +^^^^^^^^^^^^^^^^^^^^^^^^ +.. code:: cuda + + // st.bulk.weak.shared::cta [addr], size, initval; // PTX ISA 86, SM_100 + template + __device__ static inline void st_bulk( + void* addr, + uint64_t size, + cuda::ptx::n32_t initval); diff --git a/docs/libcudacxx/ptx/instructions/generated/tcgen05_alloc.rst b/docs/libcudacxx/ptx/instructions/generated/tcgen05_alloc.rst new file mode 100644 index 00000000000..3bfb60fca71 --- /dev/null +++ b/docs/libcudacxx/ptx/instructions/generated/tcgen05_alloc.rst @@ -0,0 +1,70 @@ +.. + This file was automatically generated. Do not edit. + +tcgen05.alloc.cta_group::1.sync.aligned.shared::cta.b32 +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +.. code:: cuda + + // tcgen05.alloc.cta_group.sync.aligned.shared::cta.b32 [dst], nCols; // PTX ISA 86, SM_100a, SM_101a + // .cta_group = { .cta_group::1, .cta_group::2 } + template + __device__ static inline void tcgen05_alloc( + cuda::ptx::cta_group_t cta_group, + uint32_t* dst, + const uint32_t& nCols); + +tcgen05.alloc.cta_group::2.sync.aligned.shared::cta.b32 +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +.. 
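Below is a small, hedged sketch combining ``st_async`` (SM_90) with the new ``st_bulk`` (SM_100): the remote pointers are assumed to be cluster-mapped shared memory, and ``st_bulk`` is shown with an ``initval`` of 0, the only value the PTX ISA currently documents.

.. code:: cuda

   #include <cuda/ptx>
   #include <cstdint>

   // Sketch only: remote_slot/remote_bar are assumed to be a peer CTA's shared
   // memory (shared::cluster); smem_buf is shared memory of the calling CTA.
   __device__ void st_async_st_bulk_example(uint32_t* remote_slot, uint64_t* remote_bar, void* smem_buf)
   {
     // st.async.weak.shared::cluster.mbarrier::complete_tx::bytes.b32 (SM_90)
     cuda::ptx::st_async(remote_slot, 42u, remote_bar);

     // st.bulk.weak.shared::cta (SM_100): initialize 4 KiB of shared memory.
     cuda::ptx::st_bulk(smem_buf, 4096, cuda::ptx::n32_t<0>{});
   }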
code:: cuda + + // tcgen05.alloc.cta_group.sync.aligned.shared::cta.b32 [dst], nCols; // PTX ISA 86, SM_100a, SM_101a + // .cta_group = { .cta_group::1, .cta_group::2 } + template + __device__ static inline void tcgen05_alloc( + cuda::ptx::cta_group_t cta_group, + uint32_t* dst, + const uint32_t& nCols); + +tcgen05.dealloc.cta_group::1.sync.aligned.b32 +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +.. code:: cuda + + // tcgen05.dealloc.cta_group.sync.aligned.b32 taddr, nCols; // PTX ISA 86, SM_100a, SM_101a + // .cta_group = { .cta_group::1, .cta_group::2 } + template + __device__ static inline void tcgen05_dealloc( + cuda::ptx::cta_group_t cta_group, + uint32_t taddr, + const uint32_t& nCols); + +tcgen05.dealloc.cta_group::2.sync.aligned.b32 +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +.. code:: cuda + + // tcgen05.dealloc.cta_group.sync.aligned.b32 taddr, nCols; // PTX ISA 86, SM_100a, SM_101a + // .cta_group = { .cta_group::1, .cta_group::2 } + template + __device__ static inline void tcgen05_dealloc( + cuda::ptx::cta_group_t cta_group, + uint32_t taddr, + const uint32_t& nCols); + +tcgen05.relinquish_alloc_permit.cta_group::1.sync.aligned +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +.. code:: cuda + + // tcgen05.relinquish_alloc_permit.cta_group.sync.aligned; // PTX ISA 86, SM_100a, SM_101a + // .cta_group = { .cta_group::1, .cta_group::2 } + template + __device__ static inline void tcgen05_relinquish_alloc_permit( + cuda::ptx::cta_group_t cta_group); + +tcgen05.relinquish_alloc_permit.cta_group::2.sync.aligned +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +.. code:: cuda + + // tcgen05.relinquish_alloc_permit.cta_group.sync.aligned; // PTX ISA 86, SM_100a, SM_101a + // .cta_group = { .cta_group::1, .cta_group::2 } + template + __device__ static inline void tcgen05_relinquish_alloc_permit( + cuda::ptx::cta_group_t cta_group); diff --git a/docs/libcudacxx/ptx/instructions/generated/tcgen05_commit.rst b/docs/libcudacxx/ptx/instructions/generated/tcgen05_commit.rst new file mode 100644 index 00000000000..d5546fed3e5 --- /dev/null +++ b/docs/libcudacxx/ptx/instructions/generated/tcgen05_commit.rst @@ -0,0 +1,48 @@ +.. + This file was automatically generated. Do not edit. + +tcgen05.commit.cta_group::1.mbarrier::arrive::one.shared::cluster.b64 +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +.. code:: cuda + + // tcgen05.commit.cta_group.mbarrier::arrive::one.shared::cluster.b64 [smem_bar]; // PTX ISA 86, SM_100a, SM_101a + // .cta_group = { .cta_group::1, .cta_group::2 } + template + __device__ static inline void tcgen05_commit( + cuda::ptx::cta_group_t cta_group, + uint64_t* smem_bar); + +tcgen05.commit.cta_group::2.mbarrier::arrive::one.shared::cluster.b64 +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +.. code:: cuda + + // tcgen05.commit.cta_group.mbarrier::arrive::one.shared::cluster.b64 [smem_bar]; // PTX ISA 86, SM_100a, SM_101a + // .cta_group = { .cta_group::1, .cta_group::2 } + template + __device__ static inline void tcgen05_commit( + cuda::ptx::cta_group_t cta_group, + uint64_t* smem_bar); + +tcgen05.commit.cta_group::1.mbarrier::arrive::one.shared::cluster.multicast::cluster.b64 +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +.. 
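To show how the alloc, dealloc, and relinquish functions fit together, a hedged lifecycle sketch follows. It assumes the ``cuda::ptx::cta_group_1`` tag constant accompanies the ``cta_group_t`` parameter shown above, and that a single warp manages the allocation, as the ``.sync.aligned`` qualifier suggests.

.. code:: cuda

   #include <cuda/ptx>
   #include <cstdint>

   // Sketch only (SM_100a / SM_101a): allocate 64 columns of tensor memory,
   // use them, then release them. The allocated address is written by the
   // hardware into shared memory (taddr_slot).
   __global__ void tcgen05_alloc_example()
   {
     __shared__ uint32_t taddr_slot;
     if (threadIdx.x < 32) // warp-wide instruction: one warp performs the alloc
     {
       cuda::ptx::tcgen05_alloc(cuda::ptx::cta_group_1, &taddr_slot, 64);
     }
     __syncthreads();
     const uint32_t taddr = taddr_slot;

     // ... tcgen05 copies / MMAs against taddr would go here ...

     __syncthreads();
     if (threadIdx.x < 32)
     {
       cuda::ptx::tcgen05_dealloc(cuda::ptx::cta_group_1, taddr, 64);
       cuda::ptx::tcgen05_relinquish_alloc_permit(cuda::ptx::cta_group_1);
     }
   }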
code:: cuda + + // tcgen05.commit.cta_group.mbarrier::arrive::one.shared::cluster.multicast::cluster.b64 [smem_bar], ctaMask; // PTX ISA 86, SM_100a, SM_101a + // .cta_group = { .cta_group::1, .cta_group::2 } + template + __device__ static inline void tcgen05_commit_multicast( + cuda::ptx::cta_group_t cta_group, + uint64_t* smem_bar, + uint16_t ctaMask); + +tcgen05.commit.cta_group::2.mbarrier::arrive::one.shared::cluster.multicast::cluster.b64 +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +.. code:: cuda + + // tcgen05.commit.cta_group.mbarrier::arrive::one.shared::cluster.multicast::cluster.b64 [smem_bar], ctaMask; // PTX ISA 86, SM_100a, SM_101a + // .cta_group = { .cta_group::1, .cta_group::2 } + template + __device__ static inline void tcgen05_commit_multicast( + cuda::ptx::cta_group_t cta_group, + uint64_t* smem_bar, + uint16_t ctaMask); diff --git a/docs/libcudacxx/ptx/instructions/generated/tcgen05_cp.rst b/docs/libcudacxx/ptx/instructions/generated/tcgen05_cp.rst new file mode 100644 index 00000000000..b0195c5b28e --- /dev/null +++ b/docs/libcudacxx/ptx/instructions/generated/tcgen05_cp.rst @@ -0,0 +1,434 @@ +.. + This file was automatically generated. Do not edit. + +tcgen05.cp.cta_group::1.128x256b +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +.. code:: cuda + + // tcgen05.cp.cta_group.128x256b [taddr], s_desc; // PTX ISA 86, SM_100a, SM_101a + // .cta_group = { .cta_group::1, .cta_group::2 } + template + __device__ static inline void tcgen05_cp_128x256b( + cuda::ptx::cta_group_t cta_group, + uint32_t taddr, + uint64_t s_desc); + +tcgen05.cp.cta_group::2.128x256b +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +.. code:: cuda + + // tcgen05.cp.cta_group.128x256b [taddr], s_desc; // PTX ISA 86, SM_100a, SM_101a + // .cta_group = { .cta_group::1, .cta_group::2 } + template + __device__ static inline void tcgen05_cp_128x256b( + cuda::ptx::cta_group_t cta_group, + uint32_t taddr, + uint64_t s_desc); + +tcgen05.cp.cta_group::1.4x256b +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +.. code:: cuda + + // tcgen05.cp.cta_group.4x256b [taddr], s_desc; // PTX ISA 86, SM_100a, SM_101a + // .cta_group = { .cta_group::1, .cta_group::2 } + template + __device__ static inline void tcgen05_cp_4x256b( + cuda::ptx::cta_group_t cta_group, + uint32_t taddr, + uint64_t s_desc); + +tcgen05.cp.cta_group::2.4x256b +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +.. code:: cuda + + // tcgen05.cp.cta_group.4x256b [taddr], s_desc; // PTX ISA 86, SM_100a, SM_101a + // .cta_group = { .cta_group::1, .cta_group::2 } + template + __device__ static inline void tcgen05_cp_4x256b( + cuda::ptx::cta_group_t cta_group, + uint32_t taddr, + uint64_t s_desc); + +tcgen05.cp.cta_group::1.128x128b +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +.. code:: cuda + + // tcgen05.cp.cta_group.128x128b [taddr], s_desc; // PTX ISA 86, SM_100a, SM_101a + // .cta_group = { .cta_group::1, .cta_group::2 } + template + __device__ static inline void tcgen05_cp_128x128b( + cuda::ptx::cta_group_t cta_group, + uint32_t taddr, + uint64_t s_desc); + +tcgen05.cp.cta_group::2.128x128b +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +.. code:: cuda + + // tcgen05.cp.cta_group.128x128b [taddr], s_desc; // PTX ISA 86, SM_100a, SM_101a + // .cta_group = { .cta_group::1, .cta_group::2 } + template + __device__ static inline void tcgen05_cp_128x128b( + cuda::ptx::cta_group_t cta_group, + uint32_t taddr, + uint64_t s_desc); + +tcgen05.cp.cta_group::1.64x128b.warpx2::02_13 +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +.. 
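A short, hedged sketch of the commit path: ``bar`` is assumed to be an mbarrier initialized elsewhere, and the multicast variant signals that barrier in every CTA selected by ``cta_mask``.

.. code:: cuda

   #include <cuda/ptx>
   #include <cstdint>

   // Sketch only (SM_100a / SM_101a): make prior asynchronous tcgen05 operations
   // observable through an mbarrier in shared memory.
   __device__ void tcgen05_commit_example(uint64_t* bar, uint16_t cta_mask)
   {
     cuda::ptx::tcgen05_commit(cuda::ptx::cta_group_1, bar);

     // Multicast variant: signal the barrier in each CTA selected by cta_mask.
     cuda::ptx::tcgen05_commit_multicast(cuda::ptx::cta_group_1, bar, cta_mask);
   }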
code:: cuda + + // tcgen05.cp.cta_group.64x128b.warpx2::02_13 [taddr], s_desc; // PTX ISA 86, SM_100a, SM_101a + // .cta_group = { .cta_group::1, .cta_group::2 } + template + __device__ static inline void tcgen05_cp_64x128b_warpx2_02_13( + cuda::ptx::cta_group_t cta_group, + uint32_t taddr, + uint64_t s_desc); + +tcgen05.cp.cta_group::2.64x128b.warpx2::02_13 +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +.. code:: cuda + + // tcgen05.cp.cta_group.64x128b.warpx2::02_13 [taddr], s_desc; // PTX ISA 86, SM_100a, SM_101a + // .cta_group = { .cta_group::1, .cta_group::2 } + template + __device__ static inline void tcgen05_cp_64x128b_warpx2_02_13( + cuda::ptx::cta_group_t cta_group, + uint32_t taddr, + uint64_t s_desc); + +tcgen05.cp.cta_group::1.64x128b.warpx2::01_23 +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +.. code:: cuda + + // tcgen05.cp.cta_group.64x128b.warpx2::01_23 [taddr], s_desc; // PTX ISA 86, SM_100a, SM_101a + // .cta_group = { .cta_group::1, .cta_group::2 } + template + __device__ static inline void tcgen05_cp_64x128b_warpx2_01_23( + cuda::ptx::cta_group_t cta_group, + uint32_t taddr, + uint64_t s_desc); + +tcgen05.cp.cta_group::2.64x128b.warpx2::01_23 +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +.. code:: cuda + + // tcgen05.cp.cta_group.64x128b.warpx2::01_23 [taddr], s_desc; // PTX ISA 86, SM_100a, SM_101a + // .cta_group = { .cta_group::1, .cta_group::2 } + template + __device__ static inline void tcgen05_cp_64x128b_warpx2_01_23( + cuda::ptx::cta_group_t cta_group, + uint32_t taddr, + uint64_t s_desc); + +tcgen05.cp.cta_group::1.32x128b.warpx4 +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +.. code:: cuda + + // tcgen05.cp.cta_group.32x128b.warpx4 [taddr], s_desc; // PTX ISA 86, SM_100a, SM_101a + // .cta_group = { .cta_group::1, .cta_group::2 } + template + __device__ static inline void tcgen05_cp_32x128b_warpx4( + cuda::ptx::cta_group_t cta_group, + uint32_t taddr, + uint64_t s_desc); + +tcgen05.cp.cta_group::2.32x128b.warpx4 +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +.. code:: cuda + + // tcgen05.cp.cta_group.32x128b.warpx4 [taddr], s_desc; // PTX ISA 86, SM_100a, SM_101a + // .cta_group = { .cta_group::1, .cta_group::2 } + template + __device__ static inline void tcgen05_cp_32x128b_warpx4( + cuda::ptx::cta_group_t cta_group, + uint32_t taddr, + uint64_t s_desc); + +tcgen05.cp.cta_group::1.128x256b.b8x16.b6x16_p32 +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +.. code:: cuda + + // tcgen05.cp.cta_group.128x256b.b8x16.b6x16_p32 [taddr], s_desc; // PTX ISA 86, SM_100a, SM_101a + // .cta_group = { .cta_group::1, .cta_group::2 } + template + __device__ static inline void tcgen05_cp_128x256b_b8x16_b6x16_p32( + cuda::ptx::cta_group_t cta_group, + uint32_t taddr, + uint64_t s_desc); + +tcgen05.cp.cta_group::2.128x256b.b8x16.b6x16_p32 +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +.. code:: cuda + + // tcgen05.cp.cta_group.128x256b.b8x16.b6x16_p32 [taddr], s_desc; // PTX ISA 86, SM_100a, SM_101a + // .cta_group = { .cta_group::1, .cta_group::2 } + template + __device__ static inline void tcgen05_cp_128x256b_b8x16_b6x16_p32( + cuda::ptx::cta_group_t cta_group, + uint32_t taddr, + uint64_t s_desc); + +tcgen05.cp.cta_group::1.4x256b.b8x16.b6x16_p32 +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +.. 
code:: cuda + + // tcgen05.cp.cta_group.4x256b.b8x16.b6x16_p32 [taddr], s_desc; // PTX ISA 86, SM_100a, SM_101a + // .cta_group = { .cta_group::1, .cta_group::2 } + template + __device__ static inline void tcgen05_cp_4x256b_b8x16_b6x16_p32( + cuda::ptx::cta_group_t cta_group, + uint32_t taddr, + uint64_t s_desc); + +tcgen05.cp.cta_group::2.4x256b.b8x16.b6x16_p32 +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +.. code:: cuda + + // tcgen05.cp.cta_group.4x256b.b8x16.b6x16_p32 [taddr], s_desc; // PTX ISA 86, SM_100a, SM_101a + // .cta_group = { .cta_group::1, .cta_group::2 } + template + __device__ static inline void tcgen05_cp_4x256b_b8x16_b6x16_p32( + cuda::ptx::cta_group_t cta_group, + uint32_t taddr, + uint64_t s_desc); + +tcgen05.cp.cta_group::1.128x128b.b8x16.b6x16_p32 +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +.. code:: cuda + + // tcgen05.cp.cta_group.128x128b.b8x16.b6x16_p32 [taddr], s_desc; // PTX ISA 86, SM_100a, SM_101a + // .cta_group = { .cta_group::1, .cta_group::2 } + template + __device__ static inline void tcgen05_cp_128x128b_b8x16_b6x16_p32( + cuda::ptx::cta_group_t cta_group, + uint32_t taddr, + uint64_t s_desc); + +tcgen05.cp.cta_group::2.128x128b.b8x16.b6x16_p32 +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +.. code:: cuda + + // tcgen05.cp.cta_group.128x128b.b8x16.b6x16_p32 [taddr], s_desc; // PTX ISA 86, SM_100a, SM_101a + // .cta_group = { .cta_group::1, .cta_group::2 } + template + __device__ static inline void tcgen05_cp_128x128b_b8x16_b6x16_p32( + cuda::ptx::cta_group_t cta_group, + uint32_t taddr, + uint64_t s_desc); + +tcgen05.cp.cta_group::1.64x128b.warpx2::02_13.b8x16.b6x16_p32 +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +.. code:: cuda + + // tcgen05.cp.cta_group.64x128b.warpx2::02_13.b8x16.b6x16_p32 [taddr], s_desc; // PTX ISA 86, SM_100a, SM_101a + // .cta_group = { .cta_group::1, .cta_group::2 } + template + __device__ static inline void tcgen05_cp_64x128b_warpx2_02_13_b8x16_b6x16_p32( + cuda::ptx::cta_group_t cta_group, + uint32_t taddr, + uint64_t s_desc); + +tcgen05.cp.cta_group::2.64x128b.warpx2::02_13.b8x16.b6x16_p32 +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +.. code:: cuda + + // tcgen05.cp.cta_group.64x128b.warpx2::02_13.b8x16.b6x16_p32 [taddr], s_desc; // PTX ISA 86, SM_100a, SM_101a + // .cta_group = { .cta_group::1, .cta_group::2 } + template + __device__ static inline void tcgen05_cp_64x128b_warpx2_02_13_b8x16_b6x16_p32( + cuda::ptx::cta_group_t cta_group, + uint32_t taddr, + uint64_t s_desc); + +tcgen05.cp.cta_group::1.64x128b.warpx2::01_23.b8x16.b6x16_p32 +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +.. code:: cuda + + // tcgen05.cp.cta_group.64x128b.warpx2::01_23.b8x16.b6x16_p32 [taddr], s_desc; // PTX ISA 86, SM_100a, SM_101a + // .cta_group = { .cta_group::1, .cta_group::2 } + template + __device__ static inline void tcgen05_cp_64x128b_warpx2_01_23_b8x16_b6x16_p32( + cuda::ptx::cta_group_t cta_group, + uint32_t taddr, + uint64_t s_desc); + +tcgen05.cp.cta_group::2.64x128b.warpx2::01_23.b8x16.b6x16_p32 +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +.. 
code:: cuda + + // tcgen05.cp.cta_group.64x128b.warpx2::01_23.b8x16.b6x16_p32 [taddr], s_desc; // PTX ISA 86, SM_100a, SM_101a + // .cta_group = { .cta_group::1, .cta_group::2 } + template + __device__ static inline void tcgen05_cp_64x128b_warpx2_01_23_b8x16_b6x16_p32( + cuda::ptx::cta_group_t cta_group, + uint32_t taddr, + uint64_t s_desc); + +tcgen05.cp.cta_group::1.32x128b.warpx4.b8x16.b6x16_p32 +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +.. code:: cuda + + // tcgen05.cp.cta_group.32x128b.warpx4.b8x16.b6x16_p32 [taddr], s_desc; // PTX ISA 86, SM_100a, SM_101a + // .cta_group = { .cta_group::1, .cta_group::2 } + template + __device__ static inline void tcgen05_cp_32x128b_warpx4_b8x16_b6x16_p32( + cuda::ptx::cta_group_t cta_group, + uint32_t taddr, + uint64_t s_desc); + +tcgen05.cp.cta_group::2.32x128b.warpx4.b8x16.b6x16_p32 +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +.. code:: cuda + + // tcgen05.cp.cta_group.32x128b.warpx4.b8x16.b6x16_p32 [taddr], s_desc; // PTX ISA 86, SM_100a, SM_101a + // .cta_group = { .cta_group::1, .cta_group::2 } + template + __device__ static inline void tcgen05_cp_32x128b_warpx4_b8x16_b6x16_p32( + cuda::ptx::cta_group_t cta_group, + uint32_t taddr, + uint64_t s_desc); + +tcgen05.cp.cta_group::1.128x256b.b8x16.b4x16_p64 +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +.. code:: cuda + + // tcgen05.cp.cta_group.128x256b.b8x16.b4x16_p64 [taddr], s_desc; // PTX ISA 86, SM_100a, SM_101a + // .cta_group = { .cta_group::1, .cta_group::2 } + template + __device__ static inline void tcgen05_cp_128x256b_b8x16_b4x16_p64( + cuda::ptx::cta_group_t cta_group, + uint32_t taddr, + uint64_t s_desc); + +tcgen05.cp.cta_group::2.128x256b.b8x16.b4x16_p64 +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +.. code:: cuda + + // tcgen05.cp.cta_group.128x256b.b8x16.b4x16_p64 [taddr], s_desc; // PTX ISA 86, SM_100a, SM_101a + // .cta_group = { .cta_group::1, .cta_group::2 } + template + __device__ static inline void tcgen05_cp_128x256b_b8x16_b4x16_p64( + cuda::ptx::cta_group_t cta_group, + uint32_t taddr, + uint64_t s_desc); + +tcgen05.cp.cta_group::1.4x256b.b8x16.b4x16_p64 +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +.. code:: cuda + + // tcgen05.cp.cta_group.4x256b.b8x16.b4x16_p64 [taddr], s_desc; // PTX ISA 86, SM_100a, SM_101a + // .cta_group = { .cta_group::1, .cta_group::2 } + template + __device__ static inline void tcgen05_cp_4x256b_b8x16_b4x16_p64( + cuda::ptx::cta_group_t cta_group, + uint32_t taddr, + uint64_t s_desc); + +tcgen05.cp.cta_group::2.4x256b.b8x16.b4x16_p64 +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +.. code:: cuda + + // tcgen05.cp.cta_group.4x256b.b8x16.b4x16_p64 [taddr], s_desc; // PTX ISA 86, SM_100a, SM_101a + // .cta_group = { .cta_group::1, .cta_group::2 } + template + __device__ static inline void tcgen05_cp_4x256b_b8x16_b4x16_p64( + cuda::ptx::cta_group_t cta_group, + uint32_t taddr, + uint64_t s_desc); + +tcgen05.cp.cta_group::1.128x128b.b8x16.b4x16_p64 +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +.. code:: cuda + + // tcgen05.cp.cta_group.128x128b.b8x16.b4x16_p64 [taddr], s_desc; // PTX ISA 86, SM_100a, SM_101a + // .cta_group = { .cta_group::1, .cta_group::2 } + template + __device__ static inline void tcgen05_cp_128x128b_b8x16_b4x16_p64( + cuda::ptx::cta_group_t cta_group, + uint32_t taddr, + uint64_t s_desc); + +tcgen05.cp.cta_group::2.128x128b.b8x16.b4x16_p64 +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +.. 
code:: cuda + + // tcgen05.cp.cta_group.128x128b.b8x16.b4x16_p64 [taddr], s_desc; // PTX ISA 86, SM_100a, SM_101a + // .cta_group = { .cta_group::1, .cta_group::2 } + template + __device__ static inline void tcgen05_cp_128x128b_b8x16_b4x16_p64( + cuda::ptx::cta_group_t cta_group, + uint32_t taddr, + uint64_t s_desc); + +tcgen05.cp.cta_group::1.64x128b.warpx2::02_13.b8x16.b4x16_p64 +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +.. code:: cuda + + // tcgen05.cp.cta_group.64x128b.warpx2::02_13.b8x16.b4x16_p64 [taddr], s_desc; // PTX ISA 86, SM_100a, SM_101a + // .cta_group = { .cta_group::1, .cta_group::2 } + template + __device__ static inline void tcgen05_cp_64x128b_warpx2_02_13_b8x16_b4x16_p64( + cuda::ptx::cta_group_t cta_group, + uint32_t taddr, + uint64_t s_desc); + +tcgen05.cp.cta_group::2.64x128b.warpx2::02_13.b8x16.b4x16_p64 +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +.. code:: cuda + + // tcgen05.cp.cta_group.64x128b.warpx2::02_13.b8x16.b4x16_p64 [taddr], s_desc; // PTX ISA 86, SM_100a, SM_101a + // .cta_group = { .cta_group::1, .cta_group::2 } + template + __device__ static inline void tcgen05_cp_64x128b_warpx2_02_13_b8x16_b4x16_p64( + cuda::ptx::cta_group_t cta_group, + uint32_t taddr, + uint64_t s_desc); + +tcgen05.cp.cta_group::1.64x128b.warpx2::01_23.b8x16.b4x16_p64 +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +.. code:: cuda + + // tcgen05.cp.cta_group.64x128b.warpx2::01_23.b8x16.b4x16_p64 [taddr], s_desc; // PTX ISA 86, SM_100a, SM_101a + // .cta_group = { .cta_group::1, .cta_group::2 } + template + __device__ static inline void tcgen05_cp_64x128b_warpx2_01_23_b8x16_b4x16_p64( + cuda::ptx::cta_group_t cta_group, + uint32_t taddr, + uint64_t s_desc); + +tcgen05.cp.cta_group::2.64x128b.warpx2::01_23.b8x16.b4x16_p64 +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +.. code:: cuda + + // tcgen05.cp.cta_group.64x128b.warpx2::01_23.b8x16.b4x16_p64 [taddr], s_desc; // PTX ISA 86, SM_100a, SM_101a + // .cta_group = { .cta_group::1, .cta_group::2 } + template + __device__ static inline void tcgen05_cp_64x128b_warpx2_01_23_b8x16_b4x16_p64( + cuda::ptx::cta_group_t cta_group, + uint32_t taddr, + uint64_t s_desc); + +tcgen05.cp.cta_group::1.32x128b.warpx4.b8x16.b4x16_p64 +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +.. code:: cuda + + // tcgen05.cp.cta_group.32x128b.warpx4.b8x16.b4x16_p64 [taddr], s_desc; // PTX ISA 86, SM_100a, SM_101a + // .cta_group = { .cta_group::1, .cta_group::2 } + template + __device__ static inline void tcgen05_cp_32x128b_warpx4_b8x16_b4x16_p64( + cuda::ptx::cta_group_t cta_group, + uint32_t taddr, + uint64_t s_desc); + +tcgen05.cp.cta_group::2.32x128b.warpx4.b8x16.b4x16_p64 +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +.. code:: cuda + + // tcgen05.cp.cta_group.32x128b.warpx4.b8x16.b4x16_p64 [taddr], s_desc; // PTX ISA 86, SM_100a, SM_101a + // .cta_group = { .cta_group::1, .cta_group::2 } + template + __device__ static inline void tcgen05_cp_32x128b_warpx4_b8x16_b4x16_p64( + cuda::ptx::cta_group_t cta_group, + uint32_t taddr, + uint64_t s_desc); diff --git a/docs/libcudacxx/ptx/instructions/generated/tcgen05_fence.rst b/docs/libcudacxx/ptx/instructions/generated/tcgen05_fence.rst new file mode 100644 index 00000000000..ee287ea8860 --- /dev/null +++ b/docs/libcudacxx/ptx/instructions/generated/tcgen05_fence.rst @@ -0,0 +1,18 @@ +.. + This file was automatically generated. Do not edit. + +tcgen05.fence::before_thread_sync +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +.. 
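All of the ``tcgen05_cp_*`` shapes above share the same three-argument form, so a single hedged sketch suffices: ``taddr`` is assumed to come from ``tcgen05_alloc``, and ``s_desc`` is assumed to be a valid shared-memory descriptor whose construction is not shown here.

.. code:: cuda

   #include <cuda/ptx>
   #include <cstdint>

   // Sketch only (SM_100a / SM_101a): copy a 128x256b block described by s_desc
   // from shared memory into tensor memory at taddr.
   __device__ void tcgen05_cp_example(uint32_t taddr, uint64_t s_desc)
   {
     cuda::ptx::tcgen05_cp_128x256b(cuda::ptx::cta_group_1, taddr, s_desc);
   }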
code:: cuda + + // tcgen05.fence::before_thread_sync; // PTX ISA 86, SM_100a, SM_101a + template + __device__ static inline void tcgen05_fence_before_thread_sync(); + +tcgen05.fence::after_thread_sync +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +.. code:: cuda + + // tcgen05.fence::after_thread_sync; // PTX ISA 86, SM_100a, SM_101a + template + __device__ static inline void tcgen05_fence_after_thread_sync(); diff --git a/docs/libcudacxx/ptx/instructions/generated/tcgen05_ld.rst b/docs/libcudacxx/ptx/instructions/generated/tcgen05_ld.rst new file mode 100644 index 00000000000..0bb6bdbb5f5 --- /dev/null +++ b/docs/libcudacxx/ptx/instructions/generated/tcgen05_ld.rst @@ -0,0 +1,758 @@ +.. + This file was automatically generated. Do not edit. + +tcgen05.ld.sync.aligned.16x64b.x1.b32 +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +.. code:: cuda + + // tcgen05.ld.sync.aligned.16x64b.x1.b32 out, [taddr]; // PTX ISA 86, SM_100a, SM_101a + template = true> + __device__ static inline void tcgen05_ld_16x64b( + B32 (&out)[1], + uint32_t taddr); + +tcgen05.ld.sync.aligned.16x64b.x1.pack::16b.b32 +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +.. code:: cuda + + // tcgen05.ld.sync.aligned.16x64b.x1.pack::16b.b32 out, [taddr]; // PTX ISA 86, SM_100a, SM_101a + template = true> + __device__ static inline void tcgen05_ld_16x64b_pack_16b( + B32 (&out)[1], + uint32_t taddr); + +tcgen05.ld.sync.aligned.16x64b.x2.b32 +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +.. code:: cuda + + // tcgen05.ld.sync.aligned.16x64b.x2.b32 out, [taddr]; // PTX ISA 86, SM_100a, SM_101a + template = true> + __device__ static inline void tcgen05_ld_16x64b( + B32 (&out)[2], + uint32_t taddr); + +tcgen05.ld.sync.aligned.16x64b.x2.pack::16b.b32 +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +.. code:: cuda + + // tcgen05.ld.sync.aligned.16x64b.x2.pack::16b.b32 out, [taddr]; // PTX ISA 86, SM_100a, SM_101a + template = true> + __device__ static inline void tcgen05_ld_16x64b_pack_16b( + B32 (&out)[2], + uint32_t taddr); + +tcgen05.ld.sync.aligned.16x64b.x4.b32 +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +.. code:: cuda + + // tcgen05.ld.sync.aligned.16x64b.x4.b32 out, [taddr]; // PTX ISA 86, SM_100a, SM_101a + template = true> + __device__ static inline void tcgen05_ld_16x64b( + B32 (&out)[4], + uint32_t taddr); + +tcgen05.ld.sync.aligned.16x64b.x4.pack::16b.b32 +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +.. code:: cuda + + // tcgen05.ld.sync.aligned.16x64b.x4.pack::16b.b32 out, [taddr]; // PTX ISA 86, SM_100a, SM_101a + template = true> + __device__ static inline void tcgen05_ld_16x64b_pack_16b( + B32 (&out)[4], + uint32_t taddr); + +tcgen05.ld.sync.aligned.16x64b.x8.b32 +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +.. code:: cuda + + // tcgen05.ld.sync.aligned.16x64b.x8.b32 out, [taddr]; // PTX ISA 86, SM_100a, SM_101a + template = true> + __device__ static inline void tcgen05_ld_16x64b( + B32 (&out)[8], + uint32_t taddr); + +tcgen05.ld.sync.aligned.16x64b.x8.pack::16b.b32 +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +.. code:: cuda + + // tcgen05.ld.sync.aligned.16x64b.x8.pack::16b.b32 out, [taddr]; // PTX ISA 86, SM_100a, SM_101a + template = true> + __device__ static inline void tcgen05_ld_16x64b_pack_16b( + B32 (&out)[8], + uint32_t taddr); + +tcgen05.ld.sync.aligned.16x64b.x16.b32 +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +.. 
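These two fences bracket an ordinary thread synchronization when ordering tcgen05 work across threads; a minimal sketch:

.. code:: cuda

   #include <cuda/ptx>

   // Sketch only (SM_100a / SM_101a): order tcgen05 operations issued before the
   // barrier with tcgen05 operations issued after it.
   __device__ void tcgen05_fence_example()
   {
     cuda::ptx::tcgen05_fence_before_thread_sync();
     __syncthreads();
     cuda::ptx::tcgen05_fence_after_thread_sync();
   }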
code:: cuda + + // tcgen05.ld.sync.aligned.16x64b.x16.b32 out, [taddr]; // PTX ISA 86, SM_100a, SM_101a + template = true> + __device__ static inline void tcgen05_ld_16x64b( + B32 (&out)[16], + uint32_t taddr); + +tcgen05.ld.sync.aligned.16x64b.x16.pack::16b.b32 +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +.. code:: cuda + + // tcgen05.ld.sync.aligned.16x64b.x16.pack::16b.b32 out, [taddr]; // PTX ISA 86, SM_100a, SM_101a + template = true> + __device__ static inline void tcgen05_ld_16x64b_pack_16b( + B32 (&out)[16], + uint32_t taddr); + +tcgen05.ld.sync.aligned.16x64b.x32.b32 +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +.. code:: cuda + + // tcgen05.ld.sync.aligned.16x64b.x32.b32 out, [taddr]; // PTX ISA 86, SM_100a, SM_101a + template = true> + __device__ static inline void tcgen05_ld_16x64b( + B32 (&out)[32], + uint32_t taddr); + +tcgen05.ld.sync.aligned.16x64b.x32.pack::16b.b32 +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +.. code:: cuda + + // tcgen05.ld.sync.aligned.16x64b.x32.pack::16b.b32 out, [taddr]; // PTX ISA 86, SM_100a, SM_101a + template = true> + __device__ static inline void tcgen05_ld_16x64b_pack_16b( + B32 (&out)[32], + uint32_t taddr); + +tcgen05.ld.sync.aligned.16x64b.x64.b32 +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +.. code:: cuda + + // tcgen05.ld.sync.aligned.16x64b.x64.b32 out, [taddr]; // PTX ISA 86, SM_100a, SM_101a + template = true> + __device__ static inline void tcgen05_ld_16x64b( + B32 (&out)[64], + uint32_t taddr); + +tcgen05.ld.sync.aligned.16x64b.x64.pack::16b.b32 +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +.. code:: cuda + + // tcgen05.ld.sync.aligned.16x64b.x64.pack::16b.b32 out, [taddr]; // PTX ISA 86, SM_100a, SM_101a + template = true> + __device__ static inline void tcgen05_ld_16x64b_pack_16b( + B32 (&out)[64], + uint32_t taddr); + +tcgen05.ld.sync.aligned.16x64b.x128.b32 +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +.. code:: cuda + + // tcgen05.ld.sync.aligned.16x64b.x128.b32 out, [taddr]; // PTX ISA 86, SM_100a, SM_101a + template = true> + __device__ static inline void tcgen05_ld_16x64b( + B32 (&out)[128], + uint32_t taddr); + +tcgen05.ld.sync.aligned.16x64b.x128.pack::16b.b32 +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +.. code:: cuda + + // tcgen05.ld.sync.aligned.16x64b.x128.pack::16b.b32 out, [taddr]; // PTX ISA 86, SM_100a, SM_101a + template = true> + __device__ static inline void tcgen05_ld_16x64b_pack_16b( + B32 (&out)[128], + uint32_t taddr); + +tcgen05.ld.sync.aligned.16x128b.x1.b32 +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +.. code:: cuda + + // tcgen05.ld.sync.aligned.16x128b.x1.b32 out, [taddr]; // PTX ISA 86, SM_100a, SM_101a + template = true> + __device__ static inline void tcgen05_ld_16x128b( + B32 (&out)[2], + uint32_t taddr); + +tcgen05.ld.sync.aligned.16x128b.x1.pack::16b.b32 +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +.. code:: cuda + + // tcgen05.ld.sync.aligned.16x128b.x1.pack::16b.b32 out, [taddr]; // PTX ISA 86, SM_100a, SM_101a + template = true> + __device__ static inline void tcgen05_ld_16x128b_pack_16b( + B32 (&out)[2], + uint32_t taddr); + +tcgen05.ld.sync.aligned.16x128b.x2.b32 +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +.. code:: cuda + + // tcgen05.ld.sync.aligned.16x128b.x2.b32 out, [taddr]; // PTX ISA 86, SM_100a, SM_101a + template = true> + __device__ static inline void tcgen05_ld_16x128b( + B32 (&out)[4], + uint32_t taddr); + +tcgen05.ld.sync.aligned.16x128b.x2.pack::16b.b32 +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +.. 
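+
+For the ``tcgen05.ld`` wrappers, the ``.xN`` count is selected by the extent of the output array and the element type must be a 32-bit type such as ``uint32_t``. A sketch, assuming ``taddr`` points at allocated tensor memory and the code path is compiled for an architecture that provides these instructions (sm_100a/sm_101a):
+
+.. code:: cuda
+
+   #include <cuda/ptx>
+
+   __device__ void load_two_words(uint32_t taddr)
+   {
+     uint32_t out[2]; // extent 2 selects the .16x64b.x2 overload
+     cuda::ptx::tcgen05_ld_16x64b(out, taddr);
+     // out[0]/out[1] hold the loaded words; a tcgen05.wait::ld is normally
+     // issued before the surrounding code relies on their completion.
+   }
+
+..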
code:: cuda + + // tcgen05.ld.sync.aligned.16x128b.x2.pack::16b.b32 out, [taddr]; // PTX ISA 86, SM_100a, SM_101a + template = true> + __device__ static inline void tcgen05_ld_16x128b_pack_16b( + B32 (&out)[4], + uint32_t taddr); + +tcgen05.ld.sync.aligned.16x128b.x4.b32 +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +.. code:: cuda + + // tcgen05.ld.sync.aligned.16x128b.x4.b32 out, [taddr]; // PTX ISA 86, SM_100a, SM_101a + template = true> + __device__ static inline void tcgen05_ld_16x128b( + B32 (&out)[8], + uint32_t taddr); + +tcgen05.ld.sync.aligned.16x128b.x4.pack::16b.b32 +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +.. code:: cuda + + // tcgen05.ld.sync.aligned.16x128b.x4.pack::16b.b32 out, [taddr]; // PTX ISA 86, SM_100a, SM_101a + template = true> + __device__ static inline void tcgen05_ld_16x128b_pack_16b( + B32 (&out)[8], + uint32_t taddr); + +tcgen05.ld.sync.aligned.16x128b.x8.b32 +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +.. code:: cuda + + // tcgen05.ld.sync.aligned.16x128b.x8.b32 out, [taddr]; // PTX ISA 86, SM_100a, SM_101a + template = true> + __device__ static inline void tcgen05_ld_16x128b( + B32 (&out)[16], + uint32_t taddr); + +tcgen05.ld.sync.aligned.16x128b.x8.pack::16b.b32 +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +.. code:: cuda + + // tcgen05.ld.sync.aligned.16x128b.x8.pack::16b.b32 out, [taddr]; // PTX ISA 86, SM_100a, SM_101a + template = true> + __device__ static inline void tcgen05_ld_16x128b_pack_16b( + B32 (&out)[16], + uint32_t taddr); + +tcgen05.ld.sync.aligned.16x128b.x16.b32 +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +.. code:: cuda + + // tcgen05.ld.sync.aligned.16x128b.x16.b32 out, [taddr]; // PTX ISA 86, SM_100a, SM_101a + template = true> + __device__ static inline void tcgen05_ld_16x128b( + B32 (&out)[32], + uint32_t taddr); + +tcgen05.ld.sync.aligned.16x128b.x16.pack::16b.b32 +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +.. code:: cuda + + // tcgen05.ld.sync.aligned.16x128b.x16.pack::16b.b32 out, [taddr]; // PTX ISA 86, SM_100a, SM_101a + template = true> + __device__ static inline void tcgen05_ld_16x128b_pack_16b( + B32 (&out)[32], + uint32_t taddr); + +tcgen05.ld.sync.aligned.16x128b.x32.b32 +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +.. code:: cuda + + // tcgen05.ld.sync.aligned.16x128b.x32.b32 out, [taddr]; // PTX ISA 86, SM_100a, SM_101a + template = true> + __device__ static inline void tcgen05_ld_16x128b( + B32 (&out)[64], + uint32_t taddr); + +tcgen05.ld.sync.aligned.16x128b.x32.pack::16b.b32 +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +.. code:: cuda + + // tcgen05.ld.sync.aligned.16x128b.x32.pack::16b.b32 out, [taddr]; // PTX ISA 86, SM_100a, SM_101a + template = true> + __device__ static inline void tcgen05_ld_16x128b_pack_16b( + B32 (&out)[64], + uint32_t taddr); + +tcgen05.ld.sync.aligned.16x128b.x64.b32 +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +.. code:: cuda + + // tcgen05.ld.sync.aligned.16x128b.x64.b32 out, [taddr]; // PTX ISA 86, SM_100a, SM_101a + template = true> + __device__ static inline void tcgen05_ld_16x128b( + B32 (&out)[128], + uint32_t taddr); + +tcgen05.ld.sync.aligned.16x128b.x64.pack::16b.b32 +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +.. code:: cuda + + // tcgen05.ld.sync.aligned.16x128b.x64.pack::16b.b32 out, [taddr]; // PTX ISA 86, SM_100a, SM_101a + template = true> + __device__ static inline void tcgen05_ld_16x128b_pack_16b( + B32 (&out)[128], + uint32_t taddr); + +tcgen05.ld.sync.aligned.16x256b.x1.b32 +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +.. 
code:: cuda + + // tcgen05.ld.sync.aligned.16x256b.x1.b32 out, [taddr]; // PTX ISA 86, SM_100a, SM_101a + template = true> + __device__ static inline void tcgen05_ld_16x256b( + B32 (&out)[4], + uint32_t taddr); + +tcgen05.ld.sync.aligned.16x256b.x1.pack::16b.b32 +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +.. code:: cuda + + // tcgen05.ld.sync.aligned.16x256b.x1.pack::16b.b32 out, [taddr]; // PTX ISA 86, SM_100a, SM_101a + template = true> + __device__ static inline void tcgen05_ld_16x256b_pack_16b( + B32 (&out)[4], + uint32_t taddr); + +tcgen05.ld.sync.aligned.16x256b.x2.b32 +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +.. code:: cuda + + // tcgen05.ld.sync.aligned.16x256b.x2.b32 out, [taddr]; // PTX ISA 86, SM_100a, SM_101a + template = true> + __device__ static inline void tcgen05_ld_16x256b( + B32 (&out)[8], + uint32_t taddr); + +tcgen05.ld.sync.aligned.16x256b.x2.pack::16b.b32 +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +.. code:: cuda + + // tcgen05.ld.sync.aligned.16x256b.x2.pack::16b.b32 out, [taddr]; // PTX ISA 86, SM_100a, SM_101a + template = true> + __device__ static inline void tcgen05_ld_16x256b_pack_16b( + B32 (&out)[8], + uint32_t taddr); + +tcgen05.ld.sync.aligned.16x256b.x4.b32 +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +.. code:: cuda + + // tcgen05.ld.sync.aligned.16x256b.x4.b32 out, [taddr]; // PTX ISA 86, SM_100a, SM_101a + template = true> + __device__ static inline void tcgen05_ld_16x256b( + B32 (&out)[16], + uint32_t taddr); + +tcgen05.ld.sync.aligned.16x256b.x4.pack::16b.b32 +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +.. code:: cuda + + // tcgen05.ld.sync.aligned.16x256b.x4.pack::16b.b32 out, [taddr]; // PTX ISA 86, SM_100a, SM_101a + template = true> + __device__ static inline void tcgen05_ld_16x256b_pack_16b( + B32 (&out)[16], + uint32_t taddr); + +tcgen05.ld.sync.aligned.16x256b.x8.b32 +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +.. code:: cuda + + // tcgen05.ld.sync.aligned.16x256b.x8.b32 out, [taddr]; // PTX ISA 86, SM_100a, SM_101a + template = true> + __device__ static inline void tcgen05_ld_16x256b( + B32 (&out)[32], + uint32_t taddr); + +tcgen05.ld.sync.aligned.16x256b.x8.pack::16b.b32 +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +.. code:: cuda + + // tcgen05.ld.sync.aligned.16x256b.x8.pack::16b.b32 out, [taddr]; // PTX ISA 86, SM_100a, SM_101a + template = true> + __device__ static inline void tcgen05_ld_16x256b_pack_16b( + B32 (&out)[32], + uint32_t taddr); + +tcgen05.ld.sync.aligned.16x256b.x16.b32 +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +.. code:: cuda + + // tcgen05.ld.sync.aligned.16x256b.x16.b32 out, [taddr]; // PTX ISA 86, SM_100a, SM_101a + template = true> + __device__ static inline void tcgen05_ld_16x256b( + B32 (&out)[64], + uint32_t taddr); + +tcgen05.ld.sync.aligned.16x256b.x16.pack::16b.b32 +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +.. code:: cuda + + // tcgen05.ld.sync.aligned.16x256b.x16.pack::16b.b32 out, [taddr]; // PTX ISA 86, SM_100a, SM_101a + template = true> + __device__ static inline void tcgen05_ld_16x256b_pack_16b( + B32 (&out)[64], + uint32_t taddr); + +tcgen05.ld.sync.aligned.16x256b.x32.b32 +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +.. code:: cuda + + // tcgen05.ld.sync.aligned.16x256b.x32.b32 out, [taddr]; // PTX ISA 86, SM_100a, SM_101a + template = true> + __device__ static inline void tcgen05_ld_16x256b( + B32 (&out)[128], + uint32_t taddr); + +tcgen05.ld.sync.aligned.16x256b.x32.pack::16b.b32 +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +.. 
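+
+The ``_pack_16b`` overloads are called the same way; ``.pack::16b`` returns 16-bit elements packed pairwise into the 32-bit output registers, so only the wrapper name changes (sketch, same assumptions as above):
+
+.. code:: cuda
+
+   #include <cuda/ptx>
+
+   __device__ void load_packed(uint32_t taddr)
+   {
+     uint32_t out[4]; // extent 4 selects the .16x256b.x1 overload
+     cuda::ptx::tcgen05_ld_16x256b_pack_16b(out, taddr);
+   }
+
+..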
code:: cuda + + // tcgen05.ld.sync.aligned.16x256b.x32.pack::16b.b32 out, [taddr]; // PTX ISA 86, SM_100a, SM_101a + template = true> + __device__ static inline void tcgen05_ld_16x256b_pack_16b( + B32 (&out)[128], + uint32_t taddr); + +tcgen05.ld.sync.aligned.32x32b.x1.b32 +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +.. code:: cuda + + // tcgen05.ld.sync.aligned.32x32b.x1.b32 out, [taddr]; // PTX ISA 86, SM_100a, SM_101a + template = true> + __device__ static inline void tcgen05_ld_32x32b( + B32 (&out)[1], + uint32_t taddr); + +tcgen05.ld.sync.aligned.32x32b.x1.pack::16b.b32 +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +.. code:: cuda + + // tcgen05.ld.sync.aligned.32x32b.x1.pack::16b.b32 out, [taddr]; // PTX ISA 86, SM_100a, SM_101a + template = true> + __device__ static inline void tcgen05_ld_32x32b_pack_16b( + B32 (&out)[1], + uint32_t taddr); + +tcgen05.ld.sync.aligned.32x32b.x2.b32 +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +.. code:: cuda + + // tcgen05.ld.sync.aligned.32x32b.x2.b32 out, [taddr]; // PTX ISA 86, SM_100a, SM_101a + template = true> + __device__ static inline void tcgen05_ld_32x32b( + B32 (&out)[2], + uint32_t taddr); + +tcgen05.ld.sync.aligned.32x32b.x2.pack::16b.b32 +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +.. code:: cuda + + // tcgen05.ld.sync.aligned.32x32b.x2.pack::16b.b32 out, [taddr]; // PTX ISA 86, SM_100a, SM_101a + template = true> + __device__ static inline void tcgen05_ld_32x32b_pack_16b( + B32 (&out)[2], + uint32_t taddr); + +tcgen05.ld.sync.aligned.32x32b.x4.b32 +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +.. code:: cuda + + // tcgen05.ld.sync.aligned.32x32b.x4.b32 out, [taddr]; // PTX ISA 86, SM_100a, SM_101a + template = true> + __device__ static inline void tcgen05_ld_32x32b( + B32 (&out)[4], + uint32_t taddr); + +tcgen05.ld.sync.aligned.32x32b.x4.pack::16b.b32 +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +.. code:: cuda + + // tcgen05.ld.sync.aligned.32x32b.x4.pack::16b.b32 out, [taddr]; // PTX ISA 86, SM_100a, SM_101a + template = true> + __device__ static inline void tcgen05_ld_32x32b_pack_16b( + B32 (&out)[4], + uint32_t taddr); + +tcgen05.ld.sync.aligned.32x32b.x8.b32 +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +.. code:: cuda + + // tcgen05.ld.sync.aligned.32x32b.x8.b32 out, [taddr]; // PTX ISA 86, SM_100a, SM_101a + template = true> + __device__ static inline void tcgen05_ld_32x32b( + B32 (&out)[8], + uint32_t taddr); + +tcgen05.ld.sync.aligned.32x32b.x8.pack::16b.b32 +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +.. code:: cuda + + // tcgen05.ld.sync.aligned.32x32b.x8.pack::16b.b32 out, [taddr]; // PTX ISA 86, SM_100a, SM_101a + template = true> + __device__ static inline void tcgen05_ld_32x32b_pack_16b( + B32 (&out)[8], + uint32_t taddr); + +tcgen05.ld.sync.aligned.32x32b.x16.b32 +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +.. code:: cuda + + // tcgen05.ld.sync.aligned.32x32b.x16.b32 out, [taddr]; // PTX ISA 86, SM_100a, SM_101a + template = true> + __device__ static inline void tcgen05_ld_32x32b( + B32 (&out)[16], + uint32_t taddr); + +tcgen05.ld.sync.aligned.32x32b.x16.pack::16b.b32 +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +.. code:: cuda + + // tcgen05.ld.sync.aligned.32x32b.x16.pack::16b.b32 out, [taddr]; // PTX ISA 86, SM_100a, SM_101a + template = true> + __device__ static inline void tcgen05_ld_32x32b_pack_16b( + B32 (&out)[16], + uint32_t taddr); + +tcgen05.ld.sync.aligned.32x32b.x32.b32 +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +.. 
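+
+The ``32x32b`` shape follows the same calling convention; only the wrapper name and the tensor-memory access pattern differ (sketch):
+
+.. code:: cuda
+
+   #include <cuda/ptx>
+
+   __device__ void load_32x32b(uint32_t taddr)
+   {
+     uint32_t out[16]; // extent 16 selects the .32x32b.x16 overload
+     cuda::ptx::tcgen05_ld_32x32b(out, taddr);
+   }
+
+..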
code:: cuda + + // tcgen05.ld.sync.aligned.32x32b.x32.b32 out, [taddr]; // PTX ISA 86, SM_100a, SM_101a + template = true> + __device__ static inline void tcgen05_ld_32x32b( + B32 (&out)[32], + uint32_t taddr); + +tcgen05.ld.sync.aligned.32x32b.x32.pack::16b.b32 +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +.. code:: cuda + + // tcgen05.ld.sync.aligned.32x32b.x32.pack::16b.b32 out, [taddr]; // PTX ISA 86, SM_100a, SM_101a + template = true> + __device__ static inline void tcgen05_ld_32x32b_pack_16b( + B32 (&out)[32], + uint32_t taddr); + +tcgen05.ld.sync.aligned.32x32b.x64.b32 +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +.. code:: cuda + + // tcgen05.ld.sync.aligned.32x32b.x64.b32 out, [taddr]; // PTX ISA 86, SM_100a, SM_101a + template = true> + __device__ static inline void tcgen05_ld_32x32b( + B32 (&out)[64], + uint32_t taddr); + +tcgen05.ld.sync.aligned.32x32b.x64.pack::16b.b32 +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +.. code:: cuda + + // tcgen05.ld.sync.aligned.32x32b.x64.pack::16b.b32 out, [taddr]; // PTX ISA 86, SM_100a, SM_101a + template = true> + __device__ static inline void tcgen05_ld_32x32b_pack_16b( + B32 (&out)[64], + uint32_t taddr); + +tcgen05.ld.sync.aligned.32x32b.x128.b32 +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +.. code:: cuda + + // tcgen05.ld.sync.aligned.32x32b.x128.b32 out, [taddr]; // PTX ISA 86, SM_100a, SM_101a + template = true> + __device__ static inline void tcgen05_ld_32x32b( + B32 (&out)[128], + uint32_t taddr); + +tcgen05.ld.sync.aligned.32x32b.x128.pack::16b.b32 +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +.. code:: cuda + + // tcgen05.ld.sync.aligned.32x32b.x128.pack::16b.b32 out, [taddr]; // PTX ISA 86, SM_100a, SM_101a + template = true> + __device__ static inline void tcgen05_ld_32x32b_pack_16b( + B32 (&out)[128], + uint32_t taddr); + +tcgen05.ld.sync.aligned.16x32bx2.x1.b32 +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +.. code:: cuda + + // tcgen05.ld.sync.aligned.16x32bx2.x1.b32 out, [taddr], immHalfSplitoff; // PTX ISA 86, SM_100a, SM_101a + template = true, int N32> + __device__ static inline void tcgen05_ld_16x32bx2( + B32 (&out)[1], + uint32_t taddr, + cuda::ptx::n32_t immHalfSplitoff); + +tcgen05.ld.sync.aligned.16x32bx2.x1.pack::16b.b32 +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +.. code:: cuda + + // tcgen05.ld.sync.aligned.16x32bx2.x1.pack::16b.b32 out, [taddr], immHalfSplitoff; // PTX ISA 86, SM_100a, SM_101a + template = true, int N32> + __device__ static inline void tcgen05_ld_16x32bx2_pack_16b( + B32 (&out)[1], + uint32_t taddr, + cuda::ptx::n32_t immHalfSplitoff); + +tcgen05.ld.sync.aligned.16x32bx2.x2.b32 +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +.. code:: cuda + + // tcgen05.ld.sync.aligned.16x32bx2.x2.b32 out, [taddr], immHalfSplitoff; // PTX ISA 86, SM_100a, SM_101a + template = true, int N32> + __device__ static inline void tcgen05_ld_16x32bx2( + B32 (&out)[2], + uint32_t taddr, + cuda::ptx::n32_t immHalfSplitoff); + +tcgen05.ld.sync.aligned.16x32bx2.x2.pack::16b.b32 +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +.. code:: cuda + + // tcgen05.ld.sync.aligned.16x32bx2.x2.pack::16b.b32 out, [taddr], immHalfSplitoff; // PTX ISA 86, SM_100a, SM_101a + template = true, int N32> + __device__ static inline void tcgen05_ld_16x32bx2_pack_16b( + B32 (&out)[2], + uint32_t taddr, + cuda::ptx::n32_t immHalfSplitoff); + +tcgen05.ld.sync.aligned.16x32bx2.x4.b32 +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +.. 
code:: cuda + + // tcgen05.ld.sync.aligned.16x32bx2.x4.b32 out, [taddr], immHalfSplitoff; // PTX ISA 86, SM_100a, SM_101a + template = true, int N32> + __device__ static inline void tcgen05_ld_16x32bx2( + B32 (&out)[4], + uint32_t taddr, + cuda::ptx::n32_t immHalfSplitoff); + +tcgen05.ld.sync.aligned.16x32bx2.x4.pack::16b.b32 +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +.. code:: cuda + + // tcgen05.ld.sync.aligned.16x32bx2.x4.pack::16b.b32 out, [taddr], immHalfSplitoff; // PTX ISA 86, SM_100a, SM_101a + template = true, int N32> + __device__ static inline void tcgen05_ld_16x32bx2_pack_16b( + B32 (&out)[4], + uint32_t taddr, + cuda::ptx::n32_t immHalfSplitoff); + +tcgen05.ld.sync.aligned.16x32bx2.x8.b32 +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +.. code:: cuda + + // tcgen05.ld.sync.aligned.16x32bx2.x8.b32 out, [taddr], immHalfSplitoff; // PTX ISA 86, SM_100a, SM_101a + template = true, int N32> + __device__ static inline void tcgen05_ld_16x32bx2( + B32 (&out)[8], + uint32_t taddr, + cuda::ptx::n32_t immHalfSplitoff); + +tcgen05.ld.sync.aligned.16x32bx2.x8.pack::16b.b32 +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +.. code:: cuda + + // tcgen05.ld.sync.aligned.16x32bx2.x8.pack::16b.b32 out, [taddr], immHalfSplitoff; // PTX ISA 86, SM_100a, SM_101a + template = true, int N32> + __device__ static inline void tcgen05_ld_16x32bx2_pack_16b( + B32 (&out)[8], + uint32_t taddr, + cuda::ptx::n32_t immHalfSplitoff); + +tcgen05.ld.sync.aligned.16x32bx2.x16.b32 +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +.. code:: cuda + + // tcgen05.ld.sync.aligned.16x32bx2.x16.b32 out, [taddr], immHalfSplitoff; // PTX ISA 86, SM_100a, SM_101a + template = true, int N32> + __device__ static inline void tcgen05_ld_16x32bx2( + B32 (&out)[16], + uint32_t taddr, + cuda::ptx::n32_t immHalfSplitoff); + +tcgen05.ld.sync.aligned.16x32bx2.x16.pack::16b.b32 +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +.. code:: cuda + + // tcgen05.ld.sync.aligned.16x32bx2.x16.pack::16b.b32 out, [taddr], immHalfSplitoff; // PTX ISA 86, SM_100a, SM_101a + template = true, int N32> + __device__ static inline void tcgen05_ld_16x32bx2_pack_16b( + B32 (&out)[16], + uint32_t taddr, + cuda::ptx::n32_t immHalfSplitoff); + +tcgen05.ld.sync.aligned.16x32bx2.x32.b32 +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +.. code:: cuda + + // tcgen05.ld.sync.aligned.16x32bx2.x32.b32 out, [taddr], immHalfSplitoff; // PTX ISA 86, SM_100a, SM_101a + template = true, int N32> + __device__ static inline void tcgen05_ld_16x32bx2( + B32 (&out)[32], + uint32_t taddr, + cuda::ptx::n32_t immHalfSplitoff); + +tcgen05.ld.sync.aligned.16x32bx2.x32.pack::16b.b32 +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +.. code:: cuda + + // tcgen05.ld.sync.aligned.16x32bx2.x32.pack::16b.b32 out, [taddr], immHalfSplitoff; // PTX ISA 86, SM_100a, SM_101a + template = true, int N32> + __device__ static inline void tcgen05_ld_16x32bx2_pack_16b( + B32 (&out)[32], + uint32_t taddr, + cuda::ptx::n32_t immHalfSplitoff); + +tcgen05.ld.sync.aligned.16x32bx2.x64.b32 +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +.. code:: cuda + + // tcgen05.ld.sync.aligned.16x32bx2.x64.b32 out, [taddr], immHalfSplitoff; // PTX ISA 86, SM_100a, SM_101a + template = true, int N32> + __device__ static inline void tcgen05_ld_16x32bx2( + B32 (&out)[64], + uint32_t taddr, + cuda::ptx::n32_t immHalfSplitoff); + +tcgen05.ld.sync.aligned.16x32bx2.x64.pack::16b.b32 +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +.. 
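+
+The ``16x32bx2`` shape additionally takes an immediate half-split offset as a compile-time ``cuda::ptx::n32_t`` value. A sketch; the offset ``16`` is an arbitrary placeholder:
+
+.. code:: cuda
+
+   #include <cuda/ptx>
+
+   __device__ void load_split(uint32_t taddr)
+   {
+     uint32_t out[2]; // extent 2 selects the .16x32bx2.x2 overload
+     cuda::ptx::tcgen05_ld_16x32bx2(out, taddr, cuda::ptx::n32_t<16>{});
+   }
+
+..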
code:: cuda + + // tcgen05.ld.sync.aligned.16x32bx2.x64.pack::16b.b32 out, [taddr], immHalfSplitoff; // PTX ISA 86, SM_100a, SM_101a + template = true, int N32> + __device__ static inline void tcgen05_ld_16x32bx2_pack_16b( + B32 (&out)[64], + uint32_t taddr, + cuda::ptx::n32_t immHalfSplitoff); + +tcgen05.ld.sync.aligned.16x32bx2.x128.b32 +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +.. code:: cuda + + // tcgen05.ld.sync.aligned.16x32bx2.x128.b32 out, [taddr], immHalfSplitoff; // PTX ISA 86, SM_100a, SM_101a + template = true, int N32> + __device__ static inline void tcgen05_ld_16x32bx2( + B32 (&out)[128], + uint32_t taddr, + cuda::ptx::n32_t immHalfSplitoff); + +tcgen05.ld.sync.aligned.16x32bx2.x128.pack::16b.b32 +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +.. code:: cuda + + // tcgen05.ld.sync.aligned.16x32bx2.x128.pack::16b.b32 out, [taddr], immHalfSplitoff; // PTX ISA 86, SM_100a, SM_101a + template = true, int N32> + __device__ static inline void tcgen05_ld_16x32bx2_pack_16b( + B32 (&out)[128], + uint32_t taddr, + cuda::ptx::n32_t immHalfSplitoff); diff --git a/docs/libcudacxx/ptx/instructions/generated/tcgen05_mma.rst b/docs/libcudacxx/ptx/instructions/generated/tcgen05_mma.rst new file mode 100644 index 00000000000..aa5a1675193 --- /dev/null +++ b/docs/libcudacxx/ptx/instructions/generated/tcgen05_mma.rst @@ -0,0 +1,2378 @@ +.. + This file was automatically generated. Do not edit. + +tcgen05.mma.cta_group::1.kind::f16 +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +.. code:: cuda + + // tcgen05.mma.cta_group.kind [d_tmem], a_desc, b_desc, idesc, disable_output_lane, enable_input_d, scale_input_d; // PTX ISA 86, SM_100a + // .kind = { .kind::f16, .kind::tf32 } + // .cta_group = { .cta_group::1, .cta_group::2 } + template + __device__ static inline void tcgen05_mma( + cuda::ptx::kind_t kind, + cuda::ptx::cta_group_t cta_group, + uint32_t d_tmem, + uint64_t a_desc, + uint64_t b_desc, + uint32_t idesc, + const uint32_t (&disable_output_lane)[4], + bool enable_input_d, + cuda::ptx::n32_t scale_input_d); + +tcgen05.mma.cta_group::2.kind::f16 +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +.. code:: cuda + + // tcgen05.mma.cta_group.kind [d_tmem], a_desc, b_desc, idesc, disable_output_lane, enable_input_d, scale_input_d; // PTX ISA 86, SM_100a + // .kind = { .kind::f16, .kind::tf32 } + // .cta_group = { .cta_group::1, .cta_group::2 } + template + __device__ static inline void tcgen05_mma( + cuda::ptx::kind_t kind, + cuda::ptx::cta_group_t cta_group, + uint32_t d_tmem, + uint64_t a_desc, + uint64_t b_desc, + uint32_t idesc, + const uint32_t (&disable_output_lane)[4], + bool enable_input_d, + cuda::ptx::n32_t scale_input_d); + +tcgen05.mma.cta_group::1.kind::tf32 +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +.. code:: cuda + + // tcgen05.mma.cta_group.kind [d_tmem], a_desc, b_desc, idesc, disable_output_lane, enable_input_d, scale_input_d; // PTX ISA 86, SM_100a + // .kind = { .kind::f16, .kind::tf32 } + // .cta_group = { .cta_group::1, .cta_group::2 } + template + __device__ static inline void tcgen05_mma( + cuda::ptx::kind_t kind, + cuda::ptx::cta_group_t cta_group, + uint32_t d_tmem, + uint64_t a_desc, + uint64_t b_desc, + uint32_t idesc, + const uint32_t (&disable_output_lane)[4], + bool enable_input_d, + cuda::ptx::n32_t scale_input_d); + +tcgen05.mma.cta_group::2.kind::tf32 +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +.. 
code:: cuda + + // tcgen05.mma.cta_group.kind [d_tmem], a_desc, b_desc, idesc, disable_output_lane, enable_input_d, scale_input_d; // PTX ISA 86, SM_100a + // .kind = { .kind::f16, .kind::tf32 } + // .cta_group = { .cta_group::1, .cta_group::2 } + template + __device__ static inline void tcgen05_mma( + cuda::ptx::kind_t kind, + cuda::ptx::cta_group_t cta_group, + uint32_t d_tmem, + uint64_t a_desc, + uint64_t b_desc, + uint32_t idesc, + const uint32_t (&disable_output_lane)[4], + bool enable_input_d, + cuda::ptx::n32_t scale_input_d); + +tcgen05.mma.cta_group::1.kind::f16 +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +.. code:: cuda + + // tcgen05.mma.cta_group.kind [d_tmem], a_desc, b_desc, idesc, disable_output_lane, enable_input_d; // PTX ISA 86, SM_100a, SM_101a + // .kind = { .kind::f16, .kind::tf32, .kind::f8f6f4, .kind::i8 } + // .cta_group = { .cta_group::1, .cta_group::2 } + template + __device__ static inline void tcgen05_mma( + cuda::ptx::kind_t kind, + cuda::ptx::cta_group_t cta_group, + uint32_t d_tmem, + uint64_t a_desc, + uint64_t b_desc, + uint32_t idesc, + const uint32_t (&disable_output_lane)[4], + bool enable_input_d); + +tcgen05.mma.cta_group::2.kind::f16 +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +.. code:: cuda + + // tcgen05.mma.cta_group.kind [d_tmem], a_desc, b_desc, idesc, disable_output_lane, enable_input_d; // PTX ISA 86, SM_100a, SM_101a + // .kind = { .kind::f16, .kind::tf32, .kind::f8f6f4, .kind::i8 } + // .cta_group = { .cta_group::1, .cta_group::2 } + template + __device__ static inline void tcgen05_mma( + cuda::ptx::kind_t kind, + cuda::ptx::cta_group_t cta_group, + uint32_t d_tmem, + uint64_t a_desc, + uint64_t b_desc, + uint32_t idesc, + const uint32_t (&disable_output_lane)[4], + bool enable_input_d); + +tcgen05.mma.cta_group::1.kind::tf32 +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +.. code:: cuda + + // tcgen05.mma.cta_group.kind [d_tmem], a_desc, b_desc, idesc, disable_output_lane, enable_input_d; // PTX ISA 86, SM_100a, SM_101a + // .kind = { .kind::f16, .kind::tf32, .kind::f8f6f4, .kind::i8 } + // .cta_group = { .cta_group::1, .cta_group::2 } + template + __device__ static inline void tcgen05_mma( + cuda::ptx::kind_t kind, + cuda::ptx::cta_group_t cta_group, + uint32_t d_tmem, + uint64_t a_desc, + uint64_t b_desc, + uint32_t idesc, + const uint32_t (&disable_output_lane)[4], + bool enable_input_d); + +tcgen05.mma.cta_group::2.kind::tf32 +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +.. code:: cuda + + // tcgen05.mma.cta_group.kind [d_tmem], a_desc, b_desc, idesc, disable_output_lane, enable_input_d; // PTX ISA 86, SM_100a, SM_101a + // .kind = { .kind::f16, .kind::tf32, .kind::f8f6f4, .kind::i8 } + // .cta_group = { .cta_group::1, .cta_group::2 } + template + __device__ static inline void tcgen05_mma( + cuda::ptx::kind_t kind, + cuda::ptx::cta_group_t cta_group, + uint32_t d_tmem, + uint64_t a_desc, + uint64_t b_desc, + uint32_t idesc, + const uint32_t (&disable_output_lane)[4], + bool enable_input_d); + +tcgen05.mma.cta_group::1.kind::f8f6f4 +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +.. 
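+
+A sketch of issuing the descriptor-based ``tcgen05.mma`` overload shown above. ``d_tmem``, the matrix descriptors and ``idesc`` are assumed to have been prepared elsewhere; ``cuda::ptx::kind_f16`` and ``cuda::ptx::cta_group_1`` are the tag values expected to accompany these overloads (assumed names), and ``scale_input_d`` is a compile-time immediate whose value here is a placeholder:
+
+.. code:: cuda
+
+   #include <cuda/ptx>
+
+   __device__ void issue_mma(uint32_t d_tmem, uint64_t a_desc, uint64_t b_desc, uint32_t idesc)
+   {
+     const uint32_t disable_output_lane[4] = {0, 0, 0, 0}; // leave all output lanes enabled
+     cuda::ptx::tcgen05_mma(cuda::ptx::kind_f16, cuda::ptx::cta_group_1,
+                            d_tmem, a_desc, b_desc, idesc,
+                            disable_output_lane,
+                            true,                   // enable_input_d: accumulate into D
+                            cuda::ptx::n32_t<1>{}); // scale_input_d (placeholder immediate)
+   }
+
+..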
code:: cuda + + // tcgen05.mma.cta_group.kind [d_tmem], a_desc, b_desc, idesc, disable_output_lane, enable_input_d; // PTX ISA 86, SM_100a, SM_101a + // .kind = { .kind::f16, .kind::tf32, .kind::f8f6f4, .kind::i8 } + // .cta_group = { .cta_group::1, .cta_group::2 } + template + __device__ static inline void tcgen05_mma( + cuda::ptx::kind_t kind, + cuda::ptx::cta_group_t cta_group, + uint32_t d_tmem, + uint64_t a_desc, + uint64_t b_desc, + uint32_t idesc, + const uint32_t (&disable_output_lane)[4], + bool enable_input_d); + +tcgen05.mma.cta_group::2.kind::f8f6f4 +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +.. code:: cuda + + // tcgen05.mma.cta_group.kind [d_tmem], a_desc, b_desc, idesc, disable_output_lane, enable_input_d; // PTX ISA 86, SM_100a, SM_101a + // .kind = { .kind::f16, .kind::tf32, .kind::f8f6f4, .kind::i8 } + // .cta_group = { .cta_group::1, .cta_group::2 } + template + __device__ static inline void tcgen05_mma( + cuda::ptx::kind_t kind, + cuda::ptx::cta_group_t cta_group, + uint32_t d_tmem, + uint64_t a_desc, + uint64_t b_desc, + uint32_t idesc, + const uint32_t (&disable_output_lane)[4], + bool enable_input_d); + +tcgen05.mma.cta_group::1.kind::i8 +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +.. code:: cuda + + // tcgen05.mma.cta_group.kind [d_tmem], a_desc, b_desc, idesc, disable_output_lane, enable_input_d; // PTX ISA 86, SM_100a, SM_101a + // .kind = { .kind::f16, .kind::tf32, .kind::f8f6f4, .kind::i8 } + // .cta_group = { .cta_group::1, .cta_group::2 } + template + __device__ static inline void tcgen05_mma( + cuda::ptx::kind_t kind, + cuda::ptx::cta_group_t cta_group, + uint32_t d_tmem, + uint64_t a_desc, + uint64_t b_desc, + uint32_t idesc, + const uint32_t (&disable_output_lane)[4], + bool enable_input_d); + +tcgen05.mma.cta_group::2.kind::i8 +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +.. code:: cuda + + // tcgen05.mma.cta_group.kind [d_tmem], a_desc, b_desc, idesc, disable_output_lane, enable_input_d; // PTX ISA 86, SM_100a, SM_101a + // .kind = { .kind::f16, .kind::tf32, .kind::f8f6f4, .kind::i8 } + // .cta_group = { .cta_group::1, .cta_group::2 } + template + __device__ static inline void tcgen05_mma( + cuda::ptx::kind_t kind, + cuda::ptx::cta_group_t cta_group, + uint32_t d_tmem, + uint64_t a_desc, + uint64_t b_desc, + uint32_t idesc, + const uint32_t (&disable_output_lane)[4], + bool enable_input_d); + +tcgen05.mma.cta_group::1.kind::f16 +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +.. code:: cuda + + // tcgen05.mma.cta_group.kind [d_tmem], a_desc, b_desc, idesc, enable_input_d, scale_input_d; // PTX ISA 86, SM_100a + // .kind = { .kind::f16, .kind::tf32 } + // .cta_group = { .cta_group::1, .cta_group::2 } + template + __device__ static inline void tcgen05_mma( + cuda::ptx::kind_t kind, + cuda::ptx::cta_group_t cta_group, + uint32_t d_tmem, + uint64_t a_desc, + uint64_t b_desc, + uint32_t idesc, + bool enable_input_d, + cuda::ptx::n32_t scale_input_d); + +tcgen05.mma.cta_group::2.kind::f16 +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +.. code:: cuda + + // tcgen05.mma.cta_group.kind [d_tmem], a_desc, b_desc, idesc, enable_input_d, scale_input_d; // PTX ISA 86, SM_100a + // .kind = { .kind::f16, .kind::tf32 } + // .cta_group = { .cta_group::1, .cta_group::2 } + template + __device__ static inline void tcgen05_mma( + cuda::ptx::kind_t kind, + cuda::ptx::cta_group_t cta_group, + uint32_t d_tmem, + uint64_t a_desc, + uint64_t b_desc, + uint32_t idesc, + bool enable_input_d, + cuda::ptx::n32_t scale_input_d); + +tcgen05.mma.cta_group::1.kind::tf32 +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +.. 
code:: cuda + + // tcgen05.mma.cta_group.kind [d_tmem], a_desc, b_desc, idesc, enable_input_d, scale_input_d; // PTX ISA 86, SM_100a + // .kind = { .kind::f16, .kind::tf32 } + // .cta_group = { .cta_group::1, .cta_group::2 } + template + __device__ static inline void tcgen05_mma( + cuda::ptx::kind_t kind, + cuda::ptx::cta_group_t cta_group, + uint32_t d_tmem, + uint64_t a_desc, + uint64_t b_desc, + uint32_t idesc, + bool enable_input_d, + cuda::ptx::n32_t scale_input_d); + +tcgen05.mma.cta_group::2.kind::tf32 +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +.. code:: cuda + + // tcgen05.mma.cta_group.kind [d_tmem], a_desc, b_desc, idesc, enable_input_d, scale_input_d; // PTX ISA 86, SM_100a + // .kind = { .kind::f16, .kind::tf32 } + // .cta_group = { .cta_group::1, .cta_group::2 } + template + __device__ static inline void tcgen05_mma( + cuda::ptx::kind_t kind, + cuda::ptx::cta_group_t cta_group, + uint32_t d_tmem, + uint64_t a_desc, + uint64_t b_desc, + uint32_t idesc, + bool enable_input_d, + cuda::ptx::n32_t scale_input_d); + +tcgen05.mma.cta_group::1.kind::f16 +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +.. code:: cuda + + // tcgen05.mma.cta_group.kind [d_tmem], a_desc, b_desc, idesc, enable_input_d; // PTX ISA 86, SM_100a, SM_101a + // .kind = { .kind::f16, .kind::tf32, .kind::f8f6f4, .kind::i8 } + // .cta_group = { .cta_group::1, .cta_group::2 } + template + __device__ static inline void tcgen05_mma( + cuda::ptx::kind_t kind, + cuda::ptx::cta_group_t cta_group, + uint32_t d_tmem, + uint64_t a_desc, + uint64_t b_desc, + uint32_t idesc, + bool enable_input_d); + +tcgen05.mma.cta_group::2.kind::f16 +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +.. code:: cuda + + // tcgen05.mma.cta_group.kind [d_tmem], a_desc, b_desc, idesc, enable_input_d; // PTX ISA 86, SM_100a, SM_101a + // .kind = { .kind::f16, .kind::tf32, .kind::f8f6f4, .kind::i8 } + // .cta_group = { .cta_group::1, .cta_group::2 } + template + __device__ static inline void tcgen05_mma( + cuda::ptx::kind_t kind, + cuda::ptx::cta_group_t cta_group, + uint32_t d_tmem, + uint64_t a_desc, + uint64_t b_desc, + uint32_t idesc, + bool enable_input_d); + +tcgen05.mma.cta_group::1.kind::tf32 +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +.. code:: cuda + + // tcgen05.mma.cta_group.kind [d_tmem], a_desc, b_desc, idesc, enable_input_d; // PTX ISA 86, SM_100a, SM_101a + // .kind = { .kind::f16, .kind::tf32, .kind::f8f6f4, .kind::i8 } + // .cta_group = { .cta_group::1, .cta_group::2 } + template + __device__ static inline void tcgen05_mma( + cuda::ptx::kind_t kind, + cuda::ptx::cta_group_t cta_group, + uint32_t d_tmem, + uint64_t a_desc, + uint64_t b_desc, + uint32_t idesc, + bool enable_input_d); + +tcgen05.mma.cta_group::2.kind::tf32 +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +.. code:: cuda + + // tcgen05.mma.cta_group.kind [d_tmem], a_desc, b_desc, idesc, enable_input_d; // PTX ISA 86, SM_100a, SM_101a + // .kind = { .kind::f16, .kind::tf32, .kind::f8f6f4, .kind::i8 } + // .cta_group = { .cta_group::1, .cta_group::2 } + template + __device__ static inline void tcgen05_mma( + cuda::ptx::kind_t kind, + cuda::ptx::cta_group_t cta_group, + uint32_t d_tmem, + uint64_t a_desc, + uint64_t b_desc, + uint32_t idesc, + bool enable_input_d); + +tcgen05.mma.cta_group::1.kind::f8f6f4 +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +.. 
code:: cuda + + // tcgen05.mma.cta_group.kind [d_tmem], a_desc, b_desc, idesc, enable_input_d; // PTX ISA 86, SM_100a, SM_101a + // .kind = { .kind::f16, .kind::tf32, .kind::f8f6f4, .kind::i8 } + // .cta_group = { .cta_group::1, .cta_group::2 } + template + __device__ static inline void tcgen05_mma( + cuda::ptx::kind_t kind, + cuda::ptx::cta_group_t cta_group, + uint32_t d_tmem, + uint64_t a_desc, + uint64_t b_desc, + uint32_t idesc, + bool enable_input_d); + +tcgen05.mma.cta_group::2.kind::f8f6f4 +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +.. code:: cuda + + // tcgen05.mma.cta_group.kind [d_tmem], a_desc, b_desc, idesc, enable_input_d; // PTX ISA 86, SM_100a, SM_101a + // .kind = { .kind::f16, .kind::tf32, .kind::f8f6f4, .kind::i8 } + // .cta_group = { .cta_group::1, .cta_group::2 } + template + __device__ static inline void tcgen05_mma( + cuda::ptx::kind_t kind, + cuda::ptx::cta_group_t cta_group, + uint32_t d_tmem, + uint64_t a_desc, + uint64_t b_desc, + uint32_t idesc, + bool enable_input_d); + +tcgen05.mma.cta_group::1.kind::i8 +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +.. code:: cuda + + // tcgen05.mma.cta_group.kind [d_tmem], a_desc, b_desc, idesc, enable_input_d; // PTX ISA 86, SM_100a, SM_101a + // .kind = { .kind::f16, .kind::tf32, .kind::f8f6f4, .kind::i8 } + // .cta_group = { .cta_group::1, .cta_group::2 } + template + __device__ static inline void tcgen05_mma( + cuda::ptx::kind_t kind, + cuda::ptx::cta_group_t cta_group, + uint32_t d_tmem, + uint64_t a_desc, + uint64_t b_desc, + uint32_t idesc, + bool enable_input_d); + +tcgen05.mma.cta_group::2.kind::i8 +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +.. code:: cuda + + // tcgen05.mma.cta_group.kind [d_tmem], a_desc, b_desc, idesc, enable_input_d; // PTX ISA 86, SM_100a, SM_101a + // .kind = { .kind::f16, .kind::tf32, .kind::f8f6f4, .kind::i8 } + // .cta_group = { .cta_group::1, .cta_group::2 } + template + __device__ static inline void tcgen05_mma( + cuda::ptx::kind_t kind, + cuda::ptx::cta_group_t cta_group, + uint32_t d_tmem, + uint64_t a_desc, + uint64_t b_desc, + uint32_t idesc, + bool enable_input_d); + +tcgen05.mma.cta_group::1.kind::f16 +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +.. code:: cuda + + // tcgen05.mma.cta_group.kind [d_tmem], [a_tmem], b_desc, idesc, disable_output_lane, enable_input_d, scale_input_d; // PTX ISA 86, SM_100a + // .kind = { .kind::f16, .kind::tf32 } + // .cta_group = { .cta_group::1, .cta_group::2 } + template + __device__ static inline void tcgen05_mma_tmem_a( + cuda::ptx::kind_t kind, + cuda::ptx::cta_group_t cta_group, + uint32_t d_tmem, + uint32_t a_tmem, + uint64_t b_desc, + uint32_t idesc, + const uint32_t (&disable_output_lane)[4], + bool enable_input_d, + cuda::ptx::n32_t scale_input_d); + +tcgen05.mma.cta_group::2.kind::f16 +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +.. code:: cuda + + // tcgen05.mma.cta_group.kind [d_tmem], [a_tmem], b_desc, idesc, disable_output_lane, enable_input_d, scale_input_d; // PTX ISA 86, SM_100a + // .kind = { .kind::f16, .kind::tf32 } + // .cta_group = { .cta_group::1, .cta_group::2 } + template + __device__ static inline void tcgen05_mma_tmem_a( + cuda::ptx::kind_t kind, + cuda::ptx::cta_group_t cta_group, + uint32_t d_tmem, + uint32_t a_tmem, + uint64_t b_desc, + uint32_t idesc, + const uint32_t (&disable_output_lane)[4], + bool enable_input_d, + cuda::ptx::n32_t scale_input_d); + +tcgen05.mma.cta_group::1.kind::tf32 +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +.. 
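+
+The leaner overloads above drop the lane mask and the scale immediate; only ``enable_input_d`` remains to choose between overwriting ``d_tmem`` and accumulating into it (sketch, same assumptions about tag names and operand setup as before):
+
+.. code:: cuda
+
+   #include <cuda/ptx>
+
+   __device__ void issue_int8_mma(uint32_t d_tmem, uint64_t a_desc, uint64_t b_desc, uint32_t idesc)
+   {
+     cuda::ptx::tcgen05_mma(cuda::ptx::kind_i8, cuda::ptx::cta_group_1,
+                            d_tmem, a_desc, b_desc, idesc,
+                            /* enable_input_d */ false); // first MMA of a chain: do not read D
+   }
+
+..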
code:: cuda + + // tcgen05.mma.cta_group.kind [d_tmem], [a_tmem], b_desc, idesc, disable_output_lane, enable_input_d, scale_input_d; // PTX ISA 86, SM_100a + // .kind = { .kind::f16, .kind::tf32 } + // .cta_group = { .cta_group::1, .cta_group::2 } + template + __device__ static inline void tcgen05_mma_tmem_a( + cuda::ptx::kind_t kind, + cuda::ptx::cta_group_t cta_group, + uint32_t d_tmem, + uint32_t a_tmem, + uint64_t b_desc, + uint32_t idesc, + const uint32_t (&disable_output_lane)[4], + bool enable_input_d, + cuda::ptx::n32_t scale_input_d); + +tcgen05.mma.cta_group::2.kind::tf32 +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +.. code:: cuda + + // tcgen05.mma.cta_group.kind [d_tmem], [a_tmem], b_desc, idesc, disable_output_lane, enable_input_d, scale_input_d; // PTX ISA 86, SM_100a + // .kind = { .kind::f16, .kind::tf32 } + // .cta_group = { .cta_group::1, .cta_group::2 } + template + __device__ static inline void tcgen05_mma_tmem_a( + cuda::ptx::kind_t kind, + cuda::ptx::cta_group_t cta_group, + uint32_t d_tmem, + uint32_t a_tmem, + uint64_t b_desc, + uint32_t idesc, + const uint32_t (&disable_output_lane)[4], + bool enable_input_d, + cuda::ptx::n32_t scale_input_d); + +tcgen05.mma.cta_group::1.kind::f16 +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +.. code:: cuda + + // tcgen05.mma.cta_group.kind [d_tmem], [a_tmem], b_desc, idesc, disable_output_lane, enable_input_d; // PTX ISA 86, SM_100a, SM_101a + // .kind = { .kind::f16, .kind::tf32, .kind::f8f6f4, .kind::i8 } + // .cta_group = { .cta_group::1, .cta_group::2 } + template + __device__ static inline void tcgen05_mma_tmem_a( + cuda::ptx::kind_t kind, + cuda::ptx::cta_group_t cta_group, + uint32_t d_tmem, + uint32_t a_tmem, + uint64_t b_desc, + uint32_t idesc, + const uint32_t (&disable_output_lane)[4], + bool enable_input_d); + +tcgen05.mma.cta_group::2.kind::f16 +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +.. code:: cuda + + // tcgen05.mma.cta_group.kind [d_tmem], [a_tmem], b_desc, idesc, disable_output_lane, enable_input_d; // PTX ISA 86, SM_100a, SM_101a + // .kind = { .kind::f16, .kind::tf32, .kind::f8f6f4, .kind::i8 } + // .cta_group = { .cta_group::1, .cta_group::2 } + template + __device__ static inline void tcgen05_mma_tmem_a( + cuda::ptx::kind_t kind, + cuda::ptx::cta_group_t cta_group, + uint32_t d_tmem, + uint32_t a_tmem, + uint64_t b_desc, + uint32_t idesc, + const uint32_t (&disable_output_lane)[4], + bool enable_input_d); + +tcgen05.mma.cta_group::1.kind::tf32 +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +.. code:: cuda + + // tcgen05.mma.cta_group.kind [d_tmem], [a_tmem], b_desc, idesc, disable_output_lane, enable_input_d; // PTX ISA 86, SM_100a, SM_101a + // .kind = { .kind::f16, .kind::tf32, .kind::f8f6f4, .kind::i8 } + // .cta_group = { .cta_group::1, .cta_group::2 } + template + __device__ static inline void tcgen05_mma_tmem_a( + cuda::ptx::kind_t kind, + cuda::ptx::cta_group_t cta_group, + uint32_t d_tmem, + uint32_t a_tmem, + uint64_t b_desc, + uint32_t idesc, + const uint32_t (&disable_output_lane)[4], + bool enable_input_d); + +tcgen05.mma.cta_group::2.kind::tf32 +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +.. 
code:: cuda + + // tcgen05.mma.cta_group.kind [d_tmem], [a_tmem], b_desc, idesc, disable_output_lane, enable_input_d; // PTX ISA 86, SM_100a, SM_101a + // .kind = { .kind::f16, .kind::tf32, .kind::f8f6f4, .kind::i8 } + // .cta_group = { .cta_group::1, .cta_group::2 } + template + __device__ static inline void tcgen05_mma_tmem_a( + cuda::ptx::kind_t kind, + cuda::ptx::cta_group_t cta_group, + uint32_t d_tmem, + uint32_t a_tmem, + uint64_t b_desc, + uint32_t idesc, + const uint32_t (&disable_output_lane)[4], + bool enable_input_d); + +tcgen05.mma.cta_group::1.kind::f8f6f4 +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +.. code:: cuda + + // tcgen05.mma.cta_group.kind [d_tmem], [a_tmem], b_desc, idesc, disable_output_lane, enable_input_d; // PTX ISA 86, SM_100a, SM_101a + // .kind = { .kind::f16, .kind::tf32, .kind::f8f6f4, .kind::i8 } + // .cta_group = { .cta_group::1, .cta_group::2 } + template + __device__ static inline void tcgen05_mma_tmem_a( + cuda::ptx::kind_t kind, + cuda::ptx::cta_group_t cta_group, + uint32_t d_tmem, + uint32_t a_tmem, + uint64_t b_desc, + uint32_t idesc, + const uint32_t (&disable_output_lane)[4], + bool enable_input_d); + +tcgen05.mma.cta_group::2.kind::f8f6f4 +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +.. code:: cuda + + // tcgen05.mma.cta_group.kind [d_tmem], [a_tmem], b_desc, idesc, disable_output_lane, enable_input_d; // PTX ISA 86, SM_100a, SM_101a + // .kind = { .kind::f16, .kind::tf32, .kind::f8f6f4, .kind::i8 } + // .cta_group = { .cta_group::1, .cta_group::2 } + template + __device__ static inline void tcgen05_mma_tmem_a( + cuda::ptx::kind_t kind, + cuda::ptx::cta_group_t cta_group, + uint32_t d_tmem, + uint32_t a_tmem, + uint64_t b_desc, + uint32_t idesc, + const uint32_t (&disable_output_lane)[4], + bool enable_input_d); + +tcgen05.mma.cta_group::1.kind::i8 +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +.. code:: cuda + + // tcgen05.mma.cta_group.kind [d_tmem], [a_tmem], b_desc, idesc, disable_output_lane, enable_input_d; // PTX ISA 86, SM_100a, SM_101a + // .kind = { .kind::f16, .kind::tf32, .kind::f8f6f4, .kind::i8 } + // .cta_group = { .cta_group::1, .cta_group::2 } + template + __device__ static inline void tcgen05_mma_tmem_a( + cuda::ptx::kind_t kind, + cuda::ptx::cta_group_t cta_group, + uint32_t d_tmem, + uint32_t a_tmem, + uint64_t b_desc, + uint32_t idesc, + const uint32_t (&disable_output_lane)[4], + bool enable_input_d); + +tcgen05.mma.cta_group::2.kind::i8 +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +.. code:: cuda + + // tcgen05.mma.cta_group.kind [d_tmem], [a_tmem], b_desc, idesc, disable_output_lane, enable_input_d; // PTX ISA 86, SM_100a, SM_101a + // .kind = { .kind::f16, .kind::tf32, .kind::f8f6f4, .kind::i8 } + // .cta_group = { .cta_group::1, .cta_group::2 } + template + __device__ static inline void tcgen05_mma_tmem_a( + cuda::ptx::kind_t kind, + cuda::ptx::cta_group_t cta_group, + uint32_t d_tmem, + uint32_t a_tmem, + uint64_t b_desc, + uint32_t idesc, + const uint32_t (&disable_output_lane)[4], + bool enable_input_d); + +tcgen05.mma.cta_group::1.kind::f16 +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +.. 
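+
+The ``_tmem_a`` wrappers take the ``A`` operand from tensor memory (a ``uint32_t`` address) instead of a shared-memory descriptor; the remaining operands match the descriptor form (sketch, tag value names assumed):
+
+.. code:: cuda
+
+   #include <cuda/ptx>
+
+   __device__ void issue_mma_a_from_tmem(uint32_t d_tmem, uint32_t a_tmem, uint64_t b_desc, uint32_t idesc)
+   {
+     const uint32_t disable_output_lane[4] = {0, 0, 0, 0};
+     cuda::ptx::tcgen05_mma_tmem_a(cuda::ptx::kind_f16, cuda::ptx::cta_group_1,
+                                   d_tmem, a_tmem, b_desc, idesc,
+                                   disable_output_lane,
+                                   /* enable_input_d */ true);
+   }
+
+..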
code:: cuda + + // tcgen05.mma.cta_group.kind [d_tmem], [a_tmem], b_desc, idesc, enable_input_d, scale_input_d; // PTX ISA 86, SM_100a + // .kind = { .kind::f16, .kind::tf32 } + // .cta_group = { .cta_group::1, .cta_group::2 } + template + __device__ static inline void tcgen05_mma_tmem_a( + cuda::ptx::kind_t kind, + cuda::ptx::cta_group_t cta_group, + uint32_t d_tmem, + uint32_t a_tmem, + uint64_t b_desc, + uint32_t idesc, + bool enable_input_d, + cuda::ptx::n32_t scale_input_d); + +tcgen05.mma.cta_group::2.kind::f16 +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +.. code:: cuda + + // tcgen05.mma.cta_group.kind [d_tmem], [a_tmem], b_desc, idesc, enable_input_d, scale_input_d; // PTX ISA 86, SM_100a + // .kind = { .kind::f16, .kind::tf32 } + // .cta_group = { .cta_group::1, .cta_group::2 } + template + __device__ static inline void tcgen05_mma_tmem_a( + cuda::ptx::kind_t kind, + cuda::ptx::cta_group_t cta_group, + uint32_t d_tmem, + uint32_t a_tmem, + uint64_t b_desc, + uint32_t idesc, + bool enable_input_d, + cuda::ptx::n32_t scale_input_d); + +tcgen05.mma.cta_group::1.kind::tf32 +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +.. code:: cuda + + // tcgen05.mma.cta_group.kind [d_tmem], [a_tmem], b_desc, idesc, enable_input_d, scale_input_d; // PTX ISA 86, SM_100a + // .kind = { .kind::f16, .kind::tf32 } + // .cta_group = { .cta_group::1, .cta_group::2 } + template + __device__ static inline void tcgen05_mma_tmem_a( + cuda::ptx::kind_t kind, + cuda::ptx::cta_group_t cta_group, + uint32_t d_tmem, + uint32_t a_tmem, + uint64_t b_desc, + uint32_t idesc, + bool enable_input_d, + cuda::ptx::n32_t scale_input_d); + +tcgen05.mma.cta_group::2.kind::tf32 +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +.. code:: cuda + + // tcgen05.mma.cta_group.kind [d_tmem], [a_tmem], b_desc, idesc, enable_input_d, scale_input_d; // PTX ISA 86, SM_100a + // .kind = { .kind::f16, .kind::tf32 } + // .cta_group = { .cta_group::1, .cta_group::2 } + template + __device__ static inline void tcgen05_mma_tmem_a( + cuda::ptx::kind_t kind, + cuda::ptx::cta_group_t cta_group, + uint32_t d_tmem, + uint32_t a_tmem, + uint64_t b_desc, + uint32_t idesc, + bool enable_input_d, + cuda::ptx::n32_t scale_input_d); + +tcgen05.mma.cta_group::1.kind::f16 +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +.. code:: cuda + + // tcgen05.mma.cta_group.kind [d_tmem], [a_tmem], b_desc, idesc, enable_input_d; // PTX ISA 86, SM_100a, SM_101a + // .kind = { .kind::f16, .kind::tf32, .kind::f8f6f4, .kind::i8 } + // .cta_group = { .cta_group::1, .cta_group::2 } + template + __device__ static inline void tcgen05_mma_tmem_a( + cuda::ptx::kind_t kind, + cuda::ptx::cta_group_t cta_group, + uint32_t d_tmem, + uint32_t a_tmem, + uint64_t b_desc, + uint32_t idesc, + bool enable_input_d); + +tcgen05.mma.cta_group::2.kind::f16 +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +.. code:: cuda + + // tcgen05.mma.cta_group.kind [d_tmem], [a_tmem], b_desc, idesc, enable_input_d; // PTX ISA 86, SM_100a, SM_101a + // .kind = { .kind::f16, .kind::tf32, .kind::f8f6f4, .kind::i8 } + // .cta_group = { .cta_group::1, .cta_group::2 } + template + __device__ static inline void tcgen05_mma_tmem_a( + cuda::ptx::kind_t kind, + cuda::ptx::cta_group_t cta_group, + uint32_t d_tmem, + uint32_t a_tmem, + uint64_t b_desc, + uint32_t idesc, + bool enable_input_d); + +tcgen05.mma.cta_group::1.kind::tf32 +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +.. 
code:: cuda + + // tcgen05.mma.cta_group.kind [d_tmem], [a_tmem], b_desc, idesc, enable_input_d; // PTX ISA 86, SM_100a, SM_101a + // .kind = { .kind::f16, .kind::tf32, .kind::f8f6f4, .kind::i8 } + // .cta_group = { .cta_group::1, .cta_group::2 } + template + __device__ static inline void tcgen05_mma_tmem_a( + cuda::ptx::kind_t kind, + cuda::ptx::cta_group_t cta_group, + uint32_t d_tmem, + uint32_t a_tmem, + uint64_t b_desc, + uint32_t idesc, + bool enable_input_d); + +tcgen05.mma.cta_group::2.kind::tf32 +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +.. code:: cuda + + // tcgen05.mma.cta_group.kind [d_tmem], [a_tmem], b_desc, idesc, enable_input_d; // PTX ISA 86, SM_100a, SM_101a + // .kind = { .kind::f16, .kind::tf32, .kind::f8f6f4, .kind::i8 } + // .cta_group = { .cta_group::1, .cta_group::2 } + template + __device__ static inline void tcgen05_mma_tmem_a( + cuda::ptx::kind_t kind, + cuda::ptx::cta_group_t cta_group, + uint32_t d_tmem, + uint32_t a_tmem, + uint64_t b_desc, + uint32_t idesc, + bool enable_input_d); + +tcgen05.mma.cta_group::1.kind::f8f6f4 +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +.. code:: cuda + + // tcgen05.mma.cta_group.kind [d_tmem], [a_tmem], b_desc, idesc, enable_input_d; // PTX ISA 86, SM_100a, SM_101a + // .kind = { .kind::f16, .kind::tf32, .kind::f8f6f4, .kind::i8 } + // .cta_group = { .cta_group::1, .cta_group::2 } + template + __device__ static inline void tcgen05_mma_tmem_a( + cuda::ptx::kind_t kind, + cuda::ptx::cta_group_t cta_group, + uint32_t d_tmem, + uint32_t a_tmem, + uint64_t b_desc, + uint32_t idesc, + bool enable_input_d); + +tcgen05.mma.cta_group::2.kind::f8f6f4 +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +.. code:: cuda + + // tcgen05.mma.cta_group.kind [d_tmem], [a_tmem], b_desc, idesc, enable_input_d; // PTX ISA 86, SM_100a, SM_101a + // .kind = { .kind::f16, .kind::tf32, .kind::f8f6f4, .kind::i8 } + // .cta_group = { .cta_group::1, .cta_group::2 } + template + __device__ static inline void tcgen05_mma_tmem_a( + cuda::ptx::kind_t kind, + cuda::ptx::cta_group_t cta_group, + uint32_t d_tmem, + uint32_t a_tmem, + uint64_t b_desc, + uint32_t idesc, + bool enable_input_d); + +tcgen05.mma.cta_group::1.kind::i8 +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +.. code:: cuda + + // tcgen05.mma.cta_group.kind [d_tmem], [a_tmem], b_desc, idesc, enable_input_d; // PTX ISA 86, SM_100a, SM_101a + // .kind = { .kind::f16, .kind::tf32, .kind::f8f6f4, .kind::i8 } + // .cta_group = { .cta_group::1, .cta_group::2 } + template + __device__ static inline void tcgen05_mma_tmem_a( + cuda::ptx::kind_t kind, + cuda::ptx::cta_group_t cta_group, + uint32_t d_tmem, + uint32_t a_tmem, + uint64_t b_desc, + uint32_t idesc, + bool enable_input_d); + +tcgen05.mma.cta_group::2.kind::i8 +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +.. code:: cuda + + // tcgen05.mma.cta_group.kind [d_tmem], [a_tmem], b_desc, idesc, enable_input_d; // PTX ISA 86, SM_100a, SM_101a + // .kind = { .kind::f16, .kind::tf32, .kind::f8f6f4, .kind::i8 } + // .cta_group = { .cta_group::1, .cta_group::2 } + template + __device__ static inline void tcgen05_mma_tmem_a( + cuda::ptx::kind_t kind, + cuda::ptx::cta_group_t cta_group, + uint32_t d_tmem, + uint32_t a_tmem, + uint64_t b_desc, + uint32_t idesc, + bool enable_input_d); + +tcgen05.mma.cta_group::1.kind::mxf8f6f4.block_scale.scale_vec::1X +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +.. 
code:: cuda + + // tcgen05.mma.cta_group.kind.block_scale.scale_vec::1X [d_tmem], a_desc, b_desc, idesc, [scale_A_tmem], [scale_B_tmem], enable_input_d; // PTX ISA 86, SM_100a, SM_101a + // .kind = { .kind::mxf8f6f4 } + // .cta_group = { .cta_group::1, .cta_group::2 } + template + __device__ static inline void tcgen05_mma_block_scale_vec_1x( + cuda::ptx::kind_mxf8f6f4_t, + cuda::ptx::cta_group_t cta_group, + uint32_t d_tmem, + uint64_t a_desc, + uint64_t b_desc, + uint32_t idesc, + uint32_t scale_A_tmem, + uint32_t scale_B_tmem, + bool enable_input_d); + +tcgen05.mma.cta_group::2.kind::mxf8f6f4.block_scale.scale_vec::1X +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +.. code:: cuda + + // tcgen05.mma.cta_group.kind.block_scale.scale_vec::1X [d_tmem], a_desc, b_desc, idesc, [scale_A_tmem], [scale_B_tmem], enable_input_d; // PTX ISA 86, SM_100a, SM_101a + // .kind = { .kind::mxf8f6f4 } + // .cta_group = { .cta_group::1, .cta_group::2 } + template + __device__ static inline void tcgen05_mma_block_scale_vec_1x( + cuda::ptx::kind_mxf8f6f4_t, + cuda::ptx::cta_group_t cta_group, + uint32_t d_tmem, + uint64_t a_desc, + uint64_t b_desc, + uint32_t idesc, + uint32_t scale_A_tmem, + uint32_t scale_B_tmem, + bool enable_input_d); + +tcgen05.mma.cta_group::1.kind::mxf4.block_scale.scale_vec::2X +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +.. code:: cuda + + // tcgen05.mma.cta_group.kind.block_scale.scale_vec::2X [d_tmem], a_desc, b_desc, idesc, [scale_A_tmem], [scale_B_tmem], enable_input_d; // PTX ISA 86, SM_100a, SM_101a + // .kind = { .kind::mxf4, .kind::mxf4nvf4 } + // .cta_group = { .cta_group::1, .cta_group::2 } + template + __device__ static inline void tcgen05_mma_block_scale_vec_2x( + cuda::ptx::kind_t kind, + cuda::ptx::cta_group_t cta_group, + uint32_t d_tmem, + uint64_t a_desc, + uint64_t b_desc, + uint32_t idesc, + uint32_t scale_A_tmem, + uint32_t scale_B_tmem, + bool enable_input_d); + +tcgen05.mma.cta_group::2.kind::mxf4.block_scale.scale_vec::2X +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +.. code:: cuda + + // tcgen05.mma.cta_group.kind.block_scale.scale_vec::2X [d_tmem], a_desc, b_desc, idesc, [scale_A_tmem], [scale_B_tmem], enable_input_d; // PTX ISA 86, SM_100a, SM_101a + // .kind = { .kind::mxf4, .kind::mxf4nvf4 } + // .cta_group = { .cta_group::1, .cta_group::2 } + template + __device__ static inline void tcgen05_mma_block_scale_vec_2x( + cuda::ptx::kind_t kind, + cuda::ptx::cta_group_t cta_group, + uint32_t d_tmem, + uint64_t a_desc, + uint64_t b_desc, + uint32_t idesc, + uint32_t scale_A_tmem, + uint32_t scale_B_tmem, + bool enable_input_d); + +tcgen05.mma.cta_group::1.kind::mxf4nvf4.block_scale.scale_vec::2X +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +.. code:: cuda + + // tcgen05.mma.cta_group.kind.block_scale.scale_vec::2X [d_tmem], a_desc, b_desc, idesc, [scale_A_tmem], [scale_B_tmem], enable_input_d; // PTX ISA 86, SM_100a, SM_101a + // .kind = { .kind::mxf4, .kind::mxf4nvf4 } + // .cta_group = { .cta_group::1, .cta_group::2 } + template + __device__ static inline void tcgen05_mma_block_scale_vec_2x( + cuda::ptx::kind_t kind, + cuda::ptx::cta_group_t cta_group, + uint32_t d_tmem, + uint64_t a_desc, + uint64_t b_desc, + uint32_t idesc, + uint32_t scale_A_tmem, + uint32_t scale_B_tmem, + bool enable_input_d); + +tcgen05.mma.cta_group::2.kind::mxf4nvf4.block_scale.scale_vec::2X +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +.. 
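+
+The block-scaled forms add tensor-memory addresses for the ``A`` and ``B`` scale factors, which must be laid out for the chosen ``scale_vec`` shape. ``cuda::ptx::kind_mxf8f6f4`` is assumed to be the tag value matching the ``cuda::ptx::kind_mxf8f6f4_t`` parameter above; everything else is as in the earlier sketches:
+
+.. code:: cuda
+
+   #include <cuda/ptx>
+
+   __device__ void issue_block_scaled_mma(uint32_t d_tmem, uint64_t a_desc, uint64_t b_desc,
+                                          uint32_t idesc, uint32_t scale_A_tmem, uint32_t scale_B_tmem)
+   {
+     cuda::ptx::tcgen05_mma_block_scale_vec_1x(cuda::ptx::kind_mxf8f6f4, cuda::ptx::cta_group_1,
+                                               d_tmem, a_desc, b_desc, idesc,
+                                               scale_A_tmem, scale_B_tmem,
+                                               /* enable_input_d */ true);
+   }
+
+..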
code:: cuda + + // tcgen05.mma.cta_group.kind.block_scale.scale_vec::2X [d_tmem], a_desc, b_desc, idesc, [scale_A_tmem], [scale_B_tmem], enable_input_d; // PTX ISA 86, SM_100a, SM_101a + // .kind = { .kind::mxf4, .kind::mxf4nvf4 } + // .cta_group = { .cta_group::1, .cta_group::2 } + template + __device__ static inline void tcgen05_mma_block_scale_vec_2x( + cuda::ptx::kind_t kind, + cuda::ptx::cta_group_t cta_group, + uint32_t d_tmem, + uint64_t a_desc, + uint64_t b_desc, + uint32_t idesc, + uint32_t scale_A_tmem, + uint32_t scale_B_tmem, + bool enable_input_d); + +tcgen05.mma.cta_group::1.kind::mxf4nvf4.block_scale.scale_vec::4X +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +.. code:: cuda + + // tcgen05.mma.cta_group.kind.block_scale.scale_vec::4X [d_tmem], a_desc, b_desc, idesc, [scale_A_tmem], [scale_B_tmem], enable_input_d; // PTX ISA 86, SM_100a, SM_101a + // .kind = { .kind::mxf4nvf4 } + // .cta_group = { .cta_group::1, .cta_group::2 } + template + __device__ static inline void tcgen05_mma_block_scale_vec_4x( + cuda::ptx::kind_mxf4nvf4_t, + cuda::ptx::cta_group_t cta_group, + uint32_t d_tmem, + uint64_t a_desc, + uint64_t b_desc, + uint32_t idesc, + uint32_t scale_A_tmem, + uint32_t scale_B_tmem, + bool enable_input_d); + +tcgen05.mma.cta_group::2.kind::mxf4nvf4.block_scale.scale_vec::4X +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +.. code:: cuda + + // tcgen05.mma.cta_group.kind.block_scale.scale_vec::4X [d_tmem], a_desc, b_desc, idesc, [scale_A_tmem], [scale_B_tmem], enable_input_d; // PTX ISA 86, SM_100a, SM_101a + // .kind = { .kind::mxf4nvf4 } + // .cta_group = { .cta_group::1, .cta_group::2 } + template + __device__ static inline void tcgen05_mma_block_scale_vec_4x( + cuda::ptx::kind_mxf4nvf4_t, + cuda::ptx::cta_group_t cta_group, + uint32_t d_tmem, + uint64_t a_desc, + uint64_t b_desc, + uint32_t idesc, + uint32_t scale_A_tmem, + uint32_t scale_B_tmem, + bool enable_input_d); + +tcgen05.mma.cta_group::1.kind::mxf8f6f4.block_scale.scale_vec::1X +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +.. code:: cuda + + // tcgen05.mma.cta_group.kind.block_scale.scale_vec::1X [d_tmem], a_desc, b_desc, idesc, [scale_A_tmem], [scale_B_tmem], enable_input_d; // PTX ISA 86, SM_100a, SM_101a + // .kind = { .kind::mxf8f6f4 } + // .cta_group = { .cta_group::1, .cta_group::2 } + template + __device__ static inline void tcgen05_mma_block_scale_vec_1x_tmem_a( + cuda::ptx::kind_mxf8f6f4_t, + cuda::ptx::cta_group_t cta_group, + uint32_t d_tmem, + uint64_t a_desc, + uint64_t b_desc, + uint32_t idesc, + uint32_t scale_A_tmem, + uint32_t scale_B_tmem, + bool enable_input_d); + +tcgen05.mma.cta_group::2.kind::mxf8f6f4.block_scale.scale_vec::1X +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +.. code:: cuda + + // tcgen05.mma.cta_group.kind.block_scale.scale_vec::1X [d_tmem], a_desc, b_desc, idesc, [scale_A_tmem], [scale_B_tmem], enable_input_d; // PTX ISA 86, SM_100a, SM_101a + // .kind = { .kind::mxf8f6f4 } + // .cta_group = { .cta_group::1, .cta_group::2 } + template + __device__ static inline void tcgen05_mma_block_scale_vec_1x_tmem_a( + cuda::ptx::kind_mxf8f6f4_t, + cuda::ptx::cta_group_t cta_group, + uint32_t d_tmem, + uint64_t a_desc, + uint64_t b_desc, + uint32_t idesc, + uint32_t scale_A_tmem, + uint32_t scale_B_tmem, + bool enable_input_d); + +tcgen05.mma.cta_group::1.kind::mxf4.block_scale.scale_vec::2X +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +.. 
code:: cuda + + // tcgen05.mma.cta_group.kind.block_scale.scale_vec::2X [d_tmem], a_desc, b_desc, idesc, [scale_A_tmem], [scale_B_tmem], enable_input_d; // PTX ISA 86, SM_100a, SM_101a + // .kind = { .kind::mxf4, .kind::mxf4nvf4 } + // .cta_group = { .cta_group::1, .cta_group::2 } + template + __device__ static inline void tcgen05_mma_block_scale_vec_2_tmem_a( + cuda::ptx::kind_t kind, + cuda::ptx::cta_group_t cta_group, + uint32_t d_tmem, + uint64_t a_desc, + uint64_t b_desc, + uint32_t idesc, + uint32_t scale_A_tmem, + uint32_t scale_B_tmem, + bool enable_input_d); + +tcgen05.mma.cta_group::2.kind::mxf4.block_scale.scale_vec::2X +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +.. code:: cuda + + // tcgen05.mma.cta_group.kind.block_scale.scale_vec::2X [d_tmem], a_desc, b_desc, idesc, [scale_A_tmem], [scale_B_tmem], enable_input_d; // PTX ISA 86, SM_100a, SM_101a + // .kind = { .kind::mxf4, .kind::mxf4nvf4 } + // .cta_group = { .cta_group::1, .cta_group::2 } + template + __device__ static inline void tcgen05_mma_block_scale_vec_2_tmem_a( + cuda::ptx::kind_t kind, + cuda::ptx::cta_group_t cta_group, + uint32_t d_tmem, + uint64_t a_desc, + uint64_t b_desc, + uint32_t idesc, + uint32_t scale_A_tmem, + uint32_t scale_B_tmem, + bool enable_input_d); + +tcgen05.mma.cta_group::1.kind::mxf4nvf4.block_scale.scale_vec::2X +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +.. code:: cuda + + // tcgen05.mma.cta_group.kind.block_scale.scale_vec::2X [d_tmem], a_desc, b_desc, idesc, [scale_A_tmem], [scale_B_tmem], enable_input_d; // PTX ISA 86, SM_100a, SM_101a + // .kind = { .kind::mxf4, .kind::mxf4nvf4 } + // .cta_group = { .cta_group::1, .cta_group::2 } + template + __device__ static inline void tcgen05_mma_block_scale_vec_2_tmem_a( + cuda::ptx::kind_t kind, + cuda::ptx::cta_group_t cta_group, + uint32_t d_tmem, + uint64_t a_desc, + uint64_t b_desc, + uint32_t idesc, + uint32_t scale_A_tmem, + uint32_t scale_B_tmem, + bool enable_input_d); + +tcgen05.mma.cta_group::2.kind::mxf4nvf4.block_scale.scale_vec::2X +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +.. code:: cuda + + // tcgen05.mma.cta_group.kind.block_scale.scale_vec::2X [d_tmem], a_desc, b_desc, idesc, [scale_A_tmem], [scale_B_tmem], enable_input_d; // PTX ISA 86, SM_100a, SM_101a + // .kind = { .kind::mxf4, .kind::mxf4nvf4 } + // .cta_group = { .cta_group::1, .cta_group::2 } + template + __device__ static inline void tcgen05_mma_block_scale_vec_2_tmem_a( + cuda::ptx::kind_t kind, + cuda::ptx::cta_group_t cta_group, + uint32_t d_tmem, + uint64_t a_desc, + uint64_t b_desc, + uint32_t idesc, + uint32_t scale_A_tmem, + uint32_t scale_B_tmem, + bool enable_input_d); + +tcgen05.mma.cta_group::1.kind::mxf4nvf4.block_scale.scale_vec::4X +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +.. code:: cuda + + // tcgen05.mma.cta_group.kind.block_scale.scale_vec::4X [d_tmem], a_desc, b_desc, idesc, [scale_A_tmem], [scale_B_tmem], enable_input_d; // PTX ISA 86, SM_100a, SM_101a + // .kind = { .kind::mxf4nvf4 } + // .cta_group = { .cta_group::1, .cta_group::2 } + template + __device__ static inline void tcgen05_mma_block_scale_vec_4x_tmem_a( + cuda::ptx::kind_mxf4nvf4_t, + cuda::ptx::cta_group_t cta_group, + uint32_t d_tmem, + uint64_t a_desc, + uint64_t b_desc, + uint32_t idesc, + uint32_t scale_A_tmem, + uint32_t scale_B_tmem, + bool enable_input_d); + +tcgen05.mma.cta_group::2.kind::mxf4nvf4.block_scale.scale_vec::4X +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +.. 
code:: cuda + + // tcgen05.mma.cta_group.kind.block_scale.scale_vec::4X [d_tmem], a_desc, b_desc, idesc, [scale_A_tmem], [scale_B_tmem], enable_input_d; // PTX ISA 86, SM_100a, SM_101a + // .kind = { .kind::mxf4nvf4 } + // .cta_group = { .cta_group::1, .cta_group::2 } + template + __device__ static inline void tcgen05_mma_block_scale_vec_4x_tmem_a( + cuda::ptx::kind_mxf4nvf4_t, + cuda::ptx::cta_group_t cta_group, + uint32_t d_tmem, + uint64_t a_desc, + uint64_t b_desc, + uint32_t idesc, + uint32_t scale_A_tmem, + uint32_t scale_B_tmem, + bool enable_input_d); + +tcgen05.mma.cta_group::1.kind::mxf8f6f4.block_scale.scale_vec::1X.collector::a::fill +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +.. code:: cuda + + // tcgen05.mma.cta_group.kind.block_scale.scale_vec::1X.collector::a::fill [d_tmem], a_desc, b_desc, idesc, [scale_A_tmem], [scale_B_tmem], enable_input_d; // PTX ISA 86, SM_100a, SM_101a + // .kind = { .kind::mxf8f6f4 } + // .cta_group = { .cta_group::1, .cta_group::2 } + template + __device__ static inline void tcgen05_mma_block_scale_vec_1x_collector_a_fill( + cuda::ptx::kind_mxf8f6f4_t, + cuda::ptx::cta_group_t cta_group, + uint32_t d_tmem, + uint64_t a_desc, + uint64_t b_desc, + uint32_t idesc, + uint32_t scale_A_tmem, + uint32_t scale_B_tmem, + bool enable_input_d); + +tcgen05.mma.cta_group::2.kind::mxf8f6f4.block_scale.scale_vec::1X.collector::a::fill +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +.. code:: cuda + + // tcgen05.mma.cta_group.kind.block_scale.scale_vec::1X.collector::a::fill [d_tmem], a_desc, b_desc, idesc, [scale_A_tmem], [scale_B_tmem], enable_input_d; // PTX ISA 86, SM_100a, SM_101a + // .kind = { .kind::mxf8f6f4 } + // .cta_group = { .cta_group::1, .cta_group::2 } + template + __device__ static inline void tcgen05_mma_block_scale_vec_1x_collector_a_fill( + cuda::ptx::kind_mxf8f6f4_t, + cuda::ptx::cta_group_t cta_group, + uint32_t d_tmem, + uint64_t a_desc, + uint64_t b_desc, + uint32_t idesc, + uint32_t scale_A_tmem, + uint32_t scale_B_tmem, + bool enable_input_d); + +tcgen05.mma.cta_group::1.kind::mxf4.block_scale.scale_vec::2X.collector::a::fill +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +.. code:: cuda + + // tcgen05.mma.cta_group.kind.block_scale.scale_vec::2X.collector::a::fill [d_tmem], a_desc, b_desc, idesc, [scale_A_tmem], [scale_B_tmem], enable_input_d; // PTX ISA 86, SM_100a, SM_101a + // .kind = { .kind::mxf4, .kind::mxf4nvf4 } + // .cta_group = { .cta_group::1, .cta_group::2 } + template + __device__ static inline void tcgen05_mma_block_scale_vec_2x_collector_a_fill( + cuda::ptx::kind_t kind, + cuda::ptx::cta_group_t cta_group, + uint32_t d_tmem, + uint64_t a_desc, + uint64_t b_desc, + uint32_t idesc, + uint32_t scale_A_tmem, + uint32_t scale_B_tmem, + bool enable_input_d); + +tcgen05.mma.cta_group::2.kind::mxf4.block_scale.scale_vec::2X.collector::a::fill +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +.. 
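(Illustrative usage sketch; not part of the generated listing. The snippet below shows how one of the block-scaled MMA wrappers documented above, tcgen05_mma_block_scale_vec_2x, might be called. The tag-constant spellings cuda::ptx::kind_mxf4 and cuda::ptx::cta_group_1 are assumptions based on the library's usual <modifier>_t type / <modifier> value convention, and every operand value is a placeholder supplied by the surrounding kernel.)

.. code:: cuda

   #include <cuda/ptx>
   #include <cstdint>

   // Sketch only: d_tmem and the scale-factor addresses would come from a prior
   // tcgen05.alloc, a_desc/b_desc are shared-memory matrix descriptors, and idesc
   // is the packed instruction descriptor. None of these values are computed here.
   __device__ void mma_block_scale_sketch(
     uint32_t d_tmem, uint64_t a_desc, uint64_t b_desc, uint32_t idesc,
     uint32_t scale_A_tmem, uint32_t scale_B_tmem)
   {
     cuda::ptx::tcgen05_mma_block_scale_vec_2x(
       cuda::ptx::kind_mxf4,      // assumed tag-constant name for .kind::mxf4
       cuda::ptx::cta_group_1,    // assumed tag-constant name for .cta_group::1
       d_tmem, a_desc, b_desc, idesc,
       scale_A_tmem, scale_B_tmem,
       /* enable_input_d */ true);
   }

..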
code:: cuda + + // tcgen05.mma.cta_group.kind.block_scale.scale_vec::2X.collector::a::fill [d_tmem], a_desc, b_desc, idesc, [scale_A_tmem], [scale_B_tmem], enable_input_d; // PTX ISA 86, SM_100a, SM_101a + // .kind = { .kind::mxf4, .kind::mxf4nvf4 } + // .cta_group = { .cta_group::1, .cta_group::2 } + template + __device__ static inline void tcgen05_mma_block_scale_vec_2x_collector_a_fill( + cuda::ptx::kind_t kind, + cuda::ptx::cta_group_t cta_group, + uint32_t d_tmem, + uint64_t a_desc, + uint64_t b_desc, + uint32_t idesc, + uint32_t scale_A_tmem, + uint32_t scale_B_tmem, + bool enable_input_d); + +tcgen05.mma.cta_group::1.kind::mxf4nvf4.block_scale.scale_vec::2X.collector::a::fill +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +.. code:: cuda + + // tcgen05.mma.cta_group.kind.block_scale.scale_vec::2X.collector::a::fill [d_tmem], a_desc, b_desc, idesc, [scale_A_tmem], [scale_B_tmem], enable_input_d; // PTX ISA 86, SM_100a, SM_101a + // .kind = { .kind::mxf4, .kind::mxf4nvf4 } + // .cta_group = { .cta_group::1, .cta_group::2 } + template + __device__ static inline void tcgen05_mma_block_scale_vec_2x_collector_a_fill( + cuda::ptx::kind_t kind, + cuda::ptx::cta_group_t cta_group, + uint32_t d_tmem, + uint64_t a_desc, + uint64_t b_desc, + uint32_t idesc, + uint32_t scale_A_tmem, + uint32_t scale_B_tmem, + bool enable_input_d); + +tcgen05.mma.cta_group::2.kind::mxf4nvf4.block_scale.scale_vec::2X.collector::a::fill +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +.. code:: cuda + + // tcgen05.mma.cta_group.kind.block_scale.scale_vec::2X.collector::a::fill [d_tmem], a_desc, b_desc, idesc, [scale_A_tmem], [scale_B_tmem], enable_input_d; // PTX ISA 86, SM_100a, SM_101a + // .kind = { .kind::mxf4, .kind::mxf4nvf4 } + // .cta_group = { .cta_group::1, .cta_group::2 } + template + __device__ static inline void tcgen05_mma_block_scale_vec_2x_collector_a_fill( + cuda::ptx::kind_t kind, + cuda::ptx::cta_group_t cta_group, + uint32_t d_tmem, + uint64_t a_desc, + uint64_t b_desc, + uint32_t idesc, + uint32_t scale_A_tmem, + uint32_t scale_B_tmem, + bool enable_input_d); + +tcgen05.mma.cta_group::1.kind::mxf4nvf4.block_scale.scale_vec::4X.collector::a::fill +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +.. code:: cuda + + // tcgen05.mma.cta_group.kind.block_scale.scale_vec::4X.collector::a::fill [d_tmem], a_desc, b_desc, idesc, [scale_A_tmem], [scale_B_tmem], enable_input_d; // PTX ISA 86, SM_100a, SM_101a + // .kind = { .kind::mxf4nvf4 } + // .cta_group = { .cta_group::1, .cta_group::2 } + template + __device__ static inline void tcgen05_mma_block_scale_vec_4x_collector_a_fill( + cuda::ptx::kind_mxf4nvf4_t, + cuda::ptx::cta_group_t cta_group, + uint32_t d_tmem, + uint64_t a_desc, + uint64_t b_desc, + uint32_t idesc, + uint32_t scale_A_tmem, + uint32_t scale_B_tmem, + bool enable_input_d); + +tcgen05.mma.cta_group::2.kind::mxf4nvf4.block_scale.scale_vec::4X.collector::a::fill +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +.. 
code:: cuda + + // tcgen05.mma.cta_group.kind.block_scale.scale_vec::4X.collector::a::fill [d_tmem], a_desc, b_desc, idesc, [scale_A_tmem], [scale_B_tmem], enable_input_d; // PTX ISA 86, SM_100a, SM_101a + // .kind = { .kind::mxf4nvf4 } + // .cta_group = { .cta_group::1, .cta_group::2 } + template + __device__ static inline void tcgen05_mma_block_scale_vec_4x_collector_a_fill( + cuda::ptx::kind_mxf4nvf4_t, + cuda::ptx::cta_group_t cta_group, + uint32_t d_tmem, + uint64_t a_desc, + uint64_t b_desc, + uint32_t idesc, + uint32_t scale_A_tmem, + uint32_t scale_B_tmem, + bool enable_input_d); + +tcgen05.mma.cta_group::1.kind::mxf8f6f4.block_scale.scale_vec::1X.collector::a::fill +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +.. code:: cuda + + // tcgen05.mma.cta_group.kind.block_scale.scale_vec::1X.collector::a::fill [d_tmem], a_desc, b_desc, idesc, [scale_A_tmem], [scale_B_tmem], enable_input_d; // PTX ISA 86, SM_100a, SM_101a + // .kind = { .kind::mxf8f6f4 } + // .cta_group = { .cta_group::1, .cta_group::2 } + template + __device__ static inline void tcgen05_mma_block_scale_vec_1x_tmem_a_collector_a_fill( + cuda::ptx::kind_mxf8f6f4_t, + cuda::ptx::cta_group_t cta_group, + uint32_t d_tmem, + uint64_t a_desc, + uint64_t b_desc, + uint32_t idesc, + uint32_t scale_A_tmem, + uint32_t scale_B_tmem, + bool enable_input_d); + +tcgen05.mma.cta_group::2.kind::mxf8f6f4.block_scale.scale_vec::1X.collector::a::fill +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +.. code:: cuda + + // tcgen05.mma.cta_group.kind.block_scale.scale_vec::1X.collector::a::fill [d_tmem], a_desc, b_desc, idesc, [scale_A_tmem], [scale_B_tmem], enable_input_d; // PTX ISA 86, SM_100a, SM_101a + // .kind = { .kind::mxf8f6f4 } + // .cta_group = { .cta_group::1, .cta_group::2 } + template + __device__ static inline void tcgen05_mma_block_scale_vec_1x_tmem_a_collector_a_fill( + cuda::ptx::kind_mxf8f6f4_t, + cuda::ptx::cta_group_t cta_group, + uint32_t d_tmem, + uint64_t a_desc, + uint64_t b_desc, + uint32_t idesc, + uint32_t scale_A_tmem, + uint32_t scale_B_tmem, + bool enable_input_d); + +tcgen05.mma.cta_group::1.kind::mxf4.block_scale.scale_vec::2X.collector::a::fill +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +.. code:: cuda + + // tcgen05.mma.cta_group.kind.block_scale.scale_vec::2X.collector::a::fill [d_tmem], a_desc, b_desc, idesc, [scale_A_tmem], [scale_B_tmem], enable_input_d; // PTX ISA 86, SM_100a, SM_101a + // .kind = { .kind::mxf4, .kind::mxf4nvf4 } + // .cta_group = { .cta_group::1, .cta_group::2 } + template + __device__ static inline void tcgen05_mma_block_scale_vec_2_tmem_a_collector_a_fill( + cuda::ptx::kind_t kind, + cuda::ptx::cta_group_t cta_group, + uint32_t d_tmem, + uint64_t a_desc, + uint64_t b_desc, + uint32_t idesc, + uint32_t scale_A_tmem, + uint32_t scale_B_tmem, + bool enable_input_d); + +tcgen05.mma.cta_group::2.kind::mxf4.block_scale.scale_vec::2X.collector::a::fill +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +.. 
code:: cuda + + // tcgen05.mma.cta_group.kind.block_scale.scale_vec::2X.collector::a::fill [d_tmem], a_desc, b_desc, idesc, [scale_A_tmem], [scale_B_tmem], enable_input_d; // PTX ISA 86, SM_100a, SM_101a + // .kind = { .kind::mxf4, .kind::mxf4nvf4 } + // .cta_group = { .cta_group::1, .cta_group::2 } + template + __device__ static inline void tcgen05_mma_block_scale_vec_2_tmem_a_collector_a_fill( + cuda::ptx::kind_t kind, + cuda::ptx::cta_group_t cta_group, + uint32_t d_tmem, + uint64_t a_desc, + uint64_t b_desc, + uint32_t idesc, + uint32_t scale_A_tmem, + uint32_t scale_B_tmem, + bool enable_input_d); + +tcgen05.mma.cta_group::1.kind::mxf4nvf4.block_scale.scale_vec::2X.collector::a::fill +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +.. code:: cuda + + // tcgen05.mma.cta_group.kind.block_scale.scale_vec::2X.collector::a::fill [d_tmem], a_desc, b_desc, idesc, [scale_A_tmem], [scale_B_tmem], enable_input_d; // PTX ISA 86, SM_100a, SM_101a + // .kind = { .kind::mxf4, .kind::mxf4nvf4 } + // .cta_group = { .cta_group::1, .cta_group::2 } + template + __device__ static inline void tcgen05_mma_block_scale_vec_2_tmem_a_collector_a_fill( + cuda::ptx::kind_t kind, + cuda::ptx::cta_group_t cta_group, + uint32_t d_tmem, + uint64_t a_desc, + uint64_t b_desc, + uint32_t idesc, + uint32_t scale_A_tmem, + uint32_t scale_B_tmem, + bool enable_input_d); + +tcgen05.mma.cta_group::2.kind::mxf4nvf4.block_scale.scale_vec::2X.collector::a::fill +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +.. code:: cuda + + // tcgen05.mma.cta_group.kind.block_scale.scale_vec::2X.collector::a::fill [d_tmem], a_desc, b_desc, idesc, [scale_A_tmem], [scale_B_tmem], enable_input_d; // PTX ISA 86, SM_100a, SM_101a + // .kind = { .kind::mxf4, .kind::mxf4nvf4 } + // .cta_group = { .cta_group::1, .cta_group::2 } + template + __device__ static inline void tcgen05_mma_block_scale_vec_2_tmem_a_collector_a_fill( + cuda::ptx::kind_t kind, + cuda::ptx::cta_group_t cta_group, + uint32_t d_tmem, + uint64_t a_desc, + uint64_t b_desc, + uint32_t idesc, + uint32_t scale_A_tmem, + uint32_t scale_B_tmem, + bool enable_input_d); + +tcgen05.mma.cta_group::1.kind::mxf4nvf4.block_scale.scale_vec::4X.collector::a::fill +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +.. code:: cuda + + // tcgen05.mma.cta_group.kind.block_scale.scale_vec::4X.collector::a::fill [d_tmem], a_desc, b_desc, idesc, [scale_A_tmem], [scale_B_tmem], enable_input_d; // PTX ISA 86, SM_100a, SM_101a + // .kind = { .kind::mxf4nvf4 } + // .cta_group = { .cta_group::1, .cta_group::2 } + template + __device__ static inline void tcgen05_mma_block_scale_vec_4x_tmem_a_collector_a_fill( + cuda::ptx::kind_mxf4nvf4_t, + cuda::ptx::cta_group_t cta_group, + uint32_t d_tmem, + uint64_t a_desc, + uint64_t b_desc, + uint32_t idesc, + uint32_t scale_A_tmem, + uint32_t scale_B_tmem, + bool enable_input_d); + +tcgen05.mma.cta_group::2.kind::mxf4nvf4.block_scale.scale_vec::4X.collector::a::fill +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +.. 
code:: cuda + + // tcgen05.mma.cta_group.kind.block_scale.scale_vec::4X.collector::a::fill [d_tmem], a_desc, b_desc, idesc, [scale_A_tmem], [scale_B_tmem], enable_input_d; // PTX ISA 86, SM_100a, SM_101a + // .kind = { .kind::mxf4nvf4 } + // .cta_group = { .cta_group::1, .cta_group::2 } + template + __device__ static inline void tcgen05_mma_block_scale_vec_4x_tmem_a_collector_a_fill( + cuda::ptx::kind_mxf4nvf4_t, + cuda::ptx::cta_group_t cta_group, + uint32_t d_tmem, + uint64_t a_desc, + uint64_t b_desc, + uint32_t idesc, + uint32_t scale_A_tmem, + uint32_t scale_B_tmem, + bool enable_input_d); + +tcgen05.mma.cta_group::1.kind::mxf8f6f4.block_scale.scale_vec::1X.collector::a::use +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +.. code:: cuda + + // tcgen05.mma.cta_group.kind.block_scale.scale_vec::1X.collector::a::use [d_tmem], a_desc, b_desc, idesc, [scale_A_tmem], [scale_B_tmem], enable_input_d; // PTX ISA 86, SM_100a, SM_101a + // .kind = { .kind::mxf8f6f4 } + // .cta_group = { .cta_group::1, .cta_group::2 } + template + __device__ static inline void tcgen05_mma_block_scale_vec_1x_collector_a_use( + cuda::ptx::kind_mxf8f6f4_t, + cuda::ptx::cta_group_t cta_group, + uint32_t d_tmem, + uint64_t a_desc, + uint64_t b_desc, + uint32_t idesc, + uint32_t scale_A_tmem, + uint32_t scale_B_tmem, + bool enable_input_d); + +tcgen05.mma.cta_group::2.kind::mxf8f6f4.block_scale.scale_vec::1X.collector::a::use +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +.. code:: cuda + + // tcgen05.mma.cta_group.kind.block_scale.scale_vec::1X.collector::a::use [d_tmem], a_desc, b_desc, idesc, [scale_A_tmem], [scale_B_tmem], enable_input_d; // PTX ISA 86, SM_100a, SM_101a + // .kind = { .kind::mxf8f6f4 } + // .cta_group = { .cta_group::1, .cta_group::2 } + template + __device__ static inline void tcgen05_mma_block_scale_vec_1x_collector_a_use( + cuda::ptx::kind_mxf8f6f4_t, + cuda::ptx::cta_group_t cta_group, + uint32_t d_tmem, + uint64_t a_desc, + uint64_t b_desc, + uint32_t idesc, + uint32_t scale_A_tmem, + uint32_t scale_B_tmem, + bool enable_input_d); + +tcgen05.mma.cta_group::1.kind::mxf4.block_scale.scale_vec::2X.collector::a::use +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +.. code:: cuda + + // tcgen05.mma.cta_group.kind.block_scale.scale_vec::2X.collector::a::use [d_tmem], a_desc, b_desc, idesc, [scale_A_tmem], [scale_B_tmem], enable_input_d; // PTX ISA 86, SM_100a, SM_101a + // .kind = { .kind::mxf4, .kind::mxf4nvf4 } + // .cta_group = { .cta_group::1, .cta_group::2 } + template + __device__ static inline void tcgen05_mma_block_scale_vec_2x_collector_a_use( + cuda::ptx::kind_t kind, + cuda::ptx::cta_group_t cta_group, + uint32_t d_tmem, + uint64_t a_desc, + uint64_t b_desc, + uint32_t idesc, + uint32_t scale_A_tmem, + uint32_t scale_B_tmem, + bool enable_input_d); + +tcgen05.mma.cta_group::2.kind::mxf4.block_scale.scale_vec::2X.collector::a::use +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +.. 
code:: cuda + + // tcgen05.mma.cta_group.kind.block_scale.scale_vec::2X.collector::a::use [d_tmem], a_desc, b_desc, idesc, [scale_A_tmem], [scale_B_tmem], enable_input_d; // PTX ISA 86, SM_100a, SM_101a + // .kind = { .kind::mxf4, .kind::mxf4nvf4 } + // .cta_group = { .cta_group::1, .cta_group::2 } + template + __device__ static inline void tcgen05_mma_block_scale_vec_2x_collector_a_use( + cuda::ptx::kind_t kind, + cuda::ptx::cta_group_t cta_group, + uint32_t d_tmem, + uint64_t a_desc, + uint64_t b_desc, + uint32_t idesc, + uint32_t scale_A_tmem, + uint32_t scale_B_tmem, + bool enable_input_d); + +tcgen05.mma.cta_group::1.kind::mxf4nvf4.block_scale.scale_vec::2X.collector::a::use +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +.. code:: cuda + + // tcgen05.mma.cta_group.kind.block_scale.scale_vec::2X.collector::a::use [d_tmem], a_desc, b_desc, idesc, [scale_A_tmem], [scale_B_tmem], enable_input_d; // PTX ISA 86, SM_100a, SM_101a + // .kind = { .kind::mxf4, .kind::mxf4nvf4 } + // .cta_group = { .cta_group::1, .cta_group::2 } + template + __device__ static inline void tcgen05_mma_block_scale_vec_2x_collector_a_use( + cuda::ptx::kind_t kind, + cuda::ptx::cta_group_t cta_group, + uint32_t d_tmem, + uint64_t a_desc, + uint64_t b_desc, + uint32_t idesc, + uint32_t scale_A_tmem, + uint32_t scale_B_tmem, + bool enable_input_d); + +tcgen05.mma.cta_group::2.kind::mxf4nvf4.block_scale.scale_vec::2X.collector::a::use +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +.. code:: cuda + + // tcgen05.mma.cta_group.kind.block_scale.scale_vec::2X.collector::a::use [d_tmem], a_desc, b_desc, idesc, [scale_A_tmem], [scale_B_tmem], enable_input_d; // PTX ISA 86, SM_100a, SM_101a + // .kind = { .kind::mxf4, .kind::mxf4nvf4 } + // .cta_group = { .cta_group::1, .cta_group::2 } + template + __device__ static inline void tcgen05_mma_block_scale_vec_2x_collector_a_use( + cuda::ptx::kind_t kind, + cuda::ptx::cta_group_t cta_group, + uint32_t d_tmem, + uint64_t a_desc, + uint64_t b_desc, + uint32_t idesc, + uint32_t scale_A_tmem, + uint32_t scale_B_tmem, + bool enable_input_d); + +tcgen05.mma.cta_group::1.kind::mxf4nvf4.block_scale.scale_vec::4X.collector::a::use +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +.. code:: cuda + + // tcgen05.mma.cta_group.kind.block_scale.scale_vec::4X.collector::a::use [d_tmem], a_desc, b_desc, idesc, [scale_A_tmem], [scale_B_tmem], enable_input_d; // PTX ISA 86, SM_100a, SM_101a + // .kind = { .kind::mxf4nvf4 } + // .cta_group = { .cta_group::1, .cta_group::2 } + template + __device__ static inline void tcgen05_mma_block_scale_vec_4x_collector_a_use( + cuda::ptx::kind_mxf4nvf4_t, + cuda::ptx::cta_group_t cta_group, + uint32_t d_tmem, + uint64_t a_desc, + uint64_t b_desc, + uint32_t idesc, + uint32_t scale_A_tmem, + uint32_t scale_B_tmem, + bool enable_input_d); + +tcgen05.mma.cta_group::2.kind::mxf4nvf4.block_scale.scale_vec::4X.collector::a::use +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +.. 
code:: cuda + + // tcgen05.mma.cta_group.kind.block_scale.scale_vec::4X.collector::a::use [d_tmem], a_desc, b_desc, idesc, [scale_A_tmem], [scale_B_tmem], enable_input_d; // PTX ISA 86, SM_100a, SM_101a + // .kind = { .kind::mxf4nvf4 } + // .cta_group = { .cta_group::1, .cta_group::2 } + template + __device__ static inline void tcgen05_mma_block_scale_vec_4x_collector_a_use( + cuda::ptx::kind_mxf4nvf4_t, + cuda::ptx::cta_group_t cta_group, + uint32_t d_tmem, + uint64_t a_desc, + uint64_t b_desc, + uint32_t idesc, + uint32_t scale_A_tmem, + uint32_t scale_B_tmem, + bool enable_input_d); + +tcgen05.mma.cta_group::1.kind::mxf8f6f4.block_scale.scale_vec::1X.collector::a::use +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +.. code:: cuda + + // tcgen05.mma.cta_group.kind.block_scale.scale_vec::1X.collector::a::use [d_tmem], a_desc, b_desc, idesc, [scale_A_tmem], [scale_B_tmem], enable_input_d; // PTX ISA 86, SM_100a, SM_101a + // .kind = { .kind::mxf8f6f4 } + // .cta_group = { .cta_group::1, .cta_group::2 } + template + __device__ static inline void tcgen05_mma_block_scale_vec_1x_tmem_a_collector_a_use( + cuda::ptx::kind_mxf8f6f4_t, + cuda::ptx::cta_group_t cta_group, + uint32_t d_tmem, + uint64_t a_desc, + uint64_t b_desc, + uint32_t idesc, + uint32_t scale_A_tmem, + uint32_t scale_B_tmem, + bool enable_input_d); + +tcgen05.mma.cta_group::2.kind::mxf8f6f4.block_scale.scale_vec::1X.collector::a::use +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +.. code:: cuda + + // tcgen05.mma.cta_group.kind.block_scale.scale_vec::1X.collector::a::use [d_tmem], a_desc, b_desc, idesc, [scale_A_tmem], [scale_B_tmem], enable_input_d; // PTX ISA 86, SM_100a, SM_101a + // .kind = { .kind::mxf8f6f4 } + // .cta_group = { .cta_group::1, .cta_group::2 } + template + __device__ static inline void tcgen05_mma_block_scale_vec_1x_tmem_a_collector_a_use( + cuda::ptx::kind_mxf8f6f4_t, + cuda::ptx::cta_group_t cta_group, + uint32_t d_tmem, + uint64_t a_desc, + uint64_t b_desc, + uint32_t idesc, + uint32_t scale_A_tmem, + uint32_t scale_B_tmem, + bool enable_input_d); + +tcgen05.mma.cta_group::1.kind::mxf4.block_scale.scale_vec::2X.collector::a::use +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +.. code:: cuda + + // tcgen05.mma.cta_group.kind.block_scale.scale_vec::2X.collector::a::use [d_tmem], a_desc, b_desc, idesc, [scale_A_tmem], [scale_B_tmem], enable_input_d; // PTX ISA 86, SM_100a, SM_101a + // .kind = { .kind::mxf4, .kind::mxf4nvf4 } + // .cta_group = { .cta_group::1, .cta_group::2 } + template + __device__ static inline void tcgen05_mma_block_scale_vec_2_tmem_a_collector_a_use( + cuda::ptx::kind_t kind, + cuda::ptx::cta_group_t cta_group, + uint32_t d_tmem, + uint64_t a_desc, + uint64_t b_desc, + uint32_t idesc, + uint32_t scale_A_tmem, + uint32_t scale_B_tmem, + bool enable_input_d); + +tcgen05.mma.cta_group::2.kind::mxf4.block_scale.scale_vec::2X.collector::a::use +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +.. 
code:: cuda + + // tcgen05.mma.cta_group.kind.block_scale.scale_vec::2X.collector::a::use [d_tmem], a_desc, b_desc, idesc, [scale_A_tmem], [scale_B_tmem], enable_input_d; // PTX ISA 86, SM_100a, SM_101a + // .kind = { .kind::mxf4, .kind::mxf4nvf4 } + // .cta_group = { .cta_group::1, .cta_group::2 } + template + __device__ static inline void tcgen05_mma_block_scale_vec_2_tmem_a_collector_a_use( + cuda::ptx::kind_t kind, + cuda::ptx::cta_group_t cta_group, + uint32_t d_tmem, + uint64_t a_desc, + uint64_t b_desc, + uint32_t idesc, + uint32_t scale_A_tmem, + uint32_t scale_B_tmem, + bool enable_input_d); + +tcgen05.mma.cta_group::1.kind::mxf4nvf4.block_scale.scale_vec::2X.collector::a::use +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +.. code:: cuda + + // tcgen05.mma.cta_group.kind.block_scale.scale_vec::2X.collector::a::use [d_tmem], a_desc, b_desc, idesc, [scale_A_tmem], [scale_B_tmem], enable_input_d; // PTX ISA 86, SM_100a, SM_101a + // .kind = { .kind::mxf4, .kind::mxf4nvf4 } + // .cta_group = { .cta_group::1, .cta_group::2 } + template + __device__ static inline void tcgen05_mma_block_scale_vec_2_tmem_a_collector_a_use( + cuda::ptx::kind_t kind, + cuda::ptx::cta_group_t cta_group, + uint32_t d_tmem, + uint64_t a_desc, + uint64_t b_desc, + uint32_t idesc, + uint32_t scale_A_tmem, + uint32_t scale_B_tmem, + bool enable_input_d); + +tcgen05.mma.cta_group::2.kind::mxf4nvf4.block_scale.scale_vec::2X.collector::a::use +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +.. code:: cuda + + // tcgen05.mma.cta_group.kind.block_scale.scale_vec::2X.collector::a::use [d_tmem], a_desc, b_desc, idesc, [scale_A_tmem], [scale_B_tmem], enable_input_d; // PTX ISA 86, SM_100a, SM_101a + // .kind = { .kind::mxf4, .kind::mxf4nvf4 } + // .cta_group = { .cta_group::1, .cta_group::2 } + template + __device__ static inline void tcgen05_mma_block_scale_vec_2_tmem_a_collector_a_use( + cuda::ptx::kind_t kind, + cuda::ptx::cta_group_t cta_group, + uint32_t d_tmem, + uint64_t a_desc, + uint64_t b_desc, + uint32_t idesc, + uint32_t scale_A_tmem, + uint32_t scale_B_tmem, + bool enable_input_d); + +tcgen05.mma.cta_group::1.kind::mxf4nvf4.block_scale.scale_vec::4X.collector::a::use +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +.. code:: cuda + + // tcgen05.mma.cta_group.kind.block_scale.scale_vec::4X.collector::a::use [d_tmem], a_desc, b_desc, idesc, [scale_A_tmem], [scale_B_tmem], enable_input_d; // PTX ISA 86, SM_100a, SM_101a + // .kind = { .kind::mxf4nvf4 } + // .cta_group = { .cta_group::1, .cta_group::2 } + template + __device__ static inline void tcgen05_mma_block_scale_vec_4x_tmem_a_collector_a_use( + cuda::ptx::kind_mxf4nvf4_t, + cuda::ptx::cta_group_t cta_group, + uint32_t d_tmem, + uint64_t a_desc, + uint64_t b_desc, + uint32_t idesc, + uint32_t scale_A_tmem, + uint32_t scale_B_tmem, + bool enable_input_d); + +tcgen05.mma.cta_group::2.kind::mxf4nvf4.block_scale.scale_vec::4X.collector::a::use +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +.. 
code:: cuda + + // tcgen05.mma.cta_group.kind.block_scale.scale_vec::4X.collector::a::use [d_tmem], a_desc, b_desc, idesc, [scale_A_tmem], [scale_B_tmem], enable_input_d; // PTX ISA 86, SM_100a, SM_101a + // .kind = { .kind::mxf4nvf4 } + // .cta_group = { .cta_group::1, .cta_group::2 } + template + __device__ static inline void tcgen05_mma_block_scale_vec_4x_tmem_a_collector_a_use( + cuda::ptx::kind_mxf4nvf4_t, + cuda::ptx::cta_group_t cta_group, + uint32_t d_tmem, + uint64_t a_desc, + uint64_t b_desc, + uint32_t idesc, + uint32_t scale_A_tmem, + uint32_t scale_B_tmem, + bool enable_input_d); + +tcgen05.mma.cta_group::1.kind::mxf8f6f4.block_scale.scale_vec::1X.collector::a::lastuse +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +.. code:: cuda + + // tcgen05.mma.cta_group.kind.block_scale.scale_vec::1X.collector::a::lastuse [d_tmem], a_desc, b_desc, idesc, [scale_A_tmem], [scale_B_tmem], enable_input_d; // PTX ISA 86, SM_100a, SM_101a + // .kind = { .kind::mxf8f6f4 } + // .cta_group = { .cta_group::1, .cta_group::2 } + template + __device__ static inline void tcgen05_mma_block_scale_vec_1x_collector_a_lastuse( + cuda::ptx::kind_mxf8f6f4_t, + cuda::ptx::cta_group_t cta_group, + uint32_t d_tmem, + uint64_t a_desc, + uint64_t b_desc, + uint32_t idesc, + uint32_t scale_A_tmem, + uint32_t scale_B_tmem, + bool enable_input_d); + +tcgen05.mma.cta_group::2.kind::mxf8f6f4.block_scale.scale_vec::1X.collector::a::lastuse +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +.. code:: cuda + + // tcgen05.mma.cta_group.kind.block_scale.scale_vec::1X.collector::a::lastuse [d_tmem], a_desc, b_desc, idesc, [scale_A_tmem], [scale_B_tmem], enable_input_d; // PTX ISA 86, SM_100a, SM_101a + // .kind = { .kind::mxf8f6f4 } + // .cta_group = { .cta_group::1, .cta_group::2 } + template + __device__ static inline void tcgen05_mma_block_scale_vec_1x_collector_a_lastuse( + cuda::ptx::kind_mxf8f6f4_t, + cuda::ptx::cta_group_t cta_group, + uint32_t d_tmem, + uint64_t a_desc, + uint64_t b_desc, + uint32_t idesc, + uint32_t scale_A_tmem, + uint32_t scale_B_tmem, + bool enable_input_d); + +tcgen05.mma.cta_group::1.kind::mxf4.block_scale.scale_vec::2X.collector::a::lastuse +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +.. code:: cuda + + // tcgen05.mma.cta_group.kind.block_scale.scale_vec::2X.collector::a::lastuse [d_tmem], a_desc, b_desc, idesc, [scale_A_tmem], [scale_B_tmem], enable_input_d; // PTX ISA 86, SM_100a, SM_101a + // .kind = { .kind::mxf4, .kind::mxf4nvf4 } + // .cta_group = { .cta_group::1, .cta_group::2 } + template + __device__ static inline void tcgen05_mma_block_scale_vec_2x_collector_a_lastuse( + cuda::ptx::kind_t kind, + cuda::ptx::cta_group_t cta_group, + uint32_t d_tmem, + uint64_t a_desc, + uint64_t b_desc, + uint32_t idesc, + uint32_t scale_A_tmem, + uint32_t scale_B_tmem, + bool enable_input_d); + +tcgen05.mma.cta_group::2.kind::mxf4.block_scale.scale_vec::2X.collector::a::lastuse +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +.. 
code:: cuda + + // tcgen05.mma.cta_group.kind.block_scale.scale_vec::2X.collector::a::lastuse [d_tmem], a_desc, b_desc, idesc, [scale_A_tmem], [scale_B_tmem], enable_input_d; // PTX ISA 86, SM_100a, SM_101a + // .kind = { .kind::mxf4, .kind::mxf4nvf4 } + // .cta_group = { .cta_group::1, .cta_group::2 } + template + __device__ static inline void tcgen05_mma_block_scale_vec_2x_collector_a_lastuse( + cuda::ptx::kind_t kind, + cuda::ptx::cta_group_t cta_group, + uint32_t d_tmem, + uint64_t a_desc, + uint64_t b_desc, + uint32_t idesc, + uint32_t scale_A_tmem, + uint32_t scale_B_tmem, + bool enable_input_d); + +tcgen05.mma.cta_group::1.kind::mxf4nvf4.block_scale.scale_vec::2X.collector::a::lastuse +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +.. code:: cuda + + // tcgen05.mma.cta_group.kind.block_scale.scale_vec::2X.collector::a::lastuse [d_tmem], a_desc, b_desc, idesc, [scale_A_tmem], [scale_B_tmem], enable_input_d; // PTX ISA 86, SM_100a, SM_101a + // .kind = { .kind::mxf4, .kind::mxf4nvf4 } + // .cta_group = { .cta_group::1, .cta_group::2 } + template + __device__ static inline void tcgen05_mma_block_scale_vec_2x_collector_a_lastuse( + cuda::ptx::kind_t kind, + cuda::ptx::cta_group_t cta_group, + uint32_t d_tmem, + uint64_t a_desc, + uint64_t b_desc, + uint32_t idesc, + uint32_t scale_A_tmem, + uint32_t scale_B_tmem, + bool enable_input_d); + +tcgen05.mma.cta_group::2.kind::mxf4nvf4.block_scale.scale_vec::2X.collector::a::lastuse +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +.. code:: cuda + + // tcgen05.mma.cta_group.kind.block_scale.scale_vec::2X.collector::a::lastuse [d_tmem], a_desc, b_desc, idesc, [scale_A_tmem], [scale_B_tmem], enable_input_d; // PTX ISA 86, SM_100a, SM_101a + // .kind = { .kind::mxf4, .kind::mxf4nvf4 } + // .cta_group = { .cta_group::1, .cta_group::2 } + template + __device__ static inline void tcgen05_mma_block_scale_vec_2x_collector_a_lastuse( + cuda::ptx::kind_t kind, + cuda::ptx::cta_group_t cta_group, + uint32_t d_tmem, + uint64_t a_desc, + uint64_t b_desc, + uint32_t idesc, + uint32_t scale_A_tmem, + uint32_t scale_B_tmem, + bool enable_input_d); + +tcgen05.mma.cta_group::1.kind::mxf4nvf4.block_scale.scale_vec::4X.collector::a::lastuse +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +.. code:: cuda + + // tcgen05.mma.cta_group.kind.block_scale.scale_vec::4X.collector::a::lastuse [d_tmem], a_desc, b_desc, idesc, [scale_A_tmem], [scale_B_tmem], enable_input_d; // PTX ISA 86, SM_100a, SM_101a + // .kind = { .kind::mxf4nvf4 } + // .cta_group = { .cta_group::1, .cta_group::2 } + template + __device__ static inline void tcgen05_mma_block_scale_vec_4x_collector_a_lastuse( + cuda::ptx::kind_mxf4nvf4_t, + cuda::ptx::cta_group_t cta_group, + uint32_t d_tmem, + uint64_t a_desc, + uint64_t b_desc, + uint32_t idesc, + uint32_t scale_A_tmem, + uint32_t scale_B_tmem, + bool enable_input_d); + +tcgen05.mma.cta_group::2.kind::mxf4nvf4.block_scale.scale_vec::4X.collector::a::lastuse +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +.. 
code:: cuda + + // tcgen05.mma.cta_group.kind.block_scale.scale_vec::4X.collector::a::lastuse [d_tmem], a_desc, b_desc, idesc, [scale_A_tmem], [scale_B_tmem], enable_input_d; // PTX ISA 86, SM_100a, SM_101a + // .kind = { .kind::mxf4nvf4 } + // .cta_group = { .cta_group::1, .cta_group::2 } + template + __device__ static inline void tcgen05_mma_block_scale_vec_4x_collector_a_lastuse( + cuda::ptx::kind_mxf4nvf4_t, + cuda::ptx::cta_group_t cta_group, + uint32_t d_tmem, + uint64_t a_desc, + uint64_t b_desc, + uint32_t idesc, + uint32_t scale_A_tmem, + uint32_t scale_B_tmem, + bool enable_input_d); + +tcgen05.mma.cta_group::1.kind::mxf8f6f4.block_scale.scale_vec::1X.collector::a::lastuse +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +.. code:: cuda + + // tcgen05.mma.cta_group.kind.block_scale.scale_vec::1X.collector::a::lastuse [d_tmem], a_desc, b_desc, idesc, [scale_A_tmem], [scale_B_tmem], enable_input_d; // PTX ISA 86, SM_100a, SM_101a + // .kind = { .kind::mxf8f6f4 } + // .cta_group = { .cta_group::1, .cta_group::2 } + template + __device__ static inline void tcgen05_mma_block_scale_vec_1x_tmem_a_collector_a_lastuse( + cuda::ptx::kind_mxf8f6f4_t, + cuda::ptx::cta_group_t cta_group, + uint32_t d_tmem, + uint64_t a_desc, + uint64_t b_desc, + uint32_t idesc, + uint32_t scale_A_tmem, + uint32_t scale_B_tmem, + bool enable_input_d); + +tcgen05.mma.cta_group::2.kind::mxf8f6f4.block_scale.scale_vec::1X.collector::a::lastuse +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +.. code:: cuda + + // tcgen05.mma.cta_group.kind.block_scale.scale_vec::1X.collector::a::lastuse [d_tmem], a_desc, b_desc, idesc, [scale_A_tmem], [scale_B_tmem], enable_input_d; // PTX ISA 86, SM_100a, SM_101a + // .kind = { .kind::mxf8f6f4 } + // .cta_group = { .cta_group::1, .cta_group::2 } + template + __device__ static inline void tcgen05_mma_block_scale_vec_1x_tmem_a_collector_a_lastuse( + cuda::ptx::kind_mxf8f6f4_t, + cuda::ptx::cta_group_t cta_group, + uint32_t d_tmem, + uint64_t a_desc, + uint64_t b_desc, + uint32_t idesc, + uint32_t scale_A_tmem, + uint32_t scale_B_tmem, + bool enable_input_d); + +tcgen05.mma.cta_group::1.kind::mxf4.block_scale.scale_vec::2X.collector::a::lastuse +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +.. code:: cuda + + // tcgen05.mma.cta_group.kind.block_scale.scale_vec::2X.collector::a::lastuse [d_tmem], a_desc, b_desc, idesc, [scale_A_tmem], [scale_B_tmem], enable_input_d; // PTX ISA 86, SM_100a, SM_101a + // .kind = { .kind::mxf4, .kind::mxf4nvf4 } + // .cta_group = { .cta_group::1, .cta_group::2 } + template + __device__ static inline void tcgen05_mma_block_scale_vec_2_tmem_a_collector_a_lastuse( + cuda::ptx::kind_t kind, + cuda::ptx::cta_group_t cta_group, + uint32_t d_tmem, + uint64_t a_desc, + uint64_t b_desc, + uint32_t idesc, + uint32_t scale_A_tmem, + uint32_t scale_B_tmem, + bool enable_input_d); + +tcgen05.mma.cta_group::2.kind::mxf4.block_scale.scale_vec::2X.collector::a::lastuse +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +.. 
code:: cuda + + // tcgen05.mma.cta_group.kind.block_scale.scale_vec::2X.collector::a::lastuse [d_tmem], a_desc, b_desc, idesc, [scale_A_tmem], [scale_B_tmem], enable_input_d; // PTX ISA 86, SM_100a, SM_101a + // .kind = { .kind::mxf4, .kind::mxf4nvf4 } + // .cta_group = { .cta_group::1, .cta_group::2 } + template + __device__ static inline void tcgen05_mma_block_scale_vec_2_tmem_a_collector_a_lastuse( + cuda::ptx::kind_t kind, + cuda::ptx::cta_group_t cta_group, + uint32_t d_tmem, + uint64_t a_desc, + uint64_t b_desc, + uint32_t idesc, + uint32_t scale_A_tmem, + uint32_t scale_B_tmem, + bool enable_input_d); + +tcgen05.mma.cta_group::1.kind::mxf4nvf4.block_scale.scale_vec::2X.collector::a::lastuse +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +.. code:: cuda + + // tcgen05.mma.cta_group.kind.block_scale.scale_vec::2X.collector::a::lastuse [d_tmem], a_desc, b_desc, idesc, [scale_A_tmem], [scale_B_tmem], enable_input_d; // PTX ISA 86, SM_100a, SM_101a + // .kind = { .kind::mxf4, .kind::mxf4nvf4 } + // .cta_group = { .cta_group::1, .cta_group::2 } + template + __device__ static inline void tcgen05_mma_block_scale_vec_2_tmem_a_collector_a_lastuse( + cuda::ptx::kind_t kind, + cuda::ptx::cta_group_t cta_group, + uint32_t d_tmem, + uint64_t a_desc, + uint64_t b_desc, + uint32_t idesc, + uint32_t scale_A_tmem, + uint32_t scale_B_tmem, + bool enable_input_d); + +tcgen05.mma.cta_group::2.kind::mxf4nvf4.block_scale.scale_vec::2X.collector::a::lastuse +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +.. code:: cuda + + // tcgen05.mma.cta_group.kind.block_scale.scale_vec::2X.collector::a::lastuse [d_tmem], a_desc, b_desc, idesc, [scale_A_tmem], [scale_B_tmem], enable_input_d; // PTX ISA 86, SM_100a, SM_101a + // .kind = { .kind::mxf4, .kind::mxf4nvf4 } + // .cta_group = { .cta_group::1, .cta_group::2 } + template + __device__ static inline void tcgen05_mma_block_scale_vec_2_tmem_a_collector_a_lastuse( + cuda::ptx::kind_t kind, + cuda::ptx::cta_group_t cta_group, + uint32_t d_tmem, + uint64_t a_desc, + uint64_t b_desc, + uint32_t idesc, + uint32_t scale_A_tmem, + uint32_t scale_B_tmem, + bool enable_input_d); + +tcgen05.mma.cta_group::1.kind::mxf4nvf4.block_scale.scale_vec::4X.collector::a::lastuse +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +.. code:: cuda + + // tcgen05.mma.cta_group.kind.block_scale.scale_vec::4X.collector::a::lastuse [d_tmem], a_desc, b_desc, idesc, [scale_A_tmem], [scale_B_tmem], enable_input_d; // PTX ISA 86, SM_100a, SM_101a + // .kind = { .kind::mxf4nvf4 } + // .cta_group = { .cta_group::1, .cta_group::2 } + template + __device__ static inline void tcgen05_mma_block_scale_vec_4x_tmem_a_collector_a_lastuse( + cuda::ptx::kind_mxf4nvf4_t, + cuda::ptx::cta_group_t cta_group, + uint32_t d_tmem, + uint64_t a_desc, + uint64_t b_desc, + uint32_t idesc, + uint32_t scale_A_tmem, + uint32_t scale_B_tmem, + bool enable_input_d); + +tcgen05.mma.cta_group::2.kind::mxf4nvf4.block_scale.scale_vec::4X.collector::a::lastuse +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +.. 
code:: cuda + + // tcgen05.mma.cta_group.kind.block_scale.scale_vec::4X.collector::a::lastuse [d_tmem], a_desc, b_desc, idesc, [scale_A_tmem], [scale_B_tmem], enable_input_d; // PTX ISA 86, SM_100a, SM_101a + // .kind = { .kind::mxf4nvf4 } + // .cta_group = { .cta_group::1, .cta_group::2 } + template + __device__ static inline void tcgen05_mma_block_scale_vec_4x_tmem_a_collector_a_lastuse( + cuda::ptx::kind_mxf4nvf4_t, + cuda::ptx::cta_group_t cta_group, + uint32_t d_tmem, + uint64_t a_desc, + uint64_t b_desc, + uint32_t idesc, + uint32_t scale_A_tmem, + uint32_t scale_B_tmem, + bool enable_input_d); + +tcgen05.mma.cta_group::1.kind::mxf8f6f4.block_scale.scale_vec::1X.collector::a::discard +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +.. code:: cuda + + // tcgen05.mma.cta_group.kind.block_scale.scale_vec::1X.collector::a::discard [d_tmem], a_desc, b_desc, idesc, [scale_A_tmem], [scale_B_tmem], enable_input_d; // PTX ISA 86, SM_100a, SM_101a + // .kind = { .kind::mxf8f6f4 } + // .cta_group = { .cta_group::1, .cta_group::2 } + template + __device__ static inline void tcgen05_mma_block_scale_vec_1x_collector_a_discard( + cuda::ptx::kind_mxf8f6f4_t, + cuda::ptx::cta_group_t cta_group, + uint32_t d_tmem, + uint64_t a_desc, + uint64_t b_desc, + uint32_t idesc, + uint32_t scale_A_tmem, + uint32_t scale_B_tmem, + bool enable_input_d); + +tcgen05.mma.cta_group::2.kind::mxf8f6f4.block_scale.scale_vec::1X.collector::a::discard +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +.. code:: cuda + + // tcgen05.mma.cta_group.kind.block_scale.scale_vec::1X.collector::a::discard [d_tmem], a_desc, b_desc, idesc, [scale_A_tmem], [scale_B_tmem], enable_input_d; // PTX ISA 86, SM_100a, SM_101a + // .kind = { .kind::mxf8f6f4 } + // .cta_group = { .cta_group::1, .cta_group::2 } + template + __device__ static inline void tcgen05_mma_block_scale_vec_1x_collector_a_discard( + cuda::ptx::kind_mxf8f6f4_t, + cuda::ptx::cta_group_t cta_group, + uint32_t d_tmem, + uint64_t a_desc, + uint64_t b_desc, + uint32_t idesc, + uint32_t scale_A_tmem, + uint32_t scale_B_tmem, + bool enable_input_d); + +tcgen05.mma.cta_group::1.kind::mxf4.block_scale.scale_vec::2X.collector::a::discard +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +.. code:: cuda + + // tcgen05.mma.cta_group.kind.block_scale.scale_vec::2X.collector::a::discard [d_tmem], a_desc, b_desc, idesc, [scale_A_tmem], [scale_B_tmem], enable_input_d; // PTX ISA 86, SM_100a, SM_101a + // .kind = { .kind::mxf4, .kind::mxf4nvf4 } + // .cta_group = { .cta_group::1, .cta_group::2 } + template + __device__ static inline void tcgen05_mma_block_scale_vec_2x_collector_a_discard( + cuda::ptx::kind_t kind, + cuda::ptx::cta_group_t cta_group, + uint32_t d_tmem, + uint64_t a_desc, + uint64_t b_desc, + uint32_t idesc, + uint32_t scale_A_tmem, + uint32_t scale_B_tmem, + bool enable_input_d); + +tcgen05.mma.cta_group::2.kind::mxf4.block_scale.scale_vec::2X.collector::a::discard +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +.. 
code:: cuda + + // tcgen05.mma.cta_group.kind.block_scale.scale_vec::2X.collector::a::discard [d_tmem], a_desc, b_desc, idesc, [scale_A_tmem], [scale_B_tmem], enable_input_d; // PTX ISA 86, SM_100a, SM_101a + // .kind = { .kind::mxf4, .kind::mxf4nvf4 } + // .cta_group = { .cta_group::1, .cta_group::2 } + template + __device__ static inline void tcgen05_mma_block_scale_vec_2x_collector_a_discard( + cuda::ptx::kind_t kind, + cuda::ptx::cta_group_t cta_group, + uint32_t d_tmem, + uint64_t a_desc, + uint64_t b_desc, + uint32_t idesc, + uint32_t scale_A_tmem, + uint32_t scale_B_tmem, + bool enable_input_d); + +tcgen05.mma.cta_group::1.kind::mxf4nvf4.block_scale.scale_vec::2X.collector::a::discard +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +.. code:: cuda + + // tcgen05.mma.cta_group.kind.block_scale.scale_vec::2X.collector::a::discard [d_tmem], a_desc, b_desc, idesc, [scale_A_tmem], [scale_B_tmem], enable_input_d; // PTX ISA 86, SM_100a, SM_101a + // .kind = { .kind::mxf4, .kind::mxf4nvf4 } + // .cta_group = { .cta_group::1, .cta_group::2 } + template + __device__ static inline void tcgen05_mma_block_scale_vec_2x_collector_a_discard( + cuda::ptx::kind_t kind, + cuda::ptx::cta_group_t cta_group, + uint32_t d_tmem, + uint64_t a_desc, + uint64_t b_desc, + uint32_t idesc, + uint32_t scale_A_tmem, + uint32_t scale_B_tmem, + bool enable_input_d); + +tcgen05.mma.cta_group::2.kind::mxf4nvf4.block_scale.scale_vec::2X.collector::a::discard +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +.. code:: cuda + + // tcgen05.mma.cta_group.kind.block_scale.scale_vec::2X.collector::a::discard [d_tmem], a_desc, b_desc, idesc, [scale_A_tmem], [scale_B_tmem], enable_input_d; // PTX ISA 86, SM_100a, SM_101a + // .kind = { .kind::mxf4, .kind::mxf4nvf4 } + // .cta_group = { .cta_group::1, .cta_group::2 } + template + __device__ static inline void tcgen05_mma_block_scale_vec_2x_collector_a_discard( + cuda::ptx::kind_t kind, + cuda::ptx::cta_group_t cta_group, + uint32_t d_tmem, + uint64_t a_desc, + uint64_t b_desc, + uint32_t idesc, + uint32_t scale_A_tmem, + uint32_t scale_B_tmem, + bool enable_input_d); + +tcgen05.mma.cta_group::1.kind::mxf4nvf4.block_scale.scale_vec::4X.collector::a::discard +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +.. code:: cuda + + // tcgen05.mma.cta_group.kind.block_scale.scale_vec::4X.collector::a::discard [d_tmem], a_desc, b_desc, idesc, [scale_A_tmem], [scale_B_tmem], enable_input_d; // PTX ISA 86, SM_100a, SM_101a + // .kind = { .kind::mxf4nvf4 } + // .cta_group = { .cta_group::1, .cta_group::2 } + template + __device__ static inline void tcgen05_mma_block_scale_vec_4x_collector_a_discard( + cuda::ptx::kind_mxf4nvf4_t, + cuda::ptx::cta_group_t cta_group, + uint32_t d_tmem, + uint64_t a_desc, + uint64_t b_desc, + uint32_t idesc, + uint32_t scale_A_tmem, + uint32_t scale_B_tmem, + bool enable_input_d); + +tcgen05.mma.cta_group::2.kind::mxf4nvf4.block_scale.scale_vec::4X.collector::a::discard +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +.. 
code:: cuda + + // tcgen05.mma.cta_group.kind.block_scale.scale_vec::4X.collector::a::discard [d_tmem], a_desc, b_desc, idesc, [scale_A_tmem], [scale_B_tmem], enable_input_d; // PTX ISA 86, SM_100a, SM_101a + // .kind = { .kind::mxf4nvf4 } + // .cta_group = { .cta_group::1, .cta_group::2 } + template + __device__ static inline void tcgen05_mma_block_scale_vec_4x_collector_a_discard( + cuda::ptx::kind_mxf4nvf4_t, + cuda::ptx::cta_group_t cta_group, + uint32_t d_tmem, + uint64_t a_desc, + uint64_t b_desc, + uint32_t idesc, + uint32_t scale_A_tmem, + uint32_t scale_B_tmem, + bool enable_input_d); + +tcgen05.mma.cta_group::1.kind::mxf8f6f4.block_scale.scale_vec::1X.collector::a::discard +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +.. code:: cuda + + // tcgen05.mma.cta_group.kind.block_scale.scale_vec::1X.collector::a::discard [d_tmem], a_desc, b_desc, idesc, [scale_A_tmem], [scale_B_tmem], enable_input_d; // PTX ISA 86, SM_100a, SM_101a + // .kind = { .kind::mxf8f6f4 } + // .cta_group = { .cta_group::1, .cta_group::2 } + template + __device__ static inline void tcgen05_mma_block_scale_vec_1x_tmem_a_collector_a_discard( + cuda::ptx::kind_mxf8f6f4_t, + cuda::ptx::cta_group_t cta_group, + uint32_t d_tmem, + uint64_t a_desc, + uint64_t b_desc, + uint32_t idesc, + uint32_t scale_A_tmem, + uint32_t scale_B_tmem, + bool enable_input_d); + +tcgen05.mma.cta_group::2.kind::mxf8f6f4.block_scale.scale_vec::1X.collector::a::discard +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +.. code:: cuda + + // tcgen05.mma.cta_group.kind.block_scale.scale_vec::1X.collector::a::discard [d_tmem], a_desc, b_desc, idesc, [scale_A_tmem], [scale_B_tmem], enable_input_d; // PTX ISA 86, SM_100a, SM_101a + // .kind = { .kind::mxf8f6f4 } + // .cta_group = { .cta_group::1, .cta_group::2 } + template + __device__ static inline void tcgen05_mma_block_scale_vec_1x_tmem_a_collector_a_discard( + cuda::ptx::kind_mxf8f6f4_t, + cuda::ptx::cta_group_t cta_group, + uint32_t d_tmem, + uint64_t a_desc, + uint64_t b_desc, + uint32_t idesc, + uint32_t scale_A_tmem, + uint32_t scale_B_tmem, + bool enable_input_d); + +tcgen05.mma.cta_group::1.kind::mxf4.block_scale.scale_vec::2X.collector::a::discard +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +.. code:: cuda + + // tcgen05.mma.cta_group.kind.block_scale.scale_vec::2X.collector::a::discard [d_tmem], a_desc, b_desc, idesc, [scale_A_tmem], [scale_B_tmem], enable_input_d; // PTX ISA 86, SM_100a, SM_101a + // .kind = { .kind::mxf4, .kind::mxf4nvf4 } + // .cta_group = { .cta_group::1, .cta_group::2 } + template + __device__ static inline void tcgen05_mma_block_scale_vec_2_tmem_a_collector_a_discard( + cuda::ptx::kind_t kind, + cuda::ptx::cta_group_t cta_group, + uint32_t d_tmem, + uint64_t a_desc, + uint64_t b_desc, + uint32_t idesc, + uint32_t scale_A_tmem, + uint32_t scale_B_tmem, + bool enable_input_d); + +tcgen05.mma.cta_group::2.kind::mxf4.block_scale.scale_vec::2X.collector::a::discard +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +.. 
code:: cuda + + // tcgen05.mma.cta_group.kind.block_scale.scale_vec::2X.collector::a::discard [d_tmem], a_desc, b_desc, idesc, [scale_A_tmem], [scale_B_tmem], enable_input_d; // PTX ISA 86, SM_100a, SM_101a + // .kind = { .kind::mxf4, .kind::mxf4nvf4 } + // .cta_group = { .cta_group::1, .cta_group::2 } + template + __device__ static inline void tcgen05_mma_block_scale_vec_2_tmem_a_collector_a_discard( + cuda::ptx::kind_t kind, + cuda::ptx::cta_group_t cta_group, + uint32_t d_tmem, + uint64_t a_desc, + uint64_t b_desc, + uint32_t idesc, + uint32_t scale_A_tmem, + uint32_t scale_B_tmem, + bool enable_input_d); + +tcgen05.mma.cta_group::1.kind::mxf4nvf4.block_scale.scale_vec::2X.collector::a::discard +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +.. code:: cuda + + // tcgen05.mma.cta_group.kind.block_scale.scale_vec::2X.collector::a::discard [d_tmem], a_desc, b_desc, idesc, [scale_A_tmem], [scale_B_tmem], enable_input_d; // PTX ISA 86, SM_100a, SM_101a + // .kind = { .kind::mxf4, .kind::mxf4nvf4 } + // .cta_group = { .cta_group::1, .cta_group::2 } + template + __device__ static inline void tcgen05_mma_block_scale_vec_2_tmem_a_collector_a_discard( + cuda::ptx::kind_t kind, + cuda::ptx::cta_group_t cta_group, + uint32_t d_tmem, + uint64_t a_desc, + uint64_t b_desc, + uint32_t idesc, + uint32_t scale_A_tmem, + uint32_t scale_B_tmem, + bool enable_input_d); + +tcgen05.mma.cta_group::2.kind::mxf4nvf4.block_scale.scale_vec::2X.collector::a::discard +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +.. code:: cuda + + // tcgen05.mma.cta_group.kind.block_scale.scale_vec::2X.collector::a::discard [d_tmem], a_desc, b_desc, idesc, [scale_A_tmem], [scale_B_tmem], enable_input_d; // PTX ISA 86, SM_100a, SM_101a + // .kind = { .kind::mxf4, .kind::mxf4nvf4 } + // .cta_group = { .cta_group::1, .cta_group::2 } + template + __device__ static inline void tcgen05_mma_block_scale_vec_2_tmem_a_collector_a_discard( + cuda::ptx::kind_t kind, + cuda::ptx::cta_group_t cta_group, + uint32_t d_tmem, + uint64_t a_desc, + uint64_t b_desc, + uint32_t idesc, + uint32_t scale_A_tmem, + uint32_t scale_B_tmem, + bool enable_input_d); + +tcgen05.mma.cta_group::1.kind::mxf4nvf4.block_scale.scale_vec::4X.collector::a::discard +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +.. code:: cuda + + // tcgen05.mma.cta_group.kind.block_scale.scale_vec::4X.collector::a::discard [d_tmem], a_desc, b_desc, idesc, [scale_A_tmem], [scale_B_tmem], enable_input_d; // PTX ISA 86, SM_100a, SM_101a + // .kind = { .kind::mxf4nvf4 } + // .cta_group = { .cta_group::1, .cta_group::2 } + template + __device__ static inline void tcgen05_mma_block_scale_vec_4x_tmem_a_collector_a_discard( + cuda::ptx::kind_mxf4nvf4_t, + cuda::ptx::cta_group_t cta_group, + uint32_t d_tmem, + uint64_t a_desc, + uint64_t b_desc, + uint32_t idesc, + uint32_t scale_A_tmem, + uint32_t scale_B_tmem, + bool enable_input_d); + +tcgen05.mma.cta_group::2.kind::mxf4nvf4.block_scale.scale_vec::4X.collector::a::discard +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +.. 
code:: cuda + + // tcgen05.mma.cta_group.kind.block_scale.scale_vec::4X.collector::a::discard [d_tmem], a_desc, b_desc, idesc, [scale_A_tmem], [scale_B_tmem], enable_input_d; // PTX ISA 86, SM_100a, SM_101a + // .kind = { .kind::mxf4nvf4 } + // .cta_group = { .cta_group::1, .cta_group::2 } + template + __device__ static inline void tcgen05_mma_block_scale_vec_4x_tmem_a_collector_a_discard( + cuda::ptx::kind_mxf4nvf4_t, + cuda::ptx::cta_group_t cta_group, + uint32_t d_tmem, + uint64_t a_desc, + uint64_t b_desc, + uint32_t idesc, + uint32_t scale_A_tmem, + uint32_t scale_B_tmem, + bool enable_input_d); diff --git a/docs/libcudacxx/ptx/instructions/generated/tcgen05_mma_ws.rst b/docs/libcudacxx/ptx/instructions/generated/tcgen05_mma_ws.rst new file mode 100644 index 00000000000..cb900a0ec40 --- /dev/null +++ b/docs/libcudacxx/ptx/instructions/generated/tcgen05_mma_ws.rst @@ -0,0 +1,4482 @@ +.. + This file was automatically generated. Do not edit. + +tcgen05.mma.ws.cta_group::1.kind::f16.collector::b0::fill +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +.. code:: cuda + + // tcgen05.mma.ws.cta_group.kind.collector::b0::fill [d_tmem], a_desc, b_desc, idesc, enable_input_d, zero_column_mask_desc; // PTX ISA 86, SM_100a, SM_101a + // .cta_group = { .cta_group::1 } + // .kind = { .kind::f16, .kind::tf32, .kind::f8f6f4, .kind::i8 } + template + __device__ static inline void tcgen05_mma_ws_collector_b0_fill( + cuda::ptx::cta_group_1_t, + cuda::ptx::kind_t kind, + uint32_t d_tmem, + uint64_t a_desc, + uint64_t b_desc, + uint32_t idesc, + bool enable_input_d, + uint64_t zero_column_mask_desc); + +tcgen05.mma.ws.cta_group::1.kind::tf32.collector::b0::fill +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +.. code:: cuda + + // tcgen05.mma.ws.cta_group.kind.collector::b0::fill [d_tmem], a_desc, b_desc, idesc, enable_input_d, zero_column_mask_desc; // PTX ISA 86, SM_100a, SM_101a + // .cta_group = { .cta_group::1 } + // .kind = { .kind::f16, .kind::tf32, .kind::f8f6f4, .kind::i8 } + template + __device__ static inline void tcgen05_mma_ws_collector_b0_fill( + cuda::ptx::cta_group_1_t, + cuda::ptx::kind_t kind, + uint32_t d_tmem, + uint64_t a_desc, + uint64_t b_desc, + uint32_t idesc, + bool enable_input_d, + uint64_t zero_column_mask_desc); + +tcgen05.mma.ws.cta_group::1.kind::f8f6f4.collector::b0::fill +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +.. code:: cuda + + // tcgen05.mma.ws.cta_group.kind.collector::b0::fill [d_tmem], a_desc, b_desc, idesc, enable_input_d, zero_column_mask_desc; // PTX ISA 86, SM_100a, SM_101a + // .cta_group = { .cta_group::1 } + // .kind = { .kind::f16, .kind::tf32, .kind::f8f6f4, .kind::i8 } + template + __device__ static inline void tcgen05_mma_ws_collector_b0_fill( + cuda::ptx::cta_group_1_t, + cuda::ptx::kind_t kind, + uint32_t d_tmem, + uint64_t a_desc, + uint64_t b_desc, + uint32_t idesc, + bool enable_input_d, + uint64_t zero_column_mask_desc); + +tcgen05.mma.ws.cta_group::1.kind::i8.collector::b0::fill +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +.. 
code:: cuda + + // tcgen05.mma.ws.cta_group.kind.collector::b0::fill [d_tmem], a_desc, b_desc, idesc, enable_input_d, zero_column_mask_desc; // PTX ISA 86, SM_100a, SM_101a + // .cta_group = { .cta_group::1 } + // .kind = { .kind::f16, .kind::tf32, .kind::f8f6f4, .kind::i8 } + template + __device__ static inline void tcgen05_mma_ws_collector_b0_fill( + cuda::ptx::cta_group_1_t, + cuda::ptx::kind_t kind, + uint32_t d_tmem, + uint64_t a_desc, + uint64_t b_desc, + uint32_t idesc, + bool enable_input_d, + uint64_t zero_column_mask_desc); + +tcgen05.mma.ws.cta_group::1.kind::f16.collector::b0::fill +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +.. code:: cuda + + // tcgen05.mma.ws.cta_group.kind.collector::b0::fill [d_tmem], a_desc, b_desc, idesc, enable_input_d; // PTX ISA 86, SM_100a, SM_101a + // .cta_group = { .cta_group::1 } + // .kind = { .kind::f16, .kind::tf32, .kind::f8f6f4, .kind::i8 } + template + __device__ static inline void tcgen05_mma_ws_collector_b0_fill( + cuda::ptx::cta_group_1_t, + cuda::ptx::kind_t kind, + uint32_t d_tmem, + uint64_t a_desc, + uint64_t b_desc, + uint32_t idesc, + bool enable_input_d); + +tcgen05.mma.ws.cta_group::1.kind::tf32.collector::b0::fill +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +.. code:: cuda + + // tcgen05.mma.ws.cta_group.kind.collector::b0::fill [d_tmem], a_desc, b_desc, idesc, enable_input_d; // PTX ISA 86, SM_100a, SM_101a + // .cta_group = { .cta_group::1 } + // .kind = { .kind::f16, .kind::tf32, .kind::f8f6f4, .kind::i8 } + template + __device__ static inline void tcgen05_mma_ws_collector_b0_fill( + cuda::ptx::cta_group_1_t, + cuda::ptx::kind_t kind, + uint32_t d_tmem, + uint64_t a_desc, + uint64_t b_desc, + uint32_t idesc, + bool enable_input_d); + +tcgen05.mma.ws.cta_group::1.kind::f8f6f4.collector::b0::fill +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +.. code:: cuda + + // tcgen05.mma.ws.cta_group.kind.collector::b0::fill [d_tmem], a_desc, b_desc, idesc, enable_input_d; // PTX ISA 86, SM_100a, SM_101a + // .cta_group = { .cta_group::1 } + // .kind = { .kind::f16, .kind::tf32, .kind::f8f6f4, .kind::i8 } + template + __device__ static inline void tcgen05_mma_ws_collector_b0_fill( + cuda::ptx::cta_group_1_t, + cuda::ptx::kind_t kind, + uint32_t d_tmem, + uint64_t a_desc, + uint64_t b_desc, + uint32_t idesc, + bool enable_input_d); + +tcgen05.mma.ws.cta_group::1.kind::i8.collector::b0::fill +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +.. code:: cuda + + // tcgen05.mma.ws.cta_group.kind.collector::b0::fill [d_tmem], a_desc, b_desc, idesc, enable_input_d; // PTX ISA 86, SM_100a, SM_101a + // .cta_group = { .cta_group::1 } + // .kind = { .kind::f16, .kind::tf32, .kind::f8f6f4, .kind::i8 } + template + __device__ static inline void tcgen05_mma_ws_collector_b0_fill( + cuda::ptx::cta_group_1_t, + cuda::ptx::kind_t kind, + uint32_t d_tmem, + uint64_t a_desc, + uint64_t b_desc, + uint32_t idesc, + bool enable_input_d); + +tcgen05.mma.ws.cta_group::1.kind::f16.collector::b0::fill +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +.. 
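(Illustrative usage sketch; not part of the generated listing. This shows a call to the weight-stationary wrapper tcgen05_mma_ws_collector_b0_fill, using the overload with a zero_column_mask_desc operand as listed above. The tag-constant spellings cuda::ptx::cta_group_1 and cuda::ptx::kind_f16 are assumptions based on the library's <modifier>_t type / <modifier> value convention; all operand values are placeholders.)

.. code:: cuda

   #include <cuda/ptx>
   #include <cstdint>

   // Sketch only: the matrix descriptors, tensor-memory address, and instruction
   // descriptor must be prepared by the surrounding kernel; they are passed
   // through unchanged here.
   __device__ void mma_ws_sketch(
     uint32_t d_tmem, uint64_t a_desc, uint64_t b_desc, uint32_t idesc,
     uint64_t zero_column_mask_desc)
   {
     cuda::ptx::tcgen05_mma_ws_collector_b0_fill(
       cuda::ptx::cta_group_1,    // assumed tag-constant name for .cta_group::1
       cuda::ptx::kind_f16,       // assumed tag-constant name for .kind::f16
       d_tmem, a_desc, b_desc, idesc,
       /* enable_input_d */ false,
       zero_column_mask_desc);
   }

..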
code:: cuda + + // tcgen05.mma.ws.cta_group.kind.collector::b0::fill [d_tmem], [a_tmem], b_desc, idesc, enable_input_d, zero_column_mask_desc; // PTX ISA 86, SM_100a, SM_101a + // .cta_group = { .cta_group::1 } + // .kind = { .kind::f16, .kind::tf32, .kind::f8f6f4, .kind::i8 } + template + __device__ static inline void tcgen05_mma_ws_tmem_a_collector_b0_fill( + cuda::ptx::cta_group_1_t, + cuda::ptx::kind_t kind, + uint32_t d_tmem, + uint32_t a_tmem, + uint64_t b_desc, + uint32_t idesc, + bool enable_input_d, + uint64_t zero_column_mask_desc); + +tcgen05.mma.ws.cta_group::1.kind::tf32.collector::b0::fill +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +.. code:: cuda + + // tcgen05.mma.ws.cta_group.kind.collector::b0::fill [d_tmem], [a_tmem], b_desc, idesc, enable_input_d, zero_column_mask_desc; // PTX ISA 86, SM_100a, SM_101a + // .cta_group = { .cta_group::1 } + // .kind = { .kind::f16, .kind::tf32, .kind::f8f6f4, .kind::i8 } + template + __device__ static inline void tcgen05_mma_ws_tmem_a_collector_b0_fill( + cuda::ptx::cta_group_1_t, + cuda::ptx::kind_t kind, + uint32_t d_tmem, + uint32_t a_tmem, + uint64_t b_desc, + uint32_t idesc, + bool enable_input_d, + uint64_t zero_column_mask_desc); + +tcgen05.mma.ws.cta_group::1.kind::f8f6f4.collector::b0::fill +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +.. code:: cuda + + // tcgen05.mma.ws.cta_group.kind.collector::b0::fill [d_tmem], [a_tmem], b_desc, idesc, enable_input_d, zero_column_mask_desc; // PTX ISA 86, SM_100a, SM_101a + // .cta_group = { .cta_group::1 } + // .kind = { .kind::f16, .kind::tf32, .kind::f8f6f4, .kind::i8 } + template + __device__ static inline void tcgen05_mma_ws_tmem_a_collector_b0_fill( + cuda::ptx::cta_group_1_t, + cuda::ptx::kind_t kind, + uint32_t d_tmem, + uint32_t a_tmem, + uint64_t b_desc, + uint32_t idesc, + bool enable_input_d, + uint64_t zero_column_mask_desc); + +tcgen05.mma.ws.cta_group::1.kind::i8.collector::b0::fill +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +.. code:: cuda + + // tcgen05.mma.ws.cta_group.kind.collector::b0::fill [d_tmem], [a_tmem], b_desc, idesc, enable_input_d, zero_column_mask_desc; // PTX ISA 86, SM_100a, SM_101a + // .cta_group = { .cta_group::1 } + // .kind = { .kind::f16, .kind::tf32, .kind::f8f6f4, .kind::i8 } + template + __device__ static inline void tcgen05_mma_ws_tmem_a_collector_b0_fill( + cuda::ptx::cta_group_1_t, + cuda::ptx::kind_t kind, + uint32_t d_tmem, + uint32_t a_tmem, + uint64_t b_desc, + uint32_t idesc, + bool enable_input_d, + uint64_t zero_column_mask_desc); + +tcgen05.mma.ws.cta_group::1.kind::f16.collector::b0::fill +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +.. code:: cuda + + // tcgen05.mma.ws.cta_group.kind.collector::b0::fill [d_tmem], [a_tmem], b_desc, idesc, enable_input_d; // PTX ISA 86, SM_100a, SM_101a + // .cta_group = { .cta_group::1 } + // .kind = { .kind::f16, .kind::tf32, .kind::f8f6f4, .kind::i8 } + template + __device__ static inline void tcgen05_mma_ws_tmem_a_collector_b0_fill( + cuda::ptx::cta_group_1_t, + cuda::ptx::kind_t kind, + uint32_t d_tmem, + uint32_t a_tmem, + uint64_t b_desc, + uint32_t idesc, + bool enable_input_d); + +tcgen05.mma.ws.cta_group::1.kind::tf32.collector::b0::fill +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +.. 
code:: cuda + + // tcgen05.mma.ws.cta_group.kind.collector::b0::fill [d_tmem], [a_tmem], b_desc, idesc, enable_input_d; // PTX ISA 86, SM_100a, SM_101a + // .cta_group = { .cta_group::1 } + // .kind = { .kind::f16, .kind::tf32, .kind::f8f6f4, .kind::i8 } + template + __device__ static inline void tcgen05_mma_ws_tmem_a_collector_b0_fill( + cuda::ptx::cta_group_1_t, + cuda::ptx::kind_t kind, + uint32_t d_tmem, + uint32_t a_tmem, + uint64_t b_desc, + uint32_t idesc, + bool enable_input_d); + +tcgen05.mma.ws.cta_group::1.kind::f8f6f4.collector::b0::fill +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +.. code:: cuda + + // tcgen05.mma.ws.cta_group.kind.collector::b0::fill [d_tmem], [a_tmem], b_desc, idesc, enable_input_d; // PTX ISA 86, SM_100a, SM_101a + // .cta_group = { .cta_group::1 } + // .kind = { .kind::f16, .kind::tf32, .kind::f8f6f4, .kind::i8 } + template + __device__ static inline void tcgen05_mma_ws_tmem_a_collector_b0_fill( + cuda::ptx::cta_group_1_t, + cuda::ptx::kind_t kind, + uint32_t d_tmem, + uint32_t a_tmem, + uint64_t b_desc, + uint32_t idesc, + bool enable_input_d); + +tcgen05.mma.ws.cta_group::1.kind::i8.collector::b0::fill +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +.. code:: cuda + + // tcgen05.mma.ws.cta_group.kind.collector::b0::fill [d_tmem], [a_tmem], b_desc, idesc, enable_input_d; // PTX ISA 86, SM_100a, SM_101a + // .cta_group = { .cta_group::1 } + // .kind = { .kind::f16, .kind::tf32, .kind::f8f6f4, .kind::i8 } + template + __device__ static inline void tcgen05_mma_ws_tmem_a_collector_b0_fill( + cuda::ptx::cta_group_1_t, + cuda::ptx::kind_t kind, + uint32_t d_tmem, + uint32_t a_tmem, + uint64_t b_desc, + uint32_t idesc, + bool enable_input_d); + +tcgen05.mma.ws.cta_group::1.kind::f16.collector::b0::use +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +.. code:: cuda + + // tcgen05.mma.ws.cta_group.kind.collector::b0::use [d_tmem], a_desc, b_desc, idesc, enable_input_d, zero_column_mask_desc; // PTX ISA 86, SM_100a, SM_101a + // .cta_group = { .cta_group::1 } + // .kind = { .kind::f16, .kind::tf32, .kind::f8f6f4, .kind::i8 } + template + __device__ static inline void tcgen05_mma_ws_collector_b0_use( + cuda::ptx::cta_group_1_t, + cuda::ptx::kind_t kind, + uint32_t d_tmem, + uint64_t a_desc, + uint64_t b_desc, + uint32_t idesc, + bool enable_input_d, + uint64_t zero_column_mask_desc); + +tcgen05.mma.ws.cta_group::1.kind::tf32.collector::b0::use +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +.. code:: cuda + + // tcgen05.mma.ws.cta_group.kind.collector::b0::use [d_tmem], a_desc, b_desc, idesc, enable_input_d, zero_column_mask_desc; // PTX ISA 86, SM_100a, SM_101a + // .cta_group = { .cta_group::1 } + // .kind = { .kind::f16, .kind::tf32, .kind::f8f6f4, .kind::i8 } + template + __device__ static inline void tcgen05_mma_ws_collector_b0_use( + cuda::ptx::cta_group_1_t, + cuda::ptx::kind_t kind, + uint32_t d_tmem, + uint64_t a_desc, + uint64_t b_desc, + uint32_t idesc, + bool enable_input_d, + uint64_t zero_column_mask_desc); + +tcgen05.mma.ws.cta_group::1.kind::f8f6f4.collector::b0::use +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +.. 
code:: cuda + + // tcgen05.mma.ws.cta_group.kind.collector::b0::use [d_tmem], a_desc, b_desc, idesc, enable_input_d, zero_column_mask_desc; // PTX ISA 86, SM_100a, SM_101a + // .cta_group = { .cta_group::1 } + // .kind = { .kind::f16, .kind::tf32, .kind::f8f6f4, .kind::i8 } + template + __device__ static inline void tcgen05_mma_ws_collector_b0_use( + cuda::ptx::cta_group_1_t, + cuda::ptx::kind_t kind, + uint32_t d_tmem, + uint64_t a_desc, + uint64_t b_desc, + uint32_t idesc, + bool enable_input_d, + uint64_t zero_column_mask_desc); + +tcgen05.mma.ws.cta_group::1.kind::i8.collector::b0::use +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +.. code:: cuda + + // tcgen05.mma.ws.cta_group.kind.collector::b0::use [d_tmem], a_desc, b_desc, idesc, enable_input_d, zero_column_mask_desc; // PTX ISA 86, SM_100a, SM_101a + // .cta_group = { .cta_group::1 } + // .kind = { .kind::f16, .kind::tf32, .kind::f8f6f4, .kind::i8 } + template + __device__ static inline void tcgen05_mma_ws_collector_b0_use( + cuda::ptx::cta_group_1_t, + cuda::ptx::kind_t kind, + uint32_t d_tmem, + uint64_t a_desc, + uint64_t b_desc, + uint32_t idesc, + bool enable_input_d, + uint64_t zero_column_mask_desc); + +tcgen05.mma.ws.cta_group::1.kind::f16.collector::b0::use +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +.. code:: cuda + + // tcgen05.mma.ws.cta_group.kind.collector::b0::use [d_tmem], a_desc, b_desc, idesc, enable_input_d; // PTX ISA 86, SM_100a, SM_101a + // .cta_group = { .cta_group::1 } + // .kind = { .kind::f16, .kind::tf32, .kind::f8f6f4, .kind::i8 } + template + __device__ static inline void tcgen05_mma_ws_collector_b0_use( + cuda::ptx::cta_group_1_t, + cuda::ptx::kind_t kind, + uint32_t d_tmem, + uint64_t a_desc, + uint64_t b_desc, + uint32_t idesc, + bool enable_input_d); + +tcgen05.mma.ws.cta_group::1.kind::tf32.collector::b0::use +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +.. code:: cuda + + // tcgen05.mma.ws.cta_group.kind.collector::b0::use [d_tmem], a_desc, b_desc, idesc, enable_input_d; // PTX ISA 86, SM_100a, SM_101a + // .cta_group = { .cta_group::1 } + // .kind = { .kind::f16, .kind::tf32, .kind::f8f6f4, .kind::i8 } + template + __device__ static inline void tcgen05_mma_ws_collector_b0_use( + cuda::ptx::cta_group_1_t, + cuda::ptx::kind_t kind, + uint32_t d_tmem, + uint64_t a_desc, + uint64_t b_desc, + uint32_t idesc, + bool enable_input_d); + +tcgen05.mma.ws.cta_group::1.kind::f8f6f4.collector::b0::use +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +.. code:: cuda + + // tcgen05.mma.ws.cta_group.kind.collector::b0::use [d_tmem], a_desc, b_desc, idesc, enable_input_d; // PTX ISA 86, SM_100a, SM_101a + // .cta_group = { .cta_group::1 } + // .kind = { .kind::f16, .kind::tf32, .kind::f8f6f4, .kind::i8 } + template + __device__ static inline void tcgen05_mma_ws_collector_b0_use( + cuda::ptx::cta_group_1_t, + cuda::ptx::kind_t kind, + uint32_t d_tmem, + uint64_t a_desc, + uint64_t b_desc, + uint32_t idesc, + bool enable_input_d); + +tcgen05.mma.ws.cta_group::1.kind::i8.collector::b0::use +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +.. 
code:: cuda + + // tcgen05.mma.ws.cta_group.kind.collector::b0::use [d_tmem], a_desc, b_desc, idesc, enable_input_d; // PTX ISA 86, SM_100a, SM_101a + // .cta_group = { .cta_group::1 } + // .kind = { .kind::f16, .kind::tf32, .kind::f8f6f4, .kind::i8 } + template + __device__ static inline void tcgen05_mma_ws_collector_b0_use( + cuda::ptx::cta_group_1_t, + cuda::ptx::kind_t kind, + uint32_t d_tmem, + uint64_t a_desc, + uint64_t b_desc, + uint32_t idesc, + bool enable_input_d); + +tcgen05.mma.ws.cta_group::1.kind::f16.collector::b0::use +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +.. code:: cuda + + // tcgen05.mma.ws.cta_group.kind.collector::b0::use [d_tmem], [a_tmem], b_desc, idesc, enable_input_d, zero_column_mask_desc; // PTX ISA 86, SM_100a, SM_101a + // .cta_group = { .cta_group::1 } + // .kind = { .kind::f16, .kind::tf32, .kind::f8f6f4, .kind::i8 } + template + __device__ static inline void tcgen05_mma_ws_tmem_a_collector_b0_use( + cuda::ptx::cta_group_1_t, + cuda::ptx::kind_t kind, + uint32_t d_tmem, + uint32_t a_tmem, + uint64_t b_desc, + uint32_t idesc, + bool enable_input_d, + uint64_t zero_column_mask_desc); + +tcgen05.mma.ws.cta_group::1.kind::tf32.collector::b0::use +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +.. code:: cuda + + // tcgen05.mma.ws.cta_group.kind.collector::b0::use [d_tmem], [a_tmem], b_desc, idesc, enable_input_d, zero_column_mask_desc; // PTX ISA 86, SM_100a, SM_101a + // .cta_group = { .cta_group::1 } + // .kind = { .kind::f16, .kind::tf32, .kind::f8f6f4, .kind::i8 } + template + __device__ static inline void tcgen05_mma_ws_tmem_a_collector_b0_use( + cuda::ptx::cta_group_1_t, + cuda::ptx::kind_t kind, + uint32_t d_tmem, + uint32_t a_tmem, + uint64_t b_desc, + uint32_t idesc, + bool enable_input_d, + uint64_t zero_column_mask_desc); + +tcgen05.mma.ws.cta_group::1.kind::f8f6f4.collector::b0::use +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +.. code:: cuda + + // tcgen05.mma.ws.cta_group.kind.collector::b0::use [d_tmem], [a_tmem], b_desc, idesc, enable_input_d, zero_column_mask_desc; // PTX ISA 86, SM_100a, SM_101a + // .cta_group = { .cta_group::1 } + // .kind = { .kind::f16, .kind::tf32, .kind::f8f6f4, .kind::i8 } + template + __device__ static inline void tcgen05_mma_ws_tmem_a_collector_b0_use( + cuda::ptx::cta_group_1_t, + cuda::ptx::kind_t kind, + uint32_t d_tmem, + uint32_t a_tmem, + uint64_t b_desc, + uint32_t idesc, + bool enable_input_d, + uint64_t zero_column_mask_desc); + +tcgen05.mma.ws.cta_group::1.kind::i8.collector::b0::use +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +.. code:: cuda + + // tcgen05.mma.ws.cta_group.kind.collector::b0::use [d_tmem], [a_tmem], b_desc, idesc, enable_input_d, zero_column_mask_desc; // PTX ISA 86, SM_100a, SM_101a + // .cta_group = { .cta_group::1 } + // .kind = { .kind::f16, .kind::tf32, .kind::f8f6f4, .kind::i8 } + template + __device__ static inline void tcgen05_mma_ws_tmem_a_collector_b0_use( + cuda::ptx::cta_group_1_t, + cuda::ptx::kind_t kind, + uint32_t d_tmem, + uint32_t a_tmem, + uint64_t b_desc, + uint32_t idesc, + bool enable_input_d, + uint64_t zero_column_mask_desc); + +tcgen05.mma.ws.cta_group::1.kind::f16.collector::b0::use +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +.. 
code:: cuda + + // tcgen05.mma.ws.cta_group.kind.collector::b0::use [d_tmem], [a_tmem], b_desc, idesc, enable_input_d; // PTX ISA 86, SM_100a, SM_101a + // .cta_group = { .cta_group::1 } + // .kind = { .kind::f16, .kind::tf32, .kind::f8f6f4, .kind::i8 } + template + __device__ static inline void tcgen05_mma_ws_tmem_a_collector_b0_use( + cuda::ptx::cta_group_1_t, + cuda::ptx::kind_t kind, + uint32_t d_tmem, + uint32_t a_tmem, + uint64_t b_desc, + uint32_t idesc, + bool enable_input_d); + +tcgen05.mma.ws.cta_group::1.kind::tf32.collector::b0::use +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +.. code:: cuda + + // tcgen05.mma.ws.cta_group.kind.collector::b0::use [d_tmem], [a_tmem], b_desc, idesc, enable_input_d; // PTX ISA 86, SM_100a, SM_101a + // .cta_group = { .cta_group::1 } + // .kind = { .kind::f16, .kind::tf32, .kind::f8f6f4, .kind::i8 } + template + __device__ static inline void tcgen05_mma_ws_tmem_a_collector_b0_use( + cuda::ptx::cta_group_1_t, + cuda::ptx::kind_t kind, + uint32_t d_tmem, + uint32_t a_tmem, + uint64_t b_desc, + uint32_t idesc, + bool enable_input_d); + +tcgen05.mma.ws.cta_group::1.kind::f8f6f4.collector::b0::use +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +.. code:: cuda + + // tcgen05.mma.ws.cta_group.kind.collector::b0::use [d_tmem], [a_tmem], b_desc, idesc, enable_input_d; // PTX ISA 86, SM_100a, SM_101a + // .cta_group = { .cta_group::1 } + // .kind = { .kind::f16, .kind::tf32, .kind::f8f6f4, .kind::i8 } + template + __device__ static inline void tcgen05_mma_ws_tmem_a_collector_b0_use( + cuda::ptx::cta_group_1_t, + cuda::ptx::kind_t kind, + uint32_t d_tmem, + uint32_t a_tmem, + uint64_t b_desc, + uint32_t idesc, + bool enable_input_d); + +tcgen05.mma.ws.cta_group::1.kind::i8.collector::b0::use +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +.. code:: cuda + + // tcgen05.mma.ws.cta_group.kind.collector::b0::use [d_tmem], [a_tmem], b_desc, idesc, enable_input_d; // PTX ISA 86, SM_100a, SM_101a + // .cta_group = { .cta_group::1 } + // .kind = { .kind::f16, .kind::tf32, .kind::f8f6f4, .kind::i8 } + template + __device__ static inline void tcgen05_mma_ws_tmem_a_collector_b0_use( + cuda::ptx::cta_group_1_t, + cuda::ptx::kind_t kind, + uint32_t d_tmem, + uint32_t a_tmem, + uint64_t b_desc, + uint32_t idesc, + bool enable_input_d); + +tcgen05.mma.ws.cta_group::1.kind::f16.collector::b0::lastuse +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +.. code:: cuda + + // tcgen05.mma.ws.cta_group.kind.collector::b0::lastuse [d_tmem], a_desc, b_desc, idesc, enable_input_d, zero_column_mask_desc; // PTX ISA 86, SM_100a, SM_101a + // .cta_group = { .cta_group::1 } + // .kind = { .kind::f16, .kind::tf32, .kind::f8f6f4, .kind::i8 } + template + __device__ static inline void tcgen05_mma_ws_collector_b0_lastuse( + cuda::ptx::cta_group_1_t, + cuda::ptx::kind_t kind, + uint32_t d_tmem, + uint64_t a_desc, + uint64_t b_desc, + uint32_t idesc, + bool enable_input_d, + uint64_t zero_column_mask_desc); + +tcgen05.mma.ws.cta_group::1.kind::tf32.collector::b0::lastuse +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +.. 
code:: cuda + + // tcgen05.mma.ws.cta_group.kind.collector::b0::lastuse [d_tmem], a_desc, b_desc, idesc, enable_input_d, zero_column_mask_desc; // PTX ISA 86, SM_100a, SM_101a + // .cta_group = { .cta_group::1 } + // .kind = { .kind::f16, .kind::tf32, .kind::f8f6f4, .kind::i8 } + template + __device__ static inline void tcgen05_mma_ws_collector_b0_lastuse( + cuda::ptx::cta_group_1_t, + cuda::ptx::kind_t kind, + uint32_t d_tmem, + uint64_t a_desc, + uint64_t b_desc, + uint32_t idesc, + bool enable_input_d, + uint64_t zero_column_mask_desc); + +tcgen05.mma.ws.cta_group::1.kind::f8f6f4.collector::b0::lastuse +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +.. code:: cuda + + // tcgen05.mma.ws.cta_group.kind.collector::b0::lastuse [d_tmem], a_desc, b_desc, idesc, enable_input_d, zero_column_mask_desc; // PTX ISA 86, SM_100a, SM_101a + // .cta_group = { .cta_group::1 } + // .kind = { .kind::f16, .kind::tf32, .kind::f8f6f4, .kind::i8 } + template + __device__ static inline void tcgen05_mma_ws_collector_b0_lastuse( + cuda::ptx::cta_group_1_t, + cuda::ptx::kind_t kind, + uint32_t d_tmem, + uint64_t a_desc, + uint64_t b_desc, + uint32_t idesc, + bool enable_input_d, + uint64_t zero_column_mask_desc); + +tcgen05.mma.ws.cta_group::1.kind::i8.collector::b0::lastuse +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +.. code:: cuda + + // tcgen05.mma.ws.cta_group.kind.collector::b0::lastuse [d_tmem], a_desc, b_desc, idesc, enable_input_d, zero_column_mask_desc; // PTX ISA 86, SM_100a, SM_101a + // .cta_group = { .cta_group::1 } + // .kind = { .kind::f16, .kind::tf32, .kind::f8f6f4, .kind::i8 } + template + __device__ static inline void tcgen05_mma_ws_collector_b0_lastuse( + cuda::ptx::cta_group_1_t, + cuda::ptx::kind_t kind, + uint32_t d_tmem, + uint64_t a_desc, + uint64_t b_desc, + uint32_t idesc, + bool enable_input_d, + uint64_t zero_column_mask_desc); + +tcgen05.mma.ws.cta_group::1.kind::f16.collector::b0::lastuse +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +.. code:: cuda + + // tcgen05.mma.ws.cta_group.kind.collector::b0::lastuse [d_tmem], a_desc, b_desc, idesc, enable_input_d; // PTX ISA 86, SM_100a, SM_101a + // .cta_group = { .cta_group::1 } + // .kind = { .kind::f16, .kind::tf32, .kind::f8f6f4, .kind::i8 } + template + __device__ static inline void tcgen05_mma_ws_collector_b0_lastuse( + cuda::ptx::cta_group_1_t, + cuda::ptx::kind_t kind, + uint32_t d_tmem, + uint64_t a_desc, + uint64_t b_desc, + uint32_t idesc, + bool enable_input_d); + +tcgen05.mma.ws.cta_group::1.kind::tf32.collector::b0::lastuse +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +.. code:: cuda + + // tcgen05.mma.ws.cta_group.kind.collector::b0::lastuse [d_tmem], a_desc, b_desc, idesc, enable_input_d; // PTX ISA 86, SM_100a, SM_101a + // .cta_group = { .cta_group::1 } + // .kind = { .kind::f16, .kind::tf32, .kind::f8f6f4, .kind::i8 } + template + __device__ static inline void tcgen05_mma_ws_collector_b0_lastuse( + cuda::ptx::cta_group_1_t, + cuda::ptx::kind_t kind, + uint32_t d_tmem, + uint64_t a_desc, + uint64_t b_desc, + uint32_t idesc, + bool enable_input_d); + +tcgen05.mma.ws.cta_group::1.kind::f8f6f4.collector::b0::lastuse +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +.. 
code:: cuda + + // tcgen05.mma.ws.cta_group.kind.collector::b0::lastuse [d_tmem], a_desc, b_desc, idesc, enable_input_d; // PTX ISA 86, SM_100a, SM_101a + // .cta_group = { .cta_group::1 } + // .kind = { .kind::f16, .kind::tf32, .kind::f8f6f4, .kind::i8 } + template + __device__ static inline void tcgen05_mma_ws_collector_b0_lastuse( + cuda::ptx::cta_group_1_t, + cuda::ptx::kind_t kind, + uint32_t d_tmem, + uint64_t a_desc, + uint64_t b_desc, + uint32_t idesc, + bool enable_input_d); + +tcgen05.mma.ws.cta_group::1.kind::i8.collector::b0::lastuse +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +.. code:: cuda + + // tcgen05.mma.ws.cta_group.kind.collector::b0::lastuse [d_tmem], a_desc, b_desc, idesc, enable_input_d; // PTX ISA 86, SM_100a, SM_101a + // .cta_group = { .cta_group::1 } + // .kind = { .kind::f16, .kind::tf32, .kind::f8f6f4, .kind::i8 } + template + __device__ static inline void tcgen05_mma_ws_collector_b0_lastuse( + cuda::ptx::cta_group_1_t, + cuda::ptx::kind_t kind, + uint32_t d_tmem, + uint64_t a_desc, + uint64_t b_desc, + uint32_t idesc, + bool enable_input_d); + +tcgen05.mma.ws.cta_group::1.kind::f16.collector::b0::lastuse +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +.. code:: cuda + + // tcgen05.mma.ws.cta_group.kind.collector::b0::lastuse [d_tmem], [a_tmem], b_desc, idesc, enable_input_d, zero_column_mask_desc; // PTX ISA 86, SM_100a, SM_101a + // .cta_group = { .cta_group::1 } + // .kind = { .kind::f16, .kind::tf32, .kind::f8f6f4, .kind::i8 } + template + __device__ static inline void tcgen05_mma_ws_tmem_a_collector_b0_lastuse( + cuda::ptx::cta_group_1_t, + cuda::ptx::kind_t kind, + uint32_t d_tmem, + uint32_t a_tmem, + uint64_t b_desc, + uint32_t idesc, + bool enable_input_d, + uint64_t zero_column_mask_desc); + +tcgen05.mma.ws.cta_group::1.kind::tf32.collector::b0::lastuse +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +.. code:: cuda + + // tcgen05.mma.ws.cta_group.kind.collector::b0::lastuse [d_tmem], [a_tmem], b_desc, idesc, enable_input_d, zero_column_mask_desc; // PTX ISA 86, SM_100a, SM_101a + // .cta_group = { .cta_group::1 } + // .kind = { .kind::f16, .kind::tf32, .kind::f8f6f4, .kind::i8 } + template + __device__ static inline void tcgen05_mma_ws_tmem_a_collector_b0_lastuse( + cuda::ptx::cta_group_1_t, + cuda::ptx::kind_t kind, + uint32_t d_tmem, + uint32_t a_tmem, + uint64_t b_desc, + uint32_t idesc, + bool enable_input_d, + uint64_t zero_column_mask_desc); + +tcgen05.mma.ws.cta_group::1.kind::f8f6f4.collector::b0::lastuse +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +.. code:: cuda + + // tcgen05.mma.ws.cta_group.kind.collector::b0::lastuse [d_tmem], [a_tmem], b_desc, idesc, enable_input_d, zero_column_mask_desc; // PTX ISA 86, SM_100a, SM_101a + // .cta_group = { .cta_group::1 } + // .kind = { .kind::f16, .kind::tf32, .kind::f8f6f4, .kind::i8 } + template + __device__ static inline void tcgen05_mma_ws_tmem_a_collector_b0_lastuse( + cuda::ptx::cta_group_1_t, + cuda::ptx::kind_t kind, + uint32_t d_tmem, + uint32_t a_tmem, + uint64_t b_desc, + uint32_t idesc, + bool enable_input_d, + uint64_t zero_column_mask_desc); + +tcgen05.mma.ws.cta_group::1.kind::i8.collector::b0::lastuse +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +.. 
code:: cuda + + // tcgen05.mma.ws.cta_group.kind.collector::b0::lastuse [d_tmem], [a_tmem], b_desc, idesc, enable_input_d, zero_column_mask_desc; // PTX ISA 86, SM_100a, SM_101a + // .cta_group = { .cta_group::1 } + // .kind = { .kind::f16, .kind::tf32, .kind::f8f6f4, .kind::i8 } + template + __device__ static inline void tcgen05_mma_ws_tmem_a_collector_b0_lastuse( + cuda::ptx::cta_group_1_t, + cuda::ptx::kind_t kind, + uint32_t d_tmem, + uint32_t a_tmem, + uint64_t b_desc, + uint32_t idesc, + bool enable_input_d, + uint64_t zero_column_mask_desc); + +tcgen05.mma.ws.cta_group::1.kind::f16.collector::b0::lastuse +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +.. code:: cuda + + // tcgen05.mma.ws.cta_group.kind.collector::b0::lastuse [d_tmem], [a_tmem], b_desc, idesc, enable_input_d; // PTX ISA 86, SM_100a, SM_101a + // .cta_group = { .cta_group::1 } + // .kind = { .kind::f16, .kind::tf32, .kind::f8f6f4, .kind::i8 } + template + __device__ static inline void tcgen05_mma_ws_tmem_a_collector_b0_lastuse( + cuda::ptx::cta_group_1_t, + cuda::ptx::kind_t kind, + uint32_t d_tmem, + uint32_t a_tmem, + uint64_t b_desc, + uint32_t idesc, + bool enable_input_d); + +tcgen05.mma.ws.cta_group::1.kind::tf32.collector::b0::lastuse +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +.. code:: cuda + + // tcgen05.mma.ws.cta_group.kind.collector::b0::lastuse [d_tmem], [a_tmem], b_desc, idesc, enable_input_d; // PTX ISA 86, SM_100a, SM_101a + // .cta_group = { .cta_group::1 } + // .kind = { .kind::f16, .kind::tf32, .kind::f8f6f4, .kind::i8 } + template + __device__ static inline void tcgen05_mma_ws_tmem_a_collector_b0_lastuse( + cuda::ptx::cta_group_1_t, + cuda::ptx::kind_t kind, + uint32_t d_tmem, + uint32_t a_tmem, + uint64_t b_desc, + uint32_t idesc, + bool enable_input_d); + +tcgen05.mma.ws.cta_group::1.kind::f8f6f4.collector::b0::lastuse +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +.. code:: cuda + + // tcgen05.mma.ws.cta_group.kind.collector::b0::lastuse [d_tmem], [a_tmem], b_desc, idesc, enable_input_d; // PTX ISA 86, SM_100a, SM_101a + // .cta_group = { .cta_group::1 } + // .kind = { .kind::f16, .kind::tf32, .kind::f8f6f4, .kind::i8 } + template + __device__ static inline void tcgen05_mma_ws_tmem_a_collector_b0_lastuse( + cuda::ptx::cta_group_1_t, + cuda::ptx::kind_t kind, + uint32_t d_tmem, + uint32_t a_tmem, + uint64_t b_desc, + uint32_t idesc, + bool enable_input_d); + +tcgen05.mma.ws.cta_group::1.kind::i8.collector::b0::lastuse +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +.. code:: cuda + + // tcgen05.mma.ws.cta_group.kind.collector::b0::lastuse [d_tmem], [a_tmem], b_desc, idesc, enable_input_d; // PTX ISA 86, SM_100a, SM_101a + // .cta_group = { .cta_group::1 } + // .kind = { .kind::f16, .kind::tf32, .kind::f8f6f4, .kind::i8 } + template + __device__ static inline void tcgen05_mma_ws_tmem_a_collector_b0_lastuse( + cuda::ptx::cta_group_1_t, + cuda::ptx::kind_t kind, + uint32_t d_tmem, + uint32_t a_tmem, + uint64_t b_desc, + uint32_t idesc, + bool enable_input_d); + +tcgen05.mma.ws.cta_group::1.kind::f16.collector::b0::discard +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +.. 
code:: cuda + + // tcgen05.mma.ws.cta_group.kind.collector::b0::discard [d_tmem], a_desc, b_desc, idesc, enable_input_d, zero_column_mask_desc; // PTX ISA 86, SM_100a, SM_101a + // .cta_group = { .cta_group::1 } + // .kind = { .kind::f16, .kind::tf32, .kind::f8f6f4, .kind::i8 } + template + __device__ static inline void tcgen05_mma_ws_collector_b0_discard( + cuda::ptx::cta_group_1_t, + cuda::ptx::kind_t kind, + uint32_t d_tmem, + uint64_t a_desc, + uint64_t b_desc, + uint32_t idesc, + bool enable_input_d, + uint64_t zero_column_mask_desc); + +tcgen05.mma.ws.cta_group::1.kind::tf32.collector::b0::discard +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +.. code:: cuda + + // tcgen05.mma.ws.cta_group.kind.collector::b0::discard [d_tmem], a_desc, b_desc, idesc, enable_input_d, zero_column_mask_desc; // PTX ISA 86, SM_100a, SM_101a + // .cta_group = { .cta_group::1 } + // .kind = { .kind::f16, .kind::tf32, .kind::f8f6f4, .kind::i8 } + template + __device__ static inline void tcgen05_mma_ws_collector_b0_discard( + cuda::ptx::cta_group_1_t, + cuda::ptx::kind_t kind, + uint32_t d_tmem, + uint64_t a_desc, + uint64_t b_desc, + uint32_t idesc, + bool enable_input_d, + uint64_t zero_column_mask_desc); + +tcgen05.mma.ws.cta_group::1.kind::f8f6f4.collector::b0::discard +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +.. code:: cuda + + // tcgen05.mma.ws.cta_group.kind.collector::b0::discard [d_tmem], a_desc, b_desc, idesc, enable_input_d, zero_column_mask_desc; // PTX ISA 86, SM_100a, SM_101a + // .cta_group = { .cta_group::1 } + // .kind = { .kind::f16, .kind::tf32, .kind::f8f6f4, .kind::i8 } + template + __device__ static inline void tcgen05_mma_ws_collector_b0_discard( + cuda::ptx::cta_group_1_t, + cuda::ptx::kind_t kind, + uint32_t d_tmem, + uint64_t a_desc, + uint64_t b_desc, + uint32_t idesc, + bool enable_input_d, + uint64_t zero_column_mask_desc); + +tcgen05.mma.ws.cta_group::1.kind::i8.collector::b0::discard +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +.. code:: cuda + + // tcgen05.mma.ws.cta_group.kind.collector::b0::discard [d_tmem], a_desc, b_desc, idesc, enable_input_d, zero_column_mask_desc; // PTX ISA 86, SM_100a, SM_101a + // .cta_group = { .cta_group::1 } + // .kind = { .kind::f16, .kind::tf32, .kind::f8f6f4, .kind::i8 } + template + __device__ static inline void tcgen05_mma_ws_collector_b0_discard( + cuda::ptx::cta_group_1_t, + cuda::ptx::kind_t kind, + uint32_t d_tmem, + uint64_t a_desc, + uint64_t b_desc, + uint32_t idesc, + bool enable_input_d, + uint64_t zero_column_mask_desc); + +tcgen05.mma.ws.cta_group::1.kind::f16.collector::b0::discard +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +.. code:: cuda + + // tcgen05.mma.ws.cta_group.kind.collector::b0::discard [d_tmem], a_desc, b_desc, idesc, enable_input_d; // PTX ISA 86, SM_100a, SM_101a + // .cta_group = { .cta_group::1 } + // .kind = { .kind::f16, .kind::tf32, .kind::f8f6f4, .kind::i8 } + template + __device__ static inline void tcgen05_mma_ws_collector_b0_discard( + cuda::ptx::cta_group_1_t, + cuda::ptx::kind_t kind, + uint32_t d_tmem, + uint64_t a_desc, + uint64_t b_desc, + uint32_t idesc, + bool enable_input_d); + +tcgen05.mma.ws.cta_group::1.kind::tf32.collector::b0::discard +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +.. 
code:: cuda + + // tcgen05.mma.ws.cta_group.kind.collector::b0::discard [d_tmem], a_desc, b_desc, idesc, enable_input_d; // PTX ISA 86, SM_100a, SM_101a + // .cta_group = { .cta_group::1 } + // .kind = { .kind::f16, .kind::tf32, .kind::f8f6f4, .kind::i8 } + template + __device__ static inline void tcgen05_mma_ws_collector_b0_discard( + cuda::ptx::cta_group_1_t, + cuda::ptx::kind_t kind, + uint32_t d_tmem, + uint64_t a_desc, + uint64_t b_desc, + uint32_t idesc, + bool enable_input_d); + +tcgen05.mma.ws.cta_group::1.kind::f8f6f4.collector::b0::discard +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +.. code:: cuda + + // tcgen05.mma.ws.cta_group.kind.collector::b0::discard [d_tmem], a_desc, b_desc, idesc, enable_input_d; // PTX ISA 86, SM_100a, SM_101a + // .cta_group = { .cta_group::1 } + // .kind = { .kind::f16, .kind::tf32, .kind::f8f6f4, .kind::i8 } + template + __device__ static inline void tcgen05_mma_ws_collector_b0_discard( + cuda::ptx::cta_group_1_t, + cuda::ptx::kind_t kind, + uint32_t d_tmem, + uint64_t a_desc, + uint64_t b_desc, + uint32_t idesc, + bool enable_input_d); + +tcgen05.mma.ws.cta_group::1.kind::i8.collector::b0::discard +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +.. code:: cuda + + // tcgen05.mma.ws.cta_group.kind.collector::b0::discard [d_tmem], a_desc, b_desc, idesc, enable_input_d; // PTX ISA 86, SM_100a, SM_101a + // .cta_group = { .cta_group::1 } + // .kind = { .kind::f16, .kind::tf32, .kind::f8f6f4, .kind::i8 } + template + __device__ static inline void tcgen05_mma_ws_collector_b0_discard( + cuda::ptx::cta_group_1_t, + cuda::ptx::kind_t kind, + uint32_t d_tmem, + uint64_t a_desc, + uint64_t b_desc, + uint32_t idesc, + bool enable_input_d); + +tcgen05.mma.ws.cta_group::1.kind::f16.collector::b0::discard +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +.. code:: cuda + + // tcgen05.mma.ws.cta_group.kind.collector::b0::discard [d_tmem], [a_tmem], b_desc, idesc, enable_input_d, zero_column_mask_desc; // PTX ISA 86, SM_100a, SM_101a + // .cta_group = { .cta_group::1 } + // .kind = { .kind::f16, .kind::tf32, .kind::f8f6f4, .kind::i8 } + template + __device__ static inline void tcgen05_mma_ws_tmem_a_collector_b0_discard( + cuda::ptx::cta_group_1_t, + cuda::ptx::kind_t kind, + uint32_t d_tmem, + uint32_t a_tmem, + uint64_t b_desc, + uint32_t idesc, + bool enable_input_d, + uint64_t zero_column_mask_desc); + +tcgen05.mma.ws.cta_group::1.kind::tf32.collector::b0::discard +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +.. code:: cuda + + // tcgen05.mma.ws.cta_group.kind.collector::b0::discard [d_tmem], [a_tmem], b_desc, idesc, enable_input_d, zero_column_mask_desc; // PTX ISA 86, SM_100a, SM_101a + // .cta_group = { .cta_group::1 } + // .kind = { .kind::f16, .kind::tf32, .kind::f8f6f4, .kind::i8 } + template + __device__ static inline void tcgen05_mma_ws_tmem_a_collector_b0_discard( + cuda::ptx::cta_group_1_t, + cuda::ptx::kind_t kind, + uint32_t d_tmem, + uint32_t a_tmem, + uint64_t b_desc, + uint32_t idesc, + bool enable_input_d, + uint64_t zero_column_mask_desc); + +tcgen05.mma.ws.cta_group::1.kind::f8f6f4.collector::b0::discard +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +.. 
code:: cuda + + // tcgen05.mma.ws.cta_group.kind.collector::b0::discard [d_tmem], [a_tmem], b_desc, idesc, enable_input_d, zero_column_mask_desc; // PTX ISA 86, SM_100a, SM_101a + // .cta_group = { .cta_group::1 } + // .kind = { .kind::f16, .kind::tf32, .kind::f8f6f4, .kind::i8 } + template + __device__ static inline void tcgen05_mma_ws_tmem_a_collector_b0_discard( + cuda::ptx::cta_group_1_t, + cuda::ptx::kind_t kind, + uint32_t d_tmem, + uint32_t a_tmem, + uint64_t b_desc, + uint32_t idesc, + bool enable_input_d, + uint64_t zero_column_mask_desc); + +tcgen05.mma.ws.cta_group::1.kind::i8.collector::b0::discard +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +.. code:: cuda + + // tcgen05.mma.ws.cta_group.kind.collector::b0::discard [d_tmem], [a_tmem], b_desc, idesc, enable_input_d, zero_column_mask_desc; // PTX ISA 86, SM_100a, SM_101a + // .cta_group = { .cta_group::1 } + // .kind = { .kind::f16, .kind::tf32, .kind::f8f6f4, .kind::i8 } + template + __device__ static inline void tcgen05_mma_ws_tmem_a_collector_b0_discard( + cuda::ptx::cta_group_1_t, + cuda::ptx::kind_t kind, + uint32_t d_tmem, + uint32_t a_tmem, + uint64_t b_desc, + uint32_t idesc, + bool enable_input_d, + uint64_t zero_column_mask_desc); + +tcgen05.mma.ws.cta_group::1.kind::f16.collector::b0::discard +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +.. code:: cuda + + // tcgen05.mma.ws.cta_group.kind.collector::b0::discard [d_tmem], [a_tmem], b_desc, idesc, enable_input_d; // PTX ISA 86, SM_100a, SM_101a + // .cta_group = { .cta_group::1 } + // .kind = { .kind::f16, .kind::tf32, .kind::f8f6f4, .kind::i8 } + template + __device__ static inline void tcgen05_mma_ws_tmem_a_collector_b0_discard( + cuda::ptx::cta_group_1_t, + cuda::ptx::kind_t kind, + uint32_t d_tmem, + uint32_t a_tmem, + uint64_t b_desc, + uint32_t idesc, + bool enable_input_d); + +tcgen05.mma.ws.cta_group::1.kind::tf32.collector::b0::discard +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +.. code:: cuda + + // tcgen05.mma.ws.cta_group.kind.collector::b0::discard [d_tmem], [a_tmem], b_desc, idesc, enable_input_d; // PTX ISA 86, SM_100a, SM_101a + // .cta_group = { .cta_group::1 } + // .kind = { .kind::f16, .kind::tf32, .kind::f8f6f4, .kind::i8 } + template + __device__ static inline void tcgen05_mma_ws_tmem_a_collector_b0_discard( + cuda::ptx::cta_group_1_t, + cuda::ptx::kind_t kind, + uint32_t d_tmem, + uint32_t a_tmem, + uint64_t b_desc, + uint32_t idesc, + bool enable_input_d); + +tcgen05.mma.ws.cta_group::1.kind::f8f6f4.collector::b0::discard +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +.. code:: cuda + + // tcgen05.mma.ws.cta_group.kind.collector::b0::discard [d_tmem], [a_tmem], b_desc, idesc, enable_input_d; // PTX ISA 86, SM_100a, SM_101a + // .cta_group = { .cta_group::1 } + // .kind = { .kind::f16, .kind::tf32, .kind::f8f6f4, .kind::i8 } + template + __device__ static inline void tcgen05_mma_ws_tmem_a_collector_b0_discard( + cuda::ptx::cta_group_1_t, + cuda::ptx::kind_t kind, + uint32_t d_tmem, + uint32_t a_tmem, + uint64_t b_desc, + uint32_t idesc, + bool enable_input_d); + +tcgen05.mma.ws.cta_group::1.kind::i8.collector::b0::discard +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +.. 
code:: cuda + + // tcgen05.mma.ws.cta_group.kind.collector::b0::discard [d_tmem], [a_tmem], b_desc, idesc, enable_input_d; // PTX ISA 86, SM_100a, SM_101a + // .cta_group = { .cta_group::1 } + // .kind = { .kind::f16, .kind::tf32, .kind::f8f6f4, .kind::i8 } + template + __device__ static inline void tcgen05_mma_ws_tmem_a_collector_b0_discard( + cuda::ptx::cta_group_1_t, + cuda::ptx::kind_t kind, + uint32_t d_tmem, + uint32_t a_tmem, + uint64_t b_desc, + uint32_t idesc, + bool enable_input_d); + +tcgen05.mma.ws.cta_group::1.kind::f16.collector::b1::fill +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +.. code:: cuda + + // tcgen05.mma.ws.cta_group.kind.collector::b1::fill [d_tmem], a_desc, b_desc, idesc, enable_input_d, zero_column_mask_desc; // PTX ISA 86, SM_100a, SM_101a + // .cta_group = { .cta_group::1 } + // .kind = { .kind::f16, .kind::tf32, .kind::f8f6f4, .kind::i8 } + template + __device__ static inline void tcgen05_mma_ws_collector_b1_fill( + cuda::ptx::cta_group_1_t, + cuda::ptx::kind_t kind, + uint32_t d_tmem, + uint64_t a_desc, + uint64_t b_desc, + uint32_t idesc, + bool enable_input_d, + uint64_t zero_column_mask_desc); + +tcgen05.mma.ws.cta_group::1.kind::tf32.collector::b1::fill +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +.. code:: cuda + + // tcgen05.mma.ws.cta_group.kind.collector::b1::fill [d_tmem], a_desc, b_desc, idesc, enable_input_d, zero_column_mask_desc; // PTX ISA 86, SM_100a, SM_101a + // .cta_group = { .cta_group::1 } + // .kind = { .kind::f16, .kind::tf32, .kind::f8f6f4, .kind::i8 } + template + __device__ static inline void tcgen05_mma_ws_collector_b1_fill( + cuda::ptx::cta_group_1_t, + cuda::ptx::kind_t kind, + uint32_t d_tmem, + uint64_t a_desc, + uint64_t b_desc, + uint32_t idesc, + bool enable_input_d, + uint64_t zero_column_mask_desc); + +tcgen05.mma.ws.cta_group::1.kind::f8f6f4.collector::b1::fill +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +.. code:: cuda + + // tcgen05.mma.ws.cta_group.kind.collector::b1::fill [d_tmem], a_desc, b_desc, idesc, enable_input_d, zero_column_mask_desc; // PTX ISA 86, SM_100a, SM_101a + // .cta_group = { .cta_group::1 } + // .kind = { .kind::f16, .kind::tf32, .kind::f8f6f4, .kind::i8 } + template + __device__ static inline void tcgen05_mma_ws_collector_b1_fill( + cuda::ptx::cta_group_1_t, + cuda::ptx::kind_t kind, + uint32_t d_tmem, + uint64_t a_desc, + uint64_t b_desc, + uint32_t idesc, + bool enable_input_d, + uint64_t zero_column_mask_desc); + +tcgen05.mma.ws.cta_group::1.kind::i8.collector::b1::fill +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +.. code:: cuda + + // tcgen05.mma.ws.cta_group.kind.collector::b1::fill [d_tmem], a_desc, b_desc, idesc, enable_input_d, zero_column_mask_desc; // PTX ISA 86, SM_100a, SM_101a + // .cta_group = { .cta_group::1 } + // .kind = { .kind::f16, .kind::tf32, .kind::f8f6f4, .kind::i8 } + template + __device__ static inline void tcgen05_mma_ws_collector_b1_fill( + cuda::ptx::cta_group_1_t, + cuda::ptx::kind_t kind, + uint32_t d_tmem, + uint64_t a_desc, + uint64_t b_desc, + uint32_t idesc, + bool enable_input_d, + uint64_t zero_column_mask_desc); + +tcgen05.mma.ws.cta_group::1.kind::f16.collector::b1::fill +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +.. 
code:: cuda + + // tcgen05.mma.ws.cta_group.kind.collector::b1::fill [d_tmem], a_desc, b_desc, idesc, enable_input_d; // PTX ISA 86, SM_100a, SM_101a + // .cta_group = { .cta_group::1 } + // .kind = { .kind::f16, .kind::tf32, .kind::f8f6f4, .kind::i8 } + template + __device__ static inline void tcgen05_mma_ws_collector_b1_fill( + cuda::ptx::cta_group_1_t, + cuda::ptx::kind_t kind, + uint32_t d_tmem, + uint64_t a_desc, + uint64_t b_desc, + uint32_t idesc, + bool enable_input_d); + +tcgen05.mma.ws.cta_group::1.kind::tf32.collector::b1::fill +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +.. code:: cuda + + // tcgen05.mma.ws.cta_group.kind.collector::b1::fill [d_tmem], a_desc, b_desc, idesc, enable_input_d; // PTX ISA 86, SM_100a, SM_101a + // .cta_group = { .cta_group::1 } + // .kind = { .kind::f16, .kind::tf32, .kind::f8f6f4, .kind::i8 } + template + __device__ static inline void tcgen05_mma_ws_collector_b1_fill( + cuda::ptx::cta_group_1_t, + cuda::ptx::kind_t kind, + uint32_t d_tmem, + uint64_t a_desc, + uint64_t b_desc, + uint32_t idesc, + bool enable_input_d); + +tcgen05.mma.ws.cta_group::1.kind::f8f6f4.collector::b1::fill +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +.. code:: cuda + + // tcgen05.mma.ws.cta_group.kind.collector::b1::fill [d_tmem], a_desc, b_desc, idesc, enable_input_d; // PTX ISA 86, SM_100a, SM_101a + // .cta_group = { .cta_group::1 } + // .kind = { .kind::f16, .kind::tf32, .kind::f8f6f4, .kind::i8 } + template + __device__ static inline void tcgen05_mma_ws_collector_b1_fill( + cuda::ptx::cta_group_1_t, + cuda::ptx::kind_t kind, + uint32_t d_tmem, + uint64_t a_desc, + uint64_t b_desc, + uint32_t idesc, + bool enable_input_d); + +tcgen05.mma.ws.cta_group::1.kind::i8.collector::b1::fill +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +.. code:: cuda + + // tcgen05.mma.ws.cta_group.kind.collector::b1::fill [d_tmem], a_desc, b_desc, idesc, enable_input_d; // PTX ISA 86, SM_100a, SM_101a + // .cta_group = { .cta_group::1 } + // .kind = { .kind::f16, .kind::tf32, .kind::f8f6f4, .kind::i8 } + template + __device__ static inline void tcgen05_mma_ws_collector_b1_fill( + cuda::ptx::cta_group_1_t, + cuda::ptx::kind_t kind, + uint32_t d_tmem, + uint64_t a_desc, + uint64_t b_desc, + uint32_t idesc, + bool enable_input_d); + +tcgen05.mma.ws.cta_group::1.kind::f16.collector::b1::fill +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +.. code:: cuda + + // tcgen05.mma.ws.cta_group.kind.collector::b1::fill [d_tmem], [a_tmem], b_desc, idesc, enable_input_d, zero_column_mask_desc; // PTX ISA 86, SM_100a, SM_101a + // .cta_group = { .cta_group::1 } + // .kind = { .kind::f16, .kind::tf32, .kind::f8f6f4, .kind::i8 } + template + __device__ static inline void tcgen05_mma_ws_tmem_a_collector_b1_fill( + cuda::ptx::cta_group_1_t, + cuda::ptx::kind_t kind, + uint32_t d_tmem, + uint32_t a_tmem, + uint64_t b_desc, + uint32_t idesc, + bool enable_input_d, + uint64_t zero_column_mask_desc); + +tcgen05.mma.ws.cta_group::1.kind::tf32.collector::b1::fill +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +.. 
code:: cuda + + // tcgen05.mma.ws.cta_group.kind.collector::b1::fill [d_tmem], [a_tmem], b_desc, idesc, enable_input_d, zero_column_mask_desc; // PTX ISA 86, SM_100a, SM_101a + // .cta_group = { .cta_group::1 } + // .kind = { .kind::f16, .kind::tf32, .kind::f8f6f4, .kind::i8 } + template + __device__ static inline void tcgen05_mma_ws_tmem_a_collector_b1_fill( + cuda::ptx::cta_group_1_t, + cuda::ptx::kind_t kind, + uint32_t d_tmem, + uint32_t a_tmem, + uint64_t b_desc, + uint32_t idesc, + bool enable_input_d, + uint64_t zero_column_mask_desc); + +tcgen05.mma.ws.cta_group::1.kind::f8f6f4.collector::b1::fill +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +.. code:: cuda + + // tcgen05.mma.ws.cta_group.kind.collector::b1::fill [d_tmem], [a_tmem], b_desc, idesc, enable_input_d, zero_column_mask_desc; // PTX ISA 86, SM_100a, SM_101a + // .cta_group = { .cta_group::1 } + // .kind = { .kind::f16, .kind::tf32, .kind::f8f6f4, .kind::i8 } + template + __device__ static inline void tcgen05_mma_ws_tmem_a_collector_b1_fill( + cuda::ptx::cta_group_1_t, + cuda::ptx::kind_t kind, + uint32_t d_tmem, + uint32_t a_tmem, + uint64_t b_desc, + uint32_t idesc, + bool enable_input_d, + uint64_t zero_column_mask_desc); + +tcgen05.mma.ws.cta_group::1.kind::i8.collector::b1::fill +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +.. code:: cuda + + // tcgen05.mma.ws.cta_group.kind.collector::b1::fill [d_tmem], [a_tmem], b_desc, idesc, enable_input_d, zero_column_mask_desc; // PTX ISA 86, SM_100a, SM_101a + // .cta_group = { .cta_group::1 } + // .kind = { .kind::f16, .kind::tf32, .kind::f8f6f4, .kind::i8 } + template + __device__ static inline void tcgen05_mma_ws_tmem_a_collector_b1_fill( + cuda::ptx::cta_group_1_t, + cuda::ptx::kind_t kind, + uint32_t d_tmem, + uint32_t a_tmem, + uint64_t b_desc, + uint32_t idesc, + bool enable_input_d, + uint64_t zero_column_mask_desc); + +tcgen05.mma.ws.cta_group::1.kind::f16.collector::b1::fill +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +.. code:: cuda + + // tcgen05.mma.ws.cta_group.kind.collector::b1::fill [d_tmem], [a_tmem], b_desc, idesc, enable_input_d; // PTX ISA 86, SM_100a, SM_101a + // .cta_group = { .cta_group::1 } + // .kind = { .kind::f16, .kind::tf32, .kind::f8f6f4, .kind::i8 } + template + __device__ static inline void tcgen05_mma_ws_tmem_a_collector_b1_fill( + cuda::ptx::cta_group_1_t, + cuda::ptx::kind_t kind, + uint32_t d_tmem, + uint32_t a_tmem, + uint64_t b_desc, + uint32_t idesc, + bool enable_input_d); + +tcgen05.mma.ws.cta_group::1.kind::tf32.collector::b1::fill +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +.. code:: cuda + + // tcgen05.mma.ws.cta_group.kind.collector::b1::fill [d_tmem], [a_tmem], b_desc, idesc, enable_input_d; // PTX ISA 86, SM_100a, SM_101a + // .cta_group = { .cta_group::1 } + // .kind = { .kind::f16, .kind::tf32, .kind::f8f6f4, .kind::i8 } + template + __device__ static inline void tcgen05_mma_ws_tmem_a_collector_b1_fill( + cuda::ptx::cta_group_1_t, + cuda::ptx::kind_t kind, + uint32_t d_tmem, + uint32_t a_tmem, + uint64_t b_desc, + uint32_t idesc, + bool enable_input_d); + +tcgen05.mma.ws.cta_group::1.kind::f8f6f4.collector::b1::fill +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +.. 
code:: cuda + + // tcgen05.mma.ws.cta_group.kind.collector::b1::fill [d_tmem], [a_tmem], b_desc, idesc, enable_input_d; // PTX ISA 86, SM_100a, SM_101a + // .cta_group = { .cta_group::1 } + // .kind = { .kind::f16, .kind::tf32, .kind::f8f6f4, .kind::i8 } + template + __device__ static inline void tcgen05_mma_ws_tmem_a_collector_b1_fill( + cuda::ptx::cta_group_1_t, + cuda::ptx::kind_t kind, + uint32_t d_tmem, + uint32_t a_tmem, + uint64_t b_desc, + uint32_t idesc, + bool enable_input_d); + +tcgen05.mma.ws.cta_group::1.kind::i8.collector::b1::fill +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +.. code:: cuda + + // tcgen05.mma.ws.cta_group.kind.collector::b1::fill [d_tmem], [a_tmem], b_desc, idesc, enable_input_d; // PTX ISA 86, SM_100a, SM_101a + // .cta_group = { .cta_group::1 } + // .kind = { .kind::f16, .kind::tf32, .kind::f8f6f4, .kind::i8 } + template + __device__ static inline void tcgen05_mma_ws_tmem_a_collector_b1_fill( + cuda::ptx::cta_group_1_t, + cuda::ptx::kind_t kind, + uint32_t d_tmem, + uint32_t a_tmem, + uint64_t b_desc, + uint32_t idesc, + bool enable_input_d); + +tcgen05.mma.ws.cta_group::1.kind::f16.collector::b1::use +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +.. code:: cuda + + // tcgen05.mma.ws.cta_group.kind.collector::b1::use [d_tmem], a_desc, b_desc, idesc, enable_input_d, zero_column_mask_desc; // PTX ISA 86, SM_100a, SM_101a + // .cta_group = { .cta_group::1 } + // .kind = { .kind::f16, .kind::tf32, .kind::f8f6f4, .kind::i8 } + template + __device__ static inline void tcgen05_mma_ws_collector_b1_use( + cuda::ptx::cta_group_1_t, + cuda::ptx::kind_t kind, + uint32_t d_tmem, + uint64_t a_desc, + uint64_t b_desc, + uint32_t idesc, + bool enable_input_d, + uint64_t zero_column_mask_desc); + +tcgen05.mma.ws.cta_group::1.kind::tf32.collector::b1::use +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +.. code:: cuda + + // tcgen05.mma.ws.cta_group.kind.collector::b1::use [d_tmem], a_desc, b_desc, idesc, enable_input_d, zero_column_mask_desc; // PTX ISA 86, SM_100a, SM_101a + // .cta_group = { .cta_group::1 } + // .kind = { .kind::f16, .kind::tf32, .kind::f8f6f4, .kind::i8 } + template + __device__ static inline void tcgen05_mma_ws_collector_b1_use( + cuda::ptx::cta_group_1_t, + cuda::ptx::kind_t kind, + uint32_t d_tmem, + uint64_t a_desc, + uint64_t b_desc, + uint32_t idesc, + bool enable_input_d, + uint64_t zero_column_mask_desc); + +tcgen05.mma.ws.cta_group::1.kind::f8f6f4.collector::b1::use +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +.. code:: cuda + + // tcgen05.mma.ws.cta_group.kind.collector::b1::use [d_tmem], a_desc, b_desc, idesc, enable_input_d, zero_column_mask_desc; // PTX ISA 86, SM_100a, SM_101a + // .cta_group = { .cta_group::1 } + // .kind = { .kind::f16, .kind::tf32, .kind::f8f6f4, .kind::i8 } + template + __device__ static inline void tcgen05_mma_ws_collector_b1_use( + cuda::ptx::cta_group_1_t, + cuda::ptx::kind_t kind, + uint32_t d_tmem, + uint64_t a_desc, + uint64_t b_desc, + uint32_t idesc, + bool enable_input_d, + uint64_t zero_column_mask_desc); + +tcgen05.mma.ws.cta_group::1.kind::i8.collector::b1::use +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +.. 
code:: cuda + + // tcgen05.mma.ws.cta_group.kind.collector::b1::use [d_tmem], a_desc, b_desc, idesc, enable_input_d, zero_column_mask_desc; // PTX ISA 86, SM_100a, SM_101a + // .cta_group = { .cta_group::1 } + // .kind = { .kind::f16, .kind::tf32, .kind::f8f6f4, .kind::i8 } + template + __device__ static inline void tcgen05_mma_ws_collector_b1_use( + cuda::ptx::cta_group_1_t, + cuda::ptx::kind_t kind, + uint32_t d_tmem, + uint64_t a_desc, + uint64_t b_desc, + uint32_t idesc, + bool enable_input_d, + uint64_t zero_column_mask_desc); + +tcgen05.mma.ws.cta_group::1.kind::f16.collector::b1::use +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +.. code:: cuda + + // tcgen05.mma.ws.cta_group.kind.collector::b1::use [d_tmem], a_desc, b_desc, idesc, enable_input_d; // PTX ISA 86, SM_100a, SM_101a + // .cta_group = { .cta_group::1 } + // .kind = { .kind::f16, .kind::tf32, .kind::f8f6f4, .kind::i8 } + template + __device__ static inline void tcgen05_mma_ws_collector_b1_use( + cuda::ptx::cta_group_1_t, + cuda::ptx::kind_t kind, + uint32_t d_tmem, + uint64_t a_desc, + uint64_t b_desc, + uint32_t idesc, + bool enable_input_d); + +tcgen05.mma.ws.cta_group::1.kind::tf32.collector::b1::use +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +.. code:: cuda + + // tcgen05.mma.ws.cta_group.kind.collector::b1::use [d_tmem], a_desc, b_desc, idesc, enable_input_d; // PTX ISA 86, SM_100a, SM_101a + // .cta_group = { .cta_group::1 } + // .kind = { .kind::f16, .kind::tf32, .kind::f8f6f4, .kind::i8 } + template + __device__ static inline void tcgen05_mma_ws_collector_b1_use( + cuda::ptx::cta_group_1_t, + cuda::ptx::kind_t kind, + uint32_t d_tmem, + uint64_t a_desc, + uint64_t b_desc, + uint32_t idesc, + bool enable_input_d); + +tcgen05.mma.ws.cta_group::1.kind::f8f6f4.collector::b1::use +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +.. code:: cuda + + // tcgen05.mma.ws.cta_group.kind.collector::b1::use [d_tmem], a_desc, b_desc, idesc, enable_input_d; // PTX ISA 86, SM_100a, SM_101a + // .cta_group = { .cta_group::1 } + // .kind = { .kind::f16, .kind::tf32, .kind::f8f6f4, .kind::i8 } + template + __device__ static inline void tcgen05_mma_ws_collector_b1_use( + cuda::ptx::cta_group_1_t, + cuda::ptx::kind_t kind, + uint32_t d_tmem, + uint64_t a_desc, + uint64_t b_desc, + uint32_t idesc, + bool enable_input_d); + +tcgen05.mma.ws.cta_group::1.kind::i8.collector::b1::use +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +.. code:: cuda + + // tcgen05.mma.ws.cta_group.kind.collector::b1::use [d_tmem], a_desc, b_desc, idesc, enable_input_d; // PTX ISA 86, SM_100a, SM_101a + // .cta_group = { .cta_group::1 } + // .kind = { .kind::f16, .kind::tf32, .kind::f8f6f4, .kind::i8 } + template + __device__ static inline void tcgen05_mma_ws_collector_b1_use( + cuda::ptx::cta_group_1_t, + cuda::ptx::kind_t kind, + uint32_t d_tmem, + uint64_t a_desc, + uint64_t b_desc, + uint32_t idesc, + bool enable_input_d); + +tcgen05.mma.ws.cta_group::1.kind::f16.collector::b1::use +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +.. 
code:: cuda + + // tcgen05.mma.ws.cta_group.kind.collector::b1::use [d_tmem], [a_tmem], b_desc, idesc, enable_input_d, zero_column_mask_desc; // PTX ISA 86, SM_100a, SM_101a + // .cta_group = { .cta_group::1 } + // .kind = { .kind::f16, .kind::tf32, .kind::f8f6f4, .kind::i8 } + template + __device__ static inline void tcgen05_mma_ws_tmem_a_collector_b1_use( + cuda::ptx::cta_group_1_t, + cuda::ptx::kind_t kind, + uint32_t d_tmem, + uint32_t a_tmem, + uint64_t b_desc, + uint32_t idesc, + bool enable_input_d, + uint64_t zero_column_mask_desc); + +tcgen05.mma.ws.cta_group::1.kind::tf32.collector::b1::use +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +.. code:: cuda + + // tcgen05.mma.ws.cta_group.kind.collector::b1::use [d_tmem], [a_tmem], b_desc, idesc, enable_input_d, zero_column_mask_desc; // PTX ISA 86, SM_100a, SM_101a + // .cta_group = { .cta_group::1 } + // .kind = { .kind::f16, .kind::tf32, .kind::f8f6f4, .kind::i8 } + template + __device__ static inline void tcgen05_mma_ws_tmem_a_collector_b1_use( + cuda::ptx::cta_group_1_t, + cuda::ptx::kind_t kind, + uint32_t d_tmem, + uint32_t a_tmem, + uint64_t b_desc, + uint32_t idesc, + bool enable_input_d, + uint64_t zero_column_mask_desc); + +tcgen05.mma.ws.cta_group::1.kind::f8f6f4.collector::b1::use +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +.. code:: cuda + + // tcgen05.mma.ws.cta_group.kind.collector::b1::use [d_tmem], [a_tmem], b_desc, idesc, enable_input_d, zero_column_mask_desc; // PTX ISA 86, SM_100a, SM_101a + // .cta_group = { .cta_group::1 } + // .kind = { .kind::f16, .kind::tf32, .kind::f8f6f4, .kind::i8 } + template + __device__ static inline void tcgen05_mma_ws_tmem_a_collector_b1_use( + cuda::ptx::cta_group_1_t, + cuda::ptx::kind_t kind, + uint32_t d_tmem, + uint32_t a_tmem, + uint64_t b_desc, + uint32_t idesc, + bool enable_input_d, + uint64_t zero_column_mask_desc); + +tcgen05.mma.ws.cta_group::1.kind::i8.collector::b1::use +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +.. code:: cuda + + // tcgen05.mma.ws.cta_group.kind.collector::b1::use [d_tmem], [a_tmem], b_desc, idesc, enable_input_d, zero_column_mask_desc; // PTX ISA 86, SM_100a, SM_101a + // .cta_group = { .cta_group::1 } + // .kind = { .kind::f16, .kind::tf32, .kind::f8f6f4, .kind::i8 } + template + __device__ static inline void tcgen05_mma_ws_tmem_a_collector_b1_use( + cuda::ptx::cta_group_1_t, + cuda::ptx::kind_t kind, + uint32_t d_tmem, + uint32_t a_tmem, + uint64_t b_desc, + uint32_t idesc, + bool enable_input_d, + uint64_t zero_column_mask_desc); + +tcgen05.mma.ws.cta_group::1.kind::f16.collector::b1::use +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +.. code:: cuda + + // tcgen05.mma.ws.cta_group.kind.collector::b1::use [d_tmem], [a_tmem], b_desc, idesc, enable_input_d; // PTX ISA 86, SM_100a, SM_101a + // .cta_group = { .cta_group::1 } + // .kind = { .kind::f16, .kind::tf32, .kind::f8f6f4, .kind::i8 } + template + __device__ static inline void tcgen05_mma_ws_tmem_a_collector_b1_use( + cuda::ptx::cta_group_1_t, + cuda::ptx::kind_t kind, + uint32_t d_tmem, + uint32_t a_tmem, + uint64_t b_desc, + uint32_t idesc, + bool enable_input_d); + +tcgen05.mma.ws.cta_group::1.kind::tf32.collector::b1::use +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +.. 
code:: cuda + + // tcgen05.mma.ws.cta_group.kind.collector::b1::use [d_tmem], [a_tmem], b_desc, idesc, enable_input_d; // PTX ISA 86, SM_100a, SM_101a + // .cta_group = { .cta_group::1 } + // .kind = { .kind::f16, .kind::tf32, .kind::f8f6f4, .kind::i8 } + template + __device__ static inline void tcgen05_mma_ws_tmem_a_collector_b1_use( + cuda::ptx::cta_group_1_t, + cuda::ptx::kind_t kind, + uint32_t d_tmem, + uint32_t a_tmem, + uint64_t b_desc, + uint32_t idesc, + bool enable_input_d); + +tcgen05.mma.ws.cta_group::1.kind::f8f6f4.collector::b1::use +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +.. code:: cuda + + // tcgen05.mma.ws.cta_group.kind.collector::b1::use [d_tmem], [a_tmem], b_desc, idesc, enable_input_d; // PTX ISA 86, SM_100a, SM_101a + // .cta_group = { .cta_group::1 } + // .kind = { .kind::f16, .kind::tf32, .kind::f8f6f4, .kind::i8 } + template + __device__ static inline void tcgen05_mma_ws_tmem_a_collector_b1_use( + cuda::ptx::cta_group_1_t, + cuda::ptx::kind_t kind, + uint32_t d_tmem, + uint32_t a_tmem, + uint64_t b_desc, + uint32_t idesc, + bool enable_input_d); + +tcgen05.mma.ws.cta_group::1.kind::i8.collector::b1::use +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +.. code:: cuda + + // tcgen05.mma.ws.cta_group.kind.collector::b1::use [d_tmem], [a_tmem], b_desc, idesc, enable_input_d; // PTX ISA 86, SM_100a, SM_101a + // .cta_group = { .cta_group::1 } + // .kind = { .kind::f16, .kind::tf32, .kind::f8f6f4, .kind::i8 } + template + __device__ static inline void tcgen05_mma_ws_tmem_a_collector_b1_use( + cuda::ptx::cta_group_1_t, + cuda::ptx::kind_t kind, + uint32_t d_tmem, + uint32_t a_tmem, + uint64_t b_desc, + uint32_t idesc, + bool enable_input_d); + +tcgen05.mma.ws.cta_group::1.kind::f16.collector::b1::lastuse +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +.. code:: cuda + + // tcgen05.mma.ws.cta_group.kind.collector::b1::lastuse [d_tmem], a_desc, b_desc, idesc, enable_input_d, zero_column_mask_desc; // PTX ISA 86, SM_100a, SM_101a + // .cta_group = { .cta_group::1 } + // .kind = { .kind::f16, .kind::tf32, .kind::f8f6f4, .kind::i8 } + template + __device__ static inline void tcgen05_mma_ws_collector_b1_lastuse( + cuda::ptx::cta_group_1_t, + cuda::ptx::kind_t kind, + uint32_t d_tmem, + uint64_t a_desc, + uint64_t b_desc, + uint32_t idesc, + bool enable_input_d, + uint64_t zero_column_mask_desc); + +tcgen05.mma.ws.cta_group::1.kind::tf32.collector::b1::lastuse +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +.. code:: cuda + + // tcgen05.mma.ws.cta_group.kind.collector::b1::lastuse [d_tmem], a_desc, b_desc, idesc, enable_input_d, zero_column_mask_desc; // PTX ISA 86, SM_100a, SM_101a + // .cta_group = { .cta_group::1 } + // .kind = { .kind::f16, .kind::tf32, .kind::f8f6f4, .kind::i8 } + template + __device__ static inline void tcgen05_mma_ws_collector_b1_lastuse( + cuda::ptx::cta_group_1_t, + cuda::ptx::kind_t kind, + uint32_t d_tmem, + uint64_t a_desc, + uint64_t b_desc, + uint32_t idesc, + bool enable_input_d, + uint64_t zero_column_mask_desc); + +tcgen05.mma.ws.cta_group::1.kind::f8f6f4.collector::b1::lastuse +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +.. 
code:: cuda + + // tcgen05.mma.ws.cta_group.kind.collector::b1::lastuse [d_tmem], a_desc, b_desc, idesc, enable_input_d, zero_column_mask_desc; // PTX ISA 86, SM_100a, SM_101a + // .cta_group = { .cta_group::1 } + // .kind = { .kind::f16, .kind::tf32, .kind::f8f6f4, .kind::i8 } + template + __device__ static inline void tcgen05_mma_ws_collector_b1_lastuse( + cuda::ptx::cta_group_1_t, + cuda::ptx::kind_t kind, + uint32_t d_tmem, + uint64_t a_desc, + uint64_t b_desc, + uint32_t idesc, + bool enable_input_d, + uint64_t zero_column_mask_desc); + +tcgen05.mma.ws.cta_group::1.kind::i8.collector::b1::lastuse +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +.. code:: cuda + + // tcgen05.mma.ws.cta_group.kind.collector::b1::lastuse [d_tmem], a_desc, b_desc, idesc, enable_input_d, zero_column_mask_desc; // PTX ISA 86, SM_100a, SM_101a + // .cta_group = { .cta_group::1 } + // .kind = { .kind::f16, .kind::tf32, .kind::f8f6f4, .kind::i8 } + template + __device__ static inline void tcgen05_mma_ws_collector_b1_lastuse( + cuda::ptx::cta_group_1_t, + cuda::ptx::kind_t kind, + uint32_t d_tmem, + uint64_t a_desc, + uint64_t b_desc, + uint32_t idesc, + bool enable_input_d, + uint64_t zero_column_mask_desc); + +tcgen05.mma.ws.cta_group::1.kind::f16.collector::b1::lastuse +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +.. code:: cuda + + // tcgen05.mma.ws.cta_group.kind.collector::b1::lastuse [d_tmem], a_desc, b_desc, idesc, enable_input_d; // PTX ISA 86, SM_100a, SM_101a + // .cta_group = { .cta_group::1 } + // .kind = { .kind::f16, .kind::tf32, .kind::f8f6f4, .kind::i8 } + template + __device__ static inline void tcgen05_mma_ws_collector_b1_lastuse( + cuda::ptx::cta_group_1_t, + cuda::ptx::kind_t kind, + uint32_t d_tmem, + uint64_t a_desc, + uint64_t b_desc, + uint32_t idesc, + bool enable_input_d); + +tcgen05.mma.ws.cta_group::1.kind::tf32.collector::b1::lastuse +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +.. code:: cuda + + // tcgen05.mma.ws.cta_group.kind.collector::b1::lastuse [d_tmem], a_desc, b_desc, idesc, enable_input_d; // PTX ISA 86, SM_100a, SM_101a + // .cta_group = { .cta_group::1 } + // .kind = { .kind::f16, .kind::tf32, .kind::f8f6f4, .kind::i8 } + template + __device__ static inline void tcgen05_mma_ws_collector_b1_lastuse( + cuda::ptx::cta_group_1_t, + cuda::ptx::kind_t kind, + uint32_t d_tmem, + uint64_t a_desc, + uint64_t b_desc, + uint32_t idesc, + bool enable_input_d); + +tcgen05.mma.ws.cta_group::1.kind::f8f6f4.collector::b1::lastuse +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +.. code:: cuda + + // tcgen05.mma.ws.cta_group.kind.collector::b1::lastuse [d_tmem], a_desc, b_desc, idesc, enable_input_d; // PTX ISA 86, SM_100a, SM_101a + // .cta_group = { .cta_group::1 } + // .kind = { .kind::f16, .kind::tf32, .kind::f8f6f4, .kind::i8 } + template + __device__ static inline void tcgen05_mma_ws_collector_b1_lastuse( + cuda::ptx::cta_group_1_t, + cuda::ptx::kind_t kind, + uint32_t d_tmem, + uint64_t a_desc, + uint64_t b_desc, + uint32_t idesc, + bool enable_input_d); + +tcgen05.mma.ws.cta_group::1.kind::i8.collector::b1::lastuse +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +.. 
code:: cuda + + // tcgen05.mma.ws.cta_group.kind.collector::b1::lastuse [d_tmem], a_desc, b_desc, idesc, enable_input_d; // PTX ISA 86, SM_100a, SM_101a + // .cta_group = { .cta_group::1 } + // .kind = { .kind::f16, .kind::tf32, .kind::f8f6f4, .kind::i8 } + template + __device__ static inline void tcgen05_mma_ws_collector_b1_lastuse( + cuda::ptx::cta_group_1_t, + cuda::ptx::kind_t kind, + uint32_t d_tmem, + uint64_t a_desc, + uint64_t b_desc, + uint32_t idesc, + bool enable_input_d); + +tcgen05.mma.ws.cta_group::1.kind::f16.collector::b1::lastuse +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +.. code:: cuda + + // tcgen05.mma.ws.cta_group.kind.collector::b1::lastuse [d_tmem], [a_tmem], b_desc, idesc, enable_input_d, zero_column_mask_desc; // PTX ISA 86, SM_100a, SM_101a + // .cta_group = { .cta_group::1 } + // .kind = { .kind::f16, .kind::tf32, .kind::f8f6f4, .kind::i8 } + template + __device__ static inline void tcgen05_mma_ws_tmem_a_collector_b1_lastuse( + cuda::ptx::cta_group_1_t, + cuda::ptx::kind_t kind, + uint32_t d_tmem, + uint32_t a_tmem, + uint64_t b_desc, + uint32_t idesc, + bool enable_input_d, + uint64_t zero_column_mask_desc); + +tcgen05.mma.ws.cta_group::1.kind::tf32.collector::b1::lastuse +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +.. code:: cuda + + // tcgen05.mma.ws.cta_group.kind.collector::b1::lastuse [d_tmem], [a_tmem], b_desc, idesc, enable_input_d, zero_column_mask_desc; // PTX ISA 86, SM_100a, SM_101a + // .cta_group = { .cta_group::1 } + // .kind = { .kind::f16, .kind::tf32, .kind::f8f6f4, .kind::i8 } + template + __device__ static inline void tcgen05_mma_ws_tmem_a_collector_b1_lastuse( + cuda::ptx::cta_group_1_t, + cuda::ptx::kind_t kind, + uint32_t d_tmem, + uint32_t a_tmem, + uint64_t b_desc, + uint32_t idesc, + bool enable_input_d, + uint64_t zero_column_mask_desc); + +tcgen05.mma.ws.cta_group::1.kind::f8f6f4.collector::b1::lastuse +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +.. code:: cuda + + // tcgen05.mma.ws.cta_group.kind.collector::b1::lastuse [d_tmem], [a_tmem], b_desc, idesc, enable_input_d, zero_column_mask_desc; // PTX ISA 86, SM_100a, SM_101a + // .cta_group = { .cta_group::1 } + // .kind = { .kind::f16, .kind::tf32, .kind::f8f6f4, .kind::i8 } + template + __device__ static inline void tcgen05_mma_ws_tmem_a_collector_b1_lastuse( + cuda::ptx::cta_group_1_t, + cuda::ptx::kind_t kind, + uint32_t d_tmem, + uint32_t a_tmem, + uint64_t b_desc, + uint32_t idesc, + bool enable_input_d, + uint64_t zero_column_mask_desc); + +tcgen05.mma.ws.cta_group::1.kind::i8.collector::b1::lastuse +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +.. code:: cuda + + // tcgen05.mma.ws.cta_group.kind.collector::b1::lastuse [d_tmem], [a_tmem], b_desc, idesc, enable_input_d, zero_column_mask_desc; // PTX ISA 86, SM_100a, SM_101a + // .cta_group = { .cta_group::1 } + // .kind = { .kind::f16, .kind::tf32, .kind::f8f6f4, .kind::i8 } + template + __device__ static inline void tcgen05_mma_ws_tmem_a_collector_b1_lastuse( + cuda::ptx::cta_group_1_t, + cuda::ptx::kind_t kind, + uint32_t d_tmem, + uint32_t a_tmem, + uint64_t b_desc, + uint32_t idesc, + bool enable_input_d, + uint64_t zero_column_mask_desc); + +tcgen05.mma.ws.cta_group::1.kind::f16.collector::b1::lastuse +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +.. 
code:: cuda + + // tcgen05.mma.ws.cta_group.kind.collector::b1::lastuse [d_tmem], [a_tmem], b_desc, idesc, enable_input_d; // PTX ISA 86, SM_100a, SM_101a + // .cta_group = { .cta_group::1 } + // .kind = { .kind::f16, .kind::tf32, .kind::f8f6f4, .kind::i8 } + template + __device__ static inline void tcgen05_mma_ws_tmem_a_collector_b1_lastuse( + cuda::ptx::cta_group_1_t, + cuda::ptx::kind_t kind, + uint32_t d_tmem, + uint32_t a_tmem, + uint64_t b_desc, + uint32_t idesc, + bool enable_input_d); + +tcgen05.mma.ws.cta_group::1.kind::tf32.collector::b1::lastuse +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +.. code:: cuda + + // tcgen05.mma.ws.cta_group.kind.collector::b1::lastuse [d_tmem], [a_tmem], b_desc, idesc, enable_input_d; // PTX ISA 86, SM_100a, SM_101a + // .cta_group = { .cta_group::1 } + // .kind = { .kind::f16, .kind::tf32, .kind::f8f6f4, .kind::i8 } + template + __device__ static inline void tcgen05_mma_ws_tmem_a_collector_b1_lastuse( + cuda::ptx::cta_group_1_t, + cuda::ptx::kind_t kind, + uint32_t d_tmem, + uint32_t a_tmem, + uint64_t b_desc, + uint32_t idesc, + bool enable_input_d); + +tcgen05.mma.ws.cta_group::1.kind::f8f6f4.collector::b1::lastuse +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +.. code:: cuda + + // tcgen05.mma.ws.cta_group.kind.collector::b1::lastuse [d_tmem], [a_tmem], b_desc, idesc, enable_input_d; // PTX ISA 86, SM_100a, SM_101a + // .cta_group = { .cta_group::1 } + // .kind = { .kind::f16, .kind::tf32, .kind::f8f6f4, .kind::i8 } + template + __device__ static inline void tcgen05_mma_ws_tmem_a_collector_b1_lastuse( + cuda::ptx::cta_group_1_t, + cuda::ptx::kind_t kind, + uint32_t d_tmem, + uint32_t a_tmem, + uint64_t b_desc, + uint32_t idesc, + bool enable_input_d); + +tcgen05.mma.ws.cta_group::1.kind::i8.collector::b1::lastuse +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +.. code:: cuda + + // tcgen05.mma.ws.cta_group.kind.collector::b1::lastuse [d_tmem], [a_tmem], b_desc, idesc, enable_input_d; // PTX ISA 86, SM_100a, SM_101a + // .cta_group = { .cta_group::1 } + // .kind = { .kind::f16, .kind::tf32, .kind::f8f6f4, .kind::i8 } + template + __device__ static inline void tcgen05_mma_ws_tmem_a_collector_b1_lastuse( + cuda::ptx::cta_group_1_t, + cuda::ptx::kind_t kind, + uint32_t d_tmem, + uint32_t a_tmem, + uint64_t b_desc, + uint32_t idesc, + bool enable_input_d); + +tcgen05.mma.ws.cta_group::1.kind::f16.collector::b1::discard +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +.. code:: cuda + + // tcgen05.mma.ws.cta_group.kind.collector::b1::discard [d_tmem], a_desc, b_desc, idesc, enable_input_d, zero_column_mask_desc; // PTX ISA 86, SM_100a, SM_101a + // .cta_group = { .cta_group::1 } + // .kind = { .kind::f16, .kind::tf32, .kind::f8f6f4, .kind::i8 } + template + __device__ static inline void tcgen05_mma_ws_collector_b1_discard( + cuda::ptx::cta_group_1_t, + cuda::ptx::kind_t kind, + uint32_t d_tmem, + uint64_t a_desc, + uint64_t b_desc, + uint32_t idesc, + bool enable_input_d, + uint64_t zero_column_mask_desc); + +tcgen05.mma.ws.cta_group::1.kind::tf32.collector::b1::discard +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +.. 
code:: cuda + + // tcgen05.mma.ws.cta_group.kind.collector::b1::discard [d_tmem], a_desc, b_desc, idesc, enable_input_d, zero_column_mask_desc; // PTX ISA 86, SM_100a, SM_101a + // .cta_group = { .cta_group::1 } + // .kind = { .kind::f16, .kind::tf32, .kind::f8f6f4, .kind::i8 } + template + __device__ static inline void tcgen05_mma_ws_collector_b1_discard( + cuda::ptx::cta_group_1_t, + cuda::ptx::kind_t kind, + uint32_t d_tmem, + uint64_t a_desc, + uint64_t b_desc, + uint32_t idesc, + bool enable_input_d, + uint64_t zero_column_mask_desc); + +tcgen05.mma.ws.cta_group::1.kind::f8f6f4.collector::b1::discard +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +.. code:: cuda + + // tcgen05.mma.ws.cta_group.kind.collector::b1::discard [d_tmem], a_desc, b_desc, idesc, enable_input_d, zero_column_mask_desc; // PTX ISA 86, SM_100a, SM_101a + // .cta_group = { .cta_group::1 } + // .kind = { .kind::f16, .kind::tf32, .kind::f8f6f4, .kind::i8 } + template + __device__ static inline void tcgen05_mma_ws_collector_b1_discard( + cuda::ptx::cta_group_1_t, + cuda::ptx::kind_t kind, + uint32_t d_tmem, + uint64_t a_desc, + uint64_t b_desc, + uint32_t idesc, + bool enable_input_d, + uint64_t zero_column_mask_desc); + +tcgen05.mma.ws.cta_group::1.kind::i8.collector::b1::discard +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +.. code:: cuda + + // tcgen05.mma.ws.cta_group.kind.collector::b1::discard [d_tmem], a_desc, b_desc, idesc, enable_input_d, zero_column_mask_desc; // PTX ISA 86, SM_100a, SM_101a + // .cta_group = { .cta_group::1 } + // .kind = { .kind::f16, .kind::tf32, .kind::f8f6f4, .kind::i8 } + template + __device__ static inline void tcgen05_mma_ws_collector_b1_discard( + cuda::ptx::cta_group_1_t, + cuda::ptx::kind_t kind, + uint32_t d_tmem, + uint64_t a_desc, + uint64_t b_desc, + uint32_t idesc, + bool enable_input_d, + uint64_t zero_column_mask_desc); + +tcgen05.mma.ws.cta_group::1.kind::f16.collector::b1::discard +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +.. code:: cuda + + // tcgen05.mma.ws.cta_group.kind.collector::b1::discard [d_tmem], a_desc, b_desc, idesc, enable_input_d; // PTX ISA 86, SM_100a, SM_101a + // .cta_group = { .cta_group::1 } + // .kind = { .kind::f16, .kind::tf32, .kind::f8f6f4, .kind::i8 } + template + __device__ static inline void tcgen05_mma_ws_collector_b1_discard( + cuda::ptx::cta_group_1_t, + cuda::ptx::kind_t kind, + uint32_t d_tmem, + uint64_t a_desc, + uint64_t b_desc, + uint32_t idesc, + bool enable_input_d); + +tcgen05.mma.ws.cta_group::1.kind::tf32.collector::b1::discard +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +.. code:: cuda + + // tcgen05.mma.ws.cta_group.kind.collector::b1::discard [d_tmem], a_desc, b_desc, idesc, enable_input_d; // PTX ISA 86, SM_100a, SM_101a + // .cta_group = { .cta_group::1 } + // .kind = { .kind::f16, .kind::tf32, .kind::f8f6f4, .kind::i8 } + template + __device__ static inline void tcgen05_mma_ws_collector_b1_discard( + cuda::ptx::cta_group_1_t, + cuda::ptx::kind_t kind, + uint32_t d_tmem, + uint64_t a_desc, + uint64_t b_desc, + uint32_t idesc, + bool enable_input_d); + +tcgen05.mma.ws.cta_group::1.kind::f8f6f4.collector::b1::discard +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +.. 
code:: cuda + + // tcgen05.mma.ws.cta_group.kind.collector::b1::discard [d_tmem], a_desc, b_desc, idesc, enable_input_d; // PTX ISA 86, SM_100a, SM_101a + // .cta_group = { .cta_group::1 } + // .kind = { .kind::f16, .kind::tf32, .kind::f8f6f4, .kind::i8 } + template + __device__ static inline void tcgen05_mma_ws_collector_b1_discard( + cuda::ptx::cta_group_1_t, + cuda::ptx::kind_t kind, + uint32_t d_tmem, + uint64_t a_desc, + uint64_t b_desc, + uint32_t idesc, + bool enable_input_d); + +tcgen05.mma.ws.cta_group::1.kind::i8.collector::b1::discard +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +.. code:: cuda + + // tcgen05.mma.ws.cta_group.kind.collector::b1::discard [d_tmem], a_desc, b_desc, idesc, enable_input_d; // PTX ISA 86, SM_100a, SM_101a + // .cta_group = { .cta_group::1 } + // .kind = { .kind::f16, .kind::tf32, .kind::f8f6f4, .kind::i8 } + template + __device__ static inline void tcgen05_mma_ws_collector_b1_discard( + cuda::ptx::cta_group_1_t, + cuda::ptx::kind_t kind, + uint32_t d_tmem, + uint64_t a_desc, + uint64_t b_desc, + uint32_t idesc, + bool enable_input_d); + +tcgen05.mma.ws.cta_group::1.kind::f16.collector::b1::discard +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +.. code:: cuda + + // tcgen05.mma.ws.cta_group.kind.collector::b1::discard [d_tmem], [a_tmem], b_desc, idesc, enable_input_d, zero_column_mask_desc; // PTX ISA 86, SM_100a, SM_101a + // .cta_group = { .cta_group::1 } + // .kind = { .kind::f16, .kind::tf32, .kind::f8f6f4, .kind::i8 } + template + __device__ static inline void tcgen05_mma_ws_tmem_a_collector_b1_discard( + cuda::ptx::cta_group_1_t, + cuda::ptx::kind_t kind, + uint32_t d_tmem, + uint32_t a_tmem, + uint64_t b_desc, + uint32_t idesc, + bool enable_input_d, + uint64_t zero_column_mask_desc); + +tcgen05.mma.ws.cta_group::1.kind::tf32.collector::b1::discard +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +.. code:: cuda + + // tcgen05.mma.ws.cta_group.kind.collector::b1::discard [d_tmem], [a_tmem], b_desc, idesc, enable_input_d, zero_column_mask_desc; // PTX ISA 86, SM_100a, SM_101a + // .cta_group = { .cta_group::1 } + // .kind = { .kind::f16, .kind::tf32, .kind::f8f6f4, .kind::i8 } + template + __device__ static inline void tcgen05_mma_ws_tmem_a_collector_b1_discard( + cuda::ptx::cta_group_1_t, + cuda::ptx::kind_t kind, + uint32_t d_tmem, + uint32_t a_tmem, + uint64_t b_desc, + uint32_t idesc, + bool enable_input_d, + uint64_t zero_column_mask_desc); + +tcgen05.mma.ws.cta_group::1.kind::f8f6f4.collector::b1::discard +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +.. code:: cuda + + // tcgen05.mma.ws.cta_group.kind.collector::b1::discard [d_tmem], [a_tmem], b_desc, idesc, enable_input_d, zero_column_mask_desc; // PTX ISA 86, SM_100a, SM_101a + // .cta_group = { .cta_group::1 } + // .kind = { .kind::f16, .kind::tf32, .kind::f8f6f4, .kind::i8 } + template + __device__ static inline void tcgen05_mma_ws_tmem_a_collector_b1_discard( + cuda::ptx::cta_group_1_t, + cuda::ptx::kind_t kind, + uint32_t d_tmem, + uint32_t a_tmem, + uint64_t b_desc, + uint32_t idesc, + bool enable_input_d, + uint64_t zero_column_mask_desc); + +tcgen05.mma.ws.cta_group::1.kind::i8.collector::b1::discard +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +.. 
code:: cuda + + // tcgen05.mma.ws.cta_group.kind.collector::b1::discard [d_tmem], [a_tmem], b_desc, idesc, enable_input_d, zero_column_mask_desc; // PTX ISA 86, SM_100a, SM_101a + // .cta_group = { .cta_group::1 } + // .kind = { .kind::f16, .kind::tf32, .kind::f8f6f4, .kind::i8 } + template + __device__ static inline void tcgen05_mma_ws_tmem_a_collector_b1_discard( + cuda::ptx::cta_group_1_t, + cuda::ptx::kind_t kind, + uint32_t d_tmem, + uint32_t a_tmem, + uint64_t b_desc, + uint32_t idesc, + bool enable_input_d, + uint64_t zero_column_mask_desc); + +tcgen05.mma.ws.cta_group::1.kind::f16.collector::b1::discard +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +.. code:: cuda + + // tcgen05.mma.ws.cta_group.kind.collector::b1::discard [d_tmem], [a_tmem], b_desc, idesc, enable_input_d; // PTX ISA 86, SM_100a, SM_101a + // .cta_group = { .cta_group::1 } + // .kind = { .kind::f16, .kind::tf32, .kind::f8f6f4, .kind::i8 } + template + __device__ static inline void tcgen05_mma_ws_tmem_a_collector_b1_discard( + cuda::ptx::cta_group_1_t, + cuda::ptx::kind_t kind, + uint32_t d_tmem, + uint32_t a_tmem, + uint64_t b_desc, + uint32_t idesc, + bool enable_input_d); + +tcgen05.mma.ws.cta_group::1.kind::tf32.collector::b1::discard +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +.. code:: cuda + + // tcgen05.mma.ws.cta_group.kind.collector::b1::discard [d_tmem], [a_tmem], b_desc, idesc, enable_input_d; // PTX ISA 86, SM_100a, SM_101a + // .cta_group = { .cta_group::1 } + // .kind = { .kind::f16, .kind::tf32, .kind::f8f6f4, .kind::i8 } + template + __device__ static inline void tcgen05_mma_ws_tmem_a_collector_b1_discard( + cuda::ptx::cta_group_1_t, + cuda::ptx::kind_t kind, + uint32_t d_tmem, + uint32_t a_tmem, + uint64_t b_desc, + uint32_t idesc, + bool enable_input_d); + +tcgen05.mma.ws.cta_group::1.kind::f8f6f4.collector::b1::discard +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +.. code:: cuda + + // tcgen05.mma.ws.cta_group.kind.collector::b1::discard [d_tmem], [a_tmem], b_desc, idesc, enable_input_d; // PTX ISA 86, SM_100a, SM_101a + // .cta_group = { .cta_group::1 } + // .kind = { .kind::f16, .kind::tf32, .kind::f8f6f4, .kind::i8 } + template + __device__ static inline void tcgen05_mma_ws_tmem_a_collector_b1_discard( + cuda::ptx::cta_group_1_t, + cuda::ptx::kind_t kind, + uint32_t d_tmem, + uint32_t a_tmem, + uint64_t b_desc, + uint32_t idesc, + bool enable_input_d); + +tcgen05.mma.ws.cta_group::1.kind::i8.collector::b1::discard +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +.. code:: cuda + + // tcgen05.mma.ws.cta_group.kind.collector::b1::discard [d_tmem], [a_tmem], b_desc, idesc, enable_input_d; // PTX ISA 86, SM_100a, SM_101a + // .cta_group = { .cta_group::1 } + // .kind = { .kind::f16, .kind::tf32, .kind::f8f6f4, .kind::i8 } + template + __device__ static inline void tcgen05_mma_ws_tmem_a_collector_b1_discard( + cuda::ptx::cta_group_1_t, + cuda::ptx::kind_t kind, + uint32_t d_tmem, + uint32_t a_tmem, + uint64_t b_desc, + uint32_t idesc, + bool enable_input_d); + +tcgen05.mma.ws.cta_group::1.kind::f16.collector::b2::fill +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +.. 
code:: cuda + + // tcgen05.mma.ws.cta_group.kind.collector::b2::fill [d_tmem], a_desc, b_desc, idesc, enable_input_d, zero_column_mask_desc; // PTX ISA 86, SM_100a, SM_101a + // .cta_group = { .cta_group::1 } + // .kind = { .kind::f16, .kind::tf32, .kind::f8f6f4, .kind::i8 } + template + __device__ static inline void tcgen05_mma_ws_collector_b2_fill( + cuda::ptx::cta_group_1_t, + cuda::ptx::kind_t kind, + uint32_t d_tmem, + uint64_t a_desc, + uint64_t b_desc, + uint32_t idesc, + bool enable_input_d, + uint64_t zero_column_mask_desc); + +tcgen05.mma.ws.cta_group::1.kind::tf32.collector::b2::fill +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +.. code:: cuda + + // tcgen05.mma.ws.cta_group.kind.collector::b2::fill [d_tmem], a_desc, b_desc, idesc, enable_input_d, zero_column_mask_desc; // PTX ISA 86, SM_100a, SM_101a + // .cta_group = { .cta_group::1 } + // .kind = { .kind::f16, .kind::tf32, .kind::f8f6f4, .kind::i8 } + template + __device__ static inline void tcgen05_mma_ws_collector_b2_fill( + cuda::ptx::cta_group_1_t, + cuda::ptx::kind_t kind, + uint32_t d_tmem, + uint64_t a_desc, + uint64_t b_desc, + uint32_t idesc, + bool enable_input_d, + uint64_t zero_column_mask_desc); + +tcgen05.mma.ws.cta_group::1.kind::f8f6f4.collector::b2::fill +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +.. code:: cuda + + // tcgen05.mma.ws.cta_group.kind.collector::b2::fill [d_tmem], a_desc, b_desc, idesc, enable_input_d, zero_column_mask_desc; // PTX ISA 86, SM_100a, SM_101a + // .cta_group = { .cta_group::1 } + // .kind = { .kind::f16, .kind::tf32, .kind::f8f6f4, .kind::i8 } + template + __device__ static inline void tcgen05_mma_ws_collector_b2_fill( + cuda::ptx::cta_group_1_t, + cuda::ptx::kind_t kind, + uint32_t d_tmem, + uint64_t a_desc, + uint64_t b_desc, + uint32_t idesc, + bool enable_input_d, + uint64_t zero_column_mask_desc); + +tcgen05.mma.ws.cta_group::1.kind::i8.collector::b2::fill +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +.. code:: cuda + + // tcgen05.mma.ws.cta_group.kind.collector::b2::fill [d_tmem], a_desc, b_desc, idesc, enable_input_d, zero_column_mask_desc; // PTX ISA 86, SM_100a, SM_101a + // .cta_group = { .cta_group::1 } + // .kind = { .kind::f16, .kind::tf32, .kind::f8f6f4, .kind::i8 } + template + __device__ static inline void tcgen05_mma_ws_collector_b2_fill( + cuda::ptx::cta_group_1_t, + cuda::ptx::kind_t kind, + uint32_t d_tmem, + uint64_t a_desc, + uint64_t b_desc, + uint32_t idesc, + bool enable_input_d, + uint64_t zero_column_mask_desc); + +tcgen05.mma.ws.cta_group::1.kind::f16.collector::b2::fill +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +.. code:: cuda + + // tcgen05.mma.ws.cta_group.kind.collector::b2::fill [d_tmem], a_desc, b_desc, idesc, enable_input_d; // PTX ISA 86, SM_100a, SM_101a + // .cta_group = { .cta_group::1 } + // .kind = { .kind::f16, .kind::tf32, .kind::f8f6f4, .kind::i8 } + template + __device__ static inline void tcgen05_mma_ws_collector_b2_fill( + cuda::ptx::cta_group_1_t, + cuda::ptx::kind_t kind, + uint32_t d_tmem, + uint64_t a_desc, + uint64_t b_desc, + uint32_t idesc, + bool enable_input_d); + +tcgen05.mma.ws.cta_group::1.kind::tf32.collector::b2::fill +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +.. 
code:: cuda + + // tcgen05.mma.ws.cta_group.kind.collector::b2::fill [d_tmem], a_desc, b_desc, idesc, enable_input_d; // PTX ISA 86, SM_100a, SM_101a + // .cta_group = { .cta_group::1 } + // .kind = { .kind::f16, .kind::tf32, .kind::f8f6f4, .kind::i8 } + template + __device__ static inline void tcgen05_mma_ws_collector_b2_fill( + cuda::ptx::cta_group_1_t, + cuda::ptx::kind_t kind, + uint32_t d_tmem, + uint64_t a_desc, + uint64_t b_desc, + uint32_t idesc, + bool enable_input_d); + +tcgen05.mma.ws.cta_group::1.kind::f8f6f4.collector::b2::fill +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +.. code:: cuda + + // tcgen05.mma.ws.cta_group.kind.collector::b2::fill [d_tmem], a_desc, b_desc, idesc, enable_input_d; // PTX ISA 86, SM_100a, SM_101a + // .cta_group = { .cta_group::1 } + // .kind = { .kind::f16, .kind::tf32, .kind::f8f6f4, .kind::i8 } + template + __device__ static inline void tcgen05_mma_ws_collector_b2_fill( + cuda::ptx::cta_group_1_t, + cuda::ptx::kind_t kind, + uint32_t d_tmem, + uint64_t a_desc, + uint64_t b_desc, + uint32_t idesc, + bool enable_input_d); + +tcgen05.mma.ws.cta_group::1.kind::i8.collector::b2::fill +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +.. code:: cuda + + // tcgen05.mma.ws.cta_group.kind.collector::b2::fill [d_tmem], a_desc, b_desc, idesc, enable_input_d; // PTX ISA 86, SM_100a, SM_101a + // .cta_group = { .cta_group::1 } + // .kind = { .kind::f16, .kind::tf32, .kind::f8f6f4, .kind::i8 } + template + __device__ static inline void tcgen05_mma_ws_collector_b2_fill( + cuda::ptx::cta_group_1_t, + cuda::ptx::kind_t kind, + uint32_t d_tmem, + uint64_t a_desc, + uint64_t b_desc, + uint32_t idesc, + bool enable_input_d); + +tcgen05.mma.ws.cta_group::1.kind::f16.collector::b2::fill +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +.. code:: cuda + + // tcgen05.mma.ws.cta_group.kind.collector::b2::fill [d_tmem], [a_tmem], b_desc, idesc, enable_input_d, zero_column_mask_desc; // PTX ISA 86, SM_100a, SM_101a + // .cta_group = { .cta_group::1 } + // .kind = { .kind::f16, .kind::tf32, .kind::f8f6f4, .kind::i8 } + template + __device__ static inline void tcgen05_mma_ws_tmem_a_collector_b2_fill( + cuda::ptx::cta_group_1_t, + cuda::ptx::kind_t kind, + uint32_t d_tmem, + uint32_t a_tmem, + uint64_t b_desc, + uint32_t idesc, + bool enable_input_d, + uint64_t zero_column_mask_desc); + +tcgen05.mma.ws.cta_group::1.kind::tf32.collector::b2::fill +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +.. code:: cuda + + // tcgen05.mma.ws.cta_group.kind.collector::b2::fill [d_tmem], [a_tmem], b_desc, idesc, enable_input_d, zero_column_mask_desc; // PTX ISA 86, SM_100a, SM_101a + // .cta_group = { .cta_group::1 } + // .kind = { .kind::f16, .kind::tf32, .kind::f8f6f4, .kind::i8 } + template + __device__ static inline void tcgen05_mma_ws_tmem_a_collector_b2_fill( + cuda::ptx::cta_group_1_t, + cuda::ptx::kind_t kind, + uint32_t d_tmem, + uint32_t a_tmem, + uint64_t b_desc, + uint32_t idesc, + bool enable_input_d, + uint64_t zero_column_mask_desc); + +tcgen05.mma.ws.cta_group::1.kind::f8f6f4.collector::b2::fill +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +.. 
code:: cuda + + // tcgen05.mma.ws.cta_group.kind.collector::b2::fill [d_tmem], [a_tmem], b_desc, idesc, enable_input_d, zero_column_mask_desc; // PTX ISA 86, SM_100a, SM_101a + // .cta_group = { .cta_group::1 } + // .kind = { .kind::f16, .kind::tf32, .kind::f8f6f4, .kind::i8 } + template + __device__ static inline void tcgen05_mma_ws_tmem_a_collector_b2_fill( + cuda::ptx::cta_group_1_t, + cuda::ptx::kind_t kind, + uint32_t d_tmem, + uint32_t a_tmem, + uint64_t b_desc, + uint32_t idesc, + bool enable_input_d, + uint64_t zero_column_mask_desc); + +tcgen05.mma.ws.cta_group::1.kind::i8.collector::b2::fill +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +.. code:: cuda + + // tcgen05.mma.ws.cta_group.kind.collector::b2::fill [d_tmem], [a_tmem], b_desc, idesc, enable_input_d, zero_column_mask_desc; // PTX ISA 86, SM_100a, SM_101a + // .cta_group = { .cta_group::1 } + // .kind = { .kind::f16, .kind::tf32, .kind::f8f6f4, .kind::i8 } + template + __device__ static inline void tcgen05_mma_ws_tmem_a_collector_b2_fill( + cuda::ptx::cta_group_1_t, + cuda::ptx::kind_t kind, + uint32_t d_tmem, + uint32_t a_tmem, + uint64_t b_desc, + uint32_t idesc, + bool enable_input_d, + uint64_t zero_column_mask_desc); + +tcgen05.mma.ws.cta_group::1.kind::f16.collector::b2::fill +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +.. code:: cuda + + // tcgen05.mma.ws.cta_group.kind.collector::b2::fill [d_tmem], [a_tmem], b_desc, idesc, enable_input_d; // PTX ISA 86, SM_100a, SM_101a + // .cta_group = { .cta_group::1 } + // .kind = { .kind::f16, .kind::tf32, .kind::f8f6f4, .kind::i8 } + template + __device__ static inline void tcgen05_mma_ws_tmem_a_collector_b2_fill( + cuda::ptx::cta_group_1_t, + cuda::ptx::kind_t kind, + uint32_t d_tmem, + uint32_t a_tmem, + uint64_t b_desc, + uint32_t idesc, + bool enable_input_d); + +tcgen05.mma.ws.cta_group::1.kind::tf32.collector::b2::fill +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +.. code:: cuda + + // tcgen05.mma.ws.cta_group.kind.collector::b2::fill [d_tmem], [a_tmem], b_desc, idesc, enable_input_d; // PTX ISA 86, SM_100a, SM_101a + // .cta_group = { .cta_group::1 } + // .kind = { .kind::f16, .kind::tf32, .kind::f8f6f4, .kind::i8 } + template + __device__ static inline void tcgen05_mma_ws_tmem_a_collector_b2_fill( + cuda::ptx::cta_group_1_t, + cuda::ptx::kind_t kind, + uint32_t d_tmem, + uint32_t a_tmem, + uint64_t b_desc, + uint32_t idesc, + bool enable_input_d); + +tcgen05.mma.ws.cta_group::1.kind::f8f6f4.collector::b2::fill +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +.. code:: cuda + + // tcgen05.mma.ws.cta_group.kind.collector::b2::fill [d_tmem], [a_tmem], b_desc, idesc, enable_input_d; // PTX ISA 86, SM_100a, SM_101a + // .cta_group = { .cta_group::1 } + // .kind = { .kind::f16, .kind::tf32, .kind::f8f6f4, .kind::i8 } + template + __device__ static inline void tcgen05_mma_ws_tmem_a_collector_b2_fill( + cuda::ptx::cta_group_1_t, + cuda::ptx::kind_t kind, + uint32_t d_tmem, + uint32_t a_tmem, + uint64_t b_desc, + uint32_t idesc, + bool enable_input_d); + +tcgen05.mma.ws.cta_group::1.kind::i8.collector::b2::fill +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +.. 
code:: cuda + + // tcgen05.mma.ws.cta_group.kind.collector::b2::fill [d_tmem], [a_tmem], b_desc, idesc, enable_input_d; // PTX ISA 86, SM_100a, SM_101a + // .cta_group = { .cta_group::1 } + // .kind = { .kind::f16, .kind::tf32, .kind::f8f6f4, .kind::i8 } + template + __device__ static inline void tcgen05_mma_ws_tmem_a_collector_b2_fill( + cuda::ptx::cta_group_1_t, + cuda::ptx::kind_t kind, + uint32_t d_tmem, + uint32_t a_tmem, + uint64_t b_desc, + uint32_t idesc, + bool enable_input_d); + +tcgen05.mma.ws.cta_group::1.kind::f16.collector::b2::use +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +.. code:: cuda + + // tcgen05.mma.ws.cta_group.kind.collector::b2::use [d_tmem], a_desc, b_desc, idesc, enable_input_d, zero_column_mask_desc; // PTX ISA 86, SM_100a, SM_101a + // .cta_group = { .cta_group::1 } + // .kind = { .kind::f16, .kind::tf32, .kind::f8f6f4, .kind::i8 } + template + __device__ static inline void tcgen05_mma_ws_collector_b2_use( + cuda::ptx::cta_group_1_t, + cuda::ptx::kind_t kind, + uint32_t d_tmem, + uint64_t a_desc, + uint64_t b_desc, + uint32_t idesc, + bool enable_input_d, + uint64_t zero_column_mask_desc); + +tcgen05.mma.ws.cta_group::1.kind::tf32.collector::b2::use +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +.. code:: cuda + + // tcgen05.mma.ws.cta_group.kind.collector::b2::use [d_tmem], a_desc, b_desc, idesc, enable_input_d, zero_column_mask_desc; // PTX ISA 86, SM_100a, SM_101a + // .cta_group = { .cta_group::1 } + // .kind = { .kind::f16, .kind::tf32, .kind::f8f6f4, .kind::i8 } + template + __device__ static inline void tcgen05_mma_ws_collector_b2_use( + cuda::ptx::cta_group_1_t, + cuda::ptx::kind_t kind, + uint32_t d_tmem, + uint64_t a_desc, + uint64_t b_desc, + uint32_t idesc, + bool enable_input_d, + uint64_t zero_column_mask_desc); + +tcgen05.mma.ws.cta_group::1.kind::f8f6f4.collector::b2::use +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +.. code:: cuda + + // tcgen05.mma.ws.cta_group.kind.collector::b2::use [d_tmem], a_desc, b_desc, idesc, enable_input_d, zero_column_mask_desc; // PTX ISA 86, SM_100a, SM_101a + // .cta_group = { .cta_group::1 } + // .kind = { .kind::f16, .kind::tf32, .kind::f8f6f4, .kind::i8 } + template + __device__ static inline void tcgen05_mma_ws_collector_b2_use( + cuda::ptx::cta_group_1_t, + cuda::ptx::kind_t kind, + uint32_t d_tmem, + uint64_t a_desc, + uint64_t b_desc, + uint32_t idesc, + bool enable_input_d, + uint64_t zero_column_mask_desc); + +tcgen05.mma.ws.cta_group::1.kind::i8.collector::b2::use +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +.. code:: cuda + + // tcgen05.mma.ws.cta_group.kind.collector::b2::use [d_tmem], a_desc, b_desc, idesc, enable_input_d, zero_column_mask_desc; // PTX ISA 86, SM_100a, SM_101a + // .cta_group = { .cta_group::1 } + // .kind = { .kind::f16, .kind::tf32, .kind::f8f6f4, .kind::i8 } + template + __device__ static inline void tcgen05_mma_ws_collector_b2_use( + cuda::ptx::cta_group_1_t, + cuda::ptx::kind_t kind, + uint32_t d_tmem, + uint64_t a_desc, + uint64_t b_desc, + uint32_t idesc, + bool enable_input_d, + uint64_t zero_column_mask_desc); + +tcgen05.mma.ws.cta_group::1.kind::f16.collector::b2::use +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +.. 
code:: cuda + + // tcgen05.mma.ws.cta_group.kind.collector::b2::use [d_tmem], a_desc, b_desc, idesc, enable_input_d; // PTX ISA 86, SM_100a, SM_101a + // .cta_group = { .cta_group::1 } + // .kind = { .kind::f16, .kind::tf32, .kind::f8f6f4, .kind::i8 } + template + __device__ static inline void tcgen05_mma_ws_collector_b2_use( + cuda::ptx::cta_group_1_t, + cuda::ptx::kind_t kind, + uint32_t d_tmem, + uint64_t a_desc, + uint64_t b_desc, + uint32_t idesc, + bool enable_input_d); + +tcgen05.mma.ws.cta_group::1.kind::tf32.collector::b2::use +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +.. code:: cuda + + // tcgen05.mma.ws.cta_group.kind.collector::b2::use [d_tmem], a_desc, b_desc, idesc, enable_input_d; // PTX ISA 86, SM_100a, SM_101a + // .cta_group = { .cta_group::1 } + // .kind = { .kind::f16, .kind::tf32, .kind::f8f6f4, .kind::i8 } + template + __device__ static inline void tcgen05_mma_ws_collector_b2_use( + cuda::ptx::cta_group_1_t, + cuda::ptx::kind_t kind, + uint32_t d_tmem, + uint64_t a_desc, + uint64_t b_desc, + uint32_t idesc, + bool enable_input_d); + +tcgen05.mma.ws.cta_group::1.kind::f8f6f4.collector::b2::use +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +.. code:: cuda + + // tcgen05.mma.ws.cta_group.kind.collector::b2::use [d_tmem], a_desc, b_desc, idesc, enable_input_d; // PTX ISA 86, SM_100a, SM_101a + // .cta_group = { .cta_group::1 } + // .kind = { .kind::f16, .kind::tf32, .kind::f8f6f4, .kind::i8 } + template + __device__ static inline void tcgen05_mma_ws_collector_b2_use( + cuda::ptx::cta_group_1_t, + cuda::ptx::kind_t kind, + uint32_t d_tmem, + uint64_t a_desc, + uint64_t b_desc, + uint32_t idesc, + bool enable_input_d); + +tcgen05.mma.ws.cta_group::1.kind::i8.collector::b2::use +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +.. code:: cuda + + // tcgen05.mma.ws.cta_group.kind.collector::b2::use [d_tmem], a_desc, b_desc, idesc, enable_input_d; // PTX ISA 86, SM_100a, SM_101a + // .cta_group = { .cta_group::1 } + // .kind = { .kind::f16, .kind::tf32, .kind::f8f6f4, .kind::i8 } + template + __device__ static inline void tcgen05_mma_ws_collector_b2_use( + cuda::ptx::cta_group_1_t, + cuda::ptx::kind_t kind, + uint32_t d_tmem, + uint64_t a_desc, + uint64_t b_desc, + uint32_t idesc, + bool enable_input_d); + +tcgen05.mma.ws.cta_group::1.kind::f16.collector::b2::use +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +.. code:: cuda + + // tcgen05.mma.ws.cta_group.kind.collector::b2::use [d_tmem], [a_tmem], b_desc, idesc, enable_input_d, zero_column_mask_desc; // PTX ISA 86, SM_100a, SM_101a + // .cta_group = { .cta_group::1 } + // .kind = { .kind::f16, .kind::tf32, .kind::f8f6f4, .kind::i8 } + template + __device__ static inline void tcgen05_mma_ws_tmem_a_collector_b2_use( + cuda::ptx::cta_group_1_t, + cuda::ptx::kind_t kind, + uint32_t d_tmem, + uint32_t a_tmem, + uint64_t b_desc, + uint32_t idesc, + bool enable_input_d, + uint64_t zero_column_mask_desc); + +tcgen05.mma.ws.cta_group::1.kind::tf32.collector::b2::use +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +.. 
code:: cuda + + // tcgen05.mma.ws.cta_group.kind.collector::b2::use [d_tmem], [a_tmem], b_desc, idesc, enable_input_d, zero_column_mask_desc; // PTX ISA 86, SM_100a, SM_101a + // .cta_group = { .cta_group::1 } + // .kind = { .kind::f16, .kind::tf32, .kind::f8f6f4, .kind::i8 } + template + __device__ static inline void tcgen05_mma_ws_tmem_a_collector_b2_use( + cuda::ptx::cta_group_1_t, + cuda::ptx::kind_t kind, + uint32_t d_tmem, + uint32_t a_tmem, + uint64_t b_desc, + uint32_t idesc, + bool enable_input_d, + uint64_t zero_column_mask_desc); + +tcgen05.mma.ws.cta_group::1.kind::f8f6f4.collector::b2::use +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +.. code:: cuda + + // tcgen05.mma.ws.cta_group.kind.collector::b2::use [d_tmem], [a_tmem], b_desc, idesc, enable_input_d, zero_column_mask_desc; // PTX ISA 86, SM_100a, SM_101a + // .cta_group = { .cta_group::1 } + // .kind = { .kind::f16, .kind::tf32, .kind::f8f6f4, .kind::i8 } + template + __device__ static inline void tcgen05_mma_ws_tmem_a_collector_b2_use( + cuda::ptx::cta_group_1_t, + cuda::ptx::kind_t kind, + uint32_t d_tmem, + uint32_t a_tmem, + uint64_t b_desc, + uint32_t idesc, + bool enable_input_d, + uint64_t zero_column_mask_desc); + +tcgen05.mma.ws.cta_group::1.kind::i8.collector::b2::use +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +.. code:: cuda + + // tcgen05.mma.ws.cta_group.kind.collector::b2::use [d_tmem], [a_tmem], b_desc, idesc, enable_input_d, zero_column_mask_desc; // PTX ISA 86, SM_100a, SM_101a + // .cta_group = { .cta_group::1 } + // .kind = { .kind::f16, .kind::tf32, .kind::f8f6f4, .kind::i8 } + template + __device__ static inline void tcgen05_mma_ws_tmem_a_collector_b2_use( + cuda::ptx::cta_group_1_t, + cuda::ptx::kind_t kind, + uint32_t d_tmem, + uint32_t a_tmem, + uint64_t b_desc, + uint32_t idesc, + bool enable_input_d, + uint64_t zero_column_mask_desc); + +tcgen05.mma.ws.cta_group::1.kind::f16.collector::b2::use +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +.. code:: cuda + + // tcgen05.mma.ws.cta_group.kind.collector::b2::use [d_tmem], [a_tmem], b_desc, idesc, enable_input_d; // PTX ISA 86, SM_100a, SM_101a + // .cta_group = { .cta_group::1 } + // .kind = { .kind::f16, .kind::tf32, .kind::f8f6f4, .kind::i8 } + template + __device__ static inline void tcgen05_mma_ws_tmem_a_collector_b2_use( + cuda::ptx::cta_group_1_t, + cuda::ptx::kind_t kind, + uint32_t d_tmem, + uint32_t a_tmem, + uint64_t b_desc, + uint32_t idesc, + bool enable_input_d); + +tcgen05.mma.ws.cta_group::1.kind::tf32.collector::b2::use +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +.. code:: cuda + + // tcgen05.mma.ws.cta_group.kind.collector::b2::use [d_tmem], [a_tmem], b_desc, idesc, enable_input_d; // PTX ISA 86, SM_100a, SM_101a + // .cta_group = { .cta_group::1 } + // .kind = { .kind::f16, .kind::tf32, .kind::f8f6f4, .kind::i8 } + template + __device__ static inline void tcgen05_mma_ws_tmem_a_collector_b2_use( + cuda::ptx::cta_group_1_t, + cuda::ptx::kind_t kind, + uint32_t d_tmem, + uint32_t a_tmem, + uint64_t b_desc, + uint32_t idesc, + bool enable_input_d); + +tcgen05.mma.ws.cta_group::1.kind::f8f6f4.collector::b2::use +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +.. 
code:: cuda + + // tcgen05.mma.ws.cta_group.kind.collector::b2::use [d_tmem], [a_tmem], b_desc, idesc, enable_input_d; // PTX ISA 86, SM_100a, SM_101a + // .cta_group = { .cta_group::1 } + // .kind = { .kind::f16, .kind::tf32, .kind::f8f6f4, .kind::i8 } + template + __device__ static inline void tcgen05_mma_ws_tmem_a_collector_b2_use( + cuda::ptx::cta_group_1_t, + cuda::ptx::kind_t kind, + uint32_t d_tmem, + uint32_t a_tmem, + uint64_t b_desc, + uint32_t idesc, + bool enable_input_d); + +tcgen05.mma.ws.cta_group::1.kind::i8.collector::b2::use +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +.. code:: cuda + + // tcgen05.mma.ws.cta_group.kind.collector::b2::use [d_tmem], [a_tmem], b_desc, idesc, enable_input_d; // PTX ISA 86, SM_100a, SM_101a + // .cta_group = { .cta_group::1 } + // .kind = { .kind::f16, .kind::tf32, .kind::f8f6f4, .kind::i8 } + template + __device__ static inline void tcgen05_mma_ws_tmem_a_collector_b2_use( + cuda::ptx::cta_group_1_t, + cuda::ptx::kind_t kind, + uint32_t d_tmem, + uint32_t a_tmem, + uint64_t b_desc, + uint32_t idesc, + bool enable_input_d); + +tcgen05.mma.ws.cta_group::1.kind::f16.collector::b2::lastuse +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +.. code:: cuda + + // tcgen05.mma.ws.cta_group.kind.collector::b2::lastuse [d_tmem], a_desc, b_desc, idesc, enable_input_d, zero_column_mask_desc; // PTX ISA 86, SM_100a, SM_101a + // .cta_group = { .cta_group::1 } + // .kind = { .kind::f16, .kind::tf32, .kind::f8f6f4, .kind::i8 } + template + __device__ static inline void tcgen05_mma_ws_collector_b2_lastuse( + cuda::ptx::cta_group_1_t, + cuda::ptx::kind_t kind, + uint32_t d_tmem, + uint64_t a_desc, + uint64_t b_desc, + uint32_t idesc, + bool enable_input_d, + uint64_t zero_column_mask_desc); + +tcgen05.mma.ws.cta_group::1.kind::tf32.collector::b2::lastuse +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +.. code:: cuda + + // tcgen05.mma.ws.cta_group.kind.collector::b2::lastuse [d_tmem], a_desc, b_desc, idesc, enable_input_d, zero_column_mask_desc; // PTX ISA 86, SM_100a, SM_101a + // .cta_group = { .cta_group::1 } + // .kind = { .kind::f16, .kind::tf32, .kind::f8f6f4, .kind::i8 } + template + __device__ static inline void tcgen05_mma_ws_collector_b2_lastuse( + cuda::ptx::cta_group_1_t, + cuda::ptx::kind_t kind, + uint32_t d_tmem, + uint64_t a_desc, + uint64_t b_desc, + uint32_t idesc, + bool enable_input_d, + uint64_t zero_column_mask_desc); + +tcgen05.mma.ws.cta_group::1.kind::f8f6f4.collector::b2::lastuse +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +.. code:: cuda + + // tcgen05.mma.ws.cta_group.kind.collector::b2::lastuse [d_tmem], a_desc, b_desc, idesc, enable_input_d, zero_column_mask_desc; // PTX ISA 86, SM_100a, SM_101a + // .cta_group = { .cta_group::1 } + // .kind = { .kind::f16, .kind::tf32, .kind::f8f6f4, .kind::i8 } + template + __device__ static inline void tcgen05_mma_ws_collector_b2_lastuse( + cuda::ptx::cta_group_1_t, + cuda::ptx::kind_t kind, + uint32_t d_tmem, + uint64_t a_desc, + uint64_t b_desc, + uint32_t idesc, + bool enable_input_d, + uint64_t zero_column_mask_desc); + +tcgen05.mma.ws.cta_group::1.kind::i8.collector::b2::lastuse +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +.. 
code:: cuda + + // tcgen05.mma.ws.cta_group.kind.collector::b2::lastuse [d_tmem], a_desc, b_desc, idesc, enable_input_d, zero_column_mask_desc; // PTX ISA 86, SM_100a, SM_101a + // .cta_group = { .cta_group::1 } + // .kind = { .kind::f16, .kind::tf32, .kind::f8f6f4, .kind::i8 } + template + __device__ static inline void tcgen05_mma_ws_collector_b2_lastuse( + cuda::ptx::cta_group_1_t, + cuda::ptx::kind_t kind, + uint32_t d_tmem, + uint64_t a_desc, + uint64_t b_desc, + uint32_t idesc, + bool enable_input_d, + uint64_t zero_column_mask_desc); + +tcgen05.mma.ws.cta_group::1.kind::f16.collector::b2::lastuse +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +.. code:: cuda + + // tcgen05.mma.ws.cta_group.kind.collector::b2::lastuse [d_tmem], a_desc, b_desc, idesc, enable_input_d; // PTX ISA 86, SM_100a, SM_101a + // .cta_group = { .cta_group::1 } + // .kind = { .kind::f16, .kind::tf32, .kind::f8f6f4, .kind::i8 } + template + __device__ static inline void tcgen05_mma_ws_collector_b2_lastuse( + cuda::ptx::cta_group_1_t, + cuda::ptx::kind_t kind, + uint32_t d_tmem, + uint64_t a_desc, + uint64_t b_desc, + uint32_t idesc, + bool enable_input_d); + +tcgen05.mma.ws.cta_group::1.kind::tf32.collector::b2::lastuse +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +.. code:: cuda + + // tcgen05.mma.ws.cta_group.kind.collector::b2::lastuse [d_tmem], a_desc, b_desc, idesc, enable_input_d; // PTX ISA 86, SM_100a, SM_101a + // .cta_group = { .cta_group::1 } + // .kind = { .kind::f16, .kind::tf32, .kind::f8f6f4, .kind::i8 } + template + __device__ static inline void tcgen05_mma_ws_collector_b2_lastuse( + cuda::ptx::cta_group_1_t, + cuda::ptx::kind_t kind, + uint32_t d_tmem, + uint64_t a_desc, + uint64_t b_desc, + uint32_t idesc, + bool enable_input_d); + +tcgen05.mma.ws.cta_group::1.kind::f8f6f4.collector::b2::lastuse +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +.. code:: cuda + + // tcgen05.mma.ws.cta_group.kind.collector::b2::lastuse [d_tmem], a_desc, b_desc, idesc, enable_input_d; // PTX ISA 86, SM_100a, SM_101a + // .cta_group = { .cta_group::1 } + // .kind = { .kind::f16, .kind::tf32, .kind::f8f6f4, .kind::i8 } + template + __device__ static inline void tcgen05_mma_ws_collector_b2_lastuse( + cuda::ptx::cta_group_1_t, + cuda::ptx::kind_t kind, + uint32_t d_tmem, + uint64_t a_desc, + uint64_t b_desc, + uint32_t idesc, + bool enable_input_d); + +tcgen05.mma.ws.cta_group::1.kind::i8.collector::b2::lastuse +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +.. code:: cuda + + // tcgen05.mma.ws.cta_group.kind.collector::b2::lastuse [d_tmem], a_desc, b_desc, idesc, enable_input_d; // PTX ISA 86, SM_100a, SM_101a + // .cta_group = { .cta_group::1 } + // .kind = { .kind::f16, .kind::tf32, .kind::f8f6f4, .kind::i8 } + template + __device__ static inline void tcgen05_mma_ws_collector_b2_lastuse( + cuda::ptx::cta_group_1_t, + cuda::ptx::kind_t kind, + uint32_t d_tmem, + uint64_t a_desc, + uint64_t b_desc, + uint32_t idesc, + bool enable_input_d); + +tcgen05.mma.ws.cta_group::1.kind::f16.collector::b2::lastuse +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +.. 
code:: cuda + + // tcgen05.mma.ws.cta_group.kind.collector::b2::lastuse [d_tmem], [a_tmem], b_desc, idesc, enable_input_d, zero_column_mask_desc; // PTX ISA 86, SM_100a, SM_101a + // .cta_group = { .cta_group::1 } + // .kind = { .kind::f16, .kind::tf32, .kind::f8f6f4, .kind::i8 } + template + __device__ static inline void tcgen05_mma_ws_tmem_a_collector_b2_lastuse( + cuda::ptx::cta_group_1_t, + cuda::ptx::kind_t kind, + uint32_t d_tmem, + uint32_t a_tmem, + uint64_t b_desc, + uint32_t idesc, + bool enable_input_d, + uint64_t zero_column_mask_desc); + +tcgen05.mma.ws.cta_group::1.kind::tf32.collector::b2::lastuse +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +.. code:: cuda + + // tcgen05.mma.ws.cta_group.kind.collector::b2::lastuse [d_tmem], [a_tmem], b_desc, idesc, enable_input_d, zero_column_mask_desc; // PTX ISA 86, SM_100a, SM_101a + // .cta_group = { .cta_group::1 } + // .kind = { .kind::f16, .kind::tf32, .kind::f8f6f4, .kind::i8 } + template + __device__ static inline void tcgen05_mma_ws_tmem_a_collector_b2_lastuse( + cuda::ptx::cta_group_1_t, + cuda::ptx::kind_t kind, + uint32_t d_tmem, + uint32_t a_tmem, + uint64_t b_desc, + uint32_t idesc, + bool enable_input_d, + uint64_t zero_column_mask_desc); + +tcgen05.mma.ws.cta_group::1.kind::f8f6f4.collector::b2::lastuse +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +.. code:: cuda + + // tcgen05.mma.ws.cta_group.kind.collector::b2::lastuse [d_tmem], [a_tmem], b_desc, idesc, enable_input_d, zero_column_mask_desc; // PTX ISA 86, SM_100a, SM_101a + // .cta_group = { .cta_group::1 } + // .kind = { .kind::f16, .kind::tf32, .kind::f8f6f4, .kind::i8 } + template + __device__ static inline void tcgen05_mma_ws_tmem_a_collector_b2_lastuse( + cuda::ptx::cta_group_1_t, + cuda::ptx::kind_t kind, + uint32_t d_tmem, + uint32_t a_tmem, + uint64_t b_desc, + uint32_t idesc, + bool enable_input_d, + uint64_t zero_column_mask_desc); + +tcgen05.mma.ws.cta_group::1.kind::i8.collector::b2::lastuse +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +.. code:: cuda + + // tcgen05.mma.ws.cta_group.kind.collector::b2::lastuse [d_tmem], [a_tmem], b_desc, idesc, enable_input_d, zero_column_mask_desc; // PTX ISA 86, SM_100a, SM_101a + // .cta_group = { .cta_group::1 } + // .kind = { .kind::f16, .kind::tf32, .kind::f8f6f4, .kind::i8 } + template + __device__ static inline void tcgen05_mma_ws_tmem_a_collector_b2_lastuse( + cuda::ptx::cta_group_1_t, + cuda::ptx::kind_t kind, + uint32_t d_tmem, + uint32_t a_tmem, + uint64_t b_desc, + uint32_t idesc, + bool enable_input_d, + uint64_t zero_column_mask_desc); + +tcgen05.mma.ws.cta_group::1.kind::f16.collector::b2::lastuse +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +.. code:: cuda + + // tcgen05.mma.ws.cta_group.kind.collector::b2::lastuse [d_tmem], [a_tmem], b_desc, idesc, enable_input_d; // PTX ISA 86, SM_100a, SM_101a + // .cta_group = { .cta_group::1 } + // .kind = { .kind::f16, .kind::tf32, .kind::f8f6f4, .kind::i8 } + template + __device__ static inline void tcgen05_mma_ws_tmem_a_collector_b2_lastuse( + cuda::ptx::cta_group_1_t, + cuda::ptx::kind_t kind, + uint32_t d_tmem, + uint32_t a_tmem, + uint64_t b_desc, + uint32_t idesc, + bool enable_input_d); + +tcgen05.mma.ws.cta_group::1.kind::tf32.collector::b2::lastuse +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +.. 
code:: cuda + + // tcgen05.mma.ws.cta_group.kind.collector::b2::lastuse [d_tmem], [a_tmem], b_desc, idesc, enable_input_d; // PTX ISA 86, SM_100a, SM_101a + // .cta_group = { .cta_group::1 } + // .kind = { .kind::f16, .kind::tf32, .kind::f8f6f4, .kind::i8 } + template + __device__ static inline void tcgen05_mma_ws_tmem_a_collector_b2_lastuse( + cuda::ptx::cta_group_1_t, + cuda::ptx::kind_t kind, + uint32_t d_tmem, + uint32_t a_tmem, + uint64_t b_desc, + uint32_t idesc, + bool enable_input_d); + +tcgen05.mma.ws.cta_group::1.kind::f8f6f4.collector::b2::lastuse +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +.. code:: cuda + + // tcgen05.mma.ws.cta_group.kind.collector::b2::lastuse [d_tmem], [a_tmem], b_desc, idesc, enable_input_d; // PTX ISA 86, SM_100a, SM_101a + // .cta_group = { .cta_group::1 } + // .kind = { .kind::f16, .kind::tf32, .kind::f8f6f4, .kind::i8 } + template + __device__ static inline void tcgen05_mma_ws_tmem_a_collector_b2_lastuse( + cuda::ptx::cta_group_1_t, + cuda::ptx::kind_t kind, + uint32_t d_tmem, + uint32_t a_tmem, + uint64_t b_desc, + uint32_t idesc, + bool enable_input_d); + +tcgen05.mma.ws.cta_group::1.kind::i8.collector::b2::lastuse +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +.. code:: cuda + + // tcgen05.mma.ws.cta_group.kind.collector::b2::lastuse [d_tmem], [a_tmem], b_desc, idesc, enable_input_d; // PTX ISA 86, SM_100a, SM_101a + // .cta_group = { .cta_group::1 } + // .kind = { .kind::f16, .kind::tf32, .kind::f8f6f4, .kind::i8 } + template + __device__ static inline void tcgen05_mma_ws_tmem_a_collector_b2_lastuse( + cuda::ptx::cta_group_1_t, + cuda::ptx::kind_t kind, + uint32_t d_tmem, + uint32_t a_tmem, + uint64_t b_desc, + uint32_t idesc, + bool enable_input_d); + +tcgen05.mma.ws.cta_group::1.kind::f16.collector::b2::discard +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +.. code:: cuda + + // tcgen05.mma.ws.cta_group.kind.collector::b2::discard [d_tmem], a_desc, b_desc, idesc, enable_input_d, zero_column_mask_desc; // PTX ISA 86, SM_100a, SM_101a + // .cta_group = { .cta_group::1 } + // .kind = { .kind::f16, .kind::tf32, .kind::f8f6f4, .kind::i8 } + template + __device__ static inline void tcgen05_mma_ws_collector_b2_discard( + cuda::ptx::cta_group_1_t, + cuda::ptx::kind_t kind, + uint32_t d_tmem, + uint64_t a_desc, + uint64_t b_desc, + uint32_t idesc, + bool enable_input_d, + uint64_t zero_column_mask_desc); + +tcgen05.mma.ws.cta_group::1.kind::tf32.collector::b2::discard +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +.. code:: cuda + + // tcgen05.mma.ws.cta_group.kind.collector::b2::discard [d_tmem], a_desc, b_desc, idesc, enable_input_d, zero_column_mask_desc; // PTX ISA 86, SM_100a, SM_101a + // .cta_group = { .cta_group::1 } + // .kind = { .kind::f16, .kind::tf32, .kind::f8f6f4, .kind::i8 } + template + __device__ static inline void tcgen05_mma_ws_collector_b2_discard( + cuda::ptx::cta_group_1_t, + cuda::ptx::kind_t kind, + uint32_t d_tmem, + uint64_t a_desc, + uint64_t b_desc, + uint32_t idesc, + bool enable_input_d, + uint64_t zero_column_mask_desc); + +tcgen05.mma.ws.cta_group::1.kind::f8f6f4.collector::b2::discard +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +.. 
code:: cuda + + // tcgen05.mma.ws.cta_group.kind.collector::b2::discard [d_tmem], a_desc, b_desc, idesc, enable_input_d, zero_column_mask_desc; // PTX ISA 86, SM_100a, SM_101a + // .cta_group = { .cta_group::1 } + // .kind = { .kind::f16, .kind::tf32, .kind::f8f6f4, .kind::i8 } + template + __device__ static inline void tcgen05_mma_ws_collector_b2_discard( + cuda::ptx::cta_group_1_t, + cuda::ptx::kind_t kind, + uint32_t d_tmem, + uint64_t a_desc, + uint64_t b_desc, + uint32_t idesc, + bool enable_input_d, + uint64_t zero_column_mask_desc); + +tcgen05.mma.ws.cta_group::1.kind::i8.collector::b2::discard +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +.. code:: cuda + + // tcgen05.mma.ws.cta_group.kind.collector::b2::discard [d_tmem], a_desc, b_desc, idesc, enable_input_d, zero_column_mask_desc; // PTX ISA 86, SM_100a, SM_101a + // .cta_group = { .cta_group::1 } + // .kind = { .kind::f16, .kind::tf32, .kind::f8f6f4, .kind::i8 } + template + __device__ static inline void tcgen05_mma_ws_collector_b2_discard( + cuda::ptx::cta_group_1_t, + cuda::ptx::kind_t kind, + uint32_t d_tmem, + uint64_t a_desc, + uint64_t b_desc, + uint32_t idesc, + bool enable_input_d, + uint64_t zero_column_mask_desc); + +tcgen05.mma.ws.cta_group::1.kind::f16.collector::b2::discard +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +.. code:: cuda + + // tcgen05.mma.ws.cta_group.kind.collector::b2::discard [d_tmem], a_desc, b_desc, idesc, enable_input_d; // PTX ISA 86, SM_100a, SM_101a + // .cta_group = { .cta_group::1 } + // .kind = { .kind::f16, .kind::tf32, .kind::f8f6f4, .kind::i8 } + template + __device__ static inline void tcgen05_mma_ws_collector_b2_discard( + cuda::ptx::cta_group_1_t, + cuda::ptx::kind_t kind, + uint32_t d_tmem, + uint64_t a_desc, + uint64_t b_desc, + uint32_t idesc, + bool enable_input_d); + +tcgen05.mma.ws.cta_group::1.kind::tf32.collector::b2::discard +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +.. code:: cuda + + // tcgen05.mma.ws.cta_group.kind.collector::b2::discard [d_tmem], a_desc, b_desc, idesc, enable_input_d; // PTX ISA 86, SM_100a, SM_101a + // .cta_group = { .cta_group::1 } + // .kind = { .kind::f16, .kind::tf32, .kind::f8f6f4, .kind::i8 } + template + __device__ static inline void tcgen05_mma_ws_collector_b2_discard( + cuda::ptx::cta_group_1_t, + cuda::ptx::kind_t kind, + uint32_t d_tmem, + uint64_t a_desc, + uint64_t b_desc, + uint32_t idesc, + bool enable_input_d); + +tcgen05.mma.ws.cta_group::1.kind::f8f6f4.collector::b2::discard +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +.. code:: cuda + + // tcgen05.mma.ws.cta_group.kind.collector::b2::discard [d_tmem], a_desc, b_desc, idesc, enable_input_d; // PTX ISA 86, SM_100a, SM_101a + // .cta_group = { .cta_group::1 } + // .kind = { .kind::f16, .kind::tf32, .kind::f8f6f4, .kind::i8 } + template + __device__ static inline void tcgen05_mma_ws_collector_b2_discard( + cuda::ptx::cta_group_1_t, + cuda::ptx::kind_t kind, + uint32_t d_tmem, + uint64_t a_desc, + uint64_t b_desc, + uint32_t idesc, + bool enable_input_d); + +tcgen05.mma.ws.cta_group::1.kind::i8.collector::b2::discard +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +.. 
code:: cuda + + // tcgen05.mma.ws.cta_group.kind.collector::b2::discard [d_tmem], a_desc, b_desc, idesc, enable_input_d; // PTX ISA 86, SM_100a, SM_101a + // .cta_group = { .cta_group::1 } + // .kind = { .kind::f16, .kind::tf32, .kind::f8f6f4, .kind::i8 } + template + __device__ static inline void tcgen05_mma_ws_collector_b2_discard( + cuda::ptx::cta_group_1_t, + cuda::ptx::kind_t kind, + uint32_t d_tmem, + uint64_t a_desc, + uint64_t b_desc, + uint32_t idesc, + bool enable_input_d); + +tcgen05.mma.ws.cta_group::1.kind::f16.collector::b2::discard +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +.. code:: cuda + + // tcgen05.mma.ws.cta_group.kind.collector::b2::discard [d_tmem], [a_tmem], b_desc, idesc, enable_input_d, zero_column_mask_desc; // PTX ISA 86, SM_100a, SM_101a + // .cta_group = { .cta_group::1 } + // .kind = { .kind::f16, .kind::tf32, .kind::f8f6f4, .kind::i8 } + template + __device__ static inline void tcgen05_mma_ws_tmem_a_collector_b2_discard( + cuda::ptx::cta_group_1_t, + cuda::ptx::kind_t kind, + uint32_t d_tmem, + uint32_t a_tmem, + uint64_t b_desc, + uint32_t idesc, + bool enable_input_d, + uint64_t zero_column_mask_desc); + +tcgen05.mma.ws.cta_group::1.kind::tf32.collector::b2::discard +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +.. code:: cuda + + // tcgen05.mma.ws.cta_group.kind.collector::b2::discard [d_tmem], [a_tmem], b_desc, idesc, enable_input_d, zero_column_mask_desc; // PTX ISA 86, SM_100a, SM_101a + // .cta_group = { .cta_group::1 } + // .kind = { .kind::f16, .kind::tf32, .kind::f8f6f4, .kind::i8 } + template + __device__ static inline void tcgen05_mma_ws_tmem_a_collector_b2_discard( + cuda::ptx::cta_group_1_t, + cuda::ptx::kind_t kind, + uint32_t d_tmem, + uint32_t a_tmem, + uint64_t b_desc, + uint32_t idesc, + bool enable_input_d, + uint64_t zero_column_mask_desc); + +tcgen05.mma.ws.cta_group::1.kind::f8f6f4.collector::b2::discard +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +.. code:: cuda + + // tcgen05.mma.ws.cta_group.kind.collector::b2::discard [d_tmem], [a_tmem], b_desc, idesc, enable_input_d, zero_column_mask_desc; // PTX ISA 86, SM_100a, SM_101a + // .cta_group = { .cta_group::1 } + // .kind = { .kind::f16, .kind::tf32, .kind::f8f6f4, .kind::i8 } + template + __device__ static inline void tcgen05_mma_ws_tmem_a_collector_b2_discard( + cuda::ptx::cta_group_1_t, + cuda::ptx::kind_t kind, + uint32_t d_tmem, + uint32_t a_tmem, + uint64_t b_desc, + uint32_t idesc, + bool enable_input_d, + uint64_t zero_column_mask_desc); + +tcgen05.mma.ws.cta_group::1.kind::i8.collector::b2::discard +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +.. code:: cuda + + // tcgen05.mma.ws.cta_group.kind.collector::b2::discard [d_tmem], [a_tmem], b_desc, idesc, enable_input_d, zero_column_mask_desc; // PTX ISA 86, SM_100a, SM_101a + // .cta_group = { .cta_group::1 } + // .kind = { .kind::f16, .kind::tf32, .kind::f8f6f4, .kind::i8 } + template + __device__ static inline void tcgen05_mma_ws_tmem_a_collector_b2_discard( + cuda::ptx::cta_group_1_t, + cuda::ptx::kind_t kind, + uint32_t d_tmem, + uint32_t a_tmem, + uint64_t b_desc, + uint32_t idesc, + bool enable_input_d, + uint64_t zero_column_mask_desc); + +tcgen05.mma.ws.cta_group::1.kind::f16.collector::b2::discard +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +.. 
code:: cuda + + // tcgen05.mma.ws.cta_group.kind.collector::b2::discard [d_tmem], [a_tmem], b_desc, idesc, enable_input_d; // PTX ISA 86, SM_100a, SM_101a + // .cta_group = { .cta_group::1 } + // .kind = { .kind::f16, .kind::tf32, .kind::f8f6f4, .kind::i8 } + template + __device__ static inline void tcgen05_mma_ws_tmem_a_collector_b2_discard( + cuda::ptx::cta_group_1_t, + cuda::ptx::kind_t kind, + uint32_t d_tmem, + uint32_t a_tmem, + uint64_t b_desc, + uint32_t idesc, + bool enable_input_d); + +tcgen05.mma.ws.cta_group::1.kind::tf32.collector::b2::discard +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +.. code:: cuda + + // tcgen05.mma.ws.cta_group.kind.collector::b2::discard [d_tmem], [a_tmem], b_desc, idesc, enable_input_d; // PTX ISA 86, SM_100a, SM_101a + // .cta_group = { .cta_group::1 } + // .kind = { .kind::f16, .kind::tf32, .kind::f8f6f4, .kind::i8 } + template + __device__ static inline void tcgen05_mma_ws_tmem_a_collector_b2_discard( + cuda::ptx::cta_group_1_t, + cuda::ptx::kind_t kind, + uint32_t d_tmem, + uint32_t a_tmem, + uint64_t b_desc, + uint32_t idesc, + bool enable_input_d); + +tcgen05.mma.ws.cta_group::1.kind::f8f6f4.collector::b2::discard +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +.. code:: cuda + + // tcgen05.mma.ws.cta_group.kind.collector::b2::discard [d_tmem], [a_tmem], b_desc, idesc, enable_input_d; // PTX ISA 86, SM_100a, SM_101a + // .cta_group = { .cta_group::1 } + // .kind = { .kind::f16, .kind::tf32, .kind::f8f6f4, .kind::i8 } + template + __device__ static inline void tcgen05_mma_ws_tmem_a_collector_b2_discard( + cuda::ptx::cta_group_1_t, + cuda::ptx::kind_t kind, + uint32_t d_tmem, + uint32_t a_tmem, + uint64_t b_desc, + uint32_t idesc, + bool enable_input_d); + +tcgen05.mma.ws.cta_group::1.kind::i8.collector::b2::discard +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +.. code:: cuda + + // tcgen05.mma.ws.cta_group.kind.collector::b2::discard [d_tmem], [a_tmem], b_desc, idesc, enable_input_d; // PTX ISA 86, SM_100a, SM_101a + // .cta_group = { .cta_group::1 } + // .kind = { .kind::f16, .kind::tf32, .kind::f8f6f4, .kind::i8 } + template + __device__ static inline void tcgen05_mma_ws_tmem_a_collector_b2_discard( + cuda::ptx::cta_group_1_t, + cuda::ptx::kind_t kind, + uint32_t d_tmem, + uint32_t a_tmem, + uint64_t b_desc, + uint32_t idesc, + bool enable_input_d); + +tcgen05.mma.ws.cta_group::1.kind::f16.collector::b3::fill +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +.. code:: cuda + + // tcgen05.mma.ws.cta_group.kind.collector::b3::fill [d_tmem], a_desc, b_desc, idesc, enable_input_d, zero_column_mask_desc; // PTX ISA 86, SM_100a, SM_101a + // .cta_group = { .cta_group::1 } + // .kind = { .kind::f16, .kind::tf32, .kind::f8f6f4, .kind::i8 } + template + __device__ static inline void tcgen05_mma_ws_collector_b3_fill( + cuda::ptx::cta_group_1_t, + cuda::ptx::kind_t kind, + uint32_t d_tmem, + uint64_t a_desc, + uint64_t b_desc, + uint32_t idesc, + bool enable_input_d, + uint64_t zero_column_mask_desc); + +tcgen05.mma.ws.cta_group::1.kind::tf32.collector::b3::fill +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +.. 
code:: cuda + + // tcgen05.mma.ws.cta_group.kind.collector::b3::fill [d_tmem], a_desc, b_desc, idesc, enable_input_d, zero_column_mask_desc; // PTX ISA 86, SM_100a, SM_101a + // .cta_group = { .cta_group::1 } + // .kind = { .kind::f16, .kind::tf32, .kind::f8f6f4, .kind::i8 } + template + __device__ static inline void tcgen05_mma_ws_collector_b3_fill( + cuda::ptx::cta_group_1_t, + cuda::ptx::kind_t kind, + uint32_t d_tmem, + uint64_t a_desc, + uint64_t b_desc, + uint32_t idesc, + bool enable_input_d, + uint64_t zero_column_mask_desc); + +tcgen05.mma.ws.cta_group::1.kind::f8f6f4.collector::b3::fill +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +.. code:: cuda + + // tcgen05.mma.ws.cta_group.kind.collector::b3::fill [d_tmem], a_desc, b_desc, idesc, enable_input_d, zero_column_mask_desc; // PTX ISA 86, SM_100a, SM_101a + // .cta_group = { .cta_group::1 } + // .kind = { .kind::f16, .kind::tf32, .kind::f8f6f4, .kind::i8 } + template + __device__ static inline void tcgen05_mma_ws_collector_b3_fill( + cuda::ptx::cta_group_1_t, + cuda::ptx::kind_t kind, + uint32_t d_tmem, + uint64_t a_desc, + uint64_t b_desc, + uint32_t idesc, + bool enable_input_d, + uint64_t zero_column_mask_desc); + +tcgen05.mma.ws.cta_group::1.kind::i8.collector::b3::fill +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +.. code:: cuda + + // tcgen05.mma.ws.cta_group.kind.collector::b3::fill [d_tmem], a_desc, b_desc, idesc, enable_input_d, zero_column_mask_desc; // PTX ISA 86, SM_100a, SM_101a + // .cta_group = { .cta_group::1 } + // .kind = { .kind::f16, .kind::tf32, .kind::f8f6f4, .kind::i8 } + template + __device__ static inline void tcgen05_mma_ws_collector_b3_fill( + cuda::ptx::cta_group_1_t, + cuda::ptx::kind_t kind, + uint32_t d_tmem, + uint64_t a_desc, + uint64_t b_desc, + uint32_t idesc, + bool enable_input_d, + uint64_t zero_column_mask_desc); + +tcgen05.mma.ws.cta_group::1.kind::f16.collector::b3::fill +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +.. code:: cuda + + // tcgen05.mma.ws.cta_group.kind.collector::b3::fill [d_tmem], a_desc, b_desc, idesc, enable_input_d; // PTX ISA 86, SM_100a, SM_101a + // .cta_group = { .cta_group::1 } + // .kind = { .kind::f16, .kind::tf32, .kind::f8f6f4, .kind::i8 } + template + __device__ static inline void tcgen05_mma_ws_collector_b3_fill( + cuda::ptx::cta_group_1_t, + cuda::ptx::kind_t kind, + uint32_t d_tmem, + uint64_t a_desc, + uint64_t b_desc, + uint32_t idesc, + bool enable_input_d); + +tcgen05.mma.ws.cta_group::1.kind::tf32.collector::b3::fill +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +.. code:: cuda + + // tcgen05.mma.ws.cta_group.kind.collector::b3::fill [d_tmem], a_desc, b_desc, idesc, enable_input_d; // PTX ISA 86, SM_100a, SM_101a + // .cta_group = { .cta_group::1 } + // .kind = { .kind::f16, .kind::tf32, .kind::f8f6f4, .kind::i8 } + template + __device__ static inline void tcgen05_mma_ws_collector_b3_fill( + cuda::ptx::cta_group_1_t, + cuda::ptx::kind_t kind, + uint32_t d_tmem, + uint64_t a_desc, + uint64_t b_desc, + uint32_t idesc, + bool enable_input_d); + +tcgen05.mma.ws.cta_group::1.kind::f8f6f4.collector::b3::fill +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +.. 
code:: cuda + + // tcgen05.mma.ws.cta_group.kind.collector::b3::fill [d_tmem], a_desc, b_desc, idesc, enable_input_d; // PTX ISA 86, SM_100a, SM_101a + // .cta_group = { .cta_group::1 } + // .kind = { .kind::f16, .kind::tf32, .kind::f8f6f4, .kind::i8 } + template + __device__ static inline void tcgen05_mma_ws_collector_b3_fill( + cuda::ptx::cta_group_1_t, + cuda::ptx::kind_t kind, + uint32_t d_tmem, + uint64_t a_desc, + uint64_t b_desc, + uint32_t idesc, + bool enable_input_d); + +tcgen05.mma.ws.cta_group::1.kind::i8.collector::b3::fill +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +.. code:: cuda + + // tcgen05.mma.ws.cta_group.kind.collector::b3::fill [d_tmem], a_desc, b_desc, idesc, enable_input_d; // PTX ISA 86, SM_100a, SM_101a + // .cta_group = { .cta_group::1 } + // .kind = { .kind::f16, .kind::tf32, .kind::f8f6f4, .kind::i8 } + template + __device__ static inline void tcgen05_mma_ws_collector_b3_fill( + cuda::ptx::cta_group_1_t, + cuda::ptx::kind_t kind, + uint32_t d_tmem, + uint64_t a_desc, + uint64_t b_desc, + uint32_t idesc, + bool enable_input_d); + +tcgen05.mma.ws.cta_group::1.kind::f16.collector::b3::fill +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +.. code:: cuda + + // tcgen05.mma.ws.cta_group.kind.collector::b3::fill [d_tmem], [a_tmem], b_desc, idesc, enable_input_d, zero_column_mask_desc; // PTX ISA 86, SM_100a, SM_101a + // .cta_group = { .cta_group::1 } + // .kind = { .kind::f16, .kind::tf32, .kind::f8f6f4, .kind::i8 } + template + __device__ static inline void tcgen05_mma_ws_tmem_a_collector_b3_fill( + cuda::ptx::cta_group_1_t, + cuda::ptx::kind_t kind, + uint32_t d_tmem, + uint32_t a_tmem, + uint64_t b_desc, + uint32_t idesc, + bool enable_input_d, + uint64_t zero_column_mask_desc); + +tcgen05.mma.ws.cta_group::1.kind::tf32.collector::b3::fill +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +.. code:: cuda + + // tcgen05.mma.ws.cta_group.kind.collector::b3::fill [d_tmem], [a_tmem], b_desc, idesc, enable_input_d, zero_column_mask_desc; // PTX ISA 86, SM_100a, SM_101a + // .cta_group = { .cta_group::1 } + // .kind = { .kind::f16, .kind::tf32, .kind::f8f6f4, .kind::i8 } + template + __device__ static inline void tcgen05_mma_ws_tmem_a_collector_b3_fill( + cuda::ptx::cta_group_1_t, + cuda::ptx::kind_t kind, + uint32_t d_tmem, + uint32_t a_tmem, + uint64_t b_desc, + uint32_t idesc, + bool enable_input_d, + uint64_t zero_column_mask_desc); + +tcgen05.mma.ws.cta_group::1.kind::f8f6f4.collector::b3::fill +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +.. code:: cuda + + // tcgen05.mma.ws.cta_group.kind.collector::b3::fill [d_tmem], [a_tmem], b_desc, idesc, enable_input_d, zero_column_mask_desc; // PTX ISA 86, SM_100a, SM_101a + // .cta_group = { .cta_group::1 } + // .kind = { .kind::f16, .kind::tf32, .kind::f8f6f4, .kind::i8 } + template + __device__ static inline void tcgen05_mma_ws_tmem_a_collector_b3_fill( + cuda::ptx::cta_group_1_t, + cuda::ptx::kind_t kind, + uint32_t d_tmem, + uint32_t a_tmem, + uint64_t b_desc, + uint32_t idesc, + bool enable_input_d, + uint64_t zero_column_mask_desc); + +tcgen05.mma.ws.cta_group::1.kind::i8.collector::b3::fill +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +.. 
code:: cuda + + // tcgen05.mma.ws.cta_group.kind.collector::b3::fill [d_tmem], [a_tmem], b_desc, idesc, enable_input_d, zero_column_mask_desc; // PTX ISA 86, SM_100a, SM_101a + // .cta_group = { .cta_group::1 } + // .kind = { .kind::f16, .kind::tf32, .kind::f8f6f4, .kind::i8 } + template + __device__ static inline void tcgen05_mma_ws_tmem_a_collector_b3_fill( + cuda::ptx::cta_group_1_t, + cuda::ptx::kind_t kind, + uint32_t d_tmem, + uint32_t a_tmem, + uint64_t b_desc, + uint32_t idesc, + bool enable_input_d, + uint64_t zero_column_mask_desc); + +tcgen05.mma.ws.cta_group::1.kind::f16.collector::b3::fill +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +.. code:: cuda + + // tcgen05.mma.ws.cta_group.kind.collector::b3::fill [d_tmem], [a_tmem], b_desc, idesc, enable_input_d; // PTX ISA 86, SM_100a, SM_101a + // .cta_group = { .cta_group::1 } + // .kind = { .kind::f16, .kind::tf32, .kind::f8f6f4, .kind::i8 } + template + __device__ static inline void tcgen05_mma_ws_tmem_a_collector_b3_fill( + cuda::ptx::cta_group_1_t, + cuda::ptx::kind_t kind, + uint32_t d_tmem, + uint32_t a_tmem, + uint64_t b_desc, + uint32_t idesc, + bool enable_input_d); + +tcgen05.mma.ws.cta_group::1.kind::tf32.collector::b3::fill +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +.. code:: cuda + + // tcgen05.mma.ws.cta_group.kind.collector::b3::fill [d_tmem], [a_tmem], b_desc, idesc, enable_input_d; // PTX ISA 86, SM_100a, SM_101a + // .cta_group = { .cta_group::1 } + // .kind = { .kind::f16, .kind::tf32, .kind::f8f6f4, .kind::i8 } + template + __device__ static inline void tcgen05_mma_ws_tmem_a_collector_b3_fill( + cuda::ptx::cta_group_1_t, + cuda::ptx::kind_t kind, + uint32_t d_tmem, + uint32_t a_tmem, + uint64_t b_desc, + uint32_t idesc, + bool enable_input_d); + +tcgen05.mma.ws.cta_group::1.kind::f8f6f4.collector::b3::fill +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +.. code:: cuda + + // tcgen05.mma.ws.cta_group.kind.collector::b3::fill [d_tmem], [a_tmem], b_desc, idesc, enable_input_d; // PTX ISA 86, SM_100a, SM_101a + // .cta_group = { .cta_group::1 } + // .kind = { .kind::f16, .kind::tf32, .kind::f8f6f4, .kind::i8 } + template + __device__ static inline void tcgen05_mma_ws_tmem_a_collector_b3_fill( + cuda::ptx::cta_group_1_t, + cuda::ptx::kind_t kind, + uint32_t d_tmem, + uint32_t a_tmem, + uint64_t b_desc, + uint32_t idesc, + bool enable_input_d); + +tcgen05.mma.ws.cta_group::1.kind::i8.collector::b3::fill +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +.. code:: cuda + + // tcgen05.mma.ws.cta_group.kind.collector::b3::fill [d_tmem], [a_tmem], b_desc, idesc, enable_input_d; // PTX ISA 86, SM_100a, SM_101a + // .cta_group = { .cta_group::1 } + // .kind = { .kind::f16, .kind::tf32, .kind::f8f6f4, .kind::i8 } + template + __device__ static inline void tcgen05_mma_ws_tmem_a_collector_b3_fill( + cuda::ptx::cta_group_1_t, + cuda::ptx::kind_t kind, + uint32_t d_tmem, + uint32_t a_tmem, + uint64_t b_desc, + uint32_t idesc, + bool enable_input_d); + +tcgen05.mma.ws.cta_group::1.kind::f16.collector::b3::use +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +.. 
code:: cuda + + // tcgen05.mma.ws.cta_group.kind.collector::b3::use [d_tmem], a_desc, b_desc, idesc, enable_input_d, zero_column_mask_desc; // PTX ISA 86, SM_100a, SM_101a + // .cta_group = { .cta_group::1 } + // .kind = { .kind::f16, .kind::tf32, .kind::f8f6f4, .kind::i8 } + template + __device__ static inline void tcgen05_mma_ws_collector_b3_use( + cuda::ptx::cta_group_1_t, + cuda::ptx::kind_t kind, + uint32_t d_tmem, + uint64_t a_desc, + uint64_t b_desc, + uint32_t idesc, + bool enable_input_d, + uint64_t zero_column_mask_desc); + +tcgen05.mma.ws.cta_group::1.kind::tf32.collector::b3::use +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +.. code:: cuda + + // tcgen05.mma.ws.cta_group.kind.collector::b3::use [d_tmem], a_desc, b_desc, idesc, enable_input_d, zero_column_mask_desc; // PTX ISA 86, SM_100a, SM_101a + // .cta_group = { .cta_group::1 } + // .kind = { .kind::f16, .kind::tf32, .kind::f8f6f4, .kind::i8 } + template + __device__ static inline void tcgen05_mma_ws_collector_b3_use( + cuda::ptx::cta_group_1_t, + cuda::ptx::kind_t kind, + uint32_t d_tmem, + uint64_t a_desc, + uint64_t b_desc, + uint32_t idesc, + bool enable_input_d, + uint64_t zero_column_mask_desc); + +tcgen05.mma.ws.cta_group::1.kind::f8f6f4.collector::b3::use +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +.. code:: cuda + + // tcgen05.mma.ws.cta_group.kind.collector::b3::use [d_tmem], a_desc, b_desc, idesc, enable_input_d, zero_column_mask_desc; // PTX ISA 86, SM_100a, SM_101a + // .cta_group = { .cta_group::1 } + // .kind = { .kind::f16, .kind::tf32, .kind::f8f6f4, .kind::i8 } + template + __device__ static inline void tcgen05_mma_ws_collector_b3_use( + cuda::ptx::cta_group_1_t, + cuda::ptx::kind_t kind, + uint32_t d_tmem, + uint64_t a_desc, + uint64_t b_desc, + uint32_t idesc, + bool enable_input_d, + uint64_t zero_column_mask_desc); + +tcgen05.mma.ws.cta_group::1.kind::i8.collector::b3::use +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +.. code:: cuda + + // tcgen05.mma.ws.cta_group.kind.collector::b3::use [d_tmem], a_desc, b_desc, idesc, enable_input_d, zero_column_mask_desc; // PTX ISA 86, SM_100a, SM_101a + // .cta_group = { .cta_group::1 } + // .kind = { .kind::f16, .kind::tf32, .kind::f8f6f4, .kind::i8 } + template + __device__ static inline void tcgen05_mma_ws_collector_b3_use( + cuda::ptx::cta_group_1_t, + cuda::ptx::kind_t kind, + uint32_t d_tmem, + uint64_t a_desc, + uint64_t b_desc, + uint32_t idesc, + bool enable_input_d, + uint64_t zero_column_mask_desc); + +tcgen05.mma.ws.cta_group::1.kind::f16.collector::b3::use +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +.. code:: cuda + + // tcgen05.mma.ws.cta_group.kind.collector::b3::use [d_tmem], a_desc, b_desc, idesc, enable_input_d; // PTX ISA 86, SM_100a, SM_101a + // .cta_group = { .cta_group::1 } + // .kind = { .kind::f16, .kind::tf32, .kind::f8f6f4, .kind::i8 } + template + __device__ static inline void tcgen05_mma_ws_collector_b3_use( + cuda::ptx::cta_group_1_t, + cuda::ptx::kind_t kind, + uint32_t d_tmem, + uint64_t a_desc, + uint64_t b_desc, + uint32_t idesc, + bool enable_input_d); + +tcgen05.mma.ws.cta_group::1.kind::tf32.collector::b3::use +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +.. 
code:: cuda + + // tcgen05.mma.ws.cta_group.kind.collector::b3::use [d_tmem], a_desc, b_desc, idesc, enable_input_d; // PTX ISA 86, SM_100a, SM_101a + // .cta_group = { .cta_group::1 } + // .kind = { .kind::f16, .kind::tf32, .kind::f8f6f4, .kind::i8 } + template + __device__ static inline void tcgen05_mma_ws_collector_b3_use( + cuda::ptx::cta_group_1_t, + cuda::ptx::kind_t kind, + uint32_t d_tmem, + uint64_t a_desc, + uint64_t b_desc, + uint32_t idesc, + bool enable_input_d); + +tcgen05.mma.ws.cta_group::1.kind::f8f6f4.collector::b3::use +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +.. code:: cuda + + // tcgen05.mma.ws.cta_group.kind.collector::b3::use [d_tmem], a_desc, b_desc, idesc, enable_input_d; // PTX ISA 86, SM_100a, SM_101a + // .cta_group = { .cta_group::1 } + // .kind = { .kind::f16, .kind::tf32, .kind::f8f6f4, .kind::i8 } + template + __device__ static inline void tcgen05_mma_ws_collector_b3_use( + cuda::ptx::cta_group_1_t, + cuda::ptx::kind_t kind, + uint32_t d_tmem, + uint64_t a_desc, + uint64_t b_desc, + uint32_t idesc, + bool enable_input_d); + +tcgen05.mma.ws.cta_group::1.kind::i8.collector::b3::use +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +.. code:: cuda + + // tcgen05.mma.ws.cta_group.kind.collector::b3::use [d_tmem], a_desc, b_desc, idesc, enable_input_d; // PTX ISA 86, SM_100a, SM_101a + // .cta_group = { .cta_group::1 } + // .kind = { .kind::f16, .kind::tf32, .kind::f8f6f4, .kind::i8 } + template + __device__ static inline void tcgen05_mma_ws_collector_b3_use( + cuda::ptx::cta_group_1_t, + cuda::ptx::kind_t kind, + uint32_t d_tmem, + uint64_t a_desc, + uint64_t b_desc, + uint32_t idesc, + bool enable_input_d); + +tcgen05.mma.ws.cta_group::1.kind::f16.collector::b3::use +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +.. code:: cuda + + // tcgen05.mma.ws.cta_group.kind.collector::b3::use [d_tmem], [a_tmem], b_desc, idesc, enable_input_d, zero_column_mask_desc; // PTX ISA 86, SM_100a, SM_101a + // .cta_group = { .cta_group::1 } + // .kind = { .kind::f16, .kind::tf32, .kind::f8f6f4, .kind::i8 } + template + __device__ static inline void tcgen05_mma_ws_tmem_a_collector_b3_use( + cuda::ptx::cta_group_1_t, + cuda::ptx::kind_t kind, + uint32_t d_tmem, + uint32_t a_tmem, + uint64_t b_desc, + uint32_t idesc, + bool enable_input_d, + uint64_t zero_column_mask_desc); + +tcgen05.mma.ws.cta_group::1.kind::tf32.collector::b3::use +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +.. code:: cuda + + // tcgen05.mma.ws.cta_group.kind.collector::b3::use [d_tmem], [a_tmem], b_desc, idesc, enable_input_d, zero_column_mask_desc; // PTX ISA 86, SM_100a, SM_101a + // .cta_group = { .cta_group::1 } + // .kind = { .kind::f16, .kind::tf32, .kind::f8f6f4, .kind::i8 } + template + __device__ static inline void tcgen05_mma_ws_tmem_a_collector_b3_use( + cuda::ptx::cta_group_1_t, + cuda::ptx::kind_t kind, + uint32_t d_tmem, + uint32_t a_tmem, + uint64_t b_desc, + uint32_t idesc, + bool enable_input_d, + uint64_t zero_column_mask_desc); + +tcgen05.mma.ws.cta_group::1.kind::f8f6f4.collector::b3::use +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +.. 
code:: cuda + + // tcgen05.mma.ws.cta_group.kind.collector::b3::use [d_tmem], [a_tmem], b_desc, idesc, enable_input_d, zero_column_mask_desc; // PTX ISA 86, SM_100a, SM_101a + // .cta_group = { .cta_group::1 } + // .kind = { .kind::f16, .kind::tf32, .kind::f8f6f4, .kind::i8 } + template + __device__ static inline void tcgen05_mma_ws_tmem_a_collector_b3_use( + cuda::ptx::cta_group_1_t, + cuda::ptx::kind_t kind, + uint32_t d_tmem, + uint32_t a_tmem, + uint64_t b_desc, + uint32_t idesc, + bool enable_input_d, + uint64_t zero_column_mask_desc); + +tcgen05.mma.ws.cta_group::1.kind::i8.collector::b3::use +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +.. code:: cuda + + // tcgen05.mma.ws.cta_group.kind.collector::b3::use [d_tmem], [a_tmem], b_desc, idesc, enable_input_d, zero_column_mask_desc; // PTX ISA 86, SM_100a, SM_101a + // .cta_group = { .cta_group::1 } + // .kind = { .kind::f16, .kind::tf32, .kind::f8f6f4, .kind::i8 } + template + __device__ static inline void tcgen05_mma_ws_tmem_a_collector_b3_use( + cuda::ptx::cta_group_1_t, + cuda::ptx::kind_t kind, + uint32_t d_tmem, + uint32_t a_tmem, + uint64_t b_desc, + uint32_t idesc, + bool enable_input_d, + uint64_t zero_column_mask_desc); + +tcgen05.mma.ws.cta_group::1.kind::f16.collector::b3::use +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +.. code:: cuda + + // tcgen05.mma.ws.cta_group.kind.collector::b3::use [d_tmem], [a_tmem], b_desc, idesc, enable_input_d; // PTX ISA 86, SM_100a, SM_101a + // .cta_group = { .cta_group::1 } + // .kind = { .kind::f16, .kind::tf32, .kind::f8f6f4, .kind::i8 } + template + __device__ static inline void tcgen05_mma_ws_tmem_a_collector_b3_use( + cuda::ptx::cta_group_1_t, + cuda::ptx::kind_t kind, + uint32_t d_tmem, + uint32_t a_tmem, + uint64_t b_desc, + uint32_t idesc, + bool enable_input_d); + +tcgen05.mma.ws.cta_group::1.kind::tf32.collector::b3::use +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +.. code:: cuda + + // tcgen05.mma.ws.cta_group.kind.collector::b3::use [d_tmem], [a_tmem], b_desc, idesc, enable_input_d; // PTX ISA 86, SM_100a, SM_101a + // .cta_group = { .cta_group::1 } + // .kind = { .kind::f16, .kind::tf32, .kind::f8f6f4, .kind::i8 } + template + __device__ static inline void tcgen05_mma_ws_tmem_a_collector_b3_use( + cuda::ptx::cta_group_1_t, + cuda::ptx::kind_t kind, + uint32_t d_tmem, + uint32_t a_tmem, + uint64_t b_desc, + uint32_t idesc, + bool enable_input_d); + +tcgen05.mma.ws.cta_group::1.kind::f8f6f4.collector::b3::use +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +.. code:: cuda + + // tcgen05.mma.ws.cta_group.kind.collector::b3::use [d_tmem], [a_tmem], b_desc, idesc, enable_input_d; // PTX ISA 86, SM_100a, SM_101a + // .cta_group = { .cta_group::1 } + // .kind = { .kind::f16, .kind::tf32, .kind::f8f6f4, .kind::i8 } + template + __device__ static inline void tcgen05_mma_ws_tmem_a_collector_b3_use( + cuda::ptx::cta_group_1_t, + cuda::ptx::kind_t kind, + uint32_t d_tmem, + uint32_t a_tmem, + uint64_t b_desc, + uint32_t idesc, + bool enable_input_d); + +tcgen05.mma.ws.cta_group::1.kind::i8.collector::b3::use +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +.. 
code:: cuda + + // tcgen05.mma.ws.cta_group.kind.collector::b3::use [d_tmem], [a_tmem], b_desc, idesc, enable_input_d; // PTX ISA 86, SM_100a, SM_101a + // .cta_group = { .cta_group::1 } + // .kind = { .kind::f16, .kind::tf32, .kind::f8f6f4, .kind::i8 } + template + __device__ static inline void tcgen05_mma_ws_tmem_a_collector_b3_use( + cuda::ptx::cta_group_1_t, + cuda::ptx::kind_t kind, + uint32_t d_tmem, + uint32_t a_tmem, + uint64_t b_desc, + uint32_t idesc, + bool enable_input_d); + +tcgen05.mma.ws.cta_group::1.kind::f16.collector::b3::lastuse +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +.. code:: cuda + + // tcgen05.mma.ws.cta_group.kind.collector::b3::lastuse [d_tmem], a_desc, b_desc, idesc, enable_input_d, zero_column_mask_desc; // PTX ISA 86, SM_100a, SM_101a + // .cta_group = { .cta_group::1 } + // .kind = { .kind::f16, .kind::tf32, .kind::f8f6f4, .kind::i8 } + template + __device__ static inline void tcgen05_mma_ws_collector_b3_lastuse( + cuda::ptx::cta_group_1_t, + cuda::ptx::kind_t kind, + uint32_t d_tmem, + uint64_t a_desc, + uint64_t b_desc, + uint32_t idesc, + bool enable_input_d, + uint64_t zero_column_mask_desc); + +tcgen05.mma.ws.cta_group::1.kind::tf32.collector::b3::lastuse +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +.. code:: cuda + + // tcgen05.mma.ws.cta_group.kind.collector::b3::lastuse [d_tmem], a_desc, b_desc, idesc, enable_input_d, zero_column_mask_desc; // PTX ISA 86, SM_100a, SM_101a + // .cta_group = { .cta_group::1 } + // .kind = { .kind::f16, .kind::tf32, .kind::f8f6f4, .kind::i8 } + template + __device__ static inline void tcgen05_mma_ws_collector_b3_lastuse( + cuda::ptx::cta_group_1_t, + cuda::ptx::kind_t kind, + uint32_t d_tmem, + uint64_t a_desc, + uint64_t b_desc, + uint32_t idesc, + bool enable_input_d, + uint64_t zero_column_mask_desc); + +tcgen05.mma.ws.cta_group::1.kind::f8f6f4.collector::b3::lastuse +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +.. code:: cuda + + // tcgen05.mma.ws.cta_group.kind.collector::b3::lastuse [d_tmem], a_desc, b_desc, idesc, enable_input_d, zero_column_mask_desc; // PTX ISA 86, SM_100a, SM_101a + // .cta_group = { .cta_group::1 } + // .kind = { .kind::f16, .kind::tf32, .kind::f8f6f4, .kind::i8 } + template + __device__ static inline void tcgen05_mma_ws_collector_b3_lastuse( + cuda::ptx::cta_group_1_t, + cuda::ptx::kind_t kind, + uint32_t d_tmem, + uint64_t a_desc, + uint64_t b_desc, + uint32_t idesc, + bool enable_input_d, + uint64_t zero_column_mask_desc); + +tcgen05.mma.ws.cta_group::1.kind::i8.collector::b3::lastuse +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +.. code:: cuda + + // tcgen05.mma.ws.cta_group.kind.collector::b3::lastuse [d_tmem], a_desc, b_desc, idesc, enable_input_d, zero_column_mask_desc; // PTX ISA 86, SM_100a, SM_101a + // .cta_group = { .cta_group::1 } + // .kind = { .kind::f16, .kind::tf32, .kind::f8f6f4, .kind::i8 } + template + __device__ static inline void tcgen05_mma_ws_collector_b3_lastuse( + cuda::ptx::cta_group_1_t, + cuda::ptx::kind_t kind, + uint32_t d_tmem, + uint64_t a_desc, + uint64_t b_desc, + uint32_t idesc, + bool enable_input_d, + uint64_t zero_column_mask_desc); + +tcgen05.mma.ws.cta_group::1.kind::f16.collector::b3::lastuse +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +.. 
code:: cuda + + // tcgen05.mma.ws.cta_group.kind.collector::b3::lastuse [d_tmem], a_desc, b_desc, idesc, enable_input_d; // PTX ISA 86, SM_100a, SM_101a + // .cta_group = { .cta_group::1 } + // .kind = { .kind::f16, .kind::tf32, .kind::f8f6f4, .kind::i8 } + template + __device__ static inline void tcgen05_mma_ws_collector_b3_lastuse( + cuda::ptx::cta_group_1_t, + cuda::ptx::kind_t kind, + uint32_t d_tmem, + uint64_t a_desc, + uint64_t b_desc, + uint32_t idesc, + bool enable_input_d); + +tcgen05.mma.ws.cta_group::1.kind::tf32.collector::b3::lastuse +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +.. code:: cuda + + // tcgen05.mma.ws.cta_group.kind.collector::b3::lastuse [d_tmem], a_desc, b_desc, idesc, enable_input_d; // PTX ISA 86, SM_100a, SM_101a + // .cta_group = { .cta_group::1 } + // .kind = { .kind::f16, .kind::tf32, .kind::f8f6f4, .kind::i8 } + template + __device__ static inline void tcgen05_mma_ws_collector_b3_lastuse( + cuda::ptx::cta_group_1_t, + cuda::ptx::kind_t kind, + uint32_t d_tmem, + uint64_t a_desc, + uint64_t b_desc, + uint32_t idesc, + bool enable_input_d); + +tcgen05.mma.ws.cta_group::1.kind::f8f6f4.collector::b3::lastuse +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +.. code:: cuda + + // tcgen05.mma.ws.cta_group.kind.collector::b3::lastuse [d_tmem], a_desc, b_desc, idesc, enable_input_d; // PTX ISA 86, SM_100a, SM_101a + // .cta_group = { .cta_group::1 } + // .kind = { .kind::f16, .kind::tf32, .kind::f8f6f4, .kind::i8 } + template + __device__ static inline void tcgen05_mma_ws_collector_b3_lastuse( + cuda::ptx::cta_group_1_t, + cuda::ptx::kind_t kind, + uint32_t d_tmem, + uint64_t a_desc, + uint64_t b_desc, + uint32_t idesc, + bool enable_input_d); + +tcgen05.mma.ws.cta_group::1.kind::i8.collector::b3::lastuse +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +.. code:: cuda + + // tcgen05.mma.ws.cta_group.kind.collector::b3::lastuse [d_tmem], a_desc, b_desc, idesc, enable_input_d; // PTX ISA 86, SM_100a, SM_101a + // .cta_group = { .cta_group::1 } + // .kind = { .kind::f16, .kind::tf32, .kind::f8f6f4, .kind::i8 } + template + __device__ static inline void tcgen05_mma_ws_collector_b3_lastuse( + cuda::ptx::cta_group_1_t, + cuda::ptx::kind_t kind, + uint32_t d_tmem, + uint64_t a_desc, + uint64_t b_desc, + uint32_t idesc, + bool enable_input_d); + +tcgen05.mma.ws.cta_group::1.kind::f16.collector::b3::lastuse +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +.. code:: cuda + + // tcgen05.mma.ws.cta_group.kind.collector::b3::lastuse [d_tmem], [a_tmem], b_desc, idesc, enable_input_d, zero_column_mask_desc; // PTX ISA 86, SM_100a, SM_101a + // .cta_group = { .cta_group::1 } + // .kind = { .kind::f16, .kind::tf32, .kind::f8f6f4, .kind::i8 } + template + __device__ static inline void tcgen05_mma_ws_tmem_a_collector_b3_lastuse( + cuda::ptx::cta_group_1_t, + cuda::ptx::kind_t kind, + uint32_t d_tmem, + uint32_t a_tmem, + uint64_t b_desc, + uint32_t idesc, + bool enable_input_d, + uint64_t zero_column_mask_desc); + +tcgen05.mma.ws.cta_group::1.kind::tf32.collector::b3::lastuse +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +.. 
code:: cuda + + // tcgen05.mma.ws.cta_group.kind.collector::b3::lastuse [d_tmem], [a_tmem], b_desc, idesc, enable_input_d, zero_column_mask_desc; // PTX ISA 86, SM_100a, SM_101a + // .cta_group = { .cta_group::1 } + // .kind = { .kind::f16, .kind::tf32, .kind::f8f6f4, .kind::i8 } + template + __device__ static inline void tcgen05_mma_ws_tmem_a_collector_b3_lastuse( + cuda::ptx::cta_group_1_t, + cuda::ptx::kind_t kind, + uint32_t d_tmem, + uint32_t a_tmem, + uint64_t b_desc, + uint32_t idesc, + bool enable_input_d, + uint64_t zero_column_mask_desc); + +tcgen05.mma.ws.cta_group::1.kind::f8f6f4.collector::b3::lastuse +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +.. code:: cuda + + // tcgen05.mma.ws.cta_group.kind.collector::b3::lastuse [d_tmem], [a_tmem], b_desc, idesc, enable_input_d, zero_column_mask_desc; // PTX ISA 86, SM_100a, SM_101a + // .cta_group = { .cta_group::1 } + // .kind = { .kind::f16, .kind::tf32, .kind::f8f6f4, .kind::i8 } + template + __device__ static inline void tcgen05_mma_ws_tmem_a_collector_b3_lastuse( + cuda::ptx::cta_group_1_t, + cuda::ptx::kind_t kind, + uint32_t d_tmem, + uint32_t a_tmem, + uint64_t b_desc, + uint32_t idesc, + bool enable_input_d, + uint64_t zero_column_mask_desc); + +tcgen05.mma.ws.cta_group::1.kind::i8.collector::b3::lastuse +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +.. code:: cuda + + // tcgen05.mma.ws.cta_group.kind.collector::b3::lastuse [d_tmem], [a_tmem], b_desc, idesc, enable_input_d, zero_column_mask_desc; // PTX ISA 86, SM_100a, SM_101a + // .cta_group = { .cta_group::1 } + // .kind = { .kind::f16, .kind::tf32, .kind::f8f6f4, .kind::i8 } + template + __device__ static inline void tcgen05_mma_ws_tmem_a_collector_b3_lastuse( + cuda::ptx::cta_group_1_t, + cuda::ptx::kind_t kind, + uint32_t d_tmem, + uint32_t a_tmem, + uint64_t b_desc, + uint32_t idesc, + bool enable_input_d, + uint64_t zero_column_mask_desc); + +tcgen05.mma.ws.cta_group::1.kind::f16.collector::b3::lastuse +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +.. code:: cuda + + // tcgen05.mma.ws.cta_group.kind.collector::b3::lastuse [d_tmem], [a_tmem], b_desc, idesc, enable_input_d; // PTX ISA 86, SM_100a, SM_101a + // .cta_group = { .cta_group::1 } + // .kind = { .kind::f16, .kind::tf32, .kind::f8f6f4, .kind::i8 } + template + __device__ static inline void tcgen05_mma_ws_tmem_a_collector_b3_lastuse( + cuda::ptx::cta_group_1_t, + cuda::ptx::kind_t kind, + uint32_t d_tmem, + uint32_t a_tmem, + uint64_t b_desc, + uint32_t idesc, + bool enable_input_d); + +tcgen05.mma.ws.cta_group::1.kind::tf32.collector::b3::lastuse +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +.. code:: cuda + + // tcgen05.mma.ws.cta_group.kind.collector::b3::lastuse [d_tmem], [a_tmem], b_desc, idesc, enable_input_d; // PTX ISA 86, SM_100a, SM_101a + // .cta_group = { .cta_group::1 } + // .kind = { .kind::f16, .kind::tf32, .kind::f8f6f4, .kind::i8 } + template + __device__ static inline void tcgen05_mma_ws_tmem_a_collector_b3_lastuse( + cuda::ptx::cta_group_1_t, + cuda::ptx::kind_t kind, + uint32_t d_tmem, + uint32_t a_tmem, + uint64_t b_desc, + uint32_t idesc, + bool enable_input_d); + +tcgen05.mma.ws.cta_group::1.kind::f8f6f4.collector::b3::lastuse +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +.. 
code:: cuda + + // tcgen05.mma.ws.cta_group.kind.collector::b3::lastuse [d_tmem], [a_tmem], b_desc, idesc, enable_input_d; // PTX ISA 86, SM_100a, SM_101a + // .cta_group = { .cta_group::1 } + // .kind = { .kind::f16, .kind::tf32, .kind::f8f6f4, .kind::i8 } + template + __device__ static inline void tcgen05_mma_ws_tmem_a_collector_b3_lastuse( + cuda::ptx::cta_group_1_t, + cuda::ptx::kind_t kind, + uint32_t d_tmem, + uint32_t a_tmem, + uint64_t b_desc, + uint32_t idesc, + bool enable_input_d); + +tcgen05.mma.ws.cta_group::1.kind::i8.collector::b3::lastuse +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +.. code:: cuda + + // tcgen05.mma.ws.cta_group.kind.collector::b3::lastuse [d_tmem], [a_tmem], b_desc, idesc, enable_input_d; // PTX ISA 86, SM_100a, SM_101a + // .cta_group = { .cta_group::1 } + // .kind = { .kind::f16, .kind::tf32, .kind::f8f6f4, .kind::i8 } + template + __device__ static inline void tcgen05_mma_ws_tmem_a_collector_b3_lastuse( + cuda::ptx::cta_group_1_t, + cuda::ptx::kind_t kind, + uint32_t d_tmem, + uint32_t a_tmem, + uint64_t b_desc, + uint32_t idesc, + bool enable_input_d); + +tcgen05.mma.ws.cta_group::1.kind::f16.collector::b3::discard +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +.. code:: cuda + + // tcgen05.mma.ws.cta_group.kind.collector::b3::discard [d_tmem], a_desc, b_desc, idesc, enable_input_d, zero_column_mask_desc; // PTX ISA 86, SM_100a, SM_101a + // .cta_group = { .cta_group::1 } + // .kind = { .kind::f16, .kind::tf32, .kind::f8f6f4, .kind::i8 } + template + __device__ static inline void tcgen05_mma_ws_collector_b3_discard( + cuda::ptx::cta_group_1_t, + cuda::ptx::kind_t kind, + uint32_t d_tmem, + uint64_t a_desc, + uint64_t b_desc, + uint32_t idesc, + bool enable_input_d, + uint64_t zero_column_mask_desc); + +tcgen05.mma.ws.cta_group::1.kind::tf32.collector::b3::discard +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +.. code:: cuda + + // tcgen05.mma.ws.cta_group.kind.collector::b3::discard [d_tmem], a_desc, b_desc, idesc, enable_input_d, zero_column_mask_desc; // PTX ISA 86, SM_100a, SM_101a + // .cta_group = { .cta_group::1 } + // .kind = { .kind::f16, .kind::tf32, .kind::f8f6f4, .kind::i8 } + template + __device__ static inline void tcgen05_mma_ws_collector_b3_discard( + cuda::ptx::cta_group_1_t, + cuda::ptx::kind_t kind, + uint32_t d_tmem, + uint64_t a_desc, + uint64_t b_desc, + uint32_t idesc, + bool enable_input_d, + uint64_t zero_column_mask_desc); + +tcgen05.mma.ws.cta_group::1.kind::f8f6f4.collector::b3::discard +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +.. code:: cuda + + // tcgen05.mma.ws.cta_group.kind.collector::b3::discard [d_tmem], a_desc, b_desc, idesc, enable_input_d, zero_column_mask_desc; // PTX ISA 86, SM_100a, SM_101a + // .cta_group = { .cta_group::1 } + // .kind = { .kind::f16, .kind::tf32, .kind::f8f6f4, .kind::i8 } + template + __device__ static inline void tcgen05_mma_ws_collector_b3_discard( + cuda::ptx::cta_group_1_t, + cuda::ptx::kind_t kind, + uint32_t d_tmem, + uint64_t a_desc, + uint64_t b_desc, + uint32_t idesc, + bool enable_input_d, + uint64_t zero_column_mask_desc); + +tcgen05.mma.ws.cta_group::1.kind::i8.collector::b3::discard +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +.. 
code:: cuda + + // tcgen05.mma.ws.cta_group.kind.collector::b3::discard [d_tmem], a_desc, b_desc, idesc, enable_input_d, zero_column_mask_desc; // PTX ISA 86, SM_100a, SM_101a + // .cta_group = { .cta_group::1 } + // .kind = { .kind::f16, .kind::tf32, .kind::f8f6f4, .kind::i8 } + template + __device__ static inline void tcgen05_mma_ws_collector_b3_discard( + cuda::ptx::cta_group_1_t, + cuda::ptx::kind_t kind, + uint32_t d_tmem, + uint64_t a_desc, + uint64_t b_desc, + uint32_t idesc, + bool enable_input_d, + uint64_t zero_column_mask_desc); + +tcgen05.mma.ws.cta_group::1.kind::f16.collector::b3::discard +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +.. code:: cuda + + // tcgen05.mma.ws.cta_group.kind.collector::b3::discard [d_tmem], a_desc, b_desc, idesc, enable_input_d; // PTX ISA 86, SM_100a, SM_101a + // .cta_group = { .cta_group::1 } + // .kind = { .kind::f16, .kind::tf32, .kind::f8f6f4, .kind::i8 } + template + __device__ static inline void tcgen05_mma_ws_collector_b3_discard( + cuda::ptx::cta_group_1_t, + cuda::ptx::kind_t kind, + uint32_t d_tmem, + uint64_t a_desc, + uint64_t b_desc, + uint32_t idesc, + bool enable_input_d); + +tcgen05.mma.ws.cta_group::1.kind::tf32.collector::b3::discard +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +.. code:: cuda + + // tcgen05.mma.ws.cta_group.kind.collector::b3::discard [d_tmem], a_desc, b_desc, idesc, enable_input_d; // PTX ISA 86, SM_100a, SM_101a + // .cta_group = { .cta_group::1 } + // .kind = { .kind::f16, .kind::tf32, .kind::f8f6f4, .kind::i8 } + template + __device__ static inline void tcgen05_mma_ws_collector_b3_discard( + cuda::ptx::cta_group_1_t, + cuda::ptx::kind_t kind, + uint32_t d_tmem, + uint64_t a_desc, + uint64_t b_desc, + uint32_t idesc, + bool enable_input_d); + +tcgen05.mma.ws.cta_group::1.kind::f8f6f4.collector::b3::discard +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +.. code:: cuda + + // tcgen05.mma.ws.cta_group.kind.collector::b3::discard [d_tmem], a_desc, b_desc, idesc, enable_input_d; // PTX ISA 86, SM_100a, SM_101a + // .cta_group = { .cta_group::1 } + // .kind = { .kind::f16, .kind::tf32, .kind::f8f6f4, .kind::i8 } + template + __device__ static inline void tcgen05_mma_ws_collector_b3_discard( + cuda::ptx::cta_group_1_t, + cuda::ptx::kind_t kind, + uint32_t d_tmem, + uint64_t a_desc, + uint64_t b_desc, + uint32_t idesc, + bool enable_input_d); + +tcgen05.mma.ws.cta_group::1.kind::i8.collector::b3::discard +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +.. code:: cuda + + // tcgen05.mma.ws.cta_group.kind.collector::b3::discard [d_tmem], a_desc, b_desc, idesc, enable_input_d; // PTX ISA 86, SM_100a, SM_101a + // .cta_group = { .cta_group::1 } + // .kind = { .kind::f16, .kind::tf32, .kind::f8f6f4, .kind::i8 } + template + __device__ static inline void tcgen05_mma_ws_collector_b3_discard( + cuda::ptx::cta_group_1_t, + cuda::ptx::kind_t kind, + uint32_t d_tmem, + uint64_t a_desc, + uint64_t b_desc, + uint32_t idesc, + bool enable_input_d); + +tcgen05.mma.ws.cta_group::1.kind::f16.collector::b3::discard +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +.. 
code:: cuda + + // tcgen05.mma.ws.cta_group.kind.collector::b3::discard [d_tmem], [a_tmem], b_desc, idesc, enable_input_d, zero_column_mask_desc; // PTX ISA 86, SM_100a, SM_101a + // .cta_group = { .cta_group::1 } + // .kind = { .kind::f16, .kind::tf32, .kind::f8f6f4, .kind::i8 } + template + __device__ static inline void tcgen05_mma_ws_tmem_a_collector_b3_discard( + cuda::ptx::cta_group_1_t, + cuda::ptx::kind_t kind, + uint32_t d_tmem, + uint32_t a_tmem, + uint64_t b_desc, + uint32_t idesc, + bool enable_input_d, + uint64_t zero_column_mask_desc); + +tcgen05.mma.ws.cta_group::1.kind::tf32.collector::b3::discard +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +.. code:: cuda + + // tcgen05.mma.ws.cta_group.kind.collector::b3::discard [d_tmem], [a_tmem], b_desc, idesc, enable_input_d, zero_column_mask_desc; // PTX ISA 86, SM_100a, SM_101a + // .cta_group = { .cta_group::1 } + // .kind = { .kind::f16, .kind::tf32, .kind::f8f6f4, .kind::i8 } + template + __device__ static inline void tcgen05_mma_ws_tmem_a_collector_b3_discard( + cuda::ptx::cta_group_1_t, + cuda::ptx::kind_t kind, + uint32_t d_tmem, + uint32_t a_tmem, + uint64_t b_desc, + uint32_t idesc, + bool enable_input_d, + uint64_t zero_column_mask_desc); + +tcgen05.mma.ws.cta_group::1.kind::f8f6f4.collector::b3::discard +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +.. code:: cuda + + // tcgen05.mma.ws.cta_group.kind.collector::b3::discard [d_tmem], [a_tmem], b_desc, idesc, enable_input_d, zero_column_mask_desc; // PTX ISA 86, SM_100a, SM_101a + // .cta_group = { .cta_group::1 } + // .kind = { .kind::f16, .kind::tf32, .kind::f8f6f4, .kind::i8 } + template + __device__ static inline void tcgen05_mma_ws_tmem_a_collector_b3_discard( + cuda::ptx::cta_group_1_t, + cuda::ptx::kind_t kind, + uint32_t d_tmem, + uint32_t a_tmem, + uint64_t b_desc, + uint32_t idesc, + bool enable_input_d, + uint64_t zero_column_mask_desc); + +tcgen05.mma.ws.cta_group::1.kind::i8.collector::b3::discard +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +.. code:: cuda + + // tcgen05.mma.ws.cta_group.kind.collector::b3::discard [d_tmem], [a_tmem], b_desc, idesc, enable_input_d, zero_column_mask_desc; // PTX ISA 86, SM_100a, SM_101a + // .cta_group = { .cta_group::1 } + // .kind = { .kind::f16, .kind::tf32, .kind::f8f6f4, .kind::i8 } + template + __device__ static inline void tcgen05_mma_ws_tmem_a_collector_b3_discard( + cuda::ptx::cta_group_1_t, + cuda::ptx::kind_t kind, + uint32_t d_tmem, + uint32_t a_tmem, + uint64_t b_desc, + uint32_t idesc, + bool enable_input_d, + uint64_t zero_column_mask_desc); + +tcgen05.mma.ws.cta_group::1.kind::f16.collector::b3::discard +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +.. code:: cuda + + // tcgen05.mma.ws.cta_group.kind.collector::b3::discard [d_tmem], [a_tmem], b_desc, idesc, enable_input_d; // PTX ISA 86, SM_100a, SM_101a + // .cta_group = { .cta_group::1 } + // .kind = { .kind::f16, .kind::tf32, .kind::f8f6f4, .kind::i8 } + template + __device__ static inline void tcgen05_mma_ws_tmem_a_collector_b3_discard( + cuda::ptx::cta_group_1_t, + cuda::ptx::kind_t kind, + uint32_t d_tmem, + uint32_t a_tmem, + uint64_t b_desc, + uint32_t idesc, + bool enable_input_d); + +tcgen05.mma.ws.cta_group::1.kind::tf32.collector::b3::discard +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +.. 
code:: cuda + + // tcgen05.mma.ws.cta_group.kind.collector::b3::discard [d_tmem], [a_tmem], b_desc, idesc, enable_input_d; // PTX ISA 86, SM_100a, SM_101a + // .cta_group = { .cta_group::1 } + // .kind = { .kind::f16, .kind::tf32, .kind::f8f6f4, .kind::i8 } + template + __device__ static inline void tcgen05_mma_ws_tmem_a_collector_b3_discard( + cuda::ptx::cta_group_1_t, + cuda::ptx::kind_t kind, + uint32_t d_tmem, + uint32_t a_tmem, + uint64_t b_desc, + uint32_t idesc, + bool enable_input_d); + +tcgen05.mma.ws.cta_group::1.kind::f8f6f4.collector::b3::discard +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +.. code:: cuda + + // tcgen05.mma.ws.cta_group.kind.collector::b3::discard [d_tmem], [a_tmem], b_desc, idesc, enable_input_d; // PTX ISA 86, SM_100a, SM_101a + // .cta_group = { .cta_group::1 } + // .kind = { .kind::f16, .kind::tf32, .kind::f8f6f4, .kind::i8 } + template + __device__ static inline void tcgen05_mma_ws_tmem_a_collector_b3_discard( + cuda::ptx::cta_group_1_t, + cuda::ptx::kind_t kind, + uint32_t d_tmem, + uint32_t a_tmem, + uint64_t b_desc, + uint32_t idesc, + bool enable_input_d); + +tcgen05.mma.ws.cta_group::1.kind::i8.collector::b3::discard +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +.. code:: cuda + + // tcgen05.mma.ws.cta_group.kind.collector::b3::discard [d_tmem], [a_tmem], b_desc, idesc, enable_input_d; // PTX ISA 86, SM_100a, SM_101a + // .cta_group = { .cta_group::1 } + // .kind = { .kind::f16, .kind::tf32, .kind::f8f6f4, .kind::i8 } + template + __device__ static inline void tcgen05_mma_ws_tmem_a_collector_b3_discard( + cuda::ptx::cta_group_1_t, + cuda::ptx::kind_t kind, + uint32_t d_tmem, + uint32_t a_tmem, + uint64_t b_desc, + uint32_t idesc, + bool enable_input_d); diff --git a/docs/libcudacxx/ptx/instructions/generated/tcgen05_shift.rst b/docs/libcudacxx/ptx/instructions/generated/tcgen05_shift.rst new file mode 100644 index 00000000000..54e665ed3cc --- /dev/null +++ b/docs/libcudacxx/ptx/instructions/generated/tcgen05_shift.rst @@ -0,0 +1,24 @@ +.. + This file was automatically generated. Do not edit. + +tcgen05.shift.cta_group::1.down +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +.. code:: cuda + + // tcgen05.shift.cta_group.down [taddr]; // PTX ISA 86, SM_100a, SM_101a + // .cta_group = { .cta_group::1, .cta_group::2 } + template + __device__ static inline void tcgen05_shift_down( + cuda::ptx::cta_group_t cta_group, + uint32_t taddr); + +tcgen05.shift.cta_group::2.down +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +.. code:: cuda + + // tcgen05.shift.cta_group.down [taddr]; // PTX ISA 86, SM_100a, SM_101a + // .cta_group = { .cta_group::1, .cta_group::2 } + template + __device__ static inline void tcgen05_shift_down( + cuda::ptx::cta_group_t cta_group, + uint32_t taddr); diff --git a/docs/libcudacxx/ptx/instructions/generated/tcgen05_st.rst b/docs/libcudacxx/ptx/instructions/generated/tcgen05_st.rst new file mode 100644 index 00000000000..3147a1757d8 --- /dev/null +++ b/docs/libcudacxx/ptx/instructions/generated/tcgen05_st.rst @@ -0,0 +1,758 @@ +.. + This file was automatically generated. Do not edit. + +tcgen05.st.sync.aligned.16x64b.x1.b32 +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +.. code:: cuda + + // tcgen05.st.sync.aligned.16x64b.x1.b32 [taddr], values; // PTX ISA 86, SM_100a, SM_101a + template = true> + __device__ static inline void tcgen05_st_16x64b( + uint32_t taddr, + const B32 (&values)[1]); + +tcgen05.st.sync.aligned.16x64b.x1.unpack::16b.b32 +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +.. 
code:: cuda + + // tcgen05.st.sync.aligned.16x64b.x1.unpack::16b.b32 [taddr], values; // PTX ISA 86, SM_100a, SM_101a + template = true> + __device__ static inline void tcgen05_st_16x64b_unpack_16b( + uint32_t taddr, + const B32 (&values)[1]); + +tcgen05.st.sync.aligned.16x64b.x2.b32 +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +.. code:: cuda + + // tcgen05.st.sync.aligned.16x64b.x2.b32 [taddr], values; // PTX ISA 86, SM_100a, SM_101a + template = true> + __device__ static inline void tcgen05_st_16x64b( + uint32_t taddr, + const B32 (&values)[2]); + +tcgen05.st.sync.aligned.16x64b.x2.unpack::16b.b32 +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +.. code:: cuda + + // tcgen05.st.sync.aligned.16x64b.x2.unpack::16b.b32 [taddr], values; // PTX ISA 86, SM_100a, SM_101a + template = true> + __device__ static inline void tcgen05_st_16x64b_unpack_16b( + uint32_t taddr, + const B32 (&values)[2]); + +tcgen05.st.sync.aligned.16x64b.x4.b32 +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +.. code:: cuda + + // tcgen05.st.sync.aligned.16x64b.x4.b32 [taddr], values; // PTX ISA 86, SM_100a, SM_101a + template = true> + __device__ static inline void tcgen05_st_16x64b( + uint32_t taddr, + const B32 (&values)[4]); + +tcgen05.st.sync.aligned.16x64b.x4.unpack::16b.b32 +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +.. code:: cuda + + // tcgen05.st.sync.aligned.16x64b.x4.unpack::16b.b32 [taddr], values; // PTX ISA 86, SM_100a, SM_101a + template = true> + __device__ static inline void tcgen05_st_16x64b_unpack_16b( + uint32_t taddr, + const B32 (&values)[4]); + +tcgen05.st.sync.aligned.16x64b.x8.b32 +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +.. code:: cuda + + // tcgen05.st.sync.aligned.16x64b.x8.b32 [taddr], values; // PTX ISA 86, SM_100a, SM_101a + template = true> + __device__ static inline void tcgen05_st_16x64b( + uint32_t taddr, + const B32 (&values)[8]); + +tcgen05.st.sync.aligned.16x64b.x8.unpack::16b.b32 +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +.. code:: cuda + + // tcgen05.st.sync.aligned.16x64b.x8.unpack::16b.b32 [taddr], values; // PTX ISA 86, SM_100a, SM_101a + template = true> + __device__ static inline void tcgen05_st_16x64b_unpack_16b( + uint32_t taddr, + const B32 (&values)[8]); + +tcgen05.st.sync.aligned.16x64b.x16.b32 +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +.. code:: cuda + + // tcgen05.st.sync.aligned.16x64b.x16.b32 [taddr], values; // PTX ISA 86, SM_100a, SM_101a + template = true> + __device__ static inline void tcgen05_st_16x64b( + uint32_t taddr, + const B32 (&values)[16]); + +tcgen05.st.sync.aligned.16x64b.x16.unpack::16b.b32 +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +.. code:: cuda + + // tcgen05.st.sync.aligned.16x64b.x16.unpack::16b.b32 [taddr], values; // PTX ISA 86, SM_100a, SM_101a + template = true> + __device__ static inline void tcgen05_st_16x64b_unpack_16b( + uint32_t taddr, + const B32 (&values)[16]); + +tcgen05.st.sync.aligned.16x64b.x32.b32 +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +.. code:: cuda + + // tcgen05.st.sync.aligned.16x64b.x32.b32 [taddr], values; // PTX ISA 86, SM_100a, SM_101a + template = true> + __device__ static inline void tcgen05_st_16x64b( + uint32_t taddr, + const B32 (&values)[32]); + +tcgen05.st.sync.aligned.16x64b.x32.unpack::16b.b32 +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +.. 
code:: cuda + + // tcgen05.st.sync.aligned.16x64b.x32.unpack::16b.b32 [taddr], values; // PTX ISA 86, SM_100a, SM_101a + template = true> + __device__ static inline void tcgen05_st_16x64b_unpack_16b( + uint32_t taddr, + const B32 (&values)[32]); + +tcgen05.st.sync.aligned.16x64b.x64.b32 +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +.. code:: cuda + + // tcgen05.st.sync.aligned.16x64b.x64.b32 [taddr], values; // PTX ISA 86, SM_100a, SM_101a + template = true> + __device__ static inline void tcgen05_st_16x64b( + uint32_t taddr, + const B32 (&values)[64]); + +tcgen05.st.sync.aligned.16x64b.x64.unpack::16b.b32 +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +.. code:: cuda + + // tcgen05.st.sync.aligned.16x64b.x64.unpack::16b.b32 [taddr], values; // PTX ISA 86, SM_100a, SM_101a + template = true> + __device__ static inline void tcgen05_st_16x64b_unpack_16b( + uint32_t taddr, + const B32 (&values)[64]); + +tcgen05.st.sync.aligned.16x64b.x128.b32 +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +.. code:: cuda + + // tcgen05.st.sync.aligned.16x64b.x128.b32 [taddr], values; // PTX ISA 86, SM_100a, SM_101a + template = true> + __device__ static inline void tcgen05_st_16x64b( + uint32_t taddr, + const B32 (&values)[128]); + +tcgen05.st.sync.aligned.16x64b.x128.unpack::16b.b32 +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +.. code:: cuda + + // tcgen05.st.sync.aligned.16x64b.x128.unpack::16b.b32 [taddr], values; // PTX ISA 86, SM_100a, SM_101a + template = true> + __device__ static inline void tcgen05_st_16x64b_unpack_16b( + uint32_t taddr, + const B32 (&values)[128]); + +tcgen05.st.sync.aligned.16x128b.x1.b32 +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +.. code:: cuda + + // tcgen05.st.sync.aligned.16x128b.x1.b32 [taddr], values; // PTX ISA 86, SM_100a, SM_101a + template = true> + __device__ static inline void tcgen05_st_16x128b( + uint32_t taddr, + const B32 (&values)[2]); + +tcgen05.st.sync.aligned.16x128b.x1.unpack::16b.b32 +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +.. code:: cuda + + // tcgen05.st.sync.aligned.16x128b.x1.unpack::16b.b32 [taddr], values; // PTX ISA 86, SM_100a, SM_101a + template = true> + __device__ static inline void tcgen05_st_16x128b_unpack_16b( + uint32_t taddr, + const B32 (&values)[2]); + +tcgen05.st.sync.aligned.16x128b.x2.b32 +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +.. code:: cuda + + // tcgen05.st.sync.aligned.16x128b.x2.b32 [taddr], values; // PTX ISA 86, SM_100a, SM_101a + template = true> + __device__ static inline void tcgen05_st_16x128b( + uint32_t taddr, + const B32 (&values)[4]); + +tcgen05.st.sync.aligned.16x128b.x2.unpack::16b.b32 +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +.. code:: cuda + + // tcgen05.st.sync.aligned.16x128b.x2.unpack::16b.b32 [taddr], values; // PTX ISA 86, SM_100a, SM_101a + template = true> + __device__ static inline void tcgen05_st_16x128b_unpack_16b( + uint32_t taddr, + const B32 (&values)[4]); + +tcgen05.st.sync.aligned.16x128b.x4.b32 +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +.. code:: cuda + + // tcgen05.st.sync.aligned.16x128b.x4.b32 [taddr], values; // PTX ISA 86, SM_100a, SM_101a + template = true> + __device__ static inline void tcgen05_st_16x128b( + uint32_t taddr, + const B32 (&values)[8]); + +tcgen05.st.sync.aligned.16x128b.x4.unpack::16b.b32 +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +.. 
code:: cuda + + // tcgen05.st.sync.aligned.16x128b.x4.unpack::16b.b32 [taddr], values; // PTX ISA 86, SM_100a, SM_101a + template = true> + __device__ static inline void tcgen05_st_16x128b_unpack_16b( + uint32_t taddr, + const B32 (&values)[8]); + +tcgen05.st.sync.aligned.16x128b.x8.b32 +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +.. code:: cuda + + // tcgen05.st.sync.aligned.16x128b.x8.b32 [taddr], values; // PTX ISA 86, SM_100a, SM_101a + template = true> + __device__ static inline void tcgen05_st_16x128b( + uint32_t taddr, + const B32 (&values)[16]); + +tcgen05.st.sync.aligned.16x128b.x8.unpack::16b.b32 +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +.. code:: cuda + + // tcgen05.st.sync.aligned.16x128b.x8.unpack::16b.b32 [taddr], values; // PTX ISA 86, SM_100a, SM_101a + template = true> + __device__ static inline void tcgen05_st_16x128b_unpack_16b( + uint32_t taddr, + const B32 (&values)[16]); + +tcgen05.st.sync.aligned.16x128b.x16.b32 +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +.. code:: cuda + + // tcgen05.st.sync.aligned.16x128b.x16.b32 [taddr], values; // PTX ISA 86, SM_100a, SM_101a + template = true> + __device__ static inline void tcgen05_st_16x128b( + uint32_t taddr, + const B32 (&values)[32]); + +tcgen05.st.sync.aligned.16x128b.x16.unpack::16b.b32 +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +.. code:: cuda + + // tcgen05.st.sync.aligned.16x128b.x16.unpack::16b.b32 [taddr], values; // PTX ISA 86, SM_100a, SM_101a + template = true> + __device__ static inline void tcgen05_st_16x128b_unpack_16b( + uint32_t taddr, + const B32 (&values)[32]); + +tcgen05.st.sync.aligned.16x128b.x32.b32 +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +.. code:: cuda + + // tcgen05.st.sync.aligned.16x128b.x32.b32 [taddr], values; // PTX ISA 86, SM_100a, SM_101a + template = true> + __device__ static inline void tcgen05_st_16x128b( + uint32_t taddr, + const B32 (&values)[64]); + +tcgen05.st.sync.aligned.16x128b.x32.unpack::16b.b32 +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +.. code:: cuda + + // tcgen05.st.sync.aligned.16x128b.x32.unpack::16b.b32 [taddr], values; // PTX ISA 86, SM_100a, SM_101a + template = true> + __device__ static inline void tcgen05_st_16x128b_unpack_16b( + uint32_t taddr, + const B32 (&values)[64]); + +tcgen05.st.sync.aligned.16x128b.x64.b32 +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +.. code:: cuda + + // tcgen05.st.sync.aligned.16x128b.x64.b32 [taddr], values; // PTX ISA 86, SM_100a, SM_101a + template = true> + __device__ static inline void tcgen05_st_16x128b( + uint32_t taddr, + const B32 (&values)[128]); + +tcgen05.st.sync.aligned.16x128b.x64.unpack::16b.b32 +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +.. code:: cuda + + // tcgen05.st.sync.aligned.16x128b.x64.unpack::16b.b32 [taddr], values; // PTX ISA 86, SM_100a, SM_101a + template = true> + __device__ static inline void tcgen05_st_16x128b_unpack_16b( + uint32_t taddr, + const B32 (&values)[128]); + +tcgen05.st.sync.aligned.16x256b.x1.b32 +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +.. code:: cuda + + // tcgen05.st.sync.aligned.16x256b.x1.b32 [taddr], values; // PTX ISA 86, SM_100a, SM_101a + template = true> + __device__ static inline void tcgen05_st_16x256b( + uint32_t taddr, + const B32 (&values)[4]); + +tcgen05.st.sync.aligned.16x256b.x1.unpack::16b.b32 +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +.. 
code:: cuda + + // tcgen05.st.sync.aligned.16x256b.x1.unpack::16b.b32 [taddr], values; // PTX ISA 86, SM_100a, SM_101a + template = true> + __device__ static inline void tcgen05_st_16x256b_unpack_16b( + uint32_t taddr, + const B32 (&values)[4]); + +tcgen05.st.sync.aligned.16x256b.x2.b32 +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +.. code:: cuda + + // tcgen05.st.sync.aligned.16x256b.x2.b32 [taddr], values; // PTX ISA 86, SM_100a, SM_101a + template = true> + __device__ static inline void tcgen05_st_16x256b( + uint32_t taddr, + const B32 (&values)[8]); + +tcgen05.st.sync.aligned.16x256b.x2.unpack::16b.b32 +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +.. code:: cuda + + // tcgen05.st.sync.aligned.16x256b.x2.unpack::16b.b32 [taddr], values; // PTX ISA 86, SM_100a, SM_101a + template = true> + __device__ static inline void tcgen05_st_16x256b_unpack_16b( + uint32_t taddr, + const B32 (&values)[8]); + +tcgen05.st.sync.aligned.16x256b.x4.b32 +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +.. code:: cuda + + // tcgen05.st.sync.aligned.16x256b.x4.b32 [taddr], values; // PTX ISA 86, SM_100a, SM_101a + template = true> + __device__ static inline void tcgen05_st_16x256b( + uint32_t taddr, + const B32 (&values)[16]); + +tcgen05.st.sync.aligned.16x256b.x4.unpack::16b.b32 +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +.. code:: cuda + + // tcgen05.st.sync.aligned.16x256b.x4.unpack::16b.b32 [taddr], values; // PTX ISA 86, SM_100a, SM_101a + template = true> + __device__ static inline void tcgen05_st_16x256b_unpack_16b( + uint32_t taddr, + const B32 (&values)[16]); + +tcgen05.st.sync.aligned.16x256b.x8.b32 +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +.. code:: cuda + + // tcgen05.st.sync.aligned.16x256b.x8.b32 [taddr], values; // PTX ISA 86, SM_100a, SM_101a + template = true> + __device__ static inline void tcgen05_st_16x256b( + uint32_t taddr, + const B32 (&values)[32]); + +tcgen05.st.sync.aligned.16x256b.x8.unpack::16b.b32 +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +.. code:: cuda + + // tcgen05.st.sync.aligned.16x256b.x8.unpack::16b.b32 [taddr], values; // PTX ISA 86, SM_100a, SM_101a + template = true> + __device__ static inline void tcgen05_st_16x256b_unpack_16b( + uint32_t taddr, + const B32 (&values)[32]); + +tcgen05.st.sync.aligned.16x256b.x16.b32 +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +.. code:: cuda + + // tcgen05.st.sync.aligned.16x256b.x16.b32 [taddr], values; // PTX ISA 86, SM_100a, SM_101a + template = true> + __device__ static inline void tcgen05_st_16x256b( + uint32_t taddr, + const B32 (&values)[64]); + +tcgen05.st.sync.aligned.16x256b.x16.unpack::16b.b32 +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +.. code:: cuda + + // tcgen05.st.sync.aligned.16x256b.x16.unpack::16b.b32 [taddr], values; // PTX ISA 86, SM_100a, SM_101a + template = true> + __device__ static inline void tcgen05_st_16x256b_unpack_16b( + uint32_t taddr, + const B32 (&values)[64]); + +tcgen05.st.sync.aligned.16x256b.x32.b32 +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +.. code:: cuda + + // tcgen05.st.sync.aligned.16x256b.x32.b32 [taddr], values; // PTX ISA 86, SM_100a, SM_101a + template = true> + __device__ static inline void tcgen05_st_16x256b( + uint32_t taddr, + const B32 (&values)[128]); + +tcgen05.st.sync.aligned.16x256b.x32.unpack::16b.b32 +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +.. 
code:: cuda + + // tcgen05.st.sync.aligned.16x256b.x32.unpack::16b.b32 [taddr], values; // PTX ISA 86, SM_100a, SM_101a + template = true> + __device__ static inline void tcgen05_st_16x256b_unpack_16b( + uint32_t taddr, + const B32 (&values)[128]); + +tcgen05.st.sync.aligned.32x32b.x1.b32 +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +.. code:: cuda + + // tcgen05.st.sync.aligned.32x32b.x1.b32 [taddr], values; // PTX ISA 86, SM_100a, SM_101a + template = true> + __device__ static inline void tcgen05_st_32x32b( + uint32_t taddr, + const B32 (&values)[1]); + +tcgen05.st.sync.aligned.32x32b.x1.unpack::16b.b32 +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +.. code:: cuda + + // tcgen05.st.sync.aligned.32x32b.x1.unpack::16b.b32 [taddr], values; // PTX ISA 86, SM_100a, SM_101a + template = true> + __device__ static inline void tcgen05_st_32x32b_unpack_16b( + uint32_t taddr, + const B32 (&values)[1]); + +tcgen05.st.sync.aligned.32x32b.x2.b32 +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +.. code:: cuda + + // tcgen05.st.sync.aligned.32x32b.x2.b32 [taddr], values; // PTX ISA 86, SM_100a, SM_101a + template = true> + __device__ static inline void tcgen05_st_32x32b( + uint32_t taddr, + const B32 (&values)[2]); + +tcgen05.st.sync.aligned.32x32b.x2.unpack::16b.b32 +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +.. code:: cuda + + // tcgen05.st.sync.aligned.32x32b.x2.unpack::16b.b32 [taddr], values; // PTX ISA 86, SM_100a, SM_101a + template = true> + __device__ static inline void tcgen05_st_32x32b_unpack_16b( + uint32_t taddr, + const B32 (&values)[2]); + +tcgen05.st.sync.aligned.32x32b.x4.b32 +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +.. code:: cuda + + // tcgen05.st.sync.aligned.32x32b.x4.b32 [taddr], values; // PTX ISA 86, SM_100a, SM_101a + template = true> + __device__ static inline void tcgen05_st_32x32b( + uint32_t taddr, + const B32 (&values)[4]); + +tcgen05.st.sync.aligned.32x32b.x4.unpack::16b.b32 +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +.. code:: cuda + + // tcgen05.st.sync.aligned.32x32b.x4.unpack::16b.b32 [taddr], values; // PTX ISA 86, SM_100a, SM_101a + template = true> + __device__ static inline void tcgen05_st_32x32b_unpack_16b( + uint32_t taddr, + const B32 (&values)[4]); + +tcgen05.st.sync.aligned.32x32b.x8.b32 +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +.. code:: cuda + + // tcgen05.st.sync.aligned.32x32b.x8.b32 [taddr], values; // PTX ISA 86, SM_100a, SM_101a + template = true> + __device__ static inline void tcgen05_st_32x32b( + uint32_t taddr, + const B32 (&values)[8]); + +tcgen05.st.sync.aligned.32x32b.x8.unpack::16b.b32 +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +.. code:: cuda + + // tcgen05.st.sync.aligned.32x32b.x8.unpack::16b.b32 [taddr], values; // PTX ISA 86, SM_100a, SM_101a + template = true> + __device__ static inline void tcgen05_st_32x32b_unpack_16b( + uint32_t taddr, + const B32 (&values)[8]); + +tcgen05.st.sync.aligned.32x32b.x16.b32 +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +.. code:: cuda + + // tcgen05.st.sync.aligned.32x32b.x16.b32 [taddr], values; // PTX ISA 86, SM_100a, SM_101a + template = true> + __device__ static inline void tcgen05_st_32x32b( + uint32_t taddr, + const B32 (&values)[16]); + +tcgen05.st.sync.aligned.32x32b.x16.unpack::16b.b32 +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +.. 
code:: cuda + + // tcgen05.st.sync.aligned.32x32b.x16.unpack::16b.b32 [taddr], values; // PTX ISA 86, SM_100a, SM_101a + template = true> + __device__ static inline void tcgen05_st_32x32b_unpack_16b( + uint32_t taddr, + const B32 (&values)[16]); + +tcgen05.st.sync.aligned.32x32b.x32.b32 +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +.. code:: cuda + + // tcgen05.st.sync.aligned.32x32b.x32.b32 [taddr], values; // PTX ISA 86, SM_100a, SM_101a + template = true> + __device__ static inline void tcgen05_st_32x32b( + uint32_t taddr, + const B32 (&values)[32]); + +tcgen05.st.sync.aligned.32x32b.x32.unpack::16b.b32 +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +.. code:: cuda + + // tcgen05.st.sync.aligned.32x32b.x32.unpack::16b.b32 [taddr], values; // PTX ISA 86, SM_100a, SM_101a + template = true> + __device__ static inline void tcgen05_st_32x32b_unpack_16b( + uint32_t taddr, + const B32 (&values)[32]); + +tcgen05.st.sync.aligned.32x32b.x64.b32 +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +.. code:: cuda + + // tcgen05.st.sync.aligned.32x32b.x64.b32 [taddr], values; // PTX ISA 86, SM_100a, SM_101a + template = true> + __device__ static inline void tcgen05_st_32x32b( + uint32_t taddr, + const B32 (&values)[64]); + +tcgen05.st.sync.aligned.32x32b.x64.unpack::16b.b32 +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +.. code:: cuda + + // tcgen05.st.sync.aligned.32x32b.x64.unpack::16b.b32 [taddr], values; // PTX ISA 86, SM_100a, SM_101a + template = true> + __device__ static inline void tcgen05_st_32x32b_unpack_16b( + uint32_t taddr, + const B32 (&values)[64]); + +tcgen05.st.sync.aligned.32x32b.x128.b32 +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +.. code:: cuda + + // tcgen05.st.sync.aligned.32x32b.x128.b32 [taddr], values; // PTX ISA 86, SM_100a, SM_101a + template = true> + __device__ static inline void tcgen05_st_32x32b( + uint32_t taddr, + const B32 (&values)[128]); + +tcgen05.st.sync.aligned.32x32b.x128.unpack::16b.b32 +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +.. code:: cuda + + // tcgen05.st.sync.aligned.32x32b.x128.unpack::16b.b32 [taddr], values; // PTX ISA 86, SM_100a, SM_101a + template = true> + __device__ static inline void tcgen05_st_32x32b_unpack_16b( + uint32_t taddr, + const B32 (&values)[128]); + +tcgen05.st.sync.aligned.16x32bx2.x1.b32 +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +.. code:: cuda + + // tcgen05.st.sync.aligned.16x32bx2.x1.b32 [taddr], immHalfSplitoff, values; // PTX ISA 86, SM_100a, SM_101a + template = true> + __device__ static inline void tcgen05_st_16x32bx2( + uint32_t taddr, + cuda::ptx::n32_t immHalfSplitoff, + const B32 (&values)[1]); + +tcgen05.st.sync.aligned.16x32bx2.x1.unpack::16b.b32 +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +.. code:: cuda + + // tcgen05.st.sync.aligned.16x32bx2.x1.unpack::16b.b32 [taddr], immHalfSplitoff, values; // PTX ISA 86, SM_100a, SM_101a + template = true> + __device__ static inline void tcgen05_st_16x32bx2_unpack_16b( + uint32_t taddr, + cuda::ptx::n32_t immHalfSplitoff, + const B32 (&values)[1]); + +tcgen05.st.sync.aligned.16x32bx2.x2.b32 +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +.. code:: cuda + + // tcgen05.st.sync.aligned.16x32bx2.x2.b32 [taddr], immHalfSplitoff, values; // PTX ISA 86, SM_100a, SM_101a + template = true> + __device__ static inline void tcgen05_st_16x32bx2( + uint32_t taddr, + cuda::ptx::n32_t immHalfSplitoff, + const B32 (&values)[2]); + +tcgen05.st.sync.aligned.16x32bx2.x2.unpack::16b.b32 +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +.. 
code:: cuda + + // tcgen05.st.sync.aligned.16x32bx2.x2.unpack::16b.b32 [taddr], immHalfSplitoff, values; // PTX ISA 86, SM_100a, SM_101a + template = true> + __device__ static inline void tcgen05_st_16x32bx2_unpack_16b( + uint32_t taddr, + cuda::ptx::n32_t immHalfSplitoff, + const B32 (&values)[2]); + +tcgen05.st.sync.aligned.16x32bx2.x4.b32 +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +.. code:: cuda + + // tcgen05.st.sync.aligned.16x32bx2.x4.b32 [taddr], immHalfSplitoff, values; // PTX ISA 86, SM_100a, SM_101a + template = true> + __device__ static inline void tcgen05_st_16x32bx2( + uint32_t taddr, + cuda::ptx::n32_t immHalfSplitoff, + const B32 (&values)[4]); + +tcgen05.st.sync.aligned.16x32bx2.x4.unpack::16b.b32 +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +.. code:: cuda + + // tcgen05.st.sync.aligned.16x32bx2.x4.unpack::16b.b32 [taddr], immHalfSplitoff, values; // PTX ISA 86, SM_100a, SM_101a + template = true> + __device__ static inline void tcgen05_st_16x32bx2_unpack_16b( + uint32_t taddr, + cuda::ptx::n32_t immHalfSplitoff, + const B32 (&values)[4]); + +tcgen05.st.sync.aligned.16x32bx2.x8.b32 +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +.. code:: cuda + + // tcgen05.st.sync.aligned.16x32bx2.x8.b32 [taddr], immHalfSplitoff, values; // PTX ISA 86, SM_100a, SM_101a + template = true> + __device__ static inline void tcgen05_st_16x32bx2( + uint32_t taddr, + cuda::ptx::n32_t immHalfSplitoff, + const B32 (&values)[8]); + +tcgen05.st.sync.aligned.16x32bx2.x8.unpack::16b.b32 +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +.. code:: cuda + + // tcgen05.st.sync.aligned.16x32bx2.x8.unpack::16b.b32 [taddr], immHalfSplitoff, values; // PTX ISA 86, SM_100a, SM_101a + template = true> + __device__ static inline void tcgen05_st_16x32bx2_unpack_16b( + uint32_t taddr, + cuda::ptx::n32_t immHalfSplitoff, + const B32 (&values)[8]); + +tcgen05.st.sync.aligned.16x32bx2.x16.b32 +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +.. code:: cuda + + // tcgen05.st.sync.aligned.16x32bx2.x16.b32 [taddr], immHalfSplitoff, values; // PTX ISA 86, SM_100a, SM_101a + template = true> + __device__ static inline void tcgen05_st_16x32bx2( + uint32_t taddr, + cuda::ptx::n32_t immHalfSplitoff, + const B32 (&values)[16]); + +tcgen05.st.sync.aligned.16x32bx2.x16.unpack::16b.b32 +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +.. code:: cuda + + // tcgen05.st.sync.aligned.16x32bx2.x16.unpack::16b.b32 [taddr], immHalfSplitoff, values; // PTX ISA 86, SM_100a, SM_101a + template = true> + __device__ static inline void tcgen05_st_16x32bx2_unpack_16b( + uint32_t taddr, + cuda::ptx::n32_t immHalfSplitoff, + const B32 (&values)[16]); + +tcgen05.st.sync.aligned.16x32bx2.x32.b32 +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +.. code:: cuda + + // tcgen05.st.sync.aligned.16x32bx2.x32.b32 [taddr], immHalfSplitoff, values; // PTX ISA 86, SM_100a, SM_101a + template = true> + __device__ static inline void tcgen05_st_16x32bx2( + uint32_t taddr, + cuda::ptx::n32_t immHalfSplitoff, + const B32 (&values)[32]); + +tcgen05.st.sync.aligned.16x32bx2.x32.unpack::16b.b32 +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +.. code:: cuda + + // tcgen05.st.sync.aligned.16x32bx2.x32.unpack::16b.b32 [taddr], immHalfSplitoff, values; // PTX ISA 86, SM_100a, SM_101a + template = true> + __device__ static inline void tcgen05_st_16x32bx2_unpack_16b( + uint32_t taddr, + cuda::ptx::n32_t immHalfSplitoff, + const B32 (&values)[32]); + +tcgen05.st.sync.aligned.16x32bx2.x64.b32 +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +.. 
code:: cuda + + // tcgen05.st.sync.aligned.16x32bx2.x64.b32 [taddr], immHalfSplitoff, values; // PTX ISA 86, SM_100a, SM_101a + template = true> + __device__ static inline void tcgen05_st_16x32bx2( + uint32_t taddr, + cuda::ptx::n32_t immHalfSplitoff, + const B32 (&values)[64]); + +tcgen05.st.sync.aligned.16x32bx2.x64.unpack::16b.b32 +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +.. code:: cuda + + // tcgen05.st.sync.aligned.16x32bx2.x64.unpack::16b.b32 [taddr], immHalfSplitoff, values; // PTX ISA 86, SM_100a, SM_101a + template = true> + __device__ static inline void tcgen05_st_16x32bx2_unpack_16b( + uint32_t taddr, + cuda::ptx::n32_t immHalfSplitoff, + const B32 (&values)[64]); + +tcgen05.st.sync.aligned.16x32bx2.x128.b32 +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +.. code:: cuda + + // tcgen05.st.sync.aligned.16x32bx2.x128.b32 [taddr], immHalfSplitoff, values; // PTX ISA 86, SM_100a, SM_101a + template = true> + __device__ static inline void tcgen05_st_16x32bx2( + uint32_t taddr, + cuda::ptx::n32_t immHalfSplitoff, + const B32 (&values)[128]); + +tcgen05.st.sync.aligned.16x32bx2.x128.unpack::16b.b32 +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +.. code:: cuda + + // tcgen05.st.sync.aligned.16x32bx2.x128.unpack::16b.b32 [taddr], immHalfSplitoff, values; // PTX ISA 86, SM_100a, SM_101a + template = true> + __device__ static inline void tcgen05_st_16x32bx2_unpack_16b( + uint32_t taddr, + cuda::ptx::n32_t immHalfSplitoff, + const B32 (&values)[128]); diff --git a/docs/libcudacxx/ptx/instructions/generated/tcgen05_wait.rst b/docs/libcudacxx/ptx/instructions/generated/tcgen05_wait.rst new file mode 100644 index 00000000000..ec48818eecc --- /dev/null +++ b/docs/libcudacxx/ptx/instructions/generated/tcgen05_wait.rst @@ -0,0 +1,18 @@ +.. + This file was automatically generated. Do not edit. + +tcgen05.wait::ld.sync.aligned +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +.. code:: cuda + + // tcgen05.wait::ld.sync.aligned; // PTX ISA 86, SM_100a, SM_101a + template + __device__ static inline void tcgen05_wait_ld(); + +tcgen05.wait::st.sync.aligned +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +.. code:: cuda + + // tcgen05.wait::st.sync.aligned; // PTX ISA 86, SM_100a, SM_101a + template + __device__ static inline void tcgen05_wait_st(); diff --git a/docs/libcudacxx/ptx/instructions/generated/tensormap_replace.rst b/docs/libcudacxx/ptx/instructions/generated/tensormap_replace.rst index a8c4a260782..fbf010d6009 100644 --- a/docs/libcudacxx/ptx/instructions/generated/tensormap_replace.rst +++ b/docs/libcudacxx/ptx/instructions/generated/tensormap_replace.rst @@ -5,9 +5,9 @@ tensormap.replace.tile.global_address.global.b1024.b64 ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ .. code:: cuda - // tensormap.replace.tile.global_address.space.b1024.b64 [tm_addr], new_val; // PTX ISA 83, SM_90a + // tensormap.replace.tile.global_address.space.b1024.b64 [tm_addr], new_val; // PTX ISA 83, SM_90a, SM_100a, SM_101a // .space = { .global } - template + template = true> __device__ static inline void tensormap_replace_global_address( cuda::ptx::space_global_t, void* tm_addr, @@ -17,9 +17,9 @@ tensormap.replace.tile.global_address.shared::cta.b1024.b64 ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ .. 
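
An illustrative sketch (editorial, not part of the generated files): one way the ``tcgen05.st`` and ``tcgen05.wait::st`` wrappers documented above might be combined on an SM_100a/SM_101a target with PTX ISA 8.6, assuming ``<cuda/ptx>`` is included and that ``taddr`` already holds a valid tensor-memory address obtained elsewhere (for example through the ``tcgen05.alloc`` path).

.. code:: cuda

   #include <cuda/ptx>

   __device__ void store_one_word_to_tmem(uint32_t taddr, uint32_t value)
   {
     // 32x32b.x1 stores a single 32-bit element per lane into tensor memory.
     uint32_t values[1] = {value};
     cuda::ptx::tcgen05_st_32x32b(taddr, values);
     // Wait for prior tcgen05.st accesses issued by this thread to complete.
     cuda::ptx::tcgen05_wait_st();
   }
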
code:: cuda - // tensormap.replace.tile.global_address.space.b1024.b64 [tm_addr], new_val; // PTX ISA 83, SM_90a + // tensormap.replace.tile.global_address.space.b1024.b64 [tm_addr], new_val; // PTX ISA 83, SM_90a, SM_100a, SM_101a // .space = { .shared::cta } - template + template = true> __device__ static inline void tensormap_replace_global_address( cuda::ptx::space_shared_t, void* tm_addr, @@ -29,9 +29,9 @@ tensormap.replace.tile.rank.global.b1024.b32 ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ .. code:: cuda - // tensormap.replace.tile.rank.space.b1024.b32 [tm_addr], new_val; // PTX ISA 83, SM_90a + // tensormap.replace.tile.rank.space.b1024.b32 [tm_addr], new_val; // PTX ISA 83, SM_90a, SM_100a, SM_101a // .space = { .global } - template + template = true> __device__ static inline void tensormap_replace_rank( cuda::ptx::space_global_t, void* tm_addr, @@ -41,9 +41,9 @@ tensormap.replace.tile.rank.shared::cta.b1024.b32 ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ .. code:: cuda - // tensormap.replace.tile.rank.space.b1024.b32 [tm_addr], new_val; // PTX ISA 83, SM_90a + // tensormap.replace.tile.rank.space.b1024.b32 [tm_addr], new_val; // PTX ISA 83, SM_90a, SM_100a, SM_101a // .space = { .shared::cta } - template + template = true> __device__ static inline void tensormap_replace_rank( cuda::ptx::space_shared_t, void* tm_addr, @@ -53,9 +53,9 @@ tensormap.replace.tile.box_dim.global.b1024.b32 ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ .. code:: cuda - // tensormap.replace.tile.box_dim.space.b1024.b32 [tm_addr], ord, new_val; // PTX ISA 83, SM_90a + // tensormap.replace.tile.box_dim.space.b1024.b32 [tm_addr], ord, new_val; // PTX ISA 83, SM_90a, SM_100a, SM_101a // .space = { .global } - template + template = true> __device__ static inline void tensormap_replace_box_dim( cuda::ptx::space_global_t, void* tm_addr, @@ -66,9 +66,9 @@ tensormap.replace.tile.box_dim.shared::cta.b1024.b32 ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ .. code:: cuda - // tensormap.replace.tile.box_dim.space.b1024.b32 [tm_addr], ord, new_val; // PTX ISA 83, SM_90a + // tensormap.replace.tile.box_dim.space.b1024.b32 [tm_addr], ord, new_val; // PTX ISA 83, SM_90a, SM_100a, SM_101a // .space = { .shared::cta } - template + template = true> __device__ static inline void tensormap_replace_box_dim( cuda::ptx::space_shared_t, void* tm_addr, @@ -79,9 +79,9 @@ tensormap.replace.tile.global_dim.global.b1024.b32 ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ .. code:: cuda - // tensormap.replace.tile.global_dim.space.b1024.b32 [tm_addr], ord, new_val; // PTX ISA 83, SM_90a + // tensormap.replace.tile.global_dim.space.b1024.b32 [tm_addr], ord, new_val; // PTX ISA 83, SM_90a, SM_100a, SM_101a // .space = { .global } - template + template = true> __device__ static inline void tensormap_replace_global_dim( cuda::ptx::space_global_t, void* tm_addr, @@ -92,9 +92,9 @@ tensormap.replace.tile.global_dim.shared::cta.b1024.b32 ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ .. code:: cuda - // tensormap.replace.tile.global_dim.space.b1024.b32 [tm_addr], ord, new_val; // PTX ISA 83, SM_90a + // tensormap.replace.tile.global_dim.space.b1024.b32 [tm_addr], ord, new_val; // PTX ISA 83, SM_90a, SM_100a, SM_101a // .space = { .shared::cta } - template + template = true> __device__ static inline void tensormap_replace_global_dim( cuda::ptx::space_shared_t, void* tm_addr, @@ -105,9 +105,9 @@ tensormap.replace.tile.global_stride.global.b1024.b64 ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ .. 
code:: cuda - // tensormap.replace.tile.global_stride.space.b1024.b64 [tm_addr], ord, new_val; // PTX ISA 83, SM_90a + // tensormap.replace.tile.global_stride.space.b1024.b64 [tm_addr], ord, new_val; // PTX ISA 83, SM_90a, SM_100a, SM_101a // .space = { .global } - template + template = true> __device__ static inline void tensormap_replace_global_stride( cuda::ptx::space_global_t, void* tm_addr, @@ -118,9 +118,9 @@ tensormap.replace.tile.global_stride.shared::cta.b1024.b64 ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ .. code:: cuda - // tensormap.replace.tile.global_stride.space.b1024.b64 [tm_addr], ord, new_val; // PTX ISA 83, SM_90a + // tensormap.replace.tile.global_stride.space.b1024.b64 [tm_addr], ord, new_val; // PTX ISA 83, SM_90a, SM_100a, SM_101a // .space = { .shared::cta } - template + template = true> __device__ static inline void tensormap_replace_global_stride( cuda::ptx::space_shared_t, void* tm_addr, @@ -131,9 +131,35 @@ tensormap.replace.tile.element_stride.global.b1024.b32 ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ .. code:: cuda - // tensormap.replace.tile.element_stride.space.b1024.b32 [tm_addr], ord, new_val; // PTX ISA 83, SM_90a + // tensormap.replace.tile.element_stride.space.b1024.b32 [tm_addr], ord, new_val; // PTX ISA 83, SM_90a, SM_100a, SM_101a // .space = { .global } - template + template = true> + __device__ static inline void tensormap_replace_element_stride( + cuda::ptx::space_global_t, + void* tm_addr, + cuda::ptx::n32_t ord, + B32 new_val); + +tensormap.replace.tile.element_stride.shared::cta.b1024.b32 +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +.. code:: cuda + + // tensormap.replace.tile.element_stride.space.b1024.b32 [tm_addr], ord, new_val; // PTX ISA 83, SM_90a, SM_100a, SM_101a + // .space = { .shared::cta } + template = true> + __device__ static inline void tensormap_replace_element_stride( + cuda::ptx::space_shared_t, + void* tm_addr, + cuda::ptx::n32_t ord, + B32 new_val); + +tensormap.replace.tile.element_stride.global.b1024.b32 +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +.. code:: cuda + + // tensormap.replace.tile.element_stride.space.b1024.b32 [tm_addr], ord, new_val; // PTX ISA 83, SM_90a, SM_100a, SM_101a + // .space = { .global } + template = true> __device__ static inline void tensormap_replace_element_size( cuda::ptx::space_global_t, void* tm_addr, @@ -144,9 +170,9 @@ tensormap.replace.tile.element_stride.shared::cta.b1024.b32 ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ .. code:: cuda - // tensormap.replace.tile.element_stride.space.b1024.b32 [tm_addr], ord, new_val; // PTX ISA 83, SM_90a + // tensormap.replace.tile.element_stride.space.b1024.b32 [tm_addr], ord, new_val; // PTX ISA 83, SM_90a, SM_100a, SM_101a // .space = { .shared::cta } - template + template = true> __device__ static inline void tensormap_replace_element_size( cuda::ptx::space_shared_t, void* tm_addr, @@ -157,7 +183,7 @@ tensormap.replace.tile.elemtype.global.b1024.b32 ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ .. code:: cuda - // tensormap.replace.tile.elemtype.space.b1024.b32 [tm_addr], new_val; // PTX ISA 83, SM_90a + // tensormap.replace.tile.elemtype.space.b1024.b32 [tm_addr], new_val; // PTX ISA 83, SM_90a, SM_100a, SM_101a // .space = { .global } template __device__ static inline void tensormap_replace_elemtype( @@ -169,7 +195,7 @@ tensormap.replace.tile.elemtype.shared::cta.b1024.b32 ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ .. 
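
A minimal usage sketch (editorial, not part of the generated files): patching the base pointer of a tensor map that lives in global memory, assuming an SM_90a/SM_100a/SM_101a target and that ``<cuda/ptx>`` is included. Any 8-byte type satisfies the ``B64`` parameter of ``tensormap_replace_global_address``, so a plain pointer works.

.. code:: cuda

   #include <cuda/ptx>

   __device__ void retarget_tensormap(void* tm_addr, void* new_base)
   {
     // Overwrite the global_address field of the 1024-bit tensor map at tm_addr.
     cuda::ptx::tensormap_replace_global_address(cuda::ptx::space_global, tm_addr, new_base);
   }
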
code:: cuda - // tensormap.replace.tile.elemtype.space.b1024.b32 [tm_addr], new_val; // PTX ISA 83, SM_90a + // tensormap.replace.tile.elemtype.space.b1024.b32 [tm_addr], new_val; // PTX ISA 83, SM_90a, SM_100a, SM_101a // .space = { .shared::cta } template __device__ static inline void tensormap_replace_elemtype( @@ -181,7 +207,7 @@ tensormap.replace.tile.interleave_layout.global.b1024.b32 ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ .. code:: cuda - // tensormap.replace.tile.interleave_layout.space.b1024.b32 [tm_addr], new_val; // PTX ISA 83, SM_90a + // tensormap.replace.tile.interleave_layout.space.b1024.b32 [tm_addr], new_val; // PTX ISA 83, SM_90a, SM_100a, SM_101a // .space = { .global } template __device__ static inline void tensormap_replace_interleave_layout( @@ -193,7 +219,7 @@ tensormap.replace.tile.interleave_layout.shared::cta.b1024.b32 ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ .. code:: cuda - // tensormap.replace.tile.interleave_layout.space.b1024.b32 [tm_addr], new_val; // PTX ISA 83, SM_90a + // tensormap.replace.tile.interleave_layout.space.b1024.b32 [tm_addr], new_val; // PTX ISA 83, SM_90a, SM_100a, SM_101a // .space = { .shared::cta } template __device__ static inline void tensormap_replace_interleave_layout( @@ -205,7 +231,7 @@ tensormap.replace.tile.swizzle_mode.global.b1024.b32 ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ .. code:: cuda - // tensormap.replace.tile.swizzle_mode.space.b1024.b32 [tm_addr], new_val; // PTX ISA 83, SM_90a + // tensormap.replace.tile.swizzle_mode.space.b1024.b32 [tm_addr], new_val; // PTX ISA 83, SM_90a, SM_100a, SM_101a // .space = { .global } template __device__ static inline void tensormap_replace_swizzle_mode( @@ -217,7 +243,7 @@ tensormap.replace.tile.swizzle_mode.shared::cta.b1024.b32 ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ .. code:: cuda - // tensormap.replace.tile.swizzle_mode.space.b1024.b32 [tm_addr], new_val; // PTX ISA 83, SM_90a + // tensormap.replace.tile.swizzle_mode.space.b1024.b32 [tm_addr], new_val; // PTX ISA 83, SM_90a, SM_100a, SM_101a // .space = { .shared::cta } template __device__ static inline void tensormap_replace_swizzle_mode( @@ -229,7 +255,7 @@ tensormap.replace.tile.fill_mode.global.b1024.b32 ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ .. code:: cuda - // tensormap.replace.tile.fill_mode.space.b1024.b32 [tm_addr], new_val; // PTX ISA 83, SM_90a + // tensormap.replace.tile.fill_mode.space.b1024.b32 [tm_addr], new_val; // PTX ISA 83, SM_90a, SM_100a, SM_101a // .space = { .global } template __device__ static inline void tensormap_replace_fill_mode( @@ -241,10 +267,34 @@ tensormap.replace.tile.fill_mode.shared::cta.b1024.b32 ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ .. code:: cuda - // tensormap.replace.tile.fill_mode.space.b1024.b32 [tm_addr], new_val; // PTX ISA 83, SM_90a + // tensormap.replace.tile.fill_mode.space.b1024.b32 [tm_addr], new_val; // PTX ISA 83, SM_90a, SM_100a, SM_101a // .space = { .shared::cta } template __device__ static inline void tensormap_replace_fill_mode( cuda::ptx::space_shared_t, void* tm_addr, cuda::ptx::n32_t new_val); + +tensormap.replace.tile.swizzle_atomicity.global.b1024.b32 +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +.. 
code:: cuda
+
+   // tensormap.replace.tile.swizzle_atomicity.space.b1024.b32 [tm_addr], new_val; // PTX ISA 86, SM_100a, SM_101a
+   // .space = { .global }
+   template <int N32>
+   __device__ static inline void tensormap_replace_swizzle_atomicity(
+     cuda::ptx::space_global_t,
+     void* tm_addr,
+     cuda::ptx::n32_t<N32> new_val);
+
+tensormap.replace.tile.swizzle_atomicity.shared::cta.b1024.b32
+^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
+.. code:: cuda
+
+   // tensormap.replace.tile.swizzle_atomicity.space.b1024.b32 [tm_addr], new_val; // PTX ISA 86, SM_100a, SM_101a
+   // .space = { .shared::cta }
+   template <int N32>
+   __device__ static inline void tensormap_replace_swizzle_atomicity(
+     cuda::ptx::space_shared_t,
+     void* tm_addr,
+     cuda::ptx::n32_t<N32> new_val);
diff --git a/docs/libcudacxx/ptx/instructions/special_registers.rst b/docs/libcudacxx/ptx/instructions/special_registers.rst
index 1e9597fa726..1981f7fb908 100644
--- a/docs/libcudacxx/ptx/instructions/special_registers.rst
+++ b/docs/libcudacxx/ptx/instructions/special_registers.rst
@@ -6,4 +6,4 @@ Special registers
 
 - PTX ISA: `Special Register `__
 
--.. include:: generated/special_registers.rst
+.. include:: generated/get_sreg.rst
diff --git a/docs/repo.toml b/docs/repo.toml
index 3313723c527..c36ebcf9244 100644
--- a/docs/repo.toml
+++ b/docs/repo.toml
@@ -52,7 +52,7 @@ api_output_directory = "api"
 use_fast_doxygen_conversion = true
 sphinx_generate_doxygen_groups = true
 sphinx_generate_doxygen_pages = true
-sphinx_exclude_patterns = []
+sphinx_exclude_patterns = ['ptx/instructions/generated']
 
 [repo_docs.projects.cub]
 name = "CUB"
diff --git a/libcudacxx/include/cuda/__ptx/instructions/generated/barrier_cluster.h b/libcudacxx/include/cuda/__ptx/instructions/generated/barrier_cluster.h
index 10d55714c5b..75a72db7024 100644
--- a/libcudacxx/include/cuda/__ptx/instructions/generated/barrier_cluster.h
+++ b/libcudacxx/include/cuda/__ptx/instructions/generated/barrier_cluster.h
@@ -14,15 +14,12 @@ extern "C" _CCCL_DEVICE void __cuda_ptx_barrier_cluster_arrive_is_not_supported_
 template <typename = void>
 _CCCL_DEVICE static inline void barrier_cluster_arrive()
 {
-  NV_IF_ELSE_TARGET(
-    NV_PROVIDES_SM_90,
-    (asm volatile("barrier.cluster.arrive;"
-                  :
-                  :
-                  : "memory");),
-    (
-      // Unsupported architectures will have a linker error with a semi-decent error message
-      __cuda_ptx_barrier_cluster_arrive_is_not_supported_before_SM_90__();));
+# if _CCCL_CUDA_COMPILER(NVHPC) || __CUDA_ARCH__ >= 900
+  asm volatile("barrier.cluster.arrive;" : : : "memory");
+# else
+  // Unsupported architectures will have a linker error with a semi-decent error message
+  __cuda_ptx_barrier_cluster_arrive_is_not_supported_before_SM_90__();
+# endif
 }
 #endif // __cccl_ptx_isa >= 780
 
@@ -37,15 +34,12 @@ extern "C" _CCCL_DEVICE void __cuda_ptx_barrier_cluster_wait_is_not_supported_be
 template <typename = void>
 _CCCL_DEVICE static inline void barrier_cluster_wait()
 {
-  NV_IF_ELSE_TARGET(
-    NV_PROVIDES_SM_90,
-    (asm volatile("barrier.cluster.wait;"
-                  :
-                  :
-                  : "memory");),
-    (
-      // Unsupported architectures will have a linker error with a semi-decent error message
-      __cuda_ptx_barrier_cluster_wait_is_not_supported_before_SM_90__();));
+# if _CCCL_CUDA_COMPILER(NVHPC) || __CUDA_ARCH__ >= 900
+  asm volatile("barrier.cluster.wait;" : : : "memory");
+# else
+  // Unsupported architectures will have a linker error with a semi-decent error message
+  __cuda_ptx_barrier_cluster_wait_is_not_supported_before_SM_90__();
+# endif
 }
 #endif // __cccl_ptx_isa >= 780
 
@@ -62,16 +56,13 @@ extern "C" _CCCL_DEVICE void
__cuda_ptx_barrier_cluster_arrive_is_not_supported_ template _CCCL_DEVICE static inline void barrier_cluster_arrive(sem_release_t) { - // __sem == sem_release (due to parameter type constraint) - NV_IF_ELSE_TARGET( - NV_PROVIDES_SM_90, - (asm volatile("barrier.cluster.arrive.release;" - : - : - : "memory");), - ( - // Unsupported architectures will have a linker error with a semi-decent error message - __cuda_ptx_barrier_cluster_arrive_is_not_supported_before_SM_90__();)); +// __sem == sem_release (due to parameter type constraint) +# if _CCCL_CUDA_COMPILER(NVHPC) || __CUDA_ARCH__ >= 900 + asm volatile("barrier.cluster.arrive.release;" : : : "memory"); +# else + // Unsupported architectures will have a linker error with a semi-decent error message + __cuda_ptx_barrier_cluster_arrive_is_not_supported_before_SM_90__(); +# endif } #endif // __cccl_ptx_isa >= 800 @@ -88,16 +79,13 @@ extern "C" _CCCL_DEVICE void __cuda_ptx_barrier_cluster_arrive_is_not_supported_ template _CCCL_DEVICE static inline void barrier_cluster_arrive(sem_relaxed_t) { - // __sem == sem_relaxed (due to parameter type constraint) - NV_IF_ELSE_TARGET( - NV_PROVIDES_SM_90, - (asm volatile("barrier.cluster.arrive.relaxed;" - : - : - :);), - ( - // Unsupported architectures will have a linker error with a semi-decent error message - __cuda_ptx_barrier_cluster_arrive_is_not_supported_before_SM_90__();)); +// __sem == sem_relaxed (due to parameter type constraint) +# if _CCCL_CUDA_COMPILER(NVHPC) || __CUDA_ARCH__ >= 900 + asm volatile("barrier.cluster.arrive.relaxed;" : : :); +# else + // Unsupported architectures will have a linker error with a semi-decent error message + __cuda_ptx_barrier_cluster_arrive_is_not_supported_before_SM_90__(); +# endif } #endif // __cccl_ptx_isa >= 800 @@ -114,16 +102,13 @@ extern "C" _CCCL_DEVICE void __cuda_ptx_barrier_cluster_wait_is_not_supported_be template _CCCL_DEVICE static inline void barrier_cluster_wait(sem_acquire_t) { - // __sem == sem_acquire (due to parameter type constraint) - NV_IF_ELSE_TARGET( - NV_PROVIDES_SM_90, - (asm volatile("barrier.cluster.wait.acquire;" - : - : - : "memory");), - ( - // Unsupported architectures will have a linker error with a semi-decent error message - __cuda_ptx_barrier_cluster_wait_is_not_supported_before_SM_90__();)); +// __sem == sem_acquire (due to parameter type constraint) +# if _CCCL_CUDA_COMPILER(NVHPC) || __CUDA_ARCH__ >= 900 + asm volatile("barrier.cluster.wait.acquire;" : : : "memory"); +# else + // Unsupported architectures will have a linker error with a semi-decent error message + __cuda_ptx_barrier_cluster_wait_is_not_supported_before_SM_90__(); +# endif } #endif // __cccl_ptx_isa >= 800 diff --git a/libcudacxx/include/cuda/__ptx/instructions/generated/barrier_cluster_aligned.h b/libcudacxx/include/cuda/__ptx/instructions/generated/barrier_cluster_aligned.h new file mode 100644 index 00000000000..80fe3796e69 --- /dev/null +++ b/libcudacxx/include/cuda/__ptx/instructions/generated/barrier_cluster_aligned.h @@ -0,0 +1,130 @@ +// This file was automatically generated. Do not edit. 
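
// Illustrative only (editorial, not part of the generated header): a minimal sketch of
// how the release/acquire cluster-barrier wrappers above might be used from a kernel
// launched with a thread-block cluster on SM_90 or newer; assumes <cuda/ptx> is included.
#include <cuda/ptx>

__global__ void cluster_handshake_example()
{
  // Release this block's prior writes to the rest of the cluster...
  cuda::ptx::barrier_cluster_arrive(cuda::ptx::sem_release);
  // ...then wait until every block in the cluster has arrived (acquire ordering).
  cuda::ptx::barrier_cluster_wait(cuda::ptx::sem_acquire);
}
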
+ +#ifndef _CUDA_PTX_GENERATED_BARRIER_CLUSTER_ALIGNED_H_ +#define _CUDA_PTX_GENERATED_BARRIER_CLUSTER_ALIGNED_H_ + +/* +// barrier.cluster.arrive.aligned; // PTX ISA 78, SM_90 +// .aligned = { .aligned } +// Marked volatile and as clobbering memory +template +__device__ static inline void barrier_cluster_arrive( + cuda::ptx::dot_aligned_t); +*/ +#if __cccl_ptx_isa >= 780 +extern "C" _CCCL_DEVICE void __cuda_ptx_barrier_cluster_arrive_is_not_supported_before_SM_90__(); +template +_CCCL_DEVICE static inline void barrier_cluster_arrive(dot_aligned_t) +{ +// __aligned == aligned (due to parameter type constraint) +# if _CCCL_CUDA_COMPILER(NVHPC) || __CUDA_ARCH__ >= 900 + asm volatile("barrier.cluster.arrive.aligned;" : : : "memory"); +# else + // Unsupported architectures will have a linker error with a semi-decent error message + __cuda_ptx_barrier_cluster_arrive_is_not_supported_before_SM_90__(); +# endif +} +#endif // __cccl_ptx_isa >= 780 + +/* +// barrier.cluster.wait.aligned; // PTX ISA 78, SM_90 +// .aligned = { .aligned } +// Marked volatile and as clobbering memory +template +__device__ static inline void barrier_cluster_wait( + cuda::ptx::dot_aligned_t); +*/ +#if __cccl_ptx_isa >= 780 +extern "C" _CCCL_DEVICE void __cuda_ptx_barrier_cluster_wait_is_not_supported_before_SM_90__(); +template +_CCCL_DEVICE static inline void barrier_cluster_wait(dot_aligned_t) +{ +// __aligned == aligned (due to parameter type constraint) +# if _CCCL_CUDA_COMPILER(NVHPC) || __CUDA_ARCH__ >= 900 + asm volatile("barrier.cluster.wait.aligned;" : : : "memory"); +# else + // Unsupported architectures will have a linker error with a semi-decent error message + __cuda_ptx_barrier_cluster_wait_is_not_supported_before_SM_90__(); +# endif +} +#endif // __cccl_ptx_isa >= 780 + +/* +// barrier.cluster.arrive.sem.aligned; // PTX ISA 80, SM_90 +// .sem = { .release } +// .aligned = { .aligned } +// Marked volatile and as clobbering memory +template +__device__ static inline void barrier_cluster_arrive( + cuda::ptx::sem_release_t, + cuda::ptx::dot_aligned_t); +*/ +#if __cccl_ptx_isa >= 800 +extern "C" _CCCL_DEVICE void __cuda_ptx_barrier_cluster_arrive_is_not_supported_before_SM_90__(); +template +_CCCL_DEVICE static inline void barrier_cluster_arrive(sem_release_t, dot_aligned_t) +{ +// __sem == sem_release (due to parameter type constraint) +// __aligned == aligned (due to parameter type constraint) +# if _CCCL_CUDA_COMPILER(NVHPC) || __CUDA_ARCH__ >= 900 + asm volatile("barrier.cluster.arrive.release.aligned;" : : : "memory"); +# else + // Unsupported architectures will have a linker error with a semi-decent error message + __cuda_ptx_barrier_cluster_arrive_is_not_supported_before_SM_90__(); +# endif +} +#endif // __cccl_ptx_isa >= 800 + +/* +// barrier.cluster.arrive.sem.aligned; // PTX ISA 80, SM_90 +// .sem = { .relaxed } +// .aligned = { .aligned } +// Marked volatile +template +__device__ static inline void barrier_cluster_arrive( + cuda::ptx::sem_relaxed_t, + cuda::ptx::dot_aligned_t); +*/ +#if __cccl_ptx_isa >= 800 +extern "C" _CCCL_DEVICE void __cuda_ptx_barrier_cluster_arrive_is_not_supported_before_SM_90__(); +template +_CCCL_DEVICE static inline void barrier_cluster_arrive(sem_relaxed_t, dot_aligned_t) +{ +// __sem == sem_relaxed (due to parameter type constraint) +// __aligned == aligned (due to parameter type constraint) +# if _CCCL_CUDA_COMPILER(NVHPC) || __CUDA_ARCH__ >= 900 + asm volatile("barrier.cluster.arrive.relaxed.aligned;" : : :); +# else + // Unsupported architectures will have a linker 
error with a semi-decent error message + __cuda_ptx_barrier_cluster_arrive_is_not_supported_before_SM_90__(); +# endif +} +#endif // __cccl_ptx_isa >= 800 + +/* +// barrier.cluster.wait.sem.aligned; // PTX ISA 80, SM_90 +// .sem = { .acquire } +// .aligned = { .aligned } +// Marked volatile and as clobbering memory +template +__device__ static inline void barrier_cluster_wait( + cuda::ptx::sem_acquire_t, + cuda::ptx::dot_aligned_t); +*/ +#if __cccl_ptx_isa >= 800 +extern "C" _CCCL_DEVICE void __cuda_ptx_barrier_cluster_wait_is_not_supported_before_SM_90__(); +template +_CCCL_DEVICE static inline void barrier_cluster_wait(sem_acquire_t, dot_aligned_t) +{ +// __sem == sem_acquire (due to parameter type constraint) +// __aligned == aligned (due to parameter type constraint) +# if _CCCL_CUDA_COMPILER(NVHPC) || __CUDA_ARCH__ >= 900 + asm volatile("barrier.cluster.wait.acquire.aligned;" : : : "memory"); +# else + // Unsupported architectures will have a linker error with a semi-decent error message + __cuda_ptx_barrier_cluster_wait_is_not_supported_before_SM_90__(); +# endif +} +#endif // __cccl_ptx_isa >= 800 + +#endif // _CUDA_PTX_GENERATED_BARRIER_CLUSTER_ALIGNED_H_ diff --git a/libcudacxx/include/cuda/__ptx/instructions/generated/clusterlaunchcontrol.h b/libcudacxx/include/cuda/__ptx/instructions/generated/clusterlaunchcontrol.h new file mode 100644 index 00000000000..19e3f92bd13 --- /dev/null +++ b/libcudacxx/include/cuda/__ptx/instructions/generated/clusterlaunchcontrol.h @@ -0,0 +1,240 @@ +// This file was automatically generated. Do not edit. + +#ifndef _CUDA_PTX_GENERATED_CLUSTERLAUNCHCONTROL_H_ +#define _CUDA_PTX_GENERATED_CLUSTERLAUNCHCONTROL_H_ + +/* +// clusterlaunchcontrol.try_cancel.async.shared::cta.mbarrier::complete_tx::bytes.b128 [addr], [smem_bar]; // PTX ISA +86, SM_100 template +__device__ static inline void clusterlaunchcontrol_try_cancel( + void* addr, + uint64_t* smem_bar); +*/ +#if __cccl_ptx_isa >= 860 +extern "C" _CCCL_DEVICE void __cuda_ptx_clusterlaunchcontrol_try_cancel_is_not_supported_before_SM_100__(); +template +_CCCL_DEVICE static inline void clusterlaunchcontrol_try_cancel(void* __addr, _CUDA_VSTD::uint64_t* __smem_bar) +{ +# if _CCCL_CUDA_COMPILER(NVHPC) || __CUDA_ARCH__ >= 1000 + asm("clusterlaunchcontrol.try_cancel.async.shared::cta.mbarrier::complete_tx::bytes.b128 [%0], [%1];" + : + : "r"(__as_ptr_smem(__addr)), "r"(__as_ptr_smem(__smem_bar)) + : "memory"); +# else + // Unsupported architectures will have a linker error with a semi-decent error message + __cuda_ptx_clusterlaunchcontrol_try_cancel_is_not_supported_before_SM_100__(); +# endif +} +#endif // __cccl_ptx_isa >= 860 + +/* +// clusterlaunchcontrol.try_cancel.async.shared::cta.mbarrier::complete_tx::bytes.multicast::cluster::all.b128 [addr], +[smem_bar]; // PTX ISA 86, SM_100a, SM_101a template +__device__ static inline void clusterlaunchcontrol_try_cancel_multicast( + void* addr, + uint64_t* smem_bar); +*/ +#if __cccl_ptx_isa >= 860 +extern "C" _CCCL_DEVICE void +__cuda_ptx_clusterlaunchcontrol_try_cancel_multicast_is_not_supported_before_SM_100a_SM_101a__(); +template +_CCCL_DEVICE static inline void clusterlaunchcontrol_try_cancel_multicast(void* __addr, _CUDA_VSTD::uint64_t* __smem_bar) +{ +# if _CCCL_CUDA_COMPILER(NVHPC) || __CUDA_ARCH_FEAT_SM100_ALL || __CUDA_ARCH_FEAT_SM101_ALL + asm("clusterlaunchcontrol.try_cancel.async.shared::cta.mbarrier::complete_tx::bytes.multicast::cluster::all.b128 " + "[%0], [%1];" + : + : "r"(__as_ptr_smem(__addr)), "r"(__as_ptr_smem(__smem_bar)) + : "memory"); 
+# else + // Unsupported architectures will have a linker error with a semi-decent error message + __cuda_ptx_clusterlaunchcontrol_try_cancel_multicast_is_not_supported_before_SM_100a_SM_101a__(); +# endif +} +#endif // __cccl_ptx_isa >= 860 + +/* +// clusterlaunchcontrol.query_cancel.is_canceled.pred.b128 pred_is_canceled, try_cancel_response; // PTX ISA 86, SM_100 +template = true> +__device__ static inline bool clusterlaunchcontrol_query_cancel_is_canceled( + B128 try_cancel_response); +*/ +#if __cccl_ptx_isa >= 860 +extern "C" _CCCL_DEVICE void __cuda_ptx_clusterlaunchcontrol_query_cancel_is_canceled_is_not_supported_before_SM_100__(); +template = true> +_CCCL_DEVICE static inline bool clusterlaunchcontrol_query_cancel_is_canceled(_B128 __try_cancel_response) +{ + static_assert(sizeof(_B128) == 16, ""); +# if _CCCL_CUDA_COMPILER(NVHPC) || __CUDA_ARCH__ >= 1000 + _CUDA_VSTD::uint32_t __pred_is_canceled; + asm("{\n\t .reg .b128 B128_try_cancel_response; \n\t" + "mov.b128 B128_try_cancel_response, {%1, %2}; \n" + "{\n\t .reg .pred P_OUT; \n\t" + "clusterlaunchcontrol.query_cancel.is_canceled.pred.b128 P_OUT, B128_try_cancel_response;\n\t" + "selp.b32 %0, 1, 0, P_OUT; \n" + "}\n\t" + "}" + : "=r"(__pred_is_canceled) + : "l"((*reinterpret_cast(&__try_cancel_response)).x), + "l"((*reinterpret_cast(&__try_cancel_response)).y) + :); + return static_cast(__pred_is_canceled); +# else + // Unsupported architectures will have a linker error with a semi-decent error message + __cuda_ptx_clusterlaunchcontrol_query_cancel_is_canceled_is_not_supported_before_SM_100__(); + return false; +# endif +} +#endif // __cccl_ptx_isa >= 860 + +/* +// clusterlaunchcontrol.query_cancel.get_first_ctaid::x.b32.b128 ret_dim, try_cancel_response; // PTX ISA 86, SM_100 +template = true, typename B128, enable_if_t += true> +__device__ static inline B32 clusterlaunchcontrol_query_cancel_get_first_ctaid_x( + B128 try_cancel_response); +*/ +#if __cccl_ptx_isa >= 860 +extern "C" _CCCL_DEVICE void +__cuda_ptx_clusterlaunchcontrol_query_cancel_get_first_ctaid_x_is_not_supported_before_SM_100__(); +template = true, + typename _B128, + _CUDA_VSTD::enable_if_t = true> +_CCCL_DEVICE static inline _B32 clusterlaunchcontrol_query_cancel_get_first_ctaid_x(_B128 __try_cancel_response) +{ + static_assert(sizeof(_B32) == 4, ""); + static_assert(sizeof(_B128) == 16, ""); +# if _CCCL_CUDA_COMPILER(NVHPC) || __CUDA_ARCH__ >= 1000 + _CUDA_VSTD::uint32_t __ret_dim; + asm("{\n\t .reg .b128 B128_try_cancel_response; \n\t" + "mov.b128 B128_try_cancel_response, {%1, %2}; \n" + "clusterlaunchcontrol.query_cancel.get_first_ctaid::x.b32.b128 %0, B128_try_cancel_response;\n\t" + "}" + : "=r"(__ret_dim) + : "l"((*reinterpret_cast(&__try_cancel_response)).x), + "l"((*reinterpret_cast(&__try_cancel_response)).y) + :); + return *reinterpret_cast<_B32*>(&__ret_dim); +# else + // Unsupported architectures will have a linker error with a semi-decent error message + __cuda_ptx_clusterlaunchcontrol_query_cancel_get_first_ctaid_x_is_not_supported_before_SM_100__(); + _CUDA_VSTD::uint32_t __err_out_var = 0; + return *reinterpret_cast<_B32*>(&__err_out_var); +# endif +} +#endif // __cccl_ptx_isa >= 860 + +/* +// clusterlaunchcontrol.query_cancel.get_first_ctaid::y.b32.b128 ret_dim, try_cancel_response; // PTX ISA 86, SM_100 +template = true, typename B128, enable_if_t += true> +__device__ static inline B32 clusterlaunchcontrol_query_cancel_get_first_ctaid_y( + B128 try_cancel_response); +*/ +#if __cccl_ptx_isa >= 860 +extern "C" _CCCL_DEVICE void 
+__cuda_ptx_clusterlaunchcontrol_query_cancel_get_first_ctaid_y_is_not_supported_before_SM_100__(); +template = true, + typename _B128, + _CUDA_VSTD::enable_if_t = true> +_CCCL_DEVICE static inline _B32 clusterlaunchcontrol_query_cancel_get_first_ctaid_y(_B128 __try_cancel_response) +{ + static_assert(sizeof(_B32) == 4, ""); + static_assert(sizeof(_B128) == 16, ""); +# if _CCCL_CUDA_COMPILER(NVHPC) || __CUDA_ARCH__ >= 1000 + _CUDA_VSTD::uint32_t __ret_dim; + asm("{\n\t .reg .b128 B128_try_cancel_response; \n\t" + "mov.b128 B128_try_cancel_response, {%1, %2}; \n" + "clusterlaunchcontrol.query_cancel.get_first_ctaid::y.b32.b128 %0, B128_try_cancel_response;\n\t" + "}" + : "=r"(__ret_dim) + : "l"((*reinterpret_cast(&__try_cancel_response)).x), + "l"((*reinterpret_cast(&__try_cancel_response)).y) + :); + return *reinterpret_cast<_B32*>(&__ret_dim); +# else + // Unsupported architectures will have a linker error with a semi-decent error message + __cuda_ptx_clusterlaunchcontrol_query_cancel_get_first_ctaid_y_is_not_supported_before_SM_100__(); + _CUDA_VSTD::uint32_t __err_out_var = 0; + return *reinterpret_cast<_B32*>(&__err_out_var); +# endif +} +#endif // __cccl_ptx_isa >= 860 + +/* +// clusterlaunchcontrol.query_cancel.get_first_ctaid::z.b32.b128 ret_dim, try_cancel_response; // PTX ISA 86, SM_100 +template = true, typename B128, enable_if_t += true> +__device__ static inline B32 clusterlaunchcontrol_query_cancel_get_first_ctaid_z( + B128 try_cancel_response); +*/ +#if __cccl_ptx_isa >= 860 +extern "C" _CCCL_DEVICE void +__cuda_ptx_clusterlaunchcontrol_query_cancel_get_first_ctaid_z_is_not_supported_before_SM_100__(); +template = true, + typename _B128, + _CUDA_VSTD::enable_if_t = true> +_CCCL_DEVICE static inline _B32 clusterlaunchcontrol_query_cancel_get_first_ctaid_z(_B128 __try_cancel_response) +{ + static_assert(sizeof(_B32) == 4, ""); + static_assert(sizeof(_B128) == 16, ""); +# if _CCCL_CUDA_COMPILER(NVHPC) || __CUDA_ARCH__ >= 1000 + _CUDA_VSTD::uint32_t __ret_dim; + asm("{\n\t .reg .b128 B128_try_cancel_response; \n\t" + "mov.b128 B128_try_cancel_response, {%1, %2}; \n" + "clusterlaunchcontrol.query_cancel.get_first_ctaid::z.b32.b128 %0, B128_try_cancel_response;\n\t" + "}" + : "=r"(__ret_dim) + : "l"((*reinterpret_cast(&__try_cancel_response)).x), + "l"((*reinterpret_cast(&__try_cancel_response)).y) + :); + return *reinterpret_cast<_B32*>(&__ret_dim); +# else + // Unsupported architectures will have a linker error with a semi-decent error message + __cuda_ptx_clusterlaunchcontrol_query_cancel_get_first_ctaid_z_is_not_supported_before_SM_100__(); + _CUDA_VSTD::uint32_t __err_out_var = 0; + return *reinterpret_cast<_B32*>(&__err_out_var); +# endif +} +#endif // __cccl_ptx_isa >= 860 + +/* +// clusterlaunchcontrol.query_cancel.get_first_ctaid.v4.b32.b128 block_dim, try_cancel_response; // PTX ISA 86, SM_100 +template = true, typename B128, enable_if_t += true> +__device__ static inline void clusterlaunchcontrol_query_cancel_get_first_ctaid( + B32 (&block_dim)[4], + B128 try_cancel_response); +*/ +#if __cccl_ptx_isa >= 860 +extern "C" _CCCL_DEVICE void +__cuda_ptx_clusterlaunchcontrol_query_cancel_get_first_ctaid_is_not_supported_before_SM_100__(); +template = true, + typename _B128, + _CUDA_VSTD::enable_if_t = true> +_CCCL_DEVICE static inline void +clusterlaunchcontrol_query_cancel_get_first_ctaid(_B32 (&__block_dim)[4], _B128 __try_cancel_response) +{ + static_assert(sizeof(_B32) == 4, ""); + static_assert(sizeof(_B128) == 16, ""); +# if _CCCL_CUDA_COMPILER(NVHPC) || __CUDA_ARCH__ >= 
1000 + asm("{\n\t .reg .b128 B128_try_cancel_response; \n\t" + "mov.b128 B128_try_cancel_response, {%4, %5}; \n" + "clusterlaunchcontrol.query_cancel.get_first_ctaid.v4.b32.b128 {%0, %1, %2, %3}, B128_try_cancel_response;\n\t" + "}" + : "=r"(__block_dim[0]), "=r"(__block_dim[1]), "=r"(__block_dim[2]), "=r"(__block_dim[3]) + : "l"((*reinterpret_cast(&__try_cancel_response)).x), + "l"((*reinterpret_cast(&__try_cancel_response)).y) + :); +# else + // Unsupported architectures will have a linker error with a semi-decent error message + __cuda_ptx_clusterlaunchcontrol_query_cancel_get_first_ctaid_is_not_supported_before_SM_100__(); +# endif +} +#endif // __cccl_ptx_isa >= 860 + +#endif // _CUDA_PTX_GENERATED_CLUSTERLAUNCHCONTROL_H_ diff --git a/libcudacxx/include/cuda/__ptx/instructions/generated/cp_async_bulk.h b/libcudacxx/include/cuda/__ptx/instructions/generated/cp_async_bulk.h index 8ba40d45f64..a9aa3534611 100644 --- a/libcudacxx/include/cuda/__ptx/instructions/generated/cp_async_bulk.h +++ b/libcudacxx/include/cuda/__ptx/instructions/generated/cp_async_bulk.h @@ -4,8 +4,7 @@ #define _CUDA_PTX_GENERATED_CP_ASYNC_BULK_H_ /* -// cp.async.bulk.dst.src.mbarrier::complete_tx::bytes [dstMem], [srcMem], size, [smem_bar]; // 1a. unicast PTX ISA 80, -SM_90 +// cp.async.bulk.dst.src.mbarrier::complete_tx::bytes [dstMem], [srcMem], size, [smem_bar]; // PTX ISA 80, SM_90 // .dst = { .shared::cluster } // .src = { .global } template @@ -28,22 +27,60 @@ _CCCL_DEVICE static inline void cp_async_bulk( const _CUDA_VSTD::uint32_t& __size, _CUDA_VSTD::uint64_t* __smem_bar) { - // __space == space_cluster (due to parameter type constraint) - // __space == space_global (due to parameter type constraint) - NV_IF_ELSE_TARGET( - NV_PROVIDES_SM_90, - (asm("cp.async.bulk.shared::cluster.global.mbarrier::complete_tx::bytes [%0], [%1], %2, [%3]; // 1a. unicast" - : - : "r"(__as_ptr_smem(__dstMem)), "l"(__as_ptr_gmem(__srcMem)), "r"(__size), "r"(__as_ptr_smem(__smem_bar)) - : "memory");), - ( - // Unsupported architectures will have a linker error with a semi-decent error message - __cuda_ptx_cp_async_bulk_is_not_supported_before_SM_90__();)); +// __space == space_cluster (due to parameter type constraint) +// __space == space_global (due to parameter type constraint) +# if _CCCL_CUDA_COMPILER(NVHPC) || __CUDA_ARCH__ >= 900 + asm("cp.async.bulk.shared::cluster.global.mbarrier::complete_tx::bytes [%0], [%1], %2, [%3];" + : + : "r"(__as_ptr_smem(__dstMem)), "l"(__as_ptr_gmem(__srcMem)), "r"(__size), "r"(__as_ptr_smem(__smem_bar)) + : "memory"); +# else + // Unsupported architectures will have a linker error with a semi-decent error message + __cuda_ptx_cp_async_bulk_is_not_supported_before_SM_90__(); +# endif } #endif // __cccl_ptx_isa >= 800 /* -// cp.async.bulk.dst.src.mbarrier::complete_tx::bytes [dstMem], [srcMem], size, [rdsmem_bar]; // 2. 
PTX ISA 80, SM_90 +// cp.async.bulk.dst.src.mbarrier::complete_tx::bytes [dstMem], [srcMem], size, [smem_bar]; // PTX ISA 86, SM_90 +// .dst = { .shared::cta } +// .src = { .global } +template +__device__ static inline void cp_async_bulk( + cuda::ptx::space_shared_t, + cuda::ptx::space_global_t, + void* dstMem, + const void* srcMem, + const uint32_t& size, + uint64_t* smem_bar); +*/ +#if __cccl_ptx_isa >= 860 +extern "C" _CCCL_DEVICE void __cuda_ptx_cp_async_bulk_is_not_supported_before_SM_90__(); +template +_CCCL_DEVICE static inline void cp_async_bulk( + space_shared_t, + space_global_t, + void* __dstMem, + const void* __srcMem, + const _CUDA_VSTD::uint32_t& __size, + _CUDA_VSTD::uint64_t* __smem_bar) +{ +// __space == space_shared (due to parameter type constraint) +// __space == space_global (due to parameter type constraint) +# if _CCCL_CUDA_COMPILER(NVHPC) || __CUDA_ARCH__ >= 900 + asm("cp.async.bulk.shared::cta.global.mbarrier::complete_tx::bytes [%0], [%1], %2, [%3];" + : + : "r"(__as_ptr_smem(__dstMem)), "l"(__as_ptr_gmem(__srcMem)), "r"(__size), "r"(__as_ptr_smem(__smem_bar)) + : "memory"); +# else + // Unsupported architectures will have a linker error with a semi-decent error message + __cuda_ptx_cp_async_bulk_is_not_supported_before_SM_90__(); +# endif +} +#endif // __cccl_ptx_isa >= 860 + +/* +// cp.async.bulk.dst.src.mbarrier::complete_tx::bytes [dstMem], [srcMem], size, [rdsmem_bar]; // PTX ISA 80, SM_90 // .dst = { .shared::cluster } // .src = { .shared::cta } template @@ -66,25 +103,25 @@ _CCCL_DEVICE static inline void cp_async_bulk( const _CUDA_VSTD::uint32_t& __size, _CUDA_VSTD::uint64_t* __rdsmem_bar) { - // __space == space_cluster (due to parameter type constraint) - // __space == space_shared (due to parameter type constraint) - NV_IF_ELSE_TARGET( - NV_PROVIDES_SM_90, - (asm("cp.async.bulk.shared::cluster.shared::cta.mbarrier::complete_tx::bytes [%0], [%1], %2, [%3]; // 2. " - : - : "r"(__as_ptr_remote_dsmem(__dstMem)), - "r"(__as_ptr_smem(__srcMem)), - "r"(__size), - "r"(__as_ptr_remote_dsmem(__rdsmem_bar)) - : "memory");), - ( - // Unsupported architectures will have a linker error with a semi-decent error message - __cuda_ptx_cp_async_bulk_is_not_supported_before_SM_90__();)); +// __space == space_cluster (due to parameter type constraint) +// __space == space_shared (due to parameter type constraint) +# if _CCCL_CUDA_COMPILER(NVHPC) || __CUDA_ARCH__ >= 900 + asm("cp.async.bulk.shared::cluster.shared::cta.mbarrier::complete_tx::bytes [%0], [%1], %2, [%3];" + : + : "r"(__as_ptr_remote_dsmem(__dstMem)), + "r"(__as_ptr_smem(__srcMem)), + "r"(__size), + "r"(__as_ptr_remote_dsmem(__rdsmem_bar)) + : "memory"); +# else + // Unsupported architectures will have a linker error with a semi-decent error message + __cuda_ptx_cp_async_bulk_is_not_supported_before_SM_90__(); +# endif } #endif // __cccl_ptx_isa >= 800 /* -// cp.async.bulk.dst.src.bulk_group [dstMem], [srcMem], size; // 3. PTX ISA 80, SM_90 +// cp.async.bulk.dst.src.bulk_group [dstMem], [srcMem], size; // PTX ISA 80, SM_90 // .dst = { .global } // .src = { .shared::cta } template @@ -101,18 +138,56 @@ template _CCCL_DEVICE static inline void cp_async_bulk(space_global_t, space_shared_t, void* __dstMem, const void* __srcMem, const _CUDA_VSTD::uint32_t& __size) { - // __space == space_global (due to parameter type constraint) - // __space == space_shared (due to parameter type constraint) - NV_IF_ELSE_TARGET( - NV_PROVIDES_SM_90, - (asm("cp.async.bulk.global.shared::cta.bulk_group [%0], [%1], %2; // 3. 
" - : - : "l"(__as_ptr_gmem(__dstMem)), "r"(__as_ptr_smem(__srcMem)), "r"(__size) - : "memory");), - ( - // Unsupported architectures will have a linker error with a semi-decent error message - __cuda_ptx_cp_async_bulk_is_not_supported_before_SM_90__();)); +// __space == space_global (due to parameter type constraint) +// __space == space_shared (due to parameter type constraint) +# if _CCCL_CUDA_COMPILER(NVHPC) || __CUDA_ARCH__ >= 900 + asm("cp.async.bulk.global.shared::cta.bulk_group [%0], [%1], %2;" + : + : "l"(__as_ptr_gmem(__dstMem)), "r"(__as_ptr_smem(__srcMem)), "r"(__size) + : "memory"); +# else + // Unsupported architectures will have a linker error with a semi-decent error message + __cuda_ptx_cp_async_bulk_is_not_supported_before_SM_90__(); +# endif } #endif // __cccl_ptx_isa >= 800 +/* +// cp.async.bulk.dst.src.bulk_group.cp_mask [dstMem], [srcMem], size, byteMask; // PTX ISA 86, SM_100 +// .dst = { .global } +// .src = { .shared::cta } +template +__device__ static inline void cp_async_bulk_cp_mask( + cuda::ptx::space_global_t, + cuda::ptx::space_shared_t, + void* dstMem, + const void* srcMem, + const uint32_t& size, + const uint16_t& byteMask); +*/ +#if __cccl_ptx_isa >= 860 +extern "C" _CCCL_DEVICE void __cuda_ptx_cp_async_bulk_cp_mask_is_not_supported_before_SM_100__(); +template +_CCCL_DEVICE static inline void cp_async_bulk_cp_mask( + space_global_t, + space_shared_t, + void* __dstMem, + const void* __srcMem, + const _CUDA_VSTD::uint32_t& __size, + const _CUDA_VSTD::uint16_t& __byteMask) +{ +// __space == space_global (due to parameter type constraint) +// __space == space_shared (due to parameter type constraint) +# if _CCCL_CUDA_COMPILER(NVHPC) || __CUDA_ARCH__ >= 1000 + asm("cp.async.bulk.global.shared::cta.bulk_group.cp_mask [%0], [%1], %2, %3;" + : + : "l"(__as_ptr_gmem(__dstMem)), "r"(__as_ptr_smem(__srcMem)), "r"(__size), "h"(__byteMask) + : "memory"); +# else + // Unsupported architectures will have a linker error with a semi-decent error message + __cuda_ptx_cp_async_bulk_cp_mask_is_not_supported_before_SM_100__(); +# endif +} +#endif // __cccl_ptx_isa >= 860 + #endif // _CUDA_PTX_GENERATED_CP_ASYNC_BULK_H_ diff --git a/libcudacxx/include/cuda/__ptx/instructions/generated/cp_async_bulk_commit_group.h b/libcudacxx/include/cuda/__ptx/instructions/generated/cp_async_bulk_commit_group.h index 7bb58675ddb..3b906fd6922 100644 --- a/libcudacxx/include/cuda/__ptx/instructions/generated/cp_async_bulk_commit_group.h +++ b/libcudacxx/include/cuda/__ptx/instructions/generated/cp_async_bulk_commit_group.h @@ -13,15 +13,12 @@ extern "C" _CCCL_DEVICE void __cuda_ptx_cp_async_bulk_commit_group_is_not_suppor template _CCCL_DEVICE static inline void cp_async_bulk_commit_group() { - NV_IF_ELSE_TARGET( - NV_PROVIDES_SM_90, - (asm volatile("cp.async.bulk.commit_group;" - : - : - :);), - ( - // Unsupported architectures will have a linker error with a semi-decent error message - __cuda_ptx_cp_async_bulk_commit_group_is_not_supported_before_SM_90__();)); +# if _CCCL_CUDA_COMPILER(NVHPC) || __CUDA_ARCH__ >= 900 + asm volatile("cp.async.bulk.commit_group;" : : :); +# else + // Unsupported architectures will have a linker error with a semi-decent error message + __cuda_ptx_cp_async_bulk_commit_group_is_not_supported_before_SM_90__(); +# endif } #endif // __cccl_ptx_isa >= 800 diff --git a/libcudacxx/include/cuda/__ptx/instructions/generated/cp_async_bulk_multicast.h b/libcudacxx/include/cuda/__ptx/instructions/generated/cp_async_bulk_multicast.h index a5534ef0b48..7ac386343b9 100644 --- 
a/libcudacxx/include/cuda/__ptx/instructions/generated/cp_async_bulk_multicast.h +++ b/libcudacxx/include/cuda/__ptx/instructions/generated/cp_async_bulk_multicast.h @@ -4,8 +4,8 @@ #define _CUDA_PTX_GENERATED_CP_ASYNC_BULK_MULTICAST_H_ /* -// cp.async.bulk{.dst}{.src}.mbarrier::complete_tx::bytes.multicast::cluster [dstMem], [srcMem], size, [smem_bar], -ctaMask; // 1. PTX ISA 80, SM_90a +// cp.async.bulk.dst.src.mbarrier::complete_tx::bytes.multicast::cluster [dstMem], [srcMem], size, [smem_bar], ctaMask; +// PTX ISA 80, SM_90a, SM_100a, SM_101a // .dst = { .shared::cluster } // .src = { .global } template @@ -19,7 +19,7 @@ __device__ static inline void cp_async_bulk( const uint16_t& ctaMask); */ #if __cccl_ptx_isa >= 800 -extern "C" _CCCL_DEVICE void __cuda_ptx_cp_async_bulk_is_not_supported_before_SM_90a__(); +extern "C" _CCCL_DEVICE void __cuda_ptx_cp_async_bulk_is_not_supported_before_SM_90a_SM_100a_SM_101a__(); template _CCCL_DEVICE static inline void cp_async_bulk( space_cluster_t, @@ -30,22 +30,22 @@ _CCCL_DEVICE static inline void cp_async_bulk( _CUDA_VSTD::uint64_t* __smem_bar, const _CUDA_VSTD::uint16_t& __ctaMask) { - // __space == space_cluster (due to parameter type constraint) - // __space == space_global (due to parameter type constraint) - NV_IF_ELSE_TARGET( - NV_HAS_FEATURE_SM_90a, - (asm("cp.async.bulk.shared::cluster.global.mbarrier::complete_tx::bytes.multicast::cluster [%0], [%1], %2, [%3], " - "%4; // 1. " - : - : "r"(__as_ptr_smem(__dstMem)), - "l"(__as_ptr_gmem(__srcMem)), - "r"(__size), - "r"(__as_ptr_smem(__smem_bar)), - "h"(__ctaMask) - : "memory");), - ( - // Unsupported architectures will have a linker error with a semi-decent error message - __cuda_ptx_cp_async_bulk_is_not_supported_before_SM_90a__();)); +// __space == space_cluster (due to parameter type constraint) +// __space == space_global (due to parameter type constraint) +# if _CCCL_CUDA_COMPILER(NVHPC) || __CUDA_ARCH_FEAT_SM90_ALL || __CUDA_ARCH_FEAT_SM100_ALL \ + || __CUDA_ARCH_FEAT_SM101_ALL + asm("cp.async.bulk.shared::cluster.global.mbarrier::complete_tx::bytes.multicast::cluster [%0], [%1], %2, [%3], %4;" + : + : "r"(__as_ptr_smem(__dstMem)), + "l"(__as_ptr_gmem(__srcMem)), + "r"(__size), + "r"(__as_ptr_smem(__smem_bar)), + "h"(__ctaMask) + : "memory"); +# else + // Unsupported architectures will have a linker error with a semi-decent error message + __cuda_ptx_cp_async_bulk_is_not_supported_before_SM_90a_SM_100a_SM_101a__(); +# endif } #endif // __cccl_ptx_isa >= 800 diff --git a/libcudacxx/include/cuda/__ptx/instructions/generated/cp_async_bulk_tensor.h b/libcudacxx/include/cuda/__ptx/instructions/generated/cp_async_bulk_tensor.h index 3cbd26fda04..2326346f547 100644 --- a/libcudacxx/include/cuda/__ptx/instructions/generated/cp_async_bulk_tensor.h +++ b/libcudacxx/include/cuda/__ptx/instructions/generated/cp_async_bulk_tensor.h @@ -4,8 +4,8 @@ #define _CUDA_PTX_GENERATED_CP_ASYNC_BULK_TENSOR_H_ /* -// cp.async.bulk.tensor.1d.dst.src.tile.mbarrier::complete_tx::bytes [dstMem], [tensorMap, tensorCoords], [smem_bar];// -1a. 
PTX ISA 80, SM_90 +// cp.async.bulk.tensor.1d.dst.src.tile.mbarrier::complete_tx::bytes [dstMem], [tensorMap, tensorCoords], [smem_bar]; // +PTX ISA 80, SM_90 // .dst = { .shared::cluster } // .src = { .global } template @@ -28,23 +28,116 @@ _CCCL_DEVICE static inline void cp_async_bulk_tensor( const _CUDA_VSTD::int32_t (&__tensorCoords)[1], _CUDA_VSTD::uint64_t* __smem_bar) { - // __space == space_cluster (due to parameter type constraint) - // __space == space_global (due to parameter type constraint) - NV_IF_ELSE_TARGET( - NV_PROVIDES_SM_90, - (asm("cp.async.bulk.tensor.1d.shared::cluster.global.tile.mbarrier::complete_tx::bytes [%0], [%1, {%2}], [%3];// " - "1a." - : - : "r"(__as_ptr_smem(__dstMem)), "l"(__tensorMap), "r"(__tensorCoords[0]), "r"(__as_ptr_smem(__smem_bar)) - : "memory");), - ( - // Unsupported architectures will have a linker error with a semi-decent error message - __cuda_ptx_cp_async_bulk_tensor_is_not_supported_before_SM_90__();)); +// __space == space_cluster (due to parameter type constraint) +// __space == space_global (due to parameter type constraint) +# if _CCCL_CUDA_COMPILER(NVHPC) || __CUDA_ARCH__ >= 900 + asm("cp.async.bulk.tensor.1d.shared::cluster.global.tile.mbarrier::complete_tx::bytes [%0], [%1, {%2}], [%3];" + : + : "r"(__as_ptr_smem(__dstMem)), "l"(__tensorMap), "r"(__tensorCoords[0]), "r"(__as_ptr_smem(__smem_bar)) + : "memory"); +# else + // Unsupported architectures will have a linker error with a semi-decent error message + __cuda_ptx_cp_async_bulk_tensor_is_not_supported_before_SM_90__(); +# endif } #endif // __cccl_ptx_isa >= 800 /* -// cp.async.bulk.tensor.1d.dst.src.tile.bulk_group [tensorMap, tensorCoords], [srcMem]; // 3a. PTX ISA 80, SM_90 +// cp.async.bulk.tensor.1d.dst.src.tile.mbarrier::complete_tx::bytes [dstMem], [tensorMap, tensorCoords], [smem_bar]; // +PTX ISA 86, SM_90 +// .dst = { .shared::cta } +// .src = { .global } +template +__device__ static inline void cp_async_bulk_tensor( + cuda::ptx::space_shared_t, + cuda::ptx::space_global_t, + void* dstMem, + const void* tensorMap, + const int32_t (&tensorCoords)[1], + uint64_t* smem_bar); +*/ +#if __cccl_ptx_isa >= 860 +extern "C" _CCCL_DEVICE void __cuda_ptx_cp_async_bulk_tensor_is_not_supported_before_SM_90__(); +template +_CCCL_DEVICE static inline void cp_async_bulk_tensor( + space_shared_t, + space_global_t, + void* __dstMem, + const void* __tensorMap, + const _CUDA_VSTD::int32_t (&__tensorCoords)[1], + _CUDA_VSTD::uint64_t* __smem_bar) +{ +// __space == space_shared (due to parameter type constraint) +// __space == space_global (due to parameter type constraint) +# if _CCCL_CUDA_COMPILER(NVHPC) || __CUDA_ARCH__ >= 900 + asm("cp.async.bulk.tensor.1d.shared::cta.global.tile.mbarrier::complete_tx::bytes [%0], [%1, {%2}], [%3];" + : + : "r"(__as_ptr_smem(__dstMem)), "l"(__tensorMap), "r"(__tensorCoords[0]), "r"(__as_ptr_smem(__smem_bar)) + : "memory"); +# else + // Unsupported architectures will have a linker error with a semi-decent error message + __cuda_ptx_cp_async_bulk_tensor_is_not_supported_before_SM_90__(); +# endif +} +#endif // __cccl_ptx_isa >= 860 + +/* +// cp.async.bulk.tensor.1d.dst.src.tile.mbarrier::complete_tx::bytes.cta_group [dstMem], [tensorMap, tensorCoords], +[smem_bar]; // PTX ISA 86, SM_100a, SM_101a +// .dst = { .shared::cta } +// .src = { .global } +// .cta_group = { .cta_group::1, .cta_group::2 } +template +__device__ static inline void cp_async_bulk_tensor( + cuda::ptx::space_shared_t, + cuda::ptx::space_global_t, + cuda::ptx::cta_group_t cta_group, + 
void* dstMem, + const void* tensorMap, + const int32_t (&tensorCoords)[1], + uint64_t* smem_bar); +*/ +#if __cccl_ptx_isa >= 860 +extern "C" _CCCL_DEVICE void __cuda_ptx_cp_async_bulk_tensor_is_not_supported_before_SM_100a_SM_101a__(); +template +_CCCL_DEVICE static inline void cp_async_bulk_tensor( + space_shared_t, + space_global_t, + cta_group_t<_Cta_Group> __cta_group, + void* __dstMem, + const void* __tensorMap, + const _CUDA_VSTD::int32_t (&__tensorCoords)[1], + _CUDA_VSTD::uint64_t* __smem_bar) +{ + // __space == space_shared (due to parameter type constraint) + // __space == space_global (due to parameter type constraint) + static_assert(__cta_group == cta_group_1 || __cta_group == cta_group_2, ""); +# if _CCCL_CUDA_COMPILER(NVHPC) || __CUDA_ARCH_FEAT_SM100_ALL || __CUDA_ARCH_FEAT_SM101_ALL + _CCCL_IF_CONSTEXPR (__cta_group == cta_group_1) + { + asm("cp.async.bulk.tensor.1d.shared::cta.global.tile.mbarrier::complete_tx::bytes.cta_group::1 [%0], [%1, {%2}], " + "[%3];" + : + : "r"(__as_ptr_smem(__dstMem)), "l"(__tensorMap), "r"(__tensorCoords[0]), "r"(__as_ptr_smem(__smem_bar)) + : "memory"); + } + else _CCCL_IF_CONSTEXPR (__cta_group == cta_group_2) + { + asm("cp.async.bulk.tensor.1d.shared::cta.global.tile.mbarrier::complete_tx::bytes.cta_group::2 [%0], [%1, {%2}], " + "[%3];" + : + : "r"(__as_ptr_smem(__dstMem)), "l"(__tensorMap), "r"(__tensorCoords[0]), "r"(__as_ptr_smem(__smem_bar)) + : "memory"); + } +# else + // Unsupported architectures will have a linker error with a semi-decent error message + __cuda_ptx_cp_async_bulk_tensor_is_not_supported_before_SM_100a_SM_101a__(); +# endif +} +#endif // __cccl_ptx_isa >= 860 + +/* +// cp.async.bulk.tensor.1d.dst.src.tile.bulk_group [tensorMap, tensorCoords], [srcMem]; // PTX ISA 80, SM_90 // .dst = { .global } // .src = { .shared::cta } template @@ -65,23 +158,23 @@ _CCCL_DEVICE static inline void cp_async_bulk_tensor( const _CUDA_VSTD::int32_t (&__tensorCoords)[1], const void* __srcMem) { - // __space == space_global (due to parameter type constraint) - // __space == space_shared (due to parameter type constraint) - NV_IF_ELSE_TARGET( - NV_PROVIDES_SM_90, - (asm("cp.async.bulk.tensor.1d.global.shared::cta.tile.bulk_group [%0, {%1}], [%2]; // 3a." - : - : "l"(__tensorMap), "r"(__tensorCoords[0]), "r"(__as_ptr_smem(__srcMem)) - : "memory");), - ( - // Unsupported architectures will have a linker error with a semi-decent error message - __cuda_ptx_cp_async_bulk_tensor_is_not_supported_before_SM_90__();)); +// __space == space_global (due to parameter type constraint) +// __space == space_shared (due to parameter type constraint) +# if _CCCL_CUDA_COMPILER(NVHPC) || __CUDA_ARCH__ >= 900 + asm("cp.async.bulk.tensor.1d.global.shared::cta.tile.bulk_group [%0, {%1}], [%2];" + : + : "l"(__tensorMap), "r"(__tensorCoords[0]), "r"(__as_ptr_smem(__srcMem)) + : "memory"); +# else + // Unsupported architectures will have a linker error with a semi-decent error message + __cuda_ptx_cp_async_bulk_tensor_is_not_supported_before_SM_90__(); +# endif } #endif // __cccl_ptx_isa >= 800 /* -// cp.async.bulk.tensor.2d.dst.src.tile.mbarrier::complete_tx::bytes [dstMem], [tensorMap, tensorCoords], [smem_bar];// -1b. 
PTX ISA 80, SM_90 +// cp.async.bulk.tensor.2d.dst.src.tile.mbarrier::complete_tx::bytes [dstMem], [tensorMap, tensorCoords], [smem_bar]; // +PTX ISA 80, SM_90 // .dst = { .shared::cluster } // .src = { .global } template @@ -104,27 +197,132 @@ _CCCL_DEVICE static inline void cp_async_bulk_tensor( const _CUDA_VSTD::int32_t (&__tensorCoords)[2], _CUDA_VSTD::uint64_t* __smem_bar) { - // __space == space_cluster (due to parameter type constraint) - // __space == space_global (due to parameter type constraint) - NV_IF_ELSE_TARGET( - NV_PROVIDES_SM_90, - (asm("cp.async.bulk.tensor.2d.shared::cluster.global.tile.mbarrier::complete_tx::bytes [%0], [%1, {%2, %3}], " - "[%4];// 1b." - : - : "r"(__as_ptr_smem(__dstMem)), - "l"(__tensorMap), - "r"(__tensorCoords[0]), - "r"(__tensorCoords[1]), - "r"(__as_ptr_smem(__smem_bar)) - : "memory");), - ( - // Unsupported architectures will have a linker error with a semi-decent error message - __cuda_ptx_cp_async_bulk_tensor_is_not_supported_before_SM_90__();)); +// __space == space_cluster (due to parameter type constraint) +// __space == space_global (due to parameter type constraint) +# if _CCCL_CUDA_COMPILER(NVHPC) || __CUDA_ARCH__ >= 900 + asm("cp.async.bulk.tensor.2d.shared::cluster.global.tile.mbarrier::complete_tx::bytes [%0], [%1, {%2, %3}], [%4];" + : + : "r"(__as_ptr_smem(__dstMem)), + "l"(__tensorMap), + "r"(__tensorCoords[0]), + "r"(__tensorCoords[1]), + "r"(__as_ptr_smem(__smem_bar)) + : "memory"); +# else + // Unsupported architectures will have a linker error with a semi-decent error message + __cuda_ptx_cp_async_bulk_tensor_is_not_supported_before_SM_90__(); +# endif } #endif // __cccl_ptx_isa >= 800 /* -// cp.async.bulk.tensor.2d.dst.src.tile.bulk_group [tensorMap, tensorCoords], [srcMem]; // 3b. PTX ISA 80, SM_90 +// cp.async.bulk.tensor.2d.dst.src.tile.mbarrier::complete_tx::bytes [dstMem], [tensorMap, tensorCoords], [smem_bar]; // +PTX ISA 86, SM_90 +// .dst = { .shared::cta } +// .src = { .global } +template +__device__ static inline void cp_async_bulk_tensor( + cuda::ptx::space_shared_t, + cuda::ptx::space_global_t, + void* dstMem, + const void* tensorMap, + const int32_t (&tensorCoords)[2], + uint64_t* smem_bar); +*/ +#if __cccl_ptx_isa >= 860 +extern "C" _CCCL_DEVICE void __cuda_ptx_cp_async_bulk_tensor_is_not_supported_before_SM_90__(); +template +_CCCL_DEVICE static inline void cp_async_bulk_tensor( + space_shared_t, + space_global_t, + void* __dstMem, + const void* __tensorMap, + const _CUDA_VSTD::int32_t (&__tensorCoords)[2], + _CUDA_VSTD::uint64_t* __smem_bar) +{ +// __space == space_shared (due to parameter type constraint) +// __space == space_global (due to parameter type constraint) +# if _CCCL_CUDA_COMPILER(NVHPC) || __CUDA_ARCH__ >= 900 + asm("cp.async.bulk.tensor.2d.shared::cta.global.tile.mbarrier::complete_tx::bytes [%0], [%1, {%2, %3}], [%4];" + : + : "r"(__as_ptr_smem(__dstMem)), + "l"(__tensorMap), + "r"(__tensorCoords[0]), + "r"(__tensorCoords[1]), + "r"(__as_ptr_smem(__smem_bar)) + : "memory"); +# else + // Unsupported architectures will have a linker error with a semi-decent error message + __cuda_ptx_cp_async_bulk_tensor_is_not_supported_before_SM_90__(); +# endif +} +#endif // __cccl_ptx_isa >= 860 + +/* +// cp.async.bulk.tensor.2d.dst.src.tile.mbarrier::complete_tx::bytes.cta_group [dstMem], [tensorMap, tensorCoords], +[smem_bar]; // PTX ISA 86, SM_100a, SM_101a +// .dst = { .shared::cta } +// .src = { .global } +// .cta_group = { .cta_group::1, .cta_group::2 } +template +__device__ static inline void 
cp_async_bulk_tensor( + cuda::ptx::space_shared_t, + cuda::ptx::space_global_t, + cuda::ptx::cta_group_t cta_group, + void* dstMem, + const void* tensorMap, + const int32_t (&tensorCoords)[2], + uint64_t* smem_bar); +*/ +#if __cccl_ptx_isa >= 860 +extern "C" _CCCL_DEVICE void __cuda_ptx_cp_async_bulk_tensor_is_not_supported_before_SM_100a_SM_101a__(); +template +_CCCL_DEVICE static inline void cp_async_bulk_tensor( + space_shared_t, + space_global_t, + cta_group_t<_Cta_Group> __cta_group, + void* __dstMem, + const void* __tensorMap, + const _CUDA_VSTD::int32_t (&__tensorCoords)[2], + _CUDA_VSTD::uint64_t* __smem_bar) +{ + // __space == space_shared (due to parameter type constraint) + // __space == space_global (due to parameter type constraint) + static_assert(__cta_group == cta_group_1 || __cta_group == cta_group_2, ""); +# if _CCCL_CUDA_COMPILER(NVHPC) || __CUDA_ARCH_FEAT_SM100_ALL || __CUDA_ARCH_FEAT_SM101_ALL + _CCCL_IF_CONSTEXPR (__cta_group == cta_group_1) + { + asm("cp.async.bulk.tensor.2d.shared::cta.global.tile.mbarrier::complete_tx::bytes.cta_group::1 [%0], [%1, {%2, " + "%3}], [%4];" + : + : "r"(__as_ptr_smem(__dstMem)), + "l"(__tensorMap), + "r"(__tensorCoords[0]), + "r"(__tensorCoords[1]), + "r"(__as_ptr_smem(__smem_bar)) + : "memory"); + } + else _CCCL_IF_CONSTEXPR (__cta_group == cta_group_2) + { + asm("cp.async.bulk.tensor.2d.shared::cta.global.tile.mbarrier::complete_tx::bytes.cta_group::2 [%0], [%1, {%2, " + "%3}], [%4];" + : + : "r"(__as_ptr_smem(__dstMem)), + "l"(__tensorMap), + "r"(__tensorCoords[0]), + "r"(__tensorCoords[1]), + "r"(__as_ptr_smem(__smem_bar)) + : "memory"); + } +# else + // Unsupported architectures will have a linker error with a semi-decent error message + __cuda_ptx_cp_async_bulk_tensor_is_not_supported_before_SM_100a_SM_101a__(); +# endif +} +#endif // __cccl_ptx_isa >= 860 + +/* +// cp.async.bulk.tensor.2d.dst.src.tile.bulk_group [tensorMap, tensorCoords], [srcMem]; // PTX ISA 80, SM_90 // .dst = { .global } // .src = { .shared::cta } template @@ -145,23 +343,23 @@ _CCCL_DEVICE static inline void cp_async_bulk_tensor( const _CUDA_VSTD::int32_t (&__tensorCoords)[2], const void* __srcMem) { - // __space == space_global (due to parameter type constraint) - // __space == space_shared (due to parameter type constraint) - NV_IF_ELSE_TARGET( - NV_PROVIDES_SM_90, - (asm("cp.async.bulk.tensor.2d.global.shared::cta.tile.bulk_group [%0, {%1, %2}], [%3]; // 3b." - : - : "l"(__tensorMap), "r"(__tensorCoords[0]), "r"(__tensorCoords[1]), "r"(__as_ptr_smem(__srcMem)) - : "memory");), - ( - // Unsupported architectures will have a linker error with a semi-decent error message - __cuda_ptx_cp_async_bulk_tensor_is_not_supported_before_SM_90__();)); +// __space == space_global (due to parameter type constraint) +// __space == space_shared (due to parameter type constraint) +# if _CCCL_CUDA_COMPILER(NVHPC) || __CUDA_ARCH__ >= 900 + asm("cp.async.bulk.tensor.2d.global.shared::cta.tile.bulk_group [%0, {%1, %2}], [%3];" + : + : "l"(__tensorMap), "r"(__tensorCoords[0]), "r"(__tensorCoords[1]), "r"(__as_ptr_smem(__srcMem)) + : "memory"); +# else + // Unsupported architectures will have a linker error with a semi-decent error message + __cuda_ptx_cp_async_bulk_tensor_is_not_supported_before_SM_90__(); +# endif } #endif // __cccl_ptx_isa >= 800 /* -// cp.async.bulk.tensor.3d.dst.src.tile.mbarrier::complete_tx::bytes [dstMem], [tensorMap, tensorCoords], [smem_bar];// -1c. 
PTX ISA 80, SM_90 +// cp.async.bulk.tensor.3d.dst.src.tile.mbarrier::complete_tx::bytes [dstMem], [tensorMap, tensorCoords], [smem_bar]; // +PTX ISA 80, SM_90 // .dst = { .shared::cluster } // .src = { .global } template @@ -184,28 +382,136 @@ _CCCL_DEVICE static inline void cp_async_bulk_tensor( const _CUDA_VSTD::int32_t (&__tensorCoords)[3], _CUDA_VSTD::uint64_t* __smem_bar) { - // __space == space_cluster (due to parameter type constraint) - // __space == space_global (due to parameter type constraint) - NV_IF_ELSE_TARGET( - NV_PROVIDES_SM_90, - (asm("cp.async.bulk.tensor.3d.shared::cluster.global.tile.mbarrier::complete_tx::bytes [%0], [%1, {%2, %3, %4}], " - "[%5];// 1c." - : - : "r"(__as_ptr_smem(__dstMem)), - "l"(__tensorMap), - "r"(__tensorCoords[0]), - "r"(__tensorCoords[1]), - "r"(__tensorCoords[2]), - "r"(__as_ptr_smem(__smem_bar)) - : "memory");), - ( - // Unsupported architectures will have a linker error with a semi-decent error message - __cuda_ptx_cp_async_bulk_tensor_is_not_supported_before_SM_90__();)); +// __space == space_cluster (due to parameter type constraint) +// __space == space_global (due to parameter type constraint) +# if _CCCL_CUDA_COMPILER(NVHPC) || __CUDA_ARCH__ >= 900 + asm("cp.async.bulk.tensor.3d.shared::cluster.global.tile.mbarrier::complete_tx::bytes [%0], [%1, {%2, %3, %4}], [%5];" + : + : "r"(__as_ptr_smem(__dstMem)), + "l"(__tensorMap), + "r"(__tensorCoords[0]), + "r"(__tensorCoords[1]), + "r"(__tensorCoords[2]), + "r"(__as_ptr_smem(__smem_bar)) + : "memory"); +# else + // Unsupported architectures will have a linker error with a semi-decent error message + __cuda_ptx_cp_async_bulk_tensor_is_not_supported_before_SM_90__(); +# endif } #endif // __cccl_ptx_isa >= 800 /* -// cp.async.bulk.tensor.3d.dst.src.tile.bulk_group [tensorMap, tensorCoords], [srcMem]; // 3c. 
PTX ISA 80, SM_90 +// cp.async.bulk.tensor.3d.dst.src.tile.mbarrier::complete_tx::bytes [dstMem], [tensorMap, tensorCoords], [smem_bar]; // +PTX ISA 86, SM_90 +// .dst = { .shared::cta } +// .src = { .global } +template +__device__ static inline void cp_async_bulk_tensor( + cuda::ptx::space_shared_t, + cuda::ptx::space_global_t, + void* dstMem, + const void* tensorMap, + const int32_t (&tensorCoords)[3], + uint64_t* smem_bar); +*/ +#if __cccl_ptx_isa >= 860 +extern "C" _CCCL_DEVICE void __cuda_ptx_cp_async_bulk_tensor_is_not_supported_before_SM_90__(); +template +_CCCL_DEVICE static inline void cp_async_bulk_tensor( + space_shared_t, + space_global_t, + void* __dstMem, + const void* __tensorMap, + const _CUDA_VSTD::int32_t (&__tensorCoords)[3], + _CUDA_VSTD::uint64_t* __smem_bar) +{ +// __space == space_shared (due to parameter type constraint) +// __space == space_global (due to parameter type constraint) +# if _CCCL_CUDA_COMPILER(NVHPC) || __CUDA_ARCH__ >= 900 + asm("cp.async.bulk.tensor.3d.shared::cta.global.tile.mbarrier::complete_tx::bytes [%0], [%1, {%2, %3, %4}], [%5];" + : + : "r"(__as_ptr_smem(__dstMem)), + "l"(__tensorMap), + "r"(__tensorCoords[0]), + "r"(__tensorCoords[1]), + "r"(__tensorCoords[2]), + "r"(__as_ptr_smem(__smem_bar)) + : "memory"); +# else + // Unsupported architectures will have a linker error with a semi-decent error message + __cuda_ptx_cp_async_bulk_tensor_is_not_supported_before_SM_90__(); +# endif +} +#endif // __cccl_ptx_isa >= 860 + +/* +// cp.async.bulk.tensor.3d.dst.src.tile.mbarrier::complete_tx::bytes.cta_group [dstMem], [tensorMap, tensorCoords], +[smem_bar]; // PTX ISA 86, SM_100a, SM_101a +// .dst = { .shared::cta } +// .src = { .global } +// .cta_group = { .cta_group::1, .cta_group::2 } +template +__device__ static inline void cp_async_bulk_tensor( + cuda::ptx::space_shared_t, + cuda::ptx::space_global_t, + cuda::ptx::cta_group_t cta_group, + void* dstMem, + const void* tensorMap, + const int32_t (&tensorCoords)[3], + uint64_t* smem_bar); +*/ +#if __cccl_ptx_isa >= 860 +extern "C" _CCCL_DEVICE void __cuda_ptx_cp_async_bulk_tensor_is_not_supported_before_SM_100a_SM_101a__(); +template +_CCCL_DEVICE static inline void cp_async_bulk_tensor( + space_shared_t, + space_global_t, + cta_group_t<_Cta_Group> __cta_group, + void* __dstMem, + const void* __tensorMap, + const _CUDA_VSTD::int32_t (&__tensorCoords)[3], + _CUDA_VSTD::uint64_t* __smem_bar) +{ + // __space == space_shared (due to parameter type constraint) + // __space == space_global (due to parameter type constraint) + static_assert(__cta_group == cta_group_1 || __cta_group == cta_group_2, ""); +# if _CCCL_CUDA_COMPILER(NVHPC) || __CUDA_ARCH_FEAT_SM100_ALL || __CUDA_ARCH_FEAT_SM101_ALL + _CCCL_IF_CONSTEXPR (__cta_group == cta_group_1) + { + asm("cp.async.bulk.tensor.3d.shared::cta.global.tile.mbarrier::complete_tx::bytes.cta_group::1 [%0], [%1, {%2, %3, " + "%4}], [%5];" + : + : "r"(__as_ptr_smem(__dstMem)), + "l"(__tensorMap), + "r"(__tensorCoords[0]), + "r"(__tensorCoords[1]), + "r"(__tensorCoords[2]), + "r"(__as_ptr_smem(__smem_bar)) + : "memory"); + } + else _CCCL_IF_CONSTEXPR (__cta_group == cta_group_2) + { + asm("cp.async.bulk.tensor.3d.shared::cta.global.tile.mbarrier::complete_tx::bytes.cta_group::2 [%0], [%1, {%2, %3, " + "%4}], [%5];" + : + : "r"(__as_ptr_smem(__dstMem)), + "l"(__tensorMap), + "r"(__tensorCoords[0]), + "r"(__tensorCoords[1]), + "r"(__tensorCoords[2]), + "r"(__as_ptr_smem(__smem_bar)) + : "memory"); + } +# else + // Unsupported architectures will have a linker error with 
a semi-decent error message + __cuda_ptx_cp_async_bulk_tensor_is_not_supported_before_SM_100a_SM_101a__(); +# endif +} +#endif // __cccl_ptx_isa >= 860 + +/* +// cp.async.bulk.tensor.3d.dst.src.tile.bulk_group [tensorMap, tensorCoords], [srcMem]; // PTX ISA 80, SM_90 // .dst = { .global } // .src = { .shared::cta } template @@ -226,27 +532,27 @@ _CCCL_DEVICE static inline void cp_async_bulk_tensor( const _CUDA_VSTD::int32_t (&__tensorCoords)[3], const void* __srcMem) { - // __space == space_global (due to parameter type constraint) - // __space == space_shared (due to parameter type constraint) - NV_IF_ELSE_TARGET( - NV_PROVIDES_SM_90, - (asm("cp.async.bulk.tensor.3d.global.shared::cta.tile.bulk_group [%0, {%1, %2, %3}], [%4]; // 3c." - : - : "l"(__tensorMap), - "r"(__tensorCoords[0]), - "r"(__tensorCoords[1]), - "r"(__tensorCoords[2]), - "r"(__as_ptr_smem(__srcMem)) - : "memory");), - ( - // Unsupported architectures will have a linker error with a semi-decent error message - __cuda_ptx_cp_async_bulk_tensor_is_not_supported_before_SM_90__();)); +// __space == space_global (due to parameter type constraint) +// __space == space_shared (due to parameter type constraint) +# if _CCCL_CUDA_COMPILER(NVHPC) || __CUDA_ARCH__ >= 900 + asm("cp.async.bulk.tensor.3d.global.shared::cta.tile.bulk_group [%0, {%1, %2, %3}], [%4];" + : + : "l"(__tensorMap), + "r"(__tensorCoords[0]), + "r"(__tensorCoords[1]), + "r"(__tensorCoords[2]), + "r"(__as_ptr_smem(__srcMem)) + : "memory"); +# else + // Unsupported architectures will have a linker error with a semi-decent error message + __cuda_ptx_cp_async_bulk_tensor_is_not_supported_before_SM_90__(); +# endif } #endif // __cccl_ptx_isa >= 800 /* -// cp.async.bulk.tensor.4d.dst.src.tile.mbarrier::complete_tx::bytes [dstMem], [tensorMap, tensorCoords], [smem_bar];// -1d. PTX ISA 80, SM_90 +// cp.async.bulk.tensor.4d.dst.src.tile.mbarrier::complete_tx::bytes [dstMem], [tensorMap, tensorCoords], [smem_bar]; // +PTX ISA 80, SM_90 // .dst = { .shared::cluster } // .src = { .global } template @@ -269,29 +575,141 @@ _CCCL_DEVICE static inline void cp_async_bulk_tensor( const _CUDA_VSTD::int32_t (&__tensorCoords)[4], _CUDA_VSTD::uint64_t* __smem_bar) { - // __space == space_cluster (due to parameter type constraint) - // __space == space_global (due to parameter type constraint) - NV_IF_ELSE_TARGET( - NV_PROVIDES_SM_90, - (asm("cp.async.bulk.tensor.4d.shared::cluster.global.tile.mbarrier::complete_tx::bytes [%0], [%1, {%2, %3, %4, " - "%5}], [%6];// 1d." 
- : - : "r"(__as_ptr_smem(__dstMem)), - "l"(__tensorMap), - "r"(__tensorCoords[0]), - "r"(__tensorCoords[1]), - "r"(__tensorCoords[2]), - "r"(__tensorCoords[3]), - "r"(__as_ptr_smem(__smem_bar)) - : "memory");), - ( - // Unsupported architectures will have a linker error with a semi-decent error message - __cuda_ptx_cp_async_bulk_tensor_is_not_supported_before_SM_90__();)); +// __space == space_cluster (due to parameter type constraint) +// __space == space_global (due to parameter type constraint) +# if _CCCL_CUDA_COMPILER(NVHPC) || __CUDA_ARCH__ >= 900 + asm("cp.async.bulk.tensor.4d.shared::cluster.global.tile.mbarrier::complete_tx::bytes [%0], [%1, {%2, %3, %4, %5}], " + "[%6];" + : + : "r"(__as_ptr_smem(__dstMem)), + "l"(__tensorMap), + "r"(__tensorCoords[0]), + "r"(__tensorCoords[1]), + "r"(__tensorCoords[2]), + "r"(__tensorCoords[3]), + "r"(__as_ptr_smem(__smem_bar)) + : "memory"); +# else + // Unsupported architectures will have a linker error with a semi-decent error message + __cuda_ptx_cp_async_bulk_tensor_is_not_supported_before_SM_90__(); +# endif } #endif // __cccl_ptx_isa >= 800 /* -// cp.async.bulk.tensor.4d.dst.src.tile.bulk_group [tensorMap, tensorCoords], [srcMem]; // 3d. PTX ISA 80, SM_90 +// cp.async.bulk.tensor.4d.dst.src.tile.mbarrier::complete_tx::bytes [dstMem], [tensorMap, tensorCoords], [smem_bar]; // +PTX ISA 86, SM_90 +// .dst = { .shared::cta } +// .src = { .global } +template +__device__ static inline void cp_async_bulk_tensor( + cuda::ptx::space_shared_t, + cuda::ptx::space_global_t, + void* dstMem, + const void* tensorMap, + const int32_t (&tensorCoords)[4], + uint64_t* smem_bar); +*/ +#if __cccl_ptx_isa >= 860 +extern "C" _CCCL_DEVICE void __cuda_ptx_cp_async_bulk_tensor_is_not_supported_before_SM_90__(); +template +_CCCL_DEVICE static inline void cp_async_bulk_tensor( + space_shared_t, + space_global_t, + void* __dstMem, + const void* __tensorMap, + const _CUDA_VSTD::int32_t (&__tensorCoords)[4], + _CUDA_VSTD::uint64_t* __smem_bar) +{ +// __space == space_shared (due to parameter type constraint) +// __space == space_global (due to parameter type constraint) +# if _CCCL_CUDA_COMPILER(NVHPC) || __CUDA_ARCH__ >= 900 + asm("cp.async.bulk.tensor.4d.shared::cta.global.tile.mbarrier::complete_tx::bytes [%0], [%1, {%2, %3, %4, %5}], [%6];" + : + : "r"(__as_ptr_smem(__dstMem)), + "l"(__tensorMap), + "r"(__tensorCoords[0]), + "r"(__tensorCoords[1]), + "r"(__tensorCoords[2]), + "r"(__tensorCoords[3]), + "r"(__as_ptr_smem(__smem_bar)) + : "memory"); +# else + // Unsupported architectures will have a linker error with a semi-decent error message + __cuda_ptx_cp_async_bulk_tensor_is_not_supported_before_SM_90__(); +# endif +} +#endif // __cccl_ptx_isa >= 860 + +/* +// cp.async.bulk.tensor.4d.dst.src.tile.mbarrier::complete_tx::bytes.cta_group [dstMem], [tensorMap, tensorCoords], +[smem_bar]; // PTX ISA 86, SM_100a, SM_101a +// .dst = { .shared::cta } +// .src = { .global } +// .cta_group = { .cta_group::1, .cta_group::2 } +template +__device__ static inline void cp_async_bulk_tensor( + cuda::ptx::space_shared_t, + cuda::ptx::space_global_t, + cuda::ptx::cta_group_t cta_group, + void* dstMem, + const void* tensorMap, + const int32_t (&tensorCoords)[4], + uint64_t* smem_bar); +*/ +#if __cccl_ptx_isa >= 860 +extern "C" _CCCL_DEVICE void __cuda_ptx_cp_async_bulk_tensor_is_not_supported_before_SM_100a_SM_101a__(); +template +_CCCL_DEVICE static inline void cp_async_bulk_tensor( + space_shared_t, + space_global_t, + cta_group_t<_Cta_Group> __cta_group, + void* __dstMem, + 
const void* __tensorMap, + const _CUDA_VSTD::int32_t (&__tensorCoords)[4], + _CUDA_VSTD::uint64_t* __smem_bar) +{ + // __space == space_shared (due to parameter type constraint) + // __space == space_global (due to parameter type constraint) + static_assert(__cta_group == cta_group_1 || __cta_group == cta_group_2, ""); +# if _CCCL_CUDA_COMPILER(NVHPC) || __CUDA_ARCH_FEAT_SM100_ALL || __CUDA_ARCH_FEAT_SM101_ALL + _CCCL_IF_CONSTEXPR (__cta_group == cta_group_1) + { + asm("cp.async.bulk.tensor.4d.shared::cta.global.tile.mbarrier::complete_tx::bytes.cta_group::1 [%0], [%1, {%2, %3, " + "%4, %5}], [%6];" + : + : "r"(__as_ptr_smem(__dstMem)), + "l"(__tensorMap), + "r"(__tensorCoords[0]), + "r"(__tensorCoords[1]), + "r"(__tensorCoords[2]), + "r"(__tensorCoords[3]), + "r"(__as_ptr_smem(__smem_bar)) + : "memory"); + } + else _CCCL_IF_CONSTEXPR (__cta_group == cta_group_2) + { + asm("cp.async.bulk.tensor.4d.shared::cta.global.tile.mbarrier::complete_tx::bytes.cta_group::2 [%0], [%1, {%2, %3, " + "%4, %5}], [%6];" + : + : "r"(__as_ptr_smem(__dstMem)), + "l"(__tensorMap), + "r"(__tensorCoords[0]), + "r"(__tensorCoords[1]), + "r"(__tensorCoords[2]), + "r"(__tensorCoords[3]), + "r"(__as_ptr_smem(__smem_bar)) + : "memory"); + } +# else + // Unsupported architectures will have a linker error with a semi-decent error message + __cuda_ptx_cp_async_bulk_tensor_is_not_supported_before_SM_100a_SM_101a__(); +# endif +} +#endif // __cccl_ptx_isa >= 860 + +/* +// cp.async.bulk.tensor.4d.dst.src.tile.bulk_group [tensorMap, tensorCoords], [srcMem]; // PTX ISA 80, SM_90 // .dst = { .global } // .src = { .shared::cta } template @@ -312,28 +730,28 @@ _CCCL_DEVICE static inline void cp_async_bulk_tensor( const _CUDA_VSTD::int32_t (&__tensorCoords)[4], const void* __srcMem) { - // __space == space_global (due to parameter type constraint) - // __space == space_shared (due to parameter type constraint) - NV_IF_ELSE_TARGET( - NV_PROVIDES_SM_90, - (asm("cp.async.bulk.tensor.4d.global.shared::cta.tile.bulk_group [%0, {%1, %2, %3, %4}], [%5]; // 3d." - : - : "l"(__tensorMap), - "r"(__tensorCoords[0]), - "r"(__tensorCoords[1]), - "r"(__tensorCoords[2]), - "r"(__tensorCoords[3]), - "r"(__as_ptr_smem(__srcMem)) - : "memory");), - ( - // Unsupported architectures will have a linker error with a semi-decent error message - __cuda_ptx_cp_async_bulk_tensor_is_not_supported_before_SM_90__();)); +// __space == space_global (due to parameter type constraint) +// __space == space_shared (due to parameter type constraint) +# if _CCCL_CUDA_COMPILER(NVHPC) || __CUDA_ARCH__ >= 900 + asm("cp.async.bulk.tensor.4d.global.shared::cta.tile.bulk_group [%0, {%1, %2, %3, %4}], [%5];" + : + : "l"(__tensorMap), + "r"(__tensorCoords[0]), + "r"(__tensorCoords[1]), + "r"(__tensorCoords[2]), + "r"(__tensorCoords[3]), + "r"(__as_ptr_smem(__srcMem)) + : "memory"); +# else + // Unsupported architectures will have a linker error with a semi-decent error message + __cuda_ptx_cp_async_bulk_tensor_is_not_supported_before_SM_90__(); +# endif } #endif // __cccl_ptx_isa >= 800 /* -// cp.async.bulk.tensor.5d.dst.src.tile.mbarrier::complete_tx::bytes [dstMem], [tensorMap, tensorCoords], [smem_bar];// -1e. 
PTX ISA 80, SM_90 +// cp.async.bulk.tensor.5d.dst.src.tile.mbarrier::complete_tx::bytes [dstMem], [tensorMap, tensorCoords], [smem_bar]; // +PTX ISA 80, SM_90 // .dst = { .shared::cluster } // .src = { .global } template @@ -356,30 +774,146 @@ _CCCL_DEVICE static inline void cp_async_bulk_tensor( const _CUDA_VSTD::int32_t (&__tensorCoords)[5], _CUDA_VSTD::uint64_t* __smem_bar) { - // __space == space_cluster (due to parameter type constraint) - // __space == space_global (due to parameter type constraint) - NV_IF_ELSE_TARGET( - NV_PROVIDES_SM_90, - (asm("cp.async.bulk.tensor.5d.shared::cluster.global.tile.mbarrier::complete_tx::bytes [%0], [%1, {%2, %3, %4, %5, " - "%6}], [%7];// 1e." - : - : "r"(__as_ptr_smem(__dstMem)), - "l"(__tensorMap), - "r"(__tensorCoords[0]), - "r"(__tensorCoords[1]), - "r"(__tensorCoords[2]), - "r"(__tensorCoords[3]), - "r"(__tensorCoords[4]), - "r"(__as_ptr_smem(__smem_bar)) - : "memory");), - ( - // Unsupported architectures will have a linker error with a semi-decent error message - __cuda_ptx_cp_async_bulk_tensor_is_not_supported_before_SM_90__();)); +// __space == space_cluster (due to parameter type constraint) +// __space == space_global (due to parameter type constraint) +# if _CCCL_CUDA_COMPILER(NVHPC) || __CUDA_ARCH__ >= 900 + asm("cp.async.bulk.tensor.5d.shared::cluster.global.tile.mbarrier::complete_tx::bytes [%0], [%1, {%2, %3, %4, %5, " + "%6}], [%7];" + : + : "r"(__as_ptr_smem(__dstMem)), + "l"(__tensorMap), + "r"(__tensorCoords[0]), + "r"(__tensorCoords[1]), + "r"(__tensorCoords[2]), + "r"(__tensorCoords[3]), + "r"(__tensorCoords[4]), + "r"(__as_ptr_smem(__smem_bar)) + : "memory"); +# else + // Unsupported architectures will have a linker error with a semi-decent error message + __cuda_ptx_cp_async_bulk_tensor_is_not_supported_before_SM_90__(); +# endif } #endif // __cccl_ptx_isa >= 800 /* -// cp.async.bulk.tensor.5d.dst.src.tile.bulk_group [tensorMap, tensorCoords], [srcMem]; // 3e. 
PTX ISA 80, SM_90 +// cp.async.bulk.tensor.5d.dst.src.tile.mbarrier::complete_tx::bytes [dstMem], [tensorMap, tensorCoords], [smem_bar]; // +PTX ISA 86, SM_90 +// .dst = { .shared::cta } +// .src = { .global } +template +__device__ static inline void cp_async_bulk_tensor( + cuda::ptx::space_shared_t, + cuda::ptx::space_global_t, + void* dstMem, + const void* tensorMap, + const int32_t (&tensorCoords)[5], + uint64_t* smem_bar); +*/ +#if __cccl_ptx_isa >= 860 +extern "C" _CCCL_DEVICE void __cuda_ptx_cp_async_bulk_tensor_is_not_supported_before_SM_90__(); +template +_CCCL_DEVICE static inline void cp_async_bulk_tensor( + space_shared_t, + space_global_t, + void* __dstMem, + const void* __tensorMap, + const _CUDA_VSTD::int32_t (&__tensorCoords)[5], + _CUDA_VSTD::uint64_t* __smem_bar) +{ +// __space == space_shared (due to parameter type constraint) +// __space == space_global (due to parameter type constraint) +# if _CCCL_CUDA_COMPILER(NVHPC) || __CUDA_ARCH__ >= 900 + asm("cp.async.bulk.tensor.5d.shared::cta.global.tile.mbarrier::complete_tx::bytes [%0], [%1, {%2, %3, %4, %5, %6}], " + "[%7];" + : + : "r"(__as_ptr_smem(__dstMem)), + "l"(__tensorMap), + "r"(__tensorCoords[0]), + "r"(__tensorCoords[1]), + "r"(__tensorCoords[2]), + "r"(__tensorCoords[3]), + "r"(__tensorCoords[4]), + "r"(__as_ptr_smem(__smem_bar)) + : "memory"); +# else + // Unsupported architectures will have a linker error with a semi-decent error message + __cuda_ptx_cp_async_bulk_tensor_is_not_supported_before_SM_90__(); +# endif +} +#endif // __cccl_ptx_isa >= 860 + +/* +// cp.async.bulk.tensor.5d.dst.src.tile.mbarrier::complete_tx::bytes.cta_group [dstMem], [tensorMap, tensorCoords], +[smem_bar]; // PTX ISA 86, SM_100a, SM_101a +// .dst = { .shared::cta } +// .src = { .global } +// .cta_group = { .cta_group::1, .cta_group::2 } +template +__device__ static inline void cp_async_bulk_tensor( + cuda::ptx::space_shared_t, + cuda::ptx::space_global_t, + cuda::ptx::cta_group_t cta_group, + void* dstMem, + const void* tensorMap, + const int32_t (&tensorCoords)[5], + uint64_t* smem_bar); +*/ +#if __cccl_ptx_isa >= 860 +extern "C" _CCCL_DEVICE void __cuda_ptx_cp_async_bulk_tensor_is_not_supported_before_SM_100a_SM_101a__(); +template +_CCCL_DEVICE static inline void cp_async_bulk_tensor( + space_shared_t, + space_global_t, + cta_group_t<_Cta_Group> __cta_group, + void* __dstMem, + const void* __tensorMap, + const _CUDA_VSTD::int32_t (&__tensorCoords)[5], + _CUDA_VSTD::uint64_t* __smem_bar) +{ + // __space == space_shared (due to parameter type constraint) + // __space == space_global (due to parameter type constraint) + static_assert(__cta_group == cta_group_1 || __cta_group == cta_group_2, ""); +# if _CCCL_CUDA_COMPILER(NVHPC) || __CUDA_ARCH_FEAT_SM100_ALL || __CUDA_ARCH_FEAT_SM101_ALL + _CCCL_IF_CONSTEXPR (__cta_group == cta_group_1) + { + asm("cp.async.bulk.tensor.5d.shared::cta.global.tile.mbarrier::complete_tx::bytes.cta_group::1 [%0], [%1, {%2, %3, " + "%4, %5, %6}], [%7];" + : + : "r"(__as_ptr_smem(__dstMem)), + "l"(__tensorMap), + "r"(__tensorCoords[0]), + "r"(__tensorCoords[1]), + "r"(__tensorCoords[2]), + "r"(__tensorCoords[3]), + "r"(__tensorCoords[4]), + "r"(__as_ptr_smem(__smem_bar)) + : "memory"); + } + else _CCCL_IF_CONSTEXPR (__cta_group == cta_group_2) + { + asm("cp.async.bulk.tensor.5d.shared::cta.global.tile.mbarrier::complete_tx::bytes.cta_group::2 [%0], [%1, {%2, %3, " + "%4, %5, %6}], [%7];" + : + : "r"(__as_ptr_smem(__dstMem)), + "l"(__tensorMap), + "r"(__tensorCoords[0]), + "r"(__tensorCoords[1]), + 
"r"(__tensorCoords[2]), + "r"(__tensorCoords[3]), + "r"(__tensorCoords[4]), + "r"(__as_ptr_smem(__smem_bar)) + : "memory"); + } +# else + // Unsupported architectures will have a linker error with a semi-decent error message + __cuda_ptx_cp_async_bulk_tensor_is_not_supported_before_SM_100a_SM_101a__(); +# endif +} +#endif // __cccl_ptx_isa >= 860 + +/* +// cp.async.bulk.tensor.5d.dst.src.tile.bulk_group [tensorMap, tensorCoords], [srcMem]; // PTX ISA 80, SM_90 // .dst = { .global } // .src = { .shared::cta } template @@ -400,23 +934,23 @@ _CCCL_DEVICE static inline void cp_async_bulk_tensor( const _CUDA_VSTD::int32_t (&__tensorCoords)[5], const void* __srcMem) { - // __space == space_global (due to parameter type constraint) - // __space == space_shared (due to parameter type constraint) - NV_IF_ELSE_TARGET( - NV_PROVIDES_SM_90, - (asm("cp.async.bulk.tensor.5d.global.shared::cta.tile.bulk_group [%0, {%1, %2, %3, %4, %5}], [%6]; // 3e." - : - : "l"(__tensorMap), - "r"(__tensorCoords[0]), - "r"(__tensorCoords[1]), - "r"(__tensorCoords[2]), - "r"(__tensorCoords[3]), - "r"(__tensorCoords[4]), - "r"(__as_ptr_smem(__srcMem)) - : "memory");), - ( - // Unsupported architectures will have a linker error with a semi-decent error message - __cuda_ptx_cp_async_bulk_tensor_is_not_supported_before_SM_90__();)); +// __space == space_global (due to parameter type constraint) +// __space == space_shared (due to parameter type constraint) +# if _CCCL_CUDA_COMPILER(NVHPC) || __CUDA_ARCH__ >= 900 + asm("cp.async.bulk.tensor.5d.global.shared::cta.tile.bulk_group [%0, {%1, %2, %3, %4, %5}], [%6];" + : + : "l"(__tensorMap), + "r"(__tensorCoords[0]), + "r"(__tensorCoords[1]), + "r"(__tensorCoords[2]), + "r"(__tensorCoords[3]), + "r"(__tensorCoords[4]), + "r"(__as_ptr_smem(__srcMem)) + : "memory"); +# else + // Unsupported architectures will have a linker error with a semi-decent error message + __cuda_ptx_cp_async_bulk_tensor_is_not_supported_before_SM_90__(); +# endif } #endif // __cccl_ptx_isa >= 800 diff --git a/libcudacxx/include/cuda/__ptx/instructions/generated/cp_async_bulk_tensor_gather_scatter.h b/libcudacxx/include/cuda/__ptx/instructions/generated/cp_async_bulk_tensor_gather_scatter.h new file mode 100644 index 00000000000..f376f1b48c3 --- /dev/null +++ b/libcudacxx/include/cuda/__ptx/instructions/generated/cp_async_bulk_tensor_gather_scatter.h @@ -0,0 +1,288 @@ +// This file was automatically generated. Do not edit. 
+ +#ifndef _CUDA_PTX_GENERATED_CP_ASYNC_BULK_TENSOR_GATHER_SCATTER_H_ +#define _CUDA_PTX_GENERATED_CP_ASYNC_BULK_TENSOR_GATHER_SCATTER_H_ + +/* +// cp.async.bulk.tensor.2d.dst.src.tile::gather4.mbarrier::complete_tx::bytes [dstMem], [tensorMap, tensorCoords], +[smem_bar]; // PTX ISA 86, SM_100 +// .dst = { .shared::cta } +// .src = { .global } +template +__device__ static inline void cp_async_bulk_tensor_tile_gather4( + cuda::ptx::space_shared_t, + cuda::ptx::space_global_t, + void* dstMem, + const void* tensorMap, + const int32_t (&tensorCoords)[5], + uint64_t* smem_bar); +*/ +#if __cccl_ptx_isa >= 860 +extern "C" _CCCL_DEVICE void __cuda_ptx_cp_async_bulk_tensor_tile_gather4_is_not_supported_before_SM_100__(); +template +_CCCL_DEVICE static inline void cp_async_bulk_tensor_tile_gather4( + space_shared_t, + space_global_t, + void* __dstMem, + const void* __tensorMap, + const _CUDA_VSTD::int32_t (&__tensorCoords)[5], + _CUDA_VSTD::uint64_t* __smem_bar) +{ +// __space == space_shared (due to parameter type constraint) +// __space == space_global (due to parameter type constraint) +# if _CCCL_CUDA_COMPILER(NVHPC) || __CUDA_ARCH__ >= 1000 + asm("cp.async.bulk.tensor.2d.shared::cta.global.tile::gather4.mbarrier::complete_tx::bytes [%0], [%1, {%2, %3, %4, " + "%5, %6}], [%7];" + : + : "r"(__as_ptr_smem(__dstMem)), + "l"(__tensorMap), + "r"(__tensorCoords[0]), + "r"(__tensorCoords[1]), + "r"(__tensorCoords[2]), + "r"(__tensorCoords[3]), + "r"(__tensorCoords[4]), + "r"(__as_ptr_smem(__smem_bar)) + : "memory"); +# else + // Unsupported architectures will have a linker error with a semi-decent error message + __cuda_ptx_cp_async_bulk_tensor_tile_gather4_is_not_supported_before_SM_100__(); +# endif +} +#endif // __cccl_ptx_isa >= 860 + +/* +// cp.async.bulk.tensor.2d.dst.src.tile::gather4.mbarrier::complete_tx::bytes.cta_group [dstMem], [tensorMap, +tensorCoords], [smem_bar]; // PTX ISA 86, SM_100a, SM_101a +// .dst = { .shared::cta } +// .src = { .global } +// .cta_group = { .cta_group::1, .cta_group::2 } +template +__device__ static inline void cp_async_bulk_tensor_tile_gather4( + cuda::ptx::space_shared_t, + cuda::ptx::space_global_t, + cuda::ptx::cta_group_t cta_group, + void* dstMem, + const void* tensorMap, + const int32_t (&tensorCoords)[5], + uint64_t* smem_bar); +*/ +#if __cccl_ptx_isa >= 860 +extern "C" _CCCL_DEVICE void __cuda_ptx_cp_async_bulk_tensor_tile_gather4_is_not_supported_before_SM_100a_SM_101a__(); +template +_CCCL_DEVICE static inline void cp_async_bulk_tensor_tile_gather4( + space_shared_t, + space_global_t, + cta_group_t<_Cta_Group> __cta_group, + void* __dstMem, + const void* __tensorMap, + const _CUDA_VSTD::int32_t (&__tensorCoords)[5], + _CUDA_VSTD::uint64_t* __smem_bar) +{ + // __space == space_shared (due to parameter type constraint) + // __space == space_global (due to parameter type constraint) + static_assert(__cta_group == cta_group_1 || __cta_group == cta_group_2, ""); +# if _CCCL_CUDA_COMPILER(NVHPC) || __CUDA_ARCH_FEAT_SM100_ALL || __CUDA_ARCH_FEAT_SM101_ALL + _CCCL_IF_CONSTEXPR (__cta_group == cta_group_1) + { + asm("cp.async.bulk.tensor.2d.shared::cta.global.tile::gather4.mbarrier::complete_tx::bytes.cta_group::1 [%0], [%1, " + "{%2, %3, %4, %5, %6}], [%7];" + : + : "r"(__as_ptr_smem(__dstMem)), + "l"(__tensorMap), + "r"(__tensorCoords[0]), + "r"(__tensorCoords[1]), + "r"(__tensorCoords[2]), + "r"(__tensorCoords[3]), + "r"(__tensorCoords[4]), + "r"(__as_ptr_smem(__smem_bar)) + : "memory"); + } + else _CCCL_IF_CONSTEXPR (__cta_group == cta_group_2) + { + 
asm("cp.async.bulk.tensor.2d.shared::cta.global.tile::gather4.mbarrier::complete_tx::bytes.cta_group::2 [%0], [%1, " + "{%2, %3, %4, %5, %6}], [%7];" + : + : "r"(__as_ptr_smem(__dstMem)), + "l"(__tensorMap), + "r"(__tensorCoords[0]), + "r"(__tensorCoords[1]), + "r"(__tensorCoords[2]), + "r"(__tensorCoords[3]), + "r"(__tensorCoords[4]), + "r"(__as_ptr_smem(__smem_bar)) + : "memory"); + } +# else + // Unsupported architectures will have a linker error with a semi-decent error message + __cuda_ptx_cp_async_bulk_tensor_tile_gather4_is_not_supported_before_SM_100a_SM_101a__(); +# endif +} +#endif // __cccl_ptx_isa >= 860 + +/* +// cp.async.bulk.tensor.2d.dst.src.tile::gather4.mbarrier::complete_tx::bytes.multicast::cluster [dstMem], [tensorMap, +tensorCoords], [smem_bar], ctaMask; // PTX ISA 86, SM_100a, SM_101a +// .dst = { .shared::cluster } +// .src = { .global } +template +__device__ static inline void cp_async_bulk_tensor_tile_gather4( + cuda::ptx::space_cluster_t, + cuda::ptx::space_global_t, + void* dstMem, + const void* tensorMap, + const int32_t (&tensorCoords)[5], + uint64_t* smem_bar, + const uint16_t& ctaMask); +*/ +#if __cccl_ptx_isa >= 860 +extern "C" _CCCL_DEVICE void __cuda_ptx_cp_async_bulk_tensor_tile_gather4_is_not_supported_before_SM_100a_SM_101a__(); +template +_CCCL_DEVICE static inline void cp_async_bulk_tensor_tile_gather4( + space_cluster_t, + space_global_t, + void* __dstMem, + const void* __tensorMap, + const _CUDA_VSTD::int32_t (&__tensorCoords)[5], + _CUDA_VSTD::uint64_t* __smem_bar, + const _CUDA_VSTD::uint16_t& __ctaMask) +{ +// __space == space_cluster (due to parameter type constraint) +// __space == space_global (due to parameter type constraint) +# if _CCCL_CUDA_COMPILER(NVHPC) || __CUDA_ARCH_FEAT_SM100_ALL || __CUDA_ARCH_FEAT_SM101_ALL + asm("cp.async.bulk.tensor.2d.shared::cluster.global.tile::gather4.mbarrier::complete_tx::bytes.multicast::cluster " + "[%0], [%1, {%2, %3, %4, %5, %6}], [%7], %8;" + : + : "r"(__as_ptr_smem(__dstMem)), + "l"(__tensorMap), + "r"(__tensorCoords[0]), + "r"(__tensorCoords[1]), + "r"(__tensorCoords[2]), + "r"(__tensorCoords[3]), + "r"(__tensorCoords[4]), + "r"(__as_ptr_smem(__smem_bar)), + "h"(__ctaMask) + : "memory"); +# else + // Unsupported architectures will have a linker error with a semi-decent error message + __cuda_ptx_cp_async_bulk_tensor_tile_gather4_is_not_supported_before_SM_100a_SM_101a__(); +# endif +} +#endif // __cccl_ptx_isa >= 860 + +/* +// cp.async.bulk.tensor.2d.dst.src.tile::gather4.mbarrier::complete_tx::bytes.multicast::cluster.cta_group [dstMem], +[tensorMap, tensorCoords], [smem_bar], ctaMask; // PTX ISA 86, SM_100a, SM_101a +// .dst = { .shared::cluster } +// .src = { .global } +// .cta_group = { .cta_group::1, .cta_group::2 } +template +__device__ static inline void cp_async_bulk_tensor_tile_gather4( + cuda::ptx::space_cluster_t, + cuda::ptx::space_global_t, + cuda::ptx::cta_group_t cta_group, + void* dstMem, + const void* tensorMap, + const int32_t (&tensorCoords)[5], + uint64_t* smem_bar, + const uint16_t& ctaMask); +*/ +#if __cccl_ptx_isa >= 860 +extern "C" _CCCL_DEVICE void __cuda_ptx_cp_async_bulk_tensor_tile_gather4_is_not_supported_before_SM_100a_SM_101a__(); +template +_CCCL_DEVICE static inline void cp_async_bulk_tensor_tile_gather4( + space_cluster_t, + space_global_t, + cta_group_t<_Cta_Group> __cta_group, + void* __dstMem, + const void* __tensorMap, + const _CUDA_VSTD::int32_t (&__tensorCoords)[5], + _CUDA_VSTD::uint64_t* __smem_bar, + const _CUDA_VSTD::uint16_t& __ctaMask) +{ + // __space == 
space_cluster (due to parameter type constraint) + // __space == space_global (due to parameter type constraint) + static_assert(__cta_group == cta_group_1 || __cta_group == cta_group_2, ""); +# if _CCCL_CUDA_COMPILER(NVHPC) || __CUDA_ARCH_FEAT_SM100_ALL || __CUDA_ARCH_FEAT_SM101_ALL + _CCCL_IF_CONSTEXPR (__cta_group == cta_group_1) + { + asm("cp.async.bulk.tensor.2d.shared::cluster.global.tile::gather4.mbarrier::complete_tx::bytes.multicast::cluster." + "cta_group::1 [%0], [%1, {%2, %3, %4, %5, %6}], [%7], %8;" + : + : "r"(__as_ptr_smem(__dstMem)), + "l"(__tensorMap), + "r"(__tensorCoords[0]), + "r"(__tensorCoords[1]), + "r"(__tensorCoords[2]), + "r"(__tensorCoords[3]), + "r"(__tensorCoords[4]), + "r"(__as_ptr_smem(__smem_bar)), + "h"(__ctaMask) + : "memory"); + } + else _CCCL_IF_CONSTEXPR (__cta_group == cta_group_2) + { + asm("cp.async.bulk.tensor.2d.shared::cluster.global.tile::gather4.mbarrier::complete_tx::bytes.multicast::cluster." + "cta_group::2 [%0], [%1, {%2, %3, %4, %5, %6}], [%7], %8;" + : + : "r"(__as_ptr_smem(__dstMem)), + "l"(__tensorMap), + "r"(__tensorCoords[0]), + "r"(__tensorCoords[1]), + "r"(__tensorCoords[2]), + "r"(__tensorCoords[3]), + "r"(__tensorCoords[4]), + "r"(__as_ptr_smem(__smem_bar)), + "h"(__ctaMask) + : "memory"); + } +# else + // Unsupported architectures will have a linker error with a semi-decent error message + __cuda_ptx_cp_async_bulk_tensor_tile_gather4_is_not_supported_before_SM_100a_SM_101a__(); +# endif +} +#endif // __cccl_ptx_isa >= 860 + +/* +// cp.async.bulk.tensor.2d.dst.src.tile::scatter4.bulk_group [tensorMap, tensorCoords], [srcMem]; // PTX ISA 80, +SM_100a, SM_101a +// .dst = { .global } +// .src = { .shared::cta } +template +__device__ static inline void cp_async_bulk_tensor_tile_scatter4( + cuda::ptx::space_global_t, + cuda::ptx::space_shared_t, + const void* tensorMap, + const int32_t (&tensorCoords)[5], + const void* srcMem); +*/ +#if __cccl_ptx_isa >= 800 +extern "C" _CCCL_DEVICE void __cuda_ptx_cp_async_bulk_tensor_tile_scatter4_is_not_supported_before_SM_100a_SM_101a__(); +template +_CCCL_DEVICE static inline void cp_async_bulk_tensor_tile_scatter4( + space_global_t, + space_shared_t, + const void* __tensorMap, + const _CUDA_VSTD::int32_t (&__tensorCoords)[5], + const void* __srcMem) +{ +// __space == space_global (due to parameter type constraint) +// __space == space_shared (due to parameter type constraint) +# if _CCCL_CUDA_COMPILER(NVHPC) || __CUDA_ARCH_FEAT_SM100_ALL || __CUDA_ARCH_FEAT_SM101_ALL + asm("cp.async.bulk.tensor.2d.global.shared::cta.tile::scatter4.bulk_group [%0, {%1, %2, %3, %4, %5}], [%6];" + : + : "l"(__tensorMap), + "r"(__tensorCoords[0]), + "r"(__tensorCoords[1]), + "r"(__tensorCoords[2]), + "r"(__tensorCoords[3]), + "r"(__tensorCoords[4]), + "r"(__as_ptr_smem(__srcMem)) + : "memory"); +# else + // Unsupported architectures will have a linker error with a semi-decent error message + __cuda_ptx_cp_async_bulk_tensor_tile_scatter4_is_not_supported_before_SM_100a_SM_101a__(); +# endif +} +#endif // __cccl_ptx_isa >= 800 + +#endif // _CUDA_PTX_GENERATED_CP_ASYNC_BULK_TENSOR_GATHER_SCATTER_H_ diff --git a/libcudacxx/include/cuda/__ptx/instructions/generated/cp_async_bulk_tensor_multicast.h b/libcudacxx/include/cuda/__ptx/instructions/generated/cp_async_bulk_tensor_multicast.h index 915979d18f3..b0d845b92a0 100644 --- a/libcudacxx/include/cuda/__ptx/instructions/generated/cp_async_bulk_tensor_multicast.h +++ b/libcudacxx/include/cuda/__ptx/instructions/generated/cp_async_bulk_tensor_multicast.h @@ -5,7 +5,7 @@ /* // 
cp.async.bulk.tensor.1d.dst.src.tile.mbarrier::complete_tx::bytes.multicast::cluster [dstMem], [tensorMap, -tensorCoords], [smem_bar], ctaMask; // 2a. PTX ISA 80, SM_90a +tensorCoords], [smem_bar], ctaMask; // PTX ISA 80, SM_90a, SM_100a, SM_101a // .dst = { .shared::cluster } // .src = { .global } template @@ -19,7 +19,7 @@ __device__ static inline void cp_async_bulk_tensor( const uint16_t& ctaMask); */ #if __cccl_ptx_isa >= 800 -extern "C" _CCCL_DEVICE void __cuda_ptx_cp_async_bulk_tensor_is_not_supported_before_SM_90a__(); +extern "C" _CCCL_DEVICE void __cuda_ptx_cp_async_bulk_tensor_is_not_supported_before_SM_90a_SM_100a_SM_101a__(); template _CCCL_DEVICE static inline void cp_async_bulk_tensor( space_cluster_t, @@ -29,29 +29,95 @@ _CCCL_DEVICE static inline void cp_async_bulk_tensor( const _CUDA_VSTD::int32_t (&__tensorCoords)[1], _CUDA_VSTD::uint64_t* __smem_bar, const _CUDA_VSTD::uint16_t& __ctaMask) +{ +// __space == space_cluster (due to parameter type constraint) +// __space == space_global (due to parameter type constraint) +# if _CCCL_CUDA_COMPILER(NVHPC) || __CUDA_ARCH_FEAT_SM90_ALL || __CUDA_ARCH_FEAT_SM100_ALL \ + || __CUDA_ARCH_FEAT_SM101_ALL + asm("cp.async.bulk.tensor.1d.shared::cluster.global.tile.mbarrier::complete_tx::bytes.multicast::cluster [%0], [%1, " + "{%2}], [%3], %4;" + : + : "r"(__as_ptr_smem(__dstMem)), + "l"(__tensorMap), + "r"(__tensorCoords[0]), + "r"(__as_ptr_smem(__smem_bar)), + "h"(__ctaMask) + : "memory"); +# else + // Unsupported architectures will have a linker error with a semi-decent error message + __cuda_ptx_cp_async_bulk_tensor_is_not_supported_before_SM_90a_SM_100a_SM_101a__(); +# endif +} +#endif // __cccl_ptx_isa >= 800 + +/* +// cp.async.bulk.tensor.1d.dst.src.tile.mbarrier::complete_tx::bytes.multicast::cluster.cta_group [dstMem], [tensorMap, +tensorCoords], [smem_bar], ctaMask; // PTX ISA 80, SM_100a, SM_101a +// .dst = { .shared::cluster } +// .src = { .global } +// .cta_group = { .cta_group::1, .cta_group::2 } +template +__device__ static inline void cp_async_bulk_tensor( + cuda::ptx::space_cluster_t, + cuda::ptx::space_global_t, + cuda::ptx::cta_group_t cta_group, + void* dstMem, + const void* tensorMap, + const int32_t (&tensorCoords)[1], + uint64_t* smem_bar, + const uint16_t& ctaMask); +*/ +#if __cccl_ptx_isa >= 800 +extern "C" _CCCL_DEVICE void __cuda_ptx_cp_async_bulk_tensor_is_not_supported_before_SM_100a_SM_101a__(); +template +_CCCL_DEVICE static inline void cp_async_bulk_tensor( + space_cluster_t, + space_global_t, + cta_group_t<_Cta_Group> __cta_group, + void* __dstMem, + const void* __tensorMap, + const _CUDA_VSTD::int32_t (&__tensorCoords)[1], + _CUDA_VSTD::uint64_t* __smem_bar, + const _CUDA_VSTD::uint16_t& __ctaMask) { // __space == space_cluster (due to parameter type constraint) // __space == space_global (due to parameter type constraint) - NV_IF_ELSE_TARGET( - NV_HAS_FEATURE_SM_90a, - (asm("cp.async.bulk.tensor.1d.shared::cluster.global.tile.mbarrier::complete_tx::bytes.multicast::cluster [%0], " - "[%1, {%2}], [%3], %4; // 2a." 
- : - : "r"(__as_ptr_smem(__dstMem)), - "l"(__tensorMap), - "r"(__tensorCoords[0]), - "r"(__as_ptr_smem(__smem_bar)), - "h"(__ctaMask) - : "memory");), - ( - // Unsupported architectures will have a linker error with a semi-decent error message - __cuda_ptx_cp_async_bulk_tensor_is_not_supported_before_SM_90a__();)); + static_assert(__cta_group == cta_group_1 || __cta_group == cta_group_2, ""); +# if _CCCL_CUDA_COMPILER(NVHPC) || __CUDA_ARCH_FEAT_SM100_ALL || __CUDA_ARCH_FEAT_SM101_ALL + _CCCL_IF_CONSTEXPR (__cta_group == cta_group_1) + { + asm("cp.async.bulk.tensor.1d.shared::cluster.global.tile.mbarrier::complete_tx::bytes.multicast::cluster.cta_group:" + ":1 [%0], [%1, {%2}], [%3], %4;" + : + : "r"(__as_ptr_smem(__dstMem)), + "l"(__tensorMap), + "r"(__tensorCoords[0]), + "r"(__as_ptr_smem(__smem_bar)), + "h"(__ctaMask) + : "memory"); + } + else _CCCL_IF_CONSTEXPR (__cta_group == cta_group_2) + { + asm("cp.async.bulk.tensor.1d.shared::cluster.global.tile.mbarrier::complete_tx::bytes.multicast::cluster.cta_group:" + ":2 [%0], [%1, {%2}], [%3], %4;" + : + : "r"(__as_ptr_smem(__dstMem)), + "l"(__tensorMap), + "r"(__tensorCoords[0]), + "r"(__as_ptr_smem(__smem_bar)), + "h"(__ctaMask) + : "memory"); + } +# else + // Unsupported architectures will have a linker error with a semi-decent error message + __cuda_ptx_cp_async_bulk_tensor_is_not_supported_before_SM_100a_SM_101a__(); +# endif } #endif // __cccl_ptx_isa >= 800 /* // cp.async.bulk.tensor.2d.dst.src.tile.mbarrier::complete_tx::bytes.multicast::cluster [dstMem], [tensorMap, -tensorCoords], [smem_bar], ctaMask; // 2b. PTX ISA 80, SM_90a +tensorCoords], [smem_bar], ctaMask; // PTX ISA 80, SM_90a, SM_100a, SM_101a // .dst = { .shared::cluster } // .src = { .global } template @@ -65,7 +131,7 @@ __device__ static inline void cp_async_bulk_tensor( const uint16_t& ctaMask); */ #if __cccl_ptx_isa >= 800 -extern "C" _CCCL_DEVICE void __cuda_ptx_cp_async_bulk_tensor_is_not_supported_before_SM_90a__(); +extern "C" _CCCL_DEVICE void __cuda_ptx_cp_async_bulk_tensor_is_not_supported_before_SM_90a_SM_100a_SM_101a__(); template _CCCL_DEVICE static inline void cp_async_bulk_tensor( space_cluster_t, @@ -75,30 +141,98 @@ _CCCL_DEVICE static inline void cp_async_bulk_tensor( const _CUDA_VSTD::int32_t (&__tensorCoords)[2], _CUDA_VSTD::uint64_t* __smem_bar, const _CUDA_VSTD::uint16_t& __ctaMask) +{ +// __space == space_cluster (due to parameter type constraint) +// __space == space_global (due to parameter type constraint) +# if _CCCL_CUDA_COMPILER(NVHPC) || __CUDA_ARCH_FEAT_SM90_ALL || __CUDA_ARCH_FEAT_SM100_ALL \ + || __CUDA_ARCH_FEAT_SM101_ALL + asm("cp.async.bulk.tensor.2d.shared::cluster.global.tile.mbarrier::complete_tx::bytes.multicast::cluster [%0], [%1, " + "{%2, %3}], [%4], %5;" + : + : "r"(__as_ptr_smem(__dstMem)), + "l"(__tensorMap), + "r"(__tensorCoords[0]), + "r"(__tensorCoords[1]), + "r"(__as_ptr_smem(__smem_bar)), + "h"(__ctaMask) + : "memory"); +# else + // Unsupported architectures will have a linker error with a semi-decent error message + __cuda_ptx_cp_async_bulk_tensor_is_not_supported_before_SM_90a_SM_100a_SM_101a__(); +# endif +} +#endif // __cccl_ptx_isa >= 800 + +/* +// cp.async.bulk.tensor.2d.dst.src.tile.mbarrier::complete_tx::bytes.multicast::cluster.cta_group [dstMem], [tensorMap, +tensorCoords], [smem_bar], ctaMask; // PTX ISA 80, SM_100a, SM_101a +// .dst = { .shared::cluster } +// .src = { .global } +// .cta_group = { .cta_group::1, .cta_group::2 } +template +__device__ static inline void cp_async_bulk_tensor( + 
cuda::ptx::space_cluster_t, + cuda::ptx::space_global_t, + cuda::ptx::cta_group_t cta_group, + void* dstMem, + const void* tensorMap, + const int32_t (&tensorCoords)[2], + uint64_t* smem_bar, + const uint16_t& ctaMask); +*/ +#if __cccl_ptx_isa >= 800 +extern "C" _CCCL_DEVICE void __cuda_ptx_cp_async_bulk_tensor_is_not_supported_before_SM_100a_SM_101a__(); +template +_CCCL_DEVICE static inline void cp_async_bulk_tensor( + space_cluster_t, + space_global_t, + cta_group_t<_Cta_Group> __cta_group, + void* __dstMem, + const void* __tensorMap, + const _CUDA_VSTD::int32_t (&__tensorCoords)[2], + _CUDA_VSTD::uint64_t* __smem_bar, + const _CUDA_VSTD::uint16_t& __ctaMask) { // __space == space_cluster (due to parameter type constraint) // __space == space_global (due to parameter type constraint) - NV_IF_ELSE_TARGET( - NV_HAS_FEATURE_SM_90a, - (asm("cp.async.bulk.tensor.2d.shared::cluster.global.tile.mbarrier::complete_tx::bytes.multicast::cluster [%0], " - "[%1, {%2, %3}], [%4], %5; // 2b." - : - : "r"(__as_ptr_smem(__dstMem)), - "l"(__tensorMap), - "r"(__tensorCoords[0]), - "r"(__tensorCoords[1]), - "r"(__as_ptr_smem(__smem_bar)), - "h"(__ctaMask) - : "memory");), - ( - // Unsupported architectures will have a linker error with a semi-decent error message - __cuda_ptx_cp_async_bulk_tensor_is_not_supported_before_SM_90a__();)); + static_assert(__cta_group == cta_group_1 || __cta_group == cta_group_2, ""); +# if _CCCL_CUDA_COMPILER(NVHPC) || __CUDA_ARCH_FEAT_SM100_ALL || __CUDA_ARCH_FEAT_SM101_ALL + _CCCL_IF_CONSTEXPR (__cta_group == cta_group_1) + { + asm("cp.async.bulk.tensor.2d.shared::cluster.global.tile.mbarrier::complete_tx::bytes.multicast::cluster.cta_group:" + ":1 [%0], [%1, {%2, %3}], [%4], %5;" + : + : "r"(__as_ptr_smem(__dstMem)), + "l"(__tensorMap), + "r"(__tensorCoords[0]), + "r"(__tensorCoords[1]), + "r"(__as_ptr_smem(__smem_bar)), + "h"(__ctaMask) + : "memory"); + } + else _CCCL_IF_CONSTEXPR (__cta_group == cta_group_2) + { + asm("cp.async.bulk.tensor.2d.shared::cluster.global.tile.mbarrier::complete_tx::bytes.multicast::cluster.cta_group:" + ":2 [%0], [%1, {%2, %3}], [%4], %5;" + : + : "r"(__as_ptr_smem(__dstMem)), + "l"(__tensorMap), + "r"(__tensorCoords[0]), + "r"(__tensorCoords[1]), + "r"(__as_ptr_smem(__smem_bar)), + "h"(__ctaMask) + : "memory"); + } +# else + // Unsupported architectures will have a linker error with a semi-decent error message + __cuda_ptx_cp_async_bulk_tensor_is_not_supported_before_SM_100a_SM_101a__(); +# endif } #endif // __cccl_ptx_isa >= 800 /* // cp.async.bulk.tensor.3d.dst.src.tile.mbarrier::complete_tx::bytes.multicast::cluster [dstMem], [tensorMap, -tensorCoords], [smem_bar], ctaMask; // 2c. 
PTX ISA 80, SM_90a +tensorCoords], [smem_bar], ctaMask; // PTX ISA 80, SM_90a, SM_100a, SM_101a // .dst = { .shared::cluster } // .src = { .global } template @@ -112,7 +246,7 @@ __device__ static inline void cp_async_bulk_tensor( const uint16_t& ctaMask); */ #if __cccl_ptx_isa >= 800 -extern "C" _CCCL_DEVICE void __cuda_ptx_cp_async_bulk_tensor_is_not_supported_before_SM_90a__(); +extern "C" _CCCL_DEVICE void __cuda_ptx_cp_async_bulk_tensor_is_not_supported_before_SM_90a_SM_100a_SM_101a__(); template _CCCL_DEVICE static inline void cp_async_bulk_tensor( space_cluster_t, @@ -122,31 +256,101 @@ _CCCL_DEVICE static inline void cp_async_bulk_tensor( const _CUDA_VSTD::int32_t (&__tensorCoords)[3], _CUDA_VSTD::uint64_t* __smem_bar, const _CUDA_VSTD::uint16_t& __ctaMask) +{ +// __space == space_cluster (due to parameter type constraint) +// __space == space_global (due to parameter type constraint) +# if _CCCL_CUDA_COMPILER(NVHPC) || __CUDA_ARCH_FEAT_SM90_ALL || __CUDA_ARCH_FEAT_SM100_ALL \ + || __CUDA_ARCH_FEAT_SM101_ALL + asm("cp.async.bulk.tensor.3d.shared::cluster.global.tile.mbarrier::complete_tx::bytes.multicast::cluster [%0], [%1, " + "{%2, %3, %4}], [%5], %6;" + : + : "r"(__as_ptr_smem(__dstMem)), + "l"(__tensorMap), + "r"(__tensorCoords[0]), + "r"(__tensorCoords[1]), + "r"(__tensorCoords[2]), + "r"(__as_ptr_smem(__smem_bar)), + "h"(__ctaMask) + : "memory"); +# else + // Unsupported architectures will have a linker error with a semi-decent error message + __cuda_ptx_cp_async_bulk_tensor_is_not_supported_before_SM_90a_SM_100a_SM_101a__(); +# endif +} +#endif // __cccl_ptx_isa >= 800 + +/* +// cp.async.bulk.tensor.3d.dst.src.tile.mbarrier::complete_tx::bytes.multicast::cluster.cta_group [dstMem], [tensorMap, +tensorCoords], [smem_bar], ctaMask; // PTX ISA 80, SM_100a, SM_101a +// .dst = { .shared::cluster } +// .src = { .global } +// .cta_group = { .cta_group::1, .cta_group::2 } +template +__device__ static inline void cp_async_bulk_tensor( + cuda::ptx::space_cluster_t, + cuda::ptx::space_global_t, + cuda::ptx::cta_group_t cta_group, + void* dstMem, + const void* tensorMap, + const int32_t (&tensorCoords)[3], + uint64_t* smem_bar, + const uint16_t& ctaMask); +*/ +#if __cccl_ptx_isa >= 800 +extern "C" _CCCL_DEVICE void __cuda_ptx_cp_async_bulk_tensor_is_not_supported_before_SM_100a_SM_101a__(); +template +_CCCL_DEVICE static inline void cp_async_bulk_tensor( + space_cluster_t, + space_global_t, + cta_group_t<_Cta_Group> __cta_group, + void* __dstMem, + const void* __tensorMap, + const _CUDA_VSTD::int32_t (&__tensorCoords)[3], + _CUDA_VSTD::uint64_t* __smem_bar, + const _CUDA_VSTD::uint16_t& __ctaMask) { // __space == space_cluster (due to parameter type constraint) // __space == space_global (due to parameter type constraint) - NV_IF_ELSE_TARGET( - NV_HAS_FEATURE_SM_90a, - (asm("cp.async.bulk.tensor.3d.shared::cluster.global.tile.mbarrier::complete_tx::bytes.multicast::cluster [%0], " - "[%1, {%2, %3, %4}], [%5], %6; // 2c." 
- : - : "r"(__as_ptr_smem(__dstMem)), - "l"(__tensorMap), - "r"(__tensorCoords[0]), - "r"(__tensorCoords[1]), - "r"(__tensorCoords[2]), - "r"(__as_ptr_smem(__smem_bar)), - "h"(__ctaMask) - : "memory");), - ( - // Unsupported architectures will have a linker error with a semi-decent error message - __cuda_ptx_cp_async_bulk_tensor_is_not_supported_before_SM_90a__();)); + static_assert(__cta_group == cta_group_1 || __cta_group == cta_group_2, ""); +# if _CCCL_CUDA_COMPILER(NVHPC) || __CUDA_ARCH_FEAT_SM100_ALL || __CUDA_ARCH_FEAT_SM101_ALL + _CCCL_IF_CONSTEXPR (__cta_group == cta_group_1) + { + asm("cp.async.bulk.tensor.3d.shared::cluster.global.tile.mbarrier::complete_tx::bytes.multicast::cluster.cta_group:" + ":1 [%0], [%1, {%2, %3, %4}], [%5], %6;" + : + : "r"(__as_ptr_smem(__dstMem)), + "l"(__tensorMap), + "r"(__tensorCoords[0]), + "r"(__tensorCoords[1]), + "r"(__tensorCoords[2]), + "r"(__as_ptr_smem(__smem_bar)), + "h"(__ctaMask) + : "memory"); + } + else _CCCL_IF_CONSTEXPR (__cta_group == cta_group_2) + { + asm("cp.async.bulk.tensor.3d.shared::cluster.global.tile.mbarrier::complete_tx::bytes.multicast::cluster.cta_group:" + ":2 [%0], [%1, {%2, %3, %4}], [%5], %6;" + : + : "r"(__as_ptr_smem(__dstMem)), + "l"(__tensorMap), + "r"(__tensorCoords[0]), + "r"(__tensorCoords[1]), + "r"(__tensorCoords[2]), + "r"(__as_ptr_smem(__smem_bar)), + "h"(__ctaMask) + : "memory"); + } +# else + // Unsupported architectures will have a linker error with a semi-decent error message + __cuda_ptx_cp_async_bulk_tensor_is_not_supported_before_SM_100a_SM_101a__(); +# endif } #endif // __cccl_ptx_isa >= 800 /* // cp.async.bulk.tensor.4d.dst.src.tile.mbarrier::complete_tx::bytes.multicast::cluster [dstMem], [tensorMap, -tensorCoords], [smem_bar], ctaMask; // 2d. PTX ISA 80, SM_90a +tensorCoords], [smem_bar], ctaMask; // PTX ISA 80, SM_90a, SM_100a, SM_101a // .dst = { .shared::cluster } // .src = { .global } template @@ -160,7 +364,7 @@ __device__ static inline void cp_async_bulk_tensor( const uint16_t& ctaMask); */ #if __cccl_ptx_isa >= 800 -extern "C" _CCCL_DEVICE void __cuda_ptx_cp_async_bulk_tensor_is_not_supported_before_SM_90a__(); +extern "C" _CCCL_DEVICE void __cuda_ptx_cp_async_bulk_tensor_is_not_supported_before_SM_90a_SM_100a_SM_101a__(); template _CCCL_DEVICE static inline void cp_async_bulk_tensor( space_cluster_t, @@ -170,32 +374,104 @@ _CCCL_DEVICE static inline void cp_async_bulk_tensor( const _CUDA_VSTD::int32_t (&__tensorCoords)[4], _CUDA_VSTD::uint64_t* __smem_bar, const _CUDA_VSTD::uint16_t& __ctaMask) +{ +// __space == space_cluster (due to parameter type constraint) +// __space == space_global (due to parameter type constraint) +# if _CCCL_CUDA_COMPILER(NVHPC) || __CUDA_ARCH_FEAT_SM90_ALL || __CUDA_ARCH_FEAT_SM100_ALL \ + || __CUDA_ARCH_FEAT_SM101_ALL + asm("cp.async.bulk.tensor.4d.shared::cluster.global.tile.mbarrier::complete_tx::bytes.multicast::cluster [%0], [%1, " + "{%2, %3, %4, %5}], [%6], %7;" + : + : "r"(__as_ptr_smem(__dstMem)), + "l"(__tensorMap), + "r"(__tensorCoords[0]), + "r"(__tensorCoords[1]), + "r"(__tensorCoords[2]), + "r"(__tensorCoords[3]), + "r"(__as_ptr_smem(__smem_bar)), + "h"(__ctaMask) + : "memory"); +# else + // Unsupported architectures will have a linker error with a semi-decent error message + __cuda_ptx_cp_async_bulk_tensor_is_not_supported_before_SM_90a_SM_100a_SM_101a__(); +# endif +} +#endif // __cccl_ptx_isa >= 800 + +/* +// cp.async.bulk.tensor.4d.dst.src.tile.mbarrier::complete_tx::bytes.multicast::cluster.cta_group [dstMem], [tensorMap, +tensorCoords], 
[smem_bar], ctaMask; // PTX ISA 80, SM_100a, SM_101a +// .dst = { .shared::cluster } +// .src = { .global } +// .cta_group = { .cta_group::1, .cta_group::2 } +template +__device__ static inline void cp_async_bulk_tensor( + cuda::ptx::space_cluster_t, + cuda::ptx::space_global_t, + cuda::ptx::cta_group_t cta_group, + void* dstMem, + const void* tensorMap, + const int32_t (&tensorCoords)[4], + uint64_t* smem_bar, + const uint16_t& ctaMask); +*/ +#if __cccl_ptx_isa >= 800 +extern "C" _CCCL_DEVICE void __cuda_ptx_cp_async_bulk_tensor_is_not_supported_before_SM_100a_SM_101a__(); +template +_CCCL_DEVICE static inline void cp_async_bulk_tensor( + space_cluster_t, + space_global_t, + cta_group_t<_Cta_Group> __cta_group, + void* __dstMem, + const void* __tensorMap, + const _CUDA_VSTD::int32_t (&__tensorCoords)[4], + _CUDA_VSTD::uint64_t* __smem_bar, + const _CUDA_VSTD::uint16_t& __ctaMask) { // __space == space_cluster (due to parameter type constraint) // __space == space_global (due to parameter type constraint) - NV_IF_ELSE_TARGET( - NV_HAS_FEATURE_SM_90a, - (asm("cp.async.bulk.tensor.4d.shared::cluster.global.tile.mbarrier::complete_tx::bytes.multicast::cluster [%0], " - "[%1, {%2, %3, %4, %5}], [%6], %7; // 2d." - : - : "r"(__as_ptr_smem(__dstMem)), - "l"(__tensorMap), - "r"(__tensorCoords[0]), - "r"(__tensorCoords[1]), - "r"(__tensorCoords[2]), - "r"(__tensorCoords[3]), - "r"(__as_ptr_smem(__smem_bar)), - "h"(__ctaMask) - : "memory");), - ( - // Unsupported architectures will have a linker error with a semi-decent error message - __cuda_ptx_cp_async_bulk_tensor_is_not_supported_before_SM_90a__();)); + static_assert(__cta_group == cta_group_1 || __cta_group == cta_group_2, ""); +# if _CCCL_CUDA_COMPILER(NVHPC) || __CUDA_ARCH_FEAT_SM100_ALL || __CUDA_ARCH_FEAT_SM101_ALL + _CCCL_IF_CONSTEXPR (__cta_group == cta_group_1) + { + asm("cp.async.bulk.tensor.4d.shared::cluster.global.tile.mbarrier::complete_tx::bytes.multicast::cluster.cta_group:" + ":1 [%0], [%1, {%2, %3, %4, %5}], [%6], %7;" + : + : "r"(__as_ptr_smem(__dstMem)), + "l"(__tensorMap), + "r"(__tensorCoords[0]), + "r"(__tensorCoords[1]), + "r"(__tensorCoords[2]), + "r"(__tensorCoords[3]), + "r"(__as_ptr_smem(__smem_bar)), + "h"(__ctaMask) + : "memory"); + } + else _CCCL_IF_CONSTEXPR (__cta_group == cta_group_2) + { + asm("cp.async.bulk.tensor.4d.shared::cluster.global.tile.mbarrier::complete_tx::bytes.multicast::cluster.cta_group:" + ":2 [%0], [%1, {%2, %3, %4, %5}], [%6], %7;" + : + : "r"(__as_ptr_smem(__dstMem)), + "l"(__tensorMap), + "r"(__tensorCoords[0]), + "r"(__tensorCoords[1]), + "r"(__tensorCoords[2]), + "r"(__tensorCoords[3]), + "r"(__as_ptr_smem(__smem_bar)), + "h"(__ctaMask) + : "memory"); + } +# else + // Unsupported architectures will have a linker error with a semi-decent error message + __cuda_ptx_cp_async_bulk_tensor_is_not_supported_before_SM_100a_SM_101a__(); +# endif } #endif // __cccl_ptx_isa >= 800 /* // cp.async.bulk.tensor.5d.dst.src.tile.mbarrier::complete_tx::bytes.multicast::cluster [dstMem], [tensorMap, -tensorCoords], [smem_bar], ctaMask; // 2e. 
PTX ISA 80, SM_90a +tensorCoords], [smem_bar], ctaMask; // PTX ISA 80, SM_90a, SM_100a, SM_101a // .dst = { .shared::cluster } // .src = { .global } template @@ -209,7 +485,7 @@ __device__ static inline void cp_async_bulk_tensor( const uint16_t& ctaMask); */ #if __cccl_ptx_isa >= 800 -extern "C" _CCCL_DEVICE void __cuda_ptx_cp_async_bulk_tensor_is_not_supported_before_SM_90a__(); +extern "C" _CCCL_DEVICE void __cuda_ptx_cp_async_bulk_tensor_is_not_supported_before_SM_90a_SM_100a_SM_101a__(); template _CCCL_DEVICE static inline void cp_async_bulk_tensor( space_cluster_t, @@ -219,27 +495,101 @@ _CCCL_DEVICE static inline void cp_async_bulk_tensor( const _CUDA_VSTD::int32_t (&__tensorCoords)[5], _CUDA_VSTD::uint64_t* __smem_bar, const _CUDA_VSTD::uint16_t& __ctaMask) +{ +// __space == space_cluster (due to parameter type constraint) +// __space == space_global (due to parameter type constraint) +# if _CCCL_CUDA_COMPILER(NVHPC) || __CUDA_ARCH_FEAT_SM90_ALL || __CUDA_ARCH_FEAT_SM100_ALL \ + || __CUDA_ARCH_FEAT_SM101_ALL + asm("cp.async.bulk.tensor.5d.shared::cluster.global.tile.mbarrier::complete_tx::bytes.multicast::cluster [%0], [%1, " + "{%2, %3, %4, %5, %6}], [%7], %8;" + : + : "r"(__as_ptr_smem(__dstMem)), + "l"(__tensorMap), + "r"(__tensorCoords[0]), + "r"(__tensorCoords[1]), + "r"(__tensorCoords[2]), + "r"(__tensorCoords[3]), + "r"(__tensorCoords[4]), + "r"(__as_ptr_smem(__smem_bar)), + "h"(__ctaMask) + : "memory"); +# else + // Unsupported architectures will have a linker error with a semi-decent error message + __cuda_ptx_cp_async_bulk_tensor_is_not_supported_before_SM_90a_SM_100a_SM_101a__(); +# endif +} +#endif // __cccl_ptx_isa >= 800 + +/* +// cp.async.bulk.tensor.5d.dst.src.tile.mbarrier::complete_tx::bytes.multicast::cluster.cta_group [dstMem], [tensorMap, +tensorCoords], [smem_bar], ctaMask; // PTX ISA 80, SM_100a, SM_101a +// .dst = { .shared::cluster } +// .src = { .global } +// .cta_group = { .cta_group::1, .cta_group::2 } +template +__device__ static inline void cp_async_bulk_tensor( + cuda::ptx::space_cluster_t, + cuda::ptx::space_global_t, + cuda::ptx::cta_group_t cta_group, + void* dstMem, + const void* tensorMap, + const int32_t (&tensorCoords)[5], + uint64_t* smem_bar, + const uint16_t& ctaMask); +*/ +#if __cccl_ptx_isa >= 800 +extern "C" _CCCL_DEVICE void __cuda_ptx_cp_async_bulk_tensor_is_not_supported_before_SM_100a_SM_101a__(); +template +_CCCL_DEVICE static inline void cp_async_bulk_tensor( + space_cluster_t, + space_global_t, + cta_group_t<_Cta_Group> __cta_group, + void* __dstMem, + const void* __tensorMap, + const _CUDA_VSTD::int32_t (&__tensorCoords)[5], + _CUDA_VSTD::uint64_t* __smem_bar, + const _CUDA_VSTD::uint16_t& __ctaMask) { // __space == space_cluster (due to parameter type constraint) // __space == space_global (due to parameter type constraint) - NV_IF_ELSE_TARGET( - NV_HAS_FEATURE_SM_90a, - (asm("cp.async.bulk.tensor.5d.shared::cluster.global.tile.mbarrier::complete_tx::bytes.multicast::cluster [%0], " - "[%1, {%2, %3, %4, %5, %6}], [%7], %8; // 2e." 
- : - : "r"(__as_ptr_smem(__dstMem)), - "l"(__tensorMap), - "r"(__tensorCoords[0]), - "r"(__tensorCoords[1]), - "r"(__tensorCoords[2]), - "r"(__tensorCoords[3]), - "r"(__tensorCoords[4]), - "r"(__as_ptr_smem(__smem_bar)), - "h"(__ctaMask) - : "memory");), - ( - // Unsupported architectures will have a linker error with a semi-decent error message - __cuda_ptx_cp_async_bulk_tensor_is_not_supported_before_SM_90a__();)); + static_assert(__cta_group == cta_group_1 || __cta_group == cta_group_2, ""); +# if _CCCL_CUDA_COMPILER(NVHPC) || __CUDA_ARCH_FEAT_SM100_ALL || __CUDA_ARCH_FEAT_SM101_ALL + _CCCL_IF_CONSTEXPR (__cta_group == cta_group_1) + { + asm("cp.async.bulk.tensor.5d.shared::cluster.global.tile.mbarrier::complete_tx::bytes.multicast::cluster.cta_group:" + ":1 [%0], [%1, {%2, %3, %4, %5, %6}], [%7], %8;" + : + : "r"(__as_ptr_smem(__dstMem)), + "l"(__tensorMap), + "r"(__tensorCoords[0]), + "r"(__tensorCoords[1]), + "r"(__tensorCoords[2]), + "r"(__tensorCoords[3]), + "r"(__tensorCoords[4]), + "r"(__as_ptr_smem(__smem_bar)), + "h"(__ctaMask) + : "memory"); + } + else _CCCL_IF_CONSTEXPR (__cta_group == cta_group_2) + { + asm("cp.async.bulk.tensor.5d.shared::cluster.global.tile.mbarrier::complete_tx::bytes.multicast::cluster.cta_group:" + ":2 [%0], [%1, {%2, %3, %4, %5, %6}], [%7], %8;" + : + : "r"(__as_ptr_smem(__dstMem)), + "l"(__tensorMap), + "r"(__tensorCoords[0]), + "r"(__tensorCoords[1]), + "r"(__tensorCoords[2]), + "r"(__tensorCoords[3]), + "r"(__tensorCoords[4]), + "r"(__as_ptr_smem(__smem_bar)), + "h"(__ctaMask) + : "memory"); + } +# else + // Unsupported architectures will have a linker error with a semi-decent error message + __cuda_ptx_cp_async_bulk_tensor_is_not_supported_before_SM_100a_SM_101a__(); +# endif } #endif // __cccl_ptx_isa >= 800 diff --git a/libcudacxx/include/cuda/__ptx/instructions/generated/cp_async_bulk_wait_group.h b/libcudacxx/include/cuda/__ptx/instructions/generated/cp_async_bulk_wait_group.h index 2057323665a..b0373a3e6a7 100644 --- a/libcudacxx/include/cuda/__ptx/instructions/generated/cp_async_bulk_wait_group.h +++ b/libcudacxx/include/cuda/__ptx/instructions/generated/cp_async_bulk_wait_group.h @@ -14,15 +14,12 @@ extern "C" _CCCL_DEVICE void __cuda_ptx_cp_async_bulk_wait_group_is_not_supporte template _CCCL_DEVICE static inline void cp_async_bulk_wait_group(n32_t<_N32> __N) { - NV_IF_ELSE_TARGET( - NV_PROVIDES_SM_90, - (asm volatile("cp.async.bulk.wait_group %0;" - : - : "n"(__N.value) - : "memory");), - ( - // Unsupported architectures will have a linker error with a semi-decent error message - __cuda_ptx_cp_async_bulk_wait_group_is_not_supported_before_SM_90__();)); +# if _CCCL_CUDA_COMPILER(NVHPC) || __CUDA_ARCH__ >= 900 + asm volatile("cp.async.bulk.wait_group %0;" : : "n"(__N.value) : "memory"); +# else + // Unsupported architectures will have a linker error with a semi-decent error message + __cuda_ptx_cp_async_bulk_wait_group_is_not_supported_before_SM_90__(); +# endif } #endif // __cccl_ptx_isa >= 800 @@ -37,15 +34,12 @@ extern "C" _CCCL_DEVICE void __cuda_ptx_cp_async_bulk_wait_group_read_is_not_sup template _CCCL_DEVICE static inline void cp_async_bulk_wait_group_read(n32_t<_N32> __N) { - NV_IF_ELSE_TARGET( - NV_PROVIDES_SM_90, - (asm volatile("cp.async.bulk.wait_group.read %0;" - : - : "n"(__N.value) - : "memory");), - ( - // Unsupported architectures will have a linker error with a semi-decent error message - __cuda_ptx_cp_async_bulk_wait_group_read_is_not_supported_before_SM_90__();)); +# if _CCCL_CUDA_COMPILER(NVHPC) || __CUDA_ARCH__ >= 900 
+ asm volatile("cp.async.bulk.wait_group.read %0;" : : "n"(__N.value) : "memory"); +# else + // Unsupported architectures will have a linker error with a semi-decent error message + __cuda_ptx_cp_async_bulk_wait_group_read_is_not_supported_before_SM_90__(); +# endif } #endif // __cccl_ptx_isa >= 800 diff --git a/libcudacxx/include/cuda/__ptx/instructions/generated/cp_async_mbarrier_arrive.h b/libcudacxx/include/cuda/__ptx/instructions/generated/cp_async_mbarrier_arrive.h new file mode 100644 index 00000000000..b2bf07247c1 --- /dev/null +++ b/libcudacxx/include/cuda/__ptx/instructions/generated/cp_async_mbarrier_arrive.h @@ -0,0 +1,26 @@ +// This file was automatically generated. Do not edit. + +#ifndef _CUDA_PTX_GENERATED_CP_ASYNC_MBARRIER_ARRIVE_H_ +#define _CUDA_PTX_GENERATED_CP_ASYNC_MBARRIER_ARRIVE_H_ + +/* +// cp.async.mbarrier.arrive.b64 [addr]; // PTX ISA 70, SM_80 +template +__device__ static inline void cp_async_mbarrier_arrive( + uint64_t* addr); +*/ +#if __cccl_ptx_isa >= 700 +extern "C" _CCCL_DEVICE void __cuda_ptx_cp_async_mbarrier_arrive_is_not_supported_before_SM_80__(); +template +_CCCL_DEVICE static inline void cp_async_mbarrier_arrive(_CUDA_VSTD::uint64_t* __addr) +{ +# if _CCCL_CUDA_COMPILER(NVHPC) || __CUDA_ARCH__ >= 800 + asm("cp.async.mbarrier.arrive.b64 [%0];" : : "r"(__as_ptr_smem(__addr)) : "memory"); +# else + // Unsupported architectures will have a linker error with a semi-decent error message + __cuda_ptx_cp_async_mbarrier_arrive_is_not_supported_before_SM_80__(); +# endif +} +#endif // __cccl_ptx_isa >= 700 + +#endif // _CUDA_PTX_GENERATED_CP_ASYNC_MBARRIER_ARRIVE_H_ diff --git a/libcudacxx/include/cuda/__ptx/instructions/generated/cp_async_mbarrier_arrive_noinc.h b/libcudacxx/include/cuda/__ptx/instructions/generated/cp_async_mbarrier_arrive_noinc.h new file mode 100644 index 00000000000..816a3fc63b9 --- /dev/null +++ b/libcudacxx/include/cuda/__ptx/instructions/generated/cp_async_mbarrier_arrive_noinc.h @@ -0,0 +1,26 @@ +// This file was automatically generated. Do not edit. 
+ +#ifndef _CUDA_PTX_GENERATED_CP_ASYNC_MBARRIER_ARRIVE_NOINC_H_ +#define _CUDA_PTX_GENERATED_CP_ASYNC_MBARRIER_ARRIVE_NOINC_H_ + +/* +// cp.async.mbarrier.arrive.noinc.b64 [addr]; // PTX ISA 70, SM_80 +template +__device__ static inline void cp_async_mbarrier_arrive_noinc( + uint64_t* addr); +*/ +#if __cccl_ptx_isa >= 700 +extern "C" _CCCL_DEVICE void __cuda_ptx_cp_async_mbarrier_arrive_noinc_is_not_supported_before_SM_80__(); +template +_CCCL_DEVICE static inline void cp_async_mbarrier_arrive_noinc(_CUDA_VSTD::uint64_t* __addr) +{ +# if _CCCL_CUDA_COMPILER(NVHPC) || __CUDA_ARCH__ >= 800 + asm("cp.async.mbarrier.arrive.noinc.b64 [%0];" : : "r"(__as_ptr_smem(__addr)) : "memory"); +# else + // Unsupported architectures will have a linker error with a semi-decent error message + __cuda_ptx_cp_async_mbarrier_arrive_noinc_is_not_supported_before_SM_80__(); +# endif +} +#endif // __cccl_ptx_isa >= 700 + +#endif // _CUDA_PTX_GENERATED_CP_ASYNC_MBARRIER_ARRIVE_NOINC_H_ diff --git a/libcudacxx/include/cuda/__ptx/instructions/generated/cp_reduce_async_bulk.h b/libcudacxx/include/cuda/__ptx/instructions/generated/cp_reduce_async_bulk.h index a35684c85e1..499fda57c91 100644 --- a/libcudacxx/include/cuda/__ptx/instructions/generated/cp_reduce_async_bulk.h +++ b/libcudacxx/include/cuda/__ptx/instructions/generated/cp_reduce_async_bulk.h @@ -10,7 +10,7 @@ ISA 80, SM_90 // .src = { .shared::cta } // .type = { .b32 } // .op = { .and } -template +template = true> __device__ static inline void cp_reduce_async_bulk( cuda::ptx::space_cluster_t, cuda::ptx::space_shared_t, @@ -22,7 +22,7 @@ __device__ static inline void cp_reduce_async_bulk( */ #if __cccl_ptx_isa >= 800 extern "C" _CCCL_DEVICE void __cuda_ptx_cp_reduce_async_bulk_is_not_supported_before_SM_90__(); -template +template = true> _CCCL_DEVICE static inline void cp_reduce_async_bulk( space_cluster_t, space_shared_t, @@ -32,23 +32,23 @@ _CCCL_DEVICE static inline void cp_reduce_async_bulk( _CUDA_VSTD::uint32_t __size, _CUDA_VSTD::uint64_t* __rdsmem_bar) { - // __space == space_cluster (due to parameter type constraint) - // __space == space_shared (due to parameter type constraint) - // __type == type_b32 (due to parameter type constraint) - // __op == op_and_op (due to parameter type constraint) - NV_IF_ELSE_TARGET( - NV_PROVIDES_SM_90, - (asm("cp.reduce.async.bulk.shared::cluster.shared::cta.mbarrier::complete_tx::bytes.and.b32 [%0], [%1], %2, [%3]; " - "// 1." - : - : "r"(__as_ptr_remote_dsmem(__dstMem)), - "r"(__as_ptr_smem(__srcMem)), - "r"(__size), - "r"(__as_ptr_remote_dsmem(__rdsmem_bar)) - : "memory");), - ( - // Unsupported architectures will have a linker error with a semi-decent error message - __cuda_ptx_cp_reduce_async_bulk_is_not_supported_before_SM_90__();)); +// __space == space_cluster (due to parameter type constraint) +// __space == space_shared (due to parameter type constraint) +// __type == type_b32 (due to parameter type constraint) +// __op == op_and_op (due to parameter type constraint) +# if _CCCL_CUDA_COMPILER(NVHPC) || __CUDA_ARCH__ >= 900 + asm("cp.reduce.async.bulk.shared::cluster.shared::cta.mbarrier::complete_tx::bytes.and.b32 [%0], [%1], %2, [%3]; // " + "1." 
+ : + : "r"(__as_ptr_remote_dsmem(__dstMem)), + "r"(__as_ptr_smem(__srcMem)), + "r"(__size), + "r"(__as_ptr_remote_dsmem(__rdsmem_bar)) + : "memory"); +# else + // Unsupported architectures will have a linker error with a semi-decent error message + __cuda_ptx_cp_reduce_async_bulk_is_not_supported_before_SM_90__(); +# endif } #endif // __cccl_ptx_isa >= 800 @@ -59,7 +59,7 @@ ISA 80, SM_90 // .src = { .shared::cta } // .type = { .b32 } // .op = { .or } -template +template = true> __device__ static inline void cp_reduce_async_bulk( cuda::ptx::space_cluster_t, cuda::ptx::space_shared_t, @@ -71,7 +71,7 @@ __device__ static inline void cp_reduce_async_bulk( */ #if __cccl_ptx_isa >= 800 extern "C" _CCCL_DEVICE void __cuda_ptx_cp_reduce_async_bulk_is_not_supported_before_SM_90__(); -template +template = true> _CCCL_DEVICE static inline void cp_reduce_async_bulk( space_cluster_t, space_shared_t, @@ -81,23 +81,22 @@ _CCCL_DEVICE static inline void cp_reduce_async_bulk( _CUDA_VSTD::uint32_t __size, _CUDA_VSTD::uint64_t* __rdsmem_bar) { - // __space == space_cluster (due to parameter type constraint) - // __space == space_shared (due to parameter type constraint) - // __type == type_b32 (due to parameter type constraint) - // __op == op_or_op (due to parameter type constraint) - NV_IF_ELSE_TARGET( - NV_PROVIDES_SM_90, - (asm("cp.reduce.async.bulk.shared::cluster.shared::cta.mbarrier::complete_tx::bytes.or.b32 [%0], [%1], %2, [%3]; " - "// 1." - : - : "r"(__as_ptr_remote_dsmem(__dstMem)), - "r"(__as_ptr_smem(__srcMem)), - "r"(__size), - "r"(__as_ptr_remote_dsmem(__rdsmem_bar)) - : "memory");), - ( - // Unsupported architectures will have a linker error with a semi-decent error message - __cuda_ptx_cp_reduce_async_bulk_is_not_supported_before_SM_90__();)); +// __space == space_cluster (due to parameter type constraint) +// __space == space_shared (due to parameter type constraint) +// __type == type_b32 (due to parameter type constraint) +// __op == op_or_op (due to parameter type constraint) +# if _CCCL_CUDA_COMPILER(NVHPC) || __CUDA_ARCH__ >= 900 + asm("cp.reduce.async.bulk.shared::cluster.shared::cta.mbarrier::complete_tx::bytes.or.b32 [%0], [%1], %2, [%3]; // 1." 
+ : + : "r"(__as_ptr_remote_dsmem(__dstMem)), + "r"(__as_ptr_smem(__srcMem)), + "r"(__size), + "r"(__as_ptr_remote_dsmem(__rdsmem_bar)) + : "memory"); +# else + // Unsupported architectures will have a linker error with a semi-decent error message + __cuda_ptx_cp_reduce_async_bulk_is_not_supported_before_SM_90__(); +# endif } #endif // __cccl_ptx_isa >= 800 @@ -108,7 +107,7 @@ ISA 80, SM_90 // .src = { .shared::cta } // .type = { .b32 } // .op = { .xor } -template +template = true> __device__ static inline void cp_reduce_async_bulk( cuda::ptx::space_cluster_t, cuda::ptx::space_shared_t, @@ -120,7 +119,7 @@ __device__ static inline void cp_reduce_async_bulk( */ #if __cccl_ptx_isa >= 800 extern "C" _CCCL_DEVICE void __cuda_ptx_cp_reduce_async_bulk_is_not_supported_before_SM_90__(); -template +template = true> _CCCL_DEVICE static inline void cp_reduce_async_bulk( space_cluster_t, space_shared_t, @@ -130,23 +129,23 @@ _CCCL_DEVICE static inline void cp_reduce_async_bulk( _CUDA_VSTD::uint32_t __size, _CUDA_VSTD::uint64_t* __rdsmem_bar) { - // __space == space_cluster (due to parameter type constraint) - // __space == space_shared (due to parameter type constraint) - // __type == type_b32 (due to parameter type constraint) - // __op == op_xor_op (due to parameter type constraint) - NV_IF_ELSE_TARGET( - NV_PROVIDES_SM_90, - (asm("cp.reduce.async.bulk.shared::cluster.shared::cta.mbarrier::complete_tx::bytes.xor.b32 [%0], [%1], %2, [%3]; " - "// 1." - : - : "r"(__as_ptr_remote_dsmem(__dstMem)), - "r"(__as_ptr_smem(__srcMem)), - "r"(__size), - "r"(__as_ptr_remote_dsmem(__rdsmem_bar)) - : "memory");), - ( - // Unsupported architectures will have a linker error with a semi-decent error message - __cuda_ptx_cp_reduce_async_bulk_is_not_supported_before_SM_90__();)); +// __space == space_cluster (due to parameter type constraint) +// __space == space_shared (due to parameter type constraint) +// __type == type_b32 (due to parameter type constraint) +// __op == op_xor_op (due to parameter type constraint) +# if _CCCL_CUDA_COMPILER(NVHPC) || __CUDA_ARCH__ >= 900 + asm("cp.reduce.async.bulk.shared::cluster.shared::cta.mbarrier::complete_tx::bytes.xor.b32 [%0], [%1], %2, [%3]; // " + "1." + : + : "r"(__as_ptr_remote_dsmem(__dstMem)), + "r"(__as_ptr_smem(__srcMem)), + "r"(__size), + "r"(__as_ptr_remote_dsmem(__rdsmem_bar)) + : "memory"); +# else + // Unsupported architectures will have a linker error with a semi-decent error message + __cuda_ptx_cp_reduce_async_bulk_is_not_supported_before_SM_90__(); +# endif } #endif // __cccl_ptx_isa >= 800 @@ -179,23 +178,23 @@ _CCCL_DEVICE static inline void cp_reduce_async_bulk( _CUDA_VSTD::uint32_t __size, _CUDA_VSTD::uint64_t* __rdsmem_bar) { - // __space == space_cluster (due to parameter type constraint) - // __space == space_shared (due to parameter type constraint) - // __type == type_u32 (due to parameter type constraint) - // __op == op_min (due to parameter type constraint) - NV_IF_ELSE_TARGET( - NV_PROVIDES_SM_90, - (asm("cp.reduce.async.bulk.shared::cluster.shared::cta.mbarrier::complete_tx::bytes.min.u32 [%0], [%1], %2, [%3]; " - "// 1." 
- : - : "r"(__as_ptr_remote_dsmem(__dstMem)), - "r"(__as_ptr_smem(__srcMem)), - "r"(__size), - "r"(__as_ptr_remote_dsmem(__rdsmem_bar)) - : "memory");), - ( - // Unsupported architectures will have a linker error with a semi-decent error message - __cuda_ptx_cp_reduce_async_bulk_is_not_supported_before_SM_90__();)); +// __space == space_cluster (due to parameter type constraint) +// __space == space_shared (due to parameter type constraint) +// __type == type_u32 (due to parameter type constraint) +// __op == op_min (due to parameter type constraint) +# if _CCCL_CUDA_COMPILER(NVHPC) || __CUDA_ARCH__ >= 900 + asm("cp.reduce.async.bulk.shared::cluster.shared::cta.mbarrier::complete_tx::bytes.min.u32 [%0], [%1], %2, [%3]; // " + "1." + : + : "r"(__as_ptr_remote_dsmem(__dstMem)), + "r"(__as_ptr_smem(__srcMem)), + "r"(__size), + "r"(__as_ptr_remote_dsmem(__rdsmem_bar)) + : "memory"); +# else + // Unsupported architectures will have a linker error with a semi-decent error message + __cuda_ptx_cp_reduce_async_bulk_is_not_supported_before_SM_90__(); +# endif } #endif // __cccl_ptx_isa >= 800 @@ -228,23 +227,23 @@ _CCCL_DEVICE static inline void cp_reduce_async_bulk( _CUDA_VSTD::uint32_t __size, _CUDA_VSTD::uint64_t* __rdsmem_bar) { - // __space == space_cluster (due to parameter type constraint) - // __space == space_shared (due to parameter type constraint) - // __type == type_u32 (due to parameter type constraint) - // __op == op_max (due to parameter type constraint) - NV_IF_ELSE_TARGET( - NV_PROVIDES_SM_90, - (asm("cp.reduce.async.bulk.shared::cluster.shared::cta.mbarrier::complete_tx::bytes.max.u32 [%0], [%1], %2, [%3]; " - "// 1." - : - : "r"(__as_ptr_remote_dsmem(__dstMem)), - "r"(__as_ptr_smem(__srcMem)), - "r"(__size), - "r"(__as_ptr_remote_dsmem(__rdsmem_bar)) - : "memory");), - ( - // Unsupported architectures will have a linker error with a semi-decent error message - __cuda_ptx_cp_reduce_async_bulk_is_not_supported_before_SM_90__();)); +// __space == space_cluster (due to parameter type constraint) +// __space == space_shared (due to parameter type constraint) +// __type == type_u32 (due to parameter type constraint) +// __op == op_max (due to parameter type constraint) +# if _CCCL_CUDA_COMPILER(NVHPC) || __CUDA_ARCH__ >= 900 + asm("cp.reduce.async.bulk.shared::cluster.shared::cta.mbarrier::complete_tx::bytes.max.u32 [%0], [%1], %2, [%3]; // " + "1." + : + : "r"(__as_ptr_remote_dsmem(__dstMem)), + "r"(__as_ptr_smem(__srcMem)), + "r"(__size), + "r"(__as_ptr_remote_dsmem(__rdsmem_bar)) + : "memory"); +# else + // Unsupported architectures will have a linker error with a semi-decent error message + __cuda_ptx_cp_reduce_async_bulk_is_not_supported_before_SM_90__(); +# endif } #endif // __cccl_ptx_isa >= 800 @@ -277,23 +276,23 @@ _CCCL_DEVICE static inline void cp_reduce_async_bulk( _CUDA_VSTD::uint32_t __size, _CUDA_VSTD::uint64_t* __rdsmem_bar) { - // __space == space_cluster (due to parameter type constraint) - // __space == space_shared (due to parameter type constraint) - // __type == type_u32 (due to parameter type constraint) - // __op == op_add (due to parameter type constraint) - NV_IF_ELSE_TARGET( - NV_PROVIDES_SM_90, - (asm("cp.reduce.async.bulk.shared::cluster.shared::cta.mbarrier::complete_tx::bytes.add.u32 [%0], [%1], %2, [%3]; " - "// 1." 
- : - : "r"(__as_ptr_remote_dsmem(__dstMem)), - "r"(__as_ptr_smem(__srcMem)), - "r"(__size), - "r"(__as_ptr_remote_dsmem(__rdsmem_bar)) - : "memory");), - ( - // Unsupported architectures will have a linker error with a semi-decent error message - __cuda_ptx_cp_reduce_async_bulk_is_not_supported_before_SM_90__();)); +// __space == space_cluster (due to parameter type constraint) +// __space == space_shared (due to parameter type constraint) +// __type == type_u32 (due to parameter type constraint) +// __op == op_add (due to parameter type constraint) +# if _CCCL_CUDA_COMPILER(NVHPC) || __CUDA_ARCH__ >= 900 + asm("cp.reduce.async.bulk.shared::cluster.shared::cta.mbarrier::complete_tx::bytes.add.u32 [%0], [%1], %2, [%3]; // " + "1." + : + : "r"(__as_ptr_remote_dsmem(__dstMem)), + "r"(__as_ptr_smem(__srcMem)), + "r"(__size), + "r"(__as_ptr_remote_dsmem(__rdsmem_bar)) + : "memory"); +# else + // Unsupported architectures will have a linker error with a semi-decent error message + __cuda_ptx_cp_reduce_async_bulk_is_not_supported_before_SM_90__(); +# endif } #endif // __cccl_ptx_isa >= 800 @@ -326,23 +325,23 @@ _CCCL_DEVICE static inline void cp_reduce_async_bulk( _CUDA_VSTD::uint32_t __size, _CUDA_VSTD::uint64_t* __rdsmem_bar) { - // __space == space_cluster (due to parameter type constraint) - // __space == space_shared (due to parameter type constraint) - // __type == type_u32 (due to parameter type constraint) - // __op == op_inc (due to parameter type constraint) - NV_IF_ELSE_TARGET( - NV_PROVIDES_SM_90, - (asm("cp.reduce.async.bulk.shared::cluster.shared::cta.mbarrier::complete_tx::bytes.inc.u32 [%0], [%1], %2, [%3]; " - "// 1." - : - : "r"(__as_ptr_remote_dsmem(__dstMem)), - "r"(__as_ptr_smem(__srcMem)), - "r"(__size), - "r"(__as_ptr_remote_dsmem(__rdsmem_bar)) - : "memory");), - ( - // Unsupported architectures will have a linker error with a semi-decent error message - __cuda_ptx_cp_reduce_async_bulk_is_not_supported_before_SM_90__();)); +// __space == space_cluster (due to parameter type constraint) +// __space == space_shared (due to parameter type constraint) +// __type == type_u32 (due to parameter type constraint) +// __op == op_inc (due to parameter type constraint) +# if _CCCL_CUDA_COMPILER(NVHPC) || __CUDA_ARCH__ >= 900 + asm("cp.reduce.async.bulk.shared::cluster.shared::cta.mbarrier::complete_tx::bytes.inc.u32 [%0], [%1], %2, [%3]; // " + "1." + : + : "r"(__as_ptr_remote_dsmem(__dstMem)), + "r"(__as_ptr_smem(__srcMem)), + "r"(__size), + "r"(__as_ptr_remote_dsmem(__rdsmem_bar)) + : "memory"); +# else + // Unsupported architectures will have a linker error with a semi-decent error message + __cuda_ptx_cp_reduce_async_bulk_is_not_supported_before_SM_90__(); +# endif } #endif // __cccl_ptx_isa >= 800 @@ -375,23 +374,23 @@ _CCCL_DEVICE static inline void cp_reduce_async_bulk( _CUDA_VSTD::uint32_t __size, _CUDA_VSTD::uint64_t* __rdsmem_bar) { - // __space == space_cluster (due to parameter type constraint) - // __space == space_shared (due to parameter type constraint) - // __type == type_u32 (due to parameter type constraint) - // __op == op_dec (due to parameter type constraint) - NV_IF_ELSE_TARGET( - NV_PROVIDES_SM_90, - (asm("cp.reduce.async.bulk.shared::cluster.shared::cta.mbarrier::complete_tx::bytes.dec.u32 [%0], [%1], %2, [%3]; " - "// 1." 
- : - : "r"(__as_ptr_remote_dsmem(__dstMem)), - "r"(__as_ptr_smem(__srcMem)), - "r"(__size), - "r"(__as_ptr_remote_dsmem(__rdsmem_bar)) - : "memory");), - ( - // Unsupported architectures will have a linker error with a semi-decent error message - __cuda_ptx_cp_reduce_async_bulk_is_not_supported_before_SM_90__();)); +// __space == space_cluster (due to parameter type constraint) +// __space == space_shared (due to parameter type constraint) +// __type == type_u32 (due to parameter type constraint) +// __op == op_dec (due to parameter type constraint) +# if _CCCL_CUDA_COMPILER(NVHPC) || __CUDA_ARCH__ >= 900 + asm("cp.reduce.async.bulk.shared::cluster.shared::cta.mbarrier::complete_tx::bytes.dec.u32 [%0], [%1], %2, [%3]; // " + "1." + : + : "r"(__as_ptr_remote_dsmem(__dstMem)), + "r"(__as_ptr_smem(__srcMem)), + "r"(__size), + "r"(__as_ptr_remote_dsmem(__rdsmem_bar)) + : "memory"); +# else + // Unsupported architectures will have a linker error with a semi-decent error message + __cuda_ptx_cp_reduce_async_bulk_is_not_supported_before_SM_90__(); +# endif } #endif // __cccl_ptx_isa >= 800 @@ -424,23 +423,23 @@ _CCCL_DEVICE static inline void cp_reduce_async_bulk( _CUDA_VSTD::uint32_t __size, _CUDA_VSTD::uint64_t* __rdsmem_bar) { - // __space == space_cluster (due to parameter type constraint) - // __space == space_shared (due to parameter type constraint) - // __type == type_s32 (due to parameter type constraint) - // __op == op_min (due to parameter type constraint) - NV_IF_ELSE_TARGET( - NV_PROVIDES_SM_90, - (asm("cp.reduce.async.bulk.shared::cluster.shared::cta.mbarrier::complete_tx::bytes.min.s32 [%0], [%1], %2, [%3]; " - "// 1." - : - : "r"(__as_ptr_remote_dsmem(__dstMem)), - "r"(__as_ptr_smem(__srcMem)), - "r"(__size), - "r"(__as_ptr_remote_dsmem(__rdsmem_bar)) - : "memory");), - ( - // Unsupported architectures will have a linker error with a semi-decent error message - __cuda_ptx_cp_reduce_async_bulk_is_not_supported_before_SM_90__();)); +// __space == space_cluster (due to parameter type constraint) +// __space == space_shared (due to parameter type constraint) +// __type == type_s32 (due to parameter type constraint) +// __op == op_min (due to parameter type constraint) +# if _CCCL_CUDA_COMPILER(NVHPC) || __CUDA_ARCH__ >= 900 + asm("cp.reduce.async.bulk.shared::cluster.shared::cta.mbarrier::complete_tx::bytes.min.s32 [%0], [%1], %2, [%3]; // " + "1." + : + : "r"(__as_ptr_remote_dsmem(__dstMem)), + "r"(__as_ptr_smem(__srcMem)), + "r"(__size), + "r"(__as_ptr_remote_dsmem(__rdsmem_bar)) + : "memory"); +# else + // Unsupported architectures will have a linker error with a semi-decent error message + __cuda_ptx_cp_reduce_async_bulk_is_not_supported_before_SM_90__(); +# endif } #endif // __cccl_ptx_isa >= 800 @@ -473,23 +472,23 @@ _CCCL_DEVICE static inline void cp_reduce_async_bulk( _CUDA_VSTD::uint32_t __size, _CUDA_VSTD::uint64_t* __rdsmem_bar) { - // __space == space_cluster (due to parameter type constraint) - // __space == space_shared (due to parameter type constraint) - // __type == type_s32 (due to parameter type constraint) - // __op == op_max (due to parameter type constraint) - NV_IF_ELSE_TARGET( - NV_PROVIDES_SM_90, - (asm("cp.reduce.async.bulk.shared::cluster.shared::cta.mbarrier::complete_tx::bytes.max.s32 [%0], [%1], %2, [%3]; " - "// 1." 
- : - : "r"(__as_ptr_remote_dsmem(__dstMem)), - "r"(__as_ptr_smem(__srcMem)), - "r"(__size), - "r"(__as_ptr_remote_dsmem(__rdsmem_bar)) - : "memory");), - ( - // Unsupported architectures will have a linker error with a semi-decent error message - __cuda_ptx_cp_reduce_async_bulk_is_not_supported_before_SM_90__();)); +// __space == space_cluster (due to parameter type constraint) +// __space == space_shared (due to parameter type constraint) +// __type == type_s32 (due to parameter type constraint) +// __op == op_max (due to parameter type constraint) +# if _CCCL_CUDA_COMPILER(NVHPC) || __CUDA_ARCH__ >= 900 + asm("cp.reduce.async.bulk.shared::cluster.shared::cta.mbarrier::complete_tx::bytes.max.s32 [%0], [%1], %2, [%3]; // " + "1." + : + : "r"(__as_ptr_remote_dsmem(__dstMem)), + "r"(__as_ptr_smem(__srcMem)), + "r"(__size), + "r"(__as_ptr_remote_dsmem(__rdsmem_bar)) + : "memory"); +# else + // Unsupported architectures will have a linker error with a semi-decent error message + __cuda_ptx_cp_reduce_async_bulk_is_not_supported_before_SM_90__(); +# endif } #endif // __cccl_ptx_isa >= 800 @@ -522,23 +521,23 @@ _CCCL_DEVICE static inline void cp_reduce_async_bulk( _CUDA_VSTD::uint32_t __size, _CUDA_VSTD::uint64_t* __rdsmem_bar) { - // __space == space_cluster (due to parameter type constraint) - // __space == space_shared (due to parameter type constraint) - // __type == type_s32 (due to parameter type constraint) - // __op == op_add (due to parameter type constraint) - NV_IF_ELSE_TARGET( - NV_PROVIDES_SM_90, - (asm("cp.reduce.async.bulk.shared::cluster.shared::cta.mbarrier::complete_tx::bytes.add.s32 [%0], [%1], %2, [%3]; " - "// 1." - : - : "r"(__as_ptr_remote_dsmem(__dstMem)), - "r"(__as_ptr_smem(__srcMem)), - "r"(__size), - "r"(__as_ptr_remote_dsmem(__rdsmem_bar)) - : "memory");), - ( - // Unsupported architectures will have a linker error with a semi-decent error message - __cuda_ptx_cp_reduce_async_bulk_is_not_supported_before_SM_90__();)); +// __space == space_cluster (due to parameter type constraint) +// __space == space_shared (due to parameter type constraint) +// __type == type_s32 (due to parameter type constraint) +// __op == op_add (due to parameter type constraint) +# if _CCCL_CUDA_COMPILER(NVHPC) || __CUDA_ARCH__ >= 900 + asm("cp.reduce.async.bulk.shared::cluster.shared::cta.mbarrier::complete_tx::bytes.add.s32 [%0], [%1], %2, [%3]; // " + "1." + : + : "r"(__as_ptr_remote_dsmem(__dstMem)), + "r"(__as_ptr_smem(__srcMem)), + "r"(__size), + "r"(__as_ptr_remote_dsmem(__rdsmem_bar)) + : "memory"); +# else + // Unsupported architectures will have a linker error with a semi-decent error message + __cuda_ptx_cp_reduce_async_bulk_is_not_supported_before_SM_90__(); +# endif } #endif // __cccl_ptx_isa >= 800 @@ -571,23 +570,23 @@ _CCCL_DEVICE static inline void cp_reduce_async_bulk( _CUDA_VSTD::uint32_t __size, _CUDA_VSTD::uint64_t* __rdsmem_bar) { - // __space == space_cluster (due to parameter type constraint) - // __space == space_shared (due to parameter type constraint) - // __type == type_u64 (due to parameter type constraint) - // __op == op_add (due to parameter type constraint) - NV_IF_ELSE_TARGET( - NV_PROVIDES_SM_90, - (asm("cp.reduce.async.bulk.shared::cluster.shared::cta.mbarrier::complete_tx::bytes.add.u64 [%0], [%1], %2, [%3]; " - "// 1." 
- : - : "r"(__as_ptr_remote_dsmem(__dstMem)), - "r"(__as_ptr_smem(__srcMem)), - "r"(__size), - "r"(__as_ptr_remote_dsmem(__rdsmem_bar)) - : "memory");), - ( - // Unsupported architectures will have a linker error with a semi-decent error message - __cuda_ptx_cp_reduce_async_bulk_is_not_supported_before_SM_90__();)); +// __space == space_cluster (due to parameter type constraint) +// __space == space_shared (due to parameter type constraint) +// __type == type_u64 (due to parameter type constraint) +// __op == op_add (due to parameter type constraint) +# if _CCCL_CUDA_COMPILER(NVHPC) || __CUDA_ARCH__ >= 900 + asm("cp.reduce.async.bulk.shared::cluster.shared::cta.mbarrier::complete_tx::bytes.add.u64 [%0], [%1], %2, [%3]; // " + "1." + : + : "r"(__as_ptr_remote_dsmem(__dstMem)), + "r"(__as_ptr_smem(__srcMem)), + "r"(__size), + "r"(__as_ptr_remote_dsmem(__rdsmem_bar)) + : "memory"); +# else + // Unsupported architectures will have a linker error with a semi-decent error message + __cuda_ptx_cp_reduce_async_bulk_is_not_supported_before_SM_90__(); +# endif } #endif // __cccl_ptx_isa >= 800 @@ -620,23 +619,23 @@ _CCCL_DEVICE static inline void cp_reduce_async_bulk( _CUDA_VSTD::uint32_t __size, _CUDA_VSTD::uint64_t* __rdsmem_bar) { - // __space == space_cluster (due to parameter type constraint) - // __space == space_shared (due to parameter type constraint) - // __type == type_s64 (due to parameter type constraint) - // __op == op_add (due to parameter type constraint) - NV_IF_ELSE_TARGET( - NV_PROVIDES_SM_90, - (asm("cp.reduce.async.bulk.shared::cluster.shared::cta.mbarrier::complete_tx::bytes.add.u64 [%0], [%1], %2, [%3]; " - "// 2." - : - : "r"(__as_ptr_remote_dsmem(__dstMem)), - "r"(__as_ptr_smem(__srcMem)), - "r"(__size), - "r"(__as_ptr_remote_dsmem(__rdsmem_bar)) - : "memory");), - ( - // Unsupported architectures will have a linker error with a semi-decent error message - __cuda_ptx_cp_reduce_async_bulk_is_not_supported_before_SM_90__();)); +// __space == space_cluster (due to parameter type constraint) +// __space == space_shared (due to parameter type constraint) +// __type == type_s64 (due to parameter type constraint) +// __op == op_add (due to parameter type constraint) +# if _CCCL_CUDA_COMPILER(NVHPC) || __CUDA_ARCH__ >= 900 + asm("cp.reduce.async.bulk.shared::cluster.shared::cta.mbarrier::complete_tx::bytes.add.u64 [%0], [%1], %2, [%3]; // " + "2." + : + : "r"(__as_ptr_remote_dsmem(__dstMem)), + "r"(__as_ptr_smem(__srcMem)), + "r"(__size), + "r"(__as_ptr_remote_dsmem(__rdsmem_bar)) + : "memory"); +# else + // Unsupported architectures will have a linker error with a semi-decent error message + __cuda_ptx_cp_reduce_async_bulk_is_not_supported_before_SM_90__(); +# endif } #endif // __cccl_ptx_isa >= 800 @@ -664,24 +663,26 @@ _CCCL_DEVICE static inline void cp_reduce_async_bulk( // __space == space_global (due to parameter type constraint) // __space == space_shared (due to parameter type constraint) static_assert(sizeof(_Type) == 4 || sizeof(_Type) == 8, ""); - // __op == op_and_op (due to parameter type constraint) - NV_IF_ELSE_TARGET( - NV_PROVIDES_SM_90, - ( - _CCCL_IF_CONSTEXPR (sizeof(_Type) == 4) { - asm("cp.reduce.async.bulk.global.shared::cta.bulk_group.and.b32 [%0], [%1], %2; // 3." - : - : "l"(__as_ptr_gmem(__dstMem)), "r"(__as_ptr_smem(__srcMem)), "r"(__size) - : "memory"); - } else _CCCL_IF_CONSTEXPR (sizeof(_Type) == 8) { - asm("cp.reduce.async.bulk.global.shared::cta.bulk_group.and.b64 [%0], [%1], %2; // 3." 
- : - : "l"(__as_ptr_gmem(__dstMem)), "r"(__as_ptr_smem(__srcMem)), "r"(__size) - : "memory"); - }), - ( - // Unsupported architectures will have a linker error with a semi-decent error message - __cuda_ptx_cp_reduce_async_bulk_is_not_supported_before_SM_90__();)); +// __op == op_and_op (due to parameter type constraint) +# if _CCCL_CUDA_COMPILER(NVHPC) || __CUDA_ARCH__ >= 900 + _CCCL_IF_CONSTEXPR (sizeof(_Type) == 4) + { + asm("cp.reduce.async.bulk.global.shared::cta.bulk_group.and.b32 [%0], [%1], %2; // 3." + : + : "l"(__as_ptr_gmem(__dstMem)), "r"(__as_ptr_smem(__srcMem)), "r"(__size) + : "memory"); + } + else _CCCL_IF_CONSTEXPR (sizeof(_Type) == 8) + { + asm("cp.reduce.async.bulk.global.shared::cta.bulk_group.and.b64 [%0], [%1], %2; // 3." + : + : "l"(__as_ptr_gmem(__dstMem)), "r"(__as_ptr_smem(__srcMem)), "r"(__size) + : "memory"); + } +# else + // Unsupported architectures will have a linker error with a semi-decent error message + __cuda_ptx_cp_reduce_async_bulk_is_not_supported_before_SM_90__(); +# endif } #endif // __cccl_ptx_isa >= 800 @@ -709,24 +710,26 @@ _CCCL_DEVICE static inline void cp_reduce_async_bulk( // __space == space_global (due to parameter type constraint) // __space == space_shared (due to parameter type constraint) static_assert(sizeof(_Type) == 4 || sizeof(_Type) == 8, ""); - // __op == op_or_op (due to parameter type constraint) - NV_IF_ELSE_TARGET( - NV_PROVIDES_SM_90, - ( - _CCCL_IF_CONSTEXPR (sizeof(_Type) == 4) { - asm("cp.reduce.async.bulk.global.shared::cta.bulk_group.or.b32 [%0], [%1], %2; // 3." - : - : "l"(__as_ptr_gmem(__dstMem)), "r"(__as_ptr_smem(__srcMem)), "r"(__size) - : "memory"); - } else _CCCL_IF_CONSTEXPR (sizeof(_Type) == 8) { - asm("cp.reduce.async.bulk.global.shared::cta.bulk_group.or.b64 [%0], [%1], %2; // 3." - : - : "l"(__as_ptr_gmem(__dstMem)), "r"(__as_ptr_smem(__srcMem)), "r"(__size) - : "memory"); - }), - ( - // Unsupported architectures will have a linker error with a semi-decent error message - __cuda_ptx_cp_reduce_async_bulk_is_not_supported_before_SM_90__();)); +// __op == op_or_op (due to parameter type constraint) +# if _CCCL_CUDA_COMPILER(NVHPC) || __CUDA_ARCH__ >= 900 + _CCCL_IF_CONSTEXPR (sizeof(_Type) == 4) + { + asm("cp.reduce.async.bulk.global.shared::cta.bulk_group.or.b32 [%0], [%1], %2; // 3." + : + : "l"(__as_ptr_gmem(__dstMem)), "r"(__as_ptr_smem(__srcMem)), "r"(__size) + : "memory"); + } + else _CCCL_IF_CONSTEXPR (sizeof(_Type) == 8) + { + asm("cp.reduce.async.bulk.global.shared::cta.bulk_group.or.b64 [%0], [%1], %2; // 3." + : + : "l"(__as_ptr_gmem(__dstMem)), "r"(__as_ptr_smem(__srcMem)), "r"(__size) + : "memory"); + } +# else + // Unsupported architectures will have a linker error with a semi-decent error message + __cuda_ptx_cp_reduce_async_bulk_is_not_supported_before_SM_90__(); +# endif } #endif // __cccl_ptx_isa >= 800 @@ -754,24 +757,26 @@ _CCCL_DEVICE static inline void cp_reduce_async_bulk( // __space == space_global (due to parameter type constraint) // __space == space_shared (due to parameter type constraint) static_assert(sizeof(_Type) == 4 || sizeof(_Type) == 8, ""); - // __op == op_xor_op (due to parameter type constraint) - NV_IF_ELSE_TARGET( - NV_PROVIDES_SM_90, - ( - _CCCL_IF_CONSTEXPR (sizeof(_Type) == 4) { - asm("cp.reduce.async.bulk.global.shared::cta.bulk_group.xor.b32 [%0], [%1], %2; // 3." 
- : - : "l"(__as_ptr_gmem(__dstMem)), "r"(__as_ptr_smem(__srcMem)), "r"(__size) - : "memory"); - } else _CCCL_IF_CONSTEXPR (sizeof(_Type) == 8) { - asm("cp.reduce.async.bulk.global.shared::cta.bulk_group.xor.b64 [%0], [%1], %2; // 3." - : - : "l"(__as_ptr_gmem(__dstMem)), "r"(__as_ptr_smem(__srcMem)), "r"(__size) - : "memory"); - }), - ( - // Unsupported architectures will have a linker error with a semi-decent error message - __cuda_ptx_cp_reduce_async_bulk_is_not_supported_before_SM_90__();)); +// __op == op_xor_op (due to parameter type constraint) +# if _CCCL_CUDA_COMPILER(NVHPC) || __CUDA_ARCH__ >= 900 + _CCCL_IF_CONSTEXPR (sizeof(_Type) == 4) + { + asm("cp.reduce.async.bulk.global.shared::cta.bulk_group.xor.b32 [%0], [%1], %2; // 3." + : + : "l"(__as_ptr_gmem(__dstMem)), "r"(__as_ptr_smem(__srcMem)), "r"(__size) + : "memory"); + } + else _CCCL_IF_CONSTEXPR (sizeof(_Type) == 8) + { + asm("cp.reduce.async.bulk.global.shared::cta.bulk_group.xor.b64 [%0], [%1], %2; // 3." + : + : "l"(__as_ptr_gmem(__dstMem)), "r"(__as_ptr_smem(__srcMem)), "r"(__size) + : "memory"); + } +# else + // Unsupported architectures will have a linker error with a semi-decent error message + __cuda_ptx_cp_reduce_async_bulk_is_not_supported_before_SM_90__(); +# endif } #endif // __cccl_ptx_isa >= 800 @@ -801,19 +806,19 @@ _CCCL_DEVICE static inline void cp_reduce_async_bulk( const _CUDA_VSTD::uint32_t* __srcMem, _CUDA_VSTD::uint32_t __size) { - // __space == space_global (due to parameter type constraint) - // __space == space_shared (due to parameter type constraint) - // __type == type_u32 (due to parameter type constraint) - // __op == op_min (due to parameter type constraint) - NV_IF_ELSE_TARGET( - NV_PROVIDES_SM_90, - (asm("cp.reduce.async.bulk.global.shared::cta.bulk_group.min.u32 [%0], [%1], %2; // 4." - : - : "l"(__as_ptr_gmem(__dstMem)), "r"(__as_ptr_smem(__srcMem)), "r"(__size) - : "memory");), - ( - // Unsupported architectures will have a linker error with a semi-decent error message - __cuda_ptx_cp_reduce_async_bulk_is_not_supported_before_SM_90__();)); +// __space == space_global (due to parameter type constraint) +// __space == space_shared (due to parameter type constraint) +// __type == type_u32 (due to parameter type constraint) +// __op == op_min (due to parameter type constraint) +# if _CCCL_CUDA_COMPILER(NVHPC) || __CUDA_ARCH__ >= 900 + asm("cp.reduce.async.bulk.global.shared::cta.bulk_group.min.u32 [%0], [%1], %2; // 4." + : + : "l"(__as_ptr_gmem(__dstMem)), "r"(__as_ptr_smem(__srcMem)), "r"(__size) + : "memory"); +# else + // Unsupported architectures will have a linker error with a semi-decent error message + __cuda_ptx_cp_reduce_async_bulk_is_not_supported_before_SM_90__(); +# endif } #endif // __cccl_ptx_isa >= 800 @@ -843,19 +848,19 @@ _CCCL_DEVICE static inline void cp_reduce_async_bulk( const _CUDA_VSTD::uint32_t* __srcMem, _CUDA_VSTD::uint32_t __size) { - // __space == space_global (due to parameter type constraint) - // __space == space_shared (due to parameter type constraint) - // __type == type_u32 (due to parameter type constraint) - // __op == op_max (due to parameter type constraint) - NV_IF_ELSE_TARGET( - NV_PROVIDES_SM_90, - (asm("cp.reduce.async.bulk.global.shared::cta.bulk_group.max.u32 [%0], [%1], %2; // 4." 
- : - : "l"(__as_ptr_gmem(__dstMem)), "r"(__as_ptr_smem(__srcMem)), "r"(__size) - : "memory");), - ( - // Unsupported architectures will have a linker error with a semi-decent error message - __cuda_ptx_cp_reduce_async_bulk_is_not_supported_before_SM_90__();)); +// __space == space_global (due to parameter type constraint) +// __space == space_shared (due to parameter type constraint) +// __type == type_u32 (due to parameter type constraint) +// __op == op_max (due to parameter type constraint) +# if _CCCL_CUDA_COMPILER(NVHPC) || __CUDA_ARCH__ >= 900 + asm("cp.reduce.async.bulk.global.shared::cta.bulk_group.max.u32 [%0], [%1], %2; // 4." + : + : "l"(__as_ptr_gmem(__dstMem)), "r"(__as_ptr_smem(__srcMem)), "r"(__size) + : "memory"); +# else + // Unsupported architectures will have a linker error with a semi-decent error message + __cuda_ptx_cp_reduce_async_bulk_is_not_supported_before_SM_90__(); +# endif } #endif // __cccl_ptx_isa >= 800 @@ -885,19 +890,19 @@ _CCCL_DEVICE static inline void cp_reduce_async_bulk( const _CUDA_VSTD::uint32_t* __srcMem, _CUDA_VSTD::uint32_t __size) { - // __space == space_global (due to parameter type constraint) - // __space == space_shared (due to parameter type constraint) - // __type == type_u32 (due to parameter type constraint) - // __op == op_add (due to parameter type constraint) - NV_IF_ELSE_TARGET( - NV_PROVIDES_SM_90, - (asm("cp.reduce.async.bulk.global.shared::cta.bulk_group.add.u32 [%0], [%1], %2; // 4." - : - : "l"(__as_ptr_gmem(__dstMem)), "r"(__as_ptr_smem(__srcMem)), "r"(__size) - : "memory");), - ( - // Unsupported architectures will have a linker error with a semi-decent error message - __cuda_ptx_cp_reduce_async_bulk_is_not_supported_before_SM_90__();)); +// __space == space_global (due to parameter type constraint) +// __space == space_shared (due to parameter type constraint) +// __type == type_u32 (due to parameter type constraint) +// __op == op_add (due to parameter type constraint) +# if _CCCL_CUDA_COMPILER(NVHPC) || __CUDA_ARCH__ >= 900 + asm("cp.reduce.async.bulk.global.shared::cta.bulk_group.add.u32 [%0], [%1], %2; // 4." + : + : "l"(__as_ptr_gmem(__dstMem)), "r"(__as_ptr_smem(__srcMem)), "r"(__size) + : "memory"); +# else + // Unsupported architectures will have a linker error with a semi-decent error message + __cuda_ptx_cp_reduce_async_bulk_is_not_supported_before_SM_90__(); +# endif } #endif // __cccl_ptx_isa >= 800 @@ -927,19 +932,19 @@ _CCCL_DEVICE static inline void cp_reduce_async_bulk( const _CUDA_VSTD::uint32_t* __srcMem, _CUDA_VSTD::uint32_t __size) { - // __space == space_global (due to parameter type constraint) - // __space == space_shared (due to parameter type constraint) - // __type == type_u32 (due to parameter type constraint) - // __op == op_inc (due to parameter type constraint) - NV_IF_ELSE_TARGET( - NV_PROVIDES_SM_90, - (asm("cp.reduce.async.bulk.global.shared::cta.bulk_group.inc.u32 [%0], [%1], %2; // 4." 
- : - : "l"(__as_ptr_gmem(__dstMem)), "r"(__as_ptr_smem(__srcMem)), "r"(__size) - : "memory");), - ( - // Unsupported architectures will have a linker error with a semi-decent error message - __cuda_ptx_cp_reduce_async_bulk_is_not_supported_before_SM_90__();)); +// __space == space_global (due to parameter type constraint) +// __space == space_shared (due to parameter type constraint) +// __type == type_u32 (due to parameter type constraint) +// __op == op_inc (due to parameter type constraint) +# if _CCCL_CUDA_COMPILER(NVHPC) || __CUDA_ARCH__ >= 900 + asm("cp.reduce.async.bulk.global.shared::cta.bulk_group.inc.u32 [%0], [%1], %2; // 4." + : + : "l"(__as_ptr_gmem(__dstMem)), "r"(__as_ptr_smem(__srcMem)), "r"(__size) + : "memory"); +# else + // Unsupported architectures will have a linker error with a semi-decent error message + __cuda_ptx_cp_reduce_async_bulk_is_not_supported_before_SM_90__(); +# endif } #endif // __cccl_ptx_isa >= 800 @@ -969,19 +974,19 @@ _CCCL_DEVICE static inline void cp_reduce_async_bulk( const _CUDA_VSTD::uint32_t* __srcMem, _CUDA_VSTD::uint32_t __size) { - // __space == space_global (due to parameter type constraint) - // __space == space_shared (due to parameter type constraint) - // __type == type_u32 (due to parameter type constraint) - // __op == op_dec (due to parameter type constraint) - NV_IF_ELSE_TARGET( - NV_PROVIDES_SM_90, - (asm("cp.reduce.async.bulk.global.shared::cta.bulk_group.dec.u32 [%0], [%1], %2; // 4." - : - : "l"(__as_ptr_gmem(__dstMem)), "r"(__as_ptr_smem(__srcMem)), "r"(__size) - : "memory");), - ( - // Unsupported architectures will have a linker error with a semi-decent error message - __cuda_ptx_cp_reduce_async_bulk_is_not_supported_before_SM_90__();)); +// __space == space_global (due to parameter type constraint) +// __space == space_shared (due to parameter type constraint) +// __type == type_u32 (due to parameter type constraint) +// __op == op_dec (due to parameter type constraint) +# if _CCCL_CUDA_COMPILER(NVHPC) || __CUDA_ARCH__ >= 900 + asm("cp.reduce.async.bulk.global.shared::cta.bulk_group.dec.u32 [%0], [%1], %2; // 4." + : + : "l"(__as_ptr_gmem(__dstMem)), "r"(__as_ptr_smem(__srcMem)), "r"(__size) + : "memory"); +# else + // Unsupported architectures will have a linker error with a semi-decent error message + __cuda_ptx_cp_reduce_async_bulk_is_not_supported_before_SM_90__(); +# endif } #endif // __cccl_ptx_isa >= 800 @@ -1011,19 +1016,19 @@ _CCCL_DEVICE static inline void cp_reduce_async_bulk( const _CUDA_VSTD::int32_t* __srcMem, _CUDA_VSTD::uint32_t __size) { - // __space == space_global (due to parameter type constraint) - // __space == space_shared (due to parameter type constraint) - // __type == type_s32 (due to parameter type constraint) - // __op == op_min (due to parameter type constraint) - NV_IF_ELSE_TARGET( - NV_PROVIDES_SM_90, - (asm("cp.reduce.async.bulk.global.shared::cta.bulk_group.min.s32 [%0], [%1], %2; // 4." 
- : - : "l"(__as_ptr_gmem(__dstMem)), "r"(__as_ptr_smem(__srcMem)), "r"(__size) - : "memory");), - ( - // Unsupported architectures will have a linker error with a semi-decent error message - __cuda_ptx_cp_reduce_async_bulk_is_not_supported_before_SM_90__();)); +// __space == space_global (due to parameter type constraint) +// __space == space_shared (due to parameter type constraint) +// __type == type_s32 (due to parameter type constraint) +// __op == op_min (due to parameter type constraint) +# if _CCCL_CUDA_COMPILER(NVHPC) || __CUDA_ARCH__ >= 900 + asm("cp.reduce.async.bulk.global.shared::cta.bulk_group.min.s32 [%0], [%1], %2; // 4." + : + : "l"(__as_ptr_gmem(__dstMem)), "r"(__as_ptr_smem(__srcMem)), "r"(__size) + : "memory"); +# else + // Unsupported architectures will have a linker error with a semi-decent error message + __cuda_ptx_cp_reduce_async_bulk_is_not_supported_before_SM_90__(); +# endif } #endif // __cccl_ptx_isa >= 800 @@ -1053,19 +1058,19 @@ _CCCL_DEVICE static inline void cp_reduce_async_bulk( const _CUDA_VSTD::int32_t* __srcMem, _CUDA_VSTD::uint32_t __size) { - // __space == space_global (due to parameter type constraint) - // __space == space_shared (due to parameter type constraint) - // __type == type_s32 (due to parameter type constraint) - // __op == op_max (due to parameter type constraint) - NV_IF_ELSE_TARGET( - NV_PROVIDES_SM_90, - (asm("cp.reduce.async.bulk.global.shared::cta.bulk_group.max.s32 [%0], [%1], %2; // 4." - : - : "l"(__as_ptr_gmem(__dstMem)), "r"(__as_ptr_smem(__srcMem)), "r"(__size) - : "memory");), - ( - // Unsupported architectures will have a linker error with a semi-decent error message - __cuda_ptx_cp_reduce_async_bulk_is_not_supported_before_SM_90__();)); +// __space == space_global (due to parameter type constraint) +// __space == space_shared (due to parameter type constraint) +// __type == type_s32 (due to parameter type constraint) +// __op == op_max (due to parameter type constraint) +# if _CCCL_CUDA_COMPILER(NVHPC) || __CUDA_ARCH__ >= 900 + asm("cp.reduce.async.bulk.global.shared::cta.bulk_group.max.s32 [%0], [%1], %2; // 4." + : + : "l"(__as_ptr_gmem(__dstMem)), "r"(__as_ptr_smem(__srcMem)), "r"(__size) + : "memory"); +# else + // Unsupported architectures will have a linker error with a semi-decent error message + __cuda_ptx_cp_reduce_async_bulk_is_not_supported_before_SM_90__(); +# endif } #endif // __cccl_ptx_isa >= 800 @@ -1095,19 +1100,19 @@ _CCCL_DEVICE static inline void cp_reduce_async_bulk( const _CUDA_VSTD::int32_t* __srcMem, _CUDA_VSTD::uint32_t __size) { - // __space == space_global (due to parameter type constraint) - // __space == space_shared (due to parameter type constraint) - // __type == type_s32 (due to parameter type constraint) - // __op == op_add (due to parameter type constraint) - NV_IF_ELSE_TARGET( - NV_PROVIDES_SM_90, - (asm("cp.reduce.async.bulk.global.shared::cta.bulk_group.add.s32 [%0], [%1], %2; // 4." 
- : - : "l"(__as_ptr_gmem(__dstMem)), "r"(__as_ptr_smem(__srcMem)), "r"(__size) - : "memory");), - ( - // Unsupported architectures will have a linker error with a semi-decent error message - __cuda_ptx_cp_reduce_async_bulk_is_not_supported_before_SM_90__();)); +// __space == space_global (due to parameter type constraint) +// __space == space_shared (due to parameter type constraint) +// __type == type_s32 (due to parameter type constraint) +// __op == op_add (due to parameter type constraint) +# if _CCCL_CUDA_COMPILER(NVHPC) || __CUDA_ARCH__ >= 900 + asm("cp.reduce.async.bulk.global.shared::cta.bulk_group.add.s32 [%0], [%1], %2; // 4." + : + : "l"(__as_ptr_gmem(__dstMem)), "r"(__as_ptr_smem(__srcMem)), "r"(__size) + : "memory"); +# else + // Unsupported architectures will have a linker error with a semi-decent error message + __cuda_ptx_cp_reduce_async_bulk_is_not_supported_before_SM_90__(); +# endif } #endif // __cccl_ptx_isa >= 800 @@ -1137,19 +1142,19 @@ _CCCL_DEVICE static inline void cp_reduce_async_bulk( const _CUDA_VSTD::uint64_t* __srcMem, _CUDA_VSTD::uint32_t __size) { - // __space == space_global (due to parameter type constraint) - // __space == space_shared (due to parameter type constraint) - // __type == type_u64 (due to parameter type constraint) - // __op == op_min (due to parameter type constraint) - NV_IF_ELSE_TARGET( - NV_PROVIDES_SM_90, - (asm("cp.reduce.async.bulk.global.shared::cta.bulk_group.min.u64 [%0], [%1], %2; // 4." - : - : "l"(__as_ptr_gmem(__dstMem)), "r"(__as_ptr_smem(__srcMem)), "r"(__size) - : "memory");), - ( - // Unsupported architectures will have a linker error with a semi-decent error message - __cuda_ptx_cp_reduce_async_bulk_is_not_supported_before_SM_90__();)); +// __space == space_global (due to parameter type constraint) +// __space == space_shared (due to parameter type constraint) +// __type == type_u64 (due to parameter type constraint) +// __op == op_min (due to parameter type constraint) +# if _CCCL_CUDA_COMPILER(NVHPC) || __CUDA_ARCH__ >= 900 + asm("cp.reduce.async.bulk.global.shared::cta.bulk_group.min.u64 [%0], [%1], %2; // 4." + : + : "l"(__as_ptr_gmem(__dstMem)), "r"(__as_ptr_smem(__srcMem)), "r"(__size) + : "memory"); +# else + // Unsupported architectures will have a linker error with a semi-decent error message + __cuda_ptx_cp_reduce_async_bulk_is_not_supported_before_SM_90__(); +# endif } #endif // __cccl_ptx_isa >= 800 @@ -1179,19 +1184,19 @@ _CCCL_DEVICE static inline void cp_reduce_async_bulk( const _CUDA_VSTD::uint64_t* __srcMem, _CUDA_VSTD::uint32_t __size) { - // __space == space_global (due to parameter type constraint) - // __space == space_shared (due to parameter type constraint) - // __type == type_u64 (due to parameter type constraint) - // __op == op_max (due to parameter type constraint) - NV_IF_ELSE_TARGET( - NV_PROVIDES_SM_90, - (asm("cp.reduce.async.bulk.global.shared::cta.bulk_group.max.u64 [%0], [%1], %2; // 4." 
- : - : "l"(__as_ptr_gmem(__dstMem)), "r"(__as_ptr_smem(__srcMem)), "r"(__size) - : "memory");), - ( - // Unsupported architectures will have a linker error with a semi-decent error message - __cuda_ptx_cp_reduce_async_bulk_is_not_supported_before_SM_90__();)); +// __space == space_global (due to parameter type constraint) +// __space == space_shared (due to parameter type constraint) +// __type == type_u64 (due to parameter type constraint) +// __op == op_max (due to parameter type constraint) +# if _CCCL_CUDA_COMPILER(NVHPC) || __CUDA_ARCH__ >= 900 + asm("cp.reduce.async.bulk.global.shared::cta.bulk_group.max.u64 [%0], [%1], %2; // 4." + : + : "l"(__as_ptr_gmem(__dstMem)), "r"(__as_ptr_smem(__srcMem)), "r"(__size) + : "memory"); +# else + // Unsupported architectures will have a linker error with a semi-decent error message + __cuda_ptx_cp_reduce_async_bulk_is_not_supported_before_SM_90__(); +# endif } #endif // __cccl_ptx_isa >= 800 @@ -1221,19 +1226,19 @@ _CCCL_DEVICE static inline void cp_reduce_async_bulk( const _CUDA_VSTD::uint64_t* __srcMem, _CUDA_VSTD::uint32_t __size) { - // __space == space_global (due to parameter type constraint) - // __space == space_shared (due to parameter type constraint) - // __type == type_u64 (due to parameter type constraint) - // __op == op_add (due to parameter type constraint) - NV_IF_ELSE_TARGET( - NV_PROVIDES_SM_90, - (asm("cp.reduce.async.bulk.global.shared::cta.bulk_group.add.u64 [%0], [%1], %2; // 4." - : - : "l"(__as_ptr_gmem(__dstMem)), "r"(__as_ptr_smem(__srcMem)), "r"(__size) - : "memory");), - ( - // Unsupported architectures will have a linker error with a semi-decent error message - __cuda_ptx_cp_reduce_async_bulk_is_not_supported_before_SM_90__();)); +// __space == space_global (due to parameter type constraint) +// __space == space_shared (due to parameter type constraint) +// __type == type_u64 (due to parameter type constraint) +// __op == op_add (due to parameter type constraint) +# if _CCCL_CUDA_COMPILER(NVHPC) || __CUDA_ARCH__ >= 900 + asm("cp.reduce.async.bulk.global.shared::cta.bulk_group.add.u64 [%0], [%1], %2; // 4." + : + : "l"(__as_ptr_gmem(__dstMem)), "r"(__as_ptr_smem(__srcMem)), "r"(__size) + : "memory"); +# else + // Unsupported architectures will have a linker error with a semi-decent error message + __cuda_ptx_cp_reduce_async_bulk_is_not_supported_before_SM_90__(); +# endif } #endif // __cccl_ptx_isa >= 800 @@ -1263,19 +1268,19 @@ _CCCL_DEVICE static inline void cp_reduce_async_bulk( const _CUDA_VSTD::int64_t* __srcMem, _CUDA_VSTD::uint32_t __size) { - // __space == space_global (due to parameter type constraint) - // __space == space_shared (due to parameter type constraint) - // __type == type_s64 (due to parameter type constraint) - // __op == op_min (due to parameter type constraint) - NV_IF_ELSE_TARGET( - NV_PROVIDES_SM_90, - (asm("cp.reduce.async.bulk.global.shared::cta.bulk_group.min.s64 [%0], [%1], %2; // 4." 
- : - : "l"(__as_ptr_gmem(__dstMem)), "r"(__as_ptr_smem(__srcMem)), "r"(__size) - : "memory");), - ( - // Unsupported architectures will have a linker error with a semi-decent error message - __cuda_ptx_cp_reduce_async_bulk_is_not_supported_before_SM_90__();)); +// __space == space_global (due to parameter type constraint) +// __space == space_shared (due to parameter type constraint) +// __type == type_s64 (due to parameter type constraint) +// __op == op_min (due to parameter type constraint) +# if _CCCL_CUDA_COMPILER(NVHPC) || __CUDA_ARCH__ >= 900 + asm("cp.reduce.async.bulk.global.shared::cta.bulk_group.min.s64 [%0], [%1], %2; // 4." + : + : "l"(__as_ptr_gmem(__dstMem)), "r"(__as_ptr_smem(__srcMem)), "r"(__size) + : "memory"); +# else + // Unsupported architectures will have a linker error with a semi-decent error message + __cuda_ptx_cp_reduce_async_bulk_is_not_supported_before_SM_90__(); +# endif } #endif // __cccl_ptx_isa >= 800 @@ -1305,19 +1310,19 @@ _CCCL_DEVICE static inline void cp_reduce_async_bulk( const _CUDA_VSTD::int64_t* __srcMem, _CUDA_VSTD::uint32_t __size) { - // __space == space_global (due to parameter type constraint) - // __space == space_shared (due to parameter type constraint) - // __type == type_s64 (due to parameter type constraint) - // __op == op_max (due to parameter type constraint) - NV_IF_ELSE_TARGET( - NV_PROVIDES_SM_90, - (asm("cp.reduce.async.bulk.global.shared::cta.bulk_group.max.s64 [%0], [%1], %2; // 4." - : - : "l"(__as_ptr_gmem(__dstMem)), "r"(__as_ptr_smem(__srcMem)), "r"(__size) - : "memory");), - ( - // Unsupported architectures will have a linker error with a semi-decent error message - __cuda_ptx_cp_reduce_async_bulk_is_not_supported_before_SM_90__();)); +// __space == space_global (due to parameter type constraint) +// __space == space_shared (due to parameter type constraint) +// __type == type_s64 (due to parameter type constraint) +// __op == op_max (due to parameter type constraint) +# if _CCCL_CUDA_COMPILER(NVHPC) || __CUDA_ARCH__ >= 900 + asm("cp.reduce.async.bulk.global.shared::cta.bulk_group.max.s64 [%0], [%1], %2; // 4." + : + : "l"(__as_ptr_gmem(__dstMem)), "r"(__as_ptr_smem(__srcMem)), "r"(__size) + : "memory"); +# else + // Unsupported architectures will have a linker error with a semi-decent error message + __cuda_ptx_cp_reduce_async_bulk_is_not_supported_before_SM_90__(); +# endif } #endif // __cccl_ptx_isa >= 800 @@ -1342,19 +1347,19 @@ template _CCCL_DEVICE static inline void cp_reduce_async_bulk( space_global_t, space_shared_t, op_add_t, float* __dstMem, const float* __srcMem, _CUDA_VSTD::uint32_t __size) { - // __space == space_global (due to parameter type constraint) - // __space == space_shared (due to parameter type constraint) - // __type == type_f32 (due to parameter type constraint) - // __op == op_add (due to parameter type constraint) - NV_IF_ELSE_TARGET( - NV_PROVIDES_SM_90, - (asm("cp.reduce.async.bulk.global.shared::cta.bulk_group.add.f32 [%0], [%1], %2; // 4." 
- : - : "l"(__as_ptr_gmem(__dstMem)), "r"(__as_ptr_smem(__srcMem)), "r"(__size) - : "memory");), - ( - // Unsupported architectures will have a linker error with a semi-decent error message - __cuda_ptx_cp_reduce_async_bulk_is_not_supported_before_SM_90__();)); +// __space == space_global (due to parameter type constraint) +// __space == space_shared (due to parameter type constraint) +// __type == type_f32 (due to parameter type constraint) +// __op == op_add (due to parameter type constraint) +# if _CCCL_CUDA_COMPILER(NVHPC) || __CUDA_ARCH__ >= 900 + asm("cp.reduce.async.bulk.global.shared::cta.bulk_group.add.f32 [%0], [%1], %2; // 4." + : + : "l"(__as_ptr_gmem(__dstMem)), "r"(__as_ptr_smem(__srcMem)), "r"(__size) + : "memory"); +# else + // Unsupported architectures will have a linker error with a semi-decent error message + __cuda_ptx_cp_reduce_async_bulk_is_not_supported_before_SM_90__(); +# endif } #endif // __cccl_ptx_isa >= 800 @@ -1379,19 +1384,19 @@ template _CCCL_DEVICE static inline void cp_reduce_async_bulk( space_global_t, space_shared_t, op_add_t, double* __dstMem, const double* __srcMem, _CUDA_VSTD::uint32_t __size) { - // __space == space_global (due to parameter type constraint) - // __space == space_shared (due to parameter type constraint) - // __type == type_f64 (due to parameter type constraint) - // __op == op_add (due to parameter type constraint) - NV_IF_ELSE_TARGET( - NV_PROVIDES_SM_90, - (asm("cp.reduce.async.bulk.global.shared::cta.bulk_group.add.f64 [%0], [%1], %2; // 4." - : - : "l"(__as_ptr_gmem(__dstMem)), "r"(__as_ptr_smem(__srcMem)), "r"(__size) - : "memory");), - ( - // Unsupported architectures will have a linker error with a semi-decent error message - __cuda_ptx_cp_reduce_async_bulk_is_not_supported_before_SM_90__();)); +// __space == space_global (due to parameter type constraint) +// __space == space_shared (due to parameter type constraint) +// __type == type_f64 (due to parameter type constraint) +// __op == op_add (due to parameter type constraint) +# if _CCCL_CUDA_COMPILER(NVHPC) || __CUDA_ARCH__ >= 900 + asm("cp.reduce.async.bulk.global.shared::cta.bulk_group.add.f64 [%0], [%1], %2; // 4." + : + : "l"(__as_ptr_gmem(__dstMem)), "r"(__as_ptr_smem(__srcMem)), "r"(__size) + : "memory"); +# else + // Unsupported architectures will have a linker error with a semi-decent error message + __cuda_ptx_cp_reduce_async_bulk_is_not_supported_before_SM_90__(); +# endif } #endif // __cccl_ptx_isa >= 800 @@ -1421,19 +1426,19 @@ _CCCL_DEVICE static inline void cp_reduce_async_bulk( const _CUDA_VSTD::int64_t* __srcMem, _CUDA_VSTD::uint32_t __size) { - // __space == space_global (due to parameter type constraint) - // __space == space_shared (due to parameter type constraint) - // __type == type_s64 (due to parameter type constraint) - // __op == op_add (due to parameter type constraint) - NV_IF_ELSE_TARGET( - NV_PROVIDES_SM_90, - (asm("cp.reduce.async.bulk.global.shared::cta.bulk_group.add.u64 [%0], [%1], %2; // 6." 
- : - : "l"(__as_ptr_gmem(__dstMem)), "r"(__as_ptr_smem(__srcMem)), "r"(__size) - : "memory");), - ( - // Unsupported architectures will have a linker error with a semi-decent error message - __cuda_ptx_cp_reduce_async_bulk_is_not_supported_before_SM_90__();)); +// __space == space_global (due to parameter type constraint) +// __space == space_shared (due to parameter type constraint) +// __type == type_s64 (due to parameter type constraint) +// __op == op_add (due to parameter type constraint) +# if _CCCL_CUDA_COMPILER(NVHPC) || __CUDA_ARCH__ >= 900 + asm("cp.reduce.async.bulk.global.shared::cta.bulk_group.add.u64 [%0], [%1], %2; // 6." + : + : "l"(__as_ptr_gmem(__dstMem)), "r"(__as_ptr_smem(__srcMem)), "r"(__size) + : "memory"); +# else + // Unsupported architectures will have a linker error with a semi-decent error message + __cuda_ptx_cp_reduce_async_bulk_is_not_supported_before_SM_90__(); +# endif } #endif // __cccl_ptx_isa >= 800 diff --git a/libcudacxx/include/cuda/__ptx/instructions/generated/cp_reduce_async_bulk_bf16.h b/libcudacxx/include/cuda/__ptx/instructions/generated/cp_reduce_async_bulk_bf16.h index 1e13bb5f4f2..5c177976468 100644 --- a/libcudacxx/include/cuda/__ptx/instructions/generated/cp_reduce_async_bulk_bf16.h +++ b/libcudacxx/include/cuda/__ptx/instructions/generated/cp_reduce_async_bulk_bf16.h @@ -29,19 +29,19 @@ _CCCL_DEVICE static inline void cp_reduce_async_bulk( const __nv_bfloat16* __srcMem, _CUDA_VSTD::uint32_t __size) { - // __space == space_global (due to parameter type constraint) - // __space == space_shared (due to parameter type constraint) - // __type == type_bf16 (due to parameter type constraint) - // __op == op_min (due to parameter type constraint) - NV_IF_ELSE_TARGET( - NV_PROVIDES_SM_90, - (asm("cp.reduce.async.bulk.global.shared::cta.bulk_group.min.bf16 [%0], [%1], %2; // 4." - : - : "l"(__as_ptr_gmem(__dstMem)), "r"(__as_ptr_smem(__srcMem)), "r"(__size) - : "memory");), - ( - // Unsupported architectures will have a linker error with a semi-decent error message - __cuda_ptx_cp_reduce_async_bulk_is_not_supported_before_SM_90__();)); +// __space == space_global (due to parameter type constraint) +// __space == space_shared (due to parameter type constraint) +// __type == type_bf16 (due to parameter type constraint) +// __op == op_min (due to parameter type constraint) +# if _CCCL_CUDA_COMPILER(NVHPC) || __CUDA_ARCH__ >= 900 + asm("cp.reduce.async.bulk.global.shared::cta.bulk_group.min.bf16 [%0], [%1], %2; // 4." + : + : "l"(__as_ptr_gmem(__dstMem)), "r"(__as_ptr_smem(__srcMem)), "r"(__size) + : "memory"); +# else + // Unsupported architectures will have a linker error with a semi-decent error message + __cuda_ptx_cp_reduce_async_bulk_is_not_supported_before_SM_90__(); +# endif } #endif // __cccl_ptx_isa >= 800 @@ -71,19 +71,19 @@ _CCCL_DEVICE static inline void cp_reduce_async_bulk( const __nv_bfloat16* __srcMem, _CUDA_VSTD::uint32_t __size) { - // __space == space_global (due to parameter type constraint) - // __space == space_shared (due to parameter type constraint) - // __type == type_bf16 (due to parameter type constraint) - // __op == op_max (due to parameter type constraint) - NV_IF_ELSE_TARGET( - NV_PROVIDES_SM_90, - (asm("cp.reduce.async.bulk.global.shared::cta.bulk_group.max.bf16 [%0], [%1], %2; // 4." 
- : - : "l"(__as_ptr_gmem(__dstMem)), "r"(__as_ptr_smem(__srcMem)), "r"(__size) - : "memory");), - ( - // Unsupported architectures will have a linker error with a semi-decent error message - __cuda_ptx_cp_reduce_async_bulk_is_not_supported_before_SM_90__();)); +// __space == space_global (due to parameter type constraint) +// __space == space_shared (due to parameter type constraint) +// __type == type_bf16 (due to parameter type constraint) +// __op == op_max (due to parameter type constraint) +# if _CCCL_CUDA_COMPILER(NVHPC) || __CUDA_ARCH__ >= 900 + asm("cp.reduce.async.bulk.global.shared::cta.bulk_group.max.bf16 [%0], [%1], %2; // 4." + : + : "l"(__as_ptr_gmem(__dstMem)), "r"(__as_ptr_smem(__srcMem)), "r"(__size) + : "memory"); +# else + // Unsupported architectures will have a linker error with a semi-decent error message + __cuda_ptx_cp_reduce_async_bulk_is_not_supported_before_SM_90__(); +# endif } #endif // __cccl_ptx_isa >= 800 @@ -113,19 +113,19 @@ _CCCL_DEVICE static inline void cp_reduce_async_bulk( const __nv_bfloat16* __srcMem, _CUDA_VSTD::uint32_t __size) { - // __space == space_global (due to parameter type constraint) - // __space == space_shared (due to parameter type constraint) - // __type == type_bf16 (due to parameter type constraint) - // __op == op_add (due to parameter type constraint) - NV_IF_ELSE_TARGET( - NV_PROVIDES_SM_90, - (asm("cp.reduce.async.bulk.global.shared::cta.bulk_group.add.noftz.bf16 [%0], [%1], %2; // 5." - : - : "l"(__as_ptr_gmem(__dstMem)), "r"(__as_ptr_smem(__srcMem)), "r"(__size) - : "memory");), - ( - // Unsupported architectures will have a linker error with a semi-decent error message - __cuda_ptx_cp_reduce_async_bulk_is_not_supported_before_SM_90__();)); +// __space == space_global (due to parameter type constraint) +// __space == space_shared (due to parameter type constraint) +// __type == type_bf16 (due to parameter type constraint) +// __op == op_add (due to parameter type constraint) +# if _CCCL_CUDA_COMPILER(NVHPC) || __CUDA_ARCH__ >= 900 + asm("cp.reduce.async.bulk.global.shared::cta.bulk_group.add.noftz.bf16 [%0], [%1], %2; // 5." + : + : "l"(__as_ptr_gmem(__dstMem)), "r"(__as_ptr_smem(__srcMem)), "r"(__size) + : "memory"); +# else + // Unsupported architectures will have a linker error with a semi-decent error message + __cuda_ptx_cp_reduce_async_bulk_is_not_supported_before_SM_90__(); +# endif } #endif // __cccl_ptx_isa >= 800 diff --git a/libcudacxx/include/cuda/__ptx/instructions/generated/cp_reduce_async_bulk_f16.h b/libcudacxx/include/cuda/__ptx/instructions/generated/cp_reduce_async_bulk_f16.h index 0c4678c95bb..95d775d09e2 100644 --- a/libcudacxx/include/cuda/__ptx/instructions/generated/cp_reduce_async_bulk_f16.h +++ b/libcudacxx/include/cuda/__ptx/instructions/generated/cp_reduce_async_bulk_f16.h @@ -24,19 +24,19 @@ template _CCCL_DEVICE static inline void cp_reduce_async_bulk( space_global_t, space_shared_t, op_min_t, __half* __dstMem, const __half* __srcMem, _CUDA_VSTD::uint32_t __size) { - // __space == space_global (due to parameter type constraint) - // __space == space_shared (due to parameter type constraint) - // __type == type_f16 (due to parameter type constraint) - // __op == op_min (due to parameter type constraint) - NV_IF_ELSE_TARGET( - NV_PROVIDES_SM_90, - (asm("cp.reduce.async.bulk.global.shared::cta.bulk_group.min.f16 [%0], [%1], %2; // 4." 
- : - : "l"(__as_ptr_gmem(__dstMem)), "r"(__as_ptr_smem(__srcMem)), "r"(__size) - : "memory");), - ( - // Unsupported architectures will have a linker error with a semi-decent error message - __cuda_ptx_cp_reduce_async_bulk_is_not_supported_before_SM_90__();)); +// __space == space_global (due to parameter type constraint) +// __space == space_shared (due to parameter type constraint) +// __type == type_f16 (due to parameter type constraint) +// __op == op_min (due to parameter type constraint) +# if _CCCL_CUDA_COMPILER(NVHPC) || __CUDA_ARCH__ >= 900 + asm("cp.reduce.async.bulk.global.shared::cta.bulk_group.min.f16 [%0], [%1], %2; // 4." + : + : "l"(__as_ptr_gmem(__dstMem)), "r"(__as_ptr_smem(__srcMem)), "r"(__size) + : "memory"); +# else + // Unsupported architectures will have a linker error with a semi-decent error message + __cuda_ptx_cp_reduce_async_bulk_is_not_supported_before_SM_90__(); +# endif } #endif // __cccl_ptx_isa >= 800 @@ -61,19 +61,19 @@ template _CCCL_DEVICE static inline void cp_reduce_async_bulk( space_global_t, space_shared_t, op_max_t, __half* __dstMem, const __half* __srcMem, _CUDA_VSTD::uint32_t __size) { - // __space == space_global (due to parameter type constraint) - // __space == space_shared (due to parameter type constraint) - // __type == type_f16 (due to parameter type constraint) - // __op == op_max (due to parameter type constraint) - NV_IF_ELSE_TARGET( - NV_PROVIDES_SM_90, - (asm("cp.reduce.async.bulk.global.shared::cta.bulk_group.max.f16 [%0], [%1], %2; // 4." - : - : "l"(__as_ptr_gmem(__dstMem)), "r"(__as_ptr_smem(__srcMem)), "r"(__size) - : "memory");), - ( - // Unsupported architectures will have a linker error with a semi-decent error message - __cuda_ptx_cp_reduce_async_bulk_is_not_supported_before_SM_90__();)); +// __space == space_global (due to parameter type constraint) +// __space == space_shared (due to parameter type constraint) +// __type == type_f16 (due to parameter type constraint) +// __op == op_max (due to parameter type constraint) +# if _CCCL_CUDA_COMPILER(NVHPC) || __CUDA_ARCH__ >= 900 + asm("cp.reduce.async.bulk.global.shared::cta.bulk_group.max.f16 [%0], [%1], %2; // 4." + : + : "l"(__as_ptr_gmem(__dstMem)), "r"(__as_ptr_smem(__srcMem)), "r"(__size) + : "memory"); +# else + // Unsupported architectures will have a linker error with a semi-decent error message + __cuda_ptx_cp_reduce_async_bulk_is_not_supported_before_SM_90__(); +# endif } #endif // __cccl_ptx_isa >= 800 @@ -98,19 +98,19 @@ template _CCCL_DEVICE static inline void cp_reduce_async_bulk( space_global_t, space_shared_t, op_add_t, __half* __dstMem, const __half* __srcMem, _CUDA_VSTD::uint32_t __size) { - // __space == space_global (due to parameter type constraint) - // __space == space_shared (due to parameter type constraint) - // __type == type_f16 (due to parameter type constraint) - // __op == op_add (due to parameter type constraint) - NV_IF_ELSE_TARGET( - NV_PROVIDES_SM_90, - (asm("cp.reduce.async.bulk.global.shared::cta.bulk_group.add.noftz.f16 [%0], [%1], %2; // 5." 
- : - : "l"(__as_ptr_gmem(__dstMem)), "r"(__as_ptr_smem(__srcMem)), "r"(__size) - : "memory");), - ( - // Unsupported architectures will have a linker error with a semi-decent error message - __cuda_ptx_cp_reduce_async_bulk_is_not_supported_before_SM_90__();)); +// __space == space_global (due to parameter type constraint) +// __space == space_shared (due to parameter type constraint) +// __type == type_f16 (due to parameter type constraint) +// __op == op_add (due to parameter type constraint) +# if _CCCL_CUDA_COMPILER(NVHPC) || __CUDA_ARCH__ >= 900 + asm("cp.reduce.async.bulk.global.shared::cta.bulk_group.add.noftz.f16 [%0], [%1], %2; // 5." + : + : "l"(__as_ptr_gmem(__dstMem)), "r"(__as_ptr_smem(__srcMem)), "r"(__size) + : "memory"); +# else + // Unsupported architectures will have a linker error with a semi-decent error message + __cuda_ptx_cp_reduce_async_bulk_is_not_supported_before_SM_90__(); +# endif } #endif // __cccl_ptx_isa >= 800 diff --git a/libcudacxx/include/cuda/__ptx/instructions/generated/cp_reduce_async_bulk_tensor.h b/libcudacxx/include/cuda/__ptx/instructions/generated/cp_reduce_async_bulk_tensor.h index 9ec5b2443d8..540b0e95ed5 100644 --- a/libcudacxx/include/cuda/__ptx/instructions/generated/cp_reduce_async_bulk_tensor.h +++ b/libcudacxx/include/cuda/__ptx/instructions/generated/cp_reduce_async_bulk_tensor.h @@ -34,53 +34,67 @@ _CCCL_DEVICE static inline void cp_reduce_async_bulk_tensor( static_assert(__op == op_add || __op == op_min || __op == op_max || __op == op_inc || __op == op_dec || __op == op_and_op || __op == op_or_op || __op == op_xor_op, ""); - NV_IF_ELSE_TARGET( - NV_PROVIDES_SM_90, - ( - _CCCL_IF_CONSTEXPR (__op == op_add) { - asm("cp.reduce.async.bulk.tensor.1d.global.shared::cta.add.tile.bulk_group [%0, {%1}], [%2]; // 1a." - : - : "l"(__tensorMap), "r"(__tensorCoords[0]), "r"(__as_ptr_smem(__srcMem)) - : "memory"); - } else _CCCL_IF_CONSTEXPR (__op == op_min) { - asm("cp.reduce.async.bulk.tensor.1d.global.shared::cta.min.tile.bulk_group [%0, {%1}], [%2]; // 1a." - : - : "l"(__tensorMap), "r"(__tensorCoords[0]), "r"(__as_ptr_smem(__srcMem)) - : "memory"); - } else _CCCL_IF_CONSTEXPR (__op == op_max) { - asm("cp.reduce.async.bulk.tensor.1d.global.shared::cta.max.tile.bulk_group [%0, {%1}], [%2]; // 1a." - : - : "l"(__tensorMap), "r"(__tensorCoords[0]), "r"(__as_ptr_smem(__srcMem)) - : "memory"); - } else _CCCL_IF_CONSTEXPR (__op == op_inc) { - asm("cp.reduce.async.bulk.tensor.1d.global.shared::cta.inc.tile.bulk_group [%0, {%1}], [%2]; // 1a." - : - : "l"(__tensorMap), "r"(__tensorCoords[0]), "r"(__as_ptr_smem(__srcMem)) - : "memory"); - } else _CCCL_IF_CONSTEXPR (__op == op_dec) { - asm("cp.reduce.async.bulk.tensor.1d.global.shared::cta.dec.tile.bulk_group [%0, {%1}], [%2]; // 1a." - : - : "l"(__tensorMap), "r"(__tensorCoords[0]), "r"(__as_ptr_smem(__srcMem)) - : "memory"); - } else _CCCL_IF_CONSTEXPR (__op == op_and_op) { - asm("cp.reduce.async.bulk.tensor.1d.global.shared::cta.and.tile.bulk_group [%0, {%1}], [%2]; // 1a." - : - : "l"(__tensorMap), "r"(__tensorCoords[0]), "r"(__as_ptr_smem(__srcMem)) - : "memory"); - } else _CCCL_IF_CONSTEXPR (__op == op_or_op) { - asm("cp.reduce.async.bulk.tensor.1d.global.shared::cta.or.tile.bulk_group [%0, {%1}], [%2]; // 1a." - : - : "l"(__tensorMap), "r"(__tensorCoords[0]), "r"(__as_ptr_smem(__srcMem)) - : "memory"); - } else _CCCL_IF_CONSTEXPR (__op == op_xor_op) { - asm("cp.reduce.async.bulk.tensor.1d.global.shared::cta.xor.tile.bulk_group [%0, {%1}], [%2]; // 1a." 
- : - : "l"(__tensorMap), "r"(__tensorCoords[0]), "r"(__as_ptr_smem(__srcMem)) - : "memory"); - }), - ( - // Unsupported architectures will have a linker error with a semi-decent error message - __cuda_ptx_cp_reduce_async_bulk_tensor_is_not_supported_before_SM_90__();)); +# if _CCCL_CUDA_COMPILER(NVHPC) || __CUDA_ARCH__ >= 900 + _CCCL_IF_CONSTEXPR (__op == op_add) + { + asm("cp.reduce.async.bulk.tensor.1d.global.shared::cta.add.tile.bulk_group [%0, {%1}], [%2]; // 1a." + : + : "l"(__tensorMap), "r"(__tensorCoords[0]), "r"(__as_ptr_smem(__srcMem)) + : "memory"); + } + else _CCCL_IF_CONSTEXPR (__op == op_min) + { + asm("cp.reduce.async.bulk.tensor.1d.global.shared::cta.min.tile.bulk_group [%0, {%1}], [%2]; // 1a." + : + : "l"(__tensorMap), "r"(__tensorCoords[0]), "r"(__as_ptr_smem(__srcMem)) + : "memory"); + } + else _CCCL_IF_CONSTEXPR (__op == op_max) + { + asm("cp.reduce.async.bulk.tensor.1d.global.shared::cta.max.tile.bulk_group [%0, {%1}], [%2]; // 1a." + : + : "l"(__tensorMap), "r"(__tensorCoords[0]), "r"(__as_ptr_smem(__srcMem)) + : "memory"); + } + else _CCCL_IF_CONSTEXPR (__op == op_inc) + { + asm("cp.reduce.async.bulk.tensor.1d.global.shared::cta.inc.tile.bulk_group [%0, {%1}], [%2]; // 1a." + : + : "l"(__tensorMap), "r"(__tensorCoords[0]), "r"(__as_ptr_smem(__srcMem)) + : "memory"); + } + else _CCCL_IF_CONSTEXPR (__op == op_dec) + { + asm("cp.reduce.async.bulk.tensor.1d.global.shared::cta.dec.tile.bulk_group [%0, {%1}], [%2]; // 1a." + : + : "l"(__tensorMap), "r"(__tensorCoords[0]), "r"(__as_ptr_smem(__srcMem)) + : "memory"); + } + else _CCCL_IF_CONSTEXPR (__op == op_and_op) + { + asm("cp.reduce.async.bulk.tensor.1d.global.shared::cta.and.tile.bulk_group [%0, {%1}], [%2]; // 1a." + : + : "l"(__tensorMap), "r"(__tensorCoords[0]), "r"(__as_ptr_smem(__srcMem)) + : "memory"); + } + else _CCCL_IF_CONSTEXPR (__op == op_or_op) + { + asm("cp.reduce.async.bulk.tensor.1d.global.shared::cta.or.tile.bulk_group [%0, {%1}], [%2]; // 1a." + : + : "l"(__tensorMap), "r"(__tensorCoords[0]), "r"(__as_ptr_smem(__srcMem)) + : "memory"); + } + else _CCCL_IF_CONSTEXPR (__op == op_xor_op) + { + asm("cp.reduce.async.bulk.tensor.1d.global.shared::cta.xor.tile.bulk_group [%0, {%1}], [%2]; // 1a." + : + : "l"(__tensorMap), "r"(__tensorCoords[0]), "r"(__as_ptr_smem(__srcMem)) + : "memory"); + } +# else + // Unsupported architectures will have a linker error with a semi-decent error message + __cuda_ptx_cp_reduce_async_bulk_tensor_is_not_supported_before_SM_90__(); +# endif } #endif // __cccl_ptx_isa >= 800 @@ -115,53 +129,67 @@ _CCCL_DEVICE static inline void cp_reduce_async_bulk_tensor( static_assert(__op == op_add || __op == op_min || __op == op_max || __op == op_inc || __op == op_dec || __op == op_and_op || __op == op_or_op || __op == op_xor_op, ""); - NV_IF_ELSE_TARGET( - NV_PROVIDES_SM_90, - ( - _CCCL_IF_CONSTEXPR (__op == op_add) { - asm("cp.reduce.async.bulk.tensor.2d.global.shared::cta.add.tile.bulk_group [%0, {%1, %2}], [%3]; // 1b." - : - : "l"(__tensorMap), "r"(__tensorCoords[0]), "r"(__tensorCoords[1]), "r"(__as_ptr_smem(__srcMem)) - : "memory"); - } else _CCCL_IF_CONSTEXPR (__op == op_min) { - asm("cp.reduce.async.bulk.tensor.2d.global.shared::cta.min.tile.bulk_group [%0, {%1, %2}], [%3]; // 1b." - : - : "l"(__tensorMap), "r"(__tensorCoords[0]), "r"(__tensorCoords[1]), "r"(__as_ptr_smem(__srcMem)) - : "memory"); - } else _CCCL_IF_CONSTEXPR (__op == op_max) { - asm("cp.reduce.async.bulk.tensor.2d.global.shared::cta.max.tile.bulk_group [%0, {%1, %2}], [%3]; // 1b." 
- : - : "l"(__tensorMap), "r"(__tensorCoords[0]), "r"(__tensorCoords[1]), "r"(__as_ptr_smem(__srcMem)) - : "memory"); - } else _CCCL_IF_CONSTEXPR (__op == op_inc) { - asm("cp.reduce.async.bulk.tensor.2d.global.shared::cta.inc.tile.bulk_group [%0, {%1, %2}], [%3]; // 1b." - : - : "l"(__tensorMap), "r"(__tensorCoords[0]), "r"(__tensorCoords[1]), "r"(__as_ptr_smem(__srcMem)) - : "memory"); - } else _CCCL_IF_CONSTEXPR (__op == op_dec) { - asm("cp.reduce.async.bulk.tensor.2d.global.shared::cta.dec.tile.bulk_group [%0, {%1, %2}], [%3]; // 1b." - : - : "l"(__tensorMap), "r"(__tensorCoords[0]), "r"(__tensorCoords[1]), "r"(__as_ptr_smem(__srcMem)) - : "memory"); - } else _CCCL_IF_CONSTEXPR (__op == op_and_op) { - asm("cp.reduce.async.bulk.tensor.2d.global.shared::cta.and.tile.bulk_group [%0, {%1, %2}], [%3]; // 1b." - : - : "l"(__tensorMap), "r"(__tensorCoords[0]), "r"(__tensorCoords[1]), "r"(__as_ptr_smem(__srcMem)) - : "memory"); - } else _CCCL_IF_CONSTEXPR (__op == op_or_op) { - asm("cp.reduce.async.bulk.tensor.2d.global.shared::cta.or.tile.bulk_group [%0, {%1, %2}], [%3]; // 1b." - : - : "l"(__tensorMap), "r"(__tensorCoords[0]), "r"(__tensorCoords[1]), "r"(__as_ptr_smem(__srcMem)) - : "memory"); - } else _CCCL_IF_CONSTEXPR (__op == op_xor_op) { - asm("cp.reduce.async.bulk.tensor.2d.global.shared::cta.xor.tile.bulk_group [%0, {%1, %2}], [%3]; // 1b." - : - : "l"(__tensorMap), "r"(__tensorCoords[0]), "r"(__tensorCoords[1]), "r"(__as_ptr_smem(__srcMem)) - : "memory"); - }), - ( - // Unsupported architectures will have a linker error with a semi-decent error message - __cuda_ptx_cp_reduce_async_bulk_tensor_is_not_supported_before_SM_90__();)); +# if _CCCL_CUDA_COMPILER(NVHPC) || __CUDA_ARCH__ >= 900 + _CCCL_IF_CONSTEXPR (__op == op_add) + { + asm("cp.reduce.async.bulk.tensor.2d.global.shared::cta.add.tile.bulk_group [%0, {%1, %2}], [%3]; // 1b." + : + : "l"(__tensorMap), "r"(__tensorCoords[0]), "r"(__tensorCoords[1]), "r"(__as_ptr_smem(__srcMem)) + : "memory"); + } + else _CCCL_IF_CONSTEXPR (__op == op_min) + { + asm("cp.reduce.async.bulk.tensor.2d.global.shared::cta.min.tile.bulk_group [%0, {%1, %2}], [%3]; // 1b." + : + : "l"(__tensorMap), "r"(__tensorCoords[0]), "r"(__tensorCoords[1]), "r"(__as_ptr_smem(__srcMem)) + : "memory"); + } + else _CCCL_IF_CONSTEXPR (__op == op_max) + { + asm("cp.reduce.async.bulk.tensor.2d.global.shared::cta.max.tile.bulk_group [%0, {%1, %2}], [%3]; // 1b." + : + : "l"(__tensorMap), "r"(__tensorCoords[0]), "r"(__tensorCoords[1]), "r"(__as_ptr_smem(__srcMem)) + : "memory"); + } + else _CCCL_IF_CONSTEXPR (__op == op_inc) + { + asm("cp.reduce.async.bulk.tensor.2d.global.shared::cta.inc.tile.bulk_group [%0, {%1, %2}], [%3]; // 1b." + : + : "l"(__tensorMap), "r"(__tensorCoords[0]), "r"(__tensorCoords[1]), "r"(__as_ptr_smem(__srcMem)) + : "memory"); + } + else _CCCL_IF_CONSTEXPR (__op == op_dec) + { + asm("cp.reduce.async.bulk.tensor.2d.global.shared::cta.dec.tile.bulk_group [%0, {%1, %2}], [%3]; // 1b." + : + : "l"(__tensorMap), "r"(__tensorCoords[0]), "r"(__tensorCoords[1]), "r"(__as_ptr_smem(__srcMem)) + : "memory"); + } + else _CCCL_IF_CONSTEXPR (__op == op_and_op) + { + asm("cp.reduce.async.bulk.tensor.2d.global.shared::cta.and.tile.bulk_group [%0, {%1, %2}], [%3]; // 1b." + : + : "l"(__tensorMap), "r"(__tensorCoords[0]), "r"(__tensorCoords[1]), "r"(__as_ptr_smem(__srcMem)) + : "memory"); + } + else _CCCL_IF_CONSTEXPR (__op == op_or_op) + { + asm("cp.reduce.async.bulk.tensor.2d.global.shared::cta.or.tile.bulk_group [%0, {%1, %2}], [%3]; // 1b." 
+ : + : "l"(__tensorMap), "r"(__tensorCoords[0]), "r"(__tensorCoords[1]), "r"(__as_ptr_smem(__srcMem)) + : "memory"); + } + else _CCCL_IF_CONSTEXPR (__op == op_xor_op) + { + asm("cp.reduce.async.bulk.tensor.2d.global.shared::cta.xor.tile.bulk_group [%0, {%1, %2}], [%3]; // 1b." + : + : "l"(__tensorMap), "r"(__tensorCoords[0]), "r"(__tensorCoords[1]), "r"(__as_ptr_smem(__srcMem)) + : "memory"); + } +# else + // Unsupported architectures will have a linker error with a semi-decent error message + __cuda_ptx_cp_reduce_async_bulk_tensor_is_not_supported_before_SM_90__(); +# endif } #endif // __cccl_ptx_isa >= 800 @@ -196,85 +224,99 @@ _CCCL_DEVICE static inline void cp_reduce_async_bulk_tensor( static_assert(__op == op_add || __op == op_min || __op == op_max || __op == op_inc || __op == op_dec || __op == op_and_op || __op == op_or_op || __op == op_xor_op, ""); - NV_IF_ELSE_TARGET( - NV_PROVIDES_SM_90, - ( - _CCCL_IF_CONSTEXPR (__op == op_add) { - asm("cp.reduce.async.bulk.tensor.3d.global.shared::cta.add.tile.bulk_group [%0, {%1, %2, %3}], [%4]; // 1c." - : - : "l"(__tensorMap), - "r"(__tensorCoords[0]), - "r"(__tensorCoords[1]), - "r"(__tensorCoords[2]), - "r"(__as_ptr_smem(__srcMem)) - : "memory"); - } else _CCCL_IF_CONSTEXPR (__op == op_min) { - asm("cp.reduce.async.bulk.tensor.3d.global.shared::cta.min.tile.bulk_group [%0, {%1, %2, %3}], [%4]; // 1c." - : - : "l"(__tensorMap), - "r"(__tensorCoords[0]), - "r"(__tensorCoords[1]), - "r"(__tensorCoords[2]), - "r"(__as_ptr_smem(__srcMem)) - : "memory"); - } else _CCCL_IF_CONSTEXPR (__op == op_max) { - asm("cp.reduce.async.bulk.tensor.3d.global.shared::cta.max.tile.bulk_group [%0, {%1, %2, %3}], [%4]; // 1c." - : - : "l"(__tensorMap), - "r"(__tensorCoords[0]), - "r"(__tensorCoords[1]), - "r"(__tensorCoords[2]), - "r"(__as_ptr_smem(__srcMem)) - : "memory"); - } else _CCCL_IF_CONSTEXPR (__op == op_inc) { - asm("cp.reduce.async.bulk.tensor.3d.global.shared::cta.inc.tile.bulk_group [%0, {%1, %2, %3}], [%4]; // 1c." - : - : "l"(__tensorMap), - "r"(__tensorCoords[0]), - "r"(__tensorCoords[1]), - "r"(__tensorCoords[2]), - "r"(__as_ptr_smem(__srcMem)) - : "memory"); - } else _CCCL_IF_CONSTEXPR (__op == op_dec) { - asm("cp.reduce.async.bulk.tensor.3d.global.shared::cta.dec.tile.bulk_group [%0, {%1, %2, %3}], [%4]; // 1c." - : - : "l"(__tensorMap), - "r"(__tensorCoords[0]), - "r"(__tensorCoords[1]), - "r"(__tensorCoords[2]), - "r"(__as_ptr_smem(__srcMem)) - : "memory"); - } else _CCCL_IF_CONSTEXPR (__op == op_and_op) { - asm("cp.reduce.async.bulk.tensor.3d.global.shared::cta.and.tile.bulk_group [%0, {%1, %2, %3}], [%4]; // 1c." - : - : "l"(__tensorMap), - "r"(__tensorCoords[0]), - "r"(__tensorCoords[1]), - "r"(__tensorCoords[2]), - "r"(__as_ptr_smem(__srcMem)) - : "memory"); - } else _CCCL_IF_CONSTEXPR (__op == op_or_op) { - asm("cp.reduce.async.bulk.tensor.3d.global.shared::cta.or.tile.bulk_group [%0, {%1, %2, %3}], [%4]; // 1c." - : - : "l"(__tensorMap), - "r"(__tensorCoords[0]), - "r"(__tensorCoords[1]), - "r"(__tensorCoords[2]), - "r"(__as_ptr_smem(__srcMem)) - : "memory"); - } else _CCCL_IF_CONSTEXPR (__op == op_xor_op) { - asm("cp.reduce.async.bulk.tensor.3d.global.shared::cta.xor.tile.bulk_group [%0, {%1, %2, %3}], [%4]; // 1c." 
- : - : "l"(__tensorMap), - "r"(__tensorCoords[0]), - "r"(__tensorCoords[1]), - "r"(__tensorCoords[2]), - "r"(__as_ptr_smem(__srcMem)) - : "memory"); - }), - ( - // Unsupported architectures will have a linker error with a semi-decent error message - __cuda_ptx_cp_reduce_async_bulk_tensor_is_not_supported_before_SM_90__();)); +# if _CCCL_CUDA_COMPILER(NVHPC) || __CUDA_ARCH__ >= 900 + _CCCL_IF_CONSTEXPR (__op == op_add) + { + asm("cp.reduce.async.bulk.tensor.3d.global.shared::cta.add.tile.bulk_group [%0, {%1, %2, %3}], [%4]; // 1c." + : + : "l"(__tensorMap), + "r"(__tensorCoords[0]), + "r"(__tensorCoords[1]), + "r"(__tensorCoords[2]), + "r"(__as_ptr_smem(__srcMem)) + : "memory"); + } + else _CCCL_IF_CONSTEXPR (__op == op_min) + { + asm("cp.reduce.async.bulk.tensor.3d.global.shared::cta.min.tile.bulk_group [%0, {%1, %2, %3}], [%4]; // 1c." + : + : "l"(__tensorMap), + "r"(__tensorCoords[0]), + "r"(__tensorCoords[1]), + "r"(__tensorCoords[2]), + "r"(__as_ptr_smem(__srcMem)) + : "memory"); + } + else _CCCL_IF_CONSTEXPR (__op == op_max) + { + asm("cp.reduce.async.bulk.tensor.3d.global.shared::cta.max.tile.bulk_group [%0, {%1, %2, %3}], [%4]; // 1c." + : + : "l"(__tensorMap), + "r"(__tensorCoords[0]), + "r"(__tensorCoords[1]), + "r"(__tensorCoords[2]), + "r"(__as_ptr_smem(__srcMem)) + : "memory"); + } + else _CCCL_IF_CONSTEXPR (__op == op_inc) + { + asm("cp.reduce.async.bulk.tensor.3d.global.shared::cta.inc.tile.bulk_group [%0, {%1, %2, %3}], [%4]; // 1c." + : + : "l"(__tensorMap), + "r"(__tensorCoords[0]), + "r"(__tensorCoords[1]), + "r"(__tensorCoords[2]), + "r"(__as_ptr_smem(__srcMem)) + : "memory"); + } + else _CCCL_IF_CONSTEXPR (__op == op_dec) + { + asm("cp.reduce.async.bulk.tensor.3d.global.shared::cta.dec.tile.bulk_group [%0, {%1, %2, %3}], [%4]; // 1c." + : + : "l"(__tensorMap), + "r"(__tensorCoords[0]), + "r"(__tensorCoords[1]), + "r"(__tensorCoords[2]), + "r"(__as_ptr_smem(__srcMem)) + : "memory"); + } + else _CCCL_IF_CONSTEXPR (__op == op_and_op) + { + asm("cp.reduce.async.bulk.tensor.3d.global.shared::cta.and.tile.bulk_group [%0, {%1, %2, %3}], [%4]; // 1c." + : + : "l"(__tensorMap), + "r"(__tensorCoords[0]), + "r"(__tensorCoords[1]), + "r"(__tensorCoords[2]), + "r"(__as_ptr_smem(__srcMem)) + : "memory"); + } + else _CCCL_IF_CONSTEXPR (__op == op_or_op) + { + asm("cp.reduce.async.bulk.tensor.3d.global.shared::cta.or.tile.bulk_group [%0, {%1, %2, %3}], [%4]; // 1c." + : + : "l"(__tensorMap), + "r"(__tensorCoords[0]), + "r"(__tensorCoords[1]), + "r"(__tensorCoords[2]), + "r"(__as_ptr_smem(__srcMem)) + : "memory"); + } + else _CCCL_IF_CONSTEXPR (__op == op_xor_op) + { + asm("cp.reduce.async.bulk.tensor.3d.global.shared::cta.xor.tile.bulk_group [%0, {%1, %2, %3}], [%4]; // 1c." + : + : "l"(__tensorMap), + "r"(__tensorCoords[0]), + "r"(__tensorCoords[1]), + "r"(__tensorCoords[2]), + "r"(__as_ptr_smem(__srcMem)) + : "memory"); + } +# else + // Unsupported architectures will have a linker error with a semi-decent error message + __cuda_ptx_cp_reduce_async_bulk_tensor_is_not_supported_before_SM_90__(); +# endif } #endif // __cccl_ptx_isa >= 800 @@ -309,93 +351,107 @@ _CCCL_DEVICE static inline void cp_reduce_async_bulk_tensor( static_assert(__op == op_add || __op == op_min || __op == op_max || __op == op_inc || __op == op_dec || __op == op_and_op || __op == op_or_op || __op == op_xor_op, ""); - NV_IF_ELSE_TARGET( - NV_PROVIDES_SM_90, - ( - _CCCL_IF_CONSTEXPR (__op == op_add) { - asm("cp.reduce.async.bulk.tensor.4d.global.shared::cta.add.tile.bulk_group [%0, {%1, %2, %3, %4}], [%5]; // 1d." 
- : - : "l"(__tensorMap), - "r"(__tensorCoords[0]), - "r"(__tensorCoords[1]), - "r"(__tensorCoords[2]), - "r"(__tensorCoords[3]), - "r"(__as_ptr_smem(__srcMem)) - : "memory"); - } else _CCCL_IF_CONSTEXPR (__op == op_min) { - asm("cp.reduce.async.bulk.tensor.4d.global.shared::cta.min.tile.bulk_group [%0, {%1, %2, %3, %4}], [%5]; // 1d." - : - : "l"(__tensorMap), - "r"(__tensorCoords[0]), - "r"(__tensorCoords[1]), - "r"(__tensorCoords[2]), - "r"(__tensorCoords[3]), - "r"(__as_ptr_smem(__srcMem)) - : "memory"); - } else _CCCL_IF_CONSTEXPR (__op == op_max) { - asm("cp.reduce.async.bulk.tensor.4d.global.shared::cta.max.tile.bulk_group [%0, {%1, %2, %3, %4}], [%5]; // 1d." - : - : "l"(__tensorMap), - "r"(__tensorCoords[0]), - "r"(__tensorCoords[1]), - "r"(__tensorCoords[2]), - "r"(__tensorCoords[3]), - "r"(__as_ptr_smem(__srcMem)) - : "memory"); - } else _CCCL_IF_CONSTEXPR (__op == op_inc) { - asm("cp.reduce.async.bulk.tensor.4d.global.shared::cta.inc.tile.bulk_group [%0, {%1, %2, %3, %4}], [%5]; // 1d." - : - : "l"(__tensorMap), - "r"(__tensorCoords[0]), - "r"(__tensorCoords[1]), - "r"(__tensorCoords[2]), - "r"(__tensorCoords[3]), - "r"(__as_ptr_smem(__srcMem)) - : "memory"); - } else _CCCL_IF_CONSTEXPR (__op == op_dec) { - asm("cp.reduce.async.bulk.tensor.4d.global.shared::cta.dec.tile.bulk_group [%0, {%1, %2, %3, %4}], [%5]; // 1d." - : - : "l"(__tensorMap), - "r"(__tensorCoords[0]), - "r"(__tensorCoords[1]), - "r"(__tensorCoords[2]), - "r"(__tensorCoords[3]), - "r"(__as_ptr_smem(__srcMem)) - : "memory"); - } else _CCCL_IF_CONSTEXPR (__op == op_and_op) { - asm("cp.reduce.async.bulk.tensor.4d.global.shared::cta.and.tile.bulk_group [%0, {%1, %2, %3, %4}], [%5]; // 1d." - : - : "l"(__tensorMap), - "r"(__tensorCoords[0]), - "r"(__tensorCoords[1]), - "r"(__tensorCoords[2]), - "r"(__tensorCoords[3]), - "r"(__as_ptr_smem(__srcMem)) - : "memory"); - } else _CCCL_IF_CONSTEXPR (__op == op_or_op) { - asm("cp.reduce.async.bulk.tensor.4d.global.shared::cta.or.tile.bulk_group [%0, {%1, %2, %3, %4}], [%5]; // 1d." - : - : "l"(__tensorMap), - "r"(__tensorCoords[0]), - "r"(__tensorCoords[1]), - "r"(__tensorCoords[2]), - "r"(__tensorCoords[3]), - "r"(__as_ptr_smem(__srcMem)) - : "memory"); - } else _CCCL_IF_CONSTEXPR (__op == op_xor_op) { - asm("cp.reduce.async.bulk.tensor.4d.global.shared::cta.xor.tile.bulk_group [%0, {%1, %2, %3, %4}], [%5]; // 1d." - : - : "l"(__tensorMap), - "r"(__tensorCoords[0]), - "r"(__tensorCoords[1]), - "r"(__tensorCoords[2]), - "r"(__tensorCoords[3]), - "r"(__as_ptr_smem(__srcMem)) - : "memory"); - }), - ( - // Unsupported architectures will have a linker error with a semi-decent error message - __cuda_ptx_cp_reduce_async_bulk_tensor_is_not_supported_before_SM_90__();)); +# if _CCCL_CUDA_COMPILER(NVHPC) || __CUDA_ARCH__ >= 900 + _CCCL_IF_CONSTEXPR (__op == op_add) + { + asm("cp.reduce.async.bulk.tensor.4d.global.shared::cta.add.tile.bulk_group [%0, {%1, %2, %3, %4}], [%5]; // 1d." + : + : "l"(__tensorMap), + "r"(__tensorCoords[0]), + "r"(__tensorCoords[1]), + "r"(__tensorCoords[2]), + "r"(__tensorCoords[3]), + "r"(__as_ptr_smem(__srcMem)) + : "memory"); + } + else _CCCL_IF_CONSTEXPR (__op == op_min) + { + asm("cp.reduce.async.bulk.tensor.4d.global.shared::cta.min.tile.bulk_group [%0, {%1, %2, %3, %4}], [%5]; // 1d." 
+ : + : "l"(__tensorMap), + "r"(__tensorCoords[0]), + "r"(__tensorCoords[1]), + "r"(__tensorCoords[2]), + "r"(__tensorCoords[3]), + "r"(__as_ptr_smem(__srcMem)) + : "memory"); + } + else _CCCL_IF_CONSTEXPR (__op == op_max) + { + asm("cp.reduce.async.bulk.tensor.4d.global.shared::cta.max.tile.bulk_group [%0, {%1, %2, %3, %4}], [%5]; // 1d." + : + : "l"(__tensorMap), + "r"(__tensorCoords[0]), + "r"(__tensorCoords[1]), + "r"(__tensorCoords[2]), + "r"(__tensorCoords[3]), + "r"(__as_ptr_smem(__srcMem)) + : "memory"); + } + else _CCCL_IF_CONSTEXPR (__op == op_inc) + { + asm("cp.reduce.async.bulk.tensor.4d.global.shared::cta.inc.tile.bulk_group [%0, {%1, %2, %3, %4}], [%5]; // 1d." + : + : "l"(__tensorMap), + "r"(__tensorCoords[0]), + "r"(__tensorCoords[1]), + "r"(__tensorCoords[2]), + "r"(__tensorCoords[3]), + "r"(__as_ptr_smem(__srcMem)) + : "memory"); + } + else _CCCL_IF_CONSTEXPR (__op == op_dec) + { + asm("cp.reduce.async.bulk.tensor.4d.global.shared::cta.dec.tile.bulk_group [%0, {%1, %2, %3, %4}], [%5]; // 1d." + : + : "l"(__tensorMap), + "r"(__tensorCoords[0]), + "r"(__tensorCoords[1]), + "r"(__tensorCoords[2]), + "r"(__tensorCoords[3]), + "r"(__as_ptr_smem(__srcMem)) + : "memory"); + } + else _CCCL_IF_CONSTEXPR (__op == op_and_op) + { + asm("cp.reduce.async.bulk.tensor.4d.global.shared::cta.and.tile.bulk_group [%0, {%1, %2, %3, %4}], [%5]; // 1d." + : + : "l"(__tensorMap), + "r"(__tensorCoords[0]), + "r"(__tensorCoords[1]), + "r"(__tensorCoords[2]), + "r"(__tensorCoords[3]), + "r"(__as_ptr_smem(__srcMem)) + : "memory"); + } + else _CCCL_IF_CONSTEXPR (__op == op_or_op) + { + asm("cp.reduce.async.bulk.tensor.4d.global.shared::cta.or.tile.bulk_group [%0, {%1, %2, %3, %4}], [%5]; // 1d." + : + : "l"(__tensorMap), + "r"(__tensorCoords[0]), + "r"(__tensorCoords[1]), + "r"(__tensorCoords[2]), + "r"(__tensorCoords[3]), + "r"(__as_ptr_smem(__srcMem)) + : "memory"); + } + else _CCCL_IF_CONSTEXPR (__op == op_xor_op) + { + asm("cp.reduce.async.bulk.tensor.4d.global.shared::cta.xor.tile.bulk_group [%0, {%1, %2, %3, %4}], [%5]; // 1d." + : + : "l"(__tensorMap), + "r"(__tensorCoords[0]), + "r"(__tensorCoords[1]), + "r"(__tensorCoords[2]), + "r"(__tensorCoords[3]), + "r"(__as_ptr_smem(__srcMem)) + : "memory"); + } +# else + // Unsupported architectures will have a linker error with a semi-decent error message + __cuda_ptx_cp_reduce_async_bulk_tensor_is_not_supported_before_SM_90__(); +# endif } #endif // __cccl_ptx_isa >= 800 @@ -430,109 +486,115 @@ _CCCL_DEVICE static inline void cp_reduce_async_bulk_tensor( static_assert(__op == op_add || __op == op_min || __op == op_max || __op == op_inc || __op == op_dec || __op == op_and_op || __op == op_or_op || __op == op_xor_op, ""); - NV_IF_ELSE_TARGET( - NV_PROVIDES_SM_90, - ( - _CCCL_IF_CONSTEXPR (__op == op_add) { - asm("cp.reduce.async.bulk.tensor.5d.global.shared::cta.add.tile.bulk_group [%0, {%1, %2, %3, %4, %5}], [%6]; " - "// 1e." - : - : "l"(__tensorMap), - "r"(__tensorCoords[0]), - "r"(__tensorCoords[1]), - "r"(__tensorCoords[2]), - "r"(__tensorCoords[3]), - "r"(__tensorCoords[4]), - "r"(__as_ptr_smem(__srcMem)) - : "memory"); - } else _CCCL_IF_CONSTEXPR (__op == op_min) { - asm("cp.reduce.async.bulk.tensor.5d.global.shared::cta.min.tile.bulk_group [%0, {%1, %2, %3, %4, %5}], [%6]; " - "// 1e." 
- : - : "l"(__tensorMap), - "r"(__tensorCoords[0]), - "r"(__tensorCoords[1]), - "r"(__tensorCoords[2]), - "r"(__tensorCoords[3]), - "r"(__tensorCoords[4]), - "r"(__as_ptr_smem(__srcMem)) - : "memory"); - } else _CCCL_IF_CONSTEXPR (__op == op_max) { - asm("cp.reduce.async.bulk.tensor.5d.global.shared::cta.max.tile.bulk_group [%0, {%1, %2, %3, %4, %5}], [%6]; " - "// 1e." - : - : "l"(__tensorMap), - "r"(__tensorCoords[0]), - "r"(__tensorCoords[1]), - "r"(__tensorCoords[2]), - "r"(__tensorCoords[3]), - "r"(__tensorCoords[4]), - "r"(__as_ptr_smem(__srcMem)) - : "memory"); - } else _CCCL_IF_CONSTEXPR (__op == op_inc) { - asm("cp.reduce.async.bulk.tensor.5d.global.shared::cta.inc.tile.bulk_group [%0, {%1, %2, %3, %4, %5}], [%6]; " - "// 1e." - : - : "l"(__tensorMap), - "r"(__tensorCoords[0]), - "r"(__tensorCoords[1]), - "r"(__tensorCoords[2]), - "r"(__tensorCoords[3]), - "r"(__tensorCoords[4]), - "r"(__as_ptr_smem(__srcMem)) - : "memory"); - } else _CCCL_IF_CONSTEXPR (__op == op_dec) { - asm("cp.reduce.async.bulk.tensor.5d.global.shared::cta.dec.tile.bulk_group [%0, {%1, %2, %3, %4, %5}], [%6]; " - "// 1e." - : - : "l"(__tensorMap), - "r"(__tensorCoords[0]), - "r"(__tensorCoords[1]), - "r"(__tensorCoords[2]), - "r"(__tensorCoords[3]), - "r"(__tensorCoords[4]), - "r"(__as_ptr_smem(__srcMem)) - : "memory"); - } else _CCCL_IF_CONSTEXPR (__op == op_and_op) { - asm("cp.reduce.async.bulk.tensor.5d.global.shared::cta.and.tile.bulk_group [%0, {%1, %2, %3, %4, %5}], [%6]; " - "// 1e." - : - : "l"(__tensorMap), - "r"(__tensorCoords[0]), - "r"(__tensorCoords[1]), - "r"(__tensorCoords[2]), - "r"(__tensorCoords[3]), - "r"(__tensorCoords[4]), - "r"(__as_ptr_smem(__srcMem)) - : "memory"); - } else _CCCL_IF_CONSTEXPR (__op == op_or_op) { - asm("cp.reduce.async.bulk.tensor.5d.global.shared::cta.or.tile.bulk_group [%0, {%1, %2, %3, %4, %5}], [%6]; // " - "1e." - : - : "l"(__tensorMap), - "r"(__tensorCoords[0]), - "r"(__tensorCoords[1]), - "r"(__tensorCoords[2]), - "r"(__tensorCoords[3]), - "r"(__tensorCoords[4]), - "r"(__as_ptr_smem(__srcMem)) - : "memory"); - } else _CCCL_IF_CONSTEXPR (__op == op_xor_op) { - asm("cp.reduce.async.bulk.tensor.5d.global.shared::cta.xor.tile.bulk_group [%0, {%1, %2, %3, %4, %5}], [%6]; " - "// 1e." - : - : "l"(__tensorMap), - "r"(__tensorCoords[0]), - "r"(__tensorCoords[1]), - "r"(__tensorCoords[2]), - "r"(__tensorCoords[3]), - "r"(__tensorCoords[4]), - "r"(__as_ptr_smem(__srcMem)) - : "memory"); - }), - ( - // Unsupported architectures will have a linker error with a semi-decent error message - __cuda_ptx_cp_reduce_async_bulk_tensor_is_not_supported_before_SM_90__();)); +# if _CCCL_CUDA_COMPILER(NVHPC) || __CUDA_ARCH__ >= 900 + _CCCL_IF_CONSTEXPR (__op == op_add) + { + asm("cp.reduce.async.bulk.tensor.5d.global.shared::cta.add.tile.bulk_group [%0, {%1, %2, %3, %4, %5}], [%6]; // 1e." + : + : "l"(__tensorMap), + "r"(__tensorCoords[0]), + "r"(__tensorCoords[1]), + "r"(__tensorCoords[2]), + "r"(__tensorCoords[3]), + "r"(__tensorCoords[4]), + "r"(__as_ptr_smem(__srcMem)) + : "memory"); + } + else _CCCL_IF_CONSTEXPR (__op == op_min) + { + asm("cp.reduce.async.bulk.tensor.5d.global.shared::cta.min.tile.bulk_group [%0, {%1, %2, %3, %4, %5}], [%6]; // 1e." 
+ : + : "l"(__tensorMap), + "r"(__tensorCoords[0]), + "r"(__tensorCoords[1]), + "r"(__tensorCoords[2]), + "r"(__tensorCoords[3]), + "r"(__tensorCoords[4]), + "r"(__as_ptr_smem(__srcMem)) + : "memory"); + } + else _CCCL_IF_CONSTEXPR (__op == op_max) + { + asm("cp.reduce.async.bulk.tensor.5d.global.shared::cta.max.tile.bulk_group [%0, {%1, %2, %3, %4, %5}], [%6]; // 1e." + : + : "l"(__tensorMap), + "r"(__tensorCoords[0]), + "r"(__tensorCoords[1]), + "r"(__tensorCoords[2]), + "r"(__tensorCoords[3]), + "r"(__tensorCoords[4]), + "r"(__as_ptr_smem(__srcMem)) + : "memory"); + } + else _CCCL_IF_CONSTEXPR (__op == op_inc) + { + asm("cp.reduce.async.bulk.tensor.5d.global.shared::cta.inc.tile.bulk_group [%0, {%1, %2, %3, %4, %5}], [%6]; // 1e." + : + : "l"(__tensorMap), + "r"(__tensorCoords[0]), + "r"(__tensorCoords[1]), + "r"(__tensorCoords[2]), + "r"(__tensorCoords[3]), + "r"(__tensorCoords[4]), + "r"(__as_ptr_smem(__srcMem)) + : "memory"); + } + else _CCCL_IF_CONSTEXPR (__op == op_dec) + { + asm("cp.reduce.async.bulk.tensor.5d.global.shared::cta.dec.tile.bulk_group [%0, {%1, %2, %3, %4, %5}], [%6]; // 1e." + : + : "l"(__tensorMap), + "r"(__tensorCoords[0]), + "r"(__tensorCoords[1]), + "r"(__tensorCoords[2]), + "r"(__tensorCoords[3]), + "r"(__tensorCoords[4]), + "r"(__as_ptr_smem(__srcMem)) + : "memory"); + } + else _CCCL_IF_CONSTEXPR (__op == op_and_op) + { + asm("cp.reduce.async.bulk.tensor.5d.global.shared::cta.and.tile.bulk_group [%0, {%1, %2, %3, %4, %5}], [%6]; // 1e." + : + : "l"(__tensorMap), + "r"(__tensorCoords[0]), + "r"(__tensorCoords[1]), + "r"(__tensorCoords[2]), + "r"(__tensorCoords[3]), + "r"(__tensorCoords[4]), + "r"(__as_ptr_smem(__srcMem)) + : "memory"); + } + else _CCCL_IF_CONSTEXPR (__op == op_or_op) + { + asm("cp.reduce.async.bulk.tensor.5d.global.shared::cta.or.tile.bulk_group [%0, {%1, %2, %3, %4, %5}], [%6]; // 1e." + : + : "l"(__tensorMap), + "r"(__tensorCoords[0]), + "r"(__tensorCoords[1]), + "r"(__tensorCoords[2]), + "r"(__tensorCoords[3]), + "r"(__tensorCoords[4]), + "r"(__as_ptr_smem(__srcMem)) + : "memory"); + } + else _CCCL_IF_CONSTEXPR (__op == op_xor_op) + { + asm("cp.reduce.async.bulk.tensor.5d.global.shared::cta.xor.tile.bulk_group [%0, {%1, %2, %3, %4, %5}], [%6]; // 1e." + : + : "l"(__tensorMap), + "r"(__tensorCoords[0]), + "r"(__tensorCoords[1]), + "r"(__tensorCoords[2]), + "r"(__tensorCoords[3]), + "r"(__tensorCoords[4]), + "r"(__as_ptr_smem(__srcMem)) + : "memory"); + } +# else + // Unsupported architectures will have a linker error with a semi-decent error message + __cuda_ptx_cp_reduce_async_bulk_tensor_is_not_supported_before_SM_90__(); +# endif } #endif // __cccl_ptx_isa >= 800 diff --git a/libcudacxx/include/cuda/__ptx/instructions/generated/elect_sync.h b/libcudacxx/include/cuda/__ptx/instructions/generated/elect_sync.h new file mode 100644 index 00000000000..e8691178f14 --- /dev/null +++ b/libcudacxx/include/cuda/__ptx/instructions/generated/elect_sync.h @@ -0,0 +1,36 @@ +// This file was automatically generated. Do not edit. 
+ +#ifndef _CUDA_PTX_GENERATED_ELECT_SYNC_H_ +#define _CUDA_PTX_GENERATED_ELECT_SYNC_H_ + +/* +// elect.sync _|is_elected, membermask; // PTX ISA 80, SM_90 +template +__device__ static inline bool elect_sync( + const uint32_t& membermask); +*/ +#if __cccl_ptx_isa >= 800 +extern "C" _CCCL_DEVICE void __cuda_ptx_elect_sync_is_not_supported_before_SM_90__(); +template +_CCCL_DEVICE static inline bool elect_sync(const _CUDA_VSTD::uint32_t& __membermask) +{ +# if _CCCL_CUDA_COMPILER(NVHPC) || __CUDA_ARCH__ >= 900 + _CUDA_VSTD::uint32_t __is_elected; + asm volatile( + "{\n\t .reg .pred P_OUT; \n\t" + "elect.sync _|P_OUT, %1;\n\t" + "selp.b32 %0, 1, 0, P_OUT; \n" + "}" + : "=r"(__is_elected) + : "r"(__membermask) + :); + return static_cast(__is_elected); +# else + // Unsupported architectures will have a linker error with a semi-decent error message + __cuda_ptx_elect_sync_is_not_supported_before_SM_90__(); + return false; +# endif +} +#endif // __cccl_ptx_isa >= 800 + +#endif // _CUDA_PTX_GENERATED_ELECT_SYNC_H_ diff --git a/libcudacxx/include/cuda/__ptx/instructions/generated/fence.h b/libcudacxx/include/cuda/__ptx/instructions/generated/fence.h index db00c4d4cba..c0bd9e9a3d2 100644 --- a/libcudacxx/include/cuda/__ptx/instructions/generated/fence.h +++ b/libcudacxx/include/cuda/__ptx/instructions/generated/fence.h @@ -4,71 +4,205 @@ #define _CUDA_PTX_GENERATED_FENCE_H_ /* -// fence{.sem}.scope; // 1. PTX ISA 60, SM_70 -// .sem = { .sc, .acq_rel } +// fence.sem.scope; // 1. PTX ISA 60, SM_70 +// .sem = { .sc } // .scope = { .cta, .gpu, .sys } -template +template __device__ static inline void fence( - cuda::ptx::sem_t sem, + cuda::ptx::sem_sc_t, cuda::ptx::scope_t scope); */ #if __cccl_ptx_isa >= 600 extern "C" _CCCL_DEVICE void __cuda_ptx_fence_is_not_supported_before_SM_70__(); -template -_CCCL_DEVICE static inline void fence(sem_t<_Sem> __sem, scope_t<_Scope> __scope) +template +_CCCL_DEVICE static inline void fence(sem_sc_t, scope_t<_Scope> __scope) { - static_assert(__sem == sem_sc || __sem == sem_acq_rel, ""); + // __sem == sem_sc (due to parameter type constraint) static_assert(__scope == scope_cta || __scope == scope_gpu || __scope == scope_sys, ""); - NV_IF_ELSE_TARGET( - NV_PROVIDES_SM_70, - ( - _CCCL_IF_CONSTEXPR (__sem == sem_sc && __scope == scope_cta) { - asm volatile("fence.sc.cta; // 1." : : : "memory"); - } else _CCCL_IF_CONSTEXPR (__sem == sem_sc && __scope == scope_gpu) { - asm volatile("fence.sc.gpu; // 1." : : : "memory"); - } else _CCCL_IF_CONSTEXPR (__sem == sem_sc && __scope == scope_sys) { - asm volatile("fence.sc.sys; // 1." : : : "memory"); - } else _CCCL_IF_CONSTEXPR (__sem == sem_acq_rel && __scope == scope_cta) { - asm volatile("fence.acq_rel.cta; // 1." : : : "memory"); - } else _CCCL_IF_CONSTEXPR (__sem == sem_acq_rel && __scope == scope_gpu) { - asm volatile("fence.acq_rel.gpu; // 1." : : : "memory"); - } else _CCCL_IF_CONSTEXPR (__sem == sem_acq_rel && __scope == scope_sys) { - asm volatile("fence.acq_rel.sys; // 1." : : : "memory"); - }), - ( - // Unsupported architectures will have a linker error with a semi-decent error message - __cuda_ptx_fence_is_not_supported_before_SM_70__();)); +# if _CCCL_CUDA_COMPILER(NVHPC) || __CUDA_ARCH__ >= 700 + _CCCL_IF_CONSTEXPR (__scope == scope_cta) + { + asm volatile("fence.sc.cta; // 1." : : : "memory"); + } + else _CCCL_IF_CONSTEXPR (__scope == scope_gpu) + { + asm volatile("fence.sc.gpu; // 1." : : : "memory"); + } + else _CCCL_IF_CONSTEXPR (__scope == scope_sys) + { + asm volatile("fence.sc.sys; // 1." 
: : : "memory"); + } +# else + // Unsupported architectures will have a linker error with a semi-decent error message + __cuda_ptx_fence_is_not_supported_before_SM_70__(); +# endif } #endif // __cccl_ptx_isa >= 600 /* -// fence{.sem}.scope; // 2. PTX ISA 78, SM_90 -// .sem = { .sc, .acq_rel } +// fence.sem.scope; // 2. PTX ISA 78, SM_90 +// .sem = { .sc } // .scope = { .cluster } -template +template __device__ static inline void fence( - cuda::ptx::sem_t sem, + cuda::ptx::sem_sc_t, cuda::ptx::scope_cluster_t); */ #if __cccl_ptx_isa >= 780 extern "C" _CCCL_DEVICE void __cuda_ptx_fence_is_not_supported_before_SM_90__(); -template -_CCCL_DEVICE static inline void fence(sem_t<_Sem> __sem, scope_cluster_t) +template +_CCCL_DEVICE static inline void fence(sem_sc_t, scope_cluster_t) { - static_assert(__sem == sem_sc || __sem == sem_acq_rel, ""); - // __scope == scope_cluster (due to parameter type constraint) - NV_IF_ELSE_TARGET( - NV_PROVIDES_SM_90, - ( - _CCCL_IF_CONSTEXPR (__sem == sem_sc) { - asm volatile("fence.sc.cluster; // 2." : : : "memory"); - } else _CCCL_IF_CONSTEXPR (__sem == sem_acq_rel) { - asm volatile("fence.acq_rel.cluster; // 2." : : : "memory"); - }), - ( - // Unsupported architectures will have a linker error with a semi-decent error message - __cuda_ptx_fence_is_not_supported_before_SM_90__();)); +// __sem == sem_sc (due to parameter type constraint) +// __scope == scope_cluster (due to parameter type constraint) +# if _CCCL_CUDA_COMPILER(NVHPC) || __CUDA_ARCH__ >= 900 + asm volatile("fence.sc.cluster; // 2." : : : "memory"); +# else + // Unsupported architectures will have a linker error with a semi-decent error message + __cuda_ptx_fence_is_not_supported_before_SM_90__(); +# endif } #endif // __cccl_ptx_isa >= 780 +/* +// fence.sem.scope; // 1. PTX ISA 60, SM_70 +// .sem = { .acq_rel } +// .scope = { .cta, .gpu, .sys } +template +__device__ static inline void fence( + cuda::ptx::sem_acq_rel_t, + cuda::ptx::scope_t scope); +*/ +#if __cccl_ptx_isa >= 600 +extern "C" _CCCL_DEVICE void __cuda_ptx_fence_is_not_supported_before_SM_70__(); +template +_CCCL_DEVICE static inline void fence(sem_acq_rel_t, scope_t<_Scope> __scope) +{ + // __sem == sem_acq_rel (due to parameter type constraint) + static_assert(__scope == scope_cta || __scope == scope_gpu || __scope == scope_sys, ""); +# if _CCCL_CUDA_COMPILER(NVHPC) || __CUDA_ARCH__ >= 700 + _CCCL_IF_CONSTEXPR (__scope == scope_cta) + { + asm volatile("fence.acq_rel.cta; // 1." : : : "memory"); + } + else _CCCL_IF_CONSTEXPR (__scope == scope_gpu) + { + asm volatile("fence.acq_rel.gpu; // 1." : : : "memory"); + } + else _CCCL_IF_CONSTEXPR (__scope == scope_sys) + { + asm volatile("fence.acq_rel.sys; // 1." : : : "memory"); + } +# else + // Unsupported architectures will have a linker error with a semi-decent error message + __cuda_ptx_fence_is_not_supported_before_SM_70__(); +# endif +} +#endif // __cccl_ptx_isa >= 600 + +/* +// fence.sem.scope; // 2. 
PTX ISA 78, SM_90 +// .sem = { .acq_rel } +// .scope = { .cluster } +template +__device__ static inline void fence( + cuda::ptx::sem_acq_rel_t, + cuda::ptx::scope_cluster_t); +*/ +#if __cccl_ptx_isa >= 780 +extern "C" _CCCL_DEVICE void __cuda_ptx_fence_is_not_supported_before_SM_90__(); +template +_CCCL_DEVICE static inline void fence(sem_acq_rel_t, scope_cluster_t) +{ +// __sem == sem_acq_rel (due to parameter type constraint) +// __scope == scope_cluster (due to parameter type constraint) +# if _CCCL_CUDA_COMPILER(NVHPC) || __CUDA_ARCH__ >= 900 + asm volatile("fence.acq_rel.cluster; // 2." : : : "memory"); +# else + // Unsupported architectures will have a linker error with a semi-decent error message + __cuda_ptx_fence_is_not_supported_before_SM_90__(); +# endif +} +#endif // __cccl_ptx_isa >= 780 + +/* +// fence.sem.scope; // PTX ISA 86, SM_90 +// .sem = { .acquire } +// .scope = { .cta, .cluster, .gpu, .sys } +template +__device__ static inline void fence( + cuda::ptx::sem_acquire_t, + cuda::ptx::scope_t scope); +*/ +#if __cccl_ptx_isa >= 860 +extern "C" _CCCL_DEVICE void __cuda_ptx_fence_is_not_supported_before_SM_90__(); +template +_CCCL_DEVICE static inline void fence(sem_acquire_t, scope_t<_Scope> __scope) +{ + // __sem == sem_acquire (due to parameter type constraint) + static_assert(__scope == scope_cta || __scope == scope_cluster || __scope == scope_gpu || __scope == scope_sys, ""); +# if _CCCL_CUDA_COMPILER(NVHPC) || __CUDA_ARCH__ >= 900 + _CCCL_IF_CONSTEXPR (__scope == scope_cta) + { + asm volatile("fence.acquire.cta;" : : : "memory"); + } + else _CCCL_IF_CONSTEXPR (__scope == scope_cluster) + { + asm volatile("fence.acquire.cluster;" : : : "memory"); + } + else _CCCL_IF_CONSTEXPR (__scope == scope_gpu) + { + asm volatile("fence.acquire.gpu;" : : : "memory"); + } + else _CCCL_IF_CONSTEXPR (__scope == scope_sys) + { + asm volatile("fence.acquire.sys;" : : : "memory"); + } +# else + // Unsupported architectures will have a linker error with a semi-decent error message + __cuda_ptx_fence_is_not_supported_before_SM_90__(); +# endif +} +#endif // __cccl_ptx_isa >= 860 + +/* +// fence.sem.scope; // PTX ISA 86, SM_90 +// .sem = { .release } +// .scope = { .cta, .cluster, .gpu, .sys } +template +__device__ static inline void fence( + cuda::ptx::sem_release_t, + cuda::ptx::scope_t scope); +*/ +#if __cccl_ptx_isa >= 860 +extern "C" _CCCL_DEVICE void __cuda_ptx_fence_is_not_supported_before_SM_90__(); +template +_CCCL_DEVICE static inline void fence(sem_release_t, scope_t<_Scope> __scope) +{ + // __sem == sem_release (due to parameter type constraint) + static_assert(__scope == scope_cta || __scope == scope_cluster || __scope == scope_gpu || __scope == scope_sys, ""); +# if _CCCL_CUDA_COMPILER(NVHPC) || __CUDA_ARCH__ >= 900 + _CCCL_IF_CONSTEXPR (__scope == scope_cta) + { + asm volatile("fence.release.cta;" : : : "memory"); + } + else _CCCL_IF_CONSTEXPR (__scope == scope_cluster) + { + asm volatile("fence.release.cluster;" : : : "memory"); + } + else _CCCL_IF_CONSTEXPR (__scope == scope_gpu) + { + asm volatile("fence.release.gpu;" : : : "memory"); + } + else _CCCL_IF_CONSTEXPR (__scope == scope_sys) + { + asm volatile("fence.release.sys;" : : : "memory"); + } +# else + // Unsupported architectures will have a linker error with a semi-decent error message + __cuda_ptx_fence_is_not_supported_before_SM_90__(); +# endif +} +#endif // __cccl_ptx_isa >= 860 + #endif // _CUDA_PTX_GENERATED_FENCE_H_ diff --git a/libcudacxx/include/cuda/__ptx/instructions/generated/fence_mbarrier_init.h 
b/libcudacxx/include/cuda/__ptx/instructions/generated/fence_mbarrier_init.h index e185913b3cd..6b0c8ec161d 100644 --- a/libcudacxx/include/cuda/__ptx/instructions/generated/fence_mbarrier_init.h +++ b/libcudacxx/include/cuda/__ptx/instructions/generated/fence_mbarrier_init.h @@ -17,17 +17,14 @@ extern "C" _CCCL_DEVICE void __cuda_ptx_fence_mbarrier_init_is_not_supported_bef template _CCCL_DEVICE static inline void fence_mbarrier_init(sem_release_t, scope_cluster_t) { - // __sem == sem_release (due to parameter type constraint) - // __scope == scope_cluster (due to parameter type constraint) - NV_IF_ELSE_TARGET( - NV_PROVIDES_SM_90, - (asm volatile("fence.mbarrier_init.release.cluster; // 3." - : - : - : "memory");), - ( - // Unsupported architectures will have a linker error with a semi-decent error message - __cuda_ptx_fence_mbarrier_init_is_not_supported_before_SM_90__();)); +// __sem == sem_release (due to parameter type constraint) +// __scope == scope_cluster (due to parameter type constraint) +# if _CCCL_CUDA_COMPILER(NVHPC) || __CUDA_ARCH__ >= 900 + asm volatile("fence.mbarrier_init.release.cluster; // 3." : : : "memory"); +# else + // Unsupported architectures will have a linker error with a semi-decent error message + __cuda_ptx_fence_mbarrier_init_is_not_supported_before_SM_90__(); +# endif } #endif // __cccl_ptx_isa >= 800 diff --git a/libcudacxx/include/cuda/__ptx/instructions/generated/fence_proxy_alias.h b/libcudacxx/include/cuda/__ptx/instructions/generated/fence_proxy_alias.h index 40229b84a96..e520d99bfaa 100644 --- a/libcudacxx/include/cuda/__ptx/instructions/generated/fence_proxy_alias.h +++ b/libcudacxx/include/cuda/__ptx/instructions/generated/fence_proxy_alias.h @@ -13,15 +13,12 @@ extern "C" _CCCL_DEVICE void __cuda_ptx_fence_proxy_alias_is_not_supported_befor template _CCCL_DEVICE static inline void fence_proxy_alias() { - NV_IF_ELSE_TARGET( - NV_PROVIDES_SM_70, - (asm volatile("fence.proxy.alias; // 4." - : - : - : "memory");), - ( - // Unsupported architectures will have a linker error with a semi-decent error message - __cuda_ptx_fence_proxy_alias_is_not_supported_before_SM_70__();)); +# if _CCCL_CUDA_COMPILER(NVHPC) || __CUDA_ARCH__ >= 700 + asm volatile("fence.proxy.alias; // 4." : : : "memory"); +# else + // Unsupported architectures will have a linker error with a semi-decent error message + __cuda_ptx_fence_proxy_alias_is_not_supported_before_SM_70__(); +# endif } #endif // __cccl_ptx_isa >= 750 diff --git a/libcudacxx/include/cuda/__ptx/instructions/generated/fence_proxy_async.h b/libcudacxx/include/cuda/__ptx/instructions/generated/fence_proxy_async.h index f64b5faee5e..f8ee49909db 100644 --- a/libcudacxx/include/cuda/__ptx/instructions/generated/fence_proxy_async.h +++ b/libcudacxx/include/cuda/__ptx/instructions/generated/fence_proxy_async.h @@ -13,20 +13,17 @@ extern "C" _CCCL_DEVICE void __cuda_ptx_fence_proxy_async_is_not_supported_befor template _CCCL_DEVICE static inline void fence_proxy_async() { - NV_IF_ELSE_TARGET( - NV_PROVIDES_SM_90, - (asm volatile("fence.proxy.async; // 5." - : - : - : "memory");), - ( - // Unsupported architectures will have a linker error with a semi-decent error message - __cuda_ptx_fence_proxy_async_is_not_supported_before_SM_90__();)); +# if _CCCL_CUDA_COMPILER(NVHPC) || __CUDA_ARCH__ >= 900 + asm volatile("fence.proxy.async; // 5." 
: : : "memory"); +# else + // Unsupported architectures will have a linker error with a semi-decent error message + __cuda_ptx_fence_proxy_async_is_not_supported_before_SM_90__(); +# endif } #endif // __cccl_ptx_isa >= 800 /* -// fence.proxy.async{.space}; // 6. PTX ISA 80, SM_90 +// fence.proxy.async.space; // 6. PTX ISA 80, SM_90 // .space = { .global, .shared::cluster, .shared::cta } template __device__ static inline void fence_proxy_async( @@ -38,19 +35,23 @@ template _CCCL_DEVICE static inline void fence_proxy_async(space_t<_Space> __space) { static_assert(__space == space_global || __space == space_cluster || __space == space_shared, ""); - NV_IF_ELSE_TARGET( - NV_PROVIDES_SM_90, - ( - _CCCL_IF_CONSTEXPR (__space == space_global) { - asm volatile("fence.proxy.async.global; // 6." : : : "memory"); - } else _CCCL_IF_CONSTEXPR (__space == space_cluster) { - asm volatile("fence.proxy.async.shared::cluster; // 6." : : : "memory"); - } else _CCCL_IF_CONSTEXPR (__space == space_shared) { - asm volatile("fence.proxy.async.shared::cta; // 6." : : : "memory"); - }), - ( - // Unsupported architectures will have a linker error with a semi-decent error message - __cuda_ptx_fence_proxy_async_is_not_supported_before_SM_90__();)); +# if _CCCL_CUDA_COMPILER(NVHPC) || __CUDA_ARCH__ >= 900 + _CCCL_IF_CONSTEXPR (__space == space_global) + { + asm volatile("fence.proxy.async.global; // 6." : : : "memory"); + } + else _CCCL_IF_CONSTEXPR (__space == space_cluster) + { + asm volatile("fence.proxy.async.shared::cluster; // 6." : : : "memory"); + } + else _CCCL_IF_CONSTEXPR (__space == space_shared) + { + asm volatile("fence.proxy.async.shared::cta; // 6." : : : "memory"); + } +# else + // Unsupported architectures will have a linker error with a semi-decent error message + __cuda_ptx_fence_proxy_async_is_not_supported_before_SM_90__(); +# endif } #endif // __cccl_ptx_isa >= 800 diff --git a/libcudacxx/include/cuda/__ptx/instructions/generated/fence_proxy_async_generic_sync_restrict.h b/libcudacxx/include/cuda/__ptx/instructions/generated/fence_proxy_async_generic_sync_restrict.h new file mode 100644 index 00000000000..93c66063ea3 --- /dev/null +++ b/libcudacxx/include/cuda/__ptx/instructions/generated/fence_proxy_async_generic_sync_restrict.h @@ -0,0 +1,62 @@ +// This file was automatically generated. Do not edit. 
+ +#ifndef _CUDA_PTX_GENERATED_FENCE_PROXY_ASYNC_GENERIC_SYNC_RESTRICT_H_ +#define _CUDA_PTX_GENERATED_FENCE_PROXY_ASYNC_GENERIC_SYNC_RESTRICT_H_ + +/* +// fence.proxy.async::generic.sem.sync_restrict::space.scope; // PTX ISA 86, SM_90 +// .sem = { .acquire } +// .space = { .shared::cluster } +// .scope = { .cluster } +template +__device__ static inline void fence_proxy_async_generic_sync_restrict( + cuda::ptx::sem_acquire_t, + cuda::ptx::space_cluster_t, + cuda::ptx::scope_cluster_t); +*/ +#if __cccl_ptx_isa >= 860 +extern "C" _CCCL_DEVICE void __cuda_ptx_fence_proxy_async_generic_sync_restrict_is_not_supported_before_SM_90__(); +template +_CCCL_DEVICE static inline void fence_proxy_async_generic_sync_restrict(sem_acquire_t, space_cluster_t, scope_cluster_t) +{ +// __sem == sem_acquire (due to parameter type constraint) +// __space == space_cluster (due to parameter type constraint) +// __scope == scope_cluster (due to parameter type constraint) +# if _CCCL_CUDA_COMPILER(NVHPC) || __CUDA_ARCH__ >= 900 + asm volatile("fence.proxy.async::generic.acquire.sync_restrict::shared::cluster.cluster;" : : : "memory"); +# else + // Unsupported architectures will have a linker error with a semi-decent error message + __cuda_ptx_fence_proxy_async_generic_sync_restrict_is_not_supported_before_SM_90__(); +# endif +} +#endif // __cccl_ptx_isa >= 860 + +/* +// fence.proxy.async::generic.sem.sync_restrict::space.scope; // PTX ISA 86, SM_90 +// .sem = { .release } +// .space = { .shared::cta } +// .scope = { .cluster } +template +__device__ static inline void fence_proxy_async_generic_sync_restrict( + cuda::ptx::sem_release_t, + cuda::ptx::space_shared_t, + cuda::ptx::scope_cluster_t); +*/ +#if __cccl_ptx_isa >= 860 +extern "C" _CCCL_DEVICE void __cuda_ptx_fence_proxy_async_generic_sync_restrict_is_not_supported_before_SM_90__(); +template +_CCCL_DEVICE static inline void fence_proxy_async_generic_sync_restrict(sem_release_t, space_shared_t, scope_cluster_t) +{ +// __sem == sem_release (due to parameter type constraint) +// __space == space_shared (due to parameter type constraint) +// __scope == scope_cluster (due to parameter type constraint) +# if _CCCL_CUDA_COMPILER(NVHPC) || __CUDA_ARCH__ >= 900 + asm volatile("fence.proxy.async::generic.release.sync_restrict::shared::cta.cluster;" : : : "memory"); +# else + // Unsupported architectures will have a linker error with a semi-decent error message + __cuda_ptx_fence_proxy_async_generic_sync_restrict_is_not_supported_before_SM_90__(); +# endif +} +#endif // __cccl_ptx_isa >= 860 + +#endif // _CUDA_PTX_GENERATED_FENCE_PROXY_ASYNC_GENERIC_SYNC_RESTRICT_H_ diff --git a/libcudacxx/include/cuda/__ptx/instructions/generated/fence_proxy_tensormap_generic.h b/libcudacxx/include/cuda/__ptx/instructions/generated/fence_proxy_tensormap_generic.h index 1e6119ee032..8988292b6d3 100644 --- a/libcudacxx/include/cuda/__ptx/instructions/generated/fence_proxy_tensormap_generic.h +++ b/libcudacxx/include/cuda/__ptx/instructions/generated/fence_proxy_tensormap_generic.h @@ -19,21 +19,27 @@ _CCCL_DEVICE static inline void fence_proxy_tensormap_generic(sem_release_t, sco { // __sem == sem_release (due to parameter type constraint) static_assert(__scope == scope_cta || __scope == scope_cluster || __scope == scope_gpu || __scope == scope_sys, ""); - NV_IF_ELSE_TARGET( - NV_PROVIDES_SM_90, - ( - _CCCL_IF_CONSTEXPR (__scope == scope_cta) { - asm volatile("fence.proxy.tensormap::generic.release.cta; // 7." 
: : : "memory"); - } else _CCCL_IF_CONSTEXPR (__scope == scope_cluster) { - asm volatile("fence.proxy.tensormap::generic.release.cluster; // 7." : : : "memory"); - } else _CCCL_IF_CONSTEXPR (__scope == scope_gpu) { - asm volatile("fence.proxy.tensormap::generic.release.gpu; // 7." : : : "memory"); - } else _CCCL_IF_CONSTEXPR (__scope == scope_sys) { - asm volatile("fence.proxy.tensormap::generic.release.sys; // 7." : : : "memory"); - }), - ( - // Unsupported architectures will have a linker error with a semi-decent error message - __cuda_ptx_fence_proxy_tensormap_generic_is_not_supported_before_SM_90__();)); +# if _CCCL_CUDA_COMPILER(NVHPC) || __CUDA_ARCH__ >= 900 + _CCCL_IF_CONSTEXPR (__scope == scope_cta) + { + asm volatile("fence.proxy.tensormap::generic.release.cta; // 7." : : : "memory"); + } + else _CCCL_IF_CONSTEXPR (__scope == scope_cluster) + { + asm volatile("fence.proxy.tensormap::generic.release.cluster; // 7." : : : "memory"); + } + else _CCCL_IF_CONSTEXPR (__scope == scope_gpu) + { + asm volatile("fence.proxy.tensormap::generic.release.gpu; // 7." : : : "memory"); + } + else _CCCL_IF_CONSTEXPR (__scope == scope_sys) + { + asm volatile("fence.proxy.tensormap::generic.release.sys; // 7." : : : "memory"); + } +# else + // Unsupported architectures will have a linker error with a semi-decent error message + __cuda_ptx_fence_proxy_tensormap_generic_is_not_supported_before_SM_90__(); +# endif } #endif // __cccl_ptx_isa >= 830 @@ -56,33 +62,39 @@ fence_proxy_tensormap_generic(sem_acquire_t, scope_t<_Scope> __scope, const void { // __sem == sem_acquire (due to parameter type constraint) static_assert(__scope == scope_cta || __scope == scope_cluster || __scope == scope_gpu || __scope == scope_sys, ""); - NV_IF_ELSE_TARGET( - NV_PROVIDES_SM_90, - ( - _CCCL_IF_CONSTEXPR (__scope == scope_cta) { - asm volatile("fence.proxy.tensormap::generic.acquire.cta [%0], %1; // 8." - : - : "l"(__addr), "n"(__size.value) - : "memory"); - } else _CCCL_IF_CONSTEXPR (__scope == scope_cluster) { - asm volatile("fence.proxy.tensormap::generic.acquire.cluster [%0], %1; // 8." - : - : "l"(__addr), "n"(__size.value) - : "memory"); - } else _CCCL_IF_CONSTEXPR (__scope == scope_gpu) { - asm volatile("fence.proxy.tensormap::generic.acquire.gpu [%0], %1; // 8." - : - : "l"(__addr), "n"(__size.value) - : "memory"); - } else _CCCL_IF_CONSTEXPR (__scope == scope_sys) { - asm volatile("fence.proxy.tensormap::generic.acquire.sys [%0], %1; // 8." - : - : "l"(__addr), "n"(__size.value) - : "memory"); - }), - ( - // Unsupported architectures will have a linker error with a semi-decent error message - __cuda_ptx_fence_proxy_tensormap_generic_is_not_supported_before_SM_90__();)); +# if _CCCL_CUDA_COMPILER(NVHPC) || __CUDA_ARCH__ >= 900 + _CCCL_IF_CONSTEXPR (__scope == scope_cta) + { + asm volatile("fence.proxy.tensormap::generic.acquire.cta [%0], %1; // 8." + : + : "l"(__addr), "n"(__size.value) + : "memory"); + } + else _CCCL_IF_CONSTEXPR (__scope == scope_cluster) + { + asm volatile("fence.proxy.tensormap::generic.acquire.cluster [%0], %1; // 8." + : + : "l"(__addr), "n"(__size.value) + : "memory"); + } + else _CCCL_IF_CONSTEXPR (__scope == scope_gpu) + { + asm volatile("fence.proxy.tensormap::generic.acquire.gpu [%0], %1; // 8." + : + : "l"(__addr), "n"(__size.value) + : "memory"); + } + else _CCCL_IF_CONSTEXPR (__scope == scope_sys) + { + asm volatile("fence.proxy.tensormap::generic.acquire.sys [%0], %1; // 8." 
+ : + : "l"(__addr), "n"(__size.value) + : "memory"); + } +# else + // Unsupported architectures will have a linker error with a semi-decent error message + __cuda_ptx_fence_proxy_tensormap_generic_is_not_supported_before_SM_90__(); +# endif } #endif // __cccl_ptx_isa >= 830 diff --git a/libcudacxx/include/cuda/__ptx/instructions/generated/fence_sync_restrict.h b/libcudacxx/include/cuda/__ptx/instructions/generated/fence_sync_restrict.h new file mode 100644 index 00000000000..4930bec068b --- /dev/null +++ b/libcudacxx/include/cuda/__ptx/instructions/generated/fence_sync_restrict.h @@ -0,0 +1,62 @@ +// This file was automatically generated. Do not edit. + +#ifndef _CUDA_PTX_GENERATED_FENCE_SYNC_RESTRICT_H_ +#define _CUDA_PTX_GENERATED_FENCE_SYNC_RESTRICT_H_ + +/* +// fence.sem.sync_restrict::space.scope; // PTX ISA 86, SM_90 +// .sem = { .acquire } +// .space = { .shared::cluster } +// .scope = { .cluster } +template +__device__ static inline void fence_sync_restrict( + cuda::ptx::sem_acquire_t, + cuda::ptx::space_cluster_t, + cuda::ptx::scope_cluster_t); +*/ +#if __cccl_ptx_isa >= 860 +extern "C" _CCCL_DEVICE void __cuda_ptx_fence_sync_restrict_is_not_supported_before_SM_90__(); +template +_CCCL_DEVICE static inline void fence_sync_restrict(sem_acquire_t, space_cluster_t, scope_cluster_t) +{ +// __sem == sem_acquire (due to parameter type constraint) +// __space == space_cluster (due to parameter type constraint) +// __scope == scope_cluster (due to parameter type constraint) +# if _CCCL_CUDA_COMPILER(NVHPC) || __CUDA_ARCH__ >= 900 + asm volatile("fence.acquire.sync_restrict::shared::cluster.cluster;" : : : "memory"); +# else + // Unsupported architectures will have a linker error with a semi-decent error message + __cuda_ptx_fence_sync_restrict_is_not_supported_before_SM_90__(); +# endif +} +#endif // __cccl_ptx_isa >= 860 + +/* +// fence.sem.sync_restrict::space.scope; // PTX ISA 86, SM_90 +// .sem = { .release } +// .space = { .shared::cta } +// .scope = { .cluster } +template +__device__ static inline void fence_sync_restrict( + cuda::ptx::sem_release_t, + cuda::ptx::space_shared_t, + cuda::ptx::scope_cluster_t); +*/ +#if __cccl_ptx_isa >= 860 +extern "C" _CCCL_DEVICE void __cuda_ptx_fence_sync_restrict_is_not_supported_before_SM_90__(); +template +_CCCL_DEVICE static inline void fence_sync_restrict(sem_release_t, space_shared_t, scope_cluster_t) +{ +// __sem == sem_release (due to parameter type constraint) +// __space == space_shared (due to parameter type constraint) +// __scope == scope_cluster (due to parameter type constraint) +# if _CCCL_CUDA_COMPILER(NVHPC) || __CUDA_ARCH__ >= 900 + asm volatile("fence.release.sync_restrict::shared::cta.cluster;" : : : "memory"); +# else + // Unsupported architectures will have a linker error with a semi-decent error message + __cuda_ptx_fence_sync_restrict_is_not_supported_before_SM_90__(); +# endif +} +#endif // __cccl_ptx_isa >= 860 + +#endif // _CUDA_PTX_GENERATED_FENCE_SYNC_RESTRICT_H_ diff --git a/libcudacxx/include/cuda/__ptx/instructions/generated/get_sreg.h b/libcudacxx/include/cuda/__ptx/instructions/generated/get_sreg.h index 08128cc00a1..e5c8fa89225 100644 --- a/libcudacxx/include/cuda/__ptx/instructions/generated/get_sreg.h +++ b/libcudacxx/include/cuda/__ptx/instructions/generated/get_sreg.h @@ -133,17 +133,15 @@ extern "C" _CCCL_DEVICE void __cuda_ptx_get_sreg_nwarpid_is_not_supported_before template _CCCL_DEVICE static inline _CUDA_VSTD::uint32_t get_sreg_nwarpid() { - NV_IF_ELSE_TARGET( - NV_PROVIDES_SM_35, - 
(_CUDA_VSTD::uint32_t __sreg_value; asm volatile( - "mov.u32 %0, %%nwarpid;" - : "=r"(__sreg_value) - : - :); - return __sreg_value;), - ( - // Unsupported architectures will have a linker error with a semi-decent error message - __cuda_ptx_get_sreg_nwarpid_is_not_supported_before_SM_35__(); return 0;)); +# if _CCCL_CUDA_COMPILER(NVHPC) || __CUDA_ARCH__ >= 350 + _CUDA_VSTD::uint32_t __sreg_value; + asm volatile("mov.u32 %0, %%nwarpid;" : "=r"(__sreg_value) : :); + return __sreg_value; +# else + // Unsupported architectures will have a linker error with a semi-decent error message + __cuda_ptx_get_sreg_nwarpid_is_not_supported_before_SM_35__(); + return 0; +# endif } #endif // __cccl_ptx_isa >= 200 @@ -262,17 +260,15 @@ extern "C" _CCCL_DEVICE void __cuda_ptx_get_sreg_nsmid_is_not_supported_before_S template _CCCL_DEVICE static inline _CUDA_VSTD::uint32_t get_sreg_nsmid() { - NV_IF_ELSE_TARGET( - NV_PROVIDES_SM_35, - (_CUDA_VSTD::uint32_t __sreg_value; asm volatile( - "mov.u32 %0, %%nsmid;" - : "=r"(__sreg_value) - : - :); - return __sreg_value;), - ( - // Unsupported architectures will have a linker error with a semi-decent error message - __cuda_ptx_get_sreg_nsmid_is_not_supported_before_SM_35__(); return 0;)); +# if _CCCL_CUDA_COMPILER(NVHPC) || __CUDA_ARCH__ >= 350 + _CUDA_VSTD::uint32_t __sreg_value; + asm volatile("mov.u32 %0, %%nsmid;" : "=r"(__sreg_value) : :); + return __sreg_value; +# else + // Unsupported architectures will have a linker error with a semi-decent error message + __cuda_ptx_get_sreg_nsmid_is_not_supported_before_SM_35__(); + return 0; +# endif } #endif // __cccl_ptx_isa >= 200 @@ -301,20 +297,21 @@ extern "C" _CCCL_DEVICE void __cuda_ptx_get_sreg_is_explicit_cluster_is_not_supp template _CCCL_DEVICE static inline bool get_sreg_is_explicit_cluster() { - NV_IF_ELSE_TARGET( - NV_PROVIDES_SM_90, - (_CUDA_VSTD::uint32_t __sreg_value; - asm("{\n\t .reg .pred P_OUT; \n\t" - "mov.pred P_OUT, %%is_explicit_cluster;\n\t" - "selp.b32 %0, 1, 0, P_OUT; \n" - "}" - : "=r"(__sreg_value) - : - :); - return static_cast(__sreg_value);), - ( - // Unsupported architectures will have a linker error with a semi-decent error message - __cuda_ptx_get_sreg_is_explicit_cluster_is_not_supported_before_SM_90__(); return false;)); +# if _CCCL_CUDA_COMPILER(NVHPC) || __CUDA_ARCH__ >= 900 + _CUDA_VSTD::uint32_t __sreg_value; + asm("{\n\t .reg .pred P_OUT; \n\t" + "mov.pred P_OUT, %%is_explicit_cluster;\n\t" + "selp.b32 %0, 1, 0, P_OUT; \n" + "}" + : "=r"(__sreg_value) + : + :); + return static_cast(__sreg_value); +# else + // Unsupported architectures will have a linker error with a semi-decent error message + __cuda_ptx_get_sreg_is_explicit_cluster_is_not_supported_before_SM_90__(); + return false; +# endif } #endif // __cccl_ptx_isa >= 780 @@ -328,17 +325,15 @@ extern "C" _CCCL_DEVICE void __cuda_ptx_get_sreg_clusterid_x_is_not_supported_be template _CCCL_DEVICE static inline _CUDA_VSTD::uint32_t get_sreg_clusterid_x() { - NV_IF_ELSE_TARGET( - NV_PROVIDES_SM_90, - (_CUDA_VSTD::uint32_t __sreg_value; - asm("mov.u32 %0, %%clusterid.x;" - : "=r"(__sreg_value) - : - :); - return __sreg_value;), - ( - // Unsupported architectures will have a linker error with a semi-decent error message - __cuda_ptx_get_sreg_clusterid_x_is_not_supported_before_SM_90__(); return 0;)); +# if _CCCL_CUDA_COMPILER(NVHPC) || __CUDA_ARCH__ >= 900 + _CUDA_VSTD::uint32_t __sreg_value; + asm("mov.u32 %0, %%clusterid.x;" : "=r"(__sreg_value) : :); + return __sreg_value; +# else + // Unsupported architectures will have a 
linker error with a semi-decent error message + __cuda_ptx_get_sreg_clusterid_x_is_not_supported_before_SM_90__(); + return 0; +# endif } #endif // __cccl_ptx_isa >= 780 @@ -352,17 +347,15 @@ extern "C" _CCCL_DEVICE void __cuda_ptx_get_sreg_clusterid_y_is_not_supported_be template _CCCL_DEVICE static inline _CUDA_VSTD::uint32_t get_sreg_clusterid_y() { - NV_IF_ELSE_TARGET( - NV_PROVIDES_SM_90, - (_CUDA_VSTD::uint32_t __sreg_value; - asm("mov.u32 %0, %%clusterid.y;" - : "=r"(__sreg_value) - : - :); - return __sreg_value;), - ( - // Unsupported architectures will have a linker error with a semi-decent error message - __cuda_ptx_get_sreg_clusterid_y_is_not_supported_before_SM_90__(); return 0;)); +# if _CCCL_CUDA_COMPILER(NVHPC) || __CUDA_ARCH__ >= 900 + _CUDA_VSTD::uint32_t __sreg_value; + asm("mov.u32 %0, %%clusterid.y;" : "=r"(__sreg_value) : :); + return __sreg_value; +# else + // Unsupported architectures will have a linker error with a semi-decent error message + __cuda_ptx_get_sreg_clusterid_y_is_not_supported_before_SM_90__(); + return 0; +# endif } #endif // __cccl_ptx_isa >= 780 @@ -376,17 +369,15 @@ extern "C" _CCCL_DEVICE void __cuda_ptx_get_sreg_clusterid_z_is_not_supported_be template _CCCL_DEVICE static inline _CUDA_VSTD::uint32_t get_sreg_clusterid_z() { - NV_IF_ELSE_TARGET( - NV_PROVIDES_SM_90, - (_CUDA_VSTD::uint32_t __sreg_value; - asm("mov.u32 %0, %%clusterid.z;" - : "=r"(__sreg_value) - : - :); - return __sreg_value;), - ( - // Unsupported architectures will have a linker error with a semi-decent error message - __cuda_ptx_get_sreg_clusterid_z_is_not_supported_before_SM_90__(); return 0;)); +# if _CCCL_CUDA_COMPILER(NVHPC) || __CUDA_ARCH__ >= 900 + _CUDA_VSTD::uint32_t __sreg_value; + asm("mov.u32 %0, %%clusterid.z;" : "=r"(__sreg_value) : :); + return __sreg_value; +# else + // Unsupported architectures will have a linker error with a semi-decent error message + __cuda_ptx_get_sreg_clusterid_z_is_not_supported_before_SM_90__(); + return 0; +# endif } #endif // __cccl_ptx_isa >= 780 @@ -400,17 +391,15 @@ extern "C" _CCCL_DEVICE void __cuda_ptx_get_sreg_nclusterid_x_is_not_supported_b template _CCCL_DEVICE static inline _CUDA_VSTD::uint32_t get_sreg_nclusterid_x() { - NV_IF_ELSE_TARGET( - NV_PROVIDES_SM_90, - (_CUDA_VSTD::uint32_t __sreg_value; - asm("mov.u32 %0, %%nclusterid.x;" - : "=r"(__sreg_value) - : - :); - return __sreg_value;), - ( - // Unsupported architectures will have a linker error with a semi-decent error message - __cuda_ptx_get_sreg_nclusterid_x_is_not_supported_before_SM_90__(); return 0;)); +# if _CCCL_CUDA_COMPILER(NVHPC) || __CUDA_ARCH__ >= 900 + _CUDA_VSTD::uint32_t __sreg_value; + asm("mov.u32 %0, %%nclusterid.x;" : "=r"(__sreg_value) : :); + return __sreg_value; +# else + // Unsupported architectures will have a linker error with a semi-decent error message + __cuda_ptx_get_sreg_nclusterid_x_is_not_supported_before_SM_90__(); + return 0; +# endif } #endif // __cccl_ptx_isa >= 780 @@ -424,17 +413,15 @@ extern "C" _CCCL_DEVICE void __cuda_ptx_get_sreg_nclusterid_y_is_not_supported_b template _CCCL_DEVICE static inline _CUDA_VSTD::uint32_t get_sreg_nclusterid_y() { - NV_IF_ELSE_TARGET( - NV_PROVIDES_SM_90, - (_CUDA_VSTD::uint32_t __sreg_value; - asm("mov.u32 %0, %%nclusterid.y;" - : "=r"(__sreg_value) - : - :); - return __sreg_value;), - ( - // Unsupported architectures will have a linker error with a semi-decent error message - __cuda_ptx_get_sreg_nclusterid_y_is_not_supported_before_SM_90__(); return 0;)); +# if _CCCL_CUDA_COMPILER(NVHPC) || 
__CUDA_ARCH__ >= 900 + _CUDA_VSTD::uint32_t __sreg_value; + asm("mov.u32 %0, %%nclusterid.y;" : "=r"(__sreg_value) : :); + return __sreg_value; +# else + // Unsupported architectures will have a linker error with a semi-decent error message + __cuda_ptx_get_sreg_nclusterid_y_is_not_supported_before_SM_90__(); + return 0; +# endif } #endif // __cccl_ptx_isa >= 780 @@ -448,17 +435,15 @@ extern "C" _CCCL_DEVICE void __cuda_ptx_get_sreg_nclusterid_z_is_not_supported_b template _CCCL_DEVICE static inline _CUDA_VSTD::uint32_t get_sreg_nclusterid_z() { - NV_IF_ELSE_TARGET( - NV_PROVIDES_SM_90, - (_CUDA_VSTD::uint32_t __sreg_value; - asm("mov.u32 %0, %%nclusterid.z;" - : "=r"(__sreg_value) - : - :); - return __sreg_value;), - ( - // Unsupported architectures will have a linker error with a semi-decent error message - __cuda_ptx_get_sreg_nclusterid_z_is_not_supported_before_SM_90__(); return 0;)); +# if _CCCL_CUDA_COMPILER(NVHPC) || __CUDA_ARCH__ >= 900 + _CUDA_VSTD::uint32_t __sreg_value; + asm("mov.u32 %0, %%nclusterid.z;" : "=r"(__sreg_value) : :); + return __sreg_value; +# else + // Unsupported architectures will have a linker error with a semi-decent error message + __cuda_ptx_get_sreg_nclusterid_z_is_not_supported_before_SM_90__(); + return 0; +# endif } #endif // __cccl_ptx_isa >= 780 @@ -472,17 +457,15 @@ extern "C" _CCCL_DEVICE void __cuda_ptx_get_sreg_cluster_ctaid_x_is_not_supporte template _CCCL_DEVICE static inline _CUDA_VSTD::uint32_t get_sreg_cluster_ctaid_x() { - NV_IF_ELSE_TARGET( - NV_PROVIDES_SM_90, - (_CUDA_VSTD::uint32_t __sreg_value; - asm("mov.u32 %0, %%cluster_ctaid.x;" - : "=r"(__sreg_value) - : - :); - return __sreg_value;), - ( - // Unsupported architectures will have a linker error with a semi-decent error message - __cuda_ptx_get_sreg_cluster_ctaid_x_is_not_supported_before_SM_90__(); return 0;)); +# if _CCCL_CUDA_COMPILER(NVHPC) || __CUDA_ARCH__ >= 900 + _CUDA_VSTD::uint32_t __sreg_value; + asm("mov.u32 %0, %%cluster_ctaid.x;" : "=r"(__sreg_value) : :); + return __sreg_value; +# else + // Unsupported architectures will have a linker error with a semi-decent error message + __cuda_ptx_get_sreg_cluster_ctaid_x_is_not_supported_before_SM_90__(); + return 0; +# endif } #endif // __cccl_ptx_isa >= 780 @@ -496,17 +479,15 @@ extern "C" _CCCL_DEVICE void __cuda_ptx_get_sreg_cluster_ctaid_y_is_not_supporte template _CCCL_DEVICE static inline _CUDA_VSTD::uint32_t get_sreg_cluster_ctaid_y() { - NV_IF_ELSE_TARGET( - NV_PROVIDES_SM_90, - (_CUDA_VSTD::uint32_t __sreg_value; - asm("mov.u32 %0, %%cluster_ctaid.y;" - : "=r"(__sreg_value) - : - :); - return __sreg_value;), - ( - // Unsupported architectures will have a linker error with a semi-decent error message - __cuda_ptx_get_sreg_cluster_ctaid_y_is_not_supported_before_SM_90__(); return 0;)); +# if _CCCL_CUDA_COMPILER(NVHPC) || __CUDA_ARCH__ >= 900 + _CUDA_VSTD::uint32_t __sreg_value; + asm("mov.u32 %0, %%cluster_ctaid.y;" : "=r"(__sreg_value) : :); + return __sreg_value; +# else + // Unsupported architectures will have a linker error with a semi-decent error message + __cuda_ptx_get_sreg_cluster_ctaid_y_is_not_supported_before_SM_90__(); + return 0; +# endif } #endif // __cccl_ptx_isa >= 780 @@ -520,17 +501,15 @@ extern "C" _CCCL_DEVICE void __cuda_ptx_get_sreg_cluster_ctaid_z_is_not_supporte template _CCCL_DEVICE static inline _CUDA_VSTD::uint32_t get_sreg_cluster_ctaid_z() { - NV_IF_ELSE_TARGET( - NV_PROVIDES_SM_90, - (_CUDA_VSTD::uint32_t __sreg_value; - asm("mov.u32 %0, %%cluster_ctaid.z;" - : "=r"(__sreg_value) - : - :); - 
return __sreg_value;), - ( - // Unsupported architectures will have a linker error with a semi-decent error message - __cuda_ptx_get_sreg_cluster_ctaid_z_is_not_supported_before_SM_90__(); return 0;)); +# if _CCCL_CUDA_COMPILER(NVHPC) || __CUDA_ARCH__ >= 900 + _CUDA_VSTD::uint32_t __sreg_value; + asm("mov.u32 %0, %%cluster_ctaid.z;" : "=r"(__sreg_value) : :); + return __sreg_value; +# else + // Unsupported architectures will have a linker error with a semi-decent error message + __cuda_ptx_get_sreg_cluster_ctaid_z_is_not_supported_before_SM_90__(); + return 0; +# endif } #endif // __cccl_ptx_isa >= 780 @@ -544,17 +523,15 @@ extern "C" _CCCL_DEVICE void __cuda_ptx_get_sreg_cluster_nctaid_x_is_not_support template _CCCL_DEVICE static inline _CUDA_VSTD::uint32_t get_sreg_cluster_nctaid_x() { - NV_IF_ELSE_TARGET( - NV_PROVIDES_SM_90, - (_CUDA_VSTD::uint32_t __sreg_value; - asm("mov.u32 %0, %%cluster_nctaid.x;" - : "=r"(__sreg_value) - : - :); - return __sreg_value;), - ( - // Unsupported architectures will have a linker error with a semi-decent error message - __cuda_ptx_get_sreg_cluster_nctaid_x_is_not_supported_before_SM_90__(); return 0;)); +# if _CCCL_CUDA_COMPILER(NVHPC) || __CUDA_ARCH__ >= 900 + _CUDA_VSTD::uint32_t __sreg_value; + asm("mov.u32 %0, %%cluster_nctaid.x;" : "=r"(__sreg_value) : :); + return __sreg_value; +# else + // Unsupported architectures will have a linker error with a semi-decent error message + __cuda_ptx_get_sreg_cluster_nctaid_x_is_not_supported_before_SM_90__(); + return 0; +# endif } #endif // __cccl_ptx_isa >= 780 @@ -568,17 +545,15 @@ extern "C" _CCCL_DEVICE void __cuda_ptx_get_sreg_cluster_nctaid_y_is_not_support template _CCCL_DEVICE static inline _CUDA_VSTD::uint32_t get_sreg_cluster_nctaid_y() { - NV_IF_ELSE_TARGET( - NV_PROVIDES_SM_90, - (_CUDA_VSTD::uint32_t __sreg_value; - asm("mov.u32 %0, %%cluster_nctaid.y;" - : "=r"(__sreg_value) - : - :); - return __sreg_value;), - ( - // Unsupported architectures will have a linker error with a semi-decent error message - __cuda_ptx_get_sreg_cluster_nctaid_y_is_not_supported_before_SM_90__(); return 0;)); +# if _CCCL_CUDA_COMPILER(NVHPC) || __CUDA_ARCH__ >= 900 + _CUDA_VSTD::uint32_t __sreg_value; + asm("mov.u32 %0, %%cluster_nctaid.y;" : "=r"(__sreg_value) : :); + return __sreg_value; +# else + // Unsupported architectures will have a linker error with a semi-decent error message + __cuda_ptx_get_sreg_cluster_nctaid_y_is_not_supported_before_SM_90__(); + return 0; +# endif } #endif // __cccl_ptx_isa >= 780 @@ -592,17 +567,15 @@ extern "C" _CCCL_DEVICE void __cuda_ptx_get_sreg_cluster_nctaid_z_is_not_support template _CCCL_DEVICE static inline _CUDA_VSTD::uint32_t get_sreg_cluster_nctaid_z() { - NV_IF_ELSE_TARGET( - NV_PROVIDES_SM_90, - (_CUDA_VSTD::uint32_t __sreg_value; - asm("mov.u32 %0, %%cluster_nctaid.z;" - : "=r"(__sreg_value) - : - :); - return __sreg_value;), - ( - // Unsupported architectures will have a linker error with a semi-decent error message - __cuda_ptx_get_sreg_cluster_nctaid_z_is_not_supported_before_SM_90__(); return 0;)); +# if _CCCL_CUDA_COMPILER(NVHPC) || __CUDA_ARCH__ >= 900 + _CUDA_VSTD::uint32_t __sreg_value; + asm("mov.u32 %0, %%cluster_nctaid.z;" : "=r"(__sreg_value) : :); + return __sreg_value; +# else + // Unsupported architectures will have a linker error with a semi-decent error message + __cuda_ptx_get_sreg_cluster_nctaid_z_is_not_supported_before_SM_90__(); + return 0; +# endif } #endif // __cccl_ptx_isa >= 780 @@ -616,17 +589,15 @@ extern "C" _CCCL_DEVICE void 
__cuda_ptx_get_sreg_cluster_ctarank_is_not_supporte template _CCCL_DEVICE static inline _CUDA_VSTD::uint32_t get_sreg_cluster_ctarank() { - NV_IF_ELSE_TARGET( - NV_PROVIDES_SM_90, - (_CUDA_VSTD::uint32_t __sreg_value; - asm("mov.u32 %0, %%cluster_ctarank;" - : "=r"(__sreg_value) - : - :); - return __sreg_value;), - ( - // Unsupported architectures will have a linker error with a semi-decent error message - __cuda_ptx_get_sreg_cluster_ctarank_is_not_supported_before_SM_90__(); return 0;)); +# if _CCCL_CUDA_COMPILER(NVHPC) || __CUDA_ARCH__ >= 900 + _CUDA_VSTD::uint32_t __sreg_value; + asm("mov.u32 %0, %%cluster_ctarank;" : "=r"(__sreg_value) : :); + return __sreg_value; +# else + // Unsupported architectures will have a linker error with a semi-decent error message + __cuda_ptx_get_sreg_cluster_ctarank_is_not_supported_before_SM_90__(); + return 0; +# endif } #endif // __cccl_ptx_isa >= 780 @@ -640,17 +611,15 @@ extern "C" _CCCL_DEVICE void __cuda_ptx_get_sreg_cluster_nctarank_is_not_support template _CCCL_DEVICE static inline _CUDA_VSTD::uint32_t get_sreg_cluster_nctarank() { - NV_IF_ELSE_TARGET( - NV_PROVIDES_SM_90, - (_CUDA_VSTD::uint32_t __sreg_value; - asm("mov.u32 %0, %%cluster_nctarank;" - : "=r"(__sreg_value) - : - :); - return __sreg_value;), - ( - // Unsupported architectures will have a linker error with a semi-decent error message - __cuda_ptx_get_sreg_cluster_nctarank_is_not_supported_before_SM_90__(); return 0;)); +# if _CCCL_CUDA_COMPILER(NVHPC) || __CUDA_ARCH__ >= 900 + _CUDA_VSTD::uint32_t __sreg_value; + asm("mov.u32 %0, %%cluster_nctarank;" : "=r"(__sreg_value) : :); + return __sreg_value; +# else + // Unsupported architectures will have a linker error with a semi-decent error message + __cuda_ptx_get_sreg_cluster_nctarank_is_not_supported_before_SM_90__(); + return 0; +# endif } #endif // __cccl_ptx_isa >= 780 @@ -664,17 +633,15 @@ extern "C" _CCCL_DEVICE void __cuda_ptx_get_sreg_lanemask_eq_is_not_supported_be template _CCCL_DEVICE static inline _CUDA_VSTD::uint32_t get_sreg_lanemask_eq() { - NV_IF_ELSE_TARGET( - NV_PROVIDES_SM_35, - (_CUDA_VSTD::uint32_t __sreg_value; - asm("mov.u32 %0, %%lanemask_eq;" - : "=r"(__sreg_value) - : - :); - return __sreg_value;), - ( - // Unsupported architectures will have a linker error with a semi-decent error message - __cuda_ptx_get_sreg_lanemask_eq_is_not_supported_before_SM_35__(); return 0;)); +# if _CCCL_CUDA_COMPILER(NVHPC) || __CUDA_ARCH__ >= 350 + _CUDA_VSTD::uint32_t __sreg_value; + asm("mov.u32 %0, %%lanemask_eq;" : "=r"(__sreg_value) : :); + return __sreg_value; +# else + // Unsupported architectures will have a linker error with a semi-decent error message + __cuda_ptx_get_sreg_lanemask_eq_is_not_supported_before_SM_35__(); + return 0; +# endif } #endif // __cccl_ptx_isa >= 200 @@ -688,17 +655,15 @@ extern "C" _CCCL_DEVICE void __cuda_ptx_get_sreg_lanemask_le_is_not_supported_be template _CCCL_DEVICE static inline _CUDA_VSTD::uint32_t get_sreg_lanemask_le() { - NV_IF_ELSE_TARGET( - NV_PROVIDES_SM_35, - (_CUDA_VSTD::uint32_t __sreg_value; - asm("mov.u32 %0, %%lanemask_le;" - : "=r"(__sreg_value) - : - :); - return __sreg_value;), - ( - // Unsupported architectures will have a linker error with a semi-decent error message - __cuda_ptx_get_sreg_lanemask_le_is_not_supported_before_SM_35__(); return 0;)); +# if _CCCL_CUDA_COMPILER(NVHPC) || __CUDA_ARCH__ >= 350 + _CUDA_VSTD::uint32_t __sreg_value; + asm("mov.u32 %0, %%lanemask_le;" : "=r"(__sreg_value) : :); + return __sreg_value; +# else + // Unsupported architectures will have 
a linker error with a semi-decent error message + __cuda_ptx_get_sreg_lanemask_le_is_not_supported_before_SM_35__(); + return 0; +# endif } #endif // __cccl_ptx_isa >= 200 @@ -712,17 +677,15 @@ extern "C" _CCCL_DEVICE void __cuda_ptx_get_sreg_lanemask_lt_is_not_supported_be template _CCCL_DEVICE static inline _CUDA_VSTD::uint32_t get_sreg_lanemask_lt() { - NV_IF_ELSE_TARGET( - NV_PROVIDES_SM_35, - (_CUDA_VSTD::uint32_t __sreg_value; - asm("mov.u32 %0, %%lanemask_lt;" - : "=r"(__sreg_value) - : - :); - return __sreg_value;), - ( - // Unsupported architectures will have a linker error with a semi-decent error message - __cuda_ptx_get_sreg_lanemask_lt_is_not_supported_before_SM_35__(); return 0;)); +# if _CCCL_CUDA_COMPILER(NVHPC) || __CUDA_ARCH__ >= 350 + _CUDA_VSTD::uint32_t __sreg_value; + asm("mov.u32 %0, %%lanemask_lt;" : "=r"(__sreg_value) : :); + return __sreg_value; +# else + // Unsupported architectures will have a linker error with a semi-decent error message + __cuda_ptx_get_sreg_lanemask_lt_is_not_supported_before_SM_35__(); + return 0; +# endif } #endif // __cccl_ptx_isa >= 200 @@ -736,17 +699,15 @@ extern "C" _CCCL_DEVICE void __cuda_ptx_get_sreg_lanemask_ge_is_not_supported_be template _CCCL_DEVICE static inline _CUDA_VSTD::uint32_t get_sreg_lanemask_ge() { - NV_IF_ELSE_TARGET( - NV_PROVIDES_SM_35, - (_CUDA_VSTD::uint32_t __sreg_value; - asm("mov.u32 %0, %%lanemask_ge;" - : "=r"(__sreg_value) - : - :); - return __sreg_value;), - ( - // Unsupported architectures will have a linker error with a semi-decent error message - __cuda_ptx_get_sreg_lanemask_ge_is_not_supported_before_SM_35__(); return 0;)); +# if _CCCL_CUDA_COMPILER(NVHPC) || __CUDA_ARCH__ >= 350 + _CUDA_VSTD::uint32_t __sreg_value; + asm("mov.u32 %0, %%lanemask_ge;" : "=r"(__sreg_value) : :); + return __sreg_value; +# else + // Unsupported architectures will have a linker error with a semi-decent error message + __cuda_ptx_get_sreg_lanemask_ge_is_not_supported_before_SM_35__(); + return 0; +# endif } #endif // __cccl_ptx_isa >= 200 @@ -760,17 +721,15 @@ extern "C" _CCCL_DEVICE void __cuda_ptx_get_sreg_lanemask_gt_is_not_supported_be template _CCCL_DEVICE static inline _CUDA_VSTD::uint32_t get_sreg_lanemask_gt() { - NV_IF_ELSE_TARGET( - NV_PROVIDES_SM_35, - (_CUDA_VSTD::uint32_t __sreg_value; - asm("mov.u32 %0, %%lanemask_gt;" - : "=r"(__sreg_value) - : - :); - return __sreg_value;), - ( - // Unsupported architectures will have a linker error with a semi-decent error message - __cuda_ptx_get_sreg_lanemask_gt_is_not_supported_before_SM_35__(); return 0;)); +# if _CCCL_CUDA_COMPILER(NVHPC) || __CUDA_ARCH__ >= 350 + _CUDA_VSTD::uint32_t __sreg_value; + asm("mov.u32 %0, %%lanemask_gt;" : "=r"(__sreg_value) : :); + return __sreg_value; +# else + // Unsupported architectures will have a linker error with a semi-decent error message + __cuda_ptx_get_sreg_lanemask_gt_is_not_supported_before_SM_35__(); + return 0; +# endif } #endif // __cccl_ptx_isa >= 200 @@ -799,17 +758,15 @@ extern "C" _CCCL_DEVICE void __cuda_ptx_get_sreg_clock_hi_is_not_supported_befor template _CCCL_DEVICE static inline _CUDA_VSTD::uint32_t get_sreg_clock_hi() { - NV_IF_ELSE_TARGET( - NV_PROVIDES_SM_35, - (_CUDA_VSTD::uint32_t __sreg_value; asm volatile( - "mov.u32 %0, %%clock_hi;" - : "=r"(__sreg_value) - : - :); - return __sreg_value;), - ( - // Unsupported architectures will have a linker error with a semi-decent error message - __cuda_ptx_get_sreg_clock_hi_is_not_supported_before_SM_35__(); return 0;)); +# if _CCCL_CUDA_COMPILER(NVHPC) || __CUDA_ARCH__ 
>= 350 + _CUDA_VSTD::uint32_t __sreg_value; + asm volatile("mov.u32 %0, %%clock_hi;" : "=r"(__sreg_value) : :); + return __sreg_value; +# else + // Unsupported architectures will have a linker error with a semi-decent error message + __cuda_ptx_get_sreg_clock_hi_is_not_supported_before_SM_35__(); + return 0; +# endif } #endif // __cccl_ptx_isa >= 500 @@ -823,17 +780,15 @@ extern "C" _CCCL_DEVICE void __cuda_ptx_get_sreg_clock64_is_not_supported_before template _CCCL_DEVICE static inline _CUDA_VSTD::uint64_t get_sreg_clock64() { - NV_IF_ELSE_TARGET( - NV_PROVIDES_SM_35, - (_CUDA_VSTD::uint64_t __sreg_value; asm volatile( - "mov.u64 %0, %%clock64;" - : "=l"(__sreg_value) - : - :); - return __sreg_value;), - ( - // Unsupported architectures will have a linker error with a semi-decent error message - __cuda_ptx_get_sreg_clock64_is_not_supported_before_SM_35__(); return 0;)); +# if _CCCL_CUDA_COMPILER(NVHPC) || __CUDA_ARCH__ >= 350 + _CUDA_VSTD::uint64_t __sreg_value; + asm volatile("mov.u64 %0, %%clock64;" : "=l"(__sreg_value) : :); + return __sreg_value; +# else + // Unsupported architectures will have a linker error with a semi-decent error message + __cuda_ptx_get_sreg_clock64_is_not_supported_before_SM_35__(); + return 0; +# endif } #endif // __cccl_ptx_isa >= 200 @@ -847,17 +802,15 @@ extern "C" _CCCL_DEVICE void __cuda_ptx_get_sreg_globaltimer_is_not_supported_be template _CCCL_DEVICE static inline _CUDA_VSTD::uint64_t get_sreg_globaltimer() { - NV_IF_ELSE_TARGET( - NV_PROVIDES_SM_35, - (_CUDA_VSTD::uint64_t __sreg_value; asm volatile( - "mov.u64 %0, %%globaltimer;" - : "=l"(__sreg_value) - : - :); - return __sreg_value;), - ( - // Unsupported architectures will have a linker error with a semi-decent error message - __cuda_ptx_get_sreg_globaltimer_is_not_supported_before_SM_35__(); return 0;)); +# if _CCCL_CUDA_COMPILER(NVHPC) || __CUDA_ARCH__ >= 350 + _CUDA_VSTD::uint64_t __sreg_value; + asm volatile("mov.u64 %0, %%globaltimer;" : "=l"(__sreg_value) : :); + return __sreg_value; +# else + // Unsupported architectures will have a linker error with a semi-decent error message + __cuda_ptx_get_sreg_globaltimer_is_not_supported_before_SM_35__(); + return 0; +# endif } #endif // __cccl_ptx_isa >= 310 @@ -871,17 +824,15 @@ extern "C" _CCCL_DEVICE void __cuda_ptx_get_sreg_globaltimer_lo_is_not_supported template _CCCL_DEVICE static inline _CUDA_VSTD::uint32_t get_sreg_globaltimer_lo() { - NV_IF_ELSE_TARGET( - NV_PROVIDES_SM_35, - (_CUDA_VSTD::uint32_t __sreg_value; asm volatile( - "mov.u32 %0, %%globaltimer_lo;" - : "=r"(__sreg_value) - : - :); - return __sreg_value;), - ( - // Unsupported architectures will have a linker error with a semi-decent error message - __cuda_ptx_get_sreg_globaltimer_lo_is_not_supported_before_SM_35__(); return 0;)); +# if _CCCL_CUDA_COMPILER(NVHPC) || __CUDA_ARCH__ >= 350 + _CUDA_VSTD::uint32_t __sreg_value; + asm volatile("mov.u32 %0, %%globaltimer_lo;" : "=r"(__sreg_value) : :); + return __sreg_value; +# else + // Unsupported architectures will have a linker error with a semi-decent error message + __cuda_ptx_get_sreg_globaltimer_lo_is_not_supported_before_SM_35__(); + return 0; +# endif } #endif // __cccl_ptx_isa >= 310 @@ -895,17 +846,15 @@ extern "C" _CCCL_DEVICE void __cuda_ptx_get_sreg_globaltimer_hi_is_not_supported template _CCCL_DEVICE static inline _CUDA_VSTD::uint32_t get_sreg_globaltimer_hi() { - NV_IF_ELSE_TARGET( - NV_PROVIDES_SM_35, - (_CUDA_VSTD::uint32_t __sreg_value; asm volatile( - "mov.u32 %0, %%globaltimer_hi;" - : "=r"(__sreg_value) - : - :); - 
return __sreg_value;), - ( - // Unsupported architectures will have a linker error with a semi-decent error message - __cuda_ptx_get_sreg_globaltimer_hi_is_not_supported_before_SM_35__(); return 0;)); +# if _CCCL_CUDA_COMPILER(NVHPC) || __CUDA_ARCH__ >= 350 + _CUDA_VSTD::uint32_t __sreg_value; + asm volatile("mov.u32 %0, %%globaltimer_hi;" : "=r"(__sreg_value) : :); + return __sreg_value; +# else + // Unsupported architectures will have a linker error with a semi-decent error message + __cuda_ptx_get_sreg_globaltimer_hi_is_not_supported_before_SM_35__(); + return 0; +# endif } #endif // __cccl_ptx_isa >= 310 @@ -919,17 +868,15 @@ extern "C" _CCCL_DEVICE void __cuda_ptx_get_sreg_total_smem_size_is_not_supporte template _CCCL_DEVICE static inline _CUDA_VSTD::uint32_t get_sreg_total_smem_size() { - NV_IF_ELSE_TARGET( - NV_PROVIDES_SM_35, - (_CUDA_VSTD::uint32_t __sreg_value; - asm("mov.u32 %0, %%total_smem_size;" - : "=r"(__sreg_value) - : - :); - return __sreg_value;), - ( - // Unsupported architectures will have a linker error with a semi-decent error message - __cuda_ptx_get_sreg_total_smem_size_is_not_supported_before_SM_35__(); return 0;)); +# if _CCCL_CUDA_COMPILER(NVHPC) || __CUDA_ARCH__ >= 350 + _CUDA_VSTD::uint32_t __sreg_value; + asm("mov.u32 %0, %%total_smem_size;" : "=r"(__sreg_value) : :); + return __sreg_value; +# else + // Unsupported architectures will have a linker error with a semi-decent error message + __cuda_ptx_get_sreg_total_smem_size_is_not_supported_before_SM_35__(); + return 0; +# endif } #endif // __cccl_ptx_isa >= 410 @@ -943,17 +890,15 @@ extern "C" _CCCL_DEVICE void __cuda_ptx_get_sreg_aggr_smem_size_is_not_supported template _CCCL_DEVICE static inline _CUDA_VSTD::uint32_t get_sreg_aggr_smem_size() { - NV_IF_ELSE_TARGET( - NV_PROVIDES_SM_90, - (_CUDA_VSTD::uint32_t __sreg_value; - asm("mov.u32 %0, %%aggr_smem_size;" - : "=r"(__sreg_value) - : - :); - return __sreg_value;), - ( - // Unsupported architectures will have a linker error with a semi-decent error message - __cuda_ptx_get_sreg_aggr_smem_size_is_not_supported_before_SM_90__(); return 0;)); +# if _CCCL_CUDA_COMPILER(NVHPC) || __CUDA_ARCH__ >= 900 + _CUDA_VSTD::uint32_t __sreg_value; + asm("mov.u32 %0, %%aggr_smem_size;" : "=r"(__sreg_value) : :); + return __sreg_value; +# else + // Unsupported architectures will have a linker error with a semi-decent error message + __cuda_ptx_get_sreg_aggr_smem_size_is_not_supported_before_SM_90__(); + return 0; +# endif } #endif // __cccl_ptx_isa >= 810 @@ -967,17 +912,15 @@ extern "C" _CCCL_DEVICE void __cuda_ptx_get_sreg_dynamic_smem_size_is_not_suppor template _CCCL_DEVICE static inline _CUDA_VSTD::uint32_t get_sreg_dynamic_smem_size() { - NV_IF_ELSE_TARGET( - NV_PROVIDES_SM_35, - (_CUDA_VSTD::uint32_t __sreg_value; - asm("mov.u32 %0, %%dynamic_smem_size;" - : "=r"(__sreg_value) - : - :); - return __sreg_value;), - ( - // Unsupported architectures will have a linker error with a semi-decent error message - __cuda_ptx_get_sreg_dynamic_smem_size_is_not_supported_before_SM_35__(); return 0;)); +# if _CCCL_CUDA_COMPILER(NVHPC) || __CUDA_ARCH__ >= 350 + _CUDA_VSTD::uint32_t __sreg_value; + asm("mov.u32 %0, %%dynamic_smem_size;" : "=r"(__sreg_value) : :); + return __sreg_value; +# else + // Unsupported architectures will have a linker error with a semi-decent error message + __cuda_ptx_get_sreg_dynamic_smem_size_is_not_supported_before_SM_35__(); + return 0; +# endif } #endif // __cccl_ptx_isa >= 410 @@ -991,17 +934,15 @@ extern "C" _CCCL_DEVICE void 
__cuda_ptx_get_sreg_current_graph_exec_is_not_suppo template _CCCL_DEVICE static inline _CUDA_VSTD::uint64_t get_sreg_current_graph_exec() { - NV_IF_ELSE_TARGET( - NV_PROVIDES_SM_50, - (_CUDA_VSTD::uint64_t __sreg_value; - asm("mov.u64 %0, %%current_graph_exec;" - : "=l"(__sreg_value) - : - :); - return __sreg_value;), - ( - // Unsupported architectures will have a linker error with a semi-decent error message - __cuda_ptx_get_sreg_current_graph_exec_is_not_supported_before_SM_50__(); return 0;)); +# if _CCCL_CUDA_COMPILER(NVHPC) || __CUDA_ARCH__ >= 500 + _CUDA_VSTD::uint64_t __sreg_value; + asm("mov.u64 %0, %%current_graph_exec;" : "=l"(__sreg_value) : :); + return __sreg_value; +# else + // Unsupported architectures will have a linker error with a semi-decent error message + __cuda_ptx_get_sreg_current_graph_exec_is_not_supported_before_SM_50__(); + return 0; +# endif } #endif // __cccl_ptx_isa >= 800 diff --git a/libcudacxx/include/cuda/__ptx/instructions/generated/getctarank.h b/libcudacxx/include/cuda/__ptx/instructions/generated/getctarank.h index a769868f45c..c78637db3e9 100644 --- a/libcudacxx/include/cuda/__ptx/instructions/generated/getctarank.h +++ b/libcudacxx/include/cuda/__ptx/instructions/generated/getctarank.h @@ -4,7 +4,7 @@ #define _CUDA_PTX_GENERATED_GETCTARANK_H_ /* -// getctarank{.space}.u32 dest, addr; // PTX ISA 78, SM_90 +// getctarank.space.u32 dest, addr; // PTX ISA 78, SM_90 // .space = { .shared::cluster } template __device__ static inline uint32_t getctarank( @@ -16,18 +16,16 @@ extern "C" _CCCL_DEVICE void __cuda_ptx_getctarank_is_not_supported_before_SM_90 template _CCCL_DEVICE static inline _CUDA_VSTD::uint32_t getctarank(space_cluster_t, const void* __addr) { - // __space == space_cluster (due to parameter type constraint) - NV_IF_ELSE_TARGET( - NV_PROVIDES_SM_90, - (_CUDA_VSTD::uint32_t __dest; - asm("getctarank.shared::cluster.u32 %0, %1;" - : "=r"(__dest) - : "r"(__as_ptr_smem(__addr)) - :); - return __dest;), - ( - // Unsupported architectures will have a linker error with a semi-decent error message - __cuda_ptx_getctarank_is_not_supported_before_SM_90__(); return 0;)); +// __space == space_cluster (due to parameter type constraint) +# if _CCCL_CUDA_COMPILER(NVHPC) || __CUDA_ARCH__ >= 900 + _CUDA_VSTD::uint32_t __dest; + asm("getctarank.shared::cluster.u32 %0, %1;" : "=r"(__dest) : "r"(__as_ptr_smem(__addr)) :); + return __dest; +# else + // Unsupported architectures will have a linker error with a semi-decent error message + __cuda_ptx_getctarank_is_not_supported_before_SM_90__(); + return 0; +# endif } #endif // __cccl_ptx_isa >= 780 diff --git a/libcudacxx/include/cuda/__ptx/instructions/generated/mapa.h b/libcudacxx/include/cuda/__ptx/instructions/generated/mapa.h new file mode 100644 index 00000000000..f93c8a62157 --- /dev/null +++ b/libcudacxx/include/cuda/__ptx/instructions/generated/mapa.h @@ -0,0 +1,33 @@ +// This file was automatically generated. Do not edit. 
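A short sketch of how the get_sreg_* wrappers refactored above are called; it assumes the <cuda/ptx> include and an sm_90 device pass for the %cluster_ctarank read, and the kernel and parameter names are illustrative only:

#include <cuda/ptx>
#include <cstdint>

// Sketch only: reads a few of the special registers wrapped by get_sreg_*.
// lanemask/clock registers are available on all architectures these headers
// target; %cluster_ctarank needs sm_90, so it is guarded here.
__global__ void sreg_sketch(std::uint64_t* out)
{
  namespace ptx = cuda::ptx;
  std::uint32_t lane_lt = ptx::get_sreg_lanemask_lt(); // mov.u32 %0, %%lanemask_lt;
  std::uint64_t ticks   = ptx::get_sreg_clock64();     // mov.u64 %0, %%clock64;
  std::uint32_t rank    = 0;
#if __CUDA_ARCH__ >= 900
  rank = ptx::get_sreg_cluster_ctarank();              // mov.u32 %0, %%cluster_ctarank;
#endif
  out[threadIdx.x] = ticks + lane_lt + rank; // keep the reads observable
}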
+ +#ifndef _CUDA_PTX_GENERATED_MAPA_H_ +#define _CUDA_PTX_GENERATED_MAPA_H_ + +/* +// mapa.space.u32 dest, addr, target_cta; // PTX ISA 78, SM_90 +// .space = { .shared::cluster } +template +__device__ static inline Tp* mapa( + cuda::ptx::space_cluster_t, + const Tp* addr, + uint32_t target_cta); +*/ +#if __cccl_ptx_isa >= 780 +extern "C" _CCCL_DEVICE void __cuda_ptx_mapa_is_not_supported_before_SM_90__(); +template +_CCCL_DEVICE static inline _Tp* mapa(space_cluster_t, const _Tp* __addr, _CUDA_VSTD::uint32_t __target_cta) +{ +// __space == space_cluster (due to parameter type constraint) +# if _CCCL_CUDA_COMPILER(NVHPC) || __CUDA_ARCH__ >= 900 + _CUDA_VSTD::uint32_t __dest; + asm("mapa.shared::cluster.u32 %0, %1, %2;" : "=r"(__dest) : "r"(__as_ptr_smem(__addr)), "r"(__target_cta) :); + return __from_ptr_dsmem<_Tp>(__dest); +# else + // Unsupported architectures will have a linker error with a semi-decent error message + __cuda_ptx_mapa_is_not_supported_before_SM_90__(); + return __from_ptr_dsmem<_Tp>(0); +# endif +} +#endif // __cccl_ptx_isa >= 780 + +#endif // _CUDA_PTX_GENERATED_MAPA_H_ diff --git a/libcudacxx/include/cuda/__ptx/instructions/generated/mbarrier_arrive.h b/libcudacxx/include/cuda/__ptx/instructions/generated/mbarrier_arrive.h index e1afe25d8c2..5f7b23dbb68 100644 --- a/libcudacxx/include/cuda/__ptx/instructions/generated/mbarrier_arrive.h +++ b/libcudacxx/include/cuda/__ptx/instructions/generated/mbarrier_arrive.h @@ -14,17 +14,18 @@ extern "C" _CCCL_DEVICE void __cuda_ptx_mbarrier_arrive_is_not_supported_before_ template _CCCL_DEVICE static inline _CUDA_VSTD::uint64_t mbarrier_arrive(_CUDA_VSTD::uint64_t* __addr) { - NV_IF_ELSE_TARGET( - NV_PROVIDES_SM_80, - (_CUDA_VSTD::uint64_t __state; - asm("mbarrier.arrive.shared.b64 %0, [%1]; // 1. " - : "=l"(__state) - : "r"(__as_ptr_smem(__addr)) - : "memory"); - return __state;), - ( - // Unsupported architectures will have a linker error with a semi-decent error message - __cuda_ptx_mbarrier_arrive_is_not_supported_before_SM_80__(); return 0;)); +# if _CCCL_CUDA_COMPILER(NVHPC) || __CUDA_ARCH__ >= 800 + _CUDA_VSTD::uint64_t __state; + asm("mbarrier.arrive.shared.b64 %0, [%1]; // 1. " + : "=l"(__state) + : "r"(__as_ptr_smem(__addr)) + : "memory"); + return __state; +# else + // Unsupported architectures will have a linker error with a semi-decent error message + __cuda_ptx_mbarrier_arrive_is_not_supported_before_SM_80__(); + return 0; +# endif } #endif // __cccl_ptx_isa >= 700 @@ -41,22 +42,23 @@ template _CCCL_DEVICE static inline _CUDA_VSTD::uint64_t mbarrier_arrive(_CUDA_VSTD::uint64_t* __addr, const _CUDA_VSTD::uint32_t& __count) { - NV_IF_ELSE_TARGET( - NV_PROVIDES_SM_90, - (_CUDA_VSTD::uint64_t __state; - asm("mbarrier.arrive.shared::cta.b64 %0, [%1], %2; // 2. " - : "=l"(__state) - : "r"(__as_ptr_smem(__addr)), "r"(__count) - : "memory"); - return __state;), - ( - // Unsupported architectures will have a linker error with a semi-decent error message - __cuda_ptx_mbarrier_arrive_is_not_supported_before_SM_90__(); return 0;)); +# if _CCCL_CUDA_COMPILER(NVHPC) || __CUDA_ARCH__ >= 900 + _CUDA_VSTD::uint64_t __state; + asm("mbarrier.arrive.shared::cta.b64 %0, [%1], %2; // 2. 
" + : "=l"(__state) + : "r"(__as_ptr_smem(__addr)), "r"(__count) + : "memory"); + return __state; +# else + // Unsupported architectures will have a linker error with a semi-decent error message + __cuda_ptx_mbarrier_arrive_is_not_supported_before_SM_90__(); + return 0; +# endif } #endif // __cccl_ptx_isa >= 780 /* -// mbarrier.arrive{.sem}{.scope}{.space}.b64 state, [addr]; // 3a. PTX ISA 80, SM_90 +// mbarrier.arrive.sem.scope.space.b64 state, [addr]; // 3a. PTX ISA 80, SM_90 // .sem = { .release } // .scope = { .cta, .cluster } // .space = { .shared::cta } @@ -75,29 +77,34 @@ mbarrier_arrive(sem_release_t, scope_t<_Scope> __scope, space_shared_t, _CUDA_VS { // __sem == sem_release (due to parameter type constraint) static_assert(__scope == scope_cta || __scope == scope_cluster, ""); - // __space == space_shared (due to parameter type constraint) - NV_IF_ELSE_TARGET( - NV_PROVIDES_SM_90, - ( - _CUDA_VSTD::uint64_t __state; _CCCL_IF_CONSTEXPR (__scope == scope_cta) { - asm("mbarrier.arrive.release.cta.shared::cta.b64 %0, [%1]; // 3a. " - : "=l"(__state) - : "r"(__as_ptr_smem(__addr)) - : "memory"); - } else _CCCL_IF_CONSTEXPR (__scope == scope_cluster) { - asm("mbarrier.arrive.release.cluster.shared::cta.b64 %0, [%1]; // 3a. " - : "=l"(__state) - : "r"(__as_ptr_smem(__addr)) - : "memory"); - } return __state;), - ( - // Unsupported architectures will have a linker error with a semi-decent error message - __cuda_ptx_mbarrier_arrive_is_not_supported_before_SM_90__(); return 0;)); +// __space == space_shared (due to parameter type constraint) +# if _CCCL_CUDA_COMPILER(NVHPC) || __CUDA_ARCH__ >= 900 + _CUDA_VSTD::uint64_t __state; + _CCCL_IF_CONSTEXPR (__scope == scope_cta) + { + asm("mbarrier.arrive.release.cta.shared::cta.b64 %0, [%1]; // 3a. " + : "=l"(__state) + : "r"(__as_ptr_smem(__addr)) + : "memory"); + } + else _CCCL_IF_CONSTEXPR (__scope == scope_cluster) + { + asm("mbarrier.arrive.release.cluster.shared::cta.b64 %0, [%1]; // 3a. " + : "=l"(__state) + : "r"(__as_ptr_smem(__addr)) + : "memory"); + } + return __state; +# else + // Unsupported architectures will have a linker error with a semi-decent error message + __cuda_ptx_mbarrier_arrive_is_not_supported_before_SM_90__(); + return 0; +# endif } #endif // __cccl_ptx_isa >= 800 /* -// mbarrier.arrive{.sem}{.scope}{.space}.b64 state, [addr], count; // 3b. PTX ISA 80, SM_90 +// mbarrier.arrive.sem.scope.space.b64 state, [addr], count; // 3b. PTX ISA 80, SM_90 // .sem = { .release } // .scope = { .cta, .cluster } // .space = { .shared::cta } @@ -121,29 +128,34 @@ _CCCL_DEVICE static inline _CUDA_VSTD::uint64_t mbarrier_arrive( { // __sem == sem_release (due to parameter type constraint) static_assert(__scope == scope_cta || __scope == scope_cluster, ""); - // __space == space_shared (due to parameter type constraint) - NV_IF_ELSE_TARGET( - NV_PROVIDES_SM_90, - ( - _CUDA_VSTD::uint64_t __state; _CCCL_IF_CONSTEXPR (__scope == scope_cta) { - asm("mbarrier.arrive.release.cta.shared::cta.b64 %0, [%1], %2; // 3b. " - : "=l"(__state) - : "r"(__as_ptr_smem(__addr)), "r"(__count) - : "memory"); - } else _CCCL_IF_CONSTEXPR (__scope == scope_cluster) { - asm("mbarrier.arrive.release.cluster.shared::cta.b64 %0, [%1], %2; // 3b. 
" - : "=l"(__state) - : "r"(__as_ptr_smem(__addr)), "r"(__count) - : "memory"); - } return __state;), - ( - // Unsupported architectures will have a linker error with a semi-decent error message - __cuda_ptx_mbarrier_arrive_is_not_supported_before_SM_90__(); return 0;)); +// __space == space_shared (due to parameter type constraint) +# if _CCCL_CUDA_COMPILER(NVHPC) || __CUDA_ARCH__ >= 900 + _CUDA_VSTD::uint64_t __state; + _CCCL_IF_CONSTEXPR (__scope == scope_cta) + { + asm("mbarrier.arrive.release.cta.shared::cta.b64 %0, [%1], %2; // 3b. " + : "=l"(__state) + : "r"(__as_ptr_smem(__addr)), "r"(__count) + : "memory"); + } + else _CCCL_IF_CONSTEXPR (__scope == scope_cluster) + { + asm("mbarrier.arrive.release.cluster.shared::cta.b64 %0, [%1], %2; // 3b. " + : "=l"(__state) + : "r"(__as_ptr_smem(__addr)), "r"(__count) + : "memory"); + } + return __state; +# else + // Unsupported architectures will have a linker error with a semi-decent error message + __cuda_ptx_mbarrier_arrive_is_not_supported_before_SM_90__(); + return 0; +# endif } #endif // __cccl_ptx_isa >= 800 /* -// mbarrier.arrive{.sem}{.scope}{.space}.b64 _, [addr]; // 4a. PTX ISA 80, SM_90 +// mbarrier.arrive.sem.scope.space.b64 _, [addr]; // 4a. PTX ISA 80, SM_90 // .sem = { .release } // .scope = { .cluster } // .space = { .shared::cluster } @@ -160,23 +172,23 @@ template _CCCL_DEVICE static inline void mbarrier_arrive(sem_release_t, scope_cluster_t, space_cluster_t, _CUDA_VSTD::uint64_t* __addr) { - // __sem == sem_release (due to parameter type constraint) - // __scope == scope_cluster (due to parameter type constraint) - // __space == space_cluster (due to parameter type constraint) - NV_IF_ELSE_TARGET( - NV_PROVIDES_SM_90, - (asm("mbarrier.arrive.release.cluster.shared::cluster.b64 _, [%0]; // 4a. " - : - : "r"(__as_ptr_remote_dsmem(__addr)) - : "memory");), - ( - // Unsupported architectures will have a linker error with a semi-decent error message - __cuda_ptx_mbarrier_arrive_is_not_supported_before_SM_90__();)); +// __sem == sem_release (due to parameter type constraint) +// __scope == scope_cluster (due to parameter type constraint) +// __space == space_cluster (due to parameter type constraint) +# if _CCCL_CUDA_COMPILER(NVHPC) || __CUDA_ARCH__ >= 900 + asm("mbarrier.arrive.release.cluster.shared::cluster.b64 _, [%0]; // 4a. " + : + : "r"(__as_ptr_remote_dsmem(__addr)) + : "memory"); +# else + // Unsupported architectures will have a linker error with a semi-decent error message + __cuda_ptx_mbarrier_arrive_is_not_supported_before_SM_90__(); +# endif } #endif // __cccl_ptx_isa >= 800 /* -// mbarrier.arrive{.sem}{.scope}{.space}.b64 _, [addr], count; // 4b. PTX ISA 80, SM_90 +// mbarrier.arrive.sem.scope.space.b64 _, [addr], count; // 4b. PTX ISA 80, SM_90 // .sem = { .release } // .scope = { .cluster } // .space = { .shared::cluster } @@ -194,19 +206,180 @@ template _CCCL_DEVICE static inline void mbarrier_arrive( sem_release_t, scope_cluster_t, space_cluster_t, _CUDA_VSTD::uint64_t* __addr, const _CUDA_VSTD::uint32_t& __count) { - // __sem == sem_release (due to parameter type constraint) - // __scope == scope_cluster (due to parameter type constraint) - // __space == space_cluster (due to parameter type constraint) - NV_IF_ELSE_TARGET( - NV_PROVIDES_SM_90, - (asm("mbarrier.arrive.release.cluster.shared::cluster.b64 _, [%0], %1; // 4b. 
" - : - : "r"(__as_ptr_remote_dsmem(__addr)), "r"(__count) - : "memory");), - ( - // Unsupported architectures will have a linker error with a semi-decent error message - __cuda_ptx_mbarrier_arrive_is_not_supported_before_SM_90__();)); +// __sem == sem_release (due to parameter type constraint) +// __scope == scope_cluster (due to parameter type constraint) +// __space == space_cluster (due to parameter type constraint) +# if _CCCL_CUDA_COMPILER(NVHPC) || __CUDA_ARCH__ >= 900 + asm("mbarrier.arrive.release.cluster.shared::cluster.b64 _, [%0], %1; // 4b. " + : + : "r"(__as_ptr_remote_dsmem(__addr)), "r"(__count) + : "memory"); +# else + // Unsupported architectures will have a linker error with a semi-decent error message + __cuda_ptx_mbarrier_arrive_is_not_supported_before_SM_90__(); +# endif } #endif // __cccl_ptx_isa >= 800 +/* +// mbarrier.arrive.sem.scope.space.b64 state, [addr], count; // PTX ISA 86, SM_90 +// .space = { .shared::cta } +// .sem = { .relaxed } +// .scope = { .cta, .cluster } +template +__device__ static inline uint64_t mbarrier_arrive( + cuda::ptx::space_shared_t, + cuda::ptx::sem_relaxed_t, + cuda::ptx::scope_t scope, + uint64_t* addr, + const uint32_t& count); +*/ +#if __cccl_ptx_isa >= 860 +extern "C" _CCCL_DEVICE void __cuda_ptx_mbarrier_arrive_is_not_supported_before_SM_90__(); +template +_CCCL_DEVICE static inline _CUDA_VSTD::uint64_t mbarrier_arrive( + space_shared_t, + sem_relaxed_t, + scope_t<_Scope> __scope, + _CUDA_VSTD::uint64_t* __addr, + const _CUDA_VSTD::uint32_t& __count) +{ + // __space == space_shared (due to parameter type constraint) + // __sem == sem_relaxed (due to parameter type constraint) + static_assert(__scope == scope_cta || __scope == scope_cluster, ""); +# if _CCCL_CUDA_COMPILER(NVHPC) || __CUDA_ARCH__ >= 900 + _CUDA_VSTD::uint64_t __state; + _CCCL_IF_CONSTEXPR (__scope == scope_cta) + { + asm("mbarrier.arrive.relaxed.cta.shared::cta.b64 %0, [%1], %2;" + : "=l"(__state) + : "r"(__as_ptr_smem(__addr)), "r"(__count) + : "memory"); + } + else _CCCL_IF_CONSTEXPR (__scope == scope_cluster) + { + asm("mbarrier.arrive.relaxed.cluster.shared::cta.b64 %0, [%1], %2;" + : "=l"(__state) + : "r"(__as_ptr_smem(__addr)), "r"(__count) + : "memory"); + } + return __state; +# else + // Unsupported architectures will have a linker error with a semi-decent error message + __cuda_ptx_mbarrier_arrive_is_not_supported_before_SM_90__(); + return 0; +# endif +} +#endif // __cccl_ptx_isa >= 860 + +/* +// mbarrier.arrive.sem.scope.space.b64 state, [addr]; // PTX ISA 86, SM_90 +// .space = { .shared::cta } +// .sem = { .relaxed } +// .scope = { .cta, .cluster } +template +__device__ static inline uint64_t mbarrier_arrive( + cuda::ptx::space_shared_t, + cuda::ptx::sem_relaxed_t, + cuda::ptx::scope_t scope, + uint64_t* addr); +*/ +#if __cccl_ptx_isa >= 860 +extern "C" _CCCL_DEVICE void __cuda_ptx_mbarrier_arrive_is_not_supported_before_SM_90__(); +template +_CCCL_DEVICE static inline _CUDA_VSTD::uint64_t +mbarrier_arrive(space_shared_t, sem_relaxed_t, scope_t<_Scope> __scope, _CUDA_VSTD::uint64_t* __addr) +{ + // __space == space_shared (due to parameter type constraint) + // __sem == sem_relaxed (due to parameter type constraint) + static_assert(__scope == scope_cta || __scope == scope_cluster, ""); +# if _CCCL_CUDA_COMPILER(NVHPC) || __CUDA_ARCH__ >= 900 + _CUDA_VSTD::uint64_t __state; + _CCCL_IF_CONSTEXPR (__scope == scope_cta) + { + asm("mbarrier.arrive.relaxed.cta.shared::cta.b64 %0, [%1];" + : "=l"(__state) + : "r"(__as_ptr_smem(__addr)) + : "memory"); + } + else 
_CCCL_IF_CONSTEXPR (__scope == scope_cluster) + { + asm("mbarrier.arrive.relaxed.cluster.shared::cta.b64 %0, [%1];" + : "=l"(__state) + : "r"(__as_ptr_smem(__addr)) + : "memory"); + } + return __state; +# else + // Unsupported architectures will have a linker error with a semi-decent error message + __cuda_ptx_mbarrier_arrive_is_not_supported_before_SM_90__(); + return 0; +# endif +} +#endif // __cccl_ptx_isa >= 860 + +/* +// mbarrier.arrive.sem.scope.space.b64 _, [addr], count; // PTX ISA 86, SM_90 +// .space = { .shared::cluster } +// .sem = { .relaxed } +// .scope = { .cluster } +template +__device__ static inline void mbarrier_arrive( + cuda::ptx::space_cluster_t, + cuda::ptx::sem_relaxed_t, + cuda::ptx::scope_cluster_t, + uint64_t* addr, + const uint32_t& count); +*/ +#if __cccl_ptx_isa >= 860 +extern "C" _CCCL_DEVICE void __cuda_ptx_mbarrier_arrive_is_not_supported_before_SM_90__(); +template +_CCCL_DEVICE static inline void mbarrier_arrive( + space_cluster_t, sem_relaxed_t, scope_cluster_t, _CUDA_VSTD::uint64_t* __addr, const _CUDA_VSTD::uint32_t& __count) +{ +// __space == space_cluster (due to parameter type constraint) +// __sem == sem_relaxed (due to parameter type constraint) +// __scope == scope_cluster (due to parameter type constraint) +# if _CCCL_CUDA_COMPILER(NVHPC) || __CUDA_ARCH__ >= 900 + asm("mbarrier.arrive.relaxed.cluster.shared::cluster.b64 _, [%0], %1;" + : + : "r"(__as_ptr_smem(__addr)), "r"(__count) + : "memory"); +# else + // Unsupported architectures will have a linker error with a semi-decent error message + __cuda_ptx_mbarrier_arrive_is_not_supported_before_SM_90__(); +# endif +} +#endif // __cccl_ptx_isa >= 860 + +/* +// mbarrier.arrive.sem.scope.space.b64 _, [addr]; // PTX ISA 86, SM_90 +// .space = { .shared::cluster } +// .sem = { .relaxed } +// .scope = { .cluster } +template +__device__ static inline void mbarrier_arrive( + cuda::ptx::space_cluster_t, + cuda::ptx::sem_relaxed_t, + cuda::ptx::scope_cluster_t, + uint64_t* addr); +*/ +#if __cccl_ptx_isa >= 860 +extern "C" _CCCL_DEVICE void __cuda_ptx_mbarrier_arrive_is_not_supported_before_SM_90__(); +template +_CCCL_DEVICE static inline void +mbarrier_arrive(space_cluster_t, sem_relaxed_t, scope_cluster_t, _CUDA_VSTD::uint64_t* __addr) +{ +// __space == space_cluster (due to parameter type constraint) +// __sem == sem_relaxed (due to parameter type constraint) +// __scope == scope_cluster (due to parameter type constraint) +# if _CCCL_CUDA_COMPILER(NVHPC) || __CUDA_ARCH__ >= 900 + asm("mbarrier.arrive.relaxed.cluster.shared::cluster.b64 _, [%0];" : : "r"(__as_ptr_smem(__addr)) : "memory"); +# else + // Unsupported architectures will have a linker error with a semi-decent error message + __cuda_ptx_mbarrier_arrive_is_not_supported_before_SM_90__(); +# endif +} +#endif // __cccl_ptx_isa >= 860 + #endif // _CUDA_PTX_GENERATED_MBARRIER_ARRIVE_H_ diff --git a/libcudacxx/include/cuda/__ptx/instructions/generated/mbarrier_arrive_expect_tx.h b/libcudacxx/include/cuda/__ptx/instructions/generated/mbarrier_arrive_expect_tx.h index 79301a57851..5cbcd4cb3aa 100644 --- a/libcudacxx/include/cuda/__ptx/instructions/generated/mbarrier_arrive_expect_tx.h +++ b/libcudacxx/include/cuda/__ptx/instructions/generated/mbarrier_arrive_expect_tx.h @@ -4,7 +4,7 @@ #define _CUDA_PTX_GENERATED_MBARRIER_ARRIVE_EXPECT_TX_H_ /* -// mbarrier.arrive.expect_tx{.sem}{.scope}{.space}.b64 state, [addr], tx_count; // 8. PTX ISA 80, SM_90 +// mbarrier.arrive.expect_tx.sem.scope.space.b64 state, [addr], tx_count; // 8. 
PTX ISA 80, SM_90 // .sem = { .release } // .scope = { .cta, .cluster } // .space = { .shared::cta } @@ -28,29 +28,34 @@ _CCCL_DEVICE static inline _CUDA_VSTD::uint64_t mbarrier_arrive_expect_tx( { // __sem == sem_release (due to parameter type constraint) static_assert(__scope == scope_cta || __scope == scope_cluster, ""); - // __space == space_shared (due to parameter type constraint) - NV_IF_ELSE_TARGET( - NV_PROVIDES_SM_90, - ( - _CUDA_VSTD::uint64_t __state; _CCCL_IF_CONSTEXPR (__scope == scope_cta) { - asm("mbarrier.arrive.expect_tx.release.cta.shared::cta.b64 %0, [%1], %2; // 8. " - : "=l"(__state) - : "r"(__as_ptr_smem(__addr)), "r"(__tx_count) - : "memory"); - } else _CCCL_IF_CONSTEXPR (__scope == scope_cluster) { - asm("mbarrier.arrive.expect_tx.release.cluster.shared::cta.b64 %0, [%1], %2; // 8. " - : "=l"(__state) - : "r"(__as_ptr_smem(__addr)), "r"(__tx_count) - : "memory"); - } return __state;), - ( - // Unsupported architectures will have a linker error with a semi-decent error message - __cuda_ptx_mbarrier_arrive_expect_tx_is_not_supported_before_SM_90__(); return 0;)); +// __space == space_shared (due to parameter type constraint) +# if _CCCL_CUDA_COMPILER(NVHPC) || __CUDA_ARCH__ >= 900 + _CUDA_VSTD::uint64_t __state; + _CCCL_IF_CONSTEXPR (__scope == scope_cta) + { + asm("mbarrier.arrive.expect_tx.release.cta.shared::cta.b64 %0, [%1], %2; // 8. " + : "=l"(__state) + : "r"(__as_ptr_smem(__addr)), "r"(__tx_count) + : "memory"); + } + else _CCCL_IF_CONSTEXPR (__scope == scope_cluster) + { + asm("mbarrier.arrive.expect_tx.release.cluster.shared::cta.b64 %0, [%1], %2; // 8. " + : "=l"(__state) + : "r"(__as_ptr_smem(__addr)), "r"(__tx_count) + : "memory"); + } + return __state; +# else + // Unsupported architectures will have a linker error with a semi-decent error message + __cuda_ptx_mbarrier_arrive_expect_tx_is_not_supported_before_SM_90__(); + return 0; +# endif } #endif // __cccl_ptx_isa >= 800 /* -// mbarrier.arrive.expect_tx{.sem}{.scope}{.space}.b64 _, [addr], tx_count; // 9. PTX ISA 80, SM_90 +// mbarrier.arrive.expect_tx.sem.scope.space.b64 _, [addr], tx_count; // 9. PTX ISA 80, SM_90 // .sem = { .release } // .scope = { .cluster } // .space = { .shared::cluster } @@ -68,19 +73,104 @@ template _CCCL_DEVICE static inline void mbarrier_arrive_expect_tx( sem_release_t, scope_cluster_t, space_cluster_t, _CUDA_VSTD::uint64_t* __addr, const _CUDA_VSTD::uint32_t& __tx_count) { - // __sem == sem_release (due to parameter type constraint) - // __scope == scope_cluster (due to parameter type constraint) - // __space == space_cluster (due to parameter type constraint) - NV_IF_ELSE_TARGET( - NV_PROVIDES_SM_90, - (asm("mbarrier.arrive.expect_tx.release.cluster.shared::cluster.b64 _, [%0], %1; // 9. " - : - : "r"(__as_ptr_remote_dsmem(__addr)), "r"(__tx_count) - : "memory");), - ( - // Unsupported architectures will have a linker error with a semi-decent error message - __cuda_ptx_mbarrier_arrive_expect_tx_is_not_supported_before_SM_90__();)); +// __sem == sem_release (due to parameter type constraint) +// __scope == scope_cluster (due to parameter type constraint) +// __space == space_cluster (due to parameter type constraint) +# if _CCCL_CUDA_COMPILER(NVHPC) || __CUDA_ARCH__ >= 900 + asm("mbarrier.arrive.expect_tx.release.cluster.shared::cluster.b64 _, [%0], %1; // 9. 
" + : + : "r"(__as_ptr_remote_dsmem(__addr)), "r"(__tx_count) + : "memory"); +# else + // Unsupported architectures will have a linker error with a semi-decent error message + __cuda_ptx_mbarrier_arrive_expect_tx_is_not_supported_before_SM_90__(); +# endif } #endif // __cccl_ptx_isa >= 800 +/* +// mbarrier.arrive.expect_tx.sem.scope.space.b64 state, [addr], txCount; // PTX ISA 86, SM_90 +// .space = { .shared::cta } +// .sem = { .relaxed } +// .scope = { .cta, .cluster } +template +__device__ static inline uint64_t mbarrier_arrive_expect_tx( + cuda::ptx::space_shared_t, + cuda::ptx::sem_relaxed_t, + cuda::ptx::scope_t scope, + uint64_t* addr, + const uint32_t& txCount); +*/ +#if __cccl_ptx_isa >= 860 +extern "C" _CCCL_DEVICE void __cuda_ptx_mbarrier_arrive_expect_tx_is_not_supported_before_SM_90__(); +template +_CCCL_DEVICE static inline _CUDA_VSTD::uint64_t mbarrier_arrive_expect_tx( + space_shared_t, + sem_relaxed_t, + scope_t<_Scope> __scope, + _CUDA_VSTD::uint64_t* __addr, + const _CUDA_VSTD::uint32_t& __txCount) +{ + // __space == space_shared (due to parameter type constraint) + // __sem == sem_relaxed (due to parameter type constraint) + static_assert(__scope == scope_cta || __scope == scope_cluster, ""); +# if _CCCL_CUDA_COMPILER(NVHPC) || __CUDA_ARCH__ >= 900 + _CUDA_VSTD::uint64_t __state; + _CCCL_IF_CONSTEXPR (__scope == scope_cta) + { + asm("mbarrier.arrive.expect_tx.relaxed.cta.shared::cta.b64 %0, [%1], %2;" + : "=l"(__state) + : "r"(__as_ptr_dsmem(__addr)), "r"(__txCount) + : "memory"); + } + else _CCCL_IF_CONSTEXPR (__scope == scope_cluster) + { + asm("mbarrier.arrive.expect_tx.relaxed.cluster.shared::cta.b64 %0, [%1], %2;" + : "=l"(__state) + : "r"(__as_ptr_dsmem(__addr)), "r"(__txCount) + : "memory"); + } + return __state; +# else + // Unsupported architectures will have a linker error with a semi-decent error message + __cuda_ptx_mbarrier_arrive_expect_tx_is_not_supported_before_SM_90__(); + return 0; +# endif +} +#endif // __cccl_ptx_isa >= 860 + +/* +// mbarrier.arrive.expect_tx.sem.scope.space.b64 _, [addr], txCount; // PTX ISA 86, SM_90 +// .space = { .shared::cluster } +// .sem = { .relaxed } +// .scope = { .cluster } +template +__device__ static inline void mbarrier_arrive_expect_tx( + cuda::ptx::space_cluster_t, + cuda::ptx::sem_relaxed_t, + cuda::ptx::scope_cluster_t, + uint64_t* addr, + const uint32_t& txCount); +*/ +#if __cccl_ptx_isa >= 860 +extern "C" _CCCL_DEVICE void __cuda_ptx_mbarrier_arrive_expect_tx_is_not_supported_before_SM_90__(); +template +_CCCL_DEVICE static inline void mbarrier_arrive_expect_tx( + space_cluster_t, sem_relaxed_t, scope_cluster_t, _CUDA_VSTD::uint64_t* __addr, const _CUDA_VSTD::uint32_t& __txCount) +{ +// __space == space_cluster (due to parameter type constraint) +// __sem == sem_relaxed (due to parameter type constraint) +// __scope == scope_cluster (due to parameter type constraint) +# if _CCCL_CUDA_COMPILER(NVHPC) || __CUDA_ARCH__ >= 900 + asm("mbarrier.arrive.expect_tx.relaxed.cluster.shared::cluster.b64 _, [%0], %1;" + : + : "r"(__as_ptr_dsmem(__addr)), "r"(__txCount) + : "memory"); +# else + // Unsupported architectures will have a linker error with a semi-decent error message + __cuda_ptx_mbarrier_arrive_expect_tx_is_not_supported_before_SM_90__(); +# endif +} +#endif // __cccl_ptx_isa >= 860 + #endif // _CUDA_PTX_GENERATED_MBARRIER_ARRIVE_EXPECT_TX_H_ diff --git a/libcudacxx/include/cuda/__ptx/instructions/generated/mbarrier_arrive_no_complete.h 
b/libcudacxx/include/cuda/__ptx/instructions/generated/mbarrier_arrive_no_complete.h index cbfb275baa4..2a9ebacf295 100644 --- a/libcudacxx/include/cuda/__ptx/instructions/generated/mbarrier_arrive_no_complete.h +++ b/libcudacxx/include/cuda/__ptx/instructions/generated/mbarrier_arrive_no_complete.h @@ -16,17 +16,18 @@ template _CCCL_DEVICE static inline _CUDA_VSTD::uint64_t mbarrier_arrive_no_complete(_CUDA_VSTD::uint64_t* __addr, const _CUDA_VSTD::uint32_t& __count) { - NV_IF_ELSE_TARGET( - NV_PROVIDES_SM_80, - (_CUDA_VSTD::uint64_t __state; - asm("mbarrier.arrive.noComplete.shared.b64 %0, [%1], %2; // 5. " - : "=l"(__state) - : "r"(__as_ptr_smem(__addr)), "r"(__count) - : "memory"); - return __state;), - ( - // Unsupported architectures will have a linker error with a semi-decent error message - __cuda_ptx_mbarrier_arrive_no_complete_is_not_supported_before_SM_80__(); return 0;)); +# if _CCCL_CUDA_COMPILER(NVHPC) || __CUDA_ARCH__ >= 800 + _CUDA_VSTD::uint64_t __state; + asm("mbarrier.arrive.noComplete.shared.b64 %0, [%1], %2; // 5. " + : "=l"(__state) + : "r"(__as_ptr_smem(__addr)), "r"(__count) + : "memory"); + return __state; +# else + // Unsupported architectures will have a linker error with a semi-decent error message + __cuda_ptx_mbarrier_arrive_no_complete_is_not_supported_before_SM_80__(); + return 0; +# endif } #endif // __cccl_ptx_isa >= 700 diff --git a/libcudacxx/include/cuda/__ptx/instructions/generated/mbarrier_expect_tx.h b/libcudacxx/include/cuda/__ptx/instructions/generated/mbarrier_expect_tx.h new file mode 100644 index 00000000000..94d66b79a35 --- /dev/null +++ b/libcudacxx/include/cuda/__ptx/instructions/generated/mbarrier_expect_tx.h @@ -0,0 +1,94 @@ +// This file was automatically generated. Do not edit. + +#ifndef _CUDA_PTX_GENERATED_MBARRIER_EXPECT_TX_H_ +#define _CUDA_PTX_GENERATED_MBARRIER_EXPECT_TX_H_ + +/* +// mbarrier.expect_tx.sem.scope.space.b64 [addr], txCount; // 1. PTX ISA 80, SM_90 +// .sem = { .relaxed } +// .scope = { .cta, .cluster } +// .space = { .shared::cta } +template +__device__ static inline void mbarrier_expect_tx( + cuda::ptx::sem_relaxed_t, + cuda::ptx::scope_t scope, + cuda::ptx::space_shared_t, + uint64_t* addr, + uint32_t txCount); +*/ +#if __cccl_ptx_isa >= 800 +extern "C" _CCCL_DEVICE void __cuda_ptx_mbarrier_expect_tx_is_not_supported_before_SM_90__(); +template +_CCCL_DEVICE static inline void mbarrier_expect_tx( + sem_relaxed_t, scope_t<_Scope> __scope, space_shared_t, _CUDA_VSTD::uint64_t* __addr, _CUDA_VSTD::uint32_t __txCount) +{ + // __sem == sem_relaxed (due to parameter type constraint) + static_assert(__scope == scope_cta || __scope == scope_cluster, ""); +// __space == space_shared (due to parameter type constraint) +# if _CCCL_CUDA_COMPILER(NVHPC) || __CUDA_ARCH__ >= 900 + _CCCL_IF_CONSTEXPR (__scope == scope_cta) + { + asm("mbarrier.expect_tx.relaxed.cta.shared::cta.b64 [%0], %1; // 1." + : + : "r"(__as_ptr_smem(__addr)), "r"(__txCount) + : "memory"); + } + else _CCCL_IF_CONSTEXPR (__scope == scope_cluster) + { + asm("mbarrier.expect_tx.relaxed.cluster.shared::cta.b64 [%0], %1; // 1." + : + : "r"(__as_ptr_smem(__addr)), "r"(__txCount) + : "memory"); + } +# else + // Unsupported architectures will have a linker error with a semi-decent error message + __cuda_ptx_mbarrier_expect_tx_is_not_supported_before_SM_90__(); +# endif +} +#endif // __cccl_ptx_isa >= 800 + +/* +// mbarrier.expect_tx.sem.scope.space.b64 [addr], txCount; // 2. 
PTX ISA 80, SM_90 +// .sem = { .relaxed } +// .scope = { .cta, .cluster } +// .space = { .shared::cluster } +template +__device__ static inline void mbarrier_expect_tx( + cuda::ptx::sem_relaxed_t, + cuda::ptx::scope_t scope, + cuda::ptx::space_cluster_t, + uint64_t* addr, + uint32_t txCount); +*/ +#if __cccl_ptx_isa >= 800 +extern "C" _CCCL_DEVICE void __cuda_ptx_mbarrier_expect_tx_is_not_supported_before_SM_90__(); +template +_CCCL_DEVICE static inline void mbarrier_expect_tx( + sem_relaxed_t, scope_t<_Scope> __scope, space_cluster_t, _CUDA_VSTD::uint64_t* __addr, _CUDA_VSTD::uint32_t __txCount) +{ + // __sem == sem_relaxed (due to parameter type constraint) + static_assert(__scope == scope_cta || __scope == scope_cluster, ""); +// __space == space_cluster (due to parameter type constraint) +# if _CCCL_CUDA_COMPILER(NVHPC) || __CUDA_ARCH__ >= 900 + _CCCL_IF_CONSTEXPR (__scope == scope_cta) + { + asm("mbarrier.expect_tx.relaxed.cta.shared::cluster.b64 [%0], %1; // 2." + : + : "r"(__as_ptr_dsmem(__addr)), "r"(__txCount) + : "memory"); + } + else _CCCL_IF_CONSTEXPR (__scope == scope_cluster) + { + asm("mbarrier.expect_tx.relaxed.cluster.shared::cluster.b64 [%0], %1; // 2." + : + : "r"(__as_ptr_dsmem(__addr)), "r"(__txCount) + : "memory"); + } +# else + // Unsupported architectures will have a linker error with a semi-decent error message + __cuda_ptx_mbarrier_expect_tx_is_not_supported_before_SM_90__(); +# endif +} +#endif // __cccl_ptx_isa >= 800 + +#endif // _CUDA_PTX_GENERATED_MBARRIER_EXPECT_TX_H_ diff --git a/libcudacxx/include/cuda/__ptx/instructions/generated/mbarrier_init.h b/libcudacxx/include/cuda/__ptx/instructions/generated/mbarrier_init.h index d1e5c57c97e..9ba345f8ff2 100644 --- a/libcudacxx/include/cuda/__ptx/instructions/generated/mbarrier_init.h +++ b/libcudacxx/include/cuda/__ptx/instructions/generated/mbarrier_init.h @@ -15,15 +15,12 @@ extern "C" _CCCL_DEVICE void __cuda_ptx_mbarrier_init_is_not_supported_before_SM template _CCCL_DEVICE static inline void mbarrier_init(_CUDA_VSTD::uint64_t* __addr, const _CUDA_VSTD::uint32_t& __count) { - NV_IF_ELSE_TARGET( - NV_PROVIDES_SM_80, - (asm("mbarrier.init.shared.b64 [%0], %1;" - : - : "r"(__as_ptr_smem(__addr)), "r"(__count) - : "memory");), - ( - // Unsupported architectures will have a linker error with a semi-decent error message - __cuda_ptx_mbarrier_init_is_not_supported_before_SM_80__();)); +# if _CCCL_CUDA_COMPILER(NVHPC) || __CUDA_ARCH__ >= 800 + asm("mbarrier.init.shared.b64 [%0], %1;" : : "r"(__as_ptr_smem(__addr)), "r"(__count) : "memory"); +# else + // Unsupported architectures will have a linker error with a semi-decent error message + __cuda_ptx_mbarrier_init_is_not_supported_before_SM_80__(); +# endif } #endif // __cccl_ptx_isa >= 700 diff --git a/libcudacxx/include/cuda/__ptx/instructions/generated/mbarrier_test_wait.h b/libcudacxx/include/cuda/__ptx/instructions/generated/mbarrier_test_wait.h index f3dbb6ed1c3..53263270f0d 100644 --- a/libcudacxx/include/cuda/__ptx/instructions/generated/mbarrier_test_wait.h +++ b/libcudacxx/include/cuda/__ptx/instructions/generated/mbarrier_test_wait.h @@ -15,25 +15,26 @@ extern "C" _CCCL_DEVICE void __cuda_ptx_mbarrier_test_wait_is_not_supported_befo template _CCCL_DEVICE static inline bool mbarrier_test_wait(_CUDA_VSTD::uint64_t* __addr, const _CUDA_VSTD::uint64_t& __state) { - NV_IF_ELSE_TARGET( - NV_PROVIDES_SM_80, - (_CUDA_VSTD::uint32_t __waitComplete; - asm("{\n\t .reg .pred P_OUT; \n\t" - "mbarrier.test_wait.shared.b64 P_OUT, [%1], %2; // 1. 
\n\t" - "selp.b32 %0, 1, 0, P_OUT; \n" - "}" - : "=r"(__waitComplete) - : "r"(__as_ptr_smem(__addr)), "l"(__state) - : "memory"); - return static_cast(__waitComplete);), - ( - // Unsupported architectures will have a linker error with a semi-decent error message - __cuda_ptx_mbarrier_test_wait_is_not_supported_before_SM_80__(); return false;)); +# if _CCCL_CUDA_COMPILER(NVHPC) || __CUDA_ARCH__ >= 800 + _CUDA_VSTD::uint32_t __waitComplete; + asm("{\n\t .reg .pred P_OUT; \n\t" + "mbarrier.test_wait.shared.b64 P_OUT, [%1], %2; // 1. \n\t" + "selp.b32 %0, 1, 0, P_OUT; \n" + "}" + : "=r"(__waitComplete) + : "r"(__as_ptr_smem(__addr)), "l"(__state) + : "memory"); + return static_cast(__waitComplete); +# else + // Unsupported architectures will have a linker error with a semi-decent error message + __cuda_ptx_mbarrier_test_wait_is_not_supported_before_SM_80__(); + return false; +# endif } #endif // __cccl_ptx_isa >= 700 /* -// mbarrier.test_wait{.sem}{.scope}.shared::cta.b64 waitComplete, [addr], state; // 2. PTX +// mbarrier.test_wait.sem.scope.shared::cta.b64 waitComplete, [addr], state; // 2. PTX ISA 80, SM_90 // .sem = { .acquire } // .scope = { .cta, .cluster } @@ -52,31 +53,87 @@ _CCCL_DEVICE static inline bool mbarrier_test_wait( { // __sem == sem_acquire (due to parameter type constraint) static_assert(__scope == scope_cta || __scope == scope_cluster, ""); - NV_IF_ELSE_TARGET( - NV_PROVIDES_SM_90, - ( - _CUDA_VSTD::uint32_t __waitComplete; _CCCL_IF_CONSTEXPR (__scope == scope_cta) { - asm("{\n\t .reg .pred P_OUT; \n\t" - "mbarrier.test_wait.acquire.cta.shared::cta.b64 P_OUT, [%1], %2; // 2. \n\t" - "selp.b32 %0, 1, 0, P_OUT; \n" - "}" - : "=r"(__waitComplete) - : "r"(__as_ptr_smem(__addr)), "l"(__state) - : "memory"); - } else _CCCL_IF_CONSTEXPR (__scope == scope_cluster) { - asm("{\n\t .reg .pred P_OUT; \n\t" - "mbarrier.test_wait.acquire.cluster.shared::cta.b64 P_OUT, [%1], %2; // 2. " - "\n\t" - "selp.b32 %0, 1, 0, P_OUT; \n" - "}" - : "=r"(__waitComplete) - : "r"(__as_ptr_smem(__addr)), "l"(__state) - : "memory"); - } return static_cast(__waitComplete);), - ( - // Unsupported architectures will have a linker error with a semi-decent error message - __cuda_ptx_mbarrier_test_wait_is_not_supported_before_SM_90__(); return false;)); +# if _CCCL_CUDA_COMPILER(NVHPC) || __CUDA_ARCH__ >= 900 + _CUDA_VSTD::uint32_t __waitComplete; + _CCCL_IF_CONSTEXPR (__scope == scope_cta) + { + asm("{\n\t .reg .pred P_OUT; \n\t" + "mbarrier.test_wait.acquire.cta.shared::cta.b64 P_OUT, [%1], %2; // 2. \n\t" + "selp.b32 %0, 1, 0, P_OUT; \n" + "}" + : "=r"(__waitComplete) + : "r"(__as_ptr_smem(__addr)), "l"(__state) + : "memory"); + } + else _CCCL_IF_CONSTEXPR (__scope == scope_cluster) + { + asm("{\n\t .reg .pred P_OUT; \n\t" + "mbarrier.test_wait.acquire.cluster.shared::cta.b64 P_OUT, [%1], %2; // 2. 
\n\t" + "selp.b32 %0, 1, 0, P_OUT; \n" + "}" + : "=r"(__waitComplete) + : "r"(__as_ptr_smem(__addr)), "l"(__state) + : "memory"); + } + return static_cast(__waitComplete); +# else + // Unsupported architectures will have a linker error with a semi-decent error message + __cuda_ptx_mbarrier_test_wait_is_not_supported_before_SM_90__(); + return false; +# endif } #endif // __cccl_ptx_isa >= 800 +/* +// mbarrier.test_wait.sem.scope.shared::cta.b64 waitComplete, [addr], state; // PTX ISA 86, SM_90 +// .sem = { .relaxed } +// .scope = { .cta, .cluster } +template +__device__ static inline void mbarrier_test_wait( + cuda::ptx::sem_relaxed_t, + cuda::ptx::scope_t scope, + bool waitComplete, + uint64_t* addr, + const uint64_t& state); +*/ +#if __cccl_ptx_isa >= 860 +extern "C" _CCCL_DEVICE void __cuda_ptx_mbarrier_test_wait_is_not_supported_before_SM_90__(); +template +_CCCL_DEVICE static inline void mbarrier_test_wait( + sem_relaxed_t, + scope_t<_Scope> __scope, + bool __waitComplete, + _CUDA_VSTD::uint64_t* __addr, + const _CUDA_VSTD::uint64_t& __state) +{ + // __sem == sem_relaxed (due to parameter type constraint) + static_assert(__scope == scope_cta || __scope == scope_cluster, ""); +# if _CCCL_CUDA_COMPILER(NVHPC) || __CUDA_ARCH__ >= 900 + _CCCL_IF_CONSTEXPR (__scope == scope_cta) + { + asm("{\n\t .reg .pred PRED_waitComplete; \n\t" + "setp.ne.b32 PRED_waitComplete, %0, 0;\n\t" + "mbarrier.test_wait.relaxed.cta.shared::cta.b64 PRED_waitComplete, [%1], %2;\n\t" + "}" + : + : "r"(static_cast<_CUDA_VSTD::uint32_t>(__waitComplete)), "r"(__as_ptr_smem(__addr)), "l"(__state) + : "memory"); + } + else _CCCL_IF_CONSTEXPR (__scope == scope_cluster) + { + asm("{\n\t .reg .pred PRED_waitComplete; \n\t" + "setp.ne.b32 PRED_waitComplete, %0, 0;\n\t" + "mbarrier.test_wait.relaxed.cluster.shared::cta.b64 PRED_waitComplete, [%1], %2;\n\t" + "}" + : + : "r"(static_cast<_CUDA_VSTD::uint32_t>(__waitComplete)), "r"(__as_ptr_smem(__addr)), "l"(__state) + : "memory"); + } +# else + // Unsupported architectures will have a linker error with a semi-decent error message + __cuda_ptx_mbarrier_test_wait_is_not_supported_before_SM_90__(); +# endif +} +#endif // __cccl_ptx_isa >= 860 + #endif // _CUDA_PTX_GENERATED_MBARRIER_TEST_WAIT_H_ diff --git a/libcudacxx/include/cuda/__ptx/instructions/generated/mbarrier_test_wait_parity.h b/libcudacxx/include/cuda/__ptx/instructions/generated/mbarrier_test_wait_parity.h index b975434b2de..3a281e22087 100644 --- a/libcudacxx/include/cuda/__ptx/instructions/generated/mbarrier_test_wait_parity.h +++ b/libcudacxx/include/cuda/__ptx/instructions/generated/mbarrier_test_wait_parity.h @@ -16,25 +16,26 @@ template _CCCL_DEVICE static inline bool mbarrier_test_wait_parity(_CUDA_VSTD::uint64_t* __addr, const _CUDA_VSTD::uint32_t& __phaseParity) { - NV_IF_ELSE_TARGET( - NV_PROVIDES_SM_80, - (_CUDA_VSTD::uint32_t __waitComplete; - asm("{\n\t .reg .pred P_OUT; \n\t" - "mbarrier.test_wait.parity.shared.b64 P_OUT, [%1], %2; // 3. \n\t" - "selp.b32 %0, 1, 0, P_OUT; \n" - "}" - : "=r"(__waitComplete) - : "r"(__as_ptr_smem(__addr)), "r"(__phaseParity) - : "memory"); - return static_cast(__waitComplete);), - ( - // Unsupported architectures will have a linker error with a semi-decent error message - __cuda_ptx_mbarrier_test_wait_parity_is_not_supported_before_SM_80__(); return false;)); +# if _CCCL_CUDA_COMPILER(NVHPC) || __CUDA_ARCH__ >= 800 + _CUDA_VSTD::uint32_t __waitComplete; + asm("{\n\t .reg .pred P_OUT; \n\t" + "mbarrier.test_wait.parity.shared.b64 P_OUT, [%1], %2; // 3. 
\n\t" + "selp.b32 %0, 1, 0, P_OUT; \n" + "}" + : "=r"(__waitComplete) + : "r"(__as_ptr_smem(__addr)), "r"(__phaseParity) + : "memory"); + return static_cast(__waitComplete); +# else + // Unsupported architectures will have a linker error with a semi-decent error message + __cuda_ptx_mbarrier_test_wait_parity_is_not_supported_before_SM_80__(); + return false; +# endif } #endif // __cccl_ptx_isa >= 710 /* -// mbarrier.test_wait.parity{.sem}{.scope}.shared::cta.b64 waitComplete, [addr], phaseParity; // 4. PTX +// mbarrier.test_wait.parity.sem.scope.shared::cta.b64 waitComplete, [addr], phaseParity; // 4. PTX ISA 80, SM_90 // .sem = { .acquire } // .scope = { .cta, .cluster } @@ -53,30 +54,87 @@ _CCCL_DEVICE static inline bool mbarrier_test_wait_parity( { // __sem == sem_acquire (due to parameter type constraint) static_assert(__scope == scope_cta || __scope == scope_cluster, ""); - NV_IF_ELSE_TARGET( - NV_PROVIDES_SM_90, - ( - _CUDA_VSTD::uint32_t __waitComplete; _CCCL_IF_CONSTEXPR (__scope == scope_cta) { - asm("{\n\t .reg .pred P_OUT; \n\t" - "mbarrier.test_wait.parity.acquire.cta.shared::cta.b64 P_OUT, [%1], %2; // 4. \n\t" - "selp.b32 %0, 1, 0, P_OUT; \n" - "}" - : "=r"(__waitComplete) - : "r"(__as_ptr_smem(__addr)), "r"(__phaseParity) - : "memory"); - } else _CCCL_IF_CONSTEXPR (__scope == scope_cluster) { - asm("{\n\t .reg .pred P_OUT; \n\t" - "mbarrier.test_wait.parity.acquire.cluster.shared::cta.b64 P_OUT, [%1], %2; // 4. \n\t" - "selp.b32 %0, 1, 0, P_OUT; \n" - "}" - : "=r"(__waitComplete) - : "r"(__as_ptr_smem(__addr)), "r"(__phaseParity) - : "memory"); - } return static_cast(__waitComplete);), - ( - // Unsupported architectures will have a linker error with a semi-decent error message - __cuda_ptx_mbarrier_test_wait_parity_is_not_supported_before_SM_90__(); return false;)); +# if _CCCL_CUDA_COMPILER(NVHPC) || __CUDA_ARCH__ >= 900 + _CUDA_VSTD::uint32_t __waitComplete; + _CCCL_IF_CONSTEXPR (__scope == scope_cta) + { + asm("{\n\t .reg .pred P_OUT; \n\t" + "mbarrier.test_wait.parity.acquire.cta.shared::cta.b64 P_OUT, [%1], %2; // 4. \n\t" + "selp.b32 %0, 1, 0, P_OUT; \n" + "}" + : "=r"(__waitComplete) + : "r"(__as_ptr_smem(__addr)), "r"(__phaseParity) + : "memory"); + } + else _CCCL_IF_CONSTEXPR (__scope == scope_cluster) + { + asm("{\n\t .reg .pred P_OUT; \n\t" + "mbarrier.test_wait.parity.acquire.cluster.shared::cta.b64 P_OUT, [%1], %2; // 4. 
\n\t" + "selp.b32 %0, 1, 0, P_OUT; \n" + "}" + : "=r"(__waitComplete) + : "r"(__as_ptr_smem(__addr)), "r"(__phaseParity) + : "memory"); + } + return static_cast(__waitComplete); +# else + // Unsupported architectures will have a linker error with a semi-decent error message + __cuda_ptx_mbarrier_test_wait_parity_is_not_supported_before_SM_90__(); + return false; +# endif } #endif // __cccl_ptx_isa >= 800 +/* +// mbarrier.test_wait.parity.sem.scope.shared::cta.b64 waitComplete, [addr], phaseParity; // PTX ISA 86, SM_90 +// .sem = { .relaxed } +// .scope = { .cta, .cluster } +template +__device__ static inline void mbarrier_test_wait_parity( + cuda::ptx::sem_relaxed_t, + cuda::ptx::scope_t scope, + bool waitComplete, + uint64_t* addr, + const uint32_t& phaseParity); +*/ +#if __cccl_ptx_isa >= 860 +extern "C" _CCCL_DEVICE void __cuda_ptx_mbarrier_test_wait_parity_is_not_supported_before_SM_90__(); +template +_CCCL_DEVICE static inline void mbarrier_test_wait_parity( + sem_relaxed_t, + scope_t<_Scope> __scope, + bool __waitComplete, + _CUDA_VSTD::uint64_t* __addr, + const _CUDA_VSTD::uint32_t& __phaseParity) +{ + // __sem == sem_relaxed (due to parameter type constraint) + static_assert(__scope == scope_cta || __scope == scope_cluster, ""); +# if _CCCL_CUDA_COMPILER(NVHPC) || __CUDA_ARCH__ >= 900 + _CCCL_IF_CONSTEXPR (__scope == scope_cta) + { + asm("{\n\t .reg .pred PRED_waitComplete; \n\t" + "setp.ne.b32 PRED_waitComplete, %0, 0;\n\t" + "mbarrier.test_wait.parity.relaxed.cta.shared::cta.b64 PRED_waitComplete, [%1], %2;\n\t" + "}" + : + : "r"(static_cast<_CUDA_VSTD::uint32_t>(__waitComplete)), "r"(__as_ptr_smem(__addr)), "r"(__phaseParity) + : "memory"); + } + else _CCCL_IF_CONSTEXPR (__scope == scope_cluster) + { + asm("{\n\t .reg .pred PRED_waitComplete; \n\t" + "setp.ne.b32 PRED_waitComplete, %0, 0;\n\t" + "mbarrier.test_wait.parity.relaxed.cluster.shared::cta.b64 PRED_waitComplete, [%1], %2;\n\t" + "}" + : + : "r"(static_cast<_CUDA_VSTD::uint32_t>(__waitComplete)), "r"(__as_ptr_smem(__addr)), "r"(__phaseParity) + : "memory"); + } +# else + // Unsupported architectures will have a linker error with a semi-decent error message + __cuda_ptx_mbarrier_test_wait_parity_is_not_supported_before_SM_90__(); +# endif +} +#endif // __cccl_ptx_isa >= 860 + #endif // _CUDA_PTX_GENERATED_MBARRIER_TEST_WAIT_PARITY_H_ diff --git a/libcudacxx/include/cuda/__ptx/instructions/generated/mbarrier_try_wait.h b/libcudacxx/include/cuda/__ptx/instructions/generated/mbarrier_try_wait.h index dd50a2c9f41..c048136b87a 100644 --- a/libcudacxx/include/cuda/__ptx/instructions/generated/mbarrier_try_wait.h +++ b/libcudacxx/include/cuda/__ptx/instructions/generated/mbarrier_try_wait.h @@ -15,20 +15,21 @@ extern "C" _CCCL_DEVICE void __cuda_ptx_mbarrier_try_wait_is_not_supported_befor template _CCCL_DEVICE static inline bool mbarrier_try_wait(_CUDA_VSTD::uint64_t* __addr, const _CUDA_VSTD::uint64_t& __state) { - NV_IF_ELSE_TARGET( - NV_PROVIDES_SM_90, - (_CUDA_VSTD::uint32_t __waitComplete; - asm("{\n\t .reg .pred P_OUT; \n\t" - "mbarrier.try_wait.shared::cta.b64 P_OUT, [%1], %2; // 5a. 
\n\t" - "selp.b32 %0, 1, 0, P_OUT; \n" - "}" - : "=r"(__waitComplete) - : "r"(__as_ptr_smem(__addr)), "l"(__state) - : "memory"); - return static_cast(__waitComplete);), - ( - // Unsupported architectures will have a linker error with a semi-decent error message - __cuda_ptx_mbarrier_try_wait_is_not_supported_before_SM_90__(); return false;)); +# if _CCCL_CUDA_COMPILER(NVHPC) || __CUDA_ARCH__ >= 900 + _CUDA_VSTD::uint32_t __waitComplete; + asm("{\n\t .reg .pred P_OUT; \n\t" + "mbarrier.try_wait.shared::cta.b64 P_OUT, [%1], %2; // 5a. \n\t" + "selp.b32 %0, 1, 0, P_OUT; \n" + "}" + : "=r"(__waitComplete) + : "r"(__as_ptr_smem(__addr)), "l"(__state) + : "memory"); + return static_cast(__waitComplete); +# else + // Unsupported architectures will have a linker error with a semi-decent error message + __cuda_ptx_mbarrier_try_wait_is_not_supported_before_SM_90__(); + return false; +# endif } #endif // __cccl_ptx_isa >= 780 @@ -46,26 +47,27 @@ template _CCCL_DEVICE static inline bool mbarrier_try_wait( _CUDA_VSTD::uint64_t* __addr, const _CUDA_VSTD::uint64_t& __state, const _CUDA_VSTD::uint32_t& __suspendTimeHint) { - NV_IF_ELSE_TARGET( - NV_PROVIDES_SM_90, - (_CUDA_VSTD::uint32_t __waitComplete; - asm("{\n\t .reg .pred P_OUT; \n\t" - "mbarrier.try_wait.shared::cta.b64 P_OUT, [%1], %2, %3; // 5b. \n\t" - "selp.b32 %0, 1, 0, P_OUT; \n" - "}" - : "=r"(__waitComplete) - : "r"(__as_ptr_smem(__addr)), "l"(__state), "r"(__suspendTimeHint) - : "memory"); - return static_cast(__waitComplete);), - ( - // Unsupported architectures will have a linker error with a semi-decent error message - __cuda_ptx_mbarrier_try_wait_is_not_supported_before_SM_90__(); return false;)); +# if _CCCL_CUDA_COMPILER(NVHPC) || __CUDA_ARCH__ >= 900 + _CUDA_VSTD::uint32_t __waitComplete; + asm("{\n\t .reg .pred P_OUT; \n\t" + "mbarrier.try_wait.shared::cta.b64 P_OUT, [%1], %2, %3; // 5b. \n\t" + "selp.b32 %0, 1, 0, P_OUT; \n" + "}" + : "=r"(__waitComplete) + : "r"(__as_ptr_smem(__addr)), "l"(__state), "r"(__suspendTimeHint) + : "memory"); + return static_cast(__waitComplete); +# else + // Unsupported architectures will have a linker error with a semi-decent error message + __cuda_ptx_mbarrier_try_wait_is_not_supported_before_SM_90__(); + return false; +# endif } #endif // __cccl_ptx_isa >= 780 /* -// mbarrier.try_wait{.sem}{.scope}.shared::cta.b64 waitComplete, [addr], state; // 6a. -PTX ISA 80, SM_90 +// mbarrier.try_wait.sem.scope.shared::cta.b64 waitComplete, [addr], state; // 6a. PTX +ISA 80, SM_90 // .sem = { .acquire } // .scope = { .cta, .cluster } template @@ -83,36 +85,40 @@ _CCCL_DEVICE static inline bool mbarrier_try_wait( { // __sem == sem_acquire (due to parameter type constraint) static_assert(__scope == scope_cta || __scope == scope_cluster, ""); - NV_IF_ELSE_TARGET( - NV_PROVIDES_SM_90, - ( - _CUDA_VSTD::uint32_t __waitComplete; _CCCL_IF_CONSTEXPR (__scope == scope_cta) { - asm("{\n\t .reg .pred P_OUT; \n\t" - "mbarrier.try_wait.acquire.cta.shared::cta.b64 P_OUT, [%1], %2; // 6a. \n\t" - "selp.b32 %0, 1, 0, P_OUT; \n" - "}" - : "=r"(__waitComplete) - : "r"(__as_ptr_smem(__addr)), "l"(__state) - : "memory"); - } else _CCCL_IF_CONSTEXPR (__scope == scope_cluster) { - asm("{\n\t .reg .pred P_OUT; \n\t" - "mbarrier.try_wait.acquire.cluster.shared::cta.b64 P_OUT, [%1], %2; // 6a. 
" - "\n\t" - "selp.b32 %0, 1, 0, P_OUT; \n" - "}" - : "=r"(__waitComplete) - : "r"(__as_ptr_smem(__addr)), "l"(__state) - : "memory"); - } return static_cast(__waitComplete);), - ( - // Unsupported architectures will have a linker error with a semi-decent error message - __cuda_ptx_mbarrier_try_wait_is_not_supported_before_SM_90__(); return false;)); +# if _CCCL_CUDA_COMPILER(NVHPC) || __CUDA_ARCH__ >= 900 + _CUDA_VSTD::uint32_t __waitComplete; + _CCCL_IF_CONSTEXPR (__scope == scope_cta) + { + asm("{\n\t .reg .pred P_OUT; \n\t" + "mbarrier.try_wait.acquire.cta.shared::cta.b64 P_OUT, [%1], %2; // 6a. \n\t" + "selp.b32 %0, 1, 0, P_OUT; \n" + "}" + : "=r"(__waitComplete) + : "r"(__as_ptr_smem(__addr)), "l"(__state) + : "memory"); + } + else _CCCL_IF_CONSTEXPR (__scope == scope_cluster) + { + asm("{\n\t .reg .pred P_OUT; \n\t" + "mbarrier.try_wait.acquire.cluster.shared::cta.b64 P_OUT, [%1], %2; // 6a. \n\t" + "selp.b32 %0, 1, 0, P_OUT; \n" + "}" + : "=r"(__waitComplete) + : "r"(__as_ptr_smem(__addr)), "l"(__state) + : "memory"); + } + return static_cast(__waitComplete); +# else + // Unsupported architectures will have a linker error with a semi-decent error message + __cuda_ptx_mbarrier_try_wait_is_not_supported_before_SM_90__(); + return false; +# endif } #endif // __cccl_ptx_isa >= 800 /* -// mbarrier.try_wait{.sem}{.scope}.shared::cta.b64 waitComplete, [addr], state , suspendTimeHint; // 6b. -PTX ISA 80, SM_90 +// mbarrier.try_wait.sem.scope.shared::cta.b64 waitComplete, [addr], state , suspendTimeHint; // 6b. PTX +ISA 80, SM_90 // .sem = { .acquire } // .scope = { .cta, .cluster } template @@ -135,30 +141,147 @@ _CCCL_DEVICE static inline bool mbarrier_try_wait( { // __sem == sem_acquire (due to parameter type constraint) static_assert(__scope == scope_cta || __scope == scope_cluster, ""); - NV_IF_ELSE_TARGET( - NV_PROVIDES_SM_90, - ( - _CUDA_VSTD::uint32_t __waitComplete; _CCCL_IF_CONSTEXPR (__scope == scope_cta) { - asm("{\n\t .reg .pred P_OUT; \n\t" - "mbarrier.try_wait.acquire.cta.shared::cta.b64 P_OUT, [%1], %2 , %3; // 6b. \n\t" - "selp.b32 %0, 1, 0, P_OUT; \n" - "}" - : "=r"(__waitComplete) - : "r"(__as_ptr_smem(__addr)), "l"(__state), "r"(__suspendTimeHint) - : "memory"); - } else _CCCL_IF_CONSTEXPR (__scope == scope_cluster) { - asm("{\n\t .reg .pred P_OUT; \n\t" - "mbarrier.try_wait.acquire.cluster.shared::cta.b64 P_OUT, [%1], %2 , %3; // 6b. \n\t" - "selp.b32 %0, 1, 0, P_OUT; \n" - "}" - : "=r"(__waitComplete) - : "r"(__as_ptr_smem(__addr)), "l"(__state), "r"(__suspendTimeHint) - : "memory"); - } return static_cast(__waitComplete);), - ( - // Unsupported architectures will have a linker error with a semi-decent error message - __cuda_ptx_mbarrier_try_wait_is_not_supported_before_SM_90__(); return false;)); +# if _CCCL_CUDA_COMPILER(NVHPC) || __CUDA_ARCH__ >= 900 + _CUDA_VSTD::uint32_t __waitComplete; + _CCCL_IF_CONSTEXPR (__scope == scope_cta) + { + asm("{\n\t .reg .pred P_OUT; \n\t" + "mbarrier.try_wait.acquire.cta.shared::cta.b64 P_OUT, [%1], %2 , %3; // 6b. \n\t" + "selp.b32 %0, 1, 0, P_OUT; \n" + "}" + : "=r"(__waitComplete) + : "r"(__as_ptr_smem(__addr)), "l"(__state), "r"(__suspendTimeHint) + : "memory"); + } + else _CCCL_IF_CONSTEXPR (__scope == scope_cluster) + { + asm("{\n\t .reg .pred P_OUT; \n\t" + "mbarrier.try_wait.acquire.cluster.shared::cta.b64 P_OUT, [%1], %2 , %3; // 6b. 
\n\t" + "selp.b32 %0, 1, 0, P_OUT; \n" + "}" + : "=r"(__waitComplete) + : "r"(__as_ptr_smem(__addr)), "l"(__state), "r"(__suspendTimeHint) + : "memory"); + } + return static_cast(__waitComplete); +# else + // Unsupported architectures will have a linker error with a semi-decent error message + __cuda_ptx_mbarrier_try_wait_is_not_supported_before_SM_90__(); + return false; +# endif } #endif // __cccl_ptx_isa >= 800 +/* +// mbarrier.try_wait.sem.scope.shared::cta.b64 waitComplete, [addr], state, suspendTimeHint; // PTX ISA 86, SM_90 +// .sem = { .relaxed } +// .scope = { .cta, .cluster } +template +__device__ static inline void mbarrier_try_wait( + cuda::ptx::sem_relaxed_t, + cuda::ptx::scope_t scope, + bool waitComplete, + uint64_t* addr, + const uint64_t& state, + const uint32_t& suspendTimeHint); +*/ +#if __cccl_ptx_isa >= 860 +extern "C" _CCCL_DEVICE void __cuda_ptx_mbarrier_try_wait_is_not_supported_before_SM_90__(); +template +_CCCL_DEVICE static inline void mbarrier_try_wait( + sem_relaxed_t, + scope_t<_Scope> __scope, + bool __waitComplete, + _CUDA_VSTD::uint64_t* __addr, + const _CUDA_VSTD::uint64_t& __state, + const _CUDA_VSTD::uint32_t& __suspendTimeHint) +{ + // __sem == sem_relaxed (due to parameter type constraint) + static_assert(__scope == scope_cta || __scope == scope_cluster, ""); +# if _CCCL_CUDA_COMPILER(NVHPC) || __CUDA_ARCH__ >= 900 + _CCCL_IF_CONSTEXPR (__scope == scope_cta) + { + asm("{\n\t .reg .pred PRED_waitComplete; \n\t" + "setp.ne.b32 PRED_waitComplete, %0, 0;\n\t" + "mbarrier.try_wait.relaxed.cta.shared::cta.b64 PRED_waitComplete, [%1], %2, %3;\n\t" + "}" + : + : "r"(static_cast<_CUDA_VSTD::uint32_t>(__waitComplete)), + "r"(__as_ptr_smem(__addr)), + "l"(__state), + "r"(__suspendTimeHint) + : "memory"); + } + else _CCCL_IF_CONSTEXPR (__scope == scope_cluster) + { + asm("{\n\t .reg .pred PRED_waitComplete; \n\t" + "setp.ne.b32 PRED_waitComplete, %0, 0;\n\t" + "mbarrier.try_wait.relaxed.cluster.shared::cta.b64 PRED_waitComplete, [%1], %2, %3;\n\t" + "}" + : + : "r"(static_cast<_CUDA_VSTD::uint32_t>(__waitComplete)), + "r"(__as_ptr_smem(__addr)), + "l"(__state), + "r"(__suspendTimeHint) + : "memory"); + } +# else + // Unsupported architectures will have a linker error with a semi-decent error message + __cuda_ptx_mbarrier_try_wait_is_not_supported_before_SM_90__(); +# endif +} +#endif // __cccl_ptx_isa >= 860 + +/* +// mbarrier.try_wait.sem.scope.shared::cta.b64 waitComplete, [addr], state; // PTX ISA 86, SM_90 +// .sem = { .relaxed } +// .scope = { .cta, .cluster } +template +__device__ static inline void mbarrier_try_wait( + cuda::ptx::sem_relaxed_t, + cuda::ptx::scope_t scope, + bool waitComplete, + uint64_t* addr, + const uint64_t& state); +*/ +#if __cccl_ptx_isa >= 860 +extern "C" _CCCL_DEVICE void __cuda_ptx_mbarrier_try_wait_is_not_supported_before_SM_90__(); +template +_CCCL_DEVICE static inline void mbarrier_try_wait( + sem_relaxed_t, + scope_t<_Scope> __scope, + bool __waitComplete, + _CUDA_VSTD::uint64_t* __addr, + const _CUDA_VSTD::uint64_t& __state) +{ + // __sem == sem_relaxed (due to parameter type constraint) + static_assert(__scope == scope_cta || __scope == scope_cluster, ""); +# if _CCCL_CUDA_COMPILER(NVHPC) || __CUDA_ARCH__ >= 900 + _CCCL_IF_CONSTEXPR (__scope == scope_cta) + { + asm("{\n\t .reg .pred PRED_waitComplete; \n\t" + "setp.ne.b32 PRED_waitComplete, %0, 0;\n\t" + "mbarrier.try_wait.relaxed.cta.shared::cta.b64 PRED_waitComplete, [%1], %2;\n\t" + "}" + : + : "r"(static_cast<_CUDA_VSTD::uint32_t>(__waitComplete)), 
"r"(__as_ptr_smem(__addr)), "l"(__state) + : "memory"); + } + else _CCCL_IF_CONSTEXPR (__scope == scope_cluster) + { + asm("{\n\t .reg .pred PRED_waitComplete; \n\t" + "setp.ne.b32 PRED_waitComplete, %0, 0;\n\t" + "mbarrier.try_wait.relaxed.cluster.shared::cta.b64 PRED_waitComplete, [%1], %2;\n\t" + "}" + : + : "r"(static_cast<_CUDA_VSTD::uint32_t>(__waitComplete)), "r"(__as_ptr_smem(__addr)), "l"(__state) + : "memory"); + } +# else + // Unsupported architectures will have a linker error with a semi-decent error message + __cuda_ptx_mbarrier_try_wait_is_not_supported_before_SM_90__(); +# endif +} +#endif // __cccl_ptx_isa >= 860 + #endif // _CUDA_PTX_GENERATED_MBARRIER_TRY_WAIT_H_ diff --git a/libcudacxx/include/cuda/__ptx/instructions/generated/mbarrier_try_wait_parity.h b/libcudacxx/include/cuda/__ptx/instructions/generated/mbarrier_try_wait_parity.h index d3deb3ca1d5..0d6f7d3a9df 100644 --- a/libcudacxx/include/cuda/__ptx/instructions/generated/mbarrier_try_wait_parity.h +++ b/libcudacxx/include/cuda/__ptx/instructions/generated/mbarrier_try_wait_parity.h @@ -16,20 +16,21 @@ template _CCCL_DEVICE static inline bool mbarrier_try_wait_parity(_CUDA_VSTD::uint64_t* __addr, const _CUDA_VSTD::uint32_t& __phaseParity) { - NV_IF_ELSE_TARGET( - NV_PROVIDES_SM_90, - (_CUDA_VSTD::uint32_t __waitComplete; - asm("{\n\t .reg .pred P_OUT; \n\t" - "mbarrier.try_wait.parity.shared::cta.b64 P_OUT, [%1], %2; // 7a. \n\t" - "selp.b32 %0, 1, 0, P_OUT; \n" - "}" - : "=r"(__waitComplete) - : "r"(__as_ptr_smem(__addr)), "r"(__phaseParity) - : "memory"); - return static_cast(__waitComplete);), - ( - // Unsupported architectures will have a linker error with a semi-decent error message - __cuda_ptx_mbarrier_try_wait_parity_is_not_supported_before_SM_90__(); return false;)); +# if _CCCL_CUDA_COMPILER(NVHPC) || __CUDA_ARCH__ >= 900 + _CUDA_VSTD::uint32_t __waitComplete; + asm("{\n\t .reg .pred P_OUT; \n\t" + "mbarrier.try_wait.parity.shared::cta.b64 P_OUT, [%1], %2; // 7a. \n\t" + "selp.b32 %0, 1, 0, P_OUT; \n" + "}" + : "=r"(__waitComplete) + : "r"(__as_ptr_smem(__addr)), "r"(__phaseParity) + : "memory"); + return static_cast(__waitComplete); +# else + // Unsupported architectures will have a linker error with a semi-decent error message + __cuda_ptx_mbarrier_try_wait_parity_is_not_supported_before_SM_90__(); + return false; +# endif } #endif // __cccl_ptx_isa >= 780 @@ -47,26 +48,27 @@ template _CCCL_DEVICE static inline bool mbarrier_try_wait_parity( _CUDA_VSTD::uint64_t* __addr, const _CUDA_VSTD::uint32_t& __phaseParity, const _CUDA_VSTD::uint32_t& __suspendTimeHint) { - NV_IF_ELSE_TARGET( - NV_PROVIDES_SM_90, - (_CUDA_VSTD::uint32_t __waitComplete; - asm("{\n\t .reg .pred P_OUT; \n\t" - "mbarrier.try_wait.parity.shared::cta.b64 P_OUT, [%1], %2, %3; // 7b. \n\t" - "selp.b32 %0, 1, 0, P_OUT; \n" - "}" - : "=r"(__waitComplete) - : "r"(__as_ptr_smem(__addr)), "r"(__phaseParity), "r"(__suspendTimeHint) - : "memory"); - return static_cast(__waitComplete);), - ( - // Unsupported architectures will have a linker error with a semi-decent error message - __cuda_ptx_mbarrier_try_wait_parity_is_not_supported_before_SM_90__(); return false;)); +# if _CCCL_CUDA_COMPILER(NVHPC) || __CUDA_ARCH__ >= 900 + _CUDA_VSTD::uint32_t __waitComplete; + asm("{\n\t .reg .pred P_OUT; \n\t" + "mbarrier.try_wait.parity.shared::cta.b64 P_OUT, [%1], %2, %3; // 7b. 
\n\t" + "selp.b32 %0, 1, 0, P_OUT; \n" + "}" + : "=r"(__waitComplete) + : "r"(__as_ptr_smem(__addr)), "r"(__phaseParity), "r"(__suspendTimeHint) + : "memory"); + return static_cast(__waitComplete); +# else + // Unsupported architectures will have a linker error with a semi-decent error message + __cuda_ptx_mbarrier_try_wait_parity_is_not_supported_before_SM_90__(); + return false; +# endif } #endif // __cccl_ptx_isa >= 780 /* -// mbarrier.try_wait.parity{.sem}{.scope}.shared::cta.b64 waitComplete, [addr], phaseParity; // 8a. -PTX ISA 80, SM_90 +// mbarrier.try_wait.parity.sem.scope.shared::cta.b64 waitComplete, [addr], phaseParity; // 8a. PTX +ISA 80, SM_90 // .sem = { .acquire } // .scope = { .cta, .cluster } template @@ -84,35 +86,40 @@ _CCCL_DEVICE static inline bool mbarrier_try_wait_parity( { // __sem == sem_acquire (due to parameter type constraint) static_assert(__scope == scope_cta || __scope == scope_cluster, ""); - NV_IF_ELSE_TARGET( - NV_PROVIDES_SM_90, - ( - _CUDA_VSTD::uint32_t __waitComplete; _CCCL_IF_CONSTEXPR (__scope == scope_cta) { - asm("{\n\t .reg .pred P_OUT; \n\t" - "mbarrier.try_wait.parity.acquire.cta.shared::cta.b64 P_OUT, [%1], %2; // 8a. \n\t" - "selp.b32 %0, 1, 0, P_OUT; \n" - "}" - : "=r"(__waitComplete) - : "r"(__as_ptr_smem(__addr)), "r"(__phaseParity) - : "memory"); - } else _CCCL_IF_CONSTEXPR (__scope == scope_cluster) { - asm("{\n\t .reg .pred P_OUT; \n\t" - "mbarrier.try_wait.parity.acquire.cluster.shared::cta.b64 P_OUT, [%1], %2; // 8a. \n\t" - "selp.b32 %0, 1, 0, P_OUT; \n" - "}" - : "=r"(__waitComplete) - : "r"(__as_ptr_smem(__addr)), "r"(__phaseParity) - : "memory"); - } return static_cast(__waitComplete);), - ( - // Unsupported architectures will have a linker error with a semi-decent error message - __cuda_ptx_mbarrier_try_wait_parity_is_not_supported_before_SM_90__(); return false;)); +# if _CCCL_CUDA_COMPILER(NVHPC) || __CUDA_ARCH__ >= 900 + _CUDA_VSTD::uint32_t __waitComplete; + _CCCL_IF_CONSTEXPR (__scope == scope_cta) + { + asm("{\n\t .reg .pred P_OUT; \n\t" + "mbarrier.try_wait.parity.acquire.cta.shared::cta.b64 P_OUT, [%1], %2; // 8a. \n\t" + "selp.b32 %0, 1, 0, P_OUT; \n" + "}" + : "=r"(__waitComplete) + : "r"(__as_ptr_smem(__addr)), "r"(__phaseParity) + : "memory"); + } + else _CCCL_IF_CONSTEXPR (__scope == scope_cluster) + { + asm("{\n\t .reg .pred P_OUT; \n\t" + "mbarrier.try_wait.parity.acquire.cluster.shared::cta.b64 P_OUT, [%1], %2; // 8a. \n\t" + "selp.b32 %0, 1, 0, P_OUT; \n" + "}" + : "=r"(__waitComplete) + : "r"(__as_ptr_smem(__addr)), "r"(__phaseParity) + : "memory"); + } + return static_cast(__waitComplete); +# else + // Unsupported architectures will have a linker error with a semi-decent error message + __cuda_ptx_mbarrier_try_wait_parity_is_not_supported_before_SM_90__(); + return false; +# endif } #endif // __cccl_ptx_isa >= 800 /* -// mbarrier.try_wait.parity{.sem}{.scope}.shared::cta.b64 waitComplete, [addr], phaseParity, suspendTimeHint; // 8b. -PTX ISA 80, SM_90 +// mbarrier.try_wait.parity.sem.scope.shared::cta.b64 waitComplete, [addr], phaseParity, suspendTimeHint; // 8b. 
PTX +ISA 80, SM_90 // .sem = { .acquire } // .scope = { .cta, .cluster } template @@ -135,30 +142,148 @@ _CCCL_DEVICE static inline bool mbarrier_try_wait_parity( { // __sem == sem_acquire (due to parameter type constraint) static_assert(__scope == scope_cta || __scope == scope_cluster, ""); - NV_IF_ELSE_TARGET( - NV_PROVIDES_SM_90, - ( - _CUDA_VSTD::uint32_t __waitComplete; _CCCL_IF_CONSTEXPR (__scope == scope_cta) { - asm("{\n\t .reg .pred P_OUT; \n\t" - "mbarrier.try_wait.parity.acquire.cta.shared::cta.b64 P_OUT, [%1], %2, %3; // 8b. \n\t" - "selp.b32 %0, 1, 0, P_OUT; \n" - "}" - : "=r"(__waitComplete) - : "r"(__as_ptr_smem(__addr)), "r"(__phaseParity), "r"(__suspendTimeHint) - : "memory"); - } else _CCCL_IF_CONSTEXPR (__scope == scope_cluster) { - asm("{\n\t .reg .pred P_OUT; \n\t" - "mbarrier.try_wait.parity.acquire.cluster.shared::cta.b64 P_OUT, [%1], %2, %3; // 8b. \n\t" - "selp.b32 %0, 1, 0, P_OUT; \n" - "}" - : "=r"(__waitComplete) - : "r"(__as_ptr_smem(__addr)), "r"(__phaseParity), "r"(__suspendTimeHint) - : "memory"); - } return static_cast(__waitComplete);), - ( - // Unsupported architectures will have a linker error with a semi-decent error message - __cuda_ptx_mbarrier_try_wait_parity_is_not_supported_before_SM_90__(); return false;)); +# if _CCCL_CUDA_COMPILER(NVHPC) || __CUDA_ARCH__ >= 900 + _CUDA_VSTD::uint32_t __waitComplete; + _CCCL_IF_CONSTEXPR (__scope == scope_cta) + { + asm("{\n\t .reg .pred P_OUT; \n\t" + "mbarrier.try_wait.parity.acquire.cta.shared::cta.b64 P_OUT, [%1], %2, %3; // 8b. \n\t" + "selp.b32 %0, 1, 0, P_OUT; \n" + "}" + : "=r"(__waitComplete) + : "r"(__as_ptr_smem(__addr)), "r"(__phaseParity), "r"(__suspendTimeHint) + : "memory"); + } + else _CCCL_IF_CONSTEXPR (__scope == scope_cluster) + { + asm("{\n\t .reg .pred P_OUT; \n\t" + "mbarrier.try_wait.parity.acquire.cluster.shared::cta.b64 P_OUT, [%1], %2, %3; // 8b. 
\n\t" + "selp.b32 %0, 1, 0, P_OUT; \n" + "}" + : "=r"(__waitComplete) + : "r"(__as_ptr_smem(__addr)), "r"(__phaseParity), "r"(__suspendTimeHint) + : "memory"); + } + return static_cast(__waitComplete); +# else + // Unsupported architectures will have a linker error with a semi-decent error message + __cuda_ptx_mbarrier_try_wait_parity_is_not_supported_before_SM_90__(); + return false; +# endif } #endif // __cccl_ptx_isa >= 800 +/* +// mbarrier.try_wait.parity.sem.scope.shared::cta.b64 waitComplete, [addr], phaseParity, suspendTimeHint; // PTX ISA 86, +SM_90 +// .sem = { .relaxed } +// .scope = { .cta, .cluster } +template +__device__ static inline void mbarrier_try_wait_parity( + cuda::ptx::sem_relaxed_t, + cuda::ptx::scope_t scope, + bool waitComplete, + uint64_t* addr, + const uint32_t& phaseParity, + const uint32_t& suspendTimeHint); +*/ +#if __cccl_ptx_isa >= 860 +extern "C" _CCCL_DEVICE void __cuda_ptx_mbarrier_try_wait_parity_is_not_supported_before_SM_90__(); +template +_CCCL_DEVICE static inline void mbarrier_try_wait_parity( + sem_relaxed_t, + scope_t<_Scope> __scope, + bool __waitComplete, + _CUDA_VSTD::uint64_t* __addr, + const _CUDA_VSTD::uint32_t& __phaseParity, + const _CUDA_VSTD::uint32_t& __suspendTimeHint) +{ + // __sem == sem_relaxed (due to parameter type constraint) + static_assert(__scope == scope_cta || __scope == scope_cluster, ""); +# if _CCCL_CUDA_COMPILER(NVHPC) || __CUDA_ARCH__ >= 900 + _CCCL_IF_CONSTEXPR (__scope == scope_cta) + { + asm("{\n\t .reg .pred PRED_waitComplete; \n\t" + "setp.ne.b32 PRED_waitComplete, %0, 0;\n\t" + "mbarrier.try_wait.parity.relaxed.cta.shared::cta.b64 PRED_waitComplete, [%1], %2, %3;\n\t" + "}" + : + : "r"(static_cast<_CUDA_VSTD::uint32_t>(__waitComplete)), + "r"(__as_ptr_smem(__addr)), + "r"(__phaseParity), + "r"(__suspendTimeHint) + : "memory"); + } + else _CCCL_IF_CONSTEXPR (__scope == scope_cluster) + { + asm("{\n\t .reg .pred PRED_waitComplete; \n\t" + "setp.ne.b32 PRED_waitComplete, %0, 0;\n\t" + "mbarrier.try_wait.parity.relaxed.cluster.shared::cta.b64 PRED_waitComplete, [%1], %2, %3;\n\t" + "}" + : + : "r"(static_cast<_CUDA_VSTD::uint32_t>(__waitComplete)), + "r"(__as_ptr_smem(__addr)), + "r"(__phaseParity), + "r"(__suspendTimeHint) + : "memory"); + } +# else + // Unsupported architectures will have a linker error with a semi-decent error message + __cuda_ptx_mbarrier_try_wait_parity_is_not_supported_before_SM_90__(); +# endif +} +#endif // __cccl_ptx_isa >= 860 + +/* +// mbarrier.try_wait.parity.sem.scope.shared::cta.b64 waitComplete, [addr], phaseParity; // PTX ISA 86, SM_90 +// .sem = { .relaxed } +// .scope = { .cta, .cluster } +template +__device__ static inline void mbarrier_try_wait_parity( + cuda::ptx::sem_relaxed_t, + cuda::ptx::scope_t scope, + bool waitComplete, + uint64_t* addr, + const uint32_t& phaseParity); +*/ +#if __cccl_ptx_isa >= 860 +extern "C" _CCCL_DEVICE void __cuda_ptx_mbarrier_try_wait_parity_is_not_supported_before_SM_90__(); +template +_CCCL_DEVICE static inline void mbarrier_try_wait_parity( + sem_relaxed_t, + scope_t<_Scope> __scope, + bool __waitComplete, + _CUDA_VSTD::uint64_t* __addr, + const _CUDA_VSTD::uint32_t& __phaseParity) +{ + // __sem == sem_relaxed (due to parameter type constraint) + static_assert(__scope == scope_cta || __scope == scope_cluster, ""); +# if _CCCL_CUDA_COMPILER(NVHPC) || __CUDA_ARCH__ >= 900 + _CCCL_IF_CONSTEXPR (__scope == scope_cta) + { + asm("{\n\t .reg .pred PRED_waitComplete; \n\t" + "setp.ne.b32 PRED_waitComplete, %0, 0;\n\t" + 
"mbarrier.try_wait.parity.relaxed.cta.shared::cta.b64 PRED_waitComplete, [%1], %2;\n\t" + "}" + : + : "r"(static_cast<_CUDA_VSTD::uint32_t>(__waitComplete)), "r"(__as_ptr_smem(__addr)), "r"(__phaseParity) + : "memory"); + } + else _CCCL_IF_CONSTEXPR (__scope == scope_cluster) + { + asm("{\n\t .reg .pred PRED_waitComplete; \n\t" + "setp.ne.b32 PRED_waitComplete, %0, 0;\n\t" + "mbarrier.try_wait.parity.relaxed.cluster.shared::cta.b64 PRED_waitComplete, [%1], %2;\n\t" + "}" + : + : "r"(static_cast<_CUDA_VSTD::uint32_t>(__waitComplete)), "r"(__as_ptr_smem(__addr)), "r"(__phaseParity) + : "memory"); + } +# else + // Unsupported architectures will have a linker error with a semi-decent error message + __cuda_ptx_mbarrier_try_wait_parity_is_not_supported_before_SM_90__(); +# endif +} +#endif // __cccl_ptx_isa >= 860 + #endif // _CUDA_PTX_GENERATED_MBARRIER_TRY_WAIT_PARITY_H_ diff --git a/libcudacxx/include/cuda/__ptx/instructions/generated/multimem_ld_reduce.h b/libcudacxx/include/cuda/__ptx/instructions/generated/multimem_ld_reduce.h new file mode 100644 index 00000000000..51de5257bba --- /dev/null +++ b/libcudacxx/include/cuda/__ptx/instructions/generated/multimem_ld_reduce.h @@ -0,0 +1,2148 @@ +// This file was automatically generated. Do not edit. + +#ifndef _CUDA_PTX_GENERATED_MULTIMEM_LD_REDUCE_H_ +#define _CUDA_PTX_GENERATED_MULTIMEM_LD_REDUCE_H_ + +/* +// multimem.ld_reduce.sem.global.op.u32 dest, [addr]; // PTX ISA 81, SM_90 +// .sem = { .weak } +// .op = { .min } +template +__device__ static inline uint32_t multimem_ld_reduce( + cuda::ptx::sem_weak_t, + cuda::ptx::op_min_t, + const uint32_t* addr); +*/ +#if __cccl_ptx_isa >= 810 +extern "C" _CCCL_DEVICE void __cuda_ptx_multimem_ld_reduce_is_not_supported_before_SM_90__(); +template +_CCCL_DEVICE static inline _CUDA_VSTD::uint32_t +multimem_ld_reduce(sem_weak_t, op_min_t, const _CUDA_VSTD::uint32_t* __addr) +{ +// __sem == sem_weak (due to parameter type constraint) +// __op == op_min (due to parameter type constraint) +# if _CCCL_CUDA_COMPILER(NVHPC) || __CUDA_ARCH__ >= 900 + _CUDA_VSTD::uint32_t __dest; + asm("multimem.ld_reduce.weak.global.min.u32 %0, [%1];" : "=r"(__dest) : "l"(__as_ptr_gmem(__addr)) : "memory"); + return __dest; +# else + // Unsupported architectures will have a linker error with a semi-decent error message + __cuda_ptx_multimem_ld_reduce_is_not_supported_before_SM_90__(); + return 0; +# endif +} +#endif // __cccl_ptx_isa >= 810 + +/* +// multimem.ld_reduce.sem.scope.global.op.u32 dest, [addr]; // PTX ISA 81, SM_90 +// .sem = { .relaxed, .acquire } +// .scope = { .cta, .cluster, .gpu, .sys } +// .op = { .min } +template +__device__ static inline uint32_t multimem_ld_reduce( + cuda::ptx::sem_t sem, + cuda::ptx::scope_t scope, + cuda::ptx::op_min_t, + const uint32_t* addr); +*/ +#if __cccl_ptx_isa >= 810 +extern "C" _CCCL_DEVICE void __cuda_ptx_multimem_ld_reduce_is_not_supported_before_SM_90__(); +template +_CCCL_DEVICE static inline _CUDA_VSTD::uint32_t +multimem_ld_reduce(sem_t<_Sem> __sem, scope_t<_Scope> __scope, op_min_t, const _CUDA_VSTD::uint32_t* __addr) +{ + static_assert(__sem == sem_relaxed || __sem == sem_acquire, ""); + static_assert(__scope == scope_cta || __scope == scope_cluster || __scope == scope_gpu || __scope == scope_sys, ""); +// __op == op_min (due to parameter type constraint) +# if _CCCL_CUDA_COMPILER(NVHPC) || __CUDA_ARCH__ >= 900 + _CUDA_VSTD::uint32_t __dest; + _CCCL_IF_CONSTEXPR (__sem == sem_relaxed && __scope == scope_cta) + { + asm("multimem.ld_reduce.relaxed.cta.global.min.u32 %0, 
[%1];" + : "=r"(__dest) + : "l"(__as_ptr_gmem(__addr)) + : "memory"); + } + else _CCCL_IF_CONSTEXPR (__sem == sem_relaxed && __scope == scope_cluster) + { + asm("multimem.ld_reduce.relaxed.cluster.global.min.u32 %0, [%1];" + : "=r"(__dest) + : "l"(__as_ptr_gmem(__addr)) + : "memory"); + } + else _CCCL_IF_CONSTEXPR (__sem == sem_relaxed && __scope == scope_gpu) + { + asm("multimem.ld_reduce.relaxed.gpu.global.min.u32 %0, [%1];" + : "=r"(__dest) + : "l"(__as_ptr_gmem(__addr)) + : "memory"); + } + else _CCCL_IF_CONSTEXPR (__sem == sem_relaxed && __scope == scope_sys) + { + asm("multimem.ld_reduce.relaxed.sys.global.min.u32 %0, [%1];" + : "=r"(__dest) + : "l"(__as_ptr_gmem(__addr)) + : "memory"); + } + else _CCCL_IF_CONSTEXPR (__sem == sem_acquire && __scope == scope_cta) + { + asm("multimem.ld_reduce.acquire.cta.global.min.u32 %0, [%1];" + : "=r"(__dest) + : "l"(__as_ptr_gmem(__addr)) + : "memory"); + } + else _CCCL_IF_CONSTEXPR (__sem == sem_acquire && __scope == scope_cluster) + { + asm("multimem.ld_reduce.acquire.cluster.global.min.u32 %0, [%1];" + : "=r"(__dest) + : "l"(__as_ptr_gmem(__addr)) + : "memory"); + } + else _CCCL_IF_CONSTEXPR (__sem == sem_acquire && __scope == scope_gpu) + { + asm("multimem.ld_reduce.acquire.gpu.global.min.u32 %0, [%1];" + : "=r"(__dest) + : "l"(__as_ptr_gmem(__addr)) + : "memory"); + } + else _CCCL_IF_CONSTEXPR (__sem == sem_acquire && __scope == scope_sys) + { + asm("multimem.ld_reduce.acquire.sys.global.min.u32 %0, [%1];" + : "=r"(__dest) + : "l"(__as_ptr_gmem(__addr)) + : "memory"); + } + return __dest; +# else + // Unsupported architectures will have a linker error with a semi-decent error message + __cuda_ptx_multimem_ld_reduce_is_not_supported_before_SM_90__(); + return 0; +# endif +} +#endif // __cccl_ptx_isa >= 810 + +/* +// multimem.ld_reduce.sem.global.op.u64 dest, [addr]; // PTX ISA 81, SM_90 +// .sem = { .weak } +// .op = { .min } +template +__device__ static inline uint64_t multimem_ld_reduce( + cuda::ptx::sem_weak_t, + cuda::ptx::op_min_t, + const uint64_t* addr); +*/ +#if __cccl_ptx_isa >= 810 +extern "C" _CCCL_DEVICE void __cuda_ptx_multimem_ld_reduce_is_not_supported_before_SM_90__(); +template +_CCCL_DEVICE static inline _CUDA_VSTD::uint64_t +multimem_ld_reduce(sem_weak_t, op_min_t, const _CUDA_VSTD::uint64_t* __addr) +{ +// __sem == sem_weak (due to parameter type constraint) +// __op == op_min (due to parameter type constraint) +# if _CCCL_CUDA_COMPILER(NVHPC) || __CUDA_ARCH__ >= 900 + _CUDA_VSTD::uint64_t __dest; + asm("multimem.ld_reduce.weak.global.min.u64 %0, [%1];" : "=l"(__dest) : "l"(__as_ptr_gmem(__addr)) : "memory"); + return __dest; +# else + // Unsupported architectures will have a linker error with a semi-decent error message + __cuda_ptx_multimem_ld_reduce_is_not_supported_before_SM_90__(); + return 0; +# endif +} +#endif // __cccl_ptx_isa >= 810 + +/* +// multimem.ld_reduce.sem.scope.global.op.u64 dest, [addr]; // PTX ISA 81, SM_90 +// .sem = { .relaxed, .acquire } +// .scope = { .cta, .cluster, .gpu, .sys } +// .op = { .min } +template +__device__ static inline uint64_t multimem_ld_reduce( + cuda::ptx::sem_t sem, + cuda::ptx::scope_t scope, + cuda::ptx::op_min_t, + const uint64_t* addr); +*/ +#if __cccl_ptx_isa >= 810 +extern "C" _CCCL_DEVICE void __cuda_ptx_multimem_ld_reduce_is_not_supported_before_SM_90__(); +template +_CCCL_DEVICE static inline _CUDA_VSTD::uint64_t +multimem_ld_reduce(sem_t<_Sem> __sem, scope_t<_Scope> __scope, op_min_t, const _CUDA_VSTD::uint64_t* __addr) +{ + static_assert(__sem == sem_relaxed || __sem 
== sem_acquire, ""); + static_assert(__scope == scope_cta || __scope == scope_cluster || __scope == scope_gpu || __scope == scope_sys, ""); +// __op == op_min (due to parameter type constraint) +# if _CCCL_CUDA_COMPILER(NVHPC) || __CUDA_ARCH__ >= 900 + _CUDA_VSTD::uint64_t __dest; + _CCCL_IF_CONSTEXPR (__sem == sem_relaxed && __scope == scope_cta) + { + asm("multimem.ld_reduce.relaxed.cta.global.min.u64 %0, [%1];" + : "=l"(__dest) + : "l"(__as_ptr_gmem(__addr)) + : "memory"); + } + else _CCCL_IF_CONSTEXPR (__sem == sem_relaxed && __scope == scope_cluster) + { + asm("multimem.ld_reduce.relaxed.cluster.global.min.u64 %0, [%1];" + : "=l"(__dest) + : "l"(__as_ptr_gmem(__addr)) + : "memory"); + } + else _CCCL_IF_CONSTEXPR (__sem == sem_relaxed && __scope == scope_gpu) + { + asm("multimem.ld_reduce.relaxed.gpu.global.min.u64 %0, [%1];" + : "=l"(__dest) + : "l"(__as_ptr_gmem(__addr)) + : "memory"); + } + else _CCCL_IF_CONSTEXPR (__sem == sem_relaxed && __scope == scope_sys) + { + asm("multimem.ld_reduce.relaxed.sys.global.min.u64 %0, [%1];" + : "=l"(__dest) + : "l"(__as_ptr_gmem(__addr)) + : "memory"); + } + else _CCCL_IF_CONSTEXPR (__sem == sem_acquire && __scope == scope_cta) + { + asm("multimem.ld_reduce.acquire.cta.global.min.u64 %0, [%1];" + : "=l"(__dest) + : "l"(__as_ptr_gmem(__addr)) + : "memory"); + } + else _CCCL_IF_CONSTEXPR (__sem == sem_acquire && __scope == scope_cluster) + { + asm("multimem.ld_reduce.acquire.cluster.global.min.u64 %0, [%1];" + : "=l"(__dest) + : "l"(__as_ptr_gmem(__addr)) + : "memory"); + } + else _CCCL_IF_CONSTEXPR (__sem == sem_acquire && __scope == scope_gpu) + { + asm("multimem.ld_reduce.acquire.gpu.global.min.u64 %0, [%1];" + : "=l"(__dest) + : "l"(__as_ptr_gmem(__addr)) + : "memory"); + } + else _CCCL_IF_CONSTEXPR (__sem == sem_acquire && __scope == scope_sys) + { + asm("multimem.ld_reduce.acquire.sys.global.min.u64 %0, [%1];" + : "=l"(__dest) + : "l"(__as_ptr_gmem(__addr)) + : "memory"); + } + return __dest; +# else + // Unsupported architectures will have a linker error with a semi-decent error message + __cuda_ptx_multimem_ld_reduce_is_not_supported_before_SM_90__(); + return 0; +# endif +} +#endif // __cccl_ptx_isa >= 810 + +/* +// multimem.ld_reduce.sem.global.op.s32 dest, [addr]; // PTX ISA 81, SM_90 +// .sem = { .weak } +// .op = { .min } +template +__device__ static inline int32_t multimem_ld_reduce( + cuda::ptx::sem_weak_t, + cuda::ptx::op_min_t, + const int32_t* addr); +*/ +#if __cccl_ptx_isa >= 810 +extern "C" _CCCL_DEVICE void __cuda_ptx_multimem_ld_reduce_is_not_supported_before_SM_90__(); +template +_CCCL_DEVICE static inline _CUDA_VSTD::int32_t +multimem_ld_reduce(sem_weak_t, op_min_t, const _CUDA_VSTD::int32_t* __addr) +{ +// __sem == sem_weak (due to parameter type constraint) +// __op == op_min (due to parameter type constraint) +# if _CCCL_CUDA_COMPILER(NVHPC) || __CUDA_ARCH__ >= 900 + _CUDA_VSTD::int32_t __dest; + asm("multimem.ld_reduce.weak.global.min.s32 %0, [%1];" : "=r"(__dest) : "l"(__as_ptr_gmem(__addr)) : "memory"); + return __dest; +# else + // Unsupported architectures will have a linker error with a semi-decent error message + __cuda_ptx_multimem_ld_reduce_is_not_supported_before_SM_90__(); + return 0; +# endif +} +#endif // __cccl_ptx_isa >= 810 + +/* +// multimem.ld_reduce.sem.scope.global.op.s32 dest, [addr]; // PTX ISA 81, SM_90 +// .sem = { .relaxed, .acquire } +// .scope = { .cta, .cluster, .gpu, .sys } +// .op = { .min } +template +__device__ static inline int32_t multimem_ld_reduce( + cuda::ptx::sem_t sem, + 
cuda::ptx::scope_t scope, + cuda::ptx::op_min_t, + const int32_t* addr); +*/ +#if __cccl_ptx_isa >= 810 +extern "C" _CCCL_DEVICE void __cuda_ptx_multimem_ld_reduce_is_not_supported_before_SM_90__(); +template +_CCCL_DEVICE static inline _CUDA_VSTD::int32_t +multimem_ld_reduce(sem_t<_Sem> __sem, scope_t<_Scope> __scope, op_min_t, const _CUDA_VSTD::int32_t* __addr) +{ + static_assert(__sem == sem_relaxed || __sem == sem_acquire, ""); + static_assert(__scope == scope_cta || __scope == scope_cluster || __scope == scope_gpu || __scope == scope_sys, ""); +// __op == op_min (due to parameter type constraint) +# if _CCCL_CUDA_COMPILER(NVHPC) || __CUDA_ARCH__ >= 900 + _CUDA_VSTD::int32_t __dest; + _CCCL_IF_CONSTEXPR (__sem == sem_relaxed && __scope == scope_cta) + { + asm("multimem.ld_reduce.relaxed.cta.global.min.s32 %0, [%1];" + : "=r"(__dest) + : "l"(__as_ptr_gmem(__addr)) + : "memory"); + } + else _CCCL_IF_CONSTEXPR (__sem == sem_relaxed && __scope == scope_cluster) + { + asm("multimem.ld_reduce.relaxed.cluster.global.min.s32 %0, [%1];" + : "=r"(__dest) + : "l"(__as_ptr_gmem(__addr)) + : "memory"); + } + else _CCCL_IF_CONSTEXPR (__sem == sem_relaxed && __scope == scope_gpu) + { + asm("multimem.ld_reduce.relaxed.gpu.global.min.s32 %0, [%1];" + : "=r"(__dest) + : "l"(__as_ptr_gmem(__addr)) + : "memory"); + } + else _CCCL_IF_CONSTEXPR (__sem == sem_relaxed && __scope == scope_sys) + { + asm("multimem.ld_reduce.relaxed.sys.global.min.s32 %0, [%1];" + : "=r"(__dest) + : "l"(__as_ptr_gmem(__addr)) + : "memory"); + } + else _CCCL_IF_CONSTEXPR (__sem == sem_acquire && __scope == scope_cta) + { + asm("multimem.ld_reduce.acquire.cta.global.min.s32 %0, [%1];" + : "=r"(__dest) + : "l"(__as_ptr_gmem(__addr)) + : "memory"); + } + else _CCCL_IF_CONSTEXPR (__sem == sem_acquire && __scope == scope_cluster) + { + asm("multimem.ld_reduce.acquire.cluster.global.min.s32 %0, [%1];" + : "=r"(__dest) + : "l"(__as_ptr_gmem(__addr)) + : "memory"); + } + else _CCCL_IF_CONSTEXPR (__sem == sem_acquire && __scope == scope_gpu) + { + asm("multimem.ld_reduce.acquire.gpu.global.min.s32 %0, [%1];" + : "=r"(__dest) + : "l"(__as_ptr_gmem(__addr)) + : "memory"); + } + else _CCCL_IF_CONSTEXPR (__sem == sem_acquire && __scope == scope_sys) + { + asm("multimem.ld_reduce.acquire.sys.global.min.s32 %0, [%1];" + : "=r"(__dest) + : "l"(__as_ptr_gmem(__addr)) + : "memory"); + } + return __dest; +# else + // Unsupported architectures will have a linker error with a semi-decent error message + __cuda_ptx_multimem_ld_reduce_is_not_supported_before_SM_90__(); + return 0; +# endif +} +#endif // __cccl_ptx_isa >= 810 + +/* +// multimem.ld_reduce.sem.global.op.s64 dest, [addr]; // PTX ISA 81, SM_90 +// .sem = { .weak } +// .op = { .min } +template +__device__ static inline int64_t multimem_ld_reduce( + cuda::ptx::sem_weak_t, + cuda::ptx::op_min_t, + const int64_t* addr); +*/ +#if __cccl_ptx_isa >= 810 +extern "C" _CCCL_DEVICE void __cuda_ptx_multimem_ld_reduce_is_not_supported_before_SM_90__(); +template +_CCCL_DEVICE static inline _CUDA_VSTD::int64_t +multimem_ld_reduce(sem_weak_t, op_min_t, const _CUDA_VSTD::int64_t* __addr) +{ +// __sem == sem_weak (due to parameter type constraint) +// __op == op_min (due to parameter type constraint) +# if _CCCL_CUDA_COMPILER(NVHPC) || __CUDA_ARCH__ >= 900 + _CUDA_VSTD::int64_t __dest; + asm("multimem.ld_reduce.weak.global.min.s64 %0, [%1];" : "=l"(__dest) : "l"(__as_ptr_gmem(__addr)) : "memory"); + return __dest; +# else + // Unsupported architectures will have a linker error with a semi-decent error 
message + __cuda_ptx_multimem_ld_reduce_is_not_supported_before_SM_90__(); + return 0; +# endif +} +#endif // __cccl_ptx_isa >= 810 + +/* +// multimem.ld_reduce.sem.scope.global.op.s64 dest, [addr]; // PTX ISA 81, SM_90 +// .sem = { .relaxed, .acquire } +// .scope = { .cta, .cluster, .gpu, .sys } +// .op = { .min } +template +__device__ static inline int64_t multimem_ld_reduce( + cuda::ptx::sem_t sem, + cuda::ptx::scope_t scope, + cuda::ptx::op_min_t, + const int64_t* addr); +*/ +#if __cccl_ptx_isa >= 810 +extern "C" _CCCL_DEVICE void __cuda_ptx_multimem_ld_reduce_is_not_supported_before_SM_90__(); +template +_CCCL_DEVICE static inline _CUDA_VSTD::int64_t +multimem_ld_reduce(sem_t<_Sem> __sem, scope_t<_Scope> __scope, op_min_t, const _CUDA_VSTD::int64_t* __addr) +{ + static_assert(__sem == sem_relaxed || __sem == sem_acquire, ""); + static_assert(__scope == scope_cta || __scope == scope_cluster || __scope == scope_gpu || __scope == scope_sys, ""); +// __op == op_min (due to parameter type constraint) +# if _CCCL_CUDA_COMPILER(NVHPC) || __CUDA_ARCH__ >= 900 + _CUDA_VSTD::int64_t __dest; + _CCCL_IF_CONSTEXPR (__sem == sem_relaxed && __scope == scope_cta) + { + asm("multimem.ld_reduce.relaxed.cta.global.min.s64 %0, [%1];" + : "=l"(__dest) + : "l"(__as_ptr_gmem(__addr)) + : "memory"); + } + else _CCCL_IF_CONSTEXPR (__sem == sem_relaxed && __scope == scope_cluster) + { + asm("multimem.ld_reduce.relaxed.cluster.global.min.s64 %0, [%1];" + : "=l"(__dest) + : "l"(__as_ptr_gmem(__addr)) + : "memory"); + } + else _CCCL_IF_CONSTEXPR (__sem == sem_relaxed && __scope == scope_gpu) + { + asm("multimem.ld_reduce.relaxed.gpu.global.min.s64 %0, [%1];" + : "=l"(__dest) + : "l"(__as_ptr_gmem(__addr)) + : "memory"); + } + else _CCCL_IF_CONSTEXPR (__sem == sem_relaxed && __scope == scope_sys) + { + asm("multimem.ld_reduce.relaxed.sys.global.min.s64 %0, [%1];" + : "=l"(__dest) + : "l"(__as_ptr_gmem(__addr)) + : "memory"); + } + else _CCCL_IF_CONSTEXPR (__sem == sem_acquire && __scope == scope_cta) + { + asm("multimem.ld_reduce.acquire.cta.global.min.s64 %0, [%1];" + : "=l"(__dest) + : "l"(__as_ptr_gmem(__addr)) + : "memory"); + } + else _CCCL_IF_CONSTEXPR (__sem == sem_acquire && __scope == scope_cluster) + { + asm("multimem.ld_reduce.acquire.cluster.global.min.s64 %0, [%1];" + : "=l"(__dest) + : "l"(__as_ptr_gmem(__addr)) + : "memory"); + } + else _CCCL_IF_CONSTEXPR (__sem == sem_acquire && __scope == scope_gpu) + { + asm("multimem.ld_reduce.acquire.gpu.global.min.s64 %0, [%1];" + : "=l"(__dest) + : "l"(__as_ptr_gmem(__addr)) + : "memory"); + } + else _CCCL_IF_CONSTEXPR (__sem == sem_acquire && __scope == scope_sys) + { + asm("multimem.ld_reduce.acquire.sys.global.min.s64 %0, [%1];" + : "=l"(__dest) + : "l"(__as_ptr_gmem(__addr)) + : "memory"); + } + return __dest; +# else + // Unsupported architectures will have a linker error with a semi-decent error message + __cuda_ptx_multimem_ld_reduce_is_not_supported_before_SM_90__(); + return 0; +# endif +} +#endif // __cccl_ptx_isa >= 810 + +/* +// multimem.ld_reduce.sem.global.op.u32 dest, [addr]; // PTX ISA 81, SM_90 +// .sem = { .weak } +// .op = { .max } +template +__device__ static inline uint32_t multimem_ld_reduce( + cuda::ptx::sem_weak_t, + cuda::ptx::op_max_t, + const uint32_t* addr); +*/ +#if __cccl_ptx_isa >= 810 +extern "C" _CCCL_DEVICE void __cuda_ptx_multimem_ld_reduce_is_not_supported_before_SM_90__(); +template +_CCCL_DEVICE static inline _CUDA_VSTD::uint32_t +multimem_ld_reduce(sem_weak_t, op_max_t, const _CUDA_VSTD::uint32_t* __addr) +{ +// __sem == 
sem_weak (due to parameter type constraint) +// __op == op_max (due to parameter type constraint) +# if _CCCL_CUDA_COMPILER(NVHPC) || __CUDA_ARCH__ >= 900 + _CUDA_VSTD::uint32_t __dest; + asm("multimem.ld_reduce.weak.global.max.u32 %0, [%1];" : "=r"(__dest) : "l"(__as_ptr_gmem(__addr)) : "memory"); + return __dest; +# else + // Unsupported architectures will have a linker error with a semi-decent error message + __cuda_ptx_multimem_ld_reduce_is_not_supported_before_SM_90__(); + return 0; +# endif +} +#endif // __cccl_ptx_isa >= 810 + +/* +// multimem.ld_reduce.sem.scope.global.op.u32 dest, [addr]; // PTX ISA 81, SM_90 +// .sem = { .relaxed, .acquire } +// .scope = { .cta, .cluster, .gpu, .sys } +// .op = { .max } +template +__device__ static inline uint32_t multimem_ld_reduce( + cuda::ptx::sem_t sem, + cuda::ptx::scope_t scope, + cuda::ptx::op_max_t, + const uint32_t* addr); +*/ +#if __cccl_ptx_isa >= 810 +extern "C" _CCCL_DEVICE void __cuda_ptx_multimem_ld_reduce_is_not_supported_before_SM_90__(); +template +_CCCL_DEVICE static inline _CUDA_VSTD::uint32_t +multimem_ld_reduce(sem_t<_Sem> __sem, scope_t<_Scope> __scope, op_max_t, const _CUDA_VSTD::uint32_t* __addr) +{ + static_assert(__sem == sem_relaxed || __sem == sem_acquire, ""); + static_assert(__scope == scope_cta || __scope == scope_cluster || __scope == scope_gpu || __scope == scope_sys, ""); +// __op == op_max (due to parameter type constraint) +# if _CCCL_CUDA_COMPILER(NVHPC) || __CUDA_ARCH__ >= 900 + _CUDA_VSTD::uint32_t __dest; + _CCCL_IF_CONSTEXPR (__sem == sem_relaxed && __scope == scope_cta) + { + asm("multimem.ld_reduce.relaxed.cta.global.max.u32 %0, [%1];" + : "=r"(__dest) + : "l"(__as_ptr_gmem(__addr)) + : "memory"); + } + else _CCCL_IF_CONSTEXPR (__sem == sem_relaxed && __scope == scope_cluster) + { + asm("multimem.ld_reduce.relaxed.cluster.global.max.u32 %0, [%1];" + : "=r"(__dest) + : "l"(__as_ptr_gmem(__addr)) + : "memory"); + } + else _CCCL_IF_CONSTEXPR (__sem == sem_relaxed && __scope == scope_gpu) + { + asm("multimem.ld_reduce.relaxed.gpu.global.max.u32 %0, [%1];" + : "=r"(__dest) + : "l"(__as_ptr_gmem(__addr)) + : "memory"); + } + else _CCCL_IF_CONSTEXPR (__sem == sem_relaxed && __scope == scope_sys) + { + asm("multimem.ld_reduce.relaxed.sys.global.max.u32 %0, [%1];" + : "=r"(__dest) + : "l"(__as_ptr_gmem(__addr)) + : "memory"); + } + else _CCCL_IF_CONSTEXPR (__sem == sem_acquire && __scope == scope_cta) + { + asm("multimem.ld_reduce.acquire.cta.global.max.u32 %0, [%1];" + : "=r"(__dest) + : "l"(__as_ptr_gmem(__addr)) + : "memory"); + } + else _CCCL_IF_CONSTEXPR (__sem == sem_acquire && __scope == scope_cluster) + { + asm("multimem.ld_reduce.acquire.cluster.global.max.u32 %0, [%1];" + : "=r"(__dest) + : "l"(__as_ptr_gmem(__addr)) + : "memory"); + } + else _CCCL_IF_CONSTEXPR (__sem == sem_acquire && __scope == scope_gpu) + { + asm("multimem.ld_reduce.acquire.gpu.global.max.u32 %0, [%1];" + : "=r"(__dest) + : "l"(__as_ptr_gmem(__addr)) + : "memory"); + } + else _CCCL_IF_CONSTEXPR (__sem == sem_acquire && __scope == scope_sys) + { + asm("multimem.ld_reduce.acquire.sys.global.max.u32 %0, [%1];" + : "=r"(__dest) + : "l"(__as_ptr_gmem(__addr)) + : "memory"); + } + return __dest; +# else + // Unsupported architectures will have a linker error with a semi-decent error message + __cuda_ptx_multimem_ld_reduce_is_not_supported_before_SM_90__(); + return 0; +# endif +} +#endif // __cccl_ptx_isa >= 810 + +/* +// multimem.ld_reduce.sem.global.op.u64 dest, [addr]; // PTX ISA 81, SM_90 +// .sem = { .weak } +// .op = { .max } 
+template +__device__ static inline uint64_t multimem_ld_reduce( + cuda::ptx::sem_weak_t, + cuda::ptx::op_max_t, + const uint64_t* addr); +*/ +#if __cccl_ptx_isa >= 810 +extern "C" _CCCL_DEVICE void __cuda_ptx_multimem_ld_reduce_is_not_supported_before_SM_90__(); +template +_CCCL_DEVICE static inline _CUDA_VSTD::uint64_t +multimem_ld_reduce(sem_weak_t, op_max_t, const _CUDA_VSTD::uint64_t* __addr) +{ +// __sem == sem_weak (due to parameter type constraint) +// __op == op_max (due to parameter type constraint) +# if _CCCL_CUDA_COMPILER(NVHPC) || __CUDA_ARCH__ >= 900 + _CUDA_VSTD::uint64_t __dest; + asm("multimem.ld_reduce.weak.global.max.u64 %0, [%1];" : "=l"(__dest) : "l"(__as_ptr_gmem(__addr)) : "memory"); + return __dest; +# else + // Unsupported architectures will have a linker error with a semi-decent error message + __cuda_ptx_multimem_ld_reduce_is_not_supported_before_SM_90__(); + return 0; +# endif +} +#endif // __cccl_ptx_isa >= 810 + +/* +// multimem.ld_reduce.sem.scope.global.op.u64 dest, [addr]; // PTX ISA 81, SM_90 +// .sem = { .relaxed, .acquire } +// .scope = { .cta, .cluster, .gpu, .sys } +// .op = { .max } +template +__device__ static inline uint64_t multimem_ld_reduce( + cuda::ptx::sem_t sem, + cuda::ptx::scope_t scope, + cuda::ptx::op_max_t, + const uint64_t* addr); +*/ +#if __cccl_ptx_isa >= 810 +extern "C" _CCCL_DEVICE void __cuda_ptx_multimem_ld_reduce_is_not_supported_before_SM_90__(); +template +_CCCL_DEVICE static inline _CUDA_VSTD::uint64_t +multimem_ld_reduce(sem_t<_Sem> __sem, scope_t<_Scope> __scope, op_max_t, const _CUDA_VSTD::uint64_t* __addr) +{ + static_assert(__sem == sem_relaxed || __sem == sem_acquire, ""); + static_assert(__scope == scope_cta || __scope == scope_cluster || __scope == scope_gpu || __scope == scope_sys, ""); +// __op == op_max (due to parameter type constraint) +# if _CCCL_CUDA_COMPILER(NVHPC) || __CUDA_ARCH__ >= 900 + _CUDA_VSTD::uint64_t __dest; + _CCCL_IF_CONSTEXPR (__sem == sem_relaxed && __scope == scope_cta) + { + asm("multimem.ld_reduce.relaxed.cta.global.max.u64 %0, [%1];" + : "=l"(__dest) + : "l"(__as_ptr_gmem(__addr)) + : "memory"); + } + else _CCCL_IF_CONSTEXPR (__sem == sem_relaxed && __scope == scope_cluster) + { + asm("multimem.ld_reduce.relaxed.cluster.global.max.u64 %0, [%1];" + : "=l"(__dest) + : "l"(__as_ptr_gmem(__addr)) + : "memory"); + } + else _CCCL_IF_CONSTEXPR (__sem == sem_relaxed && __scope == scope_gpu) + { + asm("multimem.ld_reduce.relaxed.gpu.global.max.u64 %0, [%1];" + : "=l"(__dest) + : "l"(__as_ptr_gmem(__addr)) + : "memory"); + } + else _CCCL_IF_CONSTEXPR (__sem == sem_relaxed && __scope == scope_sys) + { + asm("multimem.ld_reduce.relaxed.sys.global.max.u64 %0, [%1];" + : "=l"(__dest) + : "l"(__as_ptr_gmem(__addr)) + : "memory"); + } + else _CCCL_IF_CONSTEXPR (__sem == sem_acquire && __scope == scope_cta) + { + asm("multimem.ld_reduce.acquire.cta.global.max.u64 %0, [%1];" + : "=l"(__dest) + : "l"(__as_ptr_gmem(__addr)) + : "memory"); + } + else _CCCL_IF_CONSTEXPR (__sem == sem_acquire && __scope == scope_cluster) + { + asm("multimem.ld_reduce.acquire.cluster.global.max.u64 %0, [%1];" + : "=l"(__dest) + : "l"(__as_ptr_gmem(__addr)) + : "memory"); + } + else _CCCL_IF_CONSTEXPR (__sem == sem_acquire && __scope == scope_gpu) + { + asm("multimem.ld_reduce.acquire.gpu.global.max.u64 %0, [%1];" + : "=l"(__dest) + : "l"(__as_ptr_gmem(__addr)) + : "memory"); + } + else _CCCL_IF_CONSTEXPR (__sem == sem_acquire && __scope == scope_sys) + { + asm("multimem.ld_reduce.acquire.sys.global.max.u64 %0, [%1];" + : 
"=l"(__dest) + : "l"(__as_ptr_gmem(__addr)) + : "memory"); + } + return __dest; +# else + // Unsupported architectures will have a linker error with a semi-decent error message + __cuda_ptx_multimem_ld_reduce_is_not_supported_before_SM_90__(); + return 0; +# endif +} +#endif // __cccl_ptx_isa >= 810 + +/* +// multimem.ld_reduce.sem.global.op.s32 dest, [addr]; // PTX ISA 81, SM_90 +// .sem = { .weak } +// .op = { .max } +template +__device__ static inline int32_t multimem_ld_reduce( + cuda::ptx::sem_weak_t, + cuda::ptx::op_max_t, + const int32_t* addr); +*/ +#if __cccl_ptx_isa >= 810 +extern "C" _CCCL_DEVICE void __cuda_ptx_multimem_ld_reduce_is_not_supported_before_SM_90__(); +template +_CCCL_DEVICE static inline _CUDA_VSTD::int32_t +multimem_ld_reduce(sem_weak_t, op_max_t, const _CUDA_VSTD::int32_t* __addr) +{ +// __sem == sem_weak (due to parameter type constraint) +// __op == op_max (due to parameter type constraint) +# if _CCCL_CUDA_COMPILER(NVHPC) || __CUDA_ARCH__ >= 900 + _CUDA_VSTD::int32_t __dest; + asm("multimem.ld_reduce.weak.global.max.s32 %0, [%1];" : "=r"(__dest) : "l"(__as_ptr_gmem(__addr)) : "memory"); + return __dest; +# else + // Unsupported architectures will have a linker error with a semi-decent error message + __cuda_ptx_multimem_ld_reduce_is_not_supported_before_SM_90__(); + return 0; +# endif +} +#endif // __cccl_ptx_isa >= 810 + +/* +// multimem.ld_reduce.sem.scope.global.op.s32 dest, [addr]; // PTX ISA 81, SM_90 +// .sem = { .relaxed, .acquire } +// .scope = { .cta, .cluster, .gpu, .sys } +// .op = { .max } +template +__device__ static inline int32_t multimem_ld_reduce( + cuda::ptx::sem_t sem, + cuda::ptx::scope_t scope, + cuda::ptx::op_max_t, + const int32_t* addr); +*/ +#if __cccl_ptx_isa >= 810 +extern "C" _CCCL_DEVICE void __cuda_ptx_multimem_ld_reduce_is_not_supported_before_SM_90__(); +template +_CCCL_DEVICE static inline _CUDA_VSTD::int32_t +multimem_ld_reduce(sem_t<_Sem> __sem, scope_t<_Scope> __scope, op_max_t, const _CUDA_VSTD::int32_t* __addr) +{ + static_assert(__sem == sem_relaxed || __sem == sem_acquire, ""); + static_assert(__scope == scope_cta || __scope == scope_cluster || __scope == scope_gpu || __scope == scope_sys, ""); +// __op == op_max (due to parameter type constraint) +# if _CCCL_CUDA_COMPILER(NVHPC) || __CUDA_ARCH__ >= 900 + _CUDA_VSTD::int32_t __dest; + _CCCL_IF_CONSTEXPR (__sem == sem_relaxed && __scope == scope_cta) + { + asm("multimem.ld_reduce.relaxed.cta.global.max.s32 %0, [%1];" + : "=r"(__dest) + : "l"(__as_ptr_gmem(__addr)) + : "memory"); + } + else _CCCL_IF_CONSTEXPR (__sem == sem_relaxed && __scope == scope_cluster) + { + asm("multimem.ld_reduce.relaxed.cluster.global.max.s32 %0, [%1];" + : "=r"(__dest) + : "l"(__as_ptr_gmem(__addr)) + : "memory"); + } + else _CCCL_IF_CONSTEXPR (__sem == sem_relaxed && __scope == scope_gpu) + { + asm("multimem.ld_reduce.relaxed.gpu.global.max.s32 %0, [%1];" + : "=r"(__dest) + : "l"(__as_ptr_gmem(__addr)) + : "memory"); + } + else _CCCL_IF_CONSTEXPR (__sem == sem_relaxed && __scope == scope_sys) + { + asm("multimem.ld_reduce.relaxed.sys.global.max.s32 %0, [%1];" + : "=r"(__dest) + : "l"(__as_ptr_gmem(__addr)) + : "memory"); + } + else _CCCL_IF_CONSTEXPR (__sem == sem_acquire && __scope == scope_cta) + { + asm("multimem.ld_reduce.acquire.cta.global.max.s32 %0, [%1];" + : "=r"(__dest) + : "l"(__as_ptr_gmem(__addr)) + : "memory"); + } + else _CCCL_IF_CONSTEXPR (__sem == sem_acquire && __scope == scope_cluster) + { + asm("multimem.ld_reduce.acquire.cluster.global.max.s32 %0, [%1];" + : "=r"(__dest) + 
: "l"(__as_ptr_gmem(__addr)) + : "memory"); + } + else _CCCL_IF_CONSTEXPR (__sem == sem_acquire && __scope == scope_gpu) + { + asm("multimem.ld_reduce.acquire.gpu.global.max.s32 %0, [%1];" + : "=r"(__dest) + : "l"(__as_ptr_gmem(__addr)) + : "memory"); + } + else _CCCL_IF_CONSTEXPR (__sem == sem_acquire && __scope == scope_sys) + { + asm("multimem.ld_reduce.acquire.sys.global.max.s32 %0, [%1];" + : "=r"(__dest) + : "l"(__as_ptr_gmem(__addr)) + : "memory"); + } + return __dest; +# else + // Unsupported architectures will have a linker error with a semi-decent error message + __cuda_ptx_multimem_ld_reduce_is_not_supported_before_SM_90__(); + return 0; +# endif +} +#endif // __cccl_ptx_isa >= 810 + +/* +// multimem.ld_reduce.sem.global.op.s64 dest, [addr]; // PTX ISA 81, SM_90 +// .sem = { .weak } +// .op = { .max } +template +__device__ static inline int64_t multimem_ld_reduce( + cuda::ptx::sem_weak_t, + cuda::ptx::op_max_t, + const int64_t* addr); +*/ +#if __cccl_ptx_isa >= 810 +extern "C" _CCCL_DEVICE void __cuda_ptx_multimem_ld_reduce_is_not_supported_before_SM_90__(); +template +_CCCL_DEVICE static inline _CUDA_VSTD::int64_t +multimem_ld_reduce(sem_weak_t, op_max_t, const _CUDA_VSTD::int64_t* __addr) +{ +// __sem == sem_weak (due to parameter type constraint) +// __op == op_max (due to parameter type constraint) +# if _CCCL_CUDA_COMPILER(NVHPC) || __CUDA_ARCH__ >= 900 + _CUDA_VSTD::int64_t __dest; + asm("multimem.ld_reduce.weak.global.max.s64 %0, [%1];" : "=l"(__dest) : "l"(__as_ptr_gmem(__addr)) : "memory"); + return __dest; +# else + // Unsupported architectures will have a linker error with a semi-decent error message + __cuda_ptx_multimem_ld_reduce_is_not_supported_before_SM_90__(); + return 0; +# endif +} +#endif // __cccl_ptx_isa >= 810 + +/* +// multimem.ld_reduce.sem.scope.global.op.s64 dest, [addr]; // PTX ISA 81, SM_90 +// .sem = { .relaxed, .acquire } +// .scope = { .cta, .cluster, .gpu, .sys } +// .op = { .max } +template +__device__ static inline int64_t multimem_ld_reduce( + cuda::ptx::sem_t sem, + cuda::ptx::scope_t scope, + cuda::ptx::op_max_t, + const int64_t* addr); +*/ +#if __cccl_ptx_isa >= 810 +extern "C" _CCCL_DEVICE void __cuda_ptx_multimem_ld_reduce_is_not_supported_before_SM_90__(); +template +_CCCL_DEVICE static inline _CUDA_VSTD::int64_t +multimem_ld_reduce(sem_t<_Sem> __sem, scope_t<_Scope> __scope, op_max_t, const _CUDA_VSTD::int64_t* __addr) +{ + static_assert(__sem == sem_relaxed || __sem == sem_acquire, ""); + static_assert(__scope == scope_cta || __scope == scope_cluster || __scope == scope_gpu || __scope == scope_sys, ""); +// __op == op_max (due to parameter type constraint) +# if _CCCL_CUDA_COMPILER(NVHPC) || __CUDA_ARCH__ >= 900 + _CUDA_VSTD::int64_t __dest; + _CCCL_IF_CONSTEXPR (__sem == sem_relaxed && __scope == scope_cta) + { + asm("multimem.ld_reduce.relaxed.cta.global.max.s64 %0, [%1];" + : "=l"(__dest) + : "l"(__as_ptr_gmem(__addr)) + : "memory"); + } + else _CCCL_IF_CONSTEXPR (__sem == sem_relaxed && __scope == scope_cluster) + { + asm("multimem.ld_reduce.relaxed.cluster.global.max.s64 %0, [%1];" + : "=l"(__dest) + : "l"(__as_ptr_gmem(__addr)) + : "memory"); + } + else _CCCL_IF_CONSTEXPR (__sem == sem_relaxed && __scope == scope_gpu) + { + asm("multimem.ld_reduce.relaxed.gpu.global.max.s64 %0, [%1];" + : "=l"(__dest) + : "l"(__as_ptr_gmem(__addr)) + : "memory"); + } + else _CCCL_IF_CONSTEXPR (__sem == sem_relaxed && __scope == scope_sys) + { + asm("multimem.ld_reduce.relaxed.sys.global.max.s64 %0, [%1];" + : "=l"(__dest) + : 
"l"(__as_ptr_gmem(__addr)) + : "memory"); + } + else _CCCL_IF_CONSTEXPR (__sem == sem_acquire && __scope == scope_cta) + { + asm("multimem.ld_reduce.acquire.cta.global.max.s64 %0, [%1];" + : "=l"(__dest) + : "l"(__as_ptr_gmem(__addr)) + : "memory"); + } + else _CCCL_IF_CONSTEXPR (__sem == sem_acquire && __scope == scope_cluster) + { + asm("multimem.ld_reduce.acquire.cluster.global.max.s64 %0, [%1];" + : "=l"(__dest) + : "l"(__as_ptr_gmem(__addr)) + : "memory"); + } + else _CCCL_IF_CONSTEXPR (__sem == sem_acquire && __scope == scope_gpu) + { + asm("multimem.ld_reduce.acquire.gpu.global.max.s64 %0, [%1];" + : "=l"(__dest) + : "l"(__as_ptr_gmem(__addr)) + : "memory"); + } + else _CCCL_IF_CONSTEXPR (__sem == sem_acquire && __scope == scope_sys) + { + asm("multimem.ld_reduce.acquire.sys.global.max.s64 %0, [%1];" + : "=l"(__dest) + : "l"(__as_ptr_gmem(__addr)) + : "memory"); + } + return __dest; +# else + // Unsupported architectures will have a linker error with a semi-decent error message + __cuda_ptx_multimem_ld_reduce_is_not_supported_before_SM_90__(); + return 0; +# endif +} +#endif // __cccl_ptx_isa >= 810 + +/* +// multimem.ld_reduce.sem.global.op.u32 dest, [addr]; // PTX ISA 81, SM_90 +// .sem = { .weak } +// .op = { .add } +template +__device__ static inline uint32_t multimem_ld_reduce( + cuda::ptx::sem_weak_t, + cuda::ptx::op_add_t, + const uint32_t* addr); +*/ +#if __cccl_ptx_isa >= 810 +extern "C" _CCCL_DEVICE void __cuda_ptx_multimem_ld_reduce_is_not_supported_before_SM_90__(); +template +_CCCL_DEVICE static inline _CUDA_VSTD::uint32_t +multimem_ld_reduce(sem_weak_t, op_add_t, const _CUDA_VSTD::uint32_t* __addr) +{ +// __sem == sem_weak (due to parameter type constraint) +// __op == op_add (due to parameter type constraint) +# if _CCCL_CUDA_COMPILER(NVHPC) || __CUDA_ARCH__ >= 900 + _CUDA_VSTD::uint32_t __dest; + asm("multimem.ld_reduce.weak.global.add.u32 %0, [%1];" : "=r"(__dest) : "l"(__as_ptr_gmem(__addr)) : "memory"); + return __dest; +# else + // Unsupported architectures will have a linker error with a semi-decent error message + __cuda_ptx_multimem_ld_reduce_is_not_supported_before_SM_90__(); + return 0; +# endif +} +#endif // __cccl_ptx_isa >= 810 + +/* +// multimem.ld_reduce.sem.scope.global.op.u32 dest, [addr]; // PTX ISA 81, SM_90 +// .sem = { .relaxed, .acquire } +// .scope = { .cta, .cluster, .gpu, .sys } +// .op = { .add } +template +__device__ static inline uint32_t multimem_ld_reduce( + cuda::ptx::sem_t sem, + cuda::ptx::scope_t scope, + cuda::ptx::op_add_t, + const uint32_t* addr); +*/ +#if __cccl_ptx_isa >= 810 +extern "C" _CCCL_DEVICE void __cuda_ptx_multimem_ld_reduce_is_not_supported_before_SM_90__(); +template +_CCCL_DEVICE static inline _CUDA_VSTD::uint32_t +multimem_ld_reduce(sem_t<_Sem> __sem, scope_t<_Scope> __scope, op_add_t, const _CUDA_VSTD::uint32_t* __addr) +{ + static_assert(__sem == sem_relaxed || __sem == sem_acquire, ""); + static_assert(__scope == scope_cta || __scope == scope_cluster || __scope == scope_gpu || __scope == scope_sys, ""); +// __op == op_add (due to parameter type constraint) +# if _CCCL_CUDA_COMPILER(NVHPC) || __CUDA_ARCH__ >= 900 + _CUDA_VSTD::uint32_t __dest; + _CCCL_IF_CONSTEXPR (__sem == sem_relaxed && __scope == scope_cta) + { + asm("multimem.ld_reduce.relaxed.cta.global.add.u32 %0, [%1];" + : "=r"(__dest) + : "l"(__as_ptr_gmem(__addr)) + : "memory"); + } + else _CCCL_IF_CONSTEXPR (__sem == sem_relaxed && __scope == scope_cluster) + { + asm("multimem.ld_reduce.relaxed.cluster.global.add.u32 %0, [%1];" + : "=r"(__dest) + : 
"l"(__as_ptr_gmem(__addr)) + : "memory"); + } + else _CCCL_IF_CONSTEXPR (__sem == sem_relaxed && __scope == scope_gpu) + { + asm("multimem.ld_reduce.relaxed.gpu.global.add.u32 %0, [%1];" + : "=r"(__dest) + : "l"(__as_ptr_gmem(__addr)) + : "memory"); + } + else _CCCL_IF_CONSTEXPR (__sem == sem_relaxed && __scope == scope_sys) + { + asm("multimem.ld_reduce.relaxed.sys.global.add.u32 %0, [%1];" + : "=r"(__dest) + : "l"(__as_ptr_gmem(__addr)) + : "memory"); + } + else _CCCL_IF_CONSTEXPR (__sem == sem_acquire && __scope == scope_cta) + { + asm("multimem.ld_reduce.acquire.cta.global.add.u32 %0, [%1];" + : "=r"(__dest) + : "l"(__as_ptr_gmem(__addr)) + : "memory"); + } + else _CCCL_IF_CONSTEXPR (__sem == sem_acquire && __scope == scope_cluster) + { + asm("multimem.ld_reduce.acquire.cluster.global.add.u32 %0, [%1];" + : "=r"(__dest) + : "l"(__as_ptr_gmem(__addr)) + : "memory"); + } + else _CCCL_IF_CONSTEXPR (__sem == sem_acquire && __scope == scope_gpu) + { + asm("multimem.ld_reduce.acquire.gpu.global.add.u32 %0, [%1];" + : "=r"(__dest) + : "l"(__as_ptr_gmem(__addr)) + : "memory"); + } + else _CCCL_IF_CONSTEXPR (__sem == sem_acquire && __scope == scope_sys) + { + asm("multimem.ld_reduce.acquire.sys.global.add.u32 %0, [%1];" + : "=r"(__dest) + : "l"(__as_ptr_gmem(__addr)) + : "memory"); + } + return __dest; +# else + // Unsupported architectures will have a linker error with a semi-decent error message + __cuda_ptx_multimem_ld_reduce_is_not_supported_before_SM_90__(); + return 0; +# endif +} +#endif // __cccl_ptx_isa >= 810 + +/* +// multimem.ld_reduce.sem.global.op.u64 dest, [addr]; // PTX ISA 81, SM_90 +// .sem = { .weak } +// .op = { .add } +template +__device__ static inline uint64_t multimem_ld_reduce( + cuda::ptx::sem_weak_t, + cuda::ptx::op_add_t, + const uint64_t* addr); +*/ +#if __cccl_ptx_isa >= 810 +extern "C" _CCCL_DEVICE void __cuda_ptx_multimem_ld_reduce_is_not_supported_before_SM_90__(); +template +_CCCL_DEVICE static inline _CUDA_VSTD::uint64_t +multimem_ld_reduce(sem_weak_t, op_add_t, const _CUDA_VSTD::uint64_t* __addr) +{ +// __sem == sem_weak (due to parameter type constraint) +// __op == op_add (due to parameter type constraint) +# if _CCCL_CUDA_COMPILER(NVHPC) || __CUDA_ARCH__ >= 900 + _CUDA_VSTD::uint64_t __dest; + asm("multimem.ld_reduce.weak.global.add.u64 %0, [%1];" : "=l"(__dest) : "l"(__as_ptr_gmem(__addr)) : "memory"); + return __dest; +# else + // Unsupported architectures will have a linker error with a semi-decent error message + __cuda_ptx_multimem_ld_reduce_is_not_supported_before_SM_90__(); + return 0; +# endif +} +#endif // __cccl_ptx_isa >= 810 + +/* +// multimem.ld_reduce.sem.scope.global.op.u64 dest, [addr]; // PTX ISA 81, SM_90 +// .sem = { .relaxed, .acquire } +// .scope = { .cta, .cluster, .gpu, .sys } +// .op = { .add } +template +__device__ static inline uint64_t multimem_ld_reduce( + cuda::ptx::sem_t sem, + cuda::ptx::scope_t scope, + cuda::ptx::op_add_t, + const uint64_t* addr); +*/ +#if __cccl_ptx_isa >= 810 +extern "C" _CCCL_DEVICE void __cuda_ptx_multimem_ld_reduce_is_not_supported_before_SM_90__(); +template +_CCCL_DEVICE static inline _CUDA_VSTD::uint64_t +multimem_ld_reduce(sem_t<_Sem> __sem, scope_t<_Scope> __scope, op_add_t, const _CUDA_VSTD::uint64_t* __addr) +{ + static_assert(__sem == sem_relaxed || __sem == sem_acquire, ""); + static_assert(__scope == scope_cta || __scope == scope_cluster || __scope == scope_gpu || __scope == scope_sys, ""); +// __op == op_add (due to parameter type constraint) +# if _CCCL_CUDA_COMPILER(NVHPC) || __CUDA_ARCH__ 
>= 900 + _CUDA_VSTD::uint64_t __dest; + _CCCL_IF_CONSTEXPR (__sem == sem_relaxed && __scope == scope_cta) + { + asm("multimem.ld_reduce.relaxed.cta.global.add.u64 %0, [%1];" + : "=l"(__dest) + : "l"(__as_ptr_gmem(__addr)) + : "memory"); + } + else _CCCL_IF_CONSTEXPR (__sem == sem_relaxed && __scope == scope_cluster) + { + asm("multimem.ld_reduce.relaxed.cluster.global.add.u64 %0, [%1];" + : "=l"(__dest) + : "l"(__as_ptr_gmem(__addr)) + : "memory"); + } + else _CCCL_IF_CONSTEXPR (__sem == sem_relaxed && __scope == scope_gpu) + { + asm("multimem.ld_reduce.relaxed.gpu.global.add.u64 %0, [%1];" + : "=l"(__dest) + : "l"(__as_ptr_gmem(__addr)) + : "memory"); + } + else _CCCL_IF_CONSTEXPR (__sem == sem_relaxed && __scope == scope_sys) + { + asm("multimem.ld_reduce.relaxed.sys.global.add.u64 %0, [%1];" + : "=l"(__dest) + : "l"(__as_ptr_gmem(__addr)) + : "memory"); + } + else _CCCL_IF_CONSTEXPR (__sem == sem_acquire && __scope == scope_cta) + { + asm("multimem.ld_reduce.acquire.cta.global.add.u64 %0, [%1];" + : "=l"(__dest) + : "l"(__as_ptr_gmem(__addr)) + : "memory"); + } + else _CCCL_IF_CONSTEXPR (__sem == sem_acquire && __scope == scope_cluster) + { + asm("multimem.ld_reduce.acquire.cluster.global.add.u64 %0, [%1];" + : "=l"(__dest) + : "l"(__as_ptr_gmem(__addr)) + : "memory"); + } + else _CCCL_IF_CONSTEXPR (__sem == sem_acquire && __scope == scope_gpu) + { + asm("multimem.ld_reduce.acquire.gpu.global.add.u64 %0, [%1];" + : "=l"(__dest) + : "l"(__as_ptr_gmem(__addr)) + : "memory"); + } + else _CCCL_IF_CONSTEXPR (__sem == sem_acquire && __scope == scope_sys) + { + asm("multimem.ld_reduce.acquire.sys.global.add.u64 %0, [%1];" + : "=l"(__dest) + : "l"(__as_ptr_gmem(__addr)) + : "memory"); + } + return __dest; +# else + // Unsupported architectures will have a linker error with a semi-decent error message + __cuda_ptx_multimem_ld_reduce_is_not_supported_before_SM_90__(); + return 0; +# endif +} +#endif // __cccl_ptx_isa >= 810 + +/* +// multimem.ld_reduce.sem.global.op.s32 dest, [addr]; // PTX ISA 81, SM_90 +// .sem = { .weak } +// .op = { .add } +template +__device__ static inline int32_t multimem_ld_reduce( + cuda::ptx::sem_weak_t, + cuda::ptx::op_add_t, + const int32_t* addr); +*/ +#if __cccl_ptx_isa >= 810 +extern "C" _CCCL_DEVICE void __cuda_ptx_multimem_ld_reduce_is_not_supported_before_SM_90__(); +template +_CCCL_DEVICE static inline _CUDA_VSTD::int32_t +multimem_ld_reduce(sem_weak_t, op_add_t, const _CUDA_VSTD::int32_t* __addr) +{ +// __sem == sem_weak (due to parameter type constraint) +// __op == op_add (due to parameter type constraint) +# if _CCCL_CUDA_COMPILER(NVHPC) || __CUDA_ARCH__ >= 900 + _CUDA_VSTD::int32_t __dest; + asm("multimem.ld_reduce.weak.global.add.s32 %0, [%1];" : "=r"(__dest) : "l"(__as_ptr_gmem(__addr)) : "memory"); + return __dest; +# else + // Unsupported architectures will have a linker error with a semi-decent error message + __cuda_ptx_multimem_ld_reduce_is_not_supported_before_SM_90__(); + return 0; +# endif +} +#endif // __cccl_ptx_isa >= 810 + +/* +// multimem.ld_reduce.sem.scope.global.op.s32 dest, [addr]; // PTX ISA 81, SM_90 +// .sem = { .relaxed, .acquire } +// .scope = { .cta, .cluster, .gpu, .sys } +// .op = { .add } +template +__device__ static inline int32_t multimem_ld_reduce( + cuda::ptx::sem_t sem, + cuda::ptx::scope_t scope, + cuda::ptx::op_add_t, + const int32_t* addr); +*/ +#if __cccl_ptx_isa >= 810 +extern "C" _CCCL_DEVICE void __cuda_ptx_multimem_ld_reduce_is_not_supported_before_SM_90__(); +template +_CCCL_DEVICE static inline _CUDA_VSTD::int32_t 
+multimem_ld_reduce(sem_t<_Sem> __sem, scope_t<_Scope> __scope, op_add_t, const _CUDA_VSTD::int32_t* __addr) +{ + static_assert(__sem == sem_relaxed || __sem == sem_acquire, ""); + static_assert(__scope == scope_cta || __scope == scope_cluster || __scope == scope_gpu || __scope == scope_sys, ""); +// __op == op_add (due to parameter type constraint) +# if _CCCL_CUDA_COMPILER(NVHPC) || __CUDA_ARCH__ >= 900 + _CUDA_VSTD::int32_t __dest; + _CCCL_IF_CONSTEXPR (__sem == sem_relaxed && __scope == scope_cta) + { + asm("multimem.ld_reduce.relaxed.cta.global.add.s32 %0, [%1];" + : "=r"(__dest) + : "l"(__as_ptr_gmem(__addr)) + : "memory"); + } + else _CCCL_IF_CONSTEXPR (__sem == sem_relaxed && __scope == scope_cluster) + { + asm("multimem.ld_reduce.relaxed.cluster.global.add.s32 %0, [%1];" + : "=r"(__dest) + : "l"(__as_ptr_gmem(__addr)) + : "memory"); + } + else _CCCL_IF_CONSTEXPR (__sem == sem_relaxed && __scope == scope_gpu) + { + asm("multimem.ld_reduce.relaxed.gpu.global.add.s32 %0, [%1];" + : "=r"(__dest) + : "l"(__as_ptr_gmem(__addr)) + : "memory"); + } + else _CCCL_IF_CONSTEXPR (__sem == sem_relaxed && __scope == scope_sys) + { + asm("multimem.ld_reduce.relaxed.sys.global.add.s32 %0, [%1];" + : "=r"(__dest) + : "l"(__as_ptr_gmem(__addr)) + : "memory"); + } + else _CCCL_IF_CONSTEXPR (__sem == sem_acquire && __scope == scope_cta) + { + asm("multimem.ld_reduce.acquire.cta.global.add.s32 %0, [%1];" + : "=r"(__dest) + : "l"(__as_ptr_gmem(__addr)) + : "memory"); + } + else _CCCL_IF_CONSTEXPR (__sem == sem_acquire && __scope == scope_cluster) + { + asm("multimem.ld_reduce.acquire.cluster.global.add.s32 %0, [%1];" + : "=r"(__dest) + : "l"(__as_ptr_gmem(__addr)) + : "memory"); + } + else _CCCL_IF_CONSTEXPR (__sem == sem_acquire && __scope == scope_gpu) + { + asm("multimem.ld_reduce.acquire.gpu.global.add.s32 %0, [%1];" + : "=r"(__dest) + : "l"(__as_ptr_gmem(__addr)) + : "memory"); + } + else _CCCL_IF_CONSTEXPR (__sem == sem_acquire && __scope == scope_sys) + { + asm("multimem.ld_reduce.acquire.sys.global.add.s32 %0, [%1];" + : "=r"(__dest) + : "l"(__as_ptr_gmem(__addr)) + : "memory"); + } + return __dest; +# else + // Unsupported architectures will have a linker error with a semi-decent error message + __cuda_ptx_multimem_ld_reduce_is_not_supported_before_SM_90__(); + return 0; +# endif +} +#endif // __cccl_ptx_isa >= 810 + +/* +// multimem.ld_reduce.sem.global.op.u64 dest, [addr]; // PTX ISA 81, SM_90 +// .sem = { .weak } +// .op = { .add } +template +__device__ static inline int64_t multimem_ld_reduce( + cuda::ptx::sem_weak_t, + cuda::ptx::op_add_t, + const int64_t* addr); +*/ +#if __cccl_ptx_isa >= 810 +extern "C" _CCCL_DEVICE void __cuda_ptx_multimem_ld_reduce_is_not_supported_before_SM_90__(); +template +_CCCL_DEVICE static inline _CUDA_VSTD::int64_t +multimem_ld_reduce(sem_weak_t, op_add_t, const _CUDA_VSTD::int64_t* __addr) +{ +// __sem == sem_weak (due to parameter type constraint) +// __op == op_add (due to parameter type constraint) +# if _CCCL_CUDA_COMPILER(NVHPC) || __CUDA_ARCH__ >= 900 + _CUDA_VSTD::int64_t __dest; + asm("multimem.ld_reduce.weak.global.add.u64 %0, [%1];" : "=l"(__dest) : "l"(__as_ptr_gmem(__addr)) : "memory"); + return __dest; +# else + // Unsupported architectures will have a linker error with a semi-decent error message + __cuda_ptx_multimem_ld_reduce_is_not_supported_before_SM_90__(); + return 0; +# endif +} +#endif // __cccl_ptx_isa >= 810 + +/* +// multimem.ld_reduce.sem.scope.global.op.u64 dest, [addr]; // PTX ISA 81, SM_90 +// .sem = { .relaxed, .acquire } +// 
.scope = { .cta, .cluster, .gpu, .sys } +// .op = { .add } +template +__device__ static inline int64_t multimem_ld_reduce( + cuda::ptx::sem_t sem, + cuda::ptx::scope_t scope, + cuda::ptx::op_add_t, + const int64_t* addr); +*/ +#if __cccl_ptx_isa >= 810 +extern "C" _CCCL_DEVICE void __cuda_ptx_multimem_ld_reduce_is_not_supported_before_SM_90__(); +template +_CCCL_DEVICE static inline _CUDA_VSTD::int64_t +multimem_ld_reduce(sem_t<_Sem> __sem, scope_t<_Scope> __scope, op_add_t, const _CUDA_VSTD::int64_t* __addr) +{ + static_assert(__sem == sem_relaxed || __sem == sem_acquire, ""); + static_assert(__scope == scope_cta || __scope == scope_cluster || __scope == scope_gpu || __scope == scope_sys, ""); +// __op == op_add (due to parameter type constraint) +# if _CCCL_CUDA_COMPILER(NVHPC) || __CUDA_ARCH__ >= 900 + _CUDA_VSTD::int64_t __dest; + _CCCL_IF_CONSTEXPR (__sem == sem_relaxed && __scope == scope_cta) + { + asm("multimem.ld_reduce.relaxed.cta.global.add.u64 %0, [%1];" + : "=l"(__dest) + : "l"(__as_ptr_gmem(__addr)) + : "memory"); + } + else _CCCL_IF_CONSTEXPR (__sem == sem_relaxed && __scope == scope_cluster) + { + asm("multimem.ld_reduce.relaxed.cluster.global.add.u64 %0, [%1];" + : "=l"(__dest) + : "l"(__as_ptr_gmem(__addr)) + : "memory"); + } + else _CCCL_IF_CONSTEXPR (__sem == sem_relaxed && __scope == scope_gpu) + { + asm("multimem.ld_reduce.relaxed.gpu.global.add.u64 %0, [%1];" + : "=l"(__dest) + : "l"(__as_ptr_gmem(__addr)) + : "memory"); + } + else _CCCL_IF_CONSTEXPR (__sem == sem_relaxed && __scope == scope_sys) + { + asm("multimem.ld_reduce.relaxed.sys.global.add.u64 %0, [%1];" + : "=l"(__dest) + : "l"(__as_ptr_gmem(__addr)) + : "memory"); + } + else _CCCL_IF_CONSTEXPR (__sem == sem_acquire && __scope == scope_cta) + { + asm("multimem.ld_reduce.acquire.cta.global.add.u64 %0, [%1];" + : "=l"(__dest) + : "l"(__as_ptr_gmem(__addr)) + : "memory"); + } + else _CCCL_IF_CONSTEXPR (__sem == sem_acquire && __scope == scope_cluster) + { + asm("multimem.ld_reduce.acquire.cluster.global.add.u64 %0, [%1];" + : "=l"(__dest) + : "l"(__as_ptr_gmem(__addr)) + : "memory"); + } + else _CCCL_IF_CONSTEXPR (__sem == sem_acquire && __scope == scope_gpu) + { + asm("multimem.ld_reduce.acquire.gpu.global.add.u64 %0, [%1];" + : "=l"(__dest) + : "l"(__as_ptr_gmem(__addr)) + : "memory"); + } + else _CCCL_IF_CONSTEXPR (__sem == sem_acquire && __scope == scope_sys) + { + asm("multimem.ld_reduce.acquire.sys.global.add.u64 %0, [%1];" + : "=l"(__dest) + : "l"(__as_ptr_gmem(__addr)) + : "memory"); + } + return __dest; +# else + // Unsupported architectures will have a linker error with a semi-decent error message + __cuda_ptx_multimem_ld_reduce_is_not_supported_before_SM_90__(); + return 0; +# endif +} +#endif // __cccl_ptx_isa >= 810 + +/* +// multimem.ld_reduce.sem.global.op.b32 dest, [addr]; // PTX ISA 81, SM_90 +// .sem = { .weak } +// .op = { .and } +template = true> +__device__ static inline B32 multimem_ld_reduce( + cuda::ptx::sem_weak_t, + cuda::ptx::op_and_op_t, + const B32* addr); +*/ +#if __cccl_ptx_isa >= 810 +extern "C" _CCCL_DEVICE void __cuda_ptx_multimem_ld_reduce_is_not_supported_before_SM_90__(); +template = true> +_CCCL_DEVICE static inline _B32 multimem_ld_reduce(sem_weak_t, op_and_op_t, const _B32* __addr) +{ + // __sem == sem_weak (due to parameter type constraint) + // __op == op_and_op (due to parameter type constraint) + static_assert(sizeof(_B32) == 4, ""); +# if _CCCL_CUDA_COMPILER(NVHPC) || __CUDA_ARCH__ >= 900 + _CUDA_VSTD::uint32_t __dest; + asm("multimem.ld_reduce.weak.global.and.b32 
%0, [%1];" : "=r"(__dest) : "l"(__as_ptr_gmem(__addr)) : "memory"); + return *reinterpret_cast<_B32*>(&__dest); +# else + // Unsupported architectures will have a linker error with a semi-decent error message + __cuda_ptx_multimem_ld_reduce_is_not_supported_before_SM_90__(); + _CUDA_VSTD::uint32_t __err_out_var = 0; + return *reinterpret_cast<_B32*>(&__err_out_var); +# endif +} +#endif // __cccl_ptx_isa >= 810 + +/* +// multimem.ld_reduce.sem.scope.global.op.b32 dest, [addr]; // PTX ISA 81, SM_90 +// .sem = { .relaxed, .acquire } +// .scope = { .cta, .cluster, .gpu, .sys } +// .op = { .and } +template = true, cuda::ptx::dot_sem Sem, cuda::ptx::dot_scope Scope> +__device__ static inline B32 multimem_ld_reduce( + cuda::ptx::sem_t sem, + cuda::ptx::scope_t scope, + cuda::ptx::op_and_op_t, + const B32* addr); +*/ +#if __cccl_ptx_isa >= 810 +extern "C" _CCCL_DEVICE void __cuda_ptx_multimem_ld_reduce_is_not_supported_before_SM_90__(); +template = true, dot_sem _Sem, dot_scope _Scope> +_CCCL_DEVICE static inline _B32 +multimem_ld_reduce(sem_t<_Sem> __sem, scope_t<_Scope> __scope, op_and_op_t, const _B32* __addr) +{ + static_assert(__sem == sem_relaxed || __sem == sem_acquire, ""); + static_assert(__scope == scope_cta || __scope == scope_cluster || __scope == scope_gpu || __scope == scope_sys, ""); + // __op == op_and_op (due to parameter type constraint) + static_assert(sizeof(_B32) == 4, ""); +# if _CCCL_CUDA_COMPILER(NVHPC) || __CUDA_ARCH__ >= 900 + _CUDA_VSTD::uint32_t __dest; + _CCCL_IF_CONSTEXPR (__sem == sem_relaxed && __scope == scope_cta) + { + asm("multimem.ld_reduce.relaxed.cta.global.and.b32 %0, [%1];" + : "=r"(__dest) + : "l"(__as_ptr_gmem(__addr)) + : "memory"); + } + else _CCCL_IF_CONSTEXPR (__sem == sem_relaxed && __scope == scope_cluster) + { + asm("multimem.ld_reduce.relaxed.cluster.global.and.b32 %0, [%1];" + : "=r"(__dest) + : "l"(__as_ptr_gmem(__addr)) + : "memory"); + } + else _CCCL_IF_CONSTEXPR (__sem == sem_relaxed && __scope == scope_gpu) + { + asm("multimem.ld_reduce.relaxed.gpu.global.and.b32 %0, [%1];" + : "=r"(__dest) + : "l"(__as_ptr_gmem(__addr)) + : "memory"); + } + else _CCCL_IF_CONSTEXPR (__sem == sem_relaxed && __scope == scope_sys) + { + asm("multimem.ld_reduce.relaxed.sys.global.and.b32 %0, [%1];" + : "=r"(__dest) + : "l"(__as_ptr_gmem(__addr)) + : "memory"); + } + else _CCCL_IF_CONSTEXPR (__sem == sem_acquire && __scope == scope_cta) + { + asm("multimem.ld_reduce.acquire.cta.global.and.b32 %0, [%1];" + : "=r"(__dest) + : "l"(__as_ptr_gmem(__addr)) + : "memory"); + } + else _CCCL_IF_CONSTEXPR (__sem == sem_acquire && __scope == scope_cluster) + { + asm("multimem.ld_reduce.acquire.cluster.global.and.b32 %0, [%1];" + : "=r"(__dest) + : "l"(__as_ptr_gmem(__addr)) + : "memory"); + } + else _CCCL_IF_CONSTEXPR (__sem == sem_acquire && __scope == scope_gpu) + { + asm("multimem.ld_reduce.acquire.gpu.global.and.b32 %0, [%1];" + : "=r"(__dest) + : "l"(__as_ptr_gmem(__addr)) + : "memory"); + } + else _CCCL_IF_CONSTEXPR (__sem == sem_acquire && __scope == scope_sys) + { + asm("multimem.ld_reduce.acquire.sys.global.and.b32 %0, [%1];" + : "=r"(__dest) + : "l"(__as_ptr_gmem(__addr)) + : "memory"); + } + return *reinterpret_cast<_B32*>(&__dest); +# else + // Unsupported architectures will have a linker error with a semi-decent error message + __cuda_ptx_multimem_ld_reduce_is_not_supported_before_SM_90__(); + _CUDA_VSTD::uint32_t __err_out_var = 0; + return *reinterpret_cast<_B32*>(&__err_out_var); +# endif +} +#endif // __cccl_ptx_isa >= 810 + +/* +// 
multimem.ld_reduce.sem.global.op.b32 dest, [addr]; // PTX ISA 81, SM_90 +// .sem = { .weak } +// .op = { .or } +template = true> +__device__ static inline B32 multimem_ld_reduce( + cuda::ptx::sem_weak_t, + cuda::ptx::op_or_op_t, + const B32* addr); +*/ +#if __cccl_ptx_isa >= 810 +extern "C" _CCCL_DEVICE void __cuda_ptx_multimem_ld_reduce_is_not_supported_before_SM_90__(); +template = true> +_CCCL_DEVICE static inline _B32 multimem_ld_reduce(sem_weak_t, op_or_op_t, const _B32* __addr) +{ + // __sem == sem_weak (due to parameter type constraint) + // __op == op_or_op (due to parameter type constraint) + static_assert(sizeof(_B32) == 4, ""); +# if _CCCL_CUDA_COMPILER(NVHPC) || __CUDA_ARCH__ >= 900 + _CUDA_VSTD::uint32_t __dest; + asm("multimem.ld_reduce.weak.global.or.b32 %0, [%1];" : "=r"(__dest) : "l"(__as_ptr_gmem(__addr)) : "memory"); + return *reinterpret_cast<_B32*>(&__dest); +# else + // Unsupported architectures will have a linker error with a semi-decent error message + __cuda_ptx_multimem_ld_reduce_is_not_supported_before_SM_90__(); + _CUDA_VSTD::uint32_t __err_out_var = 0; + return *reinterpret_cast<_B32*>(&__err_out_var); +# endif +} +#endif // __cccl_ptx_isa >= 810 + +/* +// multimem.ld_reduce.sem.scope.global.op.b32 dest, [addr]; // PTX ISA 81, SM_90 +// .sem = { .relaxed, .acquire } +// .scope = { .cta, .cluster, .gpu, .sys } +// .op = { .or } +template = true, cuda::ptx::dot_sem Sem, cuda::ptx::dot_scope Scope> +__device__ static inline B32 multimem_ld_reduce( + cuda::ptx::sem_t sem, + cuda::ptx::scope_t scope, + cuda::ptx::op_or_op_t, + const B32* addr); +*/ +#if __cccl_ptx_isa >= 810 +extern "C" _CCCL_DEVICE void __cuda_ptx_multimem_ld_reduce_is_not_supported_before_SM_90__(); +template = true, dot_sem _Sem, dot_scope _Scope> +_CCCL_DEVICE static inline _B32 +multimem_ld_reduce(sem_t<_Sem> __sem, scope_t<_Scope> __scope, op_or_op_t, const _B32* __addr) +{ + static_assert(__sem == sem_relaxed || __sem == sem_acquire, ""); + static_assert(__scope == scope_cta || __scope == scope_cluster || __scope == scope_gpu || __scope == scope_sys, ""); + // __op == op_or_op (due to parameter type constraint) + static_assert(sizeof(_B32) == 4, ""); +# if _CCCL_CUDA_COMPILER(NVHPC) || __CUDA_ARCH__ >= 900 + _CUDA_VSTD::uint32_t __dest; + _CCCL_IF_CONSTEXPR (__sem == sem_relaxed && __scope == scope_cta) + { + asm("multimem.ld_reduce.relaxed.cta.global.or.b32 %0, [%1];" + : "=r"(__dest) + : "l"(__as_ptr_gmem(__addr)) + : "memory"); + } + else _CCCL_IF_CONSTEXPR (__sem == sem_relaxed && __scope == scope_cluster) + { + asm("multimem.ld_reduce.relaxed.cluster.global.or.b32 %0, [%1];" + : "=r"(__dest) + : "l"(__as_ptr_gmem(__addr)) + : "memory"); + } + else _CCCL_IF_CONSTEXPR (__sem == sem_relaxed && __scope == scope_gpu) + { + asm("multimem.ld_reduce.relaxed.gpu.global.or.b32 %0, [%1];" + : "=r"(__dest) + : "l"(__as_ptr_gmem(__addr)) + : "memory"); + } + else _CCCL_IF_CONSTEXPR (__sem == sem_relaxed && __scope == scope_sys) + { + asm("multimem.ld_reduce.relaxed.sys.global.or.b32 %0, [%1];" + : "=r"(__dest) + : "l"(__as_ptr_gmem(__addr)) + : "memory"); + } + else _CCCL_IF_CONSTEXPR (__sem == sem_acquire && __scope == scope_cta) + { + asm("multimem.ld_reduce.acquire.cta.global.or.b32 %0, [%1];" + : "=r"(__dest) + : "l"(__as_ptr_gmem(__addr)) + : "memory"); + } + else _CCCL_IF_CONSTEXPR (__sem == sem_acquire && __scope == scope_cluster) + { + asm("multimem.ld_reduce.acquire.cluster.global.or.b32 %0, [%1];" + : "=r"(__dest) + : "l"(__as_ptr_gmem(__addr)) + : "memory"); + } + else _CCCL_IF_CONSTEXPR 
(__sem == sem_acquire && __scope == scope_gpu) + { + asm("multimem.ld_reduce.acquire.gpu.global.or.b32 %0, [%1];" + : "=r"(__dest) + : "l"(__as_ptr_gmem(__addr)) + : "memory"); + } + else _CCCL_IF_CONSTEXPR (__sem == sem_acquire && __scope == scope_sys) + { + asm("multimem.ld_reduce.acquire.sys.global.or.b32 %0, [%1];" + : "=r"(__dest) + : "l"(__as_ptr_gmem(__addr)) + : "memory"); + } + return *reinterpret_cast<_B32*>(&__dest); +# else + // Unsupported architectures will have a linker error with a semi-decent error message + __cuda_ptx_multimem_ld_reduce_is_not_supported_before_SM_90__(); + _CUDA_VSTD::uint32_t __err_out_var = 0; + return *reinterpret_cast<_B32*>(&__err_out_var); +# endif +} +#endif // __cccl_ptx_isa >= 810 + +/* +// multimem.ld_reduce.sem.global.op.b32 dest, [addr]; // PTX ISA 81, SM_90 +// .sem = { .weak } +// .op = { .xor } +template = true> +__device__ static inline B32 multimem_ld_reduce( + cuda::ptx::sem_weak_t, + cuda::ptx::op_xor_op_t, + const B32* addr); +*/ +#if __cccl_ptx_isa >= 810 +extern "C" _CCCL_DEVICE void __cuda_ptx_multimem_ld_reduce_is_not_supported_before_SM_90__(); +template = true> +_CCCL_DEVICE static inline _B32 multimem_ld_reduce(sem_weak_t, op_xor_op_t, const _B32* __addr) +{ + // __sem == sem_weak (due to parameter type constraint) + // __op == op_xor_op (due to parameter type constraint) + static_assert(sizeof(_B32) == 4, ""); +# if _CCCL_CUDA_COMPILER(NVHPC) || __CUDA_ARCH__ >= 900 + _CUDA_VSTD::uint32_t __dest; + asm("multimem.ld_reduce.weak.global.xor.b32 %0, [%1];" : "=r"(__dest) : "l"(__as_ptr_gmem(__addr)) : "memory"); + return *reinterpret_cast<_B32*>(&__dest); +# else + // Unsupported architectures will have a linker error with a semi-decent error message + __cuda_ptx_multimem_ld_reduce_is_not_supported_before_SM_90__(); + _CUDA_VSTD::uint32_t __err_out_var = 0; + return *reinterpret_cast<_B32*>(&__err_out_var); +# endif +} +#endif // __cccl_ptx_isa >= 810 + +/* +// multimem.ld_reduce.sem.scope.global.op.b32 dest, [addr]; // PTX ISA 81, SM_90 +// .sem = { .relaxed, .acquire } +// .scope = { .cta, .cluster, .gpu, .sys } +// .op = { .xor } +template = true, cuda::ptx::dot_sem Sem, cuda::ptx::dot_scope Scope> +__device__ static inline B32 multimem_ld_reduce( + cuda::ptx::sem_t sem, + cuda::ptx::scope_t scope, + cuda::ptx::op_xor_op_t, + const B32* addr); +*/ +#if __cccl_ptx_isa >= 810 +extern "C" _CCCL_DEVICE void __cuda_ptx_multimem_ld_reduce_is_not_supported_before_SM_90__(); +template = true, dot_sem _Sem, dot_scope _Scope> +_CCCL_DEVICE static inline _B32 +multimem_ld_reduce(sem_t<_Sem> __sem, scope_t<_Scope> __scope, op_xor_op_t, const _B32* __addr) +{ + static_assert(__sem == sem_relaxed || __sem == sem_acquire, ""); + static_assert(__scope == scope_cta || __scope == scope_cluster || __scope == scope_gpu || __scope == scope_sys, ""); + // __op == op_xor_op (due to parameter type constraint) + static_assert(sizeof(_B32) == 4, ""); +# if _CCCL_CUDA_COMPILER(NVHPC) || __CUDA_ARCH__ >= 900 + _CUDA_VSTD::uint32_t __dest; + _CCCL_IF_CONSTEXPR (__sem == sem_relaxed && __scope == scope_cta) + { + asm("multimem.ld_reduce.relaxed.cta.global.xor.b32 %0, [%1];" + : "=r"(__dest) + : "l"(__as_ptr_gmem(__addr)) + : "memory"); + } + else _CCCL_IF_CONSTEXPR (__sem == sem_relaxed && __scope == scope_cluster) + { + asm("multimem.ld_reduce.relaxed.cluster.global.xor.b32 %0, [%1];" + : "=r"(__dest) + : "l"(__as_ptr_gmem(__addr)) + : "memory"); + } + else _CCCL_IF_CONSTEXPR (__sem == sem_relaxed && __scope == scope_gpu) + { + 
asm("multimem.ld_reduce.relaxed.gpu.global.xor.b32 %0, [%1];" + : "=r"(__dest) + : "l"(__as_ptr_gmem(__addr)) + : "memory"); + } + else _CCCL_IF_CONSTEXPR (__sem == sem_relaxed && __scope == scope_sys) + { + asm("multimem.ld_reduce.relaxed.sys.global.xor.b32 %0, [%1];" + : "=r"(__dest) + : "l"(__as_ptr_gmem(__addr)) + : "memory"); + } + else _CCCL_IF_CONSTEXPR (__sem == sem_acquire && __scope == scope_cta) + { + asm("multimem.ld_reduce.acquire.cta.global.xor.b32 %0, [%1];" + : "=r"(__dest) + : "l"(__as_ptr_gmem(__addr)) + : "memory"); + } + else _CCCL_IF_CONSTEXPR (__sem == sem_acquire && __scope == scope_cluster) + { + asm("multimem.ld_reduce.acquire.cluster.global.xor.b32 %0, [%1];" + : "=r"(__dest) + : "l"(__as_ptr_gmem(__addr)) + : "memory"); + } + else _CCCL_IF_CONSTEXPR (__sem == sem_acquire && __scope == scope_gpu) + { + asm("multimem.ld_reduce.acquire.gpu.global.xor.b32 %0, [%1];" + : "=r"(__dest) + : "l"(__as_ptr_gmem(__addr)) + : "memory"); + } + else _CCCL_IF_CONSTEXPR (__sem == sem_acquire && __scope == scope_sys) + { + asm("multimem.ld_reduce.acquire.sys.global.xor.b32 %0, [%1];" + : "=r"(__dest) + : "l"(__as_ptr_gmem(__addr)) + : "memory"); + } + return *reinterpret_cast<_B32*>(&__dest); +# else + // Unsupported architectures will have a linker error with a semi-decent error message + __cuda_ptx_multimem_ld_reduce_is_not_supported_before_SM_90__(); + _CUDA_VSTD::uint32_t __err_out_var = 0; + return *reinterpret_cast<_B32*>(&__err_out_var); +# endif +} +#endif // __cccl_ptx_isa >= 810 + +/* +// multimem.ld_reduce.sem.global.op.b64 dest, [addr]; // PTX ISA 81, SM_90 +// .sem = { .weak } +// .op = { .and } +template = true> +__device__ static inline B64 multimem_ld_reduce( + cuda::ptx::sem_weak_t, + cuda::ptx::op_and_op_t, + const B64* addr); +*/ +#if __cccl_ptx_isa >= 810 +extern "C" _CCCL_DEVICE void __cuda_ptx_multimem_ld_reduce_is_not_supported_before_SM_90__(); +template = true> +_CCCL_DEVICE static inline _B64 multimem_ld_reduce(sem_weak_t, op_and_op_t, const _B64* __addr) +{ + // __sem == sem_weak (due to parameter type constraint) + // __op == op_and_op (due to parameter type constraint) + static_assert(sizeof(_B64) == 8, ""); +# if _CCCL_CUDA_COMPILER(NVHPC) || __CUDA_ARCH__ >= 900 + _CUDA_VSTD::uint64_t __dest; + asm("multimem.ld_reduce.weak.global.and.b64 %0, [%1];" : "=l"(__dest) : "l"(__as_ptr_gmem(__addr)) : "memory"); + return *reinterpret_cast<_B64*>(&__dest); +# else + // Unsupported architectures will have a linker error with a semi-decent error message + __cuda_ptx_multimem_ld_reduce_is_not_supported_before_SM_90__(); + _CUDA_VSTD::uint64_t __err_out_var = 0; + return *reinterpret_cast<_B64*>(&__err_out_var); +# endif +} +#endif // __cccl_ptx_isa >= 810 + +/* +// multimem.ld_reduce.sem.scope.global.op.b64 dest, [addr]; // PTX ISA 81, SM_90 +// .sem = { .relaxed, .acquire } +// .scope = { .cta, .cluster, .gpu, .sys } +// .op = { .and } +template = true, cuda::ptx::dot_sem Sem, cuda::ptx::dot_scope Scope> +__device__ static inline B64 multimem_ld_reduce( + cuda::ptx::sem_t sem, + cuda::ptx::scope_t scope, + cuda::ptx::op_and_op_t, + const B64* addr); +*/ +#if __cccl_ptx_isa >= 810 +extern "C" _CCCL_DEVICE void __cuda_ptx_multimem_ld_reduce_is_not_supported_before_SM_90__(); +template = true, dot_sem _Sem, dot_scope _Scope> +_CCCL_DEVICE static inline _B64 +multimem_ld_reduce(sem_t<_Sem> __sem, scope_t<_Scope> __scope, op_and_op_t, const _B64* __addr) +{ + static_assert(__sem == sem_relaxed || __sem == sem_acquire, ""); + static_assert(__scope == scope_cta || 
__scope == scope_cluster || __scope == scope_gpu || __scope == scope_sys, ""); + // __op == op_and_op (due to parameter type constraint) + static_assert(sizeof(_B64) == 8, ""); +# if _CCCL_CUDA_COMPILER(NVHPC) || __CUDA_ARCH__ >= 900 + _CUDA_VSTD::uint64_t __dest; + _CCCL_IF_CONSTEXPR (__sem == sem_relaxed && __scope == scope_cta) + { + asm("multimem.ld_reduce.relaxed.cta.global.and.b64 %0, [%1];" + : "=l"(__dest) + : "l"(__as_ptr_gmem(__addr)) + : "memory"); + } + else _CCCL_IF_CONSTEXPR (__sem == sem_relaxed && __scope == scope_cluster) + { + asm("multimem.ld_reduce.relaxed.cluster.global.and.b64 %0, [%1];" + : "=l"(__dest) + : "l"(__as_ptr_gmem(__addr)) + : "memory"); + } + else _CCCL_IF_CONSTEXPR (__sem == sem_relaxed && __scope == scope_gpu) + { + asm("multimem.ld_reduce.relaxed.gpu.global.and.b64 %0, [%1];" + : "=l"(__dest) + : "l"(__as_ptr_gmem(__addr)) + : "memory"); + } + else _CCCL_IF_CONSTEXPR (__sem == sem_relaxed && __scope == scope_sys) + { + asm("multimem.ld_reduce.relaxed.sys.global.and.b64 %0, [%1];" + : "=l"(__dest) + : "l"(__as_ptr_gmem(__addr)) + : "memory"); + } + else _CCCL_IF_CONSTEXPR (__sem == sem_acquire && __scope == scope_cta) + { + asm("multimem.ld_reduce.acquire.cta.global.and.b64 %0, [%1];" + : "=l"(__dest) + : "l"(__as_ptr_gmem(__addr)) + : "memory"); + } + else _CCCL_IF_CONSTEXPR (__sem == sem_acquire && __scope == scope_cluster) + { + asm("multimem.ld_reduce.acquire.cluster.global.and.b64 %0, [%1];" + : "=l"(__dest) + : "l"(__as_ptr_gmem(__addr)) + : "memory"); + } + else _CCCL_IF_CONSTEXPR (__sem == sem_acquire && __scope == scope_gpu) + { + asm("multimem.ld_reduce.acquire.gpu.global.and.b64 %0, [%1];" + : "=l"(__dest) + : "l"(__as_ptr_gmem(__addr)) + : "memory"); + } + else _CCCL_IF_CONSTEXPR (__sem == sem_acquire && __scope == scope_sys) + { + asm("multimem.ld_reduce.acquire.sys.global.and.b64 %0, [%1];" + : "=l"(__dest) + : "l"(__as_ptr_gmem(__addr)) + : "memory"); + } + return *reinterpret_cast<_B64*>(&__dest); +# else + // Unsupported architectures will have a linker error with a semi-decent error message + __cuda_ptx_multimem_ld_reduce_is_not_supported_before_SM_90__(); + _CUDA_VSTD::uint64_t __err_out_var = 0; + return *reinterpret_cast<_B64*>(&__err_out_var); +# endif +} +#endif // __cccl_ptx_isa >= 810 + +/* +// multimem.ld_reduce.sem.global.op.b64 dest, [addr]; // PTX ISA 81, SM_90 +// .sem = { .weak } +// .op = { .or } +template = true> +__device__ static inline B64 multimem_ld_reduce( + cuda::ptx::sem_weak_t, + cuda::ptx::op_or_op_t, + const B64* addr); +*/ +#if __cccl_ptx_isa >= 810 +extern "C" _CCCL_DEVICE void __cuda_ptx_multimem_ld_reduce_is_not_supported_before_SM_90__(); +template = true> +_CCCL_DEVICE static inline _B64 multimem_ld_reduce(sem_weak_t, op_or_op_t, const _B64* __addr) +{ + // __sem == sem_weak (due to parameter type constraint) + // __op == op_or_op (due to parameter type constraint) + static_assert(sizeof(_B64) == 8, ""); +# if _CCCL_CUDA_COMPILER(NVHPC) || __CUDA_ARCH__ >= 900 + _CUDA_VSTD::uint64_t __dest; + asm("multimem.ld_reduce.weak.global.or.b64 %0, [%1];" : "=l"(__dest) : "l"(__as_ptr_gmem(__addr)) : "memory"); + return *reinterpret_cast<_B64*>(&__dest); +# else + // Unsupported architectures will have a linker error with a semi-decent error message + __cuda_ptx_multimem_ld_reduce_is_not_supported_before_SM_90__(); + _CUDA_VSTD::uint64_t __err_out_var = 0; + return *reinterpret_cast<_B64*>(&__err_out_var); +# endif +} +#endif // __cccl_ptx_isa >= 810 + +/* +// multimem.ld_reduce.sem.scope.global.op.b64 dest, 
[addr]; // PTX ISA 81, SM_90 +// .sem = { .relaxed, .acquire } +// .scope = { .cta, .cluster, .gpu, .sys } +// .op = { .or } +template = true, cuda::ptx::dot_sem Sem, cuda::ptx::dot_scope Scope> +__device__ static inline B64 multimem_ld_reduce( + cuda::ptx::sem_t sem, + cuda::ptx::scope_t scope, + cuda::ptx::op_or_op_t, + const B64* addr); +*/ +#if __cccl_ptx_isa >= 810 +extern "C" _CCCL_DEVICE void __cuda_ptx_multimem_ld_reduce_is_not_supported_before_SM_90__(); +template = true, dot_sem _Sem, dot_scope _Scope> +_CCCL_DEVICE static inline _B64 +multimem_ld_reduce(sem_t<_Sem> __sem, scope_t<_Scope> __scope, op_or_op_t, const _B64* __addr) +{ + static_assert(__sem == sem_relaxed || __sem == sem_acquire, ""); + static_assert(__scope == scope_cta || __scope == scope_cluster || __scope == scope_gpu || __scope == scope_sys, ""); + // __op == op_or_op (due to parameter type constraint) + static_assert(sizeof(_B64) == 8, ""); +# if _CCCL_CUDA_COMPILER(NVHPC) || __CUDA_ARCH__ >= 900 + _CUDA_VSTD::uint64_t __dest; + _CCCL_IF_CONSTEXPR (__sem == sem_relaxed && __scope == scope_cta) + { + asm("multimem.ld_reduce.relaxed.cta.global.or.b64 %0, [%1];" + : "=l"(__dest) + : "l"(__as_ptr_gmem(__addr)) + : "memory"); + } + else _CCCL_IF_CONSTEXPR (__sem == sem_relaxed && __scope == scope_cluster) + { + asm("multimem.ld_reduce.relaxed.cluster.global.or.b64 %0, [%1];" + : "=l"(__dest) + : "l"(__as_ptr_gmem(__addr)) + : "memory"); + } + else _CCCL_IF_CONSTEXPR (__sem == sem_relaxed && __scope == scope_gpu) + { + asm("multimem.ld_reduce.relaxed.gpu.global.or.b64 %0, [%1];" + : "=l"(__dest) + : "l"(__as_ptr_gmem(__addr)) + : "memory"); + } + else _CCCL_IF_CONSTEXPR (__sem == sem_relaxed && __scope == scope_sys) + { + asm("multimem.ld_reduce.relaxed.sys.global.or.b64 %0, [%1];" + : "=l"(__dest) + : "l"(__as_ptr_gmem(__addr)) + : "memory"); + } + else _CCCL_IF_CONSTEXPR (__sem == sem_acquire && __scope == scope_cta) + { + asm("multimem.ld_reduce.acquire.cta.global.or.b64 %0, [%1];" + : "=l"(__dest) + : "l"(__as_ptr_gmem(__addr)) + : "memory"); + } + else _CCCL_IF_CONSTEXPR (__sem == sem_acquire && __scope == scope_cluster) + { + asm("multimem.ld_reduce.acquire.cluster.global.or.b64 %0, [%1];" + : "=l"(__dest) + : "l"(__as_ptr_gmem(__addr)) + : "memory"); + } + else _CCCL_IF_CONSTEXPR (__sem == sem_acquire && __scope == scope_gpu) + { + asm("multimem.ld_reduce.acquire.gpu.global.or.b64 %0, [%1];" + : "=l"(__dest) + : "l"(__as_ptr_gmem(__addr)) + : "memory"); + } + else _CCCL_IF_CONSTEXPR (__sem == sem_acquire && __scope == scope_sys) + { + asm("multimem.ld_reduce.acquire.sys.global.or.b64 %0, [%1];" + : "=l"(__dest) + : "l"(__as_ptr_gmem(__addr)) + : "memory"); + } + return *reinterpret_cast<_B64*>(&__dest); +# else + // Unsupported architectures will have a linker error with a semi-decent error message + __cuda_ptx_multimem_ld_reduce_is_not_supported_before_SM_90__(); + _CUDA_VSTD::uint64_t __err_out_var = 0; + return *reinterpret_cast<_B64*>(&__err_out_var); +# endif +} +#endif // __cccl_ptx_isa >= 810 + +/* +// multimem.ld_reduce.sem.global.op.b64 dest, [addr]; // PTX ISA 81, SM_90 +// .sem = { .weak } +// .op = { .xor } +template = true> +__device__ static inline B64 multimem_ld_reduce( + cuda::ptx::sem_weak_t, + cuda::ptx::op_xor_op_t, + const B64* addr); +*/ +#if __cccl_ptx_isa >= 810 +extern "C" _CCCL_DEVICE void __cuda_ptx_multimem_ld_reduce_is_not_supported_before_SM_90__(); +template = true> +_CCCL_DEVICE static inline _B64 multimem_ld_reduce(sem_weak_t, op_xor_op_t, const _B64* __addr) +{ + // __sem == 
sem_weak (due to parameter type constraint) + // __op == op_xor_op (due to parameter type constraint) + static_assert(sizeof(_B64) == 8, ""); +# if _CCCL_CUDA_COMPILER(NVHPC) || __CUDA_ARCH__ >= 900 + _CUDA_VSTD::uint64_t __dest; + asm("multimem.ld_reduce.weak.global.xor.b64 %0, [%1];" : "=l"(__dest) : "l"(__as_ptr_gmem(__addr)) : "memory"); + return *reinterpret_cast<_B64*>(&__dest); +# else + // Unsupported architectures will have a linker error with a semi-decent error message + __cuda_ptx_multimem_ld_reduce_is_not_supported_before_SM_90__(); + _CUDA_VSTD::uint64_t __err_out_var = 0; + return *reinterpret_cast<_B64*>(&__err_out_var); +# endif +} +#endif // __cccl_ptx_isa >= 810 + +/* +// multimem.ld_reduce.sem.scope.global.op.b64 dest, [addr]; // PTX ISA 81, SM_90 +// .sem = { .relaxed, .acquire } +// .scope = { .cta, .cluster, .gpu, .sys } +// .op = { .xor } +template = true, cuda::ptx::dot_sem Sem, cuda::ptx::dot_scope Scope> +__device__ static inline B64 multimem_ld_reduce( + cuda::ptx::sem_t sem, + cuda::ptx::scope_t scope, + cuda::ptx::op_xor_op_t, + const B64* addr); +*/ +#if __cccl_ptx_isa >= 810 +extern "C" _CCCL_DEVICE void __cuda_ptx_multimem_ld_reduce_is_not_supported_before_SM_90__(); +template = true, dot_sem _Sem, dot_scope _Scope> +_CCCL_DEVICE static inline _B64 +multimem_ld_reduce(sem_t<_Sem> __sem, scope_t<_Scope> __scope, op_xor_op_t, const _B64* __addr) +{ + static_assert(__sem == sem_relaxed || __sem == sem_acquire, ""); + static_assert(__scope == scope_cta || __scope == scope_cluster || __scope == scope_gpu || __scope == scope_sys, ""); + // __op == op_xor_op (due to parameter type constraint) + static_assert(sizeof(_B64) == 8, ""); +# if _CCCL_CUDA_COMPILER(NVHPC) || __CUDA_ARCH__ >= 900 + _CUDA_VSTD::uint64_t __dest; + _CCCL_IF_CONSTEXPR (__sem == sem_relaxed && __scope == scope_cta) + { + asm("multimem.ld_reduce.relaxed.cta.global.xor.b64 %0, [%1];" + : "=l"(__dest) + : "l"(__as_ptr_gmem(__addr)) + : "memory"); + } + else _CCCL_IF_CONSTEXPR (__sem == sem_relaxed && __scope == scope_cluster) + { + asm("multimem.ld_reduce.relaxed.cluster.global.xor.b64 %0, [%1];" + : "=l"(__dest) + : "l"(__as_ptr_gmem(__addr)) + : "memory"); + } + else _CCCL_IF_CONSTEXPR (__sem == sem_relaxed && __scope == scope_gpu) + { + asm("multimem.ld_reduce.relaxed.gpu.global.xor.b64 %0, [%1];" + : "=l"(__dest) + : "l"(__as_ptr_gmem(__addr)) + : "memory"); + } + else _CCCL_IF_CONSTEXPR (__sem == sem_relaxed && __scope == scope_sys) + { + asm("multimem.ld_reduce.relaxed.sys.global.xor.b64 %0, [%1];" + : "=l"(__dest) + : "l"(__as_ptr_gmem(__addr)) + : "memory"); + } + else _CCCL_IF_CONSTEXPR (__sem == sem_acquire && __scope == scope_cta) + { + asm("multimem.ld_reduce.acquire.cta.global.xor.b64 %0, [%1];" + : "=l"(__dest) + : "l"(__as_ptr_gmem(__addr)) + : "memory"); + } + else _CCCL_IF_CONSTEXPR (__sem == sem_acquire && __scope == scope_cluster) + { + asm("multimem.ld_reduce.acquire.cluster.global.xor.b64 %0, [%1];" + : "=l"(__dest) + : "l"(__as_ptr_gmem(__addr)) + : "memory"); + } + else _CCCL_IF_CONSTEXPR (__sem == sem_acquire && __scope == scope_gpu) + { + asm("multimem.ld_reduce.acquire.gpu.global.xor.b64 %0, [%1];" + : "=l"(__dest) + : "l"(__as_ptr_gmem(__addr)) + : "memory"); + } + else _CCCL_IF_CONSTEXPR (__sem == sem_acquire && __scope == scope_sys) + { + asm("multimem.ld_reduce.acquire.sys.global.xor.b64 %0, [%1];" + : "=l"(__dest) + : "l"(__as_ptr_gmem(__addr)) + : "memory"); + } + return *reinterpret_cast<_B64*>(&__dest); +# else + // Unsupported architectures will have a linker 
error with a semi-decent error message + __cuda_ptx_multimem_ld_reduce_is_not_supported_before_SM_90__(); + _CUDA_VSTD::uint64_t __err_out_var = 0; + return *reinterpret_cast<_B64*>(&__err_out_var); +# endif +} +#endif // __cccl_ptx_isa >= 810 + +#endif // _CUDA_PTX_GENERATED_MULTIMEM_LD_REDUCE_H_ diff --git a/libcudacxx/include/cuda/__ptx/instructions/generated/multimem_red.h b/libcudacxx/include/cuda/__ptx/instructions/generated/multimem_red.h new file mode 100644 index 00000000000..1ef97121d31 --- /dev/null +++ b/libcudacxx/include/cuda/__ptx/instructions/generated/multimem_red.h @@ -0,0 +1,1272 @@ +// This file was automatically generated. Do not edit. + +#ifndef _CUDA_PTX_GENERATED_MULTIMEM_RED_H_ +#define _CUDA_PTX_GENERATED_MULTIMEM_RED_H_ + +/* +// multimem.red.sem.scope.global.op.u32 [addr], val; // PTX ISA 81, SM_90 +// .sem = { .relaxed, .release } +// .scope = { .cta, .cluster, .gpu, .sys } +// .op = { .min } +template +__device__ static inline void multimem_red( + cuda::ptx::sem_t sem, + cuda::ptx::scope_t scope, + cuda::ptx::op_min_t, + uint32_t* addr, + uint32_t val); +*/ +#if __cccl_ptx_isa >= 810 +extern "C" _CCCL_DEVICE void __cuda_ptx_multimem_red_is_not_supported_before_SM_90__(); +template +_CCCL_DEVICE static inline void multimem_red( + sem_t<_Sem> __sem, scope_t<_Scope> __scope, op_min_t, _CUDA_VSTD::uint32_t* __addr, _CUDA_VSTD::uint32_t __val) +{ + static_assert(__sem == sem_relaxed || __sem == sem_release, ""); + static_assert(__scope == scope_cta || __scope == scope_cluster || __scope == scope_gpu || __scope == scope_sys, ""); +// __op == op_min (due to parameter type constraint) +# if _CCCL_CUDA_COMPILER(NVHPC) || __CUDA_ARCH__ >= 900 + _CCCL_IF_CONSTEXPR (__sem == sem_relaxed && __scope == scope_cta) + { + asm("multimem.red.relaxed.cta.global.min.u32 [%0], %1;" : : "l"(__as_ptr_gmem(__addr)), "r"(__val) : "memory"); + } + else _CCCL_IF_CONSTEXPR (__sem == sem_relaxed && __scope == scope_cluster) + { + asm("multimem.red.relaxed.cluster.global.min.u32 [%0], %1;" : : "l"(__as_ptr_gmem(__addr)), "r"(__val) : "memory"); + } + else _CCCL_IF_CONSTEXPR (__sem == sem_relaxed && __scope == scope_gpu) + { + asm("multimem.red.relaxed.gpu.global.min.u32 [%0], %1;" : : "l"(__as_ptr_gmem(__addr)), "r"(__val) : "memory"); + } + else _CCCL_IF_CONSTEXPR (__sem == sem_relaxed && __scope == scope_sys) + { + asm("multimem.red.relaxed.sys.global.min.u32 [%0], %1;" : : "l"(__as_ptr_gmem(__addr)), "r"(__val) : "memory"); + } + else _CCCL_IF_CONSTEXPR (__sem == sem_release && __scope == scope_cta) + { + asm("multimem.red.release.cta.global.min.u32 [%0], %1;" : : "l"(__as_ptr_gmem(__addr)), "r"(__val) : "memory"); + } + else _CCCL_IF_CONSTEXPR (__sem == sem_release && __scope == scope_cluster) + { + asm("multimem.red.release.cluster.global.min.u32 [%0], %1;" : : "l"(__as_ptr_gmem(__addr)), "r"(__val) : "memory"); + } + else _CCCL_IF_CONSTEXPR (__sem == sem_release && __scope == scope_gpu) + { + asm("multimem.red.release.gpu.global.min.u32 [%0], %1;" : : "l"(__as_ptr_gmem(__addr)), "r"(__val) : "memory"); + } + else _CCCL_IF_CONSTEXPR (__sem == sem_release && __scope == scope_sys) + { + asm("multimem.red.release.sys.global.min.u32 [%0], %1;" : : "l"(__as_ptr_gmem(__addr)), "r"(__val) : "memory"); + } +# else + // Unsupported architectures will have a linker error with a semi-decent error message + __cuda_ptx_multimem_red_is_not_supported_before_SM_90__(); +# endif +} +#endif // __cccl_ptx_isa >= 810 + +/* +// multimem.red.sem.scope.global.op.u64 [addr], val; // PTX ISA 81, SM_90 +// .sem 
= { .relaxed, .release } +// .scope = { .cta, .cluster, .gpu, .sys } +// .op = { .min } +template +__device__ static inline void multimem_red( + cuda::ptx::sem_t sem, + cuda::ptx::scope_t scope, + cuda::ptx::op_min_t, + uint64_t* addr, + uint64_t val); +*/ +#if __cccl_ptx_isa >= 810 +extern "C" _CCCL_DEVICE void __cuda_ptx_multimem_red_is_not_supported_before_SM_90__(); +template +_CCCL_DEVICE static inline void multimem_red( + sem_t<_Sem> __sem, scope_t<_Scope> __scope, op_min_t, _CUDA_VSTD::uint64_t* __addr, _CUDA_VSTD::uint64_t __val) +{ + static_assert(__sem == sem_relaxed || __sem == sem_release, ""); + static_assert(__scope == scope_cta || __scope == scope_cluster || __scope == scope_gpu || __scope == scope_sys, ""); +// __op == op_min (due to parameter type constraint) +# if _CCCL_CUDA_COMPILER(NVHPC) || __CUDA_ARCH__ >= 900 + _CCCL_IF_CONSTEXPR (__sem == sem_relaxed && __scope == scope_cta) + { + asm("multimem.red.relaxed.cta.global.min.u64 [%0], %1;" : : "l"(__as_ptr_gmem(__addr)), "l"(__val) : "memory"); + } + else _CCCL_IF_CONSTEXPR (__sem == sem_relaxed && __scope == scope_cluster) + { + asm("multimem.red.relaxed.cluster.global.min.u64 [%0], %1;" : : "l"(__as_ptr_gmem(__addr)), "l"(__val) : "memory"); + } + else _CCCL_IF_CONSTEXPR (__sem == sem_relaxed && __scope == scope_gpu) + { + asm("multimem.red.relaxed.gpu.global.min.u64 [%0], %1;" : : "l"(__as_ptr_gmem(__addr)), "l"(__val) : "memory"); + } + else _CCCL_IF_CONSTEXPR (__sem == sem_relaxed && __scope == scope_sys) + { + asm("multimem.red.relaxed.sys.global.min.u64 [%0], %1;" : : "l"(__as_ptr_gmem(__addr)), "l"(__val) : "memory"); + } + else _CCCL_IF_CONSTEXPR (__sem == sem_release && __scope == scope_cta) + { + asm("multimem.red.release.cta.global.min.u64 [%0], %1;" : : "l"(__as_ptr_gmem(__addr)), "l"(__val) : "memory"); + } + else _CCCL_IF_CONSTEXPR (__sem == sem_release && __scope == scope_cluster) + { + asm("multimem.red.release.cluster.global.min.u64 [%0], %1;" : : "l"(__as_ptr_gmem(__addr)), "l"(__val) : "memory"); + } + else _CCCL_IF_CONSTEXPR (__sem == sem_release && __scope == scope_gpu) + { + asm("multimem.red.release.gpu.global.min.u64 [%0], %1;" : : "l"(__as_ptr_gmem(__addr)), "l"(__val) : "memory"); + } + else _CCCL_IF_CONSTEXPR (__sem == sem_release && __scope == scope_sys) + { + asm("multimem.red.release.sys.global.min.u64 [%0], %1;" : : "l"(__as_ptr_gmem(__addr)), "l"(__val) : "memory"); + } +# else + // Unsupported architectures will have a linker error with a semi-decent error message + __cuda_ptx_multimem_red_is_not_supported_before_SM_90__(); +# endif +} +#endif // __cccl_ptx_isa >= 810 + +/* +// multimem.red.sem.scope.global.op.s32 [addr], val; // PTX ISA 81, SM_90 +// .sem = { .relaxed, .release } +// .scope = { .cta, .cluster, .gpu, .sys } +// .op = { .min } +template +__device__ static inline void multimem_red( + cuda::ptx::sem_t sem, + cuda::ptx::scope_t scope, + cuda::ptx::op_min_t, + int32_t* addr, + int32_t val); +*/ +#if __cccl_ptx_isa >= 810 +extern "C" _CCCL_DEVICE void __cuda_ptx_multimem_red_is_not_supported_before_SM_90__(); +template +_CCCL_DEVICE static inline void multimem_red( + sem_t<_Sem> __sem, scope_t<_Scope> __scope, op_min_t, _CUDA_VSTD::int32_t* __addr, _CUDA_VSTD::int32_t __val) +{ + static_assert(__sem == sem_relaxed || __sem == sem_release, ""); + static_assert(__scope == scope_cta || __scope == scope_cluster || __scope == scope_gpu || __scope == scope_sys, ""); +// __op == op_min (due to parameter type constraint) +# if _CCCL_CUDA_COMPILER(NVHPC) || __CUDA_ARCH__ >= 900 + 
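// Usage sketch for the multimem_red overloads in this header (illustrative
// only, not part of the generated file). `mc_counter` is assumed to be a
// multimem address created with the CUDA multicast-object APIs; the reduction
// is applied in every memory that the address maps to and returns no value.
// Compile for sm_90 or newer.
//
//   #include <cuda/ptx>
//   #include <cuda/std/cstdint>
//
//   __global__ void bump_all_replicas(cuda::std::uint32_t* mc_counter)
//   {
//     // Relaxed add of 1 at cluster scope on each participating memory.
//     cuda::ptx::multimem_red(
//       cuda::ptx::sem_relaxed, cuda::ptx::scope_cluster, cuda::ptx::op_add, mc_counter, 1u);
//   }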
_CCCL_IF_CONSTEXPR (__sem == sem_relaxed && __scope == scope_cta) + { + asm("multimem.red.relaxed.cta.global.min.s32 [%0], %1;" : : "l"(__as_ptr_gmem(__addr)), "r"(__val) : "memory"); + } + else _CCCL_IF_CONSTEXPR (__sem == sem_relaxed && __scope == scope_cluster) + { + asm("multimem.red.relaxed.cluster.global.min.s32 [%0], %1;" : : "l"(__as_ptr_gmem(__addr)), "r"(__val) : "memory"); + } + else _CCCL_IF_CONSTEXPR (__sem == sem_relaxed && __scope == scope_gpu) + { + asm("multimem.red.relaxed.gpu.global.min.s32 [%0], %1;" : : "l"(__as_ptr_gmem(__addr)), "r"(__val) : "memory"); + } + else _CCCL_IF_CONSTEXPR (__sem == sem_relaxed && __scope == scope_sys) + { + asm("multimem.red.relaxed.sys.global.min.s32 [%0], %1;" : : "l"(__as_ptr_gmem(__addr)), "r"(__val) : "memory"); + } + else _CCCL_IF_CONSTEXPR (__sem == sem_release && __scope == scope_cta) + { + asm("multimem.red.release.cta.global.min.s32 [%0], %1;" : : "l"(__as_ptr_gmem(__addr)), "r"(__val) : "memory"); + } + else _CCCL_IF_CONSTEXPR (__sem == sem_release && __scope == scope_cluster) + { + asm("multimem.red.release.cluster.global.min.s32 [%0], %1;" : : "l"(__as_ptr_gmem(__addr)), "r"(__val) : "memory"); + } + else _CCCL_IF_CONSTEXPR (__sem == sem_release && __scope == scope_gpu) + { + asm("multimem.red.release.gpu.global.min.s32 [%0], %1;" : : "l"(__as_ptr_gmem(__addr)), "r"(__val) : "memory"); + } + else _CCCL_IF_CONSTEXPR (__sem == sem_release && __scope == scope_sys) + { + asm("multimem.red.release.sys.global.min.s32 [%0], %1;" : : "l"(__as_ptr_gmem(__addr)), "r"(__val) : "memory"); + } +# else + // Unsupported architectures will have a linker error with a semi-decent error message + __cuda_ptx_multimem_red_is_not_supported_before_SM_90__(); +# endif +} +#endif // __cccl_ptx_isa >= 810 + +/* +// multimem.red.sem.scope.global.op.s64 [addr], val; // PTX ISA 81, SM_90 +// .sem = { .relaxed, .release } +// .scope = { .cta, .cluster, .gpu, .sys } +// .op = { .min } +template +__device__ static inline void multimem_red( + cuda::ptx::sem_t sem, + cuda::ptx::scope_t scope, + cuda::ptx::op_min_t, + int64_t* addr, + int64_t val); +*/ +#if __cccl_ptx_isa >= 810 +extern "C" _CCCL_DEVICE void __cuda_ptx_multimem_red_is_not_supported_before_SM_90__(); +template +_CCCL_DEVICE static inline void multimem_red( + sem_t<_Sem> __sem, scope_t<_Scope> __scope, op_min_t, _CUDA_VSTD::int64_t* __addr, _CUDA_VSTD::int64_t __val) +{ + static_assert(__sem == sem_relaxed || __sem == sem_release, ""); + static_assert(__scope == scope_cta || __scope == scope_cluster || __scope == scope_gpu || __scope == scope_sys, ""); +// __op == op_min (due to parameter type constraint) +# if _CCCL_CUDA_COMPILER(NVHPC) || __CUDA_ARCH__ >= 900 + _CCCL_IF_CONSTEXPR (__sem == sem_relaxed && __scope == scope_cta) + { + asm("multimem.red.relaxed.cta.global.min.s64 [%0], %1;" : : "l"(__as_ptr_gmem(__addr)), "l"(__val) : "memory"); + } + else _CCCL_IF_CONSTEXPR (__sem == sem_relaxed && __scope == scope_cluster) + { + asm("multimem.red.relaxed.cluster.global.min.s64 [%0], %1;" : : "l"(__as_ptr_gmem(__addr)), "l"(__val) : "memory"); + } + else _CCCL_IF_CONSTEXPR (__sem == sem_relaxed && __scope == scope_gpu) + { + asm("multimem.red.relaxed.gpu.global.min.s64 [%0], %1;" : : "l"(__as_ptr_gmem(__addr)), "l"(__val) : "memory"); + } + else _CCCL_IF_CONSTEXPR (__sem == sem_relaxed && __scope == scope_sys) + { + asm("multimem.red.relaxed.sys.global.min.s64 [%0], %1;" : : "l"(__as_ptr_gmem(__addr)), "l"(__val) : "memory"); + } + else _CCCL_IF_CONSTEXPR (__sem == sem_release && __scope == 
scope_cta) + { + asm("multimem.red.release.cta.global.min.s64 [%0], %1;" : : "l"(__as_ptr_gmem(__addr)), "l"(__val) : "memory"); + } + else _CCCL_IF_CONSTEXPR (__sem == sem_release && __scope == scope_cluster) + { + asm("multimem.red.release.cluster.global.min.s64 [%0], %1;" : : "l"(__as_ptr_gmem(__addr)), "l"(__val) : "memory"); + } + else _CCCL_IF_CONSTEXPR (__sem == sem_release && __scope == scope_gpu) + { + asm("multimem.red.release.gpu.global.min.s64 [%0], %1;" : : "l"(__as_ptr_gmem(__addr)), "l"(__val) : "memory"); + } + else _CCCL_IF_CONSTEXPR (__sem == sem_release && __scope == scope_sys) + { + asm("multimem.red.release.sys.global.min.s64 [%0], %1;" : : "l"(__as_ptr_gmem(__addr)), "l"(__val) : "memory"); + } +# else + // Unsupported architectures will have a linker error with a semi-decent error message + __cuda_ptx_multimem_red_is_not_supported_before_SM_90__(); +# endif +} +#endif // __cccl_ptx_isa >= 810 + +/* +// multimem.red.sem.scope.global.op.u32 [addr], val; // PTX ISA 81, SM_90 +// .sem = { .relaxed, .release } +// .scope = { .cta, .cluster, .gpu, .sys } +// .op = { .max } +template +__device__ static inline void multimem_red( + cuda::ptx::sem_t sem, + cuda::ptx::scope_t scope, + cuda::ptx::op_max_t, + uint32_t* addr, + uint32_t val); +*/ +#if __cccl_ptx_isa >= 810 +extern "C" _CCCL_DEVICE void __cuda_ptx_multimem_red_is_not_supported_before_SM_90__(); +template +_CCCL_DEVICE static inline void multimem_red( + sem_t<_Sem> __sem, scope_t<_Scope> __scope, op_max_t, _CUDA_VSTD::uint32_t* __addr, _CUDA_VSTD::uint32_t __val) +{ + static_assert(__sem == sem_relaxed || __sem == sem_release, ""); + static_assert(__scope == scope_cta || __scope == scope_cluster || __scope == scope_gpu || __scope == scope_sys, ""); +// __op == op_max (due to parameter type constraint) +# if _CCCL_CUDA_COMPILER(NVHPC) || __CUDA_ARCH__ >= 900 + _CCCL_IF_CONSTEXPR (__sem == sem_relaxed && __scope == scope_cta) + { + asm("multimem.red.relaxed.cta.global.max.u32 [%0], %1;" : : "l"(__as_ptr_gmem(__addr)), "r"(__val) : "memory"); + } + else _CCCL_IF_CONSTEXPR (__sem == sem_relaxed && __scope == scope_cluster) + { + asm("multimem.red.relaxed.cluster.global.max.u32 [%0], %1;" : : "l"(__as_ptr_gmem(__addr)), "r"(__val) : "memory"); + } + else _CCCL_IF_CONSTEXPR (__sem == sem_relaxed && __scope == scope_gpu) + { + asm("multimem.red.relaxed.gpu.global.max.u32 [%0], %1;" : : "l"(__as_ptr_gmem(__addr)), "r"(__val) : "memory"); + } + else _CCCL_IF_CONSTEXPR (__sem == sem_relaxed && __scope == scope_sys) + { + asm("multimem.red.relaxed.sys.global.max.u32 [%0], %1;" : : "l"(__as_ptr_gmem(__addr)), "r"(__val) : "memory"); + } + else _CCCL_IF_CONSTEXPR (__sem == sem_release && __scope == scope_cta) + { + asm("multimem.red.release.cta.global.max.u32 [%0], %1;" : : "l"(__as_ptr_gmem(__addr)), "r"(__val) : "memory"); + } + else _CCCL_IF_CONSTEXPR (__sem == sem_release && __scope == scope_cluster) + { + asm("multimem.red.release.cluster.global.max.u32 [%0], %1;" : : "l"(__as_ptr_gmem(__addr)), "r"(__val) : "memory"); + } + else _CCCL_IF_CONSTEXPR (__sem == sem_release && __scope == scope_gpu) + { + asm("multimem.red.release.gpu.global.max.u32 [%0], %1;" : : "l"(__as_ptr_gmem(__addr)), "r"(__val) : "memory"); + } + else _CCCL_IF_CONSTEXPR (__sem == sem_release && __scope == scope_sys) + { + asm("multimem.red.release.sys.global.max.u32 [%0], %1;" : : "l"(__as_ptr_gmem(__addr)), "r"(__val) : "memory"); + } +# else + // Unsupported architectures will have a linker error with a semi-decent error message + 
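// (The sentinel below is declared `extern "C"` but never defined anywhere, so
// a call that is compiled for a device architecture below SM_90 fails at link
// time, with the function name spelling out why.)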
__cuda_ptx_multimem_red_is_not_supported_before_SM_90__(); +# endif +} +#endif // __cccl_ptx_isa >= 810 + +/* +// multimem.red.sem.scope.global.op.u64 [addr], val; // PTX ISA 81, SM_90 +// .sem = { .relaxed, .release } +// .scope = { .cta, .cluster, .gpu, .sys } +// .op = { .max } +template +__device__ static inline void multimem_red( + cuda::ptx::sem_t sem, + cuda::ptx::scope_t scope, + cuda::ptx::op_max_t, + uint64_t* addr, + uint64_t val); +*/ +#if __cccl_ptx_isa >= 810 +extern "C" _CCCL_DEVICE void __cuda_ptx_multimem_red_is_not_supported_before_SM_90__(); +template +_CCCL_DEVICE static inline void multimem_red( + sem_t<_Sem> __sem, scope_t<_Scope> __scope, op_max_t, _CUDA_VSTD::uint64_t* __addr, _CUDA_VSTD::uint64_t __val) +{ + static_assert(__sem == sem_relaxed || __sem == sem_release, ""); + static_assert(__scope == scope_cta || __scope == scope_cluster || __scope == scope_gpu || __scope == scope_sys, ""); +// __op == op_max (due to parameter type constraint) +# if _CCCL_CUDA_COMPILER(NVHPC) || __CUDA_ARCH__ >= 900 + _CCCL_IF_CONSTEXPR (__sem == sem_relaxed && __scope == scope_cta) + { + asm("multimem.red.relaxed.cta.global.max.u64 [%0], %1;" : : "l"(__as_ptr_gmem(__addr)), "l"(__val) : "memory"); + } + else _CCCL_IF_CONSTEXPR (__sem == sem_relaxed && __scope == scope_cluster) + { + asm("multimem.red.relaxed.cluster.global.max.u64 [%0], %1;" : : "l"(__as_ptr_gmem(__addr)), "l"(__val) : "memory"); + } + else _CCCL_IF_CONSTEXPR (__sem == sem_relaxed && __scope == scope_gpu) + { + asm("multimem.red.relaxed.gpu.global.max.u64 [%0], %1;" : : "l"(__as_ptr_gmem(__addr)), "l"(__val) : "memory"); + } + else _CCCL_IF_CONSTEXPR (__sem == sem_relaxed && __scope == scope_sys) + { + asm("multimem.red.relaxed.sys.global.max.u64 [%0], %1;" : : "l"(__as_ptr_gmem(__addr)), "l"(__val) : "memory"); + } + else _CCCL_IF_CONSTEXPR (__sem == sem_release && __scope == scope_cta) + { + asm("multimem.red.release.cta.global.max.u64 [%0], %1;" : : "l"(__as_ptr_gmem(__addr)), "l"(__val) : "memory"); + } + else _CCCL_IF_CONSTEXPR (__sem == sem_release && __scope == scope_cluster) + { + asm("multimem.red.release.cluster.global.max.u64 [%0], %1;" : : "l"(__as_ptr_gmem(__addr)), "l"(__val) : "memory"); + } + else _CCCL_IF_CONSTEXPR (__sem == sem_release && __scope == scope_gpu) + { + asm("multimem.red.release.gpu.global.max.u64 [%0], %1;" : : "l"(__as_ptr_gmem(__addr)), "l"(__val) : "memory"); + } + else _CCCL_IF_CONSTEXPR (__sem == sem_release && __scope == scope_sys) + { + asm("multimem.red.release.sys.global.max.u64 [%0], %1;" : : "l"(__as_ptr_gmem(__addr)), "l"(__val) : "memory"); + } +# else + // Unsupported architectures will have a linker error with a semi-decent error message + __cuda_ptx_multimem_red_is_not_supported_before_SM_90__(); +# endif +} +#endif // __cccl_ptx_isa >= 810 + +/* +// multimem.red.sem.scope.global.op.s32 [addr], val; // PTX ISA 81, SM_90 +// .sem = { .relaxed, .release } +// .scope = { .cta, .cluster, .gpu, .sys } +// .op = { .max } +template +__device__ static inline void multimem_red( + cuda::ptx::sem_t sem, + cuda::ptx::scope_t scope, + cuda::ptx::op_max_t, + int32_t* addr, + int32_t val); +*/ +#if __cccl_ptx_isa >= 810 +extern "C" _CCCL_DEVICE void __cuda_ptx_multimem_red_is_not_supported_before_SM_90__(); +template +_CCCL_DEVICE static inline void multimem_red( + sem_t<_Sem> __sem, scope_t<_Scope> __scope, op_max_t, _CUDA_VSTD::int32_t* __addr, _CUDA_VSTD::int32_t __val) +{ + static_assert(__sem == sem_relaxed || __sem == sem_release, ""); + static_assert(__scope == scope_cta || 
__scope == scope_cluster || __scope == scope_gpu || __scope == scope_sys, ""); +// __op == op_max (due to parameter type constraint) +# if _CCCL_CUDA_COMPILER(NVHPC) || __CUDA_ARCH__ >= 900 + _CCCL_IF_CONSTEXPR (__sem == sem_relaxed && __scope == scope_cta) + { + asm("multimem.red.relaxed.cta.global.max.s32 [%0], %1;" : : "l"(__as_ptr_gmem(__addr)), "r"(__val) : "memory"); + } + else _CCCL_IF_CONSTEXPR (__sem == sem_relaxed && __scope == scope_cluster) + { + asm("multimem.red.relaxed.cluster.global.max.s32 [%0], %1;" : : "l"(__as_ptr_gmem(__addr)), "r"(__val) : "memory"); + } + else _CCCL_IF_CONSTEXPR (__sem == sem_relaxed && __scope == scope_gpu) + { + asm("multimem.red.relaxed.gpu.global.max.s32 [%0], %1;" : : "l"(__as_ptr_gmem(__addr)), "r"(__val) : "memory"); + } + else _CCCL_IF_CONSTEXPR (__sem == sem_relaxed && __scope == scope_sys) + { + asm("multimem.red.relaxed.sys.global.max.s32 [%0], %1;" : : "l"(__as_ptr_gmem(__addr)), "r"(__val) : "memory"); + } + else _CCCL_IF_CONSTEXPR (__sem == sem_release && __scope == scope_cta) + { + asm("multimem.red.release.cta.global.max.s32 [%0], %1;" : : "l"(__as_ptr_gmem(__addr)), "r"(__val) : "memory"); + } + else _CCCL_IF_CONSTEXPR (__sem == sem_release && __scope == scope_cluster) + { + asm("multimem.red.release.cluster.global.max.s32 [%0], %1;" : : "l"(__as_ptr_gmem(__addr)), "r"(__val) : "memory"); + } + else _CCCL_IF_CONSTEXPR (__sem == sem_release && __scope == scope_gpu) + { + asm("multimem.red.release.gpu.global.max.s32 [%0], %1;" : : "l"(__as_ptr_gmem(__addr)), "r"(__val) : "memory"); + } + else _CCCL_IF_CONSTEXPR (__sem == sem_release && __scope == scope_sys) + { + asm("multimem.red.release.sys.global.max.s32 [%0], %1;" : : "l"(__as_ptr_gmem(__addr)), "r"(__val) : "memory"); + } +# else + // Unsupported architectures will have a linker error with a semi-decent error message + __cuda_ptx_multimem_red_is_not_supported_before_SM_90__(); +# endif +} +#endif // __cccl_ptx_isa >= 810 + +/* +// multimem.red.sem.scope.global.op.s64 [addr], val; // PTX ISA 81, SM_90 +// .sem = { .relaxed, .release } +// .scope = { .cta, .cluster, .gpu, .sys } +// .op = { .max } +template +__device__ static inline void multimem_red( + cuda::ptx::sem_t sem, + cuda::ptx::scope_t scope, + cuda::ptx::op_max_t, + int64_t* addr, + int64_t val); +*/ +#if __cccl_ptx_isa >= 810 +extern "C" _CCCL_DEVICE void __cuda_ptx_multimem_red_is_not_supported_before_SM_90__(); +template +_CCCL_DEVICE static inline void multimem_red( + sem_t<_Sem> __sem, scope_t<_Scope> __scope, op_max_t, _CUDA_VSTD::int64_t* __addr, _CUDA_VSTD::int64_t __val) +{ + static_assert(__sem == sem_relaxed || __sem == sem_release, ""); + static_assert(__scope == scope_cta || __scope == scope_cluster || __scope == scope_gpu || __scope == scope_sys, ""); +// __op == op_max (due to parameter type constraint) +# if _CCCL_CUDA_COMPILER(NVHPC) || __CUDA_ARCH__ >= 900 + _CCCL_IF_CONSTEXPR (__sem == sem_relaxed && __scope == scope_cta) + { + asm("multimem.red.relaxed.cta.global.max.s64 [%0], %1;" : : "l"(__as_ptr_gmem(__addr)), "l"(__val) : "memory"); + } + else _CCCL_IF_CONSTEXPR (__sem == sem_relaxed && __scope == scope_cluster) + { + asm("multimem.red.relaxed.cluster.global.max.s64 [%0], %1;" : : "l"(__as_ptr_gmem(__addr)), "l"(__val) : "memory"); + } + else _CCCL_IF_CONSTEXPR (__sem == sem_relaxed && __scope == scope_gpu) + { + asm("multimem.red.relaxed.gpu.global.max.s64 [%0], %1;" : : "l"(__as_ptr_gmem(__addr)), "l"(__val) : "memory"); + } + else _CCCL_IF_CONSTEXPR (__sem == sem_relaxed && __scope == scope_sys) 
+ { + asm("multimem.red.relaxed.sys.global.max.s64 [%0], %1;" : : "l"(__as_ptr_gmem(__addr)), "l"(__val) : "memory"); + } + else _CCCL_IF_CONSTEXPR (__sem == sem_release && __scope == scope_cta) + { + asm("multimem.red.release.cta.global.max.s64 [%0], %1;" : : "l"(__as_ptr_gmem(__addr)), "l"(__val) : "memory"); + } + else _CCCL_IF_CONSTEXPR (__sem == sem_release && __scope == scope_cluster) + { + asm("multimem.red.release.cluster.global.max.s64 [%0], %1;" : : "l"(__as_ptr_gmem(__addr)), "l"(__val) : "memory"); + } + else _CCCL_IF_CONSTEXPR (__sem == sem_release && __scope == scope_gpu) + { + asm("multimem.red.release.gpu.global.max.s64 [%0], %1;" : : "l"(__as_ptr_gmem(__addr)), "l"(__val) : "memory"); + } + else _CCCL_IF_CONSTEXPR (__sem == sem_release && __scope == scope_sys) + { + asm("multimem.red.release.sys.global.max.s64 [%0], %1;" : : "l"(__as_ptr_gmem(__addr)), "l"(__val) : "memory"); + } +# else + // Unsupported architectures will have a linker error with a semi-decent error message + __cuda_ptx_multimem_red_is_not_supported_before_SM_90__(); +# endif +} +#endif // __cccl_ptx_isa >= 810 + +/* +// multimem.red.sem.scope.global.op.u32 [addr], val; // PTX ISA 81, SM_90 +// .sem = { .relaxed, .release } +// .scope = { .cta, .cluster, .gpu, .sys } +// .op = { .add } +template +__device__ static inline void multimem_red( + cuda::ptx::sem_t sem, + cuda::ptx::scope_t scope, + cuda::ptx::op_add_t, + uint32_t* addr, + uint32_t val); +*/ +#if __cccl_ptx_isa >= 810 +extern "C" _CCCL_DEVICE void __cuda_ptx_multimem_red_is_not_supported_before_SM_90__(); +template +_CCCL_DEVICE static inline void multimem_red( + sem_t<_Sem> __sem, scope_t<_Scope> __scope, op_add_t, _CUDA_VSTD::uint32_t* __addr, _CUDA_VSTD::uint32_t __val) +{ + static_assert(__sem == sem_relaxed || __sem == sem_release, ""); + static_assert(__scope == scope_cta || __scope == scope_cluster || __scope == scope_gpu || __scope == scope_sys, ""); +// __op == op_add (due to parameter type constraint) +# if _CCCL_CUDA_COMPILER(NVHPC) || __CUDA_ARCH__ >= 900 + _CCCL_IF_CONSTEXPR (__sem == sem_relaxed && __scope == scope_cta) + { + asm("multimem.red.relaxed.cta.global.add.u32 [%0], %1;" : : "l"(__as_ptr_gmem(__addr)), "r"(__val) : "memory"); + } + else _CCCL_IF_CONSTEXPR (__sem == sem_relaxed && __scope == scope_cluster) + { + asm("multimem.red.relaxed.cluster.global.add.u32 [%0], %1;" : : "l"(__as_ptr_gmem(__addr)), "r"(__val) : "memory"); + } + else _CCCL_IF_CONSTEXPR (__sem == sem_relaxed && __scope == scope_gpu) + { + asm("multimem.red.relaxed.gpu.global.add.u32 [%0], %1;" : : "l"(__as_ptr_gmem(__addr)), "r"(__val) : "memory"); + } + else _CCCL_IF_CONSTEXPR (__sem == sem_relaxed && __scope == scope_sys) + { + asm("multimem.red.relaxed.sys.global.add.u32 [%0], %1;" : : "l"(__as_ptr_gmem(__addr)), "r"(__val) : "memory"); + } + else _CCCL_IF_CONSTEXPR (__sem == sem_release && __scope == scope_cta) + { + asm("multimem.red.release.cta.global.add.u32 [%0], %1;" : : "l"(__as_ptr_gmem(__addr)), "r"(__val) : "memory"); + } + else _CCCL_IF_CONSTEXPR (__sem == sem_release && __scope == scope_cluster) + { + asm("multimem.red.release.cluster.global.add.u32 [%0], %1;" : : "l"(__as_ptr_gmem(__addr)), "r"(__val) : "memory"); + } + else _CCCL_IF_CONSTEXPR (__sem == sem_release && __scope == scope_gpu) + { + asm("multimem.red.release.gpu.global.add.u32 [%0], %1;" : : "l"(__as_ptr_gmem(__addr)), "r"(__val) : "memory"); + } + else _CCCL_IF_CONSTEXPR (__sem == sem_release && __scope == scope_sys) + { + asm("multimem.red.release.sys.global.add.u32 [%0], 
%1;" : : "l"(__as_ptr_gmem(__addr)), "r"(__val) : "memory"); + } +# else + // Unsupported architectures will have a linker error with a semi-decent error message + __cuda_ptx_multimem_red_is_not_supported_before_SM_90__(); +# endif +} +#endif // __cccl_ptx_isa >= 810 + +/* +// multimem.red.sem.scope.global.op.u64 [addr], val; // PTX ISA 81, SM_90 +// .sem = { .relaxed, .release } +// .scope = { .cta, .cluster, .gpu, .sys } +// .op = { .add } +template +__device__ static inline void multimem_red( + cuda::ptx::sem_t sem, + cuda::ptx::scope_t scope, + cuda::ptx::op_add_t, + uint64_t* addr, + uint64_t val); +*/ +#if __cccl_ptx_isa >= 810 +extern "C" _CCCL_DEVICE void __cuda_ptx_multimem_red_is_not_supported_before_SM_90__(); +template +_CCCL_DEVICE static inline void multimem_red( + sem_t<_Sem> __sem, scope_t<_Scope> __scope, op_add_t, _CUDA_VSTD::uint64_t* __addr, _CUDA_VSTD::uint64_t __val) +{ + static_assert(__sem == sem_relaxed || __sem == sem_release, ""); + static_assert(__scope == scope_cta || __scope == scope_cluster || __scope == scope_gpu || __scope == scope_sys, ""); +// __op == op_add (due to parameter type constraint) +# if _CCCL_CUDA_COMPILER(NVHPC) || __CUDA_ARCH__ >= 900 + _CCCL_IF_CONSTEXPR (__sem == sem_relaxed && __scope == scope_cta) + { + asm("multimem.red.relaxed.cta.global.add.u64 [%0], %1;" : : "l"(__as_ptr_gmem(__addr)), "l"(__val) : "memory"); + } + else _CCCL_IF_CONSTEXPR (__sem == sem_relaxed && __scope == scope_cluster) + { + asm("multimem.red.relaxed.cluster.global.add.u64 [%0], %1;" : : "l"(__as_ptr_gmem(__addr)), "l"(__val) : "memory"); + } + else _CCCL_IF_CONSTEXPR (__sem == sem_relaxed && __scope == scope_gpu) + { + asm("multimem.red.relaxed.gpu.global.add.u64 [%0], %1;" : : "l"(__as_ptr_gmem(__addr)), "l"(__val) : "memory"); + } + else _CCCL_IF_CONSTEXPR (__sem == sem_relaxed && __scope == scope_sys) + { + asm("multimem.red.relaxed.sys.global.add.u64 [%0], %1;" : : "l"(__as_ptr_gmem(__addr)), "l"(__val) : "memory"); + } + else _CCCL_IF_CONSTEXPR (__sem == sem_release && __scope == scope_cta) + { + asm("multimem.red.release.cta.global.add.u64 [%0], %1;" : : "l"(__as_ptr_gmem(__addr)), "l"(__val) : "memory"); + } + else _CCCL_IF_CONSTEXPR (__sem == sem_release && __scope == scope_cluster) + { + asm("multimem.red.release.cluster.global.add.u64 [%0], %1;" : : "l"(__as_ptr_gmem(__addr)), "l"(__val) : "memory"); + } + else _CCCL_IF_CONSTEXPR (__sem == sem_release && __scope == scope_gpu) + { + asm("multimem.red.release.gpu.global.add.u64 [%0], %1;" : : "l"(__as_ptr_gmem(__addr)), "l"(__val) : "memory"); + } + else _CCCL_IF_CONSTEXPR (__sem == sem_release && __scope == scope_sys) + { + asm("multimem.red.release.sys.global.add.u64 [%0], %1;" : : "l"(__as_ptr_gmem(__addr)), "l"(__val) : "memory"); + } +# else + // Unsupported architectures will have a linker error with a semi-decent error message + __cuda_ptx_multimem_red_is_not_supported_before_SM_90__(); +# endif +} +#endif // __cccl_ptx_isa >= 810 + +/* +// multimem.red.sem.scope.global.op.s32 [addr], val; // PTX ISA 81, SM_90 +// .sem = { .relaxed, .release } +// .scope = { .cta, .cluster, .gpu, .sys } +// .op = { .add } +template +__device__ static inline void multimem_red( + cuda::ptx::sem_t sem, + cuda::ptx::scope_t scope, + cuda::ptx::op_add_t, + int32_t* addr, + int32_t val); +*/ +#if __cccl_ptx_isa >= 810 +extern "C" _CCCL_DEVICE void __cuda_ptx_multimem_red_is_not_supported_before_SM_90__(); +template +_CCCL_DEVICE static inline void multimem_red( + sem_t<_Sem> __sem, scope_t<_Scope> __scope, op_add_t, 
_CUDA_VSTD::int32_t* __addr, _CUDA_VSTD::int32_t __val) +{ + static_assert(__sem == sem_relaxed || __sem == sem_release, ""); + static_assert(__scope == scope_cta || __scope == scope_cluster || __scope == scope_gpu || __scope == scope_sys, ""); +// __op == op_add (due to parameter type constraint) +# if _CCCL_CUDA_COMPILER(NVHPC) || __CUDA_ARCH__ >= 900 + _CCCL_IF_CONSTEXPR (__sem == sem_relaxed && __scope == scope_cta) + { + asm("multimem.red.relaxed.cta.global.add.s32 [%0], %1;" : : "l"(__as_ptr_gmem(__addr)), "r"(__val) : "memory"); + } + else _CCCL_IF_CONSTEXPR (__sem == sem_relaxed && __scope == scope_cluster) + { + asm("multimem.red.relaxed.cluster.global.add.s32 [%0], %1;" : : "l"(__as_ptr_gmem(__addr)), "r"(__val) : "memory"); + } + else _CCCL_IF_CONSTEXPR (__sem == sem_relaxed && __scope == scope_gpu) + { + asm("multimem.red.relaxed.gpu.global.add.s32 [%0], %1;" : : "l"(__as_ptr_gmem(__addr)), "r"(__val) : "memory"); + } + else _CCCL_IF_CONSTEXPR (__sem == sem_relaxed && __scope == scope_sys) + { + asm("multimem.red.relaxed.sys.global.add.s32 [%0], %1;" : : "l"(__as_ptr_gmem(__addr)), "r"(__val) : "memory"); + } + else _CCCL_IF_CONSTEXPR (__sem == sem_release && __scope == scope_cta) + { + asm("multimem.red.release.cta.global.add.s32 [%0], %1;" : : "l"(__as_ptr_gmem(__addr)), "r"(__val) : "memory"); + } + else _CCCL_IF_CONSTEXPR (__sem == sem_release && __scope == scope_cluster) + { + asm("multimem.red.release.cluster.global.add.s32 [%0], %1;" : : "l"(__as_ptr_gmem(__addr)), "r"(__val) : "memory"); + } + else _CCCL_IF_CONSTEXPR (__sem == sem_release && __scope == scope_gpu) + { + asm("multimem.red.release.gpu.global.add.s32 [%0], %1;" : : "l"(__as_ptr_gmem(__addr)), "r"(__val) : "memory"); + } + else _CCCL_IF_CONSTEXPR (__sem == sem_release && __scope == scope_sys) + { + asm("multimem.red.release.sys.global.add.s32 [%0], %1;" : : "l"(__as_ptr_gmem(__addr)), "r"(__val) : "memory"); + } +# else + // Unsupported architectures will have a linker error with a semi-decent error message + __cuda_ptx_multimem_red_is_not_supported_before_SM_90__(); +# endif +} +#endif // __cccl_ptx_isa >= 810 + +/* +// multimem.red.sem.scope.global.op.u64 [addr], val; // PTX ISA 81, SM_90 +// .sem = { .relaxed, .release } +// .scope = { .cta, .cluster, .gpu, .sys } +// .op = { .add } +template +__device__ static inline void multimem_red( + cuda::ptx::sem_t sem, + cuda::ptx::scope_t scope, + cuda::ptx::op_add_t, + int64_t* addr, + int64_t val); +*/ +#if __cccl_ptx_isa >= 810 +extern "C" _CCCL_DEVICE void __cuda_ptx_multimem_red_is_not_supported_before_SM_90__(); +template +_CCCL_DEVICE static inline void multimem_red( + sem_t<_Sem> __sem, scope_t<_Scope> __scope, op_add_t, _CUDA_VSTD::int64_t* __addr, _CUDA_VSTD::int64_t __val) +{ + static_assert(__sem == sem_relaxed || __sem == sem_release, ""); + static_assert(__scope == scope_cta || __scope == scope_cluster || __scope == scope_gpu || __scope == scope_sys, ""); +// __op == op_add (due to parameter type constraint) +# if _CCCL_CUDA_COMPILER(NVHPC) || __CUDA_ARCH__ >= 900 + _CCCL_IF_CONSTEXPR (__sem == sem_relaxed && __scope == scope_cta) + { + asm("multimem.red.relaxed.cta.global.add.u64 [%0], %1;" : : "l"(__as_ptr_gmem(__addr)), "l"(__val) : "memory"); + } + else _CCCL_IF_CONSTEXPR (__sem == sem_relaxed && __scope == scope_cluster) + { + asm("multimem.red.relaxed.cluster.global.add.u64 [%0], %1;" : : "l"(__as_ptr_gmem(__addr)), "l"(__val) : "memory"); + } + else _CCCL_IF_CONSTEXPR (__sem == sem_relaxed && __scope == scope_gpu) + { + 
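// Note on this int64_t overload: it emits .add.u64, as the signature comment
// above also indicates. Two's-complement addition is bit-identical for signed
// and unsigned operands, so the result matches a signed add.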
asm("multimem.red.relaxed.gpu.global.add.u64 [%0], %1;" : : "l"(__as_ptr_gmem(__addr)), "l"(__val) : "memory"); + } + else _CCCL_IF_CONSTEXPR (__sem == sem_relaxed && __scope == scope_sys) + { + asm("multimem.red.relaxed.sys.global.add.u64 [%0], %1;" : : "l"(__as_ptr_gmem(__addr)), "l"(__val) : "memory"); + } + else _CCCL_IF_CONSTEXPR (__sem == sem_release && __scope == scope_cta) + { + asm("multimem.red.release.cta.global.add.u64 [%0], %1;" : : "l"(__as_ptr_gmem(__addr)), "l"(__val) : "memory"); + } + else _CCCL_IF_CONSTEXPR (__sem == sem_release && __scope == scope_cluster) + { + asm("multimem.red.release.cluster.global.add.u64 [%0], %1;" : : "l"(__as_ptr_gmem(__addr)), "l"(__val) : "memory"); + } + else _CCCL_IF_CONSTEXPR (__sem == sem_release && __scope == scope_gpu) + { + asm("multimem.red.release.gpu.global.add.u64 [%0], %1;" : : "l"(__as_ptr_gmem(__addr)), "l"(__val) : "memory"); + } + else _CCCL_IF_CONSTEXPR (__sem == sem_release && __scope == scope_sys) + { + asm("multimem.red.release.sys.global.add.u64 [%0], %1;" : : "l"(__as_ptr_gmem(__addr)), "l"(__val) : "memory"); + } +# else + // Unsupported architectures will have a linker error with a semi-decent error message + __cuda_ptx_multimem_red_is_not_supported_before_SM_90__(); +# endif +} +#endif // __cccl_ptx_isa >= 810 + +/* +// multimem.red.sem.scope.global.op.b32 [addr], val; // PTX ISA 81, SM_90 +// .sem = { .relaxed, .release } +// .scope = { .cta, .cluster, .gpu, .sys } +// .op = { .and } +template = true, cuda::ptx::dot_sem Sem, cuda::ptx::dot_scope Scope> +__device__ static inline void multimem_red( + cuda::ptx::sem_t sem, + cuda::ptx::scope_t scope, + cuda::ptx::op_and_op_t, + B32* addr, + B32 val); +*/ +#if __cccl_ptx_isa >= 810 +extern "C" _CCCL_DEVICE void __cuda_ptx_multimem_red_is_not_supported_before_SM_90__(); +template = true, dot_sem _Sem, dot_scope _Scope> +_CCCL_DEVICE static inline void +multimem_red(sem_t<_Sem> __sem, scope_t<_Scope> __scope, op_and_op_t, _B32* __addr, _B32 __val) +{ + static_assert(__sem == sem_relaxed || __sem == sem_release, ""); + static_assert(__scope == scope_cta || __scope == scope_cluster || __scope == scope_gpu || __scope == scope_sys, ""); + // __op == op_and_op (due to parameter type constraint) + static_assert(sizeof(_B32) == 4, ""); +# if _CCCL_CUDA_COMPILER(NVHPC) || __CUDA_ARCH__ >= 900 + _CCCL_IF_CONSTEXPR (__sem == sem_relaxed && __scope == scope_cta) + { + asm("multimem.red.relaxed.cta.global.and.b32 [%0], %1;" + : + : "l"(__as_ptr_gmem(__addr)), "r"(__as_b32(__val)) + : "memory"); + } + else _CCCL_IF_CONSTEXPR (__sem == sem_relaxed && __scope == scope_cluster) + { + asm("multimem.red.relaxed.cluster.global.and.b32 [%0], %1;" + : + : "l"(__as_ptr_gmem(__addr)), "r"(__as_b32(__val)) + : "memory"); + } + else _CCCL_IF_CONSTEXPR (__sem == sem_relaxed && __scope == scope_gpu) + { + asm("multimem.red.relaxed.gpu.global.and.b32 [%0], %1;" + : + : "l"(__as_ptr_gmem(__addr)), "r"(__as_b32(__val)) + : "memory"); + } + else _CCCL_IF_CONSTEXPR (__sem == sem_relaxed && __scope == scope_sys) + { + asm("multimem.red.relaxed.sys.global.and.b32 [%0], %1;" + : + : "l"(__as_ptr_gmem(__addr)), "r"(__as_b32(__val)) + : "memory"); + } + else _CCCL_IF_CONSTEXPR (__sem == sem_release && __scope == scope_cta) + { + asm("multimem.red.release.cta.global.and.b32 [%0], %1;" + : + : "l"(__as_ptr_gmem(__addr)), "r"(__as_b32(__val)) + : "memory"); + } + else _CCCL_IF_CONSTEXPR (__sem == sem_release && __scope == scope_cluster) + { + asm("multimem.red.release.cluster.global.and.b32 [%0], %1;" + : + : 
"l"(__as_ptr_gmem(__addr)), "r"(__as_b32(__val)) + : "memory"); + } + else _CCCL_IF_CONSTEXPR (__sem == sem_release && __scope == scope_gpu) + { + asm("multimem.red.release.gpu.global.and.b32 [%0], %1;" + : + : "l"(__as_ptr_gmem(__addr)), "r"(__as_b32(__val)) + : "memory"); + } + else _CCCL_IF_CONSTEXPR (__sem == sem_release && __scope == scope_sys) + { + asm("multimem.red.release.sys.global.and.b32 [%0], %1;" + : + : "l"(__as_ptr_gmem(__addr)), "r"(__as_b32(__val)) + : "memory"); + } +# else + // Unsupported architectures will have a linker error with a semi-decent error message + __cuda_ptx_multimem_red_is_not_supported_before_SM_90__(); +# endif +} +#endif // __cccl_ptx_isa >= 810 + +/* +// multimem.red.sem.scope.global.op.b32 [addr], val; // PTX ISA 81, SM_90 +// .sem = { .relaxed, .release } +// .scope = { .cta, .cluster, .gpu, .sys } +// .op = { .or } +template = true, cuda::ptx::dot_sem Sem, cuda::ptx::dot_scope Scope> +__device__ static inline void multimem_red( + cuda::ptx::sem_t sem, + cuda::ptx::scope_t scope, + cuda::ptx::op_or_op_t, + B32* addr, + B32 val); +*/ +#if __cccl_ptx_isa >= 810 +extern "C" _CCCL_DEVICE void __cuda_ptx_multimem_red_is_not_supported_before_SM_90__(); +template = true, dot_sem _Sem, dot_scope _Scope> +_CCCL_DEVICE static inline void +multimem_red(sem_t<_Sem> __sem, scope_t<_Scope> __scope, op_or_op_t, _B32* __addr, _B32 __val) +{ + static_assert(__sem == sem_relaxed || __sem == sem_release, ""); + static_assert(__scope == scope_cta || __scope == scope_cluster || __scope == scope_gpu || __scope == scope_sys, ""); + // __op == op_or_op (due to parameter type constraint) + static_assert(sizeof(_B32) == 4, ""); +# if _CCCL_CUDA_COMPILER(NVHPC) || __CUDA_ARCH__ >= 900 + _CCCL_IF_CONSTEXPR (__sem == sem_relaxed && __scope == scope_cta) + { + asm("multimem.red.relaxed.cta.global.or.b32 [%0], %1;" + : + : "l"(__as_ptr_gmem(__addr)), "r"(__as_b32(__val)) + : "memory"); + } + else _CCCL_IF_CONSTEXPR (__sem == sem_relaxed && __scope == scope_cluster) + { + asm("multimem.red.relaxed.cluster.global.or.b32 [%0], %1;" + : + : "l"(__as_ptr_gmem(__addr)), "r"(__as_b32(__val)) + : "memory"); + } + else _CCCL_IF_CONSTEXPR (__sem == sem_relaxed && __scope == scope_gpu) + { + asm("multimem.red.relaxed.gpu.global.or.b32 [%0], %1;" + : + : "l"(__as_ptr_gmem(__addr)), "r"(__as_b32(__val)) + : "memory"); + } + else _CCCL_IF_CONSTEXPR (__sem == sem_relaxed && __scope == scope_sys) + { + asm("multimem.red.relaxed.sys.global.or.b32 [%0], %1;" + : + : "l"(__as_ptr_gmem(__addr)), "r"(__as_b32(__val)) + : "memory"); + } + else _CCCL_IF_CONSTEXPR (__sem == sem_release && __scope == scope_cta) + { + asm("multimem.red.release.cta.global.or.b32 [%0], %1;" + : + : "l"(__as_ptr_gmem(__addr)), "r"(__as_b32(__val)) + : "memory"); + } + else _CCCL_IF_CONSTEXPR (__sem == sem_release && __scope == scope_cluster) + { + asm("multimem.red.release.cluster.global.or.b32 [%0], %1;" + : + : "l"(__as_ptr_gmem(__addr)), "r"(__as_b32(__val)) + : "memory"); + } + else _CCCL_IF_CONSTEXPR (__sem == sem_release && __scope == scope_gpu) + { + asm("multimem.red.release.gpu.global.or.b32 [%0], %1;" + : + : "l"(__as_ptr_gmem(__addr)), "r"(__as_b32(__val)) + : "memory"); + } + else _CCCL_IF_CONSTEXPR (__sem == sem_release && __scope == scope_sys) + { + asm("multimem.red.release.sys.global.or.b32 [%0], %1;" + : + : "l"(__as_ptr_gmem(__addr)), "r"(__as_b32(__val)) + : "memory"); + } +# else + // Unsupported architectures will have a linker error with a semi-decent error message + 
__cuda_ptx_multimem_red_is_not_supported_before_SM_90__(); +# endif +} +#endif // __cccl_ptx_isa >= 810 + +/* +// multimem.red.sem.scope.global.op.b32 [addr], val; // PTX ISA 81, SM_90 +// .sem = { .relaxed, .release } +// .scope = { .cta, .cluster, .gpu, .sys } +// .op = { .xor } +template = true, cuda::ptx::dot_sem Sem, cuda::ptx::dot_scope Scope> +__device__ static inline void multimem_red( + cuda::ptx::sem_t sem, + cuda::ptx::scope_t scope, + cuda::ptx::op_xor_op_t, + B32* addr, + B32 val); +*/ +#if __cccl_ptx_isa >= 810 +extern "C" _CCCL_DEVICE void __cuda_ptx_multimem_red_is_not_supported_before_SM_90__(); +template = true, dot_sem _Sem, dot_scope _Scope> +_CCCL_DEVICE static inline void +multimem_red(sem_t<_Sem> __sem, scope_t<_Scope> __scope, op_xor_op_t, _B32* __addr, _B32 __val) +{ + static_assert(__sem == sem_relaxed || __sem == sem_release, ""); + static_assert(__scope == scope_cta || __scope == scope_cluster || __scope == scope_gpu || __scope == scope_sys, ""); + // __op == op_xor_op (due to parameter type constraint) + static_assert(sizeof(_B32) == 4, ""); +# if _CCCL_CUDA_COMPILER(NVHPC) || __CUDA_ARCH__ >= 900 + _CCCL_IF_CONSTEXPR (__sem == sem_relaxed && __scope == scope_cta) + { + asm("multimem.red.relaxed.cta.global.xor.b32 [%0], %1;" + : + : "l"(__as_ptr_gmem(__addr)), "r"(__as_b32(__val)) + : "memory"); + } + else _CCCL_IF_CONSTEXPR (__sem == sem_relaxed && __scope == scope_cluster) + { + asm("multimem.red.relaxed.cluster.global.xor.b32 [%0], %1;" + : + : "l"(__as_ptr_gmem(__addr)), "r"(__as_b32(__val)) + : "memory"); + } + else _CCCL_IF_CONSTEXPR (__sem == sem_relaxed && __scope == scope_gpu) + { + asm("multimem.red.relaxed.gpu.global.xor.b32 [%0], %1;" + : + : "l"(__as_ptr_gmem(__addr)), "r"(__as_b32(__val)) + : "memory"); + } + else _CCCL_IF_CONSTEXPR (__sem == sem_relaxed && __scope == scope_sys) + { + asm("multimem.red.relaxed.sys.global.xor.b32 [%0], %1;" + : + : "l"(__as_ptr_gmem(__addr)), "r"(__as_b32(__val)) + : "memory"); + } + else _CCCL_IF_CONSTEXPR (__sem == sem_release && __scope == scope_cta) + { + asm("multimem.red.release.cta.global.xor.b32 [%0], %1;" + : + : "l"(__as_ptr_gmem(__addr)), "r"(__as_b32(__val)) + : "memory"); + } + else _CCCL_IF_CONSTEXPR (__sem == sem_release && __scope == scope_cluster) + { + asm("multimem.red.release.cluster.global.xor.b32 [%0], %1;" + : + : "l"(__as_ptr_gmem(__addr)), "r"(__as_b32(__val)) + : "memory"); + } + else _CCCL_IF_CONSTEXPR (__sem == sem_release && __scope == scope_gpu) + { + asm("multimem.red.release.gpu.global.xor.b32 [%0], %1;" + : + : "l"(__as_ptr_gmem(__addr)), "r"(__as_b32(__val)) + : "memory"); + } + else _CCCL_IF_CONSTEXPR (__sem == sem_release && __scope == scope_sys) + { + asm("multimem.red.release.sys.global.xor.b32 [%0], %1;" + : + : "l"(__as_ptr_gmem(__addr)), "r"(__as_b32(__val)) + : "memory"); + } +# else + // Unsupported architectures will have a linker error with a semi-decent error message + __cuda_ptx_multimem_red_is_not_supported_before_SM_90__(); +# endif +} +#endif // __cccl_ptx_isa >= 810 + +/* +// multimem.red.sem.scope.global.op.b64 [addr], val; // PTX ISA 81, SM_90 +// .sem = { .relaxed, .release } +// .scope = { .cta, .cluster, .gpu, .sys } +// .op = { .and } +template = true, cuda::ptx::dot_sem Sem, cuda::ptx::dot_scope Scope> +__device__ static inline void multimem_red( + cuda::ptx::sem_t sem, + cuda::ptx::scope_t scope, + cuda::ptx::op_and_op_t, + B64* addr, + B64 val); +*/ +#if __cccl_ptx_isa >= 810 +extern "C" _CCCL_DEVICE void 
__cuda_ptx_multimem_red_is_not_supported_before_SM_90__(); +template = true, dot_sem _Sem, dot_scope _Scope> +_CCCL_DEVICE static inline void +multimem_red(sem_t<_Sem> __sem, scope_t<_Scope> __scope, op_and_op_t, _B64* __addr, _B64 __val) +{ + static_assert(__sem == sem_relaxed || __sem == sem_release, ""); + static_assert(__scope == scope_cta || __scope == scope_cluster || __scope == scope_gpu || __scope == scope_sys, ""); + // __op == op_and_op (due to parameter type constraint) + static_assert(sizeof(_B64) == 8, ""); +# if _CCCL_CUDA_COMPILER(NVHPC) || __CUDA_ARCH__ >= 900 + _CCCL_IF_CONSTEXPR (__sem == sem_relaxed && __scope == scope_cta) + { + asm("multimem.red.relaxed.cta.global.and.b64 [%0], %1;" + : + : "l"(__as_ptr_gmem(__addr)), "l"(__as_b64(__val)) + : "memory"); + } + else _CCCL_IF_CONSTEXPR (__sem == sem_relaxed && __scope == scope_cluster) + { + asm("multimem.red.relaxed.cluster.global.and.b64 [%0], %1;" + : + : "l"(__as_ptr_gmem(__addr)), "l"(__as_b64(__val)) + : "memory"); + } + else _CCCL_IF_CONSTEXPR (__sem == sem_relaxed && __scope == scope_gpu) + { + asm("multimem.red.relaxed.gpu.global.and.b64 [%0], %1;" + : + : "l"(__as_ptr_gmem(__addr)), "l"(__as_b64(__val)) + : "memory"); + } + else _CCCL_IF_CONSTEXPR (__sem == sem_relaxed && __scope == scope_sys) + { + asm("multimem.red.relaxed.sys.global.and.b64 [%0], %1;" + : + : "l"(__as_ptr_gmem(__addr)), "l"(__as_b64(__val)) + : "memory"); + } + else _CCCL_IF_CONSTEXPR (__sem == sem_release && __scope == scope_cta) + { + asm("multimem.red.release.cta.global.and.b64 [%0], %1;" + : + : "l"(__as_ptr_gmem(__addr)), "l"(__as_b64(__val)) + : "memory"); + } + else _CCCL_IF_CONSTEXPR (__sem == sem_release && __scope == scope_cluster) + { + asm("multimem.red.release.cluster.global.and.b64 [%0], %1;" + : + : "l"(__as_ptr_gmem(__addr)), "l"(__as_b64(__val)) + : "memory"); + } + else _CCCL_IF_CONSTEXPR (__sem == sem_release && __scope == scope_gpu) + { + asm("multimem.red.release.gpu.global.and.b64 [%0], %1;" + : + : "l"(__as_ptr_gmem(__addr)), "l"(__as_b64(__val)) + : "memory"); + } + else _CCCL_IF_CONSTEXPR (__sem == sem_release && __scope == scope_sys) + { + asm("multimem.red.release.sys.global.and.b64 [%0], %1;" + : + : "l"(__as_ptr_gmem(__addr)), "l"(__as_b64(__val)) + : "memory"); + } +# else + // Unsupported architectures will have a linker error with a semi-decent error message + __cuda_ptx_multimem_red_is_not_supported_before_SM_90__(); +# endif +} +#endif // __cccl_ptx_isa >= 810 + +/* +// multimem.red.sem.scope.global.op.b64 [addr], val; // PTX ISA 81, SM_90 +// .sem = { .relaxed, .release } +// .scope = { .cta, .cluster, .gpu, .sys } +// .op = { .or } +template = true, cuda::ptx::dot_sem Sem, cuda::ptx::dot_scope Scope> +__device__ static inline void multimem_red( + cuda::ptx::sem_t sem, + cuda::ptx::scope_t scope, + cuda::ptx::op_or_op_t, + B64* addr, + B64 val); +*/ +#if __cccl_ptx_isa >= 810 +extern "C" _CCCL_DEVICE void __cuda_ptx_multimem_red_is_not_supported_before_SM_90__(); +template = true, dot_sem _Sem, dot_scope _Scope> +_CCCL_DEVICE static inline void +multimem_red(sem_t<_Sem> __sem, scope_t<_Scope> __scope, op_or_op_t, _B64* __addr, _B64 __val) +{ + static_assert(__sem == sem_relaxed || __sem == sem_release, ""); + static_assert(__scope == scope_cta || __scope == scope_cluster || __scope == scope_gpu || __scope == scope_sys, ""); + // __op == op_or_op (due to parameter type constraint) + static_assert(sizeof(_B64) == 8, ""); +# if _CCCL_CUDA_COMPILER(NVHPC) || __CUDA_ARCH__ >= 900 + _CCCL_IF_CONSTEXPR (__sem == 
sem_relaxed && __scope == scope_cta) + { + asm("multimem.red.relaxed.cta.global.or.b64 [%0], %1;" + : + : "l"(__as_ptr_gmem(__addr)), "l"(__as_b64(__val)) + : "memory"); + } + else _CCCL_IF_CONSTEXPR (__sem == sem_relaxed && __scope == scope_cluster) + { + asm("multimem.red.relaxed.cluster.global.or.b64 [%0], %1;" + : + : "l"(__as_ptr_gmem(__addr)), "l"(__as_b64(__val)) + : "memory"); + } + else _CCCL_IF_CONSTEXPR (__sem == sem_relaxed && __scope == scope_gpu) + { + asm("multimem.red.relaxed.gpu.global.or.b64 [%0], %1;" + : + : "l"(__as_ptr_gmem(__addr)), "l"(__as_b64(__val)) + : "memory"); + } + else _CCCL_IF_CONSTEXPR (__sem == sem_relaxed && __scope == scope_sys) + { + asm("multimem.red.relaxed.sys.global.or.b64 [%0], %1;" + : + : "l"(__as_ptr_gmem(__addr)), "l"(__as_b64(__val)) + : "memory"); + } + else _CCCL_IF_CONSTEXPR (__sem == sem_release && __scope == scope_cta) + { + asm("multimem.red.release.cta.global.or.b64 [%0], %1;" + : + : "l"(__as_ptr_gmem(__addr)), "l"(__as_b64(__val)) + : "memory"); + } + else _CCCL_IF_CONSTEXPR (__sem == sem_release && __scope == scope_cluster) + { + asm("multimem.red.release.cluster.global.or.b64 [%0], %1;" + : + : "l"(__as_ptr_gmem(__addr)), "l"(__as_b64(__val)) + : "memory"); + } + else _CCCL_IF_CONSTEXPR (__sem == sem_release && __scope == scope_gpu) + { + asm("multimem.red.release.gpu.global.or.b64 [%0], %1;" + : + : "l"(__as_ptr_gmem(__addr)), "l"(__as_b64(__val)) + : "memory"); + } + else _CCCL_IF_CONSTEXPR (__sem == sem_release && __scope == scope_sys) + { + asm("multimem.red.release.sys.global.or.b64 [%0], %1;" + : + : "l"(__as_ptr_gmem(__addr)), "l"(__as_b64(__val)) + : "memory"); + } +# else + // Unsupported architectures will have a linker error with a semi-decent error message + __cuda_ptx_multimem_red_is_not_supported_before_SM_90__(); +# endif +} +#endif // __cccl_ptx_isa >= 810 + +/* +// multimem.red.sem.scope.global.op.b64 [addr], val; // PTX ISA 81, SM_90 +// .sem = { .relaxed, .release } +// .scope = { .cta, .cluster, .gpu, .sys } +// .op = { .xor } +template = true, cuda::ptx::dot_sem Sem, cuda::ptx::dot_scope Scope> +__device__ static inline void multimem_red( + cuda::ptx::sem_t sem, + cuda::ptx::scope_t scope, + cuda::ptx::op_xor_op_t, + B64* addr, + B64 val); +*/ +#if __cccl_ptx_isa >= 810 +extern "C" _CCCL_DEVICE void __cuda_ptx_multimem_red_is_not_supported_before_SM_90__(); +template = true, dot_sem _Sem, dot_scope _Scope> +_CCCL_DEVICE static inline void +multimem_red(sem_t<_Sem> __sem, scope_t<_Scope> __scope, op_xor_op_t, _B64* __addr, _B64 __val) +{ + static_assert(__sem == sem_relaxed || __sem == sem_release, ""); + static_assert(__scope == scope_cta || __scope == scope_cluster || __scope == scope_gpu || __scope == scope_sys, ""); + // __op == op_xor_op (due to parameter type constraint) + static_assert(sizeof(_B64) == 8, ""); +# if _CCCL_CUDA_COMPILER(NVHPC) || __CUDA_ARCH__ >= 900 + _CCCL_IF_CONSTEXPR (__sem == sem_relaxed && __scope == scope_cta) + { + asm("multimem.red.relaxed.cta.global.xor.b64 [%0], %1;" + : + : "l"(__as_ptr_gmem(__addr)), "l"(__as_b64(__val)) + : "memory"); + } + else _CCCL_IF_CONSTEXPR (__sem == sem_relaxed && __scope == scope_cluster) + { + asm("multimem.red.relaxed.cluster.global.xor.b64 [%0], %1;" + : + : "l"(__as_ptr_gmem(__addr)), "l"(__as_b64(__val)) + : "memory"); + } + else _CCCL_IF_CONSTEXPR (__sem == sem_relaxed && __scope == scope_gpu) + { + asm("multimem.red.relaxed.gpu.global.xor.b64 [%0], %1;" + : + : "l"(__as_ptr_gmem(__addr)), "l"(__as_b64(__val)) + : "memory"); + } + else 
_CCCL_IF_CONSTEXPR (__sem == sem_relaxed && __scope == scope_sys) + { + asm("multimem.red.relaxed.sys.global.xor.b64 [%0], %1;" + : + : "l"(__as_ptr_gmem(__addr)), "l"(__as_b64(__val)) + : "memory"); + } + else _CCCL_IF_CONSTEXPR (__sem == sem_release && __scope == scope_cta) + { + asm("multimem.red.release.cta.global.xor.b64 [%0], %1;" + : + : "l"(__as_ptr_gmem(__addr)), "l"(__as_b64(__val)) + : "memory"); + } + else _CCCL_IF_CONSTEXPR (__sem == sem_release && __scope == scope_cluster) + { + asm("multimem.red.release.cluster.global.xor.b64 [%0], %1;" + : + : "l"(__as_ptr_gmem(__addr)), "l"(__as_b64(__val)) + : "memory"); + } + else _CCCL_IF_CONSTEXPR (__sem == sem_release && __scope == scope_gpu) + { + asm("multimem.red.release.gpu.global.xor.b64 [%0], %1;" + : + : "l"(__as_ptr_gmem(__addr)), "l"(__as_b64(__val)) + : "memory"); + } + else _CCCL_IF_CONSTEXPR (__sem == sem_release && __scope == scope_sys) + { + asm("multimem.red.release.sys.global.xor.b64 [%0], %1;" + : + : "l"(__as_ptr_gmem(__addr)), "l"(__as_b64(__val)) + : "memory"); + } +# else + // Unsupported architectures will have a linker error with a semi-decent error message + __cuda_ptx_multimem_red_is_not_supported_before_SM_90__(); +# endif +} +#endif // __cccl_ptx_isa >= 810 + +#endif // _CUDA_PTX_GENERATED_MULTIMEM_RED_H_ diff --git a/libcudacxx/include/cuda/__ptx/instructions/generated/multimem_st.h b/libcudacxx/include/cuda/__ptx/instructions/generated/multimem_st.h new file mode 100644 index 00000000000..91319874243 --- /dev/null +++ b/libcudacxx/include/cuda/__ptx/instructions/generated/multimem_st.h @@ -0,0 +1,186 @@ +// This file was automatically generated. Do not edit. + +#ifndef _CUDA_PTX_GENERATED_MULTIMEM_ST_H_ +#define _CUDA_PTX_GENERATED_MULTIMEM_ST_H_ + +/* +// multimem.st.sem.global.b32 [addr], val; // PTX ISA 81, SM_90 +// .sem = { .weak } +template = true> +__device__ static inline void multimem_st( + cuda::ptx::sem_weak_t, + B32* addr, + B32 val); +*/ +#if __cccl_ptx_isa >= 810 +extern "C" _CCCL_DEVICE void __cuda_ptx_multimem_st_is_not_supported_before_SM_90__(); +template = true> +_CCCL_DEVICE static inline void multimem_st(sem_weak_t, _B32* __addr, _B32 __val) +{ + // __sem == sem_weak (due to parameter type constraint) + static_assert(sizeof(_B32) == 4, ""); +# if _CCCL_CUDA_COMPILER(NVHPC) || __CUDA_ARCH__ >= 900 + asm("multimem.st.weak.global.b32 [%0], %1;" : : "l"(__as_ptr_gmem(__addr)), "r"(__as_b32(__val)) : "memory"); +# else + // Unsupported architectures will have a linker error with a semi-decent error message + __cuda_ptx_multimem_st_is_not_supported_before_SM_90__(); +# endif +} +#endif // __cccl_ptx_isa >= 810 + +/* +// multimem.st.sem.scope.global.b32 [addr], val; // PTX ISA 81, SM_90 +// .sem = { .relaxed, .release } +// .scope = { .cta, .cluster, .gpu, .sys } +template = true, cuda::ptx::dot_sem Sem, cuda::ptx::dot_scope Scope> +__device__ static inline void multimem_st( + cuda::ptx::sem_t sem, + cuda::ptx::scope_t scope, + B32* addr, + B32 val); +*/ +#if __cccl_ptx_isa >= 810 +extern "C" _CCCL_DEVICE void __cuda_ptx_multimem_st_is_not_supported_before_SM_90__(); +template = true, dot_sem _Sem, dot_scope _Scope> +_CCCL_DEVICE static inline void multimem_st(sem_t<_Sem> __sem, scope_t<_Scope> __scope, _B32* __addr, _B32 __val) +{ + static_assert(__sem == sem_relaxed || __sem == sem_release, ""); + static_assert(__scope == scope_cta || __scope == scope_cluster || __scope == scope_gpu || __scope == scope_sys, ""); + static_assert(sizeof(_B32) == 4, ""); +# if _CCCL_CUDA_COMPILER(NVHPC) || 
__CUDA_ARCH__ >= 900 + _CCCL_IF_CONSTEXPR (__sem == sem_relaxed && __scope == scope_cta) + { + asm("multimem.st.relaxed.cta.global.b32 [%0], %1;" : : "l"(__as_ptr_gmem(__addr)), "r"(__as_b32(__val)) : "memory"); + } + else _CCCL_IF_CONSTEXPR (__sem == sem_relaxed && __scope == scope_cluster) + { + asm("multimem.st.relaxed.cluster.global.b32 [%0], %1;" + : + : "l"(__as_ptr_gmem(__addr)), "r"(__as_b32(__val)) + : "memory"); + } + else _CCCL_IF_CONSTEXPR (__sem == sem_relaxed && __scope == scope_gpu) + { + asm("multimem.st.relaxed.gpu.global.b32 [%0], %1;" : : "l"(__as_ptr_gmem(__addr)), "r"(__as_b32(__val)) : "memory"); + } + else _CCCL_IF_CONSTEXPR (__sem == sem_relaxed && __scope == scope_sys) + { + asm("multimem.st.relaxed.sys.global.b32 [%0], %1;" : : "l"(__as_ptr_gmem(__addr)), "r"(__as_b32(__val)) : "memory"); + } + else _CCCL_IF_CONSTEXPR (__sem == sem_release && __scope == scope_cta) + { + asm("multimem.st.release.cta.global.b32 [%0], %1;" : : "l"(__as_ptr_gmem(__addr)), "r"(__as_b32(__val)) : "memory"); + } + else _CCCL_IF_CONSTEXPR (__sem == sem_release && __scope == scope_cluster) + { + asm("multimem.st.release.cluster.global.b32 [%0], %1;" + : + : "l"(__as_ptr_gmem(__addr)), "r"(__as_b32(__val)) + : "memory"); + } + else _CCCL_IF_CONSTEXPR (__sem == sem_release && __scope == scope_gpu) + { + asm("multimem.st.release.gpu.global.b32 [%0], %1;" : : "l"(__as_ptr_gmem(__addr)), "r"(__as_b32(__val)) : "memory"); + } + else _CCCL_IF_CONSTEXPR (__sem == sem_release && __scope == scope_sys) + { + asm("multimem.st.release.sys.global.b32 [%0], %1;" : : "l"(__as_ptr_gmem(__addr)), "r"(__as_b32(__val)) : "memory"); + } +# else + // Unsupported architectures will have a linker error with a semi-decent error message + __cuda_ptx_multimem_st_is_not_supported_before_SM_90__(); +# endif +} +#endif // __cccl_ptx_isa >= 810 + +/* +// multimem.st.sem.global.b64 [addr], val; // PTX ISA 81, SM_90 +// .sem = { .weak } +template = true> +__device__ static inline void multimem_st( + cuda::ptx::sem_weak_t, + B64* addr, + B64 val); +*/ +#if __cccl_ptx_isa >= 810 +extern "C" _CCCL_DEVICE void __cuda_ptx_multimem_st_is_not_supported_before_SM_90__(); +template = true> +_CCCL_DEVICE static inline void multimem_st(sem_weak_t, _B64* __addr, _B64 __val) +{ + // __sem == sem_weak (due to parameter type constraint) + static_assert(sizeof(_B64) == 8, ""); +# if _CCCL_CUDA_COMPILER(NVHPC) || __CUDA_ARCH__ >= 900 + asm("multimem.st.weak.global.b64 [%0], %1;" : : "l"(__as_ptr_gmem(__addr)), "l"(__as_b64(__val)) : "memory"); +# else + // Unsupported architectures will have a linker error with a semi-decent error message + __cuda_ptx_multimem_st_is_not_supported_before_SM_90__(); +# endif +} +#endif // __cccl_ptx_isa >= 810 + +/* +// multimem.st.sem.scope.global.b64 [addr], val; // PTX ISA 81, SM_90 +// .sem = { .relaxed, .release } +// .scope = { .cta, .cluster, .gpu, .sys } +template = true, cuda::ptx::dot_sem Sem, cuda::ptx::dot_scope Scope> +__device__ static inline void multimem_st( + cuda::ptx::sem_t sem, + cuda::ptx::scope_t scope, + B64* addr, + B64 val); +*/ +#if __cccl_ptx_isa >= 810 +extern "C" _CCCL_DEVICE void __cuda_ptx_multimem_st_is_not_supported_before_SM_90__(); +template = true, dot_sem _Sem, dot_scope _Scope> +_CCCL_DEVICE static inline void multimem_st(sem_t<_Sem> __sem, scope_t<_Scope> __scope, _B64* __addr, _B64 __val) +{ + static_assert(__sem == sem_relaxed || __sem == sem_release, ""); + static_assert(__scope == scope_cta || __scope == scope_cluster || __scope == scope_gpu || __scope == 
scope_sys, ""); + static_assert(sizeof(_B64) == 8, ""); +# if _CCCL_CUDA_COMPILER(NVHPC) || __CUDA_ARCH__ >= 900 + _CCCL_IF_CONSTEXPR (__sem == sem_relaxed && __scope == scope_cta) + { + asm("multimem.st.relaxed.cta.global.b64 [%0], %1;" : : "l"(__as_ptr_gmem(__addr)), "l"(__as_b64(__val)) : "memory"); + } + else _CCCL_IF_CONSTEXPR (__sem == sem_relaxed && __scope == scope_cluster) + { + asm("multimem.st.relaxed.cluster.global.b64 [%0], %1;" + : + : "l"(__as_ptr_gmem(__addr)), "l"(__as_b64(__val)) + : "memory"); + } + else _CCCL_IF_CONSTEXPR (__sem == sem_relaxed && __scope == scope_gpu) + { + asm("multimem.st.relaxed.gpu.global.b64 [%0], %1;" : : "l"(__as_ptr_gmem(__addr)), "l"(__as_b64(__val)) : "memory"); + } + else _CCCL_IF_CONSTEXPR (__sem == sem_relaxed && __scope == scope_sys) + { + asm("multimem.st.relaxed.sys.global.b64 [%0], %1;" : : "l"(__as_ptr_gmem(__addr)), "l"(__as_b64(__val)) : "memory"); + } + else _CCCL_IF_CONSTEXPR (__sem == sem_release && __scope == scope_cta) + { + asm("multimem.st.release.cta.global.b64 [%0], %1;" : : "l"(__as_ptr_gmem(__addr)), "l"(__as_b64(__val)) : "memory"); + } + else _CCCL_IF_CONSTEXPR (__sem == sem_release && __scope == scope_cluster) + { + asm("multimem.st.release.cluster.global.b64 [%0], %1;" + : + : "l"(__as_ptr_gmem(__addr)), "l"(__as_b64(__val)) + : "memory"); + } + else _CCCL_IF_CONSTEXPR (__sem == sem_release && __scope == scope_gpu) + { + asm("multimem.st.release.gpu.global.b64 [%0], %1;" : : "l"(__as_ptr_gmem(__addr)), "l"(__as_b64(__val)) : "memory"); + } + else _CCCL_IF_CONSTEXPR (__sem == sem_release && __scope == scope_sys) + { + asm("multimem.st.release.sys.global.b64 [%0], %1;" : : "l"(__as_ptr_gmem(__addr)), "l"(__as_b64(__val)) : "memory"); + } +# else + // Unsupported architectures will have a linker error with a semi-decent error message + __cuda_ptx_multimem_st_is_not_supported_before_SM_90__(); +# endif +} +#endif // __cccl_ptx_isa >= 810 + +#endif // _CUDA_PTX_GENERATED_MULTIMEM_ST_H_ diff --git a/libcudacxx/include/cuda/__ptx/instructions/generated/red_async.h b/libcudacxx/include/cuda/__ptx/instructions/generated/red_async.h index d88392f3635..767411d4719 100644 --- a/libcudacxx/include/cuda/__ptx/instructions/generated/red_async.h +++ b/libcudacxx/include/cuda/__ptx/instructions/generated/red_async.h @@ -4,8 +4,8 @@ #define _CUDA_PTX_GENERATED_RED_ASYNC_H_ /* -// red.async.relaxed.cluster.shared::cluster.mbarrier::complete_tx::bytes{.op}{.type} [dest], value, [remote_bar]; // -PTX ISA 81, SM_90 +// red.async.relaxed.cluster.shared::cluster.mbarrier::complete_tx::bytes.op.type [dest], value, [remote_bar]; // PTX +ISA 81, SM_90 // .type = { .u32 } // .op = { .inc } template @@ -21,23 +21,23 @@ template _CCCL_DEVICE static inline void red_async( op_inc_t, _CUDA_VSTD::uint32_t* __dest, const _CUDA_VSTD::uint32_t& __value, _CUDA_VSTD::uint64_t* __remote_bar) { - // __type == type_u32 (due to parameter type constraint) - // __op == op_inc (due to parameter type constraint) - NV_IF_ELSE_TARGET( - NV_PROVIDES_SM_90, - (asm("red.async.relaxed.cluster.shared::cluster.mbarrier::complete_tx::bytes.inc.u32 [%0], %1, [%2]; " - : - : "r"(__as_ptr_remote_dsmem(__dest)), "r"(__value), "r"(__as_ptr_remote_dsmem(__remote_bar)) - : "memory");), - ( - // Unsupported architectures will have a linker error with a semi-decent error message - __cuda_ptx_red_async_is_not_supported_before_SM_90__();)); +// __type == type_u32 (due to parameter type constraint) +// __op == op_inc (due to parameter type constraint) +# if _CCCL_CUDA_COMPILER(NVHPC) 
|| __CUDA_ARCH__ >= 900 + asm("red.async.relaxed.cluster.shared::cluster.mbarrier::complete_tx::bytes.inc.u32 [%0], %1, [%2]; " + : + : "r"(__as_ptr_remote_dsmem(__dest)), "r"(__value), "r"(__as_ptr_remote_dsmem(__remote_bar)) + : "memory"); +# else + // Unsupported architectures will have a linker error with a semi-decent error message + __cuda_ptx_red_async_is_not_supported_before_SM_90__(); +# endif } #endif // __cccl_ptx_isa >= 810 /* -// red.async.relaxed.cluster.shared::cluster.mbarrier::complete_tx::bytes{.op}{.type} [dest], value, [remote_bar]; // -PTX ISA 81, SM_90 +// red.async.relaxed.cluster.shared::cluster.mbarrier::complete_tx::bytes.op.type [dest], value, [remote_bar]; // PTX +ISA 81, SM_90 // .type = { .u32 } // .op = { .dec } template @@ -53,23 +53,23 @@ template _CCCL_DEVICE static inline void red_async( op_dec_t, _CUDA_VSTD::uint32_t* __dest, const _CUDA_VSTD::uint32_t& __value, _CUDA_VSTD::uint64_t* __remote_bar) { - // __type == type_u32 (due to parameter type constraint) - // __op == op_dec (due to parameter type constraint) - NV_IF_ELSE_TARGET( - NV_PROVIDES_SM_90, - (asm("red.async.relaxed.cluster.shared::cluster.mbarrier::complete_tx::bytes.dec.u32 [%0], %1, [%2]; " - : - : "r"(__as_ptr_remote_dsmem(__dest)), "r"(__value), "r"(__as_ptr_remote_dsmem(__remote_bar)) - : "memory");), - ( - // Unsupported architectures will have a linker error with a semi-decent error message - __cuda_ptx_red_async_is_not_supported_before_SM_90__();)); +// __type == type_u32 (due to parameter type constraint) +// __op == op_dec (due to parameter type constraint) +# if _CCCL_CUDA_COMPILER(NVHPC) || __CUDA_ARCH__ >= 900 + asm("red.async.relaxed.cluster.shared::cluster.mbarrier::complete_tx::bytes.dec.u32 [%0], %1, [%2]; " + : + : "r"(__as_ptr_remote_dsmem(__dest)), "r"(__value), "r"(__as_ptr_remote_dsmem(__remote_bar)) + : "memory"); +# else + // Unsupported architectures will have a linker error with a semi-decent error message + __cuda_ptx_red_async_is_not_supported_before_SM_90__(); +# endif } #endif // __cccl_ptx_isa >= 810 /* -// red.async.relaxed.cluster.shared::cluster.mbarrier::complete_tx::bytes{.op}{.type} [dest], value, [remote_bar]; // -PTX ISA 81, SM_90 +// red.async.relaxed.cluster.shared::cluster.mbarrier::complete_tx::bytes.op.type [dest], value, [remote_bar]; // PTX +ISA 81, SM_90 // .type = { .u32 } // .op = { .min } template @@ -85,23 +85,23 @@ template _CCCL_DEVICE static inline void red_async( op_min_t, _CUDA_VSTD::uint32_t* __dest, const _CUDA_VSTD::uint32_t& __value, _CUDA_VSTD::uint64_t* __remote_bar) { - // __type == type_u32 (due to parameter type constraint) - // __op == op_min (due to parameter type constraint) - NV_IF_ELSE_TARGET( - NV_PROVIDES_SM_90, - (asm("red.async.relaxed.cluster.shared::cluster.mbarrier::complete_tx::bytes.min.u32 [%0], %1, [%2]; " - : - : "r"(__as_ptr_remote_dsmem(__dest)), "r"(__value), "r"(__as_ptr_remote_dsmem(__remote_bar)) - : "memory");), - ( - // Unsupported architectures will have a linker error with a semi-decent error message - __cuda_ptx_red_async_is_not_supported_before_SM_90__();)); +// __type == type_u32 (due to parameter type constraint) +// __op == op_min (due to parameter type constraint) +# if _CCCL_CUDA_COMPILER(NVHPC) || __CUDA_ARCH__ >= 900 + asm("red.async.relaxed.cluster.shared::cluster.mbarrier::complete_tx::bytes.min.u32 [%0], %1, [%2]; " + : + : "r"(__as_ptr_remote_dsmem(__dest)), "r"(__value), "r"(__as_ptr_remote_dsmem(__remote_bar)) + : "memory"); +# else + // Unsupported architectures will have a linker 
error with a semi-decent error message + __cuda_ptx_red_async_is_not_supported_before_SM_90__(); +# endif } #endif // __cccl_ptx_isa >= 810 /* -// red.async.relaxed.cluster.shared::cluster.mbarrier::complete_tx::bytes{.op}{.type} [dest], value, [remote_bar]; // -PTX ISA 81, SM_90 +// red.async.relaxed.cluster.shared::cluster.mbarrier::complete_tx::bytes.op.type [dest], value, [remote_bar]; // PTX +ISA 81, SM_90 // .type = { .u32 } // .op = { .max } template @@ -117,23 +117,23 @@ template _CCCL_DEVICE static inline void red_async( op_max_t, _CUDA_VSTD::uint32_t* __dest, const _CUDA_VSTD::uint32_t& __value, _CUDA_VSTD::uint64_t* __remote_bar) { - // __type == type_u32 (due to parameter type constraint) - // __op == op_max (due to parameter type constraint) - NV_IF_ELSE_TARGET( - NV_PROVIDES_SM_90, - (asm("red.async.relaxed.cluster.shared::cluster.mbarrier::complete_tx::bytes.max.u32 [%0], %1, [%2]; " - : - : "r"(__as_ptr_remote_dsmem(__dest)), "r"(__value), "r"(__as_ptr_remote_dsmem(__remote_bar)) - : "memory");), - ( - // Unsupported architectures will have a linker error with a semi-decent error message - __cuda_ptx_red_async_is_not_supported_before_SM_90__();)); +// __type == type_u32 (due to parameter type constraint) +// __op == op_max (due to parameter type constraint) +# if _CCCL_CUDA_COMPILER(NVHPC) || __CUDA_ARCH__ >= 900 + asm("red.async.relaxed.cluster.shared::cluster.mbarrier::complete_tx::bytes.max.u32 [%0], %1, [%2]; " + : + : "r"(__as_ptr_remote_dsmem(__dest)), "r"(__value), "r"(__as_ptr_remote_dsmem(__remote_bar)) + : "memory"); +# else + // Unsupported architectures will have a linker error with a semi-decent error message + __cuda_ptx_red_async_is_not_supported_before_SM_90__(); +# endif } #endif // __cccl_ptx_isa >= 810 /* -// red.async.relaxed.cluster.shared::cluster.mbarrier::complete_tx::bytes{.op}{.type} [dest], value, [remote_bar]; // -PTX ISA 81, SM_90 +// red.async.relaxed.cluster.shared::cluster.mbarrier::complete_tx::bytes.op.type [dest], value, [remote_bar]; // PTX +ISA 81, SM_90 // .type = { .u32 } // .op = { .add } template @@ -149,23 +149,23 @@ template _CCCL_DEVICE static inline void red_async( op_add_t, _CUDA_VSTD::uint32_t* __dest, const _CUDA_VSTD::uint32_t& __value, _CUDA_VSTD::uint64_t* __remote_bar) { - // __type == type_u32 (due to parameter type constraint) - // __op == op_add (due to parameter type constraint) - NV_IF_ELSE_TARGET( - NV_PROVIDES_SM_90, - (asm("red.async.relaxed.cluster.shared::cluster.mbarrier::complete_tx::bytes.add.u32 [%0], %1, [%2]; " - : - : "r"(__as_ptr_remote_dsmem(__dest)), "r"(__value), "r"(__as_ptr_remote_dsmem(__remote_bar)) - : "memory");), - ( - // Unsupported architectures will have a linker error with a semi-decent error message - __cuda_ptx_red_async_is_not_supported_before_SM_90__();)); +// __type == type_u32 (due to parameter type constraint) +// __op == op_add (due to parameter type constraint) +# if _CCCL_CUDA_COMPILER(NVHPC) || __CUDA_ARCH__ >= 900 + asm("red.async.relaxed.cluster.shared::cluster.mbarrier::complete_tx::bytes.add.u32 [%0], %1, [%2]; " + : + : "r"(__as_ptr_remote_dsmem(__dest)), "r"(__value), "r"(__as_ptr_remote_dsmem(__remote_bar)) + : "memory"); +# else + // Unsupported architectures will have a linker error with a semi-decent error message + __cuda_ptx_red_async_is_not_supported_before_SM_90__(); +# endif } #endif // __cccl_ptx_isa >= 810 /* -// red.async.relaxed.cluster.shared::cluster.mbarrier::complete_tx::bytes{.op}{.type} [dest], value, [remote_bar]; // -PTX ISA 81, SM_90 +// 
red.async.relaxed.cluster.shared::cluster.mbarrier::complete_tx::bytes.op.type [dest], value, [remote_bar]; // PTX +ISA 81, SM_90 // .type = { .s32 } // .op = { .min } template @@ -181,23 +181,23 @@ template _CCCL_DEVICE static inline void red_async(op_min_t, _CUDA_VSTD::int32_t* __dest, const _CUDA_VSTD::int32_t& __value, _CUDA_VSTD::uint64_t* __remote_bar) { - // __type == type_s32 (due to parameter type constraint) - // __op == op_min (due to parameter type constraint) - NV_IF_ELSE_TARGET( - NV_PROVIDES_SM_90, - (asm("red.async.relaxed.cluster.shared::cluster.mbarrier::complete_tx::bytes.min.s32 [%0], %1, [%2]; " - : - : "r"(__as_ptr_remote_dsmem(__dest)), "r"(__value), "r"(__as_ptr_remote_dsmem(__remote_bar)) - : "memory");), - ( - // Unsupported architectures will have a linker error with a semi-decent error message - __cuda_ptx_red_async_is_not_supported_before_SM_90__();)); +// __type == type_s32 (due to parameter type constraint) +// __op == op_min (due to parameter type constraint) +# if _CCCL_CUDA_COMPILER(NVHPC) || __CUDA_ARCH__ >= 900 + asm("red.async.relaxed.cluster.shared::cluster.mbarrier::complete_tx::bytes.min.s32 [%0], %1, [%2]; " + : + : "r"(__as_ptr_remote_dsmem(__dest)), "r"(__value), "r"(__as_ptr_remote_dsmem(__remote_bar)) + : "memory"); +# else + // Unsupported architectures will have a linker error with a semi-decent error message + __cuda_ptx_red_async_is_not_supported_before_SM_90__(); +# endif } #endif // __cccl_ptx_isa >= 810 /* -// red.async.relaxed.cluster.shared::cluster.mbarrier::complete_tx::bytes{.op}{.type} [dest], value, [remote_bar]; // -PTX ISA 81, SM_90 +// red.async.relaxed.cluster.shared::cluster.mbarrier::complete_tx::bytes.op.type [dest], value, [remote_bar]; // PTX +ISA 81, SM_90 // .type = { .s32 } // .op = { .max } template @@ -213,23 +213,23 @@ template _CCCL_DEVICE static inline void red_async(op_max_t, _CUDA_VSTD::int32_t* __dest, const _CUDA_VSTD::int32_t& __value, _CUDA_VSTD::uint64_t* __remote_bar) { - // __type == type_s32 (due to parameter type constraint) - // __op == op_max (due to parameter type constraint) - NV_IF_ELSE_TARGET( - NV_PROVIDES_SM_90, - (asm("red.async.relaxed.cluster.shared::cluster.mbarrier::complete_tx::bytes.max.s32 [%0], %1, [%2]; " - : - : "r"(__as_ptr_remote_dsmem(__dest)), "r"(__value), "r"(__as_ptr_remote_dsmem(__remote_bar)) - : "memory");), - ( - // Unsupported architectures will have a linker error with a semi-decent error message - __cuda_ptx_red_async_is_not_supported_before_SM_90__();)); +// __type == type_s32 (due to parameter type constraint) +// __op == op_max (due to parameter type constraint) +# if _CCCL_CUDA_COMPILER(NVHPC) || __CUDA_ARCH__ >= 900 + asm("red.async.relaxed.cluster.shared::cluster.mbarrier::complete_tx::bytes.max.s32 [%0], %1, [%2]; " + : + : "r"(__as_ptr_remote_dsmem(__dest)), "r"(__value), "r"(__as_ptr_remote_dsmem(__remote_bar)) + : "memory"); +# else + // Unsupported architectures will have a linker error with a semi-decent error message + __cuda_ptx_red_async_is_not_supported_before_SM_90__(); +# endif } #endif // __cccl_ptx_isa >= 810 /* -// red.async.relaxed.cluster.shared::cluster.mbarrier::complete_tx::bytes{.op}{.type} [dest], value, [remote_bar]; // -PTX ISA 81, SM_90 +// red.async.relaxed.cluster.shared::cluster.mbarrier::complete_tx::bytes.op.type [dest], value, [remote_bar]; // PTX +ISA 81, SM_90 // .type = { .s32 } // .op = { .add } template @@ -245,26 +245,26 @@ template _CCCL_DEVICE static inline void red_async(op_add_t, _CUDA_VSTD::int32_t* __dest, const 
_CUDA_VSTD::int32_t& __value, _CUDA_VSTD::uint64_t* __remote_bar) { - // __type == type_s32 (due to parameter type constraint) - // __op == op_add (due to parameter type constraint) - NV_IF_ELSE_TARGET( - NV_PROVIDES_SM_90, - (asm("red.async.relaxed.cluster.shared::cluster.mbarrier::complete_tx::bytes.add.s32 [%0], %1, [%2]; " - : - : "r"(__as_ptr_remote_dsmem(__dest)), "r"(__value), "r"(__as_ptr_remote_dsmem(__remote_bar)) - : "memory");), - ( - // Unsupported architectures will have a linker error with a semi-decent error message - __cuda_ptx_red_async_is_not_supported_before_SM_90__();)); +// __type == type_s32 (due to parameter type constraint) +// __op == op_add (due to parameter type constraint) +# if _CCCL_CUDA_COMPILER(NVHPC) || __CUDA_ARCH__ >= 900 + asm("red.async.relaxed.cluster.shared::cluster.mbarrier::complete_tx::bytes.add.s32 [%0], %1, [%2]; " + : + : "r"(__as_ptr_remote_dsmem(__dest)), "r"(__value), "r"(__as_ptr_remote_dsmem(__remote_bar)) + : "memory"); +# else + // Unsupported architectures will have a linker error with a semi-decent error message + __cuda_ptx_red_async_is_not_supported_before_SM_90__(); +# endif } #endif // __cccl_ptx_isa >= 810 /* -// red.async.relaxed.cluster.shared::cluster.mbarrier::complete_tx::bytes{.op}{.type} [dest], value, [remote_bar]; // -PTX ISA 81, SM_90 +// red.async.relaxed.cluster.shared::cluster.mbarrier::complete_tx::bytes.op.type [dest], value, [remote_bar]; // PTX +ISA 81, SM_90 // .type = { .b32 } // .op = { .and } -template +template = true> __device__ static inline void red_async( cuda::ptx::op_and_op_t, B32* dest, @@ -273,31 +273,31 @@ __device__ static inline void red_async( */ #if __cccl_ptx_isa >= 810 extern "C" _CCCL_DEVICE void __cuda_ptx_red_async_is_not_supported_before_SM_90__(); -template +template = true> _CCCL_DEVICE static inline void red_async(op_and_op_t, _B32* __dest, const _B32& __value, _CUDA_VSTD::uint64_t* __remote_bar) { // __type == type_b32 (due to parameter type constraint) // __op == op_and_op (due to parameter type constraint) static_assert(sizeof(_B32) == 4, ""); - NV_IF_ELSE_TARGET( - NV_PROVIDES_SM_90, - (asm("red.async.relaxed.cluster.shared::cluster.mbarrier::complete_tx::bytes.and.b32 [%0], %1, [%2]; " - : - : "r"(__as_ptr_remote_dsmem(__dest)), "r"(__as_b32(__value)), "r"(__as_ptr_remote_dsmem(__remote_bar)) - : "memory");), - ( - // Unsupported architectures will have a linker error with a semi-decent error message - __cuda_ptx_red_async_is_not_supported_before_SM_90__();)); +# if _CCCL_CUDA_COMPILER(NVHPC) || __CUDA_ARCH__ >= 900 + asm("red.async.relaxed.cluster.shared::cluster.mbarrier::complete_tx::bytes.and.b32 [%0], %1, [%2]; " + : + : "r"(__as_ptr_remote_dsmem(__dest)), "r"(__as_b32(__value)), "r"(__as_ptr_remote_dsmem(__remote_bar)) + : "memory"); +# else + // Unsupported architectures will have a linker error with a semi-decent error message + __cuda_ptx_red_async_is_not_supported_before_SM_90__(); +# endif } #endif // __cccl_ptx_isa >= 810 /* -// red.async.relaxed.cluster.shared::cluster.mbarrier::complete_tx::bytes{.op}{.type} [dest], value, [remote_bar]; // -PTX ISA 81, SM_90 +// red.async.relaxed.cluster.shared::cluster.mbarrier::complete_tx::bytes.op.type [dest], value, [remote_bar]; // PTX +ISA 81, SM_90 // .type = { .b32 } // .op = { .or } -template +template = true> __device__ static inline void red_async( cuda::ptx::op_or_op_t, B32* dest, @@ -306,31 +306,31 @@ __device__ static inline void red_async( */ #if __cccl_ptx_isa >= 810 extern "C" _CCCL_DEVICE void 
__cuda_ptx_red_async_is_not_supported_before_SM_90__(); -template +template = true> _CCCL_DEVICE static inline void red_async(op_or_op_t, _B32* __dest, const _B32& __value, _CUDA_VSTD::uint64_t* __remote_bar) { // __type == type_b32 (due to parameter type constraint) // __op == op_or_op (due to parameter type constraint) static_assert(sizeof(_B32) == 4, ""); - NV_IF_ELSE_TARGET( - NV_PROVIDES_SM_90, - (asm("red.async.relaxed.cluster.shared::cluster.mbarrier::complete_tx::bytes.or.b32 [%0], %1, [%2]; " - : - : "r"(__as_ptr_remote_dsmem(__dest)), "r"(__as_b32(__value)), "r"(__as_ptr_remote_dsmem(__remote_bar)) - : "memory");), - ( - // Unsupported architectures will have a linker error with a semi-decent error message - __cuda_ptx_red_async_is_not_supported_before_SM_90__();)); +# if _CCCL_CUDA_COMPILER(NVHPC) || __CUDA_ARCH__ >= 900 + asm("red.async.relaxed.cluster.shared::cluster.mbarrier::complete_tx::bytes.or.b32 [%0], %1, [%2]; " + : + : "r"(__as_ptr_remote_dsmem(__dest)), "r"(__as_b32(__value)), "r"(__as_ptr_remote_dsmem(__remote_bar)) + : "memory"); +# else + // Unsupported architectures will have a linker error with a semi-decent error message + __cuda_ptx_red_async_is_not_supported_before_SM_90__(); +# endif } #endif // __cccl_ptx_isa >= 810 /* -// red.async.relaxed.cluster.shared::cluster.mbarrier::complete_tx::bytes{.op}{.type} [dest], value, [remote_bar]; // -PTX ISA 81, SM_90 +// red.async.relaxed.cluster.shared::cluster.mbarrier::complete_tx::bytes.op.type [dest], value, [remote_bar]; // PTX +ISA 81, SM_90 // .type = { .b32 } // .op = { .xor } -template +template = true> __device__ static inline void red_async( cuda::ptx::op_xor_op_t, B32* dest, @@ -339,28 +339,28 @@ __device__ static inline void red_async( */ #if __cccl_ptx_isa >= 810 extern "C" _CCCL_DEVICE void __cuda_ptx_red_async_is_not_supported_before_SM_90__(); -template +template = true> _CCCL_DEVICE static inline void red_async(op_xor_op_t, _B32* __dest, const _B32& __value, _CUDA_VSTD::uint64_t* __remote_bar) { // __type == type_b32 (due to parameter type constraint) // __op == op_xor_op (due to parameter type constraint) static_assert(sizeof(_B32) == 4, ""); - NV_IF_ELSE_TARGET( - NV_PROVIDES_SM_90, - (asm("red.async.relaxed.cluster.shared::cluster.mbarrier::complete_tx::bytes.xor.b32 [%0], %1, [%2]; " - : - : "r"(__as_ptr_remote_dsmem(__dest)), "r"(__as_b32(__value)), "r"(__as_ptr_remote_dsmem(__remote_bar)) - : "memory");), - ( - // Unsupported architectures will have a linker error with a semi-decent error message - __cuda_ptx_red_async_is_not_supported_before_SM_90__();)); +# if _CCCL_CUDA_COMPILER(NVHPC) || __CUDA_ARCH__ >= 900 + asm("red.async.relaxed.cluster.shared::cluster.mbarrier::complete_tx::bytes.xor.b32 [%0], %1, [%2]; " + : + : "r"(__as_ptr_remote_dsmem(__dest)), "r"(__as_b32(__value)), "r"(__as_ptr_remote_dsmem(__remote_bar)) + : "memory"); +# else + // Unsupported architectures will have a linker error with a semi-decent error message + __cuda_ptx_red_async_is_not_supported_before_SM_90__(); +# endif } #endif // __cccl_ptx_isa >= 810 /* -// red.async.relaxed.cluster.shared::cluster.mbarrier::complete_tx::bytes{.op}{.type} [dest], value, [remote_bar]; // -PTX ISA 81, SM_90 +// red.async.relaxed.cluster.shared::cluster.mbarrier::complete_tx::bytes.op.type [dest], value, [remote_bar]; // PTX +ISA 81, SM_90 // .type = { .u64 } // .op = { .add } template @@ -376,22 +376,22 @@ template _CCCL_DEVICE static inline void red_async( op_add_t, _CUDA_VSTD::uint64_t* __dest, const _CUDA_VSTD::uint64_t& __value, 
_CUDA_VSTD::uint64_t* __remote_bar) { - // __type == type_u64 (due to parameter type constraint) - // __op == op_add (due to parameter type constraint) - NV_IF_ELSE_TARGET( - NV_PROVIDES_SM_90, - (asm("red.async.relaxed.cluster.shared::cluster.mbarrier::complete_tx::bytes.add.u64 [%0], %1, [%2]; " - : - : "r"(__as_ptr_remote_dsmem(__dest)), "l"(__value), "r"(__as_ptr_remote_dsmem(__remote_bar)) - : "memory");), - ( - // Unsupported architectures will have a linker error with a semi-decent error message - __cuda_ptx_red_async_is_not_supported_before_SM_90__();)); +// __type == type_u64 (due to parameter type constraint) +// __op == op_add (due to parameter type constraint) +# if _CCCL_CUDA_COMPILER(NVHPC) || __CUDA_ARCH__ >= 900 + asm("red.async.relaxed.cluster.shared::cluster.mbarrier::complete_tx::bytes.add.u64 [%0], %1, [%2]; " + : + : "r"(__as_ptr_remote_dsmem(__dest)), "l"(__value), "r"(__as_ptr_remote_dsmem(__remote_bar)) + : "memory"); +# else + // Unsupported architectures will have a linker error with a semi-decent error message + __cuda_ptx_red_async_is_not_supported_before_SM_90__(); +# endif } #endif // __cccl_ptx_isa >= 810 /* -// red.async.relaxed.cluster.shared::cluster.mbarrier::complete_tx::bytes{.op}.u64 [dest], value, [remote_bar]; // .u64 +// red.async.relaxed.cluster.shared::cluster.mbarrier::complete_tx::bytes.op.u64 [dest], value, [remote_bar]; // .u64 intentional PTX ISA 81, SM_90 // .op = { .add } template @@ -407,17 +407,17 @@ template _CCCL_DEVICE static inline void red_async(op_add_t, _CUDA_VSTD::int64_t* __dest, const _CUDA_VSTD::int64_t& __value, _CUDA_VSTD::int64_t* __remote_bar) { - // __op == op_add (due to parameter type constraint) - NV_IF_ELSE_TARGET( - NV_PROVIDES_SM_90, - (asm("red.async.relaxed.cluster.shared::cluster.mbarrier::complete_tx::bytes.add.u64 [%0], %1, [%2]; // .u64 " - "intentional" - : - : "r"(__as_ptr_remote_dsmem(__dest)), "l"(__value), "r"(__as_ptr_remote_dsmem(__remote_bar)) - : "memory");), - ( - // Unsupported architectures will have a linker error with a semi-decent error message - __cuda_ptx_red_async_is_not_supported_before_SM_90__();)); +// __op == op_add (due to parameter type constraint) +# if _CCCL_CUDA_COMPILER(NVHPC) || __CUDA_ARCH__ >= 900 + asm("red.async.relaxed.cluster.shared::cluster.mbarrier::complete_tx::bytes.add.u64 [%0], %1, [%2]; // .u64 " + "intentional" + : + : "r"(__as_ptr_remote_dsmem(__dest)), "l"(__value), "r"(__as_ptr_remote_dsmem(__remote_bar)) + : "memory"); +# else + // Unsupported architectures will have a linker error with a semi-decent error message + __cuda_ptx_red_async_is_not_supported_before_SM_90__(); +# endif } #endif // __cccl_ptx_isa >= 810 diff --git a/libcudacxx/include/cuda/__ptx/instructions/generated/st_async.h b/libcudacxx/include/cuda/__ptx/instructions/generated/st_async.h index 18fd2c03a41..e59208e59ba 100644 --- a/libcudacxx/include/cuda/__ptx/instructions/generated/st_async.h +++ b/libcudacxx/include/cuda/__ptx/instructions/generated/st_async.h @@ -4,7 +4,7 @@ #define _CUDA_PTX_GENERATED_ST_ASYNC_H_ /* -// st.async.weak.shared::cluster.mbarrier::complete_tx::bytes{.type} [addr], value, [remote_bar]; // 1. PTX ISA 81, +// st.async.weak.shared::cluster.mbarrier::complete_tx::bytes.type [addr], value, [remote_bar]; // 1. 
PTX ISA 81, SM_90 // .type = { .b32, .b64 } template @@ -19,28 +19,30 @@ template _CCCL_DEVICE static inline void st_async(_Type* __addr, const _Type& __value, _CUDA_VSTD::uint64_t* __remote_bar) { static_assert(sizeof(_Type) == 4 || sizeof(_Type) == 8, ""); - NV_IF_ELSE_TARGET( - NV_PROVIDES_SM_90, - ( - _CCCL_IF_CONSTEXPR (sizeof(_Type) == 4) { - asm("st.async.weak.shared::cluster.mbarrier::complete_tx::bytes.b32 [%0], %1, [%2]; // 1. " - : - : "r"(__as_ptr_remote_dsmem(__addr)), "r"(__as_b32(__value)), "r"(__as_ptr_remote_dsmem(__remote_bar)) - : "memory"); - } else _CCCL_IF_CONSTEXPR (sizeof(_Type) == 8) { - asm("st.async.weak.shared::cluster.mbarrier::complete_tx::bytes.b64 [%0], %1, [%2]; // 1. " - : - : "r"(__as_ptr_remote_dsmem(__addr)), "l"(__as_b64(__value)), "r"(__as_ptr_remote_dsmem(__remote_bar)) - : "memory"); - }), - ( - // Unsupported architectures will have a linker error with a semi-decent error message - __cuda_ptx_st_async_is_not_supported_before_SM_90__();)); +# if _CCCL_CUDA_COMPILER(NVHPC) || __CUDA_ARCH__ >= 900 + _CCCL_IF_CONSTEXPR (sizeof(_Type) == 4) + { + asm("st.async.weak.shared::cluster.mbarrier::complete_tx::bytes.b32 [%0], %1, [%2]; // 1. " + : + : "r"(__as_ptr_remote_dsmem(__addr)), "r"(__as_b32(__value)), "r"(__as_ptr_remote_dsmem(__remote_bar)) + : "memory"); + } + else _CCCL_IF_CONSTEXPR (sizeof(_Type) == 8) + { + asm("st.async.weak.shared::cluster.mbarrier::complete_tx::bytes.b64 [%0], %1, [%2]; // 1. " + : + : "r"(__as_ptr_remote_dsmem(__addr)), "l"(__as_b64(__value)), "r"(__as_ptr_remote_dsmem(__remote_bar)) + : "memory"); + } +# else + // Unsupported architectures will have a linker error with a semi-decent error message + __cuda_ptx_st_async_is_not_supported_before_SM_90__(); +# endif } #endif // __cccl_ptx_isa >= 810 /* -// st.async.weak.shared::cluster.mbarrier::complete_tx::bytes.v2{.type} [addr], value, [remote_bar]; // 2. PTX ISA 81, +// st.async.weak.shared::cluster.mbarrier::complete_tx::bytes.v2.type [addr], value, [remote_bar]; // 2. PTX ISA 81, SM_90 // .type = { .b32, .b64 } template @@ -55,35 +57,37 @@ template _CCCL_DEVICE static inline void st_async(_Type* __addr, const _Type (&__value)[2], _CUDA_VSTD::uint64_t* __remote_bar) { static_assert(sizeof(_Type) == 4 || sizeof(_Type) == 8, ""); - NV_IF_ELSE_TARGET( - NV_PROVIDES_SM_90, - ( - _CCCL_IF_CONSTEXPR (sizeof(_Type) == 4) { - asm("st.async.weak.shared::cluster.mbarrier::complete_tx::bytes.v2.b32 [%0], {%1, %2}, [%3]; // 2. " - : - : "r"(__as_ptr_remote_dsmem(__addr)), - "r"(__as_b32(__value[0])), - "r"(__as_b32(__value[1])), - "r"(__as_ptr_remote_dsmem(__remote_bar)) - : "memory"); - } else _CCCL_IF_CONSTEXPR (sizeof(_Type) == 8) { - asm("st.async.weak.shared::cluster.mbarrier::complete_tx::bytes.v2.b64 [%0], {%1, %2}, [%3]; // 2. " - : - : "r"(__as_ptr_remote_dsmem(__addr)), - "l"(__as_b64(__value[0])), - "l"(__as_b64(__value[1])), - "r"(__as_ptr_remote_dsmem(__remote_bar)) - : "memory"); - }), - ( - // Unsupported architectures will have a linker error with a semi-decent error message - __cuda_ptx_st_async_is_not_supported_before_SM_90__();)); +# if _CCCL_CUDA_COMPILER(NVHPC) || __CUDA_ARCH__ >= 900 + _CCCL_IF_CONSTEXPR (sizeof(_Type) == 4) + { + asm("st.async.weak.shared::cluster.mbarrier::complete_tx::bytes.v2.b32 [%0], {%1, %2}, [%3]; // 2. 
" + : + : "r"(__as_ptr_remote_dsmem(__addr)), + "r"(__as_b32(__value[0])), + "r"(__as_b32(__value[1])), + "r"(__as_ptr_remote_dsmem(__remote_bar)) + : "memory"); + } + else _CCCL_IF_CONSTEXPR (sizeof(_Type) == 8) + { + asm("st.async.weak.shared::cluster.mbarrier::complete_tx::bytes.v2.b64 [%0], {%1, %2}, [%3]; // 2. " + : + : "r"(__as_ptr_remote_dsmem(__addr)), + "l"(__as_b64(__value[0])), + "l"(__as_b64(__value[1])), + "r"(__as_ptr_remote_dsmem(__remote_bar)) + : "memory"); + } +# else + // Unsupported architectures will have a linker error with a semi-decent error message + __cuda_ptx_st_async_is_not_supported_before_SM_90__(); +# endif } #endif // __cccl_ptx_isa >= 810 /* // st.async.weak.shared::cluster.mbarrier::complete_tx::bytes.v4.b32 [addr], value, [remote_bar]; // 3. PTX ISA 81, -SM_90 template +SM_90 template = true> __device__ static inline void st_async( B32* addr, const B32 (&value)[4], @@ -91,24 +95,24 @@ __device__ static inline void st_async( */ #if __cccl_ptx_isa >= 810 extern "C" _CCCL_DEVICE void __cuda_ptx_st_async_is_not_supported_before_SM_90__(); -template +template = true> _CCCL_DEVICE static inline void st_async(_B32* __addr, const _B32 (&__value)[4], _CUDA_VSTD::uint64_t* __remote_bar) { static_assert(sizeof(_B32) == 4, ""); - NV_IF_ELSE_TARGET( - NV_PROVIDES_SM_90, - (asm("st.async.weak.shared::cluster.mbarrier::complete_tx::bytes.v4.b32 [%0], {%1, %2, %3, %4}, [%5]; // 3. " - : - : "r"(__as_ptr_remote_dsmem(__addr)), - "r"(__as_b32(__value[0])), - "r"(__as_b32(__value[1])), - "r"(__as_b32(__value[2])), - "r"(__as_b32(__value[3])), - "r"(__as_ptr_remote_dsmem(__remote_bar)) - : "memory");), - ( - // Unsupported architectures will have a linker error with a semi-decent error message - __cuda_ptx_st_async_is_not_supported_before_SM_90__();)); +# if _CCCL_CUDA_COMPILER(NVHPC) || __CUDA_ARCH__ >= 900 + asm("st.async.weak.shared::cluster.mbarrier::complete_tx::bytes.v4.b32 [%0], {%1, %2, %3, %4}, [%5]; // 3. " + : + : "r"(__as_ptr_remote_dsmem(__addr)), + "r"(__as_b32(__value[0])), + "r"(__as_b32(__value[1])), + "r"(__as_b32(__value[2])), + "r"(__as_b32(__value[3])), + "r"(__as_ptr_remote_dsmem(__remote_bar)) + : "memory"); +# else + // Unsupported architectures will have a linker error with a semi-decent error message + __cuda_ptx_st_async_is_not_supported_before_SM_90__(); +# endif } #endif // __cccl_ptx_isa >= 810 diff --git a/libcudacxx/include/cuda/__ptx/instructions/generated/st_bulk.h b/libcudacxx/include/cuda/__ptx/instructions/generated/st_bulk.h new file mode 100644 index 00000000000..bc02c785f86 --- /dev/null +++ b/libcudacxx/include/cuda/__ptx/instructions/generated/st_bulk.h @@ -0,0 +1,31 @@ +// This file was automatically generated. Do not edit. 
+ +#ifndef _CUDA_PTX_GENERATED_ST_BULK_H_ +#define _CUDA_PTX_GENERATED_ST_BULK_H_ + +/* +// st.bulk.weak.shared::cta [addr], size, initval; // PTX ISA 86, SM_100 +template +__device__ static inline void st_bulk( + void* addr, + uint64_t size, + cuda::ptx::n32_t initval); +*/ +#if __cccl_ptx_isa >= 860 +extern "C" _CCCL_DEVICE void __cuda_ptx_st_bulk_is_not_supported_before_SM_100__(); +template +_CCCL_DEVICE static inline void st_bulk(void* __addr, _CUDA_VSTD::uint64_t __size, n32_t<_N32> __initval) +{ +# if _CCCL_CUDA_COMPILER(NVHPC) || __CUDA_ARCH__ >= 1000 + asm("st.bulk.weak.shared::cta [%0], %1, %2;" + : + : "r"(__as_ptr_smem(__addr)), "l"(__size), "n"(__initval.value) + : "memory"); +# else + // Unsupported architectures will have a linker error with a semi-decent error message + __cuda_ptx_st_bulk_is_not_supported_before_SM_100__(); +# endif +} +#endif // __cccl_ptx_isa >= 860 + +#endif // _CUDA_PTX_GENERATED_ST_BULK_H_ diff --git a/libcudacxx/include/cuda/__ptx/instructions/generated/tcgen05_alloc.h b/libcudacxx/include/cuda/__ptx/instructions/generated/tcgen05_alloc.h new file mode 100644 index 00000000000..27ca2f86080 --- /dev/null +++ b/libcudacxx/include/cuda/__ptx/instructions/generated/tcgen05_alloc.h @@ -0,0 +1,105 @@ +// This file was automatically generated. Do not edit. + +#ifndef _CUDA_PTX_GENERATED_TCGEN05_ALLOC_H_ +#define _CUDA_PTX_GENERATED_TCGEN05_ALLOC_H_ + +/* +// tcgen05.alloc.cta_group.sync.aligned.shared::cta.b32 [dst], nCols; // PTX ISA 86, SM_100a, SM_101a +// .cta_group = { .cta_group::1, .cta_group::2 } +template +__device__ static inline void tcgen05_alloc( + cuda::ptx::cta_group_t cta_group, + uint32_t* dst, + const uint32_t& nCols); +*/ +#if __cccl_ptx_isa >= 860 +extern "C" _CCCL_DEVICE void __cuda_ptx_tcgen05_alloc_is_not_supported_before_SM_100a_SM_101a__(); +template +_CCCL_DEVICE static inline void +tcgen05_alloc(cta_group_t<_Cta_Group> __cta_group, _CUDA_VSTD::uint32_t* __dst, const _CUDA_VSTD::uint32_t& __nCols) +{ + static_assert(__cta_group == cta_group_1 || __cta_group == cta_group_2, ""); +# if _CCCL_CUDA_COMPILER(NVHPC) || __CUDA_ARCH_FEAT_SM100_ALL || __CUDA_ARCH_FEAT_SM101_ALL + _CCCL_IF_CONSTEXPR (__cta_group == cta_group_1) + { + asm volatile("tcgen05.alloc.cta_group::1.sync.aligned.shared::cta.b32 [%0], %1;" + : + : "r"(__as_ptr_smem(__dst)), "r"(__nCols) + : "memory"); + } + else _CCCL_IF_CONSTEXPR (__cta_group == cta_group_2) + { + asm volatile("tcgen05.alloc.cta_group::2.sync.aligned.shared::cta.b32 [%0], %1;" + : + : "r"(__as_ptr_smem(__dst)), "r"(__nCols) + : "memory"); + } +# else + // Unsupported architectures will have a linker error with a semi-decent error message + __cuda_ptx_tcgen05_alloc_is_not_supported_before_SM_100a_SM_101a__(); +# endif +} +#endif // __cccl_ptx_isa >= 860 + +/* +// tcgen05.dealloc.cta_group.sync.aligned.b32 taddr, nCols; // PTX ISA 86, SM_100a, SM_101a +// .cta_group = { .cta_group::1, .cta_group::2 } +template +__device__ static inline void tcgen05_dealloc( + cuda::ptx::cta_group_t cta_group, + uint32_t taddr, + const uint32_t& nCols); +*/ +#if __cccl_ptx_isa >= 860 +extern "C" _CCCL_DEVICE void __cuda_ptx_tcgen05_dealloc_is_not_supported_before_SM_100a_SM_101a__(); +template +_CCCL_DEVICE static inline void +tcgen05_dealloc(cta_group_t<_Cta_Group> __cta_group, _CUDA_VSTD::uint32_t __taddr, const _CUDA_VSTD::uint32_t& __nCols) +{ + static_assert(__cta_group == cta_group_1 || __cta_group == cta_group_2, ""); +# if _CCCL_CUDA_COMPILER(NVHPC) || __CUDA_ARCH_FEAT_SM100_ALL || __CUDA_ARCH_FEAT_SM101_ALL + 
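The st_bulk wrapper just above maps to st.bulk.weak.shared::cta, which fills a shared-memory region with an immediate value. A small sketch follows (not part of the patch); the kernel name, buffer, and size are made up, and 0 is used as the initial value since that appears to be the only value the ISA currently accepts.

#include <cuda/ptx>
#include <cuda/std/cstdint>

__global__ void st_bulk_sketch() // SM_100+, PTX ISA 8.6+
{
  __shared__ alignas(16) unsigned char scratch[1024];
  if (threadIdx.x == 0)
  {
    // Weakly initialize all 1024 bytes of the buffer to 0 in one instruction.
    cuda::ptx::st_bulk(scratch, 1024, cuda::ptx::n32_t<0>{});
  }
  __syncthreads(); // make the initialized buffer visible to the whole block
}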
_CCCL_IF_CONSTEXPR (__cta_group == cta_group_1) + { + asm volatile("tcgen05.dealloc.cta_group::1.sync.aligned.b32 %0, %1;" : : "r"(__taddr), "r"(__nCols) : "memory"); + } + else _CCCL_IF_CONSTEXPR (__cta_group == cta_group_2) + { + asm volatile("tcgen05.dealloc.cta_group::2.sync.aligned.b32 %0, %1;" : : "r"(__taddr), "r"(__nCols) : "memory"); + } +# else + // Unsupported architectures will have a linker error with a semi-decent error message + __cuda_ptx_tcgen05_dealloc_is_not_supported_before_SM_100a_SM_101a__(); +# endif +} +#endif // __cccl_ptx_isa >= 860 + +/* +// tcgen05.relinquish_alloc_permit.cta_group.sync.aligned; // PTX ISA 86, SM_100a, SM_101a +// .cta_group = { .cta_group::1, .cta_group::2 } +template +__device__ static inline void tcgen05_relinquish_alloc_permit( + cuda::ptx::cta_group_t cta_group); +*/ +#if __cccl_ptx_isa >= 860 +extern "C" _CCCL_DEVICE void __cuda_ptx_tcgen05_relinquish_alloc_permit_is_not_supported_before_SM_100a_SM_101a__(); +template +_CCCL_DEVICE static inline void tcgen05_relinquish_alloc_permit(cta_group_t<_Cta_Group> __cta_group) +{ + static_assert(__cta_group == cta_group_1 || __cta_group == cta_group_2, ""); +# if _CCCL_CUDA_COMPILER(NVHPC) || __CUDA_ARCH_FEAT_SM100_ALL || __CUDA_ARCH_FEAT_SM101_ALL + _CCCL_IF_CONSTEXPR (__cta_group == cta_group_1) + { + asm volatile("tcgen05.relinquish_alloc_permit.cta_group::1.sync.aligned;" : : : "memory"); + } + else _CCCL_IF_CONSTEXPR (__cta_group == cta_group_2) + { + asm volatile("tcgen05.relinquish_alloc_permit.cta_group::2.sync.aligned;" : : : "memory"); + } +# else + // Unsupported architectures will have a linker error with a semi-decent error message + __cuda_ptx_tcgen05_relinquish_alloc_permit_is_not_supported_before_SM_100a_SM_101a__(); +# endif +} +#endif // __cccl_ptx_isa >= 860 + +#endif // _CUDA_PTX_GENERATED_TCGEN05_ALLOC_H_ diff --git a/libcudacxx/include/cuda/__ptx/instructions/generated/tcgen05_commit.h b/libcudacxx/include/cuda/__ptx/instructions/generated/tcgen05_commit.h new file mode 100644 index 00000000000..30865d000df --- /dev/null +++ b/libcudacxx/include/cuda/__ptx/instructions/generated/tcgen05_commit.h @@ -0,0 +1,81 @@ +// This file was automatically generated. Do not edit. 
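The tcgen05_alloc, tcgen05_dealloc, and tcgen05_relinquish_alloc_permit wrappers above cover the tensor-memory allocation round trip. A rough sketch (not part of the patch), under the assumption that a single full warp issues the .sync.aligned calls and that 32 columns is a legal request; the intervening tcgen05 work and its exact ordering requirements are elided and the PTX documentation remains authoritative.

#include <cuda/ptx>
#include <cuda/std/cstdint>

__global__ void tcgen05_alloc_sketch() // SM_100a / SM_101a only
{
  __shared__ cuda::std::uint32_t taddr_slot; // receives the tensor-memory address
  if (threadIdx.x < 32) // one full warp executes the .sync.aligned instructions
  {
    cuda::ptx::tcgen05_alloc(cuda::ptx::cta_group_1, &taddr_slot, 32);
    cuda::ptx::tcgen05_relinquish_alloc_permit(cuda::ptx::cta_group_1); // no further allocations
  }
  __syncthreads();
  cuda::std::uint32_t taddr = taddr_slot;
  // ... issue tcgen05.ld / tcgen05.st / tcgen05.mma work against taddr here ...
  __syncthreads();
  if (threadIdx.x < 32)
  {
    cuda::ptx::tcgen05_dealloc(cuda::ptx::cta_group_1, taddr, 32);
  }
}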
+ +#ifndef _CUDA_PTX_GENERATED_TCGEN05_COMMIT_H_ +#define _CUDA_PTX_GENERATED_TCGEN05_COMMIT_H_ + +/* +// tcgen05.commit.cta_group.mbarrier::arrive::one.shared::cluster.b64 [smem_bar]; // PTX ISA 86, SM_100a, SM_101a +// .cta_group = { .cta_group::1, .cta_group::2 } +template +__device__ static inline void tcgen05_commit( + cuda::ptx::cta_group_t cta_group, + uint64_t* smem_bar); +*/ +#if __cccl_ptx_isa >= 860 +extern "C" _CCCL_DEVICE void __cuda_ptx_tcgen05_commit_is_not_supported_before_SM_100a_SM_101a__(); +template +_CCCL_DEVICE static inline void tcgen05_commit(cta_group_t<_Cta_Group> __cta_group, _CUDA_VSTD::uint64_t* __smem_bar) +{ + static_assert(__cta_group == cta_group_1 || __cta_group == cta_group_2, ""); +# if _CCCL_CUDA_COMPILER(NVHPC) || __CUDA_ARCH_FEAT_SM100_ALL || __CUDA_ARCH_FEAT_SM101_ALL + _CCCL_IF_CONSTEXPR (__cta_group == cta_group_1) + { + asm volatile("tcgen05.commit.cta_group::1.mbarrier::arrive::one.shared::cluster.b64 [%0];" + : + : "r"(__as_ptr_dsmem(__smem_bar)) + : "memory"); + } + else _CCCL_IF_CONSTEXPR (__cta_group == cta_group_2) + { + asm volatile("tcgen05.commit.cta_group::2.mbarrier::arrive::one.shared::cluster.b64 [%0];" + : + : "r"(__as_ptr_dsmem(__smem_bar)) + : "memory"); + } +# else + // Unsupported architectures will have a linker error with a semi-decent error message + __cuda_ptx_tcgen05_commit_is_not_supported_before_SM_100a_SM_101a__(); +# endif +} +#endif // __cccl_ptx_isa >= 860 + +/* +// tcgen05.commit.cta_group.mbarrier::arrive::one.shared::cluster.multicast::cluster.b64 [smem_bar], ctaMask; // PTX ISA +86, SM_100a, SM_101a +// .cta_group = { .cta_group::1, .cta_group::2 } +template +__device__ static inline void tcgen05_commit_multicast( + cuda::ptx::cta_group_t cta_group, + uint64_t* smem_bar, + uint16_t ctaMask); +*/ +#if __cccl_ptx_isa >= 860 +extern "C" _CCCL_DEVICE void __cuda_ptx_tcgen05_commit_multicast_is_not_supported_before_SM_100a_SM_101a__(); +template +_CCCL_DEVICE static inline void tcgen05_commit_multicast( + cta_group_t<_Cta_Group> __cta_group, _CUDA_VSTD::uint64_t* __smem_bar, _CUDA_VSTD::uint16_t __ctaMask) +{ + static_assert(__cta_group == cta_group_1 || __cta_group == cta_group_2, ""); +# if _CCCL_CUDA_COMPILER(NVHPC) || __CUDA_ARCH_FEAT_SM100_ALL || __CUDA_ARCH_FEAT_SM101_ALL + _CCCL_IF_CONSTEXPR (__cta_group == cta_group_1) + { + asm volatile("tcgen05.commit.cta_group::1.mbarrier::arrive::one.shared::cluster.multicast::cluster.b64 [%0], %1;" + : + : "r"(__as_ptr_dsmem(__smem_bar)), "h"(__ctaMask) + : "memory"); + } + else _CCCL_IF_CONSTEXPR (__cta_group == cta_group_2) + { + asm volatile("tcgen05.commit.cta_group::2.mbarrier::arrive::one.shared::cluster.multicast::cluster.b64 [%0], %1;" + : + : "r"(__as_ptr_dsmem(__smem_bar)), "h"(__ctaMask) + : "memory"); + } +# else + // Unsupported architectures will have a linker error with a semi-decent error message + __cuda_ptx_tcgen05_commit_multicast_is_not_supported_before_SM_100a_SM_101a__(); +# endif +} +#endif // __cccl_ptx_isa >= 860 + +#endif // _CUDA_PTX_GENERATED_TCGEN05_COMMIT_H_ diff --git a/libcudacxx/include/cuda/__ptx/instructions/generated/tcgen05_cp.h b/libcudacxx/include/cuda/__ptx/instructions/generated/tcgen05_cp.h new file mode 100644 index 00000000000..e213f9ba745 --- /dev/null +++ b/libcudacxx/include/cuda/__ptx/instructions/generated/tcgen05_cp.h @@ -0,0 +1,612 @@ +// This file was automatically generated. Do not edit. 
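The tcgen05_commit wrappers above make an mbarrier track completion of previously issued asynchronous tcgen05 operations. A sketch of the typical arrangement follows (not part of the patch), with the tcgen05 work itself omitted; the single-arrival barrier count and the phase-parity poll reflect my understanding of the completion mechanism rather than anything stated in this diff.

#include <cuda/ptx>
#include <cuda/std/cstdint>

__global__ void tcgen05_commit_sketch() // SM_100a / SM_101a only
{
  __shared__ alignas(8) cuda::std::uint64_t bar;
  if (threadIdx.x == 0)
  {
    cuda::ptx::mbarrier_init(&bar, 1);
  }
  __syncthreads();

  // ... issue asynchronous tcgen05 operations (e.g. tcgen05.cp / tcgen05.mma) here ...

  if (threadIdx.x == 0)
  {
    // The barrier now completes once the tcgen05 work issued above has finished.
    cuda::ptx::tcgen05_commit(cuda::ptx::cta_group_1, &bar);
  }
  // Spin on phase 0 until the tracked operations are done.
  while (!cuda::ptx::mbarrier_try_wait_parity(&bar, 0)) {}
}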
+ +#ifndef _CUDA_PTX_GENERATED_TCGEN05_CP_H_ +#define _CUDA_PTX_GENERATED_TCGEN05_CP_H_ + +/* +// tcgen05.cp.cta_group.128x256b [taddr], s_desc; // PTX ISA 86, SM_100a, SM_101a +// .cta_group = { .cta_group::1, .cta_group::2 } +template +__device__ static inline void tcgen05_cp_128x256b( + cuda::ptx::cta_group_t cta_group, + uint32_t taddr, + uint64_t s_desc); +*/ +#if __cccl_ptx_isa >= 860 +extern "C" _CCCL_DEVICE void __cuda_ptx_tcgen05_cp_128x256b_is_not_supported_before_SM_100a_SM_101a__(); +template +_CCCL_DEVICE static inline void +tcgen05_cp_128x256b(cta_group_t<_Cta_Group> __cta_group, _CUDA_VSTD::uint32_t __taddr, _CUDA_VSTD::uint64_t __s_desc) +{ + static_assert(__cta_group == cta_group_1 || __cta_group == cta_group_2, ""); +# if _CCCL_CUDA_COMPILER(NVHPC) || __CUDA_ARCH_FEAT_SM100_ALL || __CUDA_ARCH_FEAT_SM101_ALL + _CCCL_IF_CONSTEXPR (__cta_group == cta_group_1) + { + asm("tcgen05.cp.cta_group::1.128x256b [%0], %1;" : : "r"(__taddr), "l"(__s_desc) : "memory"); + } + else _CCCL_IF_CONSTEXPR (__cta_group == cta_group_2) + { + asm("tcgen05.cp.cta_group::2.128x256b [%0], %1;" : : "r"(__taddr), "l"(__s_desc) : "memory"); + } +# else + // Unsupported architectures will have a linker error with a semi-decent error message + __cuda_ptx_tcgen05_cp_128x256b_is_not_supported_before_SM_100a_SM_101a__(); +# endif +} +#endif // __cccl_ptx_isa >= 860 + +/* +// tcgen05.cp.cta_group.4x256b [taddr], s_desc; // PTX ISA 86, SM_100a, SM_101a +// .cta_group = { .cta_group::1, .cta_group::2 } +template +__device__ static inline void tcgen05_cp_4x256b( + cuda::ptx::cta_group_t cta_group, + uint32_t taddr, + uint64_t s_desc); +*/ +#if __cccl_ptx_isa >= 860 +extern "C" _CCCL_DEVICE void __cuda_ptx_tcgen05_cp_4x256b_is_not_supported_before_SM_100a_SM_101a__(); +template +_CCCL_DEVICE static inline void +tcgen05_cp_4x256b(cta_group_t<_Cta_Group> __cta_group, _CUDA_VSTD::uint32_t __taddr, _CUDA_VSTD::uint64_t __s_desc) +{ + static_assert(__cta_group == cta_group_1 || __cta_group == cta_group_2, ""); +# if _CCCL_CUDA_COMPILER(NVHPC) || __CUDA_ARCH_FEAT_SM100_ALL || __CUDA_ARCH_FEAT_SM101_ALL + _CCCL_IF_CONSTEXPR (__cta_group == cta_group_1) + { + asm("tcgen05.cp.cta_group::1.4x256b [%0], %1;" : : "r"(__taddr), "l"(__s_desc) : "memory"); + } + else _CCCL_IF_CONSTEXPR (__cta_group == cta_group_2) + { + asm("tcgen05.cp.cta_group::2.4x256b [%0], %1;" : : "r"(__taddr), "l"(__s_desc) : "memory"); + } +# else + // Unsupported architectures will have a linker error with a semi-decent error message + __cuda_ptx_tcgen05_cp_4x256b_is_not_supported_before_SM_100a_SM_101a__(); +# endif +} +#endif // __cccl_ptx_isa >= 860 + +/* +// tcgen05.cp.cta_group.128x128b [taddr], s_desc; // PTX ISA 86, SM_100a, SM_101a +// .cta_group = { .cta_group::1, .cta_group::2 } +template +__device__ static inline void tcgen05_cp_128x128b( + cuda::ptx::cta_group_t cta_group, + uint32_t taddr, + uint64_t s_desc); +*/ +#if __cccl_ptx_isa >= 860 +extern "C" _CCCL_DEVICE void __cuda_ptx_tcgen05_cp_128x128b_is_not_supported_before_SM_100a_SM_101a__(); +template +_CCCL_DEVICE static inline void +tcgen05_cp_128x128b(cta_group_t<_Cta_Group> __cta_group, _CUDA_VSTD::uint32_t __taddr, _CUDA_VSTD::uint64_t __s_desc) +{ + static_assert(__cta_group == cta_group_1 || __cta_group == cta_group_2, ""); +# if _CCCL_CUDA_COMPILER(NVHPC) || __CUDA_ARCH_FEAT_SM100_ALL || __CUDA_ARCH_FEAT_SM101_ALL + _CCCL_IF_CONSTEXPR (__cta_group == cta_group_1) + { + asm("tcgen05.cp.cta_group::1.128x128b [%0], %1;" : : "r"(__taddr), "l"(__s_desc) : "memory"); + } + else 
_CCCL_IF_CONSTEXPR (__cta_group == cta_group_2) + { + asm("tcgen05.cp.cta_group::2.128x128b [%0], %1;" : : "r"(__taddr), "l"(__s_desc) : "memory"); + } +# else + // Unsupported architectures will have a linker error with a semi-decent error message + __cuda_ptx_tcgen05_cp_128x128b_is_not_supported_before_SM_100a_SM_101a__(); +# endif +} +#endif // __cccl_ptx_isa >= 860 + +/* +// tcgen05.cp.cta_group.64x128b.warpx2::02_13 [taddr], s_desc; // PTX ISA 86, SM_100a, SM_101a +// .cta_group = { .cta_group::1, .cta_group::2 } +template +__device__ static inline void tcgen05_cp_64x128b_warpx2_02_13( + cuda::ptx::cta_group_t cta_group, + uint32_t taddr, + uint64_t s_desc); +*/ +#if __cccl_ptx_isa >= 860 +extern "C" _CCCL_DEVICE void __cuda_ptx_tcgen05_cp_64x128b_warpx2_02_13_is_not_supported_before_SM_100a_SM_101a__(); +template +_CCCL_DEVICE static inline void tcgen05_cp_64x128b_warpx2_02_13( + cta_group_t<_Cta_Group> __cta_group, _CUDA_VSTD::uint32_t __taddr, _CUDA_VSTD::uint64_t __s_desc) +{ + static_assert(__cta_group == cta_group_1 || __cta_group == cta_group_2, ""); +# if _CCCL_CUDA_COMPILER(NVHPC) || __CUDA_ARCH_FEAT_SM100_ALL || __CUDA_ARCH_FEAT_SM101_ALL + _CCCL_IF_CONSTEXPR (__cta_group == cta_group_1) + { + asm("tcgen05.cp.cta_group::1.64x128b.warpx2::02_13 [%0], %1;" : : "r"(__taddr), "l"(__s_desc) : "memory"); + } + else _CCCL_IF_CONSTEXPR (__cta_group == cta_group_2) + { + asm("tcgen05.cp.cta_group::2.64x128b.warpx2::02_13 [%0], %1;" : : "r"(__taddr), "l"(__s_desc) : "memory"); + } +# else + // Unsupported architectures will have a linker error with a semi-decent error message + __cuda_ptx_tcgen05_cp_64x128b_warpx2_02_13_is_not_supported_before_SM_100a_SM_101a__(); +# endif +} +#endif // __cccl_ptx_isa >= 860 + +/* +// tcgen05.cp.cta_group.64x128b.warpx2::01_23 [taddr], s_desc; // PTX ISA 86, SM_100a, SM_101a +// .cta_group = { .cta_group::1, .cta_group::2 } +template +__device__ static inline void tcgen05_cp_64x128b_warpx2_01_23( + cuda::ptx::cta_group_t cta_group, + uint32_t taddr, + uint64_t s_desc); +*/ +#if __cccl_ptx_isa >= 860 +extern "C" _CCCL_DEVICE void __cuda_ptx_tcgen05_cp_64x128b_warpx2_01_23_is_not_supported_before_SM_100a_SM_101a__(); +template +_CCCL_DEVICE static inline void tcgen05_cp_64x128b_warpx2_01_23( + cta_group_t<_Cta_Group> __cta_group, _CUDA_VSTD::uint32_t __taddr, _CUDA_VSTD::uint64_t __s_desc) +{ + static_assert(__cta_group == cta_group_1 || __cta_group == cta_group_2, ""); +# if _CCCL_CUDA_COMPILER(NVHPC) || __CUDA_ARCH_FEAT_SM100_ALL || __CUDA_ARCH_FEAT_SM101_ALL + _CCCL_IF_CONSTEXPR (__cta_group == cta_group_1) + { + asm("tcgen05.cp.cta_group::1.64x128b.warpx2::01_23 [%0], %1;" : : "r"(__taddr), "l"(__s_desc) : "memory"); + } + else _CCCL_IF_CONSTEXPR (__cta_group == cta_group_2) + { + asm("tcgen05.cp.cta_group::2.64x128b.warpx2::01_23 [%0], %1;" : : "r"(__taddr), "l"(__s_desc) : "memory"); + } +# else + // Unsupported architectures will have a linker error with a semi-decent error message + __cuda_ptx_tcgen05_cp_64x128b_warpx2_01_23_is_not_supported_before_SM_100a_SM_101a__(); +# endif +} +#endif // __cccl_ptx_isa >= 860 + +/* +// tcgen05.cp.cta_group.32x128b.warpx4 [taddr], s_desc; // PTX ISA 86, SM_100a, SM_101a +// .cta_group = { .cta_group::1, .cta_group::2 } +template +__device__ static inline void tcgen05_cp_32x128b_warpx4( + cuda::ptx::cta_group_t cta_group, + uint32_t taddr, + uint64_t s_desc); +*/ +#if __cccl_ptx_isa >= 860 +extern "C" _CCCL_DEVICE void __cuda_ptx_tcgen05_cp_32x128b_warpx4_is_not_supported_before_SM_100a_SM_101a__(); +template 
+_CCCL_DEVICE static inline void tcgen05_cp_32x128b_warpx4( + cta_group_t<_Cta_Group> __cta_group, _CUDA_VSTD::uint32_t __taddr, _CUDA_VSTD::uint64_t __s_desc) +{ + static_assert(__cta_group == cta_group_1 || __cta_group == cta_group_2, ""); +# if _CCCL_CUDA_COMPILER(NVHPC) || __CUDA_ARCH_FEAT_SM100_ALL || __CUDA_ARCH_FEAT_SM101_ALL + _CCCL_IF_CONSTEXPR (__cta_group == cta_group_1) + { + asm("tcgen05.cp.cta_group::1.32x128b.warpx4 [%0], %1;" : : "r"(__taddr), "l"(__s_desc) : "memory"); + } + else _CCCL_IF_CONSTEXPR (__cta_group == cta_group_2) + { + asm("tcgen05.cp.cta_group::2.32x128b.warpx4 [%0], %1;" : : "r"(__taddr), "l"(__s_desc) : "memory"); + } +# else + // Unsupported architectures will have a linker error with a semi-decent error message + __cuda_ptx_tcgen05_cp_32x128b_warpx4_is_not_supported_before_SM_100a_SM_101a__(); +# endif +} +#endif // __cccl_ptx_isa >= 860 + +/* +// tcgen05.cp.cta_group.128x256b.b8x16.b6x16_p32 [taddr], s_desc; // PTX ISA 86, SM_100a, SM_101a +// .cta_group = { .cta_group::1, .cta_group::2 } +template +__device__ static inline void tcgen05_cp_128x256b_b8x16_b6x16_p32( + cuda::ptx::cta_group_t cta_group, + uint32_t taddr, + uint64_t s_desc); +*/ +#if __cccl_ptx_isa >= 860 +extern "C" _CCCL_DEVICE void __cuda_ptx_tcgen05_cp_128x256b_b8x16_b6x16_p32_is_not_supported_before_SM_100a_SM_101a__(); +template +_CCCL_DEVICE static inline void tcgen05_cp_128x256b_b8x16_b6x16_p32( + cta_group_t<_Cta_Group> __cta_group, _CUDA_VSTD::uint32_t __taddr, _CUDA_VSTD::uint64_t __s_desc) +{ + static_assert(__cta_group == cta_group_1 || __cta_group == cta_group_2, ""); +# if _CCCL_CUDA_COMPILER(NVHPC) || __CUDA_ARCH_FEAT_SM100_ALL || __CUDA_ARCH_FEAT_SM101_ALL + _CCCL_IF_CONSTEXPR (__cta_group == cta_group_1) + { + asm("tcgen05.cp.cta_group::1.128x256b.b8x16.b6x16_p32 [%0], %1;" : : "r"(__taddr), "l"(__s_desc) : "memory"); + } + else _CCCL_IF_CONSTEXPR (__cta_group == cta_group_2) + { + asm("tcgen05.cp.cta_group::2.128x256b.b8x16.b6x16_p32 [%0], %1;" : : "r"(__taddr), "l"(__s_desc) : "memory"); + } +# else + // Unsupported architectures will have a linker error with a semi-decent error message + __cuda_ptx_tcgen05_cp_128x256b_b8x16_b6x16_p32_is_not_supported_before_SM_100a_SM_101a__(); +# endif +} +#endif // __cccl_ptx_isa >= 860 + +/* +// tcgen05.cp.cta_group.4x256b.b8x16.b6x16_p32 [taddr], s_desc; // PTX ISA 86, SM_100a, SM_101a +// .cta_group = { .cta_group::1, .cta_group::2 } +template +__device__ static inline void tcgen05_cp_4x256b_b8x16_b6x16_p32( + cuda::ptx::cta_group_t cta_group, + uint32_t taddr, + uint64_t s_desc); +*/ +#if __cccl_ptx_isa >= 860 +extern "C" _CCCL_DEVICE void __cuda_ptx_tcgen05_cp_4x256b_b8x16_b6x16_p32_is_not_supported_before_SM_100a_SM_101a__(); +template +_CCCL_DEVICE static inline void tcgen05_cp_4x256b_b8x16_b6x16_p32( + cta_group_t<_Cta_Group> __cta_group, _CUDA_VSTD::uint32_t __taddr, _CUDA_VSTD::uint64_t __s_desc) +{ + static_assert(__cta_group == cta_group_1 || __cta_group == cta_group_2, ""); +# if _CCCL_CUDA_COMPILER(NVHPC) || __CUDA_ARCH_FEAT_SM100_ALL || __CUDA_ARCH_FEAT_SM101_ALL + _CCCL_IF_CONSTEXPR (__cta_group == cta_group_1) + { + asm("tcgen05.cp.cta_group::1.4x256b.b8x16.b6x16_p32 [%0], %1;" : : "r"(__taddr), "l"(__s_desc) : "memory"); + } + else _CCCL_IF_CONSTEXPR (__cta_group == cta_group_2) + { + asm("tcgen05.cp.cta_group::2.4x256b.b8x16.b6x16_p32 [%0], %1;" : : "r"(__taddr), "l"(__s_desc) : "memory"); + } +# else + // Unsupported architectures will have a linker error with a semi-decent error message + 
__cuda_ptx_tcgen05_cp_4x256b_b8x16_b6x16_p32_is_not_supported_before_SM_100a_SM_101a__(); +# endif +} +#endif // __cccl_ptx_isa >= 860 + +/* +// tcgen05.cp.cta_group.128x128b.b8x16.b6x16_p32 [taddr], s_desc; // PTX ISA 86, SM_100a, SM_101a +// .cta_group = { .cta_group::1, .cta_group::2 } +template +__device__ static inline void tcgen05_cp_128x128b_b8x16_b6x16_p32( + cuda::ptx::cta_group_t cta_group, + uint32_t taddr, + uint64_t s_desc); +*/ +#if __cccl_ptx_isa >= 860 +extern "C" _CCCL_DEVICE void __cuda_ptx_tcgen05_cp_128x128b_b8x16_b6x16_p32_is_not_supported_before_SM_100a_SM_101a__(); +template +_CCCL_DEVICE static inline void tcgen05_cp_128x128b_b8x16_b6x16_p32( + cta_group_t<_Cta_Group> __cta_group, _CUDA_VSTD::uint32_t __taddr, _CUDA_VSTD::uint64_t __s_desc) +{ + static_assert(__cta_group == cta_group_1 || __cta_group == cta_group_2, ""); +# if _CCCL_CUDA_COMPILER(NVHPC) || __CUDA_ARCH_FEAT_SM100_ALL || __CUDA_ARCH_FEAT_SM101_ALL + _CCCL_IF_CONSTEXPR (__cta_group == cta_group_1) + { + asm("tcgen05.cp.cta_group::1.128x128b.b8x16.b6x16_p32 [%0], %1;" : : "r"(__taddr), "l"(__s_desc) : "memory"); + } + else _CCCL_IF_CONSTEXPR (__cta_group == cta_group_2) + { + asm("tcgen05.cp.cta_group::2.128x128b.b8x16.b6x16_p32 [%0], %1;" : : "r"(__taddr), "l"(__s_desc) : "memory"); + } +# else + // Unsupported architectures will have a linker error with a semi-decent error message + __cuda_ptx_tcgen05_cp_128x128b_b8x16_b6x16_p32_is_not_supported_before_SM_100a_SM_101a__(); +# endif +} +#endif // __cccl_ptx_isa >= 860 + +/* +// tcgen05.cp.cta_group.64x128b.warpx2::02_13.b8x16.b6x16_p32 [taddr], s_desc; // PTX ISA 86, SM_100a, SM_101a +// .cta_group = { .cta_group::1, .cta_group::2 } +template +__device__ static inline void tcgen05_cp_64x128b_warpx2_02_13_b8x16_b6x16_p32( + cuda::ptx::cta_group_t cta_group, + uint32_t taddr, + uint64_t s_desc); +*/ +#if __cccl_ptx_isa >= 860 +extern "C" _CCCL_DEVICE void +__cuda_ptx_tcgen05_cp_64x128b_warpx2_02_13_b8x16_b6x16_p32_is_not_supported_before_SM_100a_SM_101a__(); +template +_CCCL_DEVICE static inline void tcgen05_cp_64x128b_warpx2_02_13_b8x16_b6x16_p32( + cta_group_t<_Cta_Group> __cta_group, _CUDA_VSTD::uint32_t __taddr, _CUDA_VSTD::uint64_t __s_desc) +{ + static_assert(__cta_group == cta_group_1 || __cta_group == cta_group_2, ""); +# if _CCCL_CUDA_COMPILER(NVHPC) || __CUDA_ARCH_FEAT_SM100_ALL || __CUDA_ARCH_FEAT_SM101_ALL + _CCCL_IF_CONSTEXPR (__cta_group == cta_group_1) + { + asm("tcgen05.cp.cta_group::1.64x128b.warpx2::02_13.b8x16.b6x16_p32 [%0], %1;" + : + : "r"(__taddr), "l"(__s_desc) + : "memory"); + } + else _CCCL_IF_CONSTEXPR (__cta_group == cta_group_2) + { + asm("tcgen05.cp.cta_group::2.64x128b.warpx2::02_13.b8x16.b6x16_p32 [%0], %1;" + : + : "r"(__taddr), "l"(__s_desc) + : "memory"); + } +# else + // Unsupported architectures will have a linker error with a semi-decent error message + __cuda_ptx_tcgen05_cp_64x128b_warpx2_02_13_b8x16_b6x16_p32_is_not_supported_before_SM_100a_SM_101a__(); +# endif +} +#endif // __cccl_ptx_isa >= 860 + +/* +// tcgen05.cp.cta_group.64x128b.warpx2::01_23.b8x16.b6x16_p32 [taddr], s_desc; // PTX ISA 86, SM_100a, SM_101a +// .cta_group = { .cta_group::1, .cta_group::2 } +template +__device__ static inline void tcgen05_cp_64x128b_warpx2_01_23_b8x16_b6x16_p32( + cuda::ptx::cta_group_t cta_group, + uint32_t taddr, + uint64_t s_desc); +*/ +#if __cccl_ptx_isa >= 860 +extern "C" _CCCL_DEVICE void +__cuda_ptx_tcgen05_cp_64x128b_warpx2_01_23_b8x16_b6x16_p32_is_not_supported_before_SM_100a_SM_101a__(); +template +_CCCL_DEVICE 
static inline void tcgen05_cp_64x128b_warpx2_01_23_b8x16_b6x16_p32( + cta_group_t<_Cta_Group> __cta_group, _CUDA_VSTD::uint32_t __taddr, _CUDA_VSTD::uint64_t __s_desc) +{ + static_assert(__cta_group == cta_group_1 || __cta_group == cta_group_2, ""); +# if _CCCL_CUDA_COMPILER(NVHPC) || __CUDA_ARCH_FEAT_SM100_ALL || __CUDA_ARCH_FEAT_SM101_ALL + _CCCL_IF_CONSTEXPR (__cta_group == cta_group_1) + { + asm("tcgen05.cp.cta_group::1.64x128b.warpx2::01_23.b8x16.b6x16_p32 [%0], %1;" + : + : "r"(__taddr), "l"(__s_desc) + : "memory"); + } + else _CCCL_IF_CONSTEXPR (__cta_group == cta_group_2) + { + asm("tcgen05.cp.cta_group::2.64x128b.warpx2::01_23.b8x16.b6x16_p32 [%0], %1;" + : + : "r"(__taddr), "l"(__s_desc) + : "memory"); + } +# else + // Unsupported architectures will have a linker error with a semi-decent error message + __cuda_ptx_tcgen05_cp_64x128b_warpx2_01_23_b8x16_b6x16_p32_is_not_supported_before_SM_100a_SM_101a__(); +# endif +} +#endif // __cccl_ptx_isa >= 860 + +/* +// tcgen05.cp.cta_group.32x128b.warpx4.b8x16.b6x16_p32 [taddr], s_desc; // PTX ISA 86, SM_100a, SM_101a +// .cta_group = { .cta_group::1, .cta_group::2 } +template +__device__ static inline void tcgen05_cp_32x128b_warpx4_b8x16_b6x16_p32( + cuda::ptx::cta_group_t cta_group, + uint32_t taddr, + uint64_t s_desc); +*/ +#if __cccl_ptx_isa >= 860 +extern "C" _CCCL_DEVICE void +__cuda_ptx_tcgen05_cp_32x128b_warpx4_b8x16_b6x16_p32_is_not_supported_before_SM_100a_SM_101a__(); +template +_CCCL_DEVICE static inline void tcgen05_cp_32x128b_warpx4_b8x16_b6x16_p32( + cta_group_t<_Cta_Group> __cta_group, _CUDA_VSTD::uint32_t __taddr, _CUDA_VSTD::uint64_t __s_desc) +{ + static_assert(__cta_group == cta_group_1 || __cta_group == cta_group_2, ""); +# if _CCCL_CUDA_COMPILER(NVHPC) || __CUDA_ARCH_FEAT_SM100_ALL || __CUDA_ARCH_FEAT_SM101_ALL + _CCCL_IF_CONSTEXPR (__cta_group == cta_group_1) + { + asm("tcgen05.cp.cta_group::1.32x128b.warpx4.b8x16.b6x16_p32 [%0], %1;" : : "r"(__taddr), "l"(__s_desc) : "memory"); + } + else _CCCL_IF_CONSTEXPR (__cta_group == cta_group_2) + { + asm("tcgen05.cp.cta_group::2.32x128b.warpx4.b8x16.b6x16_p32 [%0], %1;" : : "r"(__taddr), "l"(__s_desc) : "memory"); + } +# else + // Unsupported architectures will have a linker error with a semi-decent error message + __cuda_ptx_tcgen05_cp_32x128b_warpx4_b8x16_b6x16_p32_is_not_supported_before_SM_100a_SM_101a__(); +# endif +} +#endif // __cccl_ptx_isa >= 860 + +/* +// tcgen05.cp.cta_group.128x256b.b8x16.b4x16_p64 [taddr], s_desc; // PTX ISA 86, SM_100a, SM_101a +// .cta_group = { .cta_group::1, .cta_group::2 } +template +__device__ static inline void tcgen05_cp_128x256b_b8x16_b4x16_p64( + cuda::ptx::cta_group_t cta_group, + uint32_t taddr, + uint64_t s_desc); +*/ +#if __cccl_ptx_isa >= 860 +extern "C" _CCCL_DEVICE void __cuda_ptx_tcgen05_cp_128x256b_b8x16_b4x16_p64_is_not_supported_before_SM_100a_SM_101a__(); +template +_CCCL_DEVICE static inline void tcgen05_cp_128x256b_b8x16_b4x16_p64( + cta_group_t<_Cta_Group> __cta_group, _CUDA_VSTD::uint32_t __taddr, _CUDA_VSTD::uint64_t __s_desc) +{ + static_assert(__cta_group == cta_group_1 || __cta_group == cta_group_2, ""); +# if _CCCL_CUDA_COMPILER(NVHPC) || __CUDA_ARCH_FEAT_SM100_ALL || __CUDA_ARCH_FEAT_SM101_ALL + _CCCL_IF_CONSTEXPR (__cta_group == cta_group_1) + { + asm("tcgen05.cp.cta_group::1.128x256b.b8x16.b4x16_p64 [%0], %1;" : : "r"(__taddr), "l"(__s_desc) : "memory"); + } + else _CCCL_IF_CONSTEXPR (__cta_group == cta_group_2) + { + asm("tcgen05.cp.cta_group::2.128x256b.b8x16.b4x16_p64 [%0], %1;" : : "r"(__taddr), 
"l"(__s_desc) : "memory"); + } +# else + // Unsupported architectures will have a linker error with a semi-decent error message + __cuda_ptx_tcgen05_cp_128x256b_b8x16_b4x16_p64_is_not_supported_before_SM_100a_SM_101a__(); +# endif +} +#endif // __cccl_ptx_isa >= 860 + +/* +// tcgen05.cp.cta_group.4x256b.b8x16.b4x16_p64 [taddr], s_desc; // PTX ISA 86, SM_100a, SM_101a +// .cta_group = { .cta_group::1, .cta_group::2 } +template +__device__ static inline void tcgen05_cp_4x256b_b8x16_b4x16_p64( + cuda::ptx::cta_group_t cta_group, + uint32_t taddr, + uint64_t s_desc); +*/ +#if __cccl_ptx_isa >= 860 +extern "C" _CCCL_DEVICE void __cuda_ptx_tcgen05_cp_4x256b_b8x16_b4x16_p64_is_not_supported_before_SM_100a_SM_101a__(); +template +_CCCL_DEVICE static inline void tcgen05_cp_4x256b_b8x16_b4x16_p64( + cta_group_t<_Cta_Group> __cta_group, _CUDA_VSTD::uint32_t __taddr, _CUDA_VSTD::uint64_t __s_desc) +{ + static_assert(__cta_group == cta_group_1 || __cta_group == cta_group_2, ""); +# if _CCCL_CUDA_COMPILER(NVHPC) || __CUDA_ARCH_FEAT_SM100_ALL || __CUDA_ARCH_FEAT_SM101_ALL + _CCCL_IF_CONSTEXPR (__cta_group == cta_group_1) + { + asm("tcgen05.cp.cta_group::1.4x256b.b8x16.b4x16_p64 [%0], %1;" : : "r"(__taddr), "l"(__s_desc) : "memory"); + } + else _CCCL_IF_CONSTEXPR (__cta_group == cta_group_2) + { + asm("tcgen05.cp.cta_group::2.4x256b.b8x16.b4x16_p64 [%0], %1;" : : "r"(__taddr), "l"(__s_desc) : "memory"); + } +# else + // Unsupported architectures will have a linker error with a semi-decent error message + __cuda_ptx_tcgen05_cp_4x256b_b8x16_b4x16_p64_is_not_supported_before_SM_100a_SM_101a__(); +# endif +} +#endif // __cccl_ptx_isa >= 860 + +/* +// tcgen05.cp.cta_group.128x128b.b8x16.b4x16_p64 [taddr], s_desc; // PTX ISA 86, SM_100a, SM_101a +// .cta_group = { .cta_group::1, .cta_group::2 } +template +__device__ static inline void tcgen05_cp_128x128b_b8x16_b4x16_p64( + cuda::ptx::cta_group_t cta_group, + uint32_t taddr, + uint64_t s_desc); +*/ +#if __cccl_ptx_isa >= 860 +extern "C" _CCCL_DEVICE void __cuda_ptx_tcgen05_cp_128x128b_b8x16_b4x16_p64_is_not_supported_before_SM_100a_SM_101a__(); +template +_CCCL_DEVICE static inline void tcgen05_cp_128x128b_b8x16_b4x16_p64( + cta_group_t<_Cta_Group> __cta_group, _CUDA_VSTD::uint32_t __taddr, _CUDA_VSTD::uint64_t __s_desc) +{ + static_assert(__cta_group == cta_group_1 || __cta_group == cta_group_2, ""); +# if _CCCL_CUDA_COMPILER(NVHPC) || __CUDA_ARCH_FEAT_SM100_ALL || __CUDA_ARCH_FEAT_SM101_ALL + _CCCL_IF_CONSTEXPR (__cta_group == cta_group_1) + { + asm("tcgen05.cp.cta_group::1.128x128b.b8x16.b4x16_p64 [%0], %1;" : : "r"(__taddr), "l"(__s_desc) : "memory"); + } + else _CCCL_IF_CONSTEXPR (__cta_group == cta_group_2) + { + asm("tcgen05.cp.cta_group::2.128x128b.b8x16.b4x16_p64 [%0], %1;" : : "r"(__taddr), "l"(__s_desc) : "memory"); + } +# else + // Unsupported architectures will have a linker error with a semi-decent error message + __cuda_ptx_tcgen05_cp_128x128b_b8x16_b4x16_p64_is_not_supported_before_SM_100a_SM_101a__(); +# endif +} +#endif // __cccl_ptx_isa >= 860 + +/* +// tcgen05.cp.cta_group.64x128b.warpx2::02_13.b8x16.b4x16_p64 [taddr], s_desc; // PTX ISA 86, SM_100a, SM_101a +// .cta_group = { .cta_group::1, .cta_group::2 } +template +__device__ static inline void tcgen05_cp_64x128b_warpx2_02_13_b8x16_b4x16_p64( + cuda::ptx::cta_group_t cta_group, + uint32_t taddr, + uint64_t s_desc); +*/ +#if __cccl_ptx_isa >= 860 +extern "C" _CCCL_DEVICE void +__cuda_ptx_tcgen05_cp_64x128b_warpx2_02_13_b8x16_b4x16_p64_is_not_supported_before_SM_100a_SM_101a__(); +template 
+_CCCL_DEVICE static inline void tcgen05_cp_64x128b_warpx2_02_13_b8x16_b4x16_p64( + cta_group_t<_Cta_Group> __cta_group, _CUDA_VSTD::uint32_t __taddr, _CUDA_VSTD::uint64_t __s_desc) +{ + static_assert(__cta_group == cta_group_1 || __cta_group == cta_group_2, ""); +# if _CCCL_CUDA_COMPILER(NVHPC) || __CUDA_ARCH_FEAT_SM100_ALL || __CUDA_ARCH_FEAT_SM101_ALL + _CCCL_IF_CONSTEXPR (__cta_group == cta_group_1) + { + asm("tcgen05.cp.cta_group::1.64x128b.warpx2::02_13.b8x16.b4x16_p64 [%0], %1;" + : + : "r"(__taddr), "l"(__s_desc) + : "memory"); + } + else _CCCL_IF_CONSTEXPR (__cta_group == cta_group_2) + { + asm("tcgen05.cp.cta_group::2.64x128b.warpx2::02_13.b8x16.b4x16_p64 [%0], %1;" + : + : "r"(__taddr), "l"(__s_desc) + : "memory"); + } +# else + // Unsupported architectures will have a linker error with a semi-decent error message + __cuda_ptx_tcgen05_cp_64x128b_warpx2_02_13_b8x16_b4x16_p64_is_not_supported_before_SM_100a_SM_101a__(); +# endif +} +#endif // __cccl_ptx_isa >= 860 + +/* +// tcgen05.cp.cta_group.64x128b.warpx2::01_23.b8x16.b4x16_p64 [taddr], s_desc; // PTX ISA 86, SM_100a, SM_101a +// .cta_group = { .cta_group::1, .cta_group::2 } +template +__device__ static inline void tcgen05_cp_64x128b_warpx2_01_23_b8x16_b4x16_p64( + cuda::ptx::cta_group_t cta_group, + uint32_t taddr, + uint64_t s_desc); +*/ +#if __cccl_ptx_isa >= 860 +extern "C" _CCCL_DEVICE void +__cuda_ptx_tcgen05_cp_64x128b_warpx2_01_23_b8x16_b4x16_p64_is_not_supported_before_SM_100a_SM_101a__(); +template +_CCCL_DEVICE static inline void tcgen05_cp_64x128b_warpx2_01_23_b8x16_b4x16_p64( + cta_group_t<_Cta_Group> __cta_group, _CUDA_VSTD::uint32_t __taddr, _CUDA_VSTD::uint64_t __s_desc) +{ + static_assert(__cta_group == cta_group_1 || __cta_group == cta_group_2, ""); +# if _CCCL_CUDA_COMPILER(NVHPC) || __CUDA_ARCH_FEAT_SM100_ALL || __CUDA_ARCH_FEAT_SM101_ALL + _CCCL_IF_CONSTEXPR (__cta_group == cta_group_1) + { + asm("tcgen05.cp.cta_group::1.64x128b.warpx2::01_23.b8x16.b4x16_p64 [%0], %1;" + : + : "r"(__taddr), "l"(__s_desc) + : "memory"); + } + else _CCCL_IF_CONSTEXPR (__cta_group == cta_group_2) + { + asm("tcgen05.cp.cta_group::2.64x128b.warpx2::01_23.b8x16.b4x16_p64 [%0], %1;" + : + : "r"(__taddr), "l"(__s_desc) + : "memory"); + } +# else + // Unsupported architectures will have a linker error with a semi-decent error message + __cuda_ptx_tcgen05_cp_64x128b_warpx2_01_23_b8x16_b4x16_p64_is_not_supported_before_SM_100a_SM_101a__(); +# endif +} +#endif // __cccl_ptx_isa >= 860 + +/* +// tcgen05.cp.cta_group.32x128b.warpx4.b8x16.b4x16_p64 [taddr], s_desc; // PTX ISA 86, SM_100a, SM_101a +// .cta_group = { .cta_group::1, .cta_group::2 } +template +__device__ static inline void tcgen05_cp_32x128b_warpx4_b8x16_b4x16_p64( + cuda::ptx::cta_group_t cta_group, + uint32_t taddr, + uint64_t s_desc); +*/ +#if __cccl_ptx_isa >= 860 +extern "C" _CCCL_DEVICE void +__cuda_ptx_tcgen05_cp_32x128b_warpx4_b8x16_b4x16_p64_is_not_supported_before_SM_100a_SM_101a__(); +template +_CCCL_DEVICE static inline void tcgen05_cp_32x128b_warpx4_b8x16_b4x16_p64( + cta_group_t<_Cta_Group> __cta_group, _CUDA_VSTD::uint32_t __taddr, _CUDA_VSTD::uint64_t __s_desc) +{ + static_assert(__cta_group == cta_group_1 || __cta_group == cta_group_2, ""); +# if _CCCL_CUDA_COMPILER(NVHPC) || __CUDA_ARCH_FEAT_SM100_ALL || __CUDA_ARCH_FEAT_SM101_ALL + _CCCL_IF_CONSTEXPR (__cta_group == cta_group_1) + { + asm("tcgen05.cp.cta_group::1.32x128b.warpx4.b8x16.b4x16_p64 [%0], %1;" : : "r"(__taddr), "l"(__s_desc) : "memory"); + } + else _CCCL_IF_CONSTEXPR (__cta_group == cta_group_2) 
+ { + asm("tcgen05.cp.cta_group::2.32x128b.warpx4.b8x16.b4x16_p64 [%0], %1;" : : "r"(__taddr), "l"(__s_desc) : "memory"); + } +# else + // Unsupported architectures will have a linker error with a semi-decent error message + __cuda_ptx_tcgen05_cp_32x128b_warpx4_b8x16_b4x16_p64_is_not_supported_before_SM_100a_SM_101a__(); +# endif +} +#endif // __cccl_ptx_isa >= 860 + +#endif // _CUDA_PTX_GENERATED_TCGEN05_CP_H_ diff --git a/libcudacxx/include/cuda/__ptx/instructions/generated/tcgen05_fence.h b/libcudacxx/include/cuda/__ptx/instructions/generated/tcgen05_fence.h new file mode 100644 index 00000000000..efedcf86a57 --- /dev/null +++ b/libcudacxx/include/cuda/__ptx/instructions/generated/tcgen05_fence.h @@ -0,0 +1,44 @@ +// This file was automatically generated. Do not edit. + +#ifndef _CUDA_PTX_GENERATED_TCGEN05_FENCE_H_ +#define _CUDA_PTX_GENERATED_TCGEN05_FENCE_H_ + +/* +// tcgen05.fence::before_thread_sync; // PTX ISA 86, SM_100a, SM_101a +template +__device__ static inline void tcgen05_fence_before_thread_sync(); +*/ +#if __cccl_ptx_isa >= 860 +extern "C" _CCCL_DEVICE void __cuda_ptx_tcgen05_fence_before_thread_sync_is_not_supported_before_SM_100a_SM_101a__(); +template +_CCCL_DEVICE static inline void tcgen05_fence_before_thread_sync() +{ +# if _CCCL_CUDA_COMPILER(NVHPC) || __CUDA_ARCH_FEAT_SM100_ALL || __CUDA_ARCH_FEAT_SM101_ALL + asm volatile("tcgen05.fence::before_thread_sync;" : : : "memory"); +# else + // Unsupported architectures will have a linker error with a semi-decent error message + __cuda_ptx_tcgen05_fence_before_thread_sync_is_not_supported_before_SM_100a_SM_101a__(); +# endif +} +#endif // __cccl_ptx_isa >= 860 + +/* +// tcgen05.fence::after_thread_sync; // PTX ISA 86, SM_100a, SM_101a +template +__device__ static inline void tcgen05_fence_after_thread_sync(); +*/ +#if __cccl_ptx_isa >= 860 +extern "C" _CCCL_DEVICE void __cuda_ptx_tcgen05_fence_after_thread_sync_is_not_supported_before_SM_100a_SM_101a__(); +template +_CCCL_DEVICE static inline void tcgen05_fence_after_thread_sync() +{ +# if _CCCL_CUDA_COMPILER(NVHPC) || __CUDA_ARCH_FEAT_SM100_ALL || __CUDA_ARCH_FEAT_SM101_ALL + asm volatile("tcgen05.fence::after_thread_sync;" : : : "memory"); +# else + // Unsupported architectures will have a linker error with a semi-decent error message + __cuda_ptx_tcgen05_fence_after_thread_sync_is_not_supported_before_SM_100a_SM_101a__(); +# endif +} +#endif // __cccl_ptx_isa >= 860 + +#endif // _CUDA_PTX_GENERATED_TCGEN05_FENCE_H_ diff --git a/libcudacxx/include/cuda/__ptx/instructions/generated/tcgen05_ld.h b/libcudacxx/include/cuda/__ptx/instructions/generated/tcgen05_ld.h new file mode 100644 index 00000000000..e5ec1b686c2 --- /dev/null +++ b/libcudacxx/include/cuda/__ptx/instructions/generated/tcgen05_ld.h @@ -0,0 +1,4446 @@ +// This file was automatically generated. Do not edit. 
+ +#ifndef _CUDA_PTX_GENERATED_TCGEN05_LD_H_ +#define _CUDA_PTX_GENERATED_TCGEN05_LD_H_ + +/* +// tcgen05.ld.sync.aligned.16x64b.x1.b32 out, [taddr]; // PTX ISA 86, SM_100a, SM_101a +template = true> +__device__ static inline void tcgen05_ld_16x64b( + B32 (&out)[1], + uint32_t taddr); +*/ +#if __cccl_ptx_isa >= 860 +extern "C" _CCCL_DEVICE void __cuda_ptx_tcgen05_ld_16x64b_is_not_supported_before_SM_100a_SM_101a__(); +template = true> +_CCCL_DEVICE static inline void tcgen05_ld_16x64b(_B32 (&__out)[1], _CUDA_VSTD::uint32_t __taddr) +{ + static_assert(sizeof(_B32) == 4, ""); +# if _CCCL_CUDA_COMPILER(NVHPC) || __CUDA_ARCH_FEAT_SM100_ALL || __CUDA_ARCH_FEAT_SM101_ALL + asm("tcgen05.ld.sync.aligned.16x64b.x1.b32 {%0}, [%1];" : "=r"(__out[0]) : "r"(__taddr) : "memory"); +# else + // Unsupported architectures will have a linker error with a semi-decent error message + __cuda_ptx_tcgen05_ld_16x64b_is_not_supported_before_SM_100a_SM_101a__(); +# endif +} +#endif // __cccl_ptx_isa >= 860 + +/* +// tcgen05.ld.sync.aligned.16x64b.x1.pack::16b.b32 out, [taddr]; // PTX ISA 86, SM_100a, SM_101a +template = true> +__device__ static inline void tcgen05_ld_16x64b_pack_16b( + B32 (&out)[1], + uint32_t taddr); +*/ +#if __cccl_ptx_isa >= 860 +extern "C" _CCCL_DEVICE void __cuda_ptx_tcgen05_ld_16x64b_pack_16b_is_not_supported_before_SM_100a_SM_101a__(); +template = true> +_CCCL_DEVICE static inline void tcgen05_ld_16x64b_pack_16b(_B32 (&__out)[1], _CUDA_VSTD::uint32_t __taddr) +{ + static_assert(sizeof(_B32) == 4, ""); +# if _CCCL_CUDA_COMPILER(NVHPC) || __CUDA_ARCH_FEAT_SM100_ALL || __CUDA_ARCH_FEAT_SM101_ALL + asm("tcgen05.ld.sync.aligned.16x64b.x1.pack::16b.b32 {%0}, [%1];" : "=r"(__out[0]) : "r"(__taddr) : "memory"); +# else + // Unsupported architectures will have a linker error with a semi-decent error message + __cuda_ptx_tcgen05_ld_16x64b_pack_16b_is_not_supported_before_SM_100a_SM_101a__(); +# endif +} +#endif // __cccl_ptx_isa >= 860 + +/* +// tcgen05.ld.sync.aligned.16x64b.x2.b32 out, [taddr]; // PTX ISA 86, SM_100a, SM_101a +template = true> +__device__ static inline void tcgen05_ld_16x64b( + B32 (&out)[2], + uint32_t taddr); +*/ +#if __cccl_ptx_isa >= 860 +extern "C" _CCCL_DEVICE void __cuda_ptx_tcgen05_ld_16x64b_is_not_supported_before_SM_100a_SM_101a__(); +template = true> +_CCCL_DEVICE static inline void tcgen05_ld_16x64b(_B32 (&__out)[2], _CUDA_VSTD::uint32_t __taddr) +{ + static_assert(sizeof(_B32) == 4, ""); +# if _CCCL_CUDA_COMPILER(NVHPC) || __CUDA_ARCH_FEAT_SM100_ALL || __CUDA_ARCH_FEAT_SM101_ALL + asm("tcgen05.ld.sync.aligned.16x64b.x2.b32 {%0, %1}, [%2];" + : "=r"(__out[0]), "=r"(__out[1]) + : "r"(__taddr) + : "memory"); +# else + // Unsupported architectures will have a linker error with a semi-decent error message + __cuda_ptx_tcgen05_ld_16x64b_is_not_supported_before_SM_100a_SM_101a__(); +# endif +} +#endif // __cccl_ptx_isa >= 860 + +/* +// tcgen05.ld.sync.aligned.16x64b.x2.pack::16b.b32 out, [taddr]; // PTX ISA 86, SM_100a, SM_101a +template = true> +__device__ static inline void tcgen05_ld_16x64b_pack_16b( + B32 (&out)[2], + uint32_t taddr); +*/ +#if __cccl_ptx_isa >= 860 +extern "C" _CCCL_DEVICE void __cuda_ptx_tcgen05_ld_16x64b_pack_16b_is_not_supported_before_SM_100a_SM_101a__(); +template = true> +_CCCL_DEVICE static inline void tcgen05_ld_16x64b_pack_16b(_B32 (&__out)[2], _CUDA_VSTD::uint32_t __taddr) +{ + static_assert(sizeof(_B32) == 4, ""); +# if _CCCL_CUDA_COMPILER(NVHPC) || __CUDA_ARCH_FEAT_SM100_ALL || __CUDA_ARCH_FEAT_SM101_ALL + 
asm("tcgen05.ld.sync.aligned.16x64b.x2.pack::16b.b32 {%0, %1}, [%2];" + : "=r"(__out[0]), "=r"(__out[1]) + : "r"(__taddr) + : "memory"); +# else + // Unsupported architectures will have a linker error with a semi-decent error message + __cuda_ptx_tcgen05_ld_16x64b_pack_16b_is_not_supported_before_SM_100a_SM_101a__(); +# endif +} +#endif // __cccl_ptx_isa >= 860 + +/* +// tcgen05.ld.sync.aligned.16x64b.x4.b32 out, [taddr]; // PTX ISA 86, SM_100a, SM_101a +template = true> +__device__ static inline void tcgen05_ld_16x64b( + B32 (&out)[4], + uint32_t taddr); +*/ +#if __cccl_ptx_isa >= 860 +extern "C" _CCCL_DEVICE void __cuda_ptx_tcgen05_ld_16x64b_is_not_supported_before_SM_100a_SM_101a__(); +template = true> +_CCCL_DEVICE static inline void tcgen05_ld_16x64b(_B32 (&__out)[4], _CUDA_VSTD::uint32_t __taddr) +{ + static_assert(sizeof(_B32) == 4, ""); +# if _CCCL_CUDA_COMPILER(NVHPC) || __CUDA_ARCH_FEAT_SM100_ALL || __CUDA_ARCH_FEAT_SM101_ALL + asm("tcgen05.ld.sync.aligned.16x64b.x4.b32 {%0, %1, %2, %3}, [%4];" + : "=r"(__out[0]), "=r"(__out[1]), "=r"(__out[2]), "=r"(__out[3]) + : "r"(__taddr) + : "memory"); +# else + // Unsupported architectures will have a linker error with a semi-decent error message + __cuda_ptx_tcgen05_ld_16x64b_is_not_supported_before_SM_100a_SM_101a__(); +# endif +} +#endif // __cccl_ptx_isa >= 860 + +/* +// tcgen05.ld.sync.aligned.16x64b.x4.pack::16b.b32 out, [taddr]; // PTX ISA 86, SM_100a, SM_101a +template = true> +__device__ static inline void tcgen05_ld_16x64b_pack_16b( + B32 (&out)[4], + uint32_t taddr); +*/ +#if __cccl_ptx_isa >= 860 +extern "C" _CCCL_DEVICE void __cuda_ptx_tcgen05_ld_16x64b_pack_16b_is_not_supported_before_SM_100a_SM_101a__(); +template = true> +_CCCL_DEVICE static inline void tcgen05_ld_16x64b_pack_16b(_B32 (&__out)[4], _CUDA_VSTD::uint32_t __taddr) +{ + static_assert(sizeof(_B32) == 4, ""); +# if _CCCL_CUDA_COMPILER(NVHPC) || __CUDA_ARCH_FEAT_SM100_ALL || __CUDA_ARCH_FEAT_SM101_ALL + asm("tcgen05.ld.sync.aligned.16x64b.x4.pack::16b.b32 {%0, %1, %2, %3}, [%4];" + : "=r"(__out[0]), "=r"(__out[1]), "=r"(__out[2]), "=r"(__out[3]) + : "r"(__taddr) + : "memory"); +# else + // Unsupported architectures will have a linker error with a semi-decent error message + __cuda_ptx_tcgen05_ld_16x64b_pack_16b_is_not_supported_before_SM_100a_SM_101a__(); +# endif +} +#endif // __cccl_ptx_isa >= 860 + +/* +// tcgen05.ld.sync.aligned.16x64b.x8.b32 out, [taddr]; // PTX ISA 86, SM_100a, SM_101a +template = true> +__device__ static inline void tcgen05_ld_16x64b( + B32 (&out)[8], + uint32_t taddr); +*/ +#if __cccl_ptx_isa >= 860 +extern "C" _CCCL_DEVICE void __cuda_ptx_tcgen05_ld_16x64b_is_not_supported_before_SM_100a_SM_101a__(); +template = true> +_CCCL_DEVICE static inline void tcgen05_ld_16x64b(_B32 (&__out)[8], _CUDA_VSTD::uint32_t __taddr) +{ + static_assert(sizeof(_B32) == 4, ""); +# if _CCCL_CUDA_COMPILER(NVHPC) || __CUDA_ARCH_FEAT_SM100_ALL || __CUDA_ARCH_FEAT_SM101_ALL + asm("tcgen05.ld.sync.aligned.16x64b.x8.b32 {%0, %1, %2, %3, %4, %5, %6, %7}, [%8];" + : "=r"(__out[0]), + "=r"(__out[1]), + "=r"(__out[2]), + "=r"(__out[3]), + "=r"(__out[4]), + "=r"(__out[5]), + "=r"(__out[6]), + "=r"(__out[7]) + : "r"(__taddr) + : "memory"); +# else + // Unsupported architectures will have a linker error with a semi-decent error message + __cuda_ptx_tcgen05_ld_16x64b_is_not_supported_before_SM_100a_SM_101a__(); +# endif +} +#endif // __cccl_ptx_isa >= 860 + +/* +// tcgen05.ld.sync.aligned.16x64b.x8.pack::16b.b32 out, [taddr]; // PTX ISA 86, SM_100a, SM_101a +template = true> 
+__device__ static inline void tcgen05_ld_16x64b_pack_16b( + B32 (&out)[8], + uint32_t taddr); +*/ +#if __cccl_ptx_isa >= 860 +extern "C" _CCCL_DEVICE void __cuda_ptx_tcgen05_ld_16x64b_pack_16b_is_not_supported_before_SM_100a_SM_101a__(); +template = true> +_CCCL_DEVICE static inline void tcgen05_ld_16x64b_pack_16b(_B32 (&__out)[8], _CUDA_VSTD::uint32_t __taddr) +{ + static_assert(sizeof(_B32) == 4, ""); +# if _CCCL_CUDA_COMPILER(NVHPC) || __CUDA_ARCH_FEAT_SM100_ALL || __CUDA_ARCH_FEAT_SM101_ALL + asm("tcgen05.ld.sync.aligned.16x64b.x8.pack::16b.b32 {%0, %1, %2, %3, %4, %5, %6, %7}, [%8];" + : "=r"(__out[0]), + "=r"(__out[1]), + "=r"(__out[2]), + "=r"(__out[3]), + "=r"(__out[4]), + "=r"(__out[5]), + "=r"(__out[6]), + "=r"(__out[7]) + : "r"(__taddr) + : "memory"); +# else + // Unsupported architectures will have a linker error with a semi-decent error message + __cuda_ptx_tcgen05_ld_16x64b_pack_16b_is_not_supported_before_SM_100a_SM_101a__(); +# endif +} +#endif // __cccl_ptx_isa >= 860 + +/* +// tcgen05.ld.sync.aligned.16x64b.x16.b32 out, [taddr]; // PTX ISA 86, SM_100a, SM_101a +template = true> +__device__ static inline void tcgen05_ld_16x64b( + B32 (&out)[16], + uint32_t taddr); +*/ +#if __cccl_ptx_isa >= 860 +extern "C" _CCCL_DEVICE void __cuda_ptx_tcgen05_ld_16x64b_is_not_supported_before_SM_100a_SM_101a__(); +template = true> +_CCCL_DEVICE static inline void tcgen05_ld_16x64b(_B32 (&__out)[16], _CUDA_VSTD::uint32_t __taddr) +{ + static_assert(sizeof(_B32) == 4, ""); +# if _CCCL_CUDA_COMPILER(NVHPC) || __CUDA_ARCH_FEAT_SM100_ALL || __CUDA_ARCH_FEAT_SM101_ALL + asm("tcgen05.ld.sync.aligned.16x64b.x16.b32 {%0, %1, %2, %3, %4, %5, %6, %7, %8, %9, %10, %11, %12, %13, %14, %15}, " + "[%16];" + : "=r"(__out[0]), + "=r"(__out[1]), + "=r"(__out[2]), + "=r"(__out[3]), + "=r"(__out[4]), + "=r"(__out[5]), + "=r"(__out[6]), + "=r"(__out[7]), + "=r"(__out[8]), + "=r"(__out[9]), + "=r"(__out[10]), + "=r"(__out[11]), + "=r"(__out[12]), + "=r"(__out[13]), + "=r"(__out[14]), + "=r"(__out[15]) + : "r"(__taddr) + : "memory"); +# else + // Unsupported architectures will have a linker error with a semi-decent error message + __cuda_ptx_tcgen05_ld_16x64b_is_not_supported_before_SM_100a_SM_101a__(); +# endif +} +#endif // __cccl_ptx_isa >= 860 + +/* +// tcgen05.ld.sync.aligned.16x64b.x16.pack::16b.b32 out, [taddr]; // PTX ISA 86, SM_100a, SM_101a +template = true> +__device__ static inline void tcgen05_ld_16x64b_pack_16b( + B32 (&out)[16], + uint32_t taddr); +*/ +#if __cccl_ptx_isa >= 860 +extern "C" _CCCL_DEVICE void __cuda_ptx_tcgen05_ld_16x64b_pack_16b_is_not_supported_before_SM_100a_SM_101a__(); +template = true> +_CCCL_DEVICE static inline void tcgen05_ld_16x64b_pack_16b(_B32 (&__out)[16], _CUDA_VSTD::uint32_t __taddr) +{ + static_assert(sizeof(_B32) == 4, ""); +# if _CCCL_CUDA_COMPILER(NVHPC) || __CUDA_ARCH_FEAT_SM100_ALL || __CUDA_ARCH_FEAT_SM101_ALL + asm("tcgen05.ld.sync.aligned.16x64b.x16.pack::16b.b32 {%0, %1, %2, %3, %4, %5, %6, %7, %8, %9, %10, %11, %12, %13, " + "%14, %15}, [%16];" + : "=r"(__out[0]), + "=r"(__out[1]), + "=r"(__out[2]), + "=r"(__out[3]), + "=r"(__out[4]), + "=r"(__out[5]), + "=r"(__out[6]), + "=r"(__out[7]), + "=r"(__out[8]), + "=r"(__out[9]), + "=r"(__out[10]), + "=r"(__out[11]), + "=r"(__out[12]), + "=r"(__out[13]), + "=r"(__out[14]), + "=r"(__out[15]) + : "r"(__taddr) + : "memory"); +# else + // Unsupported architectures will have a linker error with a semi-decent error message + __cuda_ptx_tcgen05_ld_16x64b_pack_16b_is_not_supported_before_SM_100a_SM_101a__(); +# endif +} 
+#endif // __cccl_ptx_isa >= 860 + +/* +// tcgen05.ld.sync.aligned.16x64b.x32.b32 out, [taddr]; // PTX ISA 86, SM_100a, SM_101a +template = true> +__device__ static inline void tcgen05_ld_16x64b( + B32 (&out)[32], + uint32_t taddr); +*/ +#if __cccl_ptx_isa >= 860 +extern "C" _CCCL_DEVICE void __cuda_ptx_tcgen05_ld_16x64b_is_not_supported_before_SM_100a_SM_101a__(); +template = true> +_CCCL_DEVICE static inline void tcgen05_ld_16x64b(_B32 (&__out)[32], _CUDA_VSTD::uint32_t __taddr) +{ + static_assert(sizeof(_B32) == 4, ""); +# if _CCCL_CUDA_COMPILER(NVHPC) || __CUDA_ARCH_FEAT_SM100_ALL || __CUDA_ARCH_FEAT_SM101_ALL + asm( + "tcgen05.ld.sync.aligned.16x64b.x32.b32 {%0, %1, %2, %3, %4, %5, %6, %7, %8, %9, %10, %11, %12, %13, %14, %15, " + "%16, %17, %18, %19, %20, %21, %22, %23, %24, %25, %26, %27, %28, %29, %30, %31}, [%32];" + : "=r"(__out[0]), + "=r"(__out[1]), + "=r"(__out[2]), + "=r"(__out[3]), + "=r"(__out[4]), + "=r"(__out[5]), + "=r"(__out[6]), + "=r"(__out[7]), + "=r"(__out[8]), + "=r"(__out[9]), + "=r"(__out[10]), + "=r"(__out[11]), + "=r"(__out[12]), + "=r"(__out[13]), + "=r"(__out[14]), + "=r"(__out[15]), + "=r"(__out[16]), + "=r"(__out[17]), + "=r"(__out[18]), + "=r"(__out[19]), + "=r"(__out[20]), + "=r"(__out[21]), + "=r"(__out[22]), + "=r"(__out[23]), + "=r"(__out[24]), + "=r"(__out[25]), + "=r"(__out[26]), + "=r"(__out[27]), + "=r"(__out[28]), + "=r"(__out[29]), + "=r"(__out[30]), + "=r"(__out[31]) + : "r"(__taddr) + : "memory"); +# else + // Unsupported architectures will have a linker error with a semi-decent error message + __cuda_ptx_tcgen05_ld_16x64b_is_not_supported_before_SM_100a_SM_101a__(); +# endif +} +#endif // __cccl_ptx_isa >= 860 + +/* +// tcgen05.ld.sync.aligned.16x64b.x32.pack::16b.b32 out, [taddr]; // PTX ISA 86, SM_100a, SM_101a +template = true> +__device__ static inline void tcgen05_ld_16x64b_pack_16b( + B32 (&out)[32], + uint32_t taddr); +*/ +#if __cccl_ptx_isa >= 860 +extern "C" _CCCL_DEVICE void __cuda_ptx_tcgen05_ld_16x64b_pack_16b_is_not_supported_before_SM_100a_SM_101a__(); +template = true> +_CCCL_DEVICE static inline void tcgen05_ld_16x64b_pack_16b(_B32 (&__out)[32], _CUDA_VSTD::uint32_t __taddr) +{ + static_assert(sizeof(_B32) == 4, ""); +# if _CCCL_CUDA_COMPILER(NVHPC) || __CUDA_ARCH_FEAT_SM100_ALL || __CUDA_ARCH_FEAT_SM101_ALL + asm( + "tcgen05.ld.sync.aligned.16x64b.x32.pack::16b.b32 {%0, %1, %2, %3, %4, %5, %6, %7, %8, %9, %10, %11, %12, %13, " + "%14, %15, %16, %17, %18, %19, %20, %21, %22, %23, %24, %25, %26, %27, %28, %29, %30, %31}, [%32];" + : "=r"(__out[0]), + "=r"(__out[1]), + "=r"(__out[2]), + "=r"(__out[3]), + "=r"(__out[4]), + "=r"(__out[5]), + "=r"(__out[6]), + "=r"(__out[7]), + "=r"(__out[8]), + "=r"(__out[9]), + "=r"(__out[10]), + "=r"(__out[11]), + "=r"(__out[12]), + "=r"(__out[13]), + "=r"(__out[14]), + "=r"(__out[15]), + "=r"(__out[16]), + "=r"(__out[17]), + "=r"(__out[18]), + "=r"(__out[19]), + "=r"(__out[20]), + "=r"(__out[21]), + "=r"(__out[22]), + "=r"(__out[23]), + "=r"(__out[24]), + "=r"(__out[25]), + "=r"(__out[26]), + "=r"(__out[27]), + "=r"(__out[28]), + "=r"(__out[29]), + "=r"(__out[30]), + "=r"(__out[31]) + : "r"(__taddr) + : "memory"); +# else + // Unsupported architectures will have a linker error with a semi-decent error message + __cuda_ptx_tcgen05_ld_16x64b_pack_16b_is_not_supported_before_SM_100a_SM_101a__(); +# endif +} +#endif // __cccl_ptx_isa >= 860 + +/* +// tcgen05.ld.sync.aligned.16x64b.x64.b32 out, [taddr]; // PTX ISA 86, SM_100a, SM_101a +template = true> +__device__ static inline void 
tcgen05_ld_16x64b( + B32 (&out)[64], + uint32_t taddr); +*/ +#if __cccl_ptx_isa >= 860 +extern "C" _CCCL_DEVICE void __cuda_ptx_tcgen05_ld_16x64b_is_not_supported_before_SM_100a_SM_101a__(); +template = true> +_CCCL_DEVICE static inline void tcgen05_ld_16x64b(_B32 (&__out)[64], _CUDA_VSTD::uint32_t __taddr) +{ + static_assert(sizeof(_B32) == 4, ""); +# if _CCCL_CUDA_COMPILER(NVHPC) || __CUDA_ARCH_FEAT_SM100_ALL || __CUDA_ARCH_FEAT_SM101_ALL + asm( + "tcgen05.ld.sync.aligned.16x64b.x64.b32 {%0, %1, %2, %3, %4, %5, %6, %7, %8, %9, %10, %11, %12, %13, %14, %15, " + "%16, %17, %18, %19, %20, %21, %22, %23, %24, %25, %26, %27, %28, %29, %30, %31, %32, %33, %34, %35, %36, %37, " + "%38, %39, %40, %41, %42, %43, %44, %45, %46, %47, %48, %49, %50, %51, %52, %53, %54, %55, %56, %57, %58, %59, " + "%60, %61, %62, %63}, [%64];" + : "=r"(__out[0]), + "=r"(__out[1]), + "=r"(__out[2]), + "=r"(__out[3]), + "=r"(__out[4]), + "=r"(__out[5]), + "=r"(__out[6]), + "=r"(__out[7]), + "=r"(__out[8]), + "=r"(__out[9]), + "=r"(__out[10]), + "=r"(__out[11]), + "=r"(__out[12]), + "=r"(__out[13]), + "=r"(__out[14]), + "=r"(__out[15]), + "=r"(__out[16]), + "=r"(__out[17]), + "=r"(__out[18]), + "=r"(__out[19]), + "=r"(__out[20]), + "=r"(__out[21]), + "=r"(__out[22]), + "=r"(__out[23]), + "=r"(__out[24]), + "=r"(__out[25]), + "=r"(__out[26]), + "=r"(__out[27]), + "=r"(__out[28]), + "=r"(__out[29]), + "=r"(__out[30]), + "=r"(__out[31]), + "=r"(__out[32]), + "=r"(__out[33]), + "=r"(__out[34]), + "=r"(__out[35]), + "=r"(__out[36]), + "=r"(__out[37]), + "=r"(__out[38]), + "=r"(__out[39]), + "=r"(__out[40]), + "=r"(__out[41]), + "=r"(__out[42]), + "=r"(__out[43]), + "=r"(__out[44]), + "=r"(__out[45]), + "=r"(__out[46]), + "=r"(__out[47]), + "=r"(__out[48]), + "=r"(__out[49]), + "=r"(__out[50]), + "=r"(__out[51]), + "=r"(__out[52]), + "=r"(__out[53]), + "=r"(__out[54]), + "=r"(__out[55]), + "=r"(__out[56]), + "=r"(__out[57]), + "=r"(__out[58]), + "=r"(__out[59]), + "=r"(__out[60]), + "=r"(__out[61]), + "=r"(__out[62]), + "=r"(__out[63]) + : "r"(__taddr) + : "memory"); +# else + // Unsupported architectures will have a linker error with a semi-decent error message + __cuda_ptx_tcgen05_ld_16x64b_is_not_supported_before_SM_100a_SM_101a__(); +# endif +} +#endif // __cccl_ptx_isa >= 860 + +/* +// tcgen05.ld.sync.aligned.16x64b.x64.pack::16b.b32 out, [taddr]; // PTX ISA 86, SM_100a, SM_101a +template = true> +__device__ static inline void tcgen05_ld_16x64b_pack_16b( + B32 (&out)[64], + uint32_t taddr); +*/ +#if __cccl_ptx_isa >= 860 +extern "C" _CCCL_DEVICE void __cuda_ptx_tcgen05_ld_16x64b_pack_16b_is_not_supported_before_SM_100a_SM_101a__(); +template = true> +_CCCL_DEVICE static inline void tcgen05_ld_16x64b_pack_16b(_B32 (&__out)[64], _CUDA_VSTD::uint32_t __taddr) +{ + static_assert(sizeof(_B32) == 4, ""); +# if _CCCL_CUDA_COMPILER(NVHPC) || __CUDA_ARCH_FEAT_SM100_ALL || __CUDA_ARCH_FEAT_SM101_ALL + asm( + "tcgen05.ld.sync.aligned.16x64b.x64.pack::16b.b32 {%0, %1, %2, %3, %4, %5, %6, %7, %8, %9, %10, %11, %12, %13, " + "%14, %15, %16, %17, %18, %19, %20, %21, %22, %23, %24, %25, %26, %27, %28, %29, %30, %31, %32, %33, %34, %35, " + "%36, %37, %38, %39, %40, %41, %42, %43, %44, %45, %46, %47, %48, %49, %50, %51, %52, %53, %54, %55, %56, %57, " + "%58, %59, %60, %61, %62, %63}, [%64];" + : "=r"(__out[0]), + "=r"(__out[1]), + "=r"(__out[2]), + "=r"(__out[3]), + "=r"(__out[4]), + "=r"(__out[5]), + "=r"(__out[6]), + "=r"(__out[7]), + "=r"(__out[8]), + "=r"(__out[9]), + "=r"(__out[10]), + "=r"(__out[11]), + "=r"(__out[12]), + 
"=r"(__out[13]), + "=r"(__out[14]), + "=r"(__out[15]), + "=r"(__out[16]), + "=r"(__out[17]), + "=r"(__out[18]), + "=r"(__out[19]), + "=r"(__out[20]), + "=r"(__out[21]), + "=r"(__out[22]), + "=r"(__out[23]), + "=r"(__out[24]), + "=r"(__out[25]), + "=r"(__out[26]), + "=r"(__out[27]), + "=r"(__out[28]), + "=r"(__out[29]), + "=r"(__out[30]), + "=r"(__out[31]), + "=r"(__out[32]), + "=r"(__out[33]), + "=r"(__out[34]), + "=r"(__out[35]), + "=r"(__out[36]), + "=r"(__out[37]), + "=r"(__out[38]), + "=r"(__out[39]), + "=r"(__out[40]), + "=r"(__out[41]), + "=r"(__out[42]), + "=r"(__out[43]), + "=r"(__out[44]), + "=r"(__out[45]), + "=r"(__out[46]), + "=r"(__out[47]), + "=r"(__out[48]), + "=r"(__out[49]), + "=r"(__out[50]), + "=r"(__out[51]), + "=r"(__out[52]), + "=r"(__out[53]), + "=r"(__out[54]), + "=r"(__out[55]), + "=r"(__out[56]), + "=r"(__out[57]), + "=r"(__out[58]), + "=r"(__out[59]), + "=r"(__out[60]), + "=r"(__out[61]), + "=r"(__out[62]), + "=r"(__out[63]) + : "r"(__taddr) + : "memory"); +# else + // Unsupported architectures will have a linker error with a semi-decent error message + __cuda_ptx_tcgen05_ld_16x64b_pack_16b_is_not_supported_before_SM_100a_SM_101a__(); +# endif +} +#endif // __cccl_ptx_isa >= 860 + +/* +// tcgen05.ld.sync.aligned.16x64b.x128.b32 out, [taddr]; // PTX ISA 86, SM_100a, SM_101a +template = true> +__device__ static inline void tcgen05_ld_16x64b( + B32 (&out)[128], + uint32_t taddr); +*/ +#if __cccl_ptx_isa >= 860 +extern "C" _CCCL_DEVICE void __cuda_ptx_tcgen05_ld_16x64b_is_not_supported_before_SM_100a_SM_101a__(); +template = true> +_CCCL_DEVICE static inline void tcgen05_ld_16x64b(_B32 (&__out)[128], _CUDA_VSTD::uint32_t __taddr) +{ + static_assert(sizeof(_B32) == 4, ""); +# if _CCCL_CUDA_COMPILER(NVHPC) || __CUDA_ARCH_FEAT_SM100_ALL || __CUDA_ARCH_FEAT_SM101_ALL + asm( + "tcgen05.ld.sync.aligned.16x64b.x128.b32 {%0, %1, %2, %3, %4, %5, %6, %7, %8, %9, %10, %11, %12, %13, %14, %15, " + "%16, %17, %18, %19, %20, %21, %22, %23, %24, %25, %26, %27, %28, %29, %30, %31, %32, %33, %34, %35, %36, %37, " + "%38, %39, %40, %41, %42, %43, %44, %45, %46, %47, %48, %49, %50, %51, %52, %53, %54, %55, %56, %57, %58, %59, " + "%60, %61, %62, %63, %64, %65, %66, %67, %68, %69, %70, %71, %72, %73, %74, %75, %76, %77, %78, %79, %80, %81, " + "%82, %83, %84, %85, %86, %87, %88, %89, %90, %91, %92, %93, %94, %95, %96, %97, %98, %99, %100, %101, %102, %103, " + "%104, %105, %106, %107, %108, %109, %110, %111, %112, %113, %114, %115, %116, %117, %118, %119, %120, %121, %122, " + "%123, %124, %125, %126, %127}, [%128];" + : "=r"(__out[0]), + "=r"(__out[1]), + "=r"(__out[2]), + "=r"(__out[3]), + "=r"(__out[4]), + "=r"(__out[5]), + "=r"(__out[6]), + "=r"(__out[7]), + "=r"(__out[8]), + "=r"(__out[9]), + "=r"(__out[10]), + "=r"(__out[11]), + "=r"(__out[12]), + "=r"(__out[13]), + "=r"(__out[14]), + "=r"(__out[15]), + "=r"(__out[16]), + "=r"(__out[17]), + "=r"(__out[18]), + "=r"(__out[19]), + "=r"(__out[20]), + "=r"(__out[21]), + "=r"(__out[22]), + "=r"(__out[23]), + "=r"(__out[24]), + "=r"(__out[25]), + "=r"(__out[26]), + "=r"(__out[27]), + "=r"(__out[28]), + "=r"(__out[29]), + "=r"(__out[30]), + "=r"(__out[31]), + "=r"(__out[32]), + "=r"(__out[33]), + "=r"(__out[34]), + "=r"(__out[35]), + "=r"(__out[36]), + "=r"(__out[37]), + "=r"(__out[38]), + "=r"(__out[39]), + "=r"(__out[40]), + "=r"(__out[41]), + "=r"(__out[42]), + "=r"(__out[43]), + "=r"(__out[44]), + "=r"(__out[45]), + "=r"(__out[46]), + "=r"(__out[47]), + "=r"(__out[48]), + "=r"(__out[49]), + "=r"(__out[50]), + "=r"(__out[51]), + 
"=r"(__out[52]), + "=r"(__out[53]), + "=r"(__out[54]), + "=r"(__out[55]), + "=r"(__out[56]), + "=r"(__out[57]), + "=r"(__out[58]), + "=r"(__out[59]), + "=r"(__out[60]), + "=r"(__out[61]), + "=r"(__out[62]), + "=r"(__out[63]), + "=r"(__out[64]), + "=r"(__out[65]), + "=r"(__out[66]), + "=r"(__out[67]), + "=r"(__out[68]), + "=r"(__out[69]), + "=r"(__out[70]), + "=r"(__out[71]), + "=r"(__out[72]), + "=r"(__out[73]), + "=r"(__out[74]), + "=r"(__out[75]), + "=r"(__out[76]), + "=r"(__out[77]), + "=r"(__out[78]), + "=r"(__out[79]), + "=r"(__out[80]), + "=r"(__out[81]), + "=r"(__out[82]), + "=r"(__out[83]), + "=r"(__out[84]), + "=r"(__out[85]), + "=r"(__out[86]), + "=r"(__out[87]), + "=r"(__out[88]), + "=r"(__out[89]), + "=r"(__out[90]), + "=r"(__out[91]), + "=r"(__out[92]), + "=r"(__out[93]), + "=r"(__out[94]), + "=r"(__out[95]), + "=r"(__out[96]), + "=r"(__out[97]), + "=r"(__out[98]), + "=r"(__out[99]), + "=r"(__out[100]), + "=r"(__out[101]), + "=r"(__out[102]), + "=r"(__out[103]), + "=r"(__out[104]), + "=r"(__out[105]), + "=r"(__out[106]), + "=r"(__out[107]), + "=r"(__out[108]), + "=r"(__out[109]), + "=r"(__out[110]), + "=r"(__out[111]), + "=r"(__out[112]), + "=r"(__out[113]), + "=r"(__out[114]), + "=r"(__out[115]), + "=r"(__out[116]), + "=r"(__out[117]), + "=r"(__out[118]), + "=r"(__out[119]), + "=r"(__out[120]), + "=r"(__out[121]), + "=r"(__out[122]), + "=r"(__out[123]), + "=r"(__out[124]), + "=r"(__out[125]), + "=r"(__out[126]), + "=r"(__out[127]) + : "r"(__taddr) + : "memory"); +# else + // Unsupported architectures will have a linker error with a semi-decent error message + __cuda_ptx_tcgen05_ld_16x64b_is_not_supported_before_SM_100a_SM_101a__(); +# endif +} +#endif // __cccl_ptx_isa >= 860 + +/* +// tcgen05.ld.sync.aligned.16x64b.x128.pack::16b.b32 out, [taddr]; // PTX ISA 86, SM_100a, SM_101a +template = true> +__device__ static inline void tcgen05_ld_16x64b_pack_16b( + B32 (&out)[128], + uint32_t taddr); +*/ +#if __cccl_ptx_isa >= 860 +extern "C" _CCCL_DEVICE void __cuda_ptx_tcgen05_ld_16x64b_pack_16b_is_not_supported_before_SM_100a_SM_101a__(); +template = true> +_CCCL_DEVICE static inline void tcgen05_ld_16x64b_pack_16b(_B32 (&__out)[128], _CUDA_VSTD::uint32_t __taddr) +{ + static_assert(sizeof(_B32) == 4, ""); +# if _CCCL_CUDA_COMPILER(NVHPC) || __CUDA_ARCH_FEAT_SM100_ALL || __CUDA_ARCH_FEAT_SM101_ALL + asm( + "tcgen05.ld.sync.aligned.16x64b.x128.pack::16b.b32 {%0, %1, %2, %3, %4, %5, %6, %7, %8, %9, %10, %11, %12, %13, " + "%14, %15, %16, %17, %18, %19, %20, %21, %22, %23, %24, %25, %26, %27, %28, %29, %30, %31, %32, %33, %34, %35, " + "%36, %37, %38, %39, %40, %41, %42, %43, %44, %45, %46, %47, %48, %49, %50, %51, %52, %53, %54, %55, %56, %57, " + "%58, %59, %60, %61, %62, %63, %64, %65, %66, %67, %68, %69, %70, %71, %72, %73, %74, %75, %76, %77, %78, %79, " + "%80, %81, %82, %83, %84, %85, %86, %87, %88, %89, %90, %91, %92, %93, %94, %95, %96, %97, %98, %99, %100, %101, " + "%102, %103, %104, %105, %106, %107, %108, %109, %110, %111, %112, %113, %114, %115, %116, %117, %118, %119, %120, " + "%121, %122, %123, %124, %125, %126, %127}, [%128];" + : "=r"(__out[0]), + "=r"(__out[1]), + "=r"(__out[2]), + "=r"(__out[3]), + "=r"(__out[4]), + "=r"(__out[5]), + "=r"(__out[6]), + "=r"(__out[7]), + "=r"(__out[8]), + "=r"(__out[9]), + "=r"(__out[10]), + "=r"(__out[11]), + "=r"(__out[12]), + "=r"(__out[13]), + "=r"(__out[14]), + "=r"(__out[15]), + "=r"(__out[16]), + "=r"(__out[17]), + "=r"(__out[18]), + "=r"(__out[19]), + "=r"(__out[20]), + "=r"(__out[21]), + "=r"(__out[22]), + "=r"(__out[23]), 
+ "=r"(__out[24]), + "=r"(__out[25]), + "=r"(__out[26]), + "=r"(__out[27]), + "=r"(__out[28]), + "=r"(__out[29]), + "=r"(__out[30]), + "=r"(__out[31]), + "=r"(__out[32]), + "=r"(__out[33]), + "=r"(__out[34]), + "=r"(__out[35]), + "=r"(__out[36]), + "=r"(__out[37]), + "=r"(__out[38]), + "=r"(__out[39]), + "=r"(__out[40]), + "=r"(__out[41]), + "=r"(__out[42]), + "=r"(__out[43]), + "=r"(__out[44]), + "=r"(__out[45]), + "=r"(__out[46]), + "=r"(__out[47]), + "=r"(__out[48]), + "=r"(__out[49]), + "=r"(__out[50]), + "=r"(__out[51]), + "=r"(__out[52]), + "=r"(__out[53]), + "=r"(__out[54]), + "=r"(__out[55]), + "=r"(__out[56]), + "=r"(__out[57]), + "=r"(__out[58]), + "=r"(__out[59]), + "=r"(__out[60]), + "=r"(__out[61]), + "=r"(__out[62]), + "=r"(__out[63]), + "=r"(__out[64]), + "=r"(__out[65]), + "=r"(__out[66]), + "=r"(__out[67]), + "=r"(__out[68]), + "=r"(__out[69]), + "=r"(__out[70]), + "=r"(__out[71]), + "=r"(__out[72]), + "=r"(__out[73]), + "=r"(__out[74]), + "=r"(__out[75]), + "=r"(__out[76]), + "=r"(__out[77]), + "=r"(__out[78]), + "=r"(__out[79]), + "=r"(__out[80]), + "=r"(__out[81]), + "=r"(__out[82]), + "=r"(__out[83]), + "=r"(__out[84]), + "=r"(__out[85]), + "=r"(__out[86]), + "=r"(__out[87]), + "=r"(__out[88]), + "=r"(__out[89]), + "=r"(__out[90]), + "=r"(__out[91]), + "=r"(__out[92]), + "=r"(__out[93]), + "=r"(__out[94]), + "=r"(__out[95]), + "=r"(__out[96]), + "=r"(__out[97]), + "=r"(__out[98]), + "=r"(__out[99]), + "=r"(__out[100]), + "=r"(__out[101]), + "=r"(__out[102]), + "=r"(__out[103]), + "=r"(__out[104]), + "=r"(__out[105]), + "=r"(__out[106]), + "=r"(__out[107]), + "=r"(__out[108]), + "=r"(__out[109]), + "=r"(__out[110]), + "=r"(__out[111]), + "=r"(__out[112]), + "=r"(__out[113]), + "=r"(__out[114]), + "=r"(__out[115]), + "=r"(__out[116]), + "=r"(__out[117]), + "=r"(__out[118]), + "=r"(__out[119]), + "=r"(__out[120]), + "=r"(__out[121]), + "=r"(__out[122]), + "=r"(__out[123]), + "=r"(__out[124]), + "=r"(__out[125]), + "=r"(__out[126]), + "=r"(__out[127]) + : "r"(__taddr) + : "memory"); +# else + // Unsupported architectures will have a linker error with a semi-decent error message + __cuda_ptx_tcgen05_ld_16x64b_pack_16b_is_not_supported_before_SM_100a_SM_101a__(); +# endif +} +#endif // __cccl_ptx_isa >= 860 + +/* +// tcgen05.ld.sync.aligned.16x128b.x1.b32 out, [taddr]; // PTX ISA 86, SM_100a, SM_101a +template = true> +__device__ static inline void tcgen05_ld_16x128b( + B32 (&out)[2], + uint32_t taddr); +*/ +#if __cccl_ptx_isa >= 860 +extern "C" _CCCL_DEVICE void __cuda_ptx_tcgen05_ld_16x128b_is_not_supported_before_SM_100a_SM_101a__(); +template = true> +_CCCL_DEVICE static inline void tcgen05_ld_16x128b(_B32 (&__out)[2], _CUDA_VSTD::uint32_t __taddr) +{ + static_assert(sizeof(_B32) == 4, ""); +# if _CCCL_CUDA_COMPILER(NVHPC) || __CUDA_ARCH_FEAT_SM100_ALL || __CUDA_ARCH_FEAT_SM101_ALL + asm("tcgen05.ld.sync.aligned.16x128b.x1.b32 {%0, %1}, [%2];" + : "=r"(__out[0]), "=r"(__out[1]) + : "r"(__taddr) + : "memory"); +# else + // Unsupported architectures will have a linker error with a semi-decent error message + __cuda_ptx_tcgen05_ld_16x128b_is_not_supported_before_SM_100a_SM_101a__(); +# endif +} +#endif // __cccl_ptx_isa >= 860 + +/* +// tcgen05.ld.sync.aligned.16x128b.x1.pack::16b.b32 out, [taddr]; // PTX ISA 86, SM_100a, SM_101a +template = true> +__device__ static inline void tcgen05_ld_16x128b_pack_16b( + B32 (&out)[2], + uint32_t taddr); +*/ +#if __cccl_ptx_isa >= 860 +extern "C" _CCCL_DEVICE void 
__cuda_ptx_tcgen05_ld_16x128b_pack_16b_is_not_supported_before_SM_100a_SM_101a__(); +template = true> +_CCCL_DEVICE static inline void tcgen05_ld_16x128b_pack_16b(_B32 (&__out)[2], _CUDA_VSTD::uint32_t __taddr) +{ + static_assert(sizeof(_B32) == 4, ""); +# if _CCCL_CUDA_COMPILER(NVHPC) || __CUDA_ARCH_FEAT_SM100_ALL || __CUDA_ARCH_FEAT_SM101_ALL + asm("tcgen05.ld.sync.aligned.16x128b.x1.pack::16b.b32 {%0, %1}, [%2];" + : "=r"(__out[0]), "=r"(__out[1]) + : "r"(__taddr) + : "memory"); +# else + // Unsupported architectures will have a linker error with a semi-decent error message + __cuda_ptx_tcgen05_ld_16x128b_pack_16b_is_not_supported_before_SM_100a_SM_101a__(); +# endif +} +#endif // __cccl_ptx_isa >= 860 + +/* +// tcgen05.ld.sync.aligned.16x128b.x2.b32 out, [taddr]; // PTX ISA 86, SM_100a, SM_101a +template = true> +__device__ static inline void tcgen05_ld_16x128b( + B32 (&out)[4], + uint32_t taddr); +*/ +#if __cccl_ptx_isa >= 860 +extern "C" _CCCL_DEVICE void __cuda_ptx_tcgen05_ld_16x128b_is_not_supported_before_SM_100a_SM_101a__(); +template = true> +_CCCL_DEVICE static inline void tcgen05_ld_16x128b(_B32 (&__out)[4], _CUDA_VSTD::uint32_t __taddr) +{ + static_assert(sizeof(_B32) == 4, ""); +# if _CCCL_CUDA_COMPILER(NVHPC) || __CUDA_ARCH_FEAT_SM100_ALL || __CUDA_ARCH_FEAT_SM101_ALL + asm("tcgen05.ld.sync.aligned.16x128b.x2.b32 {%0, %1, %2, %3}, [%4];" + : "=r"(__out[0]), "=r"(__out[1]), "=r"(__out[2]), "=r"(__out[3]) + : "r"(__taddr) + : "memory"); +# else + // Unsupported architectures will have a linker error with a semi-decent error message + __cuda_ptx_tcgen05_ld_16x128b_is_not_supported_before_SM_100a_SM_101a__(); +# endif +} +#endif // __cccl_ptx_isa >= 860 + +/* +// tcgen05.ld.sync.aligned.16x128b.x2.pack::16b.b32 out, [taddr]; // PTX ISA 86, SM_100a, SM_101a +template = true> +__device__ static inline void tcgen05_ld_16x128b_pack_16b( + B32 (&out)[4], + uint32_t taddr); +*/ +#if __cccl_ptx_isa >= 860 +extern "C" _CCCL_DEVICE void __cuda_ptx_tcgen05_ld_16x128b_pack_16b_is_not_supported_before_SM_100a_SM_101a__(); +template = true> +_CCCL_DEVICE static inline void tcgen05_ld_16x128b_pack_16b(_B32 (&__out)[4], _CUDA_VSTD::uint32_t __taddr) +{ + static_assert(sizeof(_B32) == 4, ""); +# if _CCCL_CUDA_COMPILER(NVHPC) || __CUDA_ARCH_FEAT_SM100_ALL || __CUDA_ARCH_FEAT_SM101_ALL + asm("tcgen05.ld.sync.aligned.16x128b.x2.pack::16b.b32 {%0, %1, %2, %3}, [%4];" + : "=r"(__out[0]), "=r"(__out[1]), "=r"(__out[2]), "=r"(__out[3]) + : "r"(__taddr) + : "memory"); +# else + // Unsupported architectures will have a linker error with a semi-decent error message + __cuda_ptx_tcgen05_ld_16x128b_pack_16b_is_not_supported_before_SM_100a_SM_101a__(); +# endif +} +#endif // __cccl_ptx_isa >= 860 + +/* +// tcgen05.ld.sync.aligned.16x128b.x4.b32 out, [taddr]; // PTX ISA 86, SM_100a, SM_101a +template = true> +__device__ static inline void tcgen05_ld_16x128b( + B32 (&out)[8], + uint32_t taddr); +*/ +#if __cccl_ptx_isa >= 860 +extern "C" _CCCL_DEVICE void __cuda_ptx_tcgen05_ld_16x128b_is_not_supported_before_SM_100a_SM_101a__(); +template = true> +_CCCL_DEVICE static inline void tcgen05_ld_16x128b(_B32 (&__out)[8], _CUDA_VSTD::uint32_t __taddr) +{ + static_assert(sizeof(_B32) == 4, ""); +# if _CCCL_CUDA_COMPILER(NVHPC) || __CUDA_ARCH_FEAT_SM100_ALL || __CUDA_ARCH_FEAT_SM101_ALL + asm("tcgen05.ld.sync.aligned.16x128b.x4.b32 {%0, %1, %2, %3, %4, %5, %6, %7}, [%8];" + : "=r"(__out[0]), + "=r"(__out[1]), + "=r"(__out[2]), + "=r"(__out[3]), + "=r"(__out[4]), + "=r"(__out[5]), + "=r"(__out[6]), + "=r"(__out[7]) + : 
"r"(__taddr) + : "memory"); +# else + // Unsupported architectures will have a linker error with a semi-decent error message + __cuda_ptx_tcgen05_ld_16x128b_is_not_supported_before_SM_100a_SM_101a__(); +# endif +} +#endif // __cccl_ptx_isa >= 860 + +/* +// tcgen05.ld.sync.aligned.16x128b.x4.pack::16b.b32 out, [taddr]; // PTX ISA 86, SM_100a, SM_101a +template = true> +__device__ static inline void tcgen05_ld_16x128b_pack_16b( + B32 (&out)[8], + uint32_t taddr); +*/ +#if __cccl_ptx_isa >= 860 +extern "C" _CCCL_DEVICE void __cuda_ptx_tcgen05_ld_16x128b_pack_16b_is_not_supported_before_SM_100a_SM_101a__(); +template = true> +_CCCL_DEVICE static inline void tcgen05_ld_16x128b_pack_16b(_B32 (&__out)[8], _CUDA_VSTD::uint32_t __taddr) +{ + static_assert(sizeof(_B32) == 4, ""); +# if _CCCL_CUDA_COMPILER(NVHPC) || __CUDA_ARCH_FEAT_SM100_ALL || __CUDA_ARCH_FEAT_SM101_ALL + asm("tcgen05.ld.sync.aligned.16x128b.x4.pack::16b.b32 {%0, %1, %2, %3, %4, %5, %6, %7}, [%8];" + : "=r"(__out[0]), + "=r"(__out[1]), + "=r"(__out[2]), + "=r"(__out[3]), + "=r"(__out[4]), + "=r"(__out[5]), + "=r"(__out[6]), + "=r"(__out[7]) + : "r"(__taddr) + : "memory"); +# else + // Unsupported architectures will have a linker error with a semi-decent error message + __cuda_ptx_tcgen05_ld_16x128b_pack_16b_is_not_supported_before_SM_100a_SM_101a__(); +# endif +} +#endif // __cccl_ptx_isa >= 860 + +/* +// tcgen05.ld.sync.aligned.16x128b.x8.b32 out, [taddr]; // PTX ISA 86, SM_100a, SM_101a +template = true> +__device__ static inline void tcgen05_ld_16x128b( + B32 (&out)[16], + uint32_t taddr); +*/ +#if __cccl_ptx_isa >= 860 +extern "C" _CCCL_DEVICE void __cuda_ptx_tcgen05_ld_16x128b_is_not_supported_before_SM_100a_SM_101a__(); +template = true> +_CCCL_DEVICE static inline void tcgen05_ld_16x128b(_B32 (&__out)[16], _CUDA_VSTD::uint32_t __taddr) +{ + static_assert(sizeof(_B32) == 4, ""); +# if _CCCL_CUDA_COMPILER(NVHPC) || __CUDA_ARCH_FEAT_SM100_ALL || __CUDA_ARCH_FEAT_SM101_ALL + asm("tcgen05.ld.sync.aligned.16x128b.x8.b32 {%0, %1, %2, %3, %4, %5, %6, %7, %8, %9, %10, %11, %12, %13, %14, %15}, " + "[%16];" + : "=r"(__out[0]), + "=r"(__out[1]), + "=r"(__out[2]), + "=r"(__out[3]), + "=r"(__out[4]), + "=r"(__out[5]), + "=r"(__out[6]), + "=r"(__out[7]), + "=r"(__out[8]), + "=r"(__out[9]), + "=r"(__out[10]), + "=r"(__out[11]), + "=r"(__out[12]), + "=r"(__out[13]), + "=r"(__out[14]), + "=r"(__out[15]) + : "r"(__taddr) + : "memory"); +# else + // Unsupported architectures will have a linker error with a semi-decent error message + __cuda_ptx_tcgen05_ld_16x128b_is_not_supported_before_SM_100a_SM_101a__(); +# endif +} +#endif // __cccl_ptx_isa >= 860 + +/* +// tcgen05.ld.sync.aligned.16x128b.x8.pack::16b.b32 out, [taddr]; // PTX ISA 86, SM_100a, SM_101a +template = true> +__device__ static inline void tcgen05_ld_16x128b_pack_16b( + B32 (&out)[16], + uint32_t taddr); +*/ +#if __cccl_ptx_isa >= 860 +extern "C" _CCCL_DEVICE void __cuda_ptx_tcgen05_ld_16x128b_pack_16b_is_not_supported_before_SM_100a_SM_101a__(); +template = true> +_CCCL_DEVICE static inline void tcgen05_ld_16x128b_pack_16b(_B32 (&__out)[16], _CUDA_VSTD::uint32_t __taddr) +{ + static_assert(sizeof(_B32) == 4, ""); +# if _CCCL_CUDA_COMPILER(NVHPC) || __CUDA_ARCH_FEAT_SM100_ALL || __CUDA_ARCH_FEAT_SM101_ALL + asm("tcgen05.ld.sync.aligned.16x128b.x8.pack::16b.b32 {%0, %1, %2, %3, %4, %5, %6, %7, %8, %9, %10, %11, %12, %13, " + "%14, %15}, [%16];" + : "=r"(__out[0]), + "=r"(__out[1]), + "=r"(__out[2]), + "=r"(__out[3]), + "=r"(__out[4]), + "=r"(__out[5]), + "=r"(__out[6]), + 
"=r"(__out[7]), + "=r"(__out[8]), + "=r"(__out[9]), + "=r"(__out[10]), + "=r"(__out[11]), + "=r"(__out[12]), + "=r"(__out[13]), + "=r"(__out[14]), + "=r"(__out[15]) + : "r"(__taddr) + : "memory"); +# else + // Unsupported architectures will have a linker error with a semi-decent error message + __cuda_ptx_tcgen05_ld_16x128b_pack_16b_is_not_supported_before_SM_100a_SM_101a__(); +# endif +} +#endif // __cccl_ptx_isa >= 860 + +/* +// tcgen05.ld.sync.aligned.16x128b.x16.b32 out, [taddr]; // PTX ISA 86, SM_100a, SM_101a +template = true> +__device__ static inline void tcgen05_ld_16x128b( + B32 (&out)[32], + uint32_t taddr); +*/ +#if __cccl_ptx_isa >= 860 +extern "C" _CCCL_DEVICE void __cuda_ptx_tcgen05_ld_16x128b_is_not_supported_before_SM_100a_SM_101a__(); +template = true> +_CCCL_DEVICE static inline void tcgen05_ld_16x128b(_B32 (&__out)[32], _CUDA_VSTD::uint32_t __taddr) +{ + static_assert(sizeof(_B32) == 4, ""); +# if _CCCL_CUDA_COMPILER(NVHPC) || __CUDA_ARCH_FEAT_SM100_ALL || __CUDA_ARCH_FEAT_SM101_ALL + asm( + "tcgen05.ld.sync.aligned.16x128b.x16.b32 {%0, %1, %2, %3, %4, %5, %6, %7, %8, %9, %10, %11, %12, %13, %14, %15, " + "%16, %17, %18, %19, %20, %21, %22, %23, %24, %25, %26, %27, %28, %29, %30, %31}, [%32];" + : "=r"(__out[0]), + "=r"(__out[1]), + "=r"(__out[2]), + "=r"(__out[3]), + "=r"(__out[4]), + "=r"(__out[5]), + "=r"(__out[6]), + "=r"(__out[7]), + "=r"(__out[8]), + "=r"(__out[9]), + "=r"(__out[10]), + "=r"(__out[11]), + "=r"(__out[12]), + "=r"(__out[13]), + "=r"(__out[14]), + "=r"(__out[15]), + "=r"(__out[16]), + "=r"(__out[17]), + "=r"(__out[18]), + "=r"(__out[19]), + "=r"(__out[20]), + "=r"(__out[21]), + "=r"(__out[22]), + "=r"(__out[23]), + "=r"(__out[24]), + "=r"(__out[25]), + "=r"(__out[26]), + "=r"(__out[27]), + "=r"(__out[28]), + "=r"(__out[29]), + "=r"(__out[30]), + "=r"(__out[31]) + : "r"(__taddr) + : "memory"); +# else + // Unsupported architectures will have a linker error with a semi-decent error message + __cuda_ptx_tcgen05_ld_16x128b_is_not_supported_before_SM_100a_SM_101a__(); +# endif +} +#endif // __cccl_ptx_isa >= 860 + +/* +// tcgen05.ld.sync.aligned.16x128b.x16.pack::16b.b32 out, [taddr]; // PTX ISA 86, SM_100a, SM_101a +template = true> +__device__ static inline void tcgen05_ld_16x128b_pack_16b( + B32 (&out)[32], + uint32_t taddr); +*/ +#if __cccl_ptx_isa >= 860 +extern "C" _CCCL_DEVICE void __cuda_ptx_tcgen05_ld_16x128b_pack_16b_is_not_supported_before_SM_100a_SM_101a__(); +template = true> +_CCCL_DEVICE static inline void tcgen05_ld_16x128b_pack_16b(_B32 (&__out)[32], _CUDA_VSTD::uint32_t __taddr) +{ + static_assert(sizeof(_B32) == 4, ""); +# if _CCCL_CUDA_COMPILER(NVHPC) || __CUDA_ARCH_FEAT_SM100_ALL || __CUDA_ARCH_FEAT_SM101_ALL + asm( + "tcgen05.ld.sync.aligned.16x128b.x16.pack::16b.b32 {%0, %1, %2, %3, %4, %5, %6, %7, %8, %9, %10, %11, %12, %13, " + "%14, %15, %16, %17, %18, %19, %20, %21, %22, %23, %24, %25, %26, %27, %28, %29, %30, %31}, [%32];" + : "=r"(__out[0]), + "=r"(__out[1]), + "=r"(__out[2]), + "=r"(__out[3]), + "=r"(__out[4]), + "=r"(__out[5]), + "=r"(__out[6]), + "=r"(__out[7]), + "=r"(__out[8]), + "=r"(__out[9]), + "=r"(__out[10]), + "=r"(__out[11]), + "=r"(__out[12]), + "=r"(__out[13]), + "=r"(__out[14]), + "=r"(__out[15]), + "=r"(__out[16]), + "=r"(__out[17]), + "=r"(__out[18]), + "=r"(__out[19]), + "=r"(__out[20]), + "=r"(__out[21]), + "=r"(__out[22]), + "=r"(__out[23]), + "=r"(__out[24]), + "=r"(__out[25]), + "=r"(__out[26]), + "=r"(__out[27]), + "=r"(__out[28]), + "=r"(__out[29]), + "=r"(__out[30]), + "=r"(__out[31]) + : "r"(__taddr) 
+ : "memory"); +# else + // Unsupported architectures will have a linker error with a semi-decent error message + __cuda_ptx_tcgen05_ld_16x128b_pack_16b_is_not_supported_before_SM_100a_SM_101a__(); +# endif +} +#endif // __cccl_ptx_isa >= 860 + +/* +// tcgen05.ld.sync.aligned.16x128b.x32.b32 out, [taddr]; // PTX ISA 86, SM_100a, SM_101a +template = true> +__device__ static inline void tcgen05_ld_16x128b( + B32 (&out)[64], + uint32_t taddr); +*/ +#if __cccl_ptx_isa >= 860 +extern "C" _CCCL_DEVICE void __cuda_ptx_tcgen05_ld_16x128b_is_not_supported_before_SM_100a_SM_101a__(); +template = true> +_CCCL_DEVICE static inline void tcgen05_ld_16x128b(_B32 (&__out)[64], _CUDA_VSTD::uint32_t __taddr) +{ + static_assert(sizeof(_B32) == 4, ""); +# if _CCCL_CUDA_COMPILER(NVHPC) || __CUDA_ARCH_FEAT_SM100_ALL || __CUDA_ARCH_FEAT_SM101_ALL + asm( + "tcgen05.ld.sync.aligned.16x128b.x32.b32 {%0, %1, %2, %3, %4, %5, %6, %7, %8, %9, %10, %11, %12, %13, %14, %15, " + "%16, %17, %18, %19, %20, %21, %22, %23, %24, %25, %26, %27, %28, %29, %30, %31, %32, %33, %34, %35, %36, %37, " + "%38, %39, %40, %41, %42, %43, %44, %45, %46, %47, %48, %49, %50, %51, %52, %53, %54, %55, %56, %57, %58, %59, " + "%60, %61, %62, %63}, [%64];" + : "=r"(__out[0]), + "=r"(__out[1]), + "=r"(__out[2]), + "=r"(__out[3]), + "=r"(__out[4]), + "=r"(__out[5]), + "=r"(__out[6]), + "=r"(__out[7]), + "=r"(__out[8]), + "=r"(__out[9]), + "=r"(__out[10]), + "=r"(__out[11]), + "=r"(__out[12]), + "=r"(__out[13]), + "=r"(__out[14]), + "=r"(__out[15]), + "=r"(__out[16]), + "=r"(__out[17]), + "=r"(__out[18]), + "=r"(__out[19]), + "=r"(__out[20]), + "=r"(__out[21]), + "=r"(__out[22]), + "=r"(__out[23]), + "=r"(__out[24]), + "=r"(__out[25]), + "=r"(__out[26]), + "=r"(__out[27]), + "=r"(__out[28]), + "=r"(__out[29]), + "=r"(__out[30]), + "=r"(__out[31]), + "=r"(__out[32]), + "=r"(__out[33]), + "=r"(__out[34]), + "=r"(__out[35]), + "=r"(__out[36]), + "=r"(__out[37]), + "=r"(__out[38]), + "=r"(__out[39]), + "=r"(__out[40]), + "=r"(__out[41]), + "=r"(__out[42]), + "=r"(__out[43]), + "=r"(__out[44]), + "=r"(__out[45]), + "=r"(__out[46]), + "=r"(__out[47]), + "=r"(__out[48]), + "=r"(__out[49]), + "=r"(__out[50]), + "=r"(__out[51]), + "=r"(__out[52]), + "=r"(__out[53]), + "=r"(__out[54]), + "=r"(__out[55]), + "=r"(__out[56]), + "=r"(__out[57]), + "=r"(__out[58]), + "=r"(__out[59]), + "=r"(__out[60]), + "=r"(__out[61]), + "=r"(__out[62]), + "=r"(__out[63]) + : "r"(__taddr) + : "memory"); +# else + // Unsupported architectures will have a linker error with a semi-decent error message + __cuda_ptx_tcgen05_ld_16x128b_is_not_supported_before_SM_100a_SM_101a__(); +# endif +} +#endif // __cccl_ptx_isa >= 860 + +/* +// tcgen05.ld.sync.aligned.16x128b.x32.pack::16b.b32 out, [taddr]; // PTX ISA 86, SM_100a, SM_101a +template = true> +__device__ static inline void tcgen05_ld_16x128b_pack_16b( + B32 (&out)[64], + uint32_t taddr); +*/ +#if __cccl_ptx_isa >= 860 +extern "C" _CCCL_DEVICE void __cuda_ptx_tcgen05_ld_16x128b_pack_16b_is_not_supported_before_SM_100a_SM_101a__(); +template = true> +_CCCL_DEVICE static inline void tcgen05_ld_16x128b_pack_16b(_B32 (&__out)[64], _CUDA_VSTD::uint32_t __taddr) +{ + static_assert(sizeof(_B32) == 4, ""); +# if _CCCL_CUDA_COMPILER(NVHPC) || __CUDA_ARCH_FEAT_SM100_ALL || __CUDA_ARCH_FEAT_SM101_ALL + asm( + "tcgen05.ld.sync.aligned.16x128b.x32.pack::16b.b32 {%0, %1, %2, %3, %4, %5, %6, %7, %8, %9, %10, %11, %12, %13, " + "%14, %15, %16, %17, %18, %19, %20, %21, %22, %23, %24, %25, %26, %27, %28, %29, %30, %31, %32, %33, %34, %35, " + "%36, 
%37, %38, %39, %40, %41, %42, %43, %44, %45, %46, %47, %48, %49, %50, %51, %52, %53, %54, %55, %56, %57, " + "%58, %59, %60, %61, %62, %63}, [%64];" + : "=r"(__out[0]), + "=r"(__out[1]), + "=r"(__out[2]), + "=r"(__out[3]), + "=r"(__out[4]), + "=r"(__out[5]), + "=r"(__out[6]), + "=r"(__out[7]), + "=r"(__out[8]), + "=r"(__out[9]), + "=r"(__out[10]), + "=r"(__out[11]), + "=r"(__out[12]), + "=r"(__out[13]), + "=r"(__out[14]), + "=r"(__out[15]), + "=r"(__out[16]), + "=r"(__out[17]), + "=r"(__out[18]), + "=r"(__out[19]), + "=r"(__out[20]), + "=r"(__out[21]), + "=r"(__out[22]), + "=r"(__out[23]), + "=r"(__out[24]), + "=r"(__out[25]), + "=r"(__out[26]), + "=r"(__out[27]), + "=r"(__out[28]), + "=r"(__out[29]), + "=r"(__out[30]), + "=r"(__out[31]), + "=r"(__out[32]), + "=r"(__out[33]), + "=r"(__out[34]), + "=r"(__out[35]), + "=r"(__out[36]), + "=r"(__out[37]), + "=r"(__out[38]), + "=r"(__out[39]), + "=r"(__out[40]), + "=r"(__out[41]), + "=r"(__out[42]), + "=r"(__out[43]), + "=r"(__out[44]), + "=r"(__out[45]), + "=r"(__out[46]), + "=r"(__out[47]), + "=r"(__out[48]), + "=r"(__out[49]), + "=r"(__out[50]), + "=r"(__out[51]), + "=r"(__out[52]), + "=r"(__out[53]), + "=r"(__out[54]), + "=r"(__out[55]), + "=r"(__out[56]), + "=r"(__out[57]), + "=r"(__out[58]), + "=r"(__out[59]), + "=r"(__out[60]), + "=r"(__out[61]), + "=r"(__out[62]), + "=r"(__out[63]) + : "r"(__taddr) + : "memory"); +# else + // Unsupported architectures will have a linker error with a semi-decent error message + __cuda_ptx_tcgen05_ld_16x128b_pack_16b_is_not_supported_before_SM_100a_SM_101a__(); +# endif +} +#endif // __cccl_ptx_isa >= 860 + +/* +// tcgen05.ld.sync.aligned.16x128b.x64.b32 out, [taddr]; // PTX ISA 86, SM_100a, SM_101a +template = true> +__device__ static inline void tcgen05_ld_16x128b( + B32 (&out)[128], + uint32_t taddr); +*/ +#if __cccl_ptx_isa >= 860 +extern "C" _CCCL_DEVICE void __cuda_ptx_tcgen05_ld_16x128b_is_not_supported_before_SM_100a_SM_101a__(); +template = true> +_CCCL_DEVICE static inline void tcgen05_ld_16x128b(_B32 (&__out)[128], _CUDA_VSTD::uint32_t __taddr) +{ + static_assert(sizeof(_B32) == 4, ""); +# if _CCCL_CUDA_COMPILER(NVHPC) || __CUDA_ARCH_FEAT_SM100_ALL || __CUDA_ARCH_FEAT_SM101_ALL + asm( + "tcgen05.ld.sync.aligned.16x128b.x64.b32 {%0, %1, %2, %3, %4, %5, %6, %7, %8, %9, %10, %11, %12, %13, %14, %15, " + "%16, %17, %18, %19, %20, %21, %22, %23, %24, %25, %26, %27, %28, %29, %30, %31, %32, %33, %34, %35, %36, %37, " + "%38, %39, %40, %41, %42, %43, %44, %45, %46, %47, %48, %49, %50, %51, %52, %53, %54, %55, %56, %57, %58, %59, " + "%60, %61, %62, %63, %64, %65, %66, %67, %68, %69, %70, %71, %72, %73, %74, %75, %76, %77, %78, %79, %80, %81, " + "%82, %83, %84, %85, %86, %87, %88, %89, %90, %91, %92, %93, %94, %95, %96, %97, %98, %99, %100, %101, %102, %103, " + "%104, %105, %106, %107, %108, %109, %110, %111, %112, %113, %114, %115, %116, %117, %118, %119, %120, %121, %122, " + "%123, %124, %125, %126, %127}, [%128];" + : "=r"(__out[0]), + "=r"(__out[1]), + "=r"(__out[2]), + "=r"(__out[3]), + "=r"(__out[4]), + "=r"(__out[5]), + "=r"(__out[6]), + "=r"(__out[7]), + "=r"(__out[8]), + "=r"(__out[9]), + "=r"(__out[10]), + "=r"(__out[11]), + "=r"(__out[12]), + "=r"(__out[13]), + "=r"(__out[14]), + "=r"(__out[15]), + "=r"(__out[16]), + "=r"(__out[17]), + "=r"(__out[18]), + "=r"(__out[19]), + "=r"(__out[20]), + "=r"(__out[21]), + "=r"(__out[22]), + "=r"(__out[23]), + "=r"(__out[24]), + "=r"(__out[25]), + "=r"(__out[26]), + "=r"(__out[27]), + "=r"(__out[28]), + "=r"(__out[29]), + "=r"(__out[30]), + 
"=r"(__out[31]), + "=r"(__out[32]), + "=r"(__out[33]), + "=r"(__out[34]), + "=r"(__out[35]), + "=r"(__out[36]), + "=r"(__out[37]), + "=r"(__out[38]), + "=r"(__out[39]), + "=r"(__out[40]), + "=r"(__out[41]), + "=r"(__out[42]), + "=r"(__out[43]), + "=r"(__out[44]), + "=r"(__out[45]), + "=r"(__out[46]), + "=r"(__out[47]), + "=r"(__out[48]), + "=r"(__out[49]), + "=r"(__out[50]), + "=r"(__out[51]), + "=r"(__out[52]), + "=r"(__out[53]), + "=r"(__out[54]), + "=r"(__out[55]), + "=r"(__out[56]), + "=r"(__out[57]), + "=r"(__out[58]), + "=r"(__out[59]), + "=r"(__out[60]), + "=r"(__out[61]), + "=r"(__out[62]), + "=r"(__out[63]), + "=r"(__out[64]), + "=r"(__out[65]), + "=r"(__out[66]), + "=r"(__out[67]), + "=r"(__out[68]), + "=r"(__out[69]), + "=r"(__out[70]), + "=r"(__out[71]), + "=r"(__out[72]), + "=r"(__out[73]), + "=r"(__out[74]), + "=r"(__out[75]), + "=r"(__out[76]), + "=r"(__out[77]), + "=r"(__out[78]), + "=r"(__out[79]), + "=r"(__out[80]), + "=r"(__out[81]), + "=r"(__out[82]), + "=r"(__out[83]), + "=r"(__out[84]), + "=r"(__out[85]), + "=r"(__out[86]), + "=r"(__out[87]), + "=r"(__out[88]), + "=r"(__out[89]), + "=r"(__out[90]), + "=r"(__out[91]), + "=r"(__out[92]), + "=r"(__out[93]), + "=r"(__out[94]), + "=r"(__out[95]), + "=r"(__out[96]), + "=r"(__out[97]), + "=r"(__out[98]), + "=r"(__out[99]), + "=r"(__out[100]), + "=r"(__out[101]), + "=r"(__out[102]), + "=r"(__out[103]), + "=r"(__out[104]), + "=r"(__out[105]), + "=r"(__out[106]), + "=r"(__out[107]), + "=r"(__out[108]), + "=r"(__out[109]), + "=r"(__out[110]), + "=r"(__out[111]), + "=r"(__out[112]), + "=r"(__out[113]), + "=r"(__out[114]), + "=r"(__out[115]), + "=r"(__out[116]), + "=r"(__out[117]), + "=r"(__out[118]), + "=r"(__out[119]), + "=r"(__out[120]), + "=r"(__out[121]), + "=r"(__out[122]), + "=r"(__out[123]), + "=r"(__out[124]), + "=r"(__out[125]), + "=r"(__out[126]), + "=r"(__out[127]) + : "r"(__taddr) + : "memory"); +# else + // Unsupported architectures will have a linker error with a semi-decent error message + __cuda_ptx_tcgen05_ld_16x128b_is_not_supported_before_SM_100a_SM_101a__(); +# endif +} +#endif // __cccl_ptx_isa >= 860 + +/* +// tcgen05.ld.sync.aligned.16x128b.x64.pack::16b.b32 out, [taddr]; // PTX ISA 86, SM_100a, SM_101a +template = true> +__device__ static inline void tcgen05_ld_16x128b_pack_16b( + B32 (&out)[128], + uint32_t taddr); +*/ +#if __cccl_ptx_isa >= 860 +extern "C" _CCCL_DEVICE void __cuda_ptx_tcgen05_ld_16x128b_pack_16b_is_not_supported_before_SM_100a_SM_101a__(); +template = true> +_CCCL_DEVICE static inline void tcgen05_ld_16x128b_pack_16b(_B32 (&__out)[128], _CUDA_VSTD::uint32_t __taddr) +{ + static_assert(sizeof(_B32) == 4, ""); +# if _CCCL_CUDA_COMPILER(NVHPC) || __CUDA_ARCH_FEAT_SM100_ALL || __CUDA_ARCH_FEAT_SM101_ALL + asm( + "tcgen05.ld.sync.aligned.16x128b.x64.pack::16b.b32 {%0, %1, %2, %3, %4, %5, %6, %7, %8, %9, %10, %11, %12, %13, " + "%14, %15, %16, %17, %18, %19, %20, %21, %22, %23, %24, %25, %26, %27, %28, %29, %30, %31, %32, %33, %34, %35, " + "%36, %37, %38, %39, %40, %41, %42, %43, %44, %45, %46, %47, %48, %49, %50, %51, %52, %53, %54, %55, %56, %57, " + "%58, %59, %60, %61, %62, %63, %64, %65, %66, %67, %68, %69, %70, %71, %72, %73, %74, %75, %76, %77, %78, %79, " + "%80, %81, %82, %83, %84, %85, %86, %87, %88, %89, %90, %91, %92, %93, %94, %95, %96, %97, %98, %99, %100, %101, " + "%102, %103, %104, %105, %106, %107, %108, %109, %110, %111, %112, %113, %114, %115, %116, %117, %118, %119, %120, " + "%121, %122, %123, %124, %125, %126, %127}, [%128];" + : "=r"(__out[0]), + "=r"(__out[1]), + 
"=r"(__out[2]), + "=r"(__out[3]), + "=r"(__out[4]), + "=r"(__out[5]), + "=r"(__out[6]), + "=r"(__out[7]), + "=r"(__out[8]), + "=r"(__out[9]), + "=r"(__out[10]), + "=r"(__out[11]), + "=r"(__out[12]), + "=r"(__out[13]), + "=r"(__out[14]), + "=r"(__out[15]), + "=r"(__out[16]), + "=r"(__out[17]), + "=r"(__out[18]), + "=r"(__out[19]), + "=r"(__out[20]), + "=r"(__out[21]), + "=r"(__out[22]), + "=r"(__out[23]), + "=r"(__out[24]), + "=r"(__out[25]), + "=r"(__out[26]), + "=r"(__out[27]), + "=r"(__out[28]), + "=r"(__out[29]), + "=r"(__out[30]), + "=r"(__out[31]), + "=r"(__out[32]), + "=r"(__out[33]), + "=r"(__out[34]), + "=r"(__out[35]), + "=r"(__out[36]), + "=r"(__out[37]), + "=r"(__out[38]), + "=r"(__out[39]), + "=r"(__out[40]), + "=r"(__out[41]), + "=r"(__out[42]), + "=r"(__out[43]), + "=r"(__out[44]), + "=r"(__out[45]), + "=r"(__out[46]), + "=r"(__out[47]), + "=r"(__out[48]), + "=r"(__out[49]), + "=r"(__out[50]), + "=r"(__out[51]), + "=r"(__out[52]), + "=r"(__out[53]), + "=r"(__out[54]), + "=r"(__out[55]), + "=r"(__out[56]), + "=r"(__out[57]), + "=r"(__out[58]), + "=r"(__out[59]), + "=r"(__out[60]), + "=r"(__out[61]), + "=r"(__out[62]), + "=r"(__out[63]), + "=r"(__out[64]), + "=r"(__out[65]), + "=r"(__out[66]), + "=r"(__out[67]), + "=r"(__out[68]), + "=r"(__out[69]), + "=r"(__out[70]), + "=r"(__out[71]), + "=r"(__out[72]), + "=r"(__out[73]), + "=r"(__out[74]), + "=r"(__out[75]), + "=r"(__out[76]), + "=r"(__out[77]), + "=r"(__out[78]), + "=r"(__out[79]), + "=r"(__out[80]), + "=r"(__out[81]), + "=r"(__out[82]), + "=r"(__out[83]), + "=r"(__out[84]), + "=r"(__out[85]), + "=r"(__out[86]), + "=r"(__out[87]), + "=r"(__out[88]), + "=r"(__out[89]), + "=r"(__out[90]), + "=r"(__out[91]), + "=r"(__out[92]), + "=r"(__out[93]), + "=r"(__out[94]), + "=r"(__out[95]), + "=r"(__out[96]), + "=r"(__out[97]), + "=r"(__out[98]), + "=r"(__out[99]), + "=r"(__out[100]), + "=r"(__out[101]), + "=r"(__out[102]), + "=r"(__out[103]), + "=r"(__out[104]), + "=r"(__out[105]), + "=r"(__out[106]), + "=r"(__out[107]), + "=r"(__out[108]), + "=r"(__out[109]), + "=r"(__out[110]), + "=r"(__out[111]), + "=r"(__out[112]), + "=r"(__out[113]), + "=r"(__out[114]), + "=r"(__out[115]), + "=r"(__out[116]), + "=r"(__out[117]), + "=r"(__out[118]), + "=r"(__out[119]), + "=r"(__out[120]), + "=r"(__out[121]), + "=r"(__out[122]), + "=r"(__out[123]), + "=r"(__out[124]), + "=r"(__out[125]), + "=r"(__out[126]), + "=r"(__out[127]) + : "r"(__taddr) + : "memory"); +# else + // Unsupported architectures will have a linker error with a semi-decent error message + __cuda_ptx_tcgen05_ld_16x128b_pack_16b_is_not_supported_before_SM_100a_SM_101a__(); +# endif +} +#endif // __cccl_ptx_isa >= 860 + +/* +// tcgen05.ld.sync.aligned.16x256b.x1.b32 out, [taddr]; // PTX ISA 86, SM_100a, SM_101a +template = true> +__device__ static inline void tcgen05_ld_16x256b( + B32 (&out)[4], + uint32_t taddr); +*/ +#if __cccl_ptx_isa >= 860 +extern "C" _CCCL_DEVICE void __cuda_ptx_tcgen05_ld_16x256b_is_not_supported_before_SM_100a_SM_101a__(); +template = true> +_CCCL_DEVICE static inline void tcgen05_ld_16x256b(_B32 (&__out)[4], _CUDA_VSTD::uint32_t __taddr) +{ + static_assert(sizeof(_B32) == 4, ""); +# if _CCCL_CUDA_COMPILER(NVHPC) || __CUDA_ARCH_FEAT_SM100_ALL || __CUDA_ARCH_FEAT_SM101_ALL + asm("tcgen05.ld.sync.aligned.16x256b.x1.b32 {%0, %1, %2, %3}, [%4];" + : "=r"(__out[0]), "=r"(__out[1]), "=r"(__out[2]), "=r"(__out[3]) + : "r"(__taddr) + : "memory"); +# else + // Unsupported architectures will have a linker error with a semi-decent error message + 
__cuda_ptx_tcgen05_ld_16x256b_is_not_supported_before_SM_100a_SM_101a__(); +# endif +} +#endif // __cccl_ptx_isa >= 860 + +/* +// tcgen05.ld.sync.aligned.16x256b.x1.pack::16b.b32 out, [taddr]; // PTX ISA 86, SM_100a, SM_101a +template = true> +__device__ static inline void tcgen05_ld_16x256b_pack_16b( + B32 (&out)[4], + uint32_t taddr); +*/ +#if __cccl_ptx_isa >= 860 +extern "C" _CCCL_DEVICE void __cuda_ptx_tcgen05_ld_16x256b_pack_16b_is_not_supported_before_SM_100a_SM_101a__(); +template = true> +_CCCL_DEVICE static inline void tcgen05_ld_16x256b_pack_16b(_B32 (&__out)[4], _CUDA_VSTD::uint32_t __taddr) +{ + static_assert(sizeof(_B32) == 4, ""); +# if _CCCL_CUDA_COMPILER(NVHPC) || __CUDA_ARCH_FEAT_SM100_ALL || __CUDA_ARCH_FEAT_SM101_ALL + asm("tcgen05.ld.sync.aligned.16x256b.x1.pack::16b.b32 {%0, %1, %2, %3}, [%4];" + : "=r"(__out[0]), "=r"(__out[1]), "=r"(__out[2]), "=r"(__out[3]) + : "r"(__taddr) + : "memory"); +# else + // Unsupported architectures will have a linker error with a semi-decent error message + __cuda_ptx_tcgen05_ld_16x256b_pack_16b_is_not_supported_before_SM_100a_SM_101a__(); +# endif +} +#endif // __cccl_ptx_isa >= 860 + +/* +// tcgen05.ld.sync.aligned.16x256b.x2.b32 out, [taddr]; // PTX ISA 86, SM_100a, SM_101a +template = true> +__device__ static inline void tcgen05_ld_16x256b( + B32 (&out)[8], + uint32_t taddr); +*/ +#if __cccl_ptx_isa >= 860 +extern "C" _CCCL_DEVICE void __cuda_ptx_tcgen05_ld_16x256b_is_not_supported_before_SM_100a_SM_101a__(); +template = true> +_CCCL_DEVICE static inline void tcgen05_ld_16x256b(_B32 (&__out)[8], _CUDA_VSTD::uint32_t __taddr) +{ + static_assert(sizeof(_B32) == 4, ""); +# if _CCCL_CUDA_COMPILER(NVHPC) || __CUDA_ARCH_FEAT_SM100_ALL || __CUDA_ARCH_FEAT_SM101_ALL + asm("tcgen05.ld.sync.aligned.16x256b.x2.b32 {%0, %1, %2, %3, %4, %5, %6, %7}, [%8];" + : "=r"(__out[0]), + "=r"(__out[1]), + "=r"(__out[2]), + "=r"(__out[3]), + "=r"(__out[4]), + "=r"(__out[5]), + "=r"(__out[6]), + "=r"(__out[7]) + : "r"(__taddr) + : "memory"); +# else + // Unsupported architectures will have a linker error with a semi-decent error message + __cuda_ptx_tcgen05_ld_16x256b_is_not_supported_before_SM_100a_SM_101a__(); +# endif +} +#endif // __cccl_ptx_isa >= 860 + +/* +// tcgen05.ld.sync.aligned.16x256b.x2.pack::16b.b32 out, [taddr]; // PTX ISA 86, SM_100a, SM_101a +template = true> +__device__ static inline void tcgen05_ld_16x256b_pack_16b( + B32 (&out)[8], + uint32_t taddr); +*/ +#if __cccl_ptx_isa >= 860 +extern "C" _CCCL_DEVICE void __cuda_ptx_tcgen05_ld_16x256b_pack_16b_is_not_supported_before_SM_100a_SM_101a__(); +template = true> +_CCCL_DEVICE static inline void tcgen05_ld_16x256b_pack_16b(_B32 (&__out)[8], _CUDA_VSTD::uint32_t __taddr) +{ + static_assert(sizeof(_B32) == 4, ""); +# if _CCCL_CUDA_COMPILER(NVHPC) || __CUDA_ARCH_FEAT_SM100_ALL || __CUDA_ARCH_FEAT_SM101_ALL + asm("tcgen05.ld.sync.aligned.16x256b.x2.pack::16b.b32 {%0, %1, %2, %3, %4, %5, %6, %7}, [%8];" + : "=r"(__out[0]), + "=r"(__out[1]), + "=r"(__out[2]), + "=r"(__out[3]), + "=r"(__out[4]), + "=r"(__out[5]), + "=r"(__out[6]), + "=r"(__out[7]) + : "r"(__taddr) + : "memory"); +# else + // Unsupported architectures will have a linker error with a semi-decent error message + __cuda_ptx_tcgen05_ld_16x256b_pack_16b_is_not_supported_before_SM_100a_SM_101a__(); +# endif +} +#endif // __cccl_ptx_isa >= 860 + +/* +// tcgen05.ld.sync.aligned.16x256b.x4.b32 out, [taddr]; // PTX ISA 86, SM_100a, SM_101a +template = true> +__device__ static inline void tcgen05_ld_16x256b( + B32 (&out)[16], + uint32_t 
taddr); +*/ +#if __cccl_ptx_isa >= 860 +extern "C" _CCCL_DEVICE void __cuda_ptx_tcgen05_ld_16x256b_is_not_supported_before_SM_100a_SM_101a__(); +template = true> +_CCCL_DEVICE static inline void tcgen05_ld_16x256b(_B32 (&__out)[16], _CUDA_VSTD::uint32_t __taddr) +{ + static_assert(sizeof(_B32) == 4, ""); +# if _CCCL_CUDA_COMPILER(NVHPC) || __CUDA_ARCH_FEAT_SM100_ALL || __CUDA_ARCH_FEAT_SM101_ALL + asm("tcgen05.ld.sync.aligned.16x256b.x4.b32 {%0, %1, %2, %3, %4, %5, %6, %7, %8, %9, %10, %11, %12, %13, %14, %15}, " + "[%16];" + : "=r"(__out[0]), + "=r"(__out[1]), + "=r"(__out[2]), + "=r"(__out[3]), + "=r"(__out[4]), + "=r"(__out[5]), + "=r"(__out[6]), + "=r"(__out[7]), + "=r"(__out[8]), + "=r"(__out[9]), + "=r"(__out[10]), + "=r"(__out[11]), + "=r"(__out[12]), + "=r"(__out[13]), + "=r"(__out[14]), + "=r"(__out[15]) + : "r"(__taddr) + : "memory"); +# else + // Unsupported architectures will have a linker error with a semi-decent error message + __cuda_ptx_tcgen05_ld_16x256b_is_not_supported_before_SM_100a_SM_101a__(); +# endif +} +#endif // __cccl_ptx_isa >= 860 + +/* +// tcgen05.ld.sync.aligned.16x256b.x4.pack::16b.b32 out, [taddr]; // PTX ISA 86, SM_100a, SM_101a +template = true> +__device__ static inline void tcgen05_ld_16x256b_pack_16b( + B32 (&out)[16], + uint32_t taddr); +*/ +#if __cccl_ptx_isa >= 860 +extern "C" _CCCL_DEVICE void __cuda_ptx_tcgen05_ld_16x256b_pack_16b_is_not_supported_before_SM_100a_SM_101a__(); +template = true> +_CCCL_DEVICE static inline void tcgen05_ld_16x256b_pack_16b(_B32 (&__out)[16], _CUDA_VSTD::uint32_t __taddr) +{ + static_assert(sizeof(_B32) == 4, ""); +# if _CCCL_CUDA_COMPILER(NVHPC) || __CUDA_ARCH_FEAT_SM100_ALL || __CUDA_ARCH_FEAT_SM101_ALL + asm("tcgen05.ld.sync.aligned.16x256b.x4.pack::16b.b32 {%0, %1, %2, %3, %4, %5, %6, %7, %8, %9, %10, %11, %12, %13, " + "%14, %15}, [%16];" + : "=r"(__out[0]), + "=r"(__out[1]), + "=r"(__out[2]), + "=r"(__out[3]), + "=r"(__out[4]), + "=r"(__out[5]), + "=r"(__out[6]), + "=r"(__out[7]), + "=r"(__out[8]), + "=r"(__out[9]), + "=r"(__out[10]), + "=r"(__out[11]), + "=r"(__out[12]), + "=r"(__out[13]), + "=r"(__out[14]), + "=r"(__out[15]) + : "r"(__taddr) + : "memory"); +# else + // Unsupported architectures will have a linker error with a semi-decent error message + __cuda_ptx_tcgen05_ld_16x256b_pack_16b_is_not_supported_before_SM_100a_SM_101a__(); +# endif +} +#endif // __cccl_ptx_isa >= 860 + +/* +// tcgen05.ld.sync.aligned.16x256b.x8.b32 out, [taddr]; // PTX ISA 86, SM_100a, SM_101a +template = true> +__device__ static inline void tcgen05_ld_16x256b( + B32 (&out)[32], + uint32_t taddr); +*/ +#if __cccl_ptx_isa >= 860 +extern "C" _CCCL_DEVICE void __cuda_ptx_tcgen05_ld_16x256b_is_not_supported_before_SM_100a_SM_101a__(); +template = true> +_CCCL_DEVICE static inline void tcgen05_ld_16x256b(_B32 (&__out)[32], _CUDA_VSTD::uint32_t __taddr) +{ + static_assert(sizeof(_B32) == 4, ""); +# if _CCCL_CUDA_COMPILER(NVHPC) || __CUDA_ARCH_FEAT_SM100_ALL || __CUDA_ARCH_FEAT_SM101_ALL + asm( + "tcgen05.ld.sync.aligned.16x256b.x8.b32 {%0, %1, %2, %3, %4, %5, %6, %7, %8, %9, %10, %11, %12, %13, %14, %15, " + "%16, %17, %18, %19, %20, %21, %22, %23, %24, %25, %26, %27, %28, %29, %30, %31}, [%32];" + : "=r"(__out[0]), + "=r"(__out[1]), + "=r"(__out[2]), + "=r"(__out[3]), + "=r"(__out[4]), + "=r"(__out[5]), + "=r"(__out[6]), + "=r"(__out[7]), + "=r"(__out[8]), + "=r"(__out[9]), + "=r"(__out[10]), + "=r"(__out[11]), + "=r"(__out[12]), + "=r"(__out[13]), + "=r"(__out[14]), + "=r"(__out[15]), + "=r"(__out[16]), + "=r"(__out[17]), + 
"=r"(__out[18]), + "=r"(__out[19]), + "=r"(__out[20]), + "=r"(__out[21]), + "=r"(__out[22]), + "=r"(__out[23]), + "=r"(__out[24]), + "=r"(__out[25]), + "=r"(__out[26]), + "=r"(__out[27]), + "=r"(__out[28]), + "=r"(__out[29]), + "=r"(__out[30]), + "=r"(__out[31]) + : "r"(__taddr) + : "memory"); +# else + // Unsupported architectures will have a linker error with a semi-decent error message + __cuda_ptx_tcgen05_ld_16x256b_is_not_supported_before_SM_100a_SM_101a__(); +# endif +} +#endif // __cccl_ptx_isa >= 860 + +/* +// tcgen05.ld.sync.aligned.16x256b.x8.pack::16b.b32 out, [taddr]; // PTX ISA 86, SM_100a, SM_101a +template = true> +__device__ static inline void tcgen05_ld_16x256b_pack_16b( + B32 (&out)[32], + uint32_t taddr); +*/ +#if __cccl_ptx_isa >= 860 +extern "C" _CCCL_DEVICE void __cuda_ptx_tcgen05_ld_16x256b_pack_16b_is_not_supported_before_SM_100a_SM_101a__(); +template = true> +_CCCL_DEVICE static inline void tcgen05_ld_16x256b_pack_16b(_B32 (&__out)[32], _CUDA_VSTD::uint32_t __taddr) +{ + static_assert(sizeof(_B32) == 4, ""); +# if _CCCL_CUDA_COMPILER(NVHPC) || __CUDA_ARCH_FEAT_SM100_ALL || __CUDA_ARCH_FEAT_SM101_ALL + asm( + "tcgen05.ld.sync.aligned.16x256b.x8.pack::16b.b32 {%0, %1, %2, %3, %4, %5, %6, %7, %8, %9, %10, %11, %12, %13, " + "%14, %15, %16, %17, %18, %19, %20, %21, %22, %23, %24, %25, %26, %27, %28, %29, %30, %31}, [%32];" + : "=r"(__out[0]), + "=r"(__out[1]), + "=r"(__out[2]), + "=r"(__out[3]), + "=r"(__out[4]), + "=r"(__out[5]), + "=r"(__out[6]), + "=r"(__out[7]), + "=r"(__out[8]), + "=r"(__out[9]), + "=r"(__out[10]), + "=r"(__out[11]), + "=r"(__out[12]), + "=r"(__out[13]), + "=r"(__out[14]), + "=r"(__out[15]), + "=r"(__out[16]), + "=r"(__out[17]), + "=r"(__out[18]), + "=r"(__out[19]), + "=r"(__out[20]), + "=r"(__out[21]), + "=r"(__out[22]), + "=r"(__out[23]), + "=r"(__out[24]), + "=r"(__out[25]), + "=r"(__out[26]), + "=r"(__out[27]), + "=r"(__out[28]), + "=r"(__out[29]), + "=r"(__out[30]), + "=r"(__out[31]) + : "r"(__taddr) + : "memory"); +# else + // Unsupported architectures will have a linker error with a semi-decent error message + __cuda_ptx_tcgen05_ld_16x256b_pack_16b_is_not_supported_before_SM_100a_SM_101a__(); +# endif +} +#endif // __cccl_ptx_isa >= 860 + +/* +// tcgen05.ld.sync.aligned.16x256b.x16.b32 out, [taddr]; // PTX ISA 86, SM_100a, SM_101a +template = true> +__device__ static inline void tcgen05_ld_16x256b( + B32 (&out)[64], + uint32_t taddr); +*/ +#if __cccl_ptx_isa >= 860 +extern "C" _CCCL_DEVICE void __cuda_ptx_tcgen05_ld_16x256b_is_not_supported_before_SM_100a_SM_101a__(); +template = true> +_CCCL_DEVICE static inline void tcgen05_ld_16x256b(_B32 (&__out)[64], _CUDA_VSTD::uint32_t __taddr) +{ + static_assert(sizeof(_B32) == 4, ""); +# if _CCCL_CUDA_COMPILER(NVHPC) || __CUDA_ARCH_FEAT_SM100_ALL || __CUDA_ARCH_FEAT_SM101_ALL + asm( + "tcgen05.ld.sync.aligned.16x256b.x16.b32 {%0, %1, %2, %3, %4, %5, %6, %7, %8, %9, %10, %11, %12, %13, %14, %15, " + "%16, %17, %18, %19, %20, %21, %22, %23, %24, %25, %26, %27, %28, %29, %30, %31, %32, %33, %34, %35, %36, %37, " + "%38, %39, %40, %41, %42, %43, %44, %45, %46, %47, %48, %49, %50, %51, %52, %53, %54, %55, %56, %57, %58, %59, " + "%60, %61, %62, %63}, [%64];" + : "=r"(__out[0]), + "=r"(__out[1]), + "=r"(__out[2]), + "=r"(__out[3]), + "=r"(__out[4]), + "=r"(__out[5]), + "=r"(__out[6]), + "=r"(__out[7]), + "=r"(__out[8]), + "=r"(__out[9]), + "=r"(__out[10]), + "=r"(__out[11]), + "=r"(__out[12]), + "=r"(__out[13]), + "=r"(__out[14]), + "=r"(__out[15]), + "=r"(__out[16]), + "=r"(__out[17]), + 
"=r"(__out[18]), + "=r"(__out[19]), + "=r"(__out[20]), + "=r"(__out[21]), + "=r"(__out[22]), + "=r"(__out[23]), + "=r"(__out[24]), + "=r"(__out[25]), + "=r"(__out[26]), + "=r"(__out[27]), + "=r"(__out[28]), + "=r"(__out[29]), + "=r"(__out[30]), + "=r"(__out[31]), + "=r"(__out[32]), + "=r"(__out[33]), + "=r"(__out[34]), + "=r"(__out[35]), + "=r"(__out[36]), + "=r"(__out[37]), + "=r"(__out[38]), + "=r"(__out[39]), + "=r"(__out[40]), + "=r"(__out[41]), + "=r"(__out[42]), + "=r"(__out[43]), + "=r"(__out[44]), + "=r"(__out[45]), + "=r"(__out[46]), + "=r"(__out[47]), + "=r"(__out[48]), + "=r"(__out[49]), + "=r"(__out[50]), + "=r"(__out[51]), + "=r"(__out[52]), + "=r"(__out[53]), + "=r"(__out[54]), + "=r"(__out[55]), + "=r"(__out[56]), + "=r"(__out[57]), + "=r"(__out[58]), + "=r"(__out[59]), + "=r"(__out[60]), + "=r"(__out[61]), + "=r"(__out[62]), + "=r"(__out[63]) + : "r"(__taddr) + : "memory"); +# else + // Unsupported architectures will have a linker error with a semi-decent error message + __cuda_ptx_tcgen05_ld_16x256b_is_not_supported_before_SM_100a_SM_101a__(); +# endif +} +#endif // __cccl_ptx_isa >= 860 + +/* +// tcgen05.ld.sync.aligned.16x256b.x16.pack::16b.b32 out, [taddr]; // PTX ISA 86, SM_100a, SM_101a +template = true> +__device__ static inline void tcgen05_ld_16x256b_pack_16b( + B32 (&out)[64], + uint32_t taddr); +*/ +#if __cccl_ptx_isa >= 860 +extern "C" _CCCL_DEVICE void __cuda_ptx_tcgen05_ld_16x256b_pack_16b_is_not_supported_before_SM_100a_SM_101a__(); +template = true> +_CCCL_DEVICE static inline void tcgen05_ld_16x256b_pack_16b(_B32 (&__out)[64], _CUDA_VSTD::uint32_t __taddr) +{ + static_assert(sizeof(_B32) == 4, ""); +# if _CCCL_CUDA_COMPILER(NVHPC) || __CUDA_ARCH_FEAT_SM100_ALL || __CUDA_ARCH_FEAT_SM101_ALL + asm( + "tcgen05.ld.sync.aligned.16x256b.x16.pack::16b.b32 {%0, %1, %2, %3, %4, %5, %6, %7, %8, %9, %10, %11, %12, %13, " + "%14, %15, %16, %17, %18, %19, %20, %21, %22, %23, %24, %25, %26, %27, %28, %29, %30, %31, %32, %33, %34, %35, " + "%36, %37, %38, %39, %40, %41, %42, %43, %44, %45, %46, %47, %48, %49, %50, %51, %52, %53, %54, %55, %56, %57, " + "%58, %59, %60, %61, %62, %63}, [%64];" + : "=r"(__out[0]), + "=r"(__out[1]), + "=r"(__out[2]), + "=r"(__out[3]), + "=r"(__out[4]), + "=r"(__out[5]), + "=r"(__out[6]), + "=r"(__out[7]), + "=r"(__out[8]), + "=r"(__out[9]), + "=r"(__out[10]), + "=r"(__out[11]), + "=r"(__out[12]), + "=r"(__out[13]), + "=r"(__out[14]), + "=r"(__out[15]), + "=r"(__out[16]), + "=r"(__out[17]), + "=r"(__out[18]), + "=r"(__out[19]), + "=r"(__out[20]), + "=r"(__out[21]), + "=r"(__out[22]), + "=r"(__out[23]), + "=r"(__out[24]), + "=r"(__out[25]), + "=r"(__out[26]), + "=r"(__out[27]), + "=r"(__out[28]), + "=r"(__out[29]), + "=r"(__out[30]), + "=r"(__out[31]), + "=r"(__out[32]), + "=r"(__out[33]), + "=r"(__out[34]), + "=r"(__out[35]), + "=r"(__out[36]), + "=r"(__out[37]), + "=r"(__out[38]), + "=r"(__out[39]), + "=r"(__out[40]), + "=r"(__out[41]), + "=r"(__out[42]), + "=r"(__out[43]), + "=r"(__out[44]), + "=r"(__out[45]), + "=r"(__out[46]), + "=r"(__out[47]), + "=r"(__out[48]), + "=r"(__out[49]), + "=r"(__out[50]), + "=r"(__out[51]), + "=r"(__out[52]), + "=r"(__out[53]), + "=r"(__out[54]), + "=r"(__out[55]), + "=r"(__out[56]), + "=r"(__out[57]), + "=r"(__out[58]), + "=r"(__out[59]), + "=r"(__out[60]), + "=r"(__out[61]), + "=r"(__out[62]), + "=r"(__out[63]) + : "r"(__taddr) + : "memory"); +# else + // Unsupported architectures will have a linker error with a semi-decent error message + 
__cuda_ptx_tcgen05_ld_16x256b_pack_16b_is_not_supported_before_SM_100a_SM_101a__(); +# endif +} +#endif // __cccl_ptx_isa >= 860 + +/* +// tcgen05.ld.sync.aligned.16x256b.x32.b32 out, [taddr]; // PTX ISA 86, SM_100a, SM_101a +template = true> +__device__ static inline void tcgen05_ld_16x256b( + B32 (&out)[128], + uint32_t taddr); +*/ +#if __cccl_ptx_isa >= 860 +extern "C" _CCCL_DEVICE void __cuda_ptx_tcgen05_ld_16x256b_is_not_supported_before_SM_100a_SM_101a__(); +template = true> +_CCCL_DEVICE static inline void tcgen05_ld_16x256b(_B32 (&__out)[128], _CUDA_VSTD::uint32_t __taddr) +{ + static_assert(sizeof(_B32) == 4, ""); +# if _CCCL_CUDA_COMPILER(NVHPC) || __CUDA_ARCH_FEAT_SM100_ALL || __CUDA_ARCH_FEAT_SM101_ALL + asm( + "tcgen05.ld.sync.aligned.16x256b.x32.b32 {%0, %1, %2, %3, %4, %5, %6, %7, %8, %9, %10, %11, %12, %13, %14, %15, " + "%16, %17, %18, %19, %20, %21, %22, %23, %24, %25, %26, %27, %28, %29, %30, %31, %32, %33, %34, %35, %36, %37, " + "%38, %39, %40, %41, %42, %43, %44, %45, %46, %47, %48, %49, %50, %51, %52, %53, %54, %55, %56, %57, %58, %59, " + "%60, %61, %62, %63, %64, %65, %66, %67, %68, %69, %70, %71, %72, %73, %74, %75, %76, %77, %78, %79, %80, %81, " + "%82, %83, %84, %85, %86, %87, %88, %89, %90, %91, %92, %93, %94, %95, %96, %97, %98, %99, %100, %101, %102, %103, " + "%104, %105, %106, %107, %108, %109, %110, %111, %112, %113, %114, %115, %116, %117, %118, %119, %120, %121, %122, " + "%123, %124, %125, %126, %127}, [%128];" + : "=r"(__out[0]), + "=r"(__out[1]), + "=r"(__out[2]), + "=r"(__out[3]), + "=r"(__out[4]), + "=r"(__out[5]), + "=r"(__out[6]), + "=r"(__out[7]), + "=r"(__out[8]), + "=r"(__out[9]), + "=r"(__out[10]), + "=r"(__out[11]), + "=r"(__out[12]), + "=r"(__out[13]), + "=r"(__out[14]), + "=r"(__out[15]), + "=r"(__out[16]), + "=r"(__out[17]), + "=r"(__out[18]), + "=r"(__out[19]), + "=r"(__out[20]), + "=r"(__out[21]), + "=r"(__out[22]), + "=r"(__out[23]), + "=r"(__out[24]), + "=r"(__out[25]), + "=r"(__out[26]), + "=r"(__out[27]), + "=r"(__out[28]), + "=r"(__out[29]), + "=r"(__out[30]), + "=r"(__out[31]), + "=r"(__out[32]), + "=r"(__out[33]), + "=r"(__out[34]), + "=r"(__out[35]), + "=r"(__out[36]), + "=r"(__out[37]), + "=r"(__out[38]), + "=r"(__out[39]), + "=r"(__out[40]), + "=r"(__out[41]), + "=r"(__out[42]), + "=r"(__out[43]), + "=r"(__out[44]), + "=r"(__out[45]), + "=r"(__out[46]), + "=r"(__out[47]), + "=r"(__out[48]), + "=r"(__out[49]), + "=r"(__out[50]), + "=r"(__out[51]), + "=r"(__out[52]), + "=r"(__out[53]), + "=r"(__out[54]), + "=r"(__out[55]), + "=r"(__out[56]), + "=r"(__out[57]), + "=r"(__out[58]), + "=r"(__out[59]), + "=r"(__out[60]), + "=r"(__out[61]), + "=r"(__out[62]), + "=r"(__out[63]), + "=r"(__out[64]), + "=r"(__out[65]), + "=r"(__out[66]), + "=r"(__out[67]), + "=r"(__out[68]), + "=r"(__out[69]), + "=r"(__out[70]), + "=r"(__out[71]), + "=r"(__out[72]), + "=r"(__out[73]), + "=r"(__out[74]), + "=r"(__out[75]), + "=r"(__out[76]), + "=r"(__out[77]), + "=r"(__out[78]), + "=r"(__out[79]), + "=r"(__out[80]), + "=r"(__out[81]), + "=r"(__out[82]), + "=r"(__out[83]), + "=r"(__out[84]), + "=r"(__out[85]), + "=r"(__out[86]), + "=r"(__out[87]), + "=r"(__out[88]), + "=r"(__out[89]), + "=r"(__out[90]), + "=r"(__out[91]), + "=r"(__out[92]), + "=r"(__out[93]), + "=r"(__out[94]), + "=r"(__out[95]), + "=r"(__out[96]), + "=r"(__out[97]), + "=r"(__out[98]), + "=r"(__out[99]), + "=r"(__out[100]), + "=r"(__out[101]), + "=r"(__out[102]), + "=r"(__out[103]), + "=r"(__out[104]), + "=r"(__out[105]), + "=r"(__out[106]), + "=r"(__out[107]), + "=r"(__out[108]), + 
"=r"(__out[109]), + "=r"(__out[110]), + "=r"(__out[111]), + "=r"(__out[112]), + "=r"(__out[113]), + "=r"(__out[114]), + "=r"(__out[115]), + "=r"(__out[116]), + "=r"(__out[117]), + "=r"(__out[118]), + "=r"(__out[119]), + "=r"(__out[120]), + "=r"(__out[121]), + "=r"(__out[122]), + "=r"(__out[123]), + "=r"(__out[124]), + "=r"(__out[125]), + "=r"(__out[126]), + "=r"(__out[127]) + : "r"(__taddr) + : "memory"); +# else + // Unsupported architectures will have a linker error with a semi-decent error message + __cuda_ptx_tcgen05_ld_16x256b_is_not_supported_before_SM_100a_SM_101a__(); +# endif +} +#endif // __cccl_ptx_isa >= 860 + +/* +// tcgen05.ld.sync.aligned.16x256b.x32.pack::16b.b32 out, [taddr]; // PTX ISA 86, SM_100a, SM_101a +template = true> +__device__ static inline void tcgen05_ld_16x256b_pack_16b( + B32 (&out)[128], + uint32_t taddr); +*/ +#if __cccl_ptx_isa >= 860 +extern "C" _CCCL_DEVICE void __cuda_ptx_tcgen05_ld_16x256b_pack_16b_is_not_supported_before_SM_100a_SM_101a__(); +template = true> +_CCCL_DEVICE static inline void tcgen05_ld_16x256b_pack_16b(_B32 (&__out)[128], _CUDA_VSTD::uint32_t __taddr) +{ + static_assert(sizeof(_B32) == 4, ""); +# if _CCCL_CUDA_COMPILER(NVHPC) || __CUDA_ARCH_FEAT_SM100_ALL || __CUDA_ARCH_FEAT_SM101_ALL + asm( + "tcgen05.ld.sync.aligned.16x256b.x32.pack::16b.b32 {%0, %1, %2, %3, %4, %5, %6, %7, %8, %9, %10, %11, %12, %13, " + "%14, %15, %16, %17, %18, %19, %20, %21, %22, %23, %24, %25, %26, %27, %28, %29, %30, %31, %32, %33, %34, %35, " + "%36, %37, %38, %39, %40, %41, %42, %43, %44, %45, %46, %47, %48, %49, %50, %51, %52, %53, %54, %55, %56, %57, " + "%58, %59, %60, %61, %62, %63, %64, %65, %66, %67, %68, %69, %70, %71, %72, %73, %74, %75, %76, %77, %78, %79, " + "%80, %81, %82, %83, %84, %85, %86, %87, %88, %89, %90, %91, %92, %93, %94, %95, %96, %97, %98, %99, %100, %101, " + "%102, %103, %104, %105, %106, %107, %108, %109, %110, %111, %112, %113, %114, %115, %116, %117, %118, %119, %120, " + "%121, %122, %123, %124, %125, %126, %127}, [%128];" + : "=r"(__out[0]), + "=r"(__out[1]), + "=r"(__out[2]), + "=r"(__out[3]), + "=r"(__out[4]), + "=r"(__out[5]), + "=r"(__out[6]), + "=r"(__out[7]), + "=r"(__out[8]), + "=r"(__out[9]), + "=r"(__out[10]), + "=r"(__out[11]), + "=r"(__out[12]), + "=r"(__out[13]), + "=r"(__out[14]), + "=r"(__out[15]), + "=r"(__out[16]), + "=r"(__out[17]), + "=r"(__out[18]), + "=r"(__out[19]), + "=r"(__out[20]), + "=r"(__out[21]), + "=r"(__out[22]), + "=r"(__out[23]), + "=r"(__out[24]), + "=r"(__out[25]), + "=r"(__out[26]), + "=r"(__out[27]), + "=r"(__out[28]), + "=r"(__out[29]), + "=r"(__out[30]), + "=r"(__out[31]), + "=r"(__out[32]), + "=r"(__out[33]), + "=r"(__out[34]), + "=r"(__out[35]), + "=r"(__out[36]), + "=r"(__out[37]), + "=r"(__out[38]), + "=r"(__out[39]), + "=r"(__out[40]), + "=r"(__out[41]), + "=r"(__out[42]), + "=r"(__out[43]), + "=r"(__out[44]), + "=r"(__out[45]), + "=r"(__out[46]), + "=r"(__out[47]), + "=r"(__out[48]), + "=r"(__out[49]), + "=r"(__out[50]), + "=r"(__out[51]), + "=r"(__out[52]), + "=r"(__out[53]), + "=r"(__out[54]), + "=r"(__out[55]), + "=r"(__out[56]), + "=r"(__out[57]), + "=r"(__out[58]), + "=r"(__out[59]), + "=r"(__out[60]), + "=r"(__out[61]), + "=r"(__out[62]), + "=r"(__out[63]), + "=r"(__out[64]), + "=r"(__out[65]), + "=r"(__out[66]), + "=r"(__out[67]), + "=r"(__out[68]), + "=r"(__out[69]), + "=r"(__out[70]), + "=r"(__out[71]), + "=r"(__out[72]), + "=r"(__out[73]), + "=r"(__out[74]), + "=r"(__out[75]), + "=r"(__out[76]), + "=r"(__out[77]), + "=r"(__out[78]), + "=r"(__out[79]), + "=r"(__out[80]), + 
"=r"(__out[81]), + "=r"(__out[82]), + "=r"(__out[83]), + "=r"(__out[84]), + "=r"(__out[85]), + "=r"(__out[86]), + "=r"(__out[87]), + "=r"(__out[88]), + "=r"(__out[89]), + "=r"(__out[90]), + "=r"(__out[91]), + "=r"(__out[92]), + "=r"(__out[93]), + "=r"(__out[94]), + "=r"(__out[95]), + "=r"(__out[96]), + "=r"(__out[97]), + "=r"(__out[98]), + "=r"(__out[99]), + "=r"(__out[100]), + "=r"(__out[101]), + "=r"(__out[102]), + "=r"(__out[103]), + "=r"(__out[104]), + "=r"(__out[105]), + "=r"(__out[106]), + "=r"(__out[107]), + "=r"(__out[108]), + "=r"(__out[109]), + "=r"(__out[110]), + "=r"(__out[111]), + "=r"(__out[112]), + "=r"(__out[113]), + "=r"(__out[114]), + "=r"(__out[115]), + "=r"(__out[116]), + "=r"(__out[117]), + "=r"(__out[118]), + "=r"(__out[119]), + "=r"(__out[120]), + "=r"(__out[121]), + "=r"(__out[122]), + "=r"(__out[123]), + "=r"(__out[124]), + "=r"(__out[125]), + "=r"(__out[126]), + "=r"(__out[127]) + : "r"(__taddr) + : "memory"); +# else + // Unsupported architectures will have a linker error with a semi-decent error message + __cuda_ptx_tcgen05_ld_16x256b_pack_16b_is_not_supported_before_SM_100a_SM_101a__(); +# endif +} +#endif // __cccl_ptx_isa >= 860 + +/* +// tcgen05.ld.sync.aligned.32x32b.x1.b32 out, [taddr]; // PTX ISA 86, SM_100a, SM_101a +template = true> +__device__ static inline void tcgen05_ld_32x32b( + B32 (&out)[1], + uint32_t taddr); +*/ +#if __cccl_ptx_isa >= 860 +extern "C" _CCCL_DEVICE void __cuda_ptx_tcgen05_ld_32x32b_is_not_supported_before_SM_100a_SM_101a__(); +template = true> +_CCCL_DEVICE static inline void tcgen05_ld_32x32b(_B32 (&__out)[1], _CUDA_VSTD::uint32_t __taddr) +{ + static_assert(sizeof(_B32) == 4, ""); +# if _CCCL_CUDA_COMPILER(NVHPC) || __CUDA_ARCH_FEAT_SM100_ALL || __CUDA_ARCH_FEAT_SM101_ALL + asm("tcgen05.ld.sync.aligned.32x32b.x1.b32 {%0}, [%1];" : "=r"(__out[0]) : "r"(__taddr) : "memory"); +# else + // Unsupported architectures will have a linker error with a semi-decent error message + __cuda_ptx_tcgen05_ld_32x32b_is_not_supported_before_SM_100a_SM_101a__(); +# endif +} +#endif // __cccl_ptx_isa >= 860 + +/* +// tcgen05.ld.sync.aligned.32x32b.x1.pack::16b.b32 out, [taddr]; // PTX ISA 86, SM_100a, SM_101a +template = true> +__device__ static inline void tcgen05_ld_32x32b_pack_16b( + B32 (&out)[1], + uint32_t taddr); +*/ +#if __cccl_ptx_isa >= 860 +extern "C" _CCCL_DEVICE void __cuda_ptx_tcgen05_ld_32x32b_pack_16b_is_not_supported_before_SM_100a_SM_101a__(); +template = true> +_CCCL_DEVICE static inline void tcgen05_ld_32x32b_pack_16b(_B32 (&__out)[1], _CUDA_VSTD::uint32_t __taddr) +{ + static_assert(sizeof(_B32) == 4, ""); +# if _CCCL_CUDA_COMPILER(NVHPC) || __CUDA_ARCH_FEAT_SM100_ALL || __CUDA_ARCH_FEAT_SM101_ALL + asm("tcgen05.ld.sync.aligned.32x32b.x1.pack::16b.b32 {%0}, [%1];" : "=r"(__out[0]) : "r"(__taddr) : "memory"); +# else + // Unsupported architectures will have a linker error with a semi-decent error message + __cuda_ptx_tcgen05_ld_32x32b_pack_16b_is_not_supported_before_SM_100a_SM_101a__(); +# endif +} +#endif // __cccl_ptx_isa >= 860 + +/* +// tcgen05.ld.sync.aligned.32x32b.x2.b32 out, [taddr]; // PTX ISA 86, SM_100a, SM_101a +template = true> +__device__ static inline void tcgen05_ld_32x32b( + B32 (&out)[2], + uint32_t taddr); +*/ +#if __cccl_ptx_isa >= 860 +extern "C" _CCCL_DEVICE void __cuda_ptx_tcgen05_ld_32x32b_is_not_supported_before_SM_100a_SM_101a__(); +template = true> +_CCCL_DEVICE static inline void tcgen05_ld_32x32b(_B32 (&__out)[2], _CUDA_VSTD::uint32_t __taddr) +{ + static_assert(sizeof(_B32) == 4, ""); +# if 
_CCCL_CUDA_COMPILER(NVHPC) || __CUDA_ARCH_FEAT_SM100_ALL || __CUDA_ARCH_FEAT_SM101_ALL + asm("tcgen05.ld.sync.aligned.32x32b.x2.b32 {%0, %1}, [%2];" + : "=r"(__out[0]), "=r"(__out[1]) + : "r"(__taddr) + : "memory"); +# else + // Unsupported architectures will have a linker error with a semi-decent error message + __cuda_ptx_tcgen05_ld_32x32b_is_not_supported_before_SM_100a_SM_101a__(); +# endif +} +#endif // __cccl_ptx_isa >= 860 + +/* +// tcgen05.ld.sync.aligned.32x32b.x2.pack::16b.b32 out, [taddr]; // PTX ISA 86, SM_100a, SM_101a +template = true> +__device__ static inline void tcgen05_ld_32x32b_pack_16b( + B32 (&out)[2], + uint32_t taddr); +*/ +#if __cccl_ptx_isa >= 860 +extern "C" _CCCL_DEVICE void __cuda_ptx_tcgen05_ld_32x32b_pack_16b_is_not_supported_before_SM_100a_SM_101a__(); +template = true> +_CCCL_DEVICE static inline void tcgen05_ld_32x32b_pack_16b(_B32 (&__out)[2], _CUDA_VSTD::uint32_t __taddr) +{ + static_assert(sizeof(_B32) == 4, ""); +# if _CCCL_CUDA_COMPILER(NVHPC) || __CUDA_ARCH_FEAT_SM100_ALL || __CUDA_ARCH_FEAT_SM101_ALL + asm("tcgen05.ld.sync.aligned.32x32b.x2.pack::16b.b32 {%0, %1}, [%2];" + : "=r"(__out[0]), "=r"(__out[1]) + : "r"(__taddr) + : "memory"); +# else + // Unsupported architectures will have a linker error with a semi-decent error message + __cuda_ptx_tcgen05_ld_32x32b_pack_16b_is_not_supported_before_SM_100a_SM_101a__(); +# endif +} +#endif // __cccl_ptx_isa >= 860 + +/* +// tcgen05.ld.sync.aligned.32x32b.x4.b32 out, [taddr]; // PTX ISA 86, SM_100a, SM_101a +template = true> +__device__ static inline void tcgen05_ld_32x32b( + B32 (&out)[4], + uint32_t taddr); +*/ +#if __cccl_ptx_isa >= 860 +extern "C" _CCCL_DEVICE void __cuda_ptx_tcgen05_ld_32x32b_is_not_supported_before_SM_100a_SM_101a__(); +template = true> +_CCCL_DEVICE static inline void tcgen05_ld_32x32b(_B32 (&__out)[4], _CUDA_VSTD::uint32_t __taddr) +{ + static_assert(sizeof(_B32) == 4, ""); +# if _CCCL_CUDA_COMPILER(NVHPC) || __CUDA_ARCH_FEAT_SM100_ALL || __CUDA_ARCH_FEAT_SM101_ALL + asm("tcgen05.ld.sync.aligned.32x32b.x4.b32 {%0, %1, %2, %3}, [%4];" + : "=r"(__out[0]), "=r"(__out[1]), "=r"(__out[2]), "=r"(__out[3]) + : "r"(__taddr) + : "memory"); +# else + // Unsupported architectures will have a linker error with a semi-decent error message + __cuda_ptx_tcgen05_ld_32x32b_is_not_supported_before_SM_100a_SM_101a__(); +# endif +} +#endif // __cccl_ptx_isa >= 860 + +/* +// tcgen05.ld.sync.aligned.32x32b.x4.pack::16b.b32 out, [taddr]; // PTX ISA 86, SM_100a, SM_101a +template = true> +__device__ static inline void tcgen05_ld_32x32b_pack_16b( + B32 (&out)[4], + uint32_t taddr); +*/ +#if __cccl_ptx_isa >= 860 +extern "C" _CCCL_DEVICE void __cuda_ptx_tcgen05_ld_32x32b_pack_16b_is_not_supported_before_SM_100a_SM_101a__(); +template = true> +_CCCL_DEVICE static inline void tcgen05_ld_32x32b_pack_16b(_B32 (&__out)[4], _CUDA_VSTD::uint32_t __taddr) +{ + static_assert(sizeof(_B32) == 4, ""); +# if _CCCL_CUDA_COMPILER(NVHPC) || __CUDA_ARCH_FEAT_SM100_ALL || __CUDA_ARCH_FEAT_SM101_ALL + asm("tcgen05.ld.sync.aligned.32x32b.x4.pack::16b.b32 {%0, %1, %2, %3}, [%4];" + : "=r"(__out[0]), "=r"(__out[1]), "=r"(__out[2]), "=r"(__out[3]) + : "r"(__taddr) + : "memory"); +# else + // Unsupported architectures will have a linker error with a semi-decent error message + __cuda_ptx_tcgen05_ld_32x32b_pack_16b_is_not_supported_before_SM_100a_SM_101a__(); +# endif +} +#endif // __cccl_ptx_isa >= 860 + +/* +// tcgen05.ld.sync.aligned.32x32b.x8.b32 out, [taddr]; // PTX ISA 86, SM_100a, SM_101a +template = true> +__device__ 
static inline void tcgen05_ld_32x32b( + B32 (&out)[8], + uint32_t taddr); +*/ +#if __cccl_ptx_isa >= 860 +extern "C" _CCCL_DEVICE void __cuda_ptx_tcgen05_ld_32x32b_is_not_supported_before_SM_100a_SM_101a__(); +template = true> +_CCCL_DEVICE static inline void tcgen05_ld_32x32b(_B32 (&__out)[8], _CUDA_VSTD::uint32_t __taddr) +{ + static_assert(sizeof(_B32) == 4, ""); +# if _CCCL_CUDA_COMPILER(NVHPC) || __CUDA_ARCH_FEAT_SM100_ALL || __CUDA_ARCH_FEAT_SM101_ALL + asm("tcgen05.ld.sync.aligned.32x32b.x8.b32 {%0, %1, %2, %3, %4, %5, %6, %7}, [%8];" + : "=r"(__out[0]), + "=r"(__out[1]), + "=r"(__out[2]), + "=r"(__out[3]), + "=r"(__out[4]), + "=r"(__out[5]), + "=r"(__out[6]), + "=r"(__out[7]) + : "r"(__taddr) + : "memory"); +# else + // Unsupported architectures will have a linker error with a semi-decent error message + __cuda_ptx_tcgen05_ld_32x32b_is_not_supported_before_SM_100a_SM_101a__(); +# endif +} +#endif // __cccl_ptx_isa >= 860 + +/* +// tcgen05.ld.sync.aligned.32x32b.x8.pack::16b.b32 out, [taddr]; // PTX ISA 86, SM_100a, SM_101a +template = true> +__device__ static inline void tcgen05_ld_32x32b_pack_16b( + B32 (&out)[8], + uint32_t taddr); +*/ +#if __cccl_ptx_isa >= 860 +extern "C" _CCCL_DEVICE void __cuda_ptx_tcgen05_ld_32x32b_pack_16b_is_not_supported_before_SM_100a_SM_101a__(); +template = true> +_CCCL_DEVICE static inline void tcgen05_ld_32x32b_pack_16b(_B32 (&__out)[8], _CUDA_VSTD::uint32_t __taddr) +{ + static_assert(sizeof(_B32) == 4, ""); +# if _CCCL_CUDA_COMPILER(NVHPC) || __CUDA_ARCH_FEAT_SM100_ALL || __CUDA_ARCH_FEAT_SM101_ALL + asm("tcgen05.ld.sync.aligned.32x32b.x8.pack::16b.b32 {%0, %1, %2, %3, %4, %5, %6, %7}, [%8];" + : "=r"(__out[0]), + "=r"(__out[1]), + "=r"(__out[2]), + "=r"(__out[3]), + "=r"(__out[4]), + "=r"(__out[5]), + "=r"(__out[6]), + "=r"(__out[7]) + : "r"(__taddr) + : "memory"); +# else + // Unsupported architectures will have a linker error with a semi-decent error message + __cuda_ptx_tcgen05_ld_32x32b_pack_16b_is_not_supported_before_SM_100a_SM_101a__(); +# endif +} +#endif // __cccl_ptx_isa >= 860 + +/* +// tcgen05.ld.sync.aligned.32x32b.x16.b32 out, [taddr]; // PTX ISA 86, SM_100a, SM_101a +template = true> +__device__ static inline void tcgen05_ld_32x32b( + B32 (&out)[16], + uint32_t taddr); +*/ +#if __cccl_ptx_isa >= 860 +extern "C" _CCCL_DEVICE void __cuda_ptx_tcgen05_ld_32x32b_is_not_supported_before_SM_100a_SM_101a__(); +template = true> +_CCCL_DEVICE static inline void tcgen05_ld_32x32b(_B32 (&__out)[16], _CUDA_VSTD::uint32_t __taddr) +{ + static_assert(sizeof(_B32) == 4, ""); +# if _CCCL_CUDA_COMPILER(NVHPC) || __CUDA_ARCH_FEAT_SM100_ALL || __CUDA_ARCH_FEAT_SM101_ALL + asm("tcgen05.ld.sync.aligned.32x32b.x16.b32 {%0, %1, %2, %3, %4, %5, %6, %7, %8, %9, %10, %11, %12, %13, %14, %15}, " + "[%16];" + : "=r"(__out[0]), + "=r"(__out[1]), + "=r"(__out[2]), + "=r"(__out[3]), + "=r"(__out[4]), + "=r"(__out[5]), + "=r"(__out[6]), + "=r"(__out[7]), + "=r"(__out[8]), + "=r"(__out[9]), + "=r"(__out[10]), + "=r"(__out[11]), + "=r"(__out[12]), + "=r"(__out[13]), + "=r"(__out[14]), + "=r"(__out[15]) + : "r"(__taddr) + : "memory"); +# else + // Unsupported architectures will have a linker error with a semi-decent error message + __cuda_ptx_tcgen05_ld_32x32b_is_not_supported_before_SM_100a_SM_101a__(); +# endif +} +#endif // __cccl_ptx_isa >= 860 + +/* +// tcgen05.ld.sync.aligned.32x32b.x16.pack::16b.b32 out, [taddr]; // PTX ISA 86, SM_100a, SM_101a +template = true> +__device__ static inline void tcgen05_ld_32x32b_pack_16b( + B32 (&out)[16], + uint32_t taddr); +*/ 
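/*
// Editor's aside, illustrative only -- not part of the generated header or of this patch.
// A minimal sketch of how device code might use the 32x32b load wrappers above, assuming
// sm_100a, that `taddr` holds a valid tensor-memory address (for example one produced via
// the tcgen05_alloc wrapper added elsewhere in this patch), and that the tcgen05_wait_ld()
// wrapper from the new tcgen05_wait.h is the intended way to order the reads:
//
//   #include <cuda/ptx>
//   #include <cstdint>
//
//   __device__ void read_tmem_fragment(uint32_t taddr) // hypothetical helper name
//   {
//     uint32_t regs[4];                            // array extent selects the .x4 overload
//     cuda::ptx::tcgen05_ld_32x32b(regs, taddr);   // tcgen05.ld.sync.aligned.32x32b.x4.b32
//     cuda::ptx::tcgen05_wait_ld();                // wait until the loaded values are usable
//     // ... consume regs ...
//   }
*/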
+#if __cccl_ptx_isa >= 860 +extern "C" _CCCL_DEVICE void __cuda_ptx_tcgen05_ld_32x32b_pack_16b_is_not_supported_before_SM_100a_SM_101a__(); +template = true> +_CCCL_DEVICE static inline void tcgen05_ld_32x32b_pack_16b(_B32 (&__out)[16], _CUDA_VSTD::uint32_t __taddr) +{ + static_assert(sizeof(_B32) == 4, ""); +# if _CCCL_CUDA_COMPILER(NVHPC) || __CUDA_ARCH_FEAT_SM100_ALL || __CUDA_ARCH_FEAT_SM101_ALL + asm("tcgen05.ld.sync.aligned.32x32b.x16.pack::16b.b32 {%0, %1, %2, %3, %4, %5, %6, %7, %8, %9, %10, %11, %12, %13, " + "%14, %15}, [%16];" + : "=r"(__out[0]), + "=r"(__out[1]), + "=r"(__out[2]), + "=r"(__out[3]), + "=r"(__out[4]), + "=r"(__out[5]), + "=r"(__out[6]), + "=r"(__out[7]), + "=r"(__out[8]), + "=r"(__out[9]), + "=r"(__out[10]), + "=r"(__out[11]), + "=r"(__out[12]), + "=r"(__out[13]), + "=r"(__out[14]), + "=r"(__out[15]) + : "r"(__taddr) + : "memory"); +# else + // Unsupported architectures will have a linker error with a semi-decent error message + __cuda_ptx_tcgen05_ld_32x32b_pack_16b_is_not_supported_before_SM_100a_SM_101a__(); +# endif +} +#endif // __cccl_ptx_isa >= 860 + +/* +// tcgen05.ld.sync.aligned.32x32b.x32.b32 out, [taddr]; // PTX ISA 86, SM_100a, SM_101a +template = true> +__device__ static inline void tcgen05_ld_32x32b( + B32 (&out)[32], + uint32_t taddr); +*/ +#if __cccl_ptx_isa >= 860 +extern "C" _CCCL_DEVICE void __cuda_ptx_tcgen05_ld_32x32b_is_not_supported_before_SM_100a_SM_101a__(); +template = true> +_CCCL_DEVICE static inline void tcgen05_ld_32x32b(_B32 (&__out)[32], _CUDA_VSTD::uint32_t __taddr) +{ + static_assert(sizeof(_B32) == 4, ""); +# if _CCCL_CUDA_COMPILER(NVHPC) || __CUDA_ARCH_FEAT_SM100_ALL || __CUDA_ARCH_FEAT_SM101_ALL + asm( + "tcgen05.ld.sync.aligned.32x32b.x32.b32 {%0, %1, %2, %3, %4, %5, %6, %7, %8, %9, %10, %11, %12, %13, %14, %15, " + "%16, %17, %18, %19, %20, %21, %22, %23, %24, %25, %26, %27, %28, %29, %30, %31}, [%32];" + : "=r"(__out[0]), + "=r"(__out[1]), + "=r"(__out[2]), + "=r"(__out[3]), + "=r"(__out[4]), + "=r"(__out[5]), + "=r"(__out[6]), + "=r"(__out[7]), + "=r"(__out[8]), + "=r"(__out[9]), + "=r"(__out[10]), + "=r"(__out[11]), + "=r"(__out[12]), + "=r"(__out[13]), + "=r"(__out[14]), + "=r"(__out[15]), + "=r"(__out[16]), + "=r"(__out[17]), + "=r"(__out[18]), + "=r"(__out[19]), + "=r"(__out[20]), + "=r"(__out[21]), + "=r"(__out[22]), + "=r"(__out[23]), + "=r"(__out[24]), + "=r"(__out[25]), + "=r"(__out[26]), + "=r"(__out[27]), + "=r"(__out[28]), + "=r"(__out[29]), + "=r"(__out[30]), + "=r"(__out[31]) + : "r"(__taddr) + : "memory"); +# else + // Unsupported architectures will have a linker error with a semi-decent error message + __cuda_ptx_tcgen05_ld_32x32b_is_not_supported_before_SM_100a_SM_101a__(); +# endif +} +#endif // __cccl_ptx_isa >= 860 + +/* +// tcgen05.ld.sync.aligned.32x32b.x32.pack::16b.b32 out, [taddr]; // PTX ISA 86, SM_100a, SM_101a +template = true> +__device__ static inline void tcgen05_ld_32x32b_pack_16b( + B32 (&out)[32], + uint32_t taddr); +*/ +#if __cccl_ptx_isa >= 860 +extern "C" _CCCL_DEVICE void __cuda_ptx_tcgen05_ld_32x32b_pack_16b_is_not_supported_before_SM_100a_SM_101a__(); +template = true> +_CCCL_DEVICE static inline void tcgen05_ld_32x32b_pack_16b(_B32 (&__out)[32], _CUDA_VSTD::uint32_t __taddr) +{ + static_assert(sizeof(_B32) == 4, ""); +# if _CCCL_CUDA_COMPILER(NVHPC) || __CUDA_ARCH_FEAT_SM100_ALL || __CUDA_ARCH_FEAT_SM101_ALL + asm( + "tcgen05.ld.sync.aligned.32x32b.x32.pack::16b.b32 {%0, %1, %2, %3, %4, %5, %6, %7, %8, %9, %10, %11, %12, %13, " + "%14, %15, %16, %17, %18, %19, %20, %21, %22, %23, 
%24, %25, %26, %27, %28, %29, %30, %31}, [%32];" + : "=r"(__out[0]), + "=r"(__out[1]), + "=r"(__out[2]), + "=r"(__out[3]), + "=r"(__out[4]), + "=r"(__out[5]), + "=r"(__out[6]), + "=r"(__out[7]), + "=r"(__out[8]), + "=r"(__out[9]), + "=r"(__out[10]), + "=r"(__out[11]), + "=r"(__out[12]), + "=r"(__out[13]), + "=r"(__out[14]), + "=r"(__out[15]), + "=r"(__out[16]), + "=r"(__out[17]), + "=r"(__out[18]), + "=r"(__out[19]), + "=r"(__out[20]), + "=r"(__out[21]), + "=r"(__out[22]), + "=r"(__out[23]), + "=r"(__out[24]), + "=r"(__out[25]), + "=r"(__out[26]), + "=r"(__out[27]), + "=r"(__out[28]), + "=r"(__out[29]), + "=r"(__out[30]), + "=r"(__out[31]) + : "r"(__taddr) + : "memory"); +# else + // Unsupported architectures will have a linker error with a semi-decent error message + __cuda_ptx_tcgen05_ld_32x32b_pack_16b_is_not_supported_before_SM_100a_SM_101a__(); +# endif +} +#endif // __cccl_ptx_isa >= 860 + +/* +// tcgen05.ld.sync.aligned.32x32b.x64.b32 out, [taddr]; // PTX ISA 86, SM_100a, SM_101a +template = true> +__device__ static inline void tcgen05_ld_32x32b( + B32 (&out)[64], + uint32_t taddr); +*/ +#if __cccl_ptx_isa >= 860 +extern "C" _CCCL_DEVICE void __cuda_ptx_tcgen05_ld_32x32b_is_not_supported_before_SM_100a_SM_101a__(); +template = true> +_CCCL_DEVICE static inline void tcgen05_ld_32x32b(_B32 (&__out)[64], _CUDA_VSTD::uint32_t __taddr) +{ + static_assert(sizeof(_B32) == 4, ""); +# if _CCCL_CUDA_COMPILER(NVHPC) || __CUDA_ARCH_FEAT_SM100_ALL || __CUDA_ARCH_FEAT_SM101_ALL + asm( + "tcgen05.ld.sync.aligned.32x32b.x64.b32 {%0, %1, %2, %3, %4, %5, %6, %7, %8, %9, %10, %11, %12, %13, %14, %15, " + "%16, %17, %18, %19, %20, %21, %22, %23, %24, %25, %26, %27, %28, %29, %30, %31, %32, %33, %34, %35, %36, %37, " + "%38, %39, %40, %41, %42, %43, %44, %45, %46, %47, %48, %49, %50, %51, %52, %53, %54, %55, %56, %57, %58, %59, " + "%60, %61, %62, %63}, [%64];" + : "=r"(__out[0]), + "=r"(__out[1]), + "=r"(__out[2]), + "=r"(__out[3]), + "=r"(__out[4]), + "=r"(__out[5]), + "=r"(__out[6]), + "=r"(__out[7]), + "=r"(__out[8]), + "=r"(__out[9]), + "=r"(__out[10]), + "=r"(__out[11]), + "=r"(__out[12]), + "=r"(__out[13]), + "=r"(__out[14]), + "=r"(__out[15]), + "=r"(__out[16]), + "=r"(__out[17]), + "=r"(__out[18]), + "=r"(__out[19]), + "=r"(__out[20]), + "=r"(__out[21]), + "=r"(__out[22]), + "=r"(__out[23]), + "=r"(__out[24]), + "=r"(__out[25]), + "=r"(__out[26]), + "=r"(__out[27]), + "=r"(__out[28]), + "=r"(__out[29]), + "=r"(__out[30]), + "=r"(__out[31]), + "=r"(__out[32]), + "=r"(__out[33]), + "=r"(__out[34]), + "=r"(__out[35]), + "=r"(__out[36]), + "=r"(__out[37]), + "=r"(__out[38]), + "=r"(__out[39]), + "=r"(__out[40]), + "=r"(__out[41]), + "=r"(__out[42]), + "=r"(__out[43]), + "=r"(__out[44]), + "=r"(__out[45]), + "=r"(__out[46]), + "=r"(__out[47]), + "=r"(__out[48]), + "=r"(__out[49]), + "=r"(__out[50]), + "=r"(__out[51]), + "=r"(__out[52]), + "=r"(__out[53]), + "=r"(__out[54]), + "=r"(__out[55]), + "=r"(__out[56]), + "=r"(__out[57]), + "=r"(__out[58]), + "=r"(__out[59]), + "=r"(__out[60]), + "=r"(__out[61]), + "=r"(__out[62]), + "=r"(__out[63]) + : "r"(__taddr) + : "memory"); +# else + // Unsupported architectures will have a linker error with a semi-decent error message + __cuda_ptx_tcgen05_ld_32x32b_is_not_supported_before_SM_100a_SM_101a__(); +# endif +} +#endif // __cccl_ptx_isa >= 860 + +/* +// tcgen05.ld.sync.aligned.32x32b.x64.pack::16b.b32 out, [taddr]; // PTX ISA 86, SM_100a, SM_101a +template = true> +__device__ static inline void tcgen05_ld_32x32b_pack_16b( + B32 (&out)[64], + uint32_t 
taddr); +*/ +#if __cccl_ptx_isa >= 860 +extern "C" _CCCL_DEVICE void __cuda_ptx_tcgen05_ld_32x32b_pack_16b_is_not_supported_before_SM_100a_SM_101a__(); +template = true> +_CCCL_DEVICE static inline void tcgen05_ld_32x32b_pack_16b(_B32 (&__out)[64], _CUDA_VSTD::uint32_t __taddr) +{ + static_assert(sizeof(_B32) == 4, ""); +# if _CCCL_CUDA_COMPILER(NVHPC) || __CUDA_ARCH_FEAT_SM100_ALL || __CUDA_ARCH_FEAT_SM101_ALL + asm( + "tcgen05.ld.sync.aligned.32x32b.x64.pack::16b.b32 {%0, %1, %2, %3, %4, %5, %6, %7, %8, %9, %10, %11, %12, %13, " + "%14, %15, %16, %17, %18, %19, %20, %21, %22, %23, %24, %25, %26, %27, %28, %29, %30, %31, %32, %33, %34, %35, " + "%36, %37, %38, %39, %40, %41, %42, %43, %44, %45, %46, %47, %48, %49, %50, %51, %52, %53, %54, %55, %56, %57, " + "%58, %59, %60, %61, %62, %63}, [%64];" + : "=r"(__out[0]), + "=r"(__out[1]), + "=r"(__out[2]), + "=r"(__out[3]), + "=r"(__out[4]), + "=r"(__out[5]), + "=r"(__out[6]), + "=r"(__out[7]), + "=r"(__out[8]), + "=r"(__out[9]), + "=r"(__out[10]), + "=r"(__out[11]), + "=r"(__out[12]), + "=r"(__out[13]), + "=r"(__out[14]), + "=r"(__out[15]), + "=r"(__out[16]), + "=r"(__out[17]), + "=r"(__out[18]), + "=r"(__out[19]), + "=r"(__out[20]), + "=r"(__out[21]), + "=r"(__out[22]), + "=r"(__out[23]), + "=r"(__out[24]), + "=r"(__out[25]), + "=r"(__out[26]), + "=r"(__out[27]), + "=r"(__out[28]), + "=r"(__out[29]), + "=r"(__out[30]), + "=r"(__out[31]), + "=r"(__out[32]), + "=r"(__out[33]), + "=r"(__out[34]), + "=r"(__out[35]), + "=r"(__out[36]), + "=r"(__out[37]), + "=r"(__out[38]), + "=r"(__out[39]), + "=r"(__out[40]), + "=r"(__out[41]), + "=r"(__out[42]), + "=r"(__out[43]), + "=r"(__out[44]), + "=r"(__out[45]), + "=r"(__out[46]), + "=r"(__out[47]), + "=r"(__out[48]), + "=r"(__out[49]), + "=r"(__out[50]), + "=r"(__out[51]), + "=r"(__out[52]), + "=r"(__out[53]), + "=r"(__out[54]), + "=r"(__out[55]), + "=r"(__out[56]), + "=r"(__out[57]), + "=r"(__out[58]), + "=r"(__out[59]), + "=r"(__out[60]), + "=r"(__out[61]), + "=r"(__out[62]), + "=r"(__out[63]) + : "r"(__taddr) + : "memory"); +# else + // Unsupported architectures will have a linker error with a semi-decent error message + __cuda_ptx_tcgen05_ld_32x32b_pack_16b_is_not_supported_before_SM_100a_SM_101a__(); +# endif +} +#endif // __cccl_ptx_isa >= 860 + +/* +// tcgen05.ld.sync.aligned.32x32b.x128.b32 out, [taddr]; // PTX ISA 86, SM_100a, SM_101a +template = true> +__device__ static inline void tcgen05_ld_32x32b( + B32 (&out)[128], + uint32_t taddr); +*/ +#if __cccl_ptx_isa >= 860 +extern "C" _CCCL_DEVICE void __cuda_ptx_tcgen05_ld_32x32b_is_not_supported_before_SM_100a_SM_101a__(); +template = true> +_CCCL_DEVICE static inline void tcgen05_ld_32x32b(_B32 (&__out)[128], _CUDA_VSTD::uint32_t __taddr) +{ + static_assert(sizeof(_B32) == 4, ""); +# if _CCCL_CUDA_COMPILER(NVHPC) || __CUDA_ARCH_FEAT_SM100_ALL || __CUDA_ARCH_FEAT_SM101_ALL + asm( + "tcgen05.ld.sync.aligned.32x32b.x128.b32 {%0, %1, %2, %3, %4, %5, %6, %7, %8, %9, %10, %11, %12, %13, %14, %15, " + "%16, %17, %18, %19, %20, %21, %22, %23, %24, %25, %26, %27, %28, %29, %30, %31, %32, %33, %34, %35, %36, %37, " + "%38, %39, %40, %41, %42, %43, %44, %45, %46, %47, %48, %49, %50, %51, %52, %53, %54, %55, %56, %57, %58, %59, " + "%60, %61, %62, %63, %64, %65, %66, %67, %68, %69, %70, %71, %72, %73, %74, %75, %76, %77, %78, %79, %80, %81, " + "%82, %83, %84, %85, %86, %87, %88, %89, %90, %91, %92, %93, %94, %95, %96, %97, %98, %99, %100, %101, %102, %103, " + "%104, %105, %106, %107, %108, %109, %110, %111, %112, %113, %114, %115, %116, %117, %118, %119, 
%120, %121, %122, " + "%123, %124, %125, %126, %127}, [%128];" + : "=r"(__out[0]), + "=r"(__out[1]), + "=r"(__out[2]), + "=r"(__out[3]), + "=r"(__out[4]), + "=r"(__out[5]), + "=r"(__out[6]), + "=r"(__out[7]), + "=r"(__out[8]), + "=r"(__out[9]), + "=r"(__out[10]), + "=r"(__out[11]), + "=r"(__out[12]), + "=r"(__out[13]), + "=r"(__out[14]), + "=r"(__out[15]), + "=r"(__out[16]), + "=r"(__out[17]), + "=r"(__out[18]), + "=r"(__out[19]), + "=r"(__out[20]), + "=r"(__out[21]), + "=r"(__out[22]), + "=r"(__out[23]), + "=r"(__out[24]), + "=r"(__out[25]), + "=r"(__out[26]), + "=r"(__out[27]), + "=r"(__out[28]), + "=r"(__out[29]), + "=r"(__out[30]), + "=r"(__out[31]), + "=r"(__out[32]), + "=r"(__out[33]), + "=r"(__out[34]), + "=r"(__out[35]), + "=r"(__out[36]), + "=r"(__out[37]), + "=r"(__out[38]), + "=r"(__out[39]), + "=r"(__out[40]), + "=r"(__out[41]), + "=r"(__out[42]), + "=r"(__out[43]), + "=r"(__out[44]), + "=r"(__out[45]), + "=r"(__out[46]), + "=r"(__out[47]), + "=r"(__out[48]), + "=r"(__out[49]), + "=r"(__out[50]), + "=r"(__out[51]), + "=r"(__out[52]), + "=r"(__out[53]), + "=r"(__out[54]), + "=r"(__out[55]), + "=r"(__out[56]), + "=r"(__out[57]), + "=r"(__out[58]), + "=r"(__out[59]), + "=r"(__out[60]), + "=r"(__out[61]), + "=r"(__out[62]), + "=r"(__out[63]), + "=r"(__out[64]), + "=r"(__out[65]), + "=r"(__out[66]), + "=r"(__out[67]), + "=r"(__out[68]), + "=r"(__out[69]), + "=r"(__out[70]), + "=r"(__out[71]), + "=r"(__out[72]), + "=r"(__out[73]), + "=r"(__out[74]), + "=r"(__out[75]), + "=r"(__out[76]), + "=r"(__out[77]), + "=r"(__out[78]), + "=r"(__out[79]), + "=r"(__out[80]), + "=r"(__out[81]), + "=r"(__out[82]), + "=r"(__out[83]), + "=r"(__out[84]), + "=r"(__out[85]), + "=r"(__out[86]), + "=r"(__out[87]), + "=r"(__out[88]), + "=r"(__out[89]), + "=r"(__out[90]), + "=r"(__out[91]), + "=r"(__out[92]), + "=r"(__out[93]), + "=r"(__out[94]), + "=r"(__out[95]), + "=r"(__out[96]), + "=r"(__out[97]), + "=r"(__out[98]), + "=r"(__out[99]), + "=r"(__out[100]), + "=r"(__out[101]), + "=r"(__out[102]), + "=r"(__out[103]), + "=r"(__out[104]), + "=r"(__out[105]), + "=r"(__out[106]), + "=r"(__out[107]), + "=r"(__out[108]), + "=r"(__out[109]), + "=r"(__out[110]), + "=r"(__out[111]), + "=r"(__out[112]), + "=r"(__out[113]), + "=r"(__out[114]), + "=r"(__out[115]), + "=r"(__out[116]), + "=r"(__out[117]), + "=r"(__out[118]), + "=r"(__out[119]), + "=r"(__out[120]), + "=r"(__out[121]), + "=r"(__out[122]), + "=r"(__out[123]), + "=r"(__out[124]), + "=r"(__out[125]), + "=r"(__out[126]), + "=r"(__out[127]) + : "r"(__taddr) + : "memory"); +# else + // Unsupported architectures will have a linker error with a semi-decent error message + __cuda_ptx_tcgen05_ld_32x32b_is_not_supported_before_SM_100a_SM_101a__(); +# endif +} +#endif // __cccl_ptx_isa >= 860 + +/* +// tcgen05.ld.sync.aligned.32x32b.x128.pack::16b.b32 out, [taddr]; // PTX ISA 86, SM_100a, SM_101a +template = true> +__device__ static inline void tcgen05_ld_32x32b_pack_16b( + B32 (&out)[128], + uint32_t taddr); +*/ +#if __cccl_ptx_isa >= 860 +extern "C" _CCCL_DEVICE void __cuda_ptx_tcgen05_ld_32x32b_pack_16b_is_not_supported_before_SM_100a_SM_101a__(); +template = true> +_CCCL_DEVICE static inline void tcgen05_ld_32x32b_pack_16b(_B32 (&__out)[128], _CUDA_VSTD::uint32_t __taddr) +{ + static_assert(sizeof(_B32) == 4, ""); +# if _CCCL_CUDA_COMPILER(NVHPC) || __CUDA_ARCH_FEAT_SM100_ALL || __CUDA_ARCH_FEAT_SM101_ALL + asm( + "tcgen05.ld.sync.aligned.32x32b.x128.pack::16b.b32 {%0, %1, %2, %3, %4, %5, %6, %7, %8, %9, %10, %11, %12, %13, " + "%14, %15, %16, %17, %18, %19, %20, 
%21, %22, %23, %24, %25, %26, %27, %28, %29, %30, %31, %32, %33, %34, %35, " + "%36, %37, %38, %39, %40, %41, %42, %43, %44, %45, %46, %47, %48, %49, %50, %51, %52, %53, %54, %55, %56, %57, " + "%58, %59, %60, %61, %62, %63, %64, %65, %66, %67, %68, %69, %70, %71, %72, %73, %74, %75, %76, %77, %78, %79, " + "%80, %81, %82, %83, %84, %85, %86, %87, %88, %89, %90, %91, %92, %93, %94, %95, %96, %97, %98, %99, %100, %101, " + "%102, %103, %104, %105, %106, %107, %108, %109, %110, %111, %112, %113, %114, %115, %116, %117, %118, %119, %120, " + "%121, %122, %123, %124, %125, %126, %127}, [%128];" + : "=r"(__out[0]), + "=r"(__out[1]), + "=r"(__out[2]), + "=r"(__out[3]), + "=r"(__out[4]), + "=r"(__out[5]), + "=r"(__out[6]), + "=r"(__out[7]), + "=r"(__out[8]), + "=r"(__out[9]), + "=r"(__out[10]), + "=r"(__out[11]), + "=r"(__out[12]), + "=r"(__out[13]), + "=r"(__out[14]), + "=r"(__out[15]), + "=r"(__out[16]), + "=r"(__out[17]), + "=r"(__out[18]), + "=r"(__out[19]), + "=r"(__out[20]), + "=r"(__out[21]), + "=r"(__out[22]), + "=r"(__out[23]), + "=r"(__out[24]), + "=r"(__out[25]), + "=r"(__out[26]), + "=r"(__out[27]), + "=r"(__out[28]), + "=r"(__out[29]), + "=r"(__out[30]), + "=r"(__out[31]), + "=r"(__out[32]), + "=r"(__out[33]), + "=r"(__out[34]), + "=r"(__out[35]), + "=r"(__out[36]), + "=r"(__out[37]), + "=r"(__out[38]), + "=r"(__out[39]), + "=r"(__out[40]), + "=r"(__out[41]), + "=r"(__out[42]), + "=r"(__out[43]), + "=r"(__out[44]), + "=r"(__out[45]), + "=r"(__out[46]), + "=r"(__out[47]), + "=r"(__out[48]), + "=r"(__out[49]), + "=r"(__out[50]), + "=r"(__out[51]), + "=r"(__out[52]), + "=r"(__out[53]), + "=r"(__out[54]), + "=r"(__out[55]), + "=r"(__out[56]), + "=r"(__out[57]), + "=r"(__out[58]), + "=r"(__out[59]), + "=r"(__out[60]), + "=r"(__out[61]), + "=r"(__out[62]), + "=r"(__out[63]), + "=r"(__out[64]), + "=r"(__out[65]), + "=r"(__out[66]), + "=r"(__out[67]), + "=r"(__out[68]), + "=r"(__out[69]), + "=r"(__out[70]), + "=r"(__out[71]), + "=r"(__out[72]), + "=r"(__out[73]), + "=r"(__out[74]), + "=r"(__out[75]), + "=r"(__out[76]), + "=r"(__out[77]), + "=r"(__out[78]), + "=r"(__out[79]), + "=r"(__out[80]), + "=r"(__out[81]), + "=r"(__out[82]), + "=r"(__out[83]), + "=r"(__out[84]), + "=r"(__out[85]), + "=r"(__out[86]), + "=r"(__out[87]), + "=r"(__out[88]), + "=r"(__out[89]), + "=r"(__out[90]), + "=r"(__out[91]), + "=r"(__out[92]), + "=r"(__out[93]), + "=r"(__out[94]), + "=r"(__out[95]), + "=r"(__out[96]), + "=r"(__out[97]), + "=r"(__out[98]), + "=r"(__out[99]), + "=r"(__out[100]), + "=r"(__out[101]), + "=r"(__out[102]), + "=r"(__out[103]), + "=r"(__out[104]), + "=r"(__out[105]), + "=r"(__out[106]), + "=r"(__out[107]), + "=r"(__out[108]), + "=r"(__out[109]), + "=r"(__out[110]), + "=r"(__out[111]), + "=r"(__out[112]), + "=r"(__out[113]), + "=r"(__out[114]), + "=r"(__out[115]), + "=r"(__out[116]), + "=r"(__out[117]), + "=r"(__out[118]), + "=r"(__out[119]), + "=r"(__out[120]), + "=r"(__out[121]), + "=r"(__out[122]), + "=r"(__out[123]), + "=r"(__out[124]), + "=r"(__out[125]), + "=r"(__out[126]), + "=r"(__out[127]) + : "r"(__taddr) + : "memory"); +# else + // Unsupported architectures will have a linker error with a semi-decent error message + __cuda_ptx_tcgen05_ld_32x32b_pack_16b_is_not_supported_before_SM_100a_SM_101a__(); +# endif +} +#endif // __cccl_ptx_isa >= 860 + +/* +// tcgen05.ld.sync.aligned.16x32bx2.x1.b32 out, [taddr], immHalfSplitoff; // PTX ISA 86, SM_100a, SM_101a +template = true, int N32> +__device__ static inline void tcgen05_ld_16x32bx2( + B32 (&out)[1], + uint32_t taddr, + cuda::ptx::n32_t 
immHalfSplitoff); +*/ +#if __cccl_ptx_isa >= 860 +extern "C" _CCCL_DEVICE void __cuda_ptx_tcgen05_ld_16x32bx2_is_not_supported_before_SM_100a_SM_101a__(); +template = true, int _N32> +_CCCL_DEVICE static inline void +tcgen05_ld_16x32bx2(_B32 (&__out)[1], _CUDA_VSTD::uint32_t __taddr, n32_t<_N32> __immHalfSplitoff) +{ + static_assert(sizeof(_B32) == 4, ""); +# if _CCCL_CUDA_COMPILER(NVHPC) || __CUDA_ARCH_FEAT_SM100_ALL || __CUDA_ARCH_FEAT_SM101_ALL + asm("tcgen05.ld.sync.aligned.16x32bx2.x1.b32 {%0}, [%1], %2;" + : "=r"(__out[0]) + : "r"(__taddr), "n"(__immHalfSplitoff.value) + : "memory"); +# else + // Unsupported architectures will have a linker error with a semi-decent error message + __cuda_ptx_tcgen05_ld_16x32bx2_is_not_supported_before_SM_100a_SM_101a__(); +# endif +} +#endif // __cccl_ptx_isa >= 860 + +/* +// tcgen05.ld.sync.aligned.16x32bx2.x1.pack::16b.b32 out, [taddr], immHalfSplitoff; // PTX ISA 86, SM_100a, SM_101a +template = true, int N32> +__device__ static inline void tcgen05_ld_16x32bx2_pack_16b( + B32 (&out)[1], + uint32_t taddr, + cuda::ptx::n32_t immHalfSplitoff); +*/ +#if __cccl_ptx_isa >= 860 +extern "C" _CCCL_DEVICE void __cuda_ptx_tcgen05_ld_16x32bx2_pack_16b_is_not_supported_before_SM_100a_SM_101a__(); +template = true, int _N32> +_CCCL_DEVICE static inline void +tcgen05_ld_16x32bx2_pack_16b(_B32 (&__out)[1], _CUDA_VSTD::uint32_t __taddr, n32_t<_N32> __immHalfSplitoff) +{ + static_assert(sizeof(_B32) == 4, ""); +# if _CCCL_CUDA_COMPILER(NVHPC) || __CUDA_ARCH_FEAT_SM100_ALL || __CUDA_ARCH_FEAT_SM101_ALL + asm("tcgen05.ld.sync.aligned.16x32bx2.x1.pack::16b.b32 {%0}, [%1], %2;" + : "=r"(__out[0]) + : "r"(__taddr), "n"(__immHalfSplitoff.value) + : "memory"); +# else + // Unsupported architectures will have a linker error with a semi-decent error message + __cuda_ptx_tcgen05_ld_16x32bx2_pack_16b_is_not_supported_before_SM_100a_SM_101a__(); +# endif +} +#endif // __cccl_ptx_isa >= 860 + +/* +// tcgen05.ld.sync.aligned.16x32bx2.x2.b32 out, [taddr], immHalfSplitoff; // PTX ISA 86, SM_100a, SM_101a +template = true, int N32> +__device__ static inline void tcgen05_ld_16x32bx2( + B32 (&out)[2], + uint32_t taddr, + cuda::ptx::n32_t immHalfSplitoff); +*/ +#if __cccl_ptx_isa >= 860 +extern "C" _CCCL_DEVICE void __cuda_ptx_tcgen05_ld_16x32bx2_is_not_supported_before_SM_100a_SM_101a__(); +template = true, int _N32> +_CCCL_DEVICE static inline void +tcgen05_ld_16x32bx2(_B32 (&__out)[2], _CUDA_VSTD::uint32_t __taddr, n32_t<_N32> __immHalfSplitoff) +{ + static_assert(sizeof(_B32) == 4, ""); +# if _CCCL_CUDA_COMPILER(NVHPC) || __CUDA_ARCH_FEAT_SM100_ALL || __CUDA_ARCH_FEAT_SM101_ALL + asm("tcgen05.ld.sync.aligned.16x32bx2.x2.b32 {%0, %1}, [%2], %3;" + : "=r"(__out[0]), "=r"(__out[1]) + : "r"(__taddr), "n"(__immHalfSplitoff.value) + : "memory"); +# else + // Unsupported architectures will have a linker error with a semi-decent error message + __cuda_ptx_tcgen05_ld_16x32bx2_is_not_supported_before_SM_100a_SM_101a__(); +# endif +} +#endif // __cccl_ptx_isa >= 860 + +/* +// tcgen05.ld.sync.aligned.16x32bx2.x2.pack::16b.b32 out, [taddr], immHalfSplitoff; // PTX ISA 86, SM_100a, SM_101a +template = true, int N32> +__device__ static inline void tcgen05_ld_16x32bx2_pack_16b( + B32 (&out)[2], + uint32_t taddr, + cuda::ptx::n32_t immHalfSplitoff); +*/ +#if __cccl_ptx_isa >= 860 +extern "C" _CCCL_DEVICE void __cuda_ptx_tcgen05_ld_16x32bx2_pack_16b_is_not_supported_before_SM_100a_SM_101a__(); +template = true, int _N32> +_CCCL_DEVICE static inline void +tcgen05_ld_16x32bx2_pack_16b(_B32 
(&__out)[2], _CUDA_VSTD::uint32_t __taddr, n32_t<_N32> __immHalfSplitoff) +{ + static_assert(sizeof(_B32) == 4, ""); +# if _CCCL_CUDA_COMPILER(NVHPC) || __CUDA_ARCH_FEAT_SM100_ALL || __CUDA_ARCH_FEAT_SM101_ALL + asm("tcgen05.ld.sync.aligned.16x32bx2.x2.pack::16b.b32 {%0, %1}, [%2], %3;" + : "=r"(__out[0]), "=r"(__out[1]) + : "r"(__taddr), "n"(__immHalfSplitoff.value) + : "memory"); +# else + // Unsupported architectures will have a linker error with a semi-decent error message + __cuda_ptx_tcgen05_ld_16x32bx2_pack_16b_is_not_supported_before_SM_100a_SM_101a__(); +# endif +} +#endif // __cccl_ptx_isa >= 860 + +/* +// tcgen05.ld.sync.aligned.16x32bx2.x4.b32 out, [taddr], immHalfSplitoff; // PTX ISA 86, SM_100a, SM_101a +template = true, int N32> +__device__ static inline void tcgen05_ld_16x32bx2( + B32 (&out)[4], + uint32_t taddr, + cuda::ptx::n32_t immHalfSplitoff); +*/ +#if __cccl_ptx_isa >= 860 +extern "C" _CCCL_DEVICE void __cuda_ptx_tcgen05_ld_16x32bx2_is_not_supported_before_SM_100a_SM_101a__(); +template = true, int _N32> +_CCCL_DEVICE static inline void +tcgen05_ld_16x32bx2(_B32 (&__out)[4], _CUDA_VSTD::uint32_t __taddr, n32_t<_N32> __immHalfSplitoff) +{ + static_assert(sizeof(_B32) == 4, ""); +# if _CCCL_CUDA_COMPILER(NVHPC) || __CUDA_ARCH_FEAT_SM100_ALL || __CUDA_ARCH_FEAT_SM101_ALL + asm("tcgen05.ld.sync.aligned.16x32bx2.x4.b32 {%0, %1, %2, %3}, [%4], %5;" + : "=r"(__out[0]), "=r"(__out[1]), "=r"(__out[2]), "=r"(__out[3]) + : "r"(__taddr), "n"(__immHalfSplitoff.value) + : "memory"); +# else + // Unsupported architectures will have a linker error with a semi-decent error message + __cuda_ptx_tcgen05_ld_16x32bx2_is_not_supported_before_SM_100a_SM_101a__(); +# endif +} +#endif // __cccl_ptx_isa >= 860 + +/* +// tcgen05.ld.sync.aligned.16x32bx2.x4.pack::16b.b32 out, [taddr], immHalfSplitoff; // PTX ISA 86, SM_100a, SM_101a +template = true, int N32> +__device__ static inline void tcgen05_ld_16x32bx2_pack_16b( + B32 (&out)[4], + uint32_t taddr, + cuda::ptx::n32_t immHalfSplitoff); +*/ +#if __cccl_ptx_isa >= 860 +extern "C" _CCCL_DEVICE void __cuda_ptx_tcgen05_ld_16x32bx2_pack_16b_is_not_supported_before_SM_100a_SM_101a__(); +template = true, int _N32> +_CCCL_DEVICE static inline void +tcgen05_ld_16x32bx2_pack_16b(_B32 (&__out)[4], _CUDA_VSTD::uint32_t __taddr, n32_t<_N32> __immHalfSplitoff) +{ + static_assert(sizeof(_B32) == 4, ""); +# if _CCCL_CUDA_COMPILER(NVHPC) || __CUDA_ARCH_FEAT_SM100_ALL || __CUDA_ARCH_FEAT_SM101_ALL + asm("tcgen05.ld.sync.aligned.16x32bx2.x4.pack::16b.b32 {%0, %1, %2, %3}, [%4], %5;" + : "=r"(__out[0]), "=r"(__out[1]), "=r"(__out[2]), "=r"(__out[3]) + : "r"(__taddr), "n"(__immHalfSplitoff.value) + : "memory"); +# else + // Unsupported architectures will have a linker error with a semi-decent error message + __cuda_ptx_tcgen05_ld_16x32bx2_pack_16b_is_not_supported_before_SM_100a_SM_101a__(); +# endif +} +#endif // __cccl_ptx_isa >= 860 + +/* +// tcgen05.ld.sync.aligned.16x32bx2.x8.b32 out, [taddr], immHalfSplitoff; // PTX ISA 86, SM_100a, SM_101a +template = true, int N32> +__device__ static inline void tcgen05_ld_16x32bx2( + B32 (&out)[8], + uint32_t taddr, + cuda::ptx::n32_t immHalfSplitoff); +*/ +#if __cccl_ptx_isa >= 860 +extern "C" _CCCL_DEVICE void __cuda_ptx_tcgen05_ld_16x32bx2_is_not_supported_before_SM_100a_SM_101a__(); +template = true, int _N32> +_CCCL_DEVICE static inline void +tcgen05_ld_16x32bx2(_B32 (&__out)[8], _CUDA_VSTD::uint32_t __taddr, n32_t<_N32> __immHalfSplitoff) +{ + static_assert(sizeof(_B32) == 4, ""); +# if _CCCL_CUDA_COMPILER(NVHPC) 
|| __CUDA_ARCH_FEAT_SM100_ALL || __CUDA_ARCH_FEAT_SM101_ALL + asm("tcgen05.ld.sync.aligned.16x32bx2.x8.b32 {%0, %1, %2, %3, %4, %5, %6, %7}, [%8], %9;" + : "=r"(__out[0]), + "=r"(__out[1]), + "=r"(__out[2]), + "=r"(__out[3]), + "=r"(__out[4]), + "=r"(__out[5]), + "=r"(__out[6]), + "=r"(__out[7]) + : "r"(__taddr), "n"(__immHalfSplitoff.value) + : "memory"); +# else + // Unsupported architectures will have a linker error with a semi-decent error message + __cuda_ptx_tcgen05_ld_16x32bx2_is_not_supported_before_SM_100a_SM_101a__(); +# endif +} +#endif // __cccl_ptx_isa >= 860 + +/* +// tcgen05.ld.sync.aligned.16x32bx2.x8.pack::16b.b32 out, [taddr], immHalfSplitoff; // PTX ISA 86, SM_100a, SM_101a +template = true, int N32> +__device__ static inline void tcgen05_ld_16x32bx2_pack_16b( + B32 (&out)[8], + uint32_t taddr, + cuda::ptx::n32_t immHalfSplitoff); +*/ +#if __cccl_ptx_isa >= 860 +extern "C" _CCCL_DEVICE void __cuda_ptx_tcgen05_ld_16x32bx2_pack_16b_is_not_supported_before_SM_100a_SM_101a__(); +template = true, int _N32> +_CCCL_DEVICE static inline void +tcgen05_ld_16x32bx2_pack_16b(_B32 (&__out)[8], _CUDA_VSTD::uint32_t __taddr, n32_t<_N32> __immHalfSplitoff) +{ + static_assert(sizeof(_B32) == 4, ""); +# if _CCCL_CUDA_COMPILER(NVHPC) || __CUDA_ARCH_FEAT_SM100_ALL || __CUDA_ARCH_FEAT_SM101_ALL + asm("tcgen05.ld.sync.aligned.16x32bx2.x8.pack::16b.b32 {%0, %1, %2, %3, %4, %5, %6, %7}, [%8], %9;" + : "=r"(__out[0]), + "=r"(__out[1]), + "=r"(__out[2]), + "=r"(__out[3]), + "=r"(__out[4]), + "=r"(__out[5]), + "=r"(__out[6]), + "=r"(__out[7]) + : "r"(__taddr), "n"(__immHalfSplitoff.value) + : "memory"); +# else + // Unsupported architectures will have a linker error with a semi-decent error message + __cuda_ptx_tcgen05_ld_16x32bx2_pack_16b_is_not_supported_before_SM_100a_SM_101a__(); +# endif +} +#endif // __cccl_ptx_isa >= 860 + +/* +// tcgen05.ld.sync.aligned.16x32bx2.x16.b32 out, [taddr], immHalfSplitoff; // PTX ISA 86, SM_100a, SM_101a +template = true, int N32> +__device__ static inline void tcgen05_ld_16x32bx2( + B32 (&out)[16], + uint32_t taddr, + cuda::ptx::n32_t immHalfSplitoff); +*/ +#if __cccl_ptx_isa >= 860 +extern "C" _CCCL_DEVICE void __cuda_ptx_tcgen05_ld_16x32bx2_is_not_supported_before_SM_100a_SM_101a__(); +template = true, int _N32> +_CCCL_DEVICE static inline void +tcgen05_ld_16x32bx2(_B32 (&__out)[16], _CUDA_VSTD::uint32_t __taddr, n32_t<_N32> __immHalfSplitoff) +{ + static_assert(sizeof(_B32) == 4, ""); +# if _CCCL_CUDA_COMPILER(NVHPC) || __CUDA_ARCH_FEAT_SM100_ALL || __CUDA_ARCH_FEAT_SM101_ALL + asm("tcgen05.ld.sync.aligned.16x32bx2.x16.b32 {%0, %1, %2, %3, %4, %5, %6, %7, %8, %9, %10, %11, %12, %13, %14, " + "%15}, [%16], %17;" + : "=r"(__out[0]), + "=r"(__out[1]), + "=r"(__out[2]), + "=r"(__out[3]), + "=r"(__out[4]), + "=r"(__out[5]), + "=r"(__out[6]), + "=r"(__out[7]), + "=r"(__out[8]), + "=r"(__out[9]), + "=r"(__out[10]), + "=r"(__out[11]), + "=r"(__out[12]), + "=r"(__out[13]), + "=r"(__out[14]), + "=r"(__out[15]) + : "r"(__taddr), "n"(__immHalfSplitoff.value) + : "memory"); +# else + // Unsupported architectures will have a linker error with a semi-decent error message + __cuda_ptx_tcgen05_ld_16x32bx2_is_not_supported_before_SM_100a_SM_101a__(); +# endif +} +#endif // __cccl_ptx_isa >= 860 + +/* +// tcgen05.ld.sync.aligned.16x32bx2.x16.pack::16b.b32 out, [taddr], immHalfSplitoff; // PTX ISA 86, SM_100a, SM_101a +template = true, int N32> +__device__ static inline void tcgen05_ld_16x32bx2_pack_16b( + B32 (&out)[16], + uint32_t taddr, + cuda::ptx::n32_t immHalfSplitoff); 
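+//
+// Editorial note (illustrative sketch, not generator output): a minimal usage
+// example for the tcgen05_ld_16x32bx2* overloads documented above. The variable
+// names, the splitoff value 16, and the pairing with tcgen05_wait_ld() are
+// assumptions for illustration only; a valid tensor-memory address (e.g. from
+// tcgen05_alloc) and an SM_100a/SM_101a target with PTX ISA 8.6 are assumed.
+//
+//   // assumes: #include <cuda/ptx>
+//   __device__ void load_example(cuda::std::uint32_t tmem_addr)
+//   {
+//     cuda::std::uint32_t regs[16];
+//     // load 16 packed 32-bit elements per thread from tensor memory
+//     cuda::ptx::tcgen05_ld_16x32bx2_pack_16b(regs, tmem_addr, cuda::ptx::n32_t<16>{});
+//     cuda::ptx::tcgen05_wait_ld(); // assumed name of the tcgen05.wait::ld wrapper
+//   }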
+*/ +#if __cccl_ptx_isa >= 860 +extern "C" _CCCL_DEVICE void __cuda_ptx_tcgen05_ld_16x32bx2_pack_16b_is_not_supported_before_SM_100a_SM_101a__(); +template = true, int _N32> +_CCCL_DEVICE static inline void +tcgen05_ld_16x32bx2_pack_16b(_B32 (&__out)[16], _CUDA_VSTD::uint32_t __taddr, n32_t<_N32> __immHalfSplitoff) +{ + static_assert(sizeof(_B32) == 4, ""); +# if _CCCL_CUDA_COMPILER(NVHPC) || __CUDA_ARCH_FEAT_SM100_ALL || __CUDA_ARCH_FEAT_SM101_ALL + asm("tcgen05.ld.sync.aligned.16x32bx2.x16.pack::16b.b32 {%0, %1, %2, %3, %4, %5, %6, %7, %8, %9, %10, %11, %12, %13, " + "%14, %15}, [%16], %17;" + : "=r"(__out[0]), + "=r"(__out[1]), + "=r"(__out[2]), + "=r"(__out[3]), + "=r"(__out[4]), + "=r"(__out[5]), + "=r"(__out[6]), + "=r"(__out[7]), + "=r"(__out[8]), + "=r"(__out[9]), + "=r"(__out[10]), + "=r"(__out[11]), + "=r"(__out[12]), + "=r"(__out[13]), + "=r"(__out[14]), + "=r"(__out[15]) + : "r"(__taddr), "n"(__immHalfSplitoff.value) + : "memory"); +# else + // Unsupported architectures will have a linker error with a semi-decent error message + __cuda_ptx_tcgen05_ld_16x32bx2_pack_16b_is_not_supported_before_SM_100a_SM_101a__(); +# endif +} +#endif // __cccl_ptx_isa >= 860 + +/* +// tcgen05.ld.sync.aligned.16x32bx2.x32.b32 out, [taddr], immHalfSplitoff; // PTX ISA 86, SM_100a, SM_101a +template = true, int N32> +__device__ static inline void tcgen05_ld_16x32bx2( + B32 (&out)[32], + uint32_t taddr, + cuda::ptx::n32_t immHalfSplitoff); +*/ +#if __cccl_ptx_isa >= 860 +extern "C" _CCCL_DEVICE void __cuda_ptx_tcgen05_ld_16x32bx2_is_not_supported_before_SM_100a_SM_101a__(); +template = true, int _N32> +_CCCL_DEVICE static inline void +tcgen05_ld_16x32bx2(_B32 (&__out)[32], _CUDA_VSTD::uint32_t __taddr, n32_t<_N32> __immHalfSplitoff) +{ + static_assert(sizeof(_B32) == 4, ""); +# if _CCCL_CUDA_COMPILER(NVHPC) || __CUDA_ARCH_FEAT_SM100_ALL || __CUDA_ARCH_FEAT_SM101_ALL + asm( + "tcgen05.ld.sync.aligned.16x32bx2.x32.b32 {%0, %1, %2, %3, %4, %5, %6, %7, %8, %9, %10, %11, %12, %13, %14, %15, " + "%16, %17, %18, %19, %20, %21, %22, %23, %24, %25, %26, %27, %28, %29, %30, %31}, [%32], %33;" + : "=r"(__out[0]), + "=r"(__out[1]), + "=r"(__out[2]), + "=r"(__out[3]), + "=r"(__out[4]), + "=r"(__out[5]), + "=r"(__out[6]), + "=r"(__out[7]), + "=r"(__out[8]), + "=r"(__out[9]), + "=r"(__out[10]), + "=r"(__out[11]), + "=r"(__out[12]), + "=r"(__out[13]), + "=r"(__out[14]), + "=r"(__out[15]), + "=r"(__out[16]), + "=r"(__out[17]), + "=r"(__out[18]), + "=r"(__out[19]), + "=r"(__out[20]), + "=r"(__out[21]), + "=r"(__out[22]), + "=r"(__out[23]), + "=r"(__out[24]), + "=r"(__out[25]), + "=r"(__out[26]), + "=r"(__out[27]), + "=r"(__out[28]), + "=r"(__out[29]), + "=r"(__out[30]), + "=r"(__out[31]) + : "r"(__taddr), "n"(__immHalfSplitoff.value) + : "memory"); +# else + // Unsupported architectures will have a linker error with a semi-decent error message + __cuda_ptx_tcgen05_ld_16x32bx2_is_not_supported_before_SM_100a_SM_101a__(); +# endif +} +#endif // __cccl_ptx_isa >= 860 + +/* +// tcgen05.ld.sync.aligned.16x32bx2.x32.pack::16b.b32 out, [taddr], immHalfSplitoff; // PTX ISA 86, SM_100a, SM_101a +template = true, int N32> +__device__ static inline void tcgen05_ld_16x32bx2_pack_16b( + B32 (&out)[32], + uint32_t taddr, + cuda::ptx::n32_t immHalfSplitoff); +*/ +#if __cccl_ptx_isa >= 860 +extern "C" _CCCL_DEVICE void __cuda_ptx_tcgen05_ld_16x32bx2_pack_16b_is_not_supported_before_SM_100a_SM_101a__(); +template = true, int _N32> +_CCCL_DEVICE static inline void +tcgen05_ld_16x32bx2_pack_16b(_B32 (&__out)[32], _CUDA_VSTD::uint32_t 
__taddr, n32_t<_N32> __immHalfSplitoff) +{ + static_assert(sizeof(_B32) == 4, ""); +# if _CCCL_CUDA_COMPILER(NVHPC) || __CUDA_ARCH_FEAT_SM100_ALL || __CUDA_ARCH_FEAT_SM101_ALL + asm( + "tcgen05.ld.sync.aligned.16x32bx2.x32.pack::16b.b32 {%0, %1, %2, %3, %4, %5, %6, %7, %8, %9, %10, %11, %12, %13, " + "%14, %15, %16, %17, %18, %19, %20, %21, %22, %23, %24, %25, %26, %27, %28, %29, %30, %31}, [%32], %33;" + : "=r"(__out[0]), + "=r"(__out[1]), + "=r"(__out[2]), + "=r"(__out[3]), + "=r"(__out[4]), + "=r"(__out[5]), + "=r"(__out[6]), + "=r"(__out[7]), + "=r"(__out[8]), + "=r"(__out[9]), + "=r"(__out[10]), + "=r"(__out[11]), + "=r"(__out[12]), + "=r"(__out[13]), + "=r"(__out[14]), + "=r"(__out[15]), + "=r"(__out[16]), + "=r"(__out[17]), + "=r"(__out[18]), + "=r"(__out[19]), + "=r"(__out[20]), + "=r"(__out[21]), + "=r"(__out[22]), + "=r"(__out[23]), + "=r"(__out[24]), + "=r"(__out[25]), + "=r"(__out[26]), + "=r"(__out[27]), + "=r"(__out[28]), + "=r"(__out[29]), + "=r"(__out[30]), + "=r"(__out[31]) + : "r"(__taddr), "n"(__immHalfSplitoff.value) + : "memory"); +# else + // Unsupported architectures will have a linker error with a semi-decent error message + __cuda_ptx_tcgen05_ld_16x32bx2_pack_16b_is_not_supported_before_SM_100a_SM_101a__(); +# endif +} +#endif // __cccl_ptx_isa >= 860 + +/* +// tcgen05.ld.sync.aligned.16x32bx2.x64.b32 out, [taddr], immHalfSplitoff; // PTX ISA 86, SM_100a, SM_101a +template = true, int N32> +__device__ static inline void tcgen05_ld_16x32bx2( + B32 (&out)[64], + uint32_t taddr, + cuda::ptx::n32_t immHalfSplitoff); +*/ +#if __cccl_ptx_isa >= 860 +extern "C" _CCCL_DEVICE void __cuda_ptx_tcgen05_ld_16x32bx2_is_not_supported_before_SM_100a_SM_101a__(); +template = true, int _N32> +_CCCL_DEVICE static inline void +tcgen05_ld_16x32bx2(_B32 (&__out)[64], _CUDA_VSTD::uint32_t __taddr, n32_t<_N32> __immHalfSplitoff) +{ + static_assert(sizeof(_B32) == 4, ""); +# if _CCCL_CUDA_COMPILER(NVHPC) || __CUDA_ARCH_FEAT_SM100_ALL || __CUDA_ARCH_FEAT_SM101_ALL + asm( + "tcgen05.ld.sync.aligned.16x32bx2.x64.b32 {%0, %1, %2, %3, %4, %5, %6, %7, %8, %9, %10, %11, %12, %13, %14, %15, " + "%16, %17, %18, %19, %20, %21, %22, %23, %24, %25, %26, %27, %28, %29, %30, %31, %32, %33, %34, %35, %36, %37, " + "%38, %39, %40, %41, %42, %43, %44, %45, %46, %47, %48, %49, %50, %51, %52, %53, %54, %55, %56, %57, %58, %59, " + "%60, %61, %62, %63}, [%64], %65;" + : "=r"(__out[0]), + "=r"(__out[1]), + "=r"(__out[2]), + "=r"(__out[3]), + "=r"(__out[4]), + "=r"(__out[5]), + "=r"(__out[6]), + "=r"(__out[7]), + "=r"(__out[8]), + "=r"(__out[9]), + "=r"(__out[10]), + "=r"(__out[11]), + "=r"(__out[12]), + "=r"(__out[13]), + "=r"(__out[14]), + "=r"(__out[15]), + "=r"(__out[16]), + "=r"(__out[17]), + "=r"(__out[18]), + "=r"(__out[19]), + "=r"(__out[20]), + "=r"(__out[21]), + "=r"(__out[22]), + "=r"(__out[23]), + "=r"(__out[24]), + "=r"(__out[25]), + "=r"(__out[26]), + "=r"(__out[27]), + "=r"(__out[28]), + "=r"(__out[29]), + "=r"(__out[30]), + "=r"(__out[31]), + "=r"(__out[32]), + "=r"(__out[33]), + "=r"(__out[34]), + "=r"(__out[35]), + "=r"(__out[36]), + "=r"(__out[37]), + "=r"(__out[38]), + "=r"(__out[39]), + "=r"(__out[40]), + "=r"(__out[41]), + "=r"(__out[42]), + "=r"(__out[43]), + "=r"(__out[44]), + "=r"(__out[45]), + "=r"(__out[46]), + "=r"(__out[47]), + "=r"(__out[48]), + "=r"(__out[49]), + "=r"(__out[50]), + "=r"(__out[51]), + "=r"(__out[52]), + "=r"(__out[53]), + "=r"(__out[54]), + "=r"(__out[55]), + "=r"(__out[56]), + "=r"(__out[57]), + "=r"(__out[58]), + "=r"(__out[59]), + "=r"(__out[60]), + 
"=r"(__out[61]), + "=r"(__out[62]), + "=r"(__out[63]) + : "r"(__taddr), "n"(__immHalfSplitoff.value) + : "memory"); +# else + // Unsupported architectures will have a linker error with a semi-decent error message + __cuda_ptx_tcgen05_ld_16x32bx2_is_not_supported_before_SM_100a_SM_101a__(); +# endif +} +#endif // __cccl_ptx_isa >= 860 + +/* +// tcgen05.ld.sync.aligned.16x32bx2.x64.pack::16b.b32 out, [taddr], immHalfSplitoff; // PTX ISA 86, SM_100a, SM_101a +template = true, int N32> +__device__ static inline void tcgen05_ld_16x32bx2_pack_16b( + B32 (&out)[64], + uint32_t taddr, + cuda::ptx::n32_t immHalfSplitoff); +*/ +#if __cccl_ptx_isa >= 860 +extern "C" _CCCL_DEVICE void __cuda_ptx_tcgen05_ld_16x32bx2_pack_16b_is_not_supported_before_SM_100a_SM_101a__(); +template = true, int _N32> +_CCCL_DEVICE static inline void +tcgen05_ld_16x32bx2_pack_16b(_B32 (&__out)[64], _CUDA_VSTD::uint32_t __taddr, n32_t<_N32> __immHalfSplitoff) +{ + static_assert(sizeof(_B32) == 4, ""); +# if _CCCL_CUDA_COMPILER(NVHPC) || __CUDA_ARCH_FEAT_SM100_ALL || __CUDA_ARCH_FEAT_SM101_ALL + asm( + "tcgen05.ld.sync.aligned.16x32bx2.x64.pack::16b.b32 {%0, %1, %2, %3, %4, %5, %6, %7, %8, %9, %10, %11, %12, %13, " + "%14, %15, %16, %17, %18, %19, %20, %21, %22, %23, %24, %25, %26, %27, %28, %29, %30, %31, %32, %33, %34, %35, " + "%36, %37, %38, %39, %40, %41, %42, %43, %44, %45, %46, %47, %48, %49, %50, %51, %52, %53, %54, %55, %56, %57, " + "%58, %59, %60, %61, %62, %63}, [%64], %65;" + : "=r"(__out[0]), + "=r"(__out[1]), + "=r"(__out[2]), + "=r"(__out[3]), + "=r"(__out[4]), + "=r"(__out[5]), + "=r"(__out[6]), + "=r"(__out[7]), + "=r"(__out[8]), + "=r"(__out[9]), + "=r"(__out[10]), + "=r"(__out[11]), + "=r"(__out[12]), + "=r"(__out[13]), + "=r"(__out[14]), + "=r"(__out[15]), + "=r"(__out[16]), + "=r"(__out[17]), + "=r"(__out[18]), + "=r"(__out[19]), + "=r"(__out[20]), + "=r"(__out[21]), + "=r"(__out[22]), + "=r"(__out[23]), + "=r"(__out[24]), + "=r"(__out[25]), + "=r"(__out[26]), + "=r"(__out[27]), + "=r"(__out[28]), + "=r"(__out[29]), + "=r"(__out[30]), + "=r"(__out[31]), + "=r"(__out[32]), + "=r"(__out[33]), + "=r"(__out[34]), + "=r"(__out[35]), + "=r"(__out[36]), + "=r"(__out[37]), + "=r"(__out[38]), + "=r"(__out[39]), + "=r"(__out[40]), + "=r"(__out[41]), + "=r"(__out[42]), + "=r"(__out[43]), + "=r"(__out[44]), + "=r"(__out[45]), + "=r"(__out[46]), + "=r"(__out[47]), + "=r"(__out[48]), + "=r"(__out[49]), + "=r"(__out[50]), + "=r"(__out[51]), + "=r"(__out[52]), + "=r"(__out[53]), + "=r"(__out[54]), + "=r"(__out[55]), + "=r"(__out[56]), + "=r"(__out[57]), + "=r"(__out[58]), + "=r"(__out[59]), + "=r"(__out[60]), + "=r"(__out[61]), + "=r"(__out[62]), + "=r"(__out[63]) + : "r"(__taddr), "n"(__immHalfSplitoff.value) + : "memory"); +# else + // Unsupported architectures will have a linker error with a semi-decent error message + __cuda_ptx_tcgen05_ld_16x32bx2_pack_16b_is_not_supported_before_SM_100a_SM_101a__(); +# endif +} +#endif // __cccl_ptx_isa >= 860 + +/* +// tcgen05.ld.sync.aligned.16x32bx2.x128.b32 out, [taddr], immHalfSplitoff; // PTX ISA 86, SM_100a, SM_101a +template = true, int N32> +__device__ static inline void tcgen05_ld_16x32bx2( + B32 (&out)[128], + uint32_t taddr, + cuda::ptx::n32_t immHalfSplitoff); +*/ +#if __cccl_ptx_isa >= 860 +extern "C" _CCCL_DEVICE void __cuda_ptx_tcgen05_ld_16x32bx2_is_not_supported_before_SM_100a_SM_101a__(); +template = true, int _N32> +_CCCL_DEVICE static inline void +tcgen05_ld_16x32bx2(_B32 (&__out)[128], _CUDA_VSTD::uint32_t __taddr, n32_t<_N32> __immHalfSplitoff) +{ + 
static_assert(sizeof(_B32) == 4, ""); +# if _CCCL_CUDA_COMPILER(NVHPC) || __CUDA_ARCH_FEAT_SM100_ALL || __CUDA_ARCH_FEAT_SM101_ALL + asm( + "tcgen05.ld.sync.aligned.16x32bx2.x128.b32 {%0, %1, %2, %3, %4, %5, %6, %7, %8, %9, %10, %11, %12, %13, %14, %15, " + "%16, %17, %18, %19, %20, %21, %22, %23, %24, %25, %26, %27, %28, %29, %30, %31, %32, %33, %34, %35, %36, %37, " + "%38, %39, %40, %41, %42, %43, %44, %45, %46, %47, %48, %49, %50, %51, %52, %53, %54, %55, %56, %57, %58, %59, " + "%60, %61, %62, %63, %64, %65, %66, %67, %68, %69, %70, %71, %72, %73, %74, %75, %76, %77, %78, %79, %80, %81, " + "%82, %83, %84, %85, %86, %87, %88, %89, %90, %91, %92, %93, %94, %95, %96, %97, %98, %99, %100, %101, %102, %103, " + "%104, %105, %106, %107, %108, %109, %110, %111, %112, %113, %114, %115, %116, %117, %118, %119, %120, %121, %122, " + "%123, %124, %125, %126, %127}, [%128], %129;" + : "=r"(__out[0]), + "=r"(__out[1]), + "=r"(__out[2]), + "=r"(__out[3]), + "=r"(__out[4]), + "=r"(__out[5]), + "=r"(__out[6]), + "=r"(__out[7]), + "=r"(__out[8]), + "=r"(__out[9]), + "=r"(__out[10]), + "=r"(__out[11]), + "=r"(__out[12]), + "=r"(__out[13]), + "=r"(__out[14]), + "=r"(__out[15]), + "=r"(__out[16]), + "=r"(__out[17]), + "=r"(__out[18]), + "=r"(__out[19]), + "=r"(__out[20]), + "=r"(__out[21]), + "=r"(__out[22]), + "=r"(__out[23]), + "=r"(__out[24]), + "=r"(__out[25]), + "=r"(__out[26]), + "=r"(__out[27]), + "=r"(__out[28]), + "=r"(__out[29]), + "=r"(__out[30]), + "=r"(__out[31]), + "=r"(__out[32]), + "=r"(__out[33]), + "=r"(__out[34]), + "=r"(__out[35]), + "=r"(__out[36]), + "=r"(__out[37]), + "=r"(__out[38]), + "=r"(__out[39]), + "=r"(__out[40]), + "=r"(__out[41]), + "=r"(__out[42]), + "=r"(__out[43]), + "=r"(__out[44]), + "=r"(__out[45]), + "=r"(__out[46]), + "=r"(__out[47]), + "=r"(__out[48]), + "=r"(__out[49]), + "=r"(__out[50]), + "=r"(__out[51]), + "=r"(__out[52]), + "=r"(__out[53]), + "=r"(__out[54]), + "=r"(__out[55]), + "=r"(__out[56]), + "=r"(__out[57]), + "=r"(__out[58]), + "=r"(__out[59]), + "=r"(__out[60]), + "=r"(__out[61]), + "=r"(__out[62]), + "=r"(__out[63]), + "=r"(__out[64]), + "=r"(__out[65]), + "=r"(__out[66]), + "=r"(__out[67]), + "=r"(__out[68]), + "=r"(__out[69]), + "=r"(__out[70]), + "=r"(__out[71]), + "=r"(__out[72]), + "=r"(__out[73]), + "=r"(__out[74]), + "=r"(__out[75]), + "=r"(__out[76]), + "=r"(__out[77]), + "=r"(__out[78]), + "=r"(__out[79]), + "=r"(__out[80]), + "=r"(__out[81]), + "=r"(__out[82]), + "=r"(__out[83]), + "=r"(__out[84]), + "=r"(__out[85]), + "=r"(__out[86]), + "=r"(__out[87]), + "=r"(__out[88]), + "=r"(__out[89]), + "=r"(__out[90]), + "=r"(__out[91]), + "=r"(__out[92]), + "=r"(__out[93]), + "=r"(__out[94]), + "=r"(__out[95]), + "=r"(__out[96]), + "=r"(__out[97]), + "=r"(__out[98]), + "=r"(__out[99]), + "=r"(__out[100]), + "=r"(__out[101]), + "=r"(__out[102]), + "=r"(__out[103]), + "=r"(__out[104]), + "=r"(__out[105]), + "=r"(__out[106]), + "=r"(__out[107]), + "=r"(__out[108]), + "=r"(__out[109]), + "=r"(__out[110]), + "=r"(__out[111]), + "=r"(__out[112]), + "=r"(__out[113]), + "=r"(__out[114]), + "=r"(__out[115]), + "=r"(__out[116]), + "=r"(__out[117]), + "=r"(__out[118]), + "=r"(__out[119]), + "=r"(__out[120]), + "=r"(__out[121]), + "=r"(__out[122]), + "=r"(__out[123]), + "=r"(__out[124]), + "=r"(__out[125]), + "=r"(__out[126]), + "=r"(__out[127]) + : "r"(__taddr), "n"(__immHalfSplitoff.value) + : "memory"); +# else + // Unsupported architectures will have a linker error with a semi-decent error message + 
__cuda_ptx_tcgen05_ld_16x32bx2_is_not_supported_before_SM_100a_SM_101a__(); +# endif +} +#endif // __cccl_ptx_isa >= 860 + +/* +// tcgen05.ld.sync.aligned.16x32bx2.x128.pack::16b.b32 out, [taddr], immHalfSplitoff; // PTX ISA 86, SM_100a, SM_101a +template = true, int N32> +__device__ static inline void tcgen05_ld_16x32bx2_pack_16b( + B32 (&out)[128], + uint32_t taddr, + cuda::ptx::n32_t immHalfSplitoff); +*/ +#if __cccl_ptx_isa >= 860 +extern "C" _CCCL_DEVICE void __cuda_ptx_tcgen05_ld_16x32bx2_pack_16b_is_not_supported_before_SM_100a_SM_101a__(); +template = true, int _N32> +_CCCL_DEVICE static inline void +tcgen05_ld_16x32bx2_pack_16b(_B32 (&__out)[128], _CUDA_VSTD::uint32_t __taddr, n32_t<_N32> __immHalfSplitoff) +{ + static_assert(sizeof(_B32) == 4, ""); +# if _CCCL_CUDA_COMPILER(NVHPC) || __CUDA_ARCH_FEAT_SM100_ALL || __CUDA_ARCH_FEAT_SM101_ALL + asm( + "tcgen05.ld.sync.aligned.16x32bx2.x128.pack::16b.b32 {%0, %1, %2, %3, %4, %5, %6, %7, %8, %9, %10, %11, %12, %13, " + "%14, %15, %16, %17, %18, %19, %20, %21, %22, %23, %24, %25, %26, %27, %28, %29, %30, %31, %32, %33, %34, %35, " + "%36, %37, %38, %39, %40, %41, %42, %43, %44, %45, %46, %47, %48, %49, %50, %51, %52, %53, %54, %55, %56, %57, " + "%58, %59, %60, %61, %62, %63, %64, %65, %66, %67, %68, %69, %70, %71, %72, %73, %74, %75, %76, %77, %78, %79, " + "%80, %81, %82, %83, %84, %85, %86, %87, %88, %89, %90, %91, %92, %93, %94, %95, %96, %97, %98, %99, %100, %101, " + "%102, %103, %104, %105, %106, %107, %108, %109, %110, %111, %112, %113, %114, %115, %116, %117, %118, %119, %120, " + "%121, %122, %123, %124, %125, %126, %127}, [%128], %129;" + : "=r"(__out[0]), + "=r"(__out[1]), + "=r"(__out[2]), + "=r"(__out[3]), + "=r"(__out[4]), + "=r"(__out[5]), + "=r"(__out[6]), + "=r"(__out[7]), + "=r"(__out[8]), + "=r"(__out[9]), + "=r"(__out[10]), + "=r"(__out[11]), + "=r"(__out[12]), + "=r"(__out[13]), + "=r"(__out[14]), + "=r"(__out[15]), + "=r"(__out[16]), + "=r"(__out[17]), + "=r"(__out[18]), + "=r"(__out[19]), + "=r"(__out[20]), + "=r"(__out[21]), + "=r"(__out[22]), + "=r"(__out[23]), + "=r"(__out[24]), + "=r"(__out[25]), + "=r"(__out[26]), + "=r"(__out[27]), + "=r"(__out[28]), + "=r"(__out[29]), + "=r"(__out[30]), + "=r"(__out[31]), + "=r"(__out[32]), + "=r"(__out[33]), + "=r"(__out[34]), + "=r"(__out[35]), + "=r"(__out[36]), + "=r"(__out[37]), + "=r"(__out[38]), + "=r"(__out[39]), + "=r"(__out[40]), + "=r"(__out[41]), + "=r"(__out[42]), + "=r"(__out[43]), + "=r"(__out[44]), + "=r"(__out[45]), + "=r"(__out[46]), + "=r"(__out[47]), + "=r"(__out[48]), + "=r"(__out[49]), + "=r"(__out[50]), + "=r"(__out[51]), + "=r"(__out[52]), + "=r"(__out[53]), + "=r"(__out[54]), + "=r"(__out[55]), + "=r"(__out[56]), + "=r"(__out[57]), + "=r"(__out[58]), + "=r"(__out[59]), + "=r"(__out[60]), + "=r"(__out[61]), + "=r"(__out[62]), + "=r"(__out[63]), + "=r"(__out[64]), + "=r"(__out[65]), + "=r"(__out[66]), + "=r"(__out[67]), + "=r"(__out[68]), + "=r"(__out[69]), + "=r"(__out[70]), + "=r"(__out[71]), + "=r"(__out[72]), + "=r"(__out[73]), + "=r"(__out[74]), + "=r"(__out[75]), + "=r"(__out[76]), + "=r"(__out[77]), + "=r"(__out[78]), + "=r"(__out[79]), + "=r"(__out[80]), + "=r"(__out[81]), + "=r"(__out[82]), + "=r"(__out[83]), + "=r"(__out[84]), + "=r"(__out[85]), + "=r"(__out[86]), + "=r"(__out[87]), + "=r"(__out[88]), + "=r"(__out[89]), + "=r"(__out[90]), + "=r"(__out[91]), + "=r"(__out[92]), + "=r"(__out[93]), + "=r"(__out[94]), + "=r"(__out[95]), + "=r"(__out[96]), + "=r"(__out[97]), + "=r"(__out[98]), + "=r"(__out[99]), + "=r"(__out[100]), + 
"=r"(__out[101]), + "=r"(__out[102]), + "=r"(__out[103]), + "=r"(__out[104]), + "=r"(__out[105]), + "=r"(__out[106]), + "=r"(__out[107]), + "=r"(__out[108]), + "=r"(__out[109]), + "=r"(__out[110]), + "=r"(__out[111]), + "=r"(__out[112]), + "=r"(__out[113]), + "=r"(__out[114]), + "=r"(__out[115]), + "=r"(__out[116]), + "=r"(__out[117]), + "=r"(__out[118]), + "=r"(__out[119]), + "=r"(__out[120]), + "=r"(__out[121]), + "=r"(__out[122]), + "=r"(__out[123]), + "=r"(__out[124]), + "=r"(__out[125]), + "=r"(__out[126]), + "=r"(__out[127]) + : "r"(__taddr), "n"(__immHalfSplitoff.value) + : "memory"); +# else + // Unsupported architectures will have a linker error with a semi-decent error message + __cuda_ptx_tcgen05_ld_16x32bx2_pack_16b_is_not_supported_before_SM_100a_SM_101a__(); +# endif +} +#endif // __cccl_ptx_isa >= 860 + +#endif // _CUDA_PTX_GENERATED_TCGEN05_LD_H_ diff --git a/libcudacxx/include/cuda/__ptx/instructions/generated/tcgen05_mma.h b/libcudacxx/include/cuda/__ptx/instructions/generated/tcgen05_mma.h new file mode 100644 index 00000000000..58e3f1e8363 --- /dev/null +++ b/libcudacxx/include/cuda/__ptx/instructions/generated/tcgen05_mma.h @@ -0,0 +1,3842 @@ +// This file was automatically generated. Do not edit. + +#ifndef _CUDA_PTX_GENERATED_TCGEN05_MMA_H_ +#define _CUDA_PTX_GENERATED_TCGEN05_MMA_H_ + +/* +// tcgen05.mma.cta_group.kind [d_tmem], a_desc, b_desc, idesc, disable_output_lane, enable_input_d, scale_input_d; // +PTX ISA 86, SM_100a +// .kind = { .kind::f16, .kind::tf32 } +// .cta_group = { .cta_group::1, .cta_group::2 } +template +__device__ static inline void tcgen05_mma( + cuda::ptx::kind_t kind, + cuda::ptx::cta_group_t cta_group, + uint32_t d_tmem, + uint64_t a_desc, + uint64_t b_desc, + uint32_t idesc, + const uint32_t (&disable_output_lane)[4], + bool enable_input_d, + cuda::ptx::n32_t scale_input_d); +*/ +#if __cccl_ptx_isa >= 860 +extern "C" _CCCL_DEVICE void __cuda_ptx_tcgen05_mma_is_not_supported_before_SM_100a__(); +template +_CCCL_DEVICE static inline void tcgen05_mma( + kind_t<_Kind> __kind, + cta_group_t<_Cta_Group> __cta_group, + _CUDA_VSTD::uint32_t __d_tmem, + _CUDA_VSTD::uint64_t __a_desc, + _CUDA_VSTD::uint64_t __b_desc, + _CUDA_VSTD::uint32_t __idesc, + const _CUDA_VSTD::uint32_t (&__disable_output_lane)[4], + bool __enable_input_d, + n32_t<_N32> __scale_input_d) +{ + static_assert(__kind == kind_f16 || __kind == kind_tf32, ""); + static_assert(__cta_group == cta_group_1 || __cta_group == cta_group_2, ""); +# if _CCCL_CUDA_COMPILER(NVHPC) || __CUDA_ARCH_FEAT_SM100_ALL + _CCCL_IF_CONSTEXPR (__kind == kind_f16 && __cta_group == cta_group_1) + { + asm volatile( + "{\n\t .reg .pred PRED_enable_input_d; \n\t" + "setp.ne.b32 PRED_enable_input_d, %8, 0;\n\t" + "tcgen05.mma.cta_group::1.kind::f16 [%0], %1, %2, %3, {%4, %5, %6, %7}, PRED_enable_input_d, %9;\n\t" + "}" + : + : "r"(__d_tmem), + "l"(__a_desc), + "l"(__b_desc), + "r"(__idesc), + "r"(__disable_output_lane[0]), + "r"(__disable_output_lane[1]), + "r"(__disable_output_lane[2]), + "r"(__disable_output_lane[3]), + "r"(static_cast<_CUDA_VSTD::uint32_t>(__enable_input_d)), + "n"(__scale_input_d.value) + : "memory"); + } + else _CCCL_IF_CONSTEXPR (__kind == kind_f16 && __cta_group == cta_group_2) + { + asm volatile( + "{\n\t .reg .pred PRED_enable_input_d; \n\t" + "setp.ne.b32 PRED_enable_input_d, %8, 0;\n\t" + "tcgen05.mma.cta_group::2.kind::f16 [%0], %1, %2, %3, {%4, %5, %6, %7}, PRED_enable_input_d, %9;\n\t" + "}" + : + : "r"(__d_tmem), + "l"(__a_desc), + "l"(__b_desc), + "r"(__idesc), + 
"r"(__disable_output_lane[0]), + "r"(__disable_output_lane[1]), + "r"(__disable_output_lane[2]), + "r"(__disable_output_lane[3]), + "r"(static_cast<_CUDA_VSTD::uint32_t>(__enable_input_d)), + "n"(__scale_input_d.value) + : "memory"); + } + else _CCCL_IF_CONSTEXPR (__kind == kind_tf32 && __cta_group == cta_group_1) + { + asm volatile( + "{\n\t .reg .pred PRED_enable_input_d; \n\t" + "setp.ne.b32 PRED_enable_input_d, %8, 0;\n\t" + "tcgen05.mma.cta_group::1.kind::tf32 [%0], %1, %2, %3, {%4, %5, %6, %7}, PRED_enable_input_d, %9;\n\t" + "}" + : + : "r"(__d_tmem), + "l"(__a_desc), + "l"(__b_desc), + "r"(__idesc), + "r"(__disable_output_lane[0]), + "r"(__disable_output_lane[1]), + "r"(__disable_output_lane[2]), + "r"(__disable_output_lane[3]), + "r"(static_cast<_CUDA_VSTD::uint32_t>(__enable_input_d)), + "n"(__scale_input_d.value) + : "memory"); + } + else _CCCL_IF_CONSTEXPR (__kind == kind_tf32 && __cta_group == cta_group_2) + { + asm volatile( + "{\n\t .reg .pred PRED_enable_input_d; \n\t" + "setp.ne.b32 PRED_enable_input_d, %8, 0;\n\t" + "tcgen05.mma.cta_group::2.kind::tf32 [%0], %1, %2, %3, {%4, %5, %6, %7}, PRED_enable_input_d, %9;\n\t" + "}" + : + : "r"(__d_tmem), + "l"(__a_desc), + "l"(__b_desc), + "r"(__idesc), + "r"(__disable_output_lane[0]), + "r"(__disable_output_lane[1]), + "r"(__disable_output_lane[2]), + "r"(__disable_output_lane[3]), + "r"(static_cast<_CUDA_VSTD::uint32_t>(__enable_input_d)), + "n"(__scale_input_d.value) + : "memory"); + } +# else + // Unsupported architectures will have a linker error with a semi-decent error message + __cuda_ptx_tcgen05_mma_is_not_supported_before_SM_100a__(); +# endif +} +#endif // __cccl_ptx_isa >= 860 + +/* +// tcgen05.mma.cta_group.kind [d_tmem], a_desc, b_desc, idesc, disable_output_lane, enable_input_d; // PTX ISA 86, +SM_100a, SM_101a +// .kind = { .kind::f16, .kind::tf32, .kind::f8f6f4, .kind::i8 } +// .cta_group = { .cta_group::1, .cta_group::2 } +template +__device__ static inline void tcgen05_mma( + cuda::ptx::kind_t kind, + cuda::ptx::cta_group_t cta_group, + uint32_t d_tmem, + uint64_t a_desc, + uint64_t b_desc, + uint32_t idesc, + const uint32_t (&disable_output_lane)[4], + bool enable_input_d); +*/ +#if __cccl_ptx_isa >= 860 +extern "C" _CCCL_DEVICE void __cuda_ptx_tcgen05_mma_is_not_supported_before_SM_100a_SM_101a__(); +template +_CCCL_DEVICE static inline void tcgen05_mma( + kind_t<_Kind> __kind, + cta_group_t<_Cta_Group> __cta_group, + _CUDA_VSTD::uint32_t __d_tmem, + _CUDA_VSTD::uint64_t __a_desc, + _CUDA_VSTD::uint64_t __b_desc, + _CUDA_VSTD::uint32_t __idesc, + const _CUDA_VSTD::uint32_t (&__disable_output_lane)[4], + bool __enable_input_d) +{ + static_assert(__kind == kind_f16 || __kind == kind_tf32 || __kind == kind_f8f6f4 || __kind == kind_i8, ""); + static_assert(__cta_group == cta_group_1 || __cta_group == cta_group_2, ""); +# if _CCCL_CUDA_COMPILER(NVHPC) || __CUDA_ARCH_FEAT_SM100_ALL || __CUDA_ARCH_FEAT_SM101_ALL + _CCCL_IF_CONSTEXPR (__kind == kind_f16 && __cta_group == cta_group_1) + { + asm volatile( + "{\n\t .reg .pred PRED_enable_input_d; \n\t" + "setp.ne.b32 PRED_enable_input_d, %8, 0;\n\t" + "tcgen05.mma.cta_group::1.kind::f16 [%0], %1, %2, %3, {%4, %5, %6, %7}, PRED_enable_input_d;\n\t" + "}" + : + : "r"(__d_tmem), + "l"(__a_desc), + "l"(__b_desc), + "r"(__idesc), + "r"(__disable_output_lane[0]), + "r"(__disable_output_lane[1]), + "r"(__disable_output_lane[2]), + "r"(__disable_output_lane[3]), + "r"(static_cast<_CUDA_VSTD::uint32_t>(__enable_input_d)) + : "memory"); + } + else _CCCL_IF_CONSTEXPR (__kind == 
kind_f16 && __cta_group == cta_group_2) + { + asm volatile( + "{\n\t .reg .pred PRED_enable_input_d; \n\t" + "setp.ne.b32 PRED_enable_input_d, %8, 0;\n\t" + "tcgen05.mma.cta_group::2.kind::f16 [%0], %1, %2, %3, {%4, %5, %6, %7}, PRED_enable_input_d;\n\t" + "}" + : + : "r"(__d_tmem), + "l"(__a_desc), + "l"(__b_desc), + "r"(__idesc), + "r"(__disable_output_lane[0]), + "r"(__disable_output_lane[1]), + "r"(__disable_output_lane[2]), + "r"(__disable_output_lane[3]), + "r"(static_cast<_CUDA_VSTD::uint32_t>(__enable_input_d)) + : "memory"); + } + else _CCCL_IF_CONSTEXPR (__kind == kind_tf32 && __cta_group == cta_group_1) + { + asm volatile( + "{\n\t .reg .pred PRED_enable_input_d; \n\t" + "setp.ne.b32 PRED_enable_input_d, %8, 0;\n\t" + "tcgen05.mma.cta_group::1.kind::tf32 [%0], %1, %2, %3, {%4, %5, %6, %7}, PRED_enable_input_d;\n\t" + "}" + : + : "r"(__d_tmem), + "l"(__a_desc), + "l"(__b_desc), + "r"(__idesc), + "r"(__disable_output_lane[0]), + "r"(__disable_output_lane[1]), + "r"(__disable_output_lane[2]), + "r"(__disable_output_lane[3]), + "r"(static_cast<_CUDA_VSTD::uint32_t>(__enable_input_d)) + : "memory"); + } + else _CCCL_IF_CONSTEXPR (__kind == kind_tf32 && __cta_group == cta_group_2) + { + asm volatile( + "{\n\t .reg .pred PRED_enable_input_d; \n\t" + "setp.ne.b32 PRED_enable_input_d, %8, 0;\n\t" + "tcgen05.mma.cta_group::2.kind::tf32 [%0], %1, %2, %3, {%4, %5, %6, %7}, PRED_enable_input_d;\n\t" + "}" + : + : "r"(__d_tmem), + "l"(__a_desc), + "l"(__b_desc), + "r"(__idesc), + "r"(__disable_output_lane[0]), + "r"(__disable_output_lane[1]), + "r"(__disable_output_lane[2]), + "r"(__disable_output_lane[3]), + "r"(static_cast<_CUDA_VSTD::uint32_t>(__enable_input_d)) + : "memory"); + } + else _CCCL_IF_CONSTEXPR (__kind == kind_f8f6f4 && __cta_group == cta_group_1) + { + asm volatile( + "{\n\t .reg .pred PRED_enable_input_d; \n\t" + "setp.ne.b32 PRED_enable_input_d, %8, 0;\n\t" + "tcgen05.mma.cta_group::1.kind::f8f6f4 [%0], %1, %2, %3, {%4, %5, %6, %7}, PRED_enable_input_d;\n\t" + "}" + : + : "r"(__d_tmem), + "l"(__a_desc), + "l"(__b_desc), + "r"(__idesc), + "r"(__disable_output_lane[0]), + "r"(__disable_output_lane[1]), + "r"(__disable_output_lane[2]), + "r"(__disable_output_lane[3]), + "r"(static_cast<_CUDA_VSTD::uint32_t>(__enable_input_d)) + : "memory"); + } + else _CCCL_IF_CONSTEXPR (__kind == kind_f8f6f4 && __cta_group == cta_group_2) + { + asm volatile( + "{\n\t .reg .pred PRED_enable_input_d; \n\t" + "setp.ne.b32 PRED_enable_input_d, %8, 0;\n\t" + "tcgen05.mma.cta_group::2.kind::f8f6f4 [%0], %1, %2, %3, {%4, %5, %6, %7}, PRED_enable_input_d;\n\t" + "}" + : + : "r"(__d_tmem), + "l"(__a_desc), + "l"(__b_desc), + "r"(__idesc), + "r"(__disable_output_lane[0]), + "r"(__disable_output_lane[1]), + "r"(__disable_output_lane[2]), + "r"(__disable_output_lane[3]), + "r"(static_cast<_CUDA_VSTD::uint32_t>(__enable_input_d)) + : "memory"); + } + else _CCCL_IF_CONSTEXPR (__kind == kind_i8 && __cta_group == cta_group_1) + { + asm volatile( + "{\n\t .reg .pred PRED_enable_input_d; \n\t" + "setp.ne.b32 PRED_enable_input_d, %8, 0;\n\t" + "tcgen05.mma.cta_group::1.kind::i8 [%0], %1, %2, %3, {%4, %5, %6, %7}, PRED_enable_input_d;\n\t" + "}" + : + : "r"(__d_tmem), + "l"(__a_desc), + "l"(__b_desc), + "r"(__idesc), + "r"(__disable_output_lane[0]), + "r"(__disable_output_lane[1]), + "r"(__disable_output_lane[2]), + "r"(__disable_output_lane[3]), + "r"(static_cast<_CUDA_VSTD::uint32_t>(__enable_input_d)) + : "memory"); + } + else _CCCL_IF_CONSTEXPR (__kind == kind_i8 && __cta_group == cta_group_2) + { + asm 
volatile( + "{\n\t .reg .pred PRED_enable_input_d; \n\t" + "setp.ne.b32 PRED_enable_input_d, %8, 0;\n\t" + "tcgen05.mma.cta_group::2.kind::i8 [%0], %1, %2, %3, {%4, %5, %6, %7}, PRED_enable_input_d;\n\t" + "}" + : + : "r"(__d_tmem), + "l"(__a_desc), + "l"(__b_desc), + "r"(__idesc), + "r"(__disable_output_lane[0]), + "r"(__disable_output_lane[1]), + "r"(__disable_output_lane[2]), + "r"(__disable_output_lane[3]), + "r"(static_cast<_CUDA_VSTD::uint32_t>(__enable_input_d)) + : "memory"); + } +# else + // Unsupported architectures will have a linker error with a semi-decent error message + __cuda_ptx_tcgen05_mma_is_not_supported_before_SM_100a_SM_101a__(); +# endif +} +#endif // __cccl_ptx_isa >= 860 + +/* +// tcgen05.mma.cta_group.kind [d_tmem], a_desc, b_desc, idesc, enable_input_d, scale_input_d; // PTX ISA 86, SM_100a +// .kind = { .kind::f16, .kind::tf32 } +// .cta_group = { .cta_group::1, .cta_group::2 } +template +__device__ static inline void tcgen05_mma( + cuda::ptx::kind_t kind, + cuda::ptx::cta_group_t cta_group, + uint32_t d_tmem, + uint64_t a_desc, + uint64_t b_desc, + uint32_t idesc, + bool enable_input_d, + cuda::ptx::n32_t scale_input_d); +*/ +#if __cccl_ptx_isa >= 860 +extern "C" _CCCL_DEVICE void __cuda_ptx_tcgen05_mma_is_not_supported_before_SM_100a__(); +template +_CCCL_DEVICE static inline void tcgen05_mma( + kind_t<_Kind> __kind, + cta_group_t<_Cta_Group> __cta_group, + _CUDA_VSTD::uint32_t __d_tmem, + _CUDA_VSTD::uint64_t __a_desc, + _CUDA_VSTD::uint64_t __b_desc, + _CUDA_VSTD::uint32_t __idesc, + bool __enable_input_d, + n32_t<_N32> __scale_input_d) +{ + static_assert(__kind == kind_f16 || __kind == kind_tf32, ""); + static_assert(__cta_group == cta_group_1 || __cta_group == cta_group_2, ""); +# if _CCCL_CUDA_COMPILER(NVHPC) || __CUDA_ARCH_FEAT_SM100_ALL + _CCCL_IF_CONSTEXPR (__kind == kind_f16 && __cta_group == cta_group_1) + { + asm volatile( + "{\n\t .reg .pred PRED_enable_input_d; \n\t" + "setp.ne.b32 PRED_enable_input_d, %4, 0;\n\t" + "tcgen05.mma.cta_group::1.kind::f16 [%0], %1, %2, %3, PRED_enable_input_d, %5;\n\t" + "}" + : + : "r"(__d_tmem), + "l"(__a_desc), + "l"(__b_desc), + "r"(__idesc), + "r"(static_cast<_CUDA_VSTD::uint32_t>(__enable_input_d)), + "n"(__scale_input_d.value) + : "memory"); + } + else _CCCL_IF_CONSTEXPR (__kind == kind_f16 && __cta_group == cta_group_2) + { + asm volatile( + "{\n\t .reg .pred PRED_enable_input_d; \n\t" + "setp.ne.b32 PRED_enable_input_d, %4, 0;\n\t" + "tcgen05.mma.cta_group::2.kind::f16 [%0], %1, %2, %3, PRED_enable_input_d, %5;\n\t" + "}" + : + : "r"(__d_tmem), + "l"(__a_desc), + "l"(__b_desc), + "r"(__idesc), + "r"(static_cast<_CUDA_VSTD::uint32_t>(__enable_input_d)), + "n"(__scale_input_d.value) + : "memory"); + } + else _CCCL_IF_CONSTEXPR (__kind == kind_tf32 && __cta_group == cta_group_1) + { + asm volatile( + "{\n\t .reg .pred PRED_enable_input_d; \n\t" + "setp.ne.b32 PRED_enable_input_d, %4, 0;\n\t" + "tcgen05.mma.cta_group::1.kind::tf32 [%0], %1, %2, %3, PRED_enable_input_d, %5;\n\t" + "}" + : + : "r"(__d_tmem), + "l"(__a_desc), + "l"(__b_desc), + "r"(__idesc), + "r"(static_cast<_CUDA_VSTD::uint32_t>(__enable_input_d)), + "n"(__scale_input_d.value) + : "memory"); + } + else _CCCL_IF_CONSTEXPR (__kind == kind_tf32 && __cta_group == cta_group_2) + { + asm volatile( + "{\n\t .reg .pred PRED_enable_input_d; \n\t" + "setp.ne.b32 PRED_enable_input_d, %4, 0;\n\t" + "tcgen05.mma.cta_group::2.kind::tf32 [%0], %1, %2, %3, PRED_enable_input_d, %5;\n\t" + "}" + : + : "r"(__d_tmem), + "l"(__a_desc), + "l"(__b_desc), + "r"(__idesc), 
+ "r"(static_cast<_CUDA_VSTD::uint32_t>(__enable_input_d)), + "n"(__scale_input_d.value) + : "memory"); + } +# else + // Unsupported architectures will have a linker error with a semi-decent error message + __cuda_ptx_tcgen05_mma_is_not_supported_before_SM_100a__(); +# endif +} +#endif // __cccl_ptx_isa >= 860 + +/* +// tcgen05.mma.cta_group.kind [d_tmem], a_desc, b_desc, idesc, enable_input_d; // PTX ISA 86, SM_100a, SM_101a +// .kind = { .kind::f16, .kind::tf32, .kind::f8f6f4, .kind::i8 } +// .cta_group = { .cta_group::1, .cta_group::2 } +template +__device__ static inline void tcgen05_mma( + cuda::ptx::kind_t kind, + cuda::ptx::cta_group_t cta_group, + uint32_t d_tmem, + uint64_t a_desc, + uint64_t b_desc, + uint32_t idesc, + bool enable_input_d); +*/ +#if __cccl_ptx_isa >= 860 +extern "C" _CCCL_DEVICE void __cuda_ptx_tcgen05_mma_is_not_supported_before_SM_100a_SM_101a__(); +template +_CCCL_DEVICE static inline void tcgen05_mma( + kind_t<_Kind> __kind, + cta_group_t<_Cta_Group> __cta_group, + _CUDA_VSTD::uint32_t __d_tmem, + _CUDA_VSTD::uint64_t __a_desc, + _CUDA_VSTD::uint64_t __b_desc, + _CUDA_VSTD::uint32_t __idesc, + bool __enable_input_d) +{ + static_assert(__kind == kind_f16 || __kind == kind_tf32 || __kind == kind_f8f6f4 || __kind == kind_i8, ""); + static_assert(__cta_group == cta_group_1 || __cta_group == cta_group_2, ""); +# if _CCCL_CUDA_COMPILER(NVHPC) || __CUDA_ARCH_FEAT_SM100_ALL || __CUDA_ARCH_FEAT_SM101_ALL + _CCCL_IF_CONSTEXPR (__kind == kind_f16 && __cta_group == cta_group_1) + { + asm volatile( + "{\n\t .reg .pred PRED_enable_input_d; \n\t" + "setp.ne.b32 PRED_enable_input_d, %4, 0;\n\t" + "tcgen05.mma.cta_group::1.kind::f16 [%0], %1, %2, %3, PRED_enable_input_d;\n\t" + "}" + : + : "r"(__d_tmem), + "l"(__a_desc), + "l"(__b_desc), + "r"(__idesc), + "r"(static_cast<_CUDA_VSTD::uint32_t>(__enable_input_d)) + : "memory"); + } + else _CCCL_IF_CONSTEXPR (__kind == kind_f16 && __cta_group == cta_group_2) + { + asm volatile( + "{\n\t .reg .pred PRED_enable_input_d; \n\t" + "setp.ne.b32 PRED_enable_input_d, %4, 0;\n\t" + "tcgen05.mma.cta_group::2.kind::f16 [%0], %1, %2, %3, PRED_enable_input_d;\n\t" + "}" + : + : "r"(__d_tmem), + "l"(__a_desc), + "l"(__b_desc), + "r"(__idesc), + "r"(static_cast<_CUDA_VSTD::uint32_t>(__enable_input_d)) + : "memory"); + } + else _CCCL_IF_CONSTEXPR (__kind == kind_tf32 && __cta_group == cta_group_1) + { + asm volatile( + "{\n\t .reg .pred PRED_enable_input_d; \n\t" + "setp.ne.b32 PRED_enable_input_d, %4, 0;\n\t" + "tcgen05.mma.cta_group::1.kind::tf32 [%0], %1, %2, %3, PRED_enable_input_d;\n\t" + "}" + : + : "r"(__d_tmem), + "l"(__a_desc), + "l"(__b_desc), + "r"(__idesc), + "r"(static_cast<_CUDA_VSTD::uint32_t>(__enable_input_d)) + : "memory"); + } + else _CCCL_IF_CONSTEXPR (__kind == kind_tf32 && __cta_group == cta_group_2) + { + asm volatile( + "{\n\t .reg .pred PRED_enable_input_d; \n\t" + "setp.ne.b32 PRED_enable_input_d, %4, 0;\n\t" + "tcgen05.mma.cta_group::2.kind::tf32 [%0], %1, %2, %3, PRED_enable_input_d;\n\t" + "}" + : + : "r"(__d_tmem), + "l"(__a_desc), + "l"(__b_desc), + "r"(__idesc), + "r"(static_cast<_CUDA_VSTD::uint32_t>(__enable_input_d)) + : "memory"); + } + else _CCCL_IF_CONSTEXPR (__kind == kind_f8f6f4 && __cta_group == cta_group_1) + { + asm volatile( + "{\n\t .reg .pred PRED_enable_input_d; \n\t" + "setp.ne.b32 PRED_enable_input_d, %4, 0;\n\t" + "tcgen05.mma.cta_group::1.kind::f8f6f4 [%0], %1, %2, %3, PRED_enable_input_d;\n\t" + "}" + : + : "r"(__d_tmem), + "l"(__a_desc), + "l"(__b_desc), + "r"(__idesc), + 
"r"(static_cast<_CUDA_VSTD::uint32_t>(__enable_input_d)) + : "memory"); + } + else _CCCL_IF_CONSTEXPR (__kind == kind_f8f6f4 && __cta_group == cta_group_2) + { + asm volatile( + "{\n\t .reg .pred PRED_enable_input_d; \n\t" + "setp.ne.b32 PRED_enable_input_d, %4, 0;\n\t" + "tcgen05.mma.cta_group::2.kind::f8f6f4 [%0], %1, %2, %3, PRED_enable_input_d;\n\t" + "}" + : + : "r"(__d_tmem), + "l"(__a_desc), + "l"(__b_desc), + "r"(__idesc), + "r"(static_cast<_CUDA_VSTD::uint32_t>(__enable_input_d)) + : "memory"); + } + else _CCCL_IF_CONSTEXPR (__kind == kind_i8 && __cta_group == cta_group_1) + { + asm volatile( + "{\n\t .reg .pred PRED_enable_input_d; \n\t" + "setp.ne.b32 PRED_enable_input_d, %4, 0;\n\t" + "tcgen05.mma.cta_group::1.kind::i8 [%0], %1, %2, %3, PRED_enable_input_d;\n\t" + "}" + : + : "r"(__d_tmem), + "l"(__a_desc), + "l"(__b_desc), + "r"(__idesc), + "r"(static_cast<_CUDA_VSTD::uint32_t>(__enable_input_d)) + : "memory"); + } + else _CCCL_IF_CONSTEXPR (__kind == kind_i8 && __cta_group == cta_group_2) + { + asm volatile( + "{\n\t .reg .pred PRED_enable_input_d; \n\t" + "setp.ne.b32 PRED_enable_input_d, %4, 0;\n\t" + "tcgen05.mma.cta_group::2.kind::i8 [%0], %1, %2, %3, PRED_enable_input_d;\n\t" + "}" + : + : "r"(__d_tmem), + "l"(__a_desc), + "l"(__b_desc), + "r"(__idesc), + "r"(static_cast<_CUDA_VSTD::uint32_t>(__enable_input_d)) + : "memory"); + } +# else + // Unsupported architectures will have a linker error with a semi-decent error message + __cuda_ptx_tcgen05_mma_is_not_supported_before_SM_100a_SM_101a__(); +# endif +} +#endif // __cccl_ptx_isa >= 860 + +/* +// tcgen05.mma.cta_group.kind [d_tmem], [a_tmem], b_desc, idesc, disable_output_lane, enable_input_d, scale_input_d; // +PTX ISA 86, SM_100a +// .kind = { .kind::f16, .kind::tf32 } +// .cta_group = { .cta_group::1, .cta_group::2 } +template +__device__ static inline void tcgen05_mma_tmem_a( + cuda::ptx::kind_t kind, + cuda::ptx::cta_group_t cta_group, + uint32_t d_tmem, + uint32_t a_tmem, + uint64_t b_desc, + uint32_t idesc, + const uint32_t (&disable_output_lane)[4], + bool enable_input_d, + cuda::ptx::n32_t scale_input_d); +*/ +#if __cccl_ptx_isa >= 860 +extern "C" _CCCL_DEVICE void __cuda_ptx_tcgen05_mma_tmem_a_is_not_supported_before_SM_100a__(); +template +_CCCL_DEVICE static inline void tcgen05_mma_tmem_a( + kind_t<_Kind> __kind, + cta_group_t<_Cta_Group> __cta_group, + _CUDA_VSTD::uint32_t __d_tmem, + _CUDA_VSTD::uint32_t __a_tmem, + _CUDA_VSTD::uint64_t __b_desc, + _CUDA_VSTD::uint32_t __idesc, + const _CUDA_VSTD::uint32_t (&__disable_output_lane)[4], + bool __enable_input_d, + n32_t<_N32> __scale_input_d) +{ + static_assert(__kind == kind_f16 || __kind == kind_tf32, ""); + static_assert(__cta_group == cta_group_1 || __cta_group == cta_group_2, ""); +# if _CCCL_CUDA_COMPILER(NVHPC) || __CUDA_ARCH_FEAT_SM100_ALL + _CCCL_IF_CONSTEXPR (__kind == kind_f16 && __cta_group == cta_group_1) + { + asm volatile( + "{\n\t .reg .pred PRED_enable_input_d; \n\t" + "setp.ne.b32 PRED_enable_input_d, %8, 0;\n\t" + "tcgen05.mma.cta_group::1.kind::f16 [%0], [%1], %2, %3, {%4, %5, %6, %7}, PRED_enable_input_d, %9;\n\t" + "}" + : + : "r"(__d_tmem), + "r"(__a_tmem), + "l"(__b_desc), + "r"(__idesc), + "r"(__disable_output_lane[0]), + "r"(__disable_output_lane[1]), + "r"(__disable_output_lane[2]), + "r"(__disable_output_lane[3]), + "r"(static_cast<_CUDA_VSTD::uint32_t>(__enable_input_d)), + "n"(__scale_input_d.value) + : "memory"); + } + else _CCCL_IF_CONSTEXPR (__kind == kind_f16 && __cta_group == cta_group_2) + { + asm volatile( + "{\n\t .reg 
.pred PRED_enable_input_d; \n\t" + "setp.ne.b32 PRED_enable_input_d, %8, 0;\n\t" + "tcgen05.mma.cta_group::2.kind::f16 [%0], [%1], %2, %3, {%4, %5, %6, %7}, PRED_enable_input_d, %9;\n\t" + "}" + : + : "r"(__d_tmem), + "r"(__a_tmem), + "l"(__b_desc), + "r"(__idesc), + "r"(__disable_output_lane[0]), + "r"(__disable_output_lane[1]), + "r"(__disable_output_lane[2]), + "r"(__disable_output_lane[3]), + "r"(static_cast<_CUDA_VSTD::uint32_t>(__enable_input_d)), + "n"(__scale_input_d.value) + : "memory"); + } + else _CCCL_IF_CONSTEXPR (__kind == kind_tf32 && __cta_group == cta_group_1) + { + asm volatile( + "{\n\t .reg .pred PRED_enable_input_d; \n\t" + "setp.ne.b32 PRED_enable_input_d, %8, 0;\n\t" + "tcgen05.mma.cta_group::1.kind::tf32 [%0], [%1], %2, %3, {%4, %5, %6, %7}, PRED_enable_input_d, %9;\n\t" + "}" + : + : "r"(__d_tmem), + "r"(__a_tmem), + "l"(__b_desc), + "r"(__idesc), + "r"(__disable_output_lane[0]), + "r"(__disable_output_lane[1]), + "r"(__disable_output_lane[2]), + "r"(__disable_output_lane[3]), + "r"(static_cast<_CUDA_VSTD::uint32_t>(__enable_input_d)), + "n"(__scale_input_d.value) + : "memory"); + } + else _CCCL_IF_CONSTEXPR (__kind == kind_tf32 && __cta_group == cta_group_2) + { + asm volatile( + "{\n\t .reg .pred PRED_enable_input_d; \n\t" + "setp.ne.b32 PRED_enable_input_d, %8, 0;\n\t" + "tcgen05.mma.cta_group::2.kind::tf32 [%0], [%1], %2, %3, {%4, %5, %6, %7}, PRED_enable_input_d, %9;\n\t" + "}" + : + : "r"(__d_tmem), + "r"(__a_tmem), + "l"(__b_desc), + "r"(__idesc), + "r"(__disable_output_lane[0]), + "r"(__disable_output_lane[1]), + "r"(__disable_output_lane[2]), + "r"(__disable_output_lane[3]), + "r"(static_cast<_CUDA_VSTD::uint32_t>(__enable_input_d)), + "n"(__scale_input_d.value) + : "memory"); + } +# else + // Unsupported architectures will have a linker error with a semi-decent error message + __cuda_ptx_tcgen05_mma_tmem_a_is_not_supported_before_SM_100a__(); +# endif +} +#endif // __cccl_ptx_isa >= 860 + +/* +// tcgen05.mma.cta_group.kind [d_tmem], [a_tmem], b_desc, idesc, disable_output_lane, enable_input_d; // PTX ISA 86, +SM_100a, SM_101a +// .kind = { .kind::f16, .kind::tf32, .kind::f8f6f4, .kind::i8 } +// .cta_group = { .cta_group::1, .cta_group::2 } +template +__device__ static inline void tcgen05_mma_tmem_a( + cuda::ptx::kind_t kind, + cuda::ptx::cta_group_t cta_group, + uint32_t d_tmem, + uint32_t a_tmem, + uint64_t b_desc, + uint32_t idesc, + const uint32_t (&disable_output_lane)[4], + bool enable_input_d); +*/ +#if __cccl_ptx_isa >= 860 +extern "C" _CCCL_DEVICE void __cuda_ptx_tcgen05_mma_tmem_a_is_not_supported_before_SM_100a_SM_101a__(); +template +_CCCL_DEVICE static inline void tcgen05_mma_tmem_a( + kind_t<_Kind> __kind, + cta_group_t<_Cta_Group> __cta_group, + _CUDA_VSTD::uint32_t __d_tmem, + _CUDA_VSTD::uint32_t __a_tmem, + _CUDA_VSTD::uint64_t __b_desc, + _CUDA_VSTD::uint32_t __idesc, + const _CUDA_VSTD::uint32_t (&__disable_output_lane)[4], + bool __enable_input_d) +{ + static_assert(__kind == kind_f16 || __kind == kind_tf32 || __kind == kind_f8f6f4 || __kind == kind_i8, ""); + static_assert(__cta_group == cta_group_1 || __cta_group == cta_group_2, ""); +# if _CCCL_CUDA_COMPILER(NVHPC) || __CUDA_ARCH_FEAT_SM100_ALL || __CUDA_ARCH_FEAT_SM101_ALL + _CCCL_IF_CONSTEXPR (__kind == kind_f16 && __cta_group == cta_group_1) + { + asm volatile( + "{\n\t .reg .pred PRED_enable_input_d; \n\t" + "setp.ne.b32 PRED_enable_input_d, %8, 0;\n\t" + "tcgen05.mma.cta_group::1.kind::f16 [%0], [%1], %2, %3, {%4, %5, %6, %7}, PRED_enable_input_d;\n\t" + "}" + : + : 
"r"(__d_tmem), + "r"(__a_tmem), + "l"(__b_desc), + "r"(__idesc), + "r"(__disable_output_lane[0]), + "r"(__disable_output_lane[1]), + "r"(__disable_output_lane[2]), + "r"(__disable_output_lane[3]), + "r"(static_cast<_CUDA_VSTD::uint32_t>(__enable_input_d)) + : "memory"); + } + else _CCCL_IF_CONSTEXPR (__kind == kind_f16 && __cta_group == cta_group_2) + { + asm volatile( + "{\n\t .reg .pred PRED_enable_input_d; \n\t" + "setp.ne.b32 PRED_enable_input_d, %8, 0;\n\t" + "tcgen05.mma.cta_group::2.kind::f16 [%0], [%1], %2, %3, {%4, %5, %6, %7}, PRED_enable_input_d;\n\t" + "}" + : + : "r"(__d_tmem), + "r"(__a_tmem), + "l"(__b_desc), + "r"(__idesc), + "r"(__disable_output_lane[0]), + "r"(__disable_output_lane[1]), + "r"(__disable_output_lane[2]), + "r"(__disable_output_lane[3]), + "r"(static_cast<_CUDA_VSTD::uint32_t>(__enable_input_d)) + : "memory"); + } + else _CCCL_IF_CONSTEXPR (__kind == kind_tf32 && __cta_group == cta_group_1) + { + asm volatile( + "{\n\t .reg .pred PRED_enable_input_d; \n\t" + "setp.ne.b32 PRED_enable_input_d, %8, 0;\n\t" + "tcgen05.mma.cta_group::1.kind::tf32 [%0], [%1], %2, %3, {%4, %5, %6, %7}, PRED_enable_input_d;\n\t" + "}" + : + : "r"(__d_tmem), + "r"(__a_tmem), + "l"(__b_desc), + "r"(__idesc), + "r"(__disable_output_lane[0]), + "r"(__disable_output_lane[1]), + "r"(__disable_output_lane[2]), + "r"(__disable_output_lane[3]), + "r"(static_cast<_CUDA_VSTD::uint32_t>(__enable_input_d)) + : "memory"); + } + else _CCCL_IF_CONSTEXPR (__kind == kind_tf32 && __cta_group == cta_group_2) + { + asm volatile( + "{\n\t .reg .pred PRED_enable_input_d; \n\t" + "setp.ne.b32 PRED_enable_input_d, %8, 0;\n\t" + "tcgen05.mma.cta_group::2.kind::tf32 [%0], [%1], %2, %3, {%4, %5, %6, %7}, PRED_enable_input_d;\n\t" + "}" + : + : "r"(__d_tmem), + "r"(__a_tmem), + "l"(__b_desc), + "r"(__idesc), + "r"(__disable_output_lane[0]), + "r"(__disable_output_lane[1]), + "r"(__disable_output_lane[2]), + "r"(__disable_output_lane[3]), + "r"(static_cast<_CUDA_VSTD::uint32_t>(__enable_input_d)) + : "memory"); + } + else _CCCL_IF_CONSTEXPR (__kind == kind_f8f6f4 && __cta_group == cta_group_1) + { + asm volatile( + "{\n\t .reg .pred PRED_enable_input_d; \n\t" + "setp.ne.b32 PRED_enable_input_d, %8, 0;\n\t" + "tcgen05.mma.cta_group::1.kind::f8f6f4 [%0], [%1], %2, %3, {%4, %5, %6, %7}, PRED_enable_input_d;\n\t" + "}" + : + : "r"(__d_tmem), + "r"(__a_tmem), + "l"(__b_desc), + "r"(__idesc), + "r"(__disable_output_lane[0]), + "r"(__disable_output_lane[1]), + "r"(__disable_output_lane[2]), + "r"(__disable_output_lane[3]), + "r"(static_cast<_CUDA_VSTD::uint32_t>(__enable_input_d)) + : "memory"); + } + else _CCCL_IF_CONSTEXPR (__kind == kind_f8f6f4 && __cta_group == cta_group_2) + { + asm volatile( + "{\n\t .reg .pred PRED_enable_input_d; \n\t" + "setp.ne.b32 PRED_enable_input_d, %8, 0;\n\t" + "tcgen05.mma.cta_group::2.kind::f8f6f4 [%0], [%1], %2, %3, {%4, %5, %6, %7}, PRED_enable_input_d;\n\t" + "}" + : + : "r"(__d_tmem), + "r"(__a_tmem), + "l"(__b_desc), + "r"(__idesc), + "r"(__disable_output_lane[0]), + "r"(__disable_output_lane[1]), + "r"(__disable_output_lane[2]), + "r"(__disable_output_lane[3]), + "r"(static_cast<_CUDA_VSTD::uint32_t>(__enable_input_d)) + : "memory"); + } + else _CCCL_IF_CONSTEXPR (__kind == kind_i8 && __cta_group == cta_group_1) + { + asm volatile( + "{\n\t .reg .pred PRED_enable_input_d; \n\t" + "setp.ne.b32 PRED_enable_input_d, %8, 0;\n\t" + "tcgen05.mma.cta_group::1.kind::i8 [%0], [%1], %2, %3, {%4, %5, %6, %7}, PRED_enable_input_d;\n\t" + "}" + : + : "r"(__d_tmem), + "r"(__a_tmem), + 
"l"(__b_desc), + "r"(__idesc), + "r"(__disable_output_lane[0]), + "r"(__disable_output_lane[1]), + "r"(__disable_output_lane[2]), + "r"(__disable_output_lane[3]), + "r"(static_cast<_CUDA_VSTD::uint32_t>(__enable_input_d)) + : "memory"); + } + else _CCCL_IF_CONSTEXPR (__kind == kind_i8 && __cta_group == cta_group_2) + { + asm volatile( + "{\n\t .reg .pred PRED_enable_input_d; \n\t" + "setp.ne.b32 PRED_enable_input_d, %8, 0;\n\t" + "tcgen05.mma.cta_group::2.kind::i8 [%0], [%1], %2, %3, {%4, %5, %6, %7}, PRED_enable_input_d;\n\t" + "}" + : + : "r"(__d_tmem), + "r"(__a_tmem), + "l"(__b_desc), + "r"(__idesc), + "r"(__disable_output_lane[0]), + "r"(__disable_output_lane[1]), + "r"(__disable_output_lane[2]), + "r"(__disable_output_lane[3]), + "r"(static_cast<_CUDA_VSTD::uint32_t>(__enable_input_d)) + : "memory"); + } +# else + // Unsupported architectures will have a linker error with a semi-decent error message + __cuda_ptx_tcgen05_mma_tmem_a_is_not_supported_before_SM_100a_SM_101a__(); +# endif +} +#endif // __cccl_ptx_isa >= 860 + +/* +// tcgen05.mma.cta_group.kind [d_tmem], [a_tmem], b_desc, idesc, enable_input_d, scale_input_d; // PTX ISA 86, SM_100a +// .kind = { .kind::f16, .kind::tf32 } +// .cta_group = { .cta_group::1, .cta_group::2 } +template +__device__ static inline void tcgen05_mma_tmem_a( + cuda::ptx::kind_t kind, + cuda::ptx::cta_group_t cta_group, + uint32_t d_tmem, + uint32_t a_tmem, + uint64_t b_desc, + uint32_t idesc, + bool enable_input_d, + cuda::ptx::n32_t scale_input_d); +*/ +#if __cccl_ptx_isa >= 860 +extern "C" _CCCL_DEVICE void __cuda_ptx_tcgen05_mma_tmem_a_is_not_supported_before_SM_100a__(); +template +_CCCL_DEVICE static inline void tcgen05_mma_tmem_a( + kind_t<_Kind> __kind, + cta_group_t<_Cta_Group> __cta_group, + _CUDA_VSTD::uint32_t __d_tmem, + _CUDA_VSTD::uint32_t __a_tmem, + _CUDA_VSTD::uint64_t __b_desc, + _CUDA_VSTD::uint32_t __idesc, + bool __enable_input_d, + n32_t<_N32> __scale_input_d) +{ + static_assert(__kind == kind_f16 || __kind == kind_tf32, ""); + static_assert(__cta_group == cta_group_1 || __cta_group == cta_group_2, ""); +# if _CCCL_CUDA_COMPILER(NVHPC) || __CUDA_ARCH_FEAT_SM100_ALL + _CCCL_IF_CONSTEXPR (__kind == kind_f16 && __cta_group == cta_group_1) + { + asm volatile( + "{\n\t .reg .pred PRED_enable_input_d; \n\t" + "setp.ne.b32 PRED_enable_input_d, %4, 0;\n\t" + "tcgen05.mma.cta_group::1.kind::f16 [%0], [%1], %2, %3, PRED_enable_input_d, %5;\n\t" + "}" + : + : "r"(__d_tmem), + "r"(__a_tmem), + "l"(__b_desc), + "r"(__idesc), + "r"(static_cast<_CUDA_VSTD::uint32_t>(__enable_input_d)), + "n"(__scale_input_d.value) + : "memory"); + } + else _CCCL_IF_CONSTEXPR (__kind == kind_f16 && __cta_group == cta_group_2) + { + asm volatile( + "{\n\t .reg .pred PRED_enable_input_d; \n\t" + "setp.ne.b32 PRED_enable_input_d, %4, 0;\n\t" + "tcgen05.mma.cta_group::2.kind::f16 [%0], [%1], %2, %3, PRED_enable_input_d, %5;\n\t" + "}" + : + : "r"(__d_tmem), + "r"(__a_tmem), + "l"(__b_desc), + "r"(__idesc), + "r"(static_cast<_CUDA_VSTD::uint32_t>(__enable_input_d)), + "n"(__scale_input_d.value) + : "memory"); + } + else _CCCL_IF_CONSTEXPR (__kind == kind_tf32 && __cta_group == cta_group_1) + { + asm volatile( + "{\n\t .reg .pred PRED_enable_input_d; \n\t" + "setp.ne.b32 PRED_enable_input_d, %4, 0;\n\t" + "tcgen05.mma.cta_group::1.kind::tf32 [%0], [%1], %2, %3, PRED_enable_input_d, %5;\n\t" + "}" + : + : "r"(__d_tmem), + "r"(__a_tmem), + "l"(__b_desc), + "r"(__idesc), + "r"(static_cast<_CUDA_VSTD::uint32_t>(__enable_input_d)), + "n"(__scale_input_d.value) + : 
"memory"); + } + else _CCCL_IF_CONSTEXPR (__kind == kind_tf32 && __cta_group == cta_group_2) + { + asm volatile( + "{\n\t .reg .pred PRED_enable_input_d; \n\t" + "setp.ne.b32 PRED_enable_input_d, %4, 0;\n\t" + "tcgen05.mma.cta_group::2.kind::tf32 [%0], [%1], %2, %3, PRED_enable_input_d, %5;\n\t" + "}" + : + : "r"(__d_tmem), + "r"(__a_tmem), + "l"(__b_desc), + "r"(__idesc), + "r"(static_cast<_CUDA_VSTD::uint32_t>(__enable_input_d)), + "n"(__scale_input_d.value) + : "memory"); + } +# else + // Unsupported architectures will have a linker error with a semi-decent error message + __cuda_ptx_tcgen05_mma_tmem_a_is_not_supported_before_SM_100a__(); +# endif +} +#endif // __cccl_ptx_isa >= 860 + +/* +// tcgen05.mma.cta_group.kind [d_tmem], [a_tmem], b_desc, idesc, enable_input_d; // PTX ISA 86, SM_100a, SM_101a +// .kind = { .kind::f16, .kind::tf32, .kind::f8f6f4, .kind::i8 } +// .cta_group = { .cta_group::1, .cta_group::2 } +template +__device__ static inline void tcgen05_mma_tmem_a( + cuda::ptx::kind_t kind, + cuda::ptx::cta_group_t cta_group, + uint32_t d_tmem, + uint32_t a_tmem, + uint64_t b_desc, + uint32_t idesc, + bool enable_input_d); +*/ +#if __cccl_ptx_isa >= 860 +extern "C" _CCCL_DEVICE void __cuda_ptx_tcgen05_mma_tmem_a_is_not_supported_before_SM_100a_SM_101a__(); +template +_CCCL_DEVICE static inline void tcgen05_mma_tmem_a( + kind_t<_Kind> __kind, + cta_group_t<_Cta_Group> __cta_group, + _CUDA_VSTD::uint32_t __d_tmem, + _CUDA_VSTD::uint32_t __a_tmem, + _CUDA_VSTD::uint64_t __b_desc, + _CUDA_VSTD::uint32_t __idesc, + bool __enable_input_d) +{ + static_assert(__kind == kind_f16 || __kind == kind_tf32 || __kind == kind_f8f6f4 || __kind == kind_i8, ""); + static_assert(__cta_group == cta_group_1 || __cta_group == cta_group_2, ""); +# if _CCCL_CUDA_COMPILER(NVHPC) || __CUDA_ARCH_FEAT_SM100_ALL || __CUDA_ARCH_FEAT_SM101_ALL + _CCCL_IF_CONSTEXPR (__kind == kind_f16 && __cta_group == cta_group_1) + { + asm volatile( + "{\n\t .reg .pred PRED_enable_input_d; \n\t" + "setp.ne.b32 PRED_enable_input_d, %4, 0;\n\t" + "tcgen05.mma.cta_group::1.kind::f16 [%0], [%1], %2, %3, PRED_enable_input_d;\n\t" + "}" + : + : "r"(__d_tmem), + "r"(__a_tmem), + "l"(__b_desc), + "r"(__idesc), + "r"(static_cast<_CUDA_VSTD::uint32_t>(__enable_input_d)) + : "memory"); + } + else _CCCL_IF_CONSTEXPR (__kind == kind_f16 && __cta_group == cta_group_2) + { + asm volatile( + "{\n\t .reg .pred PRED_enable_input_d; \n\t" + "setp.ne.b32 PRED_enable_input_d, %4, 0;\n\t" + "tcgen05.mma.cta_group::2.kind::f16 [%0], [%1], %2, %3, PRED_enable_input_d;\n\t" + "}" + : + : "r"(__d_tmem), + "r"(__a_tmem), + "l"(__b_desc), + "r"(__idesc), + "r"(static_cast<_CUDA_VSTD::uint32_t>(__enable_input_d)) + : "memory"); + } + else _CCCL_IF_CONSTEXPR (__kind == kind_tf32 && __cta_group == cta_group_1) + { + asm volatile( + "{\n\t .reg .pred PRED_enable_input_d; \n\t" + "setp.ne.b32 PRED_enable_input_d, %4, 0;\n\t" + "tcgen05.mma.cta_group::1.kind::tf32 [%0], [%1], %2, %3, PRED_enable_input_d;\n\t" + "}" + : + : "r"(__d_tmem), + "r"(__a_tmem), + "l"(__b_desc), + "r"(__idesc), + "r"(static_cast<_CUDA_VSTD::uint32_t>(__enable_input_d)) + : "memory"); + } + else _CCCL_IF_CONSTEXPR (__kind == kind_tf32 && __cta_group == cta_group_2) + { + asm volatile( + "{\n\t .reg .pred PRED_enable_input_d; \n\t" + "setp.ne.b32 PRED_enable_input_d, %4, 0;\n\t" + "tcgen05.mma.cta_group::2.kind::tf32 [%0], [%1], %2, %3, PRED_enable_input_d;\n\t" + "}" + : + : "r"(__d_tmem), + "r"(__a_tmem), + "l"(__b_desc), + "r"(__idesc), + 
"r"(static_cast<_CUDA_VSTD::uint32_t>(__enable_input_d)) + : "memory"); + } + else _CCCL_IF_CONSTEXPR (__kind == kind_f8f6f4 && __cta_group == cta_group_1) + { + asm volatile( + "{\n\t .reg .pred PRED_enable_input_d; \n\t" + "setp.ne.b32 PRED_enable_input_d, %4, 0;\n\t" + "tcgen05.mma.cta_group::1.kind::f8f6f4 [%0], [%1], %2, %3, PRED_enable_input_d;\n\t" + "}" + : + : "r"(__d_tmem), + "r"(__a_tmem), + "l"(__b_desc), + "r"(__idesc), + "r"(static_cast<_CUDA_VSTD::uint32_t>(__enable_input_d)) + : "memory"); + } + else _CCCL_IF_CONSTEXPR (__kind == kind_f8f6f4 && __cta_group == cta_group_2) + { + asm volatile( + "{\n\t .reg .pred PRED_enable_input_d; \n\t" + "setp.ne.b32 PRED_enable_input_d, %4, 0;\n\t" + "tcgen05.mma.cta_group::2.kind::f8f6f4 [%0], [%1], %2, %3, PRED_enable_input_d;\n\t" + "}" + : + : "r"(__d_tmem), + "r"(__a_tmem), + "l"(__b_desc), + "r"(__idesc), + "r"(static_cast<_CUDA_VSTD::uint32_t>(__enable_input_d)) + : "memory"); + } + else _CCCL_IF_CONSTEXPR (__kind == kind_i8 && __cta_group == cta_group_1) + { + asm volatile( + "{\n\t .reg .pred PRED_enable_input_d; \n\t" + "setp.ne.b32 PRED_enable_input_d, %4, 0;\n\t" + "tcgen05.mma.cta_group::1.kind::i8 [%0], [%1], %2, %3, PRED_enable_input_d;\n\t" + "}" + : + : "r"(__d_tmem), + "r"(__a_tmem), + "l"(__b_desc), + "r"(__idesc), + "r"(static_cast<_CUDA_VSTD::uint32_t>(__enable_input_d)) + : "memory"); + } + else _CCCL_IF_CONSTEXPR (__kind == kind_i8 && __cta_group == cta_group_2) + { + asm volatile( + "{\n\t .reg .pred PRED_enable_input_d; \n\t" + "setp.ne.b32 PRED_enable_input_d, %4, 0;\n\t" + "tcgen05.mma.cta_group::2.kind::i8 [%0], [%1], %2, %3, PRED_enable_input_d;\n\t" + "}" + : + : "r"(__d_tmem), + "r"(__a_tmem), + "l"(__b_desc), + "r"(__idesc), + "r"(static_cast<_CUDA_VSTD::uint32_t>(__enable_input_d)) + : "memory"); + } +# else + // Unsupported architectures will have a linker error with a semi-decent error message + __cuda_ptx_tcgen05_mma_tmem_a_is_not_supported_before_SM_100a_SM_101a__(); +# endif +} +#endif // __cccl_ptx_isa >= 860 + +/* +// tcgen05.mma.cta_group.kind.block_scale.scale_vec::1X [d_tmem], a_desc, b_desc, idesc, [scale_A_tmem], [scale_B_tmem], +enable_input_d; // PTX ISA 86, SM_100a, SM_101a +// .kind = { .kind::mxf8f6f4 } +// .cta_group = { .cta_group::1, .cta_group::2 } +template +__device__ static inline void tcgen05_mma_block_scale_vec_1x( + cuda::ptx::kind_mxf8f6f4_t, + cuda::ptx::cta_group_t cta_group, + uint32_t d_tmem, + uint64_t a_desc, + uint64_t b_desc, + uint32_t idesc, + uint32_t scale_A_tmem, + uint32_t scale_B_tmem, + bool enable_input_d); +*/ +#if __cccl_ptx_isa >= 860 +extern "C" _CCCL_DEVICE void __cuda_ptx_tcgen05_mma_block_scale_vec_1x_is_not_supported_before_SM_100a_SM_101a__(); +template +_CCCL_DEVICE static inline void tcgen05_mma_block_scale_vec_1x( + kind_mxf8f6f4_t, + cta_group_t<_Cta_Group> __cta_group, + _CUDA_VSTD::uint32_t __d_tmem, + _CUDA_VSTD::uint64_t __a_desc, + _CUDA_VSTD::uint64_t __b_desc, + _CUDA_VSTD::uint32_t __idesc, + _CUDA_VSTD::uint32_t __scale_A_tmem, + _CUDA_VSTD::uint32_t __scale_B_tmem, + bool __enable_input_d) +{ + // __kind == kind_mxf8f6f4 (due to parameter type constraint) + static_assert(__cta_group == cta_group_1 || __cta_group == cta_group_2, ""); +# if _CCCL_CUDA_COMPILER(NVHPC) || __CUDA_ARCH_FEAT_SM100_ALL || __CUDA_ARCH_FEAT_SM101_ALL + _CCCL_IF_CONSTEXPR (__cta_group == cta_group_1) + { + asm volatile( + "{\n\t .reg .pred PRED_enable_input_d; \n\t" + "setp.ne.b32 PRED_enable_input_d, %6, 0;\n\t" + 
"tcgen05.mma.cta_group::1.kind::mxf8f6f4.block_scale.scale_vec::1X [%0], %1, %2, %3, [%4], [%5], " + "PRED_enable_input_d;\n\t" + "}" + : + : "r"(__d_tmem), + "l"(__a_desc), + "l"(__b_desc), + "r"(__idesc), + "r"(__scale_A_tmem), + "r"(__scale_B_tmem), + "r"(static_cast<_CUDA_VSTD::uint32_t>(__enable_input_d)) + : "memory"); + } + else _CCCL_IF_CONSTEXPR (__cta_group == cta_group_2) + { + asm volatile( + "{\n\t .reg .pred PRED_enable_input_d; \n\t" + "setp.ne.b32 PRED_enable_input_d, %6, 0;\n\t" + "tcgen05.mma.cta_group::2.kind::mxf8f6f4.block_scale.scale_vec::1X [%0], %1, %2, %3, [%4], [%5], " + "PRED_enable_input_d;\n\t" + "}" + : + : "r"(__d_tmem), + "l"(__a_desc), + "l"(__b_desc), + "r"(__idesc), + "r"(__scale_A_tmem), + "r"(__scale_B_tmem), + "r"(static_cast<_CUDA_VSTD::uint32_t>(__enable_input_d)) + : "memory"); + } +# else + // Unsupported architectures will have a linker error with a semi-decent error message + __cuda_ptx_tcgen05_mma_block_scale_vec_1x_is_not_supported_before_SM_100a_SM_101a__(); +# endif +} +#endif // __cccl_ptx_isa >= 860 + +/* +// tcgen05.mma.cta_group.kind.block_scale.scale_vec::2X [d_tmem], a_desc, b_desc, idesc, [scale_A_tmem], [scale_B_tmem], +enable_input_d; // PTX ISA 86, SM_100a, SM_101a +// .kind = { .kind::mxf4, .kind::mxf4nvf4 } +// .cta_group = { .cta_group::1, .cta_group::2 } +template +__device__ static inline void tcgen05_mma_block_scale_vec_2x( + cuda::ptx::kind_t kind, + cuda::ptx::cta_group_t cta_group, + uint32_t d_tmem, + uint64_t a_desc, + uint64_t b_desc, + uint32_t idesc, + uint32_t scale_A_tmem, + uint32_t scale_B_tmem, + bool enable_input_d); +*/ +#if __cccl_ptx_isa >= 860 +extern "C" _CCCL_DEVICE void __cuda_ptx_tcgen05_mma_block_scale_vec_2x_is_not_supported_before_SM_100a_SM_101a__(); +template +_CCCL_DEVICE static inline void tcgen05_mma_block_scale_vec_2x( + kind_t<_Kind> __kind, + cta_group_t<_Cta_Group> __cta_group, + _CUDA_VSTD::uint32_t __d_tmem, + _CUDA_VSTD::uint64_t __a_desc, + _CUDA_VSTD::uint64_t __b_desc, + _CUDA_VSTD::uint32_t __idesc, + _CUDA_VSTD::uint32_t __scale_A_tmem, + _CUDA_VSTD::uint32_t __scale_B_tmem, + bool __enable_input_d) +{ + static_assert(__kind == kind_mxf4 || __kind == kind_mxf4nvf4, ""); + static_assert(__cta_group == cta_group_1 || __cta_group == cta_group_2, ""); +# if _CCCL_CUDA_COMPILER(NVHPC) || __CUDA_ARCH_FEAT_SM100_ALL || __CUDA_ARCH_FEAT_SM101_ALL + _CCCL_IF_CONSTEXPR (__kind == kind_mxf4 && __cta_group == cta_group_1) + { + asm volatile( + "{\n\t .reg .pred PRED_enable_input_d; \n\t" + "setp.ne.b32 PRED_enable_input_d, %6, 0;\n\t" + "tcgen05.mma.cta_group::1.kind::mxf4.block_scale.scale_vec::2X [%0], %1, %2, %3, [%4], [%5], " + "PRED_enable_input_d;\n\t" + "}" + : + : "r"(__d_tmem), + "l"(__a_desc), + "l"(__b_desc), + "r"(__idesc), + "r"(__scale_A_tmem), + "r"(__scale_B_tmem), + "r"(static_cast<_CUDA_VSTD::uint32_t>(__enable_input_d)) + : "memory"); + } + else _CCCL_IF_CONSTEXPR (__kind == kind_mxf4 && __cta_group == cta_group_2) + { + asm volatile( + "{\n\t .reg .pred PRED_enable_input_d; \n\t" + "setp.ne.b32 PRED_enable_input_d, %6, 0;\n\t" + "tcgen05.mma.cta_group::2.kind::mxf4.block_scale.scale_vec::2X [%0], %1, %2, %3, [%4], [%5], " + "PRED_enable_input_d;\n\t" + "}" + : + : "r"(__d_tmem), + "l"(__a_desc), + "l"(__b_desc), + "r"(__idesc), + "r"(__scale_A_tmem), + "r"(__scale_B_tmem), + "r"(static_cast<_CUDA_VSTD::uint32_t>(__enable_input_d)) + : "memory"); + } + else _CCCL_IF_CONSTEXPR (__kind == kind_mxf4nvf4 && __cta_group == cta_group_1) + { + asm volatile( + "{\n\t .reg .pred 
PRED_enable_input_d; \n\t" + "setp.ne.b32 PRED_enable_input_d, %6, 0;\n\t" + "tcgen05.mma.cta_group::1.kind::mxf4nvf4.block_scale.scale_vec::2X [%0], %1, %2, %3, [%4], [%5], " + "PRED_enable_input_d;\n\t" + "}" + : + : "r"(__d_tmem), + "l"(__a_desc), + "l"(__b_desc), + "r"(__idesc), + "r"(__scale_A_tmem), + "r"(__scale_B_tmem), + "r"(static_cast<_CUDA_VSTD::uint32_t>(__enable_input_d)) + : "memory"); + } + else _CCCL_IF_CONSTEXPR (__kind == kind_mxf4nvf4 && __cta_group == cta_group_2) + { + asm volatile( + "{\n\t .reg .pred PRED_enable_input_d; \n\t" + "setp.ne.b32 PRED_enable_input_d, %6, 0;\n\t" + "tcgen05.mma.cta_group::2.kind::mxf4nvf4.block_scale.scale_vec::2X [%0], %1, %2, %3, [%4], [%5], " + "PRED_enable_input_d;\n\t" + "}" + : + : "r"(__d_tmem), + "l"(__a_desc), + "l"(__b_desc), + "r"(__idesc), + "r"(__scale_A_tmem), + "r"(__scale_B_tmem), + "r"(static_cast<_CUDA_VSTD::uint32_t>(__enable_input_d)) + : "memory"); + } +# else + // Unsupported architectures will have a linker error with a semi-decent error message + __cuda_ptx_tcgen05_mma_block_scale_vec_2x_is_not_supported_before_SM_100a_SM_101a__(); +# endif +} +#endif // __cccl_ptx_isa >= 860 + +/* +// tcgen05.mma.cta_group.kind.block_scale.scale_vec::4X [d_tmem], a_desc, b_desc, idesc, [scale_A_tmem], [scale_B_tmem], +enable_input_d; // PTX ISA 86, SM_100a, SM_101a +// .kind = { .kind::mxf4nvf4 } +// .cta_group = { .cta_group::1, .cta_group::2 } +template +__device__ static inline void tcgen05_mma_block_scale_vec_4x( + cuda::ptx::kind_mxf4nvf4_t, + cuda::ptx::cta_group_t cta_group, + uint32_t d_tmem, + uint64_t a_desc, + uint64_t b_desc, + uint32_t idesc, + uint32_t scale_A_tmem, + uint32_t scale_B_tmem, + bool enable_input_d); +*/ +#if __cccl_ptx_isa >= 860 +extern "C" _CCCL_DEVICE void __cuda_ptx_tcgen05_mma_block_scale_vec_4x_is_not_supported_before_SM_100a_SM_101a__(); +template +_CCCL_DEVICE static inline void tcgen05_mma_block_scale_vec_4x( + kind_mxf4nvf4_t, + cta_group_t<_Cta_Group> __cta_group, + _CUDA_VSTD::uint32_t __d_tmem, + _CUDA_VSTD::uint64_t __a_desc, + _CUDA_VSTD::uint64_t __b_desc, + _CUDA_VSTD::uint32_t __idesc, + _CUDA_VSTD::uint32_t __scale_A_tmem, + _CUDA_VSTD::uint32_t __scale_B_tmem, + bool __enable_input_d) +{ + // __kind == kind_mxf4nvf4 (due to parameter type constraint) + static_assert(__cta_group == cta_group_1 || __cta_group == cta_group_2, ""); +# if _CCCL_CUDA_COMPILER(NVHPC) || __CUDA_ARCH_FEAT_SM100_ALL || __CUDA_ARCH_FEAT_SM101_ALL + _CCCL_IF_CONSTEXPR (__cta_group == cta_group_1) + { + asm volatile( + "{\n\t .reg .pred PRED_enable_input_d; \n\t" + "setp.ne.b32 PRED_enable_input_d, %6, 0;\n\t" + "tcgen05.mma.cta_group::1.kind::mxf4nvf4.block_scale.scale_vec::4X [%0], %1, %2, %3, [%4], [%5], " + "PRED_enable_input_d;\n\t" + "}" + : + : "r"(__d_tmem), + "l"(__a_desc), + "l"(__b_desc), + "r"(__idesc), + "r"(__scale_A_tmem), + "r"(__scale_B_tmem), + "r"(static_cast<_CUDA_VSTD::uint32_t>(__enable_input_d)) + : "memory"); + } + else _CCCL_IF_CONSTEXPR (__cta_group == cta_group_2) + { + asm volatile( + "{\n\t .reg .pred PRED_enable_input_d; \n\t" + "setp.ne.b32 PRED_enable_input_d, %6, 0;\n\t" + "tcgen05.mma.cta_group::2.kind::mxf4nvf4.block_scale.scale_vec::4X [%0], %1, %2, %3, [%4], [%5], " + "PRED_enable_input_d;\n\t" + "}" + : + : "r"(__d_tmem), + "l"(__a_desc), + "l"(__b_desc), + "r"(__idesc), + "r"(__scale_A_tmem), + "r"(__scale_B_tmem), + "r"(static_cast<_CUDA_VSTD::uint32_t>(__enable_input_d)) + : "memory"); + } +# else + // Unsupported architectures will have a linker error with a semi-decent 
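// Illustrative usage sketch (not from the generated header): the block-scaled
// variants above take both A and B as shared-memory descriptors plus
// tensor-memory addresses for the A/B scale factors. Descriptor and
// scale-factor layout are assumed to be set up elsewhere and are not shown;
// the kernel and argument names are placeholders.
#include <cuda/ptx>
#include <cuda/std/cstdint>

__global__ void example_tcgen05_mma_block_scale_2x(
  cuda::std::uint32_t d_tmem,
  cuda::std::uint64_t a_desc,
  cuda::std::uint64_t b_desc,
  cuda::std::uint32_t idesc,
  cuda::std::uint32_t scale_A_tmem,
  cuda::std::uint32_t scale_B_tmem)
{
#if __cccl_ptx_isa >= 860
  // kind::mxf4 with a scale_vec::2X scale vector; fresh accumulation
  // (enable_input_d = false) across a CTA pair.
  cuda::ptx::tcgen05_mma_block_scale_vec_2x(
    cuda::ptx::kind_mxf4, cuda::ptx::cta_group_2,
    d_tmem, a_desc, b_desc, idesc, scale_A_tmem, scale_B_tmem, false);
#endif
}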
error message + __cuda_ptx_tcgen05_mma_block_scale_vec_4x_is_not_supported_before_SM_100a_SM_101a__(); +# endif +} +#endif // __cccl_ptx_isa >= 860 + +/* +// tcgen05.mma.cta_group.kind.block_scale.scale_vec::1X [d_tmem], a_desc, b_desc, idesc, [scale_A_tmem], [scale_B_tmem], +enable_input_d; // PTX ISA 86, SM_100a, SM_101a +// .kind = { .kind::mxf8f6f4 } +// .cta_group = { .cta_group::1, .cta_group::2 } +template +__device__ static inline void tcgen05_mma_block_scale_vec_1x_tmem_a( + cuda::ptx::kind_mxf8f6f4_t, + cuda::ptx::cta_group_t cta_group, + uint32_t d_tmem, + uint64_t a_desc, + uint64_t b_desc, + uint32_t idesc, + uint32_t scale_A_tmem, + uint32_t scale_B_tmem, + bool enable_input_d); +*/ +#if __cccl_ptx_isa >= 860 +extern "C" _CCCL_DEVICE void +__cuda_ptx_tcgen05_mma_block_scale_vec_1x_tmem_a_is_not_supported_before_SM_100a_SM_101a__(); +template +_CCCL_DEVICE static inline void tcgen05_mma_block_scale_vec_1x_tmem_a( + kind_mxf8f6f4_t, + cta_group_t<_Cta_Group> __cta_group, + _CUDA_VSTD::uint32_t __d_tmem, + _CUDA_VSTD::uint64_t __a_desc, + _CUDA_VSTD::uint64_t __b_desc, + _CUDA_VSTD::uint32_t __idesc, + _CUDA_VSTD::uint32_t __scale_A_tmem, + _CUDA_VSTD::uint32_t __scale_B_tmem, + bool __enable_input_d) +{ + // __kind == kind_mxf8f6f4 (due to parameter type constraint) + static_assert(__cta_group == cta_group_1 || __cta_group == cta_group_2, ""); +# if _CCCL_CUDA_COMPILER(NVHPC) || __CUDA_ARCH_FEAT_SM100_ALL || __CUDA_ARCH_FEAT_SM101_ALL + _CCCL_IF_CONSTEXPR (__cta_group == cta_group_1) + { + asm volatile( + "{\n\t .reg .pred PRED_enable_input_d; \n\t" + "setp.ne.b32 PRED_enable_input_d, %6, 0;\n\t" + "tcgen05.mma.cta_group::1.kind::mxf8f6f4.block_scale.scale_vec::1X [%0], %1, %2, %3, [%4], [%5], " + "PRED_enable_input_d;\n\t" + "}" + : + : "r"(__d_tmem), + "l"(__a_desc), + "l"(__b_desc), + "r"(__idesc), + "r"(__scale_A_tmem), + "r"(__scale_B_tmem), + "r"(static_cast<_CUDA_VSTD::uint32_t>(__enable_input_d)) + : "memory"); + } + else _CCCL_IF_CONSTEXPR (__cta_group == cta_group_2) + { + asm volatile( + "{\n\t .reg .pred PRED_enable_input_d; \n\t" + "setp.ne.b32 PRED_enable_input_d, %6, 0;\n\t" + "tcgen05.mma.cta_group::2.kind::mxf8f6f4.block_scale.scale_vec::1X [%0], %1, %2, %3, [%4], [%5], " + "PRED_enable_input_d;\n\t" + "}" + : + : "r"(__d_tmem), + "l"(__a_desc), + "l"(__b_desc), + "r"(__idesc), + "r"(__scale_A_tmem), + "r"(__scale_B_tmem), + "r"(static_cast<_CUDA_VSTD::uint32_t>(__enable_input_d)) + : "memory"); + } +# else + // Unsupported architectures will have a linker error with a semi-decent error message + __cuda_ptx_tcgen05_mma_block_scale_vec_1x_tmem_a_is_not_supported_before_SM_100a_SM_101a__(); +# endif +} +#endif // __cccl_ptx_isa >= 860 + +/* +// tcgen05.mma.cta_group.kind.block_scale.scale_vec::2X [d_tmem], a_desc, b_desc, idesc, [scale_A_tmem], [scale_B_tmem], +enable_input_d; // PTX ISA 86, SM_100a, SM_101a +// .kind = { .kind::mxf4, .kind::mxf4nvf4 } +// .cta_group = { .cta_group::1, .cta_group::2 } +template +__device__ static inline void tcgen05_mma_block_scale_vec_2_tmem_a( + cuda::ptx::kind_t kind, + cuda::ptx::cta_group_t cta_group, + uint32_t d_tmem, + uint64_t a_desc, + uint64_t b_desc, + uint32_t idesc, + uint32_t scale_A_tmem, + uint32_t scale_B_tmem, + bool enable_input_d); +*/ +#if __cccl_ptx_isa >= 860 +extern "C" _CCCL_DEVICE void __cuda_ptx_tcgen05_mma_block_scale_vec_2_tmem_a_is_not_supported_before_SM_100a_SM_101a__(); +template +_CCCL_DEVICE static inline void tcgen05_mma_block_scale_vec_2_tmem_a( + kind_t<_Kind> __kind, + 
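// Illustrative usage sketch (not from the generated header): per the comment
// block above, the scale_vec::4X form is only exposed for .kind::mxf4nvf4, so
// the wrapper takes the kind_mxf4nvf4_t tag directly. Descriptors and scale
// factors are assumed to be prepared elsewhere; names are placeholders.
#include <cuda/ptx>
#include <cuda/std/cstdint>

__global__ void example_tcgen05_mma_block_scale_4x(
  cuda::std::uint32_t d_tmem,
  cuda::std::uint64_t a_desc,
  cuda::std::uint64_t b_desc,
  cuda::std::uint32_t idesc,
  cuda::std::uint32_t scale_A_tmem,
  cuda::std::uint32_t scale_B_tmem)
{
#if __cccl_ptx_isa >= 860
  cuda::ptx::tcgen05_mma_block_scale_vec_4x(
    cuda::ptx::kind_mxf4nvf4, cuda::ptx::cta_group_1,
    d_tmem, a_desc, b_desc, idesc, scale_A_tmem, scale_B_tmem, true);
#endif
}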
cta_group_t<_Cta_Group> __cta_group, + _CUDA_VSTD::uint32_t __d_tmem, + _CUDA_VSTD::uint64_t __a_desc, + _CUDA_VSTD::uint64_t __b_desc, + _CUDA_VSTD::uint32_t __idesc, + _CUDA_VSTD::uint32_t __scale_A_tmem, + _CUDA_VSTD::uint32_t __scale_B_tmem, + bool __enable_input_d) +{ + static_assert(__kind == kind_mxf4 || __kind == kind_mxf4nvf4, ""); + static_assert(__cta_group == cta_group_1 || __cta_group == cta_group_2, ""); +# if _CCCL_CUDA_COMPILER(NVHPC) || __CUDA_ARCH_FEAT_SM100_ALL || __CUDA_ARCH_FEAT_SM101_ALL + _CCCL_IF_CONSTEXPR (__kind == kind_mxf4 && __cta_group == cta_group_1) + { + asm volatile( + "{\n\t .reg .pred PRED_enable_input_d; \n\t" + "setp.ne.b32 PRED_enable_input_d, %6, 0;\n\t" + "tcgen05.mma.cta_group::1.kind::mxf4.block_scale.scale_vec::2X [%0], %1, %2, %3, [%4], [%5], " + "PRED_enable_input_d;\n\t" + "}" + : + : "r"(__d_tmem), + "l"(__a_desc), + "l"(__b_desc), + "r"(__idesc), + "r"(__scale_A_tmem), + "r"(__scale_B_tmem), + "r"(static_cast<_CUDA_VSTD::uint32_t>(__enable_input_d)) + : "memory"); + } + else _CCCL_IF_CONSTEXPR (__kind == kind_mxf4 && __cta_group == cta_group_2) + { + asm volatile( + "{\n\t .reg .pred PRED_enable_input_d; \n\t" + "setp.ne.b32 PRED_enable_input_d, %6, 0;\n\t" + "tcgen05.mma.cta_group::2.kind::mxf4.block_scale.scale_vec::2X [%0], %1, %2, %3, [%4], [%5], " + "PRED_enable_input_d;\n\t" + "}" + : + : "r"(__d_tmem), + "l"(__a_desc), + "l"(__b_desc), + "r"(__idesc), + "r"(__scale_A_tmem), + "r"(__scale_B_tmem), + "r"(static_cast<_CUDA_VSTD::uint32_t>(__enable_input_d)) + : "memory"); + } + else _CCCL_IF_CONSTEXPR (__kind == kind_mxf4nvf4 && __cta_group == cta_group_1) + { + asm volatile( + "{\n\t .reg .pred PRED_enable_input_d; \n\t" + "setp.ne.b32 PRED_enable_input_d, %6, 0;\n\t" + "tcgen05.mma.cta_group::1.kind::mxf4nvf4.block_scale.scale_vec::2X [%0], %1, %2, %3, [%4], [%5], " + "PRED_enable_input_d;\n\t" + "}" + : + : "r"(__d_tmem), + "l"(__a_desc), + "l"(__b_desc), + "r"(__idesc), + "r"(__scale_A_tmem), + "r"(__scale_B_tmem), + "r"(static_cast<_CUDA_VSTD::uint32_t>(__enable_input_d)) + : "memory"); + } + else _CCCL_IF_CONSTEXPR (__kind == kind_mxf4nvf4 && __cta_group == cta_group_2) + { + asm volatile( + "{\n\t .reg .pred PRED_enable_input_d; \n\t" + "setp.ne.b32 PRED_enable_input_d, %6, 0;\n\t" + "tcgen05.mma.cta_group::2.kind::mxf4nvf4.block_scale.scale_vec::2X [%0], %1, %2, %3, [%4], [%5], " + "PRED_enable_input_d;\n\t" + "}" + : + : "r"(__d_tmem), + "l"(__a_desc), + "l"(__b_desc), + "r"(__idesc), + "r"(__scale_A_tmem), + "r"(__scale_B_tmem), + "r"(static_cast<_CUDA_VSTD::uint32_t>(__enable_input_d)) + : "memory"); + } +# else + // Unsupported architectures will have a linker error with a semi-decent error message + __cuda_ptx_tcgen05_mma_block_scale_vec_2_tmem_a_is_not_supported_before_SM_100a_SM_101a__(); +# endif +} +#endif // __cccl_ptx_isa >= 860 + +/* +// tcgen05.mma.cta_group.kind.block_scale.scale_vec::4X [d_tmem], a_desc, b_desc, idesc, [scale_A_tmem], [scale_B_tmem], +enable_input_d; // PTX ISA 86, SM_100a, SM_101a +// .kind = { .kind::mxf4nvf4 } +// .cta_group = { .cta_group::1, .cta_group::2 } +template +__device__ static inline void tcgen05_mma_block_scale_vec_4x_tmem_a( + cuda::ptx::kind_mxf4nvf4_t, + cuda::ptx::cta_group_t cta_group, + uint32_t d_tmem, + uint64_t a_desc, + uint64_t b_desc, + uint32_t idesc, + uint32_t scale_A_tmem, + uint32_t scale_B_tmem, + bool enable_input_d); +*/ +#if __cccl_ptx_isa >= 860 +extern "C" _CCCL_DEVICE void 
+__cuda_ptx_tcgen05_mma_block_scale_vec_4x_tmem_a_is_not_supported_before_SM_100a_SM_101a__(); +template +_CCCL_DEVICE static inline void tcgen05_mma_block_scale_vec_4x_tmem_a( + kind_mxf4nvf4_t, + cta_group_t<_Cta_Group> __cta_group, + _CUDA_VSTD::uint32_t __d_tmem, + _CUDA_VSTD::uint64_t __a_desc, + _CUDA_VSTD::uint64_t __b_desc, + _CUDA_VSTD::uint32_t __idesc, + _CUDA_VSTD::uint32_t __scale_A_tmem, + _CUDA_VSTD::uint32_t __scale_B_tmem, + bool __enable_input_d) +{ + // __kind == kind_mxf4nvf4 (due to parameter type constraint) + static_assert(__cta_group == cta_group_1 || __cta_group == cta_group_2, ""); +# if _CCCL_CUDA_COMPILER(NVHPC) || __CUDA_ARCH_FEAT_SM100_ALL || __CUDA_ARCH_FEAT_SM101_ALL + _CCCL_IF_CONSTEXPR (__cta_group == cta_group_1) + { + asm volatile( + "{\n\t .reg .pred PRED_enable_input_d; \n\t" + "setp.ne.b32 PRED_enable_input_d, %6, 0;\n\t" + "tcgen05.mma.cta_group::1.kind::mxf4nvf4.block_scale.scale_vec::4X [%0], %1, %2, %3, [%4], [%5], " + "PRED_enable_input_d;\n\t" + "}" + : + : "r"(__d_tmem), + "l"(__a_desc), + "l"(__b_desc), + "r"(__idesc), + "r"(__scale_A_tmem), + "r"(__scale_B_tmem), + "r"(static_cast<_CUDA_VSTD::uint32_t>(__enable_input_d)) + : "memory"); + } + else _CCCL_IF_CONSTEXPR (__cta_group == cta_group_2) + { + asm volatile( + "{\n\t .reg .pred PRED_enable_input_d; \n\t" + "setp.ne.b32 PRED_enable_input_d, %6, 0;\n\t" + "tcgen05.mma.cta_group::2.kind::mxf4nvf4.block_scale.scale_vec::4X [%0], %1, %2, %3, [%4], [%5], " + "PRED_enable_input_d;\n\t" + "}" + : + : "r"(__d_tmem), + "l"(__a_desc), + "l"(__b_desc), + "r"(__idesc), + "r"(__scale_A_tmem), + "r"(__scale_B_tmem), + "r"(static_cast<_CUDA_VSTD::uint32_t>(__enable_input_d)) + : "memory"); + } +# else + // Unsupported architectures will have a linker error with a semi-decent error message + __cuda_ptx_tcgen05_mma_block_scale_vec_4x_tmem_a_is_not_supported_before_SM_100a_SM_101a__(); +# endif +} +#endif // __cccl_ptx_isa >= 860 + +/* +// tcgen05.mma.cta_group.kind.block_scale.scale_vec::1X.collector::a::fill [d_tmem], a_desc, b_desc, idesc, +[scale_A_tmem], [scale_B_tmem], enable_input_d; // PTX ISA 86, SM_100a, SM_101a +// .kind = { .kind::mxf8f6f4 } +// .cta_group = { .cta_group::1, .cta_group::2 } +template +__device__ static inline void tcgen05_mma_block_scale_vec_1x_collector_a_fill( + cuda::ptx::kind_mxf8f6f4_t, + cuda::ptx::cta_group_t cta_group, + uint32_t d_tmem, + uint64_t a_desc, + uint64_t b_desc, + uint32_t idesc, + uint32_t scale_A_tmem, + uint32_t scale_B_tmem, + bool enable_input_d); +*/ +#if __cccl_ptx_isa >= 860 +extern "C" _CCCL_DEVICE void +__cuda_ptx_tcgen05_mma_block_scale_vec_1x_collector_a_fill_is_not_supported_before_SM_100a_SM_101a__(); +template +_CCCL_DEVICE static inline void tcgen05_mma_block_scale_vec_1x_collector_a_fill( + kind_mxf8f6f4_t, + cta_group_t<_Cta_Group> __cta_group, + _CUDA_VSTD::uint32_t __d_tmem, + _CUDA_VSTD::uint64_t __a_desc, + _CUDA_VSTD::uint64_t __b_desc, + _CUDA_VSTD::uint32_t __idesc, + _CUDA_VSTD::uint32_t __scale_A_tmem, + _CUDA_VSTD::uint32_t __scale_B_tmem, + bool __enable_input_d) +{ + // __kind == kind_mxf8f6f4 (due to parameter type constraint) + static_assert(__cta_group == cta_group_1 || __cta_group == cta_group_2, ""); +# if _CCCL_CUDA_COMPILER(NVHPC) || __CUDA_ARCH_FEAT_SM100_ALL || __CUDA_ARCH_FEAT_SM101_ALL + _CCCL_IF_CONSTEXPR (__cta_group == cta_group_1) + { + asm volatile( + "{\n\t .reg .pred PRED_enable_input_d; \n\t" + "setp.ne.b32 PRED_enable_input_d, %6, 0;\n\t" + 
"tcgen05.mma.cta_group::1.kind::mxf8f6f4.block_scale.scale_vec::1X.collector::a::fill [%0], %1, %2, %3, [%4], " + "[%5], PRED_enable_input_d;\n\t" + "}" + : + : "r"(__d_tmem), + "l"(__a_desc), + "l"(__b_desc), + "r"(__idesc), + "r"(__scale_A_tmem), + "r"(__scale_B_tmem), + "r"(static_cast<_CUDA_VSTD::uint32_t>(__enable_input_d)) + : "memory"); + } + else _CCCL_IF_CONSTEXPR (__cta_group == cta_group_2) + { + asm volatile( + "{\n\t .reg .pred PRED_enable_input_d; \n\t" + "setp.ne.b32 PRED_enable_input_d, %6, 0;\n\t" + "tcgen05.mma.cta_group::2.kind::mxf8f6f4.block_scale.scale_vec::1X.collector::a::fill [%0], %1, %2, %3, [%4], " + "[%5], PRED_enable_input_d;\n\t" + "}" + : + : "r"(__d_tmem), + "l"(__a_desc), + "l"(__b_desc), + "r"(__idesc), + "r"(__scale_A_tmem), + "r"(__scale_B_tmem), + "r"(static_cast<_CUDA_VSTD::uint32_t>(__enable_input_d)) + : "memory"); + } +# else + // Unsupported architectures will have a linker error with a semi-decent error message + __cuda_ptx_tcgen05_mma_block_scale_vec_1x_collector_a_fill_is_not_supported_before_SM_100a_SM_101a__(); +# endif +} +#endif // __cccl_ptx_isa >= 860 + +/* +// tcgen05.mma.cta_group.kind.block_scale.scale_vec::2X.collector::a::fill [d_tmem], a_desc, b_desc, idesc, +[scale_A_tmem], [scale_B_tmem], enable_input_d; // PTX ISA 86, SM_100a, SM_101a +// .kind = { .kind::mxf4, .kind::mxf4nvf4 } +// .cta_group = { .cta_group::1, .cta_group::2 } +template +__device__ static inline void tcgen05_mma_block_scale_vec_2x_collector_a_fill( + cuda::ptx::kind_t kind, + cuda::ptx::cta_group_t cta_group, + uint32_t d_tmem, + uint64_t a_desc, + uint64_t b_desc, + uint32_t idesc, + uint32_t scale_A_tmem, + uint32_t scale_B_tmem, + bool enable_input_d); +*/ +#if __cccl_ptx_isa >= 860 +extern "C" _CCCL_DEVICE void +__cuda_ptx_tcgen05_mma_block_scale_vec_2x_collector_a_fill_is_not_supported_before_SM_100a_SM_101a__(); +template +_CCCL_DEVICE static inline void tcgen05_mma_block_scale_vec_2x_collector_a_fill( + kind_t<_Kind> __kind, + cta_group_t<_Cta_Group> __cta_group, + _CUDA_VSTD::uint32_t __d_tmem, + _CUDA_VSTD::uint64_t __a_desc, + _CUDA_VSTD::uint64_t __b_desc, + _CUDA_VSTD::uint32_t __idesc, + _CUDA_VSTD::uint32_t __scale_A_tmem, + _CUDA_VSTD::uint32_t __scale_B_tmem, + bool __enable_input_d) +{ + static_assert(__kind == kind_mxf4 || __kind == kind_mxf4nvf4, ""); + static_assert(__cta_group == cta_group_1 || __cta_group == cta_group_2, ""); +# if _CCCL_CUDA_COMPILER(NVHPC) || __CUDA_ARCH_FEAT_SM100_ALL || __CUDA_ARCH_FEAT_SM101_ALL + _CCCL_IF_CONSTEXPR (__kind == kind_mxf4 && __cta_group == cta_group_1) + { + asm volatile( + "{\n\t .reg .pred PRED_enable_input_d; \n\t" + "setp.ne.b32 PRED_enable_input_d, %6, 0;\n\t" + "tcgen05.mma.cta_group::1.kind::mxf4.block_scale.scale_vec::2X.collector::a::fill [%0], %1, %2, %3, [%4], [%5], " + "PRED_enable_input_d;\n\t" + "}" + : + : "r"(__d_tmem), + "l"(__a_desc), + "l"(__b_desc), + "r"(__idesc), + "r"(__scale_A_tmem), + "r"(__scale_B_tmem), + "r"(static_cast<_CUDA_VSTD::uint32_t>(__enable_input_d)) + : "memory"); + } + else _CCCL_IF_CONSTEXPR (__kind == kind_mxf4 && __cta_group == cta_group_2) + { + asm volatile( + "{\n\t .reg .pred PRED_enable_input_d; \n\t" + "setp.ne.b32 PRED_enable_input_d, %6, 0;\n\t" + "tcgen05.mma.cta_group::2.kind::mxf4.block_scale.scale_vec::2X.collector::a::fill [%0], %1, %2, %3, [%4], [%5], " + "PRED_enable_input_d;\n\t" + "}" + : + : "r"(__d_tmem), + "l"(__a_desc), + "l"(__b_desc), + "r"(__idesc), + "r"(__scale_A_tmem), + "r"(__scale_B_tmem), + 
"r"(static_cast<_CUDA_VSTD::uint32_t>(__enable_input_d)) + : "memory"); + } + else _CCCL_IF_CONSTEXPR (__kind == kind_mxf4nvf4 && __cta_group == cta_group_1) + { + asm volatile( + "{\n\t .reg .pred PRED_enable_input_d; \n\t" + "setp.ne.b32 PRED_enable_input_d, %6, 0;\n\t" + "tcgen05.mma.cta_group::1.kind::mxf4nvf4.block_scale.scale_vec::2X.collector::a::fill [%0], %1, %2, %3, [%4], " + "[%5], PRED_enable_input_d;\n\t" + "}" + : + : "r"(__d_tmem), + "l"(__a_desc), + "l"(__b_desc), + "r"(__idesc), + "r"(__scale_A_tmem), + "r"(__scale_B_tmem), + "r"(static_cast<_CUDA_VSTD::uint32_t>(__enable_input_d)) + : "memory"); + } + else _CCCL_IF_CONSTEXPR (__kind == kind_mxf4nvf4 && __cta_group == cta_group_2) + { + asm volatile( + "{\n\t .reg .pred PRED_enable_input_d; \n\t" + "setp.ne.b32 PRED_enable_input_d, %6, 0;\n\t" + "tcgen05.mma.cta_group::2.kind::mxf4nvf4.block_scale.scale_vec::2X.collector::a::fill [%0], %1, %2, %3, [%4], " + "[%5], PRED_enable_input_d;\n\t" + "}" + : + : "r"(__d_tmem), + "l"(__a_desc), + "l"(__b_desc), + "r"(__idesc), + "r"(__scale_A_tmem), + "r"(__scale_B_tmem), + "r"(static_cast<_CUDA_VSTD::uint32_t>(__enable_input_d)) + : "memory"); + } +# else + // Unsupported architectures will have a linker error with a semi-decent error message + __cuda_ptx_tcgen05_mma_block_scale_vec_2x_collector_a_fill_is_not_supported_before_SM_100a_SM_101a__(); +# endif +} +#endif // __cccl_ptx_isa >= 860 + +/* +// tcgen05.mma.cta_group.kind.block_scale.scale_vec::4X.collector::a::fill [d_tmem], a_desc, b_desc, idesc, +[scale_A_tmem], [scale_B_tmem], enable_input_d; // PTX ISA 86, SM_100a, SM_101a +// .kind = { .kind::mxf4nvf4 } +// .cta_group = { .cta_group::1, .cta_group::2 } +template +__device__ static inline void tcgen05_mma_block_scale_vec_4x_collector_a_fill( + cuda::ptx::kind_mxf4nvf4_t, + cuda::ptx::cta_group_t cta_group, + uint32_t d_tmem, + uint64_t a_desc, + uint64_t b_desc, + uint32_t idesc, + uint32_t scale_A_tmem, + uint32_t scale_B_tmem, + bool enable_input_d); +*/ +#if __cccl_ptx_isa >= 860 +extern "C" _CCCL_DEVICE void +__cuda_ptx_tcgen05_mma_block_scale_vec_4x_collector_a_fill_is_not_supported_before_SM_100a_SM_101a__(); +template +_CCCL_DEVICE static inline void tcgen05_mma_block_scale_vec_4x_collector_a_fill( + kind_mxf4nvf4_t, + cta_group_t<_Cta_Group> __cta_group, + _CUDA_VSTD::uint32_t __d_tmem, + _CUDA_VSTD::uint64_t __a_desc, + _CUDA_VSTD::uint64_t __b_desc, + _CUDA_VSTD::uint32_t __idesc, + _CUDA_VSTD::uint32_t __scale_A_tmem, + _CUDA_VSTD::uint32_t __scale_B_tmem, + bool __enable_input_d) +{ + // __kind == kind_mxf4nvf4 (due to parameter type constraint) + static_assert(__cta_group == cta_group_1 || __cta_group == cta_group_2, ""); +# if _CCCL_CUDA_COMPILER(NVHPC) || __CUDA_ARCH_FEAT_SM100_ALL || __CUDA_ARCH_FEAT_SM101_ALL + _CCCL_IF_CONSTEXPR (__cta_group == cta_group_1) + { + asm volatile( + "{\n\t .reg .pred PRED_enable_input_d; \n\t" + "setp.ne.b32 PRED_enable_input_d, %6, 0;\n\t" + "tcgen05.mma.cta_group::1.kind::mxf4nvf4.block_scale.scale_vec::4X.collector::a::fill [%0], %1, %2, %3, [%4], " + "[%5], PRED_enable_input_d;\n\t" + "}" + : + : "r"(__d_tmem), + "l"(__a_desc), + "l"(__b_desc), + "r"(__idesc), + "r"(__scale_A_tmem), + "r"(__scale_B_tmem), + "r"(static_cast<_CUDA_VSTD::uint32_t>(__enable_input_d)) + : "memory"); + } + else _CCCL_IF_CONSTEXPR (__cta_group == cta_group_2) + { + asm volatile( + "{\n\t .reg .pred PRED_enable_input_d; \n\t" + "setp.ne.b32 PRED_enable_input_d, %6, 0;\n\t" + 
"tcgen05.mma.cta_group::2.kind::mxf4nvf4.block_scale.scale_vec::4X.collector::a::fill [%0], %1, %2, %3, [%4], " + "[%5], PRED_enable_input_d;\n\t" + "}" + : + : "r"(__d_tmem), + "l"(__a_desc), + "l"(__b_desc), + "r"(__idesc), + "r"(__scale_A_tmem), + "r"(__scale_B_tmem), + "r"(static_cast<_CUDA_VSTD::uint32_t>(__enable_input_d)) + : "memory"); + } +# else + // Unsupported architectures will have a linker error with a semi-decent error message + __cuda_ptx_tcgen05_mma_block_scale_vec_4x_collector_a_fill_is_not_supported_before_SM_100a_SM_101a__(); +# endif +} +#endif // __cccl_ptx_isa >= 860 + +/* +// tcgen05.mma.cta_group.kind.block_scale.scale_vec::1X.collector::a::fill [d_tmem], a_desc, b_desc, idesc, +[scale_A_tmem], [scale_B_tmem], enable_input_d; // PTX ISA 86, SM_100a, SM_101a +// .kind = { .kind::mxf8f6f4 } +// .cta_group = { .cta_group::1, .cta_group::2 } +template +__device__ static inline void tcgen05_mma_block_scale_vec_1x_tmem_a_collector_a_fill( + cuda::ptx::kind_mxf8f6f4_t, + cuda::ptx::cta_group_t cta_group, + uint32_t d_tmem, + uint64_t a_desc, + uint64_t b_desc, + uint32_t idesc, + uint32_t scale_A_tmem, + uint32_t scale_B_tmem, + bool enable_input_d); +*/ +#if __cccl_ptx_isa >= 860 +extern "C" _CCCL_DEVICE void +__cuda_ptx_tcgen05_mma_block_scale_vec_1x_tmem_a_collector_a_fill_is_not_supported_before_SM_100a_SM_101a__(); +template +_CCCL_DEVICE static inline void tcgen05_mma_block_scale_vec_1x_tmem_a_collector_a_fill( + kind_mxf8f6f4_t, + cta_group_t<_Cta_Group> __cta_group, + _CUDA_VSTD::uint32_t __d_tmem, + _CUDA_VSTD::uint64_t __a_desc, + _CUDA_VSTD::uint64_t __b_desc, + _CUDA_VSTD::uint32_t __idesc, + _CUDA_VSTD::uint32_t __scale_A_tmem, + _CUDA_VSTD::uint32_t __scale_B_tmem, + bool __enable_input_d) +{ + // __kind == kind_mxf8f6f4 (due to parameter type constraint) + static_assert(__cta_group == cta_group_1 || __cta_group == cta_group_2, ""); +# if _CCCL_CUDA_COMPILER(NVHPC) || __CUDA_ARCH_FEAT_SM100_ALL || __CUDA_ARCH_FEAT_SM101_ALL + _CCCL_IF_CONSTEXPR (__cta_group == cta_group_1) + { + asm volatile( + "{\n\t .reg .pred PRED_enable_input_d; \n\t" + "setp.ne.b32 PRED_enable_input_d, %6, 0;\n\t" + "tcgen05.mma.cta_group::1.kind::mxf8f6f4.block_scale.scale_vec::1X.collector::a::fill [%0], %1, %2, %3, [%4], " + "[%5], PRED_enable_input_d;\n\t" + "}" + : + : "r"(__d_tmem), + "l"(__a_desc), + "l"(__b_desc), + "r"(__idesc), + "r"(__scale_A_tmem), + "r"(__scale_B_tmem), + "r"(static_cast<_CUDA_VSTD::uint32_t>(__enable_input_d)) + : "memory"); + } + else _CCCL_IF_CONSTEXPR (__cta_group == cta_group_2) + { + asm volatile( + "{\n\t .reg .pred PRED_enable_input_d; \n\t" + "setp.ne.b32 PRED_enable_input_d, %6, 0;\n\t" + "tcgen05.mma.cta_group::2.kind::mxf8f6f4.block_scale.scale_vec::1X.collector::a::fill [%0], %1, %2, %3, [%4], " + "[%5], PRED_enable_input_d;\n\t" + "}" + : + : "r"(__d_tmem), + "l"(__a_desc), + "l"(__b_desc), + "r"(__idesc), + "r"(__scale_A_tmem), + "r"(__scale_B_tmem), + "r"(static_cast<_CUDA_VSTD::uint32_t>(__enable_input_d)) + : "memory"); + } +# else + // Unsupported architectures will have a linker error with a semi-decent error message + __cuda_ptx_tcgen05_mma_block_scale_vec_1x_tmem_a_collector_a_fill_is_not_supported_before_SM_100a_SM_101a__(); +# endif +} +#endif // __cccl_ptx_isa >= 860 + +/* +// tcgen05.mma.cta_group.kind.block_scale.scale_vec::2X.collector::a::fill [d_tmem], a_desc, b_desc, idesc, +[scale_A_tmem], [scale_B_tmem], enable_input_d; // PTX ISA 86, SM_100a, SM_101a +// .kind = { .kind::mxf4, .kind::mxf4nvf4 } +// .cta_group = { 
.cta_group::1, .cta_group::2 } +template +__device__ static inline void tcgen05_mma_block_scale_vec_2_tmem_a_collector_a_fill( + cuda::ptx::kind_t kind, + cuda::ptx::cta_group_t cta_group, + uint32_t d_tmem, + uint64_t a_desc, + uint64_t b_desc, + uint32_t idesc, + uint32_t scale_A_tmem, + uint32_t scale_B_tmem, + bool enable_input_d); +*/ +#if __cccl_ptx_isa >= 860 +extern "C" _CCCL_DEVICE void +__cuda_ptx_tcgen05_mma_block_scale_vec_2_tmem_a_collector_a_fill_is_not_supported_before_SM_100a_SM_101a__(); +template +_CCCL_DEVICE static inline void tcgen05_mma_block_scale_vec_2_tmem_a_collector_a_fill( + kind_t<_Kind> __kind, + cta_group_t<_Cta_Group> __cta_group, + _CUDA_VSTD::uint32_t __d_tmem, + _CUDA_VSTD::uint64_t __a_desc, + _CUDA_VSTD::uint64_t __b_desc, + _CUDA_VSTD::uint32_t __idesc, + _CUDA_VSTD::uint32_t __scale_A_tmem, + _CUDA_VSTD::uint32_t __scale_B_tmem, + bool __enable_input_d) +{ + static_assert(__kind == kind_mxf4 || __kind == kind_mxf4nvf4, ""); + static_assert(__cta_group == cta_group_1 || __cta_group == cta_group_2, ""); +# if _CCCL_CUDA_COMPILER(NVHPC) || __CUDA_ARCH_FEAT_SM100_ALL || __CUDA_ARCH_FEAT_SM101_ALL + _CCCL_IF_CONSTEXPR (__kind == kind_mxf4 && __cta_group == cta_group_1) + { + asm volatile( + "{\n\t .reg .pred PRED_enable_input_d; \n\t" + "setp.ne.b32 PRED_enable_input_d, %6, 0;\n\t" + "tcgen05.mma.cta_group::1.kind::mxf4.block_scale.scale_vec::2X.collector::a::fill [%0], %1, %2, %3, [%4], [%5], " + "PRED_enable_input_d;\n\t" + "}" + : + : "r"(__d_tmem), + "l"(__a_desc), + "l"(__b_desc), + "r"(__idesc), + "r"(__scale_A_tmem), + "r"(__scale_B_tmem), + "r"(static_cast<_CUDA_VSTD::uint32_t>(__enable_input_d)) + : "memory"); + } + else _CCCL_IF_CONSTEXPR (__kind == kind_mxf4 && __cta_group == cta_group_2) + { + asm volatile( + "{\n\t .reg .pred PRED_enable_input_d; \n\t" + "setp.ne.b32 PRED_enable_input_d, %6, 0;\n\t" + "tcgen05.mma.cta_group::2.kind::mxf4.block_scale.scale_vec::2X.collector::a::fill [%0], %1, %2, %3, [%4], [%5], " + "PRED_enable_input_d;\n\t" + "}" + : + : "r"(__d_tmem), + "l"(__a_desc), + "l"(__b_desc), + "r"(__idesc), + "r"(__scale_A_tmem), + "r"(__scale_B_tmem), + "r"(static_cast<_CUDA_VSTD::uint32_t>(__enable_input_d)) + : "memory"); + } + else _CCCL_IF_CONSTEXPR (__kind == kind_mxf4nvf4 && __cta_group == cta_group_1) + { + asm volatile( + "{\n\t .reg .pred PRED_enable_input_d; \n\t" + "setp.ne.b32 PRED_enable_input_d, %6, 0;\n\t" + "tcgen05.mma.cta_group::1.kind::mxf4nvf4.block_scale.scale_vec::2X.collector::a::fill [%0], %1, %2, %3, [%4], " + "[%5], PRED_enable_input_d;\n\t" + "}" + : + : "r"(__d_tmem), + "l"(__a_desc), + "l"(__b_desc), + "r"(__idesc), + "r"(__scale_A_tmem), + "r"(__scale_B_tmem), + "r"(static_cast<_CUDA_VSTD::uint32_t>(__enable_input_d)) + : "memory"); + } + else _CCCL_IF_CONSTEXPR (__kind == kind_mxf4nvf4 && __cta_group == cta_group_2) + { + asm volatile( + "{\n\t .reg .pred PRED_enable_input_d; \n\t" + "setp.ne.b32 PRED_enable_input_d, %6, 0;\n\t" + "tcgen05.mma.cta_group::2.kind::mxf4nvf4.block_scale.scale_vec::2X.collector::a::fill [%0], %1, %2, %3, [%4], " + "[%5], PRED_enable_input_d;\n\t" + "}" + : + : "r"(__d_tmem), + "l"(__a_desc), + "l"(__b_desc), + "r"(__idesc), + "r"(__scale_A_tmem), + "r"(__scale_B_tmem), + "r"(static_cast<_CUDA_VSTD::uint32_t>(__enable_input_d)) + : "memory"); + } +# else + // Unsupported architectures will have a linker error with a semi-decent error message + __cuda_ptx_tcgen05_mma_block_scale_vec_2_tmem_a_collector_a_fill_is_not_supported_before_SM_100a_SM_101a__(); +# endif +} +#endif 
// __cccl_ptx_isa >= 860 + +/* +// tcgen05.mma.cta_group.kind.block_scale.scale_vec::4X.collector::a::fill [d_tmem], a_desc, b_desc, idesc, +[scale_A_tmem], [scale_B_tmem], enable_input_d; // PTX ISA 86, SM_100a, SM_101a +// .kind = { .kind::mxf4nvf4 } +// .cta_group = { .cta_group::1, .cta_group::2 } +template +__device__ static inline void tcgen05_mma_block_scale_vec_4x_tmem_a_collector_a_fill( + cuda::ptx::kind_mxf4nvf4_t, + cuda::ptx::cta_group_t cta_group, + uint32_t d_tmem, + uint64_t a_desc, + uint64_t b_desc, + uint32_t idesc, + uint32_t scale_A_tmem, + uint32_t scale_B_tmem, + bool enable_input_d); +*/ +#if __cccl_ptx_isa >= 860 +extern "C" _CCCL_DEVICE void +__cuda_ptx_tcgen05_mma_block_scale_vec_4x_tmem_a_collector_a_fill_is_not_supported_before_SM_100a_SM_101a__(); +template +_CCCL_DEVICE static inline void tcgen05_mma_block_scale_vec_4x_tmem_a_collector_a_fill( + kind_mxf4nvf4_t, + cta_group_t<_Cta_Group> __cta_group, + _CUDA_VSTD::uint32_t __d_tmem, + _CUDA_VSTD::uint64_t __a_desc, + _CUDA_VSTD::uint64_t __b_desc, + _CUDA_VSTD::uint32_t __idesc, + _CUDA_VSTD::uint32_t __scale_A_tmem, + _CUDA_VSTD::uint32_t __scale_B_tmem, + bool __enable_input_d) +{ + // __kind == kind_mxf4nvf4 (due to parameter type constraint) + static_assert(__cta_group == cta_group_1 || __cta_group == cta_group_2, ""); +# if _CCCL_CUDA_COMPILER(NVHPC) || __CUDA_ARCH_FEAT_SM100_ALL || __CUDA_ARCH_FEAT_SM101_ALL + _CCCL_IF_CONSTEXPR (__cta_group == cta_group_1) + { + asm volatile( + "{\n\t .reg .pred PRED_enable_input_d; \n\t" + "setp.ne.b32 PRED_enable_input_d, %6, 0;\n\t" + "tcgen05.mma.cta_group::1.kind::mxf4nvf4.block_scale.scale_vec::4X.collector::a::fill [%0], %1, %2, %3, [%4], " + "[%5], PRED_enable_input_d;\n\t" + "}" + : + : "r"(__d_tmem), + "l"(__a_desc), + "l"(__b_desc), + "r"(__idesc), + "r"(__scale_A_tmem), + "r"(__scale_B_tmem), + "r"(static_cast<_CUDA_VSTD::uint32_t>(__enable_input_d)) + : "memory"); + } + else _CCCL_IF_CONSTEXPR (__cta_group == cta_group_2) + { + asm volatile( + "{\n\t .reg .pred PRED_enable_input_d; \n\t" + "setp.ne.b32 PRED_enable_input_d, %6, 0;\n\t" + "tcgen05.mma.cta_group::2.kind::mxf4nvf4.block_scale.scale_vec::4X.collector::a::fill [%0], %1, %2, %3, [%4], " + "[%5], PRED_enable_input_d;\n\t" + "}" + : + : "r"(__d_tmem), + "l"(__a_desc), + "l"(__b_desc), + "r"(__idesc), + "r"(__scale_A_tmem), + "r"(__scale_B_tmem), + "r"(static_cast<_CUDA_VSTD::uint32_t>(__enable_input_d)) + : "memory"); + } +# else + // Unsupported architectures will have a linker error with a semi-decent error message + __cuda_ptx_tcgen05_mma_block_scale_vec_4x_tmem_a_collector_a_fill_is_not_supported_before_SM_100a_SM_101a__(); +# endif +} +#endif // __cccl_ptx_isa >= 860 + +/* +// tcgen05.mma.cta_group.kind.block_scale.scale_vec::1X.collector::a::use [d_tmem], a_desc, b_desc, idesc, +[scale_A_tmem], [scale_B_tmem], enable_input_d; // PTX ISA 86, SM_100a, SM_101a +// .kind = { .kind::mxf8f6f4 } +// .cta_group = { .cta_group::1, .cta_group::2 } +template +__device__ static inline void tcgen05_mma_block_scale_vec_1x_collector_a_use( + cuda::ptx::kind_mxf8f6f4_t, + cuda::ptx::cta_group_t cta_group, + uint32_t d_tmem, + uint64_t a_desc, + uint64_t b_desc, + uint32_t idesc, + uint32_t scale_A_tmem, + uint32_t scale_B_tmem, + bool enable_input_d); +*/ +#if __cccl_ptx_isa >= 860 +extern "C" _CCCL_DEVICE void +__cuda_ptx_tcgen05_mma_block_scale_vec_1x_collector_a_use_is_not_supported_before_SM_100a_SM_101a__(); +template +_CCCL_DEVICE static inline void tcgen05_mma_block_scale_vec_1x_collector_a_use( + 
kind_mxf8f6f4_t, + cta_group_t<_Cta_Group> __cta_group, + _CUDA_VSTD::uint32_t __d_tmem, + _CUDA_VSTD::uint64_t __a_desc, + _CUDA_VSTD::uint64_t __b_desc, + _CUDA_VSTD::uint32_t __idesc, + _CUDA_VSTD::uint32_t __scale_A_tmem, + _CUDA_VSTD::uint32_t __scale_B_tmem, + bool __enable_input_d) +{ + // __kind == kind_mxf8f6f4 (due to parameter type constraint) + static_assert(__cta_group == cta_group_1 || __cta_group == cta_group_2, ""); +# if _CCCL_CUDA_COMPILER(NVHPC) || __CUDA_ARCH_FEAT_SM100_ALL || __CUDA_ARCH_FEAT_SM101_ALL + _CCCL_IF_CONSTEXPR (__cta_group == cta_group_1) + { + asm volatile( + "{\n\t .reg .pred PRED_enable_input_d; \n\t" + "setp.ne.b32 PRED_enable_input_d, %6, 0;\n\t" + "tcgen05.mma.cta_group::1.kind::mxf8f6f4.block_scale.scale_vec::1X.collector::a::use [%0], %1, %2, %3, [%4], " + "[%5], PRED_enable_input_d;\n\t" + "}" + : + : "r"(__d_tmem), + "l"(__a_desc), + "l"(__b_desc), + "r"(__idesc), + "r"(__scale_A_tmem), + "r"(__scale_B_tmem), + "r"(static_cast<_CUDA_VSTD::uint32_t>(__enable_input_d)) + : "memory"); + } + else _CCCL_IF_CONSTEXPR (__cta_group == cta_group_2) + { + asm volatile( + "{\n\t .reg .pred PRED_enable_input_d; \n\t" + "setp.ne.b32 PRED_enable_input_d, %6, 0;\n\t" + "tcgen05.mma.cta_group::2.kind::mxf8f6f4.block_scale.scale_vec::1X.collector::a::use [%0], %1, %2, %3, [%4], " + "[%5], PRED_enable_input_d;\n\t" + "}" + : + : "r"(__d_tmem), + "l"(__a_desc), + "l"(__b_desc), + "r"(__idesc), + "r"(__scale_A_tmem), + "r"(__scale_B_tmem), + "r"(static_cast<_CUDA_VSTD::uint32_t>(__enable_input_d)) + : "memory"); + } +# else + // Unsupported architectures will have a linker error with a semi-decent error message + __cuda_ptx_tcgen05_mma_block_scale_vec_1x_collector_a_use_is_not_supported_before_SM_100a_SM_101a__(); +# endif +} +#endif // __cccl_ptx_isa >= 860 + +/* +// tcgen05.mma.cta_group.kind.block_scale.scale_vec::2X.collector::a::use [d_tmem], a_desc, b_desc, idesc, +[scale_A_tmem], [scale_B_tmem], enable_input_d; // PTX ISA 86, SM_100a, SM_101a +// .kind = { .kind::mxf4, .kind::mxf4nvf4 } +// .cta_group = { .cta_group::1, .cta_group::2 } +template +__device__ static inline void tcgen05_mma_block_scale_vec_2x_collector_a_use( + cuda::ptx::kind_t kind, + cuda::ptx::cta_group_t cta_group, + uint32_t d_tmem, + uint64_t a_desc, + uint64_t b_desc, + uint32_t idesc, + uint32_t scale_A_tmem, + uint32_t scale_B_tmem, + bool enable_input_d); +*/ +#if __cccl_ptx_isa >= 860 +extern "C" _CCCL_DEVICE void +__cuda_ptx_tcgen05_mma_block_scale_vec_2x_collector_a_use_is_not_supported_before_SM_100a_SM_101a__(); +template +_CCCL_DEVICE static inline void tcgen05_mma_block_scale_vec_2x_collector_a_use( + kind_t<_Kind> __kind, + cta_group_t<_Cta_Group> __cta_group, + _CUDA_VSTD::uint32_t __d_tmem, + _CUDA_VSTD::uint64_t __a_desc, + _CUDA_VSTD::uint64_t __b_desc, + _CUDA_VSTD::uint32_t __idesc, + _CUDA_VSTD::uint32_t __scale_A_tmem, + _CUDA_VSTD::uint32_t __scale_B_tmem, + bool __enable_input_d) +{ + static_assert(__kind == kind_mxf4 || __kind == kind_mxf4nvf4, ""); + static_assert(__cta_group == cta_group_1 || __cta_group == cta_group_2, ""); +# if _CCCL_CUDA_COMPILER(NVHPC) || __CUDA_ARCH_FEAT_SM100_ALL || __CUDA_ARCH_FEAT_SM101_ALL + _CCCL_IF_CONSTEXPR (__kind == kind_mxf4 && __cta_group == cta_group_1) + { + asm volatile( + "{\n\t .reg .pred PRED_enable_input_d; \n\t" + "setp.ne.b32 PRED_enable_input_d, %6, 0;\n\t" + "tcgen05.mma.cta_group::1.kind::mxf4.block_scale.scale_vec::2X.collector::a::use [%0], %1, %2, %3, [%4], [%5], " + "PRED_enable_input_d;\n\t" + "}" + : + : 
"r"(__d_tmem), + "l"(__a_desc), + "l"(__b_desc), + "r"(__idesc), + "r"(__scale_A_tmem), + "r"(__scale_B_tmem), + "r"(static_cast<_CUDA_VSTD::uint32_t>(__enable_input_d)) + : "memory"); + } + else _CCCL_IF_CONSTEXPR (__kind == kind_mxf4 && __cta_group == cta_group_2) + { + asm volatile( + "{\n\t .reg .pred PRED_enable_input_d; \n\t" + "setp.ne.b32 PRED_enable_input_d, %6, 0;\n\t" + "tcgen05.mma.cta_group::2.kind::mxf4.block_scale.scale_vec::2X.collector::a::use [%0], %1, %2, %3, [%4], [%5], " + "PRED_enable_input_d;\n\t" + "}" + : + : "r"(__d_tmem), + "l"(__a_desc), + "l"(__b_desc), + "r"(__idesc), + "r"(__scale_A_tmem), + "r"(__scale_B_tmem), + "r"(static_cast<_CUDA_VSTD::uint32_t>(__enable_input_d)) + : "memory"); + } + else _CCCL_IF_CONSTEXPR (__kind == kind_mxf4nvf4 && __cta_group == cta_group_1) + { + asm volatile( + "{\n\t .reg .pred PRED_enable_input_d; \n\t" + "setp.ne.b32 PRED_enable_input_d, %6, 0;\n\t" + "tcgen05.mma.cta_group::1.kind::mxf4nvf4.block_scale.scale_vec::2X.collector::a::use [%0], %1, %2, %3, [%4], " + "[%5], PRED_enable_input_d;\n\t" + "}" + : + : "r"(__d_tmem), + "l"(__a_desc), + "l"(__b_desc), + "r"(__idesc), + "r"(__scale_A_tmem), + "r"(__scale_B_tmem), + "r"(static_cast<_CUDA_VSTD::uint32_t>(__enable_input_d)) + : "memory"); + } + else _CCCL_IF_CONSTEXPR (__kind == kind_mxf4nvf4 && __cta_group == cta_group_2) + { + asm volatile( + "{\n\t .reg .pred PRED_enable_input_d; \n\t" + "setp.ne.b32 PRED_enable_input_d, %6, 0;\n\t" + "tcgen05.mma.cta_group::2.kind::mxf4nvf4.block_scale.scale_vec::2X.collector::a::use [%0], %1, %2, %3, [%4], " + "[%5], PRED_enable_input_d;\n\t" + "}" + : + : "r"(__d_tmem), + "l"(__a_desc), + "l"(__b_desc), + "r"(__idesc), + "r"(__scale_A_tmem), + "r"(__scale_B_tmem), + "r"(static_cast<_CUDA_VSTD::uint32_t>(__enable_input_d)) + : "memory"); + } +# else + // Unsupported architectures will have a linker error with a semi-decent error message + __cuda_ptx_tcgen05_mma_block_scale_vec_2x_collector_a_use_is_not_supported_before_SM_100a_SM_101a__(); +# endif +} +#endif // __cccl_ptx_isa >= 860 + +/* +// tcgen05.mma.cta_group.kind.block_scale.scale_vec::4X.collector::a::use [d_tmem], a_desc, b_desc, idesc, +[scale_A_tmem], [scale_B_tmem], enable_input_d; // PTX ISA 86, SM_100a, SM_101a +// .kind = { .kind::mxf4nvf4 } +// .cta_group = { .cta_group::1, .cta_group::2 } +template +__device__ static inline void tcgen05_mma_block_scale_vec_4x_collector_a_use( + cuda::ptx::kind_mxf4nvf4_t, + cuda::ptx::cta_group_t cta_group, + uint32_t d_tmem, + uint64_t a_desc, + uint64_t b_desc, + uint32_t idesc, + uint32_t scale_A_tmem, + uint32_t scale_B_tmem, + bool enable_input_d); +*/ +#if __cccl_ptx_isa >= 860 +extern "C" _CCCL_DEVICE void +__cuda_ptx_tcgen05_mma_block_scale_vec_4x_collector_a_use_is_not_supported_before_SM_100a_SM_101a__(); +template +_CCCL_DEVICE static inline void tcgen05_mma_block_scale_vec_4x_collector_a_use( + kind_mxf4nvf4_t, + cta_group_t<_Cta_Group> __cta_group, + _CUDA_VSTD::uint32_t __d_tmem, + _CUDA_VSTD::uint64_t __a_desc, + _CUDA_VSTD::uint64_t __b_desc, + _CUDA_VSTD::uint32_t __idesc, + _CUDA_VSTD::uint32_t __scale_A_tmem, + _CUDA_VSTD::uint32_t __scale_B_tmem, + bool __enable_input_d) +{ + // __kind == kind_mxf4nvf4 (due to parameter type constraint) + static_assert(__cta_group == cta_group_1 || __cta_group == cta_group_2, ""); +# if _CCCL_CUDA_COMPILER(NVHPC) || __CUDA_ARCH_FEAT_SM100_ALL || __CUDA_ARCH_FEAT_SM101_ALL + _CCCL_IF_CONSTEXPR (__cta_group == cta_group_1) + { + asm volatile( + "{\n\t .reg .pred PRED_enable_input_d; 
\n\t" + "setp.ne.b32 PRED_enable_input_d, %6, 0;\n\t" + "tcgen05.mma.cta_group::1.kind::mxf4nvf4.block_scale.scale_vec::4X.collector::a::use [%0], %1, %2, %3, [%4], " + "[%5], PRED_enable_input_d;\n\t" + "}" + : + : "r"(__d_tmem), + "l"(__a_desc), + "l"(__b_desc), + "r"(__idesc), + "r"(__scale_A_tmem), + "r"(__scale_B_tmem), + "r"(static_cast<_CUDA_VSTD::uint32_t>(__enable_input_d)) + : "memory"); + } + else _CCCL_IF_CONSTEXPR (__cta_group == cta_group_2) + { + asm volatile( + "{\n\t .reg .pred PRED_enable_input_d; \n\t" + "setp.ne.b32 PRED_enable_input_d, %6, 0;\n\t" + "tcgen05.mma.cta_group::2.kind::mxf4nvf4.block_scale.scale_vec::4X.collector::a::use [%0], %1, %2, %3, [%4], " + "[%5], PRED_enable_input_d;\n\t" + "}" + : + : "r"(__d_tmem), + "l"(__a_desc), + "l"(__b_desc), + "r"(__idesc), + "r"(__scale_A_tmem), + "r"(__scale_B_tmem), + "r"(static_cast<_CUDA_VSTD::uint32_t>(__enable_input_d)) + : "memory"); + } +# else + // Unsupported architectures will have a linker error with a semi-decent error message + __cuda_ptx_tcgen05_mma_block_scale_vec_4x_collector_a_use_is_not_supported_before_SM_100a_SM_101a__(); +# endif +} +#endif // __cccl_ptx_isa >= 860 + +/* +// tcgen05.mma.cta_group.kind.block_scale.scale_vec::1X.collector::a::use [d_tmem], a_desc, b_desc, idesc, +[scale_A_tmem], [scale_B_tmem], enable_input_d; // PTX ISA 86, SM_100a, SM_101a +// .kind = { .kind::mxf8f6f4 } +// .cta_group = { .cta_group::1, .cta_group::2 } +template +__device__ static inline void tcgen05_mma_block_scale_vec_1x_tmem_a_collector_a_use( + cuda::ptx::kind_mxf8f6f4_t, + cuda::ptx::cta_group_t cta_group, + uint32_t d_tmem, + uint64_t a_desc, + uint64_t b_desc, + uint32_t idesc, + uint32_t scale_A_tmem, + uint32_t scale_B_tmem, + bool enable_input_d); +*/ +#if __cccl_ptx_isa >= 860 +extern "C" _CCCL_DEVICE void +__cuda_ptx_tcgen05_mma_block_scale_vec_1x_tmem_a_collector_a_use_is_not_supported_before_SM_100a_SM_101a__(); +template +_CCCL_DEVICE static inline void tcgen05_mma_block_scale_vec_1x_tmem_a_collector_a_use( + kind_mxf8f6f4_t, + cta_group_t<_Cta_Group> __cta_group, + _CUDA_VSTD::uint32_t __d_tmem, + _CUDA_VSTD::uint64_t __a_desc, + _CUDA_VSTD::uint64_t __b_desc, + _CUDA_VSTD::uint32_t __idesc, + _CUDA_VSTD::uint32_t __scale_A_tmem, + _CUDA_VSTD::uint32_t __scale_B_tmem, + bool __enable_input_d) +{ + // __kind == kind_mxf8f6f4 (due to parameter type constraint) + static_assert(__cta_group == cta_group_1 || __cta_group == cta_group_2, ""); +# if _CCCL_CUDA_COMPILER(NVHPC) || __CUDA_ARCH_FEAT_SM100_ALL || __CUDA_ARCH_FEAT_SM101_ALL + _CCCL_IF_CONSTEXPR (__cta_group == cta_group_1) + { + asm volatile( + "{\n\t .reg .pred PRED_enable_input_d; \n\t" + "setp.ne.b32 PRED_enable_input_d, %6, 0;\n\t" + "tcgen05.mma.cta_group::1.kind::mxf8f6f4.block_scale.scale_vec::1X.collector::a::use [%0], %1, %2, %3, [%4], " + "[%5], PRED_enable_input_d;\n\t" + "}" + : + : "r"(__d_tmem), + "l"(__a_desc), + "l"(__b_desc), + "r"(__idesc), + "r"(__scale_A_tmem), + "r"(__scale_B_tmem), + "r"(static_cast<_CUDA_VSTD::uint32_t>(__enable_input_d)) + : "memory"); + } + else _CCCL_IF_CONSTEXPR (__cta_group == cta_group_2) + { + asm volatile( + "{\n\t .reg .pred PRED_enable_input_d; \n\t" + "setp.ne.b32 PRED_enable_input_d, %6, 0;\n\t" + "tcgen05.mma.cta_group::2.kind::mxf8f6f4.block_scale.scale_vec::1X.collector::a::use [%0], %1, %2, %3, [%4], " + "[%5], PRED_enable_input_d;\n\t" + "}" + : + : "r"(__d_tmem), + "l"(__a_desc), + "l"(__b_desc), + "r"(__idesc), + "r"(__scale_A_tmem), + "r"(__scale_B_tmem), + 
"r"(static_cast<_CUDA_VSTD::uint32_t>(__enable_input_d)) + : "memory"); + } +# else + // Unsupported architectures will have a linker error with a semi-decent error message + __cuda_ptx_tcgen05_mma_block_scale_vec_1x_tmem_a_collector_a_use_is_not_supported_before_SM_100a_SM_101a__(); +# endif +} +#endif // __cccl_ptx_isa >= 860 + +/* +// tcgen05.mma.cta_group.kind.block_scale.scale_vec::2X.collector::a::use [d_tmem], a_desc, b_desc, idesc, +[scale_A_tmem], [scale_B_tmem], enable_input_d; // PTX ISA 86, SM_100a, SM_101a +// .kind = { .kind::mxf4, .kind::mxf4nvf4 } +// .cta_group = { .cta_group::1, .cta_group::2 } +template +__device__ static inline void tcgen05_mma_block_scale_vec_2_tmem_a_collector_a_use( + cuda::ptx::kind_t kind, + cuda::ptx::cta_group_t cta_group, + uint32_t d_tmem, + uint64_t a_desc, + uint64_t b_desc, + uint32_t idesc, + uint32_t scale_A_tmem, + uint32_t scale_B_tmem, + bool enable_input_d); +*/ +#if __cccl_ptx_isa >= 860 +extern "C" _CCCL_DEVICE void +__cuda_ptx_tcgen05_mma_block_scale_vec_2_tmem_a_collector_a_use_is_not_supported_before_SM_100a_SM_101a__(); +template +_CCCL_DEVICE static inline void tcgen05_mma_block_scale_vec_2_tmem_a_collector_a_use( + kind_t<_Kind> __kind, + cta_group_t<_Cta_Group> __cta_group, + _CUDA_VSTD::uint32_t __d_tmem, + _CUDA_VSTD::uint64_t __a_desc, + _CUDA_VSTD::uint64_t __b_desc, + _CUDA_VSTD::uint32_t __idesc, + _CUDA_VSTD::uint32_t __scale_A_tmem, + _CUDA_VSTD::uint32_t __scale_B_tmem, + bool __enable_input_d) +{ + static_assert(__kind == kind_mxf4 || __kind == kind_mxf4nvf4, ""); + static_assert(__cta_group == cta_group_1 || __cta_group == cta_group_2, ""); +# if _CCCL_CUDA_COMPILER(NVHPC) || __CUDA_ARCH_FEAT_SM100_ALL || __CUDA_ARCH_FEAT_SM101_ALL + _CCCL_IF_CONSTEXPR (__kind == kind_mxf4 && __cta_group == cta_group_1) + { + asm volatile( + "{\n\t .reg .pred PRED_enable_input_d; \n\t" + "setp.ne.b32 PRED_enable_input_d, %6, 0;\n\t" + "tcgen05.mma.cta_group::1.kind::mxf4.block_scale.scale_vec::2X.collector::a::use [%0], %1, %2, %3, [%4], [%5], " + "PRED_enable_input_d;\n\t" + "}" + : + : "r"(__d_tmem), + "l"(__a_desc), + "l"(__b_desc), + "r"(__idesc), + "r"(__scale_A_tmem), + "r"(__scale_B_tmem), + "r"(static_cast<_CUDA_VSTD::uint32_t>(__enable_input_d)) + : "memory"); + } + else _CCCL_IF_CONSTEXPR (__kind == kind_mxf4 && __cta_group == cta_group_2) + { + asm volatile( + "{\n\t .reg .pred PRED_enable_input_d; \n\t" + "setp.ne.b32 PRED_enable_input_d, %6, 0;\n\t" + "tcgen05.mma.cta_group::2.kind::mxf4.block_scale.scale_vec::2X.collector::a::use [%0], %1, %2, %3, [%4], [%5], " + "PRED_enable_input_d;\n\t" + "}" + : + : "r"(__d_tmem), + "l"(__a_desc), + "l"(__b_desc), + "r"(__idesc), + "r"(__scale_A_tmem), + "r"(__scale_B_tmem), + "r"(static_cast<_CUDA_VSTD::uint32_t>(__enable_input_d)) + : "memory"); + } + else _CCCL_IF_CONSTEXPR (__kind == kind_mxf4nvf4 && __cta_group == cta_group_1) + { + asm volatile( + "{\n\t .reg .pred PRED_enable_input_d; \n\t" + "setp.ne.b32 PRED_enable_input_d, %6, 0;\n\t" + "tcgen05.mma.cta_group::1.kind::mxf4nvf4.block_scale.scale_vec::2X.collector::a::use [%0], %1, %2, %3, [%4], " + "[%5], PRED_enable_input_d;\n\t" + "}" + : + : "r"(__d_tmem), + "l"(__a_desc), + "l"(__b_desc), + "r"(__idesc), + "r"(__scale_A_tmem), + "r"(__scale_B_tmem), + "r"(static_cast<_CUDA_VSTD::uint32_t>(__enable_input_d)) + : "memory"); + } + else _CCCL_IF_CONSTEXPR (__kind == kind_mxf4nvf4 && __cta_group == cta_group_2) + { + asm volatile( + "{\n\t .reg .pred PRED_enable_input_d; \n\t" + "setp.ne.b32 PRED_enable_input_d, %6, 0;\n\t" 
+ "tcgen05.mma.cta_group::2.kind::mxf4nvf4.block_scale.scale_vec::2X.collector::a::use [%0], %1, %2, %3, [%4], " + "[%5], PRED_enable_input_d;\n\t" + "}" + : + : "r"(__d_tmem), + "l"(__a_desc), + "l"(__b_desc), + "r"(__idesc), + "r"(__scale_A_tmem), + "r"(__scale_B_tmem), + "r"(static_cast<_CUDA_VSTD::uint32_t>(__enable_input_d)) + : "memory"); + } +# else + // Unsupported architectures will have a linker error with a semi-decent error message + __cuda_ptx_tcgen05_mma_block_scale_vec_2_tmem_a_collector_a_use_is_not_supported_before_SM_100a_SM_101a__(); +# endif +} +#endif // __cccl_ptx_isa >= 860 + +/* +// tcgen05.mma.cta_group.kind.block_scale.scale_vec::4X.collector::a::use [d_tmem], a_desc, b_desc, idesc, +[scale_A_tmem], [scale_B_tmem], enable_input_d; // PTX ISA 86, SM_100a, SM_101a +// .kind = { .kind::mxf4nvf4 } +// .cta_group = { .cta_group::1, .cta_group::2 } +template +__device__ static inline void tcgen05_mma_block_scale_vec_4x_tmem_a_collector_a_use( + cuda::ptx::kind_mxf4nvf4_t, + cuda::ptx::cta_group_t cta_group, + uint32_t d_tmem, + uint64_t a_desc, + uint64_t b_desc, + uint32_t idesc, + uint32_t scale_A_tmem, + uint32_t scale_B_tmem, + bool enable_input_d); +*/ +#if __cccl_ptx_isa >= 860 +extern "C" _CCCL_DEVICE void +__cuda_ptx_tcgen05_mma_block_scale_vec_4x_tmem_a_collector_a_use_is_not_supported_before_SM_100a_SM_101a__(); +template +_CCCL_DEVICE static inline void tcgen05_mma_block_scale_vec_4x_tmem_a_collector_a_use( + kind_mxf4nvf4_t, + cta_group_t<_Cta_Group> __cta_group, + _CUDA_VSTD::uint32_t __d_tmem, + _CUDA_VSTD::uint64_t __a_desc, + _CUDA_VSTD::uint64_t __b_desc, + _CUDA_VSTD::uint32_t __idesc, + _CUDA_VSTD::uint32_t __scale_A_tmem, + _CUDA_VSTD::uint32_t __scale_B_tmem, + bool __enable_input_d) +{ + // __kind == kind_mxf4nvf4 (due to parameter type constraint) + static_assert(__cta_group == cta_group_1 || __cta_group == cta_group_2, ""); +# if _CCCL_CUDA_COMPILER(NVHPC) || __CUDA_ARCH_FEAT_SM100_ALL || __CUDA_ARCH_FEAT_SM101_ALL + _CCCL_IF_CONSTEXPR (__cta_group == cta_group_1) + { + asm volatile( + "{\n\t .reg .pred PRED_enable_input_d; \n\t" + "setp.ne.b32 PRED_enable_input_d, %6, 0;\n\t" + "tcgen05.mma.cta_group::1.kind::mxf4nvf4.block_scale.scale_vec::4X.collector::a::use [%0], %1, %2, %3, [%4], " + "[%5], PRED_enable_input_d;\n\t" + "}" + : + : "r"(__d_tmem), + "l"(__a_desc), + "l"(__b_desc), + "r"(__idesc), + "r"(__scale_A_tmem), + "r"(__scale_B_tmem), + "r"(static_cast<_CUDA_VSTD::uint32_t>(__enable_input_d)) + : "memory"); + } + else _CCCL_IF_CONSTEXPR (__cta_group == cta_group_2) + { + asm volatile( + "{\n\t .reg .pred PRED_enable_input_d; \n\t" + "setp.ne.b32 PRED_enable_input_d, %6, 0;\n\t" + "tcgen05.mma.cta_group::2.kind::mxf4nvf4.block_scale.scale_vec::4X.collector::a::use [%0], %1, %2, %3, [%4], " + "[%5], PRED_enable_input_d;\n\t" + "}" + : + : "r"(__d_tmem), + "l"(__a_desc), + "l"(__b_desc), + "r"(__idesc), + "r"(__scale_A_tmem), + "r"(__scale_B_tmem), + "r"(static_cast<_CUDA_VSTD::uint32_t>(__enable_input_d)) + : "memory"); + } +# else + // Unsupported architectures will have a linker error with a semi-decent error message + __cuda_ptx_tcgen05_mma_block_scale_vec_4x_tmem_a_collector_a_use_is_not_supported_before_SM_100a_SM_101a__(); +# endif +} +#endif // __cccl_ptx_isa >= 860 + +/* +// tcgen05.mma.cta_group.kind.block_scale.scale_vec::1X.collector::a::lastuse [d_tmem], a_desc, b_desc, idesc, +[scale_A_tmem], [scale_B_tmem], enable_input_d; // PTX ISA 86, SM_100a, SM_101a +// .kind = { .kind::mxf8f6f4 } +// .cta_group = { .cta_group::1, 
.cta_group::2 } +template +__device__ static inline void tcgen05_mma_block_scale_vec_1x_collector_a_lastuse( + cuda::ptx::kind_mxf8f6f4_t, + cuda::ptx::cta_group_t cta_group, + uint32_t d_tmem, + uint64_t a_desc, + uint64_t b_desc, + uint32_t idesc, + uint32_t scale_A_tmem, + uint32_t scale_B_tmem, + bool enable_input_d); +*/ +#if __cccl_ptx_isa >= 860 +extern "C" _CCCL_DEVICE void +__cuda_ptx_tcgen05_mma_block_scale_vec_1x_collector_a_lastuse_is_not_supported_before_SM_100a_SM_101a__(); +template +_CCCL_DEVICE static inline void tcgen05_mma_block_scale_vec_1x_collector_a_lastuse( + kind_mxf8f6f4_t, + cta_group_t<_Cta_Group> __cta_group, + _CUDA_VSTD::uint32_t __d_tmem, + _CUDA_VSTD::uint64_t __a_desc, + _CUDA_VSTD::uint64_t __b_desc, + _CUDA_VSTD::uint32_t __idesc, + _CUDA_VSTD::uint32_t __scale_A_tmem, + _CUDA_VSTD::uint32_t __scale_B_tmem, + bool __enable_input_d) +{ + // __kind == kind_mxf8f6f4 (due to parameter type constraint) + static_assert(__cta_group == cta_group_1 || __cta_group == cta_group_2, ""); +# if _CCCL_CUDA_COMPILER(NVHPC) || __CUDA_ARCH_FEAT_SM100_ALL || __CUDA_ARCH_FEAT_SM101_ALL + _CCCL_IF_CONSTEXPR (__cta_group == cta_group_1) + { + asm volatile( + "{\n\t .reg .pred PRED_enable_input_d; \n\t" + "setp.ne.b32 PRED_enable_input_d, %6, 0;\n\t" + "tcgen05.mma.cta_group::1.kind::mxf8f6f4.block_scale.scale_vec::1X.collector::a::lastuse [%0], %1, %2, %3, [%4], " + "[%5], PRED_enable_input_d;\n\t" + "}" + : + : "r"(__d_tmem), + "l"(__a_desc), + "l"(__b_desc), + "r"(__idesc), + "r"(__scale_A_tmem), + "r"(__scale_B_tmem), + "r"(static_cast<_CUDA_VSTD::uint32_t>(__enable_input_d)) + : "memory"); + } + else _CCCL_IF_CONSTEXPR (__cta_group == cta_group_2) + { + asm volatile( + "{\n\t .reg .pred PRED_enable_input_d; \n\t" + "setp.ne.b32 PRED_enable_input_d, %6, 0;\n\t" + "tcgen05.mma.cta_group::2.kind::mxf8f6f4.block_scale.scale_vec::1X.collector::a::lastuse [%0], %1, %2, %3, [%4], " + "[%5], PRED_enable_input_d;\n\t" + "}" + : + : "r"(__d_tmem), + "l"(__a_desc), + "l"(__b_desc), + "r"(__idesc), + "r"(__scale_A_tmem), + "r"(__scale_B_tmem), + "r"(static_cast<_CUDA_VSTD::uint32_t>(__enable_input_d)) + : "memory"); + } +# else + // Unsupported architectures will have a linker error with a semi-decent error message + __cuda_ptx_tcgen05_mma_block_scale_vec_1x_collector_a_lastuse_is_not_supported_before_SM_100a_SM_101a__(); +# endif +} +#endif // __cccl_ptx_isa >= 860 + +/* +// tcgen05.mma.cta_group.kind.block_scale.scale_vec::2X.collector::a::lastuse [d_tmem], a_desc, b_desc, idesc, +[scale_A_tmem], [scale_B_tmem], enable_input_d; // PTX ISA 86, SM_100a, SM_101a +// .kind = { .kind::mxf4, .kind::mxf4nvf4 } +// .cta_group = { .cta_group::1, .cta_group::2 } +template +__device__ static inline void tcgen05_mma_block_scale_vec_2x_collector_a_lastuse( + cuda::ptx::kind_t kind, + cuda::ptx::cta_group_t cta_group, + uint32_t d_tmem, + uint64_t a_desc, + uint64_t b_desc, + uint32_t idesc, + uint32_t scale_A_tmem, + uint32_t scale_B_tmem, + bool enable_input_d); +*/ +#if __cccl_ptx_isa >= 860 +extern "C" _CCCL_DEVICE void +__cuda_ptx_tcgen05_mma_block_scale_vec_2x_collector_a_lastuse_is_not_supported_before_SM_100a_SM_101a__(); +template +_CCCL_DEVICE static inline void tcgen05_mma_block_scale_vec_2x_collector_a_lastuse( + kind_t<_Kind> __kind, + cta_group_t<_Cta_Group> __cta_group, + _CUDA_VSTD::uint32_t __d_tmem, + _CUDA_VSTD::uint64_t __a_desc, + _CUDA_VSTD::uint64_t __b_desc, + _CUDA_VSTD::uint32_t __idesc, + _CUDA_VSTD::uint32_t __scale_A_tmem, + _CUDA_VSTD::uint32_t __scale_B_tmem, + 
bool __enable_input_d) +{ + static_assert(__kind == kind_mxf4 || __kind == kind_mxf4nvf4, ""); + static_assert(__cta_group == cta_group_1 || __cta_group == cta_group_2, ""); +# if _CCCL_CUDA_COMPILER(NVHPC) || __CUDA_ARCH_FEAT_SM100_ALL || __CUDA_ARCH_FEAT_SM101_ALL + _CCCL_IF_CONSTEXPR (__kind == kind_mxf4 && __cta_group == cta_group_1) + { + asm volatile( + "{\n\t .reg .pred PRED_enable_input_d; \n\t" + "setp.ne.b32 PRED_enable_input_d, %6, 0;\n\t" + "tcgen05.mma.cta_group::1.kind::mxf4.block_scale.scale_vec::2X.collector::a::lastuse [%0], %1, %2, %3, [%4], " + "[%5], PRED_enable_input_d;\n\t" + "}" + : + : "r"(__d_tmem), + "l"(__a_desc), + "l"(__b_desc), + "r"(__idesc), + "r"(__scale_A_tmem), + "r"(__scale_B_tmem), + "r"(static_cast<_CUDA_VSTD::uint32_t>(__enable_input_d)) + : "memory"); + } + else _CCCL_IF_CONSTEXPR (__kind == kind_mxf4 && __cta_group == cta_group_2) + { + asm volatile( + "{\n\t .reg .pred PRED_enable_input_d; \n\t" + "setp.ne.b32 PRED_enable_input_d, %6, 0;\n\t" + "tcgen05.mma.cta_group::2.kind::mxf4.block_scale.scale_vec::2X.collector::a::lastuse [%0], %1, %2, %3, [%4], " + "[%5], PRED_enable_input_d;\n\t" + "}" + : + : "r"(__d_tmem), + "l"(__a_desc), + "l"(__b_desc), + "r"(__idesc), + "r"(__scale_A_tmem), + "r"(__scale_B_tmem), + "r"(static_cast<_CUDA_VSTD::uint32_t>(__enable_input_d)) + : "memory"); + } + else _CCCL_IF_CONSTEXPR (__kind == kind_mxf4nvf4 && __cta_group == cta_group_1) + { + asm volatile( + "{\n\t .reg .pred PRED_enable_input_d; \n\t" + "setp.ne.b32 PRED_enable_input_d, %6, 0;\n\t" + "tcgen05.mma.cta_group::1.kind::mxf4nvf4.block_scale.scale_vec::2X.collector::a::lastuse [%0], %1, %2, %3, [%4], " + "[%5], PRED_enable_input_d;\n\t" + "}" + : + : "r"(__d_tmem), + "l"(__a_desc), + "l"(__b_desc), + "r"(__idesc), + "r"(__scale_A_tmem), + "r"(__scale_B_tmem), + "r"(static_cast<_CUDA_VSTD::uint32_t>(__enable_input_d)) + : "memory"); + } + else _CCCL_IF_CONSTEXPR (__kind == kind_mxf4nvf4 && __cta_group == cta_group_2) + { + asm volatile( + "{\n\t .reg .pred PRED_enable_input_d; \n\t" + "setp.ne.b32 PRED_enable_input_d, %6, 0;\n\t" + "tcgen05.mma.cta_group::2.kind::mxf4nvf4.block_scale.scale_vec::2X.collector::a::lastuse [%0], %1, %2, %3, [%4], " + "[%5], PRED_enable_input_d;\n\t" + "}" + : + : "r"(__d_tmem), + "l"(__a_desc), + "l"(__b_desc), + "r"(__idesc), + "r"(__scale_A_tmem), + "r"(__scale_B_tmem), + "r"(static_cast<_CUDA_VSTD::uint32_t>(__enable_input_d)) + : "memory"); + } +# else + // Unsupported architectures will have a linker error with a semi-decent error message + __cuda_ptx_tcgen05_mma_block_scale_vec_2x_collector_a_lastuse_is_not_supported_before_SM_100a_SM_101a__(); +# endif +} +#endif // __cccl_ptx_isa >= 860 + +/* +// tcgen05.mma.cta_group.kind.block_scale.scale_vec::4X.collector::a::lastuse [d_tmem], a_desc, b_desc, idesc, +[scale_A_tmem], [scale_B_tmem], enable_input_d; // PTX ISA 86, SM_100a, SM_101a +// .kind = { .kind::mxf4nvf4 } +// .cta_group = { .cta_group::1, .cta_group::2 } +template +__device__ static inline void tcgen05_mma_block_scale_vec_4x_collector_a_lastuse( + cuda::ptx::kind_mxf4nvf4_t, + cuda::ptx::cta_group_t cta_group, + uint32_t d_tmem, + uint64_t a_desc, + uint64_t b_desc, + uint32_t idesc, + uint32_t scale_A_tmem, + uint32_t scale_B_tmem, + bool enable_input_d); +*/ +#if __cccl_ptx_isa >= 860 +extern "C" _CCCL_DEVICE void +__cuda_ptx_tcgen05_mma_block_scale_vec_4x_collector_a_lastuse_is_not_supported_before_SM_100a_SM_101a__(); +template +_CCCL_DEVICE static inline void 
tcgen05_mma_block_scale_vec_4x_collector_a_lastuse( + kind_mxf4nvf4_t, + cta_group_t<_Cta_Group> __cta_group, + _CUDA_VSTD::uint32_t __d_tmem, + _CUDA_VSTD::uint64_t __a_desc, + _CUDA_VSTD::uint64_t __b_desc, + _CUDA_VSTD::uint32_t __idesc, + _CUDA_VSTD::uint32_t __scale_A_tmem, + _CUDA_VSTD::uint32_t __scale_B_tmem, + bool __enable_input_d) +{ + // __kind == kind_mxf4nvf4 (due to parameter type constraint) + static_assert(__cta_group == cta_group_1 || __cta_group == cta_group_2, ""); +# if _CCCL_CUDA_COMPILER(NVHPC) || __CUDA_ARCH_FEAT_SM100_ALL || __CUDA_ARCH_FEAT_SM101_ALL + _CCCL_IF_CONSTEXPR (__cta_group == cta_group_1) + { + asm volatile( + "{\n\t .reg .pred PRED_enable_input_d; \n\t" + "setp.ne.b32 PRED_enable_input_d, %6, 0;\n\t" + "tcgen05.mma.cta_group::1.kind::mxf4nvf4.block_scale.scale_vec::4X.collector::a::lastuse [%0], %1, %2, %3, [%4], " + "[%5], PRED_enable_input_d;\n\t" + "}" + : + : "r"(__d_tmem), + "l"(__a_desc), + "l"(__b_desc), + "r"(__idesc), + "r"(__scale_A_tmem), + "r"(__scale_B_tmem), + "r"(static_cast<_CUDA_VSTD::uint32_t>(__enable_input_d)) + : "memory"); + } + else _CCCL_IF_CONSTEXPR (__cta_group == cta_group_2) + { + asm volatile( + "{\n\t .reg .pred PRED_enable_input_d; \n\t" + "setp.ne.b32 PRED_enable_input_d, %6, 0;\n\t" + "tcgen05.mma.cta_group::2.kind::mxf4nvf4.block_scale.scale_vec::4X.collector::a::lastuse [%0], %1, %2, %3, [%4], " + "[%5], PRED_enable_input_d;\n\t" + "}" + : + : "r"(__d_tmem), + "l"(__a_desc), + "l"(__b_desc), + "r"(__idesc), + "r"(__scale_A_tmem), + "r"(__scale_B_tmem), + "r"(static_cast<_CUDA_VSTD::uint32_t>(__enable_input_d)) + : "memory"); + } +# else + // Unsupported architectures will have a linker error with a semi-decent error message + __cuda_ptx_tcgen05_mma_block_scale_vec_4x_collector_a_lastuse_is_not_supported_before_SM_100a_SM_101a__(); +# endif +} +#endif // __cccl_ptx_isa >= 860 + +/* +// tcgen05.mma.cta_group.kind.block_scale.scale_vec::1X.collector::a::lastuse [d_tmem], a_desc, b_desc, idesc, +[scale_A_tmem], [scale_B_tmem], enable_input_d; // PTX ISA 86, SM_100a, SM_101a +// .kind = { .kind::mxf8f6f4 } +// .cta_group = { .cta_group::1, .cta_group::2 } +template +__device__ static inline void tcgen05_mma_block_scale_vec_1x_tmem_a_collector_a_lastuse( + cuda::ptx::kind_mxf8f6f4_t, + cuda::ptx::cta_group_t cta_group, + uint32_t d_tmem, + uint64_t a_desc, + uint64_t b_desc, + uint32_t idesc, + uint32_t scale_A_tmem, + uint32_t scale_B_tmem, + bool enable_input_d); +*/ +#if __cccl_ptx_isa >= 860 +extern "C" _CCCL_DEVICE void +__cuda_ptx_tcgen05_mma_block_scale_vec_1x_tmem_a_collector_a_lastuse_is_not_supported_before_SM_100a_SM_101a__(); +template +_CCCL_DEVICE static inline void tcgen05_mma_block_scale_vec_1x_tmem_a_collector_a_lastuse( + kind_mxf8f6f4_t, + cta_group_t<_Cta_Group> __cta_group, + _CUDA_VSTD::uint32_t __d_tmem, + _CUDA_VSTD::uint64_t __a_desc, + _CUDA_VSTD::uint64_t __b_desc, + _CUDA_VSTD::uint32_t __idesc, + _CUDA_VSTD::uint32_t __scale_A_tmem, + _CUDA_VSTD::uint32_t __scale_B_tmem, + bool __enable_input_d) +{ + // __kind == kind_mxf8f6f4 (due to parameter type constraint) + static_assert(__cta_group == cta_group_1 || __cta_group == cta_group_2, ""); +# if _CCCL_CUDA_COMPILER(NVHPC) || __CUDA_ARCH_FEAT_SM100_ALL || __CUDA_ARCH_FEAT_SM101_ALL + _CCCL_IF_CONSTEXPR (__cta_group == cta_group_1) + { + asm volatile( + "{\n\t .reg .pred PRED_enable_input_d; \n\t" + "setp.ne.b32 PRED_enable_input_d, %6, 0;\n\t" + "tcgen05.mma.cta_group::1.kind::mxf8f6f4.block_scale.scale_vec::1X.collector::a::lastuse [%0], %1, %2, 
%3, [%4], " + "[%5], PRED_enable_input_d;\n\t" + "}" + : + : "r"(__d_tmem), + "l"(__a_desc), + "l"(__b_desc), + "r"(__idesc), + "r"(__scale_A_tmem), + "r"(__scale_B_tmem), + "r"(static_cast<_CUDA_VSTD::uint32_t>(__enable_input_d)) + : "memory"); + } + else _CCCL_IF_CONSTEXPR (__cta_group == cta_group_2) + { + asm volatile( + "{\n\t .reg .pred PRED_enable_input_d; \n\t" + "setp.ne.b32 PRED_enable_input_d, %6, 0;\n\t" + "tcgen05.mma.cta_group::2.kind::mxf8f6f4.block_scale.scale_vec::1X.collector::a::lastuse [%0], %1, %2, %3, [%4], " + "[%5], PRED_enable_input_d;\n\t" + "}" + : + : "r"(__d_tmem), + "l"(__a_desc), + "l"(__b_desc), + "r"(__idesc), + "r"(__scale_A_tmem), + "r"(__scale_B_tmem), + "r"(static_cast<_CUDA_VSTD::uint32_t>(__enable_input_d)) + : "memory"); + } +# else + // Unsupported architectures will have a linker error with a semi-decent error message + __cuda_ptx_tcgen05_mma_block_scale_vec_1x_tmem_a_collector_a_lastuse_is_not_supported_before_SM_100a_SM_101a__(); +# endif +} +#endif // __cccl_ptx_isa >= 860 + +/* +// tcgen05.mma.cta_group.kind.block_scale.scale_vec::2X.collector::a::lastuse [d_tmem], a_desc, b_desc, idesc, +[scale_A_tmem], [scale_B_tmem], enable_input_d; // PTX ISA 86, SM_100a, SM_101a +// .kind = { .kind::mxf4, .kind::mxf4nvf4 } +// .cta_group = { .cta_group::1, .cta_group::2 } +template +__device__ static inline void tcgen05_mma_block_scale_vec_2_tmem_a_collector_a_lastuse( + cuda::ptx::kind_t kind, + cuda::ptx::cta_group_t cta_group, + uint32_t d_tmem, + uint64_t a_desc, + uint64_t b_desc, + uint32_t idesc, + uint32_t scale_A_tmem, + uint32_t scale_B_tmem, + bool enable_input_d); +*/ +#if __cccl_ptx_isa >= 860 +extern "C" _CCCL_DEVICE void +__cuda_ptx_tcgen05_mma_block_scale_vec_2_tmem_a_collector_a_lastuse_is_not_supported_before_SM_100a_SM_101a__(); +template +_CCCL_DEVICE static inline void tcgen05_mma_block_scale_vec_2_tmem_a_collector_a_lastuse( + kind_t<_Kind> __kind, + cta_group_t<_Cta_Group> __cta_group, + _CUDA_VSTD::uint32_t __d_tmem, + _CUDA_VSTD::uint64_t __a_desc, + _CUDA_VSTD::uint64_t __b_desc, + _CUDA_VSTD::uint32_t __idesc, + _CUDA_VSTD::uint32_t __scale_A_tmem, + _CUDA_VSTD::uint32_t __scale_B_tmem, + bool __enable_input_d) +{ + static_assert(__kind == kind_mxf4 || __kind == kind_mxf4nvf4, ""); + static_assert(__cta_group == cta_group_1 || __cta_group == cta_group_2, ""); +# if _CCCL_CUDA_COMPILER(NVHPC) || __CUDA_ARCH_FEAT_SM100_ALL || __CUDA_ARCH_FEAT_SM101_ALL + _CCCL_IF_CONSTEXPR (__kind == kind_mxf4 && __cta_group == cta_group_1) + { + asm volatile( + "{\n\t .reg .pred PRED_enable_input_d; \n\t" + "setp.ne.b32 PRED_enable_input_d, %6, 0;\n\t" + "tcgen05.mma.cta_group::1.kind::mxf4.block_scale.scale_vec::2X.collector::a::lastuse [%0], %1, %2, %3, [%4], " + "[%5], PRED_enable_input_d;\n\t" + "}" + : + : "r"(__d_tmem), + "l"(__a_desc), + "l"(__b_desc), + "r"(__idesc), + "r"(__scale_A_tmem), + "r"(__scale_B_tmem), + "r"(static_cast<_CUDA_VSTD::uint32_t>(__enable_input_d)) + : "memory"); + } + else _CCCL_IF_CONSTEXPR (__kind == kind_mxf4 && __cta_group == cta_group_2) + { + asm volatile( + "{\n\t .reg .pred PRED_enable_input_d; \n\t" + "setp.ne.b32 PRED_enable_input_d, %6, 0;\n\t" + "tcgen05.mma.cta_group::2.kind::mxf4.block_scale.scale_vec::2X.collector::a::lastuse [%0], %1, %2, %3, [%4], " + "[%5], PRED_enable_input_d;\n\t" + "}" + : + : "r"(__d_tmem), + "l"(__a_desc), + "l"(__b_desc), + "r"(__idesc), + "r"(__scale_A_tmem), + "r"(__scale_B_tmem), + "r"(static_cast<_CUDA_VSTD::uint32_t>(__enable_input_d)) + : "memory"); + } + else 
_CCCL_IF_CONSTEXPR (__kind == kind_mxf4nvf4 && __cta_group == cta_group_1) + { + asm volatile( + "{\n\t .reg .pred PRED_enable_input_d; \n\t" + "setp.ne.b32 PRED_enable_input_d, %6, 0;\n\t" + "tcgen05.mma.cta_group::1.kind::mxf4nvf4.block_scale.scale_vec::2X.collector::a::lastuse [%0], %1, %2, %3, [%4], " + "[%5], PRED_enable_input_d;\n\t" + "}" + : + : "r"(__d_tmem), + "l"(__a_desc), + "l"(__b_desc), + "r"(__idesc), + "r"(__scale_A_tmem), + "r"(__scale_B_tmem), + "r"(static_cast<_CUDA_VSTD::uint32_t>(__enable_input_d)) + : "memory"); + } + else _CCCL_IF_CONSTEXPR (__kind == kind_mxf4nvf4 && __cta_group == cta_group_2) + { + asm volatile( + "{\n\t .reg .pred PRED_enable_input_d; \n\t" + "setp.ne.b32 PRED_enable_input_d, %6, 0;\n\t" + "tcgen05.mma.cta_group::2.kind::mxf4nvf4.block_scale.scale_vec::2X.collector::a::lastuse [%0], %1, %2, %3, [%4], " + "[%5], PRED_enable_input_d;\n\t" + "}" + : + : "r"(__d_tmem), + "l"(__a_desc), + "l"(__b_desc), + "r"(__idesc), + "r"(__scale_A_tmem), + "r"(__scale_B_tmem), + "r"(static_cast<_CUDA_VSTD::uint32_t>(__enable_input_d)) + : "memory"); + } +# else + // Unsupported architectures will have a linker error with a semi-decent error message + __cuda_ptx_tcgen05_mma_block_scale_vec_2_tmem_a_collector_a_lastuse_is_not_supported_before_SM_100a_SM_101a__(); +# endif +} +#endif // __cccl_ptx_isa >= 860 + +/* +// tcgen05.mma.cta_group.kind.block_scale.scale_vec::4X.collector::a::lastuse [d_tmem], a_desc, b_desc, idesc, +[scale_A_tmem], [scale_B_tmem], enable_input_d; // PTX ISA 86, SM_100a, SM_101a +// .kind = { .kind::mxf4nvf4 } +// .cta_group = { .cta_group::1, .cta_group::2 } +template +__device__ static inline void tcgen05_mma_block_scale_vec_4x_tmem_a_collector_a_lastuse( + cuda::ptx::kind_mxf4nvf4_t, + cuda::ptx::cta_group_t cta_group, + uint32_t d_tmem, + uint64_t a_desc, + uint64_t b_desc, + uint32_t idesc, + uint32_t scale_A_tmem, + uint32_t scale_B_tmem, + bool enable_input_d); +*/ +#if __cccl_ptx_isa >= 860 +extern "C" _CCCL_DEVICE void +__cuda_ptx_tcgen05_mma_block_scale_vec_4x_tmem_a_collector_a_lastuse_is_not_supported_before_SM_100a_SM_101a__(); +template +_CCCL_DEVICE static inline void tcgen05_mma_block_scale_vec_4x_tmem_a_collector_a_lastuse( + kind_mxf4nvf4_t, + cta_group_t<_Cta_Group> __cta_group, + _CUDA_VSTD::uint32_t __d_tmem, + _CUDA_VSTD::uint64_t __a_desc, + _CUDA_VSTD::uint64_t __b_desc, + _CUDA_VSTD::uint32_t __idesc, + _CUDA_VSTD::uint32_t __scale_A_tmem, + _CUDA_VSTD::uint32_t __scale_B_tmem, + bool __enable_input_d) +{ + // __kind == kind_mxf4nvf4 (due to parameter type constraint) + static_assert(__cta_group == cta_group_1 || __cta_group == cta_group_2, ""); +# if _CCCL_CUDA_COMPILER(NVHPC) || __CUDA_ARCH_FEAT_SM100_ALL || __CUDA_ARCH_FEAT_SM101_ALL + _CCCL_IF_CONSTEXPR (__cta_group == cta_group_1) + { + asm volatile( + "{\n\t .reg .pred PRED_enable_input_d; \n\t" + "setp.ne.b32 PRED_enable_input_d, %6, 0;\n\t" + "tcgen05.mma.cta_group::1.kind::mxf4nvf4.block_scale.scale_vec::4X.collector::a::lastuse [%0], %1, %2, %3, [%4], " + "[%5], PRED_enable_input_d;\n\t" + "}" + : + : "r"(__d_tmem), + "l"(__a_desc), + "l"(__b_desc), + "r"(__idesc), + "r"(__scale_A_tmem), + "r"(__scale_B_tmem), + "r"(static_cast<_CUDA_VSTD::uint32_t>(__enable_input_d)) + : "memory"); + } + else _CCCL_IF_CONSTEXPR (__cta_group == cta_group_2) + { + asm volatile( + "{\n\t .reg .pred PRED_enable_input_d; \n\t" + "setp.ne.b32 PRED_enable_input_d, %6, 0;\n\t" + "tcgen05.mma.cta_group::2.kind::mxf4nvf4.block_scale.scale_vec::4X.collector::a::lastuse [%0], %1, %2, 
%3, [%4], " + "[%5], PRED_enable_input_d;\n\t" + "}" + : + : "r"(__d_tmem), + "l"(__a_desc), + "l"(__b_desc), + "r"(__idesc), + "r"(__scale_A_tmem), + "r"(__scale_B_tmem), + "r"(static_cast<_CUDA_VSTD::uint32_t>(__enable_input_d)) + : "memory"); + } +# else + // Unsupported architectures will have a linker error with a semi-decent error message + __cuda_ptx_tcgen05_mma_block_scale_vec_4x_tmem_a_collector_a_lastuse_is_not_supported_before_SM_100a_SM_101a__(); +# endif +} +#endif // __cccl_ptx_isa >= 860 + +/* +// tcgen05.mma.cta_group.kind.block_scale.scale_vec::1X.collector::a::discard [d_tmem], a_desc, b_desc, idesc, +[scale_A_tmem], [scale_B_tmem], enable_input_d; // PTX ISA 86, SM_100a, SM_101a +// .kind = { .kind::mxf8f6f4 } +// .cta_group = { .cta_group::1, .cta_group::2 } +template +__device__ static inline void tcgen05_mma_block_scale_vec_1x_collector_a_discard( + cuda::ptx::kind_mxf8f6f4_t, + cuda::ptx::cta_group_t cta_group, + uint32_t d_tmem, + uint64_t a_desc, + uint64_t b_desc, + uint32_t idesc, + uint32_t scale_A_tmem, + uint32_t scale_B_tmem, + bool enable_input_d); +*/ +#if __cccl_ptx_isa >= 860 +extern "C" _CCCL_DEVICE void +__cuda_ptx_tcgen05_mma_block_scale_vec_1x_collector_a_discard_is_not_supported_before_SM_100a_SM_101a__(); +template +_CCCL_DEVICE static inline void tcgen05_mma_block_scale_vec_1x_collector_a_discard( + kind_mxf8f6f4_t, + cta_group_t<_Cta_Group> __cta_group, + _CUDA_VSTD::uint32_t __d_tmem, + _CUDA_VSTD::uint64_t __a_desc, + _CUDA_VSTD::uint64_t __b_desc, + _CUDA_VSTD::uint32_t __idesc, + _CUDA_VSTD::uint32_t __scale_A_tmem, + _CUDA_VSTD::uint32_t __scale_B_tmem, + bool __enable_input_d) +{ + // __kind == kind_mxf8f6f4 (due to parameter type constraint) + static_assert(__cta_group == cta_group_1 || __cta_group == cta_group_2, ""); +# if _CCCL_CUDA_COMPILER(NVHPC) || __CUDA_ARCH_FEAT_SM100_ALL || __CUDA_ARCH_FEAT_SM101_ALL + _CCCL_IF_CONSTEXPR (__cta_group == cta_group_1) + { + asm volatile( + "{\n\t .reg .pred PRED_enable_input_d; \n\t" + "setp.ne.b32 PRED_enable_input_d, %6, 0;\n\t" + "tcgen05.mma.cta_group::1.kind::mxf8f6f4.block_scale.scale_vec::1X.collector::a::discard [%0], %1, %2, %3, [%4], " + "[%5], PRED_enable_input_d;\n\t" + "}" + : + : "r"(__d_tmem), + "l"(__a_desc), + "l"(__b_desc), + "r"(__idesc), + "r"(__scale_A_tmem), + "r"(__scale_B_tmem), + "r"(static_cast<_CUDA_VSTD::uint32_t>(__enable_input_d)) + : "memory"); + } + else _CCCL_IF_CONSTEXPR (__cta_group == cta_group_2) + { + asm volatile( + "{\n\t .reg .pred PRED_enable_input_d; \n\t" + "setp.ne.b32 PRED_enable_input_d, %6, 0;\n\t" + "tcgen05.mma.cta_group::2.kind::mxf8f6f4.block_scale.scale_vec::1X.collector::a::discard [%0], %1, %2, %3, [%4], " + "[%5], PRED_enable_input_d;\n\t" + "}" + : + : "r"(__d_tmem), + "l"(__a_desc), + "l"(__b_desc), + "r"(__idesc), + "r"(__scale_A_tmem), + "r"(__scale_B_tmem), + "r"(static_cast<_CUDA_VSTD::uint32_t>(__enable_input_d)) + : "memory"); + } +# else + // Unsupported architectures will have a linker error with a semi-decent error message + __cuda_ptx_tcgen05_mma_block_scale_vec_1x_collector_a_discard_is_not_supported_before_SM_100a_SM_101a__(); +# endif +} +#endif // __cccl_ptx_isa >= 860 + +/* +// tcgen05.mma.cta_group.kind.block_scale.scale_vec::2X.collector::a::discard [d_tmem], a_desc, b_desc, idesc, +[scale_A_tmem], [scale_B_tmem], enable_input_d; // PTX ISA 86, SM_100a, SM_101a +// .kind = { .kind::mxf4, .kind::mxf4nvf4 } +// .cta_group = { .cta_group::1, .cta_group::2 } +template +__device__ static inline void 
tcgen05_mma_block_scale_vec_2x_collector_a_discard( + cuda::ptx::kind_t kind, + cuda::ptx::cta_group_t cta_group, + uint32_t d_tmem, + uint64_t a_desc, + uint64_t b_desc, + uint32_t idesc, + uint32_t scale_A_tmem, + uint32_t scale_B_tmem, + bool enable_input_d); +*/ +#if __cccl_ptx_isa >= 860 +extern "C" _CCCL_DEVICE void +__cuda_ptx_tcgen05_mma_block_scale_vec_2x_collector_a_discard_is_not_supported_before_SM_100a_SM_101a__(); +template +_CCCL_DEVICE static inline void tcgen05_mma_block_scale_vec_2x_collector_a_discard( + kind_t<_Kind> __kind, + cta_group_t<_Cta_Group> __cta_group, + _CUDA_VSTD::uint32_t __d_tmem, + _CUDA_VSTD::uint64_t __a_desc, + _CUDA_VSTD::uint64_t __b_desc, + _CUDA_VSTD::uint32_t __idesc, + _CUDA_VSTD::uint32_t __scale_A_tmem, + _CUDA_VSTD::uint32_t __scale_B_tmem, + bool __enable_input_d) +{ + static_assert(__kind == kind_mxf4 || __kind == kind_mxf4nvf4, ""); + static_assert(__cta_group == cta_group_1 || __cta_group == cta_group_2, ""); +# if _CCCL_CUDA_COMPILER(NVHPC) || __CUDA_ARCH_FEAT_SM100_ALL || __CUDA_ARCH_FEAT_SM101_ALL + _CCCL_IF_CONSTEXPR (__kind == kind_mxf4 && __cta_group == cta_group_1) + { + asm volatile( + "{\n\t .reg .pred PRED_enable_input_d; \n\t" + "setp.ne.b32 PRED_enable_input_d, %6, 0;\n\t" + "tcgen05.mma.cta_group::1.kind::mxf4.block_scale.scale_vec::2X.collector::a::discard [%0], %1, %2, %3, [%4], " + "[%5], PRED_enable_input_d;\n\t" + "}" + : + : "r"(__d_tmem), + "l"(__a_desc), + "l"(__b_desc), + "r"(__idesc), + "r"(__scale_A_tmem), + "r"(__scale_B_tmem), + "r"(static_cast<_CUDA_VSTD::uint32_t>(__enable_input_d)) + : "memory"); + } + else _CCCL_IF_CONSTEXPR (__kind == kind_mxf4 && __cta_group == cta_group_2) + { + asm volatile( + "{\n\t .reg .pred PRED_enable_input_d; \n\t" + "setp.ne.b32 PRED_enable_input_d, %6, 0;\n\t" + "tcgen05.mma.cta_group::2.kind::mxf4.block_scale.scale_vec::2X.collector::a::discard [%0], %1, %2, %3, [%4], " + "[%5], PRED_enable_input_d;\n\t" + "}" + : + : "r"(__d_tmem), + "l"(__a_desc), + "l"(__b_desc), + "r"(__idesc), + "r"(__scale_A_tmem), + "r"(__scale_B_tmem), + "r"(static_cast<_CUDA_VSTD::uint32_t>(__enable_input_d)) + : "memory"); + } + else _CCCL_IF_CONSTEXPR (__kind == kind_mxf4nvf4 && __cta_group == cta_group_1) + { + asm volatile( + "{\n\t .reg .pred PRED_enable_input_d; \n\t" + "setp.ne.b32 PRED_enable_input_d, %6, 0;\n\t" + "tcgen05.mma.cta_group::1.kind::mxf4nvf4.block_scale.scale_vec::2X.collector::a::discard [%0], %1, %2, %3, [%4], " + "[%5], PRED_enable_input_d;\n\t" + "}" + : + : "r"(__d_tmem), + "l"(__a_desc), + "l"(__b_desc), + "r"(__idesc), + "r"(__scale_A_tmem), + "r"(__scale_B_tmem), + "r"(static_cast<_CUDA_VSTD::uint32_t>(__enable_input_d)) + : "memory"); + } + else _CCCL_IF_CONSTEXPR (__kind == kind_mxf4nvf4 && __cta_group == cta_group_2) + { + asm volatile( + "{\n\t .reg .pred PRED_enable_input_d; \n\t" + "setp.ne.b32 PRED_enable_input_d, %6, 0;\n\t" + "tcgen05.mma.cta_group::2.kind::mxf4nvf4.block_scale.scale_vec::2X.collector::a::discard [%0], %1, %2, %3, [%4], " + "[%5], PRED_enable_input_d;\n\t" + "}" + : + : "r"(__d_tmem), + "l"(__a_desc), + "l"(__b_desc), + "r"(__idesc), + "r"(__scale_A_tmem), + "r"(__scale_B_tmem), + "r"(static_cast<_CUDA_VSTD::uint32_t>(__enable_input_d)) + : "memory"); + } +# else + // Unsupported architectures will have a linker error with a semi-decent error message + __cuda_ptx_tcgen05_mma_block_scale_vec_2x_collector_a_discard_is_not_supported_before_SM_100a_SM_101a__(); +# endif +} +#endif // __cccl_ptx_isa >= 860 + +/* +// 
tcgen05.mma.cta_group.kind.block_scale.scale_vec::4X.collector::a::discard [d_tmem], a_desc, b_desc, idesc, +[scale_A_tmem], [scale_B_tmem], enable_input_d; // PTX ISA 86, SM_100a, SM_101a +// .kind = { .kind::mxf4nvf4 } +// .cta_group = { .cta_group::1, .cta_group::2 } +template +__device__ static inline void tcgen05_mma_block_scale_vec_4x_collector_a_discard( + cuda::ptx::kind_mxf4nvf4_t, + cuda::ptx::cta_group_t cta_group, + uint32_t d_tmem, + uint64_t a_desc, + uint64_t b_desc, + uint32_t idesc, + uint32_t scale_A_tmem, + uint32_t scale_B_tmem, + bool enable_input_d); +*/ +#if __cccl_ptx_isa >= 860 +extern "C" _CCCL_DEVICE void +__cuda_ptx_tcgen05_mma_block_scale_vec_4x_collector_a_discard_is_not_supported_before_SM_100a_SM_101a__(); +template +_CCCL_DEVICE static inline void tcgen05_mma_block_scale_vec_4x_collector_a_discard( + kind_mxf4nvf4_t, + cta_group_t<_Cta_Group> __cta_group, + _CUDA_VSTD::uint32_t __d_tmem, + _CUDA_VSTD::uint64_t __a_desc, + _CUDA_VSTD::uint64_t __b_desc, + _CUDA_VSTD::uint32_t __idesc, + _CUDA_VSTD::uint32_t __scale_A_tmem, + _CUDA_VSTD::uint32_t __scale_B_tmem, + bool __enable_input_d) +{ + // __kind == kind_mxf4nvf4 (due to parameter type constraint) + static_assert(__cta_group == cta_group_1 || __cta_group == cta_group_2, ""); +# if _CCCL_CUDA_COMPILER(NVHPC) || __CUDA_ARCH_FEAT_SM100_ALL || __CUDA_ARCH_FEAT_SM101_ALL + _CCCL_IF_CONSTEXPR (__cta_group == cta_group_1) + { + asm volatile( + "{\n\t .reg .pred PRED_enable_input_d; \n\t" + "setp.ne.b32 PRED_enable_input_d, %6, 0;\n\t" + "tcgen05.mma.cta_group::1.kind::mxf4nvf4.block_scale.scale_vec::4X.collector::a::discard [%0], %1, %2, %3, [%4], " + "[%5], PRED_enable_input_d;\n\t" + "}" + : + : "r"(__d_tmem), + "l"(__a_desc), + "l"(__b_desc), + "r"(__idesc), + "r"(__scale_A_tmem), + "r"(__scale_B_tmem), + "r"(static_cast<_CUDA_VSTD::uint32_t>(__enable_input_d)) + : "memory"); + } + else _CCCL_IF_CONSTEXPR (__cta_group == cta_group_2) + { + asm volatile( + "{\n\t .reg .pred PRED_enable_input_d; \n\t" + "setp.ne.b32 PRED_enable_input_d, %6, 0;\n\t" + "tcgen05.mma.cta_group::2.kind::mxf4nvf4.block_scale.scale_vec::4X.collector::a::discard [%0], %1, %2, %3, [%4], " + "[%5], PRED_enable_input_d;\n\t" + "}" + : + : "r"(__d_tmem), + "l"(__a_desc), + "l"(__b_desc), + "r"(__idesc), + "r"(__scale_A_tmem), + "r"(__scale_B_tmem), + "r"(static_cast<_CUDA_VSTD::uint32_t>(__enable_input_d)) + : "memory"); + } +# else + // Unsupported architectures will have a linker error with a semi-decent error message + __cuda_ptx_tcgen05_mma_block_scale_vec_4x_collector_a_discard_is_not_supported_before_SM_100a_SM_101a__(); +# endif +} +#endif // __cccl_ptx_isa >= 860 + +/* +// tcgen05.mma.cta_group.kind.block_scale.scale_vec::1X.collector::a::discard [d_tmem], a_desc, b_desc, idesc, +[scale_A_tmem], [scale_B_tmem], enable_input_d; // PTX ISA 86, SM_100a, SM_101a +// .kind = { .kind::mxf8f6f4 } +// .cta_group = { .cta_group::1, .cta_group::2 } +template +__device__ static inline void tcgen05_mma_block_scale_vec_1x_tmem_a_collector_a_discard( + cuda::ptx::kind_mxf8f6f4_t, + cuda::ptx::cta_group_t cta_group, + uint32_t d_tmem, + uint64_t a_desc, + uint64_t b_desc, + uint32_t idesc, + uint32_t scale_A_tmem, + uint32_t scale_B_tmem, + bool enable_input_d); +*/ +#if __cccl_ptx_isa >= 860 +extern "C" _CCCL_DEVICE void +__cuda_ptx_tcgen05_mma_block_scale_vec_1x_tmem_a_collector_a_discard_is_not_supported_before_SM_100a_SM_101a__(); +template +_CCCL_DEVICE static inline void tcgen05_mma_block_scale_vec_1x_tmem_a_collector_a_discard( + 
kind_mxf8f6f4_t, + cta_group_t<_Cta_Group> __cta_group, + _CUDA_VSTD::uint32_t __d_tmem, + _CUDA_VSTD::uint64_t __a_desc, + _CUDA_VSTD::uint64_t __b_desc, + _CUDA_VSTD::uint32_t __idesc, + _CUDA_VSTD::uint32_t __scale_A_tmem, + _CUDA_VSTD::uint32_t __scale_B_tmem, + bool __enable_input_d) +{ + // __kind == kind_mxf8f6f4 (due to parameter type constraint) + static_assert(__cta_group == cta_group_1 || __cta_group == cta_group_2, ""); +# if _CCCL_CUDA_COMPILER(NVHPC) || __CUDA_ARCH_FEAT_SM100_ALL || __CUDA_ARCH_FEAT_SM101_ALL + _CCCL_IF_CONSTEXPR (__cta_group == cta_group_1) + { + asm volatile( + "{\n\t .reg .pred PRED_enable_input_d; \n\t" + "setp.ne.b32 PRED_enable_input_d, %6, 0;\n\t" + "tcgen05.mma.cta_group::1.kind::mxf8f6f4.block_scale.scale_vec::1X.collector::a::discard [%0], %1, %2, %3, [%4], " + "[%5], PRED_enable_input_d;\n\t" + "}" + : + : "r"(__d_tmem), + "l"(__a_desc), + "l"(__b_desc), + "r"(__idesc), + "r"(__scale_A_tmem), + "r"(__scale_B_tmem), + "r"(static_cast<_CUDA_VSTD::uint32_t>(__enable_input_d)) + : "memory"); + } + else _CCCL_IF_CONSTEXPR (__cta_group == cta_group_2) + { + asm volatile( + "{\n\t .reg .pred PRED_enable_input_d; \n\t" + "setp.ne.b32 PRED_enable_input_d, %6, 0;\n\t" + "tcgen05.mma.cta_group::2.kind::mxf8f6f4.block_scale.scale_vec::1X.collector::a::discard [%0], %1, %2, %3, [%4], " + "[%5], PRED_enable_input_d;\n\t" + "}" + : + : "r"(__d_tmem), + "l"(__a_desc), + "l"(__b_desc), + "r"(__idesc), + "r"(__scale_A_tmem), + "r"(__scale_B_tmem), + "r"(static_cast<_CUDA_VSTD::uint32_t>(__enable_input_d)) + : "memory"); + } +# else + // Unsupported architectures will have a linker error with a semi-decent error message + __cuda_ptx_tcgen05_mma_block_scale_vec_1x_tmem_a_collector_a_discard_is_not_supported_before_SM_100a_SM_101a__(); +# endif +} +#endif // __cccl_ptx_isa >= 860 + +/* +// tcgen05.mma.cta_group.kind.block_scale.scale_vec::2X.collector::a::discard [d_tmem], a_desc, b_desc, idesc, +[scale_A_tmem], [scale_B_tmem], enable_input_d; // PTX ISA 86, SM_100a, SM_101a +// .kind = { .kind::mxf4, .kind::mxf4nvf4 } +// .cta_group = { .cta_group::1, .cta_group::2 } +template +__device__ static inline void tcgen05_mma_block_scale_vec_2_tmem_a_collector_a_discard( + cuda::ptx::kind_t kind, + cuda::ptx::cta_group_t cta_group, + uint32_t d_tmem, + uint64_t a_desc, + uint64_t b_desc, + uint32_t idesc, + uint32_t scale_A_tmem, + uint32_t scale_B_tmem, + bool enable_input_d); +*/ +#if __cccl_ptx_isa >= 860 +extern "C" _CCCL_DEVICE void +__cuda_ptx_tcgen05_mma_block_scale_vec_2_tmem_a_collector_a_discard_is_not_supported_before_SM_100a_SM_101a__(); +template +_CCCL_DEVICE static inline void tcgen05_mma_block_scale_vec_2_tmem_a_collector_a_discard( + kind_t<_Kind> __kind, + cta_group_t<_Cta_Group> __cta_group, + _CUDA_VSTD::uint32_t __d_tmem, + _CUDA_VSTD::uint64_t __a_desc, + _CUDA_VSTD::uint64_t __b_desc, + _CUDA_VSTD::uint32_t __idesc, + _CUDA_VSTD::uint32_t __scale_A_tmem, + _CUDA_VSTD::uint32_t __scale_B_tmem, + bool __enable_input_d) +{ + static_assert(__kind == kind_mxf4 || __kind == kind_mxf4nvf4, ""); + static_assert(__cta_group == cta_group_1 || __cta_group == cta_group_2, ""); +# if _CCCL_CUDA_COMPILER(NVHPC) || __CUDA_ARCH_FEAT_SM100_ALL || __CUDA_ARCH_FEAT_SM101_ALL + _CCCL_IF_CONSTEXPR (__kind == kind_mxf4 && __cta_group == cta_group_1) + { + asm volatile( + "{\n\t .reg .pred PRED_enable_input_d; \n\t" + "setp.ne.b32 PRED_enable_input_d, %6, 0;\n\t" + "tcgen05.mma.cta_group::1.kind::mxf4.block_scale.scale_vec::2X.collector::a::discard [%0], %1, %2, %3, [%4], " 
+ "[%5], PRED_enable_input_d;\n\t" + "}" + : + : "r"(__d_tmem), + "l"(__a_desc), + "l"(__b_desc), + "r"(__idesc), + "r"(__scale_A_tmem), + "r"(__scale_B_tmem), + "r"(static_cast<_CUDA_VSTD::uint32_t>(__enable_input_d)) + : "memory"); + } + else _CCCL_IF_CONSTEXPR (__kind == kind_mxf4 && __cta_group == cta_group_2) + { + asm volatile( + "{\n\t .reg .pred PRED_enable_input_d; \n\t" + "setp.ne.b32 PRED_enable_input_d, %6, 0;\n\t" + "tcgen05.mma.cta_group::2.kind::mxf4.block_scale.scale_vec::2X.collector::a::discard [%0], %1, %2, %3, [%4], " + "[%5], PRED_enable_input_d;\n\t" + "}" + : + : "r"(__d_tmem), + "l"(__a_desc), + "l"(__b_desc), + "r"(__idesc), + "r"(__scale_A_tmem), + "r"(__scale_B_tmem), + "r"(static_cast<_CUDA_VSTD::uint32_t>(__enable_input_d)) + : "memory"); + } + else _CCCL_IF_CONSTEXPR (__kind == kind_mxf4nvf4 && __cta_group == cta_group_1) + { + asm volatile( + "{\n\t .reg .pred PRED_enable_input_d; \n\t" + "setp.ne.b32 PRED_enable_input_d, %6, 0;\n\t" + "tcgen05.mma.cta_group::1.kind::mxf4nvf4.block_scale.scale_vec::2X.collector::a::discard [%0], %1, %2, %3, [%4], " + "[%5], PRED_enable_input_d;\n\t" + "}" + : + : "r"(__d_tmem), + "l"(__a_desc), + "l"(__b_desc), + "r"(__idesc), + "r"(__scale_A_tmem), + "r"(__scale_B_tmem), + "r"(static_cast<_CUDA_VSTD::uint32_t>(__enable_input_d)) + : "memory"); + } + else _CCCL_IF_CONSTEXPR (__kind == kind_mxf4nvf4 && __cta_group == cta_group_2) + { + asm volatile( + "{\n\t .reg .pred PRED_enable_input_d; \n\t" + "setp.ne.b32 PRED_enable_input_d, %6, 0;\n\t" + "tcgen05.mma.cta_group::2.kind::mxf4nvf4.block_scale.scale_vec::2X.collector::a::discard [%0], %1, %2, %3, [%4], " + "[%5], PRED_enable_input_d;\n\t" + "}" + : + : "r"(__d_tmem), + "l"(__a_desc), + "l"(__b_desc), + "r"(__idesc), + "r"(__scale_A_tmem), + "r"(__scale_B_tmem), + "r"(static_cast<_CUDA_VSTD::uint32_t>(__enable_input_d)) + : "memory"); + } +# else + // Unsupported architectures will have a linker error with a semi-decent error message + __cuda_ptx_tcgen05_mma_block_scale_vec_2_tmem_a_collector_a_discard_is_not_supported_before_SM_100a_SM_101a__(); +# endif +} +#endif // __cccl_ptx_isa >= 860 + +/* +// tcgen05.mma.cta_group.kind.block_scale.scale_vec::4X.collector::a::discard [d_tmem], a_desc, b_desc, idesc, +[scale_A_tmem], [scale_B_tmem], enable_input_d; // PTX ISA 86, SM_100a, SM_101a +// .kind = { .kind::mxf4nvf4 } +// .cta_group = { .cta_group::1, .cta_group::2 } +template +__device__ static inline void tcgen05_mma_block_scale_vec_4x_tmem_a_collector_a_discard( + cuda::ptx::kind_mxf4nvf4_t, + cuda::ptx::cta_group_t cta_group, + uint32_t d_tmem, + uint64_t a_desc, + uint64_t b_desc, + uint32_t idesc, + uint32_t scale_A_tmem, + uint32_t scale_B_tmem, + bool enable_input_d); +*/ +#if __cccl_ptx_isa >= 860 +extern "C" _CCCL_DEVICE void +__cuda_ptx_tcgen05_mma_block_scale_vec_4x_tmem_a_collector_a_discard_is_not_supported_before_SM_100a_SM_101a__(); +template +_CCCL_DEVICE static inline void tcgen05_mma_block_scale_vec_4x_tmem_a_collector_a_discard( + kind_mxf4nvf4_t, + cta_group_t<_Cta_Group> __cta_group, + _CUDA_VSTD::uint32_t __d_tmem, + _CUDA_VSTD::uint64_t __a_desc, + _CUDA_VSTD::uint64_t __b_desc, + _CUDA_VSTD::uint32_t __idesc, + _CUDA_VSTD::uint32_t __scale_A_tmem, + _CUDA_VSTD::uint32_t __scale_B_tmem, + bool __enable_input_d) +{ + // __kind == kind_mxf4nvf4 (due to parameter type constraint) + static_assert(__cta_group == cta_group_1 || __cta_group == cta_group_2, ""); +# if _CCCL_CUDA_COMPILER(NVHPC) || __CUDA_ARCH_FEAT_SM100_ALL || __CUDA_ARCH_FEAT_SM101_ALL + 
+  _CCCL_IF_CONSTEXPR (__cta_group == cta_group_1)
+  {
+    asm volatile(
+      "{\n\t .reg .pred PRED_enable_input_d; \n\t"
+      "setp.ne.b32 PRED_enable_input_d, %6, 0;\n\t"
+      "tcgen05.mma.cta_group::1.kind::mxf4nvf4.block_scale.scale_vec::4X.collector::a::discard [%0], %1, %2, %3, [%4], "
+      "[%5], PRED_enable_input_d;\n\t"
+      "}"
+      :
+      : "r"(__d_tmem),
+        "l"(__a_desc),
+        "l"(__b_desc),
+        "r"(__idesc),
+        "r"(__scale_A_tmem),
+        "r"(__scale_B_tmem),
+        "r"(static_cast<_CUDA_VSTD::uint32_t>(__enable_input_d))
+      : "memory");
+  }
+  else _CCCL_IF_CONSTEXPR (__cta_group == cta_group_2)
+  {
+    asm volatile(
+      "{\n\t .reg .pred PRED_enable_input_d; \n\t"
+      "setp.ne.b32 PRED_enable_input_d, %6, 0;\n\t"
+      "tcgen05.mma.cta_group::2.kind::mxf4nvf4.block_scale.scale_vec::4X.collector::a::discard [%0], %1, %2, %3, [%4], "
+      "[%5], PRED_enable_input_d;\n\t"
+      "}"
+      :
+      : "r"(__d_tmem),
+        "l"(__a_desc),
+        "l"(__b_desc),
+        "r"(__idesc),
+        "r"(__scale_A_tmem),
+        "r"(__scale_B_tmem),
+        "r"(static_cast<_CUDA_VSTD::uint32_t>(__enable_input_d))
+      : "memory");
+  }
+# else
+  // Unsupported architectures will have a linker error with a semi-decent error message
+  __cuda_ptx_tcgen05_mma_block_scale_vec_4x_tmem_a_collector_a_discard_is_not_supported_before_SM_100a_SM_101a__();
+# endif
+}
+#endif // __cccl_ptx_isa >= 860
+
+#endif // _CUDA_PTX_GENERATED_TCGEN05_MMA_H_
diff --git a/libcudacxx/include/cuda/__ptx/instructions/generated/tcgen05_mma_ws.h b/libcudacxx/include/cuda/__ptx/instructions/generated/tcgen05_mma_ws.h
new file mode 100644
index 00000000000..8d09698052d
--- /dev/null
+++ b/libcudacxx/include/cuda/__ptx/instructions/generated/tcgen05_mma_ws.h
@@ -0,0 +1,6438 @@
+// This file was automatically generated. Do not edit.
+
+#ifndef _CUDA_PTX_GENERATED_TCGEN05_MMA_WS_H_
+#define _CUDA_PTX_GENERATED_TCGEN05_MMA_WS_H_
+
+/*
+// tcgen05.mma.ws.cta_group.kind.collector::b0::fill [d_tmem], a_desc, b_desc, idesc, enable_input_d,
+zero_column_mask_desc; // PTX ISA 86, SM_100a, SM_101a
+// .cta_group = { .cta_group::1 }
+// .kind = { .kind::f16, .kind::tf32, .kind::f8f6f4, .kind::i8 }
+template
+__device__ static inline void tcgen05_mma_ws_collector_b0_fill(
+  cuda::ptx::cta_group_1_t,
+  cuda::ptx::kind_t kind,
+  uint32_t d_tmem,
+  uint64_t a_desc,
+  uint64_t b_desc,
+  uint32_t idesc,
+  bool enable_input_d,
+  uint64_t zero_column_mask_desc);
+*/
+#if __cccl_ptx_isa >= 860
+extern "C" _CCCL_DEVICE void __cuda_ptx_tcgen05_mma_ws_collector_b0_fill_is_not_supported_before_SM_100a_SM_101a__();
+template
+_CCCL_DEVICE static inline void tcgen05_mma_ws_collector_b0_fill(
+  cta_group_1_t,
+  kind_t<_Kind> __kind,
+  _CUDA_VSTD::uint32_t __d_tmem,
+  _CUDA_VSTD::uint64_t __a_desc,
+  _CUDA_VSTD::uint64_t __b_desc,
+  _CUDA_VSTD::uint32_t __idesc,
+  bool __enable_input_d,
+  _CUDA_VSTD::uint64_t __zero_column_mask_desc)
+{
+  // __cta_group == cta_group_1 (due to parameter type constraint)
+  static_assert(__kind == kind_f16 || __kind == kind_tf32 || __kind == kind_f8f6f4 || __kind == kind_i8, "");
+# if _CCCL_CUDA_COMPILER(NVHPC) || __CUDA_ARCH_FEAT_SM100_ALL || __CUDA_ARCH_FEAT_SM101_ALL
+  _CCCL_IF_CONSTEXPR (__kind == kind_f16)
+  {
+    asm volatile(
+      "{\n\t .reg .pred PRED_enable_input_d; \n\t"
+      "setp.ne.b32 PRED_enable_input_d, %4, 0;\n\t"
+      "tcgen05.mma.ws.cta_group::1.kind::f16.collector::b0::fill [%0], %1, %2, %3, PRED_enable_input_d, %5;\n\t"
+      "}"
+      :
+      : "r"(__d_tmem),
+        "l"(__a_desc),
+        "l"(__b_desc),
+        "r"(__idesc),
+        "r"(static_cast<_CUDA_VSTD::uint32_t>(__enable_input_d)),
+        "l"(__zero_column_mask_desc)
+      : "memory");
+  }
+  else
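+  // The remaining _CCCL_IF_CONSTEXPR branches below select the matching .kind:: qualifier
+  // (tf32, f8f6f4, i8) at compile time from the kind_t<_Kind> tag, so each instantiation
+  // emits exactly one tcgen05.mma.ws string; enable_input_d is lowered to the PTX predicate
+  // PRED_enable_input_d via setp.ne.b32 before the MMA instruction, and %5 carries
+  // zero_column_mask_desc in this overload.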
_CCCL_IF_CONSTEXPR (__kind == kind_tf32) + { + asm volatile( + "{\n\t .reg .pred PRED_enable_input_d; \n\t" + "setp.ne.b32 PRED_enable_input_d, %4, 0;\n\t" + "tcgen05.mma.ws.cta_group::1.kind::tf32.collector::b0::fill [%0], %1, %2, %3, PRED_enable_input_d, %5;\n\t" + "}" + : + : "r"(__d_tmem), + "l"(__a_desc), + "l"(__b_desc), + "r"(__idesc), + "r"(static_cast<_CUDA_VSTD::uint32_t>(__enable_input_d)), + "l"(__zero_column_mask_desc) + : "memory"); + } + else _CCCL_IF_CONSTEXPR (__kind == kind_f8f6f4) + { + asm volatile( + "{\n\t .reg .pred PRED_enable_input_d; \n\t" + "setp.ne.b32 PRED_enable_input_d, %4, 0;\n\t" + "tcgen05.mma.ws.cta_group::1.kind::f8f6f4.collector::b0::fill [%0], %1, %2, %3, PRED_enable_input_d, %5;\n\t" + "}" + : + : "r"(__d_tmem), + "l"(__a_desc), + "l"(__b_desc), + "r"(__idesc), + "r"(static_cast<_CUDA_VSTD::uint32_t>(__enable_input_d)), + "l"(__zero_column_mask_desc) + : "memory"); + } + else _CCCL_IF_CONSTEXPR (__kind == kind_i8) + { + asm volatile( + "{\n\t .reg .pred PRED_enable_input_d; \n\t" + "setp.ne.b32 PRED_enable_input_d, %4, 0;\n\t" + "tcgen05.mma.ws.cta_group::1.kind::i8.collector::b0::fill [%0], %1, %2, %3, PRED_enable_input_d, %5;\n\t" + "}" + : + : "r"(__d_tmem), + "l"(__a_desc), + "l"(__b_desc), + "r"(__idesc), + "r"(static_cast<_CUDA_VSTD::uint32_t>(__enable_input_d)), + "l"(__zero_column_mask_desc) + : "memory"); + } +# else + // Unsupported architectures will have a linker error with a semi-decent error message + __cuda_ptx_tcgen05_mma_ws_collector_b0_fill_is_not_supported_before_SM_100a_SM_101a__(); +# endif +} +#endif // __cccl_ptx_isa >= 860 + +/* +// tcgen05.mma.ws.cta_group.kind.collector::b0::fill [d_tmem], a_desc, b_desc, idesc, enable_input_d; // PTX ISA 86, +SM_100a, SM_101a +// .cta_group = { .cta_group::1 } +// .kind = { .kind::f16, .kind::tf32, .kind::f8f6f4, .kind::i8 } +template +__device__ static inline void tcgen05_mma_ws_collector_b0_fill( + cuda::ptx::cta_group_1_t, + cuda::ptx::kind_t kind, + uint32_t d_tmem, + uint64_t a_desc, + uint64_t b_desc, + uint32_t idesc, + bool enable_input_d); +*/ +#if __cccl_ptx_isa >= 860 +extern "C" _CCCL_DEVICE void __cuda_ptx_tcgen05_mma_ws_collector_b0_fill_is_not_supported_before_SM_100a_SM_101a__(); +template +_CCCL_DEVICE static inline void tcgen05_mma_ws_collector_b0_fill( + cta_group_1_t, + kind_t<_Kind> __kind, + _CUDA_VSTD::uint32_t __d_tmem, + _CUDA_VSTD::uint64_t __a_desc, + _CUDA_VSTD::uint64_t __b_desc, + _CUDA_VSTD::uint32_t __idesc, + bool __enable_input_d) +{ + // __cta_group == cta_group_1 (due to parameter type constraint) + static_assert(__kind == kind_f16 || __kind == kind_tf32 || __kind == kind_f8f6f4 || __kind == kind_i8, ""); +# if _CCCL_CUDA_COMPILER(NVHPC) || __CUDA_ARCH_FEAT_SM100_ALL || __CUDA_ARCH_FEAT_SM101_ALL + _CCCL_IF_CONSTEXPR (__kind == kind_f16) + { + asm volatile( + "{\n\t .reg .pred PRED_enable_input_d; \n\t" + "setp.ne.b32 PRED_enable_input_d, %4, 0;\n\t" + "tcgen05.mma.ws.cta_group::1.kind::f16.collector::b0::fill [%0], %1, %2, %3, PRED_enable_input_d;\n\t" + "}" + : + : "r"(__d_tmem), + "l"(__a_desc), + "l"(__b_desc), + "r"(__idesc), + "r"(static_cast<_CUDA_VSTD::uint32_t>(__enable_input_d)) + : "memory"); + } + else _CCCL_IF_CONSTEXPR (__kind == kind_tf32) + { + asm volatile( + "{\n\t .reg .pred PRED_enable_input_d; \n\t" + "setp.ne.b32 PRED_enable_input_d, %4, 0;\n\t" + "tcgen05.mma.ws.cta_group::1.kind::tf32.collector::b0::fill [%0], %1, %2, %3, PRED_enable_input_d;\n\t" + "}" + : + : "r"(__d_tmem), + "l"(__a_desc), + "l"(__b_desc), + "r"(__idesc), + 
"r"(static_cast<_CUDA_VSTD::uint32_t>(__enable_input_d)) + : "memory"); + } + else _CCCL_IF_CONSTEXPR (__kind == kind_f8f6f4) + { + asm volatile( + "{\n\t .reg .pred PRED_enable_input_d; \n\t" + "setp.ne.b32 PRED_enable_input_d, %4, 0;\n\t" + "tcgen05.mma.ws.cta_group::1.kind::f8f6f4.collector::b0::fill [%0], %1, %2, %3, PRED_enable_input_d;\n\t" + "}" + : + : "r"(__d_tmem), + "l"(__a_desc), + "l"(__b_desc), + "r"(__idesc), + "r"(static_cast<_CUDA_VSTD::uint32_t>(__enable_input_d)) + : "memory"); + } + else _CCCL_IF_CONSTEXPR (__kind == kind_i8) + { + asm volatile( + "{\n\t .reg .pred PRED_enable_input_d; \n\t" + "setp.ne.b32 PRED_enable_input_d, %4, 0;\n\t" + "tcgen05.mma.ws.cta_group::1.kind::i8.collector::b0::fill [%0], %1, %2, %3, PRED_enable_input_d;\n\t" + "}" + : + : "r"(__d_tmem), + "l"(__a_desc), + "l"(__b_desc), + "r"(__idesc), + "r"(static_cast<_CUDA_VSTD::uint32_t>(__enable_input_d)) + : "memory"); + } +# else + // Unsupported architectures will have a linker error with a semi-decent error message + __cuda_ptx_tcgen05_mma_ws_collector_b0_fill_is_not_supported_before_SM_100a_SM_101a__(); +# endif +} +#endif // __cccl_ptx_isa >= 860 + +/* +// tcgen05.mma.ws.cta_group.kind.collector::b0::fill [d_tmem], [a_tmem], b_desc, idesc, enable_input_d, +zero_column_mask_desc; // PTX ISA 86, SM_100a, SM_101a +// .cta_group = { .cta_group::1 } +// .kind = { .kind::f16, .kind::tf32, .kind::f8f6f4, .kind::i8 } +template +__device__ static inline void tcgen05_mma_ws_tmem_a_collector_b0_fill( + cuda::ptx::cta_group_1_t, + cuda::ptx::kind_t kind, + uint32_t d_tmem, + uint32_t a_tmem, + uint64_t b_desc, + uint32_t idesc, + bool enable_input_d, + uint64_t zero_column_mask_desc); +*/ +#if __cccl_ptx_isa >= 860 +extern "C" _CCCL_DEVICE void +__cuda_ptx_tcgen05_mma_ws_tmem_a_collector_b0_fill_is_not_supported_before_SM_100a_SM_101a__(); +template +_CCCL_DEVICE static inline void tcgen05_mma_ws_tmem_a_collector_b0_fill( + cta_group_1_t, + kind_t<_Kind> __kind, + _CUDA_VSTD::uint32_t __d_tmem, + _CUDA_VSTD::uint32_t __a_tmem, + _CUDA_VSTD::uint64_t __b_desc, + _CUDA_VSTD::uint32_t __idesc, + bool __enable_input_d, + _CUDA_VSTD::uint64_t __zero_column_mask_desc) +{ + // __cta_group == cta_group_1 (due to parameter type constraint) + static_assert(__kind == kind_f16 || __kind == kind_tf32 || __kind == kind_f8f6f4 || __kind == kind_i8, ""); +# if _CCCL_CUDA_COMPILER(NVHPC) || __CUDA_ARCH_FEAT_SM100_ALL || __CUDA_ARCH_FEAT_SM101_ALL + _CCCL_IF_CONSTEXPR (__kind == kind_f16) + { + asm volatile( + "{\n\t .reg .pred PRED_enable_input_d; \n\t" + "setp.ne.b32 PRED_enable_input_d, %4, 0;\n\t" + "tcgen05.mma.ws.cta_group::1.kind::f16.collector::b0::fill [%0], [%1], %2, %3, PRED_enable_input_d, %5;\n\t" + "}" + : + : "r"(__d_tmem), + "r"(__a_tmem), + "l"(__b_desc), + "r"(__idesc), + "r"(static_cast<_CUDA_VSTD::uint32_t>(__enable_input_d)), + "l"(__zero_column_mask_desc) + : "memory"); + } + else _CCCL_IF_CONSTEXPR (__kind == kind_tf32) + { + asm volatile( + "{\n\t .reg .pred PRED_enable_input_d; \n\t" + "setp.ne.b32 PRED_enable_input_d, %4, 0;\n\t" + "tcgen05.mma.ws.cta_group::1.kind::tf32.collector::b0::fill [%0], [%1], %2, %3, PRED_enable_input_d, %5;\n\t" + "}" + : + : "r"(__d_tmem), + "r"(__a_tmem), + "l"(__b_desc), + "r"(__idesc), + "r"(static_cast<_CUDA_VSTD::uint32_t>(__enable_input_d)), + "l"(__zero_column_mask_desc) + : "memory"); + } + else _CCCL_IF_CONSTEXPR (__kind == kind_f8f6f4) + { + asm volatile( + "{\n\t .reg .pred PRED_enable_input_d; \n\t" + "setp.ne.b32 PRED_enable_input_d, %4, 0;\n\t" + 
"tcgen05.mma.ws.cta_group::1.kind::f8f6f4.collector::b0::fill [%0], [%1], %2, %3, PRED_enable_input_d, %5;\n\t" + "}" + : + : "r"(__d_tmem), + "r"(__a_tmem), + "l"(__b_desc), + "r"(__idesc), + "r"(static_cast<_CUDA_VSTD::uint32_t>(__enable_input_d)), + "l"(__zero_column_mask_desc) + : "memory"); + } + else _CCCL_IF_CONSTEXPR (__kind == kind_i8) + { + asm volatile( + "{\n\t .reg .pred PRED_enable_input_d; \n\t" + "setp.ne.b32 PRED_enable_input_d, %4, 0;\n\t" + "tcgen05.mma.ws.cta_group::1.kind::i8.collector::b0::fill [%0], [%1], %2, %3, PRED_enable_input_d, %5;\n\t" + "}" + : + : "r"(__d_tmem), + "r"(__a_tmem), + "l"(__b_desc), + "r"(__idesc), + "r"(static_cast<_CUDA_VSTD::uint32_t>(__enable_input_d)), + "l"(__zero_column_mask_desc) + : "memory"); + } +# else + // Unsupported architectures will have a linker error with a semi-decent error message + __cuda_ptx_tcgen05_mma_ws_tmem_a_collector_b0_fill_is_not_supported_before_SM_100a_SM_101a__(); +# endif +} +#endif // __cccl_ptx_isa >= 860 + +/* +// tcgen05.mma.ws.cta_group.kind.collector::b0::fill [d_tmem], [a_tmem], b_desc, idesc, enable_input_d; // PTX ISA 86, +SM_100a, SM_101a +// .cta_group = { .cta_group::1 } +// .kind = { .kind::f16, .kind::tf32, .kind::f8f6f4, .kind::i8 } +template +__device__ static inline void tcgen05_mma_ws_tmem_a_collector_b0_fill( + cuda::ptx::cta_group_1_t, + cuda::ptx::kind_t kind, + uint32_t d_tmem, + uint32_t a_tmem, + uint64_t b_desc, + uint32_t idesc, + bool enable_input_d); +*/ +#if __cccl_ptx_isa >= 860 +extern "C" _CCCL_DEVICE void +__cuda_ptx_tcgen05_mma_ws_tmem_a_collector_b0_fill_is_not_supported_before_SM_100a_SM_101a__(); +template +_CCCL_DEVICE static inline void tcgen05_mma_ws_tmem_a_collector_b0_fill( + cta_group_1_t, + kind_t<_Kind> __kind, + _CUDA_VSTD::uint32_t __d_tmem, + _CUDA_VSTD::uint32_t __a_tmem, + _CUDA_VSTD::uint64_t __b_desc, + _CUDA_VSTD::uint32_t __idesc, + bool __enable_input_d) +{ + // __cta_group == cta_group_1 (due to parameter type constraint) + static_assert(__kind == kind_f16 || __kind == kind_tf32 || __kind == kind_f8f6f4 || __kind == kind_i8, ""); +# if _CCCL_CUDA_COMPILER(NVHPC) || __CUDA_ARCH_FEAT_SM100_ALL || __CUDA_ARCH_FEAT_SM101_ALL + _CCCL_IF_CONSTEXPR (__kind == kind_f16) + { + asm volatile( + "{\n\t .reg .pred PRED_enable_input_d; \n\t" + "setp.ne.b32 PRED_enable_input_d, %4, 0;\n\t" + "tcgen05.mma.ws.cta_group::1.kind::f16.collector::b0::fill [%0], [%1], %2, %3, PRED_enable_input_d;\n\t" + "}" + : + : "r"(__d_tmem), + "r"(__a_tmem), + "l"(__b_desc), + "r"(__idesc), + "r"(static_cast<_CUDA_VSTD::uint32_t>(__enable_input_d)) + : "memory"); + } + else _CCCL_IF_CONSTEXPR (__kind == kind_tf32) + { + asm volatile( + "{\n\t .reg .pred PRED_enable_input_d; \n\t" + "setp.ne.b32 PRED_enable_input_d, %4, 0;\n\t" + "tcgen05.mma.ws.cta_group::1.kind::tf32.collector::b0::fill [%0], [%1], %2, %3, PRED_enable_input_d;\n\t" + "}" + : + : "r"(__d_tmem), + "r"(__a_tmem), + "l"(__b_desc), + "r"(__idesc), + "r"(static_cast<_CUDA_VSTD::uint32_t>(__enable_input_d)) + : "memory"); + } + else _CCCL_IF_CONSTEXPR (__kind == kind_f8f6f4) + { + asm volatile( + "{\n\t .reg .pred PRED_enable_input_d; \n\t" + "setp.ne.b32 PRED_enable_input_d, %4, 0;\n\t" + "tcgen05.mma.ws.cta_group::1.kind::f8f6f4.collector::b0::fill [%0], [%1], %2, %3, PRED_enable_input_d;\n\t" + "}" + : + : "r"(__d_tmem), + "r"(__a_tmem), + "l"(__b_desc), + "r"(__idesc), + "r"(static_cast<_CUDA_VSTD::uint32_t>(__enable_input_d)) + : "memory"); + } + else _CCCL_IF_CONSTEXPR (__kind == kind_i8) + { + asm volatile( + "{\n\t .reg 
.pred PRED_enable_input_d; \n\t" + "setp.ne.b32 PRED_enable_input_d, %4, 0;\n\t" + "tcgen05.mma.ws.cta_group::1.kind::i8.collector::b0::fill [%0], [%1], %2, %3, PRED_enable_input_d;\n\t" + "}" + : + : "r"(__d_tmem), + "r"(__a_tmem), + "l"(__b_desc), + "r"(__idesc), + "r"(static_cast<_CUDA_VSTD::uint32_t>(__enable_input_d)) + : "memory"); + } +# else + // Unsupported architectures will have a linker error with a semi-decent error message + __cuda_ptx_tcgen05_mma_ws_tmem_a_collector_b0_fill_is_not_supported_before_SM_100a_SM_101a__(); +# endif +} +#endif // __cccl_ptx_isa >= 860 + +/* +// tcgen05.mma.ws.cta_group.kind.collector::b0::use [d_tmem], a_desc, b_desc, idesc, enable_input_d, +zero_column_mask_desc; // PTX ISA 86, SM_100a, SM_101a +// .cta_group = { .cta_group::1 } +// .kind = { .kind::f16, .kind::tf32, .kind::f8f6f4, .kind::i8 } +template +__device__ static inline void tcgen05_mma_ws_collector_b0_use( + cuda::ptx::cta_group_1_t, + cuda::ptx::kind_t kind, + uint32_t d_tmem, + uint64_t a_desc, + uint64_t b_desc, + uint32_t idesc, + bool enable_input_d, + uint64_t zero_column_mask_desc); +*/ +#if __cccl_ptx_isa >= 860 +extern "C" _CCCL_DEVICE void __cuda_ptx_tcgen05_mma_ws_collector_b0_use_is_not_supported_before_SM_100a_SM_101a__(); +template +_CCCL_DEVICE static inline void tcgen05_mma_ws_collector_b0_use( + cta_group_1_t, + kind_t<_Kind> __kind, + _CUDA_VSTD::uint32_t __d_tmem, + _CUDA_VSTD::uint64_t __a_desc, + _CUDA_VSTD::uint64_t __b_desc, + _CUDA_VSTD::uint32_t __idesc, + bool __enable_input_d, + _CUDA_VSTD::uint64_t __zero_column_mask_desc) +{ + // __cta_group == cta_group_1 (due to parameter type constraint) + static_assert(__kind == kind_f16 || __kind == kind_tf32 || __kind == kind_f8f6f4 || __kind == kind_i8, ""); +# if _CCCL_CUDA_COMPILER(NVHPC) || __CUDA_ARCH_FEAT_SM100_ALL || __CUDA_ARCH_FEAT_SM101_ALL + _CCCL_IF_CONSTEXPR (__kind == kind_f16) + { + asm volatile( + "{\n\t .reg .pred PRED_enable_input_d; \n\t" + "setp.ne.b32 PRED_enable_input_d, %4, 0;\n\t" + "tcgen05.mma.ws.cta_group::1.kind::f16.collector::b0::use [%0], %1, %2, %3, PRED_enable_input_d, %5;\n\t" + "}" + : + : "r"(__d_tmem), + "l"(__a_desc), + "l"(__b_desc), + "r"(__idesc), + "r"(static_cast<_CUDA_VSTD::uint32_t>(__enable_input_d)), + "l"(__zero_column_mask_desc) + : "memory"); + } + else _CCCL_IF_CONSTEXPR (__kind == kind_tf32) + { + asm volatile( + "{\n\t .reg .pred PRED_enable_input_d; \n\t" + "setp.ne.b32 PRED_enable_input_d, %4, 0;\n\t" + "tcgen05.mma.ws.cta_group::1.kind::tf32.collector::b0::use [%0], %1, %2, %3, PRED_enable_input_d, %5;\n\t" + "}" + : + : "r"(__d_tmem), + "l"(__a_desc), + "l"(__b_desc), + "r"(__idesc), + "r"(static_cast<_CUDA_VSTD::uint32_t>(__enable_input_d)), + "l"(__zero_column_mask_desc) + : "memory"); + } + else _CCCL_IF_CONSTEXPR (__kind == kind_f8f6f4) + { + asm volatile( + "{\n\t .reg .pred PRED_enable_input_d; \n\t" + "setp.ne.b32 PRED_enable_input_d, %4, 0;\n\t" + "tcgen05.mma.ws.cta_group::1.kind::f8f6f4.collector::b0::use [%0], %1, %2, %3, PRED_enable_input_d, %5;\n\t" + "}" + : + : "r"(__d_tmem), + "l"(__a_desc), + "l"(__b_desc), + "r"(__idesc), + "r"(static_cast<_CUDA_VSTD::uint32_t>(__enable_input_d)), + "l"(__zero_column_mask_desc) + : "memory"); + } + else _CCCL_IF_CONSTEXPR (__kind == kind_i8) + { + asm volatile( + "{\n\t .reg .pred PRED_enable_input_d; \n\t" + "setp.ne.b32 PRED_enable_input_d, %4, 0;\n\t" + "tcgen05.mma.ws.cta_group::1.kind::i8.collector::b0::use [%0], %1, %2, %3, PRED_enable_input_d, %5;\n\t" + "}" + : + : "r"(__d_tmem), + "l"(__a_desc), + 
"l"(__b_desc), + "r"(__idesc), + "r"(static_cast<_CUDA_VSTD::uint32_t>(__enable_input_d)), + "l"(__zero_column_mask_desc) + : "memory"); + } +# else + // Unsupported architectures will have a linker error with a semi-decent error message + __cuda_ptx_tcgen05_mma_ws_collector_b0_use_is_not_supported_before_SM_100a_SM_101a__(); +# endif +} +#endif // __cccl_ptx_isa >= 860 + +/* +// tcgen05.mma.ws.cta_group.kind.collector::b0::use [d_tmem], a_desc, b_desc, idesc, enable_input_d; // PTX ISA 86, +SM_100a, SM_101a +// .cta_group = { .cta_group::1 } +// .kind = { .kind::f16, .kind::tf32, .kind::f8f6f4, .kind::i8 } +template +__device__ static inline void tcgen05_mma_ws_collector_b0_use( + cuda::ptx::cta_group_1_t, + cuda::ptx::kind_t kind, + uint32_t d_tmem, + uint64_t a_desc, + uint64_t b_desc, + uint32_t idesc, + bool enable_input_d); +*/ +#if __cccl_ptx_isa >= 860 +extern "C" _CCCL_DEVICE void __cuda_ptx_tcgen05_mma_ws_collector_b0_use_is_not_supported_before_SM_100a_SM_101a__(); +template +_CCCL_DEVICE static inline void tcgen05_mma_ws_collector_b0_use( + cta_group_1_t, + kind_t<_Kind> __kind, + _CUDA_VSTD::uint32_t __d_tmem, + _CUDA_VSTD::uint64_t __a_desc, + _CUDA_VSTD::uint64_t __b_desc, + _CUDA_VSTD::uint32_t __idesc, + bool __enable_input_d) +{ + // __cta_group == cta_group_1 (due to parameter type constraint) + static_assert(__kind == kind_f16 || __kind == kind_tf32 || __kind == kind_f8f6f4 || __kind == kind_i8, ""); +# if _CCCL_CUDA_COMPILER(NVHPC) || __CUDA_ARCH_FEAT_SM100_ALL || __CUDA_ARCH_FEAT_SM101_ALL + _CCCL_IF_CONSTEXPR (__kind == kind_f16) + { + asm volatile( + "{\n\t .reg .pred PRED_enable_input_d; \n\t" + "setp.ne.b32 PRED_enable_input_d, %4, 0;\n\t" + "tcgen05.mma.ws.cta_group::1.kind::f16.collector::b0::use [%0], %1, %2, %3, PRED_enable_input_d;\n\t" + "}" + : + : "r"(__d_tmem), + "l"(__a_desc), + "l"(__b_desc), + "r"(__idesc), + "r"(static_cast<_CUDA_VSTD::uint32_t>(__enable_input_d)) + : "memory"); + } + else _CCCL_IF_CONSTEXPR (__kind == kind_tf32) + { + asm volatile( + "{\n\t .reg .pred PRED_enable_input_d; \n\t" + "setp.ne.b32 PRED_enable_input_d, %4, 0;\n\t" + "tcgen05.mma.ws.cta_group::1.kind::tf32.collector::b0::use [%0], %1, %2, %3, PRED_enable_input_d;\n\t" + "}" + : + : "r"(__d_tmem), + "l"(__a_desc), + "l"(__b_desc), + "r"(__idesc), + "r"(static_cast<_CUDA_VSTD::uint32_t>(__enable_input_d)) + : "memory"); + } + else _CCCL_IF_CONSTEXPR (__kind == kind_f8f6f4) + { + asm volatile( + "{\n\t .reg .pred PRED_enable_input_d; \n\t" + "setp.ne.b32 PRED_enable_input_d, %4, 0;\n\t" + "tcgen05.mma.ws.cta_group::1.kind::f8f6f4.collector::b0::use [%0], %1, %2, %3, PRED_enable_input_d;\n\t" + "}" + : + : "r"(__d_tmem), + "l"(__a_desc), + "l"(__b_desc), + "r"(__idesc), + "r"(static_cast<_CUDA_VSTD::uint32_t>(__enable_input_d)) + : "memory"); + } + else _CCCL_IF_CONSTEXPR (__kind == kind_i8) + { + asm volatile( + "{\n\t .reg .pred PRED_enable_input_d; \n\t" + "setp.ne.b32 PRED_enable_input_d, %4, 0;\n\t" + "tcgen05.mma.ws.cta_group::1.kind::i8.collector::b0::use [%0], %1, %2, %3, PRED_enable_input_d;\n\t" + "}" + : + : "r"(__d_tmem), + "l"(__a_desc), + "l"(__b_desc), + "r"(__idesc), + "r"(static_cast<_CUDA_VSTD::uint32_t>(__enable_input_d)) + : "memory"); + } +# else + // Unsupported architectures will have a linker error with a semi-decent error message + __cuda_ptx_tcgen05_mma_ws_collector_b0_use_is_not_supported_before_SM_100a_SM_101a__(); +# endif +} +#endif // __cccl_ptx_isa >= 860 + +/* +// tcgen05.mma.ws.cta_group.kind.collector::b0::use [d_tmem], [a_tmem], b_desc, 
idesc, enable_input_d, +zero_column_mask_desc; // PTX ISA 86, SM_100a, SM_101a +// .cta_group = { .cta_group::1 } +// .kind = { .kind::f16, .kind::tf32, .kind::f8f6f4, .kind::i8 } +template +__device__ static inline void tcgen05_mma_ws_tmem_a_collector_b0_use( + cuda::ptx::cta_group_1_t, + cuda::ptx::kind_t kind, + uint32_t d_tmem, + uint32_t a_tmem, + uint64_t b_desc, + uint32_t idesc, + bool enable_input_d, + uint64_t zero_column_mask_desc); +*/ +#if __cccl_ptx_isa >= 860 +extern "C" _CCCL_DEVICE void +__cuda_ptx_tcgen05_mma_ws_tmem_a_collector_b0_use_is_not_supported_before_SM_100a_SM_101a__(); +template +_CCCL_DEVICE static inline void tcgen05_mma_ws_tmem_a_collector_b0_use( + cta_group_1_t, + kind_t<_Kind> __kind, + _CUDA_VSTD::uint32_t __d_tmem, + _CUDA_VSTD::uint32_t __a_tmem, + _CUDA_VSTD::uint64_t __b_desc, + _CUDA_VSTD::uint32_t __idesc, + bool __enable_input_d, + _CUDA_VSTD::uint64_t __zero_column_mask_desc) +{ + // __cta_group == cta_group_1 (due to parameter type constraint) + static_assert(__kind == kind_f16 || __kind == kind_tf32 || __kind == kind_f8f6f4 || __kind == kind_i8, ""); +# if _CCCL_CUDA_COMPILER(NVHPC) || __CUDA_ARCH_FEAT_SM100_ALL || __CUDA_ARCH_FEAT_SM101_ALL + _CCCL_IF_CONSTEXPR (__kind == kind_f16) + { + asm volatile( + "{\n\t .reg .pred PRED_enable_input_d; \n\t" + "setp.ne.b32 PRED_enable_input_d, %4, 0;\n\t" + "tcgen05.mma.ws.cta_group::1.kind::f16.collector::b0::use [%0], [%1], %2, %3, PRED_enable_input_d, %5;\n\t" + "}" + : + : "r"(__d_tmem), + "r"(__a_tmem), + "l"(__b_desc), + "r"(__idesc), + "r"(static_cast<_CUDA_VSTD::uint32_t>(__enable_input_d)), + "l"(__zero_column_mask_desc) + : "memory"); + } + else _CCCL_IF_CONSTEXPR (__kind == kind_tf32) + { + asm volatile( + "{\n\t .reg .pred PRED_enable_input_d; \n\t" + "setp.ne.b32 PRED_enable_input_d, %4, 0;\n\t" + "tcgen05.mma.ws.cta_group::1.kind::tf32.collector::b0::use [%0], [%1], %2, %3, PRED_enable_input_d, %5;\n\t" + "}" + : + : "r"(__d_tmem), + "r"(__a_tmem), + "l"(__b_desc), + "r"(__idesc), + "r"(static_cast<_CUDA_VSTD::uint32_t>(__enable_input_d)), + "l"(__zero_column_mask_desc) + : "memory"); + } + else _CCCL_IF_CONSTEXPR (__kind == kind_f8f6f4) + { + asm volatile( + "{\n\t .reg .pred PRED_enable_input_d; \n\t" + "setp.ne.b32 PRED_enable_input_d, %4, 0;\n\t" + "tcgen05.mma.ws.cta_group::1.kind::f8f6f4.collector::b0::use [%0], [%1], %2, %3, PRED_enable_input_d, %5;\n\t" + "}" + : + : "r"(__d_tmem), + "r"(__a_tmem), + "l"(__b_desc), + "r"(__idesc), + "r"(static_cast<_CUDA_VSTD::uint32_t>(__enable_input_d)), + "l"(__zero_column_mask_desc) + : "memory"); + } + else _CCCL_IF_CONSTEXPR (__kind == kind_i8) + { + asm volatile( + "{\n\t .reg .pred PRED_enable_input_d; \n\t" + "setp.ne.b32 PRED_enable_input_d, %4, 0;\n\t" + "tcgen05.mma.ws.cta_group::1.kind::i8.collector::b0::use [%0], [%1], %2, %3, PRED_enable_input_d, %5;\n\t" + "}" + : + : "r"(__d_tmem), + "r"(__a_tmem), + "l"(__b_desc), + "r"(__idesc), + "r"(static_cast<_CUDA_VSTD::uint32_t>(__enable_input_d)), + "l"(__zero_column_mask_desc) + : "memory"); + } +# else + // Unsupported architectures will have a linker error with a semi-decent error message + __cuda_ptx_tcgen05_mma_ws_tmem_a_collector_b0_use_is_not_supported_before_SM_100a_SM_101a__(); +# endif +} +#endif // __cccl_ptx_isa >= 860 + +/* +// tcgen05.mma.ws.cta_group.kind.collector::b0::use [d_tmem], [a_tmem], b_desc, idesc, enable_input_d; // PTX ISA 86, +SM_100a, SM_101a +// .cta_group = { .cta_group::1 } +// .kind = { .kind::f16, .kind::tf32, .kind::f8f6f4, .kind::i8 } +template 
+__device__ static inline void tcgen05_mma_ws_tmem_a_collector_b0_use( + cuda::ptx::cta_group_1_t, + cuda::ptx::kind_t kind, + uint32_t d_tmem, + uint32_t a_tmem, + uint64_t b_desc, + uint32_t idesc, + bool enable_input_d); +*/ +#if __cccl_ptx_isa >= 860 +extern "C" _CCCL_DEVICE void +__cuda_ptx_tcgen05_mma_ws_tmem_a_collector_b0_use_is_not_supported_before_SM_100a_SM_101a__(); +template +_CCCL_DEVICE static inline void tcgen05_mma_ws_tmem_a_collector_b0_use( + cta_group_1_t, + kind_t<_Kind> __kind, + _CUDA_VSTD::uint32_t __d_tmem, + _CUDA_VSTD::uint32_t __a_tmem, + _CUDA_VSTD::uint64_t __b_desc, + _CUDA_VSTD::uint32_t __idesc, + bool __enable_input_d) +{ + // __cta_group == cta_group_1 (due to parameter type constraint) + static_assert(__kind == kind_f16 || __kind == kind_tf32 || __kind == kind_f8f6f4 || __kind == kind_i8, ""); +# if _CCCL_CUDA_COMPILER(NVHPC) || __CUDA_ARCH_FEAT_SM100_ALL || __CUDA_ARCH_FEAT_SM101_ALL + _CCCL_IF_CONSTEXPR (__kind == kind_f16) + { + asm volatile( + "{\n\t .reg .pred PRED_enable_input_d; \n\t" + "setp.ne.b32 PRED_enable_input_d, %4, 0;\n\t" + "tcgen05.mma.ws.cta_group::1.kind::f16.collector::b0::use [%0], [%1], %2, %3, PRED_enable_input_d;\n\t" + "}" + : + : "r"(__d_tmem), + "r"(__a_tmem), + "l"(__b_desc), + "r"(__idesc), + "r"(static_cast<_CUDA_VSTD::uint32_t>(__enable_input_d)) + : "memory"); + } + else _CCCL_IF_CONSTEXPR (__kind == kind_tf32) + { + asm volatile( + "{\n\t .reg .pred PRED_enable_input_d; \n\t" + "setp.ne.b32 PRED_enable_input_d, %4, 0;\n\t" + "tcgen05.mma.ws.cta_group::1.kind::tf32.collector::b0::use [%0], [%1], %2, %3, PRED_enable_input_d;\n\t" + "}" + : + : "r"(__d_tmem), + "r"(__a_tmem), + "l"(__b_desc), + "r"(__idesc), + "r"(static_cast<_CUDA_VSTD::uint32_t>(__enable_input_d)) + : "memory"); + } + else _CCCL_IF_CONSTEXPR (__kind == kind_f8f6f4) + { + asm volatile( + "{\n\t .reg .pred PRED_enable_input_d; \n\t" + "setp.ne.b32 PRED_enable_input_d, %4, 0;\n\t" + "tcgen05.mma.ws.cta_group::1.kind::f8f6f4.collector::b0::use [%0], [%1], %2, %3, PRED_enable_input_d;\n\t" + "}" + : + : "r"(__d_tmem), + "r"(__a_tmem), + "l"(__b_desc), + "r"(__idesc), + "r"(static_cast<_CUDA_VSTD::uint32_t>(__enable_input_d)) + : "memory"); + } + else _CCCL_IF_CONSTEXPR (__kind == kind_i8) + { + asm volatile( + "{\n\t .reg .pred PRED_enable_input_d; \n\t" + "setp.ne.b32 PRED_enable_input_d, %4, 0;\n\t" + "tcgen05.mma.ws.cta_group::1.kind::i8.collector::b0::use [%0], [%1], %2, %3, PRED_enable_input_d;\n\t" + "}" + : + : "r"(__d_tmem), + "r"(__a_tmem), + "l"(__b_desc), + "r"(__idesc), + "r"(static_cast<_CUDA_VSTD::uint32_t>(__enable_input_d)) + : "memory"); + } +# else + // Unsupported architectures will have a linker error with a semi-decent error message + __cuda_ptx_tcgen05_mma_ws_tmem_a_collector_b0_use_is_not_supported_before_SM_100a_SM_101a__(); +# endif +} +#endif // __cccl_ptx_isa >= 860 + +/* +// tcgen05.mma.ws.cta_group.kind.collector::b0::lastuse [d_tmem], a_desc, b_desc, idesc, enable_input_d, +zero_column_mask_desc; // PTX ISA 86, SM_100a, SM_101a +// .cta_group = { .cta_group::1 } +// .kind = { .kind::f16, .kind::tf32, .kind::f8f6f4, .kind::i8 } +template +__device__ static inline void tcgen05_mma_ws_collector_b0_lastuse( + cuda::ptx::cta_group_1_t, + cuda::ptx::kind_t kind, + uint32_t d_tmem, + uint64_t a_desc, + uint64_t b_desc, + uint32_t idesc, + bool enable_input_d, + uint64_t zero_column_mask_desc); +*/ +#if __cccl_ptx_isa >= 860 +extern "C" _CCCL_DEVICE void 
__cuda_ptx_tcgen05_mma_ws_collector_b0_lastuse_is_not_supported_before_SM_100a_SM_101a__(); +template +_CCCL_DEVICE static inline void tcgen05_mma_ws_collector_b0_lastuse( + cta_group_1_t, + kind_t<_Kind> __kind, + _CUDA_VSTD::uint32_t __d_tmem, + _CUDA_VSTD::uint64_t __a_desc, + _CUDA_VSTD::uint64_t __b_desc, + _CUDA_VSTD::uint32_t __idesc, + bool __enable_input_d, + _CUDA_VSTD::uint64_t __zero_column_mask_desc) +{ + // __cta_group == cta_group_1 (due to parameter type constraint) + static_assert(__kind == kind_f16 || __kind == kind_tf32 || __kind == kind_f8f6f4 || __kind == kind_i8, ""); +# if _CCCL_CUDA_COMPILER(NVHPC) || __CUDA_ARCH_FEAT_SM100_ALL || __CUDA_ARCH_FEAT_SM101_ALL + _CCCL_IF_CONSTEXPR (__kind == kind_f16) + { + asm volatile( + "{\n\t .reg .pred PRED_enable_input_d; \n\t" + "setp.ne.b32 PRED_enable_input_d, %4, 0;\n\t" + "tcgen05.mma.ws.cta_group::1.kind::f16.collector::b0::lastuse [%0], %1, %2, %3, PRED_enable_input_d, %5;\n\t" + "}" + : + : "r"(__d_tmem), + "l"(__a_desc), + "l"(__b_desc), + "r"(__idesc), + "r"(static_cast<_CUDA_VSTD::uint32_t>(__enable_input_d)), + "l"(__zero_column_mask_desc) + : "memory"); + } + else _CCCL_IF_CONSTEXPR (__kind == kind_tf32) + { + asm volatile( + "{\n\t .reg .pred PRED_enable_input_d; \n\t" + "setp.ne.b32 PRED_enable_input_d, %4, 0;\n\t" + "tcgen05.mma.ws.cta_group::1.kind::tf32.collector::b0::lastuse [%0], %1, %2, %3, PRED_enable_input_d, %5;\n\t" + "}" + : + : "r"(__d_tmem), + "l"(__a_desc), + "l"(__b_desc), + "r"(__idesc), + "r"(static_cast<_CUDA_VSTD::uint32_t>(__enable_input_d)), + "l"(__zero_column_mask_desc) + : "memory"); + } + else _CCCL_IF_CONSTEXPR (__kind == kind_f8f6f4) + { + asm volatile( + "{\n\t .reg .pred PRED_enable_input_d; \n\t" + "setp.ne.b32 PRED_enable_input_d, %4, 0;\n\t" + "tcgen05.mma.ws.cta_group::1.kind::f8f6f4.collector::b0::lastuse [%0], %1, %2, %3, PRED_enable_input_d, %5;\n\t" + "}" + : + : "r"(__d_tmem), + "l"(__a_desc), + "l"(__b_desc), + "r"(__idesc), + "r"(static_cast<_CUDA_VSTD::uint32_t>(__enable_input_d)), + "l"(__zero_column_mask_desc) + : "memory"); + } + else _CCCL_IF_CONSTEXPR (__kind == kind_i8) + { + asm volatile( + "{\n\t .reg .pred PRED_enable_input_d; \n\t" + "setp.ne.b32 PRED_enable_input_d, %4, 0;\n\t" + "tcgen05.mma.ws.cta_group::1.kind::i8.collector::b0::lastuse [%0], %1, %2, %3, PRED_enable_input_d, %5;\n\t" + "}" + : + : "r"(__d_tmem), + "l"(__a_desc), + "l"(__b_desc), + "r"(__idesc), + "r"(static_cast<_CUDA_VSTD::uint32_t>(__enable_input_d)), + "l"(__zero_column_mask_desc) + : "memory"); + } +# else + // Unsupported architectures will have a linker error with a semi-decent error message + __cuda_ptx_tcgen05_mma_ws_collector_b0_lastuse_is_not_supported_before_SM_100a_SM_101a__(); +# endif +} +#endif // __cccl_ptx_isa >= 860 + +/* +// tcgen05.mma.ws.cta_group.kind.collector::b0::lastuse [d_tmem], a_desc, b_desc, idesc, enable_input_d; // PTX ISA 86, +SM_100a, SM_101a +// .cta_group = { .cta_group::1 } +// .kind = { .kind::f16, .kind::tf32, .kind::f8f6f4, .kind::i8 } +template +__device__ static inline void tcgen05_mma_ws_collector_b0_lastuse( + cuda::ptx::cta_group_1_t, + cuda::ptx::kind_t kind, + uint32_t d_tmem, + uint64_t a_desc, + uint64_t b_desc, + uint32_t idesc, + bool enable_input_d); +*/ +#if __cccl_ptx_isa >= 860 +extern "C" _CCCL_DEVICE void __cuda_ptx_tcgen05_mma_ws_collector_b0_lastuse_is_not_supported_before_SM_100a_SM_101a__(); +template +_CCCL_DEVICE static inline void tcgen05_mma_ws_collector_b0_lastuse( + cta_group_1_t, + kind_t<_Kind> __kind, + _CUDA_VSTD::uint32_t 
__d_tmem, + _CUDA_VSTD::uint64_t __a_desc, + _CUDA_VSTD::uint64_t __b_desc, + _CUDA_VSTD::uint32_t __idesc, + bool __enable_input_d) +{ + // __cta_group == cta_group_1 (due to parameter type constraint) + static_assert(__kind == kind_f16 || __kind == kind_tf32 || __kind == kind_f8f6f4 || __kind == kind_i8, ""); +# if _CCCL_CUDA_COMPILER(NVHPC) || __CUDA_ARCH_FEAT_SM100_ALL || __CUDA_ARCH_FEAT_SM101_ALL + _CCCL_IF_CONSTEXPR (__kind == kind_f16) + { + asm volatile( + "{\n\t .reg .pred PRED_enable_input_d; \n\t" + "setp.ne.b32 PRED_enable_input_d, %4, 0;\n\t" + "tcgen05.mma.ws.cta_group::1.kind::f16.collector::b0::lastuse [%0], %1, %2, %3, PRED_enable_input_d;\n\t" + "}" + : + : "r"(__d_tmem), + "l"(__a_desc), + "l"(__b_desc), + "r"(__idesc), + "r"(static_cast<_CUDA_VSTD::uint32_t>(__enable_input_d)) + : "memory"); + } + else _CCCL_IF_CONSTEXPR (__kind == kind_tf32) + { + asm volatile( + "{\n\t .reg .pred PRED_enable_input_d; \n\t" + "setp.ne.b32 PRED_enable_input_d, %4, 0;\n\t" + "tcgen05.mma.ws.cta_group::1.kind::tf32.collector::b0::lastuse [%0], %1, %2, %3, PRED_enable_input_d;\n\t" + "}" + : + : "r"(__d_tmem), + "l"(__a_desc), + "l"(__b_desc), + "r"(__idesc), + "r"(static_cast<_CUDA_VSTD::uint32_t>(__enable_input_d)) + : "memory"); + } + else _CCCL_IF_CONSTEXPR (__kind == kind_f8f6f4) + { + asm volatile( + "{\n\t .reg .pred PRED_enable_input_d; \n\t" + "setp.ne.b32 PRED_enable_input_d, %4, 0;\n\t" + "tcgen05.mma.ws.cta_group::1.kind::f8f6f4.collector::b0::lastuse [%0], %1, %2, %3, PRED_enable_input_d;\n\t" + "}" + : + : "r"(__d_tmem), + "l"(__a_desc), + "l"(__b_desc), + "r"(__idesc), + "r"(static_cast<_CUDA_VSTD::uint32_t>(__enable_input_d)) + : "memory"); + } + else _CCCL_IF_CONSTEXPR (__kind == kind_i8) + { + asm volatile( + "{\n\t .reg .pred PRED_enable_input_d; \n\t" + "setp.ne.b32 PRED_enable_input_d, %4, 0;\n\t" + "tcgen05.mma.ws.cta_group::1.kind::i8.collector::b0::lastuse [%0], %1, %2, %3, PRED_enable_input_d;\n\t" + "}" + : + : "r"(__d_tmem), + "l"(__a_desc), + "l"(__b_desc), + "r"(__idesc), + "r"(static_cast<_CUDA_VSTD::uint32_t>(__enable_input_d)) + : "memory"); + } +# else + // Unsupported architectures will have a linker error with a semi-decent error message + __cuda_ptx_tcgen05_mma_ws_collector_b0_lastuse_is_not_supported_before_SM_100a_SM_101a__(); +# endif +} +#endif // __cccl_ptx_isa >= 860 + +/* +// tcgen05.mma.ws.cta_group.kind.collector::b0::lastuse [d_tmem], [a_tmem], b_desc, idesc, enable_input_d, +zero_column_mask_desc; // PTX ISA 86, SM_100a, SM_101a +// .cta_group = { .cta_group::1 } +// .kind = { .kind::f16, .kind::tf32, .kind::f8f6f4, .kind::i8 } +template +__device__ static inline void tcgen05_mma_ws_tmem_a_collector_b0_lastuse( + cuda::ptx::cta_group_1_t, + cuda::ptx::kind_t kind, + uint32_t d_tmem, + uint32_t a_tmem, + uint64_t b_desc, + uint32_t idesc, + bool enable_input_d, + uint64_t zero_column_mask_desc); +*/ +#if __cccl_ptx_isa >= 860 +extern "C" _CCCL_DEVICE void +__cuda_ptx_tcgen05_mma_ws_tmem_a_collector_b0_lastuse_is_not_supported_before_SM_100a_SM_101a__(); +template +_CCCL_DEVICE static inline void tcgen05_mma_ws_tmem_a_collector_b0_lastuse( + cta_group_1_t, + kind_t<_Kind> __kind, + _CUDA_VSTD::uint32_t __d_tmem, + _CUDA_VSTD::uint32_t __a_tmem, + _CUDA_VSTD::uint64_t __b_desc, + _CUDA_VSTD::uint32_t __idesc, + bool __enable_input_d, + _CUDA_VSTD::uint64_t __zero_column_mask_desc) +{ + // __cta_group == cta_group_1 (due to parameter type constraint) + static_assert(__kind == kind_f16 || __kind == kind_tf32 || __kind == kind_f8f6f4 || __kind == 
kind_i8, ""); +# if _CCCL_CUDA_COMPILER(NVHPC) || __CUDA_ARCH_FEAT_SM100_ALL || __CUDA_ARCH_FEAT_SM101_ALL + _CCCL_IF_CONSTEXPR (__kind == kind_f16) + { + asm volatile( + "{\n\t .reg .pred PRED_enable_input_d; \n\t" + "setp.ne.b32 PRED_enable_input_d, %4, 0;\n\t" + "tcgen05.mma.ws.cta_group::1.kind::f16.collector::b0::lastuse [%0], [%1], %2, %3, PRED_enable_input_d, %5;\n\t" + "}" + : + : "r"(__d_tmem), + "r"(__a_tmem), + "l"(__b_desc), + "r"(__idesc), + "r"(static_cast<_CUDA_VSTD::uint32_t>(__enable_input_d)), + "l"(__zero_column_mask_desc) + : "memory"); + } + else _CCCL_IF_CONSTEXPR (__kind == kind_tf32) + { + asm volatile( + "{\n\t .reg .pred PRED_enable_input_d; \n\t" + "setp.ne.b32 PRED_enable_input_d, %4, 0;\n\t" + "tcgen05.mma.ws.cta_group::1.kind::tf32.collector::b0::lastuse [%0], [%1], %2, %3, PRED_enable_input_d, %5;\n\t" + "}" + : + : "r"(__d_tmem), + "r"(__a_tmem), + "l"(__b_desc), + "r"(__idesc), + "r"(static_cast<_CUDA_VSTD::uint32_t>(__enable_input_d)), + "l"(__zero_column_mask_desc) + : "memory"); + } + else _CCCL_IF_CONSTEXPR (__kind == kind_f8f6f4) + { + asm volatile( + "{\n\t .reg .pred PRED_enable_input_d; \n\t" + "setp.ne.b32 PRED_enable_input_d, %4, 0;\n\t" + "tcgen05.mma.ws.cta_group::1.kind::f8f6f4.collector::b0::lastuse [%0], [%1], %2, %3, PRED_enable_input_d, %5;\n\t" + "}" + : + : "r"(__d_tmem), + "r"(__a_tmem), + "l"(__b_desc), + "r"(__idesc), + "r"(static_cast<_CUDA_VSTD::uint32_t>(__enable_input_d)), + "l"(__zero_column_mask_desc) + : "memory"); + } + else _CCCL_IF_CONSTEXPR (__kind == kind_i8) + { + asm volatile( + "{\n\t .reg .pred PRED_enable_input_d; \n\t" + "setp.ne.b32 PRED_enable_input_d, %4, 0;\n\t" + "tcgen05.mma.ws.cta_group::1.kind::i8.collector::b0::lastuse [%0], [%1], %2, %3, PRED_enable_input_d, %5;\n\t" + "}" + : + : "r"(__d_tmem), + "r"(__a_tmem), + "l"(__b_desc), + "r"(__idesc), + "r"(static_cast<_CUDA_VSTD::uint32_t>(__enable_input_d)), + "l"(__zero_column_mask_desc) + : "memory"); + } +# else + // Unsupported architectures will have a linker error with a semi-decent error message + __cuda_ptx_tcgen05_mma_ws_tmem_a_collector_b0_lastuse_is_not_supported_before_SM_100a_SM_101a__(); +# endif +} +#endif // __cccl_ptx_isa >= 860 + +/* +// tcgen05.mma.ws.cta_group.kind.collector::b0::lastuse [d_tmem], [a_tmem], b_desc, idesc, enable_input_d; // PTX ISA +86, SM_100a, SM_101a +// .cta_group = { .cta_group::1 } +// .kind = { .kind::f16, .kind::tf32, .kind::f8f6f4, .kind::i8 } +template +__device__ static inline void tcgen05_mma_ws_tmem_a_collector_b0_lastuse( + cuda::ptx::cta_group_1_t, + cuda::ptx::kind_t kind, + uint32_t d_tmem, + uint32_t a_tmem, + uint64_t b_desc, + uint32_t idesc, + bool enable_input_d); +*/ +#if __cccl_ptx_isa >= 860 +extern "C" _CCCL_DEVICE void +__cuda_ptx_tcgen05_mma_ws_tmem_a_collector_b0_lastuse_is_not_supported_before_SM_100a_SM_101a__(); +template +_CCCL_DEVICE static inline void tcgen05_mma_ws_tmem_a_collector_b0_lastuse( + cta_group_1_t, + kind_t<_Kind> __kind, + _CUDA_VSTD::uint32_t __d_tmem, + _CUDA_VSTD::uint32_t __a_tmem, + _CUDA_VSTD::uint64_t __b_desc, + _CUDA_VSTD::uint32_t __idesc, + bool __enable_input_d) +{ + // __cta_group == cta_group_1 (due to parameter type constraint) + static_assert(__kind == kind_f16 || __kind == kind_tf32 || __kind == kind_f8f6f4 || __kind == kind_i8, ""); +# if _CCCL_CUDA_COMPILER(NVHPC) || __CUDA_ARCH_FEAT_SM100_ALL || __CUDA_ARCH_FEAT_SM101_ALL + _CCCL_IF_CONSTEXPR (__kind == kind_f16) + { + asm volatile( + "{\n\t .reg .pred PRED_enable_input_d; \n\t" + "setp.ne.b32 
PRED_enable_input_d, %4, 0;\n\t" + "tcgen05.mma.ws.cta_group::1.kind::f16.collector::b0::lastuse [%0], [%1], %2, %3, PRED_enable_input_d;\n\t" + "}" + : + : "r"(__d_tmem), + "r"(__a_tmem), + "l"(__b_desc), + "r"(__idesc), + "r"(static_cast<_CUDA_VSTD::uint32_t>(__enable_input_d)) + : "memory"); + } + else _CCCL_IF_CONSTEXPR (__kind == kind_tf32) + { + asm volatile( + "{\n\t .reg .pred PRED_enable_input_d; \n\t" + "setp.ne.b32 PRED_enable_input_d, %4, 0;\n\t" + "tcgen05.mma.ws.cta_group::1.kind::tf32.collector::b0::lastuse [%0], [%1], %2, %3, PRED_enable_input_d;\n\t" + "}" + : + : "r"(__d_tmem), + "r"(__a_tmem), + "l"(__b_desc), + "r"(__idesc), + "r"(static_cast<_CUDA_VSTD::uint32_t>(__enable_input_d)) + : "memory"); + } + else _CCCL_IF_CONSTEXPR (__kind == kind_f8f6f4) + { + asm volatile( + "{\n\t .reg .pred PRED_enable_input_d; \n\t" + "setp.ne.b32 PRED_enable_input_d, %4, 0;\n\t" + "tcgen05.mma.ws.cta_group::1.kind::f8f6f4.collector::b0::lastuse [%0], [%1], %2, %3, PRED_enable_input_d;\n\t" + "}" + : + : "r"(__d_tmem), + "r"(__a_tmem), + "l"(__b_desc), + "r"(__idesc), + "r"(static_cast<_CUDA_VSTD::uint32_t>(__enable_input_d)) + : "memory"); + } + else _CCCL_IF_CONSTEXPR (__kind == kind_i8) + { + asm volatile( + "{\n\t .reg .pred PRED_enable_input_d; \n\t" + "setp.ne.b32 PRED_enable_input_d, %4, 0;\n\t" + "tcgen05.mma.ws.cta_group::1.kind::i8.collector::b0::lastuse [%0], [%1], %2, %3, PRED_enable_input_d;\n\t" + "}" + : + : "r"(__d_tmem), + "r"(__a_tmem), + "l"(__b_desc), + "r"(__idesc), + "r"(static_cast<_CUDA_VSTD::uint32_t>(__enable_input_d)) + : "memory"); + } +# else + // Unsupported architectures will have a linker error with a semi-decent error message + __cuda_ptx_tcgen05_mma_ws_tmem_a_collector_b0_lastuse_is_not_supported_before_SM_100a_SM_101a__(); +# endif +} +#endif // __cccl_ptx_isa >= 860 + +/* +// tcgen05.mma.ws.cta_group.kind.collector::b0::discard [d_tmem], a_desc, b_desc, idesc, enable_input_d, +zero_column_mask_desc; // PTX ISA 86, SM_100a, SM_101a +// .cta_group = { .cta_group::1 } +// .kind = { .kind::f16, .kind::tf32, .kind::f8f6f4, .kind::i8 } +template +__device__ static inline void tcgen05_mma_ws_collector_b0_discard( + cuda::ptx::cta_group_1_t, + cuda::ptx::kind_t kind, + uint32_t d_tmem, + uint64_t a_desc, + uint64_t b_desc, + uint32_t idesc, + bool enable_input_d, + uint64_t zero_column_mask_desc); +*/ +#if __cccl_ptx_isa >= 860 +extern "C" _CCCL_DEVICE void __cuda_ptx_tcgen05_mma_ws_collector_b0_discard_is_not_supported_before_SM_100a_SM_101a__(); +template +_CCCL_DEVICE static inline void tcgen05_mma_ws_collector_b0_discard( + cta_group_1_t, + kind_t<_Kind> __kind, + _CUDA_VSTD::uint32_t __d_tmem, + _CUDA_VSTD::uint64_t __a_desc, + _CUDA_VSTD::uint64_t __b_desc, + _CUDA_VSTD::uint32_t __idesc, + bool __enable_input_d, + _CUDA_VSTD::uint64_t __zero_column_mask_desc) +{ + // __cta_group == cta_group_1 (due to parameter type constraint) + static_assert(__kind == kind_f16 || __kind == kind_tf32 || __kind == kind_f8f6f4 || __kind == kind_i8, ""); +# if _CCCL_CUDA_COMPILER(NVHPC) || __CUDA_ARCH_FEAT_SM100_ALL || __CUDA_ARCH_FEAT_SM101_ALL + _CCCL_IF_CONSTEXPR (__kind == kind_f16) + { + asm volatile( + "{\n\t .reg .pred PRED_enable_input_d; \n\t" + "setp.ne.b32 PRED_enable_input_d, %4, 0;\n\t" + "tcgen05.mma.ws.cta_group::1.kind::f16.collector::b0::discard [%0], %1, %2, %3, PRED_enable_input_d, %5;\n\t" + "}" + : + : "r"(__d_tmem), + "l"(__a_desc), + "l"(__b_desc), + "r"(__idesc), + "r"(static_cast<_CUDA_VSTD::uint32_t>(__enable_input_d)), + 
"l"(__zero_column_mask_desc) + : "memory"); + } + else _CCCL_IF_CONSTEXPR (__kind == kind_tf32) + { + asm volatile( + "{\n\t .reg .pred PRED_enable_input_d; \n\t" + "setp.ne.b32 PRED_enable_input_d, %4, 0;\n\t" + "tcgen05.mma.ws.cta_group::1.kind::tf32.collector::b0::discard [%0], %1, %2, %3, PRED_enable_input_d, %5;\n\t" + "}" + : + : "r"(__d_tmem), + "l"(__a_desc), + "l"(__b_desc), + "r"(__idesc), + "r"(static_cast<_CUDA_VSTD::uint32_t>(__enable_input_d)), + "l"(__zero_column_mask_desc) + : "memory"); + } + else _CCCL_IF_CONSTEXPR (__kind == kind_f8f6f4) + { + asm volatile( + "{\n\t .reg .pred PRED_enable_input_d; \n\t" + "setp.ne.b32 PRED_enable_input_d, %4, 0;\n\t" + "tcgen05.mma.ws.cta_group::1.kind::f8f6f4.collector::b0::discard [%0], %1, %2, %3, PRED_enable_input_d, %5;\n\t" + "}" + : + : "r"(__d_tmem), + "l"(__a_desc), + "l"(__b_desc), + "r"(__idesc), + "r"(static_cast<_CUDA_VSTD::uint32_t>(__enable_input_d)), + "l"(__zero_column_mask_desc) + : "memory"); + } + else _CCCL_IF_CONSTEXPR (__kind == kind_i8) + { + asm volatile( + "{\n\t .reg .pred PRED_enable_input_d; \n\t" + "setp.ne.b32 PRED_enable_input_d, %4, 0;\n\t" + "tcgen05.mma.ws.cta_group::1.kind::i8.collector::b0::discard [%0], %1, %2, %3, PRED_enable_input_d, %5;\n\t" + "}" + : + : "r"(__d_tmem), + "l"(__a_desc), + "l"(__b_desc), + "r"(__idesc), + "r"(static_cast<_CUDA_VSTD::uint32_t>(__enable_input_d)), + "l"(__zero_column_mask_desc) + : "memory"); + } +# else + // Unsupported architectures will have a linker error with a semi-decent error message + __cuda_ptx_tcgen05_mma_ws_collector_b0_discard_is_not_supported_before_SM_100a_SM_101a__(); +# endif +} +#endif // __cccl_ptx_isa >= 860 + +/* +// tcgen05.mma.ws.cta_group.kind.collector::b0::discard [d_tmem], a_desc, b_desc, idesc, enable_input_d; // PTX ISA 86, +SM_100a, SM_101a +// .cta_group = { .cta_group::1 } +// .kind = { .kind::f16, .kind::tf32, .kind::f8f6f4, .kind::i8 } +template +__device__ static inline void tcgen05_mma_ws_collector_b0_discard( + cuda::ptx::cta_group_1_t, + cuda::ptx::kind_t kind, + uint32_t d_tmem, + uint64_t a_desc, + uint64_t b_desc, + uint32_t idesc, + bool enable_input_d); +*/ +#if __cccl_ptx_isa >= 860 +extern "C" _CCCL_DEVICE void __cuda_ptx_tcgen05_mma_ws_collector_b0_discard_is_not_supported_before_SM_100a_SM_101a__(); +template +_CCCL_DEVICE static inline void tcgen05_mma_ws_collector_b0_discard( + cta_group_1_t, + kind_t<_Kind> __kind, + _CUDA_VSTD::uint32_t __d_tmem, + _CUDA_VSTD::uint64_t __a_desc, + _CUDA_VSTD::uint64_t __b_desc, + _CUDA_VSTD::uint32_t __idesc, + bool __enable_input_d) +{ + // __cta_group == cta_group_1 (due to parameter type constraint) + static_assert(__kind == kind_f16 || __kind == kind_tf32 || __kind == kind_f8f6f4 || __kind == kind_i8, ""); +# if _CCCL_CUDA_COMPILER(NVHPC) || __CUDA_ARCH_FEAT_SM100_ALL || __CUDA_ARCH_FEAT_SM101_ALL + _CCCL_IF_CONSTEXPR (__kind == kind_f16) + { + asm volatile( + "{\n\t .reg .pred PRED_enable_input_d; \n\t" + "setp.ne.b32 PRED_enable_input_d, %4, 0;\n\t" + "tcgen05.mma.ws.cta_group::1.kind::f16.collector::b0::discard [%0], %1, %2, %3, PRED_enable_input_d;\n\t" + "}" + : + : "r"(__d_tmem), + "l"(__a_desc), + "l"(__b_desc), + "r"(__idesc), + "r"(static_cast<_CUDA_VSTD::uint32_t>(__enable_input_d)) + : "memory"); + } + else _CCCL_IF_CONSTEXPR (__kind == kind_tf32) + { + asm volatile( + "{\n\t .reg .pred PRED_enable_input_d; \n\t" + "setp.ne.b32 PRED_enable_input_d, %4, 0;\n\t" + "tcgen05.mma.ws.cta_group::1.kind::tf32.collector::b0::discard [%0], %1, %2, %3, PRED_enable_input_d;\n\t" + 
"}" + : + : "r"(__d_tmem), + "l"(__a_desc), + "l"(__b_desc), + "r"(__idesc), + "r"(static_cast<_CUDA_VSTD::uint32_t>(__enable_input_d)) + : "memory"); + } + else _CCCL_IF_CONSTEXPR (__kind == kind_f8f6f4) + { + asm volatile( + "{\n\t .reg .pred PRED_enable_input_d; \n\t" + "setp.ne.b32 PRED_enable_input_d, %4, 0;\n\t" + "tcgen05.mma.ws.cta_group::1.kind::f8f6f4.collector::b0::discard [%0], %1, %2, %3, PRED_enable_input_d;\n\t" + "}" + : + : "r"(__d_tmem), + "l"(__a_desc), + "l"(__b_desc), + "r"(__idesc), + "r"(static_cast<_CUDA_VSTD::uint32_t>(__enable_input_d)) + : "memory"); + } + else _CCCL_IF_CONSTEXPR (__kind == kind_i8) + { + asm volatile( + "{\n\t .reg .pred PRED_enable_input_d; \n\t" + "setp.ne.b32 PRED_enable_input_d, %4, 0;\n\t" + "tcgen05.mma.ws.cta_group::1.kind::i8.collector::b0::discard [%0], %1, %2, %3, PRED_enable_input_d;\n\t" + "}" + : + : "r"(__d_tmem), + "l"(__a_desc), + "l"(__b_desc), + "r"(__idesc), + "r"(static_cast<_CUDA_VSTD::uint32_t>(__enable_input_d)) + : "memory"); + } +# else + // Unsupported architectures will have a linker error with a semi-decent error message + __cuda_ptx_tcgen05_mma_ws_collector_b0_discard_is_not_supported_before_SM_100a_SM_101a__(); +# endif +} +#endif // __cccl_ptx_isa >= 860 + +/* +// tcgen05.mma.ws.cta_group.kind.collector::b0::discard [d_tmem], [a_tmem], b_desc, idesc, enable_input_d, +zero_column_mask_desc; // PTX ISA 86, SM_100a, SM_101a +// .cta_group = { .cta_group::1 } +// .kind = { .kind::f16, .kind::tf32, .kind::f8f6f4, .kind::i8 } +template +__device__ static inline void tcgen05_mma_ws_tmem_a_collector_b0_discard( + cuda::ptx::cta_group_1_t, + cuda::ptx::kind_t kind, + uint32_t d_tmem, + uint32_t a_tmem, + uint64_t b_desc, + uint32_t idesc, + bool enable_input_d, + uint64_t zero_column_mask_desc); +*/ +#if __cccl_ptx_isa >= 860 +extern "C" _CCCL_DEVICE void +__cuda_ptx_tcgen05_mma_ws_tmem_a_collector_b0_discard_is_not_supported_before_SM_100a_SM_101a__(); +template +_CCCL_DEVICE static inline void tcgen05_mma_ws_tmem_a_collector_b0_discard( + cta_group_1_t, + kind_t<_Kind> __kind, + _CUDA_VSTD::uint32_t __d_tmem, + _CUDA_VSTD::uint32_t __a_tmem, + _CUDA_VSTD::uint64_t __b_desc, + _CUDA_VSTD::uint32_t __idesc, + bool __enable_input_d, + _CUDA_VSTD::uint64_t __zero_column_mask_desc) +{ + // __cta_group == cta_group_1 (due to parameter type constraint) + static_assert(__kind == kind_f16 || __kind == kind_tf32 || __kind == kind_f8f6f4 || __kind == kind_i8, ""); +# if _CCCL_CUDA_COMPILER(NVHPC) || __CUDA_ARCH_FEAT_SM100_ALL || __CUDA_ARCH_FEAT_SM101_ALL + _CCCL_IF_CONSTEXPR (__kind == kind_f16) + { + asm volatile( + "{\n\t .reg .pred PRED_enable_input_d; \n\t" + "setp.ne.b32 PRED_enable_input_d, %4, 0;\n\t" + "tcgen05.mma.ws.cta_group::1.kind::f16.collector::b0::discard [%0], [%1], %2, %3, PRED_enable_input_d, %5;\n\t" + "}" + : + : "r"(__d_tmem), + "r"(__a_tmem), + "l"(__b_desc), + "r"(__idesc), + "r"(static_cast<_CUDA_VSTD::uint32_t>(__enable_input_d)), + "l"(__zero_column_mask_desc) + : "memory"); + } + else _CCCL_IF_CONSTEXPR (__kind == kind_tf32) + { + asm volatile( + "{\n\t .reg .pred PRED_enable_input_d; \n\t" + "setp.ne.b32 PRED_enable_input_d, %4, 0;\n\t" + "tcgen05.mma.ws.cta_group::1.kind::tf32.collector::b0::discard [%0], [%1], %2, %3, PRED_enable_input_d, %5;\n\t" + "}" + : + : "r"(__d_tmem), + "r"(__a_tmem), + "l"(__b_desc), + "r"(__idesc), + "r"(static_cast<_CUDA_VSTD::uint32_t>(__enable_input_d)), + "l"(__zero_column_mask_desc) + : "memory"); + } + else _CCCL_IF_CONSTEXPR (__kind == kind_f8f6f4) + { + asm volatile( 
+ "{\n\t .reg .pred PRED_enable_input_d; \n\t" + "setp.ne.b32 PRED_enable_input_d, %4, 0;\n\t" + "tcgen05.mma.ws.cta_group::1.kind::f8f6f4.collector::b0::discard [%0], [%1], %2, %3, PRED_enable_input_d, %5;\n\t" + "}" + : + : "r"(__d_tmem), + "r"(__a_tmem), + "l"(__b_desc), + "r"(__idesc), + "r"(static_cast<_CUDA_VSTD::uint32_t>(__enable_input_d)), + "l"(__zero_column_mask_desc) + : "memory"); + } + else _CCCL_IF_CONSTEXPR (__kind == kind_i8) + { + asm volatile( + "{\n\t .reg .pred PRED_enable_input_d; \n\t" + "setp.ne.b32 PRED_enable_input_d, %4, 0;\n\t" + "tcgen05.mma.ws.cta_group::1.kind::i8.collector::b0::discard [%0], [%1], %2, %3, PRED_enable_input_d, %5;\n\t" + "}" + : + : "r"(__d_tmem), + "r"(__a_tmem), + "l"(__b_desc), + "r"(__idesc), + "r"(static_cast<_CUDA_VSTD::uint32_t>(__enable_input_d)), + "l"(__zero_column_mask_desc) + : "memory"); + } +# else + // Unsupported architectures will have a linker error with a semi-decent error message + __cuda_ptx_tcgen05_mma_ws_tmem_a_collector_b0_discard_is_not_supported_before_SM_100a_SM_101a__(); +# endif +} +#endif // __cccl_ptx_isa >= 860 + +/* +// tcgen05.mma.ws.cta_group.kind.collector::b0::discard [d_tmem], [a_tmem], b_desc, idesc, enable_input_d; // PTX ISA +86, SM_100a, SM_101a +// .cta_group = { .cta_group::1 } +// .kind = { .kind::f16, .kind::tf32, .kind::f8f6f4, .kind::i8 } +template +__device__ static inline void tcgen05_mma_ws_tmem_a_collector_b0_discard( + cuda::ptx::cta_group_1_t, + cuda::ptx::kind_t kind, + uint32_t d_tmem, + uint32_t a_tmem, + uint64_t b_desc, + uint32_t idesc, + bool enable_input_d); +*/ +#if __cccl_ptx_isa >= 860 +extern "C" _CCCL_DEVICE void +__cuda_ptx_tcgen05_mma_ws_tmem_a_collector_b0_discard_is_not_supported_before_SM_100a_SM_101a__(); +template +_CCCL_DEVICE static inline void tcgen05_mma_ws_tmem_a_collector_b0_discard( + cta_group_1_t, + kind_t<_Kind> __kind, + _CUDA_VSTD::uint32_t __d_tmem, + _CUDA_VSTD::uint32_t __a_tmem, + _CUDA_VSTD::uint64_t __b_desc, + _CUDA_VSTD::uint32_t __idesc, + bool __enable_input_d) +{ + // __cta_group == cta_group_1 (due to parameter type constraint) + static_assert(__kind == kind_f16 || __kind == kind_tf32 || __kind == kind_f8f6f4 || __kind == kind_i8, ""); +# if _CCCL_CUDA_COMPILER(NVHPC) || __CUDA_ARCH_FEAT_SM100_ALL || __CUDA_ARCH_FEAT_SM101_ALL + _CCCL_IF_CONSTEXPR (__kind == kind_f16) + { + asm volatile( + "{\n\t .reg .pred PRED_enable_input_d; \n\t" + "setp.ne.b32 PRED_enable_input_d, %4, 0;\n\t" + "tcgen05.mma.ws.cta_group::1.kind::f16.collector::b0::discard [%0], [%1], %2, %3, PRED_enable_input_d;\n\t" + "}" + : + : "r"(__d_tmem), + "r"(__a_tmem), + "l"(__b_desc), + "r"(__idesc), + "r"(static_cast<_CUDA_VSTD::uint32_t>(__enable_input_d)) + : "memory"); + } + else _CCCL_IF_CONSTEXPR (__kind == kind_tf32) + { + asm volatile( + "{\n\t .reg .pred PRED_enable_input_d; \n\t" + "setp.ne.b32 PRED_enable_input_d, %4, 0;\n\t" + "tcgen05.mma.ws.cta_group::1.kind::tf32.collector::b0::discard [%0], [%1], %2, %3, PRED_enable_input_d;\n\t" + "}" + : + : "r"(__d_tmem), + "r"(__a_tmem), + "l"(__b_desc), + "r"(__idesc), + "r"(static_cast<_CUDA_VSTD::uint32_t>(__enable_input_d)) + : "memory"); + } + else _CCCL_IF_CONSTEXPR (__kind == kind_f8f6f4) + { + asm volatile( + "{\n\t .reg .pred PRED_enable_input_d; \n\t" + "setp.ne.b32 PRED_enable_input_d, %4, 0;\n\t" + "tcgen05.mma.ws.cta_group::1.kind::f8f6f4.collector::b0::discard [%0], [%1], %2, %3, PRED_enable_input_d;\n\t" + "}" + : + : "r"(__d_tmem), + "r"(__a_tmem), + "l"(__b_desc), + "r"(__idesc), + 
"r"(static_cast<_CUDA_VSTD::uint32_t>(__enable_input_d)) + : "memory"); + } + else _CCCL_IF_CONSTEXPR (__kind == kind_i8) + { + asm volatile( + "{\n\t .reg .pred PRED_enable_input_d; \n\t" + "setp.ne.b32 PRED_enable_input_d, %4, 0;\n\t" + "tcgen05.mma.ws.cta_group::1.kind::i8.collector::b0::discard [%0], [%1], %2, %3, PRED_enable_input_d;\n\t" + "}" + : + : "r"(__d_tmem), + "r"(__a_tmem), + "l"(__b_desc), + "r"(__idesc), + "r"(static_cast<_CUDA_VSTD::uint32_t>(__enable_input_d)) + : "memory"); + } +# else + // Unsupported architectures will have a linker error with a semi-decent error message + __cuda_ptx_tcgen05_mma_ws_tmem_a_collector_b0_discard_is_not_supported_before_SM_100a_SM_101a__(); +# endif +} +#endif // __cccl_ptx_isa >= 860 + +/* +// tcgen05.mma.ws.cta_group.kind.collector::b1::fill [d_tmem], a_desc, b_desc, idesc, enable_input_d, +zero_column_mask_desc; // PTX ISA 86, SM_100a, SM_101a +// .cta_group = { .cta_group::1 } +// .kind = { .kind::f16, .kind::tf32, .kind::f8f6f4, .kind::i8 } +template +__device__ static inline void tcgen05_mma_ws_collector_b1_fill( + cuda::ptx::cta_group_1_t, + cuda::ptx::kind_t kind, + uint32_t d_tmem, + uint64_t a_desc, + uint64_t b_desc, + uint32_t idesc, + bool enable_input_d, + uint64_t zero_column_mask_desc); +*/ +#if __cccl_ptx_isa >= 860 +extern "C" _CCCL_DEVICE void __cuda_ptx_tcgen05_mma_ws_collector_b1_fill_is_not_supported_before_SM_100a_SM_101a__(); +template +_CCCL_DEVICE static inline void tcgen05_mma_ws_collector_b1_fill( + cta_group_1_t, + kind_t<_Kind> __kind, + _CUDA_VSTD::uint32_t __d_tmem, + _CUDA_VSTD::uint64_t __a_desc, + _CUDA_VSTD::uint64_t __b_desc, + _CUDA_VSTD::uint32_t __idesc, + bool __enable_input_d, + _CUDA_VSTD::uint64_t __zero_column_mask_desc) +{ + // __cta_group == cta_group_1 (due to parameter type constraint) + static_assert(__kind == kind_f16 || __kind == kind_tf32 || __kind == kind_f8f6f4 || __kind == kind_i8, ""); +# if _CCCL_CUDA_COMPILER(NVHPC) || __CUDA_ARCH_FEAT_SM100_ALL || __CUDA_ARCH_FEAT_SM101_ALL + _CCCL_IF_CONSTEXPR (__kind == kind_f16) + { + asm volatile( + "{\n\t .reg .pred PRED_enable_input_d; \n\t" + "setp.ne.b32 PRED_enable_input_d, %4, 0;\n\t" + "tcgen05.mma.ws.cta_group::1.kind::f16.collector::b1::fill [%0], %1, %2, %3, PRED_enable_input_d, %5;\n\t" + "}" + : + : "r"(__d_tmem), + "l"(__a_desc), + "l"(__b_desc), + "r"(__idesc), + "r"(static_cast<_CUDA_VSTD::uint32_t>(__enable_input_d)), + "l"(__zero_column_mask_desc) + : "memory"); + } + else _CCCL_IF_CONSTEXPR (__kind == kind_tf32) + { + asm volatile( + "{\n\t .reg .pred PRED_enable_input_d; \n\t" + "setp.ne.b32 PRED_enable_input_d, %4, 0;\n\t" + "tcgen05.mma.ws.cta_group::1.kind::tf32.collector::b1::fill [%0], %1, %2, %3, PRED_enable_input_d, %5;\n\t" + "}" + : + : "r"(__d_tmem), + "l"(__a_desc), + "l"(__b_desc), + "r"(__idesc), + "r"(static_cast<_CUDA_VSTD::uint32_t>(__enable_input_d)), + "l"(__zero_column_mask_desc) + : "memory"); + } + else _CCCL_IF_CONSTEXPR (__kind == kind_f8f6f4) + { + asm volatile( + "{\n\t .reg .pred PRED_enable_input_d; \n\t" + "setp.ne.b32 PRED_enable_input_d, %4, 0;\n\t" + "tcgen05.mma.ws.cta_group::1.kind::f8f6f4.collector::b1::fill [%0], %1, %2, %3, PRED_enable_input_d, %5;\n\t" + "}" + : + : "r"(__d_tmem), + "l"(__a_desc), + "l"(__b_desc), + "r"(__idesc), + "r"(static_cast<_CUDA_VSTD::uint32_t>(__enable_input_d)), + "l"(__zero_column_mask_desc) + : "memory"); + } + else _CCCL_IF_CONSTEXPR (__kind == kind_i8) + { + asm volatile( + "{\n\t .reg .pred PRED_enable_input_d; \n\t" + "setp.ne.b32 PRED_enable_input_d, %4, 
0;\n\t" + "tcgen05.mma.ws.cta_group::1.kind::i8.collector::b1::fill [%0], %1, %2, %3, PRED_enable_input_d, %5;\n\t" + "}" + : + : "r"(__d_tmem), + "l"(__a_desc), + "l"(__b_desc), + "r"(__idesc), + "r"(static_cast<_CUDA_VSTD::uint32_t>(__enable_input_d)), + "l"(__zero_column_mask_desc) + : "memory"); + } +# else + // Unsupported architectures will have a linker error with a semi-decent error message + __cuda_ptx_tcgen05_mma_ws_collector_b1_fill_is_not_supported_before_SM_100a_SM_101a__(); +# endif +} +#endif // __cccl_ptx_isa >= 860 + +/* +// tcgen05.mma.ws.cta_group.kind.collector::b1::fill [d_tmem], a_desc, b_desc, idesc, enable_input_d; // PTX ISA 86, +SM_100a, SM_101a +// .cta_group = { .cta_group::1 } +// .kind = { .kind::f16, .kind::tf32, .kind::f8f6f4, .kind::i8 } +template +__device__ static inline void tcgen05_mma_ws_collector_b1_fill( + cuda::ptx::cta_group_1_t, + cuda::ptx::kind_t kind, + uint32_t d_tmem, + uint64_t a_desc, + uint64_t b_desc, + uint32_t idesc, + bool enable_input_d); +*/ +#if __cccl_ptx_isa >= 860 +extern "C" _CCCL_DEVICE void __cuda_ptx_tcgen05_mma_ws_collector_b1_fill_is_not_supported_before_SM_100a_SM_101a__(); +template +_CCCL_DEVICE static inline void tcgen05_mma_ws_collector_b1_fill( + cta_group_1_t, + kind_t<_Kind> __kind, + _CUDA_VSTD::uint32_t __d_tmem, + _CUDA_VSTD::uint64_t __a_desc, + _CUDA_VSTD::uint64_t __b_desc, + _CUDA_VSTD::uint32_t __idesc, + bool __enable_input_d) +{ + // __cta_group == cta_group_1 (due to parameter type constraint) + static_assert(__kind == kind_f16 || __kind == kind_tf32 || __kind == kind_f8f6f4 || __kind == kind_i8, ""); +# if _CCCL_CUDA_COMPILER(NVHPC) || __CUDA_ARCH_FEAT_SM100_ALL || __CUDA_ARCH_FEAT_SM101_ALL + _CCCL_IF_CONSTEXPR (__kind == kind_f16) + { + asm volatile( + "{\n\t .reg .pred PRED_enable_input_d; \n\t" + "setp.ne.b32 PRED_enable_input_d, %4, 0;\n\t" + "tcgen05.mma.ws.cta_group::1.kind::f16.collector::b1::fill [%0], %1, %2, %3, PRED_enable_input_d;\n\t" + "}" + : + : "r"(__d_tmem), + "l"(__a_desc), + "l"(__b_desc), + "r"(__idesc), + "r"(static_cast<_CUDA_VSTD::uint32_t>(__enable_input_d)) + : "memory"); + } + else _CCCL_IF_CONSTEXPR (__kind == kind_tf32) + { + asm volatile( + "{\n\t .reg .pred PRED_enable_input_d; \n\t" + "setp.ne.b32 PRED_enable_input_d, %4, 0;\n\t" + "tcgen05.mma.ws.cta_group::1.kind::tf32.collector::b1::fill [%0], %1, %2, %3, PRED_enable_input_d;\n\t" + "}" + : + : "r"(__d_tmem), + "l"(__a_desc), + "l"(__b_desc), + "r"(__idesc), + "r"(static_cast<_CUDA_VSTD::uint32_t>(__enable_input_d)) + : "memory"); + } + else _CCCL_IF_CONSTEXPR (__kind == kind_f8f6f4) + { + asm volatile( + "{\n\t .reg .pred PRED_enable_input_d; \n\t" + "setp.ne.b32 PRED_enable_input_d, %4, 0;\n\t" + "tcgen05.mma.ws.cta_group::1.kind::f8f6f4.collector::b1::fill [%0], %1, %2, %3, PRED_enable_input_d;\n\t" + "}" + : + : "r"(__d_tmem), + "l"(__a_desc), + "l"(__b_desc), + "r"(__idesc), + "r"(static_cast<_CUDA_VSTD::uint32_t>(__enable_input_d)) + : "memory"); + } + else _CCCL_IF_CONSTEXPR (__kind == kind_i8) + { + asm volatile( + "{\n\t .reg .pred PRED_enable_input_d; \n\t" + "setp.ne.b32 PRED_enable_input_d, %4, 0;\n\t" + "tcgen05.mma.ws.cta_group::1.kind::i8.collector::b1::fill [%0], %1, %2, %3, PRED_enable_input_d;\n\t" + "}" + : + : "r"(__d_tmem), + "l"(__a_desc), + "l"(__b_desc), + "r"(__idesc), + "r"(static_cast<_CUDA_VSTD::uint32_t>(__enable_input_d)) + : "memory"); + } +# else + // Unsupported architectures will have a linker error with a semi-decent error message + 
__cuda_ptx_tcgen05_mma_ws_collector_b1_fill_is_not_supported_before_SM_100a_SM_101a__(); +# endif +} +#endif // __cccl_ptx_isa >= 860 + +/* +// tcgen05.mma.ws.cta_group.kind.collector::b1::fill [d_tmem], [a_tmem], b_desc, idesc, enable_input_d, +zero_column_mask_desc; // PTX ISA 86, SM_100a, SM_101a +// .cta_group = { .cta_group::1 } +// .kind = { .kind::f16, .kind::tf32, .kind::f8f6f4, .kind::i8 } +template +__device__ static inline void tcgen05_mma_ws_tmem_a_collector_b1_fill( + cuda::ptx::cta_group_1_t, + cuda::ptx::kind_t kind, + uint32_t d_tmem, + uint32_t a_tmem, + uint64_t b_desc, + uint32_t idesc, + bool enable_input_d, + uint64_t zero_column_mask_desc); +*/ +#if __cccl_ptx_isa >= 860 +extern "C" _CCCL_DEVICE void +__cuda_ptx_tcgen05_mma_ws_tmem_a_collector_b1_fill_is_not_supported_before_SM_100a_SM_101a__(); +template +_CCCL_DEVICE static inline void tcgen05_mma_ws_tmem_a_collector_b1_fill( + cta_group_1_t, + kind_t<_Kind> __kind, + _CUDA_VSTD::uint32_t __d_tmem, + _CUDA_VSTD::uint32_t __a_tmem, + _CUDA_VSTD::uint64_t __b_desc, + _CUDA_VSTD::uint32_t __idesc, + bool __enable_input_d, + _CUDA_VSTD::uint64_t __zero_column_mask_desc) +{ + // __cta_group == cta_group_1 (due to parameter type constraint) + static_assert(__kind == kind_f16 || __kind == kind_tf32 || __kind == kind_f8f6f4 || __kind == kind_i8, ""); +# if _CCCL_CUDA_COMPILER(NVHPC) || __CUDA_ARCH_FEAT_SM100_ALL || __CUDA_ARCH_FEAT_SM101_ALL + _CCCL_IF_CONSTEXPR (__kind == kind_f16) + { + asm volatile( + "{\n\t .reg .pred PRED_enable_input_d; \n\t" + "setp.ne.b32 PRED_enable_input_d, %4, 0;\n\t" + "tcgen05.mma.ws.cta_group::1.kind::f16.collector::b1::fill [%0], [%1], %2, %3, PRED_enable_input_d, %5;\n\t" + "}" + : + : "r"(__d_tmem), + "r"(__a_tmem), + "l"(__b_desc), + "r"(__idesc), + "r"(static_cast<_CUDA_VSTD::uint32_t>(__enable_input_d)), + "l"(__zero_column_mask_desc) + : "memory"); + } + else _CCCL_IF_CONSTEXPR (__kind == kind_tf32) + { + asm volatile( + "{\n\t .reg .pred PRED_enable_input_d; \n\t" + "setp.ne.b32 PRED_enable_input_d, %4, 0;\n\t" + "tcgen05.mma.ws.cta_group::1.kind::tf32.collector::b1::fill [%0], [%1], %2, %3, PRED_enable_input_d, %5;\n\t" + "}" + : + : "r"(__d_tmem), + "r"(__a_tmem), + "l"(__b_desc), + "r"(__idesc), + "r"(static_cast<_CUDA_VSTD::uint32_t>(__enable_input_d)), + "l"(__zero_column_mask_desc) + : "memory"); + } + else _CCCL_IF_CONSTEXPR (__kind == kind_f8f6f4) + { + asm volatile( + "{\n\t .reg .pred PRED_enable_input_d; \n\t" + "setp.ne.b32 PRED_enable_input_d, %4, 0;\n\t" + "tcgen05.mma.ws.cta_group::1.kind::f8f6f4.collector::b1::fill [%0], [%1], %2, %3, PRED_enable_input_d, %5;\n\t" + "}" + : + : "r"(__d_tmem), + "r"(__a_tmem), + "l"(__b_desc), + "r"(__idesc), + "r"(static_cast<_CUDA_VSTD::uint32_t>(__enable_input_d)), + "l"(__zero_column_mask_desc) + : "memory"); + } + else _CCCL_IF_CONSTEXPR (__kind == kind_i8) + { + asm volatile( + "{\n\t .reg .pred PRED_enable_input_d; \n\t" + "setp.ne.b32 PRED_enable_input_d, %4, 0;\n\t" + "tcgen05.mma.ws.cta_group::1.kind::i8.collector::b1::fill [%0], [%1], %2, %3, PRED_enable_input_d, %5;\n\t" + "}" + : + : "r"(__d_tmem), + "r"(__a_tmem), + "l"(__b_desc), + "r"(__idesc), + "r"(static_cast<_CUDA_VSTD::uint32_t>(__enable_input_d)), + "l"(__zero_column_mask_desc) + : "memory"); + } +# else + // Unsupported architectures will have a linker error with a semi-decent error message + __cuda_ptx_tcgen05_mma_ws_tmem_a_collector_b1_fill_is_not_supported_before_SM_100a_SM_101a__(); +# endif +} +#endif // __cccl_ptx_isa >= 860 + +/* +// 
tcgen05.mma.ws.cta_group.kind.collector::b1::fill [d_tmem], [a_tmem], b_desc, idesc, enable_input_d; // PTX ISA 86, +SM_100a, SM_101a +// .cta_group = { .cta_group::1 } +// .kind = { .kind::f16, .kind::tf32, .kind::f8f6f4, .kind::i8 } +template +__device__ static inline void tcgen05_mma_ws_tmem_a_collector_b1_fill( + cuda::ptx::cta_group_1_t, + cuda::ptx::kind_t kind, + uint32_t d_tmem, + uint32_t a_tmem, + uint64_t b_desc, + uint32_t idesc, + bool enable_input_d); +*/ +#if __cccl_ptx_isa >= 860 +extern "C" _CCCL_DEVICE void +__cuda_ptx_tcgen05_mma_ws_tmem_a_collector_b1_fill_is_not_supported_before_SM_100a_SM_101a__(); +template +_CCCL_DEVICE static inline void tcgen05_mma_ws_tmem_a_collector_b1_fill( + cta_group_1_t, + kind_t<_Kind> __kind, + _CUDA_VSTD::uint32_t __d_tmem, + _CUDA_VSTD::uint32_t __a_tmem, + _CUDA_VSTD::uint64_t __b_desc, + _CUDA_VSTD::uint32_t __idesc, + bool __enable_input_d) +{ + // __cta_group == cta_group_1 (due to parameter type constraint) + static_assert(__kind == kind_f16 || __kind == kind_tf32 || __kind == kind_f8f6f4 || __kind == kind_i8, ""); +# if _CCCL_CUDA_COMPILER(NVHPC) || __CUDA_ARCH_FEAT_SM100_ALL || __CUDA_ARCH_FEAT_SM101_ALL + _CCCL_IF_CONSTEXPR (__kind == kind_f16) + { + asm volatile( + "{\n\t .reg .pred PRED_enable_input_d; \n\t" + "setp.ne.b32 PRED_enable_input_d, %4, 0;\n\t" + "tcgen05.mma.ws.cta_group::1.kind::f16.collector::b1::fill [%0], [%1], %2, %3, PRED_enable_input_d;\n\t" + "}" + : + : "r"(__d_tmem), + "r"(__a_tmem), + "l"(__b_desc), + "r"(__idesc), + "r"(static_cast<_CUDA_VSTD::uint32_t>(__enable_input_d)) + : "memory"); + } + else _CCCL_IF_CONSTEXPR (__kind == kind_tf32) + { + asm volatile( + "{\n\t .reg .pred PRED_enable_input_d; \n\t" + "setp.ne.b32 PRED_enable_input_d, %4, 0;\n\t" + "tcgen05.mma.ws.cta_group::1.kind::tf32.collector::b1::fill [%0], [%1], %2, %3, PRED_enable_input_d;\n\t" + "}" + : + : "r"(__d_tmem), + "r"(__a_tmem), + "l"(__b_desc), + "r"(__idesc), + "r"(static_cast<_CUDA_VSTD::uint32_t>(__enable_input_d)) + : "memory"); + } + else _CCCL_IF_CONSTEXPR (__kind == kind_f8f6f4) + { + asm volatile( + "{\n\t .reg .pred PRED_enable_input_d; \n\t" + "setp.ne.b32 PRED_enable_input_d, %4, 0;\n\t" + "tcgen05.mma.ws.cta_group::1.kind::f8f6f4.collector::b1::fill [%0], [%1], %2, %3, PRED_enable_input_d;\n\t" + "}" + : + : "r"(__d_tmem), + "r"(__a_tmem), + "l"(__b_desc), + "r"(__idesc), + "r"(static_cast<_CUDA_VSTD::uint32_t>(__enable_input_d)) + : "memory"); + } + else _CCCL_IF_CONSTEXPR (__kind == kind_i8) + { + asm volatile( + "{\n\t .reg .pred PRED_enable_input_d; \n\t" + "setp.ne.b32 PRED_enable_input_d, %4, 0;\n\t" + "tcgen05.mma.ws.cta_group::1.kind::i8.collector::b1::fill [%0], [%1], %2, %3, PRED_enable_input_d;\n\t" + "}" + : + : "r"(__d_tmem), + "r"(__a_tmem), + "l"(__b_desc), + "r"(__idesc), + "r"(static_cast<_CUDA_VSTD::uint32_t>(__enable_input_d)) + : "memory"); + } +# else + // Unsupported architectures will have a linker error with a semi-decent error message + __cuda_ptx_tcgen05_mma_ws_tmem_a_collector_b1_fill_is_not_supported_before_SM_100a_SM_101a__(); +# endif +} +#endif // __cccl_ptx_isa >= 860 + +/* +// tcgen05.mma.ws.cta_group.kind.collector::b1::use [d_tmem], a_desc, b_desc, idesc, enable_input_d, +zero_column_mask_desc; // PTX ISA 86, SM_100a, SM_101a +// .cta_group = { .cta_group::1 } +// .kind = { .kind::f16, .kind::tf32, .kind::f8f6f4, .kind::i8 } +template +__device__ static inline void tcgen05_mma_ws_collector_b1_use( + cuda::ptx::cta_group_1_t, + cuda::ptx::kind_t kind, + uint32_t d_tmem, + uint64_t 
a_desc, + uint64_t b_desc, + uint32_t idesc, + bool enable_input_d, + uint64_t zero_column_mask_desc); +*/ +#if __cccl_ptx_isa >= 860 +extern "C" _CCCL_DEVICE void __cuda_ptx_tcgen05_mma_ws_collector_b1_use_is_not_supported_before_SM_100a_SM_101a__(); +template +_CCCL_DEVICE static inline void tcgen05_mma_ws_collector_b1_use( + cta_group_1_t, + kind_t<_Kind> __kind, + _CUDA_VSTD::uint32_t __d_tmem, + _CUDA_VSTD::uint64_t __a_desc, + _CUDA_VSTD::uint64_t __b_desc, + _CUDA_VSTD::uint32_t __idesc, + bool __enable_input_d, + _CUDA_VSTD::uint64_t __zero_column_mask_desc) +{ + // __cta_group == cta_group_1 (due to parameter type constraint) + static_assert(__kind == kind_f16 || __kind == kind_tf32 || __kind == kind_f8f6f4 || __kind == kind_i8, ""); +# if _CCCL_CUDA_COMPILER(NVHPC) || __CUDA_ARCH_FEAT_SM100_ALL || __CUDA_ARCH_FEAT_SM101_ALL + _CCCL_IF_CONSTEXPR (__kind == kind_f16) + { + asm volatile( + "{\n\t .reg .pred PRED_enable_input_d; \n\t" + "setp.ne.b32 PRED_enable_input_d, %4, 0;\n\t" + "tcgen05.mma.ws.cta_group::1.kind::f16.collector::b1::use [%0], %1, %2, %3, PRED_enable_input_d, %5;\n\t" + "}" + : + : "r"(__d_tmem), + "l"(__a_desc), + "l"(__b_desc), + "r"(__idesc), + "r"(static_cast<_CUDA_VSTD::uint32_t>(__enable_input_d)), + "l"(__zero_column_mask_desc) + : "memory"); + } + else _CCCL_IF_CONSTEXPR (__kind == kind_tf32) + { + asm volatile( + "{\n\t .reg .pred PRED_enable_input_d; \n\t" + "setp.ne.b32 PRED_enable_input_d, %4, 0;\n\t" + "tcgen05.mma.ws.cta_group::1.kind::tf32.collector::b1::use [%0], %1, %2, %3, PRED_enable_input_d, %5;\n\t" + "}" + : + : "r"(__d_tmem), + "l"(__a_desc), + "l"(__b_desc), + "r"(__idesc), + "r"(static_cast<_CUDA_VSTD::uint32_t>(__enable_input_d)), + "l"(__zero_column_mask_desc) + : "memory"); + } + else _CCCL_IF_CONSTEXPR (__kind == kind_f8f6f4) + { + asm volatile( + "{\n\t .reg .pred PRED_enable_input_d; \n\t" + "setp.ne.b32 PRED_enable_input_d, %4, 0;\n\t" + "tcgen05.mma.ws.cta_group::1.kind::f8f6f4.collector::b1::use [%0], %1, %2, %3, PRED_enable_input_d, %5;\n\t" + "}" + : + : "r"(__d_tmem), + "l"(__a_desc), + "l"(__b_desc), + "r"(__idesc), + "r"(static_cast<_CUDA_VSTD::uint32_t>(__enable_input_d)), + "l"(__zero_column_mask_desc) + : "memory"); + } + else _CCCL_IF_CONSTEXPR (__kind == kind_i8) + { + asm volatile( + "{\n\t .reg .pred PRED_enable_input_d; \n\t" + "setp.ne.b32 PRED_enable_input_d, %4, 0;\n\t" + "tcgen05.mma.ws.cta_group::1.kind::i8.collector::b1::use [%0], %1, %2, %3, PRED_enable_input_d, %5;\n\t" + "}" + : + : "r"(__d_tmem), + "l"(__a_desc), + "l"(__b_desc), + "r"(__idesc), + "r"(static_cast<_CUDA_VSTD::uint32_t>(__enable_input_d)), + "l"(__zero_column_mask_desc) + : "memory"); + } +# else + // Unsupported architectures will have a linker error with a semi-decent error message + __cuda_ptx_tcgen05_mma_ws_collector_b1_use_is_not_supported_before_SM_100a_SM_101a__(); +# endif +} +#endif // __cccl_ptx_isa >= 860 + +/* +// tcgen05.mma.ws.cta_group.kind.collector::b1::use [d_tmem], a_desc, b_desc, idesc, enable_input_d; // PTX ISA 86, +SM_100a, SM_101a +// .cta_group = { .cta_group::1 } +// .kind = { .kind::f16, .kind::tf32, .kind::f8f6f4, .kind::i8 } +template +__device__ static inline void tcgen05_mma_ws_collector_b1_use( + cuda::ptx::cta_group_1_t, + cuda::ptx::kind_t kind, + uint32_t d_tmem, + uint64_t a_desc, + uint64_t b_desc, + uint32_t idesc, + bool enable_input_d); +*/ +#if __cccl_ptx_isa >= 860 +extern "C" _CCCL_DEVICE void __cuda_ptx_tcgen05_mma_ws_collector_b1_use_is_not_supported_before_SM_100a_SM_101a__(); +template 
+_CCCL_DEVICE static inline void tcgen05_mma_ws_collector_b1_use( + cta_group_1_t, + kind_t<_Kind> __kind, + _CUDA_VSTD::uint32_t __d_tmem, + _CUDA_VSTD::uint64_t __a_desc, + _CUDA_VSTD::uint64_t __b_desc, + _CUDA_VSTD::uint32_t __idesc, + bool __enable_input_d) +{ + // __cta_group == cta_group_1 (due to parameter type constraint) + static_assert(__kind == kind_f16 || __kind == kind_tf32 || __kind == kind_f8f6f4 || __kind == kind_i8, ""); +# if _CCCL_CUDA_COMPILER(NVHPC) || __CUDA_ARCH_FEAT_SM100_ALL || __CUDA_ARCH_FEAT_SM101_ALL + _CCCL_IF_CONSTEXPR (__kind == kind_f16) + { + asm volatile( + "{\n\t .reg .pred PRED_enable_input_d; \n\t" + "setp.ne.b32 PRED_enable_input_d, %4, 0;\n\t" + "tcgen05.mma.ws.cta_group::1.kind::f16.collector::b1::use [%0], %1, %2, %3, PRED_enable_input_d;\n\t" + "}" + : + : "r"(__d_tmem), + "l"(__a_desc), + "l"(__b_desc), + "r"(__idesc), + "r"(static_cast<_CUDA_VSTD::uint32_t>(__enable_input_d)) + : "memory"); + } + else _CCCL_IF_CONSTEXPR (__kind == kind_tf32) + { + asm volatile( + "{\n\t .reg .pred PRED_enable_input_d; \n\t" + "setp.ne.b32 PRED_enable_input_d, %4, 0;\n\t" + "tcgen05.mma.ws.cta_group::1.kind::tf32.collector::b1::use [%0], %1, %2, %3, PRED_enable_input_d;\n\t" + "}" + : + : "r"(__d_tmem), + "l"(__a_desc), + "l"(__b_desc), + "r"(__idesc), + "r"(static_cast<_CUDA_VSTD::uint32_t>(__enable_input_d)) + : "memory"); + } + else _CCCL_IF_CONSTEXPR (__kind == kind_f8f6f4) + { + asm volatile( + "{\n\t .reg .pred PRED_enable_input_d; \n\t" + "setp.ne.b32 PRED_enable_input_d, %4, 0;\n\t" + "tcgen05.mma.ws.cta_group::1.kind::f8f6f4.collector::b1::use [%0], %1, %2, %3, PRED_enable_input_d;\n\t" + "}" + : + : "r"(__d_tmem), + "l"(__a_desc), + "l"(__b_desc), + "r"(__idesc), + "r"(static_cast<_CUDA_VSTD::uint32_t>(__enable_input_d)) + : "memory"); + } + else _CCCL_IF_CONSTEXPR (__kind == kind_i8) + { + asm volatile( + "{\n\t .reg .pred PRED_enable_input_d; \n\t" + "setp.ne.b32 PRED_enable_input_d, %4, 0;\n\t" + "tcgen05.mma.ws.cta_group::1.kind::i8.collector::b1::use [%0], %1, %2, %3, PRED_enable_input_d;\n\t" + "}" + : + : "r"(__d_tmem), + "l"(__a_desc), + "l"(__b_desc), + "r"(__idesc), + "r"(static_cast<_CUDA_VSTD::uint32_t>(__enable_input_d)) + : "memory"); + } +# else + // Unsupported architectures will have a linker error with a semi-decent error message + __cuda_ptx_tcgen05_mma_ws_collector_b1_use_is_not_supported_before_SM_100a_SM_101a__(); +# endif +} +#endif // __cccl_ptx_isa >= 860 + +/* +// tcgen05.mma.ws.cta_group.kind.collector::b1::use [d_tmem], [a_tmem], b_desc, idesc, enable_input_d, +zero_column_mask_desc; // PTX ISA 86, SM_100a, SM_101a +// .cta_group = { .cta_group::1 } +// .kind = { .kind::f16, .kind::tf32, .kind::f8f6f4, .kind::i8 } +template +__device__ static inline void tcgen05_mma_ws_tmem_a_collector_b1_use( + cuda::ptx::cta_group_1_t, + cuda::ptx::kind_t kind, + uint32_t d_tmem, + uint32_t a_tmem, + uint64_t b_desc, + uint32_t idesc, + bool enable_input_d, + uint64_t zero_column_mask_desc); +*/ +#if __cccl_ptx_isa >= 860 +extern "C" _CCCL_DEVICE void +__cuda_ptx_tcgen05_mma_ws_tmem_a_collector_b1_use_is_not_supported_before_SM_100a_SM_101a__(); +template +_CCCL_DEVICE static inline void tcgen05_mma_ws_tmem_a_collector_b1_use( + cta_group_1_t, + kind_t<_Kind> __kind, + _CUDA_VSTD::uint32_t __d_tmem, + _CUDA_VSTD::uint32_t __a_tmem, + _CUDA_VSTD::uint64_t __b_desc, + _CUDA_VSTD::uint32_t __idesc, + bool __enable_input_d, + _CUDA_VSTD::uint64_t __zero_column_mask_desc) +{ + // __cta_group == cta_group_1 (due to parameter type constraint) + 
static_assert(__kind == kind_f16 || __kind == kind_tf32 || __kind == kind_f8f6f4 || __kind == kind_i8, ""); +# if _CCCL_CUDA_COMPILER(NVHPC) || __CUDA_ARCH_FEAT_SM100_ALL || __CUDA_ARCH_FEAT_SM101_ALL + _CCCL_IF_CONSTEXPR (__kind == kind_f16) + { + asm volatile( + "{\n\t .reg .pred PRED_enable_input_d; \n\t" + "setp.ne.b32 PRED_enable_input_d, %4, 0;\n\t" + "tcgen05.mma.ws.cta_group::1.kind::f16.collector::b1::use [%0], [%1], %2, %3, PRED_enable_input_d, %5;\n\t" + "}" + : + : "r"(__d_tmem), + "r"(__a_tmem), + "l"(__b_desc), + "r"(__idesc), + "r"(static_cast<_CUDA_VSTD::uint32_t>(__enable_input_d)), + "l"(__zero_column_mask_desc) + : "memory"); + } + else _CCCL_IF_CONSTEXPR (__kind == kind_tf32) + { + asm volatile( + "{\n\t .reg .pred PRED_enable_input_d; \n\t" + "setp.ne.b32 PRED_enable_input_d, %4, 0;\n\t" + "tcgen05.mma.ws.cta_group::1.kind::tf32.collector::b1::use [%0], [%1], %2, %3, PRED_enable_input_d, %5;\n\t" + "}" + : + : "r"(__d_tmem), + "r"(__a_tmem), + "l"(__b_desc), + "r"(__idesc), + "r"(static_cast<_CUDA_VSTD::uint32_t>(__enable_input_d)), + "l"(__zero_column_mask_desc) + : "memory"); + } + else _CCCL_IF_CONSTEXPR (__kind == kind_f8f6f4) + { + asm volatile( + "{\n\t .reg .pred PRED_enable_input_d; \n\t" + "setp.ne.b32 PRED_enable_input_d, %4, 0;\n\t" + "tcgen05.mma.ws.cta_group::1.kind::f8f6f4.collector::b1::use [%0], [%1], %2, %3, PRED_enable_input_d, %5;\n\t" + "}" + : + : "r"(__d_tmem), + "r"(__a_tmem), + "l"(__b_desc), + "r"(__idesc), + "r"(static_cast<_CUDA_VSTD::uint32_t>(__enable_input_d)), + "l"(__zero_column_mask_desc) + : "memory"); + } + else _CCCL_IF_CONSTEXPR (__kind == kind_i8) + { + asm volatile( + "{\n\t .reg .pred PRED_enable_input_d; \n\t" + "setp.ne.b32 PRED_enable_input_d, %4, 0;\n\t" + "tcgen05.mma.ws.cta_group::1.kind::i8.collector::b1::use [%0], [%1], %2, %3, PRED_enable_input_d, %5;\n\t" + "}" + : + : "r"(__d_tmem), + "r"(__a_tmem), + "l"(__b_desc), + "r"(__idesc), + "r"(static_cast<_CUDA_VSTD::uint32_t>(__enable_input_d)), + "l"(__zero_column_mask_desc) + : "memory"); + } +# else + // Unsupported architectures will have a linker error with a semi-decent error message + __cuda_ptx_tcgen05_mma_ws_tmem_a_collector_b1_use_is_not_supported_before_SM_100a_SM_101a__(); +# endif +} +#endif // __cccl_ptx_isa >= 860 + +/* +// tcgen05.mma.ws.cta_group.kind.collector::b1::use [d_tmem], [a_tmem], b_desc, idesc, enable_input_d; // PTX ISA 86, +SM_100a, SM_101a +// .cta_group = { .cta_group::1 } +// .kind = { .kind::f16, .kind::tf32, .kind::f8f6f4, .kind::i8 } +template +__device__ static inline void tcgen05_mma_ws_tmem_a_collector_b1_use( + cuda::ptx::cta_group_1_t, + cuda::ptx::kind_t kind, + uint32_t d_tmem, + uint32_t a_tmem, + uint64_t b_desc, + uint32_t idesc, + bool enable_input_d); +*/ +#if __cccl_ptx_isa >= 860 +extern "C" _CCCL_DEVICE void +__cuda_ptx_tcgen05_mma_ws_tmem_a_collector_b1_use_is_not_supported_before_SM_100a_SM_101a__(); +template +_CCCL_DEVICE static inline void tcgen05_mma_ws_tmem_a_collector_b1_use( + cta_group_1_t, + kind_t<_Kind> __kind, + _CUDA_VSTD::uint32_t __d_tmem, + _CUDA_VSTD::uint32_t __a_tmem, + _CUDA_VSTD::uint64_t __b_desc, + _CUDA_VSTD::uint32_t __idesc, + bool __enable_input_d) +{ + // __cta_group == cta_group_1 (due to parameter type constraint) + static_assert(__kind == kind_f16 || __kind == kind_tf32 || __kind == kind_f8f6f4 || __kind == kind_i8, ""); +# if _CCCL_CUDA_COMPILER(NVHPC) || __CUDA_ARCH_FEAT_SM100_ALL || __CUDA_ARCH_FEAT_SM101_ALL + _CCCL_IF_CONSTEXPR (__kind == kind_f16) + { + asm volatile( + "{\n\t .reg .pred 
PRED_enable_input_d; \n\t" + "setp.ne.b32 PRED_enable_input_d, %4, 0;\n\t" + "tcgen05.mma.ws.cta_group::1.kind::f16.collector::b1::use [%0], [%1], %2, %3, PRED_enable_input_d;\n\t" + "}" + : + : "r"(__d_tmem), + "r"(__a_tmem), + "l"(__b_desc), + "r"(__idesc), + "r"(static_cast<_CUDA_VSTD::uint32_t>(__enable_input_d)) + : "memory"); + } + else _CCCL_IF_CONSTEXPR (__kind == kind_tf32) + { + asm volatile( + "{\n\t .reg .pred PRED_enable_input_d; \n\t" + "setp.ne.b32 PRED_enable_input_d, %4, 0;\n\t" + "tcgen05.mma.ws.cta_group::1.kind::tf32.collector::b1::use [%0], [%1], %2, %3, PRED_enable_input_d;\n\t" + "}" + : + : "r"(__d_tmem), + "r"(__a_tmem), + "l"(__b_desc), + "r"(__idesc), + "r"(static_cast<_CUDA_VSTD::uint32_t>(__enable_input_d)) + : "memory"); + } + else _CCCL_IF_CONSTEXPR (__kind == kind_f8f6f4) + { + asm volatile( + "{\n\t .reg .pred PRED_enable_input_d; \n\t" + "setp.ne.b32 PRED_enable_input_d, %4, 0;\n\t" + "tcgen05.mma.ws.cta_group::1.kind::f8f6f4.collector::b1::use [%0], [%1], %2, %3, PRED_enable_input_d;\n\t" + "}" + : + : "r"(__d_tmem), + "r"(__a_tmem), + "l"(__b_desc), + "r"(__idesc), + "r"(static_cast<_CUDA_VSTD::uint32_t>(__enable_input_d)) + : "memory"); + } + else _CCCL_IF_CONSTEXPR (__kind == kind_i8) + { + asm volatile( + "{\n\t .reg .pred PRED_enable_input_d; \n\t" + "setp.ne.b32 PRED_enable_input_d, %4, 0;\n\t" + "tcgen05.mma.ws.cta_group::1.kind::i8.collector::b1::use [%0], [%1], %2, %3, PRED_enable_input_d;\n\t" + "}" + : + : "r"(__d_tmem), + "r"(__a_tmem), + "l"(__b_desc), + "r"(__idesc), + "r"(static_cast<_CUDA_VSTD::uint32_t>(__enable_input_d)) + : "memory"); + } +# else + // Unsupported architectures will have a linker error with a semi-decent error message + __cuda_ptx_tcgen05_mma_ws_tmem_a_collector_b1_use_is_not_supported_before_SM_100a_SM_101a__(); +# endif +} +#endif // __cccl_ptx_isa >= 860 + +/* +// tcgen05.mma.ws.cta_group.kind.collector::b1::lastuse [d_tmem], a_desc, b_desc, idesc, enable_input_d, +zero_column_mask_desc; // PTX ISA 86, SM_100a, SM_101a +// .cta_group = { .cta_group::1 } +// .kind = { .kind::f16, .kind::tf32, .kind::f8f6f4, .kind::i8 } +template +__device__ static inline void tcgen05_mma_ws_collector_b1_lastuse( + cuda::ptx::cta_group_1_t, + cuda::ptx::kind_t kind, + uint32_t d_tmem, + uint64_t a_desc, + uint64_t b_desc, + uint32_t idesc, + bool enable_input_d, + uint64_t zero_column_mask_desc); +*/ +#if __cccl_ptx_isa >= 860 +extern "C" _CCCL_DEVICE void __cuda_ptx_tcgen05_mma_ws_collector_b1_lastuse_is_not_supported_before_SM_100a_SM_101a__(); +template +_CCCL_DEVICE static inline void tcgen05_mma_ws_collector_b1_lastuse( + cta_group_1_t, + kind_t<_Kind> __kind, + _CUDA_VSTD::uint32_t __d_tmem, + _CUDA_VSTD::uint64_t __a_desc, + _CUDA_VSTD::uint64_t __b_desc, + _CUDA_VSTD::uint32_t __idesc, + bool __enable_input_d, + _CUDA_VSTD::uint64_t __zero_column_mask_desc) +{ + // __cta_group == cta_group_1 (due to parameter type constraint) + static_assert(__kind == kind_f16 || __kind == kind_tf32 || __kind == kind_f8f6f4 || __kind == kind_i8, ""); +# if _CCCL_CUDA_COMPILER(NVHPC) || __CUDA_ARCH_FEAT_SM100_ALL || __CUDA_ARCH_FEAT_SM101_ALL + _CCCL_IF_CONSTEXPR (__kind == kind_f16) + { + asm volatile( + "{\n\t .reg .pred PRED_enable_input_d; \n\t" + "setp.ne.b32 PRED_enable_input_d, %4, 0;\n\t" + "tcgen05.mma.ws.cta_group::1.kind::f16.collector::b1::lastuse [%0], %1, %2, %3, PRED_enable_input_d, %5;\n\t" + "}" + : + : "r"(__d_tmem), + "l"(__a_desc), + "l"(__b_desc), + "r"(__idesc), + "r"(static_cast<_CUDA_VSTD::uint32_t>(__enable_input_d)), + 
"l"(__zero_column_mask_desc) + : "memory"); + } + else _CCCL_IF_CONSTEXPR (__kind == kind_tf32) + { + asm volatile( + "{\n\t .reg .pred PRED_enable_input_d; \n\t" + "setp.ne.b32 PRED_enable_input_d, %4, 0;\n\t" + "tcgen05.mma.ws.cta_group::1.kind::tf32.collector::b1::lastuse [%0], %1, %2, %3, PRED_enable_input_d, %5;\n\t" + "}" + : + : "r"(__d_tmem), + "l"(__a_desc), + "l"(__b_desc), + "r"(__idesc), + "r"(static_cast<_CUDA_VSTD::uint32_t>(__enable_input_d)), + "l"(__zero_column_mask_desc) + : "memory"); + } + else _CCCL_IF_CONSTEXPR (__kind == kind_f8f6f4) + { + asm volatile( + "{\n\t .reg .pred PRED_enable_input_d; \n\t" + "setp.ne.b32 PRED_enable_input_d, %4, 0;\n\t" + "tcgen05.mma.ws.cta_group::1.kind::f8f6f4.collector::b1::lastuse [%0], %1, %2, %3, PRED_enable_input_d, %5;\n\t" + "}" + : + : "r"(__d_tmem), + "l"(__a_desc), + "l"(__b_desc), + "r"(__idesc), + "r"(static_cast<_CUDA_VSTD::uint32_t>(__enable_input_d)), + "l"(__zero_column_mask_desc) + : "memory"); + } + else _CCCL_IF_CONSTEXPR (__kind == kind_i8) + { + asm volatile( + "{\n\t .reg .pred PRED_enable_input_d; \n\t" + "setp.ne.b32 PRED_enable_input_d, %4, 0;\n\t" + "tcgen05.mma.ws.cta_group::1.kind::i8.collector::b1::lastuse [%0], %1, %2, %3, PRED_enable_input_d, %5;\n\t" + "}" + : + : "r"(__d_tmem), + "l"(__a_desc), + "l"(__b_desc), + "r"(__idesc), + "r"(static_cast<_CUDA_VSTD::uint32_t>(__enable_input_d)), + "l"(__zero_column_mask_desc) + : "memory"); + } +# else + // Unsupported architectures will have a linker error with a semi-decent error message + __cuda_ptx_tcgen05_mma_ws_collector_b1_lastuse_is_not_supported_before_SM_100a_SM_101a__(); +# endif +} +#endif // __cccl_ptx_isa >= 860 + +/* +// tcgen05.mma.ws.cta_group.kind.collector::b1::lastuse [d_tmem], a_desc, b_desc, idesc, enable_input_d; // PTX ISA 86, +SM_100a, SM_101a +// .cta_group = { .cta_group::1 } +// .kind = { .kind::f16, .kind::tf32, .kind::f8f6f4, .kind::i8 } +template +__device__ static inline void tcgen05_mma_ws_collector_b1_lastuse( + cuda::ptx::cta_group_1_t, + cuda::ptx::kind_t kind, + uint32_t d_tmem, + uint64_t a_desc, + uint64_t b_desc, + uint32_t idesc, + bool enable_input_d); +*/ +#if __cccl_ptx_isa >= 860 +extern "C" _CCCL_DEVICE void __cuda_ptx_tcgen05_mma_ws_collector_b1_lastuse_is_not_supported_before_SM_100a_SM_101a__(); +template +_CCCL_DEVICE static inline void tcgen05_mma_ws_collector_b1_lastuse( + cta_group_1_t, + kind_t<_Kind> __kind, + _CUDA_VSTD::uint32_t __d_tmem, + _CUDA_VSTD::uint64_t __a_desc, + _CUDA_VSTD::uint64_t __b_desc, + _CUDA_VSTD::uint32_t __idesc, + bool __enable_input_d) +{ + // __cta_group == cta_group_1 (due to parameter type constraint) + static_assert(__kind == kind_f16 || __kind == kind_tf32 || __kind == kind_f8f6f4 || __kind == kind_i8, ""); +# if _CCCL_CUDA_COMPILER(NVHPC) || __CUDA_ARCH_FEAT_SM100_ALL || __CUDA_ARCH_FEAT_SM101_ALL + _CCCL_IF_CONSTEXPR (__kind == kind_f16) + { + asm volatile( + "{\n\t .reg .pred PRED_enable_input_d; \n\t" + "setp.ne.b32 PRED_enable_input_d, %4, 0;\n\t" + "tcgen05.mma.ws.cta_group::1.kind::f16.collector::b1::lastuse [%0], %1, %2, %3, PRED_enable_input_d;\n\t" + "}" + : + : "r"(__d_tmem), + "l"(__a_desc), + "l"(__b_desc), + "r"(__idesc), + "r"(static_cast<_CUDA_VSTD::uint32_t>(__enable_input_d)) + : "memory"); + } + else _CCCL_IF_CONSTEXPR (__kind == kind_tf32) + { + asm volatile( + "{\n\t .reg .pred PRED_enable_input_d; \n\t" + "setp.ne.b32 PRED_enable_input_d, %4, 0;\n\t" + "tcgen05.mma.ws.cta_group::1.kind::tf32.collector::b1::lastuse [%0], %1, %2, %3, PRED_enable_input_d;\n\t" + 
"}" + : + : "r"(__d_tmem), + "l"(__a_desc), + "l"(__b_desc), + "r"(__idesc), + "r"(static_cast<_CUDA_VSTD::uint32_t>(__enable_input_d)) + : "memory"); + } + else _CCCL_IF_CONSTEXPR (__kind == kind_f8f6f4) + { + asm volatile( + "{\n\t .reg .pred PRED_enable_input_d; \n\t" + "setp.ne.b32 PRED_enable_input_d, %4, 0;\n\t" + "tcgen05.mma.ws.cta_group::1.kind::f8f6f4.collector::b1::lastuse [%0], %1, %2, %3, PRED_enable_input_d;\n\t" + "}" + : + : "r"(__d_tmem), + "l"(__a_desc), + "l"(__b_desc), + "r"(__idesc), + "r"(static_cast<_CUDA_VSTD::uint32_t>(__enable_input_d)) + : "memory"); + } + else _CCCL_IF_CONSTEXPR (__kind == kind_i8) + { + asm volatile( + "{\n\t .reg .pred PRED_enable_input_d; \n\t" + "setp.ne.b32 PRED_enable_input_d, %4, 0;\n\t" + "tcgen05.mma.ws.cta_group::1.kind::i8.collector::b1::lastuse [%0], %1, %2, %3, PRED_enable_input_d;\n\t" + "}" + : + : "r"(__d_tmem), + "l"(__a_desc), + "l"(__b_desc), + "r"(__idesc), + "r"(static_cast<_CUDA_VSTD::uint32_t>(__enable_input_d)) + : "memory"); + } +# else + // Unsupported architectures will have a linker error with a semi-decent error message + __cuda_ptx_tcgen05_mma_ws_collector_b1_lastuse_is_not_supported_before_SM_100a_SM_101a__(); +# endif +} +#endif // __cccl_ptx_isa >= 860 + +/* +// tcgen05.mma.ws.cta_group.kind.collector::b1::lastuse [d_tmem], [a_tmem], b_desc, idesc, enable_input_d, +zero_column_mask_desc; // PTX ISA 86, SM_100a, SM_101a +// .cta_group = { .cta_group::1 } +// .kind = { .kind::f16, .kind::tf32, .kind::f8f6f4, .kind::i8 } +template +__device__ static inline void tcgen05_mma_ws_tmem_a_collector_b1_lastuse( + cuda::ptx::cta_group_1_t, + cuda::ptx::kind_t kind, + uint32_t d_tmem, + uint32_t a_tmem, + uint64_t b_desc, + uint32_t idesc, + bool enable_input_d, + uint64_t zero_column_mask_desc); +*/ +#if __cccl_ptx_isa >= 860 +extern "C" _CCCL_DEVICE void +__cuda_ptx_tcgen05_mma_ws_tmem_a_collector_b1_lastuse_is_not_supported_before_SM_100a_SM_101a__(); +template +_CCCL_DEVICE static inline void tcgen05_mma_ws_tmem_a_collector_b1_lastuse( + cta_group_1_t, + kind_t<_Kind> __kind, + _CUDA_VSTD::uint32_t __d_tmem, + _CUDA_VSTD::uint32_t __a_tmem, + _CUDA_VSTD::uint64_t __b_desc, + _CUDA_VSTD::uint32_t __idesc, + bool __enable_input_d, + _CUDA_VSTD::uint64_t __zero_column_mask_desc) +{ + // __cta_group == cta_group_1 (due to parameter type constraint) + static_assert(__kind == kind_f16 || __kind == kind_tf32 || __kind == kind_f8f6f4 || __kind == kind_i8, ""); +# if _CCCL_CUDA_COMPILER(NVHPC) || __CUDA_ARCH_FEAT_SM100_ALL || __CUDA_ARCH_FEAT_SM101_ALL + _CCCL_IF_CONSTEXPR (__kind == kind_f16) + { + asm volatile( + "{\n\t .reg .pred PRED_enable_input_d; \n\t" + "setp.ne.b32 PRED_enable_input_d, %4, 0;\n\t" + "tcgen05.mma.ws.cta_group::1.kind::f16.collector::b1::lastuse [%0], [%1], %2, %3, PRED_enable_input_d, %5;\n\t" + "}" + : + : "r"(__d_tmem), + "r"(__a_tmem), + "l"(__b_desc), + "r"(__idesc), + "r"(static_cast<_CUDA_VSTD::uint32_t>(__enable_input_d)), + "l"(__zero_column_mask_desc) + : "memory"); + } + else _CCCL_IF_CONSTEXPR (__kind == kind_tf32) + { + asm volatile( + "{\n\t .reg .pred PRED_enable_input_d; \n\t" + "setp.ne.b32 PRED_enable_input_d, %4, 0;\n\t" + "tcgen05.mma.ws.cta_group::1.kind::tf32.collector::b1::lastuse [%0], [%1], %2, %3, PRED_enable_input_d, %5;\n\t" + "}" + : + : "r"(__d_tmem), + "r"(__a_tmem), + "l"(__b_desc), + "r"(__idesc), + "r"(static_cast<_CUDA_VSTD::uint32_t>(__enable_input_d)), + "l"(__zero_column_mask_desc) + : "memory"); + } + else _CCCL_IF_CONSTEXPR (__kind == kind_f8f6f4) + { + asm volatile( 
+ "{\n\t .reg .pred PRED_enable_input_d; \n\t" + "setp.ne.b32 PRED_enable_input_d, %4, 0;\n\t" + "tcgen05.mma.ws.cta_group::1.kind::f8f6f4.collector::b1::lastuse [%0], [%1], %2, %3, PRED_enable_input_d, %5;\n\t" + "}" + : + : "r"(__d_tmem), + "r"(__a_tmem), + "l"(__b_desc), + "r"(__idesc), + "r"(static_cast<_CUDA_VSTD::uint32_t>(__enable_input_d)), + "l"(__zero_column_mask_desc) + : "memory"); + } + else _CCCL_IF_CONSTEXPR (__kind == kind_i8) + { + asm volatile( + "{\n\t .reg .pred PRED_enable_input_d; \n\t" + "setp.ne.b32 PRED_enable_input_d, %4, 0;\n\t" + "tcgen05.mma.ws.cta_group::1.kind::i8.collector::b1::lastuse [%0], [%1], %2, %3, PRED_enable_input_d, %5;\n\t" + "}" + : + : "r"(__d_tmem), + "r"(__a_tmem), + "l"(__b_desc), + "r"(__idesc), + "r"(static_cast<_CUDA_VSTD::uint32_t>(__enable_input_d)), + "l"(__zero_column_mask_desc) + : "memory"); + } +# else + // Unsupported architectures will have a linker error with a semi-decent error message + __cuda_ptx_tcgen05_mma_ws_tmem_a_collector_b1_lastuse_is_not_supported_before_SM_100a_SM_101a__(); +# endif +} +#endif // __cccl_ptx_isa >= 860 + +/* +// tcgen05.mma.ws.cta_group.kind.collector::b1::lastuse [d_tmem], [a_tmem], b_desc, idesc, enable_input_d; // PTX ISA +86, SM_100a, SM_101a +// .cta_group = { .cta_group::1 } +// .kind = { .kind::f16, .kind::tf32, .kind::f8f6f4, .kind::i8 } +template +__device__ static inline void tcgen05_mma_ws_tmem_a_collector_b1_lastuse( + cuda::ptx::cta_group_1_t, + cuda::ptx::kind_t kind, + uint32_t d_tmem, + uint32_t a_tmem, + uint64_t b_desc, + uint32_t idesc, + bool enable_input_d); +*/ +#if __cccl_ptx_isa >= 860 +extern "C" _CCCL_DEVICE void +__cuda_ptx_tcgen05_mma_ws_tmem_a_collector_b1_lastuse_is_not_supported_before_SM_100a_SM_101a__(); +template +_CCCL_DEVICE static inline void tcgen05_mma_ws_tmem_a_collector_b1_lastuse( + cta_group_1_t, + kind_t<_Kind> __kind, + _CUDA_VSTD::uint32_t __d_tmem, + _CUDA_VSTD::uint32_t __a_tmem, + _CUDA_VSTD::uint64_t __b_desc, + _CUDA_VSTD::uint32_t __idesc, + bool __enable_input_d) +{ + // __cta_group == cta_group_1 (due to parameter type constraint) + static_assert(__kind == kind_f16 || __kind == kind_tf32 || __kind == kind_f8f6f4 || __kind == kind_i8, ""); +# if _CCCL_CUDA_COMPILER(NVHPC) || __CUDA_ARCH_FEAT_SM100_ALL || __CUDA_ARCH_FEAT_SM101_ALL + _CCCL_IF_CONSTEXPR (__kind == kind_f16) + { + asm volatile( + "{\n\t .reg .pred PRED_enable_input_d; \n\t" + "setp.ne.b32 PRED_enable_input_d, %4, 0;\n\t" + "tcgen05.mma.ws.cta_group::1.kind::f16.collector::b1::lastuse [%0], [%1], %2, %3, PRED_enable_input_d;\n\t" + "}" + : + : "r"(__d_tmem), + "r"(__a_tmem), + "l"(__b_desc), + "r"(__idesc), + "r"(static_cast<_CUDA_VSTD::uint32_t>(__enable_input_d)) + : "memory"); + } + else _CCCL_IF_CONSTEXPR (__kind == kind_tf32) + { + asm volatile( + "{\n\t .reg .pred PRED_enable_input_d; \n\t" + "setp.ne.b32 PRED_enable_input_d, %4, 0;\n\t" + "tcgen05.mma.ws.cta_group::1.kind::tf32.collector::b1::lastuse [%0], [%1], %2, %3, PRED_enable_input_d;\n\t" + "}" + : + : "r"(__d_tmem), + "r"(__a_tmem), + "l"(__b_desc), + "r"(__idesc), + "r"(static_cast<_CUDA_VSTD::uint32_t>(__enable_input_d)) + : "memory"); + } + else _CCCL_IF_CONSTEXPR (__kind == kind_f8f6f4) + { + asm volatile( + "{\n\t .reg .pred PRED_enable_input_d; \n\t" + "setp.ne.b32 PRED_enable_input_d, %4, 0;\n\t" + "tcgen05.mma.ws.cta_group::1.kind::f8f6f4.collector::b1::lastuse [%0], [%1], %2, %3, PRED_enable_input_d;\n\t" + "}" + : + : "r"(__d_tmem), + "r"(__a_tmem), + "l"(__b_desc), + "r"(__idesc), + 
"r"(static_cast<_CUDA_VSTD::uint32_t>(__enable_input_d)) + : "memory"); + } + else _CCCL_IF_CONSTEXPR (__kind == kind_i8) + { + asm volatile( + "{\n\t .reg .pred PRED_enable_input_d; \n\t" + "setp.ne.b32 PRED_enable_input_d, %4, 0;\n\t" + "tcgen05.mma.ws.cta_group::1.kind::i8.collector::b1::lastuse [%0], [%1], %2, %3, PRED_enable_input_d;\n\t" + "}" + : + : "r"(__d_tmem), + "r"(__a_tmem), + "l"(__b_desc), + "r"(__idesc), + "r"(static_cast<_CUDA_VSTD::uint32_t>(__enable_input_d)) + : "memory"); + } +# else + // Unsupported architectures will have a linker error with a semi-decent error message + __cuda_ptx_tcgen05_mma_ws_tmem_a_collector_b1_lastuse_is_not_supported_before_SM_100a_SM_101a__(); +# endif +} +#endif // __cccl_ptx_isa >= 860 + +/* +// tcgen05.mma.ws.cta_group.kind.collector::b1::discard [d_tmem], a_desc, b_desc, idesc, enable_input_d, +zero_column_mask_desc; // PTX ISA 86, SM_100a, SM_101a +// .cta_group = { .cta_group::1 } +// .kind = { .kind::f16, .kind::tf32, .kind::f8f6f4, .kind::i8 } +template +__device__ static inline void tcgen05_mma_ws_collector_b1_discard( + cuda::ptx::cta_group_1_t, + cuda::ptx::kind_t kind, + uint32_t d_tmem, + uint64_t a_desc, + uint64_t b_desc, + uint32_t idesc, + bool enable_input_d, + uint64_t zero_column_mask_desc); +*/ +#if __cccl_ptx_isa >= 860 +extern "C" _CCCL_DEVICE void __cuda_ptx_tcgen05_mma_ws_collector_b1_discard_is_not_supported_before_SM_100a_SM_101a__(); +template +_CCCL_DEVICE static inline void tcgen05_mma_ws_collector_b1_discard( + cta_group_1_t, + kind_t<_Kind> __kind, + _CUDA_VSTD::uint32_t __d_tmem, + _CUDA_VSTD::uint64_t __a_desc, + _CUDA_VSTD::uint64_t __b_desc, + _CUDA_VSTD::uint32_t __idesc, + bool __enable_input_d, + _CUDA_VSTD::uint64_t __zero_column_mask_desc) +{ + // __cta_group == cta_group_1 (due to parameter type constraint) + static_assert(__kind == kind_f16 || __kind == kind_tf32 || __kind == kind_f8f6f4 || __kind == kind_i8, ""); +# if _CCCL_CUDA_COMPILER(NVHPC) || __CUDA_ARCH_FEAT_SM100_ALL || __CUDA_ARCH_FEAT_SM101_ALL + _CCCL_IF_CONSTEXPR (__kind == kind_f16) + { + asm volatile( + "{\n\t .reg .pred PRED_enable_input_d; \n\t" + "setp.ne.b32 PRED_enable_input_d, %4, 0;\n\t" + "tcgen05.mma.ws.cta_group::1.kind::f16.collector::b1::discard [%0], %1, %2, %3, PRED_enable_input_d, %5;\n\t" + "}" + : + : "r"(__d_tmem), + "l"(__a_desc), + "l"(__b_desc), + "r"(__idesc), + "r"(static_cast<_CUDA_VSTD::uint32_t>(__enable_input_d)), + "l"(__zero_column_mask_desc) + : "memory"); + } + else _CCCL_IF_CONSTEXPR (__kind == kind_tf32) + { + asm volatile( + "{\n\t .reg .pred PRED_enable_input_d; \n\t" + "setp.ne.b32 PRED_enable_input_d, %4, 0;\n\t" + "tcgen05.mma.ws.cta_group::1.kind::tf32.collector::b1::discard [%0], %1, %2, %3, PRED_enable_input_d, %5;\n\t" + "}" + : + : "r"(__d_tmem), + "l"(__a_desc), + "l"(__b_desc), + "r"(__idesc), + "r"(static_cast<_CUDA_VSTD::uint32_t>(__enable_input_d)), + "l"(__zero_column_mask_desc) + : "memory"); + } + else _CCCL_IF_CONSTEXPR (__kind == kind_f8f6f4) + { + asm volatile( + "{\n\t .reg .pred PRED_enable_input_d; \n\t" + "setp.ne.b32 PRED_enable_input_d, %4, 0;\n\t" + "tcgen05.mma.ws.cta_group::1.kind::f8f6f4.collector::b1::discard [%0], %1, %2, %3, PRED_enable_input_d, %5;\n\t" + "}" + : + : "r"(__d_tmem), + "l"(__a_desc), + "l"(__b_desc), + "r"(__idesc), + "r"(static_cast<_CUDA_VSTD::uint32_t>(__enable_input_d)), + "l"(__zero_column_mask_desc) + : "memory"); + } + else _CCCL_IF_CONSTEXPR (__kind == kind_i8) + { + asm volatile( + "{\n\t .reg .pred PRED_enable_input_d; \n\t" + "setp.ne.b32 
PRED_enable_input_d, %4, 0;\n\t" + "tcgen05.mma.ws.cta_group::1.kind::i8.collector::b1::discard [%0], %1, %2, %3, PRED_enable_input_d, %5;\n\t" + "}" + : + : "r"(__d_tmem), + "l"(__a_desc), + "l"(__b_desc), + "r"(__idesc), + "r"(static_cast<_CUDA_VSTD::uint32_t>(__enable_input_d)), + "l"(__zero_column_mask_desc) + : "memory"); + } +# else + // Unsupported architectures will have a linker error with a semi-decent error message + __cuda_ptx_tcgen05_mma_ws_collector_b1_discard_is_not_supported_before_SM_100a_SM_101a__(); +# endif +} +#endif // __cccl_ptx_isa >= 860 + +/* +// tcgen05.mma.ws.cta_group.kind.collector::b1::discard [d_tmem], a_desc, b_desc, idesc, enable_input_d; // PTX ISA 86, +SM_100a, SM_101a +// .cta_group = { .cta_group::1 } +// .kind = { .kind::f16, .kind::tf32, .kind::f8f6f4, .kind::i8 } +template +__device__ static inline void tcgen05_mma_ws_collector_b1_discard( + cuda::ptx::cta_group_1_t, + cuda::ptx::kind_t kind, + uint32_t d_tmem, + uint64_t a_desc, + uint64_t b_desc, + uint32_t idesc, + bool enable_input_d); +*/ +#if __cccl_ptx_isa >= 860 +extern "C" _CCCL_DEVICE void __cuda_ptx_tcgen05_mma_ws_collector_b1_discard_is_not_supported_before_SM_100a_SM_101a__(); +template +_CCCL_DEVICE static inline void tcgen05_mma_ws_collector_b1_discard( + cta_group_1_t, + kind_t<_Kind> __kind, + _CUDA_VSTD::uint32_t __d_tmem, + _CUDA_VSTD::uint64_t __a_desc, + _CUDA_VSTD::uint64_t __b_desc, + _CUDA_VSTD::uint32_t __idesc, + bool __enable_input_d) +{ + // __cta_group == cta_group_1 (due to parameter type constraint) + static_assert(__kind == kind_f16 || __kind == kind_tf32 || __kind == kind_f8f6f4 || __kind == kind_i8, ""); +# if _CCCL_CUDA_COMPILER(NVHPC) || __CUDA_ARCH_FEAT_SM100_ALL || __CUDA_ARCH_FEAT_SM101_ALL + _CCCL_IF_CONSTEXPR (__kind == kind_f16) + { + asm volatile( + "{\n\t .reg .pred PRED_enable_input_d; \n\t" + "setp.ne.b32 PRED_enable_input_d, %4, 0;\n\t" + "tcgen05.mma.ws.cta_group::1.kind::f16.collector::b1::discard [%0], %1, %2, %3, PRED_enable_input_d;\n\t" + "}" + : + : "r"(__d_tmem), + "l"(__a_desc), + "l"(__b_desc), + "r"(__idesc), + "r"(static_cast<_CUDA_VSTD::uint32_t>(__enable_input_d)) + : "memory"); + } + else _CCCL_IF_CONSTEXPR (__kind == kind_tf32) + { + asm volatile( + "{\n\t .reg .pred PRED_enable_input_d; \n\t" + "setp.ne.b32 PRED_enable_input_d, %4, 0;\n\t" + "tcgen05.mma.ws.cta_group::1.kind::tf32.collector::b1::discard [%0], %1, %2, %3, PRED_enable_input_d;\n\t" + "}" + : + : "r"(__d_tmem), + "l"(__a_desc), + "l"(__b_desc), + "r"(__idesc), + "r"(static_cast<_CUDA_VSTD::uint32_t>(__enable_input_d)) + : "memory"); + } + else _CCCL_IF_CONSTEXPR (__kind == kind_f8f6f4) + { + asm volatile( + "{\n\t .reg .pred PRED_enable_input_d; \n\t" + "setp.ne.b32 PRED_enable_input_d, %4, 0;\n\t" + "tcgen05.mma.ws.cta_group::1.kind::f8f6f4.collector::b1::discard [%0], %1, %2, %3, PRED_enable_input_d;\n\t" + "}" + : + : "r"(__d_tmem), + "l"(__a_desc), + "l"(__b_desc), + "r"(__idesc), + "r"(static_cast<_CUDA_VSTD::uint32_t>(__enable_input_d)) + : "memory"); + } + else _CCCL_IF_CONSTEXPR (__kind == kind_i8) + { + asm volatile( + "{\n\t .reg .pred PRED_enable_input_d; \n\t" + "setp.ne.b32 PRED_enable_input_d, %4, 0;\n\t" + "tcgen05.mma.ws.cta_group::1.kind::i8.collector::b1::discard [%0], %1, %2, %3, PRED_enable_input_d;\n\t" + "}" + : + : "r"(__d_tmem), + "l"(__a_desc), + "l"(__b_desc), + "r"(__idesc), + "r"(static_cast<_CUDA_VSTD::uint32_t>(__enable_input_d)) + : "memory"); + } +# else + // Unsupported architectures will have a linker error with a semi-decent error message 
+ __cuda_ptx_tcgen05_mma_ws_collector_b1_discard_is_not_supported_before_SM_100a_SM_101a__(); +# endif +} +#endif // __cccl_ptx_isa >= 860 + +/* +// tcgen05.mma.ws.cta_group.kind.collector::b1::discard [d_tmem], [a_tmem], b_desc, idesc, enable_input_d, +zero_column_mask_desc; // PTX ISA 86, SM_100a, SM_101a +// .cta_group = { .cta_group::1 } +// .kind = { .kind::f16, .kind::tf32, .kind::f8f6f4, .kind::i8 } +template +__device__ static inline void tcgen05_mma_ws_tmem_a_collector_b1_discard( + cuda::ptx::cta_group_1_t, + cuda::ptx::kind_t kind, + uint32_t d_tmem, + uint32_t a_tmem, + uint64_t b_desc, + uint32_t idesc, + bool enable_input_d, + uint64_t zero_column_mask_desc); +*/ +#if __cccl_ptx_isa >= 860 +extern "C" _CCCL_DEVICE void +__cuda_ptx_tcgen05_mma_ws_tmem_a_collector_b1_discard_is_not_supported_before_SM_100a_SM_101a__(); +template +_CCCL_DEVICE static inline void tcgen05_mma_ws_tmem_a_collector_b1_discard( + cta_group_1_t, + kind_t<_Kind> __kind, + _CUDA_VSTD::uint32_t __d_tmem, + _CUDA_VSTD::uint32_t __a_tmem, + _CUDA_VSTD::uint64_t __b_desc, + _CUDA_VSTD::uint32_t __idesc, + bool __enable_input_d, + _CUDA_VSTD::uint64_t __zero_column_mask_desc) +{ + // __cta_group == cta_group_1 (due to parameter type constraint) + static_assert(__kind == kind_f16 || __kind == kind_tf32 || __kind == kind_f8f6f4 || __kind == kind_i8, ""); +# if _CCCL_CUDA_COMPILER(NVHPC) || __CUDA_ARCH_FEAT_SM100_ALL || __CUDA_ARCH_FEAT_SM101_ALL + _CCCL_IF_CONSTEXPR (__kind == kind_f16) + { + asm volatile( + "{\n\t .reg .pred PRED_enable_input_d; \n\t" + "setp.ne.b32 PRED_enable_input_d, %4, 0;\n\t" + "tcgen05.mma.ws.cta_group::1.kind::f16.collector::b1::discard [%0], [%1], %2, %3, PRED_enable_input_d, %5;\n\t" + "}" + : + : "r"(__d_tmem), + "r"(__a_tmem), + "l"(__b_desc), + "r"(__idesc), + "r"(static_cast<_CUDA_VSTD::uint32_t>(__enable_input_d)), + "l"(__zero_column_mask_desc) + : "memory"); + } + else _CCCL_IF_CONSTEXPR (__kind == kind_tf32) + { + asm volatile( + "{\n\t .reg .pred PRED_enable_input_d; \n\t" + "setp.ne.b32 PRED_enable_input_d, %4, 0;\n\t" + "tcgen05.mma.ws.cta_group::1.kind::tf32.collector::b1::discard [%0], [%1], %2, %3, PRED_enable_input_d, %5;\n\t" + "}" + : + : "r"(__d_tmem), + "r"(__a_tmem), + "l"(__b_desc), + "r"(__idesc), + "r"(static_cast<_CUDA_VSTD::uint32_t>(__enable_input_d)), + "l"(__zero_column_mask_desc) + : "memory"); + } + else _CCCL_IF_CONSTEXPR (__kind == kind_f8f6f4) + { + asm volatile( + "{\n\t .reg .pred PRED_enable_input_d; \n\t" + "setp.ne.b32 PRED_enable_input_d, %4, 0;\n\t" + "tcgen05.mma.ws.cta_group::1.kind::f8f6f4.collector::b1::discard [%0], [%1], %2, %3, PRED_enable_input_d, %5;\n\t" + "}" + : + : "r"(__d_tmem), + "r"(__a_tmem), + "l"(__b_desc), + "r"(__idesc), + "r"(static_cast<_CUDA_VSTD::uint32_t>(__enable_input_d)), + "l"(__zero_column_mask_desc) + : "memory"); + } + else _CCCL_IF_CONSTEXPR (__kind == kind_i8) + { + asm volatile( + "{\n\t .reg .pred PRED_enable_input_d; \n\t" + "setp.ne.b32 PRED_enable_input_d, %4, 0;\n\t" + "tcgen05.mma.ws.cta_group::1.kind::i8.collector::b1::discard [%0], [%1], %2, %3, PRED_enable_input_d, %5;\n\t" + "}" + : + : "r"(__d_tmem), + "r"(__a_tmem), + "l"(__b_desc), + "r"(__idesc), + "r"(static_cast<_CUDA_VSTD::uint32_t>(__enable_input_d)), + "l"(__zero_column_mask_desc) + : "memory"); + } +# else + // Unsupported architectures will have a linker error with a semi-decent error message + __cuda_ptx_tcgen05_mma_ws_tmem_a_collector_b1_discard_is_not_supported_before_SM_100a_SM_101a__(); +# endif +} +#endif // __cccl_ptx_isa >= 860 + 
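+/*
+Usage sketch (illustrative only, kept as a comment): a minimal call to the
+tensor-memory-A, collector::b1::discard overload defined above, for the kind::f16
+variant. The descriptor and tensor-memory values (d_tmem_addr, a_tmem_addr,
+smem_b_desc, instr_desc, zero_col_mask) and the helper name mma_ws_b1_discard_step
+are placeholders that the caller is assumed to have prepared per the PTX ISA; only
+the call shape is shown. Requires compilation for sm_100a or sm_101a.
+
+  #include <cuda/ptx>
+  #include <cuda/std/cstdint>
+
+  __device__ void mma_ws_b1_discard_step(cuda::std::uint32_t d_tmem_addr,
+                                         cuda::std::uint32_t a_tmem_addr,
+                                         cuda::std::uint64_t smem_b_desc,
+                                         cuda::std::uint32_t instr_desc,
+                                         bool accumulate,
+                                         cuda::std::uint64_t zero_col_mask)
+  {
+    // kind::f16 shown; tf32, f8f6f4 and i8 select a different kind tag.
+    cuda::ptx::tcgen05_mma_ws_tmem_a_collector_b1_discard(
+      cuda::ptx::cta_group_1_t{},
+      cuda::ptx::kind_f16,
+      d_tmem_addr,     // destination accumulator in tensor memory
+      a_tmem_addr,     // A operand in tensor memory
+      smem_b_desc,     // shared-memory matrix descriptor for B
+      instr_desc,      // instruction descriptor
+      accumulate,      // enable_input_d: accumulate into D when true
+      zero_col_mask);  // zero-column mask descriptor
+  }
+*/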
+/* +// tcgen05.mma.ws.cta_group.kind.collector::b1::discard [d_tmem], [a_tmem], b_desc, idesc, enable_input_d; // PTX ISA +86, SM_100a, SM_101a +// .cta_group = { .cta_group::1 } +// .kind = { .kind::f16, .kind::tf32, .kind::f8f6f4, .kind::i8 } +template +__device__ static inline void tcgen05_mma_ws_tmem_a_collector_b1_discard( + cuda::ptx::cta_group_1_t, + cuda::ptx::kind_t kind, + uint32_t d_tmem, + uint32_t a_tmem, + uint64_t b_desc, + uint32_t idesc, + bool enable_input_d); +*/ +#if __cccl_ptx_isa >= 860 +extern "C" _CCCL_DEVICE void +__cuda_ptx_tcgen05_mma_ws_tmem_a_collector_b1_discard_is_not_supported_before_SM_100a_SM_101a__(); +template +_CCCL_DEVICE static inline void tcgen05_mma_ws_tmem_a_collector_b1_discard( + cta_group_1_t, + kind_t<_Kind> __kind, + _CUDA_VSTD::uint32_t __d_tmem, + _CUDA_VSTD::uint32_t __a_tmem, + _CUDA_VSTD::uint64_t __b_desc, + _CUDA_VSTD::uint32_t __idesc, + bool __enable_input_d) +{ + // __cta_group == cta_group_1 (due to parameter type constraint) + static_assert(__kind == kind_f16 || __kind == kind_tf32 || __kind == kind_f8f6f4 || __kind == kind_i8, ""); +# if _CCCL_CUDA_COMPILER(NVHPC) || __CUDA_ARCH_FEAT_SM100_ALL || __CUDA_ARCH_FEAT_SM101_ALL + _CCCL_IF_CONSTEXPR (__kind == kind_f16) + { + asm volatile( + "{\n\t .reg .pred PRED_enable_input_d; \n\t" + "setp.ne.b32 PRED_enable_input_d, %4, 0;\n\t" + "tcgen05.mma.ws.cta_group::1.kind::f16.collector::b1::discard [%0], [%1], %2, %3, PRED_enable_input_d;\n\t" + "}" + : + : "r"(__d_tmem), + "r"(__a_tmem), + "l"(__b_desc), + "r"(__idesc), + "r"(static_cast<_CUDA_VSTD::uint32_t>(__enable_input_d)) + : "memory"); + } + else _CCCL_IF_CONSTEXPR (__kind == kind_tf32) + { + asm volatile( + "{\n\t .reg .pred PRED_enable_input_d; \n\t" + "setp.ne.b32 PRED_enable_input_d, %4, 0;\n\t" + "tcgen05.mma.ws.cta_group::1.kind::tf32.collector::b1::discard [%0], [%1], %2, %3, PRED_enable_input_d;\n\t" + "}" + : + : "r"(__d_tmem), + "r"(__a_tmem), + "l"(__b_desc), + "r"(__idesc), + "r"(static_cast<_CUDA_VSTD::uint32_t>(__enable_input_d)) + : "memory"); + } + else _CCCL_IF_CONSTEXPR (__kind == kind_f8f6f4) + { + asm volatile( + "{\n\t .reg .pred PRED_enable_input_d; \n\t" + "setp.ne.b32 PRED_enable_input_d, %4, 0;\n\t" + "tcgen05.mma.ws.cta_group::1.kind::f8f6f4.collector::b1::discard [%0], [%1], %2, %3, PRED_enable_input_d;\n\t" + "}" + : + : "r"(__d_tmem), + "r"(__a_tmem), + "l"(__b_desc), + "r"(__idesc), + "r"(static_cast<_CUDA_VSTD::uint32_t>(__enable_input_d)) + : "memory"); + } + else _CCCL_IF_CONSTEXPR (__kind == kind_i8) + { + asm volatile( + "{\n\t .reg .pred PRED_enable_input_d; \n\t" + "setp.ne.b32 PRED_enable_input_d, %4, 0;\n\t" + "tcgen05.mma.ws.cta_group::1.kind::i8.collector::b1::discard [%0], [%1], %2, %3, PRED_enable_input_d;\n\t" + "}" + : + : "r"(__d_tmem), + "r"(__a_tmem), + "l"(__b_desc), + "r"(__idesc), + "r"(static_cast<_CUDA_VSTD::uint32_t>(__enable_input_d)) + : "memory"); + } +# else + // Unsupported architectures will have a linker error with a semi-decent error message + __cuda_ptx_tcgen05_mma_ws_tmem_a_collector_b1_discard_is_not_supported_before_SM_100a_SM_101a__(); +# endif +} +#endif // __cccl_ptx_isa >= 860 + +/* +// tcgen05.mma.ws.cta_group.kind.collector::b2::fill [d_tmem], a_desc, b_desc, idesc, enable_input_d, +zero_column_mask_desc; // PTX ISA 86, SM_100a, SM_101a +// .cta_group = { .cta_group::1 } +// .kind = { .kind::f16, .kind::tf32, .kind::f8f6f4, .kind::i8 } +template +__device__ static inline void tcgen05_mma_ws_collector_b2_fill( + cuda::ptx::cta_group_1_t, + cuda::ptx::kind_t 
kind, + uint32_t d_tmem, + uint64_t a_desc, + uint64_t b_desc, + uint32_t idesc, + bool enable_input_d, + uint64_t zero_column_mask_desc); +*/ +#if __cccl_ptx_isa >= 860 +extern "C" _CCCL_DEVICE void __cuda_ptx_tcgen05_mma_ws_collector_b2_fill_is_not_supported_before_SM_100a_SM_101a__(); +template +_CCCL_DEVICE static inline void tcgen05_mma_ws_collector_b2_fill( + cta_group_1_t, + kind_t<_Kind> __kind, + _CUDA_VSTD::uint32_t __d_tmem, + _CUDA_VSTD::uint64_t __a_desc, + _CUDA_VSTD::uint64_t __b_desc, + _CUDA_VSTD::uint32_t __idesc, + bool __enable_input_d, + _CUDA_VSTD::uint64_t __zero_column_mask_desc) +{ + // __cta_group == cta_group_1 (due to parameter type constraint) + static_assert(__kind == kind_f16 || __kind == kind_tf32 || __kind == kind_f8f6f4 || __kind == kind_i8, ""); +# if _CCCL_CUDA_COMPILER(NVHPC) || __CUDA_ARCH_FEAT_SM100_ALL || __CUDA_ARCH_FEAT_SM101_ALL + _CCCL_IF_CONSTEXPR (__kind == kind_f16) + { + asm volatile( + "{\n\t .reg .pred PRED_enable_input_d; \n\t" + "setp.ne.b32 PRED_enable_input_d, %4, 0;\n\t" + "tcgen05.mma.ws.cta_group::1.kind::f16.collector::b2::fill [%0], %1, %2, %3, PRED_enable_input_d, %5;\n\t" + "}" + : + : "r"(__d_tmem), + "l"(__a_desc), + "l"(__b_desc), + "r"(__idesc), + "r"(static_cast<_CUDA_VSTD::uint32_t>(__enable_input_d)), + "l"(__zero_column_mask_desc) + : "memory"); + } + else _CCCL_IF_CONSTEXPR (__kind == kind_tf32) + { + asm volatile( + "{\n\t .reg .pred PRED_enable_input_d; \n\t" + "setp.ne.b32 PRED_enable_input_d, %4, 0;\n\t" + "tcgen05.mma.ws.cta_group::1.kind::tf32.collector::b2::fill [%0], %1, %2, %3, PRED_enable_input_d, %5;\n\t" + "}" + : + : "r"(__d_tmem), + "l"(__a_desc), + "l"(__b_desc), + "r"(__idesc), + "r"(static_cast<_CUDA_VSTD::uint32_t>(__enable_input_d)), + "l"(__zero_column_mask_desc) + : "memory"); + } + else _CCCL_IF_CONSTEXPR (__kind == kind_f8f6f4) + { + asm volatile( + "{\n\t .reg .pred PRED_enable_input_d; \n\t" + "setp.ne.b32 PRED_enable_input_d, %4, 0;\n\t" + "tcgen05.mma.ws.cta_group::1.kind::f8f6f4.collector::b2::fill [%0], %1, %2, %3, PRED_enable_input_d, %5;\n\t" + "}" + : + : "r"(__d_tmem), + "l"(__a_desc), + "l"(__b_desc), + "r"(__idesc), + "r"(static_cast<_CUDA_VSTD::uint32_t>(__enable_input_d)), + "l"(__zero_column_mask_desc) + : "memory"); + } + else _CCCL_IF_CONSTEXPR (__kind == kind_i8) + { + asm volatile( + "{\n\t .reg .pred PRED_enable_input_d; \n\t" + "setp.ne.b32 PRED_enable_input_d, %4, 0;\n\t" + "tcgen05.mma.ws.cta_group::1.kind::i8.collector::b2::fill [%0], %1, %2, %3, PRED_enable_input_d, %5;\n\t" + "}" + : + : "r"(__d_tmem), + "l"(__a_desc), + "l"(__b_desc), + "r"(__idesc), + "r"(static_cast<_CUDA_VSTD::uint32_t>(__enable_input_d)), + "l"(__zero_column_mask_desc) + : "memory"); + } +# else + // Unsupported architectures will have a linker error with a semi-decent error message + __cuda_ptx_tcgen05_mma_ws_collector_b2_fill_is_not_supported_before_SM_100a_SM_101a__(); +# endif +} +#endif // __cccl_ptx_isa >= 860 + +/* +// tcgen05.mma.ws.cta_group.kind.collector::b2::fill [d_tmem], a_desc, b_desc, idesc, enable_input_d; // PTX ISA 86, +SM_100a, SM_101a +// .cta_group = { .cta_group::1 } +// .kind = { .kind::f16, .kind::tf32, .kind::f8f6f4, .kind::i8 } +template +__device__ static inline void tcgen05_mma_ws_collector_b2_fill( + cuda::ptx::cta_group_1_t, + cuda::ptx::kind_t kind, + uint32_t d_tmem, + uint64_t a_desc, + uint64_t b_desc, + uint32_t idesc, + bool enable_input_d); +*/ +#if __cccl_ptx_isa >= 860 +extern "C" _CCCL_DEVICE void 
__cuda_ptx_tcgen05_mma_ws_collector_b2_fill_is_not_supported_before_SM_100a_SM_101a__(); +template +_CCCL_DEVICE static inline void tcgen05_mma_ws_collector_b2_fill( + cta_group_1_t, + kind_t<_Kind> __kind, + _CUDA_VSTD::uint32_t __d_tmem, + _CUDA_VSTD::uint64_t __a_desc, + _CUDA_VSTD::uint64_t __b_desc, + _CUDA_VSTD::uint32_t __idesc, + bool __enable_input_d) +{ + // __cta_group == cta_group_1 (due to parameter type constraint) + static_assert(__kind == kind_f16 || __kind == kind_tf32 || __kind == kind_f8f6f4 || __kind == kind_i8, ""); +# if _CCCL_CUDA_COMPILER(NVHPC) || __CUDA_ARCH_FEAT_SM100_ALL || __CUDA_ARCH_FEAT_SM101_ALL + _CCCL_IF_CONSTEXPR (__kind == kind_f16) + { + asm volatile( + "{\n\t .reg .pred PRED_enable_input_d; \n\t" + "setp.ne.b32 PRED_enable_input_d, %4, 0;\n\t" + "tcgen05.mma.ws.cta_group::1.kind::f16.collector::b2::fill [%0], %1, %2, %3, PRED_enable_input_d;\n\t" + "}" + : + : "r"(__d_tmem), + "l"(__a_desc), + "l"(__b_desc), + "r"(__idesc), + "r"(static_cast<_CUDA_VSTD::uint32_t>(__enable_input_d)) + : "memory"); + } + else _CCCL_IF_CONSTEXPR (__kind == kind_tf32) + { + asm volatile( + "{\n\t .reg .pred PRED_enable_input_d; \n\t" + "setp.ne.b32 PRED_enable_input_d, %4, 0;\n\t" + "tcgen05.mma.ws.cta_group::1.kind::tf32.collector::b2::fill [%0], %1, %2, %3, PRED_enable_input_d;\n\t" + "}" + : + : "r"(__d_tmem), + "l"(__a_desc), + "l"(__b_desc), + "r"(__idesc), + "r"(static_cast<_CUDA_VSTD::uint32_t>(__enable_input_d)) + : "memory"); + } + else _CCCL_IF_CONSTEXPR (__kind == kind_f8f6f4) + { + asm volatile( + "{\n\t .reg .pred PRED_enable_input_d; \n\t" + "setp.ne.b32 PRED_enable_input_d, %4, 0;\n\t" + "tcgen05.mma.ws.cta_group::1.kind::f8f6f4.collector::b2::fill [%0], %1, %2, %3, PRED_enable_input_d;\n\t" + "}" + : + : "r"(__d_tmem), + "l"(__a_desc), + "l"(__b_desc), + "r"(__idesc), + "r"(static_cast<_CUDA_VSTD::uint32_t>(__enable_input_d)) + : "memory"); + } + else _CCCL_IF_CONSTEXPR (__kind == kind_i8) + { + asm volatile( + "{\n\t .reg .pred PRED_enable_input_d; \n\t" + "setp.ne.b32 PRED_enable_input_d, %4, 0;\n\t" + "tcgen05.mma.ws.cta_group::1.kind::i8.collector::b2::fill [%0], %1, %2, %3, PRED_enable_input_d;\n\t" + "}" + : + : "r"(__d_tmem), + "l"(__a_desc), + "l"(__b_desc), + "r"(__idesc), + "r"(static_cast<_CUDA_VSTD::uint32_t>(__enable_input_d)) + : "memory"); + } +# else + // Unsupported architectures will have a linker error with a semi-decent error message + __cuda_ptx_tcgen05_mma_ws_collector_b2_fill_is_not_supported_before_SM_100a_SM_101a__(); +# endif +} +#endif // __cccl_ptx_isa >= 860 + +/* +// tcgen05.mma.ws.cta_group.kind.collector::b2::fill [d_tmem], [a_tmem], b_desc, idesc, enable_input_d, +zero_column_mask_desc; // PTX ISA 86, SM_100a, SM_101a +// .cta_group = { .cta_group::1 } +// .kind = { .kind::f16, .kind::tf32, .kind::f8f6f4, .kind::i8 } +template +__device__ static inline void tcgen05_mma_ws_tmem_a_collector_b2_fill( + cuda::ptx::cta_group_1_t, + cuda::ptx::kind_t kind, + uint32_t d_tmem, + uint32_t a_tmem, + uint64_t b_desc, + uint32_t idesc, + bool enable_input_d, + uint64_t zero_column_mask_desc); +*/ +#if __cccl_ptx_isa >= 860 +extern "C" _CCCL_DEVICE void +__cuda_ptx_tcgen05_mma_ws_tmem_a_collector_b2_fill_is_not_supported_before_SM_100a_SM_101a__(); +template +_CCCL_DEVICE static inline void tcgen05_mma_ws_tmem_a_collector_b2_fill( + cta_group_1_t, + kind_t<_Kind> __kind, + _CUDA_VSTD::uint32_t __d_tmem, + _CUDA_VSTD::uint32_t __a_tmem, + _CUDA_VSTD::uint64_t __b_desc, + _CUDA_VSTD::uint32_t __idesc, + bool __enable_input_d, + 
_CUDA_VSTD::uint64_t __zero_column_mask_desc) +{ + // __cta_group == cta_group_1 (due to parameter type constraint) + static_assert(__kind == kind_f16 || __kind == kind_tf32 || __kind == kind_f8f6f4 || __kind == kind_i8, ""); +# if _CCCL_CUDA_COMPILER(NVHPC) || __CUDA_ARCH_FEAT_SM100_ALL || __CUDA_ARCH_FEAT_SM101_ALL + _CCCL_IF_CONSTEXPR (__kind == kind_f16) + { + asm volatile( + "{\n\t .reg .pred PRED_enable_input_d; \n\t" + "setp.ne.b32 PRED_enable_input_d, %4, 0;\n\t" + "tcgen05.mma.ws.cta_group::1.kind::f16.collector::b2::fill [%0], [%1], %2, %3, PRED_enable_input_d, %5;\n\t" + "}" + : + : "r"(__d_tmem), + "r"(__a_tmem), + "l"(__b_desc), + "r"(__idesc), + "r"(static_cast<_CUDA_VSTD::uint32_t>(__enable_input_d)), + "l"(__zero_column_mask_desc) + : "memory"); + } + else _CCCL_IF_CONSTEXPR (__kind == kind_tf32) + { + asm volatile( + "{\n\t .reg .pred PRED_enable_input_d; \n\t" + "setp.ne.b32 PRED_enable_input_d, %4, 0;\n\t" + "tcgen05.mma.ws.cta_group::1.kind::tf32.collector::b2::fill [%0], [%1], %2, %3, PRED_enable_input_d, %5;\n\t" + "}" + : + : "r"(__d_tmem), + "r"(__a_tmem), + "l"(__b_desc), + "r"(__idesc), + "r"(static_cast<_CUDA_VSTD::uint32_t>(__enable_input_d)), + "l"(__zero_column_mask_desc) + : "memory"); + } + else _CCCL_IF_CONSTEXPR (__kind == kind_f8f6f4) + { + asm volatile( + "{\n\t .reg .pred PRED_enable_input_d; \n\t" + "setp.ne.b32 PRED_enable_input_d, %4, 0;\n\t" + "tcgen05.mma.ws.cta_group::1.kind::f8f6f4.collector::b2::fill [%0], [%1], %2, %3, PRED_enable_input_d, %5;\n\t" + "}" + : + : "r"(__d_tmem), + "r"(__a_tmem), + "l"(__b_desc), + "r"(__idesc), + "r"(static_cast<_CUDA_VSTD::uint32_t>(__enable_input_d)), + "l"(__zero_column_mask_desc) + : "memory"); + } + else _CCCL_IF_CONSTEXPR (__kind == kind_i8) + { + asm volatile( + "{\n\t .reg .pred PRED_enable_input_d; \n\t" + "setp.ne.b32 PRED_enable_input_d, %4, 0;\n\t" + "tcgen05.mma.ws.cta_group::1.kind::i8.collector::b2::fill [%0], [%1], %2, %3, PRED_enable_input_d, %5;\n\t" + "}" + : + : "r"(__d_tmem), + "r"(__a_tmem), + "l"(__b_desc), + "r"(__idesc), + "r"(static_cast<_CUDA_VSTD::uint32_t>(__enable_input_d)), + "l"(__zero_column_mask_desc) + : "memory"); + } +# else + // Unsupported architectures will have a linker error with a semi-decent error message + __cuda_ptx_tcgen05_mma_ws_tmem_a_collector_b2_fill_is_not_supported_before_SM_100a_SM_101a__(); +# endif +} +#endif // __cccl_ptx_isa >= 860 + +/* +// tcgen05.mma.ws.cta_group.kind.collector::b2::fill [d_tmem], [a_tmem], b_desc, idesc, enable_input_d; // PTX ISA 86, +SM_100a, SM_101a +// .cta_group = { .cta_group::1 } +// .kind = { .kind::f16, .kind::tf32, .kind::f8f6f4, .kind::i8 } +template +__device__ static inline void tcgen05_mma_ws_tmem_a_collector_b2_fill( + cuda::ptx::cta_group_1_t, + cuda::ptx::kind_t kind, + uint32_t d_tmem, + uint32_t a_tmem, + uint64_t b_desc, + uint32_t idesc, + bool enable_input_d); +*/ +#if __cccl_ptx_isa >= 860 +extern "C" _CCCL_DEVICE void +__cuda_ptx_tcgen05_mma_ws_tmem_a_collector_b2_fill_is_not_supported_before_SM_100a_SM_101a__(); +template +_CCCL_DEVICE static inline void tcgen05_mma_ws_tmem_a_collector_b2_fill( + cta_group_1_t, + kind_t<_Kind> __kind, + _CUDA_VSTD::uint32_t __d_tmem, + _CUDA_VSTD::uint32_t __a_tmem, + _CUDA_VSTD::uint64_t __b_desc, + _CUDA_VSTD::uint32_t __idesc, + bool __enable_input_d) +{ + // __cta_group == cta_group_1 (due to parameter type constraint) + static_assert(__kind == kind_f16 || __kind == kind_tf32 || __kind == kind_f8f6f4 || __kind == kind_i8, ""); +# if _CCCL_CUDA_COMPILER(NVHPC) || 
__CUDA_ARCH_FEAT_SM100_ALL || __CUDA_ARCH_FEAT_SM101_ALL + _CCCL_IF_CONSTEXPR (__kind == kind_f16) + { + asm volatile( + "{\n\t .reg .pred PRED_enable_input_d; \n\t" + "setp.ne.b32 PRED_enable_input_d, %4, 0;\n\t" + "tcgen05.mma.ws.cta_group::1.kind::f16.collector::b2::fill [%0], [%1], %2, %3, PRED_enable_input_d;\n\t" + "}" + : + : "r"(__d_tmem), + "r"(__a_tmem), + "l"(__b_desc), + "r"(__idesc), + "r"(static_cast<_CUDA_VSTD::uint32_t>(__enable_input_d)) + : "memory"); + } + else _CCCL_IF_CONSTEXPR (__kind == kind_tf32) + { + asm volatile( + "{\n\t .reg .pred PRED_enable_input_d; \n\t" + "setp.ne.b32 PRED_enable_input_d, %4, 0;\n\t" + "tcgen05.mma.ws.cta_group::1.kind::tf32.collector::b2::fill [%0], [%1], %2, %3, PRED_enable_input_d;\n\t" + "}" + : + : "r"(__d_tmem), + "r"(__a_tmem), + "l"(__b_desc), + "r"(__idesc), + "r"(static_cast<_CUDA_VSTD::uint32_t>(__enable_input_d)) + : "memory"); + } + else _CCCL_IF_CONSTEXPR (__kind == kind_f8f6f4) + { + asm volatile( + "{\n\t .reg .pred PRED_enable_input_d; \n\t" + "setp.ne.b32 PRED_enable_input_d, %4, 0;\n\t" + "tcgen05.mma.ws.cta_group::1.kind::f8f6f4.collector::b2::fill [%0], [%1], %2, %3, PRED_enable_input_d;\n\t" + "}" + : + : "r"(__d_tmem), + "r"(__a_tmem), + "l"(__b_desc), + "r"(__idesc), + "r"(static_cast<_CUDA_VSTD::uint32_t>(__enable_input_d)) + : "memory"); + } + else _CCCL_IF_CONSTEXPR (__kind == kind_i8) + { + asm volatile( + "{\n\t .reg .pred PRED_enable_input_d; \n\t" + "setp.ne.b32 PRED_enable_input_d, %4, 0;\n\t" + "tcgen05.mma.ws.cta_group::1.kind::i8.collector::b2::fill [%0], [%1], %2, %3, PRED_enable_input_d;\n\t" + "}" + : + : "r"(__d_tmem), + "r"(__a_tmem), + "l"(__b_desc), + "r"(__idesc), + "r"(static_cast<_CUDA_VSTD::uint32_t>(__enable_input_d)) + : "memory"); + } +# else + // Unsupported architectures will have a linker error with a semi-decent error message + __cuda_ptx_tcgen05_mma_ws_tmem_a_collector_b2_fill_is_not_supported_before_SM_100a_SM_101a__(); +# endif +} +#endif // __cccl_ptx_isa >= 860 + +/* +// tcgen05.mma.ws.cta_group.kind.collector::b2::use [d_tmem], a_desc, b_desc, idesc, enable_input_d, +zero_column_mask_desc; // PTX ISA 86, SM_100a, SM_101a +// .cta_group = { .cta_group::1 } +// .kind = { .kind::f16, .kind::tf32, .kind::f8f6f4, .kind::i8 } +template +__device__ static inline void tcgen05_mma_ws_collector_b2_use( + cuda::ptx::cta_group_1_t, + cuda::ptx::kind_t kind, + uint32_t d_tmem, + uint64_t a_desc, + uint64_t b_desc, + uint32_t idesc, + bool enable_input_d, + uint64_t zero_column_mask_desc); +*/ +#if __cccl_ptx_isa >= 860 +extern "C" _CCCL_DEVICE void __cuda_ptx_tcgen05_mma_ws_collector_b2_use_is_not_supported_before_SM_100a_SM_101a__(); +template +_CCCL_DEVICE static inline void tcgen05_mma_ws_collector_b2_use( + cta_group_1_t, + kind_t<_Kind> __kind, + _CUDA_VSTD::uint32_t __d_tmem, + _CUDA_VSTD::uint64_t __a_desc, + _CUDA_VSTD::uint64_t __b_desc, + _CUDA_VSTD::uint32_t __idesc, + bool __enable_input_d, + _CUDA_VSTD::uint64_t __zero_column_mask_desc) +{ + // __cta_group == cta_group_1 (due to parameter type constraint) + static_assert(__kind == kind_f16 || __kind == kind_tf32 || __kind == kind_f8f6f4 || __kind == kind_i8, ""); +# if _CCCL_CUDA_COMPILER(NVHPC) || __CUDA_ARCH_FEAT_SM100_ALL || __CUDA_ARCH_FEAT_SM101_ALL + _CCCL_IF_CONSTEXPR (__kind == kind_f16) + { + asm volatile( + "{\n\t .reg .pred PRED_enable_input_d; \n\t" + "setp.ne.b32 PRED_enable_input_d, %4, 0;\n\t" + "tcgen05.mma.ws.cta_group::1.kind::f16.collector::b2::use [%0], %1, %2, %3, PRED_enable_input_d, %5;\n\t" + "}" + : + : 
"r"(__d_tmem), + "l"(__a_desc), + "l"(__b_desc), + "r"(__idesc), + "r"(static_cast<_CUDA_VSTD::uint32_t>(__enable_input_d)), + "l"(__zero_column_mask_desc) + : "memory"); + } + else _CCCL_IF_CONSTEXPR (__kind == kind_tf32) + { + asm volatile( + "{\n\t .reg .pred PRED_enable_input_d; \n\t" + "setp.ne.b32 PRED_enable_input_d, %4, 0;\n\t" + "tcgen05.mma.ws.cta_group::1.kind::tf32.collector::b2::use [%0], %1, %2, %3, PRED_enable_input_d, %5;\n\t" + "}" + : + : "r"(__d_tmem), + "l"(__a_desc), + "l"(__b_desc), + "r"(__idesc), + "r"(static_cast<_CUDA_VSTD::uint32_t>(__enable_input_d)), + "l"(__zero_column_mask_desc) + : "memory"); + } + else _CCCL_IF_CONSTEXPR (__kind == kind_f8f6f4) + { + asm volatile( + "{\n\t .reg .pred PRED_enable_input_d; \n\t" + "setp.ne.b32 PRED_enable_input_d, %4, 0;\n\t" + "tcgen05.mma.ws.cta_group::1.kind::f8f6f4.collector::b2::use [%0], %1, %2, %3, PRED_enable_input_d, %5;\n\t" + "}" + : + : "r"(__d_tmem), + "l"(__a_desc), + "l"(__b_desc), + "r"(__idesc), + "r"(static_cast<_CUDA_VSTD::uint32_t>(__enable_input_d)), + "l"(__zero_column_mask_desc) + : "memory"); + } + else _CCCL_IF_CONSTEXPR (__kind == kind_i8) + { + asm volatile( + "{\n\t .reg .pred PRED_enable_input_d; \n\t" + "setp.ne.b32 PRED_enable_input_d, %4, 0;\n\t" + "tcgen05.mma.ws.cta_group::1.kind::i8.collector::b2::use [%0], %1, %2, %3, PRED_enable_input_d, %5;\n\t" + "}" + : + : "r"(__d_tmem), + "l"(__a_desc), + "l"(__b_desc), + "r"(__idesc), + "r"(static_cast<_CUDA_VSTD::uint32_t>(__enable_input_d)), + "l"(__zero_column_mask_desc) + : "memory"); + } +# else + // Unsupported architectures will have a linker error with a semi-decent error message + __cuda_ptx_tcgen05_mma_ws_collector_b2_use_is_not_supported_before_SM_100a_SM_101a__(); +# endif +} +#endif // __cccl_ptx_isa >= 860 + +/* +// tcgen05.mma.ws.cta_group.kind.collector::b2::use [d_tmem], a_desc, b_desc, idesc, enable_input_d; // PTX ISA 86, +SM_100a, SM_101a +// .cta_group = { .cta_group::1 } +// .kind = { .kind::f16, .kind::tf32, .kind::f8f6f4, .kind::i8 } +template +__device__ static inline void tcgen05_mma_ws_collector_b2_use( + cuda::ptx::cta_group_1_t, + cuda::ptx::kind_t kind, + uint32_t d_tmem, + uint64_t a_desc, + uint64_t b_desc, + uint32_t idesc, + bool enable_input_d); +*/ +#if __cccl_ptx_isa >= 860 +extern "C" _CCCL_DEVICE void __cuda_ptx_tcgen05_mma_ws_collector_b2_use_is_not_supported_before_SM_100a_SM_101a__(); +template +_CCCL_DEVICE static inline void tcgen05_mma_ws_collector_b2_use( + cta_group_1_t, + kind_t<_Kind> __kind, + _CUDA_VSTD::uint32_t __d_tmem, + _CUDA_VSTD::uint64_t __a_desc, + _CUDA_VSTD::uint64_t __b_desc, + _CUDA_VSTD::uint32_t __idesc, + bool __enable_input_d) +{ + // __cta_group == cta_group_1 (due to parameter type constraint) + static_assert(__kind == kind_f16 || __kind == kind_tf32 || __kind == kind_f8f6f4 || __kind == kind_i8, ""); +# if _CCCL_CUDA_COMPILER(NVHPC) || __CUDA_ARCH_FEAT_SM100_ALL || __CUDA_ARCH_FEAT_SM101_ALL + _CCCL_IF_CONSTEXPR (__kind == kind_f16) + { + asm volatile( + "{\n\t .reg .pred PRED_enable_input_d; \n\t" + "setp.ne.b32 PRED_enable_input_d, %4, 0;\n\t" + "tcgen05.mma.ws.cta_group::1.kind::f16.collector::b2::use [%0], %1, %2, %3, PRED_enable_input_d;\n\t" + "}" + : + : "r"(__d_tmem), + "l"(__a_desc), + "l"(__b_desc), + "r"(__idesc), + "r"(static_cast<_CUDA_VSTD::uint32_t>(__enable_input_d)) + : "memory"); + } + else _CCCL_IF_CONSTEXPR (__kind == kind_tf32) + { + asm volatile( + "{\n\t .reg .pred PRED_enable_input_d; \n\t" + "setp.ne.b32 PRED_enable_input_d, %4, 0;\n\t" + 
"tcgen05.mma.ws.cta_group::1.kind::tf32.collector::b2::use [%0], %1, %2, %3, PRED_enable_input_d;\n\t" + "}" + : + : "r"(__d_tmem), + "l"(__a_desc), + "l"(__b_desc), + "r"(__idesc), + "r"(static_cast<_CUDA_VSTD::uint32_t>(__enable_input_d)) + : "memory"); + } + else _CCCL_IF_CONSTEXPR (__kind == kind_f8f6f4) + { + asm volatile( + "{\n\t .reg .pred PRED_enable_input_d; \n\t" + "setp.ne.b32 PRED_enable_input_d, %4, 0;\n\t" + "tcgen05.mma.ws.cta_group::1.kind::f8f6f4.collector::b2::use [%0], %1, %2, %3, PRED_enable_input_d;\n\t" + "}" + : + : "r"(__d_tmem), + "l"(__a_desc), + "l"(__b_desc), + "r"(__idesc), + "r"(static_cast<_CUDA_VSTD::uint32_t>(__enable_input_d)) + : "memory"); + } + else _CCCL_IF_CONSTEXPR (__kind == kind_i8) + { + asm volatile( + "{\n\t .reg .pred PRED_enable_input_d; \n\t" + "setp.ne.b32 PRED_enable_input_d, %4, 0;\n\t" + "tcgen05.mma.ws.cta_group::1.kind::i8.collector::b2::use [%0], %1, %2, %3, PRED_enable_input_d;\n\t" + "}" + : + : "r"(__d_tmem), + "l"(__a_desc), + "l"(__b_desc), + "r"(__idesc), + "r"(static_cast<_CUDA_VSTD::uint32_t>(__enable_input_d)) + : "memory"); + } +# else + // Unsupported architectures will have a linker error with a semi-decent error message + __cuda_ptx_tcgen05_mma_ws_collector_b2_use_is_not_supported_before_SM_100a_SM_101a__(); +# endif +} +#endif // __cccl_ptx_isa >= 860 + +/* +// tcgen05.mma.ws.cta_group.kind.collector::b2::use [d_tmem], [a_tmem], b_desc, idesc, enable_input_d, +zero_column_mask_desc; // PTX ISA 86, SM_100a, SM_101a +// .cta_group = { .cta_group::1 } +// .kind = { .kind::f16, .kind::tf32, .kind::f8f6f4, .kind::i8 } +template +__device__ static inline void tcgen05_mma_ws_tmem_a_collector_b2_use( + cuda::ptx::cta_group_1_t, + cuda::ptx::kind_t kind, + uint32_t d_tmem, + uint32_t a_tmem, + uint64_t b_desc, + uint32_t idesc, + bool enable_input_d, + uint64_t zero_column_mask_desc); +*/ +#if __cccl_ptx_isa >= 860 +extern "C" _CCCL_DEVICE void +__cuda_ptx_tcgen05_mma_ws_tmem_a_collector_b2_use_is_not_supported_before_SM_100a_SM_101a__(); +template +_CCCL_DEVICE static inline void tcgen05_mma_ws_tmem_a_collector_b2_use( + cta_group_1_t, + kind_t<_Kind> __kind, + _CUDA_VSTD::uint32_t __d_tmem, + _CUDA_VSTD::uint32_t __a_tmem, + _CUDA_VSTD::uint64_t __b_desc, + _CUDA_VSTD::uint32_t __idesc, + bool __enable_input_d, + _CUDA_VSTD::uint64_t __zero_column_mask_desc) +{ + // __cta_group == cta_group_1 (due to parameter type constraint) + static_assert(__kind == kind_f16 || __kind == kind_tf32 || __kind == kind_f8f6f4 || __kind == kind_i8, ""); +# if _CCCL_CUDA_COMPILER(NVHPC) || __CUDA_ARCH_FEAT_SM100_ALL || __CUDA_ARCH_FEAT_SM101_ALL + _CCCL_IF_CONSTEXPR (__kind == kind_f16) + { + asm volatile( + "{\n\t .reg .pred PRED_enable_input_d; \n\t" + "setp.ne.b32 PRED_enable_input_d, %4, 0;\n\t" + "tcgen05.mma.ws.cta_group::1.kind::f16.collector::b2::use [%0], [%1], %2, %3, PRED_enable_input_d, %5;\n\t" + "}" + : + : "r"(__d_tmem), + "r"(__a_tmem), + "l"(__b_desc), + "r"(__idesc), + "r"(static_cast<_CUDA_VSTD::uint32_t>(__enable_input_d)), + "l"(__zero_column_mask_desc) + : "memory"); + } + else _CCCL_IF_CONSTEXPR (__kind == kind_tf32) + { + asm volatile( + "{\n\t .reg .pred PRED_enable_input_d; \n\t" + "setp.ne.b32 PRED_enable_input_d, %4, 0;\n\t" + "tcgen05.mma.ws.cta_group::1.kind::tf32.collector::b2::use [%0], [%1], %2, %3, PRED_enable_input_d, %5;\n\t" + "}" + : + : "r"(__d_tmem), + "r"(__a_tmem), + "l"(__b_desc), + "r"(__idesc), + "r"(static_cast<_CUDA_VSTD::uint32_t>(__enable_input_d)), + "l"(__zero_column_mask_desc) + : "memory"); + } 
+ else _CCCL_IF_CONSTEXPR (__kind == kind_f8f6f4) + { + asm volatile( + "{\n\t .reg .pred PRED_enable_input_d; \n\t" + "setp.ne.b32 PRED_enable_input_d, %4, 0;\n\t" + "tcgen05.mma.ws.cta_group::1.kind::f8f6f4.collector::b2::use [%0], [%1], %2, %3, PRED_enable_input_d, %5;\n\t" + "}" + : + : "r"(__d_tmem), + "r"(__a_tmem), + "l"(__b_desc), + "r"(__idesc), + "r"(static_cast<_CUDA_VSTD::uint32_t>(__enable_input_d)), + "l"(__zero_column_mask_desc) + : "memory"); + } + else _CCCL_IF_CONSTEXPR (__kind == kind_i8) + { + asm volatile( + "{\n\t .reg .pred PRED_enable_input_d; \n\t" + "setp.ne.b32 PRED_enable_input_d, %4, 0;\n\t" + "tcgen05.mma.ws.cta_group::1.kind::i8.collector::b2::use [%0], [%1], %2, %3, PRED_enable_input_d, %5;\n\t" + "}" + : + : "r"(__d_tmem), + "r"(__a_tmem), + "l"(__b_desc), + "r"(__idesc), + "r"(static_cast<_CUDA_VSTD::uint32_t>(__enable_input_d)), + "l"(__zero_column_mask_desc) + : "memory"); + } +# else + // Unsupported architectures will have a linker error with a semi-decent error message + __cuda_ptx_tcgen05_mma_ws_tmem_a_collector_b2_use_is_not_supported_before_SM_100a_SM_101a__(); +# endif +} +#endif // __cccl_ptx_isa >= 860 + +/* +// tcgen05.mma.ws.cta_group.kind.collector::b2::use [d_tmem], [a_tmem], b_desc, idesc, enable_input_d; // PTX ISA 86, +SM_100a, SM_101a +// .cta_group = { .cta_group::1 } +// .kind = { .kind::f16, .kind::tf32, .kind::f8f6f4, .kind::i8 } +template +__device__ static inline void tcgen05_mma_ws_tmem_a_collector_b2_use( + cuda::ptx::cta_group_1_t, + cuda::ptx::kind_t kind, + uint32_t d_tmem, + uint32_t a_tmem, + uint64_t b_desc, + uint32_t idesc, + bool enable_input_d); +*/ +#if __cccl_ptx_isa >= 860 +extern "C" _CCCL_DEVICE void +__cuda_ptx_tcgen05_mma_ws_tmem_a_collector_b2_use_is_not_supported_before_SM_100a_SM_101a__(); +template +_CCCL_DEVICE static inline void tcgen05_mma_ws_tmem_a_collector_b2_use( + cta_group_1_t, + kind_t<_Kind> __kind, + _CUDA_VSTD::uint32_t __d_tmem, + _CUDA_VSTD::uint32_t __a_tmem, + _CUDA_VSTD::uint64_t __b_desc, + _CUDA_VSTD::uint32_t __idesc, + bool __enable_input_d) +{ + // __cta_group == cta_group_1 (due to parameter type constraint) + static_assert(__kind == kind_f16 || __kind == kind_tf32 || __kind == kind_f8f6f4 || __kind == kind_i8, ""); +# if _CCCL_CUDA_COMPILER(NVHPC) || __CUDA_ARCH_FEAT_SM100_ALL || __CUDA_ARCH_FEAT_SM101_ALL + _CCCL_IF_CONSTEXPR (__kind == kind_f16) + { + asm volatile( + "{\n\t .reg .pred PRED_enable_input_d; \n\t" + "setp.ne.b32 PRED_enable_input_d, %4, 0;\n\t" + "tcgen05.mma.ws.cta_group::1.kind::f16.collector::b2::use [%0], [%1], %2, %3, PRED_enable_input_d;\n\t" + "}" + : + : "r"(__d_tmem), + "r"(__a_tmem), + "l"(__b_desc), + "r"(__idesc), + "r"(static_cast<_CUDA_VSTD::uint32_t>(__enable_input_d)) + : "memory"); + } + else _CCCL_IF_CONSTEXPR (__kind == kind_tf32) + { + asm volatile( + "{\n\t .reg .pred PRED_enable_input_d; \n\t" + "setp.ne.b32 PRED_enable_input_d, %4, 0;\n\t" + "tcgen05.mma.ws.cta_group::1.kind::tf32.collector::b2::use [%0], [%1], %2, %3, PRED_enable_input_d;\n\t" + "}" + : + : "r"(__d_tmem), + "r"(__a_tmem), + "l"(__b_desc), + "r"(__idesc), + "r"(static_cast<_CUDA_VSTD::uint32_t>(__enable_input_d)) + : "memory"); + } + else _CCCL_IF_CONSTEXPR (__kind == kind_f8f6f4) + { + asm volatile( + "{\n\t .reg .pred PRED_enable_input_d; \n\t" + "setp.ne.b32 PRED_enable_input_d, %4, 0;\n\t" + "tcgen05.mma.ws.cta_group::1.kind::f8f6f4.collector::b2::use [%0], [%1], %2, %3, PRED_enable_input_d;\n\t" + "}" + : + : "r"(__d_tmem), + "r"(__a_tmem), + "l"(__b_desc), + "r"(__idesc), + 
"r"(static_cast<_CUDA_VSTD::uint32_t>(__enable_input_d)) + : "memory"); + } + else _CCCL_IF_CONSTEXPR (__kind == kind_i8) + { + asm volatile( + "{\n\t .reg .pred PRED_enable_input_d; \n\t" + "setp.ne.b32 PRED_enable_input_d, %4, 0;\n\t" + "tcgen05.mma.ws.cta_group::1.kind::i8.collector::b2::use [%0], [%1], %2, %3, PRED_enable_input_d;\n\t" + "}" + : + : "r"(__d_tmem), + "r"(__a_tmem), + "l"(__b_desc), + "r"(__idesc), + "r"(static_cast<_CUDA_VSTD::uint32_t>(__enable_input_d)) + : "memory"); + } +# else + // Unsupported architectures will have a linker error with a semi-decent error message + __cuda_ptx_tcgen05_mma_ws_tmem_a_collector_b2_use_is_not_supported_before_SM_100a_SM_101a__(); +# endif +} +#endif // __cccl_ptx_isa >= 860 + +/* +// tcgen05.mma.ws.cta_group.kind.collector::b2::lastuse [d_tmem], a_desc, b_desc, idesc, enable_input_d, +zero_column_mask_desc; // PTX ISA 86, SM_100a, SM_101a +// .cta_group = { .cta_group::1 } +// .kind = { .kind::f16, .kind::tf32, .kind::f8f6f4, .kind::i8 } +template +__device__ static inline void tcgen05_mma_ws_collector_b2_lastuse( + cuda::ptx::cta_group_1_t, + cuda::ptx::kind_t kind, + uint32_t d_tmem, + uint64_t a_desc, + uint64_t b_desc, + uint32_t idesc, + bool enable_input_d, + uint64_t zero_column_mask_desc); +*/ +#if __cccl_ptx_isa >= 860 +extern "C" _CCCL_DEVICE void __cuda_ptx_tcgen05_mma_ws_collector_b2_lastuse_is_not_supported_before_SM_100a_SM_101a__(); +template +_CCCL_DEVICE static inline void tcgen05_mma_ws_collector_b2_lastuse( + cta_group_1_t, + kind_t<_Kind> __kind, + _CUDA_VSTD::uint32_t __d_tmem, + _CUDA_VSTD::uint64_t __a_desc, + _CUDA_VSTD::uint64_t __b_desc, + _CUDA_VSTD::uint32_t __idesc, + bool __enable_input_d, + _CUDA_VSTD::uint64_t __zero_column_mask_desc) +{ + // __cta_group == cta_group_1 (due to parameter type constraint) + static_assert(__kind == kind_f16 || __kind == kind_tf32 || __kind == kind_f8f6f4 || __kind == kind_i8, ""); +# if _CCCL_CUDA_COMPILER(NVHPC) || __CUDA_ARCH_FEAT_SM100_ALL || __CUDA_ARCH_FEAT_SM101_ALL + _CCCL_IF_CONSTEXPR (__kind == kind_f16) + { + asm volatile( + "{\n\t .reg .pred PRED_enable_input_d; \n\t" + "setp.ne.b32 PRED_enable_input_d, %4, 0;\n\t" + "tcgen05.mma.ws.cta_group::1.kind::f16.collector::b2::lastuse [%0], %1, %2, %3, PRED_enable_input_d, %5;\n\t" + "}" + : + : "r"(__d_tmem), + "l"(__a_desc), + "l"(__b_desc), + "r"(__idesc), + "r"(static_cast<_CUDA_VSTD::uint32_t>(__enable_input_d)), + "l"(__zero_column_mask_desc) + : "memory"); + } + else _CCCL_IF_CONSTEXPR (__kind == kind_tf32) + { + asm volatile( + "{\n\t .reg .pred PRED_enable_input_d; \n\t" + "setp.ne.b32 PRED_enable_input_d, %4, 0;\n\t" + "tcgen05.mma.ws.cta_group::1.kind::tf32.collector::b2::lastuse [%0], %1, %2, %3, PRED_enable_input_d, %5;\n\t" + "}" + : + : "r"(__d_tmem), + "l"(__a_desc), + "l"(__b_desc), + "r"(__idesc), + "r"(static_cast<_CUDA_VSTD::uint32_t>(__enable_input_d)), + "l"(__zero_column_mask_desc) + : "memory"); + } + else _CCCL_IF_CONSTEXPR (__kind == kind_f8f6f4) + { + asm volatile( + "{\n\t .reg .pred PRED_enable_input_d; \n\t" + "setp.ne.b32 PRED_enable_input_d, %4, 0;\n\t" + "tcgen05.mma.ws.cta_group::1.kind::f8f6f4.collector::b2::lastuse [%0], %1, %2, %3, PRED_enable_input_d, %5;\n\t" + "}" + : + : "r"(__d_tmem), + "l"(__a_desc), + "l"(__b_desc), + "r"(__idesc), + "r"(static_cast<_CUDA_VSTD::uint32_t>(__enable_input_d)), + "l"(__zero_column_mask_desc) + : "memory"); + } + else _CCCL_IF_CONSTEXPR (__kind == kind_i8) + { + asm volatile( + "{\n\t .reg .pred PRED_enable_input_d; \n\t" + "setp.ne.b32 
PRED_enable_input_d, %4, 0;\n\t" + "tcgen05.mma.ws.cta_group::1.kind::i8.collector::b2::lastuse [%0], %1, %2, %3, PRED_enable_input_d, %5;\n\t" + "}" + : + : "r"(__d_tmem), + "l"(__a_desc), + "l"(__b_desc), + "r"(__idesc), + "r"(static_cast<_CUDA_VSTD::uint32_t>(__enable_input_d)), + "l"(__zero_column_mask_desc) + : "memory"); + } +# else + // Unsupported architectures will have a linker error with a semi-decent error message + __cuda_ptx_tcgen05_mma_ws_collector_b2_lastuse_is_not_supported_before_SM_100a_SM_101a__(); +# endif +} +#endif // __cccl_ptx_isa >= 860 + +/* +// tcgen05.mma.ws.cta_group.kind.collector::b2::lastuse [d_tmem], a_desc, b_desc, idesc, enable_input_d; // PTX ISA 86, +SM_100a, SM_101a +// .cta_group = { .cta_group::1 } +// .kind = { .kind::f16, .kind::tf32, .kind::f8f6f4, .kind::i8 } +template +__device__ static inline void tcgen05_mma_ws_collector_b2_lastuse( + cuda::ptx::cta_group_1_t, + cuda::ptx::kind_t kind, + uint32_t d_tmem, + uint64_t a_desc, + uint64_t b_desc, + uint32_t idesc, + bool enable_input_d); +*/ +#if __cccl_ptx_isa >= 860 +extern "C" _CCCL_DEVICE void __cuda_ptx_tcgen05_mma_ws_collector_b2_lastuse_is_not_supported_before_SM_100a_SM_101a__(); +template +_CCCL_DEVICE static inline void tcgen05_mma_ws_collector_b2_lastuse( + cta_group_1_t, + kind_t<_Kind> __kind, + _CUDA_VSTD::uint32_t __d_tmem, + _CUDA_VSTD::uint64_t __a_desc, + _CUDA_VSTD::uint64_t __b_desc, + _CUDA_VSTD::uint32_t __idesc, + bool __enable_input_d) +{ + // __cta_group == cta_group_1 (due to parameter type constraint) + static_assert(__kind == kind_f16 || __kind == kind_tf32 || __kind == kind_f8f6f4 || __kind == kind_i8, ""); +# if _CCCL_CUDA_COMPILER(NVHPC) || __CUDA_ARCH_FEAT_SM100_ALL || __CUDA_ARCH_FEAT_SM101_ALL + _CCCL_IF_CONSTEXPR (__kind == kind_f16) + { + asm volatile( + "{\n\t .reg .pred PRED_enable_input_d; \n\t" + "setp.ne.b32 PRED_enable_input_d, %4, 0;\n\t" + "tcgen05.mma.ws.cta_group::1.kind::f16.collector::b2::lastuse [%0], %1, %2, %3, PRED_enable_input_d;\n\t" + "}" + : + : "r"(__d_tmem), + "l"(__a_desc), + "l"(__b_desc), + "r"(__idesc), + "r"(static_cast<_CUDA_VSTD::uint32_t>(__enable_input_d)) + : "memory"); + } + else _CCCL_IF_CONSTEXPR (__kind == kind_tf32) + { + asm volatile( + "{\n\t .reg .pred PRED_enable_input_d; \n\t" + "setp.ne.b32 PRED_enable_input_d, %4, 0;\n\t" + "tcgen05.mma.ws.cta_group::1.kind::tf32.collector::b2::lastuse [%0], %1, %2, %3, PRED_enable_input_d;\n\t" + "}" + : + : "r"(__d_tmem), + "l"(__a_desc), + "l"(__b_desc), + "r"(__idesc), + "r"(static_cast<_CUDA_VSTD::uint32_t>(__enable_input_d)) + : "memory"); + } + else _CCCL_IF_CONSTEXPR (__kind == kind_f8f6f4) + { + asm volatile( + "{\n\t .reg .pred PRED_enable_input_d; \n\t" + "setp.ne.b32 PRED_enable_input_d, %4, 0;\n\t" + "tcgen05.mma.ws.cta_group::1.kind::f8f6f4.collector::b2::lastuse [%0], %1, %2, %3, PRED_enable_input_d;\n\t" + "}" + : + : "r"(__d_tmem), + "l"(__a_desc), + "l"(__b_desc), + "r"(__idesc), + "r"(static_cast<_CUDA_VSTD::uint32_t>(__enable_input_d)) + : "memory"); + } + else _CCCL_IF_CONSTEXPR (__kind == kind_i8) + { + asm volatile( + "{\n\t .reg .pred PRED_enable_input_d; \n\t" + "setp.ne.b32 PRED_enable_input_d, %4, 0;\n\t" + "tcgen05.mma.ws.cta_group::1.kind::i8.collector::b2::lastuse [%0], %1, %2, %3, PRED_enable_input_d;\n\t" + "}" + : + : "r"(__d_tmem), + "l"(__a_desc), + "l"(__b_desc), + "r"(__idesc), + "r"(static_cast<_CUDA_VSTD::uint32_t>(__enable_input_d)) + : "memory"); + } +# else + // Unsupported architectures will have a linker error with a semi-decent error message 
+ __cuda_ptx_tcgen05_mma_ws_collector_b2_lastuse_is_not_supported_before_SM_100a_SM_101a__(); +# endif +} +#endif // __cccl_ptx_isa >= 860 + +/* +// tcgen05.mma.ws.cta_group.kind.collector::b2::lastuse [d_tmem], [a_tmem], b_desc, idesc, enable_input_d, +zero_column_mask_desc; // PTX ISA 86, SM_100a, SM_101a +// .cta_group = { .cta_group::1 } +// .kind = { .kind::f16, .kind::tf32, .kind::f8f6f4, .kind::i8 } +template +__device__ static inline void tcgen05_mma_ws_tmem_a_collector_b2_lastuse( + cuda::ptx::cta_group_1_t, + cuda::ptx::kind_t kind, + uint32_t d_tmem, + uint32_t a_tmem, + uint64_t b_desc, + uint32_t idesc, + bool enable_input_d, + uint64_t zero_column_mask_desc); +*/ +#if __cccl_ptx_isa >= 860 +extern "C" _CCCL_DEVICE void +__cuda_ptx_tcgen05_mma_ws_tmem_a_collector_b2_lastuse_is_not_supported_before_SM_100a_SM_101a__(); +template +_CCCL_DEVICE static inline void tcgen05_mma_ws_tmem_a_collector_b2_lastuse( + cta_group_1_t, + kind_t<_Kind> __kind, + _CUDA_VSTD::uint32_t __d_tmem, + _CUDA_VSTD::uint32_t __a_tmem, + _CUDA_VSTD::uint64_t __b_desc, + _CUDA_VSTD::uint32_t __idesc, + bool __enable_input_d, + _CUDA_VSTD::uint64_t __zero_column_mask_desc) +{ + // __cta_group == cta_group_1 (due to parameter type constraint) + static_assert(__kind == kind_f16 || __kind == kind_tf32 || __kind == kind_f8f6f4 || __kind == kind_i8, ""); +# if _CCCL_CUDA_COMPILER(NVHPC) || __CUDA_ARCH_FEAT_SM100_ALL || __CUDA_ARCH_FEAT_SM101_ALL + _CCCL_IF_CONSTEXPR (__kind == kind_f16) + { + asm volatile( + "{\n\t .reg .pred PRED_enable_input_d; \n\t" + "setp.ne.b32 PRED_enable_input_d, %4, 0;\n\t" + "tcgen05.mma.ws.cta_group::1.kind::f16.collector::b2::lastuse [%0], [%1], %2, %3, PRED_enable_input_d, %5;\n\t" + "}" + : + : "r"(__d_tmem), + "r"(__a_tmem), + "l"(__b_desc), + "r"(__idesc), + "r"(static_cast<_CUDA_VSTD::uint32_t>(__enable_input_d)), + "l"(__zero_column_mask_desc) + : "memory"); + } + else _CCCL_IF_CONSTEXPR (__kind == kind_tf32) + { + asm volatile( + "{\n\t .reg .pred PRED_enable_input_d; \n\t" + "setp.ne.b32 PRED_enable_input_d, %4, 0;\n\t" + "tcgen05.mma.ws.cta_group::1.kind::tf32.collector::b2::lastuse [%0], [%1], %2, %3, PRED_enable_input_d, %5;\n\t" + "}" + : + : "r"(__d_tmem), + "r"(__a_tmem), + "l"(__b_desc), + "r"(__idesc), + "r"(static_cast<_CUDA_VSTD::uint32_t>(__enable_input_d)), + "l"(__zero_column_mask_desc) + : "memory"); + } + else _CCCL_IF_CONSTEXPR (__kind == kind_f8f6f4) + { + asm volatile( + "{\n\t .reg .pred PRED_enable_input_d; \n\t" + "setp.ne.b32 PRED_enable_input_d, %4, 0;\n\t" + "tcgen05.mma.ws.cta_group::1.kind::f8f6f4.collector::b2::lastuse [%0], [%1], %2, %3, PRED_enable_input_d, %5;\n\t" + "}" + : + : "r"(__d_tmem), + "r"(__a_tmem), + "l"(__b_desc), + "r"(__idesc), + "r"(static_cast<_CUDA_VSTD::uint32_t>(__enable_input_d)), + "l"(__zero_column_mask_desc) + : "memory"); + } + else _CCCL_IF_CONSTEXPR (__kind == kind_i8) + { + asm volatile( + "{\n\t .reg .pred PRED_enable_input_d; \n\t" + "setp.ne.b32 PRED_enable_input_d, %4, 0;\n\t" + "tcgen05.mma.ws.cta_group::1.kind::i8.collector::b2::lastuse [%0], [%1], %2, %3, PRED_enable_input_d, %5;\n\t" + "}" + : + : "r"(__d_tmem), + "r"(__a_tmem), + "l"(__b_desc), + "r"(__idesc), + "r"(static_cast<_CUDA_VSTD::uint32_t>(__enable_input_d)), + "l"(__zero_column_mask_desc) + : "memory"); + } +# else + // Unsupported architectures will have a linker error with a semi-decent error message + __cuda_ptx_tcgen05_mma_ws_tmem_a_collector_b2_lastuse_is_not_supported_before_SM_100a_SM_101a__(); +# endif +} +#endif // __cccl_ptx_isa >= 860 + 
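+/*
+Usage sketch (illustrative only, kept as a comment): the overload above and the one
+documented next differ only in the trailing zero_column_mask_desc argument. The
+helper name and the descriptor/tensor-memory parameter values below are placeholders
+assumed to be prepared by the caller; kind::tf32 is shown as one possible kind tag.
+Requires compilation for sm_100a or sm_101a.
+
+  #include <cuda/ptx>
+  #include <cuda/std/cstdint>
+
+  __device__ void mma_ws_b2_lastuse_step(cuda::std::uint32_t d_tmem,
+                                         cuda::std::uint32_t a_tmem,
+                                         cuda::std::uint64_t b_desc,
+                                         cuda::std::uint32_t idesc,
+                                         bool enable_input_d,
+                                         cuda::std::uint64_t mask)
+  {
+    // With an explicit zero-column mask descriptor (overload defined above):
+    cuda::ptx::tcgen05_mma_ws_tmem_a_collector_b2_lastuse(
+      cuda::ptx::cta_group_1_t{}, cuda::ptx::kind_tf32,
+      d_tmem, a_tmem, b_desc, idesc, enable_input_d, mask);
+
+    // Without one (overload documented below):
+    cuda::ptx::tcgen05_mma_ws_tmem_a_collector_b2_lastuse(
+      cuda::ptx::cta_group_1_t{}, cuda::ptx::kind_tf32,
+      d_tmem, a_tmem, b_desc, idesc, enable_input_d);
+  }
+*/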
+/* +// tcgen05.mma.ws.cta_group.kind.collector::b2::lastuse [d_tmem], [a_tmem], b_desc, idesc, enable_input_d; // PTX ISA +86, SM_100a, SM_101a +// .cta_group = { .cta_group::1 } +// .kind = { .kind::f16, .kind::tf32, .kind::f8f6f4, .kind::i8 } +template +__device__ static inline void tcgen05_mma_ws_tmem_a_collector_b2_lastuse( + cuda::ptx::cta_group_1_t, + cuda::ptx::kind_t kind, + uint32_t d_tmem, + uint32_t a_tmem, + uint64_t b_desc, + uint32_t idesc, + bool enable_input_d); +*/ +#if __cccl_ptx_isa >= 860 +extern "C" _CCCL_DEVICE void +__cuda_ptx_tcgen05_mma_ws_tmem_a_collector_b2_lastuse_is_not_supported_before_SM_100a_SM_101a__(); +template +_CCCL_DEVICE static inline void tcgen05_mma_ws_tmem_a_collector_b2_lastuse( + cta_group_1_t, + kind_t<_Kind> __kind, + _CUDA_VSTD::uint32_t __d_tmem, + _CUDA_VSTD::uint32_t __a_tmem, + _CUDA_VSTD::uint64_t __b_desc, + _CUDA_VSTD::uint32_t __idesc, + bool __enable_input_d) +{ + // __cta_group == cta_group_1 (due to parameter type constraint) + static_assert(__kind == kind_f16 || __kind == kind_tf32 || __kind == kind_f8f6f4 || __kind == kind_i8, ""); +# if _CCCL_CUDA_COMPILER(NVHPC) || __CUDA_ARCH_FEAT_SM100_ALL || __CUDA_ARCH_FEAT_SM101_ALL + _CCCL_IF_CONSTEXPR (__kind == kind_f16) + { + asm volatile( + "{\n\t .reg .pred PRED_enable_input_d; \n\t" + "setp.ne.b32 PRED_enable_input_d, %4, 0;\n\t" + "tcgen05.mma.ws.cta_group::1.kind::f16.collector::b2::lastuse [%0], [%1], %2, %3, PRED_enable_input_d;\n\t" + "}" + : + : "r"(__d_tmem), + "r"(__a_tmem), + "l"(__b_desc), + "r"(__idesc), + "r"(static_cast<_CUDA_VSTD::uint32_t>(__enable_input_d)) + : "memory"); + } + else _CCCL_IF_CONSTEXPR (__kind == kind_tf32) + { + asm volatile( + "{\n\t .reg .pred PRED_enable_input_d; \n\t" + "setp.ne.b32 PRED_enable_input_d, %4, 0;\n\t" + "tcgen05.mma.ws.cta_group::1.kind::tf32.collector::b2::lastuse [%0], [%1], %2, %3, PRED_enable_input_d;\n\t" + "}" + : + : "r"(__d_tmem), + "r"(__a_tmem), + "l"(__b_desc), + "r"(__idesc), + "r"(static_cast<_CUDA_VSTD::uint32_t>(__enable_input_d)) + : "memory"); + } + else _CCCL_IF_CONSTEXPR (__kind == kind_f8f6f4) + { + asm volatile( + "{\n\t .reg .pred PRED_enable_input_d; \n\t" + "setp.ne.b32 PRED_enable_input_d, %4, 0;\n\t" + "tcgen05.mma.ws.cta_group::1.kind::f8f6f4.collector::b2::lastuse [%0], [%1], %2, %3, PRED_enable_input_d;\n\t" + "}" + : + : "r"(__d_tmem), + "r"(__a_tmem), + "l"(__b_desc), + "r"(__idesc), + "r"(static_cast<_CUDA_VSTD::uint32_t>(__enable_input_d)) + : "memory"); + } + else _CCCL_IF_CONSTEXPR (__kind == kind_i8) + { + asm volatile( + "{\n\t .reg .pred PRED_enable_input_d; \n\t" + "setp.ne.b32 PRED_enable_input_d, %4, 0;\n\t" + "tcgen05.mma.ws.cta_group::1.kind::i8.collector::b2::lastuse [%0], [%1], %2, %3, PRED_enable_input_d;\n\t" + "}" + : + : "r"(__d_tmem), + "r"(__a_tmem), + "l"(__b_desc), + "r"(__idesc), + "r"(static_cast<_CUDA_VSTD::uint32_t>(__enable_input_d)) + : "memory"); + } +# else + // Unsupported architectures will have a linker error with a semi-decent error message + __cuda_ptx_tcgen05_mma_ws_tmem_a_collector_b2_lastuse_is_not_supported_before_SM_100a_SM_101a__(); +# endif +} +#endif // __cccl_ptx_isa >= 860 + +/* +// tcgen05.mma.ws.cta_group.kind.collector::b2::discard [d_tmem], a_desc, b_desc, idesc, enable_input_d, +zero_column_mask_desc; // PTX ISA 86, SM_100a, SM_101a +// .cta_group = { .cta_group::1 } +// .kind = { .kind::f16, .kind::tf32, .kind::f8f6f4, .kind::i8 } +template +__device__ static inline void tcgen05_mma_ws_collector_b2_discard( + cuda::ptx::cta_group_1_t, + 
cuda::ptx::kind_t kind, + uint32_t d_tmem, + uint64_t a_desc, + uint64_t b_desc, + uint32_t idesc, + bool enable_input_d, + uint64_t zero_column_mask_desc); +*/ +#if __cccl_ptx_isa >= 860 +extern "C" _CCCL_DEVICE void __cuda_ptx_tcgen05_mma_ws_collector_b2_discard_is_not_supported_before_SM_100a_SM_101a__(); +template +_CCCL_DEVICE static inline void tcgen05_mma_ws_collector_b2_discard( + cta_group_1_t, + kind_t<_Kind> __kind, + _CUDA_VSTD::uint32_t __d_tmem, + _CUDA_VSTD::uint64_t __a_desc, + _CUDA_VSTD::uint64_t __b_desc, + _CUDA_VSTD::uint32_t __idesc, + bool __enable_input_d, + _CUDA_VSTD::uint64_t __zero_column_mask_desc) +{ + // __cta_group == cta_group_1 (due to parameter type constraint) + static_assert(__kind == kind_f16 || __kind == kind_tf32 || __kind == kind_f8f6f4 || __kind == kind_i8, ""); +# if _CCCL_CUDA_COMPILER(NVHPC) || __CUDA_ARCH_FEAT_SM100_ALL || __CUDA_ARCH_FEAT_SM101_ALL + _CCCL_IF_CONSTEXPR (__kind == kind_f16) + { + asm volatile( + "{\n\t .reg .pred PRED_enable_input_d; \n\t" + "setp.ne.b32 PRED_enable_input_d, %4, 0;\n\t" + "tcgen05.mma.ws.cta_group::1.kind::f16.collector::b2::discard [%0], %1, %2, %3, PRED_enable_input_d, %5;\n\t" + "}" + : + : "r"(__d_tmem), + "l"(__a_desc), + "l"(__b_desc), + "r"(__idesc), + "r"(static_cast<_CUDA_VSTD::uint32_t>(__enable_input_d)), + "l"(__zero_column_mask_desc) + : "memory"); + } + else _CCCL_IF_CONSTEXPR (__kind == kind_tf32) + { + asm volatile( + "{\n\t .reg .pred PRED_enable_input_d; \n\t" + "setp.ne.b32 PRED_enable_input_d, %4, 0;\n\t" + "tcgen05.mma.ws.cta_group::1.kind::tf32.collector::b2::discard [%0], %1, %2, %3, PRED_enable_input_d, %5;\n\t" + "}" + : + : "r"(__d_tmem), + "l"(__a_desc), + "l"(__b_desc), + "r"(__idesc), + "r"(static_cast<_CUDA_VSTD::uint32_t>(__enable_input_d)), + "l"(__zero_column_mask_desc) + : "memory"); + } + else _CCCL_IF_CONSTEXPR (__kind == kind_f8f6f4) + { + asm volatile( + "{\n\t .reg .pred PRED_enable_input_d; \n\t" + "setp.ne.b32 PRED_enable_input_d, %4, 0;\n\t" + "tcgen05.mma.ws.cta_group::1.kind::f8f6f4.collector::b2::discard [%0], %1, %2, %3, PRED_enable_input_d, %5;\n\t" + "}" + : + : "r"(__d_tmem), + "l"(__a_desc), + "l"(__b_desc), + "r"(__idesc), + "r"(static_cast<_CUDA_VSTD::uint32_t>(__enable_input_d)), + "l"(__zero_column_mask_desc) + : "memory"); + } + else _CCCL_IF_CONSTEXPR (__kind == kind_i8) + { + asm volatile( + "{\n\t .reg .pred PRED_enable_input_d; \n\t" + "setp.ne.b32 PRED_enable_input_d, %4, 0;\n\t" + "tcgen05.mma.ws.cta_group::1.kind::i8.collector::b2::discard [%0], %1, %2, %3, PRED_enable_input_d, %5;\n\t" + "}" + : + : "r"(__d_tmem), + "l"(__a_desc), + "l"(__b_desc), + "r"(__idesc), + "r"(static_cast<_CUDA_VSTD::uint32_t>(__enable_input_d)), + "l"(__zero_column_mask_desc) + : "memory"); + } +# else + // Unsupported architectures will have a linker error with a semi-decent error message + __cuda_ptx_tcgen05_mma_ws_collector_b2_discard_is_not_supported_before_SM_100a_SM_101a__(); +# endif +} +#endif // __cccl_ptx_isa >= 860 + +/* +// tcgen05.mma.ws.cta_group.kind.collector::b2::discard [d_tmem], a_desc, b_desc, idesc, enable_input_d; // PTX ISA 86, +SM_100a, SM_101a +// .cta_group = { .cta_group::1 } +// .kind = { .kind::f16, .kind::tf32, .kind::f8f6f4, .kind::i8 } +template +__device__ static inline void tcgen05_mma_ws_collector_b2_discard( + cuda::ptx::cta_group_1_t, + cuda::ptx::kind_t kind, + uint32_t d_tmem, + uint64_t a_desc, + uint64_t b_desc, + uint32_t idesc, + bool enable_input_d); +*/ +#if __cccl_ptx_isa >= 860 +extern "C" _CCCL_DEVICE void 
__cuda_ptx_tcgen05_mma_ws_collector_b2_discard_is_not_supported_before_SM_100a_SM_101a__(); +template +_CCCL_DEVICE static inline void tcgen05_mma_ws_collector_b2_discard( + cta_group_1_t, + kind_t<_Kind> __kind, + _CUDA_VSTD::uint32_t __d_tmem, + _CUDA_VSTD::uint64_t __a_desc, + _CUDA_VSTD::uint64_t __b_desc, + _CUDA_VSTD::uint32_t __idesc, + bool __enable_input_d) +{ + // __cta_group == cta_group_1 (due to parameter type constraint) + static_assert(__kind == kind_f16 || __kind == kind_tf32 || __kind == kind_f8f6f4 || __kind == kind_i8, ""); +# if _CCCL_CUDA_COMPILER(NVHPC) || __CUDA_ARCH_FEAT_SM100_ALL || __CUDA_ARCH_FEAT_SM101_ALL + _CCCL_IF_CONSTEXPR (__kind == kind_f16) + { + asm volatile( + "{\n\t .reg .pred PRED_enable_input_d; \n\t" + "setp.ne.b32 PRED_enable_input_d, %4, 0;\n\t" + "tcgen05.mma.ws.cta_group::1.kind::f16.collector::b2::discard [%0], %1, %2, %3, PRED_enable_input_d;\n\t" + "}" + : + : "r"(__d_tmem), + "l"(__a_desc), + "l"(__b_desc), + "r"(__idesc), + "r"(static_cast<_CUDA_VSTD::uint32_t>(__enable_input_d)) + : "memory"); + } + else _CCCL_IF_CONSTEXPR (__kind == kind_tf32) + { + asm volatile( + "{\n\t .reg .pred PRED_enable_input_d; \n\t" + "setp.ne.b32 PRED_enable_input_d, %4, 0;\n\t" + "tcgen05.mma.ws.cta_group::1.kind::tf32.collector::b2::discard [%0], %1, %2, %3, PRED_enable_input_d;\n\t" + "}" + : + : "r"(__d_tmem), + "l"(__a_desc), + "l"(__b_desc), + "r"(__idesc), + "r"(static_cast<_CUDA_VSTD::uint32_t>(__enable_input_d)) + : "memory"); + } + else _CCCL_IF_CONSTEXPR (__kind == kind_f8f6f4) + { + asm volatile( + "{\n\t .reg .pred PRED_enable_input_d; \n\t" + "setp.ne.b32 PRED_enable_input_d, %4, 0;\n\t" + "tcgen05.mma.ws.cta_group::1.kind::f8f6f4.collector::b2::discard [%0], %1, %2, %3, PRED_enable_input_d;\n\t" + "}" + : + : "r"(__d_tmem), + "l"(__a_desc), + "l"(__b_desc), + "r"(__idesc), + "r"(static_cast<_CUDA_VSTD::uint32_t>(__enable_input_d)) + : "memory"); + } + else _CCCL_IF_CONSTEXPR (__kind == kind_i8) + { + asm volatile( + "{\n\t .reg .pred PRED_enable_input_d; \n\t" + "setp.ne.b32 PRED_enable_input_d, %4, 0;\n\t" + "tcgen05.mma.ws.cta_group::1.kind::i8.collector::b2::discard [%0], %1, %2, %3, PRED_enable_input_d;\n\t" + "}" + : + : "r"(__d_tmem), + "l"(__a_desc), + "l"(__b_desc), + "r"(__idesc), + "r"(static_cast<_CUDA_VSTD::uint32_t>(__enable_input_d)) + : "memory"); + } +# else + // Unsupported architectures will have a linker error with a semi-decent error message + __cuda_ptx_tcgen05_mma_ws_collector_b2_discard_is_not_supported_before_SM_100a_SM_101a__(); +# endif +} +#endif // __cccl_ptx_isa >= 860 + +/* +// tcgen05.mma.ws.cta_group.kind.collector::b2::discard [d_tmem], [a_tmem], b_desc, idesc, enable_input_d, +zero_column_mask_desc; // PTX ISA 86, SM_100a, SM_101a +// .cta_group = { .cta_group::1 } +// .kind = { .kind::f16, .kind::tf32, .kind::f8f6f4, .kind::i8 } +template +__device__ static inline void tcgen05_mma_ws_tmem_a_collector_b2_discard( + cuda::ptx::cta_group_1_t, + cuda::ptx::kind_t kind, + uint32_t d_tmem, + uint32_t a_tmem, + uint64_t b_desc, + uint32_t idesc, + bool enable_input_d, + uint64_t zero_column_mask_desc); +*/ +#if __cccl_ptx_isa >= 860 +extern "C" _CCCL_DEVICE void +__cuda_ptx_tcgen05_mma_ws_tmem_a_collector_b2_discard_is_not_supported_before_SM_100a_SM_101a__(); +template +_CCCL_DEVICE static inline void tcgen05_mma_ws_tmem_a_collector_b2_discard( + cta_group_1_t, + kind_t<_Kind> __kind, + _CUDA_VSTD::uint32_t __d_tmem, + _CUDA_VSTD::uint32_t __a_tmem, + _CUDA_VSTD::uint64_t __b_desc, + _CUDA_VSTD::uint32_t __idesc, + 
bool __enable_input_d, + _CUDA_VSTD::uint64_t __zero_column_mask_desc) +{ + // __cta_group == cta_group_1 (due to parameter type constraint) + static_assert(__kind == kind_f16 || __kind == kind_tf32 || __kind == kind_f8f6f4 || __kind == kind_i8, ""); +# if _CCCL_CUDA_COMPILER(NVHPC) || __CUDA_ARCH_FEAT_SM100_ALL || __CUDA_ARCH_FEAT_SM101_ALL + _CCCL_IF_CONSTEXPR (__kind == kind_f16) + { + asm volatile( + "{\n\t .reg .pred PRED_enable_input_d; \n\t" + "setp.ne.b32 PRED_enable_input_d, %4, 0;\n\t" + "tcgen05.mma.ws.cta_group::1.kind::f16.collector::b2::discard [%0], [%1], %2, %3, PRED_enable_input_d, %5;\n\t" + "}" + : + : "r"(__d_tmem), + "r"(__a_tmem), + "l"(__b_desc), + "r"(__idesc), + "r"(static_cast<_CUDA_VSTD::uint32_t>(__enable_input_d)), + "l"(__zero_column_mask_desc) + : "memory"); + } + else _CCCL_IF_CONSTEXPR (__kind == kind_tf32) + { + asm volatile( + "{\n\t .reg .pred PRED_enable_input_d; \n\t" + "setp.ne.b32 PRED_enable_input_d, %4, 0;\n\t" + "tcgen05.mma.ws.cta_group::1.kind::tf32.collector::b2::discard [%0], [%1], %2, %3, PRED_enable_input_d, %5;\n\t" + "}" + : + : "r"(__d_tmem), + "r"(__a_tmem), + "l"(__b_desc), + "r"(__idesc), + "r"(static_cast<_CUDA_VSTD::uint32_t>(__enable_input_d)), + "l"(__zero_column_mask_desc) + : "memory"); + } + else _CCCL_IF_CONSTEXPR (__kind == kind_f8f6f4) + { + asm volatile( + "{\n\t .reg .pred PRED_enable_input_d; \n\t" + "setp.ne.b32 PRED_enable_input_d, %4, 0;\n\t" + "tcgen05.mma.ws.cta_group::1.kind::f8f6f4.collector::b2::discard [%0], [%1], %2, %3, PRED_enable_input_d, %5;\n\t" + "}" + : + : "r"(__d_tmem), + "r"(__a_tmem), + "l"(__b_desc), + "r"(__idesc), + "r"(static_cast<_CUDA_VSTD::uint32_t>(__enable_input_d)), + "l"(__zero_column_mask_desc) + : "memory"); + } + else _CCCL_IF_CONSTEXPR (__kind == kind_i8) + { + asm volatile( + "{\n\t .reg .pred PRED_enable_input_d; \n\t" + "setp.ne.b32 PRED_enable_input_d, %4, 0;\n\t" + "tcgen05.mma.ws.cta_group::1.kind::i8.collector::b2::discard [%0], [%1], %2, %3, PRED_enable_input_d, %5;\n\t" + "}" + : + : "r"(__d_tmem), + "r"(__a_tmem), + "l"(__b_desc), + "r"(__idesc), + "r"(static_cast<_CUDA_VSTD::uint32_t>(__enable_input_d)), + "l"(__zero_column_mask_desc) + : "memory"); + } +# else + // Unsupported architectures will have a linker error with a semi-decent error message + __cuda_ptx_tcgen05_mma_ws_tmem_a_collector_b2_discard_is_not_supported_before_SM_100a_SM_101a__(); +# endif +} +#endif // __cccl_ptx_isa >= 860 + +/* +// tcgen05.mma.ws.cta_group.kind.collector::b2::discard [d_tmem], [a_tmem], b_desc, idesc, enable_input_d; // PTX ISA +86, SM_100a, SM_101a +// .cta_group = { .cta_group::1 } +// .kind = { .kind::f16, .kind::tf32, .kind::f8f6f4, .kind::i8 } +template +__device__ static inline void tcgen05_mma_ws_tmem_a_collector_b2_discard( + cuda::ptx::cta_group_1_t, + cuda::ptx::kind_t kind, + uint32_t d_tmem, + uint32_t a_tmem, + uint64_t b_desc, + uint32_t idesc, + bool enable_input_d); +*/ +#if __cccl_ptx_isa >= 860 +extern "C" _CCCL_DEVICE void +__cuda_ptx_tcgen05_mma_ws_tmem_a_collector_b2_discard_is_not_supported_before_SM_100a_SM_101a__(); +template +_CCCL_DEVICE static inline void tcgen05_mma_ws_tmem_a_collector_b2_discard( + cta_group_1_t, + kind_t<_Kind> __kind, + _CUDA_VSTD::uint32_t __d_tmem, + _CUDA_VSTD::uint32_t __a_tmem, + _CUDA_VSTD::uint64_t __b_desc, + _CUDA_VSTD::uint32_t __idesc, + bool __enable_input_d) +{ + // __cta_group == cta_group_1 (due to parameter type constraint) + static_assert(__kind == kind_f16 || __kind == kind_tf32 || __kind == kind_f8f6f4 || __kind == kind_i8, 
""); +# if _CCCL_CUDA_COMPILER(NVHPC) || __CUDA_ARCH_FEAT_SM100_ALL || __CUDA_ARCH_FEAT_SM101_ALL + _CCCL_IF_CONSTEXPR (__kind == kind_f16) + { + asm volatile( + "{\n\t .reg .pred PRED_enable_input_d; \n\t" + "setp.ne.b32 PRED_enable_input_d, %4, 0;\n\t" + "tcgen05.mma.ws.cta_group::1.kind::f16.collector::b2::discard [%0], [%1], %2, %3, PRED_enable_input_d;\n\t" + "}" + : + : "r"(__d_tmem), + "r"(__a_tmem), + "l"(__b_desc), + "r"(__idesc), + "r"(static_cast<_CUDA_VSTD::uint32_t>(__enable_input_d)) + : "memory"); + } + else _CCCL_IF_CONSTEXPR (__kind == kind_tf32) + { + asm volatile( + "{\n\t .reg .pred PRED_enable_input_d; \n\t" + "setp.ne.b32 PRED_enable_input_d, %4, 0;\n\t" + "tcgen05.mma.ws.cta_group::1.kind::tf32.collector::b2::discard [%0], [%1], %2, %3, PRED_enable_input_d;\n\t" + "}" + : + : "r"(__d_tmem), + "r"(__a_tmem), + "l"(__b_desc), + "r"(__idesc), + "r"(static_cast<_CUDA_VSTD::uint32_t>(__enable_input_d)) + : "memory"); + } + else _CCCL_IF_CONSTEXPR (__kind == kind_f8f6f4) + { + asm volatile( + "{\n\t .reg .pred PRED_enable_input_d; \n\t" + "setp.ne.b32 PRED_enable_input_d, %4, 0;\n\t" + "tcgen05.mma.ws.cta_group::1.kind::f8f6f4.collector::b2::discard [%0], [%1], %2, %3, PRED_enable_input_d;\n\t" + "}" + : + : "r"(__d_tmem), + "r"(__a_tmem), + "l"(__b_desc), + "r"(__idesc), + "r"(static_cast<_CUDA_VSTD::uint32_t>(__enable_input_d)) + : "memory"); + } + else _CCCL_IF_CONSTEXPR (__kind == kind_i8) + { + asm volatile( + "{\n\t .reg .pred PRED_enable_input_d; \n\t" + "setp.ne.b32 PRED_enable_input_d, %4, 0;\n\t" + "tcgen05.mma.ws.cta_group::1.kind::i8.collector::b2::discard [%0], [%1], %2, %3, PRED_enable_input_d;\n\t" + "}" + : + : "r"(__d_tmem), + "r"(__a_tmem), + "l"(__b_desc), + "r"(__idesc), + "r"(static_cast<_CUDA_VSTD::uint32_t>(__enable_input_d)) + : "memory"); + } +# else + // Unsupported architectures will have a linker error with a semi-decent error message + __cuda_ptx_tcgen05_mma_ws_tmem_a_collector_b2_discard_is_not_supported_before_SM_100a_SM_101a__(); +# endif +} +#endif // __cccl_ptx_isa >= 860 + +/* +// tcgen05.mma.ws.cta_group.kind.collector::b3::fill [d_tmem], a_desc, b_desc, idesc, enable_input_d, +zero_column_mask_desc; // PTX ISA 86, SM_100a, SM_101a +// .cta_group = { .cta_group::1 } +// .kind = { .kind::f16, .kind::tf32, .kind::f8f6f4, .kind::i8 } +template +__device__ static inline void tcgen05_mma_ws_collector_b3_fill( + cuda::ptx::cta_group_1_t, + cuda::ptx::kind_t kind, + uint32_t d_tmem, + uint64_t a_desc, + uint64_t b_desc, + uint32_t idesc, + bool enable_input_d, + uint64_t zero_column_mask_desc); +*/ +#if __cccl_ptx_isa >= 860 +extern "C" _CCCL_DEVICE void __cuda_ptx_tcgen05_mma_ws_collector_b3_fill_is_not_supported_before_SM_100a_SM_101a__(); +template +_CCCL_DEVICE static inline void tcgen05_mma_ws_collector_b3_fill( + cta_group_1_t, + kind_t<_Kind> __kind, + _CUDA_VSTD::uint32_t __d_tmem, + _CUDA_VSTD::uint64_t __a_desc, + _CUDA_VSTD::uint64_t __b_desc, + _CUDA_VSTD::uint32_t __idesc, + bool __enable_input_d, + _CUDA_VSTD::uint64_t __zero_column_mask_desc) +{ + // __cta_group == cta_group_1 (due to parameter type constraint) + static_assert(__kind == kind_f16 || __kind == kind_tf32 || __kind == kind_f8f6f4 || __kind == kind_i8, ""); +# if _CCCL_CUDA_COMPILER(NVHPC) || __CUDA_ARCH_FEAT_SM100_ALL || __CUDA_ARCH_FEAT_SM101_ALL + _CCCL_IF_CONSTEXPR (__kind == kind_f16) + { + asm volatile( + "{\n\t .reg .pred PRED_enable_input_d; \n\t" + "setp.ne.b32 PRED_enable_input_d, %4, 0;\n\t" + "tcgen05.mma.ws.cta_group::1.kind::f16.collector::b3::fill [%0], 
%1, %2, %3, PRED_enable_input_d, %5;\n\t" + "}" + : + : "r"(__d_tmem), + "l"(__a_desc), + "l"(__b_desc), + "r"(__idesc), + "r"(static_cast<_CUDA_VSTD::uint32_t>(__enable_input_d)), + "l"(__zero_column_mask_desc) + : "memory"); + } + else _CCCL_IF_CONSTEXPR (__kind == kind_tf32) + { + asm volatile( + "{\n\t .reg .pred PRED_enable_input_d; \n\t" + "setp.ne.b32 PRED_enable_input_d, %4, 0;\n\t" + "tcgen05.mma.ws.cta_group::1.kind::tf32.collector::b3::fill [%0], %1, %2, %3, PRED_enable_input_d, %5;\n\t" + "}" + : + : "r"(__d_tmem), + "l"(__a_desc), + "l"(__b_desc), + "r"(__idesc), + "r"(static_cast<_CUDA_VSTD::uint32_t>(__enable_input_d)), + "l"(__zero_column_mask_desc) + : "memory"); + } + else _CCCL_IF_CONSTEXPR (__kind == kind_f8f6f4) + { + asm volatile( + "{\n\t .reg .pred PRED_enable_input_d; \n\t" + "setp.ne.b32 PRED_enable_input_d, %4, 0;\n\t" + "tcgen05.mma.ws.cta_group::1.kind::f8f6f4.collector::b3::fill [%0], %1, %2, %3, PRED_enable_input_d, %5;\n\t" + "}" + : + : "r"(__d_tmem), + "l"(__a_desc), + "l"(__b_desc), + "r"(__idesc), + "r"(static_cast<_CUDA_VSTD::uint32_t>(__enable_input_d)), + "l"(__zero_column_mask_desc) + : "memory"); + } + else _CCCL_IF_CONSTEXPR (__kind == kind_i8) + { + asm volatile( + "{\n\t .reg .pred PRED_enable_input_d; \n\t" + "setp.ne.b32 PRED_enable_input_d, %4, 0;\n\t" + "tcgen05.mma.ws.cta_group::1.kind::i8.collector::b3::fill [%0], %1, %2, %3, PRED_enable_input_d, %5;\n\t" + "}" + : + : "r"(__d_tmem), + "l"(__a_desc), + "l"(__b_desc), + "r"(__idesc), + "r"(static_cast<_CUDA_VSTD::uint32_t>(__enable_input_d)), + "l"(__zero_column_mask_desc) + : "memory"); + } +# else + // Unsupported architectures will have a linker error with a semi-decent error message + __cuda_ptx_tcgen05_mma_ws_collector_b3_fill_is_not_supported_before_SM_100a_SM_101a__(); +# endif +} +#endif // __cccl_ptx_isa >= 860 + +/* +// tcgen05.mma.ws.cta_group.kind.collector::b3::fill [d_tmem], a_desc, b_desc, idesc, enable_input_d; // PTX ISA 86, +SM_100a, SM_101a +// .cta_group = { .cta_group::1 } +// .kind = { .kind::f16, .kind::tf32, .kind::f8f6f4, .kind::i8 } +template +__device__ static inline void tcgen05_mma_ws_collector_b3_fill( + cuda::ptx::cta_group_1_t, + cuda::ptx::kind_t kind, + uint32_t d_tmem, + uint64_t a_desc, + uint64_t b_desc, + uint32_t idesc, + bool enable_input_d); +*/ +#if __cccl_ptx_isa >= 860 +extern "C" _CCCL_DEVICE void __cuda_ptx_tcgen05_mma_ws_collector_b3_fill_is_not_supported_before_SM_100a_SM_101a__(); +template +_CCCL_DEVICE static inline void tcgen05_mma_ws_collector_b3_fill( + cta_group_1_t, + kind_t<_Kind> __kind, + _CUDA_VSTD::uint32_t __d_tmem, + _CUDA_VSTD::uint64_t __a_desc, + _CUDA_VSTD::uint64_t __b_desc, + _CUDA_VSTD::uint32_t __idesc, + bool __enable_input_d) +{ + // __cta_group == cta_group_1 (due to parameter type constraint) + static_assert(__kind == kind_f16 || __kind == kind_tf32 || __kind == kind_f8f6f4 || __kind == kind_i8, ""); +# if _CCCL_CUDA_COMPILER(NVHPC) || __CUDA_ARCH_FEAT_SM100_ALL || __CUDA_ARCH_FEAT_SM101_ALL + _CCCL_IF_CONSTEXPR (__kind == kind_f16) + { + asm volatile( + "{\n\t .reg .pred PRED_enable_input_d; \n\t" + "setp.ne.b32 PRED_enable_input_d, %4, 0;\n\t" + "tcgen05.mma.ws.cta_group::1.kind::f16.collector::b3::fill [%0], %1, %2, %3, PRED_enable_input_d;\n\t" + "}" + : + : "r"(__d_tmem), + "l"(__a_desc), + "l"(__b_desc), + "r"(__idesc), + "r"(static_cast<_CUDA_VSTD::uint32_t>(__enable_input_d)) + : "memory"); + } + else _CCCL_IF_CONSTEXPR (__kind == kind_tf32) + { + asm volatile( + "{\n\t .reg .pred PRED_enable_input_d; \n\t" + 
"setp.ne.b32 PRED_enable_input_d, %4, 0;\n\t" + "tcgen05.mma.ws.cta_group::1.kind::tf32.collector::b3::fill [%0], %1, %2, %3, PRED_enable_input_d;\n\t" + "}" + : + : "r"(__d_tmem), + "l"(__a_desc), + "l"(__b_desc), + "r"(__idesc), + "r"(static_cast<_CUDA_VSTD::uint32_t>(__enable_input_d)) + : "memory"); + } + else _CCCL_IF_CONSTEXPR (__kind == kind_f8f6f4) + { + asm volatile( + "{\n\t .reg .pred PRED_enable_input_d; \n\t" + "setp.ne.b32 PRED_enable_input_d, %4, 0;\n\t" + "tcgen05.mma.ws.cta_group::1.kind::f8f6f4.collector::b3::fill [%0], %1, %2, %3, PRED_enable_input_d;\n\t" + "}" + : + : "r"(__d_tmem), + "l"(__a_desc), + "l"(__b_desc), + "r"(__idesc), + "r"(static_cast<_CUDA_VSTD::uint32_t>(__enable_input_d)) + : "memory"); + } + else _CCCL_IF_CONSTEXPR (__kind == kind_i8) + { + asm volatile( + "{\n\t .reg .pred PRED_enable_input_d; \n\t" + "setp.ne.b32 PRED_enable_input_d, %4, 0;\n\t" + "tcgen05.mma.ws.cta_group::1.kind::i8.collector::b3::fill [%0], %1, %2, %3, PRED_enable_input_d;\n\t" + "}" + : + : "r"(__d_tmem), + "l"(__a_desc), + "l"(__b_desc), + "r"(__idesc), + "r"(static_cast<_CUDA_VSTD::uint32_t>(__enable_input_d)) + : "memory"); + } +# else + // Unsupported architectures will have a linker error with a semi-decent error message + __cuda_ptx_tcgen05_mma_ws_collector_b3_fill_is_not_supported_before_SM_100a_SM_101a__(); +# endif +} +#endif // __cccl_ptx_isa >= 860 + +/* +// tcgen05.mma.ws.cta_group.kind.collector::b3::fill [d_tmem], [a_tmem], b_desc, idesc, enable_input_d, +zero_column_mask_desc; // PTX ISA 86, SM_100a, SM_101a +// .cta_group = { .cta_group::1 } +// .kind = { .kind::f16, .kind::tf32, .kind::f8f6f4, .kind::i8 } +template +__device__ static inline void tcgen05_mma_ws_tmem_a_collector_b3_fill( + cuda::ptx::cta_group_1_t, + cuda::ptx::kind_t kind, + uint32_t d_tmem, + uint32_t a_tmem, + uint64_t b_desc, + uint32_t idesc, + bool enable_input_d, + uint64_t zero_column_mask_desc); +*/ +#if __cccl_ptx_isa >= 860 +extern "C" _CCCL_DEVICE void +__cuda_ptx_tcgen05_mma_ws_tmem_a_collector_b3_fill_is_not_supported_before_SM_100a_SM_101a__(); +template +_CCCL_DEVICE static inline void tcgen05_mma_ws_tmem_a_collector_b3_fill( + cta_group_1_t, + kind_t<_Kind> __kind, + _CUDA_VSTD::uint32_t __d_tmem, + _CUDA_VSTD::uint32_t __a_tmem, + _CUDA_VSTD::uint64_t __b_desc, + _CUDA_VSTD::uint32_t __idesc, + bool __enable_input_d, + _CUDA_VSTD::uint64_t __zero_column_mask_desc) +{ + // __cta_group == cta_group_1 (due to parameter type constraint) + static_assert(__kind == kind_f16 || __kind == kind_tf32 || __kind == kind_f8f6f4 || __kind == kind_i8, ""); +# if _CCCL_CUDA_COMPILER(NVHPC) || __CUDA_ARCH_FEAT_SM100_ALL || __CUDA_ARCH_FEAT_SM101_ALL + _CCCL_IF_CONSTEXPR (__kind == kind_f16) + { + asm volatile( + "{\n\t .reg .pred PRED_enable_input_d; \n\t" + "setp.ne.b32 PRED_enable_input_d, %4, 0;\n\t" + "tcgen05.mma.ws.cta_group::1.kind::f16.collector::b3::fill [%0], [%1], %2, %3, PRED_enable_input_d, %5;\n\t" + "}" + : + : "r"(__d_tmem), + "r"(__a_tmem), + "l"(__b_desc), + "r"(__idesc), + "r"(static_cast<_CUDA_VSTD::uint32_t>(__enable_input_d)), + "l"(__zero_column_mask_desc) + : "memory"); + } + else _CCCL_IF_CONSTEXPR (__kind == kind_tf32) + { + asm volatile( + "{\n\t .reg .pred PRED_enable_input_d; \n\t" + "setp.ne.b32 PRED_enable_input_d, %4, 0;\n\t" + "tcgen05.mma.ws.cta_group::1.kind::tf32.collector::b3::fill [%0], [%1], %2, %3, PRED_enable_input_d, %5;\n\t" + "}" + : + : "r"(__d_tmem), + "r"(__a_tmem), + "l"(__b_desc), + "r"(__idesc), + 
"r"(static_cast<_CUDA_VSTD::uint32_t>(__enable_input_d)), + "l"(__zero_column_mask_desc) + : "memory"); + } + else _CCCL_IF_CONSTEXPR (__kind == kind_f8f6f4) + { + asm volatile( + "{\n\t .reg .pred PRED_enable_input_d; \n\t" + "setp.ne.b32 PRED_enable_input_d, %4, 0;\n\t" + "tcgen05.mma.ws.cta_group::1.kind::f8f6f4.collector::b3::fill [%0], [%1], %2, %3, PRED_enable_input_d, %5;\n\t" + "}" + : + : "r"(__d_tmem), + "r"(__a_tmem), + "l"(__b_desc), + "r"(__idesc), + "r"(static_cast<_CUDA_VSTD::uint32_t>(__enable_input_d)), + "l"(__zero_column_mask_desc) + : "memory"); + } + else _CCCL_IF_CONSTEXPR (__kind == kind_i8) + { + asm volatile( + "{\n\t .reg .pred PRED_enable_input_d; \n\t" + "setp.ne.b32 PRED_enable_input_d, %4, 0;\n\t" + "tcgen05.mma.ws.cta_group::1.kind::i8.collector::b3::fill [%0], [%1], %2, %3, PRED_enable_input_d, %5;\n\t" + "}" + : + : "r"(__d_tmem), + "r"(__a_tmem), + "l"(__b_desc), + "r"(__idesc), + "r"(static_cast<_CUDA_VSTD::uint32_t>(__enable_input_d)), + "l"(__zero_column_mask_desc) + : "memory"); + } +# else + // Unsupported architectures will have a linker error with a semi-decent error message + __cuda_ptx_tcgen05_mma_ws_tmem_a_collector_b3_fill_is_not_supported_before_SM_100a_SM_101a__(); +# endif +} +#endif // __cccl_ptx_isa >= 860 + +/* +// tcgen05.mma.ws.cta_group.kind.collector::b3::fill [d_tmem], [a_tmem], b_desc, idesc, enable_input_d; // PTX ISA 86, +SM_100a, SM_101a +// .cta_group = { .cta_group::1 } +// .kind = { .kind::f16, .kind::tf32, .kind::f8f6f4, .kind::i8 } +template +__device__ static inline void tcgen05_mma_ws_tmem_a_collector_b3_fill( + cuda::ptx::cta_group_1_t, + cuda::ptx::kind_t kind, + uint32_t d_tmem, + uint32_t a_tmem, + uint64_t b_desc, + uint32_t idesc, + bool enable_input_d); +*/ +#if __cccl_ptx_isa >= 860 +extern "C" _CCCL_DEVICE void +__cuda_ptx_tcgen05_mma_ws_tmem_a_collector_b3_fill_is_not_supported_before_SM_100a_SM_101a__(); +template +_CCCL_DEVICE static inline void tcgen05_mma_ws_tmem_a_collector_b3_fill( + cta_group_1_t, + kind_t<_Kind> __kind, + _CUDA_VSTD::uint32_t __d_tmem, + _CUDA_VSTD::uint32_t __a_tmem, + _CUDA_VSTD::uint64_t __b_desc, + _CUDA_VSTD::uint32_t __idesc, + bool __enable_input_d) +{ + // __cta_group == cta_group_1 (due to parameter type constraint) + static_assert(__kind == kind_f16 || __kind == kind_tf32 || __kind == kind_f8f6f4 || __kind == kind_i8, ""); +# if _CCCL_CUDA_COMPILER(NVHPC) || __CUDA_ARCH_FEAT_SM100_ALL || __CUDA_ARCH_FEAT_SM101_ALL + _CCCL_IF_CONSTEXPR (__kind == kind_f16) + { + asm volatile( + "{\n\t .reg .pred PRED_enable_input_d; \n\t" + "setp.ne.b32 PRED_enable_input_d, %4, 0;\n\t" + "tcgen05.mma.ws.cta_group::1.kind::f16.collector::b3::fill [%0], [%1], %2, %3, PRED_enable_input_d;\n\t" + "}" + : + : "r"(__d_tmem), + "r"(__a_tmem), + "l"(__b_desc), + "r"(__idesc), + "r"(static_cast<_CUDA_VSTD::uint32_t>(__enable_input_d)) + : "memory"); + } + else _CCCL_IF_CONSTEXPR (__kind == kind_tf32) + { + asm volatile( + "{\n\t .reg .pred PRED_enable_input_d; \n\t" + "setp.ne.b32 PRED_enable_input_d, %4, 0;\n\t" + "tcgen05.mma.ws.cta_group::1.kind::tf32.collector::b3::fill [%0], [%1], %2, %3, PRED_enable_input_d;\n\t" + "}" + : + : "r"(__d_tmem), + "r"(__a_tmem), + "l"(__b_desc), + "r"(__idesc), + "r"(static_cast<_CUDA_VSTD::uint32_t>(__enable_input_d)) + : "memory"); + } + else _CCCL_IF_CONSTEXPR (__kind == kind_f8f6f4) + { + asm volatile( + "{\n\t .reg .pred PRED_enable_input_d; \n\t" + "setp.ne.b32 PRED_enable_input_d, %4, 0;\n\t" + "tcgen05.mma.ws.cta_group::1.kind::f8f6f4.collector::b3::fill [%0], 
[%1], %2, %3, PRED_enable_input_d;\n\t" + "}" + : + : "r"(__d_tmem), + "r"(__a_tmem), + "l"(__b_desc), + "r"(__idesc), + "r"(static_cast<_CUDA_VSTD::uint32_t>(__enable_input_d)) + : "memory"); + } + else _CCCL_IF_CONSTEXPR (__kind == kind_i8) + { + asm volatile( + "{\n\t .reg .pred PRED_enable_input_d; \n\t" + "setp.ne.b32 PRED_enable_input_d, %4, 0;\n\t" + "tcgen05.mma.ws.cta_group::1.kind::i8.collector::b3::fill [%0], [%1], %2, %3, PRED_enable_input_d;\n\t" + "}" + : + : "r"(__d_tmem), + "r"(__a_tmem), + "l"(__b_desc), + "r"(__idesc), + "r"(static_cast<_CUDA_VSTD::uint32_t>(__enable_input_d)) + : "memory"); + } +# else + // Unsupported architectures will have a linker error with a semi-decent error message + __cuda_ptx_tcgen05_mma_ws_tmem_a_collector_b3_fill_is_not_supported_before_SM_100a_SM_101a__(); +# endif +} +#endif // __cccl_ptx_isa >= 860 + +/* +// tcgen05.mma.ws.cta_group.kind.collector::b3::use [d_tmem], a_desc, b_desc, idesc, enable_input_d, +zero_column_mask_desc; // PTX ISA 86, SM_100a, SM_101a +// .cta_group = { .cta_group::1 } +// .kind = { .kind::f16, .kind::tf32, .kind::f8f6f4, .kind::i8 } +template +__device__ static inline void tcgen05_mma_ws_collector_b3_use( + cuda::ptx::cta_group_1_t, + cuda::ptx::kind_t kind, + uint32_t d_tmem, + uint64_t a_desc, + uint64_t b_desc, + uint32_t idesc, + bool enable_input_d, + uint64_t zero_column_mask_desc); +*/ +#if __cccl_ptx_isa >= 860 +extern "C" _CCCL_DEVICE void __cuda_ptx_tcgen05_mma_ws_collector_b3_use_is_not_supported_before_SM_100a_SM_101a__(); +template +_CCCL_DEVICE static inline void tcgen05_mma_ws_collector_b3_use( + cta_group_1_t, + kind_t<_Kind> __kind, + _CUDA_VSTD::uint32_t __d_tmem, + _CUDA_VSTD::uint64_t __a_desc, + _CUDA_VSTD::uint64_t __b_desc, + _CUDA_VSTD::uint32_t __idesc, + bool __enable_input_d, + _CUDA_VSTD::uint64_t __zero_column_mask_desc) +{ + // __cta_group == cta_group_1 (due to parameter type constraint) + static_assert(__kind == kind_f16 || __kind == kind_tf32 || __kind == kind_f8f6f4 || __kind == kind_i8, ""); +# if _CCCL_CUDA_COMPILER(NVHPC) || __CUDA_ARCH_FEAT_SM100_ALL || __CUDA_ARCH_FEAT_SM101_ALL + _CCCL_IF_CONSTEXPR (__kind == kind_f16) + { + asm volatile( + "{\n\t .reg .pred PRED_enable_input_d; \n\t" + "setp.ne.b32 PRED_enable_input_d, %4, 0;\n\t" + "tcgen05.mma.ws.cta_group::1.kind::f16.collector::b3::use [%0], %1, %2, %3, PRED_enable_input_d, %5;\n\t" + "}" + : + : "r"(__d_tmem), + "l"(__a_desc), + "l"(__b_desc), + "r"(__idesc), + "r"(static_cast<_CUDA_VSTD::uint32_t>(__enable_input_d)), + "l"(__zero_column_mask_desc) + : "memory"); + } + else _CCCL_IF_CONSTEXPR (__kind == kind_tf32) + { + asm volatile( + "{\n\t .reg .pred PRED_enable_input_d; \n\t" + "setp.ne.b32 PRED_enable_input_d, %4, 0;\n\t" + "tcgen05.mma.ws.cta_group::1.kind::tf32.collector::b3::use [%0], %1, %2, %3, PRED_enable_input_d, %5;\n\t" + "}" + : + : "r"(__d_tmem), + "l"(__a_desc), + "l"(__b_desc), + "r"(__idesc), + "r"(static_cast<_CUDA_VSTD::uint32_t>(__enable_input_d)), + "l"(__zero_column_mask_desc) + : "memory"); + } + else _CCCL_IF_CONSTEXPR (__kind == kind_f8f6f4) + { + asm volatile( + "{\n\t .reg .pred PRED_enable_input_d; \n\t" + "setp.ne.b32 PRED_enable_input_d, %4, 0;\n\t" + "tcgen05.mma.ws.cta_group::1.kind::f8f6f4.collector::b3::use [%0], %1, %2, %3, PRED_enable_input_d, %5;\n\t" + "}" + : + : "r"(__d_tmem), + "l"(__a_desc), + "l"(__b_desc), + "r"(__idesc), + "r"(static_cast<_CUDA_VSTD::uint32_t>(__enable_input_d)), + "l"(__zero_column_mask_desc) + : "memory"); + } + else _CCCL_IF_CONSTEXPR (__kind == 
kind_i8) + { + asm volatile( + "{\n\t .reg .pred PRED_enable_input_d; \n\t" + "setp.ne.b32 PRED_enable_input_d, %4, 0;\n\t" + "tcgen05.mma.ws.cta_group::1.kind::i8.collector::b3::use [%0], %1, %2, %3, PRED_enable_input_d, %5;\n\t" + "}" + : + : "r"(__d_tmem), + "l"(__a_desc), + "l"(__b_desc), + "r"(__idesc), + "r"(static_cast<_CUDA_VSTD::uint32_t>(__enable_input_d)), + "l"(__zero_column_mask_desc) + : "memory"); + } +# else + // Unsupported architectures will have a linker error with a semi-decent error message + __cuda_ptx_tcgen05_mma_ws_collector_b3_use_is_not_supported_before_SM_100a_SM_101a__(); +# endif +} +#endif // __cccl_ptx_isa >= 860 + +/* +// tcgen05.mma.ws.cta_group.kind.collector::b3::use [d_tmem], a_desc, b_desc, idesc, enable_input_d; // PTX ISA 86, +SM_100a, SM_101a +// .cta_group = { .cta_group::1 } +// .kind = { .kind::f16, .kind::tf32, .kind::f8f6f4, .kind::i8 } +template +__device__ static inline void tcgen05_mma_ws_collector_b3_use( + cuda::ptx::cta_group_1_t, + cuda::ptx::kind_t kind, + uint32_t d_tmem, + uint64_t a_desc, + uint64_t b_desc, + uint32_t idesc, + bool enable_input_d); +*/ +#if __cccl_ptx_isa >= 860 +extern "C" _CCCL_DEVICE void __cuda_ptx_tcgen05_mma_ws_collector_b3_use_is_not_supported_before_SM_100a_SM_101a__(); +template +_CCCL_DEVICE static inline void tcgen05_mma_ws_collector_b3_use( + cta_group_1_t, + kind_t<_Kind> __kind, + _CUDA_VSTD::uint32_t __d_tmem, + _CUDA_VSTD::uint64_t __a_desc, + _CUDA_VSTD::uint64_t __b_desc, + _CUDA_VSTD::uint32_t __idesc, + bool __enable_input_d) +{ + // __cta_group == cta_group_1 (due to parameter type constraint) + static_assert(__kind == kind_f16 || __kind == kind_tf32 || __kind == kind_f8f6f4 || __kind == kind_i8, ""); +# if _CCCL_CUDA_COMPILER(NVHPC) || __CUDA_ARCH_FEAT_SM100_ALL || __CUDA_ARCH_FEAT_SM101_ALL + _CCCL_IF_CONSTEXPR (__kind == kind_f16) + { + asm volatile( + "{\n\t .reg .pred PRED_enable_input_d; \n\t" + "setp.ne.b32 PRED_enable_input_d, %4, 0;\n\t" + "tcgen05.mma.ws.cta_group::1.kind::f16.collector::b3::use [%0], %1, %2, %3, PRED_enable_input_d;\n\t" + "}" + : + : "r"(__d_tmem), + "l"(__a_desc), + "l"(__b_desc), + "r"(__idesc), + "r"(static_cast<_CUDA_VSTD::uint32_t>(__enable_input_d)) + : "memory"); + } + else _CCCL_IF_CONSTEXPR (__kind == kind_tf32) + { + asm volatile( + "{\n\t .reg .pred PRED_enable_input_d; \n\t" + "setp.ne.b32 PRED_enable_input_d, %4, 0;\n\t" + "tcgen05.mma.ws.cta_group::1.kind::tf32.collector::b3::use [%0], %1, %2, %3, PRED_enable_input_d;\n\t" + "}" + : + : "r"(__d_tmem), + "l"(__a_desc), + "l"(__b_desc), + "r"(__idesc), + "r"(static_cast<_CUDA_VSTD::uint32_t>(__enable_input_d)) + : "memory"); + } + else _CCCL_IF_CONSTEXPR (__kind == kind_f8f6f4) + { + asm volatile( + "{\n\t .reg .pred PRED_enable_input_d; \n\t" + "setp.ne.b32 PRED_enable_input_d, %4, 0;\n\t" + "tcgen05.mma.ws.cta_group::1.kind::f8f6f4.collector::b3::use [%0], %1, %2, %3, PRED_enable_input_d;\n\t" + "}" + : + : "r"(__d_tmem), + "l"(__a_desc), + "l"(__b_desc), + "r"(__idesc), + "r"(static_cast<_CUDA_VSTD::uint32_t>(__enable_input_d)) + : "memory"); + } + else _CCCL_IF_CONSTEXPR (__kind == kind_i8) + { + asm volatile( + "{\n\t .reg .pred PRED_enable_input_d; \n\t" + "setp.ne.b32 PRED_enable_input_d, %4, 0;\n\t" + "tcgen05.mma.ws.cta_group::1.kind::i8.collector::b3::use [%0], %1, %2, %3, PRED_enable_input_d;\n\t" + "}" + : + : "r"(__d_tmem), + "l"(__a_desc), + "l"(__b_desc), + "r"(__idesc), + "r"(static_cast<_CUDA_VSTD::uint32_t>(__enable_input_d)) + : "memory"); + } +# else + // Unsupported architectures will 
have a linker error with a semi-decent error message + __cuda_ptx_tcgen05_mma_ws_collector_b3_use_is_not_supported_before_SM_100a_SM_101a__(); +# endif +} +#endif // __cccl_ptx_isa >= 860 + +/* +// tcgen05.mma.ws.cta_group.kind.collector::b3::use [d_tmem], [a_tmem], b_desc, idesc, enable_input_d, +zero_column_mask_desc; // PTX ISA 86, SM_100a, SM_101a +// .cta_group = { .cta_group::1 } +// .kind = { .kind::f16, .kind::tf32, .kind::f8f6f4, .kind::i8 } +template +__device__ static inline void tcgen05_mma_ws_tmem_a_collector_b3_use( + cuda::ptx::cta_group_1_t, + cuda::ptx::kind_t kind, + uint32_t d_tmem, + uint32_t a_tmem, + uint64_t b_desc, + uint32_t idesc, + bool enable_input_d, + uint64_t zero_column_mask_desc); +*/ +#if __cccl_ptx_isa >= 860 +extern "C" _CCCL_DEVICE void +__cuda_ptx_tcgen05_mma_ws_tmem_a_collector_b3_use_is_not_supported_before_SM_100a_SM_101a__(); +template +_CCCL_DEVICE static inline void tcgen05_mma_ws_tmem_a_collector_b3_use( + cta_group_1_t, + kind_t<_Kind> __kind, + _CUDA_VSTD::uint32_t __d_tmem, + _CUDA_VSTD::uint32_t __a_tmem, + _CUDA_VSTD::uint64_t __b_desc, + _CUDA_VSTD::uint32_t __idesc, + bool __enable_input_d, + _CUDA_VSTD::uint64_t __zero_column_mask_desc) +{ + // __cta_group == cta_group_1 (due to parameter type constraint) + static_assert(__kind == kind_f16 || __kind == kind_tf32 || __kind == kind_f8f6f4 || __kind == kind_i8, ""); +# if _CCCL_CUDA_COMPILER(NVHPC) || __CUDA_ARCH_FEAT_SM100_ALL || __CUDA_ARCH_FEAT_SM101_ALL + _CCCL_IF_CONSTEXPR (__kind == kind_f16) + { + asm volatile( + "{\n\t .reg .pred PRED_enable_input_d; \n\t" + "setp.ne.b32 PRED_enable_input_d, %4, 0;\n\t" + "tcgen05.mma.ws.cta_group::1.kind::f16.collector::b3::use [%0], [%1], %2, %3, PRED_enable_input_d, %5;\n\t" + "}" + : + : "r"(__d_tmem), + "r"(__a_tmem), + "l"(__b_desc), + "r"(__idesc), + "r"(static_cast<_CUDA_VSTD::uint32_t>(__enable_input_d)), + "l"(__zero_column_mask_desc) + : "memory"); + } + else _CCCL_IF_CONSTEXPR (__kind == kind_tf32) + { + asm volatile( + "{\n\t .reg .pred PRED_enable_input_d; \n\t" + "setp.ne.b32 PRED_enable_input_d, %4, 0;\n\t" + "tcgen05.mma.ws.cta_group::1.kind::tf32.collector::b3::use [%0], [%1], %2, %3, PRED_enable_input_d, %5;\n\t" + "}" + : + : "r"(__d_tmem), + "r"(__a_tmem), + "l"(__b_desc), + "r"(__idesc), + "r"(static_cast<_CUDA_VSTD::uint32_t>(__enable_input_d)), + "l"(__zero_column_mask_desc) + : "memory"); + } + else _CCCL_IF_CONSTEXPR (__kind == kind_f8f6f4) + { + asm volatile( + "{\n\t .reg .pred PRED_enable_input_d; \n\t" + "setp.ne.b32 PRED_enable_input_d, %4, 0;\n\t" + "tcgen05.mma.ws.cta_group::1.kind::f8f6f4.collector::b3::use [%0], [%1], %2, %3, PRED_enable_input_d, %5;\n\t" + "}" + : + : "r"(__d_tmem), + "r"(__a_tmem), + "l"(__b_desc), + "r"(__idesc), + "r"(static_cast<_CUDA_VSTD::uint32_t>(__enable_input_d)), + "l"(__zero_column_mask_desc) + : "memory"); + } + else _CCCL_IF_CONSTEXPR (__kind == kind_i8) + { + asm volatile( + "{\n\t .reg .pred PRED_enable_input_d; \n\t" + "setp.ne.b32 PRED_enable_input_d, %4, 0;\n\t" + "tcgen05.mma.ws.cta_group::1.kind::i8.collector::b3::use [%0], [%1], %2, %3, PRED_enable_input_d, %5;\n\t" + "}" + : + : "r"(__d_tmem), + "r"(__a_tmem), + "l"(__b_desc), + "r"(__idesc), + "r"(static_cast<_CUDA_VSTD::uint32_t>(__enable_input_d)), + "l"(__zero_column_mask_desc) + : "memory"); + } +# else + // Unsupported architectures will have a linker error with a semi-decent error message + __cuda_ptx_tcgen05_mma_ws_tmem_a_collector_b3_use_is_not_supported_before_SM_100a_SM_101a__(); +# endif +} +#endif // 
__cccl_ptx_isa >= 860 + +/* +// tcgen05.mma.ws.cta_group.kind.collector::b3::use [d_tmem], [a_tmem], b_desc, idesc, enable_input_d; // PTX ISA 86, +SM_100a, SM_101a +// .cta_group = { .cta_group::1 } +// .kind = { .kind::f16, .kind::tf32, .kind::f8f6f4, .kind::i8 } +template +__device__ static inline void tcgen05_mma_ws_tmem_a_collector_b3_use( + cuda::ptx::cta_group_1_t, + cuda::ptx::kind_t kind, + uint32_t d_tmem, + uint32_t a_tmem, + uint64_t b_desc, + uint32_t idesc, + bool enable_input_d); +*/ +#if __cccl_ptx_isa >= 860 +extern "C" _CCCL_DEVICE void +__cuda_ptx_tcgen05_mma_ws_tmem_a_collector_b3_use_is_not_supported_before_SM_100a_SM_101a__(); +template +_CCCL_DEVICE static inline void tcgen05_mma_ws_tmem_a_collector_b3_use( + cta_group_1_t, + kind_t<_Kind> __kind, + _CUDA_VSTD::uint32_t __d_tmem, + _CUDA_VSTD::uint32_t __a_tmem, + _CUDA_VSTD::uint64_t __b_desc, + _CUDA_VSTD::uint32_t __idesc, + bool __enable_input_d) +{ + // __cta_group == cta_group_1 (due to parameter type constraint) + static_assert(__kind == kind_f16 || __kind == kind_tf32 || __kind == kind_f8f6f4 || __kind == kind_i8, ""); +# if _CCCL_CUDA_COMPILER(NVHPC) || __CUDA_ARCH_FEAT_SM100_ALL || __CUDA_ARCH_FEAT_SM101_ALL + _CCCL_IF_CONSTEXPR (__kind == kind_f16) + { + asm volatile( + "{\n\t .reg .pred PRED_enable_input_d; \n\t" + "setp.ne.b32 PRED_enable_input_d, %4, 0;\n\t" + "tcgen05.mma.ws.cta_group::1.kind::f16.collector::b3::use [%0], [%1], %2, %3, PRED_enable_input_d;\n\t" + "}" + : + : "r"(__d_tmem), + "r"(__a_tmem), + "l"(__b_desc), + "r"(__idesc), + "r"(static_cast<_CUDA_VSTD::uint32_t>(__enable_input_d)) + : "memory"); + } + else _CCCL_IF_CONSTEXPR (__kind == kind_tf32) + { + asm volatile( + "{\n\t .reg .pred PRED_enable_input_d; \n\t" + "setp.ne.b32 PRED_enable_input_d, %4, 0;\n\t" + "tcgen05.mma.ws.cta_group::1.kind::tf32.collector::b3::use [%0], [%1], %2, %3, PRED_enable_input_d;\n\t" + "}" + : + : "r"(__d_tmem), + "r"(__a_tmem), + "l"(__b_desc), + "r"(__idesc), + "r"(static_cast<_CUDA_VSTD::uint32_t>(__enable_input_d)) + : "memory"); + } + else _CCCL_IF_CONSTEXPR (__kind == kind_f8f6f4) + { + asm volatile( + "{\n\t .reg .pred PRED_enable_input_d; \n\t" + "setp.ne.b32 PRED_enable_input_d, %4, 0;\n\t" + "tcgen05.mma.ws.cta_group::1.kind::f8f6f4.collector::b3::use [%0], [%1], %2, %3, PRED_enable_input_d;\n\t" + "}" + : + : "r"(__d_tmem), + "r"(__a_tmem), + "l"(__b_desc), + "r"(__idesc), + "r"(static_cast<_CUDA_VSTD::uint32_t>(__enable_input_d)) + : "memory"); + } + else _CCCL_IF_CONSTEXPR (__kind == kind_i8) + { + asm volatile( + "{\n\t .reg .pred PRED_enable_input_d; \n\t" + "setp.ne.b32 PRED_enable_input_d, %4, 0;\n\t" + "tcgen05.mma.ws.cta_group::1.kind::i8.collector::b3::use [%0], [%1], %2, %3, PRED_enable_input_d;\n\t" + "}" + : + : "r"(__d_tmem), + "r"(__a_tmem), + "l"(__b_desc), + "r"(__idesc), + "r"(static_cast<_CUDA_VSTD::uint32_t>(__enable_input_d)) + : "memory"); + } +# else + // Unsupported architectures will have a linker error with a semi-decent error message + __cuda_ptx_tcgen05_mma_ws_tmem_a_collector_b3_use_is_not_supported_before_SM_100a_SM_101a__(); +# endif +} +#endif // __cccl_ptx_isa >= 860 + +/* +// tcgen05.mma.ws.cta_group.kind.collector::b3::lastuse [d_tmem], a_desc, b_desc, idesc, enable_input_d, +zero_column_mask_desc; // PTX ISA 86, SM_100a, SM_101a +// .cta_group = { .cta_group::1 } +// .kind = { .kind::f16, .kind::tf32, .kind::f8f6f4, .kind::i8 } +template +__device__ static inline void tcgen05_mma_ws_collector_b3_lastuse( + cuda::ptx::cta_group_1_t, + cuda::ptx::kind_t kind, + 
uint32_t d_tmem, + uint64_t a_desc, + uint64_t b_desc, + uint32_t idesc, + bool enable_input_d, + uint64_t zero_column_mask_desc); +*/ +#if __cccl_ptx_isa >= 860 +extern "C" _CCCL_DEVICE void __cuda_ptx_tcgen05_mma_ws_collector_b3_lastuse_is_not_supported_before_SM_100a_SM_101a__(); +template +_CCCL_DEVICE static inline void tcgen05_mma_ws_collector_b3_lastuse( + cta_group_1_t, + kind_t<_Kind> __kind, + _CUDA_VSTD::uint32_t __d_tmem, + _CUDA_VSTD::uint64_t __a_desc, + _CUDA_VSTD::uint64_t __b_desc, + _CUDA_VSTD::uint32_t __idesc, + bool __enable_input_d, + _CUDA_VSTD::uint64_t __zero_column_mask_desc) +{ + // __cta_group == cta_group_1 (due to parameter type constraint) + static_assert(__kind == kind_f16 || __kind == kind_tf32 || __kind == kind_f8f6f4 || __kind == kind_i8, ""); +# if _CCCL_CUDA_COMPILER(NVHPC) || __CUDA_ARCH_FEAT_SM100_ALL || __CUDA_ARCH_FEAT_SM101_ALL + _CCCL_IF_CONSTEXPR (__kind == kind_f16) + { + asm volatile( + "{\n\t .reg .pred PRED_enable_input_d; \n\t" + "setp.ne.b32 PRED_enable_input_d, %4, 0;\n\t" + "tcgen05.mma.ws.cta_group::1.kind::f16.collector::b3::lastuse [%0], %1, %2, %3, PRED_enable_input_d, %5;\n\t" + "}" + : + : "r"(__d_tmem), + "l"(__a_desc), + "l"(__b_desc), + "r"(__idesc), + "r"(static_cast<_CUDA_VSTD::uint32_t>(__enable_input_d)), + "l"(__zero_column_mask_desc) + : "memory"); + } + else _CCCL_IF_CONSTEXPR (__kind == kind_tf32) + { + asm volatile( + "{\n\t .reg .pred PRED_enable_input_d; \n\t" + "setp.ne.b32 PRED_enable_input_d, %4, 0;\n\t" + "tcgen05.mma.ws.cta_group::1.kind::tf32.collector::b3::lastuse [%0], %1, %2, %3, PRED_enable_input_d, %5;\n\t" + "}" + : + : "r"(__d_tmem), + "l"(__a_desc), + "l"(__b_desc), + "r"(__idesc), + "r"(static_cast<_CUDA_VSTD::uint32_t>(__enable_input_d)), + "l"(__zero_column_mask_desc) + : "memory"); + } + else _CCCL_IF_CONSTEXPR (__kind == kind_f8f6f4) + { + asm volatile( + "{\n\t .reg .pred PRED_enable_input_d; \n\t" + "setp.ne.b32 PRED_enable_input_d, %4, 0;\n\t" + "tcgen05.mma.ws.cta_group::1.kind::f8f6f4.collector::b3::lastuse [%0], %1, %2, %3, PRED_enable_input_d, %5;\n\t" + "}" + : + : "r"(__d_tmem), + "l"(__a_desc), + "l"(__b_desc), + "r"(__idesc), + "r"(static_cast<_CUDA_VSTD::uint32_t>(__enable_input_d)), + "l"(__zero_column_mask_desc) + : "memory"); + } + else _CCCL_IF_CONSTEXPR (__kind == kind_i8) + { + asm volatile( + "{\n\t .reg .pred PRED_enable_input_d; \n\t" + "setp.ne.b32 PRED_enable_input_d, %4, 0;\n\t" + "tcgen05.mma.ws.cta_group::1.kind::i8.collector::b3::lastuse [%0], %1, %2, %3, PRED_enable_input_d, %5;\n\t" + "}" + : + : "r"(__d_tmem), + "l"(__a_desc), + "l"(__b_desc), + "r"(__idesc), + "r"(static_cast<_CUDA_VSTD::uint32_t>(__enable_input_d)), + "l"(__zero_column_mask_desc) + : "memory"); + } +# else + // Unsupported architectures will have a linker error with a semi-decent error message + __cuda_ptx_tcgen05_mma_ws_collector_b3_lastuse_is_not_supported_before_SM_100a_SM_101a__(); +# endif +} +#endif // __cccl_ptx_isa >= 860 + +/* +// tcgen05.mma.ws.cta_group.kind.collector::b3::lastuse [d_tmem], a_desc, b_desc, idesc, enable_input_d; // PTX ISA 86, +SM_100a, SM_101a +// .cta_group = { .cta_group::1 } +// .kind = { .kind::f16, .kind::tf32, .kind::f8f6f4, .kind::i8 } +template +__device__ static inline void tcgen05_mma_ws_collector_b3_lastuse( + cuda::ptx::cta_group_1_t, + cuda::ptx::kind_t kind, + uint32_t d_tmem, + uint64_t a_desc, + uint64_t b_desc, + uint32_t idesc, + bool enable_input_d); +*/ +#if __cccl_ptx_isa >= 860 +extern "C" _CCCL_DEVICE void 
__cuda_ptx_tcgen05_mma_ws_collector_b3_lastuse_is_not_supported_before_SM_100a_SM_101a__(); +template +_CCCL_DEVICE static inline void tcgen05_mma_ws_collector_b3_lastuse( + cta_group_1_t, + kind_t<_Kind> __kind, + _CUDA_VSTD::uint32_t __d_tmem, + _CUDA_VSTD::uint64_t __a_desc, + _CUDA_VSTD::uint64_t __b_desc, + _CUDA_VSTD::uint32_t __idesc, + bool __enable_input_d) +{ + // __cta_group == cta_group_1 (due to parameter type constraint) + static_assert(__kind == kind_f16 || __kind == kind_tf32 || __kind == kind_f8f6f4 || __kind == kind_i8, ""); +# if _CCCL_CUDA_COMPILER(NVHPC) || __CUDA_ARCH_FEAT_SM100_ALL || __CUDA_ARCH_FEAT_SM101_ALL + _CCCL_IF_CONSTEXPR (__kind == kind_f16) + { + asm volatile( + "{\n\t .reg .pred PRED_enable_input_d; \n\t" + "setp.ne.b32 PRED_enable_input_d, %4, 0;\n\t" + "tcgen05.mma.ws.cta_group::1.kind::f16.collector::b3::lastuse [%0], %1, %2, %3, PRED_enable_input_d;\n\t" + "}" + : + : "r"(__d_tmem), + "l"(__a_desc), + "l"(__b_desc), + "r"(__idesc), + "r"(static_cast<_CUDA_VSTD::uint32_t>(__enable_input_d)) + : "memory"); + } + else _CCCL_IF_CONSTEXPR (__kind == kind_tf32) + { + asm volatile( + "{\n\t .reg .pred PRED_enable_input_d; \n\t" + "setp.ne.b32 PRED_enable_input_d, %4, 0;\n\t" + "tcgen05.mma.ws.cta_group::1.kind::tf32.collector::b3::lastuse [%0], %1, %2, %3, PRED_enable_input_d;\n\t" + "}" + : + : "r"(__d_tmem), + "l"(__a_desc), + "l"(__b_desc), + "r"(__idesc), + "r"(static_cast<_CUDA_VSTD::uint32_t>(__enable_input_d)) + : "memory"); + } + else _CCCL_IF_CONSTEXPR (__kind == kind_f8f6f4) + { + asm volatile( + "{\n\t .reg .pred PRED_enable_input_d; \n\t" + "setp.ne.b32 PRED_enable_input_d, %4, 0;\n\t" + "tcgen05.mma.ws.cta_group::1.kind::f8f6f4.collector::b3::lastuse [%0], %1, %2, %3, PRED_enable_input_d;\n\t" + "}" + : + : "r"(__d_tmem), + "l"(__a_desc), + "l"(__b_desc), + "r"(__idesc), + "r"(static_cast<_CUDA_VSTD::uint32_t>(__enable_input_d)) + : "memory"); + } + else _CCCL_IF_CONSTEXPR (__kind == kind_i8) + { + asm volatile( + "{\n\t .reg .pred PRED_enable_input_d; \n\t" + "setp.ne.b32 PRED_enable_input_d, %4, 0;\n\t" + "tcgen05.mma.ws.cta_group::1.kind::i8.collector::b3::lastuse [%0], %1, %2, %3, PRED_enable_input_d;\n\t" + "}" + : + : "r"(__d_tmem), + "l"(__a_desc), + "l"(__b_desc), + "r"(__idesc), + "r"(static_cast<_CUDA_VSTD::uint32_t>(__enable_input_d)) + : "memory"); + } +# else + // Unsupported architectures will have a linker error with a semi-decent error message + __cuda_ptx_tcgen05_mma_ws_collector_b3_lastuse_is_not_supported_before_SM_100a_SM_101a__(); +# endif +} +#endif // __cccl_ptx_isa >= 860 + +/* +// tcgen05.mma.ws.cta_group.kind.collector::b3::lastuse [d_tmem], [a_tmem], b_desc, idesc, enable_input_d, +zero_column_mask_desc; // PTX ISA 86, SM_100a, SM_101a +// .cta_group = { .cta_group::1 } +// .kind = { .kind::f16, .kind::tf32, .kind::f8f6f4, .kind::i8 } +template +__device__ static inline void tcgen05_mma_ws_tmem_a_collector_b3_lastuse( + cuda::ptx::cta_group_1_t, + cuda::ptx::kind_t kind, + uint32_t d_tmem, + uint32_t a_tmem, + uint64_t b_desc, + uint32_t idesc, + bool enable_input_d, + uint64_t zero_column_mask_desc); +*/ +#if __cccl_ptx_isa >= 860 +extern "C" _CCCL_DEVICE void +__cuda_ptx_tcgen05_mma_ws_tmem_a_collector_b3_lastuse_is_not_supported_before_SM_100a_SM_101a__(); +template +_CCCL_DEVICE static inline void tcgen05_mma_ws_tmem_a_collector_b3_lastuse( + cta_group_1_t, + kind_t<_Kind> __kind, + _CUDA_VSTD::uint32_t __d_tmem, + _CUDA_VSTD::uint32_t __a_tmem, + _CUDA_VSTD::uint64_t __b_desc, + _CUDA_VSTD::uint32_t __idesc, + 
bool __enable_input_d, + _CUDA_VSTD::uint64_t __zero_column_mask_desc) +{ + // __cta_group == cta_group_1 (due to parameter type constraint) + static_assert(__kind == kind_f16 || __kind == kind_tf32 || __kind == kind_f8f6f4 || __kind == kind_i8, ""); +# if _CCCL_CUDA_COMPILER(NVHPC) || __CUDA_ARCH_FEAT_SM100_ALL || __CUDA_ARCH_FEAT_SM101_ALL + _CCCL_IF_CONSTEXPR (__kind == kind_f16) + { + asm volatile( + "{\n\t .reg .pred PRED_enable_input_d; \n\t" + "setp.ne.b32 PRED_enable_input_d, %4, 0;\n\t" + "tcgen05.mma.ws.cta_group::1.kind::f16.collector::b3::lastuse [%0], [%1], %2, %3, PRED_enable_input_d, %5;\n\t" + "}" + : + : "r"(__d_tmem), + "r"(__a_tmem), + "l"(__b_desc), + "r"(__idesc), + "r"(static_cast<_CUDA_VSTD::uint32_t>(__enable_input_d)), + "l"(__zero_column_mask_desc) + : "memory"); + } + else _CCCL_IF_CONSTEXPR (__kind == kind_tf32) + { + asm volatile( + "{\n\t .reg .pred PRED_enable_input_d; \n\t" + "setp.ne.b32 PRED_enable_input_d, %4, 0;\n\t" + "tcgen05.mma.ws.cta_group::1.kind::tf32.collector::b3::lastuse [%0], [%1], %2, %3, PRED_enable_input_d, %5;\n\t" + "}" + : + : "r"(__d_tmem), + "r"(__a_tmem), + "l"(__b_desc), + "r"(__idesc), + "r"(static_cast<_CUDA_VSTD::uint32_t>(__enable_input_d)), + "l"(__zero_column_mask_desc) + : "memory"); + } + else _CCCL_IF_CONSTEXPR (__kind == kind_f8f6f4) + { + asm volatile( + "{\n\t .reg .pred PRED_enable_input_d; \n\t" + "setp.ne.b32 PRED_enable_input_d, %4, 0;\n\t" + "tcgen05.mma.ws.cta_group::1.kind::f8f6f4.collector::b3::lastuse [%0], [%1], %2, %3, PRED_enable_input_d, %5;\n\t" + "}" + : + : "r"(__d_tmem), + "r"(__a_tmem), + "l"(__b_desc), + "r"(__idesc), + "r"(static_cast<_CUDA_VSTD::uint32_t>(__enable_input_d)), + "l"(__zero_column_mask_desc) + : "memory"); + } + else _CCCL_IF_CONSTEXPR (__kind == kind_i8) + { + asm volatile( + "{\n\t .reg .pred PRED_enable_input_d; \n\t" + "setp.ne.b32 PRED_enable_input_d, %4, 0;\n\t" + "tcgen05.mma.ws.cta_group::1.kind::i8.collector::b3::lastuse [%0], [%1], %2, %3, PRED_enable_input_d, %5;\n\t" + "}" + : + : "r"(__d_tmem), + "r"(__a_tmem), + "l"(__b_desc), + "r"(__idesc), + "r"(static_cast<_CUDA_VSTD::uint32_t>(__enable_input_d)), + "l"(__zero_column_mask_desc) + : "memory"); + } +# else + // Unsupported architectures will have a linker error with a semi-decent error message + __cuda_ptx_tcgen05_mma_ws_tmem_a_collector_b3_lastuse_is_not_supported_before_SM_100a_SM_101a__(); +# endif +} +#endif // __cccl_ptx_isa >= 860 + +/* +// tcgen05.mma.ws.cta_group.kind.collector::b3::lastuse [d_tmem], [a_tmem], b_desc, idesc, enable_input_d; // PTX ISA +86, SM_100a, SM_101a +// .cta_group = { .cta_group::1 } +// .kind = { .kind::f16, .kind::tf32, .kind::f8f6f4, .kind::i8 } +template +__device__ static inline void tcgen05_mma_ws_tmem_a_collector_b3_lastuse( + cuda::ptx::cta_group_1_t, + cuda::ptx::kind_t kind, + uint32_t d_tmem, + uint32_t a_tmem, + uint64_t b_desc, + uint32_t idesc, + bool enable_input_d); +*/ +#if __cccl_ptx_isa >= 860 +extern "C" _CCCL_DEVICE void +__cuda_ptx_tcgen05_mma_ws_tmem_a_collector_b3_lastuse_is_not_supported_before_SM_100a_SM_101a__(); +template +_CCCL_DEVICE static inline void tcgen05_mma_ws_tmem_a_collector_b3_lastuse( + cta_group_1_t, + kind_t<_Kind> __kind, + _CUDA_VSTD::uint32_t __d_tmem, + _CUDA_VSTD::uint32_t __a_tmem, + _CUDA_VSTD::uint64_t __b_desc, + _CUDA_VSTD::uint32_t __idesc, + bool __enable_input_d) +{ + // __cta_group == cta_group_1 (due to parameter type constraint) + static_assert(__kind == kind_f16 || __kind == kind_tf32 || __kind == kind_f8f6f4 || __kind == kind_i8, 
""); +# if _CCCL_CUDA_COMPILER(NVHPC) || __CUDA_ARCH_FEAT_SM100_ALL || __CUDA_ARCH_FEAT_SM101_ALL + _CCCL_IF_CONSTEXPR (__kind == kind_f16) + { + asm volatile( + "{\n\t .reg .pred PRED_enable_input_d; \n\t" + "setp.ne.b32 PRED_enable_input_d, %4, 0;\n\t" + "tcgen05.mma.ws.cta_group::1.kind::f16.collector::b3::lastuse [%0], [%1], %2, %3, PRED_enable_input_d;\n\t" + "}" + : + : "r"(__d_tmem), + "r"(__a_tmem), + "l"(__b_desc), + "r"(__idesc), + "r"(static_cast<_CUDA_VSTD::uint32_t>(__enable_input_d)) + : "memory"); + } + else _CCCL_IF_CONSTEXPR (__kind == kind_tf32) + { + asm volatile( + "{\n\t .reg .pred PRED_enable_input_d; \n\t" + "setp.ne.b32 PRED_enable_input_d, %4, 0;\n\t" + "tcgen05.mma.ws.cta_group::1.kind::tf32.collector::b3::lastuse [%0], [%1], %2, %3, PRED_enable_input_d;\n\t" + "}" + : + : "r"(__d_tmem), + "r"(__a_tmem), + "l"(__b_desc), + "r"(__idesc), + "r"(static_cast<_CUDA_VSTD::uint32_t>(__enable_input_d)) + : "memory"); + } + else _CCCL_IF_CONSTEXPR (__kind == kind_f8f6f4) + { + asm volatile( + "{\n\t .reg .pred PRED_enable_input_d; \n\t" + "setp.ne.b32 PRED_enable_input_d, %4, 0;\n\t" + "tcgen05.mma.ws.cta_group::1.kind::f8f6f4.collector::b3::lastuse [%0], [%1], %2, %3, PRED_enable_input_d;\n\t" + "}" + : + : "r"(__d_tmem), + "r"(__a_tmem), + "l"(__b_desc), + "r"(__idesc), + "r"(static_cast<_CUDA_VSTD::uint32_t>(__enable_input_d)) + : "memory"); + } + else _CCCL_IF_CONSTEXPR (__kind == kind_i8) + { + asm volatile( + "{\n\t .reg .pred PRED_enable_input_d; \n\t" + "setp.ne.b32 PRED_enable_input_d, %4, 0;\n\t" + "tcgen05.mma.ws.cta_group::1.kind::i8.collector::b3::lastuse [%0], [%1], %2, %3, PRED_enable_input_d;\n\t" + "}" + : + : "r"(__d_tmem), + "r"(__a_tmem), + "l"(__b_desc), + "r"(__idesc), + "r"(static_cast<_CUDA_VSTD::uint32_t>(__enable_input_d)) + : "memory"); + } +# else + // Unsupported architectures will have a linker error with a semi-decent error message + __cuda_ptx_tcgen05_mma_ws_tmem_a_collector_b3_lastuse_is_not_supported_before_SM_100a_SM_101a__(); +# endif +} +#endif // __cccl_ptx_isa >= 860 + +/* +// tcgen05.mma.ws.cta_group.kind.collector::b3::discard [d_tmem], a_desc, b_desc, idesc, enable_input_d, +zero_column_mask_desc; // PTX ISA 86, SM_100a, SM_101a +// .cta_group = { .cta_group::1 } +// .kind = { .kind::f16, .kind::tf32, .kind::f8f6f4, .kind::i8 } +template +__device__ static inline void tcgen05_mma_ws_collector_b3_discard( + cuda::ptx::cta_group_1_t, + cuda::ptx::kind_t kind, + uint32_t d_tmem, + uint64_t a_desc, + uint64_t b_desc, + uint32_t idesc, + bool enable_input_d, + uint64_t zero_column_mask_desc); +*/ +#if __cccl_ptx_isa >= 860 +extern "C" _CCCL_DEVICE void __cuda_ptx_tcgen05_mma_ws_collector_b3_discard_is_not_supported_before_SM_100a_SM_101a__(); +template +_CCCL_DEVICE static inline void tcgen05_mma_ws_collector_b3_discard( + cta_group_1_t, + kind_t<_Kind> __kind, + _CUDA_VSTD::uint32_t __d_tmem, + _CUDA_VSTD::uint64_t __a_desc, + _CUDA_VSTD::uint64_t __b_desc, + _CUDA_VSTD::uint32_t __idesc, + bool __enable_input_d, + _CUDA_VSTD::uint64_t __zero_column_mask_desc) +{ + // __cta_group == cta_group_1 (due to parameter type constraint) + static_assert(__kind == kind_f16 || __kind == kind_tf32 || __kind == kind_f8f6f4 || __kind == kind_i8, ""); +# if _CCCL_CUDA_COMPILER(NVHPC) || __CUDA_ARCH_FEAT_SM100_ALL || __CUDA_ARCH_FEAT_SM101_ALL + _CCCL_IF_CONSTEXPR (__kind == kind_f16) + { + asm volatile( + "{\n\t .reg .pred PRED_enable_input_d; \n\t" + "setp.ne.b32 PRED_enable_input_d, %4, 0;\n\t" + 
"tcgen05.mma.ws.cta_group::1.kind::f16.collector::b3::discard [%0], %1, %2, %3, PRED_enable_input_d, %5;\n\t" + "}" + : + : "r"(__d_tmem), + "l"(__a_desc), + "l"(__b_desc), + "r"(__idesc), + "r"(static_cast<_CUDA_VSTD::uint32_t>(__enable_input_d)), + "l"(__zero_column_mask_desc) + : "memory"); + } + else _CCCL_IF_CONSTEXPR (__kind == kind_tf32) + { + asm volatile( + "{\n\t .reg .pred PRED_enable_input_d; \n\t" + "setp.ne.b32 PRED_enable_input_d, %4, 0;\n\t" + "tcgen05.mma.ws.cta_group::1.kind::tf32.collector::b3::discard [%0], %1, %2, %3, PRED_enable_input_d, %5;\n\t" + "}" + : + : "r"(__d_tmem), + "l"(__a_desc), + "l"(__b_desc), + "r"(__idesc), + "r"(static_cast<_CUDA_VSTD::uint32_t>(__enable_input_d)), + "l"(__zero_column_mask_desc) + : "memory"); + } + else _CCCL_IF_CONSTEXPR (__kind == kind_f8f6f4) + { + asm volatile( + "{\n\t .reg .pred PRED_enable_input_d; \n\t" + "setp.ne.b32 PRED_enable_input_d, %4, 0;\n\t" + "tcgen05.mma.ws.cta_group::1.kind::f8f6f4.collector::b3::discard [%0], %1, %2, %3, PRED_enable_input_d, %5;\n\t" + "}" + : + : "r"(__d_tmem), + "l"(__a_desc), + "l"(__b_desc), + "r"(__idesc), + "r"(static_cast<_CUDA_VSTD::uint32_t>(__enable_input_d)), + "l"(__zero_column_mask_desc) + : "memory"); + } + else _CCCL_IF_CONSTEXPR (__kind == kind_i8) + { + asm volatile( + "{\n\t .reg .pred PRED_enable_input_d; \n\t" + "setp.ne.b32 PRED_enable_input_d, %4, 0;\n\t" + "tcgen05.mma.ws.cta_group::1.kind::i8.collector::b3::discard [%0], %1, %2, %3, PRED_enable_input_d, %5;\n\t" + "}" + : + : "r"(__d_tmem), + "l"(__a_desc), + "l"(__b_desc), + "r"(__idesc), + "r"(static_cast<_CUDA_VSTD::uint32_t>(__enable_input_d)), + "l"(__zero_column_mask_desc) + : "memory"); + } +# else + // Unsupported architectures will have a linker error with a semi-decent error message + __cuda_ptx_tcgen05_mma_ws_collector_b3_discard_is_not_supported_before_SM_100a_SM_101a__(); +# endif +} +#endif // __cccl_ptx_isa >= 860 + +/* +// tcgen05.mma.ws.cta_group.kind.collector::b3::discard [d_tmem], a_desc, b_desc, idesc, enable_input_d; // PTX ISA 86, +SM_100a, SM_101a +// .cta_group = { .cta_group::1 } +// .kind = { .kind::f16, .kind::tf32, .kind::f8f6f4, .kind::i8 } +template +__device__ static inline void tcgen05_mma_ws_collector_b3_discard( + cuda::ptx::cta_group_1_t, + cuda::ptx::kind_t kind, + uint32_t d_tmem, + uint64_t a_desc, + uint64_t b_desc, + uint32_t idesc, + bool enable_input_d); +*/ +#if __cccl_ptx_isa >= 860 +extern "C" _CCCL_DEVICE void __cuda_ptx_tcgen05_mma_ws_collector_b3_discard_is_not_supported_before_SM_100a_SM_101a__(); +template +_CCCL_DEVICE static inline void tcgen05_mma_ws_collector_b3_discard( + cta_group_1_t, + kind_t<_Kind> __kind, + _CUDA_VSTD::uint32_t __d_tmem, + _CUDA_VSTD::uint64_t __a_desc, + _CUDA_VSTD::uint64_t __b_desc, + _CUDA_VSTD::uint32_t __idesc, + bool __enable_input_d) +{ + // __cta_group == cta_group_1 (due to parameter type constraint) + static_assert(__kind == kind_f16 || __kind == kind_tf32 || __kind == kind_f8f6f4 || __kind == kind_i8, ""); +# if _CCCL_CUDA_COMPILER(NVHPC) || __CUDA_ARCH_FEAT_SM100_ALL || __CUDA_ARCH_FEAT_SM101_ALL + _CCCL_IF_CONSTEXPR (__kind == kind_f16) + { + asm volatile( + "{\n\t .reg .pred PRED_enable_input_d; \n\t" + "setp.ne.b32 PRED_enable_input_d, %4, 0;\n\t" + "tcgen05.mma.ws.cta_group::1.kind::f16.collector::b3::discard [%0], %1, %2, %3, PRED_enable_input_d;\n\t" + "}" + : + : "r"(__d_tmem), + "l"(__a_desc), + "l"(__b_desc), + "r"(__idesc), + "r"(static_cast<_CUDA_VSTD::uint32_t>(__enable_input_d)) + : "memory"); + } + else 
_CCCL_IF_CONSTEXPR (__kind == kind_tf32) + { + asm volatile( + "{\n\t .reg .pred PRED_enable_input_d; \n\t" + "setp.ne.b32 PRED_enable_input_d, %4, 0;\n\t" + "tcgen05.mma.ws.cta_group::1.kind::tf32.collector::b3::discard [%0], %1, %2, %3, PRED_enable_input_d;\n\t" + "}" + : + : "r"(__d_tmem), + "l"(__a_desc), + "l"(__b_desc), + "r"(__idesc), + "r"(static_cast<_CUDA_VSTD::uint32_t>(__enable_input_d)) + : "memory"); + } + else _CCCL_IF_CONSTEXPR (__kind == kind_f8f6f4) + { + asm volatile( + "{\n\t .reg .pred PRED_enable_input_d; \n\t" + "setp.ne.b32 PRED_enable_input_d, %4, 0;\n\t" + "tcgen05.mma.ws.cta_group::1.kind::f8f6f4.collector::b3::discard [%0], %1, %2, %3, PRED_enable_input_d;\n\t" + "}" + : + : "r"(__d_tmem), + "l"(__a_desc), + "l"(__b_desc), + "r"(__idesc), + "r"(static_cast<_CUDA_VSTD::uint32_t>(__enable_input_d)) + : "memory"); + } + else _CCCL_IF_CONSTEXPR (__kind == kind_i8) + { + asm volatile( + "{\n\t .reg .pred PRED_enable_input_d; \n\t" + "setp.ne.b32 PRED_enable_input_d, %4, 0;\n\t" + "tcgen05.mma.ws.cta_group::1.kind::i8.collector::b3::discard [%0], %1, %2, %3, PRED_enable_input_d;\n\t" + "}" + : + : "r"(__d_tmem), + "l"(__a_desc), + "l"(__b_desc), + "r"(__idesc), + "r"(static_cast<_CUDA_VSTD::uint32_t>(__enable_input_d)) + : "memory"); + } +# else + // Unsupported architectures will have a linker error with a semi-decent error message + __cuda_ptx_tcgen05_mma_ws_collector_b3_discard_is_not_supported_before_SM_100a_SM_101a__(); +# endif +} +#endif // __cccl_ptx_isa >= 860 + +/* +// tcgen05.mma.ws.cta_group.kind.collector::b3::discard [d_tmem], [a_tmem], b_desc, idesc, enable_input_d, +zero_column_mask_desc; // PTX ISA 86, SM_100a, SM_101a +// .cta_group = { .cta_group::1 } +// .kind = { .kind::f16, .kind::tf32, .kind::f8f6f4, .kind::i8 } +template +__device__ static inline void tcgen05_mma_ws_tmem_a_collector_b3_discard( + cuda::ptx::cta_group_1_t, + cuda::ptx::kind_t kind, + uint32_t d_tmem, + uint32_t a_tmem, + uint64_t b_desc, + uint32_t idesc, + bool enable_input_d, + uint64_t zero_column_mask_desc); +*/ +#if __cccl_ptx_isa >= 860 +extern "C" _CCCL_DEVICE void +__cuda_ptx_tcgen05_mma_ws_tmem_a_collector_b3_discard_is_not_supported_before_SM_100a_SM_101a__(); +template +_CCCL_DEVICE static inline void tcgen05_mma_ws_tmem_a_collector_b3_discard( + cta_group_1_t, + kind_t<_Kind> __kind, + _CUDA_VSTD::uint32_t __d_tmem, + _CUDA_VSTD::uint32_t __a_tmem, + _CUDA_VSTD::uint64_t __b_desc, + _CUDA_VSTD::uint32_t __idesc, + bool __enable_input_d, + _CUDA_VSTD::uint64_t __zero_column_mask_desc) +{ + // __cta_group == cta_group_1 (due to parameter type constraint) + static_assert(__kind == kind_f16 || __kind == kind_tf32 || __kind == kind_f8f6f4 || __kind == kind_i8, ""); +# if _CCCL_CUDA_COMPILER(NVHPC) || __CUDA_ARCH_FEAT_SM100_ALL || __CUDA_ARCH_FEAT_SM101_ALL + _CCCL_IF_CONSTEXPR (__kind == kind_f16) + { + asm volatile( + "{\n\t .reg .pred PRED_enable_input_d; \n\t" + "setp.ne.b32 PRED_enable_input_d, %4, 0;\n\t" + "tcgen05.mma.ws.cta_group::1.kind::f16.collector::b3::discard [%0], [%1], %2, %3, PRED_enable_input_d, %5;\n\t" + "}" + : + : "r"(__d_tmem), + "r"(__a_tmem), + "l"(__b_desc), + "r"(__idesc), + "r"(static_cast<_CUDA_VSTD::uint32_t>(__enable_input_d)), + "l"(__zero_column_mask_desc) + : "memory"); + } + else _CCCL_IF_CONSTEXPR (__kind == kind_tf32) + { + asm volatile( + "{\n\t .reg .pred PRED_enable_input_d; \n\t" + "setp.ne.b32 PRED_enable_input_d, %4, 0;\n\t" + "tcgen05.mma.ws.cta_group::1.kind::tf32.collector::b3::discard [%0], [%1], %2, %3, PRED_enable_input_d, 
%5;\n\t" + "}" + : + : "r"(__d_tmem), + "r"(__a_tmem), + "l"(__b_desc), + "r"(__idesc), + "r"(static_cast<_CUDA_VSTD::uint32_t>(__enable_input_d)), + "l"(__zero_column_mask_desc) + : "memory"); + } + else _CCCL_IF_CONSTEXPR (__kind == kind_f8f6f4) + { + asm volatile( + "{\n\t .reg .pred PRED_enable_input_d; \n\t" + "setp.ne.b32 PRED_enable_input_d, %4, 0;\n\t" + "tcgen05.mma.ws.cta_group::1.kind::f8f6f4.collector::b3::discard [%0], [%1], %2, %3, PRED_enable_input_d, %5;\n\t" + "}" + : + : "r"(__d_tmem), + "r"(__a_tmem), + "l"(__b_desc), + "r"(__idesc), + "r"(static_cast<_CUDA_VSTD::uint32_t>(__enable_input_d)), + "l"(__zero_column_mask_desc) + : "memory"); + } + else _CCCL_IF_CONSTEXPR (__kind == kind_i8) + { + asm volatile( + "{\n\t .reg .pred PRED_enable_input_d; \n\t" + "setp.ne.b32 PRED_enable_input_d, %4, 0;\n\t" + "tcgen05.mma.ws.cta_group::1.kind::i8.collector::b3::discard [%0], [%1], %2, %3, PRED_enable_input_d, %5;\n\t" + "}" + : + : "r"(__d_tmem), + "r"(__a_tmem), + "l"(__b_desc), + "r"(__idesc), + "r"(static_cast<_CUDA_VSTD::uint32_t>(__enable_input_d)), + "l"(__zero_column_mask_desc) + : "memory"); + } +# else + // Unsupported architectures will have a linker error with a semi-decent error message + __cuda_ptx_tcgen05_mma_ws_tmem_a_collector_b3_discard_is_not_supported_before_SM_100a_SM_101a__(); +# endif +} +#endif // __cccl_ptx_isa >= 860 + +/* +// tcgen05.mma.ws.cta_group.kind.collector::b3::discard [d_tmem], [a_tmem], b_desc, idesc, enable_input_d; // PTX ISA +86, SM_100a, SM_101a +// .cta_group = { .cta_group::1 } +// .kind = { .kind::f16, .kind::tf32, .kind::f8f6f4, .kind::i8 } +template +__device__ static inline void tcgen05_mma_ws_tmem_a_collector_b3_discard( + cuda::ptx::cta_group_1_t, + cuda::ptx::kind_t kind, + uint32_t d_tmem, + uint32_t a_tmem, + uint64_t b_desc, + uint32_t idesc, + bool enable_input_d); +*/ +#if __cccl_ptx_isa >= 860 +extern "C" _CCCL_DEVICE void +__cuda_ptx_tcgen05_mma_ws_tmem_a_collector_b3_discard_is_not_supported_before_SM_100a_SM_101a__(); +template +_CCCL_DEVICE static inline void tcgen05_mma_ws_tmem_a_collector_b3_discard( + cta_group_1_t, + kind_t<_Kind> __kind, + _CUDA_VSTD::uint32_t __d_tmem, + _CUDA_VSTD::uint32_t __a_tmem, + _CUDA_VSTD::uint64_t __b_desc, + _CUDA_VSTD::uint32_t __idesc, + bool __enable_input_d) +{ + // __cta_group == cta_group_1 (due to parameter type constraint) + static_assert(__kind == kind_f16 || __kind == kind_tf32 || __kind == kind_f8f6f4 || __kind == kind_i8, ""); +# if _CCCL_CUDA_COMPILER(NVHPC) || __CUDA_ARCH_FEAT_SM100_ALL || __CUDA_ARCH_FEAT_SM101_ALL + _CCCL_IF_CONSTEXPR (__kind == kind_f16) + { + asm volatile( + "{\n\t .reg .pred PRED_enable_input_d; \n\t" + "setp.ne.b32 PRED_enable_input_d, %4, 0;\n\t" + "tcgen05.mma.ws.cta_group::1.kind::f16.collector::b3::discard [%0], [%1], %2, %3, PRED_enable_input_d;\n\t" + "}" + : + : "r"(__d_tmem), + "r"(__a_tmem), + "l"(__b_desc), + "r"(__idesc), + "r"(static_cast<_CUDA_VSTD::uint32_t>(__enable_input_d)) + : "memory"); + } + else _CCCL_IF_CONSTEXPR (__kind == kind_tf32) + { + asm volatile( + "{\n\t .reg .pred PRED_enable_input_d; \n\t" + "setp.ne.b32 PRED_enable_input_d, %4, 0;\n\t" + "tcgen05.mma.ws.cta_group::1.kind::tf32.collector::b3::discard [%0], [%1], %2, %3, PRED_enable_input_d;\n\t" + "}" + : + : "r"(__d_tmem), + "r"(__a_tmem), + "l"(__b_desc), + "r"(__idesc), + "r"(static_cast<_CUDA_VSTD::uint32_t>(__enable_input_d)) + : "memory"); + } + else _CCCL_IF_CONSTEXPR (__kind == kind_f8f6f4) + { + asm volatile( + "{\n\t .reg .pred PRED_enable_input_d; \n\t" + 
"setp.ne.b32 PRED_enable_input_d, %4, 0;\n\t" + "tcgen05.mma.ws.cta_group::1.kind::f8f6f4.collector::b3::discard [%0], [%1], %2, %3, PRED_enable_input_d;\n\t" + "}" + : + : "r"(__d_tmem), + "r"(__a_tmem), + "l"(__b_desc), + "r"(__idesc), + "r"(static_cast<_CUDA_VSTD::uint32_t>(__enable_input_d)) + : "memory"); + } + else _CCCL_IF_CONSTEXPR (__kind == kind_i8) + { + asm volatile( + "{\n\t .reg .pred PRED_enable_input_d; \n\t" + "setp.ne.b32 PRED_enable_input_d, %4, 0;\n\t" + "tcgen05.mma.ws.cta_group::1.kind::i8.collector::b3::discard [%0], [%1], %2, %3, PRED_enable_input_d;\n\t" + "}" + : + : "r"(__d_tmem), + "r"(__a_tmem), + "l"(__b_desc), + "r"(__idesc), + "r"(static_cast<_CUDA_VSTD::uint32_t>(__enable_input_d)) + : "memory"); + } +# else + // Unsupported architectures will have a linker error with a semi-decent error message + __cuda_ptx_tcgen05_mma_ws_tmem_a_collector_b3_discard_is_not_supported_before_SM_100a_SM_101a__(); +# endif +} +#endif // __cccl_ptx_isa >= 860 + +#endif // _CUDA_PTX_GENERATED_TCGEN05_MMA_WS_H_ diff --git a/libcudacxx/include/cuda/__ptx/instructions/generated/tcgen05_shift.h b/libcudacxx/include/cuda/__ptx/instructions/generated/tcgen05_shift.h new file mode 100644 index 00000000000..0c28ba5d888 --- /dev/null +++ b/libcudacxx/include/cuda/__ptx/instructions/generated/tcgen05_shift.h @@ -0,0 +1,36 @@ +// This file was automatically generated. Do not edit. + +#ifndef _CUDA_PTX_GENERATED_TCGEN05_SHIFT_H_ +#define _CUDA_PTX_GENERATED_TCGEN05_SHIFT_H_ + +/* +// tcgen05.shift.cta_group.down [taddr]; // PTX ISA 86, SM_100a, SM_101a +// .cta_group = { .cta_group::1, .cta_group::2 } +template +__device__ static inline void tcgen05_shift_down( + cuda::ptx::cta_group_t cta_group, + uint32_t taddr); +*/ +#if __cccl_ptx_isa >= 860 +extern "C" _CCCL_DEVICE void __cuda_ptx_tcgen05_shift_down_is_not_supported_before_SM_100a_SM_101a__(); +template +_CCCL_DEVICE static inline void tcgen05_shift_down(cta_group_t<_Cta_Group> __cta_group, _CUDA_VSTD::uint32_t __taddr) +{ + static_assert(__cta_group == cta_group_1 || __cta_group == cta_group_2, ""); +# if _CCCL_CUDA_COMPILER(NVHPC) || __CUDA_ARCH_FEAT_SM100_ALL || __CUDA_ARCH_FEAT_SM101_ALL + _CCCL_IF_CONSTEXPR (__cta_group == cta_group_1) + { + asm volatile("tcgen05.shift.cta_group::1.down [%0];" : : "r"(__taddr) : "memory"); + } + else _CCCL_IF_CONSTEXPR (__cta_group == cta_group_2) + { + asm volatile("tcgen05.shift.cta_group::2.down [%0];" : : "r"(__taddr) : "memory"); + } +# else + // Unsupported architectures will have a linker error with a semi-decent error message + __cuda_ptx_tcgen05_shift_down_is_not_supported_before_SM_100a_SM_101a__(); +# endif +} +#endif // __cccl_ptx_isa >= 860 + +#endif // _CUDA_PTX_GENERATED_TCGEN05_SHIFT_H_ diff --git a/libcudacxx/include/cuda/__ptx/instructions/generated/tcgen05_st.h b/libcudacxx/include/cuda/__ptx/instructions/generated/tcgen05_st.h new file mode 100644 index 00000000000..83e9d13810e --- /dev/null +++ b/libcudacxx/include/cuda/__ptx/instructions/generated/tcgen05_st.h @@ -0,0 +1,4554 @@ +// This file was automatically generated. Do not edit. 
+ +#ifndef _CUDA_PTX_GENERATED_TCGEN05_ST_H_ +#define _CUDA_PTX_GENERATED_TCGEN05_ST_H_ + +/* +// tcgen05.st.sync.aligned.16x64b.x1.b32 [taddr], values; // PTX ISA 86, SM_100a, SM_101a +template = true> +__device__ static inline void tcgen05_st_16x64b( + uint32_t taddr, + const B32 (&values)[1]); +*/ +#if __cccl_ptx_isa >= 860 +extern "C" _CCCL_DEVICE void __cuda_ptx_tcgen05_st_16x64b_is_not_supported_before_SM_100a_SM_101a__(); +template = true> +_CCCL_DEVICE static inline void tcgen05_st_16x64b(_CUDA_VSTD::uint32_t __taddr, const _B32 (&__values)[1]) +{ + static_assert(sizeof(_B32) == 4, ""); +# if _CCCL_CUDA_COMPILER(NVHPC) || __CUDA_ARCH_FEAT_SM100_ALL || __CUDA_ARCH_FEAT_SM101_ALL + asm("tcgen05.st.sync.aligned.16x64b.x1.b32 [%0], {%1};" : : "r"(__taddr), "r"(__as_b32(__values[0])) : "memory"); +# else + // Unsupported architectures will have a linker error with a semi-decent error message + __cuda_ptx_tcgen05_st_16x64b_is_not_supported_before_SM_100a_SM_101a__(); +# endif +} +#endif // __cccl_ptx_isa >= 860 + +/* +// tcgen05.st.sync.aligned.16x64b.x1.unpack::16b.b32 [taddr], values; // PTX ISA 86, SM_100a, SM_101a +template = true> +__device__ static inline void tcgen05_st_16x64b_unpack_16b( + uint32_t taddr, + const B32 (&values)[1]); +*/ +#if __cccl_ptx_isa >= 860 +extern "C" _CCCL_DEVICE void __cuda_ptx_tcgen05_st_16x64b_unpack_16b_is_not_supported_before_SM_100a_SM_101a__(); +template = true> +_CCCL_DEVICE static inline void tcgen05_st_16x64b_unpack_16b(_CUDA_VSTD::uint32_t __taddr, const _B32 (&__values)[1]) +{ + static_assert(sizeof(_B32) == 4, ""); +# if _CCCL_CUDA_COMPILER(NVHPC) || __CUDA_ARCH_FEAT_SM100_ALL || __CUDA_ARCH_FEAT_SM101_ALL + asm("tcgen05.st.sync.aligned.16x64b.x1.unpack::16b.b32 [%0], {%1};" + : + : "r"(__taddr), "r"(__as_b32(__values[0])) + : "memory"); +# else + // Unsupported architectures will have a linker error with a semi-decent error message + __cuda_ptx_tcgen05_st_16x64b_unpack_16b_is_not_supported_before_SM_100a_SM_101a__(); +# endif +} +#endif // __cccl_ptx_isa >= 860 + +/* +// tcgen05.st.sync.aligned.16x64b.x2.b32 [taddr], values; // PTX ISA 86, SM_100a, SM_101a +template = true> +__device__ static inline void tcgen05_st_16x64b( + uint32_t taddr, + const B32 (&values)[2]); +*/ +#if __cccl_ptx_isa >= 860 +extern "C" _CCCL_DEVICE void __cuda_ptx_tcgen05_st_16x64b_is_not_supported_before_SM_100a_SM_101a__(); +template = true> +_CCCL_DEVICE static inline void tcgen05_st_16x64b(_CUDA_VSTD::uint32_t __taddr, const _B32 (&__values)[2]) +{ + static_assert(sizeof(_B32) == 4, ""); +# if _CCCL_CUDA_COMPILER(NVHPC) || __CUDA_ARCH_FEAT_SM100_ALL || __CUDA_ARCH_FEAT_SM101_ALL + asm("tcgen05.st.sync.aligned.16x64b.x2.b32 [%0], {%1, %2};" + : + : "r"(__taddr), "r"(__as_b32(__values[0])), "r"(__as_b32(__values[1])) + : "memory"); +# else + // Unsupported architectures will have a linker error with a semi-decent error message + __cuda_ptx_tcgen05_st_16x64b_is_not_supported_before_SM_100a_SM_101a__(); +# endif +} +#endif // __cccl_ptx_isa >= 860 + +/* +// tcgen05.st.sync.aligned.16x64b.x2.unpack::16b.b32 [taddr], values; // PTX ISA 86, SM_100a, SM_101a +template = true> +__device__ static inline void tcgen05_st_16x64b_unpack_16b( + uint32_t taddr, + const B32 (&values)[2]); +*/ +#if __cccl_ptx_isa >= 860 +extern "C" _CCCL_DEVICE void __cuda_ptx_tcgen05_st_16x64b_unpack_16b_is_not_supported_before_SM_100a_SM_101a__(); +template = true> +_CCCL_DEVICE static inline void tcgen05_st_16x64b_unpack_16b(_CUDA_VSTD::uint32_t __taddr, const _B32 (&__values)[2]) +{ + 
static_assert(sizeof(_B32) == 4, ""); +# if _CCCL_CUDA_COMPILER(NVHPC) || __CUDA_ARCH_FEAT_SM100_ALL || __CUDA_ARCH_FEAT_SM101_ALL + asm("tcgen05.st.sync.aligned.16x64b.x2.unpack::16b.b32 [%0], {%1, %2};" + : + : "r"(__taddr), "r"(__as_b32(__values[0])), "r"(__as_b32(__values[1])) + : "memory"); +# else + // Unsupported architectures will have a linker error with a semi-decent error message + __cuda_ptx_tcgen05_st_16x64b_unpack_16b_is_not_supported_before_SM_100a_SM_101a__(); +# endif +} +#endif // __cccl_ptx_isa >= 860 + +/* +// tcgen05.st.sync.aligned.16x64b.x4.b32 [taddr], values; // PTX ISA 86, SM_100a, SM_101a +template = true> +__device__ static inline void tcgen05_st_16x64b( + uint32_t taddr, + const B32 (&values)[4]); +*/ +#if __cccl_ptx_isa >= 860 +extern "C" _CCCL_DEVICE void __cuda_ptx_tcgen05_st_16x64b_is_not_supported_before_SM_100a_SM_101a__(); +template = true> +_CCCL_DEVICE static inline void tcgen05_st_16x64b(_CUDA_VSTD::uint32_t __taddr, const _B32 (&__values)[4]) +{ + static_assert(sizeof(_B32) == 4, ""); +# if _CCCL_CUDA_COMPILER(NVHPC) || __CUDA_ARCH_FEAT_SM100_ALL || __CUDA_ARCH_FEAT_SM101_ALL + asm("tcgen05.st.sync.aligned.16x64b.x4.b32 [%0], {%1, %2, %3, %4};" + : + : "r"(__taddr), + "r"(__as_b32(__values[0])), + "r"(__as_b32(__values[1])), + "r"(__as_b32(__values[2])), + "r"(__as_b32(__values[3])) + : "memory"); +# else + // Unsupported architectures will have a linker error with a semi-decent error message + __cuda_ptx_tcgen05_st_16x64b_is_not_supported_before_SM_100a_SM_101a__(); +# endif +} +#endif // __cccl_ptx_isa >= 860 + +/* +// tcgen05.st.sync.aligned.16x64b.x4.unpack::16b.b32 [taddr], values; // PTX ISA 86, SM_100a, SM_101a +template = true> +__device__ static inline void tcgen05_st_16x64b_unpack_16b( + uint32_t taddr, + const B32 (&values)[4]); +*/ +#if __cccl_ptx_isa >= 860 +extern "C" _CCCL_DEVICE void __cuda_ptx_tcgen05_st_16x64b_unpack_16b_is_not_supported_before_SM_100a_SM_101a__(); +template = true> +_CCCL_DEVICE static inline void tcgen05_st_16x64b_unpack_16b(_CUDA_VSTD::uint32_t __taddr, const _B32 (&__values)[4]) +{ + static_assert(sizeof(_B32) == 4, ""); +# if _CCCL_CUDA_COMPILER(NVHPC) || __CUDA_ARCH_FEAT_SM100_ALL || __CUDA_ARCH_FEAT_SM101_ALL + asm("tcgen05.st.sync.aligned.16x64b.x4.unpack::16b.b32 [%0], {%1, %2, %3, %4};" + : + : "r"(__taddr), + "r"(__as_b32(__values[0])), + "r"(__as_b32(__values[1])), + "r"(__as_b32(__values[2])), + "r"(__as_b32(__values[3])) + : "memory"); +# else + // Unsupported architectures will have a linker error with a semi-decent error message + __cuda_ptx_tcgen05_st_16x64b_unpack_16b_is_not_supported_before_SM_100a_SM_101a__(); +# endif +} +#endif // __cccl_ptx_isa >= 860 + +/* +// tcgen05.st.sync.aligned.16x64b.x8.b32 [taddr], values; // PTX ISA 86, SM_100a, SM_101a +template = true> +__device__ static inline void tcgen05_st_16x64b( + uint32_t taddr, + const B32 (&values)[8]); +*/ +#if __cccl_ptx_isa >= 860 +extern "C" _CCCL_DEVICE void __cuda_ptx_tcgen05_st_16x64b_is_not_supported_before_SM_100a_SM_101a__(); +template = true> +_CCCL_DEVICE static inline void tcgen05_st_16x64b(_CUDA_VSTD::uint32_t __taddr, const _B32 (&__values)[8]) +{ + static_assert(sizeof(_B32) == 4, ""); +# if _CCCL_CUDA_COMPILER(NVHPC) || __CUDA_ARCH_FEAT_SM100_ALL || __CUDA_ARCH_FEAT_SM101_ALL + asm("tcgen05.st.sync.aligned.16x64b.x8.b32 [%0], {%1, %2, %3, %4, %5, %6, %7, %8};" + : + : "r"(__taddr), + "r"(__as_b32(__values[0])), + "r"(__as_b32(__values[1])), + "r"(__as_b32(__values[2])), + "r"(__as_b32(__values[3])), + 
"r"(__as_b32(__values[4])), + "r"(__as_b32(__values[5])), + "r"(__as_b32(__values[6])), + "r"(__as_b32(__values[7])) + : "memory"); +# else + // Unsupported architectures will have a linker error with a semi-decent error message + __cuda_ptx_tcgen05_st_16x64b_is_not_supported_before_SM_100a_SM_101a__(); +# endif +} +#endif // __cccl_ptx_isa >= 860 + +/* +// tcgen05.st.sync.aligned.16x64b.x8.unpack::16b.b32 [taddr], values; // PTX ISA 86, SM_100a, SM_101a +template = true> +__device__ static inline void tcgen05_st_16x64b_unpack_16b( + uint32_t taddr, + const B32 (&values)[8]); +*/ +#if __cccl_ptx_isa >= 860 +extern "C" _CCCL_DEVICE void __cuda_ptx_tcgen05_st_16x64b_unpack_16b_is_not_supported_before_SM_100a_SM_101a__(); +template = true> +_CCCL_DEVICE static inline void tcgen05_st_16x64b_unpack_16b(_CUDA_VSTD::uint32_t __taddr, const _B32 (&__values)[8]) +{ + static_assert(sizeof(_B32) == 4, ""); +# if _CCCL_CUDA_COMPILER(NVHPC) || __CUDA_ARCH_FEAT_SM100_ALL || __CUDA_ARCH_FEAT_SM101_ALL + asm("tcgen05.st.sync.aligned.16x64b.x8.unpack::16b.b32 [%0], {%1, %2, %3, %4, %5, %6, %7, %8};" + : + : "r"(__taddr), + "r"(__as_b32(__values[0])), + "r"(__as_b32(__values[1])), + "r"(__as_b32(__values[2])), + "r"(__as_b32(__values[3])), + "r"(__as_b32(__values[4])), + "r"(__as_b32(__values[5])), + "r"(__as_b32(__values[6])), + "r"(__as_b32(__values[7])) + : "memory"); +# else + // Unsupported architectures will have a linker error with a semi-decent error message + __cuda_ptx_tcgen05_st_16x64b_unpack_16b_is_not_supported_before_SM_100a_SM_101a__(); +# endif +} +#endif // __cccl_ptx_isa >= 860 + +/* +// tcgen05.st.sync.aligned.16x64b.x16.b32 [taddr], values; // PTX ISA 86, SM_100a, SM_101a +template = true> +__device__ static inline void tcgen05_st_16x64b( + uint32_t taddr, + const B32 (&values)[16]); +*/ +#if __cccl_ptx_isa >= 860 +extern "C" _CCCL_DEVICE void __cuda_ptx_tcgen05_st_16x64b_is_not_supported_before_SM_100a_SM_101a__(); +template = true> +_CCCL_DEVICE static inline void tcgen05_st_16x64b(_CUDA_VSTD::uint32_t __taddr, const _B32 (&__values)[16]) +{ + static_assert(sizeof(_B32) == 4, ""); +# if _CCCL_CUDA_COMPILER(NVHPC) || __CUDA_ARCH_FEAT_SM100_ALL || __CUDA_ARCH_FEAT_SM101_ALL + asm("tcgen05.st.sync.aligned.16x64b.x16.b32 [%0], {%1, %2, %3, %4, %5, %6, %7, %8, %9, %10, %11, %12, %13, %14, %15, " + "%16};" + : + : "r"(__taddr), + "r"(__as_b32(__values[0])), + "r"(__as_b32(__values[1])), + "r"(__as_b32(__values[2])), + "r"(__as_b32(__values[3])), + "r"(__as_b32(__values[4])), + "r"(__as_b32(__values[5])), + "r"(__as_b32(__values[6])), + "r"(__as_b32(__values[7])), + "r"(__as_b32(__values[8])), + "r"(__as_b32(__values[9])), + "r"(__as_b32(__values[10])), + "r"(__as_b32(__values[11])), + "r"(__as_b32(__values[12])), + "r"(__as_b32(__values[13])), + "r"(__as_b32(__values[14])), + "r"(__as_b32(__values[15])) + : "memory"); +# else + // Unsupported architectures will have a linker error with a semi-decent error message + __cuda_ptx_tcgen05_st_16x64b_is_not_supported_before_SM_100a_SM_101a__(); +# endif +} +#endif // __cccl_ptx_isa >= 860 + +/* +// tcgen05.st.sync.aligned.16x64b.x16.unpack::16b.b32 [taddr], values; // PTX ISA 86, SM_100a, SM_101a +template = true> +__device__ static inline void tcgen05_st_16x64b_unpack_16b( + uint32_t taddr, + const B32 (&values)[16]); +*/ +#if __cccl_ptx_isa >= 860 +extern "C" _CCCL_DEVICE void __cuda_ptx_tcgen05_st_16x64b_unpack_16b_is_not_supported_before_SM_100a_SM_101a__(); +template = true> +_CCCL_DEVICE static inline void 
tcgen05_st_16x64b_unpack_16b(_CUDA_VSTD::uint32_t __taddr, const _B32 (&__values)[16]) +{ + static_assert(sizeof(_B32) == 4, ""); +# if _CCCL_CUDA_COMPILER(NVHPC) || __CUDA_ARCH_FEAT_SM100_ALL || __CUDA_ARCH_FEAT_SM101_ALL + asm("tcgen05.st.sync.aligned.16x64b.x16.unpack::16b.b32 [%0], {%1, %2, %3, %4, %5, %6, %7, %8, %9, %10, %11, %12, " + "%13, %14, %15, %16};" + : + : "r"(__taddr), + "r"(__as_b32(__values[0])), + "r"(__as_b32(__values[1])), + "r"(__as_b32(__values[2])), + "r"(__as_b32(__values[3])), + "r"(__as_b32(__values[4])), + "r"(__as_b32(__values[5])), + "r"(__as_b32(__values[6])), + "r"(__as_b32(__values[7])), + "r"(__as_b32(__values[8])), + "r"(__as_b32(__values[9])), + "r"(__as_b32(__values[10])), + "r"(__as_b32(__values[11])), + "r"(__as_b32(__values[12])), + "r"(__as_b32(__values[13])), + "r"(__as_b32(__values[14])), + "r"(__as_b32(__values[15])) + : "memory"); +# else + // Unsupported architectures will have a linker error with a semi-decent error message + __cuda_ptx_tcgen05_st_16x64b_unpack_16b_is_not_supported_before_SM_100a_SM_101a__(); +# endif +} +#endif // __cccl_ptx_isa >= 860 + +/* +// tcgen05.st.sync.aligned.16x64b.x32.b32 [taddr], values; // PTX ISA 86, SM_100a, SM_101a +template = true> +__device__ static inline void tcgen05_st_16x64b( + uint32_t taddr, + const B32 (&values)[32]); +*/ +#if __cccl_ptx_isa >= 860 +extern "C" _CCCL_DEVICE void __cuda_ptx_tcgen05_st_16x64b_is_not_supported_before_SM_100a_SM_101a__(); +template = true> +_CCCL_DEVICE static inline void tcgen05_st_16x64b(_CUDA_VSTD::uint32_t __taddr, const _B32 (&__values)[32]) +{ + static_assert(sizeof(_B32) == 4, ""); +# if _CCCL_CUDA_COMPILER(NVHPC) || __CUDA_ARCH_FEAT_SM100_ALL || __CUDA_ARCH_FEAT_SM101_ALL + asm( + "tcgen05.st.sync.aligned.16x64b.x32.b32 [%0], {%1, %2, %3, %4, %5, %6, %7, %8, %9, %10, %11, %12, %13, %14, %15, " + "%16, %17, %18, %19, %20, %21, %22, %23, %24, %25, %26, %27, %28, %29, %30, %31, %32};" + : + : "r"(__taddr), + "r"(__as_b32(__values[0])), + "r"(__as_b32(__values[1])), + "r"(__as_b32(__values[2])), + "r"(__as_b32(__values[3])), + "r"(__as_b32(__values[4])), + "r"(__as_b32(__values[5])), + "r"(__as_b32(__values[6])), + "r"(__as_b32(__values[7])), + "r"(__as_b32(__values[8])), + "r"(__as_b32(__values[9])), + "r"(__as_b32(__values[10])), + "r"(__as_b32(__values[11])), + "r"(__as_b32(__values[12])), + "r"(__as_b32(__values[13])), + "r"(__as_b32(__values[14])), + "r"(__as_b32(__values[15])), + "r"(__as_b32(__values[16])), + "r"(__as_b32(__values[17])), + "r"(__as_b32(__values[18])), + "r"(__as_b32(__values[19])), + "r"(__as_b32(__values[20])), + "r"(__as_b32(__values[21])), + "r"(__as_b32(__values[22])), + "r"(__as_b32(__values[23])), + "r"(__as_b32(__values[24])), + "r"(__as_b32(__values[25])), + "r"(__as_b32(__values[26])), + "r"(__as_b32(__values[27])), + "r"(__as_b32(__values[28])), + "r"(__as_b32(__values[29])), + "r"(__as_b32(__values[30])), + "r"(__as_b32(__values[31])) + : "memory"); +# else + // Unsupported architectures will have a linker error with a semi-decent error message + __cuda_ptx_tcgen05_st_16x64b_is_not_supported_before_SM_100a_SM_101a__(); +# endif +} +#endif // __cccl_ptx_isa >= 860 + +/* +// tcgen05.st.sync.aligned.16x64b.x32.unpack::16b.b32 [taddr], values; // PTX ISA 86, SM_100a, SM_101a +template = true> +__device__ static inline void tcgen05_st_16x64b_unpack_16b( + uint32_t taddr, + const B32 (&values)[32]); +*/ +#if __cccl_ptx_isa >= 860 +extern "C" _CCCL_DEVICE void 
__cuda_ptx_tcgen05_st_16x64b_unpack_16b_is_not_supported_before_SM_100a_SM_101a__(); +template = true> +_CCCL_DEVICE static inline void tcgen05_st_16x64b_unpack_16b(_CUDA_VSTD::uint32_t __taddr, const _B32 (&__values)[32]) +{ + static_assert(sizeof(_B32) == 4, ""); +# if _CCCL_CUDA_COMPILER(NVHPC) || __CUDA_ARCH_FEAT_SM100_ALL || __CUDA_ARCH_FEAT_SM101_ALL + asm( + "tcgen05.st.sync.aligned.16x64b.x32.unpack::16b.b32 [%0], {%1, %2, %3, %4, %5, %6, %7, %8, %9, %10, %11, %12, %13, " + "%14, %15, %16, %17, %18, %19, %20, %21, %22, %23, %24, %25, %26, %27, %28, %29, %30, %31, %32};" + : + : "r"(__taddr), + "r"(__as_b32(__values[0])), + "r"(__as_b32(__values[1])), + "r"(__as_b32(__values[2])), + "r"(__as_b32(__values[3])), + "r"(__as_b32(__values[4])), + "r"(__as_b32(__values[5])), + "r"(__as_b32(__values[6])), + "r"(__as_b32(__values[7])), + "r"(__as_b32(__values[8])), + "r"(__as_b32(__values[9])), + "r"(__as_b32(__values[10])), + "r"(__as_b32(__values[11])), + "r"(__as_b32(__values[12])), + "r"(__as_b32(__values[13])), + "r"(__as_b32(__values[14])), + "r"(__as_b32(__values[15])), + "r"(__as_b32(__values[16])), + "r"(__as_b32(__values[17])), + "r"(__as_b32(__values[18])), + "r"(__as_b32(__values[19])), + "r"(__as_b32(__values[20])), + "r"(__as_b32(__values[21])), + "r"(__as_b32(__values[22])), + "r"(__as_b32(__values[23])), + "r"(__as_b32(__values[24])), + "r"(__as_b32(__values[25])), + "r"(__as_b32(__values[26])), + "r"(__as_b32(__values[27])), + "r"(__as_b32(__values[28])), + "r"(__as_b32(__values[29])), + "r"(__as_b32(__values[30])), + "r"(__as_b32(__values[31])) + : "memory"); +# else + // Unsupported architectures will have a linker error with a semi-decent error message + __cuda_ptx_tcgen05_st_16x64b_unpack_16b_is_not_supported_before_SM_100a_SM_101a__(); +# endif +} +#endif // __cccl_ptx_isa >= 860 + +/* +// tcgen05.st.sync.aligned.16x64b.x64.b32 [taddr], values; // PTX ISA 86, SM_100a, SM_101a +template = true> +__device__ static inline void tcgen05_st_16x64b( + uint32_t taddr, + const B32 (&values)[64]); +*/ +#if __cccl_ptx_isa >= 860 +extern "C" _CCCL_DEVICE void __cuda_ptx_tcgen05_st_16x64b_is_not_supported_before_SM_100a_SM_101a__(); +template = true> +_CCCL_DEVICE static inline void tcgen05_st_16x64b(_CUDA_VSTD::uint32_t __taddr, const _B32 (&__values)[64]) +{ + static_assert(sizeof(_B32) == 4, ""); +# if _CCCL_CUDA_COMPILER(NVHPC) || __CUDA_ARCH_FEAT_SM100_ALL || __CUDA_ARCH_FEAT_SM101_ALL + asm( + "tcgen05.st.sync.aligned.16x64b.x64.b32 [%0], {%1, %2, %3, %4, %5, %6, %7, %8, %9, %10, %11, %12, %13, %14, %15, " + "%16, %17, %18, %19, %20, %21, %22, %23, %24, %25, %26, %27, %28, %29, %30, %31, %32, %33, %34, %35, %36, %37, " + "%38, %39, %40, %41, %42, %43, %44, %45, %46, %47, %48, %49, %50, %51, %52, %53, %54, %55, %56, %57, %58, %59, " + "%60, %61, %62, %63, %64};" + : + : "r"(__taddr), + "r"(__as_b32(__values[0])), + "r"(__as_b32(__values[1])), + "r"(__as_b32(__values[2])), + "r"(__as_b32(__values[3])), + "r"(__as_b32(__values[4])), + "r"(__as_b32(__values[5])), + "r"(__as_b32(__values[6])), + "r"(__as_b32(__values[7])), + "r"(__as_b32(__values[8])), + "r"(__as_b32(__values[9])), + "r"(__as_b32(__values[10])), + "r"(__as_b32(__values[11])), + "r"(__as_b32(__values[12])), + "r"(__as_b32(__values[13])), + "r"(__as_b32(__values[14])), + "r"(__as_b32(__values[15])), + "r"(__as_b32(__values[16])), + "r"(__as_b32(__values[17])), + "r"(__as_b32(__values[18])), + "r"(__as_b32(__values[19])), + "r"(__as_b32(__values[20])), + "r"(__as_b32(__values[21])), + "r"(__as_b32(__values[22])), + 
"r"(__as_b32(__values[23])), + "r"(__as_b32(__values[24])), + "r"(__as_b32(__values[25])), + "r"(__as_b32(__values[26])), + "r"(__as_b32(__values[27])), + "r"(__as_b32(__values[28])), + "r"(__as_b32(__values[29])), + "r"(__as_b32(__values[30])), + "r"(__as_b32(__values[31])), + "r"(__as_b32(__values[32])), + "r"(__as_b32(__values[33])), + "r"(__as_b32(__values[34])), + "r"(__as_b32(__values[35])), + "r"(__as_b32(__values[36])), + "r"(__as_b32(__values[37])), + "r"(__as_b32(__values[38])), + "r"(__as_b32(__values[39])), + "r"(__as_b32(__values[40])), + "r"(__as_b32(__values[41])), + "r"(__as_b32(__values[42])), + "r"(__as_b32(__values[43])), + "r"(__as_b32(__values[44])), + "r"(__as_b32(__values[45])), + "r"(__as_b32(__values[46])), + "r"(__as_b32(__values[47])), + "r"(__as_b32(__values[48])), + "r"(__as_b32(__values[49])), + "r"(__as_b32(__values[50])), + "r"(__as_b32(__values[51])), + "r"(__as_b32(__values[52])), + "r"(__as_b32(__values[53])), + "r"(__as_b32(__values[54])), + "r"(__as_b32(__values[55])), + "r"(__as_b32(__values[56])), + "r"(__as_b32(__values[57])), + "r"(__as_b32(__values[58])), + "r"(__as_b32(__values[59])), + "r"(__as_b32(__values[60])), + "r"(__as_b32(__values[61])), + "r"(__as_b32(__values[62])), + "r"(__as_b32(__values[63])) + : "memory"); +# else + // Unsupported architectures will have a linker error with a semi-decent error message + __cuda_ptx_tcgen05_st_16x64b_is_not_supported_before_SM_100a_SM_101a__(); +# endif +} +#endif // __cccl_ptx_isa >= 860 + +/* +// tcgen05.st.sync.aligned.16x64b.x64.unpack::16b.b32 [taddr], values; // PTX ISA 86, SM_100a, SM_101a +template = true> +__device__ static inline void tcgen05_st_16x64b_unpack_16b( + uint32_t taddr, + const B32 (&values)[64]); +*/ +#if __cccl_ptx_isa >= 860 +extern "C" _CCCL_DEVICE void __cuda_ptx_tcgen05_st_16x64b_unpack_16b_is_not_supported_before_SM_100a_SM_101a__(); +template = true> +_CCCL_DEVICE static inline void tcgen05_st_16x64b_unpack_16b(_CUDA_VSTD::uint32_t __taddr, const _B32 (&__values)[64]) +{ + static_assert(sizeof(_B32) == 4, ""); +# if _CCCL_CUDA_COMPILER(NVHPC) || __CUDA_ARCH_FEAT_SM100_ALL || __CUDA_ARCH_FEAT_SM101_ALL + asm( + "tcgen05.st.sync.aligned.16x64b.x64.unpack::16b.b32 [%0], {%1, %2, %3, %4, %5, %6, %7, %8, %9, %10, %11, %12, %13, " + "%14, %15, %16, %17, %18, %19, %20, %21, %22, %23, %24, %25, %26, %27, %28, %29, %30, %31, %32, %33, %34, %35, " + "%36, %37, %38, %39, %40, %41, %42, %43, %44, %45, %46, %47, %48, %49, %50, %51, %52, %53, %54, %55, %56, %57, " + "%58, %59, %60, %61, %62, %63, %64};" + : + : "r"(__taddr), + "r"(__as_b32(__values[0])), + "r"(__as_b32(__values[1])), + "r"(__as_b32(__values[2])), + "r"(__as_b32(__values[3])), + "r"(__as_b32(__values[4])), + "r"(__as_b32(__values[5])), + "r"(__as_b32(__values[6])), + "r"(__as_b32(__values[7])), + "r"(__as_b32(__values[8])), + "r"(__as_b32(__values[9])), + "r"(__as_b32(__values[10])), + "r"(__as_b32(__values[11])), + "r"(__as_b32(__values[12])), + "r"(__as_b32(__values[13])), + "r"(__as_b32(__values[14])), + "r"(__as_b32(__values[15])), + "r"(__as_b32(__values[16])), + "r"(__as_b32(__values[17])), + "r"(__as_b32(__values[18])), + "r"(__as_b32(__values[19])), + "r"(__as_b32(__values[20])), + "r"(__as_b32(__values[21])), + "r"(__as_b32(__values[22])), + "r"(__as_b32(__values[23])), + "r"(__as_b32(__values[24])), + "r"(__as_b32(__values[25])), + "r"(__as_b32(__values[26])), + "r"(__as_b32(__values[27])), + "r"(__as_b32(__values[28])), + "r"(__as_b32(__values[29])), + "r"(__as_b32(__values[30])), + 
"r"(__as_b32(__values[31])), + "r"(__as_b32(__values[32])), + "r"(__as_b32(__values[33])), + "r"(__as_b32(__values[34])), + "r"(__as_b32(__values[35])), + "r"(__as_b32(__values[36])), + "r"(__as_b32(__values[37])), + "r"(__as_b32(__values[38])), + "r"(__as_b32(__values[39])), + "r"(__as_b32(__values[40])), + "r"(__as_b32(__values[41])), + "r"(__as_b32(__values[42])), + "r"(__as_b32(__values[43])), + "r"(__as_b32(__values[44])), + "r"(__as_b32(__values[45])), + "r"(__as_b32(__values[46])), + "r"(__as_b32(__values[47])), + "r"(__as_b32(__values[48])), + "r"(__as_b32(__values[49])), + "r"(__as_b32(__values[50])), + "r"(__as_b32(__values[51])), + "r"(__as_b32(__values[52])), + "r"(__as_b32(__values[53])), + "r"(__as_b32(__values[54])), + "r"(__as_b32(__values[55])), + "r"(__as_b32(__values[56])), + "r"(__as_b32(__values[57])), + "r"(__as_b32(__values[58])), + "r"(__as_b32(__values[59])), + "r"(__as_b32(__values[60])), + "r"(__as_b32(__values[61])), + "r"(__as_b32(__values[62])), + "r"(__as_b32(__values[63])) + : "memory"); +# else + // Unsupported architectures will have a linker error with a semi-decent error message + __cuda_ptx_tcgen05_st_16x64b_unpack_16b_is_not_supported_before_SM_100a_SM_101a__(); +# endif +} +#endif // __cccl_ptx_isa >= 860 + +/* +// tcgen05.st.sync.aligned.16x64b.x128.b32 [taddr], values; // PTX ISA 86, SM_100a, SM_101a +template = true> +__device__ static inline void tcgen05_st_16x64b( + uint32_t taddr, + const B32 (&values)[128]); +*/ +#if __cccl_ptx_isa >= 860 +extern "C" _CCCL_DEVICE void __cuda_ptx_tcgen05_st_16x64b_is_not_supported_before_SM_100a_SM_101a__(); +template = true> +_CCCL_DEVICE static inline void tcgen05_st_16x64b(_CUDA_VSTD::uint32_t __taddr, const _B32 (&__values)[128]) +{ + static_assert(sizeof(_B32) == 4, ""); +# if _CCCL_CUDA_COMPILER(NVHPC) || __CUDA_ARCH_FEAT_SM100_ALL || __CUDA_ARCH_FEAT_SM101_ALL + asm( + "tcgen05.st.sync.aligned.16x64b.x128.b32 [%0], {%1, %2, %3, %4, %5, %6, %7, %8, %9, %10, %11, %12, %13, %14, %15, " + "%16, %17, %18, %19, %20, %21, %22, %23, %24, %25, %26, %27, %28, %29, %30, %31, %32, %33, %34, %35, %36, %37, " + "%38, %39, %40, %41, %42, %43, %44, %45, %46, %47, %48, %49, %50, %51, %52, %53, %54, %55, %56, %57, %58, %59, " + "%60, %61, %62, %63, %64, %65, %66, %67, %68, %69, %70, %71, %72, %73, %74, %75, %76, %77, %78, %79, %80, %81, " + "%82, %83, %84, %85, %86, %87, %88, %89, %90, %91, %92, %93, %94, %95, %96, %97, %98, %99, %100, %101, %102, %103, " + "%104, %105, %106, %107, %108, %109, %110, %111, %112, %113, %114, %115, %116, %117, %118, %119, %120, %121, %122, " + "%123, %124, %125, %126, %127, %128};" + : + : "r"(__taddr), + "r"(__as_b32(__values[0])), + "r"(__as_b32(__values[1])), + "r"(__as_b32(__values[2])), + "r"(__as_b32(__values[3])), + "r"(__as_b32(__values[4])), + "r"(__as_b32(__values[5])), + "r"(__as_b32(__values[6])), + "r"(__as_b32(__values[7])), + "r"(__as_b32(__values[8])), + "r"(__as_b32(__values[9])), + "r"(__as_b32(__values[10])), + "r"(__as_b32(__values[11])), + "r"(__as_b32(__values[12])), + "r"(__as_b32(__values[13])), + "r"(__as_b32(__values[14])), + "r"(__as_b32(__values[15])), + "r"(__as_b32(__values[16])), + "r"(__as_b32(__values[17])), + "r"(__as_b32(__values[18])), + "r"(__as_b32(__values[19])), + "r"(__as_b32(__values[20])), + "r"(__as_b32(__values[21])), + "r"(__as_b32(__values[22])), + "r"(__as_b32(__values[23])), + "r"(__as_b32(__values[24])), + "r"(__as_b32(__values[25])), + "r"(__as_b32(__values[26])), + "r"(__as_b32(__values[27])), + "r"(__as_b32(__values[28])), + 
"r"(__as_b32(__values[29])), + "r"(__as_b32(__values[30])), + "r"(__as_b32(__values[31])), + "r"(__as_b32(__values[32])), + "r"(__as_b32(__values[33])), + "r"(__as_b32(__values[34])), + "r"(__as_b32(__values[35])), + "r"(__as_b32(__values[36])), + "r"(__as_b32(__values[37])), + "r"(__as_b32(__values[38])), + "r"(__as_b32(__values[39])), + "r"(__as_b32(__values[40])), + "r"(__as_b32(__values[41])), + "r"(__as_b32(__values[42])), + "r"(__as_b32(__values[43])), + "r"(__as_b32(__values[44])), + "r"(__as_b32(__values[45])), + "r"(__as_b32(__values[46])), + "r"(__as_b32(__values[47])), + "r"(__as_b32(__values[48])), + "r"(__as_b32(__values[49])), + "r"(__as_b32(__values[50])), + "r"(__as_b32(__values[51])), + "r"(__as_b32(__values[52])), + "r"(__as_b32(__values[53])), + "r"(__as_b32(__values[54])), + "r"(__as_b32(__values[55])), + "r"(__as_b32(__values[56])), + "r"(__as_b32(__values[57])), + "r"(__as_b32(__values[58])), + "r"(__as_b32(__values[59])), + "r"(__as_b32(__values[60])), + "r"(__as_b32(__values[61])), + "r"(__as_b32(__values[62])), + "r"(__as_b32(__values[63])), + "r"(__as_b32(__values[64])), + "r"(__as_b32(__values[65])), + "r"(__as_b32(__values[66])), + "r"(__as_b32(__values[67])), + "r"(__as_b32(__values[68])), + "r"(__as_b32(__values[69])), + "r"(__as_b32(__values[70])), + "r"(__as_b32(__values[71])), + "r"(__as_b32(__values[72])), + "r"(__as_b32(__values[73])), + "r"(__as_b32(__values[74])), + "r"(__as_b32(__values[75])), + "r"(__as_b32(__values[76])), + "r"(__as_b32(__values[77])), + "r"(__as_b32(__values[78])), + "r"(__as_b32(__values[79])), + "r"(__as_b32(__values[80])), + "r"(__as_b32(__values[81])), + "r"(__as_b32(__values[82])), + "r"(__as_b32(__values[83])), + "r"(__as_b32(__values[84])), + "r"(__as_b32(__values[85])), + "r"(__as_b32(__values[86])), + "r"(__as_b32(__values[87])), + "r"(__as_b32(__values[88])), + "r"(__as_b32(__values[89])), + "r"(__as_b32(__values[90])), + "r"(__as_b32(__values[91])), + "r"(__as_b32(__values[92])), + "r"(__as_b32(__values[93])), + "r"(__as_b32(__values[94])), + "r"(__as_b32(__values[95])), + "r"(__as_b32(__values[96])), + "r"(__as_b32(__values[97])), + "r"(__as_b32(__values[98])), + "r"(__as_b32(__values[99])), + "r"(__as_b32(__values[100])), + "r"(__as_b32(__values[101])), + "r"(__as_b32(__values[102])), + "r"(__as_b32(__values[103])), + "r"(__as_b32(__values[104])), + "r"(__as_b32(__values[105])), + "r"(__as_b32(__values[106])), + "r"(__as_b32(__values[107])), + "r"(__as_b32(__values[108])), + "r"(__as_b32(__values[109])), + "r"(__as_b32(__values[110])), + "r"(__as_b32(__values[111])), + "r"(__as_b32(__values[112])), + "r"(__as_b32(__values[113])), + "r"(__as_b32(__values[114])), + "r"(__as_b32(__values[115])), + "r"(__as_b32(__values[116])), + "r"(__as_b32(__values[117])), + "r"(__as_b32(__values[118])), + "r"(__as_b32(__values[119])), + "r"(__as_b32(__values[120])), + "r"(__as_b32(__values[121])), + "r"(__as_b32(__values[122])), + "r"(__as_b32(__values[123])), + "r"(__as_b32(__values[124])), + "r"(__as_b32(__values[125])), + "r"(__as_b32(__values[126])), + "r"(__as_b32(__values[127])) + : "memory"); +# else + // Unsupported architectures will have a linker error with a semi-decent error message + __cuda_ptx_tcgen05_st_16x64b_is_not_supported_before_SM_100a_SM_101a__(); +# endif +} +#endif // __cccl_ptx_isa >= 860 + +/* +// tcgen05.st.sync.aligned.16x64b.x128.unpack::16b.b32 [taddr], values; // PTX ISA 86, SM_100a, SM_101a +template = true> +__device__ static inline void tcgen05_st_16x64b_unpack_16b( + uint32_t taddr, + const B32 
(&values)[128]); +*/ +#if __cccl_ptx_isa >= 860 +extern "C" _CCCL_DEVICE void __cuda_ptx_tcgen05_st_16x64b_unpack_16b_is_not_supported_before_SM_100a_SM_101a__(); +template = true> +_CCCL_DEVICE static inline void tcgen05_st_16x64b_unpack_16b(_CUDA_VSTD::uint32_t __taddr, const _B32 (&__values)[128]) +{ + static_assert(sizeof(_B32) == 4, ""); +# if _CCCL_CUDA_COMPILER(NVHPC) || __CUDA_ARCH_FEAT_SM100_ALL || __CUDA_ARCH_FEAT_SM101_ALL + asm( + "tcgen05.st.sync.aligned.16x64b.x128.unpack::16b.b32 [%0], {%1, %2, %3, %4, %5, %6, %7, %8, %9, %10, %11, %12, " + "%13, %14, %15, %16, %17, %18, %19, %20, %21, %22, %23, %24, %25, %26, %27, %28, %29, %30, %31, %32, %33, %34, " + "%35, %36, %37, %38, %39, %40, %41, %42, %43, %44, %45, %46, %47, %48, %49, %50, %51, %52, %53, %54, %55, %56, " + "%57, %58, %59, %60, %61, %62, %63, %64, %65, %66, %67, %68, %69, %70, %71, %72, %73, %74, %75, %76, %77, %78, " + "%79, %80, %81, %82, %83, %84, %85, %86, %87, %88, %89, %90, %91, %92, %93, %94, %95, %96, %97, %98, %99, %100, " + "%101, %102, %103, %104, %105, %106, %107, %108, %109, %110, %111, %112, %113, %114, %115, %116, %117, %118, %119, " + "%120, %121, %122, %123, %124, %125, %126, %127, %128};" + : + : "r"(__taddr), + "r"(__as_b32(__values[0])), + "r"(__as_b32(__values[1])), + "r"(__as_b32(__values[2])), + "r"(__as_b32(__values[3])), + "r"(__as_b32(__values[4])), + "r"(__as_b32(__values[5])), + "r"(__as_b32(__values[6])), + "r"(__as_b32(__values[7])), + "r"(__as_b32(__values[8])), + "r"(__as_b32(__values[9])), + "r"(__as_b32(__values[10])), + "r"(__as_b32(__values[11])), + "r"(__as_b32(__values[12])), + "r"(__as_b32(__values[13])), + "r"(__as_b32(__values[14])), + "r"(__as_b32(__values[15])), + "r"(__as_b32(__values[16])), + "r"(__as_b32(__values[17])), + "r"(__as_b32(__values[18])), + "r"(__as_b32(__values[19])), + "r"(__as_b32(__values[20])), + "r"(__as_b32(__values[21])), + "r"(__as_b32(__values[22])), + "r"(__as_b32(__values[23])), + "r"(__as_b32(__values[24])), + "r"(__as_b32(__values[25])), + "r"(__as_b32(__values[26])), + "r"(__as_b32(__values[27])), + "r"(__as_b32(__values[28])), + "r"(__as_b32(__values[29])), + "r"(__as_b32(__values[30])), + "r"(__as_b32(__values[31])), + "r"(__as_b32(__values[32])), + "r"(__as_b32(__values[33])), + "r"(__as_b32(__values[34])), + "r"(__as_b32(__values[35])), + "r"(__as_b32(__values[36])), + "r"(__as_b32(__values[37])), + "r"(__as_b32(__values[38])), + "r"(__as_b32(__values[39])), + "r"(__as_b32(__values[40])), + "r"(__as_b32(__values[41])), + "r"(__as_b32(__values[42])), + "r"(__as_b32(__values[43])), + "r"(__as_b32(__values[44])), + "r"(__as_b32(__values[45])), + "r"(__as_b32(__values[46])), + "r"(__as_b32(__values[47])), + "r"(__as_b32(__values[48])), + "r"(__as_b32(__values[49])), + "r"(__as_b32(__values[50])), + "r"(__as_b32(__values[51])), + "r"(__as_b32(__values[52])), + "r"(__as_b32(__values[53])), + "r"(__as_b32(__values[54])), + "r"(__as_b32(__values[55])), + "r"(__as_b32(__values[56])), + "r"(__as_b32(__values[57])), + "r"(__as_b32(__values[58])), + "r"(__as_b32(__values[59])), + "r"(__as_b32(__values[60])), + "r"(__as_b32(__values[61])), + "r"(__as_b32(__values[62])), + "r"(__as_b32(__values[63])), + "r"(__as_b32(__values[64])), + "r"(__as_b32(__values[65])), + "r"(__as_b32(__values[66])), + "r"(__as_b32(__values[67])), + "r"(__as_b32(__values[68])), + "r"(__as_b32(__values[69])), + "r"(__as_b32(__values[70])), + "r"(__as_b32(__values[71])), + "r"(__as_b32(__values[72])), + "r"(__as_b32(__values[73])), + "r"(__as_b32(__values[74])), + 
"r"(__as_b32(__values[75])), + "r"(__as_b32(__values[76])), + "r"(__as_b32(__values[77])), + "r"(__as_b32(__values[78])), + "r"(__as_b32(__values[79])), + "r"(__as_b32(__values[80])), + "r"(__as_b32(__values[81])), + "r"(__as_b32(__values[82])), + "r"(__as_b32(__values[83])), + "r"(__as_b32(__values[84])), + "r"(__as_b32(__values[85])), + "r"(__as_b32(__values[86])), + "r"(__as_b32(__values[87])), + "r"(__as_b32(__values[88])), + "r"(__as_b32(__values[89])), + "r"(__as_b32(__values[90])), + "r"(__as_b32(__values[91])), + "r"(__as_b32(__values[92])), + "r"(__as_b32(__values[93])), + "r"(__as_b32(__values[94])), + "r"(__as_b32(__values[95])), + "r"(__as_b32(__values[96])), + "r"(__as_b32(__values[97])), + "r"(__as_b32(__values[98])), + "r"(__as_b32(__values[99])), + "r"(__as_b32(__values[100])), + "r"(__as_b32(__values[101])), + "r"(__as_b32(__values[102])), + "r"(__as_b32(__values[103])), + "r"(__as_b32(__values[104])), + "r"(__as_b32(__values[105])), + "r"(__as_b32(__values[106])), + "r"(__as_b32(__values[107])), + "r"(__as_b32(__values[108])), + "r"(__as_b32(__values[109])), + "r"(__as_b32(__values[110])), + "r"(__as_b32(__values[111])), + "r"(__as_b32(__values[112])), + "r"(__as_b32(__values[113])), + "r"(__as_b32(__values[114])), + "r"(__as_b32(__values[115])), + "r"(__as_b32(__values[116])), + "r"(__as_b32(__values[117])), + "r"(__as_b32(__values[118])), + "r"(__as_b32(__values[119])), + "r"(__as_b32(__values[120])), + "r"(__as_b32(__values[121])), + "r"(__as_b32(__values[122])), + "r"(__as_b32(__values[123])), + "r"(__as_b32(__values[124])), + "r"(__as_b32(__values[125])), + "r"(__as_b32(__values[126])), + "r"(__as_b32(__values[127])) + : "memory"); +# else + // Unsupported architectures will have a linker error with a semi-decent error message + __cuda_ptx_tcgen05_st_16x64b_unpack_16b_is_not_supported_before_SM_100a_SM_101a__(); +# endif +} +#endif // __cccl_ptx_isa >= 860 + +/* +// tcgen05.st.sync.aligned.16x128b.x1.b32 [taddr], values; // PTX ISA 86, SM_100a, SM_101a +template = true> +__device__ static inline void tcgen05_st_16x128b( + uint32_t taddr, + const B32 (&values)[2]); +*/ +#if __cccl_ptx_isa >= 860 +extern "C" _CCCL_DEVICE void __cuda_ptx_tcgen05_st_16x128b_is_not_supported_before_SM_100a_SM_101a__(); +template = true> +_CCCL_DEVICE static inline void tcgen05_st_16x128b(_CUDA_VSTD::uint32_t __taddr, const _B32 (&__values)[2]) +{ + static_assert(sizeof(_B32) == 4, ""); +# if _CCCL_CUDA_COMPILER(NVHPC) || __CUDA_ARCH_FEAT_SM100_ALL || __CUDA_ARCH_FEAT_SM101_ALL + asm("tcgen05.st.sync.aligned.16x128b.x1.b32 [%0], {%1, %2};" + : + : "r"(__taddr), "r"(__as_b32(__values[0])), "r"(__as_b32(__values[1])) + : "memory"); +# else + // Unsupported architectures will have a linker error with a semi-decent error message + __cuda_ptx_tcgen05_st_16x128b_is_not_supported_before_SM_100a_SM_101a__(); +# endif +} +#endif // __cccl_ptx_isa >= 860 + +/* +// tcgen05.st.sync.aligned.16x128b.x1.unpack::16b.b32 [taddr], values; // PTX ISA 86, SM_100a, SM_101a +template = true> +__device__ static inline void tcgen05_st_16x128b_unpack_16b( + uint32_t taddr, + const B32 (&values)[2]); +*/ +#if __cccl_ptx_isa >= 860 +extern "C" _CCCL_DEVICE void __cuda_ptx_tcgen05_st_16x128b_unpack_16b_is_not_supported_before_SM_100a_SM_101a__(); +template = true> +_CCCL_DEVICE static inline void tcgen05_st_16x128b_unpack_16b(_CUDA_VSTD::uint32_t __taddr, const _B32 (&__values)[2]) +{ + static_assert(sizeof(_B32) == 4, ""); +# if _CCCL_CUDA_COMPILER(NVHPC) || __CUDA_ARCH_FEAT_SM100_ALL || __CUDA_ARCH_FEAT_SM101_ALL + 
asm("tcgen05.st.sync.aligned.16x128b.x1.unpack::16b.b32 [%0], {%1, %2};" + : + : "r"(__taddr), "r"(__as_b32(__values[0])), "r"(__as_b32(__values[1])) + : "memory"); +# else + // Unsupported architectures will have a linker error with a semi-decent error message + __cuda_ptx_tcgen05_st_16x128b_unpack_16b_is_not_supported_before_SM_100a_SM_101a__(); +# endif +} +#endif // __cccl_ptx_isa >= 860 + +/* +// tcgen05.st.sync.aligned.16x128b.x2.b32 [taddr], values; // PTX ISA 86, SM_100a, SM_101a +template = true> +__device__ static inline void tcgen05_st_16x128b( + uint32_t taddr, + const B32 (&values)[4]); +*/ +#if __cccl_ptx_isa >= 860 +extern "C" _CCCL_DEVICE void __cuda_ptx_tcgen05_st_16x128b_is_not_supported_before_SM_100a_SM_101a__(); +template = true> +_CCCL_DEVICE static inline void tcgen05_st_16x128b(_CUDA_VSTD::uint32_t __taddr, const _B32 (&__values)[4]) +{ + static_assert(sizeof(_B32) == 4, ""); +# if _CCCL_CUDA_COMPILER(NVHPC) || __CUDA_ARCH_FEAT_SM100_ALL || __CUDA_ARCH_FEAT_SM101_ALL + asm("tcgen05.st.sync.aligned.16x128b.x2.b32 [%0], {%1, %2, %3, %4};" + : + : "r"(__taddr), + "r"(__as_b32(__values[0])), + "r"(__as_b32(__values[1])), + "r"(__as_b32(__values[2])), + "r"(__as_b32(__values[3])) + : "memory"); +# else + // Unsupported architectures will have a linker error with a semi-decent error message + __cuda_ptx_tcgen05_st_16x128b_is_not_supported_before_SM_100a_SM_101a__(); +# endif +} +#endif // __cccl_ptx_isa >= 860 + +/* +// tcgen05.st.sync.aligned.16x128b.x2.unpack::16b.b32 [taddr], values; // PTX ISA 86, SM_100a, SM_101a +template = true> +__device__ static inline void tcgen05_st_16x128b_unpack_16b( + uint32_t taddr, + const B32 (&values)[4]); +*/ +#if __cccl_ptx_isa >= 860 +extern "C" _CCCL_DEVICE void __cuda_ptx_tcgen05_st_16x128b_unpack_16b_is_not_supported_before_SM_100a_SM_101a__(); +template = true> +_CCCL_DEVICE static inline void tcgen05_st_16x128b_unpack_16b(_CUDA_VSTD::uint32_t __taddr, const _B32 (&__values)[4]) +{ + static_assert(sizeof(_B32) == 4, ""); +# if _CCCL_CUDA_COMPILER(NVHPC) || __CUDA_ARCH_FEAT_SM100_ALL || __CUDA_ARCH_FEAT_SM101_ALL + asm("tcgen05.st.sync.aligned.16x128b.x2.unpack::16b.b32 [%0], {%1, %2, %3, %4};" + : + : "r"(__taddr), + "r"(__as_b32(__values[0])), + "r"(__as_b32(__values[1])), + "r"(__as_b32(__values[2])), + "r"(__as_b32(__values[3])) + : "memory"); +# else + // Unsupported architectures will have a linker error with a semi-decent error message + __cuda_ptx_tcgen05_st_16x128b_unpack_16b_is_not_supported_before_SM_100a_SM_101a__(); +# endif +} +#endif // __cccl_ptx_isa >= 860 + +/* +// tcgen05.st.sync.aligned.16x128b.x4.b32 [taddr], values; // PTX ISA 86, SM_100a, SM_101a +template = true> +__device__ static inline void tcgen05_st_16x128b( + uint32_t taddr, + const B32 (&values)[8]); +*/ +#if __cccl_ptx_isa >= 860 +extern "C" _CCCL_DEVICE void __cuda_ptx_tcgen05_st_16x128b_is_not_supported_before_SM_100a_SM_101a__(); +template = true> +_CCCL_DEVICE static inline void tcgen05_st_16x128b(_CUDA_VSTD::uint32_t __taddr, const _B32 (&__values)[8]) +{ + static_assert(sizeof(_B32) == 4, ""); +# if _CCCL_CUDA_COMPILER(NVHPC) || __CUDA_ARCH_FEAT_SM100_ALL || __CUDA_ARCH_FEAT_SM101_ALL + asm("tcgen05.st.sync.aligned.16x128b.x4.b32 [%0], {%1, %2, %3, %4, %5, %6, %7, %8};" + : + : "r"(__taddr), + "r"(__as_b32(__values[0])), + "r"(__as_b32(__values[1])), + "r"(__as_b32(__values[2])), + "r"(__as_b32(__values[3])), + "r"(__as_b32(__values[4])), + "r"(__as_b32(__values[5])), + "r"(__as_b32(__values[6])), + "r"(__as_b32(__values[7])) + : "memory"); +# 
else + // Unsupported architectures will have a linker error with a semi-decent error message + __cuda_ptx_tcgen05_st_16x128b_is_not_supported_before_SM_100a_SM_101a__(); +# endif +} +#endif // __cccl_ptx_isa >= 860 + +/* +// tcgen05.st.sync.aligned.16x128b.x4.unpack::16b.b32 [taddr], values; // PTX ISA 86, SM_100a, SM_101a +template = true> +__device__ static inline void tcgen05_st_16x128b_unpack_16b( + uint32_t taddr, + const B32 (&values)[8]); +*/ +#if __cccl_ptx_isa >= 860 +extern "C" _CCCL_DEVICE void __cuda_ptx_tcgen05_st_16x128b_unpack_16b_is_not_supported_before_SM_100a_SM_101a__(); +template = true> +_CCCL_DEVICE static inline void tcgen05_st_16x128b_unpack_16b(_CUDA_VSTD::uint32_t __taddr, const _B32 (&__values)[8]) +{ + static_assert(sizeof(_B32) == 4, ""); +# if _CCCL_CUDA_COMPILER(NVHPC) || __CUDA_ARCH_FEAT_SM100_ALL || __CUDA_ARCH_FEAT_SM101_ALL + asm("tcgen05.st.sync.aligned.16x128b.x4.unpack::16b.b32 [%0], {%1, %2, %3, %4, %5, %6, %7, %8};" + : + : "r"(__taddr), + "r"(__as_b32(__values[0])), + "r"(__as_b32(__values[1])), + "r"(__as_b32(__values[2])), + "r"(__as_b32(__values[3])), + "r"(__as_b32(__values[4])), + "r"(__as_b32(__values[5])), + "r"(__as_b32(__values[6])), + "r"(__as_b32(__values[7])) + : "memory"); +# else + // Unsupported architectures will have a linker error with a semi-decent error message + __cuda_ptx_tcgen05_st_16x128b_unpack_16b_is_not_supported_before_SM_100a_SM_101a__(); +# endif +} +#endif // __cccl_ptx_isa >= 860 + +/* +// tcgen05.st.sync.aligned.16x128b.x8.b32 [taddr], values; // PTX ISA 86, SM_100a, SM_101a +template = true> +__device__ static inline void tcgen05_st_16x128b( + uint32_t taddr, + const B32 (&values)[16]); +*/ +#if __cccl_ptx_isa >= 860 +extern "C" _CCCL_DEVICE void __cuda_ptx_tcgen05_st_16x128b_is_not_supported_before_SM_100a_SM_101a__(); +template = true> +_CCCL_DEVICE static inline void tcgen05_st_16x128b(_CUDA_VSTD::uint32_t __taddr, const _B32 (&__values)[16]) +{ + static_assert(sizeof(_B32) == 4, ""); +# if _CCCL_CUDA_COMPILER(NVHPC) || __CUDA_ARCH_FEAT_SM100_ALL || __CUDA_ARCH_FEAT_SM101_ALL + asm("tcgen05.st.sync.aligned.16x128b.x8.b32 [%0], {%1, %2, %3, %4, %5, %6, %7, %8, %9, %10, %11, %12, %13, %14, %15, " + "%16};" + : + : "r"(__taddr), + "r"(__as_b32(__values[0])), + "r"(__as_b32(__values[1])), + "r"(__as_b32(__values[2])), + "r"(__as_b32(__values[3])), + "r"(__as_b32(__values[4])), + "r"(__as_b32(__values[5])), + "r"(__as_b32(__values[6])), + "r"(__as_b32(__values[7])), + "r"(__as_b32(__values[8])), + "r"(__as_b32(__values[9])), + "r"(__as_b32(__values[10])), + "r"(__as_b32(__values[11])), + "r"(__as_b32(__values[12])), + "r"(__as_b32(__values[13])), + "r"(__as_b32(__values[14])), + "r"(__as_b32(__values[15])) + : "memory"); +# else + // Unsupported architectures will have a linker error with a semi-decent error message + __cuda_ptx_tcgen05_st_16x128b_is_not_supported_before_SM_100a_SM_101a__(); +# endif +} +#endif // __cccl_ptx_isa >= 860 + +/* +// tcgen05.st.sync.aligned.16x128b.x8.unpack::16b.b32 [taddr], values; // PTX ISA 86, SM_100a, SM_101a +template = true> +__device__ static inline void tcgen05_st_16x128b_unpack_16b( + uint32_t taddr, + const B32 (&values)[16]); +*/ +#if __cccl_ptx_isa >= 860 +extern "C" _CCCL_DEVICE void __cuda_ptx_tcgen05_st_16x128b_unpack_16b_is_not_supported_before_SM_100a_SM_101a__(); +template = true> +_CCCL_DEVICE static inline void tcgen05_st_16x128b_unpack_16b(_CUDA_VSTD::uint32_t __taddr, const _B32 (&__values)[16]) +{ + static_assert(sizeof(_B32) == 4, ""); +# if 
_CCCL_CUDA_COMPILER(NVHPC) || __CUDA_ARCH_FEAT_SM100_ALL || __CUDA_ARCH_FEAT_SM101_ALL + asm("tcgen05.st.sync.aligned.16x128b.x8.unpack::16b.b32 [%0], {%1, %2, %3, %4, %5, %6, %7, %8, %9, %10, %11, %12, " + "%13, %14, %15, %16};" + : + : "r"(__taddr), + "r"(__as_b32(__values[0])), + "r"(__as_b32(__values[1])), + "r"(__as_b32(__values[2])), + "r"(__as_b32(__values[3])), + "r"(__as_b32(__values[4])), + "r"(__as_b32(__values[5])), + "r"(__as_b32(__values[6])), + "r"(__as_b32(__values[7])), + "r"(__as_b32(__values[8])), + "r"(__as_b32(__values[9])), + "r"(__as_b32(__values[10])), + "r"(__as_b32(__values[11])), + "r"(__as_b32(__values[12])), + "r"(__as_b32(__values[13])), + "r"(__as_b32(__values[14])), + "r"(__as_b32(__values[15])) + : "memory"); +# else + // Unsupported architectures will have a linker error with a semi-decent error message + __cuda_ptx_tcgen05_st_16x128b_unpack_16b_is_not_supported_before_SM_100a_SM_101a__(); +# endif +} +#endif // __cccl_ptx_isa >= 860 + +/* +// tcgen05.st.sync.aligned.16x128b.x16.b32 [taddr], values; // PTX ISA 86, SM_100a, SM_101a +template = true> +__device__ static inline void tcgen05_st_16x128b( + uint32_t taddr, + const B32 (&values)[32]); +*/ +#if __cccl_ptx_isa >= 860 +extern "C" _CCCL_DEVICE void __cuda_ptx_tcgen05_st_16x128b_is_not_supported_before_SM_100a_SM_101a__(); +template = true> +_CCCL_DEVICE static inline void tcgen05_st_16x128b(_CUDA_VSTD::uint32_t __taddr, const _B32 (&__values)[32]) +{ + static_assert(sizeof(_B32) == 4, ""); +# if _CCCL_CUDA_COMPILER(NVHPC) || __CUDA_ARCH_FEAT_SM100_ALL || __CUDA_ARCH_FEAT_SM101_ALL + asm( + "tcgen05.st.sync.aligned.16x128b.x16.b32 [%0], {%1, %2, %3, %4, %5, %6, %7, %8, %9, %10, %11, %12, %13, %14, %15, " + "%16, %17, %18, %19, %20, %21, %22, %23, %24, %25, %26, %27, %28, %29, %30, %31, %32};" + : + : "r"(__taddr), + "r"(__as_b32(__values[0])), + "r"(__as_b32(__values[1])), + "r"(__as_b32(__values[2])), + "r"(__as_b32(__values[3])), + "r"(__as_b32(__values[4])), + "r"(__as_b32(__values[5])), + "r"(__as_b32(__values[6])), + "r"(__as_b32(__values[7])), + "r"(__as_b32(__values[8])), + "r"(__as_b32(__values[9])), + "r"(__as_b32(__values[10])), + "r"(__as_b32(__values[11])), + "r"(__as_b32(__values[12])), + "r"(__as_b32(__values[13])), + "r"(__as_b32(__values[14])), + "r"(__as_b32(__values[15])), + "r"(__as_b32(__values[16])), + "r"(__as_b32(__values[17])), + "r"(__as_b32(__values[18])), + "r"(__as_b32(__values[19])), + "r"(__as_b32(__values[20])), + "r"(__as_b32(__values[21])), + "r"(__as_b32(__values[22])), + "r"(__as_b32(__values[23])), + "r"(__as_b32(__values[24])), + "r"(__as_b32(__values[25])), + "r"(__as_b32(__values[26])), + "r"(__as_b32(__values[27])), + "r"(__as_b32(__values[28])), + "r"(__as_b32(__values[29])), + "r"(__as_b32(__values[30])), + "r"(__as_b32(__values[31])) + : "memory"); +# else + // Unsupported architectures will have a linker error with a semi-decent error message + __cuda_ptx_tcgen05_st_16x128b_is_not_supported_before_SM_100a_SM_101a__(); +# endif +} +#endif // __cccl_ptx_isa >= 860 + +/* +// tcgen05.st.sync.aligned.16x128b.x16.unpack::16b.b32 [taddr], values; // PTX ISA 86, SM_100a, SM_101a +template = true> +__device__ static inline void tcgen05_st_16x128b_unpack_16b( + uint32_t taddr, + const B32 (&values)[32]); +*/ +#if __cccl_ptx_isa >= 860 +extern "C" _CCCL_DEVICE void __cuda_ptx_tcgen05_st_16x128b_unpack_16b_is_not_supported_before_SM_100a_SM_101a__(); +template = true> +_CCCL_DEVICE static inline void tcgen05_st_16x128b_unpack_16b(_CUDA_VSTD::uint32_t __taddr, const _B32 
(&__values)[32]) +{ + static_assert(sizeof(_B32) == 4, ""); +# if _CCCL_CUDA_COMPILER(NVHPC) || __CUDA_ARCH_FEAT_SM100_ALL || __CUDA_ARCH_FEAT_SM101_ALL + asm( + "tcgen05.st.sync.aligned.16x128b.x16.unpack::16b.b32 [%0], {%1, %2, %3, %4, %5, %6, %7, %8, %9, %10, %11, %12, " + "%13, %14, %15, %16, %17, %18, %19, %20, %21, %22, %23, %24, %25, %26, %27, %28, %29, %30, %31, %32};" + : + : "r"(__taddr), + "r"(__as_b32(__values[0])), + "r"(__as_b32(__values[1])), + "r"(__as_b32(__values[2])), + "r"(__as_b32(__values[3])), + "r"(__as_b32(__values[4])), + "r"(__as_b32(__values[5])), + "r"(__as_b32(__values[6])), + "r"(__as_b32(__values[7])), + "r"(__as_b32(__values[8])), + "r"(__as_b32(__values[9])), + "r"(__as_b32(__values[10])), + "r"(__as_b32(__values[11])), + "r"(__as_b32(__values[12])), + "r"(__as_b32(__values[13])), + "r"(__as_b32(__values[14])), + "r"(__as_b32(__values[15])), + "r"(__as_b32(__values[16])), + "r"(__as_b32(__values[17])), + "r"(__as_b32(__values[18])), + "r"(__as_b32(__values[19])), + "r"(__as_b32(__values[20])), + "r"(__as_b32(__values[21])), + "r"(__as_b32(__values[22])), + "r"(__as_b32(__values[23])), + "r"(__as_b32(__values[24])), + "r"(__as_b32(__values[25])), + "r"(__as_b32(__values[26])), + "r"(__as_b32(__values[27])), + "r"(__as_b32(__values[28])), + "r"(__as_b32(__values[29])), + "r"(__as_b32(__values[30])), + "r"(__as_b32(__values[31])) + : "memory"); +# else + // Unsupported architectures will have a linker error with a semi-decent error message + __cuda_ptx_tcgen05_st_16x128b_unpack_16b_is_not_supported_before_SM_100a_SM_101a__(); +# endif +} +#endif // __cccl_ptx_isa >= 860 + +/* +// tcgen05.st.sync.aligned.16x128b.x32.b32 [taddr], values; // PTX ISA 86, SM_100a, SM_101a +template = true> +__device__ static inline void tcgen05_st_16x128b( + uint32_t taddr, + const B32 (&values)[64]); +*/ +#if __cccl_ptx_isa >= 860 +extern "C" _CCCL_DEVICE void __cuda_ptx_tcgen05_st_16x128b_is_not_supported_before_SM_100a_SM_101a__(); +template = true> +_CCCL_DEVICE static inline void tcgen05_st_16x128b(_CUDA_VSTD::uint32_t __taddr, const _B32 (&__values)[64]) +{ + static_assert(sizeof(_B32) == 4, ""); +# if _CCCL_CUDA_COMPILER(NVHPC) || __CUDA_ARCH_FEAT_SM100_ALL || __CUDA_ARCH_FEAT_SM101_ALL + asm( + "tcgen05.st.sync.aligned.16x128b.x32.b32 [%0], {%1, %2, %3, %4, %5, %6, %7, %8, %9, %10, %11, %12, %13, %14, %15, " + "%16, %17, %18, %19, %20, %21, %22, %23, %24, %25, %26, %27, %28, %29, %30, %31, %32, %33, %34, %35, %36, %37, " + "%38, %39, %40, %41, %42, %43, %44, %45, %46, %47, %48, %49, %50, %51, %52, %53, %54, %55, %56, %57, %58, %59, " + "%60, %61, %62, %63, %64};" + : + : "r"(__taddr), + "r"(__as_b32(__values[0])), + "r"(__as_b32(__values[1])), + "r"(__as_b32(__values[2])), + "r"(__as_b32(__values[3])), + "r"(__as_b32(__values[4])), + "r"(__as_b32(__values[5])), + "r"(__as_b32(__values[6])), + "r"(__as_b32(__values[7])), + "r"(__as_b32(__values[8])), + "r"(__as_b32(__values[9])), + "r"(__as_b32(__values[10])), + "r"(__as_b32(__values[11])), + "r"(__as_b32(__values[12])), + "r"(__as_b32(__values[13])), + "r"(__as_b32(__values[14])), + "r"(__as_b32(__values[15])), + "r"(__as_b32(__values[16])), + "r"(__as_b32(__values[17])), + "r"(__as_b32(__values[18])), + "r"(__as_b32(__values[19])), + "r"(__as_b32(__values[20])), + "r"(__as_b32(__values[21])), + "r"(__as_b32(__values[22])), + "r"(__as_b32(__values[23])), + "r"(__as_b32(__values[24])), + "r"(__as_b32(__values[25])), + "r"(__as_b32(__values[26])), + "r"(__as_b32(__values[27])), + "r"(__as_b32(__values[28])), + 
"r"(__as_b32(__values[29])), + "r"(__as_b32(__values[30])), + "r"(__as_b32(__values[31])), + "r"(__as_b32(__values[32])), + "r"(__as_b32(__values[33])), + "r"(__as_b32(__values[34])), + "r"(__as_b32(__values[35])), + "r"(__as_b32(__values[36])), + "r"(__as_b32(__values[37])), + "r"(__as_b32(__values[38])), + "r"(__as_b32(__values[39])), + "r"(__as_b32(__values[40])), + "r"(__as_b32(__values[41])), + "r"(__as_b32(__values[42])), + "r"(__as_b32(__values[43])), + "r"(__as_b32(__values[44])), + "r"(__as_b32(__values[45])), + "r"(__as_b32(__values[46])), + "r"(__as_b32(__values[47])), + "r"(__as_b32(__values[48])), + "r"(__as_b32(__values[49])), + "r"(__as_b32(__values[50])), + "r"(__as_b32(__values[51])), + "r"(__as_b32(__values[52])), + "r"(__as_b32(__values[53])), + "r"(__as_b32(__values[54])), + "r"(__as_b32(__values[55])), + "r"(__as_b32(__values[56])), + "r"(__as_b32(__values[57])), + "r"(__as_b32(__values[58])), + "r"(__as_b32(__values[59])), + "r"(__as_b32(__values[60])), + "r"(__as_b32(__values[61])), + "r"(__as_b32(__values[62])), + "r"(__as_b32(__values[63])) + : "memory"); +# else + // Unsupported architectures will have a linker error with a semi-decent error message + __cuda_ptx_tcgen05_st_16x128b_is_not_supported_before_SM_100a_SM_101a__(); +# endif +} +#endif // __cccl_ptx_isa >= 860 + +/* +// tcgen05.st.sync.aligned.16x128b.x32.unpack::16b.b32 [taddr], values; // PTX ISA 86, SM_100a, SM_101a +template = true> +__device__ static inline void tcgen05_st_16x128b_unpack_16b( + uint32_t taddr, + const B32 (&values)[64]); +*/ +#if __cccl_ptx_isa >= 860 +extern "C" _CCCL_DEVICE void __cuda_ptx_tcgen05_st_16x128b_unpack_16b_is_not_supported_before_SM_100a_SM_101a__(); +template = true> +_CCCL_DEVICE static inline void tcgen05_st_16x128b_unpack_16b(_CUDA_VSTD::uint32_t __taddr, const _B32 (&__values)[64]) +{ + static_assert(sizeof(_B32) == 4, ""); +# if _CCCL_CUDA_COMPILER(NVHPC) || __CUDA_ARCH_FEAT_SM100_ALL || __CUDA_ARCH_FEAT_SM101_ALL + asm( + "tcgen05.st.sync.aligned.16x128b.x32.unpack::16b.b32 [%0], {%1, %2, %3, %4, %5, %6, %7, %8, %9, %10, %11, %12, " + "%13, %14, %15, %16, %17, %18, %19, %20, %21, %22, %23, %24, %25, %26, %27, %28, %29, %30, %31, %32, %33, %34, " + "%35, %36, %37, %38, %39, %40, %41, %42, %43, %44, %45, %46, %47, %48, %49, %50, %51, %52, %53, %54, %55, %56, " + "%57, %58, %59, %60, %61, %62, %63, %64};" + : + : "r"(__taddr), + "r"(__as_b32(__values[0])), + "r"(__as_b32(__values[1])), + "r"(__as_b32(__values[2])), + "r"(__as_b32(__values[3])), + "r"(__as_b32(__values[4])), + "r"(__as_b32(__values[5])), + "r"(__as_b32(__values[6])), + "r"(__as_b32(__values[7])), + "r"(__as_b32(__values[8])), + "r"(__as_b32(__values[9])), + "r"(__as_b32(__values[10])), + "r"(__as_b32(__values[11])), + "r"(__as_b32(__values[12])), + "r"(__as_b32(__values[13])), + "r"(__as_b32(__values[14])), + "r"(__as_b32(__values[15])), + "r"(__as_b32(__values[16])), + "r"(__as_b32(__values[17])), + "r"(__as_b32(__values[18])), + "r"(__as_b32(__values[19])), + "r"(__as_b32(__values[20])), + "r"(__as_b32(__values[21])), + "r"(__as_b32(__values[22])), + "r"(__as_b32(__values[23])), + "r"(__as_b32(__values[24])), + "r"(__as_b32(__values[25])), + "r"(__as_b32(__values[26])), + "r"(__as_b32(__values[27])), + "r"(__as_b32(__values[28])), + "r"(__as_b32(__values[29])), + "r"(__as_b32(__values[30])), + "r"(__as_b32(__values[31])), + "r"(__as_b32(__values[32])), + "r"(__as_b32(__values[33])), + "r"(__as_b32(__values[34])), + "r"(__as_b32(__values[35])), + "r"(__as_b32(__values[36])), + 
"r"(__as_b32(__values[37])), + "r"(__as_b32(__values[38])), + "r"(__as_b32(__values[39])), + "r"(__as_b32(__values[40])), + "r"(__as_b32(__values[41])), + "r"(__as_b32(__values[42])), + "r"(__as_b32(__values[43])), + "r"(__as_b32(__values[44])), + "r"(__as_b32(__values[45])), + "r"(__as_b32(__values[46])), + "r"(__as_b32(__values[47])), + "r"(__as_b32(__values[48])), + "r"(__as_b32(__values[49])), + "r"(__as_b32(__values[50])), + "r"(__as_b32(__values[51])), + "r"(__as_b32(__values[52])), + "r"(__as_b32(__values[53])), + "r"(__as_b32(__values[54])), + "r"(__as_b32(__values[55])), + "r"(__as_b32(__values[56])), + "r"(__as_b32(__values[57])), + "r"(__as_b32(__values[58])), + "r"(__as_b32(__values[59])), + "r"(__as_b32(__values[60])), + "r"(__as_b32(__values[61])), + "r"(__as_b32(__values[62])), + "r"(__as_b32(__values[63])) + : "memory"); +# else + // Unsupported architectures will have a linker error with a semi-decent error message + __cuda_ptx_tcgen05_st_16x128b_unpack_16b_is_not_supported_before_SM_100a_SM_101a__(); +# endif +} +#endif // __cccl_ptx_isa >= 860 + +/* +// tcgen05.st.sync.aligned.16x128b.x64.b32 [taddr], values; // PTX ISA 86, SM_100a, SM_101a +template = true> +__device__ static inline void tcgen05_st_16x128b( + uint32_t taddr, + const B32 (&values)[128]); +*/ +#if __cccl_ptx_isa >= 860 +extern "C" _CCCL_DEVICE void __cuda_ptx_tcgen05_st_16x128b_is_not_supported_before_SM_100a_SM_101a__(); +template = true> +_CCCL_DEVICE static inline void tcgen05_st_16x128b(_CUDA_VSTD::uint32_t __taddr, const _B32 (&__values)[128]) +{ + static_assert(sizeof(_B32) == 4, ""); +# if _CCCL_CUDA_COMPILER(NVHPC) || __CUDA_ARCH_FEAT_SM100_ALL || __CUDA_ARCH_FEAT_SM101_ALL + asm( + "tcgen05.st.sync.aligned.16x128b.x64.b32 [%0], {%1, %2, %3, %4, %5, %6, %7, %8, %9, %10, %11, %12, %13, %14, %15, " + "%16, %17, %18, %19, %20, %21, %22, %23, %24, %25, %26, %27, %28, %29, %30, %31, %32, %33, %34, %35, %36, %37, " + "%38, %39, %40, %41, %42, %43, %44, %45, %46, %47, %48, %49, %50, %51, %52, %53, %54, %55, %56, %57, %58, %59, " + "%60, %61, %62, %63, %64, %65, %66, %67, %68, %69, %70, %71, %72, %73, %74, %75, %76, %77, %78, %79, %80, %81, " + "%82, %83, %84, %85, %86, %87, %88, %89, %90, %91, %92, %93, %94, %95, %96, %97, %98, %99, %100, %101, %102, %103, " + "%104, %105, %106, %107, %108, %109, %110, %111, %112, %113, %114, %115, %116, %117, %118, %119, %120, %121, %122, " + "%123, %124, %125, %126, %127, %128};" + : + : "r"(__taddr), + "r"(__as_b32(__values[0])), + "r"(__as_b32(__values[1])), + "r"(__as_b32(__values[2])), + "r"(__as_b32(__values[3])), + "r"(__as_b32(__values[4])), + "r"(__as_b32(__values[5])), + "r"(__as_b32(__values[6])), + "r"(__as_b32(__values[7])), + "r"(__as_b32(__values[8])), + "r"(__as_b32(__values[9])), + "r"(__as_b32(__values[10])), + "r"(__as_b32(__values[11])), + "r"(__as_b32(__values[12])), + "r"(__as_b32(__values[13])), + "r"(__as_b32(__values[14])), + "r"(__as_b32(__values[15])), + "r"(__as_b32(__values[16])), + "r"(__as_b32(__values[17])), + "r"(__as_b32(__values[18])), + "r"(__as_b32(__values[19])), + "r"(__as_b32(__values[20])), + "r"(__as_b32(__values[21])), + "r"(__as_b32(__values[22])), + "r"(__as_b32(__values[23])), + "r"(__as_b32(__values[24])), + "r"(__as_b32(__values[25])), + "r"(__as_b32(__values[26])), + "r"(__as_b32(__values[27])), + "r"(__as_b32(__values[28])), + "r"(__as_b32(__values[29])), + "r"(__as_b32(__values[30])), + "r"(__as_b32(__values[31])), + "r"(__as_b32(__values[32])), + "r"(__as_b32(__values[33])), + "r"(__as_b32(__values[34])), + 
"r"(__as_b32(__values[35])), + "r"(__as_b32(__values[36])), + "r"(__as_b32(__values[37])), + "r"(__as_b32(__values[38])), + "r"(__as_b32(__values[39])), + "r"(__as_b32(__values[40])), + "r"(__as_b32(__values[41])), + "r"(__as_b32(__values[42])), + "r"(__as_b32(__values[43])), + "r"(__as_b32(__values[44])), + "r"(__as_b32(__values[45])), + "r"(__as_b32(__values[46])), + "r"(__as_b32(__values[47])), + "r"(__as_b32(__values[48])), + "r"(__as_b32(__values[49])), + "r"(__as_b32(__values[50])), + "r"(__as_b32(__values[51])), + "r"(__as_b32(__values[52])), + "r"(__as_b32(__values[53])), + "r"(__as_b32(__values[54])), + "r"(__as_b32(__values[55])), + "r"(__as_b32(__values[56])), + "r"(__as_b32(__values[57])), + "r"(__as_b32(__values[58])), + "r"(__as_b32(__values[59])), + "r"(__as_b32(__values[60])), + "r"(__as_b32(__values[61])), + "r"(__as_b32(__values[62])), + "r"(__as_b32(__values[63])), + "r"(__as_b32(__values[64])), + "r"(__as_b32(__values[65])), + "r"(__as_b32(__values[66])), + "r"(__as_b32(__values[67])), + "r"(__as_b32(__values[68])), + "r"(__as_b32(__values[69])), + "r"(__as_b32(__values[70])), + "r"(__as_b32(__values[71])), + "r"(__as_b32(__values[72])), + "r"(__as_b32(__values[73])), + "r"(__as_b32(__values[74])), + "r"(__as_b32(__values[75])), + "r"(__as_b32(__values[76])), + "r"(__as_b32(__values[77])), + "r"(__as_b32(__values[78])), + "r"(__as_b32(__values[79])), + "r"(__as_b32(__values[80])), + "r"(__as_b32(__values[81])), + "r"(__as_b32(__values[82])), + "r"(__as_b32(__values[83])), + "r"(__as_b32(__values[84])), + "r"(__as_b32(__values[85])), + "r"(__as_b32(__values[86])), + "r"(__as_b32(__values[87])), + "r"(__as_b32(__values[88])), + "r"(__as_b32(__values[89])), + "r"(__as_b32(__values[90])), + "r"(__as_b32(__values[91])), + "r"(__as_b32(__values[92])), + "r"(__as_b32(__values[93])), + "r"(__as_b32(__values[94])), + "r"(__as_b32(__values[95])), + "r"(__as_b32(__values[96])), + "r"(__as_b32(__values[97])), + "r"(__as_b32(__values[98])), + "r"(__as_b32(__values[99])), + "r"(__as_b32(__values[100])), + "r"(__as_b32(__values[101])), + "r"(__as_b32(__values[102])), + "r"(__as_b32(__values[103])), + "r"(__as_b32(__values[104])), + "r"(__as_b32(__values[105])), + "r"(__as_b32(__values[106])), + "r"(__as_b32(__values[107])), + "r"(__as_b32(__values[108])), + "r"(__as_b32(__values[109])), + "r"(__as_b32(__values[110])), + "r"(__as_b32(__values[111])), + "r"(__as_b32(__values[112])), + "r"(__as_b32(__values[113])), + "r"(__as_b32(__values[114])), + "r"(__as_b32(__values[115])), + "r"(__as_b32(__values[116])), + "r"(__as_b32(__values[117])), + "r"(__as_b32(__values[118])), + "r"(__as_b32(__values[119])), + "r"(__as_b32(__values[120])), + "r"(__as_b32(__values[121])), + "r"(__as_b32(__values[122])), + "r"(__as_b32(__values[123])), + "r"(__as_b32(__values[124])), + "r"(__as_b32(__values[125])), + "r"(__as_b32(__values[126])), + "r"(__as_b32(__values[127])) + : "memory"); +# else + // Unsupported architectures will have a linker error with a semi-decent error message + __cuda_ptx_tcgen05_st_16x128b_is_not_supported_before_SM_100a_SM_101a__(); +# endif +} +#endif // __cccl_ptx_isa >= 860 + +/* +// tcgen05.st.sync.aligned.16x128b.x64.unpack::16b.b32 [taddr], values; // PTX ISA 86, SM_100a, SM_101a +template = true> +__device__ static inline void tcgen05_st_16x128b_unpack_16b( + uint32_t taddr, + const B32 (&values)[128]); +*/ +#if __cccl_ptx_isa >= 860 +extern "C" _CCCL_DEVICE void __cuda_ptx_tcgen05_st_16x128b_unpack_16b_is_not_supported_before_SM_100a_SM_101a__(); +template = true> 
+_CCCL_DEVICE static inline void tcgen05_st_16x128b_unpack_16b(_CUDA_VSTD::uint32_t __taddr, const _B32 (&__values)[128]) +{ + static_assert(sizeof(_B32) == 4, ""); +# if _CCCL_CUDA_COMPILER(NVHPC) || __CUDA_ARCH_FEAT_SM100_ALL || __CUDA_ARCH_FEAT_SM101_ALL + asm( + "tcgen05.st.sync.aligned.16x128b.x64.unpack::16b.b32 [%0], {%1, %2, %3, %4, %5, %6, %7, %8, %9, %10, %11, %12, " + "%13, %14, %15, %16, %17, %18, %19, %20, %21, %22, %23, %24, %25, %26, %27, %28, %29, %30, %31, %32, %33, %34, " + "%35, %36, %37, %38, %39, %40, %41, %42, %43, %44, %45, %46, %47, %48, %49, %50, %51, %52, %53, %54, %55, %56, " + "%57, %58, %59, %60, %61, %62, %63, %64, %65, %66, %67, %68, %69, %70, %71, %72, %73, %74, %75, %76, %77, %78, " + "%79, %80, %81, %82, %83, %84, %85, %86, %87, %88, %89, %90, %91, %92, %93, %94, %95, %96, %97, %98, %99, %100, " + "%101, %102, %103, %104, %105, %106, %107, %108, %109, %110, %111, %112, %113, %114, %115, %116, %117, %118, %119, " + "%120, %121, %122, %123, %124, %125, %126, %127, %128};" + : + : "r"(__taddr), + "r"(__as_b32(__values[0])), + "r"(__as_b32(__values[1])), + "r"(__as_b32(__values[2])), + "r"(__as_b32(__values[3])), + "r"(__as_b32(__values[4])), + "r"(__as_b32(__values[5])), + "r"(__as_b32(__values[6])), + "r"(__as_b32(__values[7])), + "r"(__as_b32(__values[8])), + "r"(__as_b32(__values[9])), + "r"(__as_b32(__values[10])), + "r"(__as_b32(__values[11])), + "r"(__as_b32(__values[12])), + "r"(__as_b32(__values[13])), + "r"(__as_b32(__values[14])), + "r"(__as_b32(__values[15])), + "r"(__as_b32(__values[16])), + "r"(__as_b32(__values[17])), + "r"(__as_b32(__values[18])), + "r"(__as_b32(__values[19])), + "r"(__as_b32(__values[20])), + "r"(__as_b32(__values[21])), + "r"(__as_b32(__values[22])), + "r"(__as_b32(__values[23])), + "r"(__as_b32(__values[24])), + "r"(__as_b32(__values[25])), + "r"(__as_b32(__values[26])), + "r"(__as_b32(__values[27])), + "r"(__as_b32(__values[28])), + "r"(__as_b32(__values[29])), + "r"(__as_b32(__values[30])), + "r"(__as_b32(__values[31])), + "r"(__as_b32(__values[32])), + "r"(__as_b32(__values[33])), + "r"(__as_b32(__values[34])), + "r"(__as_b32(__values[35])), + "r"(__as_b32(__values[36])), + "r"(__as_b32(__values[37])), + "r"(__as_b32(__values[38])), + "r"(__as_b32(__values[39])), + "r"(__as_b32(__values[40])), + "r"(__as_b32(__values[41])), + "r"(__as_b32(__values[42])), + "r"(__as_b32(__values[43])), + "r"(__as_b32(__values[44])), + "r"(__as_b32(__values[45])), + "r"(__as_b32(__values[46])), + "r"(__as_b32(__values[47])), + "r"(__as_b32(__values[48])), + "r"(__as_b32(__values[49])), + "r"(__as_b32(__values[50])), + "r"(__as_b32(__values[51])), + "r"(__as_b32(__values[52])), + "r"(__as_b32(__values[53])), + "r"(__as_b32(__values[54])), + "r"(__as_b32(__values[55])), + "r"(__as_b32(__values[56])), + "r"(__as_b32(__values[57])), + "r"(__as_b32(__values[58])), + "r"(__as_b32(__values[59])), + "r"(__as_b32(__values[60])), + "r"(__as_b32(__values[61])), + "r"(__as_b32(__values[62])), + "r"(__as_b32(__values[63])), + "r"(__as_b32(__values[64])), + "r"(__as_b32(__values[65])), + "r"(__as_b32(__values[66])), + "r"(__as_b32(__values[67])), + "r"(__as_b32(__values[68])), + "r"(__as_b32(__values[69])), + "r"(__as_b32(__values[70])), + "r"(__as_b32(__values[71])), + "r"(__as_b32(__values[72])), + "r"(__as_b32(__values[73])), + "r"(__as_b32(__values[74])), + "r"(__as_b32(__values[75])), + "r"(__as_b32(__values[76])), + "r"(__as_b32(__values[77])), + "r"(__as_b32(__values[78])), + "r"(__as_b32(__values[79])), + "r"(__as_b32(__values[80])), + 
"r"(__as_b32(__values[81])), + "r"(__as_b32(__values[82])), + "r"(__as_b32(__values[83])), + "r"(__as_b32(__values[84])), + "r"(__as_b32(__values[85])), + "r"(__as_b32(__values[86])), + "r"(__as_b32(__values[87])), + "r"(__as_b32(__values[88])), + "r"(__as_b32(__values[89])), + "r"(__as_b32(__values[90])), + "r"(__as_b32(__values[91])), + "r"(__as_b32(__values[92])), + "r"(__as_b32(__values[93])), + "r"(__as_b32(__values[94])), + "r"(__as_b32(__values[95])), + "r"(__as_b32(__values[96])), + "r"(__as_b32(__values[97])), + "r"(__as_b32(__values[98])), + "r"(__as_b32(__values[99])), + "r"(__as_b32(__values[100])), + "r"(__as_b32(__values[101])), + "r"(__as_b32(__values[102])), + "r"(__as_b32(__values[103])), + "r"(__as_b32(__values[104])), + "r"(__as_b32(__values[105])), + "r"(__as_b32(__values[106])), + "r"(__as_b32(__values[107])), + "r"(__as_b32(__values[108])), + "r"(__as_b32(__values[109])), + "r"(__as_b32(__values[110])), + "r"(__as_b32(__values[111])), + "r"(__as_b32(__values[112])), + "r"(__as_b32(__values[113])), + "r"(__as_b32(__values[114])), + "r"(__as_b32(__values[115])), + "r"(__as_b32(__values[116])), + "r"(__as_b32(__values[117])), + "r"(__as_b32(__values[118])), + "r"(__as_b32(__values[119])), + "r"(__as_b32(__values[120])), + "r"(__as_b32(__values[121])), + "r"(__as_b32(__values[122])), + "r"(__as_b32(__values[123])), + "r"(__as_b32(__values[124])), + "r"(__as_b32(__values[125])), + "r"(__as_b32(__values[126])), + "r"(__as_b32(__values[127])) + : "memory"); +# else + // Unsupported architectures will have a linker error with a semi-decent error message + __cuda_ptx_tcgen05_st_16x128b_unpack_16b_is_not_supported_before_SM_100a_SM_101a__(); +# endif +} +#endif // __cccl_ptx_isa >= 860 + +/* +// tcgen05.st.sync.aligned.16x256b.x1.b32 [taddr], values; // PTX ISA 86, SM_100a, SM_101a +template = true> +__device__ static inline void tcgen05_st_16x256b( + uint32_t taddr, + const B32 (&values)[4]); +*/ +#if __cccl_ptx_isa >= 860 +extern "C" _CCCL_DEVICE void __cuda_ptx_tcgen05_st_16x256b_is_not_supported_before_SM_100a_SM_101a__(); +template = true> +_CCCL_DEVICE static inline void tcgen05_st_16x256b(_CUDA_VSTD::uint32_t __taddr, const _B32 (&__values)[4]) +{ + static_assert(sizeof(_B32) == 4, ""); +# if _CCCL_CUDA_COMPILER(NVHPC) || __CUDA_ARCH_FEAT_SM100_ALL || __CUDA_ARCH_FEAT_SM101_ALL + asm("tcgen05.st.sync.aligned.16x256b.x1.b32 [%0], {%1, %2, %3, %4};" + : + : "r"(__taddr), + "r"(__as_b32(__values[0])), + "r"(__as_b32(__values[1])), + "r"(__as_b32(__values[2])), + "r"(__as_b32(__values[3])) + : "memory"); +# else + // Unsupported architectures will have a linker error with a semi-decent error message + __cuda_ptx_tcgen05_st_16x256b_is_not_supported_before_SM_100a_SM_101a__(); +# endif +} +#endif // __cccl_ptx_isa >= 860 + +/* +// tcgen05.st.sync.aligned.16x256b.x1.unpack::16b.b32 [taddr], values; // PTX ISA 86, SM_100a, SM_101a +template = true> +__device__ static inline void tcgen05_st_16x256b_unpack_16b( + uint32_t taddr, + const B32 (&values)[4]); +*/ +#if __cccl_ptx_isa >= 860 +extern "C" _CCCL_DEVICE void __cuda_ptx_tcgen05_st_16x256b_unpack_16b_is_not_supported_before_SM_100a_SM_101a__(); +template = true> +_CCCL_DEVICE static inline void tcgen05_st_16x256b_unpack_16b(_CUDA_VSTD::uint32_t __taddr, const _B32 (&__values)[4]) +{ + static_assert(sizeof(_B32) == 4, ""); +# if _CCCL_CUDA_COMPILER(NVHPC) || __CUDA_ARCH_FEAT_SM100_ALL || __CUDA_ARCH_FEAT_SM101_ALL + asm("tcgen05.st.sync.aligned.16x256b.x1.unpack::16b.b32 [%0], {%1, %2, %3, %4};" + : + : "r"(__taddr), + 
"r"(__as_b32(__values[0])), + "r"(__as_b32(__values[1])), + "r"(__as_b32(__values[2])), + "r"(__as_b32(__values[3])) + : "memory"); +# else + // Unsupported architectures will have a linker error with a semi-decent error message + __cuda_ptx_tcgen05_st_16x256b_unpack_16b_is_not_supported_before_SM_100a_SM_101a__(); +# endif +} +#endif // __cccl_ptx_isa >= 860 + +/* +// tcgen05.st.sync.aligned.16x256b.x2.b32 [taddr], values; // PTX ISA 86, SM_100a, SM_101a +template = true> +__device__ static inline void tcgen05_st_16x256b( + uint32_t taddr, + const B32 (&values)[8]); +*/ +#if __cccl_ptx_isa >= 860 +extern "C" _CCCL_DEVICE void __cuda_ptx_tcgen05_st_16x256b_is_not_supported_before_SM_100a_SM_101a__(); +template = true> +_CCCL_DEVICE static inline void tcgen05_st_16x256b(_CUDA_VSTD::uint32_t __taddr, const _B32 (&__values)[8]) +{ + static_assert(sizeof(_B32) == 4, ""); +# if _CCCL_CUDA_COMPILER(NVHPC) || __CUDA_ARCH_FEAT_SM100_ALL || __CUDA_ARCH_FEAT_SM101_ALL + asm("tcgen05.st.sync.aligned.16x256b.x2.b32 [%0], {%1, %2, %3, %4, %5, %6, %7, %8};" + : + : "r"(__taddr), + "r"(__as_b32(__values[0])), + "r"(__as_b32(__values[1])), + "r"(__as_b32(__values[2])), + "r"(__as_b32(__values[3])), + "r"(__as_b32(__values[4])), + "r"(__as_b32(__values[5])), + "r"(__as_b32(__values[6])), + "r"(__as_b32(__values[7])) + : "memory"); +# else + // Unsupported architectures will have a linker error with a semi-decent error message + __cuda_ptx_tcgen05_st_16x256b_is_not_supported_before_SM_100a_SM_101a__(); +# endif +} +#endif // __cccl_ptx_isa >= 860 + +/* +// tcgen05.st.sync.aligned.16x256b.x2.unpack::16b.b32 [taddr], values; // PTX ISA 86, SM_100a, SM_101a +template = true> +__device__ static inline void tcgen05_st_16x256b_unpack_16b( + uint32_t taddr, + const B32 (&values)[8]); +*/ +#if __cccl_ptx_isa >= 860 +extern "C" _CCCL_DEVICE void __cuda_ptx_tcgen05_st_16x256b_unpack_16b_is_not_supported_before_SM_100a_SM_101a__(); +template = true> +_CCCL_DEVICE static inline void tcgen05_st_16x256b_unpack_16b(_CUDA_VSTD::uint32_t __taddr, const _B32 (&__values)[8]) +{ + static_assert(sizeof(_B32) == 4, ""); +# if _CCCL_CUDA_COMPILER(NVHPC) || __CUDA_ARCH_FEAT_SM100_ALL || __CUDA_ARCH_FEAT_SM101_ALL + asm("tcgen05.st.sync.aligned.16x256b.x2.unpack::16b.b32 [%0], {%1, %2, %3, %4, %5, %6, %7, %8};" + : + : "r"(__taddr), + "r"(__as_b32(__values[0])), + "r"(__as_b32(__values[1])), + "r"(__as_b32(__values[2])), + "r"(__as_b32(__values[3])), + "r"(__as_b32(__values[4])), + "r"(__as_b32(__values[5])), + "r"(__as_b32(__values[6])), + "r"(__as_b32(__values[7])) + : "memory"); +# else + // Unsupported architectures will have a linker error with a semi-decent error message + __cuda_ptx_tcgen05_st_16x256b_unpack_16b_is_not_supported_before_SM_100a_SM_101a__(); +# endif +} +#endif // __cccl_ptx_isa >= 860 + +/* +// tcgen05.st.sync.aligned.16x256b.x4.b32 [taddr], values; // PTX ISA 86, SM_100a, SM_101a +template = true> +__device__ static inline void tcgen05_st_16x256b( + uint32_t taddr, + const B32 (&values)[16]); +*/ +#if __cccl_ptx_isa >= 860 +extern "C" _CCCL_DEVICE void __cuda_ptx_tcgen05_st_16x256b_is_not_supported_before_SM_100a_SM_101a__(); +template = true> +_CCCL_DEVICE static inline void tcgen05_st_16x256b(_CUDA_VSTD::uint32_t __taddr, const _B32 (&__values)[16]) +{ + static_assert(sizeof(_B32) == 4, ""); +# if _CCCL_CUDA_COMPILER(NVHPC) || __CUDA_ARCH_FEAT_SM100_ALL || __CUDA_ARCH_FEAT_SM101_ALL + asm("tcgen05.st.sync.aligned.16x256b.x4.b32 [%0], {%1, %2, %3, %4, %5, %6, %7, %8, %9, %10, %11, %12, %13, %14, %15, " + 
"%16};" + : + : "r"(__taddr), + "r"(__as_b32(__values[0])), + "r"(__as_b32(__values[1])), + "r"(__as_b32(__values[2])), + "r"(__as_b32(__values[3])), + "r"(__as_b32(__values[4])), + "r"(__as_b32(__values[5])), + "r"(__as_b32(__values[6])), + "r"(__as_b32(__values[7])), + "r"(__as_b32(__values[8])), + "r"(__as_b32(__values[9])), + "r"(__as_b32(__values[10])), + "r"(__as_b32(__values[11])), + "r"(__as_b32(__values[12])), + "r"(__as_b32(__values[13])), + "r"(__as_b32(__values[14])), + "r"(__as_b32(__values[15])) + : "memory"); +# else + // Unsupported architectures will have a linker error with a semi-decent error message + __cuda_ptx_tcgen05_st_16x256b_is_not_supported_before_SM_100a_SM_101a__(); +# endif +} +#endif // __cccl_ptx_isa >= 860 + +/* +// tcgen05.st.sync.aligned.16x256b.x4.unpack::16b.b32 [taddr], values; // PTX ISA 86, SM_100a, SM_101a +template = true> +__device__ static inline void tcgen05_st_16x256b_unpack_16b( + uint32_t taddr, + const B32 (&values)[16]); +*/ +#if __cccl_ptx_isa >= 860 +extern "C" _CCCL_DEVICE void __cuda_ptx_tcgen05_st_16x256b_unpack_16b_is_not_supported_before_SM_100a_SM_101a__(); +template = true> +_CCCL_DEVICE static inline void tcgen05_st_16x256b_unpack_16b(_CUDA_VSTD::uint32_t __taddr, const _B32 (&__values)[16]) +{ + static_assert(sizeof(_B32) == 4, ""); +# if _CCCL_CUDA_COMPILER(NVHPC) || __CUDA_ARCH_FEAT_SM100_ALL || __CUDA_ARCH_FEAT_SM101_ALL + asm("tcgen05.st.sync.aligned.16x256b.x4.unpack::16b.b32 [%0], {%1, %2, %3, %4, %5, %6, %7, %8, %9, %10, %11, %12, " + "%13, %14, %15, %16};" + : + : "r"(__taddr), + "r"(__as_b32(__values[0])), + "r"(__as_b32(__values[1])), + "r"(__as_b32(__values[2])), + "r"(__as_b32(__values[3])), + "r"(__as_b32(__values[4])), + "r"(__as_b32(__values[5])), + "r"(__as_b32(__values[6])), + "r"(__as_b32(__values[7])), + "r"(__as_b32(__values[8])), + "r"(__as_b32(__values[9])), + "r"(__as_b32(__values[10])), + "r"(__as_b32(__values[11])), + "r"(__as_b32(__values[12])), + "r"(__as_b32(__values[13])), + "r"(__as_b32(__values[14])), + "r"(__as_b32(__values[15])) + : "memory"); +# else + // Unsupported architectures will have a linker error with a semi-decent error message + __cuda_ptx_tcgen05_st_16x256b_unpack_16b_is_not_supported_before_SM_100a_SM_101a__(); +# endif +} +#endif // __cccl_ptx_isa >= 860 + +/* +// tcgen05.st.sync.aligned.16x256b.x8.b32 [taddr], values; // PTX ISA 86, SM_100a, SM_101a +template = true> +__device__ static inline void tcgen05_st_16x256b( + uint32_t taddr, + const B32 (&values)[32]); +*/ +#if __cccl_ptx_isa >= 860 +extern "C" _CCCL_DEVICE void __cuda_ptx_tcgen05_st_16x256b_is_not_supported_before_SM_100a_SM_101a__(); +template = true> +_CCCL_DEVICE static inline void tcgen05_st_16x256b(_CUDA_VSTD::uint32_t __taddr, const _B32 (&__values)[32]) +{ + static_assert(sizeof(_B32) == 4, ""); +# if _CCCL_CUDA_COMPILER(NVHPC) || __CUDA_ARCH_FEAT_SM100_ALL || __CUDA_ARCH_FEAT_SM101_ALL + asm( + "tcgen05.st.sync.aligned.16x256b.x8.b32 [%0], {%1, %2, %3, %4, %5, %6, %7, %8, %9, %10, %11, %12, %13, %14, %15, " + "%16, %17, %18, %19, %20, %21, %22, %23, %24, %25, %26, %27, %28, %29, %30, %31, %32};" + : + : "r"(__taddr), + "r"(__as_b32(__values[0])), + "r"(__as_b32(__values[1])), + "r"(__as_b32(__values[2])), + "r"(__as_b32(__values[3])), + "r"(__as_b32(__values[4])), + "r"(__as_b32(__values[5])), + "r"(__as_b32(__values[6])), + "r"(__as_b32(__values[7])), + "r"(__as_b32(__values[8])), + "r"(__as_b32(__values[9])), + "r"(__as_b32(__values[10])), + "r"(__as_b32(__values[11])), + "r"(__as_b32(__values[12])), + 
"r"(__as_b32(__values[13])), + "r"(__as_b32(__values[14])), + "r"(__as_b32(__values[15])), + "r"(__as_b32(__values[16])), + "r"(__as_b32(__values[17])), + "r"(__as_b32(__values[18])), + "r"(__as_b32(__values[19])), + "r"(__as_b32(__values[20])), + "r"(__as_b32(__values[21])), + "r"(__as_b32(__values[22])), + "r"(__as_b32(__values[23])), + "r"(__as_b32(__values[24])), + "r"(__as_b32(__values[25])), + "r"(__as_b32(__values[26])), + "r"(__as_b32(__values[27])), + "r"(__as_b32(__values[28])), + "r"(__as_b32(__values[29])), + "r"(__as_b32(__values[30])), + "r"(__as_b32(__values[31])) + : "memory"); +# else + // Unsupported architectures will have a linker error with a semi-decent error message + __cuda_ptx_tcgen05_st_16x256b_is_not_supported_before_SM_100a_SM_101a__(); +# endif +} +#endif // __cccl_ptx_isa >= 860 + +/* +// tcgen05.st.sync.aligned.16x256b.x8.unpack::16b.b32 [taddr], values; // PTX ISA 86, SM_100a, SM_101a +template = true> +__device__ static inline void tcgen05_st_16x256b_unpack_16b( + uint32_t taddr, + const B32 (&values)[32]); +*/ +#if __cccl_ptx_isa >= 860 +extern "C" _CCCL_DEVICE void __cuda_ptx_tcgen05_st_16x256b_unpack_16b_is_not_supported_before_SM_100a_SM_101a__(); +template = true> +_CCCL_DEVICE static inline void tcgen05_st_16x256b_unpack_16b(_CUDA_VSTD::uint32_t __taddr, const _B32 (&__values)[32]) +{ + static_assert(sizeof(_B32) == 4, ""); +# if _CCCL_CUDA_COMPILER(NVHPC) || __CUDA_ARCH_FEAT_SM100_ALL || __CUDA_ARCH_FEAT_SM101_ALL + asm( + "tcgen05.st.sync.aligned.16x256b.x8.unpack::16b.b32 [%0], {%1, %2, %3, %4, %5, %6, %7, %8, %9, %10, %11, %12, %13, " + "%14, %15, %16, %17, %18, %19, %20, %21, %22, %23, %24, %25, %26, %27, %28, %29, %30, %31, %32};" + : + : "r"(__taddr), + "r"(__as_b32(__values[0])), + "r"(__as_b32(__values[1])), + "r"(__as_b32(__values[2])), + "r"(__as_b32(__values[3])), + "r"(__as_b32(__values[4])), + "r"(__as_b32(__values[5])), + "r"(__as_b32(__values[6])), + "r"(__as_b32(__values[7])), + "r"(__as_b32(__values[8])), + "r"(__as_b32(__values[9])), + "r"(__as_b32(__values[10])), + "r"(__as_b32(__values[11])), + "r"(__as_b32(__values[12])), + "r"(__as_b32(__values[13])), + "r"(__as_b32(__values[14])), + "r"(__as_b32(__values[15])), + "r"(__as_b32(__values[16])), + "r"(__as_b32(__values[17])), + "r"(__as_b32(__values[18])), + "r"(__as_b32(__values[19])), + "r"(__as_b32(__values[20])), + "r"(__as_b32(__values[21])), + "r"(__as_b32(__values[22])), + "r"(__as_b32(__values[23])), + "r"(__as_b32(__values[24])), + "r"(__as_b32(__values[25])), + "r"(__as_b32(__values[26])), + "r"(__as_b32(__values[27])), + "r"(__as_b32(__values[28])), + "r"(__as_b32(__values[29])), + "r"(__as_b32(__values[30])), + "r"(__as_b32(__values[31])) + : "memory"); +# else + // Unsupported architectures will have a linker error with a semi-decent error message + __cuda_ptx_tcgen05_st_16x256b_unpack_16b_is_not_supported_before_SM_100a_SM_101a__(); +# endif +} +#endif // __cccl_ptx_isa >= 860 + +/* +// tcgen05.st.sync.aligned.16x256b.x16.b32 [taddr], values; // PTX ISA 86, SM_100a, SM_101a +template = true> +__device__ static inline void tcgen05_st_16x256b( + uint32_t taddr, + const B32 (&values)[64]); +*/ +#if __cccl_ptx_isa >= 860 +extern "C" _CCCL_DEVICE void __cuda_ptx_tcgen05_st_16x256b_is_not_supported_before_SM_100a_SM_101a__(); +template = true> +_CCCL_DEVICE static inline void tcgen05_st_16x256b(_CUDA_VSTD::uint32_t __taddr, const _B32 (&__values)[64]) +{ + static_assert(sizeof(_B32) == 4, ""); +# if _CCCL_CUDA_COMPILER(NVHPC) || __CUDA_ARCH_FEAT_SM100_ALL || 
__CUDA_ARCH_FEAT_SM101_ALL + asm( + "tcgen05.st.sync.aligned.16x256b.x16.b32 [%0], {%1, %2, %3, %4, %5, %6, %7, %8, %9, %10, %11, %12, %13, %14, %15, " + "%16, %17, %18, %19, %20, %21, %22, %23, %24, %25, %26, %27, %28, %29, %30, %31, %32, %33, %34, %35, %36, %37, " + "%38, %39, %40, %41, %42, %43, %44, %45, %46, %47, %48, %49, %50, %51, %52, %53, %54, %55, %56, %57, %58, %59, " + "%60, %61, %62, %63, %64};" + : + : "r"(__taddr), + "r"(__as_b32(__values[0])), + "r"(__as_b32(__values[1])), + "r"(__as_b32(__values[2])), + "r"(__as_b32(__values[3])), + "r"(__as_b32(__values[4])), + "r"(__as_b32(__values[5])), + "r"(__as_b32(__values[6])), + "r"(__as_b32(__values[7])), + "r"(__as_b32(__values[8])), + "r"(__as_b32(__values[9])), + "r"(__as_b32(__values[10])), + "r"(__as_b32(__values[11])), + "r"(__as_b32(__values[12])), + "r"(__as_b32(__values[13])), + "r"(__as_b32(__values[14])), + "r"(__as_b32(__values[15])), + "r"(__as_b32(__values[16])), + "r"(__as_b32(__values[17])), + "r"(__as_b32(__values[18])), + "r"(__as_b32(__values[19])), + "r"(__as_b32(__values[20])), + "r"(__as_b32(__values[21])), + "r"(__as_b32(__values[22])), + "r"(__as_b32(__values[23])), + "r"(__as_b32(__values[24])), + "r"(__as_b32(__values[25])), + "r"(__as_b32(__values[26])), + "r"(__as_b32(__values[27])), + "r"(__as_b32(__values[28])), + "r"(__as_b32(__values[29])), + "r"(__as_b32(__values[30])), + "r"(__as_b32(__values[31])), + "r"(__as_b32(__values[32])), + "r"(__as_b32(__values[33])), + "r"(__as_b32(__values[34])), + "r"(__as_b32(__values[35])), + "r"(__as_b32(__values[36])), + "r"(__as_b32(__values[37])), + "r"(__as_b32(__values[38])), + "r"(__as_b32(__values[39])), + "r"(__as_b32(__values[40])), + "r"(__as_b32(__values[41])), + "r"(__as_b32(__values[42])), + "r"(__as_b32(__values[43])), + "r"(__as_b32(__values[44])), + "r"(__as_b32(__values[45])), + "r"(__as_b32(__values[46])), + "r"(__as_b32(__values[47])), + "r"(__as_b32(__values[48])), + "r"(__as_b32(__values[49])), + "r"(__as_b32(__values[50])), + "r"(__as_b32(__values[51])), + "r"(__as_b32(__values[52])), + "r"(__as_b32(__values[53])), + "r"(__as_b32(__values[54])), + "r"(__as_b32(__values[55])), + "r"(__as_b32(__values[56])), + "r"(__as_b32(__values[57])), + "r"(__as_b32(__values[58])), + "r"(__as_b32(__values[59])), + "r"(__as_b32(__values[60])), + "r"(__as_b32(__values[61])), + "r"(__as_b32(__values[62])), + "r"(__as_b32(__values[63])) + : "memory"); +# else + // Unsupported architectures will have a linker error with a semi-decent error message + __cuda_ptx_tcgen05_st_16x256b_is_not_supported_before_SM_100a_SM_101a__(); +# endif +} +#endif // __cccl_ptx_isa >= 860 + +/* +// tcgen05.st.sync.aligned.16x256b.x16.unpack::16b.b32 [taddr], values; // PTX ISA 86, SM_100a, SM_101a +template = true> +__device__ static inline void tcgen05_st_16x256b_unpack_16b( + uint32_t taddr, + const B32 (&values)[64]); +*/ +#if __cccl_ptx_isa >= 860 +extern "C" _CCCL_DEVICE void __cuda_ptx_tcgen05_st_16x256b_unpack_16b_is_not_supported_before_SM_100a_SM_101a__(); +template = true> +_CCCL_DEVICE static inline void tcgen05_st_16x256b_unpack_16b(_CUDA_VSTD::uint32_t __taddr, const _B32 (&__values)[64]) +{ + static_assert(sizeof(_B32) == 4, ""); +# if _CCCL_CUDA_COMPILER(NVHPC) || __CUDA_ARCH_FEAT_SM100_ALL || __CUDA_ARCH_FEAT_SM101_ALL + asm( + "tcgen05.st.sync.aligned.16x256b.x16.unpack::16b.b32 [%0], {%1, %2, %3, %4, %5, %6, %7, %8, %9, %10, %11, %12, " + "%13, %14, %15, %16, %17, %18, %19, %20, %21, %22, %23, %24, %25, %26, %27, %28, %29, %30, %31, %32, %33, %34, " + "%35, %36, %37, 
%38, %39, %40, %41, %42, %43, %44, %45, %46, %47, %48, %49, %50, %51, %52, %53, %54, %55, %56, " + "%57, %58, %59, %60, %61, %62, %63, %64};" + : + : "r"(__taddr), + "r"(__as_b32(__values[0])), + "r"(__as_b32(__values[1])), + "r"(__as_b32(__values[2])), + "r"(__as_b32(__values[3])), + "r"(__as_b32(__values[4])), + "r"(__as_b32(__values[5])), + "r"(__as_b32(__values[6])), + "r"(__as_b32(__values[7])), + "r"(__as_b32(__values[8])), + "r"(__as_b32(__values[9])), + "r"(__as_b32(__values[10])), + "r"(__as_b32(__values[11])), + "r"(__as_b32(__values[12])), + "r"(__as_b32(__values[13])), + "r"(__as_b32(__values[14])), + "r"(__as_b32(__values[15])), + "r"(__as_b32(__values[16])), + "r"(__as_b32(__values[17])), + "r"(__as_b32(__values[18])), + "r"(__as_b32(__values[19])), + "r"(__as_b32(__values[20])), + "r"(__as_b32(__values[21])), + "r"(__as_b32(__values[22])), + "r"(__as_b32(__values[23])), + "r"(__as_b32(__values[24])), + "r"(__as_b32(__values[25])), + "r"(__as_b32(__values[26])), + "r"(__as_b32(__values[27])), + "r"(__as_b32(__values[28])), + "r"(__as_b32(__values[29])), + "r"(__as_b32(__values[30])), + "r"(__as_b32(__values[31])), + "r"(__as_b32(__values[32])), + "r"(__as_b32(__values[33])), + "r"(__as_b32(__values[34])), + "r"(__as_b32(__values[35])), + "r"(__as_b32(__values[36])), + "r"(__as_b32(__values[37])), + "r"(__as_b32(__values[38])), + "r"(__as_b32(__values[39])), + "r"(__as_b32(__values[40])), + "r"(__as_b32(__values[41])), + "r"(__as_b32(__values[42])), + "r"(__as_b32(__values[43])), + "r"(__as_b32(__values[44])), + "r"(__as_b32(__values[45])), + "r"(__as_b32(__values[46])), + "r"(__as_b32(__values[47])), + "r"(__as_b32(__values[48])), + "r"(__as_b32(__values[49])), + "r"(__as_b32(__values[50])), + "r"(__as_b32(__values[51])), + "r"(__as_b32(__values[52])), + "r"(__as_b32(__values[53])), + "r"(__as_b32(__values[54])), + "r"(__as_b32(__values[55])), + "r"(__as_b32(__values[56])), + "r"(__as_b32(__values[57])), + "r"(__as_b32(__values[58])), + "r"(__as_b32(__values[59])), + "r"(__as_b32(__values[60])), + "r"(__as_b32(__values[61])), + "r"(__as_b32(__values[62])), + "r"(__as_b32(__values[63])) + : "memory"); +# else + // Unsupported architectures will have a linker error with a semi-decent error message + __cuda_ptx_tcgen05_st_16x256b_unpack_16b_is_not_supported_before_SM_100a_SM_101a__(); +# endif +} +#endif // __cccl_ptx_isa >= 860 + +/* +// tcgen05.st.sync.aligned.16x256b.x32.b32 [taddr], values; // PTX ISA 86, SM_100a, SM_101a +template = true> +__device__ static inline void tcgen05_st_16x256b( + uint32_t taddr, + const B32 (&values)[128]); +*/ +#if __cccl_ptx_isa >= 860 +extern "C" _CCCL_DEVICE void __cuda_ptx_tcgen05_st_16x256b_is_not_supported_before_SM_100a_SM_101a__(); +template = true> +_CCCL_DEVICE static inline void tcgen05_st_16x256b(_CUDA_VSTD::uint32_t __taddr, const _B32 (&__values)[128]) +{ + static_assert(sizeof(_B32) == 4, ""); +# if _CCCL_CUDA_COMPILER(NVHPC) || __CUDA_ARCH_FEAT_SM100_ALL || __CUDA_ARCH_FEAT_SM101_ALL + asm( + "tcgen05.st.sync.aligned.16x256b.x32.b32 [%0], {%1, %2, %3, %4, %5, %6, %7, %8, %9, %10, %11, %12, %13, %14, %15, " + "%16, %17, %18, %19, %20, %21, %22, %23, %24, %25, %26, %27, %28, %29, %30, %31, %32, %33, %34, %35, %36, %37, " + "%38, %39, %40, %41, %42, %43, %44, %45, %46, %47, %48, %49, %50, %51, %52, %53, %54, %55, %56, %57, %58, %59, " + "%60, %61, %62, %63, %64, %65, %66, %67, %68, %69, %70, %71, %72, %73, %74, %75, %76, %77, %78, %79, %80, %81, " + "%82, %83, %84, %85, %86, %87, %88, %89, %90, %91, %92, %93, %94, %95, %96, %97, 
%98, %99, %100, %101, %102, %103, " + "%104, %105, %106, %107, %108, %109, %110, %111, %112, %113, %114, %115, %116, %117, %118, %119, %120, %121, %122, " + "%123, %124, %125, %126, %127, %128};" + : + : "r"(__taddr), + "r"(__as_b32(__values[0])), + "r"(__as_b32(__values[1])), + "r"(__as_b32(__values[2])), + "r"(__as_b32(__values[3])), + "r"(__as_b32(__values[4])), + "r"(__as_b32(__values[5])), + "r"(__as_b32(__values[6])), + "r"(__as_b32(__values[7])), + "r"(__as_b32(__values[8])), + "r"(__as_b32(__values[9])), + "r"(__as_b32(__values[10])), + "r"(__as_b32(__values[11])), + "r"(__as_b32(__values[12])), + "r"(__as_b32(__values[13])), + "r"(__as_b32(__values[14])), + "r"(__as_b32(__values[15])), + "r"(__as_b32(__values[16])), + "r"(__as_b32(__values[17])), + "r"(__as_b32(__values[18])), + "r"(__as_b32(__values[19])), + "r"(__as_b32(__values[20])), + "r"(__as_b32(__values[21])), + "r"(__as_b32(__values[22])), + "r"(__as_b32(__values[23])), + "r"(__as_b32(__values[24])), + "r"(__as_b32(__values[25])), + "r"(__as_b32(__values[26])), + "r"(__as_b32(__values[27])), + "r"(__as_b32(__values[28])), + "r"(__as_b32(__values[29])), + "r"(__as_b32(__values[30])), + "r"(__as_b32(__values[31])), + "r"(__as_b32(__values[32])), + "r"(__as_b32(__values[33])), + "r"(__as_b32(__values[34])), + "r"(__as_b32(__values[35])), + "r"(__as_b32(__values[36])), + "r"(__as_b32(__values[37])), + "r"(__as_b32(__values[38])), + "r"(__as_b32(__values[39])), + "r"(__as_b32(__values[40])), + "r"(__as_b32(__values[41])), + "r"(__as_b32(__values[42])), + "r"(__as_b32(__values[43])), + "r"(__as_b32(__values[44])), + "r"(__as_b32(__values[45])), + "r"(__as_b32(__values[46])), + "r"(__as_b32(__values[47])), + "r"(__as_b32(__values[48])), + "r"(__as_b32(__values[49])), + "r"(__as_b32(__values[50])), + "r"(__as_b32(__values[51])), + "r"(__as_b32(__values[52])), + "r"(__as_b32(__values[53])), + "r"(__as_b32(__values[54])), + "r"(__as_b32(__values[55])), + "r"(__as_b32(__values[56])), + "r"(__as_b32(__values[57])), + "r"(__as_b32(__values[58])), + "r"(__as_b32(__values[59])), + "r"(__as_b32(__values[60])), + "r"(__as_b32(__values[61])), + "r"(__as_b32(__values[62])), + "r"(__as_b32(__values[63])), + "r"(__as_b32(__values[64])), + "r"(__as_b32(__values[65])), + "r"(__as_b32(__values[66])), + "r"(__as_b32(__values[67])), + "r"(__as_b32(__values[68])), + "r"(__as_b32(__values[69])), + "r"(__as_b32(__values[70])), + "r"(__as_b32(__values[71])), + "r"(__as_b32(__values[72])), + "r"(__as_b32(__values[73])), + "r"(__as_b32(__values[74])), + "r"(__as_b32(__values[75])), + "r"(__as_b32(__values[76])), + "r"(__as_b32(__values[77])), + "r"(__as_b32(__values[78])), + "r"(__as_b32(__values[79])), + "r"(__as_b32(__values[80])), + "r"(__as_b32(__values[81])), + "r"(__as_b32(__values[82])), + "r"(__as_b32(__values[83])), + "r"(__as_b32(__values[84])), + "r"(__as_b32(__values[85])), + "r"(__as_b32(__values[86])), + "r"(__as_b32(__values[87])), + "r"(__as_b32(__values[88])), + "r"(__as_b32(__values[89])), + "r"(__as_b32(__values[90])), + "r"(__as_b32(__values[91])), + "r"(__as_b32(__values[92])), + "r"(__as_b32(__values[93])), + "r"(__as_b32(__values[94])), + "r"(__as_b32(__values[95])), + "r"(__as_b32(__values[96])), + "r"(__as_b32(__values[97])), + "r"(__as_b32(__values[98])), + "r"(__as_b32(__values[99])), + "r"(__as_b32(__values[100])), + "r"(__as_b32(__values[101])), + "r"(__as_b32(__values[102])), + "r"(__as_b32(__values[103])), + "r"(__as_b32(__values[104])), + "r"(__as_b32(__values[105])), + "r"(__as_b32(__values[106])), + 
"r"(__as_b32(__values[107])), + "r"(__as_b32(__values[108])), + "r"(__as_b32(__values[109])), + "r"(__as_b32(__values[110])), + "r"(__as_b32(__values[111])), + "r"(__as_b32(__values[112])), + "r"(__as_b32(__values[113])), + "r"(__as_b32(__values[114])), + "r"(__as_b32(__values[115])), + "r"(__as_b32(__values[116])), + "r"(__as_b32(__values[117])), + "r"(__as_b32(__values[118])), + "r"(__as_b32(__values[119])), + "r"(__as_b32(__values[120])), + "r"(__as_b32(__values[121])), + "r"(__as_b32(__values[122])), + "r"(__as_b32(__values[123])), + "r"(__as_b32(__values[124])), + "r"(__as_b32(__values[125])), + "r"(__as_b32(__values[126])), + "r"(__as_b32(__values[127])) + : "memory"); +# else + // Unsupported architectures will have a linker error with a semi-decent error message + __cuda_ptx_tcgen05_st_16x256b_is_not_supported_before_SM_100a_SM_101a__(); +# endif +} +#endif // __cccl_ptx_isa >= 860 + +/* +// tcgen05.st.sync.aligned.16x256b.x32.unpack::16b.b32 [taddr], values; // PTX ISA 86, SM_100a, SM_101a +template = true> +__device__ static inline void tcgen05_st_16x256b_unpack_16b( + uint32_t taddr, + const B32 (&values)[128]); +*/ +#if __cccl_ptx_isa >= 860 +extern "C" _CCCL_DEVICE void __cuda_ptx_tcgen05_st_16x256b_unpack_16b_is_not_supported_before_SM_100a_SM_101a__(); +template = true> +_CCCL_DEVICE static inline void tcgen05_st_16x256b_unpack_16b(_CUDA_VSTD::uint32_t __taddr, const _B32 (&__values)[128]) +{ + static_assert(sizeof(_B32) == 4, ""); +# if _CCCL_CUDA_COMPILER(NVHPC) || __CUDA_ARCH_FEAT_SM100_ALL || __CUDA_ARCH_FEAT_SM101_ALL + asm( + "tcgen05.st.sync.aligned.16x256b.x32.unpack::16b.b32 [%0], {%1, %2, %3, %4, %5, %6, %7, %8, %9, %10, %11, %12, " + "%13, %14, %15, %16, %17, %18, %19, %20, %21, %22, %23, %24, %25, %26, %27, %28, %29, %30, %31, %32, %33, %34, " + "%35, %36, %37, %38, %39, %40, %41, %42, %43, %44, %45, %46, %47, %48, %49, %50, %51, %52, %53, %54, %55, %56, " + "%57, %58, %59, %60, %61, %62, %63, %64, %65, %66, %67, %68, %69, %70, %71, %72, %73, %74, %75, %76, %77, %78, " + "%79, %80, %81, %82, %83, %84, %85, %86, %87, %88, %89, %90, %91, %92, %93, %94, %95, %96, %97, %98, %99, %100, " + "%101, %102, %103, %104, %105, %106, %107, %108, %109, %110, %111, %112, %113, %114, %115, %116, %117, %118, %119, " + "%120, %121, %122, %123, %124, %125, %126, %127, %128};" + : + : "r"(__taddr), + "r"(__as_b32(__values[0])), + "r"(__as_b32(__values[1])), + "r"(__as_b32(__values[2])), + "r"(__as_b32(__values[3])), + "r"(__as_b32(__values[4])), + "r"(__as_b32(__values[5])), + "r"(__as_b32(__values[6])), + "r"(__as_b32(__values[7])), + "r"(__as_b32(__values[8])), + "r"(__as_b32(__values[9])), + "r"(__as_b32(__values[10])), + "r"(__as_b32(__values[11])), + "r"(__as_b32(__values[12])), + "r"(__as_b32(__values[13])), + "r"(__as_b32(__values[14])), + "r"(__as_b32(__values[15])), + "r"(__as_b32(__values[16])), + "r"(__as_b32(__values[17])), + "r"(__as_b32(__values[18])), + "r"(__as_b32(__values[19])), + "r"(__as_b32(__values[20])), + "r"(__as_b32(__values[21])), + "r"(__as_b32(__values[22])), + "r"(__as_b32(__values[23])), + "r"(__as_b32(__values[24])), + "r"(__as_b32(__values[25])), + "r"(__as_b32(__values[26])), + "r"(__as_b32(__values[27])), + "r"(__as_b32(__values[28])), + "r"(__as_b32(__values[29])), + "r"(__as_b32(__values[30])), + "r"(__as_b32(__values[31])), + "r"(__as_b32(__values[32])), + "r"(__as_b32(__values[33])), + "r"(__as_b32(__values[34])), + "r"(__as_b32(__values[35])), + "r"(__as_b32(__values[36])), + "r"(__as_b32(__values[37])), + "r"(__as_b32(__values[38])), + 
"r"(__as_b32(__values[39])), + "r"(__as_b32(__values[40])), + "r"(__as_b32(__values[41])), + "r"(__as_b32(__values[42])), + "r"(__as_b32(__values[43])), + "r"(__as_b32(__values[44])), + "r"(__as_b32(__values[45])), + "r"(__as_b32(__values[46])), + "r"(__as_b32(__values[47])), + "r"(__as_b32(__values[48])), + "r"(__as_b32(__values[49])), + "r"(__as_b32(__values[50])), + "r"(__as_b32(__values[51])), + "r"(__as_b32(__values[52])), + "r"(__as_b32(__values[53])), + "r"(__as_b32(__values[54])), + "r"(__as_b32(__values[55])), + "r"(__as_b32(__values[56])), + "r"(__as_b32(__values[57])), + "r"(__as_b32(__values[58])), + "r"(__as_b32(__values[59])), + "r"(__as_b32(__values[60])), + "r"(__as_b32(__values[61])), + "r"(__as_b32(__values[62])), + "r"(__as_b32(__values[63])), + "r"(__as_b32(__values[64])), + "r"(__as_b32(__values[65])), + "r"(__as_b32(__values[66])), + "r"(__as_b32(__values[67])), + "r"(__as_b32(__values[68])), + "r"(__as_b32(__values[69])), + "r"(__as_b32(__values[70])), + "r"(__as_b32(__values[71])), + "r"(__as_b32(__values[72])), + "r"(__as_b32(__values[73])), + "r"(__as_b32(__values[74])), + "r"(__as_b32(__values[75])), + "r"(__as_b32(__values[76])), + "r"(__as_b32(__values[77])), + "r"(__as_b32(__values[78])), + "r"(__as_b32(__values[79])), + "r"(__as_b32(__values[80])), + "r"(__as_b32(__values[81])), + "r"(__as_b32(__values[82])), + "r"(__as_b32(__values[83])), + "r"(__as_b32(__values[84])), + "r"(__as_b32(__values[85])), + "r"(__as_b32(__values[86])), + "r"(__as_b32(__values[87])), + "r"(__as_b32(__values[88])), + "r"(__as_b32(__values[89])), + "r"(__as_b32(__values[90])), + "r"(__as_b32(__values[91])), + "r"(__as_b32(__values[92])), + "r"(__as_b32(__values[93])), + "r"(__as_b32(__values[94])), + "r"(__as_b32(__values[95])), + "r"(__as_b32(__values[96])), + "r"(__as_b32(__values[97])), + "r"(__as_b32(__values[98])), + "r"(__as_b32(__values[99])), + "r"(__as_b32(__values[100])), + "r"(__as_b32(__values[101])), + "r"(__as_b32(__values[102])), + "r"(__as_b32(__values[103])), + "r"(__as_b32(__values[104])), + "r"(__as_b32(__values[105])), + "r"(__as_b32(__values[106])), + "r"(__as_b32(__values[107])), + "r"(__as_b32(__values[108])), + "r"(__as_b32(__values[109])), + "r"(__as_b32(__values[110])), + "r"(__as_b32(__values[111])), + "r"(__as_b32(__values[112])), + "r"(__as_b32(__values[113])), + "r"(__as_b32(__values[114])), + "r"(__as_b32(__values[115])), + "r"(__as_b32(__values[116])), + "r"(__as_b32(__values[117])), + "r"(__as_b32(__values[118])), + "r"(__as_b32(__values[119])), + "r"(__as_b32(__values[120])), + "r"(__as_b32(__values[121])), + "r"(__as_b32(__values[122])), + "r"(__as_b32(__values[123])), + "r"(__as_b32(__values[124])), + "r"(__as_b32(__values[125])), + "r"(__as_b32(__values[126])), + "r"(__as_b32(__values[127])) + : "memory"); +# else + // Unsupported architectures will have a linker error with a semi-decent error message + __cuda_ptx_tcgen05_st_16x256b_unpack_16b_is_not_supported_before_SM_100a_SM_101a__(); +# endif +} +#endif // __cccl_ptx_isa >= 860 + +/* +// tcgen05.st.sync.aligned.32x32b.x1.b32 [taddr], values; // PTX ISA 86, SM_100a, SM_101a +template = true> +__device__ static inline void tcgen05_st_32x32b( + uint32_t taddr, + const B32 (&values)[1]); +*/ +#if __cccl_ptx_isa >= 860 +extern "C" _CCCL_DEVICE void __cuda_ptx_tcgen05_st_32x32b_is_not_supported_before_SM_100a_SM_101a__(); +template = true> +_CCCL_DEVICE static inline void tcgen05_st_32x32b(_CUDA_VSTD::uint32_t __taddr, const _B32 (&__values)[1]) +{ + static_assert(sizeof(_B32) == 4, ""); +# if 
_CCCL_CUDA_COMPILER(NVHPC) || __CUDA_ARCH_FEAT_SM100_ALL || __CUDA_ARCH_FEAT_SM101_ALL
+  asm("tcgen05.st.sync.aligned.32x32b.x1.b32 [%0], {%1};" : : "r"(__taddr), "r"(__as_b32(__values[0])) : "memory");
+#  else
+  // Unsupported architectures will have a linker error with a semi-decent error message
+  __cuda_ptx_tcgen05_st_32x32b_is_not_supported_before_SM_100a_SM_101a__();
+#  endif
+}
+#endif // __cccl_ptx_isa >= 860
+
+/*
+// tcgen05.st.sync.aligned.32x32b.x1.unpack::16b.b32 [taddr], values; // PTX ISA 86, SM_100a, SM_101a
+template <typename B32, enable_if_t<sizeof(B32) == 4, bool> = true>
+__device__ static inline void tcgen05_st_32x32b_unpack_16b(
+  uint32_t taddr,
+  const B32 (&values)[1]);
+*/
+#if __cccl_ptx_isa >= 860
+extern "C" _CCCL_DEVICE void __cuda_ptx_tcgen05_st_32x32b_unpack_16b_is_not_supported_before_SM_100a_SM_101a__();
+template <typename _B32, _CUDA_VSTD::enable_if_t<sizeof(_B32) == 4, bool> = true>
+_CCCL_DEVICE static inline void tcgen05_st_32x32b_unpack_16b(_CUDA_VSTD::uint32_t __taddr, const _B32 (&__values)[1])
+{
+  static_assert(sizeof(_B32) == 4, "");
+#  if _CCCL_CUDA_COMPILER(NVHPC) || __CUDA_ARCH_FEAT_SM100_ALL || __CUDA_ARCH_FEAT_SM101_ALL
+  asm("tcgen05.st.sync.aligned.32x32b.x1.unpack::16b.b32 [%0], {%1};"
+      :
+      : "r"(__taddr), "r"(__as_b32(__values[0]))
+      : "memory");
+#  else
+  // Unsupported architectures will have a linker error with a semi-decent error message
+  __cuda_ptx_tcgen05_st_32x32b_unpack_16b_is_not_supported_before_SM_100a_SM_101a__();
+#  endif
+}
+#endif // __cccl_ptx_isa >= 860
+
+/*
+// tcgen05.st.sync.aligned.32x32b.x2.b32 [taddr], values; // PTX ISA 86, SM_100a, SM_101a
+template <typename B32, enable_if_t<sizeof(B32) == 4, bool> = true>
+__device__ static inline void tcgen05_st_32x32b(
+  uint32_t taddr,
+  const B32 (&values)[2]);
+*/
+#if __cccl_ptx_isa >= 860
+extern "C" _CCCL_DEVICE void __cuda_ptx_tcgen05_st_32x32b_is_not_supported_before_SM_100a_SM_101a__();
+template <typename _B32, _CUDA_VSTD::enable_if_t<sizeof(_B32) == 4, bool> = true>
+_CCCL_DEVICE static inline void tcgen05_st_32x32b(_CUDA_VSTD::uint32_t __taddr, const _B32 (&__values)[2])
+{
+  static_assert(sizeof(_B32) == 4, "");
+#  if _CCCL_CUDA_COMPILER(NVHPC) || __CUDA_ARCH_FEAT_SM100_ALL || __CUDA_ARCH_FEAT_SM101_ALL
+  asm("tcgen05.st.sync.aligned.32x32b.x2.b32 [%0], {%1, %2};"
+      :
+      : "r"(__taddr), "r"(__as_b32(__values[0])), "r"(__as_b32(__values[1]))
+      : "memory");
+#  else
+  // Unsupported architectures will have a linker error with a semi-decent error message
+  __cuda_ptx_tcgen05_st_32x32b_is_not_supported_before_SM_100a_SM_101a__();
+#  endif
+}
+#endif // __cccl_ptx_isa >= 860
+
+/*
+// tcgen05.st.sync.aligned.32x32b.x2.unpack::16b.b32 [taddr], values; // PTX ISA 86, SM_100a, SM_101a
+template <typename B32, enable_if_t<sizeof(B32) == 4, bool> = true>
+__device__ static inline void tcgen05_st_32x32b_unpack_16b(
+  uint32_t taddr,
+  const B32 (&values)[2]);
+*/
+#if __cccl_ptx_isa >= 860
+extern "C" _CCCL_DEVICE void __cuda_ptx_tcgen05_st_32x32b_unpack_16b_is_not_supported_before_SM_100a_SM_101a__();
+template <typename _B32, _CUDA_VSTD::enable_if_t<sizeof(_B32) == 4, bool> = true>
+_CCCL_DEVICE static inline void tcgen05_st_32x32b_unpack_16b(_CUDA_VSTD::uint32_t __taddr, const _B32 (&__values)[2])
+{
+  static_assert(sizeof(_B32) == 4, "");
+#  if _CCCL_CUDA_COMPILER(NVHPC) || __CUDA_ARCH_FEAT_SM100_ALL || __CUDA_ARCH_FEAT_SM101_ALL
+  asm("tcgen05.st.sync.aligned.32x32b.x2.unpack::16b.b32 [%0], {%1, %2};"
+      :
+      : "r"(__taddr), "r"(__as_b32(__values[0])), "r"(__as_b32(__values[1]))
+      : "memory");
+#  else
+  // Unsupported architectures will have a linker error with a semi-decent error message
+  __cuda_ptx_tcgen05_st_32x32b_unpack_16b_is_not_supported_before_SM_100a_SM_101a__();
+#  endif
+}
+#endif // __cccl_ptx_isa >= 860
+
+/*
+// tcgen05.st.sync.aligned.32x32b.x4.b32 [taddr], values; // PTX ISA 86, SM_100a, SM_101a
+template <typename B32, enable_if_t<sizeof(B32) == 4, bool> = true>
+__device__ static inline void tcgen05_st_32x32b(
+  uint32_t taddr,
+  const B32 (&values)[4]);
+*/
+#if __cccl_ptx_isa >= 860
+extern "C" _CCCL_DEVICE void __cuda_ptx_tcgen05_st_32x32b_is_not_supported_before_SM_100a_SM_101a__();
+template <typename _B32, _CUDA_VSTD::enable_if_t<sizeof(_B32) == 4, bool> = true>
+_CCCL_DEVICE static inline void tcgen05_st_32x32b(_CUDA_VSTD::uint32_t __taddr, const _B32 (&__values)[4])
+{
+  static_assert(sizeof(_B32) == 4, "");
+#  if _CCCL_CUDA_COMPILER(NVHPC) || __CUDA_ARCH_FEAT_SM100_ALL || __CUDA_ARCH_FEAT_SM101_ALL
+  asm("tcgen05.st.sync.aligned.32x32b.x4.b32 [%0], {%1, %2, %3, %4};"
+      :
+      : "r"(__taddr),
+        "r"(__as_b32(__values[0])),
+        "r"(__as_b32(__values[1])),
+        "r"(__as_b32(__values[2])),
+        "r"(__as_b32(__values[3]))
+      : "memory");
+#  else
+  // Unsupported architectures will have a linker error with a semi-decent error message
+  __cuda_ptx_tcgen05_st_32x32b_is_not_supported_before_SM_100a_SM_101a__();
+#  endif
+}
+#endif // __cccl_ptx_isa >= 860
+
+/*
+// tcgen05.st.sync.aligned.32x32b.x4.unpack::16b.b32 [taddr], values; // PTX ISA 86, SM_100a, SM_101a
+template <typename B32, enable_if_t<sizeof(B32) == 4, bool> = true>
+__device__ static inline void tcgen05_st_32x32b_unpack_16b(
+  uint32_t taddr,
+  const B32 (&values)[4]);
+*/
+#if __cccl_ptx_isa >= 860
+extern "C" _CCCL_DEVICE void __cuda_ptx_tcgen05_st_32x32b_unpack_16b_is_not_supported_before_SM_100a_SM_101a__();
+template <typename _B32, _CUDA_VSTD::enable_if_t<sizeof(_B32) == 4, bool> = true>
+_CCCL_DEVICE static inline void tcgen05_st_32x32b_unpack_16b(_CUDA_VSTD::uint32_t __taddr, const _B32 (&__values)[4])
+{
+  static_assert(sizeof(_B32) == 4, "");
+#  if _CCCL_CUDA_COMPILER(NVHPC) || __CUDA_ARCH_FEAT_SM100_ALL || __CUDA_ARCH_FEAT_SM101_ALL
+  asm("tcgen05.st.sync.aligned.32x32b.x4.unpack::16b.b32 [%0], {%1, %2, %3, %4};"
+      :
+      : "r"(__taddr),
+        "r"(__as_b32(__values[0])),
+        "r"(__as_b32(__values[1])),
+        "r"(__as_b32(__values[2])),
+        "r"(__as_b32(__values[3]))
+      : "memory");
+#  else
+  // Unsupported architectures will have a linker error with a semi-decent error message
+  __cuda_ptx_tcgen05_st_32x32b_unpack_16b_is_not_supported_before_SM_100a_SM_101a__();
+#  endif
+}
+#endif // __cccl_ptx_isa >= 860
+
+/*
+// tcgen05.st.sync.aligned.32x32b.x8.b32 [taddr], values; // PTX ISA 86, SM_100a, SM_101a
+template <typename B32, enable_if_t<sizeof(B32) == 4, bool> = true>
+__device__ static inline void tcgen05_st_32x32b(
+  uint32_t taddr,
+  const B32 (&values)[8]);
+*/
+#if __cccl_ptx_isa >= 860
+extern "C" _CCCL_DEVICE void __cuda_ptx_tcgen05_st_32x32b_is_not_supported_before_SM_100a_SM_101a__();
+template <typename _B32, _CUDA_VSTD::enable_if_t<sizeof(_B32) == 4, bool> = true>
+_CCCL_DEVICE static inline void tcgen05_st_32x32b(_CUDA_VSTD::uint32_t __taddr, const _B32 (&__values)[8])
+{
+  static_assert(sizeof(_B32) == 4, "");
+#  if _CCCL_CUDA_COMPILER(NVHPC) || __CUDA_ARCH_FEAT_SM100_ALL || __CUDA_ARCH_FEAT_SM101_ALL
+  asm("tcgen05.st.sync.aligned.32x32b.x8.b32 [%0], {%1, %2, %3, %4, %5, %6, %7, %8};"
+      :
+      : "r"(__taddr),
+        "r"(__as_b32(__values[0])),
+        "r"(__as_b32(__values[1])),
+        "r"(__as_b32(__values[2])),
+        "r"(__as_b32(__values[3])),
+        "r"(__as_b32(__values[4])),
+        "r"(__as_b32(__values[5])),
+        "r"(__as_b32(__values[6])),
+        "r"(__as_b32(__values[7]))
+      : "memory");
+#  else
+  // Unsupported architectures will have a linker error with a semi-decent error message
+  __cuda_ptx_tcgen05_st_32x32b_is_not_supported_before_SM_100a_SM_101a__();
+#  endif
+}
+#endif // __cccl_ptx_isa >= 860
+
+/*
+// tcgen05.st.sync.aligned.32x32b.x8.unpack::16b.b32 [taddr], values; // PTX ISA 86, SM_100a, SM_101a
+template <typename B32, enable_if_t<sizeof(B32) == 4, bool> = true>
+__device__ static inline void tcgen05_st_32x32b_unpack_16b(
+  uint32_t taddr,
+  const B32 (&values)[8]);
+*/
+#if __cccl_ptx_isa >= 860
+extern "C"
_CCCL_DEVICE void __cuda_ptx_tcgen05_st_32x32b_unpack_16b_is_not_supported_before_SM_100a_SM_101a__(); +template = true> +_CCCL_DEVICE static inline void tcgen05_st_32x32b_unpack_16b(_CUDA_VSTD::uint32_t __taddr, const _B32 (&__values)[8]) +{ + static_assert(sizeof(_B32) == 4, ""); +# if _CCCL_CUDA_COMPILER(NVHPC) || __CUDA_ARCH_FEAT_SM100_ALL || __CUDA_ARCH_FEAT_SM101_ALL + asm("tcgen05.st.sync.aligned.32x32b.x8.unpack::16b.b32 [%0], {%1, %2, %3, %4, %5, %6, %7, %8};" + : + : "r"(__taddr), + "r"(__as_b32(__values[0])), + "r"(__as_b32(__values[1])), + "r"(__as_b32(__values[2])), + "r"(__as_b32(__values[3])), + "r"(__as_b32(__values[4])), + "r"(__as_b32(__values[5])), + "r"(__as_b32(__values[6])), + "r"(__as_b32(__values[7])) + : "memory"); +# else + // Unsupported architectures will have a linker error with a semi-decent error message + __cuda_ptx_tcgen05_st_32x32b_unpack_16b_is_not_supported_before_SM_100a_SM_101a__(); +# endif +} +#endif // __cccl_ptx_isa >= 860 + +/* +// tcgen05.st.sync.aligned.32x32b.x16.b32 [taddr], values; // PTX ISA 86, SM_100a, SM_101a +template = true> +__device__ static inline void tcgen05_st_32x32b( + uint32_t taddr, + const B32 (&values)[16]); +*/ +#if __cccl_ptx_isa >= 860 +extern "C" _CCCL_DEVICE void __cuda_ptx_tcgen05_st_32x32b_is_not_supported_before_SM_100a_SM_101a__(); +template = true> +_CCCL_DEVICE static inline void tcgen05_st_32x32b(_CUDA_VSTD::uint32_t __taddr, const _B32 (&__values)[16]) +{ + static_assert(sizeof(_B32) == 4, ""); +# if _CCCL_CUDA_COMPILER(NVHPC) || __CUDA_ARCH_FEAT_SM100_ALL || __CUDA_ARCH_FEAT_SM101_ALL + asm("tcgen05.st.sync.aligned.32x32b.x16.b32 [%0], {%1, %2, %3, %4, %5, %6, %7, %8, %9, %10, %11, %12, %13, %14, %15, " + "%16};" + : + : "r"(__taddr), + "r"(__as_b32(__values[0])), + "r"(__as_b32(__values[1])), + "r"(__as_b32(__values[2])), + "r"(__as_b32(__values[3])), + "r"(__as_b32(__values[4])), + "r"(__as_b32(__values[5])), + "r"(__as_b32(__values[6])), + "r"(__as_b32(__values[7])), + "r"(__as_b32(__values[8])), + "r"(__as_b32(__values[9])), + "r"(__as_b32(__values[10])), + "r"(__as_b32(__values[11])), + "r"(__as_b32(__values[12])), + "r"(__as_b32(__values[13])), + "r"(__as_b32(__values[14])), + "r"(__as_b32(__values[15])) + : "memory"); +# else + // Unsupported architectures will have a linker error with a semi-decent error message + __cuda_ptx_tcgen05_st_32x32b_is_not_supported_before_SM_100a_SM_101a__(); +# endif +} +#endif // __cccl_ptx_isa >= 860 + +/* +// tcgen05.st.sync.aligned.32x32b.x16.unpack::16b.b32 [taddr], values; // PTX ISA 86, SM_100a, SM_101a +template = true> +__device__ static inline void tcgen05_st_32x32b_unpack_16b( + uint32_t taddr, + const B32 (&values)[16]); +*/ +#if __cccl_ptx_isa >= 860 +extern "C" _CCCL_DEVICE void __cuda_ptx_tcgen05_st_32x32b_unpack_16b_is_not_supported_before_SM_100a_SM_101a__(); +template = true> +_CCCL_DEVICE static inline void tcgen05_st_32x32b_unpack_16b(_CUDA_VSTD::uint32_t __taddr, const _B32 (&__values)[16]) +{ + static_assert(sizeof(_B32) == 4, ""); +# if _CCCL_CUDA_COMPILER(NVHPC) || __CUDA_ARCH_FEAT_SM100_ALL || __CUDA_ARCH_FEAT_SM101_ALL + asm("tcgen05.st.sync.aligned.32x32b.x16.unpack::16b.b32 [%0], {%1, %2, %3, %4, %5, %6, %7, %8, %9, %10, %11, %12, " + "%13, %14, %15, %16};" + : + : "r"(__taddr), + "r"(__as_b32(__values[0])), + "r"(__as_b32(__values[1])), + "r"(__as_b32(__values[2])), + "r"(__as_b32(__values[3])), + "r"(__as_b32(__values[4])), + "r"(__as_b32(__values[5])), + "r"(__as_b32(__values[6])), + "r"(__as_b32(__values[7])), + "r"(__as_b32(__values[8])), + 
"r"(__as_b32(__values[9])), + "r"(__as_b32(__values[10])), + "r"(__as_b32(__values[11])), + "r"(__as_b32(__values[12])), + "r"(__as_b32(__values[13])), + "r"(__as_b32(__values[14])), + "r"(__as_b32(__values[15])) + : "memory"); +# else + // Unsupported architectures will have a linker error with a semi-decent error message + __cuda_ptx_tcgen05_st_32x32b_unpack_16b_is_not_supported_before_SM_100a_SM_101a__(); +# endif +} +#endif // __cccl_ptx_isa >= 860 + +/* +// tcgen05.st.sync.aligned.32x32b.x32.b32 [taddr], values; // PTX ISA 86, SM_100a, SM_101a +template = true> +__device__ static inline void tcgen05_st_32x32b( + uint32_t taddr, + const B32 (&values)[32]); +*/ +#if __cccl_ptx_isa >= 860 +extern "C" _CCCL_DEVICE void __cuda_ptx_tcgen05_st_32x32b_is_not_supported_before_SM_100a_SM_101a__(); +template = true> +_CCCL_DEVICE static inline void tcgen05_st_32x32b(_CUDA_VSTD::uint32_t __taddr, const _B32 (&__values)[32]) +{ + static_assert(sizeof(_B32) == 4, ""); +# if _CCCL_CUDA_COMPILER(NVHPC) || __CUDA_ARCH_FEAT_SM100_ALL || __CUDA_ARCH_FEAT_SM101_ALL + asm( + "tcgen05.st.sync.aligned.32x32b.x32.b32 [%0], {%1, %2, %3, %4, %5, %6, %7, %8, %9, %10, %11, %12, %13, %14, %15, " + "%16, %17, %18, %19, %20, %21, %22, %23, %24, %25, %26, %27, %28, %29, %30, %31, %32};" + : + : "r"(__taddr), + "r"(__as_b32(__values[0])), + "r"(__as_b32(__values[1])), + "r"(__as_b32(__values[2])), + "r"(__as_b32(__values[3])), + "r"(__as_b32(__values[4])), + "r"(__as_b32(__values[5])), + "r"(__as_b32(__values[6])), + "r"(__as_b32(__values[7])), + "r"(__as_b32(__values[8])), + "r"(__as_b32(__values[9])), + "r"(__as_b32(__values[10])), + "r"(__as_b32(__values[11])), + "r"(__as_b32(__values[12])), + "r"(__as_b32(__values[13])), + "r"(__as_b32(__values[14])), + "r"(__as_b32(__values[15])), + "r"(__as_b32(__values[16])), + "r"(__as_b32(__values[17])), + "r"(__as_b32(__values[18])), + "r"(__as_b32(__values[19])), + "r"(__as_b32(__values[20])), + "r"(__as_b32(__values[21])), + "r"(__as_b32(__values[22])), + "r"(__as_b32(__values[23])), + "r"(__as_b32(__values[24])), + "r"(__as_b32(__values[25])), + "r"(__as_b32(__values[26])), + "r"(__as_b32(__values[27])), + "r"(__as_b32(__values[28])), + "r"(__as_b32(__values[29])), + "r"(__as_b32(__values[30])), + "r"(__as_b32(__values[31])) + : "memory"); +# else + // Unsupported architectures will have a linker error with a semi-decent error message + __cuda_ptx_tcgen05_st_32x32b_is_not_supported_before_SM_100a_SM_101a__(); +# endif +} +#endif // __cccl_ptx_isa >= 860 + +/* +// tcgen05.st.sync.aligned.32x32b.x32.unpack::16b.b32 [taddr], values; // PTX ISA 86, SM_100a, SM_101a +template = true> +__device__ static inline void tcgen05_st_32x32b_unpack_16b( + uint32_t taddr, + const B32 (&values)[32]); +*/ +#if __cccl_ptx_isa >= 860 +extern "C" _CCCL_DEVICE void __cuda_ptx_tcgen05_st_32x32b_unpack_16b_is_not_supported_before_SM_100a_SM_101a__(); +template = true> +_CCCL_DEVICE static inline void tcgen05_st_32x32b_unpack_16b(_CUDA_VSTD::uint32_t __taddr, const _B32 (&__values)[32]) +{ + static_assert(sizeof(_B32) == 4, ""); +# if _CCCL_CUDA_COMPILER(NVHPC) || __CUDA_ARCH_FEAT_SM100_ALL || __CUDA_ARCH_FEAT_SM101_ALL + asm( + "tcgen05.st.sync.aligned.32x32b.x32.unpack::16b.b32 [%0], {%1, %2, %3, %4, %5, %6, %7, %8, %9, %10, %11, %12, %13, " + "%14, %15, %16, %17, %18, %19, %20, %21, %22, %23, %24, %25, %26, %27, %28, %29, %30, %31, %32};" + : + : "r"(__taddr), + "r"(__as_b32(__values[0])), + "r"(__as_b32(__values[1])), + "r"(__as_b32(__values[2])), + "r"(__as_b32(__values[3])), + 
"r"(__as_b32(__values[4])), + "r"(__as_b32(__values[5])), + "r"(__as_b32(__values[6])), + "r"(__as_b32(__values[7])), + "r"(__as_b32(__values[8])), + "r"(__as_b32(__values[9])), + "r"(__as_b32(__values[10])), + "r"(__as_b32(__values[11])), + "r"(__as_b32(__values[12])), + "r"(__as_b32(__values[13])), + "r"(__as_b32(__values[14])), + "r"(__as_b32(__values[15])), + "r"(__as_b32(__values[16])), + "r"(__as_b32(__values[17])), + "r"(__as_b32(__values[18])), + "r"(__as_b32(__values[19])), + "r"(__as_b32(__values[20])), + "r"(__as_b32(__values[21])), + "r"(__as_b32(__values[22])), + "r"(__as_b32(__values[23])), + "r"(__as_b32(__values[24])), + "r"(__as_b32(__values[25])), + "r"(__as_b32(__values[26])), + "r"(__as_b32(__values[27])), + "r"(__as_b32(__values[28])), + "r"(__as_b32(__values[29])), + "r"(__as_b32(__values[30])), + "r"(__as_b32(__values[31])) + : "memory"); +# else + // Unsupported architectures will have a linker error with a semi-decent error message + __cuda_ptx_tcgen05_st_32x32b_unpack_16b_is_not_supported_before_SM_100a_SM_101a__(); +# endif +} +#endif // __cccl_ptx_isa >= 860 + +/* +// tcgen05.st.sync.aligned.32x32b.x64.b32 [taddr], values; // PTX ISA 86, SM_100a, SM_101a +template = true> +__device__ static inline void tcgen05_st_32x32b( + uint32_t taddr, + const B32 (&values)[64]); +*/ +#if __cccl_ptx_isa >= 860 +extern "C" _CCCL_DEVICE void __cuda_ptx_tcgen05_st_32x32b_is_not_supported_before_SM_100a_SM_101a__(); +template = true> +_CCCL_DEVICE static inline void tcgen05_st_32x32b(_CUDA_VSTD::uint32_t __taddr, const _B32 (&__values)[64]) +{ + static_assert(sizeof(_B32) == 4, ""); +# if _CCCL_CUDA_COMPILER(NVHPC) || __CUDA_ARCH_FEAT_SM100_ALL || __CUDA_ARCH_FEAT_SM101_ALL + asm( + "tcgen05.st.sync.aligned.32x32b.x64.b32 [%0], {%1, %2, %3, %4, %5, %6, %7, %8, %9, %10, %11, %12, %13, %14, %15, " + "%16, %17, %18, %19, %20, %21, %22, %23, %24, %25, %26, %27, %28, %29, %30, %31, %32, %33, %34, %35, %36, %37, " + "%38, %39, %40, %41, %42, %43, %44, %45, %46, %47, %48, %49, %50, %51, %52, %53, %54, %55, %56, %57, %58, %59, " + "%60, %61, %62, %63, %64};" + : + : "r"(__taddr), + "r"(__as_b32(__values[0])), + "r"(__as_b32(__values[1])), + "r"(__as_b32(__values[2])), + "r"(__as_b32(__values[3])), + "r"(__as_b32(__values[4])), + "r"(__as_b32(__values[5])), + "r"(__as_b32(__values[6])), + "r"(__as_b32(__values[7])), + "r"(__as_b32(__values[8])), + "r"(__as_b32(__values[9])), + "r"(__as_b32(__values[10])), + "r"(__as_b32(__values[11])), + "r"(__as_b32(__values[12])), + "r"(__as_b32(__values[13])), + "r"(__as_b32(__values[14])), + "r"(__as_b32(__values[15])), + "r"(__as_b32(__values[16])), + "r"(__as_b32(__values[17])), + "r"(__as_b32(__values[18])), + "r"(__as_b32(__values[19])), + "r"(__as_b32(__values[20])), + "r"(__as_b32(__values[21])), + "r"(__as_b32(__values[22])), + "r"(__as_b32(__values[23])), + "r"(__as_b32(__values[24])), + "r"(__as_b32(__values[25])), + "r"(__as_b32(__values[26])), + "r"(__as_b32(__values[27])), + "r"(__as_b32(__values[28])), + "r"(__as_b32(__values[29])), + "r"(__as_b32(__values[30])), + "r"(__as_b32(__values[31])), + "r"(__as_b32(__values[32])), + "r"(__as_b32(__values[33])), + "r"(__as_b32(__values[34])), + "r"(__as_b32(__values[35])), + "r"(__as_b32(__values[36])), + "r"(__as_b32(__values[37])), + "r"(__as_b32(__values[38])), + "r"(__as_b32(__values[39])), + "r"(__as_b32(__values[40])), + "r"(__as_b32(__values[41])), + "r"(__as_b32(__values[42])), + "r"(__as_b32(__values[43])), + "r"(__as_b32(__values[44])), + "r"(__as_b32(__values[45])), + 
"r"(__as_b32(__values[46])), + "r"(__as_b32(__values[47])), + "r"(__as_b32(__values[48])), + "r"(__as_b32(__values[49])), + "r"(__as_b32(__values[50])), + "r"(__as_b32(__values[51])), + "r"(__as_b32(__values[52])), + "r"(__as_b32(__values[53])), + "r"(__as_b32(__values[54])), + "r"(__as_b32(__values[55])), + "r"(__as_b32(__values[56])), + "r"(__as_b32(__values[57])), + "r"(__as_b32(__values[58])), + "r"(__as_b32(__values[59])), + "r"(__as_b32(__values[60])), + "r"(__as_b32(__values[61])), + "r"(__as_b32(__values[62])), + "r"(__as_b32(__values[63])) + : "memory"); +# else + // Unsupported architectures will have a linker error with a semi-decent error message + __cuda_ptx_tcgen05_st_32x32b_is_not_supported_before_SM_100a_SM_101a__(); +# endif +} +#endif // __cccl_ptx_isa >= 860 + +/* +// tcgen05.st.sync.aligned.32x32b.x64.unpack::16b.b32 [taddr], values; // PTX ISA 86, SM_100a, SM_101a +template = true> +__device__ static inline void tcgen05_st_32x32b_unpack_16b( + uint32_t taddr, + const B32 (&values)[64]); +*/ +#if __cccl_ptx_isa >= 860 +extern "C" _CCCL_DEVICE void __cuda_ptx_tcgen05_st_32x32b_unpack_16b_is_not_supported_before_SM_100a_SM_101a__(); +template = true> +_CCCL_DEVICE static inline void tcgen05_st_32x32b_unpack_16b(_CUDA_VSTD::uint32_t __taddr, const _B32 (&__values)[64]) +{ + static_assert(sizeof(_B32) == 4, ""); +# if _CCCL_CUDA_COMPILER(NVHPC) || __CUDA_ARCH_FEAT_SM100_ALL || __CUDA_ARCH_FEAT_SM101_ALL + asm( + "tcgen05.st.sync.aligned.32x32b.x64.unpack::16b.b32 [%0], {%1, %2, %3, %4, %5, %6, %7, %8, %9, %10, %11, %12, %13, " + "%14, %15, %16, %17, %18, %19, %20, %21, %22, %23, %24, %25, %26, %27, %28, %29, %30, %31, %32, %33, %34, %35, " + "%36, %37, %38, %39, %40, %41, %42, %43, %44, %45, %46, %47, %48, %49, %50, %51, %52, %53, %54, %55, %56, %57, " + "%58, %59, %60, %61, %62, %63, %64};" + : + : "r"(__taddr), + "r"(__as_b32(__values[0])), + "r"(__as_b32(__values[1])), + "r"(__as_b32(__values[2])), + "r"(__as_b32(__values[3])), + "r"(__as_b32(__values[4])), + "r"(__as_b32(__values[5])), + "r"(__as_b32(__values[6])), + "r"(__as_b32(__values[7])), + "r"(__as_b32(__values[8])), + "r"(__as_b32(__values[9])), + "r"(__as_b32(__values[10])), + "r"(__as_b32(__values[11])), + "r"(__as_b32(__values[12])), + "r"(__as_b32(__values[13])), + "r"(__as_b32(__values[14])), + "r"(__as_b32(__values[15])), + "r"(__as_b32(__values[16])), + "r"(__as_b32(__values[17])), + "r"(__as_b32(__values[18])), + "r"(__as_b32(__values[19])), + "r"(__as_b32(__values[20])), + "r"(__as_b32(__values[21])), + "r"(__as_b32(__values[22])), + "r"(__as_b32(__values[23])), + "r"(__as_b32(__values[24])), + "r"(__as_b32(__values[25])), + "r"(__as_b32(__values[26])), + "r"(__as_b32(__values[27])), + "r"(__as_b32(__values[28])), + "r"(__as_b32(__values[29])), + "r"(__as_b32(__values[30])), + "r"(__as_b32(__values[31])), + "r"(__as_b32(__values[32])), + "r"(__as_b32(__values[33])), + "r"(__as_b32(__values[34])), + "r"(__as_b32(__values[35])), + "r"(__as_b32(__values[36])), + "r"(__as_b32(__values[37])), + "r"(__as_b32(__values[38])), + "r"(__as_b32(__values[39])), + "r"(__as_b32(__values[40])), + "r"(__as_b32(__values[41])), + "r"(__as_b32(__values[42])), + "r"(__as_b32(__values[43])), + "r"(__as_b32(__values[44])), + "r"(__as_b32(__values[45])), + "r"(__as_b32(__values[46])), + "r"(__as_b32(__values[47])), + "r"(__as_b32(__values[48])), + "r"(__as_b32(__values[49])), + "r"(__as_b32(__values[50])), + "r"(__as_b32(__values[51])), + "r"(__as_b32(__values[52])), + "r"(__as_b32(__values[53])), + 
"r"(__as_b32(__values[54])), + "r"(__as_b32(__values[55])), + "r"(__as_b32(__values[56])), + "r"(__as_b32(__values[57])), + "r"(__as_b32(__values[58])), + "r"(__as_b32(__values[59])), + "r"(__as_b32(__values[60])), + "r"(__as_b32(__values[61])), + "r"(__as_b32(__values[62])), + "r"(__as_b32(__values[63])) + : "memory"); +# else + // Unsupported architectures will have a linker error with a semi-decent error message + __cuda_ptx_tcgen05_st_32x32b_unpack_16b_is_not_supported_before_SM_100a_SM_101a__(); +# endif +} +#endif // __cccl_ptx_isa >= 860 + +/* +// tcgen05.st.sync.aligned.32x32b.x128.b32 [taddr], values; // PTX ISA 86, SM_100a, SM_101a +template = true> +__device__ static inline void tcgen05_st_32x32b( + uint32_t taddr, + const B32 (&values)[128]); +*/ +#if __cccl_ptx_isa >= 860 +extern "C" _CCCL_DEVICE void __cuda_ptx_tcgen05_st_32x32b_is_not_supported_before_SM_100a_SM_101a__(); +template = true> +_CCCL_DEVICE static inline void tcgen05_st_32x32b(_CUDA_VSTD::uint32_t __taddr, const _B32 (&__values)[128]) +{ + static_assert(sizeof(_B32) == 4, ""); +# if _CCCL_CUDA_COMPILER(NVHPC) || __CUDA_ARCH_FEAT_SM100_ALL || __CUDA_ARCH_FEAT_SM101_ALL + asm( + "tcgen05.st.sync.aligned.32x32b.x128.b32 [%0], {%1, %2, %3, %4, %5, %6, %7, %8, %9, %10, %11, %12, %13, %14, %15, " + "%16, %17, %18, %19, %20, %21, %22, %23, %24, %25, %26, %27, %28, %29, %30, %31, %32, %33, %34, %35, %36, %37, " + "%38, %39, %40, %41, %42, %43, %44, %45, %46, %47, %48, %49, %50, %51, %52, %53, %54, %55, %56, %57, %58, %59, " + "%60, %61, %62, %63, %64, %65, %66, %67, %68, %69, %70, %71, %72, %73, %74, %75, %76, %77, %78, %79, %80, %81, " + "%82, %83, %84, %85, %86, %87, %88, %89, %90, %91, %92, %93, %94, %95, %96, %97, %98, %99, %100, %101, %102, %103, " + "%104, %105, %106, %107, %108, %109, %110, %111, %112, %113, %114, %115, %116, %117, %118, %119, %120, %121, %122, " + "%123, %124, %125, %126, %127, %128};" + : + : "r"(__taddr), + "r"(__as_b32(__values[0])), + "r"(__as_b32(__values[1])), + "r"(__as_b32(__values[2])), + "r"(__as_b32(__values[3])), + "r"(__as_b32(__values[4])), + "r"(__as_b32(__values[5])), + "r"(__as_b32(__values[6])), + "r"(__as_b32(__values[7])), + "r"(__as_b32(__values[8])), + "r"(__as_b32(__values[9])), + "r"(__as_b32(__values[10])), + "r"(__as_b32(__values[11])), + "r"(__as_b32(__values[12])), + "r"(__as_b32(__values[13])), + "r"(__as_b32(__values[14])), + "r"(__as_b32(__values[15])), + "r"(__as_b32(__values[16])), + "r"(__as_b32(__values[17])), + "r"(__as_b32(__values[18])), + "r"(__as_b32(__values[19])), + "r"(__as_b32(__values[20])), + "r"(__as_b32(__values[21])), + "r"(__as_b32(__values[22])), + "r"(__as_b32(__values[23])), + "r"(__as_b32(__values[24])), + "r"(__as_b32(__values[25])), + "r"(__as_b32(__values[26])), + "r"(__as_b32(__values[27])), + "r"(__as_b32(__values[28])), + "r"(__as_b32(__values[29])), + "r"(__as_b32(__values[30])), + "r"(__as_b32(__values[31])), + "r"(__as_b32(__values[32])), + "r"(__as_b32(__values[33])), + "r"(__as_b32(__values[34])), + "r"(__as_b32(__values[35])), + "r"(__as_b32(__values[36])), + "r"(__as_b32(__values[37])), + "r"(__as_b32(__values[38])), + "r"(__as_b32(__values[39])), + "r"(__as_b32(__values[40])), + "r"(__as_b32(__values[41])), + "r"(__as_b32(__values[42])), + "r"(__as_b32(__values[43])), + "r"(__as_b32(__values[44])), + "r"(__as_b32(__values[45])), + "r"(__as_b32(__values[46])), + "r"(__as_b32(__values[47])), + "r"(__as_b32(__values[48])), + "r"(__as_b32(__values[49])), + "r"(__as_b32(__values[50])), + "r"(__as_b32(__values[51])), + 
"r"(__as_b32(__values[52])), + "r"(__as_b32(__values[53])), + "r"(__as_b32(__values[54])), + "r"(__as_b32(__values[55])), + "r"(__as_b32(__values[56])), + "r"(__as_b32(__values[57])), + "r"(__as_b32(__values[58])), + "r"(__as_b32(__values[59])), + "r"(__as_b32(__values[60])), + "r"(__as_b32(__values[61])), + "r"(__as_b32(__values[62])), + "r"(__as_b32(__values[63])), + "r"(__as_b32(__values[64])), + "r"(__as_b32(__values[65])), + "r"(__as_b32(__values[66])), + "r"(__as_b32(__values[67])), + "r"(__as_b32(__values[68])), + "r"(__as_b32(__values[69])), + "r"(__as_b32(__values[70])), + "r"(__as_b32(__values[71])), + "r"(__as_b32(__values[72])), + "r"(__as_b32(__values[73])), + "r"(__as_b32(__values[74])), + "r"(__as_b32(__values[75])), + "r"(__as_b32(__values[76])), + "r"(__as_b32(__values[77])), + "r"(__as_b32(__values[78])), + "r"(__as_b32(__values[79])), + "r"(__as_b32(__values[80])), + "r"(__as_b32(__values[81])), + "r"(__as_b32(__values[82])), + "r"(__as_b32(__values[83])), + "r"(__as_b32(__values[84])), + "r"(__as_b32(__values[85])), + "r"(__as_b32(__values[86])), + "r"(__as_b32(__values[87])), + "r"(__as_b32(__values[88])), + "r"(__as_b32(__values[89])), + "r"(__as_b32(__values[90])), + "r"(__as_b32(__values[91])), + "r"(__as_b32(__values[92])), + "r"(__as_b32(__values[93])), + "r"(__as_b32(__values[94])), + "r"(__as_b32(__values[95])), + "r"(__as_b32(__values[96])), + "r"(__as_b32(__values[97])), + "r"(__as_b32(__values[98])), + "r"(__as_b32(__values[99])), + "r"(__as_b32(__values[100])), + "r"(__as_b32(__values[101])), + "r"(__as_b32(__values[102])), + "r"(__as_b32(__values[103])), + "r"(__as_b32(__values[104])), + "r"(__as_b32(__values[105])), + "r"(__as_b32(__values[106])), + "r"(__as_b32(__values[107])), + "r"(__as_b32(__values[108])), + "r"(__as_b32(__values[109])), + "r"(__as_b32(__values[110])), + "r"(__as_b32(__values[111])), + "r"(__as_b32(__values[112])), + "r"(__as_b32(__values[113])), + "r"(__as_b32(__values[114])), + "r"(__as_b32(__values[115])), + "r"(__as_b32(__values[116])), + "r"(__as_b32(__values[117])), + "r"(__as_b32(__values[118])), + "r"(__as_b32(__values[119])), + "r"(__as_b32(__values[120])), + "r"(__as_b32(__values[121])), + "r"(__as_b32(__values[122])), + "r"(__as_b32(__values[123])), + "r"(__as_b32(__values[124])), + "r"(__as_b32(__values[125])), + "r"(__as_b32(__values[126])), + "r"(__as_b32(__values[127])) + : "memory"); +# else + // Unsupported architectures will have a linker error with a semi-decent error message + __cuda_ptx_tcgen05_st_32x32b_is_not_supported_before_SM_100a_SM_101a__(); +# endif +} +#endif // __cccl_ptx_isa >= 860 + +/* +// tcgen05.st.sync.aligned.32x32b.x128.unpack::16b.b32 [taddr], values; // PTX ISA 86, SM_100a, SM_101a +template = true> +__device__ static inline void tcgen05_st_32x32b_unpack_16b( + uint32_t taddr, + const B32 (&values)[128]); +*/ +#if __cccl_ptx_isa >= 860 +extern "C" _CCCL_DEVICE void __cuda_ptx_tcgen05_st_32x32b_unpack_16b_is_not_supported_before_SM_100a_SM_101a__(); +template = true> +_CCCL_DEVICE static inline void tcgen05_st_32x32b_unpack_16b(_CUDA_VSTD::uint32_t __taddr, const _B32 (&__values)[128]) +{ + static_assert(sizeof(_B32) == 4, ""); +# if _CCCL_CUDA_COMPILER(NVHPC) || __CUDA_ARCH_FEAT_SM100_ALL || __CUDA_ARCH_FEAT_SM101_ALL + asm( + "tcgen05.st.sync.aligned.32x32b.x128.unpack::16b.b32 [%0], {%1, %2, %3, %4, %5, %6, %7, %8, %9, %10, %11, %12, " + "%13, %14, %15, %16, %17, %18, %19, %20, %21, %22, %23, %24, %25, %26, %27, %28, %29, %30, %31, %32, %33, %34, " + "%35, %36, %37, %38, %39, %40, %41, %42, 
%43, %44, %45, %46, %47, %48, %49, %50, %51, %52, %53, %54, %55, %56, " + "%57, %58, %59, %60, %61, %62, %63, %64, %65, %66, %67, %68, %69, %70, %71, %72, %73, %74, %75, %76, %77, %78, " + "%79, %80, %81, %82, %83, %84, %85, %86, %87, %88, %89, %90, %91, %92, %93, %94, %95, %96, %97, %98, %99, %100, " + "%101, %102, %103, %104, %105, %106, %107, %108, %109, %110, %111, %112, %113, %114, %115, %116, %117, %118, %119, " + "%120, %121, %122, %123, %124, %125, %126, %127, %128};" + : + : "r"(__taddr), + "r"(__as_b32(__values[0])), + "r"(__as_b32(__values[1])), + "r"(__as_b32(__values[2])), + "r"(__as_b32(__values[3])), + "r"(__as_b32(__values[4])), + "r"(__as_b32(__values[5])), + "r"(__as_b32(__values[6])), + "r"(__as_b32(__values[7])), + "r"(__as_b32(__values[8])), + "r"(__as_b32(__values[9])), + "r"(__as_b32(__values[10])), + "r"(__as_b32(__values[11])), + "r"(__as_b32(__values[12])), + "r"(__as_b32(__values[13])), + "r"(__as_b32(__values[14])), + "r"(__as_b32(__values[15])), + "r"(__as_b32(__values[16])), + "r"(__as_b32(__values[17])), + "r"(__as_b32(__values[18])), + "r"(__as_b32(__values[19])), + "r"(__as_b32(__values[20])), + "r"(__as_b32(__values[21])), + "r"(__as_b32(__values[22])), + "r"(__as_b32(__values[23])), + "r"(__as_b32(__values[24])), + "r"(__as_b32(__values[25])), + "r"(__as_b32(__values[26])), + "r"(__as_b32(__values[27])), + "r"(__as_b32(__values[28])), + "r"(__as_b32(__values[29])), + "r"(__as_b32(__values[30])), + "r"(__as_b32(__values[31])), + "r"(__as_b32(__values[32])), + "r"(__as_b32(__values[33])), + "r"(__as_b32(__values[34])), + "r"(__as_b32(__values[35])), + "r"(__as_b32(__values[36])), + "r"(__as_b32(__values[37])), + "r"(__as_b32(__values[38])), + "r"(__as_b32(__values[39])), + "r"(__as_b32(__values[40])), + "r"(__as_b32(__values[41])), + "r"(__as_b32(__values[42])), + "r"(__as_b32(__values[43])), + "r"(__as_b32(__values[44])), + "r"(__as_b32(__values[45])), + "r"(__as_b32(__values[46])), + "r"(__as_b32(__values[47])), + "r"(__as_b32(__values[48])), + "r"(__as_b32(__values[49])), + "r"(__as_b32(__values[50])), + "r"(__as_b32(__values[51])), + "r"(__as_b32(__values[52])), + "r"(__as_b32(__values[53])), + "r"(__as_b32(__values[54])), + "r"(__as_b32(__values[55])), + "r"(__as_b32(__values[56])), + "r"(__as_b32(__values[57])), + "r"(__as_b32(__values[58])), + "r"(__as_b32(__values[59])), + "r"(__as_b32(__values[60])), + "r"(__as_b32(__values[61])), + "r"(__as_b32(__values[62])), + "r"(__as_b32(__values[63])), + "r"(__as_b32(__values[64])), + "r"(__as_b32(__values[65])), + "r"(__as_b32(__values[66])), + "r"(__as_b32(__values[67])), + "r"(__as_b32(__values[68])), + "r"(__as_b32(__values[69])), + "r"(__as_b32(__values[70])), + "r"(__as_b32(__values[71])), + "r"(__as_b32(__values[72])), + "r"(__as_b32(__values[73])), + "r"(__as_b32(__values[74])), + "r"(__as_b32(__values[75])), + "r"(__as_b32(__values[76])), + "r"(__as_b32(__values[77])), + "r"(__as_b32(__values[78])), + "r"(__as_b32(__values[79])), + "r"(__as_b32(__values[80])), + "r"(__as_b32(__values[81])), + "r"(__as_b32(__values[82])), + "r"(__as_b32(__values[83])), + "r"(__as_b32(__values[84])), + "r"(__as_b32(__values[85])), + "r"(__as_b32(__values[86])), + "r"(__as_b32(__values[87])), + "r"(__as_b32(__values[88])), + "r"(__as_b32(__values[89])), + "r"(__as_b32(__values[90])), + "r"(__as_b32(__values[91])), + "r"(__as_b32(__values[92])), + "r"(__as_b32(__values[93])), + "r"(__as_b32(__values[94])), + "r"(__as_b32(__values[95])), + "r"(__as_b32(__values[96])), + "r"(__as_b32(__values[97])), + 
"r"(__as_b32(__values[98])), + "r"(__as_b32(__values[99])), + "r"(__as_b32(__values[100])), + "r"(__as_b32(__values[101])), + "r"(__as_b32(__values[102])), + "r"(__as_b32(__values[103])), + "r"(__as_b32(__values[104])), + "r"(__as_b32(__values[105])), + "r"(__as_b32(__values[106])), + "r"(__as_b32(__values[107])), + "r"(__as_b32(__values[108])), + "r"(__as_b32(__values[109])), + "r"(__as_b32(__values[110])), + "r"(__as_b32(__values[111])), + "r"(__as_b32(__values[112])), + "r"(__as_b32(__values[113])), + "r"(__as_b32(__values[114])), + "r"(__as_b32(__values[115])), + "r"(__as_b32(__values[116])), + "r"(__as_b32(__values[117])), + "r"(__as_b32(__values[118])), + "r"(__as_b32(__values[119])), + "r"(__as_b32(__values[120])), + "r"(__as_b32(__values[121])), + "r"(__as_b32(__values[122])), + "r"(__as_b32(__values[123])), + "r"(__as_b32(__values[124])), + "r"(__as_b32(__values[125])), + "r"(__as_b32(__values[126])), + "r"(__as_b32(__values[127])) + : "memory"); +# else + // Unsupported architectures will have a linker error with a semi-decent error message + __cuda_ptx_tcgen05_st_32x32b_unpack_16b_is_not_supported_before_SM_100a_SM_101a__(); +# endif +} +#endif // __cccl_ptx_isa >= 860 + +/* +// tcgen05.st.sync.aligned.16x32bx2.x1.b32 [taddr], immHalfSplitoff, values; // PTX ISA 86, SM_100a, SM_101a +template = true> +__device__ static inline void tcgen05_st_16x32bx2( + uint32_t taddr, + cuda::ptx::n32_t immHalfSplitoff, + const B32 (&values)[1]); +*/ +#if __cccl_ptx_isa >= 860 +extern "C" _CCCL_DEVICE void __cuda_ptx_tcgen05_st_16x32bx2_is_not_supported_before_SM_100a_SM_101a__(); +template = true> +_CCCL_DEVICE static inline void +tcgen05_st_16x32bx2(_CUDA_VSTD::uint32_t __taddr, n32_t<_N32> __immHalfSplitoff, const _B32 (&__values)[1]) +{ + static_assert(sizeof(_B32) == 4, ""); +# if _CCCL_CUDA_COMPILER(NVHPC) || __CUDA_ARCH_FEAT_SM100_ALL || __CUDA_ARCH_FEAT_SM101_ALL + asm("tcgen05.st.sync.aligned.16x32bx2.x1.b32 [%0], %1, {%2};" + : + : "r"(__taddr), "n"(__immHalfSplitoff.value), "r"(__as_b32(__values[0])) + : "memory"); +# else + // Unsupported architectures will have a linker error with a semi-decent error message + __cuda_ptx_tcgen05_st_16x32bx2_is_not_supported_before_SM_100a_SM_101a__(); +# endif +} +#endif // __cccl_ptx_isa >= 860 + +/* +// tcgen05.st.sync.aligned.16x32bx2.x1.unpack::16b.b32 [taddr], immHalfSplitoff, values; // PTX ISA 86, SM_100a, SM_101a +template = true> +__device__ static inline void tcgen05_st_16x32bx2_unpack_16b( + uint32_t taddr, + cuda::ptx::n32_t immHalfSplitoff, + const B32 (&values)[1]); +*/ +#if __cccl_ptx_isa >= 860 +extern "C" _CCCL_DEVICE void __cuda_ptx_tcgen05_st_16x32bx2_unpack_16b_is_not_supported_before_SM_100a_SM_101a__(); +template = true> +_CCCL_DEVICE static inline void +tcgen05_st_16x32bx2_unpack_16b(_CUDA_VSTD::uint32_t __taddr, n32_t<_N32> __immHalfSplitoff, const _B32 (&__values)[1]) +{ + static_assert(sizeof(_B32) == 4, ""); +# if _CCCL_CUDA_COMPILER(NVHPC) || __CUDA_ARCH_FEAT_SM100_ALL || __CUDA_ARCH_FEAT_SM101_ALL + asm("tcgen05.st.sync.aligned.16x32bx2.x1.unpack::16b.b32 [%0], %1, {%2};" + : + : "r"(__taddr), "n"(__immHalfSplitoff.value), "r"(__as_b32(__values[0])) + : "memory"); +# else + // Unsupported architectures will have a linker error with a semi-decent error message + __cuda_ptx_tcgen05_st_16x32bx2_unpack_16b_is_not_supported_before_SM_100a_SM_101a__(); +# endif +} +#endif // __cccl_ptx_isa >= 860 + +/* +// tcgen05.st.sync.aligned.16x32bx2.x2.b32 [taddr], immHalfSplitoff, values; // PTX ISA 86, SM_100a, SM_101a +template = 
true> +__device__ static inline void tcgen05_st_16x32bx2( + uint32_t taddr, + cuda::ptx::n32_t immHalfSplitoff, + const B32 (&values)[2]); +*/ +#if __cccl_ptx_isa >= 860 +extern "C" _CCCL_DEVICE void __cuda_ptx_tcgen05_st_16x32bx2_is_not_supported_before_SM_100a_SM_101a__(); +template = true> +_CCCL_DEVICE static inline void +tcgen05_st_16x32bx2(_CUDA_VSTD::uint32_t __taddr, n32_t<_N32> __immHalfSplitoff, const _B32 (&__values)[2]) +{ + static_assert(sizeof(_B32) == 4, ""); +# if _CCCL_CUDA_COMPILER(NVHPC) || __CUDA_ARCH_FEAT_SM100_ALL || __CUDA_ARCH_FEAT_SM101_ALL + asm("tcgen05.st.sync.aligned.16x32bx2.x2.b32 [%0], %1, {%2, %3};" + : + : "r"(__taddr), "n"(__immHalfSplitoff.value), "r"(__as_b32(__values[0])), "r"(__as_b32(__values[1])) + : "memory"); +# else + // Unsupported architectures will have a linker error with a semi-decent error message + __cuda_ptx_tcgen05_st_16x32bx2_is_not_supported_before_SM_100a_SM_101a__(); +# endif +} +#endif // __cccl_ptx_isa >= 860 + +/* +// tcgen05.st.sync.aligned.16x32bx2.x2.unpack::16b.b32 [taddr], immHalfSplitoff, values; // PTX ISA 86, SM_100a, SM_101a +template = true> +__device__ static inline void tcgen05_st_16x32bx2_unpack_16b( + uint32_t taddr, + cuda::ptx::n32_t immHalfSplitoff, + const B32 (&values)[2]); +*/ +#if __cccl_ptx_isa >= 860 +extern "C" _CCCL_DEVICE void __cuda_ptx_tcgen05_st_16x32bx2_unpack_16b_is_not_supported_before_SM_100a_SM_101a__(); +template = true> +_CCCL_DEVICE static inline void +tcgen05_st_16x32bx2_unpack_16b(_CUDA_VSTD::uint32_t __taddr, n32_t<_N32> __immHalfSplitoff, const _B32 (&__values)[2]) +{ + static_assert(sizeof(_B32) == 4, ""); +# if _CCCL_CUDA_COMPILER(NVHPC) || __CUDA_ARCH_FEAT_SM100_ALL || __CUDA_ARCH_FEAT_SM101_ALL + asm("tcgen05.st.sync.aligned.16x32bx2.x2.unpack::16b.b32 [%0], %1, {%2, %3};" + : + : "r"(__taddr), "n"(__immHalfSplitoff.value), "r"(__as_b32(__values[0])), "r"(__as_b32(__values[1])) + : "memory"); +# else + // Unsupported architectures will have a linker error with a semi-decent error message + __cuda_ptx_tcgen05_st_16x32bx2_unpack_16b_is_not_supported_before_SM_100a_SM_101a__(); +# endif +} +#endif // __cccl_ptx_isa >= 860 + +/* +// tcgen05.st.sync.aligned.16x32bx2.x4.b32 [taddr], immHalfSplitoff, values; // PTX ISA 86, SM_100a, SM_101a +template = true> +__device__ static inline void tcgen05_st_16x32bx2( + uint32_t taddr, + cuda::ptx::n32_t immHalfSplitoff, + const B32 (&values)[4]); +*/ +#if __cccl_ptx_isa >= 860 +extern "C" _CCCL_DEVICE void __cuda_ptx_tcgen05_st_16x32bx2_is_not_supported_before_SM_100a_SM_101a__(); +template = true> +_CCCL_DEVICE static inline void +tcgen05_st_16x32bx2(_CUDA_VSTD::uint32_t __taddr, n32_t<_N32> __immHalfSplitoff, const _B32 (&__values)[4]) +{ + static_assert(sizeof(_B32) == 4, ""); +# if _CCCL_CUDA_COMPILER(NVHPC) || __CUDA_ARCH_FEAT_SM100_ALL || __CUDA_ARCH_FEAT_SM101_ALL + asm("tcgen05.st.sync.aligned.16x32bx2.x4.b32 [%0], %1, {%2, %3, %4, %5};" + : + : "r"(__taddr), + "n"(__immHalfSplitoff.value), + "r"(__as_b32(__values[0])), + "r"(__as_b32(__values[1])), + "r"(__as_b32(__values[2])), + "r"(__as_b32(__values[3])) + : "memory"); +# else + // Unsupported architectures will have a linker error with a semi-decent error message + __cuda_ptx_tcgen05_st_16x32bx2_is_not_supported_before_SM_100a_SM_101a__(); +# endif +} +#endif // __cccl_ptx_isa >= 860 + +/* +// tcgen05.st.sync.aligned.16x32bx2.x4.unpack::16b.b32 [taddr], immHalfSplitoff, values; // PTX ISA 86, SM_100a, SM_101a +template = true> +__device__ static inline void tcgen05_st_16x32bx2_unpack_16b( + 
uint32_t taddr, + cuda::ptx::n32_t immHalfSplitoff, + const B32 (&values)[4]); +*/ +#if __cccl_ptx_isa >= 860 +extern "C" _CCCL_DEVICE void __cuda_ptx_tcgen05_st_16x32bx2_unpack_16b_is_not_supported_before_SM_100a_SM_101a__(); +template = true> +_CCCL_DEVICE static inline void +tcgen05_st_16x32bx2_unpack_16b(_CUDA_VSTD::uint32_t __taddr, n32_t<_N32> __immHalfSplitoff, const _B32 (&__values)[4]) +{ + static_assert(sizeof(_B32) == 4, ""); +# if _CCCL_CUDA_COMPILER(NVHPC) || __CUDA_ARCH_FEAT_SM100_ALL || __CUDA_ARCH_FEAT_SM101_ALL + asm("tcgen05.st.sync.aligned.16x32bx2.x4.unpack::16b.b32 [%0], %1, {%2, %3, %4, %5};" + : + : "r"(__taddr), + "n"(__immHalfSplitoff.value), + "r"(__as_b32(__values[0])), + "r"(__as_b32(__values[1])), + "r"(__as_b32(__values[2])), + "r"(__as_b32(__values[3])) + : "memory"); +# else + // Unsupported architectures will have a linker error with a semi-decent error message + __cuda_ptx_tcgen05_st_16x32bx2_unpack_16b_is_not_supported_before_SM_100a_SM_101a__(); +# endif +} +#endif // __cccl_ptx_isa >= 860 + +/* +// tcgen05.st.sync.aligned.16x32bx2.x8.b32 [taddr], immHalfSplitoff, values; // PTX ISA 86, SM_100a, SM_101a +template = true> +__device__ static inline void tcgen05_st_16x32bx2( + uint32_t taddr, + cuda::ptx::n32_t immHalfSplitoff, + const B32 (&values)[8]); +*/ +#if __cccl_ptx_isa >= 860 +extern "C" _CCCL_DEVICE void __cuda_ptx_tcgen05_st_16x32bx2_is_not_supported_before_SM_100a_SM_101a__(); +template = true> +_CCCL_DEVICE static inline void +tcgen05_st_16x32bx2(_CUDA_VSTD::uint32_t __taddr, n32_t<_N32> __immHalfSplitoff, const _B32 (&__values)[8]) +{ + static_assert(sizeof(_B32) == 4, ""); +# if _CCCL_CUDA_COMPILER(NVHPC) || __CUDA_ARCH_FEAT_SM100_ALL || __CUDA_ARCH_FEAT_SM101_ALL + asm("tcgen05.st.sync.aligned.16x32bx2.x8.b32 [%0], %1, {%2, %3, %4, %5, %6, %7, %8, %9};" + : + : "r"(__taddr), + "n"(__immHalfSplitoff.value), + "r"(__as_b32(__values[0])), + "r"(__as_b32(__values[1])), + "r"(__as_b32(__values[2])), + "r"(__as_b32(__values[3])), + "r"(__as_b32(__values[4])), + "r"(__as_b32(__values[5])), + "r"(__as_b32(__values[6])), + "r"(__as_b32(__values[7])) + : "memory"); +# else + // Unsupported architectures will have a linker error with a semi-decent error message + __cuda_ptx_tcgen05_st_16x32bx2_is_not_supported_before_SM_100a_SM_101a__(); +# endif +} +#endif // __cccl_ptx_isa >= 860 + +/* +// tcgen05.st.sync.aligned.16x32bx2.x8.unpack::16b.b32 [taddr], immHalfSplitoff, values; // PTX ISA 86, SM_100a, SM_101a +template = true> +__device__ static inline void tcgen05_st_16x32bx2_unpack_16b( + uint32_t taddr, + cuda::ptx::n32_t immHalfSplitoff, + const B32 (&values)[8]); +*/ +#if __cccl_ptx_isa >= 860 +extern "C" _CCCL_DEVICE void __cuda_ptx_tcgen05_st_16x32bx2_unpack_16b_is_not_supported_before_SM_100a_SM_101a__(); +template = true> +_CCCL_DEVICE static inline void +tcgen05_st_16x32bx2_unpack_16b(_CUDA_VSTD::uint32_t __taddr, n32_t<_N32> __immHalfSplitoff, const _B32 (&__values)[8]) +{ + static_assert(sizeof(_B32) == 4, ""); +# if _CCCL_CUDA_COMPILER(NVHPC) || __CUDA_ARCH_FEAT_SM100_ALL || __CUDA_ARCH_FEAT_SM101_ALL + asm("tcgen05.st.sync.aligned.16x32bx2.x8.unpack::16b.b32 [%0], %1, {%2, %3, %4, %5, %6, %7, %8, %9};" + : + : "r"(__taddr), + "n"(__immHalfSplitoff.value), + "r"(__as_b32(__values[0])), + "r"(__as_b32(__values[1])), + "r"(__as_b32(__values[2])), + "r"(__as_b32(__values[3])), + "r"(__as_b32(__values[4])), + "r"(__as_b32(__values[5])), + "r"(__as_b32(__values[6])), + "r"(__as_b32(__values[7])) + : "memory"); +# else + // Unsupported 
architectures will have a linker error with a semi-decent error message + __cuda_ptx_tcgen05_st_16x32bx2_unpack_16b_is_not_supported_before_SM_100a_SM_101a__(); +# endif +} +#endif // __cccl_ptx_isa >= 860 + +/* +// tcgen05.st.sync.aligned.16x32bx2.x16.b32 [taddr], immHalfSplitoff, values; // PTX ISA 86, SM_100a, SM_101a +template = true> +__device__ static inline void tcgen05_st_16x32bx2( + uint32_t taddr, + cuda::ptx::n32_t immHalfSplitoff, + const B32 (&values)[16]); +*/ +#if __cccl_ptx_isa >= 860 +extern "C" _CCCL_DEVICE void __cuda_ptx_tcgen05_st_16x32bx2_is_not_supported_before_SM_100a_SM_101a__(); +template = true> +_CCCL_DEVICE static inline void +tcgen05_st_16x32bx2(_CUDA_VSTD::uint32_t __taddr, n32_t<_N32> __immHalfSplitoff, const _B32 (&__values)[16]) +{ + static_assert(sizeof(_B32) == 4, ""); +# if _CCCL_CUDA_COMPILER(NVHPC) || __CUDA_ARCH_FEAT_SM100_ALL || __CUDA_ARCH_FEAT_SM101_ALL + asm("tcgen05.st.sync.aligned.16x32bx2.x16.b32 [%0], %1, {%2, %3, %4, %5, %6, %7, %8, %9, %10, %11, %12, %13, %14, " + "%15, %16, %17};" + : + : "r"(__taddr), + "n"(__immHalfSplitoff.value), + "r"(__as_b32(__values[0])), + "r"(__as_b32(__values[1])), + "r"(__as_b32(__values[2])), + "r"(__as_b32(__values[3])), + "r"(__as_b32(__values[4])), + "r"(__as_b32(__values[5])), + "r"(__as_b32(__values[6])), + "r"(__as_b32(__values[7])), + "r"(__as_b32(__values[8])), + "r"(__as_b32(__values[9])), + "r"(__as_b32(__values[10])), + "r"(__as_b32(__values[11])), + "r"(__as_b32(__values[12])), + "r"(__as_b32(__values[13])), + "r"(__as_b32(__values[14])), + "r"(__as_b32(__values[15])) + : "memory"); +# else + // Unsupported architectures will have a linker error with a semi-decent error message + __cuda_ptx_tcgen05_st_16x32bx2_is_not_supported_before_SM_100a_SM_101a__(); +# endif +} +#endif // __cccl_ptx_isa >= 860 + +/* +// tcgen05.st.sync.aligned.16x32bx2.x16.unpack::16b.b32 [taddr], immHalfSplitoff, values; // PTX ISA 86, SM_100a, +SM_101a template = true> +__device__ static inline void tcgen05_st_16x32bx2_unpack_16b( + uint32_t taddr, + cuda::ptx::n32_t immHalfSplitoff, + const B32 (&values)[16]); +*/ +#if __cccl_ptx_isa >= 860 +extern "C" _CCCL_DEVICE void __cuda_ptx_tcgen05_st_16x32bx2_unpack_16b_is_not_supported_before_SM_100a_SM_101a__(); +template = true> +_CCCL_DEVICE static inline void +tcgen05_st_16x32bx2_unpack_16b(_CUDA_VSTD::uint32_t __taddr, n32_t<_N32> __immHalfSplitoff, const _B32 (&__values)[16]) +{ + static_assert(sizeof(_B32) == 4, ""); +# if _CCCL_CUDA_COMPILER(NVHPC) || __CUDA_ARCH_FEAT_SM100_ALL || __CUDA_ARCH_FEAT_SM101_ALL + asm("tcgen05.st.sync.aligned.16x32bx2.x16.unpack::16b.b32 [%0], %1, {%2, %3, %4, %5, %6, %7, %8, %9, %10, %11, %12, " + "%13, %14, %15, %16, %17};" + : + : "r"(__taddr), + "n"(__immHalfSplitoff.value), + "r"(__as_b32(__values[0])), + "r"(__as_b32(__values[1])), + "r"(__as_b32(__values[2])), + "r"(__as_b32(__values[3])), + "r"(__as_b32(__values[4])), + "r"(__as_b32(__values[5])), + "r"(__as_b32(__values[6])), + "r"(__as_b32(__values[7])), + "r"(__as_b32(__values[8])), + "r"(__as_b32(__values[9])), + "r"(__as_b32(__values[10])), + "r"(__as_b32(__values[11])), + "r"(__as_b32(__values[12])), + "r"(__as_b32(__values[13])), + "r"(__as_b32(__values[14])), + "r"(__as_b32(__values[15])) + : "memory"); +# else + // Unsupported architectures will have a linker error with a semi-decent error message + __cuda_ptx_tcgen05_st_16x32bx2_unpack_16b_is_not_supported_before_SM_100a_SM_101a__(); +# endif +} +#endif // __cccl_ptx_isa >= 860 + +/* +// tcgen05.st.sync.aligned.16x32bx2.x32.b32 
[taddr], immHalfSplitoff, values; // PTX ISA 86, SM_100a, SM_101a +template = true> +__device__ static inline void tcgen05_st_16x32bx2( + uint32_t taddr, + cuda::ptx::n32_t immHalfSplitoff, + const B32 (&values)[32]); +*/ +#if __cccl_ptx_isa >= 860 +extern "C" _CCCL_DEVICE void __cuda_ptx_tcgen05_st_16x32bx2_is_not_supported_before_SM_100a_SM_101a__(); +template = true> +_CCCL_DEVICE static inline void +tcgen05_st_16x32bx2(_CUDA_VSTD::uint32_t __taddr, n32_t<_N32> __immHalfSplitoff, const _B32 (&__values)[32]) +{ + static_assert(sizeof(_B32) == 4, ""); +# if _CCCL_CUDA_COMPILER(NVHPC) || __CUDA_ARCH_FEAT_SM100_ALL || __CUDA_ARCH_FEAT_SM101_ALL + asm( + "tcgen05.st.sync.aligned.16x32bx2.x32.b32 [%0], %1, {%2, %3, %4, %5, %6, %7, %8, %9, %10, %11, %12, %13, %14, %15, " + "%16, %17, %18, %19, %20, %21, %22, %23, %24, %25, %26, %27, %28, %29, %30, %31, %32, %33};" + : + : "r"(__taddr), + "n"(__immHalfSplitoff.value), + "r"(__as_b32(__values[0])), + "r"(__as_b32(__values[1])), + "r"(__as_b32(__values[2])), + "r"(__as_b32(__values[3])), + "r"(__as_b32(__values[4])), + "r"(__as_b32(__values[5])), + "r"(__as_b32(__values[6])), + "r"(__as_b32(__values[7])), + "r"(__as_b32(__values[8])), + "r"(__as_b32(__values[9])), + "r"(__as_b32(__values[10])), + "r"(__as_b32(__values[11])), + "r"(__as_b32(__values[12])), + "r"(__as_b32(__values[13])), + "r"(__as_b32(__values[14])), + "r"(__as_b32(__values[15])), + "r"(__as_b32(__values[16])), + "r"(__as_b32(__values[17])), + "r"(__as_b32(__values[18])), + "r"(__as_b32(__values[19])), + "r"(__as_b32(__values[20])), + "r"(__as_b32(__values[21])), + "r"(__as_b32(__values[22])), + "r"(__as_b32(__values[23])), + "r"(__as_b32(__values[24])), + "r"(__as_b32(__values[25])), + "r"(__as_b32(__values[26])), + "r"(__as_b32(__values[27])), + "r"(__as_b32(__values[28])), + "r"(__as_b32(__values[29])), + "r"(__as_b32(__values[30])), + "r"(__as_b32(__values[31])) + : "memory"); +# else + // Unsupported architectures will have a linker error with a semi-decent error message + __cuda_ptx_tcgen05_st_16x32bx2_is_not_supported_before_SM_100a_SM_101a__(); +# endif +} +#endif // __cccl_ptx_isa >= 860 + +/* +// tcgen05.st.sync.aligned.16x32bx2.x32.unpack::16b.b32 [taddr], immHalfSplitoff, values; // PTX ISA 86, SM_100a, +SM_101a template = true> +__device__ static inline void tcgen05_st_16x32bx2_unpack_16b( + uint32_t taddr, + cuda::ptx::n32_t immHalfSplitoff, + const B32 (&values)[32]); +*/ +#if __cccl_ptx_isa >= 860 +extern "C" _CCCL_DEVICE void __cuda_ptx_tcgen05_st_16x32bx2_unpack_16b_is_not_supported_before_SM_100a_SM_101a__(); +template = true> +_CCCL_DEVICE static inline void +tcgen05_st_16x32bx2_unpack_16b(_CUDA_VSTD::uint32_t __taddr, n32_t<_N32> __immHalfSplitoff, const _B32 (&__values)[32]) +{ + static_assert(sizeof(_B32) == 4, ""); +# if _CCCL_CUDA_COMPILER(NVHPC) || __CUDA_ARCH_FEAT_SM100_ALL || __CUDA_ARCH_FEAT_SM101_ALL + asm( + "tcgen05.st.sync.aligned.16x32bx2.x32.unpack::16b.b32 [%0], %1, {%2, %3, %4, %5, %6, %7, %8, %9, %10, %11, %12, " + "%13, %14, %15, %16, %17, %18, %19, %20, %21, %22, %23, %24, %25, %26, %27, %28, %29, %30, %31, %32, %33};" + : + : "r"(__taddr), + "n"(__immHalfSplitoff.value), + "r"(__as_b32(__values[0])), + "r"(__as_b32(__values[1])), + "r"(__as_b32(__values[2])), + "r"(__as_b32(__values[3])), + "r"(__as_b32(__values[4])), + "r"(__as_b32(__values[5])), + "r"(__as_b32(__values[6])), + "r"(__as_b32(__values[7])), + "r"(__as_b32(__values[8])), + "r"(__as_b32(__values[9])), + "r"(__as_b32(__values[10])), + "r"(__as_b32(__values[11])), + 
"r"(__as_b32(__values[12])), + "r"(__as_b32(__values[13])), + "r"(__as_b32(__values[14])), + "r"(__as_b32(__values[15])), + "r"(__as_b32(__values[16])), + "r"(__as_b32(__values[17])), + "r"(__as_b32(__values[18])), + "r"(__as_b32(__values[19])), + "r"(__as_b32(__values[20])), + "r"(__as_b32(__values[21])), + "r"(__as_b32(__values[22])), + "r"(__as_b32(__values[23])), + "r"(__as_b32(__values[24])), + "r"(__as_b32(__values[25])), + "r"(__as_b32(__values[26])), + "r"(__as_b32(__values[27])), + "r"(__as_b32(__values[28])), + "r"(__as_b32(__values[29])), + "r"(__as_b32(__values[30])), + "r"(__as_b32(__values[31])) + : "memory"); +# else + // Unsupported architectures will have a linker error with a semi-decent error message + __cuda_ptx_tcgen05_st_16x32bx2_unpack_16b_is_not_supported_before_SM_100a_SM_101a__(); +# endif +} +#endif // __cccl_ptx_isa >= 860 + +/* +// tcgen05.st.sync.aligned.16x32bx2.x64.b32 [taddr], immHalfSplitoff, values; // PTX ISA 86, SM_100a, SM_101a +template = true> +__device__ static inline void tcgen05_st_16x32bx2( + uint32_t taddr, + cuda::ptx::n32_t immHalfSplitoff, + const B32 (&values)[64]); +*/ +#if __cccl_ptx_isa >= 860 +extern "C" _CCCL_DEVICE void __cuda_ptx_tcgen05_st_16x32bx2_is_not_supported_before_SM_100a_SM_101a__(); +template = true> +_CCCL_DEVICE static inline void +tcgen05_st_16x32bx2(_CUDA_VSTD::uint32_t __taddr, n32_t<_N32> __immHalfSplitoff, const _B32 (&__values)[64]) +{ + static_assert(sizeof(_B32) == 4, ""); +# if _CCCL_CUDA_COMPILER(NVHPC) || __CUDA_ARCH_FEAT_SM100_ALL || __CUDA_ARCH_FEAT_SM101_ALL + asm( + "tcgen05.st.sync.aligned.16x32bx2.x64.b32 [%0], %1, {%2, %3, %4, %5, %6, %7, %8, %9, %10, %11, %12, %13, %14, %15, " + "%16, %17, %18, %19, %20, %21, %22, %23, %24, %25, %26, %27, %28, %29, %30, %31, %32, %33, %34, %35, %36, %37, " + "%38, %39, %40, %41, %42, %43, %44, %45, %46, %47, %48, %49, %50, %51, %52, %53, %54, %55, %56, %57, %58, %59, " + "%60, %61, %62, %63, %64, %65};" + : + : "r"(__taddr), + "n"(__immHalfSplitoff.value), + "r"(__as_b32(__values[0])), + "r"(__as_b32(__values[1])), + "r"(__as_b32(__values[2])), + "r"(__as_b32(__values[3])), + "r"(__as_b32(__values[4])), + "r"(__as_b32(__values[5])), + "r"(__as_b32(__values[6])), + "r"(__as_b32(__values[7])), + "r"(__as_b32(__values[8])), + "r"(__as_b32(__values[9])), + "r"(__as_b32(__values[10])), + "r"(__as_b32(__values[11])), + "r"(__as_b32(__values[12])), + "r"(__as_b32(__values[13])), + "r"(__as_b32(__values[14])), + "r"(__as_b32(__values[15])), + "r"(__as_b32(__values[16])), + "r"(__as_b32(__values[17])), + "r"(__as_b32(__values[18])), + "r"(__as_b32(__values[19])), + "r"(__as_b32(__values[20])), + "r"(__as_b32(__values[21])), + "r"(__as_b32(__values[22])), + "r"(__as_b32(__values[23])), + "r"(__as_b32(__values[24])), + "r"(__as_b32(__values[25])), + "r"(__as_b32(__values[26])), + "r"(__as_b32(__values[27])), + "r"(__as_b32(__values[28])), + "r"(__as_b32(__values[29])), + "r"(__as_b32(__values[30])), + "r"(__as_b32(__values[31])), + "r"(__as_b32(__values[32])), + "r"(__as_b32(__values[33])), + "r"(__as_b32(__values[34])), + "r"(__as_b32(__values[35])), + "r"(__as_b32(__values[36])), + "r"(__as_b32(__values[37])), + "r"(__as_b32(__values[38])), + "r"(__as_b32(__values[39])), + "r"(__as_b32(__values[40])), + "r"(__as_b32(__values[41])), + "r"(__as_b32(__values[42])), + "r"(__as_b32(__values[43])), + "r"(__as_b32(__values[44])), + "r"(__as_b32(__values[45])), + "r"(__as_b32(__values[46])), + "r"(__as_b32(__values[47])), + "r"(__as_b32(__values[48])), + "r"(__as_b32(__values[49])), + 
"r"(__as_b32(__values[50])), + "r"(__as_b32(__values[51])), + "r"(__as_b32(__values[52])), + "r"(__as_b32(__values[53])), + "r"(__as_b32(__values[54])), + "r"(__as_b32(__values[55])), + "r"(__as_b32(__values[56])), + "r"(__as_b32(__values[57])), + "r"(__as_b32(__values[58])), + "r"(__as_b32(__values[59])), + "r"(__as_b32(__values[60])), + "r"(__as_b32(__values[61])), + "r"(__as_b32(__values[62])), + "r"(__as_b32(__values[63])) + : "memory"); +# else + // Unsupported architectures will have a linker error with a semi-decent error message + __cuda_ptx_tcgen05_st_16x32bx2_is_not_supported_before_SM_100a_SM_101a__(); +# endif +} +#endif // __cccl_ptx_isa >= 860 + +/* +// tcgen05.st.sync.aligned.16x32bx2.x64.unpack::16b.b32 [taddr], immHalfSplitoff, values; // PTX ISA 86, SM_100a, +SM_101a template = true> +__device__ static inline void tcgen05_st_16x32bx2_unpack_16b( + uint32_t taddr, + cuda::ptx::n32_t immHalfSplitoff, + const B32 (&values)[64]); +*/ +#if __cccl_ptx_isa >= 860 +extern "C" _CCCL_DEVICE void __cuda_ptx_tcgen05_st_16x32bx2_unpack_16b_is_not_supported_before_SM_100a_SM_101a__(); +template = true> +_CCCL_DEVICE static inline void +tcgen05_st_16x32bx2_unpack_16b(_CUDA_VSTD::uint32_t __taddr, n32_t<_N32> __immHalfSplitoff, const _B32 (&__values)[64]) +{ + static_assert(sizeof(_B32) == 4, ""); +# if _CCCL_CUDA_COMPILER(NVHPC) || __CUDA_ARCH_FEAT_SM100_ALL || __CUDA_ARCH_FEAT_SM101_ALL + asm( + "tcgen05.st.sync.aligned.16x32bx2.x64.unpack::16b.b32 [%0], %1, {%2, %3, %4, %5, %6, %7, %8, %9, %10, %11, %12, " + "%13, %14, %15, %16, %17, %18, %19, %20, %21, %22, %23, %24, %25, %26, %27, %28, %29, %30, %31, %32, %33, %34, " + "%35, %36, %37, %38, %39, %40, %41, %42, %43, %44, %45, %46, %47, %48, %49, %50, %51, %52, %53, %54, %55, %56, " + "%57, %58, %59, %60, %61, %62, %63, %64, %65};" + : + : "r"(__taddr), + "n"(__immHalfSplitoff.value), + "r"(__as_b32(__values[0])), + "r"(__as_b32(__values[1])), + "r"(__as_b32(__values[2])), + "r"(__as_b32(__values[3])), + "r"(__as_b32(__values[4])), + "r"(__as_b32(__values[5])), + "r"(__as_b32(__values[6])), + "r"(__as_b32(__values[7])), + "r"(__as_b32(__values[8])), + "r"(__as_b32(__values[9])), + "r"(__as_b32(__values[10])), + "r"(__as_b32(__values[11])), + "r"(__as_b32(__values[12])), + "r"(__as_b32(__values[13])), + "r"(__as_b32(__values[14])), + "r"(__as_b32(__values[15])), + "r"(__as_b32(__values[16])), + "r"(__as_b32(__values[17])), + "r"(__as_b32(__values[18])), + "r"(__as_b32(__values[19])), + "r"(__as_b32(__values[20])), + "r"(__as_b32(__values[21])), + "r"(__as_b32(__values[22])), + "r"(__as_b32(__values[23])), + "r"(__as_b32(__values[24])), + "r"(__as_b32(__values[25])), + "r"(__as_b32(__values[26])), + "r"(__as_b32(__values[27])), + "r"(__as_b32(__values[28])), + "r"(__as_b32(__values[29])), + "r"(__as_b32(__values[30])), + "r"(__as_b32(__values[31])), + "r"(__as_b32(__values[32])), + "r"(__as_b32(__values[33])), + "r"(__as_b32(__values[34])), + "r"(__as_b32(__values[35])), + "r"(__as_b32(__values[36])), + "r"(__as_b32(__values[37])), + "r"(__as_b32(__values[38])), + "r"(__as_b32(__values[39])), + "r"(__as_b32(__values[40])), + "r"(__as_b32(__values[41])), + "r"(__as_b32(__values[42])), + "r"(__as_b32(__values[43])), + "r"(__as_b32(__values[44])), + "r"(__as_b32(__values[45])), + "r"(__as_b32(__values[46])), + "r"(__as_b32(__values[47])), + "r"(__as_b32(__values[48])), + "r"(__as_b32(__values[49])), + "r"(__as_b32(__values[50])), + "r"(__as_b32(__values[51])), + "r"(__as_b32(__values[52])), + "r"(__as_b32(__values[53])), + 
"r"(__as_b32(__values[54])), + "r"(__as_b32(__values[55])), + "r"(__as_b32(__values[56])), + "r"(__as_b32(__values[57])), + "r"(__as_b32(__values[58])), + "r"(__as_b32(__values[59])), + "r"(__as_b32(__values[60])), + "r"(__as_b32(__values[61])), + "r"(__as_b32(__values[62])), + "r"(__as_b32(__values[63])) + : "memory"); +# else + // Unsupported architectures will have a linker error with a semi-decent error message + __cuda_ptx_tcgen05_st_16x32bx2_unpack_16b_is_not_supported_before_SM_100a_SM_101a__(); +# endif +} +#endif // __cccl_ptx_isa >= 860 + +/* +// tcgen05.st.sync.aligned.16x32bx2.x128.b32 [taddr], immHalfSplitoff, values; // PTX ISA 86, SM_100a, SM_101a +template = true> +__device__ static inline void tcgen05_st_16x32bx2( + uint32_t taddr, + cuda::ptx::n32_t immHalfSplitoff, + const B32 (&values)[128]); +*/ +#if __cccl_ptx_isa >= 860 +extern "C" _CCCL_DEVICE void __cuda_ptx_tcgen05_st_16x32bx2_is_not_supported_before_SM_100a_SM_101a__(); +template = true> +_CCCL_DEVICE static inline void +tcgen05_st_16x32bx2(_CUDA_VSTD::uint32_t __taddr, n32_t<_N32> __immHalfSplitoff, const _B32 (&__values)[128]) +{ + static_assert(sizeof(_B32) == 4, ""); +# if _CCCL_CUDA_COMPILER(NVHPC) || __CUDA_ARCH_FEAT_SM100_ALL || __CUDA_ARCH_FEAT_SM101_ALL + asm( + "tcgen05.st.sync.aligned.16x32bx2.x128.b32 [%0], %1, {%2, %3, %4, %5, %6, %7, %8, %9, %10, %11, %12, %13, %14, " + "%15, %16, %17, %18, %19, %20, %21, %22, %23, %24, %25, %26, %27, %28, %29, %30, %31, %32, %33, %34, %35, %36, " + "%37, %38, %39, %40, %41, %42, %43, %44, %45, %46, %47, %48, %49, %50, %51, %52, %53, %54, %55, %56, %57, %58, " + "%59, %60, %61, %62, %63, %64, %65, %66, %67, %68, %69, %70, %71, %72, %73, %74, %75, %76, %77, %78, %79, %80, " + "%81, %82, %83, %84, %85, %86, %87, %88, %89, %90, %91, %92, %93, %94, %95, %96, %97, %98, %99, %100, %101, %102, " + "%103, %104, %105, %106, %107, %108, %109, %110, %111, %112, %113, %114, %115, %116, %117, %118, %119, %120, %121, " + "%122, %123, %124, %125, %126, %127, %128, %129};" + : + : "r"(__taddr), + "n"(__immHalfSplitoff.value), + "r"(__as_b32(__values[0])), + "r"(__as_b32(__values[1])), + "r"(__as_b32(__values[2])), + "r"(__as_b32(__values[3])), + "r"(__as_b32(__values[4])), + "r"(__as_b32(__values[5])), + "r"(__as_b32(__values[6])), + "r"(__as_b32(__values[7])), + "r"(__as_b32(__values[8])), + "r"(__as_b32(__values[9])), + "r"(__as_b32(__values[10])), + "r"(__as_b32(__values[11])), + "r"(__as_b32(__values[12])), + "r"(__as_b32(__values[13])), + "r"(__as_b32(__values[14])), + "r"(__as_b32(__values[15])), + "r"(__as_b32(__values[16])), + "r"(__as_b32(__values[17])), + "r"(__as_b32(__values[18])), + "r"(__as_b32(__values[19])), + "r"(__as_b32(__values[20])), + "r"(__as_b32(__values[21])), + "r"(__as_b32(__values[22])), + "r"(__as_b32(__values[23])), + "r"(__as_b32(__values[24])), + "r"(__as_b32(__values[25])), + "r"(__as_b32(__values[26])), + "r"(__as_b32(__values[27])), + "r"(__as_b32(__values[28])), + "r"(__as_b32(__values[29])), + "r"(__as_b32(__values[30])), + "r"(__as_b32(__values[31])), + "r"(__as_b32(__values[32])), + "r"(__as_b32(__values[33])), + "r"(__as_b32(__values[34])), + "r"(__as_b32(__values[35])), + "r"(__as_b32(__values[36])), + "r"(__as_b32(__values[37])), + "r"(__as_b32(__values[38])), + "r"(__as_b32(__values[39])), + "r"(__as_b32(__values[40])), + "r"(__as_b32(__values[41])), + "r"(__as_b32(__values[42])), + "r"(__as_b32(__values[43])), + "r"(__as_b32(__values[44])), + "r"(__as_b32(__values[45])), + "r"(__as_b32(__values[46])), + "r"(__as_b32(__values[47])), + 
"r"(__as_b32(__values[48])), + "r"(__as_b32(__values[49])), + "r"(__as_b32(__values[50])), + "r"(__as_b32(__values[51])), + "r"(__as_b32(__values[52])), + "r"(__as_b32(__values[53])), + "r"(__as_b32(__values[54])), + "r"(__as_b32(__values[55])), + "r"(__as_b32(__values[56])), + "r"(__as_b32(__values[57])), + "r"(__as_b32(__values[58])), + "r"(__as_b32(__values[59])), + "r"(__as_b32(__values[60])), + "r"(__as_b32(__values[61])), + "r"(__as_b32(__values[62])), + "r"(__as_b32(__values[63])), + "r"(__as_b32(__values[64])), + "r"(__as_b32(__values[65])), + "r"(__as_b32(__values[66])), + "r"(__as_b32(__values[67])), + "r"(__as_b32(__values[68])), + "r"(__as_b32(__values[69])), + "r"(__as_b32(__values[70])), + "r"(__as_b32(__values[71])), + "r"(__as_b32(__values[72])), + "r"(__as_b32(__values[73])), + "r"(__as_b32(__values[74])), + "r"(__as_b32(__values[75])), + "r"(__as_b32(__values[76])), + "r"(__as_b32(__values[77])), + "r"(__as_b32(__values[78])), + "r"(__as_b32(__values[79])), + "r"(__as_b32(__values[80])), + "r"(__as_b32(__values[81])), + "r"(__as_b32(__values[82])), + "r"(__as_b32(__values[83])), + "r"(__as_b32(__values[84])), + "r"(__as_b32(__values[85])), + "r"(__as_b32(__values[86])), + "r"(__as_b32(__values[87])), + "r"(__as_b32(__values[88])), + "r"(__as_b32(__values[89])), + "r"(__as_b32(__values[90])), + "r"(__as_b32(__values[91])), + "r"(__as_b32(__values[92])), + "r"(__as_b32(__values[93])), + "r"(__as_b32(__values[94])), + "r"(__as_b32(__values[95])), + "r"(__as_b32(__values[96])), + "r"(__as_b32(__values[97])), + "r"(__as_b32(__values[98])), + "r"(__as_b32(__values[99])), + "r"(__as_b32(__values[100])), + "r"(__as_b32(__values[101])), + "r"(__as_b32(__values[102])), + "r"(__as_b32(__values[103])), + "r"(__as_b32(__values[104])), + "r"(__as_b32(__values[105])), + "r"(__as_b32(__values[106])), + "r"(__as_b32(__values[107])), + "r"(__as_b32(__values[108])), + "r"(__as_b32(__values[109])), + "r"(__as_b32(__values[110])), + "r"(__as_b32(__values[111])), + "r"(__as_b32(__values[112])), + "r"(__as_b32(__values[113])), + "r"(__as_b32(__values[114])), + "r"(__as_b32(__values[115])), + "r"(__as_b32(__values[116])), + "r"(__as_b32(__values[117])), + "r"(__as_b32(__values[118])), + "r"(__as_b32(__values[119])), + "r"(__as_b32(__values[120])), + "r"(__as_b32(__values[121])), + "r"(__as_b32(__values[122])), + "r"(__as_b32(__values[123])), + "r"(__as_b32(__values[124])), + "r"(__as_b32(__values[125])), + "r"(__as_b32(__values[126])), + "r"(__as_b32(__values[127])) + : "memory"); +# else + // Unsupported architectures will have a linker error with a semi-decent error message + __cuda_ptx_tcgen05_st_16x32bx2_is_not_supported_before_SM_100a_SM_101a__(); +# endif +} +#endif // __cccl_ptx_isa >= 860 + +/* +// tcgen05.st.sync.aligned.16x32bx2.x128.unpack::16b.b32 [taddr], immHalfSplitoff, values; // PTX ISA 86, SM_100a, +SM_101a template = true> +__device__ static inline void tcgen05_st_16x32bx2_unpack_16b( + uint32_t taddr, + cuda::ptx::n32_t immHalfSplitoff, + const B32 (&values)[128]); +*/ +#if __cccl_ptx_isa >= 860 +extern "C" _CCCL_DEVICE void __cuda_ptx_tcgen05_st_16x32bx2_unpack_16b_is_not_supported_before_SM_100a_SM_101a__(); +template = true> +_CCCL_DEVICE static inline void +tcgen05_st_16x32bx2_unpack_16b(_CUDA_VSTD::uint32_t __taddr, n32_t<_N32> __immHalfSplitoff, const _B32 (&__values)[128]) +{ + static_assert(sizeof(_B32) == 4, ""); +# if _CCCL_CUDA_COMPILER(NVHPC) || __CUDA_ARCH_FEAT_SM100_ALL || __CUDA_ARCH_FEAT_SM101_ALL + asm( + "tcgen05.st.sync.aligned.16x32bx2.x128.unpack::16b.b32 
[%0], %1, {%2, %3, %4, %5, %6, %7, %8, %9, %10, %11, %12, " + "%13, %14, %15, %16, %17, %18, %19, %20, %21, %22, %23, %24, %25, %26, %27, %28, %29, %30, %31, %32, %33, %34, " + "%35, %36, %37, %38, %39, %40, %41, %42, %43, %44, %45, %46, %47, %48, %49, %50, %51, %52, %53, %54, %55, %56, " + "%57, %58, %59, %60, %61, %62, %63, %64, %65, %66, %67, %68, %69, %70, %71, %72, %73, %74, %75, %76, %77, %78, " + "%79, %80, %81, %82, %83, %84, %85, %86, %87, %88, %89, %90, %91, %92, %93, %94, %95, %96, %97, %98, %99, %100, " + "%101, %102, %103, %104, %105, %106, %107, %108, %109, %110, %111, %112, %113, %114, %115, %116, %117, %118, %119, " + "%120, %121, %122, %123, %124, %125, %126, %127, %128, %129};" + : + : "r"(__taddr), + "n"(__immHalfSplitoff.value), + "r"(__as_b32(__values[0])), + "r"(__as_b32(__values[1])), + "r"(__as_b32(__values[2])), + "r"(__as_b32(__values[3])), + "r"(__as_b32(__values[4])), + "r"(__as_b32(__values[5])), + "r"(__as_b32(__values[6])), + "r"(__as_b32(__values[7])), + "r"(__as_b32(__values[8])), + "r"(__as_b32(__values[9])), + "r"(__as_b32(__values[10])), + "r"(__as_b32(__values[11])), + "r"(__as_b32(__values[12])), + "r"(__as_b32(__values[13])), + "r"(__as_b32(__values[14])), + "r"(__as_b32(__values[15])), + "r"(__as_b32(__values[16])), + "r"(__as_b32(__values[17])), + "r"(__as_b32(__values[18])), + "r"(__as_b32(__values[19])), + "r"(__as_b32(__values[20])), + "r"(__as_b32(__values[21])), + "r"(__as_b32(__values[22])), + "r"(__as_b32(__values[23])), + "r"(__as_b32(__values[24])), + "r"(__as_b32(__values[25])), + "r"(__as_b32(__values[26])), + "r"(__as_b32(__values[27])), + "r"(__as_b32(__values[28])), + "r"(__as_b32(__values[29])), + "r"(__as_b32(__values[30])), + "r"(__as_b32(__values[31])), + "r"(__as_b32(__values[32])), + "r"(__as_b32(__values[33])), + "r"(__as_b32(__values[34])), + "r"(__as_b32(__values[35])), + "r"(__as_b32(__values[36])), + "r"(__as_b32(__values[37])), + "r"(__as_b32(__values[38])), + "r"(__as_b32(__values[39])), + "r"(__as_b32(__values[40])), + "r"(__as_b32(__values[41])), + "r"(__as_b32(__values[42])), + "r"(__as_b32(__values[43])), + "r"(__as_b32(__values[44])), + "r"(__as_b32(__values[45])), + "r"(__as_b32(__values[46])), + "r"(__as_b32(__values[47])), + "r"(__as_b32(__values[48])), + "r"(__as_b32(__values[49])), + "r"(__as_b32(__values[50])), + "r"(__as_b32(__values[51])), + "r"(__as_b32(__values[52])), + "r"(__as_b32(__values[53])), + "r"(__as_b32(__values[54])), + "r"(__as_b32(__values[55])), + "r"(__as_b32(__values[56])), + "r"(__as_b32(__values[57])), + "r"(__as_b32(__values[58])), + "r"(__as_b32(__values[59])), + "r"(__as_b32(__values[60])), + "r"(__as_b32(__values[61])), + "r"(__as_b32(__values[62])), + "r"(__as_b32(__values[63])), + "r"(__as_b32(__values[64])), + "r"(__as_b32(__values[65])), + "r"(__as_b32(__values[66])), + "r"(__as_b32(__values[67])), + "r"(__as_b32(__values[68])), + "r"(__as_b32(__values[69])), + "r"(__as_b32(__values[70])), + "r"(__as_b32(__values[71])), + "r"(__as_b32(__values[72])), + "r"(__as_b32(__values[73])), + "r"(__as_b32(__values[74])), + "r"(__as_b32(__values[75])), + "r"(__as_b32(__values[76])), + "r"(__as_b32(__values[77])), + "r"(__as_b32(__values[78])), + "r"(__as_b32(__values[79])), + "r"(__as_b32(__values[80])), + "r"(__as_b32(__values[81])), + "r"(__as_b32(__values[82])), + "r"(__as_b32(__values[83])), + "r"(__as_b32(__values[84])), + "r"(__as_b32(__values[85])), + "r"(__as_b32(__values[86])), + "r"(__as_b32(__values[87])), + "r"(__as_b32(__values[88])), + "r"(__as_b32(__values[89])), + 
"r"(__as_b32(__values[90])), + "r"(__as_b32(__values[91])), + "r"(__as_b32(__values[92])), + "r"(__as_b32(__values[93])), + "r"(__as_b32(__values[94])), + "r"(__as_b32(__values[95])), + "r"(__as_b32(__values[96])), + "r"(__as_b32(__values[97])), + "r"(__as_b32(__values[98])), + "r"(__as_b32(__values[99])), + "r"(__as_b32(__values[100])), + "r"(__as_b32(__values[101])), + "r"(__as_b32(__values[102])), + "r"(__as_b32(__values[103])), + "r"(__as_b32(__values[104])), + "r"(__as_b32(__values[105])), + "r"(__as_b32(__values[106])), + "r"(__as_b32(__values[107])), + "r"(__as_b32(__values[108])), + "r"(__as_b32(__values[109])), + "r"(__as_b32(__values[110])), + "r"(__as_b32(__values[111])), + "r"(__as_b32(__values[112])), + "r"(__as_b32(__values[113])), + "r"(__as_b32(__values[114])), + "r"(__as_b32(__values[115])), + "r"(__as_b32(__values[116])), + "r"(__as_b32(__values[117])), + "r"(__as_b32(__values[118])), + "r"(__as_b32(__values[119])), + "r"(__as_b32(__values[120])), + "r"(__as_b32(__values[121])), + "r"(__as_b32(__values[122])), + "r"(__as_b32(__values[123])), + "r"(__as_b32(__values[124])), + "r"(__as_b32(__values[125])), + "r"(__as_b32(__values[126])), + "r"(__as_b32(__values[127])) + : "memory"); +# else + // Unsupported architectures will have a linker error with a semi-decent error message + __cuda_ptx_tcgen05_st_16x32bx2_unpack_16b_is_not_supported_before_SM_100a_SM_101a__(); +# endif +} +#endif // __cccl_ptx_isa >= 860 + +#endif // _CUDA_PTX_GENERATED_TCGEN05_ST_H_ diff --git a/libcudacxx/include/cuda/__ptx/instructions/generated/tcgen05_wait.h b/libcudacxx/include/cuda/__ptx/instructions/generated/tcgen05_wait.h new file mode 100644 index 00000000000..5f683c07fea --- /dev/null +++ b/libcudacxx/include/cuda/__ptx/instructions/generated/tcgen05_wait.h @@ -0,0 +1,44 @@ +// This file was automatically generated. Do not edit. 
+ +#ifndef _CUDA_PTX_GENERATED_TCGEN05_WAIT_H_ +#define _CUDA_PTX_GENERATED_TCGEN05_WAIT_H_ + +/* +// tcgen05.wait::ld.sync.aligned; // PTX ISA 86, SM_100a, SM_101a +template +__device__ static inline void tcgen05_wait_ld(); +*/ +#if __cccl_ptx_isa >= 860 +extern "C" _CCCL_DEVICE void __cuda_ptx_tcgen05_wait_ld_is_not_supported_before_SM_100a_SM_101a__(); +template +_CCCL_DEVICE static inline void tcgen05_wait_ld() +{ +# if _CCCL_CUDA_COMPILER(NVHPC) || __CUDA_ARCH_FEAT_SM100_ALL || __CUDA_ARCH_FEAT_SM101_ALL + asm volatile("tcgen05.wait::ld.sync.aligned;" : : : "memory"); +# else + // Unsupported architectures will have a linker error with a semi-decent error message + __cuda_ptx_tcgen05_wait_ld_is_not_supported_before_SM_100a_SM_101a__(); +# endif +} +#endif // __cccl_ptx_isa >= 860 + +/* +// tcgen05.wait::st.sync.aligned; // PTX ISA 86, SM_100a, SM_101a +template +__device__ static inline void tcgen05_wait_st(); +*/ +#if __cccl_ptx_isa >= 860 +extern "C" _CCCL_DEVICE void __cuda_ptx_tcgen05_wait_st_is_not_supported_before_SM_100a_SM_101a__(); +template +_CCCL_DEVICE static inline void tcgen05_wait_st() +{ +# if _CCCL_CUDA_COMPILER(NVHPC) || __CUDA_ARCH_FEAT_SM100_ALL || __CUDA_ARCH_FEAT_SM101_ALL + asm volatile("tcgen05.wait::st.sync.aligned;" : : : "memory"); +# else + // Unsupported architectures will have a linker error with a semi-decent error message + __cuda_ptx_tcgen05_wait_st_is_not_supported_before_SM_100a_SM_101a__(); +# endif +} +#endif // __cccl_ptx_isa >= 860 + +#endif // _CUDA_PTX_GENERATED_TCGEN05_WAIT_H_ diff --git a/libcudacxx/include/cuda/__ptx/instructions/generated/tensormap_cp_fenceproxy.h b/libcudacxx/include/cuda/__ptx/instructions/generated/tensormap_cp_fenceproxy.h index b51b5185db0..db5e7dde640 100644 --- a/libcudacxx/include/cuda/__ptx/instructions/generated/tensormap_cp_fenceproxy.h +++ b/libcudacxx/include/cuda/__ptx/instructions/generated/tensormap_cp_fenceproxy.h @@ -24,37 +24,43 @@ tensormap_cp_fenceproxy(sem_release_t, scope_t<_Scope> __scope, void* __dst, con { // __sem == sem_release (due to parameter type constraint) static_assert(__scope == scope_cta || __scope == scope_cluster || __scope == scope_gpu || __scope == scope_sys, ""); - NV_IF_ELSE_TARGET( - NV_PROVIDES_SM_90, - ( - _CCCL_IF_CONSTEXPR (__scope == scope_cta) { - asm volatile( - "tensormap.cp_fenceproxy.global.shared::cta.tensormap::generic.release.cta.sync.aligned [%0], [%1], %2;" - : - : "l"(__as_ptr_gmem(__dst)), "r"(__as_ptr_smem(__src)), "n"(__size.value) - : "memory"); - } else _CCCL_IF_CONSTEXPR (__scope == scope_cluster) { - asm volatile( - "tensormap.cp_fenceproxy.global.shared::cta.tensormap::generic.release.cluster.sync.aligned [%0], [%1], %2;" - : - : "l"(__as_ptr_gmem(__dst)), "r"(__as_ptr_smem(__src)), "n"(__size.value) - : "memory"); - } else _CCCL_IF_CONSTEXPR (__scope == scope_gpu) { - asm volatile( - "tensormap.cp_fenceproxy.global.shared::cta.tensormap::generic.release.gpu.sync.aligned [%0], [%1], %2;" - : - : "l"(__as_ptr_gmem(__dst)), "r"(__as_ptr_smem(__src)), "n"(__size.value) - : "memory"); - } else _CCCL_IF_CONSTEXPR (__scope == scope_sys) { - asm volatile( - "tensormap.cp_fenceproxy.global.shared::cta.tensormap::generic.release.sys.sync.aligned [%0], [%1], %2;" - : - : "l"(__as_ptr_gmem(__dst)), "r"(__as_ptr_smem(__src)), "n"(__size.value) - : "memory"); - }), - ( - // Unsupported architectures will have a linker error with a semi-decent error message - __cuda_ptx_tensormap_cp_fenceproxy_is_not_supported_before_SM_90__();)); +# if _CCCL_CUDA_COMPILER(NVHPC) || 
__CUDA_ARCH__ >= 900 + _CCCL_IF_CONSTEXPR (__scope == scope_cta) + { + asm volatile( + "tensormap.cp_fenceproxy.global.shared::cta.tensormap::generic.release.cta.sync.aligned [%0], [%1], %2;" + : + : "l"(__as_ptr_gmem(__dst)), "r"(__as_ptr_smem(__src)), "n"(__size.value) + : "memory"); + } + else _CCCL_IF_CONSTEXPR (__scope == scope_cluster) + { + asm volatile( + "tensormap.cp_fenceproxy.global.shared::cta.tensormap::generic.release.cluster.sync.aligned [%0], [%1], %2;" + : + : "l"(__as_ptr_gmem(__dst)), "r"(__as_ptr_smem(__src)), "n"(__size.value) + : "memory"); + } + else _CCCL_IF_CONSTEXPR (__scope == scope_gpu) + { + asm volatile( + "tensormap.cp_fenceproxy.global.shared::cta.tensormap::generic.release.gpu.sync.aligned [%0], [%1], %2;" + : + : "l"(__as_ptr_gmem(__dst)), "r"(__as_ptr_smem(__src)), "n"(__size.value) + : "memory"); + } + else _CCCL_IF_CONSTEXPR (__scope == scope_sys) + { + asm volatile( + "tensormap.cp_fenceproxy.global.shared::cta.tensormap::generic.release.sys.sync.aligned [%0], [%1], %2;" + : + : "l"(__as_ptr_gmem(__dst)), "r"(__as_ptr_smem(__src)), "n"(__size.value) + : "memory"); + } +# else + // Unsupported architectures will have a linker error with a semi-decent error message + __cuda_ptx_tensormap_cp_fenceproxy_is_not_supported_before_SM_90__(); +# endif } #endif // __cccl_ptx_isa >= 830 diff --git a/libcudacxx/include/cuda/__ptx/instructions/generated/tensormap_replace.h b/libcudacxx/include/cuda/__ptx/instructions/generated/tensormap_replace.h index 3889026750d..53c56e159f7 100644 --- a/libcudacxx/include/cuda/__ptx/instructions/generated/tensormap_replace.h +++ b/libcudacxx/include/cuda/__ptx/instructions/generated/tensormap_replace.h @@ -4,121 +4,127 @@ #define _CUDA_PTX_GENERATED_TENSORMAP_REPLACE_H_ /* -// tensormap.replace.tile.global_address.space.b1024.b64 [tm_addr], new_val; // PTX ISA 83, SM_90a +// tensormap.replace.tile.global_address.space.b1024.b64 [tm_addr], new_val; // PTX ISA 83, SM_90a, SM_100a, SM_101a // .space = { .global } -template +template = true> __device__ static inline void tensormap_replace_global_address( cuda::ptx::space_global_t, void* tm_addr, B64 new_val); */ #if __cccl_ptx_isa >= 830 -extern "C" _CCCL_DEVICE void __cuda_ptx_tensormap_replace_global_address_is_not_supported_before_SM_90a__(); -template +extern "C" _CCCL_DEVICE void +__cuda_ptx_tensormap_replace_global_address_is_not_supported_before_SM_90a_SM_100a_SM_101a__(); +template = true> _CCCL_DEVICE static inline void tensormap_replace_global_address(space_global_t, void* __tm_addr, _B64 __new_val) { // __space == space_global (due to parameter type constraint) static_assert(sizeof(_B64) == 8, ""); - NV_IF_ELSE_TARGET( - NV_HAS_FEATURE_SM_90a, - (asm("tensormap.replace.tile.global_address.global.b1024.b64 [%0], %1;" - : - : "l"(__as_ptr_gmem(__tm_addr)), "l"(__as_b64(__new_val)) - : "memory");), - ( - // Unsupported architectures will have a linker error with a semi-decent error message - __cuda_ptx_tensormap_replace_global_address_is_not_supported_before_SM_90a__();)); +# if _CCCL_CUDA_COMPILER(NVHPC) || __CUDA_ARCH_FEAT_SM90_ALL || __CUDA_ARCH_FEAT_SM100_ALL \ + || __CUDA_ARCH_FEAT_SM101_ALL + asm("tensormap.replace.tile.global_address.global.b1024.b64 [%0], %1;" + : + : "l"(__as_ptr_gmem(__tm_addr)), "l"(__as_b64(__new_val)) + : "memory"); +# else + // Unsupported architectures will have a linker error with a semi-decent error message + __cuda_ptx_tensormap_replace_global_address_is_not_supported_before_SM_90a_SM_100a_SM_101a__(); +# endif } #endif // __cccl_ptx_isa 
>= 830 /* -// tensormap.replace.tile.global_address.space.b1024.b64 [tm_addr], new_val; // PTX ISA 83, SM_90a +// tensormap.replace.tile.global_address.space.b1024.b64 [tm_addr], new_val; // PTX ISA 83, SM_90a, SM_100a, SM_101a // .space = { .shared::cta } -template +template = true> __device__ static inline void tensormap_replace_global_address( cuda::ptx::space_shared_t, void* tm_addr, B64 new_val); */ #if __cccl_ptx_isa >= 830 -extern "C" _CCCL_DEVICE void __cuda_ptx_tensormap_replace_global_address_is_not_supported_before_SM_90a__(); -template +extern "C" _CCCL_DEVICE void +__cuda_ptx_tensormap_replace_global_address_is_not_supported_before_SM_90a_SM_100a_SM_101a__(); +template = true> _CCCL_DEVICE static inline void tensormap_replace_global_address(space_shared_t, void* __tm_addr, _B64 __new_val) { // __space == space_shared (due to parameter type constraint) static_assert(sizeof(_B64) == 8, ""); - NV_IF_ELSE_TARGET( - NV_HAS_FEATURE_SM_90a, - (asm("tensormap.replace.tile.global_address.shared::cta.b1024.b64 [%0], %1;" - : - : "r"(__as_ptr_smem(__tm_addr)), "l"(__as_b64(__new_val)) - : "memory");), - ( - // Unsupported architectures will have a linker error with a semi-decent error message - __cuda_ptx_tensormap_replace_global_address_is_not_supported_before_SM_90a__();)); +# if _CCCL_CUDA_COMPILER(NVHPC) || __CUDA_ARCH_FEAT_SM90_ALL || __CUDA_ARCH_FEAT_SM100_ALL \ + || __CUDA_ARCH_FEAT_SM101_ALL + asm("tensormap.replace.tile.global_address.shared::cta.b1024.b64 [%0], %1;" + : + : "r"(__as_ptr_smem(__tm_addr)), "l"(__as_b64(__new_val)) + : "memory"); +# else + // Unsupported architectures will have a linker error with a semi-decent error message + __cuda_ptx_tensormap_replace_global_address_is_not_supported_before_SM_90a_SM_100a_SM_101a__(); +# endif } #endif // __cccl_ptx_isa >= 830 /* -// tensormap.replace.tile.rank.space.b1024.b32 [tm_addr], new_val; // PTX ISA 83, SM_90a +// tensormap.replace.tile.rank.space.b1024.b32 [tm_addr], new_val; // PTX ISA 83, SM_90a, SM_100a, SM_101a // .space = { .global } -template +template = true> __device__ static inline void tensormap_replace_rank( cuda::ptx::space_global_t, void* tm_addr, B32 new_val); */ #if __cccl_ptx_isa >= 830 -extern "C" _CCCL_DEVICE void __cuda_ptx_tensormap_replace_rank_is_not_supported_before_SM_90a__(); -template +extern "C" _CCCL_DEVICE void __cuda_ptx_tensormap_replace_rank_is_not_supported_before_SM_90a_SM_100a_SM_101a__(); +template = true> _CCCL_DEVICE static inline void tensormap_replace_rank(space_global_t, void* __tm_addr, _B32 __new_val) { // __space == space_global (due to parameter type constraint) static_assert(sizeof(_B32) == 4, ""); - NV_IF_ELSE_TARGET( - NV_HAS_FEATURE_SM_90a, - (asm("tensormap.replace.tile.rank.global.b1024.b32 [%0], %1;" - : - : "l"(__as_ptr_gmem(__tm_addr)), "r"(__as_b32(__new_val)) - : "memory");), - ( - // Unsupported architectures will have a linker error with a semi-decent error message - __cuda_ptx_tensormap_replace_rank_is_not_supported_before_SM_90a__();)); +# if _CCCL_CUDA_COMPILER(NVHPC) || __CUDA_ARCH_FEAT_SM90_ALL || __CUDA_ARCH_FEAT_SM100_ALL \ + || __CUDA_ARCH_FEAT_SM101_ALL + asm("tensormap.replace.tile.rank.global.b1024.b32 [%0], %1;" + : + : "l"(__as_ptr_gmem(__tm_addr)), "r"(__as_b32(__new_val)) + : "memory"); +# else + // Unsupported architectures will have a linker error with a semi-decent error message + __cuda_ptx_tensormap_replace_rank_is_not_supported_before_SM_90a_SM_100a_SM_101a__(); +# endif } #endif // __cccl_ptx_isa >= 830 /* -// 
tensormap.replace.tile.rank.space.b1024.b32 [tm_addr], new_val; // PTX ISA 83, SM_90a +// tensormap.replace.tile.rank.space.b1024.b32 [tm_addr], new_val; // PTX ISA 83, SM_90a, SM_100a, SM_101a // .space = { .shared::cta } -template +template = true> __device__ static inline void tensormap_replace_rank( cuda::ptx::space_shared_t, void* tm_addr, B32 new_val); */ #if __cccl_ptx_isa >= 830 -extern "C" _CCCL_DEVICE void __cuda_ptx_tensormap_replace_rank_is_not_supported_before_SM_90a__(); -template +extern "C" _CCCL_DEVICE void __cuda_ptx_tensormap_replace_rank_is_not_supported_before_SM_90a_SM_100a_SM_101a__(); +template = true> _CCCL_DEVICE static inline void tensormap_replace_rank(space_shared_t, void* __tm_addr, _B32 __new_val) { // __space == space_shared (due to parameter type constraint) static_assert(sizeof(_B32) == 4, ""); - NV_IF_ELSE_TARGET( - NV_HAS_FEATURE_SM_90a, - (asm("tensormap.replace.tile.rank.shared::cta.b1024.b32 [%0], %1;" - : - : "r"(__as_ptr_smem(__tm_addr)), "r"(__as_b32(__new_val)) - : "memory");), - ( - // Unsupported architectures will have a linker error with a semi-decent error message - __cuda_ptx_tensormap_replace_rank_is_not_supported_before_SM_90a__();)); +# if _CCCL_CUDA_COMPILER(NVHPC) || __CUDA_ARCH_FEAT_SM90_ALL || __CUDA_ARCH_FEAT_SM100_ALL \ + || __CUDA_ARCH_FEAT_SM101_ALL + asm("tensormap.replace.tile.rank.shared::cta.b1024.b32 [%0], %1;" + : + : "r"(__as_ptr_smem(__tm_addr)), "r"(__as_b32(__new_val)) + : "memory"); +# else + // Unsupported architectures will have a linker error with a semi-decent error message + __cuda_ptx_tensormap_replace_rank_is_not_supported_before_SM_90a_SM_100a_SM_101a__(); +# endif } #endif // __cccl_ptx_isa >= 830 /* -// tensormap.replace.tile.box_dim.space.b1024.b32 [tm_addr], ord, new_val; // PTX ISA 83, SM_90a +// tensormap.replace.tile.box_dim.space.b1024.b32 [tm_addr], ord, new_val; // PTX ISA 83, SM_90a, SM_100a, SM_101a // .space = { .global } -template +template = true> __device__ static inline void tensormap_replace_box_dim( cuda::ptx::space_global_t, void* tm_addr, @@ -126,29 +132,30 @@ __device__ static inline void tensormap_replace_box_dim( B32 new_val); */ #if __cccl_ptx_isa >= 830 -extern "C" _CCCL_DEVICE void __cuda_ptx_tensormap_replace_box_dim_is_not_supported_before_SM_90a__(); -template +extern "C" _CCCL_DEVICE void __cuda_ptx_tensormap_replace_box_dim_is_not_supported_before_SM_90a_SM_100a_SM_101a__(); +template = true> _CCCL_DEVICE static inline void tensormap_replace_box_dim(space_global_t, void* __tm_addr, n32_t<_N32> __ord, _B32 __new_val) { // __space == space_global (due to parameter type constraint) static_assert(sizeof(_B32) == 4, ""); - NV_IF_ELSE_TARGET( - NV_HAS_FEATURE_SM_90a, - (asm("tensormap.replace.tile.box_dim.global.b1024.b32 [%0], %1, %2;" - : - : "l"(__as_ptr_gmem(__tm_addr)), "n"(__ord.value), "r"(__as_b32(__new_val)) - : "memory");), - ( - // Unsupported architectures will have a linker error with a semi-decent error message - __cuda_ptx_tensormap_replace_box_dim_is_not_supported_before_SM_90a__();)); +# if _CCCL_CUDA_COMPILER(NVHPC) || __CUDA_ARCH_FEAT_SM90_ALL || __CUDA_ARCH_FEAT_SM100_ALL \ + || __CUDA_ARCH_FEAT_SM101_ALL + asm("tensormap.replace.tile.box_dim.global.b1024.b32 [%0], %1, %2;" + : + : "l"(__as_ptr_gmem(__tm_addr)), "n"(__ord.value), "r"(__as_b32(__new_val)) + : "memory"); +# else + // Unsupported architectures will have a linker error with a semi-decent error message + __cuda_ptx_tensormap_replace_box_dim_is_not_supported_before_SM_90a_SM_100a_SM_101a__(); +# endif } 
#endif // __cccl_ptx_isa >= 830 /* -// tensormap.replace.tile.box_dim.space.b1024.b32 [tm_addr], ord, new_val; // PTX ISA 83, SM_90a +// tensormap.replace.tile.box_dim.space.b1024.b32 [tm_addr], ord, new_val; // PTX ISA 83, SM_90a, SM_100a, SM_101a // .space = { .shared::cta } -template +template = true> __device__ static inline void tensormap_replace_box_dim( cuda::ptx::space_shared_t, void* tm_addr, @@ -156,29 +163,30 @@ __device__ static inline void tensormap_replace_box_dim( B32 new_val); */ #if __cccl_ptx_isa >= 830 -extern "C" _CCCL_DEVICE void __cuda_ptx_tensormap_replace_box_dim_is_not_supported_before_SM_90a__(); -template +extern "C" _CCCL_DEVICE void __cuda_ptx_tensormap_replace_box_dim_is_not_supported_before_SM_90a_SM_100a_SM_101a__(); +template = true> _CCCL_DEVICE static inline void tensormap_replace_box_dim(space_shared_t, void* __tm_addr, n32_t<_N32> __ord, _B32 __new_val) { // __space == space_shared (due to parameter type constraint) static_assert(sizeof(_B32) == 4, ""); - NV_IF_ELSE_TARGET( - NV_HAS_FEATURE_SM_90a, - (asm("tensormap.replace.tile.box_dim.shared::cta.b1024.b32 [%0], %1, %2;" - : - : "r"(__as_ptr_smem(__tm_addr)), "n"(__ord.value), "r"(__as_b32(__new_val)) - : "memory");), - ( - // Unsupported architectures will have a linker error with a semi-decent error message - __cuda_ptx_tensormap_replace_box_dim_is_not_supported_before_SM_90a__();)); +# if _CCCL_CUDA_COMPILER(NVHPC) || __CUDA_ARCH_FEAT_SM90_ALL || __CUDA_ARCH_FEAT_SM100_ALL \ + || __CUDA_ARCH_FEAT_SM101_ALL + asm("tensormap.replace.tile.box_dim.shared::cta.b1024.b32 [%0], %1, %2;" + : + : "r"(__as_ptr_smem(__tm_addr)), "n"(__ord.value), "r"(__as_b32(__new_val)) + : "memory"); +# else + // Unsupported architectures will have a linker error with a semi-decent error message + __cuda_ptx_tensormap_replace_box_dim_is_not_supported_before_SM_90a_SM_100a_SM_101a__(); +# endif } #endif // __cccl_ptx_isa >= 830 /* -// tensormap.replace.tile.global_dim.space.b1024.b32 [tm_addr], ord, new_val; // PTX ISA 83, SM_90a +// tensormap.replace.tile.global_dim.space.b1024.b32 [tm_addr], ord, new_val; // PTX ISA 83, SM_90a, SM_100a, SM_101a // .space = { .global } -template +template = true> __device__ static inline void tensormap_replace_global_dim( cuda::ptx::space_global_t, void* tm_addr, @@ -186,29 +194,30 @@ __device__ static inline void tensormap_replace_global_dim( B32 new_val); */ #if __cccl_ptx_isa >= 830 -extern "C" _CCCL_DEVICE void __cuda_ptx_tensormap_replace_global_dim_is_not_supported_before_SM_90a__(); -template +extern "C" _CCCL_DEVICE void __cuda_ptx_tensormap_replace_global_dim_is_not_supported_before_SM_90a_SM_100a_SM_101a__(); +template = true> _CCCL_DEVICE static inline void tensormap_replace_global_dim(space_global_t, void* __tm_addr, n32_t<_N32> __ord, _B32 __new_val) { // __space == space_global (due to parameter type constraint) static_assert(sizeof(_B32) == 4, ""); - NV_IF_ELSE_TARGET( - NV_HAS_FEATURE_SM_90a, - (asm("tensormap.replace.tile.global_dim.global.b1024.b32 [%0], %1, %2;" - : - : "l"(__as_ptr_gmem(__tm_addr)), "n"(__ord.value), "r"(__as_b32(__new_val)) - : "memory");), - ( - // Unsupported architectures will have a linker error with a semi-decent error message - __cuda_ptx_tensormap_replace_global_dim_is_not_supported_before_SM_90a__();)); +# if _CCCL_CUDA_COMPILER(NVHPC) || __CUDA_ARCH_FEAT_SM90_ALL || __CUDA_ARCH_FEAT_SM100_ALL \ + || __CUDA_ARCH_FEAT_SM101_ALL + asm("tensormap.replace.tile.global_dim.global.b1024.b32 [%0], %1, %2;" + : + : "l"(__as_ptr_gmem(__tm_addr)), 
"n"(__ord.value), "r"(__as_b32(__new_val)) + : "memory"); +# else + // Unsupported architectures will have a linker error with a semi-decent error message + __cuda_ptx_tensormap_replace_global_dim_is_not_supported_before_SM_90a_SM_100a_SM_101a__(); +# endif } #endif // __cccl_ptx_isa >= 830 /* -// tensormap.replace.tile.global_dim.space.b1024.b32 [tm_addr], ord, new_val; // PTX ISA 83, SM_90a +// tensormap.replace.tile.global_dim.space.b1024.b32 [tm_addr], ord, new_val; // PTX ISA 83, SM_90a, SM_100a, SM_101a // .space = { .shared::cta } -template +template = true> __device__ static inline void tensormap_replace_global_dim( cuda::ptx::space_shared_t, void* tm_addr, @@ -216,29 +225,30 @@ __device__ static inline void tensormap_replace_global_dim( B32 new_val); */ #if __cccl_ptx_isa >= 830 -extern "C" _CCCL_DEVICE void __cuda_ptx_tensormap_replace_global_dim_is_not_supported_before_SM_90a__(); -template +extern "C" _CCCL_DEVICE void __cuda_ptx_tensormap_replace_global_dim_is_not_supported_before_SM_90a_SM_100a_SM_101a__(); +template = true> _CCCL_DEVICE static inline void tensormap_replace_global_dim(space_shared_t, void* __tm_addr, n32_t<_N32> __ord, _B32 __new_val) { // __space == space_shared (due to parameter type constraint) static_assert(sizeof(_B32) == 4, ""); - NV_IF_ELSE_TARGET( - NV_HAS_FEATURE_SM_90a, - (asm("tensormap.replace.tile.global_dim.shared::cta.b1024.b32 [%0], %1, %2;" - : - : "r"(__as_ptr_smem(__tm_addr)), "n"(__ord.value), "r"(__as_b32(__new_val)) - : "memory");), - ( - // Unsupported architectures will have a linker error with a semi-decent error message - __cuda_ptx_tensormap_replace_global_dim_is_not_supported_before_SM_90a__();)); +# if _CCCL_CUDA_COMPILER(NVHPC) || __CUDA_ARCH_FEAT_SM90_ALL || __CUDA_ARCH_FEAT_SM100_ALL \ + || __CUDA_ARCH_FEAT_SM101_ALL + asm("tensormap.replace.tile.global_dim.shared::cta.b1024.b32 [%0], %1, %2;" + : + : "r"(__as_ptr_smem(__tm_addr)), "n"(__ord.value), "r"(__as_b32(__new_val)) + : "memory"); +# else + // Unsupported architectures will have a linker error with a semi-decent error message + __cuda_ptx_tensormap_replace_global_dim_is_not_supported_before_SM_90a_SM_100a_SM_101a__(); +# endif } #endif // __cccl_ptx_isa >= 830 /* -// tensormap.replace.tile.global_stride.space.b1024.b64 [tm_addr], ord, new_val; // PTX ISA 83, SM_90a +// tensormap.replace.tile.global_stride.space.b1024.b64 [tm_addr], ord, new_val; // PTX ISA 83, SM_90a, SM_100a, SM_101a // .space = { .global } -template +template = true> __device__ static inline void tensormap_replace_global_stride( cuda::ptx::space_global_t, void* tm_addr, @@ -246,29 +256,31 @@ __device__ static inline void tensormap_replace_global_stride( B64 new_val); */ #if __cccl_ptx_isa >= 830 -extern "C" _CCCL_DEVICE void __cuda_ptx_tensormap_replace_global_stride_is_not_supported_before_SM_90a__(); -template +extern "C" _CCCL_DEVICE void +__cuda_ptx_tensormap_replace_global_stride_is_not_supported_before_SM_90a_SM_100a_SM_101a__(); +template = true> _CCCL_DEVICE static inline void tensormap_replace_global_stride(space_global_t, void* __tm_addr, n32_t<_N32> __ord, _B64 __new_val) { // __space == space_global (due to parameter type constraint) static_assert(sizeof(_B64) == 8, ""); - NV_IF_ELSE_TARGET( - NV_HAS_FEATURE_SM_90a, - (asm("tensormap.replace.tile.global_stride.global.b1024.b64 [%0], %1, %2;" - : - : "l"(__as_ptr_gmem(__tm_addr)), "n"(__ord.value), "l"(__as_b64(__new_val)) - : "memory");), - ( - // Unsupported architectures will have a linker error with a semi-decent error message - 
__cuda_ptx_tensormap_replace_global_stride_is_not_supported_before_SM_90a__();)); +# if _CCCL_CUDA_COMPILER(NVHPC) || __CUDA_ARCH_FEAT_SM90_ALL || __CUDA_ARCH_FEAT_SM100_ALL \ + || __CUDA_ARCH_FEAT_SM101_ALL + asm("tensormap.replace.tile.global_stride.global.b1024.b64 [%0], %1, %2;" + : + : "l"(__as_ptr_gmem(__tm_addr)), "n"(__ord.value), "l"(__as_b64(__new_val)) + : "memory"); +# else + // Unsupported architectures will have a linker error with a semi-decent error message + __cuda_ptx_tensormap_replace_global_stride_is_not_supported_before_SM_90a_SM_100a_SM_101a__(); +# endif } #endif // __cccl_ptx_isa >= 830 /* -// tensormap.replace.tile.global_stride.space.b1024.b64 [tm_addr], ord, new_val; // PTX ISA 83, SM_90a +// tensormap.replace.tile.global_stride.space.b1024.b64 [tm_addr], ord, new_val; // PTX ISA 83, SM_90a, SM_100a, SM_101a // .space = { .shared::cta } -template +template = true> __device__ static inline void tensormap_replace_global_stride( cuda::ptx::space_shared_t, void* tm_addr, @@ -276,29 +288,98 @@ __device__ static inline void tensormap_replace_global_stride( B64 new_val); */ #if __cccl_ptx_isa >= 830 -extern "C" _CCCL_DEVICE void __cuda_ptx_tensormap_replace_global_stride_is_not_supported_before_SM_90a__(); -template +extern "C" _CCCL_DEVICE void +__cuda_ptx_tensormap_replace_global_stride_is_not_supported_before_SM_90a_SM_100a_SM_101a__(); +template = true> _CCCL_DEVICE static inline void tensormap_replace_global_stride(space_shared_t, void* __tm_addr, n32_t<_N32> __ord, _B64 __new_val) { // __space == space_shared (due to parameter type constraint) static_assert(sizeof(_B64) == 8, ""); - NV_IF_ELSE_TARGET( - NV_HAS_FEATURE_SM_90a, - (asm("tensormap.replace.tile.global_stride.shared::cta.b1024.b64 [%0], %1, %2;" - : - : "r"(__as_ptr_smem(__tm_addr)), "n"(__ord.value), "l"(__as_b64(__new_val)) - : "memory");), - ( - // Unsupported architectures will have a linker error with a semi-decent error message - __cuda_ptx_tensormap_replace_global_stride_is_not_supported_before_SM_90a__();)); +# if _CCCL_CUDA_COMPILER(NVHPC) || __CUDA_ARCH_FEAT_SM90_ALL || __CUDA_ARCH_FEAT_SM100_ALL \ + || __CUDA_ARCH_FEAT_SM101_ALL + asm("tensormap.replace.tile.global_stride.shared::cta.b1024.b64 [%0], %1, %2;" + : + : "r"(__as_ptr_smem(__tm_addr)), "n"(__ord.value), "l"(__as_b64(__new_val)) + : "memory"); +# else + // Unsupported architectures will have a linker error with a semi-decent error message + __cuda_ptx_tensormap_replace_global_stride_is_not_supported_before_SM_90a_SM_100a_SM_101a__(); +# endif } #endif // __cccl_ptx_isa >= 830 /* -// tensormap.replace.tile.element_stride.space.b1024.b32 [tm_addr], ord, new_val; // PTX ISA 83, SM_90a +// tensormap.replace.tile.element_stride.space.b1024.b32 [tm_addr], ord, new_val; // PTX ISA 83, SM_90a, SM_100a, +SM_101a // .space = { .global } -template +template = true> +__device__ static inline void tensormap_replace_element_stride( + cuda::ptx::space_global_t, + void* tm_addr, + cuda::ptx::n32_t ord, + B32 new_val); +*/ +#if __cccl_ptx_isa >= 830 +extern "C" _CCCL_DEVICE void +__cuda_ptx_tensormap_replace_element_stride_is_not_supported_before_SM_90a_SM_100a_SM_101a__(); +template = true> +_CCCL_DEVICE static inline void +tensormap_replace_element_stride(space_global_t, void* __tm_addr, n32_t<_N32> __ord, _B32 __new_val) +{ + // __space == space_global (due to parameter type constraint) + static_assert(sizeof(_B32) == 4, ""); +# if _CCCL_CUDA_COMPILER(NVHPC) || __CUDA_ARCH_FEAT_SM90_ALL || __CUDA_ARCH_FEAT_SM100_ALL \ + || 
__CUDA_ARCH_FEAT_SM101_ALL + asm("tensormap.replace.tile.element_stride.global.b1024.b32 [%0], %1, %2;" + : + : "l"(__as_ptr_gmem(__tm_addr)), "n"(__ord.value), "r"(__as_b32(__new_val)) + : "memory"); +# else + // Unsupported architectures will have a linker error with a semi-decent error message + __cuda_ptx_tensormap_replace_element_stride_is_not_supported_before_SM_90a_SM_100a_SM_101a__(); +# endif +} +#endif // __cccl_ptx_isa >= 830 + +/* +// tensormap.replace.tile.element_stride.space.b1024.b32 [tm_addr], ord, new_val; // PTX ISA 83, SM_90a, SM_100a, +SM_101a +// .space = { .shared::cta } +template = true> +__device__ static inline void tensormap_replace_element_stride( + cuda::ptx::space_shared_t, + void* tm_addr, + cuda::ptx::n32_t ord, + B32 new_val); +*/ +#if __cccl_ptx_isa >= 830 +extern "C" _CCCL_DEVICE void +__cuda_ptx_tensormap_replace_element_stride_is_not_supported_before_SM_90a_SM_100a_SM_101a__(); +template = true> +_CCCL_DEVICE static inline void +tensormap_replace_element_stride(space_shared_t, void* __tm_addr, n32_t<_N32> __ord, _B32 __new_val) +{ + // __space == space_shared (due to parameter type constraint) + static_assert(sizeof(_B32) == 4, ""); +# if _CCCL_CUDA_COMPILER(NVHPC) || __CUDA_ARCH_FEAT_SM90_ALL || __CUDA_ARCH_FEAT_SM100_ALL \ + || __CUDA_ARCH_FEAT_SM101_ALL + asm("tensormap.replace.tile.element_stride.shared::cta.b1024.b32 [%0], %1, %2;" + : + : "r"(__as_ptr_smem(__tm_addr)), "n"(__ord.value), "r"(__as_b32(__new_val)) + : "memory"); +# else + // Unsupported architectures will have a linker error with a semi-decent error message + __cuda_ptx_tensormap_replace_element_stride_is_not_supported_before_SM_90a_SM_100a_SM_101a__(); +# endif +} +#endif // __cccl_ptx_isa >= 830 + +/* +// tensormap.replace.tile.element_stride.space.b1024.b32 [tm_addr], ord, new_val; // PTX ISA 83, SM_90a, SM_100a, +SM_101a +// .space = { .global } +template = true> __device__ static inline void tensormap_replace_element_size( cuda::ptx::space_global_t, void* tm_addr, @@ -306,29 +387,32 @@ __device__ static inline void tensormap_replace_element_size( B32 new_val); */ #if __cccl_ptx_isa >= 830 -extern "C" _CCCL_DEVICE void __cuda_ptx_tensormap_replace_element_size_is_not_supported_before_SM_90a__(); -template +extern "C" _CCCL_DEVICE void +__cuda_ptx_tensormap_replace_element_size_is_not_supported_before_SM_90a_SM_100a_SM_101a__(); +template = true> _CCCL_DEVICE static inline void tensormap_replace_element_size(space_global_t, void* __tm_addr, n32_t<_N32> __ord, _B32 __new_val) { // __space == space_global (due to parameter type constraint) static_assert(sizeof(_B32) == 4, ""); - NV_IF_ELSE_TARGET( - NV_HAS_FEATURE_SM_90a, - (asm("tensormap.replace.tile.element_stride.global.b1024.b32 [%0], %1, %2;" - : - : "l"(__as_ptr_gmem(__tm_addr)), "n"(__ord.value), "r"(__as_b32(__new_val)) - : "memory");), - ( - // Unsupported architectures will have a linker error with a semi-decent error message - __cuda_ptx_tensormap_replace_element_size_is_not_supported_before_SM_90a__();)); +# if _CCCL_CUDA_COMPILER(NVHPC) || __CUDA_ARCH_FEAT_SM90_ALL || __CUDA_ARCH_FEAT_SM100_ALL \ + || __CUDA_ARCH_FEAT_SM101_ALL + asm("tensormap.replace.tile.element_stride.global.b1024.b32 [%0], %1, %2;" + : + : "l"(__as_ptr_gmem(__tm_addr)), "n"(__ord.value), "r"(__as_b32(__new_val)) + : "memory"); +# else + // Unsupported architectures will have a linker error with a semi-decent error message + __cuda_ptx_tensormap_replace_element_size_is_not_supported_before_SM_90a_SM_100a_SM_101a__(); +# endif } #endif // 
__cccl_ptx_isa >= 830 /* -// tensormap.replace.tile.element_stride.space.b1024.b32 [tm_addr], ord, new_val; // PTX ISA 83, SM_90a +// tensormap.replace.tile.element_stride.space.b1024.b32 [tm_addr], ord, new_val; // PTX ISA 83, SM_90a, SM_100a, +SM_101a // .space = { .shared::cta } -template +template = true> __device__ static inline void tensormap_replace_element_size( cuda::ptx::space_shared_t, void* tm_addr, @@ -336,27 +420,29 @@ __device__ static inline void tensormap_replace_element_size( B32 new_val); */ #if __cccl_ptx_isa >= 830 -extern "C" _CCCL_DEVICE void __cuda_ptx_tensormap_replace_element_size_is_not_supported_before_SM_90a__(); -template +extern "C" _CCCL_DEVICE void +__cuda_ptx_tensormap_replace_element_size_is_not_supported_before_SM_90a_SM_100a_SM_101a__(); +template = true> _CCCL_DEVICE static inline void tensormap_replace_element_size(space_shared_t, void* __tm_addr, n32_t<_N32> __ord, _B32 __new_val) { // __space == space_shared (due to parameter type constraint) static_assert(sizeof(_B32) == 4, ""); - NV_IF_ELSE_TARGET( - NV_HAS_FEATURE_SM_90a, - (asm("tensormap.replace.tile.element_stride.shared::cta.b1024.b32 [%0], %1, %2;" - : - : "r"(__as_ptr_smem(__tm_addr)), "n"(__ord.value), "r"(__as_b32(__new_val)) - : "memory");), - ( - // Unsupported architectures will have a linker error with a semi-decent error message - __cuda_ptx_tensormap_replace_element_size_is_not_supported_before_SM_90a__();)); +# if _CCCL_CUDA_COMPILER(NVHPC) || __CUDA_ARCH_FEAT_SM90_ALL || __CUDA_ARCH_FEAT_SM100_ALL \ + || __CUDA_ARCH_FEAT_SM101_ALL + asm("tensormap.replace.tile.element_stride.shared::cta.b1024.b32 [%0], %1, %2;" + : + : "r"(__as_ptr_smem(__tm_addr)), "n"(__ord.value), "r"(__as_b32(__new_val)) + : "memory"); +# else + // Unsupported architectures will have a linker error with a semi-decent error message + __cuda_ptx_tensormap_replace_element_size_is_not_supported_before_SM_90a_SM_100a_SM_101a__(); +# endif } #endif // __cccl_ptx_isa >= 830 /* -// tensormap.replace.tile.elemtype.space.b1024.b32 [tm_addr], new_val; // PTX ISA 83, SM_90a +// tensormap.replace.tile.elemtype.space.b1024.b32 [tm_addr], new_val; // PTX ISA 83, SM_90a, SM_100a, SM_101a // .space = { .global } template __device__ static inline void tensormap_replace_elemtype( @@ -365,25 +451,26 @@ __device__ static inline void tensormap_replace_elemtype( cuda::ptx::n32_t new_val); */ #if __cccl_ptx_isa >= 830 -extern "C" _CCCL_DEVICE void __cuda_ptx_tensormap_replace_elemtype_is_not_supported_before_SM_90a__(); +extern "C" _CCCL_DEVICE void __cuda_ptx_tensormap_replace_elemtype_is_not_supported_before_SM_90a_SM_100a_SM_101a__(); template _CCCL_DEVICE static inline void tensormap_replace_elemtype(space_global_t, void* __tm_addr, n32_t<_N32> __new_val) { - // __space == space_global (due to parameter type constraint) - NV_IF_ELSE_TARGET( - NV_HAS_FEATURE_SM_90a, - (asm("tensormap.replace.tile.elemtype.global.b1024.b32 [%0], %1;" - : - : "l"(__as_ptr_gmem(__tm_addr)), "n"(__new_val.value) - : "memory");), - ( - // Unsupported architectures will have a linker error with a semi-decent error message - __cuda_ptx_tensormap_replace_elemtype_is_not_supported_before_SM_90a__();)); +// __space == space_global (due to parameter type constraint) +# if _CCCL_CUDA_COMPILER(NVHPC) || __CUDA_ARCH_FEAT_SM90_ALL || __CUDA_ARCH_FEAT_SM100_ALL \ + || __CUDA_ARCH_FEAT_SM101_ALL + asm("tensormap.replace.tile.elemtype.global.b1024.b32 [%0], %1;" + : + : "l"(__as_ptr_gmem(__tm_addr)), "n"(__new_val.value) + : "memory"); +# else + // Unsupported 
architectures will have a linker error with a semi-decent error message + __cuda_ptx_tensormap_replace_elemtype_is_not_supported_before_SM_90a_SM_100a_SM_101a__(); +# endif } #endif // __cccl_ptx_isa >= 830 /* -// tensormap.replace.tile.elemtype.space.b1024.b32 [tm_addr], new_val; // PTX ISA 83, SM_90a +// tensormap.replace.tile.elemtype.space.b1024.b32 [tm_addr], new_val; // PTX ISA 83, SM_90a, SM_100a, SM_101a // .space = { .shared::cta } template __device__ static inline void tensormap_replace_elemtype( @@ -392,25 +479,26 @@ __device__ static inline void tensormap_replace_elemtype( cuda::ptx::n32_t new_val); */ #if __cccl_ptx_isa >= 830 -extern "C" _CCCL_DEVICE void __cuda_ptx_tensormap_replace_elemtype_is_not_supported_before_SM_90a__(); +extern "C" _CCCL_DEVICE void __cuda_ptx_tensormap_replace_elemtype_is_not_supported_before_SM_90a_SM_100a_SM_101a__(); template _CCCL_DEVICE static inline void tensormap_replace_elemtype(space_shared_t, void* __tm_addr, n32_t<_N32> __new_val) { - // __space == space_shared (due to parameter type constraint) - NV_IF_ELSE_TARGET( - NV_HAS_FEATURE_SM_90a, - (asm("tensormap.replace.tile.elemtype.shared::cta.b1024.b32 [%0], %1;" - : - : "r"(__as_ptr_smem(__tm_addr)), "n"(__new_val.value) - : "memory");), - ( - // Unsupported architectures will have a linker error with a semi-decent error message - __cuda_ptx_tensormap_replace_elemtype_is_not_supported_before_SM_90a__();)); +// __space == space_shared (due to parameter type constraint) +# if _CCCL_CUDA_COMPILER(NVHPC) || __CUDA_ARCH_FEAT_SM90_ALL || __CUDA_ARCH_FEAT_SM100_ALL \ + || __CUDA_ARCH_FEAT_SM101_ALL + asm("tensormap.replace.tile.elemtype.shared::cta.b1024.b32 [%0], %1;" + : + : "r"(__as_ptr_smem(__tm_addr)), "n"(__new_val.value) + : "memory"); +# else + // Unsupported architectures will have a linker error with a semi-decent error message + __cuda_ptx_tensormap_replace_elemtype_is_not_supported_before_SM_90a_SM_100a_SM_101a__(); +# endif } #endif // __cccl_ptx_isa >= 830 /* -// tensormap.replace.tile.interleave_layout.space.b1024.b32 [tm_addr], new_val; // PTX ISA 83, SM_90a +// tensormap.replace.tile.interleave_layout.space.b1024.b32 [tm_addr], new_val; // PTX ISA 83, SM_90a, SM_100a, SM_101a // .space = { .global } template __device__ static inline void tensormap_replace_interleave_layout( @@ -419,26 +507,28 @@ __device__ static inline void tensormap_replace_interleave_layout( cuda::ptx::n32_t new_val); */ #if __cccl_ptx_isa >= 830 -extern "C" _CCCL_DEVICE void __cuda_ptx_tensormap_replace_interleave_layout_is_not_supported_before_SM_90a__(); +extern "C" _CCCL_DEVICE void +__cuda_ptx_tensormap_replace_interleave_layout_is_not_supported_before_SM_90a_SM_100a_SM_101a__(); template _CCCL_DEVICE static inline void tensormap_replace_interleave_layout(space_global_t, void* __tm_addr, n32_t<_N32> __new_val) { - // __space == space_global (due to parameter type constraint) - NV_IF_ELSE_TARGET( - NV_HAS_FEATURE_SM_90a, - (asm("tensormap.replace.tile.interleave_layout.global.b1024.b32 [%0], %1;" - : - : "l"(__as_ptr_gmem(__tm_addr)), "n"(__new_val.value) - : "memory");), - ( - // Unsupported architectures will have a linker error with a semi-decent error message - __cuda_ptx_tensormap_replace_interleave_layout_is_not_supported_before_SM_90a__();)); +// __space == space_global (due to parameter type constraint) +# if _CCCL_CUDA_COMPILER(NVHPC) || __CUDA_ARCH_FEAT_SM90_ALL || __CUDA_ARCH_FEAT_SM100_ALL \ + || __CUDA_ARCH_FEAT_SM101_ALL + asm("tensormap.replace.tile.interleave_layout.global.b1024.b32 [%0], 
%1;" + : + : "l"(__as_ptr_gmem(__tm_addr)), "n"(__new_val.value) + : "memory"); +# else + // Unsupported architectures will have a linker error with a semi-decent error message + __cuda_ptx_tensormap_replace_interleave_layout_is_not_supported_before_SM_90a_SM_100a_SM_101a__(); +# endif } #endif // __cccl_ptx_isa >= 830 /* -// tensormap.replace.tile.interleave_layout.space.b1024.b32 [tm_addr], new_val; // PTX ISA 83, SM_90a +// tensormap.replace.tile.interleave_layout.space.b1024.b32 [tm_addr], new_val; // PTX ISA 83, SM_90a, SM_100a, SM_101a // .space = { .shared::cta } template __device__ static inline void tensormap_replace_interleave_layout( @@ -447,26 +537,28 @@ __device__ static inline void tensormap_replace_interleave_layout( cuda::ptx::n32_t new_val); */ #if __cccl_ptx_isa >= 830 -extern "C" _CCCL_DEVICE void __cuda_ptx_tensormap_replace_interleave_layout_is_not_supported_before_SM_90a__(); +extern "C" _CCCL_DEVICE void +__cuda_ptx_tensormap_replace_interleave_layout_is_not_supported_before_SM_90a_SM_100a_SM_101a__(); template _CCCL_DEVICE static inline void tensormap_replace_interleave_layout(space_shared_t, void* __tm_addr, n32_t<_N32> __new_val) { - // __space == space_shared (due to parameter type constraint) - NV_IF_ELSE_TARGET( - NV_HAS_FEATURE_SM_90a, - (asm("tensormap.replace.tile.interleave_layout.shared::cta.b1024.b32 [%0], %1;" - : - : "r"(__as_ptr_smem(__tm_addr)), "n"(__new_val.value) - : "memory");), - ( - // Unsupported architectures will have a linker error with a semi-decent error message - __cuda_ptx_tensormap_replace_interleave_layout_is_not_supported_before_SM_90a__();)); +// __space == space_shared (due to parameter type constraint) +# if _CCCL_CUDA_COMPILER(NVHPC) || __CUDA_ARCH_FEAT_SM90_ALL || __CUDA_ARCH_FEAT_SM100_ALL \ + || __CUDA_ARCH_FEAT_SM101_ALL + asm("tensormap.replace.tile.interleave_layout.shared::cta.b1024.b32 [%0], %1;" + : + : "r"(__as_ptr_smem(__tm_addr)), "n"(__new_val.value) + : "memory"); +# else + // Unsupported architectures will have a linker error with a semi-decent error message + __cuda_ptx_tensormap_replace_interleave_layout_is_not_supported_before_SM_90a_SM_100a_SM_101a__(); +# endif } #endif // __cccl_ptx_isa >= 830 /* -// tensormap.replace.tile.swizzle_mode.space.b1024.b32 [tm_addr], new_val; // PTX ISA 83, SM_90a +// tensormap.replace.tile.swizzle_mode.space.b1024.b32 [tm_addr], new_val; // PTX ISA 83, SM_90a, SM_100a, SM_101a // .space = { .global } template __device__ static inline void tensormap_replace_swizzle_mode( @@ -475,25 +567,27 @@ __device__ static inline void tensormap_replace_swizzle_mode( cuda::ptx::n32_t new_val); */ #if __cccl_ptx_isa >= 830 -extern "C" _CCCL_DEVICE void __cuda_ptx_tensormap_replace_swizzle_mode_is_not_supported_before_SM_90a__(); +extern "C" _CCCL_DEVICE void +__cuda_ptx_tensormap_replace_swizzle_mode_is_not_supported_before_SM_90a_SM_100a_SM_101a__(); template _CCCL_DEVICE static inline void tensormap_replace_swizzle_mode(space_global_t, void* __tm_addr, n32_t<_N32> __new_val) { - // __space == space_global (due to parameter type constraint) - NV_IF_ELSE_TARGET( - NV_HAS_FEATURE_SM_90a, - (asm("tensormap.replace.tile.swizzle_mode.global.b1024.b32 [%0], %1;" - : - : "l"(__as_ptr_gmem(__tm_addr)), "n"(__new_val.value) - : "memory");), - ( - // Unsupported architectures will have a linker error with a semi-decent error message - __cuda_ptx_tensormap_replace_swizzle_mode_is_not_supported_before_SM_90a__();)); +// __space == space_global (due to parameter type constraint) +# if 
_CCCL_CUDA_COMPILER(NVHPC) || __CUDA_ARCH_FEAT_SM90_ALL || __CUDA_ARCH_FEAT_SM100_ALL \ + || __CUDA_ARCH_FEAT_SM101_ALL + asm("tensormap.replace.tile.swizzle_mode.global.b1024.b32 [%0], %1;" + : + : "l"(__as_ptr_gmem(__tm_addr)), "n"(__new_val.value) + : "memory"); +# else + // Unsupported architectures will have a linker error with a semi-decent error message + __cuda_ptx_tensormap_replace_swizzle_mode_is_not_supported_before_SM_90a_SM_100a_SM_101a__(); +# endif } #endif // __cccl_ptx_isa >= 830 /* -// tensormap.replace.tile.swizzle_mode.space.b1024.b32 [tm_addr], new_val; // PTX ISA 83, SM_90a +// tensormap.replace.tile.swizzle_mode.space.b1024.b32 [tm_addr], new_val; // PTX ISA 83, SM_90a, SM_100a, SM_101a // .space = { .shared::cta } template __device__ static inline void tensormap_replace_swizzle_mode( @@ -502,25 +596,27 @@ __device__ static inline void tensormap_replace_swizzle_mode( cuda::ptx::n32_t new_val); */ #if __cccl_ptx_isa >= 830 -extern "C" _CCCL_DEVICE void __cuda_ptx_tensormap_replace_swizzle_mode_is_not_supported_before_SM_90a__(); +extern "C" _CCCL_DEVICE void +__cuda_ptx_tensormap_replace_swizzle_mode_is_not_supported_before_SM_90a_SM_100a_SM_101a__(); template _CCCL_DEVICE static inline void tensormap_replace_swizzle_mode(space_shared_t, void* __tm_addr, n32_t<_N32> __new_val) { - // __space == space_shared (due to parameter type constraint) - NV_IF_ELSE_TARGET( - NV_HAS_FEATURE_SM_90a, - (asm("tensormap.replace.tile.swizzle_mode.shared::cta.b1024.b32 [%0], %1;" - : - : "r"(__as_ptr_smem(__tm_addr)), "n"(__new_val.value) - : "memory");), - ( - // Unsupported architectures will have a linker error with a semi-decent error message - __cuda_ptx_tensormap_replace_swizzle_mode_is_not_supported_before_SM_90a__();)); +// __space == space_shared (due to parameter type constraint) +# if _CCCL_CUDA_COMPILER(NVHPC) || __CUDA_ARCH_FEAT_SM90_ALL || __CUDA_ARCH_FEAT_SM100_ALL \ + || __CUDA_ARCH_FEAT_SM101_ALL + asm("tensormap.replace.tile.swizzle_mode.shared::cta.b1024.b32 [%0], %1;" + : + : "r"(__as_ptr_smem(__tm_addr)), "n"(__new_val.value) + : "memory"); +# else + // Unsupported architectures will have a linker error with a semi-decent error message + __cuda_ptx_tensormap_replace_swizzle_mode_is_not_supported_before_SM_90a_SM_100a_SM_101a__(); +# endif } #endif // __cccl_ptx_isa >= 830 /* -// tensormap.replace.tile.fill_mode.space.b1024.b32 [tm_addr], new_val; // PTX ISA 83, SM_90a +// tensormap.replace.tile.fill_mode.space.b1024.b32 [tm_addr], new_val; // PTX ISA 83, SM_90a, SM_100a, SM_101a // .space = { .global } template __device__ static inline void tensormap_replace_fill_mode( @@ -529,25 +625,26 @@ __device__ static inline void tensormap_replace_fill_mode( cuda::ptx::n32_t new_val); */ #if __cccl_ptx_isa >= 830 -extern "C" _CCCL_DEVICE void __cuda_ptx_tensormap_replace_fill_mode_is_not_supported_before_SM_90a__(); +extern "C" _CCCL_DEVICE void __cuda_ptx_tensormap_replace_fill_mode_is_not_supported_before_SM_90a_SM_100a_SM_101a__(); template _CCCL_DEVICE static inline void tensormap_replace_fill_mode(space_global_t, void* __tm_addr, n32_t<_N32> __new_val) { - // __space == space_global (due to parameter type constraint) - NV_IF_ELSE_TARGET( - NV_HAS_FEATURE_SM_90a, - (asm("tensormap.replace.tile.fill_mode.global.b1024.b32 [%0], %1;" - : - : "l"(__as_ptr_gmem(__tm_addr)), "n"(__new_val.value) - : "memory");), - ( - // Unsupported architectures will have a linker error with a semi-decent error message - 
__cuda_ptx_tensormap_replace_fill_mode_is_not_supported_before_SM_90a__();)); +// __space == space_global (due to parameter type constraint) +# if _CCCL_CUDA_COMPILER(NVHPC) || __CUDA_ARCH_FEAT_SM90_ALL || __CUDA_ARCH_FEAT_SM100_ALL \ + || __CUDA_ARCH_FEAT_SM101_ALL + asm("tensormap.replace.tile.fill_mode.global.b1024.b32 [%0], %1;" + : + : "l"(__as_ptr_gmem(__tm_addr)), "n"(__new_val.value) + : "memory"); +# else + // Unsupported architectures will have a linker error with a semi-decent error message + __cuda_ptx_tensormap_replace_fill_mode_is_not_supported_before_SM_90a_SM_100a_SM_101a__(); +# endif } #endif // __cccl_ptx_isa >= 830 /* -// tensormap.replace.tile.fill_mode.space.b1024.b32 [tm_addr], new_val; // PTX ISA 83, SM_90a +// tensormap.replace.tile.fill_mode.space.b1024.b32 [tm_addr], new_val; // PTX ISA 83, SM_90a, SM_100a, SM_101a // .space = { .shared::cta } template __device__ static inline void tensormap_replace_fill_mode( @@ -556,21 +653,78 @@ __device__ static inline void tensormap_replace_fill_mode( cuda::ptx::n32_t new_val); */ #if __cccl_ptx_isa >= 830 -extern "C" _CCCL_DEVICE void __cuda_ptx_tensormap_replace_fill_mode_is_not_supported_before_SM_90a__(); +extern "C" _CCCL_DEVICE void __cuda_ptx_tensormap_replace_fill_mode_is_not_supported_before_SM_90a_SM_100a_SM_101a__(); template _CCCL_DEVICE static inline void tensormap_replace_fill_mode(space_shared_t, void* __tm_addr, n32_t<_N32> __new_val) { - // __space == space_shared (due to parameter type constraint) - NV_IF_ELSE_TARGET( - NV_HAS_FEATURE_SM_90a, - (asm("tensormap.replace.tile.fill_mode.shared::cta.b1024.b32 [%0], %1;" - : - : "r"(__as_ptr_smem(__tm_addr)), "n"(__new_val.value) - : "memory");), - ( - // Unsupported architectures will have a linker error with a semi-decent error message - __cuda_ptx_tensormap_replace_fill_mode_is_not_supported_before_SM_90a__();)); +// __space == space_shared (due to parameter type constraint) +# if _CCCL_CUDA_COMPILER(NVHPC) || __CUDA_ARCH_FEAT_SM90_ALL || __CUDA_ARCH_FEAT_SM100_ALL \ + || __CUDA_ARCH_FEAT_SM101_ALL + asm("tensormap.replace.tile.fill_mode.shared::cta.b1024.b32 [%0], %1;" + : + : "r"(__as_ptr_smem(__tm_addr)), "n"(__new_val.value) + : "memory"); +# else + // Unsupported architectures will have a linker error with a semi-decent error message + __cuda_ptx_tensormap_replace_fill_mode_is_not_supported_before_SM_90a_SM_100a_SM_101a__(); +# endif } #endif // __cccl_ptx_isa >= 830 +/* +// tensormap.replace.tile.swizzle_atomicity.space.b1024.b32 [tm_addr], new_val; // PTX ISA 86, SM_100a, SM_101a +// .space = { .global } +template +__device__ static inline void tensormap_replace_swizzle_atomicity( + cuda::ptx::space_global_t, + void* tm_addr, + cuda::ptx::n32_t new_val); +*/ +#if __cccl_ptx_isa >= 860 +extern "C" _CCCL_DEVICE void __cuda_ptx_tensormap_replace_swizzle_atomicity_is_not_supported_before_SM_100a_SM_101a__(); +template +_CCCL_DEVICE static inline void +tensormap_replace_swizzle_atomicity(space_global_t, void* __tm_addr, n32_t<_N32> __new_val) +{ +// __space == space_global (due to parameter type constraint) +# if _CCCL_CUDA_COMPILER(NVHPC) || __CUDA_ARCH_FEAT_SM100_ALL || __CUDA_ARCH_FEAT_SM101_ALL + asm("tensormap.replace.tile.swizzle_atomicity.global.b1024.b32 [%0], %1;" + : + : "l"(__as_ptr_gmem(__tm_addr)), "n"(__new_val.value) + : "memory"); +# else + // Unsupported architectures will have a linker error with a semi-decent error message + __cuda_ptx_tensormap_replace_swizzle_atomicity_is_not_supported_before_SM_100a_SM_101a__(); +# endif +} +#endif // 
__cccl_ptx_isa >= 860 + +/* +// tensormap.replace.tile.swizzle_atomicity.space.b1024.b32 [tm_addr], new_val; // PTX ISA 86, SM_100a, SM_101a +// .space = { .shared::cta } +template +__device__ static inline void tensormap_replace_swizzle_atomicity( + cuda::ptx::space_shared_t, + void* tm_addr, + cuda::ptx::n32_t new_val); +*/ +#if __cccl_ptx_isa >= 860 +extern "C" _CCCL_DEVICE void __cuda_ptx_tensormap_replace_swizzle_atomicity_is_not_supported_before_SM_100a_SM_101a__(); +template +_CCCL_DEVICE static inline void +tensormap_replace_swizzle_atomicity(space_shared_t, void* __tm_addr, n32_t<_N32> __new_val) +{ +// __space == space_shared (due to parameter type constraint) +# if _CCCL_CUDA_COMPILER(NVHPC) || __CUDA_ARCH_FEAT_SM100_ALL || __CUDA_ARCH_FEAT_SM101_ALL + asm("tensormap.replace.tile.swizzle_atomicity.shared::cta.b1024.b32 [%0], %1;" + : + : "r"(__as_ptr_smem(__tm_addr)), "n"(__new_val.value) + : "memory"); +# else + // Unsupported architectures will have a linker error with a semi-decent error message + __cuda_ptx_tensormap_replace_swizzle_atomicity_is_not_supported_before_SM_100a_SM_101a__(); +# endif +} +#endif // __cccl_ptx_isa >= 860 + #endif // _CUDA_PTX_GENERATED_TENSORMAP_REPLACE_H_ diff --git a/libcudacxx/test/libcudacxx/cuda/ptx/generated/barrier_cluster_aligned.h b/libcudacxx/test/libcudacxx/cuda/ptx/generated/barrier_cluster_aligned.h new file mode 100644 index 00000000000..6f5a022dbc8 --- /dev/null +++ b/libcudacxx/test/libcudacxx/cuda/ptx/generated/barrier_cluster_aligned.h @@ -0,0 +1,61 @@ +// This file was automatically generated. Do not edit. + +// We use a special strategy to force the generation of the PTX. This is mainly +// a fight against dead-code-elimination in the NVVM layer. +// +// The reason we need this strategy is because certain older versions of ptxas +// segfault when a non-sensical sequence of PTX is generated. So instead, we try +// to force the instantiation and compilation to PTX of all the overloads of the +// PTX wrapping functions. +// +// We do this by writing a function pointer of each overload to the kernel +// parameter `fn_ptr`. +// +// Because `fn_ptr` is possibly visible outside this translation unit, the +// compiler must compile all the functions which are stored. 
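The comment above describes the pattern the generated tests rely on: taking the address of every overload and storing it through the `fn_ptr` kernel parameter makes each wrapper reachable, so the compiler has to emit PTX for all of them instead of dead-code-eliminating the unused ones. A minimal sketch of that idea, with a hypothetical `my_wrapper` overload set standing in for the `cuda::ptx` functions (these names are not part of this patch):

// Hypothetical overload set; the real tests store one pointer per cuda::ptx overload.
__device__ static inline void my_wrapper(int) {}
__device__ static inline void my_wrapper(float) {}

__global__ void test_my_wrapper(void** fn_ptr)
{
  // Writing each overload's address to a kernel parameter keeps it alive,
  // forcing instantiation and PTX generation for its body.
  *fn_ptr++ = reinterpret_cast<void*>(static_cast<void (*)(int)>(my_wrapper));
  *fn_ptr++ = reinterpret_cast<void*>(static_cast<void (*)(float)>(my_wrapper));
}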
+ +__global__ void test_barrier_cluster_aligned(void** fn_ptr) +{ +#if __cccl_ptx_isa >= 780 + NV_IF_TARGET(NV_PROVIDES_SM_90, + ( + // barrier.cluster.arrive.aligned; + * fn_ptr++ = reinterpret_cast( + static_cast(cuda::ptx::barrier_cluster_arrive));)); +#endif // __cccl_ptx_isa >= 780 + +#if __cccl_ptx_isa >= 780 + NV_IF_TARGET(NV_PROVIDES_SM_90, + ( + // barrier.cluster.wait.aligned; + * fn_ptr++ = reinterpret_cast( + static_cast(cuda::ptx::barrier_cluster_wait));)); +#endif // __cccl_ptx_isa >= 780 + +#if __cccl_ptx_isa >= 800 + NV_IF_TARGET( + NV_PROVIDES_SM_90, + ( + // barrier.cluster.arrive.release.aligned; + * fn_ptr++ = reinterpret_cast(static_cast( + cuda::ptx::barrier_cluster_arrive));)); +#endif // __cccl_ptx_isa >= 800 + +#if __cccl_ptx_isa >= 800 + NV_IF_TARGET( + NV_PROVIDES_SM_90, + ( + // barrier.cluster.arrive.relaxed.aligned; + * fn_ptr++ = reinterpret_cast(static_cast( + cuda::ptx::barrier_cluster_arrive));)); +#endif // __cccl_ptx_isa >= 800 + +#if __cccl_ptx_isa >= 800 + NV_IF_TARGET( + NV_PROVIDES_SM_90, + ( + // barrier.cluster.wait.acquire.aligned; + * fn_ptr++ = reinterpret_cast(static_cast( + cuda::ptx::barrier_cluster_wait));)); +#endif // __cccl_ptx_isa >= 800 +} diff --git a/libcudacxx/test/libcudacxx/cuda/ptx/generated/clusterlaunchcontrol.h b/libcudacxx/test/libcudacxx/cuda/ptx/generated/clusterlaunchcontrol.h new file mode 100644 index 00000000000..c5df06bc787 --- /dev/null +++ b/libcudacxx/test/libcudacxx/cuda/ptx/generated/clusterlaunchcontrol.h @@ -0,0 +1,84 @@ +// This file was automatically generated. Do not edit. + +// We use a special strategy to force the generation of the PTX. This is mainly +// a fight against dead-code-elimination in the NVVM layer. +// +// The reason we need this strategy is because certain older versions of ptxas +// segfault when a non-sensical sequence of PTX is generated. So instead, we try +// to force the instantiation and compilation to PTX of all the overloads of the +// PTX wrapping functions. +// +// We do this by writing a function pointer of each overload to the kernel +// parameter `fn_ptr`. +// +// Because `fn_ptr` is possibly visible outside this translation unit, the +// compiler must compile all the functions which are stored. 
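The wrappers these tests instantiate no longer dispatch through NV_IF_ELSE_TARGET; as the regenerated headers earlier in this patch show, they guard the inline asm with preprocessor feature checks and fall back to a declared-but-never-defined extern "C" function, so a call compiled for an unsupported architecture fails at link time with the unsupported-feature name spelled out in the missing symbol. A minimal sketch of that shape, using hypothetical names (`my_instruction`, `MY_ARCH_FEATURE`) rather than the real `cuda::ptx` entry points or `__CUDA_ARCH_FEAT_*` macros:

// Declared but never defined: the reference compiles away on supported
// architectures, while unsupported builds get a descriptive link error.
extern "C" __device__ void my_instruction_is_not_supported_on_this_architecture__();

__device__ static inline void my_instruction()
{
#if defined(MY_ARCH_FEATURE)
  // The real wrappers place the PTX instruction here.
  asm volatile("" ::: "memory");
#else
  my_instruction_is_not_supported_on_this_architecture__();
#endif
}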
+ +__global__ void test_clusterlaunchcontrol(void** fn_ptr) +{ +#if __cccl_ptx_isa >= 860 + NV_IF_TARGET(NV_PROVIDES_SM_100, + ( + // clusterlaunchcontrol.try_cancel.async.shared::cta.mbarrier::complete_tx::bytes.b128 [addr], + // [smem_bar]; + * fn_ptr++ = reinterpret_cast( + static_cast(cuda::ptx::clusterlaunchcontrol_try_cancel));)); +#endif // __cccl_ptx_isa >= 860 + +#if __cccl_ptx_isa >= 860 + NV_IF_TARGET( + NV_HAS_FEATURE_SM_100a, + ( + // clusterlaunchcontrol.try_cancel.async.shared::cta.mbarrier::complete_tx::bytes.multicast::cluster::all.b128 + // [addr], [smem_bar]; + * fn_ptr++ = reinterpret_cast( + static_cast(cuda::ptx::clusterlaunchcontrol_try_cancel_multicast));)); + NV_IF_TARGET( + NV_HAS_FEATURE_SM_101a, + ( + // clusterlaunchcontrol.try_cancel.async.shared::cta.mbarrier::complete_tx::bytes.multicast::cluster::all.b128 + // [addr], [smem_bar]; + * fn_ptr++ = reinterpret_cast( + static_cast(cuda::ptx::clusterlaunchcontrol_try_cancel_multicast));)); +#endif // __cccl_ptx_isa >= 860 + +#if __cccl_ptx_isa >= 860 + NV_IF_TARGET(NV_PROVIDES_SM_100, + ( + // clusterlaunchcontrol.query_cancel.is_canceled.pred.b128 pred_is_canceled, try_cancel_response; + * fn_ptr++ = reinterpret_cast( + static_cast(cuda::ptx::clusterlaunchcontrol_query_cancel_is_canceled));)); +#endif // __cccl_ptx_isa >= 860 + +#if __cccl_ptx_isa >= 860 + NV_IF_TARGET(NV_PROVIDES_SM_100, + ( + // clusterlaunchcontrol.query_cancel.get_first_ctaid::x.b32.b128 ret_dim, try_cancel_response; + * fn_ptr++ = reinterpret_cast(static_cast( + cuda::ptx::clusterlaunchcontrol_query_cancel_get_first_ctaid_x));)); +#endif // __cccl_ptx_isa >= 860 + +#if __cccl_ptx_isa >= 860 + NV_IF_TARGET(NV_PROVIDES_SM_100, + ( + // clusterlaunchcontrol.query_cancel.get_first_ctaid::y.b32.b128 ret_dim, try_cancel_response; + * fn_ptr++ = reinterpret_cast(static_cast( + cuda::ptx::clusterlaunchcontrol_query_cancel_get_first_ctaid_y));)); +#endif // __cccl_ptx_isa >= 860 + +#if __cccl_ptx_isa >= 860 + NV_IF_TARGET(NV_PROVIDES_SM_100, + ( + // clusterlaunchcontrol.query_cancel.get_first_ctaid::z.b32.b128 ret_dim, try_cancel_response; + * fn_ptr++ = reinterpret_cast(static_cast( + cuda::ptx::clusterlaunchcontrol_query_cancel_get_first_ctaid_z));)); +#endif // __cccl_ptx_isa >= 860 + +#if __cccl_ptx_isa >= 860 + NV_IF_TARGET(NV_PROVIDES_SM_100, + ( + // clusterlaunchcontrol.query_cancel.get_first_ctaid.v4.b32.b128 block_dim, try_cancel_response; + * fn_ptr++ = reinterpret_cast(static_cast( + cuda::ptx::clusterlaunchcontrol_query_cancel_get_first_ctaid));)); +#endif // __cccl_ptx_isa >= 860 +} diff --git a/libcudacxx/test/libcudacxx/cuda/ptx/generated/cp_async_bulk.h b/libcudacxx/test/libcudacxx/cuda/ptx/generated/cp_async_bulk.h index a342954591a..de118140440 100644 --- a/libcudacxx/test/libcudacxx/cuda/ptx/generated/cp_async_bulk.h +++ b/libcudacxx/test/libcudacxx/cuda/ptx/generated/cp_async_bulk.h @@ -20,20 +20,30 @@ __global__ void test_cp_async_bulk(void** fn_ptr) NV_IF_TARGET( NV_PROVIDES_SM_90, ( - // cp.async.bulk.shared::cluster.global.mbarrier::complete_tx::bytes [dstMem], [srcMem], size, [smem_bar]; // - // 1a. 
unicast + // cp.async.bulk.shared::cluster.global.mbarrier::complete_tx::bytes [dstMem], [srcMem], size, [smem_bar]; * fn_ptr++ = reinterpret_cast( static_cast( cuda::ptx::cp_async_bulk));)); #endif // __cccl_ptx_isa >= 800 +#if __cccl_ptx_isa >= 860 + NV_IF_TARGET( + NV_PROVIDES_SM_90, + ( + // cp.async.bulk.shared::cta.global.mbarrier::complete_tx::bytes [dstMem], [srcMem], size, [smem_bar]; + * fn_ptr++ = reinterpret_cast( + static_cast( + cuda::ptx::cp_async_bulk));)); +#endif // __cccl_ptx_isa >= 860 + #if __cccl_ptx_isa >= 800 NV_IF_TARGET( NV_PROVIDES_SM_90, ( // cp.async.bulk.shared::cluster.shared::cta.mbarrier::complete_tx::bytes [dstMem], [srcMem], size, - // [rdsmem_bar]; // 2. + // [rdsmem_bar]; * fn_ptr++ = reinterpret_cast( static_cast( @@ -44,10 +54,21 @@ __global__ void test_cp_async_bulk(void** fn_ptr) NV_IF_TARGET( NV_PROVIDES_SM_90, ( - // cp.async.bulk.global.shared::cta.bulk_group [dstMem], [srcMem], size; // 3. + // cp.async.bulk.global.shared::cta.bulk_group [dstMem], [srcMem], size; * fn_ptr++ = reinterpret_cast( static_cast( cuda::ptx::cp_async_bulk));)); #endif // __cccl_ptx_isa >= 800 + +#if __cccl_ptx_isa >= 860 + NV_IF_TARGET( + NV_PROVIDES_SM_100, + ( + // cp.async.bulk.global.shared::cta.bulk_group.cp_mask [dstMem], [srcMem], size, byteMask; + * fn_ptr++ = reinterpret_cast( + static_cast( + cuda::ptx::cp_async_bulk_cp_mask));)); +#endif // __cccl_ptx_isa >= 860 } diff --git a/libcudacxx/test/libcudacxx/cuda/ptx/generated/cp_async_bulk_multicast.h b/libcudacxx/test/libcudacxx/cuda/ptx/generated/cp_async_bulk_multicast.h index 6e2a986e7bd..81298beb481 100644 --- a/libcudacxx/test/libcudacxx/cuda/ptx/generated/cp_async_bulk_multicast.h +++ b/libcudacxx/test/libcudacxx/cuda/ptx/generated/cp_async_bulk_multicast.h @@ -21,7 +21,33 @@ __global__ void test_cp_async_bulk_multicast(void** fn_ptr) NV_HAS_FEATURE_SM_90a, ( // cp.async.bulk.shared::cluster.global.mbarrier::complete_tx::bytes.multicast::cluster [dstMem], [srcMem], - // size, [smem_bar], ctaMask; // 1. 
+ // size, [smem_bar], ctaMask; + * fn_ptr++ = reinterpret_cast( + static_cast(cuda::ptx::cp_async_bulk));)); + NV_IF_TARGET( + NV_HAS_FEATURE_SM_100a, + ( + // cp.async.bulk.shared::cluster.global.mbarrier::complete_tx::bytes.multicast::cluster [dstMem], [srcMem], + // size, [smem_bar], ctaMask; + * fn_ptr++ = reinterpret_cast( + static_cast(cuda::ptx::cp_async_bulk));)); + NV_IF_TARGET( + NV_HAS_FEATURE_SM_101a, + ( + // cp.async.bulk.shared::cluster.global.mbarrier::complete_tx::bytes.multicast::cluster [dstMem], [srcMem], + // size, [smem_bar], ctaMask; * fn_ptr++ = reinterpret_cast( static_cast( static_cast( cuda::ptx::cp_async_bulk_tensor));)); #endif // __cccl_ptx_isa >= 800 +#if __cccl_ptx_isa >= 860 + NV_IF_TARGET( + NV_PROVIDES_SM_90, + ( + // cp.async.bulk.tensor.1d.shared::cta.global.tile.mbarrier::complete_tx::bytes [dstMem], [tensorMap, + // tensorCoords], [smem_bar]; + * fn_ptr++ = reinterpret_cast( + static_cast( + cuda::ptx::cp_async_bulk_tensor));)); +#endif // __cccl_ptx_isa >= 860 + +#if __cccl_ptx_isa >= 860 + NV_IF_TARGET( + NV_HAS_FEATURE_SM_100a, + ( + // cp.async.bulk.tensor.1d.shared::cta.global.tile.mbarrier::complete_tx::bytes.cta_group::1 [dstMem], + // [tensorMap, tensorCoords], [smem_bar]; + * fn_ptr++ = reinterpret_cast( + static_cast(cuda::ptx::cp_async_bulk_tensor)); + // cp.async.bulk.tensor.1d.shared::cta.global.tile.mbarrier::complete_tx::bytes.cta_group::2 [dstMem], + // [tensorMap, tensorCoords], [smem_bar]; + * fn_ptr++ = reinterpret_cast( + static_cast(cuda::ptx::cp_async_bulk_tensor));)); + NV_IF_TARGET( + NV_HAS_FEATURE_SM_101a, + ( + // cp.async.bulk.tensor.1d.shared::cta.global.tile.mbarrier::complete_tx::bytes.cta_group::1 [dstMem], + // [tensorMap, tensorCoords], [smem_bar]; + * fn_ptr++ = reinterpret_cast( + static_cast(cuda::ptx::cp_async_bulk_tensor)); + // cp.async.bulk.tensor.1d.shared::cta.global.tile.mbarrier::complete_tx::bytes.cta_group::2 [dstMem], + // [tensorMap, tensorCoords], [smem_bar]; + * fn_ptr++ = reinterpret_cast( + static_cast(cuda::ptx::cp_async_bulk_tensor));)); +#endif // __cccl_ptx_isa >= 860 + #if __cccl_ptx_isa >= 800 NV_IF_TARGET( NV_PROVIDES_SM_90, ( - // cp.async.bulk.tensor.1d.global.shared::cta.tile.bulk_group [tensorMap, tensorCoords], [srcMem]; // 3a. + // cp.async.bulk.tensor.1d.global.shared::cta.tile.bulk_group [tensorMap, tensorCoords], [srcMem]; * fn_ptr++ = reinterpret_cast( static_cast( @@ -44,18 +105,79 @@ __global__ void test_cp_async_bulk_tensor(void** fn_ptr) NV_PROVIDES_SM_90, ( // cp.async.bulk.tensor.2d.shared::cluster.global.tile.mbarrier::complete_tx::bytes [dstMem], [tensorMap, - // tensorCoords], [smem_bar];// 1b. 
+ // tensorCoords], [smem_bar]; * fn_ptr++ = reinterpret_cast( static_cast( cuda::ptx::cp_async_bulk_tensor));)); #endif // __cccl_ptx_isa >= 800 +#if __cccl_ptx_isa >= 860 + NV_IF_TARGET( + NV_PROVIDES_SM_90, + ( + // cp.async.bulk.tensor.2d.shared::cta.global.tile.mbarrier::complete_tx::bytes [dstMem], [tensorMap, + // tensorCoords], [smem_bar]; + * fn_ptr++ = reinterpret_cast( + static_cast( + cuda::ptx::cp_async_bulk_tensor));)); +#endif // __cccl_ptx_isa >= 860 + +#if __cccl_ptx_isa >= 860 + NV_IF_TARGET( + NV_HAS_FEATURE_SM_100a, + ( + // cp.async.bulk.tensor.2d.shared::cta.global.tile.mbarrier::complete_tx::bytes.cta_group::1 [dstMem], + // [tensorMap, tensorCoords], [smem_bar]; + * fn_ptr++ = reinterpret_cast( + static_cast(cuda::ptx::cp_async_bulk_tensor)); + // cp.async.bulk.tensor.2d.shared::cta.global.tile.mbarrier::complete_tx::bytes.cta_group::2 [dstMem], + // [tensorMap, tensorCoords], [smem_bar]; + * fn_ptr++ = reinterpret_cast( + static_cast(cuda::ptx::cp_async_bulk_tensor));)); + NV_IF_TARGET( + NV_HAS_FEATURE_SM_101a, + ( + // cp.async.bulk.tensor.2d.shared::cta.global.tile.mbarrier::complete_tx::bytes.cta_group::1 [dstMem], + // [tensorMap, tensorCoords], [smem_bar]; + * fn_ptr++ = reinterpret_cast( + static_cast(cuda::ptx::cp_async_bulk_tensor)); + // cp.async.bulk.tensor.2d.shared::cta.global.tile.mbarrier::complete_tx::bytes.cta_group::2 [dstMem], + // [tensorMap, tensorCoords], [smem_bar]; + * fn_ptr++ = reinterpret_cast( + static_cast(cuda::ptx::cp_async_bulk_tensor));)); +#endif // __cccl_ptx_isa >= 860 + #if __cccl_ptx_isa >= 800 NV_IF_TARGET( NV_PROVIDES_SM_90, ( - // cp.async.bulk.tensor.2d.global.shared::cta.tile.bulk_group [tensorMap, tensorCoords], [srcMem]; // 3b. + // cp.async.bulk.tensor.2d.global.shared::cta.tile.bulk_group [tensorMap, tensorCoords], [srcMem]; * fn_ptr++ = reinterpret_cast( static_cast( @@ -67,18 +189,79 @@ __global__ void test_cp_async_bulk_tensor(void** fn_ptr) NV_PROVIDES_SM_90, ( // cp.async.bulk.tensor.3d.shared::cluster.global.tile.mbarrier::complete_tx::bytes [dstMem], [tensorMap, - // tensorCoords], [smem_bar];// 1c. 
+ // tensorCoords], [smem_bar]; * fn_ptr++ = reinterpret_cast( static_cast( cuda::ptx::cp_async_bulk_tensor));)); #endif // __cccl_ptx_isa >= 800 +#if __cccl_ptx_isa >= 860 + NV_IF_TARGET( + NV_PROVIDES_SM_90, + ( + // cp.async.bulk.tensor.3d.shared::cta.global.tile.mbarrier::complete_tx::bytes [dstMem], [tensorMap, + // tensorCoords], [smem_bar]; + * fn_ptr++ = reinterpret_cast( + static_cast( + cuda::ptx::cp_async_bulk_tensor));)); +#endif // __cccl_ptx_isa >= 860 + +#if __cccl_ptx_isa >= 860 + NV_IF_TARGET( + NV_HAS_FEATURE_SM_100a, + ( + // cp.async.bulk.tensor.3d.shared::cta.global.tile.mbarrier::complete_tx::bytes.cta_group::1 [dstMem], + // [tensorMap, tensorCoords], [smem_bar]; + * fn_ptr++ = reinterpret_cast( + static_cast(cuda::ptx::cp_async_bulk_tensor)); + // cp.async.bulk.tensor.3d.shared::cta.global.tile.mbarrier::complete_tx::bytes.cta_group::2 [dstMem], + // [tensorMap, tensorCoords], [smem_bar]; + * fn_ptr++ = reinterpret_cast( + static_cast(cuda::ptx::cp_async_bulk_tensor));)); + NV_IF_TARGET( + NV_HAS_FEATURE_SM_101a, + ( + // cp.async.bulk.tensor.3d.shared::cta.global.tile.mbarrier::complete_tx::bytes.cta_group::1 [dstMem], + // [tensorMap, tensorCoords], [smem_bar]; + * fn_ptr++ = reinterpret_cast( + static_cast(cuda::ptx::cp_async_bulk_tensor)); + // cp.async.bulk.tensor.3d.shared::cta.global.tile.mbarrier::complete_tx::bytes.cta_group::2 [dstMem], + // [tensorMap, tensorCoords], [smem_bar]; + * fn_ptr++ = reinterpret_cast( + static_cast(cuda::ptx::cp_async_bulk_tensor));)); +#endif // __cccl_ptx_isa >= 860 + #if __cccl_ptx_isa >= 800 NV_IF_TARGET( NV_PROVIDES_SM_90, ( - // cp.async.bulk.tensor.3d.global.shared::cta.tile.bulk_group [tensorMap, tensorCoords], [srcMem]; // 3c. + // cp.async.bulk.tensor.3d.global.shared::cta.tile.bulk_group [tensorMap, tensorCoords], [srcMem]; * fn_ptr++ = reinterpret_cast( static_cast( @@ -90,18 +273,79 @@ __global__ void test_cp_async_bulk_tensor(void** fn_ptr) NV_PROVIDES_SM_90, ( // cp.async.bulk.tensor.4d.shared::cluster.global.tile.mbarrier::complete_tx::bytes [dstMem], [tensorMap, - // tensorCoords], [smem_bar];// 1d. 
+ // tensorCoords], [smem_bar]; * fn_ptr++ = reinterpret_cast( static_cast( cuda::ptx::cp_async_bulk_tensor));)); #endif // __cccl_ptx_isa >= 800 +#if __cccl_ptx_isa >= 860 + NV_IF_TARGET( + NV_PROVIDES_SM_90, + ( + // cp.async.bulk.tensor.4d.shared::cta.global.tile.mbarrier::complete_tx::bytes [dstMem], [tensorMap, + // tensorCoords], [smem_bar]; + * fn_ptr++ = reinterpret_cast( + static_cast( + cuda::ptx::cp_async_bulk_tensor));)); +#endif // __cccl_ptx_isa >= 860 + +#if __cccl_ptx_isa >= 860 + NV_IF_TARGET( + NV_HAS_FEATURE_SM_100a, + ( + // cp.async.bulk.tensor.4d.shared::cta.global.tile.mbarrier::complete_tx::bytes.cta_group::1 [dstMem], + // [tensorMap, tensorCoords], [smem_bar]; + * fn_ptr++ = reinterpret_cast( + static_cast(cuda::ptx::cp_async_bulk_tensor)); + // cp.async.bulk.tensor.4d.shared::cta.global.tile.mbarrier::complete_tx::bytes.cta_group::2 [dstMem], + // [tensorMap, tensorCoords], [smem_bar]; + * fn_ptr++ = reinterpret_cast( + static_cast(cuda::ptx::cp_async_bulk_tensor));)); + NV_IF_TARGET( + NV_HAS_FEATURE_SM_101a, + ( + // cp.async.bulk.tensor.4d.shared::cta.global.tile.mbarrier::complete_tx::bytes.cta_group::1 [dstMem], + // [tensorMap, tensorCoords], [smem_bar]; + * fn_ptr++ = reinterpret_cast( + static_cast(cuda::ptx::cp_async_bulk_tensor)); + // cp.async.bulk.tensor.4d.shared::cta.global.tile.mbarrier::complete_tx::bytes.cta_group::2 [dstMem], + // [tensorMap, tensorCoords], [smem_bar]; + * fn_ptr++ = reinterpret_cast( + static_cast(cuda::ptx::cp_async_bulk_tensor));)); +#endif // __cccl_ptx_isa >= 860 + #if __cccl_ptx_isa >= 800 NV_IF_TARGET( NV_PROVIDES_SM_90, ( - // cp.async.bulk.tensor.4d.global.shared::cta.tile.bulk_group [tensorMap, tensorCoords], [srcMem]; // 3d. + // cp.async.bulk.tensor.4d.global.shared::cta.tile.bulk_group [tensorMap, tensorCoords], [srcMem]; * fn_ptr++ = reinterpret_cast( static_cast( @@ -113,18 +357,79 @@ __global__ void test_cp_async_bulk_tensor(void** fn_ptr) NV_PROVIDES_SM_90, ( // cp.async.bulk.tensor.5d.shared::cluster.global.tile.mbarrier::complete_tx::bytes [dstMem], [tensorMap, - // tensorCoords], [smem_bar];// 1e. 
+ // tensorCoords], [smem_bar]; * fn_ptr++ = reinterpret_cast( static_cast( cuda::ptx::cp_async_bulk_tensor));)); #endif // __cccl_ptx_isa >= 800 +#if __cccl_ptx_isa >= 860 + NV_IF_TARGET( + NV_PROVIDES_SM_90, + ( + // cp.async.bulk.tensor.5d.shared::cta.global.tile.mbarrier::complete_tx::bytes [dstMem], [tensorMap, + // tensorCoords], [smem_bar]; + * fn_ptr++ = reinterpret_cast( + static_cast( + cuda::ptx::cp_async_bulk_tensor));)); +#endif // __cccl_ptx_isa >= 860 + +#if __cccl_ptx_isa >= 860 + NV_IF_TARGET( + NV_HAS_FEATURE_SM_100a, + ( + // cp.async.bulk.tensor.5d.shared::cta.global.tile.mbarrier::complete_tx::bytes.cta_group::1 [dstMem], + // [tensorMap, tensorCoords], [smem_bar]; + * fn_ptr++ = reinterpret_cast( + static_cast(cuda::ptx::cp_async_bulk_tensor)); + // cp.async.bulk.tensor.5d.shared::cta.global.tile.mbarrier::complete_tx::bytes.cta_group::2 [dstMem], + // [tensorMap, tensorCoords], [smem_bar]; + * fn_ptr++ = reinterpret_cast( + static_cast(cuda::ptx::cp_async_bulk_tensor));)); + NV_IF_TARGET( + NV_HAS_FEATURE_SM_101a, + ( + // cp.async.bulk.tensor.5d.shared::cta.global.tile.mbarrier::complete_tx::bytes.cta_group::1 [dstMem], + // [tensorMap, tensorCoords], [smem_bar]; + * fn_ptr++ = reinterpret_cast( + static_cast(cuda::ptx::cp_async_bulk_tensor)); + // cp.async.bulk.tensor.5d.shared::cta.global.tile.mbarrier::complete_tx::bytes.cta_group::2 [dstMem], + // [tensorMap, tensorCoords], [smem_bar]; + * fn_ptr++ = reinterpret_cast( + static_cast(cuda::ptx::cp_async_bulk_tensor));)); +#endif // __cccl_ptx_isa >= 860 + #if __cccl_ptx_isa >= 800 NV_IF_TARGET( NV_PROVIDES_SM_90, ( - // cp.async.bulk.tensor.5d.global.shared::cta.tile.bulk_group [tensorMap, tensorCoords], [srcMem]; // 3e. + // cp.async.bulk.tensor.5d.global.shared::cta.tile.bulk_group [tensorMap, tensorCoords], [srcMem]; * fn_ptr++ = reinterpret_cast( static_cast( diff --git a/libcudacxx/test/libcudacxx/cuda/ptx/generated/cp_async_bulk_tensor_gather_scatter.h b/libcudacxx/test/libcudacxx/cuda/ptx/generated/cp_async_bulk_tensor_gather_scatter.h new file mode 100644 index 00000000000..930cfa09125 --- /dev/null +++ b/libcudacxx/test/libcudacxx/cuda/ptx/generated/cp_async_bulk_tensor_gather_scatter.h @@ -0,0 +1,180 @@ +// This file was automatically generated. Do not edit. + +// We use a special strategy to force the generation of the PTX. This is mainly +// a fight against dead-code-elimination in the NVVM layer. +// +// The reason we need this strategy is because certain older versions of ptxas +// segfault when a non-sensical sequence of PTX is generated. So instead, we try +// to force the instantiation and compilation to PTX of all the overloads of the +// PTX wrapping functions. +// +// We do this by writing a function pointer of each overload to the kernel +// parameter `fn_ptr`. +// +// Because `fn_ptr` is possibly visible outside this translation unit, the +// compiler must compile all the functions which are stored. 
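The strategy described in this header boils down to a short, self-contained sketch. The `probe` overload set and the kernel name below are purely illustrative and are not part of the generated tests; the real tests store the cuda::ptx wrapper overloads in exactly the same way.

// Minimal sketch of the fn_ptr technique; `probe` is a hypothetical overload set.
#include <cstdint>

__device__ static inline void probe(std::uint32_t) {}
__device__ static inline void probe(std::uint64_t) {}

__global__ void force_probe_instantiation(void** fn_ptr)
{
  // Selecting each overload with static_cast and storing its address through
  // the externally visible kernel parameter keeps NVVM from dead-code
  // eliminating it, so PTX is emitted for every overload.
  *fn_ptr++ = reinterpret_cast<void*>(static_cast<void (*)(std::uint32_t)>(probe));
  *fn_ptr++ = reinterpret_cast<void*>(static_cast<void (*)(std::uint64_t)>(probe));
}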
+ +__global__ void test_cp_async_bulk_tensor_gather_scatter(void** fn_ptr) +{ +#if __cccl_ptx_isa >= 860 + NV_IF_TARGET( + NV_PROVIDES_SM_100, + ( + // cp.async.bulk.tensor.2d.shared::cta.global.tile::gather4.mbarrier::complete_tx::bytes [dstMem], [tensorMap, + // tensorCoords], [smem_bar]; + * fn_ptr++ = reinterpret_cast( + static_cast( + cuda::ptx::cp_async_bulk_tensor_tile_gather4));)); +#endif // __cccl_ptx_isa >= 860 + +#if __cccl_ptx_isa >= 860 + NV_IF_TARGET( + NV_HAS_FEATURE_SM_100a, + ( + // cp.async.bulk.tensor.2d.shared::cta.global.tile::gather4.mbarrier::complete_tx::bytes.cta_group::1 [dstMem], + // [tensorMap, tensorCoords], [smem_bar]; + * fn_ptr++ = reinterpret_cast( + static_cast(cuda::ptx::cp_async_bulk_tensor_tile_gather4)); + // cp.async.bulk.tensor.2d.shared::cta.global.tile::gather4.mbarrier::complete_tx::bytes.cta_group::2 + // [dstMem], [tensorMap, tensorCoords], [smem_bar]; + * fn_ptr++ = reinterpret_cast( + static_cast(cuda::ptx::cp_async_bulk_tensor_tile_gather4));)); + NV_IF_TARGET( + NV_HAS_FEATURE_SM_101a, + ( + // cp.async.bulk.tensor.2d.shared::cta.global.tile::gather4.mbarrier::complete_tx::bytes.cta_group::1 [dstMem], + // [tensorMap, tensorCoords], [smem_bar]; + * fn_ptr++ = reinterpret_cast( + static_cast(cuda::ptx::cp_async_bulk_tensor_tile_gather4)); + // cp.async.bulk.tensor.2d.shared::cta.global.tile::gather4.mbarrier::complete_tx::bytes.cta_group::2 + // [dstMem], [tensorMap, tensorCoords], [smem_bar]; + * fn_ptr++ = reinterpret_cast( + static_cast(cuda::ptx::cp_async_bulk_tensor_tile_gather4));)); +#endif // __cccl_ptx_isa >= 860 + +#if __cccl_ptx_isa >= 860 + NV_IF_TARGET( + NV_HAS_FEATURE_SM_100a, + ( + // cp.async.bulk.tensor.2d.shared::cluster.global.tile::gather4.mbarrier::complete_tx::bytes.multicast::cluster + // [dstMem], [tensorMap, tensorCoords], [smem_bar], ctaMask; + * fn_ptr++ = reinterpret_cast( + static_cast(cuda::ptx::cp_async_bulk_tensor_tile_gather4));)); + NV_IF_TARGET( + NV_HAS_FEATURE_SM_101a, + ( + // cp.async.bulk.tensor.2d.shared::cluster.global.tile::gather4.mbarrier::complete_tx::bytes.multicast::cluster + // [dstMem], [tensorMap, tensorCoords], [smem_bar], ctaMask; + * fn_ptr++ = reinterpret_cast( + static_cast(cuda::ptx::cp_async_bulk_tensor_tile_gather4));)); +#endif // __cccl_ptx_isa >= 860 + +#if __cccl_ptx_isa >= 860 + NV_IF_TARGET( + NV_HAS_FEATURE_SM_100a, + ( + // cp.async.bulk.tensor.2d.shared::cluster.global.tile::gather4.mbarrier::complete_tx::bytes.multicast::cluster.cta_group::1 + // [dstMem], [tensorMap, tensorCoords], [smem_bar], ctaMask; + * fn_ptr++ = reinterpret_cast( + static_cast(cuda::ptx::cp_async_bulk_tensor_tile_gather4)); + // cp.async.bulk.tensor.2d.shared::cluster.global.tile::gather4.mbarrier::complete_tx::bytes.multicast::cluster.cta_group::2 + // [dstMem], [tensorMap, tensorCoords], [smem_bar], ctaMask; + * fn_ptr++ = reinterpret_cast( + static_cast(cuda::ptx::cp_async_bulk_tensor_tile_gather4));)); + NV_IF_TARGET( + NV_HAS_FEATURE_SM_101a, + ( + // cp.async.bulk.tensor.2d.shared::cluster.global.tile::gather4.mbarrier::complete_tx::bytes.multicast::cluster.cta_group::1 + // [dstMem], [tensorMap, tensorCoords], [smem_bar], ctaMask; + * fn_ptr++ = reinterpret_cast( + static_cast(cuda::ptx::cp_async_bulk_tensor_tile_gather4)); + // cp.async.bulk.tensor.2d.shared::cluster.global.tile::gather4.mbarrier::complete_tx::bytes.multicast::cluster.cta_group::2 + // [dstMem], [tensorMap, tensorCoords], [smem_bar], ctaMask; + * fn_ptr++ = reinterpret_cast( + 
static_cast(cuda::ptx::cp_async_bulk_tensor_tile_gather4));)); +#endif // __cccl_ptx_isa >= 860 + +#if __cccl_ptx_isa >= 800 + NV_IF_TARGET( + NV_HAS_FEATURE_SM_100a, + ( + // cp.async.bulk.tensor.2d.global.shared::cta.tile::scatter4.bulk_group [tensorMap, tensorCoords], [srcMem]; + * fn_ptr++ = reinterpret_cast( + static_cast( + cuda::ptx::cp_async_bulk_tensor_tile_scatter4));)); + NV_IF_TARGET( + NV_HAS_FEATURE_SM_101a, + ( + // cp.async.bulk.tensor.2d.global.shared::cta.tile::scatter4.bulk_group [tensorMap, tensorCoords], [srcMem]; + * fn_ptr++ = reinterpret_cast( + static_cast( + cuda::ptx::cp_async_bulk_tensor_tile_scatter4));)); +#endif // __cccl_ptx_isa >= 800 +} diff --git a/libcudacxx/test/libcudacxx/cuda/ptx/generated/cp_async_bulk_tensor_multicast.h b/libcudacxx/test/libcudacxx/cuda/ptx/generated/cp_async_bulk_tensor_multicast.h index 617bc9507bd..3f3a08764d2 100644 --- a/libcudacxx/test/libcudacxx/cuda/ptx/generated/cp_async_bulk_tensor_multicast.h +++ b/libcudacxx/test/libcudacxx/cuda/ptx/generated/cp_async_bulk_tensor_multicast.h @@ -21,7 +21,7 @@ __global__ void test_cp_async_bulk_tensor_multicast(void** fn_ptr) NV_HAS_FEATURE_SM_90a, ( // cp.async.bulk.tensor.1d.shared::cluster.global.tile.mbarrier::complete_tx::bytes.multicast::cluster [dstMem], - // [tensorMap, tensorCoords], [smem_bar], ctaMask; // 2a. + // [tensorMap, tensorCoords], [smem_bar], ctaMask; * fn_ptr++ = reinterpret_cast( static_cast(cuda::ptx::cp_async_bulk_tensor));)); + NV_IF_TARGET( + NV_HAS_FEATURE_SM_100a, + ( + // cp.async.bulk.tensor.1d.shared::cluster.global.tile.mbarrier::complete_tx::bytes.multicast::cluster [dstMem], + // [tensorMap, tensorCoords], [smem_bar], ctaMask; + * fn_ptr++ = reinterpret_cast( + static_cast(cuda::ptx::cp_async_bulk_tensor));)); + NV_IF_TARGET( + NV_HAS_FEATURE_SM_101a, + ( + // cp.async.bulk.tensor.1d.shared::cluster.global.tile.mbarrier::complete_tx::bytes.multicast::cluster [dstMem], + // [tensorMap, tensorCoords], [smem_bar], ctaMask; + * fn_ptr++ = reinterpret_cast( + static_cast(cuda::ptx::cp_async_bulk_tensor));)); +#endif // __cccl_ptx_isa >= 800 + +#if __cccl_ptx_isa >= 800 + NV_IF_TARGET( + NV_HAS_FEATURE_SM_100a, + ( + // cp.async.bulk.tensor.1d.shared::cluster.global.tile.mbarrier::complete_tx::bytes.multicast::cluster.cta_group::1 + // [dstMem], [tensorMap, tensorCoords], [smem_bar], ctaMask; + * fn_ptr++ = reinterpret_cast( + static_cast(cuda::ptx::cp_async_bulk_tensor)); + // cp.async.bulk.tensor.1d.shared::cluster.global.tile.mbarrier::complete_tx::bytes.multicast::cluster.cta_group::2 + // [dstMem], [tensorMap, tensorCoords], [smem_bar], ctaMask; + * fn_ptr++ = reinterpret_cast( + static_cast(cuda::ptx::cp_async_bulk_tensor));)); + NV_IF_TARGET( + NV_HAS_FEATURE_SM_101a, + ( + // cp.async.bulk.tensor.1d.shared::cluster.global.tile.mbarrier::complete_tx::bytes.multicast::cluster.cta_group::1 + // [dstMem], [tensorMap, tensorCoords], [smem_bar], ctaMask; + * fn_ptr++ = reinterpret_cast( + static_cast(cuda::ptx::cp_async_bulk_tensor)); + // cp.async.bulk.tensor.1d.shared::cluster.global.tile.mbarrier::complete_tx::bytes.multicast::cluster.cta_group::2 + // [dstMem], [tensorMap, tensorCoords], [smem_bar], ctaMask; + * fn_ptr++ = reinterpret_cast( + static_cast(cuda::ptx::cp_async_bulk_tensor));)); #endif // __cccl_ptx_isa >= 800 #if __cccl_ptx_isa >= 800 @@ -37,7 +116,20 @@ __global__ void test_cp_async_bulk_tensor_multicast(void** fn_ptr) NV_HAS_FEATURE_SM_90a, ( // 
cp.async.bulk.tensor.2d.shared::cluster.global.tile.mbarrier::complete_tx::bytes.multicast::cluster [dstMem], - // [tensorMap, tensorCoords], [smem_bar], ctaMask; // 2b. + // [tensorMap, tensorCoords], [smem_bar], ctaMask; + * fn_ptr++ = reinterpret_cast( + static_cast(cuda::ptx::cp_async_bulk_tensor));)); + NV_IF_TARGET( + NV_HAS_FEATURE_SM_100a, + ( + // cp.async.bulk.tensor.2d.shared::cluster.global.tile.mbarrier::complete_tx::bytes.multicast::cluster [dstMem], + // [tensorMap, tensorCoords], [smem_bar], ctaMask; * fn_ptr++ = reinterpret_cast( static_cast(cuda::ptx::cp_async_bulk_tensor));)); + NV_IF_TARGET( + NV_HAS_FEATURE_SM_101a, + ( + // cp.async.bulk.tensor.2d.shared::cluster.global.tile.mbarrier::complete_tx::bytes.multicast::cluster [dstMem], + // [tensorMap, tensorCoords], [smem_bar], ctaMask; + * fn_ptr++ = reinterpret_cast( + static_cast(cuda::ptx::cp_async_bulk_tensor));)); +#endif // __cccl_ptx_isa >= 800 + +#if __cccl_ptx_isa >= 800 + NV_IF_TARGET( + NV_HAS_FEATURE_SM_100a, + ( + // cp.async.bulk.tensor.2d.shared::cluster.global.tile.mbarrier::complete_tx::bytes.multicast::cluster.cta_group::1 + // [dstMem], [tensorMap, tensorCoords], [smem_bar], ctaMask; + * fn_ptr++ = reinterpret_cast( + static_cast(cuda::ptx::cp_async_bulk_tensor)); + // cp.async.bulk.tensor.2d.shared::cluster.global.tile.mbarrier::complete_tx::bytes.multicast::cluster.cta_group::2 + // [dstMem], [tensorMap, tensorCoords], [smem_bar], ctaMask; + * fn_ptr++ = reinterpret_cast( + static_cast(cuda::ptx::cp_async_bulk_tensor));)); + NV_IF_TARGET( + NV_HAS_FEATURE_SM_101a, + ( + // cp.async.bulk.tensor.2d.shared::cluster.global.tile.mbarrier::complete_tx::bytes.multicast::cluster.cta_group::1 + // [dstMem], [tensorMap, tensorCoords], [smem_bar], ctaMask; + * fn_ptr++ = reinterpret_cast( + static_cast(cuda::ptx::cp_async_bulk_tensor)); + // cp.async.bulk.tensor.2d.shared::cluster.global.tile.mbarrier::complete_tx::bytes.multicast::cluster.cta_group::2 + // [dstMem], [tensorMap, tensorCoords], [smem_bar], ctaMask; + * fn_ptr++ = reinterpret_cast( + static_cast(cuda::ptx::cp_async_bulk_tensor));)); #endif // __cccl_ptx_isa >= 800 #if __cccl_ptx_isa >= 800 @@ -53,7 +211,33 @@ __global__ void test_cp_async_bulk_tensor_multicast(void** fn_ptr) NV_HAS_FEATURE_SM_90a, ( // cp.async.bulk.tensor.3d.shared::cluster.global.tile.mbarrier::complete_tx::bytes.multicast::cluster [dstMem], - // [tensorMap, tensorCoords], [smem_bar], ctaMask; // 2c. 
+ // [tensorMap, tensorCoords], [smem_bar], ctaMask; + * fn_ptr++ = reinterpret_cast( + static_cast(cuda::ptx::cp_async_bulk_tensor));)); + NV_IF_TARGET( + NV_HAS_FEATURE_SM_100a, + ( + // cp.async.bulk.tensor.3d.shared::cluster.global.tile.mbarrier::complete_tx::bytes.multicast::cluster [dstMem], + // [tensorMap, tensorCoords], [smem_bar], ctaMask; + * fn_ptr++ = reinterpret_cast( + static_cast(cuda::ptx::cp_async_bulk_tensor));)); + NV_IF_TARGET( + NV_HAS_FEATURE_SM_101a, + ( + // cp.async.bulk.tensor.3d.shared::cluster.global.tile.mbarrier::complete_tx::bytes.multicast::cluster [dstMem], + // [tensorMap, tensorCoords], [smem_bar], ctaMask; * fn_ptr++ = reinterpret_cast( static_cast(cuda::ptx::cp_async_bulk_tensor));)); #endif // __cccl_ptx_isa >= 800 +#if __cccl_ptx_isa >= 800 + NV_IF_TARGET( + NV_HAS_FEATURE_SM_100a, + ( + // cp.async.bulk.tensor.3d.shared::cluster.global.tile.mbarrier::complete_tx::bytes.multicast::cluster.cta_group::1 + // [dstMem], [tensorMap, tensorCoords], [smem_bar], ctaMask; + * fn_ptr++ = reinterpret_cast( + static_cast(cuda::ptx::cp_async_bulk_tensor)); + // cp.async.bulk.tensor.3d.shared::cluster.global.tile.mbarrier::complete_tx::bytes.multicast::cluster.cta_group::2 + // [dstMem], [tensorMap, tensorCoords], [smem_bar], ctaMask; + * fn_ptr++ = reinterpret_cast( + static_cast(cuda::ptx::cp_async_bulk_tensor));)); + NV_IF_TARGET( + NV_HAS_FEATURE_SM_101a, + ( + // cp.async.bulk.tensor.3d.shared::cluster.global.tile.mbarrier::complete_tx::bytes.multicast::cluster.cta_group::1 + // [dstMem], [tensorMap, tensorCoords], [smem_bar], ctaMask; + * fn_ptr++ = reinterpret_cast( + static_cast(cuda::ptx::cp_async_bulk_tensor)); + // cp.async.bulk.tensor.3d.shared::cluster.global.tile.mbarrier::complete_tx::bytes.multicast::cluster.cta_group::2 + // [dstMem], [tensorMap, tensorCoords], [smem_bar], ctaMask; + * fn_ptr++ = reinterpret_cast( + static_cast(cuda::ptx::cp_async_bulk_tensor));)); +#endif // __cccl_ptx_isa >= 800 + #if __cccl_ptx_isa >= 800 NV_IF_TARGET( NV_HAS_FEATURE_SM_90a, ( // cp.async.bulk.tensor.4d.shared::cluster.global.tile.mbarrier::complete_tx::bytes.multicast::cluster [dstMem], - // [tensorMap, tensorCoords], [smem_bar], ctaMask; // 2d. 
+ // [tensorMap, tensorCoords], [smem_bar], ctaMask; + * fn_ptr++ = reinterpret_cast( + static_cast(cuda::ptx::cp_async_bulk_tensor));)); + NV_IF_TARGET( + NV_HAS_FEATURE_SM_100a, + ( + // cp.async.bulk.tensor.4d.shared::cluster.global.tile.mbarrier::complete_tx::bytes.multicast::cluster [dstMem], + // [tensorMap, tensorCoords], [smem_bar], ctaMask; + * fn_ptr++ = reinterpret_cast( + static_cast(cuda::ptx::cp_async_bulk_tensor));)); + NV_IF_TARGET( + NV_HAS_FEATURE_SM_101a, + ( + // cp.async.bulk.tensor.4d.shared::cluster.global.tile.mbarrier::complete_tx::bytes.multicast::cluster [dstMem], + // [tensorMap, tensorCoords], [smem_bar], ctaMask; * fn_ptr++ = reinterpret_cast( static_cast(cuda::ptx::cp_async_bulk_tensor));)); #endif // __cccl_ptx_isa >= 800 +#if __cccl_ptx_isa >= 800 + NV_IF_TARGET( + NV_HAS_FEATURE_SM_100a, + ( + // cp.async.bulk.tensor.4d.shared::cluster.global.tile.mbarrier::complete_tx::bytes.multicast::cluster.cta_group::1 + // [dstMem], [tensorMap, tensorCoords], [smem_bar], ctaMask; + * fn_ptr++ = reinterpret_cast( + static_cast(cuda::ptx::cp_async_bulk_tensor)); + // cp.async.bulk.tensor.4d.shared::cluster.global.tile.mbarrier::complete_tx::bytes.multicast::cluster.cta_group::2 + // [dstMem], [tensorMap, tensorCoords], [smem_bar], ctaMask; + * fn_ptr++ = reinterpret_cast( + static_cast(cuda::ptx::cp_async_bulk_tensor));)); + NV_IF_TARGET( + NV_HAS_FEATURE_SM_101a, + ( + // cp.async.bulk.tensor.4d.shared::cluster.global.tile.mbarrier::complete_tx::bytes.multicast::cluster.cta_group::1 + // [dstMem], [tensorMap, tensorCoords], [smem_bar], ctaMask; + * fn_ptr++ = reinterpret_cast( + static_cast(cuda::ptx::cp_async_bulk_tensor)); + // cp.async.bulk.tensor.4d.shared::cluster.global.tile.mbarrier::complete_tx::bytes.multicast::cluster.cta_group::2 + // [dstMem], [tensorMap, tensorCoords], [smem_bar], ctaMask; + * fn_ptr++ = reinterpret_cast( + static_cast(cuda::ptx::cp_async_bulk_tensor));)); +#endif // __cccl_ptx_isa >= 800 + #if __cccl_ptx_isa >= 800 NV_IF_TARGET( NV_HAS_FEATURE_SM_90a, ( // cp.async.bulk.tensor.5d.shared::cluster.global.tile.mbarrier::complete_tx::bytes.multicast::cluster [dstMem], - // [tensorMap, tensorCoords], [smem_bar], ctaMask; // 2e. 
+ // [tensorMap, tensorCoords], [smem_bar], ctaMask; * fn_ptr++ = reinterpret_cast( static_cast(cuda::ptx::cp_async_bulk_tensor));)); + NV_IF_TARGET( + NV_HAS_FEATURE_SM_100a, + ( + // cp.async.bulk.tensor.5d.shared::cluster.global.tile.mbarrier::complete_tx::bytes.multicast::cluster [dstMem], + // [tensorMap, tensorCoords], [smem_bar], ctaMask; + * fn_ptr++ = reinterpret_cast( + static_cast(cuda::ptx::cp_async_bulk_tensor));)); + NV_IF_TARGET( + NV_HAS_FEATURE_SM_101a, + ( + // cp.async.bulk.tensor.5d.shared::cluster.global.tile.mbarrier::complete_tx::bytes.multicast::cluster [dstMem], + // [tensorMap, tensorCoords], [smem_bar], ctaMask; + * fn_ptr++ = reinterpret_cast( + static_cast(cuda::ptx::cp_async_bulk_tensor));)); +#endif // __cccl_ptx_isa >= 800 + +#if __cccl_ptx_isa >= 800 + NV_IF_TARGET( + NV_HAS_FEATURE_SM_100a, + ( + // cp.async.bulk.tensor.5d.shared::cluster.global.tile.mbarrier::complete_tx::bytes.multicast::cluster.cta_group::1 + // [dstMem], [tensorMap, tensorCoords], [smem_bar], ctaMask; + * fn_ptr++ = reinterpret_cast( + static_cast(cuda::ptx::cp_async_bulk_tensor)); + // cp.async.bulk.tensor.5d.shared::cluster.global.tile.mbarrier::complete_tx::bytes.multicast::cluster.cta_group::2 + // [dstMem], [tensorMap, tensorCoords], [smem_bar], ctaMask; + * fn_ptr++ = reinterpret_cast( + static_cast(cuda::ptx::cp_async_bulk_tensor));)); + NV_IF_TARGET( + NV_HAS_FEATURE_SM_101a, + ( + // cp.async.bulk.tensor.5d.shared::cluster.global.tile.mbarrier::complete_tx::bytes.multicast::cluster.cta_group::1 + // [dstMem], [tensorMap, tensorCoords], [smem_bar], ctaMask; + * fn_ptr++ = reinterpret_cast( + static_cast(cuda::ptx::cp_async_bulk_tensor)); + // cp.async.bulk.tensor.5d.shared::cluster.global.tile.mbarrier::complete_tx::bytes.multicast::cluster.cta_group::2 + // [dstMem], [tensorMap, tensorCoords], [smem_bar], ctaMask; + * fn_ptr++ = reinterpret_cast( + static_cast(cuda::ptx::cp_async_bulk_tensor));)); #endif // __cccl_ptx_isa >= 800 } diff --git a/libcudacxx/test/libcudacxx/cuda/ptx/generated/cp_async_mbarrier_arrive.h b/libcudacxx/test/libcudacxx/cuda/ptx/generated/cp_async_mbarrier_arrive.h new file mode 100644 index 00000000000..663c07b4121 --- /dev/null +++ b/libcudacxx/test/libcudacxx/cuda/ptx/generated/cp_async_mbarrier_arrive.h @@ -0,0 +1,26 @@ +// This file was automatically generated. Do not edit. + +// We use a special strategy to force the generation of the PTX. This is mainly +// a fight against dead-code-elimination in the NVVM layer. +// +// The reason we need this strategy is because certain older versions of ptxas +// segfault when a non-sensical sequence of PTX is generated. So instead, we try +// to force the instantiation and compilation to PTX of all the overloads of the +// PTX wrapping functions. +// +// We do this by writing a function pointer of each overload to the kernel +// parameter `fn_ptr`. +// +// Because `fn_ptr` is possibly visible outside this translation unit, the +// compiler must compile all the functions which are stored. 
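For orientation, a possible call site for the wrapper exercised by the test kernel that follows is sketched here. It assumes the single-argument cp_async_mbarrier_arrive(uint64_t*) form shown in the PTX comment and the existing cuda::ptx::mbarrier_init wrapper; the cp.async copies themselves are elided.

// Hedged usage sketch (assumptions noted above): route completion of prior
// cp.async operations to an mbarrier instead of a commit group.
#include <cuda/ptx>
#include <cstdint>

__global__ void arrive_on_mbarrier()
{
  __shared__ std::uint64_t bar;
  if (threadIdx.x == 0)
  {
    cuda::ptx::mbarrier_init(&bar, blockDim.x); // expect one arrival per thread
  }
  __syncthreads();

  // ... cp.async copies would be issued here ...

  // cp.async.mbarrier.arrive.b64 [bar];
  cuda::ptx::cp_async_mbarrier_arrive(&bar);
}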
+ +__global__ void test_cp_async_mbarrier_arrive(void** fn_ptr) +{ +#if __cccl_ptx_isa >= 700 + NV_IF_TARGET( + NV_PROVIDES_SM_80, + ( + // cp.async.mbarrier.arrive.b64 [addr]; + * fn_ptr++ = reinterpret_cast(static_cast(cuda::ptx::cp_async_mbarrier_arrive));)); +#endif // __cccl_ptx_isa >= 700 +} diff --git a/libcudacxx/test/libcudacxx/cuda/ptx/generated/cp_async_mbarrier_arrive_noinc.h b/libcudacxx/test/libcudacxx/cuda/ptx/generated/cp_async_mbarrier_arrive_noinc.h new file mode 100644 index 00000000000..a089c727903 --- /dev/null +++ b/libcudacxx/test/libcudacxx/cuda/ptx/generated/cp_async_mbarrier_arrive_noinc.h @@ -0,0 +1,26 @@ +// This file was automatically generated. Do not edit. + +// We use a special strategy to force the generation of the PTX. This is mainly +// a fight against dead-code-elimination in the NVVM layer. +// +// The reason we need this strategy is because certain older versions of ptxas +// segfault when a non-sensical sequence of PTX is generated. So instead, we try +// to force the instantiation and compilation to PTX of all the overloads of the +// PTX wrapping functions. +// +// We do this by writing a function pointer of each overload to the kernel +// parameter `fn_ptr`. +// +// Because `fn_ptr` is possibly visible outside this translation unit, the +// compiler must compile all the functions which are stored. + +__global__ void test_cp_async_mbarrier_arrive_noinc(void** fn_ptr) +{ +#if __cccl_ptx_isa >= 700 + NV_IF_TARGET(NV_PROVIDES_SM_80, + ( + // cp.async.mbarrier.arrive.noinc.b64 [addr]; + * fn_ptr++ = reinterpret_cast( + static_cast(cuda::ptx::cp_async_mbarrier_arrive_noinc));)); +#endif // __cccl_ptx_isa >= 700 +} diff --git a/libcudacxx/test/libcudacxx/cuda/ptx/generated/elect_sync.h b/libcudacxx/test/libcudacxx/cuda/ptx/generated/elect_sync.h new file mode 100644 index 00000000000..298225881d1 --- /dev/null +++ b/libcudacxx/test/libcudacxx/cuda/ptx/generated/elect_sync.h @@ -0,0 +1,26 @@ +// This file was automatically generated. Do not edit. + +// We use a special strategy to force the generation of the PTX. This is mainly +// a fight against dead-code-elimination in the NVVM layer. +// +// The reason we need this strategy is because certain older versions of ptxas +// segfault when a non-sensical sequence of PTX is generated. So instead, we try +// to force the instantiation and compilation to PTX of all the overloads of the +// PTX wrapping functions. +// +// We do this by writing a function pointer of each overload to the kernel +// parameter `fn_ptr`. +// +// Because `fn_ptr` is possibly visible outside this translation unit, the +// compiler must compile all the functions which are stored. + +__global__ void test_elect_sync(void** fn_ptr) +{ +#if __cccl_ptx_isa >= 800 + NV_IF_TARGET( + NV_PROVIDES_SM_90, + ( + // elect.sync _|is_elected, membermask; + * fn_ptr++ = reinterpret_cast(static_cast(cuda::ptx::elect_sync));)); +#endif // __cccl_ptx_isa >= 800 +} diff --git a/libcudacxx/test/libcudacxx/cuda/ptx/generated/fence.h b/libcudacxx/test/libcudacxx/cuda/ptx/generated/fence.h index aecfcde5e01..0738677ed33 100644 --- a/libcudacxx/test/libcudacxx/cuda/ptx/generated/fence.h +++ b/libcudacxx/test/libcudacxx/cuda/ptx/generated/fence.h @@ -28,10 +28,24 @@ __global__ void test_fence(void** fn_ptr) static_cast(cuda::ptx::fence)); // fence.sc.sys; // 1. * fn_ptr++ = reinterpret_cast( - static_cast(cuda::ptx::fence)); - // fence.acq_rel.cta; // 1. 
- * fn_ptr++ = reinterpret_cast( - static_cast(cuda::ptx::fence)); + static_cast(cuda::ptx::fence));)); +#endif // __cccl_ptx_isa >= 600 + +#if __cccl_ptx_isa >= 780 + NV_IF_TARGET(NV_PROVIDES_SM_90, + ( + // fence.sc.cluster; // 2. + * fn_ptr++ = reinterpret_cast( + static_cast(cuda::ptx::fence));)); +#endif // __cccl_ptx_isa >= 780 + +#if __cccl_ptx_isa >= 600 + NV_IF_TARGET( + NV_PROVIDES_SM_70, + ( + // fence.acq_rel.cta; // 1. + * fn_ptr++ = reinterpret_cast( + static_cast(cuda::ptx::fence)); // fence.acq_rel.gpu; // 1. * fn_ptr++ = reinterpret_cast( static_cast(cuda::ptx::fence)); @@ -41,14 +55,46 @@ __global__ void test_fence(void** fn_ptr) #endif // __cccl_ptx_isa >= 600 #if __cccl_ptx_isa >= 780 + NV_IF_TARGET(NV_PROVIDES_SM_90, + ( + // fence.acq_rel.cluster; // 2. + * fn_ptr++ = reinterpret_cast( + static_cast(cuda::ptx::fence));)); +#endif // __cccl_ptx_isa >= 780 + +#if __cccl_ptx_isa >= 860 NV_IF_TARGET( NV_PROVIDES_SM_90, ( - // fence.sc.cluster; // 2. + // fence.acquire.cta; * fn_ptr++ = reinterpret_cast( - static_cast(cuda::ptx::fence)); - // fence.acq_rel.cluster; // 2. + static_cast(cuda::ptx::fence)); + // fence.acquire.cluster; * fn_ptr++ = reinterpret_cast( - static_cast(cuda::ptx::fence));)); -#endif // __cccl_ptx_isa >= 780 + static_cast(cuda::ptx::fence)); + // fence.acquire.gpu; + * fn_ptr++ = reinterpret_cast( + static_cast(cuda::ptx::fence)); + // fence.acquire.sys; + * fn_ptr++ = reinterpret_cast( + static_cast(cuda::ptx::fence));)); +#endif // __cccl_ptx_isa >= 860 + +#if __cccl_ptx_isa >= 860 + NV_IF_TARGET( + NV_PROVIDES_SM_90, + ( + // fence.release.cta; + * fn_ptr++ = reinterpret_cast( + static_cast(cuda::ptx::fence)); + // fence.release.cluster; + * fn_ptr++ = reinterpret_cast( + static_cast(cuda::ptx::fence)); + // fence.release.gpu; + * fn_ptr++ = reinterpret_cast( + static_cast(cuda::ptx::fence)); + // fence.release.sys; + * fn_ptr++ = reinterpret_cast( + static_cast(cuda::ptx::fence));)); +#endif // __cccl_ptx_isa >= 860 } diff --git a/libcudacxx/test/libcudacxx/cuda/ptx/generated/fence_proxy_async_generic_sync_restrict.h b/libcudacxx/test/libcudacxx/cuda/ptx/generated/fence_proxy_async_generic_sync_restrict.h new file mode 100644 index 00000000000..7af3a09ad2b --- /dev/null +++ b/libcudacxx/test/libcudacxx/cuda/ptx/generated/fence_proxy_async_generic_sync_restrict.h @@ -0,0 +1,38 @@ +// This file was automatically generated. Do not edit. + +// We use a special strategy to force the generation of the PTX. This is mainly +// a fight against dead-code-elimination in the NVVM layer. +// +// The reason we need this strategy is because certain older versions of ptxas +// segfault when a non-sensical sequence of PTX is generated. So instead, we try +// to force the instantiation and compilation to PTX of all the overloads of the +// PTX wrapping functions. +// +// We do this by writing a function pointer of each overload to the kernel +// parameter `fn_ptr`. +// +// Because `fn_ptr` is possibly visible outside this translation unit, the +// compiler must compile all the functions which are stored. 
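The fence.h hunk above adds plain acquire and release fences at CTA, cluster, GPU, and system scope. A possible call-site sketch is given here; it assumes the new overloads follow the same (sem, scope) tag-argument form as the existing cuda::ptx::fence overloads and targets SM_90 with PTX ISA 8.6.

// Hedged sketch of the new acquire/release fences (assumed tag form).
#include <cuda/ptx>

__device__ void release_then_acquire()
{
  // fence.release.gpu; -- order earlier writes before a subsequent publish
  cuda::ptx::fence(cuda::ptx::sem_release, cuda::ptx::scope_gpu);

  // ... publish and later observe a flag with relaxed atomics ...

  // fence.acquire.gpu; -- order later reads after the observation
  cuda::ptx::fence(cuda::ptx::sem_acquire, cuda::ptx::scope_gpu);
}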
+ +__global__ void test_fence_proxy_async_generic_sync_restrict(void** fn_ptr) +{ +#if __cccl_ptx_isa >= 860 + NV_IF_TARGET( + NV_PROVIDES_SM_90, + ( + // fence.proxy.async::generic.acquire.sync_restrict::shared::cluster.cluster; + * fn_ptr++ = reinterpret_cast( + static_cast( + cuda::ptx::fence_proxy_async_generic_sync_restrict));)); +#endif // __cccl_ptx_isa >= 860 + +#if __cccl_ptx_isa >= 860 + NV_IF_TARGET( + NV_PROVIDES_SM_90, + ( + // fence.proxy.async::generic.release.sync_restrict::shared::cta.cluster; + * fn_ptr++ = reinterpret_cast( + static_cast( + cuda::ptx::fence_proxy_async_generic_sync_restrict));)); +#endif // __cccl_ptx_isa >= 860 +} diff --git a/libcudacxx/test/libcudacxx/cuda/ptx/generated/fence_sync_restrict.h b/libcudacxx/test/libcudacxx/cuda/ptx/generated/fence_sync_restrict.h new file mode 100644 index 00000000000..c673d840428 --- /dev/null +++ b/libcudacxx/test/libcudacxx/cuda/ptx/generated/fence_sync_restrict.h @@ -0,0 +1,38 @@ +// This file was automatically generated. Do not edit. + +// We use a special strategy to force the generation of the PTX. This is mainly +// a fight against dead-code-elimination in the NVVM layer. +// +// The reason we need this strategy is because certain older versions of ptxas +// segfault when a non-sensical sequence of PTX is generated. So instead, we try +// to force the instantiation and compilation to PTX of all the overloads of the +// PTX wrapping functions. +// +// We do this by writing a function pointer of each overload to the kernel +// parameter `fn_ptr`. +// +// Because `fn_ptr` is possibly visible outside this translation unit, the +// compiler must compile all the functions which are stored. + +__global__ void test_fence_sync_restrict(void** fn_ptr) +{ +#if __cccl_ptx_isa >= 860 + NV_IF_TARGET( + NV_PROVIDES_SM_90, + ( + // fence.acquire.sync_restrict::shared::cluster.cluster; + * fn_ptr++ = reinterpret_cast( + static_cast( + cuda::ptx::fence_sync_restrict));)); +#endif // __cccl_ptx_isa >= 860 + +#if __cccl_ptx_isa >= 860 + NV_IF_TARGET( + NV_PROVIDES_SM_90, + ( + // fence.release.sync_restrict::shared::cta.cluster; + * fn_ptr++ = reinterpret_cast( + static_cast( + cuda::ptx::fence_sync_restrict));)); +#endif // __cccl_ptx_isa >= 860 +} diff --git a/libcudacxx/test/libcudacxx/cuda/ptx/generated/mapa.h b/libcudacxx/test/libcudacxx/cuda/ptx/generated/mapa.h new file mode 100644 index 00000000000..9160be1fe2d --- /dev/null +++ b/libcudacxx/test/libcudacxx/cuda/ptx/generated/mapa.h @@ -0,0 +1,27 @@ +// This file was automatically generated. Do not edit. + +// We use a special strategy to force the generation of the PTX. This is mainly +// a fight against dead-code-elimination in the NVVM layer. +// +// The reason we need this strategy is because certain older versions of ptxas +// segfault when a non-sensical sequence of PTX is generated. So instead, we try +// to force the instantiation and compilation to PTX of all the overloads of the +// PTX wrapping functions. +// +// We do this by writing a function pointer of each overload to the kernel +// parameter `fn_ptr`. +// +// Because `fn_ptr` is possibly visible outside this translation unit, the +// compiler must compile all the functions which are stored. 
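A possible use of the mapa wrapper tested in the kernel that follows is sketched here. It assumes the documented mapa(space_cluster, addr, target_cta) overload, which returns the given shared-memory address as seen from another CTA of the same cluster; cluster launch configuration and synchronization with the peer CTA are omitted.

// Hedged sketch: obtain a peer CTA's address of a shared-memory variable.
// The kernel must be launched with a cluster configuration.
#include <cuda/ptx>

__global__ void map_peer_address(unsigned** out)
{
  __shared__ unsigned value;
  if (threadIdx.x == 0)
  {
    value = 42u;
  }
  __syncthreads();

  // mapa.shared::cluster.u32 peer, &value, 0;
  unsigned* peer = cuda::ptx::mapa(cuda::ptx::space_cluster, &value, 0);
  *out = peer; // illustrative only
}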
+ +__global__ void test_mapa(void** fn_ptr) +{ +#if __cccl_ptx_isa >= 780 + NV_IF_TARGET( + NV_PROVIDES_SM_90, + ( + // mapa.shared::cluster.u32 dest, addr, target_cta; + * fn_ptr++ = reinterpret_cast( + static_cast(cuda::ptx::mapa));)); +#endif // __cccl_ptx_isa >= 780 +} diff --git a/libcudacxx/test/libcudacxx/cuda/ptx/generated/mbarrier_arrive.h b/libcudacxx/test/libcudacxx/cuda/ptx/generated/mbarrier_arrive.h index 3cddcb3b54c..d32773c118d 100644 --- a/libcudacxx/test/libcudacxx/cuda/ptx/generated/mbarrier_arrive.h +++ b/libcudacxx/test/libcudacxx/cuda/ptx/generated/mbarrier_arrive.h @@ -87,4 +87,60 @@ __global__ void test_mbarrier_arrive(void** fn_ptr) cuda::ptx::sem_release_t, cuda::ptx::scope_cluster_t, cuda::ptx::space_cluster_t, uint64_t*, const uint32_t&)>( cuda::ptx::mbarrier_arrive));)); #endif // __cccl_ptx_isa >= 800 + +#if __cccl_ptx_isa >= 860 + NV_IF_TARGET( + NV_PROVIDES_SM_90, + ( + // mbarrier.arrive.relaxed.cta.shared::cta.b64 state, [addr], count; + * fn_ptr++ = reinterpret_cast( + static_cast( + cuda::ptx::mbarrier_arrive)); + // mbarrier.arrive.relaxed.cluster.shared::cta.b64 state, [addr], count; + * fn_ptr++ = reinterpret_cast( + static_cast(cuda::ptx::mbarrier_arrive));)); +#endif // __cccl_ptx_isa >= 860 + +#if __cccl_ptx_isa >= 860 + NV_IF_TARGET( + NV_PROVIDES_SM_90, + ( + // mbarrier.arrive.relaxed.cta.shared::cta.b64 state, [addr]; + * fn_ptr++ = reinterpret_cast( + static_cast( + cuda::ptx::mbarrier_arrive)); + // mbarrier.arrive.relaxed.cluster.shared::cta.b64 state, [addr]; + * fn_ptr++ = reinterpret_cast( + static_cast( + cuda::ptx::mbarrier_arrive));)); +#endif // __cccl_ptx_isa >= 860 + +#if __cccl_ptx_isa >= 860 + NV_IF_TARGET( + NV_PROVIDES_SM_90, + ( + // mbarrier.arrive.relaxed.cluster.shared::cluster.b64 _, [addr], count; + * fn_ptr++ = reinterpret_cast( + static_cast( + cuda::ptx::mbarrier_arrive));)); +#endif // __cccl_ptx_isa >= 860 + +#if __cccl_ptx_isa >= 860 + NV_IF_TARGET( + NV_PROVIDES_SM_90, + ( + // mbarrier.arrive.relaxed.cluster.shared::cluster.b64 _, [addr]; + * fn_ptr++ = reinterpret_cast( + static_cast( + cuda::ptx::mbarrier_arrive));)); +#endif // __cccl_ptx_isa >= 860 } diff --git a/libcudacxx/test/libcudacxx/cuda/ptx/generated/mbarrier_arrive_expect_tx.h b/libcudacxx/test/libcudacxx/cuda/ptx/generated/mbarrier_arrive_expect_tx.h index a2ef4b619bb..8ef925662ac 100644 --- a/libcudacxx/test/libcudacxx/cuda/ptx/generated/mbarrier_arrive_expect_tx.h +++ b/libcudacxx/test/libcudacxx/cuda/ptx/generated/mbarrier_arrive_expect_tx.h @@ -44,4 +44,33 @@ __global__ void test_mbarrier_arrive_expect_tx(void** fn_ptr) cuda::ptx::sem_release_t, cuda::ptx::scope_cluster_t, cuda::ptx::space_cluster_t, uint64_t*, const uint32_t&)>( cuda::ptx::mbarrier_arrive_expect_tx));)); #endif // __cccl_ptx_isa >= 800 + +#if __cccl_ptx_isa >= 860 + NV_IF_TARGET( + NV_PROVIDES_SM_90, + ( + // mbarrier.arrive.expect_tx.relaxed.cta.shared::cta.b64 state, [addr], txCount; + * fn_ptr++ = reinterpret_cast( + static_cast( + cuda::ptx::mbarrier_arrive_expect_tx)); + // mbarrier.arrive.expect_tx.relaxed.cluster.shared::cta.b64 state, [addr], txCount; + * fn_ptr++ = reinterpret_cast( + static_cast(cuda::ptx::mbarrier_arrive_expect_tx));)); +#endif // __cccl_ptx_isa >= 860 + +#if __cccl_ptx_isa >= 860 + NV_IF_TARGET( + NV_PROVIDES_SM_90, + ( + // mbarrier.arrive.expect_tx.relaxed.cluster.shared::cluster.b64 _, [addr], txCount; + * fn_ptr++ = reinterpret_cast( + static_cast( + cuda::ptx::mbarrier_arrive_expect_tx));)); +#endif // __cccl_ptx_isa >= 860 } diff --git 
a/libcudacxx/test/libcudacxx/cuda/ptx/generated/mbarrier_expect_tx.h b/libcudacxx/test/libcudacxx/cuda/ptx/generated/mbarrier_expect_tx.h new file mode 100644 index 00000000000..8dd3b6a2037 --- /dev/null +++ b/libcudacxx/test/libcudacxx/cuda/ptx/generated/mbarrier_expect_tx.h @@ -0,0 +1,50 @@ +// This file was automatically generated. Do not edit. + +// We use a special strategy to force the generation of the PTX. This is mainly +// a fight against dead-code-elimination in the NVVM layer. +// +// The reason we need this strategy is because certain older versions of ptxas +// segfault when a non-sensical sequence of PTX is generated. So instead, we try +// to force the instantiation and compilation to PTX of all the overloads of the +// PTX wrapping functions. +// +// We do this by writing a function pointer of each overload to the kernel +// parameter `fn_ptr`. +// +// Because `fn_ptr` is possibly visible outside this translation unit, the +// compiler must compile all the functions which are stored. + +__global__ void test_mbarrier_expect_tx(void** fn_ptr) +{ +#if __cccl_ptx_isa >= 800 + NV_IF_TARGET( + NV_PROVIDES_SM_90, + ( + // mbarrier.expect_tx.relaxed.cta.shared::cta.b64 [addr], txCount; // 1. + * fn_ptr++ = reinterpret_cast( + static_cast( + cuda::ptx::mbarrier_expect_tx)); + // mbarrier.expect_tx.relaxed.cluster.shared::cta.b64 [addr], txCount; // 1. + * fn_ptr++ = reinterpret_cast( + static_cast( + cuda::ptx::mbarrier_expect_tx));)); +#endif // __cccl_ptx_isa >= 800 + +#if __cccl_ptx_isa >= 800 + NV_IF_TARGET( + NV_PROVIDES_SM_90, + ( + // mbarrier.expect_tx.relaxed.cta.shared::cluster.b64 [addr], txCount; // 2. + * fn_ptr++ = reinterpret_cast( + static_cast( + cuda::ptx::mbarrier_expect_tx)); + // mbarrier.expect_tx.relaxed.cluster.shared::cluster.b64 [addr], txCount; // 2. + * fn_ptr++ = reinterpret_cast( + static_cast( + cuda::ptx::mbarrier_expect_tx));)); +#endif // __cccl_ptx_isa >= 800 +} diff --git a/libcudacxx/test/libcudacxx/cuda/ptx/generated/mbarrier_test_wait.h b/libcudacxx/test/libcudacxx/cuda/ptx/generated/mbarrier_test_wait.h new file mode 100644 index 00000000000..c9c0d0d14fb --- /dev/null +++ b/libcudacxx/test/libcudacxx/cuda/ptx/generated/mbarrier_test_wait.h @@ -0,0 +1,55 @@ +// This file was automatically generated. Do not edit. + +// We use a special strategy to force the generation of the PTX. This is mainly +// a fight against dead-code-elimination in the NVVM layer. +// +// The reason we need this strategy is because certain older versions of ptxas +// segfault when a non-sensical sequence of PTX is generated. So instead, we try +// to force the instantiation and compilation to PTX of all the overloads of the +// PTX wrapping functions. +// +// We do this by writing a function pointer of each overload to the kernel +// parameter `fn_ptr`. +// +// Because `fn_ptr` is possibly visible outside this translation unit, the +// compiler must compile all the functions which are stored. + +__global__ void test_mbarrier_test_wait(void** fn_ptr) +{ +#if __cccl_ptx_isa >= 700 + NV_IF_TARGET(NV_PROVIDES_SM_80, + ( + // mbarrier.test_wait.shared.b64 waitComplete, [addr], state; // 1. + * fn_ptr++ = reinterpret_cast( + static_cast(cuda::ptx::mbarrier_test_wait));)); +#endif // __cccl_ptx_isa >= 700 + +#if __cccl_ptx_isa >= 800 + NV_IF_TARGET( + NV_PROVIDES_SM_90, + ( + // mbarrier.test_wait.acquire.cta.shared::cta.b64 waitComplete, [addr], state; // 2. 
+ * fn_ptr++ = reinterpret_cast( + static_cast( + cuda::ptx::mbarrier_test_wait)); + // mbarrier.test_wait.acquire.cluster.shared::cta.b64 waitComplete, [addr], state; // 2. + * fn_ptr++ = reinterpret_cast( + static_cast( + cuda::ptx::mbarrier_test_wait));)); +#endif // __cccl_ptx_isa >= 800 + +#if __cccl_ptx_isa >= 860 + NV_IF_TARGET( + NV_PROVIDES_SM_90, + ( + // mbarrier.test_wait.relaxed.cta.shared::cta.b64 waitComplete, [addr], state; + * fn_ptr++ = reinterpret_cast( + static_cast( + cuda::ptx::mbarrier_test_wait)); + // mbarrier.test_wait.relaxed.cluster.shared::cta.b64 waitComplete, [addr], state; + * fn_ptr++ = reinterpret_cast( + static_cast( + cuda::ptx::mbarrier_test_wait));)); +#endif // __cccl_ptx_isa >= 860 +} diff --git a/libcudacxx/test/libcudacxx/cuda/ptx/generated/mbarrier_test_wait_parity.h b/libcudacxx/test/libcudacxx/cuda/ptx/generated/mbarrier_test_wait_parity.h new file mode 100644 index 00000000000..f44c0554308 --- /dev/null +++ b/libcudacxx/test/libcudacxx/cuda/ptx/generated/mbarrier_test_wait_parity.h @@ -0,0 +1,55 @@ +// This file was automatically generated. Do not edit. + +// We use a special strategy to force the generation of the PTX. This is mainly +// a fight against dead-code-elimination in the NVVM layer. +// +// The reason we need this strategy is because certain older versions of ptxas +// segfault when a non-sensical sequence of PTX is generated. So instead, we try +// to force the instantiation and compilation to PTX of all the overloads of the +// PTX wrapping functions. +// +// We do this by writing a function pointer of each overload to the kernel +// parameter `fn_ptr`. +// +// Because `fn_ptr` is possibly visible outside this translation unit, the +// compiler must compile all the functions which are stored. + +__global__ void test_mbarrier_test_wait_parity(void** fn_ptr) +{ +#if __cccl_ptx_isa >= 710 + NV_IF_TARGET(NV_PROVIDES_SM_80, + ( + // mbarrier.test_wait.parity.shared.b64 waitComplete, [addr], phaseParity; // 3. + * fn_ptr++ = reinterpret_cast( + static_cast(cuda::ptx::mbarrier_test_wait_parity));)); +#endif // __cccl_ptx_isa >= 710 + +#if __cccl_ptx_isa >= 800 + NV_IF_TARGET( + NV_PROVIDES_SM_90, + ( + // mbarrier.test_wait.parity.acquire.cta.shared::cta.b64 waitComplete, [addr], phaseParity; // 4. + * fn_ptr++ = reinterpret_cast( + static_cast( + cuda::ptx::mbarrier_test_wait_parity)); + // mbarrier.test_wait.parity.acquire.cluster.shared::cta.b64 waitComplete, [addr], phaseParity; // 4. 
+ * fn_ptr++ = reinterpret_cast( + static_cast( + cuda::ptx::mbarrier_test_wait_parity));)); +#endif // __cccl_ptx_isa >= 800 + +#if __cccl_ptx_isa >= 860 + NV_IF_TARGET( + NV_PROVIDES_SM_90, + ( + // mbarrier.test_wait.parity.relaxed.cta.shared::cta.b64 waitComplete, [addr], phaseParity; + * fn_ptr++ = reinterpret_cast( + static_cast( + cuda::ptx::mbarrier_test_wait_parity)); + // mbarrier.test_wait.parity.relaxed.cluster.shared::cta.b64 waitComplete, [addr], phaseParity; + * fn_ptr++ = reinterpret_cast( + static_cast( + cuda::ptx::mbarrier_test_wait_parity));)); +#endif // __cccl_ptx_isa >= 860 +} diff --git a/libcudacxx/test/libcudacxx/cuda/ptx/generated/mbarrier_try_wait.h b/libcudacxx/test/libcudacxx/cuda/ptx/generated/mbarrier_try_wait.h index 00166f8172c..1a1b347751c 100644 --- a/libcudacxx/test/libcudacxx/cuda/ptx/generated/mbarrier_try_wait.h +++ b/libcudacxx/test/libcudacxx/cuda/ptx/generated/mbarrier_try_wait.h @@ -66,4 +66,35 @@ __global__ void test_mbarrier_try_wait(void** fn_ptr) cuda::ptx::sem_acquire_t, cuda::ptx::scope_cluster_t, uint64_t*, const uint64_t&, const uint32_t&)>( cuda::ptx::mbarrier_try_wait));)); #endif // __cccl_ptx_isa >= 800 + +#if __cccl_ptx_isa >= 860 + NV_IF_TARGET( + NV_PROVIDES_SM_90, + ( + // mbarrier.try_wait.relaxed.cta.shared::cta.b64 waitComplete, [addr], state, suspendTimeHint; + * fn_ptr++ = reinterpret_cast( + static_cast( + cuda::ptx::mbarrier_try_wait)); + // mbarrier.try_wait.relaxed.cluster.shared::cta.b64 waitComplete, [addr], state, suspendTimeHint; + * fn_ptr++ = reinterpret_cast( + static_cast( + cuda::ptx::mbarrier_try_wait));)); +#endif // __cccl_ptx_isa >= 860 + +#if __cccl_ptx_isa >= 860 + NV_IF_TARGET( + NV_PROVIDES_SM_90, + ( + // mbarrier.try_wait.relaxed.cta.shared::cta.b64 waitComplete, [addr], state; + * fn_ptr++ = reinterpret_cast( + static_cast( + cuda::ptx::mbarrier_try_wait)); + // mbarrier.try_wait.relaxed.cluster.shared::cta.b64 waitComplete, [addr], state; + * fn_ptr++ = reinterpret_cast( + static_cast( + cuda::ptx::mbarrier_try_wait));)); +#endif // __cccl_ptx_isa >= 860 } diff --git a/libcudacxx/test/libcudacxx/cuda/ptx/generated/mbarrier_try_wait_parity.h b/libcudacxx/test/libcudacxx/cuda/ptx/generated/mbarrier_try_wait_parity.h index 8aa588fbab0..4a5ef3e926f 100644 --- a/libcudacxx/test/libcudacxx/cuda/ptx/generated/mbarrier_try_wait_parity.h +++ b/libcudacxx/test/libcudacxx/cuda/ptx/generated/mbarrier_try_wait_parity.h @@ -65,4 +65,36 @@ __global__ void test_mbarrier_try_wait_parity(void** fn_ptr) cuda::ptx::sem_acquire_t, cuda::ptx::scope_cluster_t, uint64_t*, const uint32_t&, const uint32_t&)>( cuda::ptx::mbarrier_try_wait_parity));)); #endif // __cccl_ptx_isa >= 800 + +#if __cccl_ptx_isa >= 860 + NV_IF_TARGET( + NV_PROVIDES_SM_90, + ( + // mbarrier.try_wait.parity.relaxed.cta.shared::cta.b64 waitComplete, [addr], phaseParity, suspendTimeHint; + * fn_ptr++ = reinterpret_cast( + static_cast( + cuda::ptx::mbarrier_try_wait_parity)); + // mbarrier.try_wait.parity.relaxed.cluster.shared::cta.b64 waitComplete, [addr], phaseParity, + // suspendTimeHint; + * fn_ptr++ = reinterpret_cast( + static_cast( + cuda::ptx::mbarrier_try_wait_parity));)); +#endif // __cccl_ptx_isa >= 860 + +#if __cccl_ptx_isa >= 860 + NV_IF_TARGET( + NV_PROVIDES_SM_90, + ( + // mbarrier.try_wait.parity.relaxed.cta.shared::cta.b64 waitComplete, [addr], phaseParity; + * fn_ptr++ = reinterpret_cast( + static_cast( + cuda::ptx::mbarrier_try_wait_parity)); + // mbarrier.try_wait.parity.relaxed.cluster.shared::cta.b64 waitComplete, [addr], phaseParity; 
+ * fn_ptr++ = reinterpret_cast( + static_cast( + cuda::ptx::mbarrier_try_wait_parity));)); +#endif // __cccl_ptx_isa >= 860 } diff --git a/libcudacxx/test/libcudacxx/cuda/ptx/generated/mbarrier_wait.h b/libcudacxx/test/libcudacxx/cuda/ptx/generated/mbarrier_wait.h deleted file mode 100644 index 80129e5016c..00000000000 --- a/libcudacxx/test/libcudacxx/cuda/ptx/generated/mbarrier_wait.h +++ /dev/null @@ -1,24 +0,0 @@ -__global__ void test_mbarrier_test_wait(void** fn_ptr) -{ -#if __cccl_ptx_isa >= 700 - NV_IF_TARGET(NV_PROVIDES_SM_80, - ( - // mbarrier.test_wait.shared.b64 waitComplete, [addr], state; // 1. - * fn_ptr++ = reinterpret_cast( - static_cast(cuda::ptx::mbarrier_test_wait));)); -#endif // __cccl_ptx_isa >= 700 - -#if __cccl_ptx_isa >= 800 - NV_IF_TARGET( - NV_PROVIDES_SM_90, - ( - // mbarrier.test_wait.acquire.cta.shared::cta.b64 waitComplete, [addr], state; // 2. - * fn_ptr++ = reinterpret_cast( - static_cast( - cuda::ptx::mbarrier_test_wait)); - // mbarrier.test_wait.acquire.cluster.shared::cta.b64 waitComplete, [addr], state; // 2. - * fn_ptr++ = reinterpret_cast( - static_cast( - cuda::ptx::mbarrier_test_wait));)); -#endif // __cccl_ptx_isa >= 800 -} diff --git a/libcudacxx/test/libcudacxx/cuda/ptx/generated/mbarrier_wait_parity.h b/libcudacxx/test/libcudacxx/cuda/ptx/generated/mbarrier_wait_parity.h deleted file mode 100644 index 30902c58905..00000000000 --- a/libcudacxx/test/libcudacxx/cuda/ptx/generated/mbarrier_wait_parity.h +++ /dev/null @@ -1,24 +0,0 @@ -__global__ void test_mbarrier_test_wait_parity(void** fn_ptr) -{ -#if __cccl_ptx_isa >= 710 - NV_IF_TARGET(NV_PROVIDES_SM_80, - ( - // mbarrier.test_wait.parity.shared.b64 waitComplete, [addr], phaseParity; // 3. - * fn_ptr++ = reinterpret_cast( - static_cast(cuda::ptx::mbarrier_test_wait_parity));)); -#endif // __cccl_ptx_isa >= 710 - -#if __cccl_ptx_isa >= 800 - NV_IF_TARGET( - NV_PROVIDES_SM_90, - ( - // mbarrier.test_wait.parity.acquire.cta.shared::cta.b64 waitComplete, [addr], phaseParity; // 4. - * fn_ptr++ = reinterpret_cast( - static_cast( - cuda::ptx::mbarrier_test_wait_parity)); - // mbarrier.test_wait.parity.acquire.cluster.shared::cta.b64 waitComplete, [addr], phaseParity; // 4. - * fn_ptr++ = reinterpret_cast( - static_cast( - cuda::ptx::mbarrier_test_wait_parity));)); -#endif // __cccl_ptx_isa >= 800 -} diff --git a/libcudacxx/test/libcudacxx/cuda/ptx/generated/multimem_ld_reduce.h b/libcudacxx/test/libcudacxx/cuda/ptx/generated/multimem_ld_reduce.h new file mode 100644 index 00000000000..c0259451a1b --- /dev/null +++ b/libcudacxx/test/libcudacxx/cuda/ptx/generated/multimem_ld_reduce.h @@ -0,0 +1,1020 @@ +// This file was automatically generated. Do not edit. + +// We use a special strategy to force the generation of the PTX. This is mainly +// a fight against dead-code-elimination in the NVVM layer. +// +// The reason we need this strategy is because certain older versions of ptxas +// segfault when a non-sensical sequence of PTX is generated. So instead, we try +// to force the instantiation and compilation to PTX of all the overloads of the +// PTX wrapping functions. +// +// We do this by writing a function pointer of each overload to the kernel +// parameter `fn_ptr`. +// +// Because `fn_ptr` is possibly visible outside this translation unit, the +// compiler must compile all the functions which are stored. 
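One way the wrappers tested in the kernel that follows might be called is sketched here. The argument order (sem, then scope where applicable, then op, then pointer) is inferred from the PTX comments in this file and from the tag conventions used elsewhere in cuda::ptx; mc_ptr stands for a multicast (multimem) address the application has already set up. Treat the exact overload shapes as assumptions.

// Hedged sketch of multimem.ld_reduce through the cuda::ptx wrappers
// (assumed argument order; mc_ptr must be a multimem address).
#include <cuda/ptx>
#include <cstdint>

__device__ std::uint32_t reduce_min_weak(const std::uint32_t* mc_ptr)
{
  // multimem.ld_reduce.weak.global.min.u32 dest, [mc_ptr];
  return cuda::ptx::multimem_ld_reduce(cuda::ptx::sem_weak, cuda::ptx::op_min, mc_ptr);
}

__device__ std::uint32_t reduce_min_acquire_cta(const std::uint32_t* mc_ptr)
{
  // multimem.ld_reduce.acquire.cta.global.min.u32 dest, [mc_ptr];
  return cuda::ptx::multimem_ld_reduce(
    cuda::ptx::sem_acquire, cuda::ptx::scope_cta, cuda::ptx::op_min, mc_ptr);
}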
+ +__global__ void test_multimem_ld_reduce(void** fn_ptr) +{ +#if __cccl_ptx_isa >= 810 + NV_IF_TARGET( + NV_PROVIDES_SM_90, + ( + // multimem.ld_reduce.weak.global.min.u32 dest, [addr]; + * fn_ptr++ = reinterpret_cast( + static_cast( + cuda::ptx::multimem_ld_reduce));)); +#endif // __cccl_ptx_isa >= 810 + +#if __cccl_ptx_isa >= 810 + NV_IF_TARGET( + NV_PROVIDES_SM_90, + ( + // multimem.ld_reduce.relaxed.cta.global.min.u32 dest, [addr]; + * fn_ptr++ = reinterpret_cast( + static_cast( + cuda::ptx::multimem_ld_reduce)); + // multimem.ld_reduce.relaxed.cluster.global.min.u32 dest, [addr]; + * fn_ptr++ = reinterpret_cast( + static_cast( + cuda::ptx::multimem_ld_reduce)); + // multimem.ld_reduce.relaxed.gpu.global.min.u32 dest, [addr]; + * fn_ptr++ = reinterpret_cast( + static_cast( + cuda::ptx::multimem_ld_reduce)); + // multimem.ld_reduce.relaxed.sys.global.min.u32 dest, [addr]; + * fn_ptr++ = reinterpret_cast( + static_cast( + cuda::ptx::multimem_ld_reduce)); + // multimem.ld_reduce.acquire.cta.global.min.u32 dest, [addr]; + * fn_ptr++ = reinterpret_cast( + static_cast( + cuda::ptx::multimem_ld_reduce)); + // multimem.ld_reduce.acquire.cluster.global.min.u32 dest, [addr]; + * fn_ptr++ = reinterpret_cast( + static_cast( + cuda::ptx::multimem_ld_reduce)); + // multimem.ld_reduce.acquire.gpu.global.min.u32 dest, [addr]; + * fn_ptr++ = reinterpret_cast( + static_cast( + cuda::ptx::multimem_ld_reduce)); + // multimem.ld_reduce.acquire.sys.global.min.u32 dest, [addr]; + * fn_ptr++ = reinterpret_cast( + static_cast( + cuda::ptx::multimem_ld_reduce));)); +#endif // __cccl_ptx_isa >= 810 + +#if __cccl_ptx_isa >= 810 + NV_IF_TARGET( + NV_PROVIDES_SM_90, + ( + // multimem.ld_reduce.weak.global.min.u64 dest, [addr]; + * fn_ptr++ = reinterpret_cast( + static_cast( + cuda::ptx::multimem_ld_reduce));)); +#endif // __cccl_ptx_isa >= 810 + +#if __cccl_ptx_isa >= 810 + NV_IF_TARGET( + NV_PROVIDES_SM_90, + ( + // multimem.ld_reduce.relaxed.cta.global.min.u64 dest, [addr]; + * fn_ptr++ = reinterpret_cast( + static_cast( + cuda::ptx::multimem_ld_reduce)); + // multimem.ld_reduce.relaxed.cluster.global.min.u64 dest, [addr]; + * fn_ptr++ = reinterpret_cast( + static_cast( + cuda::ptx::multimem_ld_reduce)); + // multimem.ld_reduce.relaxed.gpu.global.min.u64 dest, [addr]; + * fn_ptr++ = reinterpret_cast( + static_cast( + cuda::ptx::multimem_ld_reduce)); + // multimem.ld_reduce.relaxed.sys.global.min.u64 dest, [addr]; + * fn_ptr++ = reinterpret_cast( + static_cast( + cuda::ptx::multimem_ld_reduce)); + // multimem.ld_reduce.acquire.cta.global.min.u64 dest, [addr]; + * fn_ptr++ = reinterpret_cast( + static_cast( + cuda::ptx::multimem_ld_reduce)); + // multimem.ld_reduce.acquire.cluster.global.min.u64 dest, [addr]; + * fn_ptr++ = reinterpret_cast( + static_cast( + cuda::ptx::multimem_ld_reduce)); + // multimem.ld_reduce.acquire.gpu.global.min.u64 dest, [addr]; + * fn_ptr++ = reinterpret_cast( + static_cast( + cuda::ptx::multimem_ld_reduce)); + // multimem.ld_reduce.acquire.sys.global.min.u64 dest, [addr]; + * fn_ptr++ = reinterpret_cast( + static_cast( + cuda::ptx::multimem_ld_reduce));)); +#endif // __cccl_ptx_isa >= 810 + +#if __cccl_ptx_isa >= 810 + NV_IF_TARGET( + NV_PROVIDES_SM_90, + ( + // multimem.ld_reduce.weak.global.min.s32 dest, [addr]; + * fn_ptr++ = + reinterpret_cast(static_cast( + cuda::ptx::multimem_ld_reduce));)); +#endif // __cccl_ptx_isa >= 810 + +#if __cccl_ptx_isa >= 810 + NV_IF_TARGET( + NV_PROVIDES_SM_90, + ( + // multimem.ld_reduce.relaxed.cta.global.min.s32 dest, [addr]; + * fn_ptr++ = 
reinterpret_cast( + static_cast( + cuda::ptx::multimem_ld_reduce)); + // multimem.ld_reduce.relaxed.cluster.global.min.s32 dest, [addr]; + * fn_ptr++ = reinterpret_cast( + static_cast( + cuda::ptx::multimem_ld_reduce)); + // multimem.ld_reduce.relaxed.gpu.global.min.s32 dest, [addr]; + * fn_ptr++ = reinterpret_cast( + static_cast( + cuda::ptx::multimem_ld_reduce)); + // multimem.ld_reduce.relaxed.sys.global.min.s32 dest, [addr]; + * fn_ptr++ = reinterpret_cast( + static_cast( + cuda::ptx::multimem_ld_reduce)); + // multimem.ld_reduce.acquire.cta.global.min.s32 dest, [addr]; + * fn_ptr++ = reinterpret_cast( + static_cast( + cuda::ptx::multimem_ld_reduce)); + // multimem.ld_reduce.acquire.cluster.global.min.s32 dest, [addr]; + * fn_ptr++ = reinterpret_cast( + static_cast( + cuda::ptx::multimem_ld_reduce)); + // multimem.ld_reduce.acquire.gpu.global.min.s32 dest, [addr]; + * fn_ptr++ = reinterpret_cast( + static_cast( + cuda::ptx::multimem_ld_reduce)); + // multimem.ld_reduce.acquire.sys.global.min.s32 dest, [addr]; + * fn_ptr++ = reinterpret_cast( + static_cast( + cuda::ptx::multimem_ld_reduce));)); +#endif // __cccl_ptx_isa >= 810 + +#if __cccl_ptx_isa >= 810 + NV_IF_TARGET( + NV_PROVIDES_SM_90, + ( + // multimem.ld_reduce.weak.global.min.s64 dest, [addr]; + * fn_ptr++ = + reinterpret_cast(static_cast( + cuda::ptx::multimem_ld_reduce));)); +#endif // __cccl_ptx_isa >= 810 + +#if __cccl_ptx_isa >= 810 + NV_IF_TARGET( + NV_PROVIDES_SM_90, + ( + // multimem.ld_reduce.relaxed.cta.global.min.s64 dest, [addr]; + * fn_ptr++ = reinterpret_cast( + static_cast( + cuda::ptx::multimem_ld_reduce)); + // multimem.ld_reduce.relaxed.cluster.global.min.s64 dest, [addr]; + * fn_ptr++ = reinterpret_cast( + static_cast( + cuda::ptx::multimem_ld_reduce)); + // multimem.ld_reduce.relaxed.gpu.global.min.s64 dest, [addr]; + * fn_ptr++ = reinterpret_cast( + static_cast( + cuda::ptx::multimem_ld_reduce)); + // multimem.ld_reduce.relaxed.sys.global.min.s64 dest, [addr]; + * fn_ptr++ = reinterpret_cast( + static_cast( + cuda::ptx::multimem_ld_reduce)); + // multimem.ld_reduce.acquire.cta.global.min.s64 dest, [addr]; + * fn_ptr++ = reinterpret_cast( + static_cast( + cuda::ptx::multimem_ld_reduce)); + // multimem.ld_reduce.acquire.cluster.global.min.s64 dest, [addr]; + * fn_ptr++ = reinterpret_cast( + static_cast( + cuda::ptx::multimem_ld_reduce)); + // multimem.ld_reduce.acquire.gpu.global.min.s64 dest, [addr]; + * fn_ptr++ = reinterpret_cast( + static_cast( + cuda::ptx::multimem_ld_reduce)); + // multimem.ld_reduce.acquire.sys.global.min.s64 dest, [addr]; + * fn_ptr++ = reinterpret_cast( + static_cast( + cuda::ptx::multimem_ld_reduce));)); +#endif // __cccl_ptx_isa >= 810 + +#if __cccl_ptx_isa >= 810 + NV_IF_TARGET( + NV_PROVIDES_SM_90, + ( + // multimem.ld_reduce.weak.global.max.u32 dest, [addr]; + * fn_ptr++ = reinterpret_cast( + static_cast( + cuda::ptx::multimem_ld_reduce));)); +#endif // __cccl_ptx_isa >= 810 + +#if __cccl_ptx_isa >= 810 + NV_IF_TARGET( + NV_PROVIDES_SM_90, + ( + // multimem.ld_reduce.relaxed.cta.global.max.u32 dest, [addr]; + * fn_ptr++ = reinterpret_cast( + static_cast( + cuda::ptx::multimem_ld_reduce)); + // multimem.ld_reduce.relaxed.cluster.global.max.u32 dest, [addr]; + * fn_ptr++ = reinterpret_cast( + static_cast( + cuda::ptx::multimem_ld_reduce)); + // multimem.ld_reduce.relaxed.gpu.global.max.u32 dest, [addr]; + * fn_ptr++ = reinterpret_cast( + static_cast( + cuda::ptx::multimem_ld_reduce)); + // multimem.ld_reduce.relaxed.sys.global.max.u32 dest, [addr]; + * fn_ptr++ = reinterpret_cast( 
+ static_cast( + cuda::ptx::multimem_ld_reduce)); + // multimem.ld_reduce.acquire.cta.global.max.u32 dest, [addr]; + * fn_ptr++ = reinterpret_cast( + static_cast( + cuda::ptx::multimem_ld_reduce)); + // multimem.ld_reduce.acquire.cluster.global.max.u32 dest, [addr]; + * fn_ptr++ = reinterpret_cast( + static_cast( + cuda::ptx::multimem_ld_reduce)); + // multimem.ld_reduce.acquire.gpu.global.max.u32 dest, [addr]; + * fn_ptr++ = reinterpret_cast( + static_cast( + cuda::ptx::multimem_ld_reduce)); + // multimem.ld_reduce.acquire.sys.global.max.u32 dest, [addr]; + * fn_ptr++ = reinterpret_cast( + static_cast( + cuda::ptx::multimem_ld_reduce));)); +#endif // __cccl_ptx_isa >= 810 + +#if __cccl_ptx_isa >= 810 + NV_IF_TARGET( + NV_PROVIDES_SM_90, + ( + // multimem.ld_reduce.weak.global.max.u64 dest, [addr]; + * fn_ptr++ = reinterpret_cast( + static_cast( + cuda::ptx::multimem_ld_reduce));)); +#endif // __cccl_ptx_isa >= 810 + +#if __cccl_ptx_isa >= 810 + NV_IF_TARGET( + NV_PROVIDES_SM_90, + ( + // multimem.ld_reduce.relaxed.cta.global.max.u64 dest, [addr]; + * fn_ptr++ = reinterpret_cast( + static_cast( + cuda::ptx::multimem_ld_reduce)); + // multimem.ld_reduce.relaxed.cluster.global.max.u64 dest, [addr]; + * fn_ptr++ = reinterpret_cast( + static_cast( + cuda::ptx::multimem_ld_reduce)); + // multimem.ld_reduce.relaxed.gpu.global.max.u64 dest, [addr]; + * fn_ptr++ = reinterpret_cast( + static_cast( + cuda::ptx::multimem_ld_reduce)); + // multimem.ld_reduce.relaxed.sys.global.max.u64 dest, [addr]; + * fn_ptr++ = reinterpret_cast( + static_cast( + cuda::ptx::multimem_ld_reduce)); + // multimem.ld_reduce.acquire.cta.global.max.u64 dest, [addr]; + * fn_ptr++ = reinterpret_cast( + static_cast( + cuda::ptx::multimem_ld_reduce)); + // multimem.ld_reduce.acquire.cluster.global.max.u64 dest, [addr]; + * fn_ptr++ = reinterpret_cast( + static_cast( + cuda::ptx::multimem_ld_reduce)); + // multimem.ld_reduce.acquire.gpu.global.max.u64 dest, [addr]; + * fn_ptr++ = reinterpret_cast( + static_cast( + cuda::ptx::multimem_ld_reduce)); + // multimem.ld_reduce.acquire.sys.global.max.u64 dest, [addr]; + * fn_ptr++ = reinterpret_cast( + static_cast( + cuda::ptx::multimem_ld_reduce));)); +#endif // __cccl_ptx_isa >= 810 + +#if __cccl_ptx_isa >= 810 + NV_IF_TARGET( + NV_PROVIDES_SM_90, + ( + // multimem.ld_reduce.weak.global.max.s32 dest, [addr]; + * fn_ptr++ = + reinterpret_cast(static_cast( + cuda::ptx::multimem_ld_reduce));)); +#endif // __cccl_ptx_isa >= 810 + +#if __cccl_ptx_isa >= 810 + NV_IF_TARGET( + NV_PROVIDES_SM_90, + ( + // multimem.ld_reduce.relaxed.cta.global.max.s32 dest, [addr]; + * fn_ptr++ = reinterpret_cast( + static_cast( + cuda::ptx::multimem_ld_reduce)); + // multimem.ld_reduce.relaxed.cluster.global.max.s32 dest, [addr]; + * fn_ptr++ = reinterpret_cast( + static_cast( + cuda::ptx::multimem_ld_reduce)); + // multimem.ld_reduce.relaxed.gpu.global.max.s32 dest, [addr]; + * fn_ptr++ = reinterpret_cast( + static_cast( + cuda::ptx::multimem_ld_reduce)); + // multimem.ld_reduce.relaxed.sys.global.max.s32 dest, [addr]; + * fn_ptr++ = reinterpret_cast( + static_cast( + cuda::ptx::multimem_ld_reduce)); + // multimem.ld_reduce.acquire.cta.global.max.s32 dest, [addr]; + * fn_ptr++ = reinterpret_cast( + static_cast( + cuda::ptx::multimem_ld_reduce)); + // multimem.ld_reduce.acquire.cluster.global.max.s32 dest, [addr]; + * fn_ptr++ = reinterpret_cast( + static_cast( + cuda::ptx::multimem_ld_reduce)); + // multimem.ld_reduce.acquire.gpu.global.max.s32 dest, [addr]; + * fn_ptr++ = reinterpret_cast( + static_cast( + 
cuda::ptx::multimem_ld_reduce)); + // multimem.ld_reduce.acquire.sys.global.max.s32 dest, [addr]; + * fn_ptr++ = reinterpret_cast( + static_cast( + cuda::ptx::multimem_ld_reduce));)); +#endif // __cccl_ptx_isa >= 810 + +#if __cccl_ptx_isa >= 810 + NV_IF_TARGET( + NV_PROVIDES_SM_90, + ( + // multimem.ld_reduce.weak.global.max.s64 dest, [addr]; + * fn_ptr++ = + reinterpret_cast(static_cast( + cuda::ptx::multimem_ld_reduce));)); +#endif // __cccl_ptx_isa >= 810 + +#if __cccl_ptx_isa >= 810 + NV_IF_TARGET( + NV_PROVIDES_SM_90, + ( + // multimem.ld_reduce.relaxed.cta.global.max.s64 dest, [addr]; + * fn_ptr++ = reinterpret_cast( + static_cast( + cuda::ptx::multimem_ld_reduce)); + // multimem.ld_reduce.relaxed.cluster.global.max.s64 dest, [addr]; + * fn_ptr++ = reinterpret_cast( + static_cast( + cuda::ptx::multimem_ld_reduce)); + // multimem.ld_reduce.relaxed.gpu.global.max.s64 dest, [addr]; + * fn_ptr++ = reinterpret_cast( + static_cast( + cuda::ptx::multimem_ld_reduce)); + // multimem.ld_reduce.relaxed.sys.global.max.s64 dest, [addr]; + * fn_ptr++ = reinterpret_cast( + static_cast( + cuda::ptx::multimem_ld_reduce)); + // multimem.ld_reduce.acquire.cta.global.max.s64 dest, [addr]; + * fn_ptr++ = reinterpret_cast( + static_cast( + cuda::ptx::multimem_ld_reduce)); + // multimem.ld_reduce.acquire.cluster.global.max.s64 dest, [addr]; + * fn_ptr++ = reinterpret_cast( + static_cast( + cuda::ptx::multimem_ld_reduce)); + // multimem.ld_reduce.acquire.gpu.global.max.s64 dest, [addr]; + * fn_ptr++ = reinterpret_cast( + static_cast( + cuda::ptx::multimem_ld_reduce)); + // multimem.ld_reduce.acquire.sys.global.max.s64 dest, [addr]; + * fn_ptr++ = reinterpret_cast( + static_cast( + cuda::ptx::multimem_ld_reduce));)); +#endif // __cccl_ptx_isa >= 810 + +#if __cccl_ptx_isa >= 810 + NV_IF_TARGET( + NV_PROVIDES_SM_90, + ( + // multimem.ld_reduce.weak.global.add.u32 dest, [addr]; + * fn_ptr++ = reinterpret_cast( + static_cast( + cuda::ptx::multimem_ld_reduce));)); +#endif // __cccl_ptx_isa >= 810 + +#if __cccl_ptx_isa >= 810 + NV_IF_TARGET( + NV_PROVIDES_SM_90, + ( + // multimem.ld_reduce.relaxed.cta.global.add.u32 dest, [addr]; + * fn_ptr++ = reinterpret_cast( + static_cast( + cuda::ptx::multimem_ld_reduce)); + // multimem.ld_reduce.relaxed.cluster.global.add.u32 dest, [addr]; + * fn_ptr++ = reinterpret_cast( + static_cast( + cuda::ptx::multimem_ld_reduce)); + // multimem.ld_reduce.relaxed.gpu.global.add.u32 dest, [addr]; + * fn_ptr++ = reinterpret_cast( + static_cast( + cuda::ptx::multimem_ld_reduce)); + // multimem.ld_reduce.relaxed.sys.global.add.u32 dest, [addr]; + * fn_ptr++ = reinterpret_cast( + static_cast( + cuda::ptx::multimem_ld_reduce)); + // multimem.ld_reduce.acquire.cta.global.add.u32 dest, [addr]; + * fn_ptr++ = reinterpret_cast( + static_cast( + cuda::ptx::multimem_ld_reduce)); + // multimem.ld_reduce.acquire.cluster.global.add.u32 dest, [addr]; + * fn_ptr++ = reinterpret_cast( + static_cast( + cuda::ptx::multimem_ld_reduce)); + // multimem.ld_reduce.acquire.gpu.global.add.u32 dest, [addr]; + * fn_ptr++ = reinterpret_cast( + static_cast( + cuda::ptx::multimem_ld_reduce)); + // multimem.ld_reduce.acquire.sys.global.add.u32 dest, [addr]; + * fn_ptr++ = reinterpret_cast( + static_cast( + cuda::ptx::multimem_ld_reduce));)); +#endif // __cccl_ptx_isa >= 810 + +#if __cccl_ptx_isa >= 810 + NV_IF_TARGET( + NV_PROVIDES_SM_90, + ( + // multimem.ld_reduce.weak.global.add.u64 dest, [addr]; + * fn_ptr++ = reinterpret_cast( + static_cast( + cuda::ptx::multimem_ld_reduce));)); +#endif // __cccl_ptx_isa >= 810 + 
+#if __cccl_ptx_isa >= 810 + NV_IF_TARGET( + NV_PROVIDES_SM_90, + ( + // multimem.ld_reduce.relaxed.cta.global.add.u64 dest, [addr]; + * fn_ptr++ = reinterpret_cast( + static_cast( + cuda::ptx::multimem_ld_reduce)); + // multimem.ld_reduce.relaxed.cluster.global.add.u64 dest, [addr]; + * fn_ptr++ = reinterpret_cast( + static_cast( + cuda::ptx::multimem_ld_reduce)); + // multimem.ld_reduce.relaxed.gpu.global.add.u64 dest, [addr]; + * fn_ptr++ = reinterpret_cast( + static_cast( + cuda::ptx::multimem_ld_reduce)); + // multimem.ld_reduce.relaxed.sys.global.add.u64 dest, [addr]; + * fn_ptr++ = reinterpret_cast( + static_cast( + cuda::ptx::multimem_ld_reduce)); + // multimem.ld_reduce.acquire.cta.global.add.u64 dest, [addr]; + * fn_ptr++ = reinterpret_cast( + static_cast( + cuda::ptx::multimem_ld_reduce)); + // multimem.ld_reduce.acquire.cluster.global.add.u64 dest, [addr]; + * fn_ptr++ = reinterpret_cast( + static_cast( + cuda::ptx::multimem_ld_reduce)); + // multimem.ld_reduce.acquire.gpu.global.add.u64 dest, [addr]; + * fn_ptr++ = reinterpret_cast( + static_cast( + cuda::ptx::multimem_ld_reduce)); + // multimem.ld_reduce.acquire.sys.global.add.u64 dest, [addr]; + * fn_ptr++ = reinterpret_cast( + static_cast( + cuda::ptx::multimem_ld_reduce));)); +#endif // __cccl_ptx_isa >= 810 + +#if __cccl_ptx_isa >= 810 + NV_IF_TARGET( + NV_PROVIDES_SM_90, + ( + // multimem.ld_reduce.weak.global.add.s32 dest, [addr]; + * fn_ptr++ = + reinterpret_cast(static_cast( + cuda::ptx::multimem_ld_reduce));)); +#endif // __cccl_ptx_isa >= 810 + +#if __cccl_ptx_isa >= 810 + NV_IF_TARGET( + NV_PROVIDES_SM_90, + ( + // multimem.ld_reduce.relaxed.cta.global.add.s32 dest, [addr]; + * fn_ptr++ = reinterpret_cast( + static_cast( + cuda::ptx::multimem_ld_reduce)); + // multimem.ld_reduce.relaxed.cluster.global.add.s32 dest, [addr]; + * fn_ptr++ = reinterpret_cast( + static_cast( + cuda::ptx::multimem_ld_reduce)); + // multimem.ld_reduce.relaxed.gpu.global.add.s32 dest, [addr]; + * fn_ptr++ = reinterpret_cast( + static_cast( + cuda::ptx::multimem_ld_reduce)); + // multimem.ld_reduce.relaxed.sys.global.add.s32 dest, [addr]; + * fn_ptr++ = reinterpret_cast( + static_cast( + cuda::ptx::multimem_ld_reduce)); + // multimem.ld_reduce.acquire.cta.global.add.s32 dest, [addr]; + * fn_ptr++ = reinterpret_cast( + static_cast( + cuda::ptx::multimem_ld_reduce)); + // multimem.ld_reduce.acquire.cluster.global.add.s32 dest, [addr]; + * fn_ptr++ = reinterpret_cast( + static_cast( + cuda::ptx::multimem_ld_reduce)); + // multimem.ld_reduce.acquire.gpu.global.add.s32 dest, [addr]; + * fn_ptr++ = reinterpret_cast( + static_cast( + cuda::ptx::multimem_ld_reduce)); + // multimem.ld_reduce.acquire.sys.global.add.s32 dest, [addr]; + * fn_ptr++ = reinterpret_cast( + static_cast( + cuda::ptx::multimem_ld_reduce));)); +#endif // __cccl_ptx_isa >= 810 + +#if __cccl_ptx_isa >= 810 + NV_IF_TARGET( + NV_PROVIDES_SM_90, + ( + // multimem.ld_reduce.weak.global.add.u64 dest, [addr]; + * fn_ptr++ = + reinterpret_cast(static_cast( + cuda::ptx::multimem_ld_reduce));)); +#endif // __cccl_ptx_isa >= 810 + +#if __cccl_ptx_isa >= 810 + NV_IF_TARGET( + NV_PROVIDES_SM_90, + ( + // multimem.ld_reduce.relaxed.cta.global.add.u64 dest, [addr]; + * fn_ptr++ = reinterpret_cast( + static_cast( + cuda::ptx::multimem_ld_reduce)); + // multimem.ld_reduce.relaxed.cluster.global.add.u64 dest, [addr]; + * fn_ptr++ = reinterpret_cast( + static_cast( + cuda::ptx::multimem_ld_reduce)); + // multimem.ld_reduce.relaxed.gpu.global.add.u64 dest, [addr]; + * fn_ptr++ = reinterpret_cast( 
+ static_cast( + cuda::ptx::multimem_ld_reduce)); + // multimem.ld_reduce.relaxed.sys.global.add.u64 dest, [addr]; + * fn_ptr++ = reinterpret_cast( + static_cast( + cuda::ptx::multimem_ld_reduce)); + // multimem.ld_reduce.acquire.cta.global.add.u64 dest, [addr]; + * fn_ptr++ = reinterpret_cast( + static_cast( + cuda::ptx::multimem_ld_reduce)); + // multimem.ld_reduce.acquire.cluster.global.add.u64 dest, [addr]; + * fn_ptr++ = reinterpret_cast( + static_cast( + cuda::ptx::multimem_ld_reduce)); + // multimem.ld_reduce.acquire.gpu.global.add.u64 dest, [addr]; + * fn_ptr++ = reinterpret_cast( + static_cast( + cuda::ptx::multimem_ld_reduce)); + // multimem.ld_reduce.acquire.sys.global.add.u64 dest, [addr]; + * fn_ptr++ = reinterpret_cast( + static_cast( + cuda::ptx::multimem_ld_reduce));)); +#endif // __cccl_ptx_isa >= 810 + +#if __cccl_ptx_isa >= 810 + NV_IF_TARGET( + NV_PROVIDES_SM_90, + ( + // multimem.ld_reduce.weak.global.and.b32 dest, [addr]; + * fn_ptr++ = reinterpret_cast( + static_cast( + cuda::ptx::multimem_ld_reduce));)); +#endif // __cccl_ptx_isa >= 810 + +#if __cccl_ptx_isa >= 810 + NV_IF_TARGET( + NV_PROVIDES_SM_90, + ( + // multimem.ld_reduce.relaxed.cta.global.and.b32 dest, [addr]; + * fn_ptr++ = reinterpret_cast( + static_cast( + cuda::ptx::multimem_ld_reduce)); + // multimem.ld_reduce.relaxed.cluster.global.and.b32 dest, [addr]; + * fn_ptr++ = reinterpret_cast( + static_cast( + cuda::ptx::multimem_ld_reduce)); + // multimem.ld_reduce.relaxed.gpu.global.and.b32 dest, [addr]; + * fn_ptr++ = reinterpret_cast( + static_cast( + cuda::ptx::multimem_ld_reduce)); + // multimem.ld_reduce.relaxed.sys.global.and.b32 dest, [addr]; + * fn_ptr++ = reinterpret_cast( + static_cast( + cuda::ptx::multimem_ld_reduce)); + // multimem.ld_reduce.acquire.cta.global.and.b32 dest, [addr]; + * fn_ptr++ = reinterpret_cast( + static_cast( + cuda::ptx::multimem_ld_reduce)); + // multimem.ld_reduce.acquire.cluster.global.and.b32 dest, [addr]; + * fn_ptr++ = reinterpret_cast( + static_cast( + cuda::ptx::multimem_ld_reduce)); + // multimem.ld_reduce.acquire.gpu.global.and.b32 dest, [addr]; + * fn_ptr++ = reinterpret_cast( + static_cast( + cuda::ptx::multimem_ld_reduce)); + // multimem.ld_reduce.acquire.sys.global.and.b32 dest, [addr]; + * fn_ptr++ = reinterpret_cast( + static_cast( + cuda::ptx::multimem_ld_reduce));)); +#endif // __cccl_ptx_isa >= 810 + +#if __cccl_ptx_isa >= 810 + NV_IF_TARGET( + NV_PROVIDES_SM_90, + ( + // multimem.ld_reduce.weak.global.or.b32 dest, [addr]; + * fn_ptr++ = reinterpret_cast( + static_cast( + cuda::ptx::multimem_ld_reduce));)); +#endif // __cccl_ptx_isa >= 810 + +#if __cccl_ptx_isa >= 810 + NV_IF_TARGET( + NV_PROVIDES_SM_90, + ( + // multimem.ld_reduce.relaxed.cta.global.or.b32 dest, [addr]; + * fn_ptr++ = reinterpret_cast( + static_cast( + cuda::ptx::multimem_ld_reduce)); + // multimem.ld_reduce.relaxed.cluster.global.or.b32 dest, [addr]; + * fn_ptr++ = reinterpret_cast( + static_cast( + cuda::ptx::multimem_ld_reduce)); + // multimem.ld_reduce.relaxed.gpu.global.or.b32 dest, [addr]; + * fn_ptr++ = reinterpret_cast( + static_cast( + cuda::ptx::multimem_ld_reduce)); + // multimem.ld_reduce.relaxed.sys.global.or.b32 dest, [addr]; + * fn_ptr++ = reinterpret_cast( + static_cast( + cuda::ptx::multimem_ld_reduce)); + // multimem.ld_reduce.acquire.cta.global.or.b32 dest, [addr]; + * fn_ptr++ = reinterpret_cast( + static_cast( + cuda::ptx::multimem_ld_reduce)); + // multimem.ld_reduce.acquire.cluster.global.or.b32 dest, [addr]; + * fn_ptr++ = reinterpret_cast( + static_cast( + 
cuda::ptx::multimem_ld_reduce)); + // multimem.ld_reduce.acquire.gpu.global.or.b32 dest, [addr]; + * fn_ptr++ = reinterpret_cast( + static_cast( + cuda::ptx::multimem_ld_reduce)); + // multimem.ld_reduce.acquire.sys.global.or.b32 dest, [addr]; + * fn_ptr++ = reinterpret_cast( + static_cast( + cuda::ptx::multimem_ld_reduce));)); +#endif // __cccl_ptx_isa >= 810 + +#if __cccl_ptx_isa >= 810 + NV_IF_TARGET( + NV_PROVIDES_SM_90, + ( + // multimem.ld_reduce.weak.global.xor.b32 dest, [addr]; + * fn_ptr++ = reinterpret_cast( + static_cast( + cuda::ptx::multimem_ld_reduce));)); +#endif // __cccl_ptx_isa >= 810 + +#if __cccl_ptx_isa >= 810 + NV_IF_TARGET( + NV_PROVIDES_SM_90, + ( + // multimem.ld_reduce.relaxed.cta.global.xor.b32 dest, [addr]; + * fn_ptr++ = reinterpret_cast( + static_cast( + cuda::ptx::multimem_ld_reduce)); + // multimem.ld_reduce.relaxed.cluster.global.xor.b32 dest, [addr]; + * fn_ptr++ = reinterpret_cast( + static_cast( + cuda::ptx::multimem_ld_reduce)); + // multimem.ld_reduce.relaxed.gpu.global.xor.b32 dest, [addr]; + * fn_ptr++ = reinterpret_cast( + static_cast( + cuda::ptx::multimem_ld_reduce)); + // multimem.ld_reduce.relaxed.sys.global.xor.b32 dest, [addr]; + * fn_ptr++ = reinterpret_cast( + static_cast( + cuda::ptx::multimem_ld_reduce)); + // multimem.ld_reduce.acquire.cta.global.xor.b32 dest, [addr]; + * fn_ptr++ = reinterpret_cast( + static_cast( + cuda::ptx::multimem_ld_reduce)); + // multimem.ld_reduce.acquire.cluster.global.xor.b32 dest, [addr]; + * fn_ptr++ = reinterpret_cast( + static_cast( + cuda::ptx::multimem_ld_reduce)); + // multimem.ld_reduce.acquire.gpu.global.xor.b32 dest, [addr]; + * fn_ptr++ = reinterpret_cast( + static_cast( + cuda::ptx::multimem_ld_reduce)); + // multimem.ld_reduce.acquire.sys.global.xor.b32 dest, [addr]; + * fn_ptr++ = reinterpret_cast( + static_cast( + cuda::ptx::multimem_ld_reduce));)); +#endif // __cccl_ptx_isa >= 810 + +#if __cccl_ptx_isa >= 810 + NV_IF_TARGET( + NV_PROVIDES_SM_90, + ( + // multimem.ld_reduce.weak.global.and.b64 dest, [addr]; + * fn_ptr++ = reinterpret_cast( + static_cast( + cuda::ptx::multimem_ld_reduce));)); +#endif // __cccl_ptx_isa >= 810 + +#if __cccl_ptx_isa >= 810 + NV_IF_TARGET( + NV_PROVIDES_SM_90, + ( + // multimem.ld_reduce.relaxed.cta.global.and.b64 dest, [addr]; + * fn_ptr++ = reinterpret_cast( + static_cast( + cuda::ptx::multimem_ld_reduce)); + // multimem.ld_reduce.relaxed.cluster.global.and.b64 dest, [addr]; + * fn_ptr++ = reinterpret_cast( + static_cast( + cuda::ptx::multimem_ld_reduce)); + // multimem.ld_reduce.relaxed.gpu.global.and.b64 dest, [addr]; + * fn_ptr++ = reinterpret_cast( + static_cast( + cuda::ptx::multimem_ld_reduce)); + // multimem.ld_reduce.relaxed.sys.global.and.b64 dest, [addr]; + * fn_ptr++ = reinterpret_cast( + static_cast( + cuda::ptx::multimem_ld_reduce)); + // multimem.ld_reduce.acquire.cta.global.and.b64 dest, [addr]; + * fn_ptr++ = reinterpret_cast( + static_cast( + cuda::ptx::multimem_ld_reduce)); + // multimem.ld_reduce.acquire.cluster.global.and.b64 dest, [addr]; + * fn_ptr++ = reinterpret_cast( + static_cast( + cuda::ptx::multimem_ld_reduce)); + // multimem.ld_reduce.acquire.gpu.global.and.b64 dest, [addr]; + * fn_ptr++ = reinterpret_cast( + static_cast( + cuda::ptx::multimem_ld_reduce)); + // multimem.ld_reduce.acquire.sys.global.and.b64 dest, [addr]; + * fn_ptr++ = reinterpret_cast( + static_cast( + cuda::ptx::multimem_ld_reduce));)); +#endif // __cccl_ptx_isa >= 810 + +#if __cccl_ptx_isa >= 810 + NV_IF_TARGET( + NV_PROVIDES_SM_90, + ( + // 
multimem.ld_reduce.weak.global.or.b64 dest, [addr]; + * fn_ptr++ = reinterpret_cast( + static_cast( + cuda::ptx::multimem_ld_reduce));)); +#endif // __cccl_ptx_isa >= 810 + +#if __cccl_ptx_isa >= 810 + NV_IF_TARGET( + NV_PROVIDES_SM_90, + ( + // multimem.ld_reduce.relaxed.cta.global.or.b64 dest, [addr]; + * fn_ptr++ = reinterpret_cast( + static_cast( + cuda::ptx::multimem_ld_reduce)); + // multimem.ld_reduce.relaxed.cluster.global.or.b64 dest, [addr]; + * fn_ptr++ = reinterpret_cast( + static_cast( + cuda::ptx::multimem_ld_reduce)); + // multimem.ld_reduce.relaxed.gpu.global.or.b64 dest, [addr]; + * fn_ptr++ = reinterpret_cast( + static_cast( + cuda::ptx::multimem_ld_reduce)); + // multimem.ld_reduce.relaxed.sys.global.or.b64 dest, [addr]; + * fn_ptr++ = reinterpret_cast( + static_cast( + cuda::ptx::multimem_ld_reduce)); + // multimem.ld_reduce.acquire.cta.global.or.b64 dest, [addr]; + * fn_ptr++ = reinterpret_cast( + static_cast( + cuda::ptx::multimem_ld_reduce)); + // multimem.ld_reduce.acquire.cluster.global.or.b64 dest, [addr]; + * fn_ptr++ = reinterpret_cast( + static_cast( + cuda::ptx::multimem_ld_reduce)); + // multimem.ld_reduce.acquire.gpu.global.or.b64 dest, [addr]; + * fn_ptr++ = reinterpret_cast( + static_cast( + cuda::ptx::multimem_ld_reduce)); + // multimem.ld_reduce.acquire.sys.global.or.b64 dest, [addr]; + * fn_ptr++ = reinterpret_cast( + static_cast( + cuda::ptx::multimem_ld_reduce));)); +#endif // __cccl_ptx_isa >= 810 + +#if __cccl_ptx_isa >= 810 + NV_IF_TARGET( + NV_PROVIDES_SM_90, + ( + // multimem.ld_reduce.weak.global.xor.b64 dest, [addr]; + * fn_ptr++ = reinterpret_cast( + static_cast( + cuda::ptx::multimem_ld_reduce));)); +#endif // __cccl_ptx_isa >= 810 + +#if __cccl_ptx_isa >= 810 + NV_IF_TARGET( + NV_PROVIDES_SM_90, + ( + // multimem.ld_reduce.relaxed.cta.global.xor.b64 dest, [addr]; + * fn_ptr++ = reinterpret_cast( + static_cast( + cuda::ptx::multimem_ld_reduce)); + // multimem.ld_reduce.relaxed.cluster.global.xor.b64 dest, [addr]; + * fn_ptr++ = reinterpret_cast( + static_cast( + cuda::ptx::multimem_ld_reduce)); + // multimem.ld_reduce.relaxed.gpu.global.xor.b64 dest, [addr]; + * fn_ptr++ = reinterpret_cast( + static_cast( + cuda::ptx::multimem_ld_reduce)); + // multimem.ld_reduce.relaxed.sys.global.xor.b64 dest, [addr]; + * fn_ptr++ = reinterpret_cast( + static_cast( + cuda::ptx::multimem_ld_reduce)); + // multimem.ld_reduce.acquire.cta.global.xor.b64 dest, [addr]; + * fn_ptr++ = reinterpret_cast( + static_cast( + cuda::ptx::multimem_ld_reduce)); + // multimem.ld_reduce.acquire.cluster.global.xor.b64 dest, [addr]; + * fn_ptr++ = reinterpret_cast( + static_cast( + cuda::ptx::multimem_ld_reduce)); + // multimem.ld_reduce.acquire.gpu.global.xor.b64 dest, [addr]; + * fn_ptr++ = reinterpret_cast( + static_cast( + cuda::ptx::multimem_ld_reduce)); + // multimem.ld_reduce.acquire.sys.global.xor.b64 dest, [addr]; + * fn_ptr++ = reinterpret_cast( + static_cast( + cuda::ptx::multimem_ld_reduce));)); +#endif // __cccl_ptx_isa >= 810 +} diff --git a/libcudacxx/test/libcudacxx/cuda/ptx/generated/multimem_red.h b/libcudacxx/test/libcudacxx/cuda/ptx/generated/multimem_red.h new file mode 100644 index 00000000000..dd0011e3fb2 --- /dev/null +++ b/libcudacxx/test/libcudacxx/cuda/ptx/generated/multimem_red.h @@ -0,0 +1,840 @@ +// This file was automatically generated. Do not edit. + +// We use a special strategy to force the generation of the PTX. This is mainly +// a fight against dead-code-elimination in the NVVM layer. 
+// +// The reason we need this strategy is because certain older versions of ptxas +// segfault when a non-sensical sequence of PTX is generated. So instead, we try +// to force the instantiation and compilation to PTX of all the overloads of the +// PTX wrapping functions. +// +// We do this by writing a function pointer of each overload to the kernel +// parameter `fn_ptr`. +// +// Because `fn_ptr` is possibly visible outside this translation unit, the +// compiler must compile all the functions which are stored. + +__global__ void test_multimem_red(void** fn_ptr) +{ +#if __cccl_ptx_isa >= 810 + NV_IF_TARGET( + NV_PROVIDES_SM_90, + ( + // multimem.red.relaxed.cta.global.min.u32 [addr], val; + * fn_ptr++ = reinterpret_cast( + static_cast( + cuda::ptx::multimem_red)); + // multimem.red.relaxed.cluster.global.min.u32 [addr], val; + * fn_ptr++ = reinterpret_cast( + static_cast( + cuda::ptx::multimem_red)); + // multimem.red.relaxed.gpu.global.min.u32 [addr], val; + * fn_ptr++ = reinterpret_cast( + static_cast( + cuda::ptx::multimem_red)); + // multimem.red.relaxed.sys.global.min.u32 [addr], val; + * fn_ptr++ = reinterpret_cast( + static_cast( + cuda::ptx::multimem_red)); + // multimem.red.release.cta.global.min.u32 [addr], val; + * fn_ptr++ = reinterpret_cast( + static_cast( + cuda::ptx::multimem_red)); + // multimem.red.release.cluster.global.min.u32 [addr], val; + * fn_ptr++ = reinterpret_cast( + static_cast( + cuda::ptx::multimem_red)); + // multimem.red.release.gpu.global.min.u32 [addr], val; + * fn_ptr++ = reinterpret_cast( + static_cast( + cuda::ptx::multimem_red)); + // multimem.red.release.sys.global.min.u32 [addr], val; + * fn_ptr++ = reinterpret_cast( + static_cast( + cuda::ptx::multimem_red));)); +#endif // __cccl_ptx_isa >= 810 + +#if __cccl_ptx_isa >= 810 + NV_IF_TARGET( + NV_PROVIDES_SM_90, + ( + // multimem.red.relaxed.cta.global.min.u64 [addr], val; + * fn_ptr++ = reinterpret_cast( + static_cast( + cuda::ptx::multimem_red)); + // multimem.red.relaxed.cluster.global.min.u64 [addr], val; + * fn_ptr++ = reinterpret_cast( + static_cast( + cuda::ptx::multimem_red)); + // multimem.red.relaxed.gpu.global.min.u64 [addr], val; + * fn_ptr++ = reinterpret_cast( + static_cast( + cuda::ptx::multimem_red)); + // multimem.red.relaxed.sys.global.min.u64 [addr], val; + * fn_ptr++ = reinterpret_cast( + static_cast( + cuda::ptx::multimem_red)); + // multimem.red.release.cta.global.min.u64 [addr], val; + * fn_ptr++ = reinterpret_cast( + static_cast( + cuda::ptx::multimem_red)); + // multimem.red.release.cluster.global.min.u64 [addr], val; + * fn_ptr++ = reinterpret_cast( + static_cast( + cuda::ptx::multimem_red)); + // multimem.red.release.gpu.global.min.u64 [addr], val; + * fn_ptr++ = reinterpret_cast( + static_cast( + cuda::ptx::multimem_red)); + // multimem.red.release.sys.global.min.u64 [addr], val; + * fn_ptr++ = reinterpret_cast( + static_cast( + cuda::ptx::multimem_red));)); +#endif // __cccl_ptx_isa >= 810 + +#if __cccl_ptx_isa >= 810 + NV_IF_TARGET( + NV_PROVIDES_SM_90, + ( + // multimem.red.relaxed.cta.global.min.s32 [addr], val; + * fn_ptr++ = reinterpret_cast( + static_cast( + cuda::ptx::multimem_red)); + // multimem.red.relaxed.cluster.global.min.s32 [addr], val; + * fn_ptr++ = reinterpret_cast( + static_cast( + cuda::ptx::multimem_red)); + // multimem.red.relaxed.gpu.global.min.s32 [addr], val; + * fn_ptr++ = reinterpret_cast( + static_cast( + cuda::ptx::multimem_red)); + // multimem.red.relaxed.sys.global.min.s32 [addr], val; + * fn_ptr++ = reinterpret_cast( + static_cast( + 
cuda::ptx::multimem_red)); + // multimem.red.release.cta.global.min.s32 [addr], val; + * fn_ptr++ = reinterpret_cast( + static_cast( + cuda::ptx::multimem_red)); + // multimem.red.release.cluster.global.min.s32 [addr], val; + * fn_ptr++ = reinterpret_cast( + static_cast( + cuda::ptx::multimem_red)); + // multimem.red.release.gpu.global.min.s32 [addr], val; + * fn_ptr++ = reinterpret_cast( + static_cast( + cuda::ptx::multimem_red)); + // multimem.red.release.sys.global.min.s32 [addr], val; + * fn_ptr++ = reinterpret_cast( + static_cast( + cuda::ptx::multimem_red));)); +#endif // __cccl_ptx_isa >= 810 + +#if __cccl_ptx_isa >= 810 + NV_IF_TARGET( + NV_PROVIDES_SM_90, + ( + // multimem.red.relaxed.cta.global.min.s64 [addr], val; + * fn_ptr++ = reinterpret_cast( + static_cast( + cuda::ptx::multimem_red)); + // multimem.red.relaxed.cluster.global.min.s64 [addr], val; + * fn_ptr++ = reinterpret_cast( + static_cast( + cuda::ptx::multimem_red)); + // multimem.red.relaxed.gpu.global.min.s64 [addr], val; + * fn_ptr++ = reinterpret_cast( + static_cast( + cuda::ptx::multimem_red)); + // multimem.red.relaxed.sys.global.min.s64 [addr], val; + * fn_ptr++ = reinterpret_cast( + static_cast( + cuda::ptx::multimem_red)); + // multimem.red.release.cta.global.min.s64 [addr], val; + * fn_ptr++ = reinterpret_cast( + static_cast( + cuda::ptx::multimem_red)); + // multimem.red.release.cluster.global.min.s64 [addr], val; + * fn_ptr++ = reinterpret_cast( + static_cast( + cuda::ptx::multimem_red)); + // multimem.red.release.gpu.global.min.s64 [addr], val; + * fn_ptr++ = reinterpret_cast( + static_cast( + cuda::ptx::multimem_red)); + // multimem.red.release.sys.global.min.s64 [addr], val; + * fn_ptr++ = reinterpret_cast( + static_cast( + cuda::ptx::multimem_red));)); +#endif // __cccl_ptx_isa >= 810 + +#if __cccl_ptx_isa >= 810 + NV_IF_TARGET( + NV_PROVIDES_SM_90, + ( + // multimem.red.relaxed.cta.global.max.u32 [addr], val; + * fn_ptr++ = reinterpret_cast( + static_cast( + cuda::ptx::multimem_red)); + // multimem.red.relaxed.cluster.global.max.u32 [addr], val; + * fn_ptr++ = reinterpret_cast( + static_cast( + cuda::ptx::multimem_red)); + // multimem.red.relaxed.gpu.global.max.u32 [addr], val; + * fn_ptr++ = reinterpret_cast( + static_cast( + cuda::ptx::multimem_red)); + // multimem.red.relaxed.sys.global.max.u32 [addr], val; + * fn_ptr++ = reinterpret_cast( + static_cast( + cuda::ptx::multimem_red)); + // multimem.red.release.cta.global.max.u32 [addr], val; + * fn_ptr++ = reinterpret_cast( + static_cast( + cuda::ptx::multimem_red)); + // multimem.red.release.cluster.global.max.u32 [addr], val; + * fn_ptr++ = reinterpret_cast( + static_cast( + cuda::ptx::multimem_red)); + // multimem.red.release.gpu.global.max.u32 [addr], val; + * fn_ptr++ = reinterpret_cast( + static_cast( + cuda::ptx::multimem_red)); + // multimem.red.release.sys.global.max.u32 [addr], val; + * fn_ptr++ = reinterpret_cast( + static_cast( + cuda::ptx::multimem_red));)); +#endif // __cccl_ptx_isa >= 810 + +#if __cccl_ptx_isa >= 810 + NV_IF_TARGET( + NV_PROVIDES_SM_90, + ( + // multimem.red.relaxed.cta.global.max.u64 [addr], val; + * fn_ptr++ = reinterpret_cast( + static_cast( + cuda::ptx::multimem_red)); + // multimem.red.relaxed.cluster.global.max.u64 [addr], val; + * fn_ptr++ = reinterpret_cast( + static_cast( + cuda::ptx::multimem_red)); + // multimem.red.relaxed.gpu.global.max.u64 [addr], val; + * fn_ptr++ = reinterpret_cast( + static_cast( + cuda::ptx::multimem_red)); + // multimem.red.relaxed.sys.global.max.u64 [addr], val; + * fn_ptr++ = 
reinterpret_cast( + static_cast( + cuda::ptx::multimem_red)); + // multimem.red.release.cta.global.max.u64 [addr], val; + * fn_ptr++ = reinterpret_cast( + static_cast( + cuda::ptx::multimem_red)); + // multimem.red.release.cluster.global.max.u64 [addr], val; + * fn_ptr++ = reinterpret_cast( + static_cast( + cuda::ptx::multimem_red)); + // multimem.red.release.gpu.global.max.u64 [addr], val; + * fn_ptr++ = reinterpret_cast( + static_cast( + cuda::ptx::multimem_red)); + // multimem.red.release.sys.global.max.u64 [addr], val; + * fn_ptr++ = reinterpret_cast( + static_cast( + cuda::ptx::multimem_red));)); +#endif // __cccl_ptx_isa >= 810 + +#if __cccl_ptx_isa >= 810 + NV_IF_TARGET( + NV_PROVIDES_SM_90, + ( + // multimem.red.relaxed.cta.global.max.s32 [addr], val; + * fn_ptr++ = reinterpret_cast( + static_cast( + cuda::ptx::multimem_red)); + // multimem.red.relaxed.cluster.global.max.s32 [addr], val; + * fn_ptr++ = reinterpret_cast( + static_cast( + cuda::ptx::multimem_red)); + // multimem.red.relaxed.gpu.global.max.s32 [addr], val; + * fn_ptr++ = reinterpret_cast( + static_cast( + cuda::ptx::multimem_red)); + // multimem.red.relaxed.sys.global.max.s32 [addr], val; + * fn_ptr++ = reinterpret_cast( + static_cast( + cuda::ptx::multimem_red)); + // multimem.red.release.cta.global.max.s32 [addr], val; + * fn_ptr++ = reinterpret_cast( + static_cast( + cuda::ptx::multimem_red)); + // multimem.red.release.cluster.global.max.s32 [addr], val; + * fn_ptr++ = reinterpret_cast( + static_cast( + cuda::ptx::multimem_red)); + // multimem.red.release.gpu.global.max.s32 [addr], val; + * fn_ptr++ = reinterpret_cast( + static_cast( + cuda::ptx::multimem_red)); + // multimem.red.release.sys.global.max.s32 [addr], val; + * fn_ptr++ = reinterpret_cast( + static_cast( + cuda::ptx::multimem_red));)); +#endif // __cccl_ptx_isa >= 810 + +#if __cccl_ptx_isa >= 810 + NV_IF_TARGET( + NV_PROVIDES_SM_90, + ( + // multimem.red.relaxed.cta.global.max.s64 [addr], val; + * fn_ptr++ = reinterpret_cast( + static_cast( + cuda::ptx::multimem_red)); + // multimem.red.relaxed.cluster.global.max.s64 [addr], val; + * fn_ptr++ = reinterpret_cast( + static_cast( + cuda::ptx::multimem_red)); + // multimem.red.relaxed.gpu.global.max.s64 [addr], val; + * fn_ptr++ = reinterpret_cast( + static_cast( + cuda::ptx::multimem_red)); + // multimem.red.relaxed.sys.global.max.s64 [addr], val; + * fn_ptr++ = reinterpret_cast( + static_cast( + cuda::ptx::multimem_red)); + // multimem.red.release.cta.global.max.s64 [addr], val; + * fn_ptr++ = reinterpret_cast( + static_cast( + cuda::ptx::multimem_red)); + // multimem.red.release.cluster.global.max.s64 [addr], val; + * fn_ptr++ = reinterpret_cast( + static_cast( + cuda::ptx::multimem_red)); + // multimem.red.release.gpu.global.max.s64 [addr], val; + * fn_ptr++ = reinterpret_cast( + static_cast( + cuda::ptx::multimem_red)); + // multimem.red.release.sys.global.max.s64 [addr], val; + * fn_ptr++ = reinterpret_cast( + static_cast( + cuda::ptx::multimem_red));)); +#endif // __cccl_ptx_isa >= 810 + +#if __cccl_ptx_isa >= 810 + NV_IF_TARGET( + NV_PROVIDES_SM_90, + ( + // multimem.red.relaxed.cta.global.add.u32 [addr], val; + * fn_ptr++ = reinterpret_cast( + static_cast( + cuda::ptx::multimem_red)); + // multimem.red.relaxed.cluster.global.add.u32 [addr], val; + * fn_ptr++ = reinterpret_cast( + static_cast( + cuda::ptx::multimem_red)); + // multimem.red.relaxed.gpu.global.add.u32 [addr], val; + * fn_ptr++ = reinterpret_cast( + static_cast( + cuda::ptx::multimem_red)); + // multimem.red.relaxed.sys.global.add.u32 
[addr], val; + * fn_ptr++ = reinterpret_cast( + static_cast( + cuda::ptx::multimem_red)); + // multimem.red.release.cta.global.add.u32 [addr], val; + * fn_ptr++ = reinterpret_cast( + static_cast( + cuda::ptx::multimem_red)); + // multimem.red.release.cluster.global.add.u32 [addr], val; + * fn_ptr++ = reinterpret_cast( + static_cast( + cuda::ptx::multimem_red)); + // multimem.red.release.gpu.global.add.u32 [addr], val; + * fn_ptr++ = reinterpret_cast( + static_cast( + cuda::ptx::multimem_red)); + // multimem.red.release.sys.global.add.u32 [addr], val; + * fn_ptr++ = reinterpret_cast( + static_cast( + cuda::ptx::multimem_red));)); +#endif // __cccl_ptx_isa >= 810 + +#if __cccl_ptx_isa >= 810 + NV_IF_TARGET( + NV_PROVIDES_SM_90, + ( + // multimem.red.relaxed.cta.global.add.u64 [addr], val; + * fn_ptr++ = reinterpret_cast( + static_cast( + cuda::ptx::multimem_red)); + // multimem.red.relaxed.cluster.global.add.u64 [addr], val; + * fn_ptr++ = reinterpret_cast( + static_cast( + cuda::ptx::multimem_red)); + // multimem.red.relaxed.gpu.global.add.u64 [addr], val; + * fn_ptr++ = reinterpret_cast( + static_cast( + cuda::ptx::multimem_red)); + // multimem.red.relaxed.sys.global.add.u64 [addr], val; + * fn_ptr++ = reinterpret_cast( + static_cast( + cuda::ptx::multimem_red)); + // multimem.red.release.cta.global.add.u64 [addr], val; + * fn_ptr++ = reinterpret_cast( + static_cast( + cuda::ptx::multimem_red)); + // multimem.red.release.cluster.global.add.u64 [addr], val; + * fn_ptr++ = reinterpret_cast( + static_cast( + cuda::ptx::multimem_red)); + // multimem.red.release.gpu.global.add.u64 [addr], val; + * fn_ptr++ = reinterpret_cast( + static_cast( + cuda::ptx::multimem_red)); + // multimem.red.release.sys.global.add.u64 [addr], val; + * fn_ptr++ = reinterpret_cast( + static_cast( + cuda::ptx::multimem_red));)); +#endif // __cccl_ptx_isa >= 810 + +#if __cccl_ptx_isa >= 810 + NV_IF_TARGET( + NV_PROVIDES_SM_90, + ( + // multimem.red.relaxed.cta.global.add.s32 [addr], val; + * fn_ptr++ = reinterpret_cast( + static_cast( + cuda::ptx::multimem_red)); + // multimem.red.relaxed.cluster.global.add.s32 [addr], val; + * fn_ptr++ = reinterpret_cast( + static_cast( + cuda::ptx::multimem_red)); + // multimem.red.relaxed.gpu.global.add.s32 [addr], val; + * fn_ptr++ = reinterpret_cast( + static_cast( + cuda::ptx::multimem_red)); + // multimem.red.relaxed.sys.global.add.s32 [addr], val; + * fn_ptr++ = reinterpret_cast( + static_cast( + cuda::ptx::multimem_red)); + // multimem.red.release.cta.global.add.s32 [addr], val; + * fn_ptr++ = reinterpret_cast( + static_cast( + cuda::ptx::multimem_red)); + // multimem.red.release.cluster.global.add.s32 [addr], val; + * fn_ptr++ = reinterpret_cast( + static_cast( + cuda::ptx::multimem_red)); + // multimem.red.release.gpu.global.add.s32 [addr], val; + * fn_ptr++ = reinterpret_cast( + static_cast( + cuda::ptx::multimem_red)); + // multimem.red.release.sys.global.add.s32 [addr], val; + * fn_ptr++ = reinterpret_cast( + static_cast( + cuda::ptx::multimem_red));)); +#endif // __cccl_ptx_isa >= 810 + +#if __cccl_ptx_isa >= 810 + NV_IF_TARGET( + NV_PROVIDES_SM_90, + ( + // multimem.red.relaxed.cta.global.add.u64 [addr], val; + * fn_ptr++ = reinterpret_cast( + static_cast( + cuda::ptx::multimem_red)); + // multimem.red.relaxed.cluster.global.add.u64 [addr], val; + * fn_ptr++ = reinterpret_cast( + static_cast( + cuda::ptx::multimem_red)); + // multimem.red.relaxed.gpu.global.add.u64 [addr], val; + * fn_ptr++ = reinterpret_cast( + static_cast( + cuda::ptx::multimem_red)); + // 
multimem.red.relaxed.sys.global.add.u64 [addr], val; + * fn_ptr++ = reinterpret_cast( + static_cast( + cuda::ptx::multimem_red)); + // multimem.red.release.cta.global.add.u64 [addr], val; + * fn_ptr++ = reinterpret_cast( + static_cast( + cuda::ptx::multimem_red)); + // multimem.red.release.cluster.global.add.u64 [addr], val; + * fn_ptr++ = reinterpret_cast( + static_cast( + cuda::ptx::multimem_red)); + // multimem.red.release.gpu.global.add.u64 [addr], val; + * fn_ptr++ = reinterpret_cast( + static_cast( + cuda::ptx::multimem_red)); + // multimem.red.release.sys.global.add.u64 [addr], val; + * fn_ptr++ = reinterpret_cast( + static_cast( + cuda::ptx::multimem_red));)); +#endif // __cccl_ptx_isa >= 810 + +#if __cccl_ptx_isa >= 810 + NV_IF_TARGET( + NV_PROVIDES_SM_90, + ( + // multimem.red.relaxed.cta.global.and.b32 [addr], val; + * fn_ptr++ = reinterpret_cast( + static_cast( + cuda::ptx::multimem_red)); + // multimem.red.relaxed.cluster.global.and.b32 [addr], val; + * fn_ptr++ = reinterpret_cast( + static_cast( + cuda::ptx::multimem_red)); + // multimem.red.relaxed.gpu.global.and.b32 [addr], val; + * fn_ptr++ = reinterpret_cast( + static_cast( + cuda::ptx::multimem_red)); + // multimem.red.relaxed.sys.global.and.b32 [addr], val; + * fn_ptr++ = reinterpret_cast( + static_cast( + cuda::ptx::multimem_red)); + // multimem.red.release.cta.global.and.b32 [addr], val; + * fn_ptr++ = reinterpret_cast( + static_cast( + cuda::ptx::multimem_red)); + // multimem.red.release.cluster.global.and.b32 [addr], val; + * fn_ptr++ = reinterpret_cast( + static_cast( + cuda::ptx::multimem_red)); + // multimem.red.release.gpu.global.and.b32 [addr], val; + * fn_ptr++ = reinterpret_cast( + static_cast( + cuda::ptx::multimem_red)); + // multimem.red.release.sys.global.and.b32 [addr], val; + * fn_ptr++ = reinterpret_cast( + static_cast( + cuda::ptx::multimem_red));)); +#endif // __cccl_ptx_isa >= 810 + +#if __cccl_ptx_isa >= 810 + NV_IF_TARGET( + NV_PROVIDES_SM_90, + ( + // multimem.red.relaxed.cta.global.or.b32 [addr], val; + * fn_ptr++ = reinterpret_cast( + static_cast( + cuda::ptx::multimem_red)); + // multimem.red.relaxed.cluster.global.or.b32 [addr], val; + * fn_ptr++ = reinterpret_cast( + static_cast( + cuda::ptx::multimem_red)); + // multimem.red.relaxed.gpu.global.or.b32 [addr], val; + * fn_ptr++ = reinterpret_cast( + static_cast( + cuda::ptx::multimem_red)); + // multimem.red.relaxed.sys.global.or.b32 [addr], val; + * fn_ptr++ = reinterpret_cast( + static_cast( + cuda::ptx::multimem_red)); + // multimem.red.release.cta.global.or.b32 [addr], val; + * fn_ptr++ = reinterpret_cast( + static_cast( + cuda::ptx::multimem_red)); + // multimem.red.release.cluster.global.or.b32 [addr], val; + * fn_ptr++ = reinterpret_cast( + static_cast( + cuda::ptx::multimem_red)); + // multimem.red.release.gpu.global.or.b32 [addr], val; + * fn_ptr++ = reinterpret_cast( + static_cast( + cuda::ptx::multimem_red)); + // multimem.red.release.sys.global.or.b32 [addr], val; + * fn_ptr++ = reinterpret_cast( + static_cast( + cuda::ptx::multimem_red));)); +#endif // __cccl_ptx_isa >= 810 + +#if __cccl_ptx_isa >= 810 + NV_IF_TARGET( + NV_PROVIDES_SM_90, + ( + // multimem.red.relaxed.cta.global.xor.b32 [addr], val; + * fn_ptr++ = reinterpret_cast( + static_cast( + cuda::ptx::multimem_red)); + // multimem.red.relaxed.cluster.global.xor.b32 [addr], val; + * fn_ptr++ = reinterpret_cast( + static_cast( + cuda::ptx::multimem_red)); + // multimem.red.relaxed.gpu.global.xor.b32 [addr], val; + * fn_ptr++ = reinterpret_cast( + static_cast( + 
cuda::ptx::multimem_red)); + // multimem.red.relaxed.sys.global.xor.b32 [addr], val; + * fn_ptr++ = reinterpret_cast( + static_cast( + cuda::ptx::multimem_red)); + // multimem.red.release.cta.global.xor.b32 [addr], val; + * fn_ptr++ = reinterpret_cast( + static_cast( + cuda::ptx::multimem_red)); + // multimem.red.release.cluster.global.xor.b32 [addr], val; + * fn_ptr++ = reinterpret_cast( + static_cast( + cuda::ptx::multimem_red)); + // multimem.red.release.gpu.global.xor.b32 [addr], val; + * fn_ptr++ = reinterpret_cast( + static_cast( + cuda::ptx::multimem_red)); + // multimem.red.release.sys.global.xor.b32 [addr], val; + * fn_ptr++ = reinterpret_cast( + static_cast( + cuda::ptx::multimem_red));)); +#endif // __cccl_ptx_isa >= 810 + +#if __cccl_ptx_isa >= 810 + NV_IF_TARGET( + NV_PROVIDES_SM_90, + ( + // multimem.red.relaxed.cta.global.and.b64 [addr], val; + * fn_ptr++ = reinterpret_cast( + static_cast( + cuda::ptx::multimem_red)); + // multimem.red.relaxed.cluster.global.and.b64 [addr], val; + * fn_ptr++ = reinterpret_cast( + static_cast( + cuda::ptx::multimem_red)); + // multimem.red.relaxed.gpu.global.and.b64 [addr], val; + * fn_ptr++ = reinterpret_cast( + static_cast( + cuda::ptx::multimem_red)); + // multimem.red.relaxed.sys.global.and.b64 [addr], val; + * fn_ptr++ = reinterpret_cast( + static_cast( + cuda::ptx::multimem_red)); + // multimem.red.release.cta.global.and.b64 [addr], val; + * fn_ptr++ = reinterpret_cast( + static_cast( + cuda::ptx::multimem_red)); + // multimem.red.release.cluster.global.and.b64 [addr], val; + * fn_ptr++ = reinterpret_cast( + static_cast( + cuda::ptx::multimem_red)); + // multimem.red.release.gpu.global.and.b64 [addr], val; + * fn_ptr++ = reinterpret_cast( + static_cast( + cuda::ptx::multimem_red)); + // multimem.red.release.sys.global.and.b64 [addr], val; + * fn_ptr++ = reinterpret_cast( + static_cast( + cuda::ptx::multimem_red));)); +#endif // __cccl_ptx_isa >= 810 + +#if __cccl_ptx_isa >= 810 + NV_IF_TARGET( + NV_PROVIDES_SM_90, + ( + // multimem.red.relaxed.cta.global.or.b64 [addr], val; + * fn_ptr++ = reinterpret_cast( + static_cast( + cuda::ptx::multimem_red)); + // multimem.red.relaxed.cluster.global.or.b64 [addr], val; + * fn_ptr++ = reinterpret_cast( + static_cast( + cuda::ptx::multimem_red)); + // multimem.red.relaxed.gpu.global.or.b64 [addr], val; + * fn_ptr++ = reinterpret_cast( + static_cast( + cuda::ptx::multimem_red)); + // multimem.red.relaxed.sys.global.or.b64 [addr], val; + * fn_ptr++ = reinterpret_cast( + static_cast( + cuda::ptx::multimem_red)); + // multimem.red.release.cta.global.or.b64 [addr], val; + * fn_ptr++ = reinterpret_cast( + static_cast( + cuda::ptx::multimem_red)); + // multimem.red.release.cluster.global.or.b64 [addr], val; + * fn_ptr++ = reinterpret_cast( + static_cast( + cuda::ptx::multimem_red)); + // multimem.red.release.gpu.global.or.b64 [addr], val; + * fn_ptr++ = reinterpret_cast( + static_cast( + cuda::ptx::multimem_red)); + // multimem.red.release.sys.global.or.b64 [addr], val; + * fn_ptr++ = reinterpret_cast( + static_cast( + cuda::ptx::multimem_red));)); +#endif // __cccl_ptx_isa >= 810 + +#if __cccl_ptx_isa >= 810 + NV_IF_TARGET( + NV_PROVIDES_SM_90, + ( + // multimem.red.relaxed.cta.global.xor.b64 [addr], val; + * fn_ptr++ = reinterpret_cast( + static_cast( + cuda::ptx::multimem_red)); + // multimem.red.relaxed.cluster.global.xor.b64 [addr], val; + * fn_ptr++ = reinterpret_cast( + static_cast( + cuda::ptx::multimem_red)); + // multimem.red.relaxed.gpu.global.xor.b64 [addr], val; + * fn_ptr++ = 
reinterpret_cast( + static_cast( + cuda::ptx::multimem_red)); + // multimem.red.relaxed.sys.global.xor.b64 [addr], val; + * fn_ptr++ = reinterpret_cast( + static_cast( + cuda::ptx::multimem_red)); + // multimem.red.release.cta.global.xor.b64 [addr], val; + * fn_ptr++ = reinterpret_cast( + static_cast( + cuda::ptx::multimem_red)); + // multimem.red.release.cluster.global.xor.b64 [addr], val; + * fn_ptr++ = reinterpret_cast( + static_cast( + cuda::ptx::multimem_red)); + // multimem.red.release.gpu.global.xor.b64 [addr], val; + * fn_ptr++ = reinterpret_cast( + static_cast( + cuda::ptx::multimem_red)); + // multimem.red.release.sys.global.xor.b64 [addr], val; + * fn_ptr++ = reinterpret_cast( + static_cast( + cuda::ptx::multimem_red));)); +#endif // __cccl_ptx_isa >= 810 +} diff --git a/libcudacxx/test/libcudacxx/cuda/ptx/generated/multimem_st.h b/libcudacxx/test/libcudacxx/cuda/ptx/generated/multimem_st.h new file mode 100644 index 00000000000..b61c25430ed --- /dev/null +++ b/libcudacxx/test/libcudacxx/cuda/ptx/generated/multimem_st.h @@ -0,0 +1,110 @@ +// This file was automatically generated. Do not edit. + +// We use a special strategy to force the generation of the PTX. This is mainly +// a fight against dead-code-elimination in the NVVM layer. +// +// The reason we need this strategy is because certain older versions of ptxas +// segfault when a non-sensical sequence of PTX is generated. So instead, we try +// to force the instantiation and compilation to PTX of all the overloads of the +// PTX wrapping functions. +// +// We do this by writing a function pointer of each overload to the kernel +// parameter `fn_ptr`. +// +// Because `fn_ptr` is possibly visible outside this translation unit, the +// compiler must compile all the functions which are stored. 
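[Editor's note: the header comment above describes the anti-dead-code-elimination strategy shared by all of these generated tests: the address of every overload is written through the externally visible kernel parameter `fn_ptr`, so the compiler must instantiate and lower each wrapper to PTX even though nothing ever calls it. A minimal self-contained sketch of the idea follows; it is not part of the patch, and the two overloads of `op` are hypothetical stand-ins for the real `cuda::ptx` wrappers.]

    // Illustrative sketch, not part of the generated file.
    // Two hypothetical overloads standing in for cuda::ptx wrapper overloads.
    __device__ static inline int  op(int* addr)  { return *addr; }
    __device__ static inline long op(long* addr) { return *addr; }

    __global__ void test_force_instantiation(void** fn_ptr)
    {
      // static_cast to a specific function-pointer type selects one overload;
      // storing that pointer through a kernel parameter makes it observable,
      // so neither overload can be dropped as dead code.
      *fn_ptr++ = reinterpret_cast<void*>(static_cast<int (*)(int*)>(op));
      *fn_ptr++ = reinterpret_cast<void*>(static_cast<long (*)(long*)>(op));
    }

[The kernel below, test_multimem_st, applies exactly this mechanism to the real multimem_st overloads, as do the other generated test files in this patch.]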
+ +__global__ void test_multimem_st(void** fn_ptr) +{ +#if __cccl_ptx_isa >= 810 + NV_IF_TARGET(NV_PROVIDES_SM_90, + ( + // multimem.st.weak.global.b32 [addr], val; + * fn_ptr++ = reinterpret_cast( + static_cast(cuda::ptx::multimem_st));)); +#endif // __cccl_ptx_isa >= 810 + +#if __cccl_ptx_isa >= 810 + NV_IF_TARGET( + NV_PROVIDES_SM_90, + ( + // multimem.st.relaxed.cta.global.b32 [addr], val; + * fn_ptr++ = reinterpret_cast( + static_cast( + cuda::ptx::multimem_st)); + // multimem.st.relaxed.cluster.global.b32 [addr], val; + * fn_ptr++ = reinterpret_cast( + static_cast( + cuda::ptx::multimem_st)); + // multimem.st.relaxed.gpu.global.b32 [addr], val; + * fn_ptr++ = reinterpret_cast( + static_cast( + cuda::ptx::multimem_st)); + // multimem.st.relaxed.sys.global.b32 [addr], val; + * fn_ptr++ = reinterpret_cast( + static_cast( + cuda::ptx::multimem_st)); + // multimem.st.release.cta.global.b32 [addr], val; + * fn_ptr++ = reinterpret_cast( + static_cast( + cuda::ptx::multimem_st)); + // multimem.st.release.cluster.global.b32 [addr], val; + * fn_ptr++ = reinterpret_cast( + static_cast( + cuda::ptx::multimem_st)); + // multimem.st.release.gpu.global.b32 [addr], val; + * fn_ptr++ = reinterpret_cast( + static_cast( + cuda::ptx::multimem_st)); + // multimem.st.release.sys.global.b32 [addr], val; + * fn_ptr++ = reinterpret_cast( + static_cast( + cuda::ptx::multimem_st));)); +#endif // __cccl_ptx_isa >= 810 + +#if __cccl_ptx_isa >= 810 + NV_IF_TARGET(NV_PROVIDES_SM_90, + ( + // multimem.st.weak.global.b64 [addr], val; + * fn_ptr++ = reinterpret_cast( + static_cast(cuda::ptx::multimem_st));)); +#endif // __cccl_ptx_isa >= 810 + +#if __cccl_ptx_isa >= 810 + NV_IF_TARGET( + NV_PROVIDES_SM_90, + ( + // multimem.st.relaxed.cta.global.b64 [addr], val; + * fn_ptr++ = reinterpret_cast( + static_cast( + cuda::ptx::multimem_st)); + // multimem.st.relaxed.cluster.global.b64 [addr], val; + * fn_ptr++ = reinterpret_cast( + static_cast( + cuda::ptx::multimem_st)); + // multimem.st.relaxed.gpu.global.b64 [addr], val; + * fn_ptr++ = reinterpret_cast( + static_cast( + cuda::ptx::multimem_st)); + // multimem.st.relaxed.sys.global.b64 [addr], val; + * fn_ptr++ = reinterpret_cast( + static_cast( + cuda::ptx::multimem_st)); + // multimem.st.release.cta.global.b64 [addr], val; + * fn_ptr++ = reinterpret_cast( + static_cast( + cuda::ptx::multimem_st)); + // multimem.st.release.cluster.global.b64 [addr], val; + * fn_ptr++ = reinterpret_cast( + static_cast( + cuda::ptx::multimem_st)); + // multimem.st.release.gpu.global.b64 [addr], val; + * fn_ptr++ = reinterpret_cast( + static_cast( + cuda::ptx::multimem_st)); + // multimem.st.release.sys.global.b64 [addr], val; + * fn_ptr++ = reinterpret_cast( + static_cast( + cuda::ptx::multimem_st));)); +#endif // __cccl_ptx_isa >= 810 +} diff --git a/libcudacxx/test/libcudacxx/cuda/ptx/generated/st_bulk.h b/libcudacxx/test/libcudacxx/cuda/ptx/generated/st_bulk.h new file mode 100644 index 00000000000..d9203b625e8 --- /dev/null +++ b/libcudacxx/test/libcudacxx/cuda/ptx/generated/st_bulk.h @@ -0,0 +1,26 @@ +// This file was automatically generated. Do not edit. + +// We use a special strategy to force the generation of the PTX. This is mainly +// a fight against dead-code-elimination in the NVVM layer. +// +// The reason we need this strategy is because certain older versions of ptxas +// segfault when a non-sensical sequence of PTX is generated. So instead, we try +// to force the instantiation and compilation to PTX of all the overloads of the +// PTX wrapping functions. 
+// +// We do this by writing a function pointer of each overload to the kernel +// parameter `fn_ptr`. +// +// Because `fn_ptr` is possibly visible outside this translation unit, the +// compiler must compile all the functions which are stored. + +__global__ void test_st_bulk(void** fn_ptr) +{ +#if __cccl_ptx_isa >= 860 + NV_IF_TARGET(NV_PROVIDES_SM_100, + ( + // st.bulk.weak.shared::cta [addr], size, initval; + * fn_ptr++ = reinterpret_cast( + static_cast)>(cuda::ptx::st_bulk));)); +#endif // __cccl_ptx_isa >= 860 +} diff --git a/libcudacxx/test/libcudacxx/cuda/ptx/generated/tcgen05_alloc.h b/libcudacxx/test/libcudacxx/cuda/ptx/generated/tcgen05_alloc.h new file mode 100644 index 00000000000..48a40f6f23c --- /dev/null +++ b/libcudacxx/test/libcudacxx/cuda/ptx/generated/tcgen05_alloc.h @@ -0,0 +1,81 @@ +// This file was automatically generated. Do not edit. + +// We use a special strategy to force the generation of the PTX. This is mainly +// a fight against dead-code-elimination in the NVVM layer. +// +// The reason we need this strategy is because certain older versions of ptxas +// segfault when a non-sensical sequence of PTX is generated. So instead, we try +// to force the instantiation and compilation to PTX of all the overloads of the +// PTX wrapping functions. +// +// We do this by writing a function pointer of each overload to the kernel +// parameter `fn_ptr`. +// +// Because `fn_ptr` is possibly visible outside this translation unit, the +// compiler must compile all the functions which are stored. + +__global__ void test_tcgen05_alloc(void** fn_ptr) +{ +#if __cccl_ptx_isa >= 860 + NV_IF_TARGET( + NV_HAS_FEATURE_SM_100a, + ( + // tcgen05.alloc.cta_group::1.sync.aligned.shared::cta.b32 [dst], nCols; + * fn_ptr++ = reinterpret_cast( + static_cast(cuda::ptx::tcgen05_alloc)); + // tcgen05.alloc.cta_group::2.sync.aligned.shared::cta.b32 [dst], nCols; + * fn_ptr++ = reinterpret_cast( + static_cast(cuda::ptx::tcgen05_alloc));)); + NV_IF_TARGET( + NV_HAS_FEATURE_SM_101a, + ( + // tcgen05.alloc.cta_group::1.sync.aligned.shared::cta.b32 [dst], nCols; + * fn_ptr++ = reinterpret_cast( + static_cast(cuda::ptx::tcgen05_alloc)); + // tcgen05.alloc.cta_group::2.sync.aligned.shared::cta.b32 [dst], nCols; + * fn_ptr++ = reinterpret_cast( + static_cast(cuda::ptx::tcgen05_alloc));)); +#endif // __cccl_ptx_isa >= 860 + +#if __cccl_ptx_isa >= 860 + NV_IF_TARGET( + NV_HAS_FEATURE_SM_100a, + ( + // tcgen05.dealloc.cta_group::1.sync.aligned.b32 taddr, nCols; + * fn_ptr++ = reinterpret_cast( + static_cast(cuda::ptx::tcgen05_dealloc)); + // tcgen05.dealloc.cta_group::2.sync.aligned.b32 taddr, nCols; + * fn_ptr++ = reinterpret_cast( + static_cast(cuda::ptx::tcgen05_dealloc));)); + NV_IF_TARGET( + NV_HAS_FEATURE_SM_101a, + ( + // tcgen05.dealloc.cta_group::1.sync.aligned.b32 taddr, nCols; + * fn_ptr++ = reinterpret_cast( + static_cast(cuda::ptx::tcgen05_dealloc)); + // tcgen05.dealloc.cta_group::2.sync.aligned.b32 taddr, nCols; + * fn_ptr++ = reinterpret_cast( + static_cast(cuda::ptx::tcgen05_dealloc));)); +#endif // __cccl_ptx_isa >= 860 + +#if __cccl_ptx_isa >= 860 + NV_IF_TARGET( + NV_HAS_FEATURE_SM_100a, + ( + // tcgen05.relinquish_alloc_permit.cta_group::1.sync.aligned; + * fn_ptr++ = reinterpret_cast( + static_cast(cuda::ptx::tcgen05_relinquish_alloc_permit)); + // tcgen05.relinquish_alloc_permit.cta_group::2.sync.aligned; + * fn_ptr++ = reinterpret_cast( + static_cast(cuda::ptx::tcgen05_relinquish_alloc_permit));)); + NV_IF_TARGET( + NV_HAS_FEATURE_SM_101a, + ( + // 
tcgen05.relinquish_alloc_permit.cta_group::1.sync.aligned; + * fn_ptr++ = reinterpret_cast( + static_cast(cuda::ptx::tcgen05_relinquish_alloc_permit)); + // tcgen05.relinquish_alloc_permit.cta_group::2.sync.aligned; + * fn_ptr++ = reinterpret_cast( + static_cast(cuda::ptx::tcgen05_relinquish_alloc_permit));)); +#endif // __cccl_ptx_isa >= 860 +} diff --git a/libcudacxx/test/libcudacxx/cuda/ptx/generated/tcgen05_commit.h b/libcudacxx/test/libcudacxx/cuda/ptx/generated/tcgen05_commit.h new file mode 100644 index 00000000000..c41981e6917 --- /dev/null +++ b/libcudacxx/test/libcudacxx/cuda/ptx/generated/tcgen05_commit.h @@ -0,0 +1,62 @@ +// This file was automatically generated. Do not edit. + +// We use a special strategy to force the generation of the PTX. This is mainly +// a fight against dead-code-elimination in the NVVM layer. +// +// The reason we need this strategy is because certain older versions of ptxas +// segfault when a non-sensical sequence of PTX is generated. So instead, we try +// to force the instantiation and compilation to PTX of all the overloads of the +// PTX wrapping functions. +// +// We do this by writing a function pointer of each overload to the kernel +// parameter `fn_ptr`. +// +// Because `fn_ptr` is possibly visible outside this translation unit, the +// compiler must compile all the functions which are stored. + +__global__ void test_tcgen05_commit(void** fn_ptr) +{ +#if __cccl_ptx_isa >= 860 + NV_IF_TARGET( + NV_HAS_FEATURE_SM_100a, + ( + // tcgen05.commit.cta_group::1.mbarrier::arrive::one.shared::cluster.b64 [smem_bar]; + * fn_ptr++ = reinterpret_cast( + static_cast(cuda::ptx::tcgen05_commit)); + // tcgen05.commit.cta_group::2.mbarrier::arrive::one.shared::cluster.b64 [smem_bar]; + * fn_ptr++ = reinterpret_cast( + static_cast(cuda::ptx::tcgen05_commit));)); + NV_IF_TARGET( + NV_HAS_FEATURE_SM_101a, + ( + // tcgen05.commit.cta_group::1.mbarrier::arrive::one.shared::cluster.b64 [smem_bar]; + * fn_ptr++ = reinterpret_cast( + static_cast(cuda::ptx::tcgen05_commit)); + // tcgen05.commit.cta_group::2.mbarrier::arrive::one.shared::cluster.b64 [smem_bar]; + * fn_ptr++ = reinterpret_cast( + static_cast(cuda::ptx::tcgen05_commit));)); +#endif // __cccl_ptx_isa >= 860 + +#if __cccl_ptx_isa >= 860 + NV_IF_TARGET( + NV_HAS_FEATURE_SM_100a, + ( + // tcgen05.commit.cta_group::1.mbarrier::arrive::one.shared::cluster.multicast::cluster.b64 [smem_bar], ctaMask; + * fn_ptr++ = reinterpret_cast( + static_cast(cuda::ptx::tcgen05_commit_multicast)); + // tcgen05.commit.cta_group::2.mbarrier::arrive::one.shared::cluster.multicast::cluster.b64 [smem_bar], + // ctaMask; + * fn_ptr++ = reinterpret_cast(static_cast( + cuda::ptx::tcgen05_commit_multicast));)); + NV_IF_TARGET( + NV_HAS_FEATURE_SM_101a, + ( + // tcgen05.commit.cta_group::1.mbarrier::arrive::one.shared::cluster.multicast::cluster.b64 [smem_bar], ctaMask; + * fn_ptr++ = reinterpret_cast( + static_cast(cuda::ptx::tcgen05_commit_multicast)); + // tcgen05.commit.cta_group::2.mbarrier::arrive::one.shared::cluster.multicast::cluster.b64 [smem_bar], + // ctaMask; + * fn_ptr++ = reinterpret_cast(static_cast( + cuda::ptx::tcgen05_commit_multicast));)); +#endif // __cccl_ptx_isa >= 860 +} diff --git a/libcudacxx/test/libcudacxx/cuda/ptx/generated/tcgen05_cp.h b/libcudacxx/test/libcudacxx/cuda/ptx/generated/tcgen05_cp.h new file mode 100644 index 00000000000..4c37cb11cfa --- /dev/null +++ b/libcudacxx/test/libcudacxx/cuda/ptx/generated/tcgen05_cp.h @@ -0,0 +1,396 @@ +// This file was automatically generated. Do not edit. 
+ +// We use a special strategy to force the generation of the PTX. This is mainly +// a fight against dead-code-elimination in the NVVM layer. +// +// The reason we need this strategy is because certain older versions of ptxas +// segfault when a non-sensical sequence of PTX is generated. So instead, we try +// to force the instantiation and compilation to PTX of all the overloads of the +// PTX wrapping functions. +// +// We do this by writing a function pointer of each overload to the kernel +// parameter `fn_ptr`. +// +// Because `fn_ptr` is possibly visible outside this translation unit, the +// compiler must compile all the functions which are stored. + +__global__ void test_tcgen05_cp(void** fn_ptr) +{ +#if __cccl_ptx_isa >= 860 + NV_IF_TARGET( + NV_HAS_FEATURE_SM_100a, + ( + // tcgen05.cp.cta_group::1.128x256b [taddr], s_desc; + * fn_ptr++ = reinterpret_cast( + static_cast(cuda::ptx::tcgen05_cp_128x256b)); + // tcgen05.cp.cta_group::2.128x256b [taddr], s_desc; + * fn_ptr++ = reinterpret_cast( + static_cast(cuda::ptx::tcgen05_cp_128x256b));)); + NV_IF_TARGET( + NV_HAS_FEATURE_SM_101a, + ( + // tcgen05.cp.cta_group::1.128x256b [taddr], s_desc; + * fn_ptr++ = reinterpret_cast( + static_cast(cuda::ptx::tcgen05_cp_128x256b)); + // tcgen05.cp.cta_group::2.128x256b [taddr], s_desc; + * fn_ptr++ = reinterpret_cast( + static_cast(cuda::ptx::tcgen05_cp_128x256b));)); +#endif // __cccl_ptx_isa >= 860 + +#if __cccl_ptx_isa >= 860 + NV_IF_TARGET( + NV_HAS_FEATURE_SM_100a, + ( + // tcgen05.cp.cta_group::1.4x256b [taddr], s_desc; + * fn_ptr++ = reinterpret_cast( + static_cast(cuda::ptx::tcgen05_cp_4x256b)); + // tcgen05.cp.cta_group::2.4x256b [taddr], s_desc; + * fn_ptr++ = reinterpret_cast( + static_cast(cuda::ptx::tcgen05_cp_4x256b));)); + NV_IF_TARGET( + NV_HAS_FEATURE_SM_101a, + ( + // tcgen05.cp.cta_group::1.4x256b [taddr], s_desc; + * fn_ptr++ = reinterpret_cast( + static_cast(cuda::ptx::tcgen05_cp_4x256b)); + // tcgen05.cp.cta_group::2.4x256b [taddr], s_desc; + * fn_ptr++ = reinterpret_cast( + static_cast(cuda::ptx::tcgen05_cp_4x256b));)); +#endif // __cccl_ptx_isa >= 860 + +#if __cccl_ptx_isa >= 860 + NV_IF_TARGET( + NV_HAS_FEATURE_SM_100a, + ( + // tcgen05.cp.cta_group::1.128x128b [taddr], s_desc; + * fn_ptr++ = reinterpret_cast( + static_cast(cuda::ptx::tcgen05_cp_128x128b)); + // tcgen05.cp.cta_group::2.128x128b [taddr], s_desc; + * fn_ptr++ = reinterpret_cast( + static_cast(cuda::ptx::tcgen05_cp_128x128b));)); + NV_IF_TARGET( + NV_HAS_FEATURE_SM_101a, + ( + // tcgen05.cp.cta_group::1.128x128b [taddr], s_desc; + * fn_ptr++ = reinterpret_cast( + static_cast(cuda::ptx::tcgen05_cp_128x128b)); + // tcgen05.cp.cta_group::2.128x128b [taddr], s_desc; + * fn_ptr++ = reinterpret_cast( + static_cast(cuda::ptx::tcgen05_cp_128x128b));)); +#endif // __cccl_ptx_isa >= 860 + +#if __cccl_ptx_isa >= 860 + NV_IF_TARGET( + NV_HAS_FEATURE_SM_100a, + ( + // tcgen05.cp.cta_group::1.64x128b.warpx2::02_13 [taddr], s_desc; + * fn_ptr++ = reinterpret_cast(static_cast( + cuda::ptx::tcgen05_cp_64x128b_warpx2_02_13)); + // tcgen05.cp.cta_group::2.64x128b.warpx2::02_13 [taddr], s_desc; + * fn_ptr++ = reinterpret_cast(static_cast( + cuda::ptx::tcgen05_cp_64x128b_warpx2_02_13));)); + NV_IF_TARGET( + NV_HAS_FEATURE_SM_101a, + ( + // tcgen05.cp.cta_group::1.64x128b.warpx2::02_13 [taddr], s_desc; + * fn_ptr++ = reinterpret_cast(static_cast( + cuda::ptx::tcgen05_cp_64x128b_warpx2_02_13)); + // tcgen05.cp.cta_group::2.64x128b.warpx2::02_13 [taddr], s_desc; + * fn_ptr++ = reinterpret_cast(static_cast( + 
cuda::ptx::tcgen05_cp_64x128b_warpx2_02_13));)); +#endif // __cccl_ptx_isa >= 860 + +#if __cccl_ptx_isa >= 860 + NV_IF_TARGET( + NV_HAS_FEATURE_SM_100a, + ( + // tcgen05.cp.cta_group::1.64x128b.warpx2::01_23 [taddr], s_desc; + * fn_ptr++ = reinterpret_cast(static_cast( + cuda::ptx::tcgen05_cp_64x128b_warpx2_01_23)); + // tcgen05.cp.cta_group::2.64x128b.warpx2::01_23 [taddr], s_desc; + * fn_ptr++ = reinterpret_cast(static_cast( + cuda::ptx::tcgen05_cp_64x128b_warpx2_01_23));)); + NV_IF_TARGET( + NV_HAS_FEATURE_SM_101a, + ( + // tcgen05.cp.cta_group::1.64x128b.warpx2::01_23 [taddr], s_desc; + * fn_ptr++ = reinterpret_cast(static_cast( + cuda::ptx::tcgen05_cp_64x128b_warpx2_01_23)); + // tcgen05.cp.cta_group::2.64x128b.warpx2::01_23 [taddr], s_desc; + * fn_ptr++ = reinterpret_cast(static_cast( + cuda::ptx::tcgen05_cp_64x128b_warpx2_01_23));)); +#endif // __cccl_ptx_isa >= 860 + +#if __cccl_ptx_isa >= 860 + NV_IF_TARGET( + NV_HAS_FEATURE_SM_100a, + ( + // tcgen05.cp.cta_group::1.32x128b.warpx4 [taddr], s_desc; + * fn_ptr++ = reinterpret_cast( + static_cast(cuda::ptx::tcgen05_cp_32x128b_warpx4)); + // tcgen05.cp.cta_group::2.32x128b.warpx4 [taddr], s_desc; + * fn_ptr++ = reinterpret_cast(static_cast( + cuda::ptx::tcgen05_cp_32x128b_warpx4));)); + NV_IF_TARGET( + NV_HAS_FEATURE_SM_101a, + ( + // tcgen05.cp.cta_group::1.32x128b.warpx4 [taddr], s_desc; + * fn_ptr++ = reinterpret_cast( + static_cast(cuda::ptx::tcgen05_cp_32x128b_warpx4)); + // tcgen05.cp.cta_group::2.32x128b.warpx4 [taddr], s_desc; + * fn_ptr++ = reinterpret_cast(static_cast( + cuda::ptx::tcgen05_cp_32x128b_warpx4));)); +#endif // __cccl_ptx_isa >= 860 + +#if __cccl_ptx_isa >= 860 + NV_IF_TARGET( + NV_HAS_FEATURE_SM_100a, + ( + // tcgen05.cp.cta_group::1.128x256b.b8x16.b6x16_p32 [taddr], s_desc; + * fn_ptr++ = reinterpret_cast(static_cast( + cuda::ptx::tcgen05_cp_128x256b_b8x16_b6x16_p32)); + // tcgen05.cp.cta_group::2.128x256b.b8x16.b6x16_p32 [taddr], s_desc; + * fn_ptr++ = reinterpret_cast(static_cast( + cuda::ptx::tcgen05_cp_128x256b_b8x16_b6x16_p32));)); + NV_IF_TARGET( + NV_HAS_FEATURE_SM_101a, + ( + // tcgen05.cp.cta_group::1.128x256b.b8x16.b6x16_p32 [taddr], s_desc; + * fn_ptr++ = reinterpret_cast(static_cast( + cuda::ptx::tcgen05_cp_128x256b_b8x16_b6x16_p32)); + // tcgen05.cp.cta_group::2.128x256b.b8x16.b6x16_p32 [taddr], s_desc; + * fn_ptr++ = reinterpret_cast(static_cast( + cuda::ptx::tcgen05_cp_128x256b_b8x16_b6x16_p32));)); +#endif // __cccl_ptx_isa >= 860 + +#if __cccl_ptx_isa >= 860 + NV_IF_TARGET( + NV_HAS_FEATURE_SM_100a, + ( + // tcgen05.cp.cta_group::1.4x256b.b8x16.b6x16_p32 [taddr], s_desc; + * fn_ptr++ = reinterpret_cast(static_cast( + cuda::ptx::tcgen05_cp_4x256b_b8x16_b6x16_p32)); + // tcgen05.cp.cta_group::2.4x256b.b8x16.b6x16_p32 [taddr], s_desc; + * fn_ptr++ = reinterpret_cast(static_cast( + cuda::ptx::tcgen05_cp_4x256b_b8x16_b6x16_p32));)); + NV_IF_TARGET( + NV_HAS_FEATURE_SM_101a, + ( + // tcgen05.cp.cta_group::1.4x256b.b8x16.b6x16_p32 [taddr], s_desc; + * fn_ptr++ = reinterpret_cast(static_cast( + cuda::ptx::tcgen05_cp_4x256b_b8x16_b6x16_p32)); + // tcgen05.cp.cta_group::2.4x256b.b8x16.b6x16_p32 [taddr], s_desc; + * fn_ptr++ = reinterpret_cast(static_cast( + cuda::ptx::tcgen05_cp_4x256b_b8x16_b6x16_p32));)); +#endif // __cccl_ptx_isa >= 860 + +#if __cccl_ptx_isa >= 860 + NV_IF_TARGET( + NV_HAS_FEATURE_SM_100a, + ( + // tcgen05.cp.cta_group::1.128x128b.b8x16.b6x16_p32 [taddr], s_desc; + * fn_ptr++ = reinterpret_cast(static_cast( + cuda::ptx::tcgen05_cp_128x128b_b8x16_b6x16_p32)); + // 
tcgen05.cp.cta_group::2.128x128b.b8x16.b6x16_p32 [taddr], s_desc; + * fn_ptr++ = reinterpret_cast(static_cast( + cuda::ptx::tcgen05_cp_128x128b_b8x16_b6x16_p32));)); + NV_IF_TARGET( + NV_HAS_FEATURE_SM_101a, + ( + // tcgen05.cp.cta_group::1.128x128b.b8x16.b6x16_p32 [taddr], s_desc; + * fn_ptr++ = reinterpret_cast(static_cast( + cuda::ptx::tcgen05_cp_128x128b_b8x16_b6x16_p32)); + // tcgen05.cp.cta_group::2.128x128b.b8x16.b6x16_p32 [taddr], s_desc; + * fn_ptr++ = reinterpret_cast(static_cast( + cuda::ptx::tcgen05_cp_128x128b_b8x16_b6x16_p32));)); +#endif // __cccl_ptx_isa >= 860 + +#if __cccl_ptx_isa >= 860 + NV_IF_TARGET( + NV_HAS_FEATURE_SM_100a, + ( + // tcgen05.cp.cta_group::1.64x128b.warpx2::02_13.b8x16.b6x16_p32 [taddr], s_desc; + * fn_ptr++ = reinterpret_cast(static_cast( + cuda::ptx::tcgen05_cp_64x128b_warpx2_02_13_b8x16_b6x16_p32)); + // tcgen05.cp.cta_group::2.64x128b.warpx2::02_13.b8x16.b6x16_p32 [taddr], s_desc; + * fn_ptr++ = reinterpret_cast(static_cast( + cuda::ptx::tcgen05_cp_64x128b_warpx2_02_13_b8x16_b6x16_p32));)); + NV_IF_TARGET( + NV_HAS_FEATURE_SM_101a, + ( + // tcgen05.cp.cta_group::1.64x128b.warpx2::02_13.b8x16.b6x16_p32 [taddr], s_desc; + * fn_ptr++ = reinterpret_cast(static_cast( + cuda::ptx::tcgen05_cp_64x128b_warpx2_02_13_b8x16_b6x16_p32)); + // tcgen05.cp.cta_group::2.64x128b.warpx2::02_13.b8x16.b6x16_p32 [taddr], s_desc; + * fn_ptr++ = reinterpret_cast(static_cast( + cuda::ptx::tcgen05_cp_64x128b_warpx2_02_13_b8x16_b6x16_p32));)); +#endif // __cccl_ptx_isa >= 860 + +#if __cccl_ptx_isa >= 860 + NV_IF_TARGET( + NV_HAS_FEATURE_SM_100a, + ( + // tcgen05.cp.cta_group::1.64x128b.warpx2::01_23.b8x16.b6x16_p32 [taddr], s_desc; + * fn_ptr++ = reinterpret_cast(static_cast( + cuda::ptx::tcgen05_cp_64x128b_warpx2_01_23_b8x16_b6x16_p32)); + // tcgen05.cp.cta_group::2.64x128b.warpx2::01_23.b8x16.b6x16_p32 [taddr], s_desc; + * fn_ptr++ = reinterpret_cast(static_cast( + cuda::ptx::tcgen05_cp_64x128b_warpx2_01_23_b8x16_b6x16_p32));)); + NV_IF_TARGET( + NV_HAS_FEATURE_SM_101a, + ( + // tcgen05.cp.cta_group::1.64x128b.warpx2::01_23.b8x16.b6x16_p32 [taddr], s_desc; + * fn_ptr++ = reinterpret_cast(static_cast( + cuda::ptx::tcgen05_cp_64x128b_warpx2_01_23_b8x16_b6x16_p32)); + // tcgen05.cp.cta_group::2.64x128b.warpx2::01_23.b8x16.b6x16_p32 [taddr], s_desc; + * fn_ptr++ = reinterpret_cast(static_cast( + cuda::ptx::tcgen05_cp_64x128b_warpx2_01_23_b8x16_b6x16_p32));)); +#endif // __cccl_ptx_isa >= 860 + +#if __cccl_ptx_isa >= 860 + NV_IF_TARGET( + NV_HAS_FEATURE_SM_100a, + ( + // tcgen05.cp.cta_group::1.32x128b.warpx4.b8x16.b6x16_p32 [taddr], s_desc; + * fn_ptr++ = reinterpret_cast(static_cast( + cuda::ptx::tcgen05_cp_32x128b_warpx4_b8x16_b6x16_p32)); + // tcgen05.cp.cta_group::2.32x128b.warpx4.b8x16.b6x16_p32 [taddr], s_desc; + * fn_ptr++ = reinterpret_cast(static_cast( + cuda::ptx::tcgen05_cp_32x128b_warpx4_b8x16_b6x16_p32));)); + NV_IF_TARGET( + NV_HAS_FEATURE_SM_101a, + ( + // tcgen05.cp.cta_group::1.32x128b.warpx4.b8x16.b6x16_p32 [taddr], s_desc; + * fn_ptr++ = reinterpret_cast(static_cast( + cuda::ptx::tcgen05_cp_32x128b_warpx4_b8x16_b6x16_p32)); + // tcgen05.cp.cta_group::2.32x128b.warpx4.b8x16.b6x16_p32 [taddr], s_desc; + * fn_ptr++ = reinterpret_cast(static_cast( + cuda::ptx::tcgen05_cp_32x128b_warpx4_b8x16_b6x16_p32));)); +#endif // __cccl_ptx_isa >= 860 + +#if __cccl_ptx_isa >= 860 + NV_IF_TARGET( + NV_HAS_FEATURE_SM_100a, + ( + // tcgen05.cp.cta_group::1.128x256b.b8x16.b4x16_p64 [taddr], s_desc; + * fn_ptr++ = reinterpret_cast(static_cast( + 
cuda::ptx::tcgen05_cp_128x256b_b8x16_b4x16_p64)); + // tcgen05.cp.cta_group::2.128x256b.b8x16.b4x16_p64 [taddr], s_desc; + * fn_ptr++ = reinterpret_cast(static_cast( + cuda::ptx::tcgen05_cp_128x256b_b8x16_b4x16_p64));)); + NV_IF_TARGET( + NV_HAS_FEATURE_SM_101a, + ( + // tcgen05.cp.cta_group::1.128x256b.b8x16.b4x16_p64 [taddr], s_desc; + * fn_ptr++ = reinterpret_cast(static_cast( + cuda::ptx::tcgen05_cp_128x256b_b8x16_b4x16_p64)); + // tcgen05.cp.cta_group::2.128x256b.b8x16.b4x16_p64 [taddr], s_desc; + * fn_ptr++ = reinterpret_cast(static_cast( + cuda::ptx::tcgen05_cp_128x256b_b8x16_b4x16_p64));)); +#endif // __cccl_ptx_isa >= 860 + +#if __cccl_ptx_isa >= 860 + NV_IF_TARGET( + NV_HAS_FEATURE_SM_100a, + ( + // tcgen05.cp.cta_group::1.4x256b.b8x16.b4x16_p64 [taddr], s_desc; + * fn_ptr++ = reinterpret_cast(static_cast( + cuda::ptx::tcgen05_cp_4x256b_b8x16_b4x16_p64)); + // tcgen05.cp.cta_group::2.4x256b.b8x16.b4x16_p64 [taddr], s_desc; + * fn_ptr++ = reinterpret_cast(static_cast( + cuda::ptx::tcgen05_cp_4x256b_b8x16_b4x16_p64));)); + NV_IF_TARGET( + NV_HAS_FEATURE_SM_101a, + ( + // tcgen05.cp.cta_group::1.4x256b.b8x16.b4x16_p64 [taddr], s_desc; + * fn_ptr++ = reinterpret_cast(static_cast( + cuda::ptx::tcgen05_cp_4x256b_b8x16_b4x16_p64)); + // tcgen05.cp.cta_group::2.4x256b.b8x16.b4x16_p64 [taddr], s_desc; + * fn_ptr++ = reinterpret_cast(static_cast( + cuda::ptx::tcgen05_cp_4x256b_b8x16_b4x16_p64));)); +#endif // __cccl_ptx_isa >= 860 + +#if __cccl_ptx_isa >= 860 + NV_IF_TARGET( + NV_HAS_FEATURE_SM_100a, + ( + // tcgen05.cp.cta_group::1.128x128b.b8x16.b4x16_p64 [taddr], s_desc; + * fn_ptr++ = reinterpret_cast(static_cast( + cuda::ptx::tcgen05_cp_128x128b_b8x16_b4x16_p64)); + // tcgen05.cp.cta_group::2.128x128b.b8x16.b4x16_p64 [taddr], s_desc; + * fn_ptr++ = reinterpret_cast(static_cast( + cuda::ptx::tcgen05_cp_128x128b_b8x16_b4x16_p64));)); + NV_IF_TARGET( + NV_HAS_FEATURE_SM_101a, + ( + // tcgen05.cp.cta_group::1.128x128b.b8x16.b4x16_p64 [taddr], s_desc; + * fn_ptr++ = reinterpret_cast(static_cast( + cuda::ptx::tcgen05_cp_128x128b_b8x16_b4x16_p64)); + // tcgen05.cp.cta_group::2.128x128b.b8x16.b4x16_p64 [taddr], s_desc; + * fn_ptr++ = reinterpret_cast(static_cast( + cuda::ptx::tcgen05_cp_128x128b_b8x16_b4x16_p64));)); +#endif // __cccl_ptx_isa >= 860 + +#if __cccl_ptx_isa >= 860 + NV_IF_TARGET( + NV_HAS_FEATURE_SM_100a, + ( + // tcgen05.cp.cta_group::1.64x128b.warpx2::02_13.b8x16.b4x16_p64 [taddr], s_desc; + * fn_ptr++ = reinterpret_cast(static_cast( + cuda::ptx::tcgen05_cp_64x128b_warpx2_02_13_b8x16_b4x16_p64)); + // tcgen05.cp.cta_group::2.64x128b.warpx2::02_13.b8x16.b4x16_p64 [taddr], s_desc; + * fn_ptr++ = reinterpret_cast(static_cast( + cuda::ptx::tcgen05_cp_64x128b_warpx2_02_13_b8x16_b4x16_p64));)); + NV_IF_TARGET( + NV_HAS_FEATURE_SM_101a, + ( + // tcgen05.cp.cta_group::1.64x128b.warpx2::02_13.b8x16.b4x16_p64 [taddr], s_desc; + * fn_ptr++ = reinterpret_cast(static_cast( + cuda::ptx::tcgen05_cp_64x128b_warpx2_02_13_b8x16_b4x16_p64)); + // tcgen05.cp.cta_group::2.64x128b.warpx2::02_13.b8x16.b4x16_p64 [taddr], s_desc; + * fn_ptr++ = reinterpret_cast(static_cast( + cuda::ptx::tcgen05_cp_64x128b_warpx2_02_13_b8x16_b4x16_p64));)); +#endif // __cccl_ptx_isa >= 860 + +#if __cccl_ptx_isa >= 860 + NV_IF_TARGET( + NV_HAS_FEATURE_SM_100a, + ( + // tcgen05.cp.cta_group::1.64x128b.warpx2::01_23.b8x16.b4x16_p64 [taddr], s_desc; + * fn_ptr++ = reinterpret_cast(static_cast( + cuda::ptx::tcgen05_cp_64x128b_warpx2_01_23_b8x16_b4x16_p64)); + // tcgen05.cp.cta_group::2.64x128b.warpx2::01_23.b8x16.b4x16_p64 
[taddr], s_desc; + * fn_ptr++ = reinterpret_cast(static_cast( + cuda::ptx::tcgen05_cp_64x128b_warpx2_01_23_b8x16_b4x16_p64));)); + NV_IF_TARGET( + NV_HAS_FEATURE_SM_101a, + ( + // tcgen05.cp.cta_group::1.64x128b.warpx2::01_23.b8x16.b4x16_p64 [taddr], s_desc; + * fn_ptr++ = reinterpret_cast(static_cast( + cuda::ptx::tcgen05_cp_64x128b_warpx2_01_23_b8x16_b4x16_p64)); + // tcgen05.cp.cta_group::2.64x128b.warpx2::01_23.b8x16.b4x16_p64 [taddr], s_desc; + * fn_ptr++ = reinterpret_cast(static_cast( + cuda::ptx::tcgen05_cp_64x128b_warpx2_01_23_b8x16_b4x16_p64));)); +#endif // __cccl_ptx_isa >= 860 + +#if __cccl_ptx_isa >= 860 + NV_IF_TARGET( + NV_HAS_FEATURE_SM_100a, + ( + // tcgen05.cp.cta_group::1.32x128b.warpx4.b8x16.b4x16_p64 [taddr], s_desc; + * fn_ptr++ = reinterpret_cast(static_cast( + cuda::ptx::tcgen05_cp_32x128b_warpx4_b8x16_b4x16_p64)); + // tcgen05.cp.cta_group::2.32x128b.warpx4.b8x16.b4x16_p64 [taddr], s_desc; + * fn_ptr++ = reinterpret_cast(static_cast( + cuda::ptx::tcgen05_cp_32x128b_warpx4_b8x16_b4x16_p64));)); + NV_IF_TARGET( + NV_HAS_FEATURE_SM_101a, + ( + // tcgen05.cp.cta_group::1.32x128b.warpx4.b8x16.b4x16_p64 [taddr], s_desc; + * fn_ptr++ = reinterpret_cast(static_cast( + cuda::ptx::tcgen05_cp_32x128b_warpx4_b8x16_b4x16_p64)); + // tcgen05.cp.cta_group::2.32x128b.warpx4.b8x16.b4x16_p64 [taddr], s_desc; + * fn_ptr++ = reinterpret_cast(static_cast( + cuda::ptx::tcgen05_cp_32x128b_warpx4_b8x16_b4x16_p64));)); +#endif // __cccl_ptx_isa >= 860 +} diff --git a/libcudacxx/test/libcudacxx/cuda/ptx/generated/tcgen05_fence.h b/libcudacxx/test/libcudacxx/cuda/ptx/generated/tcgen05_fence.h new file mode 100644 index 00000000000..75b2ec35fa5 --- /dev/null +++ b/libcudacxx/test/libcudacxx/cuda/ptx/generated/tcgen05_fence.h @@ -0,0 +1,44 @@ +// This file was automatically generated. Do not edit. + +// We use a special strategy to force the generation of the PTX. This is mainly +// a fight against dead-code-elimination in the NVVM layer. +// +// The reason we need this strategy is because certain older versions of ptxas +// segfault when a non-sensical sequence of PTX is generated. So instead, we try +// to force the instantiation and compilation to PTX of all the overloads of the +// PTX wrapping functions. +// +// We do this by writing a function pointer of each overload to the kernel +// parameter `fn_ptr`. +// +// Because `fn_ptr` is possibly visible outside this translation unit, the +// compiler must compile all the functions which are stored. 
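The comment above describes the pattern that every generated test in these files follows: take the address of each wrapper overload and store it through the externally visible kernel parameter so the NVVM layer cannot discard it. A minimal standalone sketch of that pattern, using a hypothetical dummy_wrapper() in place of a cuda::ptx overload, could look like this:

// Sketch only: dummy_wrapper stands in for a cuda::ptx wrapper overload.
__device__ static void dummy_wrapper() {}

__global__ void sketch_force_ptx_generation(void** fn_ptr)
{
  // Storing the pointer through fn_ptr, which is visible outside this
  // translation unit, forces the compiler to emit PTX for dummy_wrapper.
  *fn_ptr++ = reinterpret_cast<void*>(static_cast<void (*)()>(dummy_wrapper));
}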
+ +__global__ void test_tcgen05_fence(void** fn_ptr) +{ +#if __cccl_ptx_isa >= 860 + NV_IF_TARGET( + NV_HAS_FEATURE_SM_100a, + ( + // tcgen05.fence::before_thread_sync; + * fn_ptr++ = reinterpret_cast(static_cast(cuda::ptx::tcgen05_fence_before_thread_sync));)); + NV_IF_TARGET( + NV_HAS_FEATURE_SM_101a, + ( + // tcgen05.fence::before_thread_sync; + * fn_ptr++ = reinterpret_cast(static_cast(cuda::ptx::tcgen05_fence_before_thread_sync));)); +#endif // __cccl_ptx_isa >= 860 + +#if __cccl_ptx_isa >= 860 + NV_IF_TARGET( + NV_HAS_FEATURE_SM_100a, + ( + // tcgen05.fence::after_thread_sync; + * fn_ptr++ = reinterpret_cast(static_cast(cuda::ptx::tcgen05_fence_after_thread_sync));)); + NV_IF_TARGET( + NV_HAS_FEATURE_SM_101a, + ( + // tcgen05.fence::after_thread_sync; + * fn_ptr++ = reinterpret_cast(static_cast(cuda::ptx::tcgen05_fence_after_thread_sync));)); +#endif // __cccl_ptx_isa >= 860 +} diff --git a/libcudacxx/test/libcudacxx/cuda/ptx/generated/tcgen05_ld.h b/libcudacxx/test/libcudacxx/cuda/ptx/generated/tcgen05_ld.h new file mode 100644 index 00000000000..48ecce5869e --- /dev/null +++ b/libcudacxx/test/libcudacxx/cuda/ptx/generated/tcgen05_ld.h @@ -0,0 +1,1012 @@ +// This file was automatically generated. Do not edit. + +// We use a special strategy to force the generation of the PTX. This is mainly +// a fight against dead-code-elimination in the NVVM layer. +// +// The reason we need this strategy is because certain older versions of ptxas +// segfault when a non-sensical sequence of PTX is generated. So instead, we try +// to force the instantiation and compilation to PTX of all the overloads of the +// PTX wrapping functions. +// +// We do this by writing a function pointer of each overload to the kernel +// parameter `fn_ptr`. +// +// Because `fn_ptr` is possibly visible outside this translation unit, the +// compiler must compile all the functions which are stored. 
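Each entry in these generated kernels sits behind the same two guards seen above: a preprocessor check on __cccl_ptx_isa (so the code is only compiled when the toolchain understands PTX ISA 8.6) and an NV_IF_TARGET check (so the store only happens on architectures that provide the instruction, here SM_100a/SM_101a). A condensed sketch of one guarded entry, with a hypothetical fake_wrapper() in place of a real cuda::ptx function, could look like this:

#include <nv/target> // NV_IF_TARGET, NV_HAS_FEATURE_SM_100a
#include <cuda/ptx>  // assumed here to define __cccl_ptx_isa, as in the generated tests

// Hypothetical stand-in for a cuda::ptx wrapper.
__device__ static void fake_wrapper() {}

__global__ void sketch_guarded_entry(void** fn_ptr)
{
#if __cccl_ptx_isa >= 860
  NV_IF_TARGET(NV_HAS_FEATURE_SM_100a,
               (
                 // The store is only compiled for SM_100a; other targets get an empty branch.
                 * fn_ptr++ = reinterpret_cast<void*>(static_cast<void (*)()>(fake_wrapper));));
#endif // __cccl_ptx_isa >= 860
}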
+ +__global__ void test_tcgen05_ld(void** fn_ptr) +{ +#if __cccl_ptx_isa >= 860 + NV_IF_TARGET(NV_HAS_FEATURE_SM_100a, + ( + // tcgen05.ld.sync.aligned.16x64b.x1.b32 out, [taddr]; + * fn_ptr++ = reinterpret_cast( + static_cast(cuda::ptx::tcgen05_ld_16x64b));)); + NV_IF_TARGET(NV_HAS_FEATURE_SM_101a, + ( + // tcgen05.ld.sync.aligned.16x64b.x1.b32 out, [taddr]; + * fn_ptr++ = reinterpret_cast( + static_cast(cuda::ptx::tcgen05_ld_16x64b));)); +#endif // __cccl_ptx_isa >= 860 + +#if __cccl_ptx_isa >= 860 + NV_IF_TARGET(NV_HAS_FEATURE_SM_100a, + ( + // tcgen05.ld.sync.aligned.16x64b.x1.pack::16b.b32 out, [taddr]; + * fn_ptr++ = reinterpret_cast( + static_cast(cuda::ptx::tcgen05_ld_16x64b_pack_16b));)); + NV_IF_TARGET(NV_HAS_FEATURE_SM_101a, + ( + // tcgen05.ld.sync.aligned.16x64b.x1.pack::16b.b32 out, [taddr]; + * fn_ptr++ = reinterpret_cast( + static_cast(cuda::ptx::tcgen05_ld_16x64b_pack_16b));)); +#endif // __cccl_ptx_isa >= 860 + +#if __cccl_ptx_isa >= 860 + NV_IF_TARGET(NV_HAS_FEATURE_SM_100a, + ( + // tcgen05.ld.sync.aligned.16x64b.x2.b32 out, [taddr]; + * fn_ptr++ = reinterpret_cast( + static_cast(cuda::ptx::tcgen05_ld_16x64b));)); + NV_IF_TARGET(NV_HAS_FEATURE_SM_101a, + ( + // tcgen05.ld.sync.aligned.16x64b.x2.b32 out, [taddr]; + * fn_ptr++ = reinterpret_cast( + static_cast(cuda::ptx::tcgen05_ld_16x64b));)); +#endif // __cccl_ptx_isa >= 860 + +#if __cccl_ptx_isa >= 860 + NV_IF_TARGET(NV_HAS_FEATURE_SM_100a, + ( + // tcgen05.ld.sync.aligned.16x64b.x2.pack::16b.b32 out, [taddr]; + * fn_ptr++ = reinterpret_cast( + static_cast(cuda::ptx::tcgen05_ld_16x64b_pack_16b));)); + NV_IF_TARGET(NV_HAS_FEATURE_SM_101a, + ( + // tcgen05.ld.sync.aligned.16x64b.x2.pack::16b.b32 out, [taddr]; + * fn_ptr++ = reinterpret_cast( + static_cast(cuda::ptx::tcgen05_ld_16x64b_pack_16b));)); +#endif // __cccl_ptx_isa >= 860 + +#if __cccl_ptx_isa >= 860 + NV_IF_TARGET(NV_HAS_FEATURE_SM_100a, + ( + // tcgen05.ld.sync.aligned.16x64b.x4.b32 out, [taddr]; + * fn_ptr++ = reinterpret_cast( + static_cast(cuda::ptx::tcgen05_ld_16x64b));)); + NV_IF_TARGET(NV_HAS_FEATURE_SM_101a, + ( + // tcgen05.ld.sync.aligned.16x64b.x4.b32 out, [taddr]; + * fn_ptr++ = reinterpret_cast( + static_cast(cuda::ptx::tcgen05_ld_16x64b));)); +#endif // __cccl_ptx_isa >= 860 + +#if __cccl_ptx_isa >= 860 + NV_IF_TARGET(NV_HAS_FEATURE_SM_100a, + ( + // tcgen05.ld.sync.aligned.16x64b.x4.pack::16b.b32 out, [taddr]; + * fn_ptr++ = reinterpret_cast( + static_cast(cuda::ptx::tcgen05_ld_16x64b_pack_16b));)); + NV_IF_TARGET(NV_HAS_FEATURE_SM_101a, + ( + // tcgen05.ld.sync.aligned.16x64b.x4.pack::16b.b32 out, [taddr]; + * fn_ptr++ = reinterpret_cast( + static_cast(cuda::ptx::tcgen05_ld_16x64b_pack_16b));)); +#endif // __cccl_ptx_isa >= 860 + +#if __cccl_ptx_isa >= 860 + NV_IF_TARGET(NV_HAS_FEATURE_SM_100a, + ( + // tcgen05.ld.sync.aligned.16x64b.x8.b32 out, [taddr]; + * fn_ptr++ = reinterpret_cast( + static_cast(cuda::ptx::tcgen05_ld_16x64b));)); + NV_IF_TARGET(NV_HAS_FEATURE_SM_101a, + ( + // tcgen05.ld.sync.aligned.16x64b.x8.b32 out, [taddr]; + * fn_ptr++ = reinterpret_cast( + static_cast(cuda::ptx::tcgen05_ld_16x64b));)); +#endif // __cccl_ptx_isa >= 860 + +#if __cccl_ptx_isa >= 860 + NV_IF_TARGET(NV_HAS_FEATURE_SM_100a, + ( + // tcgen05.ld.sync.aligned.16x64b.x8.pack::16b.b32 out, [taddr]; + * fn_ptr++ = reinterpret_cast( + static_cast(cuda::ptx::tcgen05_ld_16x64b_pack_16b));)); + NV_IF_TARGET(NV_HAS_FEATURE_SM_101a, + ( + // tcgen05.ld.sync.aligned.16x64b.x8.pack::16b.b32 out, [taddr]; + * fn_ptr++ = reinterpret_cast( + 
static_cast(cuda::ptx::tcgen05_ld_16x64b_pack_16b));)); +#endif // __cccl_ptx_isa >= 860 + +#if __cccl_ptx_isa >= 860 + NV_IF_TARGET(NV_HAS_FEATURE_SM_100a, + ( + // tcgen05.ld.sync.aligned.16x64b.x16.b32 out, [taddr]; + * fn_ptr++ = reinterpret_cast( + static_cast(cuda::ptx::tcgen05_ld_16x64b));)); + NV_IF_TARGET(NV_HAS_FEATURE_SM_101a, + ( + // tcgen05.ld.sync.aligned.16x64b.x16.b32 out, [taddr]; + * fn_ptr++ = reinterpret_cast( + static_cast(cuda::ptx::tcgen05_ld_16x64b));)); +#endif // __cccl_ptx_isa >= 860 + +#if __cccl_ptx_isa >= 860 + NV_IF_TARGET(NV_HAS_FEATURE_SM_100a, + ( + // tcgen05.ld.sync.aligned.16x64b.x16.pack::16b.b32 out, [taddr]; + * fn_ptr++ = reinterpret_cast( + static_cast(cuda::ptx::tcgen05_ld_16x64b_pack_16b));)); + NV_IF_TARGET(NV_HAS_FEATURE_SM_101a, + ( + // tcgen05.ld.sync.aligned.16x64b.x16.pack::16b.b32 out, [taddr]; + * fn_ptr++ = reinterpret_cast( + static_cast(cuda::ptx::tcgen05_ld_16x64b_pack_16b));)); +#endif // __cccl_ptx_isa >= 860 + +#if __cccl_ptx_isa >= 860 + NV_IF_TARGET(NV_HAS_FEATURE_SM_100a, + ( + // tcgen05.ld.sync.aligned.16x64b.x32.b32 out, [taddr]; + * fn_ptr++ = reinterpret_cast( + static_cast(cuda::ptx::tcgen05_ld_16x64b));)); + NV_IF_TARGET(NV_HAS_FEATURE_SM_101a, + ( + // tcgen05.ld.sync.aligned.16x64b.x32.b32 out, [taddr]; + * fn_ptr++ = reinterpret_cast( + static_cast(cuda::ptx::tcgen05_ld_16x64b));)); +#endif // __cccl_ptx_isa >= 860 + +#if __cccl_ptx_isa >= 860 + NV_IF_TARGET(NV_HAS_FEATURE_SM_100a, + ( + // tcgen05.ld.sync.aligned.16x64b.x32.pack::16b.b32 out, [taddr]; + * fn_ptr++ = reinterpret_cast( + static_cast(cuda::ptx::tcgen05_ld_16x64b_pack_16b));)); + NV_IF_TARGET(NV_HAS_FEATURE_SM_101a, + ( + // tcgen05.ld.sync.aligned.16x64b.x32.pack::16b.b32 out, [taddr]; + * fn_ptr++ = reinterpret_cast( + static_cast(cuda::ptx::tcgen05_ld_16x64b_pack_16b));)); +#endif // __cccl_ptx_isa >= 860 + +#if __cccl_ptx_isa >= 860 + NV_IF_TARGET(NV_HAS_FEATURE_SM_100a, + ( + // tcgen05.ld.sync.aligned.16x64b.x64.b32 out, [taddr]; + * fn_ptr++ = reinterpret_cast( + static_cast(cuda::ptx::tcgen05_ld_16x64b));)); + NV_IF_TARGET(NV_HAS_FEATURE_SM_101a, + ( + // tcgen05.ld.sync.aligned.16x64b.x64.b32 out, [taddr]; + * fn_ptr++ = reinterpret_cast( + static_cast(cuda::ptx::tcgen05_ld_16x64b));)); +#endif // __cccl_ptx_isa >= 860 + +#if __cccl_ptx_isa >= 860 + NV_IF_TARGET(NV_HAS_FEATURE_SM_100a, + ( + // tcgen05.ld.sync.aligned.16x64b.x64.pack::16b.b32 out, [taddr]; + * fn_ptr++ = reinterpret_cast( + static_cast(cuda::ptx::tcgen05_ld_16x64b_pack_16b));)); + NV_IF_TARGET(NV_HAS_FEATURE_SM_101a, + ( + // tcgen05.ld.sync.aligned.16x64b.x64.pack::16b.b32 out, [taddr]; + * fn_ptr++ = reinterpret_cast( + static_cast(cuda::ptx::tcgen05_ld_16x64b_pack_16b));)); +#endif // __cccl_ptx_isa >= 860 + +#if __cccl_ptx_isa >= 860 + NV_IF_TARGET(NV_HAS_FEATURE_SM_100a, + ( + // tcgen05.ld.sync.aligned.16x64b.x128.b32 out, [taddr]; + * fn_ptr++ = reinterpret_cast( + static_cast(cuda::ptx::tcgen05_ld_16x64b));)); + NV_IF_TARGET(NV_HAS_FEATURE_SM_101a, + ( + // tcgen05.ld.sync.aligned.16x64b.x128.b32 out, [taddr]; + * fn_ptr++ = reinterpret_cast( + static_cast(cuda::ptx::tcgen05_ld_16x64b));)); +#endif // __cccl_ptx_isa >= 860 + +#if __cccl_ptx_isa >= 860 + NV_IF_TARGET(NV_HAS_FEATURE_SM_100a, + ( + // tcgen05.ld.sync.aligned.16x64b.x128.pack::16b.b32 out, [taddr]; + * fn_ptr++ = reinterpret_cast( + static_cast(cuda::ptx::tcgen05_ld_16x64b_pack_16b));)); + NV_IF_TARGET(NV_HAS_FEATURE_SM_101a, + ( + // tcgen05.ld.sync.aligned.16x64b.x128.pack::16b.b32 out, [taddr]; + * 
fn_ptr++ = reinterpret_cast( + static_cast(cuda::ptx::tcgen05_ld_16x64b_pack_16b));)); +#endif // __cccl_ptx_isa >= 860 + +#if __cccl_ptx_isa >= 860 + NV_IF_TARGET(NV_HAS_FEATURE_SM_100a, + ( + // tcgen05.ld.sync.aligned.16x128b.x1.b32 out, [taddr]; + * fn_ptr++ = reinterpret_cast( + static_cast(cuda::ptx::tcgen05_ld_16x128b));)); + NV_IF_TARGET(NV_HAS_FEATURE_SM_101a, + ( + // tcgen05.ld.sync.aligned.16x128b.x1.b32 out, [taddr]; + * fn_ptr++ = reinterpret_cast( + static_cast(cuda::ptx::tcgen05_ld_16x128b));)); +#endif // __cccl_ptx_isa >= 860 + +#if __cccl_ptx_isa >= 860 + NV_IF_TARGET(NV_HAS_FEATURE_SM_100a, + ( + // tcgen05.ld.sync.aligned.16x128b.x1.pack::16b.b32 out, [taddr]; + * fn_ptr++ = reinterpret_cast( + static_cast(cuda::ptx::tcgen05_ld_16x128b_pack_16b));)); + NV_IF_TARGET(NV_HAS_FEATURE_SM_101a, + ( + // tcgen05.ld.sync.aligned.16x128b.x1.pack::16b.b32 out, [taddr]; + * fn_ptr++ = reinterpret_cast( + static_cast(cuda::ptx::tcgen05_ld_16x128b_pack_16b));)); +#endif // __cccl_ptx_isa >= 860 + +#if __cccl_ptx_isa >= 860 + NV_IF_TARGET(NV_HAS_FEATURE_SM_100a, + ( + // tcgen05.ld.sync.aligned.16x128b.x2.b32 out, [taddr]; + * fn_ptr++ = reinterpret_cast( + static_cast(cuda::ptx::tcgen05_ld_16x128b));)); + NV_IF_TARGET(NV_HAS_FEATURE_SM_101a, + ( + // tcgen05.ld.sync.aligned.16x128b.x2.b32 out, [taddr]; + * fn_ptr++ = reinterpret_cast( + static_cast(cuda::ptx::tcgen05_ld_16x128b));)); +#endif // __cccl_ptx_isa >= 860 + +#if __cccl_ptx_isa >= 860 + NV_IF_TARGET(NV_HAS_FEATURE_SM_100a, + ( + // tcgen05.ld.sync.aligned.16x128b.x2.pack::16b.b32 out, [taddr]; + * fn_ptr++ = reinterpret_cast( + static_cast(cuda::ptx::tcgen05_ld_16x128b_pack_16b));)); + NV_IF_TARGET(NV_HAS_FEATURE_SM_101a, + ( + // tcgen05.ld.sync.aligned.16x128b.x2.pack::16b.b32 out, [taddr]; + * fn_ptr++ = reinterpret_cast( + static_cast(cuda::ptx::tcgen05_ld_16x128b_pack_16b));)); +#endif // __cccl_ptx_isa >= 860 + +#if __cccl_ptx_isa >= 860 + NV_IF_TARGET(NV_HAS_FEATURE_SM_100a, + ( + // tcgen05.ld.sync.aligned.16x128b.x4.b32 out, [taddr]; + * fn_ptr++ = reinterpret_cast( + static_cast(cuda::ptx::tcgen05_ld_16x128b));)); + NV_IF_TARGET(NV_HAS_FEATURE_SM_101a, + ( + // tcgen05.ld.sync.aligned.16x128b.x4.b32 out, [taddr]; + * fn_ptr++ = reinterpret_cast( + static_cast(cuda::ptx::tcgen05_ld_16x128b));)); +#endif // __cccl_ptx_isa >= 860 + +#if __cccl_ptx_isa >= 860 + NV_IF_TARGET(NV_HAS_FEATURE_SM_100a, + ( + // tcgen05.ld.sync.aligned.16x128b.x4.pack::16b.b32 out, [taddr]; + * fn_ptr++ = reinterpret_cast( + static_cast(cuda::ptx::tcgen05_ld_16x128b_pack_16b));)); + NV_IF_TARGET(NV_HAS_FEATURE_SM_101a, + ( + // tcgen05.ld.sync.aligned.16x128b.x4.pack::16b.b32 out, [taddr]; + * fn_ptr++ = reinterpret_cast( + static_cast(cuda::ptx::tcgen05_ld_16x128b_pack_16b));)); +#endif // __cccl_ptx_isa >= 860 + +#if __cccl_ptx_isa >= 860 + NV_IF_TARGET(NV_HAS_FEATURE_SM_100a, + ( + // tcgen05.ld.sync.aligned.16x128b.x8.b32 out, [taddr]; + * fn_ptr++ = reinterpret_cast( + static_cast(cuda::ptx::tcgen05_ld_16x128b));)); + NV_IF_TARGET(NV_HAS_FEATURE_SM_101a, + ( + // tcgen05.ld.sync.aligned.16x128b.x8.b32 out, [taddr]; + * fn_ptr++ = reinterpret_cast( + static_cast(cuda::ptx::tcgen05_ld_16x128b));)); +#endif // __cccl_ptx_isa >= 860 + +#if __cccl_ptx_isa >= 860 + NV_IF_TARGET(NV_HAS_FEATURE_SM_100a, + ( + // tcgen05.ld.sync.aligned.16x128b.x8.pack::16b.b32 out, [taddr]; + * fn_ptr++ = reinterpret_cast( + static_cast(cuda::ptx::tcgen05_ld_16x128b_pack_16b));)); + NV_IF_TARGET(NV_HAS_FEATURE_SM_101a, + ( + // 
tcgen05.ld.sync.aligned.16x128b.x8.pack::16b.b32 out, [taddr]; + * fn_ptr++ = reinterpret_cast( + static_cast(cuda::ptx::tcgen05_ld_16x128b_pack_16b));)); +#endif // __cccl_ptx_isa >= 860 + +#if __cccl_ptx_isa >= 860 + NV_IF_TARGET(NV_HAS_FEATURE_SM_100a, + ( + // tcgen05.ld.sync.aligned.16x128b.x16.b32 out, [taddr]; + * fn_ptr++ = reinterpret_cast( + static_cast(cuda::ptx::tcgen05_ld_16x128b));)); + NV_IF_TARGET(NV_HAS_FEATURE_SM_101a, + ( + // tcgen05.ld.sync.aligned.16x128b.x16.b32 out, [taddr]; + * fn_ptr++ = reinterpret_cast( + static_cast(cuda::ptx::tcgen05_ld_16x128b));)); +#endif // __cccl_ptx_isa >= 860 + +#if __cccl_ptx_isa >= 860 + NV_IF_TARGET(NV_HAS_FEATURE_SM_100a, + ( + // tcgen05.ld.sync.aligned.16x128b.x16.pack::16b.b32 out, [taddr]; + * fn_ptr++ = reinterpret_cast( + static_cast(cuda::ptx::tcgen05_ld_16x128b_pack_16b));)); + NV_IF_TARGET(NV_HAS_FEATURE_SM_101a, + ( + // tcgen05.ld.sync.aligned.16x128b.x16.pack::16b.b32 out, [taddr]; + * fn_ptr++ = reinterpret_cast( + static_cast(cuda::ptx::tcgen05_ld_16x128b_pack_16b));)); +#endif // __cccl_ptx_isa >= 860 + +#if __cccl_ptx_isa >= 860 + NV_IF_TARGET(NV_HAS_FEATURE_SM_100a, + ( + // tcgen05.ld.sync.aligned.16x128b.x32.b32 out, [taddr]; + * fn_ptr++ = reinterpret_cast( + static_cast(cuda::ptx::tcgen05_ld_16x128b));)); + NV_IF_TARGET(NV_HAS_FEATURE_SM_101a, + ( + // tcgen05.ld.sync.aligned.16x128b.x32.b32 out, [taddr]; + * fn_ptr++ = reinterpret_cast( + static_cast(cuda::ptx::tcgen05_ld_16x128b));)); +#endif // __cccl_ptx_isa >= 860 + +#if __cccl_ptx_isa >= 860 + NV_IF_TARGET(NV_HAS_FEATURE_SM_100a, + ( + // tcgen05.ld.sync.aligned.16x128b.x32.pack::16b.b32 out, [taddr]; + * fn_ptr++ = reinterpret_cast( + static_cast(cuda::ptx::tcgen05_ld_16x128b_pack_16b));)); + NV_IF_TARGET(NV_HAS_FEATURE_SM_101a, + ( + // tcgen05.ld.sync.aligned.16x128b.x32.pack::16b.b32 out, [taddr]; + * fn_ptr++ = reinterpret_cast( + static_cast(cuda::ptx::tcgen05_ld_16x128b_pack_16b));)); +#endif // __cccl_ptx_isa >= 860 + +#if __cccl_ptx_isa >= 860 + NV_IF_TARGET(NV_HAS_FEATURE_SM_100a, + ( + // tcgen05.ld.sync.aligned.16x128b.x64.b32 out, [taddr]; + * fn_ptr++ = reinterpret_cast( + static_cast(cuda::ptx::tcgen05_ld_16x128b));)); + NV_IF_TARGET(NV_HAS_FEATURE_SM_101a, + ( + // tcgen05.ld.sync.aligned.16x128b.x64.b32 out, [taddr]; + * fn_ptr++ = reinterpret_cast( + static_cast(cuda::ptx::tcgen05_ld_16x128b));)); +#endif // __cccl_ptx_isa >= 860 + +#if __cccl_ptx_isa >= 860 + NV_IF_TARGET(NV_HAS_FEATURE_SM_100a, + ( + // tcgen05.ld.sync.aligned.16x128b.x64.pack::16b.b32 out, [taddr]; + * fn_ptr++ = reinterpret_cast( + static_cast(cuda::ptx::tcgen05_ld_16x128b_pack_16b));)); + NV_IF_TARGET(NV_HAS_FEATURE_SM_101a, + ( + // tcgen05.ld.sync.aligned.16x128b.x64.pack::16b.b32 out, [taddr]; + * fn_ptr++ = reinterpret_cast( + static_cast(cuda::ptx::tcgen05_ld_16x128b_pack_16b));)); +#endif // __cccl_ptx_isa >= 860 + +#if __cccl_ptx_isa >= 860 + NV_IF_TARGET(NV_HAS_FEATURE_SM_100a, + ( + // tcgen05.ld.sync.aligned.16x256b.x1.b32 out, [taddr]; + * fn_ptr++ = reinterpret_cast( + static_cast(cuda::ptx::tcgen05_ld_16x256b));)); + NV_IF_TARGET(NV_HAS_FEATURE_SM_101a, + ( + // tcgen05.ld.sync.aligned.16x256b.x1.b32 out, [taddr]; + * fn_ptr++ = reinterpret_cast( + static_cast(cuda::ptx::tcgen05_ld_16x256b));)); +#endif // __cccl_ptx_isa >= 860 + +#if __cccl_ptx_isa >= 860 + NV_IF_TARGET(NV_HAS_FEATURE_SM_100a, + ( + // tcgen05.ld.sync.aligned.16x256b.x1.pack::16b.b32 out, [taddr]; + * fn_ptr++ = reinterpret_cast( + 
static_cast(cuda::ptx::tcgen05_ld_16x256b_pack_16b));)); + NV_IF_TARGET(NV_HAS_FEATURE_SM_101a, + ( + // tcgen05.ld.sync.aligned.16x256b.x1.pack::16b.b32 out, [taddr]; + * fn_ptr++ = reinterpret_cast( + static_cast(cuda::ptx::tcgen05_ld_16x256b_pack_16b));)); +#endif // __cccl_ptx_isa >= 860 + +#if __cccl_ptx_isa >= 860 + NV_IF_TARGET(NV_HAS_FEATURE_SM_100a, + ( + // tcgen05.ld.sync.aligned.16x256b.x2.b32 out, [taddr]; + * fn_ptr++ = reinterpret_cast( + static_cast(cuda::ptx::tcgen05_ld_16x256b));)); + NV_IF_TARGET(NV_HAS_FEATURE_SM_101a, + ( + // tcgen05.ld.sync.aligned.16x256b.x2.b32 out, [taddr]; + * fn_ptr++ = reinterpret_cast( + static_cast(cuda::ptx::tcgen05_ld_16x256b));)); +#endif // __cccl_ptx_isa >= 860 + +#if __cccl_ptx_isa >= 860 + NV_IF_TARGET(NV_HAS_FEATURE_SM_100a, + ( + // tcgen05.ld.sync.aligned.16x256b.x2.pack::16b.b32 out, [taddr]; + * fn_ptr++ = reinterpret_cast( + static_cast(cuda::ptx::tcgen05_ld_16x256b_pack_16b));)); + NV_IF_TARGET(NV_HAS_FEATURE_SM_101a, + ( + // tcgen05.ld.sync.aligned.16x256b.x2.pack::16b.b32 out, [taddr]; + * fn_ptr++ = reinterpret_cast( + static_cast(cuda::ptx::tcgen05_ld_16x256b_pack_16b));)); +#endif // __cccl_ptx_isa >= 860 + +#if __cccl_ptx_isa >= 860 + NV_IF_TARGET(NV_HAS_FEATURE_SM_100a, + ( + // tcgen05.ld.sync.aligned.16x256b.x4.b32 out, [taddr]; + * fn_ptr++ = reinterpret_cast( + static_cast(cuda::ptx::tcgen05_ld_16x256b));)); + NV_IF_TARGET(NV_HAS_FEATURE_SM_101a, + ( + // tcgen05.ld.sync.aligned.16x256b.x4.b32 out, [taddr]; + * fn_ptr++ = reinterpret_cast( + static_cast(cuda::ptx::tcgen05_ld_16x256b));)); +#endif // __cccl_ptx_isa >= 860 + +#if __cccl_ptx_isa >= 860 + NV_IF_TARGET(NV_HAS_FEATURE_SM_100a, + ( + // tcgen05.ld.sync.aligned.16x256b.x4.pack::16b.b32 out, [taddr]; + * fn_ptr++ = reinterpret_cast( + static_cast(cuda::ptx::tcgen05_ld_16x256b_pack_16b));)); + NV_IF_TARGET(NV_HAS_FEATURE_SM_101a, + ( + // tcgen05.ld.sync.aligned.16x256b.x4.pack::16b.b32 out, [taddr]; + * fn_ptr++ = reinterpret_cast( + static_cast(cuda::ptx::tcgen05_ld_16x256b_pack_16b));)); +#endif // __cccl_ptx_isa >= 860 + +#if __cccl_ptx_isa >= 860 + NV_IF_TARGET(NV_HAS_FEATURE_SM_100a, + ( + // tcgen05.ld.sync.aligned.16x256b.x8.b32 out, [taddr]; + * fn_ptr++ = reinterpret_cast( + static_cast(cuda::ptx::tcgen05_ld_16x256b));)); + NV_IF_TARGET(NV_HAS_FEATURE_SM_101a, + ( + // tcgen05.ld.sync.aligned.16x256b.x8.b32 out, [taddr]; + * fn_ptr++ = reinterpret_cast( + static_cast(cuda::ptx::tcgen05_ld_16x256b));)); +#endif // __cccl_ptx_isa >= 860 + +#if __cccl_ptx_isa >= 860 + NV_IF_TARGET(NV_HAS_FEATURE_SM_100a, + ( + // tcgen05.ld.sync.aligned.16x256b.x8.pack::16b.b32 out, [taddr]; + * fn_ptr++ = reinterpret_cast( + static_cast(cuda::ptx::tcgen05_ld_16x256b_pack_16b));)); + NV_IF_TARGET(NV_HAS_FEATURE_SM_101a, + ( + // tcgen05.ld.sync.aligned.16x256b.x8.pack::16b.b32 out, [taddr]; + * fn_ptr++ = reinterpret_cast( + static_cast(cuda::ptx::tcgen05_ld_16x256b_pack_16b));)); +#endif // __cccl_ptx_isa >= 860 + +#if __cccl_ptx_isa >= 860 + NV_IF_TARGET(NV_HAS_FEATURE_SM_100a, + ( + // tcgen05.ld.sync.aligned.16x256b.x16.b32 out, [taddr]; + * fn_ptr++ = reinterpret_cast( + static_cast(cuda::ptx::tcgen05_ld_16x256b));)); + NV_IF_TARGET(NV_HAS_FEATURE_SM_101a, + ( + // tcgen05.ld.sync.aligned.16x256b.x16.b32 out, [taddr]; + * fn_ptr++ = reinterpret_cast( + static_cast(cuda::ptx::tcgen05_ld_16x256b));)); +#endif // __cccl_ptx_isa >= 860 + +#if __cccl_ptx_isa >= 860 + NV_IF_TARGET(NV_HAS_FEATURE_SM_100a, + ( + // tcgen05.ld.sync.aligned.16x256b.x16.pack::16b.b32 out, 
[taddr]; + * fn_ptr++ = reinterpret_cast( + static_cast(cuda::ptx::tcgen05_ld_16x256b_pack_16b));)); + NV_IF_TARGET(NV_HAS_FEATURE_SM_101a, + ( + // tcgen05.ld.sync.aligned.16x256b.x16.pack::16b.b32 out, [taddr]; + * fn_ptr++ = reinterpret_cast( + static_cast(cuda::ptx::tcgen05_ld_16x256b_pack_16b));)); +#endif // __cccl_ptx_isa >= 860 + +#if __cccl_ptx_isa >= 860 + NV_IF_TARGET(NV_HAS_FEATURE_SM_100a, + ( + // tcgen05.ld.sync.aligned.16x256b.x32.b32 out, [taddr]; + * fn_ptr++ = reinterpret_cast( + static_cast(cuda::ptx::tcgen05_ld_16x256b));)); + NV_IF_TARGET(NV_HAS_FEATURE_SM_101a, + ( + // tcgen05.ld.sync.aligned.16x256b.x32.b32 out, [taddr]; + * fn_ptr++ = reinterpret_cast( + static_cast(cuda::ptx::tcgen05_ld_16x256b));)); +#endif // __cccl_ptx_isa >= 860 + +#if __cccl_ptx_isa >= 860 + NV_IF_TARGET(NV_HAS_FEATURE_SM_100a, + ( + // tcgen05.ld.sync.aligned.16x256b.x32.pack::16b.b32 out, [taddr]; + * fn_ptr++ = reinterpret_cast( + static_cast(cuda::ptx::tcgen05_ld_16x256b_pack_16b));)); + NV_IF_TARGET(NV_HAS_FEATURE_SM_101a, + ( + // tcgen05.ld.sync.aligned.16x256b.x32.pack::16b.b32 out, [taddr]; + * fn_ptr++ = reinterpret_cast( + static_cast(cuda::ptx::tcgen05_ld_16x256b_pack_16b));)); +#endif // __cccl_ptx_isa >= 860 + +#if __cccl_ptx_isa >= 860 + NV_IF_TARGET(NV_HAS_FEATURE_SM_100a, + ( + // tcgen05.ld.sync.aligned.32x32b.x1.b32 out, [taddr]; + * fn_ptr++ = reinterpret_cast( + static_cast(cuda::ptx::tcgen05_ld_32x32b));)); + NV_IF_TARGET(NV_HAS_FEATURE_SM_101a, + ( + // tcgen05.ld.sync.aligned.32x32b.x1.b32 out, [taddr]; + * fn_ptr++ = reinterpret_cast( + static_cast(cuda::ptx::tcgen05_ld_32x32b));)); +#endif // __cccl_ptx_isa >= 860 + +#if __cccl_ptx_isa >= 860 + NV_IF_TARGET(NV_HAS_FEATURE_SM_100a, + ( + // tcgen05.ld.sync.aligned.32x32b.x1.pack::16b.b32 out, [taddr]; + * fn_ptr++ = reinterpret_cast( + static_cast(cuda::ptx::tcgen05_ld_32x32b_pack_16b));)); + NV_IF_TARGET(NV_HAS_FEATURE_SM_101a, + ( + // tcgen05.ld.sync.aligned.32x32b.x1.pack::16b.b32 out, [taddr]; + * fn_ptr++ = reinterpret_cast( + static_cast(cuda::ptx::tcgen05_ld_32x32b_pack_16b));)); +#endif // __cccl_ptx_isa >= 860 + +#if __cccl_ptx_isa >= 860 + NV_IF_TARGET(NV_HAS_FEATURE_SM_100a, + ( + // tcgen05.ld.sync.aligned.32x32b.x2.b32 out, [taddr]; + * fn_ptr++ = reinterpret_cast( + static_cast(cuda::ptx::tcgen05_ld_32x32b));)); + NV_IF_TARGET(NV_HAS_FEATURE_SM_101a, + ( + // tcgen05.ld.sync.aligned.32x32b.x2.b32 out, [taddr]; + * fn_ptr++ = reinterpret_cast( + static_cast(cuda::ptx::tcgen05_ld_32x32b));)); +#endif // __cccl_ptx_isa >= 860 + +#if __cccl_ptx_isa >= 860 + NV_IF_TARGET(NV_HAS_FEATURE_SM_100a, + ( + // tcgen05.ld.sync.aligned.32x32b.x2.pack::16b.b32 out, [taddr]; + * fn_ptr++ = reinterpret_cast( + static_cast(cuda::ptx::tcgen05_ld_32x32b_pack_16b));)); + NV_IF_TARGET(NV_HAS_FEATURE_SM_101a, + ( + // tcgen05.ld.sync.aligned.32x32b.x2.pack::16b.b32 out, [taddr]; + * fn_ptr++ = reinterpret_cast( + static_cast(cuda::ptx::tcgen05_ld_32x32b_pack_16b));)); +#endif // __cccl_ptx_isa >= 860 + +#if __cccl_ptx_isa >= 860 + NV_IF_TARGET(NV_HAS_FEATURE_SM_100a, + ( + // tcgen05.ld.sync.aligned.32x32b.x4.b32 out, [taddr]; + * fn_ptr++ = reinterpret_cast( + static_cast(cuda::ptx::tcgen05_ld_32x32b));)); + NV_IF_TARGET(NV_HAS_FEATURE_SM_101a, + ( + // tcgen05.ld.sync.aligned.32x32b.x4.b32 out, [taddr]; + * fn_ptr++ = reinterpret_cast( + static_cast(cuda::ptx::tcgen05_ld_32x32b));)); +#endif // __cccl_ptx_isa >= 860 + +#if __cccl_ptx_isa >= 860 + NV_IF_TARGET(NV_HAS_FEATURE_SM_100a, + ( + // 
tcgen05.ld.sync.aligned.32x32b.x4.pack::16b.b32 out, [taddr]; + * fn_ptr++ = reinterpret_cast( + static_cast(cuda::ptx::tcgen05_ld_32x32b_pack_16b));)); + NV_IF_TARGET(NV_HAS_FEATURE_SM_101a, + ( + // tcgen05.ld.sync.aligned.32x32b.x4.pack::16b.b32 out, [taddr]; + * fn_ptr++ = reinterpret_cast( + static_cast(cuda::ptx::tcgen05_ld_32x32b_pack_16b));)); +#endif // __cccl_ptx_isa >= 860 + +#if __cccl_ptx_isa >= 860 + NV_IF_TARGET(NV_HAS_FEATURE_SM_100a, + ( + // tcgen05.ld.sync.aligned.32x32b.x8.b32 out, [taddr]; + * fn_ptr++ = reinterpret_cast( + static_cast(cuda::ptx::tcgen05_ld_32x32b));)); + NV_IF_TARGET(NV_HAS_FEATURE_SM_101a, + ( + // tcgen05.ld.sync.aligned.32x32b.x8.b32 out, [taddr]; + * fn_ptr++ = reinterpret_cast( + static_cast(cuda::ptx::tcgen05_ld_32x32b));)); +#endif // __cccl_ptx_isa >= 860 + +#if __cccl_ptx_isa >= 860 + NV_IF_TARGET(NV_HAS_FEATURE_SM_100a, + ( + // tcgen05.ld.sync.aligned.32x32b.x8.pack::16b.b32 out, [taddr]; + * fn_ptr++ = reinterpret_cast( + static_cast(cuda::ptx::tcgen05_ld_32x32b_pack_16b));)); + NV_IF_TARGET(NV_HAS_FEATURE_SM_101a, + ( + // tcgen05.ld.sync.aligned.32x32b.x8.pack::16b.b32 out, [taddr]; + * fn_ptr++ = reinterpret_cast( + static_cast(cuda::ptx::tcgen05_ld_32x32b_pack_16b));)); +#endif // __cccl_ptx_isa >= 860 + +#if __cccl_ptx_isa >= 860 + NV_IF_TARGET(NV_HAS_FEATURE_SM_100a, + ( + // tcgen05.ld.sync.aligned.32x32b.x16.b32 out, [taddr]; + * fn_ptr++ = reinterpret_cast( + static_cast(cuda::ptx::tcgen05_ld_32x32b));)); + NV_IF_TARGET(NV_HAS_FEATURE_SM_101a, + ( + // tcgen05.ld.sync.aligned.32x32b.x16.b32 out, [taddr]; + * fn_ptr++ = reinterpret_cast( + static_cast(cuda::ptx::tcgen05_ld_32x32b));)); +#endif // __cccl_ptx_isa >= 860 + +#if __cccl_ptx_isa >= 860 + NV_IF_TARGET(NV_HAS_FEATURE_SM_100a, + ( + // tcgen05.ld.sync.aligned.32x32b.x16.pack::16b.b32 out, [taddr]; + * fn_ptr++ = reinterpret_cast( + static_cast(cuda::ptx::tcgen05_ld_32x32b_pack_16b));)); + NV_IF_TARGET(NV_HAS_FEATURE_SM_101a, + ( + // tcgen05.ld.sync.aligned.32x32b.x16.pack::16b.b32 out, [taddr]; + * fn_ptr++ = reinterpret_cast( + static_cast(cuda::ptx::tcgen05_ld_32x32b_pack_16b));)); +#endif // __cccl_ptx_isa >= 860 + +#if __cccl_ptx_isa >= 860 + NV_IF_TARGET(NV_HAS_FEATURE_SM_100a, + ( + // tcgen05.ld.sync.aligned.32x32b.x32.b32 out, [taddr]; + * fn_ptr++ = reinterpret_cast( + static_cast(cuda::ptx::tcgen05_ld_32x32b));)); + NV_IF_TARGET(NV_HAS_FEATURE_SM_101a, + ( + // tcgen05.ld.sync.aligned.32x32b.x32.b32 out, [taddr]; + * fn_ptr++ = reinterpret_cast( + static_cast(cuda::ptx::tcgen05_ld_32x32b));)); +#endif // __cccl_ptx_isa >= 860 + +#if __cccl_ptx_isa >= 860 + NV_IF_TARGET(NV_HAS_FEATURE_SM_100a, + ( + // tcgen05.ld.sync.aligned.32x32b.x32.pack::16b.b32 out, [taddr]; + * fn_ptr++ = reinterpret_cast( + static_cast(cuda::ptx::tcgen05_ld_32x32b_pack_16b));)); + NV_IF_TARGET(NV_HAS_FEATURE_SM_101a, + ( + // tcgen05.ld.sync.aligned.32x32b.x32.pack::16b.b32 out, [taddr]; + * fn_ptr++ = reinterpret_cast( + static_cast(cuda::ptx::tcgen05_ld_32x32b_pack_16b));)); +#endif // __cccl_ptx_isa >= 860 + +#if __cccl_ptx_isa >= 860 + NV_IF_TARGET(NV_HAS_FEATURE_SM_100a, + ( + // tcgen05.ld.sync.aligned.32x32b.x64.b32 out, [taddr]; + * fn_ptr++ = reinterpret_cast( + static_cast(cuda::ptx::tcgen05_ld_32x32b));)); + NV_IF_TARGET(NV_HAS_FEATURE_SM_101a, + ( + // tcgen05.ld.sync.aligned.32x32b.x64.b32 out, [taddr]; + * fn_ptr++ = reinterpret_cast( + static_cast(cuda::ptx::tcgen05_ld_32x32b));)); +#endif // __cccl_ptx_isa >= 860 + +#if __cccl_ptx_isa >= 860 + 
NV_IF_TARGET(NV_HAS_FEATURE_SM_100a, + ( + // tcgen05.ld.sync.aligned.32x32b.x64.pack::16b.b32 out, [taddr]; + * fn_ptr++ = reinterpret_cast( + static_cast(cuda::ptx::tcgen05_ld_32x32b_pack_16b));)); + NV_IF_TARGET(NV_HAS_FEATURE_SM_101a, + ( + // tcgen05.ld.sync.aligned.32x32b.x64.pack::16b.b32 out, [taddr]; + * fn_ptr++ = reinterpret_cast( + static_cast(cuda::ptx::tcgen05_ld_32x32b_pack_16b));)); +#endif // __cccl_ptx_isa >= 860 + +#if __cccl_ptx_isa >= 860 + NV_IF_TARGET(NV_HAS_FEATURE_SM_100a, + ( + // tcgen05.ld.sync.aligned.32x32b.x128.b32 out, [taddr]; + * fn_ptr++ = reinterpret_cast( + static_cast(cuda::ptx::tcgen05_ld_32x32b));)); + NV_IF_TARGET(NV_HAS_FEATURE_SM_101a, + ( + // tcgen05.ld.sync.aligned.32x32b.x128.b32 out, [taddr]; + * fn_ptr++ = reinterpret_cast( + static_cast(cuda::ptx::tcgen05_ld_32x32b));)); +#endif // __cccl_ptx_isa >= 860 + +#if __cccl_ptx_isa >= 860 + NV_IF_TARGET(NV_HAS_FEATURE_SM_100a, + ( + // tcgen05.ld.sync.aligned.32x32b.x128.pack::16b.b32 out, [taddr]; + * fn_ptr++ = reinterpret_cast( + static_cast(cuda::ptx::tcgen05_ld_32x32b_pack_16b));)); + NV_IF_TARGET(NV_HAS_FEATURE_SM_101a, + ( + // tcgen05.ld.sync.aligned.32x32b.x128.pack::16b.b32 out, [taddr]; + * fn_ptr++ = reinterpret_cast( + static_cast(cuda::ptx::tcgen05_ld_32x32b_pack_16b));)); +#endif // __cccl_ptx_isa >= 860 + +#if __cccl_ptx_isa >= 860 + NV_IF_TARGET( + NV_HAS_FEATURE_SM_100a, + ( + // tcgen05.ld.sync.aligned.16x32bx2.x1.b32 out, [taddr], immHalfSplitoff; + * fn_ptr++ = reinterpret_cast( + static_cast)>(cuda::ptx::tcgen05_ld_16x32bx2));)); + NV_IF_TARGET( + NV_HAS_FEATURE_SM_101a, + ( + // tcgen05.ld.sync.aligned.16x32bx2.x1.b32 out, [taddr], immHalfSplitoff; + * fn_ptr++ = reinterpret_cast( + static_cast)>(cuda::ptx::tcgen05_ld_16x32bx2));)); +#endif // __cccl_ptx_isa >= 860 + +#if __cccl_ptx_isa >= 860 + NV_IF_TARGET( + NV_HAS_FEATURE_SM_100a, + ( + // tcgen05.ld.sync.aligned.16x32bx2.x1.pack::16b.b32 out, [taddr], immHalfSplitoff; + * fn_ptr++ = reinterpret_cast(static_cast)>( + cuda::ptx::tcgen05_ld_16x32bx2_pack_16b));)); + NV_IF_TARGET( + NV_HAS_FEATURE_SM_101a, + ( + // tcgen05.ld.sync.aligned.16x32bx2.x1.pack::16b.b32 out, [taddr], immHalfSplitoff; + * fn_ptr++ = reinterpret_cast(static_cast)>( + cuda::ptx::tcgen05_ld_16x32bx2_pack_16b));)); +#endif // __cccl_ptx_isa >= 860 + +#if __cccl_ptx_isa >= 860 + NV_IF_TARGET( + NV_HAS_FEATURE_SM_100a, + ( + // tcgen05.ld.sync.aligned.16x32bx2.x2.b32 out, [taddr], immHalfSplitoff; + * fn_ptr++ = reinterpret_cast( + static_cast)>(cuda::ptx::tcgen05_ld_16x32bx2));)); + NV_IF_TARGET( + NV_HAS_FEATURE_SM_101a, + ( + // tcgen05.ld.sync.aligned.16x32bx2.x2.b32 out, [taddr], immHalfSplitoff; + * fn_ptr++ = reinterpret_cast( + static_cast)>(cuda::ptx::tcgen05_ld_16x32bx2));)); +#endif // __cccl_ptx_isa >= 860 + +#if __cccl_ptx_isa >= 860 + NV_IF_TARGET( + NV_HAS_FEATURE_SM_100a, + ( + // tcgen05.ld.sync.aligned.16x32bx2.x2.pack::16b.b32 out, [taddr], immHalfSplitoff; + * fn_ptr++ = reinterpret_cast(static_cast)>( + cuda::ptx::tcgen05_ld_16x32bx2_pack_16b));)); + NV_IF_TARGET( + NV_HAS_FEATURE_SM_101a, + ( + // tcgen05.ld.sync.aligned.16x32bx2.x2.pack::16b.b32 out, [taddr], immHalfSplitoff; + * fn_ptr++ = reinterpret_cast(static_cast)>( + cuda::ptx::tcgen05_ld_16x32bx2_pack_16b));)); +#endif // __cccl_ptx_isa >= 860 + +#if __cccl_ptx_isa >= 860 + NV_IF_TARGET( + NV_HAS_FEATURE_SM_100a, + ( + // tcgen05.ld.sync.aligned.16x32bx2.x4.b32 out, [taddr], immHalfSplitoff; + * fn_ptr++ = reinterpret_cast( + 
static_cast)>(cuda::ptx::tcgen05_ld_16x32bx2));)); + NV_IF_TARGET( + NV_HAS_FEATURE_SM_101a, + ( + // tcgen05.ld.sync.aligned.16x32bx2.x4.b32 out, [taddr], immHalfSplitoff; + * fn_ptr++ = reinterpret_cast( + static_cast)>(cuda::ptx::tcgen05_ld_16x32bx2));)); +#endif // __cccl_ptx_isa >= 860 + +#if __cccl_ptx_isa >= 860 + NV_IF_TARGET( + NV_HAS_FEATURE_SM_100a, + ( + // tcgen05.ld.sync.aligned.16x32bx2.x4.pack::16b.b32 out, [taddr], immHalfSplitoff; + * fn_ptr++ = reinterpret_cast(static_cast)>( + cuda::ptx::tcgen05_ld_16x32bx2_pack_16b));)); + NV_IF_TARGET( + NV_HAS_FEATURE_SM_101a, + ( + // tcgen05.ld.sync.aligned.16x32bx2.x4.pack::16b.b32 out, [taddr], immHalfSplitoff; + * fn_ptr++ = reinterpret_cast(static_cast)>( + cuda::ptx::tcgen05_ld_16x32bx2_pack_16b));)); +#endif // __cccl_ptx_isa >= 860 + +#if __cccl_ptx_isa >= 860 + NV_IF_TARGET( + NV_HAS_FEATURE_SM_100a, + ( + // tcgen05.ld.sync.aligned.16x32bx2.x8.b32 out, [taddr], immHalfSplitoff; + * fn_ptr++ = reinterpret_cast( + static_cast)>(cuda::ptx::tcgen05_ld_16x32bx2));)); + NV_IF_TARGET( + NV_HAS_FEATURE_SM_101a, + ( + // tcgen05.ld.sync.aligned.16x32bx2.x8.b32 out, [taddr], immHalfSplitoff; + * fn_ptr++ = reinterpret_cast( + static_cast)>(cuda::ptx::tcgen05_ld_16x32bx2));)); +#endif // __cccl_ptx_isa >= 860 + +#if __cccl_ptx_isa >= 860 + NV_IF_TARGET( + NV_HAS_FEATURE_SM_100a, + ( + // tcgen05.ld.sync.aligned.16x32bx2.x8.pack::16b.b32 out, [taddr], immHalfSplitoff; + * fn_ptr++ = reinterpret_cast(static_cast)>( + cuda::ptx::tcgen05_ld_16x32bx2_pack_16b));)); + NV_IF_TARGET( + NV_HAS_FEATURE_SM_101a, + ( + // tcgen05.ld.sync.aligned.16x32bx2.x8.pack::16b.b32 out, [taddr], immHalfSplitoff; + * fn_ptr++ = reinterpret_cast(static_cast)>( + cuda::ptx::tcgen05_ld_16x32bx2_pack_16b));)); +#endif // __cccl_ptx_isa >= 860 + +#if __cccl_ptx_isa >= 860 + NV_IF_TARGET( + NV_HAS_FEATURE_SM_100a, + ( + // tcgen05.ld.sync.aligned.16x32bx2.x16.b32 out, [taddr], immHalfSplitoff; + * fn_ptr++ = reinterpret_cast( + static_cast)>(cuda::ptx::tcgen05_ld_16x32bx2));)); + NV_IF_TARGET( + NV_HAS_FEATURE_SM_101a, + ( + // tcgen05.ld.sync.aligned.16x32bx2.x16.b32 out, [taddr], immHalfSplitoff; + * fn_ptr++ = reinterpret_cast( + static_cast)>(cuda::ptx::tcgen05_ld_16x32bx2));)); +#endif // __cccl_ptx_isa >= 860 + +#if __cccl_ptx_isa >= 860 + NV_IF_TARGET( + NV_HAS_FEATURE_SM_100a, + ( + // tcgen05.ld.sync.aligned.16x32bx2.x16.pack::16b.b32 out, [taddr], immHalfSplitoff; + * fn_ptr++ = reinterpret_cast(static_cast)>( + cuda::ptx::tcgen05_ld_16x32bx2_pack_16b));)); + NV_IF_TARGET( + NV_HAS_FEATURE_SM_101a, + ( + // tcgen05.ld.sync.aligned.16x32bx2.x16.pack::16b.b32 out, [taddr], immHalfSplitoff; + * fn_ptr++ = reinterpret_cast(static_cast)>( + cuda::ptx::tcgen05_ld_16x32bx2_pack_16b));)); +#endif // __cccl_ptx_isa >= 860 + +#if __cccl_ptx_isa >= 860 + NV_IF_TARGET( + NV_HAS_FEATURE_SM_100a, + ( + // tcgen05.ld.sync.aligned.16x32bx2.x32.b32 out, [taddr], immHalfSplitoff; + * fn_ptr++ = reinterpret_cast( + static_cast)>(cuda::ptx::tcgen05_ld_16x32bx2));)); + NV_IF_TARGET( + NV_HAS_FEATURE_SM_101a, + ( + // tcgen05.ld.sync.aligned.16x32bx2.x32.b32 out, [taddr], immHalfSplitoff; + * fn_ptr++ = reinterpret_cast( + static_cast)>(cuda::ptx::tcgen05_ld_16x32bx2));)); +#endif // __cccl_ptx_isa >= 860 + +#if __cccl_ptx_isa >= 860 + NV_IF_TARGET( + NV_HAS_FEATURE_SM_100a, + ( + // tcgen05.ld.sync.aligned.16x32bx2.x32.pack::16b.b32 out, [taddr], immHalfSplitoff; + * fn_ptr++ = reinterpret_cast(static_cast)>( + cuda::ptx::tcgen05_ld_16x32bx2_pack_16b));)); + NV_IF_TARGET( + 
NV_HAS_FEATURE_SM_101a, + ( + // tcgen05.ld.sync.aligned.16x32bx2.x32.pack::16b.b32 out, [taddr], immHalfSplitoff; + * fn_ptr++ = reinterpret_cast(static_cast)>( + cuda::ptx::tcgen05_ld_16x32bx2_pack_16b));)); +#endif // __cccl_ptx_isa >= 860 + +#if __cccl_ptx_isa >= 860 + NV_IF_TARGET( + NV_HAS_FEATURE_SM_100a, + ( + // tcgen05.ld.sync.aligned.16x32bx2.x64.b32 out, [taddr], immHalfSplitoff; + * fn_ptr++ = reinterpret_cast( + static_cast)>(cuda::ptx::tcgen05_ld_16x32bx2));)); + NV_IF_TARGET( + NV_HAS_FEATURE_SM_101a, + ( + // tcgen05.ld.sync.aligned.16x32bx2.x64.b32 out, [taddr], immHalfSplitoff; + * fn_ptr++ = reinterpret_cast( + static_cast)>(cuda::ptx::tcgen05_ld_16x32bx2));)); +#endif // __cccl_ptx_isa >= 860 + +#if __cccl_ptx_isa >= 860 + NV_IF_TARGET( + NV_HAS_FEATURE_SM_100a, + ( + // tcgen05.ld.sync.aligned.16x32bx2.x64.pack::16b.b32 out, [taddr], immHalfSplitoff; + * fn_ptr++ = reinterpret_cast(static_cast)>( + cuda::ptx::tcgen05_ld_16x32bx2_pack_16b));)); + NV_IF_TARGET( + NV_HAS_FEATURE_SM_101a, + ( + // tcgen05.ld.sync.aligned.16x32bx2.x64.pack::16b.b32 out, [taddr], immHalfSplitoff; + * fn_ptr++ = reinterpret_cast(static_cast)>( + cuda::ptx::tcgen05_ld_16x32bx2_pack_16b));)); +#endif // __cccl_ptx_isa >= 860 + +#if __cccl_ptx_isa >= 860 + NV_IF_TARGET( + NV_HAS_FEATURE_SM_100a, + ( + // tcgen05.ld.sync.aligned.16x32bx2.x128.b32 out, [taddr], immHalfSplitoff; + * fn_ptr++ = reinterpret_cast( + static_cast)>(cuda::ptx::tcgen05_ld_16x32bx2));)); + NV_IF_TARGET( + NV_HAS_FEATURE_SM_101a, + ( + // tcgen05.ld.sync.aligned.16x32bx2.x128.b32 out, [taddr], immHalfSplitoff; + * fn_ptr++ = reinterpret_cast( + static_cast)>(cuda::ptx::tcgen05_ld_16x32bx2));)); +#endif // __cccl_ptx_isa >= 860 + +#if __cccl_ptx_isa >= 860 + NV_IF_TARGET( + NV_HAS_FEATURE_SM_100a, + ( + // tcgen05.ld.sync.aligned.16x32bx2.x128.pack::16b.b32 out, [taddr], immHalfSplitoff; + * fn_ptr++ = reinterpret_cast(static_cast)>( + cuda::ptx::tcgen05_ld_16x32bx2_pack_16b));)); + NV_IF_TARGET( + NV_HAS_FEATURE_SM_101a, + ( + // tcgen05.ld.sync.aligned.16x32bx2.x128.pack::16b.b32 out, [taddr], immHalfSplitoff; + * fn_ptr++ = reinterpret_cast(static_cast)>( + cuda::ptx::tcgen05_ld_16x32bx2_pack_16b));)); +#endif // __cccl_ptx_isa >= 860 +} diff --git a/libcudacxx/test/libcudacxx/cuda/ptx/generated/tcgen05_mma.h b/libcudacxx/test/libcudacxx/cuda/ptx/generated/tcgen05_mma.h new file mode 100644 index 00000000000..7146c395fa7 --- /dev/null +++ b/libcudacxx/test/libcudacxx/cuda/ptx/generated/tcgen05_mma.h @@ -0,0 +1,2928 @@ +// This file was automatically generated. Do not edit. + +// We use a special strategy to force the generation of the PTX. This is mainly +// a fight against dead-code-elimination in the NVVM layer. +// +// The reason we need this strategy is because certain older versions of ptxas +// segfault when a non-sensical sequence of PTX is generated. So instead, we try +// to force the instantiation and compilation to PTX of all the overloads of the +// PTX wrapping functions. +// +// We do this by writing a function pointer of each overload to the kernel +// parameter `fn_ptr`. +// +// Because `fn_ptr` is possibly visible outside this translation unit, the +// compiler must compile all the functions which are stored. 
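One more detail worth spelling out before the tcgen05_mma entries: the cuda::ptx wrappers exercised below are overload sets, so the static_cast to a concrete function-pointer type in each entry is what selects a single overload before its address is stored. A small sketch with a hypothetical overload set named wrapper, unrelated to any real cuda::ptx name, could look like this:

// Hypothetical overload set; the static_cast picks one overload per store.
__device__ static void wrapper(int) {}
__device__ static void wrapper(float) {}

__global__ void sketch_overload_selection(void** fn_ptr)
{
  *fn_ptr++ = reinterpret_cast<void*>(static_cast<void (*)(int)>(wrapper));   // selects wrapper(int)
  *fn_ptr++ = reinterpret_cast<void*>(static_cast<void (*)(float)>(wrapper)); // selects wrapper(float)
}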
+ +__global__ void test_tcgen05_mma(void** fn_ptr) +{ +#if __cccl_ptx_isa >= 860 + NV_IF_TARGET( + NV_HAS_FEATURE_SM_100a, + ( + // tcgen05.mma.cta_group::1.kind::f16 [d_tmem], a_desc, b_desc, idesc, disable_output_lane, enable_input_d, + // scale_input_d; + * fn_ptr++ = reinterpret_cast( + static_cast)>(cuda::ptx::tcgen05_mma)); + // tcgen05.mma.cta_group::2.kind::f16 [d_tmem], a_desc, b_desc, idesc, disable_output_lane, enable_input_d, + // scale_input_d; + * fn_ptr++ = reinterpret_cast( + static_cast)>(cuda::ptx::tcgen05_mma)); + // tcgen05.mma.cta_group::1.kind::tf32 [d_tmem], a_desc, b_desc, idesc, disable_output_lane, enable_input_d, + // scale_input_d; + * fn_ptr++ = reinterpret_cast( + static_cast)>(cuda::ptx::tcgen05_mma)); + // tcgen05.mma.cta_group::2.kind::tf32 [d_tmem], a_desc, b_desc, idesc, disable_output_lane, enable_input_d, + // scale_input_d; + * fn_ptr++ = reinterpret_cast( + static_cast)>(cuda::ptx::tcgen05_mma));)); +#endif // __cccl_ptx_isa >= 860 + +#if __cccl_ptx_isa >= 860 + NV_IF_TARGET( + NV_HAS_FEATURE_SM_100a, + ( + // tcgen05.mma.cta_group::1.kind::f16 [d_tmem], a_desc, b_desc, idesc, disable_output_lane, enable_input_d; + * fn_ptr++ = reinterpret_cast( + static_cast(cuda::ptx::tcgen05_mma)); + // tcgen05.mma.cta_group::2.kind::f16 [d_tmem], a_desc, b_desc, idesc, disable_output_lane, enable_input_d; + * fn_ptr++ = reinterpret_cast( + static_cast(cuda::ptx::tcgen05_mma)); + // tcgen05.mma.cta_group::1.kind::tf32 [d_tmem], a_desc, b_desc, idesc, disable_output_lane, enable_input_d; + * fn_ptr++ = reinterpret_cast( + static_cast(cuda::ptx::tcgen05_mma)); + // tcgen05.mma.cta_group::2.kind::tf32 [d_tmem], a_desc, b_desc, idesc, disable_output_lane, enable_input_d; + * fn_ptr++ = reinterpret_cast( + static_cast(cuda::ptx::tcgen05_mma)); + // tcgen05.mma.cta_group::1.kind::f8f6f4 [d_tmem], a_desc, b_desc, idesc, disable_output_lane, enable_input_d; + * fn_ptr++ = reinterpret_cast( + static_cast(cuda::ptx::tcgen05_mma)); + // tcgen05.mma.cta_group::2.kind::f8f6f4 [d_tmem], a_desc, b_desc, idesc, disable_output_lane, enable_input_d; + * fn_ptr++ = reinterpret_cast( + static_cast(cuda::ptx::tcgen05_mma)); + // tcgen05.mma.cta_group::1.kind::i8 [d_tmem], a_desc, b_desc, idesc, disable_output_lane, enable_input_d; + * fn_ptr++ = reinterpret_cast( + static_cast(cuda::ptx::tcgen05_mma)); + // tcgen05.mma.cta_group::2.kind::i8 [d_tmem], a_desc, b_desc, idesc, disable_output_lane, enable_input_d; + * fn_ptr++ = reinterpret_cast( + static_cast(cuda::ptx::tcgen05_mma));)); + NV_IF_TARGET( + NV_HAS_FEATURE_SM_101a, + ( + // tcgen05.mma.cta_group::1.kind::f16 [d_tmem], a_desc, b_desc, idesc, disable_output_lane, enable_input_d; + * fn_ptr++ = reinterpret_cast( + static_cast(cuda::ptx::tcgen05_mma)); + // tcgen05.mma.cta_group::2.kind::f16 [d_tmem], a_desc, b_desc, idesc, disable_output_lane, enable_input_d; + * fn_ptr++ = reinterpret_cast( + static_cast(cuda::ptx::tcgen05_mma)); + // tcgen05.mma.cta_group::1.kind::tf32 [d_tmem], a_desc, b_desc, idesc, disable_output_lane, enable_input_d; + * fn_ptr++ = reinterpret_cast( + static_cast(cuda::ptx::tcgen05_mma)); + // tcgen05.mma.cta_group::2.kind::tf32 [d_tmem], a_desc, b_desc, idesc, disable_output_lane, enable_input_d; + * fn_ptr++ = reinterpret_cast( + static_cast(cuda::ptx::tcgen05_mma)); + // tcgen05.mma.cta_group::1.kind::f8f6f4 [d_tmem], a_desc, b_desc, idesc, disable_output_lane, enable_input_d; + * fn_ptr++ = reinterpret_cast( + static_cast(cuda::ptx::tcgen05_mma)); + // tcgen05.mma.cta_group::2.kind::f8f6f4 [d_tmem], 
a_desc, b_desc, idesc, disable_output_lane, enable_input_d; + * fn_ptr++ = reinterpret_cast( + static_cast(cuda::ptx::tcgen05_mma)); + // tcgen05.mma.cta_group::1.kind::i8 [d_tmem], a_desc, b_desc, idesc, disable_output_lane, enable_input_d; + * fn_ptr++ = reinterpret_cast( + static_cast(cuda::ptx::tcgen05_mma)); + // tcgen05.mma.cta_group::2.kind::i8 [d_tmem], a_desc, b_desc, idesc, disable_output_lane, enable_input_d; + * fn_ptr++ = reinterpret_cast( + static_cast(cuda::ptx::tcgen05_mma));)); +#endif // __cccl_ptx_isa >= 860 + +#if __cccl_ptx_isa >= 860 + NV_IF_TARGET( + NV_HAS_FEATURE_SM_100a, + ( + // tcgen05.mma.cta_group::1.kind::f16 [d_tmem], a_desc, b_desc, idesc, enable_input_d, scale_input_d; + * fn_ptr++ = reinterpret_cast( + static_cast)>(cuda::ptx::tcgen05_mma)); + // tcgen05.mma.cta_group::2.kind::f16 [d_tmem], a_desc, b_desc, idesc, enable_input_d, scale_input_d; + * fn_ptr++ = reinterpret_cast( + static_cast)>(cuda::ptx::tcgen05_mma)); + // tcgen05.mma.cta_group::1.kind::tf32 [d_tmem], a_desc, b_desc, idesc, enable_input_d, scale_input_d; + * fn_ptr++ = reinterpret_cast( + static_cast)>(cuda::ptx::tcgen05_mma)); + // tcgen05.mma.cta_group::2.kind::tf32 [d_tmem], a_desc, b_desc, idesc, enable_input_d, scale_input_d; + * fn_ptr++ = reinterpret_cast( + static_cast)>(cuda::ptx::tcgen05_mma));)); +#endif // __cccl_ptx_isa >= 860 + +#if __cccl_ptx_isa >= 860 + NV_IF_TARGET( + NV_HAS_FEATURE_SM_100a, + ( + // tcgen05.mma.cta_group::1.kind::f16 [d_tmem], a_desc, b_desc, idesc, enable_input_d; + * fn_ptr++ = reinterpret_cast( + static_cast( + cuda::ptx::tcgen05_mma)); + // tcgen05.mma.cta_group::2.kind::f16 [d_tmem], a_desc, b_desc, idesc, enable_input_d; + * fn_ptr++ = reinterpret_cast( + static_cast( + cuda::ptx::tcgen05_mma)); + // tcgen05.mma.cta_group::1.kind::tf32 [d_tmem], a_desc, b_desc, idesc, enable_input_d; + * fn_ptr++ = reinterpret_cast( + static_cast( + cuda::ptx::tcgen05_mma)); + // tcgen05.mma.cta_group::2.kind::tf32 [d_tmem], a_desc, b_desc, idesc, enable_input_d; + * fn_ptr++ = reinterpret_cast( + static_cast( + cuda::ptx::tcgen05_mma)); + // tcgen05.mma.cta_group::1.kind::f8f6f4 [d_tmem], a_desc, b_desc, idesc, enable_input_d; + * fn_ptr++ = reinterpret_cast( + static_cast( + cuda::ptx::tcgen05_mma)); + // tcgen05.mma.cta_group::2.kind::f8f6f4 [d_tmem], a_desc, b_desc, idesc, enable_input_d; + * fn_ptr++ = reinterpret_cast( + static_cast( + cuda::ptx::tcgen05_mma)); + // tcgen05.mma.cta_group::1.kind::i8 [d_tmem], a_desc, b_desc, idesc, enable_input_d; + * fn_ptr++ = reinterpret_cast( + static_cast( + cuda::ptx::tcgen05_mma)); + // tcgen05.mma.cta_group::2.kind::i8 [d_tmem], a_desc, b_desc, idesc, enable_input_d; + * fn_ptr++ = reinterpret_cast( + static_cast( + cuda::ptx::tcgen05_mma));)); + NV_IF_TARGET( + NV_HAS_FEATURE_SM_101a, + ( + // tcgen05.mma.cta_group::1.kind::f16 [d_tmem], a_desc, b_desc, idesc, enable_input_d; + * fn_ptr++ = reinterpret_cast( + static_cast( + cuda::ptx::tcgen05_mma)); + // tcgen05.mma.cta_group::2.kind::f16 [d_tmem], a_desc, b_desc, idesc, enable_input_d; + * fn_ptr++ = reinterpret_cast( + static_cast( + cuda::ptx::tcgen05_mma)); + // tcgen05.mma.cta_group::1.kind::tf32 [d_tmem], a_desc, b_desc, idesc, enable_input_d; + * fn_ptr++ = reinterpret_cast( + static_cast( + cuda::ptx::tcgen05_mma)); + // tcgen05.mma.cta_group::2.kind::tf32 [d_tmem], a_desc, b_desc, idesc, enable_input_d; + * fn_ptr++ = reinterpret_cast( + static_cast( + cuda::ptx::tcgen05_mma)); + // tcgen05.mma.cta_group::1.kind::f8f6f4 [d_tmem], a_desc, b_desc, idesc, 
enable_input_d; + * fn_ptr++ = reinterpret_cast( + static_cast( + cuda::ptx::tcgen05_mma)); + // tcgen05.mma.cta_group::2.kind::f8f6f4 [d_tmem], a_desc, b_desc, idesc, enable_input_d; + * fn_ptr++ = reinterpret_cast( + static_cast( + cuda::ptx::tcgen05_mma)); + // tcgen05.mma.cta_group::1.kind::i8 [d_tmem], a_desc, b_desc, idesc, enable_input_d; + * fn_ptr++ = reinterpret_cast( + static_cast( + cuda::ptx::tcgen05_mma)); + // tcgen05.mma.cta_group::2.kind::i8 [d_tmem], a_desc, b_desc, idesc, enable_input_d; + * fn_ptr++ = reinterpret_cast( + static_cast( + cuda::ptx::tcgen05_mma));)); +#endif // __cccl_ptx_isa >= 860 + +#if __cccl_ptx_isa >= 860 + NV_IF_TARGET( + NV_HAS_FEATURE_SM_100a, + ( + // tcgen05.mma.cta_group::1.kind::f16 [d_tmem], [a_tmem], b_desc, idesc, disable_output_lane, enable_input_d, + // scale_input_d; + * fn_ptr++ = reinterpret_cast( + static_cast)>(cuda::ptx::tcgen05_mma_tmem_a)); + // tcgen05.mma.cta_group::2.kind::f16 [d_tmem], [a_tmem], b_desc, idesc, disable_output_lane, + // enable_input_d, scale_input_d; + * fn_ptr++ = reinterpret_cast( + static_cast)>(cuda::ptx::tcgen05_mma_tmem_a)); + // tcgen05.mma.cta_group::1.kind::tf32 [d_tmem], [a_tmem], b_desc, idesc, disable_output_lane, + // enable_input_d, scale_input_d; + * fn_ptr++ = reinterpret_cast( + static_cast)>(cuda::ptx::tcgen05_mma_tmem_a)); + // tcgen05.mma.cta_group::2.kind::tf32 [d_tmem], [a_tmem], b_desc, idesc, disable_output_lane, + // enable_input_d, scale_input_d; + * fn_ptr++ = reinterpret_cast( + static_cast)>(cuda::ptx::tcgen05_mma_tmem_a));)); +#endif // __cccl_ptx_isa >= 860 + +#if __cccl_ptx_isa >= 860 + NV_IF_TARGET( + NV_HAS_FEATURE_SM_100a, + ( + // tcgen05.mma.cta_group::1.kind::f16 [d_tmem], [a_tmem], b_desc, idesc, disable_output_lane, enable_input_d; + * fn_ptr++ = reinterpret_cast( + static_cast(cuda::ptx::tcgen05_mma_tmem_a)); + // tcgen05.mma.cta_group::2.kind::f16 [d_tmem], [a_tmem], b_desc, idesc, disable_output_lane, enable_input_d; + * fn_ptr++ = reinterpret_cast( + static_cast(cuda::ptx::tcgen05_mma_tmem_a)); + // tcgen05.mma.cta_group::1.kind::tf32 [d_tmem], [a_tmem], b_desc, idesc, disable_output_lane, enable_input_d; + * fn_ptr++ = reinterpret_cast( + static_cast(cuda::ptx::tcgen05_mma_tmem_a)); + // tcgen05.mma.cta_group::2.kind::tf32 [d_tmem], [a_tmem], b_desc, idesc, disable_output_lane, enable_input_d; + * fn_ptr++ = reinterpret_cast( + static_cast(cuda::ptx::tcgen05_mma_tmem_a)); + // tcgen05.mma.cta_group::1.kind::f8f6f4 [d_tmem], [a_tmem], b_desc, idesc, disable_output_lane, + // enable_input_d; + * fn_ptr++ = reinterpret_cast( + static_cast(cuda::ptx::tcgen05_mma_tmem_a)); + // tcgen05.mma.cta_group::2.kind::f8f6f4 [d_tmem], [a_tmem], b_desc, idesc, disable_output_lane, + // enable_input_d; + * fn_ptr++ = reinterpret_cast( + static_cast(cuda::ptx::tcgen05_mma_tmem_a)); + // tcgen05.mma.cta_group::1.kind::i8 [d_tmem], [a_tmem], b_desc, idesc, disable_output_lane, enable_input_d; + * fn_ptr++ = reinterpret_cast( + static_cast(cuda::ptx::tcgen05_mma_tmem_a)); + // tcgen05.mma.cta_group::2.kind::i8 [d_tmem], [a_tmem], b_desc, idesc, disable_output_lane, enable_input_d; + * fn_ptr++ = reinterpret_cast( + static_cast(cuda::ptx::tcgen05_mma_tmem_a));)); + NV_IF_TARGET( + NV_HAS_FEATURE_SM_101a, + ( + // tcgen05.mma.cta_group::1.kind::f16 [d_tmem], [a_tmem], b_desc, idesc, disable_output_lane, enable_input_d; + * fn_ptr++ = reinterpret_cast( + static_cast(cuda::ptx::tcgen05_mma_tmem_a)); + // tcgen05.mma.cta_group::2.kind::f16 [d_tmem], [a_tmem], b_desc, idesc, 
disable_output_lane, enable_input_d; + * fn_ptr++ = reinterpret_cast( + static_cast(cuda::ptx::tcgen05_mma_tmem_a)); + // tcgen05.mma.cta_group::1.kind::tf32 [d_tmem], [a_tmem], b_desc, idesc, disable_output_lane, enable_input_d; + * fn_ptr++ = reinterpret_cast( + static_cast(cuda::ptx::tcgen05_mma_tmem_a)); + // tcgen05.mma.cta_group::2.kind::tf32 [d_tmem], [a_tmem], b_desc, idesc, disable_output_lane, enable_input_d; + * fn_ptr++ = reinterpret_cast( + static_cast(cuda::ptx::tcgen05_mma_tmem_a)); + // tcgen05.mma.cta_group::1.kind::f8f6f4 [d_tmem], [a_tmem], b_desc, idesc, disable_output_lane, + // enable_input_d; + * fn_ptr++ = reinterpret_cast( + static_cast(cuda::ptx::tcgen05_mma_tmem_a)); + // tcgen05.mma.cta_group::2.kind::f8f6f4 [d_tmem], [a_tmem], b_desc, idesc, disable_output_lane, + // enable_input_d; + * fn_ptr++ = reinterpret_cast( + static_cast(cuda::ptx::tcgen05_mma_tmem_a)); + // tcgen05.mma.cta_group::1.kind::i8 [d_tmem], [a_tmem], b_desc, idesc, disable_output_lane, enable_input_d; + * fn_ptr++ = reinterpret_cast( + static_cast(cuda::ptx::tcgen05_mma_tmem_a)); + // tcgen05.mma.cta_group::2.kind::i8 [d_tmem], [a_tmem], b_desc, idesc, disable_output_lane, enable_input_d; + * fn_ptr++ = reinterpret_cast( + static_cast(cuda::ptx::tcgen05_mma_tmem_a));)); +#endif // __cccl_ptx_isa >= 860 + +#if __cccl_ptx_isa >= 860 + NV_IF_TARGET( + NV_HAS_FEATURE_SM_100a, + ( + // tcgen05.mma.cta_group::1.kind::f16 [d_tmem], [a_tmem], b_desc, idesc, enable_input_d, scale_input_d; + * fn_ptr++ = reinterpret_cast( + static_cast)>(cuda::ptx::tcgen05_mma_tmem_a)); + // tcgen05.mma.cta_group::2.kind::f16 [d_tmem], [a_tmem], b_desc, idesc, enable_input_d, scale_input_d; + * fn_ptr++ = reinterpret_cast( + static_cast)>(cuda::ptx::tcgen05_mma_tmem_a)); + // tcgen05.mma.cta_group::1.kind::tf32 [d_tmem], [a_tmem], b_desc, idesc, enable_input_d, scale_input_d; + * fn_ptr++ = reinterpret_cast( + static_cast)>(cuda::ptx::tcgen05_mma_tmem_a)); + // tcgen05.mma.cta_group::2.kind::tf32 [d_tmem], [a_tmem], b_desc, idesc, enable_input_d, scale_input_d; + * fn_ptr++ = reinterpret_cast( + static_cast)>(cuda::ptx::tcgen05_mma_tmem_a));)); +#endif // __cccl_ptx_isa >= 860 + +#if __cccl_ptx_isa >= 860 + NV_IF_TARGET( + NV_HAS_FEATURE_SM_100a, + ( + // tcgen05.mma.cta_group::1.kind::f16 [d_tmem], [a_tmem], b_desc, idesc, enable_input_d; + * fn_ptr++ = reinterpret_cast( + static_cast( + cuda::ptx::tcgen05_mma_tmem_a)); + // tcgen05.mma.cta_group::2.kind::f16 [d_tmem], [a_tmem], b_desc, idesc, enable_input_d; + * fn_ptr++ = reinterpret_cast( + static_cast( + cuda::ptx::tcgen05_mma_tmem_a)); + // tcgen05.mma.cta_group::1.kind::tf32 [d_tmem], [a_tmem], b_desc, idesc, enable_input_d; + * fn_ptr++ = reinterpret_cast( + static_cast( + cuda::ptx::tcgen05_mma_tmem_a)); + // tcgen05.mma.cta_group::2.kind::tf32 [d_tmem], [a_tmem], b_desc, idesc, enable_input_d; + * fn_ptr++ = reinterpret_cast( + static_cast( + cuda::ptx::tcgen05_mma_tmem_a)); + // tcgen05.mma.cta_group::1.kind::f8f6f4 [d_tmem], [a_tmem], b_desc, idesc, enable_input_d; + * fn_ptr++ = reinterpret_cast( + static_cast( + cuda::ptx::tcgen05_mma_tmem_a)); + // tcgen05.mma.cta_group::2.kind::f8f6f4 [d_tmem], [a_tmem], b_desc, idesc, enable_input_d; + * fn_ptr++ = reinterpret_cast( + static_cast( + cuda::ptx::tcgen05_mma_tmem_a)); + // tcgen05.mma.cta_group::1.kind::i8 [d_tmem], [a_tmem], b_desc, idesc, enable_input_d; + * fn_ptr++ = reinterpret_cast( + static_cast( + cuda::ptx::tcgen05_mma_tmem_a)); + // tcgen05.mma.cta_group::2.kind::i8 [d_tmem], [a_tmem], b_desc, 
idesc, enable_input_d; + * fn_ptr++ = reinterpret_cast( + static_cast( + cuda::ptx::tcgen05_mma_tmem_a));)); + NV_IF_TARGET( + NV_HAS_FEATURE_SM_101a, + ( + // tcgen05.mma.cta_group::1.kind::f16 [d_tmem], [a_tmem], b_desc, idesc, enable_input_d; + * fn_ptr++ = reinterpret_cast( + static_cast( + cuda::ptx::tcgen05_mma_tmem_a)); + // tcgen05.mma.cta_group::2.kind::f16 [d_tmem], [a_tmem], b_desc, idesc, enable_input_d; + * fn_ptr++ = reinterpret_cast( + static_cast( + cuda::ptx::tcgen05_mma_tmem_a)); + // tcgen05.mma.cta_group::1.kind::tf32 [d_tmem], [a_tmem], b_desc, idesc, enable_input_d; + * fn_ptr++ = reinterpret_cast( + static_cast( + cuda::ptx::tcgen05_mma_tmem_a)); + // tcgen05.mma.cta_group::2.kind::tf32 [d_tmem], [a_tmem], b_desc, idesc, enable_input_d; + * fn_ptr++ = reinterpret_cast( + static_cast( + cuda::ptx::tcgen05_mma_tmem_a)); + // tcgen05.mma.cta_group::1.kind::f8f6f4 [d_tmem], [a_tmem], b_desc, idesc, enable_input_d; + * fn_ptr++ = reinterpret_cast( + static_cast( + cuda::ptx::tcgen05_mma_tmem_a)); + // tcgen05.mma.cta_group::2.kind::f8f6f4 [d_tmem], [a_tmem], b_desc, idesc, enable_input_d; + * fn_ptr++ = reinterpret_cast( + static_cast( + cuda::ptx::tcgen05_mma_tmem_a)); + // tcgen05.mma.cta_group::1.kind::i8 [d_tmem], [a_tmem], b_desc, idesc, enable_input_d; + * fn_ptr++ = reinterpret_cast( + static_cast( + cuda::ptx::tcgen05_mma_tmem_a)); + // tcgen05.mma.cta_group::2.kind::i8 [d_tmem], [a_tmem], b_desc, idesc, enable_input_d; + * fn_ptr++ = reinterpret_cast( + static_cast( + cuda::ptx::tcgen05_mma_tmem_a));)); +#endif // __cccl_ptx_isa >= 860 + +#if __cccl_ptx_isa >= 860 + NV_IF_TARGET( + NV_HAS_FEATURE_SM_100a, + ( + // tcgen05.mma.cta_group::1.kind::mxf8f6f4.block_scale.scale_vec::1X [d_tmem], a_desc, b_desc, idesc, + // [scale_A_tmem], [scale_B_tmem], enable_input_d; + * fn_ptr++ = reinterpret_cast( + static_cast(cuda::ptx::tcgen05_mma_block_scale_vec_1x)); + // tcgen05.mma.cta_group::2.kind::mxf8f6f4.block_scale.scale_vec::1X [d_tmem], a_desc, b_desc, idesc, + // [scale_A_tmem], [scale_B_tmem], enable_input_d; + * fn_ptr++ = reinterpret_cast( + static_cast(cuda::ptx::tcgen05_mma_block_scale_vec_1x));)); + NV_IF_TARGET( + NV_HAS_FEATURE_SM_101a, + ( + // tcgen05.mma.cta_group::1.kind::mxf8f6f4.block_scale.scale_vec::1X [d_tmem], a_desc, b_desc, idesc, + // [scale_A_tmem], [scale_B_tmem], enable_input_d; + * fn_ptr++ = reinterpret_cast( + static_cast(cuda::ptx::tcgen05_mma_block_scale_vec_1x)); + // tcgen05.mma.cta_group::2.kind::mxf8f6f4.block_scale.scale_vec::1X [d_tmem], a_desc, b_desc, idesc, + // [scale_A_tmem], [scale_B_tmem], enable_input_d; + * fn_ptr++ = reinterpret_cast( + static_cast(cuda::ptx::tcgen05_mma_block_scale_vec_1x));)); +#endif // __cccl_ptx_isa >= 860 + +#if __cccl_ptx_isa >= 860 + NV_IF_TARGET( + NV_HAS_FEATURE_SM_100a, + ( + // tcgen05.mma.cta_group::1.kind::mxf4.block_scale.scale_vec::2X [d_tmem], a_desc, b_desc, idesc, + // [scale_A_tmem], [scale_B_tmem], enable_input_d; + * fn_ptr++ = reinterpret_cast( + static_cast(cuda::ptx::tcgen05_mma_block_scale_vec_2x)); + // tcgen05.mma.cta_group::2.kind::mxf4.block_scale.scale_vec::2X [d_tmem], a_desc, b_desc, idesc, + // [scale_A_tmem], [scale_B_tmem], enable_input_d; + * fn_ptr++ = reinterpret_cast( + static_cast(cuda::ptx::tcgen05_mma_block_scale_vec_2x)); + // tcgen05.mma.cta_group::1.kind::mxf4nvf4.block_scale.scale_vec::2X [d_tmem], a_desc, b_desc, idesc, + // [scale_A_tmem], [scale_B_tmem], enable_input_d; + * fn_ptr++ = reinterpret_cast( + 
static_cast(cuda::ptx::tcgen05_mma_block_scale_vec_2x)); + // tcgen05.mma.cta_group::2.kind::mxf4nvf4.block_scale.scale_vec::2X [d_tmem], a_desc, b_desc, idesc, + // [scale_A_tmem], [scale_B_tmem], enable_input_d; + * fn_ptr++ = reinterpret_cast( + static_cast(cuda::ptx::tcgen05_mma_block_scale_vec_2x));)); + NV_IF_TARGET( + NV_HAS_FEATURE_SM_101a, + ( + // tcgen05.mma.cta_group::1.kind::mxf4.block_scale.scale_vec::2X [d_tmem], a_desc, b_desc, idesc, + // [scale_A_tmem], [scale_B_tmem], enable_input_d; + * fn_ptr++ = reinterpret_cast( + static_cast(cuda::ptx::tcgen05_mma_block_scale_vec_2x)); + // tcgen05.mma.cta_group::2.kind::mxf4.block_scale.scale_vec::2X [d_tmem], a_desc, b_desc, idesc, + // [scale_A_tmem], [scale_B_tmem], enable_input_d; + * fn_ptr++ = reinterpret_cast( + static_cast(cuda::ptx::tcgen05_mma_block_scale_vec_2x)); + // tcgen05.mma.cta_group::1.kind::mxf4nvf4.block_scale.scale_vec::2X [d_tmem], a_desc, b_desc, idesc, + // [scale_A_tmem], [scale_B_tmem], enable_input_d; + * fn_ptr++ = reinterpret_cast( + static_cast(cuda::ptx::tcgen05_mma_block_scale_vec_2x)); + // tcgen05.mma.cta_group::2.kind::mxf4nvf4.block_scale.scale_vec::2X [d_tmem], a_desc, b_desc, idesc, + // [scale_A_tmem], [scale_B_tmem], enable_input_d; + * fn_ptr++ = reinterpret_cast( + static_cast(cuda::ptx::tcgen05_mma_block_scale_vec_2x));)); +#endif // __cccl_ptx_isa >= 860 + +#if __cccl_ptx_isa >= 860 + NV_IF_TARGET( + NV_HAS_FEATURE_SM_100a, + ( + // tcgen05.mma.cta_group::1.kind::mxf4nvf4.block_scale.scale_vec::4X [d_tmem], a_desc, b_desc, idesc, + // [scale_A_tmem], [scale_B_tmem], enable_input_d; + * fn_ptr++ = reinterpret_cast( + static_cast(cuda::ptx::tcgen05_mma_block_scale_vec_4x)); + // tcgen05.mma.cta_group::2.kind::mxf4nvf4.block_scale.scale_vec::4X [d_tmem], a_desc, b_desc, idesc, + // [scale_A_tmem], [scale_B_tmem], enable_input_d; + * fn_ptr++ = reinterpret_cast( + static_cast(cuda::ptx::tcgen05_mma_block_scale_vec_4x));)); + NV_IF_TARGET( + NV_HAS_FEATURE_SM_101a, + ( + // tcgen05.mma.cta_group::1.kind::mxf4nvf4.block_scale.scale_vec::4X [d_tmem], a_desc, b_desc, idesc, + // [scale_A_tmem], [scale_B_tmem], enable_input_d; + * fn_ptr++ = reinterpret_cast( + static_cast(cuda::ptx::tcgen05_mma_block_scale_vec_4x)); + // tcgen05.mma.cta_group::2.kind::mxf4nvf4.block_scale.scale_vec::4X [d_tmem], a_desc, b_desc, idesc, + // [scale_A_tmem], [scale_B_tmem], enable_input_d; + * fn_ptr++ = reinterpret_cast( + static_cast(cuda::ptx::tcgen05_mma_block_scale_vec_4x));)); +#endif // __cccl_ptx_isa >= 860 + +#if __cccl_ptx_isa >= 860 + NV_IF_TARGET( + NV_HAS_FEATURE_SM_100a, + ( + // tcgen05.mma.cta_group::1.kind::mxf8f6f4.block_scale.scale_vec::1X [d_tmem], a_desc, b_desc, idesc, + // [scale_A_tmem], [scale_B_tmem], enable_input_d; + * fn_ptr++ = reinterpret_cast( + static_cast(cuda::ptx::tcgen05_mma_block_scale_vec_1x_tmem_a)); + // tcgen05.mma.cta_group::2.kind::mxf8f6f4.block_scale.scale_vec::1X [d_tmem], a_desc, b_desc, idesc, + // [scale_A_tmem], [scale_B_tmem], enable_input_d; + * fn_ptr++ = reinterpret_cast( + static_cast(cuda::ptx::tcgen05_mma_block_scale_vec_1x_tmem_a));)); + NV_IF_TARGET( + NV_HAS_FEATURE_SM_101a, + ( + // tcgen05.mma.cta_group::1.kind::mxf8f6f4.block_scale.scale_vec::1X [d_tmem], a_desc, b_desc, idesc, + // [scale_A_tmem], [scale_B_tmem], enable_input_d; + * fn_ptr++ = reinterpret_cast( + static_cast(cuda::ptx::tcgen05_mma_block_scale_vec_1x_tmem_a)); + // tcgen05.mma.cta_group::2.kind::mxf8f6f4.block_scale.scale_vec::1X [d_tmem], a_desc, b_desc, idesc, + // [scale_A_tmem], 
[scale_B_tmem], enable_input_d; + * fn_ptr++ = reinterpret_cast( + static_cast(cuda::ptx::tcgen05_mma_block_scale_vec_1x_tmem_a));)); +#endif // __cccl_ptx_isa >= 860 + +#if __cccl_ptx_isa >= 860 + NV_IF_TARGET( + NV_HAS_FEATURE_SM_100a, + ( + // tcgen05.mma.cta_group::1.kind::mxf4.block_scale.scale_vec::2X [d_tmem], a_desc, b_desc, idesc, + // [scale_A_tmem], [scale_B_tmem], enable_input_d; + * fn_ptr++ = reinterpret_cast( + static_cast(cuda::ptx::tcgen05_mma_block_scale_vec_2_tmem_a)); + // tcgen05.mma.cta_group::2.kind::mxf4.block_scale.scale_vec::2X [d_tmem], a_desc, b_desc, idesc, + // [scale_A_tmem], [scale_B_tmem], enable_input_d; + * fn_ptr++ = reinterpret_cast( + static_cast(cuda::ptx::tcgen05_mma_block_scale_vec_2_tmem_a)); + // tcgen05.mma.cta_group::1.kind::mxf4nvf4.block_scale.scale_vec::2X [d_tmem], a_desc, b_desc, idesc, + // [scale_A_tmem], [scale_B_tmem], enable_input_d; + * fn_ptr++ = reinterpret_cast( + static_cast(cuda::ptx::tcgen05_mma_block_scale_vec_2_tmem_a)); + // tcgen05.mma.cta_group::2.kind::mxf4nvf4.block_scale.scale_vec::2X [d_tmem], a_desc, b_desc, idesc, + // [scale_A_tmem], [scale_B_tmem], enable_input_d; + * fn_ptr++ = reinterpret_cast( + static_cast(cuda::ptx::tcgen05_mma_block_scale_vec_2_tmem_a));)); + NV_IF_TARGET( + NV_HAS_FEATURE_SM_101a, + ( + // tcgen05.mma.cta_group::1.kind::mxf4.block_scale.scale_vec::2X [d_tmem], a_desc, b_desc, idesc, + // [scale_A_tmem], [scale_B_tmem], enable_input_d; + * fn_ptr++ = reinterpret_cast( + static_cast(cuda::ptx::tcgen05_mma_block_scale_vec_2_tmem_a)); + // tcgen05.mma.cta_group::2.kind::mxf4.block_scale.scale_vec::2X [d_tmem], a_desc, b_desc, idesc, + // [scale_A_tmem], [scale_B_tmem], enable_input_d; + * fn_ptr++ = reinterpret_cast( + static_cast(cuda::ptx::tcgen05_mma_block_scale_vec_2_tmem_a)); + // tcgen05.mma.cta_group::1.kind::mxf4nvf4.block_scale.scale_vec::2X [d_tmem], a_desc, b_desc, idesc, + // [scale_A_tmem], [scale_B_tmem], enable_input_d; + * fn_ptr++ = reinterpret_cast( + static_cast(cuda::ptx::tcgen05_mma_block_scale_vec_2_tmem_a)); + // tcgen05.mma.cta_group::2.kind::mxf4nvf4.block_scale.scale_vec::2X [d_tmem], a_desc, b_desc, idesc, + // [scale_A_tmem], [scale_B_tmem], enable_input_d; + * fn_ptr++ = reinterpret_cast( + static_cast(cuda::ptx::tcgen05_mma_block_scale_vec_2_tmem_a));)); +#endif // __cccl_ptx_isa >= 860 + +#if __cccl_ptx_isa >= 860 + NV_IF_TARGET( + NV_HAS_FEATURE_SM_100a, + ( + // tcgen05.mma.cta_group::1.kind::mxf4nvf4.block_scale.scale_vec::4X [d_tmem], a_desc, b_desc, idesc, + // [scale_A_tmem], [scale_B_tmem], enable_input_d; + * fn_ptr++ = reinterpret_cast( + static_cast(cuda::ptx::tcgen05_mma_block_scale_vec_4x_tmem_a)); + // tcgen05.mma.cta_group::2.kind::mxf4nvf4.block_scale.scale_vec::4X [d_tmem], a_desc, b_desc, idesc, + // [scale_A_tmem], [scale_B_tmem], enable_input_d; + * fn_ptr++ = reinterpret_cast( + static_cast(cuda::ptx::tcgen05_mma_block_scale_vec_4x_tmem_a));)); + NV_IF_TARGET( + NV_HAS_FEATURE_SM_101a, + ( + // tcgen05.mma.cta_group::1.kind::mxf4nvf4.block_scale.scale_vec::4X [d_tmem], a_desc, b_desc, idesc, + // [scale_A_tmem], [scale_B_tmem], enable_input_d; + * fn_ptr++ = reinterpret_cast( + static_cast(cuda::ptx::tcgen05_mma_block_scale_vec_4x_tmem_a)); + // tcgen05.mma.cta_group::2.kind::mxf4nvf4.block_scale.scale_vec::4X [d_tmem], a_desc, b_desc, idesc, + // [scale_A_tmem], [scale_B_tmem], enable_input_d; + * fn_ptr++ = reinterpret_cast( + static_cast(cuda::ptx::tcgen05_mma_block_scale_vec_4x_tmem_a));)); +#endif // __cccl_ptx_isa >= 860 + +#if 
__cccl_ptx_isa >= 860 + NV_IF_TARGET( + NV_HAS_FEATURE_SM_100a, + ( + // tcgen05.mma.cta_group::1.kind::mxf8f6f4.block_scale.scale_vec::1X.collector::a::fill [d_tmem], a_desc, + // b_desc, idesc, [scale_A_tmem], [scale_B_tmem], enable_input_d; + * fn_ptr++ = reinterpret_cast( + static_cast(cuda::ptx::tcgen05_mma_block_scale_vec_1x_collector_a_fill)); + // tcgen05.mma.cta_group::2.kind::mxf8f6f4.block_scale.scale_vec::1X.collector::a::fill [d_tmem], a_desc, + // b_desc, idesc, [scale_A_tmem], [scale_B_tmem], enable_input_d; + * fn_ptr++ = reinterpret_cast( + static_cast(cuda::ptx::tcgen05_mma_block_scale_vec_1x_collector_a_fill));)); + NV_IF_TARGET( + NV_HAS_FEATURE_SM_101a, + ( + // tcgen05.mma.cta_group::1.kind::mxf8f6f4.block_scale.scale_vec::1X.collector::a::fill [d_tmem], a_desc, + // b_desc, idesc, [scale_A_tmem], [scale_B_tmem], enable_input_d; + * fn_ptr++ = reinterpret_cast( + static_cast(cuda::ptx::tcgen05_mma_block_scale_vec_1x_collector_a_fill)); + // tcgen05.mma.cta_group::2.kind::mxf8f6f4.block_scale.scale_vec::1X.collector::a::fill [d_tmem], a_desc, + // b_desc, idesc, [scale_A_tmem], [scale_B_tmem], enable_input_d; + * fn_ptr++ = reinterpret_cast( + static_cast(cuda::ptx::tcgen05_mma_block_scale_vec_1x_collector_a_fill));)); +#endif // __cccl_ptx_isa >= 860 + +#if __cccl_ptx_isa >= 860 + NV_IF_TARGET( + NV_HAS_FEATURE_SM_100a, + ( + // tcgen05.mma.cta_group::1.kind::mxf4.block_scale.scale_vec::2X.collector::a::fill [d_tmem], a_desc, b_desc, + // idesc, [scale_A_tmem], [scale_B_tmem], enable_input_d; + * fn_ptr++ = reinterpret_cast( + static_cast(cuda::ptx::tcgen05_mma_block_scale_vec_2x_collector_a_fill)); + // tcgen05.mma.cta_group::2.kind::mxf4.block_scale.scale_vec::2X.collector::a::fill [d_tmem], a_desc, + // b_desc, idesc, [scale_A_tmem], [scale_B_tmem], enable_input_d; + * fn_ptr++ = reinterpret_cast( + static_cast(cuda::ptx::tcgen05_mma_block_scale_vec_2x_collector_a_fill)); + // tcgen05.mma.cta_group::1.kind::mxf4nvf4.block_scale.scale_vec::2X.collector::a::fill [d_tmem], a_desc, + // b_desc, idesc, [scale_A_tmem], [scale_B_tmem], enable_input_d; + * fn_ptr++ = reinterpret_cast( + static_cast(cuda::ptx::tcgen05_mma_block_scale_vec_2x_collector_a_fill)); + // tcgen05.mma.cta_group::2.kind::mxf4nvf4.block_scale.scale_vec::2X.collector::a::fill [d_tmem], a_desc, + // b_desc, idesc, [scale_A_tmem], [scale_B_tmem], enable_input_d; + * fn_ptr++ = reinterpret_cast( + static_cast(cuda::ptx::tcgen05_mma_block_scale_vec_2x_collector_a_fill));)); + NV_IF_TARGET( + NV_HAS_FEATURE_SM_101a, + ( + // tcgen05.mma.cta_group::1.kind::mxf4.block_scale.scale_vec::2X.collector::a::fill [d_tmem], a_desc, b_desc, + // idesc, [scale_A_tmem], [scale_B_tmem], enable_input_d; + * fn_ptr++ = reinterpret_cast( + static_cast(cuda::ptx::tcgen05_mma_block_scale_vec_2x_collector_a_fill)); + // tcgen05.mma.cta_group::2.kind::mxf4.block_scale.scale_vec::2X.collector::a::fill [d_tmem], a_desc, + // b_desc, idesc, [scale_A_tmem], [scale_B_tmem], enable_input_d; + * fn_ptr++ = reinterpret_cast( + static_cast(cuda::ptx::tcgen05_mma_block_scale_vec_2x_collector_a_fill)); + // tcgen05.mma.cta_group::1.kind::mxf4nvf4.block_scale.scale_vec::2X.collector::a::fill [d_tmem], a_desc, + // b_desc, idesc, [scale_A_tmem], [scale_B_tmem], enable_input_d; + * fn_ptr++ = reinterpret_cast( + static_cast(cuda::ptx::tcgen05_mma_block_scale_vec_2x_collector_a_fill)); + // tcgen05.mma.cta_group::2.kind::mxf4nvf4.block_scale.scale_vec::2X.collector::a::fill [d_tmem], a_desc, + // b_desc, idesc, [scale_A_tmem], 
[scale_B_tmem], enable_input_d; + * fn_ptr++ = reinterpret_cast( + static_cast(cuda::ptx::tcgen05_mma_block_scale_vec_2x_collector_a_fill));)); +#endif // __cccl_ptx_isa >= 860 + +#if __cccl_ptx_isa >= 860 + NV_IF_TARGET( + NV_HAS_FEATURE_SM_100a, + ( + // tcgen05.mma.cta_group::1.kind::mxf4nvf4.block_scale.scale_vec::4X.collector::a::fill [d_tmem], a_desc, + // b_desc, idesc, [scale_A_tmem], [scale_B_tmem], enable_input_d; + * fn_ptr++ = reinterpret_cast( + static_cast(cuda::ptx::tcgen05_mma_block_scale_vec_4x_collector_a_fill)); + // tcgen05.mma.cta_group::2.kind::mxf4nvf4.block_scale.scale_vec::4X.collector::a::fill [d_tmem], a_desc, + // b_desc, idesc, [scale_A_tmem], [scale_B_tmem], enable_input_d; + * fn_ptr++ = reinterpret_cast( + static_cast(cuda::ptx::tcgen05_mma_block_scale_vec_4x_collector_a_fill));)); + NV_IF_TARGET( + NV_HAS_FEATURE_SM_101a, + ( + // tcgen05.mma.cta_group::1.kind::mxf4nvf4.block_scale.scale_vec::4X.collector::a::fill [d_tmem], a_desc, + // b_desc, idesc, [scale_A_tmem], [scale_B_tmem], enable_input_d; + * fn_ptr++ = reinterpret_cast( + static_cast(cuda::ptx::tcgen05_mma_block_scale_vec_4x_collector_a_fill)); + // tcgen05.mma.cta_group::2.kind::mxf4nvf4.block_scale.scale_vec::4X.collector::a::fill [d_tmem], a_desc, + // b_desc, idesc, [scale_A_tmem], [scale_B_tmem], enable_input_d; + * fn_ptr++ = reinterpret_cast( + static_cast(cuda::ptx::tcgen05_mma_block_scale_vec_4x_collector_a_fill));)); +#endif // __cccl_ptx_isa >= 860 + +#if __cccl_ptx_isa >= 860 + NV_IF_TARGET( + NV_HAS_FEATURE_SM_100a, + ( + // tcgen05.mma.cta_group::1.kind::mxf8f6f4.block_scale.scale_vec::1X.collector::a::fill [d_tmem], a_desc, + // b_desc, idesc, [scale_A_tmem], [scale_B_tmem], enable_input_d; + * fn_ptr++ = reinterpret_cast( + static_cast(cuda::ptx::tcgen05_mma_block_scale_vec_1x_tmem_a_collector_a_fill)); + // tcgen05.mma.cta_group::2.kind::mxf8f6f4.block_scale.scale_vec::1X.collector::a::fill [d_tmem], a_desc, + // b_desc, idesc, [scale_A_tmem], [scale_B_tmem], enable_input_d; + * fn_ptr++ = reinterpret_cast( + static_cast(cuda::ptx::tcgen05_mma_block_scale_vec_1x_tmem_a_collector_a_fill));)); + NV_IF_TARGET( + NV_HAS_FEATURE_SM_101a, + ( + // tcgen05.mma.cta_group::1.kind::mxf8f6f4.block_scale.scale_vec::1X.collector::a::fill [d_tmem], a_desc, + // b_desc, idesc, [scale_A_tmem], [scale_B_tmem], enable_input_d; + * fn_ptr++ = reinterpret_cast( + static_cast(cuda::ptx::tcgen05_mma_block_scale_vec_1x_tmem_a_collector_a_fill)); + // tcgen05.mma.cta_group::2.kind::mxf8f6f4.block_scale.scale_vec::1X.collector::a::fill [d_tmem], a_desc, + // b_desc, idesc, [scale_A_tmem], [scale_B_tmem], enable_input_d; + * fn_ptr++ = reinterpret_cast( + static_cast(cuda::ptx::tcgen05_mma_block_scale_vec_1x_tmem_a_collector_a_fill));)); +#endif // __cccl_ptx_isa >= 860 + +#if __cccl_ptx_isa >= 860 + NV_IF_TARGET( + NV_HAS_FEATURE_SM_100a, + ( + // tcgen05.mma.cta_group::1.kind::mxf4.block_scale.scale_vec::2X.collector::a::fill [d_tmem], a_desc, b_desc, + // idesc, [scale_A_tmem], [scale_B_tmem], enable_input_d; + * fn_ptr++ = reinterpret_cast( + static_cast(cuda::ptx::tcgen05_mma_block_scale_vec_2_tmem_a_collector_a_fill)); + // tcgen05.mma.cta_group::2.kind::mxf4.block_scale.scale_vec::2X.collector::a::fill [d_tmem], a_desc, + // b_desc, idesc, [scale_A_tmem], [scale_B_tmem], enable_input_d; + * fn_ptr++ = reinterpret_cast( + static_cast(cuda::ptx::tcgen05_mma_block_scale_vec_2_tmem_a_collector_a_fill)); + // tcgen05.mma.cta_group::1.kind::mxf4nvf4.block_scale.scale_vec::2X.collector::a::fill [d_tmem], 
a_desc, + // b_desc, idesc, [scale_A_tmem], [scale_B_tmem], enable_input_d; + * fn_ptr++ = reinterpret_cast( + static_cast(cuda::ptx::tcgen05_mma_block_scale_vec_2_tmem_a_collector_a_fill)); + // tcgen05.mma.cta_group::2.kind::mxf4nvf4.block_scale.scale_vec::2X.collector::a::fill [d_tmem], a_desc, + // b_desc, idesc, [scale_A_tmem], [scale_B_tmem], enable_input_d; + * fn_ptr++ = reinterpret_cast( + static_cast(cuda::ptx::tcgen05_mma_block_scale_vec_2_tmem_a_collector_a_fill));)); + NV_IF_TARGET( + NV_HAS_FEATURE_SM_101a, + ( + // tcgen05.mma.cta_group::1.kind::mxf4.block_scale.scale_vec::2X.collector::a::fill [d_tmem], a_desc, b_desc, + // idesc, [scale_A_tmem], [scale_B_tmem], enable_input_d; + * fn_ptr++ = reinterpret_cast( + static_cast(cuda::ptx::tcgen05_mma_block_scale_vec_2_tmem_a_collector_a_fill)); + // tcgen05.mma.cta_group::2.kind::mxf4.block_scale.scale_vec::2X.collector::a::fill [d_tmem], a_desc, + // b_desc, idesc, [scale_A_tmem], [scale_B_tmem], enable_input_d; + * fn_ptr++ = reinterpret_cast( + static_cast(cuda::ptx::tcgen05_mma_block_scale_vec_2_tmem_a_collector_a_fill)); + // tcgen05.mma.cta_group::1.kind::mxf4nvf4.block_scale.scale_vec::2X.collector::a::fill [d_tmem], a_desc, + // b_desc, idesc, [scale_A_tmem], [scale_B_tmem], enable_input_d; + * fn_ptr++ = reinterpret_cast( + static_cast(cuda::ptx::tcgen05_mma_block_scale_vec_2_tmem_a_collector_a_fill)); + // tcgen05.mma.cta_group::2.kind::mxf4nvf4.block_scale.scale_vec::2X.collector::a::fill [d_tmem], a_desc, + // b_desc, idesc, [scale_A_tmem], [scale_B_tmem], enable_input_d; + * fn_ptr++ = reinterpret_cast( + static_cast(cuda::ptx::tcgen05_mma_block_scale_vec_2_tmem_a_collector_a_fill));)); +#endif // __cccl_ptx_isa >= 860 + +#if __cccl_ptx_isa >= 860 + NV_IF_TARGET( + NV_HAS_FEATURE_SM_100a, + ( + // tcgen05.mma.cta_group::1.kind::mxf4nvf4.block_scale.scale_vec::4X.collector::a::fill [d_tmem], a_desc, + // b_desc, idesc, [scale_A_tmem], [scale_B_tmem], enable_input_d; + * fn_ptr++ = reinterpret_cast( + static_cast(cuda::ptx::tcgen05_mma_block_scale_vec_4x_tmem_a_collector_a_fill)); + // tcgen05.mma.cta_group::2.kind::mxf4nvf4.block_scale.scale_vec::4X.collector::a::fill [d_tmem], a_desc, + // b_desc, idesc, [scale_A_tmem], [scale_B_tmem], enable_input_d; + * fn_ptr++ = reinterpret_cast( + static_cast(cuda::ptx::tcgen05_mma_block_scale_vec_4x_tmem_a_collector_a_fill));)); + NV_IF_TARGET( + NV_HAS_FEATURE_SM_101a, + ( + // tcgen05.mma.cta_group::1.kind::mxf4nvf4.block_scale.scale_vec::4X.collector::a::fill [d_tmem], a_desc, + // b_desc, idesc, [scale_A_tmem], [scale_B_tmem], enable_input_d; + * fn_ptr++ = reinterpret_cast( + static_cast(cuda::ptx::tcgen05_mma_block_scale_vec_4x_tmem_a_collector_a_fill)); + // tcgen05.mma.cta_group::2.kind::mxf4nvf4.block_scale.scale_vec::4X.collector::a::fill [d_tmem], a_desc, + // b_desc, idesc, [scale_A_tmem], [scale_B_tmem], enable_input_d; + * fn_ptr++ = reinterpret_cast( + static_cast(cuda::ptx::tcgen05_mma_block_scale_vec_4x_tmem_a_collector_a_fill));)); +#endif // __cccl_ptx_isa >= 860 + +#if __cccl_ptx_isa >= 860 + NV_IF_TARGET( + NV_HAS_FEATURE_SM_100a, + ( + // tcgen05.mma.cta_group::1.kind::mxf8f6f4.block_scale.scale_vec::1X.collector::a::use [d_tmem], a_desc, b_desc, + // idesc, [scale_A_tmem], [scale_B_tmem], enable_input_d; + * fn_ptr++ = reinterpret_cast( + static_cast(cuda::ptx::tcgen05_mma_block_scale_vec_1x_collector_a_use)); + // tcgen05.mma.cta_group::2.kind::mxf8f6f4.block_scale.scale_vec::1X.collector::a::use [d_tmem], a_desc, + // b_desc, idesc, [scale_A_tmem], 
[scale_B_tmem], enable_input_d; + * fn_ptr++ = reinterpret_cast( + static_cast(cuda::ptx::tcgen05_mma_block_scale_vec_1x_collector_a_use));)); + NV_IF_TARGET( + NV_HAS_FEATURE_SM_101a, + ( + // tcgen05.mma.cta_group::1.kind::mxf8f6f4.block_scale.scale_vec::1X.collector::a::use [d_tmem], a_desc, b_desc, + // idesc, [scale_A_tmem], [scale_B_tmem], enable_input_d; + * fn_ptr++ = reinterpret_cast( + static_cast(cuda::ptx::tcgen05_mma_block_scale_vec_1x_collector_a_use)); + // tcgen05.mma.cta_group::2.kind::mxf8f6f4.block_scale.scale_vec::1X.collector::a::use [d_tmem], a_desc, + // b_desc, idesc, [scale_A_tmem], [scale_B_tmem], enable_input_d; + * fn_ptr++ = reinterpret_cast( + static_cast(cuda::ptx::tcgen05_mma_block_scale_vec_1x_collector_a_use));)); +#endif // __cccl_ptx_isa >= 860 + +#if __cccl_ptx_isa >= 860 + NV_IF_TARGET( + NV_HAS_FEATURE_SM_100a, + ( + // tcgen05.mma.cta_group::1.kind::mxf4.block_scale.scale_vec::2X.collector::a::use [d_tmem], a_desc, b_desc, + // idesc, [scale_A_tmem], [scale_B_tmem], enable_input_d; + * fn_ptr++ = reinterpret_cast( + static_cast(cuda::ptx::tcgen05_mma_block_scale_vec_2x_collector_a_use)); + // tcgen05.mma.cta_group::2.kind::mxf4.block_scale.scale_vec::2X.collector::a::use [d_tmem], a_desc, b_desc, + // idesc, [scale_A_tmem], [scale_B_tmem], enable_input_d; + * fn_ptr++ = reinterpret_cast( + static_cast(cuda::ptx::tcgen05_mma_block_scale_vec_2x_collector_a_use)); + // tcgen05.mma.cta_group::1.kind::mxf4nvf4.block_scale.scale_vec::2X.collector::a::use [d_tmem], a_desc, + // b_desc, idesc, [scale_A_tmem], [scale_B_tmem], enable_input_d; + * fn_ptr++ = reinterpret_cast( + static_cast(cuda::ptx::tcgen05_mma_block_scale_vec_2x_collector_a_use)); + // tcgen05.mma.cta_group::2.kind::mxf4nvf4.block_scale.scale_vec::2X.collector::a::use [d_tmem], a_desc, + // b_desc, idesc, [scale_A_tmem], [scale_B_tmem], enable_input_d; + * fn_ptr++ = reinterpret_cast( + static_cast(cuda::ptx::tcgen05_mma_block_scale_vec_2x_collector_a_use));)); + NV_IF_TARGET( + NV_HAS_FEATURE_SM_101a, + ( + // tcgen05.mma.cta_group::1.kind::mxf4.block_scale.scale_vec::2X.collector::a::use [d_tmem], a_desc, b_desc, + // idesc, [scale_A_tmem], [scale_B_tmem], enable_input_d; + * fn_ptr++ = reinterpret_cast( + static_cast(cuda::ptx::tcgen05_mma_block_scale_vec_2x_collector_a_use)); + // tcgen05.mma.cta_group::2.kind::mxf4.block_scale.scale_vec::2X.collector::a::use [d_tmem], a_desc, b_desc, + // idesc, [scale_A_tmem], [scale_B_tmem], enable_input_d; + * fn_ptr++ = reinterpret_cast( + static_cast(cuda::ptx::tcgen05_mma_block_scale_vec_2x_collector_a_use)); + // tcgen05.mma.cta_group::1.kind::mxf4nvf4.block_scale.scale_vec::2X.collector::a::use [d_tmem], a_desc, + // b_desc, idesc, [scale_A_tmem], [scale_B_tmem], enable_input_d; + * fn_ptr++ = reinterpret_cast( + static_cast(cuda::ptx::tcgen05_mma_block_scale_vec_2x_collector_a_use)); + // tcgen05.mma.cta_group::2.kind::mxf4nvf4.block_scale.scale_vec::2X.collector::a::use [d_tmem], a_desc, + // b_desc, idesc, [scale_A_tmem], [scale_B_tmem], enable_input_d; + * fn_ptr++ = reinterpret_cast( + static_cast(cuda::ptx::tcgen05_mma_block_scale_vec_2x_collector_a_use));)); +#endif // __cccl_ptx_isa >= 860 + +#if __cccl_ptx_isa >= 860 + NV_IF_TARGET( + NV_HAS_FEATURE_SM_100a, + ( + // tcgen05.mma.cta_group::1.kind::mxf4nvf4.block_scale.scale_vec::4X.collector::a::use [d_tmem], a_desc, b_desc, + // idesc, [scale_A_tmem], [scale_B_tmem], enable_input_d; + * fn_ptr++ = reinterpret_cast( + 
static_cast(cuda::ptx::tcgen05_mma_block_scale_vec_4x_collector_a_use)); + // tcgen05.mma.cta_group::2.kind::mxf4nvf4.block_scale.scale_vec::4X.collector::a::use [d_tmem], a_desc, + // b_desc, idesc, [scale_A_tmem], [scale_B_tmem], enable_input_d; + * fn_ptr++ = reinterpret_cast( + static_cast(cuda::ptx::tcgen05_mma_block_scale_vec_4x_collector_a_use));)); + NV_IF_TARGET( + NV_HAS_FEATURE_SM_101a, + ( + // tcgen05.mma.cta_group::1.kind::mxf4nvf4.block_scale.scale_vec::4X.collector::a::use [d_tmem], a_desc, b_desc, + // idesc, [scale_A_tmem], [scale_B_tmem], enable_input_d; + * fn_ptr++ = reinterpret_cast( + static_cast(cuda::ptx::tcgen05_mma_block_scale_vec_4x_collector_a_use)); + // tcgen05.mma.cta_group::2.kind::mxf4nvf4.block_scale.scale_vec::4X.collector::a::use [d_tmem], a_desc, + // b_desc, idesc, [scale_A_tmem], [scale_B_tmem], enable_input_d; + * fn_ptr++ = reinterpret_cast( + static_cast(cuda::ptx::tcgen05_mma_block_scale_vec_4x_collector_a_use));)); +#endif // __cccl_ptx_isa >= 860 + +#if __cccl_ptx_isa >= 860 + NV_IF_TARGET( + NV_HAS_FEATURE_SM_100a, + ( + // tcgen05.mma.cta_group::1.kind::mxf8f6f4.block_scale.scale_vec::1X.collector::a::use [d_tmem], a_desc, b_desc, + // idesc, [scale_A_tmem], [scale_B_tmem], enable_input_d; + * fn_ptr++ = reinterpret_cast( + static_cast(cuda::ptx::tcgen05_mma_block_scale_vec_1x_tmem_a_collector_a_use)); + // tcgen05.mma.cta_group::2.kind::mxf8f6f4.block_scale.scale_vec::1X.collector::a::use [d_tmem], a_desc, + // b_desc, idesc, [scale_A_tmem], [scale_B_tmem], enable_input_d; + * fn_ptr++ = reinterpret_cast( + static_cast(cuda::ptx::tcgen05_mma_block_scale_vec_1x_tmem_a_collector_a_use));)); + NV_IF_TARGET( + NV_HAS_FEATURE_SM_101a, + ( + // tcgen05.mma.cta_group::1.kind::mxf8f6f4.block_scale.scale_vec::1X.collector::a::use [d_tmem], a_desc, b_desc, + // idesc, [scale_A_tmem], [scale_B_tmem], enable_input_d; + * fn_ptr++ = reinterpret_cast( + static_cast(cuda::ptx::tcgen05_mma_block_scale_vec_1x_tmem_a_collector_a_use)); + // tcgen05.mma.cta_group::2.kind::mxf8f6f4.block_scale.scale_vec::1X.collector::a::use [d_tmem], a_desc, + // b_desc, idesc, [scale_A_tmem], [scale_B_tmem], enable_input_d; + * fn_ptr++ = reinterpret_cast( + static_cast(cuda::ptx::tcgen05_mma_block_scale_vec_1x_tmem_a_collector_a_use));)); +#endif // __cccl_ptx_isa >= 860 + +#if __cccl_ptx_isa >= 860 + NV_IF_TARGET( + NV_HAS_FEATURE_SM_100a, + ( + // tcgen05.mma.cta_group::1.kind::mxf4.block_scale.scale_vec::2X.collector::a::use [d_tmem], a_desc, b_desc, + // idesc, [scale_A_tmem], [scale_B_tmem], enable_input_d; + * fn_ptr++ = reinterpret_cast( + static_cast(cuda::ptx::tcgen05_mma_block_scale_vec_2_tmem_a_collector_a_use)); + // tcgen05.mma.cta_group::2.kind::mxf4.block_scale.scale_vec::2X.collector::a::use [d_tmem], a_desc, b_desc, + // idesc, [scale_A_tmem], [scale_B_tmem], enable_input_d; + * fn_ptr++ = reinterpret_cast( + static_cast(cuda::ptx::tcgen05_mma_block_scale_vec_2_tmem_a_collector_a_use)); + // tcgen05.mma.cta_group::1.kind::mxf4nvf4.block_scale.scale_vec::2X.collector::a::use [d_tmem], a_desc, + // b_desc, idesc, [scale_A_tmem], [scale_B_tmem], enable_input_d; + * fn_ptr++ = reinterpret_cast( + static_cast(cuda::ptx::tcgen05_mma_block_scale_vec_2_tmem_a_collector_a_use)); + // tcgen05.mma.cta_group::2.kind::mxf4nvf4.block_scale.scale_vec::2X.collector::a::use [d_tmem], a_desc, + // b_desc, idesc, [scale_A_tmem], [scale_B_tmem], enable_input_d; + * fn_ptr++ = reinterpret_cast( + static_cast(cuda::ptx::tcgen05_mma_block_scale_vec_2_tmem_a_collector_a_use));)); + 
NV_IF_TARGET( + NV_HAS_FEATURE_SM_101a, + ( + // tcgen05.mma.cta_group::1.kind::mxf4.block_scale.scale_vec::2X.collector::a::use [d_tmem], a_desc, b_desc, + // idesc, [scale_A_tmem], [scale_B_tmem], enable_input_d; + * fn_ptr++ = reinterpret_cast( + static_cast(cuda::ptx::tcgen05_mma_block_scale_vec_2_tmem_a_collector_a_use)); + // tcgen05.mma.cta_group::2.kind::mxf4.block_scale.scale_vec::2X.collector::a::use [d_tmem], a_desc, b_desc, + // idesc, [scale_A_tmem], [scale_B_tmem], enable_input_d; + * fn_ptr++ = reinterpret_cast( + static_cast(cuda::ptx::tcgen05_mma_block_scale_vec_2_tmem_a_collector_a_use)); + // tcgen05.mma.cta_group::1.kind::mxf4nvf4.block_scale.scale_vec::2X.collector::a::use [d_tmem], a_desc, + // b_desc, idesc, [scale_A_tmem], [scale_B_tmem], enable_input_d; + * fn_ptr++ = reinterpret_cast( + static_cast(cuda::ptx::tcgen05_mma_block_scale_vec_2_tmem_a_collector_a_use)); + // tcgen05.mma.cta_group::2.kind::mxf4nvf4.block_scale.scale_vec::2X.collector::a::use [d_tmem], a_desc, + // b_desc, idesc, [scale_A_tmem], [scale_B_tmem], enable_input_d; + * fn_ptr++ = reinterpret_cast( + static_cast(cuda::ptx::tcgen05_mma_block_scale_vec_2_tmem_a_collector_a_use));)); +#endif // __cccl_ptx_isa >= 860 + +#if __cccl_ptx_isa >= 860 + NV_IF_TARGET( + NV_HAS_FEATURE_SM_100a, + ( + // tcgen05.mma.cta_group::1.kind::mxf4nvf4.block_scale.scale_vec::4X.collector::a::use [d_tmem], a_desc, b_desc, + // idesc, [scale_A_tmem], [scale_B_tmem], enable_input_d; + * fn_ptr++ = reinterpret_cast( + static_cast(cuda::ptx::tcgen05_mma_block_scale_vec_4x_tmem_a_collector_a_use)); + // tcgen05.mma.cta_group::2.kind::mxf4nvf4.block_scale.scale_vec::4X.collector::a::use [d_tmem], a_desc, + // b_desc, idesc, [scale_A_tmem], [scale_B_tmem], enable_input_d; + * fn_ptr++ = reinterpret_cast( + static_cast(cuda::ptx::tcgen05_mma_block_scale_vec_4x_tmem_a_collector_a_use));)); + NV_IF_TARGET( + NV_HAS_FEATURE_SM_101a, + ( + // tcgen05.mma.cta_group::1.kind::mxf4nvf4.block_scale.scale_vec::4X.collector::a::use [d_tmem], a_desc, b_desc, + // idesc, [scale_A_tmem], [scale_B_tmem], enable_input_d; + * fn_ptr++ = reinterpret_cast( + static_cast(cuda::ptx::tcgen05_mma_block_scale_vec_4x_tmem_a_collector_a_use)); + // tcgen05.mma.cta_group::2.kind::mxf4nvf4.block_scale.scale_vec::4X.collector::a::use [d_tmem], a_desc, + // b_desc, idesc, [scale_A_tmem], [scale_B_tmem], enable_input_d; + * fn_ptr++ = reinterpret_cast( + static_cast(cuda::ptx::tcgen05_mma_block_scale_vec_4x_tmem_a_collector_a_use));)); +#endif // __cccl_ptx_isa >= 860 + +#if __cccl_ptx_isa >= 860 + NV_IF_TARGET( + NV_HAS_FEATURE_SM_100a, + ( + // tcgen05.mma.cta_group::1.kind::mxf8f6f4.block_scale.scale_vec::1X.collector::a::lastuse [d_tmem], a_desc, + // b_desc, idesc, [scale_A_tmem], [scale_B_tmem], enable_input_d; + * fn_ptr++ = reinterpret_cast( + static_cast(cuda::ptx::tcgen05_mma_block_scale_vec_1x_collector_a_lastuse)); + // tcgen05.mma.cta_group::2.kind::mxf8f6f4.block_scale.scale_vec::1X.collector::a::lastuse [d_tmem], a_desc, + // b_desc, idesc, [scale_A_tmem], [scale_B_tmem], enable_input_d; + * fn_ptr++ = reinterpret_cast( + static_cast(cuda::ptx::tcgen05_mma_block_scale_vec_1x_collector_a_lastuse));)); + NV_IF_TARGET( + NV_HAS_FEATURE_SM_101a, + ( + // tcgen05.mma.cta_group::1.kind::mxf8f6f4.block_scale.scale_vec::1X.collector::a::lastuse [d_tmem], a_desc, + // b_desc, idesc, [scale_A_tmem], [scale_B_tmem], enable_input_d; + * fn_ptr++ = reinterpret_cast( + static_cast(cuda::ptx::tcgen05_mma_block_scale_vec_1x_collector_a_lastuse)); + // 
tcgen05.mma.cta_group::2.kind::mxf8f6f4.block_scale.scale_vec::1X.collector::a::lastuse [d_tmem], a_desc, + // b_desc, idesc, [scale_A_tmem], [scale_B_tmem], enable_input_d; + * fn_ptr++ = reinterpret_cast( + static_cast(cuda::ptx::tcgen05_mma_block_scale_vec_1x_collector_a_lastuse));)); +#endif // __cccl_ptx_isa >= 860 + +#if __cccl_ptx_isa >= 860 + NV_IF_TARGET( + NV_HAS_FEATURE_SM_100a, + ( + // tcgen05.mma.cta_group::1.kind::mxf4.block_scale.scale_vec::2X.collector::a::lastuse [d_tmem], a_desc, b_desc, + // idesc, [scale_A_tmem], [scale_B_tmem], enable_input_d; + * fn_ptr++ = reinterpret_cast( + static_cast(cuda::ptx::tcgen05_mma_block_scale_vec_2x_collector_a_lastuse)); + // tcgen05.mma.cta_group::2.kind::mxf4.block_scale.scale_vec::2X.collector::a::lastuse [d_tmem], a_desc, + // b_desc, idesc, [scale_A_tmem], [scale_B_tmem], enable_input_d; + * fn_ptr++ = reinterpret_cast( + static_cast(cuda::ptx::tcgen05_mma_block_scale_vec_2x_collector_a_lastuse)); + // tcgen05.mma.cta_group::1.kind::mxf4nvf4.block_scale.scale_vec::2X.collector::a::lastuse [d_tmem], a_desc, + // b_desc, idesc, [scale_A_tmem], [scale_B_tmem], enable_input_d; + * fn_ptr++ = reinterpret_cast( + static_cast(cuda::ptx::tcgen05_mma_block_scale_vec_2x_collector_a_lastuse)); + // tcgen05.mma.cta_group::2.kind::mxf4nvf4.block_scale.scale_vec::2X.collector::a::lastuse [d_tmem], a_desc, + // b_desc, idesc, [scale_A_tmem], [scale_B_tmem], enable_input_d; + * fn_ptr++ = reinterpret_cast( + static_cast(cuda::ptx::tcgen05_mma_block_scale_vec_2x_collector_a_lastuse));)); + NV_IF_TARGET( + NV_HAS_FEATURE_SM_101a, + ( + // tcgen05.mma.cta_group::1.kind::mxf4.block_scale.scale_vec::2X.collector::a::lastuse [d_tmem], a_desc, b_desc, + // idesc, [scale_A_tmem], [scale_B_tmem], enable_input_d; + * fn_ptr++ = reinterpret_cast( + static_cast(cuda::ptx::tcgen05_mma_block_scale_vec_2x_collector_a_lastuse)); + // tcgen05.mma.cta_group::2.kind::mxf4.block_scale.scale_vec::2X.collector::a::lastuse [d_tmem], a_desc, + // b_desc, idesc, [scale_A_tmem], [scale_B_tmem], enable_input_d; + * fn_ptr++ = reinterpret_cast( + static_cast(cuda::ptx::tcgen05_mma_block_scale_vec_2x_collector_a_lastuse)); + // tcgen05.mma.cta_group::1.kind::mxf4nvf4.block_scale.scale_vec::2X.collector::a::lastuse [d_tmem], a_desc, + // b_desc, idesc, [scale_A_tmem], [scale_B_tmem], enable_input_d; + * fn_ptr++ = reinterpret_cast( + static_cast(cuda::ptx::tcgen05_mma_block_scale_vec_2x_collector_a_lastuse)); + // tcgen05.mma.cta_group::2.kind::mxf4nvf4.block_scale.scale_vec::2X.collector::a::lastuse [d_tmem], a_desc, + // b_desc, idesc, [scale_A_tmem], [scale_B_tmem], enable_input_d; + * fn_ptr++ = reinterpret_cast( + static_cast(cuda::ptx::tcgen05_mma_block_scale_vec_2x_collector_a_lastuse));)); +#endif // __cccl_ptx_isa >= 860 + +#if __cccl_ptx_isa >= 860 + NV_IF_TARGET( + NV_HAS_FEATURE_SM_100a, + ( + // tcgen05.mma.cta_group::1.kind::mxf4nvf4.block_scale.scale_vec::4X.collector::a::lastuse [d_tmem], a_desc, + // b_desc, idesc, [scale_A_tmem], [scale_B_tmem], enable_input_d; + * fn_ptr++ = reinterpret_cast( + static_cast(cuda::ptx::tcgen05_mma_block_scale_vec_4x_collector_a_lastuse)); + // tcgen05.mma.cta_group::2.kind::mxf4nvf4.block_scale.scale_vec::4X.collector::a::lastuse [d_tmem], a_desc, + // b_desc, idesc, [scale_A_tmem], [scale_B_tmem], enable_input_d; + * fn_ptr++ = reinterpret_cast( + static_cast(cuda::ptx::tcgen05_mma_block_scale_vec_4x_collector_a_lastuse));)); + NV_IF_TARGET( + NV_HAS_FEATURE_SM_101a, + ( + // 
tcgen05.mma.cta_group::1.kind::mxf4nvf4.block_scale.scale_vec::4X.collector::a::lastuse [d_tmem], a_desc, + // b_desc, idesc, [scale_A_tmem], [scale_B_tmem], enable_input_d; + * fn_ptr++ = reinterpret_cast( + static_cast(cuda::ptx::tcgen05_mma_block_scale_vec_4x_collector_a_lastuse)); + // tcgen05.mma.cta_group::2.kind::mxf4nvf4.block_scale.scale_vec::4X.collector::a::lastuse [d_tmem], a_desc, + // b_desc, idesc, [scale_A_tmem], [scale_B_tmem], enable_input_d; + * fn_ptr++ = reinterpret_cast( + static_cast(cuda::ptx::tcgen05_mma_block_scale_vec_4x_collector_a_lastuse));)); +#endif // __cccl_ptx_isa >= 860 + +#if __cccl_ptx_isa >= 860 + NV_IF_TARGET( + NV_HAS_FEATURE_SM_100a, + ( + // tcgen05.mma.cta_group::1.kind::mxf8f6f4.block_scale.scale_vec::1X.collector::a::lastuse [d_tmem], a_desc, + // b_desc, idesc, [scale_A_tmem], [scale_B_tmem], enable_input_d; + * fn_ptr++ = reinterpret_cast( + static_cast(cuda::ptx::tcgen05_mma_block_scale_vec_1x_tmem_a_collector_a_lastuse)); + // tcgen05.mma.cta_group::2.kind::mxf8f6f4.block_scale.scale_vec::1X.collector::a::lastuse [d_tmem], a_desc, + // b_desc, idesc, [scale_A_tmem], [scale_B_tmem], enable_input_d; + * fn_ptr++ = reinterpret_cast( + static_cast(cuda::ptx::tcgen05_mma_block_scale_vec_1x_tmem_a_collector_a_lastuse));)); + NV_IF_TARGET( + NV_HAS_FEATURE_SM_101a, + ( + // tcgen05.mma.cta_group::1.kind::mxf8f6f4.block_scale.scale_vec::1X.collector::a::lastuse [d_tmem], a_desc, + // b_desc, idesc, [scale_A_tmem], [scale_B_tmem], enable_input_d; + * fn_ptr++ = reinterpret_cast( + static_cast(cuda::ptx::tcgen05_mma_block_scale_vec_1x_tmem_a_collector_a_lastuse)); + // tcgen05.mma.cta_group::2.kind::mxf8f6f4.block_scale.scale_vec::1X.collector::a::lastuse [d_tmem], a_desc, + // b_desc, idesc, [scale_A_tmem], [scale_B_tmem], enable_input_d; + * fn_ptr++ = reinterpret_cast( + static_cast(cuda::ptx::tcgen05_mma_block_scale_vec_1x_tmem_a_collector_a_lastuse));)); +#endif // __cccl_ptx_isa >= 860 + +#if __cccl_ptx_isa >= 860 + NV_IF_TARGET( + NV_HAS_FEATURE_SM_100a, + ( + // tcgen05.mma.cta_group::1.kind::mxf4.block_scale.scale_vec::2X.collector::a::lastuse [d_tmem], a_desc, b_desc, + // idesc, [scale_A_tmem], [scale_B_tmem], enable_input_d; + * fn_ptr++ = reinterpret_cast( + static_cast(cuda::ptx::tcgen05_mma_block_scale_vec_2_tmem_a_collector_a_lastuse)); + // tcgen05.mma.cta_group::2.kind::mxf4.block_scale.scale_vec::2X.collector::a::lastuse [d_tmem], a_desc, + // b_desc, idesc, [scale_A_tmem], [scale_B_tmem], enable_input_d; + * fn_ptr++ = reinterpret_cast( + static_cast(cuda::ptx::tcgen05_mma_block_scale_vec_2_tmem_a_collector_a_lastuse)); + // tcgen05.mma.cta_group::1.kind::mxf4nvf4.block_scale.scale_vec::2X.collector::a::lastuse [d_tmem], a_desc, + // b_desc, idesc, [scale_A_tmem], [scale_B_tmem], enable_input_d; + * fn_ptr++ = reinterpret_cast( + static_cast(cuda::ptx::tcgen05_mma_block_scale_vec_2_tmem_a_collector_a_lastuse)); + // tcgen05.mma.cta_group::2.kind::mxf4nvf4.block_scale.scale_vec::2X.collector::a::lastuse [d_tmem], a_desc, + // b_desc, idesc, [scale_A_tmem], [scale_B_tmem], enable_input_d; + * fn_ptr++ = reinterpret_cast( + static_cast(cuda::ptx::tcgen05_mma_block_scale_vec_2_tmem_a_collector_a_lastuse));)); + NV_IF_TARGET( + NV_HAS_FEATURE_SM_101a, + ( + // tcgen05.mma.cta_group::1.kind::mxf4.block_scale.scale_vec::2X.collector::a::lastuse [d_tmem], a_desc, b_desc, + // idesc, [scale_A_tmem], [scale_B_tmem], enable_input_d; + * fn_ptr++ = reinterpret_cast( + 
static_cast(cuda::ptx::tcgen05_mma_block_scale_vec_2_tmem_a_collector_a_lastuse)); + // tcgen05.mma.cta_group::2.kind::mxf4.block_scale.scale_vec::2X.collector::a::lastuse [d_tmem], a_desc, + // b_desc, idesc, [scale_A_tmem], [scale_B_tmem], enable_input_d; + * fn_ptr++ = reinterpret_cast( + static_cast(cuda::ptx::tcgen05_mma_block_scale_vec_2_tmem_a_collector_a_lastuse)); + // tcgen05.mma.cta_group::1.kind::mxf4nvf4.block_scale.scale_vec::2X.collector::a::lastuse [d_tmem], a_desc, + // b_desc, idesc, [scale_A_tmem], [scale_B_tmem], enable_input_d; + * fn_ptr++ = reinterpret_cast( + static_cast(cuda::ptx::tcgen05_mma_block_scale_vec_2_tmem_a_collector_a_lastuse)); + // tcgen05.mma.cta_group::2.kind::mxf4nvf4.block_scale.scale_vec::2X.collector::a::lastuse [d_tmem], a_desc, + // b_desc, idesc, [scale_A_tmem], [scale_B_tmem], enable_input_d; + * fn_ptr++ = reinterpret_cast( + static_cast(cuda::ptx::tcgen05_mma_block_scale_vec_2_tmem_a_collector_a_lastuse));)); +#endif // __cccl_ptx_isa >= 860 + +#if __cccl_ptx_isa >= 860 + NV_IF_TARGET( + NV_HAS_FEATURE_SM_100a, + ( + // tcgen05.mma.cta_group::1.kind::mxf4nvf4.block_scale.scale_vec::4X.collector::a::lastuse [d_tmem], a_desc, + // b_desc, idesc, [scale_A_tmem], [scale_B_tmem], enable_input_d; + * fn_ptr++ = reinterpret_cast( + static_cast(cuda::ptx::tcgen05_mma_block_scale_vec_4x_tmem_a_collector_a_lastuse)); + // tcgen05.mma.cta_group::2.kind::mxf4nvf4.block_scale.scale_vec::4X.collector::a::lastuse [d_tmem], a_desc, + // b_desc, idesc, [scale_A_tmem], [scale_B_tmem], enable_input_d; + * fn_ptr++ = reinterpret_cast( + static_cast(cuda::ptx::tcgen05_mma_block_scale_vec_4x_tmem_a_collector_a_lastuse));)); + NV_IF_TARGET( + NV_HAS_FEATURE_SM_101a, + ( + // tcgen05.mma.cta_group::1.kind::mxf4nvf4.block_scale.scale_vec::4X.collector::a::lastuse [d_tmem], a_desc, + // b_desc, idesc, [scale_A_tmem], [scale_B_tmem], enable_input_d; + * fn_ptr++ = reinterpret_cast( + static_cast(cuda::ptx::tcgen05_mma_block_scale_vec_4x_tmem_a_collector_a_lastuse)); + // tcgen05.mma.cta_group::2.kind::mxf4nvf4.block_scale.scale_vec::4X.collector::a::lastuse [d_tmem], a_desc, + // b_desc, idesc, [scale_A_tmem], [scale_B_tmem], enable_input_d; + * fn_ptr++ = reinterpret_cast( + static_cast(cuda::ptx::tcgen05_mma_block_scale_vec_4x_tmem_a_collector_a_lastuse));)); +#endif // __cccl_ptx_isa >= 860 + +#if __cccl_ptx_isa >= 860 + NV_IF_TARGET( + NV_HAS_FEATURE_SM_100a, + ( + // tcgen05.mma.cta_group::1.kind::mxf8f6f4.block_scale.scale_vec::1X.collector::a::discard [d_tmem], a_desc, + // b_desc, idesc, [scale_A_tmem], [scale_B_tmem], enable_input_d; + * fn_ptr++ = reinterpret_cast( + static_cast(cuda::ptx::tcgen05_mma_block_scale_vec_1x_collector_a_discard)); + // tcgen05.mma.cta_group::2.kind::mxf8f6f4.block_scale.scale_vec::1X.collector::a::discard [d_tmem], a_desc, + // b_desc, idesc, [scale_A_tmem], [scale_B_tmem], enable_input_d; + * fn_ptr++ = reinterpret_cast( + static_cast(cuda::ptx::tcgen05_mma_block_scale_vec_1x_collector_a_discard));)); + NV_IF_TARGET( + NV_HAS_FEATURE_SM_101a, + ( + // tcgen05.mma.cta_group::1.kind::mxf8f6f4.block_scale.scale_vec::1X.collector::a::discard [d_tmem], a_desc, + // b_desc, idesc, [scale_A_tmem], [scale_B_tmem], enable_input_d; + * fn_ptr++ = reinterpret_cast( + static_cast(cuda::ptx::tcgen05_mma_block_scale_vec_1x_collector_a_discard)); + // tcgen05.mma.cta_group::2.kind::mxf8f6f4.block_scale.scale_vec::1X.collector::a::discard [d_tmem], a_desc, + // b_desc, idesc, [scale_A_tmem], [scale_B_tmem], enable_input_d; + * fn_ptr++ = 
reinterpret_cast( + static_cast(cuda::ptx::tcgen05_mma_block_scale_vec_1x_collector_a_discard));)); +#endif // __cccl_ptx_isa >= 860 + +#if __cccl_ptx_isa >= 860 + NV_IF_TARGET( + NV_HAS_FEATURE_SM_100a, + ( + // tcgen05.mma.cta_group::1.kind::mxf4.block_scale.scale_vec::2X.collector::a::discard [d_tmem], a_desc, b_desc, + // idesc, [scale_A_tmem], [scale_B_tmem], enable_input_d; + * fn_ptr++ = reinterpret_cast( + static_cast(cuda::ptx::tcgen05_mma_block_scale_vec_2x_collector_a_discard)); + // tcgen05.mma.cta_group::2.kind::mxf4.block_scale.scale_vec::2X.collector::a::discard [d_tmem], a_desc, + // b_desc, idesc, [scale_A_tmem], [scale_B_tmem], enable_input_d; + * fn_ptr++ = reinterpret_cast( + static_cast(cuda::ptx::tcgen05_mma_block_scale_vec_2x_collector_a_discard)); + // tcgen05.mma.cta_group::1.kind::mxf4nvf4.block_scale.scale_vec::2X.collector::a::discard [d_tmem], a_desc, + // b_desc, idesc, [scale_A_tmem], [scale_B_tmem], enable_input_d; + * fn_ptr++ = reinterpret_cast( + static_cast(cuda::ptx::tcgen05_mma_block_scale_vec_2x_collector_a_discard)); + // tcgen05.mma.cta_group::2.kind::mxf4nvf4.block_scale.scale_vec::2X.collector::a::discard [d_tmem], a_desc, + // b_desc, idesc, [scale_A_tmem], [scale_B_tmem], enable_input_d; + * fn_ptr++ = reinterpret_cast( + static_cast(cuda::ptx::tcgen05_mma_block_scale_vec_2x_collector_a_discard));)); + NV_IF_TARGET( + NV_HAS_FEATURE_SM_101a, + ( + // tcgen05.mma.cta_group::1.kind::mxf4.block_scale.scale_vec::2X.collector::a::discard [d_tmem], a_desc, b_desc, + // idesc, [scale_A_tmem], [scale_B_tmem], enable_input_d; + * fn_ptr++ = reinterpret_cast( + static_cast(cuda::ptx::tcgen05_mma_block_scale_vec_2x_collector_a_discard)); + // tcgen05.mma.cta_group::2.kind::mxf4.block_scale.scale_vec::2X.collector::a::discard [d_tmem], a_desc, + // b_desc, idesc, [scale_A_tmem], [scale_B_tmem], enable_input_d; + * fn_ptr++ = reinterpret_cast( + static_cast(cuda::ptx::tcgen05_mma_block_scale_vec_2x_collector_a_discard)); + // tcgen05.mma.cta_group::1.kind::mxf4nvf4.block_scale.scale_vec::2X.collector::a::discard [d_tmem], a_desc, + // b_desc, idesc, [scale_A_tmem], [scale_B_tmem], enable_input_d; + * fn_ptr++ = reinterpret_cast( + static_cast(cuda::ptx::tcgen05_mma_block_scale_vec_2x_collector_a_discard)); + // tcgen05.mma.cta_group::2.kind::mxf4nvf4.block_scale.scale_vec::2X.collector::a::discard [d_tmem], a_desc, + // b_desc, idesc, [scale_A_tmem], [scale_B_tmem], enable_input_d; + * fn_ptr++ = reinterpret_cast( + static_cast(cuda::ptx::tcgen05_mma_block_scale_vec_2x_collector_a_discard));)); +#endif // __cccl_ptx_isa >= 860 + +#if __cccl_ptx_isa >= 860 + NV_IF_TARGET( + NV_HAS_FEATURE_SM_100a, + ( + // tcgen05.mma.cta_group::1.kind::mxf4nvf4.block_scale.scale_vec::4X.collector::a::discard [d_tmem], a_desc, + // b_desc, idesc, [scale_A_tmem], [scale_B_tmem], enable_input_d; + * fn_ptr++ = reinterpret_cast( + static_cast(cuda::ptx::tcgen05_mma_block_scale_vec_4x_collector_a_discard)); + // tcgen05.mma.cta_group::2.kind::mxf4nvf4.block_scale.scale_vec::4X.collector::a::discard [d_tmem], a_desc, + // b_desc, idesc, [scale_A_tmem], [scale_B_tmem], enable_input_d; + * fn_ptr++ = reinterpret_cast( + static_cast(cuda::ptx::tcgen05_mma_block_scale_vec_4x_collector_a_discard));)); + NV_IF_TARGET( + NV_HAS_FEATURE_SM_101a, + ( + // tcgen05.mma.cta_group::1.kind::mxf4nvf4.block_scale.scale_vec::4X.collector::a::discard [d_tmem], a_desc, + // b_desc, idesc, [scale_A_tmem], [scale_B_tmem], enable_input_d; + * fn_ptr++ = reinterpret_cast( + 
static_cast(cuda::ptx::tcgen05_mma_block_scale_vec_4x_collector_a_discard)); + // tcgen05.mma.cta_group::2.kind::mxf4nvf4.block_scale.scale_vec::4X.collector::a::discard [d_tmem], a_desc, + // b_desc, idesc, [scale_A_tmem], [scale_B_tmem], enable_input_d; + * fn_ptr++ = reinterpret_cast( + static_cast(cuda::ptx::tcgen05_mma_block_scale_vec_4x_collector_a_discard));)); +#endif // __cccl_ptx_isa >= 860 + +#if __cccl_ptx_isa >= 860 + NV_IF_TARGET( + NV_HAS_FEATURE_SM_100a, + ( + // tcgen05.mma.cta_group::1.kind::mxf8f6f4.block_scale.scale_vec::1X.collector::a::discard [d_tmem], a_desc, + // b_desc, idesc, [scale_A_tmem], [scale_B_tmem], enable_input_d; + * fn_ptr++ = reinterpret_cast( + static_cast(cuda::ptx::tcgen05_mma_block_scale_vec_1x_tmem_a_collector_a_discard)); + // tcgen05.mma.cta_group::2.kind::mxf8f6f4.block_scale.scale_vec::1X.collector::a::discard [d_tmem], a_desc, + // b_desc, idesc, [scale_A_tmem], [scale_B_tmem], enable_input_d; + * fn_ptr++ = reinterpret_cast( + static_cast(cuda::ptx::tcgen05_mma_block_scale_vec_1x_tmem_a_collector_a_discard));)); + NV_IF_TARGET( + NV_HAS_FEATURE_SM_101a, + ( + // tcgen05.mma.cta_group::1.kind::mxf8f6f4.block_scale.scale_vec::1X.collector::a::discard [d_tmem], a_desc, + // b_desc, idesc, [scale_A_tmem], [scale_B_tmem], enable_input_d; + * fn_ptr++ = reinterpret_cast( + static_cast(cuda::ptx::tcgen05_mma_block_scale_vec_1x_tmem_a_collector_a_discard)); + // tcgen05.mma.cta_group::2.kind::mxf8f6f4.block_scale.scale_vec::1X.collector::a::discard [d_tmem], a_desc, + // b_desc, idesc, [scale_A_tmem], [scale_B_tmem], enable_input_d; + * fn_ptr++ = reinterpret_cast( + static_cast(cuda::ptx::tcgen05_mma_block_scale_vec_1x_tmem_a_collector_a_discard));)); +#endif // __cccl_ptx_isa >= 860 + +#if __cccl_ptx_isa >= 860 + NV_IF_TARGET( + NV_HAS_FEATURE_SM_100a, + ( + // tcgen05.mma.cta_group::1.kind::mxf4.block_scale.scale_vec::2X.collector::a::discard [d_tmem], a_desc, b_desc, + // idesc, [scale_A_tmem], [scale_B_tmem], enable_input_d; + * fn_ptr++ = reinterpret_cast( + static_cast(cuda::ptx::tcgen05_mma_block_scale_vec_2_tmem_a_collector_a_discard)); + // tcgen05.mma.cta_group::2.kind::mxf4.block_scale.scale_vec::2X.collector::a::discard [d_tmem], a_desc, + // b_desc, idesc, [scale_A_tmem], [scale_B_tmem], enable_input_d; + * fn_ptr++ = reinterpret_cast( + static_cast(cuda::ptx::tcgen05_mma_block_scale_vec_2_tmem_a_collector_a_discard)); + // tcgen05.mma.cta_group::1.kind::mxf4nvf4.block_scale.scale_vec::2X.collector::a::discard [d_tmem], a_desc, + // b_desc, idesc, [scale_A_tmem], [scale_B_tmem], enable_input_d; + * fn_ptr++ = reinterpret_cast( + static_cast(cuda::ptx::tcgen05_mma_block_scale_vec_2_tmem_a_collector_a_discard)); + // tcgen05.mma.cta_group::2.kind::mxf4nvf4.block_scale.scale_vec::2X.collector::a::discard [d_tmem], a_desc, + // b_desc, idesc, [scale_A_tmem], [scale_B_tmem], enable_input_d; + * fn_ptr++ = reinterpret_cast( + static_cast(cuda::ptx::tcgen05_mma_block_scale_vec_2_tmem_a_collector_a_discard));)); + NV_IF_TARGET( + NV_HAS_FEATURE_SM_101a, + ( + // tcgen05.mma.cta_group::1.kind::mxf4.block_scale.scale_vec::2X.collector::a::discard [d_tmem], a_desc, b_desc, + // idesc, [scale_A_tmem], [scale_B_tmem], enable_input_d; + * fn_ptr++ = reinterpret_cast( + static_cast(cuda::ptx::tcgen05_mma_block_scale_vec_2_tmem_a_collector_a_discard)); + // tcgen05.mma.cta_group::2.kind::mxf4.block_scale.scale_vec::2X.collector::a::discard [d_tmem], a_desc, + // b_desc, idesc, [scale_A_tmem], [scale_B_tmem], enable_input_d; + * fn_ptr++ = 
reinterpret_cast( + static_cast(cuda::ptx::tcgen05_mma_block_scale_vec_2_tmem_a_collector_a_discard)); + // tcgen05.mma.cta_group::1.kind::mxf4nvf4.block_scale.scale_vec::2X.collector::a::discard [d_tmem], a_desc, + // b_desc, idesc, [scale_A_tmem], [scale_B_tmem], enable_input_d; + * fn_ptr++ = reinterpret_cast( + static_cast(cuda::ptx::tcgen05_mma_block_scale_vec_2_tmem_a_collector_a_discard)); + // tcgen05.mma.cta_group::2.kind::mxf4nvf4.block_scale.scale_vec::2X.collector::a::discard [d_tmem], a_desc, + // b_desc, idesc, [scale_A_tmem], [scale_B_tmem], enable_input_d; + * fn_ptr++ = reinterpret_cast( + static_cast(cuda::ptx::tcgen05_mma_block_scale_vec_2_tmem_a_collector_a_discard));)); +#endif // __cccl_ptx_isa >= 860 + +#if __cccl_ptx_isa >= 860 + NV_IF_TARGET( + NV_HAS_FEATURE_SM_100a, + ( + // tcgen05.mma.cta_group::1.kind::mxf4nvf4.block_scale.scale_vec::4X.collector::a::discard [d_tmem], a_desc, + // b_desc, idesc, [scale_A_tmem], [scale_B_tmem], enable_input_d; + * fn_ptr++ = reinterpret_cast( + static_cast(cuda::ptx::tcgen05_mma_block_scale_vec_4x_tmem_a_collector_a_discard)); + // tcgen05.mma.cta_group::2.kind::mxf4nvf4.block_scale.scale_vec::4X.collector::a::discard [d_tmem], a_desc, + // b_desc, idesc, [scale_A_tmem], [scale_B_tmem], enable_input_d; + * fn_ptr++ = reinterpret_cast( + static_cast(cuda::ptx::tcgen05_mma_block_scale_vec_4x_tmem_a_collector_a_discard));)); + NV_IF_TARGET( + NV_HAS_FEATURE_SM_101a, + ( + // tcgen05.mma.cta_group::1.kind::mxf4nvf4.block_scale.scale_vec::4X.collector::a::discard [d_tmem], a_desc, + // b_desc, idesc, [scale_A_tmem], [scale_B_tmem], enable_input_d; + * fn_ptr++ = reinterpret_cast( + static_cast(cuda::ptx::tcgen05_mma_block_scale_vec_4x_tmem_a_collector_a_discard)); + // tcgen05.mma.cta_group::2.kind::mxf4nvf4.block_scale.scale_vec::4X.collector::a::discard [d_tmem], a_desc, + // b_desc, idesc, [scale_A_tmem], [scale_B_tmem], enable_input_d; + * fn_ptr++ = reinterpret_cast( + static_cast(cuda::ptx::tcgen05_mma_block_scale_vec_4x_tmem_a_collector_a_discard));)); +#endif // __cccl_ptx_isa >= 860 +} diff --git a/libcudacxx/test/libcudacxx/cuda/ptx/generated/tcgen05_mma_ws.h b/libcudacxx/test/libcudacxx/cuda/ptx/generated/tcgen05_mma_ws.h new file mode 100644 index 00000000000..7e1674f39fc --- /dev/null +++ b/libcudacxx/test/libcudacxx/cuda/ptx/generated/tcgen05_mma_ws.h @@ -0,0 +1,3570 @@ +// This file was automatically generated. Do not edit. + +// We use a special strategy to force the generation of the PTX. This is mainly +// a fight against dead-code-elimination in the NVVM layer. +// +// The reason we need this strategy is because certain older versions of ptxas +// segfault when a non-sensical sequence of PTX is generated. So instead, we try +// to force the instantiation and compilation to PTX of all the overloads of the +// PTX wrapping functions. +// +// We do this by writing a function pointer of each overload to the kernel +// parameter `fn_ptr`. +// +// Because `fn_ptr` is possibly visible outside this translation unit, the +// compiler must compile all the functions which are stored. 
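+// Editorial note, illustrative sketch only (not part of the generated file;
+// names below are hypothetical): the strategy described above in miniature.
+// Casting an overload to an explicit function-pointer type forces its
+// instantiation, and storing that address through the externally visible
+// kernel parameter keeps the NVVM layer from dead-code-eliminating it:
+//
+//   __device__ void example_overload(int) {}
+//
+//   __global__ void force_ptx_emission(void** fn_ptr)
+//   {
+//     // The store makes the overload reachable, so its PTX must be emitted.
+//     *fn_ptr++ = reinterpret_cast<void*>(static_cast<void (*)(int)>(example_overload));
+//   }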
+ +__global__ void test_tcgen05_mma_ws(void** fn_ptr) +{ +#if __cccl_ptx_isa >= 860 + NV_IF_TARGET( + NV_HAS_FEATURE_SM_100a, + ( + // tcgen05.mma.ws.cta_group::1.kind::f16.collector::b0::fill [d_tmem], a_desc, b_desc, idesc, enable_input_d, + // zero_column_mask_desc; + * fn_ptr++ = reinterpret_cast( + static_cast( + cuda::ptx::tcgen05_mma_ws_collector_b0_fill)); + // tcgen05.mma.ws.cta_group::1.kind::tf32.collector::b0::fill [d_tmem], a_desc, b_desc, idesc, + // enable_input_d, zero_column_mask_desc; + * fn_ptr++ = reinterpret_cast( + static_cast( + cuda::ptx::tcgen05_mma_ws_collector_b0_fill)); + // tcgen05.mma.ws.cta_group::1.kind::f8f6f4.collector::b0::fill [d_tmem], a_desc, b_desc, idesc, + // enable_input_d, zero_column_mask_desc; + * fn_ptr++ = reinterpret_cast( + static_cast( + cuda::ptx::tcgen05_mma_ws_collector_b0_fill)); + // tcgen05.mma.ws.cta_group::1.kind::i8.collector::b0::fill [d_tmem], a_desc, b_desc, idesc, enable_input_d, + // zero_column_mask_desc; + * fn_ptr++ = reinterpret_cast( + static_cast( + cuda::ptx::tcgen05_mma_ws_collector_b0_fill));)); + NV_IF_TARGET( + NV_HAS_FEATURE_SM_101a, + ( + // tcgen05.mma.ws.cta_group::1.kind::f16.collector::b0::fill [d_tmem], a_desc, b_desc, idesc, enable_input_d, + // zero_column_mask_desc; + * fn_ptr++ = reinterpret_cast( + static_cast( + cuda::ptx::tcgen05_mma_ws_collector_b0_fill)); + // tcgen05.mma.ws.cta_group::1.kind::tf32.collector::b0::fill [d_tmem], a_desc, b_desc, idesc, + // enable_input_d, zero_column_mask_desc; + * fn_ptr++ = reinterpret_cast( + static_cast( + cuda::ptx::tcgen05_mma_ws_collector_b0_fill)); + // tcgen05.mma.ws.cta_group::1.kind::f8f6f4.collector::b0::fill [d_tmem], a_desc, b_desc, idesc, + // enable_input_d, zero_column_mask_desc; + * fn_ptr++ = reinterpret_cast( + static_cast( + cuda::ptx::tcgen05_mma_ws_collector_b0_fill)); + // tcgen05.mma.ws.cta_group::1.kind::i8.collector::b0::fill [d_tmem], a_desc, b_desc, idesc, enable_input_d, + // zero_column_mask_desc; + * fn_ptr++ = reinterpret_cast( + static_cast( + cuda::ptx::tcgen05_mma_ws_collector_b0_fill));)); +#endif // __cccl_ptx_isa >= 860 + +#if __cccl_ptx_isa >= 860 + NV_IF_TARGET( + NV_HAS_FEATURE_SM_100a, + ( + // tcgen05.mma.ws.cta_group::1.kind::f16.collector::b0::fill [d_tmem], a_desc, b_desc, idesc, enable_input_d; + * fn_ptr++ = reinterpret_cast( + static_cast( + cuda::ptx::tcgen05_mma_ws_collector_b0_fill)); + // tcgen05.mma.ws.cta_group::1.kind::tf32.collector::b0::fill [d_tmem], a_desc, b_desc, idesc, enable_input_d; + * fn_ptr++ = reinterpret_cast( + static_cast( + cuda::ptx::tcgen05_mma_ws_collector_b0_fill)); + // tcgen05.mma.ws.cta_group::1.kind::f8f6f4.collector::b0::fill [d_tmem], a_desc, b_desc, idesc, + // enable_input_d; + * fn_ptr++ = reinterpret_cast( + static_cast( + cuda::ptx::tcgen05_mma_ws_collector_b0_fill)); + // tcgen05.mma.ws.cta_group::1.kind::i8.collector::b0::fill [d_tmem], a_desc, b_desc, idesc, enable_input_d; + * fn_ptr++ = reinterpret_cast( + static_cast( + cuda::ptx::tcgen05_mma_ws_collector_b0_fill));)); + NV_IF_TARGET( + NV_HAS_FEATURE_SM_101a, + ( + // tcgen05.mma.ws.cta_group::1.kind::f16.collector::b0::fill [d_tmem], a_desc, b_desc, idesc, enable_input_d; + * fn_ptr++ = reinterpret_cast( + static_cast( + cuda::ptx::tcgen05_mma_ws_collector_b0_fill)); + // tcgen05.mma.ws.cta_group::1.kind::tf32.collector::b0::fill [d_tmem], a_desc, b_desc, idesc, enable_input_d; + * fn_ptr++ = reinterpret_cast( + static_cast( + cuda::ptx::tcgen05_mma_ws_collector_b0_fill)); + // 
tcgen05.mma.ws.cta_group::1.kind::f8f6f4.collector::b0::fill [d_tmem], a_desc, b_desc, idesc, + // enable_input_d; + * fn_ptr++ = reinterpret_cast( + static_cast( + cuda::ptx::tcgen05_mma_ws_collector_b0_fill)); + // tcgen05.mma.ws.cta_group::1.kind::i8.collector::b0::fill [d_tmem], a_desc, b_desc, idesc, enable_input_d; + * fn_ptr++ = reinterpret_cast( + static_cast( + cuda::ptx::tcgen05_mma_ws_collector_b0_fill));)); +#endif // __cccl_ptx_isa >= 860 + +#if __cccl_ptx_isa >= 860 + NV_IF_TARGET( + NV_HAS_FEATURE_SM_100a, + ( + // tcgen05.mma.ws.cta_group::1.kind::f16.collector::b0::fill [d_tmem], [a_tmem], b_desc, idesc, enable_input_d, + // zero_column_mask_desc; + * fn_ptr++ = reinterpret_cast( + static_cast( + cuda::ptx::tcgen05_mma_ws_tmem_a_collector_b0_fill)); + // tcgen05.mma.ws.cta_group::1.kind::tf32.collector::b0::fill [d_tmem], [a_tmem], b_desc, idesc, + // enable_input_d, zero_column_mask_desc; + * fn_ptr++ = reinterpret_cast( + static_cast( + cuda::ptx::tcgen05_mma_ws_tmem_a_collector_b0_fill)); + // tcgen05.mma.ws.cta_group::1.kind::f8f6f4.collector::b0::fill [d_tmem], [a_tmem], b_desc, idesc, + // enable_input_d, zero_column_mask_desc; + * fn_ptr++ = reinterpret_cast( + static_cast( + cuda::ptx::tcgen05_mma_ws_tmem_a_collector_b0_fill)); + // tcgen05.mma.ws.cta_group::1.kind::i8.collector::b0::fill [d_tmem], [a_tmem], b_desc, idesc, + // enable_input_d, zero_column_mask_desc; + * fn_ptr++ = reinterpret_cast( + static_cast( + cuda::ptx::tcgen05_mma_ws_tmem_a_collector_b0_fill));)); + NV_IF_TARGET( + NV_HAS_FEATURE_SM_101a, + ( + // tcgen05.mma.ws.cta_group::1.kind::f16.collector::b0::fill [d_tmem], [a_tmem], b_desc, idesc, enable_input_d, + // zero_column_mask_desc; + * fn_ptr++ = reinterpret_cast( + static_cast( + cuda::ptx::tcgen05_mma_ws_tmem_a_collector_b0_fill)); + // tcgen05.mma.ws.cta_group::1.kind::tf32.collector::b0::fill [d_tmem], [a_tmem], b_desc, idesc, + // enable_input_d, zero_column_mask_desc; + * fn_ptr++ = reinterpret_cast( + static_cast( + cuda::ptx::tcgen05_mma_ws_tmem_a_collector_b0_fill)); + // tcgen05.mma.ws.cta_group::1.kind::f8f6f4.collector::b0::fill [d_tmem], [a_tmem], b_desc, idesc, + // enable_input_d, zero_column_mask_desc; + * fn_ptr++ = reinterpret_cast( + static_cast( + cuda::ptx::tcgen05_mma_ws_tmem_a_collector_b0_fill)); + // tcgen05.mma.ws.cta_group::1.kind::i8.collector::b0::fill [d_tmem], [a_tmem], b_desc, idesc, + // enable_input_d, zero_column_mask_desc; + * fn_ptr++ = reinterpret_cast( + static_cast( + cuda::ptx::tcgen05_mma_ws_tmem_a_collector_b0_fill));)); +#endif // __cccl_ptx_isa >= 860 + +#if __cccl_ptx_isa >= 860 + NV_IF_TARGET( + NV_HAS_FEATURE_SM_100a, + ( + // tcgen05.mma.ws.cta_group::1.kind::f16.collector::b0::fill [d_tmem], [a_tmem], b_desc, idesc, enable_input_d; + * fn_ptr++ = reinterpret_cast( + static_cast( + cuda::ptx::tcgen05_mma_ws_tmem_a_collector_b0_fill)); + // tcgen05.mma.ws.cta_group::1.kind::tf32.collector::b0::fill [d_tmem], [a_tmem], b_desc, idesc, + // enable_input_d; + * fn_ptr++ = reinterpret_cast( + static_cast( + cuda::ptx::tcgen05_mma_ws_tmem_a_collector_b0_fill)); + // tcgen05.mma.ws.cta_group::1.kind::f8f6f4.collector::b0::fill [d_tmem], [a_tmem], b_desc, idesc, + // enable_input_d; + * fn_ptr++ = reinterpret_cast( + static_cast( + cuda::ptx::tcgen05_mma_ws_tmem_a_collector_b0_fill)); + // tcgen05.mma.ws.cta_group::1.kind::i8.collector::b0::fill [d_tmem], [a_tmem], b_desc, idesc, enable_input_d; + * fn_ptr++ = reinterpret_cast( + static_cast( + cuda::ptx::tcgen05_mma_ws_tmem_a_collector_b0_fill));)); + 
NV_IF_TARGET( + NV_HAS_FEATURE_SM_101a, + ( + // tcgen05.mma.ws.cta_group::1.kind::f16.collector::b0::fill [d_tmem], [a_tmem], b_desc, idesc, enable_input_d; + * fn_ptr++ = reinterpret_cast( + static_cast( + cuda::ptx::tcgen05_mma_ws_tmem_a_collector_b0_fill)); + // tcgen05.mma.ws.cta_group::1.kind::tf32.collector::b0::fill [d_tmem], [a_tmem], b_desc, idesc, + // enable_input_d; + * fn_ptr++ = reinterpret_cast( + static_cast( + cuda::ptx::tcgen05_mma_ws_tmem_a_collector_b0_fill)); + // tcgen05.mma.ws.cta_group::1.kind::f8f6f4.collector::b0::fill [d_tmem], [a_tmem], b_desc, idesc, + // enable_input_d; + * fn_ptr++ = reinterpret_cast( + static_cast( + cuda::ptx::tcgen05_mma_ws_tmem_a_collector_b0_fill)); + // tcgen05.mma.ws.cta_group::1.kind::i8.collector::b0::fill [d_tmem], [a_tmem], b_desc, idesc, enable_input_d; + * fn_ptr++ = reinterpret_cast( + static_cast( + cuda::ptx::tcgen05_mma_ws_tmem_a_collector_b0_fill));)); +#endif // __cccl_ptx_isa >= 860 + +#if __cccl_ptx_isa >= 860 + NV_IF_TARGET( + NV_HAS_FEATURE_SM_100a, + ( + // tcgen05.mma.ws.cta_group::1.kind::f16.collector::b0::use [d_tmem], a_desc, b_desc, idesc, enable_input_d, + // zero_column_mask_desc; + * fn_ptr++ = reinterpret_cast( + static_cast( + cuda::ptx::tcgen05_mma_ws_collector_b0_use)); + // tcgen05.mma.ws.cta_group::1.kind::tf32.collector::b0::use [d_tmem], a_desc, b_desc, idesc, + // enable_input_d, zero_column_mask_desc; + * fn_ptr++ = reinterpret_cast( + static_cast( + cuda::ptx::tcgen05_mma_ws_collector_b0_use)); + // tcgen05.mma.ws.cta_group::1.kind::f8f6f4.collector::b0::use [d_tmem], a_desc, b_desc, idesc, + // enable_input_d, zero_column_mask_desc; + * fn_ptr++ = reinterpret_cast( + static_cast( + cuda::ptx::tcgen05_mma_ws_collector_b0_use)); + // tcgen05.mma.ws.cta_group::1.kind::i8.collector::b0::use [d_tmem], a_desc, b_desc, idesc, enable_input_d, + // zero_column_mask_desc; + * fn_ptr++ = reinterpret_cast( + static_cast( + cuda::ptx::tcgen05_mma_ws_collector_b0_use));)); + NV_IF_TARGET( + NV_HAS_FEATURE_SM_101a, + ( + // tcgen05.mma.ws.cta_group::1.kind::f16.collector::b0::use [d_tmem], a_desc, b_desc, idesc, enable_input_d, + // zero_column_mask_desc; + * fn_ptr++ = reinterpret_cast( + static_cast( + cuda::ptx::tcgen05_mma_ws_collector_b0_use)); + // tcgen05.mma.ws.cta_group::1.kind::tf32.collector::b0::use [d_tmem], a_desc, b_desc, idesc, + // enable_input_d, zero_column_mask_desc; + * fn_ptr++ = reinterpret_cast( + static_cast( + cuda::ptx::tcgen05_mma_ws_collector_b0_use)); + // tcgen05.mma.ws.cta_group::1.kind::f8f6f4.collector::b0::use [d_tmem], a_desc, b_desc, idesc, + // enable_input_d, zero_column_mask_desc; + * fn_ptr++ = reinterpret_cast( + static_cast( + cuda::ptx::tcgen05_mma_ws_collector_b0_use)); + // tcgen05.mma.ws.cta_group::1.kind::i8.collector::b0::use [d_tmem], a_desc, b_desc, idesc, enable_input_d, + // zero_column_mask_desc; + * fn_ptr++ = reinterpret_cast( + static_cast( + cuda::ptx::tcgen05_mma_ws_collector_b0_use));)); +#endif // __cccl_ptx_isa >= 860 + +#if __cccl_ptx_isa >= 860 + NV_IF_TARGET( + NV_HAS_FEATURE_SM_100a, + ( + // tcgen05.mma.ws.cta_group::1.kind::f16.collector::b0::use [d_tmem], a_desc, b_desc, idesc, enable_input_d; + * fn_ptr++ = reinterpret_cast( + static_cast( + cuda::ptx::tcgen05_mma_ws_collector_b0_use)); + // tcgen05.mma.ws.cta_group::1.kind::tf32.collector::b0::use [d_tmem], a_desc, b_desc, idesc, enable_input_d; + * fn_ptr++ = reinterpret_cast( + static_cast( + cuda::ptx::tcgen05_mma_ws_collector_b0_use)); + // 
tcgen05.mma.ws.cta_group::1.kind::f8f6f4.collector::b0::use [d_tmem], a_desc, b_desc, idesc, + // enable_input_d; + * fn_ptr++ = reinterpret_cast( + static_cast( + cuda::ptx::tcgen05_mma_ws_collector_b0_use)); + // tcgen05.mma.ws.cta_group::1.kind::i8.collector::b0::use [d_tmem], a_desc, b_desc, idesc, enable_input_d; + * fn_ptr++ = reinterpret_cast( + static_cast( + cuda::ptx::tcgen05_mma_ws_collector_b0_use));)); + NV_IF_TARGET( + NV_HAS_FEATURE_SM_101a, + ( + // tcgen05.mma.ws.cta_group::1.kind::f16.collector::b0::use [d_tmem], a_desc, b_desc, idesc, enable_input_d; + * fn_ptr++ = reinterpret_cast( + static_cast( + cuda::ptx::tcgen05_mma_ws_collector_b0_use)); + // tcgen05.mma.ws.cta_group::1.kind::tf32.collector::b0::use [d_tmem], a_desc, b_desc, idesc, enable_input_d; + * fn_ptr++ = reinterpret_cast( + static_cast( + cuda::ptx::tcgen05_mma_ws_collector_b0_use)); + // tcgen05.mma.ws.cta_group::1.kind::f8f6f4.collector::b0::use [d_tmem], a_desc, b_desc, idesc, + // enable_input_d; + * fn_ptr++ = reinterpret_cast( + static_cast( + cuda::ptx::tcgen05_mma_ws_collector_b0_use)); + // tcgen05.mma.ws.cta_group::1.kind::i8.collector::b0::use [d_tmem], a_desc, b_desc, idesc, enable_input_d; + * fn_ptr++ = reinterpret_cast( + static_cast( + cuda::ptx::tcgen05_mma_ws_collector_b0_use));)); +#endif // __cccl_ptx_isa >= 860 + +#if __cccl_ptx_isa >= 860 + NV_IF_TARGET( + NV_HAS_FEATURE_SM_100a, + ( + // tcgen05.mma.ws.cta_group::1.kind::f16.collector::b0::use [d_tmem], [a_tmem], b_desc, idesc, enable_input_d, + // zero_column_mask_desc; + * fn_ptr++ = reinterpret_cast( + static_cast( + cuda::ptx::tcgen05_mma_ws_tmem_a_collector_b0_use)); + // tcgen05.mma.ws.cta_group::1.kind::tf32.collector::b0::use [d_tmem], [a_tmem], b_desc, idesc, + // enable_input_d, zero_column_mask_desc; + * fn_ptr++ = reinterpret_cast( + static_cast( + cuda::ptx::tcgen05_mma_ws_tmem_a_collector_b0_use)); + // tcgen05.mma.ws.cta_group::1.kind::f8f6f4.collector::b0::use [d_tmem], [a_tmem], b_desc, idesc, + // enable_input_d, zero_column_mask_desc; + * fn_ptr++ = reinterpret_cast( + static_cast( + cuda::ptx::tcgen05_mma_ws_tmem_a_collector_b0_use)); + // tcgen05.mma.ws.cta_group::1.kind::i8.collector::b0::use [d_tmem], [a_tmem], b_desc, idesc, + // enable_input_d, zero_column_mask_desc; + * fn_ptr++ = reinterpret_cast( + static_cast( + cuda::ptx::tcgen05_mma_ws_tmem_a_collector_b0_use));)); + NV_IF_TARGET( + NV_HAS_FEATURE_SM_101a, + ( + // tcgen05.mma.ws.cta_group::1.kind::f16.collector::b0::use [d_tmem], [a_tmem], b_desc, idesc, enable_input_d, + // zero_column_mask_desc; + * fn_ptr++ = reinterpret_cast( + static_cast( + cuda::ptx::tcgen05_mma_ws_tmem_a_collector_b0_use)); + // tcgen05.mma.ws.cta_group::1.kind::tf32.collector::b0::use [d_tmem], [a_tmem], b_desc, idesc, + // enable_input_d, zero_column_mask_desc; + * fn_ptr++ = reinterpret_cast( + static_cast( + cuda::ptx::tcgen05_mma_ws_tmem_a_collector_b0_use)); + // tcgen05.mma.ws.cta_group::1.kind::f8f6f4.collector::b0::use [d_tmem], [a_tmem], b_desc, idesc, + // enable_input_d, zero_column_mask_desc; + * fn_ptr++ = reinterpret_cast( + static_cast( + cuda::ptx::tcgen05_mma_ws_tmem_a_collector_b0_use)); + // tcgen05.mma.ws.cta_group::1.kind::i8.collector::b0::use [d_tmem], [a_tmem], b_desc, idesc, + // enable_input_d, zero_column_mask_desc; + * fn_ptr++ = reinterpret_cast( + static_cast( + cuda::ptx::tcgen05_mma_ws_tmem_a_collector_b0_use));)); +#endif // __cccl_ptx_isa >= 860 + +#if __cccl_ptx_isa >= 860 + NV_IF_TARGET( + NV_HAS_FEATURE_SM_100a, + ( + // 
tcgen05.mma.ws.cta_group::1.kind::f16.collector::b0::use [d_tmem], [a_tmem], b_desc, idesc, enable_input_d; + * fn_ptr++ = reinterpret_cast( + static_cast( + cuda::ptx::tcgen05_mma_ws_tmem_a_collector_b0_use)); + // tcgen05.mma.ws.cta_group::1.kind::tf32.collector::b0::use [d_tmem], [a_tmem], b_desc, idesc, + // enable_input_d; + * fn_ptr++ = reinterpret_cast( + static_cast( + cuda::ptx::tcgen05_mma_ws_tmem_a_collector_b0_use)); + // tcgen05.mma.ws.cta_group::1.kind::f8f6f4.collector::b0::use [d_tmem], [a_tmem], b_desc, idesc, + // enable_input_d; + * fn_ptr++ = reinterpret_cast( + static_cast( + cuda::ptx::tcgen05_mma_ws_tmem_a_collector_b0_use)); + // tcgen05.mma.ws.cta_group::1.kind::i8.collector::b0::use [d_tmem], [a_tmem], b_desc, idesc, enable_input_d; + * fn_ptr++ = reinterpret_cast( + static_cast( + cuda::ptx::tcgen05_mma_ws_tmem_a_collector_b0_use));)); + NV_IF_TARGET( + NV_HAS_FEATURE_SM_101a, + ( + // tcgen05.mma.ws.cta_group::1.kind::f16.collector::b0::use [d_tmem], [a_tmem], b_desc, idesc, enable_input_d; + * fn_ptr++ = reinterpret_cast( + static_cast( + cuda::ptx::tcgen05_mma_ws_tmem_a_collector_b0_use)); + // tcgen05.mma.ws.cta_group::1.kind::tf32.collector::b0::use [d_tmem], [a_tmem], b_desc, idesc, + // enable_input_d; + * fn_ptr++ = reinterpret_cast( + static_cast( + cuda::ptx::tcgen05_mma_ws_tmem_a_collector_b0_use)); + // tcgen05.mma.ws.cta_group::1.kind::f8f6f4.collector::b0::use [d_tmem], [a_tmem], b_desc, idesc, + // enable_input_d; + * fn_ptr++ = reinterpret_cast( + static_cast( + cuda::ptx::tcgen05_mma_ws_tmem_a_collector_b0_use)); + // tcgen05.mma.ws.cta_group::1.kind::i8.collector::b0::use [d_tmem], [a_tmem], b_desc, idesc, enable_input_d; + * fn_ptr++ = reinterpret_cast( + static_cast( + cuda::ptx::tcgen05_mma_ws_tmem_a_collector_b0_use));)); +#endif // __cccl_ptx_isa >= 860 + +#if __cccl_ptx_isa >= 860 + NV_IF_TARGET( + NV_HAS_FEATURE_SM_100a, + ( + // tcgen05.mma.ws.cta_group::1.kind::f16.collector::b0::lastuse [d_tmem], a_desc, b_desc, idesc, enable_input_d, + // zero_column_mask_desc; + * fn_ptr++ = reinterpret_cast( + static_cast( + cuda::ptx::tcgen05_mma_ws_collector_b0_lastuse)); + // tcgen05.mma.ws.cta_group::1.kind::tf32.collector::b0::lastuse [d_tmem], a_desc, b_desc, idesc, + // enable_input_d, zero_column_mask_desc; + * fn_ptr++ = reinterpret_cast( + static_cast( + cuda::ptx::tcgen05_mma_ws_collector_b0_lastuse)); + // tcgen05.mma.ws.cta_group::1.kind::f8f6f4.collector::b0::lastuse [d_tmem], a_desc, b_desc, idesc, + // enable_input_d, zero_column_mask_desc; + * fn_ptr++ = reinterpret_cast( + static_cast( + cuda::ptx::tcgen05_mma_ws_collector_b0_lastuse)); + // tcgen05.mma.ws.cta_group::1.kind::i8.collector::b0::lastuse [d_tmem], a_desc, b_desc, idesc, + // enable_input_d, zero_column_mask_desc; + * fn_ptr++ = reinterpret_cast( + static_cast( + cuda::ptx::tcgen05_mma_ws_collector_b0_lastuse));)); + NV_IF_TARGET( + NV_HAS_FEATURE_SM_101a, + ( + // tcgen05.mma.ws.cta_group::1.kind::f16.collector::b0::lastuse [d_tmem], a_desc, b_desc, idesc, enable_input_d, + // zero_column_mask_desc; + * fn_ptr++ = reinterpret_cast( + static_cast( + cuda::ptx::tcgen05_mma_ws_collector_b0_lastuse)); + // tcgen05.mma.ws.cta_group::1.kind::tf32.collector::b0::lastuse [d_tmem], a_desc, b_desc, idesc, + // enable_input_d, zero_column_mask_desc; + * fn_ptr++ = reinterpret_cast( + static_cast( + cuda::ptx::tcgen05_mma_ws_collector_b0_lastuse)); + // tcgen05.mma.ws.cta_group::1.kind::f8f6f4.collector::b0::lastuse [d_tmem], a_desc, b_desc, idesc, + // enable_input_d, 
zero_column_mask_desc; + * fn_ptr++ = reinterpret_cast( + static_cast( + cuda::ptx::tcgen05_mma_ws_collector_b0_lastuse)); + // tcgen05.mma.ws.cta_group::1.kind::i8.collector::b0::lastuse [d_tmem], a_desc, b_desc, idesc, + // enable_input_d, zero_column_mask_desc; + * fn_ptr++ = reinterpret_cast( + static_cast( + cuda::ptx::tcgen05_mma_ws_collector_b0_lastuse));)); +#endif // __cccl_ptx_isa >= 860 + +#if __cccl_ptx_isa >= 860 + NV_IF_TARGET( + NV_HAS_FEATURE_SM_100a, + ( + // tcgen05.mma.ws.cta_group::1.kind::f16.collector::b0::lastuse [d_tmem], a_desc, b_desc, idesc, enable_input_d; + * fn_ptr++ = reinterpret_cast( + static_cast( + cuda::ptx::tcgen05_mma_ws_collector_b0_lastuse)); + // tcgen05.mma.ws.cta_group::1.kind::tf32.collector::b0::lastuse [d_tmem], a_desc, b_desc, idesc, + // enable_input_d; + * fn_ptr++ = reinterpret_cast( + static_cast( + cuda::ptx::tcgen05_mma_ws_collector_b0_lastuse)); + // tcgen05.mma.ws.cta_group::1.kind::f8f6f4.collector::b0::lastuse [d_tmem], a_desc, b_desc, idesc, + // enable_input_d; + * fn_ptr++ = reinterpret_cast( + static_cast( + cuda::ptx::tcgen05_mma_ws_collector_b0_lastuse)); + // tcgen05.mma.ws.cta_group::1.kind::i8.collector::b0::lastuse [d_tmem], a_desc, b_desc, idesc, + // enable_input_d; + * fn_ptr++ = reinterpret_cast( + static_cast( + cuda::ptx::tcgen05_mma_ws_collector_b0_lastuse));)); + NV_IF_TARGET( + NV_HAS_FEATURE_SM_101a, + ( + // tcgen05.mma.ws.cta_group::1.kind::f16.collector::b0::lastuse [d_tmem], a_desc, b_desc, idesc, enable_input_d; + * fn_ptr++ = reinterpret_cast( + static_cast( + cuda::ptx::tcgen05_mma_ws_collector_b0_lastuse)); + // tcgen05.mma.ws.cta_group::1.kind::tf32.collector::b0::lastuse [d_tmem], a_desc, b_desc, idesc, + // enable_input_d; + * fn_ptr++ = reinterpret_cast( + static_cast( + cuda::ptx::tcgen05_mma_ws_collector_b0_lastuse)); + // tcgen05.mma.ws.cta_group::1.kind::f8f6f4.collector::b0::lastuse [d_tmem], a_desc, b_desc, idesc, + // enable_input_d; + * fn_ptr++ = reinterpret_cast( + static_cast( + cuda::ptx::tcgen05_mma_ws_collector_b0_lastuse)); + // tcgen05.mma.ws.cta_group::1.kind::i8.collector::b0::lastuse [d_tmem], a_desc, b_desc, idesc, + // enable_input_d; + * fn_ptr++ = reinterpret_cast( + static_cast( + cuda::ptx::tcgen05_mma_ws_collector_b0_lastuse));)); +#endif // __cccl_ptx_isa >= 860 + +#if __cccl_ptx_isa >= 860 + NV_IF_TARGET( + NV_HAS_FEATURE_SM_100a, + ( + // tcgen05.mma.ws.cta_group::1.kind::f16.collector::b0::lastuse [d_tmem], [a_tmem], b_desc, idesc, + // enable_input_d, zero_column_mask_desc; + * fn_ptr++ = reinterpret_cast( + static_cast( + cuda::ptx::tcgen05_mma_ws_tmem_a_collector_b0_lastuse)); + // tcgen05.mma.ws.cta_group::1.kind::tf32.collector::b0::lastuse [d_tmem], [a_tmem], b_desc, idesc, + // enable_input_d, zero_column_mask_desc; + * fn_ptr++ = reinterpret_cast( + static_cast( + cuda::ptx::tcgen05_mma_ws_tmem_a_collector_b0_lastuse)); + // tcgen05.mma.ws.cta_group::1.kind::f8f6f4.collector::b0::lastuse [d_tmem], [a_tmem], b_desc, idesc, + // enable_input_d, zero_column_mask_desc; + * fn_ptr++ = reinterpret_cast( + static_cast( + cuda::ptx::tcgen05_mma_ws_tmem_a_collector_b0_lastuse)); + // tcgen05.mma.ws.cta_group::1.kind::i8.collector::b0::lastuse [d_tmem], [a_tmem], b_desc, idesc, + // enable_input_d, zero_column_mask_desc; + * fn_ptr++ = reinterpret_cast( + static_cast( + cuda::ptx::tcgen05_mma_ws_tmem_a_collector_b0_lastuse));)); + NV_IF_TARGET( + NV_HAS_FEATURE_SM_101a, + ( + // tcgen05.mma.ws.cta_group::1.kind::f16.collector::b0::lastuse [d_tmem], [a_tmem], b_desc, idesc, + 
// enable_input_d, zero_column_mask_desc; + * fn_ptr++ = reinterpret_cast( + static_cast( + cuda::ptx::tcgen05_mma_ws_tmem_a_collector_b0_lastuse)); + // tcgen05.mma.ws.cta_group::1.kind::tf32.collector::b0::lastuse [d_tmem], [a_tmem], b_desc, idesc, + // enable_input_d, zero_column_mask_desc; + * fn_ptr++ = reinterpret_cast( + static_cast( + cuda::ptx::tcgen05_mma_ws_tmem_a_collector_b0_lastuse)); + // tcgen05.mma.ws.cta_group::1.kind::f8f6f4.collector::b0::lastuse [d_tmem], [a_tmem], b_desc, idesc, + // enable_input_d, zero_column_mask_desc; + * fn_ptr++ = reinterpret_cast( + static_cast( + cuda::ptx::tcgen05_mma_ws_tmem_a_collector_b0_lastuse)); + // tcgen05.mma.ws.cta_group::1.kind::i8.collector::b0::lastuse [d_tmem], [a_tmem], b_desc, idesc, + // enable_input_d, zero_column_mask_desc; + * fn_ptr++ = reinterpret_cast( + static_cast( + cuda::ptx::tcgen05_mma_ws_tmem_a_collector_b0_lastuse));)); +#endif // __cccl_ptx_isa >= 860 + +#if __cccl_ptx_isa >= 860 + NV_IF_TARGET( + NV_HAS_FEATURE_SM_100a, + ( + // tcgen05.mma.ws.cta_group::1.kind::f16.collector::b0::lastuse [d_tmem], [a_tmem], b_desc, idesc, + // enable_input_d; + * fn_ptr++ = reinterpret_cast( + static_cast( + cuda::ptx::tcgen05_mma_ws_tmem_a_collector_b0_lastuse)); + // tcgen05.mma.ws.cta_group::1.kind::tf32.collector::b0::lastuse [d_tmem], [a_tmem], b_desc, idesc, + // enable_input_d; + * fn_ptr++ = reinterpret_cast( + static_cast( + cuda::ptx::tcgen05_mma_ws_tmem_a_collector_b0_lastuse)); + // tcgen05.mma.ws.cta_group::1.kind::f8f6f4.collector::b0::lastuse [d_tmem], [a_tmem], b_desc, idesc, + // enable_input_d; + * fn_ptr++ = reinterpret_cast( + static_cast( + cuda::ptx::tcgen05_mma_ws_tmem_a_collector_b0_lastuse)); + // tcgen05.mma.ws.cta_group::1.kind::i8.collector::b0::lastuse [d_tmem], [a_tmem], b_desc, idesc, + // enable_input_d; + * fn_ptr++ = reinterpret_cast( + static_cast( + cuda::ptx::tcgen05_mma_ws_tmem_a_collector_b0_lastuse));)); + NV_IF_TARGET( + NV_HAS_FEATURE_SM_101a, + ( + // tcgen05.mma.ws.cta_group::1.kind::f16.collector::b0::lastuse [d_tmem], [a_tmem], b_desc, idesc, + // enable_input_d; + * fn_ptr++ = reinterpret_cast( + static_cast( + cuda::ptx::tcgen05_mma_ws_tmem_a_collector_b0_lastuse)); + // tcgen05.mma.ws.cta_group::1.kind::tf32.collector::b0::lastuse [d_tmem], [a_tmem], b_desc, idesc, + // enable_input_d; + * fn_ptr++ = reinterpret_cast( + static_cast( + cuda::ptx::tcgen05_mma_ws_tmem_a_collector_b0_lastuse)); + // tcgen05.mma.ws.cta_group::1.kind::f8f6f4.collector::b0::lastuse [d_tmem], [a_tmem], b_desc, idesc, + // enable_input_d; + * fn_ptr++ = reinterpret_cast( + static_cast( + cuda::ptx::tcgen05_mma_ws_tmem_a_collector_b0_lastuse)); + // tcgen05.mma.ws.cta_group::1.kind::i8.collector::b0::lastuse [d_tmem], [a_tmem], b_desc, idesc, + // enable_input_d; + * fn_ptr++ = reinterpret_cast( + static_cast( + cuda::ptx::tcgen05_mma_ws_tmem_a_collector_b0_lastuse));)); +#endif // __cccl_ptx_isa >= 860 + +#if __cccl_ptx_isa >= 860 + NV_IF_TARGET( + NV_HAS_FEATURE_SM_100a, + ( + // tcgen05.mma.ws.cta_group::1.kind::f16.collector::b0::discard [d_tmem], a_desc, b_desc, idesc, enable_input_d, + // zero_column_mask_desc; + * fn_ptr++ = reinterpret_cast( + static_cast( + cuda::ptx::tcgen05_mma_ws_collector_b0_discard)); + // tcgen05.mma.ws.cta_group::1.kind::tf32.collector::b0::discard [d_tmem], a_desc, b_desc, idesc, + // enable_input_d, zero_column_mask_desc; + * fn_ptr++ = reinterpret_cast( + static_cast( + cuda::ptx::tcgen05_mma_ws_collector_b0_discard)); + // 
tcgen05.mma.ws.cta_group::1.kind::f8f6f4.collector::b0::discard [d_tmem], a_desc, b_desc, idesc, + // enable_input_d, zero_column_mask_desc; + * fn_ptr++ = reinterpret_cast( + static_cast( + cuda::ptx::tcgen05_mma_ws_collector_b0_discard)); + // tcgen05.mma.ws.cta_group::1.kind::i8.collector::b0::discard [d_tmem], a_desc, b_desc, idesc, + // enable_input_d, zero_column_mask_desc; + * fn_ptr++ = reinterpret_cast( + static_cast( + cuda::ptx::tcgen05_mma_ws_collector_b0_discard));)); + NV_IF_TARGET( + NV_HAS_FEATURE_SM_101a, + ( + // tcgen05.mma.ws.cta_group::1.kind::f16.collector::b0::discard [d_tmem], a_desc, b_desc, idesc, enable_input_d, + // zero_column_mask_desc; + * fn_ptr++ = reinterpret_cast( + static_cast( + cuda::ptx::tcgen05_mma_ws_collector_b0_discard)); + // tcgen05.mma.ws.cta_group::1.kind::tf32.collector::b0::discard [d_tmem], a_desc, b_desc, idesc, + // enable_input_d, zero_column_mask_desc; + * fn_ptr++ = reinterpret_cast( + static_cast( + cuda::ptx::tcgen05_mma_ws_collector_b0_discard)); + // tcgen05.mma.ws.cta_group::1.kind::f8f6f4.collector::b0::discard [d_tmem], a_desc, b_desc, idesc, + // enable_input_d, zero_column_mask_desc; + * fn_ptr++ = reinterpret_cast( + static_cast( + cuda::ptx::tcgen05_mma_ws_collector_b0_discard)); + // tcgen05.mma.ws.cta_group::1.kind::i8.collector::b0::discard [d_tmem], a_desc, b_desc, idesc, + // enable_input_d, zero_column_mask_desc; + * fn_ptr++ = reinterpret_cast( + static_cast( + cuda::ptx::tcgen05_mma_ws_collector_b0_discard));)); +#endif // __cccl_ptx_isa >= 860 + +#if __cccl_ptx_isa >= 860 + NV_IF_TARGET( + NV_HAS_FEATURE_SM_100a, + ( + // tcgen05.mma.ws.cta_group::1.kind::f16.collector::b0::discard [d_tmem], a_desc, b_desc, idesc, enable_input_d; + * fn_ptr++ = reinterpret_cast( + static_cast( + cuda::ptx::tcgen05_mma_ws_collector_b0_discard)); + // tcgen05.mma.ws.cta_group::1.kind::tf32.collector::b0::discard [d_tmem], a_desc, b_desc, idesc, + // enable_input_d; + * fn_ptr++ = reinterpret_cast( + static_cast( + cuda::ptx::tcgen05_mma_ws_collector_b0_discard)); + // tcgen05.mma.ws.cta_group::1.kind::f8f6f4.collector::b0::discard [d_tmem], a_desc, b_desc, idesc, + // enable_input_d; + * fn_ptr++ = reinterpret_cast( + static_cast( + cuda::ptx::tcgen05_mma_ws_collector_b0_discard)); + // tcgen05.mma.ws.cta_group::1.kind::i8.collector::b0::discard [d_tmem], a_desc, b_desc, idesc, + // enable_input_d; + * fn_ptr++ = reinterpret_cast( + static_cast( + cuda::ptx::tcgen05_mma_ws_collector_b0_discard));)); + NV_IF_TARGET( + NV_HAS_FEATURE_SM_101a, + ( + // tcgen05.mma.ws.cta_group::1.kind::f16.collector::b0::discard [d_tmem], a_desc, b_desc, idesc, enable_input_d; + * fn_ptr++ = reinterpret_cast( + static_cast( + cuda::ptx::tcgen05_mma_ws_collector_b0_discard)); + // tcgen05.mma.ws.cta_group::1.kind::tf32.collector::b0::discard [d_tmem], a_desc, b_desc, idesc, + // enable_input_d; + * fn_ptr++ = reinterpret_cast( + static_cast( + cuda::ptx::tcgen05_mma_ws_collector_b0_discard)); + // tcgen05.mma.ws.cta_group::1.kind::f8f6f4.collector::b0::discard [d_tmem], a_desc, b_desc, idesc, + // enable_input_d; + * fn_ptr++ = reinterpret_cast( + static_cast( + cuda::ptx::tcgen05_mma_ws_collector_b0_discard)); + // tcgen05.mma.ws.cta_group::1.kind::i8.collector::b0::discard [d_tmem], a_desc, b_desc, idesc, + // enable_input_d; + * fn_ptr++ = reinterpret_cast( + static_cast( + cuda::ptx::tcgen05_mma_ws_collector_b0_discard));)); +#endif // __cccl_ptx_isa >= 860 + +#if __cccl_ptx_isa >= 860 + NV_IF_TARGET( + NV_HAS_FEATURE_SM_100a, + ( + // 
tcgen05.mma.ws.cta_group::1.kind::f16.collector::b0::discard [d_tmem], [a_tmem], b_desc, idesc, + // enable_input_d, zero_column_mask_desc; + * fn_ptr++ = reinterpret_cast( + static_cast( + cuda::ptx::tcgen05_mma_ws_tmem_a_collector_b0_discard)); + // tcgen05.mma.ws.cta_group::1.kind::tf32.collector::b0::discard [d_tmem], [a_tmem], b_desc, idesc, + // enable_input_d, zero_column_mask_desc; + * fn_ptr++ = reinterpret_cast( + static_cast( + cuda::ptx::tcgen05_mma_ws_tmem_a_collector_b0_discard)); + // tcgen05.mma.ws.cta_group::1.kind::f8f6f4.collector::b0::discard [d_tmem], [a_tmem], b_desc, idesc, + // enable_input_d, zero_column_mask_desc; + * fn_ptr++ = reinterpret_cast( + static_cast( + cuda::ptx::tcgen05_mma_ws_tmem_a_collector_b0_discard)); + // tcgen05.mma.ws.cta_group::1.kind::i8.collector::b0::discard [d_tmem], [a_tmem], b_desc, idesc, + // enable_input_d, zero_column_mask_desc; + * fn_ptr++ = reinterpret_cast( + static_cast( + cuda::ptx::tcgen05_mma_ws_tmem_a_collector_b0_discard));)); + NV_IF_TARGET( + NV_HAS_FEATURE_SM_101a, + ( + // tcgen05.mma.ws.cta_group::1.kind::f16.collector::b0::discard [d_tmem], [a_tmem], b_desc, idesc, + // enable_input_d, zero_column_mask_desc; + * fn_ptr++ = reinterpret_cast( + static_cast( + cuda::ptx::tcgen05_mma_ws_tmem_a_collector_b0_discard)); + // tcgen05.mma.ws.cta_group::1.kind::tf32.collector::b0::discard [d_tmem], [a_tmem], b_desc, idesc, + // enable_input_d, zero_column_mask_desc; + * fn_ptr++ = reinterpret_cast( + static_cast( + cuda::ptx::tcgen05_mma_ws_tmem_a_collector_b0_discard)); + // tcgen05.mma.ws.cta_group::1.kind::f8f6f4.collector::b0::discard [d_tmem], [a_tmem], b_desc, idesc, + // enable_input_d, zero_column_mask_desc; + * fn_ptr++ = reinterpret_cast( + static_cast( + cuda::ptx::tcgen05_mma_ws_tmem_a_collector_b0_discard)); + // tcgen05.mma.ws.cta_group::1.kind::i8.collector::b0::discard [d_tmem], [a_tmem], b_desc, idesc, + // enable_input_d, zero_column_mask_desc; + * fn_ptr++ = reinterpret_cast( + static_cast( + cuda::ptx::tcgen05_mma_ws_tmem_a_collector_b0_discard));)); +#endif // __cccl_ptx_isa >= 860 + +#if __cccl_ptx_isa >= 860 + NV_IF_TARGET( + NV_HAS_FEATURE_SM_100a, + ( + // tcgen05.mma.ws.cta_group::1.kind::f16.collector::b0::discard [d_tmem], [a_tmem], b_desc, idesc, + // enable_input_d; + * fn_ptr++ = reinterpret_cast( + static_cast( + cuda::ptx::tcgen05_mma_ws_tmem_a_collector_b0_discard)); + // tcgen05.mma.ws.cta_group::1.kind::tf32.collector::b0::discard [d_tmem], [a_tmem], b_desc, idesc, + // enable_input_d; + * fn_ptr++ = reinterpret_cast( + static_cast( + cuda::ptx::tcgen05_mma_ws_tmem_a_collector_b0_discard)); + // tcgen05.mma.ws.cta_group::1.kind::f8f6f4.collector::b0::discard [d_tmem], [a_tmem], b_desc, idesc, + // enable_input_d; + * fn_ptr++ = reinterpret_cast( + static_cast( + cuda::ptx::tcgen05_mma_ws_tmem_a_collector_b0_discard)); + // tcgen05.mma.ws.cta_group::1.kind::i8.collector::b0::discard [d_tmem], [a_tmem], b_desc, idesc, + // enable_input_d; + * fn_ptr++ = reinterpret_cast( + static_cast( + cuda::ptx::tcgen05_mma_ws_tmem_a_collector_b0_discard));)); + NV_IF_TARGET( + NV_HAS_FEATURE_SM_101a, + ( + // tcgen05.mma.ws.cta_group::1.kind::f16.collector::b0::discard [d_tmem], [a_tmem], b_desc, idesc, + // enable_input_d; + * fn_ptr++ = reinterpret_cast( + static_cast( + cuda::ptx::tcgen05_mma_ws_tmem_a_collector_b0_discard)); + // tcgen05.mma.ws.cta_group::1.kind::tf32.collector::b0::discard [d_tmem], [a_tmem], b_desc, idesc, + // enable_input_d; + * fn_ptr++ = reinterpret_cast( + static_cast( + 
cuda::ptx::tcgen05_mma_ws_tmem_a_collector_b0_discard)); + // tcgen05.mma.ws.cta_group::1.kind::f8f6f4.collector::b0::discard [d_tmem], [a_tmem], b_desc, idesc, + // enable_input_d; + * fn_ptr++ = reinterpret_cast( + static_cast( + cuda::ptx::tcgen05_mma_ws_tmem_a_collector_b0_discard)); + // tcgen05.mma.ws.cta_group::1.kind::i8.collector::b0::discard [d_tmem], [a_tmem], b_desc, idesc, + // enable_input_d; + * fn_ptr++ = reinterpret_cast( + static_cast( + cuda::ptx::tcgen05_mma_ws_tmem_a_collector_b0_discard));)); +#endif // __cccl_ptx_isa >= 860 + +#if __cccl_ptx_isa >= 860 + NV_IF_TARGET( + NV_HAS_FEATURE_SM_100a, + ( + // tcgen05.mma.ws.cta_group::1.kind::f16.collector::b1::fill [d_tmem], a_desc, b_desc, idesc, enable_input_d, + // zero_column_mask_desc; + * fn_ptr++ = reinterpret_cast( + static_cast( + cuda::ptx::tcgen05_mma_ws_collector_b1_fill)); + // tcgen05.mma.ws.cta_group::1.kind::tf32.collector::b1::fill [d_tmem], a_desc, b_desc, idesc, + // enable_input_d, zero_column_mask_desc; + * fn_ptr++ = reinterpret_cast( + static_cast( + cuda::ptx::tcgen05_mma_ws_collector_b1_fill)); + // tcgen05.mma.ws.cta_group::1.kind::f8f6f4.collector::b1::fill [d_tmem], a_desc, b_desc, idesc, + // enable_input_d, zero_column_mask_desc; + * fn_ptr++ = reinterpret_cast( + static_cast( + cuda::ptx::tcgen05_mma_ws_collector_b1_fill)); + // tcgen05.mma.ws.cta_group::1.kind::i8.collector::b1::fill [d_tmem], a_desc, b_desc, idesc, enable_input_d, + // zero_column_mask_desc; + * fn_ptr++ = reinterpret_cast( + static_cast( + cuda::ptx::tcgen05_mma_ws_collector_b1_fill));)); + NV_IF_TARGET( + NV_HAS_FEATURE_SM_101a, + ( + // tcgen05.mma.ws.cta_group::1.kind::f16.collector::b1::fill [d_tmem], a_desc, b_desc, idesc, enable_input_d, + // zero_column_mask_desc; + * fn_ptr++ = reinterpret_cast( + static_cast( + cuda::ptx::tcgen05_mma_ws_collector_b1_fill)); + // tcgen05.mma.ws.cta_group::1.kind::tf32.collector::b1::fill [d_tmem], a_desc, b_desc, idesc, + // enable_input_d, zero_column_mask_desc; + * fn_ptr++ = reinterpret_cast( + static_cast( + cuda::ptx::tcgen05_mma_ws_collector_b1_fill)); + // tcgen05.mma.ws.cta_group::1.kind::f8f6f4.collector::b1::fill [d_tmem], a_desc, b_desc, idesc, + // enable_input_d, zero_column_mask_desc; + * fn_ptr++ = reinterpret_cast( + static_cast( + cuda::ptx::tcgen05_mma_ws_collector_b1_fill)); + // tcgen05.mma.ws.cta_group::1.kind::i8.collector::b1::fill [d_tmem], a_desc, b_desc, idesc, enable_input_d, + // zero_column_mask_desc; + * fn_ptr++ = reinterpret_cast( + static_cast( + cuda::ptx::tcgen05_mma_ws_collector_b1_fill));)); +#endif // __cccl_ptx_isa >= 860 + +#if __cccl_ptx_isa >= 860 + NV_IF_TARGET( + NV_HAS_FEATURE_SM_100a, + ( + // tcgen05.mma.ws.cta_group::1.kind::f16.collector::b1::fill [d_tmem], a_desc, b_desc, idesc, enable_input_d; + * fn_ptr++ = reinterpret_cast( + static_cast( + cuda::ptx::tcgen05_mma_ws_collector_b1_fill)); + // tcgen05.mma.ws.cta_group::1.kind::tf32.collector::b1::fill [d_tmem], a_desc, b_desc, idesc, enable_input_d; + * fn_ptr++ = reinterpret_cast( + static_cast( + cuda::ptx::tcgen05_mma_ws_collector_b1_fill)); + // tcgen05.mma.ws.cta_group::1.kind::f8f6f4.collector::b1::fill [d_tmem], a_desc, b_desc, idesc, + // enable_input_d; + * fn_ptr++ = reinterpret_cast( + static_cast( + cuda::ptx::tcgen05_mma_ws_collector_b1_fill)); + // tcgen05.mma.ws.cta_group::1.kind::i8.collector::b1::fill [d_tmem], a_desc, b_desc, idesc, enable_input_d; + * fn_ptr++ = reinterpret_cast( + static_cast( + cuda::ptx::tcgen05_mma_ws_collector_b1_fill));)); + NV_IF_TARGET( + 
NV_HAS_FEATURE_SM_101a, + ( + // tcgen05.mma.ws.cta_group::1.kind::f16.collector::b1::fill [d_tmem], a_desc, b_desc, idesc, enable_input_d; + * fn_ptr++ = reinterpret_cast( + static_cast( + cuda::ptx::tcgen05_mma_ws_collector_b1_fill)); + // tcgen05.mma.ws.cta_group::1.kind::tf32.collector::b1::fill [d_tmem], a_desc, b_desc, idesc, enable_input_d; + * fn_ptr++ = reinterpret_cast( + static_cast( + cuda::ptx::tcgen05_mma_ws_collector_b1_fill)); + // tcgen05.mma.ws.cta_group::1.kind::f8f6f4.collector::b1::fill [d_tmem], a_desc, b_desc, idesc, + // enable_input_d; + * fn_ptr++ = reinterpret_cast( + static_cast( + cuda::ptx::tcgen05_mma_ws_collector_b1_fill)); + // tcgen05.mma.ws.cta_group::1.kind::i8.collector::b1::fill [d_tmem], a_desc, b_desc, idesc, enable_input_d; + * fn_ptr++ = reinterpret_cast( + static_cast( + cuda::ptx::tcgen05_mma_ws_collector_b1_fill));)); +#endif // __cccl_ptx_isa >= 860 + +#if __cccl_ptx_isa >= 860 + NV_IF_TARGET( + NV_HAS_FEATURE_SM_100a, + ( + // tcgen05.mma.ws.cta_group::1.kind::f16.collector::b1::fill [d_tmem], [a_tmem], b_desc, idesc, enable_input_d, + // zero_column_mask_desc; + * fn_ptr++ = reinterpret_cast( + static_cast( + cuda::ptx::tcgen05_mma_ws_tmem_a_collector_b1_fill)); + // tcgen05.mma.ws.cta_group::1.kind::tf32.collector::b1::fill [d_tmem], [a_tmem], b_desc, idesc, + // enable_input_d, zero_column_mask_desc; + * fn_ptr++ = reinterpret_cast( + static_cast( + cuda::ptx::tcgen05_mma_ws_tmem_a_collector_b1_fill)); + // tcgen05.mma.ws.cta_group::1.kind::f8f6f4.collector::b1::fill [d_tmem], [a_tmem], b_desc, idesc, + // enable_input_d, zero_column_mask_desc; + * fn_ptr++ = reinterpret_cast( + static_cast( + cuda::ptx::tcgen05_mma_ws_tmem_a_collector_b1_fill)); + // tcgen05.mma.ws.cta_group::1.kind::i8.collector::b1::fill [d_tmem], [a_tmem], b_desc, idesc, + // enable_input_d, zero_column_mask_desc; + * fn_ptr++ = reinterpret_cast( + static_cast( + cuda::ptx::tcgen05_mma_ws_tmem_a_collector_b1_fill));)); + NV_IF_TARGET( + NV_HAS_FEATURE_SM_101a, + ( + // tcgen05.mma.ws.cta_group::1.kind::f16.collector::b1::fill [d_tmem], [a_tmem], b_desc, idesc, enable_input_d, + // zero_column_mask_desc; + * fn_ptr++ = reinterpret_cast( + static_cast( + cuda::ptx::tcgen05_mma_ws_tmem_a_collector_b1_fill)); + // tcgen05.mma.ws.cta_group::1.kind::tf32.collector::b1::fill [d_tmem], [a_tmem], b_desc, idesc, + // enable_input_d, zero_column_mask_desc; + * fn_ptr++ = reinterpret_cast( + static_cast( + cuda::ptx::tcgen05_mma_ws_tmem_a_collector_b1_fill)); + // tcgen05.mma.ws.cta_group::1.kind::f8f6f4.collector::b1::fill [d_tmem], [a_tmem], b_desc, idesc, + // enable_input_d, zero_column_mask_desc; + * fn_ptr++ = reinterpret_cast( + static_cast( + cuda::ptx::tcgen05_mma_ws_tmem_a_collector_b1_fill)); + // tcgen05.mma.ws.cta_group::1.kind::i8.collector::b1::fill [d_tmem], [a_tmem], b_desc, idesc, + // enable_input_d, zero_column_mask_desc; + * fn_ptr++ = reinterpret_cast( + static_cast( + cuda::ptx::tcgen05_mma_ws_tmem_a_collector_b1_fill));)); +#endif // __cccl_ptx_isa >= 860 + +#if __cccl_ptx_isa >= 860 + NV_IF_TARGET( + NV_HAS_FEATURE_SM_100a, + ( + // tcgen05.mma.ws.cta_group::1.kind::f16.collector::b1::fill [d_tmem], [a_tmem], b_desc, idesc, enable_input_d; + * fn_ptr++ = reinterpret_cast( + static_cast( + cuda::ptx::tcgen05_mma_ws_tmem_a_collector_b1_fill)); + // tcgen05.mma.ws.cta_group::1.kind::tf32.collector::b1::fill [d_tmem], [a_tmem], b_desc, idesc, + // enable_input_d; + * fn_ptr++ = reinterpret_cast( + static_cast( + 
cuda::ptx::tcgen05_mma_ws_tmem_a_collector_b1_fill)); + // tcgen05.mma.ws.cta_group::1.kind::f8f6f4.collector::b1::fill [d_tmem], [a_tmem], b_desc, idesc, + // enable_input_d; + * fn_ptr++ = reinterpret_cast( + static_cast( + cuda::ptx::tcgen05_mma_ws_tmem_a_collector_b1_fill)); + // tcgen05.mma.ws.cta_group::1.kind::i8.collector::b1::fill [d_tmem], [a_tmem], b_desc, idesc, enable_input_d; + * fn_ptr++ = reinterpret_cast( + static_cast( + cuda::ptx::tcgen05_mma_ws_tmem_a_collector_b1_fill));)); + NV_IF_TARGET( + NV_HAS_FEATURE_SM_101a, + ( + // tcgen05.mma.ws.cta_group::1.kind::f16.collector::b1::fill [d_tmem], [a_tmem], b_desc, idesc, enable_input_d; + * fn_ptr++ = reinterpret_cast( + static_cast( + cuda::ptx::tcgen05_mma_ws_tmem_a_collector_b1_fill)); + // tcgen05.mma.ws.cta_group::1.kind::tf32.collector::b1::fill [d_tmem], [a_tmem], b_desc, idesc, + // enable_input_d; + * fn_ptr++ = reinterpret_cast( + static_cast( + cuda::ptx::tcgen05_mma_ws_tmem_a_collector_b1_fill)); + // tcgen05.mma.ws.cta_group::1.kind::f8f6f4.collector::b1::fill [d_tmem], [a_tmem], b_desc, idesc, + // enable_input_d; + * fn_ptr++ = reinterpret_cast( + static_cast( + cuda::ptx::tcgen05_mma_ws_tmem_a_collector_b1_fill)); + // tcgen05.mma.ws.cta_group::1.kind::i8.collector::b1::fill [d_tmem], [a_tmem], b_desc, idesc, enable_input_d; + * fn_ptr++ = reinterpret_cast( + static_cast( + cuda::ptx::tcgen05_mma_ws_tmem_a_collector_b1_fill));)); +#endif // __cccl_ptx_isa >= 860 + +#if __cccl_ptx_isa >= 860 + NV_IF_TARGET( + NV_HAS_FEATURE_SM_100a, + ( + // tcgen05.mma.ws.cta_group::1.kind::f16.collector::b1::use [d_tmem], a_desc, b_desc, idesc, enable_input_d, + // zero_column_mask_desc; + * fn_ptr++ = reinterpret_cast( + static_cast( + cuda::ptx::tcgen05_mma_ws_collector_b1_use)); + // tcgen05.mma.ws.cta_group::1.kind::tf32.collector::b1::use [d_tmem], a_desc, b_desc, idesc, + // enable_input_d, zero_column_mask_desc; + * fn_ptr++ = reinterpret_cast( + static_cast( + cuda::ptx::tcgen05_mma_ws_collector_b1_use)); + // tcgen05.mma.ws.cta_group::1.kind::f8f6f4.collector::b1::use [d_tmem], a_desc, b_desc, idesc, + // enable_input_d, zero_column_mask_desc; + * fn_ptr++ = reinterpret_cast( + static_cast( + cuda::ptx::tcgen05_mma_ws_collector_b1_use)); + // tcgen05.mma.ws.cta_group::1.kind::i8.collector::b1::use [d_tmem], a_desc, b_desc, idesc, enable_input_d, + // zero_column_mask_desc; + * fn_ptr++ = reinterpret_cast( + static_cast( + cuda::ptx::tcgen05_mma_ws_collector_b1_use));)); + NV_IF_TARGET( + NV_HAS_FEATURE_SM_101a, + ( + // tcgen05.mma.ws.cta_group::1.kind::f16.collector::b1::use [d_tmem], a_desc, b_desc, idesc, enable_input_d, + // zero_column_mask_desc; + * fn_ptr++ = reinterpret_cast( + static_cast( + cuda::ptx::tcgen05_mma_ws_collector_b1_use)); + // tcgen05.mma.ws.cta_group::1.kind::tf32.collector::b1::use [d_tmem], a_desc, b_desc, idesc, + // enable_input_d, zero_column_mask_desc; + * fn_ptr++ = reinterpret_cast( + static_cast( + cuda::ptx::tcgen05_mma_ws_collector_b1_use)); + // tcgen05.mma.ws.cta_group::1.kind::f8f6f4.collector::b1::use [d_tmem], a_desc, b_desc, idesc, + // enable_input_d, zero_column_mask_desc; + * fn_ptr++ = reinterpret_cast( + static_cast( + cuda::ptx::tcgen05_mma_ws_collector_b1_use)); + // tcgen05.mma.ws.cta_group::1.kind::i8.collector::b1::use [d_tmem], a_desc, b_desc, idesc, enable_input_d, + // zero_column_mask_desc; + * fn_ptr++ = reinterpret_cast( + static_cast( + cuda::ptx::tcgen05_mma_ws_collector_b1_use));)); +#endif // __cccl_ptx_isa >= 860 + +#if __cccl_ptx_isa >= 860 + 
NV_IF_TARGET( + NV_HAS_FEATURE_SM_100a, + ( + // tcgen05.mma.ws.cta_group::1.kind::f16.collector::b1::use [d_tmem], a_desc, b_desc, idesc, enable_input_d; + * fn_ptr++ = reinterpret_cast( + static_cast( + cuda::ptx::tcgen05_mma_ws_collector_b1_use)); + // tcgen05.mma.ws.cta_group::1.kind::tf32.collector::b1::use [d_tmem], a_desc, b_desc, idesc, enable_input_d; + * fn_ptr++ = reinterpret_cast( + static_cast( + cuda::ptx::tcgen05_mma_ws_collector_b1_use)); + // tcgen05.mma.ws.cta_group::1.kind::f8f6f4.collector::b1::use [d_tmem], a_desc, b_desc, idesc, + // enable_input_d; + * fn_ptr++ = reinterpret_cast( + static_cast( + cuda::ptx::tcgen05_mma_ws_collector_b1_use)); + // tcgen05.mma.ws.cta_group::1.kind::i8.collector::b1::use [d_tmem], a_desc, b_desc, idesc, enable_input_d; + * fn_ptr++ = reinterpret_cast( + static_cast( + cuda::ptx::tcgen05_mma_ws_collector_b1_use));)); + NV_IF_TARGET( + NV_HAS_FEATURE_SM_101a, + ( + // tcgen05.mma.ws.cta_group::1.kind::f16.collector::b1::use [d_tmem], a_desc, b_desc, idesc, enable_input_d; + * fn_ptr++ = reinterpret_cast( + static_cast( + cuda::ptx::tcgen05_mma_ws_collector_b1_use)); + // tcgen05.mma.ws.cta_group::1.kind::tf32.collector::b1::use [d_tmem], a_desc, b_desc, idesc, enable_input_d; + * fn_ptr++ = reinterpret_cast( + static_cast( + cuda::ptx::tcgen05_mma_ws_collector_b1_use)); + // tcgen05.mma.ws.cta_group::1.kind::f8f6f4.collector::b1::use [d_tmem], a_desc, b_desc, idesc, + // enable_input_d; + * fn_ptr++ = reinterpret_cast( + static_cast( + cuda::ptx::tcgen05_mma_ws_collector_b1_use)); + // tcgen05.mma.ws.cta_group::1.kind::i8.collector::b1::use [d_tmem], a_desc, b_desc, idesc, enable_input_d; + * fn_ptr++ = reinterpret_cast( + static_cast( + cuda::ptx::tcgen05_mma_ws_collector_b1_use));)); +#endif // __cccl_ptx_isa >= 860 + +#if __cccl_ptx_isa >= 860 + NV_IF_TARGET( + NV_HAS_FEATURE_SM_100a, + ( + // tcgen05.mma.ws.cta_group::1.kind::f16.collector::b1::use [d_tmem], [a_tmem], b_desc, idesc, enable_input_d, + // zero_column_mask_desc; + * fn_ptr++ = reinterpret_cast( + static_cast( + cuda::ptx::tcgen05_mma_ws_tmem_a_collector_b1_use)); + // tcgen05.mma.ws.cta_group::1.kind::tf32.collector::b1::use [d_tmem], [a_tmem], b_desc, idesc, + // enable_input_d, zero_column_mask_desc; + * fn_ptr++ = reinterpret_cast( + static_cast( + cuda::ptx::tcgen05_mma_ws_tmem_a_collector_b1_use)); + // tcgen05.mma.ws.cta_group::1.kind::f8f6f4.collector::b1::use [d_tmem], [a_tmem], b_desc, idesc, + // enable_input_d, zero_column_mask_desc; + * fn_ptr++ = reinterpret_cast( + static_cast( + cuda::ptx::tcgen05_mma_ws_tmem_a_collector_b1_use)); + // tcgen05.mma.ws.cta_group::1.kind::i8.collector::b1::use [d_tmem], [a_tmem], b_desc, idesc, + // enable_input_d, zero_column_mask_desc; + * fn_ptr++ = reinterpret_cast( + static_cast( + cuda::ptx::tcgen05_mma_ws_tmem_a_collector_b1_use));)); + NV_IF_TARGET( + NV_HAS_FEATURE_SM_101a, + ( + // tcgen05.mma.ws.cta_group::1.kind::f16.collector::b1::use [d_tmem], [a_tmem], b_desc, idesc, enable_input_d, + // zero_column_mask_desc; + * fn_ptr++ = reinterpret_cast( + static_cast( + cuda::ptx::tcgen05_mma_ws_tmem_a_collector_b1_use)); + // tcgen05.mma.ws.cta_group::1.kind::tf32.collector::b1::use [d_tmem], [a_tmem], b_desc, idesc, + // enable_input_d, zero_column_mask_desc; + * fn_ptr++ = reinterpret_cast( + static_cast( + cuda::ptx::tcgen05_mma_ws_tmem_a_collector_b1_use)); + // tcgen05.mma.ws.cta_group::1.kind::f8f6f4.collector::b1::use [d_tmem], [a_tmem], b_desc, idesc, + // enable_input_d, zero_column_mask_desc; + * fn_ptr++ = 
reinterpret_cast( + static_cast( + cuda::ptx::tcgen05_mma_ws_tmem_a_collector_b1_use)); + // tcgen05.mma.ws.cta_group::1.kind::i8.collector::b1::use [d_tmem], [a_tmem], b_desc, idesc, + // enable_input_d, zero_column_mask_desc; + * fn_ptr++ = reinterpret_cast( + static_cast( + cuda::ptx::tcgen05_mma_ws_tmem_a_collector_b1_use));)); +#endif // __cccl_ptx_isa >= 860 + +#if __cccl_ptx_isa >= 860 + NV_IF_TARGET( + NV_HAS_FEATURE_SM_100a, + ( + // tcgen05.mma.ws.cta_group::1.kind::f16.collector::b1::use [d_tmem], [a_tmem], b_desc, idesc, enable_input_d; + * fn_ptr++ = reinterpret_cast( + static_cast( + cuda::ptx::tcgen05_mma_ws_tmem_a_collector_b1_use)); + // tcgen05.mma.ws.cta_group::1.kind::tf32.collector::b1::use [d_tmem], [a_tmem], b_desc, idesc, + // enable_input_d; + * fn_ptr++ = reinterpret_cast( + static_cast( + cuda::ptx::tcgen05_mma_ws_tmem_a_collector_b1_use)); + // tcgen05.mma.ws.cta_group::1.kind::f8f6f4.collector::b1::use [d_tmem], [a_tmem], b_desc, idesc, + // enable_input_d; + * fn_ptr++ = reinterpret_cast( + static_cast( + cuda::ptx::tcgen05_mma_ws_tmem_a_collector_b1_use)); + // tcgen05.mma.ws.cta_group::1.kind::i8.collector::b1::use [d_tmem], [a_tmem], b_desc, idesc, enable_input_d; + * fn_ptr++ = reinterpret_cast( + static_cast( + cuda::ptx::tcgen05_mma_ws_tmem_a_collector_b1_use));)); + NV_IF_TARGET( + NV_HAS_FEATURE_SM_101a, + ( + // tcgen05.mma.ws.cta_group::1.kind::f16.collector::b1::use [d_tmem], [a_tmem], b_desc, idesc, enable_input_d; + * fn_ptr++ = reinterpret_cast( + static_cast( + cuda::ptx::tcgen05_mma_ws_tmem_a_collector_b1_use)); + // tcgen05.mma.ws.cta_group::1.kind::tf32.collector::b1::use [d_tmem], [a_tmem], b_desc, idesc, + // enable_input_d; + * fn_ptr++ = reinterpret_cast( + static_cast( + cuda::ptx::tcgen05_mma_ws_tmem_a_collector_b1_use)); + // tcgen05.mma.ws.cta_group::1.kind::f8f6f4.collector::b1::use [d_tmem], [a_tmem], b_desc, idesc, + // enable_input_d; + * fn_ptr++ = reinterpret_cast( + static_cast( + cuda::ptx::tcgen05_mma_ws_tmem_a_collector_b1_use)); + // tcgen05.mma.ws.cta_group::1.kind::i8.collector::b1::use [d_tmem], [a_tmem], b_desc, idesc, enable_input_d; + * fn_ptr++ = reinterpret_cast( + static_cast( + cuda::ptx::tcgen05_mma_ws_tmem_a_collector_b1_use));)); +#endif // __cccl_ptx_isa >= 860 + +#if __cccl_ptx_isa >= 860 + NV_IF_TARGET( + NV_HAS_FEATURE_SM_100a, + ( + // tcgen05.mma.ws.cta_group::1.kind::f16.collector::b1::lastuse [d_tmem], a_desc, b_desc, idesc, enable_input_d, + // zero_column_mask_desc; + * fn_ptr++ = reinterpret_cast( + static_cast( + cuda::ptx::tcgen05_mma_ws_collector_b1_lastuse)); + // tcgen05.mma.ws.cta_group::1.kind::tf32.collector::b1::lastuse [d_tmem], a_desc, b_desc, idesc, + // enable_input_d, zero_column_mask_desc; + * fn_ptr++ = reinterpret_cast( + static_cast( + cuda::ptx::tcgen05_mma_ws_collector_b1_lastuse)); + // tcgen05.mma.ws.cta_group::1.kind::f8f6f4.collector::b1::lastuse [d_tmem], a_desc, b_desc, idesc, + // enable_input_d, zero_column_mask_desc; + * fn_ptr++ = reinterpret_cast( + static_cast( + cuda::ptx::tcgen05_mma_ws_collector_b1_lastuse)); + // tcgen05.mma.ws.cta_group::1.kind::i8.collector::b1::lastuse [d_tmem], a_desc, b_desc, idesc, + // enable_input_d, zero_column_mask_desc; + * fn_ptr++ = reinterpret_cast( + static_cast( + cuda::ptx::tcgen05_mma_ws_collector_b1_lastuse));)); + NV_IF_TARGET( + NV_HAS_FEATURE_SM_101a, + ( + // tcgen05.mma.ws.cta_group::1.kind::f16.collector::b1::lastuse [d_tmem], a_desc, b_desc, idesc, enable_input_d, + // zero_column_mask_desc; + * fn_ptr++ = 
reinterpret_cast( + static_cast( + cuda::ptx::tcgen05_mma_ws_collector_b1_lastuse)); + // tcgen05.mma.ws.cta_group::1.kind::tf32.collector::b1::lastuse [d_tmem], a_desc, b_desc, idesc, + // enable_input_d, zero_column_mask_desc; + * fn_ptr++ = reinterpret_cast( + static_cast( + cuda::ptx::tcgen05_mma_ws_collector_b1_lastuse)); + // tcgen05.mma.ws.cta_group::1.kind::f8f6f4.collector::b1::lastuse [d_tmem], a_desc, b_desc, idesc, + // enable_input_d, zero_column_mask_desc; + * fn_ptr++ = reinterpret_cast( + static_cast( + cuda::ptx::tcgen05_mma_ws_collector_b1_lastuse)); + // tcgen05.mma.ws.cta_group::1.kind::i8.collector::b1::lastuse [d_tmem], a_desc, b_desc, idesc, + // enable_input_d, zero_column_mask_desc; + * fn_ptr++ = reinterpret_cast( + static_cast( + cuda::ptx::tcgen05_mma_ws_collector_b1_lastuse));)); +#endif // __cccl_ptx_isa >= 860 + +#if __cccl_ptx_isa >= 860 + NV_IF_TARGET( + NV_HAS_FEATURE_SM_100a, + ( + // tcgen05.mma.ws.cta_group::1.kind::f16.collector::b1::lastuse [d_tmem], a_desc, b_desc, idesc, enable_input_d; + * fn_ptr++ = reinterpret_cast( + static_cast( + cuda::ptx::tcgen05_mma_ws_collector_b1_lastuse)); + // tcgen05.mma.ws.cta_group::1.kind::tf32.collector::b1::lastuse [d_tmem], a_desc, b_desc, idesc, + // enable_input_d; + * fn_ptr++ = reinterpret_cast( + static_cast( + cuda::ptx::tcgen05_mma_ws_collector_b1_lastuse)); + // tcgen05.mma.ws.cta_group::1.kind::f8f6f4.collector::b1::lastuse [d_tmem], a_desc, b_desc, idesc, + // enable_input_d; + * fn_ptr++ = reinterpret_cast( + static_cast( + cuda::ptx::tcgen05_mma_ws_collector_b1_lastuse)); + // tcgen05.mma.ws.cta_group::1.kind::i8.collector::b1::lastuse [d_tmem], a_desc, b_desc, idesc, + // enable_input_d; + * fn_ptr++ = reinterpret_cast( + static_cast( + cuda::ptx::tcgen05_mma_ws_collector_b1_lastuse));)); + NV_IF_TARGET( + NV_HAS_FEATURE_SM_101a, + ( + // tcgen05.mma.ws.cta_group::1.kind::f16.collector::b1::lastuse [d_tmem], a_desc, b_desc, idesc, enable_input_d; + * fn_ptr++ = reinterpret_cast( + static_cast( + cuda::ptx::tcgen05_mma_ws_collector_b1_lastuse)); + // tcgen05.mma.ws.cta_group::1.kind::tf32.collector::b1::lastuse [d_tmem], a_desc, b_desc, idesc, + // enable_input_d; + * fn_ptr++ = reinterpret_cast( + static_cast( + cuda::ptx::tcgen05_mma_ws_collector_b1_lastuse)); + // tcgen05.mma.ws.cta_group::1.kind::f8f6f4.collector::b1::lastuse [d_tmem], a_desc, b_desc, idesc, + // enable_input_d; + * fn_ptr++ = reinterpret_cast( + static_cast( + cuda::ptx::tcgen05_mma_ws_collector_b1_lastuse)); + // tcgen05.mma.ws.cta_group::1.kind::i8.collector::b1::lastuse [d_tmem], a_desc, b_desc, idesc, + // enable_input_d; + * fn_ptr++ = reinterpret_cast( + static_cast( + cuda::ptx::tcgen05_mma_ws_collector_b1_lastuse));)); +#endif // __cccl_ptx_isa >= 860 + +#if __cccl_ptx_isa >= 860 + NV_IF_TARGET( + NV_HAS_FEATURE_SM_100a, + ( + // tcgen05.mma.ws.cta_group::1.kind::f16.collector::b1::lastuse [d_tmem], [a_tmem], b_desc, idesc, + // enable_input_d, zero_column_mask_desc; + * fn_ptr++ = reinterpret_cast( + static_cast( + cuda::ptx::tcgen05_mma_ws_tmem_a_collector_b1_lastuse)); + // tcgen05.mma.ws.cta_group::1.kind::tf32.collector::b1::lastuse [d_tmem], [a_tmem], b_desc, idesc, + // enable_input_d, zero_column_mask_desc; + * fn_ptr++ = reinterpret_cast( + static_cast( + cuda::ptx::tcgen05_mma_ws_tmem_a_collector_b1_lastuse)); + // tcgen05.mma.ws.cta_group::1.kind::f8f6f4.collector::b1::lastuse [d_tmem], [a_tmem], b_desc, idesc, + // enable_input_d, zero_column_mask_desc; + * fn_ptr++ = reinterpret_cast( + static_cast( + 
cuda::ptx::tcgen05_mma_ws_tmem_a_collector_b1_lastuse)); + // tcgen05.mma.ws.cta_group::1.kind::i8.collector::b1::lastuse [d_tmem], [a_tmem], b_desc, idesc, + // enable_input_d, zero_column_mask_desc; + * fn_ptr++ = reinterpret_cast( + static_cast( + cuda::ptx::tcgen05_mma_ws_tmem_a_collector_b1_lastuse));)); + NV_IF_TARGET( + NV_HAS_FEATURE_SM_101a, + ( + // tcgen05.mma.ws.cta_group::1.kind::f16.collector::b1::lastuse [d_tmem], [a_tmem], b_desc, idesc, + // enable_input_d, zero_column_mask_desc; + * fn_ptr++ = reinterpret_cast( + static_cast( + cuda::ptx::tcgen05_mma_ws_tmem_a_collector_b1_lastuse)); + // tcgen05.mma.ws.cta_group::1.kind::tf32.collector::b1::lastuse [d_tmem], [a_tmem], b_desc, idesc, + // enable_input_d, zero_column_mask_desc; + * fn_ptr++ = reinterpret_cast( + static_cast( + cuda::ptx::tcgen05_mma_ws_tmem_a_collector_b1_lastuse)); + // tcgen05.mma.ws.cta_group::1.kind::f8f6f4.collector::b1::lastuse [d_tmem], [a_tmem], b_desc, idesc, + // enable_input_d, zero_column_mask_desc; + * fn_ptr++ = reinterpret_cast( + static_cast( + cuda::ptx::tcgen05_mma_ws_tmem_a_collector_b1_lastuse)); + // tcgen05.mma.ws.cta_group::1.kind::i8.collector::b1::lastuse [d_tmem], [a_tmem], b_desc, idesc, + // enable_input_d, zero_column_mask_desc; + * fn_ptr++ = reinterpret_cast( + static_cast( + cuda::ptx::tcgen05_mma_ws_tmem_a_collector_b1_lastuse));)); +#endif // __cccl_ptx_isa >= 860 + +#if __cccl_ptx_isa >= 860 + NV_IF_TARGET( + NV_HAS_FEATURE_SM_100a, + ( + // tcgen05.mma.ws.cta_group::1.kind::f16.collector::b1::lastuse [d_tmem], [a_tmem], b_desc, idesc, + // enable_input_d; + * fn_ptr++ = reinterpret_cast( + static_cast( + cuda::ptx::tcgen05_mma_ws_tmem_a_collector_b1_lastuse)); + // tcgen05.mma.ws.cta_group::1.kind::tf32.collector::b1::lastuse [d_tmem], [a_tmem], b_desc, idesc, + // enable_input_d; + * fn_ptr++ = reinterpret_cast( + static_cast( + cuda::ptx::tcgen05_mma_ws_tmem_a_collector_b1_lastuse)); + // tcgen05.mma.ws.cta_group::1.kind::f8f6f4.collector::b1::lastuse [d_tmem], [a_tmem], b_desc, idesc, + // enable_input_d; + * fn_ptr++ = reinterpret_cast( + static_cast( + cuda::ptx::tcgen05_mma_ws_tmem_a_collector_b1_lastuse)); + // tcgen05.mma.ws.cta_group::1.kind::i8.collector::b1::lastuse [d_tmem], [a_tmem], b_desc, idesc, + // enable_input_d; + * fn_ptr++ = reinterpret_cast( + static_cast( + cuda::ptx::tcgen05_mma_ws_tmem_a_collector_b1_lastuse));)); + NV_IF_TARGET( + NV_HAS_FEATURE_SM_101a, + ( + // tcgen05.mma.ws.cta_group::1.kind::f16.collector::b1::lastuse [d_tmem], [a_tmem], b_desc, idesc, + // enable_input_d; + * fn_ptr++ = reinterpret_cast( + static_cast( + cuda::ptx::tcgen05_mma_ws_tmem_a_collector_b1_lastuse)); + // tcgen05.mma.ws.cta_group::1.kind::tf32.collector::b1::lastuse [d_tmem], [a_tmem], b_desc, idesc, + // enable_input_d; + * fn_ptr++ = reinterpret_cast( + static_cast( + cuda::ptx::tcgen05_mma_ws_tmem_a_collector_b1_lastuse)); + // tcgen05.mma.ws.cta_group::1.kind::f8f6f4.collector::b1::lastuse [d_tmem], [a_tmem], b_desc, idesc, + // enable_input_d; + * fn_ptr++ = reinterpret_cast( + static_cast( + cuda::ptx::tcgen05_mma_ws_tmem_a_collector_b1_lastuse)); + // tcgen05.mma.ws.cta_group::1.kind::i8.collector::b1::lastuse [d_tmem], [a_tmem], b_desc, idesc, + // enable_input_d; + * fn_ptr++ = reinterpret_cast( + static_cast( + cuda::ptx::tcgen05_mma_ws_tmem_a_collector_b1_lastuse));)); +#endif // __cccl_ptx_isa >= 860 + +#if __cccl_ptx_isa >= 860 + NV_IF_TARGET( + NV_HAS_FEATURE_SM_100a, + ( + // tcgen05.mma.ws.cta_group::1.kind::f16.collector::b1::discard [d_tmem], 
a_desc, b_desc, idesc, enable_input_d, + // zero_column_mask_desc; + * fn_ptr++ = reinterpret_cast( + static_cast( + cuda::ptx::tcgen05_mma_ws_collector_b1_discard)); + // tcgen05.mma.ws.cta_group::1.kind::tf32.collector::b1::discard [d_tmem], a_desc, b_desc, idesc, + // enable_input_d, zero_column_mask_desc; + * fn_ptr++ = reinterpret_cast( + static_cast( + cuda::ptx::tcgen05_mma_ws_collector_b1_discard)); + // tcgen05.mma.ws.cta_group::1.kind::f8f6f4.collector::b1::discard [d_tmem], a_desc, b_desc, idesc, + // enable_input_d, zero_column_mask_desc; + * fn_ptr++ = reinterpret_cast( + static_cast( + cuda::ptx::tcgen05_mma_ws_collector_b1_discard)); + // tcgen05.mma.ws.cta_group::1.kind::i8.collector::b1::discard [d_tmem], a_desc, b_desc, idesc, + // enable_input_d, zero_column_mask_desc; + * fn_ptr++ = reinterpret_cast( + static_cast( + cuda::ptx::tcgen05_mma_ws_collector_b1_discard));)); + NV_IF_TARGET( + NV_HAS_FEATURE_SM_101a, + ( + // tcgen05.mma.ws.cta_group::1.kind::f16.collector::b1::discard [d_tmem], a_desc, b_desc, idesc, enable_input_d, + // zero_column_mask_desc; + * fn_ptr++ = reinterpret_cast( + static_cast( + cuda::ptx::tcgen05_mma_ws_collector_b1_discard)); + // tcgen05.mma.ws.cta_group::1.kind::tf32.collector::b1::discard [d_tmem], a_desc, b_desc, idesc, + // enable_input_d, zero_column_mask_desc; + * fn_ptr++ = reinterpret_cast( + static_cast( + cuda::ptx::tcgen05_mma_ws_collector_b1_discard)); + // tcgen05.mma.ws.cta_group::1.kind::f8f6f4.collector::b1::discard [d_tmem], a_desc, b_desc, idesc, + // enable_input_d, zero_column_mask_desc; + * fn_ptr++ = reinterpret_cast( + static_cast( + cuda::ptx::tcgen05_mma_ws_collector_b1_discard)); + // tcgen05.mma.ws.cta_group::1.kind::i8.collector::b1::discard [d_tmem], a_desc, b_desc, idesc, + // enable_input_d, zero_column_mask_desc; + * fn_ptr++ = reinterpret_cast( + static_cast( + cuda::ptx::tcgen05_mma_ws_collector_b1_discard));)); +#endif // __cccl_ptx_isa >= 860 + +#if __cccl_ptx_isa >= 860 + NV_IF_TARGET( + NV_HAS_FEATURE_SM_100a, + ( + // tcgen05.mma.ws.cta_group::1.kind::f16.collector::b1::discard [d_tmem], a_desc, b_desc, idesc, enable_input_d; + * fn_ptr++ = reinterpret_cast( + static_cast( + cuda::ptx::tcgen05_mma_ws_collector_b1_discard)); + // tcgen05.mma.ws.cta_group::1.kind::tf32.collector::b1::discard [d_tmem], a_desc, b_desc, idesc, + // enable_input_d; + * fn_ptr++ = reinterpret_cast( + static_cast( + cuda::ptx::tcgen05_mma_ws_collector_b1_discard)); + // tcgen05.mma.ws.cta_group::1.kind::f8f6f4.collector::b1::discard [d_tmem], a_desc, b_desc, idesc, + // enable_input_d; + * fn_ptr++ = reinterpret_cast( + static_cast( + cuda::ptx::tcgen05_mma_ws_collector_b1_discard)); + // tcgen05.mma.ws.cta_group::1.kind::i8.collector::b1::discard [d_tmem], a_desc, b_desc, idesc, + // enable_input_d; + * fn_ptr++ = reinterpret_cast( + static_cast( + cuda::ptx::tcgen05_mma_ws_collector_b1_discard));)); + NV_IF_TARGET( + NV_HAS_FEATURE_SM_101a, + ( + // tcgen05.mma.ws.cta_group::1.kind::f16.collector::b1::discard [d_tmem], a_desc, b_desc, idesc, enable_input_d; + * fn_ptr++ = reinterpret_cast( + static_cast( + cuda::ptx::tcgen05_mma_ws_collector_b1_discard)); + // tcgen05.mma.ws.cta_group::1.kind::tf32.collector::b1::discard [d_tmem], a_desc, b_desc, idesc, + // enable_input_d; + * fn_ptr++ = reinterpret_cast( + static_cast( + cuda::ptx::tcgen05_mma_ws_collector_b1_discard)); + // tcgen05.mma.ws.cta_group::1.kind::f8f6f4.collector::b1::discard [d_tmem], a_desc, b_desc, idesc, + // enable_input_d; + * fn_ptr++ = reinterpret_cast( + 
static_cast( + cuda::ptx::tcgen05_mma_ws_collector_b1_discard)); + // tcgen05.mma.ws.cta_group::1.kind::i8.collector::b1::discard [d_tmem], a_desc, b_desc, idesc, + // enable_input_d; + * fn_ptr++ = reinterpret_cast( + static_cast( + cuda::ptx::tcgen05_mma_ws_collector_b1_discard));)); +#endif // __cccl_ptx_isa >= 860 + +#if __cccl_ptx_isa >= 860 + NV_IF_TARGET( + NV_HAS_FEATURE_SM_100a, + ( + // tcgen05.mma.ws.cta_group::1.kind::f16.collector::b1::discard [d_tmem], [a_tmem], b_desc, idesc, + // enable_input_d, zero_column_mask_desc; + * fn_ptr++ = reinterpret_cast( + static_cast( + cuda::ptx::tcgen05_mma_ws_tmem_a_collector_b1_discard)); + // tcgen05.mma.ws.cta_group::1.kind::tf32.collector::b1::discard [d_tmem], [a_tmem], b_desc, idesc, + // enable_input_d, zero_column_mask_desc; + * fn_ptr++ = reinterpret_cast( + static_cast( + cuda::ptx::tcgen05_mma_ws_tmem_a_collector_b1_discard)); + // tcgen05.mma.ws.cta_group::1.kind::f8f6f4.collector::b1::discard [d_tmem], [a_tmem], b_desc, idesc, + // enable_input_d, zero_column_mask_desc; + * fn_ptr++ = reinterpret_cast( + static_cast( + cuda::ptx::tcgen05_mma_ws_tmem_a_collector_b1_discard)); + // tcgen05.mma.ws.cta_group::1.kind::i8.collector::b1::discard [d_tmem], [a_tmem], b_desc, idesc, + // enable_input_d, zero_column_mask_desc; + * fn_ptr++ = reinterpret_cast( + static_cast( + cuda::ptx::tcgen05_mma_ws_tmem_a_collector_b1_discard));)); + NV_IF_TARGET( + NV_HAS_FEATURE_SM_101a, + ( + // tcgen05.mma.ws.cta_group::1.kind::f16.collector::b1::discard [d_tmem], [a_tmem], b_desc, idesc, + // enable_input_d, zero_column_mask_desc; + * fn_ptr++ = reinterpret_cast( + static_cast( + cuda::ptx::tcgen05_mma_ws_tmem_a_collector_b1_discard)); + // tcgen05.mma.ws.cta_group::1.kind::tf32.collector::b1::discard [d_tmem], [a_tmem], b_desc, idesc, + // enable_input_d, zero_column_mask_desc; + * fn_ptr++ = reinterpret_cast( + static_cast( + cuda::ptx::tcgen05_mma_ws_tmem_a_collector_b1_discard)); + // tcgen05.mma.ws.cta_group::1.kind::f8f6f4.collector::b1::discard [d_tmem], [a_tmem], b_desc, idesc, + // enable_input_d, zero_column_mask_desc; + * fn_ptr++ = reinterpret_cast( + static_cast( + cuda::ptx::tcgen05_mma_ws_tmem_a_collector_b1_discard)); + // tcgen05.mma.ws.cta_group::1.kind::i8.collector::b1::discard [d_tmem], [a_tmem], b_desc, idesc, + // enable_input_d, zero_column_mask_desc; + * fn_ptr++ = reinterpret_cast( + static_cast( + cuda::ptx::tcgen05_mma_ws_tmem_a_collector_b1_discard));)); +#endif // __cccl_ptx_isa >= 860 + +#if __cccl_ptx_isa >= 860 + NV_IF_TARGET( + NV_HAS_FEATURE_SM_100a, + ( + // tcgen05.mma.ws.cta_group::1.kind::f16.collector::b1::discard [d_tmem], [a_tmem], b_desc, idesc, + // enable_input_d; + * fn_ptr++ = reinterpret_cast( + static_cast( + cuda::ptx::tcgen05_mma_ws_tmem_a_collector_b1_discard)); + // tcgen05.mma.ws.cta_group::1.kind::tf32.collector::b1::discard [d_tmem], [a_tmem], b_desc, idesc, + // enable_input_d; + * fn_ptr++ = reinterpret_cast( + static_cast( + cuda::ptx::tcgen05_mma_ws_tmem_a_collector_b1_discard)); + // tcgen05.mma.ws.cta_group::1.kind::f8f6f4.collector::b1::discard [d_tmem], [a_tmem], b_desc, idesc, + // enable_input_d; + * fn_ptr++ = reinterpret_cast( + static_cast( + cuda::ptx::tcgen05_mma_ws_tmem_a_collector_b1_discard)); + // tcgen05.mma.ws.cta_group::1.kind::i8.collector::b1::discard [d_tmem], [a_tmem], b_desc, idesc, + // enable_input_d; + * fn_ptr++ = reinterpret_cast( + static_cast( + cuda::ptx::tcgen05_mma_ws_tmem_a_collector_b1_discard));)); + NV_IF_TARGET( + NV_HAS_FEATURE_SM_101a, + ( + // 
tcgen05.mma.ws.cta_group::1.kind::f16.collector::b1::discard [d_tmem], [a_tmem], b_desc, idesc, + // enable_input_d; + * fn_ptr++ = reinterpret_cast( + static_cast( + cuda::ptx::tcgen05_mma_ws_tmem_a_collector_b1_discard)); + // tcgen05.mma.ws.cta_group::1.kind::tf32.collector::b1::discard [d_tmem], [a_tmem], b_desc, idesc, + // enable_input_d; + * fn_ptr++ = reinterpret_cast( + static_cast( + cuda::ptx::tcgen05_mma_ws_tmem_a_collector_b1_discard)); + // tcgen05.mma.ws.cta_group::1.kind::f8f6f4.collector::b1::discard [d_tmem], [a_tmem], b_desc, idesc, + // enable_input_d; + * fn_ptr++ = reinterpret_cast( + static_cast( + cuda::ptx::tcgen05_mma_ws_tmem_a_collector_b1_discard)); + // tcgen05.mma.ws.cta_group::1.kind::i8.collector::b1::discard [d_tmem], [a_tmem], b_desc, idesc, + // enable_input_d; + * fn_ptr++ = reinterpret_cast( + static_cast( + cuda::ptx::tcgen05_mma_ws_tmem_a_collector_b1_discard));)); +#endif // __cccl_ptx_isa >= 860 + +#if __cccl_ptx_isa >= 860 + NV_IF_TARGET( + NV_HAS_FEATURE_SM_100a, + ( + // tcgen05.mma.ws.cta_group::1.kind::f16.collector::b2::fill [d_tmem], a_desc, b_desc, idesc, enable_input_d, + // zero_column_mask_desc; + * fn_ptr++ = reinterpret_cast( + static_cast( + cuda::ptx::tcgen05_mma_ws_collector_b2_fill)); + // tcgen05.mma.ws.cta_group::1.kind::tf32.collector::b2::fill [d_tmem], a_desc, b_desc, idesc, + // enable_input_d, zero_column_mask_desc; + * fn_ptr++ = reinterpret_cast( + static_cast( + cuda::ptx::tcgen05_mma_ws_collector_b2_fill)); + // tcgen05.mma.ws.cta_group::1.kind::f8f6f4.collector::b2::fill [d_tmem], a_desc, b_desc, idesc, + // enable_input_d, zero_column_mask_desc; + * fn_ptr++ = reinterpret_cast( + static_cast( + cuda::ptx::tcgen05_mma_ws_collector_b2_fill)); + // tcgen05.mma.ws.cta_group::1.kind::i8.collector::b2::fill [d_tmem], a_desc, b_desc, idesc, enable_input_d, + // zero_column_mask_desc; + * fn_ptr++ = reinterpret_cast( + static_cast( + cuda::ptx::tcgen05_mma_ws_collector_b2_fill));)); + NV_IF_TARGET( + NV_HAS_FEATURE_SM_101a, + ( + // tcgen05.mma.ws.cta_group::1.kind::f16.collector::b2::fill [d_tmem], a_desc, b_desc, idesc, enable_input_d, + // zero_column_mask_desc; + * fn_ptr++ = reinterpret_cast( + static_cast( + cuda::ptx::tcgen05_mma_ws_collector_b2_fill)); + // tcgen05.mma.ws.cta_group::1.kind::tf32.collector::b2::fill [d_tmem], a_desc, b_desc, idesc, + // enable_input_d, zero_column_mask_desc; + * fn_ptr++ = reinterpret_cast( + static_cast( + cuda::ptx::tcgen05_mma_ws_collector_b2_fill)); + // tcgen05.mma.ws.cta_group::1.kind::f8f6f4.collector::b2::fill [d_tmem], a_desc, b_desc, idesc, + // enable_input_d, zero_column_mask_desc; + * fn_ptr++ = reinterpret_cast( + static_cast( + cuda::ptx::tcgen05_mma_ws_collector_b2_fill)); + // tcgen05.mma.ws.cta_group::1.kind::i8.collector::b2::fill [d_tmem], a_desc, b_desc, idesc, enable_input_d, + // zero_column_mask_desc; + * fn_ptr++ = reinterpret_cast( + static_cast( + cuda::ptx::tcgen05_mma_ws_collector_b2_fill));)); +#endif // __cccl_ptx_isa >= 860 + +#if __cccl_ptx_isa >= 860 + NV_IF_TARGET( + NV_HAS_FEATURE_SM_100a, + ( + // tcgen05.mma.ws.cta_group::1.kind::f16.collector::b2::fill [d_tmem], a_desc, b_desc, idesc, enable_input_d; + * fn_ptr++ = reinterpret_cast( + static_cast( + cuda::ptx::tcgen05_mma_ws_collector_b2_fill)); + // tcgen05.mma.ws.cta_group::1.kind::tf32.collector::b2::fill [d_tmem], a_desc, b_desc, idesc, enable_input_d; + * fn_ptr++ = reinterpret_cast( + static_cast( + cuda::ptx::tcgen05_mma_ws_collector_b2_fill)); + // 
tcgen05.mma.ws.cta_group::1.kind::f8f6f4.collector::b2::fill [d_tmem], a_desc, b_desc, idesc, + // enable_input_d; + * fn_ptr++ = reinterpret_cast( + static_cast( + cuda::ptx::tcgen05_mma_ws_collector_b2_fill)); + // tcgen05.mma.ws.cta_group::1.kind::i8.collector::b2::fill [d_tmem], a_desc, b_desc, idesc, enable_input_d; + * fn_ptr++ = reinterpret_cast( + static_cast( + cuda::ptx::tcgen05_mma_ws_collector_b2_fill));)); + NV_IF_TARGET( + NV_HAS_FEATURE_SM_101a, + ( + // tcgen05.mma.ws.cta_group::1.kind::f16.collector::b2::fill [d_tmem], a_desc, b_desc, idesc, enable_input_d; + * fn_ptr++ = reinterpret_cast( + static_cast( + cuda::ptx::tcgen05_mma_ws_collector_b2_fill)); + // tcgen05.mma.ws.cta_group::1.kind::tf32.collector::b2::fill [d_tmem], a_desc, b_desc, idesc, enable_input_d; + * fn_ptr++ = reinterpret_cast( + static_cast( + cuda::ptx::tcgen05_mma_ws_collector_b2_fill)); + // tcgen05.mma.ws.cta_group::1.kind::f8f6f4.collector::b2::fill [d_tmem], a_desc, b_desc, idesc, + // enable_input_d; + * fn_ptr++ = reinterpret_cast( + static_cast( + cuda::ptx::tcgen05_mma_ws_collector_b2_fill)); + // tcgen05.mma.ws.cta_group::1.kind::i8.collector::b2::fill [d_tmem], a_desc, b_desc, idesc, enable_input_d; + * fn_ptr++ = reinterpret_cast( + static_cast( + cuda::ptx::tcgen05_mma_ws_collector_b2_fill));)); +#endif // __cccl_ptx_isa >= 860 + +#if __cccl_ptx_isa >= 860 + NV_IF_TARGET( + NV_HAS_FEATURE_SM_100a, + ( + // tcgen05.mma.ws.cta_group::1.kind::f16.collector::b2::fill [d_tmem], [a_tmem], b_desc, idesc, enable_input_d, + // zero_column_mask_desc; + * fn_ptr++ = reinterpret_cast( + static_cast( + cuda::ptx::tcgen05_mma_ws_tmem_a_collector_b2_fill)); + // tcgen05.mma.ws.cta_group::1.kind::tf32.collector::b2::fill [d_tmem], [a_tmem], b_desc, idesc, + // enable_input_d, zero_column_mask_desc; + * fn_ptr++ = reinterpret_cast( + static_cast( + cuda::ptx::tcgen05_mma_ws_tmem_a_collector_b2_fill)); + // tcgen05.mma.ws.cta_group::1.kind::f8f6f4.collector::b2::fill [d_tmem], [a_tmem], b_desc, idesc, + // enable_input_d, zero_column_mask_desc; + * fn_ptr++ = reinterpret_cast( + static_cast( + cuda::ptx::tcgen05_mma_ws_tmem_a_collector_b2_fill)); + // tcgen05.mma.ws.cta_group::1.kind::i8.collector::b2::fill [d_tmem], [a_tmem], b_desc, idesc, + // enable_input_d, zero_column_mask_desc; + * fn_ptr++ = reinterpret_cast( + static_cast( + cuda::ptx::tcgen05_mma_ws_tmem_a_collector_b2_fill));)); + NV_IF_TARGET( + NV_HAS_FEATURE_SM_101a, + ( + // tcgen05.mma.ws.cta_group::1.kind::f16.collector::b2::fill [d_tmem], [a_tmem], b_desc, idesc, enable_input_d, + // zero_column_mask_desc; + * fn_ptr++ = reinterpret_cast( + static_cast( + cuda::ptx::tcgen05_mma_ws_tmem_a_collector_b2_fill)); + // tcgen05.mma.ws.cta_group::1.kind::tf32.collector::b2::fill [d_tmem], [a_tmem], b_desc, idesc, + // enable_input_d, zero_column_mask_desc; + * fn_ptr++ = reinterpret_cast( + static_cast( + cuda::ptx::tcgen05_mma_ws_tmem_a_collector_b2_fill)); + // tcgen05.mma.ws.cta_group::1.kind::f8f6f4.collector::b2::fill [d_tmem], [a_tmem], b_desc, idesc, + // enable_input_d, zero_column_mask_desc; + * fn_ptr++ = reinterpret_cast( + static_cast( + cuda::ptx::tcgen05_mma_ws_tmem_a_collector_b2_fill)); + // tcgen05.mma.ws.cta_group::1.kind::i8.collector::b2::fill [d_tmem], [a_tmem], b_desc, idesc, + // enable_input_d, zero_column_mask_desc; + * fn_ptr++ = reinterpret_cast( + static_cast( + cuda::ptx::tcgen05_mma_ws_tmem_a_collector_b2_fill));)); +#endif // __cccl_ptx_isa >= 860 + +#if __cccl_ptx_isa >= 860 + NV_IF_TARGET( + NV_HAS_FEATURE_SM_100a, 
+ ( + // tcgen05.mma.ws.cta_group::1.kind::f16.collector::b2::fill [d_tmem], [a_tmem], b_desc, idesc, enable_input_d; + * fn_ptr++ = reinterpret_cast( + static_cast( + cuda::ptx::tcgen05_mma_ws_tmem_a_collector_b2_fill)); + // tcgen05.mma.ws.cta_group::1.kind::tf32.collector::b2::fill [d_tmem], [a_tmem], b_desc, idesc, + // enable_input_d; + * fn_ptr++ = reinterpret_cast( + static_cast( + cuda::ptx::tcgen05_mma_ws_tmem_a_collector_b2_fill)); + // tcgen05.mma.ws.cta_group::1.kind::f8f6f4.collector::b2::fill [d_tmem], [a_tmem], b_desc, idesc, + // enable_input_d; + * fn_ptr++ = reinterpret_cast( + static_cast( + cuda::ptx::tcgen05_mma_ws_tmem_a_collector_b2_fill)); + // tcgen05.mma.ws.cta_group::1.kind::i8.collector::b2::fill [d_tmem], [a_tmem], b_desc, idesc, enable_input_d; + * fn_ptr++ = reinterpret_cast( + static_cast( + cuda::ptx::tcgen05_mma_ws_tmem_a_collector_b2_fill));)); + NV_IF_TARGET( + NV_HAS_FEATURE_SM_101a, + ( + // tcgen05.mma.ws.cta_group::1.kind::f16.collector::b2::fill [d_tmem], [a_tmem], b_desc, idesc, enable_input_d; + * fn_ptr++ = reinterpret_cast( + static_cast( + cuda::ptx::tcgen05_mma_ws_tmem_a_collector_b2_fill)); + // tcgen05.mma.ws.cta_group::1.kind::tf32.collector::b2::fill [d_tmem], [a_tmem], b_desc, idesc, + // enable_input_d; + * fn_ptr++ = reinterpret_cast( + static_cast( + cuda::ptx::tcgen05_mma_ws_tmem_a_collector_b2_fill)); + // tcgen05.mma.ws.cta_group::1.kind::f8f6f4.collector::b2::fill [d_tmem], [a_tmem], b_desc, idesc, + // enable_input_d; + * fn_ptr++ = reinterpret_cast( + static_cast( + cuda::ptx::tcgen05_mma_ws_tmem_a_collector_b2_fill)); + // tcgen05.mma.ws.cta_group::1.kind::i8.collector::b2::fill [d_tmem], [a_tmem], b_desc, idesc, enable_input_d; + * fn_ptr++ = reinterpret_cast( + static_cast( + cuda::ptx::tcgen05_mma_ws_tmem_a_collector_b2_fill));)); +#endif // __cccl_ptx_isa >= 860 + +#if __cccl_ptx_isa >= 860 + NV_IF_TARGET( + NV_HAS_FEATURE_SM_100a, + ( + // tcgen05.mma.ws.cta_group::1.kind::f16.collector::b2::use [d_tmem], a_desc, b_desc, idesc, enable_input_d, + // zero_column_mask_desc; + * fn_ptr++ = reinterpret_cast( + static_cast( + cuda::ptx::tcgen05_mma_ws_collector_b2_use)); + // tcgen05.mma.ws.cta_group::1.kind::tf32.collector::b2::use [d_tmem], a_desc, b_desc, idesc, + // enable_input_d, zero_column_mask_desc; + * fn_ptr++ = reinterpret_cast( + static_cast( + cuda::ptx::tcgen05_mma_ws_collector_b2_use)); + // tcgen05.mma.ws.cta_group::1.kind::f8f6f4.collector::b2::use [d_tmem], a_desc, b_desc, idesc, + // enable_input_d, zero_column_mask_desc; + * fn_ptr++ = reinterpret_cast( + static_cast( + cuda::ptx::tcgen05_mma_ws_collector_b2_use)); + // tcgen05.mma.ws.cta_group::1.kind::i8.collector::b2::use [d_tmem], a_desc, b_desc, idesc, enable_input_d, + // zero_column_mask_desc; + * fn_ptr++ = reinterpret_cast( + static_cast( + cuda::ptx::tcgen05_mma_ws_collector_b2_use));)); + NV_IF_TARGET( + NV_HAS_FEATURE_SM_101a, + ( + // tcgen05.mma.ws.cta_group::1.kind::f16.collector::b2::use [d_tmem], a_desc, b_desc, idesc, enable_input_d, + // zero_column_mask_desc; + * fn_ptr++ = reinterpret_cast( + static_cast( + cuda::ptx::tcgen05_mma_ws_collector_b2_use)); + // tcgen05.mma.ws.cta_group::1.kind::tf32.collector::b2::use [d_tmem], a_desc, b_desc, idesc, + // enable_input_d, zero_column_mask_desc; + * fn_ptr++ = reinterpret_cast( + static_cast( + cuda::ptx::tcgen05_mma_ws_collector_b2_use)); + // tcgen05.mma.ws.cta_group::1.kind::f8f6f4.collector::b2::use [d_tmem], a_desc, b_desc, idesc, + // enable_input_d, zero_column_mask_desc; + * fn_ptr++ = 
reinterpret_cast( + static_cast( + cuda::ptx::tcgen05_mma_ws_collector_b2_use)); + // tcgen05.mma.ws.cta_group::1.kind::i8.collector::b2::use [d_tmem], a_desc, b_desc, idesc, enable_input_d, + // zero_column_mask_desc; + * fn_ptr++ = reinterpret_cast( + static_cast( + cuda::ptx::tcgen05_mma_ws_collector_b2_use));)); +#endif // __cccl_ptx_isa >= 860 + +#if __cccl_ptx_isa >= 860 + NV_IF_TARGET( + NV_HAS_FEATURE_SM_100a, + ( + // tcgen05.mma.ws.cta_group::1.kind::f16.collector::b2::use [d_tmem], a_desc, b_desc, idesc, enable_input_d; + * fn_ptr++ = reinterpret_cast( + static_cast( + cuda::ptx::tcgen05_mma_ws_collector_b2_use)); + // tcgen05.mma.ws.cta_group::1.kind::tf32.collector::b2::use [d_tmem], a_desc, b_desc, idesc, enable_input_d; + * fn_ptr++ = reinterpret_cast( + static_cast( + cuda::ptx::tcgen05_mma_ws_collector_b2_use)); + // tcgen05.mma.ws.cta_group::1.kind::f8f6f4.collector::b2::use [d_tmem], a_desc, b_desc, idesc, + // enable_input_d; + * fn_ptr++ = reinterpret_cast( + static_cast( + cuda::ptx::tcgen05_mma_ws_collector_b2_use)); + // tcgen05.mma.ws.cta_group::1.kind::i8.collector::b2::use [d_tmem], a_desc, b_desc, idesc, enable_input_d; + * fn_ptr++ = reinterpret_cast( + static_cast( + cuda::ptx::tcgen05_mma_ws_collector_b2_use));)); + NV_IF_TARGET( + NV_HAS_FEATURE_SM_101a, + ( + // tcgen05.mma.ws.cta_group::1.kind::f16.collector::b2::use [d_tmem], a_desc, b_desc, idesc, enable_input_d; + * fn_ptr++ = reinterpret_cast( + static_cast( + cuda::ptx::tcgen05_mma_ws_collector_b2_use)); + // tcgen05.mma.ws.cta_group::1.kind::tf32.collector::b2::use [d_tmem], a_desc, b_desc, idesc, enable_input_d; + * fn_ptr++ = reinterpret_cast( + static_cast( + cuda::ptx::tcgen05_mma_ws_collector_b2_use)); + // tcgen05.mma.ws.cta_group::1.kind::f8f6f4.collector::b2::use [d_tmem], a_desc, b_desc, idesc, + // enable_input_d; + * fn_ptr++ = reinterpret_cast( + static_cast( + cuda::ptx::tcgen05_mma_ws_collector_b2_use)); + // tcgen05.mma.ws.cta_group::1.kind::i8.collector::b2::use [d_tmem], a_desc, b_desc, idesc, enable_input_d; + * fn_ptr++ = reinterpret_cast( + static_cast( + cuda::ptx::tcgen05_mma_ws_collector_b2_use));)); +#endif // __cccl_ptx_isa >= 860 + +#if __cccl_ptx_isa >= 860 + NV_IF_TARGET( + NV_HAS_FEATURE_SM_100a, + ( + // tcgen05.mma.ws.cta_group::1.kind::f16.collector::b2::use [d_tmem], [a_tmem], b_desc, idesc, enable_input_d, + // zero_column_mask_desc; + * fn_ptr++ = reinterpret_cast( + static_cast( + cuda::ptx::tcgen05_mma_ws_tmem_a_collector_b2_use)); + // tcgen05.mma.ws.cta_group::1.kind::tf32.collector::b2::use [d_tmem], [a_tmem], b_desc, idesc, + // enable_input_d, zero_column_mask_desc; + * fn_ptr++ = reinterpret_cast( + static_cast( + cuda::ptx::tcgen05_mma_ws_tmem_a_collector_b2_use)); + // tcgen05.mma.ws.cta_group::1.kind::f8f6f4.collector::b2::use [d_tmem], [a_tmem], b_desc, idesc, + // enable_input_d, zero_column_mask_desc; + * fn_ptr++ = reinterpret_cast( + static_cast( + cuda::ptx::tcgen05_mma_ws_tmem_a_collector_b2_use)); + // tcgen05.mma.ws.cta_group::1.kind::i8.collector::b2::use [d_tmem], [a_tmem], b_desc, idesc, + // enable_input_d, zero_column_mask_desc; + * fn_ptr++ = reinterpret_cast( + static_cast( + cuda::ptx::tcgen05_mma_ws_tmem_a_collector_b2_use));)); + NV_IF_TARGET( + NV_HAS_FEATURE_SM_101a, + ( + // tcgen05.mma.ws.cta_group::1.kind::f16.collector::b2::use [d_tmem], [a_tmem], b_desc, idesc, enable_input_d, + // zero_column_mask_desc; + * fn_ptr++ = reinterpret_cast( + static_cast( + cuda::ptx::tcgen05_mma_ws_tmem_a_collector_b2_use)); + // 
tcgen05.mma.ws.cta_group::1.kind::tf32.collector::b2::use [d_tmem], [a_tmem], b_desc, idesc, + // enable_input_d, zero_column_mask_desc; + * fn_ptr++ = reinterpret_cast( + static_cast( + cuda::ptx::tcgen05_mma_ws_tmem_a_collector_b2_use)); + // tcgen05.mma.ws.cta_group::1.kind::f8f6f4.collector::b2::use [d_tmem], [a_tmem], b_desc, idesc, + // enable_input_d, zero_column_mask_desc; + * fn_ptr++ = reinterpret_cast( + static_cast( + cuda::ptx::tcgen05_mma_ws_tmem_a_collector_b2_use)); + // tcgen05.mma.ws.cta_group::1.kind::i8.collector::b2::use [d_tmem], [a_tmem], b_desc, idesc, + // enable_input_d, zero_column_mask_desc; + * fn_ptr++ = reinterpret_cast( + static_cast( + cuda::ptx::tcgen05_mma_ws_tmem_a_collector_b2_use));)); +#endif // __cccl_ptx_isa >= 860 + +#if __cccl_ptx_isa >= 860 + NV_IF_TARGET( + NV_HAS_FEATURE_SM_100a, + ( + // tcgen05.mma.ws.cta_group::1.kind::f16.collector::b2::use [d_tmem], [a_tmem], b_desc, idesc, enable_input_d; + * fn_ptr++ = reinterpret_cast( + static_cast( + cuda::ptx::tcgen05_mma_ws_tmem_a_collector_b2_use)); + // tcgen05.mma.ws.cta_group::1.kind::tf32.collector::b2::use [d_tmem], [a_tmem], b_desc, idesc, + // enable_input_d; + * fn_ptr++ = reinterpret_cast( + static_cast( + cuda::ptx::tcgen05_mma_ws_tmem_a_collector_b2_use)); + // tcgen05.mma.ws.cta_group::1.kind::f8f6f4.collector::b2::use [d_tmem], [a_tmem], b_desc, idesc, + // enable_input_d; + * fn_ptr++ = reinterpret_cast( + static_cast( + cuda::ptx::tcgen05_mma_ws_tmem_a_collector_b2_use)); + // tcgen05.mma.ws.cta_group::1.kind::i8.collector::b2::use [d_tmem], [a_tmem], b_desc, idesc, enable_input_d; + * fn_ptr++ = reinterpret_cast( + static_cast( + cuda::ptx::tcgen05_mma_ws_tmem_a_collector_b2_use));)); + NV_IF_TARGET( + NV_HAS_FEATURE_SM_101a, + ( + // tcgen05.mma.ws.cta_group::1.kind::f16.collector::b2::use [d_tmem], [a_tmem], b_desc, idesc, enable_input_d; + * fn_ptr++ = reinterpret_cast( + static_cast( + cuda::ptx::tcgen05_mma_ws_tmem_a_collector_b2_use)); + // tcgen05.mma.ws.cta_group::1.kind::tf32.collector::b2::use [d_tmem], [a_tmem], b_desc, idesc, + // enable_input_d; + * fn_ptr++ = reinterpret_cast( + static_cast( + cuda::ptx::tcgen05_mma_ws_tmem_a_collector_b2_use)); + // tcgen05.mma.ws.cta_group::1.kind::f8f6f4.collector::b2::use [d_tmem], [a_tmem], b_desc, idesc, + // enable_input_d; + * fn_ptr++ = reinterpret_cast( + static_cast( + cuda::ptx::tcgen05_mma_ws_tmem_a_collector_b2_use)); + // tcgen05.mma.ws.cta_group::1.kind::i8.collector::b2::use [d_tmem], [a_tmem], b_desc, idesc, enable_input_d; + * fn_ptr++ = reinterpret_cast( + static_cast( + cuda::ptx::tcgen05_mma_ws_tmem_a_collector_b2_use));)); +#endif // __cccl_ptx_isa >= 860 + +#if __cccl_ptx_isa >= 860 + NV_IF_TARGET( + NV_HAS_FEATURE_SM_100a, + ( + // tcgen05.mma.ws.cta_group::1.kind::f16.collector::b2::lastuse [d_tmem], a_desc, b_desc, idesc, enable_input_d, + // zero_column_mask_desc; + * fn_ptr++ = reinterpret_cast( + static_cast( + cuda::ptx::tcgen05_mma_ws_collector_b2_lastuse)); + // tcgen05.mma.ws.cta_group::1.kind::tf32.collector::b2::lastuse [d_tmem], a_desc, b_desc, idesc, + // enable_input_d, zero_column_mask_desc; + * fn_ptr++ = reinterpret_cast( + static_cast( + cuda::ptx::tcgen05_mma_ws_collector_b2_lastuse)); + // tcgen05.mma.ws.cta_group::1.kind::f8f6f4.collector::b2::lastuse [d_tmem], a_desc, b_desc, idesc, + // enable_input_d, zero_column_mask_desc; + * fn_ptr++ = reinterpret_cast( + static_cast( + cuda::ptx::tcgen05_mma_ws_collector_b2_lastuse)); + // tcgen05.mma.ws.cta_group::1.kind::i8.collector::b2::lastuse 
[d_tmem], a_desc, b_desc, idesc, + // enable_input_d, zero_column_mask_desc; + * fn_ptr++ = reinterpret_cast( + static_cast( + cuda::ptx::tcgen05_mma_ws_collector_b2_lastuse));)); + NV_IF_TARGET( + NV_HAS_FEATURE_SM_101a, + ( + // tcgen05.mma.ws.cta_group::1.kind::f16.collector::b2::lastuse [d_tmem], a_desc, b_desc, idesc, enable_input_d, + // zero_column_mask_desc; + * fn_ptr++ = reinterpret_cast( + static_cast( + cuda::ptx::tcgen05_mma_ws_collector_b2_lastuse)); + // tcgen05.mma.ws.cta_group::1.kind::tf32.collector::b2::lastuse [d_tmem], a_desc, b_desc, idesc, + // enable_input_d, zero_column_mask_desc; + * fn_ptr++ = reinterpret_cast( + static_cast( + cuda::ptx::tcgen05_mma_ws_collector_b2_lastuse)); + // tcgen05.mma.ws.cta_group::1.kind::f8f6f4.collector::b2::lastuse [d_tmem], a_desc, b_desc, idesc, + // enable_input_d, zero_column_mask_desc; + * fn_ptr++ = reinterpret_cast( + static_cast( + cuda::ptx::tcgen05_mma_ws_collector_b2_lastuse)); + // tcgen05.mma.ws.cta_group::1.kind::i8.collector::b2::lastuse [d_tmem], a_desc, b_desc, idesc, + // enable_input_d, zero_column_mask_desc; + * fn_ptr++ = reinterpret_cast( + static_cast( + cuda::ptx::tcgen05_mma_ws_collector_b2_lastuse));)); +#endif // __cccl_ptx_isa >= 860 + +#if __cccl_ptx_isa >= 860 + NV_IF_TARGET( + NV_HAS_FEATURE_SM_100a, + ( + // tcgen05.mma.ws.cta_group::1.kind::f16.collector::b2::lastuse [d_tmem], a_desc, b_desc, idesc, enable_input_d; + * fn_ptr++ = reinterpret_cast( + static_cast( + cuda::ptx::tcgen05_mma_ws_collector_b2_lastuse)); + // tcgen05.mma.ws.cta_group::1.kind::tf32.collector::b2::lastuse [d_tmem], a_desc, b_desc, idesc, + // enable_input_d; + * fn_ptr++ = reinterpret_cast( + static_cast( + cuda::ptx::tcgen05_mma_ws_collector_b2_lastuse)); + // tcgen05.mma.ws.cta_group::1.kind::f8f6f4.collector::b2::lastuse [d_tmem], a_desc, b_desc, idesc, + // enable_input_d; + * fn_ptr++ = reinterpret_cast( + static_cast( + cuda::ptx::tcgen05_mma_ws_collector_b2_lastuse)); + // tcgen05.mma.ws.cta_group::1.kind::i8.collector::b2::lastuse [d_tmem], a_desc, b_desc, idesc, + // enable_input_d; + * fn_ptr++ = reinterpret_cast( + static_cast( + cuda::ptx::tcgen05_mma_ws_collector_b2_lastuse));)); + NV_IF_TARGET( + NV_HAS_FEATURE_SM_101a, + ( + // tcgen05.mma.ws.cta_group::1.kind::f16.collector::b2::lastuse [d_tmem], a_desc, b_desc, idesc, enable_input_d; + * fn_ptr++ = reinterpret_cast( + static_cast( + cuda::ptx::tcgen05_mma_ws_collector_b2_lastuse)); + // tcgen05.mma.ws.cta_group::1.kind::tf32.collector::b2::lastuse [d_tmem], a_desc, b_desc, idesc, + // enable_input_d; + * fn_ptr++ = reinterpret_cast( + static_cast( + cuda::ptx::tcgen05_mma_ws_collector_b2_lastuse)); + // tcgen05.mma.ws.cta_group::1.kind::f8f6f4.collector::b2::lastuse [d_tmem], a_desc, b_desc, idesc, + // enable_input_d; + * fn_ptr++ = reinterpret_cast( + static_cast( + cuda::ptx::tcgen05_mma_ws_collector_b2_lastuse)); + // tcgen05.mma.ws.cta_group::1.kind::i8.collector::b2::lastuse [d_tmem], a_desc, b_desc, idesc, + // enable_input_d; + * fn_ptr++ = reinterpret_cast( + static_cast( + cuda::ptx::tcgen05_mma_ws_collector_b2_lastuse));)); +#endif // __cccl_ptx_isa >= 860 + +#if __cccl_ptx_isa >= 860 + NV_IF_TARGET( + NV_HAS_FEATURE_SM_100a, + ( + // tcgen05.mma.ws.cta_group::1.kind::f16.collector::b2::lastuse [d_tmem], [a_tmem], b_desc, idesc, + // enable_input_d, zero_column_mask_desc; + * fn_ptr++ = reinterpret_cast( + static_cast( + cuda::ptx::tcgen05_mma_ws_tmem_a_collector_b2_lastuse)); + // tcgen05.mma.ws.cta_group::1.kind::tf32.collector::b2::lastuse 
[d_tmem], [a_tmem], b_desc, idesc, + // enable_input_d, zero_column_mask_desc; + * fn_ptr++ = reinterpret_cast( + static_cast( + cuda::ptx::tcgen05_mma_ws_tmem_a_collector_b2_lastuse)); + // tcgen05.mma.ws.cta_group::1.kind::f8f6f4.collector::b2::lastuse [d_tmem], [a_tmem], b_desc, idesc, + // enable_input_d, zero_column_mask_desc; + * fn_ptr++ = reinterpret_cast( + static_cast( + cuda::ptx::tcgen05_mma_ws_tmem_a_collector_b2_lastuse)); + // tcgen05.mma.ws.cta_group::1.kind::i8.collector::b2::lastuse [d_tmem], [a_tmem], b_desc, idesc, + // enable_input_d, zero_column_mask_desc; + * fn_ptr++ = reinterpret_cast( + static_cast( + cuda::ptx::tcgen05_mma_ws_tmem_a_collector_b2_lastuse));)); + NV_IF_TARGET( + NV_HAS_FEATURE_SM_101a, + ( + // tcgen05.mma.ws.cta_group::1.kind::f16.collector::b2::lastuse [d_tmem], [a_tmem], b_desc, idesc, + // enable_input_d, zero_column_mask_desc; + * fn_ptr++ = reinterpret_cast( + static_cast( + cuda::ptx::tcgen05_mma_ws_tmem_a_collector_b2_lastuse)); + // tcgen05.mma.ws.cta_group::1.kind::tf32.collector::b2::lastuse [d_tmem], [a_tmem], b_desc, idesc, + // enable_input_d, zero_column_mask_desc; + * fn_ptr++ = reinterpret_cast( + static_cast( + cuda::ptx::tcgen05_mma_ws_tmem_a_collector_b2_lastuse)); + // tcgen05.mma.ws.cta_group::1.kind::f8f6f4.collector::b2::lastuse [d_tmem], [a_tmem], b_desc, idesc, + // enable_input_d, zero_column_mask_desc; + * fn_ptr++ = reinterpret_cast( + static_cast( + cuda::ptx::tcgen05_mma_ws_tmem_a_collector_b2_lastuse)); + // tcgen05.mma.ws.cta_group::1.kind::i8.collector::b2::lastuse [d_tmem], [a_tmem], b_desc, idesc, + // enable_input_d, zero_column_mask_desc; + * fn_ptr++ = reinterpret_cast( + static_cast( + cuda::ptx::tcgen05_mma_ws_tmem_a_collector_b2_lastuse));)); +#endif // __cccl_ptx_isa >= 860 + +#if __cccl_ptx_isa >= 860 + NV_IF_TARGET( + NV_HAS_FEATURE_SM_100a, + ( + // tcgen05.mma.ws.cta_group::1.kind::f16.collector::b2::lastuse [d_tmem], [a_tmem], b_desc, idesc, + // enable_input_d; + * fn_ptr++ = reinterpret_cast( + static_cast( + cuda::ptx::tcgen05_mma_ws_tmem_a_collector_b2_lastuse)); + // tcgen05.mma.ws.cta_group::1.kind::tf32.collector::b2::lastuse [d_tmem], [a_tmem], b_desc, idesc, + // enable_input_d; + * fn_ptr++ = reinterpret_cast( + static_cast( + cuda::ptx::tcgen05_mma_ws_tmem_a_collector_b2_lastuse)); + // tcgen05.mma.ws.cta_group::1.kind::f8f6f4.collector::b2::lastuse [d_tmem], [a_tmem], b_desc, idesc, + // enable_input_d; + * fn_ptr++ = reinterpret_cast( + static_cast( + cuda::ptx::tcgen05_mma_ws_tmem_a_collector_b2_lastuse)); + // tcgen05.mma.ws.cta_group::1.kind::i8.collector::b2::lastuse [d_tmem], [a_tmem], b_desc, idesc, + // enable_input_d; + * fn_ptr++ = reinterpret_cast( + static_cast( + cuda::ptx::tcgen05_mma_ws_tmem_a_collector_b2_lastuse));)); + NV_IF_TARGET( + NV_HAS_FEATURE_SM_101a, + ( + // tcgen05.mma.ws.cta_group::1.kind::f16.collector::b2::lastuse [d_tmem], [a_tmem], b_desc, idesc, + // enable_input_d; + * fn_ptr++ = reinterpret_cast( + static_cast( + cuda::ptx::tcgen05_mma_ws_tmem_a_collector_b2_lastuse)); + // tcgen05.mma.ws.cta_group::1.kind::tf32.collector::b2::lastuse [d_tmem], [a_tmem], b_desc, idesc, + // enable_input_d; + * fn_ptr++ = reinterpret_cast( + static_cast( + cuda::ptx::tcgen05_mma_ws_tmem_a_collector_b2_lastuse)); + // tcgen05.mma.ws.cta_group::1.kind::f8f6f4.collector::b2::lastuse [d_tmem], [a_tmem], b_desc, idesc, + // enable_input_d; + * fn_ptr++ = reinterpret_cast( + static_cast( + cuda::ptx::tcgen05_mma_ws_tmem_a_collector_b2_lastuse)); + // 
tcgen05.mma.ws.cta_group::1.kind::i8.collector::b2::lastuse [d_tmem], [a_tmem], b_desc, idesc, + // enable_input_d; + * fn_ptr++ = reinterpret_cast( + static_cast( + cuda::ptx::tcgen05_mma_ws_tmem_a_collector_b2_lastuse));)); +#endif // __cccl_ptx_isa >= 860 + +#if __cccl_ptx_isa >= 860 + NV_IF_TARGET( + NV_HAS_FEATURE_SM_100a, + ( + // tcgen05.mma.ws.cta_group::1.kind::f16.collector::b2::discard [d_tmem], a_desc, b_desc, idesc, enable_input_d, + // zero_column_mask_desc; + * fn_ptr++ = reinterpret_cast( + static_cast( + cuda::ptx::tcgen05_mma_ws_collector_b2_discard)); + // tcgen05.mma.ws.cta_group::1.kind::tf32.collector::b2::discard [d_tmem], a_desc, b_desc, idesc, + // enable_input_d, zero_column_mask_desc; + * fn_ptr++ = reinterpret_cast( + static_cast( + cuda::ptx::tcgen05_mma_ws_collector_b2_discard)); + // tcgen05.mma.ws.cta_group::1.kind::f8f6f4.collector::b2::discard [d_tmem], a_desc, b_desc, idesc, + // enable_input_d, zero_column_mask_desc; + * fn_ptr++ = reinterpret_cast( + static_cast( + cuda::ptx::tcgen05_mma_ws_collector_b2_discard)); + // tcgen05.mma.ws.cta_group::1.kind::i8.collector::b2::discard [d_tmem], a_desc, b_desc, idesc, + // enable_input_d, zero_column_mask_desc; + * fn_ptr++ = reinterpret_cast( + static_cast( + cuda::ptx::tcgen05_mma_ws_collector_b2_discard));)); + NV_IF_TARGET( + NV_HAS_FEATURE_SM_101a, + ( + // tcgen05.mma.ws.cta_group::1.kind::f16.collector::b2::discard [d_tmem], a_desc, b_desc, idesc, enable_input_d, + // zero_column_mask_desc; + * fn_ptr++ = reinterpret_cast( + static_cast( + cuda::ptx::tcgen05_mma_ws_collector_b2_discard)); + // tcgen05.mma.ws.cta_group::1.kind::tf32.collector::b2::discard [d_tmem], a_desc, b_desc, idesc, + // enable_input_d, zero_column_mask_desc; + * fn_ptr++ = reinterpret_cast( + static_cast( + cuda::ptx::tcgen05_mma_ws_collector_b2_discard)); + // tcgen05.mma.ws.cta_group::1.kind::f8f6f4.collector::b2::discard [d_tmem], a_desc, b_desc, idesc, + // enable_input_d, zero_column_mask_desc; + * fn_ptr++ = reinterpret_cast( + static_cast( + cuda::ptx::tcgen05_mma_ws_collector_b2_discard)); + // tcgen05.mma.ws.cta_group::1.kind::i8.collector::b2::discard [d_tmem], a_desc, b_desc, idesc, + // enable_input_d, zero_column_mask_desc; + * fn_ptr++ = reinterpret_cast( + static_cast( + cuda::ptx::tcgen05_mma_ws_collector_b2_discard));)); +#endif // __cccl_ptx_isa >= 860 + +#if __cccl_ptx_isa >= 860 + NV_IF_TARGET( + NV_HAS_FEATURE_SM_100a, + ( + // tcgen05.mma.ws.cta_group::1.kind::f16.collector::b2::discard [d_tmem], a_desc, b_desc, idesc, enable_input_d; + * fn_ptr++ = reinterpret_cast( + static_cast( + cuda::ptx::tcgen05_mma_ws_collector_b2_discard)); + // tcgen05.mma.ws.cta_group::1.kind::tf32.collector::b2::discard [d_tmem], a_desc, b_desc, idesc, + // enable_input_d; + * fn_ptr++ = reinterpret_cast( + static_cast( + cuda::ptx::tcgen05_mma_ws_collector_b2_discard)); + // tcgen05.mma.ws.cta_group::1.kind::f8f6f4.collector::b2::discard [d_tmem], a_desc, b_desc, idesc, + // enable_input_d; + * fn_ptr++ = reinterpret_cast( + static_cast( + cuda::ptx::tcgen05_mma_ws_collector_b2_discard)); + // tcgen05.mma.ws.cta_group::1.kind::i8.collector::b2::discard [d_tmem], a_desc, b_desc, idesc, + // enable_input_d; + * fn_ptr++ = reinterpret_cast( + static_cast( + cuda::ptx::tcgen05_mma_ws_collector_b2_discard));)); + NV_IF_TARGET( + NV_HAS_FEATURE_SM_101a, + ( + // tcgen05.mma.ws.cta_group::1.kind::f16.collector::b2::discard [d_tmem], a_desc, b_desc, idesc, enable_input_d; + * fn_ptr++ = reinterpret_cast( + static_cast( + 
cuda::ptx::tcgen05_mma_ws_collector_b2_discard)); + // tcgen05.mma.ws.cta_group::1.kind::tf32.collector::b2::discard [d_tmem], a_desc, b_desc, idesc, + // enable_input_d; + * fn_ptr++ = reinterpret_cast( + static_cast( + cuda::ptx::tcgen05_mma_ws_collector_b2_discard)); + // tcgen05.mma.ws.cta_group::1.kind::f8f6f4.collector::b2::discard [d_tmem], a_desc, b_desc, idesc, + // enable_input_d; + * fn_ptr++ = reinterpret_cast( + static_cast( + cuda::ptx::tcgen05_mma_ws_collector_b2_discard)); + // tcgen05.mma.ws.cta_group::1.kind::i8.collector::b2::discard [d_tmem], a_desc, b_desc, idesc, + // enable_input_d; + * fn_ptr++ = reinterpret_cast( + static_cast( + cuda::ptx::tcgen05_mma_ws_collector_b2_discard));)); +#endif // __cccl_ptx_isa >= 860 + +#if __cccl_ptx_isa >= 860 + NV_IF_TARGET( + NV_HAS_FEATURE_SM_100a, + ( + // tcgen05.mma.ws.cta_group::1.kind::f16.collector::b2::discard [d_tmem], [a_tmem], b_desc, idesc, + // enable_input_d, zero_column_mask_desc; + * fn_ptr++ = reinterpret_cast( + static_cast( + cuda::ptx::tcgen05_mma_ws_tmem_a_collector_b2_discard)); + // tcgen05.mma.ws.cta_group::1.kind::tf32.collector::b2::discard [d_tmem], [a_tmem], b_desc, idesc, + // enable_input_d, zero_column_mask_desc; + * fn_ptr++ = reinterpret_cast( + static_cast( + cuda::ptx::tcgen05_mma_ws_tmem_a_collector_b2_discard)); + // tcgen05.mma.ws.cta_group::1.kind::f8f6f4.collector::b2::discard [d_tmem], [a_tmem], b_desc, idesc, + // enable_input_d, zero_column_mask_desc; + * fn_ptr++ = reinterpret_cast( + static_cast( + cuda::ptx::tcgen05_mma_ws_tmem_a_collector_b2_discard)); + // tcgen05.mma.ws.cta_group::1.kind::i8.collector::b2::discard [d_tmem], [a_tmem], b_desc, idesc, + // enable_input_d, zero_column_mask_desc; + * fn_ptr++ = reinterpret_cast( + static_cast( + cuda::ptx::tcgen05_mma_ws_tmem_a_collector_b2_discard));)); + NV_IF_TARGET( + NV_HAS_FEATURE_SM_101a, + ( + // tcgen05.mma.ws.cta_group::1.kind::f16.collector::b2::discard [d_tmem], [a_tmem], b_desc, idesc, + // enable_input_d, zero_column_mask_desc; + * fn_ptr++ = reinterpret_cast( + static_cast( + cuda::ptx::tcgen05_mma_ws_tmem_a_collector_b2_discard)); + // tcgen05.mma.ws.cta_group::1.kind::tf32.collector::b2::discard [d_tmem], [a_tmem], b_desc, idesc, + // enable_input_d, zero_column_mask_desc; + * fn_ptr++ = reinterpret_cast( + static_cast( + cuda::ptx::tcgen05_mma_ws_tmem_a_collector_b2_discard)); + // tcgen05.mma.ws.cta_group::1.kind::f8f6f4.collector::b2::discard [d_tmem], [a_tmem], b_desc, idesc, + // enable_input_d, zero_column_mask_desc; + * fn_ptr++ = reinterpret_cast( + static_cast( + cuda::ptx::tcgen05_mma_ws_tmem_a_collector_b2_discard)); + // tcgen05.mma.ws.cta_group::1.kind::i8.collector::b2::discard [d_tmem], [a_tmem], b_desc, idesc, + // enable_input_d, zero_column_mask_desc; + * fn_ptr++ = reinterpret_cast( + static_cast( + cuda::ptx::tcgen05_mma_ws_tmem_a_collector_b2_discard));)); +#endif // __cccl_ptx_isa >= 860 + +#if __cccl_ptx_isa >= 860 + NV_IF_TARGET( + NV_HAS_FEATURE_SM_100a, + ( + // tcgen05.mma.ws.cta_group::1.kind::f16.collector::b2::discard [d_tmem], [a_tmem], b_desc, idesc, + // enable_input_d; + * fn_ptr++ = reinterpret_cast( + static_cast( + cuda::ptx::tcgen05_mma_ws_tmem_a_collector_b2_discard)); + // tcgen05.mma.ws.cta_group::1.kind::tf32.collector::b2::discard [d_tmem], [a_tmem], b_desc, idesc, + // enable_input_d; + * fn_ptr++ = reinterpret_cast( + static_cast( + cuda::ptx::tcgen05_mma_ws_tmem_a_collector_b2_discard)); + // tcgen05.mma.ws.cta_group::1.kind::f8f6f4.collector::b2::discard [d_tmem], [a_tmem], 
b_desc, idesc, + // enable_input_d; + * fn_ptr++ = reinterpret_cast( + static_cast( + cuda::ptx::tcgen05_mma_ws_tmem_a_collector_b2_discard)); + // tcgen05.mma.ws.cta_group::1.kind::i8.collector::b2::discard [d_tmem], [a_tmem], b_desc, idesc, + // enable_input_d; + * fn_ptr++ = reinterpret_cast( + static_cast( + cuda::ptx::tcgen05_mma_ws_tmem_a_collector_b2_discard));)); + NV_IF_TARGET( + NV_HAS_FEATURE_SM_101a, + ( + // tcgen05.mma.ws.cta_group::1.kind::f16.collector::b2::discard [d_tmem], [a_tmem], b_desc, idesc, + // enable_input_d; + * fn_ptr++ = reinterpret_cast( + static_cast( + cuda::ptx::tcgen05_mma_ws_tmem_a_collector_b2_discard)); + // tcgen05.mma.ws.cta_group::1.kind::tf32.collector::b2::discard [d_tmem], [a_tmem], b_desc, idesc, + // enable_input_d; + * fn_ptr++ = reinterpret_cast( + static_cast( + cuda::ptx::tcgen05_mma_ws_tmem_a_collector_b2_discard)); + // tcgen05.mma.ws.cta_group::1.kind::f8f6f4.collector::b2::discard [d_tmem], [a_tmem], b_desc, idesc, + // enable_input_d; + * fn_ptr++ = reinterpret_cast( + static_cast( + cuda::ptx::tcgen05_mma_ws_tmem_a_collector_b2_discard)); + // tcgen05.mma.ws.cta_group::1.kind::i8.collector::b2::discard [d_tmem], [a_tmem], b_desc, idesc, + // enable_input_d; + * fn_ptr++ = reinterpret_cast( + static_cast( + cuda::ptx::tcgen05_mma_ws_tmem_a_collector_b2_discard));)); +#endif // __cccl_ptx_isa >= 860 + +#if __cccl_ptx_isa >= 860 + NV_IF_TARGET( + NV_HAS_FEATURE_SM_100a, + ( + // tcgen05.mma.ws.cta_group::1.kind::f16.collector::b3::fill [d_tmem], a_desc, b_desc, idesc, enable_input_d, + // zero_column_mask_desc; + * fn_ptr++ = reinterpret_cast( + static_cast( + cuda::ptx::tcgen05_mma_ws_collector_b3_fill)); + // tcgen05.mma.ws.cta_group::1.kind::tf32.collector::b3::fill [d_tmem], a_desc, b_desc, idesc, + // enable_input_d, zero_column_mask_desc; + * fn_ptr++ = reinterpret_cast( + static_cast( + cuda::ptx::tcgen05_mma_ws_collector_b3_fill)); + // tcgen05.mma.ws.cta_group::1.kind::f8f6f4.collector::b3::fill [d_tmem], a_desc, b_desc, idesc, + // enable_input_d, zero_column_mask_desc; + * fn_ptr++ = reinterpret_cast( + static_cast( + cuda::ptx::tcgen05_mma_ws_collector_b3_fill)); + // tcgen05.mma.ws.cta_group::1.kind::i8.collector::b3::fill [d_tmem], a_desc, b_desc, idesc, enable_input_d, + // zero_column_mask_desc; + * fn_ptr++ = reinterpret_cast( + static_cast( + cuda::ptx::tcgen05_mma_ws_collector_b3_fill));)); + NV_IF_TARGET( + NV_HAS_FEATURE_SM_101a, + ( + // tcgen05.mma.ws.cta_group::1.kind::f16.collector::b3::fill [d_tmem], a_desc, b_desc, idesc, enable_input_d, + // zero_column_mask_desc; + * fn_ptr++ = reinterpret_cast( + static_cast( + cuda::ptx::tcgen05_mma_ws_collector_b3_fill)); + // tcgen05.mma.ws.cta_group::1.kind::tf32.collector::b3::fill [d_tmem], a_desc, b_desc, idesc, + // enable_input_d, zero_column_mask_desc; + * fn_ptr++ = reinterpret_cast( + static_cast( + cuda::ptx::tcgen05_mma_ws_collector_b3_fill)); + // tcgen05.mma.ws.cta_group::1.kind::f8f6f4.collector::b3::fill [d_tmem], a_desc, b_desc, idesc, + // enable_input_d, zero_column_mask_desc; + * fn_ptr++ = reinterpret_cast( + static_cast( + cuda::ptx::tcgen05_mma_ws_collector_b3_fill)); + // tcgen05.mma.ws.cta_group::1.kind::i8.collector::b3::fill [d_tmem], a_desc, b_desc, idesc, enable_input_d, + // zero_column_mask_desc; + * fn_ptr++ = reinterpret_cast( + static_cast( + cuda::ptx::tcgen05_mma_ws_collector_b3_fill));)); +#endif // __cccl_ptx_isa >= 860 + +#if __cccl_ptx_isa >= 860 + NV_IF_TARGET( + NV_HAS_FEATURE_SM_100a, + ( + // 
tcgen05.mma.ws.cta_group::1.kind::f16.collector::b3::fill [d_tmem], a_desc, b_desc, idesc, enable_input_d; + * fn_ptr++ = reinterpret_cast( + static_cast( + cuda::ptx::tcgen05_mma_ws_collector_b3_fill)); + // tcgen05.mma.ws.cta_group::1.kind::tf32.collector::b3::fill [d_tmem], a_desc, b_desc, idesc, enable_input_d; + * fn_ptr++ = reinterpret_cast( + static_cast( + cuda::ptx::tcgen05_mma_ws_collector_b3_fill)); + // tcgen05.mma.ws.cta_group::1.kind::f8f6f4.collector::b3::fill [d_tmem], a_desc, b_desc, idesc, + // enable_input_d; + * fn_ptr++ = reinterpret_cast( + static_cast( + cuda::ptx::tcgen05_mma_ws_collector_b3_fill)); + // tcgen05.mma.ws.cta_group::1.kind::i8.collector::b3::fill [d_tmem], a_desc, b_desc, idesc, enable_input_d; + * fn_ptr++ = reinterpret_cast( + static_cast( + cuda::ptx::tcgen05_mma_ws_collector_b3_fill));)); + NV_IF_TARGET( + NV_HAS_FEATURE_SM_101a, + ( + // tcgen05.mma.ws.cta_group::1.kind::f16.collector::b3::fill [d_tmem], a_desc, b_desc, idesc, enable_input_d; + * fn_ptr++ = reinterpret_cast( + static_cast( + cuda::ptx::tcgen05_mma_ws_collector_b3_fill)); + // tcgen05.mma.ws.cta_group::1.kind::tf32.collector::b3::fill [d_tmem], a_desc, b_desc, idesc, enable_input_d; + * fn_ptr++ = reinterpret_cast( + static_cast( + cuda::ptx::tcgen05_mma_ws_collector_b3_fill)); + // tcgen05.mma.ws.cta_group::1.kind::f8f6f4.collector::b3::fill [d_tmem], a_desc, b_desc, idesc, + // enable_input_d; + * fn_ptr++ = reinterpret_cast( + static_cast( + cuda::ptx::tcgen05_mma_ws_collector_b3_fill)); + // tcgen05.mma.ws.cta_group::1.kind::i8.collector::b3::fill [d_tmem], a_desc, b_desc, idesc, enable_input_d; + * fn_ptr++ = reinterpret_cast( + static_cast( + cuda::ptx::tcgen05_mma_ws_collector_b3_fill));)); +#endif // __cccl_ptx_isa >= 860 + +#if __cccl_ptx_isa >= 860 + NV_IF_TARGET( + NV_HAS_FEATURE_SM_100a, + ( + // tcgen05.mma.ws.cta_group::1.kind::f16.collector::b3::fill [d_tmem], [a_tmem], b_desc, idesc, enable_input_d, + // zero_column_mask_desc; + * fn_ptr++ = reinterpret_cast( + static_cast( + cuda::ptx::tcgen05_mma_ws_tmem_a_collector_b3_fill)); + // tcgen05.mma.ws.cta_group::1.kind::tf32.collector::b3::fill [d_tmem], [a_tmem], b_desc, idesc, + // enable_input_d, zero_column_mask_desc; + * fn_ptr++ = reinterpret_cast( + static_cast( + cuda::ptx::tcgen05_mma_ws_tmem_a_collector_b3_fill)); + // tcgen05.mma.ws.cta_group::1.kind::f8f6f4.collector::b3::fill [d_tmem], [a_tmem], b_desc, idesc, + // enable_input_d, zero_column_mask_desc; + * fn_ptr++ = reinterpret_cast( + static_cast( + cuda::ptx::tcgen05_mma_ws_tmem_a_collector_b3_fill)); + // tcgen05.mma.ws.cta_group::1.kind::i8.collector::b3::fill [d_tmem], [a_tmem], b_desc, idesc, + // enable_input_d, zero_column_mask_desc; + * fn_ptr++ = reinterpret_cast( + static_cast( + cuda::ptx::tcgen05_mma_ws_tmem_a_collector_b3_fill));)); + NV_IF_TARGET( + NV_HAS_FEATURE_SM_101a, + ( + // tcgen05.mma.ws.cta_group::1.kind::f16.collector::b3::fill [d_tmem], [a_tmem], b_desc, idesc, enable_input_d, + // zero_column_mask_desc; + * fn_ptr++ = reinterpret_cast( + static_cast( + cuda::ptx::tcgen05_mma_ws_tmem_a_collector_b3_fill)); + // tcgen05.mma.ws.cta_group::1.kind::tf32.collector::b3::fill [d_tmem], [a_tmem], b_desc, idesc, + // enable_input_d, zero_column_mask_desc; + * fn_ptr++ = reinterpret_cast( + static_cast( + cuda::ptx::tcgen05_mma_ws_tmem_a_collector_b3_fill)); + // tcgen05.mma.ws.cta_group::1.kind::f8f6f4.collector::b3::fill [d_tmem], [a_tmem], b_desc, idesc, + // enable_input_d, zero_column_mask_desc; + * fn_ptr++ = reinterpret_cast( + 
static_cast( + cuda::ptx::tcgen05_mma_ws_tmem_a_collector_b3_fill)); + // tcgen05.mma.ws.cta_group::1.kind::i8.collector::b3::fill [d_tmem], [a_tmem], b_desc, idesc, + // enable_input_d, zero_column_mask_desc; + * fn_ptr++ = reinterpret_cast( + static_cast( + cuda::ptx::tcgen05_mma_ws_tmem_a_collector_b3_fill));)); +#endif // __cccl_ptx_isa >= 860 + +#if __cccl_ptx_isa >= 860 + NV_IF_TARGET( + NV_HAS_FEATURE_SM_100a, + ( + // tcgen05.mma.ws.cta_group::1.kind::f16.collector::b3::fill [d_tmem], [a_tmem], b_desc, idesc, enable_input_d; + * fn_ptr++ = reinterpret_cast( + static_cast( + cuda::ptx::tcgen05_mma_ws_tmem_a_collector_b3_fill)); + // tcgen05.mma.ws.cta_group::1.kind::tf32.collector::b3::fill [d_tmem], [a_tmem], b_desc, idesc, + // enable_input_d; + * fn_ptr++ = reinterpret_cast( + static_cast( + cuda::ptx::tcgen05_mma_ws_tmem_a_collector_b3_fill)); + // tcgen05.mma.ws.cta_group::1.kind::f8f6f4.collector::b3::fill [d_tmem], [a_tmem], b_desc, idesc, + // enable_input_d; + * fn_ptr++ = reinterpret_cast( + static_cast( + cuda::ptx::tcgen05_mma_ws_tmem_a_collector_b3_fill)); + // tcgen05.mma.ws.cta_group::1.kind::i8.collector::b3::fill [d_tmem], [a_tmem], b_desc, idesc, enable_input_d; + * fn_ptr++ = reinterpret_cast( + static_cast( + cuda::ptx::tcgen05_mma_ws_tmem_a_collector_b3_fill));)); + NV_IF_TARGET( + NV_HAS_FEATURE_SM_101a, + ( + // tcgen05.mma.ws.cta_group::1.kind::f16.collector::b3::fill [d_tmem], [a_tmem], b_desc, idesc, enable_input_d; + * fn_ptr++ = reinterpret_cast( + static_cast( + cuda::ptx::tcgen05_mma_ws_tmem_a_collector_b3_fill)); + // tcgen05.mma.ws.cta_group::1.kind::tf32.collector::b3::fill [d_tmem], [a_tmem], b_desc, idesc, + // enable_input_d; + * fn_ptr++ = reinterpret_cast( + static_cast( + cuda::ptx::tcgen05_mma_ws_tmem_a_collector_b3_fill)); + // tcgen05.mma.ws.cta_group::1.kind::f8f6f4.collector::b3::fill [d_tmem], [a_tmem], b_desc, idesc, + // enable_input_d; + * fn_ptr++ = reinterpret_cast( + static_cast( + cuda::ptx::tcgen05_mma_ws_tmem_a_collector_b3_fill)); + // tcgen05.mma.ws.cta_group::1.kind::i8.collector::b3::fill [d_tmem], [a_tmem], b_desc, idesc, enable_input_d; + * fn_ptr++ = reinterpret_cast( + static_cast( + cuda::ptx::tcgen05_mma_ws_tmem_a_collector_b3_fill));)); +#endif // __cccl_ptx_isa >= 860 + +#if __cccl_ptx_isa >= 860 + NV_IF_TARGET( + NV_HAS_FEATURE_SM_100a, + ( + // tcgen05.mma.ws.cta_group::1.kind::f16.collector::b3::use [d_tmem], a_desc, b_desc, idesc, enable_input_d, + // zero_column_mask_desc; + * fn_ptr++ = reinterpret_cast( + static_cast( + cuda::ptx::tcgen05_mma_ws_collector_b3_use)); + // tcgen05.mma.ws.cta_group::1.kind::tf32.collector::b3::use [d_tmem], a_desc, b_desc, idesc, + // enable_input_d, zero_column_mask_desc; + * fn_ptr++ = reinterpret_cast( + static_cast( + cuda::ptx::tcgen05_mma_ws_collector_b3_use)); + // tcgen05.mma.ws.cta_group::1.kind::f8f6f4.collector::b3::use [d_tmem], a_desc, b_desc, idesc, + // enable_input_d, zero_column_mask_desc; + * fn_ptr++ = reinterpret_cast( + static_cast( + cuda::ptx::tcgen05_mma_ws_collector_b3_use)); + // tcgen05.mma.ws.cta_group::1.kind::i8.collector::b3::use [d_tmem], a_desc, b_desc, idesc, enable_input_d, + // zero_column_mask_desc; + * fn_ptr++ = reinterpret_cast( + static_cast( + cuda::ptx::tcgen05_mma_ws_collector_b3_use));)); + NV_IF_TARGET( + NV_HAS_FEATURE_SM_101a, + ( + // tcgen05.mma.ws.cta_group::1.kind::f16.collector::b3::use [d_tmem], a_desc, b_desc, idesc, enable_input_d, + // zero_column_mask_desc; + * fn_ptr++ = reinterpret_cast( + static_cast( + 
cuda::ptx::tcgen05_mma_ws_collector_b3_use)); + // tcgen05.mma.ws.cta_group::1.kind::tf32.collector::b3::use [d_tmem], a_desc, b_desc, idesc, + // enable_input_d, zero_column_mask_desc; + * fn_ptr++ = reinterpret_cast( + static_cast( + cuda::ptx::tcgen05_mma_ws_collector_b3_use)); + // tcgen05.mma.ws.cta_group::1.kind::f8f6f4.collector::b3::use [d_tmem], a_desc, b_desc, idesc, + // enable_input_d, zero_column_mask_desc; + * fn_ptr++ = reinterpret_cast( + static_cast( + cuda::ptx::tcgen05_mma_ws_collector_b3_use)); + // tcgen05.mma.ws.cta_group::1.kind::i8.collector::b3::use [d_tmem], a_desc, b_desc, idesc, enable_input_d, + // zero_column_mask_desc; + * fn_ptr++ = reinterpret_cast( + static_cast( + cuda::ptx::tcgen05_mma_ws_collector_b3_use));)); +#endif // __cccl_ptx_isa >= 860 + +#if __cccl_ptx_isa >= 860 + NV_IF_TARGET( + NV_HAS_FEATURE_SM_100a, + ( + // tcgen05.mma.ws.cta_group::1.kind::f16.collector::b3::use [d_tmem], a_desc, b_desc, idesc, enable_input_d; + * fn_ptr++ = reinterpret_cast( + static_cast( + cuda::ptx::tcgen05_mma_ws_collector_b3_use)); + // tcgen05.mma.ws.cta_group::1.kind::tf32.collector::b3::use [d_tmem], a_desc, b_desc, idesc, enable_input_d; + * fn_ptr++ = reinterpret_cast( + static_cast( + cuda::ptx::tcgen05_mma_ws_collector_b3_use)); + // tcgen05.mma.ws.cta_group::1.kind::f8f6f4.collector::b3::use [d_tmem], a_desc, b_desc, idesc, + // enable_input_d; + * fn_ptr++ = reinterpret_cast( + static_cast( + cuda::ptx::tcgen05_mma_ws_collector_b3_use)); + // tcgen05.mma.ws.cta_group::1.kind::i8.collector::b3::use [d_tmem], a_desc, b_desc, idesc, enable_input_d; + * fn_ptr++ = reinterpret_cast( + static_cast( + cuda::ptx::tcgen05_mma_ws_collector_b3_use));)); + NV_IF_TARGET( + NV_HAS_FEATURE_SM_101a, + ( + // tcgen05.mma.ws.cta_group::1.kind::f16.collector::b3::use [d_tmem], a_desc, b_desc, idesc, enable_input_d; + * fn_ptr++ = reinterpret_cast( + static_cast( + cuda::ptx::tcgen05_mma_ws_collector_b3_use)); + // tcgen05.mma.ws.cta_group::1.kind::tf32.collector::b3::use [d_tmem], a_desc, b_desc, idesc, enable_input_d; + * fn_ptr++ = reinterpret_cast( + static_cast( + cuda::ptx::tcgen05_mma_ws_collector_b3_use)); + // tcgen05.mma.ws.cta_group::1.kind::f8f6f4.collector::b3::use [d_tmem], a_desc, b_desc, idesc, + // enable_input_d; + * fn_ptr++ = reinterpret_cast( + static_cast( + cuda::ptx::tcgen05_mma_ws_collector_b3_use)); + // tcgen05.mma.ws.cta_group::1.kind::i8.collector::b3::use [d_tmem], a_desc, b_desc, idesc, enable_input_d; + * fn_ptr++ = reinterpret_cast( + static_cast( + cuda::ptx::tcgen05_mma_ws_collector_b3_use));)); +#endif // __cccl_ptx_isa >= 860 + +#if __cccl_ptx_isa >= 860 + NV_IF_TARGET( + NV_HAS_FEATURE_SM_100a, + ( + // tcgen05.mma.ws.cta_group::1.kind::f16.collector::b3::use [d_tmem], [a_tmem], b_desc, idesc, enable_input_d, + // zero_column_mask_desc; + * fn_ptr++ = reinterpret_cast( + static_cast( + cuda::ptx::tcgen05_mma_ws_tmem_a_collector_b3_use)); + // tcgen05.mma.ws.cta_group::1.kind::tf32.collector::b3::use [d_tmem], [a_tmem], b_desc, idesc, + // enable_input_d, zero_column_mask_desc; + * fn_ptr++ = reinterpret_cast( + static_cast( + cuda::ptx::tcgen05_mma_ws_tmem_a_collector_b3_use)); + // tcgen05.mma.ws.cta_group::1.kind::f8f6f4.collector::b3::use [d_tmem], [a_tmem], b_desc, idesc, + // enable_input_d, zero_column_mask_desc; + * fn_ptr++ = reinterpret_cast( + static_cast( + cuda::ptx::tcgen05_mma_ws_tmem_a_collector_b3_use)); + // tcgen05.mma.ws.cta_group::1.kind::i8.collector::b3::use [d_tmem], [a_tmem], b_desc, idesc, + // enable_input_d, 
zero_column_mask_desc; + * fn_ptr++ = reinterpret_cast( + static_cast( + cuda::ptx::tcgen05_mma_ws_tmem_a_collector_b3_use));)); + NV_IF_TARGET( + NV_HAS_FEATURE_SM_101a, + ( + // tcgen05.mma.ws.cta_group::1.kind::f16.collector::b3::use [d_tmem], [a_tmem], b_desc, idesc, enable_input_d, + // zero_column_mask_desc; + * fn_ptr++ = reinterpret_cast( + static_cast( + cuda::ptx::tcgen05_mma_ws_tmem_a_collector_b3_use)); + // tcgen05.mma.ws.cta_group::1.kind::tf32.collector::b3::use [d_tmem], [a_tmem], b_desc, idesc, + // enable_input_d, zero_column_mask_desc; + * fn_ptr++ = reinterpret_cast( + static_cast( + cuda::ptx::tcgen05_mma_ws_tmem_a_collector_b3_use)); + // tcgen05.mma.ws.cta_group::1.kind::f8f6f4.collector::b3::use [d_tmem], [a_tmem], b_desc, idesc, + // enable_input_d, zero_column_mask_desc; + * fn_ptr++ = reinterpret_cast( + static_cast( + cuda::ptx::tcgen05_mma_ws_tmem_a_collector_b3_use)); + // tcgen05.mma.ws.cta_group::1.kind::i8.collector::b3::use [d_tmem], [a_tmem], b_desc, idesc, + // enable_input_d, zero_column_mask_desc; + * fn_ptr++ = reinterpret_cast( + static_cast( + cuda::ptx::tcgen05_mma_ws_tmem_a_collector_b3_use));)); +#endif // __cccl_ptx_isa >= 860 + +#if __cccl_ptx_isa >= 860 + NV_IF_TARGET( + NV_HAS_FEATURE_SM_100a, + ( + // tcgen05.mma.ws.cta_group::1.kind::f16.collector::b3::use [d_tmem], [a_tmem], b_desc, idesc, enable_input_d; + * fn_ptr++ = reinterpret_cast( + static_cast( + cuda::ptx::tcgen05_mma_ws_tmem_a_collector_b3_use)); + // tcgen05.mma.ws.cta_group::1.kind::tf32.collector::b3::use [d_tmem], [a_tmem], b_desc, idesc, + // enable_input_d; + * fn_ptr++ = reinterpret_cast( + static_cast( + cuda::ptx::tcgen05_mma_ws_tmem_a_collector_b3_use)); + // tcgen05.mma.ws.cta_group::1.kind::f8f6f4.collector::b3::use [d_tmem], [a_tmem], b_desc, idesc, + // enable_input_d; + * fn_ptr++ = reinterpret_cast( + static_cast( + cuda::ptx::tcgen05_mma_ws_tmem_a_collector_b3_use)); + // tcgen05.mma.ws.cta_group::1.kind::i8.collector::b3::use [d_tmem], [a_tmem], b_desc, idesc, enable_input_d; + * fn_ptr++ = reinterpret_cast( + static_cast( + cuda::ptx::tcgen05_mma_ws_tmem_a_collector_b3_use));)); + NV_IF_TARGET( + NV_HAS_FEATURE_SM_101a, + ( + // tcgen05.mma.ws.cta_group::1.kind::f16.collector::b3::use [d_tmem], [a_tmem], b_desc, idesc, enable_input_d; + * fn_ptr++ = reinterpret_cast( + static_cast( + cuda::ptx::tcgen05_mma_ws_tmem_a_collector_b3_use)); + // tcgen05.mma.ws.cta_group::1.kind::tf32.collector::b3::use [d_tmem], [a_tmem], b_desc, idesc, + // enable_input_d; + * fn_ptr++ = reinterpret_cast( + static_cast( + cuda::ptx::tcgen05_mma_ws_tmem_a_collector_b3_use)); + // tcgen05.mma.ws.cta_group::1.kind::f8f6f4.collector::b3::use [d_tmem], [a_tmem], b_desc, idesc, + // enable_input_d; + * fn_ptr++ = reinterpret_cast( + static_cast( + cuda::ptx::tcgen05_mma_ws_tmem_a_collector_b3_use)); + // tcgen05.mma.ws.cta_group::1.kind::i8.collector::b3::use [d_tmem], [a_tmem], b_desc, idesc, enable_input_d; + * fn_ptr++ = reinterpret_cast( + static_cast( + cuda::ptx::tcgen05_mma_ws_tmem_a_collector_b3_use));)); +#endif // __cccl_ptx_isa >= 860 + +#if __cccl_ptx_isa >= 860 + NV_IF_TARGET( + NV_HAS_FEATURE_SM_100a, + ( + // tcgen05.mma.ws.cta_group::1.kind::f16.collector::b3::lastuse [d_tmem], a_desc, b_desc, idesc, enable_input_d, + // zero_column_mask_desc; + * fn_ptr++ = reinterpret_cast( + static_cast( + cuda::ptx::tcgen05_mma_ws_collector_b3_lastuse)); + // tcgen05.mma.ws.cta_group::1.kind::tf32.collector::b3::lastuse [d_tmem], a_desc, b_desc, idesc, + // enable_input_d, 
zero_column_mask_desc; + * fn_ptr++ = reinterpret_cast( + static_cast( + cuda::ptx::tcgen05_mma_ws_collector_b3_lastuse)); + // tcgen05.mma.ws.cta_group::1.kind::f8f6f4.collector::b3::lastuse [d_tmem], a_desc, b_desc, idesc, + // enable_input_d, zero_column_mask_desc; + * fn_ptr++ = reinterpret_cast( + static_cast( + cuda::ptx::tcgen05_mma_ws_collector_b3_lastuse)); + // tcgen05.mma.ws.cta_group::1.kind::i8.collector::b3::lastuse [d_tmem], a_desc, b_desc, idesc, + // enable_input_d, zero_column_mask_desc; + * fn_ptr++ = reinterpret_cast( + static_cast( + cuda::ptx::tcgen05_mma_ws_collector_b3_lastuse));)); + NV_IF_TARGET( + NV_HAS_FEATURE_SM_101a, + ( + // tcgen05.mma.ws.cta_group::1.kind::f16.collector::b3::lastuse [d_tmem], a_desc, b_desc, idesc, enable_input_d, + // zero_column_mask_desc; + * fn_ptr++ = reinterpret_cast( + static_cast( + cuda::ptx::tcgen05_mma_ws_collector_b3_lastuse)); + // tcgen05.mma.ws.cta_group::1.kind::tf32.collector::b3::lastuse [d_tmem], a_desc, b_desc, idesc, + // enable_input_d, zero_column_mask_desc; + * fn_ptr++ = reinterpret_cast( + static_cast( + cuda::ptx::tcgen05_mma_ws_collector_b3_lastuse)); + // tcgen05.mma.ws.cta_group::1.kind::f8f6f4.collector::b3::lastuse [d_tmem], a_desc, b_desc, idesc, + // enable_input_d, zero_column_mask_desc; + * fn_ptr++ = reinterpret_cast( + static_cast( + cuda::ptx::tcgen05_mma_ws_collector_b3_lastuse)); + // tcgen05.mma.ws.cta_group::1.kind::i8.collector::b3::lastuse [d_tmem], a_desc, b_desc, idesc, + // enable_input_d, zero_column_mask_desc; + * fn_ptr++ = reinterpret_cast( + static_cast( + cuda::ptx::tcgen05_mma_ws_collector_b3_lastuse));)); +#endif // __cccl_ptx_isa >= 860 + +#if __cccl_ptx_isa >= 860 + NV_IF_TARGET( + NV_HAS_FEATURE_SM_100a, + ( + // tcgen05.mma.ws.cta_group::1.kind::f16.collector::b3::lastuse [d_tmem], a_desc, b_desc, idesc, enable_input_d; + * fn_ptr++ = reinterpret_cast( + static_cast( + cuda::ptx::tcgen05_mma_ws_collector_b3_lastuse)); + // tcgen05.mma.ws.cta_group::1.kind::tf32.collector::b3::lastuse [d_tmem], a_desc, b_desc, idesc, + // enable_input_d; + * fn_ptr++ = reinterpret_cast( + static_cast( + cuda::ptx::tcgen05_mma_ws_collector_b3_lastuse)); + // tcgen05.mma.ws.cta_group::1.kind::f8f6f4.collector::b3::lastuse [d_tmem], a_desc, b_desc, idesc, + // enable_input_d; + * fn_ptr++ = reinterpret_cast( + static_cast( + cuda::ptx::tcgen05_mma_ws_collector_b3_lastuse)); + // tcgen05.mma.ws.cta_group::1.kind::i8.collector::b3::lastuse [d_tmem], a_desc, b_desc, idesc, + // enable_input_d; + * fn_ptr++ = reinterpret_cast( + static_cast( + cuda::ptx::tcgen05_mma_ws_collector_b3_lastuse));)); + NV_IF_TARGET( + NV_HAS_FEATURE_SM_101a, + ( + // tcgen05.mma.ws.cta_group::1.kind::f16.collector::b3::lastuse [d_tmem], a_desc, b_desc, idesc, enable_input_d; + * fn_ptr++ = reinterpret_cast( + static_cast( + cuda::ptx::tcgen05_mma_ws_collector_b3_lastuse)); + // tcgen05.mma.ws.cta_group::1.kind::tf32.collector::b3::lastuse [d_tmem], a_desc, b_desc, idesc, + // enable_input_d; + * fn_ptr++ = reinterpret_cast( + static_cast( + cuda::ptx::tcgen05_mma_ws_collector_b3_lastuse)); + // tcgen05.mma.ws.cta_group::1.kind::f8f6f4.collector::b3::lastuse [d_tmem], a_desc, b_desc, idesc, + // enable_input_d; + * fn_ptr++ = reinterpret_cast( + static_cast( + cuda::ptx::tcgen05_mma_ws_collector_b3_lastuse)); + // tcgen05.mma.ws.cta_group::1.kind::i8.collector::b3::lastuse [d_tmem], a_desc, b_desc, idesc, + // enable_input_d; + * fn_ptr++ = reinterpret_cast( + static_cast( + cuda::ptx::tcgen05_mma_ws_collector_b3_lastuse));)); 
+#endif // __cccl_ptx_isa >= 860 + +#if __cccl_ptx_isa >= 860 + NV_IF_TARGET( + NV_HAS_FEATURE_SM_100a, + ( + // tcgen05.mma.ws.cta_group::1.kind::f16.collector::b3::lastuse [d_tmem], [a_tmem], b_desc, idesc, + // enable_input_d, zero_column_mask_desc; + * fn_ptr++ = reinterpret_cast( + static_cast( + cuda::ptx::tcgen05_mma_ws_tmem_a_collector_b3_lastuse)); + // tcgen05.mma.ws.cta_group::1.kind::tf32.collector::b3::lastuse [d_tmem], [a_tmem], b_desc, idesc, + // enable_input_d, zero_column_mask_desc; + * fn_ptr++ = reinterpret_cast( + static_cast( + cuda::ptx::tcgen05_mma_ws_tmem_a_collector_b3_lastuse)); + // tcgen05.mma.ws.cta_group::1.kind::f8f6f4.collector::b3::lastuse [d_tmem], [a_tmem], b_desc, idesc, + // enable_input_d, zero_column_mask_desc; + * fn_ptr++ = reinterpret_cast( + static_cast( + cuda::ptx::tcgen05_mma_ws_tmem_a_collector_b3_lastuse)); + // tcgen05.mma.ws.cta_group::1.kind::i8.collector::b3::lastuse [d_tmem], [a_tmem], b_desc, idesc, + // enable_input_d, zero_column_mask_desc; + * fn_ptr++ = reinterpret_cast( + static_cast( + cuda::ptx::tcgen05_mma_ws_tmem_a_collector_b3_lastuse));)); + NV_IF_TARGET( + NV_HAS_FEATURE_SM_101a, + ( + // tcgen05.mma.ws.cta_group::1.kind::f16.collector::b3::lastuse [d_tmem], [a_tmem], b_desc, idesc, + // enable_input_d, zero_column_mask_desc; + * fn_ptr++ = reinterpret_cast( + static_cast( + cuda::ptx::tcgen05_mma_ws_tmem_a_collector_b3_lastuse)); + // tcgen05.mma.ws.cta_group::1.kind::tf32.collector::b3::lastuse [d_tmem], [a_tmem], b_desc, idesc, + // enable_input_d, zero_column_mask_desc; + * fn_ptr++ = reinterpret_cast( + static_cast( + cuda::ptx::tcgen05_mma_ws_tmem_a_collector_b3_lastuse)); + // tcgen05.mma.ws.cta_group::1.kind::f8f6f4.collector::b3::lastuse [d_tmem], [a_tmem], b_desc, idesc, + // enable_input_d, zero_column_mask_desc; + * fn_ptr++ = reinterpret_cast( + static_cast( + cuda::ptx::tcgen05_mma_ws_tmem_a_collector_b3_lastuse)); + // tcgen05.mma.ws.cta_group::1.kind::i8.collector::b3::lastuse [d_tmem], [a_tmem], b_desc, idesc, + // enable_input_d, zero_column_mask_desc; + * fn_ptr++ = reinterpret_cast( + static_cast( + cuda::ptx::tcgen05_mma_ws_tmem_a_collector_b3_lastuse));)); +#endif // __cccl_ptx_isa >= 860 + +#if __cccl_ptx_isa >= 860 + NV_IF_TARGET( + NV_HAS_FEATURE_SM_100a, + ( + // tcgen05.mma.ws.cta_group::1.kind::f16.collector::b3::lastuse [d_tmem], [a_tmem], b_desc, idesc, + // enable_input_d; + * fn_ptr++ = reinterpret_cast( + static_cast( + cuda::ptx::tcgen05_mma_ws_tmem_a_collector_b3_lastuse)); + // tcgen05.mma.ws.cta_group::1.kind::tf32.collector::b3::lastuse [d_tmem], [a_tmem], b_desc, idesc, + // enable_input_d; + * fn_ptr++ = reinterpret_cast( + static_cast( + cuda::ptx::tcgen05_mma_ws_tmem_a_collector_b3_lastuse)); + // tcgen05.mma.ws.cta_group::1.kind::f8f6f4.collector::b3::lastuse [d_tmem], [a_tmem], b_desc, idesc, + // enable_input_d; + * fn_ptr++ = reinterpret_cast( + static_cast( + cuda::ptx::tcgen05_mma_ws_tmem_a_collector_b3_lastuse)); + // tcgen05.mma.ws.cta_group::1.kind::i8.collector::b3::lastuse [d_tmem], [a_tmem], b_desc, idesc, + // enable_input_d; + * fn_ptr++ = reinterpret_cast( + static_cast( + cuda::ptx::tcgen05_mma_ws_tmem_a_collector_b3_lastuse));)); + NV_IF_TARGET( + NV_HAS_FEATURE_SM_101a, + ( + // tcgen05.mma.ws.cta_group::1.kind::f16.collector::b3::lastuse [d_tmem], [a_tmem], b_desc, idesc, + // enable_input_d; + * fn_ptr++ = reinterpret_cast( + static_cast( + cuda::ptx::tcgen05_mma_ws_tmem_a_collector_b3_lastuse)); + // tcgen05.mma.ws.cta_group::1.kind::tf32.collector::b3::lastuse 
[d_tmem], [a_tmem], b_desc, idesc, + // enable_input_d; + * fn_ptr++ = reinterpret_cast( + static_cast( + cuda::ptx::tcgen05_mma_ws_tmem_a_collector_b3_lastuse)); + // tcgen05.mma.ws.cta_group::1.kind::f8f6f4.collector::b3::lastuse [d_tmem], [a_tmem], b_desc, idesc, + // enable_input_d; + * fn_ptr++ = reinterpret_cast( + static_cast( + cuda::ptx::tcgen05_mma_ws_tmem_a_collector_b3_lastuse)); + // tcgen05.mma.ws.cta_group::1.kind::i8.collector::b3::lastuse [d_tmem], [a_tmem], b_desc, idesc, + // enable_input_d; + * fn_ptr++ = reinterpret_cast( + static_cast( + cuda::ptx::tcgen05_mma_ws_tmem_a_collector_b3_lastuse));)); +#endif // __cccl_ptx_isa >= 860 + +#if __cccl_ptx_isa >= 860 + NV_IF_TARGET( + NV_HAS_FEATURE_SM_100a, + ( + // tcgen05.mma.ws.cta_group::1.kind::f16.collector::b3::discard [d_tmem], a_desc, b_desc, idesc, enable_input_d, + // zero_column_mask_desc; + * fn_ptr++ = reinterpret_cast( + static_cast( + cuda::ptx::tcgen05_mma_ws_collector_b3_discard)); + // tcgen05.mma.ws.cta_group::1.kind::tf32.collector::b3::discard [d_tmem], a_desc, b_desc, idesc, + // enable_input_d, zero_column_mask_desc; + * fn_ptr++ = reinterpret_cast( + static_cast( + cuda::ptx::tcgen05_mma_ws_collector_b3_discard)); + // tcgen05.mma.ws.cta_group::1.kind::f8f6f4.collector::b3::discard [d_tmem], a_desc, b_desc, idesc, + // enable_input_d, zero_column_mask_desc; + * fn_ptr++ = reinterpret_cast( + static_cast( + cuda::ptx::tcgen05_mma_ws_collector_b3_discard)); + // tcgen05.mma.ws.cta_group::1.kind::i8.collector::b3::discard [d_tmem], a_desc, b_desc, idesc, + // enable_input_d, zero_column_mask_desc; + * fn_ptr++ = reinterpret_cast( + static_cast( + cuda::ptx::tcgen05_mma_ws_collector_b3_discard));)); + NV_IF_TARGET( + NV_HAS_FEATURE_SM_101a, + ( + // tcgen05.mma.ws.cta_group::1.kind::f16.collector::b3::discard [d_tmem], a_desc, b_desc, idesc, enable_input_d, + // zero_column_mask_desc; + * fn_ptr++ = reinterpret_cast( + static_cast( + cuda::ptx::tcgen05_mma_ws_collector_b3_discard)); + // tcgen05.mma.ws.cta_group::1.kind::tf32.collector::b3::discard [d_tmem], a_desc, b_desc, idesc, + // enable_input_d, zero_column_mask_desc; + * fn_ptr++ = reinterpret_cast( + static_cast( + cuda::ptx::tcgen05_mma_ws_collector_b3_discard)); + // tcgen05.mma.ws.cta_group::1.kind::f8f6f4.collector::b3::discard [d_tmem], a_desc, b_desc, idesc, + // enable_input_d, zero_column_mask_desc; + * fn_ptr++ = reinterpret_cast( + static_cast( + cuda::ptx::tcgen05_mma_ws_collector_b3_discard)); + // tcgen05.mma.ws.cta_group::1.kind::i8.collector::b3::discard [d_tmem], a_desc, b_desc, idesc, + // enable_input_d, zero_column_mask_desc; + * fn_ptr++ = reinterpret_cast( + static_cast( + cuda::ptx::tcgen05_mma_ws_collector_b3_discard));)); +#endif // __cccl_ptx_isa >= 860 + +#if __cccl_ptx_isa >= 860 + NV_IF_TARGET( + NV_HAS_FEATURE_SM_100a, + ( + // tcgen05.mma.ws.cta_group::1.kind::f16.collector::b3::discard [d_tmem], a_desc, b_desc, idesc, enable_input_d; + * fn_ptr++ = reinterpret_cast( + static_cast( + cuda::ptx::tcgen05_mma_ws_collector_b3_discard)); + // tcgen05.mma.ws.cta_group::1.kind::tf32.collector::b3::discard [d_tmem], a_desc, b_desc, idesc, + // enable_input_d; + * fn_ptr++ = reinterpret_cast( + static_cast( + cuda::ptx::tcgen05_mma_ws_collector_b3_discard)); + // tcgen05.mma.ws.cta_group::1.kind::f8f6f4.collector::b3::discard [d_tmem], a_desc, b_desc, idesc, + // enable_input_d; + * fn_ptr++ = reinterpret_cast( + static_cast( + cuda::ptx::tcgen05_mma_ws_collector_b3_discard)); + // 
tcgen05.mma.ws.cta_group::1.kind::i8.collector::b3::discard [d_tmem], a_desc, b_desc, idesc, + // enable_input_d; + * fn_ptr++ = reinterpret_cast( + static_cast( + cuda::ptx::tcgen05_mma_ws_collector_b3_discard));)); + NV_IF_TARGET( + NV_HAS_FEATURE_SM_101a, + ( + // tcgen05.mma.ws.cta_group::1.kind::f16.collector::b3::discard [d_tmem], a_desc, b_desc, idesc, enable_input_d; + * fn_ptr++ = reinterpret_cast( + static_cast( + cuda::ptx::tcgen05_mma_ws_collector_b3_discard)); + // tcgen05.mma.ws.cta_group::1.kind::tf32.collector::b3::discard [d_tmem], a_desc, b_desc, idesc, + // enable_input_d; + * fn_ptr++ = reinterpret_cast( + static_cast( + cuda::ptx::tcgen05_mma_ws_collector_b3_discard)); + // tcgen05.mma.ws.cta_group::1.kind::f8f6f4.collector::b3::discard [d_tmem], a_desc, b_desc, idesc, + // enable_input_d; + * fn_ptr++ = reinterpret_cast( + static_cast( + cuda::ptx::tcgen05_mma_ws_collector_b3_discard)); + // tcgen05.mma.ws.cta_group::1.kind::i8.collector::b3::discard [d_tmem], a_desc, b_desc, idesc, + // enable_input_d; + * fn_ptr++ = reinterpret_cast( + static_cast( + cuda::ptx::tcgen05_mma_ws_collector_b3_discard));)); +#endif // __cccl_ptx_isa >= 860 + +#if __cccl_ptx_isa >= 860 + NV_IF_TARGET( + NV_HAS_FEATURE_SM_100a, + ( + // tcgen05.mma.ws.cta_group::1.kind::f16.collector::b3::discard [d_tmem], [a_tmem], b_desc, idesc, + // enable_input_d, zero_column_mask_desc; + * fn_ptr++ = reinterpret_cast( + static_cast( + cuda::ptx::tcgen05_mma_ws_tmem_a_collector_b3_discard)); + // tcgen05.mma.ws.cta_group::1.kind::tf32.collector::b3::discard [d_tmem], [a_tmem], b_desc, idesc, + // enable_input_d, zero_column_mask_desc; + * fn_ptr++ = reinterpret_cast( + static_cast( + cuda::ptx::tcgen05_mma_ws_tmem_a_collector_b3_discard)); + // tcgen05.mma.ws.cta_group::1.kind::f8f6f4.collector::b3::discard [d_tmem], [a_tmem], b_desc, idesc, + // enable_input_d, zero_column_mask_desc; + * fn_ptr++ = reinterpret_cast( + static_cast( + cuda::ptx::tcgen05_mma_ws_tmem_a_collector_b3_discard)); + // tcgen05.mma.ws.cta_group::1.kind::i8.collector::b3::discard [d_tmem], [a_tmem], b_desc, idesc, + // enable_input_d, zero_column_mask_desc; + * fn_ptr++ = reinterpret_cast( + static_cast( + cuda::ptx::tcgen05_mma_ws_tmem_a_collector_b3_discard));)); + NV_IF_TARGET( + NV_HAS_FEATURE_SM_101a, + ( + // tcgen05.mma.ws.cta_group::1.kind::f16.collector::b3::discard [d_tmem], [a_tmem], b_desc, idesc, + // enable_input_d, zero_column_mask_desc; + * fn_ptr++ = reinterpret_cast( + static_cast( + cuda::ptx::tcgen05_mma_ws_tmem_a_collector_b3_discard)); + // tcgen05.mma.ws.cta_group::1.kind::tf32.collector::b3::discard [d_tmem], [a_tmem], b_desc, idesc, + // enable_input_d, zero_column_mask_desc; + * fn_ptr++ = reinterpret_cast( + static_cast( + cuda::ptx::tcgen05_mma_ws_tmem_a_collector_b3_discard)); + // tcgen05.mma.ws.cta_group::1.kind::f8f6f4.collector::b3::discard [d_tmem], [a_tmem], b_desc, idesc, + // enable_input_d, zero_column_mask_desc; + * fn_ptr++ = reinterpret_cast( + static_cast( + cuda::ptx::tcgen05_mma_ws_tmem_a_collector_b3_discard)); + // tcgen05.mma.ws.cta_group::1.kind::i8.collector::b3::discard [d_tmem], [a_tmem], b_desc, idesc, + // enable_input_d, zero_column_mask_desc; + * fn_ptr++ = reinterpret_cast( + static_cast( + cuda::ptx::tcgen05_mma_ws_tmem_a_collector_b3_discard));)); +#endif // __cccl_ptx_isa >= 860 + +#if __cccl_ptx_isa >= 860 + NV_IF_TARGET( + NV_HAS_FEATURE_SM_100a, + ( + // tcgen05.mma.ws.cta_group::1.kind::f16.collector::b3::discard [d_tmem], [a_tmem], b_desc, idesc, + // 
enable_input_d; + * fn_ptr++ = reinterpret_cast( + static_cast( + cuda::ptx::tcgen05_mma_ws_tmem_a_collector_b3_discard)); + // tcgen05.mma.ws.cta_group::1.kind::tf32.collector::b3::discard [d_tmem], [a_tmem], b_desc, idesc, + // enable_input_d; + * fn_ptr++ = reinterpret_cast( + static_cast( + cuda::ptx::tcgen05_mma_ws_tmem_a_collector_b3_discard)); + // tcgen05.mma.ws.cta_group::1.kind::f8f6f4.collector::b3::discard [d_tmem], [a_tmem], b_desc, idesc, + // enable_input_d; + * fn_ptr++ = reinterpret_cast( + static_cast( + cuda::ptx::tcgen05_mma_ws_tmem_a_collector_b3_discard)); + // tcgen05.mma.ws.cta_group::1.kind::i8.collector::b3::discard [d_tmem], [a_tmem], b_desc, idesc, + // enable_input_d; + * fn_ptr++ = reinterpret_cast( + static_cast( + cuda::ptx::tcgen05_mma_ws_tmem_a_collector_b3_discard));)); + NV_IF_TARGET( + NV_HAS_FEATURE_SM_101a, + ( + // tcgen05.mma.ws.cta_group::1.kind::f16.collector::b3::discard [d_tmem], [a_tmem], b_desc, idesc, + // enable_input_d; + * fn_ptr++ = reinterpret_cast( + static_cast( + cuda::ptx::tcgen05_mma_ws_tmem_a_collector_b3_discard)); + // tcgen05.mma.ws.cta_group::1.kind::tf32.collector::b3::discard [d_tmem], [a_tmem], b_desc, idesc, + // enable_input_d; + * fn_ptr++ = reinterpret_cast( + static_cast( + cuda::ptx::tcgen05_mma_ws_tmem_a_collector_b3_discard)); + // tcgen05.mma.ws.cta_group::1.kind::f8f6f4.collector::b3::discard [d_tmem], [a_tmem], b_desc, idesc, + // enable_input_d; + * fn_ptr++ = reinterpret_cast( + static_cast( + cuda::ptx::tcgen05_mma_ws_tmem_a_collector_b3_discard)); + // tcgen05.mma.ws.cta_group::1.kind::i8.collector::b3::discard [d_tmem], [a_tmem], b_desc, idesc, + // enable_input_d; + * fn_ptr++ = reinterpret_cast( + static_cast( + cuda::ptx::tcgen05_mma_ws_tmem_a_collector_b3_discard));)); +#endif // __cccl_ptx_isa >= 860 +} diff --git a/libcudacxx/test/libcudacxx/cuda/ptx/generated/tcgen05_shift.h b/libcudacxx/test/libcudacxx/cuda/ptx/generated/tcgen05_shift.h new file mode 100644 index 00000000000..293d2787a87 --- /dev/null +++ b/libcudacxx/test/libcudacxx/cuda/ptx/generated/tcgen05_shift.h @@ -0,0 +1,39 @@ +// This file was automatically generated. Do not edit. + +// We use a special strategy to force the generation of the PTX. This is mainly +// a fight against dead-code-elimination in the NVVM layer. +// +// The reason we need this strategy is because certain older versions of ptxas +// segfault when a non-sensical sequence of PTX is generated. So instead, we try +// to force the instantiation and compilation to PTX of all the overloads of the +// PTX wrapping functions. +// +// We do this by writing a function pointer of each overload to the kernel +// parameter `fn_ptr`. +// +// Because `fn_ptr` is possibly visible outside this translation unit, the +// compiler must compile all the functions which are stored. 
+ +__global__ void test_tcgen05_shift(void** fn_ptr) +{ +#if __cccl_ptx_isa >= 860 + NV_IF_TARGET( + NV_HAS_FEATURE_SM_100a, + ( + // tcgen05.shift.cta_group::1.down [taddr]; + * fn_ptr++ = reinterpret_cast( + static_cast(cuda::ptx::tcgen05_shift_down)); + // tcgen05.shift.cta_group::2.down [taddr]; + * fn_ptr++ = reinterpret_cast( + static_cast(cuda::ptx::tcgen05_shift_down));)); + NV_IF_TARGET( + NV_HAS_FEATURE_SM_101a, + ( + // tcgen05.shift.cta_group::1.down [taddr]; + * fn_ptr++ = reinterpret_cast( + static_cast(cuda::ptx::tcgen05_shift_down)); + // tcgen05.shift.cta_group::2.down [taddr]; + * fn_ptr++ = reinterpret_cast( + static_cast(cuda::ptx::tcgen05_shift_down));)); +#endif // __cccl_ptx_isa >= 860 +} diff --git a/libcudacxx/test/libcudacxx/cuda/ptx/generated/tcgen05_st.h b/libcudacxx/test/libcudacxx/cuda/ptx/generated/tcgen05_st.h new file mode 100644 index 00000000000..ec8cb758e5d --- /dev/null +++ b/libcudacxx/test/libcudacxx/cuda/ptx/generated/tcgen05_st.h @@ -0,0 +1,1012 @@ +// This file was automatically generated. Do not edit. + +// We use a special strategy to force the generation of the PTX. This is mainly +// a fight against dead-code-elimination in the NVVM layer. +// +// The reason we need this strategy is because certain older versions of ptxas +// segfault when a non-sensical sequence of PTX is generated. So instead, we try +// to force the instantiation and compilation to PTX of all the overloads of the +// PTX wrapping functions. +// +// We do this by writing a function pointer of each overload to the kernel +// parameter `fn_ptr`. +// +// Because `fn_ptr` is possibly visible outside this translation unit, the +// compiler must compile all the functions which are stored. + +__global__ void test_tcgen05_st(void** fn_ptr) +{ +#if __cccl_ptx_isa >= 860 + NV_IF_TARGET(NV_HAS_FEATURE_SM_100a, + ( + // tcgen05.st.sync.aligned.16x64b.x1.b32 [taddr], values; + * fn_ptr++ = reinterpret_cast( + static_cast(cuda::ptx::tcgen05_st_16x64b));)); + NV_IF_TARGET(NV_HAS_FEATURE_SM_101a, + ( + // tcgen05.st.sync.aligned.16x64b.x1.b32 [taddr], values; + * fn_ptr++ = reinterpret_cast( + static_cast(cuda::ptx::tcgen05_st_16x64b));)); +#endif // __cccl_ptx_isa >= 860 + +#if __cccl_ptx_isa >= 860 + NV_IF_TARGET(NV_HAS_FEATURE_SM_100a, + ( + // tcgen05.st.sync.aligned.16x64b.x1.unpack::16b.b32 [taddr], values; + * fn_ptr++ = reinterpret_cast( + static_cast(cuda::ptx::tcgen05_st_16x64b_unpack_16b));)); + NV_IF_TARGET(NV_HAS_FEATURE_SM_101a, + ( + // tcgen05.st.sync.aligned.16x64b.x1.unpack::16b.b32 [taddr], values; + * fn_ptr++ = reinterpret_cast( + static_cast(cuda::ptx::tcgen05_st_16x64b_unpack_16b));)); +#endif // __cccl_ptx_isa >= 860 + +#if __cccl_ptx_isa >= 860 + NV_IF_TARGET(NV_HAS_FEATURE_SM_100a, + ( + // tcgen05.st.sync.aligned.16x64b.x2.b32 [taddr], values; + * fn_ptr++ = reinterpret_cast( + static_cast(cuda::ptx::tcgen05_st_16x64b));)); + NV_IF_TARGET(NV_HAS_FEATURE_SM_101a, + ( + // tcgen05.st.sync.aligned.16x64b.x2.b32 [taddr], values; + * fn_ptr++ = reinterpret_cast( + static_cast(cuda::ptx::tcgen05_st_16x64b));)); +#endif // __cccl_ptx_isa >= 860 + +#if __cccl_ptx_isa >= 860 + NV_IF_TARGET(NV_HAS_FEATURE_SM_100a, + ( + // tcgen05.st.sync.aligned.16x64b.x2.unpack::16b.b32 [taddr], values; + * fn_ptr++ = reinterpret_cast( + static_cast(cuda::ptx::tcgen05_st_16x64b_unpack_16b));)); + NV_IF_TARGET(NV_HAS_FEATURE_SM_101a, + ( + // tcgen05.st.sync.aligned.16x64b.x2.unpack::16b.b32 [taddr], values; + * fn_ptr++ = reinterpret_cast( + 
static_cast(cuda::ptx::tcgen05_st_16x64b_unpack_16b));)); +#endif // __cccl_ptx_isa >= 860 + +#if __cccl_ptx_isa >= 860 + NV_IF_TARGET(NV_HAS_FEATURE_SM_100a, + ( + // tcgen05.st.sync.aligned.16x64b.x4.b32 [taddr], values; + * fn_ptr++ = reinterpret_cast( + static_cast(cuda::ptx::tcgen05_st_16x64b));)); + NV_IF_TARGET(NV_HAS_FEATURE_SM_101a, + ( + // tcgen05.st.sync.aligned.16x64b.x4.b32 [taddr], values; + * fn_ptr++ = reinterpret_cast( + static_cast(cuda::ptx::tcgen05_st_16x64b));)); +#endif // __cccl_ptx_isa >= 860 + +#if __cccl_ptx_isa >= 860 + NV_IF_TARGET(NV_HAS_FEATURE_SM_100a, + ( + // tcgen05.st.sync.aligned.16x64b.x4.unpack::16b.b32 [taddr], values; + * fn_ptr++ = reinterpret_cast( + static_cast(cuda::ptx::tcgen05_st_16x64b_unpack_16b));)); + NV_IF_TARGET(NV_HAS_FEATURE_SM_101a, + ( + // tcgen05.st.sync.aligned.16x64b.x4.unpack::16b.b32 [taddr], values; + * fn_ptr++ = reinterpret_cast( + static_cast(cuda::ptx::tcgen05_st_16x64b_unpack_16b));)); +#endif // __cccl_ptx_isa >= 860 + +#if __cccl_ptx_isa >= 860 + NV_IF_TARGET(NV_HAS_FEATURE_SM_100a, + ( + // tcgen05.st.sync.aligned.16x64b.x8.b32 [taddr], values; + * fn_ptr++ = reinterpret_cast( + static_cast(cuda::ptx::tcgen05_st_16x64b));)); + NV_IF_TARGET(NV_HAS_FEATURE_SM_101a, + ( + // tcgen05.st.sync.aligned.16x64b.x8.b32 [taddr], values; + * fn_ptr++ = reinterpret_cast( + static_cast(cuda::ptx::tcgen05_st_16x64b));)); +#endif // __cccl_ptx_isa >= 860 + +#if __cccl_ptx_isa >= 860 + NV_IF_TARGET(NV_HAS_FEATURE_SM_100a, + ( + // tcgen05.st.sync.aligned.16x64b.x8.unpack::16b.b32 [taddr], values; + * fn_ptr++ = reinterpret_cast( + static_cast(cuda::ptx::tcgen05_st_16x64b_unpack_16b));)); + NV_IF_TARGET(NV_HAS_FEATURE_SM_101a, + ( + // tcgen05.st.sync.aligned.16x64b.x8.unpack::16b.b32 [taddr], values; + * fn_ptr++ = reinterpret_cast( + static_cast(cuda::ptx::tcgen05_st_16x64b_unpack_16b));)); +#endif // __cccl_ptx_isa >= 860 + +#if __cccl_ptx_isa >= 860 + NV_IF_TARGET(NV_HAS_FEATURE_SM_100a, + ( + // tcgen05.st.sync.aligned.16x64b.x16.b32 [taddr], values; + * fn_ptr++ = reinterpret_cast( + static_cast(cuda::ptx::tcgen05_st_16x64b));)); + NV_IF_TARGET(NV_HAS_FEATURE_SM_101a, + ( + // tcgen05.st.sync.aligned.16x64b.x16.b32 [taddr], values; + * fn_ptr++ = reinterpret_cast( + static_cast(cuda::ptx::tcgen05_st_16x64b));)); +#endif // __cccl_ptx_isa >= 860 + +#if __cccl_ptx_isa >= 860 + NV_IF_TARGET(NV_HAS_FEATURE_SM_100a, + ( + // tcgen05.st.sync.aligned.16x64b.x16.unpack::16b.b32 [taddr], values; + * fn_ptr++ = reinterpret_cast( + static_cast(cuda::ptx::tcgen05_st_16x64b_unpack_16b));)); + NV_IF_TARGET(NV_HAS_FEATURE_SM_101a, + ( + // tcgen05.st.sync.aligned.16x64b.x16.unpack::16b.b32 [taddr], values; + * fn_ptr++ = reinterpret_cast( + static_cast(cuda::ptx::tcgen05_st_16x64b_unpack_16b));)); +#endif // __cccl_ptx_isa >= 860 + +#if __cccl_ptx_isa >= 860 + NV_IF_TARGET(NV_HAS_FEATURE_SM_100a, + ( + // tcgen05.st.sync.aligned.16x64b.x32.b32 [taddr], values; + * fn_ptr++ = reinterpret_cast( + static_cast(cuda::ptx::tcgen05_st_16x64b));)); + NV_IF_TARGET(NV_HAS_FEATURE_SM_101a, + ( + // tcgen05.st.sync.aligned.16x64b.x32.b32 [taddr], values; + * fn_ptr++ = reinterpret_cast( + static_cast(cuda::ptx::tcgen05_st_16x64b));)); +#endif // __cccl_ptx_isa >= 860 + +#if __cccl_ptx_isa >= 860 + NV_IF_TARGET(NV_HAS_FEATURE_SM_100a, + ( + // tcgen05.st.sync.aligned.16x64b.x32.unpack::16b.b32 [taddr], values; + * fn_ptr++ = reinterpret_cast( + static_cast(cuda::ptx::tcgen05_st_16x64b_unpack_16b));)); + NV_IF_TARGET(NV_HAS_FEATURE_SM_101a, + ( + // 
tcgen05.st.sync.aligned.16x64b.x32.unpack::16b.b32 [taddr], values; + * fn_ptr++ = reinterpret_cast( + static_cast(cuda::ptx::tcgen05_st_16x64b_unpack_16b));)); +#endif // __cccl_ptx_isa >= 860 + +#if __cccl_ptx_isa >= 860 + NV_IF_TARGET(NV_HAS_FEATURE_SM_100a, + ( + // tcgen05.st.sync.aligned.16x64b.x64.b32 [taddr], values; + * fn_ptr++ = reinterpret_cast( + static_cast(cuda::ptx::tcgen05_st_16x64b));)); + NV_IF_TARGET(NV_HAS_FEATURE_SM_101a, + ( + // tcgen05.st.sync.aligned.16x64b.x64.b32 [taddr], values; + * fn_ptr++ = reinterpret_cast( + static_cast(cuda::ptx::tcgen05_st_16x64b));)); +#endif // __cccl_ptx_isa >= 860 + +#if __cccl_ptx_isa >= 860 + NV_IF_TARGET(NV_HAS_FEATURE_SM_100a, + ( + // tcgen05.st.sync.aligned.16x64b.x64.unpack::16b.b32 [taddr], values; + * fn_ptr++ = reinterpret_cast( + static_cast(cuda::ptx::tcgen05_st_16x64b_unpack_16b));)); + NV_IF_TARGET(NV_HAS_FEATURE_SM_101a, + ( + // tcgen05.st.sync.aligned.16x64b.x64.unpack::16b.b32 [taddr], values; + * fn_ptr++ = reinterpret_cast( + static_cast(cuda::ptx::tcgen05_st_16x64b_unpack_16b));)); +#endif // __cccl_ptx_isa >= 860 + +#if __cccl_ptx_isa >= 860 + NV_IF_TARGET(NV_HAS_FEATURE_SM_100a, + ( + // tcgen05.st.sync.aligned.16x64b.x128.b32 [taddr], values; + * fn_ptr++ = reinterpret_cast( + static_cast(cuda::ptx::tcgen05_st_16x64b));)); + NV_IF_TARGET(NV_HAS_FEATURE_SM_101a, + ( + // tcgen05.st.sync.aligned.16x64b.x128.b32 [taddr], values; + * fn_ptr++ = reinterpret_cast( + static_cast(cuda::ptx::tcgen05_st_16x64b));)); +#endif // __cccl_ptx_isa >= 860 + +#if __cccl_ptx_isa >= 860 + NV_IF_TARGET(NV_HAS_FEATURE_SM_100a, + ( + // tcgen05.st.sync.aligned.16x64b.x128.unpack::16b.b32 [taddr], values; + * fn_ptr++ = reinterpret_cast(static_cast( + cuda::ptx::tcgen05_st_16x64b_unpack_16b));)); + NV_IF_TARGET(NV_HAS_FEATURE_SM_101a, + ( + // tcgen05.st.sync.aligned.16x64b.x128.unpack::16b.b32 [taddr], values; + * fn_ptr++ = reinterpret_cast(static_cast( + cuda::ptx::tcgen05_st_16x64b_unpack_16b));)); +#endif // __cccl_ptx_isa >= 860 + +#if __cccl_ptx_isa >= 860 + NV_IF_TARGET(NV_HAS_FEATURE_SM_100a, + ( + // tcgen05.st.sync.aligned.16x128b.x1.b32 [taddr], values; + * fn_ptr++ = reinterpret_cast( + static_cast(cuda::ptx::tcgen05_st_16x128b));)); + NV_IF_TARGET(NV_HAS_FEATURE_SM_101a, + ( + // tcgen05.st.sync.aligned.16x128b.x1.b32 [taddr], values; + * fn_ptr++ = reinterpret_cast( + static_cast(cuda::ptx::tcgen05_st_16x128b));)); +#endif // __cccl_ptx_isa >= 860 + +#if __cccl_ptx_isa >= 860 + NV_IF_TARGET(NV_HAS_FEATURE_SM_100a, + ( + // tcgen05.st.sync.aligned.16x128b.x1.unpack::16b.b32 [taddr], values; + * fn_ptr++ = reinterpret_cast( + static_cast(cuda::ptx::tcgen05_st_16x128b_unpack_16b));)); + NV_IF_TARGET(NV_HAS_FEATURE_SM_101a, + ( + // tcgen05.st.sync.aligned.16x128b.x1.unpack::16b.b32 [taddr], values; + * fn_ptr++ = reinterpret_cast( + static_cast(cuda::ptx::tcgen05_st_16x128b_unpack_16b));)); +#endif // __cccl_ptx_isa >= 860 + +#if __cccl_ptx_isa >= 860 + NV_IF_TARGET(NV_HAS_FEATURE_SM_100a, + ( + // tcgen05.st.sync.aligned.16x128b.x2.b32 [taddr], values; + * fn_ptr++ = reinterpret_cast( + static_cast(cuda::ptx::tcgen05_st_16x128b));)); + NV_IF_TARGET(NV_HAS_FEATURE_SM_101a, + ( + // tcgen05.st.sync.aligned.16x128b.x2.b32 [taddr], values; + * fn_ptr++ = reinterpret_cast( + static_cast(cuda::ptx::tcgen05_st_16x128b));)); +#endif // __cccl_ptx_isa >= 860 + +#if __cccl_ptx_isa >= 860 + NV_IF_TARGET(NV_HAS_FEATURE_SM_100a, + ( + // tcgen05.st.sync.aligned.16x128b.x2.unpack::16b.b32 [taddr], values; + * fn_ptr++ = 
reinterpret_cast( + static_cast(cuda::ptx::tcgen05_st_16x128b_unpack_16b));)); + NV_IF_TARGET(NV_HAS_FEATURE_SM_101a, + ( + // tcgen05.st.sync.aligned.16x128b.x2.unpack::16b.b32 [taddr], values; + * fn_ptr++ = reinterpret_cast( + static_cast(cuda::ptx::tcgen05_st_16x128b_unpack_16b));)); +#endif // __cccl_ptx_isa >= 860 + +#if __cccl_ptx_isa >= 860 + NV_IF_TARGET(NV_HAS_FEATURE_SM_100a, + ( + // tcgen05.st.sync.aligned.16x128b.x4.b32 [taddr], values; + * fn_ptr++ = reinterpret_cast( + static_cast(cuda::ptx::tcgen05_st_16x128b));)); + NV_IF_TARGET(NV_HAS_FEATURE_SM_101a, + ( + // tcgen05.st.sync.aligned.16x128b.x4.b32 [taddr], values; + * fn_ptr++ = reinterpret_cast( + static_cast(cuda::ptx::tcgen05_st_16x128b));)); +#endif // __cccl_ptx_isa >= 860 + +#if __cccl_ptx_isa >= 860 + NV_IF_TARGET(NV_HAS_FEATURE_SM_100a, + ( + // tcgen05.st.sync.aligned.16x128b.x4.unpack::16b.b32 [taddr], values; + * fn_ptr++ = reinterpret_cast( + static_cast(cuda::ptx::tcgen05_st_16x128b_unpack_16b));)); + NV_IF_TARGET(NV_HAS_FEATURE_SM_101a, + ( + // tcgen05.st.sync.aligned.16x128b.x4.unpack::16b.b32 [taddr], values; + * fn_ptr++ = reinterpret_cast( + static_cast(cuda::ptx::tcgen05_st_16x128b_unpack_16b));)); +#endif // __cccl_ptx_isa >= 860 + +#if __cccl_ptx_isa >= 860 + NV_IF_TARGET(NV_HAS_FEATURE_SM_100a, + ( + // tcgen05.st.sync.aligned.16x128b.x8.b32 [taddr], values; + * fn_ptr++ = reinterpret_cast( + static_cast(cuda::ptx::tcgen05_st_16x128b));)); + NV_IF_TARGET(NV_HAS_FEATURE_SM_101a, + ( + // tcgen05.st.sync.aligned.16x128b.x8.b32 [taddr], values; + * fn_ptr++ = reinterpret_cast( + static_cast(cuda::ptx::tcgen05_st_16x128b));)); +#endif // __cccl_ptx_isa >= 860 + +#if __cccl_ptx_isa >= 860 + NV_IF_TARGET(NV_HAS_FEATURE_SM_100a, + ( + // tcgen05.st.sync.aligned.16x128b.x8.unpack::16b.b32 [taddr], values; + * fn_ptr++ = reinterpret_cast(static_cast( + cuda::ptx::tcgen05_st_16x128b_unpack_16b));)); + NV_IF_TARGET(NV_HAS_FEATURE_SM_101a, + ( + // tcgen05.st.sync.aligned.16x128b.x8.unpack::16b.b32 [taddr], values; + * fn_ptr++ = reinterpret_cast(static_cast( + cuda::ptx::tcgen05_st_16x128b_unpack_16b));)); +#endif // __cccl_ptx_isa >= 860 + +#if __cccl_ptx_isa >= 860 + NV_IF_TARGET(NV_HAS_FEATURE_SM_100a, + ( + // tcgen05.st.sync.aligned.16x128b.x16.b32 [taddr], values; + * fn_ptr++ = reinterpret_cast( + static_cast(cuda::ptx::tcgen05_st_16x128b));)); + NV_IF_TARGET(NV_HAS_FEATURE_SM_101a, + ( + // tcgen05.st.sync.aligned.16x128b.x16.b32 [taddr], values; + * fn_ptr++ = reinterpret_cast( + static_cast(cuda::ptx::tcgen05_st_16x128b));)); +#endif // __cccl_ptx_isa >= 860 + +#if __cccl_ptx_isa >= 860 + NV_IF_TARGET(NV_HAS_FEATURE_SM_100a, + ( + // tcgen05.st.sync.aligned.16x128b.x16.unpack::16b.b32 [taddr], values; + * fn_ptr++ = reinterpret_cast(static_cast( + cuda::ptx::tcgen05_st_16x128b_unpack_16b));)); + NV_IF_TARGET(NV_HAS_FEATURE_SM_101a, + ( + // tcgen05.st.sync.aligned.16x128b.x16.unpack::16b.b32 [taddr], values; + * fn_ptr++ = reinterpret_cast(static_cast( + cuda::ptx::tcgen05_st_16x128b_unpack_16b));)); +#endif // __cccl_ptx_isa >= 860 + +#if __cccl_ptx_isa >= 860 + NV_IF_TARGET(NV_HAS_FEATURE_SM_100a, + ( + // tcgen05.st.sync.aligned.16x128b.x32.b32 [taddr], values; + * fn_ptr++ = reinterpret_cast( + static_cast(cuda::ptx::tcgen05_st_16x128b));)); + NV_IF_TARGET(NV_HAS_FEATURE_SM_101a, + ( + // tcgen05.st.sync.aligned.16x128b.x32.b32 [taddr], values; + * fn_ptr++ = reinterpret_cast( + static_cast(cuda::ptx::tcgen05_st_16x128b));)); +#endif // __cccl_ptx_isa >= 860 + +#if __cccl_ptx_isa >= 860 + 
NV_IF_TARGET(NV_HAS_FEATURE_SM_100a, + ( + // tcgen05.st.sync.aligned.16x128b.x32.unpack::16b.b32 [taddr], values; + * fn_ptr++ = reinterpret_cast(static_cast( + cuda::ptx::tcgen05_st_16x128b_unpack_16b));)); + NV_IF_TARGET(NV_HAS_FEATURE_SM_101a, + ( + // tcgen05.st.sync.aligned.16x128b.x32.unpack::16b.b32 [taddr], values; + * fn_ptr++ = reinterpret_cast(static_cast( + cuda::ptx::tcgen05_st_16x128b_unpack_16b));)); +#endif // __cccl_ptx_isa >= 860 + +#if __cccl_ptx_isa >= 860 + NV_IF_TARGET(NV_HAS_FEATURE_SM_100a, + ( + // tcgen05.st.sync.aligned.16x128b.x64.b32 [taddr], values; + * fn_ptr++ = reinterpret_cast( + static_cast(cuda::ptx::tcgen05_st_16x128b));)); + NV_IF_TARGET(NV_HAS_FEATURE_SM_101a, + ( + // tcgen05.st.sync.aligned.16x128b.x64.b32 [taddr], values; + * fn_ptr++ = reinterpret_cast( + static_cast(cuda::ptx::tcgen05_st_16x128b));)); +#endif // __cccl_ptx_isa >= 860 + +#if __cccl_ptx_isa >= 860 + NV_IF_TARGET(NV_HAS_FEATURE_SM_100a, + ( + // tcgen05.st.sync.aligned.16x128b.x64.unpack::16b.b32 [taddr], values; + * fn_ptr++ = reinterpret_cast(static_cast( + cuda::ptx::tcgen05_st_16x128b_unpack_16b));)); + NV_IF_TARGET(NV_HAS_FEATURE_SM_101a, + ( + // tcgen05.st.sync.aligned.16x128b.x64.unpack::16b.b32 [taddr], values; + * fn_ptr++ = reinterpret_cast(static_cast( + cuda::ptx::tcgen05_st_16x128b_unpack_16b));)); +#endif // __cccl_ptx_isa >= 860 + +#if __cccl_ptx_isa >= 860 + NV_IF_TARGET(NV_HAS_FEATURE_SM_100a, + ( + // tcgen05.st.sync.aligned.16x256b.x1.b32 [taddr], values; + * fn_ptr++ = reinterpret_cast( + static_cast(cuda::ptx::tcgen05_st_16x256b));)); + NV_IF_TARGET(NV_HAS_FEATURE_SM_101a, + ( + // tcgen05.st.sync.aligned.16x256b.x1.b32 [taddr], values; + * fn_ptr++ = reinterpret_cast( + static_cast(cuda::ptx::tcgen05_st_16x256b));)); +#endif // __cccl_ptx_isa >= 860 + +#if __cccl_ptx_isa >= 860 + NV_IF_TARGET(NV_HAS_FEATURE_SM_100a, + ( + // tcgen05.st.sync.aligned.16x256b.x1.unpack::16b.b32 [taddr], values; + * fn_ptr++ = reinterpret_cast( + static_cast(cuda::ptx::tcgen05_st_16x256b_unpack_16b));)); + NV_IF_TARGET(NV_HAS_FEATURE_SM_101a, + ( + // tcgen05.st.sync.aligned.16x256b.x1.unpack::16b.b32 [taddr], values; + * fn_ptr++ = reinterpret_cast( + static_cast(cuda::ptx::tcgen05_st_16x256b_unpack_16b));)); +#endif // __cccl_ptx_isa >= 860 + +#if __cccl_ptx_isa >= 860 + NV_IF_TARGET(NV_HAS_FEATURE_SM_100a, + ( + // tcgen05.st.sync.aligned.16x256b.x2.b32 [taddr], values; + * fn_ptr++ = reinterpret_cast( + static_cast(cuda::ptx::tcgen05_st_16x256b));)); + NV_IF_TARGET(NV_HAS_FEATURE_SM_101a, + ( + // tcgen05.st.sync.aligned.16x256b.x2.b32 [taddr], values; + * fn_ptr++ = reinterpret_cast( + static_cast(cuda::ptx::tcgen05_st_16x256b));)); +#endif // __cccl_ptx_isa >= 860 + +#if __cccl_ptx_isa >= 860 + NV_IF_TARGET(NV_HAS_FEATURE_SM_100a, + ( + // tcgen05.st.sync.aligned.16x256b.x2.unpack::16b.b32 [taddr], values; + * fn_ptr++ = reinterpret_cast( + static_cast(cuda::ptx::tcgen05_st_16x256b_unpack_16b));)); + NV_IF_TARGET(NV_HAS_FEATURE_SM_101a, + ( + // tcgen05.st.sync.aligned.16x256b.x2.unpack::16b.b32 [taddr], values; + * fn_ptr++ = reinterpret_cast( + static_cast(cuda::ptx::tcgen05_st_16x256b_unpack_16b));)); +#endif // __cccl_ptx_isa >= 860 + +#if __cccl_ptx_isa >= 860 + NV_IF_TARGET(NV_HAS_FEATURE_SM_100a, + ( + // tcgen05.st.sync.aligned.16x256b.x4.b32 [taddr], values; + * fn_ptr++ = reinterpret_cast( + static_cast(cuda::ptx::tcgen05_st_16x256b));)); + NV_IF_TARGET(NV_HAS_FEATURE_SM_101a, + ( + // tcgen05.st.sync.aligned.16x256b.x4.b32 [taddr], values; + * fn_ptr++ = 
reinterpret_cast( + static_cast(cuda::ptx::tcgen05_st_16x256b));)); +#endif // __cccl_ptx_isa >= 860 + +#if __cccl_ptx_isa >= 860 + NV_IF_TARGET(NV_HAS_FEATURE_SM_100a, + ( + // tcgen05.st.sync.aligned.16x256b.x4.unpack::16b.b32 [taddr], values; + * fn_ptr++ = reinterpret_cast(static_cast( + cuda::ptx::tcgen05_st_16x256b_unpack_16b));)); + NV_IF_TARGET(NV_HAS_FEATURE_SM_101a, + ( + // tcgen05.st.sync.aligned.16x256b.x4.unpack::16b.b32 [taddr], values; + * fn_ptr++ = reinterpret_cast(static_cast( + cuda::ptx::tcgen05_st_16x256b_unpack_16b));)); +#endif // __cccl_ptx_isa >= 860 + +#if __cccl_ptx_isa >= 860 + NV_IF_TARGET(NV_HAS_FEATURE_SM_100a, + ( + // tcgen05.st.sync.aligned.16x256b.x8.b32 [taddr], values; + * fn_ptr++ = reinterpret_cast( + static_cast(cuda::ptx::tcgen05_st_16x256b));)); + NV_IF_TARGET(NV_HAS_FEATURE_SM_101a, + ( + // tcgen05.st.sync.aligned.16x256b.x8.b32 [taddr], values; + * fn_ptr++ = reinterpret_cast( + static_cast(cuda::ptx::tcgen05_st_16x256b));)); +#endif // __cccl_ptx_isa >= 860 + +#if __cccl_ptx_isa >= 860 + NV_IF_TARGET(NV_HAS_FEATURE_SM_100a, + ( + // tcgen05.st.sync.aligned.16x256b.x8.unpack::16b.b32 [taddr], values; + * fn_ptr++ = reinterpret_cast(static_cast( + cuda::ptx::tcgen05_st_16x256b_unpack_16b));)); + NV_IF_TARGET(NV_HAS_FEATURE_SM_101a, + ( + // tcgen05.st.sync.aligned.16x256b.x8.unpack::16b.b32 [taddr], values; + * fn_ptr++ = reinterpret_cast(static_cast( + cuda::ptx::tcgen05_st_16x256b_unpack_16b));)); +#endif // __cccl_ptx_isa >= 860 + +#if __cccl_ptx_isa >= 860 + NV_IF_TARGET(NV_HAS_FEATURE_SM_100a, + ( + // tcgen05.st.sync.aligned.16x256b.x16.b32 [taddr], values; + * fn_ptr++ = reinterpret_cast( + static_cast(cuda::ptx::tcgen05_st_16x256b));)); + NV_IF_TARGET(NV_HAS_FEATURE_SM_101a, + ( + // tcgen05.st.sync.aligned.16x256b.x16.b32 [taddr], values; + * fn_ptr++ = reinterpret_cast( + static_cast(cuda::ptx::tcgen05_st_16x256b));)); +#endif // __cccl_ptx_isa >= 860 + +#if __cccl_ptx_isa >= 860 + NV_IF_TARGET(NV_HAS_FEATURE_SM_100a, + ( + // tcgen05.st.sync.aligned.16x256b.x16.unpack::16b.b32 [taddr], values; + * fn_ptr++ = reinterpret_cast(static_cast( + cuda::ptx::tcgen05_st_16x256b_unpack_16b));)); + NV_IF_TARGET(NV_HAS_FEATURE_SM_101a, + ( + // tcgen05.st.sync.aligned.16x256b.x16.unpack::16b.b32 [taddr], values; + * fn_ptr++ = reinterpret_cast(static_cast( + cuda::ptx::tcgen05_st_16x256b_unpack_16b));)); +#endif // __cccl_ptx_isa >= 860 + +#if __cccl_ptx_isa >= 860 + NV_IF_TARGET(NV_HAS_FEATURE_SM_100a, + ( + // tcgen05.st.sync.aligned.16x256b.x32.b32 [taddr], values; + * fn_ptr++ = reinterpret_cast( + static_cast(cuda::ptx::tcgen05_st_16x256b));)); + NV_IF_TARGET(NV_HAS_FEATURE_SM_101a, + ( + // tcgen05.st.sync.aligned.16x256b.x32.b32 [taddr], values; + * fn_ptr++ = reinterpret_cast( + static_cast(cuda::ptx::tcgen05_st_16x256b));)); +#endif // __cccl_ptx_isa >= 860 + +#if __cccl_ptx_isa >= 860 + NV_IF_TARGET(NV_HAS_FEATURE_SM_100a, + ( + // tcgen05.st.sync.aligned.16x256b.x32.unpack::16b.b32 [taddr], values; + * fn_ptr++ = reinterpret_cast(static_cast( + cuda::ptx::tcgen05_st_16x256b_unpack_16b));)); + NV_IF_TARGET(NV_HAS_FEATURE_SM_101a, + ( + // tcgen05.st.sync.aligned.16x256b.x32.unpack::16b.b32 [taddr], values; + * fn_ptr++ = reinterpret_cast(static_cast( + cuda::ptx::tcgen05_st_16x256b_unpack_16b));)); +#endif // __cccl_ptx_isa >= 860 + +#if __cccl_ptx_isa >= 860 + NV_IF_TARGET(NV_HAS_FEATURE_SM_100a, + ( + // tcgen05.st.sync.aligned.32x32b.x1.b32 [taddr], values; + * fn_ptr++ = reinterpret_cast( + 
static_cast(cuda::ptx::tcgen05_st_32x32b));)); + NV_IF_TARGET(NV_HAS_FEATURE_SM_101a, + ( + // tcgen05.st.sync.aligned.32x32b.x1.b32 [taddr], values; + * fn_ptr++ = reinterpret_cast( + static_cast(cuda::ptx::tcgen05_st_32x32b));)); +#endif // __cccl_ptx_isa >= 860 + +#if __cccl_ptx_isa >= 860 + NV_IF_TARGET(NV_HAS_FEATURE_SM_100a, + ( + // tcgen05.st.sync.aligned.32x32b.x1.unpack::16b.b32 [taddr], values; + * fn_ptr++ = reinterpret_cast( + static_cast(cuda::ptx::tcgen05_st_32x32b_unpack_16b));)); + NV_IF_TARGET(NV_HAS_FEATURE_SM_101a, + ( + // tcgen05.st.sync.aligned.32x32b.x1.unpack::16b.b32 [taddr], values; + * fn_ptr++ = reinterpret_cast( + static_cast(cuda::ptx::tcgen05_st_32x32b_unpack_16b));)); +#endif // __cccl_ptx_isa >= 860 + +#if __cccl_ptx_isa >= 860 + NV_IF_TARGET(NV_HAS_FEATURE_SM_100a, + ( + // tcgen05.st.sync.aligned.32x32b.x2.b32 [taddr], values; + * fn_ptr++ = reinterpret_cast( + static_cast(cuda::ptx::tcgen05_st_32x32b));)); + NV_IF_TARGET(NV_HAS_FEATURE_SM_101a, + ( + // tcgen05.st.sync.aligned.32x32b.x2.b32 [taddr], values; + * fn_ptr++ = reinterpret_cast( + static_cast(cuda::ptx::tcgen05_st_32x32b));)); +#endif // __cccl_ptx_isa >= 860 + +#if __cccl_ptx_isa >= 860 + NV_IF_TARGET(NV_HAS_FEATURE_SM_100a, + ( + // tcgen05.st.sync.aligned.32x32b.x2.unpack::16b.b32 [taddr], values; + * fn_ptr++ = reinterpret_cast( + static_cast(cuda::ptx::tcgen05_st_32x32b_unpack_16b));)); + NV_IF_TARGET(NV_HAS_FEATURE_SM_101a, + ( + // tcgen05.st.sync.aligned.32x32b.x2.unpack::16b.b32 [taddr], values; + * fn_ptr++ = reinterpret_cast( + static_cast(cuda::ptx::tcgen05_st_32x32b_unpack_16b));)); +#endif // __cccl_ptx_isa >= 860 + +#if __cccl_ptx_isa >= 860 + NV_IF_TARGET(NV_HAS_FEATURE_SM_100a, + ( + // tcgen05.st.sync.aligned.32x32b.x4.b32 [taddr], values; + * fn_ptr++ = reinterpret_cast( + static_cast(cuda::ptx::tcgen05_st_32x32b));)); + NV_IF_TARGET(NV_HAS_FEATURE_SM_101a, + ( + // tcgen05.st.sync.aligned.32x32b.x4.b32 [taddr], values; + * fn_ptr++ = reinterpret_cast( + static_cast(cuda::ptx::tcgen05_st_32x32b));)); +#endif // __cccl_ptx_isa >= 860 + +#if __cccl_ptx_isa >= 860 + NV_IF_TARGET(NV_HAS_FEATURE_SM_100a, + ( + // tcgen05.st.sync.aligned.32x32b.x4.unpack::16b.b32 [taddr], values; + * fn_ptr++ = reinterpret_cast( + static_cast(cuda::ptx::tcgen05_st_32x32b_unpack_16b));)); + NV_IF_TARGET(NV_HAS_FEATURE_SM_101a, + ( + // tcgen05.st.sync.aligned.32x32b.x4.unpack::16b.b32 [taddr], values; + * fn_ptr++ = reinterpret_cast( + static_cast(cuda::ptx::tcgen05_st_32x32b_unpack_16b));)); +#endif // __cccl_ptx_isa >= 860 + +#if __cccl_ptx_isa >= 860 + NV_IF_TARGET(NV_HAS_FEATURE_SM_100a, + ( + // tcgen05.st.sync.aligned.32x32b.x8.b32 [taddr], values; + * fn_ptr++ = reinterpret_cast( + static_cast(cuda::ptx::tcgen05_st_32x32b));)); + NV_IF_TARGET(NV_HAS_FEATURE_SM_101a, + ( + // tcgen05.st.sync.aligned.32x32b.x8.b32 [taddr], values; + * fn_ptr++ = reinterpret_cast( + static_cast(cuda::ptx::tcgen05_st_32x32b));)); +#endif // __cccl_ptx_isa >= 860 + +#if __cccl_ptx_isa >= 860 + NV_IF_TARGET(NV_HAS_FEATURE_SM_100a, + ( + // tcgen05.st.sync.aligned.32x32b.x8.unpack::16b.b32 [taddr], values; + * fn_ptr++ = reinterpret_cast( + static_cast(cuda::ptx::tcgen05_st_32x32b_unpack_16b));)); + NV_IF_TARGET(NV_HAS_FEATURE_SM_101a, + ( + // tcgen05.st.sync.aligned.32x32b.x8.unpack::16b.b32 [taddr], values; + * fn_ptr++ = reinterpret_cast( + static_cast(cuda::ptx::tcgen05_st_32x32b_unpack_16b));)); +#endif // __cccl_ptx_isa >= 860 + +#if __cccl_ptx_isa >= 860 + NV_IF_TARGET(NV_HAS_FEATURE_SM_100a, + ( + // 
tcgen05.st.sync.aligned.32x32b.x16.b32 [taddr], values; + * fn_ptr++ = reinterpret_cast( + static_cast(cuda::ptx::tcgen05_st_32x32b));)); + NV_IF_TARGET(NV_HAS_FEATURE_SM_101a, + ( + // tcgen05.st.sync.aligned.32x32b.x16.b32 [taddr], values; + * fn_ptr++ = reinterpret_cast( + static_cast(cuda::ptx::tcgen05_st_32x32b));)); +#endif // __cccl_ptx_isa >= 860 + +#if __cccl_ptx_isa >= 860 + NV_IF_TARGET(NV_HAS_FEATURE_SM_100a, + ( + // tcgen05.st.sync.aligned.32x32b.x16.unpack::16b.b32 [taddr], values; + * fn_ptr++ = reinterpret_cast( + static_cast(cuda::ptx::tcgen05_st_32x32b_unpack_16b));)); + NV_IF_TARGET(NV_HAS_FEATURE_SM_101a, + ( + // tcgen05.st.sync.aligned.32x32b.x16.unpack::16b.b32 [taddr], values; + * fn_ptr++ = reinterpret_cast( + static_cast(cuda::ptx::tcgen05_st_32x32b_unpack_16b));)); +#endif // __cccl_ptx_isa >= 860 + +#if __cccl_ptx_isa >= 860 + NV_IF_TARGET(NV_HAS_FEATURE_SM_100a, + ( + // tcgen05.st.sync.aligned.32x32b.x32.b32 [taddr], values; + * fn_ptr++ = reinterpret_cast( + static_cast(cuda::ptx::tcgen05_st_32x32b));)); + NV_IF_TARGET(NV_HAS_FEATURE_SM_101a, + ( + // tcgen05.st.sync.aligned.32x32b.x32.b32 [taddr], values; + * fn_ptr++ = reinterpret_cast( + static_cast(cuda::ptx::tcgen05_st_32x32b));)); +#endif // __cccl_ptx_isa >= 860 + +#if __cccl_ptx_isa >= 860 + NV_IF_TARGET(NV_HAS_FEATURE_SM_100a, + ( + // tcgen05.st.sync.aligned.32x32b.x32.unpack::16b.b32 [taddr], values; + * fn_ptr++ = reinterpret_cast( + static_cast(cuda::ptx::tcgen05_st_32x32b_unpack_16b));)); + NV_IF_TARGET(NV_HAS_FEATURE_SM_101a, + ( + // tcgen05.st.sync.aligned.32x32b.x32.unpack::16b.b32 [taddr], values; + * fn_ptr++ = reinterpret_cast( + static_cast(cuda::ptx::tcgen05_st_32x32b_unpack_16b));)); +#endif // __cccl_ptx_isa >= 860 + +#if __cccl_ptx_isa >= 860 + NV_IF_TARGET(NV_HAS_FEATURE_SM_100a, + ( + // tcgen05.st.sync.aligned.32x32b.x64.b32 [taddr], values; + * fn_ptr++ = reinterpret_cast( + static_cast(cuda::ptx::tcgen05_st_32x32b));)); + NV_IF_TARGET(NV_HAS_FEATURE_SM_101a, + ( + // tcgen05.st.sync.aligned.32x32b.x64.b32 [taddr], values; + * fn_ptr++ = reinterpret_cast( + static_cast(cuda::ptx::tcgen05_st_32x32b));)); +#endif // __cccl_ptx_isa >= 860 + +#if __cccl_ptx_isa >= 860 + NV_IF_TARGET(NV_HAS_FEATURE_SM_100a, + ( + // tcgen05.st.sync.aligned.32x32b.x64.unpack::16b.b32 [taddr], values; + * fn_ptr++ = reinterpret_cast( + static_cast(cuda::ptx::tcgen05_st_32x32b_unpack_16b));)); + NV_IF_TARGET(NV_HAS_FEATURE_SM_101a, + ( + // tcgen05.st.sync.aligned.32x32b.x64.unpack::16b.b32 [taddr], values; + * fn_ptr++ = reinterpret_cast( + static_cast(cuda::ptx::tcgen05_st_32x32b_unpack_16b));)); +#endif // __cccl_ptx_isa >= 860 + +#if __cccl_ptx_isa >= 860 + NV_IF_TARGET(NV_HAS_FEATURE_SM_100a, + ( + // tcgen05.st.sync.aligned.32x32b.x128.b32 [taddr], values; + * fn_ptr++ = reinterpret_cast( + static_cast(cuda::ptx::tcgen05_st_32x32b));)); + NV_IF_TARGET(NV_HAS_FEATURE_SM_101a, + ( + // tcgen05.st.sync.aligned.32x32b.x128.b32 [taddr], values; + * fn_ptr++ = reinterpret_cast( + static_cast(cuda::ptx::tcgen05_st_32x32b));)); +#endif // __cccl_ptx_isa >= 860 + +#if __cccl_ptx_isa >= 860 + NV_IF_TARGET(NV_HAS_FEATURE_SM_100a, + ( + // tcgen05.st.sync.aligned.32x32b.x128.unpack::16b.b32 [taddr], values; + * fn_ptr++ = reinterpret_cast(static_cast( + cuda::ptx::tcgen05_st_32x32b_unpack_16b));)); + NV_IF_TARGET(NV_HAS_FEATURE_SM_101a, + ( + // tcgen05.st.sync.aligned.32x32b.x128.unpack::16b.b32 [taddr], values; + * fn_ptr++ = reinterpret_cast(static_cast( + cuda::ptx::tcgen05_st_32x32b_unpack_16b));)); 
+#endif // __cccl_ptx_isa >= 860 + +#if __cccl_ptx_isa >= 860 + NV_IF_TARGET( + NV_HAS_FEATURE_SM_100a, + ( + // tcgen05.st.sync.aligned.16x32bx2.x1.b32 [taddr], immHalfSplitoff, values; + * fn_ptr++ = reinterpret_cast( + static_cast, const int32_t(&)[1])>(cuda::ptx::tcgen05_st_16x32bx2));)); + NV_IF_TARGET( + NV_HAS_FEATURE_SM_101a, + ( + // tcgen05.st.sync.aligned.16x32bx2.x1.b32 [taddr], immHalfSplitoff, values; + * fn_ptr++ = reinterpret_cast( + static_cast, const int32_t(&)[1])>(cuda::ptx::tcgen05_st_16x32bx2));)); +#endif // __cccl_ptx_isa >= 860 + +#if __cccl_ptx_isa >= 860 + NV_IF_TARGET( + NV_HAS_FEATURE_SM_100a, + ( + // tcgen05.st.sync.aligned.16x32bx2.x1.unpack::16b.b32 [taddr], immHalfSplitoff, values; + * fn_ptr++ = reinterpret_cast(static_cast, const int32_t(&)[1])>( + cuda::ptx::tcgen05_st_16x32bx2_unpack_16b));)); + NV_IF_TARGET( + NV_HAS_FEATURE_SM_101a, + ( + // tcgen05.st.sync.aligned.16x32bx2.x1.unpack::16b.b32 [taddr], immHalfSplitoff, values; + * fn_ptr++ = reinterpret_cast(static_cast, const int32_t(&)[1])>( + cuda::ptx::tcgen05_st_16x32bx2_unpack_16b));)); +#endif // __cccl_ptx_isa >= 860 + +#if __cccl_ptx_isa >= 860 + NV_IF_TARGET( + NV_HAS_FEATURE_SM_100a, + ( + // tcgen05.st.sync.aligned.16x32bx2.x2.b32 [taddr], immHalfSplitoff, values; + * fn_ptr++ = reinterpret_cast( + static_cast, const int32_t(&)[2])>(cuda::ptx::tcgen05_st_16x32bx2));)); + NV_IF_TARGET( + NV_HAS_FEATURE_SM_101a, + ( + // tcgen05.st.sync.aligned.16x32bx2.x2.b32 [taddr], immHalfSplitoff, values; + * fn_ptr++ = reinterpret_cast( + static_cast, const int32_t(&)[2])>(cuda::ptx::tcgen05_st_16x32bx2));)); +#endif // __cccl_ptx_isa >= 860 + +#if __cccl_ptx_isa >= 860 + NV_IF_TARGET( + NV_HAS_FEATURE_SM_100a, + ( + // tcgen05.st.sync.aligned.16x32bx2.x2.unpack::16b.b32 [taddr], immHalfSplitoff, values; + * fn_ptr++ = reinterpret_cast(static_cast, const int32_t(&)[2])>( + cuda::ptx::tcgen05_st_16x32bx2_unpack_16b));)); + NV_IF_TARGET( + NV_HAS_FEATURE_SM_101a, + ( + // tcgen05.st.sync.aligned.16x32bx2.x2.unpack::16b.b32 [taddr], immHalfSplitoff, values; + * fn_ptr++ = reinterpret_cast(static_cast, const int32_t(&)[2])>( + cuda::ptx::tcgen05_st_16x32bx2_unpack_16b));)); +#endif // __cccl_ptx_isa >= 860 + +#if __cccl_ptx_isa >= 860 + NV_IF_TARGET( + NV_HAS_FEATURE_SM_100a, + ( + // tcgen05.st.sync.aligned.16x32bx2.x4.b32 [taddr], immHalfSplitoff, values; + * fn_ptr++ = reinterpret_cast( + static_cast, const int32_t(&)[4])>(cuda::ptx::tcgen05_st_16x32bx2));)); + NV_IF_TARGET( + NV_HAS_FEATURE_SM_101a, + ( + // tcgen05.st.sync.aligned.16x32bx2.x4.b32 [taddr], immHalfSplitoff, values; + * fn_ptr++ = reinterpret_cast( + static_cast, const int32_t(&)[4])>(cuda::ptx::tcgen05_st_16x32bx2));)); +#endif // __cccl_ptx_isa >= 860 + +#if __cccl_ptx_isa >= 860 + NV_IF_TARGET( + NV_HAS_FEATURE_SM_100a, + ( + // tcgen05.st.sync.aligned.16x32bx2.x4.unpack::16b.b32 [taddr], immHalfSplitoff, values; + * fn_ptr++ = reinterpret_cast(static_cast, const int32_t(&)[4])>( + cuda::ptx::tcgen05_st_16x32bx2_unpack_16b));)); + NV_IF_TARGET( + NV_HAS_FEATURE_SM_101a, + ( + // tcgen05.st.sync.aligned.16x32bx2.x4.unpack::16b.b32 [taddr], immHalfSplitoff, values; + * fn_ptr++ = reinterpret_cast(static_cast, const int32_t(&)[4])>( + cuda::ptx::tcgen05_st_16x32bx2_unpack_16b));)); +#endif // __cccl_ptx_isa >= 860 + +#if __cccl_ptx_isa >= 860 + NV_IF_TARGET( + NV_HAS_FEATURE_SM_100a, + ( + // tcgen05.st.sync.aligned.16x32bx2.x8.b32 [taddr], immHalfSplitoff, values; + * fn_ptr++ = reinterpret_cast( + static_cast, const 
int32_t(&)[8])>(cuda::ptx::tcgen05_st_16x32bx2));)); + NV_IF_TARGET( + NV_HAS_FEATURE_SM_101a, + ( + // tcgen05.st.sync.aligned.16x32bx2.x8.b32 [taddr], immHalfSplitoff, values; + * fn_ptr++ = reinterpret_cast( + static_cast, const int32_t(&)[8])>(cuda::ptx::tcgen05_st_16x32bx2));)); +#endif // __cccl_ptx_isa >= 860 + +#if __cccl_ptx_isa >= 860 + NV_IF_TARGET( + NV_HAS_FEATURE_SM_100a, + ( + // tcgen05.st.sync.aligned.16x32bx2.x8.unpack::16b.b32 [taddr], immHalfSplitoff, values; + * fn_ptr++ = reinterpret_cast(static_cast, const int32_t(&)[8])>( + cuda::ptx::tcgen05_st_16x32bx2_unpack_16b));)); + NV_IF_TARGET( + NV_HAS_FEATURE_SM_101a, + ( + // tcgen05.st.sync.aligned.16x32bx2.x8.unpack::16b.b32 [taddr], immHalfSplitoff, values; + * fn_ptr++ = reinterpret_cast(static_cast, const int32_t(&)[8])>( + cuda::ptx::tcgen05_st_16x32bx2_unpack_16b));)); +#endif // __cccl_ptx_isa >= 860 + +#if __cccl_ptx_isa >= 860 + NV_IF_TARGET( + NV_HAS_FEATURE_SM_100a, + ( + // tcgen05.st.sync.aligned.16x32bx2.x16.b32 [taddr], immHalfSplitoff, values; + * fn_ptr++ = reinterpret_cast(static_cast, const int32_t(&)[16])>( + cuda::ptx::tcgen05_st_16x32bx2));)); + NV_IF_TARGET( + NV_HAS_FEATURE_SM_101a, + ( + // tcgen05.st.sync.aligned.16x32bx2.x16.b32 [taddr], immHalfSplitoff, values; + * fn_ptr++ = reinterpret_cast(static_cast, const int32_t(&)[16])>( + cuda::ptx::tcgen05_st_16x32bx2));)); +#endif // __cccl_ptx_isa >= 860 + +#if __cccl_ptx_isa >= 860 + NV_IF_TARGET( + NV_HAS_FEATURE_SM_100a, + ( + // tcgen05.st.sync.aligned.16x32bx2.x16.unpack::16b.b32 [taddr], immHalfSplitoff, values; + * fn_ptr++ = reinterpret_cast(static_cast, const int32_t(&)[16])>( + cuda::ptx::tcgen05_st_16x32bx2_unpack_16b));)); + NV_IF_TARGET( + NV_HAS_FEATURE_SM_101a, + ( + // tcgen05.st.sync.aligned.16x32bx2.x16.unpack::16b.b32 [taddr], immHalfSplitoff, values; + * fn_ptr++ = reinterpret_cast(static_cast, const int32_t(&)[16])>( + cuda::ptx::tcgen05_st_16x32bx2_unpack_16b));)); +#endif // __cccl_ptx_isa >= 860 + +#if __cccl_ptx_isa >= 860 + NV_IF_TARGET( + NV_HAS_FEATURE_SM_100a, + ( + // tcgen05.st.sync.aligned.16x32bx2.x32.b32 [taddr], immHalfSplitoff, values; + * fn_ptr++ = reinterpret_cast(static_cast, const int32_t(&)[32])>( + cuda::ptx::tcgen05_st_16x32bx2));)); + NV_IF_TARGET( + NV_HAS_FEATURE_SM_101a, + ( + // tcgen05.st.sync.aligned.16x32bx2.x32.b32 [taddr], immHalfSplitoff, values; + * fn_ptr++ = reinterpret_cast(static_cast, const int32_t(&)[32])>( + cuda::ptx::tcgen05_st_16x32bx2));)); +#endif // __cccl_ptx_isa >= 860 + +#if __cccl_ptx_isa >= 860 + NV_IF_TARGET( + NV_HAS_FEATURE_SM_100a, + ( + // tcgen05.st.sync.aligned.16x32bx2.x32.unpack::16b.b32 [taddr], immHalfSplitoff, values; + * fn_ptr++ = reinterpret_cast(static_cast, const int32_t(&)[32])>( + cuda::ptx::tcgen05_st_16x32bx2_unpack_16b));)); + NV_IF_TARGET( + NV_HAS_FEATURE_SM_101a, + ( + // tcgen05.st.sync.aligned.16x32bx2.x32.unpack::16b.b32 [taddr], immHalfSplitoff, values; + * fn_ptr++ = reinterpret_cast(static_cast, const int32_t(&)[32])>( + cuda::ptx::tcgen05_st_16x32bx2_unpack_16b));)); +#endif // __cccl_ptx_isa >= 860 + +#if __cccl_ptx_isa >= 860 + NV_IF_TARGET( + NV_HAS_FEATURE_SM_100a, + ( + // tcgen05.st.sync.aligned.16x32bx2.x64.b32 [taddr], immHalfSplitoff, values; + * fn_ptr++ = reinterpret_cast(static_cast, const int32_t(&)[64])>( + cuda::ptx::tcgen05_st_16x32bx2));)); + NV_IF_TARGET( + NV_HAS_FEATURE_SM_101a, + ( + // tcgen05.st.sync.aligned.16x32bx2.x64.b32 [taddr], immHalfSplitoff, values; + * fn_ptr++ = reinterpret_cast(static_cast, const 
int32_t(&)[64])>( + cuda::ptx::tcgen05_st_16x32bx2));)); +#endif // __cccl_ptx_isa >= 860 + +#if __cccl_ptx_isa >= 860 + NV_IF_TARGET( + NV_HAS_FEATURE_SM_100a, + ( + // tcgen05.st.sync.aligned.16x32bx2.x64.unpack::16b.b32 [taddr], immHalfSplitoff, values; + * fn_ptr++ = reinterpret_cast(static_cast, const int32_t(&)[64])>( + cuda::ptx::tcgen05_st_16x32bx2_unpack_16b));)); + NV_IF_TARGET( + NV_HAS_FEATURE_SM_101a, + ( + // tcgen05.st.sync.aligned.16x32bx2.x64.unpack::16b.b32 [taddr], immHalfSplitoff, values; + * fn_ptr++ = reinterpret_cast(static_cast, const int32_t(&)[64])>( + cuda::ptx::tcgen05_st_16x32bx2_unpack_16b));)); +#endif // __cccl_ptx_isa >= 860 + +#if __cccl_ptx_isa >= 860 + NV_IF_TARGET( + NV_HAS_FEATURE_SM_100a, + ( + // tcgen05.st.sync.aligned.16x32bx2.x128.b32 [taddr], immHalfSplitoff, values; + * fn_ptr++ = reinterpret_cast(static_cast, const int32_t(&)[128])>( + cuda::ptx::tcgen05_st_16x32bx2));)); + NV_IF_TARGET( + NV_HAS_FEATURE_SM_101a, + ( + // tcgen05.st.sync.aligned.16x32bx2.x128.b32 [taddr], immHalfSplitoff, values; + * fn_ptr++ = reinterpret_cast(static_cast, const int32_t(&)[128])>( + cuda::ptx::tcgen05_st_16x32bx2));)); +#endif // __cccl_ptx_isa >= 860 + +#if __cccl_ptx_isa >= 860 + NV_IF_TARGET( + NV_HAS_FEATURE_SM_100a, + ( + // tcgen05.st.sync.aligned.16x32bx2.x128.unpack::16b.b32 [taddr], immHalfSplitoff, values; + * fn_ptr++ = reinterpret_cast(static_cast, const int32_t(&)[128])>( + cuda::ptx::tcgen05_st_16x32bx2_unpack_16b));)); + NV_IF_TARGET( + NV_HAS_FEATURE_SM_101a, + ( + // tcgen05.st.sync.aligned.16x32bx2.x128.unpack::16b.b32 [taddr], immHalfSplitoff, values; + * fn_ptr++ = reinterpret_cast(static_cast, const int32_t(&)[128])>( + cuda::ptx::tcgen05_st_16x32bx2_unpack_16b));)); +#endif // __cccl_ptx_isa >= 860 +} diff --git a/libcudacxx/test/libcudacxx/cuda/ptx/generated/tcgen05_wait.h b/libcudacxx/test/libcudacxx/cuda/ptx/generated/tcgen05_wait.h new file mode 100644 index 00000000000..424d884049c --- /dev/null +++ b/libcudacxx/test/libcudacxx/cuda/ptx/generated/tcgen05_wait.h @@ -0,0 +1,40 @@ +// This file was automatically generated. Do not edit. + +// We use a special strategy to force the generation of the PTX. This is mainly +// a fight against dead-code-elimination in the NVVM layer. +// +// The reason we need this strategy is because certain older versions of ptxas +// segfault when a non-sensical sequence of PTX is generated. So instead, we try +// to force the instantiation and compilation to PTX of all the overloads of the +// PTX wrapping functions. +// +// We do this by writing a function pointer of each overload to the kernel +// parameter `fn_ptr`. +// +// Because `fn_ptr` is possibly visible outside this translation unit, the +// compiler must compile all the functions which are stored. 
+ +__global__ void test_tcgen05_wait(void** fn_ptr) +{ +#if __cccl_ptx_isa >= 860 + NV_IF_TARGET(NV_HAS_FEATURE_SM_100a, + ( + // tcgen05.wait::ld.sync.aligned; + * fn_ptr++ = reinterpret_cast(static_cast(cuda::ptx::tcgen05_wait_ld));)); + NV_IF_TARGET(NV_HAS_FEATURE_SM_101a, + ( + // tcgen05.wait::ld.sync.aligned; + * fn_ptr++ = reinterpret_cast(static_cast(cuda::ptx::tcgen05_wait_ld));)); +#endif // __cccl_ptx_isa >= 860 + +#if __cccl_ptx_isa >= 860 + NV_IF_TARGET(NV_HAS_FEATURE_SM_100a, + ( + // tcgen05.wait::st.sync.aligned; + * fn_ptr++ = reinterpret_cast(static_cast(cuda::ptx::tcgen05_wait_st));)); + NV_IF_TARGET(NV_HAS_FEATURE_SM_101a, + ( + // tcgen05.wait::st.sync.aligned; + * fn_ptr++ = reinterpret_cast(static_cast(cuda::ptx::tcgen05_wait_st));)); +#endif // __cccl_ptx_isa >= 860 +} diff --git a/libcudacxx/test/libcudacxx/cuda/ptx/generated/tensormap_replace.h b/libcudacxx/test/libcudacxx/cuda/ptx/generated/tensormap_replace.h index 95446eb81fa..1439bc84bd0 100644 --- a/libcudacxx/test/libcudacxx/cuda/ptx/generated/tensormap_replace.h +++ b/libcudacxx/test/libcudacxx/cuda/ptx/generated/tensormap_replace.h @@ -20,7 +20,19 @@ __global__ void test_tensormap_replace(void** fn_ptr) NV_IF_TARGET( NV_HAS_FEATURE_SM_90a, ( - // tensormap.replace.tile.global_address.global.b1024.b64 [tm_addr], new_val; + // tensormap.replace.tile.global_address.global.b1024.b64 [tm_addr], new_val; + * fn_ptr++ = reinterpret_cast(static_cast( + cuda::ptx::tensormap_replace_global_address));)); + NV_IF_TARGET( + NV_HAS_FEATURE_SM_100a, + ( + // tensormap.replace.tile.global_address.global.b1024.b64 [tm_addr], new_val; + * fn_ptr++ = reinterpret_cast(static_cast( + cuda::ptx::tensormap_replace_global_address));)); + NV_IF_TARGET( + NV_HAS_FEATURE_SM_101a, + ( + // tensormap.replace.tile.global_address.global.b1024.b64 [tm_addr], new_val; * fn_ptr++ = reinterpret_cast(static_cast( cuda::ptx::tensormap_replace_global_address));)); #endif // __cccl_ptx_isa >= 830 @@ -29,7 +41,19 @@ __global__ void test_tensormap_replace(void** fn_ptr) NV_IF_TARGET( NV_HAS_FEATURE_SM_90a, ( - // tensormap.replace.tile.global_address.shared::cta.b1024.b64 [tm_addr], new_val; + // tensormap.replace.tile.global_address.shared::cta.b1024.b64 [tm_addr], new_val; + * fn_ptr++ = reinterpret_cast(static_cast( + cuda::ptx::tensormap_replace_global_address));)); + NV_IF_TARGET( + NV_HAS_FEATURE_SM_100a, + ( + // tensormap.replace.tile.global_address.shared::cta.b1024.b64 [tm_addr], new_val; + * fn_ptr++ = reinterpret_cast(static_cast( + cuda::ptx::tensormap_replace_global_address));)); + NV_IF_TARGET( + NV_HAS_FEATURE_SM_101a, + ( + // tensormap.replace.tile.global_address.shared::cta.b1024.b64 [tm_addr], new_val; * fn_ptr++ = reinterpret_cast(static_cast( cuda::ptx::tensormap_replace_global_address));)); #endif // __cccl_ptx_isa >= 830 @@ -38,7 +62,19 @@ __global__ void test_tensormap_replace(void** fn_ptr) NV_IF_TARGET( NV_HAS_FEATURE_SM_90a, ( - // tensormap.replace.tile.rank.global.b1024.b32 [tm_addr], new_val; + // tensormap.replace.tile.rank.global.b1024.b32 [tm_addr], new_val; + * fn_ptr++ = reinterpret_cast( + static_cast(cuda::ptx::tensormap_replace_rank));)); + NV_IF_TARGET( + NV_HAS_FEATURE_SM_100a, + ( + // tensormap.replace.tile.rank.global.b1024.b32 [tm_addr], new_val; + * fn_ptr++ = reinterpret_cast( + static_cast(cuda::ptx::tensormap_replace_rank));)); + NV_IF_TARGET( + NV_HAS_FEATURE_SM_101a, + ( + // tensormap.replace.tile.rank.global.b1024.b32 [tm_addr], new_val; * fn_ptr++ = reinterpret_cast( 
static_cast(cuda::ptx::tensormap_replace_rank));)); #endif // __cccl_ptx_isa >= 830 @@ -47,7 +83,19 @@ __global__ void test_tensormap_replace(void** fn_ptr) NV_IF_TARGET( NV_HAS_FEATURE_SM_90a, ( - // tensormap.replace.tile.rank.shared::cta.b1024.b32 [tm_addr], new_val; + // tensormap.replace.tile.rank.shared::cta.b1024.b32 [tm_addr], new_val; + * fn_ptr++ = reinterpret_cast( + static_cast(cuda::ptx::tensormap_replace_rank));)); + NV_IF_TARGET( + NV_HAS_FEATURE_SM_100a, + ( + // tensormap.replace.tile.rank.shared::cta.b1024.b32 [tm_addr], new_val; + * fn_ptr++ = reinterpret_cast( + static_cast(cuda::ptx::tensormap_replace_rank));)); + NV_IF_TARGET( + NV_HAS_FEATURE_SM_101a, + ( + // tensormap.replace.tile.rank.shared::cta.b1024.b32 [tm_addr], new_val; * fn_ptr++ = reinterpret_cast( static_cast(cuda::ptx::tensormap_replace_rank));)); #endif // __cccl_ptx_isa >= 830 @@ -56,7 +104,21 @@ __global__ void test_tensormap_replace(void** fn_ptr) NV_IF_TARGET( NV_HAS_FEATURE_SM_90a, ( - // tensormap.replace.tile.box_dim.global.b1024.b32 [tm_addr], ord, new_val; + // tensormap.replace.tile.box_dim.global.b1024.b32 [tm_addr], ord, new_val; + * fn_ptr++ = + reinterpret_cast(static_cast, int32_t)>( + cuda::ptx::tensormap_replace_box_dim));)); + NV_IF_TARGET( + NV_HAS_FEATURE_SM_100a, + ( + // tensormap.replace.tile.box_dim.global.b1024.b32 [tm_addr], ord, new_val; + * fn_ptr++ = + reinterpret_cast(static_cast, int32_t)>( + cuda::ptx::tensormap_replace_box_dim));)); + NV_IF_TARGET( + NV_HAS_FEATURE_SM_101a, + ( + // tensormap.replace.tile.box_dim.global.b1024.b32 [tm_addr], ord, new_val; * fn_ptr++ = reinterpret_cast(static_cast, int32_t)>( cuda::ptx::tensormap_replace_box_dim));)); @@ -66,7 +128,21 @@ __global__ void test_tensormap_replace(void** fn_ptr) NV_IF_TARGET( NV_HAS_FEATURE_SM_90a, ( - // tensormap.replace.tile.box_dim.shared::cta.b1024.b32 [tm_addr], ord, new_val; + // tensormap.replace.tile.box_dim.shared::cta.b1024.b32 [tm_addr], ord, new_val; + * fn_ptr++ = + reinterpret_cast(static_cast, int32_t)>( + cuda::ptx::tensormap_replace_box_dim));)); + NV_IF_TARGET( + NV_HAS_FEATURE_SM_100a, + ( + // tensormap.replace.tile.box_dim.shared::cta.b1024.b32 [tm_addr], ord, new_val; + * fn_ptr++ = + reinterpret_cast(static_cast, int32_t)>( + cuda::ptx::tensormap_replace_box_dim));)); + NV_IF_TARGET( + NV_HAS_FEATURE_SM_101a, + ( + // tensormap.replace.tile.box_dim.shared::cta.b1024.b32 [tm_addr], ord, new_val; * fn_ptr++ = reinterpret_cast(static_cast, int32_t)>( cuda::ptx::tensormap_replace_box_dim));)); @@ -76,7 +152,21 @@ __global__ void test_tensormap_replace(void** fn_ptr) NV_IF_TARGET( NV_HAS_FEATURE_SM_90a, ( - // tensormap.replace.tile.global_dim.global.b1024.b32 [tm_addr], ord, new_val; + // tensormap.replace.tile.global_dim.global.b1024.b32 [tm_addr], ord, new_val; + * fn_ptr++ = + reinterpret_cast(static_cast, int32_t)>( + cuda::ptx::tensormap_replace_global_dim));)); + NV_IF_TARGET( + NV_HAS_FEATURE_SM_100a, + ( + // tensormap.replace.tile.global_dim.global.b1024.b32 [tm_addr], ord, new_val; + * fn_ptr++ = + reinterpret_cast(static_cast, int32_t)>( + cuda::ptx::tensormap_replace_global_dim));)); + NV_IF_TARGET( + NV_HAS_FEATURE_SM_101a, + ( + // tensormap.replace.tile.global_dim.global.b1024.b32 [tm_addr], ord, new_val; * fn_ptr++ = reinterpret_cast(static_cast, int32_t)>( cuda::ptx::tensormap_replace_global_dim));)); @@ -86,7 +176,21 @@ __global__ void test_tensormap_replace(void** fn_ptr) NV_IF_TARGET( NV_HAS_FEATURE_SM_90a, ( - // tensormap.replace.tile.global_dim.shared::cta.b1024.b32 
[tm_addr], ord, new_val; + // tensormap.replace.tile.global_dim.shared::cta.b1024.b32 [tm_addr], ord, new_val; + * fn_ptr++ = + reinterpret_cast(static_cast, int32_t)>( + cuda::ptx::tensormap_replace_global_dim));)); + NV_IF_TARGET( + NV_HAS_FEATURE_SM_100a, + ( + // tensormap.replace.tile.global_dim.shared::cta.b1024.b32 [tm_addr], ord, new_val; + * fn_ptr++ = + reinterpret_cast(static_cast, int32_t)>( + cuda::ptx::tensormap_replace_global_dim));)); + NV_IF_TARGET( + NV_HAS_FEATURE_SM_101a, + ( + // tensormap.replace.tile.global_dim.shared::cta.b1024.b32 [tm_addr], ord, new_val; * fn_ptr++ = reinterpret_cast(static_cast, int32_t)>( cuda::ptx::tensormap_replace_global_dim));)); @@ -96,7 +200,21 @@ __global__ void test_tensormap_replace(void** fn_ptr) NV_IF_TARGET( NV_HAS_FEATURE_SM_90a, ( - // tensormap.replace.tile.global_stride.global.b1024.b64 [tm_addr], ord, new_val; + // tensormap.replace.tile.global_stride.global.b1024.b64 [tm_addr], ord, new_val; + * fn_ptr++ = + reinterpret_cast(static_cast, int64_t)>( + cuda::ptx::tensormap_replace_global_stride));)); + NV_IF_TARGET( + NV_HAS_FEATURE_SM_100a, + ( + // tensormap.replace.tile.global_stride.global.b1024.b64 [tm_addr], ord, new_val; + * fn_ptr++ = + reinterpret_cast(static_cast, int64_t)>( + cuda::ptx::tensormap_replace_global_stride));)); + NV_IF_TARGET( + NV_HAS_FEATURE_SM_101a, + ( + // tensormap.replace.tile.global_stride.global.b1024.b64 [tm_addr], ord, new_val; * fn_ptr++ = reinterpret_cast(static_cast, int64_t)>( cuda::ptx::tensormap_replace_global_stride));)); @@ -106,17 +224,93 @@ __global__ void test_tensormap_replace(void** fn_ptr) NV_IF_TARGET( NV_HAS_FEATURE_SM_90a, ( - // tensormap.replace.tile.global_stride.shared::cta.b1024.b64 [tm_addr], ord, new_val; + // tensormap.replace.tile.global_stride.shared::cta.b1024.b64 [tm_addr], ord, new_val; * fn_ptr++ = reinterpret_cast(static_cast, int64_t)>( cuda::ptx::tensormap_replace_global_stride));)); + NV_IF_TARGET( + NV_HAS_FEATURE_SM_100a, + ( + // tensormap.replace.tile.global_stride.shared::cta.b1024.b64 [tm_addr], ord, new_val; + * fn_ptr++ = + reinterpret_cast(static_cast, int64_t)>( + cuda::ptx::tensormap_replace_global_stride));)); + NV_IF_TARGET( + NV_HAS_FEATURE_SM_101a, + ( + // tensormap.replace.tile.global_stride.shared::cta.b1024.b64 [tm_addr], ord, new_val; + * fn_ptr++ = + reinterpret_cast(static_cast, int64_t)>( + cuda::ptx::tensormap_replace_global_stride));)); +#endif // __cccl_ptx_isa >= 830 + +#if __cccl_ptx_isa >= 830 + NV_IF_TARGET( + NV_HAS_FEATURE_SM_90a, + ( + // tensormap.replace.tile.element_stride.global.b1024.b32 [tm_addr], ord, new_val; + * fn_ptr++ = + reinterpret_cast(static_cast, int32_t)>( + cuda::ptx::tensormap_replace_element_stride));)); + NV_IF_TARGET( + NV_HAS_FEATURE_SM_100a, + ( + // tensormap.replace.tile.element_stride.global.b1024.b32 [tm_addr], ord, new_val; + * fn_ptr++ = + reinterpret_cast(static_cast, int32_t)>( + cuda::ptx::tensormap_replace_element_stride));)); + NV_IF_TARGET( + NV_HAS_FEATURE_SM_101a, + ( + // tensormap.replace.tile.element_stride.global.b1024.b32 [tm_addr], ord, new_val; + * fn_ptr++ = + reinterpret_cast(static_cast, int32_t)>( + cuda::ptx::tensormap_replace_element_stride));)); +#endif // __cccl_ptx_isa >= 830 + +#if __cccl_ptx_isa >= 830 + NV_IF_TARGET( + NV_HAS_FEATURE_SM_90a, + ( + // tensormap.replace.tile.element_stride.shared::cta.b1024.b32 [tm_addr], ord, new_val; + * fn_ptr++ = + reinterpret_cast(static_cast, int32_t)>( + cuda::ptx::tensormap_replace_element_stride));)); + NV_IF_TARGET( + 
NV_HAS_FEATURE_SM_100a, + ( + // tensormap.replace.tile.element_stride.shared::cta.b1024.b32 [tm_addr], ord, new_val; + * fn_ptr++ = + reinterpret_cast(static_cast, int32_t)>( + cuda::ptx::tensormap_replace_element_stride));)); + NV_IF_TARGET( + NV_HAS_FEATURE_SM_101a, + ( + // tensormap.replace.tile.element_stride.shared::cta.b1024.b32 [tm_addr], ord, new_val; + * fn_ptr++ = + reinterpret_cast(static_cast, int32_t)>( + cuda::ptx::tensormap_replace_element_stride));)); #endif // __cccl_ptx_isa >= 830 #if __cccl_ptx_isa >= 830 NV_IF_TARGET( NV_HAS_FEATURE_SM_90a, ( - // tensormap.replace.tile.element_stride.global.b1024.b32 [tm_addr], ord, new_val; + // tensormap.replace.tile.element_stride.global.b1024.b32 [tm_addr], ord, new_val; + * fn_ptr++ = + reinterpret_cast(static_cast, int32_t)>( + cuda::ptx::tensormap_replace_element_size));)); + NV_IF_TARGET( + NV_HAS_FEATURE_SM_100a, + ( + // tensormap.replace.tile.element_stride.global.b1024.b32 [tm_addr], ord, new_val; + * fn_ptr++ = + reinterpret_cast(static_cast, int32_t)>( + cuda::ptx::tensormap_replace_element_size));)); + NV_IF_TARGET( + NV_HAS_FEATURE_SM_101a, + ( + // tensormap.replace.tile.element_stride.global.b1024.b32 [tm_addr], ord, new_val; * fn_ptr++ = reinterpret_cast(static_cast, int32_t)>( cuda::ptx::tensormap_replace_element_size));)); @@ -126,7 +320,21 @@ __global__ void test_tensormap_replace(void** fn_ptr) NV_IF_TARGET( NV_HAS_FEATURE_SM_90a, ( - // tensormap.replace.tile.element_stride.shared::cta.b1024.b32 [tm_addr], ord, new_val; + // tensormap.replace.tile.element_stride.shared::cta.b1024.b32 [tm_addr], ord, new_val; + * fn_ptr++ = + reinterpret_cast(static_cast, int32_t)>( + cuda::ptx::tensormap_replace_element_size));)); + NV_IF_TARGET( + NV_HAS_FEATURE_SM_100a, + ( + // tensormap.replace.tile.element_stride.shared::cta.b1024.b32 [tm_addr], ord, new_val; + * fn_ptr++ = + reinterpret_cast(static_cast, int32_t)>( + cuda::ptx::tensormap_replace_element_size));)); + NV_IF_TARGET( + NV_HAS_FEATURE_SM_101a, + ( + // tensormap.replace.tile.element_stride.shared::cta.b1024.b32 [tm_addr], ord, new_val; * fn_ptr++ = reinterpret_cast(static_cast, int32_t)>( cuda::ptx::tensormap_replace_element_size));)); @@ -136,7 +344,21 @@ __global__ void test_tensormap_replace(void** fn_ptr) NV_IF_TARGET( NV_HAS_FEATURE_SM_90a, ( - // tensormap.replace.tile.elemtype.global.b1024.b32 [tm_addr], new_val; + // tensormap.replace.tile.elemtype.global.b1024.b32 [tm_addr], new_val; + * fn_ptr++ = + reinterpret_cast(static_cast)>( + cuda::ptx::tensormap_replace_elemtype));)); + NV_IF_TARGET( + NV_HAS_FEATURE_SM_100a, + ( + // tensormap.replace.tile.elemtype.global.b1024.b32 [tm_addr], new_val; + * fn_ptr++ = + reinterpret_cast(static_cast)>( + cuda::ptx::tensormap_replace_elemtype));)); + NV_IF_TARGET( + NV_HAS_FEATURE_SM_101a, + ( + // tensormap.replace.tile.elemtype.global.b1024.b32 [tm_addr], new_val; * fn_ptr++ = reinterpret_cast(static_cast)>( cuda::ptx::tensormap_replace_elemtype));)); @@ -146,7 +368,21 @@ __global__ void test_tensormap_replace(void** fn_ptr) NV_IF_TARGET( NV_HAS_FEATURE_SM_90a, ( - // tensormap.replace.tile.elemtype.shared::cta.b1024.b32 [tm_addr], new_val; + // tensormap.replace.tile.elemtype.shared::cta.b1024.b32 [tm_addr], new_val; + * fn_ptr++ = + reinterpret_cast(static_cast)>( + cuda::ptx::tensormap_replace_elemtype));)); + NV_IF_TARGET( + NV_HAS_FEATURE_SM_100a, + ( + // tensormap.replace.tile.elemtype.shared::cta.b1024.b32 [tm_addr], new_val; + * fn_ptr++ = + reinterpret_cast(static_cast)>( + 
cuda::ptx::tensormap_replace_elemtype));)); + NV_IF_TARGET( + NV_HAS_FEATURE_SM_101a, + ( + // tensormap.replace.tile.elemtype.shared::cta.b1024.b32 [tm_addr], new_val; * fn_ptr++ = reinterpret_cast(static_cast)>( cuda::ptx::tensormap_replace_elemtype));)); @@ -160,6 +396,20 @@ __global__ void test_tensormap_replace(void** fn_ptr) * fn_ptr++ = reinterpret_cast(static_cast)>( cuda::ptx::tensormap_replace_interleave_layout));)); + NV_IF_TARGET( + NV_HAS_FEATURE_SM_100a, + ( + // tensormap.replace.tile.interleave_layout.global.b1024.b32 [tm_addr], new_val; + * fn_ptr++ = + reinterpret_cast(static_cast)>( + cuda::ptx::tensormap_replace_interleave_layout));)); + NV_IF_TARGET( + NV_HAS_FEATURE_SM_101a, + ( + // tensormap.replace.tile.interleave_layout.global.b1024.b32 [tm_addr], new_val; + * fn_ptr++ = + reinterpret_cast(static_cast)>( + cuda::ptx::tensormap_replace_interleave_layout));)); #endif // __cccl_ptx_isa >= 830 #if __cccl_ptx_isa >= 830 @@ -170,13 +420,41 @@ __global__ void test_tensormap_replace(void** fn_ptr) * fn_ptr++ = reinterpret_cast(static_cast)>( cuda::ptx::tensormap_replace_interleave_layout));)); + NV_IF_TARGET( + NV_HAS_FEATURE_SM_100a, + ( + // tensormap.replace.tile.interleave_layout.shared::cta.b1024.b32 [tm_addr], new_val; + * fn_ptr++ = + reinterpret_cast(static_cast)>( + cuda::ptx::tensormap_replace_interleave_layout));)); + NV_IF_TARGET( + NV_HAS_FEATURE_SM_101a, + ( + // tensormap.replace.tile.interleave_layout.shared::cta.b1024.b32 [tm_addr], new_val; + * fn_ptr++ = + reinterpret_cast(static_cast)>( + cuda::ptx::tensormap_replace_interleave_layout));)); #endif // __cccl_ptx_isa >= 830 #if __cccl_ptx_isa >= 830 NV_IF_TARGET( NV_HAS_FEATURE_SM_90a, ( - // tensormap.replace.tile.swizzle_mode.global.b1024.b32 [tm_addr], new_val; + // tensormap.replace.tile.swizzle_mode.global.b1024.b32 [tm_addr], new_val; + * fn_ptr++ = + reinterpret_cast(static_cast)>( + cuda::ptx::tensormap_replace_swizzle_mode));)); + NV_IF_TARGET( + NV_HAS_FEATURE_SM_100a, + ( + // tensormap.replace.tile.swizzle_mode.global.b1024.b32 [tm_addr], new_val; + * fn_ptr++ = + reinterpret_cast(static_cast)>( + cuda::ptx::tensormap_replace_swizzle_mode));)); + NV_IF_TARGET( + NV_HAS_FEATURE_SM_101a, + ( + // tensormap.replace.tile.swizzle_mode.global.b1024.b32 [tm_addr], new_val; * fn_ptr++ = reinterpret_cast(static_cast)>( cuda::ptx::tensormap_replace_swizzle_mode));)); @@ -186,7 +464,21 @@ __global__ void test_tensormap_replace(void** fn_ptr) NV_IF_TARGET( NV_HAS_FEATURE_SM_90a, ( - // tensormap.replace.tile.swizzle_mode.shared::cta.b1024.b32 [tm_addr], new_val; + // tensormap.replace.tile.swizzle_mode.shared::cta.b1024.b32 [tm_addr], new_val; + * fn_ptr++ = + reinterpret_cast(static_cast)>( + cuda::ptx::tensormap_replace_swizzle_mode));)); + NV_IF_TARGET( + NV_HAS_FEATURE_SM_100a, + ( + // tensormap.replace.tile.swizzle_mode.shared::cta.b1024.b32 [tm_addr], new_val; + * fn_ptr++ = + reinterpret_cast(static_cast)>( + cuda::ptx::tensormap_replace_swizzle_mode));)); + NV_IF_TARGET( + NV_HAS_FEATURE_SM_101a, + ( + // tensormap.replace.tile.swizzle_mode.shared::cta.b1024.b32 [tm_addr], new_val; * fn_ptr++ = reinterpret_cast(static_cast)>( cuda::ptx::tensormap_replace_swizzle_mode));)); @@ -196,7 +488,21 @@ __global__ void test_tensormap_replace(void** fn_ptr) NV_IF_TARGET( NV_HAS_FEATURE_SM_90a, ( - // tensormap.replace.tile.fill_mode.global.b1024.b32 [tm_addr], new_val; + // tensormap.replace.tile.fill_mode.global.b1024.b32 [tm_addr], new_val; + * fn_ptr++ = + reinterpret_cast(static_cast)>( + 
cuda::ptx::tensormap_replace_fill_mode));)); + NV_IF_TARGET( + NV_HAS_FEATURE_SM_100a, + ( + // tensormap.replace.tile.fill_mode.global.b1024.b32 [tm_addr], new_val; + * fn_ptr++ = + reinterpret_cast(static_cast)>( + cuda::ptx::tensormap_replace_fill_mode));)); + NV_IF_TARGET( + NV_HAS_FEATURE_SM_101a, + ( + // tensormap.replace.tile.fill_mode.global.b1024.b32 [tm_addr], new_val; * fn_ptr++ = reinterpret_cast(static_cast)>( cuda::ptx::tensormap_replace_fill_mode));)); @@ -206,9 +512,57 @@ __global__ void test_tensormap_replace(void** fn_ptr) NV_IF_TARGET( NV_HAS_FEATURE_SM_90a, ( - // tensormap.replace.tile.fill_mode.shared::cta.b1024.b32 [tm_addr], new_val; + // tensormap.replace.tile.fill_mode.shared::cta.b1024.b32 [tm_addr], new_val; + * fn_ptr++ = + reinterpret_cast(static_cast)>( + cuda::ptx::tensormap_replace_fill_mode));)); + NV_IF_TARGET( + NV_HAS_FEATURE_SM_100a, + ( + // tensormap.replace.tile.fill_mode.shared::cta.b1024.b32 [tm_addr], new_val; + * fn_ptr++ = + reinterpret_cast(static_cast)>( + cuda::ptx::tensormap_replace_fill_mode));)); + NV_IF_TARGET( + NV_HAS_FEATURE_SM_101a, + ( + // tensormap.replace.tile.fill_mode.shared::cta.b1024.b32 [tm_addr], new_val; * fn_ptr++ = reinterpret_cast(static_cast)>( cuda::ptx::tensormap_replace_fill_mode));)); #endif // __cccl_ptx_isa >= 830 + +#if __cccl_ptx_isa >= 860 + NV_IF_TARGET( + NV_HAS_FEATURE_SM_100a, + ( + // tensormap.replace.tile.swizzle_atomicity.global.b1024.b32 [tm_addr], new_val; + * fn_ptr++ = + reinterpret_cast(static_cast)>( + cuda::ptx::tensormap_replace_swizzle_atomicity));)); + NV_IF_TARGET( + NV_HAS_FEATURE_SM_101a, + ( + // tensormap.replace.tile.swizzle_atomicity.global.b1024.b32 [tm_addr], new_val; + * fn_ptr++ = + reinterpret_cast(static_cast)>( + cuda::ptx::tensormap_replace_swizzle_atomicity));)); +#endif // __cccl_ptx_isa >= 860 + +#if __cccl_ptx_isa >= 860 + NV_IF_TARGET( + NV_HAS_FEATURE_SM_100a, + ( + // tensormap.replace.tile.swizzle_atomicity.shared::cta.b1024.b32 [tm_addr], new_val; + * fn_ptr++ = + reinterpret_cast(static_cast)>( + cuda::ptx::tensormap_replace_swizzle_atomicity));)); + NV_IF_TARGET( + NV_HAS_FEATURE_SM_101a, + ( + // tensormap.replace.tile.swizzle_atomicity.shared::cta.b1024.b32 [tm_addr], new_val; + * fn_ptr++ = + reinterpret_cast(static_cast)>( + cuda::ptx::tensormap_replace_swizzle_atomicity));)); +#endif // __cccl_ptx_isa >= 860 } diff --git a/libcudacxx/test/libcudacxx/cuda/ptx/nvrtc_workaround.h b/libcudacxx/test/libcudacxx/cuda/ptx/nvrtc_workaround.h new file mode 100644 index 00000000000..fef34f25ef4 --- /dev/null +++ b/libcudacxx/test/libcudacxx/cuda/ptx/nvrtc_workaround.h @@ -0,0 +1,34 @@ +//===----------------------------------------------------------------------===// +// +// Part of libcu++, the C++ Standard Library for your entire system, +// under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. +// +//===----------------------------------------------------------------------===// + +// NVRTC ships a built-in copy of , so including CCCL's version of this header will omit the +// content since the header guards are already defined. To make older NVRTC versions have a few newer feature macros +// required for the PTX tests, we define them here outside the header guards. 
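// ---------------------------------------------------------------------------
// Editor's sketch (not part of this patch): the feature macros defined below are
// the ones normally provided by <nv/target>. Once they are visible, a test
// translation unit can dispatch on the new Blackwell feature targets and capture
// an instantiation of a cuda::ptx wrapper the same way the generated test
// headers in this patch do. The kernel name and the wrapper signature written
// out in the static_cast are assumptions made purely for illustration.
#include <cuda/ptx>
#include <nv/target>

__global__ void sketch_capture_kernel(void** fn_ptr)
{
  NV_IF_TARGET(NV_HAS_FEATURE_SM_100a,
               (
                 // Taking the address of one instantiation forces the wrapper to
                 // compile for this target without executing any PTX in the test.
                 * fn_ptr++ = reinterpret_cast<void*>(
                   static_cast<void (*)(cuda::ptx::space_global_t, void*, cuda::ptx::n32_t<8>)>(
                     cuda::ptx::tensormap_replace_swizzle_mode));));
}
// ---------------------------------------------------------------------------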
+// TODO(bgruber): limit this workaround to NVRTC versions older than the first one shipping those macros +#ifdef __CUDACC_RTC__ +# ifndef NV_HAS_FEATURE_SM_100a +# define NV_HAS_FEATURE_SM_100a __NV_HAS_FEATURE_SM_100a +# if (defined(__CUDA_ARCH__) && (__CUDA_ARCH__ >= 1000) && defined(__CUDA_ARCH_FEAT_SM100_ALL)) +# define _NV_TARGET_BOOL___NV_HAS_FEATURE_SM_100a 1 +# else +# define _NV_TARGET_BOOL___NV_HAS_FEATURE_SM_100a 0 +# endif +# endif // NV_HAS_FEATURE_SM_100a + +// Re-enable sm_101a support in nvcc. +# ifndef NV_HAS_FEATURE_SM_101a +# define NV_HAS_FEATURE_SM_101a __NV_HAS_FEATURE_SM_101a +# if (defined(__CUDA_ARCH__) && (__CUDA_ARCH__ >= 1010) && defined(__CUDA_ARCH_FEAT_SM101_ALL)) +# define _NV_TARGET_BOOL___NV_HAS_FEATURE_SM_101a 1 +# else +# define _NV_TARGET_BOOL___NV_HAS_FEATURE_SM_101a 0 +# endif +# endif // NV_HAS_FEATURE_SM_101a +#endif // __CUDACC_RTC__ diff --git a/libcudacxx/test/libcudacxx/cuda/ptx/ptx.barrier.cluster.compile.pass.cpp b/libcudacxx/test/libcudacxx/cuda/ptx/ptx.barrier.cluster.compile.pass.cpp index 33d08621ef4..003d8f97017 100644 --- a/libcudacxx/test/libcudacxx/cuda/ptx/ptx.barrier.cluster.compile.pass.cpp +++ b/libcudacxx/test/libcudacxx/cuda/ptx/ptx.barrier.cluster.compile.pass.cpp @@ -14,6 +14,8 @@ #include #include +#include "nvrtc_workaround.h" +// above header needs to be included before the generated test header #include "generated/barrier_cluster.h" int main(int, char**) diff --git a/libcudacxx/test/libcudacxx/cuda/ptx/ptx.cp.async.bulk.commit_group.compile.pass.cpp b/libcudacxx/test/libcudacxx/cuda/ptx/ptx.cp.async.bulk.commit_group.compile.pass.cpp index e7ff21c2730..1bf931109ed 100644 --- a/libcudacxx/test/libcudacxx/cuda/ptx/ptx.cp.async.bulk.commit_group.compile.pass.cpp +++ b/libcudacxx/test/libcudacxx/cuda/ptx/ptx.cp.async.bulk.commit_group.compile.pass.cpp @@ -14,6 +14,8 @@ #include #include +#include "nvrtc_workaround.h" +// above header needs to be included before the generated test header #include "generated/cp_async_bulk_commit_group.h" int main(int, char**) diff --git a/libcudacxx/test/libcudacxx/cuda/ptx/ptx.cp.async.bulk.compile.pass.cpp b/libcudacxx/test/libcudacxx/cuda/ptx/ptx.cp.async.bulk.compile.pass.cpp index fdd35749cc6..be56b1b922c 100644 --- a/libcudacxx/test/libcudacxx/cuda/ptx/ptx.cp.async.bulk.compile.pass.cpp +++ b/libcudacxx/test/libcudacxx/cuda/ptx/ptx.cp.async.bulk.compile.pass.cpp @@ -14,6 +14,8 @@ #include #include +#include "nvrtc_workaround.h" +// above header needs to be included before the generated test header #include "generated/cp_async_bulk.h" int main(int, char**) diff --git a/libcudacxx/test/libcudacxx/cuda/ptx/ptx.cp.async.bulk.multicast.compile.pass.cpp b/libcudacxx/test/libcudacxx/cuda/ptx/ptx.cp.async.bulk.multicast.compile.pass.cpp index ae1546828ae..226dbe5cf47 100644 --- a/libcudacxx/test/libcudacxx/cuda/ptx/ptx.cp.async.bulk.multicast.compile.pass.cpp +++ b/libcudacxx/test/libcudacxx/cuda/ptx/ptx.cp.async.bulk.multicast.compile.pass.cpp @@ -16,6 +16,8 @@ #include #include +#include "nvrtc_workaround.h" +// above header needs to be included before the generated test header #include "generated/cp_async_bulk_multicast.h" int main(int, char**) diff --git a/libcudacxx/test/libcudacxx/cuda/ptx/ptx.cp.async.bulk.tensor.compile.pass.cpp b/libcudacxx/test/libcudacxx/cuda/ptx/ptx.cp.async.bulk.tensor.compile.pass.cpp index eeb7b4bf5a5..42bc5b8e355 100644 --- a/libcudacxx/test/libcudacxx/cuda/ptx/ptx.cp.async.bulk.tensor.compile.pass.cpp +++ 
b/libcudacxx/test/libcudacxx/cuda/ptx/ptx.cp.async.bulk.tensor.compile.pass.cpp @@ -14,6 +14,8 @@ #include #include +#include "nvrtc_workaround.h" +// above header needs to be included before the generated test header #include "generated/cp_async_bulk_tensor.h" int main(int, char**) diff --git a/libcudacxx/test/libcudacxx/cuda/ptx/ptx.cp.async.bulk.tensor.multicast.compile.pass.cpp b/libcudacxx/test/libcudacxx/cuda/ptx/ptx.cp.async.bulk.tensor.multicast.compile.pass.cpp index d07351a2275..65172d72897 100644 --- a/libcudacxx/test/libcudacxx/cuda/ptx/ptx.cp.async.bulk.tensor.multicast.compile.pass.cpp +++ b/libcudacxx/test/libcudacxx/cuda/ptx/ptx.cp.async.bulk.tensor.multicast.compile.pass.cpp @@ -16,6 +16,8 @@ #include #include +#include "nvrtc_workaround.h" +// above header needs to be included before the generated test header #include "generated/cp_async_bulk_tensor_multicast.h" int main(int, char**) diff --git a/libcudacxx/test/libcudacxx/cuda/ptx/ptx.cp.async.bulk.wait_group.compile.pass.cpp b/libcudacxx/test/libcudacxx/cuda/ptx/ptx.cp.async.bulk.wait_group.compile.pass.cpp index 87910d04941..b31a9fb6a81 100644 --- a/libcudacxx/test/libcudacxx/cuda/ptx/ptx.cp.async.bulk.wait_group.compile.pass.cpp +++ b/libcudacxx/test/libcudacxx/cuda/ptx/ptx.cp.async.bulk.wait_group.compile.pass.cpp @@ -14,6 +14,8 @@ #include #include +#include "nvrtc_workaround.h" +// above header needs to be included before the generated test header #include "generated/cp_async_bulk_wait_group.h" int main(int, char**) diff --git a/libcudacxx/test/libcudacxx/cuda/ptx/ptx.cp.reduce.async.bulk.compile.pass.cpp b/libcudacxx/test/libcudacxx/cuda/ptx/ptx.cp.reduce.async.bulk.compile.pass.cpp index 8b916d74bf9..76a9357ae2f 100644 --- a/libcudacxx/test/libcudacxx/cuda/ptx/ptx.cp.reduce.async.bulk.compile.pass.cpp +++ b/libcudacxx/test/libcudacxx/cuda/ptx/ptx.cp.reduce.async.bulk.compile.pass.cpp @@ -14,6 +14,8 @@ #include #include +#include "nvrtc_workaround.h" +// above header needs to be included before the generated test header #include "generated/cp_reduce_async_bulk.h" #ifdef _LIBCUDACXX_HAS_NVF16 diff --git a/libcudacxx/test/libcudacxx/cuda/ptx/ptx.cp.reduce.async.bulk.tensor.compile.pass.cpp b/libcudacxx/test/libcudacxx/cuda/ptx/ptx.cp.reduce.async.bulk.tensor.compile.pass.cpp index f6a6fd61735..289f3dd9411 100644 --- a/libcudacxx/test/libcudacxx/cuda/ptx/ptx.cp.reduce.async.bulk.tensor.compile.pass.cpp +++ b/libcudacxx/test/libcudacxx/cuda/ptx/ptx.cp.reduce.async.bulk.tensor.compile.pass.cpp @@ -14,6 +14,8 @@ #include #include +#include "nvrtc_workaround.h" +// above header needs to be included before the generated test header #include "generated/cp_reduce_async_bulk_tensor.h" int main(int, char**) diff --git a/libcudacxx/test/libcudacxx/cuda/ptx/ptx.fence.compile.pass.cpp b/libcudacxx/test/libcudacxx/cuda/ptx/ptx.fence.compile.pass.cpp index 56f54b345f7..c439720b8f8 100644 --- a/libcudacxx/test/libcudacxx/cuda/ptx/ptx.fence.compile.pass.cpp +++ b/libcudacxx/test/libcudacxx/cuda/ptx/ptx.fence.compile.pass.cpp @@ -14,6 +14,8 @@ #include #include +#include "nvrtc_workaround.h" +// above header needs to be included before the generated test header #include "generated/fence.h" #include "generated/fence_mbarrier_init.h" #include "generated/fence_proxy_alias.h" diff --git a/libcudacxx/test/libcudacxx/cuda/ptx/ptx.get_sreg.compile.pass.cpp b/libcudacxx/test/libcudacxx/cuda/ptx/ptx.get_sreg.compile.pass.cpp index 91a6dd94bf1..adf6bb3e769 100644 --- a/libcudacxx/test/libcudacxx/cuda/ptx/ptx.get_sreg.compile.pass.cpp +++ 
b/libcudacxx/test/libcudacxx/cuda/ptx/ptx.get_sreg.compile.pass.cpp @@ -15,6 +15,8 @@ #include #include +#include "nvrtc_workaround.h" +// above header needs to be included before the generated test header #include "generated/get_sreg.h" int main(int, char**) diff --git a/libcudacxx/test/libcudacxx/cuda/ptx/ptx.getctarank.compile.pass.cpp b/libcudacxx/test/libcudacxx/cuda/ptx/ptx.getctarank.compile.pass.cpp index ed39816b7d6..9935b0563d2 100644 --- a/libcudacxx/test/libcudacxx/cuda/ptx/ptx.getctarank.compile.pass.cpp +++ b/libcudacxx/test/libcudacxx/cuda/ptx/ptx.getctarank.compile.pass.cpp @@ -14,6 +14,8 @@ #include #include +#include "nvrtc_workaround.h" +// above header needs to be included before the generated test header #include "generated/getctarank.h" int main(int, char**) diff --git a/libcudacxx/test/libcudacxx/cuda/ptx/ptx.mbarrier.arrive.compile.pass.cpp b/libcudacxx/test/libcudacxx/cuda/ptx/ptx.mbarrier.arrive.compile.pass.cpp index 93263910906..a0948e86b18 100644 --- a/libcudacxx/test/libcudacxx/cuda/ptx/ptx.mbarrier.arrive.compile.pass.cpp +++ b/libcudacxx/test/libcudacxx/cuda/ptx/ptx.mbarrier.arrive.compile.pass.cpp @@ -14,6 +14,8 @@ #include #include +#include "nvrtc_workaround.h" +// above header needs to be included before the generated test header #include "generated/mbarrier_arrive.h" #include "generated/mbarrier_arrive_expect_tx.h" #include "generated/mbarrier_arrive_no_complete.h" diff --git a/libcudacxx/test/libcudacxx/cuda/ptx/ptx.mbarrier.init.compile.pass.cpp b/libcudacxx/test/libcudacxx/cuda/ptx/ptx.mbarrier.init.compile.pass.cpp index 7af0db56b70..0583b4f6e29 100644 --- a/libcudacxx/test/libcudacxx/cuda/ptx/ptx.mbarrier.init.compile.pass.cpp +++ b/libcudacxx/test/libcudacxx/cuda/ptx/ptx.mbarrier.init.compile.pass.cpp @@ -14,6 +14,8 @@ #include #include +#include "nvrtc_workaround.h" +// above header needs to be included before the generated test header #include "generated/mbarrier_init.h" int main(int, char**) diff --git a/libcudacxx/test/libcudacxx/cuda/ptx/ptx.mbarrier.wait.compile.pass.cpp b/libcudacxx/test/libcudacxx/cuda/ptx/ptx.mbarrier.wait.compile.pass.cpp index 896abb8a7d8..732db4f16a1 100644 --- a/libcudacxx/test/libcudacxx/cuda/ptx/ptx.mbarrier.wait.compile.pass.cpp +++ b/libcudacxx/test/libcudacxx/cuda/ptx/ptx.mbarrier.wait.compile.pass.cpp @@ -14,10 +14,12 @@ #include #include +#include "nvrtc_workaround.h" +// above header needs to be included before the generated test header +#include "generated/mbarrier_test_wait.h" +#include "generated/mbarrier_test_wait_parity.h" #include "generated/mbarrier_try_wait.h" #include "generated/mbarrier_try_wait_parity.h" -#include "generated/mbarrier_wait.h" -#include "generated/mbarrier_wait_parity.h" int main(int, char**) { diff --git a/libcudacxx/test/libcudacxx/cuda/ptx/ptx.red.async.compile.pass.cpp b/libcudacxx/test/libcudacxx/cuda/ptx/ptx.red.async.compile.pass.cpp index c6f66503b1f..2993ba3893d 100644 --- a/libcudacxx/test/libcudacxx/cuda/ptx/ptx.red.async.compile.pass.cpp +++ b/libcudacxx/test/libcudacxx/cuda/ptx/ptx.red.async.compile.pass.cpp @@ -14,6 +14,8 @@ #include #include +#include "nvrtc_workaround.h" +// above header needs to be included before the generated test header #include "generated/red_async.h" int main(int, char**) diff --git a/libcudacxx/test/libcudacxx/cuda/ptx/ptx.st.async.compile.pass.cpp b/libcudacxx/test/libcudacxx/cuda/ptx/ptx.st.async.compile.pass.cpp index 7c008b77126..a833a3770f4 100644 --- a/libcudacxx/test/libcudacxx/cuda/ptx/ptx.st.async.compile.pass.cpp +++ 
b/libcudacxx/test/libcudacxx/cuda/ptx/ptx.st.async.compile.pass.cpp @@ -14,6 +14,8 @@ #include #include +#include "nvrtc_workaround.h" +// above header needs to be included before the generated test header #include "generated/st_async.h" int main(int, char**) diff --git a/libcudacxx/test/libcudacxx/cuda/ptx/ptx.tensormap.cp_fenceproxy.compile.pass.cpp b/libcudacxx/test/libcudacxx/cuda/ptx/ptx.tensormap.cp_fenceproxy.compile.pass.cpp index bb5578fc730..5d8566be5b5 100644 --- a/libcudacxx/test/libcudacxx/cuda/ptx/ptx.tensormap.cp_fenceproxy.compile.pass.cpp +++ b/libcudacxx/test/libcudacxx/cuda/ptx/ptx.tensormap.cp_fenceproxy.compile.pass.cpp @@ -14,6 +14,8 @@ #include #include +#include "nvrtc_workaround.h" +// above header needs to be included before the generated test header #include "generated/tensormap_cp_fenceproxy.h" int main(int, char**) diff --git a/libcudacxx/test/libcudacxx/cuda/ptx/ptx.tensormap.replace.compile.pass.cpp b/libcudacxx/test/libcudacxx/cuda/ptx/ptx.tensormap.replace.compile.pass.cpp index 264b7956fbb..f0c91aa2296 100644 --- a/libcudacxx/test/libcudacxx/cuda/ptx/ptx.tensormap.replace.compile.pass.cpp +++ b/libcudacxx/test/libcudacxx/cuda/ptx/ptx.tensormap.replace.compile.pass.cpp @@ -14,6 +14,8 @@ #include #include +#include "nvrtc_workaround.h" +// above header needs to be included before the generated test header #include "generated/tensormap_replace.h" int main(int, char**)