From 07fb9adef493d009cdb3fc1c1569efbe6e9ebba7 Mon Sep 17 00:00:00 2001
From: Bernhard Manfred Gruber
Date: Mon, 25 Nov 2024 15:26:27 +0100
Subject: [PATCH] Regenerate PTX header and docs

Overwrites all generated PTX header and documentation files and runs
`pre-commit run --all-files`
---
 .../generated/barrier_cluster.rst | 10 +-
 .../instructions/generated/cp_async_bulk.rst | 6 +-
 .../generated/cp_async_bulk_commit_group.rst | 2 +-
 .../generated/cp_async_bulk_multicast.rst | 2 +-
 .../generated/cp_async_bulk_tensor.rst | 20 ++--
 .../cp_async_bulk_tensor_multicast.rst | 10 +-
 .../generated/cp_reduce_async_bulk.rst | 52 ++++-----
 .../generated/cp_reduce_async_bulk_bf16.rst | 6 +-
 .../generated/cp_reduce_async_bulk_f16.rst | 6 +-
 .../generated/fence_mbarrier_init.rst | 2 +-
 .../generated/fence_proxy_alias.rst | 2 +-
 .../generated/fence_proxy_async.rst | 2 +-
 .../ptx/instructions/generated/getctarank.rst | 2 +-
 .../generated/mbarrier_arrive.rst | 8 +-
 .../generated/mbarrier_arrive_expect_tx.rst | 2 +-
 .../generated/mbarrier_arrive_no_complete.rst | 2 +-
 .../instructions/generated/mbarrier_init.rst | 2 +-
 .../generated/mbarrier_test_wait.rst | 2 +-
 .../generated/mbarrier_test_wait_parity.rst | 2 +-
 .../generated/mbarrier_try_wait.rst | 4 +-
 .../generated/mbarrier_try_wait_parity.rst | 4 +-
 .../ptx/instructions/generated/red_async.rst | 20 ++--
 .../cuda/__ptx/instructions/barrier_cluster.h | 2 +-
 .../cuda/__ptx/instructions/cp_async_bulk.h | 4 +-
 .../instructions/cp_async_bulk_commit_group.h | 2 +-
 .../__ptx/instructions/cp_async_bulk_tensor.h | 4 +-
 .../instructions/cp_async_bulk_wait_group.h | 2 +-
 .../__ptx/instructions/cp_reduce_async_bulk.h | 6 +-
 .../cp_reduce_async_bulk_tensor.h | 2 +-
 .../include/cuda/__ptx/instructions/fence.h | 10 +-
 ...{barrier_cluster.inc => barrier_cluster.h} | 15 ++-
 .../{cp_async_bulk.inc => cp_async_bulk.h} | 11 +-
 ...group.inc => cp_async_bulk_commit_group.h} | 7 +-
 ...ulticast.inc => cp_async_bulk_multicast.h} | 7 +-
 ...bulk_tensor.inc => cp_async_bulk_tensor.h} | 25 +++--
 ...t.inc => cp_async_bulk_tensor_multicast.h} | 15 ++-
 ...t_group.inc => cp_async_bulk_wait_group.h} | 13 ++-
 ..._async_bulk.inc => cp_reduce_async_bulk.h} | 65 +++++------
 ...k_bf16.inc => cp_reduce_async_bulk_bf16.h} | 25 +++--
 ...ulk_f16.inc => cp_reduce_async_bulk_f16.h} | 23 ++--
 ...nsor.inc => cp_reduce_async_bulk_tensor.h} | 75 +++++++------
 .../generated/{fence.inc => fence.h} | 17 +--
 ...barrier_init.inc => fence_mbarrier_init.h} | 7 +-
 ...ce_proxy_alias.inc => fence_proxy_alias.h} | 7 +-
 ...ce_proxy_async.inc => fence_proxy_async.h} | 11 +-
 ...ic.inc => fence_proxy_tensormap_generic.h} | 17 +--
 .../generated/{get_sreg.inc => get_sreg.h} | 101 +++++++++---------
 .../{getctarank.inc => getctarank.h} | 7 +-
 ...{mbarrier_arrive.inc => mbarrier_arrive.h} | 17 +--
 ...ect_tx.inc => mbarrier_arrive_expect_tx.h} | 9 +-
 ...lete.inc => mbarrier_arrive_no_complete.h} | 7 +-
 .../{mbarrier_init.inc => mbarrier_init.h} | 7 +-
 ...ier_test_wait.inc => mbarrier_test_wait.h} | 9 +-
 ...parity.inc => mbarrier_test_wait_parity.h} | 9 +-
 ...rrier_try_wait.inc => mbarrier_try_wait.h} | 13 ++-
 ..._parity.inc => mbarrier_try_wait_parity.h} | 13 ++-
 .../generated/{red_async.inc => red_async.h} | 25 +++--
 .../generated/{st_async.inc => st_async.h} | 9 +-
 ...nceproxy.inc => tensormap_cp_fenceproxy.h} | 11 +-
 ...sormap_replace.inc => tensormap_replace.h} | 5 +
 .../cuda/__ptx/instructions/get_sreg.h | 2 +-
 .../cuda/__ptx/instructions/getctarank.h | 2 +-
 .../cuda/__ptx/instructions/mbarrier_arrive.h | 6 +-
 .../cuda/__ptx/instructions/mbarrier_init.h | 2 +-
 .../cuda/__ptx/instructions/mbarrier_wait.h | 8 +-
 .../cuda/__ptx/instructions/red_async.h | 2 +-
 .../cuda/__ptx/instructions/st_async.h | 2 +-
 .../instructions/tensormap_cp_fenceproxy.h | 2 +-
 .../__ptx/instructions/tensormap_replace.h | 2 +-
 69 files changed, 478 insertions(+), 332 deletions(-)
 rename libcudacxx/include/cuda/__ptx/instructions/generated/{barrier_cluster.inc => barrier_cluster.h} (93%)
 rename libcudacxx/include/cuda/__ptx/instructions/generated/{cp_async_bulk.inc => cp_async_bulk.h} (94%)
 rename libcudacxx/include/cuda/__ptx/instructions/generated/{cp_async_bulk_commit_group.inc => cp_async_bulk_commit_group.h} (77%)
 rename libcudacxx/include/cuda/__ptx/instructions/generated/{cp_async_bulk_multicast.inc => cp_async_bulk_multicast.h} (88%)
 rename libcudacxx/include/cuda/__ptx/instructions/generated/{cp_async_bulk_tensor.inc => cp_async_bulk_tensor.h} (97%)
 rename libcudacxx/include/cuda/__ptx/instructions/generated/{cp_async_bulk_tensor_multicast.inc => cp_async_bulk_tensor_multicast.h} (96%)
 rename libcudacxx/include/cuda/__ptx/instructions/generated/{cp_async_bulk_wait_group.inc => cp_async_bulk_wait_group.h} (84%)
 rename libcudacxx/include/cuda/__ptx/instructions/generated/{cp_reduce_async_bulk.inc => cp_reduce_async_bulk.h} (98%)
 rename libcudacxx/include/cuda/__ptx/instructions/generated/{cp_reduce_async_bulk_bf16.inc => cp_reduce_async_bulk_bf16.h} (90%)
 rename libcudacxx/include/cuda/__ptx/instructions/generated/{cp_reduce_async_bulk_f16.inc => cp_reduce_async_bulk_f16.h} (90%)
 rename libcudacxx/include/cuda/__ptx/instructions/generated/{cp_reduce_async_bulk_tensor.inc => cp_reduce_async_bulk_tensor.h} (91%)
 rename libcudacxx/include/cuda/__ptx/instructions/generated/{fence.inc => fence.h} (83%)
 rename libcudacxx/include/cuda/__ptx/instructions/generated/{fence_mbarrier_init.inc => fence_mbarrier_init.h} (84%)
 rename libcudacxx/include/cuda/__ptx/instructions/generated/{fence_proxy_alias.inc => fence_proxy_alias.h} (79%)
 rename libcudacxx/include/cuda/__ptx/instructions/generated/{fence_proxy_async.inc => fence_proxy_async.h} (85%)
 rename libcudacxx/include/cuda/__ptx/instructions/generated/{fence_proxy_tensormap_generic.inc => fence_proxy_tensormap_generic.h} (86%)
 rename libcudacxx/include/cuda/__ptx/instructions/generated/{get_sreg.inc => get_sreg.h} (95%)
 rename libcudacxx/include/cuda/__ptx/instructions/generated/{getctarank.inc => getctarank.h} (85%)
 rename libcudacxx/include/cuda/__ptx/instructions/generated/{mbarrier_arrive.inc => mbarrier_arrive.h} (95%)
 rename libcudacxx/include/cuda/__ptx/instructions/generated/{mbarrier_arrive_expect_tx.inc => mbarrier_arrive_expect_tx.h} (92%)
 rename libcudacxx/include/cuda/__ptx/instructions/generated/{mbarrier_arrive_no_complete.inc => mbarrier_arrive_no_complete.h} (83%)
 rename libcudacxx/include/cuda/__ptx/instructions/generated/{mbarrier_init.inc => mbarrier_init.h} (82%)
 rename libcudacxx/include/cuda/__ptx/instructions/generated/{mbarrier_test_wait.inc => mbarrier_test_wait.h} (92%)
 rename libcudacxx/include/cuda/__ptx/instructions/generated/{mbarrier_test_wait_parity.inc => mbarrier_test_wait_parity.h} (91%)
 rename libcudacxx/include/cuda/__ptx/instructions/generated/{mbarrier_try_wait.inc => mbarrier_try_wait.h} (94%)
 rename libcudacxx/include/cuda/__ptx/instructions/generated/{mbarrier_try_wait_parity.inc => mbarrier_try_wait_parity.h} (94%)
 rename libcudacxx/include/cuda/__ptx/instructions/generated/{red_async.inc => red_async.h} (97%)
 rename
libcudacxx/include/cuda/__ptx/instructions/generated/{st_async.inc => st_async.h} (94%) rename libcudacxx/include/cuda/__ptx/instructions/generated/{tensormap_cp_fenceproxy.inc => tensormap_cp_fenceproxy.h} (86%) rename libcudacxx/include/cuda/__ptx/instructions/generated/{tensormap_replace.inc => tensormap_replace.h} (99%) diff --git a/docs/libcudacxx/ptx/instructions/generated/barrier_cluster.rst b/docs/libcudacxx/ptx/instructions/generated/barrier_cluster.rst index bd994990c05..5bc83e77e50 100644 --- a/docs/libcudacxx/ptx/instructions/generated/barrier_cluster.rst +++ b/docs/libcudacxx/ptx/instructions/generated/barrier_cluster.rst @@ -4,7 +4,7 @@ barrier.cluster.arrive // barrier.cluster.arrive; // PTX ISA 78, SM_90 // Marked volatile and as clobbering memory - template + template __device__ static inline void barrier_cluster_arrive(); barrier.cluster.wait @@ -13,7 +13,7 @@ barrier.cluster.wait // barrier.cluster.wait; // PTX ISA 78, SM_90 // Marked volatile and as clobbering memory - template + template __device__ static inline void barrier_cluster_wait(); barrier.cluster.arrive.release @@ -23,7 +23,7 @@ barrier.cluster.arrive.release // barrier.cluster.arrive.sem; // PTX ISA 80, SM_90 // .sem = { .release } // Marked volatile and as clobbering memory - template + template __device__ static inline void barrier_cluster_arrive( cuda::ptx::sem_release_t); @@ -34,7 +34,7 @@ barrier.cluster.arrive.relaxed // barrier.cluster.arrive.sem; // PTX ISA 80, SM_90 // .sem = { .relaxed } // Marked volatile - template + template __device__ static inline void barrier_cluster_arrive( cuda::ptx::sem_relaxed_t); @@ -45,6 +45,6 @@ barrier.cluster.wait.acquire // barrier.cluster.wait.sem; // PTX ISA 80, SM_90 // .sem = { .acquire } // Marked volatile and as clobbering memory - template + template __device__ static inline void barrier_cluster_wait( cuda::ptx::sem_acquire_t); diff --git a/docs/libcudacxx/ptx/instructions/generated/cp_async_bulk.rst b/docs/libcudacxx/ptx/instructions/generated/cp_async_bulk.rst index f5c236f8bf9..7a734ab2504 100644 --- a/docs/libcudacxx/ptx/instructions/generated/cp_async_bulk.rst +++ b/docs/libcudacxx/ptx/instructions/generated/cp_async_bulk.rst @@ -5,7 +5,7 @@ cp.async.bulk.shared::cluster.global.mbarrier::complete_tx::bytes // cp.async.bulk.dst.src.mbarrier::complete_tx::bytes [dstMem], [srcMem], size, [smem_bar]; // 1a. unicast PTX ISA 80, SM_90 // .dst = { .shared::cluster } // .src = { .global } - template + template __device__ static inline void cp_async_bulk( cuda::ptx::space_cluster_t, cuda::ptx::space_global_t, @@ -21,7 +21,7 @@ cp.async.bulk.shared::cluster.shared::cta.mbarrier::complete_tx::bytes // cp.async.bulk.dst.src.mbarrier::complete_tx::bytes [dstMem], [srcMem], size, [rdsmem_bar]; // 2. PTX ISA 80, SM_90 // .dst = { .shared::cluster } // .src = { .shared::cta } - template + template __device__ static inline void cp_async_bulk( cuda::ptx::space_cluster_t, cuda::ptx::space_shared_t, @@ -37,7 +37,7 @@ cp.async.bulk.global.shared::cta.bulk_group // cp.async.bulk.dst.src.bulk_group [dstMem], [srcMem], size; // 3. 
PTX ISA 80, SM_90 // .dst = { .global } // .src = { .shared::cta } - template + template __device__ static inline void cp_async_bulk( cuda::ptx::space_global_t, cuda::ptx::space_shared_t, diff --git a/docs/libcudacxx/ptx/instructions/generated/cp_async_bulk_commit_group.rst b/docs/libcudacxx/ptx/instructions/generated/cp_async_bulk_commit_group.rst index 984b4aff976..8f8a42716fc 100644 --- a/docs/libcudacxx/ptx/instructions/generated/cp_async_bulk_commit_group.rst +++ b/docs/libcudacxx/ptx/instructions/generated/cp_async_bulk_commit_group.rst @@ -3,5 +3,5 @@ cp.async.bulk.commit_group .. code:: cuda // cp.async.bulk.commit_group; // PTX ISA 80, SM_90 - template + template __device__ static inline void cp_async_bulk_commit_group(); diff --git a/docs/libcudacxx/ptx/instructions/generated/cp_async_bulk_multicast.rst b/docs/libcudacxx/ptx/instructions/generated/cp_async_bulk_multicast.rst index 9cb15d06fa3..b76a5de99ef 100644 --- a/docs/libcudacxx/ptx/instructions/generated/cp_async_bulk_multicast.rst +++ b/docs/libcudacxx/ptx/instructions/generated/cp_async_bulk_multicast.rst @@ -5,7 +5,7 @@ cp.async.bulk.shared::cluster.global.mbarrier::complete_tx::bytes.multicast::clu // cp.async.bulk{.dst}{.src}.mbarrier::complete_tx::bytes.multicast::cluster [dstMem], [srcMem], size, [smem_bar], ctaMask; // 1. PTX ISA 80, SM_90a // .dst = { .shared::cluster } // .src = { .global } - template + template __device__ static inline void cp_async_bulk( cuda::ptx::space_cluster_t, cuda::ptx::space_global_t, diff --git a/docs/libcudacxx/ptx/instructions/generated/cp_async_bulk_tensor.rst b/docs/libcudacxx/ptx/instructions/generated/cp_async_bulk_tensor.rst index 40eb070e66a..004acac4801 100644 --- a/docs/libcudacxx/ptx/instructions/generated/cp_async_bulk_tensor.rst +++ b/docs/libcudacxx/ptx/instructions/generated/cp_async_bulk_tensor.rst @@ -5,7 +5,7 @@ cp.async.bulk.tensor.1d.shared::cluster.global.tile.mbarrier::complete_tx::bytes // cp.async.bulk.tensor.1d.dst.src.tile.mbarrier::complete_tx::bytes [dstMem], [tensorMap, tensorCoords], [smem_bar];// 1a. PTX ISA 80, SM_90 // .dst = { .shared::cluster } // .src = { .global } - template + template __device__ static inline void cp_async_bulk_tensor( cuda::ptx::space_cluster_t, cuda::ptx::space_global_t, @@ -21,7 +21,7 @@ cp.async.bulk.tensor.1d.global.shared::cta.tile.bulk_group // cp.async.bulk.tensor.1d.dst.src.tile.bulk_group [tensorMap, tensorCoords], [srcMem]; // 3a. PTX ISA 80, SM_90 // .dst = { .global } // .src = { .shared::cta } - template + template __device__ static inline void cp_async_bulk_tensor( cuda::ptx::space_global_t, cuda::ptx::space_shared_t, @@ -36,7 +36,7 @@ cp.async.bulk.tensor.2d.shared::cluster.global.tile.mbarrier::complete_tx::bytes // cp.async.bulk.tensor.2d.dst.src.tile.mbarrier::complete_tx::bytes [dstMem], [tensorMap, tensorCoords], [smem_bar];// 1b. PTX ISA 80, SM_90 // .dst = { .shared::cluster } // .src = { .global } - template + template __device__ static inline void cp_async_bulk_tensor( cuda::ptx::space_cluster_t, cuda::ptx::space_global_t, @@ -52,7 +52,7 @@ cp.async.bulk.tensor.2d.global.shared::cta.tile.bulk_group // cp.async.bulk.tensor.2d.dst.src.tile.bulk_group [tensorMap, tensorCoords], [srcMem]; // 3b. 
PTX ISA 80, SM_90 // .dst = { .global } // .src = { .shared::cta } - template + template __device__ static inline void cp_async_bulk_tensor( cuda::ptx::space_global_t, cuda::ptx::space_shared_t, @@ -67,7 +67,7 @@ cp.async.bulk.tensor.3d.shared::cluster.global.tile.mbarrier::complete_tx::bytes // cp.async.bulk.tensor.3d.dst.src.tile.mbarrier::complete_tx::bytes [dstMem], [tensorMap, tensorCoords], [smem_bar];// 1c. PTX ISA 80, SM_90 // .dst = { .shared::cluster } // .src = { .global } - template + template __device__ static inline void cp_async_bulk_tensor( cuda::ptx::space_cluster_t, cuda::ptx::space_global_t, @@ -83,7 +83,7 @@ cp.async.bulk.tensor.3d.global.shared::cta.tile.bulk_group // cp.async.bulk.tensor.3d.dst.src.tile.bulk_group [tensorMap, tensorCoords], [srcMem]; // 3c. PTX ISA 80, SM_90 // .dst = { .global } // .src = { .shared::cta } - template + template __device__ static inline void cp_async_bulk_tensor( cuda::ptx::space_global_t, cuda::ptx::space_shared_t, @@ -98,7 +98,7 @@ cp.async.bulk.tensor.4d.shared::cluster.global.tile.mbarrier::complete_tx::bytes // cp.async.bulk.tensor.4d.dst.src.tile.mbarrier::complete_tx::bytes [dstMem], [tensorMap, tensorCoords], [smem_bar];// 1d. PTX ISA 80, SM_90 // .dst = { .shared::cluster } // .src = { .global } - template + template __device__ static inline void cp_async_bulk_tensor( cuda::ptx::space_cluster_t, cuda::ptx::space_global_t, @@ -114,7 +114,7 @@ cp.async.bulk.tensor.4d.global.shared::cta.tile.bulk_group // cp.async.bulk.tensor.4d.dst.src.tile.bulk_group [tensorMap, tensorCoords], [srcMem]; // 3d. PTX ISA 80, SM_90 // .dst = { .global } // .src = { .shared::cta } - template + template __device__ static inline void cp_async_bulk_tensor( cuda::ptx::space_global_t, cuda::ptx::space_shared_t, @@ -129,7 +129,7 @@ cp.async.bulk.tensor.5d.shared::cluster.global.tile.mbarrier::complete_tx::bytes // cp.async.bulk.tensor.5d.dst.src.tile.mbarrier::complete_tx::bytes [dstMem], [tensorMap, tensorCoords], [smem_bar];// 1e. PTX ISA 80, SM_90 // .dst = { .shared::cluster } // .src = { .global } - template + template __device__ static inline void cp_async_bulk_tensor( cuda::ptx::space_cluster_t, cuda::ptx::space_global_t, @@ -145,7 +145,7 @@ cp.async.bulk.tensor.5d.global.shared::cta.tile.bulk_group // cp.async.bulk.tensor.5d.dst.src.tile.bulk_group [tensorMap, tensorCoords], [srcMem]; // 3e. PTX ISA 80, SM_90 // .dst = { .global } // .src = { .shared::cta } - template + template __device__ static inline void cp_async_bulk_tensor( cuda::ptx::space_global_t, cuda::ptx::space_shared_t, diff --git a/docs/libcudacxx/ptx/instructions/generated/cp_async_bulk_tensor_multicast.rst b/docs/libcudacxx/ptx/instructions/generated/cp_async_bulk_tensor_multicast.rst index 2481c80bf3c..084ad54708f 100644 --- a/docs/libcudacxx/ptx/instructions/generated/cp_async_bulk_tensor_multicast.rst +++ b/docs/libcudacxx/ptx/instructions/generated/cp_async_bulk_tensor_multicast.rst @@ -5,7 +5,7 @@ cp.async.bulk.tensor.1d.shared::cluster.global.tile.mbarrier::complete_tx::bytes // cp.async.bulk.tensor.1d.dst.src.tile.mbarrier::complete_tx::bytes.multicast::cluster [dstMem], [tensorMap, tensorCoords], [smem_bar], ctaMask; // 2a. 
PTX ISA 80, SM_90a // .dst = { .shared::cluster } // .src = { .global } - template + template __device__ static inline void cp_async_bulk_tensor( cuda::ptx::space_cluster_t, cuda::ptx::space_global_t, @@ -22,7 +22,7 @@ cp.async.bulk.tensor.2d.shared::cluster.global.tile.mbarrier::complete_tx::bytes // cp.async.bulk.tensor.2d.dst.src.tile.mbarrier::complete_tx::bytes.multicast::cluster [dstMem], [tensorMap, tensorCoords], [smem_bar], ctaMask; // 2b. PTX ISA 80, SM_90a // .dst = { .shared::cluster } // .src = { .global } - template + template __device__ static inline void cp_async_bulk_tensor( cuda::ptx::space_cluster_t, cuda::ptx::space_global_t, @@ -39,7 +39,7 @@ cp.async.bulk.tensor.3d.shared::cluster.global.tile.mbarrier::complete_tx::bytes // cp.async.bulk.tensor.3d.dst.src.tile.mbarrier::complete_tx::bytes.multicast::cluster [dstMem], [tensorMap, tensorCoords], [smem_bar], ctaMask; // 2c. PTX ISA 80, SM_90a // .dst = { .shared::cluster } // .src = { .global } - template + template __device__ static inline void cp_async_bulk_tensor( cuda::ptx::space_cluster_t, cuda::ptx::space_global_t, @@ -56,7 +56,7 @@ cp.async.bulk.tensor.4d.shared::cluster.global.tile.mbarrier::complete_tx::bytes // cp.async.bulk.tensor.4d.dst.src.tile.mbarrier::complete_tx::bytes.multicast::cluster [dstMem], [tensorMap, tensorCoords], [smem_bar], ctaMask; // 2d. PTX ISA 80, SM_90a // .dst = { .shared::cluster } // .src = { .global } - template + template __device__ static inline void cp_async_bulk_tensor( cuda::ptx::space_cluster_t, cuda::ptx::space_global_t, @@ -73,7 +73,7 @@ cp.async.bulk.tensor.5d.shared::cluster.global.tile.mbarrier::complete_tx::bytes // cp.async.bulk.tensor.5d.dst.src.tile.mbarrier::complete_tx::bytes.multicast::cluster [dstMem], [tensorMap, tensorCoords], [smem_bar], ctaMask; // 2e. 
PTX ISA 80, SM_90a // .dst = { .shared::cluster } // .src = { .global } - template + template __device__ static inline void cp_async_bulk_tensor( cuda::ptx::space_cluster_t, cuda::ptx::space_global_t, diff --git a/docs/libcudacxx/ptx/instructions/generated/cp_reduce_async_bulk.rst b/docs/libcudacxx/ptx/instructions/generated/cp_reduce_async_bulk.rst index cc82d633375..f23664d6ad6 100644 --- a/docs/libcudacxx/ptx/instructions/generated/cp_reduce_async_bulk.rst +++ b/docs/libcudacxx/ptx/instructions/generated/cp_reduce_async_bulk.rst @@ -64,7 +64,7 @@ cp.reduce.async.bulk.shared::cluster.shared::cta.mbarrier::complete_tx::bytes.mi // .src = { .shared::cta } // .type = { .u32 } // .op = { .min } - template + template __device__ static inline void cp_reduce_async_bulk( cuda::ptx::space_cluster_t, cuda::ptx::space_shared_t, @@ -83,7 +83,7 @@ cp.reduce.async.bulk.shared::cluster.shared::cta.mbarrier::complete_tx::bytes.ma // .src = { .shared::cta } // .type = { .u32 } // .op = { .max } - template + template __device__ static inline void cp_reduce_async_bulk( cuda::ptx::space_cluster_t, cuda::ptx::space_shared_t, @@ -102,7 +102,7 @@ cp.reduce.async.bulk.shared::cluster.shared::cta.mbarrier::complete_tx::bytes.ad // .src = { .shared::cta } // .type = { .u32 } // .op = { .add } - template + template __device__ static inline void cp_reduce_async_bulk( cuda::ptx::space_cluster_t, cuda::ptx::space_shared_t, @@ -121,7 +121,7 @@ cp.reduce.async.bulk.shared::cluster.shared::cta.mbarrier::complete_tx::bytes.in // .src = { .shared::cta } // .type = { .u32 } // .op = { .inc } - template + template __device__ static inline void cp_reduce_async_bulk( cuda::ptx::space_cluster_t, cuda::ptx::space_shared_t, @@ -140,7 +140,7 @@ cp.reduce.async.bulk.shared::cluster.shared::cta.mbarrier::complete_tx::bytes.de // .src = { .shared::cta } // .type = { .u32 } // .op = { .dec } - template + template __device__ static inline void cp_reduce_async_bulk( cuda::ptx::space_cluster_t, cuda::ptx::space_shared_t, @@ -159,7 +159,7 @@ cp.reduce.async.bulk.shared::cluster.shared::cta.mbarrier::complete_tx::bytes.mi // .src = { .shared::cta } // .type = { .s32 } // .op = { .min } - template + template __device__ static inline void cp_reduce_async_bulk( cuda::ptx::space_cluster_t, cuda::ptx::space_shared_t, @@ -178,7 +178,7 @@ cp.reduce.async.bulk.shared::cluster.shared::cta.mbarrier::complete_tx::bytes.ma // .src = { .shared::cta } // .type = { .s32 } // .op = { .max } - template + template __device__ static inline void cp_reduce_async_bulk( cuda::ptx::space_cluster_t, cuda::ptx::space_shared_t, @@ -197,7 +197,7 @@ cp.reduce.async.bulk.shared::cluster.shared::cta.mbarrier::complete_tx::bytes.ad // .src = { .shared::cta } // .type = { .s32 } // .op = { .add } - template + template __device__ static inline void cp_reduce_async_bulk( cuda::ptx::space_cluster_t, cuda::ptx::space_shared_t, @@ -216,7 +216,7 @@ cp.reduce.async.bulk.shared::cluster.shared::cta.mbarrier::complete_tx::bytes.ad // .src = { .shared::cta } // .type = { .u64 } // .op = { .add } - template + template __device__ static inline void cp_reduce_async_bulk( cuda::ptx::space_cluster_t, cuda::ptx::space_shared_t, @@ -235,7 +235,7 @@ cp.reduce.async.bulk.shared::cluster.shared::cta.mbarrier::complete_tx::bytes.ad // .src = { .shared::cta } // .type = { .s64 } // .op = { .add } - template + template __device__ static inline void cp_reduce_async_bulk( cuda::ptx::space_cluster_t, cuda::ptx::space_shared_t, @@ -362,7 +362,7 @@ 
cp.reduce.async.bulk.global.shared::cta.bulk_group.min.u32 // .src = { .shared::cta } // .type = { .u32 } // .op = { .min } - template + template __device__ static inline void cp_reduce_async_bulk( cuda::ptx::space_global_t, cuda::ptx::space_shared_t, @@ -380,7 +380,7 @@ cp.reduce.async.bulk.global.shared::cta.bulk_group.max.u32 // .src = { .shared::cta } // .type = { .u32 } // .op = { .max } - template + template __device__ static inline void cp_reduce_async_bulk( cuda::ptx::space_global_t, cuda::ptx::space_shared_t, @@ -398,7 +398,7 @@ cp.reduce.async.bulk.global.shared::cta.bulk_group.add.u32 // .src = { .shared::cta } // .type = { .u32 } // .op = { .add } - template + template __device__ static inline void cp_reduce_async_bulk( cuda::ptx::space_global_t, cuda::ptx::space_shared_t, @@ -416,7 +416,7 @@ cp.reduce.async.bulk.global.shared::cta.bulk_group.inc.u32 // .src = { .shared::cta } // .type = { .u32 } // .op = { .inc } - template + template __device__ static inline void cp_reduce_async_bulk( cuda::ptx::space_global_t, cuda::ptx::space_shared_t, @@ -434,7 +434,7 @@ cp.reduce.async.bulk.global.shared::cta.bulk_group.dec.u32 // .src = { .shared::cta } // .type = { .u32 } // .op = { .dec } - template + template __device__ static inline void cp_reduce_async_bulk( cuda::ptx::space_global_t, cuda::ptx::space_shared_t, @@ -452,7 +452,7 @@ cp.reduce.async.bulk.global.shared::cta.bulk_group.min.s32 // .src = { .shared::cta } // .type = { .s32 } // .op = { .min } - template + template __device__ static inline void cp_reduce_async_bulk( cuda::ptx::space_global_t, cuda::ptx::space_shared_t, @@ -470,7 +470,7 @@ cp.reduce.async.bulk.global.shared::cta.bulk_group.max.s32 // .src = { .shared::cta } // .type = { .s32 } // .op = { .max } - template + template __device__ static inline void cp_reduce_async_bulk( cuda::ptx::space_global_t, cuda::ptx::space_shared_t, @@ -488,7 +488,7 @@ cp.reduce.async.bulk.global.shared::cta.bulk_group.add.s32 // .src = { .shared::cta } // .type = { .s32 } // .op = { .add } - template + template __device__ static inline void cp_reduce_async_bulk( cuda::ptx::space_global_t, cuda::ptx::space_shared_t, @@ -506,7 +506,7 @@ cp.reduce.async.bulk.global.shared::cta.bulk_group.min.u64 // .src = { .shared::cta } // .type = { .u64 } // .op = { .min } - template + template __device__ static inline void cp_reduce_async_bulk( cuda::ptx::space_global_t, cuda::ptx::space_shared_t, @@ -524,7 +524,7 @@ cp.reduce.async.bulk.global.shared::cta.bulk_group.max.u64 // .src = { .shared::cta } // .type = { .u64 } // .op = { .max } - template + template __device__ static inline void cp_reduce_async_bulk( cuda::ptx::space_global_t, cuda::ptx::space_shared_t, @@ -542,7 +542,7 @@ cp.reduce.async.bulk.global.shared::cta.bulk_group.add.u64 // .src = { .shared::cta } // .type = { .u64 } // .op = { .add } - template + template __device__ static inline void cp_reduce_async_bulk( cuda::ptx::space_global_t, cuda::ptx::space_shared_t, @@ -560,7 +560,7 @@ cp.reduce.async.bulk.global.shared::cta.bulk_group.min.s64 // .src = { .shared::cta } // .type = { .s64 } // .op = { .min } - template + template __device__ static inline void cp_reduce_async_bulk( cuda::ptx::space_global_t, cuda::ptx::space_shared_t, @@ -578,7 +578,7 @@ cp.reduce.async.bulk.global.shared::cta.bulk_group.max.s64 // .src = { .shared::cta } // .type = { .s64 } // .op = { .max } - template + template __device__ static inline void cp_reduce_async_bulk( cuda::ptx::space_global_t, cuda::ptx::space_shared_t, @@ -596,7 +596,7 @@ 
cp.reduce.async.bulk.global.shared::cta.bulk_group.add.f32 // .src = { .shared::cta } // .type = { .f32 } // .op = { .add } - template + template __device__ static inline void cp_reduce_async_bulk( cuda::ptx::space_global_t, cuda::ptx::space_shared_t, @@ -614,7 +614,7 @@ cp.reduce.async.bulk.global.shared::cta.bulk_group.add.f64 // .src = { .shared::cta } // .type = { .f64 } // .op = { .add } - template + template __device__ static inline void cp_reduce_async_bulk( cuda::ptx::space_global_t, cuda::ptx::space_shared_t, @@ -632,7 +632,7 @@ cp.reduce.async.bulk.global.shared::cta.bulk_group.add.u64 // .src = { .shared::cta } // .type = { .s64 } // .op = { .add } - template + template __device__ static inline void cp_reduce_async_bulk( cuda::ptx::space_global_t, cuda::ptx::space_shared_t, diff --git a/docs/libcudacxx/ptx/instructions/generated/cp_reduce_async_bulk_bf16.rst b/docs/libcudacxx/ptx/instructions/generated/cp_reduce_async_bulk_bf16.rst index e4dea98a119..756c9d591d3 100644 --- a/docs/libcudacxx/ptx/instructions/generated/cp_reduce_async_bulk_bf16.rst +++ b/docs/libcudacxx/ptx/instructions/generated/cp_reduce_async_bulk_bf16.rst @@ -7,7 +7,7 @@ cp.reduce.async.bulk.global.shared::cta.bulk_group.min.bf16 // .src = { .shared::cta } // .type = { .bf16 } // .op = { .min } - template + template __device__ static inline void cp_reduce_async_bulk( cuda::ptx::space_global_t, cuda::ptx::space_shared_t, @@ -25,7 +25,7 @@ cp.reduce.async.bulk.global.shared::cta.bulk_group.max.bf16 // .src = { .shared::cta } // .type = { .bf16 } // .op = { .max } - template + template __device__ static inline void cp_reduce_async_bulk( cuda::ptx::space_global_t, cuda::ptx::space_shared_t, @@ -43,7 +43,7 @@ cp.reduce.async.bulk.global.shared::cta.bulk_group.add.noftz.bf16 // .src = { .shared::cta } // .type = { .bf16 } // .op = { .add } - template + template __device__ static inline void cp_reduce_async_bulk( cuda::ptx::space_global_t, cuda::ptx::space_shared_t, diff --git a/docs/libcudacxx/ptx/instructions/generated/cp_reduce_async_bulk_f16.rst b/docs/libcudacxx/ptx/instructions/generated/cp_reduce_async_bulk_f16.rst index 18c5e0bfc60..f4121b2cc34 100644 --- a/docs/libcudacxx/ptx/instructions/generated/cp_reduce_async_bulk_f16.rst +++ b/docs/libcudacxx/ptx/instructions/generated/cp_reduce_async_bulk_f16.rst @@ -7,7 +7,7 @@ cp.reduce.async.bulk.global.shared::cta.bulk_group.min.f16 // .src = { .shared::cta } // .type = { .f16 } // .op = { .min } - template + template __device__ static inline void cp_reduce_async_bulk( cuda::ptx::space_global_t, cuda::ptx::space_shared_t, @@ -25,7 +25,7 @@ cp.reduce.async.bulk.global.shared::cta.bulk_group.max.f16 // .src = { .shared::cta } // .type = { .f16 } // .op = { .max } - template + template __device__ static inline void cp_reduce_async_bulk( cuda::ptx::space_global_t, cuda::ptx::space_shared_t, @@ -43,7 +43,7 @@ cp.reduce.async.bulk.global.shared::cta.bulk_group.add.noftz.f16 // .src = { .shared::cta } // .type = { .f16 } // .op = { .add } - template + template __device__ static inline void cp_reduce_async_bulk( cuda::ptx::space_global_t, cuda::ptx::space_shared_t, diff --git a/docs/libcudacxx/ptx/instructions/generated/fence_mbarrier_init.rst b/docs/libcudacxx/ptx/instructions/generated/fence_mbarrier_init.rst index 0f5298e3359..89513d30961 100644 --- a/docs/libcudacxx/ptx/instructions/generated/fence_mbarrier_init.rst +++ b/docs/libcudacxx/ptx/instructions/generated/fence_mbarrier_init.rst @@ -5,7 +5,7 @@ fence.mbarrier_init.release.cluster // 
fence.mbarrier_init.sem.scope; // 3. PTX ISA 80, SM_90 // .sem = { .release } // .scope = { .cluster } - template + template __device__ static inline void fence_mbarrier_init( cuda::ptx::sem_release_t, cuda::ptx::scope_cluster_t); diff --git a/docs/libcudacxx/ptx/instructions/generated/fence_proxy_alias.rst b/docs/libcudacxx/ptx/instructions/generated/fence_proxy_alias.rst index 935aab9b6df..eee6afdb932 100644 --- a/docs/libcudacxx/ptx/instructions/generated/fence_proxy_alias.rst +++ b/docs/libcudacxx/ptx/instructions/generated/fence_proxy_alias.rst @@ -3,5 +3,5 @@ fence.proxy.alias .. code:: cuda // fence.proxy.alias; // 4. PTX ISA 75, SM_70 - template + template __device__ static inline void fence_proxy_alias(); diff --git a/docs/libcudacxx/ptx/instructions/generated/fence_proxy_async.rst b/docs/libcudacxx/ptx/instructions/generated/fence_proxy_async.rst index 3e741a1f6c4..def6dddfca4 100644 --- a/docs/libcudacxx/ptx/instructions/generated/fence_proxy_async.rst +++ b/docs/libcudacxx/ptx/instructions/generated/fence_proxy_async.rst @@ -3,7 +3,7 @@ fence.proxy.async .. code:: cuda // fence.proxy.async; // 5. PTX ISA 80, SM_90 - template + template __device__ static inline void fence_proxy_async(); fence.proxy.async.global diff --git a/docs/libcudacxx/ptx/instructions/generated/getctarank.rst b/docs/libcudacxx/ptx/instructions/generated/getctarank.rst index c85f52ee302..9e577ad2982 100644 --- a/docs/libcudacxx/ptx/instructions/generated/getctarank.rst +++ b/docs/libcudacxx/ptx/instructions/generated/getctarank.rst @@ -4,7 +4,7 @@ getctarank.shared::cluster.u32 // getctarank{.space}.u32 dest, addr; // PTX ISA 78, SM_90 // .space = { .shared::cluster } - template + template __device__ static inline uint32_t getctarank( cuda::ptx::space_cluster_t, const void* addr); diff --git a/docs/libcudacxx/ptx/instructions/generated/mbarrier_arrive.rst b/docs/libcudacxx/ptx/instructions/generated/mbarrier_arrive.rst index 92cd106cad9..ba845c8e93f 100644 --- a/docs/libcudacxx/ptx/instructions/generated/mbarrier_arrive.rst +++ b/docs/libcudacxx/ptx/instructions/generated/mbarrier_arrive.rst @@ -3,7 +3,7 @@ mbarrier.arrive.shared.b64 .. code:: cuda // mbarrier.arrive.shared.b64 state, [addr]; // 1. PTX ISA 70, SM_80 - template + template __device__ static inline uint64_t mbarrier_arrive( uint64_t* addr); @@ -12,7 +12,7 @@ mbarrier.arrive.shared::cta.b64 .. code:: cuda // mbarrier.arrive.shared::cta.b64 state, [addr], count; // 2. 
PTX ISA 78, SM_90 - template + template __device__ static inline uint64_t mbarrier_arrive( uint64_t* addr, const uint32_t& count); @@ -87,7 +87,7 @@ mbarrier.arrive.release.cluster.shared::cluster.b64 // .sem = { .release } // .scope = { .cluster } // .space = { .shared::cluster } - template + template __device__ static inline void mbarrier_arrive( cuda::ptx::sem_release_t, cuda::ptx::scope_cluster_t, @@ -102,7 +102,7 @@ mbarrier.arrive.release.cluster.shared::cluster.b64 // .sem = { .release } // .scope = { .cluster } // .space = { .shared::cluster } - template + template __device__ static inline void mbarrier_arrive( cuda::ptx::sem_release_t, cuda::ptx::scope_cluster_t, diff --git a/docs/libcudacxx/ptx/instructions/generated/mbarrier_arrive_expect_tx.rst b/docs/libcudacxx/ptx/instructions/generated/mbarrier_arrive_expect_tx.rst index 0087ae2f458..5dc7eb93925 100644 --- a/docs/libcudacxx/ptx/instructions/generated/mbarrier_arrive_expect_tx.rst +++ b/docs/libcudacxx/ptx/instructions/generated/mbarrier_arrive_expect_tx.rst @@ -38,7 +38,7 @@ mbarrier.arrive.expect_tx.release.cluster.shared::cluster.b64 // .sem = { .release } // .scope = { .cluster } // .space = { .shared::cluster } - template + template __device__ static inline void mbarrier_arrive_expect_tx( cuda::ptx::sem_release_t, cuda::ptx::scope_cluster_t, diff --git a/docs/libcudacxx/ptx/instructions/generated/mbarrier_arrive_no_complete.rst b/docs/libcudacxx/ptx/instructions/generated/mbarrier_arrive_no_complete.rst index b6d7edbbeee..a18298f90e9 100644 --- a/docs/libcudacxx/ptx/instructions/generated/mbarrier_arrive_no_complete.rst +++ b/docs/libcudacxx/ptx/instructions/generated/mbarrier_arrive_no_complete.rst @@ -3,7 +3,7 @@ mbarrier.arrive.noComplete.shared.b64 .. code:: cuda // mbarrier.arrive.noComplete.shared.b64 state, [addr], count; // 5. PTX ISA 70, SM_80 - template + template __device__ static inline uint64_t mbarrier_arrive_no_complete( uint64_t* addr, const uint32_t& count); diff --git a/docs/libcudacxx/ptx/instructions/generated/mbarrier_init.rst b/docs/libcudacxx/ptx/instructions/generated/mbarrier_init.rst index 3e529d86d78..615bb24d437 100644 --- a/docs/libcudacxx/ptx/instructions/generated/mbarrier_init.rst +++ b/docs/libcudacxx/ptx/instructions/generated/mbarrier_init.rst @@ -3,7 +3,7 @@ mbarrier.init.shared.b64 .. code:: cuda // mbarrier.init.shared.b64 [addr], count; // PTX ISA 70, SM_80 - template + template __device__ static inline void mbarrier_init( uint64_t* addr, const uint32_t& count); diff --git a/docs/libcudacxx/ptx/instructions/generated/mbarrier_test_wait.rst b/docs/libcudacxx/ptx/instructions/generated/mbarrier_test_wait.rst index 4cb241c7ca8..731e3041e33 100644 --- a/docs/libcudacxx/ptx/instructions/generated/mbarrier_test_wait.rst +++ b/docs/libcudacxx/ptx/instructions/generated/mbarrier_test_wait.rst @@ -3,7 +3,7 @@ mbarrier.test_wait.shared.b64 .. code:: cuda // mbarrier.test_wait.shared.b64 waitComplete, [addr], state; // 1. PTX ISA 70, SM_80 - template + template __device__ static inline bool mbarrier_test_wait( uint64_t* addr, const uint64_t& state); diff --git a/docs/libcudacxx/ptx/instructions/generated/mbarrier_test_wait_parity.rst b/docs/libcudacxx/ptx/instructions/generated/mbarrier_test_wait_parity.rst index e750c4a543f..dded9d97516 100644 --- a/docs/libcudacxx/ptx/instructions/generated/mbarrier_test_wait_parity.rst +++ b/docs/libcudacxx/ptx/instructions/generated/mbarrier_test_wait_parity.rst @@ -3,7 +3,7 @@ mbarrier.test_wait.parity.shared.b64 .. 
code:: cuda // mbarrier.test_wait.parity.shared.b64 waitComplete, [addr], phaseParity; // 3. PTX ISA 71, SM_80 - template + template __device__ static inline bool mbarrier_test_wait_parity( uint64_t* addr, const uint32_t& phaseParity); diff --git a/docs/libcudacxx/ptx/instructions/generated/mbarrier_try_wait.rst b/docs/libcudacxx/ptx/instructions/generated/mbarrier_try_wait.rst index ce648c66ee9..1659b510bf7 100644 --- a/docs/libcudacxx/ptx/instructions/generated/mbarrier_try_wait.rst +++ b/docs/libcudacxx/ptx/instructions/generated/mbarrier_try_wait.rst @@ -3,7 +3,7 @@ mbarrier.try_wait.shared::cta.b64 .. code:: cuda // mbarrier.try_wait.shared::cta.b64 waitComplete, [addr], state; // 5a. PTX ISA 78, SM_90 - template + template __device__ static inline bool mbarrier_try_wait( uint64_t* addr, const uint64_t& state); @@ -13,7 +13,7 @@ mbarrier.try_wait.shared::cta.b64 .. code:: cuda // mbarrier.try_wait.shared::cta.b64 waitComplete, [addr], state, suspendTimeHint; // 5b. PTX ISA 78, SM_90 - template + template __device__ static inline bool mbarrier_try_wait( uint64_t* addr, const uint64_t& state, diff --git a/docs/libcudacxx/ptx/instructions/generated/mbarrier_try_wait_parity.rst b/docs/libcudacxx/ptx/instructions/generated/mbarrier_try_wait_parity.rst index 3210dc0eab1..0be63975238 100644 --- a/docs/libcudacxx/ptx/instructions/generated/mbarrier_try_wait_parity.rst +++ b/docs/libcudacxx/ptx/instructions/generated/mbarrier_try_wait_parity.rst @@ -3,7 +3,7 @@ mbarrier.try_wait.parity.shared::cta.b64 .. code:: cuda // mbarrier.try_wait.parity.shared::cta.b64 waitComplete, [addr], phaseParity; // 7a. PTX ISA 78, SM_90 - template + template __device__ static inline bool mbarrier_try_wait_parity( uint64_t* addr, const uint32_t& phaseParity); @@ -13,7 +13,7 @@ mbarrier.try_wait.parity.shared::cta.b64 .. code:: cuda // mbarrier.try_wait.parity.shared::cta.b64 waitComplete, [addr], phaseParity, suspendTimeHint; // 7b. 
PTX ISA 78, SM_90 - template + template __device__ static inline bool mbarrier_try_wait_parity( uint64_t* addr, const uint32_t& phaseParity, diff --git a/docs/libcudacxx/ptx/instructions/generated/red_async.rst b/docs/libcudacxx/ptx/instructions/generated/red_async.rst index d6b9cf36549..c1a8a9bf2f8 100644 --- a/docs/libcudacxx/ptx/instructions/generated/red_async.rst +++ b/docs/libcudacxx/ptx/instructions/generated/red_async.rst @@ -5,7 +5,7 @@ red.async.relaxed.cluster.shared::cluster.mbarrier::complete_tx::bytes.inc.u32 // red.async.relaxed.cluster.shared::cluster.mbarrier::complete_tx::bytes{.op}{.type} [dest], value, [remote_bar]; // PTX ISA 81, SM_90 // .type = { .u32 } // .op = { .inc } - template + template __device__ static inline void red_async( cuda::ptx::op_inc_t, uint32_t* dest, @@ -19,7 +19,7 @@ red.async.relaxed.cluster.shared::cluster.mbarrier::complete_tx::bytes.dec.u32 // red.async.relaxed.cluster.shared::cluster.mbarrier::complete_tx::bytes{.op}{.type} [dest], value, [remote_bar]; // PTX ISA 81, SM_90 // .type = { .u32 } // .op = { .dec } - template + template __device__ static inline void red_async( cuda::ptx::op_dec_t, uint32_t* dest, @@ -33,7 +33,7 @@ red.async.relaxed.cluster.shared::cluster.mbarrier::complete_tx::bytes.min.u32 // red.async.relaxed.cluster.shared::cluster.mbarrier::complete_tx::bytes{.op}{.type} [dest], value, [remote_bar]; // PTX ISA 81, SM_90 // .type = { .u32 } // .op = { .min } - template + template __device__ static inline void red_async( cuda::ptx::op_min_t, uint32_t* dest, @@ -47,7 +47,7 @@ red.async.relaxed.cluster.shared::cluster.mbarrier::complete_tx::bytes.max.u32 // red.async.relaxed.cluster.shared::cluster.mbarrier::complete_tx::bytes{.op}{.type} [dest], value, [remote_bar]; // PTX ISA 81, SM_90 // .type = { .u32 } // .op = { .max } - template + template __device__ static inline void red_async( cuda::ptx::op_max_t, uint32_t* dest, @@ -61,7 +61,7 @@ red.async.relaxed.cluster.shared::cluster.mbarrier::complete_tx::bytes.add.u32 // red.async.relaxed.cluster.shared::cluster.mbarrier::complete_tx::bytes{.op}{.type} [dest], value, [remote_bar]; // PTX ISA 81, SM_90 // .type = { .u32 } // .op = { .add } - template + template __device__ static inline void red_async( cuda::ptx::op_add_t, uint32_t* dest, @@ -75,7 +75,7 @@ red.async.relaxed.cluster.shared::cluster.mbarrier::complete_tx::bytes.min.s32 // red.async.relaxed.cluster.shared::cluster.mbarrier::complete_tx::bytes{.op}{.type} [dest], value, [remote_bar]; // PTX ISA 81, SM_90 // .type = { .s32 } // .op = { .min } - template + template __device__ static inline void red_async( cuda::ptx::op_min_t, int32_t* dest, @@ -89,7 +89,7 @@ red.async.relaxed.cluster.shared::cluster.mbarrier::complete_tx::bytes.max.s32 // red.async.relaxed.cluster.shared::cluster.mbarrier::complete_tx::bytes{.op}{.type} [dest], value, [remote_bar]; // PTX ISA 81, SM_90 // .type = { .s32 } // .op = { .max } - template + template __device__ static inline void red_async( cuda::ptx::op_max_t, int32_t* dest, @@ -103,7 +103,7 @@ red.async.relaxed.cluster.shared::cluster.mbarrier::complete_tx::bytes.add.s32 // red.async.relaxed.cluster.shared::cluster.mbarrier::complete_tx::bytes{.op}{.type} [dest], value, [remote_bar]; // PTX ISA 81, SM_90 // .type = { .s32 } // .op = { .add } - template + template __device__ static inline void red_async( cuda::ptx::op_add_t, int32_t* dest, @@ -159,7 +159,7 @@ red.async.relaxed.cluster.shared::cluster.mbarrier::complete_tx::bytes.add.u64 // 
red.async.relaxed.cluster.shared::cluster.mbarrier::complete_tx::bytes{.op}{.type} [dest], value, [remote_bar]; // PTX ISA 81, SM_90 // .type = { .u64 } // .op = { .add } - template + template __device__ static inline void red_async( cuda::ptx::op_add_t, uint64_t* dest, @@ -172,7 +172,7 @@ red.async.relaxed.cluster.shared::cluster.mbarrier::complete_tx::bytes.add.u64 // red.async.relaxed.cluster.shared::cluster.mbarrier::complete_tx::bytes{.op}.u64 [dest], value, [remote_bar]; // .u64 intentional PTX ISA 81, SM_90 // .op = { .add } - template + template __device__ static inline void red_async( cuda::ptx::op_add_t, int64_t* dest, diff --git a/libcudacxx/include/cuda/__ptx/instructions/barrier_cluster.h b/libcudacxx/include/cuda/__ptx/instructions/barrier_cluster.h index 8b09ddd1110..93b6a06037c 100644 --- a/libcudacxx/include/cuda/__ptx/instructions/barrier_cluster.h +++ b/libcudacxx/include/cuda/__ptx/instructions/barrier_cluster.h @@ -32,7 +32,7 @@ _LIBCUDACXX_BEGIN_NAMESPACE_CUDA_PTX // 9.7.12.3. Parallel Synchronization and Communication Instructions: barrier.cluster // https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#parallel-synchronization-and-communication-instructions-barrier-cluster -#include +#include _LIBCUDACXX_END_NAMESPACE_CUDA_PTX diff --git a/libcudacxx/include/cuda/__ptx/instructions/cp_async_bulk.h b/libcudacxx/include/cuda/__ptx/instructions/cp_async_bulk.h index 480a02a701e..abfba441ac9 100644 --- a/libcudacxx/include/cuda/__ptx/instructions/cp_async_bulk.h +++ b/libcudacxx/include/cuda/__ptx/instructions/cp_async_bulk.h @@ -32,8 +32,8 @@ _LIBCUDACXX_BEGIN_NAMESPACE_CUDA_PTX // 9.7.8.24.6. Data Movement and Conversion Instructions: cp.async.bulk // https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#data-movement-and-conversion-instructions-cp-async-bulk -#include -#include +#include +#include _LIBCUDACXX_END_NAMESPACE_CUDA_PTX diff --git a/libcudacxx/include/cuda/__ptx/instructions/cp_async_bulk_commit_group.h b/libcudacxx/include/cuda/__ptx/instructions/cp_async_bulk_commit_group.h index bd97259cf19..f9320e975f2 100644 --- a/libcudacxx/include/cuda/__ptx/instructions/cp_async_bulk_commit_group.h +++ b/libcudacxx/include/cuda/__ptx/instructions/cp_async_bulk_commit_group.h @@ -32,7 +32,7 @@ _LIBCUDACXX_BEGIN_NAMESPACE_CUDA_PTX // 9.7.8.24.12. Data Movement and Conversion Instructions: cp.async.bulk.commit_group // https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#data-movement-and-conversion-instructions-cp-async-bulk-commit-group -#include +#include _LIBCUDACXX_END_NAMESPACE_CUDA_PTX diff --git a/libcudacxx/include/cuda/__ptx/instructions/cp_async_bulk_tensor.h b/libcudacxx/include/cuda/__ptx/instructions/cp_async_bulk_tensor.h index 5b9f575ce5f..7de5b41b744 100644 --- a/libcudacxx/include/cuda/__ptx/instructions/cp_async_bulk_tensor.h +++ b/libcudacxx/include/cuda/__ptx/instructions/cp_async_bulk_tensor.h @@ -32,8 +32,8 @@ _LIBCUDACXX_BEGIN_NAMESPACE_CUDA_PTX // 9.7.8.24.9. 
Data Movement and Conversion Instructions: cp.async.bulk.tensor // https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#data-movement-and-conversion-instructions-cp-async-bulk-tensor -#include -#include +#include +#include _LIBCUDACXX_END_NAMESPACE_CUDA_PTX diff --git a/libcudacxx/include/cuda/__ptx/instructions/cp_async_bulk_wait_group.h b/libcudacxx/include/cuda/__ptx/instructions/cp_async_bulk_wait_group.h index 00a3700e1a9..0d933e2cc34 100644 --- a/libcudacxx/include/cuda/__ptx/instructions/cp_async_bulk_wait_group.h +++ b/libcudacxx/include/cuda/__ptx/instructions/cp_async_bulk_wait_group.h @@ -32,7 +32,7 @@ _LIBCUDACXX_BEGIN_NAMESPACE_CUDA_PTX // 9.7.8.24.13. Data Movement and Conversion Instructions: cp.async.bulk.wait_group // https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#data-movement-and-conversion-instructions-cp-async-bulk-wait-group -#include +#include _LIBCUDACXX_END_NAMESPACE_CUDA_PTX diff --git a/libcudacxx/include/cuda/__ptx/instructions/cp_reduce_async_bulk.h b/libcudacxx/include/cuda/__ptx/instructions/cp_reduce_async_bulk.h index ee6d90bc4d9..f1487301ada 100644 --- a/libcudacxx/include/cuda/__ptx/instructions/cp_reduce_async_bulk.h +++ b/libcudacxx/include/cuda/__ptx/instructions/cp_reduce_async_bulk.h @@ -43,12 +43,12 @@ _LIBCUDACXX_BEGIN_NAMESPACE_CUDA_PTX // 9.7.8.24.7. Data Movement and Conversion Instructions: cp.reduce.async.bulk // https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#data-movement-and-conversion-instructions-cp-reduce-async-bulk -#include +#include #ifdef _LIBCUDACXX_HAS_NVF16 -# include +# include #endif // _LIBCUDACXX_HAS_NVF16 #ifdef _LIBCUDACXX_HAS_NVBF16 -# include +# include #endif // _LIBCUDACXX_HAS_NVBF16 _LIBCUDACXX_END_NAMESPACE_CUDA_PTX diff --git a/libcudacxx/include/cuda/__ptx/instructions/cp_reduce_async_bulk_tensor.h b/libcudacxx/include/cuda/__ptx/instructions/cp_reduce_async_bulk_tensor.h index a6b23a706c7..436c42d4c3f 100644 --- a/libcudacxx/include/cuda/__ptx/instructions/cp_reduce_async_bulk_tensor.h +++ b/libcudacxx/include/cuda/__ptx/instructions/cp_reduce_async_bulk_tensor.h @@ -32,7 +32,7 @@ _LIBCUDACXX_BEGIN_NAMESPACE_CUDA_PTX // 9.7.8.24.10. Data Movement and Conversion Instructions: cp.reduce.async.bulk.tensor // https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#data-movement-and-conversion-instructions-cp-reduce-async-bulk-tensor -#include +#include _LIBCUDACXX_END_NAMESPACE_CUDA_PTX diff --git a/libcudacxx/include/cuda/__ptx/instructions/fence.h b/libcudacxx/include/cuda/__ptx/instructions/fence.h index 045f09cb40e..a8dccf979c2 100644 --- a/libcudacxx/include/cuda/__ptx/instructions/fence.h +++ b/libcudacxx/include/cuda/__ptx/instructions/fence.h @@ -32,11 +32,11 @@ _LIBCUDACXX_BEGIN_NAMESPACE_CUDA_PTX // 9.7.12.4. 
Parallel Synchronization and Communication Instructions: membar/fence // https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#parallel-synchronization-and-communication-instructions-membar-fence -#include -#include -#include -#include -#include +#include +#include +#include +#include +#include _LIBCUDACXX_END_NAMESPACE_CUDA_PTX diff --git a/libcudacxx/include/cuda/__ptx/instructions/generated/barrier_cluster.inc b/libcudacxx/include/cuda/__ptx/instructions/generated/barrier_cluster.h similarity index 93% rename from libcudacxx/include/cuda/__ptx/instructions/generated/barrier_cluster.inc rename to libcudacxx/include/cuda/__ptx/instructions/generated/barrier_cluster.h index ca9238bc3ff..976bbfead31 100644 --- a/libcudacxx/include/cuda/__ptx/instructions/generated/barrier_cluster.inc +++ b/libcudacxx/include/cuda/__ptx/instructions/generated/barrier_cluster.h @@ -1,7 +1,10 @@ +#ifndef _CUDA_PTX_GENERATED_BARRIER_CLUSTER_H_ +#define _CUDA_PTX_GENERATED_BARRIER_CLUSTER_H_ + /* // barrier.cluster.arrive; // PTX ISA 78, SM_90 // Marked volatile and as clobbering memory -template +template __device__ static inline void barrier_cluster_arrive(); */ #if __cccl_ptx_isa >= 780 @@ -24,7 +27,7 @@ _CCCL_DEVICE static inline void barrier_cluster_arrive() /* // barrier.cluster.wait; // PTX ISA 78, SM_90 // Marked volatile and as clobbering memory -template +template __device__ static inline void barrier_cluster_wait(); */ #if __cccl_ptx_isa >= 780 @@ -48,7 +51,7 @@ _CCCL_DEVICE static inline void barrier_cluster_wait() // barrier.cluster.arrive.sem; // PTX ISA 80, SM_90 // .sem = { .release } // Marked volatile and as clobbering memory -template +template __device__ static inline void barrier_cluster_arrive( cuda::ptx::sem_release_t); */ @@ -74,7 +77,7 @@ _CCCL_DEVICE static inline void barrier_cluster_arrive(sem_release_t) // barrier.cluster.arrive.sem; // PTX ISA 80, SM_90 // .sem = { .relaxed } // Marked volatile -template +template __device__ static inline void barrier_cluster_arrive( cuda::ptx::sem_relaxed_t); */ @@ -100,7 +103,7 @@ _CCCL_DEVICE static inline void barrier_cluster_arrive(sem_relaxed_t) // barrier.cluster.wait.sem; // PTX ISA 80, SM_90 // .sem = { .acquire } // Marked volatile and as clobbering memory -template +template __device__ static inline void barrier_cluster_wait( cuda::ptx::sem_acquire_t); */ @@ -121,3 +124,5 @@ _CCCL_DEVICE static inline void barrier_cluster_wait(sem_acquire_t) __cuda_ptx_barrier_cluster_wait_is_not_supported_before_SM_90__();)); } #endif // __cccl_ptx_isa >= 800 + +#endif // _CUDA_PTX_GENERATED_BARRIER_CLUSTER_H_ diff --git a/libcudacxx/include/cuda/__ptx/instructions/generated/cp_async_bulk.inc b/libcudacxx/include/cuda/__ptx/instructions/generated/cp_async_bulk.h similarity index 94% rename from libcudacxx/include/cuda/__ptx/instructions/generated/cp_async_bulk.inc rename to libcudacxx/include/cuda/__ptx/instructions/generated/cp_async_bulk.h index 69f77053b95..60eccae6818 100644 --- a/libcudacxx/include/cuda/__ptx/instructions/generated/cp_async_bulk.inc +++ b/libcudacxx/include/cuda/__ptx/instructions/generated/cp_async_bulk.h @@ -1,9 +1,12 @@ +#ifndef _CUDA_PTX_GENERATED_CP_ASYNC_BULK_H_ +#define _CUDA_PTX_GENERATED_CP_ASYNC_BULK_H_ + /* // cp.async.bulk.dst.src.mbarrier::complete_tx::bytes [dstMem], [srcMem], size, [smem_bar]; // 1a. 
unicast PTX ISA 80, SM_90 // .dst = { .shared::cluster } // .src = { .global } -template +template __device__ static inline void cp_async_bulk( cuda::ptx::space_cluster_t, cuda::ptx::space_global_t, @@ -41,7 +44,7 @@ _CCCL_DEVICE static inline void cp_async_bulk( // cp.async.bulk.dst.src.mbarrier::complete_tx::bytes [dstMem], [srcMem], size, [rdsmem_bar]; // 2. PTX ISA 80, SM_90 // .dst = { .shared::cluster } // .src = { .shared::cta } -template +template __device__ static inline void cp_async_bulk( cuda::ptx::space_cluster_t, cuda::ptx::space_shared_t, @@ -82,7 +85,7 @@ _CCCL_DEVICE static inline void cp_async_bulk( // cp.async.bulk.dst.src.bulk_group [dstMem], [srcMem], size; // 3. PTX ISA 80, SM_90 // .dst = { .global } // .src = { .shared::cta } -template +template __device__ static inline void cp_async_bulk( cuda::ptx::space_global_t, cuda::ptx::space_shared_t, @@ -109,3 +112,5 @@ cp_async_bulk(space_global_t, space_shared_t, void* __dstMem, const void* __srcM __cuda_ptx_cp_async_bulk_is_not_supported_before_SM_90__();)); } #endif // __cccl_ptx_isa >= 800 + +#endif // _CUDA_PTX_GENERATED_CP_ASYNC_BULK_H_ diff --git a/libcudacxx/include/cuda/__ptx/instructions/generated/cp_async_bulk_commit_group.inc b/libcudacxx/include/cuda/__ptx/instructions/generated/cp_async_bulk_commit_group.h similarity index 77% rename from libcudacxx/include/cuda/__ptx/instructions/generated/cp_async_bulk_commit_group.inc rename to libcudacxx/include/cuda/__ptx/instructions/generated/cp_async_bulk_commit_group.h index 24baddaea8f..46624839d9f 100644 --- a/libcudacxx/include/cuda/__ptx/instructions/generated/cp_async_bulk_commit_group.inc +++ b/libcudacxx/include/cuda/__ptx/instructions/generated/cp_async_bulk_commit_group.h @@ -1,6 +1,9 @@ +#ifndef _CUDA_PTX_GENERATED_CP_ASYNC_BULK_COMMIT_GROUP_H_ +#define _CUDA_PTX_GENERATED_CP_ASYNC_BULK_COMMIT_GROUP_H_ + /* // cp.async.bulk.commit_group; // PTX ISA 80, SM_90 -template +template __device__ static inline void cp_async_bulk_commit_group(); */ #if __cccl_ptx_isa >= 800 @@ -19,3 +22,5 @@ _CCCL_DEVICE static inline void cp_async_bulk_commit_group() __cuda_ptx_cp_async_bulk_commit_group_is_not_supported_before_SM_90__();)); } #endif // __cccl_ptx_isa >= 800 + +#endif // _CUDA_PTX_GENERATED_CP_ASYNC_BULK_COMMIT_GROUP_H_ diff --git a/libcudacxx/include/cuda/__ptx/instructions/generated/cp_async_bulk_multicast.inc b/libcudacxx/include/cuda/__ptx/instructions/generated/cp_async_bulk_multicast.h similarity index 88% rename from libcudacxx/include/cuda/__ptx/instructions/generated/cp_async_bulk_multicast.inc rename to libcudacxx/include/cuda/__ptx/instructions/generated/cp_async_bulk_multicast.h index cdd5a535eb6..1eec2156ee5 100644 --- a/libcudacxx/include/cuda/__ptx/instructions/generated/cp_async_bulk_multicast.inc +++ b/libcudacxx/include/cuda/__ptx/instructions/generated/cp_async_bulk_multicast.h @@ -1,9 +1,12 @@ +#ifndef _CUDA_PTX_GENERATED_CP_ASYNC_BULK_MULTICAST_H_ +#define _CUDA_PTX_GENERATED_CP_ASYNC_BULK_MULTICAST_H_ + /* // cp.async.bulk{.dst}{.src}.mbarrier::complete_tx::bytes.multicast::cluster [dstMem], [srcMem], size, [smem_bar], ctaMask; // 1. 
PTX ISA 80, SM_90a // .dst = { .shared::cluster } // .src = { .global } -template +template __device__ static inline void cp_async_bulk( cuda::ptx::space_cluster_t, cuda::ptx::space_global_t, @@ -43,3 +46,5 @@ _CCCL_DEVICE static inline void cp_async_bulk( __cuda_ptx_cp_async_bulk_is_not_supported_before_SM_90a__();)); } #endif // __cccl_ptx_isa >= 800 + +#endif // _CUDA_PTX_GENERATED_CP_ASYNC_BULK_MULTICAST_H_ diff --git a/libcudacxx/include/cuda/__ptx/instructions/generated/cp_async_bulk_tensor.inc b/libcudacxx/include/cuda/__ptx/instructions/generated/cp_async_bulk_tensor.h similarity index 97% rename from libcudacxx/include/cuda/__ptx/instructions/generated/cp_async_bulk_tensor.inc rename to libcudacxx/include/cuda/__ptx/instructions/generated/cp_async_bulk_tensor.h index 547888d5b0f..ed10727d60c 100644 --- a/libcudacxx/include/cuda/__ptx/instructions/generated/cp_async_bulk_tensor.inc +++ b/libcudacxx/include/cuda/__ptx/instructions/generated/cp_async_bulk_tensor.h @@ -1,9 +1,12 @@ +#ifndef _CUDA_PTX_GENERATED_CP_ASYNC_BULK_TENSOR_H_ +#define _CUDA_PTX_GENERATED_CP_ASYNC_BULK_TENSOR_H_ + /* // cp.async.bulk.tensor.1d.dst.src.tile.mbarrier::complete_tx::bytes [dstMem], [tensorMap, tensorCoords], [smem_bar];// 1a. PTX ISA 80, SM_90 // .dst = { .shared::cluster } // .src = { .global } -template +template __device__ static inline void cp_async_bulk_tensor( cuda::ptx::space_cluster_t, cuda::ptx::space_global_t, @@ -42,7 +45,7 @@ _CCCL_DEVICE static inline void cp_async_bulk_tensor( // cp.async.bulk.tensor.1d.dst.src.tile.bulk_group [tensorMap, tensorCoords], [srcMem]; // 3a. PTX ISA 80, SM_90 // .dst = { .global } // .src = { .shared::cta } -template +template __device__ static inline void cp_async_bulk_tensor( cuda::ptx::space_global_t, cuda::ptx::space_shared_t, @@ -79,7 +82,7 @@ _CCCL_DEVICE static inline void cp_async_bulk_tensor( 1b. PTX ISA 80, SM_90 // .dst = { .shared::cluster } // .src = { .global } -template +template __device__ static inline void cp_async_bulk_tensor( cuda::ptx::space_cluster_t, cuda::ptx::space_global_t, @@ -122,7 +125,7 @@ _CCCL_DEVICE static inline void cp_async_bulk_tensor( // cp.async.bulk.tensor.2d.dst.src.tile.bulk_group [tensorMap, tensorCoords], [srcMem]; // 3b. PTX ISA 80, SM_90 // .dst = { .global } // .src = { .shared::cta } -template +template __device__ static inline void cp_async_bulk_tensor( cuda::ptx::space_global_t, cuda::ptx::space_shared_t, @@ -159,7 +162,7 @@ _CCCL_DEVICE static inline void cp_async_bulk_tensor( 1c. PTX ISA 80, SM_90 // .dst = { .shared::cluster } // .src = { .global } -template +template __device__ static inline void cp_async_bulk_tensor( cuda::ptx::space_cluster_t, cuda::ptx::space_global_t, @@ -203,7 +206,7 @@ _CCCL_DEVICE static inline void cp_async_bulk_tensor( // cp.async.bulk.tensor.3d.dst.src.tile.bulk_group [tensorMap, tensorCoords], [srcMem]; // 3c. PTX ISA 80, SM_90 // .dst = { .global } // .src = { .shared::cta } -template +template __device__ static inline void cp_async_bulk_tensor( cuda::ptx::space_global_t, cuda::ptx::space_shared_t, @@ -244,7 +247,7 @@ _CCCL_DEVICE static inline void cp_async_bulk_tensor( 1d. PTX ISA 80, SM_90 // .dst = { .shared::cluster } // .src = { .global } -template +template __device__ static inline void cp_async_bulk_tensor( cuda::ptx::space_cluster_t, cuda::ptx::space_global_t, @@ -289,7 +292,7 @@ _CCCL_DEVICE static inline void cp_async_bulk_tensor( // cp.async.bulk.tensor.4d.dst.src.tile.bulk_group [tensorMap, tensorCoords], [srcMem]; // 3d. 
PTX ISA 80, SM_90 // .dst = { .global } // .src = { .shared::cta } -template +template __device__ static inline void cp_async_bulk_tensor( cuda::ptx::space_global_t, cuda::ptx::space_shared_t, @@ -331,7 +334,7 @@ _CCCL_DEVICE static inline void cp_async_bulk_tensor( 1e. PTX ISA 80, SM_90 // .dst = { .shared::cluster } // .src = { .global } -template +template __device__ static inline void cp_async_bulk_tensor( cuda::ptx::space_cluster_t, cuda::ptx::space_global_t, @@ -377,7 +380,7 @@ _CCCL_DEVICE static inline void cp_async_bulk_tensor( // cp.async.bulk.tensor.5d.dst.src.tile.bulk_group [tensorMap, tensorCoords], [srcMem]; // 3e. PTX ISA 80, SM_90 // .dst = { .global } // .src = { .shared::cta } -template +template __device__ static inline void cp_async_bulk_tensor( cuda::ptx::space_global_t, cuda::ptx::space_shared_t, @@ -414,3 +417,5 @@ _CCCL_DEVICE static inline void cp_async_bulk_tensor( __cuda_ptx_cp_async_bulk_tensor_is_not_supported_before_SM_90__();)); } #endif // __cccl_ptx_isa >= 800 + +#endif // _CUDA_PTX_GENERATED_CP_ASYNC_BULK_TENSOR_H_ diff --git a/libcudacxx/include/cuda/__ptx/instructions/generated/cp_async_bulk_tensor_multicast.inc b/libcudacxx/include/cuda/__ptx/instructions/generated/cp_async_bulk_tensor_multicast.h similarity index 96% rename from libcudacxx/include/cuda/__ptx/instructions/generated/cp_async_bulk_tensor_multicast.inc rename to libcudacxx/include/cuda/__ptx/instructions/generated/cp_async_bulk_tensor_multicast.h index 020698a15b1..aa376141dc0 100644 --- a/libcudacxx/include/cuda/__ptx/instructions/generated/cp_async_bulk_tensor_multicast.inc +++ b/libcudacxx/include/cuda/__ptx/instructions/generated/cp_async_bulk_tensor_multicast.h @@ -1,9 +1,12 @@ +#ifndef _CUDA_PTX_GENERATED_CP_ASYNC_BULK_TENSOR_MULTICAST_H_ +#define _CUDA_PTX_GENERATED_CP_ASYNC_BULK_TENSOR_MULTICAST_H_ + /* // cp.async.bulk.tensor.1d.dst.src.tile.mbarrier::complete_tx::bytes.multicast::cluster [dstMem], [tensorMap, tensorCoords], [smem_bar], ctaMask; // 2a. PTX ISA 80, SM_90a // .dst = { .shared::cluster } // .src = { .global } -template +template __device__ static inline void cp_async_bulk_tensor( cuda::ptx::space_cluster_t, cuda::ptx::space_global_t, @@ -49,7 +52,7 @@ _CCCL_DEVICE static inline void cp_async_bulk_tensor( tensorCoords], [smem_bar], ctaMask; // 2b. PTX ISA 80, SM_90a // .dst = { .shared::cluster } // .src = { .global } -template +template __device__ static inline void cp_async_bulk_tensor( cuda::ptx::space_cluster_t, cuda::ptx::space_global_t, @@ -96,7 +99,7 @@ _CCCL_DEVICE static inline void cp_async_bulk_tensor( tensorCoords], [smem_bar], ctaMask; // 2c. PTX ISA 80, SM_90a // .dst = { .shared::cluster } // .src = { .global } -template +template __device__ static inline void cp_async_bulk_tensor( cuda::ptx::space_cluster_t, cuda::ptx::space_global_t, @@ -144,7 +147,7 @@ _CCCL_DEVICE static inline void cp_async_bulk_tensor( tensorCoords], [smem_bar], ctaMask; // 2d. PTX ISA 80, SM_90a // .dst = { .shared::cluster } // .src = { .global } -template +template __device__ static inline void cp_async_bulk_tensor( cuda::ptx::space_cluster_t, cuda::ptx::space_global_t, @@ -193,7 +196,7 @@ _CCCL_DEVICE static inline void cp_async_bulk_tensor( tensorCoords], [smem_bar], ctaMask; // 2e. 
PTX ISA 80, SM_90a // .dst = { .shared::cluster } // .src = { .global } -template +template __device__ static inline void cp_async_bulk_tensor( cuda::ptx::space_cluster_t, cuda::ptx::space_global_t, @@ -237,3 +240,5 @@ _CCCL_DEVICE static inline void cp_async_bulk_tensor( __cuda_ptx_cp_async_bulk_tensor_is_not_supported_before_SM_90a__();)); } #endif // __cccl_ptx_isa >= 800 + +#endif // _CUDA_PTX_GENERATED_CP_ASYNC_BULK_TENSOR_MULTICAST_H_ diff --git a/libcudacxx/include/cuda/__ptx/instructions/generated/cp_async_bulk_wait_group.inc b/libcudacxx/include/cuda/__ptx/instructions/generated/cp_async_bulk_wait_group.h similarity index 84% rename from libcudacxx/include/cuda/__ptx/instructions/generated/cp_async_bulk_wait_group.inc rename to libcudacxx/include/cuda/__ptx/instructions/generated/cp_async_bulk_wait_group.h index 1a715a0fac6..4ec68009575 100644 --- a/libcudacxx/include/cuda/__ptx/instructions/generated/cp_async_bulk_wait_group.inc +++ b/libcudacxx/include/cuda/__ptx/instructions/generated/cp_async_bulk_wait_group.h @@ -1,3 +1,6 @@ +#ifndef _CUDA_PTX_GENERATED_CP_ASYNC_BULK_WAIT_GROUP_H_ +#define _CUDA_PTX_GENERATED_CP_ASYNC_BULK_WAIT_GROUP_H_ + /* // cp.async.bulk.wait_group N; // PTX ISA 80, SM_90 template @@ -7,13 +10,13 @@ __device__ static inline void cp_async_bulk_wait_group( #if __cccl_ptx_isa >= 800 extern "C" _CCCL_DEVICE void __cuda_ptx_cp_async_bulk_wait_group_is_not_supported_before_SM_90__(); template -_CCCL_DEVICE static inline void cp_async_bulk_wait_group(n32_t<_N32> __n) +_CCCL_DEVICE static inline void cp_async_bulk_wait_group(n32_t<_N32> __N) { NV_IF_ELSE_TARGET( NV_PROVIDES_SM_90, (asm volatile("cp.async.bulk.wait_group %0;" : - : "n"(__n.value) + : "n"(__N.value) : "memory");), ( // Unsupported architectures will have a linker error with a semi-decent error message @@ -30,16 +33,18 @@ __device__ static inline void cp_async_bulk_wait_group_read( #if __cccl_ptx_isa >= 800 extern "C" _CCCL_DEVICE void __cuda_ptx_cp_async_bulk_wait_group_read_is_not_supported_before_SM_90__(); template -_CCCL_DEVICE static inline void cp_async_bulk_wait_group_read(n32_t<_N32> __n) +_CCCL_DEVICE static inline void cp_async_bulk_wait_group_read(n32_t<_N32> __N) { NV_IF_ELSE_TARGET( NV_PROVIDES_SM_90, (asm volatile("cp.async.bulk.wait_group.read %0;" : - : "n"(__n.value) + : "n"(__N.value) : "memory");), ( // Unsupported architectures will have a linker error with a semi-decent error message __cuda_ptx_cp_async_bulk_wait_group_read_is_not_supported_before_SM_90__();)); } #endif // __cccl_ptx_isa >= 800 + +#endif // _CUDA_PTX_GENERATED_CP_ASYNC_BULK_WAIT_GROUP_H_ diff --git a/libcudacxx/include/cuda/__ptx/instructions/generated/cp_reduce_async_bulk.inc b/libcudacxx/include/cuda/__ptx/instructions/generated/cp_reduce_async_bulk.h similarity index 98% rename from libcudacxx/include/cuda/__ptx/instructions/generated/cp_reduce_async_bulk.inc rename to libcudacxx/include/cuda/__ptx/instructions/generated/cp_reduce_async_bulk.h index 50059ff6c5b..94545f2eb47 100644 --- a/libcudacxx/include/cuda/__ptx/instructions/generated/cp_reduce_async_bulk.inc +++ b/libcudacxx/include/cuda/__ptx/instructions/generated/cp_reduce_async_bulk.h @@ -1,5 +1,6 @@ -// 9.7.8.24.7. 
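// Illustrative usage sketch (not part of the regenerated header): draining outstanding
// bulk copies with the wait_group wrappers shown above. The n32_t<0>{} spelling of the
// immediate operand mirrors the n32_t<_N32> parameter; the function name is made up.
#include <cuda/ptx>

__device__ void drain_bulk_copies()
{
  cuda::ptx::cp_async_bulk_commit_group();                    // close the current bulk-async group
  cuda::ptx::cp_async_bulk_wait_group(cuda::ptx::n32_t<0>{}); // wait until no groups remain pending
}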
Data Movement and Conversion Instructions: cp.reduce.async.bulk -// https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#data-movement-and-conversion-instructions-cp-reduce-async-bulk +#ifndef _CUDA_PTX_GENERATED_CP_REDUCE_ASYNC_BULK_H_ +#define _CUDA_PTX_GENERATED_CP_REDUCE_ASYNC_BULK_H_ + /* // cp.reduce.async.bulk.dst.src.mbarrier::complete_tx::bytes.op.type [dstMem], [srcMem], size, [rdsmem_bar]; // 1. PTX ISA 80, SM_90 @@ -154,7 +155,7 @@ ISA 80, SM_90 // .src = { .shared::cta } // .type = { .u32 } // .op = { .min } -template +template __device__ static inline void cp_reduce_async_bulk( cuda::ptx::space_cluster_t, cuda::ptx::space_shared_t, @@ -203,7 +204,7 @@ ISA 80, SM_90 // .src = { .shared::cta } // .type = { .u32 } // .op = { .max } -template +template __device__ static inline void cp_reduce_async_bulk( cuda::ptx::space_cluster_t, cuda::ptx::space_shared_t, @@ -252,7 +253,7 @@ ISA 80, SM_90 // .src = { .shared::cta } // .type = { .u32 } // .op = { .add } -template +template __device__ static inline void cp_reduce_async_bulk( cuda::ptx::space_cluster_t, cuda::ptx::space_shared_t, @@ -301,7 +302,7 @@ ISA 80, SM_90 // .src = { .shared::cta } // .type = { .u32 } // .op = { .inc } -template +template __device__ static inline void cp_reduce_async_bulk( cuda::ptx::space_cluster_t, cuda::ptx::space_shared_t, @@ -350,7 +351,7 @@ ISA 80, SM_90 // .src = { .shared::cta } // .type = { .u32 } // .op = { .dec } -template +template __device__ static inline void cp_reduce_async_bulk( cuda::ptx::space_cluster_t, cuda::ptx::space_shared_t, @@ -399,7 +400,7 @@ ISA 80, SM_90 // .src = { .shared::cta } // .type = { .s32 } // .op = { .min } -template +template __device__ static inline void cp_reduce_async_bulk( cuda::ptx::space_cluster_t, cuda::ptx::space_shared_t, @@ -448,7 +449,7 @@ ISA 80, SM_90 // .src = { .shared::cta } // .type = { .s32 } // .op = { .max } -template +template __device__ static inline void cp_reduce_async_bulk( cuda::ptx::space_cluster_t, cuda::ptx::space_shared_t, @@ -497,7 +498,7 @@ ISA 80, SM_90 // .src = { .shared::cta } // .type = { .s32 } // .op = { .add } -template +template __device__ static inline void cp_reduce_async_bulk( cuda::ptx::space_cluster_t, cuda::ptx::space_shared_t, @@ -546,7 +547,7 @@ ISA 80, SM_90 // .src = { .shared::cta } // .type = { .u64 } // .op = { .add } -template +template __device__ static inline void cp_reduce_async_bulk( cuda::ptx::space_cluster_t, cuda::ptx::space_shared_t, @@ -595,7 +596,7 @@ ISA 80, SM_90 // .src = { .shared::cta } // .type = { .s64 } // .op = { .add } -template +template __device__ static inline void cp_reduce_async_bulk( cuda::ptx::space_cluster_t, cuda::ptx::space_shared_t, @@ -670,7 +671,7 @@ _CCCL_DEVICE static inline void cp_reduce_async_bulk( : : "l"(__as_ptr_gmem(__dstMem)), "r"(__as_ptr_smem(__srcMem)), "r"(__size) : "memory"); - } _CCCL_ELSE_IF_CONSTEXPR (sizeof(_Type) == 8) { + } else _CCCL_IF_CONSTEXPR (sizeof(_Type) == 8) { asm("cp.reduce.async.bulk.global.shared::cta.bulk_group.and.b64 [%0], [%1], %2; // 3." : : "l"(__as_ptr_gmem(__dstMem)), "r"(__as_ptr_smem(__srcMem)), "r"(__size) @@ -715,7 +716,7 @@ _CCCL_DEVICE static inline void cp_reduce_async_bulk( : : "l"(__as_ptr_gmem(__dstMem)), "r"(__as_ptr_smem(__srcMem)), "r"(__size) : "memory"); - } _CCCL_ELSE_IF_CONSTEXPR (sizeof(_Type) == 8) { + } else _CCCL_IF_CONSTEXPR (sizeof(_Type) == 8) { asm("cp.reduce.async.bulk.global.shared::cta.bulk_group.or.b64 [%0], [%1], %2; // 3." 
: : "l"(__as_ptr_gmem(__dstMem)), "r"(__as_ptr_smem(__srcMem)), "r"(__size) @@ -760,7 +761,7 @@ _CCCL_DEVICE static inline void cp_reduce_async_bulk( : : "l"(__as_ptr_gmem(__dstMem)), "r"(__as_ptr_smem(__srcMem)), "r"(__size) : "memory"); - } _CCCL_ELSE_IF_CONSTEXPR (sizeof(_Type) == 8) { + } else _CCCL_IF_CONSTEXPR (sizeof(_Type) == 8) { asm("cp.reduce.async.bulk.global.shared::cta.bulk_group.xor.b64 [%0], [%1], %2; // 3." : : "l"(__as_ptr_gmem(__dstMem)), "r"(__as_ptr_smem(__srcMem)), "r"(__size) @@ -778,7 +779,7 @@ _CCCL_DEVICE static inline void cp_reduce_async_bulk( // .src = { .shared::cta } // .type = { .u32 } // .op = { .min } -template +template __device__ static inline void cp_reduce_async_bulk( cuda::ptx::space_global_t, cuda::ptx::space_shared_t, @@ -820,7 +821,7 @@ _CCCL_DEVICE static inline void cp_reduce_async_bulk( // .src = { .shared::cta } // .type = { .u32 } // .op = { .max } -template +template __device__ static inline void cp_reduce_async_bulk( cuda::ptx::space_global_t, cuda::ptx::space_shared_t, @@ -862,7 +863,7 @@ _CCCL_DEVICE static inline void cp_reduce_async_bulk( // .src = { .shared::cta } // .type = { .u32 } // .op = { .add } -template +template __device__ static inline void cp_reduce_async_bulk( cuda::ptx::space_global_t, cuda::ptx::space_shared_t, @@ -904,7 +905,7 @@ _CCCL_DEVICE static inline void cp_reduce_async_bulk( // .src = { .shared::cta } // .type = { .u32 } // .op = { .inc } -template +template __device__ static inline void cp_reduce_async_bulk( cuda::ptx::space_global_t, cuda::ptx::space_shared_t, @@ -946,7 +947,7 @@ _CCCL_DEVICE static inline void cp_reduce_async_bulk( // .src = { .shared::cta } // .type = { .u32 } // .op = { .dec } -template +template __device__ static inline void cp_reduce_async_bulk( cuda::ptx::space_global_t, cuda::ptx::space_shared_t, @@ -988,7 +989,7 @@ _CCCL_DEVICE static inline void cp_reduce_async_bulk( // .src = { .shared::cta } // .type = { .s32 } // .op = { .min } -template +template __device__ static inline void cp_reduce_async_bulk( cuda::ptx::space_global_t, cuda::ptx::space_shared_t, @@ -1030,7 +1031,7 @@ _CCCL_DEVICE static inline void cp_reduce_async_bulk( // .src = { .shared::cta } // .type = { .s32 } // .op = { .max } -template +template __device__ static inline void cp_reduce_async_bulk( cuda::ptx::space_global_t, cuda::ptx::space_shared_t, @@ -1072,7 +1073,7 @@ _CCCL_DEVICE static inline void cp_reduce_async_bulk( // .src = { .shared::cta } // .type = { .s32 } // .op = { .add } -template +template __device__ static inline void cp_reduce_async_bulk( cuda::ptx::space_global_t, cuda::ptx::space_shared_t, @@ -1114,7 +1115,7 @@ _CCCL_DEVICE static inline void cp_reduce_async_bulk( // .src = { .shared::cta } // .type = { .u64 } // .op = { .min } -template +template __device__ static inline void cp_reduce_async_bulk( cuda::ptx::space_global_t, cuda::ptx::space_shared_t, @@ -1156,7 +1157,7 @@ _CCCL_DEVICE static inline void cp_reduce_async_bulk( // .src = { .shared::cta } // .type = { .u64 } // .op = { .max } -template +template __device__ static inline void cp_reduce_async_bulk( cuda::ptx::space_global_t, cuda::ptx::space_shared_t, @@ -1198,7 +1199,7 @@ _CCCL_DEVICE static inline void cp_reduce_async_bulk( // .src = { .shared::cta } // .type = { .u64 } // .op = { .add } -template +template __device__ static inline void cp_reduce_async_bulk( cuda::ptx::space_global_t, cuda::ptx::space_shared_t, @@ -1240,7 +1241,7 @@ _CCCL_DEVICE static inline void cp_reduce_async_bulk( // .src = { .shared::cta } // .type = { .s64 } // 
.op = { .min } -template +template __device__ static inline void cp_reduce_async_bulk( cuda::ptx::space_global_t, cuda::ptx::space_shared_t, @@ -1282,7 +1283,7 @@ _CCCL_DEVICE static inline void cp_reduce_async_bulk( // .src = { .shared::cta } // .type = { .s64 } // .op = { .max } -template +template __device__ static inline void cp_reduce_async_bulk( cuda::ptx::space_global_t, cuda::ptx::space_shared_t, @@ -1324,7 +1325,7 @@ _CCCL_DEVICE static inline void cp_reduce_async_bulk( // .src = { .shared::cta } // .type = { .f32 } // .op = { .add } -template +template __device__ static inline void cp_reduce_async_bulk( cuda::ptx::space_global_t, cuda::ptx::space_shared_t, @@ -1361,7 +1362,7 @@ _CCCL_DEVICE static inline void cp_reduce_async_bulk( // .src = { .shared::cta } // .type = { .f64 } // .op = { .add } -template +template __device__ static inline void cp_reduce_async_bulk( cuda::ptx::space_global_t, cuda::ptx::space_shared_t, @@ -1398,7 +1399,7 @@ _CCCL_DEVICE static inline void cp_reduce_async_bulk( // .src = { .shared::cta } // .type = { .s64 } // .op = { .add } -template +template __device__ static inline void cp_reduce_async_bulk( cuda::ptx::space_global_t, cuda::ptx::space_shared_t, @@ -1433,3 +1434,5 @@ _CCCL_DEVICE static inline void cp_reduce_async_bulk( __cuda_ptx_cp_reduce_async_bulk_is_not_supported_before_SM_90__();)); } #endif // __cccl_ptx_isa >= 800 + +#endif // _CUDA_PTX_GENERATED_CP_REDUCE_ASYNC_BULK_H_ diff --git a/libcudacxx/include/cuda/__ptx/instructions/generated/cp_reduce_async_bulk_bf16.inc b/libcudacxx/include/cuda/__ptx/instructions/generated/cp_reduce_async_bulk_bf16.h similarity index 90% rename from libcudacxx/include/cuda/__ptx/instructions/generated/cp_reduce_async_bulk_bf16.inc rename to libcudacxx/include/cuda/__ptx/instructions/generated/cp_reduce_async_bulk_bf16.h index c657e8d1935..455cc294545 100644 --- a/libcudacxx/include/cuda/__ptx/instructions/generated/cp_reduce_async_bulk_bf16.inc +++ b/libcudacxx/include/cuda/__ptx/instructions/generated/cp_reduce_async_bulk_bf16.h @@ -1,11 +1,13 @@ -#ifdef _LIBCUDACXX_HAS_NVBF16 +#ifndef _CUDA_PTX_GENERATED_CP_REDUCE_ASYNC_BULK_BF16_H_ +#define _CUDA_PTX_GENERATED_CP_REDUCE_ASYNC_BULK_BF16_H_ + /* // cp.reduce.async.bulk.dst.src.bulk_group.op.type [dstMem], [srcMem], size; // 4. PTX ISA 80, SM_90 // .dst = { .global } // .src = { .shared::cta } // .type = { .bf16 } // .op = { .min } -template +template __device__ static inline void cp_reduce_async_bulk( cuda::ptx::space_global_t, cuda::ptx::space_shared_t, @@ -14,7 +16,7 @@ __device__ static inline void cp_reduce_async_bulk( const __nv_bfloat16* srcMem, uint32_t size); */ -# if __cccl_ptx_isa >= 800 +#if __cccl_ptx_isa >= 800 extern "C" _CCCL_DEVICE void __cuda_ptx_cp_reduce_async_bulk_is_not_supported_before_SM_90__(); template _CCCL_DEVICE static inline void cp_reduce_async_bulk( @@ -39,7 +41,7 @@ _CCCL_DEVICE static inline void cp_reduce_async_bulk( // Unsupported architectures will have a linker error with a semi-decent error message __cuda_ptx_cp_reduce_async_bulk_is_not_supported_before_SM_90__();)); } -# endif // __cccl_ptx_isa >= 800 +#endif // __cccl_ptx_isa >= 800 /* // cp.reduce.async.bulk.dst.src.bulk_group.op.type [dstMem], [srcMem], size; // 4. 
PTX ISA 80, SM_90 @@ -47,7 +49,7 @@ _CCCL_DEVICE static inline void cp_reduce_async_bulk( // .src = { .shared::cta } // .type = { .bf16 } // .op = { .max } -template +template __device__ static inline void cp_reduce_async_bulk( cuda::ptx::space_global_t, cuda::ptx::space_shared_t, @@ -56,7 +58,7 @@ __device__ static inline void cp_reduce_async_bulk( const __nv_bfloat16* srcMem, uint32_t size); */ -# if __cccl_ptx_isa >= 800 +#if __cccl_ptx_isa >= 800 extern "C" _CCCL_DEVICE void __cuda_ptx_cp_reduce_async_bulk_is_not_supported_before_SM_90__(); template _CCCL_DEVICE static inline void cp_reduce_async_bulk( @@ -81,7 +83,7 @@ _CCCL_DEVICE static inline void cp_reduce_async_bulk( // Unsupported architectures will have a linker error with a semi-decent error message __cuda_ptx_cp_reduce_async_bulk_is_not_supported_before_SM_90__();)); } -# endif // __cccl_ptx_isa >= 800 +#endif // __cccl_ptx_isa >= 800 /* // cp.reduce.async.bulk.dst.src.bulk_group.op.noftz.type [dstMem], [srcMem], size; // 5. PTX ISA 80, SM_90 @@ -89,7 +91,7 @@ _CCCL_DEVICE static inline void cp_reduce_async_bulk( // .src = { .shared::cta } // .type = { .bf16 } // .op = { .add } -template +template __device__ static inline void cp_reduce_async_bulk( cuda::ptx::space_global_t, cuda::ptx::space_shared_t, @@ -98,7 +100,7 @@ __device__ static inline void cp_reduce_async_bulk( const __nv_bfloat16* srcMem, uint32_t size); */ -# if __cccl_ptx_isa >= 800 +#if __cccl_ptx_isa >= 800 extern "C" _CCCL_DEVICE void __cuda_ptx_cp_reduce_async_bulk_is_not_supported_before_SM_90__(); template _CCCL_DEVICE static inline void cp_reduce_async_bulk( @@ -123,5 +125,6 @@ _CCCL_DEVICE static inline void cp_reduce_async_bulk( // Unsupported architectures will have a linker error with a semi-decent error message __cuda_ptx_cp_reduce_async_bulk_is_not_supported_before_SM_90__();)); } -# endif // __cccl_ptx_isa >= 800 -#endif // _LIBCUDACXX_HAS_NVBF16 +#endif // __cccl_ptx_isa >= 800 + +#endif // _CUDA_PTX_GENERATED_CP_REDUCE_ASYNC_BULK_BF16_H_ diff --git a/libcudacxx/include/cuda/__ptx/instructions/generated/cp_reduce_async_bulk_f16.inc b/libcudacxx/include/cuda/__ptx/instructions/generated/cp_reduce_async_bulk_f16.h similarity index 90% rename from libcudacxx/include/cuda/__ptx/instructions/generated/cp_reduce_async_bulk_f16.inc rename to libcudacxx/include/cuda/__ptx/instructions/generated/cp_reduce_async_bulk_f16.h index 3a52630db53..4ddc4589afa 100644 --- a/libcudacxx/include/cuda/__ptx/instructions/generated/cp_reduce_async_bulk_f16.inc +++ b/libcudacxx/include/cuda/__ptx/instructions/generated/cp_reduce_async_bulk_f16.h @@ -1,10 +1,13 @@ +#ifndef _CUDA_PTX_GENERATED_CP_REDUCE_ASYNC_BULK_F16_H_ +#define _CUDA_PTX_GENERATED_CP_REDUCE_ASYNC_BULK_F16_H_ + /* // cp.reduce.async.bulk.dst.src.bulk_group.op.type [dstMem], [srcMem], size; // 4. 
PTX ISA 80, SM_90 // .dst = { .global } // .src = { .shared::cta } // .type = { .f16 } // .op = { .min } -template +template __device__ static inline void cp_reduce_async_bulk( cuda::ptx::space_global_t, cuda::ptx::space_shared_t, @@ -13,7 +16,7 @@ __device__ static inline void cp_reduce_async_bulk( const __half* srcMem, uint32_t size); */ -# if __cccl_ptx_isa >= 800 +#if __cccl_ptx_isa >= 800 extern "C" _CCCL_DEVICE void __cuda_ptx_cp_reduce_async_bulk_is_not_supported_before_SM_90__(); template _CCCL_DEVICE static inline void cp_reduce_async_bulk( @@ -33,7 +36,7 @@ _CCCL_DEVICE static inline void cp_reduce_async_bulk( // Unsupported architectures will have a linker error with a semi-decent error message __cuda_ptx_cp_reduce_async_bulk_is_not_supported_before_SM_90__();)); } -# endif // __cccl_ptx_isa >= 800 +#endif // __cccl_ptx_isa >= 800 /* // cp.reduce.async.bulk.dst.src.bulk_group.op.type [dstMem], [srcMem], size; // 4. PTX ISA 80, SM_90 @@ -41,7 +44,7 @@ _CCCL_DEVICE static inline void cp_reduce_async_bulk( // .src = { .shared::cta } // .type = { .f16 } // .op = { .max } -template +template __device__ static inline void cp_reduce_async_bulk( cuda::ptx::space_global_t, cuda::ptx::space_shared_t, @@ -50,7 +53,7 @@ __device__ static inline void cp_reduce_async_bulk( const __half* srcMem, uint32_t size); */ -# if __cccl_ptx_isa >= 800 +#if __cccl_ptx_isa >= 800 extern "C" _CCCL_DEVICE void __cuda_ptx_cp_reduce_async_bulk_is_not_supported_before_SM_90__(); template _CCCL_DEVICE static inline void cp_reduce_async_bulk( @@ -70,7 +73,7 @@ _CCCL_DEVICE static inline void cp_reduce_async_bulk( // Unsupported architectures will have a linker error with a semi-decent error message __cuda_ptx_cp_reduce_async_bulk_is_not_supported_before_SM_90__();)); } -# endif // __cccl_ptx_isa >= 800 +#endif // __cccl_ptx_isa >= 800 /* // cp.reduce.async.bulk.dst.src.bulk_group.op.noftz.type [dstMem], [srcMem], size; // 5. 
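// Illustrative usage sketch (not part of the regenerated header): reducing a __half tile
// from shared into global memory with the .min variant shown above. The op_min tag and
// the dstMem parameter come from the full declaration, which this hunk elides, so treat
// the exact argument order as an assumption; the function and variable names are made up.
#include <cuda/ptx>
#include <cuda_fp16.h>
#include <cstdint>

__device__ void reduce_min_to_global(__half* dst_gmem, const __half* src_smem, std::uint32_t size_bytes)
{
  cuda::ptx::cp_reduce_async_bulk(
    cuda::ptx::space_global, cuda::ptx::space_shared, cuda::ptx::op_min, dst_gmem, src_smem, size_bytes);
  cuda::ptx::cp_async_bulk_commit_group(); // completion is observed through the bulk-group mechanism
}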
PTX ISA 80, SM_90 @@ -78,7 +81,7 @@ _CCCL_DEVICE static inline void cp_reduce_async_bulk( // .src = { .shared::cta } // .type = { .f16 } // .op = { .add } -template +template __device__ static inline void cp_reduce_async_bulk( cuda::ptx::space_global_t, cuda::ptx::space_shared_t, @@ -87,7 +90,7 @@ __device__ static inline void cp_reduce_async_bulk( const __half* srcMem, uint32_t size); */ -# if __cccl_ptx_isa >= 800 +#if __cccl_ptx_isa >= 800 extern "C" _CCCL_DEVICE void __cuda_ptx_cp_reduce_async_bulk_is_not_supported_before_SM_90__(); template _CCCL_DEVICE static inline void cp_reduce_async_bulk( @@ -107,4 +110,6 @@ _CCCL_DEVICE static inline void cp_reduce_async_bulk( // Unsupported architectures will have a linker error with a semi-decent error message __cuda_ptx_cp_reduce_async_bulk_is_not_supported_before_SM_90__();)); } -# endif // __cccl_ptx_isa >= 800 +#endif // __cccl_ptx_isa >= 800 + +#endif // _CUDA_PTX_GENERATED_CP_REDUCE_ASYNC_BULK_F16_H_ diff --git a/libcudacxx/include/cuda/__ptx/instructions/generated/cp_reduce_async_bulk_tensor.inc b/libcudacxx/include/cuda/__ptx/instructions/generated/cp_reduce_async_bulk_tensor.h similarity index 91% rename from libcudacxx/include/cuda/__ptx/instructions/generated/cp_reduce_async_bulk_tensor.inc rename to libcudacxx/include/cuda/__ptx/instructions/generated/cp_reduce_async_bulk_tensor.h index 32008f6af5b..f069f4bb755 100644 --- a/libcudacxx/include/cuda/__ptx/instructions/generated/cp_reduce_async_bulk_tensor.inc +++ b/libcudacxx/include/cuda/__ptx/instructions/generated/cp_reduce_async_bulk_tensor.h @@ -1,3 +1,6 @@ +#ifndef _CUDA_PTX_GENERATED_CP_REDUCE_ASYNC_BULK_TENSOR_H_ +#define _CUDA_PTX_GENERATED_CP_REDUCE_ASYNC_BULK_TENSOR_H_ + /* // cp.reduce.async.bulk.tensor.1d.dst.src.op.tile.bulk_group [tensorMap, tensorCoords], [srcMem]; // 1a. PTX ISA 80, SM_90 @@ -37,37 +40,37 @@ _CCCL_DEVICE static inline void cp_reduce_async_bulk_tensor( : : "l"(__tensorMap), "r"(__tensorCoords[0]), "r"(__as_ptr_smem(__srcMem)) : "memory"); - } _CCCL_ELSE_IF_CONSTEXPR (__op == op_min) { + } else _CCCL_IF_CONSTEXPR (__op == op_min) { asm("cp.reduce.async.bulk.tensor.1d.global.shared::cta.min.tile.bulk_group [%0, {%1}], [%2]; // 1a." : : "l"(__tensorMap), "r"(__tensorCoords[0]), "r"(__as_ptr_smem(__srcMem)) : "memory"); - } _CCCL_ELSE_IF_CONSTEXPR (__op == op_max) { + } else _CCCL_IF_CONSTEXPR (__op == op_max) { asm("cp.reduce.async.bulk.tensor.1d.global.shared::cta.max.tile.bulk_group [%0, {%1}], [%2]; // 1a." : : "l"(__tensorMap), "r"(__tensorCoords[0]), "r"(__as_ptr_smem(__srcMem)) : "memory"); - } _CCCL_ELSE_IF_CONSTEXPR (__op == op_inc) { + } else _CCCL_IF_CONSTEXPR (__op == op_inc) { asm("cp.reduce.async.bulk.tensor.1d.global.shared::cta.inc.tile.bulk_group [%0, {%1}], [%2]; // 1a." : : "l"(__tensorMap), "r"(__tensorCoords[0]), "r"(__as_ptr_smem(__srcMem)) : "memory"); - } _CCCL_ELSE_IF_CONSTEXPR (__op == op_dec) { + } else _CCCL_IF_CONSTEXPR (__op == op_dec) { asm("cp.reduce.async.bulk.tensor.1d.global.shared::cta.dec.tile.bulk_group [%0, {%1}], [%2]; // 1a." : : "l"(__tensorMap), "r"(__tensorCoords[0]), "r"(__as_ptr_smem(__srcMem)) : "memory"); - } _CCCL_ELSE_IF_CONSTEXPR (__op == op_and_op) { + } else _CCCL_IF_CONSTEXPR (__op == op_and_op) { asm("cp.reduce.async.bulk.tensor.1d.global.shared::cta.and.tile.bulk_group [%0, {%1}], [%2]; // 1a." 
: : "l"(__tensorMap), "r"(__tensorCoords[0]), "r"(__as_ptr_smem(__srcMem)) : "memory"); - } _CCCL_ELSE_IF_CONSTEXPR (__op == op_or_op) { + } else _CCCL_IF_CONSTEXPR (__op == op_or_op) { asm("cp.reduce.async.bulk.tensor.1d.global.shared::cta.or.tile.bulk_group [%0, {%1}], [%2]; // 1a." : : "l"(__tensorMap), "r"(__tensorCoords[0]), "r"(__as_ptr_smem(__srcMem)) : "memory"); - } _CCCL_ELSE_IF_CONSTEXPR (__op == op_xor_op) { + } else _CCCL_IF_CONSTEXPR (__op == op_xor_op) { asm("cp.reduce.async.bulk.tensor.1d.global.shared::cta.xor.tile.bulk_group [%0, {%1}], [%2]; // 1a." : : "l"(__tensorMap), "r"(__tensorCoords[0]), "r"(__as_ptr_smem(__srcMem)) @@ -118,37 +121,37 @@ _CCCL_DEVICE static inline void cp_reduce_async_bulk_tensor( : : "l"(__tensorMap), "r"(__tensorCoords[0]), "r"(__tensorCoords[1]), "r"(__as_ptr_smem(__srcMem)) : "memory"); - } _CCCL_ELSE_IF_CONSTEXPR (__op == op_min) { + } else _CCCL_IF_CONSTEXPR (__op == op_min) { asm("cp.reduce.async.bulk.tensor.2d.global.shared::cta.min.tile.bulk_group [%0, {%1, %2}], [%3]; // 1b." : : "l"(__tensorMap), "r"(__tensorCoords[0]), "r"(__tensorCoords[1]), "r"(__as_ptr_smem(__srcMem)) : "memory"); - } _CCCL_ELSE_IF_CONSTEXPR (__op == op_max) { + } else _CCCL_IF_CONSTEXPR (__op == op_max) { asm("cp.reduce.async.bulk.tensor.2d.global.shared::cta.max.tile.bulk_group [%0, {%1, %2}], [%3]; // 1b." : : "l"(__tensorMap), "r"(__tensorCoords[0]), "r"(__tensorCoords[1]), "r"(__as_ptr_smem(__srcMem)) : "memory"); - } _CCCL_ELSE_IF_CONSTEXPR (__op == op_inc) { + } else _CCCL_IF_CONSTEXPR (__op == op_inc) { asm("cp.reduce.async.bulk.tensor.2d.global.shared::cta.inc.tile.bulk_group [%0, {%1, %2}], [%3]; // 1b." : : "l"(__tensorMap), "r"(__tensorCoords[0]), "r"(__tensorCoords[1]), "r"(__as_ptr_smem(__srcMem)) : "memory"); - } _CCCL_ELSE_IF_CONSTEXPR (__op == op_dec) { + } else _CCCL_IF_CONSTEXPR (__op == op_dec) { asm("cp.reduce.async.bulk.tensor.2d.global.shared::cta.dec.tile.bulk_group [%0, {%1, %2}], [%3]; // 1b." : : "l"(__tensorMap), "r"(__tensorCoords[0]), "r"(__tensorCoords[1]), "r"(__as_ptr_smem(__srcMem)) : "memory"); - } _CCCL_ELSE_IF_CONSTEXPR (__op == op_and_op) { + } else _CCCL_IF_CONSTEXPR (__op == op_and_op) { asm("cp.reduce.async.bulk.tensor.2d.global.shared::cta.and.tile.bulk_group [%0, {%1, %2}], [%3]; // 1b." : : "l"(__tensorMap), "r"(__tensorCoords[0]), "r"(__tensorCoords[1]), "r"(__as_ptr_smem(__srcMem)) : "memory"); - } _CCCL_ELSE_IF_CONSTEXPR (__op == op_or_op) { + } else _CCCL_IF_CONSTEXPR (__op == op_or_op) { asm("cp.reduce.async.bulk.tensor.2d.global.shared::cta.or.tile.bulk_group [%0, {%1, %2}], [%3]; // 1b." : : "l"(__tensorMap), "r"(__tensorCoords[0]), "r"(__tensorCoords[1]), "r"(__as_ptr_smem(__srcMem)) : "memory"); - } _CCCL_ELSE_IF_CONSTEXPR (__op == op_xor_op) { + } else _CCCL_IF_CONSTEXPR (__op == op_xor_op) { asm("cp.reduce.async.bulk.tensor.2d.global.shared::cta.xor.tile.bulk_group [%0, {%1, %2}], [%3]; // 1b." : : "l"(__tensorMap), "r"(__tensorCoords[0]), "r"(__tensorCoords[1]), "r"(__as_ptr_smem(__srcMem)) @@ -203,7 +206,7 @@ _CCCL_DEVICE static inline void cp_reduce_async_bulk_tensor( "r"(__tensorCoords[2]), "r"(__as_ptr_smem(__srcMem)) : "memory"); - } _CCCL_ELSE_IF_CONSTEXPR (__op == op_min) { + } else _CCCL_IF_CONSTEXPR (__op == op_min) { asm("cp.reduce.async.bulk.tensor.3d.global.shared::cta.min.tile.bulk_group [%0, {%1, %2, %3}], [%4]; // 1c." 
: : "l"(__tensorMap), @@ -212,7 +215,7 @@ _CCCL_DEVICE static inline void cp_reduce_async_bulk_tensor( "r"(__tensorCoords[2]), "r"(__as_ptr_smem(__srcMem)) : "memory"); - } _CCCL_ELSE_IF_CONSTEXPR (__op == op_max) { + } else _CCCL_IF_CONSTEXPR (__op == op_max) { asm("cp.reduce.async.bulk.tensor.3d.global.shared::cta.max.tile.bulk_group [%0, {%1, %2, %3}], [%4]; // 1c." : : "l"(__tensorMap), @@ -221,7 +224,7 @@ _CCCL_DEVICE static inline void cp_reduce_async_bulk_tensor( "r"(__tensorCoords[2]), "r"(__as_ptr_smem(__srcMem)) : "memory"); - } _CCCL_ELSE_IF_CONSTEXPR (__op == op_inc) { + } else _CCCL_IF_CONSTEXPR (__op == op_inc) { asm("cp.reduce.async.bulk.tensor.3d.global.shared::cta.inc.tile.bulk_group [%0, {%1, %2, %3}], [%4]; // 1c." : : "l"(__tensorMap), @@ -230,7 +233,7 @@ _CCCL_DEVICE static inline void cp_reduce_async_bulk_tensor( "r"(__tensorCoords[2]), "r"(__as_ptr_smem(__srcMem)) : "memory"); - } _CCCL_ELSE_IF_CONSTEXPR (__op == op_dec) { + } else _CCCL_IF_CONSTEXPR (__op == op_dec) { asm("cp.reduce.async.bulk.tensor.3d.global.shared::cta.dec.tile.bulk_group [%0, {%1, %2, %3}], [%4]; // 1c." : : "l"(__tensorMap), @@ -239,7 +242,7 @@ _CCCL_DEVICE static inline void cp_reduce_async_bulk_tensor( "r"(__tensorCoords[2]), "r"(__as_ptr_smem(__srcMem)) : "memory"); - } _CCCL_ELSE_IF_CONSTEXPR (__op == op_and_op) { + } else _CCCL_IF_CONSTEXPR (__op == op_and_op) { asm("cp.reduce.async.bulk.tensor.3d.global.shared::cta.and.tile.bulk_group [%0, {%1, %2, %3}], [%4]; // 1c." : : "l"(__tensorMap), @@ -248,7 +251,7 @@ _CCCL_DEVICE static inline void cp_reduce_async_bulk_tensor( "r"(__tensorCoords[2]), "r"(__as_ptr_smem(__srcMem)) : "memory"); - } _CCCL_ELSE_IF_CONSTEXPR (__op == op_or_op) { + } else _CCCL_IF_CONSTEXPR (__op == op_or_op) { asm("cp.reduce.async.bulk.tensor.3d.global.shared::cta.or.tile.bulk_group [%0, {%1, %2, %3}], [%4]; // 1c." : : "l"(__tensorMap), @@ -257,7 +260,7 @@ _CCCL_DEVICE static inline void cp_reduce_async_bulk_tensor( "r"(__tensorCoords[2]), "r"(__as_ptr_smem(__srcMem)) : "memory"); - } _CCCL_ELSE_IF_CONSTEXPR (__op == op_xor_op) { + } else _CCCL_IF_CONSTEXPR (__op == op_xor_op) { asm("cp.reduce.async.bulk.tensor.3d.global.shared::cta.xor.tile.bulk_group [%0, {%1, %2, %3}], [%4]; // 1c." : : "l"(__tensorMap), @@ -317,7 +320,7 @@ _CCCL_DEVICE static inline void cp_reduce_async_bulk_tensor( "r"(__tensorCoords[3]), "r"(__as_ptr_smem(__srcMem)) : "memory"); - } _CCCL_ELSE_IF_CONSTEXPR (__op == op_min) { + } else _CCCL_IF_CONSTEXPR (__op == op_min) { asm("cp.reduce.async.bulk.tensor.4d.global.shared::cta.min.tile.bulk_group [%0, {%1, %2, %3, %4}], [%5]; // 1d." : : "l"(__tensorMap), @@ -327,7 +330,7 @@ _CCCL_DEVICE static inline void cp_reduce_async_bulk_tensor( "r"(__tensorCoords[3]), "r"(__as_ptr_smem(__srcMem)) : "memory"); - } _CCCL_ELSE_IF_CONSTEXPR (__op == op_max) { + } else _CCCL_IF_CONSTEXPR (__op == op_max) { asm("cp.reduce.async.bulk.tensor.4d.global.shared::cta.max.tile.bulk_group [%0, {%1, %2, %3, %4}], [%5]; // 1d." : : "l"(__tensorMap), @@ -337,7 +340,7 @@ _CCCL_DEVICE static inline void cp_reduce_async_bulk_tensor( "r"(__tensorCoords[3]), "r"(__as_ptr_smem(__srcMem)) : "memory"); - } _CCCL_ELSE_IF_CONSTEXPR (__op == op_inc) { + } else _CCCL_IF_CONSTEXPR (__op == op_inc) { asm("cp.reduce.async.bulk.tensor.4d.global.shared::cta.inc.tile.bulk_group [%0, {%1, %2, %3, %4}], [%5]; // 1d." 
: : "l"(__tensorMap), @@ -347,7 +350,7 @@ _CCCL_DEVICE static inline void cp_reduce_async_bulk_tensor( "r"(__tensorCoords[3]), "r"(__as_ptr_smem(__srcMem)) : "memory"); - } _CCCL_ELSE_IF_CONSTEXPR (__op == op_dec) { + } else _CCCL_IF_CONSTEXPR (__op == op_dec) { asm("cp.reduce.async.bulk.tensor.4d.global.shared::cta.dec.tile.bulk_group [%0, {%1, %2, %3, %4}], [%5]; // 1d." : : "l"(__tensorMap), @@ -357,7 +360,7 @@ _CCCL_DEVICE static inline void cp_reduce_async_bulk_tensor( "r"(__tensorCoords[3]), "r"(__as_ptr_smem(__srcMem)) : "memory"); - } _CCCL_ELSE_IF_CONSTEXPR (__op == op_and_op) { + } else _CCCL_IF_CONSTEXPR (__op == op_and_op) { asm("cp.reduce.async.bulk.tensor.4d.global.shared::cta.and.tile.bulk_group [%0, {%1, %2, %3, %4}], [%5]; // 1d." : : "l"(__tensorMap), @@ -367,7 +370,7 @@ _CCCL_DEVICE static inline void cp_reduce_async_bulk_tensor( "r"(__tensorCoords[3]), "r"(__as_ptr_smem(__srcMem)) : "memory"); - } _CCCL_ELSE_IF_CONSTEXPR (__op == op_or_op) { + } else _CCCL_IF_CONSTEXPR (__op == op_or_op) { asm("cp.reduce.async.bulk.tensor.4d.global.shared::cta.or.tile.bulk_group [%0, {%1, %2, %3, %4}], [%5]; // 1d." : : "l"(__tensorMap), @@ -377,7 +380,7 @@ _CCCL_DEVICE static inline void cp_reduce_async_bulk_tensor( "r"(__tensorCoords[3]), "r"(__as_ptr_smem(__srcMem)) : "memory"); - } _CCCL_ELSE_IF_CONSTEXPR (__op == op_xor_op) { + } else _CCCL_IF_CONSTEXPR (__op == op_xor_op) { asm("cp.reduce.async.bulk.tensor.4d.global.shared::cta.xor.tile.bulk_group [%0, {%1, %2, %3, %4}], [%5]; // 1d." : : "l"(__tensorMap), @@ -440,7 +443,7 @@ _CCCL_DEVICE static inline void cp_reduce_async_bulk_tensor( "r"(__tensorCoords[4]), "r"(__as_ptr_smem(__srcMem)) : "memory"); - } _CCCL_ELSE_IF_CONSTEXPR (__op == op_min) { + } else _CCCL_IF_CONSTEXPR (__op == op_min) { asm("cp.reduce.async.bulk.tensor.5d.global.shared::cta.min.tile.bulk_group [%0, {%1, %2, %3, %4, %5}], [%6]; " "// 1e." : @@ -452,7 +455,7 @@ _CCCL_DEVICE static inline void cp_reduce_async_bulk_tensor( "r"(__tensorCoords[4]), "r"(__as_ptr_smem(__srcMem)) : "memory"); - } _CCCL_ELSE_IF_CONSTEXPR (__op == op_max) { + } else _CCCL_IF_CONSTEXPR (__op == op_max) { asm("cp.reduce.async.bulk.tensor.5d.global.shared::cta.max.tile.bulk_group [%0, {%1, %2, %3, %4, %5}], [%6]; " "// 1e." : @@ -464,7 +467,7 @@ _CCCL_DEVICE static inline void cp_reduce_async_bulk_tensor( "r"(__tensorCoords[4]), "r"(__as_ptr_smem(__srcMem)) : "memory"); - } _CCCL_ELSE_IF_CONSTEXPR (__op == op_inc) { + } else _CCCL_IF_CONSTEXPR (__op == op_inc) { asm("cp.reduce.async.bulk.tensor.5d.global.shared::cta.inc.tile.bulk_group [%0, {%1, %2, %3, %4, %5}], [%6]; " "// 1e." : @@ -476,7 +479,7 @@ _CCCL_DEVICE static inline void cp_reduce_async_bulk_tensor( "r"(__tensorCoords[4]), "r"(__as_ptr_smem(__srcMem)) : "memory"); - } _CCCL_ELSE_IF_CONSTEXPR (__op == op_dec) { + } else _CCCL_IF_CONSTEXPR (__op == op_dec) { asm("cp.reduce.async.bulk.tensor.5d.global.shared::cta.dec.tile.bulk_group [%0, {%1, %2, %3, %4, %5}], [%6]; " "// 1e." : @@ -488,7 +491,7 @@ _CCCL_DEVICE static inline void cp_reduce_async_bulk_tensor( "r"(__tensorCoords[4]), "r"(__as_ptr_smem(__srcMem)) : "memory"); - } _CCCL_ELSE_IF_CONSTEXPR (__op == op_and_op) { + } else _CCCL_IF_CONSTEXPR (__op == op_and_op) { asm("cp.reduce.async.bulk.tensor.5d.global.shared::cta.and.tile.bulk_group [%0, {%1, %2, %3, %4, %5}], [%6]; " "// 1e." 
: @@ -500,7 +503,7 @@ _CCCL_DEVICE static inline void cp_reduce_async_bulk_tensor( "r"(__tensorCoords[4]), "r"(__as_ptr_smem(__srcMem)) : "memory"); - } _CCCL_ELSE_IF_CONSTEXPR (__op == op_or_op) { + } else _CCCL_IF_CONSTEXPR (__op == op_or_op) { asm("cp.reduce.async.bulk.tensor.5d.global.shared::cta.or.tile.bulk_group [%0, {%1, %2, %3, %4, %5}], [%6]; // " "1e." : @@ -512,7 +515,7 @@ _CCCL_DEVICE static inline void cp_reduce_async_bulk_tensor( "r"(__tensorCoords[4]), "r"(__as_ptr_smem(__srcMem)) : "memory"); - } _CCCL_ELSE_IF_CONSTEXPR (__op == op_xor_op) { + } else _CCCL_IF_CONSTEXPR (__op == op_xor_op) { asm("cp.reduce.async.bulk.tensor.5d.global.shared::cta.xor.tile.bulk_group [%0, {%1, %2, %3, %4, %5}], [%6]; " "// 1e." : @@ -530,3 +533,5 @@ _CCCL_DEVICE static inline void cp_reduce_async_bulk_tensor( __cuda_ptx_cp_reduce_async_bulk_tensor_is_not_supported_before_SM_90__();)); } #endif // __cccl_ptx_isa >= 800 + +#endif // _CUDA_PTX_GENERATED_CP_REDUCE_ASYNC_BULK_TENSOR_H_ diff --git a/libcudacxx/include/cuda/__ptx/instructions/generated/fence.inc b/libcudacxx/include/cuda/__ptx/instructions/generated/fence.h similarity index 83% rename from libcudacxx/include/cuda/__ptx/instructions/generated/fence.inc rename to libcudacxx/include/cuda/__ptx/instructions/generated/fence.h index f10ec07ebb5..795513a999d 100644 --- a/libcudacxx/include/cuda/__ptx/instructions/generated/fence.inc +++ b/libcudacxx/include/cuda/__ptx/instructions/generated/fence.h @@ -1,3 +1,6 @@ +#ifndef _CUDA_PTX_GENERATED_FENCE_H_ +#define _CUDA_PTX_GENERATED_FENCE_H_ + /* // fence{.sem}.scope; // 1. PTX ISA 60, SM_70 // .sem = { .sc, .acq_rel } @@ -19,15 +22,15 @@ _CCCL_DEVICE static inline void fence(sem_t<_Sem> __sem, scope_t<_Scope> __scope ( _CCCL_IF_CONSTEXPR (__sem == sem_sc && __scope == scope_cta) { asm volatile("fence.sc.cta; // 1." : : : "memory"); - } _CCCL_ELSE_IF_CONSTEXPR (__sem == sem_sc && __scope == scope_gpu) { + } else _CCCL_IF_CONSTEXPR (__sem == sem_sc && __scope == scope_gpu) { asm volatile("fence.sc.gpu; // 1." : : : "memory"); - } _CCCL_ELSE_IF_CONSTEXPR (__sem == sem_sc && __scope == scope_sys) { + } else _CCCL_IF_CONSTEXPR (__sem == sem_sc && __scope == scope_sys) { asm volatile("fence.sc.sys; // 1." : : : "memory"); - } _CCCL_ELSE_IF_CONSTEXPR (__sem == sem_acq_rel && __scope == scope_cta) { + } else _CCCL_IF_CONSTEXPR (__sem == sem_acq_rel && __scope == scope_cta) { asm volatile("fence.acq_rel.cta; // 1." : : : "memory"); - } _CCCL_ELSE_IF_CONSTEXPR (__sem == sem_acq_rel && __scope == scope_gpu) { + } else _CCCL_IF_CONSTEXPR (__sem == sem_acq_rel && __scope == scope_gpu) { asm volatile("fence.acq_rel.gpu; // 1." : : : "memory"); - } _CCCL_ELSE_IF_CONSTEXPR (__sem == sem_acq_rel && __scope == scope_sys) { + } else _CCCL_IF_CONSTEXPR (__sem == sem_acq_rel && __scope == scope_sys) { asm volatile("fence.acq_rel.sys; // 1." : : : "memory"); }), ( @@ -57,7 +60,7 @@ _CCCL_DEVICE static inline void fence(sem_t<_Sem> __sem, scope_cluster_t) ( _CCCL_IF_CONSTEXPR (__sem == sem_sc) { asm volatile("fence.sc.cluster; // 2." : : : "memory"); - } _CCCL_ELSE_IF_CONSTEXPR (__sem == sem_acq_rel) { + } else _CCCL_IF_CONSTEXPR (__sem == sem_acq_rel) { asm volatile("fence.acq_rel.cluster; // 2." 
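// Illustrative usage sketch (not part of the regenerated header): the sem/scope tag
// objects passed here correspond to the sem_sc/sem_acq_rel and scope constants tested
// in the branches above; the function name is made up.
#include <cuda/ptx>

__device__ void fence_examples()
{
  cuda::ptx::fence(cuda::ptx::sem_acq_rel, cuda::ptx::scope_gpu); // fence.acq_rel.gpu (form 1)
  cuda::ptx::fence(cuda::ptx::sem_sc, cuda::ptx::scope_cluster);  // fence.sc.cluster  (form 2)
}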
: : : "memory"); }), ( @@ -65,3 +68,5 @@ _CCCL_DEVICE static inline void fence(sem_t<_Sem> __sem, scope_cluster_t) __cuda_ptx_fence_is_not_supported_before_SM_90__();)); } #endif // __cccl_ptx_isa >= 780 + +#endif // _CUDA_PTX_GENERATED_FENCE_H_ diff --git a/libcudacxx/include/cuda/__ptx/instructions/generated/fence_mbarrier_init.inc b/libcudacxx/include/cuda/__ptx/instructions/generated/fence_mbarrier_init.h similarity index 84% rename from libcudacxx/include/cuda/__ptx/instructions/generated/fence_mbarrier_init.inc rename to libcudacxx/include/cuda/__ptx/instructions/generated/fence_mbarrier_init.h index 0d39c222598..a926e7ee730 100644 --- a/libcudacxx/include/cuda/__ptx/instructions/generated/fence_mbarrier_init.inc +++ b/libcudacxx/include/cuda/__ptx/instructions/generated/fence_mbarrier_init.h @@ -1,8 +1,11 @@ +#ifndef _CUDA_PTX_GENERATED_FENCE_MBARRIER_INIT_H_ +#define _CUDA_PTX_GENERATED_FENCE_MBARRIER_INIT_H_ + /* // fence.mbarrier_init.sem.scope; // 3. PTX ISA 80, SM_90 // .sem = { .release } // .scope = { .cluster } -template +template __device__ static inline void fence_mbarrier_init( cuda::ptx::sem_release_t, cuda::ptx::scope_cluster_t); @@ -25,3 +28,5 @@ _CCCL_DEVICE static inline void fence_mbarrier_init(sem_release_t, scope_cluster __cuda_ptx_fence_mbarrier_init_is_not_supported_before_SM_90__();)); } #endif // __cccl_ptx_isa >= 800 + +#endif // _CUDA_PTX_GENERATED_FENCE_MBARRIER_INIT_H_ diff --git a/libcudacxx/include/cuda/__ptx/instructions/generated/fence_proxy_alias.inc b/libcudacxx/include/cuda/__ptx/instructions/generated/fence_proxy_alias.h similarity index 79% rename from libcudacxx/include/cuda/__ptx/instructions/generated/fence_proxy_alias.inc rename to libcudacxx/include/cuda/__ptx/instructions/generated/fence_proxy_alias.h index 98260b851ca..afbf20297e6 100644 --- a/libcudacxx/include/cuda/__ptx/instructions/generated/fence_proxy_alias.inc +++ b/libcudacxx/include/cuda/__ptx/instructions/generated/fence_proxy_alias.h @@ -1,6 +1,9 @@ +#ifndef _CUDA_PTX_GENERATED_FENCE_PROXY_ALIAS_H_ +#define _CUDA_PTX_GENERATED_FENCE_PROXY_ALIAS_H_ + /* // fence.proxy.alias; // 4. PTX ISA 75, SM_70 -template +template __device__ static inline void fence_proxy_alias(); */ #if __cccl_ptx_isa >= 750 @@ -19,3 +22,5 @@ _CCCL_DEVICE static inline void fence_proxy_alias() __cuda_ptx_fence_proxy_alias_is_not_supported_before_SM_70__();)); } #endif // __cccl_ptx_isa >= 750 + +#endif // _CUDA_PTX_GENERATED_FENCE_PROXY_ALIAS_H_ diff --git a/libcudacxx/include/cuda/__ptx/instructions/generated/fence_proxy_async.inc b/libcudacxx/include/cuda/__ptx/instructions/generated/fence_proxy_async.h similarity index 85% rename from libcudacxx/include/cuda/__ptx/instructions/generated/fence_proxy_async.inc rename to libcudacxx/include/cuda/__ptx/instructions/generated/fence_proxy_async.h index f0a37baabdb..2319a629b7e 100644 --- a/libcudacxx/include/cuda/__ptx/instructions/generated/fence_proxy_async.inc +++ b/libcudacxx/include/cuda/__ptx/instructions/generated/fence_proxy_async.h @@ -1,6 +1,9 @@ +#ifndef _CUDA_PTX_GENERATED_FENCE_PROXY_ASYNC_H_ +#define _CUDA_PTX_GENERATED_FENCE_PROXY_ASYNC_H_ + /* // fence.proxy.async; // 5. PTX ISA 80, SM_90 -template +template __device__ static inline void fence_proxy_async(); */ #if __cccl_ptx_isa >= 800 @@ -38,9 +41,9 @@ _CCCL_DEVICE static inline void fence_proxy_async(space_t<_Space> __space) ( _CCCL_IF_CONSTEXPR (__space == space_global) { asm volatile("fence.proxy.async.global; // 6." 
: : : "memory"); - } _CCCL_ELSE_IF_CONSTEXPR (__space == space_cluster) { + } else _CCCL_IF_CONSTEXPR (__space == space_cluster) { asm volatile("fence.proxy.async.shared::cluster; // 6." : : : "memory"); - } _CCCL_ELSE_IF_CONSTEXPR (__space == space_shared) { + } else _CCCL_IF_CONSTEXPR (__space == space_shared) { asm volatile("fence.proxy.async.shared::cta; // 6." : : : "memory"); }), ( @@ -48,3 +51,5 @@ _CCCL_DEVICE static inline void fence_proxy_async(space_t<_Space> __space) __cuda_ptx_fence_proxy_async_is_not_supported_before_SM_90__();)); } #endif // __cccl_ptx_isa >= 800 + +#endif // _CUDA_PTX_GENERATED_FENCE_PROXY_ASYNC_H_ diff --git a/libcudacxx/include/cuda/__ptx/instructions/generated/fence_proxy_tensormap_generic.inc b/libcudacxx/include/cuda/__ptx/instructions/generated/fence_proxy_tensormap_generic.h similarity index 86% rename from libcudacxx/include/cuda/__ptx/instructions/generated/fence_proxy_tensormap_generic.inc rename to libcudacxx/include/cuda/__ptx/instructions/generated/fence_proxy_tensormap_generic.h index 3e5b2a265f4..6ec97a2a3c6 100644 --- a/libcudacxx/include/cuda/__ptx/instructions/generated/fence_proxy_tensormap_generic.inc +++ b/libcudacxx/include/cuda/__ptx/instructions/generated/fence_proxy_tensormap_generic.h @@ -1,3 +1,6 @@ +#ifndef _CUDA_PTX_GENERATED_FENCE_PROXY_TENSORMAP_GENERIC_H_ +#define _CUDA_PTX_GENERATED_FENCE_PROXY_TENSORMAP_GENERIC_H_ + /* // fence.proxy.tensormap::generic.release.scope; // 7. PTX ISA 83, SM_90 // .sem = { .release } @@ -19,11 +22,11 @@ _CCCL_DEVICE static inline void fence_proxy_tensormap_generic(sem_release_t, sco ( _CCCL_IF_CONSTEXPR (__scope == scope_cta) { asm volatile("fence.proxy.tensormap::generic.release.cta; // 7." : : : "memory"); - } _CCCL_ELSE_IF_CONSTEXPR (__scope == scope_cluster) { + } else _CCCL_IF_CONSTEXPR (__scope == scope_cluster) { asm volatile("fence.proxy.tensormap::generic.release.cluster; // 7." : : : "memory"); - } _CCCL_ELSE_IF_CONSTEXPR (__scope == scope_gpu) { + } else _CCCL_IF_CONSTEXPR (__scope == scope_gpu) { asm volatile("fence.proxy.tensormap::generic.release.gpu; // 7." : : : "memory"); - } _CCCL_ELSE_IF_CONSTEXPR (__scope == scope_sys) { + } else _CCCL_IF_CONSTEXPR (__scope == scope_sys) { asm volatile("fence.proxy.tensormap::generic.release.sys; // 7." : : : "memory"); }), ( @@ -59,17 +62,17 @@ fence_proxy_tensormap_generic(sem_acquire_t, scope_t<_Scope> __scope, const void : : "l"(__addr), "n"(__size.value) : "memory"); - } _CCCL_ELSE_IF_CONSTEXPR (__scope == scope_cluster) { + } else _CCCL_IF_CONSTEXPR (__scope == scope_cluster) { asm volatile("fence.proxy.tensormap::generic.acquire.cluster [%0], %1; // 8." : : "l"(__addr), "n"(__size.value) : "memory"); - } _CCCL_ELSE_IF_CONSTEXPR (__scope == scope_gpu) { + } else _CCCL_IF_CONSTEXPR (__scope == scope_gpu) { asm volatile("fence.proxy.tensormap::generic.acquire.gpu [%0], %1; // 8." : : "l"(__addr), "n"(__size.value) : "memory"); - } _CCCL_ELSE_IF_CONSTEXPR (__scope == scope_sys) { + } else _CCCL_IF_CONSTEXPR (__scope == scope_sys) { asm volatile("fence.proxy.tensormap::generic.acquire.sys [%0], %1; // 8." 
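// Illustrative usage sketch (not part of the regenerated header): pairing the release
// and acquire forms of fence_proxy_tensormap_generic around a tensormap update. The
// 128-byte size and the n32_t<128>{} spelling are assumptions based on the n32_t size
// parameter of the acquire overload; the function name is made up.
#include <cuda/ptx>

__device__ void sync_tensormap(const void* tensor_map)
{
  cuda::ptx::fence_proxy_tensormap_generic(cuda::ptx::sem_release, cuda::ptx::scope_gpu);
  cuda::ptx::fence_proxy_tensormap_generic(
    cuda::ptx::sem_acquire, cuda::ptx::scope_gpu, tensor_map, cuda::ptx::n32_t<128>{});
}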
: : "l"(__addr), "n"(__size.value) @@ -80,3 +83,5 @@ fence_proxy_tensormap_generic(sem_acquire_t, scope_t<_Scope> __scope, const void __cuda_ptx_fence_proxy_tensormap_generic_is_not_supported_before_SM_90__();)); } #endif // __cccl_ptx_isa >= 830 + +#endif // _CUDA_PTX_GENERATED_FENCE_PROXY_TENSORMAP_GENERIC_H_ diff --git a/libcudacxx/include/cuda/__ptx/instructions/generated/get_sreg.inc b/libcudacxx/include/cuda/__ptx/instructions/generated/get_sreg.h similarity index 95% rename from libcudacxx/include/cuda/__ptx/instructions/generated/get_sreg.inc rename to libcudacxx/include/cuda/__ptx/instructions/generated/get_sreg.h index dd3079915f7..010651e9d3b 100644 --- a/libcudacxx/include/cuda/__ptx/instructions/generated/get_sreg.inc +++ b/libcudacxx/include/cuda/__ptx/instructions/generated/get_sreg.h @@ -1,6 +1,9 @@ +#ifndef _CUDA_PTX_GENERATED_GET_SREG_H_ +#define _CUDA_PTX_GENERATED_GET_SREG_H_ + /* // mov.u32 sreg_value, %%tid.x; // PTX ISA 20 -template +template __device__ static inline uint32_t get_sreg_tid_x(); */ #if __cccl_ptx_isa >= 200 @@ -15,7 +18,7 @@ _CCCL_DEVICE static inline _CUDA_VSTD::uint32_t get_sreg_tid_x() /* // mov.u32 sreg_value, %%tid.y; // PTX ISA 20 -template +template __device__ static inline uint32_t get_sreg_tid_y(); */ #if __cccl_ptx_isa >= 200 @@ -30,7 +33,7 @@ _CCCL_DEVICE static inline _CUDA_VSTD::uint32_t get_sreg_tid_y() /* // mov.u32 sreg_value, %%tid.z; // PTX ISA 20 -template +template __device__ static inline uint32_t get_sreg_tid_z(); */ #if __cccl_ptx_isa >= 200 @@ -45,7 +48,7 @@ _CCCL_DEVICE static inline _CUDA_VSTD::uint32_t get_sreg_tid_z() /* // mov.u32 sreg_value, %%ntid.x; // PTX ISA 20 -template +template __device__ static inline uint32_t get_sreg_ntid_x(); */ #if __cccl_ptx_isa >= 200 @@ -60,7 +63,7 @@ _CCCL_DEVICE static inline _CUDA_VSTD::uint32_t get_sreg_ntid_x() /* // mov.u32 sreg_value, %%ntid.y; // PTX ISA 20 -template +template __device__ static inline uint32_t get_sreg_ntid_y(); */ #if __cccl_ptx_isa >= 200 @@ -75,7 +78,7 @@ _CCCL_DEVICE static inline _CUDA_VSTD::uint32_t get_sreg_ntid_y() /* // mov.u32 sreg_value, %%ntid.z; // PTX ISA 20 -template +template __device__ static inline uint32_t get_sreg_ntid_z(); */ #if __cccl_ptx_isa >= 200 @@ -90,7 +93,7 @@ _CCCL_DEVICE static inline _CUDA_VSTD::uint32_t get_sreg_ntid_z() /* // mov.u32 sreg_value, %%laneid; // PTX ISA 13 -template +template __device__ static inline uint32_t get_sreg_laneid(); */ #if __cccl_ptx_isa >= 130 @@ -105,7 +108,7 @@ _CCCL_DEVICE static inline _CUDA_VSTD::uint32_t get_sreg_laneid() /* // mov.u32 sreg_value, %%warpid; // PTX ISA 13 -template +template __device__ static inline uint32_t get_sreg_warpid(); */ #if __cccl_ptx_isa >= 130 @@ -120,7 +123,7 @@ _CCCL_DEVICE static inline _CUDA_VSTD::uint32_t get_sreg_warpid() /* // mov.u32 sreg_value, %%nwarpid; // PTX ISA 20, SM_35 -template +template __device__ static inline uint32_t get_sreg_nwarpid(); */ #if __cccl_ptx_isa >= 200 @@ -144,7 +147,7 @@ _CCCL_DEVICE static inline _CUDA_VSTD::uint32_t get_sreg_nwarpid() /* // mov.u32 sreg_value, %%ctaid.x; // PTX ISA 20 -template +template __device__ static inline uint32_t get_sreg_ctaid_x(); */ #if __cccl_ptx_isa >= 200 @@ -159,7 +162,7 @@ _CCCL_DEVICE static inline _CUDA_VSTD::uint32_t get_sreg_ctaid_x() /* // mov.u32 sreg_value, %%ctaid.y; // PTX ISA 20 -template +template __device__ static inline uint32_t get_sreg_ctaid_y(); */ #if __cccl_ptx_isa >= 200 @@ -174,7 +177,7 @@ _CCCL_DEVICE static inline _CUDA_VSTD::uint32_t get_sreg_ctaid_y() /* // mov.u32 sreg_value, 
%%ctaid.z; // PTX ISA 20 -template +template __device__ static inline uint32_t get_sreg_ctaid_z(); */ #if __cccl_ptx_isa >= 200 @@ -189,7 +192,7 @@ _CCCL_DEVICE static inline _CUDA_VSTD::uint32_t get_sreg_ctaid_z() /* // mov.u32 sreg_value, %%nctaid.x; // PTX ISA 20 -template +template __device__ static inline uint32_t get_sreg_nctaid_x(); */ #if __cccl_ptx_isa >= 200 @@ -204,7 +207,7 @@ _CCCL_DEVICE static inline _CUDA_VSTD::uint32_t get_sreg_nctaid_x() /* // mov.u32 sreg_value, %%nctaid.y; // PTX ISA 20 -template +template __device__ static inline uint32_t get_sreg_nctaid_y(); */ #if __cccl_ptx_isa >= 200 @@ -219,7 +222,7 @@ _CCCL_DEVICE static inline _CUDA_VSTD::uint32_t get_sreg_nctaid_y() /* // mov.u32 sreg_value, %%nctaid.z; // PTX ISA 20 -template +template __device__ static inline uint32_t get_sreg_nctaid_z(); */ #if __cccl_ptx_isa >= 200 @@ -234,7 +237,7 @@ _CCCL_DEVICE static inline _CUDA_VSTD::uint32_t get_sreg_nctaid_z() /* // mov.u32 sreg_value, %%smid; // PTX ISA 13 -template +template __device__ static inline uint32_t get_sreg_smid(); */ #if __cccl_ptx_isa >= 130 @@ -249,7 +252,7 @@ _CCCL_DEVICE static inline _CUDA_VSTD::uint32_t get_sreg_smid() /* // mov.u32 sreg_value, %%nsmid; // PTX ISA 20, SM_35 -template +template __device__ static inline uint32_t get_sreg_nsmid(); */ #if __cccl_ptx_isa >= 200 @@ -273,7 +276,7 @@ _CCCL_DEVICE static inline _CUDA_VSTD::uint32_t get_sreg_nsmid() /* // mov.u64 sreg_value, %%gridid; // PTX ISA 30 -template +template __device__ static inline uint64_t get_sreg_gridid(); */ #if __cccl_ptx_isa >= 300 @@ -288,7 +291,7 @@ _CCCL_DEVICE static inline _CUDA_VSTD::uint64_t get_sreg_gridid() /* // mov.pred sreg_value, %%is_explicit_cluster; // PTX ISA 78, SM_90 -template +template __device__ static inline bool get_sreg_is_explicit_cluster(); */ #if __cccl_ptx_isa >= 780 @@ -315,7 +318,7 @@ _CCCL_DEVICE static inline bool get_sreg_is_explicit_cluster() /* // mov.u32 sreg_value, %%clusterid.x; // PTX ISA 78, SM_90 -template +template __device__ static inline uint32_t get_sreg_clusterid_x(); */ #if __cccl_ptx_isa >= 780 @@ -339,7 +342,7 @@ _CCCL_DEVICE static inline _CUDA_VSTD::uint32_t get_sreg_clusterid_x() /* // mov.u32 sreg_value, %%clusterid.y; // PTX ISA 78, SM_90 -template +template __device__ static inline uint32_t get_sreg_clusterid_y(); */ #if __cccl_ptx_isa >= 780 @@ -363,7 +366,7 @@ _CCCL_DEVICE static inline _CUDA_VSTD::uint32_t get_sreg_clusterid_y() /* // mov.u32 sreg_value, %%clusterid.z; // PTX ISA 78, SM_90 -template +template __device__ static inline uint32_t get_sreg_clusterid_z(); */ #if __cccl_ptx_isa >= 780 @@ -387,7 +390,7 @@ _CCCL_DEVICE static inline _CUDA_VSTD::uint32_t get_sreg_clusterid_z() /* // mov.u32 sreg_value, %%nclusterid.x; // PTX ISA 78, SM_90 -template +template __device__ static inline uint32_t get_sreg_nclusterid_x(); */ #if __cccl_ptx_isa >= 780 @@ -411,7 +414,7 @@ _CCCL_DEVICE static inline _CUDA_VSTD::uint32_t get_sreg_nclusterid_x() /* // mov.u32 sreg_value, %%nclusterid.y; // PTX ISA 78, SM_90 -template +template __device__ static inline uint32_t get_sreg_nclusterid_y(); */ #if __cccl_ptx_isa >= 780 @@ -435,7 +438,7 @@ _CCCL_DEVICE static inline _CUDA_VSTD::uint32_t get_sreg_nclusterid_y() /* // mov.u32 sreg_value, %%nclusterid.z; // PTX ISA 78, SM_90 -template +template __device__ static inline uint32_t get_sreg_nclusterid_z(); */ #if __cccl_ptx_isa >= 780 @@ -459,7 +462,7 @@ _CCCL_DEVICE static inline _CUDA_VSTD::uint32_t get_sreg_nclusterid_z() /* // mov.u32 sreg_value, %%cluster_ctaid.x; // PTX ISA 78, 
SM_90 -template +template __device__ static inline uint32_t get_sreg_cluster_ctaid_x(); */ #if __cccl_ptx_isa >= 780 @@ -483,7 +486,7 @@ _CCCL_DEVICE static inline _CUDA_VSTD::uint32_t get_sreg_cluster_ctaid_x() /* // mov.u32 sreg_value, %%cluster_ctaid.y; // PTX ISA 78, SM_90 -template +template __device__ static inline uint32_t get_sreg_cluster_ctaid_y(); */ #if __cccl_ptx_isa >= 780 @@ -507,7 +510,7 @@ _CCCL_DEVICE static inline _CUDA_VSTD::uint32_t get_sreg_cluster_ctaid_y() /* // mov.u32 sreg_value, %%cluster_ctaid.z; // PTX ISA 78, SM_90 -template +template __device__ static inline uint32_t get_sreg_cluster_ctaid_z(); */ #if __cccl_ptx_isa >= 780 @@ -531,7 +534,7 @@ _CCCL_DEVICE static inline _CUDA_VSTD::uint32_t get_sreg_cluster_ctaid_z() /* // mov.u32 sreg_value, %%cluster_nctaid.x; // PTX ISA 78, SM_90 -template +template __device__ static inline uint32_t get_sreg_cluster_nctaid_x(); */ #if __cccl_ptx_isa >= 780 @@ -555,7 +558,7 @@ _CCCL_DEVICE static inline _CUDA_VSTD::uint32_t get_sreg_cluster_nctaid_x() /* // mov.u32 sreg_value, %%cluster_nctaid.y; // PTX ISA 78, SM_90 -template +template __device__ static inline uint32_t get_sreg_cluster_nctaid_y(); */ #if __cccl_ptx_isa >= 780 @@ -579,7 +582,7 @@ _CCCL_DEVICE static inline _CUDA_VSTD::uint32_t get_sreg_cluster_nctaid_y() /* // mov.u32 sreg_value, %%cluster_nctaid.z; // PTX ISA 78, SM_90 -template +template __device__ static inline uint32_t get_sreg_cluster_nctaid_z(); */ #if __cccl_ptx_isa >= 780 @@ -603,7 +606,7 @@ _CCCL_DEVICE static inline _CUDA_VSTD::uint32_t get_sreg_cluster_nctaid_z() /* // mov.u32 sreg_value, %%cluster_ctarank; // PTX ISA 78, SM_90 -template +template __device__ static inline uint32_t get_sreg_cluster_ctarank(); */ #if __cccl_ptx_isa >= 780 @@ -627,7 +630,7 @@ _CCCL_DEVICE static inline _CUDA_VSTD::uint32_t get_sreg_cluster_ctarank() /* // mov.u32 sreg_value, %%cluster_nctarank; // PTX ISA 78, SM_90 -template +template __device__ static inline uint32_t get_sreg_cluster_nctarank(); */ #if __cccl_ptx_isa >= 780 @@ -651,7 +654,7 @@ _CCCL_DEVICE static inline _CUDA_VSTD::uint32_t get_sreg_cluster_nctarank() /* // mov.u32 sreg_value, %%lanemask_eq; // PTX ISA 20, SM_35 -template +template __device__ static inline uint32_t get_sreg_lanemask_eq(); */ #if __cccl_ptx_isa >= 200 @@ -675,7 +678,7 @@ _CCCL_DEVICE static inline _CUDA_VSTD::uint32_t get_sreg_lanemask_eq() /* // mov.u32 sreg_value, %%lanemask_le; // PTX ISA 20, SM_35 -template +template __device__ static inline uint32_t get_sreg_lanemask_le(); */ #if __cccl_ptx_isa >= 200 @@ -699,7 +702,7 @@ _CCCL_DEVICE static inline _CUDA_VSTD::uint32_t get_sreg_lanemask_le() /* // mov.u32 sreg_value, %%lanemask_lt; // PTX ISA 20, SM_35 -template +template __device__ static inline uint32_t get_sreg_lanemask_lt(); */ #if __cccl_ptx_isa >= 200 @@ -723,7 +726,7 @@ _CCCL_DEVICE static inline _CUDA_VSTD::uint32_t get_sreg_lanemask_lt() /* // mov.u32 sreg_value, %%lanemask_ge; // PTX ISA 20, SM_35 -template +template __device__ static inline uint32_t get_sreg_lanemask_ge(); */ #if __cccl_ptx_isa >= 200 @@ -747,7 +750,7 @@ _CCCL_DEVICE static inline _CUDA_VSTD::uint32_t get_sreg_lanemask_ge() /* // mov.u32 sreg_value, %%lanemask_gt; // PTX ISA 20, SM_35 -template +template __device__ static inline uint32_t get_sreg_lanemask_gt(); */ #if __cccl_ptx_isa >= 200 @@ -771,7 +774,7 @@ _CCCL_DEVICE static inline _CUDA_VSTD::uint32_t get_sreg_lanemask_gt() /* // mov.u32 sreg_value, %%clock; // PTX ISA 10 -template +template __device__ static inline uint32_t get_sreg_clock(); */ 
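// Illustrative usage sketch (not part of the regenerated header): the special-register
// wrappers declared above are plain value-returning calls; the helper name is made up.
#include <cuda/ptx>
#include <cstdint>

__device__ std::uint64_t lane_and_sm_id()
{
  const std::uint32_t lane = cuda::ptx::get_sreg_laneid(); // %%laneid
  const std::uint32_t sm   = cuda::ptx::get_sreg_smid();   // %%smid
  return (static_cast<std::uint64_t>(sm) << 32) | lane;
}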
#if __cccl_ptx_isa >= 100 @@ -786,7 +789,7 @@ _CCCL_DEVICE static inline _CUDA_VSTD::uint32_t get_sreg_clock() /* // mov.u32 sreg_value, %%clock_hi; // PTX ISA 50, SM_35 -template +template __device__ static inline uint32_t get_sreg_clock_hi(); */ #if __cccl_ptx_isa >= 500 @@ -810,7 +813,7 @@ _CCCL_DEVICE static inline _CUDA_VSTD::uint32_t get_sreg_clock_hi() /* // mov.u64 sreg_value, %%clock64; // PTX ISA 20, SM_35 -template +template __device__ static inline uint64_t get_sreg_clock64(); */ #if __cccl_ptx_isa >= 200 @@ -834,7 +837,7 @@ _CCCL_DEVICE static inline _CUDA_VSTD::uint64_t get_sreg_clock64() /* // mov.u64 sreg_value, %%globaltimer; // PTX ISA 31, SM_35 -template +template __device__ static inline uint64_t get_sreg_globaltimer(); */ #if __cccl_ptx_isa >= 310 @@ -858,7 +861,7 @@ _CCCL_DEVICE static inline _CUDA_VSTD::uint64_t get_sreg_globaltimer() /* // mov.u32 sreg_value, %%globaltimer_lo; // PTX ISA 31, SM_35 -template +template __device__ static inline uint32_t get_sreg_globaltimer_lo(); */ #if __cccl_ptx_isa >= 310 @@ -882,7 +885,7 @@ _CCCL_DEVICE static inline _CUDA_VSTD::uint32_t get_sreg_globaltimer_lo() /* // mov.u32 sreg_value, %%globaltimer_hi; // PTX ISA 31, SM_35 -template +template __device__ static inline uint32_t get_sreg_globaltimer_hi(); */ #if __cccl_ptx_isa >= 310 @@ -906,7 +909,7 @@ _CCCL_DEVICE static inline _CUDA_VSTD::uint32_t get_sreg_globaltimer_hi() /* // mov.u32 sreg_value, %%total_smem_size; // PTX ISA 41, SM_35 -template +template __device__ static inline uint32_t get_sreg_total_smem_size(); */ #if __cccl_ptx_isa >= 410 @@ -930,7 +933,7 @@ _CCCL_DEVICE static inline _CUDA_VSTD::uint32_t get_sreg_total_smem_size() /* // mov.u32 sreg_value, %%aggr_smem_size; // PTX ISA 81, SM_90 -template +template __device__ static inline uint32_t get_sreg_aggr_smem_size(); */ #if __cccl_ptx_isa >= 810 @@ -954,7 +957,7 @@ _CCCL_DEVICE static inline _CUDA_VSTD::uint32_t get_sreg_aggr_smem_size() /* // mov.u32 sreg_value, %%dynamic_smem_size; // PTX ISA 41, SM_35 -template +template __device__ static inline uint32_t get_sreg_dynamic_smem_size(); */ #if __cccl_ptx_isa >= 410 @@ -978,7 +981,7 @@ _CCCL_DEVICE static inline _CUDA_VSTD::uint32_t get_sreg_dynamic_smem_size() /* // mov.u64 sreg_value, %%current_graph_exec; // PTX ISA 80, SM_50 -template +template __device__ static inline uint64_t get_sreg_current_graph_exec(); */ #if __cccl_ptx_isa >= 800 @@ -999,3 +1002,5 @@ _CCCL_DEVICE static inline _CUDA_VSTD::uint64_t get_sreg_current_graph_exec() __cuda_ptx_get_sreg_current_graph_exec_is_not_supported_before_SM_50__(); return 0;)); } #endif // __cccl_ptx_isa >= 800 + +#endif // _CUDA_PTX_GENERATED_GET_SREG_H_ diff --git a/libcudacxx/include/cuda/__ptx/instructions/generated/getctarank.inc b/libcudacxx/include/cuda/__ptx/instructions/generated/getctarank.h similarity index 85% rename from libcudacxx/include/cuda/__ptx/instructions/generated/getctarank.inc rename to libcudacxx/include/cuda/__ptx/instructions/generated/getctarank.h index 51bd351be87..ef0a5547512 100644 --- a/libcudacxx/include/cuda/__ptx/instructions/generated/getctarank.inc +++ b/libcudacxx/include/cuda/__ptx/instructions/generated/getctarank.h @@ -1,7 +1,10 @@ +#ifndef _CUDA_PTX_GENERATED_GETCTARANK_H_ +#define _CUDA_PTX_GENERATED_GETCTARANK_H_ + /* // getctarank{.space}.u32 dest, addr; // PTX ISA 78, SM_90 // .space = { .shared::cluster } -template +template __device__ static inline uint32_t getctarank( cuda::ptx::space_cluster_t, const void* addr); @@ -25,3 +28,5 @@ _CCCL_DEVICE static inline 
_CUDA_VSTD::uint32_t getctarank(space_cluster_t, cons __cuda_ptx_getctarank_is_not_supported_before_SM_90__(); return 0;)); } #endif // __cccl_ptx_isa >= 780 + +#endif // _CUDA_PTX_GENERATED_GETCTARANK_H_ diff --git a/libcudacxx/include/cuda/__ptx/instructions/generated/mbarrier_arrive.inc b/libcudacxx/include/cuda/__ptx/instructions/generated/mbarrier_arrive.h similarity index 95% rename from libcudacxx/include/cuda/__ptx/instructions/generated/mbarrier_arrive.inc rename to libcudacxx/include/cuda/__ptx/instructions/generated/mbarrier_arrive.h index f3e2b860d50..21e9a522348 100644 --- a/libcudacxx/include/cuda/__ptx/instructions/generated/mbarrier_arrive.inc +++ b/libcudacxx/include/cuda/__ptx/instructions/generated/mbarrier_arrive.h @@ -1,6 +1,9 @@ +#ifndef _CUDA_PTX_GENERATED_MBARRIER_ARRIVE_H_ +#define _CUDA_PTX_GENERATED_MBARRIER_ARRIVE_H_ + /* // mbarrier.arrive.shared.b64 state, [addr]; // 1. PTX ISA 70, SM_80 -template +template __device__ static inline uint64_t mbarrier_arrive( uint64_t* addr); */ @@ -25,7 +28,7 @@ _CCCL_DEVICE static inline _CUDA_VSTD::uint64_t mbarrier_arrive(_CUDA_VSTD::uint /* // mbarrier.arrive.shared::cta.b64 state, [addr], count; // 2. PTX ISA 78, SM_90 -template +template __device__ static inline uint64_t mbarrier_arrive( uint64_t* addr, const uint32_t& count); @@ -79,7 +82,7 @@ mbarrier_arrive(sem_release_t, scope_t<_Scope> __scope, space_shared_t, _CUDA_VS : "=l"(__state) : "r"(__as_ptr_smem(__addr)) : "memory"); - } _CCCL_ELSE_IF_CONSTEXPR (__scope == scope_cluster) { + } else _CCCL_IF_CONSTEXPR (__scope == scope_cluster) { asm("mbarrier.arrive.release.cluster.shared::cta.b64 %0, [%1]; // 3a. " : "=l"(__state) : "r"(__as_ptr_smem(__addr)) @@ -125,7 +128,7 @@ _CCCL_DEVICE static inline _CUDA_VSTD::uint64_t mbarrier_arrive( : "=l"(__state) : "r"(__as_ptr_smem(__addr)), "r"(__count) : "memory"); - } _CCCL_ELSE_IF_CONSTEXPR (__scope == scope_cluster) { + } else _CCCL_IF_CONSTEXPR (__scope == scope_cluster) { asm("mbarrier.arrive.release.cluster.shared::cta.b64 %0, [%1], %2; // 3b. 
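// Illustrative usage sketch (not part of the regenerated header): the basic arrive on a
// shared-memory mbarrier declared above. It assumes the barrier was set up with
// mbarrier_init (regenerated further down in this patch) and that the returned state
// token is later passed to one of the wait variants; the function name is made up.
#include <cuda/ptx>
#include <cstdint>

__device__ std::uint64_t arrive_on(std::uint64_t* bar_smem)
{
  return cuda::ptx::mbarrier_arrive(bar_smem); // mbarrier.arrive.shared.b64; an (addr, count) overload also exists
}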
" : "=l"(__state) : "r"(__as_ptr_smem(__addr)), "r"(__count) @@ -142,7 +145,7 @@ _CCCL_DEVICE static inline _CUDA_VSTD::uint64_t mbarrier_arrive( // .sem = { .release } // .scope = { .cluster } // .space = { .shared::cluster } -template +template __device__ static inline void mbarrier_arrive( cuda::ptx::sem_release_t, cuda::ptx::scope_cluster_t, @@ -175,7 +178,7 @@ mbarrier_arrive(sem_release_t, scope_cluster_t, space_cluster_t, _CUDA_VSTD::uin // .sem = { .release } // .scope = { .cluster } // .space = { .shared::cluster } -template +template __device__ static inline void mbarrier_arrive( cuda::ptx::sem_release_t, cuda::ptx::scope_cluster_t, @@ -203,3 +206,5 @@ _CCCL_DEVICE static inline void mbarrier_arrive( __cuda_ptx_mbarrier_arrive_is_not_supported_before_SM_90__();)); } #endif // __cccl_ptx_isa >= 800 + +#endif // _CUDA_PTX_GENERATED_MBARRIER_ARRIVE_H_ diff --git a/libcudacxx/include/cuda/__ptx/instructions/generated/mbarrier_arrive_expect_tx.inc b/libcudacxx/include/cuda/__ptx/instructions/generated/mbarrier_arrive_expect_tx.h similarity index 92% rename from libcudacxx/include/cuda/__ptx/instructions/generated/mbarrier_arrive_expect_tx.inc rename to libcudacxx/include/cuda/__ptx/instructions/generated/mbarrier_arrive_expect_tx.h index efb749957b1..cc33cc2fae1 100644 --- a/libcudacxx/include/cuda/__ptx/instructions/generated/mbarrier_arrive_expect_tx.inc +++ b/libcudacxx/include/cuda/__ptx/instructions/generated/mbarrier_arrive_expect_tx.h @@ -1,3 +1,6 @@ +#ifndef _CUDA_PTX_GENERATED_MBARRIER_ARRIVE_EXPECT_TX_H_ +#define _CUDA_PTX_GENERATED_MBARRIER_ARRIVE_EXPECT_TX_H_ + /* // mbarrier.arrive.expect_tx{.sem}{.scope}{.space}.b64 state, [addr], tx_count; // 8. PTX ISA 80, SM_90 // .sem = { .release } @@ -32,7 +35,7 @@ _CCCL_DEVICE static inline _CUDA_VSTD::uint64_t mbarrier_arrive_expect_tx( : "=l"(__state) : "r"(__as_ptr_smem(__addr)), "r"(__tx_count) : "memory"); - } _CCCL_ELSE_IF_CONSTEXPR (__scope == scope_cluster) { + } else _CCCL_IF_CONSTEXPR (__scope == scope_cluster) { asm("mbarrier.arrive.expect_tx.release.cluster.shared::cta.b64 %0, [%1], %2; // 8. " : "=l"(__state) : "r"(__as_ptr_smem(__addr)), "r"(__tx_count) @@ -49,7 +52,7 @@ _CCCL_DEVICE static inline _CUDA_VSTD::uint64_t mbarrier_arrive_expect_tx( // .sem = { .release } // .scope = { .cluster } // .space = { .shared::cluster } -template +template __device__ static inline void mbarrier_arrive_expect_tx( cuda::ptx::sem_release_t, cuda::ptx::scope_cluster_t, @@ -77,3 +80,5 @@ _CCCL_DEVICE static inline void mbarrier_arrive_expect_tx( __cuda_ptx_mbarrier_arrive_expect_tx_is_not_supported_before_SM_90__();)); } #endif // __cccl_ptx_isa >= 800 + +#endif // _CUDA_PTX_GENERATED_MBARRIER_ARRIVE_EXPECT_TX_H_ diff --git a/libcudacxx/include/cuda/__ptx/instructions/generated/mbarrier_arrive_no_complete.inc b/libcudacxx/include/cuda/__ptx/instructions/generated/mbarrier_arrive_no_complete.h similarity index 83% rename from libcudacxx/include/cuda/__ptx/instructions/generated/mbarrier_arrive_no_complete.inc rename to libcudacxx/include/cuda/__ptx/instructions/generated/mbarrier_arrive_no_complete.h index 879bedebdc9..86ccd825f69 100644 --- a/libcudacxx/include/cuda/__ptx/instructions/generated/mbarrier_arrive_no_complete.inc +++ b/libcudacxx/include/cuda/__ptx/instructions/generated/mbarrier_arrive_no_complete.h @@ -1,6 +1,9 @@ +#ifndef _CUDA_PTX_GENERATED_MBARRIER_ARRIVE_NO_COMPLETE_H_ +#define _CUDA_PTX_GENERATED_MBARRIER_ARRIVE_NO_COMPLETE_H_ + /* // mbarrier.arrive.noComplete.shared.b64 state, [addr], count; // 5. 
PTX ISA 70, SM_80 -template +template __device__ static inline uint64_t mbarrier_arrive_no_complete( uint64_t* addr, const uint32_t& count); @@ -24,3 +27,5 @@ mbarrier_arrive_no_complete(_CUDA_VSTD::uint64_t* __addr, const _CUDA_VSTD::uint __cuda_ptx_mbarrier_arrive_no_complete_is_not_supported_before_SM_80__(); return 0;)); } #endif // __cccl_ptx_isa >= 700 + +#endif // _CUDA_PTX_GENERATED_MBARRIER_ARRIVE_NO_COMPLETE_H_ diff --git a/libcudacxx/include/cuda/__ptx/instructions/generated/mbarrier_init.inc b/libcudacxx/include/cuda/__ptx/instructions/generated/mbarrier_init.h similarity index 82% rename from libcudacxx/include/cuda/__ptx/instructions/generated/mbarrier_init.inc rename to libcudacxx/include/cuda/__ptx/instructions/generated/mbarrier_init.h index 3afeeacfccf..7d44538e4d8 100644 --- a/libcudacxx/include/cuda/__ptx/instructions/generated/mbarrier_init.inc +++ b/libcudacxx/include/cuda/__ptx/instructions/generated/mbarrier_init.h @@ -1,6 +1,9 @@ +#ifndef _CUDA_PTX_GENERATED_MBARRIER_INIT_H_ +#define _CUDA_PTX_GENERATED_MBARRIER_INIT_H_ + /* // mbarrier.init.shared.b64 [addr], count; // PTX ISA 70, SM_80 -template +template __device__ static inline void mbarrier_init( uint64_t* addr, const uint32_t& count); @@ -21,3 +24,5 @@ _CCCL_DEVICE static inline void mbarrier_init(_CUDA_VSTD::uint64_t* __addr, cons __cuda_ptx_mbarrier_init_is_not_supported_before_SM_80__();)); } #endif // __cccl_ptx_isa >= 700 + +#endif // _CUDA_PTX_GENERATED_MBARRIER_INIT_H_ diff --git a/libcudacxx/include/cuda/__ptx/instructions/generated/mbarrier_test_wait.inc b/libcudacxx/include/cuda/__ptx/instructions/generated/mbarrier_test_wait.h similarity index 92% rename from libcudacxx/include/cuda/__ptx/instructions/generated/mbarrier_test_wait.inc rename to libcudacxx/include/cuda/__ptx/instructions/generated/mbarrier_test_wait.h index 301c0364af4..0818671bda4 100644 --- a/libcudacxx/include/cuda/__ptx/instructions/generated/mbarrier_test_wait.inc +++ b/libcudacxx/include/cuda/__ptx/instructions/generated/mbarrier_test_wait.h @@ -1,6 +1,9 @@ +#ifndef _CUDA_PTX_GENERATED_MBARRIER_TEST_WAIT_H_ +#define _CUDA_PTX_GENERATED_MBARRIER_TEST_WAIT_H_ + /* // mbarrier.test_wait.shared.b64 waitComplete, [addr], state; // 1. PTX -ISA 70, SM_80 template +ISA 70, SM_80 template __device__ static inline bool mbarrier_test_wait( uint64_t* addr, const uint64_t& state); @@ -58,7 +61,7 @@ _CCCL_DEVICE static inline bool mbarrier_test_wait( : "=r"(__waitComplete) : "r"(__as_ptr_smem(__addr)), "l"(__state) : "memory"); - } _CCCL_ELSE_IF_CONSTEXPR (__scope == scope_cluster) { + } else _CCCL_IF_CONSTEXPR (__scope == scope_cluster) { asm("{\n\t .reg .pred P_OUT; \n\t" "mbarrier.test_wait.acquire.cluster.shared::cta.b64 P_OUT, [%1], %2; // 2. 
" "\n\t" @@ -73,3 +76,5 @@ _CCCL_DEVICE static inline bool mbarrier_test_wait( __cuda_ptx_mbarrier_test_wait_is_not_supported_before_SM_90__(); return false;)); } #endif // __cccl_ptx_isa >= 800 + +#endif // _CUDA_PTX_GENERATED_MBARRIER_TEST_WAIT_H_ diff --git a/libcudacxx/include/cuda/__ptx/instructions/generated/mbarrier_test_wait_parity.inc b/libcudacxx/include/cuda/__ptx/instructions/generated/mbarrier_test_wait_parity.h similarity index 91% rename from libcudacxx/include/cuda/__ptx/instructions/generated/mbarrier_test_wait_parity.inc rename to libcudacxx/include/cuda/__ptx/instructions/generated/mbarrier_test_wait_parity.h index 604cfd92045..51742d91c37 100644 --- a/libcudacxx/include/cuda/__ptx/instructions/generated/mbarrier_test_wait_parity.inc +++ b/libcudacxx/include/cuda/__ptx/instructions/generated/mbarrier_test_wait_parity.h @@ -1,6 +1,9 @@ +#ifndef _CUDA_PTX_GENERATED_MBARRIER_TEST_WAIT_PARITY_H_ +#define _CUDA_PTX_GENERATED_MBARRIER_TEST_WAIT_PARITY_H_ + /* // mbarrier.test_wait.parity.shared.b64 waitComplete, [addr], phaseParity; // 3. PTX -ISA 71, SM_80 template +ISA 71, SM_80 template __device__ static inline bool mbarrier_test_wait_parity( uint64_t* addr, const uint32_t& phaseParity); @@ -59,7 +62,7 @@ _CCCL_DEVICE static inline bool mbarrier_test_wait_parity( : "=r"(__waitComplete) : "r"(__as_ptr_smem(__addr)), "r"(__phaseParity) : "memory"); - } _CCCL_ELSE_IF_CONSTEXPR (__scope == scope_cluster) { + } else _CCCL_IF_CONSTEXPR (__scope == scope_cluster) { asm("{\n\t .reg .pred P_OUT; \n\t" "mbarrier.test_wait.parity.acquire.cluster.shared::cta.b64 P_OUT, [%1], %2; // 4. \n\t" "selp.b32 %0, 1, 0, P_OUT; \n" @@ -73,3 +76,5 @@ _CCCL_DEVICE static inline bool mbarrier_test_wait_parity( __cuda_ptx_mbarrier_test_wait_parity_is_not_supported_before_SM_90__(); return false;)); } #endif // __cccl_ptx_isa >= 800 + +#endif // _CUDA_PTX_GENERATED_MBARRIER_TEST_WAIT_PARITY_H_ diff --git a/libcudacxx/include/cuda/__ptx/instructions/generated/mbarrier_try_wait.inc b/libcudacxx/include/cuda/__ptx/instructions/generated/mbarrier_try_wait.h similarity index 94% rename from libcudacxx/include/cuda/__ptx/instructions/generated/mbarrier_try_wait.inc rename to libcudacxx/include/cuda/__ptx/instructions/generated/mbarrier_try_wait.h index c5f2062664c..1560e010675 100644 --- a/libcudacxx/include/cuda/__ptx/instructions/generated/mbarrier_try_wait.inc +++ b/libcudacxx/include/cuda/__ptx/instructions/generated/mbarrier_try_wait.h @@ -1,6 +1,9 @@ +#ifndef _CUDA_PTX_GENERATED_MBARRIER_TRY_WAIT_H_ +#define _CUDA_PTX_GENERATED_MBARRIER_TRY_WAIT_H_ + /* // mbarrier.try_wait.shared::cta.b64 waitComplete, [addr], state; // 5a. -PTX ISA 78, SM_90 template +PTX ISA 78, SM_90 template __device__ static inline bool mbarrier_try_wait( uint64_t* addr, const uint64_t& state); @@ -29,7 +32,7 @@ _CCCL_DEVICE static inline bool mbarrier_try_wait(_CUDA_VSTD::uint64_t* __addr, /* // mbarrier.try_wait.shared::cta.b64 waitComplete, [addr], state, suspendTimeHint; // 5b. PTX -ISA 78, SM_90 template +ISA 78, SM_90 template __device__ static inline bool mbarrier_try_wait( uint64_t* addr, const uint64_t& state, @@ -89,7 +92,7 @@ _CCCL_DEVICE static inline bool mbarrier_try_wait( : "=r"(__waitComplete) : "r"(__as_ptr_smem(__addr)), "l"(__state) : "memory"); - } _CCCL_ELSE_IF_CONSTEXPR (__scope == scope_cluster) { + } else _CCCL_IF_CONSTEXPR (__scope == scope_cluster) { asm("{\n\t .reg .pred P_OUT; \n\t" "mbarrier.try_wait.acquire.cluster.shared::cta.b64 P_OUT, [%1], %2; // 6a. 
" "\n\t" @@ -141,7 +144,7 @@ _CCCL_DEVICE static inline bool mbarrier_try_wait( : "=r"(__waitComplete) : "r"(__as_ptr_smem(__addr)), "l"(__state), "r"(__suspendTimeHint) : "memory"); - } _CCCL_ELSE_IF_CONSTEXPR (__scope == scope_cluster) { + } else _CCCL_IF_CONSTEXPR (__scope == scope_cluster) { asm("{\n\t .reg .pred P_OUT; \n\t" "mbarrier.try_wait.acquire.cluster.shared::cta.b64 P_OUT, [%1], %2 , %3; // 6b. \n\t" "selp.b32 %0, 1, 0, P_OUT; \n" @@ -155,3 +158,5 @@ _CCCL_DEVICE static inline bool mbarrier_try_wait( __cuda_ptx_mbarrier_try_wait_is_not_supported_before_SM_90__(); return false;)); } #endif // __cccl_ptx_isa >= 800 + +#endif // _CUDA_PTX_GENERATED_MBARRIER_TRY_WAIT_H_ diff --git a/libcudacxx/include/cuda/__ptx/instructions/generated/mbarrier_try_wait_parity.inc b/libcudacxx/include/cuda/__ptx/instructions/generated/mbarrier_try_wait_parity.h similarity index 94% rename from libcudacxx/include/cuda/__ptx/instructions/generated/mbarrier_try_wait_parity.inc rename to libcudacxx/include/cuda/__ptx/instructions/generated/mbarrier_try_wait_parity.h index 321bfc515da..64d71d85979 100644 --- a/libcudacxx/include/cuda/__ptx/instructions/generated/mbarrier_try_wait_parity.inc +++ b/libcudacxx/include/cuda/__ptx/instructions/generated/mbarrier_try_wait_parity.h @@ -1,6 +1,9 @@ +#ifndef _CUDA_PTX_GENERATED_MBARRIER_TRY_WAIT_PARITY_H_ +#define _CUDA_PTX_GENERATED_MBARRIER_TRY_WAIT_PARITY_H_ + /* // mbarrier.try_wait.parity.shared::cta.b64 waitComplete, [addr], phaseParity; // 7a. -PTX ISA 78, SM_90 template +PTX ISA 78, SM_90 template __device__ static inline bool mbarrier_try_wait_parity( uint64_t* addr, const uint32_t& phaseParity); @@ -30,7 +33,7 @@ mbarrier_try_wait_parity(_CUDA_VSTD::uint64_t* __addr, const _CUDA_VSTD::uint32_ /* // mbarrier.try_wait.parity.shared::cta.b64 waitComplete, [addr], phaseParity, suspendTimeHint; // 7b. -PTX ISA 78, SM_90 template +PTX ISA 78, SM_90 template __device__ static inline bool mbarrier_try_wait_parity( uint64_t* addr, const uint32_t& phaseParity, @@ -90,7 +93,7 @@ _CCCL_DEVICE static inline bool mbarrier_try_wait_parity( : "=r"(__waitComplete) : "r"(__as_ptr_smem(__addr)), "r"(__phaseParity) : "memory"); - } _CCCL_ELSE_IF_CONSTEXPR (__scope == scope_cluster) { + } else _CCCL_IF_CONSTEXPR (__scope == scope_cluster) { asm("{\n\t .reg .pred P_OUT; \n\t" "mbarrier.try_wait.parity.acquire.cluster.shared::cta.b64 P_OUT, [%1], %2; // 8a. \n\t" "selp.b32 %0, 1, 0, P_OUT; \n" @@ -141,7 +144,7 @@ _CCCL_DEVICE static inline bool mbarrier_try_wait_parity( : "=r"(__waitComplete) : "r"(__as_ptr_smem(__addr)), "r"(__phaseParity), "r"(__suspendTimeHint) : "memory"); - } _CCCL_ELSE_IF_CONSTEXPR (__scope == scope_cluster) { + } else _CCCL_IF_CONSTEXPR (__scope == scope_cluster) { asm("{\n\t .reg .pred P_OUT; \n\t" "mbarrier.try_wait.parity.acquire.cluster.shared::cta.b64 P_OUT, [%1], %2, %3; // 8b. 
\n\t" "selp.b32 %0, 1, 0, P_OUT; \n" @@ -155,3 +158,5 @@ _CCCL_DEVICE static inline bool mbarrier_try_wait_parity( __cuda_ptx_mbarrier_try_wait_parity_is_not_supported_before_SM_90__(); return false;)); } #endif // __cccl_ptx_isa >= 800 + +#endif // _CUDA_PTX_GENERATED_MBARRIER_TRY_WAIT_PARITY_H_ diff --git a/libcudacxx/include/cuda/__ptx/instructions/generated/red_async.inc b/libcudacxx/include/cuda/__ptx/instructions/generated/red_async.h similarity index 97% rename from libcudacxx/include/cuda/__ptx/instructions/generated/red_async.inc rename to libcudacxx/include/cuda/__ptx/instructions/generated/red_async.h index 3157fa1c627..5ac257a1615 100644 --- a/libcudacxx/include/cuda/__ptx/instructions/generated/red_async.inc +++ b/libcudacxx/include/cuda/__ptx/instructions/generated/red_async.h @@ -1,9 +1,12 @@ +#ifndef _CUDA_PTX_GENERATED_RED_ASYNC_H_ +#define _CUDA_PTX_GENERATED_RED_ASYNC_H_ + /* // red.async.relaxed.cluster.shared::cluster.mbarrier::complete_tx::bytes{.op}{.type} [dest], value, [remote_bar]; // PTX ISA 81, SM_90 // .type = { .u32 } // .op = { .inc } -template +template __device__ static inline void red_async( cuda::ptx::op_inc_t, uint32_t* dest, @@ -35,7 +38,7 @@ _CCCL_DEVICE static inline void red_async( PTX ISA 81, SM_90 // .type = { .u32 } // .op = { .dec } -template +template __device__ static inline void red_async( cuda::ptx::op_dec_t, uint32_t* dest, @@ -67,7 +70,7 @@ _CCCL_DEVICE static inline void red_async( PTX ISA 81, SM_90 // .type = { .u32 } // .op = { .min } -template +template __device__ static inline void red_async( cuda::ptx::op_min_t, uint32_t* dest, @@ -99,7 +102,7 @@ _CCCL_DEVICE static inline void red_async( PTX ISA 81, SM_90 // .type = { .u32 } // .op = { .max } -template +template __device__ static inline void red_async( cuda::ptx::op_max_t, uint32_t* dest, @@ -131,7 +134,7 @@ _CCCL_DEVICE static inline void red_async( PTX ISA 81, SM_90 // .type = { .u32 } // .op = { .add } -template +template __device__ static inline void red_async( cuda::ptx::op_add_t, uint32_t* dest, @@ -163,7 +166,7 @@ _CCCL_DEVICE static inline void red_async( PTX ISA 81, SM_90 // .type = { .s32 } // .op = { .min } -template +template __device__ static inline void red_async( cuda::ptx::op_min_t, int32_t* dest, @@ -195,7 +198,7 @@ red_async(op_min_t, _CUDA_VSTD::int32_t* __dest, const _CUDA_VSTD::int32_t& __va PTX ISA 81, SM_90 // .type = { .s32 } // .op = { .max } -template +template __device__ static inline void red_async( cuda::ptx::op_max_t, int32_t* dest, @@ -227,7 +230,7 @@ red_async(op_max_t, _CUDA_VSTD::int32_t* __dest, const _CUDA_VSTD::int32_t& __va PTX ISA 81, SM_90 // .type = { .s32 } // .op = { .add } -template +template __device__ static inline void red_async( cuda::ptx::op_add_t, int32_t* dest, @@ -358,7 +361,7 @@ red_async(op_xor_op_t, _B32* __dest, const _B32& __value, _CUDA_VSTD::uint64_t* PTX ISA 81, SM_90 // .type = { .u64 } // .op = { .add } -template +template __device__ static inline void red_async( cuda::ptx::op_add_t, uint64_t* dest, @@ -389,7 +392,7 @@ _CCCL_DEVICE static inline void red_async( // red.async.relaxed.cluster.shared::cluster.mbarrier::complete_tx::bytes{.op}.u64 [dest], value, [remote_bar]; // .u64 intentional PTX ISA 81, SM_90 // .op = { .add } -template +template __device__ static inline void red_async( cuda::ptx::op_add_t, int64_t* dest, @@ -415,3 +418,5 @@ red_async(op_add_t, _CUDA_VSTD::int64_t* __dest, const _CUDA_VSTD::int64_t& __va __cuda_ptx_red_async_is_not_supported_before_SM_90__();)); } #endif // __cccl_ptx_isa >= 810 + +#endif 
// _CUDA_PTX_GENERATED_RED_ASYNC_H_ diff --git a/libcudacxx/include/cuda/__ptx/instructions/generated/st_async.inc b/libcudacxx/include/cuda/__ptx/instructions/generated/st_async.h similarity index 94% rename from libcudacxx/include/cuda/__ptx/instructions/generated/st_async.inc rename to libcudacxx/include/cuda/__ptx/instructions/generated/st_async.h index 9dfab243ffe..fdd31ff6c77 100644 --- a/libcudacxx/include/cuda/__ptx/instructions/generated/st_async.inc +++ b/libcudacxx/include/cuda/__ptx/instructions/generated/st_async.h @@ -1,3 +1,6 @@ +#ifndef _CUDA_PTX_GENERATED_ST_ASYNC_H_ +#define _CUDA_PTX_GENERATED_ST_ASYNC_H_ + /* // st.async.weak.shared::cluster.mbarrier::complete_tx::bytes{.type} [addr], value, [remote_bar]; // 1. PTX ISA 81, SM_90 @@ -22,7 +25,7 @@ _CCCL_DEVICE static inline void st_async(_Type* __addr, const _Type& __value, _C : : "r"(__as_ptr_remote_dsmem(__addr)), "r"(__as_b32(__value)), "r"(__as_ptr_remote_dsmem(__remote_bar)) : "memory"); - } _CCCL_ELSE_IF_CONSTEXPR (sizeof(_Type) == 8) { + } else _CCCL_IF_CONSTEXPR (sizeof(_Type) == 8) { asm("st.async.weak.shared::cluster.mbarrier::complete_tx::bytes.b64 [%0], %1, [%2]; // 1. " : : "r"(__as_ptr_remote_dsmem(__addr)), "l"(__as_b64(__value)), "r"(__as_ptr_remote_dsmem(__remote_bar)) @@ -61,7 +64,7 @@ _CCCL_DEVICE static inline void st_async(_Type* __addr, const _Type (&__value)[2 "r"(__as_b32(__value[1])), "r"(__as_ptr_remote_dsmem(__remote_bar)) : "memory"); - } _CCCL_ELSE_IF_CONSTEXPR (sizeof(_Type) == 8) { + } else _CCCL_IF_CONSTEXPR (sizeof(_Type) == 8) { asm("st.async.weak.shared::cluster.mbarrier::complete_tx::bytes.v2.b64 [%0], {%1, %2}, [%3]; // 2. " : : "r"(__as_ptr_remote_dsmem(__addr)), @@ -106,3 +109,5 @@ _CCCL_DEVICE static inline void st_async(_B32* __addr, const _B32 (&__value)[4], __cuda_ptx_st_async_is_not_supported_before_SM_90__();)); } #endif // __cccl_ptx_isa >= 810 + +#endif // _CUDA_PTX_GENERATED_ST_ASYNC_H_ diff --git a/libcudacxx/include/cuda/__ptx/instructions/generated/tensormap_cp_fenceproxy.inc b/libcudacxx/include/cuda/__ptx/instructions/generated/tensormap_cp_fenceproxy.h similarity index 86% rename from libcudacxx/include/cuda/__ptx/instructions/generated/tensormap_cp_fenceproxy.inc rename to libcudacxx/include/cuda/__ptx/instructions/generated/tensormap_cp_fenceproxy.h index 033d0606e7f..48a217ea519 100644 --- a/libcudacxx/include/cuda/__ptx/instructions/generated/tensormap_cp_fenceproxy.inc +++ b/libcudacxx/include/cuda/__ptx/instructions/generated/tensormap_cp_fenceproxy.h @@ -1,3 +1,6 @@ +#ifndef _CUDA_PTX_GENERATED_TENSORMAP_CP_FENCEPROXY_H_ +#define _CUDA_PTX_GENERATED_TENSORMAP_CP_FENCEPROXY_H_ + /* // tensormap.cp_fenceproxy.global.shared::cta.tensormap::generic.sem.scope.sync.aligned [dst], [src], size; // PTX ISA 83, SM_90 @@ -28,19 +31,19 @@ tensormap_cp_fenceproxy(sem_release_t, scope_t<_Scope> __scope, void* __dst, con : : "l"(__as_ptr_gmem(__dst)), "r"(__as_ptr_smem(__src)), "n"(__size.value) : "memory"); - } _CCCL_ELSE_IF_CONSTEXPR (__scope == scope_cluster) { + } else _CCCL_IF_CONSTEXPR (__scope == scope_cluster) { asm volatile( "tensormap.cp_fenceproxy.global.shared::cta.tensormap::generic.release.cluster.sync.aligned [%0], [%1], %2;" : : "l"(__as_ptr_gmem(__dst)), "r"(__as_ptr_smem(__src)), "n"(__size.value) : "memory"); - } _CCCL_ELSE_IF_CONSTEXPR (__scope == scope_gpu) { + } else _CCCL_IF_CONSTEXPR (__scope == scope_gpu) { asm volatile( "tensormap.cp_fenceproxy.global.shared::cta.tensormap::generic.release.gpu.sync.aligned [%0], [%1], %2;" : : 
"l"(__as_ptr_gmem(__dst)), "r"(__as_ptr_smem(__src)), "n"(__size.value) : "memory"); - } _CCCL_ELSE_IF_CONSTEXPR (__scope == scope_sys) { + } else _CCCL_IF_CONSTEXPR (__scope == scope_sys) { asm volatile( "tensormap.cp_fenceproxy.global.shared::cta.tensormap::generic.release.sys.sync.aligned [%0], [%1], %2;" : @@ -52,3 +55,5 @@ tensormap_cp_fenceproxy(sem_release_t, scope_t<_Scope> __scope, void* __dst, con __cuda_ptx_tensormap_cp_fenceproxy_is_not_supported_before_SM_90__();)); } #endif // __cccl_ptx_isa >= 830 + +#endif // _CUDA_PTX_GENERATED_TENSORMAP_CP_FENCEPROXY_H_ diff --git a/libcudacxx/include/cuda/__ptx/instructions/generated/tensormap_replace.inc b/libcudacxx/include/cuda/__ptx/instructions/generated/tensormap_replace.h similarity index 99% rename from libcudacxx/include/cuda/__ptx/instructions/generated/tensormap_replace.inc rename to libcudacxx/include/cuda/__ptx/instructions/generated/tensormap_replace.h index 3b1060ead38..2abedca266f 100644 --- a/libcudacxx/include/cuda/__ptx/instructions/generated/tensormap_replace.inc +++ b/libcudacxx/include/cuda/__ptx/instructions/generated/tensormap_replace.h @@ -1,3 +1,6 @@ +#ifndef _CUDA_PTX_GENERATED_TENSORMAP_REPLACE_H_ +#define _CUDA_PTX_GENERATED_TENSORMAP_REPLACE_H_ + /* // tensormap.replace.tile.global_address.space.b1024.b64 [tm_addr], new_val; // PTX ISA 83, SM_90a // .space = { .global } @@ -567,3 +570,5 @@ _CCCL_DEVICE static inline void tensormap_replace_fill_mode(space_shared_t, void __cuda_ptx_tensormap_replace_fill_mode_is_not_supported_before_SM_90a__();)); } #endif // __cccl_ptx_isa >= 830 + +#endif // _CUDA_PTX_GENERATED_TENSORMAP_REPLACE_H_ diff --git a/libcudacxx/include/cuda/__ptx/instructions/get_sreg.h b/libcudacxx/include/cuda/__ptx/instructions/get_sreg.h index 033005beb5b..3157f7d1da9 100644 --- a/libcudacxx/include/cuda/__ptx/instructions/get_sreg.h +++ b/libcudacxx/include/cuda/__ptx/instructions/get_sreg.h @@ -32,7 +32,7 @@ _LIBCUDACXX_BEGIN_NAMESPACE_CUDA_PTX // 10. Special Registers // https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#special-registers -#include +#include _LIBCUDACXX_END_NAMESPACE_CUDA_PTX diff --git a/libcudacxx/include/cuda/__ptx/instructions/getctarank.h b/libcudacxx/include/cuda/__ptx/instructions/getctarank.h index f5ed3424d3b..c41084f5ae3 100644 --- a/libcudacxx/include/cuda/__ptx/instructions/getctarank.h +++ b/libcudacxx/include/cuda/__ptx/instructions/getctarank.h @@ -32,7 +32,7 @@ _LIBCUDACXX_BEGIN_NAMESPACE_CUDA_PTX // 9.7.8.23. Data Movement and Conversion Instructions: getctarank // https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#data-movement-and-conversion-instructions-getctarank -#include +#include _LIBCUDACXX_END_NAMESPACE_CUDA_PTX diff --git a/libcudacxx/include/cuda/__ptx/instructions/mbarrier_arrive.h b/libcudacxx/include/cuda/__ptx/instructions/mbarrier_arrive.h index fb1341a61d8..0a44942df82 100644 --- a/libcudacxx/include/cuda/__ptx/instructions/mbarrier_arrive.h +++ b/libcudacxx/include/cuda/__ptx/instructions/mbarrier_arrive.h @@ -32,9 +32,9 @@ _LIBCUDACXX_BEGIN_NAMESPACE_CUDA_PTX // 9.7.12.15.13. 
Parallel Synchronization and Communication Instructions: mbarrier.arrive // https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#parallel-synchronization-and-communication-instructions-mbarrier-arrive -#include <cuda/__ptx/instructions/generated/mbarrier_arrive.inc> -#include <cuda/__ptx/instructions/generated/mbarrier_arrive_expect_tx.inc> -#include <cuda/__ptx/instructions/generated/mbarrier_arrive_no_complete.inc> +#include <cuda/__ptx/instructions/generated/mbarrier_arrive.h> +#include <cuda/__ptx/instructions/generated/mbarrier_arrive_expect_tx.h> +#include <cuda/__ptx/instructions/generated/mbarrier_arrive_no_complete.h> _LIBCUDACXX_END_NAMESPACE_CUDA_PTX diff --git a/libcudacxx/include/cuda/__ptx/instructions/mbarrier_init.h b/libcudacxx/include/cuda/__ptx/instructions/mbarrier_init.h index 575abda7a41..b3539245e03 100644 --- a/libcudacxx/include/cuda/__ptx/instructions/mbarrier_init.h +++ b/libcudacxx/include/cuda/__ptx/instructions/mbarrier_init.h @@ -32,7 +32,7 @@ _LIBCUDACXX_BEGIN_NAMESPACE_CUDA_PTX // 9.7.12.15.9. Parallel Synchronization and Communication Instructions: mbarrier.init // https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#parallel-synchronization-and-communication-instructions-mbarrier-init -#include <cuda/__ptx/instructions/generated/mbarrier_init.inc> +#include <cuda/__ptx/instructions/generated/mbarrier_init.h> _LIBCUDACXX_END_NAMESPACE_CUDA_PTX diff --git a/libcudacxx/include/cuda/__ptx/instructions/mbarrier_wait.h b/libcudacxx/include/cuda/__ptx/instructions/mbarrier_wait.h index 2d6adb78eec..dfcc03bc01c 100644 --- a/libcudacxx/include/cuda/__ptx/instructions/mbarrier_wait.h +++ b/libcudacxx/include/cuda/__ptx/instructions/mbarrier_wait.h @@ -32,10 +32,10 @@ _LIBCUDACXX_BEGIN_NAMESPACE_CUDA_PTX // 9.7.12.15.16. Parallel Synchronization and Communication Instructions: mbarrier.test_wait/mbarrier.try_wait // https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#parallel-synchronization-and-communication-instructions-mbarrier-test-wait-mbarrier-try-wait -#include <cuda/__ptx/instructions/generated/mbarrier_test_wait.inc> -#include <cuda/__ptx/instructions/generated/mbarrier_test_wait_parity.inc> -#include <cuda/__ptx/instructions/generated/mbarrier_try_wait.inc> -#include <cuda/__ptx/instructions/generated/mbarrier_try_wait_parity.inc> +#include <cuda/__ptx/instructions/generated/mbarrier_test_wait.h> +#include <cuda/__ptx/instructions/generated/mbarrier_test_wait_parity.h> +#include <cuda/__ptx/instructions/generated/mbarrier_try_wait.h> +#include <cuda/__ptx/instructions/generated/mbarrier_try_wait_parity.h> _LIBCUDACXX_END_NAMESPACE_CUDA_PTX diff --git a/libcudacxx/include/cuda/__ptx/instructions/red_async.h b/libcudacxx/include/cuda/__ptx/instructions/red_async.h index a610cf2b583..d14a96dc725 100644 --- a/libcudacxx/include/cuda/__ptx/instructions/red_async.h +++ b/libcudacxx/include/cuda/__ptx/instructions/red_async.h @@ -32,7 +32,7 @@ _LIBCUDACXX_BEGIN_NAMESPACE_CUDA_PTX // 9.7.12.7. Parallel Synchronization and Communication Instructions: red.async // https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#parallel-synchronization-and-communication-instructions-red-async -#include <cuda/__ptx/instructions/generated/red_async.inc> +#include <cuda/__ptx/instructions/generated/red_async.h> _LIBCUDACXX_END_NAMESPACE_CUDA_PTX diff --git a/libcudacxx/include/cuda/__ptx/instructions/st_async.h b/libcudacxx/include/cuda/__ptx/instructions/st_async.h index 09199b4a3ce..ffad9f176d0 100644 --- a/libcudacxx/include/cuda/__ptx/instructions/st_async.h +++ b/libcudacxx/include/cuda/__ptx/instructions/st_async.h @@ -32,7 +32,7 @@ _LIBCUDACXX_BEGIN_NAMESPACE_CUDA_PTX // 9.7.8.12. Data Movement and Conversion Instructions: st.async // https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#data-movement-and-conversion-instructions-st-async -#include <cuda/__ptx/instructions/generated/st_async.inc> +#include <cuda/__ptx/instructions/generated/st_async.h> _LIBCUDACXX_END_NAMESPACE_CUDA_PTX diff --git a/libcudacxx/include/cuda/__ptx/instructions/tensormap_cp_fenceproxy.h b/libcudacxx/include/cuda/__ptx/instructions/tensormap_cp_fenceproxy.h index de179f69735..22eaa502305 100644 --- a/libcudacxx/include/cuda/__ptx/instructions/tensormap_cp_fenceproxy.h +++ b/libcudacxx/include/cuda/__ptx/instructions/tensormap_cp_fenceproxy.h @@ -32,7 +32,7 @@ _LIBCUDACXX_BEGIN_NAMESPACE_CUDA_PTX // 9.7.12.15.18.
Parallel Synchronization and Communication Instructions: tensormap.cp_fenceproxy // https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#parallel-synchronization-and-communication-instructions-tensormap-cp-fenceproxy -#include <cuda/__ptx/instructions/generated/tensormap_cp_fenceproxy.inc> +#include <cuda/__ptx/instructions/generated/tensormap_cp_fenceproxy.h> _LIBCUDACXX_END_NAMESPACE_CUDA_PTX diff --git a/libcudacxx/include/cuda/__ptx/instructions/tensormap_replace.h b/libcudacxx/include/cuda/__ptx/instructions/tensormap_replace.h index 2f81d8b4361..681a820b070 100644 --- a/libcudacxx/include/cuda/__ptx/instructions/tensormap_replace.h +++ b/libcudacxx/include/cuda/__ptx/instructions/tensormap_replace.h @@ -32,7 +32,7 @@ _LIBCUDACXX_BEGIN_NAMESPACE_CUDA_PTX // 9.7.8.25. Data Movement and Conversion Instructions: tensormap.replace // https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#data-movement-and-conversion-instructions-tensormap-replace -#include <cuda/__ptx/instructions/generated/tensormap_replace.inc> +#include <cuda/__ptx/instructions/generated/tensormap_replace.h> _LIBCUDACXX_END_NAMESPACE_CUDA_PTX