From 07fb9adef493d009cdb3fc1c1569efbe6e9ebba7 Mon Sep 17 00:00:00 2001
From: Bernhard Manfred Gruber
Date: Mon, 25 Nov 2024 15:26:27 +0100
Subject: [PATCH] Regenerate PTX header and docs

Overwrites all generated PTX header and documentation files and runs
`pre-commit run --all-files`
---
 .../generated/barrier_cluster.rst | 10 +-
 .../instructions/generated/cp_async_bulk.rst | 6 +-
 .../generated/cp_async_bulk_commit_group.rst | 2 +-
 .../generated/cp_async_bulk_multicast.rst | 2 +-
 .../generated/cp_async_bulk_tensor.rst | 20 ++--
 .../cp_async_bulk_tensor_multicast.rst | 10 +-
 .../generated/cp_reduce_async_bulk.rst | 52 ++++-----
 .../generated/cp_reduce_async_bulk_bf16.rst | 6 +-
 .../generated/cp_reduce_async_bulk_f16.rst | 6 +-
 .../generated/fence_mbarrier_init.rst | 2 +-
 .../generated/fence_proxy_alias.rst | 2 +-
 .../generated/fence_proxy_async.rst | 2 +-
 .../ptx/instructions/generated/getctarank.rst | 2 +-
 .../generated/mbarrier_arrive.rst | 8 +-
 .../generated/mbarrier_arrive_expect_tx.rst | 2 +-
 .../generated/mbarrier_arrive_no_complete.rst | 2 +-
 .../instructions/generated/mbarrier_init.rst | 2 +-
 .../generated/mbarrier_test_wait.rst | 2 +-
 .../generated/mbarrier_test_wait_parity.rst | 2 +-
 .../generated/mbarrier_try_wait.rst | 4 +-
 .../generated/mbarrier_try_wait_parity.rst | 4 +-
 .../ptx/instructions/generated/red_async.rst | 20 ++--
 .../cuda/__ptx/instructions/barrier_cluster.h | 2 +-
 .../cuda/__ptx/instructions/cp_async_bulk.h | 4 +-
 .../instructions/cp_async_bulk_commit_group.h | 2 +-
 .../__ptx/instructions/cp_async_bulk_tensor.h | 4 +-
 .../instructions/cp_async_bulk_wait_group.h | 2 +-
 .../__ptx/instructions/cp_reduce_async_bulk.h | 6 +-
 .../cp_reduce_async_bulk_tensor.h | 2 +-
 .../include/cuda/__ptx/instructions/fence.h | 10 +-
 ...{barrier_cluster.inc => barrier_cluster.h} | 15 ++-
 .../{cp_async_bulk.inc => cp_async_bulk.h} | 11 +-
 ...group.inc => cp_async_bulk_commit_group.h} | 7 +-
 ...ulticast.inc => cp_async_bulk_multicast.h} | 7 +-
 ...bulk_tensor.inc => cp_async_bulk_tensor.h} | 25 +++--
 ...t.inc => cp_async_bulk_tensor_multicast.h} | 15 ++-
 ...t_group.inc => cp_async_bulk_wait_group.h} | 13 ++-
 ..._async_bulk.inc => cp_reduce_async_bulk.h} | 65 +++++------
 ...k_bf16.inc => cp_reduce_async_bulk_bf16.h} | 25 +++--
 ...ulk_f16.inc => cp_reduce_async_bulk_f16.h} | 23 ++--
 ...nsor.inc => cp_reduce_async_bulk_tensor.h} | 75 +++++++------
 .../generated/{fence.inc => fence.h} | 17 +--
 ...barrier_init.inc => fence_mbarrier_init.h} | 7 +-
 ...ce_proxy_alias.inc => fence_proxy_alias.h} | 7 +-
 ...ce_proxy_async.inc => fence_proxy_async.h} | 11 +-
 ...ic.inc => fence_proxy_tensormap_generic.h} | 17 +--
 .../generated/{get_sreg.inc => get_sreg.h} | 101 +++++++++---------
 .../{getctarank.inc => getctarank.h} | 7 +-
 ...{mbarrier_arrive.inc => mbarrier_arrive.h} | 17 +--
 ...ect_tx.inc => mbarrier_arrive_expect_tx.h} | 9 +-
 ...lete.inc => mbarrier_arrive_no_complete.h} | 7 +-
 .../{mbarrier_init.inc => mbarrier_init.h} | 7 +-
 ...ier_test_wait.inc => mbarrier_test_wait.h} | 9 +-
 ...parity.inc => mbarrier_test_wait_parity.h} | 9 +-
 ...rrier_try_wait.inc => mbarrier_try_wait.h} | 13 ++-
 ..._parity.inc => mbarrier_try_wait_parity.h} | 13 ++-
 .../generated/{red_async.inc => red_async.h} | 25 +++--
 .../generated/{st_async.inc => st_async.h} | 9 +-
 ...nceproxy.inc => tensormap_cp_fenceproxy.h} | 11 +-
 ...sormap_replace.inc => tensormap_replace.h} | 5 +
 .../cuda/__ptx/instructions/get_sreg.h | 2 +-
 .../cuda/__ptx/instructions/getctarank.h | 2 +-
 .../cuda/__ptx/instructions/mbarrier_arrive.h | 6 +-
 .../cuda/__ptx/instructions/mbarrier_init.h | 2 +-
 .../cuda/__ptx/instructions/mbarrier_wait.h | 8 +-
 .../cuda/__ptx/instructions/red_async.h | 2 +-
 .../cuda/__ptx/instructions/st_async.h | 2 +-
 .../instructions/tensormap_cp_fenceproxy.h | 2 +-
 .../__ptx/instructions/tensormap_replace.h | 2 +-
 69 files changed, 478 insertions(+), 332 deletions(-)
 rename libcudacxx/include/cuda/__ptx/instructions/generated/{barrier_cluster.inc => barrier_cluster.h} (93%)
 rename libcudacxx/include/cuda/__ptx/instructions/generated/{cp_async_bulk.inc => cp_async_bulk.h} (94%)
 rename libcudacxx/include/cuda/__ptx/instructions/generated/{cp_async_bulk_commit_group.inc => cp_async_bulk_commit_group.h} (77%)
 rename libcudacxx/include/cuda/__ptx/instructions/generated/{cp_async_bulk_multicast.inc => cp_async_bulk_multicast.h} (88%)
 rename libcudacxx/include/cuda/__ptx/instructions/generated/{cp_async_bulk_tensor.inc => cp_async_bulk_tensor.h} (97%)
 rename libcudacxx/include/cuda/__ptx/instructions/generated/{cp_async_bulk_tensor_multicast.inc => cp_async_bulk_tensor_multicast.h} (96%)
 rename libcudacxx/include/cuda/__ptx/instructions/generated/{cp_async_bulk_wait_group.inc => cp_async_bulk_wait_group.h} (84%)
 rename libcudacxx/include/cuda/__ptx/instructions/generated/{cp_reduce_async_bulk.inc => cp_reduce_async_bulk.h} (98%)
 rename libcudacxx/include/cuda/__ptx/instructions/generated/{cp_reduce_async_bulk_bf16.inc => cp_reduce_async_bulk_bf16.h} (90%)
 rename libcudacxx/include/cuda/__ptx/instructions/generated/{cp_reduce_async_bulk_f16.inc => cp_reduce_async_bulk_f16.h} (90%)
 rename libcudacxx/include/cuda/__ptx/instructions/generated/{cp_reduce_async_bulk_tensor.inc => cp_reduce_async_bulk_tensor.h} (91%)
 rename libcudacxx/include/cuda/__ptx/instructions/generated/{fence.inc => fence.h} (83%)
 rename libcudacxx/include/cuda/__ptx/instructions/generated/{fence_mbarrier_init.inc => fence_mbarrier_init.h} (84%)
 rename libcudacxx/include/cuda/__ptx/instructions/generated/{fence_proxy_alias.inc => fence_proxy_alias.h} (79%)
 rename libcudacxx/include/cuda/__ptx/instructions/generated/{fence_proxy_async.inc => fence_proxy_async.h} (85%)
 rename libcudacxx/include/cuda/__ptx/instructions/generated/{fence_proxy_tensormap_generic.inc => fence_proxy_tensormap_generic.h} (86%)
 rename libcudacxx/include/cuda/__ptx/instructions/generated/{get_sreg.inc => get_sreg.h} (95%)
 rename libcudacxx/include/cuda/__ptx/instructions/generated/{getctarank.inc => getctarank.h} (85%)
 rename libcudacxx/include/cuda/__ptx/instructions/generated/{mbarrier_arrive.inc => mbarrier_arrive.h} (95%)
 rename libcudacxx/include/cuda/__ptx/instructions/generated/{mbarrier_arrive_expect_tx.inc => mbarrier_arrive_expect_tx.h} (92%)
 rename libcudacxx/include/cuda/__ptx/instructions/generated/{mbarrier_arrive_no_complete.inc => mbarrier_arrive_no_complete.h} (83%)
 rename libcudacxx/include/cuda/__ptx/instructions/generated/{mbarrier_init.inc => mbarrier_init.h} (82%)
 rename libcudacxx/include/cuda/__ptx/instructions/generated/{mbarrier_test_wait.inc => mbarrier_test_wait.h} (92%)
 rename libcudacxx/include/cuda/__ptx/instructions/generated/{mbarrier_test_wait_parity.inc => mbarrier_test_wait_parity.h} (91%)
 rename libcudacxx/include/cuda/__ptx/instructions/generated/{mbarrier_try_wait.inc => mbarrier_try_wait.h} (94%)
 rename libcudacxx/include/cuda/__ptx/instructions/generated/{mbarrier_try_wait_parity.inc => mbarrier_try_wait_parity.h} (94%)
 rename libcudacxx/include/cuda/__ptx/instructions/generated/{red_async.inc => red_async.h} (97%)
 rename
libcudacxx/include/cuda/__ptx/instructions/generated/{st_async.inc => st_async.h} (94%) rename libcudacxx/include/cuda/__ptx/instructions/generated/{tensormap_cp_fenceproxy.inc => tensormap_cp_fenceproxy.h} (86%) rename libcudacxx/include/cuda/__ptx/instructions/generated/{tensormap_replace.inc => tensormap_replace.h} (99%) diff --git a/docs/libcudacxx/ptx/instructions/generated/barrier_cluster.rst b/docs/libcudacxx/ptx/instructions/generated/barrier_cluster.rst index bd994990c05..5bc83e77e50 100644 --- a/docs/libcudacxx/ptx/instructions/generated/barrier_cluster.rst +++ b/docs/libcudacxx/ptx/instructions/generated/barrier_cluster.rst @@ -4,7 +4,7 @@ barrier.cluster.arrive // barrier.cluster.arrive; // PTX ISA 78, SM_90 // Marked volatile and as clobbering memory - template + template __device__ static inline void barrier_cluster_arrive(); barrier.cluster.wait @@ -13,7 +13,7 @@ barrier.cluster.wait // barrier.cluster.wait; // PTX ISA 78, SM_90 // Marked volatile and as clobbering memory - template + template __device__ static inline void barrier_cluster_wait(); barrier.cluster.arrive.release @@ -23,7 +23,7 @@ barrier.cluster.arrive.release // barrier.cluster.arrive.sem; // PTX ISA 80, SM_90 // .sem = { .release } // Marked volatile and as clobbering memory - template + template __device__ static inline void barrier_cluster_arrive( cuda::ptx::sem_release_t); @@ -34,7 +34,7 @@ barrier.cluster.arrive.relaxed // barrier.cluster.arrive.sem; // PTX ISA 80, SM_90 // .sem = { .relaxed } // Marked volatile - template + template __device__ static inline void barrier_cluster_arrive( cuda::ptx::sem_relaxed_t); @@ -45,6 +45,6 @@ barrier.cluster.wait.acquire // barrier.cluster.wait.sem; // PTX ISA 80, SM_90 // .sem = { .acquire } // Marked volatile and as clobbering memory - template + template __device__ static inline void barrier_cluster_wait( cuda::ptx::sem_acquire_t); diff --git a/docs/libcudacxx/ptx/instructions/generated/cp_async_bulk.rst b/docs/libcudacxx/ptx/instructions/generated/cp_async_bulk.rst index f5c236f8bf9..7a734ab2504 100644 --- a/docs/libcudacxx/ptx/instructions/generated/cp_async_bulk.rst +++ b/docs/libcudacxx/ptx/instructions/generated/cp_async_bulk.rst @@ -5,7 +5,7 @@ cp.async.bulk.shared::cluster.global.mbarrier::complete_tx::bytes // cp.async.bulk.dst.src.mbarrier::complete_tx::bytes [dstMem], [srcMem], size, [smem_bar]; // 1a. unicast PTX ISA 80, SM_90 // .dst = { .shared::cluster } // .src = { .global } - template + template __device__ static inline void cp_async_bulk( cuda::ptx::space_cluster_t, cuda::ptx::space_global_t, @@ -21,7 +21,7 @@ cp.async.bulk.shared::cluster.shared::cta.mbarrier::complete_tx::bytes // cp.async.bulk.dst.src.mbarrier::complete_tx::bytes [dstMem], [srcMem], size, [rdsmem_bar]; // 2. PTX ISA 80, SM_90 // .dst = { .shared::cluster } // .src = { .shared::cta } - template + template __device__ static inline void cp_async_bulk( cuda::ptx::space_cluster_t, cuda::ptx::space_shared_t, @@ -37,7 +37,7 @@ cp.async.bulk.global.shared::cta.bulk_group // cp.async.bulk.dst.src.bulk_group [dstMem], [srcMem], size; // 3. 
PTX ISA 80, SM_90 // .dst = { .global } // .src = { .shared::cta } - template + template __device__ static inline void cp_async_bulk( cuda::ptx::space_global_t, cuda::ptx::space_shared_t, diff --git a/docs/libcudacxx/ptx/instructions/generated/cp_async_bulk_commit_group.rst b/docs/libcudacxx/ptx/instructions/generated/cp_async_bulk_commit_group.rst index 984b4aff976..8f8a42716fc 100644 --- a/docs/libcudacxx/ptx/instructions/generated/cp_async_bulk_commit_group.rst +++ b/docs/libcudacxx/ptx/instructions/generated/cp_async_bulk_commit_group.rst @@ -3,5 +3,5 @@ cp.async.bulk.commit_group .. code:: cuda // cp.async.bulk.commit_group; // PTX ISA 80, SM_90 - template + template __device__ static inline void cp_async_bulk_commit_group(); diff --git a/docs/libcudacxx/ptx/instructions/generated/cp_async_bulk_multicast.rst b/docs/libcudacxx/ptx/instructions/generated/cp_async_bulk_multicast.rst index 9cb15d06fa3..b76a5de99ef 100644 --- a/docs/libcudacxx/ptx/instructions/generated/cp_async_bulk_multicast.rst +++ b/docs/libcudacxx/ptx/instructions/generated/cp_async_bulk_multicast.rst @@ -5,7 +5,7 @@ cp.async.bulk.shared::cluster.global.mbarrier::complete_tx::bytes.multicast::clu // cp.async.bulk{.dst}{.src}.mbarrier::complete_tx::bytes.multicast::cluster [dstMem], [srcMem], size, [smem_bar], ctaMask; // 1. PTX ISA 80, SM_90a // .dst = { .shared::cluster } // .src = { .global } - template + template __device__ static inline void cp_async_bulk( cuda::ptx::space_cluster_t, cuda::ptx::space_global_t, diff --git a/docs/libcudacxx/ptx/instructions/generated/cp_async_bulk_tensor.rst b/docs/libcudacxx/ptx/instructions/generated/cp_async_bulk_tensor.rst index 40eb070e66a..004acac4801 100644 --- a/docs/libcudacxx/ptx/instructions/generated/cp_async_bulk_tensor.rst +++ b/docs/libcudacxx/ptx/instructions/generated/cp_async_bulk_tensor.rst @@ -5,7 +5,7 @@ cp.async.bulk.tensor.1d.shared::cluster.global.tile.mbarrier::complete_tx::bytes // cp.async.bulk.tensor.1d.dst.src.tile.mbarrier::complete_tx::bytes [dstMem], [tensorMap, tensorCoords], [smem_bar];// 1a. PTX ISA 80, SM_90 // .dst = { .shared::cluster } // .src = { .global } - template + template __device__ static inline void cp_async_bulk_tensor( cuda::ptx::space_cluster_t, cuda::ptx::space_global_t, @@ -21,7 +21,7 @@ cp.async.bulk.tensor.1d.global.shared::cta.tile.bulk_group // cp.async.bulk.tensor.1d.dst.src.tile.bulk_group [tensorMap, tensorCoords], [srcMem]; // 3a. PTX ISA 80, SM_90 // .dst = { .global } // .src = { .shared::cta } - template + template __device__ static inline void cp_async_bulk_tensor( cuda::ptx::space_global_t, cuda::ptx::space_shared_t, @@ -36,7 +36,7 @@ cp.async.bulk.tensor.2d.shared::cluster.global.tile.mbarrier::complete_tx::bytes // cp.async.bulk.tensor.2d.dst.src.tile.mbarrier::complete_tx::bytes [dstMem], [tensorMap, tensorCoords], [smem_bar];// 1b. PTX ISA 80, SM_90 // .dst = { .shared::cluster } // .src = { .global } - template + template __device__ static inline void cp_async_bulk_tensor( cuda::ptx::space_cluster_t, cuda::ptx::space_global_t, @@ -52,7 +52,7 @@ cp.async.bulk.tensor.2d.global.shared::cta.tile.bulk_group // cp.async.bulk.tensor.2d.dst.src.tile.bulk_group [tensorMap, tensorCoords], [srcMem]; // 3b. 
PTX ISA 80, SM_90 // .dst = { .global } // .src = { .shared::cta } - template + template __device__ static inline void cp_async_bulk_tensor( cuda::ptx::space_global_t, cuda::ptx::space_shared_t, @@ -67,7 +67,7 @@ cp.async.bulk.tensor.3d.shared::cluster.global.tile.mbarrier::complete_tx::bytes // cp.async.bulk.tensor.3d.dst.src.tile.mbarrier::complete_tx::bytes [dstMem], [tensorMap, tensorCoords], [smem_bar];// 1c. PTX ISA 80, SM_90 // .dst = { .shared::cluster } // .src = { .global } - template + template __device__ static inline void cp_async_bulk_tensor( cuda::ptx::space_cluster_t, cuda::ptx::space_global_t, @@ -83,7 +83,7 @@ cp.async.bulk.tensor.3d.global.shared::cta.tile.bulk_group // cp.async.bulk.tensor.3d.dst.src.tile.bulk_group [tensorMap, tensorCoords], [srcMem]; // 3c. PTX ISA 80, SM_90 // .dst = { .global } // .src = { .shared::cta } - template + template __device__ static inline void cp_async_bulk_tensor( cuda::ptx::space_global_t, cuda::ptx::space_shared_t, @@ -98,7 +98,7 @@ cp.async.bulk.tensor.4d.shared::cluster.global.tile.mbarrier::complete_tx::bytes // cp.async.bulk.tensor.4d.dst.src.tile.mbarrier::complete_tx::bytes [dstMem], [tensorMap, tensorCoords], [smem_bar];// 1d. PTX ISA 80, SM_90 // .dst = { .shared::cluster } // .src = { .global } - template + template __device__ static inline void cp_async_bulk_tensor( cuda::ptx::space_cluster_t, cuda::ptx::space_global_t, @@ -114,7 +114,7 @@ cp.async.bulk.tensor.4d.global.shared::cta.tile.bulk_group // cp.async.bulk.tensor.4d.dst.src.tile.bulk_group [tensorMap, tensorCoords], [srcMem]; // 3d. PTX ISA 80, SM_90 // .dst = { .global } // .src = { .shared::cta } - template + template __device__ static inline void cp_async_bulk_tensor( cuda::ptx::space_global_t, cuda::ptx::space_shared_t, @@ -129,7 +129,7 @@ cp.async.bulk.tensor.5d.shared::cluster.global.tile.mbarrier::complete_tx::bytes // cp.async.bulk.tensor.5d.dst.src.tile.mbarrier::complete_tx::bytes [dstMem], [tensorMap, tensorCoords], [smem_bar];// 1e. PTX ISA 80, SM_90 // .dst = { .shared::cluster } // .src = { .global } - template + template __device__ static inline void cp_async_bulk_tensor( cuda::ptx::space_cluster_t, cuda::ptx::space_global_t, @@ -145,7 +145,7 @@ cp.async.bulk.tensor.5d.global.shared::cta.tile.bulk_group // cp.async.bulk.tensor.5d.dst.src.tile.bulk_group [tensorMap, tensorCoords], [srcMem]; // 3e. PTX ISA 80, SM_90 // .dst = { .global } // .src = { .shared::cta } - template + template __device__ static inline void cp_async_bulk_tensor( cuda::ptx::space_global_t, cuda::ptx::space_shared_t, diff --git a/docs/libcudacxx/ptx/instructions/generated/cp_async_bulk_tensor_multicast.rst b/docs/libcudacxx/ptx/instructions/generated/cp_async_bulk_tensor_multicast.rst index 2481c80bf3c..084ad54708f 100644 --- a/docs/libcudacxx/ptx/instructions/generated/cp_async_bulk_tensor_multicast.rst +++ b/docs/libcudacxx/ptx/instructions/generated/cp_async_bulk_tensor_multicast.rst @@ -5,7 +5,7 @@ cp.async.bulk.tensor.1d.shared::cluster.global.tile.mbarrier::complete_tx::bytes // cp.async.bulk.tensor.1d.dst.src.tile.mbarrier::complete_tx::bytes.multicast::cluster [dstMem], [tensorMap, tensorCoords], [smem_bar], ctaMask; // 2a. 
PTX ISA 80, SM_90a // .dst = { .shared::cluster } // .src = { .global } - template + template __device__ static inline void cp_async_bulk_tensor( cuda::ptx::space_cluster_t, cuda::ptx::space_global_t, @@ -22,7 +22,7 @@ cp.async.bulk.tensor.2d.shared::cluster.global.tile.mbarrier::complete_tx::bytes // cp.async.bulk.tensor.2d.dst.src.tile.mbarrier::complete_tx::bytes.multicast::cluster [dstMem], [tensorMap, tensorCoords], [smem_bar], ctaMask; // 2b. PTX ISA 80, SM_90a // .dst = { .shared::cluster } // .src = { .global } - template + template __device__ static inline void cp_async_bulk_tensor( cuda::ptx::space_cluster_t, cuda::ptx::space_global_t, @@ -39,7 +39,7 @@ cp.async.bulk.tensor.3d.shared::cluster.global.tile.mbarrier::complete_tx::bytes // cp.async.bulk.tensor.3d.dst.src.tile.mbarrier::complete_tx::bytes.multicast::cluster [dstMem], [tensorMap, tensorCoords], [smem_bar], ctaMask; // 2c. PTX ISA 80, SM_90a // .dst = { .shared::cluster } // .src = { .global } - template + template __device__ static inline void cp_async_bulk_tensor( cuda::ptx::space_cluster_t, cuda::ptx::space_global_t, @@ -56,7 +56,7 @@ cp.async.bulk.tensor.4d.shared::cluster.global.tile.mbarrier::complete_tx::bytes // cp.async.bulk.tensor.4d.dst.src.tile.mbarrier::complete_tx::bytes.multicast::cluster [dstMem], [tensorMap, tensorCoords], [smem_bar], ctaMask; // 2d. PTX ISA 80, SM_90a // .dst = { .shared::cluster } // .src = { .global } - template + template __device__ static inline void cp_async_bulk_tensor( cuda::ptx::space_cluster_t, cuda::ptx::space_global_t, @@ -73,7 +73,7 @@ cp.async.bulk.tensor.5d.shared::cluster.global.tile.mbarrier::complete_tx::bytes // cp.async.bulk.tensor.5d.dst.src.tile.mbarrier::complete_tx::bytes.multicast::cluster [dstMem], [tensorMap, tensorCoords], [smem_bar], ctaMask; // 2e. 
PTX ISA 80, SM_90a // .dst = { .shared::cluster } // .src = { .global } - template + template __device__ static inline void cp_async_bulk_tensor( cuda::ptx::space_cluster_t, cuda::ptx::space_global_t, diff --git a/docs/libcudacxx/ptx/instructions/generated/cp_reduce_async_bulk.rst b/docs/libcudacxx/ptx/instructions/generated/cp_reduce_async_bulk.rst index cc82d633375..f23664d6ad6 100644 --- a/docs/libcudacxx/ptx/instructions/generated/cp_reduce_async_bulk.rst +++ b/docs/libcudacxx/ptx/instructions/generated/cp_reduce_async_bulk.rst @@ -64,7 +64,7 @@ cp.reduce.async.bulk.shared::cluster.shared::cta.mbarrier::complete_tx::bytes.mi // .src = { .shared::cta } // .type = { .u32 } // .op = { .min } - template + template __device__ static inline void cp_reduce_async_bulk( cuda::ptx::space_cluster_t, cuda::ptx::space_shared_t, @@ -83,7 +83,7 @@ cp.reduce.async.bulk.shared::cluster.shared::cta.mbarrier::complete_tx::bytes.ma // .src = { .shared::cta } // .type = { .u32 } // .op = { .max } - template + template __device__ static inline void cp_reduce_async_bulk( cuda::ptx::space_cluster_t, cuda::ptx::space_shared_t, @@ -102,7 +102,7 @@ cp.reduce.async.bulk.shared::cluster.shared::cta.mbarrier::complete_tx::bytes.ad // .src = { .shared::cta } // .type = { .u32 } // .op = { .add } - template + template __device__ static inline void cp_reduce_async_bulk( cuda::ptx::space_cluster_t, cuda::ptx::space_shared_t, @@ -121,7 +121,7 @@ cp.reduce.async.bulk.shared::cluster.shared::cta.mbarrier::complete_tx::bytes.in // .src = { .shared::cta } // .type = { .u32 } // .op = { .inc } - template + template __device__ static inline void cp_reduce_async_bulk( cuda::ptx::space_cluster_t, cuda::ptx::space_shared_t, @@ -140,7 +140,7 @@ cp.reduce.async.bulk.shared::cluster.shared::cta.mbarrier::complete_tx::bytes.de // .src = { .shared::cta } // .type = { .u32 } // .op = { .dec } - template + template __device__ static inline void cp_reduce_async_bulk( cuda::ptx::space_cluster_t, cuda::ptx::space_shared_t, @@ -159,7 +159,7 @@ cp.reduce.async.bulk.shared::cluster.shared::cta.mbarrier::complete_tx::bytes.mi // .src = { .shared::cta } // .type = { .s32 } // .op = { .min } - template + template __device__ static inline void cp_reduce_async_bulk( cuda::ptx::space_cluster_t, cuda::ptx::space_shared_t, @@ -178,7 +178,7 @@ cp.reduce.async.bulk.shared::cluster.shared::cta.mbarrier::complete_tx::bytes.ma // .src = { .shared::cta } // .type = { .s32 } // .op = { .max } - template + template __device__ static inline void cp_reduce_async_bulk( cuda::ptx::space_cluster_t, cuda::ptx::space_shared_t, @@ -197,7 +197,7 @@ cp.reduce.async.bulk.shared::cluster.shared::cta.mbarrier::complete_tx::bytes.ad // .src = { .shared::cta } // .type = { .s32 } // .op = { .add } - template + template __device__ static inline void cp_reduce_async_bulk( cuda::ptx::space_cluster_t, cuda::ptx::space_shared_t, @@ -216,7 +216,7 @@ cp.reduce.async.bulk.shared::cluster.shared::cta.mbarrier::complete_tx::bytes.ad // .src = { .shared::cta } // .type = { .u64 } // .op = { .add } - template + template __device__ static inline void cp_reduce_async_bulk( cuda::ptx::space_cluster_t, cuda::ptx::space_shared_t, @@ -235,7 +235,7 @@ cp.reduce.async.bulk.shared::cluster.shared::cta.mbarrier::complete_tx::bytes.ad // .src = { .shared::cta } // .type = { .s64 } // .op = { .add } - template + template __device__ static inline void cp_reduce_async_bulk( cuda::ptx::space_cluster_t, cuda::ptx::space_shared_t, @@ -362,7 +362,7 @@ 
cp.reduce.async.bulk.global.shared::cta.bulk_group.min.u32 // .src = { .shared::cta } // .type = { .u32 } // .op = { .min } - template + template __device__ static inline void cp_reduce_async_bulk( cuda::ptx::space_global_t, cuda::ptx::space_shared_t, @@ -380,7 +380,7 @@ cp.reduce.async.bulk.global.shared::cta.bulk_group.max.u32 // .src = { .shared::cta } // .type = { .u32 } // .op = { .max } - template + template __device__ static inline void cp_reduce_async_bulk( cuda::ptx::space_global_t, cuda::ptx::space_shared_t, @@ -398,7 +398,7 @@ cp.reduce.async.bulk.global.shared::cta.bulk_group.add.u32 // .src = { .shared::cta } // .type = { .u32 } // .op = { .add } - template + template __device__ static inline void cp_reduce_async_bulk( cuda::ptx::space_global_t, cuda::ptx::space_shared_t, @@ -416,7 +416,7 @@ cp.reduce.async.bulk.global.shared::cta.bulk_group.inc.u32 // .src = { .shared::cta } // .type = { .u32 } // .op = { .inc } - template + template __device__ static inline void cp_reduce_async_bulk( cuda::ptx::space_global_t, cuda::ptx::space_shared_t, @@ -434,7 +434,7 @@ cp.reduce.async.bulk.global.shared::cta.bulk_group.dec.u32 // .src = { .shared::cta } // .type = { .u32 } // .op = { .dec } - template + template __device__ static inline void cp_reduce_async_bulk( cuda::ptx::space_global_t, cuda::ptx::space_shared_t, @@ -452,7 +452,7 @@ cp.reduce.async.bulk.global.shared::cta.bulk_group.min.s32 // .src = { .shared::cta } // .type = { .s32 } // .op = { .min } - template + template __device__ static inline void cp_reduce_async_bulk( cuda::ptx::space_global_t, cuda::ptx::space_shared_t, @@ -470,7 +470,7 @@ cp.reduce.async.bulk.global.shared::cta.bulk_group.max.s32 // .src = { .shared::cta } // .type = { .s32 } // .op = { .max } - template + template __device__ static inline void cp_reduce_async_bulk( cuda::ptx::space_global_t, cuda::ptx::space_shared_t, @@ -488,7 +488,7 @@ cp.reduce.async.bulk.global.shared::cta.bulk_group.add.s32 // .src = { .shared::cta } // .type = { .s32 } // .op = { .add } - template + template __device__ static inline void cp_reduce_async_bulk( cuda::ptx::space_global_t, cuda::ptx::space_shared_t, @@ -506,7 +506,7 @@ cp.reduce.async.bulk.global.shared::cta.bulk_group.min.u64 // .src = { .shared::cta } // .type = { .u64 } // .op = { .min } - template + template __device__ static inline void cp_reduce_async_bulk( cuda::ptx::space_global_t, cuda::ptx::space_shared_t, @@ -524,7 +524,7 @@ cp.reduce.async.bulk.global.shared::cta.bulk_group.max.u64 // .src = { .shared::cta } // .type = { .u64 } // .op = { .max } - template + template __device__ static inline void cp_reduce_async_bulk( cuda::ptx::space_global_t, cuda::ptx::space_shared_t, @@ -542,7 +542,7 @@ cp.reduce.async.bulk.global.shared::cta.bulk_group.add.u64 // .src = { .shared::cta } // .type = { .u64 } // .op = { .add } - template + template __device__ static inline void cp_reduce_async_bulk( cuda::ptx::space_global_t, cuda::ptx::space_shared_t, @@ -560,7 +560,7 @@ cp.reduce.async.bulk.global.shared::cta.bulk_group.min.s64 // .src = { .shared::cta } // .type = { .s64 } // .op = { .min } - template + template __device__ static inline void cp_reduce_async_bulk( cuda::ptx::space_global_t, cuda::ptx::space_shared_t, @@ -578,7 +578,7 @@ cp.reduce.async.bulk.global.shared::cta.bulk_group.max.s64 // .src = { .shared::cta } // .type = { .s64 } // .op = { .max } - template + template __device__ static inline void cp_reduce_async_bulk( cuda::ptx::space_global_t, cuda::ptx::space_shared_t, @@ -596,7 +596,7 @@ 
cp.reduce.async.bulk.global.shared::cta.bulk_group.add.f32 // .src = { .shared::cta } // .type = { .f32 } // .op = { .add } - template + template __device__ static inline void cp_reduce_async_bulk( cuda::ptx::space_global_t, cuda::ptx::space_shared_t, @@ -614,7 +614,7 @@ cp.reduce.async.bulk.global.shared::cta.bulk_group.add.f64 // .src = { .shared::cta } // .type = { .f64 } // .op = { .add } - template + template __device__ static inline void cp_reduce_async_bulk( cuda::ptx::space_global_t, cuda::ptx::space_shared_t, @@ -632,7 +632,7 @@ cp.reduce.async.bulk.global.shared::cta.bulk_group.add.u64 // .src = { .shared::cta } // .type = { .s64 } // .op = { .add } - template + template __device__ static inline void cp_reduce_async_bulk( cuda::ptx::space_global_t, cuda::ptx::space_shared_t, diff --git a/docs/libcudacxx/ptx/instructions/generated/cp_reduce_async_bulk_bf16.rst b/docs/libcudacxx/ptx/instructions/generated/cp_reduce_async_bulk_bf16.rst index e4dea98a119..756c9d591d3 100644 --- a/docs/libcudacxx/ptx/instructions/generated/cp_reduce_async_bulk_bf16.rst +++ b/docs/libcudacxx/ptx/instructions/generated/cp_reduce_async_bulk_bf16.rst @@ -7,7 +7,7 @@ cp.reduce.async.bulk.global.shared::cta.bulk_group.min.bf16 // .src = { .shared::cta } // .type = { .bf16 } // .op = { .min } - template + template __device__ static inline void cp_reduce_async_bulk( cuda::ptx::space_global_t, cuda::ptx::space_shared_t, @@ -25,7 +25,7 @@ cp.reduce.async.bulk.global.shared::cta.bulk_group.max.bf16 // .src = { .shared::cta } // .type = { .bf16 } // .op = { .max } - template + template __device__ static inline void cp_reduce_async_bulk( cuda::ptx::space_global_t, cuda::ptx::space_shared_t, @@ -43,7 +43,7 @@ cp.reduce.async.bulk.global.shared::cta.bulk_group.add.noftz.bf16 // .src = { .shared::cta } // .type = { .bf16 } // .op = { .add } - template + template __device__ static inline void cp_reduce_async_bulk( cuda::ptx::space_global_t, cuda::ptx::space_shared_t, diff --git a/docs/libcudacxx/ptx/instructions/generated/cp_reduce_async_bulk_f16.rst b/docs/libcudacxx/ptx/instructions/generated/cp_reduce_async_bulk_f16.rst index 18c5e0bfc60..f4121b2cc34 100644 --- a/docs/libcudacxx/ptx/instructions/generated/cp_reduce_async_bulk_f16.rst +++ b/docs/libcudacxx/ptx/instructions/generated/cp_reduce_async_bulk_f16.rst @@ -7,7 +7,7 @@ cp.reduce.async.bulk.global.shared::cta.bulk_group.min.f16 // .src = { .shared::cta } // .type = { .f16 } // .op = { .min } - template + template __device__ static inline void cp_reduce_async_bulk( cuda::ptx::space_global_t, cuda::ptx::space_shared_t, @@ -25,7 +25,7 @@ cp.reduce.async.bulk.global.shared::cta.bulk_group.max.f16 // .src = { .shared::cta } // .type = { .f16 } // .op = { .max } - template + template __device__ static inline void cp_reduce_async_bulk( cuda::ptx::space_global_t, cuda::ptx::space_shared_t, @@ -43,7 +43,7 @@ cp.reduce.async.bulk.global.shared::cta.bulk_group.add.noftz.f16 // .src = { .shared::cta } // .type = { .f16 } // .op = { .add } - template + template __device__ static inline void cp_reduce_async_bulk( cuda::ptx::space_global_t, cuda::ptx::space_shared_t, diff --git a/docs/libcudacxx/ptx/instructions/generated/fence_mbarrier_init.rst b/docs/libcudacxx/ptx/instructions/generated/fence_mbarrier_init.rst index 0f5298e3359..89513d30961 100644 --- a/docs/libcudacxx/ptx/instructions/generated/fence_mbarrier_init.rst +++ b/docs/libcudacxx/ptx/instructions/generated/fence_mbarrier_init.rst @@ -5,7 +5,7 @@ fence.mbarrier_init.release.cluster // 
fence.mbarrier_init.sem.scope; // 3. PTX ISA 80, SM_90 // .sem = { .release } // .scope = { .cluster } - template + template __device__ static inline void fence_mbarrier_init( cuda::ptx::sem_release_t, cuda::ptx::scope_cluster_t); diff --git a/docs/libcudacxx/ptx/instructions/generated/fence_proxy_alias.rst b/docs/libcudacxx/ptx/instructions/generated/fence_proxy_alias.rst index 935aab9b6df..eee6afdb932 100644 --- a/docs/libcudacxx/ptx/instructions/generated/fence_proxy_alias.rst +++ b/docs/libcudacxx/ptx/instructions/generated/fence_proxy_alias.rst @@ -3,5 +3,5 @@ fence.proxy.alias .. code:: cuda // fence.proxy.alias; // 4. PTX ISA 75, SM_70 - template + template __device__ static inline void fence_proxy_alias(); diff --git a/docs/libcudacxx/ptx/instructions/generated/fence_proxy_async.rst b/docs/libcudacxx/ptx/instructions/generated/fence_proxy_async.rst index 3e741a1f6c4..def6dddfca4 100644 --- a/docs/libcudacxx/ptx/instructions/generated/fence_proxy_async.rst +++ b/docs/libcudacxx/ptx/instructions/generated/fence_proxy_async.rst @@ -3,7 +3,7 @@ fence.proxy.async .. code:: cuda // fence.proxy.async; // 5. PTX ISA 80, SM_90 - template + template __device__ static inline void fence_proxy_async(); fence.proxy.async.global diff --git a/docs/libcudacxx/ptx/instructions/generated/getctarank.rst b/docs/libcudacxx/ptx/instructions/generated/getctarank.rst index c85f52ee302..9e577ad2982 100644 --- a/docs/libcudacxx/ptx/instructions/generated/getctarank.rst +++ b/docs/libcudacxx/ptx/instructions/generated/getctarank.rst @@ -4,7 +4,7 @@ getctarank.shared::cluster.u32 // getctarank{.space}.u32 dest, addr; // PTX ISA 78, SM_90 // .space = { .shared::cluster } - template + template __device__ static inline uint32_t getctarank( cuda::ptx::space_cluster_t, const void* addr); diff --git a/docs/libcudacxx/ptx/instructions/generated/mbarrier_arrive.rst b/docs/libcudacxx/ptx/instructions/generated/mbarrier_arrive.rst index 92cd106cad9..ba845c8e93f 100644 --- a/docs/libcudacxx/ptx/instructions/generated/mbarrier_arrive.rst +++ b/docs/libcudacxx/ptx/instructions/generated/mbarrier_arrive.rst @@ -3,7 +3,7 @@ mbarrier.arrive.shared.b64 .. code:: cuda // mbarrier.arrive.shared.b64 state, [addr]; // 1. PTX ISA 70, SM_80 - template + template __device__ static inline uint64_t mbarrier_arrive( uint64_t* addr); @@ -12,7 +12,7 @@ mbarrier.arrive.shared::cta.b64 .. code:: cuda // mbarrier.arrive.shared::cta.b64 state, [addr], count; // 2. 
PTX ISA 78, SM_90 - template + template __device__ static inline uint64_t mbarrier_arrive( uint64_t* addr, const uint32_t& count); @@ -87,7 +87,7 @@ mbarrier.arrive.release.cluster.shared::cluster.b64 // .sem = { .release } // .scope = { .cluster } // .space = { .shared::cluster } - template + template __device__ static inline void mbarrier_arrive( cuda::ptx::sem_release_t, cuda::ptx::scope_cluster_t, @@ -102,7 +102,7 @@ mbarrier.arrive.release.cluster.shared::cluster.b64 // .sem = { .release } // .scope = { .cluster } // .space = { .shared::cluster } - template + template __device__ static inline void mbarrier_arrive( cuda::ptx::sem_release_t, cuda::ptx::scope_cluster_t, diff --git a/docs/libcudacxx/ptx/instructions/generated/mbarrier_arrive_expect_tx.rst b/docs/libcudacxx/ptx/instructions/generated/mbarrier_arrive_expect_tx.rst index 0087ae2f458..5dc7eb93925 100644 --- a/docs/libcudacxx/ptx/instructions/generated/mbarrier_arrive_expect_tx.rst +++ b/docs/libcudacxx/ptx/instructions/generated/mbarrier_arrive_expect_tx.rst @@ -38,7 +38,7 @@ mbarrier.arrive.expect_tx.release.cluster.shared::cluster.b64 // .sem = { .release } // .scope = { .cluster } // .space = { .shared::cluster } - template + template __device__ static inline void mbarrier_arrive_expect_tx( cuda::ptx::sem_release_t, cuda::ptx::scope_cluster_t, diff --git a/docs/libcudacxx/ptx/instructions/generated/mbarrier_arrive_no_complete.rst b/docs/libcudacxx/ptx/instructions/generated/mbarrier_arrive_no_complete.rst index b6d7edbbeee..a18298f90e9 100644 --- a/docs/libcudacxx/ptx/instructions/generated/mbarrier_arrive_no_complete.rst +++ b/docs/libcudacxx/ptx/instructions/generated/mbarrier_arrive_no_complete.rst @@ -3,7 +3,7 @@ mbarrier.arrive.noComplete.shared.b64 .. code:: cuda // mbarrier.arrive.noComplete.shared.b64 state, [addr], count; // 5. PTX ISA 70, SM_80 - template + template __device__ static inline uint64_t mbarrier_arrive_no_complete( uint64_t* addr, const uint32_t& count); diff --git a/docs/libcudacxx/ptx/instructions/generated/mbarrier_init.rst b/docs/libcudacxx/ptx/instructions/generated/mbarrier_init.rst index 3e529d86d78..615bb24d437 100644 --- a/docs/libcudacxx/ptx/instructions/generated/mbarrier_init.rst +++ b/docs/libcudacxx/ptx/instructions/generated/mbarrier_init.rst @@ -3,7 +3,7 @@ mbarrier.init.shared.b64 .. code:: cuda // mbarrier.init.shared.b64 [addr], count; // PTX ISA 70, SM_80 - template + template __device__ static inline void mbarrier_init( uint64_t* addr, const uint32_t& count); diff --git a/docs/libcudacxx/ptx/instructions/generated/mbarrier_test_wait.rst b/docs/libcudacxx/ptx/instructions/generated/mbarrier_test_wait.rst index 4cb241c7ca8..731e3041e33 100644 --- a/docs/libcudacxx/ptx/instructions/generated/mbarrier_test_wait.rst +++ b/docs/libcudacxx/ptx/instructions/generated/mbarrier_test_wait.rst @@ -3,7 +3,7 @@ mbarrier.test_wait.shared.b64 .. code:: cuda // mbarrier.test_wait.shared.b64 waitComplete, [addr], state; // 1. PTX ISA 70, SM_80 - template + template __device__ static inline bool mbarrier_test_wait( uint64_t* addr, const uint64_t& state); diff --git a/docs/libcudacxx/ptx/instructions/generated/mbarrier_test_wait_parity.rst b/docs/libcudacxx/ptx/instructions/generated/mbarrier_test_wait_parity.rst index e750c4a543f..dded9d97516 100644 --- a/docs/libcudacxx/ptx/instructions/generated/mbarrier_test_wait_parity.rst +++ b/docs/libcudacxx/ptx/instructions/generated/mbarrier_test_wait_parity.rst @@ -3,7 +3,7 @@ mbarrier.test_wait.parity.shared.b64 .. 
code:: cuda // mbarrier.test_wait.parity.shared.b64 waitComplete, [addr], phaseParity; // 3. PTX ISA 71, SM_80 - template + template __device__ static inline bool mbarrier_test_wait_parity( uint64_t* addr, const uint32_t& phaseParity); diff --git a/docs/libcudacxx/ptx/instructions/generated/mbarrier_try_wait.rst b/docs/libcudacxx/ptx/instructions/generated/mbarrier_try_wait.rst index ce648c66ee9..1659b510bf7 100644 --- a/docs/libcudacxx/ptx/instructions/generated/mbarrier_try_wait.rst +++ b/docs/libcudacxx/ptx/instructions/generated/mbarrier_try_wait.rst @@ -3,7 +3,7 @@ mbarrier.try_wait.shared::cta.b64 .. code:: cuda // mbarrier.try_wait.shared::cta.b64 waitComplete, [addr], state; // 5a. PTX ISA 78, SM_90 - template + template __device__ static inline bool mbarrier_try_wait( uint64_t* addr, const uint64_t& state); @@ -13,7 +13,7 @@ mbarrier.try_wait.shared::cta.b64 .. code:: cuda // mbarrier.try_wait.shared::cta.b64 waitComplete, [addr], state, suspendTimeHint; // 5b. PTX ISA 78, SM_90 - template + template __device__ static inline bool mbarrier_try_wait( uint64_t* addr, const uint64_t& state, diff --git a/docs/libcudacxx/ptx/instructions/generated/mbarrier_try_wait_parity.rst b/docs/libcudacxx/ptx/instructions/generated/mbarrier_try_wait_parity.rst index 3210dc0eab1..0be63975238 100644 --- a/docs/libcudacxx/ptx/instructions/generated/mbarrier_try_wait_parity.rst +++ b/docs/libcudacxx/ptx/instructions/generated/mbarrier_try_wait_parity.rst @@ -3,7 +3,7 @@ mbarrier.try_wait.parity.shared::cta.b64 .. code:: cuda // mbarrier.try_wait.parity.shared::cta.b64 waitComplete, [addr], phaseParity; // 7a. PTX ISA 78, SM_90 - template + template __device__ static inline bool mbarrier_try_wait_parity( uint64_t* addr, const uint32_t& phaseParity); @@ -13,7 +13,7 @@ mbarrier.try_wait.parity.shared::cta.b64 .. code:: cuda // mbarrier.try_wait.parity.shared::cta.b64 waitComplete, [addr], phaseParity, suspendTimeHint; // 7b. 
PTX ISA 78, SM_90 - template + template __device__ static inline bool mbarrier_try_wait_parity( uint64_t* addr, const uint32_t& phaseParity, diff --git a/docs/libcudacxx/ptx/instructions/generated/red_async.rst b/docs/libcudacxx/ptx/instructions/generated/red_async.rst index d6b9cf36549..c1a8a9bf2f8 100644 --- a/docs/libcudacxx/ptx/instructions/generated/red_async.rst +++ b/docs/libcudacxx/ptx/instructions/generated/red_async.rst @@ -5,7 +5,7 @@ red.async.relaxed.cluster.shared::cluster.mbarrier::complete_tx::bytes.inc.u32 // red.async.relaxed.cluster.shared::cluster.mbarrier::complete_tx::bytes{.op}{.type} [dest], value, [remote_bar]; // PTX ISA 81, SM_90 // .type = { .u32 } // .op = { .inc } - template + template __device__ static inline void red_async( cuda::ptx::op_inc_t, uint32_t* dest, @@ -19,7 +19,7 @@ red.async.relaxed.cluster.shared::cluster.mbarrier::complete_tx::bytes.dec.u32 // red.async.relaxed.cluster.shared::cluster.mbarrier::complete_tx::bytes{.op}{.type} [dest], value, [remote_bar]; // PTX ISA 81, SM_90 // .type = { .u32 } // .op = { .dec } - template + template __device__ static inline void red_async( cuda::ptx::op_dec_t, uint32_t* dest, @@ -33,7 +33,7 @@ red.async.relaxed.cluster.shared::cluster.mbarrier::complete_tx::bytes.min.u32 // red.async.relaxed.cluster.shared::cluster.mbarrier::complete_tx::bytes{.op}{.type} [dest], value, [remote_bar]; // PTX ISA 81, SM_90 // .type = { .u32 } // .op = { .min } - template + template __device__ static inline void red_async( cuda::ptx::op_min_t, uint32_t* dest, @@ -47,7 +47,7 @@ red.async.relaxed.cluster.shared::cluster.mbarrier::complete_tx::bytes.max.u32 // red.async.relaxed.cluster.shared::cluster.mbarrier::complete_tx::bytes{.op}{.type} [dest], value, [remote_bar]; // PTX ISA 81, SM_90 // .type = { .u32 } // .op = { .max } - template + template __device__ static inline void red_async( cuda::ptx::op_max_t, uint32_t* dest, @@ -61,7 +61,7 @@ red.async.relaxed.cluster.shared::cluster.mbarrier::complete_tx::bytes.add.u32 // red.async.relaxed.cluster.shared::cluster.mbarrier::complete_tx::bytes{.op}{.type} [dest], value, [remote_bar]; // PTX ISA 81, SM_90 // .type = { .u32 } // .op = { .add } - template + template __device__ static inline void red_async( cuda::ptx::op_add_t, uint32_t* dest, @@ -75,7 +75,7 @@ red.async.relaxed.cluster.shared::cluster.mbarrier::complete_tx::bytes.min.s32 // red.async.relaxed.cluster.shared::cluster.mbarrier::complete_tx::bytes{.op}{.type} [dest], value, [remote_bar]; // PTX ISA 81, SM_90 // .type = { .s32 } // .op = { .min } - template + template __device__ static inline void red_async( cuda::ptx::op_min_t, int32_t* dest, @@ -89,7 +89,7 @@ red.async.relaxed.cluster.shared::cluster.mbarrier::complete_tx::bytes.max.s32 // red.async.relaxed.cluster.shared::cluster.mbarrier::complete_tx::bytes{.op}{.type} [dest], value, [remote_bar]; // PTX ISA 81, SM_90 // .type = { .s32 } // .op = { .max } - template + template __device__ static inline void red_async( cuda::ptx::op_max_t, int32_t* dest, @@ -103,7 +103,7 @@ red.async.relaxed.cluster.shared::cluster.mbarrier::complete_tx::bytes.add.s32 // red.async.relaxed.cluster.shared::cluster.mbarrier::complete_tx::bytes{.op}{.type} [dest], value, [remote_bar]; // PTX ISA 81, SM_90 // .type = { .s32 } // .op = { .add } - template + template __device__ static inline void red_async( cuda::ptx::op_add_t, int32_t* dest, @@ -159,7 +159,7 @@ red.async.relaxed.cluster.shared::cluster.mbarrier::complete_tx::bytes.add.u64 // 
red.async.relaxed.cluster.shared::cluster.mbarrier::complete_tx::bytes{.op}{.type} [dest], value, [remote_bar]; // PTX ISA 81, SM_90 // .type = { .u64 } // .op = { .add } - template + template __device__ static inline void red_async( cuda::ptx::op_add_t, uint64_t* dest, @@ -172,7 +172,7 @@ red.async.relaxed.cluster.shared::cluster.mbarrier::complete_tx::bytes.add.u64 // red.async.relaxed.cluster.shared::cluster.mbarrier::complete_tx::bytes{.op}.u64 [dest], value, [remote_bar]; // .u64 intentional PTX ISA 81, SM_90 // .op = { .add } - template + template __device__ static inline void red_async( cuda::ptx::op_add_t, int64_t* dest, diff --git a/libcudacxx/include/cuda/__ptx/instructions/barrier_cluster.h b/libcudacxx/include/cuda/__ptx/instructions/barrier_cluster.h index 8b09ddd1110..93b6a06037c 100644 --- a/libcudacxx/include/cuda/__ptx/instructions/barrier_cluster.h +++ b/libcudacxx/include/cuda/__ptx/instructions/barrier_cluster.h @@ -32,7 +32,7 @@ _LIBCUDACXX_BEGIN_NAMESPACE_CUDA_PTX // 9.7.12.3. Parallel Synchronization and Communication Instructions: barrier.cluster // https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#parallel-synchronization-and-communication-instructions-barrier-cluster -#include +#include _LIBCUDACXX_END_NAMESPACE_CUDA_PTX diff --git a/libcudacxx/include/cuda/__ptx/instructions/cp_async_bulk.h b/libcudacxx/include/cuda/__ptx/instructions/cp_async_bulk.h index 480a02a701e..abfba441ac9 100644 --- a/libcudacxx/include/cuda/__ptx/instructions/cp_async_bulk.h +++ b/libcudacxx/include/cuda/__ptx/instructions/cp_async_bulk.h @@ -32,8 +32,8 @@ _LIBCUDACXX_BEGIN_NAMESPACE_CUDA_PTX // 9.7.8.24.6. Data Movement and Conversion Instructions: cp.async.bulk // https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#data-movement-and-conversion-instructions-cp-async-bulk -#include -#include +#include +#include _LIBCUDACXX_END_NAMESPACE_CUDA_PTX diff --git a/libcudacxx/include/cuda/__ptx/instructions/cp_async_bulk_commit_group.h b/libcudacxx/include/cuda/__ptx/instructions/cp_async_bulk_commit_group.h index bd97259cf19..f9320e975f2 100644 --- a/libcudacxx/include/cuda/__ptx/instructions/cp_async_bulk_commit_group.h +++ b/libcudacxx/include/cuda/__ptx/instructions/cp_async_bulk_commit_group.h @@ -32,7 +32,7 @@ _LIBCUDACXX_BEGIN_NAMESPACE_CUDA_PTX // 9.7.8.24.12. Data Movement and Conversion Instructions: cp.async.bulk.commit_group // https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#data-movement-and-conversion-instructions-cp-async-bulk-commit-group -#include +#include _LIBCUDACXX_END_NAMESPACE_CUDA_PTX diff --git a/libcudacxx/include/cuda/__ptx/instructions/cp_async_bulk_tensor.h b/libcudacxx/include/cuda/__ptx/instructions/cp_async_bulk_tensor.h index 5b9f575ce5f..7de5b41b744 100644 --- a/libcudacxx/include/cuda/__ptx/instructions/cp_async_bulk_tensor.h +++ b/libcudacxx/include/cuda/__ptx/instructions/cp_async_bulk_tensor.h @@ -32,8 +32,8 @@ _LIBCUDACXX_BEGIN_NAMESPACE_CUDA_PTX // 9.7.8.24.9. 
Data Movement and Conversion Instructions: cp.async.bulk.tensor // https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#data-movement-and-conversion-instructions-cp-async-bulk-tensor -#include -#include +#include +#include _LIBCUDACXX_END_NAMESPACE_CUDA_PTX diff --git a/libcudacxx/include/cuda/__ptx/instructions/cp_async_bulk_wait_group.h b/libcudacxx/include/cuda/__ptx/instructions/cp_async_bulk_wait_group.h index 00a3700e1a9..0d933e2cc34 100644 --- a/libcudacxx/include/cuda/__ptx/instructions/cp_async_bulk_wait_group.h +++ b/libcudacxx/include/cuda/__ptx/instructions/cp_async_bulk_wait_group.h @@ -32,7 +32,7 @@ _LIBCUDACXX_BEGIN_NAMESPACE_CUDA_PTX // 9.7.8.24.13. Data Movement and Conversion Instructions: cp.async.bulk.wait_group // https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#data-movement-and-conversion-instructions-cp-async-bulk-wait-group -#include +#include _LIBCUDACXX_END_NAMESPACE_CUDA_PTX diff --git a/libcudacxx/include/cuda/__ptx/instructions/cp_reduce_async_bulk.h b/libcudacxx/include/cuda/__ptx/instructions/cp_reduce_async_bulk.h index ee6d90bc4d9..f1487301ada 100644 --- a/libcudacxx/include/cuda/__ptx/instructions/cp_reduce_async_bulk.h +++ b/libcudacxx/include/cuda/__ptx/instructions/cp_reduce_async_bulk.h @@ -43,12 +43,12 @@ _LIBCUDACXX_BEGIN_NAMESPACE_CUDA_PTX // 9.7.8.24.7. Data Movement and Conversion Instructions: cp.reduce.async.bulk // https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#data-movement-and-conversion-instructions-cp-reduce-async-bulk -#include +#include #ifdef _LIBCUDACXX_HAS_NVF16 -# include +# include #endif // _LIBCUDACXX_HAS_NVF16 #ifdef _LIBCUDACXX_HAS_NVBF16 -# include +# include #endif // _LIBCUDACXX_HAS_NVBF16 _LIBCUDACXX_END_NAMESPACE_CUDA_PTX diff --git a/libcudacxx/include/cuda/__ptx/instructions/cp_reduce_async_bulk_tensor.h b/libcudacxx/include/cuda/__ptx/instructions/cp_reduce_async_bulk_tensor.h index a6b23a706c7..436c42d4c3f 100644 --- a/libcudacxx/include/cuda/__ptx/instructions/cp_reduce_async_bulk_tensor.h +++ b/libcudacxx/include/cuda/__ptx/instructions/cp_reduce_async_bulk_tensor.h @@ -32,7 +32,7 @@ _LIBCUDACXX_BEGIN_NAMESPACE_CUDA_PTX // 9.7.8.24.10. Data Movement and Conversion Instructions: cp.reduce.async.bulk.tensor // https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#data-movement-and-conversion-instructions-cp-reduce-async-bulk-tensor -#include +#include _LIBCUDACXX_END_NAMESPACE_CUDA_PTX diff --git a/libcudacxx/include/cuda/__ptx/instructions/fence.h b/libcudacxx/include/cuda/__ptx/instructions/fence.h index 045f09cb40e..a8dccf979c2 100644 --- a/libcudacxx/include/cuda/__ptx/instructions/fence.h +++ b/libcudacxx/include/cuda/__ptx/instructions/fence.h @@ -32,11 +32,11 @@ _LIBCUDACXX_BEGIN_NAMESPACE_CUDA_PTX // 9.7.12.4. 
Parallel Synchronization and Communication Instructions: membar/fence // https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#parallel-synchronization-and-communication-instructions-membar-fence -#include -#include -#include -#include -#include +#include +#include +#include +#include +#include _LIBCUDACXX_END_NAMESPACE_CUDA_PTX diff --git a/libcudacxx/include/cuda/__ptx/instructions/generated/barrier_cluster.inc b/libcudacxx/include/cuda/__ptx/instructions/generated/barrier_cluster.h similarity index 93% rename from libcudacxx/include/cuda/__ptx/instructions/generated/barrier_cluster.inc rename to libcudacxx/include/cuda/__ptx/instructions/generated/barrier_cluster.h index ca9238bc3ff..976bbfead31 100644 --- a/libcudacxx/include/cuda/__ptx/instructions/generated/barrier_cluster.inc +++ b/libcudacxx/include/cuda/__ptx/instructions/generated/barrier_cluster.h @@ -1,7 +1,10 @@ +#ifndef _CUDA_PTX_GENERATED_BARRIER_CLUSTER_H_ +#define _CUDA_PTX_GENERATED_BARRIER_CLUSTER_H_ + /* // barrier.cluster.arrive; // PTX ISA 78, SM_90 // Marked volatile and as clobbering memory -template +template __device__ static inline void barrier_cluster_arrive(); */ #if __cccl_ptx_isa >= 780 @@ -24,7 +27,7 @@ _CCCL_DEVICE static inline void barrier_cluster_arrive() /* // barrier.cluster.wait; // PTX ISA 78, SM_90 // Marked volatile and as clobbering memory -template +template __device__ static inline void barrier_cluster_wait(); */ #if __cccl_ptx_isa >= 780 @@ -48,7 +51,7 @@ _CCCL_DEVICE static inline void barrier_cluster_wait() // barrier.cluster.arrive.sem; // PTX ISA 80, SM_90 // .sem = { .release } // Marked volatile and as clobbering memory -template +template __device__ static inline void barrier_cluster_arrive( cuda::ptx::sem_release_t); */ @@ -74,7 +77,7 @@ _CCCL_DEVICE static inline void barrier_cluster_arrive(sem_release_t) // barrier.cluster.arrive.sem; // PTX ISA 80, SM_90 // .sem = { .relaxed } // Marked volatile -template +template __device__ static inline void barrier_cluster_arrive( cuda::ptx::sem_relaxed_t); */ @@ -100,7 +103,7 @@ _CCCL_DEVICE static inline void barrier_cluster_arrive(sem_relaxed_t) // barrier.cluster.wait.sem; // PTX ISA 80, SM_90 // .sem = { .acquire } // Marked volatile and as clobbering memory -template +template __device__ static inline void barrier_cluster_wait( cuda::ptx::sem_acquire_t); */ @@ -121,3 +124,5 @@ _CCCL_DEVICE static inline void barrier_cluster_wait(sem_acquire_t) __cuda_ptx_barrier_cluster_wait_is_not_supported_before_SM_90__();)); } #endif // __cccl_ptx_isa >= 800 + +#endif // _CUDA_PTX_GENERATED_BARRIER_CLUSTER_H_ diff --git a/libcudacxx/include/cuda/__ptx/instructions/generated/cp_async_bulk.inc b/libcudacxx/include/cuda/__ptx/instructions/generated/cp_async_bulk.h similarity index 94% rename from libcudacxx/include/cuda/__ptx/instructions/generated/cp_async_bulk.inc rename to libcudacxx/include/cuda/__ptx/instructions/generated/cp_async_bulk.h index 69f77053b95..60eccae6818 100644 --- a/libcudacxx/include/cuda/__ptx/instructions/generated/cp_async_bulk.inc +++ b/libcudacxx/include/cuda/__ptx/instructions/generated/cp_async_bulk.h @@ -1,9 +1,12 @@ +#ifndef _CUDA_PTX_GENERATED_CP_ASYNC_BULK_H_ +#define _CUDA_PTX_GENERATED_CP_ASYNC_BULK_H_ + /* // cp.async.bulk.dst.src.mbarrier::complete_tx::bytes [dstMem], [srcMem], size, [smem_bar]; // 1a. 
unicast PTX ISA 80, SM_90 // .dst = { .shared::cluster } // .src = { .global } -template +template __device__ static inline void cp_async_bulk( cuda::ptx::space_cluster_t, cuda::ptx::space_global_t, @@ -41,7 +44,7 @@ _CCCL_DEVICE static inline void cp_async_bulk( // cp.async.bulk.dst.src.mbarrier::complete_tx::bytes [dstMem], [srcMem], size, [rdsmem_bar]; // 2. PTX ISA 80, SM_90 // .dst = { .shared::cluster } // .src = { .shared::cta } -template +template __device__ static inline void cp_async_bulk( cuda::ptx::space_cluster_t, cuda::ptx::space_shared_t, @@ -82,7 +85,7 @@ _CCCL_DEVICE static inline void cp_async_bulk( // cp.async.bulk.dst.src.bulk_group [dstMem], [srcMem], size; // 3. PTX ISA 80, SM_90 // .dst = { .global } // .src = { .shared::cta } -template +template __device__ static inline void cp_async_bulk( cuda::ptx::space_global_t, cuda::ptx::space_shared_t, @@ -109,3 +112,5 @@ cp_async_bulk(space_global_t, space_shared_t, void* __dstMem, const void* __srcM __cuda_ptx_cp_async_bulk_is_not_supported_before_SM_90__();)); } #endif // __cccl_ptx_isa >= 800 + +#endif // _CUDA_PTX_GENERATED_CP_ASYNC_BULK_H_ diff --git a/libcudacxx/include/cuda/__ptx/instructions/generated/cp_async_bulk_commit_group.inc b/libcudacxx/include/cuda/__ptx/instructions/generated/cp_async_bulk_commit_group.h similarity index 77% rename from libcudacxx/include/cuda/__ptx/instructions/generated/cp_async_bulk_commit_group.inc rename to libcudacxx/include/cuda/__ptx/instructions/generated/cp_async_bulk_commit_group.h index 24baddaea8f..46624839d9f 100644 --- a/libcudacxx/include/cuda/__ptx/instructions/generated/cp_async_bulk_commit_group.inc +++ b/libcudacxx/include/cuda/__ptx/instructions/generated/cp_async_bulk_commit_group.h @@ -1,6 +1,9 @@ +#ifndef _CUDA_PTX_GENERATED_CP_ASYNC_BULK_COMMIT_GROUP_H_ +#define _CUDA_PTX_GENERATED_CP_ASYNC_BULK_COMMIT_GROUP_H_ + /* // cp.async.bulk.commit_group; // PTX ISA 80, SM_90 -template +template __device__ static inline void cp_async_bulk_commit_group(); */ #if __cccl_ptx_isa >= 800 @@ -19,3 +22,5 @@ _CCCL_DEVICE static inline void cp_async_bulk_commit_group() __cuda_ptx_cp_async_bulk_commit_group_is_not_supported_before_SM_90__();)); } #endif // __cccl_ptx_isa >= 800 + +#endif // _CUDA_PTX_GENERATED_CP_ASYNC_BULK_COMMIT_GROUP_H_ diff --git a/libcudacxx/include/cuda/__ptx/instructions/generated/cp_async_bulk_multicast.inc b/libcudacxx/include/cuda/__ptx/instructions/generated/cp_async_bulk_multicast.h similarity index 88% rename from libcudacxx/include/cuda/__ptx/instructions/generated/cp_async_bulk_multicast.inc rename to libcudacxx/include/cuda/__ptx/instructions/generated/cp_async_bulk_multicast.h index cdd5a535eb6..1eec2156ee5 100644 --- a/libcudacxx/include/cuda/__ptx/instructions/generated/cp_async_bulk_multicast.inc +++ b/libcudacxx/include/cuda/__ptx/instructions/generated/cp_async_bulk_multicast.h @@ -1,9 +1,12 @@ +#ifndef _CUDA_PTX_GENERATED_CP_ASYNC_BULK_MULTICAST_H_ +#define _CUDA_PTX_GENERATED_CP_ASYNC_BULK_MULTICAST_H_ + /* // cp.async.bulk{.dst}{.src}.mbarrier::complete_tx::bytes.multicast::cluster [dstMem], [srcMem], size, [smem_bar], ctaMask; // 1. 
PTX ISA 80, SM_90a // .dst = { .shared::cluster } // .src = { .global } -template +template __device__ static inline void cp_async_bulk( cuda::ptx::space_cluster_t, cuda::ptx::space_global_t, @@ -43,3 +46,5 @@ _CCCL_DEVICE static inline void cp_async_bulk( __cuda_ptx_cp_async_bulk_is_not_supported_before_SM_90a__();)); } #endif // __cccl_ptx_isa >= 800 + +#endif // _CUDA_PTX_GENERATED_CP_ASYNC_BULK_MULTICAST_H_ diff --git a/libcudacxx/include/cuda/__ptx/instructions/generated/cp_async_bulk_tensor.inc b/libcudacxx/include/cuda/__ptx/instructions/generated/cp_async_bulk_tensor.h similarity index 97% rename from libcudacxx/include/cuda/__ptx/instructions/generated/cp_async_bulk_tensor.inc rename to libcudacxx/include/cuda/__ptx/instructions/generated/cp_async_bulk_tensor.h index 547888d5b0f..ed10727d60c 100644 --- a/libcudacxx/include/cuda/__ptx/instructions/generated/cp_async_bulk_tensor.inc +++ b/libcudacxx/include/cuda/__ptx/instructions/generated/cp_async_bulk_tensor.h @@ -1,9 +1,12 @@ +#ifndef _CUDA_PTX_GENERATED_CP_ASYNC_BULK_TENSOR_H_ +#define _CUDA_PTX_GENERATED_CP_ASYNC_BULK_TENSOR_H_ + /* // cp.async.bulk.tensor.1d.dst.src.tile.mbarrier::complete_tx::bytes [dstMem], [tensorMap, tensorCoords], [smem_bar];// 1a. PTX ISA 80, SM_90 // .dst = { .shared::cluster } // .src = { .global } -template +template __device__ static inline void cp_async_bulk_tensor( cuda::ptx::space_cluster_t, cuda::ptx::space_global_t, @@ -42,7 +45,7 @@ _CCCL_DEVICE static inline void cp_async_bulk_tensor( // cp.async.bulk.tensor.1d.dst.src.tile.bulk_group [tensorMap, tensorCoords], [srcMem]; // 3a. PTX ISA 80, SM_90 // .dst = { .global } // .src = { .shared::cta } -template +template __device__ static inline void cp_async_bulk_tensor( cuda::ptx::space_global_t, cuda::ptx::space_shared_t, @@ -79,7 +82,7 @@ _CCCL_DEVICE static inline void cp_async_bulk_tensor( 1b. PTX ISA 80, SM_90 // .dst = { .shared::cluster } // .src = { .global } -template +template __device__ static inline void cp_async_bulk_tensor( cuda::ptx::space_cluster_t, cuda::ptx::space_global_t, @@ -122,7 +125,7 @@ _CCCL_DEVICE static inline void cp_async_bulk_tensor( // cp.async.bulk.tensor.2d.dst.src.tile.bulk_group [tensorMap, tensorCoords], [srcMem]; // 3b. PTX ISA 80, SM_90 // .dst = { .global } // .src = { .shared::cta } -template +template __device__ static inline void cp_async_bulk_tensor( cuda::ptx::space_global_t, cuda::ptx::space_shared_t, @@ -159,7 +162,7 @@ _CCCL_DEVICE static inline void cp_async_bulk_tensor( 1c. PTX ISA 80, SM_90 // .dst = { .shared::cluster } // .src = { .global } -template +template __device__ static inline void cp_async_bulk_tensor( cuda::ptx::space_cluster_t, cuda::ptx::space_global_t, @@ -203,7 +206,7 @@ _CCCL_DEVICE static inline void cp_async_bulk_tensor( // cp.async.bulk.tensor.3d.dst.src.tile.bulk_group [tensorMap, tensorCoords], [srcMem]; // 3c. PTX ISA 80, SM_90 // .dst = { .global } // .src = { .shared::cta } -template +template __device__ static inline void cp_async_bulk_tensor( cuda::ptx::space_global_t, cuda::ptx::space_shared_t, @@ -244,7 +247,7 @@ _CCCL_DEVICE static inline void cp_async_bulk_tensor( 1d. PTX ISA 80, SM_90 // .dst = { .shared::cluster } // .src = { .global } -template +template __device__ static inline void cp_async_bulk_tensor( cuda::ptx::space_cluster_t, cuda::ptx::space_global_t, @@ -289,7 +292,7 @@ _CCCL_DEVICE static inline void cp_async_bulk_tensor( // cp.async.bulk.tensor.4d.dst.src.tile.bulk_group [tensorMap, tensorCoords], [srcMem]; // 3d. 
PTX ISA 80, SM_90 // .dst = { .global } // .src = { .shared::cta } -template +template __device__ static inline void cp_async_bulk_tensor( cuda::ptx::space_global_t, cuda::ptx::space_shared_t, @@ -331,7 +334,7 @@ _CCCL_DEVICE static inline void cp_async_bulk_tensor( 1e. PTX ISA 80, SM_90 // .dst = { .shared::cluster } // .src = { .global } -template +template __device__ static inline void cp_async_bulk_tensor( cuda::ptx::space_cluster_t, cuda::ptx::space_global_t, @@ -377,7 +380,7 @@ _CCCL_DEVICE static inline void cp_async_bulk_tensor( // cp.async.bulk.tensor.5d.dst.src.tile.bulk_group [tensorMap, tensorCoords], [srcMem]; // 3e. PTX ISA 80, SM_90 // .dst = { .global } // .src = { .shared::cta } -template +template __device__ static inline void cp_async_bulk_tensor( cuda::ptx::space_global_t, cuda::ptx::space_shared_t, @@ -414,3 +417,5 @@ _CCCL_DEVICE static inline void cp_async_bulk_tensor( __cuda_ptx_cp_async_bulk_tensor_is_not_supported_before_SM_90__();)); } #endif // __cccl_ptx_isa >= 800 + +#endif // _CUDA_PTX_GENERATED_CP_ASYNC_BULK_TENSOR_H_ diff --git a/libcudacxx/include/cuda/__ptx/instructions/generated/cp_async_bulk_tensor_multicast.inc b/libcudacxx/include/cuda/__ptx/instructions/generated/cp_async_bulk_tensor_multicast.h similarity index 96% rename from libcudacxx/include/cuda/__ptx/instructions/generated/cp_async_bulk_tensor_multicast.inc rename to libcudacxx/include/cuda/__ptx/instructions/generated/cp_async_bulk_tensor_multicast.h index 020698a15b1..aa376141dc0 100644 --- a/libcudacxx/include/cuda/__ptx/instructions/generated/cp_async_bulk_tensor_multicast.inc +++ b/libcudacxx/include/cuda/__ptx/instructions/generated/cp_async_bulk_tensor_multicast.h @@ -1,9 +1,12 @@ +#ifndef _CUDA_PTX_GENERATED_CP_ASYNC_BULK_TENSOR_MULTICAST_H_ +#define _CUDA_PTX_GENERATED_CP_ASYNC_BULK_TENSOR_MULTICAST_H_ + /* // cp.async.bulk.tensor.1d.dst.src.tile.mbarrier::complete_tx::bytes.multicast::cluster [dstMem], [tensorMap, tensorCoords], [smem_bar], ctaMask; // 2a. PTX ISA 80, SM_90a // .dst = { .shared::cluster } // .src = { .global } -template +template __device__ static inline void cp_async_bulk_tensor( cuda::ptx::space_cluster_t, cuda::ptx::space_global_t, @@ -49,7 +52,7 @@ _CCCL_DEVICE static inline void cp_async_bulk_tensor( tensorCoords], [smem_bar], ctaMask; // 2b. PTX ISA 80, SM_90a // .dst = { .shared::cluster } // .src = { .global } -template +template __device__ static inline void cp_async_bulk_tensor( cuda::ptx::space_cluster_t, cuda::ptx::space_global_t, @@ -96,7 +99,7 @@ _CCCL_DEVICE static inline void cp_async_bulk_tensor( tensorCoords], [smem_bar], ctaMask; // 2c. PTX ISA 80, SM_90a // .dst = { .shared::cluster } // .src = { .global } -template +template __device__ static inline void cp_async_bulk_tensor( cuda::ptx::space_cluster_t, cuda::ptx::space_global_t, @@ -144,7 +147,7 @@ _CCCL_DEVICE static inline void cp_async_bulk_tensor( tensorCoords], [smem_bar], ctaMask; // 2d. PTX ISA 80, SM_90a // .dst = { .shared::cluster } // .src = { .global } -template +template __device__ static inline void cp_async_bulk_tensor( cuda::ptx::space_cluster_t, cuda::ptx::space_global_t, @@ -193,7 +196,7 @@ _CCCL_DEVICE static inline void cp_async_bulk_tensor( tensorCoords], [smem_bar], ctaMask; // 2e. 
PTX ISA 80, SM_90a // .dst = { .shared::cluster } // .src = { .global } -template +template __device__ static inline void cp_async_bulk_tensor( cuda::ptx::space_cluster_t, cuda::ptx::space_global_t, @@ -237,3 +240,5 @@ _CCCL_DEVICE static inline void cp_async_bulk_tensor( __cuda_ptx_cp_async_bulk_tensor_is_not_supported_before_SM_90a__();)); } #endif // __cccl_ptx_isa >= 800 + +#endif // _CUDA_PTX_GENERATED_CP_ASYNC_BULK_TENSOR_MULTICAST_H_ diff --git a/libcudacxx/include/cuda/__ptx/instructions/generated/cp_async_bulk_wait_group.inc b/libcudacxx/include/cuda/__ptx/instructions/generated/cp_async_bulk_wait_group.h similarity index 84% rename from libcudacxx/include/cuda/__ptx/instructions/generated/cp_async_bulk_wait_group.inc rename to libcudacxx/include/cuda/__ptx/instructions/generated/cp_async_bulk_wait_group.h index 1a715a0fac6..4ec68009575 100644 --- a/libcudacxx/include/cuda/__ptx/instructions/generated/cp_async_bulk_wait_group.inc +++ b/libcudacxx/include/cuda/__ptx/instructions/generated/cp_async_bulk_wait_group.h @@ -1,3 +1,6 @@ +#ifndef _CUDA_PTX_GENERATED_CP_ASYNC_BULK_WAIT_GROUP_H_ +#define _CUDA_PTX_GENERATED_CP_ASYNC_BULK_WAIT_GROUP_H_ + /* // cp.async.bulk.wait_group N; // PTX ISA 80, SM_90 template @@ -7,13 +10,13 @@ __device__ static inline void cp_async_bulk_wait_group( #if __cccl_ptx_isa >= 800 extern "C" _CCCL_DEVICE void __cuda_ptx_cp_async_bulk_wait_group_is_not_supported_before_SM_90__(); template -_CCCL_DEVICE static inline void cp_async_bulk_wait_group(n32_t<_N32> __n) +_CCCL_DEVICE static inline void cp_async_bulk_wait_group(n32_t<_N32> __N) { NV_IF_ELSE_TARGET( NV_PROVIDES_SM_90, (asm volatile("cp.async.bulk.wait_group %0;" : - : "n"(__n.value) + : "n"(__N.value) : "memory");), ( // Unsupported architectures will have a linker error with a semi-decent error message @@ -30,16 +33,18 @@ __device__ static inline void cp_async_bulk_wait_group_read( #if __cccl_ptx_isa >= 800 extern "C" _CCCL_DEVICE void __cuda_ptx_cp_async_bulk_wait_group_read_is_not_supported_before_SM_90__(); template -_CCCL_DEVICE static inline void cp_async_bulk_wait_group_read(n32_t<_N32> __n) +_CCCL_DEVICE static inline void cp_async_bulk_wait_group_read(n32_t<_N32> __N) { NV_IF_ELSE_TARGET( NV_PROVIDES_SM_90, (asm volatile("cp.async.bulk.wait_group.read %0;" : - : "n"(__n.value) + : "n"(__N.value) : "memory");), ( // Unsupported architectures will have a linker error with a semi-decent error message __cuda_ptx_cp_async_bulk_wait_group_read_is_not_supported_before_SM_90__();)); } #endif // __cccl_ptx_isa >= 800 + +#endif // _CUDA_PTX_GENERATED_CP_ASYNC_BULK_WAIT_GROUP_H_ diff --git a/libcudacxx/include/cuda/__ptx/instructions/generated/cp_reduce_async_bulk.inc b/libcudacxx/include/cuda/__ptx/instructions/generated/cp_reduce_async_bulk.h similarity index 98% rename from libcudacxx/include/cuda/__ptx/instructions/generated/cp_reduce_async_bulk.inc rename to libcudacxx/include/cuda/__ptx/instructions/generated/cp_reduce_async_bulk.h index 50059ff6c5b..94545f2eb47 100644 --- a/libcudacxx/include/cuda/__ptx/instructions/generated/cp_reduce_async_bulk.inc +++ b/libcudacxx/include/cuda/__ptx/instructions/generated/cp_reduce_async_bulk.h @@ -1,5 +1,6 @@ -// 9.7.8.24.7. 
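// Illustrative usage sketch (not part of the regenerated header): draining outstanding
// bulk copies with the wait_group wrappers shown above. The n32_t<0>{} spelling of the
// immediate operand mirrors the n32_t<_N32> parameter; the function name is made up.
#include <cuda/ptx>

__device__ void drain_bulk_copies()
{
  cuda::ptx::cp_async_bulk_commit_group();                    // close the current bulk-async group
  cuda::ptx::cp_async_bulk_wait_group(cuda::ptx::n32_t<0>{}); // wait until no groups remain pending
}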
Data Movement and Conversion Instructions: cp.reduce.async.bulk -// https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#data-movement-and-conversion-instructions-cp-reduce-async-bulk +#ifndef _CUDA_PTX_GENERATED_CP_REDUCE_ASYNC_BULK_H_ +#define _CUDA_PTX_GENERATED_CP_REDUCE_ASYNC_BULK_H_ + /* // cp.reduce.async.bulk.dst.src.mbarrier::complete_tx::bytes.op.type [dstMem], [srcMem], size, [rdsmem_bar]; // 1. PTX ISA 80, SM_90 @@ -154,7 +155,7 @@ ISA 80, SM_90 // .src = { .shared::cta } // .type = { .u32 } // .op = { .min } -template +template __device__ static inline void cp_reduce_async_bulk( cuda::ptx::space_cluster_t, cuda::ptx::space_shared_t, @@ -203,7 +204,7 @@ ISA 80, SM_90 // .src = { .shared::cta } // .type = { .u32 } // .op = { .max } -template +template __device__ static inline void cp_reduce_async_bulk( cuda::ptx::space_cluster_t, cuda::ptx::space_shared_t, @@ -252,7 +253,7 @@ ISA 80, SM_90 // .src = { .shared::cta } // .type = { .u32 } // .op = { .add } -template +template __device__ static inline void cp_reduce_async_bulk( cuda::ptx::space_cluster_t, cuda::ptx::space_shared_t, @@ -301,7 +302,7 @@ ISA 80, SM_90 // .src = { .shared::cta } // .type = { .u32 } // .op = { .inc } -template +template __device__ static inline void cp_reduce_async_bulk( cuda::ptx::space_cluster_t, cuda::ptx::space_shared_t, @@ -350,7 +351,7 @@ ISA 80, SM_90 // .src = { .shared::cta } // .type = { .u32 } // .op = { .dec } -template +template __device__ static inline void cp_reduce_async_bulk( cuda::ptx::space_cluster_t, cuda::ptx::space_shared_t, @@ -399,7 +400,7 @@ ISA 80, SM_90 // .src = { .shared::cta } // .type = { .s32 } // .op = { .min } -template +template __device__ static inline void cp_reduce_async_bulk( cuda::ptx::space_cluster_t, cuda::ptx::space_shared_t, @@ -448,7 +449,7 @@ ISA 80, SM_90 // .src = { .shared::cta } // .type = { .s32 } // .op = { .max } -template +template __device__ static inline void cp_reduce_async_bulk( cuda::ptx::space_cluster_t, cuda::ptx::space_shared_t, @@ -497,7 +498,7 @@ ISA 80, SM_90 // .src = { .shared::cta } // .type = { .s32 } // .op = { .add } -template +template __device__ static inline void cp_reduce_async_bulk( cuda::ptx::space_cluster_t, cuda::ptx::space_shared_t, @@ -546,7 +547,7 @@ ISA 80, SM_90 // .src = { .shared::cta } // .type = { .u64 } // .op = { .add } -template +template __device__ static inline void cp_reduce_async_bulk( cuda::ptx::space_cluster_t, cuda::ptx::space_shared_t, @@ -595,7 +596,7 @@ ISA 80, SM_90 // .src = { .shared::cta } // .type = { .s64 } // .op = { .add } -template +template __device__ static inline void cp_reduce_async_bulk( cuda::ptx::space_cluster_t, cuda::ptx::space_shared_t, @@ -670,7 +671,7 @@ _CCCL_DEVICE static inline void cp_reduce_async_bulk( : : "l"(__as_ptr_gmem(__dstMem)), "r"(__as_ptr_smem(__srcMem)), "r"(__size) : "memory"); - } _CCCL_ELSE_IF_CONSTEXPR (sizeof(_Type) == 8) { + } else _CCCL_IF_CONSTEXPR (sizeof(_Type) == 8) { asm("cp.reduce.async.bulk.global.shared::cta.bulk_group.and.b64 [%0], [%1], %2; // 3." : : "l"(__as_ptr_gmem(__dstMem)), "r"(__as_ptr_smem(__srcMem)), "r"(__size) @@ -715,7 +716,7 @@ _CCCL_DEVICE static inline void cp_reduce_async_bulk( : : "l"(__as_ptr_gmem(__dstMem)), "r"(__as_ptr_smem(__srcMem)), "r"(__size) : "memory"); - } _CCCL_ELSE_IF_CONSTEXPR (sizeof(_Type) == 8) { + } else _CCCL_IF_CONSTEXPR (sizeof(_Type) == 8) { asm("cp.reduce.async.bulk.global.shared::cta.bulk_group.or.b64 [%0], [%1], %2; // 3." 
: : "l"(__as_ptr_gmem(__dstMem)), "r"(__as_ptr_smem(__srcMem)), "r"(__size) @@ -760,7 +761,7 @@ _CCCL_DEVICE static inline void cp_reduce_async_bulk( : : "l"(__as_ptr_gmem(__dstMem)), "r"(__as_ptr_smem(__srcMem)), "r"(__size) : "memory"); - } _CCCL_ELSE_IF_CONSTEXPR (sizeof(_Type) == 8) { + } else _CCCL_IF_CONSTEXPR (sizeof(_Type) == 8) { asm("cp.reduce.async.bulk.global.shared::cta.bulk_group.xor.b64 [%0], [%1], %2; // 3." : : "l"(__as_ptr_gmem(__dstMem)), "r"(__as_ptr_smem(__srcMem)), "r"(__size) @@ -778,7 +779,7 @@ _CCCL_DEVICE static inline void cp_reduce_async_bulk( // .src = { .shared::cta } // .type = { .u32 } // .op = { .min } -template +template __device__ static inline void cp_reduce_async_bulk( cuda::ptx::space_global_t, cuda::ptx::space_shared_t, @@ -820,7 +821,7 @@ _CCCL_DEVICE static inline void cp_reduce_async_bulk( // .src = { .shared::cta } // .type = { .u32 } // .op = { .max } -template +template __device__ static inline void cp_reduce_async_bulk( cuda::ptx::space_global_t, cuda::ptx::space_shared_t, @@ -862,7 +863,7 @@ _CCCL_DEVICE static inline void cp_reduce_async_bulk( // .src = { .shared::cta } // .type = { .u32 } // .op = { .add } -template +template __device__ static inline void cp_reduce_async_bulk( cuda::ptx::space_global_t, cuda::ptx::space_shared_t, @@ -904,7 +905,7 @@ _CCCL_DEVICE static inline void cp_reduce_async_bulk( // .src = { .shared::cta } // .type = { .u32 } // .op = { .inc } -template +template __device__ static inline void cp_reduce_async_bulk( cuda::ptx::space_global_t, cuda::ptx::space_shared_t, @@ -946,7 +947,7 @@ _CCCL_DEVICE static inline void cp_reduce_async_bulk( // .src = { .shared::cta } // .type = { .u32 } // .op = { .dec } -template +template __device__ static inline void cp_reduce_async_bulk( cuda::ptx::space_global_t, cuda::ptx::space_shared_t, @@ -988,7 +989,7 @@ _CCCL_DEVICE static inline void cp_reduce_async_bulk( // .src = { .shared::cta } // .type = { .s32 } // .op = { .min } -template +template __device__ static inline void cp_reduce_async_bulk( cuda::ptx::space_global_t, cuda::ptx::space_shared_t, @@ -1030,7 +1031,7 @@ _CCCL_DEVICE static inline void cp_reduce_async_bulk( // .src = { .shared::cta } // .type = { .s32 } // .op = { .max } -template +template __device__ static inline void cp_reduce_async_bulk( cuda::ptx::space_global_t, cuda::ptx::space_shared_t, @@ -1072,7 +1073,7 @@ _CCCL_DEVICE static inline void cp_reduce_async_bulk( // .src = { .shared::cta } // .type = { .s32 } // .op = { .add } -template +template __device__ static inline void cp_reduce_async_bulk( cuda::ptx::space_global_t, cuda::ptx::space_shared_t, @@ -1114,7 +1115,7 @@ _CCCL_DEVICE static inline void cp_reduce_async_bulk( // .src = { .shared::cta } // .type = { .u64 } // .op = { .min } -template +template __device__ static inline void cp_reduce_async_bulk( cuda::ptx::space_global_t, cuda::ptx::space_shared_t, @@ -1156,7 +1157,7 @@ _CCCL_DEVICE static inline void cp_reduce_async_bulk( // .src = { .shared::cta } // .type = { .u64 } // .op = { .max } -template +template __device__ static inline void cp_reduce_async_bulk( cuda::ptx::space_global_t, cuda::ptx::space_shared_t, @@ -1198,7 +1199,7 @@ _CCCL_DEVICE static inline void cp_reduce_async_bulk( // .src = { .shared::cta } // .type = { .u64 } // .op = { .add } -template +template __device__ static inline void cp_reduce_async_bulk( cuda::ptx::space_global_t, cuda::ptx::space_shared_t, @@ -1240,7 +1241,7 @@ _CCCL_DEVICE static inline void cp_reduce_async_bulk( // .src = { .shared::cta } // .type = { .s64 } // 
.op = { .min } -template +template __device__ static inline void cp_reduce_async_bulk( cuda::ptx::space_global_t, cuda::ptx::space_shared_t, @@ -1282,7 +1283,7 @@ _CCCL_DEVICE static inline void cp_reduce_async_bulk( // .src = { .shared::cta } // .type = { .s64 } // .op = { .max } -template +template __device__ static inline void cp_reduce_async_bulk( cuda::ptx::space_global_t, cuda::ptx::space_shared_t, @@ -1324,7 +1325,7 @@ _CCCL_DEVICE static inline void cp_reduce_async_bulk( // .src = { .shared::cta } // .type = { .f32 } // .op = { .add } -template +template __device__ static inline void cp_reduce_async_bulk( cuda::ptx::space_global_t, cuda::ptx::space_shared_t, @@ -1361,7 +1362,7 @@ _CCCL_DEVICE static inline void cp_reduce_async_bulk( // .src = { .shared::cta } // .type = { .f64 } // .op = { .add } -template +template __device__ static inline void cp_reduce_async_bulk( cuda::ptx::space_global_t, cuda::ptx::space_shared_t, @@ -1398,7 +1399,7 @@ _CCCL_DEVICE static inline void cp_reduce_async_bulk( // .src = { .shared::cta } // .type = { .s64 } // .op = { .add } -template +template __device__ static inline void cp_reduce_async_bulk( cuda::ptx::space_global_t, cuda::ptx::space_shared_t, @@ -1433,3 +1434,5 @@ _CCCL_DEVICE static inline void cp_reduce_async_bulk( __cuda_ptx_cp_reduce_async_bulk_is_not_supported_before_SM_90__();)); } #endif // __cccl_ptx_isa >= 800 + +#endif // _CUDA_PTX_GENERATED_CP_REDUCE_ASYNC_BULK_H_ diff --git a/libcudacxx/include/cuda/__ptx/instructions/generated/cp_reduce_async_bulk_bf16.inc b/libcudacxx/include/cuda/__ptx/instructions/generated/cp_reduce_async_bulk_bf16.h similarity index 90% rename from libcudacxx/include/cuda/__ptx/instructions/generated/cp_reduce_async_bulk_bf16.inc rename to libcudacxx/include/cuda/__ptx/instructions/generated/cp_reduce_async_bulk_bf16.h index c657e8d1935..455cc294545 100644 --- a/libcudacxx/include/cuda/__ptx/instructions/generated/cp_reduce_async_bulk_bf16.inc +++ b/libcudacxx/include/cuda/__ptx/instructions/generated/cp_reduce_async_bulk_bf16.h @@ -1,11 +1,13 @@ -#ifdef _LIBCUDACXX_HAS_NVBF16 +#ifndef _CUDA_PTX_GENERATED_CP_REDUCE_ASYNC_BULK_BF16_H_ +#define _CUDA_PTX_GENERATED_CP_REDUCE_ASYNC_BULK_BF16_H_ + /* // cp.reduce.async.bulk.dst.src.bulk_group.op.type [dstMem], [srcMem], size; // 4. PTX ISA 80, SM_90 // .dst = { .global } // .src = { .shared::cta } // .type = { .bf16 } // .op = { .min } -template +template __device__ static inline void cp_reduce_async_bulk( cuda::ptx::space_global_t, cuda::ptx::space_shared_t, @@ -14,7 +16,7 @@ __device__ static inline void cp_reduce_async_bulk( const __nv_bfloat16* srcMem, uint32_t size); */ -# if __cccl_ptx_isa >= 800 +#if __cccl_ptx_isa >= 800 extern "C" _CCCL_DEVICE void __cuda_ptx_cp_reduce_async_bulk_is_not_supported_before_SM_90__(); template _CCCL_DEVICE static inline void cp_reduce_async_bulk( @@ -39,7 +41,7 @@ _CCCL_DEVICE static inline void cp_reduce_async_bulk( // Unsupported architectures will have a linker error with a semi-decent error message __cuda_ptx_cp_reduce_async_bulk_is_not_supported_before_SM_90__();)); } -# endif // __cccl_ptx_isa >= 800 +#endif // __cccl_ptx_isa >= 800 /* // cp.reduce.async.bulk.dst.src.bulk_group.op.type [dstMem], [srcMem], size; // 4. 
PTX ISA 80, SM_90 @@ -47,7 +49,7 @@ _CCCL_DEVICE static inline void cp_reduce_async_bulk( // .src = { .shared::cta } // .type = { .bf16 } // .op = { .max } -template +template __device__ static inline void cp_reduce_async_bulk( cuda::ptx::space_global_t, cuda::ptx::space_shared_t, @@ -56,7 +58,7 @@ __device__ static inline void cp_reduce_async_bulk( const __nv_bfloat16* srcMem, uint32_t size); */ -# if __cccl_ptx_isa >= 800 +#if __cccl_ptx_isa >= 800 extern "C" _CCCL_DEVICE void __cuda_ptx_cp_reduce_async_bulk_is_not_supported_before_SM_90__(); template _CCCL_DEVICE static inline void cp_reduce_async_bulk( @@ -81,7 +83,7 @@ _CCCL_DEVICE static inline void cp_reduce_async_bulk( // Unsupported architectures will have a linker error with a semi-decent error message __cuda_ptx_cp_reduce_async_bulk_is_not_supported_before_SM_90__();)); } -# endif // __cccl_ptx_isa >= 800 +#endif // __cccl_ptx_isa >= 800 /* // cp.reduce.async.bulk.dst.src.bulk_group.op.noftz.type [dstMem], [srcMem], size; // 5. PTX ISA 80, SM_90 @@ -89,7 +91,7 @@ _CCCL_DEVICE static inline void cp_reduce_async_bulk( // .src = { .shared::cta } // .type = { .bf16 } // .op = { .add } -template +template __device__ static inline void cp_reduce_async_bulk( cuda::ptx::space_global_t, cuda::ptx::space_shared_t, @@ -98,7 +100,7 @@ __device__ static inline void cp_reduce_async_bulk( const __nv_bfloat16* srcMem, uint32_t size); */ -# if __cccl_ptx_isa >= 800 +#if __cccl_ptx_isa >= 800 extern "C" _CCCL_DEVICE void __cuda_ptx_cp_reduce_async_bulk_is_not_supported_before_SM_90__(); template _CCCL_DEVICE static inline void cp_reduce_async_bulk( @@ -123,5 +125,6 @@ _CCCL_DEVICE static inline void cp_reduce_async_bulk( // Unsupported architectures will have a linker error with a semi-decent error message __cuda_ptx_cp_reduce_async_bulk_is_not_supported_before_SM_90__();)); } -# endif // __cccl_ptx_isa >= 800 -#endif // _LIBCUDACXX_HAS_NVBF16 +#endif // __cccl_ptx_isa >= 800 + +#endif // _CUDA_PTX_GENERATED_CP_REDUCE_ASYNC_BULK_BF16_H_ diff --git a/libcudacxx/include/cuda/__ptx/instructions/generated/cp_reduce_async_bulk_f16.inc b/libcudacxx/include/cuda/__ptx/instructions/generated/cp_reduce_async_bulk_f16.h similarity index 90% rename from libcudacxx/include/cuda/__ptx/instructions/generated/cp_reduce_async_bulk_f16.inc rename to libcudacxx/include/cuda/__ptx/instructions/generated/cp_reduce_async_bulk_f16.h index 3a52630db53..4ddc4589afa 100644 --- a/libcudacxx/include/cuda/__ptx/instructions/generated/cp_reduce_async_bulk_f16.inc +++ b/libcudacxx/include/cuda/__ptx/instructions/generated/cp_reduce_async_bulk_f16.h @@ -1,10 +1,13 @@ +#ifndef _CUDA_PTX_GENERATED_CP_REDUCE_ASYNC_BULK_F16_H_ +#define _CUDA_PTX_GENERATED_CP_REDUCE_ASYNC_BULK_F16_H_ + /* // cp.reduce.async.bulk.dst.src.bulk_group.op.type [dstMem], [srcMem], size; // 4. 
PTX ISA 80, SM_90 // .dst = { .global } // .src = { .shared::cta } // .type = { .f16 } // .op = { .min } -template +template __device__ static inline void cp_reduce_async_bulk( cuda::ptx::space_global_t, cuda::ptx::space_shared_t, @@ -13,7 +16,7 @@ __device__ static inline void cp_reduce_async_bulk( const __half* srcMem, uint32_t size); */ -# if __cccl_ptx_isa >= 800 +#if __cccl_ptx_isa >= 800 extern "C" _CCCL_DEVICE void __cuda_ptx_cp_reduce_async_bulk_is_not_supported_before_SM_90__(); template _CCCL_DEVICE static inline void cp_reduce_async_bulk( @@ -33,7 +36,7 @@ _CCCL_DEVICE static inline void cp_reduce_async_bulk( // Unsupported architectures will have a linker error with a semi-decent error message __cuda_ptx_cp_reduce_async_bulk_is_not_supported_before_SM_90__();)); } -# endif // __cccl_ptx_isa >= 800 +#endif // __cccl_ptx_isa >= 800 /* // cp.reduce.async.bulk.dst.src.bulk_group.op.type [dstMem], [srcMem], size; // 4. PTX ISA 80, SM_90 @@ -41,7 +44,7 @@ _CCCL_DEVICE static inline void cp_reduce_async_bulk( // .src = { .shared::cta } // .type = { .f16 } // .op = { .max } -template +template __device__ static inline void cp_reduce_async_bulk( cuda::ptx::space_global_t, cuda::ptx::space_shared_t, @@ -50,7 +53,7 @@ __device__ static inline void cp_reduce_async_bulk( const __half* srcMem, uint32_t size); */ -# if __cccl_ptx_isa >= 800 +#if __cccl_ptx_isa >= 800 extern "C" _CCCL_DEVICE void __cuda_ptx_cp_reduce_async_bulk_is_not_supported_before_SM_90__(); template _CCCL_DEVICE static inline void cp_reduce_async_bulk( @@ -70,7 +73,7 @@ _CCCL_DEVICE static inline void cp_reduce_async_bulk( // Unsupported architectures will have a linker error with a semi-decent error message __cuda_ptx_cp_reduce_async_bulk_is_not_supported_before_SM_90__();)); } -# endif // __cccl_ptx_isa >= 800 +#endif // __cccl_ptx_isa >= 800 /* // cp.reduce.async.bulk.dst.src.bulk_group.op.noftz.type [dstMem], [srcMem], size; // 5. 
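// Illustrative usage sketch (not part of the regenerated header): reducing a __half tile
// from shared into global memory with the .min variant shown above. The op_min tag and
// the dstMem parameter come from the full declaration, which this hunk elides, so treat
// the exact argument order as an assumption; the function and variable names are made up.
#include <cuda/ptx>
#include <cuda_fp16.h>
#include <cstdint>

__device__ void reduce_min_to_global(__half* dst_gmem, const __half* src_smem, std::uint32_t size_bytes)
{
  cuda::ptx::cp_reduce_async_bulk(
    cuda::ptx::space_global, cuda::ptx::space_shared, cuda::ptx::op_min, dst_gmem, src_smem, size_bytes);
  cuda::ptx::cp_async_bulk_commit_group(); // completion is observed through the bulk-group mechanism
}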
PTX ISA 80, SM_90 @@ -78,7 +81,7 @@ _CCCL_DEVICE static inline void cp_reduce_async_bulk( // .src = { .shared::cta } // .type = { .f16 } // .op = { .add } -template +template __device__ static inline void cp_reduce_async_bulk( cuda::ptx::space_global_t, cuda::ptx::space_shared_t, @@ -87,7 +90,7 @@ __device__ static inline void cp_reduce_async_bulk( const __half* srcMem, uint32_t size); */ -# if __cccl_ptx_isa >= 800 +#if __cccl_ptx_isa >= 800 extern "C" _CCCL_DEVICE void __cuda_ptx_cp_reduce_async_bulk_is_not_supported_before_SM_90__(); template _CCCL_DEVICE static inline void cp_reduce_async_bulk( @@ -107,4 +110,6 @@ _CCCL_DEVICE static inline void cp_reduce_async_bulk( // Unsupported architectures will have a linker error with a semi-decent error message __cuda_ptx_cp_reduce_async_bulk_is_not_supported_before_SM_90__();)); } -# endif // __cccl_ptx_isa >= 800 +#endif // __cccl_ptx_isa >= 800 + +#endif // _CUDA_PTX_GENERATED_CP_REDUCE_ASYNC_BULK_F16_H_ diff --git a/libcudacxx/include/cuda/__ptx/instructions/generated/cp_reduce_async_bulk_tensor.inc b/libcudacxx/include/cuda/__ptx/instructions/generated/cp_reduce_async_bulk_tensor.h similarity index 91% rename from libcudacxx/include/cuda/__ptx/instructions/generated/cp_reduce_async_bulk_tensor.inc rename to libcudacxx/include/cuda/__ptx/instructions/generated/cp_reduce_async_bulk_tensor.h index 32008f6af5b..f069f4bb755 100644 --- a/libcudacxx/include/cuda/__ptx/instructions/generated/cp_reduce_async_bulk_tensor.inc +++ b/libcudacxx/include/cuda/__ptx/instructions/generated/cp_reduce_async_bulk_tensor.h @@ -1,3 +1,6 @@ +#ifndef _CUDA_PTX_GENERATED_CP_REDUCE_ASYNC_BULK_TENSOR_H_ +#define _CUDA_PTX_GENERATED_CP_REDUCE_ASYNC_BULK_TENSOR_H_ + /* // cp.reduce.async.bulk.tensor.1d.dst.src.op.tile.bulk_group [tensorMap, tensorCoords], [srcMem]; // 1a. PTX ISA 80, SM_90 @@ -37,37 +40,37 @@ _CCCL_DEVICE static inline void cp_reduce_async_bulk_tensor( : : "l"(__tensorMap), "r"(__tensorCoords[0]), "r"(__as_ptr_smem(__srcMem)) : "memory"); - } _CCCL_ELSE_IF_CONSTEXPR (__op == op_min) { + } else _CCCL_IF_CONSTEXPR (__op == op_min) { asm("cp.reduce.async.bulk.tensor.1d.global.shared::cta.min.tile.bulk_group [%0, {%1}], [%2]; // 1a." : : "l"(__tensorMap), "r"(__tensorCoords[0]), "r"(__as_ptr_smem(__srcMem)) : "memory"); - } _CCCL_ELSE_IF_CONSTEXPR (__op == op_max) { + } else _CCCL_IF_CONSTEXPR (__op == op_max) { asm("cp.reduce.async.bulk.tensor.1d.global.shared::cta.max.tile.bulk_group [%0, {%1}], [%2]; // 1a." : : "l"(__tensorMap), "r"(__tensorCoords[0]), "r"(__as_ptr_smem(__srcMem)) : "memory"); - } _CCCL_ELSE_IF_CONSTEXPR (__op == op_inc) { + } else _CCCL_IF_CONSTEXPR (__op == op_inc) { asm("cp.reduce.async.bulk.tensor.1d.global.shared::cta.inc.tile.bulk_group [%0, {%1}], [%2]; // 1a." : : "l"(__tensorMap), "r"(__tensorCoords[0]), "r"(__as_ptr_smem(__srcMem)) : "memory"); - } _CCCL_ELSE_IF_CONSTEXPR (__op == op_dec) { + } else _CCCL_IF_CONSTEXPR (__op == op_dec) { asm("cp.reduce.async.bulk.tensor.1d.global.shared::cta.dec.tile.bulk_group [%0, {%1}], [%2]; // 1a." : : "l"(__tensorMap), "r"(__tensorCoords[0]), "r"(__as_ptr_smem(__srcMem)) : "memory"); - } _CCCL_ELSE_IF_CONSTEXPR (__op == op_and_op) { + } else _CCCL_IF_CONSTEXPR (__op == op_and_op) { asm("cp.reduce.async.bulk.tensor.1d.global.shared::cta.and.tile.bulk_group [%0, {%1}], [%2]; // 1a." 
: : "l"(__tensorMap), "r"(__tensorCoords[0]), "r"(__as_ptr_smem(__srcMem)) : "memory"); - } _CCCL_ELSE_IF_CONSTEXPR (__op == op_or_op) { + } else _CCCL_IF_CONSTEXPR (__op == op_or_op) { asm("cp.reduce.async.bulk.tensor.1d.global.shared::cta.or.tile.bulk_group [%0, {%1}], [%2]; // 1a." : : "l"(__tensorMap), "r"(__tensorCoords[0]), "r"(__as_ptr_smem(__srcMem)) : "memory"); - } _CCCL_ELSE_IF_CONSTEXPR (__op == op_xor_op) { + } else _CCCL_IF_CONSTEXPR (__op == op_xor_op) { asm("cp.reduce.async.bulk.tensor.1d.global.shared::cta.xor.tile.bulk_group [%0, {%1}], [%2]; // 1a." : : "l"(__tensorMap), "r"(__tensorCoords[0]), "r"(__as_ptr_smem(__srcMem)) @@ -118,37 +121,37 @@ _CCCL_DEVICE static inline void cp_reduce_async_bulk_tensor( : : "l"(__tensorMap), "r"(__tensorCoords[0]), "r"(__tensorCoords[1]), "r"(__as_ptr_smem(__srcMem)) : "memory"); - } _CCCL_ELSE_IF_CONSTEXPR (__op == op_min) { + } else _CCCL_IF_CONSTEXPR (__op == op_min) { asm("cp.reduce.async.bulk.tensor.2d.global.shared::cta.min.tile.bulk_group [%0, {%1, %2}], [%3]; // 1b." : : "l"(__tensorMap), "r"(__tensorCoords[0]), "r"(__tensorCoords[1]), "r"(__as_ptr_smem(__srcMem)) : "memory"); - } _CCCL_ELSE_IF_CONSTEXPR (__op == op_max) { + } else _CCCL_IF_CONSTEXPR (__op == op_max) { asm("cp.reduce.async.bulk.tensor.2d.global.shared::cta.max.tile.bulk_group [%0, {%1, %2}], [%3]; // 1b." : : "l"(__tensorMap), "r"(__tensorCoords[0]), "r"(__tensorCoords[1]), "r"(__as_ptr_smem(__srcMem)) : "memory"); - } _CCCL_ELSE_IF_CONSTEXPR (__op == op_inc) { + } else _CCCL_IF_CONSTEXPR (__op == op_inc) { asm("cp.reduce.async.bulk.tensor.2d.global.shared::cta.inc.tile.bulk_group [%0, {%1, %2}], [%3]; // 1b." : : "l"(__tensorMap), "r"(__tensorCoords[0]), "r"(__tensorCoords[1]), "r"(__as_ptr_smem(__srcMem)) : "memory"); - } _CCCL_ELSE_IF_CONSTEXPR (__op == op_dec) { + } else _CCCL_IF_CONSTEXPR (__op == op_dec) { asm("cp.reduce.async.bulk.tensor.2d.global.shared::cta.dec.tile.bulk_group [%0, {%1, %2}], [%3]; // 1b." : : "l"(__tensorMap), "r"(__tensorCoords[0]), "r"(__tensorCoords[1]), "r"(__as_ptr_smem(__srcMem)) : "memory"); - } _CCCL_ELSE_IF_CONSTEXPR (__op == op_and_op) { + } else _CCCL_IF_CONSTEXPR (__op == op_and_op) { asm("cp.reduce.async.bulk.tensor.2d.global.shared::cta.and.tile.bulk_group [%0, {%1, %2}], [%3]; // 1b." : : "l"(__tensorMap), "r"(__tensorCoords[0]), "r"(__tensorCoords[1]), "r"(__as_ptr_smem(__srcMem)) : "memory"); - } _CCCL_ELSE_IF_CONSTEXPR (__op == op_or_op) { + } else _CCCL_IF_CONSTEXPR (__op == op_or_op) { asm("cp.reduce.async.bulk.tensor.2d.global.shared::cta.or.tile.bulk_group [%0, {%1, %2}], [%3]; // 1b." : : "l"(__tensorMap), "r"(__tensorCoords[0]), "r"(__tensorCoords[1]), "r"(__as_ptr_smem(__srcMem)) : "memory"); - } _CCCL_ELSE_IF_CONSTEXPR (__op == op_xor_op) { + } else _CCCL_IF_CONSTEXPR (__op == op_xor_op) { asm("cp.reduce.async.bulk.tensor.2d.global.shared::cta.xor.tile.bulk_group [%0, {%1, %2}], [%3]; // 1b." : : "l"(__tensorMap), "r"(__tensorCoords[0]), "r"(__tensorCoords[1]), "r"(__as_ptr_smem(__srcMem)) @@ -203,7 +206,7 @@ _CCCL_DEVICE static inline void cp_reduce_async_bulk_tensor( "r"(__tensorCoords[2]), "r"(__as_ptr_smem(__srcMem)) : "memory"); - } _CCCL_ELSE_IF_CONSTEXPR (__op == op_min) { + } else _CCCL_IF_CONSTEXPR (__op == op_min) { asm("cp.reduce.async.bulk.tensor.3d.global.shared::cta.min.tile.bulk_group [%0, {%1, %2, %3}], [%4]; // 1c." 
: : "l"(__tensorMap), @@ -212,7 +215,7 @@ _CCCL_DEVICE static inline void cp_reduce_async_bulk_tensor( "r"(__tensorCoords[2]), "r"(__as_ptr_smem(__srcMem)) : "memory"); - } _CCCL_ELSE_IF_CONSTEXPR (__op == op_max) { + } else _CCCL_IF_CONSTEXPR (__op == op_max) { asm("cp.reduce.async.bulk.tensor.3d.global.shared::cta.max.tile.bulk_group [%0, {%1, %2, %3}], [%4]; // 1c." : : "l"(__tensorMap), @@ -221,7 +224,7 @@ _CCCL_DEVICE static inline void cp_reduce_async_bulk_tensor( "r"(__tensorCoords[2]), "r"(__as_ptr_smem(__srcMem)) : "memory"); - } _CCCL_ELSE_IF_CONSTEXPR (__op == op_inc) { + } else _CCCL_IF_CONSTEXPR (__op == op_inc) { asm("cp.reduce.async.bulk.tensor.3d.global.shared::cta.inc.tile.bulk_group [%0, {%1, %2, %3}], [%4]; // 1c." : : "l"(__tensorMap), @@ -230,7 +233,7 @@ _CCCL_DEVICE static inline void cp_reduce_async_bulk_tensor( "r"(__tensorCoords[2]), "r"(__as_ptr_smem(__srcMem)) : "memory"); - } _CCCL_ELSE_IF_CONSTEXPR (__op == op_dec) { + } else _CCCL_IF_CONSTEXPR (__op == op_dec) { asm("cp.reduce.async.bulk.tensor.3d.global.shared::cta.dec.tile.bulk_group [%0, {%1, %2, %3}], [%4]; // 1c." : : "l"(__tensorMap), @@ -239,7 +242,7 @@ _CCCL_DEVICE static inline void cp_reduce_async_bulk_tensor( "r"(__tensorCoords[2]), "r"(__as_ptr_smem(__srcMem)) : "memory"); - } _CCCL_ELSE_IF_CONSTEXPR (__op == op_and_op) { + } else _CCCL_IF_CONSTEXPR (__op == op_and_op) { asm("cp.reduce.async.bulk.tensor.3d.global.shared::cta.and.tile.bulk_group [%0, {%1, %2, %3}], [%4]; // 1c." : : "l"(__tensorMap), @@ -248,7 +251,7 @@ _CCCL_DEVICE static inline void cp_reduce_async_bulk_tensor( "r"(__tensorCoords[2]), "r"(__as_ptr_smem(__srcMem)) : "memory"); - } _CCCL_ELSE_IF_CONSTEXPR (__op == op_or_op) { + } else _CCCL_IF_CONSTEXPR (__op == op_or_op) { asm("cp.reduce.async.bulk.tensor.3d.global.shared::cta.or.tile.bulk_group [%0, {%1, %2, %3}], [%4]; // 1c." : : "l"(__tensorMap), @@ -257,7 +260,7 @@ _CCCL_DEVICE static inline void cp_reduce_async_bulk_tensor( "r"(__tensorCoords[2]), "r"(__as_ptr_smem(__srcMem)) : "memory"); - } _CCCL_ELSE_IF_CONSTEXPR (__op == op_xor_op) { + } else _CCCL_IF_CONSTEXPR (__op == op_xor_op) { asm("cp.reduce.async.bulk.tensor.3d.global.shared::cta.xor.tile.bulk_group [%0, {%1, %2, %3}], [%4]; // 1c." : : "l"(__tensorMap), @@ -317,7 +320,7 @@ _CCCL_DEVICE static inline void cp_reduce_async_bulk_tensor( "r"(__tensorCoords[3]), "r"(__as_ptr_smem(__srcMem)) : "memory"); - } _CCCL_ELSE_IF_CONSTEXPR (__op == op_min) { + } else _CCCL_IF_CONSTEXPR (__op == op_min) { asm("cp.reduce.async.bulk.tensor.4d.global.shared::cta.min.tile.bulk_group [%0, {%1, %2, %3, %4}], [%5]; // 1d." : : "l"(__tensorMap), @@ -327,7 +330,7 @@ _CCCL_DEVICE static inline void cp_reduce_async_bulk_tensor( "r"(__tensorCoords[3]), "r"(__as_ptr_smem(__srcMem)) : "memory"); - } _CCCL_ELSE_IF_CONSTEXPR (__op == op_max) { + } else _CCCL_IF_CONSTEXPR (__op == op_max) { asm("cp.reduce.async.bulk.tensor.4d.global.shared::cta.max.tile.bulk_group [%0, {%1, %2, %3, %4}], [%5]; // 1d." : : "l"(__tensorMap), @@ -337,7 +340,7 @@ _CCCL_DEVICE static inline void cp_reduce_async_bulk_tensor( "r"(__tensorCoords[3]), "r"(__as_ptr_smem(__srcMem)) : "memory"); - } _CCCL_ELSE_IF_CONSTEXPR (__op == op_inc) { + } else _CCCL_IF_CONSTEXPR (__op == op_inc) { asm("cp.reduce.async.bulk.tensor.4d.global.shared::cta.inc.tile.bulk_group [%0, {%1, %2, %3, %4}], [%5]; // 1d." 
: : "l"(__tensorMap), @@ -347,7 +350,7 @@ _CCCL_DEVICE static inline void cp_reduce_async_bulk_tensor( "r"(__tensorCoords[3]), "r"(__as_ptr_smem(__srcMem)) : "memory"); - } _CCCL_ELSE_IF_CONSTEXPR (__op == op_dec) { + } else _CCCL_IF_CONSTEXPR (__op == op_dec) { asm("cp.reduce.async.bulk.tensor.4d.global.shared::cta.dec.tile.bulk_group [%0, {%1, %2, %3, %4}], [%5]; // 1d." : : "l"(__tensorMap), @@ -357,7 +360,7 @@ _CCCL_DEVICE static inline void cp_reduce_async_bulk_tensor( "r"(__tensorCoords[3]), "r"(__as_ptr_smem(__srcMem)) : "memory"); - } _CCCL_ELSE_IF_CONSTEXPR (__op == op_and_op) { + } else _CCCL_IF_CONSTEXPR (__op == op_and_op) { asm("cp.reduce.async.bulk.tensor.4d.global.shared::cta.and.tile.bulk_group [%0, {%1, %2, %3, %4}], [%5]; // 1d." : : "l"(__tensorMap), @@ -367,7 +370,7 @@ _CCCL_DEVICE static inline void cp_reduce_async_bulk_tensor( "r"(__tensorCoords[3]), "r"(__as_ptr_smem(__srcMem)) : "memory"); - } _CCCL_ELSE_IF_CONSTEXPR (__op == op_or_op) { + } else _CCCL_IF_CONSTEXPR (__op == op_or_op) { asm("cp.reduce.async.bulk.tensor.4d.global.shared::cta.or.tile.bulk_group [%0, {%1, %2, %3, %4}], [%5]; // 1d." : : "l"(__tensorMap), @@ -377,7 +380,7 @@ _CCCL_DEVICE static inline void cp_reduce_async_bulk_tensor( "r"(__tensorCoords[3]), "r"(__as_ptr_smem(__srcMem)) : "memory"); - } _CCCL_ELSE_IF_CONSTEXPR (__op == op_xor_op) { + } else _CCCL_IF_CONSTEXPR (__op == op_xor_op) { asm("cp.reduce.async.bulk.tensor.4d.global.shared::cta.xor.tile.bulk_group [%0, {%1, %2, %3, %4}], [%5]; // 1d." : : "l"(__tensorMap), @@ -440,7 +443,7 @@ _CCCL_DEVICE static inline void cp_reduce_async_bulk_tensor( "r"(__tensorCoords[4]), "r"(__as_ptr_smem(__srcMem)) : "memory"); - } _CCCL_ELSE_IF_CONSTEXPR (__op == op_min) { + } else _CCCL_IF_CONSTEXPR (__op == op_min) { asm("cp.reduce.async.bulk.tensor.5d.global.shared::cta.min.tile.bulk_group [%0, {%1, %2, %3, %4, %5}], [%6]; " "// 1e." : @@ -452,7 +455,7 @@ _CCCL_DEVICE static inline void cp_reduce_async_bulk_tensor( "r"(__tensorCoords[4]), "r"(__as_ptr_smem(__srcMem)) : "memory"); - } _CCCL_ELSE_IF_CONSTEXPR (__op == op_max) { + } else _CCCL_IF_CONSTEXPR (__op == op_max) { asm("cp.reduce.async.bulk.tensor.5d.global.shared::cta.max.tile.bulk_group [%0, {%1, %2, %3, %4, %5}], [%6]; " "// 1e." : @@ -464,7 +467,7 @@ _CCCL_DEVICE static inline void cp_reduce_async_bulk_tensor( "r"(__tensorCoords[4]), "r"(__as_ptr_smem(__srcMem)) : "memory"); - } _CCCL_ELSE_IF_CONSTEXPR (__op == op_inc) { + } else _CCCL_IF_CONSTEXPR (__op == op_inc) { asm("cp.reduce.async.bulk.tensor.5d.global.shared::cta.inc.tile.bulk_group [%0, {%1, %2, %3, %4, %5}], [%6]; " "// 1e." : @@ -476,7 +479,7 @@ _CCCL_DEVICE static inline void cp_reduce_async_bulk_tensor( "r"(__tensorCoords[4]), "r"(__as_ptr_smem(__srcMem)) : "memory"); - } _CCCL_ELSE_IF_CONSTEXPR (__op == op_dec) { + } else _CCCL_IF_CONSTEXPR (__op == op_dec) { asm("cp.reduce.async.bulk.tensor.5d.global.shared::cta.dec.tile.bulk_group [%0, {%1, %2, %3, %4, %5}], [%6]; " "// 1e." : @@ -488,7 +491,7 @@ _CCCL_DEVICE static inline void cp_reduce_async_bulk_tensor( "r"(__tensorCoords[4]), "r"(__as_ptr_smem(__srcMem)) : "memory"); - } _CCCL_ELSE_IF_CONSTEXPR (__op == op_and_op) { + } else _CCCL_IF_CONSTEXPR (__op == op_and_op) { asm("cp.reduce.async.bulk.tensor.5d.global.shared::cta.and.tile.bulk_group [%0, {%1, %2, %3, %4, %5}], [%6]; " "// 1e." 
: @@ -500,7 +503,7 @@ _CCCL_DEVICE static inline void cp_reduce_async_bulk_tensor( "r"(__tensorCoords[4]), "r"(__as_ptr_smem(__srcMem)) : "memory"); - } _CCCL_ELSE_IF_CONSTEXPR (__op == op_or_op) { + } else _CCCL_IF_CONSTEXPR (__op == op_or_op) { asm("cp.reduce.async.bulk.tensor.5d.global.shared::cta.or.tile.bulk_group [%0, {%1, %2, %3, %4, %5}], [%6]; // " "1e." : @@ -512,7 +515,7 @@ _CCCL_DEVICE static inline void cp_reduce_async_bulk_tensor( "r"(__tensorCoords[4]), "r"(__as_ptr_smem(__srcMem)) : "memory"); - } _CCCL_ELSE_IF_CONSTEXPR (__op == op_xor_op) { + } else _CCCL_IF_CONSTEXPR (__op == op_xor_op) { asm("cp.reduce.async.bulk.tensor.5d.global.shared::cta.xor.tile.bulk_group [%0, {%1, %2, %3, %4, %5}], [%6]; " "// 1e." : @@ -530,3 +533,5 @@ _CCCL_DEVICE static inline void cp_reduce_async_bulk_tensor( __cuda_ptx_cp_reduce_async_bulk_tensor_is_not_supported_before_SM_90__();)); } #endif // __cccl_ptx_isa >= 800 + +#endif // _CUDA_PTX_GENERATED_CP_REDUCE_ASYNC_BULK_TENSOR_H_ diff --git a/libcudacxx/include/cuda/__ptx/instructions/generated/fence.inc b/libcudacxx/include/cuda/__ptx/instructions/generated/fence.h similarity index 83% rename from libcudacxx/include/cuda/__ptx/instructions/generated/fence.inc rename to libcudacxx/include/cuda/__ptx/instructions/generated/fence.h index f10ec07ebb5..795513a999d 100644 --- a/libcudacxx/include/cuda/__ptx/instructions/generated/fence.inc +++ b/libcudacxx/include/cuda/__ptx/instructions/generated/fence.h @@ -1,3 +1,6 @@ +#ifndef _CUDA_PTX_GENERATED_FENCE_H_ +#define _CUDA_PTX_GENERATED_FENCE_H_ + /* // fence{.sem}.scope; // 1. PTX ISA 60, SM_70 // .sem = { .sc, .acq_rel } @@ -19,15 +22,15 @@ _CCCL_DEVICE static inline void fence(sem_t<_Sem> __sem, scope_t<_Scope> __scope ( _CCCL_IF_CONSTEXPR (__sem == sem_sc && __scope == scope_cta) { asm volatile("fence.sc.cta; // 1." : : : "memory"); - } _CCCL_ELSE_IF_CONSTEXPR (__sem == sem_sc && __scope == scope_gpu) { + } else _CCCL_IF_CONSTEXPR (__sem == sem_sc && __scope == scope_gpu) { asm volatile("fence.sc.gpu; // 1." : : : "memory"); - } _CCCL_ELSE_IF_CONSTEXPR (__sem == sem_sc && __scope == scope_sys) { + } else _CCCL_IF_CONSTEXPR (__sem == sem_sc && __scope == scope_sys) { asm volatile("fence.sc.sys; // 1." : : : "memory"); - } _CCCL_ELSE_IF_CONSTEXPR (__sem == sem_acq_rel && __scope == scope_cta) { + } else _CCCL_IF_CONSTEXPR (__sem == sem_acq_rel && __scope == scope_cta) { asm volatile("fence.acq_rel.cta; // 1." : : : "memory"); - } _CCCL_ELSE_IF_CONSTEXPR (__sem == sem_acq_rel && __scope == scope_gpu) { + } else _CCCL_IF_CONSTEXPR (__sem == sem_acq_rel && __scope == scope_gpu) { asm volatile("fence.acq_rel.gpu; // 1." : : : "memory"); - } _CCCL_ELSE_IF_CONSTEXPR (__sem == sem_acq_rel && __scope == scope_sys) { + } else _CCCL_IF_CONSTEXPR (__sem == sem_acq_rel && __scope == scope_sys) { asm volatile("fence.acq_rel.sys; // 1." : : : "memory"); }), ( @@ -57,7 +60,7 @@ _CCCL_DEVICE static inline void fence(sem_t<_Sem> __sem, scope_cluster_t) ( _CCCL_IF_CONSTEXPR (__sem == sem_sc) { asm volatile("fence.sc.cluster; // 2." : : : "memory"); - } _CCCL_ELSE_IF_CONSTEXPR (__sem == sem_acq_rel) { + } else _CCCL_IF_CONSTEXPR (__sem == sem_acq_rel) { asm volatile("fence.acq_rel.cluster; // 2." 
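// Illustrative usage sketch (not part of the regenerated header): the sem/scope tag
// objects passed here correspond to the sem_sc/sem_acq_rel and scope constants tested
// in the branches above; the function name is made up.
#include <cuda/ptx>

__device__ void fence_examples()
{
  cuda::ptx::fence(cuda::ptx::sem_acq_rel, cuda::ptx::scope_gpu); // fence.acq_rel.gpu (form 1)
  cuda::ptx::fence(cuda::ptx::sem_sc, cuda::ptx::scope_cluster);  // fence.sc.cluster  (form 2)
}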
: : : "memory"); }), ( @@ -65,3 +68,5 @@ _CCCL_DEVICE static inline void fence(sem_t<_Sem> __sem, scope_cluster_t) __cuda_ptx_fence_is_not_supported_before_SM_90__();)); } #endif // __cccl_ptx_isa >= 780 + +#endif // _CUDA_PTX_GENERATED_FENCE_H_ diff --git a/libcudacxx/include/cuda/__ptx/instructions/generated/fence_mbarrier_init.inc b/libcudacxx/include/cuda/__ptx/instructions/generated/fence_mbarrier_init.h similarity index 84% rename from libcudacxx/include/cuda/__ptx/instructions/generated/fence_mbarrier_init.inc rename to libcudacxx/include/cuda/__ptx/instructions/generated/fence_mbarrier_init.h index 0d39c222598..a926e7ee730 100644 --- a/libcudacxx/include/cuda/__ptx/instructions/generated/fence_mbarrier_init.inc +++ b/libcudacxx/include/cuda/__ptx/instructions/generated/fence_mbarrier_init.h @@ -1,8 +1,11 @@ +#ifndef _CUDA_PTX_GENERATED_FENCE_MBARRIER_INIT_H_ +#define _CUDA_PTX_GENERATED_FENCE_MBARRIER_INIT_H_ + /* // fence.mbarrier_init.sem.scope; // 3. PTX ISA 80, SM_90 // .sem = { .release } // .scope = { .cluster } -template +template __device__ static inline void fence_mbarrier_init( cuda::ptx::sem_release_t, cuda::ptx::scope_cluster_t); @@ -25,3 +28,5 @@ _CCCL_DEVICE static inline void fence_mbarrier_init(sem_release_t, scope_cluster __cuda_ptx_fence_mbarrier_init_is_not_supported_before_SM_90__();)); } #endif // __cccl_ptx_isa >= 800 + +#endif // _CUDA_PTX_GENERATED_FENCE_MBARRIER_INIT_H_ diff --git a/libcudacxx/include/cuda/__ptx/instructions/generated/fence_proxy_alias.inc b/libcudacxx/include/cuda/__ptx/instructions/generated/fence_proxy_alias.h similarity index 79% rename from libcudacxx/include/cuda/__ptx/instructions/generated/fence_proxy_alias.inc rename to libcudacxx/include/cuda/__ptx/instructions/generated/fence_proxy_alias.h index 98260b851ca..afbf20297e6 100644 --- a/libcudacxx/include/cuda/__ptx/instructions/generated/fence_proxy_alias.inc +++ b/libcudacxx/include/cuda/__ptx/instructions/generated/fence_proxy_alias.h @@ -1,6 +1,9 @@ +#ifndef _CUDA_PTX_GENERATED_FENCE_PROXY_ALIAS_H_ +#define _CUDA_PTX_GENERATED_FENCE_PROXY_ALIAS_H_ + /* // fence.proxy.alias; // 4. PTX ISA 75, SM_70 -template +template __device__ static inline void fence_proxy_alias(); */ #if __cccl_ptx_isa >= 750 @@ -19,3 +22,5 @@ _CCCL_DEVICE static inline void fence_proxy_alias() __cuda_ptx_fence_proxy_alias_is_not_supported_before_SM_70__();)); } #endif // __cccl_ptx_isa >= 750 + +#endif // _CUDA_PTX_GENERATED_FENCE_PROXY_ALIAS_H_ diff --git a/libcudacxx/include/cuda/__ptx/instructions/generated/fence_proxy_async.inc b/libcudacxx/include/cuda/__ptx/instructions/generated/fence_proxy_async.h similarity index 85% rename from libcudacxx/include/cuda/__ptx/instructions/generated/fence_proxy_async.inc rename to libcudacxx/include/cuda/__ptx/instructions/generated/fence_proxy_async.h index f0a37baabdb..2319a629b7e 100644 --- a/libcudacxx/include/cuda/__ptx/instructions/generated/fence_proxy_async.inc +++ b/libcudacxx/include/cuda/__ptx/instructions/generated/fence_proxy_async.h @@ -1,6 +1,9 @@ +#ifndef _CUDA_PTX_GENERATED_FENCE_PROXY_ASYNC_H_ +#define _CUDA_PTX_GENERATED_FENCE_PROXY_ASYNC_H_ + /* // fence.proxy.async; // 5. PTX ISA 80, SM_90 -template +template __device__ static inline void fence_proxy_async(); */ #if __cccl_ptx_isa >= 800 @@ -38,9 +41,9 @@ _CCCL_DEVICE static inline void fence_proxy_async(space_t<_Space> __space) ( _CCCL_IF_CONSTEXPR (__space == space_global) { asm volatile("fence.proxy.async.global; // 6." 
: : : "memory"); - } _CCCL_ELSE_IF_CONSTEXPR (__space == space_cluster) { + } else _CCCL_IF_CONSTEXPR (__space == space_cluster) { asm volatile("fence.proxy.async.shared::cluster; // 6." : : : "memory"); - } _CCCL_ELSE_IF_CONSTEXPR (__space == space_shared) { + } else _CCCL_IF_CONSTEXPR (__space == space_shared) { asm volatile("fence.proxy.async.shared::cta; // 6." : : : "memory"); }), ( @@ -48,3 +51,5 @@ _CCCL_DEVICE static inline void fence_proxy_async(space_t<_Space> __space) __cuda_ptx_fence_proxy_async_is_not_supported_before_SM_90__();)); } #endif // __cccl_ptx_isa >= 800 + +#endif // _CUDA_PTX_GENERATED_FENCE_PROXY_ASYNC_H_ diff --git a/libcudacxx/include/cuda/__ptx/instructions/generated/fence_proxy_tensormap_generic.inc b/libcudacxx/include/cuda/__ptx/instructions/generated/fence_proxy_tensormap_generic.h similarity index 86% rename from libcudacxx/include/cuda/__ptx/instructions/generated/fence_proxy_tensormap_generic.inc rename to libcudacxx/include/cuda/__ptx/instructions/generated/fence_proxy_tensormap_generic.h index 3e5b2a265f4..6ec97a2a3c6 100644 --- a/libcudacxx/include/cuda/__ptx/instructions/generated/fence_proxy_tensormap_generic.inc +++ b/libcudacxx/include/cuda/__ptx/instructions/generated/fence_proxy_tensormap_generic.h @@ -1,3 +1,6 @@ +#ifndef _CUDA_PTX_GENERATED_FENCE_PROXY_TENSORMAP_GENERIC_H_ +#define _CUDA_PTX_GENERATED_FENCE_PROXY_TENSORMAP_GENERIC_H_ + /* // fence.proxy.tensormap::generic.release.scope; // 7. PTX ISA 83, SM_90 // .sem = { .release } @@ -19,11 +22,11 @@ _CCCL_DEVICE static inline void fence_proxy_tensormap_generic(sem_release_t, sco ( _CCCL_IF_CONSTEXPR (__scope == scope_cta) { asm volatile("fence.proxy.tensormap::generic.release.cta; // 7." : : : "memory"); - } _CCCL_ELSE_IF_CONSTEXPR (__scope == scope_cluster) { + } else _CCCL_IF_CONSTEXPR (__scope == scope_cluster) { asm volatile("fence.proxy.tensormap::generic.release.cluster; // 7." : : : "memory"); - } _CCCL_ELSE_IF_CONSTEXPR (__scope == scope_gpu) { + } else _CCCL_IF_CONSTEXPR (__scope == scope_gpu) { asm volatile("fence.proxy.tensormap::generic.release.gpu; // 7." : : : "memory"); - } _CCCL_ELSE_IF_CONSTEXPR (__scope == scope_sys) { + } else _CCCL_IF_CONSTEXPR (__scope == scope_sys) { asm volatile("fence.proxy.tensormap::generic.release.sys; // 7." : : : "memory"); }), ( @@ -59,17 +62,17 @@ fence_proxy_tensormap_generic(sem_acquire_t, scope_t<_Scope> __scope, const void : : "l"(__addr), "n"(__size.value) : "memory"); - } _CCCL_ELSE_IF_CONSTEXPR (__scope == scope_cluster) { + } else _CCCL_IF_CONSTEXPR (__scope == scope_cluster) { asm volatile("fence.proxy.tensormap::generic.acquire.cluster [%0], %1; // 8." : : "l"(__addr), "n"(__size.value) : "memory"); - } _CCCL_ELSE_IF_CONSTEXPR (__scope == scope_gpu) { + } else _CCCL_IF_CONSTEXPR (__scope == scope_gpu) { asm volatile("fence.proxy.tensormap::generic.acquire.gpu [%0], %1; // 8." : : "l"(__addr), "n"(__size.value) : "memory"); - } _CCCL_ELSE_IF_CONSTEXPR (__scope == scope_sys) { + } else _CCCL_IF_CONSTEXPR (__scope == scope_sys) { asm volatile("fence.proxy.tensormap::generic.acquire.sys [%0], %1; // 8." 
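// Illustrative usage sketch (not part of the regenerated header): pairing the release
// and acquire forms of fence_proxy_tensormap_generic around a tensormap update. The
// 128-byte size and the n32_t<128>{} spelling are assumptions based on the n32_t size
// parameter of the acquire overload; the function name is made up.
#include <cuda/ptx>

__device__ void sync_tensormap(const void* tensor_map)
{
  cuda::ptx::fence_proxy_tensormap_generic(cuda::ptx::sem_release, cuda::ptx::scope_gpu);
  cuda::ptx::fence_proxy_tensormap_generic(
    cuda::ptx::sem_acquire, cuda::ptx::scope_gpu, tensor_map, cuda::ptx::n32_t<128>{});
}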
: : "l"(__addr), "n"(__size.value) @@ -80,3 +83,5 @@ fence_proxy_tensormap_generic(sem_acquire_t, scope_t<_Scope> __scope, const void __cuda_ptx_fence_proxy_tensormap_generic_is_not_supported_before_SM_90__();)); } #endif // __cccl_ptx_isa >= 830 + +#endif // _CUDA_PTX_GENERATED_FENCE_PROXY_TENSORMAP_GENERIC_H_ diff --git a/libcudacxx/include/cuda/__ptx/instructions/generated/get_sreg.inc b/libcudacxx/include/cuda/__ptx/instructions/generated/get_sreg.h similarity index 95% rename from libcudacxx/include/cuda/__ptx/instructions/generated/get_sreg.inc rename to libcudacxx/include/cuda/__ptx/instructions/generated/get_sreg.h index dd3079915f7..010651e9d3b 100644 --- a/libcudacxx/include/cuda/__ptx/instructions/generated/get_sreg.inc +++ b/libcudacxx/include/cuda/__ptx/instructions/generated/get_sreg.h @@ -1,6 +1,9 @@ +#ifndef _CUDA_PTX_GENERATED_GET_SREG_H_ +#define _CUDA_PTX_GENERATED_GET_SREG_H_ + /* // mov.u32 sreg_value, %%tid.x; // PTX ISA 20 -template +template __device__ static inline uint32_t get_sreg_tid_x(); */ #if __cccl_ptx_isa >= 200 @@ -15,7 +18,7 @@ _CCCL_DEVICE static inline _CUDA_VSTD::uint32_t get_sreg_tid_x() /* // mov.u32 sreg_value, %%tid.y; // PTX ISA 20 -template +template __device__ static inline uint32_t get_sreg_tid_y(); */ #if __cccl_ptx_isa >= 200 @@ -30,7 +33,7 @@ _CCCL_DEVICE static inline _CUDA_VSTD::uint32_t get_sreg_tid_y() /* // mov.u32 sreg_value, %%tid.z; // PTX ISA 20 -template +template __device__ static inline uint32_t get_sreg_tid_z(); */ #if __cccl_ptx_isa >= 200 @@ -45,7 +48,7 @@ _CCCL_DEVICE static inline _CUDA_VSTD::uint32_t get_sreg_tid_z() /* // mov.u32 sreg_value, %%ntid.x; // PTX ISA 20 -template +template __device__ static inline uint32_t get_sreg_ntid_x(); */ #if __cccl_ptx_isa >= 200 @@ -60,7 +63,7 @@ _CCCL_DEVICE static inline _CUDA_VSTD::uint32_t get_sreg_ntid_x() /* // mov.u32 sreg_value, %%ntid.y; // PTX ISA 20 -template +template __device__ static inline uint32_t get_sreg_ntid_y(); */ #if __cccl_ptx_isa >= 200 @@ -75,7 +78,7 @@ _CCCL_DEVICE static inline _CUDA_VSTD::uint32_t get_sreg_ntid_y() /* // mov.u32 sreg_value, %%ntid.z; // PTX ISA 20 -template +template __device__ static inline uint32_t get_sreg_ntid_z(); */ #if __cccl_ptx_isa >= 200 @@ -90,7 +93,7 @@ _CCCL_DEVICE static inline _CUDA_VSTD::uint32_t get_sreg_ntid_z() /* // mov.u32 sreg_value, %%laneid; // PTX ISA 13 -template +template __device__ static inline uint32_t get_sreg_laneid(); */ #if __cccl_ptx_isa >= 130 @@ -105,7 +108,7 @@ _CCCL_DEVICE static inline _CUDA_VSTD::uint32_t get_sreg_laneid() /* // mov.u32 sreg_value, %%warpid; // PTX ISA 13 -template +template __device__ static inline uint32_t get_sreg_warpid(); */ #if __cccl_ptx_isa >= 130 @@ -120,7 +123,7 @@ _CCCL_DEVICE static inline _CUDA_VSTD::uint32_t get_sreg_warpid() /* // mov.u32 sreg_value, %%nwarpid; // PTX ISA 20, SM_35 -template +template __device__ static inline uint32_t get_sreg_nwarpid(); */ #if __cccl_ptx_isa >= 200 @@ -144,7 +147,7 @@ _CCCL_DEVICE static inline _CUDA_VSTD::uint32_t get_sreg_nwarpid() /* // mov.u32 sreg_value, %%ctaid.x; // PTX ISA 20 -template +template __device__ static inline uint32_t get_sreg_ctaid_x(); */ #if __cccl_ptx_isa >= 200 @@ -159,7 +162,7 @@ _CCCL_DEVICE static inline _CUDA_VSTD::uint32_t get_sreg_ctaid_x() /* // mov.u32 sreg_value, %%ctaid.y; // PTX ISA 20 -template +template __device__ static inline uint32_t get_sreg_ctaid_y(); */ #if __cccl_ptx_isa >= 200 @@ -174,7 +177,7 @@ _CCCL_DEVICE static inline _CUDA_VSTD::uint32_t get_sreg_ctaid_y() /* // mov.u32 sreg_value, 
%%ctaid.z; // PTX ISA 20 -template +template __device__ static inline uint32_t get_sreg_ctaid_z(); */ #if __cccl_ptx_isa >= 200 @@ -189,7 +192,7 @@ _CCCL_DEVICE static inline _CUDA_VSTD::uint32_t get_sreg_ctaid_z() /* // mov.u32 sreg_value, %%nctaid.x; // PTX ISA 20 -template +template __device__ static inline uint32_t get_sreg_nctaid_x(); */ #if __cccl_ptx_isa >= 200 @@ -204,7 +207,7 @@ _CCCL_DEVICE static inline _CUDA_VSTD::uint32_t get_sreg_nctaid_x() /* // mov.u32 sreg_value, %%nctaid.y; // PTX ISA 20 -template +template __device__ static inline uint32_t get_sreg_nctaid_y(); */ #if __cccl_ptx_isa >= 200 @@ -219,7 +222,7 @@ _CCCL_DEVICE static inline _CUDA_VSTD::uint32_t get_sreg_nctaid_y() /* // mov.u32 sreg_value, %%nctaid.z; // PTX ISA 20 -template +template __device__ static inline uint32_t get_sreg_nctaid_z(); */ #if __cccl_ptx_isa >= 200 @@ -234,7 +237,7 @@ _CCCL_DEVICE static inline _CUDA_VSTD::uint32_t get_sreg_nctaid_z() /* // mov.u32 sreg_value, %%smid; // PTX ISA 13 -template +template __device__ static inline uint32_t get_sreg_smid(); */ #if __cccl_ptx_isa >= 130 @@ -249,7 +252,7 @@ _CCCL_DEVICE static inline _CUDA_VSTD::uint32_t get_sreg_smid() /* // mov.u32 sreg_value, %%nsmid; // PTX ISA 20, SM_35 -template +template __device__ static inline uint32_t get_sreg_nsmid(); */ #if __cccl_ptx_isa >= 200 @@ -273,7 +276,7 @@ _CCCL_DEVICE static inline _CUDA_VSTD::uint32_t get_sreg_nsmid() /* // mov.u64 sreg_value, %%gridid; // PTX ISA 30 -template +template __device__ static inline uint64_t get_sreg_gridid(); */ #if __cccl_ptx_isa >= 300 @@ -288,7 +291,7 @@ _CCCL_DEVICE static inline _CUDA_VSTD::uint64_t get_sreg_gridid() /* // mov.pred sreg_value, %%is_explicit_cluster; // PTX ISA 78, SM_90 -template +template __device__ static inline bool get_sreg_is_explicit_cluster(); */ #if __cccl_ptx_isa >= 780 @@ -315,7 +318,7 @@ _CCCL_DEVICE static inline bool get_sreg_is_explicit_cluster() /* // mov.u32 sreg_value, %%clusterid.x; // PTX ISA 78, SM_90 -template +template __device__ static inline uint32_t get_sreg_clusterid_x(); */ #if __cccl_ptx_isa >= 780 @@ -339,7 +342,7 @@ _CCCL_DEVICE static inline _CUDA_VSTD::uint32_t get_sreg_clusterid_x() /* // mov.u32 sreg_value, %%clusterid.y; // PTX ISA 78, SM_90 -template +template __device__ static inline uint32_t get_sreg_clusterid_y(); */ #if __cccl_ptx_isa >= 780 @@ -363,7 +366,7 @@ _CCCL_DEVICE static inline _CUDA_VSTD::uint32_t get_sreg_clusterid_y() /* // mov.u32 sreg_value, %%clusterid.z; // PTX ISA 78, SM_90 -template +template __device__ static inline uint32_t get_sreg_clusterid_z(); */ #if __cccl_ptx_isa >= 780 @@ -387,7 +390,7 @@ _CCCL_DEVICE static inline _CUDA_VSTD::uint32_t get_sreg_clusterid_z() /* // mov.u32 sreg_value, %%nclusterid.x; // PTX ISA 78, SM_90 -template +template __device__ static inline uint32_t get_sreg_nclusterid_x(); */ #if __cccl_ptx_isa >= 780 @@ -411,7 +414,7 @@ _CCCL_DEVICE static inline _CUDA_VSTD::uint32_t get_sreg_nclusterid_x() /* // mov.u32 sreg_value, %%nclusterid.y; // PTX ISA 78, SM_90 -template +template __device__ static inline uint32_t get_sreg_nclusterid_y(); */ #if __cccl_ptx_isa >= 780 @@ -435,7 +438,7 @@ _CCCL_DEVICE static inline _CUDA_VSTD::uint32_t get_sreg_nclusterid_y() /* // mov.u32 sreg_value, %%nclusterid.z; // PTX ISA 78, SM_90 -template +template __device__ static inline uint32_t get_sreg_nclusterid_z(); */ #if __cccl_ptx_isa >= 780 @@ -459,7 +462,7 @@ _CCCL_DEVICE static inline _CUDA_VSTD::uint32_t get_sreg_nclusterid_z() /* // mov.u32 sreg_value, %%cluster_ctaid.x; // PTX ISA 78, 
SM_90 -template +template __device__ static inline uint32_t get_sreg_cluster_ctaid_x(); */ #if __cccl_ptx_isa >= 780 @@ -483,7 +486,7 @@ _CCCL_DEVICE static inline _CUDA_VSTD::uint32_t get_sreg_cluster_ctaid_x() /* // mov.u32 sreg_value, %%cluster_ctaid.y; // PTX ISA 78, SM_90 -template +template __device__ static inline uint32_t get_sreg_cluster_ctaid_y(); */ #if __cccl_ptx_isa >= 780 @@ -507,7 +510,7 @@ _CCCL_DEVICE static inline _CUDA_VSTD::uint32_t get_sreg_cluster_ctaid_y() /* // mov.u32 sreg_value, %%cluster_ctaid.z; // PTX ISA 78, SM_90 -template +template __device__ static inline uint32_t get_sreg_cluster_ctaid_z(); */ #if __cccl_ptx_isa >= 780 @@ -531,7 +534,7 @@ _CCCL_DEVICE static inline _CUDA_VSTD::uint32_t get_sreg_cluster_ctaid_z() /* // mov.u32 sreg_value, %%cluster_nctaid.x; // PTX ISA 78, SM_90 -template +template __device__ static inline uint32_t get_sreg_cluster_nctaid_x(); */ #if __cccl_ptx_isa >= 780 @@ -555,7 +558,7 @@ _CCCL_DEVICE static inline _CUDA_VSTD::uint32_t get_sreg_cluster_nctaid_x() /* // mov.u32 sreg_value, %%cluster_nctaid.y; // PTX ISA 78, SM_90 -template +template __device__ static inline uint32_t get_sreg_cluster_nctaid_y(); */ #if __cccl_ptx_isa >= 780 @@ -579,7 +582,7 @@ _CCCL_DEVICE static inline _CUDA_VSTD::uint32_t get_sreg_cluster_nctaid_y() /* // mov.u32 sreg_value, %%cluster_nctaid.z; // PTX ISA 78, SM_90 -template +template __device__ static inline uint32_t get_sreg_cluster_nctaid_z(); */ #if __cccl_ptx_isa >= 780 @@ -603,7 +606,7 @@ _CCCL_DEVICE static inline _CUDA_VSTD::uint32_t get_sreg_cluster_nctaid_z() /* // mov.u32 sreg_value, %%cluster_ctarank; // PTX ISA 78, SM_90 -template +template __device__ static inline uint32_t get_sreg_cluster_ctarank(); */ #if __cccl_ptx_isa >= 780 @@ -627,7 +630,7 @@ _CCCL_DEVICE static inline _CUDA_VSTD::uint32_t get_sreg_cluster_ctarank() /* // mov.u32 sreg_value, %%cluster_nctarank; // PTX ISA 78, SM_90 -template +template __device__ static inline uint32_t get_sreg_cluster_nctarank(); */ #if __cccl_ptx_isa >= 780 @@ -651,7 +654,7 @@ _CCCL_DEVICE static inline _CUDA_VSTD::uint32_t get_sreg_cluster_nctarank() /* // mov.u32 sreg_value, %%lanemask_eq; // PTX ISA 20, SM_35 -template +template __device__ static inline uint32_t get_sreg_lanemask_eq(); */ #if __cccl_ptx_isa >= 200 @@ -675,7 +678,7 @@ _CCCL_DEVICE static inline _CUDA_VSTD::uint32_t get_sreg_lanemask_eq() /* // mov.u32 sreg_value, %%lanemask_le; // PTX ISA 20, SM_35 -template +template __device__ static inline uint32_t get_sreg_lanemask_le(); */ #if __cccl_ptx_isa >= 200 @@ -699,7 +702,7 @@ _CCCL_DEVICE static inline _CUDA_VSTD::uint32_t get_sreg_lanemask_le() /* // mov.u32 sreg_value, %%lanemask_lt; // PTX ISA 20, SM_35 -template +template __device__ static inline uint32_t get_sreg_lanemask_lt(); */ #if __cccl_ptx_isa >= 200 @@ -723,7 +726,7 @@ _CCCL_DEVICE static inline _CUDA_VSTD::uint32_t get_sreg_lanemask_lt() /* // mov.u32 sreg_value, %%lanemask_ge; // PTX ISA 20, SM_35 -template +template __device__ static inline uint32_t get_sreg_lanemask_ge(); */ #if __cccl_ptx_isa >= 200 @@ -747,7 +750,7 @@ _CCCL_DEVICE static inline _CUDA_VSTD::uint32_t get_sreg_lanemask_ge() /* // mov.u32 sreg_value, %%lanemask_gt; // PTX ISA 20, SM_35 -template +template __device__ static inline uint32_t get_sreg_lanemask_gt(); */ #if __cccl_ptx_isa >= 200 @@ -771,7 +774,7 @@ _CCCL_DEVICE static inline _CUDA_VSTD::uint32_t get_sreg_lanemask_gt() /* // mov.u32 sreg_value, %%clock; // PTX ISA 10 -template +template __device__ static inline uint32_t get_sreg_clock(); */ 
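// Illustrative usage sketch (not part of the regenerated header): the special-register
// wrappers declared above are plain value-returning calls; the helper name is made up.
#include <cuda/ptx>
#include <cstdint>

__device__ std::uint64_t lane_and_sm_id()
{
  const std::uint32_t lane = cuda::ptx::get_sreg_laneid(); // %%laneid
  const std::uint32_t sm   = cuda::ptx::get_sreg_smid();   // %%smid
  return (static_cast<std::uint64_t>(sm) << 32) | lane;
}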
#if __cccl_ptx_isa >= 100 @@ -786,7 +789,7 @@ _CCCL_DEVICE static inline _CUDA_VSTD::uint32_t get_sreg_clock() /* // mov.u32 sreg_value, %%clock_hi; // PTX ISA 50, SM_35 -template +template __device__ static inline uint32_t get_sreg_clock_hi(); */ #if __cccl_ptx_isa >= 500 @@ -810,7 +813,7 @@ _CCCL_DEVICE static inline _CUDA_VSTD::uint32_t get_sreg_clock_hi() /* // mov.u64 sreg_value, %%clock64; // PTX ISA 20, SM_35 -template +template __device__ static inline uint64_t get_sreg_clock64(); */ #if __cccl_ptx_isa >= 200 @@ -834,7 +837,7 @@ _CCCL_DEVICE static inline _CUDA_VSTD::uint64_t get_sreg_clock64() /* // mov.u64 sreg_value, %%globaltimer; // PTX ISA 31, SM_35 -template +template __device__ static inline uint64_t get_sreg_globaltimer(); */ #if __cccl_ptx_isa >= 310 @@ -858,7 +861,7 @@ _CCCL_DEVICE static inline _CUDA_VSTD::uint64_t get_sreg_globaltimer() /* // mov.u32 sreg_value, %%globaltimer_lo; // PTX ISA 31, SM_35 -template +template __device__ static inline uint32_t get_sreg_globaltimer_lo(); */ #if __cccl_ptx_isa >= 310 @@ -882,7 +885,7 @@ _CCCL_DEVICE static inline _CUDA_VSTD::uint32_t get_sreg_globaltimer_lo() /* // mov.u32 sreg_value, %%globaltimer_hi; // PTX ISA 31, SM_35 -template +template __device__ static inline uint32_t get_sreg_globaltimer_hi(); */ #if __cccl_ptx_isa >= 310 @@ -906,7 +909,7 @@ _CCCL_DEVICE static inline _CUDA_VSTD::uint32_t get_sreg_globaltimer_hi() /* // mov.u32 sreg_value, %%total_smem_size; // PTX ISA 41, SM_35 -template +template __device__ static inline uint32_t get_sreg_total_smem_size(); */ #if __cccl_ptx_isa >= 410 @@ -930,7 +933,7 @@ _CCCL_DEVICE static inline _CUDA_VSTD::uint32_t get_sreg_total_smem_size() /* // mov.u32 sreg_value, %%aggr_smem_size; // PTX ISA 81, SM_90 -template +template __device__ static inline uint32_t get_sreg_aggr_smem_size(); */ #if __cccl_ptx_isa >= 810 @@ -954,7 +957,7 @@ _CCCL_DEVICE static inline _CUDA_VSTD::uint32_t get_sreg_aggr_smem_size() /* // mov.u32 sreg_value, %%dynamic_smem_size; // PTX ISA 41, SM_35 -template +template __device__ static inline uint32_t get_sreg_dynamic_smem_size(); */ #if __cccl_ptx_isa >= 410 @@ -978,7 +981,7 @@ _CCCL_DEVICE static inline _CUDA_VSTD::uint32_t get_sreg_dynamic_smem_size() /* // mov.u64 sreg_value, %%current_graph_exec; // PTX ISA 80, SM_50 -template +template __device__ static inline uint64_t get_sreg_current_graph_exec(); */ #if __cccl_ptx_isa >= 800 @@ -999,3 +1002,5 @@ _CCCL_DEVICE static inline _CUDA_VSTD::uint64_t get_sreg_current_graph_exec() __cuda_ptx_get_sreg_current_graph_exec_is_not_supported_before_SM_50__(); return 0;)); } #endif // __cccl_ptx_isa >= 800 + +#endif // _CUDA_PTX_GENERATED_GET_SREG_H_ diff --git a/libcudacxx/include/cuda/__ptx/instructions/generated/getctarank.inc b/libcudacxx/include/cuda/__ptx/instructions/generated/getctarank.h similarity index 85% rename from libcudacxx/include/cuda/__ptx/instructions/generated/getctarank.inc rename to libcudacxx/include/cuda/__ptx/instructions/generated/getctarank.h index 51bd351be87..ef0a5547512 100644 --- a/libcudacxx/include/cuda/__ptx/instructions/generated/getctarank.inc +++ b/libcudacxx/include/cuda/__ptx/instructions/generated/getctarank.h @@ -1,7 +1,10 @@ +#ifndef _CUDA_PTX_GENERATED_GETCTARANK_H_ +#define _CUDA_PTX_GENERATED_GETCTARANK_H_ + /* // getctarank{.space}.u32 dest, addr; // PTX ISA 78, SM_90 // .space = { .shared::cluster } -template +template __device__ static inline uint32_t getctarank( cuda::ptx::space_cluster_t, const void* addr); @@ -25,3 +28,5 @@ _CCCL_DEVICE static inline 
_CUDA_VSTD::uint32_t getctarank(space_cluster_t, cons __cuda_ptx_getctarank_is_not_supported_before_SM_90__(); return 0;)); } #endif // __cccl_ptx_isa >= 780 + +#endif // _CUDA_PTX_GENERATED_GETCTARANK_H_ diff --git a/libcudacxx/include/cuda/__ptx/instructions/generated/mbarrier_arrive.inc b/libcudacxx/include/cuda/__ptx/instructions/generated/mbarrier_arrive.h similarity index 95% rename from libcudacxx/include/cuda/__ptx/instructions/generated/mbarrier_arrive.inc rename to libcudacxx/include/cuda/__ptx/instructions/generated/mbarrier_arrive.h index f3e2b860d50..21e9a522348 100644 --- a/libcudacxx/include/cuda/__ptx/instructions/generated/mbarrier_arrive.inc +++ b/libcudacxx/include/cuda/__ptx/instructions/generated/mbarrier_arrive.h @@ -1,6 +1,9 @@ +#ifndef _CUDA_PTX_GENERATED_MBARRIER_ARRIVE_H_ +#define _CUDA_PTX_GENERATED_MBARRIER_ARRIVE_H_ + /* // mbarrier.arrive.shared.b64 state, [addr]; // 1. PTX ISA 70, SM_80 -template +template __device__ static inline uint64_t mbarrier_arrive( uint64_t* addr); */ @@ -25,7 +28,7 @@ _CCCL_DEVICE static inline _CUDA_VSTD::uint64_t mbarrier_arrive(_CUDA_VSTD::uint /* // mbarrier.arrive.shared::cta.b64 state, [addr], count; // 2. PTX ISA 78, SM_90 -template +template __device__ static inline uint64_t mbarrier_arrive( uint64_t* addr, const uint32_t& count); @@ -79,7 +82,7 @@ mbarrier_arrive(sem_release_t, scope_t<_Scope> __scope, space_shared_t, _CUDA_VS : "=l"(__state) : "r"(__as_ptr_smem(__addr)) : "memory"); - } _CCCL_ELSE_IF_CONSTEXPR (__scope == scope_cluster) { + } else _CCCL_IF_CONSTEXPR (__scope == scope_cluster) { asm("mbarrier.arrive.release.cluster.shared::cta.b64 %0, [%1]; // 3a. " : "=l"(__state) : "r"(__as_ptr_smem(__addr)) @@ -125,7 +128,7 @@ _CCCL_DEVICE static inline _CUDA_VSTD::uint64_t mbarrier_arrive( : "=l"(__state) : "r"(__as_ptr_smem(__addr)), "r"(__count) : "memory"); - } _CCCL_ELSE_IF_CONSTEXPR (__scope == scope_cluster) { + } else _CCCL_IF_CONSTEXPR (__scope == scope_cluster) { asm("mbarrier.arrive.release.cluster.shared::cta.b64 %0, [%1], %2; // 3b. 
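// Illustrative usage sketch (not part of the regenerated header): the basic arrive on a
// shared-memory mbarrier declared above. It assumes the barrier was set up with
// mbarrier_init (regenerated further down in this patch) and that the returned state
// token is later passed to one of the wait variants; the function name is made up.
#include <cuda/ptx>
#include <cstdint>

__device__ std::uint64_t arrive_on(std::uint64_t* bar_smem)
{
  return cuda::ptx::mbarrier_arrive(bar_smem); // mbarrier.arrive.shared.b64; an (addr, count) overload also exists
}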
" : "=l"(__state) : "r"(__as_ptr_smem(__addr)), "r"(__count) @@ -142,7 +145,7 @@ _CCCL_DEVICE static inline _CUDA_VSTD::uint64_t mbarrier_arrive( // .sem = { .release } // .scope = { .cluster } // .space = { .shared::cluster } -template +template __device__ static inline void mbarrier_arrive( cuda::ptx::sem_release_t, cuda::ptx::scope_cluster_t, @@ -175,7 +178,7 @@ mbarrier_arrive(sem_release_t, scope_cluster_t, space_cluster_t, _CUDA_VSTD::uin // .sem = { .release } // .scope = { .cluster } // .space = { .shared::cluster } -template +template __device__ static inline void mbarrier_arrive( cuda::ptx::sem_release_t, cuda::ptx::scope_cluster_t, @@ -203,3 +206,5 @@ _CCCL_DEVICE static inline void mbarrier_arrive( __cuda_ptx_mbarrier_arrive_is_not_supported_before_SM_90__();)); } #endif // __cccl_ptx_isa >= 800 + +#endif // _CUDA_PTX_GENERATED_MBARRIER_ARRIVE_H_ diff --git a/libcudacxx/include/cuda/__ptx/instructions/generated/mbarrier_arrive_expect_tx.inc b/libcudacxx/include/cuda/__ptx/instructions/generated/mbarrier_arrive_expect_tx.h similarity index 92% rename from libcudacxx/include/cuda/__ptx/instructions/generated/mbarrier_arrive_expect_tx.inc rename to libcudacxx/include/cuda/__ptx/instructions/generated/mbarrier_arrive_expect_tx.h index efb749957b1..cc33cc2fae1 100644 --- a/libcudacxx/include/cuda/__ptx/instructions/generated/mbarrier_arrive_expect_tx.inc +++ b/libcudacxx/include/cuda/__ptx/instructions/generated/mbarrier_arrive_expect_tx.h @@ -1,3 +1,6 @@ +#ifndef _CUDA_PTX_GENERATED_MBARRIER_ARRIVE_EXPECT_TX_H_ +#define _CUDA_PTX_GENERATED_MBARRIER_ARRIVE_EXPECT_TX_H_ + /* // mbarrier.arrive.expect_tx{.sem}{.scope}{.space}.b64 state, [addr], tx_count; // 8. PTX ISA 80, SM_90 // .sem = { .release } @@ -32,7 +35,7 @@ _CCCL_DEVICE static inline _CUDA_VSTD::uint64_t mbarrier_arrive_expect_tx( : "=l"(__state) : "r"(__as_ptr_smem(__addr)), "r"(__tx_count) : "memory"); - } _CCCL_ELSE_IF_CONSTEXPR (__scope == scope_cluster) { + } else _CCCL_IF_CONSTEXPR (__scope == scope_cluster) { asm("mbarrier.arrive.expect_tx.release.cluster.shared::cta.b64 %0, [%1], %2; // 8. " : "=l"(__state) : "r"(__as_ptr_smem(__addr)), "r"(__tx_count) @@ -49,7 +52,7 @@ _CCCL_DEVICE static inline _CUDA_VSTD::uint64_t mbarrier_arrive_expect_tx( // .sem = { .release } // .scope = { .cluster } // .space = { .shared::cluster } -template +template __device__ static inline void mbarrier_arrive_expect_tx( cuda::ptx::sem_release_t, cuda::ptx::scope_cluster_t, @@ -77,3 +80,5 @@ _CCCL_DEVICE static inline void mbarrier_arrive_expect_tx( __cuda_ptx_mbarrier_arrive_expect_tx_is_not_supported_before_SM_90__();)); } #endif // __cccl_ptx_isa >= 800 + +#endif // _CUDA_PTX_GENERATED_MBARRIER_ARRIVE_EXPECT_TX_H_ diff --git a/libcudacxx/include/cuda/__ptx/instructions/generated/mbarrier_arrive_no_complete.inc b/libcudacxx/include/cuda/__ptx/instructions/generated/mbarrier_arrive_no_complete.h similarity index 83% rename from libcudacxx/include/cuda/__ptx/instructions/generated/mbarrier_arrive_no_complete.inc rename to libcudacxx/include/cuda/__ptx/instructions/generated/mbarrier_arrive_no_complete.h index 879bedebdc9..86ccd825f69 100644 --- a/libcudacxx/include/cuda/__ptx/instructions/generated/mbarrier_arrive_no_complete.inc +++ b/libcudacxx/include/cuda/__ptx/instructions/generated/mbarrier_arrive_no_complete.h @@ -1,6 +1,9 @@ +#ifndef _CUDA_PTX_GENERATED_MBARRIER_ARRIVE_NO_COMPLETE_H_ +#define _CUDA_PTX_GENERATED_MBARRIER_ARRIVE_NO_COMPLETE_H_ + /* // mbarrier.arrive.noComplete.shared.b64 state, [addr], count; // 5. 
PTX ISA 70, SM_80 -template +template __device__ static inline uint64_t mbarrier_arrive_no_complete( uint64_t* addr, const uint32_t& count); @@ -24,3 +27,5 @@ mbarrier_arrive_no_complete(_CUDA_VSTD::uint64_t* __addr, const _CUDA_VSTD::uint __cuda_ptx_mbarrier_arrive_no_complete_is_not_supported_before_SM_80__(); return 0;)); } #endif // __cccl_ptx_isa >= 700 + +#endif // _CUDA_PTX_GENERATED_MBARRIER_ARRIVE_NO_COMPLETE_H_ diff --git a/libcudacxx/include/cuda/__ptx/instructions/generated/mbarrier_init.inc b/libcudacxx/include/cuda/__ptx/instructions/generated/mbarrier_init.h similarity index 82% rename from libcudacxx/include/cuda/__ptx/instructions/generated/mbarrier_init.inc rename to libcudacxx/include/cuda/__ptx/instructions/generated/mbarrier_init.h index 3afeeacfccf..7d44538e4d8 100644 --- a/libcudacxx/include/cuda/__ptx/instructions/generated/mbarrier_init.inc +++ b/libcudacxx/include/cuda/__ptx/instructions/generated/mbarrier_init.h @@ -1,6 +1,9 @@ +#ifndef _CUDA_PTX_GENERATED_MBARRIER_INIT_H_ +#define _CUDA_PTX_GENERATED_MBARRIER_INIT_H_ + /* // mbarrier.init.shared.b64 [addr], count; // PTX ISA 70, SM_80 -template +template __device__ static inline void mbarrier_init( uint64_t* addr, const uint32_t& count); @@ -21,3 +24,5 @@ _CCCL_DEVICE static inline void mbarrier_init(_CUDA_VSTD::uint64_t* __addr, cons __cuda_ptx_mbarrier_init_is_not_supported_before_SM_80__();)); } #endif // __cccl_ptx_isa >= 700 + +#endif // _CUDA_PTX_GENERATED_MBARRIER_INIT_H_ diff --git a/libcudacxx/include/cuda/__ptx/instructions/generated/mbarrier_test_wait.inc b/libcudacxx/include/cuda/__ptx/instructions/generated/mbarrier_test_wait.h similarity index 92% rename from libcudacxx/include/cuda/__ptx/instructions/generated/mbarrier_test_wait.inc rename to libcudacxx/include/cuda/__ptx/instructions/generated/mbarrier_test_wait.h index 301c0364af4..0818671bda4 100644 --- a/libcudacxx/include/cuda/__ptx/instructions/generated/mbarrier_test_wait.inc +++ b/libcudacxx/include/cuda/__ptx/instructions/generated/mbarrier_test_wait.h @@ -1,6 +1,9 @@ +#ifndef _CUDA_PTX_GENERATED_MBARRIER_TEST_WAIT_H_ +#define _CUDA_PTX_GENERATED_MBARRIER_TEST_WAIT_H_ + /* // mbarrier.test_wait.shared.b64 waitComplete, [addr], state; // 1. PTX -ISA 70, SM_80 template +ISA 70, SM_80 template __device__ static inline bool mbarrier_test_wait( uint64_t* addr, const uint64_t& state); @@ -58,7 +61,7 @@ _CCCL_DEVICE static inline bool mbarrier_test_wait( : "=r"(__waitComplete) : "r"(__as_ptr_smem(__addr)), "l"(__state) : "memory"); - } _CCCL_ELSE_IF_CONSTEXPR (__scope == scope_cluster) { + } else _CCCL_IF_CONSTEXPR (__scope == scope_cluster) { asm("{\n\t .reg .pred P_OUT; \n\t" "mbarrier.test_wait.acquire.cluster.shared::cta.b64 P_OUT, [%1], %2; // 2. 
" "\n\t" @@ -73,3 +76,5 @@ _CCCL_DEVICE static inline bool mbarrier_test_wait( __cuda_ptx_mbarrier_test_wait_is_not_supported_before_SM_90__(); return false;)); } #endif // __cccl_ptx_isa >= 800 + +#endif // _CUDA_PTX_GENERATED_MBARRIER_TEST_WAIT_H_ diff --git a/libcudacxx/include/cuda/__ptx/instructions/generated/mbarrier_test_wait_parity.inc b/libcudacxx/include/cuda/__ptx/instructions/generated/mbarrier_test_wait_parity.h similarity index 91% rename from libcudacxx/include/cuda/__ptx/instructions/generated/mbarrier_test_wait_parity.inc rename to libcudacxx/include/cuda/__ptx/instructions/generated/mbarrier_test_wait_parity.h index 604cfd92045..51742d91c37 100644 --- a/libcudacxx/include/cuda/__ptx/instructions/generated/mbarrier_test_wait_parity.inc +++ b/libcudacxx/include/cuda/__ptx/instructions/generated/mbarrier_test_wait_parity.h @@ -1,6 +1,9 @@ +#ifndef _CUDA_PTX_GENERATED_MBARRIER_TEST_WAIT_PARITY_H_ +#define _CUDA_PTX_GENERATED_MBARRIER_TEST_WAIT_PARITY_H_ + /* // mbarrier.test_wait.parity.shared.b64 waitComplete, [addr], phaseParity; // 3. PTX -ISA 71, SM_80 template +ISA 71, SM_80 template __device__ static inline bool mbarrier_test_wait_parity( uint64_t* addr, const uint32_t& phaseParity); @@ -59,7 +62,7 @@ _CCCL_DEVICE static inline bool mbarrier_test_wait_parity( : "=r"(__waitComplete) : "r"(__as_ptr_smem(__addr)), "r"(__phaseParity) : "memory"); - } _CCCL_ELSE_IF_CONSTEXPR (__scope == scope_cluster) { + } else _CCCL_IF_CONSTEXPR (__scope == scope_cluster) { asm("{\n\t .reg .pred P_OUT; \n\t" "mbarrier.test_wait.parity.acquire.cluster.shared::cta.b64 P_OUT, [%1], %2; // 4. \n\t" "selp.b32 %0, 1, 0, P_OUT; \n" @@ -73,3 +76,5 @@ _CCCL_DEVICE static inline bool mbarrier_test_wait_parity( __cuda_ptx_mbarrier_test_wait_parity_is_not_supported_before_SM_90__(); return false;)); } #endif // __cccl_ptx_isa >= 800 + +#endif // _CUDA_PTX_GENERATED_MBARRIER_TEST_WAIT_PARITY_H_ diff --git a/libcudacxx/include/cuda/__ptx/instructions/generated/mbarrier_try_wait.inc b/libcudacxx/include/cuda/__ptx/instructions/generated/mbarrier_try_wait.h similarity index 94% rename from libcudacxx/include/cuda/__ptx/instructions/generated/mbarrier_try_wait.inc rename to libcudacxx/include/cuda/__ptx/instructions/generated/mbarrier_try_wait.h index c5f2062664c..1560e010675 100644 --- a/libcudacxx/include/cuda/__ptx/instructions/generated/mbarrier_try_wait.inc +++ b/libcudacxx/include/cuda/__ptx/instructions/generated/mbarrier_try_wait.h @@ -1,6 +1,9 @@ +#ifndef _CUDA_PTX_GENERATED_MBARRIER_TRY_WAIT_H_ +#define _CUDA_PTX_GENERATED_MBARRIER_TRY_WAIT_H_ + /* // mbarrier.try_wait.shared::cta.b64 waitComplete, [addr], state; // 5a. -PTX ISA 78, SM_90 template +PTX ISA 78, SM_90 template __device__ static inline bool mbarrier_try_wait( uint64_t* addr, const uint64_t& state); @@ -29,7 +32,7 @@ _CCCL_DEVICE static inline bool mbarrier_try_wait(_CUDA_VSTD::uint64_t* __addr, /* // mbarrier.try_wait.shared::cta.b64 waitComplete, [addr], state, suspendTimeHint; // 5b. PTX -ISA 78, SM_90 template +ISA 78, SM_90 template __device__ static inline bool mbarrier_try_wait( uint64_t* addr, const uint64_t& state, @@ -89,7 +92,7 @@ _CCCL_DEVICE static inline bool mbarrier_try_wait( : "=r"(__waitComplete) : "r"(__as_ptr_smem(__addr)), "l"(__state) : "memory"); - } _CCCL_ELSE_IF_CONSTEXPR (__scope == scope_cluster) { + } else _CCCL_IF_CONSTEXPR (__scope == scope_cluster) { asm("{\n\t .reg .pred P_OUT; \n\t" "mbarrier.try_wait.acquire.cluster.shared::cta.b64 P_OUT, [%1], %2; // 6a. 
" "\n\t" @@ -141,7 +144,7 @@ _CCCL_DEVICE static inline bool mbarrier_try_wait( : "=r"(__waitComplete) : "r"(__as_ptr_smem(__addr)), "l"(__state), "r"(__suspendTimeHint) : "memory"); - } _CCCL_ELSE_IF_CONSTEXPR (__scope == scope_cluster) { + } else _CCCL_IF_CONSTEXPR (__scope == scope_cluster) { asm("{\n\t .reg .pred P_OUT; \n\t" "mbarrier.try_wait.acquire.cluster.shared::cta.b64 P_OUT, [%1], %2 , %3; // 6b. \n\t" "selp.b32 %0, 1, 0, P_OUT; \n" @@ -155,3 +158,5 @@ _CCCL_DEVICE static inline bool mbarrier_try_wait( __cuda_ptx_mbarrier_try_wait_is_not_supported_before_SM_90__(); return false;)); } #endif // __cccl_ptx_isa >= 800 + +#endif // _CUDA_PTX_GENERATED_MBARRIER_TRY_WAIT_H_ diff --git a/libcudacxx/include/cuda/__ptx/instructions/generated/mbarrier_try_wait_parity.inc b/libcudacxx/include/cuda/__ptx/instructions/generated/mbarrier_try_wait_parity.h similarity index 94% rename from libcudacxx/include/cuda/__ptx/instructions/generated/mbarrier_try_wait_parity.inc rename to libcudacxx/include/cuda/__ptx/instructions/generated/mbarrier_try_wait_parity.h index 321bfc515da..64d71d85979 100644 --- a/libcudacxx/include/cuda/__ptx/instructions/generated/mbarrier_try_wait_parity.inc +++ b/libcudacxx/include/cuda/__ptx/instructions/generated/mbarrier_try_wait_parity.h @@ -1,6 +1,9 @@ +#ifndef _CUDA_PTX_GENERATED_MBARRIER_TRY_WAIT_PARITY_H_ +#define _CUDA_PTX_GENERATED_MBARRIER_TRY_WAIT_PARITY_H_ + /* // mbarrier.try_wait.parity.shared::cta.b64 waitComplete, [addr], phaseParity; // 7a. -PTX ISA 78, SM_90 template +PTX ISA 78, SM_90 template __device__ static inline bool mbarrier_try_wait_parity( uint64_t* addr, const uint32_t& phaseParity); @@ -30,7 +33,7 @@ mbarrier_try_wait_parity(_CUDA_VSTD::uint64_t* __addr, const _CUDA_VSTD::uint32_ /* // mbarrier.try_wait.parity.shared::cta.b64 waitComplete, [addr], phaseParity, suspendTimeHint; // 7b. -PTX ISA 78, SM_90 template +PTX ISA 78, SM_90 template __device__ static inline bool mbarrier_try_wait_parity( uint64_t* addr, const uint32_t& phaseParity, @@ -90,7 +93,7 @@ _CCCL_DEVICE static inline bool mbarrier_try_wait_parity( : "=r"(__waitComplete) : "r"(__as_ptr_smem(__addr)), "r"(__phaseParity) : "memory"); - } _CCCL_ELSE_IF_CONSTEXPR (__scope == scope_cluster) { + } else _CCCL_IF_CONSTEXPR (__scope == scope_cluster) { asm("{\n\t .reg .pred P_OUT; \n\t" "mbarrier.try_wait.parity.acquire.cluster.shared::cta.b64 P_OUT, [%1], %2; // 8a. \n\t" "selp.b32 %0, 1, 0, P_OUT; \n" @@ -141,7 +144,7 @@ _CCCL_DEVICE static inline bool mbarrier_try_wait_parity( : "=r"(__waitComplete) : "r"(__as_ptr_smem(__addr)), "r"(__phaseParity), "r"(__suspendTimeHint) : "memory"); - } _CCCL_ELSE_IF_CONSTEXPR (__scope == scope_cluster) { + } else _CCCL_IF_CONSTEXPR (__scope == scope_cluster) { asm("{\n\t .reg .pred P_OUT; \n\t" "mbarrier.try_wait.parity.acquire.cluster.shared::cta.b64 P_OUT, [%1], %2, %3; // 8b. 
\n\t" "selp.b32 %0, 1, 0, P_OUT; \n" @@ -155,3 +158,5 @@ _CCCL_DEVICE static inline bool mbarrier_try_wait_parity( __cuda_ptx_mbarrier_try_wait_parity_is_not_supported_before_SM_90__(); return false;)); } #endif // __cccl_ptx_isa >= 800 + +#endif // _CUDA_PTX_GENERATED_MBARRIER_TRY_WAIT_PARITY_H_ diff --git a/libcudacxx/include/cuda/__ptx/instructions/generated/red_async.inc b/libcudacxx/include/cuda/__ptx/instructions/generated/red_async.h similarity index 97% rename from libcudacxx/include/cuda/__ptx/instructions/generated/red_async.inc rename to libcudacxx/include/cuda/__ptx/instructions/generated/red_async.h index 3157fa1c627..5ac257a1615 100644 --- a/libcudacxx/include/cuda/__ptx/instructions/generated/red_async.inc +++ b/libcudacxx/include/cuda/__ptx/instructions/generated/red_async.h @@ -1,9 +1,12 @@ +#ifndef _CUDA_PTX_GENERATED_RED_ASYNC_H_ +#define _CUDA_PTX_GENERATED_RED_ASYNC_H_ + /* // red.async.relaxed.cluster.shared::cluster.mbarrier::complete_tx::bytes{.op}{.type} [dest], value, [remote_bar]; // PTX ISA 81, SM_90 // .type = { .u32 } // .op = { .inc } -template +template __device__ static inline void red_async( cuda::ptx::op_inc_t, uint32_t* dest, @@ -35,7 +38,7 @@ _CCCL_DEVICE static inline void red_async( PTX ISA 81, SM_90 // .type = { .u32 } // .op = { .dec } -template +template __device__ static inline void red_async( cuda::ptx::op_dec_t, uint32_t* dest, @@ -67,7 +70,7 @@ _CCCL_DEVICE static inline void red_async( PTX ISA 81, SM_90 // .type = { .u32 } // .op = { .min } -template +template __device__ static inline void red_async( cuda::ptx::op_min_t, uint32_t* dest, @@ -99,7 +102,7 @@ _CCCL_DEVICE static inline void red_async( PTX ISA 81, SM_90 // .type = { .u32 } // .op = { .max } -template +template __device__ static inline void red_async( cuda::ptx::op_max_t, uint32_t* dest, @@ -131,7 +134,7 @@ _CCCL_DEVICE static inline void red_async( PTX ISA 81, SM_90 // .type = { .u32 } // .op = { .add } -template +template __device__ static inline void red_async( cuda::ptx::op_add_t, uint32_t* dest, @@ -163,7 +166,7 @@ _CCCL_DEVICE static inline void red_async( PTX ISA 81, SM_90 // .type = { .s32 } // .op = { .min } -template +template __device__ static inline void red_async( cuda::ptx::op_min_t, int32_t* dest, @@ -195,7 +198,7 @@ red_async(op_min_t, _CUDA_VSTD::int32_t* __dest, const _CUDA_VSTD::int32_t& __va PTX ISA 81, SM_90 // .type = { .s32 } // .op = { .max } -template +template __device__ static inline void red_async( cuda::ptx::op_max_t, int32_t* dest, @@ -227,7 +230,7 @@ red_async(op_max_t, _CUDA_VSTD::int32_t* __dest, const _CUDA_VSTD::int32_t& __va PTX ISA 81, SM_90 // .type = { .s32 } // .op = { .add } -template +template __device__ static inline void red_async( cuda::ptx::op_add_t, int32_t* dest, @@ -358,7 +361,7 @@ red_async(op_xor_op_t, _B32* __dest, const _B32& __value, _CUDA_VSTD::uint64_t* PTX ISA 81, SM_90 // .type = { .u64 } // .op = { .add } -template +template __device__ static inline void red_async( cuda::ptx::op_add_t, uint64_t* dest, @@ -389,7 +392,7 @@ _CCCL_DEVICE static inline void red_async( // red.async.relaxed.cluster.shared::cluster.mbarrier::complete_tx::bytes{.op}.u64 [dest], value, [remote_bar]; // .u64 intentional PTX ISA 81, SM_90 // .op = { .add } -template +template __device__ static inline void red_async( cuda::ptx::op_add_t, int64_t* dest, @@ -415,3 +418,5 @@ red_async(op_add_t, _CUDA_VSTD::int64_t* __dest, const _CUDA_VSTD::int64_t& __va __cuda_ptx_red_async_is_not_supported_before_SM_90__();)); } #endif // __cccl_ptx_isa >= 810 + +#endif 
// _CUDA_PTX_GENERATED_RED_ASYNC_H_ diff --git a/libcudacxx/include/cuda/__ptx/instructions/generated/st_async.inc b/libcudacxx/include/cuda/__ptx/instructions/generated/st_async.h similarity index 94% rename from libcudacxx/include/cuda/__ptx/instructions/generated/st_async.inc rename to libcudacxx/include/cuda/__ptx/instructions/generated/st_async.h index 9dfab243ffe..fdd31ff6c77 100644 --- a/libcudacxx/include/cuda/__ptx/instructions/generated/st_async.inc +++ b/libcudacxx/include/cuda/__ptx/instructions/generated/st_async.h @@ -1,3 +1,6 @@ +#ifndef _CUDA_PTX_GENERATED_ST_ASYNC_H_ +#define _CUDA_PTX_GENERATED_ST_ASYNC_H_ + /* // st.async.weak.shared::cluster.mbarrier::complete_tx::bytes{.type} [addr], value, [remote_bar]; // 1. PTX ISA 81, SM_90 @@ -22,7 +25,7 @@ _CCCL_DEVICE static inline void st_async(_Type* __addr, const _Type& __value, _C : : "r"(__as_ptr_remote_dsmem(__addr)), "r"(__as_b32(__value)), "r"(__as_ptr_remote_dsmem(__remote_bar)) : "memory"); - } _CCCL_ELSE_IF_CONSTEXPR (sizeof(_Type) == 8) { + } else _CCCL_IF_CONSTEXPR (sizeof(_Type) == 8) { asm("st.async.weak.shared::cluster.mbarrier::complete_tx::bytes.b64 [%0], %1, [%2]; // 1. " : : "r"(__as_ptr_remote_dsmem(__addr)), "l"(__as_b64(__value)), "r"(__as_ptr_remote_dsmem(__remote_bar)) @@ -61,7 +64,7 @@ _CCCL_DEVICE static inline void st_async(_Type* __addr, const _Type (&__value)[2 "r"(__as_b32(__value[1])), "r"(__as_ptr_remote_dsmem(__remote_bar)) : "memory"); - } _CCCL_ELSE_IF_CONSTEXPR (sizeof(_Type) == 8) { + } else _CCCL_IF_CONSTEXPR (sizeof(_Type) == 8) { asm("st.async.weak.shared::cluster.mbarrier::complete_tx::bytes.v2.b64 [%0], {%1, %2}, [%3]; // 2. " : : "r"(__as_ptr_remote_dsmem(__addr)), @@ -106,3 +109,5 @@ _CCCL_DEVICE static inline void st_async(_B32* __addr, const _B32 (&__value)[4], __cuda_ptx_st_async_is_not_supported_before_SM_90__();)); } #endif // __cccl_ptx_isa >= 810 + +#endif // _CUDA_PTX_GENERATED_ST_ASYNC_H_ diff --git a/libcudacxx/include/cuda/__ptx/instructions/generated/tensormap_cp_fenceproxy.inc b/libcudacxx/include/cuda/__ptx/instructions/generated/tensormap_cp_fenceproxy.h similarity index 86% rename from libcudacxx/include/cuda/__ptx/instructions/generated/tensormap_cp_fenceproxy.inc rename to libcudacxx/include/cuda/__ptx/instructions/generated/tensormap_cp_fenceproxy.h index 033d0606e7f..48a217ea519 100644 --- a/libcudacxx/include/cuda/__ptx/instructions/generated/tensormap_cp_fenceproxy.inc +++ b/libcudacxx/include/cuda/__ptx/instructions/generated/tensormap_cp_fenceproxy.h @@ -1,3 +1,6 @@ +#ifndef _CUDA_PTX_GENERATED_TENSORMAP_CP_FENCEPROXY_H_ +#define _CUDA_PTX_GENERATED_TENSORMAP_CP_FENCEPROXY_H_ + /* // tensormap.cp_fenceproxy.global.shared::cta.tensormap::generic.sem.scope.sync.aligned [dst], [src], size; // PTX ISA 83, SM_90 @@ -28,19 +31,19 @@ tensormap_cp_fenceproxy(sem_release_t, scope_t<_Scope> __scope, void* __dst, con : : "l"(__as_ptr_gmem(__dst)), "r"(__as_ptr_smem(__src)), "n"(__size.value) : "memory"); - } _CCCL_ELSE_IF_CONSTEXPR (__scope == scope_cluster) { + } else _CCCL_IF_CONSTEXPR (__scope == scope_cluster) { asm volatile( "tensormap.cp_fenceproxy.global.shared::cta.tensormap::generic.release.cluster.sync.aligned [%0], [%1], %2;" : : "l"(__as_ptr_gmem(__dst)), "r"(__as_ptr_smem(__src)), "n"(__size.value) : "memory"); - } _CCCL_ELSE_IF_CONSTEXPR (__scope == scope_gpu) { + } else _CCCL_IF_CONSTEXPR (__scope == scope_gpu) { asm volatile( "tensormap.cp_fenceproxy.global.shared::cta.tensormap::generic.release.gpu.sync.aligned [%0], [%1], %2;" : : 
"l"(__as_ptr_gmem(__dst)), "r"(__as_ptr_smem(__src)), "n"(__size.value) : "memory"); - } _CCCL_ELSE_IF_CONSTEXPR (__scope == scope_sys) { + } else _CCCL_IF_CONSTEXPR (__scope == scope_sys) { asm volatile( "tensormap.cp_fenceproxy.global.shared::cta.tensormap::generic.release.sys.sync.aligned [%0], [%1], %2;" : @@ -52,3 +55,5 @@ tensormap_cp_fenceproxy(sem_release_t, scope_t<_Scope> __scope, void* __dst, con __cuda_ptx_tensormap_cp_fenceproxy_is_not_supported_before_SM_90__();)); } #endif // __cccl_ptx_isa >= 830 + +#endif // _CUDA_PTX_GENERATED_TENSORMAP_CP_FENCEPROXY_H_ diff --git a/libcudacxx/include/cuda/__ptx/instructions/generated/tensormap_replace.inc b/libcudacxx/include/cuda/__ptx/instructions/generated/tensormap_replace.h similarity index 99% rename from libcudacxx/include/cuda/__ptx/instructions/generated/tensormap_replace.inc rename to libcudacxx/include/cuda/__ptx/instructions/generated/tensormap_replace.h index 3b1060ead38..2abedca266f 100644 --- a/libcudacxx/include/cuda/__ptx/instructions/generated/tensormap_replace.inc +++ b/libcudacxx/include/cuda/__ptx/instructions/generated/tensormap_replace.h @@ -1,3 +1,6 @@ +#ifndef _CUDA_PTX_GENERATED_TENSORMAP_REPLACE_H_ +#define _CUDA_PTX_GENERATED_TENSORMAP_REPLACE_H_ + /* // tensormap.replace.tile.global_address.space.b1024.b64 [tm_addr], new_val; // PTX ISA 83, SM_90a // .space = { .global } @@ -567,3 +570,5 @@ _CCCL_DEVICE static inline void tensormap_replace_fill_mode(space_shared_t, void __cuda_ptx_tensormap_replace_fill_mode_is_not_supported_before_SM_90a__();)); } #endif // __cccl_ptx_isa >= 830 + +#endif // _CUDA_PTX_GENERATED_TENSORMAP_REPLACE_H_ diff --git a/libcudacxx/include/cuda/__ptx/instructions/get_sreg.h b/libcudacxx/include/cuda/__ptx/instructions/get_sreg.h index 033005beb5b..3157f7d1da9 100644 --- a/libcudacxx/include/cuda/__ptx/instructions/get_sreg.h +++ b/libcudacxx/include/cuda/__ptx/instructions/get_sreg.h @@ -32,7 +32,7 @@ _LIBCUDACXX_BEGIN_NAMESPACE_CUDA_PTX // 10. Special Registers // https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#special-registers -#include +#include _LIBCUDACXX_END_NAMESPACE_CUDA_PTX diff --git a/libcudacxx/include/cuda/__ptx/instructions/getctarank.h b/libcudacxx/include/cuda/__ptx/instructions/getctarank.h index f5ed3424d3b..c41084f5ae3 100644 --- a/libcudacxx/include/cuda/__ptx/instructions/getctarank.h +++ b/libcudacxx/include/cuda/__ptx/instructions/getctarank.h @@ -32,7 +32,7 @@ _LIBCUDACXX_BEGIN_NAMESPACE_CUDA_PTX // 9.7.8.23. Data Movement and Conversion Instructions: getctarank // https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#data-movement-and-conversion-instructions-getctarank -#include +#include _LIBCUDACXX_END_NAMESPACE_CUDA_PTX diff --git a/libcudacxx/include/cuda/__ptx/instructions/mbarrier_arrive.h b/libcudacxx/include/cuda/__ptx/instructions/mbarrier_arrive.h index fb1341a61d8..0a44942df82 100644 --- a/libcudacxx/include/cuda/__ptx/instructions/mbarrier_arrive.h +++ b/libcudacxx/include/cuda/__ptx/instructions/mbarrier_arrive.h @@ -32,9 +32,9 @@ _LIBCUDACXX_BEGIN_NAMESPACE_CUDA_PTX // 9.7.12.15.13. 
Parallel Synchronization and Communication Instructions: mbarrier.arrive // https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#parallel-synchronization-and-communication-instructions-mbarrier-arrive -#include <cuda/__ptx/instructions/generated/mbarrier_arrive.inc> -#include <cuda/__ptx/instructions/generated/mbarrier_arrive_expect_tx.inc> -#include <cuda/__ptx/instructions/generated/mbarrier_arrive_no_complete.inc> +#include <cuda/__ptx/instructions/generated/mbarrier_arrive.h> +#include <cuda/__ptx/instructions/generated/mbarrier_arrive_expect_tx.h> +#include <cuda/__ptx/instructions/generated/mbarrier_arrive_no_complete.h> _LIBCUDACXX_END_NAMESPACE_CUDA_PTX diff --git a/libcudacxx/include/cuda/__ptx/instructions/mbarrier_init.h b/libcudacxx/include/cuda/__ptx/instructions/mbarrier_init.h index 575abda7a41..b3539245e03 100644 --- a/libcudacxx/include/cuda/__ptx/instructions/mbarrier_init.h +++ b/libcudacxx/include/cuda/__ptx/instructions/mbarrier_init.h @@ -32,7 +32,7 @@ _LIBCUDACXX_BEGIN_NAMESPACE_CUDA_PTX // 9.7.12.15.9. Parallel Synchronization and Communication Instructions: mbarrier.init // https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#parallel-synchronization-and-communication-instructions-mbarrier-init -#include <cuda/__ptx/instructions/generated/mbarrier_init.inc> +#include <cuda/__ptx/instructions/generated/mbarrier_init.h> _LIBCUDACXX_END_NAMESPACE_CUDA_PTX diff --git a/libcudacxx/include/cuda/__ptx/instructions/mbarrier_wait.h b/libcudacxx/include/cuda/__ptx/instructions/mbarrier_wait.h index 2d6adb78eec..dfcc03bc01c 100644 --- a/libcudacxx/include/cuda/__ptx/instructions/mbarrier_wait.h +++ b/libcudacxx/include/cuda/__ptx/instructions/mbarrier_wait.h @@ -32,10 +32,10 @@ _LIBCUDACXX_BEGIN_NAMESPACE_CUDA_PTX // 9.7.12.15.16. Parallel Synchronization and Communication Instructions: mbarrier.test_wait/mbarrier.try_wait // https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#parallel-synchronization-and-communication-instructions-mbarrier-test-wait-mbarrier-try-wait -#include <cuda/__ptx/instructions/generated/mbarrier_test_wait.inc> -#include <cuda/__ptx/instructions/generated/mbarrier_test_wait_parity.inc> -#include <cuda/__ptx/instructions/generated/mbarrier_try_wait.inc> -#include <cuda/__ptx/instructions/generated/mbarrier_try_wait_parity.inc> +#include <cuda/__ptx/instructions/generated/mbarrier_test_wait.h> +#include <cuda/__ptx/instructions/generated/mbarrier_test_wait_parity.h> +#include <cuda/__ptx/instructions/generated/mbarrier_try_wait.h> +#include <cuda/__ptx/instructions/generated/mbarrier_try_wait_parity.h> _LIBCUDACXX_END_NAMESPACE_CUDA_PTX diff --git a/libcudacxx/include/cuda/__ptx/instructions/red_async.h b/libcudacxx/include/cuda/__ptx/instructions/red_async.h index a610cf2b583..d14a96dc725 100644 --- a/libcudacxx/include/cuda/__ptx/instructions/red_async.h +++ b/libcudacxx/include/cuda/__ptx/instructions/red_async.h @@ -32,7 +32,7 @@ _LIBCUDACXX_BEGIN_NAMESPACE_CUDA_PTX // 9.7.12.7. Parallel Synchronization and Communication Instructions: red.async // https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#parallel-synchronization-and-communication-instructions-red-async -#include <cuda/__ptx/instructions/generated/red_async.inc> +#include <cuda/__ptx/instructions/generated/red_async.h> _LIBCUDACXX_END_NAMESPACE_CUDA_PTX diff --git a/libcudacxx/include/cuda/__ptx/instructions/st_async.h b/libcudacxx/include/cuda/__ptx/instructions/st_async.h index 09199b4a3ce..ffad9f176d0 100644 --- a/libcudacxx/include/cuda/__ptx/instructions/st_async.h +++ b/libcudacxx/include/cuda/__ptx/instructions/st_async.h @@ -32,7 +32,7 @@ _LIBCUDACXX_BEGIN_NAMESPACE_CUDA_PTX // 9.7.8.12. Data Movement and Conversion Instructions: st.async // https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#data-movement-and-conversion-instructions-st-async -#include <cuda/__ptx/instructions/generated/st_async.inc> +#include <cuda/__ptx/instructions/generated/st_async.h> _LIBCUDACXX_END_NAMESPACE_CUDA_PTX diff --git a/libcudacxx/include/cuda/__ptx/instructions/tensormap_cp_fenceproxy.h b/libcudacxx/include/cuda/__ptx/instructions/tensormap_cp_fenceproxy.h index de179f69735..22eaa502305 100644 --- a/libcudacxx/include/cuda/__ptx/instructions/tensormap_cp_fenceproxy.h +++ b/libcudacxx/include/cuda/__ptx/instructions/tensormap_cp_fenceproxy.h @@ -32,7 +32,7 @@ _LIBCUDACXX_BEGIN_NAMESPACE_CUDA_PTX // 9.7.12.15.18.
Parallel Synchronization and Communication Instructions: tensormap.cp_fenceproxy // https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#parallel-synchronization-and-communication-instructions-tensormap-cp-fenceproxy -#include <cuda/__ptx/instructions/generated/tensormap_cp_fenceproxy.inc> +#include <cuda/__ptx/instructions/generated/tensormap_cp_fenceproxy.h> _LIBCUDACXX_END_NAMESPACE_CUDA_PTX diff --git a/libcudacxx/include/cuda/__ptx/instructions/tensormap_replace.h b/libcudacxx/include/cuda/__ptx/instructions/tensormap_replace.h index 2f81d8b4361..681a820b070 100644 --- a/libcudacxx/include/cuda/__ptx/instructions/tensormap_replace.h +++ b/libcudacxx/include/cuda/__ptx/instructions/tensormap_replace.h @@ -32,7 +32,7 @@ _LIBCUDACXX_BEGIN_NAMESPACE_CUDA_PTX // 9.7.8.25. Data Movement and Conversion Instructions: tensormap.replace // https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#data-movement-and-conversion-instructions-tensormap-replace -#include <cuda/__ptx/instructions/generated/tensormap_replace.inc> +#include <cuda/__ptx/instructions/generated/tensormap_replace.h> _LIBCUDACXX_END_NAMESPACE_CUDA_PTX