Skip to content

Commit aededb9

Browse files
bernhardmgruber and davebayer
authored and committed
Regenerate cuda::ptx headers/docs and run format (NVIDIA#2937)
Overwrites all generated PTX header and documentation files and runs `pre-commit run --all-files`. Also exclude generated PTX headers from header check.
1 parent 3c6fcd2 commit aededb9

File tree

78 files changed

+631
-332
lines changed

Some content is hidden

Large Commits have some content hidden by default. Use the searchbox below for content that may be hidden.

78 files changed

+631
-332
lines changed

docs/libcudacxx/ptx/instructions/generated/barrier_cluster.rst

+8-5
Original file line numberDiff line numberDiff line change
@@ -1,10 +1,13 @@
1+
..
2+
This file was automatically generated. Do not edit.
3+
14
barrier.cluster.arrive
25
^^^^^^^^^^^^^^^^^^^^^^
36
.. code:: cuda
47
58
// barrier.cluster.arrive; // PTX ISA 78, SM_90
69
// Marked volatile and as clobbering memory
7-
template <typename=void>
10+
template <typename = void>
811
__device__ static inline void barrier_cluster_arrive();
912
1013
barrier.cluster.wait
@@ -13,7 +16,7 @@ barrier.cluster.wait
1316
1417
// barrier.cluster.wait; // PTX ISA 78, SM_90
1518
// Marked volatile and as clobbering memory
16-
template <typename=void>
19+
template <typename = void>
1720
__device__ static inline void barrier_cluster_wait();
1821
1922
barrier.cluster.arrive.release
@@ -23,7 +26,7 @@ barrier.cluster.arrive.release
2326
// barrier.cluster.arrive.sem; // PTX ISA 80, SM_90
2427
// .sem = { .release }
2528
// Marked volatile and as clobbering memory
26-
template <typename=void>
29+
template <typename = void>
2730
__device__ static inline void barrier_cluster_arrive(
2831
cuda::ptx::sem_release_t);
2932
@@ -34,7 +37,7 @@ barrier.cluster.arrive.relaxed
3437
// barrier.cluster.arrive.sem; // PTX ISA 80, SM_90
3538
// .sem = { .relaxed }
3639
// Marked volatile
37-
template <typename=void>
40+
template <typename = void>
3841
__device__ static inline void barrier_cluster_arrive(
3942
cuda::ptx::sem_relaxed_t);
4043
@@ -45,6 +48,6 @@ barrier.cluster.wait.acquire
4548
// barrier.cluster.wait.sem; // PTX ISA 80, SM_90
4649
// .sem = { .acquire }
4750
// Marked volatile and as clobbering memory
48-
template <typename=void>
51+
template <typename = void>
4952
__device__ static inline void barrier_cluster_wait(
5053
cuda::ptx::sem_acquire_t);

docs/libcudacxx/ptx/instructions/generated/cp_async_bulk.rst

+6-3
Original file line numberDiff line numberDiff line change
@@ -1,11 +1,14 @@
1+
..
2+
This file was automatically generated. Do not edit.
3+
14
cp.async.bulk.shared::cluster.global.mbarrier::complete_tx::bytes
25
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
36
.. code:: cuda
47
58
// cp.async.bulk.dst.src.mbarrier::complete_tx::bytes [dstMem], [srcMem], size, [smem_bar]; // 1a. unicast PTX ISA 80, SM_90
69
// .dst = { .shared::cluster }
710
// .src = { .global }
8-
template <typename=void>
11+
template <typename = void>
912
__device__ static inline void cp_async_bulk(
1013
cuda::ptx::space_cluster_t,
1114
cuda::ptx::space_global_t,
@@ -21,7 +24,7 @@ cp.async.bulk.shared::cluster.shared::cta.mbarrier::complete_tx::bytes
2124
// cp.async.bulk.dst.src.mbarrier::complete_tx::bytes [dstMem], [srcMem], size, [rdsmem_bar]; // 2. PTX ISA 80, SM_90
2225
// .dst = { .shared::cluster }
2326
// .src = { .shared::cta }
24-
template <typename=void>
27+
template <typename = void>
2528
__device__ static inline void cp_async_bulk(
2629
cuda::ptx::space_cluster_t,
2730
cuda::ptx::space_shared_t,
@@ -37,7 +40,7 @@ cp.async.bulk.global.shared::cta.bulk_group
3740
// cp.async.bulk.dst.src.bulk_group [dstMem], [srcMem], size; // 3. PTX ISA 80, SM_90
3841
// .dst = { .global }
3942
// .src = { .shared::cta }
40-
template <typename=void>
43+
template <typename = void>
4144
__device__ static inline void cp_async_bulk(
4245
cuda::ptx::space_global_t,
4346
cuda::ptx::space_shared_t,
Original file line numberDiff line numberDiff line change
@@ -1,7 +1,10 @@
1+
..
2+
This file was automatically generated. Do not edit.
3+
14
cp.async.bulk.commit_group
25
^^^^^^^^^^^^^^^^^^^^^^^^^^
36
.. code:: cuda
47
58
// cp.async.bulk.commit_group; // PTX ISA 80, SM_90
6-
template <typename=void>
9+
template <typename = void>
710
__device__ static inline void cp_async_bulk_commit_group();

docs/libcudacxx/ptx/instructions/generated/cp_async_bulk_multicast.rst

+4-1
Original file line numberDiff line numberDiff line change
@@ -1,11 +1,14 @@
1+
..
2+
This file was automatically generated. Do not edit.
3+
14
cp.async.bulk.shared::cluster.global.mbarrier::complete_tx::bytes.multicast::cluster
25
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
36
.. code:: cuda
47
58
// cp.async.bulk{.dst}{.src}.mbarrier::complete_tx::bytes.multicast::cluster [dstMem], [srcMem], size, [smem_bar], ctaMask; // 1. PTX ISA 80, SM_90a
69
// .dst = { .shared::cluster }
710
// .src = { .global }
8-
template <typename=void>
11+
template <typename = void>
912
__device__ static inline void cp_async_bulk(
1013
cuda::ptx::space_cluster_t,
1114
cuda::ptx::space_global_t,

docs/libcudacxx/ptx/instructions/generated/cp_async_bulk_tensor.rst

+13-10
Original file line numberDiff line numberDiff line change
@@ -1,11 +1,14 @@
1+
..
2+
This file was automatically generated. Do not edit.
3+
14
cp.async.bulk.tensor.1d.shared::cluster.global.tile.mbarrier::complete_tx::bytes
25
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
36
.. code:: cuda
47
58
// cp.async.bulk.tensor.1d.dst.src.tile.mbarrier::complete_tx::bytes [dstMem], [tensorMap, tensorCoords], [smem_bar];// 1a. PTX ISA 80, SM_90
69
// .dst = { .shared::cluster }
710
// .src = { .global }
8-
template <typename=void>
11+
template <typename = void>
912
__device__ static inline void cp_async_bulk_tensor(
1013
cuda::ptx::space_cluster_t,
1114
cuda::ptx::space_global_t,
@@ -21,7 +24,7 @@ cp.async.bulk.tensor.1d.global.shared::cta.tile.bulk_group
2124
// cp.async.bulk.tensor.1d.dst.src.tile.bulk_group [tensorMap, tensorCoords], [srcMem]; // 3a. PTX ISA 80, SM_90
2225
// .dst = { .global }
2326
// .src = { .shared::cta }
24-
template <typename=void>
27+
template <typename = void>
2528
__device__ static inline void cp_async_bulk_tensor(
2629
cuda::ptx::space_global_t,
2730
cuda::ptx::space_shared_t,
@@ -36,7 +39,7 @@ cp.async.bulk.tensor.2d.shared::cluster.global.tile.mbarrier::complete_tx::bytes
3639
// cp.async.bulk.tensor.2d.dst.src.tile.mbarrier::complete_tx::bytes [dstMem], [tensorMap, tensorCoords], [smem_bar];// 1b. PTX ISA 80, SM_90
3740
// .dst = { .shared::cluster }
3841
// .src = { .global }
39-
template <typename=void>
42+
template <typename = void>
4043
__device__ static inline void cp_async_bulk_tensor(
4144
cuda::ptx::space_cluster_t,
4245
cuda::ptx::space_global_t,
@@ -52,7 +55,7 @@ cp.async.bulk.tensor.2d.global.shared::cta.tile.bulk_group
5255
// cp.async.bulk.tensor.2d.dst.src.tile.bulk_group [tensorMap, tensorCoords], [srcMem]; // 3b. PTX ISA 80, SM_90
5356
// .dst = { .global }
5457
// .src = { .shared::cta }
55-
template <typename=void>
58+
template <typename = void>
5659
__device__ static inline void cp_async_bulk_tensor(
5760
cuda::ptx::space_global_t,
5861
cuda::ptx::space_shared_t,
@@ -67,7 +70,7 @@ cp.async.bulk.tensor.3d.shared::cluster.global.tile.mbarrier::complete_tx::bytes
6770
// cp.async.bulk.tensor.3d.dst.src.tile.mbarrier::complete_tx::bytes [dstMem], [tensorMap, tensorCoords], [smem_bar];// 1c. PTX ISA 80, SM_90
6871
// .dst = { .shared::cluster }
6972
// .src = { .global }
70-
template <typename=void>
73+
template <typename = void>
7174
__device__ static inline void cp_async_bulk_tensor(
7275
cuda::ptx::space_cluster_t,
7376
cuda::ptx::space_global_t,
@@ -83,7 +86,7 @@ cp.async.bulk.tensor.3d.global.shared::cta.tile.bulk_group
8386
// cp.async.bulk.tensor.3d.dst.src.tile.bulk_group [tensorMap, tensorCoords], [srcMem]; // 3c. PTX ISA 80, SM_90
8487
// .dst = { .global }
8588
// .src = { .shared::cta }
86-
template <typename=void>
89+
template <typename = void>
8790
__device__ static inline void cp_async_bulk_tensor(
8891
cuda::ptx::space_global_t,
8992
cuda::ptx::space_shared_t,
@@ -98,7 +101,7 @@ cp.async.bulk.tensor.4d.shared::cluster.global.tile.mbarrier::complete_tx::bytes
98101
// cp.async.bulk.tensor.4d.dst.src.tile.mbarrier::complete_tx::bytes [dstMem], [tensorMap, tensorCoords], [smem_bar];// 1d. PTX ISA 80, SM_90
99102
// .dst = { .shared::cluster }
100103
// .src = { .global }
101-
template <typename=void>
104+
template <typename = void>
102105
__device__ static inline void cp_async_bulk_tensor(
103106
cuda::ptx::space_cluster_t,
104107
cuda::ptx::space_global_t,
@@ -114,7 +117,7 @@ cp.async.bulk.tensor.4d.global.shared::cta.tile.bulk_group
114117
// cp.async.bulk.tensor.4d.dst.src.tile.bulk_group [tensorMap, tensorCoords], [srcMem]; // 3d. PTX ISA 80, SM_90
115118
// .dst = { .global }
116119
// .src = { .shared::cta }
117-
template <typename=void>
120+
template <typename = void>
118121
__device__ static inline void cp_async_bulk_tensor(
119122
cuda::ptx::space_global_t,
120123
cuda::ptx::space_shared_t,
@@ -129,7 +132,7 @@ cp.async.bulk.tensor.5d.shared::cluster.global.tile.mbarrier::complete_tx::bytes
129132
// cp.async.bulk.tensor.5d.dst.src.tile.mbarrier::complete_tx::bytes [dstMem], [tensorMap, tensorCoords], [smem_bar];// 1e. PTX ISA 80, SM_90
130133
// .dst = { .shared::cluster }
131134
// .src = { .global }
132-
template <typename=void>
135+
template <typename = void>
133136
__device__ static inline void cp_async_bulk_tensor(
134137
cuda::ptx::space_cluster_t,
135138
cuda::ptx::space_global_t,
@@ -145,7 +148,7 @@ cp.async.bulk.tensor.5d.global.shared::cta.tile.bulk_group
145148
// cp.async.bulk.tensor.5d.dst.src.tile.bulk_group [tensorMap, tensorCoords], [srcMem]; // 3e. PTX ISA 80, SM_90
146149
// .dst = { .global }
147150
// .src = { .shared::cta }
148-
template <typename=void>
151+
template <typename = void>
149152
__device__ static inline void cp_async_bulk_tensor(
150153
cuda::ptx::space_global_t,
151154
cuda::ptx::space_shared_t,

docs/libcudacxx/ptx/instructions/generated/cp_async_bulk_tensor_multicast.rst

+8-5
Original file line numberDiff line numberDiff line change
@@ -1,11 +1,14 @@
1+
..
2+
This file was automatically generated. Do not edit.
3+
14
cp.async.bulk.tensor.1d.shared::cluster.global.tile.mbarrier::complete_tx::bytes.multicast::cluster
25
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
36
.. code:: cuda
47
58
// cp.async.bulk.tensor.1d.dst.src.tile.mbarrier::complete_tx::bytes.multicast::cluster [dstMem], [tensorMap, tensorCoords], [smem_bar], ctaMask; // 2a. PTX ISA 80, SM_90a
69
// .dst = { .shared::cluster }
710
// .src = { .global }
8-
template <typename=void>
11+
template <typename = void>
912
__device__ static inline void cp_async_bulk_tensor(
1013
cuda::ptx::space_cluster_t,
1114
cuda::ptx::space_global_t,
@@ -22,7 +25,7 @@ cp.async.bulk.tensor.2d.shared::cluster.global.tile.mbarrier::complete_tx::bytes
2225
// cp.async.bulk.tensor.2d.dst.src.tile.mbarrier::complete_tx::bytes.multicast::cluster [dstMem], [tensorMap, tensorCoords], [smem_bar], ctaMask; // 2b. PTX ISA 80, SM_90a
2326
// .dst = { .shared::cluster }
2427
// .src = { .global }
25-
template <typename=void>
28+
template <typename = void>
2629
__device__ static inline void cp_async_bulk_tensor(
2730
cuda::ptx::space_cluster_t,
2831
cuda::ptx::space_global_t,
@@ -39,7 +42,7 @@ cp.async.bulk.tensor.3d.shared::cluster.global.tile.mbarrier::complete_tx::bytes
3942
// cp.async.bulk.tensor.3d.dst.src.tile.mbarrier::complete_tx::bytes.multicast::cluster [dstMem], [tensorMap, tensorCoords], [smem_bar], ctaMask; // 2c. PTX ISA 80, SM_90a
4043
// .dst = { .shared::cluster }
4144
// .src = { .global }
42-
template <typename=void>
45+
template <typename = void>
4346
__device__ static inline void cp_async_bulk_tensor(
4447
cuda::ptx::space_cluster_t,
4548
cuda::ptx::space_global_t,
@@ -56,7 +59,7 @@ cp.async.bulk.tensor.4d.shared::cluster.global.tile.mbarrier::complete_tx::bytes
5659
// cp.async.bulk.tensor.4d.dst.src.tile.mbarrier::complete_tx::bytes.multicast::cluster [dstMem], [tensorMap, tensorCoords], [smem_bar], ctaMask; // 2d. PTX ISA 80, SM_90a
5760
// .dst = { .shared::cluster }
5861
// .src = { .global }
59-
template <typename=void>
62+
template <typename = void>
6063
__device__ static inline void cp_async_bulk_tensor(
6164
cuda::ptx::space_cluster_t,
6265
cuda::ptx::space_global_t,
@@ -73,7 +76,7 @@ cp.async.bulk.tensor.5d.shared::cluster.global.tile.mbarrier::complete_tx::bytes
7376
// cp.async.bulk.tensor.5d.dst.src.tile.mbarrier::complete_tx::bytes.multicast::cluster [dstMem], [tensorMap, tensorCoords], [smem_bar], ctaMask; // 2e. PTX ISA 80, SM_90a
7477
// .dst = { .shared::cluster }
7578
// .src = { .global }
76-
template <typename=void>
79+
template <typename = void>
7780
__device__ static inline void cp_async_bulk_tensor(
7881
cuda::ptx::space_cluster_t,
7982
cuda::ptx::space_global_t,

docs/libcudacxx/ptx/instructions/generated/cp_async_bulk_wait_group.rst

+3
Original file line numberDiff line numberDiff line change
@@ -1,3 +1,6 @@
1+
..
2+
This file was automatically generated. Do not edit.
3+
14
cp.async.bulk.wait_group
25
^^^^^^^^^^^^^^^^^^^^^^^^
36
.. code:: cuda

0 commit comments

Comments (0)