Regenerate PTX header and docs

Overwrites all generated PTX header and documentation files and runs `pre-commit run --all-files`.
NVIDIA · Nov 25, 2024 · 07fb9ad
1 parent 0b36a7d
commit 07fb9ad
Show file tree

Hide file tree

Showing 69 changed files with 478 additions and 332 deletions.
diff --git a/docs/libcudacxx/ptx/instructions/generated/barrier_cluster.rst b/docs/libcudacxx/ptx/instructions/generated/barrier_cluster.rst
@@ -4,7 +4,7 @@ barrier.cluster.arrive
 
    // barrier.cluster.arrive; // PTX ISA 78, SM_90
    // Marked volatile and as clobbering memory
-   template <typename=void>
+   template <typename = void>
    __device__ static inline void barrier_cluster_arrive();
 
 barrier.cluster.wait
@@ -13,7 +13,7 @@ barrier.cluster.wait
 
    // barrier.cluster.wait; // PTX ISA 78, SM_90
    // Marked volatile and as clobbering memory
-   template <typename=void>
+   template <typename = void>
    __device__ static inline void barrier_cluster_wait();
 
 barrier.cluster.arrive.release
@@ -23,7 +23,7 @@ barrier.cluster.arrive.release
    // barrier.cluster.arrive.sem; // PTX ISA 80, SM_90
    // .sem       = { .release }
    // Marked volatile and as clobbering memory
-   template <typename=void>
+   template <typename = void>
    __device__ static inline void barrier_cluster_arrive(
      cuda::ptx::sem_release_t);
 
@@ -34,7 +34,7 @@ barrier.cluster.arrive.relaxed
    // barrier.cluster.arrive.sem; // PTX ISA 80, SM_90
    // .sem       = { .relaxed }
    // Marked volatile
-   template <typename=void>
+   template <typename = void>
    __device__ static inline void barrier_cluster_arrive(
      cuda::ptx::sem_relaxed_t);
 
@@ -45,6 +45,6 @@ barrier.cluster.wait.acquire
    // barrier.cluster.wait.sem; // PTX ISA 80, SM_90
    // .sem       = { .acquire }
    // Marked volatile and as clobbering memory
-   template <typename=void>
+   template <typename = void>
    __device__ static inline void barrier_cluster_wait(
      cuda::ptx::sem_acquire_t);
diff --git a/docs/libcudacxx/ptx/instructions/generated/cp_async_bulk.rst b/docs/libcudacxx/ptx/instructions/generated/cp_async_bulk.rst
@@ -5,7 +5,7 @@ cp.async.bulk.shared::cluster.global.mbarrier::complete_tx::bytes
    // cp.async.bulk.dst.src.mbarrier::complete_tx::bytes [dstMem], [srcMem], size, [smem_bar]; // 1a. unicast PTX ISA 80, SM_90
    // .dst       = { .shared::cluster }
    // .src       = { .global }
-   template <typename=void>
+   template <typename = void>
    __device__ static inline void cp_async_bulk(
      cuda::ptx::space_cluster_t,
      cuda::ptx::space_global_t,
@@ -21,7 +21,7 @@ cp.async.bulk.shared::cluster.shared::cta.mbarrier::complete_tx::bytes
    // cp.async.bulk.dst.src.mbarrier::complete_tx::bytes [dstMem], [srcMem], size, [rdsmem_bar]; // 2.  PTX ISA 80, SM_90
    // .dst       = { .shared::cluster }
    // .src       = { .shared::cta }
-   template <typename=void>
+   template <typename = void>
    __device__ static inline void cp_async_bulk(
      cuda::ptx::space_cluster_t,
      cuda::ptx::space_shared_t,
@@ -37,7 +37,7 @@ cp.async.bulk.global.shared::cta.bulk_group
    // cp.async.bulk.dst.src.bulk_group [dstMem], [srcMem], size; // 3.  PTX ISA 80, SM_90
    // .dst       = { .global }
    // .src       = { .shared::cta }
-   template <typename=void>
+   template <typename = void>
    __device__ static inline void cp_async_bulk(
      cuda::ptx::space_global_t,
      cuda::ptx::space_shared_t,

diff --git a/docs/libcudacxx/ptx/instructions/generated/cp_async_bulk_commit_group.rst b/docs/libcudacxx/ptx/instructions/generated/cp_async_bulk_commit_group.rst
@@ -3,5 +3,5 @@ cp.async.bulk.commit_group
 .. code:: cuda
 
    // cp.async.bulk.commit_group; // PTX ISA 80, SM_90
-   template <typename=void>
+   template <typename = void>
    __device__ static inline void cp_async_bulk_commit_group();
diff --git a/docs/libcudacxx/ptx/instructions/generated/cp_async_bulk_multicast.rst b/docs/libcudacxx/ptx/instructions/generated/cp_async_bulk_multicast.rst
@@ -5,7 +5,7 @@ cp.async.bulk.shared::cluster.global.mbarrier::complete_tx::bytes.multicast::clu
    // cp.async.bulk{.dst}{.src}.mbarrier::complete_tx::bytes.multicast::cluster [dstMem], [srcMem], size, [smem_bar], ctaMask; // 1.  PTX ISA 80, SM_90a
    // .dst       = { .shared::cluster }
    // .src       = { .global }
-   template <typename=void>
+   template <typename = void>
    __device__ static inline void cp_async_bulk(
      cuda::ptx::space_cluster_t,
      cuda::ptx::space_global_t,

diff --git a/docs/libcudacxx/ptx/instructions/generated/cp_async_bulk_tensor.rst b/docs/libcudacxx/ptx/instructions/generated/cp_async_bulk_tensor.rst
@@ -5,7 +5,7 @@ cp.async.bulk.tensor.1d.shared::cluster.global.tile.mbarrier::complete_tx::bytes
    // cp.async.bulk.tensor.1d.dst.src.tile.mbarrier::complete_tx::bytes [dstMem], [tensorMap, tensorCoords], [smem_bar];// 1a. PTX ISA 80, SM_90
    // .dst       = { .shared::cluster }
    // .src       = { .global }
-   template <typename=void>
+   template <typename = void>
    __device__ static inline void cp_async_bulk_tensor(
      cuda::ptx::space_cluster_t,
      cuda::ptx::space_global_t,
@@ -21,7 +21,7 @@ cp.async.bulk.tensor.1d.global.shared::cta.tile.bulk_group
    // cp.async.bulk.tensor.1d.dst.src.tile.bulk_group [tensorMap, tensorCoords], [srcMem]; // 3a. PTX ISA 80, SM_90
    // .dst       = { .global }
    // .src       = { .shared::cta }
-   template <typename=void>
+   template <typename = void>
    __device__ static inline void cp_async_bulk_tensor(
      cuda::ptx::space_global_t,
      cuda::ptx::space_shared_t,
@@ -36,7 +36,7 @@ cp.async.bulk.tensor.2d.shared::cluster.global.tile.mbarrier::complete_tx::bytes
    // cp.async.bulk.tensor.2d.dst.src.tile.mbarrier::complete_tx::bytes [dstMem], [tensorMap, tensorCoords], [smem_bar];// 1b. PTX ISA 80, SM_90
    // .dst       = { .shared::cluster }
    // .src       = { .global }
-   template <typename=void>
+   template <typename = void>
    __device__ static inline void cp_async_bulk_tensor(
      cuda::ptx::space_cluster_t,
      cuda::ptx::space_global_t,
@@ -52,7 +52,7 @@ cp.async.bulk.tensor.2d.global.shared::cta.tile.bulk_group
    // cp.async.bulk.tensor.2d.dst.src.tile.bulk_group [tensorMap, tensorCoords], [srcMem]; // 3b. PTX ISA 80, SM_90
    // .dst       = { .global }
    // .src       = { .shared::cta }
-   template <typename=void>
+   template <typename = void>
    __device__ static inline void cp_async_bulk_tensor(
      cuda::ptx::space_global_t,
      cuda::ptx::space_shared_t,
@@ -67,7 +67,7 @@ cp.async.bulk.tensor.3d.shared::cluster.global.tile.mbarrier::complete_tx::bytes
    // cp.async.bulk.tensor.3d.dst.src.tile.mbarrier::complete_tx::bytes [dstMem], [tensorMap, tensorCoords], [smem_bar];// 1c. PTX ISA 80, SM_90
    // .dst       = { .shared::cluster }
    // .src       = { .global }
-   template <typename=void>
+   template <typename = void>
    __device__ static inline void cp_async_bulk_tensor(
      cuda::ptx::space_cluster_t,
      cuda::ptx::space_global_t,
@@ -83,7 +83,7 @@ cp.async.bulk.tensor.3d.global.shared::cta.tile.bulk_group
    // cp.async.bulk.tensor.3d.dst.src.tile.bulk_group [tensorMap, tensorCoords], [srcMem]; // 3c. PTX ISA 80, SM_90
    // .dst       = { .global }
    // .src       = { .shared::cta }
-   template <typename=void>
+   template <typename = void>
    __device__ static inline void cp_async_bulk_tensor(
      cuda::ptx::space_global_t,
      cuda::ptx::space_shared_t,
@@ -98,7 +98,7 @@ cp.async.bulk.tensor.4d.shared::cluster.global.tile.mbarrier::complete_tx::bytes
    // cp.async.bulk.tensor.4d.dst.src.tile.mbarrier::complete_tx::bytes [dstMem], [tensorMap, tensorCoords], [smem_bar];// 1d. PTX ISA 80, SM_90
    // .dst       = { .shared::cluster }
    // .src       = { .global }
-   template <typename=void>
+   template <typename = void>
    __device__ static inline void cp_async_bulk_tensor(
      cuda::ptx::space_cluster_t,
      cuda::ptx::space_global_t,
@@ -114,7 +114,7 @@ cp.async.bulk.tensor.4d.global.shared::cta.tile.bulk_group
    // cp.async.bulk.tensor.4d.dst.src.tile.bulk_group [tensorMap, tensorCoords], [srcMem]; // 3d. PTX ISA 80, SM_90
    // .dst       = { .global }
    // .src       = { .shared::cta }
-   template <typename=void>
+   template <typename = void>
    __device__ static inline void cp_async_bulk_tensor(
      cuda::ptx::space_global_t,
      cuda::ptx::space_shared_t,
@@ -129,7 +129,7 @@ cp.async.bulk.tensor.5d.shared::cluster.global.tile.mbarrier::complete_tx::bytes
    // cp.async.bulk.tensor.5d.dst.src.tile.mbarrier::complete_tx::bytes [dstMem], [tensorMap, tensorCoords], [smem_bar];// 1e. PTX ISA 80, SM_90
    // .dst       = { .shared::cluster }
    // .src       = { .global }
-   template <typename=void>
+   template <typename = void>
    __device__ static inline void cp_async_bulk_tensor(
      cuda::ptx::space_cluster_t,
      cuda::ptx::space_global_t,
@@ -145,7 +145,7 @@ cp.async.bulk.tensor.5d.global.shared::cta.tile.bulk_group
    // cp.async.bulk.tensor.5d.dst.src.tile.bulk_group [tensorMap, tensorCoords], [srcMem]; // 3e. PTX ISA 80, SM_90
    // .dst       = { .global }
    // .src       = { .shared::cta }
-   template <typename=void>
+   template <typename = void>
    __device__ static inline void cp_async_bulk_tensor(
      cuda::ptx::space_global_t,
      cuda::ptx::space_shared_t,

diff --git a/docs/libcudacxx/ptx/instructions/generated/cp_async_bulk_tensor_multicast.rst b/docs/libcudacxx/ptx/instructions/generated/cp_async_bulk_tensor_multicast.rst
@@ -5,7 +5,7 @@ cp.async.bulk.tensor.1d.shared::cluster.global.tile.mbarrier::complete_tx::bytes
    // cp.async.bulk.tensor.1d.dst.src.tile.mbarrier::complete_tx::bytes.multicast::cluster [dstMem], [tensorMap, tensorCoords], [smem_bar], ctaMask; // 2a. PTX ISA 80, SM_90a
    // .dst       = { .shared::cluster }
    // .src       = { .global }
-   template <typename=void>
+   template <typename = void>
    __device__ static inline void cp_async_bulk_tensor(
      cuda::ptx::space_cluster_t,
      cuda::ptx::space_global_t,
@@ -22,7 +22,7 @@ cp.async.bulk.tensor.2d.shared::cluster.global.tile.mbarrier::complete_tx::bytes
    // cp.async.bulk.tensor.2d.dst.src.tile.mbarrier::complete_tx::bytes.multicast::cluster [dstMem], [tensorMap, tensorCoords], [smem_bar], ctaMask; // 2b. PTX ISA 80, SM_90a
    // .dst       = { .shared::cluster }
    // .src       = { .global }
-   template <typename=void>
+   template <typename = void>
    __device__ static inline void cp_async_bulk_tensor(
      cuda::ptx::space_cluster_t,
      cuda::ptx::space_global_t,
@@ -39,7 +39,7 @@ cp.async.bulk.tensor.3d.shared::cluster.global.tile.mbarrier::complete_tx::bytes
    // cp.async.bulk.tensor.3d.dst.src.tile.mbarrier::complete_tx::bytes.multicast::cluster [dstMem], [tensorMap, tensorCoords], [smem_bar], ctaMask; // 2c. PTX ISA 80, SM_90a
    // .dst       = { .shared::cluster }
    // .src       = { .global }
-   template <typename=void>
+   template <typename = void>
    __device__ static inline void cp_async_bulk_tensor(
      cuda::ptx::space_cluster_t,
      cuda::ptx::space_global_t,
@@ -56,7 +56,7 @@ cp.async.bulk.tensor.4d.shared::cluster.global.tile.mbarrier::complete_tx::bytes
    // cp.async.bulk.tensor.4d.dst.src.tile.mbarrier::complete_tx::bytes.multicast::cluster [dstMem], [tensorMap, tensorCoords], [smem_bar], ctaMask; // 2d. PTX ISA 80, SM_90a
    // .dst       = { .shared::cluster }
    // .src       = { .global }
-   template <typename=void>
+   template <typename = void>
    __device__ static inline void cp_async_bulk_tensor(
      cuda::ptx::space_cluster_t,
      cuda::ptx::space_global_t,
@@ -73,7 +73,7 @@ cp.async.bulk.tensor.5d.shared::cluster.global.tile.mbarrier::complete_tx::bytes
    // cp.async.bulk.tensor.5d.dst.src.tile.mbarrier::complete_tx::bytes.multicast::cluster [dstMem], [tensorMap, tensorCoords], [smem_bar], ctaMask; // 2e. PTX ISA 80, SM_90a
    // .dst       = { .shared::cluster }
    // .src       = { .global }
-   template <typename=void>
+   template <typename = void>
    __device__ static inline void cp_async_bulk_tensor(
      cuda::ptx::space_cluster_t,
      cuda::ptx::space_global_t,