Regenerate cuda::ptx headers/docs and run format (NVIDIA#2937)
Overwrites all generated PTX header and documentation files and runs `pre-commit run --all-files`. Also excludes the generated PTX headers from the header check.
bernhardmgruber authored and davebayer committed Dec 2, 2024
1 parent 2c50c2b commit 10661de
Showing 78 changed files with 631 additions and 332 deletions.
13 changes: 8 additions & 5 deletions docs/libcudacxx/ptx/instructions/generated/barrier_cluster.rst
@@ -1,10 +1,13 @@
..
This file was automatically generated. Do not edit.
barrier.cluster.arrive
^^^^^^^^^^^^^^^^^^^^^^
.. code:: cuda
// barrier.cluster.arrive; // PTX ISA 78, SM_90
// Marked volatile and as clobbering memory
-template <typename=void>
+template <typename = void>
__device__ static inline void barrier_cluster_arrive();
barrier.cluster.wait
@@ -13,7 +16,7 @@ barrier.cluster.wait
// barrier.cluster.wait; // PTX ISA 78, SM_90
// Marked volatile and as clobbering memory
-template <typename=void>
+template <typename = void>
__device__ static inline void barrier_cluster_wait();
barrier.cluster.arrive.release
@@ -23,7 +26,7 @@ barrier.cluster.arrive.release
// barrier.cluster.arrive.sem; // PTX ISA 80, SM_90
// .sem = { .release }
// Marked volatile and as clobbering memory
-template <typename=void>
+template <typename = void>
__device__ static inline void barrier_cluster_arrive(
cuda::ptx::sem_release_t);
@@ -34,7 +37,7 @@ barrier.cluster.arrive.relaxed
// barrier.cluster.arrive.sem; // PTX ISA 80, SM_90
// .sem = { .relaxed }
// Marked volatile
-template <typename=void>
+template <typename = void>
__device__ static inline void barrier_cluster_arrive(
cuda::ptx::sem_relaxed_t);
@@ -45,6 +48,6 @@ barrier.cluster.wait.acquire
// barrier.cluster.wait.sem; // PTX ISA 80, SM_90
// .sem = { .acquire }
// Marked volatile and as clobbering memory
-template <typename=void>
+template <typename = void>
__device__ static inline void barrier_cluster_wait(
cuda::ptx::sem_acquire_t);
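For orientation, here is a minimal usage sketch (not part of this commit) of the barrier_cluster wrappers documented above, assuming a Hopper (SM_90) build and a kernel launched as a thread-block cluster; the kernel name and cluster shape are illustrative only.

.. code:: cuda

#include <cuda/ptx>

// Hypothetical kernel: two thread blocks per cluster exchange data through
// shared memory and hand off with the cluster barrier wrappers shown above.
__global__ void __cluster_dims__(2, 1, 1) cluster_handoff_kernel()
{
  __shared__ int payload;
  if (threadIdx.x == 0)
  {
    payload = 42; // value the other block in the cluster may read later
  }
  // Release our shared-memory writes to the cluster, then wait until every
  // block in the cluster has arrived.
  cuda::ptx::barrier_cluster_arrive(cuda::ptx::sem_release);
  cuda::ptx::barrier_cluster_wait(cuda::ptx::sem_acquire);
  // From here on, writes made before the arrive are visible cluster-wide.
}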
9 changes: 6 additions & 3 deletions docs/libcudacxx/ptx/instructions/generated/cp_async_bulk.rst
@@ -1,11 +1,14 @@
..
This file was automatically generated. Do not edit.
cp.async.bulk.shared::cluster.global.mbarrier::complete_tx::bytes
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
.. code:: cuda
// cp.async.bulk.dst.src.mbarrier::complete_tx::bytes [dstMem], [srcMem], size, [smem_bar]; // 1a. unicast PTX ISA 80, SM_90
// .dst = { .shared::cluster }
// .src = { .global }
-template <typename=void>
+template <typename = void>
__device__ static inline void cp_async_bulk(
cuda::ptx::space_cluster_t,
cuda::ptx::space_global_t,
@@ -21,7 +24,7 @@ cp.async.bulk.shared::cluster.shared::cta.mbarrier::complete_tx::bytes
// cp.async.bulk.dst.src.mbarrier::complete_tx::bytes [dstMem], [srcMem], size, [rdsmem_bar]; // 2. PTX ISA 80, SM_90
// .dst = { .shared::cluster }
// .src = { .shared::cta }
-template <typename=void>
+template <typename = void>
__device__ static inline void cp_async_bulk(
cuda::ptx::space_cluster_t,
cuda::ptx::space_shared_t,
@@ -37,7 +40,7 @@ cp.async.bulk.global.shared::cta.bulk_group
// cp.async.bulk.dst.src.bulk_group [dstMem], [srcMem], size; // 3. PTX ISA 80, SM_90
// .dst = { .global }
// .src = { .shared::cta }
-template <typename=void>
+template <typename = void>
__device__ static inline void cp_async_bulk(
cuda::ptx::space_global_t,
cuda::ptx::space_shared_t,
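As a usage note (not part of the diff): the unicast form above is typically issued by a single thread, with completion reported on an mbarrier. A sketch under the assumption that the truncated parameter list is (dstMem, srcMem, size, smem_bar), as the PTX comment suggests:

.. code:: cuda

#include <cuda/ptx>
#include <cstdint>

// Sketch: issue a bulk copy from global to shared::cluster memory. The
// argument list past the two space tags is assumed from the PTX comment
// above (dstMem, srcMem, size, smem_bar); mbarrier setup is the caller's job.
__device__ void issue_bulk_copy(void* smem_dst, const void* gmem_src,
                                std::uint32_t size, std::uint64_t* smem_bar)
{
  cuda::ptx::cp_async_bulk(cuda::ptx::space_cluster, cuda::ptx::space_global,
                           smem_dst, gmem_src, size, smem_bar);
  // The caller still has to post the expected transaction count on the
  // mbarrier (e.g. via mbarrier_arrive_expect_tx) and wait on it before
  // reading smem_dst.
}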
@@ -1,7 +1,10 @@
..
This file was automatically generated. Do not edit.
cp.async.bulk.commit_group
^^^^^^^^^^^^^^^^^^^^^^^^^^
.. code:: cuda
// cp.async.bulk.commit_group; // PTX ISA 80, SM_90
-template <typename=void>
+template <typename = void>
__device__ static inline void cp_async_bulk_commit_group();
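A short sketch of how commit_group pairs with the wait_group wrapper from the (truncated) cp.async.bulk.wait_group section further down; the n32_t<0> argument to the wait is an assumption of this note, not shown in the diff:

.. code:: cuda

#include <cuda/ptx>

// Sketch: close the current bulk-async group of previously issued
// cp.async.bulk operations and drain it before reusing the source buffers.
__device__ void flush_bulk_group()
{
  cuda::ptx::cp_async_bulk_commit_group();
  // Wait until at most 0 bulk-async groups are still in flight (argument
  // form assumed; see the wait_group section at the end of this diff).
  cuda::ptx::cp_async_bulk_wait_group(cuda::ptx::n32_t<0>{});
}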
@@ -1,11 +1,14 @@
..
This file was automatically generated. Do not edit.
cp.async.bulk.shared::cluster.global.mbarrier::complete_tx::bytes.multicast::cluster
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
.. code:: cuda
// cp.async.bulk{.dst}{.src}.mbarrier::complete_tx::bytes.multicast::cluster [dstMem], [srcMem], size, [smem_bar], ctaMask; // 1. PTX ISA 80, SM_90a
// .dst = { .shared::cluster }
// .src = { .global }
-template <typename=void>
+template <typename = void>
__device__ static inline void cp_async_bulk(
cuda::ptx::space_cluster_t,
cuda::ptx::space_global_t,
23 changes: 13 additions & 10 deletions docs/libcudacxx/ptx/instructions/generated/cp_async_bulk_tensor.rst
@@ -1,11 +1,14 @@
..
This file was automatically generated. Do not edit.
cp.async.bulk.tensor.1d.shared::cluster.global.tile.mbarrier::complete_tx::bytes
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
.. code:: cuda
// cp.async.bulk.tensor.1d.dst.src.tile.mbarrier::complete_tx::bytes [dstMem], [tensorMap, tensorCoords], [smem_bar];// 1a. PTX ISA 80, SM_90
// .dst = { .shared::cluster }
// .src = { .global }
-template <typename=void>
+template <typename = void>
__device__ static inline void cp_async_bulk_tensor(
cuda::ptx::space_cluster_t,
cuda::ptx::space_global_t,
@@ -21,7 +24,7 @@ cp.async.bulk.tensor.1d.global.shared::cta.tile.bulk_group
// cp.async.bulk.tensor.1d.dst.src.tile.bulk_group [tensorMap, tensorCoords], [srcMem]; // 3a. PTX ISA 80, SM_90
// .dst = { .global }
// .src = { .shared::cta }
-template <typename=void>
+template <typename = void>
__device__ static inline void cp_async_bulk_tensor(
cuda::ptx::space_global_t,
cuda::ptx::space_shared_t,
@@ -36,7 +39,7 @@ cp.async.bulk.tensor.2d.shared::cluster.global.tile.mbarrier::complete_tx::bytes
// cp.async.bulk.tensor.2d.dst.src.tile.mbarrier::complete_tx::bytes [dstMem], [tensorMap, tensorCoords], [smem_bar];// 1b. PTX ISA 80, SM_90
// .dst = { .shared::cluster }
// .src = { .global }
-template <typename=void>
+template <typename = void>
__device__ static inline void cp_async_bulk_tensor(
cuda::ptx::space_cluster_t,
cuda::ptx::space_global_t,
@@ -52,7 +55,7 @@ cp.async.bulk.tensor.2d.global.shared::cta.tile.bulk_group
// cp.async.bulk.tensor.2d.dst.src.tile.bulk_group [tensorMap, tensorCoords], [srcMem]; // 3b. PTX ISA 80, SM_90
// .dst = { .global }
// .src = { .shared::cta }
-template <typename=void>
+template <typename = void>
__device__ static inline void cp_async_bulk_tensor(
cuda::ptx::space_global_t,
cuda::ptx::space_shared_t,
@@ -67,7 +70,7 @@ cp.async.bulk.tensor.3d.shared::cluster.global.tile.mbarrier::complete_tx::bytes
// cp.async.bulk.tensor.3d.dst.src.tile.mbarrier::complete_tx::bytes [dstMem], [tensorMap, tensorCoords], [smem_bar];// 1c. PTX ISA 80, SM_90
// .dst = { .shared::cluster }
// .src = { .global }
-template <typename=void>
+template <typename = void>
__device__ static inline void cp_async_bulk_tensor(
cuda::ptx::space_cluster_t,
cuda::ptx::space_global_t,
@@ -83,7 +86,7 @@ cp.async.bulk.tensor.3d.global.shared::cta.tile.bulk_group
// cp.async.bulk.tensor.3d.dst.src.tile.bulk_group [tensorMap, tensorCoords], [srcMem]; // 3c. PTX ISA 80, SM_90
// .dst = { .global }
// .src = { .shared::cta }
-template <typename=void>
+template <typename = void>
__device__ static inline void cp_async_bulk_tensor(
cuda::ptx::space_global_t,
cuda::ptx::space_shared_t,
@@ -98,7 +101,7 @@ cp.async.bulk.tensor.4d.shared::cluster.global.tile.mbarrier::complete_tx::bytes
// cp.async.bulk.tensor.4d.dst.src.tile.mbarrier::complete_tx::bytes [dstMem], [tensorMap, tensorCoords], [smem_bar];// 1d. PTX ISA 80, SM_90
// .dst = { .shared::cluster }
// .src = { .global }
-template <typename=void>
+template <typename = void>
__device__ static inline void cp_async_bulk_tensor(
cuda::ptx::space_cluster_t,
cuda::ptx::space_global_t,
@@ -114,7 +117,7 @@ cp.async.bulk.tensor.4d.global.shared::cta.tile.bulk_group
// cp.async.bulk.tensor.4d.dst.src.tile.bulk_group [tensorMap, tensorCoords], [srcMem]; // 3d. PTX ISA 80, SM_90
// .dst = { .global }
// .src = { .shared::cta }
-template <typename=void>
+template <typename = void>
__device__ static inline void cp_async_bulk_tensor(
cuda::ptx::space_global_t,
cuda::ptx::space_shared_t,
@@ -129,7 +132,7 @@ cp.async.bulk.tensor.5d.shared::cluster.global.tile.mbarrier::complete_tx::bytes
// cp.async.bulk.tensor.5d.dst.src.tile.mbarrier::complete_tx::bytes [dstMem], [tensorMap, tensorCoords], [smem_bar];// 1e. PTX ISA 80, SM_90
// .dst = { .shared::cluster }
// .src = { .global }
-template <typename=void>
+template <typename = void>
__device__ static inline void cp_async_bulk_tensor(
cuda::ptx::space_cluster_t,
cuda::ptx::space_global_t,
@@ -145,7 +148,7 @@ cp.async.bulk.tensor.5d.global.shared::cta.tile.bulk_group
// cp.async.bulk.tensor.5d.dst.src.tile.bulk_group [tensorMap, tensorCoords], [srcMem]; // 3e. PTX ISA 80, SM_90
// .dst = { .global }
// .src = { .shared::cta }
-template <typename=void>
+template <typename = void>
__device__ static inline void cp_async_bulk_tensor(
cuda::ptx::space_global_t,
cuda::ptx::space_shared_t,
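Usage sketch for the tile form above (again not part of the commit): the tensor map comes from the host-side cuTensorMapEncodeTiled driver API, and the parameter order (dstMem, tensorMap, tensorCoords, smem_bar) is assumed from the PTX comments, since the signatures are truncated in this view.

.. code:: cuda

#include <cuda/ptx>
#include <cstdint>

// Sketch: copy one 2-D tile described by a tensor map into shared::cluster
// memory; completion is signalled on the mbarrier at smem_bar (1b. form).
__device__ void issue_2d_tile_copy(void* smem_dst, const void* tensor_map,
                                   std::int32_t tile_x, std::int32_t tile_y,
                                   std::uint64_t* smem_bar)
{
  const std::int32_t coords[2] = {tile_x, tile_y};
  cuda::ptx::cp_async_bulk_tensor(cuda::ptx::space_cluster, cuda::ptx::space_global,
                                  smem_dst, tensor_map, coords, smem_bar);
}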
@@ -1,11 +1,14 @@
..
This file was automatically generated. Do not edit.
cp.async.bulk.tensor.1d.shared::cluster.global.tile.mbarrier::complete_tx::bytes.multicast::cluster
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
.. code:: cuda
// cp.async.bulk.tensor.1d.dst.src.tile.mbarrier::complete_tx::bytes.multicast::cluster [dstMem], [tensorMap, tensorCoords], [smem_bar], ctaMask; // 2a. PTX ISA 80, SM_90a
// .dst = { .shared::cluster }
// .src = { .global }
-template <typename=void>
+template <typename = void>
__device__ static inline void cp_async_bulk_tensor(
cuda::ptx::space_cluster_t,
cuda::ptx::space_global_t,
@@ -22,7 +25,7 @@ cp.async.bulk.tensor.2d.shared::cluster.global.tile.mbarrier::complete_tx::bytes
// cp.async.bulk.tensor.2d.dst.src.tile.mbarrier::complete_tx::bytes.multicast::cluster [dstMem], [tensorMap, tensorCoords], [smem_bar], ctaMask; // 2b. PTX ISA 80, SM_90a
// .dst = { .shared::cluster }
// .src = { .global }
-template <typename=void>
+template <typename = void>
__device__ static inline void cp_async_bulk_tensor(
cuda::ptx::space_cluster_t,
cuda::ptx::space_global_t,
@@ -39,7 +42,7 @@ cp.async.bulk.tensor.3d.shared::cluster.global.tile.mbarrier::complete_tx::bytes
// cp.async.bulk.tensor.3d.dst.src.tile.mbarrier::complete_tx::bytes.multicast::cluster [dstMem], [tensorMap, tensorCoords], [smem_bar], ctaMask; // 2c. PTX ISA 80, SM_90a
// .dst = { .shared::cluster }
// .src = { .global }
-template <typename=void>
+template <typename = void>
__device__ static inline void cp_async_bulk_tensor(
cuda::ptx::space_cluster_t,
cuda::ptx::space_global_t,
@@ -56,7 +59,7 @@ cp.async.bulk.tensor.4d.shared::cluster.global.tile.mbarrier::complete_tx::bytes
// cp.async.bulk.tensor.4d.dst.src.tile.mbarrier::complete_tx::bytes.multicast::cluster [dstMem], [tensorMap, tensorCoords], [smem_bar], ctaMask; // 2d. PTX ISA 80, SM_90a
// .dst = { .shared::cluster }
// .src = { .global }
-template <typename=void>
+template <typename = void>
__device__ static inline void cp_async_bulk_tensor(
cuda::ptx::space_cluster_t,
cuda::ptx::space_global_t,
@@ -73,7 +76,7 @@ cp.async.bulk.tensor.5d.shared::cluster.global.tile.mbarrier::complete_tx::bytes
// cp.async.bulk.tensor.5d.dst.src.tile.mbarrier::complete_tx::bytes.multicast::cluster [dstMem], [tensorMap, tensorCoords], [smem_bar], ctaMask; // 2e. PTX ISA 80, SM_90a
// .dst = { .shared::cluster }
// .src = { .global }
-template <typename=void>
+template <typename = void>
__device__ static inline void cp_async_bulk_tensor(
cuda::ptx::space_cluster_t,
cuda::ptx::space_global_t,
@@ -1,3 +1,6 @@
..
This file was automatically generated. Do not edit.
cp.async.bulk.wait_group
^^^^^^^^^^^^^^^^^^^^^^^^
.. code:: cuda