Skip to content

Commit aededb9

Browse files
bernhardmgruber and davebayer
authored and committed
Regenerate cuda::ptx headers/docs and run format (NVIDIA#2937)
Overwrites all generated PTX header and documentation files and runs `pre-commit run --all-files`. Also exclude generated PTX headers from header check.
1 parent 3c6fcd2 commit aededb9

File tree

78 files changed

+631
-332
lines changed

Some content is hidden

Large Commits have some content hidden by default. Use the searchbox below for content that may be hidden.

78 files changed

+631
-332
lines changed

docs/libcudacxx/ptx/instructions/generated/barrier_cluster.rst

+8-5
Original file line numberDiff line numberDiff line change
@@ -1,10 +1,13 @@
1+
..
2+
This file was automatically generated. Do not edit.
3+
14
barrier.cluster.arrive
25
^^^^^^^^^^^^^^^^^^^^^^
36
.. code:: cuda
47
58
// barrier.cluster.arrive; // PTX ISA 78, SM_90
69
// Marked volatile and as clobbering memory
7-
template <typename=void>
10+
template <typename = void>
811
__device__ static inline void barrier_cluster_arrive();
912
1013
barrier.cluster.wait
@@ -13,7 +16,7 @@ barrier.cluster.wait
1316
1417
// barrier.cluster.wait; // PTX ISA 78, SM_90
1518
// Marked volatile and as clobbering memory
16-
template <typename=void>
19+
template <typename = void>
1720
__device__ static inline void barrier_cluster_wait();
1821
1922
barrier.cluster.arrive.release
@@ -23,7 +26,7 @@ barrier.cluster.arrive.release
2326
// barrier.cluster.arrive.sem; // PTX ISA 80, SM_90
2427
// .sem = { .release }
2528
// Marked volatile and as clobbering memory
26-
template <typename=void>
29+
template <typename = void>
2730
__device__ static inline void barrier_cluster_arrive(
2831
cuda::ptx::sem_release_t);
2932
@@ -34,7 +37,7 @@ barrier.cluster.arrive.relaxed
3437
// barrier.cluster.arrive.sem; // PTX ISA 80, SM_90
3538
// .sem = { .relaxed }
3639
// Marked volatile
37-
template <typename=void>
40+
template <typename = void>
3841
__device__ static inline void barrier_cluster_arrive(
3942
cuda::ptx::sem_relaxed_t);
4043
@@ -45,6 +48,6 @@ barrier.cluster.wait.acquire
4548
// barrier.cluster.wait.sem; // PTX ISA 80, SM_90
4649
// .sem = { .acquire }
4750
// Marked volatile and as clobbering memory
48-
template <typename=void>
51+
template <typename = void>
4952
__device__ static inline void barrier_cluster_wait(
5053
cuda::ptx::sem_acquire_t);

docs/libcudacxx/ptx/instructions/generated/cp_async_bulk.rst

+6-3
Original file line numberDiff line numberDiff line change
@@ -1,11 +1,14 @@
1+
..
2+
This file was automatically generated. Do not edit.
3+
14
cp.async.bulk.shared::cluster.global.mbarrier::complete_tx::bytes
25
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
36
.. code:: cuda
47
58
// cp.async.bulk.dst.src.mbarrier::complete_tx::bytes [dstMem], [srcMem], size, [smem_bar]; // 1a. unicast PTX ISA 80, SM_90
69
// .dst = { .shared::cluster }
710
// .src = { .global }
8-
template <typename=void>
11+
template <typename = void>
912
__device__ static inline void cp_async_bulk(
1013
cuda::ptx::space_cluster_t,
1114
cuda::ptx::space_global_t,
@@ -21,7 +24,7 @@ cp.async.bulk.shared::cluster.shared::cta.mbarrier::complete_tx::bytes
2124
// cp.async.bulk.dst.src.mbarrier::complete_tx::bytes [dstMem], [srcMem], size, [rdsmem_bar]; // 2. PTX ISA 80, SM_90
2225
// .dst = { .shared::cluster }
2326
// .src = { .shared::cta }
24-
template <typename=void>
27+
template <typename = void>
2528
__device__ static inline void cp_async_bulk(
2629
cuda::ptx::space_cluster_t,
2730
cuda::ptx::space_shared_t,
@@ -37,7 +40,7 @@ cp.async.bulk.global.shared::cta.bulk_group
3740
// cp.async.bulk.dst.src.bulk_group [dstMem], [srcMem], size; // 3. PTX ISA 80, SM_90
3841
// .dst = { .global }
3942
// .src = { .shared::cta }
40-
template <typename=void>
43+
template <typename = void>
4144
__device__ static inline void cp_async_bulk(
4245
cuda::ptx::space_global_t,
4346
cuda::ptx::space_shared_t,
Original file line numberDiff line numberDiff line change
@@ -1,7 +1,10 @@
1+
..
2+
This file was automatically generated. Do not edit.
3+
14
cp.async.bulk.commit_group
25
^^^^^^^^^^^^^^^^^^^^^^^^^^
36
.. code:: cuda
47
58
// cp.async.bulk.commit_group; // PTX ISA 80, SM_90
6-
template <typename=void>
9+
template <typename = void>
710
__device__ static inline void cp_async_bulk_commit_group();

docs/libcudacxx/ptx/instructions/generated/cp_async_bulk_multicast.rst

+4-1
Original file line numberDiff line numberDiff line change
@@ -1,11 +1,14 @@
1+
..
2+
This file was automatically generated. Do not edit.
3+
14
cp.async.bulk.shared::cluster.global.mbarrier::complete_tx::bytes.multicast::cluster
25
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
36
.. code:: cuda
47
58
// cp.async.bulk{.dst}{.src}.mbarrier::complete_tx::bytes.multicast::cluster [dstMem], [srcMem], size, [smem_bar], ctaMask; // 1. PTX ISA 80, SM_90a
69
// .dst = { .shared::cluster }
710
// .src = { .global }
8-
template <typename=void>
11+
template <typename = void>
912
__device__ static inline void cp_async_bulk(
1013
cuda::ptx::space_cluster_t,
1114
cuda::ptx::space_global_t,

docs/libcudacxx/ptx/instructions/generated/cp_async_bulk_tensor.rst

+13-10
Original file line numberDiff line numberDiff line change
@@ -1,11 +1,14 @@
1+
..
2+
This file was automatically generated. Do not edit.
3+
14
cp.async.bulk.tensor.1d.shared::cluster.global.tile.mbarrier::complete_tx::bytes
25
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
36
.. code:: cuda
47
58
// cp.async.bulk.tensor.1d.dst.src.tile.mbarrier::complete_tx::bytes [dstMem], [tensorMap, tensorCoords], [smem_bar];// 1a. PTX ISA 80, SM_90
69
// .dst = { .shared::cluster }
710
// .src = { .global }
8-
template <typename=void>
11+
template <typename = void>
912
__device__ static inline void cp_async_bulk_tensor(
1013
cuda::ptx::space_cluster_t,
1114
cuda::ptx::space_global_t,
@@ -21,7 +24,7 @@ cp.async.bulk.tensor.1d.global.shared::cta.tile.bulk_group
2124
// cp.async.bulk.tensor.1d.dst.src.tile.bulk_group [tensorMap, tensorCoords], [srcMem]; // 3a. PTX ISA 80, SM_90
2225
// .dst = { .global }
2326
// .src = { .shared::cta }
24-
template <typename=void>
27+
template <typename = void>
2528
__device__ static inline void cp_async_bulk_tensor(
2629
cuda::ptx::space_global_t,
2730
cuda::ptx::space_shared_t,
@@ -36,7 +39,7 @@ cp.async.bulk.tensor.2d.shared::cluster.global.tile.mbarrier::complete_tx::bytes
3639
// cp.async.bulk.tensor.2d.dst.src.tile.mbarrier::complete_tx::bytes [dstMem], [tensorMap, tensorCoords], [smem_bar];// 1b. PTX ISA 80, SM_90
3740
// .dst = { .shared::cluster }
3841
// .src = { .global }
39-
template <typename=void>
42+
template <typename = void>
4043
__device__ static inline void cp_async_bulk_tensor(
4144
cuda::ptx::space_cluster_t,
4245
cuda::ptx::space_global_t,
@@ -52,7 +55,7 @@ cp.async.bulk.tensor.2d.global.shared::cta.tile.bulk_group
5255
// cp.async.bulk.tensor.2d.dst.src.tile.bulk_group [tensorMap, tensorCoords], [srcMem]; // 3b. PTX ISA 80, SM_90
5356
// .dst = { .global }
5457
// .src = { .shared::cta }
55-
template <typename=void>
58+
template <typename = void>
5659
__device__ static inline void cp_async_bulk_tensor(
5760
cuda::ptx::space_global_t,
5861
cuda::ptx::space_shared_t,
@@ -67,7 +70,7 @@ cp.async.bulk.tensor.3d.shared::cluster.global.tile.mbarrier::complete_tx::bytes
6770
// cp.async.bulk.tensor.3d.dst.src.tile.mbarrier::complete_tx::bytes [dstMem], [tensorMap, tensorCoords], [smem_bar];// 1c. PTX ISA 80, SM_90
6871
// .dst = { .shared::cluster }
6972
// .src = { .global }
70-
template <typename=void>
73+
template <typename = void>
7174
__device__ static inline void cp_async_bulk_tensor(
7275
cuda::ptx::space_cluster_t,
7376
cuda::ptx::space_global_t,
@@ -83,7 +86,7 @@ cp.async.bulk.tensor.3d.global.shared::cta.tile.bulk_group
8386
// cp.async.bulk.tensor.3d.dst.src.tile.bulk_group [tensorMap, tensorCoords], [srcMem]; // 3c. PTX ISA 80, SM_90
8487
// .dst = { .global }
8588
// .src = { .shared::cta }
86-
template <typename=void>
89+
template <typename = void>
8790
__device__ static inline void cp_async_bulk_tensor(
8891
cuda::ptx::space_global_t,
8992
cuda::ptx::space_shared_t,
@@ -98,7 +101,7 @@ cp.async.bulk.tensor.4d.shared::cluster.global.tile.mbarrier::complete_tx::bytes
98101
// cp.async.bulk.tensor.4d.dst.src.tile.mbarrier::complete_tx::bytes [dstMem], [tensorMap, tensorCoords], [smem_bar];// 1d. PTX ISA 80, SM_90
99102
// .dst = { .shared::cluster }
100103
// .src = { .global }
101-
template <typename=void>
104+
template <typename = void>
102105
__device__ static inline void cp_async_bulk_tensor(
103106
cuda::ptx::space_cluster_t,
104107
cuda::ptx::space_global_t,
@@ -114,7 +117,7 @@ cp.async.bulk.tensor.4d.global.shared::cta.tile.bulk_group
114117
// cp.async.bulk.tensor.4d.dst.src.tile.bulk_group [tensorMap, tensorCoords], [srcMem]; // 3d. PTX ISA 80, SM_90
115118
// .dst = { .global }
116119
// .src = { .shared::cta }
117-
template <typename=void>
120+
template <typename = void>
118121
__device__ static inline void cp_async_bulk_tensor(
119122
cuda::ptx::space_global_t,
120123
cuda::ptx::space_shared_t,
@@ -129,7 +132,7 @@ cp.async.bulk.tensor.5d.shared::cluster.global.tile.mbarrier::complete_tx::bytes
129132
// cp.async.bulk.tensor.5d.dst.src.tile.mbarrier::complete_tx::bytes [dstMem], [tensorMap, tensorCoords], [smem_bar];// 1e. PTX ISA 80, SM_90
130133
// .dst = { .shared::cluster }
131134
// .src = { .global }
132-
template <typename=void>
135+
template <typename = void>
133136
__device__ static inline void cp_async_bulk_tensor(
134137
cuda::ptx::space_cluster_t,
135138
cuda::ptx::space_global_t,
@@ -145,7 +148,7 @@ cp.async.bulk.tensor.5d.global.shared::cta.tile.bulk_group
145148
// cp.async.bulk.tensor.5d.dst.src.tile.bulk_group [tensorMap, tensorCoords], [srcMem]; // 3e. PTX ISA 80, SM_90
146149
// .dst = { .global }
147150
// .src = { .shared::cta }
148-
template <typename=void>
151+
template <typename = void>
149152
__device__ static inline void cp_async_bulk_tensor(
150153
cuda::ptx::space_global_t,
151154
cuda::ptx::space_shared_t,

docs/libcudacxx/ptx/instructions/generated/cp_async_bulk_tensor_multicast.rst

+8-5
Original file line numberDiff line numberDiff line change
@@ -1,11 +1,14 @@
1+
..
2+
This file was automatically generated. Do not edit.
3+
14
cp.async.bulk.tensor.1d.shared::cluster.global.tile.mbarrier::complete_tx::bytes.multicast::cluster
25
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
36
.. code:: cuda
47
58
// cp.async.bulk.tensor.1d.dst.src.tile.mbarrier::complete_tx::bytes.multicast::cluster [dstMem], [tensorMap, tensorCoords], [smem_bar], ctaMask; // 2a. PTX ISA 80, SM_90a
69
// .dst = { .shared::cluster }
710
// .src = { .global }
8-
template <typename=void>
11+
template <typename = void>
912
__device__ static inline void cp_async_bulk_tensor(
1013
cuda::ptx::space_cluster_t,
1114
cuda::ptx::space_global_t,
@@ -22,7 +25,7 @@ cp.async.bulk.tensor.2d.shared::cluster.global.tile.mbarrier::complete_tx::bytes
2225
// cp.async.bulk.tensor.2d.dst.src.tile.mbarrier::complete_tx::bytes.multicast::cluster [dstMem], [tensorMap, tensorCoords], [smem_bar], ctaMask; // 2b. PTX ISA 80, SM_90a
2326
// .dst = { .shared::cluster }
2427
// .src = { .global }
25-
template <typename=void>
28+
template <typename = void>
2629
__device__ static inline void cp_async_bulk_tensor(
2730
cuda::ptx::space_cluster_t,
2831
cuda::ptx::space_global_t,
@@ -39,7 +42,7 @@ cp.async.bulk.tensor.3d.shared::cluster.global.tile.mbarrier::complete_tx::bytes
3942
// cp.async.bulk.tensor.3d.dst.src.tile.mbarrier::complete_tx::bytes.multicast::cluster [dstMem], [tensorMap, tensorCoords], [smem_bar], ctaMask; // 2c. PTX ISA 80, SM_90a
4043
// .dst = { .shared::cluster }
4144
// .src = { .global }
42-
template <typename=void>
45+
template <typename = void>
4346
__device__ static inline void cp_async_bulk_tensor(
4447
cuda::ptx::space_cluster_t,
4548
cuda::ptx::space_global_t,
@@ -56,7 +59,7 @@ cp.async.bulk.tensor.4d.shared::cluster.global.tile.mbarrier::complete_tx::bytes
5659
// cp.async.bulk.tensor.4d.dst.src.tile.mbarrier::complete_tx::bytes.multicast::cluster [dstMem], [tensorMap, tensorCoords], [smem_bar], ctaMask; // 2d. PTX ISA 80, SM_90a
5760
// .dst = { .shared::cluster }
5861
// .src = { .global }
59-
template <typename=void>
62+
template <typename = void>
6063
__device__ static inline void cp_async_bulk_tensor(
6164
cuda::ptx::space_cluster_t,
6265
cuda::ptx::space_global_t,
@@ -73,7 +76,7 @@ cp.async.bulk.tensor.5d.shared::cluster.global.tile.mbarrier::complete_tx::bytes
7376
// cp.async.bulk.tensor.5d.dst.src.tile.mbarrier::complete_tx::bytes.multicast::cluster [dstMem], [tensorMap, tensorCoords], [smem_bar], ctaMask; // 2e. PTX ISA 80, SM_90a
7477
// .dst = { .shared::cluster }
7578
// .src = { .global }
76-
template <typename=void>
79+
template <typename = void>
7780
__device__ static inline void cp_async_bulk_tensor(
7881
cuda::ptx::space_cluster_t,
7982
cuda::ptx::space_global_t,

docs/libcudacxx/ptx/instructions/generated/cp_async_bulk_wait_group.rst

+3
Original file line numberDiff line numberDiff line change
@@ -1,3 +1,6 @@
1+
..
2+
This file was automatically generated. Do not edit.
3+
14
cp.async.bulk.wait_group
25
^^^^^^^^^^^^^^^^^^^^^^^^
36
.. code:: cuda

0 commit comments

Comments (0)