Regenerate PTX test
Overwrites all generated PTX tests and runs `pre-commit run --all-files`
bernhardmgruber committed Nov 25, 2024
1 parent 0b36a7d commit 6ffc627
Showing 28 changed files with 829 additions and 1,781 deletions.
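
As the hunks below show, every generated test follows the same pattern: it stores the address of a cuda::ptx overload through a static_cast to the exact function-pointer type, so the file compiles only if that overload exists with that signature; no PTX is executed at run time. The following is a minimal sketch of that pattern for orientation only. It is not part of the commit, the kernel name is made up, and it assumes CCCL's <cuda/ptx> and <nv/target> headers are available.

// Minimal sketch of the generated-test pattern (kernel name is hypothetical).
#include <cuda/ptx>
#include <nv/target>

__global__ void sketch_ptx_overload_smoke_test(void** fn_ptr)
{
#if __cccl_ptx_isa >= 780
  NV_IF_TARGET(NV_PROVIDES_SM_90, (
    // Compiles only if the nullary cuda::ptx::barrier_cluster_arrive overload exists.
    *fn_ptr++ = reinterpret_cast<void*>(static_cast<void (*)()>(cuda::ptx::barrier_cluster_arrive));
  ));
#endif // __cccl_ptx_isa >= 780
}

Writing each pointer into *fn_ptr++ keeps the overload referenced (odr-used) instead of being discarded as an unused expression.
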
libcudacxx/test/libcudacxx/cuda/ptx/generated/barrier_cluster.inc (46 changes: 21 additions & 25 deletions)
@@ -1,40 +1,36 @@
-__global__ void test_barrier_cluster(void** fn_ptr)
-{
+__global__ void test_barrier_cluster(void ** fn_ptr) {
 #if __cccl_ptx_isa >= 780
-  NV_IF_TARGET(NV_PROVIDES_SM_90,
-               (
-                 // barrier.cluster.arrive;
-                 * fn_ptr++ = reinterpret_cast<void*>(static_cast<void (*)()>(cuda::ptx::barrier_cluster_arrive));));
+  NV_IF_TARGET(NV_PROVIDES_SM_90, (
+    // barrier.cluster.arrive;
+    *fn_ptr++ = reinterpret_cast<void*>(static_cast<void (*)()>(cuda::ptx::barrier_cluster_arrive));
+  ));
 #endif // __cccl_ptx_isa >= 780
 
 #if __cccl_ptx_isa >= 780
-  NV_IF_TARGET(NV_PROVIDES_SM_90,
-               (
-                 // barrier.cluster.wait;
-                 * fn_ptr++ = reinterpret_cast<void*>(static_cast<void (*)()>(cuda::ptx::barrier_cluster_wait));));
+  NV_IF_TARGET(NV_PROVIDES_SM_90, (
+    // barrier.cluster.wait;
+    *fn_ptr++ = reinterpret_cast<void*>(static_cast<void (*)()>(cuda::ptx::barrier_cluster_wait));
+  ));
 #endif // __cccl_ptx_isa >= 780
 
 #if __cccl_ptx_isa >= 800
-  NV_IF_TARGET(NV_PROVIDES_SM_90,
-               (
-                 // barrier.cluster.arrive.release;
-                 * fn_ptr++ = reinterpret_cast<void*>(
-                   static_cast<void (*)(cuda::ptx::sem_release_t)>(cuda::ptx::barrier_cluster_arrive));));
+  NV_IF_TARGET(NV_PROVIDES_SM_90, (
+    // barrier.cluster.arrive.release;
+    *fn_ptr++ = reinterpret_cast<void*>(static_cast<void (*)(cuda::ptx::sem_release_t)>(cuda::ptx::barrier_cluster_arrive));
+  ));
 #endif // __cccl_ptx_isa >= 800
 
 #if __cccl_ptx_isa >= 800
-  NV_IF_TARGET(NV_PROVIDES_SM_90,
-               (
-                 // barrier.cluster.arrive.relaxed;
-                 * fn_ptr++ = reinterpret_cast<void*>(
-                   static_cast<void (*)(cuda::ptx::sem_relaxed_t)>(cuda::ptx::barrier_cluster_arrive));));
+  NV_IF_TARGET(NV_PROVIDES_SM_90, (
+    // barrier.cluster.arrive.relaxed;
+    *fn_ptr++ = reinterpret_cast<void*>(static_cast<void (*)(cuda::ptx::sem_relaxed_t)>(cuda::ptx::barrier_cluster_arrive));
+  ));
 #endif // __cccl_ptx_isa >= 800
 
 #if __cccl_ptx_isa >= 800
-  NV_IF_TARGET(NV_PROVIDES_SM_90,
-               (
-                 // barrier.cluster.wait.acquire;
-                 * fn_ptr++ = reinterpret_cast<void*>(
-                   static_cast<void (*)(cuda::ptx::sem_acquire_t)>(cuda::ptx::barrier_cluster_wait));));
+  NV_IF_TARGET(NV_PROVIDES_SM_90, (
+    // barrier.cluster.wait.acquire;
+    *fn_ptr++ = reinterpret_cast<void*>(static_cast<void (*)(cuda::ptx::sem_acquire_t)>(cuda::ptx::barrier_cluster_wait));
+  ));
 #endif // __cccl_ptx_isa >= 800
 }
libcudacxx/test/libcudacxx/cuda/ptx/generated/cp_async_bulk.inc (41 changes: 13 additions & 28 deletions)
@@ -1,37 +1,22 @@
-__global__ void test_cp_async_bulk(void** fn_ptr)
-{
+__global__ void test_cp_async_bulk(void ** fn_ptr) {
 #if __cccl_ptx_isa >= 800
-  NV_IF_TARGET(
-    NV_PROVIDES_SM_90,
-    (
-      // cp.async.bulk.shared::cluster.global.mbarrier::complete_tx::bytes [dstMem], [srcMem], size, [smem_bar]; //
-      // 1a. unicast
-      * fn_ptr++ = reinterpret_cast<void*>(
-        static_cast<void (*)(
-          cuda::ptx::space_cluster_t, cuda::ptx::space_global_t, void*, const void*, const uint32_t&, uint64_t*)>(
-          cuda::ptx::cp_async_bulk));));
+  NV_IF_TARGET(NV_PROVIDES_SM_90, (
+    // cp.async.bulk.shared::cluster.global.mbarrier::complete_tx::bytes [dstMem], [srcMem], size, [smem_bar]; // 1a. unicast
+    *fn_ptr++ = reinterpret_cast<void*>(static_cast<void (*)(cuda::ptx::space_cluster_t, cuda::ptx::space_global_t, void* , const void* , const uint32_t& , uint64_t* )>(cuda::ptx::cp_async_bulk));
+  ));
 #endif // __cccl_ptx_isa >= 800
 
 #if __cccl_ptx_isa >= 800
-  NV_IF_TARGET(
-    NV_PROVIDES_SM_90,
-    (
-      // cp.async.bulk.shared::cluster.shared::cta.mbarrier::complete_tx::bytes [dstMem], [srcMem], size,
-      // [rdsmem_bar]; // 2.
-      * fn_ptr++ = reinterpret_cast<void*>(
-        static_cast<void (*)(
-          cuda::ptx::space_cluster_t, cuda::ptx::space_shared_t, void*, const void*, const uint32_t&, uint64_t*)>(
-          cuda::ptx::cp_async_bulk));));
+  NV_IF_TARGET(NV_PROVIDES_SM_90, (
+    // cp.async.bulk.shared::cluster.shared::cta.mbarrier::complete_tx::bytes [dstMem], [srcMem], size, [rdsmem_bar]; // 2.
+    *fn_ptr++ = reinterpret_cast<void*>(static_cast<void (*)(cuda::ptx::space_cluster_t, cuda::ptx::space_shared_t, void* , const void* , const uint32_t& , uint64_t* )>(cuda::ptx::cp_async_bulk));
+  ));
 #endif // __cccl_ptx_isa >= 800
 
 #if __cccl_ptx_isa >= 800
-  NV_IF_TARGET(
-    NV_PROVIDES_SM_90,
-    (
-      // cp.async.bulk.global.shared::cta.bulk_group [dstMem], [srcMem], size; // 3.
-      * fn_ptr++ = reinterpret_cast<void*>(
-        static_cast<void (*)(
-          cuda::ptx::space_global_t, cuda::ptx::space_shared_t, void*, const void*, const uint32_t&)>(
-          cuda::ptx::cp_async_bulk));));
+  NV_IF_TARGET(NV_PROVIDES_SM_90, (
+    // cp.async.bulk.global.shared::cta.bulk_group [dstMem], [srcMem], size; // 3.
+    *fn_ptr++ = reinterpret_cast<void*>(static_cast<void (*)(cuda::ptx::space_global_t, cuda::ptx::space_shared_t, void* , const void* , const uint32_t& )>(cuda::ptx::cp_async_bulk));
+  ));
 #endif // __cccl_ptx_isa >= 800
 }
@@ -1,10 +1,8 @@
-__global__ void test_cp_async_bulk_commit_group(void** fn_ptr)
-{
+__global__ void test_cp_async_bulk_commit_group(void ** fn_ptr) {
 #if __cccl_ptx_isa >= 800
-  NV_IF_TARGET(
-    NV_PROVIDES_SM_90,
-    (
-      // cp.async.bulk.commit_group;
-      * fn_ptr++ = reinterpret_cast<void*>(static_cast<void (*)()>(cuda::ptx::cp_async_bulk_commit_group));));
+  NV_IF_TARGET(NV_PROVIDES_SM_90, (
+    // cp.async.bulk.commit_group;
+    *fn_ptr++ = reinterpret_cast<void*>(static_cast<void (*)()>(cuda::ptx::cp_async_bulk_commit_group));
+  ));
 #endif // __cccl_ptx_isa >= 800
 }
@@ -1,18 +1,8 @@
-__global__ void test_cp_async_bulk_multicast(void** fn_ptr)
-{
+__global__ void test_cp_async_bulk_multicast(void ** fn_ptr) {
 #if __cccl_ptx_isa >= 800
-  NV_IF_TARGET(
-    NV_HAS_FEATURE_SM_90a,
-    (
-      // cp.async.bulk.shared::cluster.global.mbarrier::complete_tx::bytes.multicast::cluster [dstMem], [srcMem],
-      // size, [smem_bar], ctaMask; // 1.
-      * fn_ptr++ = reinterpret_cast<void*>(
-        static_cast<void (*)(cuda::ptx::space_cluster_t,
-                             cuda::ptx::space_global_t,
-                             void*,
-                             const void*,
-                             const uint32_t&,
-                             uint64_t*,
-                             const uint16_t&)>(cuda::ptx::cp_async_bulk));));
+  NV_IF_TARGET(NV_HAS_FEATURE_SM_90a, (
+    // cp.async.bulk.shared::cluster.global.mbarrier::complete_tx::bytes.multicast::cluster [dstMem], [srcMem], size, [smem_bar], ctaMask; // 1.
+    *fn_ptr++ = reinterpret_cast<void*>(static_cast<void (*)(cuda::ptx::space_cluster_t, cuda::ptx::space_global_t, void* , const void* , const uint32_t& , uint64_t* , const uint16_t& )>(cuda::ptx::cp_async_bulk));
+  ));
 #endif // __cccl_ptx_isa >= 800
 }
libcudacxx/test/libcudacxx/cuda/ptx/generated/cp_async_bulk_tensor.inc (128 changes: 41 additions & 87 deletions)
@@ -1,117 +1,71 @@
-__global__ void test_cp_async_bulk_tensor(void** fn_ptr)
-{
+__global__ void test_cp_async_bulk_tensor(void ** fn_ptr) {
 #if __cccl_ptx_isa >= 800
-  NV_IF_TARGET(
-    NV_PROVIDES_SM_90,
-    (
-      // cp.async.bulk.tensor.1d.shared::cluster.global.tile.mbarrier::complete_tx::bytes [dstMem], [tensorMap,
-      // tensorCoords], [smem_bar];// 1a.
-      * fn_ptr++ = reinterpret_cast<void*>(
-        static_cast<void (*)(
-          cuda::ptx::space_cluster_t, cuda::ptx::space_global_t, void*, const void*, const int32_t(&)[1], uint64_t*)>(
-          cuda::ptx::cp_async_bulk_tensor));));
+  NV_IF_TARGET(NV_PROVIDES_SM_90, (
+    // cp.async.bulk.tensor.1d.shared::cluster.global.tile.mbarrier::complete_tx::bytes [dstMem], [tensorMap, tensorCoords], [smem_bar];// 1a.
+    *fn_ptr++ = reinterpret_cast<void*>(static_cast<void (*)(cuda::ptx::space_cluster_t, cuda::ptx::space_global_t, void* , const void* , const int32_t (&)[1], uint64_t* )>(cuda::ptx::cp_async_bulk_tensor));
+  ));
 #endif // __cccl_ptx_isa >= 800
 
 #if __cccl_ptx_isa >= 800
-  NV_IF_TARGET(
-    NV_PROVIDES_SM_90,
-    (
-      // cp.async.bulk.tensor.1d.global.shared::cta.tile.bulk_group [tensorMap, tensorCoords], [srcMem]; // 3a.
-      * fn_ptr++ = reinterpret_cast<void*>(
-        static_cast<void (*)(
-          cuda::ptx::space_global_t, cuda::ptx::space_shared_t, const void*, const int32_t(&)[1], const void*)>(
-          cuda::ptx::cp_async_bulk_tensor));));
+  NV_IF_TARGET(NV_PROVIDES_SM_90, (
+    // cp.async.bulk.tensor.1d.global.shared::cta.tile.bulk_group [tensorMap, tensorCoords], [srcMem]; // 3a.
+    *fn_ptr++ = reinterpret_cast<void*>(static_cast<void (*)(cuda::ptx::space_global_t, cuda::ptx::space_shared_t, const void* , const int32_t (&)[1], const void* )>(cuda::ptx::cp_async_bulk_tensor));
+  ));
 #endif // __cccl_ptx_isa >= 800
 
 #if __cccl_ptx_isa >= 800
-  NV_IF_TARGET(
-    NV_PROVIDES_SM_90,
-    (
-      // cp.async.bulk.tensor.2d.shared::cluster.global.tile.mbarrier::complete_tx::bytes [dstMem], [tensorMap,
-      // tensorCoords], [smem_bar];// 1b.
-      * fn_ptr++ = reinterpret_cast<void*>(
-        static_cast<void (*)(
-          cuda::ptx::space_cluster_t, cuda::ptx::space_global_t, void*, const void*, const int32_t(&)[2], uint64_t*)>(
-          cuda::ptx::cp_async_bulk_tensor));));
+  NV_IF_TARGET(NV_PROVIDES_SM_90, (
+    // cp.async.bulk.tensor.2d.shared::cluster.global.tile.mbarrier::complete_tx::bytes [dstMem], [tensorMap, tensorCoords], [smem_bar];// 1b.
+    *fn_ptr++ = reinterpret_cast<void*>(static_cast<void (*)(cuda::ptx::space_cluster_t, cuda::ptx::space_global_t, void* , const void* , const int32_t (&)[2], uint64_t* )>(cuda::ptx::cp_async_bulk_tensor));
+  ));
 #endif // __cccl_ptx_isa >= 800
 
 #if __cccl_ptx_isa >= 800
-  NV_IF_TARGET(
-    NV_PROVIDES_SM_90,
-    (
-      // cp.async.bulk.tensor.2d.global.shared::cta.tile.bulk_group [tensorMap, tensorCoords], [srcMem]; // 3b.
-      * fn_ptr++ = reinterpret_cast<void*>(
-        static_cast<void (*)(
-          cuda::ptx::space_global_t, cuda::ptx::space_shared_t, const void*, const int32_t(&)[2], const void*)>(
-          cuda::ptx::cp_async_bulk_tensor));));
+  NV_IF_TARGET(NV_PROVIDES_SM_90, (
+    // cp.async.bulk.tensor.2d.global.shared::cta.tile.bulk_group [tensorMap, tensorCoords], [srcMem]; // 3b.
+    *fn_ptr++ = reinterpret_cast<void*>(static_cast<void (*)(cuda::ptx::space_global_t, cuda::ptx::space_shared_t, const void* , const int32_t (&)[2], const void* )>(cuda::ptx::cp_async_bulk_tensor));
+  ));
 #endif // __cccl_ptx_isa >= 800
 
 #if __cccl_ptx_isa >= 800
-  NV_IF_TARGET(
-    NV_PROVIDES_SM_90,
-    (
-      // cp.async.bulk.tensor.3d.shared::cluster.global.tile.mbarrier::complete_tx::bytes [dstMem], [tensorMap,
-      // tensorCoords], [smem_bar];// 1c.
-      * fn_ptr++ = reinterpret_cast<void*>(
-        static_cast<void (*)(
-          cuda::ptx::space_cluster_t, cuda::ptx::space_global_t, void*, const void*, const int32_t(&)[3], uint64_t*)>(
-          cuda::ptx::cp_async_bulk_tensor));));
+  NV_IF_TARGET(NV_PROVIDES_SM_90, (
+    // cp.async.bulk.tensor.3d.shared::cluster.global.tile.mbarrier::complete_tx::bytes [dstMem], [tensorMap, tensorCoords], [smem_bar];// 1c.
+    *fn_ptr++ = reinterpret_cast<void*>(static_cast<void (*)(cuda::ptx::space_cluster_t, cuda::ptx::space_global_t, void* , const void* , const int32_t (&)[3], uint64_t* )>(cuda::ptx::cp_async_bulk_tensor));
+  ));
 #endif // __cccl_ptx_isa >= 800
 
 #if __cccl_ptx_isa >= 800
-  NV_IF_TARGET(
-    NV_PROVIDES_SM_90,
-    (
-      // cp.async.bulk.tensor.3d.global.shared::cta.tile.bulk_group [tensorMap, tensorCoords], [srcMem]; // 3c.
-      * fn_ptr++ = reinterpret_cast<void*>(
-        static_cast<void (*)(
-          cuda::ptx::space_global_t, cuda::ptx::space_shared_t, const void*, const int32_t(&)[3], const void*)>(
-          cuda::ptx::cp_async_bulk_tensor));));
+  NV_IF_TARGET(NV_PROVIDES_SM_90, (
+    // cp.async.bulk.tensor.3d.global.shared::cta.tile.bulk_group [tensorMap, tensorCoords], [srcMem]; // 3c.
+    *fn_ptr++ = reinterpret_cast<void*>(static_cast<void (*)(cuda::ptx::space_global_t, cuda::ptx::space_shared_t, const void* , const int32_t (&)[3], const void* )>(cuda::ptx::cp_async_bulk_tensor));
+  ));
 #endif // __cccl_ptx_isa >= 800
 
 #if __cccl_ptx_isa >= 800
-  NV_IF_TARGET(
-    NV_PROVIDES_SM_90,
-    (
-      // cp.async.bulk.tensor.4d.shared::cluster.global.tile.mbarrier::complete_tx::bytes [dstMem], [tensorMap,
-      // tensorCoords], [smem_bar];// 1d.
-      * fn_ptr++ = reinterpret_cast<void*>(
-        static_cast<void (*)(
-          cuda::ptx::space_cluster_t, cuda::ptx::space_global_t, void*, const void*, const int32_t(&)[4], uint64_t*)>(
-          cuda::ptx::cp_async_bulk_tensor));));
+  NV_IF_TARGET(NV_PROVIDES_SM_90, (
+    // cp.async.bulk.tensor.4d.shared::cluster.global.tile.mbarrier::complete_tx::bytes [dstMem], [tensorMap, tensorCoords], [smem_bar];// 1d.
+    *fn_ptr++ = reinterpret_cast<void*>(static_cast<void (*)(cuda::ptx::space_cluster_t, cuda::ptx::space_global_t, void* , const void* , const int32_t (&)[4], uint64_t* )>(cuda::ptx::cp_async_bulk_tensor));
+  ));
 #endif // __cccl_ptx_isa >= 800
 
 #if __cccl_ptx_isa >= 800
-  NV_IF_TARGET(
-    NV_PROVIDES_SM_90,
-    (
-      // cp.async.bulk.tensor.4d.global.shared::cta.tile.bulk_group [tensorMap, tensorCoords], [srcMem]; // 3d.
-      * fn_ptr++ = reinterpret_cast<void*>(
-        static_cast<void (*)(
-          cuda::ptx::space_global_t, cuda::ptx::space_shared_t, const void*, const int32_t(&)[4], const void*)>(
-          cuda::ptx::cp_async_bulk_tensor));));
+  NV_IF_TARGET(NV_PROVIDES_SM_90, (
+    // cp.async.bulk.tensor.4d.global.shared::cta.tile.bulk_group [tensorMap, tensorCoords], [srcMem]; // 3d.
+    *fn_ptr++ = reinterpret_cast<void*>(static_cast<void (*)(cuda::ptx::space_global_t, cuda::ptx::space_shared_t, const void* , const int32_t (&)[4], const void* )>(cuda::ptx::cp_async_bulk_tensor));
+  ));
 #endif // __cccl_ptx_isa >= 800
 
 #if __cccl_ptx_isa >= 800
-  NV_IF_TARGET(
-    NV_PROVIDES_SM_90,
-    (
-      // cp.async.bulk.tensor.5d.shared::cluster.global.tile.mbarrier::complete_tx::bytes [dstMem], [tensorMap,
-      // tensorCoords], [smem_bar];// 1e.
-      * fn_ptr++ = reinterpret_cast<void*>(
-        static_cast<void (*)(
-          cuda::ptx::space_cluster_t, cuda::ptx::space_global_t, void*, const void*, const int32_t(&)[5], uint64_t*)>(
-          cuda::ptx::cp_async_bulk_tensor));));
+  NV_IF_TARGET(NV_PROVIDES_SM_90, (
+    // cp.async.bulk.tensor.5d.shared::cluster.global.tile.mbarrier::complete_tx::bytes [dstMem], [tensorMap, tensorCoords], [smem_bar];// 1e.
+    *fn_ptr++ = reinterpret_cast<void*>(static_cast<void (*)(cuda::ptx::space_cluster_t, cuda::ptx::space_global_t, void* , const void* , const int32_t (&)[5], uint64_t* )>(cuda::ptx::cp_async_bulk_tensor));
+  ));
 #endif // __cccl_ptx_isa >= 800
 
 #if __cccl_ptx_isa >= 800
-  NV_IF_TARGET(
-    NV_PROVIDES_SM_90,
-    (
-      // cp.async.bulk.tensor.5d.global.shared::cta.tile.bulk_group [tensorMap, tensorCoords], [srcMem]; // 3e.
-      * fn_ptr++ = reinterpret_cast<void*>(
-        static_cast<void (*)(
-          cuda::ptx::space_global_t, cuda::ptx::space_shared_t, const void*, const int32_t(&)[5], const void*)>(
-          cuda::ptx::cp_async_bulk_tensor));));
+  NV_IF_TARGET(NV_PROVIDES_SM_90, (
+    // cp.async.bulk.tensor.5d.global.shared::cta.tile.bulk_group [tensorMap, tensorCoords], [srcMem]; // 3e.
+    *fn_ptr++ = reinterpret_cast<void*>(static_cast<void (*)(cuda::ptx::space_global_t, cuda::ptx::space_shared_t, const void* , const int32_t (&)[5], const void* )>(cuda::ptx::cp_async_bulk_tensor));
+  ));
 #endif // __cccl_ptx_isa >= 800
 }