Regenerate PTX test

Overwrites all generated PTX tests and runs `pre-commit run --all-files`
NVIDIA · Nov 25, 2024 · eba90a1 · eba90a1
1 parent 0b36a7d
commit eba90a1
Show file tree

Hide file tree

Showing 55 changed files with 550 additions and 425 deletions.
diff --git a/...xx/cuda/ptx/generated/barrier_cluster.inc → ...acxx/cuda/ptx/generated/barrier_cluster.h b/...xx/cuda/ptx/generated/barrier_cluster.inc → ...acxx/cuda/ptx/generated/barrier_cluster.h
@@ -1,3 +1,19 @@
+// This file was automatically generated. Do not edit.
+
+// We use a special strategy to force the generation of the PTX. This is mainly
+// a fight against dead-code-elimination in the NVVM layer.
+//
+// The reason we need this strategy is because certain older versions of ptxas
+// segfault when a non-sensical sequence of PTX is generated. So instead, we try
+// to force the instantiation and compilation to PTX of all the overloads of the
+// PTX wrapping functions.
+//
+// We do this by writing a function pointer of each overload to the kernel
+// parameter `fn_ptr`.
+//
+// Because `fn_ptr` is possibly visible outside this translation unit, the
+// compiler must compile all the functions which are stored.
+
 __global__ void test_barrier_cluster(void** fn_ptr)
 {
 #if __cccl_ptx_isa >= 780

diff --git a/...acxx/cuda/ptx/generated/cp_async_bulk.inc → ...udacxx/cuda/ptx/generated/cp_async_bulk.h b/...acxx/cuda/ptx/generated/cp_async_bulk.inc → ...udacxx/cuda/ptx/generated/cp_async_bulk.h
@@ -1,3 +1,19 @@
+// This file was automatically generated. Do not edit.
+
+// We use a special strategy to force the generation of the PTX. This is mainly
+// a fight against dead-code-elimination in the NVVM layer.
+//
+// The reason we need this strategy is because certain older versions of ptxas
+// segfault when a non-sensical sequence of PTX is generated. So instead, we try
+// to force the instantiation and compilation to PTX of all the overloads of the
+// PTX wrapping functions.
+//
+// We do this by writing a function pointer of each overload to the kernel
+// parameter `fn_ptr`.
+//
+// Because `fn_ptr` is possibly visible outside this translation unit, the
+// compiler must compile all the functions which are stored.
+
 __global__ void test_cp_async_bulk(void** fn_ptr)
 {
 #if __cccl_ptx_isa >= 800

diff --git a/libcudacxx/test/libcudacxx/cuda/ptx/generated/cp_async_bulk_commit_group.h b/libcudacxx/test/libcudacxx/cuda/ptx/generated/cp_async_bulk_commit_group.h
@@ -0,0 +1,26 @@
+// This file was automatically generated. Do not edit.
+
+// We use a special strategy to force the generation of the PTX. This is mainly
+// a fight against dead-code-elimination in the NVVM layer.
+//
+// The reason we need this strategy is because certain older versions of ptxas
+// segfault when a non-sensical sequence of PTX is generated. So instead, we try
+// to force the instantiation and compilation to PTX of all the overloads of the
+// PTX wrapping functions.
+//
+// We do this by writing a function pointer of each overload to the kernel
+// parameter `fn_ptr`.
+//
+// Because `fn_ptr` is possibly visible outside this translation unit, the
+// compiler must compile all the functions which are stored.
+
+__global__ void test_cp_async_bulk_commit_group(void** fn_ptr) // Stores wrapper addresses through fn_ptr to force their compilation to PTX.
+{
+#if __cccl_ptx_isa >= 800
+  NV_IF_TARGET(
+    NV_PROVIDES_SM_90,
+    (
+        // cp.async.bulk.commit_group; -- PTX instruction whose wrapper is referenced below. The store writes the address of the zero-argument, void-returning overload to *fn_ptr and then advances fn_ptr.
+        * fn_ptr++ = reinterpret_cast<void*>(static_cast<void (*)()>(cuda::ptx::cp_async_bulk_commit_group));));
+#endif // __cccl_ptx_isa >= 800
+}
diff --git a/libcudacxx/test/libcudacxx/cuda/ptx/generated/cp_async_bulk_commit_group.inc b/libcudacxx/test/libcudacxx/cuda/ptx/generated/cp_async_bulk_commit_group.inc
diff --git a/...ptx/generated/cp_async_bulk_multicast.inc → ...a/ptx/generated/cp_async_bulk_multicast.h b/...ptx/generated/cp_async_bulk_multicast.inc → ...a/ptx/generated/cp_async_bulk_multicast.h
@@ -1,3 +1,19 @@
+// This file was automatically generated. Do not edit.
+
+// We use a special strategy to force the generation of the PTX. This is mainly
+// a fight against dead-code-elimination in the NVVM layer.
+//
+// The reason we need this strategy is because certain older versions of ptxas
+// segfault when a non-sensical sequence of PTX is generated. So instead, we try
+// to force the instantiation and compilation to PTX of all the overloads of the
+// PTX wrapping functions.
+//
+// We do this by writing a function pointer of each overload to the kernel
+// parameter `fn_ptr`.
+//
+// Because `fn_ptr` is possibly visible outside this translation unit, the
+// compiler must compile all the functions which are stored.
+
 __global__ void test_cp_async_bulk_multicast(void** fn_ptr)
 {
 #if __cccl_ptx_isa >= 800

diff --git a/...da/ptx/generated/cp_async_bulk_tensor.inc → ...cuda/ptx/generated/cp_async_bulk_tensor.h b/...da/ptx/generated/cp_async_bulk_tensor.inc → ...cuda/ptx/generated/cp_async_bulk_tensor.h
@@ -1,3 +1,19 @@
+// This file was automatically generated. Do not edit.
+
+// We use a special strategy to force the generation of the PTX. This is mainly
+// a fight against dead-code-elimination in the NVVM layer.
+//
+// The reason we need this strategy is because certain older versions of ptxas
+// segfault when a non-sensical sequence of PTX is generated. So instead, we try
+// to force the instantiation and compilation to PTX of all the overloads of the
+// PTX wrapping functions.
+//
+// We do this by writing a function pointer of each overload to the kernel
+// parameter `fn_ptr`.
+//
+// Because `fn_ptr` is possibly visible outside this translation unit, the
+// compiler must compile all the functions which are stored.
+
 __global__ void test_cp_async_bulk_tensor(void** fn_ptr)
 {
 #if __cccl_ptx_isa >= 800

diff --git a/...erated/cp_async_bulk_tensor_multicast.inc → ...enerated/cp_async_bulk_tensor_multicast.h b/...erated/cp_async_bulk_tensor_multicast.inc → ...enerated/cp_async_bulk_tensor_multicast.h
@@ -1,3 +1,19 @@
+// This file was automatically generated. Do not edit.
+
+// We use a special strategy to force the generation of the PTX. This is mainly
+// a fight against dead-code-elimination in the NVVM layer.
+//
+// The reason we need this strategy is because certain older versions of ptxas
+// segfault when a non-sensical sequence of PTX is generated. So instead, we try
+// to force the instantiation and compilation to PTX of all the overloads of the
+// PTX wrapping functions.
+//
+// We do this by writing a function pointer of each overload to the kernel
+// parameter `fn_ptr`.
+//
+// Because `fn_ptr` is possibly visible outside this translation unit, the
+// compiler must compile all the functions which are stored.
+
 __global__ void test_cp_async_bulk_tensor_multicast(void** fn_ptr)
 {
 #if __cccl_ptx_isa >= 800

diff --git a/...tx/generated/cp_async_bulk_wait_group.inc → .../ptx/generated/cp_async_bulk_wait_group.h b/...tx/generated/cp_async_bulk_wait_group.inc → .../ptx/generated/cp_async_bulk_wait_group.h
@@ -1,3 +1,19 @@
+// This file was automatically generated. Do not edit.
+
+// We use a special strategy to force the generation of the PTX. This is mainly
+// a fight against dead-code-elimination in the NVVM layer.
+//
+// The reason we need this strategy is because certain older versions of ptxas
+// segfault when a non-sensical sequence of PTX is generated. So instead, we try
+// to force the instantiation and compilation to PTX of all the overloads of the
+// PTX wrapping functions.
+//
+// We do this by writing a function pointer of each overload to the kernel
+// parameter `fn_ptr`.
+//
+// Because `fn_ptr` is possibly visible outside this translation unit, the
+// compiler must compile all the functions which are stored.
+
 __global__ void test_cp_async_bulk_wait_group(void** fn_ptr)
 {
 #if __cccl_ptx_isa >= 800

diff --git a/...da/ptx/generated/cp_reduce_async_bulk.inc → ...cuda/ptx/generated/cp_reduce_async_bulk.h b/...da/ptx/generated/cp_reduce_async_bulk.inc → ...cuda/ptx/generated/cp_reduce_async_bulk.h
@@ -1,3 +1,19 @@
+// This file was automatically generated. Do not edit.
+
+// We use a special strategy to force the generation of the PTX. This is mainly
+// a fight against dead-code-elimination in the NVVM layer.
+//
+// The reason we need this strategy is because certain older versions of ptxas
+// segfault when a non-sensical sequence of PTX is generated. So instead, we try
+// to force the instantiation and compilation to PTX of all the overloads of the
+// PTX wrapping functions.
+//
+// We do this by writing a function pointer of each overload to the kernel
+// parameter `fn_ptr`.
+//
+// Because `fn_ptr` is possibly visible outside this translation unit, the
+// compiler must compile all the functions which are stored.
+
 __global__ void test_cp_reduce_async_bulk(void** fn_ptr)
 {
 #if __cccl_ptx_isa >= 800

diff --git a/...x/generated/cp_reduce_async_bulk_bf16.inc → ...ptx/generated/cp_reduce_async_bulk_bf16.h b/...x/generated/cp_reduce_async_bulk_bf16.inc → ...ptx/generated/cp_reduce_async_bulk_bf16.h
@@ -1,6 +1,22 @@
+// This file was automatically generated. Do not edit.
+
+// We use a special strategy to force the generation of the PTX. This is mainly
+// a fight against dead-code-elimination in the NVVM layer.
+//
+// The reason we need this strategy is because certain older versions of ptxas
+// segfault when a non-sensical sequence of PTX is generated. So instead, we try
+// to force the instantiation and compilation to PTX of all the overloads of the
+// PTX wrapping functions.
+//
+// We do this by writing a function pointer of each overload to the kernel
+// parameter `fn_ptr`.
+//
+// Because `fn_ptr` is possibly visible outside this translation unit, the
+// compiler must compile all the functions which are stored.
+
 __global__ void test_cp_reduce_async_bulk_bf16(void** fn_ptr)
 {
-#  if __cccl_ptx_isa >= 800
+#if __cccl_ptx_isa >= 800
   NV_IF_TARGET(
     NV_PROVIDES_SM_90,
     (
@@ -12,9 +28,9 @@ __global__ void test_cp_reduce_async_bulk_bf16(void** fn_ptr)
                                __nv_bfloat16*,
                                const __nv_bfloat16*,
                                uint32_t)>(cuda::ptx::cp_reduce_async_bulk));));
-#  endif // __cccl_ptx_isa >= 800
+#endif // __cccl_ptx_isa >= 800
 
-#  if __cccl_ptx_isa >= 800
+#if __cccl_ptx_isa >= 800
   NV_IF_TARGET(
     NV_PROVIDES_SM_90,
     (
@@ -26,9 +42,9 @@ __global__ void test_cp_reduce_async_bulk_bf16(void** fn_ptr)
                                __nv_bfloat16*,
                                const __nv_bfloat16*,
                                uint32_t)>(cuda::ptx::cp_reduce_async_bulk));));
-#  endif // __cccl_ptx_isa >= 800
+#endif // __cccl_ptx_isa >= 800
 
-#  if __cccl_ptx_isa >= 800
+#if __cccl_ptx_isa >= 800
   NV_IF_TARGET(
     NV_PROVIDES_SM_90,
     (
@@ -40,5 +56,5 @@ __global__ void test_cp_reduce_async_bulk_bf16(void** fn_ptr)
                                __nv_bfloat16*,
                                const __nv_bfloat16*,
                                uint32_t)>(cuda::ptx::cp_reduce_async_bulk));));
-#  endif // __cccl_ptx_isa >= 800
+#endif // __cccl_ptx_isa >= 800
 }
diff --git a/...tx/generated/cp_reduce_async_bulk_f16.inc → .../ptx/generated/cp_reduce_async_bulk_f16.h b/...tx/generated/cp_reduce_async_bulk_f16.inc → .../ptx/generated/cp_reduce_async_bulk_f16.h
@@ -1,6 +1,22 @@
+// This file was automatically generated. Do not edit.
+
+// We use a special strategy to force the generation of the PTX. This is mainly
+// a fight against dead-code-elimination in the NVVM layer.
+//
+// The reason we need this strategy is because certain older versions of ptxas
+// segfault when a non-sensical sequence of PTX is generated. So instead, we try
+// to force the instantiation and compilation to PTX of all the overloads of the
+// PTX wrapping functions.
+//
+// We do this by writing a function pointer of each overload to the kernel
+// parameter `fn_ptr`.
+//
+// Because `fn_ptr` is possibly visible outside this translation unit, the
+// compiler must compile all the functions which are stored.
+
 __global__ void test_cp_reduce_async_bulk_f16(void** fn_ptr)
 {
-#  if __cccl_ptx_isa >= 800
+#if __cccl_ptx_isa >= 800
   NV_IF_TARGET(
     NV_PROVIDES_SM_90,
     (
@@ -9,9 +25,9 @@ __global__ void test_cp_reduce_async_bulk_f16(void** fn_ptr)
           static_cast<void (*)(
             cuda::ptx::space_global_t, cuda::ptx::space_shared_t, cuda::ptx::op_min_t, __half*, const __half*, uint32_t)>(
             cuda::ptx::cp_reduce_async_bulk));));
-#  endif // __cccl_ptx_isa >= 800
+#endif // __cccl_ptx_isa >= 800
 
-#  if __cccl_ptx_isa >= 800
+#if __cccl_ptx_isa >= 800
   NV_IF_TARGET(
     NV_PROVIDES_SM_90,
     (
@@ -20,9 +36,9 @@ __global__ void test_cp_reduce_async_bulk_f16(void** fn_ptr)
           static_cast<void (*)(
             cuda::ptx::space_global_t, cuda::ptx::space_shared_t, cuda::ptx::op_max_t, __half*, const __half*, uint32_t)>(
             cuda::ptx::cp_reduce_async_bulk));));
-#  endif // __cccl_ptx_isa >= 800
+#endif // __cccl_ptx_isa >= 800
 
-#  if __cccl_ptx_isa >= 800
+#if __cccl_ptx_isa >= 800
   NV_IF_TARGET(
     NV_PROVIDES_SM_90,
     (
@@ -31,5 +47,5 @@ __global__ void test_cp_reduce_async_bulk_f16(void** fn_ptr)
           static_cast<void (*)(
             cuda::ptx::space_global_t, cuda::ptx::space_shared_t, cuda::ptx::op_add_t, __half*, const __half*, uint32_t)>(
             cuda::ptx::cp_reduce_async_bulk));));
-#  endif // __cccl_ptx_isa >= 800
+#endif // __cccl_ptx_isa >= 800
 }
diff --git a/...generated/cp_reduce_async_bulk_tensor.inc → ...x/generated/cp_reduce_async_bulk_tensor.h b/...generated/cp_reduce_async_bulk_tensor.inc → ...x/generated/cp_reduce_async_bulk_tensor.h
@@ -1,3 +1,19 @@
+// This file was automatically generated. Do not edit.
+
+// We use a special strategy to force the generation of the PTX. This is mainly
+// a fight against dead-code-elimination in the NVVM layer.
+//
+// The reason we need this strategy is because certain older versions of ptxas
+// segfault when a non-sensical sequence of PTX is generated. So instead, we try
+// to force the instantiation and compilation to PTX of all the overloads of the
+// PTX wrapping functions.
+//
+// We do this by writing a function pointer of each overload to the kernel
+// parameter `fn_ptr`.
+//
+// Because `fn_ptr` is possibly visible outside this translation unit, the
+// compiler must compile all the functions which are stored.
+
 __global__ void test_cp_reduce_async_bulk_tensor(void** fn_ptr)
 {
 #if __cccl_ptx_isa >= 800

diff --git a/...t/libcudacxx/cuda/ptx/generated/fence.inc → ...est/libcudacxx/cuda/ptx/generated/fence.h b/...t/libcudacxx/cuda/ptx/generated/fence.inc → ...est/libcudacxx/cuda/ptx/generated/fence.h
@@ -1,3 +1,19 @@
+// This file was automatically generated. Do not edit.
+
+// We use a special strategy to force the generation of the PTX. This is mainly
+// a fight against dead-code-elimination in the NVVM layer.
+//
+// The reason we need this strategy is because certain older versions of ptxas
+// segfault when a non-sensical sequence of PTX is generated. So instead, we try
+// to force the instantiation and compilation to PTX of all the overloads of the
+// PTX wrapping functions.
+//
+// We do this by writing a function pointer of each overload to the kernel
+// parameter `fn_ptr`.
+//
+// Because `fn_ptr` is possibly visible outside this translation unit, the
+// compiler must compile all the functions which are stored.
+
 __global__ void test_fence(void** fn_ptr)
 {
 #if __cccl_ptx_isa >= 600

diff --git a/libcudacxx/test/libcudacxx/cuda/ptx/generated/fence_mbarrier_init.h b/libcudacxx/test/libcudacxx/cuda/ptx/generated/fence_mbarrier_init.h
@@ -0,0 +1,27 @@
+// This file was automatically generated. Do not edit.
+
+// We use a special strategy to force the generation of the PTX. This is mainly
+// a fight against dead-code-elimination in the NVVM layer.
+//
+// The reason we need this strategy is because certain older versions of ptxas
+// segfault when a non-sensical sequence of PTX is generated. So instead, we try
+// to force the instantiation and compilation to PTX of all the overloads of the
+// PTX wrapping functions.
+//
+// We do this by writing a function pointer of each overload to the kernel
+// parameter `fn_ptr`.
+//
+// Because `fn_ptr` is possibly visible outside this translation unit, the
+// compiler must compile all the functions which are stored.
+
+__global__ void test_fence_mbarrier_init(void** fn_ptr) // Stores wrapper addresses through fn_ptr to force their compilation to PTX.
+{
+#if __cccl_ptx_isa >= 800
+  NV_IF_TARGET(
+    NV_PROVIDES_SM_90,
+    (
+        // fence.mbarrier_init.release.cluster; // 3. -- PTX instruction whose wrapper is referenced below; the overload is selected by the (sem_release_t, scope_cluster_t) tag parameters. NOTE(review): the trailing "3." looks like a generator variant label -- confirm its meaning.
+        * fn_ptr++ = reinterpret_cast<void*>(static_cast<void (*)(cuda::ptx::sem_release_t, cuda::ptx::scope_cluster_t)>(
+          cuda::ptx::fence_mbarrier_init));));
+#endif // __cccl_ptx_isa >= 800
+}
diff --git a/libcudacxx/test/libcudacxx/cuda/ptx/generated/fence_mbarrier_init.inc b/libcudacxx/test/libcudacxx/cuda/ptx/generated/fence_mbarrier_init.inc
diff --git a/libcudacxx/test/libcudacxx/cuda/ptx/generated/fence_proxy_alias.h b/libcudacxx/test/libcudacxx/cuda/ptx/generated/fence_proxy_alias.h
@@ -0,0 +1,25 @@
+// This file was automatically generated. Do not edit.
+
+// We use a special strategy to force the generation of the PTX. This is mainly
+// a fight against dead-code-elimination in the NVVM layer.
+//
+// The reason we need this strategy is because certain older versions of ptxas
+// segfault when a non-sensical sequence of PTX is generated. So instead, we try
+// to force the instantiation and compilation to PTX of all the overloads of the
+// PTX wrapping functions.
+//
+// We do this by writing a function pointer of each overload to the kernel
+// parameter `fn_ptr`.
+//
+// Because `fn_ptr` is possibly visible outside this translation unit, the
+// compiler must compile all the functions which are stored.
+
+__global__ void test_fence_proxy_alias(void** fn_ptr) // Stores wrapper addresses through fn_ptr to force their compilation to PTX.
+{
+#if __cccl_ptx_isa >= 750
+  NV_IF_TARGET(NV_PROVIDES_SM_70,
+               (
+                   // fence.proxy.alias; // 4. -- PTX instruction whose wrapper is referenced below. The store writes the address of the zero-argument, void-returning overload to *fn_ptr and then advances fn_ptr. NOTE(review): the trailing "4." looks like a generator variant label -- confirm its meaning.
+                   * fn_ptr++ = reinterpret_cast<void*>(static_cast<void (*)()>(cuda::ptx::fence_proxy_alias));));
+#endif // __cccl_ptx_isa >= 750
+}