Skip to content

Commit

Permalink
Regenerate PTX test
Browse files Browse the repository at this point in the history
Overwrites all generated PTX tests and runs `pre-commit run --all-files`
  • Loading branch information
bernhardmgruber committed Nov 25, 2024
1 parent 0b36a7d commit eba90a1
Show file tree
Hide file tree
Showing 55 changed files with 550 additions and 425 deletions.
Original file line number Diff line number Diff line change
@@ -1,3 +1,19 @@
// This file was automatically generated. Do not edit.

// We use a special strategy to force the generation of the PTX. This is mainly
// a fight against dead-code-elimination in the NVVM layer.
//
// We need this strategy because certain older versions of ptxas
// segfault when a nonsensical sequence of PTX is generated. So instead, we try
// to force the instantiation and compilation to PTX of all the overloads of the
// PTX wrapping functions.
//
// We do this by writing a function pointer of each overload to the kernel
// parameter `fn_ptr`.
//
// Because `fn_ptr` is possibly visible outside this translation unit, the
// compiler must compile all the functions which are stored.

__global__ void test_barrier_cluster(void** fn_ptr)
{
#if __cccl_ptx_isa >= 780
Expand Down
Original file line number Diff line number Diff line change
@@ -1,3 +1,19 @@
// This file was automatically generated. Do not edit.

// We use a special strategy to force the generation of the PTX. This is mainly
// a fight against dead-code-elimination in the NVVM layer.
//
// We need this strategy because certain older versions of ptxas
// segfault when a nonsensical sequence of PTX is generated. So instead, we try
// to force the instantiation and compilation to PTX of all the overloads of the
// PTX wrapping functions.
//
// We do this by writing a function pointer of each overload to the kernel
// parameter `fn_ptr`.
//
// Because `fn_ptr` is possibly visible outside this translation unit, the
// compiler must compile all the functions which are stored.

__global__ void test_cp_async_bulk(void** fn_ptr)
{
#if __cccl_ptx_isa >= 800
Expand Down
Original file line number Diff line number Diff line change
@@ -0,0 +1,26 @@
// This file was automatically generated. Do not edit.

// We use a special strategy to force the generation of the PTX. This is mainly
// a fight against dead-code-elimination in the NVVM layer.
//
// We need this strategy because certain older versions of ptxas
// segfault when a nonsensical sequence of PTX is generated. So instead, we try
// to force the instantiation and compilation to PTX of all the overloads of the
// PTX wrapping functions.
//
// We do this by writing a function pointer of each overload to the kernel
// parameter `fn_ptr`.
//
// Because `fn_ptr` is possibly visible outside this translation unit, the
// compiler must compile all the functions which are stored.

// Compile-only test kernel for `cuda::ptx::cp_async_bulk_commit_group`.
//
// The kernel never calls the PTX wrapper. It takes the address of the
// `void()` overload and stores it through `fn_ptr`, which (per the header
// comment of this generated file) keeps the wrapper from being eliminated as
// dead code, so it has to be compiled down to PTX.
//
// fn_ptr: output cursor; receives one `void*` per stored overload and is
//         advanced past each slot written. This kernel writes at most one
//         slot. NOTE(review): the caller presumably provides enough writable
//         slots -- confirm against the test driver.
//
// When `__cccl_ptx_isa < 800` the body is preprocessed away and the kernel is
// empty.
__global__ void test_cp_async_bulk_commit_group(void** fn_ptr)
{
#if __cccl_ptx_isa >= 800
NV_IF_TARGET(
NV_PROVIDES_SM_90,
(
// cp.async.bulk.commit_group;
// The static_cast pins the exact overload (no parameters, returns void);
// the reinterpret_cast erases the function-pointer type so it fits a void* slot.
* fn_ptr++ = reinterpret_cast<void*>(static_cast<void (*)()>(cuda::ptx::cp_async_bulk_commit_group));));
#endif // __cccl_ptx_isa >= 800
}

This file was deleted.

Original file line number Diff line number Diff line change
@@ -1,3 +1,19 @@
// This file was automatically generated. Do not edit.

// We use a special strategy to force the generation of the PTX. This is mainly
// a fight against dead-code-elimination in the NVVM layer.
//
// We need this strategy because certain older versions of ptxas
// segfault when a nonsensical sequence of PTX is generated. So instead, we try
// to force the instantiation and compilation to PTX of all the overloads of the
// PTX wrapping functions.
//
// We do this by writing a function pointer of each overload to the kernel
// parameter `fn_ptr`.
//
// Because `fn_ptr` is possibly visible outside this translation unit, the
// compiler must compile all the functions which are stored.

__global__ void test_cp_async_bulk_multicast(void** fn_ptr)
{
#if __cccl_ptx_isa >= 800
Expand Down
Original file line number Diff line number Diff line change
@@ -1,3 +1,19 @@
// This file was automatically generated. Do not edit.

// We use a special strategy to force the generation of the PTX. This is mainly
// a fight against dead-code-elimination in the NVVM layer.
//
// We need this strategy because certain older versions of ptxas
// segfault when a nonsensical sequence of PTX is generated. So instead, we try
// to force the instantiation and compilation to PTX of all the overloads of the
// PTX wrapping functions.
//
// We do this by writing a function pointer of each overload to the kernel
// parameter `fn_ptr`.
//
// Because `fn_ptr` is possibly visible outside this translation unit, the
// compiler must compile all the functions which are stored.

__global__ void test_cp_async_bulk_tensor(void** fn_ptr)
{
#if __cccl_ptx_isa >= 800
Expand Down
Original file line number Diff line number Diff line change
@@ -1,3 +1,19 @@
// This file was automatically generated. Do not edit.

// We use a special strategy to force the generation of the PTX. This is mainly
// a fight against dead-code-elimination in the NVVM layer.
//
// We need this strategy because certain older versions of ptxas
// segfault when a nonsensical sequence of PTX is generated. So instead, we try
// to force the instantiation and compilation to PTX of all the overloads of the
// PTX wrapping functions.
//
// We do this by writing a function pointer of each overload to the kernel
// parameter `fn_ptr`.
//
// Because `fn_ptr` is possibly visible outside this translation unit, the
// compiler must compile all the functions which are stored.

__global__ void test_cp_async_bulk_tensor_multicast(void** fn_ptr)
{
#if __cccl_ptx_isa >= 800
Expand Down
Original file line number Diff line number Diff line change
@@ -1,3 +1,19 @@
// This file was automatically generated. Do not edit.

// We use a special strategy to force the generation of the PTX. This is mainly
// a fight against dead-code-elimination in the NVVM layer.
//
// We need this strategy because certain older versions of ptxas
// segfault when a nonsensical sequence of PTX is generated. So instead, we try
// to force the instantiation and compilation to PTX of all the overloads of the
// PTX wrapping functions.
//
// We do this by writing a function pointer of each overload to the kernel
// parameter `fn_ptr`.
//
// Because `fn_ptr` is possibly visible outside this translation unit, the
// compiler must compile all the functions which are stored.

__global__ void test_cp_async_bulk_wait_group(void** fn_ptr)
{
#if __cccl_ptx_isa >= 800
Expand Down
Original file line number Diff line number Diff line change
@@ -1,3 +1,19 @@
// This file was automatically generated. Do not edit.

// We use a special strategy to force the generation of the PTX. This is mainly
// a fight against dead-code-elimination in the NVVM layer.
//
// We need this strategy because certain older versions of ptxas
// segfault when a nonsensical sequence of PTX is generated. So instead, we try
// to force the instantiation and compilation to PTX of all the overloads of the
// PTX wrapping functions.
//
// We do this by writing a function pointer of each overload to the kernel
// parameter `fn_ptr`.
//
// Because `fn_ptr` is possibly visible outside this translation unit, the
// compiler must compile all the functions which are stored.

__global__ void test_cp_reduce_async_bulk(void** fn_ptr)
{
#if __cccl_ptx_isa >= 800
Expand Down
Original file line number Diff line number Diff line change
@@ -1,6 +1,22 @@
// This file was automatically generated. Do not edit.

// We use a special strategy to force the generation of the PTX. This is mainly
// a fight against dead-code-elimination in the NVVM layer.
//
// We need this strategy because certain older versions of ptxas
// segfault when a nonsensical sequence of PTX is generated. So instead, we try
// to force the instantiation and compilation to PTX of all the overloads of the
// PTX wrapping functions.
//
// We do this by writing a function pointer of each overload to the kernel
// parameter `fn_ptr`.
//
// Because `fn_ptr` is possibly visible outside this translation unit, the
// compiler must compile all the functions which are stored.

__global__ void test_cp_reduce_async_bulk_bf16(void** fn_ptr)
{
# if __cccl_ptx_isa >= 800
#if __cccl_ptx_isa >= 800
NV_IF_TARGET(
NV_PROVIDES_SM_90,
(
Expand All @@ -12,9 +28,9 @@ __global__ void test_cp_reduce_async_bulk_bf16(void** fn_ptr)
__nv_bfloat16*,
const __nv_bfloat16*,
uint32_t)>(cuda::ptx::cp_reduce_async_bulk));));
# endif // __cccl_ptx_isa >= 800
#endif // __cccl_ptx_isa >= 800

# if __cccl_ptx_isa >= 800
#if __cccl_ptx_isa >= 800
NV_IF_TARGET(
NV_PROVIDES_SM_90,
(
Expand All @@ -26,9 +42,9 @@ __global__ void test_cp_reduce_async_bulk_bf16(void** fn_ptr)
__nv_bfloat16*,
const __nv_bfloat16*,
uint32_t)>(cuda::ptx::cp_reduce_async_bulk));));
# endif // __cccl_ptx_isa >= 800
#endif // __cccl_ptx_isa >= 800

# if __cccl_ptx_isa >= 800
#if __cccl_ptx_isa >= 800
NV_IF_TARGET(
NV_PROVIDES_SM_90,
(
Expand All @@ -40,5 +56,5 @@ __global__ void test_cp_reduce_async_bulk_bf16(void** fn_ptr)
__nv_bfloat16*,
const __nv_bfloat16*,
uint32_t)>(cuda::ptx::cp_reduce_async_bulk));));
# endif // __cccl_ptx_isa >= 800
#endif // __cccl_ptx_isa >= 800
}
Original file line number Diff line number Diff line change
@@ -1,6 +1,22 @@
// This file was automatically generated. Do not edit.

// We use a special strategy to force the generation of the PTX. This is mainly
// a fight against dead-code-elimination in the NVVM layer.
//
// We need this strategy because certain older versions of ptxas
// segfault when a nonsensical sequence of PTX is generated. So instead, we try
// to force the instantiation and compilation to PTX of all the overloads of the
// PTX wrapping functions.
//
// We do this by writing a function pointer of each overload to the kernel
// parameter `fn_ptr`.
//
// Because `fn_ptr` is possibly visible outside this translation unit, the
// compiler must compile all the functions which are stored.

__global__ void test_cp_reduce_async_bulk_f16(void** fn_ptr)
{
# if __cccl_ptx_isa >= 800
#if __cccl_ptx_isa >= 800
NV_IF_TARGET(
NV_PROVIDES_SM_90,
(
Expand All @@ -9,9 +25,9 @@ __global__ void test_cp_reduce_async_bulk_f16(void** fn_ptr)
static_cast<void (*)(
cuda::ptx::space_global_t, cuda::ptx::space_shared_t, cuda::ptx::op_min_t, __half*, const __half*, uint32_t)>(
cuda::ptx::cp_reduce_async_bulk));));
# endif // __cccl_ptx_isa >= 800
#endif // __cccl_ptx_isa >= 800

# if __cccl_ptx_isa >= 800
#if __cccl_ptx_isa >= 800
NV_IF_TARGET(
NV_PROVIDES_SM_90,
(
Expand All @@ -20,9 +36,9 @@ __global__ void test_cp_reduce_async_bulk_f16(void** fn_ptr)
static_cast<void (*)(
cuda::ptx::space_global_t, cuda::ptx::space_shared_t, cuda::ptx::op_max_t, __half*, const __half*, uint32_t)>(
cuda::ptx::cp_reduce_async_bulk));));
# endif // __cccl_ptx_isa >= 800
#endif // __cccl_ptx_isa >= 800

# if __cccl_ptx_isa >= 800
#if __cccl_ptx_isa >= 800
NV_IF_TARGET(
NV_PROVIDES_SM_90,
(
Expand All @@ -31,5 +47,5 @@ __global__ void test_cp_reduce_async_bulk_f16(void** fn_ptr)
static_cast<void (*)(
cuda::ptx::space_global_t, cuda::ptx::space_shared_t, cuda::ptx::op_add_t, __half*, const __half*, uint32_t)>(
cuda::ptx::cp_reduce_async_bulk));));
# endif // __cccl_ptx_isa >= 800
#endif // __cccl_ptx_isa >= 800
}
Original file line number Diff line number Diff line change
@@ -1,3 +1,19 @@
// This file was automatically generated. Do not edit.

// We use a special strategy to force the generation of the PTX. This is mainly
// a fight against dead-code-elimination in the NVVM layer.
//
// We need this strategy because certain older versions of ptxas
// segfault when a nonsensical sequence of PTX is generated. So instead, we try
// to force the instantiation and compilation to PTX of all the overloads of the
// PTX wrapping functions.
//
// We do this by writing a function pointer of each overload to the kernel
// parameter `fn_ptr`.
//
// Because `fn_ptr` is possibly visible outside this translation unit, the
// compiler must compile all the functions which are stored.

__global__ void test_cp_reduce_async_bulk_tensor(void** fn_ptr)
{
#if __cccl_ptx_isa >= 800
Expand Down
Original file line number Diff line number Diff line change
@@ -1,3 +1,19 @@
// This file was automatically generated. Do not edit.

// We use a special strategy to force the generation of the PTX. This is mainly
// a fight against dead-code-elimination in the NVVM layer.
//
// We need this strategy because certain older versions of ptxas
// segfault when a nonsensical sequence of PTX is generated. So instead, we try
// to force the instantiation and compilation to PTX of all the overloads of the
// PTX wrapping functions.
//
// We do this by writing a function pointer of each overload to the kernel
// parameter `fn_ptr`.
//
// Because `fn_ptr` is possibly visible outside this translation unit, the
// compiler must compile all the functions which are stored.

__global__ void test_fence(void** fn_ptr)
{
#if __cccl_ptx_isa >= 600
Expand Down
Original file line number Diff line number Diff line change
@@ -0,0 +1,27 @@
// This file was automatically generated. Do not edit.

// We use a special strategy to force the generation of the PTX. This is mainly
// a fight against dead-code-elimination in the NVVM layer.
//
// We need this strategy because certain older versions of ptxas
// segfault when a nonsensical sequence of PTX is generated. So instead, we try
// to force the instantiation and compilation to PTX of all the overloads of the
// PTX wrapping functions.
//
// We do this by writing a function pointer of each overload to the kernel
// parameter `fn_ptr`.
//
// Because `fn_ptr` is possibly visible outside this translation unit, the
// compiler must compile all the functions which are stored.

// Compile-only test kernel for `cuda::ptx::fence_mbarrier_init`.
//
// The kernel never calls the PTX wrapper. It takes the address of the
// `(sem_release_t, scope_cluster_t)` overload and stores it through `fn_ptr`,
// which (per the header comment of this generated file) keeps the wrapper
// from being eliminated as dead code, so it has to be compiled down to PTX.
//
// fn_ptr: output cursor; receives one `void*` per stored overload and is
//         advanced past each slot written. This kernel writes at most one
//         slot. NOTE(review): the caller presumably provides enough writable
//         slots -- confirm against the test driver.
//
// When `__cccl_ptx_isa < 800` the body is preprocessed away and the kernel is
// empty.
__global__ void test_fence_mbarrier_init(void** fn_ptr)
{
#if __cccl_ptx_isa >= 800
NV_IF_TARGET(
NV_PROVIDES_SM_90,
(
// fence.mbarrier_init.release.cluster; // 3.
// The static_cast pins the overload taking the release-semantics and
// cluster-scope tag types; the reinterpret_cast erases the function-pointer
// type so it fits a void* slot.
* fn_ptr++ = reinterpret_cast<void*>(static_cast<void (*)(cuda::ptx::sem_release_t, cuda::ptx::scope_cluster_t)>(
cuda::ptx::fence_mbarrier_init));));
#endif // __cccl_ptx_isa >= 800
}

This file was deleted.

25 changes: 25 additions & 0 deletions libcudacxx/test/libcudacxx/cuda/ptx/generated/fence_proxy_alias.h
Original file line number Diff line number Diff line change
@@ -0,0 +1,25 @@
// This file was automatically generated. Do not edit.

// We use a special strategy to force the generation of the PTX. This is mainly
// a fight against dead-code-elimination in the NVVM layer.
//
// We need this strategy because certain older versions of ptxas
// segfault when a nonsensical sequence of PTX is generated. So instead, we try
// to force the instantiation and compilation to PTX of all the overloads of the
// PTX wrapping functions.
//
// We do this by writing a function pointer of each overload to the kernel
// parameter `fn_ptr`.
//
// Because `fn_ptr` is possibly visible outside this translation unit, the
// compiler must compile all the functions which are stored.

// Compile-only test kernel for `cuda::ptx::fence_proxy_alias`.
//
// The kernel never calls the PTX wrapper. It takes the address of the
// `void()` overload and stores it through `fn_ptr`, which (per the header
// comment of this generated file) keeps the wrapper from being eliminated as
// dead code, so it has to be compiled down to PTX.
//
// fn_ptr: output cursor; receives one `void*` per stored overload and is
//         advanced past each slot written. This kernel writes at most one
//         slot. NOTE(review): the caller presumably provides enough writable
//         slots -- confirm against the test driver.
//
// When `__cccl_ptx_isa < 750` the body is preprocessed away and the kernel is
// empty.
__global__ void test_fence_proxy_alias(void** fn_ptr)
{
#if __cccl_ptx_isa >= 750
NV_IF_TARGET(NV_PROVIDES_SM_70,
(
// fence.proxy.alias; // 4.
// The static_cast pins the exact overload (no parameters, returns void);
// the reinterpret_cast erases the function-pointer type so it fits a void* slot.
* fn_ptr++ = reinterpret_cast<void*>(static_cast<void (*)()>(cuda::ptx::fence_proxy_alias));));
#endif // __cccl_ptx_isa >= 750
}
Loading

0 comments on commit eba90a1

Please sign in to comment.