Regenerate PTX test
Overwrites all generated PTX tests and runs `pre-commit run --all-files`
bernhardmgruber committed Nov 25, 2024
1 parent 0b36a7d commit 6ffc627
Showing 28 changed files with 829 additions and 1,781 deletions.
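
As the hunks below show, every generated test follows the same pattern: it stores the address of a cuda::ptx overload through a static_cast to the exact function-pointer type, so the file compiles only if that overload exists with that signature; no PTX is executed at run time. The following is a minimal sketch of that pattern for orientation only. It is not part of the commit, the kernel name is made up, and it assumes CCCL's <cuda/ptx> and <nv/target> headers are available.

// Minimal sketch of the generated-test pattern (kernel name is hypothetical).
#include <cuda/ptx>
#include <nv/target>

__global__ void sketch_ptx_overload_smoke_test(void** fn_ptr)
{
#if __cccl_ptx_isa >= 780
  NV_IF_TARGET(NV_PROVIDES_SM_90, (
    // Compiles only if the nullary cuda::ptx::barrier_cluster_arrive overload exists.
    *fn_ptr++ = reinterpret_cast<void*>(static_cast<void (*)()>(cuda::ptx::barrier_cluster_arrive));
  ));
#endif // __cccl_ptx_isa >= 780
}

Writing each pointer into *fn_ptr++ keeps the overload referenced (odr-used) instead of being discarded as an unused expression.
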
libcudacxx/test/libcudacxx/cuda/ptx/generated/barrier_cluster.inc (46 changes: 21 additions & 25 deletions)
@@ -1,40 +1,36 @@
-__global__ void test_barrier_cluster(void** fn_ptr)
-{
+__global__ void test_barrier_cluster(void ** fn_ptr) {
 #if __cccl_ptx_isa >= 780
-  NV_IF_TARGET(NV_PROVIDES_SM_90,
-               (
-                 // barrier.cluster.arrive;
-                 * fn_ptr++ = reinterpret_cast<void*>(static_cast<void (*)()>(cuda::ptx::barrier_cluster_arrive));));
+  NV_IF_TARGET(NV_PROVIDES_SM_90, (
+    // barrier.cluster.arrive;
+    *fn_ptr++ = reinterpret_cast<void*>(static_cast<void (*)()>(cuda::ptx::barrier_cluster_arrive));
+  ));
 #endif // __cccl_ptx_isa >= 780
 
 #if __cccl_ptx_isa >= 780
-  NV_IF_TARGET(NV_PROVIDES_SM_90,
-               (
-                 // barrier.cluster.wait;
-                 * fn_ptr++ = reinterpret_cast<void*>(static_cast<void (*)()>(cuda::ptx::barrier_cluster_wait));));
+  NV_IF_TARGET(NV_PROVIDES_SM_90, (
+    // barrier.cluster.wait;
+    *fn_ptr++ = reinterpret_cast<void*>(static_cast<void (*)()>(cuda::ptx::barrier_cluster_wait));
+  ));
 #endif // __cccl_ptx_isa >= 780
 
 #if __cccl_ptx_isa >= 800
-  NV_IF_TARGET(NV_PROVIDES_SM_90,
-               (
-                 // barrier.cluster.arrive.release;
-                 * fn_ptr++ = reinterpret_cast<void*>(
-                   static_cast<void (*)(cuda::ptx::sem_release_t)>(cuda::ptx::barrier_cluster_arrive));));
+  NV_IF_TARGET(NV_PROVIDES_SM_90, (
+    // barrier.cluster.arrive.release;
+    *fn_ptr++ = reinterpret_cast<void*>(static_cast<void (*)(cuda::ptx::sem_release_t)>(cuda::ptx::barrier_cluster_arrive));
+  ));
 #endif // __cccl_ptx_isa >= 800
 
 #if __cccl_ptx_isa >= 800
-  NV_IF_TARGET(NV_PROVIDES_SM_90,
-               (
-                 // barrier.cluster.arrive.relaxed;
-                 * fn_ptr++ = reinterpret_cast<void*>(
-                   static_cast<void (*)(cuda::ptx::sem_relaxed_t)>(cuda::ptx::barrier_cluster_arrive));));
+  NV_IF_TARGET(NV_PROVIDES_SM_90, (
+    // barrier.cluster.arrive.relaxed;
+    *fn_ptr++ = reinterpret_cast<void*>(static_cast<void (*)(cuda::ptx::sem_relaxed_t)>(cuda::ptx::barrier_cluster_arrive));
+  ));
 #endif // __cccl_ptx_isa >= 800
 
 #if __cccl_ptx_isa >= 800
-  NV_IF_TARGET(NV_PROVIDES_SM_90,
-               (
-                 // barrier.cluster.wait.acquire;
-                 * fn_ptr++ = reinterpret_cast<void*>(
-                   static_cast<void (*)(cuda::ptx::sem_acquire_t)>(cuda::ptx::barrier_cluster_wait));));
+  NV_IF_TARGET(NV_PROVIDES_SM_90, (
+    // barrier.cluster.wait.acquire;
+    *fn_ptr++ = reinterpret_cast<void*>(static_cast<void (*)(cuda::ptx::sem_acquire_t)>(cuda::ptx::barrier_cluster_wait));
+  ));
 #endif // __cccl_ptx_isa >= 800
 }
libcudacxx/test/libcudacxx/cuda/ptx/generated/cp_async_bulk.inc (41 changes: 13 additions & 28 deletions)
@@ -1,37 +1,22 @@
-__global__ void test_cp_async_bulk(void** fn_ptr)
-{
+__global__ void test_cp_async_bulk(void ** fn_ptr) {
 #if __cccl_ptx_isa >= 800
-  NV_IF_TARGET(
-    NV_PROVIDES_SM_90,
-    (
-      // cp.async.bulk.shared::cluster.global.mbarrier::complete_tx::bytes [dstMem], [srcMem], size, [smem_bar]; //
-      // 1a. unicast
-      * fn_ptr++ = reinterpret_cast<void*>(
-        static_cast<void (*)(
-          cuda::ptx::space_cluster_t, cuda::ptx::space_global_t, void*, const void*, const uint32_t&, uint64_t*)>(
-          cuda::ptx::cp_async_bulk));));
+  NV_IF_TARGET(NV_PROVIDES_SM_90, (
+    // cp.async.bulk.shared::cluster.global.mbarrier::complete_tx::bytes [dstMem], [srcMem], size, [smem_bar]; // 1a. unicast
+    *fn_ptr++ = reinterpret_cast<void*>(static_cast<void (*)(cuda::ptx::space_cluster_t, cuda::ptx::space_global_t, void* , const void* , const uint32_t& , uint64_t* )>(cuda::ptx::cp_async_bulk));
+  ));
 #endif // __cccl_ptx_isa >= 800
 
 #if __cccl_ptx_isa >= 800
-  NV_IF_TARGET(
-    NV_PROVIDES_SM_90,
-    (
-      // cp.async.bulk.shared::cluster.shared::cta.mbarrier::complete_tx::bytes [dstMem], [srcMem], size,
-      // [rdsmem_bar]; // 2.
-      * fn_ptr++ = reinterpret_cast<void*>(
-        static_cast<void (*)(
-          cuda::ptx::space_cluster_t, cuda::ptx::space_shared_t, void*, const void*, const uint32_t&, uint64_t*)>(
-          cuda::ptx::cp_async_bulk));));
+  NV_IF_TARGET(NV_PROVIDES_SM_90, (
+    // cp.async.bulk.shared::cluster.shared::cta.mbarrier::complete_tx::bytes [dstMem], [srcMem], size, [rdsmem_bar]; // 2.
+    *fn_ptr++ = reinterpret_cast<void*>(static_cast<void (*)(cuda::ptx::space_cluster_t, cuda::ptx::space_shared_t, void* , const void* , const uint32_t& , uint64_t* )>(cuda::ptx::cp_async_bulk));
+  ));
 #endif // __cccl_ptx_isa >= 800
 
 #if __cccl_ptx_isa >= 800
-  NV_IF_TARGET(
-    NV_PROVIDES_SM_90,
-    (
-      // cp.async.bulk.global.shared::cta.bulk_group [dstMem], [srcMem], size; // 3.
-      * fn_ptr++ = reinterpret_cast<void*>(
-        static_cast<void (*)(
-          cuda::ptx::space_global_t, cuda::ptx::space_shared_t, void*, const void*, const uint32_t&)>(
-          cuda::ptx::cp_async_bulk));));
+  NV_IF_TARGET(NV_PROVIDES_SM_90, (
+    // cp.async.bulk.global.shared::cta.bulk_group [dstMem], [srcMem], size; // 3.
+    *fn_ptr++ = reinterpret_cast<void*>(static_cast<void (*)(cuda::ptx::space_global_t, cuda::ptx::space_shared_t, void* , const void* , const uint32_t& )>(cuda::ptx::cp_async_bulk));
+  ));
 #endif // __cccl_ptx_isa >= 800
 }
@@ -1,10 +1,8 @@
-__global__ void test_cp_async_bulk_commit_group(void** fn_ptr)
-{
+__global__ void test_cp_async_bulk_commit_group(void ** fn_ptr) {
 #if __cccl_ptx_isa >= 800
-  NV_IF_TARGET(
-    NV_PROVIDES_SM_90,
-    (
-      // cp.async.bulk.commit_group;
-      * fn_ptr++ = reinterpret_cast<void*>(static_cast<void (*)()>(cuda::ptx::cp_async_bulk_commit_group));));
+  NV_IF_TARGET(NV_PROVIDES_SM_90, (
+    // cp.async.bulk.commit_group;
+    *fn_ptr++ = reinterpret_cast<void*>(static_cast<void (*)()>(cuda::ptx::cp_async_bulk_commit_group));
+  ));
 #endif // __cccl_ptx_isa >= 800
 }
@@ -1,18 +1,8 @@
-__global__ void test_cp_async_bulk_multicast(void** fn_ptr)
-{
+__global__ void test_cp_async_bulk_multicast(void ** fn_ptr) {
 #if __cccl_ptx_isa >= 800
-  NV_IF_TARGET(
-    NV_HAS_FEATURE_SM_90a,
-    (
-      // cp.async.bulk.shared::cluster.global.mbarrier::complete_tx::bytes.multicast::cluster [dstMem], [srcMem],
-      // size, [smem_bar], ctaMask; // 1.
-      * fn_ptr++ = reinterpret_cast<void*>(
-        static_cast<void (*)(cuda::ptx::space_cluster_t,
-                             cuda::ptx::space_global_t,
-                             void*,
-                             const void*,
-                             const uint32_t&,
-                             uint64_t*,
-                             const uint16_t&)>(cuda::ptx::cp_async_bulk));));
+  NV_IF_TARGET(NV_HAS_FEATURE_SM_90a, (
+    // cp.async.bulk.shared::cluster.global.mbarrier::complete_tx::bytes.multicast::cluster [dstMem], [srcMem], size, [smem_bar], ctaMask; // 1.
+    *fn_ptr++ = reinterpret_cast<void*>(static_cast<void (*)(cuda::ptx::space_cluster_t, cuda::ptx::space_global_t, void* , const void* , const uint32_t& , uint64_t* , const uint16_t& )>(cuda::ptx::cp_async_bulk));
+  ));
 #endif // __cccl_ptx_isa >= 800
 }
libcudacxx/test/libcudacxx/cuda/ptx/generated/cp_async_bulk_tensor.inc (128 changes: 41 additions & 87 deletions)
@@ -1,117 +1,71 @@
-__global__ void test_cp_async_bulk_tensor(void** fn_ptr)
-{
+__global__ void test_cp_async_bulk_tensor(void ** fn_ptr) {
 #if __cccl_ptx_isa >= 800
-  NV_IF_TARGET(
-    NV_PROVIDES_SM_90,
-    (
-      // cp.async.bulk.tensor.1d.shared::cluster.global.tile.mbarrier::complete_tx::bytes [dstMem], [tensorMap,
-      // tensorCoords], [smem_bar];// 1a.
-      * fn_ptr++ = reinterpret_cast<void*>(
-        static_cast<void (*)(
-          cuda::ptx::space_cluster_t, cuda::ptx::space_global_t, void*, const void*, const int32_t(&)[1], uint64_t*)>(
-          cuda::ptx::cp_async_bulk_tensor));));
+  NV_IF_TARGET(NV_PROVIDES_SM_90, (
+    // cp.async.bulk.tensor.1d.shared::cluster.global.tile.mbarrier::complete_tx::bytes [dstMem], [tensorMap, tensorCoords], [smem_bar];// 1a.
+    *fn_ptr++ = reinterpret_cast<void*>(static_cast<void (*)(cuda::ptx::space_cluster_t, cuda::ptx::space_global_t, void* , const void* , const int32_t (&)[1], uint64_t* )>(cuda::ptx::cp_async_bulk_tensor));
+  ));
 #endif // __cccl_ptx_isa >= 800
 
 #if __cccl_ptx_isa >= 800
-  NV_IF_TARGET(
-    NV_PROVIDES_SM_90,
-    (
-      // cp.async.bulk.tensor.1d.global.shared::cta.tile.bulk_group [tensorMap, tensorCoords], [srcMem]; // 3a.
-      * fn_ptr++ = reinterpret_cast<void*>(
-        static_cast<void (*)(
-          cuda::ptx::space_global_t, cuda::ptx::space_shared_t, const void*, const int32_t(&)[1], const void*)>(
-          cuda::ptx::cp_async_bulk_tensor));));
+  NV_IF_TARGET(NV_PROVIDES_SM_90, (
+    // cp.async.bulk.tensor.1d.global.shared::cta.tile.bulk_group [tensorMap, tensorCoords], [srcMem]; // 3a.
+    *fn_ptr++ = reinterpret_cast<void*>(static_cast<void (*)(cuda::ptx::space_global_t, cuda::ptx::space_shared_t, const void* , const int32_t (&)[1], const void* )>(cuda::ptx::cp_async_bulk_tensor));
+  ));
 #endif // __cccl_ptx_isa >= 800
 
 #if __cccl_ptx_isa >= 800
-  NV_IF_TARGET(
-    NV_PROVIDES_SM_90,
-    (
-      // cp.async.bulk.tensor.2d.shared::cluster.global.tile.mbarrier::complete_tx::bytes [dstMem], [tensorMap,
-      // tensorCoords], [smem_bar];// 1b.
-      * fn_ptr++ = reinterpret_cast<void*>(
-        static_cast<void (*)(
-          cuda::ptx::space_cluster_t, cuda::ptx::space_global_t, void*, const void*, const int32_t(&)[2], uint64_t*)>(
-          cuda::ptx::cp_async_bulk_tensor));));
+  NV_IF_TARGET(NV_PROVIDES_SM_90, (
+    // cp.async.bulk.tensor.2d.shared::cluster.global.tile.mbarrier::complete_tx::bytes [dstMem], [tensorMap, tensorCoords], [smem_bar];// 1b.
+    *fn_ptr++ = reinterpret_cast<void*>(static_cast<void (*)(cuda::ptx::space_cluster_t, cuda::ptx::space_global_t, void* , const void* , const int32_t (&)[2], uint64_t* )>(cuda::ptx::cp_async_bulk_tensor));
+  ));
 #endif // __cccl_ptx_isa >= 800
 
 #if __cccl_ptx_isa >= 800
-  NV_IF_TARGET(
-    NV_PROVIDES_SM_90,
-    (
-      // cp.async.bulk.tensor.2d.global.shared::cta.tile.bulk_group [tensorMap, tensorCoords], [srcMem]; // 3b.
-      * fn_ptr++ = reinterpret_cast<void*>(
-        static_cast<void (*)(
-          cuda::ptx::space_global_t, cuda::ptx::space_shared_t, const void*, const int32_t(&)[2], const void*)>(
-          cuda::ptx::cp_async_bulk_tensor));));
+  NV_IF_TARGET(NV_PROVIDES_SM_90, (
+    // cp.async.bulk.tensor.2d.global.shared::cta.tile.bulk_group [tensorMap, tensorCoords], [srcMem]; // 3b.
+    *fn_ptr++ = reinterpret_cast<void*>(static_cast<void (*)(cuda::ptx::space_global_t, cuda::ptx::space_shared_t, const void* , const int32_t (&)[2], const void* )>(cuda::ptx::cp_async_bulk_tensor));
+  ));
 #endif // __cccl_ptx_isa >= 800
 
 #if __cccl_ptx_isa >= 800
-  NV_IF_TARGET(
-    NV_PROVIDES_SM_90,
-    (
-      // cp.async.bulk.tensor.3d.shared::cluster.global.tile.mbarrier::complete_tx::bytes [dstMem], [tensorMap,
-      // tensorCoords], [smem_bar];// 1c.
-      * fn_ptr++ = reinterpret_cast<void*>(
-        static_cast<void (*)(
-          cuda::ptx::space_cluster_t, cuda::ptx::space_global_t, void*, const void*, const int32_t(&)[3], uint64_t*)>(
-          cuda::ptx::cp_async_bulk_tensor));));
+  NV_IF_TARGET(NV_PROVIDES_SM_90, (
+    // cp.async.bulk.tensor.3d.shared::cluster.global.tile.mbarrier::complete_tx::bytes [dstMem], [tensorMap, tensorCoords], [smem_bar];// 1c.
+    *fn_ptr++ = reinterpret_cast<void*>(static_cast<void (*)(cuda::ptx::space_cluster_t, cuda::ptx::space_global_t, void* , const void* , const int32_t (&)[3], uint64_t* )>(cuda::ptx::cp_async_bulk_tensor));
+  ));
 #endif // __cccl_ptx_isa >= 800
 
 #if __cccl_ptx_isa >= 800
-  NV_IF_TARGET(
-    NV_PROVIDES_SM_90,
-    (
-      // cp.async.bulk.tensor.3d.global.shared::cta.tile.bulk_group [tensorMap, tensorCoords], [srcMem]; // 3c.
-      * fn_ptr++ = reinterpret_cast<void*>(
-        static_cast<void (*)(
-          cuda::ptx::space_global_t, cuda::ptx::space_shared_t, const void*, const int32_t(&)[3], const void*)>(
-          cuda::ptx::cp_async_bulk_tensor));));
+  NV_IF_TARGET(NV_PROVIDES_SM_90, (
+    // cp.async.bulk.tensor.3d.global.shared::cta.tile.bulk_group [tensorMap, tensorCoords], [srcMem]; // 3c.
+    *fn_ptr++ = reinterpret_cast<void*>(static_cast<void (*)(cuda::ptx::space_global_t, cuda::ptx::space_shared_t, const void* , const int32_t (&)[3], const void* )>(cuda::ptx::cp_async_bulk_tensor));
+  ));
 #endif // __cccl_ptx_isa >= 800
 
 #if __cccl_ptx_isa >= 800
-  NV_IF_TARGET(
-    NV_PROVIDES_SM_90,
-    (
-      // cp.async.bulk.tensor.4d.shared::cluster.global.tile.mbarrier::complete_tx::bytes [dstMem], [tensorMap,
-      // tensorCoords], [smem_bar];// 1d.
-      * fn_ptr++ = reinterpret_cast<void*>(
-        static_cast<void (*)(
-          cuda::ptx::space_cluster_t, cuda::ptx::space_global_t, void*, const void*, const int32_t(&)[4], uint64_t*)>(
-          cuda::ptx::cp_async_bulk_tensor));));
+  NV_IF_TARGET(NV_PROVIDES_SM_90, (
+    // cp.async.bulk.tensor.4d.shared::cluster.global.tile.mbarrier::complete_tx::bytes [dstMem], [tensorMap, tensorCoords], [smem_bar];// 1d.
+    *fn_ptr++ = reinterpret_cast<void*>(static_cast<void (*)(cuda::ptx::space_cluster_t, cuda::ptx::space_global_t, void* , const void* , const int32_t (&)[4], uint64_t* )>(cuda::ptx::cp_async_bulk_tensor));
+  ));
 #endif // __cccl_ptx_isa >= 800
 
 #if __cccl_ptx_isa >= 800
-  NV_IF_TARGET(
-    NV_PROVIDES_SM_90,
-    (
-      // cp.async.bulk.tensor.4d.global.shared::cta.tile.bulk_group [tensorMap, tensorCoords], [srcMem]; // 3d.
-      * fn_ptr++ = reinterpret_cast<void*>(
-        static_cast<void (*)(
-          cuda::ptx::space_global_t, cuda::ptx::space_shared_t, const void*, const int32_t(&)[4], const void*)>(
-          cuda::ptx::cp_async_bulk_tensor));));
+  NV_IF_TARGET(NV_PROVIDES_SM_90, (
+    // cp.async.bulk.tensor.4d.global.shared::cta.tile.bulk_group [tensorMap, tensorCoords], [srcMem]; // 3d.
+    *fn_ptr++ = reinterpret_cast<void*>(static_cast<void (*)(cuda::ptx::space_global_t, cuda::ptx::space_shared_t, const void* , const int32_t (&)[4], const void* )>(cuda::ptx::cp_async_bulk_tensor));
+  ));
 #endif // __cccl_ptx_isa >= 800
 
 #if __cccl_ptx_isa >= 800
-  NV_IF_TARGET(
-    NV_PROVIDES_SM_90,
-    (
-      // cp.async.bulk.tensor.5d.shared::cluster.global.tile.mbarrier::complete_tx::bytes [dstMem], [tensorMap,
-      // tensorCoords], [smem_bar];// 1e.
-      * fn_ptr++ = reinterpret_cast<void*>(
-        static_cast<void (*)(
-          cuda::ptx::space_cluster_t, cuda::ptx::space_global_t, void*, const void*, const int32_t(&)[5], uint64_t*)>(
-          cuda::ptx::cp_async_bulk_tensor));));
+  NV_IF_TARGET(NV_PROVIDES_SM_90, (
+    // cp.async.bulk.tensor.5d.shared::cluster.global.tile.mbarrier::complete_tx::bytes [dstMem], [tensorMap, tensorCoords], [smem_bar];// 1e.
+    *fn_ptr++ = reinterpret_cast<void*>(static_cast<void (*)(cuda::ptx::space_cluster_t, cuda::ptx::space_global_t, void* , const void* , const int32_t (&)[5], uint64_t* )>(cuda::ptx::cp_async_bulk_tensor));
+  ));
 #endif // __cccl_ptx_isa >= 800
 
 #if __cccl_ptx_isa >= 800
-  NV_IF_TARGET(
-    NV_PROVIDES_SM_90,
-    (
-      // cp.async.bulk.tensor.5d.global.shared::cta.tile.bulk_group [tensorMap, tensorCoords], [srcMem]; // 3e.
-      * fn_ptr++ = reinterpret_cast<void*>(
-        static_cast<void (*)(
-          cuda::ptx::space_global_t, cuda::ptx::space_shared_t, const void*, const int32_t(&)[5], const void*)>(
-          cuda::ptx::cp_async_bulk_tensor));));
+  NV_IF_TARGET(NV_PROVIDES_SM_90, (
+    // cp.async.bulk.tensor.5d.global.shared::cta.tile.bulk_group [tensorMap, tensorCoords], [srcMem]; // 3e.
+    *fn_ptr++ = reinterpret_cast<void*>(static_cast<void (*)(cuda::ptx::space_global_t, cuda::ptx::space_shared_t, const void* , const int32_t (&)[5], const void* )>(cuda::ptx::cp_async_bulk_tensor));
+  ));
 #endif // __cccl_ptx_isa >= 800
 }