forked from NVIDIA/cccl
-
Notifications
You must be signed in to change notification settings - Fork 0
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
Reorganize PTX tests to match generator (NVIDIA#2930)
- Loading branch information
1 parent
d0f5bd2
commit 5b57a4c
Showing
49 changed files
with
2,427 additions
and
2,407 deletions.
There are no files selected for viewing
40 changes: 40 additions & 0 deletions
40
libcudacxx/test/libcudacxx/cuda/ptx/generated/barrier_cluster.inc
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,40 @@ | ||
__global__ void test_barrier_cluster(void** fn_ptr) | ||
{ | ||
#if __cccl_ptx_isa >= 780 | ||
NV_IF_TARGET(NV_PROVIDES_SM_90, | ||
( | ||
// barrier.cluster.arrive; | ||
* fn_ptr++ = reinterpret_cast<void*>(static_cast<void (*)()>(cuda::ptx::barrier_cluster_arrive));)); | ||
#endif // __cccl_ptx_isa >= 780 | ||
|
||
#if __cccl_ptx_isa >= 780 | ||
NV_IF_TARGET(NV_PROVIDES_SM_90, | ||
( | ||
// barrier.cluster.wait; | ||
* fn_ptr++ = reinterpret_cast<void*>(static_cast<void (*)()>(cuda::ptx::barrier_cluster_wait));)); | ||
#endif // __cccl_ptx_isa >= 780 | ||
|
||
#if __cccl_ptx_isa >= 800 | ||
NV_IF_TARGET(NV_PROVIDES_SM_90, | ||
( | ||
// barrier.cluster.arrive.release; | ||
* fn_ptr++ = reinterpret_cast<void*>( | ||
static_cast<void (*)(cuda::ptx::sem_release_t)>(cuda::ptx::barrier_cluster_arrive));)); | ||
#endif // __cccl_ptx_isa >= 800 | ||
|
||
#if __cccl_ptx_isa >= 800 | ||
NV_IF_TARGET(NV_PROVIDES_SM_90, | ||
( | ||
// barrier.cluster.arrive.relaxed; | ||
* fn_ptr++ = reinterpret_cast<void*>( | ||
static_cast<void (*)(cuda::ptx::sem_relaxed_t)>(cuda::ptx::barrier_cluster_arrive));)); | ||
#endif // __cccl_ptx_isa >= 800 | ||
|
||
#if __cccl_ptx_isa >= 800 | ||
NV_IF_TARGET(NV_PROVIDES_SM_90, | ||
( | ||
// barrier.cluster.wait.acquire; | ||
* fn_ptr++ = reinterpret_cast<void*>( | ||
static_cast<void (*)(cuda::ptx::sem_acquire_t)>(cuda::ptx::barrier_cluster_wait));)); | ||
#endif // __cccl_ptx_isa >= 800 | ||
} |
37 changes: 37 additions & 0 deletions
37
libcudacxx/test/libcudacxx/cuda/ptx/generated/cp_async_bulk.inc
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,37 @@ | ||
__global__ void test_cp_async_bulk(void** fn_ptr) | ||
{ | ||
#if __cccl_ptx_isa >= 800 | ||
NV_IF_TARGET( | ||
NV_PROVIDES_SM_90, | ||
( | ||
// cp.async.bulk.shared::cluster.global.mbarrier::complete_tx::bytes [dstMem], [srcMem], size, [smem_bar]; // | ||
// 1a. unicast | ||
* fn_ptr++ = reinterpret_cast<void*>( | ||
static_cast<void (*)( | ||
cuda::ptx::space_cluster_t, cuda::ptx::space_global_t, void*, const void*, const uint32_t&, uint64_t*)>( | ||
cuda::ptx::cp_async_bulk));)); | ||
#endif // __cccl_ptx_isa >= 800 | ||
|
||
#if __cccl_ptx_isa >= 800 | ||
NV_IF_TARGET( | ||
NV_PROVIDES_SM_90, | ||
( | ||
// cp.async.bulk.shared::cluster.shared::cta.mbarrier::complete_tx::bytes [dstMem], [srcMem], size, | ||
// [rdsmem_bar]; // 2. | ||
* fn_ptr++ = reinterpret_cast<void*>( | ||
static_cast<void (*)( | ||
cuda::ptx::space_cluster_t, cuda::ptx::space_shared_t, void*, const void*, const uint32_t&, uint64_t*)>( | ||
cuda::ptx::cp_async_bulk));)); | ||
#endif // __cccl_ptx_isa >= 800 | ||
|
||
#if __cccl_ptx_isa >= 800 | ||
NV_IF_TARGET( | ||
NV_PROVIDES_SM_90, | ||
( | ||
// cp.async.bulk.global.shared::cta.bulk_group [dstMem], [srcMem], size; // 3. | ||
* fn_ptr++ = reinterpret_cast<void*>( | ||
static_cast<void (*)( | ||
cuda::ptx::space_global_t, cuda::ptx::space_shared_t, void*, const void*, const uint32_t&)>( | ||
cuda::ptx::cp_async_bulk));)); | ||
#endif // __cccl_ptx_isa >= 800 | ||
} |
10 changes: 10 additions & 0 deletions
10
libcudacxx/test/libcudacxx/cuda/ptx/generated/cp_async_bulk_commit_group.inc
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,10 @@ | ||
__global__ void test_cp_async_bulk_commit_group(void** fn_ptr) | ||
{ | ||
#if __cccl_ptx_isa >= 800 | ||
NV_IF_TARGET( | ||
NV_PROVIDES_SM_90, | ||
( | ||
// cp.async.bulk.commit_group; | ||
* fn_ptr++ = reinterpret_cast<void*>(static_cast<void (*)()>(cuda::ptx::cp_async_bulk_commit_group));)); | ||
#endif // __cccl_ptx_isa >= 800 | ||
} |
18 changes: 18 additions & 0 deletions
18
libcudacxx/test/libcudacxx/cuda/ptx/generated/cp_async_bulk_multicast.inc
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,18 @@ | ||
__global__ void test_cp_async_bulk_multicast(void** fn_ptr) | ||
{ | ||
#if __cccl_ptx_isa >= 800 | ||
NV_IF_TARGET( | ||
NV_HAS_FEATURE_SM_90a, | ||
( | ||
// cp.async.bulk.shared::cluster.global.mbarrier::complete_tx::bytes.multicast::cluster [dstMem], [srcMem], | ||
// size, [smem_bar], ctaMask; // 1. | ||
* fn_ptr++ = reinterpret_cast<void*>( | ||
static_cast<void (*)(cuda::ptx::space_cluster_t, | ||
cuda::ptx::space_global_t, | ||
void*, | ||
const void*, | ||
const uint32_t&, | ||
uint64_t*, | ||
const uint16_t&)>(cuda::ptx::cp_async_bulk));)); | ||
#endif // __cccl_ptx_isa >= 800 | ||
} |
117 changes: 117 additions & 0 deletions
117
libcudacxx/test/libcudacxx/cuda/ptx/generated/cp_async_bulk_tensor.inc
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,117 @@ | ||
__global__ void test_cp_async_bulk_tensor(void** fn_ptr) | ||
{ | ||
#if __cccl_ptx_isa >= 800 | ||
NV_IF_TARGET( | ||
NV_PROVIDES_SM_90, | ||
( | ||
// cp.async.bulk.tensor.1d.shared::cluster.global.tile.mbarrier::complete_tx::bytes [dstMem], [tensorMap, | ||
// tensorCoords], [smem_bar];// 1a. | ||
* fn_ptr++ = reinterpret_cast<void*>( | ||
static_cast<void (*)( | ||
cuda::ptx::space_cluster_t, cuda::ptx::space_global_t, void*, const void*, const int32_t(&)[1], uint64_t*)>( | ||
cuda::ptx::cp_async_bulk_tensor));)); | ||
#endif // __cccl_ptx_isa >= 800 | ||
|
||
#if __cccl_ptx_isa >= 800 | ||
NV_IF_TARGET( | ||
NV_PROVIDES_SM_90, | ||
( | ||
// cp.async.bulk.tensor.1d.global.shared::cta.tile.bulk_group [tensorMap, tensorCoords], [srcMem]; // 3a. | ||
* fn_ptr++ = reinterpret_cast<void*>( | ||
static_cast<void (*)( | ||
cuda::ptx::space_global_t, cuda::ptx::space_shared_t, const void*, const int32_t(&)[1], const void*)>( | ||
cuda::ptx::cp_async_bulk_tensor));)); | ||
#endif // __cccl_ptx_isa >= 800 | ||
|
||
#if __cccl_ptx_isa >= 800 | ||
NV_IF_TARGET( | ||
NV_PROVIDES_SM_90, | ||
( | ||
// cp.async.bulk.tensor.2d.shared::cluster.global.tile.mbarrier::complete_tx::bytes [dstMem], [tensorMap, | ||
// tensorCoords], [smem_bar];// 1b. | ||
* fn_ptr++ = reinterpret_cast<void*>( | ||
static_cast<void (*)( | ||
cuda::ptx::space_cluster_t, cuda::ptx::space_global_t, void*, const void*, const int32_t(&)[2], uint64_t*)>( | ||
cuda::ptx::cp_async_bulk_tensor));)); | ||
#endif // __cccl_ptx_isa >= 800 | ||
|
||
#if __cccl_ptx_isa >= 800 | ||
NV_IF_TARGET( | ||
NV_PROVIDES_SM_90, | ||
( | ||
// cp.async.bulk.tensor.2d.global.shared::cta.tile.bulk_group [tensorMap, tensorCoords], [srcMem]; // 3b. | ||
* fn_ptr++ = reinterpret_cast<void*>( | ||
static_cast<void (*)( | ||
cuda::ptx::space_global_t, cuda::ptx::space_shared_t, const void*, const int32_t(&)[2], const void*)>( | ||
cuda::ptx::cp_async_bulk_tensor));)); | ||
#endif // __cccl_ptx_isa >= 800 | ||
|
||
#if __cccl_ptx_isa >= 800 | ||
NV_IF_TARGET( | ||
NV_PROVIDES_SM_90, | ||
( | ||
// cp.async.bulk.tensor.3d.shared::cluster.global.tile.mbarrier::complete_tx::bytes [dstMem], [tensorMap, | ||
// tensorCoords], [smem_bar];// 1c. | ||
* fn_ptr++ = reinterpret_cast<void*>( | ||
static_cast<void (*)( | ||
cuda::ptx::space_cluster_t, cuda::ptx::space_global_t, void*, const void*, const int32_t(&)[3], uint64_t*)>( | ||
cuda::ptx::cp_async_bulk_tensor));)); | ||
#endif // __cccl_ptx_isa >= 800 | ||
|
||
#if __cccl_ptx_isa >= 800 | ||
NV_IF_TARGET( | ||
NV_PROVIDES_SM_90, | ||
( | ||
// cp.async.bulk.tensor.3d.global.shared::cta.tile.bulk_group [tensorMap, tensorCoords], [srcMem]; // 3c. | ||
* fn_ptr++ = reinterpret_cast<void*>( | ||
static_cast<void (*)( | ||
cuda::ptx::space_global_t, cuda::ptx::space_shared_t, const void*, const int32_t(&)[3], const void*)>( | ||
cuda::ptx::cp_async_bulk_tensor));)); | ||
#endif // __cccl_ptx_isa >= 800 | ||
|
||
#if __cccl_ptx_isa >= 800 | ||
NV_IF_TARGET( | ||
NV_PROVIDES_SM_90, | ||
( | ||
// cp.async.bulk.tensor.4d.shared::cluster.global.tile.mbarrier::complete_tx::bytes [dstMem], [tensorMap, | ||
// tensorCoords], [smem_bar];// 1d. | ||
* fn_ptr++ = reinterpret_cast<void*>( | ||
static_cast<void (*)( | ||
cuda::ptx::space_cluster_t, cuda::ptx::space_global_t, void*, const void*, const int32_t(&)[4], uint64_t*)>( | ||
cuda::ptx::cp_async_bulk_tensor));)); | ||
#endif // __cccl_ptx_isa >= 800 | ||
|
||
#if __cccl_ptx_isa >= 800 | ||
NV_IF_TARGET( | ||
NV_PROVIDES_SM_90, | ||
( | ||
// cp.async.bulk.tensor.4d.global.shared::cta.tile.bulk_group [tensorMap, tensorCoords], [srcMem]; // 3d. | ||
* fn_ptr++ = reinterpret_cast<void*>( | ||
static_cast<void (*)( | ||
cuda::ptx::space_global_t, cuda::ptx::space_shared_t, const void*, const int32_t(&)[4], const void*)>( | ||
cuda::ptx::cp_async_bulk_tensor));)); | ||
#endif // __cccl_ptx_isa >= 800 | ||
|
||
#if __cccl_ptx_isa >= 800 | ||
NV_IF_TARGET( | ||
NV_PROVIDES_SM_90, | ||
( | ||
// cp.async.bulk.tensor.5d.shared::cluster.global.tile.mbarrier::complete_tx::bytes [dstMem], [tensorMap, | ||
// tensorCoords], [smem_bar];// 1e. | ||
* fn_ptr++ = reinterpret_cast<void*>( | ||
static_cast<void (*)( | ||
cuda::ptx::space_cluster_t, cuda::ptx::space_global_t, void*, const void*, const int32_t(&)[5], uint64_t*)>( | ||
cuda::ptx::cp_async_bulk_tensor));)); | ||
#endif // __cccl_ptx_isa >= 800 | ||
|
||
#if __cccl_ptx_isa >= 800 | ||
NV_IF_TARGET( | ||
NV_PROVIDES_SM_90, | ||
( | ||
// cp.async.bulk.tensor.5d.global.shared::cta.tile.bulk_group [tensorMap, tensorCoords], [srcMem]; // 3e. | ||
* fn_ptr++ = reinterpret_cast<void*>( | ||
static_cast<void (*)( | ||
cuda::ptx::space_global_t, cuda::ptx::space_shared_t, const void*, const int32_t(&)[5], const void*)>( | ||
cuda::ptx::cp_async_bulk_tensor));)); | ||
#endif // __cccl_ptx_isa >= 800 | ||
} |
82 changes: 82 additions & 0 deletions
82
libcudacxx/test/libcudacxx/cuda/ptx/generated/cp_async_bulk_tensor_multicast.inc
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,82 @@ | ||
__global__ void test_cp_async_bulk_tensor_multicast(void** fn_ptr) | ||
{ | ||
#if __cccl_ptx_isa >= 800 | ||
NV_IF_TARGET( | ||
NV_HAS_FEATURE_SM_90a, | ||
( | ||
// cp.async.bulk.tensor.1d.shared::cluster.global.tile.mbarrier::complete_tx::bytes.multicast::cluster [dstMem], | ||
// [tensorMap, tensorCoords], [smem_bar], ctaMask; // 2a. | ||
* fn_ptr++ = reinterpret_cast<void*>( | ||
static_cast<void (*)(cuda::ptx::space_cluster_t, | ||
cuda::ptx::space_global_t, | ||
void*, | ||
const void*, | ||
const int32_t(&)[1], | ||
uint64_t*, | ||
const uint16_t&)>(cuda::ptx::cp_async_bulk_tensor));)); | ||
#endif // __cccl_ptx_isa >= 800 | ||
|
||
#if __cccl_ptx_isa >= 800 | ||
NV_IF_TARGET( | ||
NV_HAS_FEATURE_SM_90a, | ||
( | ||
// cp.async.bulk.tensor.2d.shared::cluster.global.tile.mbarrier::complete_tx::bytes.multicast::cluster [dstMem], | ||
// [tensorMap, tensorCoords], [smem_bar], ctaMask; // 2b. | ||
* fn_ptr++ = reinterpret_cast<void*>( | ||
static_cast<void (*)(cuda::ptx::space_cluster_t, | ||
cuda::ptx::space_global_t, | ||
void*, | ||
const void*, | ||
const int32_t(&)[2], | ||
uint64_t*, | ||
const uint16_t&)>(cuda::ptx::cp_async_bulk_tensor));)); | ||
#endif // __cccl_ptx_isa >= 800 | ||
|
||
#if __cccl_ptx_isa >= 800 | ||
NV_IF_TARGET( | ||
NV_HAS_FEATURE_SM_90a, | ||
( | ||
// cp.async.bulk.tensor.3d.shared::cluster.global.tile.mbarrier::complete_tx::bytes.multicast::cluster [dstMem], | ||
// [tensorMap, tensorCoords], [smem_bar], ctaMask; // 2c. | ||
* fn_ptr++ = reinterpret_cast<void*>( | ||
static_cast<void (*)(cuda::ptx::space_cluster_t, | ||
cuda::ptx::space_global_t, | ||
void*, | ||
const void*, | ||
const int32_t(&)[3], | ||
uint64_t*, | ||
const uint16_t&)>(cuda::ptx::cp_async_bulk_tensor));)); | ||
#endif // __cccl_ptx_isa >= 800 | ||
|
||
#if __cccl_ptx_isa >= 800 | ||
NV_IF_TARGET( | ||
NV_HAS_FEATURE_SM_90a, | ||
( | ||
// cp.async.bulk.tensor.4d.shared::cluster.global.tile.mbarrier::complete_tx::bytes.multicast::cluster [dstMem], | ||
// [tensorMap, tensorCoords], [smem_bar], ctaMask; // 2d. | ||
* fn_ptr++ = reinterpret_cast<void*>( | ||
static_cast<void (*)(cuda::ptx::space_cluster_t, | ||
cuda::ptx::space_global_t, | ||
void*, | ||
const void*, | ||
const int32_t(&)[4], | ||
uint64_t*, | ||
const uint16_t&)>(cuda::ptx::cp_async_bulk_tensor));)); | ||
#endif // __cccl_ptx_isa >= 800 | ||
|
||
#if __cccl_ptx_isa >= 800 | ||
NV_IF_TARGET( | ||
NV_HAS_FEATURE_SM_90a, | ||
( | ||
// cp.async.bulk.tensor.5d.shared::cluster.global.tile.mbarrier::complete_tx::bytes.multicast::cluster [dstMem], | ||
// [tensorMap, tensorCoords], [smem_bar], ctaMask; // 2e. | ||
* fn_ptr++ = reinterpret_cast<void*>( | ||
static_cast<void (*)(cuda::ptx::space_cluster_t, | ||
cuda::ptx::space_global_t, | ||
void*, | ||
const void*, | ||
const int32_t(&)[5], | ||
uint64_t*, | ||
const uint16_t&)>(cuda::ptx::cp_async_bulk_tensor));)); | ||
#endif // __cccl_ptx_isa >= 800 | ||
} |
18 changes: 18 additions & 0 deletions
18
libcudacxx/test/libcudacxx/cuda/ptx/generated/cp_async_bulk_wait_group.inc
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,18 @@ | ||
__global__ void test_cp_async_bulk_wait_group(void** fn_ptr) | ||
{ | ||
#if __cccl_ptx_isa >= 800 | ||
NV_IF_TARGET(NV_PROVIDES_SM_90, | ||
( | ||
// cp.async.bulk.wait_group N; | ||
* fn_ptr++ = reinterpret_cast<void*>( | ||
static_cast<void (*)(cuda::ptx::n32_t<128>)>(cuda::ptx::cp_async_bulk_wait_group));)); | ||
#endif // __cccl_ptx_isa >= 800 | ||
|
||
#if __cccl_ptx_isa >= 800 | ||
NV_IF_TARGET(NV_PROVIDES_SM_90, | ||
( | ||
// cp.async.bulk.wait_group.read N; | ||
* fn_ptr++ = reinterpret_cast<void*>( | ||
static_cast<void (*)(cuda::ptx::n32_t<128>)>(cuda::ptx::cp_async_bulk_wait_group_read));)); | ||
#endif // __cccl_ptx_isa >= 800 | ||
} |
Oops, something went wrong.