Overwrites all generated PTX tests and runs `pre-commit run --all-files`
1 parent 0b36a7d · commit 6ffc627
Showing 28 changed files with 829 additions and 1,781 deletions.
libcudacxx/test/libcudacxx/cuda/ptx/generated/barrier_cluster.inc
46 changes: 21 additions & 25 deletions
@@ -1,40 +1,36 @@
-__global__ void test_barrier_cluster(void** fn_ptr)
-{
+__global__ void test_barrier_cluster(void ** fn_ptr) {
 #if __cccl_ptx_isa >= 780
-  NV_IF_TARGET(NV_PROVIDES_SM_90,
-               (
-                 // barrier.cluster.arrive;
-                 * fn_ptr++ = reinterpret_cast<void*>(static_cast<void (*)()>(cuda::ptx::barrier_cluster_arrive));));
+  NV_IF_TARGET(NV_PROVIDES_SM_90, (
+    // barrier.cluster.arrive;
+      *fn_ptr++ = reinterpret_cast<void*>(static_cast<void (*)()>(cuda::ptx::barrier_cluster_arrive));
+  ));
 #endif // __cccl_ptx_isa >= 780

 #if __cccl_ptx_isa >= 780
-  NV_IF_TARGET(NV_PROVIDES_SM_90,
-               (
-                 // barrier.cluster.wait;
-                 * fn_ptr++ = reinterpret_cast<void*>(static_cast<void (*)()>(cuda::ptx::barrier_cluster_wait));));
+  NV_IF_TARGET(NV_PROVIDES_SM_90, (
+    // barrier.cluster.wait;
+      *fn_ptr++ = reinterpret_cast<void*>(static_cast<void (*)()>(cuda::ptx::barrier_cluster_wait));
+  ));
 #endif // __cccl_ptx_isa >= 780

 #if __cccl_ptx_isa >= 800
-  NV_IF_TARGET(NV_PROVIDES_SM_90,
-               (
-                 // barrier.cluster.arrive.release;
-                 * fn_ptr++ = reinterpret_cast<void*>(
-                   static_cast<void (*)(cuda::ptx::sem_release_t)>(cuda::ptx::barrier_cluster_arrive));));
+  NV_IF_TARGET(NV_PROVIDES_SM_90, (
+    // barrier.cluster.arrive.release;
+      *fn_ptr++ = reinterpret_cast<void*>(static_cast<void (*)(cuda::ptx::sem_release_t)>(cuda::ptx::barrier_cluster_arrive));
+  ));
 #endif // __cccl_ptx_isa >= 800

 #if __cccl_ptx_isa >= 800
-  NV_IF_TARGET(NV_PROVIDES_SM_90,
-               (
-                 // barrier.cluster.arrive.relaxed;
-                 * fn_ptr++ = reinterpret_cast<void*>(
-                   static_cast<void (*)(cuda::ptx::sem_relaxed_t)>(cuda::ptx::barrier_cluster_arrive));));
+  NV_IF_TARGET(NV_PROVIDES_SM_90, (
+    // barrier.cluster.arrive.relaxed;
+      *fn_ptr++ = reinterpret_cast<void*>(static_cast<void (*)(cuda::ptx::sem_relaxed_t)>(cuda::ptx::barrier_cluster_arrive));
+  ));
 #endif // __cccl_ptx_isa >= 800

 #if __cccl_ptx_isa >= 800
-  NV_IF_TARGET(NV_PROVIDES_SM_90,
-               (
-                 // barrier.cluster.wait.acquire;
-                 * fn_ptr++ = reinterpret_cast<void*>(
-                   static_cast<void (*)(cuda::ptx::sem_acquire_t)>(cuda::ptx::barrier_cluster_wait));));
+  NV_IF_TARGET(NV_PROVIDES_SM_90, (
+    // barrier.cluster.wait.acquire;
+      *fn_ptr++ = reinterpret_cast<void*>(static_cast<void (*)(cuda::ptx::sem_acquire_t)>(cuda::ptx::barrier_cluster_wait));
+  ));
 #endif // __cccl_ptx_isa >= 800
 }
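Every test in these generated files follows the same compile-only pattern: a `static_cast` to an exact function-pointer type selects one overload of a `cuda::ptx::*` wrapper, and storing that address through `fn_ptr` forces the wrapper to be instantiated and code-generated for the targeted architecture without ever executing it. A minimal sketch of the idiom, assuming `__cccl_ptx_isa` and the `NV_IF_TARGET` machinery come from the library headers as they do for the generated tests; the kernel name is illustrative and not part of the repository:

```cuda
#include <cuda/ptx>   // cuda::ptx wrappers for inline PTX instructions
#include <nv/target>  // NV_IF_TARGET / NV_PROVIDES_SM_90

// Illustrative kernel, not part of the repository: shows the idiom used by the
// generated tests above.
__global__ void sketch_overload_selection(void** fn_ptr)
{
#if __cccl_ptx_isa >= 800
  NV_IF_TARGET(NV_PROVIDES_SM_90,
               (
                 // The static_cast picks the sem_release_t overload out of the
                 // barrier_cluster_arrive overload set; taking its address forces
                 // that wrapper to be compiled for sm_90 without being executed.
                 *fn_ptr++ = reinterpret_cast<void*>(
                   static_cast<void (*)(cuda::ptx::sem_release_t)>(cuda::ptx::barrier_cluster_arrive));));
#endif // __cccl_ptx_isa >= 800
}
```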
libcudacxx/test/libcudacxx/cuda/ptx/generated/cp_async_bulk.inc
41 changes: 13 additions & 28 deletions
@@ -1,37 +1,22 @@
-__global__ void test_cp_async_bulk(void** fn_ptr)
-{
+__global__ void test_cp_async_bulk(void ** fn_ptr) {
 #if __cccl_ptx_isa >= 800
-  NV_IF_TARGET(
-    NV_PROVIDES_SM_90,
-    (
-      // cp.async.bulk.shared::cluster.global.mbarrier::complete_tx::bytes [dstMem], [srcMem], size, [smem_bar]; //
-      // 1a. unicast
-      * fn_ptr++ = reinterpret_cast<void*>(
-        static_cast<void (*)(
-          cuda::ptx::space_cluster_t, cuda::ptx::space_global_t, void*, const void*, const uint32_t&, uint64_t*)>(
-          cuda::ptx::cp_async_bulk));));
+  NV_IF_TARGET(NV_PROVIDES_SM_90, (
+    // cp.async.bulk.shared::cluster.global.mbarrier::complete_tx::bytes [dstMem], [srcMem], size, [smem_bar]; // 1a. unicast
+      *fn_ptr++ = reinterpret_cast<void*>(static_cast<void (*)(cuda::ptx::space_cluster_t, cuda::ptx::space_global_t, void* , const void* , const uint32_t& , uint64_t* )>(cuda::ptx::cp_async_bulk));
+  ));
 #endif // __cccl_ptx_isa >= 800

 #if __cccl_ptx_isa >= 800
-  NV_IF_TARGET(
-    NV_PROVIDES_SM_90,
-    (
-      // cp.async.bulk.shared::cluster.shared::cta.mbarrier::complete_tx::bytes [dstMem], [srcMem], size,
-      // [rdsmem_bar]; // 2.
-      * fn_ptr++ = reinterpret_cast<void*>(
-        static_cast<void (*)(
-          cuda::ptx::space_cluster_t, cuda::ptx::space_shared_t, void*, const void*, const uint32_t&, uint64_t*)>(
-          cuda::ptx::cp_async_bulk));));
+  NV_IF_TARGET(NV_PROVIDES_SM_90, (
+    // cp.async.bulk.shared::cluster.shared::cta.mbarrier::complete_tx::bytes [dstMem], [srcMem], size, [rdsmem_bar]; // 2.
+      *fn_ptr++ = reinterpret_cast<void*>(static_cast<void (*)(cuda::ptx::space_cluster_t, cuda::ptx::space_shared_t, void* , const void* , const uint32_t& , uint64_t* )>(cuda::ptx::cp_async_bulk));
+  ));
 #endif // __cccl_ptx_isa >= 800

 #if __cccl_ptx_isa >= 800
-  NV_IF_TARGET(
-    NV_PROVIDES_SM_90,
-    (
-      // cp.async.bulk.global.shared::cta.bulk_group [dstMem], [srcMem], size; // 3.
-      * fn_ptr++ = reinterpret_cast<void*>(
-        static_cast<void (*)(
-          cuda::ptx::space_global_t, cuda::ptx::space_shared_t, void*, const void*, const uint32_t&)>(
-          cuda::ptx::cp_async_bulk));));
+  NV_IF_TARGET(NV_PROVIDES_SM_90, (
+    // cp.async.bulk.global.shared::cta.bulk_group [dstMem], [srcMem], size; // 3.
+      *fn_ptr++ = reinterpret_cast<void*>(static_cast<void (*)(cuda::ptx::space_global_t, cuda::ptx::space_shared_t, void* , const void* , const uint32_t& )>(cuda::ptx::cp_async_bulk));
+  ));
 #endif // __cccl_ptx_isa >= 800
 }
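For context, the first overload exercised above is the cluster-scope bulk copy from global memory into shared::cluster memory that completes on an mbarrier. Below is a hedged usage sketch, not taken from the diff: the tag values `cuda::ptx::space_cluster` / `cuda::ptx::space_global`, the helper name, and the barrier handling are assumptions for illustration.

```cuda
#include <cuda/ptx>
#include <nv/target>
#include <cstdint>

// Hypothetical helper (not from the diff): issue one bulk copy of `size` bytes
// from global memory into cluster-shared memory, signalling completion on a
// shared-memory mbarrier. PTX requires the size to be a multiple of 16 bytes.
__device__ void sketch_bulk_copy(void* dst_smem, const void* src_gmem,
                                 std::uint32_t size, std::uint64_t* smem_bar)
{
#if __cccl_ptx_isa >= 800
  NV_IF_TARGET(NV_PROVIDES_SM_90,
               (
                 // cp.async.bulk.shared::cluster.global.mbarrier::complete_tx::bytes
                 cuda::ptx::cp_async_bulk(cuda::ptx::space_cluster, cuda::ptx::space_global,
                                          dst_smem, src_gmem, size, smem_bar);
               ));
#endif // __cccl_ptx_isa >= 800
}
```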
libcudacxx/test/libcudacxx/cuda/ptx/generated/cp_async_bulk_commit_group.inc
12 changes: 5 additions & 7 deletions
@@ -1,10 +1,8 @@
-__global__ void test_cp_async_bulk_commit_group(void** fn_ptr)
-{
+__global__ void test_cp_async_bulk_commit_group(void ** fn_ptr) {
 #if __cccl_ptx_isa >= 800
-  NV_IF_TARGET(
-    NV_PROVIDES_SM_90,
-    (
-      // cp.async.bulk.commit_group;
-      * fn_ptr++ = reinterpret_cast<void*>(static_cast<void (*)()>(cuda::ptx::cp_async_bulk_commit_group));));
+  NV_IF_TARGET(NV_PROVIDES_SM_90, (
+    // cp.async.bulk.commit_group;
+      *fn_ptr++ = reinterpret_cast<void*>(static_cast<void (*)()>(cuda::ptx::cp_async_bulk_commit_group));
+  ));
 #endif // __cccl_ptx_isa >= 800
 }
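The `commit_group` test pairs naturally with the shared-to-global `bulk_group` copy from the previous file: in PTX, `cp.async.bulk.commit_group` batches all previously issued, uncommitted bulk copies into one bulk async-group that can later be waited on. A minimal sketch under that reading; the helper name and tag values are assumptions, and the matching wait is only mentioned in a comment because it is not exercised in this diff.

```cuda
#include <cuda/ptx>
#include <nv/target>
#include <cstdint>

// Hypothetical helper (not from the diff): write a tile from CTA-shared memory
// back to global memory with the bulk_group variant, then commit the copy into
// a bulk async-group so it can be waited on later.
__device__ void sketch_bulk_writeback(void* dst_gmem, const void* src_smem,
                                      std::uint32_t size)
{
#if __cccl_ptx_isa >= 800
  NV_IF_TARGET(NV_PROVIDES_SM_90,
               (
                 // cp.async.bulk.global.shared::cta.bulk_group [dstMem], [srcMem], size;
                 cuda::ptx::cp_async_bulk(cuda::ptx::space_global, cuda::ptx::space_shared,
                                          dst_gmem, src_smem, size);
                 // cp.async.bulk.commit_group;
                 cuda::ptx::cp_async_bulk_commit_group();
                 // A matching wait on the bulk async-group would follow before the
                 // shared-memory source is reused (not exercised in this diff).
               ));
#endif // __cccl_ptx_isa >= 800
}
```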
libcudacxx/test/libcudacxx/cuda/ptx/generated/cp_async_bulk_multicast.inc
20 changes: 5 additions & 15 deletions
@@ -1,18 +1,8 @@
-__global__ void test_cp_async_bulk_multicast(void** fn_ptr)
-{
+__global__ void test_cp_async_bulk_multicast(void ** fn_ptr) {
 #if __cccl_ptx_isa >= 800
-  NV_IF_TARGET(
-    NV_HAS_FEATURE_SM_90a,
-    (
-      // cp.async.bulk.shared::cluster.global.mbarrier::complete_tx::bytes.multicast::cluster [dstMem], [srcMem],
-      // size, [smem_bar], ctaMask; // 1.
-      * fn_ptr++ = reinterpret_cast<void*>(
-        static_cast<void (*)(cuda::ptx::space_cluster_t,
-                             cuda::ptx::space_global_t,
-                             void*,
-                             const void*,
-                             const uint32_t&,
-                             uint64_t*,
-                             const uint16_t&)>(cuda::ptx::cp_async_bulk));));
+  NV_IF_TARGET(NV_HAS_FEATURE_SM_90a, (
+    // cp.async.bulk.shared::cluster.global.mbarrier::complete_tx::bytes.multicast::cluster [dstMem], [srcMem], size, [smem_bar], ctaMask; // 1.
+      *fn_ptr++ = reinterpret_cast<void*>(static_cast<void (*)(cuda::ptx::space_cluster_t, cuda::ptx::space_global_t, void* , const void* , const uint32_t& , uint64_t* , const uint16_t& )>(cuda::ptx::cp_async_bulk));
+  ));
 #endif // __cccl_ptx_isa >= 800
 }
libcudacxx/test/libcudacxx/cuda/ptx/generated/cp_async_bulk_tensor.inc
128 changes: 41 additions & 87 deletions
@@ -1,117 +1,71 @@
-__global__ void test_cp_async_bulk_tensor(void** fn_ptr)
-{
+__global__ void test_cp_async_bulk_tensor(void ** fn_ptr) {
 #if __cccl_ptx_isa >= 800
-  NV_IF_TARGET(
-    NV_PROVIDES_SM_90,
-    (
-      // cp.async.bulk.tensor.1d.shared::cluster.global.tile.mbarrier::complete_tx::bytes [dstMem], [tensorMap,
-      // tensorCoords], [smem_bar];// 1a.
-      * fn_ptr++ = reinterpret_cast<void*>(
-        static_cast<void (*)(
-          cuda::ptx::space_cluster_t, cuda::ptx::space_global_t, void*, const void*, const int32_t(&)[1], uint64_t*)>(
-          cuda::ptx::cp_async_bulk_tensor));));
+  NV_IF_TARGET(NV_PROVIDES_SM_90, (
+    // cp.async.bulk.tensor.1d.shared::cluster.global.tile.mbarrier::complete_tx::bytes [dstMem], [tensorMap, tensorCoords], [smem_bar];// 1a.
+      *fn_ptr++ = reinterpret_cast<void*>(static_cast<void (*)(cuda::ptx::space_cluster_t, cuda::ptx::space_global_t, void* , const void* , const int32_t (&)[1], uint64_t* )>(cuda::ptx::cp_async_bulk_tensor));
+  ));
 #endif // __cccl_ptx_isa >= 800

 #if __cccl_ptx_isa >= 800
-  NV_IF_TARGET(
-    NV_PROVIDES_SM_90,
-    (
-      // cp.async.bulk.tensor.1d.global.shared::cta.tile.bulk_group [tensorMap, tensorCoords], [srcMem]; // 3a.
-      * fn_ptr++ = reinterpret_cast<void*>(
-        static_cast<void (*)(
-          cuda::ptx::space_global_t, cuda::ptx::space_shared_t, const void*, const int32_t(&)[1], const void*)>(
-          cuda::ptx::cp_async_bulk_tensor));));
+  NV_IF_TARGET(NV_PROVIDES_SM_90, (
+    // cp.async.bulk.tensor.1d.global.shared::cta.tile.bulk_group [tensorMap, tensorCoords], [srcMem]; // 3a.
+      *fn_ptr++ = reinterpret_cast<void*>(static_cast<void (*)(cuda::ptx::space_global_t, cuda::ptx::space_shared_t, const void* , const int32_t (&)[1], const void* )>(cuda::ptx::cp_async_bulk_tensor));
+  ));
 #endif // __cccl_ptx_isa >= 800

 #if __cccl_ptx_isa >= 800
-  NV_IF_TARGET(
-    NV_PROVIDES_SM_90,
-    (
-      // cp.async.bulk.tensor.2d.shared::cluster.global.tile.mbarrier::complete_tx::bytes [dstMem], [tensorMap,
-      // tensorCoords], [smem_bar];// 1b.
-      * fn_ptr++ = reinterpret_cast<void*>(
-        static_cast<void (*)(
-          cuda::ptx::space_cluster_t, cuda::ptx::space_global_t, void*, const void*, const int32_t(&)[2], uint64_t*)>(
-          cuda::ptx::cp_async_bulk_tensor));));
+  NV_IF_TARGET(NV_PROVIDES_SM_90, (
+    // cp.async.bulk.tensor.2d.shared::cluster.global.tile.mbarrier::complete_tx::bytes [dstMem], [tensorMap, tensorCoords], [smem_bar];// 1b.
+      *fn_ptr++ = reinterpret_cast<void*>(static_cast<void (*)(cuda::ptx::space_cluster_t, cuda::ptx::space_global_t, void* , const void* , const int32_t (&)[2], uint64_t* )>(cuda::ptx::cp_async_bulk_tensor));
+  ));
 #endif // __cccl_ptx_isa >= 800

 #if __cccl_ptx_isa >= 800
-  NV_IF_TARGET(
-    NV_PROVIDES_SM_90,
-    (
-      // cp.async.bulk.tensor.2d.global.shared::cta.tile.bulk_group [tensorMap, tensorCoords], [srcMem]; // 3b.
-      * fn_ptr++ = reinterpret_cast<void*>(
-        static_cast<void (*)(
-          cuda::ptx::space_global_t, cuda::ptx::space_shared_t, const void*, const int32_t(&)[2], const void*)>(
-          cuda::ptx::cp_async_bulk_tensor));));
+  NV_IF_TARGET(NV_PROVIDES_SM_90, (
+    // cp.async.bulk.tensor.2d.global.shared::cta.tile.bulk_group [tensorMap, tensorCoords], [srcMem]; // 3b.
+      *fn_ptr++ = reinterpret_cast<void*>(static_cast<void (*)(cuda::ptx::space_global_t, cuda::ptx::space_shared_t, const void* , const int32_t (&)[2], const void* )>(cuda::ptx::cp_async_bulk_tensor));
+  ));
 #endif // __cccl_ptx_isa >= 800

 #if __cccl_ptx_isa >= 800
-  NV_IF_TARGET(
-    NV_PROVIDES_SM_90,
-    (
-      // cp.async.bulk.tensor.3d.shared::cluster.global.tile.mbarrier::complete_tx::bytes [dstMem], [tensorMap,
-      // tensorCoords], [smem_bar];// 1c.
-      * fn_ptr++ = reinterpret_cast<void*>(
-        static_cast<void (*)(
-          cuda::ptx::space_cluster_t, cuda::ptx::space_global_t, void*, const void*, const int32_t(&)[3], uint64_t*)>(
-          cuda::ptx::cp_async_bulk_tensor));));
+  NV_IF_TARGET(NV_PROVIDES_SM_90, (
+    // cp.async.bulk.tensor.3d.shared::cluster.global.tile.mbarrier::complete_tx::bytes [dstMem], [tensorMap, tensorCoords], [smem_bar];// 1c.
+      *fn_ptr++ = reinterpret_cast<void*>(static_cast<void (*)(cuda::ptx::space_cluster_t, cuda::ptx::space_global_t, void* , const void* , const int32_t (&)[3], uint64_t* )>(cuda::ptx::cp_async_bulk_tensor));
+  ));
 #endif // __cccl_ptx_isa >= 800

 #if __cccl_ptx_isa >= 800
-  NV_IF_TARGET(
-    NV_PROVIDES_SM_90,
-    (
-      // cp.async.bulk.tensor.3d.global.shared::cta.tile.bulk_group [tensorMap, tensorCoords], [srcMem]; // 3c.
-      * fn_ptr++ = reinterpret_cast<void*>(
-        static_cast<void (*)(
-          cuda::ptx::space_global_t, cuda::ptx::space_shared_t, const void*, const int32_t(&)[3], const void*)>(
-          cuda::ptx::cp_async_bulk_tensor));));
+  NV_IF_TARGET(NV_PROVIDES_SM_90, (
+    // cp.async.bulk.tensor.3d.global.shared::cta.tile.bulk_group [tensorMap, tensorCoords], [srcMem]; // 3c.
+      *fn_ptr++ = reinterpret_cast<void*>(static_cast<void (*)(cuda::ptx::space_global_t, cuda::ptx::space_shared_t, const void* , const int32_t (&)[3], const void* )>(cuda::ptx::cp_async_bulk_tensor));
+  ));
 #endif // __cccl_ptx_isa >= 800

 #if __cccl_ptx_isa >= 800
-  NV_IF_TARGET(
-    NV_PROVIDES_SM_90,
-    (
-      // cp.async.bulk.tensor.4d.shared::cluster.global.tile.mbarrier::complete_tx::bytes [dstMem], [tensorMap,
-      // tensorCoords], [smem_bar];// 1d.
-      * fn_ptr++ = reinterpret_cast<void*>(
-        static_cast<void (*)(
-          cuda::ptx::space_cluster_t, cuda::ptx::space_global_t, void*, const void*, const int32_t(&)[4], uint64_t*)>(
-          cuda::ptx::cp_async_bulk_tensor));));
+  NV_IF_TARGET(NV_PROVIDES_SM_90, (
+    // cp.async.bulk.tensor.4d.shared::cluster.global.tile.mbarrier::complete_tx::bytes [dstMem], [tensorMap, tensorCoords], [smem_bar];// 1d.
+      *fn_ptr++ = reinterpret_cast<void*>(static_cast<void (*)(cuda::ptx::space_cluster_t, cuda::ptx::space_global_t, void* , const void* , const int32_t (&)[4], uint64_t* )>(cuda::ptx::cp_async_bulk_tensor));
+  ));
 #endif // __cccl_ptx_isa >= 800

 #if __cccl_ptx_isa >= 800
-  NV_IF_TARGET(
-    NV_PROVIDES_SM_90,
-    (
-      // cp.async.bulk.tensor.4d.global.shared::cta.tile.bulk_group [tensorMap, tensorCoords], [srcMem]; // 3d.
-      * fn_ptr++ = reinterpret_cast<void*>(
-        static_cast<void (*)(
-          cuda::ptx::space_global_t, cuda::ptx::space_shared_t, const void*, const int32_t(&)[4], const void*)>(
-          cuda::ptx::cp_async_bulk_tensor));));
+  NV_IF_TARGET(NV_PROVIDES_SM_90, (
+    // cp.async.bulk.tensor.4d.global.shared::cta.tile.bulk_group [tensorMap, tensorCoords], [srcMem]; // 3d.
+      *fn_ptr++ = reinterpret_cast<void*>(static_cast<void (*)(cuda::ptx::space_global_t, cuda::ptx::space_shared_t, const void* , const int32_t (&)[4], const void* )>(cuda::ptx::cp_async_bulk_tensor));
+  ));
 #endif // __cccl_ptx_isa >= 800

 #if __cccl_ptx_isa >= 800
-  NV_IF_TARGET(
-    NV_PROVIDES_SM_90,
-    (
-      // cp.async.bulk.tensor.5d.shared::cluster.global.tile.mbarrier::complete_tx::bytes [dstMem], [tensorMap,
-      // tensorCoords], [smem_bar];// 1e.
-      * fn_ptr++ = reinterpret_cast<void*>(
-        static_cast<void (*)(
-          cuda::ptx::space_cluster_t, cuda::ptx::space_global_t, void*, const void*, const int32_t(&)[5], uint64_t*)>(
-          cuda::ptx::cp_async_bulk_tensor));));
+  NV_IF_TARGET(NV_PROVIDES_SM_90, (
+    // cp.async.bulk.tensor.5d.shared::cluster.global.tile.mbarrier::complete_tx::bytes [dstMem], [tensorMap, tensorCoords], [smem_bar];// 1e.
+      *fn_ptr++ = reinterpret_cast<void*>(static_cast<void (*)(cuda::ptx::space_cluster_t, cuda::ptx::space_global_t, void* , const void* , const int32_t (&)[5], uint64_t* )>(cuda::ptx::cp_async_bulk_tensor));
+  ));
 #endif // __cccl_ptx_isa >= 800

 #if __cccl_ptx_isa >= 800
-  NV_IF_TARGET(
-    NV_PROVIDES_SM_90,
-    (
-      // cp.async.bulk.tensor.5d.global.shared::cta.tile.bulk_group [tensorMap, tensorCoords], [srcMem]; // 3e.
-      * fn_ptr++ = reinterpret_cast<void*>(
-        static_cast<void (*)(
-          cuda::ptx::space_global_t, cuda::ptx::space_shared_t, const void*, const int32_t(&)[5], const void*)>(
-          cuda::ptx::cp_async_bulk_tensor));));
+  NV_IF_TARGET(NV_PROVIDES_SM_90, (
+    // cp.async.bulk.tensor.5d.global.shared::cta.tile.bulk_group [tensorMap, tensorCoords], [srcMem]; // 3e.
+      *fn_ptr++ = reinterpret_cast<void*>(static_cast<void (*)(cuda::ptx::space_global_t, cuda::ptx::space_shared_t, const void* , const int32_t (&)[5], const void* )>(cuda::ptx::cp_async_bulk_tensor));
+  ));
 #endif // __cccl_ptx_isa >= 800
 }
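The tensor variants differ from the plain bulk copies only in that the source or destination is described by a tensor map plus an array of per-dimension coordinates, which is why the casts above spell that parameter as `const int32_t (&)[N]` for N = 1 through 5. A hedged 2-D sketch under the same assumptions as before; the helper name, variable names, and the driver-created tensor map are illustrative and not taken from this diff.

```cuda
#include <cuda/ptx>
#include <nv/target>
#include <cstdint>

// Hypothetical helper (not from the diff): load one 2-D tile described by a
// driver-created tensor map into cluster-shared memory, completing on a
// shared-memory mbarrier.
__device__ void sketch_tensor_load(void* dst_smem, const void* tensor_map,
                                   std::int32_t x, std::int32_t y,
                                   std::uint64_t* smem_bar)
{
#if __cccl_ptx_isa >= 800
  NV_IF_TARGET(NV_PROVIDES_SM_90,
               (
                 // Per-dimension tile coordinates; the overload is selected by the
                 // array extent, matching the `const int32_t (&)[2]` cast above.
                 const std::int32_t coords[2] = {x, y};
                 // cp.async.bulk.tensor.2d.shared::cluster.global.tile.mbarrier::complete_tx::bytes
                 cuda::ptx::cp_async_bulk_tensor(cuda::ptx::space_cluster, cuda::ptx::space_global,
                                                 dst_smem, tensor_map, coords, smem_bar);
               ));
#endif // __cccl_ptx_isa >= 800
}
```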