-
Notifications
You must be signed in to change notification settings - Fork 184
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
Move
.multicast
tests out into their own file (#1478)
We now warn against usage of `.multicast` prior to SM90, so these tests are moved into their own files to ensure they pass CI.
- Loading branch information
Showing
4 changed files
with
126 additions
and
42 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
49 changes: 49 additions & 0 deletions
49
libcudacxx/test/libcudacxx/cuda/ptx/ptx.cp.async.bulk.multicast.compile.pass.cpp
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,49 @@ | ||
//===----------------------------------------------------------------------===// | ||
// | ||
// Part of libcu++, the C++ Standard Library for your entire system, | ||
// under the Apache License v2.0 with LLVM Exceptions. | ||
// See https://llvm.org/LICENSE.txt for license information. | ||
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception | ||
// SPDX-FileCopyrightText: Copyright (c) 2024 NVIDIA CORPORATION & AFFILIATES. | ||
// | ||
//===----------------------------------------------------------------------===// | ||
// UNSUPPORTED: libcpp-has-no-threads | ||
|
||
// UNSUPPORTED: nvcc-11 | ||
// XFAIL: !pre-sm-90 && pre-sm-90a | ||
|
||
// <cuda/ptx> | ||
|
||
#include <cuda/ptx> | ||
#include <cuda/std/utility> | ||
|
||
/* | ||
* We use a special strategy to force the generation of the PTX. This is mainly | ||
* a fight against dead-code-elimination in the NVVM layer. | ||
* | ||
* The reason we need this strategy is because certain older versions of ptxas | ||
* segfault when a non-sensical sequence of PTX is generated. So instead, we try | ||
* to force the instantiation and compilation to PTX of all the overloads of the | ||
* PTX wrapping functions. | ||
* | ||
* We do this by writing a function pointer of each overload to the kernel | ||
* parameter `fn_ptr`. | ||
* | ||
* Because `fn_ptr` is possibly visible outside this translation unit, the | ||
* compiler must compile all the functions which are stored. | ||
* | ||
*/ | ||
|
||
// Compile-only test: forces instantiation of the .multicast::cluster overload
// of cuda::ptx::cp_async_bulk by storing its address through `fn_ptr`, which
// defeats dead-code elimination (see the strategy comment above).
// Guarded to PTX ISA >= 8.0 and NV_PROVIDES_SM_90: per the commit message,
// usage of .multicast prior to SM90 is warned against, so this overload is
// only instantiated on SM90+ targets.
__global__ void test_cp_async_bulk(void ** fn_ptr) {
#if __cccl_ptx_isa >= 800
  NV_IF_TARGET(NV_PROVIDES_SM_90, (
    // cp.async.bulk.shared::cluster.global.mbarrier::complete_tx::bytes.multicast::cluster [dstMem], [srcMem], size, [smem_bar], ctaMask; // 1.
    // The static_cast pins the exact overload under test; the reinterpret_cast
    // merely erases the function-pointer type so it fits in a void* slot.
    *fn_ptr++ = reinterpret_cast<void*>(static_cast<void (*)(cuda::ptx::space_cluster_t, cuda::ptx::space_global_t, void* , const void* , const uint32_t& , uint64_t* , const uint16_t& )>(cuda::ptx::cp_async_bulk));
  ));
#endif // __cccl_ptx_isa >= 800
}
|
||
// Host entry point: all checks in this test happen at compile time, so there
// is nothing left to do at runtime.
int main(int, char**) { return 0; }
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
77 changes: 77 additions & 0 deletions
77
libcudacxx/test/libcudacxx/cuda/ptx/ptx.cp.async.bulk.tensor.multicast.compile.pass.cpp
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,77 @@ | ||
//===----------------------------------------------------------------------===// | ||
// | ||
// Part of libcu++, the C++ Standard Library for your entire system, | ||
// under the Apache License v2.0 with LLVM Exceptions. | ||
// See https://llvm.org/LICENSE.txt for license information. | ||
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception | ||
// SPDX-FileCopyrightText: Copyright (c) 2024 NVIDIA CORPORATION & AFFILIATES. | ||
// | ||
//===----------------------------------------------------------------------===// | ||
// UNSUPPORTED: libcpp-has-no-threads | ||
|
||
// UNSUPPORTED: nvcc-11 | ||
// XFAIL: !pre-sm-90 && pre-sm-90a | ||
|
||
// <cuda/ptx> | ||
|
||
#include <cuda/ptx> | ||
#include <cuda/std/utility> | ||
|
||
/* | ||
* We use a special strategy to force the generation of the PTX. This is mainly | ||
* a fight against dead-code-elimination in the NVVM layer. | ||
* | ||
* The reason we need this strategy is because certain older versions of ptxas | ||
* segfault when a non-sensical sequence of PTX is generated. So instead, we try | ||
* to force the instantiation and compilation to PTX of all the overloads of the | ||
* PTX wrapping functions. | ||
* | ||
* We do this by writing a function pointer of each overload to the kernel | ||
* parameter `fn_ptr`. | ||
* | ||
* Because `fn_ptr` is possibly visible outside this translation unit, the | ||
* compiler must compile all the functions which are stored. | ||
* | ||
*/ | ||
|
||
// Compile-only test: forces instantiation of every .multicast::cluster
// overload of cuda::ptx::cp_async_bulk_tensor (1d through 5d tensor variants,
// distinguished by the rank of the tensorCoords array parameter) by storing
// each overload's address through `fn_ptr`, defeating dead-code elimination
// (see the strategy comment above).
// Guarded to PTX ISA >= 8.0 and NV_PROVIDES_SM_90: per the commit message,
// usage of .multicast prior to SM90 is warned against, so these overloads are
// only instantiated on SM90+ targets.
// In each slot below, the static_cast pins the exact overload under test and
// the reinterpret_cast erases the function-pointer type for storage in void*.
__global__ void test_cp_async_bulk_tensor(void ** fn_ptr) {
#if __cccl_ptx_isa >= 800
  NV_IF_TARGET(NV_PROVIDES_SM_90, (
    // cp.async.bulk.tensor.1d.shared::cluster.global.tile.mbarrier::complete_tx::bytes.multicast::cluster [dstMem], [tensorMap, tensorCoords], [smem_bar], ctaMask; // 2a.
    *fn_ptr++ = reinterpret_cast<void*>(static_cast<void (*)(cuda::ptx::space_cluster_t, cuda::ptx::space_global_t, void* , const void* , const int32_t (&)[1], uint64_t* , const uint16_t& )>(cuda::ptx::cp_async_bulk_tensor));
  ));
#endif // __cccl_ptx_isa >= 800

#if __cccl_ptx_isa >= 800
  NV_IF_TARGET(NV_PROVIDES_SM_90, (
    // cp.async.bulk.tensor.2d.shared::cluster.global.tile.mbarrier::complete_tx::bytes.multicast::cluster [dstMem], [tensorMap, tensorCoords], [smem_bar], ctaMask; // 2b.
    *fn_ptr++ = reinterpret_cast<void*>(static_cast<void (*)(cuda::ptx::space_cluster_t, cuda::ptx::space_global_t, void* , const void* , const int32_t (&)[2], uint64_t* , const uint16_t& )>(cuda::ptx::cp_async_bulk_tensor));
  ));
#endif // __cccl_ptx_isa >= 800

#if __cccl_ptx_isa >= 800
  NV_IF_TARGET(NV_PROVIDES_SM_90, (
    // cp.async.bulk.tensor.3d.shared::cluster.global.tile.mbarrier::complete_tx::bytes.multicast::cluster [dstMem], [tensorMap, tensorCoords], [smem_bar], ctaMask; // 2c.
    *fn_ptr++ = reinterpret_cast<void*>(static_cast<void (*)(cuda::ptx::space_cluster_t, cuda::ptx::space_global_t, void* , const void* , const int32_t (&)[3], uint64_t* , const uint16_t& )>(cuda::ptx::cp_async_bulk_tensor));
  ));
#endif // __cccl_ptx_isa >= 800

#if __cccl_ptx_isa >= 800
  NV_IF_TARGET(NV_PROVIDES_SM_90, (
    // cp.async.bulk.tensor.4d.shared::cluster.global.tile.mbarrier::complete_tx::bytes.multicast::cluster [dstMem], [tensorMap, tensorCoords], [smem_bar], ctaMask; // 2d.
    *fn_ptr++ = reinterpret_cast<void*>(static_cast<void (*)(cuda::ptx::space_cluster_t, cuda::ptx::space_global_t, void* , const void* , const int32_t (&)[4], uint64_t* , const uint16_t& )>(cuda::ptx::cp_async_bulk_tensor));
  ));
#endif // __cccl_ptx_isa >= 800

#if __cccl_ptx_isa >= 800
  NV_IF_TARGET(NV_PROVIDES_SM_90, (
    // cp.async.bulk.tensor.5d.shared::cluster.global.tile.mbarrier::complete_tx::bytes.multicast::cluster [dstMem], [tensorMap, tensorCoords], [smem_bar], ctaMask; // 2e.
    *fn_ptr++ = reinterpret_cast<void*>(static_cast<void (*)(cuda::ptx::space_cluster_t, cuda::ptx::space_global_t, void* , const void* , const int32_t (&)[5], uint64_t* , const uint16_t& )>(cuda::ptx::cp_async_bulk_tensor));
  ));
#endif // __cccl_ptx_isa >= 800
}
|
||
// Host entry point: all checks in this test happen at compile time, so there
// is nothing left to do at runtime.
int main(int, char**) { return 0; }