Move .multicast tests out into their own file (#1478)
We are warning against usage of `.multicast` prior to SM90, so ensure that this passes CI.
miscco authored Mar 4, 2024
1 parent 63e7ff3 commit 4b8fb92
Showing 4 changed files with 126 additions and 42 deletions.
@@ -39,13 +39,6 @@ __global__ void test_cp_async_bulk(void ** fn_ptr) {
));
#endif // __cccl_ptx_isa >= 800

#if __cccl_ptx_isa >= 800
NV_IF_TARGET(NV_PROVIDES_SM_90, (
// cp.async.bulk.shared::cluster.global.mbarrier::complete_tx::bytes.multicast::cluster [dstMem], [srcMem], size, [smem_bar], ctaMask; // 1.
*fn_ptr++ = reinterpret_cast<void*>(static_cast<void (*)(cuda::ptx::space_cluster_t, cuda::ptx::space_global_t, void* , const void* , const uint32_t& , uint64_t* , const uint16_t& )>(cuda::ptx::cp_async_bulk));
));
#endif // __cccl_ptx_isa >= 800

#if __cccl_ptx_isa >= 800
NV_IF_TARGET(NV_PROVIDES_SM_90, (
// cp.async.bulk.shared::cluster.shared::cta.mbarrier::complete_tx::bytes [dstMem], [srcMem], size, [rdsmem_bar]; // 2.
@@ -0,0 +1,49 @@
//===----------------------------------------------------------------------===//
//
// Part of libcu++, the C++ Standard Library for your entire system,
// under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
// SPDX-FileCopyrightText: Copyright (c) 2024 NVIDIA CORPORATION & AFFILIATES.
//
//===----------------------------------------------------------------------===//
// UNSUPPORTED: libcpp-has-no-threads

// UNSUPPORTED: nvcc-11
// XFAIL: !pre-sm-90 && pre-sm-90a

// <cuda/ptx>

#include <cuda/ptx>
#include <cuda/std/utility>

/*
* We use a special strategy to force the generation of the PTX. This is mainly
* a fight against dead-code-elimination in the NVVM layer.
*
* The reason we need this strategy is that certain older versions of ptxas
* segfault when a nonsensical sequence of PTX is generated. So instead, we try
* to force the instantiation and compilation to PTX of all the overloads of the
* PTX wrapping functions.
*
* We do this by writing a function pointer to each overload to the kernel
* parameter `fn_ptr`.
*
* Because `fn_ptr` is possibly visible outside this translation unit, the
* compiler must compile all the functions which are stored.
*
*/

__global__ void test_cp_async_bulk(void ** fn_ptr) {
#if __cccl_ptx_isa >= 800
NV_IF_TARGET(NV_PROVIDES_SM_90, (
// cp.async.bulk.shared::cluster.global.mbarrier::complete_tx::bytes.multicast::cluster [dstMem], [srcMem], size, [smem_bar], ctaMask; // 1.
*fn_ptr++ = reinterpret_cast<void*>(static_cast<void (*)(cuda::ptx::space_cluster_t, cuda::ptx::space_global_t, void* , const void* , const uint32_t& , uint64_t* , const uint16_t& )>(cuda::ptx::cp_async_bulk));
));
#endif // __cccl_ptx_isa >= 800
}

int main(int, char**)
{
return 0;
}
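
Note: the test above only stores the address of the multicast overload so that its PTX is emitted; it never calls it. For reference, a minimal sketch of how that `cuda::ptx::cp_async_bulk` overload is typically invoked is shown below. This is not part of the commit: it assumes SM90-class hardware with a cluster launch and an mbarrier that has been initialized (and had its transaction count set) elsewhere, and the kernel and variable names are made up.

#include <cuda/ptx>
#include <nv/target>
#include <cstdint>

// Hypothetical kernel, not part of commit 4b8fb92.
__global__ void sketch_cp_async_bulk_multicast(const void* gmem_src,
                                               uint32_t size,
                                               uint16_t cta_mask)
{
  __shared__ alignas(128) char smem_dst[1024];
  __shared__ uint64_t smem_bar; // assumed to be initialized as an mbarrier elsewhere

  NV_IF_TARGET(NV_PROVIDES_SM_90, (
    // Copy `size` bytes from global memory into the shared memory of every
    // CTA in the cluster selected by `cta_mask`; completion is signalled on
    // the mbarrier behind `smem_bar`.
    cuda::ptx::cp_async_bulk(cuda::ptx::space_cluster, cuda::ptx::space_global,
                             smem_dst, gmem_src, size, &smem_bar, cta_mask);
  ));
}

The trailing ctaMask parameter is what selects the `.multicast::cluster` variant that this commit isolates into its own test file.
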
@@ -39,13 +39,6 @@ __global__ void test_cp_async_bulk_tensor(void ** fn_ptr) {
));
#endif // __cccl_ptx_isa >= 800

#if __cccl_ptx_isa >= 800
NV_IF_TARGET(NV_PROVIDES_SM_90, (
// cp.async.bulk.tensor.1d.shared::cluster.global.tile.mbarrier::complete_tx::bytes.multicast::cluster [dstMem], [tensorMap, tensorCoords], [smem_bar], ctaMask; // 2a.
*fn_ptr++ = reinterpret_cast<void*>(static_cast<void (*)(cuda::ptx::space_cluster_t, cuda::ptx::space_global_t, void* , const void* , const int32_t (&)[1], uint64_t* , const uint16_t& )>(cuda::ptx::cp_async_bulk_tensor));
));
#endif // __cccl_ptx_isa >= 800

#if __cccl_ptx_isa >= 800
NV_IF_TARGET(NV_PROVIDES_SM_90, (
// cp.async.bulk.tensor.1d.global.shared::cta.tile.bulk_group [tensorMap, tensorCoords], [srcMem]; // 3a.
@@ -60,13 +53,6 @@ __global__ void test_cp_async_bulk_tensor(void ** fn_ptr) {
));
#endif // __cccl_ptx_isa >= 800

#if __cccl_ptx_isa >= 800
NV_IF_TARGET(NV_PROVIDES_SM_90, (
// cp.async.bulk.tensor.2d.shared::cluster.global.tile.mbarrier::complete_tx::bytes.multicast::cluster [dstMem], [tensorMap, tensorCoords], [smem_bar], ctaMask; // 2b.
*fn_ptr++ = reinterpret_cast<void*>(static_cast<void (*)(cuda::ptx::space_cluster_t, cuda::ptx::space_global_t, void* , const void* , const int32_t (&)[2], uint64_t* , const uint16_t& )>(cuda::ptx::cp_async_bulk_tensor));
));
#endif // __cccl_ptx_isa >= 800

#if __cccl_ptx_isa >= 800
NV_IF_TARGET(NV_PROVIDES_SM_90, (
// cp.async.bulk.tensor.2d.global.shared::cta.tile.bulk_group [tensorMap, tensorCoords], [srcMem]; // 3b.
@@ -81,13 +67,6 @@ __global__ void test_cp_async_bulk_tensor(void ** fn_ptr) {
));
#endif // __cccl_ptx_isa >= 800

#if __cccl_ptx_isa >= 800
NV_IF_TARGET(NV_PROVIDES_SM_90, (
// cp.async.bulk.tensor.3d.shared::cluster.global.tile.mbarrier::complete_tx::bytes.multicast::cluster [dstMem], [tensorMap, tensorCoords], [smem_bar], ctaMask; // 2c.
*fn_ptr++ = reinterpret_cast<void*>(static_cast<void (*)(cuda::ptx::space_cluster_t, cuda::ptx::space_global_t, void* , const void* , const int32_t (&)[3], uint64_t* , const uint16_t& )>(cuda::ptx::cp_async_bulk_tensor));
));
#endif // __cccl_ptx_isa >= 800

#if __cccl_ptx_isa >= 800
NV_IF_TARGET(NV_PROVIDES_SM_90, (
// cp.async.bulk.tensor.3d.global.shared::cta.tile.bulk_group [tensorMap, tensorCoords], [srcMem]; // 3c.
@@ -102,13 +81,6 @@ __global__ void test_cp_async_bulk_tensor(void ** fn_ptr) {
));
#endif // __cccl_ptx_isa >= 800

#if __cccl_ptx_isa >= 800
NV_IF_TARGET(NV_PROVIDES_SM_90, (
// cp.async.bulk.tensor.4d.shared::cluster.global.tile.mbarrier::complete_tx::bytes.multicast::cluster [dstMem], [tensorMap, tensorCoords], [smem_bar], ctaMask; // 2d.
*fn_ptr++ = reinterpret_cast<void*>(static_cast<void (*)(cuda::ptx::space_cluster_t, cuda::ptx::space_global_t, void* , const void* , const int32_t (&)[4], uint64_t* , const uint16_t& )>(cuda::ptx::cp_async_bulk_tensor));
));
#endif // __cccl_ptx_isa >= 800

#if __cccl_ptx_isa >= 800
NV_IF_TARGET(NV_PROVIDES_SM_90, (
// cp.async.bulk.tensor.4d.global.shared::cta.tile.bulk_group [tensorMap, tensorCoords], [srcMem]; // 3d.
@@ -123,13 +95,6 @@ __global__ void test_cp_async_bulk_tensor(void ** fn_ptr) {
));
#endif // __cccl_ptx_isa >= 800

#if __cccl_ptx_isa >= 800
NV_IF_TARGET(NV_PROVIDES_SM_90, (
// cp.async.bulk.tensor.5d.shared::cluster.global.tile.mbarrier::complete_tx::bytes.multicast::cluster [dstMem], [tensorMap, tensorCoords], [smem_bar], ctaMask; // 2e.
*fn_ptr++ = reinterpret_cast<void*>(static_cast<void (*)(cuda::ptx::space_cluster_t, cuda::ptx::space_global_t, void* , const void* , const int32_t (&)[5], uint64_t* , const uint16_t& )>(cuda::ptx::cp_async_bulk_tensor));
));
#endif // __cccl_ptx_isa >= 800

#if __cccl_ptx_isa >= 800
NV_IF_TARGET(NV_PROVIDES_SM_90, (
// cp.async.bulk.tensor.5d.global.shared::cta.tile.bulk_group [tensorMap, tensorCoords], [srcMem]; // 3e.
@@ -0,0 +1,77 @@
//===----------------------------------------------------------------------===//
//
// Part of libcu++, the C++ Standard Library for your entire system,
// under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
// SPDX-FileCopyrightText: Copyright (c) 2024 NVIDIA CORPORATION & AFFILIATES.
//
//===----------------------------------------------------------------------===//
// UNSUPPORTED: libcpp-has-no-threads

// UNSUPPORTED: nvcc-11
// XFAIL: !pre-sm-90 && pre-sm-90a

// <cuda/ptx>

#include <cuda/ptx>
#include <cuda/std/utility>

/*
* We use a special strategy to force the generation of the PTX. This is mainly
* a fight against dead-code-elimination in the NVVM layer.
*
* The reason we need this strategy is that certain older versions of ptxas
* segfault when a nonsensical sequence of PTX is generated. So instead, we try
* to force the instantiation and compilation to PTX of all the overloads of the
* PTX wrapping functions.
*
* We do this by writing a function pointer to each overload to the kernel
* parameter `fn_ptr`.
*
* Because `fn_ptr` is possibly visible outside this translation unit, the
* compiler must compile all the functions which are stored.
*
*/

__global__ void test_cp_async_bulk_tensor(void ** fn_ptr) {
#if __cccl_ptx_isa >= 800
NV_IF_TARGET(NV_PROVIDES_SM_90, (
// cp.async.bulk.tensor.1d.shared::cluster.global.tile.mbarrier::complete_tx::bytes.multicast::cluster [dstMem], [tensorMap, tensorCoords], [smem_bar], ctaMask; // 2a.
*fn_ptr++ = reinterpret_cast<void*>(static_cast<void (*)(cuda::ptx::space_cluster_t, cuda::ptx::space_global_t, void* , const void* , const int32_t (&)[1], uint64_t* , const uint16_t& )>(cuda::ptx::cp_async_bulk_tensor));
));
#endif // __cccl_ptx_isa >= 800

#if __cccl_ptx_isa >= 800
NV_IF_TARGET(NV_PROVIDES_SM_90, (
// cp.async.bulk.tensor.2d.shared::cluster.global.tile.mbarrier::complete_tx::bytes.multicast::cluster [dstMem], [tensorMap, tensorCoords], [smem_bar], ctaMask; // 2b.
*fn_ptr++ = reinterpret_cast<void*>(static_cast<void (*)(cuda::ptx::space_cluster_t, cuda::ptx::space_global_t, void* , const void* , const int32_t (&)[2], uint64_t* , const uint16_t& )>(cuda::ptx::cp_async_bulk_tensor));
));
#endif // __cccl_ptx_isa >= 800

#if __cccl_ptx_isa >= 800
NV_IF_TARGET(NV_PROVIDES_SM_90, (
// cp.async.bulk.tensor.3d.shared::cluster.global.tile.mbarrier::complete_tx::bytes.multicast::cluster [dstMem], [tensorMap, tensorCoords], [smem_bar], ctaMask; // 2c.
*fn_ptr++ = reinterpret_cast<void*>(static_cast<void (*)(cuda::ptx::space_cluster_t, cuda::ptx::space_global_t, void* , const void* , const int32_t (&)[3], uint64_t* , const uint16_t& )>(cuda::ptx::cp_async_bulk_tensor));
));
#endif // __cccl_ptx_isa >= 800

#if __cccl_ptx_isa >= 800
NV_IF_TARGET(NV_PROVIDES_SM_90, (
// cp.async.bulk.tensor.4d.shared::cluster.global.tile.mbarrier::complete_tx::bytes.multicast::cluster [dstMem], [tensorMap, tensorCoords], [smem_bar], ctaMask; // 2d.
*fn_ptr++ = reinterpret_cast<void*>(static_cast<void (*)(cuda::ptx::space_cluster_t, cuda::ptx::space_global_t, void* , const void* , const int32_t (&)[4], uint64_t* , const uint16_t& )>(cuda::ptx::cp_async_bulk_tensor));
));
#endif // __cccl_ptx_isa >= 800

#if __cccl_ptx_isa >= 800
NV_IF_TARGET(NV_PROVIDES_SM_90, (
// cp.async.bulk.tensor.5d.shared::cluster.global.tile.mbarrier::complete_tx::bytes.multicast::cluster [dstMem], [tensorMap, tensorCoords], [smem_bar], ctaMask; // 2e.
*fn_ptr++ = reinterpret_cast<void*>(static_cast<void (*)(cuda::ptx::space_cluster_t, cuda::ptx::space_global_t, void* , const void* , const int32_t (&)[5], uint64_t* , const uint16_t& )>(cuda::ptx::cp_async_bulk_tensor));
));
#endif // __cccl_ptx_isa >= 800
}

int main(int, char**)
{
return 0;
}
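
As above, the tensor test only takes overload addresses. A hedged sketch of calling the two-dimensional multicast overload of `cuda::ptx::cp_async_bulk_tensor` follows; it is not part of the commit. It assumes `tensor_map` points to a valid CUtensorMap created on the host, that the mbarrier is initialized elsewhere, and that the kernel runs as part of a cluster launch on SM90-class hardware; all names are made up.

#include <cuda/ptx>
#include <nv/target>
#include <cstdint>

// Hypothetical kernel, not part of commit 4b8fb92.
__global__ void sketch_cp_async_bulk_tensor_multicast(const void* tensor_map,
                                                      uint16_t cta_mask)
{
  __shared__ alignas(128) char smem_dst[1024];
  __shared__ uint64_t smem_bar; // assumed to be initialized as an mbarrier elsewhere

  NV_IF_TARGET(NV_PROVIDES_SM_90, (
    const int32_t coords[2] = {0, 0}; // tile origin inside the 2-D tensor
    // Load one tile into the shared memory of every CTA selected by
    // `cta_mask`, matching overload 2b. above; completion is signalled on
    // the mbarrier behind `smem_bar`.
    cuda::ptx::cp_async_bulk_tensor(cuda::ptx::space_cluster,
                                    cuda::ptx::space_global,
                                    smem_dst, tensor_map, coords,
                                    &smem_bar, cta_mask);
  ));
}
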
