diff --git a/libcudacxx/test/libcudacxx/cuda/ptx/ptx.cp.async.bulk.compile.pass.cpp b/libcudacxx/test/libcudacxx/cuda/ptx/ptx.cp.async.bulk.compile.pass.cpp
index de08bda4f6c..22194c4c1d9 100644
--- a/libcudacxx/test/libcudacxx/cuda/ptx/ptx.cp.async.bulk.compile.pass.cpp
+++ b/libcudacxx/test/libcudacxx/cuda/ptx/ptx.cp.async.bulk.compile.pass.cpp
@@ -39,13 +39,6 @@ __global__ void test_cp_async_bulk(void ** fn_ptr) {
   ));
 #endif // __cccl_ptx_isa >= 800
 
-#if __cccl_ptx_isa >= 800
-  NV_IF_TARGET(NV_PROVIDES_SM_90, (
-    // cp.async.bulk.shared::cluster.global.mbarrier::complete_tx::bytes.multicast::cluster [dstMem], [srcMem], size, [smem_bar], ctaMask; // 1.
-    *fn_ptr++ = reinterpret_cast(static_cast(cuda::ptx::cp_async_bulk));
-  ));
-#endif // __cccl_ptx_isa >= 800
-
 #if __cccl_ptx_isa >= 800
   NV_IF_TARGET(NV_PROVIDES_SM_90, (
     // cp.async.bulk.shared::cluster.shared::cta.mbarrier::complete_tx::bytes [dstMem], [srcMem], size, [rdsmem_bar]; // 2.
diff --git a/libcudacxx/test/libcudacxx/cuda/ptx/ptx.cp.async.bulk.multicast.compile.pass.cpp b/libcudacxx/test/libcudacxx/cuda/ptx/ptx.cp.async.bulk.multicast.compile.pass.cpp
new file mode 100644
index 00000000000..1336445fd34
--- /dev/null
+++ b/libcudacxx/test/libcudacxx/cuda/ptx/ptx.cp.async.bulk.multicast.compile.pass.cpp
@@ -0,0 +1,49 @@
+//===----------------------------------------------------------------------===//
+//
+// Part of libcu++, the C++ Standard Library for your entire system,
+// under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+// SPDX-FileCopyrightText: Copyright (c) 2024 NVIDIA CORPORATION & AFFILIATES.
+//
+//===----------------------------------------------------------------------===//
+// UNSUPPORTED: libcpp-has-no-threads
+
+// UNSUPPORTED: nvcc-11
+// XFAIL: !pre-sm-90 && pre-sm-90a
+
+// <cuda/ptx>
+
+#include <cuda/ptx>
+#include <cuda/std/utility>
+
+/*
+ * We use a special strategy to force the generation of the PTX. This is mainly
+ * a fight against dead-code-elimination in the NVVM layer.
+ *
+ * The reason we need this strategy is because certain older versions of ptxas
+ * segfault when a non-sensical sequence of PTX is generated. So instead, we try
+ * to force the instantiation and compilation to PTX of all the overloads of the
+ * PTX wrapping functions.
+ *
+ * We do this by writing a function pointer of each overload to the kernel
+ * parameter `fn_ptr`.
+ *
+ * Because `fn_ptr` is possibly visible outside this translation unit, the
+ * compiler must compile all the functions which are stored.
+ *
+ */
+
+__global__ void test_cp_async_bulk(void ** fn_ptr) {
+#if __cccl_ptx_isa >= 800
+  NV_IF_TARGET(NV_PROVIDES_SM_90, (
+    // cp.async.bulk.shared::cluster.global.mbarrier::complete_tx::bytes.multicast::cluster [dstMem], [srcMem], size, [smem_bar], ctaMask; // 1.
+    *fn_ptr++ = reinterpret_cast(static_cast(cuda::ptx::cp_async_bulk));
+  ));
+#endif // __cccl_ptx_isa >= 800
+}
+
+int main(int, char**)
+{
+  return 0;
+}
diff --git a/libcudacxx/test/libcudacxx/cuda/ptx/ptx.cp.async.bulk.tensor.compile.pass.cpp b/libcudacxx/test/libcudacxx/cuda/ptx/ptx.cp.async.bulk.tensor.compile.pass.cpp
index 8ab330bf977..a9bb7b8d127 100644
--- a/libcudacxx/test/libcudacxx/cuda/ptx/ptx.cp.async.bulk.tensor.compile.pass.cpp
+++ b/libcudacxx/test/libcudacxx/cuda/ptx/ptx.cp.async.bulk.tensor.compile.pass.cpp
@@ -39,13 +39,6 @@ __global__ void test_cp_async_bulk_tensor(void ** fn_ptr) {
   ));
 #endif // __cccl_ptx_isa >= 800
 
-#if __cccl_ptx_isa >= 800
-  NV_IF_TARGET(NV_PROVIDES_SM_90, (
-    // cp.async.bulk.tensor.1d.shared::cluster.global.tile.mbarrier::complete_tx::bytes.multicast::cluster [dstMem], [tensorMap, tensorCoords], [smem_bar], ctaMask; // 2a.
-    *fn_ptr++ = reinterpret_cast(static_cast(cuda::ptx::cp_async_bulk_tensor));
-  ));
-#endif // __cccl_ptx_isa >= 800
-
 #if __cccl_ptx_isa >= 800
   NV_IF_TARGET(NV_PROVIDES_SM_90, (
     // cp.async.bulk.tensor.1d.global.shared::cta.tile.bulk_group [tensorMap, tensorCoords], [srcMem]; // 3a.
@@ -60,13 +53,6 @@ __global__ void test_cp_async_bulk_tensor(void ** fn_ptr) {
   ));
 #endif // __cccl_ptx_isa >= 800
 
-#if __cccl_ptx_isa >= 800
-  NV_IF_TARGET(NV_PROVIDES_SM_90, (
-    // cp.async.bulk.tensor.2d.shared::cluster.global.tile.mbarrier::complete_tx::bytes.multicast::cluster [dstMem], [tensorMap, tensorCoords], [smem_bar], ctaMask; // 2b.
-    *fn_ptr++ = reinterpret_cast(static_cast(cuda::ptx::cp_async_bulk_tensor));
-  ));
-#endif // __cccl_ptx_isa >= 800
-
 #if __cccl_ptx_isa >= 800
   NV_IF_TARGET(NV_PROVIDES_SM_90, (
     // cp.async.bulk.tensor.2d.global.shared::cta.tile.bulk_group [tensorMap, tensorCoords], [srcMem]; // 3b.
@@ -81,13 +67,6 @@ __global__ void test_cp_async_bulk_tensor(void ** fn_ptr) {
   ));
 #endif // __cccl_ptx_isa >= 800
 
-#if __cccl_ptx_isa >= 800
-  NV_IF_TARGET(NV_PROVIDES_SM_90, (
-    // cp.async.bulk.tensor.3d.shared::cluster.global.tile.mbarrier::complete_tx::bytes.multicast::cluster [dstMem], [tensorMap, tensorCoords], [smem_bar], ctaMask; // 2c.
-    *fn_ptr++ = reinterpret_cast(static_cast(cuda::ptx::cp_async_bulk_tensor));
-  ));
-#endif // __cccl_ptx_isa >= 800
-
 #if __cccl_ptx_isa >= 800
   NV_IF_TARGET(NV_PROVIDES_SM_90, (
     // cp.async.bulk.tensor.3d.global.shared::cta.tile.bulk_group [tensorMap, tensorCoords], [srcMem]; // 3c.
@@ -102,13 +81,6 @@ __global__ void test_cp_async_bulk_tensor(void ** fn_ptr) {
   ));
 #endif // __cccl_ptx_isa >= 800
 
-#if __cccl_ptx_isa >= 800
-  NV_IF_TARGET(NV_PROVIDES_SM_90, (
-    // cp.async.bulk.tensor.4d.shared::cluster.global.tile.mbarrier::complete_tx::bytes.multicast::cluster [dstMem], [tensorMap, tensorCoords], [smem_bar], ctaMask; // 2d.
-    *fn_ptr++ = reinterpret_cast(static_cast(cuda::ptx::cp_async_bulk_tensor));
-  ));
-#endif // __cccl_ptx_isa >= 800
-
 #if __cccl_ptx_isa >= 800
   NV_IF_TARGET(NV_PROVIDES_SM_90, (
     // cp.async.bulk.tensor.4d.global.shared::cta.tile.bulk_group [tensorMap, tensorCoords], [srcMem]; // 3d.
@@ -123,13 +95,6 @@ __global__ void test_cp_async_bulk_tensor(void ** fn_ptr) {
   ));
 #endif // __cccl_ptx_isa >= 800
 
-#if __cccl_ptx_isa >= 800
-  NV_IF_TARGET(NV_PROVIDES_SM_90, (
-    // cp.async.bulk.tensor.5d.shared::cluster.global.tile.mbarrier::complete_tx::bytes.multicast::cluster [dstMem], [tensorMap, tensorCoords], [smem_bar], ctaMask; // 2e.
-    *fn_ptr++ = reinterpret_cast(static_cast(cuda::ptx::cp_async_bulk_tensor));
-  ));
-#endif // __cccl_ptx_isa >= 800
-
 #if __cccl_ptx_isa >= 800
   NV_IF_TARGET(NV_PROVIDES_SM_90, (
     // cp.async.bulk.tensor.5d.global.shared::cta.tile.bulk_group [tensorMap, tensorCoords], [srcMem]; // 3e.
diff --git a/libcudacxx/test/libcudacxx/cuda/ptx/ptx.cp.async.bulk.tensor.multicast.compile.pass.cpp b/libcudacxx/test/libcudacxx/cuda/ptx/ptx.cp.async.bulk.tensor.multicast.compile.pass.cpp
new file mode 100644
index 00000000000..aad80a6de10
--- /dev/null
+++ b/libcudacxx/test/libcudacxx/cuda/ptx/ptx.cp.async.bulk.tensor.multicast.compile.pass.cpp
@@ -0,0 +1,77 @@
+//===----------------------------------------------------------------------===//
+//
+// Part of libcu++, the C++ Standard Library for your entire system,
+// under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+// SPDX-FileCopyrightText: Copyright (c) 2024 NVIDIA CORPORATION & AFFILIATES.
+//
+//===----------------------------------------------------------------------===//
+// UNSUPPORTED: libcpp-has-no-threads
+
+// UNSUPPORTED: nvcc-11
+// XFAIL: !pre-sm-90 && pre-sm-90a
+
+// <cuda/ptx>
+
+#include <cuda/ptx>
+#include <cuda/std/utility>
+
+/*
+ * We use a special strategy to force the generation of the PTX. This is mainly
+ * a fight against dead-code-elimination in the NVVM layer.
+ *
+ * The reason we need this strategy is because certain older versions of ptxas
+ * segfault when a non-sensical sequence of PTX is generated. So instead, we try
+ * to force the instantiation and compilation to PTX of all the overloads of the
+ * PTX wrapping functions.
+ *
+ * We do this by writing a function pointer of each overload to the kernel
+ * parameter `fn_ptr`.
+ *
+ * Because `fn_ptr` is possibly visible outside this translation unit, the
+ * compiler must compile all the functions which are stored.
+ *
+ */
+
+__global__ void test_cp_async_bulk_tensor(void ** fn_ptr) {
+#if __cccl_ptx_isa >= 800
+  NV_IF_TARGET(NV_PROVIDES_SM_90, (
+    // cp.async.bulk.tensor.1d.shared::cluster.global.tile.mbarrier::complete_tx::bytes.multicast::cluster [dstMem], [tensorMap, tensorCoords], [smem_bar], ctaMask; // 2a.
+    *fn_ptr++ = reinterpret_cast(static_cast(cuda::ptx::cp_async_bulk_tensor));
+  ));
+#endif // __cccl_ptx_isa >= 800
+
+#if __cccl_ptx_isa >= 800
+  NV_IF_TARGET(NV_PROVIDES_SM_90, (
+    // cp.async.bulk.tensor.2d.shared::cluster.global.tile.mbarrier::complete_tx::bytes.multicast::cluster [dstMem], [tensorMap, tensorCoords], [smem_bar], ctaMask; // 2b.
+    *fn_ptr++ = reinterpret_cast(static_cast(cuda::ptx::cp_async_bulk_tensor));
+  ));
+#endif // __cccl_ptx_isa >= 800
+
+#if __cccl_ptx_isa >= 800
+  NV_IF_TARGET(NV_PROVIDES_SM_90, (
+    // cp.async.bulk.tensor.3d.shared::cluster.global.tile.mbarrier::complete_tx::bytes.multicast::cluster [dstMem], [tensorMap, tensorCoords], [smem_bar], ctaMask; // 2c.
+    *fn_ptr++ = reinterpret_cast(static_cast(cuda::ptx::cp_async_bulk_tensor));
+  ));
+#endif // __cccl_ptx_isa >= 800
+
+#if __cccl_ptx_isa >= 800
+  NV_IF_TARGET(NV_PROVIDES_SM_90, (
+    // cp.async.bulk.tensor.4d.shared::cluster.global.tile.mbarrier::complete_tx::bytes.multicast::cluster [dstMem], [tensorMap, tensorCoords], [smem_bar], ctaMask; // 2d.
+    *fn_ptr++ = reinterpret_cast(static_cast(cuda::ptx::cp_async_bulk_tensor));
+  ));
+#endif // __cccl_ptx_isa >= 800
+
+#if __cccl_ptx_isa >= 800
+  NV_IF_TARGET(NV_PROVIDES_SM_90, (
+    // cp.async.bulk.tensor.5d.shared::cluster.global.tile.mbarrier::complete_tx::bytes.multicast::cluster [dstMem], [tensorMap, tensorCoords], [smem_bar], ctaMask; // 2e.
+    *fn_ptr++ = reinterpret_cast(static_cast(cuda::ptx::cp_async_bulk_tensor));
+  ));
+#endif // __cccl_ptx_isa >= 800
+}
+
+int main(int, char**)
+{
+  return 0;
+}
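
For reference, the new files rely on the trick described in their header comment: storing the address of one specific overload through the kernel parameter `fn_ptr` makes that overload observable outside the translation unit, so NVVM must instantiate it and emit its PTX rather than dead-code-eliminate it. Below is a minimal, self-contained sketch of that pattern; `copy_bytes` and `force_instantiation` are hypothetical stand-ins for the `cuda::ptx` wrappers and are not part of this patch.

#include <cstdint>

// Two hypothetical __device__ overloads standing in for a cuda::ptx wrapper.
__device__ void copy_bytes(void*, const void*, std::uint32_t) {}
__device__ void copy_bytes(void*, const void*, std::uint32_t, std::uint16_t) {}

__global__ void force_instantiation(void** fn_ptr)
{
  // The static_cast selects one overload; writing its address through the
  // kernel parameter keeps the function from being eliminated, forcing the
  // compiler to generate PTX for it.
  *fn_ptr++ = reinterpret_cast<void*>(
    static_cast<void (*)(void*, const void*, std::uint32_t)>(copy_bytes));
  *fn_ptr++ = reinterpret_cast<void*>(
    static_cast<void (*)(void*, const void*, std::uint32_t, std::uint16_t)>(copy_bytes));
}

int main(int, char**)
{
  return 0;
}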