From 565c9442ee55fd39ecf7387632bd1d3a5b8522a2 Mon Sep 17 00:00:00 2001 From: Michael Varvarin <55709728+MichaelVarvarin@users.noreply.github.com> Date: Mon, 29 Jul 2024 13:12:09 +0300 Subject: [PATCH] Add m_cooperativeLaunch device prop and runtime check for CG support for CUDA/HIP --- include/alpaka/acc/AccCpuOmp2Blocks.hpp | 4 +++- include/alpaka/acc/AccCpuOmp2Threads.hpp | 4 +++- include/alpaka/acc/AccCpuSerial.hpp | 4 +++- include/alpaka/acc/AccCpuTbbBlocks.hpp | 4 +++- include/alpaka/acc/AccCpuThreads.hpp | 4 +++- include/alpaka/acc/AccDevProps.hpp | 1 + include/alpaka/acc/AccGenericSycl.hpp | 4 +++- include/alpaka/acc/AccGpuUniformCudaHipRt.hpp | 14 +++++++++-- include/alpaka/core/ApiCudaRt.hpp | 1 + include/alpaka/core/ApiHipRt.hpp | 1 + .../kernel/TaskKernelGpuUniformCudaHipRt.hpp | 24 +++++++++++++++++++ test/unit/acc/src/AccDevPropsTest.cpp | 7 ++++-- 12 files changed, 62 insertions(+), 10 deletions(-) diff --git a/include/alpaka/acc/AccCpuOmp2Blocks.hpp b/include/alpaka/acc/AccCpuOmp2Blocks.hpp index a5c59e6446ae..283a5b068999 100644 --- a/include/alpaka/acc/AccCpuOmp2Blocks.hpp +++ b/include/alpaka/acc/AccCpuOmp2Blocks.hpp @@ -151,7 +151,9 @@ namespace alpaka // m_sharedMemSizeBytes static_cast(AccCpuOmp2Blocks::staticAllocBytes()), // m_globalMemSizeBytes - getMemBytes(dev)}; + getMemBytes(dev), + // m_cooperativeLaunch + false}; } }; diff --git a/include/alpaka/acc/AccCpuOmp2Threads.hpp b/include/alpaka/acc/AccCpuOmp2Threads.hpp index bc326bc05c4f..1c58259637f1 100644 --- a/include/alpaka/acc/AccCpuOmp2Threads.hpp +++ b/include/alpaka/acc/AccCpuOmp2Threads.hpp @@ -162,7 +162,9 @@ namespace alpaka // m_sharedMemSizeBytes memBytes, // m_globalMemSizeBytes - memBytes}; + memBytes, + // m_cooperativeLaunch + false}; } }; diff --git a/include/alpaka/acc/AccCpuSerial.hpp b/include/alpaka/acc/AccCpuSerial.hpp index 4a4e8f0621a6..3264d483edbf 100644 --- a/include/alpaka/acc/AccCpuSerial.hpp +++ b/include/alpaka/acc/AccCpuSerial.hpp @@ 
-145,7 +145,9 @@ namespace alpaka // m_sharedMemSizeBytes static_cast(AccCpuSerial::staticAllocBytes()), // m_globalMemSizeBytes - getMemBytes(dev)}; + getMemBytes(dev), + // m_cooperativeLaunch + false}; } }; diff --git a/include/alpaka/acc/AccCpuTbbBlocks.hpp b/include/alpaka/acc/AccCpuTbbBlocks.hpp index 3ef4283d7b63..7d1b887aab5b 100644 --- a/include/alpaka/acc/AccCpuTbbBlocks.hpp +++ b/include/alpaka/acc/AccCpuTbbBlocks.hpp @@ -143,7 +143,9 @@ namespace alpaka // m_sharedMemSizeBytes static_cast(AccCpuTbbBlocks::staticAllocBytes()), // m_globalMemSizeBytes - getMemBytes(dev)}; + getMemBytes(dev), + // m_cooperativeLaunch + false}; } }; diff --git a/include/alpaka/acc/AccCpuThreads.hpp b/include/alpaka/acc/AccCpuThreads.hpp index ce8f04a73d99..2bcbdb4f362c 100644 --- a/include/alpaka/acc/AccCpuThreads.hpp +++ b/include/alpaka/acc/AccCpuThreads.hpp @@ -170,7 +170,9 @@ namespace alpaka // m_sharedMemSizeBytes memBytes, // m_globalMemSizeBytes - memBytes}; + memBytes, + // m_cooperativeLaunch + false}; } }; diff --git a/include/alpaka/acc/AccDevProps.hpp b/include/alpaka/acc/AccDevProps.hpp index a199d542cb34..ffeb93964ec9 100644 --- a/include/alpaka/acc/AccDevProps.hpp +++ b/include/alpaka/acc/AccDevProps.hpp @@ -30,5 +30,6 @@ namespace alpaka TIdx m_threadElemCountMax; //!< The maximum number of elements in a threads. 
size_t m_sharedMemSizeBytes; //!< The size of shared memory per block size_t m_globalMemSizeBytes; //!< The size of global memory + bool m_cooperativeLaunch; //!< Whether the device supports launching cooperative kernels }; } // namespace alpaka diff --git a/include/alpaka/acc/AccGenericSycl.hpp b/include/alpaka/acc/AccGenericSycl.hpp index fc8ba1da760f..8c158d717621 100644 --- a/include/alpaka/acc/AccGenericSycl.hpp +++ b/include/alpaka/acc/AccGenericSycl.hpp @@ -153,7 +153,9 @@ namespace alpaka::trait // m_sharedMemSizeBytes device.template get_info(), // m_globalMemSizeBytes - getMemBytes(dev)}; + getMemBytes(dev), + // m_cooperativeLaunch + false}; } }; diff --git a/include/alpaka/acc/AccGpuUniformCudaHipRt.hpp b/include/alpaka/acc/AccGpuUniformCudaHipRt.hpp index 5eeb71a2eae2..a9038dd6916e 100644 --- a/include/alpaka/acc/AccGpuUniformCudaHipRt.hpp +++ b/include/alpaka/acc/AccGpuUniformCudaHipRt.hpp @@ -163,6 +163,12 @@ namespace alpaka TApi::deviceAttributeMaxSharedMemoryPerBlock, dev.getNativeHandle())); + int cooperativeLaunch = {}; + ALPAKA_UNIFORM_CUDA_HIP_RT_CHECK(TApi::deviceGetAttribute( + &cooperativeLaunch, + TApi::deviceAttributeCooperativeLaunch, + dev.getNativeHandle())); + return {// m_multiProcessorCount alpaka::core::clipCast(multiProcessorCount), // m_gridBlockExtentMax @@ -186,7 +192,9 @@ namespace alpaka // m_sharedMemSizeBytes static_cast(sharedMemSizeBytes), // m_globalMemSizeBytes - getMemBytes(dev)}; + getMemBytes(dev), + // m_cooperativeLaunch + static_cast(cooperativeLaunch)}; # else typename TApi::DeviceProp_t properties; @@ -215,7 +223,9 @@ namespace alpaka // m_sharedMemSizeBytes static_cast(properties.sharedMemPerBlock), // m_globalMemSizeBytes - getMemBytes(dev)}; + getMemBytes(dev), + // m_cooperativeLaunch + static_cast(properties.cooperativeLaunch)}; # endif } }; diff --git a/include/alpaka/core/ApiCudaRt.hpp b/include/alpaka/core/ApiCudaRt.hpp index 1710e7594fa5..54591ea67b11 100644 --- a/include/alpaka/core/ApiCudaRt.hpp +++ 
b/include/alpaka/core/ApiCudaRt.hpp @@ -76,6 +76,7 @@ namespace alpaka static constexpr DeviceAttr_t deviceAttributeMaxThreadsPerBlock = ::cudaDevAttrMaxThreadsPerBlock; static constexpr DeviceAttr_t deviceAttributeMultiprocessorCount = ::cudaDevAttrMultiProcessorCount; static constexpr DeviceAttr_t deviceAttributeWarpSize = ::cudaDevAttrWarpSize; + static constexpr DeviceAttr_t deviceAttributeCooperativeLaunch = ::cudaDevAttrCooperativeLaunch; static constexpr Limit_t limitPrintfFifoSize = ::cudaLimitPrintfFifoSize; static constexpr Limit_t limitMallocHeapSize = ::cudaLimitMallocHeapSize; diff --git a/include/alpaka/core/ApiHipRt.hpp b/include/alpaka/core/ApiHipRt.hpp index 0ea8790d63c0..6feec999df4a 100644 --- a/include/alpaka/core/ApiHipRt.hpp +++ b/include/alpaka/core/ApiHipRt.hpp @@ -79,6 +79,7 @@ namespace alpaka static constexpr DeviceAttr_t deviceAttributeMaxThreadsPerBlock = ::hipDeviceAttributeMaxThreadsPerBlock; static constexpr DeviceAttr_t deviceAttributeMultiprocessorCount = ::hipDeviceAttributeMultiprocessorCount; static constexpr DeviceAttr_t deviceAttributeWarpSize = ::hipDeviceAttributeWarpSize; + static constexpr DeviceAttr_t deviceAttributeCooperativeLaunch = ::hipDeviceAttributeCooperativeLaunch; # if HIP_VERSION >= 40'500'000 static constexpr Limit_t limitPrintfFifoSize = ::hipLimitPrintfFifoSize; diff --git a/include/alpaka/kernel/TaskKernelGpuUniformCudaHipRt.hpp b/include/alpaka/kernel/TaskKernelGpuUniformCudaHipRt.hpp index b5b5e20a2137..87a525c1f601 100644 --- a/include/alpaka/kernel/TaskKernelGpuUniformCudaHipRt.hpp +++ b/include/alpaka/kernel/TaskKernelGpuUniformCudaHipRt.hpp @@ -204,6 +204,18 @@ namespace alpaka // TApi::deviceGetLimit(&printfFifoSize, TApi::limitPrintfFifoSize); // std::cout << __func__ << "INFO: printfFifoSize: " << printfFifoSize << std::endl; # endif + +# if ALPAKA_DEBUG >= ALPAKA_DEBUG_MINIMAL + // Reject a cooperative launch up front when the device reports no cooperative-launch support + if constexpr (TCooperative) + { + 
if(!trait::GetAccDevProps::getAccDevProps(getDev(queue)).m_cooperativeLaunch) + { + throw std::runtime_error("This accelerator doesn't support cooperative groups functionality!"); + } + } +# endif + auto const gridBlockExtent = getWorkDiv(task); auto const blockThreadExtent = getWorkDiv(task); auto const threadElemExtent = getWorkDiv(task); @@ -361,6 +373,18 @@ namespace alpaka // TApi::deviceGetLimit(&printfFifoSize, TApi::limitPrintfFifoSize); // std::cout << __func__ << "INFO: printfFifoSize: " << printfFifoSize << std::endl; # endif + +# if ALPAKA_DEBUG >= ALPAKA_DEBUG_MINIMAL + // This checks if the device supports cooperative kernel launch + if constexpr (TCooperative) + { + if(!trait::GetAccDevProps::getAccDevProps(getDev(queue)).m_cooperativeLaunch) + { + throw std::runtime_error("This accelerator doesn't support cooperative groups functionality!"); + } + } +# endif + auto const gridBlockExtent = getWorkDiv(task); auto const blockThreadExtent = getWorkDiv(task); auto const threadElemExtent = getWorkDiv(task); diff --git a/test/unit/acc/src/AccDevPropsTest.cpp b/test/unit/acc/src/AccDevPropsTest.cpp index 650942c4c583..b77d2c7525b6 100644 --- a/test/unit/acc/src/AccDevPropsTest.cpp +++ b/test/unit/acc/src/AccDevPropsTest.cpp @@ -29,7 +29,7 @@ TEMPLATE_LIST_TEST_CASE("getAccDevProps", "[acc]", alpaka::test::TestAccs) TEST_CASE("AccDevProps.aggregate_init", "[acc]") { - auto const props = alpaka::AccDevProps, int>{1, {2}, 3, {4}, 5, {6}, 7, 8, 9}; + auto const props = alpaka::AccDevProps, int>{1, {2}, 3, {4}, 5, {6}, 7, 8, 9, true}; CHECK(props.m_multiProcessorCount == 1); CHECK(props.m_gridBlockExtentMax == alpaka::Vec{2}); @@ -40,6 +40,7 @@ TEST_CASE("AccDevProps.aggregate_init", "[acc]") CHECK(props.m_threadElemCountMax == 7); CHECK(props.m_sharedMemSizeBytes == 8); CHECK(props.m_globalMemSizeBytes == 9); + CHECK(props.m_cooperativeLaunch == true); } #ifdef 
__cpp_designated_initializers @@ -54,7 +55,8 @@ TEST_CASE("AccDevProps.designated_initializers", "[acc]") .m_threadElemExtentMax = {60}, .m_threadElemCountMax = 70, .m_sharedMemSizeBytes = 80, - .m_globalMemSizeBytes = 90}; + .m_globalMemSizeBytes = 90, + .m_cooperativeLaunch = false}; CHECK(props.m_multiProcessorCount == 10); CHECK(props.m_gridBlockExtentMax == alpaka::Vec{20}); @@ -65,5 +67,6 @@ TEST_CASE("AccDevProps.designated_initializers", "[acc]") CHECK(props.m_threadElemCountMax == 70); CHECK(props.m_sharedMemSizeBytes == 80); CHECK(props.m_globalMemSizeBytes == 90); + CHECK(props.m_cooperativeLaunch == false); } #endif