Skip to content

Commit

Permalink
Add cooperative kernel launch and grid sync support for HIP
Browse files Browse the repository at this point in the history
  • Loading branch information
MichaelVarvarin committed Jul 26, 2024
1 parent 3d89335 commit bb5ccaa
Show file tree
Hide file tree
Showing 6 changed files with 49 additions and 14 deletions.
4 changes: 2 additions & 2 deletions include/alpaka/acc/AccGpuUniformCudaHipRt.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -11,7 +11,7 @@
#include "alpaka/block/shared/st/BlockSharedMemStUniformCudaHipBuiltIn.hpp"
#include "alpaka/block/sync/BlockSyncUniformCudaHipBuiltIn.hpp"
#include "alpaka/core/DemangleTypeNames.hpp"
#include "alpaka/grid/GridSyncGpuCuda.hpp"
#include "alpaka/grid/GridSyncGpuCudaHip.hpp"
#include "alpaka/idx/bt/IdxBtUniformCudaHipBuiltIn.hpp"
#include "alpaka/idx/gb/IdxGbUniformCudaHipBuiltIn.hpp"
#include "alpaka/intrinsic/IntrinsicUniformCudaHipBuiltIn.hpp"
Expand Down Expand Up @@ -60,7 +60,7 @@ namespace alpaka
, public BlockSharedMemDynUniformCudaHipBuiltIn
, public BlockSharedMemStUniformCudaHipBuiltIn
, public BlockSyncUniformCudaHipBuiltIn
, public GridSyncCudaBuiltIn
, public GridSyncCudaHipBuiltIn
, public IntrinsicUniformCudaHipBuiltIn
, public MemFenceUniformCudaHipBuiltIn
# ifdef ALPAKA_DISABLE_VENDOR_RNG
Expand Down
2 changes: 1 addition & 1 deletion include/alpaka/alpaka.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -34,7 +34,7 @@
#include "alpaka/atomic/Op.hpp"
#include "alpaka/atomic/Traits.hpp"
// grid
#include "alpaka/grid/GridSyncGpuCuda.hpp"
#include "alpaka/grid/GridSyncGpuCudaHip.hpp"
#include "alpaka/grid/Traits.hpp"
// block
// shared
Expand Down
11 changes: 11 additions & 0 deletions include/alpaka/core/ApiCudaRt.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -246,6 +246,11 @@ namespace alpaka
return ::cudaHostUnregister(ptr);
}

//! Launches a kernel cooperatively: all blocks of the grid must be resident on the
//! device simultaneously, which is what permits grid-wide synchronization inside
//! the kernel. Thin forwarder to the CUDA runtime; returns its error code unchanged.
static inline Error_t launchCooperativeKernel(
    const void* func,
    dim3 gridDim,
    dim3 blockDim,
    void** args,
    size_t sharedMem,
    Stream_t stream)
{
    return ::cudaLaunchCooperativeKernel(func, gridDim, blockDim, args, sharedMem, stream);
}

static inline Error_t launchHostFunc(Stream_t stream, HostFn_t fn, void* userData)
{
# if CUDART_VERSION >= 10000
Expand Down Expand Up @@ -388,6 +393,12 @@ namespace alpaka
{
return ::make_cudaExtent(w, h, d);
}

//! Queries the maximum number of blocks of \p func that can be resident per
//! multiprocessor for the given block size and dynamic shared memory usage.
//! Used to validate that a cooperative launch's grid fits on the device.
//! Thin forwarder to the CUDA runtime; returns its error code unchanged.
template<class T>
static inline Error_t occupancyMaxActiveBlocksPerMultiprocessor(int* numBlocks, T func, int blockSize, size_t dynamicSMemSize)
{
    return ::cudaOccupancyMaxActiveBlocksPerMultiprocessor(numBlocks, func, blockSize, dynamicSMemSize);
}
};

} // namespace alpaka
Expand Down
11 changes: 11 additions & 0 deletions include/alpaka/core/ApiHipRt.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -271,6 +271,11 @@ namespace alpaka
return ::hipHostUnregister(ptr);
}

//! Launches a kernel cooperatively via the HIP runtime: all blocks of the grid
//! must be resident on the device simultaneously, enabling grid-wide sync.
//! NOTE(review): hipLaunchCooperativeKernel declares sharedMemBytes as
//! unsigned int (unlike CUDA's size_t), so the size_t argument is narrowed
//! explicitly here to avoid implicit-conversion warnings in warning-as-error builds.
static inline Error_t launchCooperativeKernel(
    const void* func,
    dim3 gridDim,
    dim3 blockDim,
    void** args,
    size_t sharedMem,
    Stream_t stream)
{
    return ::hipLaunchCooperativeKernel(func, gridDim, blockDim, args, static_cast<unsigned int>(sharedMem), stream);
}

static inline Error_t launchHostFunc(Stream_t stream, HostFn_t fn, void* userData)
{
// hipLaunchHostFunc is implemented only in ROCm 5.4.0 and later.
Expand Down Expand Up @@ -427,6 +432,12 @@ namespace alpaka
{
return ::make_hipExtent(w, h, d);
}

//! Queries the maximum number of blocks of \p func that can be resident per
//! compute unit for the given block size and dynamic shared memory usage.
//! Used to validate that a cooperative launch's grid fits on the device.
//! Thin forwarder to the HIP runtime; returns its error code unchanged.
template<class T>
static inline Error_t occupancyMaxActiveBlocksPerMultiprocessor(int* numBlocks, T func, int blockSize, size_t dynamicSMemSize)
{
    return ::hipOccupancyMaxActiveBlocksPerMultiprocessor(numBlocks, func, blockSize, dynamicSMemSize);
}
};

} // namespace alpaka
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -8,29 +8,42 @@
#include "alpaka/core/BoostPredef.hpp"
#include "alpaka/core/Concepts.hpp"

#if defined(ALPAKA_ACC_GPU_CUDA_ENABLED)
#include <cooperative_groups.h>
#if defined(ALPAKA_ACC_GPU_CUDA_ENABLED) || defined(ALPAKA_ACC_GPU_HIP_ENABLED)

# if defined(ALPAKA_ACC_GPU_CUDA_ENABLED)
# include <cooperative_groups.h>
# endif

# if defined(ALPAKA_ACC_GPU_HIP_ENABLED)
# include <hip/hip_cooperative_groups.h>
# endif



namespace alpaka
{
//! The GPU CUDA grid synchronization.
class GridSyncCudaBuiltIn
: public concepts::Implements<ConceptGridSync, GridSyncCudaBuiltIn>
//! The GPU CUDA/HIP grid synchronization.
class GridSyncCudaHipBuiltIn
: public concepts::Implements<ConceptGridSync, GridSyncCudaHipBuiltIn>
{
};

# if !defined(ALPAKA_HOST_ONLY)

# if defined(ALPAKA_ACC_GPU_CUDA_ENABLED) && !BOOST_LANG_CUDA
# error If ALPAKA_ACC_GPU_CUDA_ENABLED is set, the compiler has to support CUDA!
# endif

# if defined(ALPAKA_ACC_GPU_HIP_ENABLED) && !BOOST_LANG_HIP
# error If ALPAKA_ACC_GPU_HIP_ENABLED is set, the compiler has to support HIP!
# endif

namespace trait
{
template<>
struct SyncGridThreads<GridSyncCudaBuiltIn>
struct SyncGridThreads<GridSyncCudaHipBuiltIn>
{
__device__ static auto syncGridThreads(GridSyncCudaBuiltIn const& /*gridSync*/) -> void
__device__ static auto syncGridThreads(GridSyncCudaHipBuiltIn const& /*gridSync*/) -> void
{
cooperative_groups::this_grid().sync();
}
Expand Down
8 changes: 4 additions & 4 deletions include/alpaka/kernel/TaskKernelGpuUniformCudaHipRt.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -250,7 +250,7 @@ namespace alpaka
if constexpr (TCooperative) {
// This checks if requested number of blocks is compliant with the maxima of the accelerator.
int numBlocksPerSm = 0;
cudaOccupancyMaxActiveBlocksPerMultiprocessor(
TApi::occupancyMaxActiveBlocksPerMultiprocessor(
&numBlocksPerSm,
kernelName,
blockThreadExtent.prod(),
Expand Down Expand Up @@ -308,7 +308,7 @@ namespace alpaka
{
void const* kernelArgs[] = {&threadElemExtent, &task.m_kernelFnObj, &args...};

cudaLaunchCooperativeKernel(
TApi::launchCooperativeKernel(
reinterpret_cast<void*>(kernelName),
gridDim,
blockDim,
Expand Down Expand Up @@ -409,7 +409,7 @@ namespace alpaka
if constexpr (TCooperative) {
// This checks if requested number of blocks is compliant with the maxima of the accelerator.
int numBlocksPerSm = 0;
cudaOccupancyMaxActiveBlocksPerMultiprocessor(
TApi::occupancyMaxActiveBlocksPerMultiprocessor(
&numBlocksPerSm,
kernelName,
blockThreadExtent.prod(),
Expand Down Expand Up @@ -463,7 +463,7 @@ namespace alpaka
{
void const* kernelArgs[] = {&threadElemExtent, &task.m_kernelFnObj, &args...};

cudaLaunchCooperativeKernel(
TApi::launchCooperativeKernel(
reinterpret_cast<void*>(kernelName),
gridDim,
blockDim,
Expand Down

0 comments on commit bb5ccaa

Please sign in to comment.