Skip to content

Commit

Permalink
Add cooperative groups and grid sync functionality to SYCL
Browse files Browse the repository at this point in the history
  • Loading branch information
MichaelVarvarin committed Nov 12, 2024
1 parent 7cf652e commit aaed855
Show file tree
Hide file tree
Showing 14 changed files with 277 additions and 69 deletions.
2 changes: 1 addition & 1 deletion cmake/alpakaCommon.cmake
Original file line number Diff line number Diff line change
Expand Up @@ -735,7 +735,7 @@ if(alpaka_ACC_SYCL_ENABLE)

#-----------------------------------------------------------------------------------------------------------------
# Generic SYCL options
alpaka_set_compiler_options(DEVICE target alpaka "-fsycl-unnamed-lambda") # Compiler default but made explicit here
alpaka_set_compiler_options(DEVICE target alpaka "-fsycl-unnamed-lambda") # Compiler default but made explicit here

if(alpaka_RELOCATABLE_DEVICE_CODE STREQUAL ON)
alpaka_set_compiler_options(DEVICE target alpaka "-fsycl-rdc")
Expand Down
49 changes: 31 additions & 18 deletions example/helloWorldGridSync/src/helloWorldGridSync.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -20,7 +20,8 @@ struct HelloWorldKernel
uint32_t gridThreadIdx = alpaka::getIdx<alpaka::Grid, alpaka::Threads>(acc)[0];
uint32_t gridThreadExtent = alpaka::getWorkDiv<alpaka::Grid, alpaka::Threads>(acc)[0];

printf("Hello, World from alpaka thread %u!\n", gridThreadIdx);
if(gridThreadIdx == 0)
printf("Hello, World from alpaka thread %u!\n", gridThreadIdx);

// Write the index of the thread to array.
data[gridThreadIdx] = gridThreadIdx;
Expand All @@ -38,13 +39,15 @@ struct HelloWorldKernel
uint32_t expectedSum = gridThreadExtent - 1;

// Print the result and signify an error if the grid synchronization fails.
printf(
"After grid sync, this thread is %u, thread on the opposite side is %u. Their sum is %u, expected: %u.%s",
gridThreadIdx,
gridThreadIdxOpposite,
sum,
expectedSum,
sum == expectedSum ? "\n" : " ERROR: the sum is incorrect.\n");
if(sum != expectedSum)
printf(
"After grid sync, this thread is %u, thread on the opposite side is %u. Their sum is %u, expected: "
"%u.%s",
gridThreadIdx,
gridThreadIdxOpposite,
sum,
expectedSum,
sum == expectedSum ? "\n" : " ERROR: the sum is incorrect.\n");
}
};

Expand All @@ -55,7 +58,7 @@ auto main() -> int
using Idx = uint32_t;

// Define alpaka accelerator type, which corresponds to the underlying programming model
using Acc = alpaka::AccGpuCudaRt<Dim, Idx>;
using Acc = alpaka::AccGpuSyclIntel<Dim, Idx>;

// Select the first device available on a system, for the chosen accelerator
auto const platformAcc = alpaka::Platform<Acc>{};
Expand All @@ -70,33 +73,43 @@ auto main() -> int
// threads per block, and elements per thread.
Idx blocksPerGrid = 10;
Idx threadsPerBlock = 1;
Idx threadsPerBlock2 = 1024;
Idx elementsPerThread = 1;

using WorkDiv = alpaka::WorkDivMembers<Dim, Idx>;
auto workDiv = WorkDiv{blocksPerGrid, threadsPerBlock, elementsPerThread};
auto workDiv2 = WorkDiv{blocksPerGrid, threadsPerBlock2, elementsPerThread};

// Allocate memory on the device.
alpaka::Vec<Dim, Idx> bufferExtent{blocksPerGrid * threadsPerBlock};
auto deviceMemory = alpaka::allocBuf<uint32_t, Idx>(devAcc, bufferExtent);

alpaka::Vec<Dim, Idx> bufferExtent2{blocksPerGrid * threadsPerBlock2};
auto deviceMemory2 = alpaka::allocBuf<uint32_t, Idx>(devAcc, bufferExtent2);
// Instantiate the kernel object.
HelloWorldKernel helloWorldKernel;

int maxBlocks = alpaka::getMaxActiveBlocks<Acc>(
devAcc,
helloWorldKernel,
threadsPerBlock,
elementsPerThread,
getPtrNative(deviceMemory));
std::cout << "Maximum blocks for the kernel: " << maxBlocks << std::endl;
// int maxBlocks = alpaka::getMaxActiveBlocks<Acc>(
// devAcc,
// helloWorldKernel,
// threadsPerBlock,
// elementsPerThread,
// getPtrNative(deviceMemory));
// std::cout << "Maximum blocks for the kernel: " << maxBlocks << std::endl;

// Create a task to run the kernel.
// Note the cooperative kernel specification.
// Only cooperative kernels can perform grid synchronization.
auto taskRunKernel
= alpaka::createTaskCooperativeKernel<Acc>(workDiv, helloWorldKernel, getPtrNative(deviceMemory));
auto taskRunKernel = alpaka::createTaskKernel<Acc>(workDiv, helloWorldKernel, getPtrNative(deviceMemory));

auto taskRunKernel2
= alpaka::createTaskCooperativeKernel<Acc>(workDiv2, helloWorldKernel, getPtrNative(deviceMemory2));

// Enqueue the kernel execution task..
alpaka::enqueue(queue, taskRunKernel);
alpaka::wait(queue);
printf("launching kernel 2\n");
alpaka::enqueue(queue, taskRunKernel2);

return 0;
}
44 changes: 39 additions & 5 deletions include/alpaka/acc/AccGenericSycl.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -11,6 +11,7 @@
#include "alpaka/block/shared/st/BlockSharedMemStGenericSycl.hpp"
#include "alpaka/block/sync/BlockSyncGenericSycl.hpp"
#include "alpaka/dev/DevGenericSycl.hpp"
#include "alpaka/grid/GridSyncGenericSycl.hpp"
#include "alpaka/idx/bt/IdxBtGenericSycl.hpp"
#include "alpaka/idx/gb/IdxGbGenericSycl.hpp"
#include "alpaka/intrinsic/IntrinsicGenericSycl.hpp"
Expand Down Expand Up @@ -46,7 +47,14 @@

namespace alpaka
{
template<concepts::Tag TTag, typename TAcc, typename TDim, typename TIdx, typename TKernelFnObj, bool TCooperative, typename... TArgs>
template<
concepts::Tag TTag,
typename TAcc,
typename TDim,
typename TIdx,
typename TKernelFnObj,
bool TCooperative,
typename... TArgs>
class TaskKernelGenericSycl;

//! The SYCL accelerator.
Expand All @@ -62,6 +70,7 @@ namespace alpaka
, public BlockSharedMemDynGenericSycl
, public BlockSharedMemStGenericSycl
, public BlockSyncGenericSycl<TDim>
, public GridSyncGenericSycl<TDim>
, public IntrinsicGenericSycl
, public MemFenceGenericSycl
# ifdef ALPAKA_DISABLE_VENDOR_RNG
Expand Down Expand Up @@ -91,6 +100,7 @@ namespace alpaka
, BlockSharedMemDynGenericSycl{dyn_shared_acc}
, BlockSharedMemStGenericSycl{st_shared_acc}
, BlockSyncGenericSycl<TDim>{work_item}
, GridSyncGenericSycl<TDim>{work_item}
# ifndef ALPAKA_DISABLE_VENDOR_RNG
, rand::RandGenericSycl<TDim>{work_item}
# endif
Expand Down Expand Up @@ -197,10 +207,34 @@ namespace alpaka::trait
{
static auto createTaskKernel(TWorkDiv const& workDiv, TKernelFnObj const& kernelFnObj, TArgs&&... args)
{
return TaskKernelGenericSycl<TTag, AccGenericSycl<TTag, TDim, TIdx>, TDim, TIdx, TKernelFnObj, TArgs...>{
workDiv,
kernelFnObj,
std::forward<TArgs>(args)...};
return TaskKernelGenericSycl<
TTag,
AccGenericSycl<TTag, TDim, TIdx>,
TDim,
TIdx,
TKernelFnObj,
false,
TArgs...>{workDiv, kernelFnObj, std::forward<TArgs>(args)...};
}
};

//! The SYCL accelerator execution task type trait specialization.
template<typename TTag, typename TDim, typename TIdx, typename TWorkDiv, typename TKernelFnObj, typename... TArgs>
struct CreateTaskCooperativeKernel<AccGenericSycl<TTag, TDim, TIdx>, TWorkDiv, TKernelFnObj, TArgs...>
{
static auto createTaskCooperativeKernel(
TWorkDiv const& workDiv,
TKernelFnObj const& kernelFnObj,
TArgs&&... args)
{
return TaskKernelGenericSycl<
TTag,
AccGenericSycl<TTag, TDim, TIdx>,
TDim,
TIdx,
TKernelFnObj,
true,
TArgs...>{workDiv, kernelFnObj, std::forward<TArgs>(args)...};
}
};

Expand Down
45 changes: 45 additions & 0 deletions include/alpaka/grid/GridSyncGenericSycl.hpp
Original file line number Diff line number Diff line change
@@ -0,0 +1,45 @@
/* Copyright 2024 Mykhailo Varvarin
* SPDX-License-Identifier: MPL-2.0
*/

#pragma once

#include "alpaka/core/Concepts.hpp"
#include "alpaka/grid/Traits.hpp"

#ifdef ALPAKA_ACC_SYCL_ENABLED

# include <sycl/ext/oneapi/experimental/root_group.hpp>
# include <sycl/sycl.hpp>

namespace alpaka
{
//! The grid synchronization for SYCL.
template<typename TDim>
class GridSyncGenericSycl : public concepts::Implements<ConceptGridSync, GridSyncGenericSycl<TDim>>
{
public:
GridSyncGenericSycl(sycl::nd_item<TDim::value> work_item) : my_item{work_item}
{
}

sycl::nd_item<TDim::value> my_item;
};

namespace trait
{
template<typename TDim>
struct SyncGridThreads<GridSyncGenericSycl<TDim>>
{
ALPAKA_NO_HOST_ACC_WARNING
ALPAKA_FN_ACC static auto syncGridThreads(GridSyncGenericSycl<TDim> const& gridSync) -> void
{
sycl::group_barrier(gridSync.my_item.ext_oneapi_get_root_group());
}
};

} // namespace trait

} // namespace alpaka

#endif
4 changes: 2 additions & 2 deletions include/alpaka/kernel/TaskKernelCpuSycl.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -11,9 +11,9 @@

namespace alpaka
{
template<typename TDim, typename TIdx, typename TKernelFnObj, typename... TArgs>
template<typename TDim, typename TIdx, typename TKernelFnObj, bool TCooperative, typename... TArgs>
using TaskKernelCpuSycl
= TaskKernelGenericSycl<TagCpuSycl, AccCpuSycl<TDim, TIdx>, TDim, TIdx, TKernelFnObj, TArgs...>;
= TaskKernelGenericSycl<TagCpuSycl, AccCpuSycl<TDim, TIdx>, TDim, TIdx, TKernelFnObj, TCooperative, TArgs...>;

} // namespace alpaka

Expand Down
12 changes: 9 additions & 3 deletions include/alpaka/kernel/TaskKernelFpgaSyclIntel.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -11,9 +11,15 @@

namespace alpaka
{
template<typename TDim, typename TIdx, typename TKernelFnObj, typename... TArgs>
using TaskKernelFpgaSyclIntel
= TaskKernelGenericSycl<TagFpgaSyclIntel, AccFpgaSyclIntel<TDim, TIdx>, TDim, TIdx, TKernelFnObj, TArgs...>;
template<typename TDim, typename TIdx, typename TKernelFnObj, bool TCooperative, typename... TArgs>
using TaskKernelFpgaSyclIntel = TaskKernelGenericSycl<
TagFpgaSyclIntel,
AccFpgaSyclIntel<TDim, TIdx>,
TDim,
TIdx,
TKernelFnObj,
TCooperative,
TArgs...>;

} // namespace alpaka

Expand Down
Loading

0 comments on commit aaed855

Please sign in to comment.