Skip to content

Commit

Permalink
Fix parallelLoopPatterns to SYCL backends
Browse files Browse the repository at this point in the history
  • Loading branch information
fwyzard committed Jan 24, 2025
1 parent 3dc2c62 commit c025e24
Showing 1 changed file with 20 additions and 0 deletions.
20 changes: 20 additions & 0 deletions example/parallelLoopPatterns/src/parallelLoopPatterns.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -171,7 +171,12 @@ void gridStridedLoop(TDev& dev, TQueue& queue, TBufAcc& bufAcc)
// and fixed number of blocks tied to hardware parameters.
// alpaka element layer is not used in this pattern.
auto const threadsPerBlock = maxThreadsPerBlock;
#if defined(__SYCL_ID_QUERIES_FIT_IN_INT__) && __SYCL_ID_QUERIES_FIT_IN_INT__
// The compiler assumes that SYCL query parameters and the maximum grid*block size fit in INT_MAX (2**31-1),
// so the number of blocks is capped at INT_MAX / threadsPerBlock.
auto const blocksPerGrid = std::min<uint64_t>(deviceProperties.m_multiProcessorCount, INT_MAX / threadsPerBlock);
#else
auto const blocksPerGrid = deviceProperties.m_multiProcessorCount;
#endif
auto const elementsPerThread = 1u;
auto workDiv = WorkDiv{blocksPerGrid, threadsPerBlock, elementsPerThread};
std::cout << "\nGrid strided loop processing - fixed number of threads and blocks:\n";
Expand Down Expand Up @@ -246,7 +251,12 @@ void chunkedGridStridedLoop(TDev& dev, TQueue& queue, TBufAcc& bufAcc)
// Fixed sized alpaka element layer defines chunk size.
// With 1 element per thread this pattern is same as grid strided loop.
auto const threadsPerBlock = maxThreadsPerBlock;
#if defined(__SYCL_ID_QUERIES_FIT_IN_INT__) && __SYCL_ID_QUERIES_FIT_IN_INT__
// The compiler assumes that SYCL query parameters and the maximum grid*block size fit in INT_MAX (2**31-1),
// so the number of blocks is capped at INT_MAX / threadsPerBlock.
auto const blocksPerGrid = std::min<uint64_t>(deviceProperties.m_multiProcessorCount, INT_MAX / threadsPerBlock);
#else
auto const blocksPerGrid = deviceProperties.m_multiProcessorCount;
#endif
auto const elementsPerThread = 8u;
auto workDiv = WorkDiv{blocksPerGrid, threadsPerBlock, elementsPerThread};
std::cout << "\nChunked grid strided loop processing - fixed number of threads and blocks:\n";
Expand Down Expand Up @@ -312,7 +322,12 @@ void naiveOpenMPStyle(TDev& dev, TQueue& queue, TBufAcc& bufAcc)
// and number of blocks scales with the problem size.
// alpaka element layer is not used in this pattern.
auto const threadsPerBlock = maxThreadsPerBlock;
#if defined(__SYCL_ID_QUERIES_FIT_IN_INT__) && __SYCL_ID_QUERIES_FIT_IN_INT__
// The compiler assumes that SYCL query parameters and the maximum grid*block size fit in INT_MAX (2**31-1),
// so the number of blocks is capped at INT_MAX / threadsPerBlock.
auto const blocksPerGrid = std::min<uint64_t>(numCores, INT_MAX / threadsPerBlock);
#else
auto const blocksPerGrid = numCores;
#endif
auto const elementsPerThread = 1u;
auto workDiv = WorkDiv{blocksPerGrid, threadsPerBlock, elementsPerThread};
std::cout << "\nNaive OpenMP style processing - each thread processes a single consecutive range of elements:\n";
Expand Down Expand Up @@ -390,7 +405,12 @@ void openMPSimdStyle(TDev& dev, TQueue& queue, TBufAcc& bufAcc)
// Fixed sized alpaka element layer defines SIMD size.
// With 1 element per thread this pattern is same as naive OpenMP style.
auto const threadsPerBlock = maxThreadsPerBlock;
#if defined(__SYCL_ID_QUERIES_FIT_IN_INT__) && __SYCL_ID_QUERIES_FIT_IN_INT__
// The compiler assumes that SYCL query parameters and the maximum grid*block size fit in INT_MAX (2**31-1),
// so the number of blocks is capped at INT_MAX / threadsPerBlock.
auto const blocksPerGrid = std::min<uint64_t>(numCores, INT_MAX / threadsPerBlock);
#else
auto const blocksPerGrid = numCores;
#endif
auto const elementsPerThread = 4u;
auto workDiv = WorkDiv{blocksPerGrid, threadsPerBlock, elementsPerThread};
std::cout << "\nOpenMP SIMD style processing - each thread processes a single consecutive range of elements:\n";
Expand Down

0 comments on commit c025e24

Please sign in to comment.