From fab1f24f6d0222474bca3aeb8c7429038832847a Mon Sep 17 00:00:00 2001
From: Michael Varvarin <55709728+MichaelVarvarin@users.noreply.github.com>
Date: Tue, 12 Nov 2024 10:17:01 +0200
Subject: [PATCH] Rewrite example to use executeForEachAccTag

---
 .../src/helloWorldGridSync.cpp                | 71 ++++++++++++-------
 1 file changed, 47 insertions(+), 24 deletions(-)
diff --git a/example/helloWorldGridSync/src/helloWorldGridSync.cpp b/example/helloWorldGridSync/src/helloWorldGridSync.cpp
index ef2afc48a52..3944da8a876 100644
--- a/example/helloWorldGridSync/src/helloWorldGridSync.cpp
+++ b/example/helloWorldGridSync/src/helloWorldGridSync.cpp
@@ -3,6 +3,7 @@
  */
 
 #include <alpaka/alpaka.hpp>
+#include <alpaka/example/ExecuteForEachAccTag.hpp>
 
 #include <cstdint>
 #include <iostream>
@@ -51,14 +52,22 @@ struct HelloWorldKernel
     }
 };
 
-auto main() -> int
+// In standard projects, you typically do not execute the code with every available accelerator.
+// Instead, a single accelerator is selected once from the active accelerators and the kernels are executed with the
+// selected accelerator only. If you use the example as the starting point for your project, you can rename the
+// example() function to main() and move the accelerator tag to the function body.
+template<typename TAccTag>
+auto example(TAccTag const&) -> int
 {
+    // Define the accelerator
+    // For simplicity, this example always uses one-dimensional indexing and the index type std::size_t.
+    using Acc = alpaka::TagToAcc<TAccTag, alpaka::DimInt<1>, std::size_t>;
+    std::cout << "Using alpaka accelerator: " << alpaka::getAccName<Acc>() << std::endl;
+
     // Define dimensionality and type of indices to be used in kernels
     using Dim = alpaka::DimInt<1>;
-    using Idx = uint32_t;
+    using Idx = size_t;
 
-    // Define alpaka accelerator type, which corresponds to the underlying programming model
-    using Acc = alpaka::AccGpuSyclIntel<Dim, Idx>;
 
     // Select the first device available on a system, for the chosen accelerator
     auto const platformAcc = alpaka::Platform<Acc>{};
@@ -71,45 +80,59 @@ auto main() -> int
 
     // Define kernel execution configuration of blocks,
     // threads per block, and elements per thread.
-    Idx blocksPerGrid = 10;
+    Idx blocksPerGrid = 1000;
     Idx threadsPerBlock = 1;
-    Idx threadsPerBlock2 = 1024;
     Idx elementsPerThread = 1;
 
     using WorkDiv = alpaka::WorkDivMembers<Dim, Idx>;
-    auto workDiv = WorkDiv{blocksPerGrid, threadsPerBlock, elementsPerThread};
-    auto workDiv2 = WorkDiv{blocksPerGrid, threadsPerBlock2, elementsPerThread};
 
     // Allocate memory on the device.
     alpaka::Vec<Dim, Idx> bufferExtent{blocksPerGrid * threadsPerBlock};
     auto deviceMemory = alpaka::allocBuf<uint32_t, Idx>(devAcc, bufferExtent);
 
-    alpaka::Vec<Dim, Idx> bufferExtent2{blocksPerGrid * threadsPerBlock2};
-    auto deviceMemory2 = alpaka::allocBuf<uint32_t, Idx>(devAcc, bufferExtent2);
+
     // Instantiate the kernel object.
     HelloWorldKernel helloWorldKernel;
 
-    // int maxBlocks = alpaka::getMaxActiveBlocks<Acc>(
-    //     devAcc,
-    //    helloWorldKernel,
-    //     threadsPerBlock,
-    //     elementsPerThread,
-    //     getPtrNative(deviceMemory));
-    // std::cout << "Maximum blocks for the kernel: " << maxBlocks << std::endl;
+    // Query the maximum number of active blocks for this kernel on the device
+    int maxBlocks = alpaka::getMaxActiveBlocks<Acc>(
+        devAcc,
+        helloWorldKernel,
+        threadsPerBlock,
+        elementsPerThread,
+        getPtrNative(deviceMemory));
+    std::cout << "Maximum blocks for the kernel: " << maxBlocks << std::endl;
+
+    // Create a work division that does not exceed the queried block limit
+    blocksPerGrid = std::min(static_cast<Idx>(maxBlocks), blocksPerGrid);
+    auto workDiv2 = WorkDiv{blocksPerGrid, threadsPerBlock, elementsPerThread};
+    alpaka::Vec<Dim, Idx> bufferExtent2{blocksPerGrid * threadsPerBlock};
+    auto deviceMemory2 = alpaka::allocBuf<uint32_t, Idx>(devAcc, bufferExtent2);
 
     // Create a task to run the kernel.
     // Note the cooperative kernel specification.
     // Only cooperative kernels can perform grid synchronization.
-    auto taskRunKernel = alpaka::createTaskKernel<Acc>(workDiv, helloWorldKernel, getPtrNative(deviceMemory));
-
-    auto taskRunKernel2
+    auto taskRunKernel
         = alpaka::createTaskCooperativeKernel<Acc>(workDiv2, helloWorldKernel, getPtrNative(deviceMemory2));
 
     // Enqueue the kernel execution task..
     alpaka::enqueue(queue, taskRunKernel);
-    alpaka::wait(queue);
-    printf("launching kernel 2\n");
-    alpaka::enqueue(queue, taskRunKernel2);
 
-    return 0;
+    return EXIT_SUCCESS;
+}
+
+auto main() -> int
+{
+    // Execute the example once for each enabled accelerator.
+    // If you would like to execute it for a single accelerator only, you can use the following code.
+    //  \code{.cpp}
+    //  auto tag = alpaka::TagCpuSerial{};
+    //  return example(tag);
+    //  \endcode
+    //
+    // valid tags:
+    //   TagCpuSerial, TagGpuHipRt, TagGpuCudaRt, TagCpuOmp2Blocks, TagCpuTbbBlocks,
+    //   TagCpuOmp2Threads, TagCpuSycl, TagCpuThreads,
+    //   TagFpgaSyclIntel, TagGenericSycl, TagGpuSyclIntel
+    return alpaka::executeForEachAccTag([=](auto const& tag) { return example(tag); });
 }