From 98aac95d890cb4dd158833576d67e5576a92042d Mon Sep 17 00:00:00 2001
From: mehmet yusufoglu <mehmetyusufoglu01@gmail.com>
Date: Thu, 7 Nov 2024 11:12:22 +0100
Subject: [PATCH] make kernels depend on each other, use original variables
 and access order

---
 .../babelstream/src/babelStreamCommon.hpp     |  7 +--
 .../babelstream/src/babelStreamMainTest.cpp   | 53 +++++++++++--------
 2 files changed, 34 insertions(+), 26 deletions(-)
diff --git a/benchmarks/babelstream/src/babelStreamCommon.hpp b/benchmarks/babelstream/src/babelStreamCommon.hpp
index a22f7d032d3..8885b73b80c 100644
--- a/benchmarks/babelstream/src/babelStreamCommon.hpp
+++ b/benchmarks/babelstream/src/babelStreamCommon.hpp
@@ -28,10 +28,11 @@ namespace
     [[maybe_unused]] constexpr auto minArrSize = 1024 * 128;
 
     // Scalar value for Mul and Triad kernel parameters.
-    [[maybe_unused]] constexpr auto scalarVal = 2.0f;
+    [[maybe_unused]] constexpr double scalarVal = 0.4;
 
     // Block thread extent for DotKernel test work division parameters.
     [[maybe_unused]] constexpr auto blockThreadExtentMain = 1024;
+    [[maybe_unused]] constexpr auto dotGridBlockExtent = 256;
 
     // Number of runs for each kernel, can be changed by command line arguments.
     // At least 100 runs are recommended for good benchmarking.
@@ -39,7 +40,7 @@ namespace
     [[maybe_unused]] auto numberOfRuns = 2;
 
     // Data input value for babelstream.
-    [[maybe_unused]] constexpr auto valA = 1.0f;
+    [[maybe_unused]] constexpr double valA = 0.1;
 
     //! handleCustomArguments Gets custom cmd line arguments from the all arguments.
     //! Namely gets --array-size=1234 and --number-runs=1234 and keeps the others which are
@@ -111,7 +112,7 @@ namespace
     {
         if constexpr(std::is_floating_point_v<T>)
         {
-            return std::fabs(a - b) < std::numeric_limits<T>::epsilon() * static_cast<T>(100.0);
+            return std::fabs(a - b) < (std::numeric_limits<T>::epsilon() * static_cast<T>(100.0));
         }
         else if constexpr(std::is_integral_v<T>)
         {
diff --git a/benchmarks/babelstream/src/babelStreamMainTest.cpp b/benchmarks/babelstream/src/babelStreamMainTest.cpp
index 79ec6216508..2abd10eafc6 100644
--- a/benchmarks/babelstream/src/babelStreamMainTest.cpp
+++ b/benchmarks/babelstream/src/babelStreamMainTest.cpp
@@ -22,7 +22,7 @@
  * Can be run with custom arguments as well as catch2 arguments
  * Run with Custom arguments:
  * ./babelstream --array-size=33554432 --number-runs=100
- * Runt with default array size and num runs:
+ * Run with default array size and num runs:
  * ./babelstream
- * Run with Catch2 arguments and defaul arrary size and num runs:
+ * Run with Catch2 arguments and default array size and num runs:
  * ./babelstream --success
@@ -76,12 +76,12 @@ struct CopyKernel
     //! \tparam T The data type
     //! \param acc The accelerator to be executed on.
     //! \param a Pointer for vector a
-    //! \param b Pointer for vector b
+    //! \param c Pointer for vector c
     template<typename TAcc, typename T>
-    ALPAKA_FN_ACC void operator()(TAcc const& acc, T const* a, T* b) const
+    ALPAKA_FN_ACC void operator()(TAcc const& acc, T const* a, T* c) const
     {
         auto const [index] = alpaka::getIdx<alpaka::Grid, alpaka::Threads>(acc);
-        b[index] = a[index];
+        c[index] = a[index];
     }
 };
 
@@ -92,14 +92,14 @@ struct MultKernel
     //! \tparam TAcc The accelerator environment to be executed on.
     //! \tparam T The data type
     //! \param acc The accelerator to be executed on.
-    //! \param a Pointer for vector a
+    //! \param c Pointer for vector c
     //! \param b Pointer for result vector b
     template<typename TAcc, typename T>
-    ALPAKA_FN_ACC void operator()(TAcc const& acc, T* const a, T* b) const
+    ALPAKA_FN_ACC void operator()(TAcc const& acc, T* const c, T* b) const
     {
         const T scalar = static_cast<T>(scalarVal);
         auto const [i] = alpaka::getIdx<alpaka::Grid, alpaka::Threads>(acc);
-        b[i] = scalar * a[i];
+        b[i] = scalar * c[i];
     }
 };
 
@@ -132,11 +132,11 @@ struct TriadKernel
     //! \param b Pointer for vector b
-    //! \param c Pointer for result vector c
+    //! \param c Pointer for vector c
     template<typename TAcc, typename T>
-    ALPAKA_FN_ACC void operator()(TAcc const& acc, T const* a, T const* b, T* c) const
+    ALPAKA_FN_ACC void operator()(TAcc const& acc, T* a, T const* b, T const* c) const
     {
         const T scalar = static_cast<T>(scalarVal);
         auto const [i] = alpaka::getIdx<alpaka::Grid, alpaka::Threads>(acc);
-        c[i] = a[i] + scalar * b[i];
+        a[i] = b[i] + scalar * c[i];
     }
 };
 
@@ -151,6 +151,7 @@ struct DotKernel
     //! \param a Pointer for vector a
     //! \param b Pointer for vector b
     //! \param sum Pointer for result vector consisting sums for each block
+    //! \param arraySize the size of the array
     template<typename TAcc, typename T>
     ALPAKA_FN_ACC void operator()(TAcc const& acc, T const* a, T const* b, T* sum, alpaka::Idx<TAcc> arraySize) const
     {
@@ -316,23 +317,23 @@ void testKernels()
         },
         "InitKernel");
 
-    // Test the copy-kernel. Copy A one by one to B.
+    // Test the copy-kernel. Copy A one by one to C.
     measureKernelExec(
-        [&]() { alpaka::exec<Acc>(queue, workDivCopy, CopyKernel(), bufAccInputAPtr, bufAccInputBPtr); },
+        [&]() { alpaka::exec<Acc>(queue, workDivCopy, CopyKernel(), bufAccInputAPtr, bufAccOutputCPtr); },
         "CopyKernel");
 
-    // Test the scaling-kernel. Calculate B=scalar*A.
+    // Test the scaling-kernel. Calculate B=scalar*C. Where C = A.
     measureKernelExec(
-        [&]() { alpaka::exec<Acc>(queue, workDivMult, MultKernel(), bufAccInputAPtr, bufAccInputBPtr); },
+        [&]() { alpaka::exec<Acc>(queue, workDivMult, MultKernel(), bufAccOutputCPtr, bufAccInputBPtr); },
         "MultKernel");
 
-    // Test the addition-kernel. Calculate C=A+B. Where B=scalar*A.
+    // Test the addition-kernel. Calculate C=A+B, where B=scalar*A (C was a copy of A when B was computed).
     measureKernelExec(
         [&]()
         { alpaka::exec<Acc>(queue, workDivAdd, AddKernel(), bufAccInputAPtr, bufAccInputBPtr, bufAccOutputCPtr); },
         "AddKernel");
 
-    // Test the Triad-kernel. Calculate C=A+scalar*B where B=scalar*A.
+    // Test the Triad-kernel. Calculate A=B+scalar*C, where B=scalar*A and C=A+B use the initial A.
     measureKernelExec(
         [&]()
         { alpaka::exec<Acc>(queue, workDivTriad, TriadKernel(), bufAccInputAPtr, bufAccInputBPtr, bufAccOutputCPtr); },
@@ -350,19 +351,21 @@ void testKernels()
     DataType initVal{static_cast<DataType>(0.0)};
     DataType sumErrC{initVal}, sumErrB{initVal}, sumErrA{initVal};
 
-    auto const expectedC = static_cast<DataType>(valA + scalarVal * scalarVal * valA);
+
     auto const expectedB = static_cast<DataType>(scalarVal * valA);
-    auto const expectedA = static_cast<DataType>(valA);
+    auto const expectedC = static_cast<DataType>(static_cast<DataType>(valA) + expectedB);
+    auto const expectedA = static_cast<DataType>(expectedB + static_cast<DataType>(scalarVal) * expectedC);
 
     // sum of the errors for each array
     for(Idx i = 0; i < arraySize; ++i)
     {
-        sumErrC += bufHostOutputC[static_cast<Idx>(i)] - expectedC;
-        sumErrB += bufHostOutputB[static_cast<Idx>(i)] - expectedB;
-        sumErrA += bufHostOutputA[static_cast<Idx>(i)] - expectedA;
+        sumErrC += std::fabs(bufHostOutputC[static_cast<Idx>(i)] - expectedC);
+        sumErrB += std::fabs(bufHostOutputB[static_cast<Idx>(i)] - expectedB);
+        sumErrA += std::fabs(bufHostOutputA[static_cast<Idx>(i)] - expectedA);
     }
 
     // Normalize and compare sum of the errors
+    // Use a different equality check if floating point errors exceed precision of FuzzyEqual function
     REQUIRE(FuzzyEqual(sumErrC / static_cast<DataType>(arraySize) / expectedC, static_cast<DataType>(0.0)));
     REQUIRE(FuzzyEqual(sumErrB / static_cast<DataType>(arraySize) / expectedB, static_cast<DataType>(0.0)));
     REQUIRE(FuzzyEqual(sumErrA / static_cast<DataType>(arraySize) / expectedA, static_cast<DataType>(0.0)));
@@ -375,7 +378,7 @@ void testKernels()
         // Threads per block for Dot kernel
         constexpr Idx blockThreadExtent = blockThreadExtentMain;
         // Blocks per grid for Dot kernel
-        constexpr Idx gridBlockExtent = static_cast<Idx>(256);
+        const Idx gridBlockExtent = static_cast<Idx>(dotGridBlockExtent);
         // Vector of sums of each block
         auto bufAccSumPerBlock = alpaka::allocBuf<DataType, Idx>(devAcc, gridBlockExtent);
         auto bufHostSumPerBlock = alpaka::allocBuf<DataType, Idx>(devHost, gridBlockExtent);
@@ -401,8 +404,12 @@ void testKernels()
 
         DataType const* sumPtr = std::data(bufHostSumPerBlock);
         auto const result = std::reduce(sumPtr, sumPtr + gridBlockExtent, DataType{0});
-        // Since vector values are 1, dot product should be identical to arraySize
-        REQUIRE(FuzzyEqual(static_cast<DataType>(result), static_cast<DataType>(arraySize * 2)));
+
+        auto const expectedSum = static_cast<DataType>(arraySize) * expectedA * expectedB;
+        // The dot product should equal arraySize * expectedA * expectedB.
+        // Use a different equality check if floating point errors exceed precision of FuzzyEqual function
+        REQUIRE(FuzzyEqual((static_cast<DataType>(result) - expectedSum) / expectedSum, static_cast<DataType>(0.0)));
+
         // Add workdiv to the list of workdivs to print later
         metaData.setItem(BMInfoDataType::WorkDivDot, workDivDot);
     }