Skip to content

Commit

Permalink
make kernels depend each other, use original variables and access order
Browse files Browse the repository at this point in the history
  • Loading branch information
mehmetyusufoglu committed Nov 9, 2024
1 parent 8fefd70 commit 98aac95
Show file tree
Hide file tree
Showing 2 changed files with 34 additions and 26 deletions.
7 changes: 4 additions & 3 deletions benchmarks/babelstream/src/babelStreamCommon.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -28,18 +28,19 @@ namespace
[[maybe_unused]] constexpr auto minArrSize = 1024 * 128;

// Scalar value for Mul and Triad kernel parameters.
[[maybe_unused]] constexpr auto scalarVal = 2.0f;
[[maybe_unused]] constexpr double scalarVal = 0.4;

// Block thread extent for DotKernel test work division parameters.
[[maybe_unused]] constexpr auto blockThreadExtentMain = 1024;
[[maybe_unused]] constexpr auto dotGridBlockExtent = 256;

// Number of runs for each kernel, can be changed by command line arguments.
// At least 100 runs are recommended for good benchmarking.
// To prevent timeouts in CI, a small value is used.
[[maybe_unused]] auto numberOfRuns = 2;

// Data input value for babelstream.
[[maybe_unused]] constexpr auto valA = 1.0f;
[[maybe_unused]] constexpr double valA = 0.1;

//! handleCustomArguments Extracts the custom command line arguments from the full argument list.
//! Namely gets --array-size=1234 and --number-runs=1234 and keeps the others which are
Expand Down Expand Up @@ -111,7 +112,7 @@ namespace
{
if constexpr(std::is_floating_point_v<T>)
{
return std::fabs(a - b) < std::numeric_limits<T>::epsilon() * static_cast<T>(100.0);
return std::fabs(a - b) < (std::numeric_limits<T>::epsilon() * static_cast<T>(100.0));
}
else if constexpr(std::is_integral_v<T>)
{
Expand Down
53 changes: 30 additions & 23 deletions benchmarks/babelstream/src/babelStreamMainTest.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -22,7 +22,7 @@
* Can be run with custom arguments as well as catch2 arguments
* Run with Custom arguments:
* ./babelstream --array-size=33554432 --number-runs=100
* Runt with default array size and num runs:
* Run with default array size and num runs:
* ./babelstream
* Run with Catch2 arguments and default array size and num runs:
* ./babelstream --success
Expand Down Expand Up @@ -76,12 +76,12 @@ struct CopyKernel
//! \tparam T The data type
//! \param acc The accelerator to be executed on.
//! \param a Pointer for vector a
//! \param b Pointer for vector b
//! \param c Pointer for vector c
template<typename TAcc, typename T>
ALPAKA_FN_ACC void operator()(TAcc const& acc, T const* a, T* b) const
ALPAKA_FN_ACC void operator()(TAcc const& acc, T const* a, T* c) const
{
auto const [index] = alpaka::getIdx<alpaka::Grid, alpaka::Threads>(acc);
b[index] = a[index];
c[index] = a[index];
}
};

Expand All @@ -92,14 +92,14 @@ struct MultKernel
//! \tparam TAcc The accelerator environment to be executed on.
//! \tparam T The data type
//! \param acc The accelerator to be executed on.
//! \param a Pointer for vector a
//! \param c Pointer for vector c
//! \param b Pointer for result vector b
template<typename TAcc, typename T>
ALPAKA_FN_ACC void operator()(TAcc const& acc, T* const a, T* b) const
ALPAKA_FN_ACC void operator()(TAcc const& acc, T* const c, T* b) const
{
const T scalar = static_cast<T>(scalarVal);
auto const [i] = alpaka::getIdx<alpaka::Grid, alpaka::Threads>(acc);
b[i] = scalar * a[i];
b[i] = scalar * c[i];
}
};

Expand Down Expand Up @@ -132,11 +132,11 @@ struct TriadKernel
//! \param b Pointer for vector b
//! \param c Pointer for vector c
template<typename TAcc, typename T>
ALPAKA_FN_ACC void operator()(TAcc const& acc, T const* a, T const* b, T* c) const
ALPAKA_FN_ACC void operator()(TAcc const& acc, T* a, T const* b, T const* c) const
{
const T scalar = static_cast<T>(scalarVal);
auto const [i] = alpaka::getIdx<alpaka::Grid, alpaka::Threads>(acc);
c[i] = a[i] + scalar * b[i];
a[i] = b[i] + scalar * c[i];
}
};

Expand All @@ -151,6 +151,7 @@ struct DotKernel
//! \param a Pointer for vector a
//! \param b Pointer for vector b
//! \param sum Pointer for result vector consisting of the sums for each block
//! \param arraySize the size of the array
template<typename TAcc, typename T>
ALPAKA_FN_ACC void operator()(TAcc const& acc, T const* a, T const* b, T* sum, alpaka::Idx<TAcc> arraySize) const
{
Expand Down Expand Up @@ -316,23 +317,23 @@ void testKernels()
},
"InitKernel");

// Test the copy-kernel. Copy A one by one to B.
// Test the copy-kernel. Copy A one by one to C.
measureKernelExec(
[&]() { alpaka::exec<Acc>(queue, workDivCopy, CopyKernel(), bufAccInputAPtr, bufAccInputBPtr); },
[&]() { alpaka::exec<Acc>(queue, workDivCopy, CopyKernel(), bufAccInputAPtr, bufAccOutputCPtr); },
"CopyKernel");

// Test the scaling-kernel. Calculate B=scalar*A.
// Test the scaling-kernel. Calculate B=scalar*C. Where C = A.
measureKernelExec(
[&]() { alpaka::exec<Acc>(queue, workDivMult, MultKernel(), bufAccInputAPtr, bufAccInputBPtr); },
[&]() { alpaka::exec<Acc>(queue, workDivMult, MultKernel(), bufAccOutputCPtr, bufAccInputBPtr); },
"MultKernel");

// Test the addition-kernel. Calculate C=A+B. Where B=scalar*A.
// Test the addition-kernel. Calculate C=A+B. Where B=scalar*C or B=scalar*A.
measureKernelExec(
[&]()
{ alpaka::exec<Acc>(queue, workDivAdd, AddKernel(), bufAccInputAPtr, bufAccInputBPtr, bufAccOutputCPtr); },
"AddKernel");

// Test the Triad-kernel. Calculate C=A+scalar*B where B=scalar*A.
// Test the Triad-kernel. Calculate A=B+scalar*C. Where C is A+scalar*A.
measureKernelExec(
[&]()
{ alpaka::exec<Acc>(queue, workDivTriad, TriadKernel(), bufAccInputAPtr, bufAccInputBPtr, bufAccOutputCPtr); },
Expand All @@ -350,19 +351,21 @@ void testKernels()
DataType initVal{static_cast<DataType>(0.0)};
DataType sumErrC{initVal}, sumErrB{initVal}, sumErrA{initVal};

auto const expectedC = static_cast<DataType>(valA + scalarVal * scalarVal * valA);

auto const expectedB = static_cast<DataType>(scalarVal * valA);
auto const expectedA = static_cast<DataType>(valA);
auto const expectedC = static_cast<DataType>(static_cast<DataType>(valA) + expectedB);
auto const expectedA = static_cast<DataType>(expectedB + static_cast<DataType>(scalarVal) * expectedC);

// sum of the errors for each array
for(Idx i = 0; i < arraySize; ++i)
{
sumErrC += bufHostOutputC[static_cast<Idx>(i)] - expectedC;
sumErrB += bufHostOutputB[static_cast<Idx>(i)] - expectedB;
sumErrA += bufHostOutputA[static_cast<Idx>(i)] - expectedA;
sumErrC += std::fabs(bufHostOutputC[static_cast<Idx>(i)] - expectedC);
sumErrB += std::fabs(bufHostOutputB[static_cast<Idx>(i)] - expectedB);
sumErrA += std::fabs(bufHostOutputA[static_cast<Idx>(i)] - expectedA);
}

// Normalize and compare sum of the errors
// Use a different equality check if floating point errors exceed precision of FuzzyEqual function
REQUIRE(FuzzyEqual(sumErrC / static_cast<DataType>(arraySize) / expectedC, static_cast<DataType>(0.0)));
REQUIRE(FuzzyEqual(sumErrB / static_cast<DataType>(arraySize) / expectedB, static_cast<DataType>(0.0)));
REQUIRE(FuzzyEqual(sumErrA / static_cast<DataType>(arraySize) / expectedA, static_cast<DataType>(0.0)));
Expand All @@ -375,7 +378,7 @@ void testKernels()
// Threads per block for Dot kernel
constexpr Idx blockThreadExtent = blockThreadExtentMain;
// Blocks per grid for Dot kernel
constexpr Idx gridBlockExtent = static_cast<Idx>(256);
const Idx gridBlockExtent = static_cast<Idx>(dotGridBlockExtent);
// Vector of sums of each block
auto bufAccSumPerBlock = alpaka::allocBuf<DataType, Idx>(devAcc, gridBlockExtent);
auto bufHostSumPerBlock = alpaka::allocBuf<DataType, Idx>(devHost, gridBlockExtent);
Expand All @@ -401,8 +404,12 @@ void testKernels()

DataType const* sumPtr = std::data(bufHostSumPerBlock);
auto const result = std::reduce(sumPtr, sumPtr + gridBlockExtent, DataType{0});
// Since vector values are 1, dot product should be identical to arraySize
REQUIRE(FuzzyEqual(static_cast<DataType>(result), static_cast<DataType>(arraySize * 2)));

auto const expectedSum = static_cast<DataType>(arraySize) * expectedA * expectedB;
// Dot product should be identical to arraySize*expectedA*expectedB
// Use a different equality check if floating point errors exceed precision of FuzzyEqual function
REQUIRE(FuzzyEqual((static_cast<DataType>(result) - expectedSum) / expectedSum, static_cast<DataType>(0.0)));

// Add workdiv to the list of workdivs to print later
metaData.setItem(BMInfoDataType::WorkDivDot, workDivDot);
}
Expand Down

0 comments on commit 98aac95

Please sign in to comment.