diff --git a/benchmarks/babelstream/src/babelStreamCommon.hpp b/benchmarks/babelstream/src/babelStreamCommon.hpp
index a22f7d032d3..46c76c83063 100644
--- a/benchmarks/babelstream/src/babelStreamCommon.hpp
+++ b/benchmarks/babelstream/src/babelStreamCommon.hpp
@@ -3,11 +3,11 @@
 #include
 #include
 #include
-#include
 #include
 #include
 #include
 #include
+#include
 #include
 #include
 #include
@@ -22,61 +22,112 @@ namespace
     // According to tests, 2^25 or larger values are needed for proper benchmarking:
     // ./babelstream --array-size=33554432 --number-runs=100
     // To prevent timeouts in CI, a smaller default value is used.
-    [[maybe_unused]] auto arraySizeMain = 1024 * 1024;
+    [[maybe_unused]] auto arraySizeMain = 1024 * 256;
 
     // Minimum array size to be used.
-    [[maybe_unused]] constexpr auto minArrSize = 1024 * 128;
+    [[maybe_unused]] constexpr auto minArrSize = 1024 * 256;
 
     // Scalar value for Mul and Triad kernel parameters.
-    [[maybe_unused]] constexpr auto scalarVal = 2.0f;
+    [[maybe_unused]] constexpr double scalarVal = 0.4;
 
     // Block thread extent for DotKernel test work division parameters.
     [[maybe_unused]] constexpr auto blockThreadExtentMain = 1024;
+    [[maybe_unused]] constexpr auto dotGridBlockExtent = 256;
 
     // Number of runs for each kernel, can be changed by command line arguments.
     // At least 100 runs are recommended for good benchmarking.
     // To prevent timeouts in CI, a small value is used.
     [[maybe_unused]] auto numberOfRuns = 2;
 
-    // Data input value for babelstream.
-    [[maybe_unused]] constexpr auto valA = 1.0f;
+    // Data input values for babelstream.
+    [[maybe_unused]] constexpr double initA = 0.1;
+    [[maybe_unused]] constexpr double initB = 0.2;
+    // Change this if the triad kernel is going to be run alone.
+    [[maybe_unused]] constexpr double initC = 0.0;
+
+    //! Values corresponding to the command line argument --run-kernels
+    enum class KernelsToRun
+    {
+        All, // init, add, copy, mul, triad, dot
+        Triad, // only init and triad
+        NStream // only init and nstream
+    };
+
+    // The variable showing which kernel(s) are being run
+    [[maybe_unused]] KernelsToRun kernelsToBeExecuted{KernelsToRun::All};
 
     //! handleCustomArguments Gets custom cmd line arguments from the all arguments.
     //! Namely gets --array-size=1234 and --number-runs=1234 and keeps the others which are
     //! command line args for Catch2 session.
     [[maybe_unused]] static void handleCustomArguments(int& argc, char* argv[])
     {
-        std::vector<char*> newArgv;
-        newArgv.push_back(argv[0]); // Keep the program name
+        std::vector<char*> newArgv({argv[0]}); // keep the program name
 
         for(int i = 1; i < argc; ++i)
        {
             std::string arg = argv[i];
             if(arg.rfind("--array-size=", 0) == 0)
             {
-                auto const arrSize = std::stoi(arg.substr(13)); // Convert to integer
-                if(arrSize > minArrSize)
+                try
                 {
-                    arraySizeMain = arrSize;
-                    std::cout << "Array size provided(items): " << arraySizeMain << std::endl;
+                    // Convert argument to integer
+                    auto arrSize = std::stoi(arg.substr(13));
+                    if(arrSize > minArrSize)
+                    {
+                        arraySizeMain = arrSize;
+                        std::cout << "Array size set to: " << arraySizeMain << std::endl;
+                    }
+                    else
+                    {
+                        std::cout << "Array size too small. Must be at least " << minArrSize
+                                  << ", using default: " << arraySizeMain << std::endl;
+                    }
                 }
-                else
+                catch(std::invalid_argument const&)
                 {
-                    std::cout << "Too small array size given. Must be at least " << minArrSize << std::endl;
-                    std::cout << "Using default array size(number of items): " << arraySizeMain << std::endl;
+                    std::cerr << "Invalid array size argument: " << arg << ". Default value used." << std::endl;
                 }
             }
             else if(arg.rfind("--number-runs=", 0) == 0)
             {
-                auto const numRuns = std::stoi(arg.substr(14)); // Convert to integer
-                if(numRuns > 0)
+                try
+                {
+                    // Convert argument to integer
+                    auto const numRuns = std::stoi(arg.substr(14));
+                    if(numRuns > 0)
+                    {
+                        numberOfRuns = numRuns;
+                        std::cout << "Number of runs provided: " << numberOfRuns << std::endl;
+                    }
+                    else
+                    {
+                        std::cout << "Using default number of runs: " << numberOfRuns << std::endl;
+                    }
+                }
+                catch(std::invalid_argument const&)
+                {
+                    std::cerr << "Invalid number of runs argument: " << arg << ". Default value used." << std::endl;
+                }
+            }
+            else if(arg.rfind("--run-kernels=", 0) == 0)
+            {
+                // Get the argument to determine which kernels will be run
+                auto const kernelsString = arg.substr(14);
+                if(kernelsString == "nstream")
+                {
+                    std::cout << "Only nstream kernel will be executed." << std::endl;
+                    kernelsToBeExecuted = KernelsToRun::NStream;
+                }
+                else if(kernelsString == "triad")
                 {
-                    numberOfRuns = numRuns;
-                    std::cout << "Number of runs provided: " << numberOfRuns << std::endl;
+                    kernelsToBeExecuted = KernelsToRun::Triad;
+                    std::cout << "Only triad kernel will be executed." << std::endl;
                 }
-                else
+                else if(kernelsString == "all")
                 {
-                    std::cout << "Using default number of runs: " << numberOfRuns << std::endl;
+                    // kernelsToBeExecuted already defaults to KernelsToRun::All
+                    kernelsToBeExecuted = KernelsToRun::All;
+                    std::cout << "All 5 babelstream kernels are going to be executed." << std::endl;
                 }
             }
             else
@@ -87,7 +138,11 @@ namespace
             if(arg.rfind("-?", 0) == 0 || arg.rfind("--help", 0) == 0 || arg.rfind("-h", 0) == 0)
             {
                 std::cout << "Usage of custom arguments (arguments which are not Catch2): --array-size=33554432 and "
-                             "--number-runs=100"
+                             "--number-runs=100\n"
+                          << std::endl;
+                std::cout << "To run only the nstream or only the triad kernel use --run-kernels=nstream or "
+                             "--run-kernels=triad. Otherwise all 5 standard kernels will be executed: Copy, Mul, "
+                             "Add, Triad (and Dot, if a multi-threaded acc is used)."
                           << std::endl;
             }
         }
@@ -98,6 +153,12 @@ namespace
         {
            argv[i] = newArgv[static_cast<std::size_t>(i)];
         }
+
+        // The array size must be a multiple of the block size
+        if(arraySizeMain % blockThreadExtentMain != 0)
+            throw std::runtime_error(
+                "Array size is " + std::to_string(arraySizeMain) + ". It must be a multiple of block-size "
+                + std::to_string(blockThreadExtentMain));
     }
 
     //! FuzzyEqual compares two floating-point or integral type values.
@@ -111,7 +172,7 @@ namespace
     {
         if constexpr(std::is_floating_point_v<T>)
         {
-            return std::fabs(a - b) < std::numeric_limits<T>::epsilon() * static_cast<T>(100.0);
+            return std::fabs(a - b) < (std::numeric_limits<T>::epsilon() * static_cast<T>(100.0));
         }
         else if constexpr(std::is_integral_v<T>)
         {
@@ -213,12 +274,14 @@ namespace
         NumRuns,
         DataSize,
         DataType,
+        CopyTimeFromAccToHost,
         WorkDivInit,
         WorkDivCopy,
         WorkDivAdd,
         WorkDivTriad,
         WorkDivMult,
         WorkDivDot,
+        WorkDivNStream,
         DeviceName,
         TimeUnit,
         KernelNames,
@@ -251,6 +314,8 @@ namespace
             return "DataSize(items)";
         case BMInfoDataType::DataType:
             return "Precision";
+        case BMInfoDataType::CopyTimeFromAccToHost:
+            return "AccToHost Memcpy Time(sec)";
         case BMInfoDataType::DeviceName:
             return "DeviceName";
         case BMInfoDataType::TimeUnit:
@@ -279,6 +344,8 @@ namespace
             return "WorkDivMult ";
         case BMInfoDataType::WorkDivDot:
             return "WorkDivDot ";
+        case BMInfoDataType::WorkDivNStream:
+            return "WorkDivNStream";
         default:
             return "";
         }
@@ -314,11 +381,159 @@ namespace
         return bytesReadWriteGB / static_cast<double>(runTimeSeconds);
     }
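Note on the throughput/bandwidth helpers above: they reduce to (bytes moved) / (runtime). A minimal standalone sketch of the same arithmetic (standard C++ only; the 5 ms best-run time is a made-up value):

    // Copy kernel on 2^25 doubles: 1 read + 1 write per item
    constexpr double bytesMoved = 2.0 * 33554432 * sizeof(double); // ~0.537 GB
    constexpr double runTimeSec = 0.005;                           // hypothetical best run
    double const bandwidthGBs = (bytesMoved / 1.0e9) / runTimeSec; // ~107 GB/s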
+    //! \brief calculateBabelstreamExpectedResults Finds the expected values by doing the same operations with the
+    //! initial values.
+    //! \param expectedA Array item value expected (all array values are equal in original babelstream)
+    //! \param expectedB Array item value expected (all array values are equal in original babelstream)
+    //! \param expectedC Array item value expected (all array values are equal in original babelstream)
+    template<typename T>
+    [[maybe_unused]] static void calculateBabelstreamExpectedResults(T& expectedA, T& expectedB, T& expectedC)
+    {
+        // All items of an array are equal, therefore one expected value covers the whole array
+        for(auto i = 0; i < numberOfRuns; i++)
+        {
+            if(kernelsToBeExecuted == KernelsToRun::All)
+            {
+                expectedC = expectedA;
+                expectedB = static_cast<T>(scalarVal) * expectedC;
+                expectedC = expectedA + expectedB;
+                expectedA = expectedB + static_cast<T>(scalarVal) * expectedC;
+            }
+            else if(kernelsToBeExecuted == KernelsToRun::Triad)
+            {
+                expectedA = expectedB + static_cast<T>(scalarVal) * expectedC;
+            }
+            else if(kernelsToBeExecuted == KernelsToRun::NStream)
+            {
+                // each run changes the result
+                expectedA += expectedB + static_cast<T>(scalarVal) * expectedC;
+            }
+        }
+    }
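Worked example for the recurrence above: with the defaults initA = 0.1, initB = 0.2, initC = 0.0 and scalarVal = 0.4, the first All iteration evaluates to

    expectedC = expectedA;                   // C = 0.1
    expectedB = 0.4 * expectedC;             // B = 0.04
    expectedC = expectedA + expectedB;       // C = 0.14
    expectedA = expectedB + 0.4 * expectedC; // A = 0.096

so after numberOfRuns iterations every element of each array still holds a single value that the host can reproduce exactly.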
+
+    /**
+     * \brief The RuntimeResults class bundles the kernel runtime data in a map.
+     * The keys of the map are kernel names; the values of the map are KernelRunData struct pointers.
+     */
+    class RuntimeResults
+    {
+        struct KernelRunData
+        {
+            std::vector<double> timingsSuccessiveRuns; // Stores execution timings of successive runs
+            double byteReadWriteMB{0}; // Bytes read/write in MB
+            double bandwidthKernel{0}; // Bandwidth of kernel
+            double minExecTime{0}; // Minimum execution time
+            double maxExecTime{0}; // Maximum execution time
+            double avgExecTime{0}; // Average execution time
+        };
+
+    public:
+        // Map from kernelName (string) to a unique_ptr for KernelRunData
+        // Using unique_ptr for automatic memory management
+        std::map<std::string, std::unique_ptr<KernelRunData>> kernelToRundataMap;
+
+        // Function to initialize the byteReadWriteMB field for each kernel
+        template<typename DataType>
+        void initializeByteReadWrite(size_t arraySize)
+        {
+            // Define kernel names and their throughput values based on the provided array size
+            std::map<std::string, double> throughputValues
+                = {{"InitKernel", getDataThroughput<DataType>(3u, static_cast<unsigned>(arraySize))},
+                   {"CopyKernel", getDataThroughput<DataType>(2u, static_cast<unsigned>(arraySize))},
+                   {"MultKernel", getDataThroughput<DataType>(2u, static_cast<unsigned>(arraySize))},
+                   {"AddKernel", getDataThroughput<DataType>(3u, static_cast<unsigned>(arraySize))},
+                   {"TriadKernel", getDataThroughput<DataType>(3u, static_cast<unsigned>(arraySize))},
+                   {"DotKernel", getDataThroughput<DataType>(2u, static_cast<unsigned>(arraySize))},
+                   {"NStreamKernel", getDataThroughput<DataType>(2u, static_cast<unsigned>(arraySize))}};
+
+            // Populate each KernelRunData entry in kernelToRundataMap
+            for(auto const& [kernelName, throughput] : throughputValues)
+            {
+                // Check if the kernel name exists in the map
+                if(kernelToRundataMap.find(kernelName) != kernelToRundataMap.end())
+                {
+                    // Set the byteReadWriteMB field in the corresponding KernelRunData
+                    kernelToRundataMap[kernelName]->byteReadWriteMB = throughput;
+                }
+            }
+        }
+
+        //! \brief calculateBandwidthsForKernels calculates the bandwidth for each kernel and updates the execution
+        //! times. Fills the fields bandwidth, execution min-time, execution max-time and execution avg-time.
+        template<typename T>
+        void calculateBandwidthsForKernels()
+        {
+            for(auto const& [kernelName, kernelData] : kernelToRundataMap)
+            {
+                // Calculate min and max execution times from the recorded vector of times for the kernel
+                auto const minmaxPair = findMinMax(kernelData->timingsSuccessiveRuns);
+                kernelData->minExecTime = minmaxPair.first;
+                kernelData->maxExecTime = minmaxPair.second;
+
+                // Calculate average execution time
+                kernelData->avgExecTime = findAverage(kernelData->timingsSuccessiveRuns);
+
+                // Calculate bandwidth based on byteReadWriteMB and min execution time
+                kernelData->bandwidthKernel = calculateBandwidth(kernelData->byteReadWriteMB, minmaxPair.first);
+            }
+        }
+
+        //! \brief Get an item from each struct in the map and make a vector
+        //! \tparam Func is the accessor function to access a specific field
+        template<typename Func>
+        std::vector<double> getItemFromStructs(Func accessor) const noexcept
+        {
+            std::vector<double> results;
+            for(auto const& [key, dataStruct] : kernelToRundataMap)
+            {
+                results.push_back(accessor(dataStruct.get())); // Access the unique_ptr's payload via .get()
+            }
+            return results;
+        }
+
+        // Functions to retrieve specific fields as vectors for all kernels
+        std::vector<double> getBandwidthKernelVec() const noexcept
+        {
+            return getItemFromStructs([](KernelRunData const* dataStruct) { return dataStruct->bandwidthKernel; });
+        }
+
+        std::vector<double> getThroughputKernelArray() const noexcept
+        {
+            return getItemFromStructs([](KernelRunData const* dataStruct) { return dataStruct->byteReadWriteMB; });
+        }
+
+        std::vector<double> getAvgExecTimeKernelArray() const noexcept
+        {
+            return getItemFromStructs([](KernelRunData const* dataStruct) { return dataStruct->avgExecTime; });
+        }
+
+        std::vector<double> getMinExecTimeKernelArray() const noexcept
+        {
+            return getItemFromStructs([](KernelRunData const* dataStruct) { return dataStruct->minExecTime; });
+        }
+
+        std::vector<double> getMaxExecTimeKernelArray() const noexcept
+        {
+            return getItemFromStructs([](KernelRunData const* dataStruct) { return dataStruct->maxExecTime; });
+        }
+
+        // Function to add a kernelName-timesForRuns pair by storing a unique_ptr to KernelRunData
+        void addKernelTimingsVec(std::string const& kernelName) noexcept
+        {
+            // Use make_unique to create a new KernelRunData object and store it in the map
+            kernelToRundataMap[kernelName] = std::make_unique<KernelRunData>();
+        }
+    };
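Intended call sequence for RuntimeResults, as a sketch (names are from this patch; the timing values are invented, and the explicit template arguments assume the parameters reconstructed above):

    RuntimeResults results;
    results.addKernelTimingsVec("CopyKernel"); // create the slot before pushing timings
    results.kernelToRundataMap["CopyKernel"]->timingsSuccessiveRuns.push_back(0.0051);
    results.kernelToRundataMap["CopyKernel"]->timingsSuccessiveRuns.push_back(0.0049);
    results.initializeByteReadWrite<double>(arraySizeMain); // fill byteReadWriteMB per kernel
    results.calculateBandwidthsForKernels<double>(); // derive min/max/avg times and bandwidths
    auto const bandwidths = results.getBandwidthKernelVec(); // one entry per kernel, in map order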
+
     //! MetaData class to store and serialize benchmark information.
     //! \details The MetaData class includes a single map to keep all benchmark information and provides serialization
     //! methods for generating output.
-    class MetaData
+    class BenchmarkMetaData
     {
+    private:
+        // Information type to string. The string can consist of comma separated values.
+        std::map<BMInfoDataType, std::string> metaDataMap;
+
     public:
         //! setItem Sets an item in the metadata map.
         //! \tparam T The type of the value to store.
@@ -353,29 +568,32 @@
         {
             std::stringstream ss;
             // define lambda to add values to a string stream created already
-            auto addItemValue = [&, this](BMInfoDataType item) {
-                ss << "\n" << typeToTypeStr(item) << ":" << metaDataMap.at(item);
+            auto addItemValueToSS = [&, this](BMInfoDataType item)
+            {
+                if(metaDataMap.count(item) != 0)
+                    ss << "\n" << typeToTypeStr(item) << ":" << metaDataMap.at(item);
             };
 
-            // Initially chose some data to serialize
+            // Initially choose some data from the meta-data map to serialize into the string stream
             ss << "\n";
-            addItemValue(BMInfoDataType::AcceleratorType);
-            addItemValue(BMInfoDataType::NumRuns);
-            addItemValue(BMInfoDataType::DataType);
-            addItemValue(BMInfoDataType::DataSize);
-            addItemValue(BMInfoDataType::DeviceName);
-            addItemValue(BMInfoDataType::WorkDivInit);
-            addItemValue(BMInfoDataType::WorkDivCopy);
-            addItemValue(BMInfoDataType::WorkDivMult);
-            addItemValue(BMInfoDataType::WorkDivAdd);
-            addItemValue(BMInfoDataType::WorkDivTriad);
-            if(metaDataMap.count(BMInfoDataType::WorkDivDot) != 0)
-                addItemValue(BMInfoDataType::WorkDivDot);
-
+            addItemValueToSS(BMInfoDataType::AcceleratorType);
+            addItemValueToSS(BMInfoDataType::NumRuns);
+            addItemValueToSS(BMInfoDataType::DataType);
+            addItemValueToSS(BMInfoDataType::DataSize);
+            addItemValueToSS(BMInfoDataType::DeviceName);
+            addItemValueToSS(BMInfoDataType::WorkDivInit);
+            addItemValueToSS(BMInfoDataType::WorkDivCopy);
+            addItemValueToSS(BMInfoDataType::WorkDivMult);
+            addItemValueToSS(BMInfoDataType::WorkDivAdd);
+            addItemValueToSS(BMInfoDataType::WorkDivTriad);
+            addItemValueToSS(BMInfoDataType::WorkDivDot);
+            addItemValueToSS(BMInfoDataType::WorkDivNStream);
+            addItemValueToSS(BMInfoDataType::CopyTimeFromAccToHost);
+
+            // If the item is a string with delimited values, get the item, then the value at the index
             auto getItemFromStrList = [this](BMInfoDataType item, int index) -> std::string
             {
                 std::string const str = metaDataMap.at(item);
-
                 if(index < 1)
                 {
                     throw std::invalid_argument("Index must be 1 or greater.");
@@ -405,7 +623,7 @@
                 throw std::out_of_range("Index out of range");
             };
 
-            // Prepare Table
+            // Prepare Table Display
             // Table column names
             ss << std::endl;
             ss << std::left << std::setw(15) << typeToTypeStr(BMInfoDataType::KernelNames) << " " << std::left
@@ -421,7 +639,7 @@
             // Table rows. Print test results for each kernel line by line
             for(auto i = 1; i <= numberOfKernels; i++)
             {
-                // Print the row for the kernel i
+                // Print the row for the kernel i.
                 ss << " " << std::left << std::setw(15) << getItemFromStrList(BMInfoDataType::KernelNames, i) << " ";
                 ss << std::left << std::setw(15) << getItemFromStrList(BMInfoDataType::KernelBandwidths, i) << " ";
                 ss << std::left << std::setw(8) << getItemFromStrList(BMInfoDataType::KernelMinTimes, i) << " ";
@@ -433,8 +651,6 @@
             return ss.str();
         }
-
-    private:
-        std::map<BMInfoDataType, std::string> metaDataMap;
     };
+
 } // namespace
diff --git a/benchmarks/babelstream/src/babelStreamMainTest.cpp b/benchmarks/babelstream/src/babelStreamMainTest.cpp
index 79ec6216508..9cf10f4d383 100644
--- a/benchmarks/babelstream/src/babelStreamMainTest.cpp
+++ b/benchmarks/babelstream/src/babelStreamMainTest.cpp
@@ -9,24 +9,32 @@
 #include
 #include
+#include
+#include
 #include
 
 /**
- * Babelstream benchmarking example. Babelstream has 5 kernels. Add, Multiply, Copy, Triad and Dot.
- * Babelstream is a memory-bound benchmark since the main operation in the kernels has high Code Balance (bytes/FLOP)
- * value. For example c[i] = a[i] + b[i]; has 2 reads 1 writes and has one FLOP operation. For double precision each
- * read-write is 8 bytes. Hence Code Balance (3*8 / 1) = 24 bytes/FLOP.
+ * Babelstream benchmarking example. Babelstream has 5 kernels: Add, Multiply, Copy, Triad and Dot. NStream is
+ * optional. The Init kernel is run before the 5 standard kernels. Babelstream is a memory-bound benchmark since the
+ * main operation in the kernels has a high Code Balance (bytes/FLOP) value. For example c[i] = a[i] + b[i]; has
+ * 2 reads, 1 write and one FLOP. For double precision each read-write is 8 bytes. Hence Code Balance
+ * (3*8 / 1) = 24 bytes/FLOP.
 *
 * Some implementations and the documents are accessible through https://github.com/UoB-HPC
 *
 * Can be run with custom arguments as well as catch2 arguments
- * Run with Custom arguments:
+ * Run with custom arguments for the kernels init, copy, mul, add, triad (and the dot kernel if a multi-threaded acc
+ * is available):
 * ./babelstream --array-size=33554432 --number-runs=100
- * Runt with default array size and num runs:
+ * Run with custom arguments and select from the 3 kernel groups all, triad, nstream:
+ * ./babelstream --array-size=33554432 --number-runs=100 --run-kernels=triad (only triad kernel)
+ * ./babelstream --array-size=33554432 --number-runs=100 --run-kernels=nstream (only nstream kernel)
+ * ./babelstream --array-size=33554432 --number-runs=100 --run-kernels=all (default case: Add, Multiply, Copy, Triad
+ * and Dot)
+ * Run with default array size and num runs:
 * ./babelstream
- * Run with Catch2 arguments and defaul arrary size and num runs:
+ * Run with Catch2 arguments and default array size and num runs:
 * ./babelstream --success
- * ./babelstream -r a.xml
+ * ./babelstream -r xml
 * Run with Custom and catch2 arguments together:
 * ./babelstream --success --array-size=1280000 --number-runs=10
 * Help to list custom and catch2 arguments
@@ -57,14 +65,15 @@ struct InitKernel
     //! \tparam T The data type
     //! \param acc The accelerator to be executed on.
     //! \param a Pointer for vector a
-    //! \param initA the value to set all items in the vector
+    //! \param initialA the value to set all items in the vector a
+    //! \param initialB the value to set all items in the vector b
+    //! \param initialC the value to set all items in the vector c
     template<typename TAcc, typename T>
-    ALPAKA_FN_ACC void operator()(TAcc const& acc, T* a, T* b, T* c, T initA) const
+    ALPAKA_FN_ACC void operator()(TAcc const& acc, T* a, T* b, T* c, T initialA, T initialB, T initialC) const
     {
         auto const [i] = alpaka::getIdx<alpaka::Grid, alpaka::Threads>(acc);
-        a[i] = initA;
-        b[i] = static_cast<T>(0.0);
-        c[i] = static_cast<T>(0.0);
+        a[i] = initialA;
+        b[i] = initialB;
+        c[i] = initialC;
     }
 };
 
@@ -76,12 +85,12 @@ struct CopyKernel
     //! \tparam TAcc The accelerator environment to be executed on.
     //! \tparam T The data type
     //! \param acc The accelerator to be executed on.
     //! \param a Pointer for vector a
-    //! \param b Pointer for vector b
+    //! \param c Pointer for vector c
     template<typename TAcc, typename T>
-    ALPAKA_FN_ACC void operator()(TAcc const& acc, T const* a, T* b) const
+    ALPAKA_FN_ACC void operator()(TAcc const& acc, T const* a, T* c) const
     {
         auto const [index] = alpaka::getIdx<alpaka::Grid, alpaka::Threads>(acc);
-        b[index] = a[index];
+        c[index] = a[index];
     }
 };
@@ -92,14 +101,14 @@ struct MultKernel
     //! \tparam TAcc The accelerator environment to be executed on.
     //! \tparam T The data type
     //! \param acc The accelerator to be executed on.
-    //! \param a Pointer for vector a
+    //! \param c Pointer for vector c
     //! \param b Pointer for result vector b
     template<typename TAcc, typename T>
-    ALPAKA_FN_ACC void operator()(TAcc const& acc, T* const a, T* b) const
+    ALPAKA_FN_ACC void operator()(TAcc const& acc, T* b, T* const c) const
     {
         const T scalar = static_cast<T>(scalarVal);
         auto const [i] = alpaka::getIdx<alpaka::Grid, alpaka::Threads>(acc);
-        b[i] = scalar * a[i];
+        b[i] = scalar * c[i];
     }
 };
 
@@ -132,11 +141,23 @@ struct TriadKernel
     //! \param b Pointer for vector b
     //! \param c Pointer for result vector c
     template<typename TAcc, typename T>
-    ALPAKA_FN_ACC void operator()(TAcc const& acc, T const* a, T const* b, T* c) const
+    ALPAKA_FN_ACC void operator()(TAcc const& acc, T* a, T const* b, T const* c) const
+    {
+        const T scalar = static_cast<T>(scalarVal);
+        auto const [i] = alpaka::getIdx<alpaka::Grid, alpaka::Threads>(acc);
+        a[i] = b[i] + scalar * c[i];
+    }
+};
+
+//! Optional kernel, not one of the 5 standard Babelstream kernels
+struct NstreamKernel
+{
+    template<typename TAcc, typename T>
+    ALPAKA_FN_ACC void operator()(TAcc const& acc, T* a, T const* b, T const* c) const
     {
         const T scalar = static_cast<T>(scalarVal);
         auto const [i] = alpaka::getIdx<alpaka::Grid, alpaka::Threads>(acc);
-        c[i] = a[i] + scalar * b[i];
+        a[i] += b[i] + scalar * c[i];
     }
 };
 
@@ -150,7 +171,8 @@ struct DotKernel
     //! \param acc The accelerator to be executed on.
     //! \param a Pointer for vector a
     //! \param b Pointer for vector b
-    //! \param sum Pointer for result vector consisting sums for each block
+    //! \param sum Pointer for the result vector consisting of the sums of the blocks
+    //! \param arraySize the size of the array
     template<typename TAcc, typename T>
     ALPAKA_FN_ACC void operator()(TAcc const& acc, T const* a, T const* b, T* sum, alpaka::Idx<TAcc> arraySize) const
     {
@@ -171,7 +193,21 @@
         {
             alpaka::syncBlockThreads(acc);
             if(local_i < offset)
+            {
+#if defined(__GNUC__) && !defined(__clang__)
+# pragma GCC diagnostic push
+# pragma GCC diagnostic ignored "-Wmaybe-uninitialized"
+#elif defined(__clang__)
+# pragma clang diagnostic push
+# pragma clang diagnostic ignored "-Wuninitialized"
+#endif
                 tbSum[local_i] += tbSum[local_i + offset];
+#if defined(__GNUC__) && !defined(__clang__)
+# pragma GCC diagnostic pop
+#elif defined(__clang__)
+# pragma clang diagnostic pop
+#endif
+            }
         }
 
         auto const gridBlockIndex = alpaka::getIdx<alpaka::Grid, alpaka::Blocks>(acc)[0];
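The guarded statement above is the classic shared-memory tree reduction: after each barrier the threads below offset fold the upper half of tbSum onto the lower half, until tbSum[0] holds the block's partial dot product. A serial host-side sketch of the same folding (assumes a power-of-two extent, which blockThreadExtentMain = 1024 satisfies):

    #include <cstddef>
    #include <vector>

    double blockReduce(std::vector<double> tbSum) // stand-in for the block-shared array
    {
        for(std::size_t offset = tbSum.size() / 2; offset > 0; offset /= 2)
            for(std::size_t i = 0; i < offset; ++i) // "each thread i < offset", serialized here
                tbSum[i] += tbSum[i + offset];
        return tbSum[0]; // the value thread 0 writes to sum[gridBlockIndex]
    }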
@@ -186,15 +222,20 @@
 template<typename TAcc, typename DataType>
 void testKernels()
 {
+    if(kernelsToBeExecuted == KernelsToRun::All)
+    {
+        std::cout << "Kernels: Init, Copy, Mul, Add, Triad, Dot Kernels" << std::endl;
+    }
     using Acc = TAcc;
-    // Define the index domain
     // Set the number of dimensions as an integral constant. Set to 1 for 1D.
     using Dim = alpaka::Dim<Acc>;
     using Idx = alpaka::Idx<Acc>;
 
-    // Meta data
-    // A MetaData class instance to keep the problem and results to print later
-    MetaData metaData;
+    // A BenchmarkMetaData class instance to keep the benchmark info and results to print later. Does not include
+    // intermediate runtime data.
+    BenchmarkMetaData metaData;
+
+    // Convert the data-type to a string to display
     std::string dataTypeStr;
     if(std::is_same<DataType, float>::value)
     {
@@ -251,7 +292,9 @@
         bufAccInputAPtr,
         bufAccInputBPtr,
         bufAccOutputCPtr,
-        static_cast<DataType>(valA));
+        static_cast<DataType>(initA),
+        static_cast<DataType>(initB),
+        static_cast<DataType>(initC));
     auto const workDivCopy
         = alpaka::getValidWorkDiv(kernelCfg, devAcc, CopyKernel(), bufAccInputAPtr, bufAccInputBPtr);
     auto const workDivMult
@@ -267,41 +310,87 @@
         bufAccInputBPtr,
         bufAccOutputCPtr);
 
-    // Vector of average run-times of babelstream kernels
-    std::vector<double> avgExecTimesOfKernels;
-    std::vector<double> minExecTimesOfKernels;
-    std::vector<double> maxExecTimesOfKernels;
-    std::vector<std::string> kernelLabels;
-    // Vector for collecting successive run-times of a single kernel in benchmark macro
-    std::vector<double> times;
-
-    // Lambda for measuring run-time
-    auto measureKernelExec = [&](auto&& kernelFunc, [[maybe_unused]] auto&& kernelLabel)
-    {
-        for(auto i = 0; i < numberOfRuns; i++)
-        {
-            double runtime = 0.0;
-            auto start = std::chrono::high_resolution_clock::now();
-            kernelFunc();
-            alpaka::wait(queue);
-            auto end = std::chrono::high_resolution_clock::now();
-            std::chrono::duration<double> duration = end - start;
-            runtime = duration.count();
-            times.push_back(runtime);
-        }
-
-        // find the minimum of the durations array.
-        // In benchmarking the first item of the runtimes array is not included in calculations.
-        const auto minmaxPair = findMinMax(times);
-        minExecTimesOfKernels.push_back(minmaxPair.first);
-        maxExecTimesOfKernels.push_back(minmaxPair.second);
-        avgExecTimesOfKernels.push_back(findAverage(times));
-        kernelLabels.push_back(kernelLabel);
-        times.clear();
-    };
+    auto const workDivNStream = alpaka::getValidWorkDiv(
+        kernelCfg,
+        devAcc,
+        NstreamKernel(),
+        bufAccInputAPtr,
+        bufAccInputBPtr,
+        bufAccOutputCPtr);
+
+    // Lambda to create and return the work division for the dot kernel
+    auto getWorkDivForDotKernel = [&]<typename TDotAcc>() -> alpaka::WorkDivMembers<Dim, Idx>
+    {
+        // Use the babelstream standard work division for multi-threaded backends
+        if constexpr(alpaka::accMatchesTags)
+        {
+            return alpaka::WorkDivMembers<Dim, Idx>{
+                Vec::all(static_cast<alpaka::Idx<Acc>>(dotGridBlockExtent)),
+                Vec::all(blockThreadExtentMain),
+                Vec::all(1)};
+        }
+        else
+        {
+            // Work division for single-threaded backends
+            // Since the block size is 1, the elements per grid is dotGridBlockExtent * blockThreadExtentMain
+            alpaka::KernelCfg<Acc> const kernelCfgDot
+                = {Vec::all(dotGridBlockExtent * blockThreadExtentMain), elementsPerThread};
+
+            return alpaka::getValidWorkDiv(
+                kernelCfgDot,
+                devAcc,
+                DotKernel(),
+                bufAccInputAPtr,
+                bufAccInputBPtr,
+                bufAccOutputCPtr, // this is used here as a kind of dummy
+                static_cast<alpaka::Idx<Acc>>(arraySize));
+        }
+    };
+
+    // Work division for the dot kernel
+    auto const workDivDot = getWorkDivForDotKernel.template operator()<Acc>();
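Note on the fixed dot work division chosen above:

    256 blocks * 1024 threads/block = 262144 worker threads

which exactly covers the default array size (1024 * 256 items, 1 item per thread). For --array-size=33554432 each thread accumulates 33554432 / 262144 = 128 products, presumably via a grid-stride loop in DotKernel (its loop body lies outside the hunks shown here). This is also why handleCustomArguments rejects array sizes that are not a multiple of blockThreadExtentMain.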
+
+    // To record the runtime data generated while running the kernels
+    RuntimeResults runtimeResults;
+
+    // Lambda for measuring run-time
+    auto measureKernelExec = [&](auto&& kernelFunc, [[maybe_unused]] auto&& kernelLabel)
+    {
+        double runtime = 0.0;
+        auto start = std::chrono::high_resolution_clock::now();
+        kernelFunc();
+        alpaka::wait(queue);
+        auto end = std::chrono::high_resolution_clock::now();
+        // get the duration in seconds
+        std::chrono::duration<double> duration = end - start;
+        runtime = duration.count();
+        runtimeResults.kernelToRundataMap[kernelLabel]->timingsSuccessiveRuns.push_back(runtime);
+    };
 
-    // Run kernels one by one
-    // Test the init-kernel.
+    // Initialize the runtime-result slots before running the kernels; each kernel fills its own entry
+    runtimeResults.addKernelTimingsVec("InitKernel");
+    if(kernelsToBeExecuted == KernelsToRun::All)
+    {
+        runtimeResults.addKernelTimingsVec("CopyKernel");
+        runtimeResults.addKernelTimingsVec("AddKernel");
+        runtimeResults.addKernelTimingsVec("TriadKernel");
+        runtimeResults.addKernelTimingsVec("MultKernel");
+        runtimeResults.addKernelTimingsVec("DotKernel");
+    }
+    else if(kernelsToBeExecuted == KernelsToRun::NStream)
+    {
+        runtimeResults.addKernelTimingsVec("NStreamKernel");
+    }
+    else if(kernelsToBeExecuted == KernelsToRun::Triad)
+    {
+        runtimeResults.addKernelTimingsVec("TriadKernel");
+    }
+
+    // Init kernel
     measureKernelExec(
         [&]()
         {
@@ -312,149 +401,239 @@
             bufAccInputAPtr,
             bufAccInputBPtr,
             bufAccOutputCPtr,
-            static_cast<DataType>(valA));
+            static_cast<DataType>(initA),
+            static_cast<DataType>(initB),
+            static_cast<DataType>(initC));
         },
         "InitKernel");
 
-    // Test the copy-kernel. Copy A one by one to B.
-    measureKernelExec(
-        [&]() { alpaka::exec<Acc>(queue, workDivCopy, CopyKernel(), bufAccInputAPtr, bufAccInputBPtr); },
-        "CopyKernel");
-
-    // Test the scaling-kernel. Calculate B=scalar*A.
-    measureKernelExec(
-        [&]() { alpaka::exec<Acc>(queue, workDivMult, MultKernel(), bufAccInputAPtr, bufAccInputBPtr); },
-        "MultKernel");
+    // The init kernel runs in every case, therefore add its work division to the metadata unconditionally
+    metaData.setItem(BMInfoDataType::WorkDivInit, workDivInit);
 
-    // Test the addition-kernel. Calculate C=A+B. Where B=scalar*A.
-    measureKernelExec(
-        [&]()
-        { alpaka::exec<Acc>(queue, workDivAdd, AddKernel(), bufAccInputAPtr, bufAccInputBPtr, bufAccOutputCPtr); },
-        "AddKernel");
+    // Dot kernel result
+    DataType resultDot = DataType{0.0};
 
-    // Test the Triad-kernel. Calculate C=A+scalar*B where B=scalar*A.
-    measureKernelExec(
-        [&]()
-        { alpaka::exec<Acc>(queue, workDivTriad, TriadKernel(), bufAccInputAPtr, bufAccInputBPtr, bufAccOutputCPtr); },
-        "TriadKernel");
+    // Main loop which runs the kernel sequence numberOfRuns times
+    for(auto i = 0; i < numberOfRuns; i++)
+    {
+        if(kernelsToBeExecuted == KernelsToRun::All)
+        {
+            // Test the copy-kernel. Copy A one by one to C.
+            measureKernelExec(
+                [&]() { alpaka::exec<Acc>(queue, workDivCopy, CopyKernel(), bufAccInputAPtr, bufAccOutputCPtr); },
+                "CopyKernel");
+
+            // Test the scaling-kernel. Calculate B=scalar*C, where C=A.
+            measureKernelExec(
+                [&]() { alpaka::exec<Acc>(queue, workDivMult, MultKernel(), bufAccInputBPtr, bufAccOutputCPtr); },
+                "MultKernel");
+
+            // Test the addition-kernel. Calculate C=A+B, where B=scalar*A.
+            measureKernelExec(
+                [&]() {
+                    alpaka::exec<Acc>(
+                        queue,
+                        workDivAdd,
+                        AddKernel(),
+                        bufAccInputAPtr,
+                        bufAccInputBPtr,
+                        bufAccOutputCPtr);
+                },
+                "AddKernel");
+        }
+        // The triad kernel is run for 2 of the command line arguments
+        if(kernelsToBeExecuted == KernelsToRun::All || kernelsToBeExecuted == KernelsToRun::Triad)
+        {
+            // Test the Triad-kernel. Calculate A=B+scalar*C, where C is A+scalar*A.
+            measureKernelExec(
+                [&]() {
+                    alpaka::exec<Acc>(
+                        queue,
+                        workDivTriad,
+                        TriadKernel(),
+                        bufAccInputAPtr,
+                        bufAccInputBPtr,
+                        bufAccOutputCPtr);
+                },
+                "TriadKernel");
+        }
+        if(kernelsToBeExecuted == KernelsToRun::All)
+        {
+            // Vector of the sums of each block
+            auto bufAccSumPerBlock = alpaka::allocBuf<DataType, Idx>(devAcc, workDivDot.m_gridBlockExtent[0]);
+            auto bufHostSumPerBlock = alpaka::allocBuf<DataType, Idx>(devHost, workDivDot.m_gridBlockExtent[0]);
+            // Test the dot kernel with a specific block size which is larger than one
+            measureKernelExec(
+                [&]()
+                {
+                    alpaka::exec<Acc>(
+                        queue,
+                        workDivDot,
+                        DotKernel(), // Dot kernel
+                        bufAccInputAPtr,
+                        bufAccInputBPtr,
+                        alpaka::getPtrNative(bufAccSumPerBlock),
+                        static_cast<alpaka::Idx<Acc>>(arraySize));
+                    alpaka::memcpy(queue, bufHostSumPerBlock, bufAccSumPerBlock, workDivDot.m_gridBlockExtent[0]);
+                    alpaka::wait(queue);
+
+                    DataType const* sumPtr = std::data(bufHostSumPerBlock);
+                    resultDot
+                        = static_cast<DataType>(std::reduce(sumPtr, sumPtr + workDivDot.m_gridBlockExtent[0], 0.0));
+                },
+                "DotKernel");
+            // Add the workdiv to the list of workdivs to print later
+            metaData.setItem(BMInfoDataType::WorkDivDot, workDivDot);
+        }
+        // The nstream kernel is run for only one command line argument
+        if(kernelsToBeExecuted == KernelsToRun::NStream)
+        {
+            // Test the NStream-kernel. Calculate A += B + scalar * C;
+            measureKernelExec(
+                [&]() {
+                    alpaka::exec<Acc>(
+                        queue,
+                        workDivNStream,
+                        NstreamKernel(),
+                        bufAccInputAPtr,
+                        bufAccInputBPtr,
+                        bufAccOutputCPtr);
+                },
+                "NStreamKernel");
+        }
+        alpaka::wait(queue);
+    } // End of the MAIN LOOP which runs the kernels many times
 
-    // Copy arrays back to host
-    alpaka::memcpy(queue, bufHostOutputC, bufAccOutputC, arraySize);
-    alpaka::memcpy(queue, bufHostOutputB, bufAccInputB, arraySize);
-    alpaka::memcpy(queue, bufHostOutputA, bufAccInputA, arraySize);
+    // Copy the results back to the host and measure the copy time
+    {
+        auto start = std::chrono::high_resolution_clock::now();
+        // Copy the arrays back to the host since the execution of all kernels except the dot kernel has finished
+        alpaka::memcpy(queue, bufHostOutputC, bufAccOutputC, arraySize);
+        alpaka::memcpy(queue, bufHostOutputB, bufAccInputB, arraySize);
+        alpaka::memcpy(queue, bufHostOutputA, bufAccInputA, arraySize);
+        alpaka::wait(queue);
+        auto end = std::chrono::high_resolution_clock::now();
+        // Get the duration in seconds
+        std::chrono::duration<double> duration = end - start;
+        double copyRuntime = duration.count();
+        metaData.setItem(BMInfoDataType::CopyTimeFromAccToHost, copyRuntime);
+    }
 
-    // Verify the results
     //
-    // Find sum of the errors as sum of the differences from expected values
-    DataType initVal{static_cast<DataType>(0.0)};
-    DataType sumErrC{initVal}, sumErrB{initVal}, sumErrA{initVal};
+    // Result Verification and BW Calculation for 3 cases
+    //
 
-    auto const expectedC = static_cast<DataType>(valA + scalarVal * scalarVal * valA);
-    auto const expectedB = static_cast<DataType>(scalarVal * valA);
-    auto const expectedA = static_cast<DataType>(valA);
+    // Generate the expected values by doing the same chain of operations (so the floating point rounding matches)
+    DataType expectedA = static_cast<DataType>(initA);
+    DataType expectedB = static_cast<DataType>(initB);
+    DataType expectedC = static_cast<DataType>(initC);
 
-    // sum of the errors for each array
-    for(Idx i = 0; i < arraySize; ++i)
+    // Calculate the expected results by applying the same operation sequence on the host
+    calculateBabelstreamExpectedResults(expectedA, expectedB, expectedC);
+
+    // Verify the resulting data, if the kernels are init, copy, mul, add, triad and dot
+    if(kernelsToBeExecuted == KernelsToRun::All)
     {
-        sumErrC += bufHostOutputC[static_cast<size_t>(i)] - expectedC;
-        sumErrB += bufHostOutputB[static_cast<size_t>(i)] - expectedB;
-        sumErrA += bufHostOutputA[static_cast<size_t>(i)] - expectedA;
-    }
+        // Find the sum of the errors as the sum of the differences from the expected values
+        constexpr DataType initVal{static_cast<DataType>(0.0)};
+        DataType sumErrC{initVal}, sumErrB{initVal}, sumErrA{initVal};
 
-    // Normalize and compare sum of the errors
-    REQUIRE(FuzzyEqual(sumErrC / static_cast<DataType>(arraySize) / expectedC, static_cast<DataType>(0.0)));
-    REQUIRE(FuzzyEqual(sumErrB / static_cast<DataType>(arraySize) / expectedB, static_cast<DataType>(0.0)));
-    REQUIRE(FuzzyEqual(sumErrA / static_cast<DataType>(arraySize) / expectedA, static_cast<DataType>(0.0)));
-    alpaka::wait(queue);
+        // sum of the errors for each array
+        for(Idx i = 0; i < arraySize; ++i)
+        {
+            sumErrC += std::fabs(bufHostOutputC[static_cast<size_t>(i)] - expectedC);
+            sumErrB += std::fabs(bufHostOutputB[static_cast<size_t>(i)] - expectedB);
+            sumErrA += std::fabs(bufHostOutputA[static_cast<size_t>(i)] - expectedA);
+        }
 
-    // Test Dot kernel with specific blocksize which is larger than 1
-    if constexpr(alpaka::accMatchesTags)
-    {
-        using WorkDiv = alpaka::WorkDivMembers<Dim, Idx>;
-        // Threads per block for Dot kernel
-        constexpr Idx blockThreadExtent = blockThreadExtentMain;
-        // Blocks per grid for Dot kernel
-        constexpr Idx gridBlockExtent = static_cast<Idx>(256);
-        // Vector of sums of each block
-        auto bufAccSumPerBlock = alpaka::allocBuf<DataType, Idx>(devAcc, gridBlockExtent);
-        auto bufHostSumPerBlock = alpaka::allocBuf<DataType, Idx>(devHost, gridBlockExtent);
-        // A specific work-division is used for dotKernel
-        auto const workDivDot = WorkDiv{Vec{gridBlockExtent}, Vec{blockThreadExtent}, Vec::all(1)};
-
-        measureKernelExec(
-            [&]()
-            {
-                alpaka::exec<Acc>(
-                    queue,
-                    workDivDot,
-                    DotKernel(), // Dot kernel
-                    alpaka::getPtrNative(bufAccInputA),
-                    alpaka::getPtrNative(bufAccInputB),
-                    alpaka::getPtrNative(bufAccSumPerBlock),
-                    static_cast<alpaka::Idx<Acc>>(arraySize));
-            },
-            "DotKernel");
-
-        alpaka::memcpy(queue, bufHostSumPerBlock, bufAccSumPerBlock, gridBlockExtent);
+        // Normalize and compare the sum of the errors
+        // Use a different equality check if floating point errors exceed the precision of the FuzzyEqual function
+        REQUIRE(FuzzyEqual(sumErrC / static_cast<DataType>(arraySize), static_cast<DataType>(0.0)));
+        REQUIRE(FuzzyEqual(sumErrB / static_cast<DataType>(arraySize), static_cast<DataType>(0.0)));
+        REQUIRE(FuzzyEqual(sumErrA / static_cast<DataType>(arraySize), static_cast<DataType>(0.0)));
         alpaka::wait(queue);
 
-        DataType const* sumPtr = std::data(bufHostSumPerBlock);
-        auto const result = std::reduce(sumPtr, sumPtr + gridBlockExtent, DataType{0});
-        // Since vector values are 1, dot product should be identical to arraySize
-        REQUIRE(FuzzyEqual(static_cast<DataType>(result), static_cast<DataType>(arraySize * 2)));
-        // Add workdiv to the list of workdivs to print later
-        metaData.setItem(BMInfoDataType::WorkDivDot, workDivDot);
+        // Verify the dot kernel
+        DataType const expectedSum = static_cast<DataType>(arraySize) * expectedA * expectedB;
+        // The dot product should be identical to arraySize * expectedA * expectedB
+        // Use a different equality check if floating point errors exceed the precision of the FuzzyEqual function
+        REQUIRE(FuzzyEqual(static_cast<float>(std::fabs(resultDot - expectedSum) / expectedSum), 0.0f));
+
+        // Set the workdivs of the benchmark metadata to be displayed at the end
+        metaData.setItem(BMInfoDataType::WorkDivInit, workDivInit);
+        metaData.setItem(BMInfoDataType::WorkDivCopy, workDivCopy);
+        metaData.setItem(BMInfoDataType::WorkDivAdd, workDivAdd);
+        metaData.setItem(BMInfoDataType::WorkDivMult, workDivMult);
+        metaData.setItem(BMInfoDataType::WorkDivTriad, workDivTriad);
     }
+    // Verify the Triad kernel result if "--run-kernels=triad".
+    else if(kernelsToBeExecuted == KernelsToRun::Triad)
+    {
+        // Verify triad by summing the error
+        auto sumErrA = static_cast<DataType>(0.0);
+        // sum of the errors for each array
+        for(Idx i = 0; i < arraySize; ++i)
+        {
+            sumErrA += std::fabs(bufHostOutputA[static_cast<size_t>(i)] - expectedA);
+        }
+
+        REQUIRE(FuzzyEqual(sumErrA / static_cast<DataType>(arraySize) / expectedA, static_cast<DataType>(0.0)));
+        metaData.setItem(BMInfoDataType::WorkDivTriad, workDivTriad);
+    }
+    // Verify the NStream kernel result if "--run-kernels=nstream".
+    else if(kernelsToBeExecuted == KernelsToRun::NStream)
+    {
+        auto sumErrA = static_cast<DataType>(0.0);
+        // sum of the errors for each array
+        for(Idx i = 0; i < arraySize; ++i)
+        {
+            sumErrA += std::fabs(bufHostOutputA[static_cast<size_t>(i)] - expectedA);
+        }
+        REQUIRE(FuzzyEqual(sumErrA / static_cast<DataType>(arraySize) / expectedA, static_cast<DataType>(0.0)));
+
+        metaData.setItem(BMInfoDataType::WorkDivNStream, workDivNStream);
+    }
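Why the expectedSum formula holds: after the run loop every element of a and of b carries one common value, so

    dot(a, b) = sum(a[i] * b[i], i = 0..N-1) = N * expectedA * expectedB

For example, with one All iteration of the recurrence above (expectedA = 0.096, expectedB = 0.04) and N = 2^25, the expected dot product is 33554432 * 0.096 * 0.04 = 128849.01888 (up to floating point rounding).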
"--run-kernels=triad". + else if(kernelsToBeExecuted == KernelsToRun::Triad) + { + // Verify triad by summing the error + auto sumErrA = static_cast(0.0); + // sum of the errors for each array + for(Idx i = 0; i < arraySize; ++i) + { + sumErrA += std::fabs(bufHostOutputA[static_cast(i)] - expectedA); + } - - // - // Calculate and Display Benchmark Results - // - std::vector bytesReadWriteMB = { - getDataThroughput(2u, static_cast(arraySize)), - getDataThroughput(2u, static_cast(arraySize)), - getDataThroughput(2u, static_cast(arraySize)), - getDataThroughput(3u, static_cast(arraySize)), - getDataThroughput(3u, static_cast(arraySize)), - getDataThroughput(2u, static_cast(arraySize)), - }; - - // calculate the bandwidth as throughput per seconds - std::vector bandwidthsPerKernel; - if(minExecTimesOfKernels.size() == kernelLabels.size()) + REQUIRE(FuzzyEqual(sumErrA / static_cast(arraySize) / expectedA, static_cast(0.0))); + metaData.setItem(BMInfoDataType::WorkDivTriad, workDivTriad); + } + // Verify the NStream Kernel result if "--run-kernels=nstream". + else if(kernelsToBeExecuted == KernelsToRun::NStream) { - for(size_t i = 0; i < minExecTimesOfKernels.size(); ++i) + auto sumErrA = static_cast(0.0); + // sum of the errors for each array + for(Idx i = 0; i < arraySize; ++i) { - bandwidthsPerKernel.push_back(calculateBandwidth(bytesReadWriteMB.at(i), minExecTimesOfKernels.at(i))); + sumErrA += std::fabs(bufHostOutputA[static_cast(i)] - expectedA); } + REQUIRE(FuzzyEqual(sumErrA / static_cast(arraySize) / expectedA, static_cast(0.0))); + + metaData.setItem(BMInfoDataType::WorkDivNStream, workDivNStream); } - // Setting fields of Benchmark Info map. All information about benchmark and results are stored in a single map + // Runtime results of the benchmark: Calculate throughput and bandwidth + // Set throuput values depending on the kernels + runtimeResults.initializeByteReadWrite(arraySize); + runtimeResults.calculateBandwidthsForKernels(); + + // Set metadata to display all benchmark related information. 
+    //
+    // All information about the benchmark and its results is stored in a single map
     metaData.setItem(BMInfoDataType::TimeStamp, getCurrentTimestamp());
     metaData.setItem(BMInfoDataType::NumRuns, std::to_string(numberOfRuns));
     metaData.setItem(BMInfoDataType::DataSize, std::to_string(arraySizeMain));
     metaData.setItem(BMInfoDataType::DataType, dataTypeStr);
-
-    metaData.setItem(BMInfoDataType::WorkDivInit, workDivInit);
-    metaData.setItem(BMInfoDataType::WorkDivCopy, workDivCopy);
-    metaData.setItem(BMInfoDataType::WorkDivAdd, workDivAdd);
-    metaData.setItem(BMInfoDataType::WorkDivMult, workDivMult);
-    metaData.setItem(BMInfoDataType::WorkDivTriad, workDivTriad);
-
     // Device and accelerator
     metaData.setItem(BMInfoDataType::DeviceName, alpaka::getName(devAcc));
     metaData.setItem(BMInfoDataType::AcceleratorType, alpaka::getAccName<Acc>());
     // XML reporter of catch2 always converts to Nano Seconds
     metaData.setItem(BMInfoDataType::TimeUnit, "Nano Seconds");
 
-    // Join elements and create a comma separated string
-    metaData.setItem(BMInfoDataType::KernelNames, joinElements(kernelLabels, ", "));
-    metaData.setItem(BMInfoDataType::KernelDataUsageValues, joinElements(bytesReadWriteMB, ", "));
-    metaData.setItem(BMInfoDataType::KernelBandwidths, joinElements(bandwidthsPerKernel, ", "));
-    metaData.setItem(BMInfoDataType::KernelMinTimes, joinElements(minExecTimesOfKernels, ", "));
-    metaData.setItem(BMInfoDataType::KernelMaxTimes, joinElements(maxExecTimesOfKernels, ", "));
-    metaData.setItem(BMInfoDataType::KernelAvgTimes, joinElements(avgExecTimesOfKernels, ", "));
+    // Get the kernel labels from the map
+    std::vector<std::string> kernelLabels;
+    std::transform(
+        runtimeResults.kernelToRundataMap.begin(),
+        runtimeResults.kernelToRundataMap.end(),
+        std::back_inserter(kernelLabels),
+        [](auto const& pair) { return pair.first; });
+    // Join the elements, create a comma separated string and set the item
+    metaData.setItem(BMInfoDataType::KernelNames, joinElements(kernelLabels, ", "));
+    std::vector<double> values(runtimeResults.getThroughputKernelArray());
+    metaData.setItem(BMInfoDataType::KernelDataUsageValues, joinElements(values, ", "));
+    std::vector<double> valuesBW(runtimeResults.getBandwidthKernelVec());
+    metaData.setItem(BMInfoDataType::KernelBandwidths, joinElements(valuesBW, ", "));
+
+    metaData.setItem(BMInfoDataType::KernelMinTimes, joinElements(runtimeResults.getMinExecTimeKernelArray(), ", "));
+    metaData.setItem(BMInfoDataType::KernelMaxTimes, joinElements(runtimeResults.getMaxExecTimeKernelArray(), ", "));
+    metaData.setItem(BMInfoDataType::KernelAvgTimes, joinElements(runtimeResults.getAvgExecTimeKernelArray(), ", "));
     // Print the summary as a table, if a standard serialization is needed other functions of the class can be used
     std::cout << metaData.serializeAsTable() << std::endl;
 }
 
@@ -462,7 +641,7 @@ void testKernels()
 using TestAccs1D = alpaka::test::EnabledAccs<alpaka::DimInt<1u>, std::uint32_t>;
 
 // Run for all Accs given by the argument
-TEMPLATE_LIST_TEST_CASE("TEST: Babelstream Five Kernels", "[benchmark-test]", TestAccs1D)
+TEMPLATE_LIST_TEST_CASE("TEST: Babelstream Kernels", "[benchmark-test]", TestAccs1D)
 {
     using Acc = TestType;
     // Run tests for the float data type
@@ -470,7 +649,7 @@ TEMPLATE_LIST_TEST_CASE("TEST: Babelstream Five Kernels", "[benchmark-tes
 }
 
 // Run for all Accs given by the argument
-TEMPLATE_LIST_TEST_CASE("TEST: Babelstream Five Kernels", "[benchmark-test]", TestAccs1D)
+TEMPLATE_LIST_TEST_CASE("TEST: Babelstream Kernels", "[benchmark-test]", TestAccs1D) { using Acc = TestType; // Run tests for the double data type