make kernels depend on each other, use original variables and access order
mehmetyusufoglu committed Nov 17, 2024
1 parent 8fefd70 commit 09d5ff9
Showing 2 changed files with 544 additions and 219 deletions.
267 changes: 224 additions & 43 deletions benchmarks/babelstream/src/babelStreamCommon.hpp
Lines prefixed with "- " below were removed by this commit; unprefixed lines show the file after the change.
@@ -8,7 +8,9 @@
#include <iostream>
#include <limits>
#include <map>
#include <memory>
#include <numeric>
#include <ranges>
#include <sstream>
#include <stdexcept>
#include <string>
@@ -28,55 +30,106 @@ namespace
[[maybe_unused]] constexpr auto minArrSize = 1024 * 128;

// Scalar value for Mul and Triad kernel parameters.
- [[maybe_unused]] constexpr auto scalarVal = 2.0f;
[[maybe_unused]] constexpr double scalarVal = 0.4;

// Block thread extent for DotKernel test work division parameters.
[[maybe_unused]] constexpr auto blockThreadExtentMain = 1024;
[[maybe_unused]] constexpr auto dotGridBlockExtent = 256;
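Assuming the common one-element-per-thread mapping (an assumption for illustration; the actual work divisions are set up elsewhere in the benchmark sources), these constants translate into a grid size as follows. The names exampleArraySize and exampleGridBlocks are hypothetical:
constexpr auto exampleArraySize = 32 * 1024 * 1024; // 33554432 items, a multiple of the block size
constexpr auto exampleGridBlocks = exampleArraySize / blockThreadExtentMain; // 32768 blocks of 1024 threads each
static_assert(exampleArraySize % blockThreadExtentMain == 0, "array size must be a multiple of the block size");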

// Number of runs for each kernel; can be changed via command-line arguments.
// At least 100 runs are recommended for good benchmarking.
// To prevent timeouts in CI, a small value is used.
[[maybe_unused]] auto numberOfRuns = 2;

- // Data input value for babelstream.
- [[maybe_unused]] constexpr auto valA = 1.0f;
// Data input values for babelstream.
[[maybe_unused]] constexpr double initA = 0.1;
[[maybe_unused]] constexpr double initB = 0.2;
// Change this value if the triad kernel is going to be run alone.
[[maybe_unused]] constexpr double initC = 0.0;
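For reference, the classic BabelStream kernels chain these start values together; the sketch below uses the standard BabelStream recurrences (assumed here rather than taken from this diff) and models one benchmark iteration on scalars instead of arrays:
// One iteration of the chained kernels, modelled element-wise:
//   Copy: c = a;   Mul: b = scalar * c;   Add: c = a + b;   Triad: a = b + scalar * c
// Dot reduces sum += a * b, and NStream updates a += b + scalar * c.
inline void modelOneIteration(double& a, double& b, double& c, double scalar)
{
    c = a;
    b = scalar * c;
    c = a + b;
    a = b + scalar * c;
}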

//! Values corresponding to the command line argument run-kernels
enum class KernelsToRun
{
All, // init, add, copy, mul, triad, dot
Triad, // only init and triad
NStream // only init and nstream
};

// Define the variable showing the kernel(s) being run
[[maybe_unused]] KernelsToRun kernelsToBeExecuted{KernelsToRun::All};

//! handleCustomArguments extracts the custom command-line arguments from the full argument list.
//! Namely, it consumes --array-size=1234, --number-runs=1234 and --run-kernels=<all|triad|nstream>
//! and keeps the remaining arguments, which are the command-line args for the Catch2 session.
[[maybe_unused]] static void handleCustomArguments(int& argc, char* argv[])
{
- std::vector<char*> newArgv;
- newArgv.push_back(argv[0]); // Keep the program name
std::vector<char*> newArgv({argv[0]}); // keep program name

for(int i = 1; i < argc; ++i)
{
std::string arg = argv[i];
if(arg.rfind("--array-size=", 0) == 0)
{
- auto const arrSize = std::stoi(arg.substr(13)); // Convert to integer
- if(arrSize > minArrSize)
- arraySizeMain = arrSize;
- std::cout << "Array size provided(items): " << arraySizeMain << std::endl;
- else
- std::cout << "Too small array size given. Must be at least " << minArrSize << std::endl;
- std::cout << "Using default array size(number of items): " << arraySizeMain << std::endl;
try
{
// Convert argument to integer
auto arrSize = std::stoi(arg.substr(13));
if(arrSize > minArrSize)
{
arraySizeMain = arrSize;
std::cout << "Array size set to: " << arraySizeMain << std::endl;
}
else
{
std::cout << "Array size too small. Must be at least " << minArrSize
<< ", using default: " << arraySizeMain << std::endl;
}
}
catch(std::invalid_argument const&)
{
std::cerr << "Invalid array size argument: " << arg << ". Default value used." << std::endl;
}
}
else if(arg.rfind("--number-runs=", 0) == 0)
{
- auto const numRuns = std::stoi(arg.substr(14)); // Convert to integer
- if(numRuns > 0)
- numberOfRuns = numRuns;
- std::cout << "Number of runs provided: " << numberOfRuns << std::endl;
- else
- std::cout << "Using default number of runs: " << numberOfRuns << std::endl;
try
{
// Convert argument to integer
auto const numRuns = std::stoi(arg.substr(14));
if(numRuns > 0)
{
numberOfRuns = numRuns;
std::cout << "Number of runs provided: " << numberOfRuns << std::endl;
}
else
{
std::cout << "Using default number of runs: " << numberOfRuns << std::endl;
}
}
catch(std::invalid_argument const&)
{
std::cerr << "Invalid number of runs argument: " << arg << ". Default value used." << std::endl;
}
}
else if(arg.rfind("--run-kernels=", 0) == 0)
{
// Get argument to determine which kernels will be run
auto const kernelsString = arg.substr(14);
if(kernelsString == "nstream")
{
std::cout << "Only nstream kernel will be executed." << std::endl;
kernelsToBeExecuted = KernelsToRun::NStream;
}
else if(kernelsString == "triad")
{
kernelsToBeExecuted = KernelsToRun::Triad;
std::cout << "Only triad kernel will be executed." << std::endl;
}
else if(kernelsString == "all")
{
// The default value of kernelsToBeExecuted is already KernelsToRun::All
kernelsToBeExecuted = KernelsToRun::All;
std::cout << "All 5 babelstream kernels are going to be executed." << std::endl;
}
}
}
else
@@ -87,7 +140,11 @@
if(arg.rfind("-?", 0) == 0 || arg.rfind("--help", 0) == 0 || arg.rfind("-h", 0) == 0)
{
std::cout << "Usage of custom arguments (arguments which are not Catch2): --array-size=33554432 and "
- "--number-runs=100"
"--number-runs=100\n"
<< std::endl;
std::cout << "If you want to run only the nstream kernel or the triad kernel, use --run-kernels=nstream or "
"--run-kernels=triad. Otherwise all 5 standard kernels will be executed: Copy, Mul, Add, "
"Triad (and the Dot kernel, if a multi-threaded acc is used)."
<< std::endl;
}
}
@@ -98,6 +155,12 @@
{
argv[i] = newArgv[static_cast<size_t>(i)];
}

// Array size must be a multiple of the block thread extent
if(arraySizeMain % blockThreadExtentMain != 0)
throw std::runtime_error(
"Array size is " + std::to_string(arraySizeMain) + ". It must be a multiple of block-size "
+ std::to_string(blockThreadExtentMain));
}
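As a usage illustration (the binary name, function name exampleMain and argument values below are examples, not taken from this commit), the parser above is meant to be called on the raw argc/argv before handing the remaining options to Catch2:
// e.g. ./babelstream --array-size=33554432 --number-runs=100 --run-kernels=triad [Catch2 options...]
inline int exampleMain(int argc, char* argv[])
{
    handleCustomArguments(argc, argv); // consumes the custom flags, keeps the Catch2 arguments
    // ... pass argc/argv on to the Catch2 session here ...
    return 0;
}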

//! FuzzyEqual compares two floating-point or integral type values.
@@ -111,7 +174,7 @@
{
if constexpr(std::is_floating_point_v<T>)
{
- return std::fabs(a - b) < std::numeric_limits<T>::epsilon() * static_cast<T>(100.0);
return std::fabs(a - b) < (std::numeric_limits<T>::epsilon() * static_cast<T>(100.0));
}
else if constexpr(std::is_integral_v<T>)
{
@@ -219,6 +282,7 @@
WorkDivTriad,
WorkDivMult,
WorkDivDot,
WorkDivNStream,
DeviceName,
TimeUnit,
KernelNames,
@@ -279,6 +343,8 @@
return "WorkDivMult ";
case BMInfoDataType::WorkDivDot:
return "WorkDivDot ";
case BMInfoDataType::WorkDivNStream:
return "WorkDivNStream";
default:
return "";
}
@@ -314,11 +380,126 @@
return bytesReadWriteGB / static_cast<double>(runTimeSeconds);
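As a worked example of this bandwidth bookkeeping (the numbers are invented for illustration): for the Copy kernel on double data with 33554432 elements, one array is read and one is written per run, so about 2 * 33554432 * 8 B ≈ 0.54 GB move; at a 0.005 s runtime that is roughly 107 GB/s.
// Hand-computed sketch of the same arithmetic (values are illustrative only).
constexpr double exampleBytesGB = 2.0 * 33554432.0 * sizeof(double) / 1.0e9; // ~0.537 GB per run
constexpr double exampleRunTimeSec = 0.005;
constexpr double exampleBandwidth = exampleBytesGB / exampleRunTimeSec; // ~107 GB/s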
}

/**
* \brief The RuntimeResults class bundles the kernel runtime data in a map.
* The keys of the map are kernel names; the values are unique_ptrs to KernelRunData structs.
*/
class RuntimeResults
{
struct KernelRunData
{
std::vector<double> timingsSuccessiveRuns; // Stores execution timings of successive runs
double byteReadWriteMB{0}; // Bytes read/write in MB
double bandwidthKernel{0}; // Bandwidth of kernel
double minExecTime{0}; // Minimum execution time
double maxExecTime{0}; // Maximum execution time
double avgExecTime{0}; // Average execution time
};

public:
// Map from kernelName (string) to a unique_ptr for KernelRunData
// Using unique_ptr for automatic memory management
std::map<std::string, std::unique_ptr<KernelRunData>> kernelToRundataMap;

// Function to initialize the byteReadWriteMB field for each kernel
template<typename DataType>
void initializeByteReadWrite(size_t arraySize)
{
// Define kernel names and their throughput values based on the provided array size
std::map<std::string, double> throughputValues
= {{"CopyKernel", getDataThroughput<DataType>(2u, static_cast<unsigned>(arraySize))},
{"MultKernel", getDataThroughput<DataType>(2u, static_cast<unsigned>(arraySize))},
{"AddKernel", getDataThroughput<DataType>(3u, static_cast<unsigned>(arraySize))},
{"TriadKernel", getDataThroughput<DataType>(3u, static_cast<unsigned>(arraySize))},
{"DotKernel", getDataThroughput<DataType>(2u, static_cast<unsigned>(arraySize))},
{"NStreamKernel", getDataThroughput<DataType>(2u, static_cast<unsigned>(arraySize))}};

// Populate each KernelRunData entry in kernelToRundataMap
for(auto const& [kernelName, throughput] : throughputValues)
{
// Check if the kernel name exists in the map
if(kernelToRundataMap.find(kernelName) != kernelToRundataMap.end())
{
// Set the byteReadWriteMB field in the corresponding KernelRunData
kernelToRundataMap[kernelName]->byteReadWriteMB = throughput;
}
}
}

// Function to calculate bandwidth for each kernel and update execution times
template<typename DataType>
void calculateBandwidthsForKernels()
{
for(auto const& [kernelName, kernelData] : kernelToRundataMap)
{
// Calculate min and max execution times
auto const minmaxPair = findMinMax(kernelData->timingsSuccessiveRuns);
kernelData->minExecTime = minmaxPair.first;
kernelData->maxExecTime = minmaxPair.second;

// Calculate average execution time
kernelData->avgExecTime = findAverage(kernelData->timingsSuccessiveRuns);

// Calculate bandwidth based on byteReadWriteMB and min execution time
kernelData->bandwidthKernel = calculateBandwidth(kernelData->byteReadWriteMB, minmaxPair.first);
}
}

// Get item from each struct in the map and make a vector
template<typename Func>
std::vector<double> getItemFromStructs(Func accessor) const noexcept
{
std::vector<double> results;
for(auto const& [key, dataStruct] : kernelToRundataMap)
{
results.push_back(accessor(dataStruct.get())); // Pass the raw pointer obtained from the unique_ptr via .get()
}
return results;
}

// Functions to retrieve specific fields as vectors for all kernels
std::vector<double> getBandwidthKernelVec() const noexcept
{
return getItemFromStructs([](KernelRunData const* dataStruct) { return dataStruct->bandwidthKernel; });
}

std::vector<double> getThroughputKernelArray() const noexcept
{
return getItemFromStructs([](KernelRunData const* dataStruct) { return dataStruct->byteReadWriteMB; });
}

std::vector<double> getAvgExecTimeKernelArray() const noexcept
{
return getItemFromStructs([](KernelRunData const* dataStruct) { return dataStruct->avgExecTime; });
}

std::vector<double> getMinExecTimeKernelArray() const noexcept
{
return getItemFromStructs([](KernelRunData const* dataStruct) { return dataStruct->minExecTime; });
}

std::vector<double> getMaxExecTimeKernelArray() const noexcept
{
return getItemFromStructs([](KernelRunData const* dataStruct) { return dataStruct->maxExecTime; });
}

// Function to add a kernelName-timesForRuns pair by storing a unique_ptr to KernelRunData
void addKernelTimingsVec(std::string const& kernelName) noexcept
{
// Use make_unique to create a new KernelRunData object and store it in the map
kernelToRundataMap[kernelName] = std::make_unique<KernelRunData>();
}
};
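A minimal usage sketch of RuntimeResults (the kernel names match the map keys used in initializeByteReadWrite; the timing values and the function name exampleRuntimeResultsUsage are invented):
inline void exampleRuntimeResultsUsage(size_t arraySize)
{
    RuntimeResults results;
    results.addKernelTimingsVec("CopyKernel");
    results.addKernelTimingsVec("TriadKernel");
    // In the benchmark these timings would come from the measured kernel runs.
    results.kernelToRundataMap["CopyKernel"]->timingsSuccessiveRuns = {0.0051, 0.0049, 0.0050};
    results.kernelToRundataMap["TriadKernel"]->timingsSuccessiveRuns = {0.0076, 0.0074, 0.0075};
    results.initializeByteReadWrite<double>(arraySize); // fills byteReadWriteMB per registered kernel
    results.calculateBandwidthsForKernels<double>(); // fills min/max/avg times and bandwidth
    auto const bandwidths = results.getBandwidthKernelVec();
    auto const avgTimes = results.getAvgExecTimeKernelArray();
    static_cast<void>(bandwidths);
    static_cast<void>(avgTimes);
}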

//! MetaData class to store and serialize benchmark information.
//! \details The MetaData class includes a single map to keep all benchmark information and provides serialization
//! methods for generating output.
- class MetaData
class BenchmarkMetaData
{
private:
// Information type to string. String can be comma separated values.
std::map<BMInfoDataType, std::string> metaDataMap;

public:
//! setItem Sets an item in the metadata map.
//! \tparam T The type of the value to store.
@@ -353,29 +534,31 @@
{
std::stringstream ss;
// Define a lambda to add values from the metadata map to the already-created string stream
- auto addItemValue = [&, this](BMInfoDataType item) {
- ss << "\n" << typeToTypeStr(item) << ":" << metaDataMap.at(item);
auto addItemValueToSS = [&, this](BMInfoDataType item)
{
if(metaDataMap.count(item) != 0)
ss << "\n" << typeToTypeStr(item) << ":" << metaDataMap.at(item);
};

- // Initially chose some data to serialize
- addItemValue(BMInfoDataType::AcceleratorType);
- addItemValue(BMInfoDataType::NumRuns);
- addItemValue(BMInfoDataType::DataType);
- addItemValue(BMInfoDataType::DataSize);
- addItemValue(BMInfoDataType::DeviceName);
- addItemValue(BMInfoDataType::WorkDivInit);
- addItemValue(BMInfoDataType::WorkDivCopy);
- addItemValue(BMInfoDataType::WorkDivMult);
- addItemValue(BMInfoDataType::WorkDivAdd);
- addItemValue(BMInfoDataType::WorkDivTriad);
- if(metaDataMap.count(BMInfoDataType::WorkDivDot) != 0)
- addItemValue(BMInfoDataType::WorkDivDot);
// Initially choose some data to serialize from the meta-data map to add to string stream
ss << "\n";
addItemValueToSS(BMInfoDataType::AcceleratorType);
addItemValueToSS(BMInfoDataType::NumRuns);
addItemValueToSS(BMInfoDataType::DataType);
addItemValueToSS(BMInfoDataType::DataSize);
addItemValueToSS(BMInfoDataType::DeviceName);
addItemValueToSS(BMInfoDataType::WorkDivInit);
addItemValueToSS(BMInfoDataType::WorkDivCopy);
addItemValueToSS(BMInfoDataType::WorkDivMult);
addItemValueToSS(BMInfoDataType::WorkDivAdd);
addItemValueToSS(BMInfoDataType::WorkDivTriad);
addItemValueToSS(BMInfoDataType::WorkDivDot);
addItemValueToSS(BMInfoDataType::WorkDivNStream);

// If the item is a string of delimiter-separated values, get the item and then return the value at the given index
auto getItemFromStrList = [this](BMInfoDataType item, int index) -> std::string
{
std::string const str = metaDataMap.at(item);

if(index < 1)
{
throw std::invalid_argument("Index must be 1 or greater.");
Expand Down Expand Up @@ -405,7 +588,7 @@ namespace
throw std::out_of_range("Index out of range");
};

- // Prepare Table
// Prepare Table Display
// Table column names
ss << std::endl;
ss << std::left << std::setw(15) << typeToTypeStr(BMInfoDataType::KernelNames) << " " << std::left
@@ -421,7 +604,7 @@
// Table rows. Print test results for each kernel line by line
for(auto i = 1; i <= numberOfKernels; i++)
{
- // Print the row for the kernel i
// Print the row for the kernel i.
ss << " " << std::left << std::setw(15) << getItemFromStrList(BMInfoDataType::KernelNames, i) << " ";
ss << std::left << std::setw(15) << getItemFromStrList(BMInfoDataType::KernelBandwidths, i) << " ";
ss << std::left << std::setw(8) << getItemFromStrList(BMInfoDataType::KernelMinTimes, i) << " ";
@@ -433,8 +616,6 @@

return ss.str();
}

- private:
- std::map<BMInfoDataType, std::string> metaDataMap;
};
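And a hypothetical sketch of how the metadata side is meant to be fed (setItem's exact signature and the serialization call are not visible in this hunk, so the calls, the accelerator name string and the function name exampleMetaDataUsage below are assumptions):
inline void exampleMetaDataUsage()
{
    BenchmarkMetaData metaData;
    // Assumed call pattern: one enum key plus a value convertible to string.
    metaData.setItem(BMInfoDataType::AcceleratorType, std::string("AccCpuThreads"));
    metaData.setItem(BMInfoDataType::NumRuns, std::to_string(numberOfRuns));
    metaData.setItem(BMInfoDataType::DataType, std::string("double"));
    metaData.setItem(BMInfoDataType::DataSize, std::to_string(arraySizeMain));
    // The table-building member shown above then serializes the map into the printed report.
}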

} // namespace
