Add additional tests for isolating issues on Sunspot #868

Draft: wants to merge 43 commits into base: main
Commits (43), all by pvelesko:
5e4e1df  add callback test for opencl (Jun 10, 2024)
2d3067b  add l0 (Jun 10, 2024)
3c74adf  wip (Jun 10, 2024)
c40f7b5  flesh out level zero test (Jun 11, 2024)
4abf59c  add LevelZeroStream.cpp (Jun 11, 2024)
997c0ec  stream update (Jun 11, 2024)
77fa675  Stream more log (Jun 11, 2024)
a712287  LevelZeroStream.cpp (Jun 11, 2024)
709181b  update stream repro for multiple GPUs (Jun 12, 2024)
80f1eaf  fix LevelZeroStream test (Jun 13, 2024)
0c6ab97  add streamHang (Jun 13, 2024)
56f8135  add streamBroken.cpp (Jun 14, 2024)
98673ee  update LevelZeroStream (Jun 14, 2024)
bc41cd8  wip (Jun 14, 2024)
bae90fc  wip (Jun 14, 2024)
d2397e1  wip (Jun 14, 2024)
beb3a61  add First Touch Level Zero test (Jun 14, 2024)
007ceb8  working l0 first touch test (Jun 14, 2024)
9406454  copy inputs to build dir (Jun 14, 2024)
e0e1aac  add input, cleanup (Jun 15, 2024)
340f1c4  cleanup (Jun 15, 2024)
3faef8e  exclude LevelZeroStream (Jun 15, 2024)
b48d132  refactor LeveLZeroFirstTouch (Jun 18, 2024)
ff67345  use ZE_DEVICE_MEM_ALLOC_FLAG_BIAS_CACHED (Jun 18, 2024)
6c7de6d  move LevelZeroStream join move (Jun 18, 2024)
16eea83  wip (Jun 18, 2024)
367789b  wip (Jun 18, 2024)
daf8d25  wip (Jun 18, 2024)
4b156e3  wip (Jun 18, 2024)
6683ac2  stream WIP (Jun 18, 2024)
7982070  stream hacking (Jun 19, 2024)
b6e2cfc  wup (Jun 19, 2024)
00d80ce  wip (Jun 19, 2024)
912cf19  wip (Jun 19, 2024)
4e30392  wip (Jun 19, 2024)
d6d1908  wip (Jun 19, 2024)
2988c7a  wip (Jun 19, 2024)
9820e32  wip (Jun 19, 2024)
cb2bdc7  wip (Jun 19, 2024)
17d9606  wip (Jun 19, 2024)
21a8c1f  fixup firstTouch (Jun 19, 2024)
94f95b5  wip (Jun 19, 2024)
31e86f4  wip (Jun 19, 2024)
159 changes: 19 additions & 140 deletions samples/7_streams/stream.cpp
@@ -23,156 +23,35 @@ THE SOFTWARE.

#include "hip/hip_runtime.h"

#define WIDTH 32

#define NUM (WIDTH * WIDTH)

#define THREADS_PER_BLOCK_X 4
#define THREADS_PER_BLOCK_Y 4
#define THREADS_PER_BLOCK_Z 1
// #define NUM 256 // pass
#define NUM 257 // hangs

using namespace std;

int StreamCount = 0;
std::mutex GlobalMtx;

int id1, id2;
void TestCallback(hipStream_t stream, hipError_t status, void* userData) {
float* TransposeData = (float* )userData;
for (int i = 0; i < NUM; i ++)
TransposeData[i] += 1.0f;

GlobalMtx.lock();
StreamCount ++;
GlobalMtx.unlock();

// std::cout << "Invoke CALLBACK " << TransposeData[0] << std::endl;

// return 0;
}

__global__ void matrixTranspose_static_shared(float* out, float* in,
const int width) {
__shared__ float sharedMem[WIDTH * WIDTH];

int x = hipBlockDim_x * hipBlockIdx_x + hipThreadIdx_x;
int y = hipBlockDim_y * hipBlockIdx_y + hipThreadIdx_y;

sharedMem[y * width + x] = in[x * width + y];

__syncthreads();

out[y * width + x] = sharedMem[y * width + x];
}

__global__ void matrixTranspose_dynamic_shared(float* out, float* in,
const int width) {
// declare dynamic shared memory
HIP_DYNAMIC_SHARED(float, sharedMem)

int x = hipBlockDim_x * hipBlockIdx_x + hipThreadIdx_x;
int y = hipBlockDim_y * hipBlockIdx_y + hipThreadIdx_y;

sharedMem[y * width + x] = in[x * width + y];

__syncthreads();

out[y * width + x] = sharedMem[y * width + x];
std::cout << "Invoke CALLBACK " << std::endl;
}

void MultipleStream(float** data, float* randArray, float** gpuTransposeMatrix,
float** TransposeMatrix, int width) {
const int num_streams = 2;
hipStream_t streams[num_streams];

for (int i = 0; i < num_streams; i++) hipStreamCreate(&streams[i]);

for (int i = 0; i < num_streams; i++) {
hipMalloc((void**)&data[i], NUM * sizeof(float));
hipMemcpyAsync(data[i], randArray, NUM * sizeof(float), hipMemcpyHostToDevice, streams[i]);
}

hipLaunchKernelGGL(matrixTranspose_static_shared,
dim3(WIDTH / THREADS_PER_BLOCK_X, WIDTH / THREADS_PER_BLOCK_Y),
dim3(THREADS_PER_BLOCK_X, THREADS_PER_BLOCK_Y), 0, streams[0],
gpuTransposeMatrix[0], data[0], width);

hipLaunchKernelGGL(matrixTranspose_static_shared,
dim3(WIDTH / THREADS_PER_BLOCK_X, WIDTH / THREADS_PER_BLOCK_Y),
dim3(THREADS_PER_BLOCK_X, THREADS_PER_BLOCK_Y), 0, streams[1],
gpuTransposeMatrix[1], data[1], width);

/*
hipLaunchKernelGGL(matrixTranspose_dynamic_shared,
dim3(WIDTH / THREADS_PER_BLOCK_X, WIDTH / THREADS_PER_BLOCK_Y),
dim3(THREADS_PER_BLOCK_X, THREADS_PER_BLOCK_Y), sizeof(float) * WIDTH * WIDTH,
streams[1], gpuTransposeMatrix[1], data[1], width);
*/
for (int i = 0; i < num_streams; i++)
hipMemcpyAsync(TransposeMatrix[i], gpuTransposeMatrix[i], NUM * sizeof(float),
hipMemcpyDeviceToHost, streams[i]);

// id1 = 0;
// id2 = 1;
hipStreamAddCallback(streams[0], TestCallback, (void* )TransposeMatrix[0], 0);
hipStreamAddCallback(streams[1], TestCallback, (void* )TransposeMatrix[1], 0);
}
//hipMemcpy(TransposeMatrix, gpuTransposeMatrix, NUM * sizeof(float),
// hipMemcpyDeviceToHost); // pass
//hipMemcpyAsync(gpuTransposeMatrix, gpuTransposeMatrix, NUM * sizeof(float),
// hipMemcpyDeviceToDevice, 0); // pass
//hipMemcpyAsync(gpuTransposeMatrix, TransposeMatrix, NUM * sizeof(float),
// hipMemcpyHostToDevice, 0); // pass

int main() {
hipSetDevice(0);

float *data[2], *TransposeMatrix[2], *gpuTransposeMatrix[2], *randArray;

int width = WIDTH;

randArray = (float*)malloc(NUM * sizeof(float));

TransposeMatrix[0] = (float*)calloc(NUM , sizeof(float));
TransposeMatrix[1] = (float*)calloc(NUM , sizeof(float));

hipMalloc((void**)&gpuTransposeMatrix[0], NUM * sizeof(float));
hipMalloc((void**)&gpuTransposeMatrix[1], NUM * sizeof(float));

for (int i = 0; i < NUM; i++) {
randArray[i] = (float)i * 1.0f;
}

MultipleStream(data, randArray, gpuTransposeMatrix, TransposeMatrix, width);

float *TransposeMatrix, *gpuTransposeMatrix, *randArray;
TransposeMatrix = (float*)calloc(NUM , sizeof(float));
hipMalloc((void**)&gpuTransposeMatrix, NUM * sizeof(float));

hipMemcpyAsync(TransposeMatrix, gpuTransposeMatrix, NUM * sizeof(float),
hipMemcpyDeviceToHost, 0); // fail
hipStreamAddCallback(0, TestCallback, nullptr, 0);
std::cout << "Callback enqueue done\n";
hipDeviceSynchronize();
std::cout << "hipDeviceSync done\n";

/*
// Spin on stream counter to wait for the termination of event callbacks
int spinVal = 0;
do {
GlobalMtx.lock();
spinVal = StreamCount;
GlobalMtx.unlock();
} while (spinVal < 2);
*/

// verify the results
int errors = 0;
float eps = 1.0E-6;
for (int i = 0; i < NUM; i++) {
if (std::fabs(TransposeMatrix[0][i] - TransposeMatrix[1][i]) > eps) {
printf("%d stream0: %f stream1 %f\n", i, TransposeMatrix[0][i], TransposeMatrix[1][i]);
errors++;
}
}
if (errors != 0) {
printf("FAILED: %d errors\n", errors);
} else {
printf("stream PASSED!\n");
}

free(randArray);
for (int i = 0; i < 2; i++) {
hipFree(data[i]);
hipFree(gpuTransposeMatrix[i]);
free(TransposeMatrix[i]);
}

hipDeviceReset();
return 0;
return 0;
}
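
For orientation, the rewritten sample boils down to this scenario: an asynchronous device-to-host copy followed by a host callback, both on the default stream, with the reported hang occurring in hipDeviceSynchronize() once NUM exceeds 256. The sketch below is a minimal reconstruction of that flow, not the exact file in the PR; the names Callback, Host and Dev are illustrative.

#include <cstdio>
#include <cstdlib>
#include "hip/hip_runtime.h"

#define NUM 257 // 256 is reported to pass, 257 to hang

// Host-side callback invoked by the runtime after prior work on the stream completes.
static void Callback(hipStream_t /*stream*/, hipError_t status, void * /*userData*/) {
  std::printf("callback fired, status = %d\n", static_cast<int>(status));
}

int main() {
  float *Host = static_cast<float *>(calloc(NUM, sizeof(float)));
  float *Dev = nullptr;
  (void)hipMalloc(reinterpret_cast<void **>(&Dev), NUM * sizeof(float));

  // Async device-to-host copy and a callback, both enqueued on the default stream.
  (void)hipMemcpyAsync(Host, Dev, NUM * sizeof(float), hipMemcpyDeviceToHost, 0);
  (void)hipStreamAddCallback(0, Callback, nullptr, 0);

  // The hang described in the PR is observed while draining the default stream here.
  (void)hipDeviceSynchronize();

  (void)hipFree(Dev);
  free(Host);
  return 0;
}
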
23 changes: 9 additions & 14 deletions src/backend/Level0/CHIPBackendLevel0.cc
@@ -1153,27 +1153,22 @@ CHIPQueueLevel0::launchImpl(chipstar::ExecItem *ExecItem) {
auto Y = ExecItem->getGrid().y;
auto Z = ExecItem->getGrid().z;
ze_group_count_t LaunchArgs = {X, Y, Z};
// if using immediate command lists, lock the mutex
LOCK(CommandListMtx); // TODO this is probably not needed when using RCL
LOCK(CommandListMtx);
auto CommandList = this->getCmdListImm();

// Do we need to annotate indirect buffer accesses?
auto *LzDev = static_cast<CHIPDeviceLevel0 *>(getDevice());
if (!LzDev->hasOnDemandPaging()) {
// The baseline answer is yes (unless we would know that the
// kernel won't access buffers indirectly).
auto Status = zeKernelSetIndirectAccess(
KernelZe, ZE_KERNEL_INDIRECT_ACCESS_FLAG_DEVICE |
ZE_KERNEL_INDIRECT_ACCESS_FLAG_HOST);
CHIPERR_CHECK_LOG_AND_THROW(Status, ZE_RESULT_SUCCESS,
hipErrorInitializationError);
}
// annotate the kernel for indirect access to handle the case where the kernel
// argument is SoA
auto Status = zeKernelSetIndirectAccess(
KernelZe, ZE_KERNEL_INDIRECT_ACCESS_FLAG_DEVICE |
ZE_KERNEL_INDIRECT_ACCESS_FLAG_HOST);
CHIPERR_CHECK_LOG_AND_THROW(Status, ZE_RESULT_SUCCESS,
hipErrorInitializationError);

// This function may not be called from simultaneous threads with the same
// command list handle.
// Done via LOCK(CommandListMtx)
auto [EventHandles, EventLocks] = addDependenciesQueueSync(LaunchEvent);
auto Status = zeCommandListAppendLaunchKernel(
Status = zeCommandListAppendLaunchKernel(
CommandList, KernelZe, &LaunchArgs,
std::static_pointer_cast<CHIPEventLevel0>(LaunchEvent)->peek(),
EventHandles.size(), EventHandles.data());
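
The dropped hasOnDemandPaging() branch means indirect access is now flagged unconditionally. The "SoA" case the new comment refers to is presumably a kernel whose argument embeds device pointers inside an aggregate, so the pointers are not visible to Level Zero as direct kernel arguments and zeKernelSetIndirectAccess must be called before the launch. A hypothetical HIP-side illustration (the SoA struct and axpy kernel are invented for this sketch):

// An argument that carries device pointers inside a struct passed by value.
struct SoA {
  float *X; // device allocation reachable only through the struct
  float *Y;
};

// The kernel dereferences pointers that were never passed as top-level
// arguments, which is exactly the indirect-access pattern being annotated.
__global__ void axpy(SoA Data, float A, int N) {
  int I = hipBlockDim_x * hipBlockIdx_x + hipThreadIdx_x;
  if (I < N)
    Data.Y[I] += A * Data.X[I];
}
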
20 changes: 19 additions & 1 deletion tests/known_failures.yaml
@@ -1,6 +1,7 @@
TOTAL_TESTS: 1397
ANY:
ALL:
LevelZeroStream: 'Hangs on event query when multiple event pools are used'
# Invalid test (if it is the one from HIP/ submodule instead of hip-tests/).
# The source allocation 'Ah' is not initialized (this is fixed in hip-tests/)
# and input is therefore random. Because of this the test is known to fail
@@ -471,4 +472,21 @@ cupcake:
LEVEL0_GPU:
OPENCL_CPU:
OPENCL_GPU:
OPENCL_POCL:
OPENCL_POCL:
x1921.*b0n0: # Sunspot compute
ALL:
LEVEL0_GPU:
OpenCLCallbacks: ''
OpenCLCallbacks-buffer: ''
Unit_hipStreamAddCallback_WithDefaultStream: ''
Unit_hipStreamAddCallback_WithCreatedStream: ''
stream: ''
hipAddCallback: ''
cuda-bandwidthTest: ''
hip_async_binomial: ''
sycl_chip_interop: ''
sycl_chip_interop_usm: ''
clock: ''
OPENCL_CPU:
OPENCL_GPU:
OPENCL_POCL:
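
Reading the diff, the new top-level key x1921.*b0n0 appears to be a hostname pattern matching Sunspot compute nodes; beneath it, each backend section (ALL, LEVEL0_GPU, OPENCL_CPU, OPENCL_GPU, OPENCL_POCL) maps a test name to a free-form reason string, and the reasons are left empty here while the failures are still being isolated.
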
8 changes: 8 additions & 0 deletions tests/runtime/CMakeLists.txt
@@ -55,6 +55,7 @@ function(add_shell_test SCRIPT)
COMMAND /bin/bash ${CMAKE_CURRENT_BINARY_DIR}/${SCRIPT})
endfunction()

file(COPY ${CMAKE_SOURCE_DIR}/tests/runtime/inputs DESTINATION ${CMAKE_BINARY_DIR}/tests/runtime)
# TODO - this should check if immediate cmd lists are being used
add_hip_runtime_test(TestRecordEventBlocking.cpp)
# This test checks if zeEventQuery is a blocking operation
@@ -108,3 +109,10 @@ add_hip_runtime_test(TestPositiveHasNoIGBAs.hip)

add_hip_runtime_test(CatchMemLeak1.hip)
add_hip_runtime_test(TestBufferDevAddr.hip)

add_hip_runtime_test(OpenCLCallbacks.cpp)
add_hip_runtime_test(OpenCLCallbacks-buffer.cpp)
add_hip_runtime_test(LevelZeroCallbacks.cpp)
add_hip_runtime_test(LevelZeroStream.cpp)
add_hip_runtime_test(streamHang.cpp)
add_hip_runtime_test(LevelZeroFirstTouch.cpp)