From 6adaca3d2fd229185ffe3f8233d27e83aa0bb1fc Mon Sep 17 00:00:00 2001 From: Allard Hendriksen Date: Thu, 19 Dec 2024 12:12:20 +0100 Subject: [PATCH] Fix memcpy_async* tests (#3197) * memcpy_async_tx: Fix bug in test Two bugs, one of which occurs in practice: 1. There is a missing fence.proxy.space::global between the writes to global memory and the memcpy_async_tx. (Occurs in practice) 2. The end of the kernel should be fenced with `__syncthreads()`, because the barrier is invalidated in the destructor. If other threads are still waiting on it, there will be UB. (Has not yet manifested itself) * cp_async_bulk_tensor: Pre-emptively fence more in test --- .../cuda/barrier/cp_async_bulk_tensor_generic.h | 11 +++++++++++ .../cuda/memcpy_async/memcpy_async_tx.pass.cpp | 10 +++++++++- 2 files changed, 20 insertions(+), 1 deletion(-) diff --git a/libcudacxx/test/libcudacxx/cuda/barrier/cp_async_bulk_tensor_generic.h b/libcudacxx/test/libcudacxx/cuda/barrier/cp_async_bulk_tensor_generic.h index e6cab7bfa77..84495195e80 100644 --- a/libcudacxx/test/libcudacxx/cuda/barrier/cp_async_bulk_tensor_generic.h +++ b/libcudacxx/test/libcudacxx/cuda/barrier/cp_async_bulk_tensor_generic.h @@ -14,9 +14,12 @@ #define TEST_CP_ASYNC_BULK_TENSOR_GENERIC_H_ #include +#include #include #include // cuda::std::move +namespace ptx = cuda::ptx; + #include "test_macros.h" // TEST_NV_DIAG_SUPPRESS // NVRTC does not support cuda.h (due to import of stdlib.h) @@ -173,7 +176,11 @@ test(cuda::std::array smem_coord, } // Ensure that writes to global memory are visible to others, including // those in the async proxy. + // ahendriksen: Issuing threadfence and fence.proxy.async.global. The + // fence.proxy.async.global should suffice, but I am keeping the threadfence + // out of an abundance of caution. __threadfence(); + ptx::fence_proxy_async(ptx::space_global); __syncthreads(); // TEST: Add i to buffer[i] @@ -223,7 +230,11 @@ test(cuda::std::array smem_coord, cde::cp_async_bulk_commit_group(); cde::cp_async_bulk_wait_group_read<0>(); } + // ahendriksen: Issuing threadfence and fence.proxy.async.global. The + // fence.proxy.async.global should suffice, but I am keeping the threadfence + // out of an abundance of caution. __threadfence(); + ptx::fence_proxy_async(ptx::space_global); __syncthreads(); // // TEAR-DOWN: check that global memory is correct diff --git a/libcudacxx/test/libcudacxx/cuda/memcpy_async/memcpy_async_tx.pass.cpp b/libcudacxx/test/libcudacxx/cuda/memcpy_async/memcpy_async_tx.pass.cpp index 324d22bf440..f370559cfb7 100644 --- a/libcudacxx/test/libcudacxx/cuda/memcpy_async/memcpy_async_tx.pass.cpp +++ b/libcudacxx/test/libcudacxx/cuda/memcpy_async/memcpy_async_tx.pass.cpp @@ -15,8 +15,11 @@ // #include +#include #include // cuda::std::move +namespace ptx = cuda::ptx; + #include "cuda_space_selector.h" // shared_memory_selector #include "test_macros.h" // TEST_NV_DIAG_SUPPRESS @@ -47,6 +50,9 @@ int main(int, char**) // Initialize gmem_x for (int i = threadIdx.x; i < 2048; i += blockDim.x) { gmem_x[i] = i; } __syncthreads(); + // The writes to global memory by threads in block have to be made visible to the async proxy. + ptx::fence_proxy_async(ptx::space_global); + barrier_t::arrival_token token; if (threadIdx.x == 0) { auto fulfillment = cuda::device::memcpy_async_tx(smem_x, gmem_x, cuda::aligned_size_t<16>(sizeof(smem_x)), *b); @@ -55,6 +61,8 @@ int main(int, char**) } else { token = b->arrive(1); } b->wait(cuda::std::move(token)); // assert that smem_x contains the contents of gmem_x[0], ..., gmem_x[1023] - for (int i = threadIdx.x; i < 1024; i += blockDim.x) { assert(smem_x[i] == i); })); + for (int i = threadIdx.x; i < 1024; i += blockDim.x) { assert(smem_x[i] == i); } + // syncthreads() here, otherwise barrier may be destroyed before all threads have finished waiting. + __syncthreads();)); return 0; }