Fix memcpy_async* tests (#3197)

* memcpy_async_tx: Fix bug in test Two bugs, one of which occurs in practice: 1. There is a missing fence.proxy.space::global between the writes to global memory and the memcpy_async_tx. (Occurs in practice) 2. The end of the kernel should be fenced with `__syncthreads()`, because the barrier is invalidated in the destructor. If other threads are still waiting on it, there will be UB. (Has not yet manifested itself) * cp_async_bulk_tensor: Pre-emptively fence more in test
NVIDIA · Dec 19, 2024 · 6adaca3 · 6adaca3
1 parent c0793cf
commit 6adaca3
Show file tree

Hide file tree

Showing 2 changed files with 20 additions and 1 deletion.
diff --git a/libcudacxx/test/libcudacxx/cuda/barrier/cp_async_bulk_tensor_generic.h b/libcudacxx/test/libcudacxx/cuda/barrier/cp_async_bulk_tensor_generic.h
@@ -14,9 +14,12 @@
 #define TEST_CP_ASYNC_BULK_TENSOR_GENERIC_H_
 
 #include <cuda/barrier>
+#include <cuda/ptx>
 #include <cuda/std/array>
 #include <cuda/std/utility> // cuda::std::move
 
+namespace ptx = cuda::ptx;
+
 #include "test_macros.h" // TEST_NV_DIAG_SUPPRESS
 
 // NVRTC does not support cuda.h (due to import of stdlib.h)
@@ -173,7 +176,11 @@ test(cuda::std::array<uint32_t, num_dims> smem_coord,
   }
   // Ensure that writes to global memory are visible to others, including
   // those in the async proxy.
+  // ahendriksen: Issuing threadfence and fence.proxy.async.global. The
+  // fence.proxy.async.global should suffice, but I am keeping the threadfence
+  // out of an abundance of caution.
   __threadfence();
+  ptx::fence_proxy_async(ptx::space_global);
   __syncthreads();
 
   // TEST: Add i to buffer[i]
@@ -223,7 +230,11 @@ test(cuda::std::array<uint32_t, num_dims> smem_coord,
     cde::cp_async_bulk_commit_group();
     cde::cp_async_bulk_wait_group_read<0>();
   }
+  // ahendriksen: Issuing threadfence and fence.proxy.async.global. The
+  // fence.proxy.async.global should suffice, but I am keeping the threadfence
+  // out of an abundance of caution.
   __threadfence();
+  ptx::fence_proxy_async(ptx::space_global);
   __syncthreads();
 
   // // TEAR-DOWN: check that global memory is correct

diff --git a/libcudacxx/test/libcudacxx/cuda/memcpy_async/memcpy_async_tx.pass.cpp b/libcudacxx/test/libcudacxx/cuda/memcpy_async/memcpy_async_tx.pass.cpp
@@ -15,8 +15,11 @@
 // <cuda/barrier>
 
 #include <cuda/barrier>
+#include <cuda/ptx>
 #include <cuda/std/utility> // cuda::std::move
 
+namespace ptx = cuda::ptx;
+
 #include "cuda_space_selector.h" // shared_memory_selector
 #include "test_macros.h" // TEST_NV_DIAG_SUPPRESS
 
@@ -47,6 +50,9 @@ int main(int, char**)
       // Initialize gmem_x
       for (int i = threadIdx.x; i < 2048; i += blockDim.x) { gmem_x[i] = i; } __syncthreads();
 
+      // The writes to global memory by threads in block have to be made visible to the async proxy.
+      ptx::fence_proxy_async(ptx::space_global);
+
       barrier_t::arrival_token token;
       if (threadIdx.x == 0) {
         auto fulfillment = cuda::device::memcpy_async_tx(smem_x, gmem_x, cuda::aligned_size_t<16>(sizeof(smem_x)), *b);
@@ -55,6 +61,8 @@ int main(int, char**)
       } else { token = b->arrive(1); } b->wait(cuda::std::move(token));
 
       // assert that smem_x contains the contents of gmem_x[0], ..., gmem_x[1023]
-      for (int i = threadIdx.x; i < 1024; i += blockDim.x) { assert(smem_x[i] == i); }));
+      for (int i = threadIdx.x; i < 1024; i += blockDim.x) { assert(smem_x[i] == i); }
+      // syncthreads() here, otherwise barrier may be destroyed before all threads have finished waiting.
+      __syncthreads();));
   return 0;
 }