From 6adaca3d2fd229185ffe3f8233d27e83aa0bb1fc Mon Sep 17 00:00:00 2001
From: Allard Hendriksen <ahendriksen@nvidia.com>
Date: Thu, 19 Dec 2024 12:12:20 +0100
Subject: [PATCH] Fix memcpy_async* tests (#3197)

* memcpy_async_tx: Fix bug in test

Two bugs, one of which occurs in practice:

1. There is a missing fence.proxy.space::global between the writes to
   global memory and the memcpy_async_tx. (Occurs in practice)

2. The end of the kernel should be fenced with `__syncthreads()`,
   because the barrier is invalidated in the destructor. If other
   threads are still waiting on it, there will be UB. (Has not yet
   manifested itself)

* cp_async_bulk_tensor: Pre-emptively fence more in test
---
 .../cuda/barrier/cp_async_bulk_tensor_generic.h       | 11 +++++++++++
 .../cuda/memcpy_async/memcpy_async_tx.pass.cpp        | 10 +++++++++-
 2 files changed, 20 insertions(+), 1 deletion(-)

diff --git a/libcudacxx/test/libcudacxx/cuda/barrier/cp_async_bulk_tensor_generic.h b/libcudacxx/test/libcudacxx/cuda/barrier/cp_async_bulk_tensor_generic.h
index e6cab7bfa77..84495195e80 100644
--- a/libcudacxx/test/libcudacxx/cuda/barrier/cp_async_bulk_tensor_generic.h
+++ b/libcudacxx/test/libcudacxx/cuda/barrier/cp_async_bulk_tensor_generic.h
@@ -14,9 +14,12 @@
 #define TEST_CP_ASYNC_BULK_TENSOR_GENERIC_H_
 
 #include <cuda/barrier>
+#include <cuda/ptx>
 #include <cuda/std/array>
 #include <cuda/std/utility> // cuda::std::move
 
+namespace ptx = cuda::ptx;
+
 #include "test_macros.h" // TEST_NV_DIAG_SUPPRESS
 
 // NVRTC does not support cuda.h (due to import of stdlib.h)
@@ -173,7 +176,11 @@ test(cuda::std::array<uint32_t, num_dims> smem_coord,
   }
   // Ensure that writes to global memory are visible to others, including
   // those in the async proxy.
+  // ahendriksen: Issuing threadfence and fence.proxy.async.global. The
+  // fence.proxy.async.global should suffice, but I am keeping the threadfence
+  // out of an abundance of caution.
   __threadfence();
+  ptx::fence_proxy_async(ptx::space_global);
   __syncthreads();
 
   // TEST: Add i to buffer[i]
@@ -223,7 +230,11 @@ test(cuda::std::array<uint32_t, num_dims> smem_coord,
     cde::cp_async_bulk_commit_group();
     cde::cp_async_bulk_wait_group_read<0>();
   }
+  // ahendriksen: Issuing threadfence and fence.proxy.async.global. The
+  // fence.proxy.async.global should suffice, but I am keeping the threadfence
+  // out of an abundance of caution.
   __threadfence();
+  ptx::fence_proxy_async(ptx::space_global);
   __syncthreads();
 
   // // TEAR-DOWN: check that global memory is correct
diff --git a/libcudacxx/test/libcudacxx/cuda/memcpy_async/memcpy_async_tx.pass.cpp b/libcudacxx/test/libcudacxx/cuda/memcpy_async/memcpy_async_tx.pass.cpp
index 324d22bf440..f370559cfb7 100644
--- a/libcudacxx/test/libcudacxx/cuda/memcpy_async/memcpy_async_tx.pass.cpp
+++ b/libcudacxx/test/libcudacxx/cuda/memcpy_async/memcpy_async_tx.pass.cpp
@@ -15,8 +15,11 @@
 // <cuda/barrier>
 
 #include <cuda/barrier>
+#include <cuda/ptx>
 #include <cuda/std/utility> // cuda::std::move
 
+namespace ptx = cuda::ptx;
+
 #include "cuda_space_selector.h" // shared_memory_selector
 #include "test_macros.h" // TEST_NV_DIAG_SUPPRESS
 
@@ -47,6 +50,9 @@ int main(int, char**)
       // Initialize gmem_x
       for (int i = threadIdx.x; i < 2048; i += blockDim.x) { gmem_x[i] = i; } __syncthreads();
 
+      // The writes to global memory by threads in block have to be made visible to the async proxy.
+      ptx::fence_proxy_async(ptx::space_global);
+
       barrier_t::arrival_token token;
       if (threadIdx.x == 0) {
         auto fulfillment = cuda::device::memcpy_async_tx(smem_x, gmem_x, cuda::aligned_size_t<16>(sizeof(smem_x)), *b);
@@ -55,6 +61,8 @@ int main(int, char**)
       } else { token = b->arrive(1); } b->wait(cuda::std::move(token));
 
       // assert that smem_x contains the contents of gmem_x[0], ..., gmem_x[1023]
-      for (int i = threadIdx.x; i < 1024; i += blockDim.x) { assert(smem_x[i] == i); }));
+      for (int i = threadIdx.x; i < 1024; i += blockDim.x) { assert(smem_x[i] == i); }
+      // syncthreads() here, otherwise barrier may be destroyed before all threads have finished waiting.
+      __syncthreads();));
   return 0;
 }