Skip to content

Commit

Permalink
Fix memcpy_async* tests (#3197)
Browse files Browse the repository at this point in the history
* memcpy_async_tx: Fix bug in test

Two bugs, one of which occurs in practice:

1. There is a missing fence.proxy.space::global between the writes to
   global memory and the memcpy_async_tx. (Occurs in practice)

2. The end of the kernel should be fenced with `__syncthreads()`,
   because the barrier is invalidated in the destructor. If other
   threads are still waiting on it, there will be UB. (Has not yet
   manifested itself)

* cp_async_bulk_tensor: Pre-emptively fence more in test
  • Loading branch information
ahendriksen authored Dec 19, 2024
1 parent c0793cf commit 6adaca3
Show file tree
Hide file tree
Showing 2 changed files with 20 additions and 1 deletion.
Original file line number Diff line number Diff line change
Expand Up @@ -14,9 +14,12 @@
#define TEST_CP_ASYNC_BULK_TENSOR_GENERIC_H_

#include <cuda/barrier>
#include <cuda/ptx>
#include <cuda/std/array>
#include <cuda/std/utility> // cuda::std::move

namespace ptx = cuda::ptx;

#include "test_macros.h" // TEST_NV_DIAG_SUPPRESS

// NVRTC does not support cuda.h (due to import of stdlib.h)
Expand Down Expand Up @@ -173,7 +176,11 @@ test(cuda::std::array<uint32_t, num_dims> smem_coord,
}
// Ensure that writes to global memory are visible to others, including
// those in the async proxy.
// ahendriksen: Issuing threadfence and fence.proxy.async.global. The
// fence.proxy.async.global should suffice, but I am keeping the threadfence
// out of an abundance of caution.
__threadfence();
ptx::fence_proxy_async(ptx::space_global);
__syncthreads();

// TEST: Add i to buffer[i]
Expand Down Expand Up @@ -223,7 +230,11 @@ test(cuda::std::array<uint32_t, num_dims> smem_coord,
cde::cp_async_bulk_commit_group();
cde::cp_async_bulk_wait_group_read<0>();
}
// ahendriksen: Issuing threadfence and fence.proxy.async.global. The
// fence.proxy.async.global should suffice, but I am keeping the threadfence
// out of an abundance of caution.
__threadfence();
ptx::fence_proxy_async(ptx::space_global);
__syncthreads();

// // TEAR-DOWN: check that global memory is correct
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -15,8 +15,11 @@
// <cuda/barrier>

#include <cuda/barrier>
#include <cuda/ptx>
#include <cuda/std/utility> // cuda::std::move

namespace ptx = cuda::ptx;

#include "cuda_space_selector.h" // shared_memory_selector
#include "test_macros.h" // TEST_NV_DIAG_SUPPRESS

Expand Down Expand Up @@ -47,6 +50,9 @@ int main(int, char**)
// Initialize gmem_x
for (int i = threadIdx.x; i < 2048; i += blockDim.x) { gmem_x[i] = i; } __syncthreads();

// The writes to global memory by threads in block have to be made visible to the async proxy.
ptx::fence_proxy_async(ptx::space_global);

barrier_t::arrival_token token;
if (threadIdx.x == 0) {
auto fulfillment = cuda::device::memcpy_async_tx(smem_x, gmem_x, cuda::aligned_size_t<16>(sizeof(smem_x)), *b);
Expand All @@ -55,6 +61,8 @@ int main(int, char**)
} else { token = b->arrive(1); } b->wait(cuda::std::move(token));

// assert that smem_x contains the contents of gmem_x[0], ..., gmem_x[1023]
for (int i = threadIdx.x; i < 1024; i += blockDim.x) { assert(smem_x[i] == i); }));
for (int i = threadIdx.x; i < 1024; i += blockDim.x) { assert(smem_x[i] == i); }
// syncthreads() here, otherwise barrier may be destroyed before all threads have finished waiting.
__syncthreads();));
return 0;
}

0 comments on commit 6adaca3

Please sign in to comment.