From 7fd9ae518407d3a364236849f0fb430269627397 Mon Sep 17 00:00:00 2001
From: Allard Hendriksen
Date: Fri, 20 Dec 2024 22:41:18 +0100
Subject: [PATCH] cp_async_bulk: Fix test (#3198)

* memcpy_async_tx: Fix bug in test

Two bugs, one of which occurs in practice:

1. There is a missing fence.proxy.space::global between the writes to
   global memory and the memcpy_async_tx. (Occurs in practice)

2. The end of the kernel should be fenced with `__syncthreads()`,
   because the barrier is invalidated in the destructor. If other
   threads are still waiting on it, there will be UB. (Has not yet
   manifested itself)

* cp_async_bulk_tensor: Pre-emptively fence more in test

* cp_async_bulk: Fix test

The global memory pointer could be misaligned.
---
 libcudacxx/test/libcudacxx/cuda/barrier/cp_async_bulk.pass.cpp | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/libcudacxx/test/libcudacxx/cuda/barrier/cp_async_bulk.pass.cpp b/libcudacxx/test/libcudacxx/cuda/barrier/cp_async_bulk.pass.cpp
index d5d54876a7f..5f26c80602b 100644
--- a/libcudacxx/test/libcudacxx/cuda/barrier/cp_async_bulk.pass.cpp
+++ b/libcudacxx/test/libcudacxx/cuda/barrier/cp_async_bulk.pass.cpp
@@ -26,7 +26,7 @@ using barrier = cuda::barrier<cuda::thread_scope_block>;
 namespace cde = cuda::device::experimental;
 
 static constexpr int buf_len = 1024;
-__device__ int gmem_buffer[buf_len];
+__device__ alignas(128) int gmem_buffer[buf_len];
 
 __device__ void test()
 {