From f8e6c5457a5a760200d022fc2708541508481c55 Mon Sep 17 00:00:00 2001 From: Michael Schellenberger Costa Date: Thu, 23 Jan 2025 21:11:34 +0100 Subject: [PATCH] Fix issue with conversion between `mdspan` and `mdspan` (#3469) * Fix issue with conversion between `mdspan` and `mdspan` This worked because we werent properly enforcing that the accessor of a mdspan has the right element type. * Be safe and use `add_const` --- cudax/examples/stf/04-fibonacci-run_once.cu | 2 +- cudax/examples/stf/04-fibonacci.cu | 2 +- cudax/examples/stf/fdtd_mgpu.cu | 2 +- cudax/examples/stf/heat.cu | 2 +- cudax/examples/stf/heat_mgpu.cu | 2 +- cudax/examples/stf/parallel_for_2D.cu | 29 +++++----- cudax/examples/stf/standalone-launches.cu | 4 +- .../cuda/experimental/__algorithm/copy.cuh | 16 +----- .../experimental/__hierarchy/dimensions.cuh | 1 + .../experimental/__stf/graph/graph_ctx.cuh | 4 +- .../__stf/internal/data_interface.cuh | 53 +++++++++++++------ cudax/include/cuda/experimental/stf.cuh | 8 +-- cudax/test/stf/examples/09-nbody-blocked.cu | 6 +-- .../stf/interface/data_from_device_async.cu | 2 +- .../stf/interface/graph_use_device_data.cu | 2 +- cudax/test/stf/local_stf/interop_cuda.cu | 2 +- cudax/test/stf/local_stf/legacy_to_stf.cu | 2 +- cudax/test/stf/parallel_for/fdtd.cu | 2 +- .../test2_parallel_for_context.cu | 31 +++++------ .../stf/parallel_for/test_parallel_for.cu | 8 +-- cudax/test/stf/parallel_for/tiled_loops.cu | 2 +- cudax/test/stf/stencil/stencil-1D.cu | 4 +- cudax/test/stf/stress/kernel_chain.cu | 10 ++-- 23 files changed, 104 insertions(+), 92 deletions(-) diff --git a/cudax/examples/stf/04-fibonacci-run_once.cu b/cudax/examples/stf/04-fibonacci-run_once.cu index f4975f46b8d..61b6cc3620e 100644 --- a/cudax/examples/stf/04-fibonacci-run_once.cu +++ b/cudax/examples/stf/04-fibonacci-run_once.cu @@ -31,7 +31,7 @@ int fibo_ref(int n) } } -__global__ void add(slice out, const slice in1, const slice in2) +__global__ void add(slice out, const slice in1, const slice in2) { out(0) = in1(0) + in2(0); } diff --git a/cudax/examples/stf/04-fibonacci.cu b/cudax/examples/stf/04-fibonacci.cu index c9c2af840b4..c42a3842dd4 100644 --- a/cudax/examples/stf/04-fibonacci.cu +++ b/cudax/examples/stf/04-fibonacci.cu @@ -31,7 +31,7 @@ int fibo_ref(int n) } } -__global__ void add(slice out, const slice in1, const slice in2) +__global__ void add(slice out, const slice in1, const slice in2) { out(0) = in1(0) + in2(0); } diff --git a/cudax/examples/stf/fdtd_mgpu.cu b/cudax/examples/stf/fdtd_mgpu.cu index d991c97f258..c10da77045e 100644 --- a/cudax/examples/stf/fdtd_mgpu.cu +++ b/cudax/examples/stf/fdtd_mgpu.cu @@ -22,7 +22,7 @@ using namespace cuda::experimental::stf; // FIXME : MSVC has trouble with box constructors #if !_CCCL_COMPILER(MSVC) -void write_vtk_2D(const std::string& filename, slice Ez, double dx, double dy, double /*unused*/) +void write_vtk_2D(const std::string& filename, slice Ez, double dx, double dy, double /*unused*/) { FILE* f = fopen(filename.c_str(), "w"); diff --git a/cudax/examples/stf/heat.cu b/cudax/examples/stf/heat.cu index 2af2cda6422..b69dfc4ce4f 100644 --- a/cudax/examples/stf/heat.cu +++ b/cudax/examples/stf/heat.cu @@ -23,7 +23,7 @@ using namespace cuda::experimental::stf; -void dump_iter(slice sUn, int iter) +void dump_iter(slice sUn, int iter) { /* Create a binary file in the PPM format */ char name[64]; diff --git a/cudax/examples/stf/heat_mgpu.cu b/cudax/examples/stf/heat_mgpu.cu index a4befc2a05b..1fc2ade2e03 100644 --- a/cudax/examples/stf/heat_mgpu.cu +++ b/cudax/examples/stf/heat_mgpu.cu @@ -20,7 +20,7 @@ using namespace cuda::experimental::stf; -void dump_iter(slice sUn, int iter) +void dump_iter(slice sUn, int iter) { /* Create a binary file in the PPM format */ char name[64]; diff --git a/cudax/examples/stf/parallel_for_2D.cu b/cudax/examples/stf/parallel_for_2D.cu index 42cc2ccead0..ad82b867a38 100644 --- a/cudax/examples/stf/parallel_for_2D.cu +++ b/cudax/examples/stf/parallel_for_2D.cu @@ -53,21 +53,22 @@ int main() } }; - ctx.parallel_for(exec_place::host, ly.shape(), ly.read())->*[=] __host__(size_t i, size_t j, slice sy) { - double expected = y0(i, j); - for (size_t ii = 0; ii < 2; ii++) - { - for (size_t jj = 0; jj < 2; jj++) - { - expected += x0(2 * i + ii, 2 * j + jj); - } - } + ctx.parallel_for(exec_place::host, ly.shape(), ly.read()) + ->*[=] __host__(size_t i, size_t j, slice sy) { + double expected = y0(i, j); + for (size_t ii = 0; ii < 2; ii++) + { + for (size_t jj = 0; jj < 2; jj++) + { + expected += x0(2 * i + ii, 2 * j + jj); + } + } - if (fabs(sy(i, j) - expected) > 0.001) - { - printf("sy(%zu, %zu) %f expect %f\n", i, j, sy(i, j), expected); - } - }; + if (fabs(sy(i, j) - expected) > 0.001) + { + printf("sy(%zu, %zu) %f expect %f\n", i, j, sy(i, j), expected); + } + }; ctx.finalize(); } diff --git a/cudax/examples/stf/standalone-launches.cu b/cudax/examples/stf/standalone-launches.cu index 7717ab3925c..83ad6fc291b 100644 --- a/cudax/examples/stf/standalone-launches.cu +++ b/cudax/examples/stf/standalone-launches.cu @@ -42,12 +42,12 @@ int main() auto handle_Z = ctx.logical_data(Z, {N}); ctx.task(handle_X.read(), handle_Y.write(), handle_Z.write()) - ->*[](cudaStream_t s, slice x, slice y, slice z) { + ->*[](cudaStream_t s, slice x, slice y, slice z) { std::vector streams; streams.push_back(s); auto spec = par(1024); reserved::launch(spec, exec_place::current_device(), streams, std::tuple{x, y}) - ->*[] _CCCL_DEVICE(auto t, slice x, slice y) { + ->*[] _CCCL_DEVICE(auto t, slice x, slice y) { size_t tid = t.rank(); size_t nthreads = t.size(); for (size_t ind = tid; ind < N; ind += nthreads) diff --git a/cudax/include/cuda/experimental/__algorithm/copy.cuh b/cudax/include/cuda/experimental/__algorithm/copy.cuh index e2c7c73d51a..d084adfc5da 100644 --- a/cudax/include/cuda/experimental/__algorithm/copy.cuh +++ b/cudax/include/cuda/experimental/__algorithm/copy.cuh @@ -75,20 +75,6 @@ void copy_bytes(stream_ref __stream, _SrcTy&& __src, _DstTy&& __dst) detail::__launch_transform(__stream, _CUDA_VSTD::forward<_DstTy>(__dst))))); } -template -inline constexpr bool __copy_bytes_compatible_extents = false; - -template -inline constexpr bool __copy_bytes_compatible_extents<_CUDA_VSTD::extents<_IndexType, _Extents...>, - _CUDA_VSTD::extents<_OtherIndexType, _OtherExtents...>> = - decltype(_CUDA_VSTD::__detail::__check_compatible_extents( - _CUDA_VSTD::integral_constant{}, - _CUDA_VSTD::integer_sequence{}, - _CUDA_VSTD::integer_sequence{}))::value; - template _CCCL_NODISCARD bool __copy_bytes_runtime_extents_match(_SrcExtents __src_exts, _DstExtents __dst_exts) { @@ -116,7 +102,7 @@ void __nd_copy_bytes_impl(stream_ref __stream, _CUDA_VSTD::mdspan<_SrcElem, _SrcExtents, _SrcLayout, _SrcAccessor> __src, _CUDA_VSTD::mdspan<_DstElem, _DstExtents, _DstLayout, _DstAccessor> __dst) { - static_assert(__copy_bytes_compatible_extents<_SrcExtents, _DstExtents>, + static_assert(_CUDA_VSTD::is_constructible_v<_DstExtents, _SrcExtents>, "Multidimensional copy requires both source and destination extents to be compatible"); static_assert(_CUDA_VSTD::is_same_v<_SrcLayout, _DstLayout>, "Multidimensional copy requires both source and destination layouts to match"); diff --git a/cudax/include/cuda/experimental/__hierarchy/dimensions.cuh b/cudax/include/cuda/experimental/__hierarchy/dimensions.cuh index ecb0f8c6d6f..701d1bf3894 100644 --- a/cudax/include/cuda/experimental/__hierarchy/dimensions.cuh +++ b/cudax/include/cuda/experimental/__hierarchy/dimensions.cuh @@ -11,6 +11,7 @@ #ifndef _CUDAX__HIERARCHY_DIMENSIONS #define _CUDAX__HIERARCHY_DIMENSIONS +#include #include #if _CCCL_STD_VER >= 2017 diff --git a/cudax/include/cuda/experimental/__stf/graph/graph_ctx.cuh b/cudax/include/cuda/experimental/__stf/graph/graph_ctx.cuh index 5db6a7c8954..8cad2af8bc8 100644 --- a/cudax/include/cuda/experimental/__stf/graph/graph_ctx.cuh +++ b/cudax/include/cuda/experimental/__stf/graph/graph_ctx.cuh @@ -979,14 +979,14 @@ inline void unit_test_graph_epoch_3() if ((k % 2) == 0) { ctx.parallel_for(blocked_partition(), exec_place::current_device(), lA.shape(), lA.rw(), lB.read()) - ->*[] _CCCL_HOST_DEVICE(size_t i, slice A, slice B) { + ->*[] _CCCL_HOST_DEVICE(size_t i, slice A, slice B) { A(i) = cos(B(i)); }; } else { ctx.parallel_for(blocked_partition(), exec_place::current_device(), lA.shape(), lA.read(), lB.rw()) - ->*[] _CCCL_HOST_DEVICE(size_t i, slice A, slice B) { + ->*[] _CCCL_HOST_DEVICE(size_t i, slice A, slice B) { B(i) = sin(A(i)); }; } diff --git a/cudax/include/cuda/experimental/__stf/internal/data_interface.cuh b/cudax/include/cuda/experimental/__stf/internal/data_interface.cuh index c2d91313451..ce7cd47b719 100644 --- a/cudax/include/cuda/experimental/__stf/internal/data_interface.cuh +++ b/cudax/include/cuda/experimental/__stf/internal/data_interface.cuh @@ -26,6 +26,8 @@ # pragma system_header #endif // no system header +#include + #include #include #include @@ -59,32 +61,28 @@ struct readonly_type_of }; // Specialization of `readonly_type_of` for `mdspan`. -template -struct readonly_type_of> +template class Accessor> +struct readonly_type_of>> { - using type = mdspan; + using type = mdspan<::cuda::std::add_const_t, Extents, Layout, Accessor<::cuda::std::add_const_t>>; }; // Helper struct to deduce read-write types. template struct rw_type_of { - using type = ::std::remove_const_t; + using type = ::cuda::std::remove_const_t; }; // Specialization of rw_type_of for `mdspan`. -template -struct rw_type_of> +template class Accessor> +struct rw_type_of>> { - using type = mdspan; + using type = mdspan>; }; -// Specialization of rw_type_of for const mdspan. -template -struct rw_type_of> -{ - using type = mdspan; -}; +template +inline constexpr bool always_false = false; } // namespace reserved @@ -93,14 +91,39 @@ struct rw_type_of> * @tparam T Type to process */ template -using readonly_type_of = typename reserved::readonly_type_of::type; +using readonly_type_of = typename reserved::readonly_type_of<::cuda::std::remove_cvref_t>::type; /** * @brief Given a type `T`, returns the inverse of `constify`. * @tparam T Type to process */ template -using rw_type_of = typename reserved::rw_type_of::type; +using rw_type_of = typename reserved::rw_type_of<::cuda::std::remove_cvref_t>::type; + +template +rw_type_of to_rw_type_of(T t) +{ + return rw_type_of{t}; +} + +template class Accessor> +mdspan> to_rw_type_of(mdspan> md) +{ + if constexpr (_CUDA_VSTD::is_default_constructible_v>) + { + return mdspan>{const_cast(md.data_handle()), md.mapping()}; + } + else if constexpr (_CUDA_VSTD::is_constructible_v, const Accessor&>) + { + return mdspan>{ + const_cast(md.data_handle()), md.mapping(), Accessor{md.accessor()}}; + } + else + { + static_assert(reserved::always_false, "Need to implement the conversion of Accessor to Accessor"); + } + _CCCL_UNREACHABLE(); +} namespace reserved { diff --git a/cudax/include/cuda/experimental/stf.cuh b/cudax/include/cuda/experimental/stf.cuh index d76494aa9ff..267a0dbd4b1 100644 --- a/cudax/include/cuda/experimental/stf.cuh +++ b/cudax/include/cuda/experimental/stf.cuh @@ -1204,7 +1204,7 @@ UNITTEST("context task") ctx.task(la.read(), lb.write())->*[](auto s, auto a, auto b) { // no-op - cudaMemcpyAsync(&a(0), &b(0), sizeof(int), cudaMemcpyDeviceToDevice, s); + cudaMemcpyAsync(&b(0), &a(0), sizeof(int), cudaMemcpyDeviceToDevice, s); }; ctx.finalize(); @@ -1527,7 +1527,7 @@ public: // Our infrastructure currently does not like to work with // constant types for the data interface so we pretend this is // a modifiable data if necessary - return gctx.logical_data(rw_type_of(x), current_place.affine_data_place()); + return gctx.logical_data(to_rw_type_of(x), current_place.affine_data_place()); }; // Transform the tuple of instances into a tuple of logical data @@ -1688,7 +1688,7 @@ public: // Our infrastructure currently does not like to work with constant // types for the data interface so we pretend this is a modifiable // data if necessary - return gctx.logical_data(rw_type_of(x), current_place.affine_data_place()); + return gctx.logical_data(to_rw_type_of(x), current_place.affine_data_place()); }; // Transform the tuple of instances into a tuple of logical data @@ -1849,7 +1849,7 @@ public: (void) data_per_iteration; auto logify = [](auto& dest_ctx, auto x) { - return dest_ctx.logical_data(rw_type_of(x), exec_place::current_device().affine_data_place()); + return dest_ctx.logical_data(to_rw_type_of(x), exec_place::current_device().affine_data_place()); }; for (size_t i = start; i < end; i++) diff --git a/cudax/test/stf/examples/09-nbody-blocked.cu b/cudax/test/stf/examples/09-nbody-blocked.cu index eca18b2df60..35a168d8637 100644 --- a/cudax/test/stf/examples/09-nbody-blocked.cu +++ b/cudax/test/stf/examples/09-nbody-blocked.cu @@ -49,7 +49,7 @@ void writeVTKFile(context& ctx, for (size_t b = 0; b < parts.size(); b++) { - ctx.task(exec_place::host, parts[b].read())->*[&](cudaStream_t s, slice p) { + ctx.task(exec_place::host, parts[b].read())->*[&](cudaStream_t s, slice p) { cuda_safe_call(cudaStreamSynchronize(s)); for (size_t i = 0; i < p.size(); i++) { @@ -210,7 +210,7 @@ int main(int argc, char** argv) { ctx.launch(exec_place::device(b % ngpus), parts[b].read(), parts[b_other].read(), acc_parts[b].rw()) //.set_symbol("compute_acc") - ->*[=] _CCCL_DEVICE(auto t, slice p, slice p_other, slice acc) { + ->*[=] _CCCL_DEVICE(auto t, slice p, slice p_other, slice acc) { for (size_t i = t.rank(); i < p.extent(0); i += t.size()) { for (size_t j = 0; j < p_other.extent(0); j++) @@ -242,7 +242,7 @@ int main(int argc, char** argv) // Update velocity and positions ctx.launch(exec_place::device(b % ngpus), parts[b].rw(), acc_parts[b].read()) //.set_symbol("update") - ->*[=] _CCCL_DEVICE(auto t, slice p, slice acc) { + ->*[=] _CCCL_DEVICE(auto t, slice p, slice acc) { for (size_t i = t.rank(); i < p.extent(0); i += t.size()) { for (size_t k = 0; k < 3; k++) diff --git a/cudax/test/stf/interface/data_from_device_async.cu b/cudax/test/stf/interface/data_from_device_async.cu index d0f026c5c7a..d6659a305e9 100644 --- a/cudax/test/stf/interface/data_from_device_async.cu +++ b/cudax/test/stf/interface/data_from_device_async.cu @@ -13,7 +13,7 @@ using namespace cuda::experimental::stf; template -__global__ void axpy(int n, T a, T* x, T* y) +__global__ void axpy(int n, T a, const T* x, T* y) { int tid = blockIdx.x * blockDim.x + threadIdx.x; int nthreads = gridDim.x * blockDim.x; diff --git a/cudax/test/stf/interface/graph_use_device_data.cu b/cudax/test/stf/interface/graph_use_device_data.cu index 0f05003209c..facdf783c43 100644 --- a/cudax/test/stf/interface/graph_use_device_data.cu +++ b/cudax/test/stf/interface/graph_use_device_data.cu @@ -14,7 +14,7 @@ using namespace cuda::experimental::stf; template -__global__ void axpy(int N, T a, T* x, T* y) +__global__ void axpy(int N, T a, const T* x, T* y) { int tid = blockIdx.x * blockDim.x + threadIdx.x; int nthreads = gridDim.x * blockDim.x; diff --git a/cudax/test/stf/local_stf/interop_cuda.cu b/cudax/test/stf/local_stf/interop_cuda.cu index ff1f8aa687d..5291a6403be 100644 --- a/cudax/test/stf/local_stf/interop_cuda.cu +++ b/cudax/test/stf/local_stf/interop_cuda.cu @@ -18,7 +18,7 @@ using namespace cuda::experimental::stf; // B += alpha*A; -__global__ void axpy(double alpha, double* d_ptrA, double* d_ptrB, size_t N) +__global__ void axpy(double alpha, const double* d_ptrA, double* d_ptrB, size_t N) { int tid = blockIdx.x * blockDim.x + threadIdx.x; int nthreads = gridDim.x * blockDim.x; diff --git a/cudax/test/stf/local_stf/legacy_to_stf.cu b/cudax/test/stf/local_stf/legacy_to_stf.cu index 2895b6b934c..31b4bf665db 100644 --- a/cudax/test/stf/local_stf/legacy_to_stf.cu +++ b/cudax/test/stf/local_stf/legacy_to_stf.cu @@ -33,7 +33,7 @@ __global__ void initB(double* d_ptrB, size_t N) } // B += alpha*A; -__global__ void axpy(double alpha, double* d_ptrA, double* d_ptrB, size_t N) +__global__ void axpy(double alpha, const double* d_ptrA, double* d_ptrB, size_t N) { int tid = blockIdx.x * blockDim.x + threadIdx.x; int nthreads = gridDim.x * blockDim.x; diff --git a/cudax/test/stf/parallel_for/fdtd.cu b/cudax/test/stf/parallel_for/fdtd.cu index 686d613d710..bd0052b7e1b 100644 --- a/cudax/test/stf/parallel_for/fdtd.cu +++ b/cudax/test/stf/parallel_for/fdtd.cu @@ -15,7 +15,7 @@ using namespace cuda::experimental::stf; // FIXME : MSVC has trouble with box constructors #if !_CCCL_COMPILER(MSVC) -void write_vtk_2D(const std::string& filename, slice Ez, double dx, double dy, double /*unused*/) +void write_vtk_2D(const std::string& filename, slice Ez, double dx, double dy, double /*unused*/) { FILE* f = fopen(filename.c_str(), "w"); diff --git a/cudax/test/stf/parallel_for/test2_parallel_for_context.cu b/cudax/test/stf/parallel_for/test2_parallel_for_context.cu index 5ff11ffafca..c54be6e85c0 100644 --- a/cudax/test/stf/parallel_for/test2_parallel_for_context.cu +++ b/cudax/test/stf/parallel_for/test2_parallel_for_context.cu @@ -53,21 +53,22 @@ int main() } }; - ctx.parallel_for(exec_place::host, ly.shape(), ly.read())->*[=] __host__(size_t i, size_t j, slice sy) { - double expected = y0(i, j); - for (size_t ii = 0; ii < 2; ii++) - { - for (size_t jj = 0; jj < 2; jj++) - { - expected += x0(2 * i + ii, 2 * j + jj); - } - } - if (fabs(sy(i, j) - expected) > 0.001) - { - printf("sy(%zu, %zu) %f expect %f\n", i, j, sy(i, j), expected); - } - // assert(fabs(sy(i, j) - expected) < 0.001); - }; + ctx.parallel_for(exec_place::host, ly.shape(), ly.read()) + ->*[=] __host__(size_t i, size_t j, slice sy) { + double expected = y0(i, j); + for (size_t ii = 0; ii < 2; ii++) + { + for (size_t jj = 0; jj < 2; jj++) + { + expected += x0(2 * i + ii, 2 * j + jj); + } + } + if (fabs(sy(i, j) - expected) > 0.001) + { + printf("sy(%zu, %zu) %f expect %f\n", i, j, sy(i, j), expected); + } + // assert(fabs(sy(i, j) - expected) < 0.001); + }; ctx.finalize(); } diff --git a/cudax/test/stf/parallel_for/test_parallel_for.cu b/cudax/test/stf/parallel_for/test_parallel_for.cu index 1688704ff73..b2da9a33cb7 100644 --- a/cudax/test/stf/parallel_for/test_parallel_for.cu +++ b/cudax/test/stf/parallel_for/test_parallel_for.cu @@ -94,28 +94,28 @@ int main() case 3: // This works because it dynamically selects the dual function to run on the host ctx.parallel_for(exec_place::host, ly.shape(), lx.read(), ly.rw()) - ->*[=] __host__ __device__(size_t pos, slice sx, slice sy) { + ->*[=] __host__ __device__(size_t pos, slice sx, slice sy) { sy(pos) += 0.5 * (sx(2 * pos) + sx(2 * pos + 1)); }; break; case 4: // This works because it dynamically selects the dual function to run on the device ctx.parallel_for(exec_place::current_device(), ly.shape(), lx.read(), ly.rw()) - ->*[=] __host__ __device__(size_t pos, slice sx, slice sy) { + ->*[=] __host__ __device__(size_t pos, slice sx, slice sy) { sy(pos) += 0.5 * (sx(2 * pos) + sx(2 * pos + 1)); }; break; case 5: // This works because it dynamically selects the dual function to run on the current device ctx.parallel_for(ly.shape(), lx.read(), ly.rw()) - ->*[=] __host__ __device__(size_t pos, slice sx, slice sy) { + ->*[=] __host__ __device__(size_t pos, slice sx, slice sy) { sy(pos) += 0.5 * (sx(2 * pos) + sx(2 * pos + 1)); }; break; case 6: // This works because it dispatches on all devices ctx.parallel_for(blocked_partition(), exec_place::all_devices(), ly.shape(), lx.read(), ly.rw()) - ->*[=] __host__ __device__(size_t pos, slice sx, slice sy) { + ->*[=] __host__ __device__(size_t pos, slice sx, slice sy) { sy(pos) += 0.5 * (sx(2 * pos) + sx(2 * pos + 1)); }; break; diff --git a/cudax/test/stf/parallel_for/tiled_loops.cu b/cudax/test/stf/parallel_for/tiled_loops.cu index bdc2f7fa8b8..bc429e0a344 100644 --- a/cudax/test/stf/parallel_for/tiled_loops.cu +++ b/cudax/test/stf/parallel_for/tiled_loops.cu @@ -66,7 +66,7 @@ int main() bool* pchecked = &checked; /* Check the result on the host */ - ctx.parallel_for(exec_place::host, ly.shape(), ly.read())->*[=](size_t pos, slice sy) { + ctx.parallel_for(exec_place::host, ly.shape(), ly.read())->*[=](size_t pos, slice sy) { int expected = static_cast(ref_tiling(pos, tile_size, nparts)); int value = (int) sy(pos); if (expected != value) diff --git a/cudax/test/stf/stencil/stencil-1D.cu b/cudax/test/stf/stencil/stencil-1D.cu index 843a1fb13d9..a897552a63f 100644 --- a/cudax/test/stf/stencil/stencil-1D.cu +++ b/cudax/test/stf/stencil/stencil-1D.cu @@ -63,7 +63,7 @@ public: }; template -__global__ void copy_kernel(size_t cnt, T* dst, T* src) +__global__ void copy_kernel(size_t cnt, T* dst, const T* src) { for (int idx = threadIdx.x + blockIdx.x * blockDim.x; idx < cnt; idx += blockDim.x * gridDim.x) { @@ -72,7 +72,7 @@ __global__ void copy_kernel(size_t cnt, T* dst, T* src) } template -__global__ void stencil_kernel(size_t cnt, size_t ghost_size, T* array, T* array1) +__global__ void stencil_kernel(size_t cnt, size_t ghost_size, T* array, const T* array1) { for (int idx = threadIdx.x + blockIdx.x * blockDim.x; idx < cnt; idx += blockDim.x * gridDim.x) { diff --git a/cudax/test/stf/stress/kernel_chain.cu b/cudax/test/stf/stress/kernel_chain.cu index 621ac74adaf..dbbcb554fbd 100644 --- a/cudax/test/stf/stress/kernel_chain.cu +++ b/cudax/test/stf/stress/kernel_chain.cu @@ -12,7 +12,7 @@ using namespace cuda::experimental::stf; -__global__ void swap(slice dst, const slice src) +__global__ void swap_kernel(slice dst, slice src) { size_t tid = threadIdx.x + blockIdx.x * blockDim.x; size_t nthreads = blockDim.x * gridDim.x; @@ -57,11 +57,11 @@ int main(int argc, char** argv) start = std::chrono::steady_clock::now(); for (size_t iter = 0; iter < iter_cnt; iter++) { - ctx.task(lX.read(), lY.rw())->*[&](cudaStream_t s, auto dX, auto dY) { - swap<<<4, 16, 0, s>>>(dY, dX); + ctx.task(lX.rw(), lY.rw())->*[&](cudaStream_t s, auto dX, auto dY) { + swap_kernel<<<4, 16, 0, s>>>(dY, dX); }; - ctx.task(lY.read(), lX.rw())->*[&](cudaStream_t s, auto dY, auto dX) { - swap<<<4, 16, 0, s>>>(dX, dY); + ctx.task(lY.rw(), lX.rw())->*[&](cudaStream_t s, auto dY, auto dX) { + swap_kernel<<<4, 16, 0, s>>>(dX, dY); }; } stop = std::chrono::steady_clock::now();