From f8e6c5457a5a760200d022fc2708541508481c55 Mon Sep 17 00:00:00 2001
From: Michael Schellenberger Costa <miscco@nvidia.com>
Date: Thu, 23 Jan 2025 21:11:34 +0100
Subject: [PATCH] Fix issue with conversion between `mdspan<T>` and
 `mdspan<const T>` (#3469)

* Fix issue with conversion between `mdspan<T>` and `mdspan<const T>`

This worked because we werent properly enforcing that the accessor of a mdspan has the right element type.

* Be safe and use `add_const`
---
 cudax/examples/stf/04-fibonacci-run_once.cu   |  2 +-
 cudax/examples/stf/04-fibonacci.cu            |  2 +-
 cudax/examples/stf/fdtd_mgpu.cu               |  2 +-
 cudax/examples/stf/heat.cu                    |  2 +-
 cudax/examples/stf/heat_mgpu.cu               |  2 +-
 cudax/examples/stf/parallel_for_2D.cu         | 29 +++++-----
 cudax/examples/stf/standalone-launches.cu     |  4 +-
 .../cuda/experimental/__algorithm/copy.cuh    | 16 +-----
 .../experimental/__hierarchy/dimensions.cuh   |  1 +
 .../experimental/__stf/graph/graph_ctx.cuh    |  4 +-
 .../__stf/internal/data_interface.cuh         | 53 +++++++++++++------
 cudax/include/cuda/experimental/stf.cuh       |  8 +--
 cudax/test/stf/examples/09-nbody-blocked.cu   |  6 +--
 .../stf/interface/data_from_device_async.cu   |  2 +-
 .../stf/interface/graph_use_device_data.cu    |  2 +-
 cudax/test/stf/local_stf/interop_cuda.cu      |  2 +-
 cudax/test/stf/local_stf/legacy_to_stf.cu     |  2 +-
 cudax/test/stf/parallel_for/fdtd.cu           |  2 +-
 .../test2_parallel_for_context.cu             | 31 +++++------
 .../stf/parallel_for/test_parallel_for.cu     |  8 +--
 cudax/test/stf/parallel_for/tiled_loops.cu    |  2 +-
 cudax/test/stf/stencil/stencil-1D.cu          |  4 +-
 cudax/test/stf/stress/kernel_chain.cu         | 10 ++--
 23 files changed, 104 insertions(+), 92 deletions(-)
diff --git a/cudax/examples/stf/04-fibonacci-run_once.cu b/cudax/examples/stf/04-fibonacci-run_once.cu
index f4975f46b8d..61b6cc3620e 100644
--- a/cudax/examples/stf/04-fibonacci-run_once.cu
+++ b/cudax/examples/stf/04-fibonacci-run_once.cu
@@ -31,7 +31,7 @@ int fibo_ref(int n)
   }
 }
 
-__global__ void add(slice<int> out, const slice<int> in1, const slice<int> in2)
+__global__ void add(slice<int> out, const slice<const int> in1, const slice<const int> in2)
 {
   out(0) = in1(0) + in2(0);
 }
diff --git a/cudax/examples/stf/04-fibonacci.cu b/cudax/examples/stf/04-fibonacci.cu
index c9c2af840b4..c42a3842dd4 100644
--- a/cudax/examples/stf/04-fibonacci.cu
+++ b/cudax/examples/stf/04-fibonacci.cu
@@ -31,7 +31,7 @@ int fibo_ref(int n)
   }
 }
 
-__global__ void add(slice<int> out, const slice<int> in1, const slice<int> in2)
+__global__ void add(slice<int> out, const slice<const int> in1, const slice<const int> in2)
 {
   out(0) = in1(0) + in2(0);
 }
diff --git a/cudax/examples/stf/fdtd_mgpu.cu b/cudax/examples/stf/fdtd_mgpu.cu
index d991c97f258..c10da77045e 100644
--- a/cudax/examples/stf/fdtd_mgpu.cu
+++ b/cudax/examples/stf/fdtd_mgpu.cu
@@ -22,7 +22,7 @@ using namespace cuda::experimental::stf;
 
 // FIXME : MSVC has trouble with box constructors
 #if !_CCCL_COMPILER(MSVC)
-void write_vtk_2D(const std::string& filename, slice<double, 3> Ez, double dx, double dy, double /*unused*/)
+void write_vtk_2D(const std::string& filename, slice<const double, 3> Ez, double dx, double dy, double /*unused*/)
 {
   FILE* f = fopen(filename.c_str(), "w");
 
diff --git a/cudax/examples/stf/heat.cu b/cudax/examples/stf/heat.cu
index 2af2cda6422..b69dfc4ce4f 100644
--- a/cudax/examples/stf/heat.cu
+++ b/cudax/examples/stf/heat.cu
@@ -23,7 +23,7 @@
 
 using namespace cuda::experimental::stf;
 
-void dump_iter(slice<double, 2> sUn, int iter)
+void dump_iter(slice<const double, 2> sUn, int iter)
 {
   /* Create a binary file in the PPM format */
   char name[64];
diff --git a/cudax/examples/stf/heat_mgpu.cu b/cudax/examples/stf/heat_mgpu.cu
index a4befc2a05b..1fc2ade2e03 100644
--- a/cudax/examples/stf/heat_mgpu.cu
+++ b/cudax/examples/stf/heat_mgpu.cu
@@ -20,7 +20,7 @@
 
 using namespace cuda::experimental::stf;
 
-void dump_iter(slice<double, 2> sUn, int iter)
+void dump_iter(slice<const double, 2> sUn, int iter)
 {
   /* Create a binary file in the PPM format */
   char name[64];
diff --git a/cudax/examples/stf/parallel_for_2D.cu b/cudax/examples/stf/parallel_for_2D.cu
index 42cc2ccead0..ad82b867a38 100644
--- a/cudax/examples/stf/parallel_for_2D.cu
+++ b/cudax/examples/stf/parallel_for_2D.cu
@@ -53,21 +53,22 @@ int main()
     }
   };
 
-  ctx.parallel_for(exec_place::host, ly.shape(), ly.read())->*[=] __host__(size_t i, size_t j, slice<double, 2> sy) {
-    double expected = y0(i, j);
-    for (size_t ii = 0; ii < 2; ii++)
-    {
-      for (size_t jj = 0; jj < 2; jj++)
-      {
-        expected += x0(2 * i + ii, 2 * j + jj);
-      }
-    }
+  ctx.parallel_for(exec_place::host, ly.shape(), ly.read())
+      ->*[=] __host__(size_t i, size_t j, slice<const double, 2> sy) {
+            double expected = y0(i, j);
+            for (size_t ii = 0; ii < 2; ii++)
+            {
+              for (size_t jj = 0; jj < 2; jj++)
+              {
+                expected += x0(2 * i + ii, 2 * j + jj);
+              }
+            }
 
-    if (fabs(sy(i, j) - expected) > 0.001)
-    {
-      printf("sy(%zu, %zu) %f expect %f\n", i, j, sy(i, j), expected);
-    }
-  };
+            if (fabs(sy(i, j) - expected) > 0.001)
+            {
+              printf("sy(%zu, %zu) %f expect %f\n", i, j, sy(i, j), expected);
+            }
+          };
 
   ctx.finalize();
 }
diff --git a/cudax/examples/stf/standalone-launches.cu b/cudax/examples/stf/standalone-launches.cu
index 7717ab3925c..83ad6fc291b 100644
--- a/cudax/examples/stf/standalone-launches.cu
+++ b/cudax/examples/stf/standalone-launches.cu
@@ -42,12 +42,12 @@ int main()
   auto handle_Z = ctx.logical_data(Z, {N});
 
   ctx.task(handle_X.read(), handle_Y.write(), handle_Z.write())
-      ->*[](cudaStream_t s, slice<int> x, slice<int> y, slice<int> z) {
+      ->*[](cudaStream_t s, slice<const int> x, slice<int> y, slice<int> z) {
             std::vector<cudaStream_t> streams;
             streams.push_back(s);
             auto spec = par(1024);
             reserved::launch(spec, exec_place::current_device(), streams, std::tuple{x, y})
-                ->*[] _CCCL_DEVICE(auto t, slice<int> x, slice<int> y) {
+                ->*[] _CCCL_DEVICE(auto t, slice<const int> x, slice<int> y) {
                       size_t tid      = t.rank();
                       size_t nthreads = t.size();
                       for (size_t ind = tid; ind < N; ind += nthreads)
diff --git a/cudax/include/cuda/experimental/__algorithm/copy.cuh b/cudax/include/cuda/experimental/__algorithm/copy.cuh
index e2c7c73d51a..d084adfc5da 100644
--- a/cudax/include/cuda/experimental/__algorithm/copy.cuh
+++ b/cudax/include/cuda/experimental/__algorithm/copy.cuh
@@ -75,20 +75,6 @@ void copy_bytes(stream_ref __stream, _SrcTy&& __src, _DstTy&& __dst)
       detail::__launch_transform(__stream, _CUDA_VSTD::forward<_DstTy>(__dst)))));
 }
 
-template <typename _Extents, typename _OtherExtents>
-inline constexpr bool __copy_bytes_compatible_extents = false;
-
-template <typename _IndexType,
-          _CUDA_VSTD::size_t... _Extents,
-          typename _OtherIndexType,
-          _CUDA_VSTD::size_t... _OtherExtents>
-inline constexpr bool __copy_bytes_compatible_extents<_CUDA_VSTD::extents<_IndexType, _Extents...>,
-                                                      _CUDA_VSTD::extents<_OtherIndexType, _OtherExtents...>> =
-  decltype(_CUDA_VSTD::__detail::__check_compatible_extents(
-    _CUDA_VSTD::integral_constant<bool, sizeof...(_Extents) == sizeof...(_OtherExtents)>{},
-    _CUDA_VSTD::integer_sequence<size_t, _Extents...>{},
-    _CUDA_VSTD::integer_sequence<size_t, _OtherExtents...>{}))::value;
-
 template <typename _SrcExtents, typename _DstExtents>
 _CCCL_NODISCARD bool __copy_bytes_runtime_extents_match(_SrcExtents __src_exts, _DstExtents __dst_exts)
 {
@@ -116,7 +102,7 @@ void __nd_copy_bytes_impl(stream_ref __stream,
                           _CUDA_VSTD::mdspan<_SrcElem, _SrcExtents, _SrcLayout, _SrcAccessor> __src,
                           _CUDA_VSTD::mdspan<_DstElem, _DstExtents, _DstLayout, _DstAccessor> __dst)
 {
-  static_assert(__copy_bytes_compatible_extents<_SrcExtents, _DstExtents>,
+  static_assert(_CUDA_VSTD::is_constructible_v<_DstExtents, _SrcExtents>,
                 "Multidimensional copy requires both source and destination extents to be compatible");
   static_assert(_CUDA_VSTD::is_same_v<_SrcLayout, _DstLayout>,
                 "Multidimensional copy requires both source and destination layouts to match");
diff --git a/cudax/include/cuda/experimental/__hierarchy/dimensions.cuh b/cudax/include/cuda/experimental/__hierarchy/dimensions.cuh
index ecb0f8c6d6f..701d1bf3894 100644
--- a/cudax/include/cuda/experimental/__hierarchy/dimensions.cuh
+++ b/cudax/include/cuda/experimental/__hierarchy/dimensions.cuh
@@ -11,6 +11,7 @@
 #ifndef _CUDAX__HIERARCHY_DIMENSIONS
 #define _CUDAX__HIERARCHY_DIMENSIONS
 
+#include <cuda/std/functional>
 #include <cuda/std/mdspan>
 
 #if _CCCL_STD_VER >= 2017
diff --git a/cudax/include/cuda/experimental/__stf/graph/graph_ctx.cuh b/cudax/include/cuda/experimental/__stf/graph/graph_ctx.cuh
index 5db6a7c8954..8cad2af8bc8 100644
--- a/cudax/include/cuda/experimental/__stf/graph/graph_ctx.cuh
+++ b/cudax/include/cuda/experimental/__stf/graph/graph_ctx.cuh
@@ -979,14 +979,14 @@ inline void unit_test_graph_epoch_3()
     if ((k % 2) == 0)
     {
       ctx.parallel_for(blocked_partition(), exec_place::current_device(), lA.shape(), lA.rw(), lB.read())
-          ->*[] _CCCL_HOST_DEVICE(size_t i, slice<double> A, slice<double> B) {
+          ->*[] _CCCL_HOST_DEVICE(size_t i, slice<double> A, slice<const double> B) {
                 A(i) = cos(B(i));
               };
     }
     else
     {
       ctx.parallel_for(blocked_partition(), exec_place::current_device(), lA.shape(), lA.read(), lB.rw())
-          ->*[] _CCCL_HOST_DEVICE(size_t i, slice<double> A, slice<double> B) {
+          ->*[] _CCCL_HOST_DEVICE(size_t i, slice<const double> A, slice<double> B) {
                 B(i) = sin(A(i));
               };
     }
diff --git a/cudax/include/cuda/experimental/__stf/internal/data_interface.cuh b/cudax/include/cuda/experimental/__stf/internal/data_interface.cuh
index c2d91313451..ce7cd47b719 100644
--- a/cudax/include/cuda/experimental/__stf/internal/data_interface.cuh
+++ b/cudax/include/cuda/experimental/__stf/internal/data_interface.cuh
@@ -26,6 +26,8 @@
 #  pragma system_header
 #endif // no system header
 
+#include <cuda/std/type_traits>
+
 #include <cuda/experimental/__stf/allocators/block_allocator.cuh>
 #include <cuda/experimental/__stf/internal/msir.cuh>
 #include <cuda/experimental/__stf/internal/slice.cuh>
@@ -59,32 +61,28 @@ struct readonly_type_of
 };
 
 // Specialization of `readonly_type_of` for `mdspan`.
-template <typename T, typename... More>
-struct readonly_type_of<mdspan<T, More...>>
+template <typename T, typename Extents, typename Layout, template <typename> class Accessor>
+struct readonly_type_of<mdspan<T, Extents, Layout, Accessor<T>>>
 {
-  using type = mdspan<const T, More...>;
+  using type = mdspan<::cuda::std::add_const_t<T>, Extents, Layout, Accessor<::cuda::std::add_const_t<T>>>;
 };
 
 // Helper struct to deduce read-write types.
 template <typename T>
 struct rw_type_of
 {
-  using type = ::std::remove_const_t<T>;
+  using type = ::cuda::std::remove_const_t<T>;
 };
 
 // Specialization of rw_type_of for `mdspan`.
-template <typename T, typename... More>
-struct rw_type_of<mdspan<const T, More...>>
+template <typename T, typename Extents, typename Layout, template <typename> class Accessor>
+struct rw_type_of<mdspan<const T, Extents, Layout, Accessor<const T>>>
 {
-  using type = mdspan<T, More...>;
+  using type = mdspan<T, Extents, Layout, Accessor<T>>;
 };
 
-// Specialization of rw_type_of for const mdspan.
-template <typename T, typename... More>
-struct rw_type_of<const mdspan<const T, More...>>
-{
-  using type = mdspan<T, More...>;
-};
+template <class T>
+inline constexpr bool always_false = false;
 
 } // namespace reserved
 
@@ -93,14 +91,39 @@ struct rw_type_of<const mdspan<const T, More...>>
  * @tparam T Type to process
  */
 template <typename T>
-using readonly_type_of = typename reserved::readonly_type_of<T>::type;
+using readonly_type_of = typename reserved::readonly_type_of<::cuda::std::remove_cvref_t<T>>::type;
 
 /**
  * @brief Given a type `T`, returns the inverse of `constify`.
  * @tparam T Type to process
  */
 template <typename T>
-using rw_type_of = typename reserved::rw_type_of<T>::type;
+using rw_type_of = typename reserved::rw_type_of<::cuda::std::remove_cvref_t<T>>::type;
+
+template <typename T>
+rw_type_of<T> to_rw_type_of(T t)
+{
+  return rw_type_of<T>{t};
+}
+
+template <typename T, typename Extents, typename Layout, template <typename> class Accessor>
+mdspan<T, Extents, Layout, Accessor<T>> to_rw_type_of(mdspan<const T, Extents, Layout, Accessor<const T>> md)
+{
+  if constexpr (_CUDA_VSTD::is_default_constructible_v<Accessor<T>>)
+  {
+    return mdspan<T, Extents, Layout, Accessor<T>>{const_cast<T*>(md.data_handle()), md.mapping()};
+  }
+  else if constexpr (_CUDA_VSTD::is_constructible_v<Accessor<T>, const Accessor<const T>&>)
+  {
+    return mdspan<T, Extents, Layout, Accessor<T>>{
+      const_cast<T*>(md.data_handle()), md.mapping(), Accessor<T>{md.accessor()}};
+  }
+  else
+  {
+    static_assert(reserved::always_false<T>, "Need to implement the conversion of Accessor<T> to Accessor<const T>");
+  }
+  _CCCL_UNREACHABLE();
+}
 
 namespace reserved
 {
diff --git a/cudax/include/cuda/experimental/stf.cuh b/cudax/include/cuda/experimental/stf.cuh
index d76494aa9ff..267a0dbd4b1 100644
--- a/cudax/include/cuda/experimental/stf.cuh
+++ b/cudax/include/cuda/experimental/stf.cuh
@@ -1204,7 +1204,7 @@ UNITTEST("context task")
 
   ctx.task(la.read(), lb.write())->*[](auto s, auto a, auto b) {
     // no-op
-    cudaMemcpyAsync(&a(0), &b(0), sizeof(int), cudaMemcpyDeviceToDevice, s);
+    cudaMemcpyAsync(&b(0), &a(0), sizeof(int), cudaMemcpyDeviceToDevice, s);
   };
 
   ctx.finalize();
@@ -1527,7 +1527,7 @@ public:
         // Our infrastructure currently does not like to work with
         // constant types for the data interface so we pretend this is
         // a modifiable data if necessary
-        return gctx.logical_data(rw_type_of<decltype(x)>(x), current_place.affine_data_place());
+        return gctx.logical_data(to_rw_type_of(x), current_place.affine_data_place());
       };
 
       // Transform the tuple of instances into a tuple of logical data
@@ -1688,7 +1688,7 @@ public:
       // Our infrastructure currently does not like to work with constant
       // types for the data interface so we pretend this is a modifiable
       // data if necessary
-      return gctx.logical_data(rw_type_of<decltype(x)>(x), current_place.affine_data_place());
+      return gctx.logical_data(to_rw_type_of(x), current_place.affine_data_place());
     };
 
     // Transform the tuple of instances into a tuple of logical data
@@ -1849,7 +1849,7 @@ public:
       (void) data_per_iteration;
 
       auto logify = [](auto& dest_ctx, auto x) {
-        return dest_ctx.logical_data(rw_type_of<decltype(x)>(x), exec_place::current_device().affine_data_place());
+        return dest_ctx.logical_data(to_rw_type_of(x), exec_place::current_device().affine_data_place());
       };
 
       for (size_t i = start; i < end; i++)
diff --git a/cudax/test/stf/examples/09-nbody-blocked.cu b/cudax/test/stf/examples/09-nbody-blocked.cu
index eca18b2df60..35a168d8637 100644
--- a/cudax/test/stf/examples/09-nbody-blocked.cu
+++ b/cudax/test/stf/examples/09-nbody-blocked.cu
@@ -49,7 +49,7 @@ void writeVTKFile(context& ctx,
 
   for (size_t b = 0; b < parts.size(); b++)
   {
-    ctx.task(exec_place::host, parts[b].read())->*[&](cudaStream_t s, slice<body> p) {
+    ctx.task(exec_place::host, parts[b].read())->*[&](cudaStream_t s, slice<const body> p) {
       cuda_safe_call(cudaStreamSynchronize(s));
       for (size_t i = 0; i < p.size(); i++)
       {
@@ -210,7 +210,7 @@ int main(int argc, char** argv)
       {
         ctx.launch(exec_place::device(b % ngpus), parts[b].read(), parts[b_other].read(), acc_parts[b].rw())
             //.set_symbol("compute_acc")
-            ->*[=] _CCCL_DEVICE(auto t, slice<body> p, slice<body> p_other, slice<double, 2> acc) {
+            ->*[=] _CCCL_DEVICE(auto t, slice<const body> p, slice<const body> p_other, slice<double, 2> acc) {
                   for (size_t i = t.rank(); i < p.extent(0); i += t.size())
                   {
                     for (size_t j = 0; j < p_other.extent(0); j++)
@@ -242,7 +242,7 @@ int main(int argc, char** argv)
       // Update velocity and positions
       ctx.launch(exec_place::device(b % ngpus), parts[b].rw(), acc_parts[b].read())
           //.set_symbol("update")
-          ->*[=] _CCCL_DEVICE(auto t, slice<body> p, slice<double, 2> acc) {
+          ->*[=] _CCCL_DEVICE(auto t, slice<body> p, slice<const double, 2> acc) {
                 for (size_t i = t.rank(); i < p.extent(0); i += t.size())
                 {
                   for (size_t k = 0; k < 3; k++)
diff --git a/cudax/test/stf/interface/data_from_device_async.cu b/cudax/test/stf/interface/data_from_device_async.cu
index d0f026c5c7a..d6659a305e9 100644
--- a/cudax/test/stf/interface/data_from_device_async.cu
+++ b/cudax/test/stf/interface/data_from_device_async.cu
@@ -13,7 +13,7 @@
 using namespace cuda::experimental::stf;
 
 template <typename T>
-__global__ void axpy(int n, T a, T* x, T* y)
+__global__ void axpy(int n, T a, const T* x, T* y)
 {
   int tid      = blockIdx.x * blockDim.x + threadIdx.x;
   int nthreads = gridDim.x * blockDim.x;
diff --git a/cudax/test/stf/interface/graph_use_device_data.cu b/cudax/test/stf/interface/graph_use_device_data.cu
index 0f05003209c..facdf783c43 100644
--- a/cudax/test/stf/interface/graph_use_device_data.cu
+++ b/cudax/test/stf/interface/graph_use_device_data.cu
@@ -14,7 +14,7 @@
 using namespace cuda::experimental::stf;
 
 template <typename T>
-__global__ void axpy(int N, T a, T* x, T* y)
+__global__ void axpy(int N, T a, const T* x, T* y)
 {
   int tid      = blockIdx.x * blockDim.x + threadIdx.x;
   int nthreads = gridDim.x * blockDim.x;
diff --git a/cudax/test/stf/local_stf/interop_cuda.cu b/cudax/test/stf/local_stf/interop_cuda.cu
index ff1f8aa687d..5291a6403be 100644
--- a/cudax/test/stf/local_stf/interop_cuda.cu
+++ b/cudax/test/stf/local_stf/interop_cuda.cu
@@ -18,7 +18,7 @@
 using namespace cuda::experimental::stf;
 
 // B += alpha*A;
-__global__ void axpy(double alpha, double* d_ptrA, double* d_ptrB, size_t N)
+__global__ void axpy(double alpha, const double* d_ptrA, double* d_ptrB, size_t N)
 {
   int tid      = blockIdx.x * blockDim.x + threadIdx.x;
   int nthreads = gridDim.x * blockDim.x;
diff --git a/cudax/test/stf/local_stf/legacy_to_stf.cu b/cudax/test/stf/local_stf/legacy_to_stf.cu
index 2895b6b934c..31b4bf665db 100644
--- a/cudax/test/stf/local_stf/legacy_to_stf.cu
+++ b/cudax/test/stf/local_stf/legacy_to_stf.cu
@@ -33,7 +33,7 @@ __global__ void initB(double* d_ptrB, size_t N)
 }
 
 // B += alpha*A;
-__global__ void axpy(double alpha, double* d_ptrA, double* d_ptrB, size_t N)
+__global__ void axpy(double alpha, const double* d_ptrA, double* d_ptrB, size_t N)
 {
   int tid      = blockIdx.x * blockDim.x + threadIdx.x;
   int nthreads = gridDim.x * blockDim.x;
diff --git a/cudax/test/stf/parallel_for/fdtd.cu b/cudax/test/stf/parallel_for/fdtd.cu
index 686d613d710..bd0052b7e1b 100644
--- a/cudax/test/stf/parallel_for/fdtd.cu
+++ b/cudax/test/stf/parallel_for/fdtd.cu
@@ -15,7 +15,7 @@ using namespace cuda::experimental::stf;
 
 // FIXME : MSVC has trouble with box constructors
 #if !_CCCL_COMPILER(MSVC)
-void write_vtk_2D(const std::string& filename, slice<double, 3> Ez, double dx, double dy, double /*unused*/)
+void write_vtk_2D(const std::string& filename, slice<const double, 3> Ez, double dx, double dy, double /*unused*/)
 {
   FILE* f = fopen(filename.c_str(), "w");
 
diff --git a/cudax/test/stf/parallel_for/test2_parallel_for_context.cu b/cudax/test/stf/parallel_for/test2_parallel_for_context.cu
index 5ff11ffafca..c54be6e85c0 100644
--- a/cudax/test/stf/parallel_for/test2_parallel_for_context.cu
+++ b/cudax/test/stf/parallel_for/test2_parallel_for_context.cu
@@ -53,21 +53,22 @@ int main()
     }
   };
 
-  ctx.parallel_for(exec_place::host, ly.shape(), ly.read())->*[=] __host__(size_t i, size_t j, slice<double, 2> sy) {
-    double expected = y0(i, j);
-    for (size_t ii = 0; ii < 2; ii++)
-    {
-      for (size_t jj = 0; jj < 2; jj++)
-      {
-        expected += x0(2 * i + ii, 2 * j + jj);
-      }
-    }
-    if (fabs(sy(i, j) - expected) > 0.001)
-    {
-      printf("sy(%zu, %zu) %f expect %f\n", i, j, sy(i, j), expected);
-    }
-    //    assert(fabs(sy(i, j) - expected) < 0.001);
-  };
+  ctx.parallel_for(exec_place::host, ly.shape(), ly.read())
+      ->*[=] __host__(size_t i, size_t j, slice<const double, 2> sy) {
+            double expected = y0(i, j);
+            for (size_t ii = 0; ii < 2; ii++)
+            {
+              for (size_t jj = 0; jj < 2; jj++)
+              {
+                expected += x0(2 * i + ii, 2 * j + jj);
+              }
+            }
+            if (fabs(sy(i, j) - expected) > 0.001)
+            {
+              printf("sy(%zu, %zu) %f expect %f\n", i, j, sy(i, j), expected);
+            }
+            //    assert(fabs(sy(i, j) - expected) < 0.001);
+          };
 
   ctx.finalize();
 }
diff --git a/cudax/test/stf/parallel_for/test_parallel_for.cu b/cudax/test/stf/parallel_for/test_parallel_for.cu
index 1688704ff73..b2da9a33cb7 100644
--- a/cudax/test/stf/parallel_for/test_parallel_for.cu
+++ b/cudax/test/stf/parallel_for/test_parallel_for.cu
@@ -94,28 +94,28 @@ int main()
       case 3:
         // This works because it dynamically selects the dual function to run on the host
         ctx.parallel_for(exec_place::host, ly.shape(), lx.read(), ly.rw())
-            ->*[=] __host__ __device__(size_t pos, slice<double> sx, slice<double> sy) {
+            ->*[=] __host__ __device__(size_t pos, slice<const double> sx, slice<double> sy) {
                   sy(pos) += 0.5 * (sx(2 * pos) + sx(2 * pos + 1));
                 };
         break;
       case 4:
         // This works because it dynamically selects the dual function to run on the device
         ctx.parallel_for(exec_place::current_device(), ly.shape(), lx.read(), ly.rw())
-            ->*[=] __host__ __device__(size_t pos, slice<double> sx, slice<double> sy) {
+            ->*[=] __host__ __device__(size_t pos, slice<const double> sx, slice<double> sy) {
                   sy(pos) += 0.5 * (sx(2 * pos) + sx(2 * pos + 1));
                 };
         break;
       case 5:
         // This works because it dynamically selects the dual function to run on the current device
         ctx.parallel_for(ly.shape(), lx.read(), ly.rw())
-            ->*[=] __host__ __device__(size_t pos, slice<double> sx, slice<double> sy) {
+            ->*[=] __host__ __device__(size_t pos, slice<const double> sx, slice<double> sy) {
                   sy(pos) += 0.5 * (sx(2 * pos) + sx(2 * pos + 1));
                 };
         break;
       case 6:
         // This works because it dispatches on all devices
         ctx.parallel_for(blocked_partition(), exec_place::all_devices(), ly.shape(), lx.read(), ly.rw())
-            ->*[=] __host__ __device__(size_t pos, slice<double> sx, slice<double> sy) {
+            ->*[=] __host__ __device__(size_t pos, slice<const double> sx, slice<double> sy) {
                   sy(pos) += 0.5 * (sx(2 * pos) + sx(2 * pos + 1));
                 };
         break;
diff --git a/cudax/test/stf/parallel_for/tiled_loops.cu b/cudax/test/stf/parallel_for/tiled_loops.cu
index bdc2f7fa8b8..bc429e0a344 100644
--- a/cudax/test/stf/parallel_for/tiled_loops.cu
+++ b/cudax/test/stf/parallel_for/tiled_loops.cu
@@ -66,7 +66,7 @@ int main()
   bool* pchecked = &checked;
 
   /* Check the result on the host */
-  ctx.parallel_for(exec_place::host, ly.shape(), ly.read())->*[=](size_t pos, slice<double> sy) {
+  ctx.parallel_for(exec_place::host, ly.shape(), ly.read())->*[=](size_t pos, slice<const double> sy) {
     int expected = static_cast<int>(ref_tiling(pos, tile_size, nparts));
     int value    = (int) sy(pos);
     if (expected != value)
diff --git a/cudax/test/stf/stencil/stencil-1D.cu b/cudax/test/stf/stencil/stencil-1D.cu
index 843a1fb13d9..a897552a63f 100644
--- a/cudax/test/stf/stencil/stencil-1D.cu
+++ b/cudax/test/stf/stencil/stencil-1D.cu
@@ -63,7 +63,7 @@ public:
 };
 
 template <typename T>
-__global__ void copy_kernel(size_t cnt, T* dst, T* src)
+__global__ void copy_kernel(size_t cnt, T* dst, const T* src)
 {
   for (int idx = threadIdx.x + blockIdx.x * blockDim.x; idx < cnt; idx += blockDim.x * gridDim.x)
   {
@@ -72,7 +72,7 @@ __global__ void copy_kernel(size_t cnt, T* dst, T* src)
 }
 
 template <typename T>
-__global__ void stencil_kernel(size_t cnt, size_t ghost_size, T* array, T* array1)
+__global__ void stencil_kernel(size_t cnt, size_t ghost_size, T* array, const T* array1)
 {
   for (int idx = threadIdx.x + blockIdx.x * blockDim.x; idx < cnt; idx += blockDim.x * gridDim.x)
   {
diff --git a/cudax/test/stf/stress/kernel_chain.cu b/cudax/test/stf/stress/kernel_chain.cu
index 621ac74adaf..dbbcb554fbd 100644
--- a/cudax/test/stf/stress/kernel_chain.cu
+++ b/cudax/test/stf/stress/kernel_chain.cu
@@ -12,7 +12,7 @@
 
 using namespace cuda::experimental::stf;
 
-__global__ void swap(slice<double> dst, const slice<double> src)
+__global__ void swap_kernel(slice<double> dst, slice<double> src)
 {
   size_t tid      = threadIdx.x + blockIdx.x * blockDim.x;
   size_t nthreads = blockDim.x * gridDim.x;
@@ -57,11 +57,11 @@ int main(int argc, char** argv)
   start = std::chrono::steady_clock::now();
   for (size_t iter = 0; iter < iter_cnt; iter++)
   {
-    ctx.task(lX.read(), lY.rw())->*[&](cudaStream_t s, auto dX, auto dY) {
-      swap<<<4, 16, 0, s>>>(dY, dX);
+    ctx.task(lX.rw(), lY.rw())->*[&](cudaStream_t s, auto dX, auto dY) {
+      swap_kernel<<<4, 16, 0, s>>>(dY, dX);
     };
-    ctx.task(lY.read(), lX.rw())->*[&](cudaStream_t s, auto dY, auto dX) {
-      swap<<<4, 16, 0, s>>>(dX, dY);
+    ctx.task(lY.rw(), lX.rw())->*[&](cudaStream_t s, auto dY, auto dX) {
+      swap_kernel<<<4, 16, 0, s>>>(dX, dY);
     };
   }
   stop = std::chrono::steady_clock::now();