Make STF pass
miscco committed Jan 21, 2025
1 parent 5b5570f commit e863ad0
Showing 21 changed files with 93 additions and 61 deletions.
2 changes: 1 addition & 1 deletion cudax/examples/stf/04-fibonacci-run_once.cu
@@ -31,7 +31,7 @@ int fibo_ref(int n)
}
}

-__global__ void add(slice<int> out, const slice<int> in1, const slice<int> in2)
+__global__ void add(slice<int> out, const slice<const int> in1, const slice<const int> in2)
{
out(0) = in1(0) + in2(0);
}
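Note: the signature change above is the pattern this commit applies throughout. Arguments bound through a .read() dependency now arrive as slice<const T> instead of slice<T>, so read-only kernel and lambda parameters must be declared const-qualified. A minimal sketch of a launch site under that assumption (the logical data names lout, la, lb are illustrative, not from this commit):

ctx.task(lout.write(), la.read(), lb.read())
    ->*[](cudaStream_t s, slice<int> out, slice<const int> a, slice<const int> b) {
      // .read() deps bind to slice<const int>; .write() binds to slice<int>
      add<<<1, 1, 0, s>>>(out, a, b);
    };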
2 changes: 1 addition & 1 deletion cudax/examples/stf/04-fibonacci.cu
@@ -31,7 +31,7 @@ int fibo_ref(int n)
}
}

-__global__ void add(slice<int> out, const slice<int> in1, const slice<int> in2)
+__global__ void add(slice<int> out, const slice<const int> in1, const slice<const int> in2)
{
out(0) = in1(0) + in2(0);
}
2 changes: 1 addition & 1 deletion cudax/examples/stf/fdtd_mgpu.cu
@@ -22,7 +22,7 @@ using namespace cuda::experimental::stf;

// FIXME : MSVC has trouble with box constructors
#if !_CCCL_COMPILER(MSVC)
-void write_vtk_2D(const std::string& filename, slice<double, 3> Ez, double dx, double dy, double /*unused*/)
+void write_vtk_2D(const std::string& filename, slice<const double, 3> Ez, double dx, double dy, double /*unused*/)
{
FILE* f = fopen(filename.c_str(), "w");

2 changes: 1 addition & 1 deletion cudax/examples/stf/heat.cu
@@ -23,7 +23,7 @@

using namespace cuda::experimental::stf;

-void dump_iter(slice<double, 2> sUn, int iter)
+void dump_iter(slice<const double, 2> sUn, int iter)
{
/* Create a binary file in the PPM format */
char name[64];
2 changes: 1 addition & 1 deletion cudax/examples/stf/heat_mgpu.cu
@@ -20,7 +20,7 @@

using namespace cuda::experimental::stf;

-void dump_iter(slice<double, 2> sUn, int iter)
+void dump_iter(slice<const double, 2> sUn, int iter)
{
/* Create a binary file in the PPM format */
char name[64];
29 changes: 15 additions & 14 deletions cudax/examples/stf/parallel_for_2D.cu
@@ -53,21 +53,22 @@ int main()
}
};

-ctx.parallel_for(exec_place::host, ly.shape(), ly.read())->*[=] __host__(size_t i, size_t j, slice<double, 2> sy) {
-  double expected = y0(i, j);
-  for (size_t ii = 0; ii < 2; ii++)
-  {
-    for (size_t jj = 0; jj < 2; jj++)
-    {
-      expected += x0(2 * i + ii, 2 * j + jj);
-    }
-  }
+ctx.parallel_for(exec_place::host, ly.shape(), ly.read())
+    ->*[=] __host__(size_t i, size_t j, slice<const double, 2> sy) {
+      double expected = y0(i, j);
+      for (size_t ii = 0; ii < 2; ii++)
+      {
+        for (size_t jj = 0; jj < 2; jj++)
+        {
+          expected += x0(2 * i + ii, 2 * j + jj);
+        }
+      }

-  if (fabs(sy(i, j) - expected) > 0.001)
-  {
-    printf("sy(%zu, %zu) %f expect %f\n", i, j, sy(i, j), expected);
-  }
-};
+      if (fabs(sy(i, j) - expected) > 0.001)
+      {
+        printf("sy(%zu, %zu) %f expect %f\n", i, j, sy(i, j), expected);
+      }
+    };

ctx.finalize();
}
4 changes: 2 additions & 2 deletions cudax/examples/stf/standalone-launches.cu
@@ -42,12 +42,12 @@ int main()
auto handle_Z = ctx.logical_data(Z, {N});

ctx.task(handle_X.read(), handle_Y.write(), handle_Z.write())
-    ->*[](cudaStream_t s, slice<int> x, slice<int> y, slice<int> z) {
+    ->*[](cudaStream_t s, slice<const int> x, slice<int> y, slice<int> z) {
std::vector<cudaStream_t> streams;
streams.push_back(s);
auto spec = par(1024);
reserved::launch(spec, exec_place::current_device(), streams, std::tuple{x, y})
-        ->*[] _CCCL_DEVICE(auto t, slice<int> x, slice<int> y) {
+        ->*[] _CCCL_DEVICE(auto t, slice<const int> x, slice<int> y) {
size_t tid = t.rank();
size_t nthreads = t.size();
for (size_t ind = tid; ind < N; ind += nthreads)
4 changes: 2 additions & 2 deletions cudax/include/cuda/experimental/__stf/graph/graph_ctx.cuh
@@ -958,14 +958,14 @@ inline void unit_test_graph_epoch_3()
if ((k % 2) == 0)
{
ctx.parallel_for(blocked_partition(), exec_place::current_device(), lA.shape(), lA.rw(), lB.read())
-    ->*[] _CCCL_HOST_DEVICE(size_t i, slice<double> A, slice<double> B) {
+    ->*[] _CCCL_HOST_DEVICE(size_t i, slice<double> A, slice<const double> B) {
A(i) = cos(B(i));
};
}
else
{
ctx.parallel_for(blocked_partition(), exec_place::current_device(), lA.shape(), lA.read(), lB.rw())
-    ->*[] _CCCL_HOST_DEVICE(size_t i, slice<double> A, slice<double> B) {
+    ->*[] _CCCL_HOST_DEVICE(size_t i, slice<const double> A, slice<double> B) {
B(i) = sin(A(i));
};
}
30 changes: 30 additions & 0 deletions cudax/include/cuda/experimental/__stf/internal/data_interface.cuh
@@ -26,6 +26,8 @@
# pragma system_header
#endif // no system header

+#include <cuda/std/type_traits>

#include <cuda/experimental/__stf/allocators/block_allocator.cuh>
#include <cuda/experimental/__stf/internal/msir.cuh>
#include <cuda/experimental/__stf/internal/slice.cuh>
@@ -86,6 +88,9 @@ struct rw_type_of<const mdspan<const T, Extents, Layout, Accessor<const T>>>
using type = mdspan<T, Extents, Layout, Accessor<T>>;
};

+template <class T>
+inline constexpr bool always_false = false;

} // namespace reserved

/**
@@ -102,6 +107,31 @@ using readonly_type_of = typename reserved::readonly_type_of<T>::type;
template <typename T>
using rw_type_of = typename reserved::rw_type_of<T>::type;

+template <typename T>
+rw_type_of<T> to_rw_type_of(T t)
+{
+  return rw_type_of<T>{t};
+}
+
+template <typename T, typename Extents, typename Layout, template <typename> class Accessor>
+mdspan<T, Extents, Layout, Accessor<T>> to_rw_type_of(mdspan<const T, Extents, Layout, Accessor<const T>> md)
+{
+  if constexpr (_CUDA_VSTD::is_default_constructible_v<Accessor<T>>)
+  {
+    return mdspan<T, Extents, Layout, Accessor<T>>{const_cast<T*>(md.data_handle()), md.mapping()};
+  }
+  else if constexpr (_CUDA_VSTD::is_constructible_v<Accessor<T>, const Accessor<const T>&>)
+  {
+    return mdspan<T, Extents, Layout, Accessor<T>>{
+      const_cast<T*>(md.data_handle()), md.mapping(), Accessor<T>{md.accessor()}};
+  }
+  else
+  {
+    static_assert(reserved::always_false<T>, "Need to implement the conversion of Accessor<T> to Accessor<const T>");
+  }
+  _CCCL_UNREACHABLE();
+}

namespace reserved
{
template <typename Data>
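The to_rw_type_of helper added above recovers the modifiable counterpart of a read-only data instance: for plain types it forwards through rw_type_of<T>, and for an mdspan over const elements it rebuilds the mdspan with a const_cast data handle, converting the accessor when such a conversion exists (otherwise the reserved::always_false static_assert fires at compile time). A usage sketch, assuming slice<const double> is the usual CUDASTF mdspan alias (variable names are illustrative):

void demo(slice<const double> ro)
{
  // rw_type_of<slice<const double>> is slice<double>; to_rw_type_of
  // produces an instance of it sharing ro's extents and mapping.
  slice<double> rw = to_rw_type_of(ro);
  (void) rw;
}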
10 changes: 5 additions & 5 deletions cudax/include/cuda/experimental/stf.cuh
@@ -1204,7 +1204,7 @@ UNITTEST("context task")

ctx.task(la.read(), lb.write())->*[](auto s, auto a, auto b) {
// no-op
-  cudaMemcpyAsync(&a(0), &b(0), sizeof(int), cudaMemcpyDeviceToDevice, s);
+  cudaMemcpyAsync(&b(0), &a(0), sizeof(int), cudaMemcpyDeviceToDevice, s);
};

ctx.finalize();
@@ -1459,7 +1459,7 @@ private:
runner_impl(context_t& _ctx, algorithm& _alg, task_dep<Deps>... _deps)
: alg(_alg)
, ctx(_ctx)
-    , deps(::std::make_tuple(mv(_deps)...)) {};
+    , deps(::std::make_tuple(mv(_deps)...)){};

template <typename Fun>
void operator->*(Fun&& fun)
@@ -1527,7 +1527,7 @@ public:
// Our infrastructure currently does not like to work with
// constant types for the data interface so we pretend this is
// a modifiable data if necessary
-    return gctx.logical_data(rw_type_of<decltype(x)>(x), current_place.affine_data_place());
+    return gctx.logical_data(to_rw_type_of(x), current_place.affine_data_place());
};

// Transform the tuple of instances into a tuple of logical data
@@ -1688,7 +1688,7 @@ public:
// Our infrastructure currently does not like to work with constant
// types for the data interface so we pretend this is a modifiable
// data if necessary
-    return gctx.logical_data(rw_type_of<decltype(x)>(x), current_place.affine_data_place());
+    return gctx.logical_data(to_rw_type_of(x), current_place.affine_data_place());
};

// Transform the tuple of instances into a tuple of logical data
@@ -1849,7 +1849,7 @@ public:
(void) data_per_iteration;

auto logify = [](auto& dest_ctx, auto x) {
-    return dest_ctx.logical_data(rw_type_of<decltype(x)>(x), exec_place::current_device().affine_data_place());
+    return dest_ctx.logical_data(to_rw_type_of(x), exec_place::current_device().affine_data_place());
};

for (size_t i = start; i < end; i++)
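The logify changes above pair with the adjacent comments: the data interface wants modifiable types, so a read-only instance is wrapped back into its mutable type. The old spelling rw_type_of<decltype(x)>(x) constructed the mutable mdspan type directly from a const view, which stops compiling once .read() instances are genuinely const-qualified, because mdspan cannot convert a const T* data handle to T*. A sketch of the difference (illustrative only, not from the diff):

// slice<const int> ro = ...;            // instance behind a .read() dep
// slice<int> bad = slice<int>{ro};      // ill-formed: const int* -> int*
// slice<int> ok  = to_rw_type_of(ro);   // fine: const_casts the data handle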
6 changes: 3 additions & 3 deletions cudax/test/stf/examples/09-nbody-blocked.cu
@@ -49,7 +49,7 @@ void writeVTKFile(context& ctx,

for (size_t b = 0; b < parts.size(); b++)
{
-    ctx.task(exec_place::host, parts[b].read())->*[&](cudaStream_t s, slice<body> p) {
+    ctx.task(exec_place::host, parts[b].read())->*[&](cudaStream_t s, slice<const body> p) {
cuda_safe_call(cudaStreamSynchronize(s));
for (size_t i = 0; i < p.size(); i++)
{
@@ -210,7 +210,7 @@ int main(int argc, char** argv)
{
ctx.launch(exec_place::device(b % ngpus), parts[b].read(), parts[b_other].read(), acc_parts[b].rw())
//.set_symbol("compute_acc")
-      ->*[=] _CCCL_DEVICE(auto t, slice<body> p, slice<body> p_other, slice<double, 2> acc) {
+      ->*[=] _CCCL_DEVICE(auto t, slice<const body> p, slice<const body> p_other, slice<double, 2> acc) {
for (size_t i = t.rank(); i < p.extent(0); i += t.size())
{
for (size_t j = 0; j < p_other.extent(0); j++)
@@ -242,7 +242,7 @@
// Update velocity and positions
ctx.launch(exec_place::device(b % ngpus), parts[b].rw(), acc_parts[b].read())
//.set_symbol("update")
-      ->*[=] _CCCL_DEVICE(auto t, slice<body> p, slice<double, 2> acc) {
+      ->*[=] _CCCL_DEVICE(auto t, slice<body> p, slice<const double, 2> acc) {
for (size_t i = t.rank(); i < p.extent(0); i += t.size())
{
for (size_t k = 0; k < 3; k++)
2 changes: 1 addition & 1 deletion cudax/test/stf/interface/data_from_device_async.cu
@@ -13,7 +13,7 @@
using namespace cuda::experimental::stf;

template <typename T>
-__global__ void axpy(int n, T a, T* x, T* y)
+__global__ void axpy(int n, T a, const T* x, T* y)
{
int tid = blockIdx.x * blockDim.x + threadIdx.x;
int nthreads = gridDim.x * blockDim.x;
2 changes: 1 addition & 1 deletion cudax/test/stf/interface/graph_use_device_data.cu
@@ -14,7 +14,7 @@
using namespace cuda::experimental::stf;

template <typename T>
-__global__ void axpy(int N, T a, T* x, T* y)
+__global__ void axpy(int N, T a, const T* x, T* y)
{
int tid = blockIdx.x * blockDim.x + threadIdx.x;
int nthreads = gridDim.x * blockDim.x;
2 changes: 1 addition & 1 deletion cudax/test/stf/local_stf/interop_cuda.cu
@@ -18,7 +18,7 @@
using namespace cuda::experimental::stf;

// B += alpha*A;
-__global__ void axpy(double alpha, double* d_ptrA, double* d_ptrB, size_t N)
+__global__ void axpy(double alpha, const double* d_ptrA, double* d_ptrB, size_t N)
{
int tid = blockIdx.x * blockDim.x + threadIdx.x;
int nthreads = gridDim.x * blockDim.x;
2 changes: 1 addition & 1 deletion cudax/test/stf/local_stf/legacy_to_stf.cu
@@ -33,7 +33,7 @@ __global__ void initB(double* d_ptrB, size_t N)
}

// B += alpha*A;
-__global__ void axpy(double alpha, double* d_ptrA, double* d_ptrB, size_t N)
+__global__ void axpy(double alpha, const double* d_ptrA, double* d_ptrB, size_t N)
{
int tid = blockIdx.x * blockDim.x + threadIdx.x;
int nthreads = gridDim.x * blockDim.x;
2 changes: 1 addition & 1 deletion cudax/test/stf/parallel_for/fdtd.cu
@@ -15,7 +15,7 @@ using namespace cuda::experimental::stf;

// FIXME : MSVC has trouble with box constructors
#if !_CCCL_COMPILER(MSVC)
-void write_vtk_2D(const std::string& filename, slice<double, 3> Ez, double dx, double dy, double /*unused*/)
+void write_vtk_2D(const std::string& filename, slice<const double, 3> Ez, double dx, double dy, double /*unused*/)
{
FILE* f = fopen(filename.c_str(), "w");

31 changes: 16 additions & 15 deletions cudax/test/stf/parallel_for/test2_parallel_for_context.cu
@@ -53,21 +53,22 @@ int main()
}
};

-ctx.parallel_for(exec_place::host, ly.shape(), ly.read())->*[=] __host__(size_t i, size_t j, slice<double, 2> sy) {
-  double expected = y0(i, j);
-  for (size_t ii = 0; ii < 2; ii++)
-  {
-    for (size_t jj = 0; jj < 2; jj++)
-    {
-      expected += x0(2 * i + ii, 2 * j + jj);
-    }
-  }
-  if (fabs(sy(i, j) - expected) > 0.001)
-  {
-    printf("sy(%zu, %zu) %f expect %f\n", i, j, sy(i, j), expected);
-  }
-  // assert(fabs(sy(i, j) - expected) < 0.001);
-};
+ctx.parallel_for(exec_place::host, ly.shape(), ly.read())
+    ->*[=] __host__(size_t i, size_t j, slice<const double, 2> sy) {
+      double expected = y0(i, j);
+      for (size_t ii = 0; ii < 2; ii++)
+      {
+        for (size_t jj = 0; jj < 2; jj++)
+        {
+          expected += x0(2 * i + ii, 2 * j + jj);
+        }
+      }
+      if (fabs(sy(i, j) - expected) > 0.001)
+      {
+        printf("sy(%zu, %zu) %f expect %f\n", i, j, sy(i, j), expected);
+      }
+      // assert(fabs(sy(i, j) - expected) < 0.001);
+    };

ctx.finalize();
}
8 changes: 4 additions & 4 deletions cudax/test/stf/parallel_for/test_parallel_for.cu
@@ -94,28 +94,28 @@ int main()
case 3:
// This works because it dynamically selects the dual function to run on the host
ctx.parallel_for(exec_place::host, ly.shape(), lx.read(), ly.rw())
-        ->*[=] __host__ __device__(size_t pos, slice<double> sx, slice<double> sy) {
+        ->*[=] __host__ __device__(size_t pos, slice<const double> sx, slice<double> sy) {
sy(pos) += 0.5 * (sx(2 * pos) + sx(2 * pos + 1));
};
break;
case 4:
// This works because it dynamically selects the dual function to run on the device
ctx.parallel_for(exec_place::current_device(), ly.shape(), lx.read(), ly.rw())
-        ->*[=] __host__ __device__(size_t pos, slice<double> sx, slice<double> sy) {
+        ->*[=] __host__ __device__(size_t pos, slice<const double> sx, slice<double> sy) {
sy(pos) += 0.5 * (sx(2 * pos) + sx(2 * pos + 1));
};
break;
case 5:
// This works because it dynamically selects the dual function to run on the current device
ctx.parallel_for(ly.shape(), lx.read(), ly.rw())
-        ->*[=] __host__ __device__(size_t pos, slice<double> sx, slice<double> sy) {
+        ->*[=] __host__ __device__(size_t pos, slice<const double> sx, slice<double> sy) {
sy(pos) += 0.5 * (sx(2 * pos) + sx(2 * pos + 1));
};
break;
case 6:
// This works because it dispatches on all devices
ctx.parallel_for(blocked_partition(), exec_place::all_devices(), ly.shape(), lx.read(), ly.rw())
-        ->*[=] __host__ __device__(size_t pos, slice<double> sx, slice<double> sy) {
+        ->*[=] __host__ __device__(size_t pos, slice<const double> sx, slice<double> sy) {
sy(pos) += 0.5 * (sx(2 * pos) + sx(2 * pos + 1));
};
break;
2 changes: 1 addition & 1 deletion cudax/test/stf/parallel_for/tiled_loops.cu
@@ -66,7 +66,7 @@ int main()
bool* pchecked = &checked;

/* Check the result on the host */
-  ctx.parallel_for(exec_place::host, ly.shape(), ly.read())->*[=](size_t pos, slice<double> sy) {
+  ctx.parallel_for(exec_place::host, ly.shape(), ly.read())->*[=](size_t pos, slice<const double> sy) {
int expected = static_cast<int>(ref_tiling(pos, tile_size, nparts));
int value = (int) sy(pos);
if (expected != value)
4 changes: 2 additions & 2 deletions cudax/test/stf/stencil/stencil-1D.cu
@@ -63,7 +63,7 @@ public:
};

template <typename T>
-__global__ void copy_kernel(size_t cnt, T* dst, T* src)
+__global__ void copy_kernel(size_t cnt, T* dst, const T* src)
{
for (int idx = threadIdx.x + blockIdx.x * blockDim.x; idx < cnt; idx += blockDim.x * gridDim.x)
{
@@ -72,7 +72,7 @@ __global__ void copy_kernel(size_t cnt, T* dst, T* src)
}

template <typename T>
-__global__ void stencil_kernel(size_t cnt, size_t ghost_size, T* array, T* array1)
+__global__ void stencil_kernel(size_t cnt, size_t ghost_size, T* array, const T* array1)
{
for (int idx = threadIdx.x + blockIdx.x * blockDim.x; idx < cnt; idx += blockDim.x * gridDim.x)
{
(1 of the 21 changed files is not shown above.)
