Make STF pass
miscco committed Jan 21, 2025
1 parent 5b5570f commit e863ad0
Showing 21 changed files with 93 additions and 61 deletions.
2 changes: 1 addition & 1 deletion cudax/examples/stf/04-fibonacci-run_once.cu
@@ -31,7 +31,7 @@ int fibo_ref(int n)
}
}

-__global__ void add(slice<int> out, const slice<int> in1, const slice<int> in2)
+__global__ void add(slice<int> out, const slice<const int> in1, const slice<const int> in2)
{
out(0) = in1(0) + in2(0);
}
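Note: the signature change above is the pattern this commit applies throughout. Arguments bound through a .read() dependency now arrive as slice<const T> instead of slice<T>, so read-only kernel and lambda parameters must be declared const-qualified. A minimal sketch of a launch site under that assumption (the logical data names lout, la, lb are illustrative, not from this commit):

ctx.task(lout.write(), la.read(), lb.read())
    ->*[](cudaStream_t s, slice<int> out, slice<const int> a, slice<const int> b) {
      // .read() deps bind to slice<const int>; .write() binds to slice<int>
      add<<<1, 1, 0, s>>>(out, a, b);
    };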
2 changes: 1 addition & 1 deletion cudax/examples/stf/04-fibonacci.cu
@@ -31,7 +31,7 @@ int fibo_ref(int n)
}
}

-__global__ void add(slice<int> out, const slice<int> in1, const slice<int> in2)
+__global__ void add(slice<int> out, const slice<const int> in1, const slice<const int> in2)
{
out(0) = in1(0) + in2(0);
}
2 changes: 1 addition & 1 deletion cudax/examples/stf/fdtd_mgpu.cu
@@ -22,7 +22,7 @@ using namespace cuda::experimental::stf;

// FIXME : MSVC has trouble with box constructors
#if !_CCCL_COMPILER(MSVC)
-void write_vtk_2D(const std::string& filename, slice<double, 3> Ez, double dx, double dy, double /*unused*/)
+void write_vtk_2D(const std::string& filename, slice<const double, 3> Ez, double dx, double dy, double /*unused*/)
{
FILE* f = fopen(filename.c_str(), "w");

2 changes: 1 addition & 1 deletion cudax/examples/stf/heat.cu
@@ -23,7 +23,7 @@

using namespace cuda::experimental::stf;

-void dump_iter(slice<double, 2> sUn, int iter)
+void dump_iter(slice<const double, 2> sUn, int iter)
{
/* Create a binary file in the PPM format */
char name[64];
2 changes: 1 addition & 1 deletion cudax/examples/stf/heat_mgpu.cu
@@ -20,7 +20,7 @@

using namespace cuda::experimental::stf;

-void dump_iter(slice<double, 2> sUn, int iter)
+void dump_iter(slice<const double, 2> sUn, int iter)
{
/* Create a binary file in the PPM format */
char name[64];
29 changes: 15 additions & 14 deletions cudax/examples/stf/parallel_for_2D.cu
@@ -53,21 +53,22 @@ int main()
}
};

-ctx.parallel_for(exec_place::host, ly.shape(), ly.read())->*[=] __host__(size_t i, size_t j, slice<double, 2> sy) {
-  double expected = y0(i, j);
-  for (size_t ii = 0; ii < 2; ii++)
-  {
-    for (size_t jj = 0; jj < 2; jj++)
-    {
-      expected += x0(2 * i + ii, 2 * j + jj);
-    }
-  }
+ctx.parallel_for(exec_place::host, ly.shape(), ly.read())
+    ->*[=] __host__(size_t i, size_t j, slice<const double, 2> sy) {
+      double expected = y0(i, j);
+      for (size_t ii = 0; ii < 2; ii++)
+      {
+        for (size_t jj = 0; jj < 2; jj++)
+        {
+          expected += x0(2 * i + ii, 2 * j + jj);
+        }
+      }

-  if (fabs(sy(i, j) - expected) > 0.001)
-  {
-    printf("sy(%zu, %zu) %f expect %f\n", i, j, sy(i, j), expected);
-  }
-};
+      if (fabs(sy(i, j) - expected) > 0.001)
+      {
+        printf("sy(%zu, %zu) %f expect %f\n", i, j, sy(i, j), expected);
+      }
+    };

ctx.finalize();
}
4 changes: 2 additions & 2 deletions cudax/examples/stf/standalone-launches.cu
@@ -42,12 +42,12 @@ int main()
auto handle_Z = ctx.logical_data(Z, {N});

ctx.task(handle_X.read(), handle_Y.write(), handle_Z.write())
-    ->*[](cudaStream_t s, slice<int> x, slice<int> y, slice<int> z) {
+    ->*[](cudaStream_t s, slice<const int> x, slice<int> y, slice<int> z) {
std::vector<cudaStream_t> streams;
streams.push_back(s);
auto spec = par(1024);
reserved::launch(spec, exec_place::current_device(), streams, std::tuple{x, y})
-        ->*[] _CCCL_DEVICE(auto t, slice<int> x, slice<int> y) {
+        ->*[] _CCCL_DEVICE(auto t, slice<const int> x, slice<int> y) {
size_t tid = t.rank();
size_t nthreads = t.size();
for (size_t ind = tid; ind < N; ind += nthreads)
4 changes: 2 additions & 2 deletions cudax/include/cuda/experimental/__stf/graph/graph_ctx.cuh
@@ -958,14 +958,14 @@ inline void unit_test_graph_epoch_3()
if ((k % 2) == 0)
{
ctx.parallel_for(blocked_partition(), exec_place::current_device(), lA.shape(), lA.rw(), lB.read())
-    ->*[] _CCCL_HOST_DEVICE(size_t i, slice<double> A, slice<double> B) {
+    ->*[] _CCCL_HOST_DEVICE(size_t i, slice<double> A, slice<const double> B) {
A(i) = cos(B(i));
};
}
else
{
ctx.parallel_for(blocked_partition(), exec_place::current_device(), lA.shape(), lA.read(), lB.rw())
-    ->*[] _CCCL_HOST_DEVICE(size_t i, slice<double> A, slice<double> B) {
+    ->*[] _CCCL_HOST_DEVICE(size_t i, slice<const double> A, slice<double> B) {
B(i) = sin(A(i));
};
}
30 changes: 30 additions & 0 deletions cudax/include/cuda/experimental/__stf/internal/data_interface.cuh
@@ -26,6 +26,8 @@
# pragma system_header
#endif // no system header

+#include <cuda/std/type_traits>

#include <cuda/experimental/__stf/allocators/block_allocator.cuh>
#include <cuda/experimental/__stf/internal/msir.cuh>
#include <cuda/experimental/__stf/internal/slice.cuh>
@@ -86,6 +88,9 @@ struct rw_type_of<const mdspan<const T, Extents, Layout, Accessor<const T>>>
using type = mdspan<T, Extents, Layout, Accessor<T>>;
};

+template <class T>
+inline constexpr bool always_false = false;

} // namespace reserved

/**
@@ -102,6 +107,31 @@ using readonly_type_of = typename reserved::readonly_type_of<T>::type;
template <typename T>
using rw_type_of = typename reserved::rw_type_of<T>::type;

+template <typename T>
+rw_type_of<T> to_rw_type_of(T t)
+{
+  return rw_type_of<T>{t};
+}
+
+template <typename T, typename Extents, typename Layout, template <typename> class Accessor>
+mdspan<T, Extents, Layout, Accessor<T>> to_rw_type_of(mdspan<const T, Extents, Layout, Accessor<const T>> md)
+{
+  if constexpr (_CUDA_VSTD::is_default_constructible_v<Accessor<T>>)
+  {
+    return mdspan<T, Extents, Layout, Accessor<T>>{const_cast<T*>(md.data_handle()), md.mapping()};
+  }
+  else if constexpr (_CUDA_VSTD::is_constructible_v<Accessor<T>, const Accessor<const T>&>)
+  {
+    return mdspan<T, Extents, Layout, Accessor<T>>{
+      const_cast<T*>(md.data_handle()), md.mapping(), Accessor<T>{md.accessor()}};
+  }
+  else
+  {
+    static_assert(reserved::always_false<T>, "Need to implement the conversion of Accessor<T> to Accessor<const T>");
+  }
+  _CCCL_UNREACHABLE();
+}

namespace reserved
{
template <typename Data>
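The to_rw_type_of helper added above recovers the modifiable counterpart of a read-only data instance: for plain types it forwards through rw_type_of<T>, and for an mdspan over const elements it rebuilds the mdspan with a const_cast data handle, converting the accessor when such a conversion exists (otherwise the reserved::always_false static_assert fires at compile time). A usage sketch, assuming slice<const double> is the usual CUDASTF mdspan alias (variable names are illustrative):

void demo(slice<const double> ro)
{
  // rw_type_of<slice<const double>> is slice<double>; to_rw_type_of
  // produces an instance of it sharing ro's extents and mapping.
  slice<double> rw = to_rw_type_of(ro);
  (void) rw;
}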
10 changes: 5 additions & 5 deletions cudax/include/cuda/experimental/stf.cuh
@@ -1204,7 +1204,7 @@ UNITTEST("context task")

ctx.task(la.read(), lb.write())->*[](auto s, auto a, auto b) {
// no-op
-  cudaMemcpyAsync(&a(0), &b(0), sizeof(int), cudaMemcpyDeviceToDevice, s);
+  cudaMemcpyAsync(&b(0), &a(0), sizeof(int), cudaMemcpyDeviceToDevice, s);
};

ctx.finalize();
@@ -1459,7 +1459,7 @@ private:
runner_impl(context_t& _ctx, algorithm& _alg, task_dep<Deps>... _deps)
: alg(_alg)
, ctx(_ctx)
-    , deps(::std::make_tuple(mv(_deps)...)) {};
+    , deps(::std::make_tuple(mv(_deps)...)){};

template <typename Fun>
void operator->*(Fun&& fun)
@@ -1527,7 +1527,7 @@ public:
// Our infrastructure currently does not like to work with
// constant types for the data interface so we pretend this is
// a modifiable data if necessary
-    return gctx.logical_data(rw_type_of<decltype(x)>(x), current_place.affine_data_place());
+    return gctx.logical_data(to_rw_type_of(x), current_place.affine_data_place());
};

// Transform the tuple of instances into a tuple of logical data
@@ -1688,7 +1688,7 @@ public:
// Our infrastructure currently does not like to work with constant
// types for the data interface so we pretend this is a modifiable
// data if necessary
-    return gctx.logical_data(rw_type_of<decltype(x)>(x), current_place.affine_data_place());
+    return gctx.logical_data(to_rw_type_of(x), current_place.affine_data_place());
};

// Transform the tuple of instances into a tuple of logical data
@@ -1849,7 +1849,7 @@ public:
(void) data_per_iteration;

auto logify = [](auto& dest_ctx, auto x) {
-    return dest_ctx.logical_data(rw_type_of<decltype(x)>(x), exec_place::current_device().affine_data_place());
+    return dest_ctx.logical_data(to_rw_type_of(x), exec_place::current_device().affine_data_place());
};

for (size_t i = start; i < end; i++)
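The logify changes above pair with the adjacent comments: the data interface wants modifiable types, so a read-only instance is wrapped back into its mutable type. The old spelling rw_type_of<decltype(x)>(x) constructed the mutable mdspan type directly from a const view, which stops compiling once .read() instances are genuinely const-qualified, because mdspan cannot convert a const T* data handle to T*. A sketch of the difference (illustrative only, not from the diff):

// slice<const int> ro = ...;            // instance behind a .read() dep
// slice<int> bad = slice<int>{ro};      // ill-formed: const int* -> int*
// slice<int> ok  = to_rw_type_of(ro);   // fine: const_casts the data handle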
6 changes: 3 additions & 3 deletions cudax/test/stf/examples/09-nbody-blocked.cu
@@ -49,7 +49,7 @@ void writeVTKFile(context& ctx,

for (size_t b = 0; b < parts.size(); b++)
{
-    ctx.task(exec_place::host, parts[b].read())->*[&](cudaStream_t s, slice<body> p) {
+    ctx.task(exec_place::host, parts[b].read())->*[&](cudaStream_t s, slice<const body> p) {
cuda_safe_call(cudaStreamSynchronize(s));
for (size_t i = 0; i < p.size(); i++)
{
@@ -210,7 +210,7 @@ int main(int argc, char** argv)
{
ctx.launch(exec_place::device(b % ngpus), parts[b].read(), parts[b_other].read(), acc_parts[b].rw())
//.set_symbol("compute_acc")
-      ->*[=] _CCCL_DEVICE(auto t, slice<body> p, slice<body> p_other, slice<double, 2> acc) {
+      ->*[=] _CCCL_DEVICE(auto t, slice<const body> p, slice<const body> p_other, slice<double, 2> acc) {
for (size_t i = t.rank(); i < p.extent(0); i += t.size())
{
for (size_t j = 0; j < p_other.extent(0); j++)
@@ -242,7 +242,7 @@
// Update velocity and positions
ctx.launch(exec_place::device(b % ngpus), parts[b].rw(), acc_parts[b].read())
//.set_symbol("update")
-      ->*[=] _CCCL_DEVICE(auto t, slice<body> p, slice<double, 2> acc) {
+      ->*[=] _CCCL_DEVICE(auto t, slice<body> p, slice<const double, 2> acc) {
for (size_t i = t.rank(); i < p.extent(0); i += t.size())
{
for (size_t k = 0; k < 3; k++)
2 changes: 1 addition & 1 deletion cudax/test/stf/interface/data_from_device_async.cu
@@ -13,7 +13,7 @@
using namespace cuda::experimental::stf;

template <typename T>
-__global__ void axpy(int n, T a, T* x, T* y)
+__global__ void axpy(int n, T a, const T* x, T* y)
{
int tid = blockIdx.x * blockDim.x + threadIdx.x;
int nthreads = gridDim.x * blockDim.x;
2 changes: 1 addition & 1 deletion cudax/test/stf/interface/graph_use_device_data.cu
@@ -14,7 +14,7 @@
using namespace cuda::experimental::stf;

template <typename T>
-__global__ void axpy(int N, T a, T* x, T* y)
+__global__ void axpy(int N, T a, const T* x, T* y)
{
int tid = blockIdx.x * blockDim.x + threadIdx.x;
int nthreads = gridDim.x * blockDim.x;
2 changes: 1 addition & 1 deletion cudax/test/stf/local_stf/interop_cuda.cu
@@ -18,7 +18,7 @@
using namespace cuda::experimental::stf;

// B += alpha*A;
-__global__ void axpy(double alpha, double* d_ptrA, double* d_ptrB, size_t N)
+__global__ void axpy(double alpha, const double* d_ptrA, double* d_ptrB, size_t N)
{
int tid = blockIdx.x * blockDim.x + threadIdx.x;
int nthreads = gridDim.x * blockDim.x;
2 changes: 1 addition & 1 deletion cudax/test/stf/local_stf/legacy_to_stf.cu
@@ -33,7 +33,7 @@ __global__ void initB(double* d_ptrB, size_t N)
}

// B += alpha*A;
-__global__ void axpy(double alpha, double* d_ptrA, double* d_ptrB, size_t N)
+__global__ void axpy(double alpha, const double* d_ptrA, double* d_ptrB, size_t N)
{
int tid = blockIdx.x * blockDim.x + threadIdx.x;
int nthreads = gridDim.x * blockDim.x;
2 changes: 1 addition & 1 deletion cudax/test/stf/parallel_for/fdtd.cu
@@ -15,7 +15,7 @@ using namespace cuda::experimental::stf;

// FIXME : MSVC has trouble with box constructors
#if !_CCCL_COMPILER(MSVC)
-void write_vtk_2D(const std::string& filename, slice<double, 3> Ez, double dx, double dy, double /*unused*/)
+void write_vtk_2D(const std::string& filename, slice<const double, 3> Ez, double dx, double dy, double /*unused*/)
{
FILE* f = fopen(filename.c_str(), "w");

31 changes: 16 additions & 15 deletions cudax/test/stf/parallel_for/test2_parallel_for_context.cu
@@ -53,21 +53,22 @@ int main()
}
};

-ctx.parallel_for(exec_place::host, ly.shape(), ly.read())->*[=] __host__(size_t i, size_t j, slice<double, 2> sy) {
-  double expected = y0(i, j);
-  for (size_t ii = 0; ii < 2; ii++)
-  {
-    for (size_t jj = 0; jj < 2; jj++)
-    {
-      expected += x0(2 * i + ii, 2 * j + jj);
-    }
-  }
-  if (fabs(sy(i, j) - expected) > 0.001)
-  {
-    printf("sy(%zu, %zu) %f expect %f\n", i, j, sy(i, j), expected);
-  }
-  // assert(fabs(sy(i, j) - expected) < 0.001);
-};
+ctx.parallel_for(exec_place::host, ly.shape(), ly.read())
+    ->*[=] __host__(size_t i, size_t j, slice<const double, 2> sy) {
+      double expected = y0(i, j);
+      for (size_t ii = 0; ii < 2; ii++)
+      {
+        for (size_t jj = 0; jj < 2; jj++)
+        {
+          expected += x0(2 * i + ii, 2 * j + jj);
+        }
+      }
+      if (fabs(sy(i, j) - expected) > 0.001)
+      {
+        printf("sy(%zu, %zu) %f expect %f\n", i, j, sy(i, j), expected);
+      }
+      // assert(fabs(sy(i, j) - expected) < 0.001);
+    };

ctx.finalize();
}
8 changes: 4 additions & 4 deletions cudax/test/stf/parallel_for/test_parallel_for.cu
@@ -94,28 +94,28 @@ int main()
case 3:
// This works because it dynamically selects the dual function to run on the host
ctx.parallel_for(exec_place::host, ly.shape(), lx.read(), ly.rw())
-        ->*[=] __host__ __device__(size_t pos, slice<double> sx, slice<double> sy) {
+        ->*[=] __host__ __device__(size_t pos, slice<const double> sx, slice<double> sy) {
sy(pos) += 0.5 * (sx(2 * pos) + sx(2 * pos + 1));
};
break;
case 4:
// This works because it dynamically selects the dual function to run on the device
ctx.parallel_for(exec_place::current_device(), ly.shape(), lx.read(), ly.rw())
-        ->*[=] __host__ __device__(size_t pos, slice<double> sx, slice<double> sy) {
+        ->*[=] __host__ __device__(size_t pos, slice<const double> sx, slice<double> sy) {
sy(pos) += 0.5 * (sx(2 * pos) + sx(2 * pos + 1));
};
break;
case 5:
// This works because it dynamically selects the dual function to run on the current device
ctx.parallel_for(ly.shape(), lx.read(), ly.rw())
-        ->*[=] __host__ __device__(size_t pos, slice<double> sx, slice<double> sy) {
+        ->*[=] __host__ __device__(size_t pos, slice<const double> sx, slice<double> sy) {
sy(pos) += 0.5 * (sx(2 * pos) + sx(2 * pos + 1));
};
break;
case 6:
// This works because it dispatches on all devices
ctx.parallel_for(blocked_partition(), exec_place::all_devices(), ly.shape(), lx.read(), ly.rw())
-        ->*[=] __host__ __device__(size_t pos, slice<double> sx, slice<double> sy) {
+        ->*[=] __host__ __device__(size_t pos, slice<const double> sx, slice<double> sy) {
sy(pos) += 0.5 * (sx(2 * pos) + sx(2 * pos + 1));
};
break;
2 changes: 1 addition & 1 deletion cudax/test/stf/parallel_for/tiled_loops.cu
@@ -66,7 +66,7 @@ int main()
bool* pchecked = &checked;

/* Check the result on the host */
-  ctx.parallel_for(exec_place::host, ly.shape(), ly.read())->*[=](size_t pos, slice<double> sy) {
+  ctx.parallel_for(exec_place::host, ly.shape(), ly.read())->*[=](size_t pos, slice<const double> sy) {
int expected = static_cast<int>(ref_tiling(pos, tile_size, nparts));
int value = (int) sy(pos);
if (expected != value)
4 changes: 2 additions & 2 deletions cudax/test/stf/stencil/stencil-1D.cu
@@ -63,7 +63,7 @@ public:
};

template <typename T>
-__global__ void copy_kernel(size_t cnt, T* dst, T* src)
+__global__ void copy_kernel(size_t cnt, T* dst, const T* src)
{
for (int idx = threadIdx.x + blockIdx.x * blockDim.x; idx < cnt; idx += blockDim.x * gridDim.x)
{
@@ -72,7 +72,7 @@ __global__ void copy_kernel(size_t cnt, T* dst, T* src)
}

template <typename T>
-__global__ void stencil_kernel(size_t cnt, size_t ghost_size, T* array, T* array1)
+__global__ void stencil_kernel(size_t cnt, size_t ghost_size, T* array, const T* array1)
{
for (int idx = threadIdx.x + blockIdx.x * blockDim.x; idx < cnt; idx += blockDim.x * gridDim.x)
{
(1 of the 21 changed files is not shown above.)
