-
Notifications
You must be signed in to change notification settings - Fork 196
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
Commit message:

* Introduce a mutex to protect the underlying CUDA graph of a graph context so that we can generate tasks concurrently
* Protect CUDA graphs against concurrent accesses
* do test results
* remove dead code
* Add a test with graph capture and threads
* Add and use with_locked_graph, also use weak_ptr
* Big code simplification! No need for shared_ptr/weak_ptr with the mutex that will outlive its users
* use std::reference_wrapper to have graph_task be move-assignable
* replace complicated API based on lambda function with a simple lock guard
* restore a comment removed by mistake
* capture with the lock taken because we need to ensure the captured stream is not used concurrently
* Fix build
* comment why we put a reference_wrapper
* Add a sanity check to ensure we have finished all tasks
* atomic variables are not automatically initialized, so we do set them
* Add missing mutex headers
* improve readability

---------

Co-authored-by: Andrei Alexandrescu <[email protected]>
- Loading branch information
Showing
8 changed files
with
244 additions
and
17 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,85 @@ | ||
//===----------------------------------------------------------------------===// | ||
// | ||
// Part of CUDASTF in CUDA C++ Core Libraries, | ||
// under the Apache License v2.0 with LLVM Exceptions. | ||
// See https://llvm.org/LICENSE.txt for license information. | ||
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception | ||
// SPDX-FileCopyrightText: Copyright (c) 2022-2025 NVIDIA CORPORATION & AFFILIATES. | ||
// | ||
//===----------------------------------------------------------------------===// | ||
|
||
/** | ||
* @file | ||
* | ||
* @brief Ensure a graph_ctx can be used concurrently | ||
* | ||
*/ | ||
|
||
#include <cuda/experimental/stf.cuh> | ||
|
||
#include <mutex> | ||
#include <thread> | ||
|
||
using namespace cuda::experimental::stf; | ||
|
||
/**
 * @brief Grid-stride AXPY kernel: y(i) += a * x(i) for all i in [0, x.size()).
 *
 * Works for any 1D grid/block configuration (grid-stride loop), so the launch
 * configuration only affects performance, not correctness.
 *
 * @param y output slice, updated in place
 * @param x read-only input slice; x.size() elements are processed
 * @param a scalar multiplier
 */
__global__ void axpy(slice<int> y, slice<const int> x, int a)
{
  // Use size_t for indexing: x.size() returns size_t, and a signed `int`
  // index would trigger a signed/unsigned comparison and could overflow
  // for very large slices.
  const size_t tid      = static_cast<size_t>(blockIdx.x) * blockDim.x + threadIdx.x;
  const size_t nthreads = static_cast<size_t>(gridDim.x) * blockDim.x;

  for (size_t i = tid; i < x.size(); i += nthreads)
  {
    y(i) += a * x(i);
  }
}
|
||
/**
 * @brief Worker routine run by each host thread: submits a small task graph
 * (init, 200 axpy tasks, host-side check) into the shared graph context.
 *
 * The graph_ctx is passed by value; all copies refer to the same underlying
 * context, which this test exercises concurrently from multiple threads.
 *
 * @param ctx  shared graph context (copied handle)
 * @param id   thread index (unused, kept for signature clarity)
 */
void mytask(graph_ctx ctx, int /*id*/)
{
  // Problem size per thread (small on purpose: this is a concurrency test,
  // not a performance test).
  const size_t N = 16;

  int alpha = 3;

  // Per-thread logical data; each thread owns its own X and Y.
  auto lX = ctx.logical_data<int>(N);
  auto lY = ctx.logical_data<int>(N);

  // Initialize X(i) = 1 + i and Y(i) = 2 + i*i on the device.
  ctx.parallel_for(lX.shape(), lX.write(), lY.write())->*[] __device__(size_t i, auto x, auto y) {
    x(i) = (1 + i);
    y(i) = (2 + i * i);
  };

  /* Compute Y = Y + alpha X */
  // 200 sequential axpy tasks, each depending on the previous through lY.rw().
  for (size_t i = 0; i < 200; i++)
  {
    ctx.task(lY.rw(), lX.read())->*[alpha](cudaStream_t stream, auto dY, auto dX) {
      axpy<<<128, 64, 0, stream>>>(dY, dX, alpha);
    };
  }

  // Host-side validation: after 200 applications of Y += alpha * X,
  // Y(i) == 2 + i*i + 200 * alpha * (1 + i).
  ctx.host_launch(lX.read(), lY.read())->*[alpha](auto x, auto y) {
    for (size_t i = 0; i < N; i++)
    {
      EXPECT(x(i) == 1 + i);
      EXPECT(y(i) == 2 + i * i + 200 * alpha * x(i));
    }
  };
}
|
||
/**
 * @brief Entry point: spawns several host threads that all generate tasks in
 * the same graph context, then finalizes the context once they are done.
 */
int main()
{
  graph_ctx ctx;

  constexpr int num_workers = 10;

  ::std::vector<::std::thread> workers;
  workers.reserve(num_workers);

  // Every worker gets its own copy of the context handle and a distinct id.
  for (int id = 0; id < num_workers; ++id)
  {
    workers.emplace_back([ctx, id]() { mytask(ctx, id); });
  }

  // All task generation must be complete before finalizing the context.
  for (auto& w : workers)
  {
    w.join();
  }

  ctx.finalize();
}
Oops, something went wrong.