Commit

Merge branch 'main' into fix_barrier_handle_include
miscco authored Jan 28, 2025
2 parents 07a9c53 + c3948c4 commit e0e9a76
Showing 10 changed files with 171 additions and 148 deletions.
5 changes: 3 additions & 2 deletions ci/matrix.yaml

@@ -19,8 +19,9 @@ workflows:
   - {jobs: ['build'], std: 'max', cxx: ['msvc2019']}
   - {jobs: ['build'], std: 'all', cxx: ['gcc', 'clang', 'msvc']}
   # Current CTK testing:
-  - {jobs: ['test'], project: ['libcudacxx', 'thrust'], std: 'max', cxx: ['gcc']}
-  - {jobs: ['test'], project: ['libcudacxx', 'thrust'], std: 'max', cxx: ['clang', 'msvc']}
+  - {jobs: ['test'], project: ['libcudacxx', 'thrust'], std: 'max', cxx: ['gcc', 'clang']}
+  # Disabled until we figure out the issue with the TBB dll
+  #- {jobs: ['test'], project: ['libcudacxx', 'thrust'], std: 'max', cxx: ['msvc']}
   # Split up cub tests:
   - {jobs: ['test_nolid', 'test_lid0'], project: ['cub'], std: 'max', cxx: ['gcc']}
   - {jobs: ['test_lid1', 'test_lid2'], project: ['cub'], std: 'max', cxx: ['gcc']}
20 changes: 10 additions & 10 deletions cudax/examples/stf/linear_algebra/06-pdgemm.cu

@@ -160,9 +160,9 @@ public:
   {
     nvtxRangePushA("FILL");
     // Fill blocks by blocks
-    for (int colb = 0; colb < nt; colb++)
+    for (size_t colb = 0; colb < nt; colb++)
     {
-      for (int rowb = 0; rowb < mt; rowb++)
+      for (size_t rowb = 0; rowb < mt; rowb++)
       {
         // Each task fills a block
         auto& h = get_handle(rowb, colb);
@@ -251,9 +251,9 @@ void PDGEMM(stream_ctx& ctx,
             double beta,
             matrix<double>& C)
 {
-  for (int m = 0; m < C.mt; m++)
+  for (size_t m = 0; m < C.mt; m++)
   {
-    for (int n = 0; n < C.nt; n++)
+    for (size_t n = 0; n < C.nt; n++)
     {
       //=========================================
       // alpha*A*B does not contribute; scale C
@@ -271,7 +271,7 @@ void PDGEMM(stream_ctx& ctx,
         if (transb == CUBLAS_OP_N)
         {
           assert(A.nt == B.mt);
-          for (int k = 0; k < A.nt; k++)
+          for (size_t k = 0; k < A.nt; k++)
           {
             double zbeta = k == 0 ? beta : 1.0;
             DGEMM(ctx, transa, transb, alpha, A, m, k, B, k, n, zbeta, C, m, n);
@@ -282,7 +282,7 @@ void PDGEMM(stream_ctx& ctx,
         //=====================================
         else
         {
-          for (int k = 0; k < A.nt; k++)
+          for (size_t k = 0; k < A.nt; k++)
           {
             double zbeta = k == 0 ? beta : 1.0;
             DGEMM(ctx, transa, transb, alpha, A, m, k, B, n, k, zbeta, C, m, n);
@@ -296,7 +296,7 @@ void PDGEMM(stream_ctx& ctx,
         //=====================================
         if (transb == CUBLAS_OP_N)
         {
-          for (int k = 0; k < A.mt; k++)
+          for (size_t k = 0; k < A.mt; k++)
           {
             double zbeta = k == 0 ? beta : 1.0;
             DGEMM(ctx, transa, transb, alpha, A, k, m, B, k, n, zbeta, C, m, n);
@@ -307,7 +307,7 @@ void PDGEMM(stream_ctx& ctx,
         //==========================================
         else
         {
-          for (int k = 0; k < A.mt; k++)
+          for (size_t k = 0; k < A.mt; k++)
           {
             double zbeta = k == 0 ? beta : 1.0;
             DGEMM(ctx, transa, transb, alpha, A, k, m, B, n, k, zbeta, C, m, n);
@@ -328,14 +328,14 @@ void run(stream_ctx& ctx, size_t N, size_t NB)
   cuda_safe_call(cudaGetDeviceCount(&ndevs));
 
   /* Warm up allocators */
-  for (size_t d = 0; d < ndevs; d++)
+  for (int d = 0; d < ndevs; d++)
   {
     auto lX = ctx.logical_data(shape_of<slice<double>>(1));
     ctx.parallel_for(exec_place::device(d), lX.shape(), lX.write())->*[] _CCCL_DEVICE(size_t, auto) {};
   }
 
   /* Initializes CUBLAS on all devices */
-  for (size_t d = 0; d < ndevs; d++)
+  for (int d = 0; d < ndevs; d++)
   {
     cuda_safe_call(cudaSetDevice(d));
     get_cublas_handle();
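Note: the loop-counter changes above replace int with size_t wherever the loop bound (mt, nt, C.mt, and similar members) is an unsigned size_t, so the comparison no longer mixes signed and unsigned operands. A minimal standalone sketch of the pattern, not taken from this commit (the matrix_like type and visit function are hypothetical):

#include <cstddef>

struct matrix_like
{
  std::size_t nt; // number of block columns, analogous to the nt member in the examples

  void visit() const
  {
    // A signed counter compared against the unsigned bound would trigger
    // -Wsign-compare (GCC/Clang) or C4018 (MSVC):
    //   for (int colb = 0; colb < nt; colb++) { ... }
    // Matching the counter type to the bound keeps the comparison size_t vs size_t:
    for (std::size_t colb = 0; colb < nt; colb++)
    {
      // ... fill or process block column colb ...
    }
  }
};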
42 changes: 21 additions & 21 deletions cudax/examples/stf/linear_algebra/07-cholesky.cu

@@ -91,10 +91,10 @@ public:
 
     handles.resize(mt * nt);
 
-    for (int colb = 0; colb < nt; colb++)
+    for (size_t colb = 0; colb < nt; colb++)
     {
       int low_rowb = sym_matrix ? colb : 0;
-      for (int rowb = low_rowb; rowb < mt; rowb++)
+      for (size_t rowb = low_rowb; rowb < mt; rowb++)
       {
         T* addr_h = get_block_h(rowb, colb);
         auto& h = handle(rowb, colb);
@@ -171,10 +171,10 @@ public:
   {
     nvtxRangePushA("FILL");
     // Fill blocks by blocks
-    for (int colb = 0; colb < nt; colb++)
+    for (size_t colb = 0; colb < nt; colb++)
     {
       int low_rowb = sym_matrix ? colb : 0;
-      for (int rowb = low_rowb; rowb < mt; rowb++)
+      for (size_t rowb = low_rowb; rowb < mt; rowb++)
      {
         // Each task fills a block
         auto& h = handle(rowb, colb);
@@ -363,9 +363,9 @@ void PDNRM2_HOST(matrix<double>* A, double* result)
   reserved::dot::set_current_color("red");
 #endif
 
-  for (int rowb = 0; rowb < A->mt; rowb++)
+  for (size_t rowb = 0; rowb < A->mt; rowb++)
   {
-    for (int colb = 0; colb < A->nt; colb++)
+    for (size_t colb = 0; colb < A->nt; colb++)
     {
       ctx.host_launch(A->handle(rowb, colb).read())->*[=](auto sA) {
         double res2 = 0.0;
@@ -452,17 +452,17 @@ void PDTRSM(cublasSideMode_t side,
     //===========================================
     if (trans == CUBLAS_OP_N)
     {
-      for (int k = 0; k < B.mt; k++)
+      for (size_t k = 0; k < B.mt; k++)
       {
         double lalpha = k == 0 ? alpha : 1.0;
-        for (int n = 0; n < B.nt; n++)
+        for (size_t n = 0; n < B.nt; n++)
         {
           cuda_safe_call(cudaSetDevice(A.get_preferred_devid(k, k)));
           DTRSM(side, uplo, trans, diag, lalpha, A, k, k, B, k, n);
         }
-        for (int m = k + 1; m < B.mt; m++)
+        for (size_t m = k + 1; m < B.mt; m++)
         {
-          for (int n = 0; n < B.nt; n++)
+          for (size_t n = 0; n < B.nt; n++)
           {
             cuda_safe_call(cudaSetDevice(A.get_preferred_devid(m, k)));
             DGEMM(CUBLAS_OP_N, CUBLAS_OP_N, -1.0, A, m, k, B, k, n, lalpha, B, m, n);
@@ -475,17 +475,17 @@ void PDTRSM(cublasSideMode_t side,
     //================================================
    else
     {
-      for (int k = 0; k < B.mt; k++)
+      for (size_t k = 0; k < B.mt; k++)
      {
         double lalpha = k == 0 ? alpha : 1.0;
-        for (int n = 0; n < B.nt; n++)
+        for (size_t n = 0; n < B.nt; n++)
        {
           cuda_safe_call(cudaSetDevice(A.get_preferred_devid(B.mt - k - 1, B.mt - k - 1)));
           DTRSM(side, uplo, trans, diag, lalpha, A, B.mt - k - 1, B.mt - k - 1, B, B.mt - k - 1, n);
         }
-        for (int m = k + 1; m < B.mt; m++)
+        for (size_t m = k + 1; m < B.mt; m++)
        {
-          for (int n = 0; n < B.nt; n++)
+          for (size_t n = 0; n < B.nt; n++)
          {
             cuda_safe_call(cudaSetDevice(A.get_preferred_devid(B.mt - k - 1, B.mt - 1 - m)));
             DGEMM(
@@ -543,9 +543,9 @@ void PDGEMM(cublasOperation_t transa,
   reserved::dot::set_current_color("blue");
 #endif
 
-  for (int m = 0; m < C.mt; m++)
+  for (size_t m = 0; m < C.mt; m++)
   {
-    for (int n = 0; n < C.nt; n++)
+    for (size_t n = 0; n < C.nt; n++)
     {
       //=========================================
       // alpha*A*B does not contribute; scale C
@@ -562,7 +562,7 @@ void PDGEMM(cublasOperation_t transa,
         //================================
        if (transb == CUBLAS_OP_N)
        {
-          for (int k = 0; k < A.nt; k++)
+          for (size_t k = 0; k < A.nt; k++)
           {
             double zbeta = k == 0 ? beta : 1.0;
             DGEMM(transa, transb, alpha, A, m, k, B, k, n, zbeta, C, m, n);
@@ -573,7 +573,7 @@ void PDGEMM(cublasOperation_t transa,
         //=====================================
        else
        {
-          for (int k = 0; k < A.nt; k++)
+          for (size_t k = 0; k < A.nt; k++)
           {
             double zbeta = k == 0 ? beta : 1.0;
             DGEMM(transa, transb, alpha, A, m, k, B, n, k, zbeta, C, m, n);
@@ -587,7 +587,7 @@ void PDGEMM(cublasOperation_t transa,
        //=====================================
        if (transb == CUBLAS_OP_N)
        {
-          for (int k = 0; k < A.mt; k++)
+          for (size_t k = 0; k < A.mt; k++)
           {
             double zbeta = k == 0 ? beta : 1.0;
             DGEMM(transa, transb, alpha, A, k, m, B, k, n, zbeta, C, m, n);
@@ -598,7 +598,7 @@ void PDGEMM(cublasOperation_t transa,
        //==========================================
        else
        {
-          for (int k = 0; k < A.mt; k++)
+          for (size_t k = 0; k < A.mt; k++)
           {
             double zbeta = k == 0 ? beta : 1.0;
             DGEMM(transa, transb, alpha, A, k, m, B, n, k, zbeta, C, m, n);
@@ -640,7 +640,7 @@ int main(int argc, char** argv)
   int ndevs;
   cuda_safe_call(cudaGetDeviceCount(&ndevs));
 
-  for (size_t d = 0; d < ndevs; d++)
+  for (int d = 0; d < ndevs; d++)
   {
     auto lX = ctx.logical_data(shape_of<slice<double>>(1));
     ctx.parallel_for(exec_place::device(d), lX.shape(), lX.write())->*[] _CCCL_DEVICE(size_t, auto) {};
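Note: the device loops in run() and main() go the other way, from size_t back to int, matching the type the CUDA runtime uses for device counts and ordinals. A minimal standalone sketch assuming only the CUDA runtime API (not code from this commit):

#include <cuda_runtime.h>

int main()
{
  // cudaGetDeviceCount reports the number of devices through an int,
  // so an int loop counter compares and converts without warnings.
  int ndevs = 0;
  if (cudaGetDeviceCount(&ndevs) != cudaSuccess)
  {
    return 1;
  }
  for (int d = 0; d < ndevs; d++)
  {
    cudaSetDevice(d); // cudaSetDevice also takes an int device ordinal
  }
  return 0;
}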

0 comments on commit e0e9a76
