Skip to content

Commit 23ed91b

Browse files
committed
AMREX_GPU_MAX_THREADS: 256 -> 128
1 parent 86c1fe2 commit 23ed91b

14 files changed

+35
-32
lines changed

Docs/sphinx_documentation/source/GPU.rst

+3-3
Original file line numberDiff line numberDiff line change
@@ -229,9 +229,9 @@ Building with CMake
229229

230230
To build AMReX with GPU support in CMake, add
231231
``-DAMReX_GPU_BACKEND=CUDA|HIP|SYCL`` to the ``cmake`` invocation, for CUDA,
232-
HIP and SYCL, respectively. By default, AMReX uses 256 threads per GPU
232+
HIP and SYCL, respectively. By default, AMReX uses 128 threads per GPU
233233
block/group in most situations. This can be changed with
234-
``-DAMReX_GPU_MAX_THREADS=N``, where ``N`` is 128 for example.
234+
``-DAMReX_GPU_MAX_THREADS=N``, where ``N`` is 256 for example.
235235

236236
Enabling CUDA support
237237
^^^^^^^^^^^^^^^^^^^^^
@@ -1166,7 +1166,7 @@ GPU block size
11661166

11671167
By default, :cpp:`ParallelFor` launches ``AMREX_GPU_MAX_THREADS`` threads
11681168
per GPU block, where ``AMREX_GPU_MAX_THREADS`` is a compile-time constant
1169-
with a default value of 256. The users can also explicitly specify the
1169+
with a default value of 128. The users can also explicitly specify the
11701170
number of threads per block by :cpp:`ParallelFor<MY_BLOCK_SIZE>(...)`, where
11711171
``MY_BLOCK_SIZE`` is a multiple of the warp size (e.g., 128). This allows
11721172
the users to do performance tuning for individual kernels.

Src/AmrCore/AMReX_TagBox.cpp

+5-5
Original file line numberDiff line numberDiff line change
@@ -447,8 +447,8 @@ TagBoxArray::local_collate_gpu (Gpu::PinnedVector<IntVect>& v) const
447447
const int ncells = fai.fabbox().numPts();
448448
const char* tags = (*this)[fai].dataPtr();
449449
#ifdef AMREX_USE_SYCL
450-
amrex::launch(nblocks[li], block_size, sizeof(int)*Gpu::Device::warp_size,
451-
Gpu::Device::gpuStream(),
450+
amrex::launch<block_size>(nblocks[li], sizeof(int)*Gpu::Device::warp_size,
451+
Gpu::Device::gpuStream(),
452452
[=] AMREX_GPU_DEVICE (Gpu::Handler const& h) noexcept
453453
{
454454
int bid = h.item->get_group_linear_id();
@@ -467,7 +467,7 @@ TagBoxArray::local_collate_gpu (Gpu::PinnedVector<IntVect>& v) const
467467
}
468468
});
469469
#else
470-
amrex::launch(nblocks[li], block_size, Gpu::Device::gpuStream(),
470+
amrex::launch<block_size>(nblocks[li], Gpu::Device::gpuStream(),
471471
[=] AMREX_GPU_DEVICE () noexcept
472472
{
473473
int bid = blockIdx.x;
@@ -525,7 +525,7 @@ TagBoxArray::local_collate_gpu (Gpu::PinnedVector<IntVect>& v) const
525525
const int ncells = bx.numPts();
526526
const char* tags = (*this)[fai].dataPtr();
527527
#ifdef AMREX_USE_SYCL
528-
amrex::launch(nblocks[li], block_size, sizeof(unsigned int), Gpu::Device::gpuStream(),
528+
amrex::launch<block_size>(nblocks[li], sizeof(unsigned int), Gpu::Device::gpuStream(),
529529
[=] AMREX_GPU_DEVICE (Gpu::Handler const& h) noexcept
530530
{
531531
int bid = h.item->get_group(0);
@@ -553,7 +553,7 @@ TagBoxArray::local_collate_gpu (Gpu::PinnedVector<IntVect>& v) const
553553
}
554554
});
555555
#else
556-
amrex::launch(nblocks[li], block_size, sizeof(unsigned int), Gpu::Device::gpuStream(),
556+
amrex::launch<block_size>(nblocks[li], sizeof(unsigned int), Gpu::Device::gpuStream(),
557557
[=] AMREX_GPU_DEVICE () noexcept
558558
{
559559
int bid = blockIdx.x;

Src/Base/AMReX_BaseFabUtility.H

+3-3
Original file line numberDiff line numberDiff line change
@@ -38,14 +38,14 @@ void fill (BaseFab<STRUCT>& aos_fab, F const& f)
3838
if (Gpu::inLaunchRegion()) {
3939
BoxIndexer indexer(box);
4040
const auto ntotcells = std::uint64_t(box.numPts());
41-
int nthreads_per_block = (STRUCTSIZE <= 8) ? 256 : 128;
41+
constexpr int nthreads_per_block = (STRUCTSIZE <= 8) ? 256 : 128;
4242
std::uint64_t nblocks_long = (ntotcells+nthreads_per_block-1)/nthreads_per_block;
4343
AMREX_ASSERT(nblocks_long <= std::uint64_t(std::numeric_limits<int>::max()));
4444
auto nblocks = int(nblocks_long);
4545
std::size_t shared_mem_bytes = nthreads_per_block * sizeof(STRUCT);
4646
T* p = (T*)aos_fab.dataPtr();
4747
#ifdef AMREX_USE_SYCL
48-
amrex::launch(nblocks, nthreads_per_block, shared_mem_bytes, Gpu::gpuStream(),
48+
amrex::launch<nthreads_per_block>(nblocks, shared_mem_bytes, Gpu::gpuStream(),
4949
[=] AMREX_GPU_DEVICE (Gpu::Handler const& handler) noexcept
5050
{
5151
auto const icell = std::uint64_t(handler.globalIdx());
@@ -66,7 +66,7 @@ void fill (BaseFab<STRUCT>& aos_fab, F const& f)
6666
}
6767
});
6868
#else
69-
amrex::launch(nblocks, nthreads_per_block, shared_mem_bytes, Gpu::gpuStream(),
69+
amrex::launch<nthreads_per_block>(nblocks, shared_mem_bytes, Gpu::gpuStream(),
7070
[=] AMREX_GPU_DEVICE () noexcept
7171
{
7272
std::uint64_t const icell = std::uint64_t(blockDim.x)*blockIdx.x+threadIdx.x;

Src/Base/AMReX_BlockMutex.cpp

+1-1
Original file line numberDiff line numberDiff line change
@@ -9,7 +9,7 @@ void BlockMutex::init_states (state_t* state, int N) noexcept {
99
amrex::ignore_unused(state,N);
1010
amrex::Abort("xxxxx SYCL todo");
1111
#else
12-
amrex::launch((N+255)/256, 256, Gpu::gpuStream(),
12+
amrex::launch<256>((N+255)/256, Gpu::gpuStream(),
1313
[=] AMREX_GPU_DEVICE () noexcept
1414
{
1515
int i = threadIdx.x + blockIdx.x*blockDim.x;

Src/Base/AMReX_GpuContainers.H

+3-3
Original file line numberDiff line numberDiff line change
@@ -433,11 +433,11 @@ namespace amrex::Gpu {
433433
unsigned long long, unsigned int>;
434434
constexpr Long nU = sizeof(T) / sizeof(U);
435435
auto pu = reinterpret_cast<U*>(p);
436-
int nthreads_per_block = (sizeof(T) <= 64) ? 256 : 128;
436+
constexpr int nthreads_per_block = (sizeof(T) <= 64) ? 256 : 128;
437437
int nblocks = static_cast<int>((N+nthreads_per_block-1)/nthreads_per_block);
438438
std::size_t shared_mem_bytes = nthreads_per_block * sizeof(T);
439439
#ifdef AMREX_USE_SYCL
440-
amrex::launch(nblocks, nthreads_per_block, shared_mem_bytes, Gpu::gpuStream(),
440+
amrex::launch<nthreads_per_block>(nblocks, shared_mem_bytes, Gpu::gpuStream(),
441441
[=] AMREX_GPU_DEVICE (Gpu::Handler const& handler) noexcept
442442
{
443443
Long i = handler.globalIdx();
@@ -458,7 +458,7 @@ namespace amrex::Gpu {
458458
}
459459
});
460460
#else
461-
amrex::launch(nblocks, nthreads_per_block, shared_mem_bytes, Gpu::gpuStream(),
461+
amrex::launch<nthreads_per_block>(nblocks, shared_mem_bytes, Gpu::gpuStream(),
462462
[=] AMREX_GPU_DEVICE () noexcept
463463
{
464464
Long blockDimx = blockDim.x;

Src/Base/AMReX_GpuLaunch.H

+4
Original file line numberDiff line numberDiff line change
@@ -34,9 +34,13 @@
3434
#ifdef AMREX_USE_CUDA
3535
# define AMREX_LAUNCH_KERNEL(MT, blocks, threads, sharedMem, stream, ... ) \
3636
amrex::launch_global<MT><<<blocks, threads, sharedMem, stream>>>(__VA_ARGS__)
37+
# define AMREX_LAUNCH_KERNEL_NOBOUND(blocks, threads, sharedMem, stream, ... ) \
38+
amrex::launch_global <<<blocks, threads, sharedMem, stream>>>(__VA_ARGS__)
3739
#elif defined(AMREX_USE_HIP)
3840
# define AMREX_LAUNCH_KERNEL(MT, blocks, threads, sharedMem, stream, ... ) \
3941
hipLaunchKernelGGL(launch_global<MT>, blocks, threads, sharedMem, stream, __VA_ARGS__)
42+
# define AMREX_LAUNCH_KERNEL_NOBOUND(blocks, threads, sharedMem, stream, ... ) \
43+
hipLaunchKernelGGL(launch_global , blocks, threads, sharedMem, stream, __VA_ARGS__)
4044
#endif
4145

4246

Src/Base/AMReX_GpuLaunchFunctsG.H

+2-3
Original file line numberDiff line numberDiff line change
@@ -735,9 +735,8 @@ template<typename L>
735735
void launch (int nblocks, int nthreads_per_block, std::size_t shared_mem_bytes,
736736
gpuStream_t stream, L const& f) noexcept
737737
{
738-
AMREX_ASSERT(nthreads_per_block <= AMREX_GPU_MAX_THREADS);
739-
AMREX_LAUNCH_KERNEL(AMREX_GPU_MAX_THREADS, nblocks, nthreads_per_block, shared_mem_bytes,
740-
stream, [=] AMREX_GPU_DEVICE () noexcept { f(); });
738+
AMREX_LAUNCH_KERNEL_NOBOUND(nblocks, nthreads_per_block, shared_mem_bytes,
739+
stream, [=] AMREX_GPU_DEVICE () noexcept { f(); });
741740
AMREX_GPU_ERROR_CHECK();
742741
}
743742

Src/Base/AMReX_MultiFabUtil.cpp

+2-2
Original file line numberDiff line numberDiff line change
@@ -846,10 +846,10 @@ namespace amrex
846846
int nblocks = n2dblocks * b.length(direction);
847847
#ifdef AMREX_USE_SYCL
848848
std::size_t shared_mem_byte = sizeof(Real)*Gpu::Device::warp_size;
849-
amrex::launch(nblocks, AMREX_GPU_MAX_THREADS, shared_mem_byte, Gpu::gpuStream(),
849+
amrex::launch<AMREX_GPU_MAX_THREADS>(nblocks, shared_mem_byte, Gpu::gpuStream(),
850850
[=] AMREX_GPU_DEVICE (Gpu::Handler const& h) noexcept
851851
#else
852-
amrex::launch(nblocks, AMREX_GPU_MAX_THREADS, Gpu::gpuStream(),
852+
amrex::launch<AMREX_GPU_MAX_THREADS>(nblocks, Gpu::gpuStream(),
853853
[=] AMREX_GPU_DEVICE () noexcept
854854
#endif
855855
{

Src/Base/AMReX_Scan.H

+7-7
Original file line numberDiff line numberDiff line change
@@ -209,7 +209,7 @@ T PrefixSum_mp (N n, FIN const& fin, FOUT const& fout, TYPE, RetSum a_ret_sum)
209209
T* blocksum_p = (T*)(dp + nbytes_blockresult);
210210
T* totalsum_p = (T*)(dp + nbytes_blockresult + nbytes_blocksum);
211211

212-
amrex::launch(nblocks, nthreads, sm, stream,
212+
amrex::launch<nthreads>(nblocks, sm, stream,
213213
[=] AMREX_GPU_DEVICE (Gpu::Handler const& gh) noexcept
214214
{
215215
sycl::sub_group const& sg = gh.item->get_sub_group();
@@ -289,7 +289,7 @@ T PrefixSum_mp (N n, FIN const& fin, FOUT const& fout, TYPE, RetSum a_ret_sum)
289289
}
290290
});
291291

292-
amrex::launch(1, nthreads, sm, stream,
292+
amrex::launch<nthreads>(1, sm, stream,
293293
[=] AMREX_GPU_DEVICE (Gpu::Handler const& gh) noexcept
294294
{
295295
sycl::sub_group const& sg = gh.item->get_sub_group();
@@ -355,7 +355,7 @@ T PrefixSum_mp (N n, FIN const& fin, FOUT const& fout, TYPE, RetSum a_ret_sum)
355355
}
356356
});
357357

358-
amrex::launch(nblocks, nthreads, 0, stream,
358+
amrex::launch<nthreads>(nblocks, 0, stream,
359359
[=] AMREX_GPU_DEVICE (Gpu::Handler const& gh) noexcept
360360
{
361361
int threadIdxx = gh.item->get_local_id(0);
@@ -429,7 +429,7 @@ T PrefixSum (N n, FIN && fin, FOUT && fout, TYPE type, RetSum a_ret_sum = retSum
429429
}
430430
});
431431

432-
amrex::launch(nblocks, nthreads, sm, stream,
432+
amrex::launch<nthreads>(nblocks, sm, stream,
433433
[=] AMREX_GPU_DEVICE (Gpu::Handler const& gh) noexcept
434434
{
435435
sycl::sub_group const& sg = gh.item->get_sub_group();
@@ -672,7 +672,7 @@ T PrefixSum (N n, FIN const& fin, FOUT const& fout, TYPE, RetSum a_ret_sum = ret
672672
(reinterpret_cast<OrderedBlockId::id_type*>(dp + nbytes_tile_state));
673673

674674
// Init ScanTileState on device
675-
amrex::launch((nblocks+nthreads-1)/nthreads, nthreads, 0, stream, [=] AMREX_GPU_DEVICE ()
675+
amrex::launch<nthreads>((nblocks+nthreads-1)/nthreads, 0, stream, [=] AMREX_GPU_DEVICE ()
676676
{
677677
auto& scan_tile_state = const_cast<ScanTileState&>(tile_state);
678678
auto& scan_bid = const_cast<OrderedBlockId&>(ordered_block_id);
@@ -813,7 +813,7 @@ T PrefixSum (N n, FIN const& fin, FOUT const& fout, TYPE, RetSum a_ret_sum = ret
813813

814814
if (nblocks > 1) {
815815
// Init ScanTileState on device
816-
amrex::launch((nblocks+nthreads-1)/nthreads, nthreads, 0, stream, [=] AMREX_GPU_DEVICE ()
816+
amrex::launch<nthreads>((nblocks+nthreads-1)/nthreads, 0, stream, [=] AMREX_GPU_DEVICE ()
817817
{
818818
const_cast<ScanTileState&>(tile_state).InitializeStatus(nblocks);
819819
});
@@ -957,7 +957,7 @@ T PrefixSum (N n, FIN const& fin, FOUT const& fout, TYPE, RetSum a_ret_sum = ret
957957
}
958958
});
959959

960-
amrex::launch(nblocks, nthreads, sm, stream,
960+
amrex::launch<nthreads>(nblocks, sm, stream,
961961
[=] AMREX_GPU_DEVICE () noexcept
962962
{
963963
int lane = threadIdx.x % Gpu::Device::warp_size;

Src/LinearSolvers/MLMG/AMReX_MLEBTensorOp_bc.cpp

+1-1
Original file line numberDiff line numberDiff line change
@@ -85,7 +85,7 @@ MLEBTensorOp::applyBCTensor (int amrlev, int mglev, MultiFab& vel,
8585

8686
#ifdef AMREX_USE_GPU
8787
if (Gpu::inLaunchRegion()) {
88-
amrex::launch(12, 64, Gpu::gpuStream(),
88+
amrex::launch<64>(12, Gpu::gpuStream(),
8989
#ifdef AMREX_USE_SYCL
9090
[=] AMREX_GPU_DEVICE (sycl::nd_item<1> const& item)
9191
{

Src/LinearSolvers/MLMG/AMReX_MLTensorOp.cpp

+1-1
Original file line numberDiff line numberDiff line change
@@ -408,7 +408,7 @@ MLTensorOp::applyBCTensor (int amrlev, int mglev, MultiFab& vel, // NOLINT(reada
408408
// only edge vals used in 3D stencil
409409
#ifdef AMREX_USE_GPU
410410
if (Gpu::inLaunchRegion()) {
411-
amrex::launch(12, 64, Gpu::gpuStream(),
411+
amrex::launch<64>(12, Gpu::gpuStream(),
412412
#ifdef AMREX_USE_SYCL
413413
[=] AMREX_GPU_DEVICE (sycl::nd_item<1> const& item)
414414
{

Src/LinearSolvers/OpenBC/AMReX_OpenBC.cpp

+1-1
Original file line numberDiff line numberDiff line change
@@ -745,7 +745,7 @@ void OpenBCSolver::compute_potential (Gpu::DeviceVector<openbc::Moments> const&
745745
lenxy,lenx);
746746
amrex::Abort("xxxxx SYCL todo: openbc compute_potential");
747747
#else
748-
amrex::launch(b.numPts(), AMREX_GPU_MAX_THREADS, Gpu::gpuStream(),
748+
amrex::launch<AMREX_GPU_MAX_THREADS>(b.numPts(), Gpu::gpuStream(),
749749
[=] AMREX_GPU_DEVICE () noexcept
750750
{
751751
int icell = blockIdx.x;

Tools/CMake/AMReXOptions.cmake

+1-1
Original file line numberDiff line numberDiff line change
@@ -133,7 +133,7 @@ if (NOT AMReX_GPU_BACKEND STREQUAL NONE)
133133
message( STATUS " AMReX_GPU_BACKEND = ${AMReX_GPU_BACKEND}")
134134

135135
# We might set different default for different GPUs in the future.
136-
set(AMReX_GPU_MAX_THREADS_DEFAULT "256")
136+
set(AMReX_GPU_MAX_THREADS_DEFAULT "128")
137137
set(AMReX_GPU_MAX_THREADS ${AMReX_GPU_MAX_THREADS_DEFAULT} CACHE STRING
138138
"Maximum number of GPU threads per block" )
139139
message( STATUS " AMReX_GPU_MAX_THREADS = ${AMReX_GPU_MAX_THREADS}")

Tools/GNUMake/Make.defs

+1-1
Original file line numberDiff line numberDiff line change
@@ -269,7 +269,7 @@ else
269269
endif
270270

271271
# Maximum number of GPU threads per block.
272-
CUDA_MAX_THREADS ?= 256
272+
CUDA_MAX_THREADS ?= 128
273273
GPU_MAX_THREADS ?= $(CUDA_MAX_THREADS)
274274

275275
ifeq ($(USE_CUDA),TRUE)

0 commit comments

Comments
 (0)