Skip to content

Commit 45b52d3

Browse files
committed
Future Base: nthreads_128 (AMReX-Codes#4417)
1 parent ae7024c commit 45b52d3

File tree

8 files changed

+117
-139
lines changed

8 files changed

+117
-139
lines changed

Docs/sphinx_documentation/source/GPU.rst

+4-4
Original file line numberDiff line numberDiff line change
@@ -229,9 +229,9 @@ Building with CMake
229229

230230
To build AMReX with GPU support in CMake, add
231231
``-DAMReX_GPU_BACKEND=CUDA|HIP|SYCL`` to the ``cmake`` invocation, for CUDA,
232-
HIP and SYCL, respectively. By default, AMReX uses 256 threads per GPU
233-
block/group in most situations. This can be changed with
234-
``-DAMReX_GPU_MAX_THREADS=N``, where ``N`` is 128 for example.
232+
HIP and SYCL, respectively. By default, AMReX uses 128 threads per GPU block
233+
in most situations for CUDA, and 256 for HIP and SYCL. This can be changed
234+
with ``-DAMReX_GPU_MAX_THREADS=N``, where ``N`` is 256 or 128 for example.
235235

236236
Enabling CUDA support
237237
^^^^^^^^^^^^^^^^^^^^^
@@ -1166,7 +1166,7 @@ GPU block size
11661166

11671167
By default, :cpp:`ParallelFor` launches ``AMREX_GPU_MAX_THREADS`` threads
11681168
per GPU block, where ``AMREX_GPU_MAX_THREADS`` is a compile-time constant
1169-
with a default value of 256. The users can also explicitly specify the
1169+
with a default value of 128 for CUDA and 256 for HIP and SYCL. The users can also explicitly specify the
11701170
number of threads per block by :cpp:`ParallelFor<MY_BLOCK_SIZE>(...)`, where
11711171
``MY_BLOCK_SIZE`` is a multiple of the warp size (e.g., 128). This allows
11721172
the users to do performance tuning for individual kernels.

Src/Base/AMReX_FabArrayBase.H

+6-6
Original file line numberDiff line numberDiff line change
@@ -650,10 +650,12 @@ public:
650650
//! For ParallelFor(FabArray)
651651
struct ParForInfo
652652
{
653-
ParForInfo (const FabArrayBase& fa, const IntVect& nghost, int nthreads);
653+
ParForInfo (const FabArrayBase& fa, const IntVect& nghost);
654654
~ParForInfo ();
655655

656-
std::pair<int*,int*> const& getBlocks () const { return m_nblocks_x; }
656+
int getNBlocksPerBox (int nthreads) const {
657+
return int((m_ncellsmax+nthreads-1)/nthreads);
658+
}
657659
BoxIndexer const* getBoxes () const { return m_boxes; }
658660

659661
ParForInfo () = delete;
@@ -664,14 +666,12 @@ public:
664666

665667
BATransformer m_bat;
666668
IntVect m_ng;
667-
int m_nthreads;
668-
std::pair<int*,int*> m_nblocks_x;
669+
Long m_ncellsmax = 0;
669670
BoxIndexer* m_boxes = nullptr;
670671
char* m_hp = nullptr;
671-
char* m_dp = nullptr;
672672
};
673673

674-
ParForInfo const& getParForInfo (const IntVect& nghost, int nthreads) const;
674+
ParForInfo const& getParForInfo (const IntVect& nghost) const;
675675

676676
static std::multimap<BDKey,ParForInfo*> m_TheParForCache;
677677

Src/Base/AMReX_FabArrayBase.cpp

+9-13
Original file line numberDiff line numberDiff line change
@@ -2635,15 +2635,12 @@ FabArrayBase::isFusingCandidate () const noexcept // NOLINT(readability-convert-
26352635

26362636
#ifdef AMREX_USE_GPU
26372637

2638-
FabArrayBase::ParForInfo::ParForInfo (const FabArrayBase& fa, const IntVect& nghost, int nthreads)
2638+
FabArrayBase::ParForInfo::ParForInfo (const FabArrayBase& fa, const IntVect& nghost)
26392639
: m_bat(fa.boxArray().transformer()),
2640-
m_ng(nghost),
2641-
m_nthreads(nthreads),
2642-
m_nblocks_x({nullptr,nullptr})
2640+
m_ng(nghost)
26432641
{
26442642
Vector<Box> boxes;
2645-
Vector<Long> ncells;
2646-
ncells.reserve(fa.indexArray.size());
2643+
m_ncellsmax = 0;
26472644
for (int K : fa.indexArray) {
26482645
Long N = 0;
26492646
Box b = fa.box(K);
@@ -2652,31 +2649,30 @@ FabArrayBase::ParForInfo::ParForInfo (const FabArrayBase& fa, const IntVect& ngh
26522649
N = b.numPts();
26532650
}
26542651
boxes.push_back(b);
2655-
ncells.push_back(N);
2652+
m_ncellsmax = std::max(m_ncellsmax, N);
26562653
}
2657-
detail::build_par_for_nblocks(m_hp, m_dp, m_nblocks_x, m_boxes, boxes, ncells, nthreads);
2654+
detail::build_par_for_boxes(m_hp, m_boxes, boxes);
26582655
}
26592656

26602657
FabArrayBase::ParForInfo::~ParForInfo ()
26612658
{
2662-
detail::destroy_par_for_nblocks(m_hp, m_dp);
2659+
detail::destroy_par_for_boxes(m_hp, (char*)m_boxes);
26632660
}
26642661

26652662
FabArrayBase::ParForInfo const&
2666-
FabArrayBase::getParForInfo (const IntVect& nghost, int nthreads) const
2663+
FabArrayBase::getParForInfo (const IntVect& nghost) const
26672664
{
26682665
AMREX_ASSERT(getBDKey() == m_bdkey);
26692666
auto er_it = m_TheParForCache.equal_range(m_bdkey);
26702667
for (auto it = er_it.first; it != er_it.second; ++it) {
26712668
if (it->second->m_bat == boxArray().transformer() &&
2672-
it->second->m_ng == nghost &&
2673-
it->second->m_nthreads == nthreads)
2669+
it->second->m_ng == nghost)
26742670
{
26752671
return *(it->second);
26762672
}
26772673
}
26782674

2679-
ParForInfo* new_pfi = new ParForInfo(*this, nghost, nthreads);
2675+
ParForInfo* new_pfi = new ParForInfo(*this, nghost);
26802676
m_TheParForCache.insert(er_it.second,
26812677
std::multimap<BDKey,ParForInfo*>::value_type(m_bdkey,new_pfi));
26822678
return *new_pfi;

Src/Base/AMReX_MFParallelFor.H

+5-5
Original file line numberDiff line numberDiff line change
@@ -68,7 +68,7 @@ std::enable_if_t<IsFabArray<MF>::value>
6868
ParallelFor (MF const& mf, F&& f)
6969
{
7070
#ifdef AMREX_USE_GPU
71-
detail::ParallelFor<MT>(mf, IntVect(0), FabArrayBase::mfiter_tile_size, false, std::forward<F>(f));
71+
detail::ParallelFor<MT>(mf, IntVect(0), 1, FabArrayBase::mfiter_tile_size, false, std::forward<F>(f));
7272
#else
7373
detail::ParallelFor(mf, IntVect(0), FabArrayBase::mfiter_tile_size, false, std::forward<F>(f));
7474
#endif
@@ -119,7 +119,7 @@ std::enable_if_t<IsFabArray<MF>::value>
119119
ParallelFor (MF const& mf, IntVect const& ng, F&& f)
120120
{
121121
#ifdef AMREX_USE_GPU
122-
detail::ParallelFor<MT>(mf, ng, FabArrayBase::mfiter_tile_size, false, std::forward<F>(f));
122+
detail::ParallelFor<MT>(mf, ng, 1, FabArrayBase::mfiter_tile_size, false, std::forward<F>(f));
123123
#else
124124
detail::ParallelFor(mf, ng, FabArrayBase::mfiter_tile_size, false, std::forward<F>(f));
125125
#endif
@@ -225,7 +225,7 @@ std::enable_if_t<IsFabArray<MF>::value>
225225
ParallelFor (MF const& mf, TileSize const& ts, F&& f)
226226
{
227227
#ifdef AMREX_USE_GPU
228-
detail::ParallelFor<MT>(mf, IntVect(0), ts.tile_size, false, std::forward<F>(f));
228+
detail::ParallelFor<MT>(mf, IntVect(0), 1, ts.tile_size, false, std::forward<F>(f));
229229
#else
230230
detail::ParallelFor(mf, IntVect(0), ts.tile_size, false, std::forward<F>(f));
231231
#endif
@@ -280,7 +280,7 @@ std::enable_if_t<IsFabArray<MF>::value>
280280
ParallelFor (MF const& mf, IntVect const& ng, TileSize const& ts, F&& f)
281281
{
282282
#ifdef AMREX_USE_GPU
283-
detail::ParallelFor<MT>(mf, ng, ts.tile_size, false, std::forward<F>(f));
283+
detail::ParallelFor<MT>(mf, ng, 1, ts.tile_size, false, std::forward<F>(f));
284284
#else
285285
detail::ParallelFor(mf, ng, ts.tile_size, false, std::forward<F>(f));
286286
#endif
@@ -423,7 +423,7 @@ ParallelFor (MF const& mf, IntVect const& ng, TileSize const& ts,
423423
DynamicTiling dt, F&& f)
424424
{
425425
#ifdef AMREX_USE_GPU
426-
detail::ParallelFor<MT>(mf, ng, ts.tile_size, dt.dynamic, std::forward<F>(f));
426+
detail::ParallelFor<MT>(mf, ng, 1, ts.tile_size, dt.dynamic, std::forward<F>(f));
427427
#else
428428
detail::ParallelFor(mf, ng, ts.tile_size, dt.dynamic, std::forward<F>(f));
429429
#endif

Src/Base/AMReX_MFParallelForG.H

+40-59
Original file line numberDiff line numberDiff line change
@@ -12,38 +12,24 @@ namespace amrex {
1212
namespace detail {
1313

1414
inline
15-
void build_par_for_nblocks (char*& a_hp, char*& a_dp, std::pair<int*,int*>& blocks_x, BoxIndexer*& pboxes,
16-
Vector<Box> const& boxes, Vector<Long> const& ncells, int nthreads)
15+
void build_par_for_boxes (char*& hp, BoxIndexer*& pboxes, Vector<Box> const& boxes)
1716
{
18-
if (!ncells.empty()) {
19-
const int nboxes = ncells.size();
20-
const std::size_t nbytes_boxes = amrex::aligned_size(alignof(BoxIndexer), (nboxes+1) * sizeof(int));
21-
const std::size_t nbytes = nbytes_boxes + nboxes*sizeof(BoxIndexer);
22-
a_hp = (char*)The_Pinned_Arena()->alloc(nbytes);
23-
int* hp_blks = (int*)a_hp;
24-
auto* hp_boxes = (BoxIndexer*)(a_hp + nbytes_boxes);
25-
hp_blks[0] = 0;
26-
bool same_size = true;
27-
for (int i = 0; i < nboxes; ++i) {
28-
Long nblocks = (ncells[i] + nthreads-1) / nthreads;
29-
AMREX_ASSERT((hp_blks[i]+nblocks) <= Long(std::numeric_limits<int>::max()));
30-
hp_blks[i+1] = hp_blks[i] + static_cast<int>(nblocks);
31-
same_size = same_size && (ncells[i] == ncells[0]);
32-
33-
new (hp_boxes+i) BoxIndexer(boxes[i]);
34-
}
35-
36-
a_dp = (char*) The_Arena()->alloc(nbytes);
37-
Gpu::htod_memcpy_async(a_dp, a_hp, nbytes);
38-
39-
blocks_x.first = hp_blks;
40-
blocks_x.second = (same_size) ? nullptr : (int*)a_dp;
41-
pboxes = (BoxIndexer*)(a_dp + nbytes_boxes);
17+
if (boxes.empty()) { return; }
18+
const int nboxes = boxes.size();
19+
const std::size_t nbytes = nboxes*sizeof(BoxIndexer);
20+
hp = (char*)The_Pinned_Arena()->alloc(nbytes);
21+
auto* hp_boxes = (BoxIndexer*)hp;
22+
for (int i = 0; i < nboxes; ++i) {
23+
new (hp_boxes+i) BoxIndexer(boxes[i]);
4224
}
25+
26+
auto dp = (char*) The_Arena()->alloc(nbytes);
27+
Gpu::htod_memcpy_async(dp, hp, nbytes);
28+
pboxes = (BoxIndexer*)dp;
4329
}
4430

4531
inline
46-
void destroy_par_for_nblocks (char* hp, char* dp)
32+
void destroy_par_for_boxes (char* hp, char* dp)
4733
{
4834
The_Pinned_Arena()->free(hp);
4935
The_Arena()->free(dp);
@@ -63,10 +49,12 @@ namespace parfor_mf_detail {
6349

6450
template <typename F>
6551
AMREX_GPU_DEVICE
66-
auto call_f (F const& f, int b, int i, int j, int k, int n) noexcept
52+
auto call_f (F const& f, int b, int i, int j, int k, int ncomp) noexcept
6753
-> decltype(f(0,0,0,0,0))
6854
{
69-
f(b,i,j,k,n);
55+
for (int n = 0; n < ncomp; ++n) {
56+
f(b,i,j,k,n);
57+
}
7058
}
7159
}
7260

@@ -81,16 +69,15 @@ ParallelFor (MF const& mf, IntVect const& nghost, int ncomp, IntVect const&, boo
8169
return;
8270
} else if (nboxes == 1) {
8371
Box const& b = amrex::grow(mf.box(index_array[0]), nghost);
84-
amrex::ParallelFor(b, ncomp, [=] AMREX_GPU_DEVICE (int i, int j, int k, int n) noexcept
72+
amrex::ParallelFor(b, [=] AMREX_GPU_DEVICE (int i, int j, int k) noexcept
8573
{
86-
parfor_mf_detail::call_f(f, 0, i, j, k, n);
74+
parfor_mf_detail::call_f(f, 0, i, j, k, ncomp);
8775
});
8876
} else {
89-
auto const& parforinfo = mf.getParForInfo(nghost,MT);
90-
auto par_for_blocks = parforinfo.getBlocks();
91-
const int nblocks = par_for_blocks.first[nboxes];
92-
const int block_0_size = par_for_blocks.first[1];
93-
const int* dp_nblocks = par_for_blocks.second;
77+
auto const& parforinfo = mf.getParForInfo(nghost);
78+
auto nblocks_per_box = parforinfo.getNBlocksPerBox(MT);
79+
AMREX_ASSERT(Long(nblocks_per_box)*Long(nboxes) < Long(std::numeric_limits<int>::max()));
80+
const int nblocks = nblocks_per_box * nboxes;
9481
const BoxIndexer* dp_boxes = parforinfo.getBoxes();
9582

9683
#if defined(AMREX_USE_CUDA) || defined(AMREX_USE_HIP)
@@ -99,39 +86,23 @@ ParallelFor (MF const& mf, IntVect const& nghost, int ncomp, IntVect const&, boo
9986
<<<nblocks, MT, 0, Gpu::gpuStream()>>>
10087
([=] AMREX_GPU_DEVICE () noexcept
10188
{
102-
int ibox;
103-
std::uint64_t icell;
104-
if (dp_nblocks) {
105-
ibox = amrex::bisect(dp_nblocks, 0, nboxes, static_cast<int>(blockIdx.x));
106-
icell = std::uint64_t(blockIdx.x-dp_nblocks[ibox])*MT + threadIdx.x;
107-
} else {
108-
ibox = blockIdx.x / block_0_size;
109-
icell = std::uint64_t(blockIdx.x-ibox*block_0_size)*MT + threadIdx.x;
110-
}
89+
int ibox = int(blockIdx.x) / nblocks_per_box;
90+
auto icell = std::uint64_t(blockIdx.x-ibox*nblocks_per_box)*MT + threadIdx.x;
11191

11292
#elif defined(AMREX_USE_SYCL)
11393

11494
amrex::launch<MT>(nblocks, Gpu::gpuStream(),
11595
[=] AMREX_GPU_DEVICE (sycl::nd_item<1> const& item) noexcept
11696
{
117-
int ibox;
118-
std::uint64_t icell;
11997
int blockIdxx = item.get_group_linear_id();
12098
int threadIdxx = item.get_local_linear_id();
121-
if (dp_nblocks) {
122-
ibox = amrex::bisect(dp_nblocks, 0, nboxes, static_cast<int>(blockIdxx));
123-
icell = std::uint64_t(blockIdxx-dp_nblocks[ibox])*MT + threadIdxx;
124-
} else {
125-
ibox = blockIdxx / block_0_size;
126-
icell = std::uint64_t(blockIdxx-ibox*block_0_size)*MT + threadIdxx;
127-
}
99+
int ibox = int(blockIdxx) / nblocks_per_box;
100+
auto icell = std::uint64_t(blockIdxx-ibox*nblocks_per_box)*MT + threadIdxx;
128101
#endif
129102
BoxIndexer const& indexer = dp_boxes[ibox];
130103
if (icell < indexer.numPts()) {
131104
auto [i, j, k] = indexer(icell);
132-
for (int n = 0; n < ncomp; ++n) {
133-
parfor_mf_detail::call_f(f, ibox, i, j, k, n);
134-
}
105+
parfor_mf_detail::call_f(f, ibox, i, j, k, ncomp);
135106
}
136107
});
137108
}
@@ -142,14 +113,24 @@ template <typename MF, typename F>
142113
std::enable_if_t<IsFabArray<MF>::value>
143114
ParallelFor (MF const& mf, IntVect const& nghost, int ncomp, IntVect const& ts, bool dynamic, F&& f)
144115
{
145-
ParallelFor<AMREX_GPU_MAX_THREADS>(mf, nghost, ncomp, ts, dynamic, std::forward<F>(f));
116+
#ifdef AMREX_USE_CUDA
117+
constexpr int MT = 128;
118+
#else
119+
constexpr int MT = AMREX_GPU_MAX_THREADS;
120+
#endif
121+
ParallelFor<MT>(mf, nghost, ncomp, ts, dynamic, std::forward<F>(f));
146122
}
147123

148124
template <typename MF, typename F>
149125
std::enable_if_t<IsFabArray<MF>::value>
150126
ParallelFor (MF const& mf, IntVect const& nghost, IntVect const& ts, bool dynamic, F&& f)
151127
{
152-
ParallelFor<AMREX_GPU_MAX_THREADS>(mf, nghost, 1, ts, dynamic, std::forward<F>(f));
128+
#ifdef AMREX_USE_CUDA
129+
constexpr int MT = 128;
130+
#else
131+
constexpr int MT = AMREX_GPU_MAX_THREADS;
132+
#endif
133+
ParallelFor<MT>(mf, nghost, 1, ts, dynamic, std::forward<F>(f));
153134
}
154135

155136
}

0 commit comments

Comments (0)