@@ -12,38 +12,24 @@ namespace amrex {
 namespace detail {

 inline
-void build_par_for_nblocks (char*& a_hp, char*& a_dp, std::pair<int*,int*>& blocks_x, BoxIndexer*& pboxes,
-                            Vector<Box> const& boxes, Vector<Long> const& ncells, int nthreads)
+void build_par_for_boxes (char*& hp, BoxIndexer*& pboxes, Vector<Box> const& boxes)
 {
-    if (!ncells.empty()) {
-        const int nboxes = ncells.size();
-        const std::size_t nbytes_boxes = amrex::aligned_size(alignof(BoxIndexer), (nboxes+1) * sizeof(int));
-        const std::size_t nbytes = nbytes_boxes + nboxes*sizeof(BoxIndexer);
-        a_hp = (char*)The_Pinned_Arena()->alloc(nbytes);
-        int* hp_blks = (int*)a_hp;
-        auto* hp_boxes = (BoxIndexer*)(a_hp + nbytes_boxes);
-        hp_blks[0] = 0;
-        bool same_size = true;
-        for (int i = 0; i < nboxes; ++i) {
-            Long nblocks = (ncells[i] + nthreads-1) / nthreads;
-            AMREX_ASSERT((hp_blks[i]+nblocks) <= Long(std::numeric_limits<int>::max()));
-            hp_blks[i+1] = hp_blks[i] + static_cast<int>(nblocks);
-            same_size = same_size && (ncells[i] == ncells[0]);
-
-            new (hp_boxes+i) BoxIndexer(boxes[i]);
-        }
-
-        a_dp = (char*) The_Arena()->alloc(nbytes);
-        Gpu::htod_memcpy_async(a_dp, a_hp, nbytes);
-
-        blocks_x.first = hp_blks;
-        blocks_x.second = (same_size) ? nullptr : (int*)a_dp;
-        pboxes = (BoxIndexer*)(a_dp + nbytes_boxes);
+    if (boxes.empty()) { return; }
+    const int nboxes = boxes.size();
+    const std::size_t nbytes = nboxes*sizeof(BoxIndexer);
+    hp = (char*)The_Pinned_Arena()->alloc(nbytes);
+    auto* hp_boxes = (BoxIndexer*)hp;
+    for (int i = 0; i < nboxes; ++i) {
+        new (hp_boxes+i) BoxIndexer(boxes[i]);
     }
+
+    auto dp = (char*) The_Arena()->alloc(nbytes);
+    Gpu::htod_memcpy_async(dp, hp, nbytes);
+    pboxes = (BoxIndexer*)dp;
 }

 inline
-void destroy_par_for_nblocks (char* hp, char* dp)
+void destroy_par_for_boxes (char* hp, char* dp)
 {
     The_Pinned_Arena()->free(hp);
     The_Arena()->free(dp);
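Where the old helper staged a per-box exclusive prefix sum of block counts (hp_blks, bisected on the device to map a block back to its box) alongside the BoxIndexer array, the new one stages only the BoxIndexers, since every box now gets the same number of blocks. A minimal sketch of the intended lifetime, assuming the caller synchronizes the stream before freeing; the driver code below is illustrative, not part of the patch:

    char* hp = nullptr;
    BoxIndexer* d_boxes = nullptr;
    detail::build_par_for_boxes(hp, d_boxes, boxes); // pinned staging + async H2D copy
    // ... launch kernels that read d_boxes ...
    Gpu::streamSynchronize();                        // copy and kernels must finish first
    detail::destroy_par_for_boxes(hp, (char*)d_boxes);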
@@ -63,10 +49,12 @@ namespace parfor_mf_detail {

     template <typename F>
     AMREX_GPU_DEVICE
-    auto call_f (F const& f, int b, int i, int j, int k, int n) noexcept
+    auto call_f (F const& f, int b, int i, int j, int k, int ncomp) noexcept
         -> decltype(f(0,0,0,0,0))
     {
-        f(b,i,j,k,n);
+        for (int n = 0; n < ncomp; ++n) {
+            f(b,i,j,k,n);
+        }
     }
 }

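call_f now takes the component count and runs the n-loop on the device, so call sites pass ncomp once instead of looping around each call. This overload is selected when the functor accepts a component index; presumably the companion overload for component-free functors (not shown in this hunk) just ignores the count, along these lines:

    // Sketch of the assumed companion overload, chosen by SFINAE on f(0,0,0,0):
    template <typename F>
    AMREX_GPU_DEVICE
    auto call_f (F const& f, int b, int i, int j, int k, int /*ncomp*/) noexcept
        -> decltype(f(0,0,0,0))
    {
        f(b,i,j,k);
    }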
@@ -81,16 +69,15 @@ ParallelFor (MF const& mf, IntVect const& nghost, int ncomp, IntVect const&, boo
         return;
     } else if (nboxes == 1) {
         Box const& b = amrex::grow(mf.box(index_array[0]), nghost);
-        amrex::ParallelFor(b, ncomp, [=] AMREX_GPU_DEVICE (int i, int j, int k, int n) noexcept
+        amrex::ParallelFor(b, [=] AMREX_GPU_DEVICE (int i, int j, int k) noexcept
         {
-            parfor_mf_detail::call_f(f, 0, i, j, k, n);
+            parfor_mf_detail::call_f(f, 0, i, j, k, ncomp);
         });
     } else {
-        auto const& parforinfo = mf.getParForInfo(nghost,MT);
-        auto par_for_blocks = parforinfo.getBlocks();
-        const int nblocks = par_for_blocks.first[nboxes];
-        const int block_0_size = par_for_blocks.first[1];
-        const int* dp_nblocks = par_for_blocks.second;
+        auto const& parforinfo = mf.getParForInfo(nghost);
+        auto nblocks_per_box = parforinfo.getNBlocksPerBox(MT);
+        AMREX_ASSERT(Long(nblocks_per_box)*Long(nboxes) < Long(std::numeric_limits<int>::max()));
+        const int nblocks = nblocks_per_box * nboxes;
         const BoxIndexer* dp_boxes = parforinfo.getBoxes();

 #if defined(AMREX_USE_CUDA) || defined(AMREX_USE_HIP)
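On the host, the prefix-sum table from getBlocks() is gone: the new code asks ParForInfo for a single per-box block count (presumably sized for the largest box in the set) and launches nblocks_per_box * nboxes blocks, with the AMREX_ASSERT guarding against the flat grid size overflowing int. Worked numbers, purely illustrative: with MT = 256 and a largest box of 1,000,000 cells, nblocks_per_box = (1,000,000 + 255) / 256 = 3907, so 512 boxes give 3907 * 512 = 2,000,384 blocks, far below std::numeric_limits<int>::max().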
@@ -99,39 +86,23 @@ ParallelFor (MF const& mf, IntVect const& nghost, int ncomp, IntVect const&, boo
         <<<nblocks, MT, 0, Gpu::gpuStream()>>>
         ([=] AMREX_GPU_DEVICE () noexcept
         {
-            int ibox;
-            std::uint64_t icell;
-            if (dp_nblocks) {
-                ibox = amrex::bisect(dp_nblocks, 0, nboxes, static_cast<int>(blockIdx.x));
-                icell = std::uint64_t(blockIdx.x-dp_nblocks[ibox])*MT + threadIdx.x;
-            } else {
-                ibox = blockIdx.x / block_0_size;
-                icell = std::uint64_t(blockIdx.x-ibox*block_0_size)*MT + threadIdx.x;
-            }
+            int ibox = int(blockIdx.x) / nblocks_per_box;
+            auto icell = std::uint64_t(blockIdx.x-ibox*nblocks_per_box)*MT + threadIdx.x;

 #elif defined(AMREX_USE_SYCL)

         amrex::launch<MT>(nblocks, Gpu::gpuStream(),
         [=] AMREX_GPU_DEVICE (sycl::nd_item<1> const& item) noexcept
         {
-            int ibox;
-            std::uint64_t icell;
             int blockIdxx = item.get_group_linear_id();
             int threadIdxx = item.get_local_linear_id();
-            if (dp_nblocks) {
-                ibox = amrex::bisect(dp_nblocks, 0, nboxes, static_cast<int>(blockIdxx));
-                icell = std::uint64_t(blockIdxx-dp_nblocks[ibox])*MT + threadIdxx;
-            } else {
-                ibox = blockIdxx / block_0_size;
-                icell = std::uint64_t(blockIdxx-ibox*block_0_size)*MT + threadIdxx;
-            }
+            int ibox = int(blockIdxx) / nblocks_per_box;
+            auto icell = std::uint64_t(blockIdxx-ibox*nblocks_per_box)*MT + threadIdxx;
 #endif
             BoxIndexer const& indexer = dp_boxes[ibox];
             if (icell < indexer.numPts()) {
                 auto [i, j, k] = indexer(icell);
-                for (int n = 0; n < ncomp; ++n) {
-                    parfor_mf_detail::call_f(f, ibox, i, j, k, n);
-                }
+                parfor_mf_detail::call_f(f, ibox, i, j, k, ncomp);
             }
         });
     }
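With a uniform block count per box, the device-side decode collapses from an amrex::bisect over the prefix sum to one division and one remainder; the cost is that boxes smaller than the largest carry idle blocks, which exit through the icell < indexer.numPts() guard. The arithmetic, with made-up values:

    // nblocks_per_box = 4, MT = 256, blockIdx.x = 10, threadIdx.x = 17
    int ibox = 10 / 4;                              // = 2
    auto icell = std::uint64_t(10 - 2*4)*256 + 17;  // = 529
    // if box 2 has 529 cells or fewer, this thread does nothing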
@@ -142,14 +113,24 @@ template <typename MF, typename F>
 std::enable_if_t<IsFabArray<MF>::value>
 ParallelFor (MF const& mf, IntVect const& nghost, int ncomp, IntVect const& ts, bool dynamic, F&& f)
 {
-    ParallelFor<AMREX_GPU_MAX_THREADS>(mf, nghost, ncomp, ts, dynamic, std::forward<F>(f));
+#ifdef AMREX_USE_CUDA
+    constexpr int MT = 128;
+#else
+    constexpr int MT = AMREX_GPU_MAX_THREADS;
+#endif
+    ParallelFor<MT>(mf, nghost, ncomp, ts, dynamic, std::forward<F>(f));
 }

 template <typename MF, typename F>
 std::enable_if_t<IsFabArray<MF>::value>
 ParallelFor (MF const& mf, IntVect const& nghost, IntVect const& ts, bool dynamic, F&& f)
 {
-    ParallelFor<AMREX_GPU_MAX_THREADS>(mf, nghost, 1, ts, dynamic, std::forward<F>(f));
+#ifdef AMREX_USE_CUDA
+    constexpr int MT = 128;
+#else
+    constexpr int MT = AMREX_GPU_MAX_THREADS;
+#endif
+    ParallelFor<MT>(mf, nghost, 1, ts, dynamic, std::forward<F>(f));
 }

 }
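The default wrappers now pick MT = 128 threads per block on CUDA (AMREX_GPU_MAX_THREADS elsewhere) before forwarding. A sketch of a typical call site with the six-argument signature from this hunk; the MultiFab mf is assumed to exist, and whether the call is spelled plain or namespace-qualified depends on the surrounding headers:

    auto const& ma = mf.arrays(); // one Array4 per local box
    ParallelFor(mf, IntVect(0), mf.nComp(), IntVect(0), false,
        [=] AMREX_GPU_DEVICE (int b, int i, int j, int k, int n) noexcept
        {
            ma[b](i,j,k,n) *= 2.0; // b: local box index; (i,j,k,n): cell and component
        });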