Skip to content

Commit 45b52d3

Browse files
committed
Future Base: nthreads_128 (AMReX-Codes#4417)
1 parent ae7024c commit 45b52d3

File tree

8 files changed

+117
-139
lines changed

8 files changed

+117
-139
lines changed

Docs/sphinx_documentation/source/GPU.rst

+4-4
Original file line numberDiff line numberDiff line change
@@ -229,9 +229,9 @@ Building with CMake
229229

230230
To build AMReX with GPU support in CMake, add
231231
``-DAMReX_GPU_BACKEND=CUDA|HIP|SYCL`` to the ``cmake`` invocation, for CUDA,
232-
HIP and SYCL, respectively. By default, AMReX uses 256 threads per GPU
233-
block/group in most situations. This can be changed with
234-
``-DAMReX_GPU_MAX_THREADS=N``, where ``N`` is 128 for example.
232+
HIP and SYCL, respectively. By default, AMReX uses 128 threads per GPU block
233+
in most situations for CUDA, and 256 for HIP and SYCL. This can be changed
234+
with ``-DAMReX_GPU_MAX_THREADS=N``, where ``N`` is 256 or 128 for example.
235235

236236
Enabling CUDA support
237237
^^^^^^^^^^^^^^^^^^^^^
@@ -1166,7 +1166,7 @@ GPU block size
11661166

11671167
By default, :cpp:`ParallelFor` launches ``AMREX_GPU_MAX_THREADS`` threads
11681168
per GPU block, where ``AMREX_GPU_MAX_THREADS`` is a compile-time constant
1169-
with a default value of 256. The users can also explicitly specify the
1169+
with a default value of 128 for CUDA and 256 for HIP and SYCL. The users can also explicitly specify the
11701170
number of threads per block by :cpp:`ParallelFor<MY_BLOCK_SIZE>(...)`, where
11711171
``MY_BLOCK_SIZE`` is a multiple of the warp size (e.g., 128). This allows
11721172
the users to do performance tuning for individual kernels.

Src/Base/AMReX_FabArrayBase.H

+6-6
Original file line numberDiff line numberDiff line change
@@ -650,10 +650,12 @@ public:
650650
//! For ParallelFor(FabArray)
651651
struct ParForInfo
652652
{
653-
ParForInfo (const FabArrayBase& fa, const IntVect& nghost, int nthreads);
653+
ParForInfo (const FabArrayBase& fa, const IntVect& nghost);
654654
~ParForInfo ();
655655

656-
std::pair<int*,int*> const& getBlocks () const { return m_nblocks_x; }
656+
int getNBlocksPerBox (int nthreads) const {
657+
return int((m_ncellsmax+nthreads-1)/nthreads);
658+
}
657659
BoxIndexer const* getBoxes () const { return m_boxes; }
658660

659661
ParForInfo () = delete;
@@ -664,14 +666,12 @@ public:
664666

665667
BATransformer m_bat;
666668
IntVect m_ng;
667-
int m_nthreads;
668-
std::pair<int*,int*> m_nblocks_x;
669+
Long m_ncellsmax = 0;
669670
BoxIndexer* m_boxes = nullptr;
670671
char* m_hp = nullptr;
671-
char* m_dp = nullptr;
672672
};
673673

674-
ParForInfo const& getParForInfo (const IntVect& nghost, int nthreads) const;
674+
ParForInfo const& getParForInfo (const IntVect& nghost) const;
675675

676676
static std::multimap<BDKey,ParForInfo*> m_TheParForCache;
677677

Src/Base/AMReX_FabArrayBase.cpp

+9-13
Original file line numberDiff line numberDiff line change
@@ -2635,15 +2635,12 @@ FabArrayBase::isFusingCandidate () const noexcept // NOLINT(readability-convert-
26352635

26362636
#ifdef AMREX_USE_GPU
26372637

2638-
FabArrayBase::ParForInfo::ParForInfo (const FabArrayBase& fa, const IntVect& nghost, int nthreads)
2638+
FabArrayBase::ParForInfo::ParForInfo (const FabArrayBase& fa, const IntVect& nghost)
26392639
: m_bat(fa.boxArray().transformer()),
2640-
m_ng(nghost),
2641-
m_nthreads(nthreads),
2642-
m_nblocks_x({nullptr,nullptr})
2640+
m_ng(nghost)
26432641
{
26442642
Vector<Box> boxes;
2645-
Vector<Long> ncells;
2646-
ncells.reserve(fa.indexArray.size());
2643+
m_ncellsmax = 0;
26472644
for (int K : fa.indexArray) {
26482645
Long N = 0;
26492646
Box b = fa.box(K);
@@ -2652,31 +2649,30 @@ FabArrayBase::ParForInfo::ParForInfo (const FabArrayBase& fa, const IntVect& ngh
26522649
N = b.numPts();
26532650
}
26542651
boxes.push_back(b);
2655-
ncells.push_back(N);
2652+
m_ncellsmax = std::max(m_ncellsmax, N);
26562653
}
2657-
detail::build_par_for_nblocks(m_hp, m_dp, m_nblocks_x, m_boxes, boxes, ncells, nthreads);
2654+
detail::build_par_for_boxes(m_hp, m_boxes, boxes);
26582655
}
26592656

26602657
FabArrayBase::ParForInfo::~ParForInfo ()
26612658
{
2662-
detail::destroy_par_for_nblocks(m_hp, m_dp);
2659+
detail::destroy_par_for_boxes(m_hp, (char*)m_boxes);
26632660
}
26642661

26652662
FabArrayBase::ParForInfo const&
2666-
FabArrayBase::getParForInfo (const IntVect& nghost, int nthreads) const
2663+
FabArrayBase::getParForInfo (const IntVect& nghost) const
26672664
{
26682665
AMREX_ASSERT(getBDKey() == m_bdkey);
26692666
auto er_it = m_TheParForCache.equal_range(m_bdkey);
26702667
for (auto it = er_it.first; it != er_it.second; ++it) {
26712668
if (it->second->m_bat == boxArray().transformer() &&
2672-
it->second->m_ng == nghost &&
2673-
it->second->m_nthreads == nthreads)
2669+
it->second->m_ng == nghost)
26742670
{
26752671
return *(it->second);
26762672
}
26772673
}
26782674

2679-
ParForInfo* new_pfi = new ParForInfo(*this, nghost, nthreads);
2675+
ParForInfo* new_pfi = new ParForInfo(*this, nghost);
26802676
m_TheParForCache.insert(er_it.second,
26812677
std::multimap<BDKey,ParForInfo*>::value_type(m_bdkey,new_pfi));
26822678
return *new_pfi;

Src/Base/AMReX_MFParallelFor.H

+5-5
Original file line numberDiff line numberDiff line change
@@ -68,7 +68,7 @@ std::enable_if_t<IsFabArray<MF>::value>
6868
ParallelFor (MF const& mf, F&& f)
6969
{
7070
#ifdef AMREX_USE_GPU
71-
detail::ParallelFor<MT>(mf, IntVect(0), FabArrayBase::mfiter_tile_size, false, std::forward<F>(f));
71+
detail::ParallelFor<MT>(mf, IntVect(0), 1, FabArrayBase::mfiter_tile_size, false, std::forward<F>(f));
7272
#else
7373
detail::ParallelFor(mf, IntVect(0), FabArrayBase::mfiter_tile_size, false, std::forward<F>(f));
7474
#endif
@@ -119,7 +119,7 @@ std::enable_if_t<IsFabArray<MF>::value>
119119
ParallelFor (MF const& mf, IntVect const& ng, F&& f)
120120
{
121121
#ifdef AMREX_USE_GPU
122-
detail::ParallelFor<MT>(mf, ng, FabArrayBase::mfiter_tile_size, false, std::forward<F>(f));
122+
detail::ParallelFor<MT>(mf, ng, 1, FabArrayBase::mfiter_tile_size, false, std::forward<F>(f));
123123
#else
124124
detail::ParallelFor(mf, ng, FabArrayBase::mfiter_tile_size, false, std::forward<F>(f));
125125
#endif
@@ -225,7 +225,7 @@ std::enable_if_t<IsFabArray<MF>::value>
225225
ParallelFor (MF const& mf, TileSize const& ts, F&& f)
226226
{
227227
#ifdef AMREX_USE_GPU
228-
detail::ParallelFor<MT>(mf, IntVect(0), ts.tile_size, false, std::forward<F>(f));
228+
detail::ParallelFor<MT>(mf, IntVect(0), 1, ts.tile_size, false, std::forward<F>(f));
229229
#else
230230
detail::ParallelFor(mf, IntVect(0), ts.tile_size, false, std::forward<F>(f));
231231
#endif
@@ -280,7 +280,7 @@ std::enable_if_t<IsFabArray<MF>::value>
280280
ParallelFor (MF const& mf, IntVect const& ng, TileSize const& ts, F&& f)
281281
{
282282
#ifdef AMREX_USE_GPU
283-
detail::ParallelFor<MT>(mf, ng, ts.tile_size, false, std::forward<F>(f));
283+
detail::ParallelFor<MT>(mf, ng, 1, ts.tile_size, false, std::forward<F>(f));
284284
#else
285285
detail::ParallelFor(mf, ng, ts.tile_size, false, std::forward<F>(f));
286286
#endif
@@ -423,7 +423,7 @@ ParallelFor (MF const& mf, IntVect const& ng, TileSize const& ts,
423423
DynamicTiling dt, F&& f)
424424
{
425425
#ifdef AMREX_USE_GPU
426-
detail::ParallelFor<MT>(mf, ng, ts.tile_size, dt.dynamic, std::forward<F>(f));
426+
detail::ParallelFor<MT>(mf, ng, 1, ts.tile_size, dt.dynamic, std::forward<F>(f));
427427
#else
428428
detail::ParallelFor(mf, ng, ts.tile_size, dt.dynamic, std::forward<F>(f));
429429
#endif

Src/Base/AMReX_MFParallelForG.H

+40-59
Original file line numberDiff line numberDiff line change
@@ -12,38 +12,24 @@ namespace amrex {
1212
namespace detail {
1313

1414
inline
15-
void build_par_for_nblocks (char*& a_hp, char*& a_dp, std::pair<int*,int*>& blocks_x, BoxIndexer*& pboxes,
16-
Vector<Box> const& boxes, Vector<Long> const& ncells, int nthreads)
15+
void build_par_for_boxes (char*& hp, BoxIndexer*& pboxes, Vector<Box> const& boxes)
1716
{
18-
if (!ncells.empty()) {
19-
const int nboxes = ncells.size();
20-
const std::size_t nbytes_boxes = amrex::aligned_size(alignof(BoxIndexer), (nboxes+1) * sizeof(int));
21-
const std::size_t nbytes = nbytes_boxes + nboxes*sizeof(BoxIndexer);
22-
a_hp = (char*)The_Pinned_Arena()->alloc(nbytes);
23-
int* hp_blks = (int*)a_hp;
24-
auto* hp_boxes = (BoxIndexer*)(a_hp + nbytes_boxes);
25-
hp_blks[0] = 0;
26-
bool same_size = true;
27-
for (int i = 0; i < nboxes; ++i) {
28-
Long nblocks = (ncells[i] + nthreads-1) / nthreads;
29-
AMREX_ASSERT((hp_blks[i]+nblocks) <= Long(std::numeric_limits<int>::max()));
30-
hp_blks[i+1] = hp_blks[i] + static_cast<int>(nblocks);
31-
same_size = same_size && (ncells[i] == ncells[0]);
32-
33-
new (hp_boxes+i) BoxIndexer(boxes[i]);
34-
}
35-
36-
a_dp = (char*) The_Arena()->alloc(nbytes);
37-
Gpu::htod_memcpy_async(a_dp, a_hp, nbytes);
38-
39-
blocks_x.first = hp_blks;
40-
blocks_x.second = (same_size) ? nullptr : (int*)a_dp;
41-
pboxes = (BoxIndexer*)(a_dp + nbytes_boxes);
17+
if (boxes.empty()) { return; }
18+
const int nboxes = boxes.size();
19+
const std::size_t nbytes = nboxes*sizeof(BoxIndexer);
20+
hp = (char*)The_Pinned_Arena()->alloc(nbytes);
21+
auto* hp_boxes = (BoxIndexer*)hp;
22+
for (int i = 0; i < nboxes; ++i) {
23+
new (hp_boxes+i) BoxIndexer(boxes[i]);
4224
}
25+
26+
auto dp = (char*) The_Arena()->alloc(nbytes);
27+
Gpu::htod_memcpy_async(dp, hp, nbytes);
28+
pboxes = (BoxIndexer*)dp;
4329
}
4430

4531
inline
46-
void destroy_par_for_nblocks (char* hp, char* dp)
32+
void destroy_par_for_boxes (char* hp, char* dp)
4733
{
4834
The_Pinned_Arena()->free(hp);
4935
The_Arena()->free(dp);
@@ -63,10 +49,12 @@ namespace parfor_mf_detail {
6349

6450
template <typename F>
6551
AMREX_GPU_DEVICE
66-
auto call_f (F const& f, int b, int i, int j, int k, int n) noexcept
52+
auto call_f (F const& f, int b, int i, int j, int k, int ncomp) noexcept
6753
-> decltype(f(0,0,0,0,0))
6854
{
69-
f(b,i,j,k,n);
55+
for (int n = 0; n < ncomp; ++n) {
56+
f(b,i,j,k,n);
57+
}
7058
}
7159
}
7260

@@ -81,16 +69,15 @@ ParallelFor (MF const& mf, IntVect const& nghost, int ncomp, IntVect const&, boo
8169
return;
8270
} else if (nboxes == 1) {
8371
Box const& b = amrex::grow(mf.box(index_array[0]), nghost);
84-
amrex::ParallelFor(b, ncomp, [=] AMREX_GPU_DEVICE (int i, int j, int k, int n) noexcept
72+
amrex::ParallelFor(b, [=] AMREX_GPU_DEVICE (int i, int j, int k) noexcept
8573
{
86-
parfor_mf_detail::call_f(f, 0, i, j, k, n);
74+
parfor_mf_detail::call_f(f, 0, i, j, k, ncomp);
8775
});
8876
} else {
89-
auto const& parforinfo = mf.getParForInfo(nghost,MT);
90-
auto par_for_blocks = parforinfo.getBlocks();
91-
const int nblocks = par_for_blocks.first[nboxes];
92-
const int block_0_size = par_for_blocks.first[1];
93-
const int* dp_nblocks = par_for_blocks.second;
77+
auto const& parforinfo = mf.getParForInfo(nghost);
78+
auto nblocks_per_box = parforinfo.getNBlocksPerBox(MT);
79+
AMREX_ASSERT(Long(nblocks_per_box)*Long(nboxes) < Long(std::numeric_limits<int>::max()));
80+
const int nblocks = nblocks_per_box * nboxes;
9481
const BoxIndexer* dp_boxes = parforinfo.getBoxes();
9582

9683
#if defined(AMREX_USE_CUDA) || defined(AMREX_USE_HIP)
@@ -99,39 +86,23 @@ ParallelFor (MF const& mf, IntVect const& nghost, int ncomp, IntVect const&, boo
9986
<<<nblocks, MT, 0, Gpu::gpuStream()>>>
10087
([=] AMREX_GPU_DEVICE () noexcept
10188
{
102-
int ibox;
103-
std::uint64_t icell;
104-
if (dp_nblocks) {
105-
ibox = amrex::bisect(dp_nblocks, 0, nboxes, static_cast<int>(blockIdx.x));
106-
icell = std::uint64_t(blockIdx.x-dp_nblocks[ibox])*MT + threadIdx.x;
107-
} else {
108-
ibox = blockIdx.x / block_0_size;
109-
icell = std::uint64_t(blockIdx.x-ibox*block_0_size)*MT + threadIdx.x;
110-
}
89+
int ibox = int(blockIdx.x) / nblocks_per_box;
90+
auto icell = std::uint64_t(blockIdx.x-ibox*nblocks_per_box)*MT + threadIdx.x;
11191

11292
#elif defined(AMREX_USE_SYCL)
11393

11494
amrex::launch<MT>(nblocks, Gpu::gpuStream(),
11595
[=] AMREX_GPU_DEVICE (sycl::nd_item<1> const& item) noexcept
11696
{
117-
int ibox;
118-
std::uint64_t icell;
11997
int blockIdxx = item.get_group_linear_id();
12098
int threadIdxx = item.get_local_linear_id();
121-
if (dp_nblocks) {
122-
ibox = amrex::bisect(dp_nblocks, 0, nboxes, static_cast<int>(blockIdxx));
123-
icell = std::uint64_t(blockIdxx-dp_nblocks[ibox])*MT + threadIdxx;
124-
} else {
125-
ibox = blockIdxx / block_0_size;
126-
icell = std::uint64_t(blockIdxx-ibox*block_0_size)*MT + threadIdxx;
127-
}
99+
int ibox = int(blockIdxx) / nblocks_per_box;
100+
auto icell = std::uint64_t(blockIdxx-ibox*nblocks_per_box)*MT + threadIdxx;
128101
#endif
129102
BoxIndexer const& indexer = dp_boxes[ibox];
130103
if (icell < indexer.numPts()) {
131104
auto [i, j, k] = indexer(icell);
132-
for (int n = 0; n < ncomp; ++n) {
133-
parfor_mf_detail::call_f(f, ibox, i, j, k, n);
134-
}
105+
parfor_mf_detail::call_f(f, ibox, i, j, k, ncomp);
135106
}
136107
});
137108
}
@@ -142,14 +113,24 @@ template <typename MF, typename F>
142113
std::enable_if_t<IsFabArray<MF>::value>
143114
ParallelFor (MF const& mf, IntVect const& nghost, int ncomp, IntVect const& ts, bool dynamic, F&& f)
144115
{
145-
ParallelFor<AMREX_GPU_MAX_THREADS>(mf, nghost, ncomp, ts, dynamic, std::forward<F>(f));
116+
#ifdef AMREX_USE_CUDA
117+
constexpr int MT = 128;
118+
#else
119+
constexpr int MT = AMREX_GPU_MAX_THREADS;
120+
#endif
121+
ParallelFor<MT>(mf, nghost, ncomp, ts, dynamic, std::forward<F>(f));
146122
}
147123

148124
template <typename MF, typename F>
149125
std::enable_if_t<IsFabArray<MF>::value>
150126
ParallelFor (MF const& mf, IntVect const& nghost, IntVect const& ts, bool dynamic, F&& f)
151127
{
152-
ParallelFor<AMREX_GPU_MAX_THREADS>(mf, nghost, 1, ts, dynamic, std::forward<F>(f));
128+
#ifdef AMREX_USE_CUDA
129+
constexpr int MT = 128;
130+
#else
131+
constexpr int MT = AMREX_GPU_MAX_THREADS;
132+
#endif
133+
ParallelFor<MT>(mf, nghost, 1, ts, dynamic, std::forward<F>(f));
153134
}
154135

155136
}

0 commit comments

Comments (0)