diff --git a/Docs/sphinx_documentation/source/GPU.rst b/Docs/sphinx_documentation/source/GPU.rst
index 9e370b8ee2..c6da1a0ab3 100644
--- a/Docs/sphinx_documentation/source/GPU.rst
+++ b/Docs/sphinx_documentation/source/GPU.rst
@@ -229,9 +229,9 @@ Building with CMake
 
 To build AMReX with GPU support in CMake, add
 ``-DAMReX_GPU_BACKEND=CUDA|HIP|SYCL`` to the ``cmake`` invocation, for CUDA,
-HIP and SYCL, respectively. By default, AMReX uses 256 threads per GPU
-block/group in most situations. This can be changed with
-``-DAMReX_GPU_MAX_THREADS=N``, where ``N`` is 128 for example.
+HIP and SYCL, respectively. By default, AMReX uses 128 threads per GPU block
+for CUDA and 256 for HIP and SYCL in most situations. This can be changed
+with ``-DAMReX_GPU_MAX_THREADS=N``, where ``N`` is 256 or 128, for example.
 
 Enabling CUDA support
 ^^^^^^^^^^^^^^^^^^^^^
@@ -1166,7 +1166,7 @@ GPU block size
 
 By default, :cpp:`ParallelFor` launches ``AMREX_GPU_MAX_THREADS`` threads
 per GPU block, where ``AMREX_GPU_MAX_THREADS`` is a compile-time constant
-with a default value of 256. The users can also explicitly specify the
+with a default value of 128 for CUDA and 256 for HIP and SYCL. The users can also explicitly specify the
 number of threads per block by :cpp:`ParallelFor<MY_BLOCK_SIZE>(...)`, where
 ``MY_BLOCK_SIZE`` is a multiple of the warp size (e.g., 128). This allows
 the users to do performance tuning for individual kernels.
diff --git a/Src/AmrCore/AMReX_MFInterp_C.H b/Src/AmrCore/AMReX_MFInterp_C.H
index cce4103d19..c89909ff79 100644
--- a/Src/AmrCore/AMReX_MFInterp_C.H
+++ b/Src/AmrCore/AMReX_MFInterp_C.H
@@ -14,7 +14,7 @@ Real mf_compute_slopes_x (int i, int j, int k, Array4<Real const> const& u, int
     Real dc = Real(0.5) * (u(i+1,j,k,nu) - u(i-1,j,k,nu));
     if (i == domain.smallEnd(0) && (bc.lo(0) == BCType::ext_dir || bc.lo(0) == BCType::hoextrap))
     {
-        if (i+2 < u.end.x) {
+        if (i+2 < (u.begin.x+u.len.x)) {
             dc = -Real(16./15.)*u(i-1,j,k,nu) + Real(0.5)*u(i,j,k,nu)
                 + Real(2./3.)*u(i+1,j,k,nu) - Real(0.1)*u(i+2,j,k,nu);
         } else {
@@ -40,7 +40,7 @@ Real mf_compute_slopes_y (int i, int j, int k, Array4<Real const> const& u, int
     Real dc = Real(0.5) * (u(i,j+1,k,nu) - u(i,j-1,k,nu));
     if (j == domain.smallEnd(1) && (bc.lo(1) == BCType::ext_dir || bc.lo(1) == BCType::hoextrap))
     {
-        if (j+2 < u.end.y) {
+        if (j+2 < (u.begin.y+u.len.y)) {
             dc = -Real(16./15.)*u(i,j-1,k,nu) + Real(0.5)*u(i,j,k,nu)
                 + Real(2./3.)*u(i,j+1,k,nu) - Real(0.1)*u(i,j+2,k,nu);
         } else {
@@ -66,7 +66,7 @@ Real mf_compute_slopes_z (int i, int j, int k, Array4<Real const> const& u, int
     Real dc = Real(0.5) * (u(i,j,k+1,nu) - u(i,j,k-1,nu));
     if (k == domain.smallEnd(2) && (bc.lo(2) == BCType::ext_dir || bc.lo(2) == BCType::hoextrap))
     {
-        if (k+2 < u.end.z) {
+        if (k+2 < (u.begin.z+u.len.z)) {
             dc = -Real(16./15.)*u(i,j,k-1,nu) + Real(0.5)*u(i,j,k,nu)
                 + Real(2./3.)*u(i,j,k+1,nu) - Real(0.1)*u(i,j,k+2,nu);
         } else {
@@ -93,7 +93,7 @@ Real mf_cell_quadratic_compute_slopes_xx (int i, int j, int k,
     Real xx = u(i-1,j,k,nu) - 2.0_rt * u(i,j,k,nu) + u(i+1,j,k,nu);
     if (i == domain.smallEnd(0) && (bc.lo(0) == BCType::ext_dir || bc.lo(0) == BCType::hoextrap))
     {
-        if (i+2 < u.end.x) {
+        if (i+2 < (u.begin.x+u.len.x)) {
             xx = 0._rt;
         }
     }
@@ -114,7 +114,7 @@ Real mf_cell_quadratic_compute_slopes_yy (int i, int j, int k,
     Real yy = u(i,j-1,k,nu) - 2.0_rt * u(i,j,k,nu) + u(i,j+1,k,nu);
     if (j == domain.smallEnd(1) && (bc.lo(1) == BCType::ext_dir || bc.lo(1) == BCType::hoextrap))
     {
-        if (j+2 < u.end.y) {
+        if (j+2 < (u.begin.y+u.len.y)) {
             yy = 0._rt;
         }
     }
@@ -135,7 +135,7 @@ Real mf_cell_quadratic_compute_slopes_zz (int i, int j, int k,
     Real zz = u(i,j,k-1,nu) - 2.0_rt * u(i,j,k,nu) + u(i,j,k+1,nu);
     if (k == domain.smallEnd(2) && (bc.lo(2) == BCType::ext_dir || bc.lo(2) == BCType::hoextrap))
     {
-        if (k+2 < u.end.z) {
+        if (k+2 < (u.begin.z+u.len.z)) {
             zz = 0._rt;
         }
     }
@@ -157,7 +157,7 @@ Real mf_cell_quadratic_compute_slopes_xy (int i, int j, int k,
                         - u(i-1,j+1,k,nu) + u(i+1,j+1,k,nu) );
     if (i == domain.smallEnd(0) && (bc.lo(0) == BCType::ext_dir || bc.lo(0) == BCType::hoextrap))
     {
-        if (i+2 < u.end.x) {
+        if (i+2 < (u.begin.x+u.len.x)) {
             xy = 0._rt;
         }
     }
@@ -169,7 +169,7 @@ Real mf_cell_quadratic_compute_slopes_xy (int i, int j, int k,
     }
     if (j == domain.smallEnd(1) && (bc.lo(1) == BCType::ext_dir || bc.lo(1) == BCType::hoextrap))
     {
-        if (j+2 < u.end.y) {
+        if (j+2 < (u.begin.y+u.len.y)) {
             xy = 0._rt;
         }
     }
@@ -191,7 +191,7 @@ Real mf_cell_quadratic_compute_slopes_xz (int i, int j, int k,
                         - u(i-1,j,k+1,nu) + u(i+1,j,k+1,nu) );
     if (i == domain.smallEnd(0) && (bc.lo(0) == BCType::ext_dir || bc.lo(0) == BCType::hoextrap))
     {
-        if (i+2 < u.end.x) {
+        if (i+2 < (u.begin.x+u.len.x)) {
             xz = 0._rt;
         }
     }
@@ -203,7 +203,7 @@ Real mf_cell_quadratic_compute_slopes_xz (int i, int j, int k,
     }
     if (k == domain.smallEnd(2) && (bc.lo(2) == BCType::ext_dir || bc.lo(2) == BCType::hoextrap))
     {
-        if (k+2 < u.end.z) {
+        if (k+2 < (u.begin.z+u.len.z)) {
             xz = 0._rt;
         }
     }
@@ -225,7 +225,7 @@ Real mf_cell_quadratic_compute_slopes_yz (int i, int j, int k,
                         - u(i,j+1,k-1,nu) + u(i,j+1,k+1,nu) );
     if (j == domain.smallEnd(1) && (bc.lo(1) == BCType::ext_dir || bc.lo(1) == BCType::hoextrap))
     {
-        if (j+2 < u.end.y) {
+        if (j+2 < (u.begin.y+u.len.y)) {
             yz = 0._rt;
         }
     }
@@ -237,7 +237,7 @@ Real mf_cell_quadratic_compute_slopes_yz (int i, int j, int k,
     }
     if (k == domain.smallEnd(2) && (bc.lo(2) == BCType::ext_dir || bc.lo(2) == BCType::hoextrap))
     {
-        if (k+2 < u.end.z) {
+        if (k+2 < (u.begin.z+u.len.z)) {
             yz = 0._rt;
         }
     }
diff --git a/Src/Base/AMReX_Array4.H b/Src/Base/AMReX_Array4.H
index ad89be2d7b..6175307b2c 100644
--- a/Src/Base/AMReX_Array4.H
+++ b/Src/Base/AMReX_Array4.H
@@ -60,11 +60,8 @@ namespace amrex {
     struct Array4
     {
         T* AMREX_RESTRICT p;
-        Long jstride = 0;
-        Long kstride = 0;
-        Long nstride = 0;
         Dim3 begin{1,1,1};
-        Dim3 end{0,0,0}; // end is hi + 1
+        Dim3 len{0,0,0};
         int ncomp=0;
 
         AMREX_GPU_HOST_DEVICE
@@ -74,22 +71,16 @@ namespace amrex {
         AMREX_GPU_HOST_DEVICE
         constexpr Array4 (Array4<std::remove_const_t<T>> const& rhs) noexcept
             : p(rhs.p),
-              jstride(rhs.jstride),
-              kstride(rhs.kstride),
-              nstride(rhs.nstride),
               begin(rhs.begin),
-              end(rhs.end),
+              len(rhs.len),
               ncomp(rhs.ncomp)
             {}
 
         AMREX_GPU_HOST_DEVICE
         constexpr Array4 (T* a_p, Dim3 const& a_begin, Dim3 const& a_end, int a_ncomp) noexcept
             : p(a_p),
-              jstride(a_end.x-a_begin.x),
-              kstride(jstride*(a_end.y-a_begin.y)),
-              nstride(kstride*(a_end.z-a_begin.z)),
               begin(a_begin),
-              end(a_end),
+              len{a_end.x-a_begin.x, a_end.y-a_begin.y, a_end.z-a_begin.z},
               ncomp(a_ncomp)
             {}
 
@@ -99,12 +90,9 @@ namespace amrex {
                                  std::remove_const_t<U>>,int> = 0>
         AMREX_GPU_HOST_DEVICE
         constexpr Array4 (Array4<U> const& rhs, int start_comp) noexcept
-            : p((T*)(rhs.p+start_comp*rhs.nstride)),
-              jstride(rhs.jstride),
-              kstride(rhs.kstride),
-              nstride(rhs.nstride),
+            : p((T*)(rhs.p+start_comp*rhs.nstride())),
               begin(rhs.begin),
-              end(rhs.end),
+              len(rhs.len),
              ncomp(rhs.ncomp-start_comp)
             {}
 
@@ -114,25 +102,31 @@ namespace amrex {
                                  std::remove_const_t<U>>,int> = 0>
         AMREX_GPU_HOST_DEVICE
         constexpr Array4 (Array4<U> const& rhs, int start_comp, int num_comps) noexcept
-            : p((T*)(rhs.p+start_comp*rhs.nstride)),
-              jstride(rhs.jstride),
-              kstride(rhs.kstride),
-              nstride(rhs.nstride),
+            : p((T*)(rhs.p+start_comp*rhs.nstride())),
              begin(rhs.begin),
-              end(rhs.end),
+              len(rhs.len),
              ncomp(num_comps)
             {}
 
         AMREX_GPU_HOST_DEVICE
         explicit operator bool() const noexcept { return p != nullptr; }
 
+        [[nodiscard]] AMREX_GPU_HOST_DEVICE AMREX_FORCE_INLINE
+        Long jstride () const noexcept { return Long(len.x); }
+
+        [[nodiscard]] AMREX_GPU_HOST_DEVICE AMREX_FORCE_INLINE
+        Long kstride () const noexcept { return Long(len.x)*Long(len.y); }
+
+        [[nodiscard]] AMREX_GPU_HOST_DEVICE AMREX_FORCE_INLINE
+        Long nstride () const noexcept { return Long(len.x)*Long(len.y)*Long(len.z); }
+
         template <class U=T, std::enable_if_t<!std::is_void_v<U>,int> = 0>
         [[nodiscard]] AMREX_GPU_HOST_DEVICE AMREX_FORCE_INLINE
         U& operator() (int i, int j, int k) const noexcept {
 #if defined(AMREX_DEBUG) || defined(AMREX_BOUND_CHECK)
             index_assert(i,j,k,0);
 #endif
-            return p[(i-begin.x)+(j-begin.y)*jstride+(k-begin.z)*kstride];
+            return p[(i-begin.x)+Long(len.x)*((j-begin.y)+Long(len.y)*(k-begin.z))];
         }
 
         template <class U=T, std::enable_if_t<!std::is_void_v<U>,int> = 0>
@@ -141,7 +135,7 @@ namespace amrex {
 #if defined(AMREX_DEBUG) || defined(AMREX_BOUND_CHECK)
             index_assert(i,j,k,n);
 #endif
-            return p[(i-begin.x)+(j-begin.y)*jstride+(k-begin.z)*kstride+n*nstride];
+            return p[(i-begin.x)+Long(len.x)*((j-begin.y)+Long(len.y)*((k-begin.z)+Long(len.z)*n))];
         }
 
         template <class U=T, std::enable_if_t<!std::is_void_v<U>,int> = 0>
@@ -150,7 +144,7 @@ namespace amrex {
 #if defined(AMREX_DEBUG) || defined(AMREX_BOUND_CHECK)
             index_assert(i,j,k,0);
 #endif
-            return p + ((i-begin.x)+(j-begin.y)*jstride+(k-begin.z)*kstride);
+            return p + ((i-begin.x)+Long(len.x)*((j-begin.y)+Long(len.y)*(k-begin.z)));
         }
 
         template <class U=T, std::enable_if_t<!std::is_void_v<U>,int> = 0>
@@ -159,7 +153,7 @@ namespace amrex {
 #if defined(AMREX_DEBUG) || defined(AMREX_BOUND_CHECK)
             index_assert(i,j,k,n);
 #endif
-            return p + ((i-begin.x)+(j-begin.y)*jstride+(k-begin.z)*kstride+n*nstride);
+            return p + ((i-begin.x)+Long(len.x)*((j-begin.y)+Long(len.y)*((k-begin.z)+Long(len.z)*n)));
         }
 
         template <class U=T, std::enable_if_t<!std::is_void_v<U>,int> = 0>
@@ -241,7 +235,7 @@ namespace amrex {
 
         [[nodiscard]] AMREX_GPU_HOST_DEVICE AMREX_FORCE_INLINE
         std::size_t size () const noexcept {
-            return this->nstride * this->ncomp;
+            return this->nstride() * this->ncomp;
         }
 
         [[nodiscard]] AMREX_GPU_HOST_DEVICE AMREX_FORCE_INLINE
@@ -249,14 +243,14 @@ namespace amrex {
         bool contains (int i, int j, int k) const noexcept {
-            return (i>=begin.x && i<end.x && j>=begin.y && j<end.y && k>=begin.z && k<end.z);
+            return (i>=begin.x && i<(begin.x+len.x) && j>=begin.y && j<(begin.y+len.y) && k>=begin.z && k<(begin.z+len.z));
         }
 
         [[nodiscard]] AMREX_GPU_HOST_DEVICE AMREX_FORCE_INLINE
         bool contains (IntVect const& iv) const noexcept {
-            return AMREX_D_TERM(   iv[0]>=begin.x && iv[0]<end.x,
-                                && iv[1]>=begin.y && iv[1]<end.y,
-                                && iv[2]>=begin.z && iv[2]<end.z);
+            return AMREX_D_TERM(   iv[0]>=begin.x && iv[0]<(begin.x+len.x),
+                                && iv[1]>=begin.y && iv[1]<(begin.y+len.y),
+                                && iv[2]>=begin.z && iv[2]<(begin.z+len.z));
         }
 
         [[nodiscard]] AMREX_GPU_HOST_DEVICE AMREX_FORCE_INLINE
@@ -268,21 +262,21 @@ namespace amrex {
         AMREX_GPU_HOST_DEVICE inline
         void index_assert (int i, int j, int k, int n) const
         {
-            if (i<begin.x || i>=end.x || j<begin.y || j>=end.y || k<begin.z || k>=end.z
+            if (i<begin.x || i>=(begin.x+len.x) || j<begin.y || j>=(begin.y+len.y) || k<begin.z || k>=(begin.z+len.z)
                 || n < 0 || n >= ncomp) {
                 AMREX_IF_ON_DEVICE((
                     AMREX_DEVICE_PRINTF(" (%d,%d,%d,%d) is out of bound (%d:%d,%d:%d,%d:%d,0:%d)\n",
-                                        i, j, k, n, begin.x, end.x-1, begin.y, end.y-1,
-                                        begin.z, end.z-1, ncomp-1);
+                                        i, j, k, n, begin.x, (begin.x+len.x)-1, begin.y, (begin.y+len.y)-1,
+                                        begin.z, (begin.z+len.z)-1, ncomp-1);
                     amrex::Abort();
                 ))
                 AMREX_IF_ON_HOST((
                     std::stringstream ss;
                     ss << " (" << i << "," << j << "," << k << "," << n
                        << ") is out of bound ("
-                       << begin.x << ":" << end.x-1 << ","
-                       << begin.y << ":" << end.y-1 << ","
-                       << begin.z << ":" << end.z-1 << ","
+                       << begin.x << ":" << (begin.x+len.x)-1 << ","
+                       << begin.y << ":" << (begin.y+len.y)-1 << ","
+                       << begin.z << ":" << (begin.z+len.z)-1 << ","
                        << "0:" << ncomp-1 << ")";
                     amrex::Abort(ss.str());
                 ))
@@ -292,7 +286,7 @@ namespace amrex {
         [[nodiscard]] AMREX_GPU_HOST_DEVICE AMREX_FORCE_INLINE
         CellData<T> cellData (int i, int j, int k) const noexcept {
-            return CellData<T>{this->ptr(i,j,k), nstride, ncomp};
+            return CellData<T>{this->ptr(i,j,k), nstride(), ncomp};
         }
     };
 
@@ -300,7 +294,11 @@ namespace amrex {
     [[nodiscard]] AMREX_GPU_HOST_DEVICE
     Array4<Tto> ToArray4 (Array4<Tfrom> const& a_in) noexcept
     {
-        return Array4<Tto>((Tto*)(a_in.p), a_in.begin, a_in.end, a_in.ncomp);
+        return Array4<Tto>((Tto*)(a_in.p), a_in.begin,
+                           Dim3{a_in.begin.x + a_in.len.x,
+                                a_in.begin.y + a_in.len.y,
+                                a_in.begin.z + a_in.len.z},
+                           a_in.ncomp);
     }
 
     template <typename T>
@@ -314,14 +312,28 @@ namespace amrex {
     [[nodiscard]] AMREX_GPU_HOST_DEVICE AMREX_FORCE_INLINE
     Dim3 ubound (Array4<T> const& a) noexcept
     {
-        return Dim3{a.end.x-1,a.end.y-1,a.end.z-1};
+        return Dim3{a.begin.x+a.len.x-1,a.begin.y+a.len.y-1,a.begin.z+a.len.z-1};
+    }
+
+    template <typename T>
+    [[nodiscard]] AMREX_GPU_HOST_DEVICE AMREX_FORCE_INLINE
+    Dim3 begin (Array4<T> const& a) noexcept
+    {
+        return a.begin;
+    }
+
+    template <typename T>
+    [[nodiscard]] AMREX_GPU_HOST_DEVICE AMREX_FORCE_INLINE
+    Dim3 end (Array4<T> const& a) noexcept
+    {
+        return Dim3{a.begin.x+a.len.x,a.begin.y+a.len.y,a.begin.z+a.len.z};
     }
 
     template <typename T>
     [[nodiscard]] AMREX_GPU_HOST_DEVICE AMREX_FORCE_INLINE
     Dim3 length (Array4<T> const& a) noexcept
     {
-        return Dim3{a.end.x-a.begin.x,a.end.y-a.begin.y,a.end.z-a.begin.z};
+        return a.len;
     }
 
     template <typename T>
diff --git a/Src/Base/AMReX_BaseFab.H b/Src/Base/AMReX_BaseFab.H
index db9289894c..b6ec1b0990 100644
--- a/Src/Base/AMReX_BaseFab.H
+++ b/Src/Base/AMReX_BaseFab.H
@@ -2013,32 +2013,32 @@ template <class T>
 BaseFab<T>::BaseFab (Array4<T> const& a) noexcept
     : dptr(a.p),
       domain(IntVect(AMREX_D_DECL(a.begin.x,a.begin.y,a.begin.z)),
-             IntVect(AMREX_D_DECL(a.end.x-1,a.end.y-1,a.end.z-1))),
-      nvar(a.ncomp), truesize(a.ncomp*a.nstride)
+             IntVect(AMREX_D_DECL(a.begin.x+a.len.x-1,a.begin.y+a.len.y-1,a.begin.z+a.len.z-1))),
+      nvar(a.ncomp), truesize(a.ncomp*a.nstride())
 {}
 
 template <class T>
 BaseFab<T>::BaseFab (Array4<T> const& a, IndexType t) noexcept
     : dptr(a.p),
       domain(IntVect(AMREX_D_DECL(a.begin.x,a.begin.y,a.begin.z)),
-             IntVect(AMREX_D_DECL(a.end.x-1,a.end.y-1,a.end.z-1)), t),
-      nvar(a.ncomp), truesize(a.ncomp*a.nstride)
+             IntVect(AMREX_D_DECL(a.begin.x+a.len.x-1,a.begin.y+a.len.y-1,a.begin.z+a.len.z-1)), t),
+      nvar(a.ncomp), truesize(a.ncomp*a.nstride())
 {}
 
 template <class T>
 BaseFab<T>::BaseFab (Array4<T const> const& a) noexcept
     : dptr(const_cast<T*>(a.p)),
       domain(IntVect(AMREX_D_DECL(a.begin.x,a.begin.y,a.begin.z)),
-             IntVect(AMREX_D_DECL(a.end.x-1,a.end.y-1,a.end.z-1))),
-      nvar(a.ncomp), truesize(a.ncomp*a.nstride)
+             IntVect(AMREX_D_DECL(a.begin.x+a.len.x-1,a.begin.y+a.len.y-1,a.begin.z+a.len.z-1))),
+      nvar(a.ncomp), truesize(a.ncomp*a.nstride())
 {}
 
 template <class T>
 BaseFab<T>::BaseFab (Array4<T const> const& a, IndexType t) noexcept
     : dptr(const_cast<T*>(a.p)),
       domain(IntVect(AMREX_D_DECL(a.begin.x,a.begin.y,a.begin.z)),
-             IntVect(AMREX_D_DECL(a.end.x-1,a.end.y-1,a.end.z-1)), t),
-      nvar(a.ncomp), truesize(a.ncomp*a.nstride)
+             IntVect(AMREX_D_DECL(a.begin.x+a.len.x-1,a.begin.y+a.len.y-1,a.begin.z+a.len.z-1)), t),
+      nvar(a.ncomp), truesize(a.ncomp*a.nstride())
 {}
 
 template <class T>
diff --git a/Src/Base/AMReX_Box.H b/Src/Base/AMReX_Box.H
index 7d4cb30c43..cc7011ba73 100644
--- a/Src/Base/AMReX_Box.H
+++ b/Src/Base/AMReX_Box.H
@@ -95,7 +95,7 @@ public:
     AMREX_GPU_HOST_DEVICE
     explicit BoxND (Array4<T> const& a) noexcept
         : smallend(a.begin),
-          bigend(IntVectND<dim>(a.end) - 1)
+          bigend(IntVectND<dim>(ubound(a)))
     {}
 
     // dtor, copy-ctor, copy-op=, move-ctor, and move-op= are compiler generated.
diff --git a/Src/Base/AMReX_CudaGraph.H b/Src/Base/AMReX_CudaGraph.H
index 797376e100..ae289493fb 100644
--- a/Src/Base/AMReX_CudaGraph.H
+++ b/Src/Base/AMReX_CudaGraph.H
@@ -39,16 +39,16 @@ makeCopyMemory (Array4<T> const& src, Array4<T> const& dst, int scomp, int ncomp
     mem.src = (void*)(src.p);
     mem.dst = (void*)(dst.p);
     mem.src_begin = src.begin;
-    mem.src_end = src.end;
+    mem.src_end = end(src);
     mem.dst_begin = dst.begin;
-    mem.dst_end = dst.end;
+    mem.dst_end = end(dst);
     mem.scomp = scomp;
     mem.ncomp = ncomp;
     return mem;
 #else
-    return CopyMemory{ (void*)(src.p), (void*)(dst.p), src.begin, src.end, dst.begin, dst.end, scomp, ncomp };
+    return CopyMemory{ (void*)(src.p), (void*)(dst.p), src.begin, end(src), dst.begin, end(dst), scomp, ncomp };
 #endif
 }
diff --git a/Src/Base/AMReX_FabArrayBase.H b/Src/Base/AMReX_FabArrayBase.H
index 44d0d9c269..b1bf768fe5 100644
--- a/Src/Base/AMReX_FabArrayBase.H
+++ b/Src/Base/AMReX_FabArrayBase.H
@@ -650,10 +650,12 @@ public:
     //! For ParallelFor(FabArray)
     struct ParForInfo
     {
-        ParForInfo (const FabArrayBase& fa, const IntVect& nghost, int nthreads);
+        ParForInfo (const FabArrayBase& fa, const IntVect& nghost);
         ~ParForInfo ();
 
-        std::pair<int*,int*> const& getBlocks () const { return m_nblocks_x; }
+        int getNBlocksPerBox (int nthreads) const {
+            return int((m_ncellsmax+nthreads-1)/nthreads);
+        }
         BoxIndexer const* getBoxes () const { return m_boxes; }
 
         ParForInfo () = delete;
@@ -664,14 +666,12 @@ public:
 
         BATransformer m_bat;
         IntVect m_ng;
-        int m_nthreads;
-        std::pair<int*,int*> m_nblocks_x;
+        Long m_ncellsmax = 0;
         BoxIndexer* m_boxes = nullptr;
         char* m_hp = nullptr;
-        char* m_dp = nullptr;
     };
 
-    ParForInfo const& getParForInfo (const IntVect& nghost, int nthreads) const;
+    ParForInfo const& getParForInfo (const IntVect& nghost) const;
 
     static std::multimap<BDKey,ParForInfo*> m_TheParForCache;
diff --git a/Src/Base/AMReX_FabArrayBase.cpp b/Src/Base/AMReX_FabArrayBase.cpp
index 5120e23be3..d841dc247c 100644
--- a/Src/Base/AMReX_FabArrayBase.cpp
+++ b/Src/Base/AMReX_FabArrayBase.cpp
@@ -2635,15 +2635,12 @@ FabArrayBase::isFusingCandidate () const noexcept // NOLINT(readability-convert-
 
 #ifdef AMREX_USE_GPU
 
-FabArrayBase::ParForInfo::ParForInfo (const FabArrayBase& fa, const IntVect& nghost, int nthreads)
+FabArrayBase::ParForInfo::ParForInfo (const FabArrayBase& fa, const IntVect& nghost)
     : m_bat(fa.boxArray().transformer()),
-      m_ng(nghost),
-      m_nthreads(nthreads),
-      m_nblocks_x({nullptr,nullptr})
+      m_ng(nghost)
 {
     Vector<Box> boxes;
-    Vector<Long> ncells;
-    ncells.reserve(fa.indexArray.size());
+    m_ncellsmax = 0;
     for (int K : fa.indexArray) {
         Long N = 0;
         Box b = fa.box(K);
@@ -2652,31 +2649,30 @@ FabArrayBase::ParForInfo::ParForInfo (const FabArrayBase& fa, const IntVect& ngh
             N = b.numPts();
         }
         boxes.push_back(b);
-        ncells.push_back(N);
+        m_ncellsmax = std::max(m_ncellsmax, N);
     }
-    detail::build_par_for_nblocks(m_hp, m_dp, m_nblocks_x, m_boxes, boxes, ncells, nthreads);
+    detail::build_par_for_boxes(m_hp, m_boxes, boxes);
 }
 
 FabArrayBase::ParForInfo::~ParForInfo ()
 {
-    detail::destroy_par_for_nblocks(m_hp, m_dp);
+    detail::destroy_par_for_boxes(m_hp, (char*)m_boxes);
 }
 
 FabArrayBase::ParForInfo const&
-FabArrayBase::getParForInfo (const IntVect& nghost, int nthreads) const
+FabArrayBase::getParForInfo (const IntVect& nghost) const
 {
     AMREX_ASSERT(getBDKey() == m_bdkey);
     auto er_it = m_TheParForCache.equal_range(m_bdkey);
     for (auto it = er_it.first; it != er_it.second; ++it) {
         if (it->second->m_bat == boxArray().transformer() &&
-            it->second->m_ng == nghost &&
-            it->second->m_nthreads == nthreads)
+            it->second->m_ng == nghost)
         {
             return *(it->second);
         }
     }
 
-    ParForInfo* new_pfi = new ParForInfo(*this, nghost, nthreads);
+    ParForInfo* new_pfi = new ParForInfo(*this, nghost);
     m_TheParForCache.insert(er_it.second,
                             std::multimap<BDKey,ParForInfo*>::value_type(m_bdkey,new_pfi));
     return *new_pfi;
diff --git a/Src/Base/AMReX_FilCC_1D_C.H b/Src/Base/AMReX_FilCC_1D_C.H
index 4021e4b52a..f57e561e4c 100644
--- a/Src/Base/AMReX_FilCC_1D_C.H
+++ b/Src/Base/AMReX_FilCC_1D_C.H
@@ -23,7 +23,7 @@ struct FilccCell
     const int ilo = domain_lo[0];
     const int ihi = domain_hi[0];
     const int is = amrex::max(q.begin.x,ilo);
-    const int ie = amrex::min(q.end.x-1,ihi);
+    const int ie = amrex::min(q.begin.x+q.len.x-1,ihi);
 
     for (int n = dcomp; n < numcomp+dcomp; ++n)
     {
diff --git a/Src/Base/AMReX_FilCC_2D_C.H b/Src/Base/AMReX_FilCC_2D_C.H
index 80b9292972..735bcbc538 100644
--- a/Src/Base/AMReX_FilCC_2D_C.H
+++ b/Src/Base/AMReX_FilCC_2D_C.H
@@ -27,8 +27,8 @@ struct FilccCell
     const int jhi = domain_hi[1];
     const int is = amrex::max(q.begin.x,ilo);
     const int js = amrex::max(q.begin.y,jlo);
-    const int ie = amrex::min(q.end.x-1,ihi);
-    const int je = amrex::min(q.end.y-1,jhi);
+    const int ie = amrex::min(q.begin.x+q.len.x-1,ihi);
+    const int je = amrex::min(q.begin.y+q.len.y-1,jhi);
 
     for (int n = dcomp; n < numcomp+dcomp; ++n)
     {
diff --git a/Src/Base/AMReX_FilCC_3D_C.H b/Src/Base/AMReX_FilCC_3D_C.H
index f311045c0a..e1976ed3fb 100644
--- a/Src/Base/AMReX_FilCC_3D_C.H
+++ b/Src/Base/AMReX_FilCC_3D_C.H
@@ -48,7 +48,7 @@ struct FilccCell
                 q(i,j,k,n) = q(ilo,j,k,n);
             }
             // i == ilo-1
-            else if (ilo+2 <= amrex::min(q.end.x-1,ihi))
+            else if (ilo+2 <= amrex::min(q.begin.x+q.len.x-1,ihi))
             {
                 q(i,j,k,n) = Real(0.125)*(Real(15.)*q(i+1,j,k,n) - Real(10.)*q(i+2,j,k,n) + Real(3.)*q(i+3,j,k,n));
             }
@@ -135,7 +135,7 @@ struct FilccCell
                 q(i,j,k,n) = q(i,jlo,k,n);
             }
             // j == jlo-1
-            else if (jlo+2 <= amrex::min(q.end.y-1,jhi))
+            else if (jlo+2 <= amrex::min(q.begin.y+q.len.y-1,jhi))
             {
                 q(i,j,k,n) = Real(0.125)*(Real(15.)*q(i,j+1,k,n) - Real(10.)*q(i,j+2,k,n) + Real(3.)*q(i,j+3,k,n));
             }
@@ -222,7 +222,7 @@ struct FilccCell
                 q(i,j,k,n) = q(i,j,klo,n);
             }
             // k == klo-1
-            else if (klo+2 <= amrex::min(q.end.z-1,khi))
+            else if (klo+2 <= amrex::min(q.begin.z+q.len.z-1,khi))
             {
                 q(i,j,k,n) = Real(0.125)*(Real(15.)*q(i,j,k+1,n) - Real(10.)*q(i,j,k+2,n) + Real(3.)*q(i,j,k+3,n));
             }
diff --git a/Src/Base/AMReX_MFParallelFor.H b/Src/Base/AMReX_MFParallelFor.H
index 25f9706d3b..4f870b5b22 100644
--- a/Src/Base/AMReX_MFParallelFor.H
+++ b/Src/Base/AMReX_MFParallelFor.H
@@ -68,7 +68,7 @@ std::enable_if_t<IsFabArray<MF>::value>
 ParallelFor (MF const& mf, F&& f)
 {
 #ifdef AMREX_USE_GPU
-    detail::ParallelFor(mf, IntVect(0), FabArrayBase::mfiter_tile_size, false, std::forward<F>(f));
+    detail::ParallelFor(mf, IntVect(0), 1, FabArrayBase::mfiter_tile_size, false, std::forward<F>(f));
 #else
     detail::ParallelFor(mf, IntVect(0), FabArrayBase::mfiter_tile_size, false, std::forward<F>(f));
 #endif
@@ -119,7 +119,7 @@ std::enable_if_t<IsFabArray<MF>::value>
 ParallelFor (MF const& mf, IntVect const& ng, F&& f)
 {
 #ifdef AMREX_USE_GPU
-    detail::ParallelFor(mf, ng, FabArrayBase::mfiter_tile_size, false, std::forward<F>(f));
+    detail::ParallelFor(mf, ng, 1, FabArrayBase::mfiter_tile_size, false, std::forward<F>(f));
 #else
     detail::ParallelFor(mf, ng, FabArrayBase::mfiter_tile_size, false, std::forward<F>(f));
 #endif
@@ -225,7 +225,7 @@ std::enable_if_t<IsFabArray<MF>::value>
 ParallelFor (MF const& mf, TileSize const& ts, F&& f)
 {
 #ifdef AMREX_USE_GPU
-    detail::ParallelFor(mf, IntVect(0), ts.tile_size, false, std::forward<F>(f));
+    detail::ParallelFor(mf, IntVect(0), 1, ts.tile_size, false, std::forward<F>(f));
 #else
     detail::ParallelFor(mf, IntVect(0), ts.tile_size, false, std::forward<F>(f));
 #endif
@@ -280,7 +280,7 @@ std::enable_if_t<IsFabArray<MF>::value>
 ParallelFor (MF const& mf, IntVect const& ng, TileSize const& ts, F&& f)
 {
 #ifdef AMREX_USE_GPU
-    detail::ParallelFor(mf, ng, ts.tile_size, false, std::forward<F>(f));
+    detail::ParallelFor(mf, ng, 1, ts.tile_size, false, std::forward<F>(f));
 #else
     detail::ParallelFor(mf, ng, ts.tile_size, false, std::forward<F>(f));
 #endif
@@ -423,7 +423,7 @@ ParallelFor (MF const& mf, IntVect const& ng, TileSize const& ts,
              DynamicTiling dt, F&& f)
 {
 #ifdef AMREX_USE_GPU
-    detail::ParallelFor(mf, ng, ts.tile_size, dt.dynamic, std::forward<F>(f));
+    detail::ParallelFor(mf, ng, 1, ts.tile_size, dt.dynamic, std::forward<F>(f));
 #else
     detail::ParallelFor(mf, ng, ts.tile_size, dt.dynamic, std::forward<F>(f));
 #endif
diff --git a/Src/Base/AMReX_MFParallelForG.H b/Src/Base/AMReX_MFParallelForG.H
index 066e46f3b8..e67aba509b 100644
--- a/Src/Base/AMReX_MFParallelForG.H
+++ b/Src/Base/AMReX_MFParallelForG.H
@@ -12,38 +12,24 @@ namespace amrex {
 namespace detail {
 
 inline
-void build_par_for_nblocks (char*& a_hp, char*& a_dp, std::pair<int*,int*>& blocks_x, BoxIndexer*& pboxes,
-                            Vector<Box> const& boxes, Vector<Long> const& ncells, int nthreads)
+void build_par_for_boxes (char*& hp, BoxIndexer*& pboxes, Vector<Box> const& boxes)
 {
-    if (!ncells.empty()) {
-        const int nboxes = ncells.size();
-        const std::size_t nbytes_boxes = amrex::aligned_size(alignof(BoxIndexer), (nboxes+1) * sizeof(int));
-        const std::size_t nbytes = nbytes_boxes + nboxes*sizeof(BoxIndexer);
-        a_hp = (char*)The_Pinned_Arena()->alloc(nbytes);
-        int* hp_blks = (int*)a_hp;
-        auto* hp_boxes = (BoxIndexer*)(a_hp + nbytes_boxes);
-        hp_blks[0] = 0;
-        bool same_size = true;
-        for (int i = 0; i < nboxes; ++i) {
-            Long nblocks = (ncells[i] + nthreads-1) / nthreads;
-            AMREX_ASSERT((hp_blks[i]+nblocks) <= Long(std::numeric_limits<int>::max()));
-            hp_blks[i+1] = hp_blks[i] + static_cast<int>(nblocks);
-            same_size = same_size && (ncells[i] == ncells[0]);
-
-            new (hp_boxes+i) BoxIndexer(boxes[i]);
-        }
-
-        a_dp = (char*) The_Arena()->alloc(nbytes);
-        Gpu::htod_memcpy_async(a_dp, a_hp, nbytes);
-
-        blocks_x.first = hp_blks;
-        blocks_x.second = (same_size) ? nullptr : (int*)a_dp;
-        pboxes = (BoxIndexer*)(a_dp + nbytes_boxes);
+    if (boxes.empty()) { return; }
+    const int nboxes = boxes.size();
+    const std::size_t nbytes = nboxes*sizeof(BoxIndexer);
+    hp = (char*)The_Pinned_Arena()->alloc(nbytes);
+    auto* hp_boxes = (BoxIndexer*)hp;
+    for (int i = 0; i < nboxes; ++i) {
+        new (hp_boxes+i) BoxIndexer(boxes[i]);
     }
+
+    auto dp = (char*) The_Arena()->alloc(nbytes);
+    Gpu::htod_memcpy_async(dp, hp, nbytes);
+    pboxes = (BoxIndexer*)dp;
 }
 
 inline
-void destroy_par_for_nblocks (char* hp, char* dp)
+void destroy_par_for_boxes (char* hp, char* dp)
 {
     The_Pinned_Arena()->free(hp);
     The_Arena()->free(dp);
@@ -63,10 +49,12 @@ namespace parfor_mf_detail {
 
     template <typename F>
     AMREX_GPU_DEVICE
-    auto call_f (F const& f, int b, int i, int j, int k, int n) noexcept
+    auto call_f (F const& f, int b, int i, int j, int k, int ncomp) noexcept
         -> decltype(f(0,0,0,0,0))
     {
-        f(b,i,j,k,n);
+        for (int n = 0; n < ncomp; ++n) {
+            f(b,i,j,k,n);
+        }
     }
 }
 
@@ -81,16 +69,15 @@ ParallelFor (MF const& mf, IntVect const& nghost, int ncomp, IntVect const&, boo
         return;
     } else if (nboxes == 1) {
         Box const& b = amrex::grow(mf.box(index_array[0]), nghost);
-        amrex::ParallelFor(b, ncomp, [=] AMREX_GPU_DEVICE (int i, int j, int k, int n) noexcept
+        amrex::ParallelFor(b, [=] AMREX_GPU_DEVICE (int i, int j, int k) noexcept
        {
-            parfor_mf_detail::call_f(f, 0, i, j, k, n);
+            parfor_mf_detail::call_f(f, 0, i, j, k, ncomp);
         });
     } else {
-        auto const& parforinfo = mf.getParForInfo(nghost,MT);
-        auto par_for_blocks = parforinfo.getBlocks();
-        const int nblocks = par_for_blocks.first[nboxes];
-        const int block_0_size = par_for_blocks.first[1];
-        const int* dp_nblocks = par_for_blocks.second;
+        auto const& parforinfo = mf.getParForInfo(nghost);
+        auto nblocks_per_box = parforinfo.getNBlocksPerBox(MT);
+        AMREX_ASSERT(Long(nblocks_per_box)*Long(nboxes) < Long(std::numeric_limits<int>::max()));
+        const int nblocks = nblocks_per_box * nboxes;
         const BoxIndexer* dp_boxes = parforinfo.getBoxes();
 
 #if defined(AMREX_USE_CUDA) || defined(AMREX_USE_HIP)
@@ -99,39 +86,23 @@ ParallelFor (MF const& mf, IntVect const& nghost, int ncomp, IntVect const&, boo
         amrex::launch_global<MT>
             <<<nblocks, MT, 0, Gpu::gpuStream()>>>
             ([=] AMREX_GPU_DEVICE () noexcept
             {
-                int ibox;
-                std::uint64_t icell;
-                if (dp_nblocks) {
-                    ibox = amrex::bisect(dp_nblocks, 0, nboxes, static_cast<int>(blockIdx.x));
-                    icell = std::uint64_t(blockIdx.x-dp_nblocks[ibox])*MT + threadIdx.x;
-                } else {
-                    ibox = blockIdx.x / block_0_size;
-                    icell = std::uint64_t(blockIdx.x-ibox*block_0_size)*MT + threadIdx.x;
-                }
+                int ibox = int(blockIdx.x) / nblocks_per_box;
+                auto icell = std::uint64_t(blockIdx.x-ibox*nblocks_per_box)*MT + threadIdx.x;
 #elif defined(AMREX_USE_SYCL)
         amrex::launch<MT>(nblocks, Gpu::gpuStream(),
             [=] AMREX_GPU_DEVICE (sycl::nd_item<1> const& item) noexcept
             {
-                int ibox;
-                std::uint64_t icell;
                 int blockIdxx = item.get_group_linear_id();
                 int threadIdxx = item.get_local_linear_id();
-                if (dp_nblocks) {
-                    ibox = amrex::bisect(dp_nblocks, 0, nboxes, static_cast<int>(blockIdxx));
-                    icell = std::uint64_t(blockIdxx-dp_nblocks[ibox])*MT + threadIdxx;
-                } else {
-                    ibox = blockIdxx / block_0_size;
-                    icell = std::uint64_t(blockIdxx-ibox*block_0_size)*MT + threadIdxx;
-                }
+                int ibox = int(blockIdxx) / nblocks_per_box;
+                auto icell = std::uint64_t(blockIdxx-ibox*nblocks_per_box)*MT + threadIdxx;
 #endif
                 BoxIndexer const& indexer = dp_boxes[ibox];
                 if (icell < indexer.numPts()) {
                     auto [i, j, k] = indexer(icell);
-                    for (int n = 0; n < ncomp; ++n) {
-                        parfor_mf_detail::call_f(f, ibox, i, j, k, n);
-                    }
+                    parfor_mf_detail::call_f(f, ibox, i, j, k, ncomp);
                 }
             });
     }
@@ -142,14 +113,24 @@ template <typename MF, typename F>
 std::enable_if_t<IsFabArray<MF>::value>
 ParallelFor (MF const& mf, IntVect const& nghost, int ncomp, IntVect const& ts, bool dynamic, F&& f)
 {
-    ParallelFor<AMREX_GPU_MAX_THREADS>(mf, nghost, ncomp, ts, dynamic, std::forward<F>(f));
+#ifdef AMREX_USE_CUDA
+    constexpr int MT = 128;
+#else
+    constexpr int MT = AMREX_GPU_MAX_THREADS;
+#endif
+    ParallelFor<MT>(mf, nghost, ncomp, ts, dynamic, std::forward<F>(f));
 }
 
 template <typename MF, typename F>
 std::enable_if_t<IsFabArray<MF>::value>
 ParallelFor (MF const& mf, IntVect const& nghost, IntVect const& ts, bool dynamic, F&& f)
 {
-    ParallelFor<AMREX_GPU_MAX_THREADS>(mf, nghost, 1, ts, dynamic, std::forward<F>(f));
+#ifdef AMREX_USE_CUDA
+    constexpr int MT = 128;
+#else
+    constexpr int MT = AMREX_GPU_MAX_THREADS;
+#endif
+    ParallelFor<MT>(mf, nghost, 1, ts, dynamic, std::forward<F>(f));
 }
 
 }
diff --git a/Src/Base/AMReX_Reduce.H b/Src/Base/AMReX_Reduce.H
index f6ed403a2b..dec0b46b06 100644
--- a/Src/Base/AMReX_Reduce.H
+++ b/Src/Base/AMReX_Reduce.H
@@ -12,6 +12,8 @@
 #include <algorithm>
 #include <functional>
 
+#define AMREX_GPU_MAX_THREADS_REDUCE 256
+
 namespace amrex {
 
 namespace Reduce::detail {
@@ -93,7 +95,7 @@ struct ReduceOpSum
         if (h.threadIdx() == 0) { d += r; }
     }
 #else
-    template <typename T>
+    template <int MT, typename T>
     AMREX_GPU_DEVICE AMREX_FORCE_INLINE
     void parallel_update (T& d, T const& s) const noexcept {
         T r = Gpu::blockReduceSum(s);
@@ -121,7 +123,7 @@ struct ReduceOpMin
         if (h.threadIdx() == 0) { d = amrex::min(d,r); }
     }
 #else
-    template <typename T>
+    template <int MT, typename T>
     AMREX_GPU_DEVICE AMREX_FORCE_INLINE
     void parallel_update (T& d, T const& s) const noexcept {
         T r = Gpu::blockReduceMin(s);
@@ -154,7 +156,7 @@ struct ReduceOpMax
         if (h.threadIdx() == 0) { d = amrex::max(d,r); }
     }
 #else
-    template <typename T>
+    template <int MT, typename T>
     AMREX_GPU_DEVICE AMREX_FORCE_INLINE
     void parallel_update (T& d, T const& s) const noexcept {
         T r = Gpu::blockReduceMax(s);
@@ -188,7 +190,7 @@ struct ReduceOpLogicalAnd
         if (h.threadIdx() == 0) { d = d && r; }
     }
 #else
-    template <typename T>
+    template <int MT, typename T>
     AMREX_GPU_DEVICE AMREX_FORCE_INLINE
     std::enable_if_t<std::is_integral<T>::value>
     parallel_update (T& d, T s) const noexcept {
@@ -220,7 +222,7 @@ struct ReduceOpLogicalOr
         if (h.threadIdx() == 0) { d = d || r; }
     }
 #else
-    template <typename T>
+    template <int MT, typename T>
     AMREX_GPU_DEVICE AMREX_FORCE_INLINE
     std::enable_if_t<std::is_integral<T>::value>
     parallel_update (T& d, T s) const noexcept {
@@ -371,11 +373,10 @@ public:
         using ReduceTuple = typename D::Type;
         const int nboxes = mf.local_size();
         if (nboxes > 0) {
-            auto const& parforinfo = mf.getParForInfo(nghost,AMREX_GPU_MAX_THREADS);
-            auto par_for_blocks = parforinfo.getBlocks();
-            const int nblocks = par_for_blocks.first[nboxes];
-            const int block_0_size = par_for_blocks.first[1];
-            const int* dp_nblocks = par_for_blocks.second;
+            auto const& parforinfo = mf.getParForInfo(nghost);
+            auto nblocks_per_box = parforinfo.getNBlocksPerBox(AMREX_GPU_MAX_THREADS_REDUCE);
+            AMREX_ASSERT(Long(nblocks_per_box)*Long(nboxes) < Long(std::numeric_limits<int>::max()));
+            const int nblocks = nblocks_per_box * nboxes;
             const BoxIndexer* dp_boxes = parforinfo.getBoxes();
 
             auto const& stream = Gpu::gpuStream();
@@ -388,14 +389,14 @@ public:
 #ifdef AMREX_USE_SYCL
             // device reduce needs local(i.e., shared) memory
             constexpr std::size_t shared_mem_bytes = sizeof(unsigned long long)*Gpu::Device::warp_size;
-            amrex::launch<AMREX_GPU_MAX_THREADS>(nblocks_ec, shared_mem_bytes, stream,
+            amrex::launch<AMREX_GPU_MAX_THREADS_REDUCE>(nblocks_ec, shared_mem_bytes, stream,
             [=] AMREX_GPU_DEVICE (Gpu::Handler const& gh) noexcept
             {
                 Dim1 blockIdx {gh.blockIdx()};
                 Dim1 threadIdx{gh.threadIdx()};
 #else
-            amrex::launch_global<AMREX_GPU_MAX_THREADS>
-                <<<nblocks_ec, AMREX_GPU_MAX_THREADS, 0, stream>>>
+            amrex::launch_global<AMREX_GPU_MAX_THREADS_REDUCE>
+                <<<nblocks_ec, AMREX_GPU_MAX_THREADS_REDUCE, 0, stream>>>
             ([=] AMREX_GPU_DEVICE () noexcept
             {
 #endif
@@ -406,15 +407,8 @@ public:
                     dst = r;
                 }
                 for (int iblock = blockIdx.x; iblock < nblocks; iblock += nblocks_ec) {
-                    int ibox;
-                    std::uint64_t icell;
-                    if (dp_nblocks) {
-                        ibox = amrex::bisect(dp_nblocks, 0, nboxes, iblock);
-                        icell = std::uint64_t(iblock-dp_nblocks[ibox])*AMREX_GPU_MAX_THREADS + threadIdx.x;
-                    } else {
-                        ibox = iblock / block_0_size;
-                        icell = std::uint64_t(iblock-ibox*block_0_size)*AMREX_GPU_MAX_THREADS + threadIdx.x;
-                    }
+                    int ibox = iblock / nblocks_per_box;
+                    auto icell = std::uint64_t(iblock-ibox*nblocks_per_box)*AMREX_GPU_MAX_THREADS_REDUCE + threadIdx.x;
 
                     BoxIndexer const& indexer = dp_boxes[ibox];
                     if (icell < indexer.numPts()) {
@@ -504,21 +498,21 @@ public:
         const auto lenx = len.x;
         IndexType ixtype = box.ixType();
         constexpr int nitems_per_thread = 4;
-        int nblocks_ec = (ncells + nitems_per_thread*AMREX_GPU_MAX_THREADS-1)
-            / (nitems_per_thread*AMREX_GPU_MAX_THREADS);
+        int nblocks_ec = (ncells + nitems_per_thread*AMREX_GPU_MAX_THREADS_REDUCE-1)
+            / (nitems_per_thread*AMREX_GPU_MAX_THREADS_REDUCE);
         nblocks_ec = std::min(nblocks_ec, reduce_data.maxBlocks());
         reduce_data.updateMaxStreamIndex(stream);
 
 #ifdef AMREX_USE_SYCL
         // device reduce needs local(i.e., shared) memory
         constexpr std::size_t shared_mem_bytes = sizeof(unsigned long long)*Gpu::Device::warp_size;
-        amrex::launch<AMREX_GPU_MAX_THREADS>(nblocks_ec, shared_mem_bytes, stream,
+        amrex::launch<AMREX_GPU_MAX_THREADS_REDUCE>(nblocks_ec, shared_mem_bytes, stream,
         [=] AMREX_GPU_DEVICE (Gpu::Handler const& gh) noexcept
        {
             Dim1 blockIdx {gh.blockIdx()};
             Dim1 threadIdx{gh.threadIdx()};
             Dim1 gridDim {gh.gridDim()};
 #else
-        amrex::launch<AMREX_GPU_MAX_THREADS>(nblocks_ec, 0, stream,
+        amrex::launch<AMREX_GPU_MAX_THREADS_REDUCE>(nblocks_ec, 0, stream,
         [=] AMREX_GPU_DEVICE () noexcept
         {
 #endif
@@ -528,7 +522,7 @@ public:
             if (threadIdx.x == 0 && static_cast<int>(blockIdx.x) >= nblocks) {
                 dst = r;
             }
-            for (int icell = AMREX_GPU_MAX_THREADS*blockIdx.x+threadIdx.x, stride = AMREX_GPU_MAX_THREADS*gridDim.x;
+            for (int icell = AMREX_GPU_MAX_THREADS_REDUCE*blockIdx.x+threadIdx.x, stride = AMREX_GPU_MAX_THREADS_REDUCE*gridDim.x;
                  icell < ncells; icell += stride) {
                 int k = icell / lenxy;
                 int j = (icell - k*lenxy) / lenx;
@@ -562,21 +556,21 @@ public:
         const auto lenxy = len.x*len.y;
         const auto lenx = len.x;
         constexpr int nitems_per_thread = 4;
-        int nblocks_ec = (ncells + nitems_per_thread*AMREX_GPU_MAX_THREADS-1)
-            / (nitems_per_thread*AMREX_GPU_MAX_THREADS);
+        int nblocks_ec = (ncells + nitems_per_thread*AMREX_GPU_MAX_THREADS_REDUCE-1)
+            / (nitems_per_thread*AMREX_GPU_MAX_THREADS_REDUCE);
         nblocks_ec = std::min(nblocks_ec, reduce_data.maxBlocks());
         reduce_data.updateMaxStreamIndex(stream);
 
 #ifdef AMREX_USE_SYCL
         // device reduce needs local(i.e., shared) memory
         constexpr std::size_t shared_mem_bytes = sizeof(unsigned long long)*Gpu::Device::warp_size;
-        amrex::launch<AMREX_GPU_MAX_THREADS>(nblocks_ec, shared_mem_bytes, stream,
+        amrex::launch<AMREX_GPU_MAX_THREADS_REDUCE>(nblocks_ec, shared_mem_bytes, stream,
         [=] AMREX_GPU_DEVICE (Gpu::Handler const& gh) noexcept
        {
             Dim1 blockIdx {gh.blockIdx()};
             Dim1 threadIdx{gh.threadIdx()};
             Dim1 gridDim {gh.gridDim()};
 #else
-        amrex::launch<AMREX_GPU_MAX_THREADS>(nblocks_ec, 0, stream,
+        amrex::launch<AMREX_GPU_MAX_THREADS_REDUCE>(nblocks_ec, 0, stream,
         [=] AMREX_GPU_DEVICE () noexcept
         {
 #endif
@@ -586,7 +580,7 @@ public:
             if (threadIdx.x == 0 && static_cast<int>(blockIdx.x) >= nblocks) {
                 dst = r;
             }
-            for (int icell = AMREX_GPU_MAX_THREADS*blockIdx.x+threadIdx.x, stride = AMREX_GPU_MAX_THREADS*gridDim.x;
+            for (int icell = AMREX_GPU_MAX_THREADS_REDUCE*blockIdx.x+threadIdx.x, stride = AMREX_GPU_MAX_THREADS_REDUCE*gridDim.x;
                  icell < ncells; icell += stride) {
                 int k = icell / lenxy;
                 int j = (icell - k*lenxy) / lenx;
@@ -618,21 +612,21 @@ public:
         auto dp = reduce_data.devicePtr(stream);
         int& nblocks = reduce_data.nBlocks(stream);
         constexpr int nitems_per_thread = 4;
-        int nblocks_ec = (n + nitems_per_thread*AMREX_GPU_MAX_THREADS-1)
-            / (nitems_per_thread*AMREX_GPU_MAX_THREADS);
+        int nblocks_ec = (n + nitems_per_thread*AMREX_GPU_MAX_THREADS_REDUCE-1)
+            / (nitems_per_thread*AMREX_GPU_MAX_THREADS_REDUCE);
         nblocks_ec = std::min(nblocks_ec, reduce_data.maxBlocks());
         reduce_data.updateMaxStreamIndex(stream);
 
 #ifdef AMREX_USE_SYCL
         // device reduce needs local(i.e., shared) memory
         constexpr std::size_t shared_mem_bytes = sizeof(unsigned long long)*Gpu::Device::warp_size;
-        amrex::launch<AMREX_GPU_MAX_THREADS>(nblocks_ec, shared_mem_bytes, stream,
+        amrex::launch<AMREX_GPU_MAX_THREADS_REDUCE>(nblocks_ec, shared_mem_bytes, stream,
         [=] AMREX_GPU_DEVICE (Gpu::Handler const& gh) noexcept
        {
             Dim1 blockIdx {gh.blockIdx()};
             Dim1 threadIdx{gh.threadIdx()};
             Dim1 gridDim {gh.gridDim()};
 #else
-        amrex::launch<AMREX_GPU_MAX_THREADS>(nblocks_ec, 0, stream,
+        amrex::launch<AMREX_GPU_MAX_THREADS_REDUCE>(nblocks_ec, 0, stream,
         [=] AMREX_GPU_DEVICE () noexcept
         {
 #endif
@@ -642,7 +636,7 @@ public:
             if (threadIdx.x == 0 && static_cast<int>(blockIdx.x) >= nblocks) {
                 dst = r;
             }
-            for (N i = AMREX_GPU_MAX_THREADS*blockIdx.x+threadIdx.x, stride = AMREX_GPU_MAX_THREADS*gridDim.x;
+            for (N i = AMREX_GPU_MAX_THREADS_REDUCE*blockIdx.x+threadIdx.x, stride = AMREX_GPU_MAX_THREADS_REDUCE*gridDim.x;
                  i < n; i += stride) {
                 auto pr = f(i);
                 Reduce::detail::for_each_local<0, ReduceTuple, Ps...>(r,pr);
@@ -697,7 +691,7 @@ public:
 #else
             auto presult = hp;
 #endif
-            amrex::launch<AMREX_GPU_MAX_THREADS>(1, shared_mem_bytes, stream,
+            amrex::launch<AMREX_GPU_MAX_THREADS_REDUCE>(1, shared_mem_bytes, stream,
             [=] AMREX_GPU_DEVICE (Gpu::Handler const& gh) noexcept
             {
                 ReduceTuple r;
@@ -717,7 +711,7 @@ public:
             Gpu::dtoh_memcpy_async(hp, dtmp.data(), sizeof(ReduceTuple));
 #endif
 #else
-            amrex::launch<AMREX_GPU_MAX_THREADS>(1, 0, stream,
+            amrex::launch<AMREX_GPU_MAX_THREADS_REDUCE>(1, 0, stream,
             [=] AMREX_GPU_DEVICE () noexcept
             {
                 ReduceTuple r;
@@ -725,7 +719,7 @@ public:
                 ReduceTuple dst = r;
                 for (int istream = 0, nstreams = nblocks.size(); istream < nstreams; ++istream) {
                     auto dp_stream = dp+istream*maxblocks;
-                    for (int i = AMREX_GPU_MAX_THREADS*blockIdx.x+threadIdx.x, stride = AMREX_GPU_MAX_THREADS*gridDim.x;
+                    for (int i = AMREX_GPU_MAX_THREADS_REDUCE*blockIdx.x+threadIdx.x, stride = AMREX_GPU_MAX_THREADS_REDUCE*gridDim.x;
                          i < nblocks[istream]; i += stride) {
                         Reduce::detail::for_each_local<0, ReduceTuple, Ps...>(r, dp_stream[i]);
                     }
@@ -859,7 +853,7 @@ bool AnyOf (N n, T const* v, P const& pred)
 #ifdef AMREX_USE_SYCL
     const int num_ints = std::max(Gpu::Device::warp_size, int(ec.numThreads.x)/Gpu::Device::warp_size) + 1;
     const std::size_t shared_mem_bytes = num_ints*sizeof(int);
-    amrex::launch<AMREX_GPU_MAX_THREADS>(ec.numBlocks.x, shared_mem_bytes, Gpu::gpuStream(),
+    amrex::launch<AMREX_GPU_MAX_THREADS_REDUCE>(ec.numBlocks.x, shared_mem_bytes, Gpu::gpuStream(),
    [=] AMREX_GPU_DEVICE (Gpu::Handler const& gh) noexcept {
        int* has_any = &(static_cast<int*>(gh.sharedMemory())[num_ints-1]);
        if (gh.threadIdx() == 0) { *has_any = *dp; }
@@ -868,7 +862,7 @@ bool AnyOf (N n, T const* v, P const& pred)
         if (!(*has_any))
         {
             int r = false;
-            for (N i = AMREX_GPU_MAX_THREADS*gh.blockIdx()+gh.threadIdx(), stride = AMREX_GPU_MAX_THREADS*gh.gridDim();
+            for (N i = AMREX_GPU_MAX_THREADS_REDUCE*gh.blockIdx()+gh.threadIdx(), stride = AMREX_GPU_MAX_THREADS_REDUCE*gh.gridDim();
                  i < n && !r; i += stride)
             {
                 r = pred(v[i]) ? 1 : 0;
@@ -880,7 +874,7 @@ bool AnyOf (N n, T const* v, P const& pred)
         }
     });
 #else
-    amrex::launch<AMREX_GPU_MAX_THREADS>(ec.numBlocks.x, 0, Gpu::gpuStream(),
+    amrex::launch<AMREX_GPU_MAX_THREADS_REDUCE>(ec.numBlocks.x, 0, Gpu::gpuStream(),
     [=] AMREX_GPU_DEVICE () noexcept {
         __shared__ int has_any;
         if (threadIdx.x == 0) { has_any = *dp; }
@@ -889,7 +883,7 @@ bool AnyOf (N n, T const* v, P const& pred)
         if (!has_any)
         {
             int r = false;
-            for (N i = AMREX_GPU_MAX_THREADS*blockIdx.x+threadIdx.x, stride = AMREX_GPU_MAX_THREADS*gridDim.x;
+            for (N i = AMREX_GPU_MAX_THREADS_REDUCE*blockIdx.x+threadIdx.x, stride = AMREX_GPU_MAX_THREADS_REDUCE*gridDim.x;
                  i < n && !r; i += stride)
             {
                 r = pred(v[i]) ? 1 : 0;
@@ -920,7 +914,7 @@ bool AnyOf (Box const& box, P const& pred)
 #ifdef AMREX_USE_SYCL
     const int num_ints = std::max(Gpu::Device::warp_size, int(ec.numThreads.x)/Gpu::Device::warp_size) + 1;
     const std::size_t shared_mem_bytes = num_ints*sizeof(int);
-    amrex::launch<AMREX_GPU_MAX_THREADS>(ec.numBlocks.x, shared_mem_bytes, Gpu::gpuStream(),
+    amrex::launch<AMREX_GPU_MAX_THREADS_REDUCE>(ec.numBlocks.x, shared_mem_bytes, Gpu::gpuStream(),
    [=] AMREX_GPU_DEVICE (Gpu::Handler const& gh) noexcept {
        int* has_any = &(static_cast<int*>(gh.sharedMemory())[num_ints-1]);
        if (gh.threadIdx() == 0) { *has_any = *dp; }
@@ -929,7 +923,7 @@ bool AnyOf (Box const& box, P const& pred)
         if (!(*has_any))
         {
             int r = false;
-            for (int icell = AMREX_GPU_MAX_THREADS*gh.blockIdx()+gh.threadIdx(), stride = AMREX_GPU_MAX_THREADS*gh.gridDim();
+            for (int icell = AMREX_GPU_MAX_THREADS_REDUCE*gh.blockIdx()+gh.threadIdx(), stride = AMREX_GPU_MAX_THREADS_REDUCE*gh.gridDim();
                  icell < ncells && !r; icell += stride) {
                 int k = icell / lenxy;
                 int j = (icell - k*lenxy) / lenx;
@@ -945,7 +939,7 @@ bool AnyOf (Box const& box, P const& pred)
         }
     });
 #else
-    AMREX_LAUNCH_KERNEL(AMREX_GPU_MAX_THREADS, ec.numBlocks, ec.numThreads, 0,
+    AMREX_LAUNCH_KERNEL(AMREX_GPU_MAX_THREADS_REDUCE, ec.numBlocks, ec.numThreads, 0,
                         Gpu::gpuStream(),
     [=] AMREX_GPU_DEVICE () noexcept {
         __shared__ int has_any;
@@ -955,7 +949,7 @@ bool AnyOf (Box const& box, P const& pred)
         if (!has_any)
         {
             int r = false;
-            for (int icell = AMREX_GPU_MAX_THREADS*blockIdx.x+threadIdx.x, stride = AMREX_GPU_MAX_THREADS*gridDim.x;
+            for (int icell = AMREX_GPU_MAX_THREADS_REDUCE*blockIdx.x+threadIdx.x, stride = AMREX_GPU_MAX_THREADS_REDUCE*gridDim.x;
                  icell < ncells && !r; icell += stride) {
                 int k = icell / lenxy;
                 int j = (icell - k*lenxy) / lenx;
diff --git a/Src/LinearSolvers/MLMG/AMReX_MLLinOp.H b/Src/LinearSolvers/MLMG/AMReX_MLLinOp.H
index 4ba916e372..d78bbca25b 100644
--- a/Src/LinearSolvers/MLMG/AMReX_MLLinOp.H
+++ b/Src/LinearSolvers/MLMG/AMReX_MLLinOp.H
@@ -712,11 +712,11 @@ protected:
     [[nodiscard]] Array4<T> compactify (Array4<T> const& a) const noexcept
     {
         if (info.hidden_direction == 0) {
-            return Array4<T>(a.dataPtr(), {a.begin.y,a.begin.z,0}, {a.end.y,a.end.z,1}, a.nComp());
+            return Array4<T>(a.dataPtr(), {a.begin.y,a.begin.z,0}, {(a.begin.y+a.len.y),(a.begin.z+a.len.z),1}, a.nComp());
         } else if (info.hidden_direction == 1) {
-            return Array4<T>(a.dataPtr(), {a.begin.x,a.begin.z,0}, {a.end.x,a.end.z,1}, a.nComp());
+            return Array4<T>(a.dataPtr(), {a.begin.x,a.begin.z,0}, {(a.begin.x+a.len.x),(a.begin.z+a.len.z),1}, a.nComp());
         } else if (info.hidden_direction == 2) {
-            return Array4<T>(a.dataPtr(), {a.begin.x,a.begin.y,0}, {a.end.x,a.end.y,1}, a.nComp());
+            return Array4<T>(a.dataPtr(), {a.begin.x,a.begin.y,0}, {(a.begin.x+a.len.x),(a.begin.y+a.len.y),1}, a.nComp());
         } else {
             return a;
         }
diff --git a/Src/Particle/AMReX_ParticleContainerI.H b/Src/Particle/AMReX_ParticleContainerI.H
index 7b200bedb7..e189753923 100644
--- a/Src/Particle/AMReX_ParticleContainerI.H
+++ b/Src/Particle/AMReX_ParticleContainerI.H
@@ -988,7 +988,7 @@ ParticleContainer_impl
[...]
diff --git a/Src/Particle/AMReX_Particle_mod_K.H b/Src/Particle/AMReX_Particle_mod_K.H
[...]
     const int hi_x = static_cast<int>(amrex::Math::floor(hx));
     for (int i = lo_x; i <= hi_x; ++i) {
-        if (i < rho.begin.x || i >= rho.end.x) { continue; }
+        if (i < rho.begin.x || i >= (rho.begin.x+rho.len.x)) { continue; }
         amrex::Real wx = amrex::min(hx - static_cast<amrex::Real>(i), amrex::Real(1.0)) - amrex::max(lx - static_cast<amrex::Real>(i), amrex::Real(0.0));
         amrex::Real weight = wx*factor;
         amrex::Gpu::Atomic::AddNoRet(&rho(i, 0, 0, 0), static_cast<amrex::Real>(weight*p.rdata(0)));
@@ -147,7 +147,7 @@ void amrex_deposit_particle_dx_cic (P const& p, int nc, amrex::Array4<amrex::Real> const& rho
[...]
-        if (i < rho.begin.x || i >= rho.end.x) { continue; }
+        if (i < rho.begin.x || i >= (rho.begin.x+rho.len.x)) { continue; }
         amrex::Real wx = amrex::min(hx - static_cast<amrex::Real>(i), amrex::Real(1.0)) - amrex::max(lx - static_cast<amrex::Real>(i), amrex::Real(0.0));
         amrex::Real weight = wx*factor;
         amrex::Gpu::Atomic::AddNoRet(&rho(i, 0, 0, comp), static_cast<amrex::Real>(weight*p.rdata(0)*p.rdata(comp)));
@@ -170,10 +170,10 @@ void amrex_deposit_particle_dx_cic (P const& p, int nc, amrex::Array4<amrex::Real> const& rho
[...]
     const int hi_y = static_cast<int>(amrex::Math::floor(hy));
     for (int j = lo_y; j <= hi_y; ++j) {
-        if (j < rho.begin.y || j >= rho.end.y) { continue; }
+        if (j < rho.begin.y || j >= (rho.begin.y+rho.len.y)) { continue; }
         amrex::Real wy = amrex::min(hy - static_cast<amrex::Real>(j), amrex::Real(1.0)) - amrex::max(ly - static_cast<amrex::Real>(j), amrex::Real(0.0));
         for (int i = lo_x; i <= hi_x; ++i) {
-            if (i < rho.begin.x || i >= rho.end.x) { continue; }
+            if (i < rho.begin.x || i >= (rho.begin.x+rho.len.x)) { continue; }
             amrex::Real wx = amrex::min(hx - static_cast<amrex::Real>(i), amrex::Real(1.0)) - amrex::max(lx - static_cast<amrex::Real>(i), amrex::Real(0.0));
             amrex::Real weight = wx*wy*factor;
             amrex::Gpu::Atomic::AddNoRet(&rho(i, j, 0, 0), static_cast<amrex::Real>(weight*p.rdata(0)));
@@ -182,10 +182,10 @@ void amrex_deposit_particle_dx_cic (P const& p, int nc, amrex::Array4<amrex::Real> const& rho
[...]
-        if (j < rho.begin.y || j >= rho.end.y) { continue; }
+        if (j < rho.begin.y || j >= (rho.begin.y+rho.len.y)) { continue; }
         amrex::Real wy = amrex::min(hy - static_cast<amrex::Real>(j), amrex::Real(1.0)) - amrex::max(ly - static_cast<amrex::Real>(j), amrex::Real(0.0));
         for (int i = lo_x; i <= hi_x; ++i) {
-            if (i < rho.begin.x || i >= rho.end.x) { continue; }
+            if (i < rho.begin.x || i >= (rho.begin.x+rho.len.x)) { continue; }
             amrex::Real wx = amrex::min(hx - static_cast<amrex::Real>(i), amrex::Real(1.0)) - amrex::max(lx - static_cast<amrex::Real>(i), amrex::Real(0.0));
             amrex::Real weight = wx*wy*factor;
             amrex::Gpu::Atomic::AddNoRet(&rho(i, j, 0, comp), static_cast<amrex::Real>(weight*p.rdata(0)*p.rdata(comp)));
@@ -213,13 +213,13 @@ void amrex_deposit_particle_dx_cic (P const& p, int nc, amrex::Array4<amrex::Real> const& rho
[...]
     const int hi_z = static_cast<int>(amrex::Math::floor(hz));
     for (int k = lo_z; k <= hi_z; ++k) {
-        if (k < rho.begin.z || k >= rho.end.z) { continue; }
+        if (k < rho.begin.z || k >= (rho.begin.z+rho.len.z)) { continue; }
         amrex::Real wz = amrex::min(hz - static_cast<amrex::Real>(k), amrex::Real(1.0)) - amrex::max(lz - static_cast<amrex::Real>(k), amrex::Real(0.0));
         for (int j = lo_y; j <= hi_y; ++j) {
-            if (j < rho.begin.y || j >= rho.end.y) { continue; }
+            if (j < rho.begin.y || j >= (rho.begin.y+rho.len.y)) { continue; }
             amrex::Real wy = amrex::min(hy - static_cast<amrex::Real>(j), amrex::Real(1.0)) - amrex::max(ly - static_cast<amrex::Real>(j), amrex::Real(0.0));
             for (int i = lo_x; i <= hi_x; ++i) {
-                if (i < rho.begin.x || i >= rho.end.x) { continue; }
+                if (i < rho.begin.x || i >= (rho.begin.x+rho.len.x)) { continue; }
                 amrex::Real wx = amrex::min(hx - static_cast<amrex::Real>(i), amrex::Real(1.0)) - amrex::max(lx - static_cast<amrex::Real>(i), amrex::Real(0.0));
                 amrex::Real weight = wx*wy*wz*factor;
                 amrex::Gpu::Atomic::AddNoRet(&rho(i, j, k, 0), static_cast<amrex::Real>(weight*p.rdata(0)));
@@ -229,13 +229,13 @@ void amrex_deposit_particle_dx_cic (P const& p, int nc, amrex::Array4<amrex::Real> const& rho
[...]
-        if (k < rho.begin.z || k >= rho.end.z) { continue; }
+        if (k < rho.begin.z || k >= (rho.begin.z+rho.len.z)) { continue; }
         amrex::Real wz = amrex::min(hz - static_cast<amrex::Real>(k), amrex::Real(1.0)) - amrex::max(lz - static_cast<amrex::Real>(k), amrex::Real(0.0));
         for (int j = lo_y; j <= hi_y; ++j) {
-            if (j < rho.begin.y || j >= rho.end.y) { continue; }
+            if (j < rho.begin.y || j >= (rho.begin.y+rho.len.y)) { continue; }
             amrex::Real wy = amrex::min(hy - static_cast<amrex::Real>(j), amrex::Real(1.0)) - amrex::max(ly - static_cast<amrex::Real>(j), amrex::Real(0.0));
             for (int i = lo_x; i <= hi_x; ++i) {
-                if (i < rho.begin.x || i >= rho.end.x) { continue; }
+                if (i < rho.begin.x || i >= (rho.begin.x+rho.len.x)) { continue; }
                 amrex::Real wx = amrex::min(hx - static_cast<amrex::Real>(i), amrex::Real(1.0)) - amrex::max(lx - static_cast<amrex::Real>(i), amrex::Real(0.0));
                 amrex::Real weight = wx*wy*wz*factor;
                 amrex::Gpu::Atomic::AddNoRet(&rho(i, j, k, comp), static_cast<amrex::Real>(weight*p.rdata(0)*p.rdata(comp)));
diff --git a/Tools/CMake/AMReXOptions.cmake b/Tools/CMake/AMReXOptions.cmake
index ef55a1053c..51063b7dc1 100644
--- a/Tools/CMake/AMReXOptions.cmake
+++ b/Tools/CMake/AMReXOptions.cmake
@@ -132,8 +132,11 @@ endif ()
 if (NOT AMReX_GPU_BACKEND STREQUAL NONE)
    message( STATUS "   AMReX_GPU_BACKEND = ${AMReX_GPU_BACKEND}")
 
-   # We might set different default for different GPUs in the future.
-   set(AMReX_GPU_MAX_THREADS_DEFAULT "256")
+   if (AMReX_GPU_BACKEND STREQUAL CUDA)
+      set(AMReX_GPU_MAX_THREADS_DEFAULT "128")
+   else ()
+      set(AMReX_GPU_MAX_THREADS_DEFAULT "256")
+   endif ()
    set(AMReX_GPU_MAX_THREADS ${AMReX_GPU_MAX_THREADS_DEFAULT} CACHE STRING
       "Maximum number of GPU threads per block" )
    message( STATUS "   AMReX_GPU_MAX_THREADS = ${AMReX_GPU_MAX_THREADS}")
diff --git a/Tools/GNUMake/Make.defs b/Tools/GNUMake/Make.defs
index 6c2aad9d83..18cd9634d6 100644
--- a/Tools/GNUMake/Make.defs
+++ b/Tools/GNUMake/Make.defs
@@ -269,8 +269,12 @@ else
 endif
 
 # Maximum number of GPU threads per block.
-CUDA_MAX_THREADS ?= 256
-GPU_MAX_THREADS ?= $(CUDA_MAX_THREADS)
+CUDA_MAX_THREADS ?= 128
+ifeq ($(USE_CUDA),TRUE)
+  GPU_MAX_THREADS ?= $(CUDA_MAX_THREADS)
+else
+  GPU_MAX_THREADS ?= 256
+endif
 
 ifeq ($(USE_CUDA),TRUE)
   # Set the default CUDA architecture version.
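
Illustration (not part of the patch above): the Array4 change replaces the stored upper corner `end` and the cached `jstride`/`kstride`/`nstride` members with a stored extent `len`, deriving the strides on demand; `end` is recovered as `begin + len` (the new `end()`/`ubound()` helpers and the `(begin.x+len.x)` rewrites). A minimal standalone C++ sketch of the indexing identity this relies on, using local stand-in types rather than the AMReX headers:

// Illustration only. Checks that the old stride-based offset and the new
// len-based nested offset address the same element. Dim3 is a local
// stand-in for amrex::Dim3.
#include <cassert>
#include <cstdint>

struct Dim3 { int x, y, z; };

int main ()
{
    const Dim3 begin{-2, -2, -2};   // lo corner, ghost cells included
    const Dim3 len{8, 6, 4};        // extents; the old code stored end = begin + len
    const std::int64_t jstride = len.x;          // old Array4::jstride
    const std::int64_t kstride = jstride*len.y;  // old Array4::kstride
    for (int k = begin.z; k < begin.z + len.z; ++k) {
        for (int j = begin.y; j < begin.y + len.y; ++j) {
            for (int i = begin.x; i < begin.x + len.x; ++i) {
                const std::int64_t old_off =
                    (i-begin.x) + (j-begin.y)*jstride + (k-begin.z)*kstride;
                const std::int64_t new_off =
                    (i-begin.x) + std::int64_t(len.x)*((j-begin.y)
                                + std::int64_t(len.y)*(k-begin.z));
                assert(old_off == new_off);
            }
        }
    }
    return 0;
}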