AMREX_GPU_MAX_THREADS: 256 -> 128

WeiqunZhang · WeiqunZhang · commit 23ed91b50459 · 2025-04-13T21:07:40.000-04:00
diff --git a/Docs/sphinx_documentation/source/GPU.rst b/Docs/sphinx_documentation/source/GPU.rst
@@ -229,9 +229,9 @@ Building with CMake
 
 To build AMReX with GPU support in CMake, add
 ``-DAMReX_GPU_BACKEND=CUDA|HIP|SYCL`` to the ``cmake`` invocation, for CUDA,
-HIP and SYCL, respectively. By default, AMReX uses 256 threads per GPU
+HIP and SYCL, respectively. By default, AMReX uses 128 threads per GPU
 block/group in most situations. This can be changed with
-``-DAMReX_GPU_MAX_THREADS=N``, where ``N`` is 128 for example.
+``-DAMReX_GPU_MAX_THREADS=N``, where ``N`` is, for example, 256.
 
 Enabling CUDA support
 ^^^^^^^^^^^^^^^^^^^^^
@@ -1166,7 +1166,7 @@ GPU block size
 
 By default, :cpp:`ParallelFor` launches ``AMREX_GPU_MAX_THREADS`` threads
 per GPU block, where ``AMREX_GPU_MAX_THREADS`` is a compile-time constant
-with a default value of 256.  The users can also explicitly specify the
+with a default value of 128.  Users can also explicitly specify the
 number of threads per block by :cpp:`ParallelFor<MY_BLOCK_SIZE>(...)`, where
 ``MY_BLOCK_SIZE`` is a multiple of the warp size (e.g., 128).  This allows
 the users to do performance tuning for individual kernels.
diff --git a/Src/AmrCore/AMReX_TagBox.cpp b/Src/AmrCore/AMReX_TagBox.cpp
@@ -447,8 +447,8 @@ TagBoxArray::local_collate_gpu (Gpu::PinnedVector<IntVect>& v) const
         const int ncells = fai.fabbox().numPts();
         const char* tags = (*this)[fai].dataPtr();
 #ifdef AMREX_USE_SYCL
-        amrex::launch(nblocks[li], block_size, sizeof(int)*Gpu::Device::warp_size,
-                      Gpu::Device::gpuStream(),
+        amrex::launch<block_size>(nblocks[li], sizeof(int)*Gpu::Device::warp_size,
+                                  Gpu::Device::gpuStream(),
         [=] AMREX_GPU_DEVICE (Gpu::Handler const& h) noexcept
         {
             int bid = h.item->get_group_linear_id();
@@ -467,7 +467,7 @@ TagBoxArray::local_collate_gpu (Gpu::PinnedVector<IntVect>& v) const
             }
         });
 #else
-        amrex::launch(nblocks[li], block_size, Gpu::Device::gpuStream(),
+        amrex::launch<block_size>(nblocks[li], Gpu::Device::gpuStream(),
         [=] AMREX_GPU_DEVICE () noexcept
         {
             int bid = blockIdx.x;
@@ -525,7 +525,7 @@ TagBoxArray::local_collate_gpu (Gpu::PinnedVector<IntVect>& v) const
             const int ncells = bx.numPts();
             const char* tags = (*this)[fai].dataPtr();
 #ifdef AMREX_USE_SYCL
-            amrex::launch(nblocks[li], block_size, sizeof(unsigned int), Gpu::Device::gpuStream(),
+            amrex::launch<block_size>(nblocks[li], sizeof(unsigned int), Gpu::Device::gpuStream(),
             [=] AMREX_GPU_DEVICE (Gpu::Handler const& h) noexcept
             {
                 int bid = h.item->get_group(0);
@@ -553,7 +553,7 @@ TagBoxArray::local_collate_gpu (Gpu::PinnedVector<IntVect>& v) const
                 }
             });
 #else
-            amrex::launch(nblocks[li], block_size, sizeof(unsigned int), Gpu::Device::gpuStream(),
+            amrex::launch<block_size>(nblocks[li], sizeof(unsigned int), Gpu::Device::gpuStream(),
             [=] AMREX_GPU_DEVICE () noexcept
             {
                 int bid = blockIdx.x;
diff --git a/Src/Base/AMReX_BaseFabUtility.H b/Src/Base/AMReX_BaseFabUtility.H
@@ -38,14 +38,14 @@ void fill (BaseFab<STRUCT>& aos_fab, F const& f)
     if (Gpu::inLaunchRegion()) {
         BoxIndexer indexer(box);
         const auto ntotcells = std::uint64_t(box.numPts());
-        int nthreads_per_block = (STRUCTSIZE <= 8) ? 256 : 128;
+        constexpr int nthreads_per_block = (STRUCTSIZE <= 8) ? 256 : 128;
         std::uint64_t nblocks_long = (ntotcells+nthreads_per_block-1)/nthreads_per_block;
         AMREX_ASSERT(nblocks_long <= std::uint64_t(std::numeric_limits<int>::max()));
         auto nblocks = int(nblocks_long);
         std::size_t shared_mem_bytes = nthreads_per_block * sizeof(STRUCT);
         T* p = (T*)aos_fab.dataPtr();
 #ifdef AMREX_USE_SYCL
-        amrex::launch(nblocks, nthreads_per_block, shared_mem_bytes, Gpu::gpuStream(),
+        amrex::launch<nthreads_per_block>(nblocks, shared_mem_bytes, Gpu::gpuStream(),
         [=] AMREX_GPU_DEVICE (Gpu::Handler const& handler) noexcept
         {
             auto const icell = std::uint64_t(handler.globalIdx());
@@ -66,7 +66,7 @@ void fill (BaseFab<STRUCT>& aos_fab, F const& f)
             }
         });
 #else
-        amrex::launch(nblocks, nthreads_per_block, shared_mem_bytes, Gpu::gpuStream(),
+        amrex::launch<nthreads_per_block>(nblocks, shared_mem_bytes, Gpu::gpuStream(),
         [=] AMREX_GPU_DEVICE () noexcept
         {
             std::uint64_t const icell = std::uint64_t(blockDim.x)*blockIdx.x+threadIdx.x;
diff --git a/Src/Base/AMReX_BlockMutex.cpp b/Src/Base/AMReX_BlockMutex.cpp
@@ -9,7 +9,7 @@ void BlockMutex::init_states (state_t* state, int N) noexcept {
     amrex::ignore_unused(state,N);
     amrex::Abort("xxxxx SYCL todo");
 #else
-    amrex::launch((N+255)/256, 256, Gpu::gpuStream(),
+    amrex::launch<256>((N+255)/256, Gpu::gpuStream(),
     [=] AMREX_GPU_DEVICE () noexcept
     {
         int i = threadIdx.x + blockIdx.x*blockDim.x;
diff --git a/Src/Base/AMReX_GpuContainers.H b/Src/Base/AMReX_GpuContainers.H
@@ -433,11 +433,11 @@ namespace amrex::Gpu {
                                          unsigned long long, unsigned int>;
             constexpr Long nU = sizeof(T) / sizeof(U);
             auto pu = reinterpret_cast<U*>(p);
-            int nthreads_per_block = (sizeof(T) <= 64) ? 256 : 128;
+            constexpr int nthreads_per_block = (sizeof(T) <= 64) ? 256 : 128;
             int nblocks = static_cast<int>((N+nthreads_per_block-1)/nthreads_per_block);
             std::size_t shared_mem_bytes = nthreads_per_block * sizeof(T);
 #ifdef AMREX_USE_SYCL
-            amrex::launch(nblocks, nthreads_per_block, shared_mem_bytes, Gpu::gpuStream(),
+            amrex::launch<nthreads_per_block>(nblocks, shared_mem_bytes, Gpu::gpuStream(),
             [=] AMREX_GPU_DEVICE (Gpu::Handler const& handler) noexcept
             {
                 Long i = handler.globalIdx();
@@ -458,7 +458,7 @@ namespace amrex::Gpu {
                 }
             });
 #else
-            amrex::launch(nblocks, nthreads_per_block, shared_mem_bytes, Gpu::gpuStream(),
+            amrex::launch<nthreads_per_block>(nblocks, shared_mem_bytes, Gpu::gpuStream(),
                           [=] AMREX_GPU_DEVICE () noexcept
             {
                 Long blockDimx = blockDim.x;
diff --git a/Src/Base/AMReX_GpuLaunch.H b/Src/Base/AMReX_GpuLaunch.H
@@ -34,9 +34,13 @@
 #ifdef AMREX_USE_CUDA
 #  define AMREX_LAUNCH_KERNEL(MT, blocks, threads, sharedMem, stream, ... ) \
         amrex::launch_global<MT><<<blocks, threads, sharedMem, stream>>>(__VA_ARGS__)
+#  define AMREX_LAUNCH_KERNEL_NOBOUND(blocks, threads, sharedMem, stream, ... ) \
+        amrex::launch_global    <<<blocks, threads, sharedMem, stream>>>(__VA_ARGS__)
 #elif defined(AMREX_USE_HIP)
 #  define AMREX_LAUNCH_KERNEL(MT, blocks, threads, sharedMem, stream, ... ) \
         hipLaunchKernelGGL(launch_global<MT>, blocks, threads, sharedMem, stream, __VA_ARGS__)
+#  define AMREX_LAUNCH_KERNEL_NOBOUND(blocks, threads, sharedMem, stream, ... ) \
+        hipLaunchKernelGGL(launch_global    , blocks, threads, sharedMem, stream, __VA_ARGS__)
 #endif
 
 
diff --git a/Src/Base/AMReX_GpuLaunchFunctsG.H b/Src/Base/AMReX_GpuLaunchFunctsG.H
@@ -735,9 +735,8 @@ template<typename L>
 void launch (int nblocks, int nthreads_per_block, std::size_t shared_mem_bytes,
              gpuStream_t stream, L const& f) noexcept
 {
-    AMREX_ASSERT(nthreads_per_block <= AMREX_GPU_MAX_THREADS);
-    AMREX_LAUNCH_KERNEL(AMREX_GPU_MAX_THREADS, nblocks, nthreads_per_block, shared_mem_bytes,
-                        stream, [=] AMREX_GPU_DEVICE () noexcept { f(); });
+    AMREX_LAUNCH_KERNEL_NOBOUND(nblocks, nthreads_per_block, shared_mem_bytes,
+                                stream, [=] AMREX_GPU_DEVICE () noexcept { f(); });
     AMREX_GPU_ERROR_CHECK();
 }
 
diff --git a/Src/Base/AMReX_MultiFabUtil.cpp b/Src/Base/AMReX_MultiFabUtil.cpp
@@ -846,10 +846,10 @@ namespace amrex
                 int nblocks = n2dblocks * b.length(direction);
 #ifdef AMREX_USE_SYCL
                 std::size_t shared_mem_byte = sizeof(Real)*Gpu::Device::warp_size;
-                amrex::launch(nblocks, AMREX_GPU_MAX_THREADS, shared_mem_byte, Gpu::gpuStream(),
+                amrex::launch<AMREX_GPU_MAX_THREADS>(nblocks, shared_mem_byte, Gpu::gpuStream(),
                               [=] AMREX_GPU_DEVICE (Gpu::Handler const& h) noexcept
 #else
-                amrex::launch(nblocks, AMREX_GPU_MAX_THREADS, Gpu::gpuStream(),
+                amrex::launch<AMREX_GPU_MAX_THREADS>(nblocks, Gpu::gpuStream(),
                               [=] AMREX_GPU_DEVICE () noexcept
 #endif
                 {
diff --git a/Src/Base/AMReX_Scan.H b/Src/Base/AMReX_Scan.H
@@ -209,7 +209,7 @@ T PrefixSum_mp (N n, FIN const& fin, FOUT const& fout, TYPE, RetSum a_ret_sum)
     T* blocksum_p = (T*)(dp + nbytes_blockresult);
     T* totalsum_p = (T*)(dp + nbytes_blockresult + nbytes_blocksum);
 
-    amrex::launch(nblocks, nthreads, sm, stream,
+    amrex::launch<nthreads>(nblocks, sm, stream,
     [=] AMREX_GPU_DEVICE (Gpu::Handler const& gh) noexcept
     {
         sycl::sub_group const& sg = gh.item->get_sub_group();
@@ -289,7 +289,7 @@ T PrefixSum_mp (N n, FIN const& fin, FOUT const& fout, TYPE, RetSum a_ret_sum)
         }
     });
 
-    amrex::launch(1, nthreads, sm, stream,
+    amrex::launch<nthreads>(1, sm, stream,
     [=] AMREX_GPU_DEVICE (Gpu::Handler const& gh) noexcept
     {
         sycl::sub_group const& sg = gh.item->get_sub_group();
@@ -355,7 +355,7 @@ T PrefixSum_mp (N n, FIN const& fin, FOUT const& fout, TYPE, RetSum a_ret_sum)
         }
     });
 
-    amrex::launch(nblocks, nthreads, 0, stream,
+    amrex::launch<nthreads>(nblocks, 0, stream,
     [=] AMREX_GPU_DEVICE (Gpu::Handler const& gh) noexcept
     {
         int threadIdxx = gh.item->get_local_id(0);
@@ -429,7 +429,7 @@ T PrefixSum (N n, FIN && fin, FOUT && fout, TYPE type, RetSum a_ret_sum = retSum
         }
     });
 
-    amrex::launch(nblocks, nthreads, sm, stream,
+    amrex::launch<nthreads>(nblocks, sm, stream,
     [=] AMREX_GPU_DEVICE (Gpu::Handler const& gh) noexcept
     {
         sycl::sub_group const& sg = gh.item->get_sub_group();
@@ -672,7 +672,7 @@ T PrefixSum (N n, FIN const& fin, FOUT const& fout, TYPE, RetSum a_ret_sum = ret
         (reinterpret_cast<OrderedBlockId::id_type*>(dp + nbytes_tile_state));
 
     // Init ScanTileState on device
-    amrex::launch((nblocks+nthreads-1)/nthreads, nthreads, 0, stream, [=] AMREX_GPU_DEVICE ()
+    amrex::launch<nthreads>((nblocks+nthreads-1)/nthreads, 0, stream, [=] AMREX_GPU_DEVICE ()
     {
         auto& scan_tile_state = const_cast<ScanTileState&>(tile_state);
         auto& scan_bid = const_cast<OrderedBlockId&>(ordered_block_id);
@@ -813,7 +813,7 @@ T PrefixSum (N n, FIN const& fin, FOUT const& fout, TYPE, RetSum a_ret_sum = ret
 
     if (nblocks > 1) {
         // Init ScanTileState on device
-        amrex::launch((nblocks+nthreads-1)/nthreads, nthreads, 0, stream, [=] AMREX_GPU_DEVICE ()
+        amrex::launch<nthreads>((nblocks+nthreads-1)/nthreads, 0, stream, [=] AMREX_GPU_DEVICE ()
         {
             const_cast<ScanTileState&>(tile_state).InitializeStatus(nblocks);
         });
@@ -957,7 +957,7 @@ T PrefixSum (N n, FIN const& fin, FOUT const& fout, TYPE, RetSum a_ret_sum = ret
         }
     });
 
-    amrex::launch(nblocks, nthreads, sm, stream,
+    amrex::launch<nthreads>(nblocks, sm, stream,
     [=] AMREX_GPU_DEVICE () noexcept
     {
         int lane = threadIdx.x % Gpu::Device::warp_size;
diff --git a/Src/LinearSolvers/MLMG/AMReX_MLEBTensorOp_bc.cpp b/Src/LinearSolvers/MLMG/AMReX_MLEBTensorOp_bc.cpp
@@ -85,7 +85,7 @@ MLEBTensorOp::applyBCTensor (int amrlev, int mglev, MultiFab& vel,
 
 #ifdef AMREX_USE_GPU
             if (Gpu::inLaunchRegion()) {
-                amrex::launch(12, 64, Gpu::gpuStream(),
+                amrex::launch<64>(12, Gpu::gpuStream(),
 #ifdef AMREX_USE_SYCL
                 [=] AMREX_GPU_DEVICE (sycl::nd_item<1> const& item)
                 {
diff --git a/Src/LinearSolvers/MLMG/AMReX_MLTensorOp.cpp b/Src/LinearSolvers/MLMG/AMReX_MLTensorOp.cpp
@@ -408,7 +408,7 @@ MLTensorOp::applyBCTensor (int amrlev, int mglev, MultiFab& vel, // NOLINT(reada
         // only edge vals used in 3D stencil
 #ifdef AMREX_USE_GPU
         if (Gpu::inLaunchRegion()) {
-            amrex::launch(12, 64, Gpu::gpuStream(),
+            amrex::launch<64>(12, Gpu::gpuStream(),
 #ifdef AMREX_USE_SYCL
             [=] AMREX_GPU_DEVICE (sycl::nd_item<1> const& item)
             {
diff --git a/Src/LinearSolvers/OpenBC/AMReX_OpenBC.cpp b/Src/LinearSolvers/OpenBC/AMReX_OpenBC.cpp
@@ -745,7 +745,7 @@ void OpenBCSolver::compute_potential (Gpu::DeviceVector<openbc::Moments> const&
                              lenxy,lenx);
         amrex::Abort("xxxxx SYCL todo: openbc compute_potential");
 #else
-        amrex::launch(b.numPts(), AMREX_GPU_MAX_THREADS, Gpu::gpuStream(),
+        amrex::launch<AMREX_GPU_MAX_THREADS>(b.numPts(), Gpu::gpuStream(),
         [=] AMREX_GPU_DEVICE () noexcept
         {
             int icell = blockIdx.x;
diff --git a/Tools/CMake/AMReXOptions.cmake b/Tools/CMake/AMReXOptions.cmake
@@ -133,7 +133,7 @@ if (NOT AMReX_GPU_BACKEND STREQUAL NONE)
    message( STATUS "   AMReX_GPU_BACKEND = ${AMReX_GPU_BACKEND}")
 
    # We might set different default for different GPUs in the future.
-   set(AMReX_GPU_MAX_THREADS_DEFAULT "256")
+   set(AMReX_GPU_MAX_THREADS_DEFAULT "128")
    set(AMReX_GPU_MAX_THREADS ${AMReX_GPU_MAX_THREADS_DEFAULT} CACHE STRING
        "Maximum number of GPU threads per block" )
    message( STATUS "   AMReX_GPU_MAX_THREADS = ${AMReX_GPU_MAX_THREADS}")
diff --git a/Tools/GNUMake/Make.defs b/Tools/GNUMake/Make.defs
@@ -269,7 +269,7 @@ else
 endif
 
 # Maximum number of GPU threads per block.
-CUDA_MAX_THREADS ?= 256
+CUDA_MAX_THREADS ?= 128
 GPU_MAX_THREADS ?= $(CUDA_MAX_THREADS)
 
 ifeq ($(USE_CUDA),TRUE)

Original file line number	Diff line number	Diff line change
`@@ -9,7 +9,7 @@ void BlockMutex::init_states (state_t* state, int N) noexcept {`
`9`	`9`	`amrex::ignore_unused(state,N);`
`10`	`10`	`amrex::Abort("xxxxx SYCL todo");`
`11`	`11`	`#else`
`12`		`- amrex::launch((N+255)/256, 256, Gpu::gpuStream(),`
	`12`	`+ amrex::launch<256>((N+255)/256, Gpu::gpuStream(),`
`13`	`13`	`[=] AMREX_GPU_DEVICE () noexcept`
`14`	`14`	`{`
`15`	`15`	`int i = threadIdx.x + blockIdx.x*blockDim.x;`
Original file line number	Diff line number	Diff line change
`@@ -735,9 +735,8 @@ template<typename L>`
`735`	`735`	`void launch (int nblocks, int nthreads_per_block, std::size_t shared_mem_bytes,`
`736`	`736`	`gpuStream_t stream, L const& f) noexcept`
`737`	`737`	`{`
`738`		`- AMREX_ASSERT(nthreads_per_block <= AMREX_GPU_MAX_THREADS);`
`739`		`- AMREX_LAUNCH_KERNEL(AMREX_GPU_MAX_THREADS, nblocks, nthreads_per_block, shared_mem_bytes,`
`740`		`- stream, [=] AMREX_GPU_DEVICE () noexcept { f(); });`
	`738`	`+ AMREX_LAUNCH_KERNEL_NOBOUND(nblocks, nthreads_per_block, shared_mem_bytes,`
	`739`	`+ stream, [=] AMREX_GPU_DEVICE () noexcept { f(); });`
`741`	`740`	`AMREX_GPU_ERROR_CHECK();`
`742`	`741`	`}`
`743`	`742`
Original file line number	Diff line number	Diff line change
`@@ -209,7 +209,7 @@ T PrefixSum_mp (N n, FIN const& fin, FOUT const& fout, TYPE, RetSum a_ret_sum)`
`209`	`209`	`T* blocksum_p = (T*)(dp + nbytes_blockresult);`
`210`	`210`	`T* totalsum_p = (T*)(dp + nbytes_blockresult + nbytes_blocksum);`
`211`	`211`
`212`		`- amrex::launch(nblocks, nthreads, sm, stream,`
	`212`	`+ amrex::launch<nthreads>(nblocks, sm, stream,`
`213`	`213`	`[=] AMREX_GPU_DEVICE (Gpu::Handler const& gh) noexcept`
`214`	`214`	`{`
`215`	`215`	`sycl::sub_group const& sg = gh.item->get_sub_group();`
`@@ -289,7 +289,7 @@ T PrefixSum_mp (N n, FIN const& fin, FOUT const& fout, TYPE, RetSum a_ret_sum)`
`289`	`289`	`}`
`290`	`290`	`});`
`291`	`291`
`292`		`- amrex::launch(1, nthreads, sm, stream,`
	`292`	`+ amrex::launch<nthreads>(1, sm, stream,`
`293`	`293`	`[=] AMREX_GPU_DEVICE (Gpu::Handler const& gh) noexcept`
`294`	`294`	`{`
`295`	`295`	`sycl::sub_group const& sg = gh.item->get_sub_group();`
`@@ -355,7 +355,7 @@ T PrefixSum_mp (N n, FIN const& fin, FOUT const& fout, TYPE, RetSum a_ret_sum)`
`355`	`355`	`}`
`356`	`356`	`});`
`357`	`357`
`358`		`- amrex::launch(nblocks, nthreads, 0, stream,`
	`358`	`+ amrex::launch<nthreads>(nblocks, 0, stream,`
`359`	`359`	`[=] AMREX_GPU_DEVICE (Gpu::Handler const& gh) noexcept`
`360`	`360`	`{`
`361`	`361`	`int threadIdxx = gh.item->get_local_id(0);`
`@@ -429,7 +429,7 @@ T PrefixSum (N n, FIN && fin, FOUT && fout, TYPE type, RetSum a_ret_sum = retSum`
`429`	`429`	`}`
`430`	`430`	`});`
`431`	`431`
`432`		`- amrex::launch(nblocks, nthreads, sm, stream,`
	`432`	`+ amrex::launch<nthreads>(nblocks, sm, stream,`
`433`	`433`	`[=] AMREX_GPU_DEVICE (Gpu::Handler const& gh) noexcept`
`434`	`434`	`{`
`435`	`435`	`sycl::sub_group const& sg = gh.item->get_sub_group();`
`@@ -672,7 +672,7 @@ T PrefixSum (N n, FIN const& fin, FOUT const& fout, TYPE, RetSum a_ret_sum = ret`
`672`	`672`	`(reinterpret_cast<OrderedBlockId::id_type*>(dp + nbytes_tile_state));`
`673`	`673`
`674`	`674`	`// Init ScanTileState on device`
`675`		`- amrex::launch((nblocks+nthreads-1)/nthreads, nthreads, 0, stream, [=] AMREX_GPU_DEVICE ()`
	`675`	`+ amrex::launch<nthreads>((nblocks+nthreads-1)/nthreads, 0, stream, [=] AMREX_GPU_DEVICE ()`
`676`	`676`	`{`
`677`	`677`	`auto& scan_tile_state = const_cast<ScanTileState&>(tile_state);`
`678`	`678`	`auto& scan_bid = const_cast<OrderedBlockId&>(ordered_block_id);`
`@@ -813,7 +813,7 @@ T PrefixSum (N n, FIN const& fin, FOUT const& fout, TYPE, RetSum a_ret_sum = ret`
`813`	`813`
`814`	`814`	`if (nblocks > 1) {`
`815`	`815`	`// Init ScanTileState on device`
`816`		`- amrex::launch((nblocks+nthreads-1)/nthreads, nthreads, 0, stream, [=] AMREX_GPU_DEVICE ()`
	`816`	`+ amrex::launch<nthreads>((nblocks+nthreads-1)/nthreads, 0, stream, [=] AMREX_GPU_DEVICE ()`
`817`	`817`	`{`
`818`	`818`	`const_cast<ScanTileState&>(tile_state).InitializeStatus(nblocks);`
`819`	`819`	`});`
`@@ -957,7 +957,7 @@ T PrefixSum (N n, FIN const& fin, FOUT const& fout, TYPE, RetSum a_ret_sum = ret`
`957`	`957`	`}`
`958`	`958`	`});`
`959`	`959`
`960`		`- amrex::launch(nblocks, nthreads, sm, stream,`
	`960`	`+ amrex::launch<nthreads>(nblocks, sm, stream,`
`961`	`961`	`[=] AMREX_GPU_DEVICE () noexcept`
`962`	`962`	`{`
`963`	`963`	`int lane = threadIdx.x % Gpu::Device::warp_size;`
Original file line number	Diff line number	Diff line change
`@@ -85,7 +85,7 @@ MLEBTensorOp::applyBCTensor (int amrlev, int mglev, MultiFab& vel,`
`85`	`85`
`86`	`86`	`#ifdef AMREX_USE_GPU`
`87`	`87`	`if (Gpu::inLaunchRegion()) {`
`88`		`- amrex::launch(12, 64, Gpu::gpuStream(),`
	`88`	`+ amrex::launch<64>(12, Gpu::gpuStream(),`
`89`	`89`	`#ifdef AMREX_USE_SYCL`
`90`	`90`	`[=] AMREX_GPU_DEVICE (sycl::nd_item<1> const& item)`
`91`	`91`	`{`
Original file line number	Diff line number	Diff line change
`@@ -408,7 +408,7 @@ MLTensorOp::applyBCTensor (int amrlev, int mglev, MultiFab& vel, // NOLINT(reada`
`408`	`408`	`// only edge vals used in 3D stencil`
`409`	`409`	`#ifdef AMREX_USE_GPU`
`410`	`410`	`if (Gpu::inLaunchRegion()) {`
`411`		`- amrex::launch(12, 64, Gpu::gpuStream(),`
	`411`	`+ amrex::launch<64>(12, Gpu::gpuStream(),`
`412`	`412`	`#ifdef AMREX_USE_SYCL`
`413`	`413`	`[=] AMREX_GPU_DEVICE (sycl::nd_item<1> const& item)`
`414`	`414`	`{`
Original file line number	Diff line number	Diff line change
`@@ -745,7 +745,7 @@ void OpenBCSolver::compute_potential (Gpu::DeviceVector<openbc::Moments> const&`
`745`	`745`	`lenxy,lenx);`
`746`	`746`	`amrex::Abort("xxxxx SYCL todo: openbc compute_potential");`
`747`	`747`	`#else`
`748`		`- amrex::launch(b.numPts(), AMREX_GPU_MAX_THREADS, Gpu::gpuStream(),`
	`748`	`+ amrex::launch<AMREX_GPU_MAX_THREADS>(b.numPts(), Gpu::gpuStream(),`
`749`	`749`	`[=] AMREX_GPU_DEVICE () noexcept`
`750`	`750`	`{`
`751`	`751`	`int icell = blockIdx.x;`