|
47 | 47 | #include <cub/util_macro.cuh>
|
48 | 48 | #include <cub/util_namespace.cuh>
|
49 | 49 |
|
| 50 | +#include <cuda/cmath> |
| 51 | +#include <cuda/std/__algorithm/max.h> |
| 52 | +#include <cuda/std/__algorithm/min.h> |
| 53 | + |
50 | 54 | // Legacy include; this functionality used to be defined in here.
|
51 | 55 | #include <cub/detail/detect_cuda_runtime.cuh>
|
52 | 56 |
|
@@ -143,27 +147,24 @@ namespace detail
|
143 | 147 | static constexpr ::cuda::std::size_t max_smem_per_block = 48 * 1024;
|
144 | 148 | } // namespace detail
|
145 | 149 |
|
146 |
| -template <int NOMINAL_4B_BLOCK_THREADS, int NOMINAL_4B_ITEMS_PER_THREAD, typename T> |
| 150 | +template <int Nominal4ByteBlockThreads, int Nominal4ByteItemsPerThread, typename T> |
147 | 151 | struct RegBoundScaling
|
148 | 152 | {
|
149 |
| - enum |
150 |
| - { |
151 |
| - ITEMS_PER_THREAD = CUB_MAX(1, NOMINAL_4B_ITEMS_PER_THREAD * 4 / CUB_MAX(4, sizeof(T))), |
152 |
| - BLOCK_THREADS = CUB_MIN(NOMINAL_4B_BLOCK_THREADS, |
153 |
| - ((cub::detail::max_smem_per_block / (sizeof(T) * ITEMS_PER_THREAD)) + 31) / 32 * 32), |
154 |
| - }; |
| 153 | + static constexpr int ITEMS_PER_THREAD = |
| 154 | + ::cuda::std::max(1, Nominal4ByteItemsPerThread * 4 / ::cuda::std::max(4, int{sizeof(T)})); |
| 155 | + static constexpr int BLOCK_THREADS = |
| 156 | + ::cuda::std::min(Nominal4ByteBlockThreads, |
| 157 | + ::cuda::ceil_div(int{detail::max_smem_per_block} / (int{sizeof(T)} * ITEMS_PER_THREAD), 32) * 32); |
155 | 158 | };
|
156 | 159 |
|
157 |
| -template <int NOMINAL_4B_BLOCK_THREADS, int NOMINAL_4B_ITEMS_PER_THREAD, typename T> |
| 160 | +template <int Nominal4ByteBlockThreads, int Nominal4ByteItemsPerThread, typename T> |
158 | 161 | struct MemBoundScaling
|
159 | 162 | {
|
160 |
| - enum |
161 |
| - { |
162 |
| - ITEMS_PER_THREAD = |
163 |
| - CUB_MAX(1, CUB_MIN(NOMINAL_4B_ITEMS_PER_THREAD * 4 / sizeof(T), NOMINAL_4B_ITEMS_PER_THREAD * 2)), |
164 |
| - BLOCK_THREADS = CUB_MIN(NOMINAL_4B_BLOCK_THREADS, |
165 |
| - ((cub::detail::max_smem_per_block / (sizeof(T) * ITEMS_PER_THREAD)) + 31) / 32 * 32), |
166 |
| - }; |
| 163 | + static constexpr int ITEMS_PER_THREAD = ::cuda::std::max( |
| 164 | + 1, ::cuda::std::min(Nominal4ByteItemsPerThread * 4 / int{sizeof(T)}, Nominal4ByteItemsPerThread * 2)); |
| 165 | + static constexpr int BLOCK_THREADS = |
| 166 | + ::cuda::std::min(Nominal4ByteBlockThreads, |
| 167 | + ::cuda::ceil_div(int{detail::max_smem_per_block} / (int{sizeof(T)} * ITEMS_PER_THREAD), 32) * 32); |
167 | 168 | };
|
168 | 169 |
|
169 | 170 | #endif // Do not document
|
|
0 commit comments