-
Notifications
You must be signed in to change notification settings - Fork 184
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
Add round up/down to multiple (#3234)
- Loading branch information
Showing
8 changed files
with
554 additions
and
43 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -1,52 +1,28 @@ | ||
.. _libcudacxx-extended-api-math: | ||
|
||
Math | ||
===== | ||
==== | ||
|
||
.. code:: cuda | ||
.. toctree:: | ||
:hidden: | ||
:maxdepth: 1 | ||
|
||
template <typename T> | ||
[[nodiscard]] __host__ __device__ constexpr T ceil_div(T a, T b) noexcept; | ||
cuda::ceil_div <math/ceil_div> | ||
cuda::round_up <math/round_up> | ||
cuda::round_down <math/round_down> | ||
|
||
ceil_div | ||
--------- | ||
.. list-table:: | ||
:widths: 25 45 30 | ||
:header-rows: 0 | ||
|
||
- _Requires_: `is_integral_v<T>` is true. | ||
- _Preconditions_: `a >= 0` is true and `b > 0` is true. | ||
- _Returns_: divides `a` by `b`. If `a` is not a multiple of `b` rounds the result up to the next integer value. | ||
* - :ref:`ceil_div <libcudacxx-extended-api-math-ceil-div>` | ||
- Ceiling division | ||
- CCCL 2.6.0 / CUDA 12.6 | ||
|
||
.. note:: | ||
* - :ref:`round_up <libcudacxx-extended-api-math-round-up>` | ||
- Round to the next multiple | ||
- CCCL 2.9.0 / CUDA 12.9 | ||
|
||
The function is only constexpr from C++14 onwards | ||
|
||
**Example**: This API is very useful for determining the *number of thread blocks* required to process a fixed amount of work, given a fixed number of threads per block: | ||
|
||
.. code:: cuda | ||
#include <vector> | ||
#include <cuda/cmath> | ||
__global__ void vscale(int n, float s, float *x) { | ||
int i = blockIdx.x * blockDim.x + threadIdx.x; | ||
if (i < n) x[i] *= s; | ||
} | ||
int main() { | ||
const int n = 100000; | ||
const float s = 2.f; | ||
std::vector<float> x(n, 1.f); | ||
// Given a fixed number of threads per block... | ||
constexpr int threads_per_block = 256; | ||
// ...dividing some "n" by "threads_per_block" may lead to a remainder, | ||
// requiring the kernel to be launched with an extra thread block to handle it. | ||
const int thread_blocks = cuda::ceil_div(n, threads_per_block); | ||
vscale<<<thread_blocks, threads_per_block>>>(n, s, x.data()); | ||
cudaDeviceSynchronize(); | ||
return 0; | ||
} | ||
`See it on Godbolt TODO` | ||
* - :ref:`round_down <libcudacxx-extended-api-math-round-down>` | ||
- Round to the previous multiple | ||
- CCCL 2.9.0 / CUDA 12.9 |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,52 @@ | ||
.. _libcudacxx-extended-api-math-ceil-div: | ||
|
||
``ceil_div`` Ceiling Division | ||
============================= | ||
|
||
.. code:: cuda | ||
template <typename T, typename = U> | ||
[[nodiscard]] __host__ __device__ constexpr T ceil_div(T value, U divisor) noexcept; | ||
``value``: The value to be divided. | ||
``divisor``: The divisor. | ||
|
||
- *Requires*: ``is_integral_v<T>`` is true and ``is_integral_v<U>`` is true. | ||
- *Preconditions*: ``a >= 0`` is true and ``b > 0`` is true. | ||
- *Returns*: divides ``a`` by ``b``. If ``a`` is not a multiple of ``b`` rounds the result up to the next integer value. | ||
|
||
.. note:: | ||
|
||
The function is only constexpr from C++14 onwards | ||
|
||
**Example**: This API is very useful for determining the *number of thread blocks* required to process a fixed amount of work, given a fixed number of threads per block: | ||
|
||
.. code:: cuda | ||
#include <vector> | ||
#include <cuda/cmath> | ||
__global__ void vscale(int n, float s, float *x) { | ||
int i = blockIdx.x * blockDim.x + threadIdx.x; | ||
if (i < n) x[i] *= s; | ||
} | ||
int main() { | ||
const int n = 100000; | ||
const float s = 2.f; | ||
std::vector<float> x(n, 1.f); | ||
// Given a fixed number of threads per block... | ||
constexpr int threads_per_block = 256; | ||
// ...dividing some "n" by "threads_per_block" may lead to a remainder, | ||
// requiring the kernel to be launched with an extra thread block to handle it. | ||
const int thread_blocks = cuda::ceil_div(n, threads_per_block); | ||
vscale<<<thread_blocks, threads_per_block>>>(n, s, x.data()); | ||
cudaDeviceSynchronize(); | ||
return 0; | ||
} | ||
`See it on Godbolt TODO` |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,38 @@ | ||
.. _libcudacxx-extended-api-math-round-down: | ||
|
||
``round_down`` Round to the previous multiple | ||
============================================= | ||
|
||
.. code:: cuda | ||
template <typename T, typename = U> | ||
[[nodiscard]] __host__ __device__ inline | ||
constexpr cuda::std::common_type_t<T, U> round_down(T value, U base_multiple) noexcept; | ||
``value``: The value to be rounded down. | ||
``base_multiple``: The base multiple to which the value rounds down. | ||
|
||
- *Requires*: ``T`` and ``U`` are integral types (including 128-bit integers) or enumerators. | ||
- *Preconditions*: ``a >= 0`` is true and ``b > 0`` is true. | ||
- *Returns*: ``a`` rounded down to the largest multiple of ``b`` less than or equal to ``a``. If ``a`` is already a multiple of ``b``, return ``a``. | ||
|
||
.. note:: | ||
|
||
The function requires C++17 onwards | ||
|
||
**Performance considerations**: | ||
|
||
- The function performs a truncation division followed by a multiplication. It provides better performance than ``a / b * b`` when the common type is a signed integer | ||
|
||
**Example**: | ||
|
||
.. code:: cuda | ||
#include <cuda/cmath> | ||
__global__ void example_kernel(int a, unsigned b, unsigned* result) { | ||
// a = 7, b = 3 -> result = 6 | ||
*result = cuda::round_down(a, b); | ||
} | ||
`See it on Godbolt TODO` |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,40 @@ | ||
.. _libcudacxx-extended-api-math-round-up: | ||
|
||
``round_up`` Round to the next multiple | ||
======================================= | ||
|
||
.. code:: cuda | ||
template <typename T, typename = U> | ||
[[nodiscard]] __host__ __device__ inline | ||
constexpr cuda::std::common_type_t<T, U> round_up(T value, U base_multiple) noexcept; | ||
``value``: The value to be rounded up. | ||
``base_multiple``: The base multiple to which the value rounds up. | ||
|
||
- *Requires*: ``T`` and ``U`` are integral types (including 128-bit integers) or enumerators. | ||
- *Preconditions*: ``a >= 0`` is true and ``b > 0`` is true. | ||
- *Returns*: ``a`` rounded up to the smallest multiple of ``b`` greater than or equal to ``a``. If ``a`` is already a multiple of ``b``, return ``a``. | ||
- *Note*: the result can overflow if ``ceil(a / b) * b`` exceeds the maximum value of the common type of | ||
``a`` and ``b``. The condition is checked in debug mode. | ||
|
||
.. note:: | ||
|
||
The function requires C++17 onwards | ||
|
||
**Performance considerations**: | ||
|
||
- The function performs a ceiling division (``cuda::ceil_div()``) followed by a multiplication | ||
|
||
**Example**: | ||
|
||
.. code:: cuda | ||
#include <cuda/cmath> | ||
__global__ void example_kernel(int a, unsigned b, unsigned* result) { | ||
// a = 7, b = 3 -> result = 9 | ||
*result = cuda::round_up(a, b); | ||
} | ||
`See it on Godbolt TODO` |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,103 @@ | ||
//===----------------------------------------------------------------------===// | ||
// | ||
// Part of libcu++, the C++ Standard Library for your entire system, | ||
// under the Apache License v2.0 with LLVM Exceptions. | ||
// See https://llvm.org/LICENSE.txt for license information. | ||
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception | ||
// SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. | ||
// | ||
//===----------------------------------------------------------------------===// | ||
|
||
#ifndef _CUDA___CMATH_ROUND_DOWN_H | ||
#define _CUDA___CMATH_ROUND_DOWN_H | ||
|
||
#include <cuda/std/detail/__config> | ||
|
||
#if defined(_CCCL_IMPLICIT_SYSTEM_HEADER_GCC) | ||
# pragma GCC system_header | ||
#elif defined(_CCCL_IMPLICIT_SYSTEM_HEADER_CLANG) | ||
# pragma clang system_header | ||
#elif defined(_CCCL_IMPLICIT_SYSTEM_HEADER_MSVC) | ||
# pragma system_header | ||
#endif // no system header | ||
|
||
#if _CCCL_STD_VER >= 2017 | ||
|
||
# include <cuda/std/__concepts/concept_macros.h> | ||
# include <cuda/std/__type_traits/common_type.h> | ||
# include <cuda/std/__type_traits/is_enum.h> | ||
# include <cuda/std/__type_traits/is_integral.h> | ||
# include <cuda/std/__type_traits/is_signed.h> | ||
# include <cuda/std/__type_traits/make_unsigned.h> | ||
# include <cuda/std/__utility/to_underlying.h> | ||
# include <cuda/std/limits> | ||
|
||
_LIBCUDACXX_BEGIN_NAMESPACE_CUDA | ||
|
||
//! @brief Round the number \p __a to the previous multiple of \p __b | ||
//! @param __a The input number | ||
//! @param __b The multiplicand | ||
//! @pre \p __a must be non-negative | ||
//! @pre \p __b must be positive | ||
_CCCL_TEMPLATE(class _Tp, class _Up) | ||
_CCCL_REQUIRES(_CCCL_TRAIT(_CUDA_VSTD::is_integral, _Tp) _CCCL_AND _CCCL_TRAIT(_CUDA_VSTD::is_integral, _Up)) | ||
_CCCL_NODISCARD _LIBCUDACXX_HIDE_FROM_ABI constexpr _CUDA_VSTD::common_type_t<_Tp, _Up> | ||
round_down(const _Tp __a, const _Up __b) noexcept | ||
{ | ||
_CCCL_ASSERT(__b > _Up{0}, "cuda::round_down: 'b' must be positive"); | ||
if constexpr (_CUDA_VSTD::is_signed_v<_Tp>) | ||
{ | ||
_CCCL_ASSERT(__a >= _Tp{0}, "cuda::round_down: 'a' must be non negative"); | ||
} | ||
using _Common = _CUDA_VSTD::common_type_t<_Tp, _Up>; | ||
using _Prom = decltype(_Tp{} / _Up{}); | ||
using _UProm = _CUDA_VSTD::make_unsigned_t<_Prom>; | ||
auto __c1 = static_cast<_UProm>(__a) / static_cast<_UProm>(__b); | ||
return static_cast<_Common>(__c1 * static_cast<_UProm>(__b)); | ||
} | ||
|
||
//! @brief Round the number \p __a to the previous multiple of \p __b | ||
//! @param __a The input number | ||
//! @param __b The multiplicand | ||
//! @pre \p __a must be non-negative | ||
//! @pre \p __b must be positive | ||
_CCCL_TEMPLATE(class _Tp, class _Up) | ||
_CCCL_REQUIRES(_CCCL_TRAIT(_CUDA_VSTD::is_integral, _Tp) _CCCL_AND _CCCL_TRAIT(_CUDA_VSTD::is_enum, _Up)) | ||
_CCCL_NODISCARD _LIBCUDACXX_HIDE_FROM_ABI constexpr _CUDA_VSTD::common_type_t<_Tp, _CUDA_VSTD::underlying_type_t<_Up>> | ||
round_down(const _Tp __a, const _Up __b) noexcept | ||
{ | ||
return ::cuda::round_down(__a, _CUDA_VSTD::to_underlying(__b)); | ||
} | ||
|
||
//! @brief Round the number \p __a to the previous multiple of \p __b | ||
//! @param __a The input number | ||
//! @param __b The multiplicand | ||
//! @pre \p __a must be non-negative | ||
//! @pre \p __b must be positive | ||
_CCCL_TEMPLATE(class _Tp, class _Up) | ||
_CCCL_REQUIRES(_CCCL_TRAIT(_CUDA_VSTD::is_enum, _Tp) _CCCL_AND _CCCL_TRAIT(_CUDA_VSTD::is_integral, _Up)) | ||
_CCCL_NODISCARD _LIBCUDACXX_HIDE_FROM_ABI constexpr _CUDA_VSTD::common_type_t<_CUDA_VSTD::underlying_type_t<_Tp>, _Up> | ||
round_down(const _Tp __a, const _Up __b) noexcept | ||
{ | ||
return ::cuda::round_down(_CUDA_VSTD::to_underlying(__a), __b); | ||
} | ||
|
||
//! @brief Round the number \p __a to the previous multiple of \p __b | ||
//! @param __a The input number | ||
//! @param __b The multiplicand | ||
//! @pre \p __a must be non-negative | ||
//! @pre \p __b must be positive | ||
_CCCL_TEMPLATE(class _Tp, class _Up) | ||
_CCCL_REQUIRES(_CCCL_TRAIT(_CUDA_VSTD::is_enum, _Tp) _CCCL_AND _CCCL_TRAIT(_CUDA_VSTD::is_enum, _Up)) | ||
_CCCL_NODISCARD | ||
_LIBCUDACXX_HIDE_FROM_ABI constexpr _CUDA_VSTD::common_type_t<_CUDA_VSTD::underlying_type_t<_Tp>, | ||
_CUDA_VSTD::underlying_type_t<_Up>> | ||
round_down(const _Tp __a, const _Up __b) noexcept | ||
{ | ||
return ::cuda::round_down(_CUDA_VSTD::to_underlying(__a), _CUDA_VSTD::to_underlying(__b)); | ||
} | ||
|
||
_LIBCUDACXX_END_NAMESPACE_CUDA | ||
|
||
#endif // _CCCL_STD_VER >= 2017 | ||
#endif // _CUDA___CMATH_ROUND_DOWN_H |
Oops, something went wrong.