Add round up/down to multiple (#3234)

NVIDIA · Jan 16, 2025 · a128d86 · a128d86
1 parent 3cc47b2
commit a128d86
Show file tree

Hide file tree

Showing 8 changed files with 554 additions and 43 deletions.
diff --git a/docs/libcudacxx/extended_api/math.rst b/docs/libcudacxx/extended_api/math.rst
@@ -1,52 +1,28 @@
 .. _libcudacxx-extended-api-math:
 
 Math
-=====
+====
 
-.. code:: cuda
+.. toctree::
+   :hidden:
+   :maxdepth: 1
 
-   template <typename T>
-   [[nodiscard]] __host__ __device__ constexpr T ceil_div(T a, T b) noexcept;
+   cuda::ceil_div <math/ceil_div>
+   cuda::round_up <math/round_up>
+   cuda::round_down <math/round_down>
 
-ceil_div
----------
+.. list-table::
+   :widths: 25 45 30
+   :header-rows: 0
 
-- _Requires_: `is_integral_v<T>` is true.
-- _Preconditions_: `a >= 0` is true and `b > 0` is true.
-- _Returns_: divides `a` by `b`. If `a` is not a multiple of `b` rounds the result up to the next integer value.
+   * - :ref:`ceil_div <libcudacxx-extended-api-math-ceil-div>`
+     - Ceiling division
+     - CCCL 2.6.0 / CUDA 12.6
 
-.. note::
+   * - :ref:`round_up <libcudacxx-extended-api-math-round-up>`
+     - Round to the next multiple
+     - CCCL 2.9.0 / CUDA 12.9
 
-   The function is only constexpr from C++14 onwards
-
-**Example**: This API is very useful for determining the *number of thread blocks* required to process a fixed amount of work, given a fixed number of threads per block:
-
-.. code:: cuda
-
-   #include <vector>
-   #include <cuda/cmath>
-
-   __global__ void vscale(int n, float s, float *x) {
-     int i = blockIdx.x * blockDim.x + threadIdx.x;
-     if (i < n) x[i] *= s;
-   }
-
-   int main() {
-     const int n = 100000;
-     const float s = 2.f;
-     std::vector<float> x(n, 1.f);
-
-     // Given a fixed number of threads per block...
-     constexpr int threads_per_block = 256;
-
-     // ...dividing some "n" by "threads_per_block" may lead to a remainder,
-     // requiring the kernel to be launched with an extra thread block to handle it.
-     const int thread_blocks = cuda::ceil_div(n, threads_per_block);
-
-     vscale<<<thread_blocks, threads_per_block>>>(n, s, x.data());
-     cudaDeviceSynchronize();
-
-     return 0;
-   }
-
-`See it on Godbolt TODO`
+   * - :ref:`round_down <libcudacxx-extended-api-math-round-down>`
+     - Round to the previous multiple
+     - CCCL 2.9.0 / CUDA 12.9
diff --git a/docs/libcudacxx/extended_api/math/ceil_div.rst b/docs/libcudacxx/extended_api/math/ceil_div.rst
@@ -0,0 +1,52 @@
+.. _libcudacxx-extended-api-math-ceil-div:
+
+``ceil_div`` Ceiling Division
+=============================
+
+.. code:: cuda
+
+   template <typename T, typename = U>
+   [[nodiscard]] __host__ __device__ constexpr T ceil_div(T value, U divisor) noexcept;
+
+``value``: The value to be divided.
+``divisor``:  The divisor.
+
+- *Requires*: ``is_integral_v<T>`` is true and ``is_integral_v<U>`` is true.
+- *Preconditions*: ``a >= 0`` is true and ``b > 0`` is true.
+- *Returns*: divides ``a`` by ``b``. If ``a`` is not a multiple of ``b`` rounds the result up to the next integer value.
+
+.. note::
+
+   The function is only constexpr from C++14 onwards
+
+**Example**: This API is very useful for determining the *number of thread blocks* required to process a fixed amount of work, given a fixed number of threads per block:
+
+.. code:: cuda
+
+   #include <vector>
+   #include <cuda/cmath>
+
+   __global__ void vscale(int n, float s, float *x) {
+     int i = blockIdx.x * blockDim.x + threadIdx.x;
+     if (i < n) x[i] *= s;
+   }
+
+   int main() {
+     const int n = 100000;
+     const float s = 2.f;
+     std::vector<float> x(n, 1.f);
+
+     // Given a fixed number of threads per block...
+     constexpr int threads_per_block = 256;
+
+     // ...dividing some "n" by "threads_per_block" may lead to a remainder,
+     // requiring the kernel to be launched with an extra thread block to handle it.
+     const int thread_blocks = cuda::ceil_div(n, threads_per_block);
+
+     vscale<<<thread_blocks, threads_per_block>>>(n, s, x.data());
+     cudaDeviceSynchronize();
+
+     return 0;
+   }
+
+`See it on Godbolt TODO`
diff --git a/docs/libcudacxx/extended_api/math/round_down.rst b/docs/libcudacxx/extended_api/math/round_down.rst
@@ -0,0 +1,38 @@
+.. _libcudacxx-extended-api-math-round-down:
+
+``round_down`` Round to the previous multiple
+=============================================
+
+.. code:: cuda
+
+   template <typename T, typename = U>
+   [[nodiscard]] __host__ __device__ inline
+   constexpr cuda::std::common_type_t<T, U> round_down(T value, U base_multiple) noexcept;
+
+``value``: The value to be rounded down.
+``base_multiple``:  The base multiple to which the value rounds down.
+
+- *Requires*: ``T`` and ``U`` are integral types (including 128-bit integers) or enumerators.
+- *Preconditions*: ``a >= 0`` is true and ``b > 0`` is true.
+- *Returns*: ``a`` rounded down to the largest multiple of ``b`` less than or equal to ``a``. If ``a`` is already a multiple of ``b``, return ``a``.
+
+.. note::
+
+   The function requires C++17 onwards
+
+**Performance considerations**:
+
+- The function performs a truncation division followed by a multiplication. It provides better performance than ``a / b * b`` when the common type is a signed integer
+
+**Example**:
+
+.. code:: cuda
+
+   #include <cuda/cmath>
+
+   __global__ void example_kernel(int a, unsigned b, unsigned* result) {
+     // a = 7, b = 3 -> result = 6
+     *result = cuda::round_down(a, b);
+   }
+
+`See it on Godbolt TODO`
diff --git a/docs/libcudacxx/extended_api/math/round_up.rst b/docs/libcudacxx/extended_api/math/round_up.rst
@@ -0,0 +1,40 @@
+.. _libcudacxx-extended-api-math-round-up:
+
+``round_up`` Round to the next multiple
+=======================================
+
+.. code:: cuda
+
+   template <typename T, typename = U>
+   [[nodiscard]] __host__ __device__ inline
+   constexpr cuda::std::common_type_t<T, U> round_up(T value, U base_multiple) noexcept;
+
+``value``: The value to be rounded up.
+``base_multiple``:  The base multiple to which the value rounds up.
+
+- *Requires*: ``T`` and ``U`` are integral types (including 128-bit integers) or enumerators.
+- *Preconditions*: ``a >= 0`` is true and ``b > 0`` is true.
+- *Returns*: ``a`` rounded up to the smallest multiple of ``b`` greater than or equal to ``a``. If ``a`` is already a multiple of ``b``, return ``a``.
+- *Note*: the result can overflow if ``ceil(a / b) * b`` exceeds the maximum value of the common type of
+          ``a`` and ``b``. The condition is checked in debug mode.
+
+.. note::
+
+   The function requires C++17 onwards
+
+**Performance considerations**:
+
+- The function performs a ceiling division (``cuda::ceil_div()``) followed by a multiplication
+
+**Example**:
+
+.. code:: cuda
+
+   #include <cuda/cmath>
+
+   __global__ void example_kernel(int a, unsigned b, unsigned* result) {
+     // a = 7, b = 3 -> result = 9
+     *result = cuda::round_up(a, b);
+   }
+
+`See it on Godbolt TODO`
diff --git a/libcudacxx/include/cuda/__cmath/round_down.h b/libcudacxx/include/cuda/__cmath/round_down.h
@@ -0,0 +1,103 @@
+//===----------------------------------------------------------------------===//
+//
+// Part of libcu++, the C++ Standard Library for your entire system,
+// under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+// SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES.
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef _CUDA___CMATH_ROUND_DOWN_H
+#define _CUDA___CMATH_ROUND_DOWN_H
+
+#include <cuda/std/detail/__config>
+
+#if defined(_CCCL_IMPLICIT_SYSTEM_HEADER_GCC)
+#  pragma GCC system_header
+#elif defined(_CCCL_IMPLICIT_SYSTEM_HEADER_CLANG)
+#  pragma clang system_header
+#elif defined(_CCCL_IMPLICIT_SYSTEM_HEADER_MSVC)
+#  pragma system_header
+#endif // no system header
+
+#if _CCCL_STD_VER >= 2017
+
+#  include <cuda/std/__concepts/concept_macros.h>
+#  include <cuda/std/__type_traits/common_type.h>
+#  include <cuda/std/__type_traits/is_enum.h>
+#  include <cuda/std/__type_traits/is_integral.h>
+#  include <cuda/std/__type_traits/is_signed.h>
+#  include <cuda/std/__type_traits/make_unsigned.h>
+#  include <cuda/std/__utility/to_underlying.h>
+#  include <cuda/std/limits>
+
+_LIBCUDACXX_BEGIN_NAMESPACE_CUDA
+
+//! @brief Round the number \p __a to the previous multiple of \p __b
+//! @param __a The input number
+//! @param __b The multiplicand
+//! @pre \p __a must be non-negative
+//! @pre \p __b must be positive
+_CCCL_TEMPLATE(class _Tp, class _Up)
+_CCCL_REQUIRES(_CCCL_TRAIT(_CUDA_VSTD::is_integral, _Tp) _CCCL_AND _CCCL_TRAIT(_CUDA_VSTD::is_integral, _Up))
+_CCCL_NODISCARD _LIBCUDACXX_HIDE_FROM_ABI constexpr _CUDA_VSTD::common_type_t<_Tp, _Up>
+round_down(const _Tp __a, const _Up __b) noexcept
+{
+  _CCCL_ASSERT(__b > _Up{0}, "cuda::round_down: 'b' must be positive");
+  if constexpr (_CUDA_VSTD::is_signed_v<_Tp>)
+  {
+    _CCCL_ASSERT(__a >= _Tp{0}, "cuda::round_down: 'a' must be non negative");
+  }
+  using _Common = _CUDA_VSTD::common_type_t<_Tp, _Up>;
+  using _Prom   = decltype(_Tp{} / _Up{});
+  using _UProm  = _CUDA_VSTD::make_unsigned_t<_Prom>;
+  auto __c1     = static_cast<_UProm>(__a) / static_cast<_UProm>(__b);
+  return static_cast<_Common>(__c1 * static_cast<_UProm>(__b));
+}
+
+//! @brief Round the number \p __a to the previous multiple of \p __b
+//! @param __a The input number
+//! @param __b The multiplicand
+//! @pre \p __a must be non-negative
+//! @pre \p __b must be positive
+_CCCL_TEMPLATE(class _Tp, class _Up)
+_CCCL_REQUIRES(_CCCL_TRAIT(_CUDA_VSTD::is_integral, _Tp) _CCCL_AND _CCCL_TRAIT(_CUDA_VSTD::is_enum, _Up))
+_CCCL_NODISCARD _LIBCUDACXX_HIDE_FROM_ABI constexpr _CUDA_VSTD::common_type_t<_Tp, _CUDA_VSTD::underlying_type_t<_Up>>
+round_down(const _Tp __a, const _Up __b) noexcept
+{
+  return ::cuda::round_down(__a, _CUDA_VSTD::to_underlying(__b));
+}
+
+//! @brief Round the number \p __a to the previous multiple of \p __b
+//! @param __a The input number
+//! @param __b The multiplicand
+//! @pre \p __a must be non-negative
+//! @pre \p __b must be positive
+_CCCL_TEMPLATE(class _Tp, class _Up)
+_CCCL_REQUIRES(_CCCL_TRAIT(_CUDA_VSTD::is_enum, _Tp) _CCCL_AND _CCCL_TRAIT(_CUDA_VSTD::is_integral, _Up))
+_CCCL_NODISCARD _LIBCUDACXX_HIDE_FROM_ABI constexpr _CUDA_VSTD::common_type_t<_CUDA_VSTD::underlying_type_t<_Tp>, _Up>
+round_down(const _Tp __a, const _Up __b) noexcept
+{
+  return ::cuda::round_down(_CUDA_VSTD::to_underlying(__a), __b);
+}
+
+//! @brief Round the number \p __a to the previous multiple of \p __b
+//! @param __a The input number
+//! @param __b The multiplicand
+//! @pre \p __a must be non-negative
+//! @pre \p __b must be positive
+_CCCL_TEMPLATE(class _Tp, class _Up)
+_CCCL_REQUIRES(_CCCL_TRAIT(_CUDA_VSTD::is_enum, _Tp) _CCCL_AND _CCCL_TRAIT(_CUDA_VSTD::is_enum, _Up))
+_CCCL_NODISCARD
+_LIBCUDACXX_HIDE_FROM_ABI constexpr _CUDA_VSTD::common_type_t<_CUDA_VSTD::underlying_type_t<_Tp>,
+                                                              _CUDA_VSTD::underlying_type_t<_Up>>
+round_down(const _Tp __a, const _Up __b) noexcept
+{
+  return ::cuda::round_down(_CUDA_VSTD::to_underlying(__a), _CUDA_VSTD::to_underlying(__b));
+}
+
+_LIBCUDACXX_END_NAMESPACE_CUDA
+
+#endif // _CCCL_STD_VER >= 2017
+#endif // _CUDA___CMATH_ROUND_DOWN_H