Skip to content

Commit e0a7902

Browse files
committed
refactor <cuda/std/cstring>
Cleanup util_arch (NVIDIA#2773) Improves `DeviceSegmentedSort` test run time for large number of items and segments (NVIDIA#3246) * fixes segment offset generation * switches to analytical verification * switches to analytical verification for pairs * fixes spelling * adds tests for large number of segments * fixes narrowing conversion in tests * addresses review comments * fixes includes Adds support for large number of items and large number of segments to `DeviceSegmentedSort` (NVIDIA#3308) * fixes segment offset generation * switches to analytical verification * switches to analytical verification for pairs * addresses review comments * introduces segment offset type * adds tests for large number of segments * adds support for large number of segments * drops segment offset type * fixes thrust namespace * removes about-to-be-deprecated cub iterators * no exec specifier on defaulted ctor * fixes gcc7 linker error * uses local_segment_index_t throughout * determine offset type based on type returned by segment iterator begin/end iterators * minor style improvements cuda.parallel: Support structured types as algorithm inputs (NVIDIA#3218) * Introduce gpu_struct decorator and typing * Enable `reduce` to accept arrays of structs as inputs * Add test for reducing arrays-of-struct * Update documentation * Use a numpy array rather than ctypes object * Change zeros -> empty for output array and temp storage * Add a TODO for typing GpuStruct * Documentation updates * Remove test_reduce_struct_type from test_reduce.py * Revert to `to_cccl_value()` accepting ndarray + GpuStruct * Bump copyrights --------- Co-authored-by: Ashwin Srinath <[email protected]> Deprecate thrust::async (NVIDIA#3324) Fixes: NVIDIA#100 Review/Deprecate CUB `util.ptx` for CCCL 2.x (NVIDIA#3342) Deprecate thrust::numeric_limits (NVIDIA#3366) Upgrade to Catch2 3.8 (NVIDIA#3310) Fixes: NVIDIA#1724 Fix sign-compare warning (NVIDIA#3408) Implement more cmath functions to be usable on host and device 
(NVIDIA#3382) * Implement more cmath functions to be usable on host and device * Implement math roots functions * Implement exponential functions Redefine and deprecate thrust::remove_cvref (NVIDIA#3394) * Redefine and deprecate thrust::remove_cvref Co-authored-by: Michael Schellenberger Costa <[email protected]> cuda.parallel: Add optional stream argument to reduce_into() (NVIDIA#3348) * Add optional stream argument to reduce_into() * Add tests to check for reduce_into() stream behavior * Move protocol related utils to separate file and rework __cuda_stream__ error messages * Fix synchronization issue in stream test and add one more invalid stream test case * Rename cuda stream validation function after removing leading underscore * Unpack values from __cuda_stream__ instead of indexing * Fix linting errors * Handle TypeError when unpacking invalid __cuda_stream__ return * Use stream to allocate cupy memory in new stream test Deprecate `cub::{min, max}` and replace internal uses with those from libcu++ (NVIDIA#3419) * Deprecate `cub::{min, max}` and replace internal uses with those from libcu++ Fixes NVIDIA#3404 Remove deprecated `cub::min` (NVIDIA#3450) * Remove deprecated `cuda::{min,max}` * Drop unused `thrust::remove_cvref` file Fix typo in builtin (NVIDIA#3451) Moves agents to `detail::<algorithm_name>` namespace (NVIDIA#3435) Drop thrust::detail::integer_traits (NVIDIA#3391) Add cuda::is_floating_point supporting half and bfloat (NVIDIA#3379) Co-authored-by: Michael Schellenberger Costa <[email protected]> add `_CCCL_HAS_NVFP8` macro (NVIDIA#3429) Specialize __is_extended_floating_point for FP8 types (NVIDIA#3470) Also ensure that we actually can enable FP8 due to FP16 and BF16 requirements Co-authored-by: Michael Schellenberger Costa <[email protected]> Moves CUB kernel entry points to a detail namespace (NVIDIA#3468) * moves emptykernel to detail ns * second batch * third batch * fourth batch * fixes cuda parallel * concatenates nested namespaces Deprecate 
block/warp algo specializations (NVIDIA#3455) Fixes: NVIDIA#3409 fix documentation
1 parent 0c8e627 commit e0a7902

Some content is hidden

Large Commits have some content hidden by default. Use the searchbox below for content that may be hidden.

56 files changed

+455
-493
lines changed

.github/CODEOWNERS

+2-2
Original file line numberDiff line numberDiff line change
@@ -25,5 +25,5 @@ benchmarks/ @nvidia/cccl-benchmark-codeowners
2525
**/benchmarks @nvidia/cccl-benchmark-codeowners
2626

2727
# docs
28-
docs/ @nvidia/cccl-codeowners
29-
examples/ @nvidia/cccl-codeowners
28+
docs/ @nvidia/cccl-docs-codeowners
29+
examples/ @nvidia/cccl-docs-codeowners

cub/benchmarks/bench/reduce/custom.cu

-42
This file was deleted.

docs/libcudacxx/standard_api/c_library.rst

+3
Original file line numberDiff line numberDiff line change
@@ -30,3 +30,6 @@ Any Standard C++ header not listed below is omitted.
3030
* - `\<cuda/std/cstdlib\> <https://en.cppreference.com/w/cpp/header/cstdlib>`_
3131
- Common utilities
3232
- libcu++ 2.2.0 / CCCL 2.2.0 / CUDA 12.3
33+
* - `\<cuda/std/cstring\> <https://en.cppreference.com/w/cpp/header/cstring>`_
34+
- Provides array manipulation functions such as ``memcpy``, ``memset`` and ``memcmp``
35+
- CCCL 3.0.0

libcudacxx/include/cuda/__memcpy_async/dispatch_memcpy_async.h

+2-2
Original file line numberDiff line numberDiff line change
@@ -28,7 +28,7 @@
2828
#include <cuda/__memcpy_async/cp_async_shared_global.h>
2929
#include <cuda/std/cstddef>
3030
#include <cuda/std/cstdint>
31-
#include <cuda/std/detail/libcxx/include/cstring>
31+
#include <cuda/std/cstring>
3232

3333
#include <nv/target>
3434

@@ -135,7 +135,7 @@ _CCCL_NODISCARD _LIBCUDACXX_HIDE_FROM_ABI __completion_mechanism __dispatch_memc
135135
(
136136
// Host code path:
137137
if (__group.thread_rank() == 0) {
138-
memcpy(__dest_char, __src_char, __size);
138+
_CUDA_VSTD::memcpy(__dest_char, __src_char, __size);
139139
} return __completion_mechanism::__sync;));
140140
}
141141

libcudacxx/include/cuda/std/__algorithm/copy.h

+2-2
Original file line numberDiff line numberDiff line change
@@ -28,8 +28,8 @@
2828
#include <cuda/std/__type_traits/is_trivially_copyable.h>
2929
#include <cuda/std/__type_traits/remove_const.h>
3030
#include <cuda/std/cstdint>
31-
#include <cuda/std/cstdlib> // ::memmove
32-
#include <cuda/std/detail/libcxx/include/cstring>
31+
#include <cuda/std/cstdlib>
32+
#include <cuda/std/cstring> // memmove
3333

3434
_LIBCUDACXX_BEGIN_NAMESPACE_STD
3535

libcudacxx/include/cuda/std/__atomic/functions/cuda_local.h

+8-7
Original file line numberDiff line numberDiff line change
@@ -22,6 +22,7 @@
2222

2323
#include <cuda/std/__atomic/types/common.h>
2424
#include <cuda/std/cstdint>
25+
#include <cuda/std/cstring>
2526

2627
// This file works around a bug in CUDA in which the compiler miscompiles
2728
// atomics to automatic storage (local memory). This bug is not fixed on any
@@ -96,7 +97,7 @@ _CCCL_DEVICE inline bool __cuda_load_weak_if_local(const volatile void* __ptr, v
9697
{
9798
return false;
9899
}
99-
memcpy(__ret, const_cast<const void*>(__ptr), __size);
100+
_CUDA_VSTD::memcpy(__ret, const_cast<const void*>(__ptr), __size);
100101
// Required to workaround a compiler bug, see nvbug/4064730
101102
NV_IF_TARGET(NV_PROVIDES_SM_70, (__nanosleep(0);))
102103
return true;
@@ -108,7 +109,7 @@ _CCCL_DEVICE inline bool __cuda_store_weak_if_local(volatile void* __ptr, const
108109
{
109110
return false;
110111
}
111-
memcpy(const_cast<void*>(__ptr), __val, __size);
112+
_CUDA_VSTD::memcpy(const_cast<void*>(__ptr), __val, __size);
112113
return true;
113114
}
114115

@@ -122,12 +123,12 @@ __cuda_compare_exchange_weak_if_local(volatile _Type* __ptr, _Type* __expected,
122123
}
123124
if (__atomic_memcmp(const_cast<const _Type*>(__ptr), const_cast<const _Type*>(__expected), sizeof(_Type)) == 0)
124125
{
125-
memcpy(const_cast<_Type*>(__ptr), const_cast<_Type const*>(__desired), sizeof(_Type));
126+
_CUDA_VSTD::memcpy(const_cast<_Type*>(__ptr), const_cast<_Type const*>(__desired), sizeof(_Type));
126127
*__success = true;
127128
}
128129
else
129130
{
130-
memcpy(const_cast<_Type*>(__expected), const_cast<_Type const*>(__ptr), sizeof(_Type));
131+
_CUDA_VSTD::memcpy(const_cast<_Type*>(__expected), const_cast<_Type const*>(__ptr), sizeof(_Type));
131132
*__success = false;
132133
}
133134
NV_IF_TARGET(NV_PROVIDES_SM_70, (__nanosleep(0);))
@@ -141,8 +142,8 @@ _CCCL_DEVICE bool __cuda_exchange_weak_if_local(volatile _Type* __ptr, _Type* __
141142
{
142143
return false;
143144
}
144-
memcpy(const_cast<_Type*>(__ret), const_cast<const _Type*>(__ptr), sizeof(_Type));
145-
memcpy(const_cast<_Type*>(__ptr), const_cast<const _Type*>(__val), sizeof(_Type));
145+
_CUDA_VSTD::memcpy(const_cast<_Type*>(__ret), const_cast<const _Type*>(__ptr), sizeof(_Type));
146+
_CUDA_VSTD::memcpy(const_cast<_Type*>(__ptr), const_cast<const _Type*>(__val), sizeof(_Type));
146147
NV_IF_TARGET(NV_PROVIDES_SM_70, (__nanosleep(0);))
147148
return true;
148149
}
@@ -154,7 +155,7 @@ _CCCL_DEVICE bool __cuda_fetch_weak_if_local(volatile _Type* __ptr, _Type __val,
154155
{
155156
return false;
156157
}
157-
memcpy(const_cast<_Type*>(__ret), const_cast<const _Type*>(__ptr), sizeof(_Type));
158+
_CUDA_VSTD::memcpy(const_cast<_Type*>(__ret), const_cast<const _Type*>(__ptr), sizeof(_Type));
158159
__bop(*__ptr, __val);
159160
NV_IF_TARGET(NV_PROVIDES_SM_70, (__nanosleep(0);))
160161
return true;

libcudacxx/include/cuda/std/__atomic/types/common.h

+2-2
Original file line numberDiff line numberDiff line change
@@ -25,7 +25,7 @@
2525
#include <cuda/std/__type_traits/is_assignable.h>
2626
#include <cuda/std/__type_traits/remove_cv.h>
2727
#include <cuda/std/__type_traits/remove_cvref.h>
28-
#include <cuda/std/detail/libcxx/include/cstring>
28+
#include <cuda/std/cstring>
2929

3030
_LIBCUDACXX_BEGIN_NAMESPACE_STD
3131

@@ -92,7 +92,7 @@ _CCCL_HOST_DEVICE inline int __atomic_memcmp(void const* __lhs, void const* __rh
9292
}
9393
} return 0;),
9494
NV_IS_HOST,
95-
(return memcmp(__lhs, __rhs, __count);))
95+
(return _CUDA_VSTD::memcmp(__lhs, __rhs, __count);))
9696
}
9797

9898
_LIBCUDACXX_END_NAMESPACE_STD

libcudacxx/include/cuda/std/__atomic/types/small.h

+3-2
Original file line numberDiff line numberDiff line change
@@ -28,6 +28,7 @@
2828
#include <cuda/std/__type_traits/enable_if.h>
2929
#include <cuda/std/__type_traits/is_arithmetic.h>
3030
#include <cuda/std/__type_traits/is_signed.h>
31+
#include <cuda/std/cstring>
3132

3233
_LIBCUDACXX_BEGIN_NAMESPACE_STD
3334

@@ -53,15 +54,15 @@ template <class _Tp, enable_if_t<!_CCCL_TRAIT(is_arithmetic, _Tp), int> = 0>
5354
_CCCL_HOST_DEVICE inline __atomic_small_proxy_t<_Tp> __atomic_small_to_32(_Tp __val)
5455
{
5556
__atomic_small_proxy_t<_Tp> __temp{};
56-
memcpy(&__temp, &__val, sizeof(_Tp));
57+
_CUDA_VSTD::memcpy(&__temp, &__val, sizeof(_Tp));
5758
return __temp;
5859
}
5960

6061
template <class _Tp, enable_if_t<!_CCCL_TRAIT(is_arithmetic, _Tp), int> = 0>
6162
_CCCL_HOST_DEVICE inline _Tp __atomic_small_from_32(__atomic_small_proxy_t<_Tp> __val)
6263
{
6364
_Tp __temp{};
64-
memcpy(&__temp, &__val, sizeof(_Tp));
65+
_CUDA_VSTD::memcpy(&__temp, &__val, sizeof(_Tp));
6566
return __temp;
6667
}
6768

libcudacxx/include/cuda/std/__atomic/wait/notify_wait.h

+5-1
Original file line numberDiff line numberDiff line change
@@ -25,6 +25,10 @@
2525
#include <cuda/std/__atomic/scopes.h>
2626
#include <cuda/std/__atomic/wait/polling.h>
2727

28+
#if !_CCCL_COMPILER(NVRTC)
29+
# include <cstring>
30+
#endif // !_CCCL_COMPILER(NVRTC)
31+
2832
_LIBCUDACXX_BEGIN_NAMESPACE_STD
2933

3034
extern "C" _CCCL_DEVICE void __atomic_try_wait_unsupported_before_SM_70__();
@@ -56,7 +60,7 @@ _LIBCUDACXX_HIDE_FROM_ABI bool __nonatomic_compare_equal(_Tp const& __lhs, _Tp c
5660
#if _CCCL_HAS_CUDA_COMPILER
5761
return __lhs == __rhs;
5862
#else
59-
return memcmp(&__lhs, &__rhs, sizeof(_Tp)) == 0;
63+
return _CUDA_VSTD::memcmp(&__lhs, &__rhs, sizeof(_Tp)) == 0;
6064
#endif
6165
}
6266

libcudacxx/include/cuda/std/__bit/bit_cast.h

+1-1
Original file line numberDiff line numberDiff line change
@@ -25,7 +25,7 @@
2525
#include <cuda/std/__type_traits/is_extended_floating_point.h>
2626
#include <cuda/std/__type_traits/is_trivially_copyable.h>
2727
#include <cuda/std/__type_traits/is_trivially_default_constructible.h>
28-
#include <cuda/std/detail/libcxx/include/cstring>
28+
#include <cuda/std/cstring>
2929

3030
_LIBCUDACXX_BEGIN_NAMESPACE_STD
3131

libcudacxx/include/cuda/std/__bit/reference.h

-1
Original file line numberDiff line numberDiff line change
@@ -30,7 +30,6 @@
3030
#include <cuda/std/__memory/pointer_traits.h>
3131
#include <cuda/std/__type_traits/conditional.h>
3232
#include <cuda/std/__utility/swap.h>
33-
#include <cuda/std/detail/libcxx/include/cstring>
3433

3534
_CCCL_PUSH_MACROS
3635

libcudacxx/include/cuda/std/__functional/hash.h

+2-2
Original file line numberDiff line numberDiff line change
@@ -35,7 +35,7 @@
3535
#include <cuda/std/__utility/pair.h>
3636
#include <cuda/std/__utility/swap.h>
3737
#include <cuda/std/cstdint>
38-
#include <cuda/std/detail/libcxx/include/cstring>
38+
#include <cuda/std/cstring>
3939

4040
#ifndef __cuda_std__
4141

@@ -45,7 +45,7 @@ template <class _Size>
4545
_LIBCUDACXX_HIDE_FROM_ABI _Size __loadword(const void* __p)
4646
{
4747
_Size __r;
48-
std::memcpy(&__r, __p, sizeof(__r));
48+
_CUDA_VSTD::memcpy(&__r, __p, sizeof(__r));
4949
return __r;
5050
}
5151

libcudacxx/include/cuda/std/__memory/allocator_traits.h

+1-1
Original file line numberDiff line numberDiff line change
@@ -35,7 +35,7 @@
3535
#include <cuda/std/__type_traits/void_t.h>
3636
#include <cuda/std/__utility/declval.h>
3737
#include <cuda/std/__utility/forward.h>
38-
#include <cuda/std/detail/libcxx/include/cstring>
38+
#include <cuda/std/cstring>
3939
#include <cuda/std/limits>
4040

4141
_CCCL_PUSH_MACROS

libcudacxx/include/cuda/std/cstring

+103
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,103 @@
1+
//===----------------------------------------------------------------------===//
2+
//
3+
// Part of libcu++, the C++ Standard Library for your entire system,
4+
// under the Apache License v2.0 with LLVM Exceptions.
5+
// See https://llvm.org/LICENSE.txt for license information.
6+
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
7+
// SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES.
8+
//
9+
//===----------------------------------------------------------------------===//
10+
11+
#ifndef _CUDA_STD_CSTRING
12+
#define _CUDA_STD_CSTRING
13+
14+
#include <cuda/std/detail/__config>
15+
16+
#if defined(_CCCL_IMPLICIT_SYSTEM_HEADER_GCC)
17+
# pragma GCC system_header
18+
#elif defined(_CCCL_IMPLICIT_SYSTEM_HEADER_CLANG)
19+
# pragma clang system_header
20+
#elif defined(_CCCL_IMPLICIT_SYSTEM_HEADER_MSVC)
21+
# pragma system_header
22+
#endif // no system header
23+
24+
#include <cuda/std/__cstddef/types.h>
25+
26+
#if !_CCCL_COMPILER(NVRTC)
27+
# include <cstring>
28+
#endif // !_CCCL_COMPILER(NVRTC)
29+
30+
_LIBCUDACXX_BEGIN_NAMESPACE_STD
31+
32+
using ::memcpy;
33+
using ::memset;
34+
using ::size_t;
35+
36+
_LIBCUDACXX_HIDE_FROM_ABI const void* memchr(const void* __ptr, int __c, size_t __n) noexcept
37+
{
38+
NV_IF_ELSE_TARGET(
39+
NV_IS_HOST,
40+
(return ::std::memchr(__ptr, __c, __n);),
41+
(auto __p = static_cast<const unsigned char*>(__ptr); const auto __p_end = __p + __n;
42+
43+
while (__p != __p_end) {
44+
if (*__p == static_cast<unsigned char>(__c))
45+
{
46+
return __p;
47+
}
48+
++__p;
49+
}
50+
51+
return nullptr;))
52+
}
53+
54+
_LIBCUDACXX_HIDE_FROM_ABI void* memchr(void* __ptr, int __c, size_t __n) noexcept
55+
{
56+
NV_IF_ELSE_TARGET(NV_IS_HOST,
57+
(return ::std::memchr(__ptr, __c, __n);),
58+
(return const_cast<void*>(_CUDA_VSTD::memchr(const_cast<const void*>(__ptr), __c, __n));))
59+
}
60+
61+
_LIBCUDACXX_HIDE_FROM_ABI void* memmove(void* __dst, const void* __src, size_t __n) noexcept
62+
{
63+
NV_IF_ELSE_TARGET(
64+
NV_IS_HOST,
65+
(return ::std::memmove(__dst, __src, __n);),
66+
(auto __d = (__dst <= __src) ? static_cast<unsigned char*>(__dst) : (static_cast<unsigned char*>(__dst) + __n - 1);
67+
auto __s = (__dst <= __src) ? static_cast<const unsigned char*>(__src)
68+
: (static_cast<const unsigned char*>(__src) + __n - 1);
69+
const auto __inc = (__dst <= __src) ? 1 : -1;
70+
const auto __d_end = (__dst <= __src) ? (__d + __n) : (static_cast<unsigned char*>(__dst) - 1);
71+
72+
while (__d != __d_end) {
73+
*__d = *__s;
74+
__d += __inc;
75+
__s += __inc;
76+
}
77+
78+
return __dst;))
79+
}
80+
81+
_LIBCUDACXX_HIDE_FROM_ABI int memcmp(const void* __lhs, const void* __rhs, size_t __n) noexcept
82+
{
83+
NV_IF_ELSE_TARGET(
84+
NV_IS_HOST,
85+
(return ::std::memcmp(__lhs, __rhs, __n);),
86+
(auto __l = static_cast<const unsigned char*>(__lhs); auto __r = static_cast<const unsigned char*>(__rhs);
87+
const auto __l_end = __l + __n;
88+
89+
while (__l != __l_end) {
90+
if (*__l != *__r)
91+
{
92+
return *__l < *__r ? -1 : 1;
93+
}
94+
++__l;
95+
++__r;
96+
}
97+
98+
return 0;))
99+
}
100+
101+
_LIBCUDACXX_END_NAMESPACE_STD
102+
103+
#endif // _CUDA_STD_CSTRING

libcudacxx/include/cuda/std/detail/libcxx/include/algorithm

-1
Original file line numberDiff line numberDiff line change
@@ -751,7 +751,6 @@ template <class BidirectionalIterator, class Compare>
751751
#include <cuda/std/__type_traits/remove_const.h>
752752
#include <cuda/std/bit>
753753
#include <cuda/std/cstddef>
754-
#include <cuda/std/detail/libcxx/include/cstring>
755754
#include <cuda/std/functional>
756755
#include <cuda/std/initializer_list>
757756
#include <cuda/std/type_traits>

0 commit comments

Comments
 (0)