caugonnet
diff --git a/‎docs/libcudacxx/standard_api.rst
+2 b/‎docs/libcudacxx/standard_api.rst
+2
diff --git a/‎docs/libcudacxx/standard_api/utility_library.rst
+4 b/‎docs/libcudacxx/standard_api/utility_library.rst
+4
diff --git a/‎docs/libcudacxx/standard_api/utility_library/memory.rst
+25 b/‎docs/libcudacxx/standard_api/utility_library/memory.rst
+25
diff --git a/‎libcudacxx/include/cuda/std/__cccl/builtin.h
+11-1 b/‎libcudacxx/include/cuda/std/__cccl/builtin.h
+11-1
diff --git a/‎libcudacxx/include/cuda/std/__memory/assume_aligned.h
+53 b/‎libcudacxx/include/cuda/std/__memory/assume_aligned.h
+53
diff --git a/‎libcudacxx/include/cuda/std/__memory/construct_at.h
+11-19 b/‎libcudacxx/include/cuda/std/__memory/construct_at.h
+11-19
diff --git a/‎libcudacxx/include/cuda/std/__memory_
+5-16 b/‎libcudacxx/include/cuda/std/__memory_
+5-16
diff --git a/‎libcudacxx/include/cuda/std/memory
+39 b/‎libcudacxx/include/cuda/std/memory
+39
diff --git a/‎libcudacxx/test/libcudacxx/std/containers/sequences/array/size_and_alignment.pass.cpp
+1-1 b/‎libcudacxx/test/libcudacxx/std/containers/sequences/array/size_and_alignment.pass.cpp
+1-1
diff --git a/‎libcudacxx/test/libcudacxx/std/utilities/memory/default.allocator/allocator.members/allocate.pass.cpp
+1-1 b/‎libcudacxx/test/libcudacxx/std/utilities/memory/default.allocator/allocator.members/allocate.pass.cpp
+1-1
diff --git a/‎libcudacxx/test/libcudacxx/std/utilities/memory/ptr.align/align.pass.cpp
+1-1 b/‎libcudacxx/test/libcudacxx/std/utilities/memory/ptr.align/align.pass.cpp
+1-1
diff --git a/‎libcudacxx/test/libcudacxx/std/utilities/memory/ptr.align/assume_aligned.nodiscard.fail.cpp
+30 b/‎libcudacxx/test/libcudacxx/std/utilities/memory/ptr.align/assume_aligned.nodiscard.fail.cpp
+30
@@ -74,6 +74,8 @@ Feature availability:
       they need to be used similar to type traits as language concepts
       are not available.
 
+-  C++20 ``std::assume_aligned`` in ``<memory>`` is available in C++11.
+
 -  C++20 ``<ranges>`` are available in C++17.
 
    -  all ``<ranges>`` concepts are available in C++17. However, they
 
@@ -10,6 +10,7 @@ Utility Library
    utility_library/bitset
    utility_library/expected
    utility_library/functional
+   utility_library/memory
    utility_library/optional
    utility_library/tuple
    utility_library/type_traits
@@ -34,6 +35,9 @@ the information about the individual features for details.
      - Optional value with error channel
      - CCCL 2.3.0 / CUDA 12.4
    * - :ref:`libcudacxx-standard-api-utility-functional`
+     - ``std::assume_aligned``
+     - CCCL 2.9.0 / CUDA 12.9
+   * - :ref:`libcudacxx-standard-api-utility-memory`
      - Function objects and function wrappers
      - libcu++ 1.1.0 / CCCL 2.0.0 / CUDA 11.2
    * - :ref:`libcudacxx-standard-api-utility-optional`
 
@@ -0,0 +1,25 @@
+.. _libcudacxx-standard-api-utility-memory:
+
+<cuda/std/memory>
+===================
+
+Provided functionalities
+------------------------
+
+- ``cuda::std::addressof``. See the C++ documentation of `std::addressof <https://en.cppreference.com/w/cpp/memory/addressof>`_
+- ``cuda::std::align``. See the C++ documentation of `std::align <https://en.cppreference.com/w/cpp/memory/align>`_
+- ``cuda::std::assume_aligned``. See the C++ documentation of `std::assume_aligned <https://en.cppreference.com/w/cpp/memory/assume_aligned>`_
+- Uninitialized memory algorithms. See the C++ documentation `<https://en.cppreference.com/w/cpp/memory>`_
+
+Extensions
+----------
+
+-  Most features are available from C++11 onwards.
+-  ``cuda::std::addressof`` is constexpr from C++11 on if compiler support is available
+-  ``cuda::std::assume_aligned`` is constexpr from C++14 on
+
+Restrictions
+------------
+
+- `construct_at` and is only available in C++20 as that is explicitly mentioned in the standard
+- The specialized memory algorithms are not parallel
@@ -84,7 +84,17 @@
 
 #if _CCCL_HAS_BUILTIN(__array_extent)
 #  define _CCCL_BUILTIN_ARRAY_EXTENT(...) __array_extent(__VA_ARGS__)
-#endif // _CCCL_HAS_BUILTIN(array_extent)
+#endif // _CCCL_HAS_BUILTIN(__array_extent)
+
+#if _CCCL_HAS_BUILTIN(__builtin_assume_aligned) || (defined(_CCCL_COMPILER_MSVC) && _CCCL_MSVC_VERSION >= 1923) \
+  || defined(_CCCL_COMPILER_GCC)
+#  define _CCCL_BUILTIN_ASSUME_ALIGNED(...) __builtin_assume_aligned(__VA_ARGS__)
+#endif // _CCCL_HAS_BUILTIN(__builtin_assume_aligned)
+
+// NVCC below 11.2 treats this as a host only function
+#if defined(_CCCL_CUDACC_BELOW_11_2)
+#  undef _CCCL_BUILTIN_ASSUME_ALIGNED
+#endif // _CCCL_CUDACC_BELOW_11_2
 
 // nvhpc has a bug where it supports __builtin_addressof but does not mark it via _CCCL_CHECK_BUILTIN
 #if _CCCL_CHECK_BUILTIN(builtin_addressof) || (defined(_CCCL_COMPILER_GCC) && _CCCL_GCC_VERSION >= 70000) \
 
@@ -0,0 +1,53 @@
+// -*- C++ -*-
+//===----------------------------------------------------------------------===//
+//
+// Part of libcu++, the C++ Standard Library for your entire system,
+// under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+// SPDX-FileCopyrightText: Copyright (c) 2024 NVIDIA CORPORATION & AFFILIATES.
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef _LIBCUDACXX___MEMORY_ASSUME_ALIGNED_H
+#define _LIBCUDACXX___MEMORY_ASSUME_ALIGNED_H
+
+#include <cuda/std/detail/__config>
+
+#if defined(_CCCL_IMPLICIT_SYSTEM_HEADER_GCC)
+#  pragma GCC system_header
+#elif defined(_CCCL_IMPLICIT_SYSTEM_HEADER_CLANG)
+#  pragma clang system_header
+#elif defined(_CCCL_IMPLICIT_SYSTEM_HEADER_MSVC)
+#  pragma system_header
+#endif // no system header
+
+#include <cuda/std/__bit/has_single_bit.h>
+#include <cuda/std/__type_traits/is_constant_evaluated.h>
+#include <cuda/std/cstddef> // size_t
+#include <cuda/std/cstdint> // uintptr_t
+
+_LIBCUDACXX_BEGIN_NAMESPACE_STD
+
+template <size_t _Align, class _Tp>
+_CCCL_NODISCARD _LIBCUDACXX_HIDE_FROM_ABI _CCCL_CONSTEXPR_CXX14 _Tp* assume_aligned(_Tp* __ptr) noexcept
+{
+  static_assert(_CUDA_VSTD::has_single_bit(_Align), "std::assume_aligned requires the alignment to be a power of 2!");
+#if defined(_CCCL_BUILTIN_IS_CONSTANT_EVALUATED) && defined(_CCCL_BUILTIN_ASSUME_ALIGNED)
+  if (!_CCCL_BUILTIN_IS_CONSTANT_EVALUATED())
+  {
+#  if !defined(_CCCL_COMPILER_MSVC) // MSVC checks within the builtin
+    _CCCL_ASSERT(reinterpret_cast<uintptr_t>(__ptr) % _Align == 0, "Alignment assumption is violated");
+#  endif // !_CCCL_COMPILER_MSVC
+    return static_cast<_Tp*>(_CCCL_BUILTIN_ASSUME_ALIGNED(__ptr, _Align));
+  }
+  else
+#endif // _CCCL_BUILTIN_IS_CONSTANT_EVALUATED && _CCCL_BUILTIN_ASSUME_ALIGNED
+  {
+    return __ptr;
+  }
+}
+
+_LIBCUDACXX_END_NAMESPACE_STD
+
+#endif // _LIBCUDACXX___MEMORY_ASSUME_ALIGNED_H
@@ -4,7 +4,7 @@
 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
 // See https://llvm.org/LICENSE.txt for license information.
 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
-// SPDX-FileCopyrightText: Copyright (c) 2023 NVIDIA CORPORATION & AFFILIATES
+// SPDX-FileCopyrightText: Copyright (c) 2024 NVIDIA CORPORATION & AFFILIATES
 //
 //===----------------------------------------------------------------------===//
 
@@ -185,8 +185,8 @@ _LIBCUDACXX_HIDE_FROM_ABI _CCCL_CONSTEXPR_CXX14 _ForwardIterator __destroy(_Forw
 
 _CCCL_EXEC_CHECK_DISABLE
 template <class _Tp,
-          __enable_if_t<!is_array<_Tp>::value, int>                  = 0,
-          __enable_if_t<!is_trivially_destructible<_Tp>::value, int> = 0>
+          __enable_if_t<!_CCCL_TRAIT(is_array, _Tp), int>                  = 0,
+          __enable_if_t<!_CCCL_TRAIT(is_trivially_destructible, _Tp), int> = 0>
 _LIBCUDACXX_HIDE_FROM_ABI _CCCL_CONSTEXPR_CXX14 void __destroy_at(_Tp* __loc)
 {
   _CCCL_ASSERT(__loc != nullptr, "null pointer given to destroy_at");
@@ -195,22 +195,20 @@ _LIBCUDACXX_HIDE_FROM_ABI _CCCL_CONSTEXPR_CXX14 void __destroy_at(_Tp* __loc)
 
 _CCCL_EXEC_CHECK_DISABLE
 template <class _Tp,
-          __enable_if_t<!is_array<_Tp>::value, int>                 = 0,
-          __enable_if_t<is_trivially_destructible<_Tp>::value, int> = 0>
+          __enable_if_t<!_CCCL_TRAIT(is_array, _Tp), int>                 = 0,
+          __enable_if_t<_CCCL_TRAIT(is_trivially_destructible, _Tp), int> = 0>
 _LIBCUDACXX_HIDE_FROM_ABI _CCCL_CONSTEXPR_CXX14 void __destroy_at(_Tp* __loc)
 {
   _CCCL_ASSERT(__loc != nullptr, "null pointer given to destroy_at");
   (void) __loc;
 }
 
-#if _CCCL_STD_VER >= 2020
-template <class _Tp, __enable_if_t<is_array<_Tp>::value, int> = 0>
-_LIBCUDACXX_HIDE_FROM_ABI constexpr void __destroy_at(_Tp* __loc)
+template <class _Tp, __enable_if_t<_CCCL_TRAIT(is_array, _Tp), int> = 0>
+_LIBCUDACXX_HIDE_FROM_ABI _CCCL_CONSTEXPR_CXX14 void __destroy_at(_Tp* __loc)
 {
   _CCCL_ASSERT(__loc != nullptr, "null pointer given to destroy_at");
   _CUDA_VSTD::__destroy(_CUDA_VSTD::begin(*__loc), _CUDA_VSTD::end(*__loc));
 }
-#endif // _CCCL_STD_VER >= 2020
 
 template <class _ForwardIterator>
 _LIBCUDACXX_HIDE_FROM_ABI _CCCL_CONSTEXPR_CXX14 _ForwardIterator
@@ -235,22 +233,18 @@ __reverse_destroy(_BidirectionalIterator __first, _BidirectionalIterator __last)
   return __last;
 }
 
-#if _CCCL_STD_VER >= 2017
-
-template <class _Tp, enable_if_t<!is_array_v<_Tp>, int> = 0>
-_LIBCUDACXX_HIDE_FROM_ABI _CCCL_CONSTEXPR_CXX20 void destroy_at(_Tp* __loc) noexcept
+template <class _Tp, __enable_if_t<!_CCCL_TRAIT(is_array, _Tp), int> = 0>
+_LIBCUDACXX_HIDE_FROM_ABI _CCCL_CONSTEXPR_CXX20 void destroy_at(_Tp* __loc)
 {
   _CCCL_ASSERT(__loc != nullptr, "null pointer given to destroy_at");
   __loc->~_Tp();
 }
 
-#  if _CCCL_STD_VER >= 2020
-template <class _Tp, enable_if_t<is_array_v<_Tp>, int> = 0>
-_LIBCUDACXX_HIDE_FROM_ABI _CCCL_CONSTEXPR_CXX20 void destroy_at(_Tp* __loc) noexcept
+template <class _Tp, __enable_if_t<_CCCL_TRAIT(is_array, _Tp), int> = 0>
+_LIBCUDACXX_HIDE_FROM_ABI _CCCL_CONSTEXPR_CXX20 void destroy_at(_Tp* __loc)
 {
   _CUDA_VSTD::__destroy_at(__loc);
 }
-#  endif // _CCCL_STD_VER >= 2020
 
 template <class _ForwardIterator>
 _LIBCUDACXX_HIDE_FROM_ABI _CCCL_CONSTEXPR_CXX20 void destroy(_ForwardIterator __first, _ForwardIterator __last) noexcept
@@ -268,8 +262,6 @@ _LIBCUDACXX_HIDE_FROM_ABI _CCCL_CONSTEXPR_CXX20 _ForwardIterator destroy_n(_Forw
   return __first;
 }
 
-#endif // _CCCL_STD_VER >= 2017
-
 _LIBCUDACXX_END_NAMESPACE_STD
 
 #endif // _LIBCUDACXX___MEMORY_CONSTRUCT_AT_H
@@ -8,8 +8,8 @@
 //
 //===----------------------------------------------------------------------===//
 
-#ifndef _CUDA_STD_MEMORY
-#define _CUDA_STD_MEMORY
+#ifndef _CUDA_STD___MEMORY_
+#define _CUDA_STD___MEMORY_
 
 #include <cuda/std/detail/__config>
 
@@ -21,25 +21,14 @@
 #  pragma system_header
 #endif // no system header
 
-#include <cuda/std/__memory/addressof.h>
-#include <cuda/std/__memory/align.h>
 #include <cuda/std/__memory/allocate_at_least.h>
 #include <cuda/std/__memory/allocation_guard.h>
 #include <cuda/std/__memory/allocator.h>
 #include <cuda/std/__memory/allocator_arg_t.h>
 #include <cuda/std/__memory/allocator_traits.h>
-#include <cuda/std/__memory/construct_at.h>
-#include <cuda/std/__memory/pointer_traits.h>
-#include <cuda/std/__memory/uninitialized_algorithms.h>
+#include <cuda/std/__memory/temporary_buffer.h>
 #include <cuda/std/__memory/unique_ptr.h>
 #include <cuda/std/__memory/uses_allocator.h>
+#include <cuda/std/memory>
 
-// standard-mandated includes
-#include <cuda/std/version>
-
-// [memory.syn]
-#ifndef _LIBCUDACXX_HAS_NO_SPACESHIP_OPERATOR
-#  include <cuda/std/compare>
-#endif // !_LIBCUDACXX_HAS_NO_SPACESHIP_OPERATOR
-
-#endif // _CUDA_STD_MEMORY
+#endif // _CUDA_STD___MEMORY_
@@ -0,0 +1,39 @@
+//===----------------------------------------------------------------------===//
+//
+// Part of libcu++, the C++ Standard Library for your entire system,
+// under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+// SPDX-FileCopyrightText: Copyright (c) 2024 NVIDIA CORPORATION & AFFILIATES.
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef _CUDA_STD_MEMORY
+#define _CUDA_STD_MEMORY
+
+#include <cuda/std/detail/__config>
+
+#if defined(_CCCL_IMPLICIT_SYSTEM_HEADER_GCC)
+#  pragma GCC system_header
+#elif defined(_CCCL_IMPLICIT_SYSTEM_HEADER_CLANG)
+#  pragma clang system_header
+#elif defined(_CCCL_IMPLICIT_SYSTEM_HEADER_MSVC)
+#  pragma system_header
+#endif // no system header
+
+#include <cuda/std/__memory/addressof.h>
+#include <cuda/std/__memory/align.h>
+#include <cuda/std/__memory/assume_aligned.h>
+#include <cuda/std/__memory/construct_at.h>
+#include <cuda/std/__memory/pointer_traits.h>
+#include <cuda/std/__memory/uninitialized_algorithms.h>
+
+// standard-mandated includes
+#include <cuda/std/version>
+
+// [memory.syn]
+#ifndef _LIBCUDACXX_HAS_NO_SPACESHIP_OPERATOR
+#  include <cuda/std/compare>
+#endif // !_LIBCUDACXX_HAS_NO_SPACESHIP_OPERATOR
+
+#endif // _CUDA_STD_MEMORY
@@ -29,7 +29,7 @@
 TEST_NV_DIAG_SUPPRESS(cuda_demote_unsupported_floating_point)
 
 #if defined(TEST_COMPILER_MSVC)
-#  pragma warning(disable : 4324)
+#  pragma warning(disable : 4324) // structure was padded due to alignment specifier
 #endif // TEST_COMPILER_MSVC
 
 template <class T, cuda::std::size_t Size>
 
@@ -23,7 +23,7 @@
 #include "test_macros.h"
 
 #if defined(TEST_COMPILER_MSVC)
-#  pragma warning(disable : 4324)
+#  pragma warning(disable : 4324) // structure was padded due to alignment specifier
 #endif // TEST_COMPILER_MSVC
 
 #ifdef TEST_HAS_NO_ALIGNED_ALLOCATION
 
@@ -11,9 +11,9 @@
 
 // void* align(size_t alignment, size_t size, void*& ptr, size_t& space);
 
-#include <cuda/std/__memory_>
 #include <cuda/std/cassert>
 #include <cuda/std/cstddef>
+#include <cuda/std/memory>
 
 #include "test_macros.h"
 
 
@@ -0,0 +1,30 @@
+//===----------------------------------------------------------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+// SPDX-FileCopyrightText: Copyright (c) 2024 NVIDIA CORPORATION & AFFILIATES.
+//
+//===----------------------------------------------------------------------===//
+
+// #include <memory>
+
+// template<size_t N, class T>
+// [[nodiscard]] constexpr T* assume_aligned(T* ptr);
+
+// UNSUPPORTED: nvrtc
+// nvrtc currently compiles the test with a warning
+
+#include <cuda/std/memory>
+
+__host__ __device__ void f()
+{
+  int* p = nullptr;
+  cuda::std::assume_aligned<4>(p); // expected-warning {{ignoring return value of function declared with 'nodiscard'
+                                   // attribute}}
+}
+
+int main(int, char**)
+{
+  return 0;
+}