diff --git a/docs/libcudacxx/standard_api.rst b/docs/libcudacxx/standard_api.rst
index 821c9ae833d..0729df55406 100644
--- a/docs/libcudacxx/standard_api.rst
+++ b/docs/libcudacxx/standard_api.rst
@@ -74,6 +74,8 @@ Feature availability:
       they need to be used similar to type traits as language concepts
       are not available.
 
+-  C++20 ``std::assume_aligned`` in ``<memory>`` is available in C++11.
+
 -  C++20 ``<ranges>`` are available in C++17.
 
    -  all ``<ranges>`` concepts are available in C++17. However, they
diff --git a/docs/libcudacxx/standard_api/utility_library.rst b/docs/libcudacxx/standard_api/utility_library.rst
index 4df28701a39..491498ac8f1 100644
--- a/docs/libcudacxx/standard_api/utility_library.rst
+++ b/docs/libcudacxx/standard_api/utility_library.rst
@@ -10,6 +10,7 @@ Utility Library
    utility_library/bitset
    utility_library/expected
    utility_library/functional
+   utility_library/memory
    utility_library/optional
    utility_library/tuple
    utility_library/type_traits
@@ -34,6 +35,9 @@ the information about the individual features for details.
      - Optional value with error channel
      - CCCL 2.3.0 / CUDA 12.4
    * - :ref:`libcudacxx-standard-api-utility-functional`
      - Function objects and function wrappers
      - libcu++ 1.1.0 / CCCL 2.0.0 / CUDA 11.2
+   * - :ref:`libcudacxx-standard-api-utility-memory`
+     - ``std::assume_aligned``
+     - CCCL 2.9.0 / CUDA 12.9
    * - :ref:`libcudacxx-standard-api-utility-optional`
diff --git a/docs/libcudacxx/standard_api/utility_library/memory.rst b/docs/libcudacxx/standard_api/utility_library/memory.rst
new file mode 100644
index 00000000000..676253818b1
--- /dev/null
+++ b/docs/libcudacxx/standard_api/utility_library/memory.rst
@@ -0,0 +1,25 @@
+.. _libcudacxx-standard-api-utility-memory:
+
+<cuda/std/memory>
+===================
+
+Provided functionalities
+------------------------
+
+- ``cuda::std::addressof``. See the C++ documentation of `std::addressof <https://en.cppreference.com/w/cpp/memory/addressof>`_
+- ``cuda::std::align``. See the C++ documentation of `std::align <https://en.cppreference.com/w/cpp/memory/align>`_
+- ``cuda::std::assume_aligned``. See the C++ documentation of `std::assume_aligned <https://en.cppreference.com/w/cpp/memory/assume_aligned>`_
+- Uninitialized memory algorithms. See the C++ documentation `<https://en.cppreference.com/w/cpp/memory>`_
+
+Extensions
+----------
+
+-  Most features are available from C++11 onwards.
+-  ``cuda::std::addressof`` is constexpr from C++11 on if compiler support is available
+-  ``cuda::std::assume_aligned`` is constexpr from C++14 on
+
+Restrictions
+------------
+
+- ``construct_at`` is only available in C++20 as that is explicitly mentioned in the standard
+- The specialized memory algorithms are not parallel
diff --git a/libcudacxx/include/cuda/std/__cccl/builtin.h b/libcudacxx/include/cuda/std/__cccl/builtin.h
index 005a5283a41..aa2f59c466e 100644
--- a/libcudacxx/include/cuda/std/__cccl/builtin.h
+++ b/libcudacxx/include/cuda/std/__cccl/builtin.h
@@ -84,7 +84,17 @@
 
 #if _CCCL_HAS_BUILTIN(__array_extent)
 #  define _CCCL_BUILTIN_ARRAY_EXTENT(...) __array_extent(__VA_ARGS__)
-#endif // _CCCL_HAS_BUILTIN(array_extent)
+#endif // _CCCL_HAS_BUILTIN(__array_extent)
+
+#if _CCCL_HAS_BUILTIN(__builtin_assume_aligned) || (defined(_CCCL_COMPILER_MSVC) && _CCCL_MSVC_VERSION >= 1923) \
+  || defined(_CCCL_COMPILER_GCC)
+#  define _CCCL_BUILTIN_ASSUME_ALIGNED(...) __builtin_assume_aligned(__VA_ARGS__)
+#endif // _CCCL_HAS_BUILTIN(__builtin_assume_aligned) || MSVC >= 1923 || GCC
+
+// NVCC below 11.2 treats __builtin_assume_aligned as a host only function, so the macro is removed again
+#if defined(_CCCL_CUDACC_BELOW_11_2)
+#  undef _CCCL_BUILTIN_ASSUME_ALIGNED
+#endif // _CCCL_CUDACC_BELOW_11_2
 
 // nvhpc has a bug where it supports __builtin_addressof but does not mark it via _CCCL_CHECK_BUILTIN
 #if _CCCL_CHECK_BUILTIN(builtin_addressof) || (defined(_CCCL_COMPILER_GCC) && _CCCL_GCC_VERSION >= 70000) \
diff --git a/libcudacxx/include/cuda/std/__memory/assume_aligned.h b/libcudacxx/include/cuda/std/__memory/assume_aligned.h
new file mode 100644
index 00000000000..c8f9310ed1a
--- /dev/null
+++ b/libcudacxx/include/cuda/std/__memory/assume_aligned.h
@@ -0,0 +1,53 @@
+// -*- C++ -*-
+//===----------------------------------------------------------------------===//
+//
+// Part of libcu++, the C++ Standard Library for your entire system,
+// under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+// SPDX-FileCopyrightText: Copyright (c) 2024 NVIDIA CORPORATION & AFFILIATES.
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef _LIBCUDACXX___MEMORY_ASSUME_ALIGNED_H
+#define _LIBCUDACXX___MEMORY_ASSUME_ALIGNED_H
+
+#include <cuda/std/detail/__config>
+
+#if defined(_CCCL_IMPLICIT_SYSTEM_HEADER_GCC)
+#  pragma GCC system_header
+#elif defined(_CCCL_IMPLICIT_SYSTEM_HEADER_CLANG)
+#  pragma clang system_header
+#elif defined(_CCCL_IMPLICIT_SYSTEM_HEADER_MSVC)
+#  pragma system_header
+#endif // no system header
+
+#include <cuda/std/__bit/has_single_bit.h>
+#include <cuda/std/__type_traits/is_constant_evaluated.h>
+#include <cuda/std/cstddef> // size_t
+#include <cuda/std/cstdint> // uintptr_t
+
+_LIBCUDACXX_BEGIN_NAMESPACE_STD
+
+template <size_t _Align, class _Tp>
+_CCCL_NODISCARD _LIBCUDACXX_HIDE_FROM_ABI _CCCL_CONSTEXPR_CXX14 _Tp* assume_aligned(_Tp* __ptr) noexcept
+{
+  static_assert(_CUDA_VSTD::has_single_bit(_Align), "std::assume_aligned requires the alignment to be a power of 2!");
+#if defined(_CCCL_BUILTIN_IS_CONSTANT_EVALUATED) && defined(_CCCL_BUILTIN_ASSUME_ALIGNED)
+  // During constant evaluation the pointer is handed back untouched; the hint is only applied at runtime.
+  if (_CCCL_BUILTIN_IS_CONSTANT_EVALUATED())
+  {
+    return __ptr;
+  }
+#  if !defined(_CCCL_COMPILER_MSVC) // MSVC checks within the builtin
+  _CCCL_ASSERT(reinterpret_cast<uintptr_t>(__ptr) % _Align == 0, "Alignment assumption is violated");
+#  endif // !_CCCL_COMPILER_MSVC
+  return static_cast<_Tp*>(_CCCL_BUILTIN_ASSUME_ALIGNED(__ptr, _Align));
+#else // ^^^ both builtins available ^^^ / vvv no alignment hint possible vvv
+  return __ptr;
+#endif // _CCCL_BUILTIN_IS_CONSTANT_EVALUATED && _CCCL_BUILTIN_ASSUME_ALIGNED
+}
+
+_LIBCUDACXX_END_NAMESPACE_STD
+
+#endif // _LIBCUDACXX___MEMORY_ASSUME_ALIGNED_H
diff --git a/libcudacxx/include/cuda/std/__memory/construct_at.h b/libcudacxx/include/cuda/std/__memory/construct_at.h
index a454ae43d95..abbf1f8ad12 100644
--- a/libcudacxx/include/cuda/std/__memory/construct_at.h
+++ b/libcudacxx/include/cuda/std/__memory/construct_at.h
@@ -4,7 +4,7 @@
 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
 // See https://llvm.org/LICENSE.txt for license information.
 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
-// SPDX-FileCopyrightText: Copyright (c) 2023 NVIDIA CORPORATION & AFFILIATES
+// SPDX-FileCopyrightText: Copyright (c) 2024 NVIDIA CORPORATION & AFFILIATES
 //
 //===----------------------------------------------------------------------===//
 
@@ -185,8 +185,8 @@ _LIBCUDACXX_HIDE_FROM_ABI _CCCL_CONSTEXPR_CXX14 _ForwardIterator __destroy(_Forw
 
 _CCCL_EXEC_CHECK_DISABLE
 template <class _Tp,
-          __enable_if_t<!is_array<_Tp>::value, int>                  = 0,
-          __enable_if_t<!is_trivially_destructible<_Tp>::value, int> = 0>
+          __enable_if_t<!_CCCL_TRAIT(is_array, _Tp), int>                  = 0,
+          __enable_if_t<!_CCCL_TRAIT(is_trivially_destructible, _Tp), int> = 0>
 _LIBCUDACXX_HIDE_FROM_ABI _CCCL_CONSTEXPR_CXX14 void __destroy_at(_Tp* __loc)
 {
   _CCCL_ASSERT(__loc != nullptr, "null pointer given to destroy_at");
@@ -195,22 +195,20 @@ _LIBCUDACXX_HIDE_FROM_ABI _CCCL_CONSTEXPR_CXX14 void __destroy_at(_Tp* __loc)
 
 _CCCL_EXEC_CHECK_DISABLE
 template <class _Tp,
-          __enable_if_t<!is_array<_Tp>::value, int>                 = 0,
-          __enable_if_t<is_trivially_destructible<_Tp>::value, int> = 0>
+          __enable_if_t<!_CCCL_TRAIT(is_array, _Tp), int>                 = 0,
+          __enable_if_t<_CCCL_TRAIT(is_trivially_destructible, _Tp), int> = 0>
 _LIBCUDACXX_HIDE_FROM_ABI _CCCL_CONSTEXPR_CXX14 void __destroy_at(_Tp* __loc)
 {
   _CCCL_ASSERT(__loc != nullptr, "null pointer given to destroy_at");
   (void) __loc;
 }
 
-#if _CCCL_STD_VER >= 2020
-template <class _Tp, __enable_if_t<is_array<_Tp>::value, int> = 0>
-_LIBCUDACXX_HIDE_FROM_ABI constexpr void __destroy_at(_Tp* __loc)
+template <class _Tp, __enable_if_t<_CCCL_TRAIT(is_array, _Tp), int> = 0>
+_LIBCUDACXX_HIDE_FROM_ABI _CCCL_CONSTEXPR_CXX14 void __destroy_at(_Tp* __loc)
 {
   _CCCL_ASSERT(__loc != nullptr, "null pointer given to destroy_at");
   _CUDA_VSTD::__destroy(_CUDA_VSTD::begin(*__loc), _CUDA_VSTD::end(*__loc));
 }
-#endif // _CCCL_STD_VER >= 2020
 
 template <class _ForwardIterator>
 _LIBCUDACXX_HIDE_FROM_ABI _CCCL_CONSTEXPR_CXX14 _ForwardIterator
@@ -235,22 +233,18 @@ __reverse_destroy(_BidirectionalIterator __first, _BidirectionalIterator __last)
   return __last;
 }
 
-#if _CCCL_STD_VER >= 2017
-
-template <class _Tp, enable_if_t<!is_array_v<_Tp>, int> = 0>
-_LIBCUDACXX_HIDE_FROM_ABI _CCCL_CONSTEXPR_CXX20 void destroy_at(_Tp* __loc) noexcept
+template <class _Tp, __enable_if_t<!_CCCL_TRAIT(is_array, _Tp), int> = 0> // non-array overload: invokes ~_Tp()
+_LIBCUDACXX_HIDE_FROM_ABI _CCCL_CONSTEXPR_CXX20 void destroy_at(_Tp* __loc) noexcept
 {
   _CCCL_ASSERT(__loc != nullptr, "null pointer given to destroy_at");
   __loc->~_Tp();
 }
 
-#  if _CCCL_STD_VER >= 2020
-template <class _Tp, enable_if_t<is_array_v<_Tp>, int> = 0>
-_LIBCUDACXX_HIDE_FROM_ABI _CCCL_CONSTEXPR_CXX20 void destroy_at(_Tp* __loc) noexcept
+template <class _Tp, __enable_if_t<_CCCL_TRAIT(is_array, _Tp), int> = 0> // array overload: defers to __destroy_at
+_LIBCUDACXX_HIDE_FROM_ABI _CCCL_CONSTEXPR_CXX20 void destroy_at(_Tp* __loc) noexcept
 {
   _CUDA_VSTD::__destroy_at(__loc);
 }
-#  endif // _CCCL_STD_VER >= 2020
 
 template <class _ForwardIterator>
 _LIBCUDACXX_HIDE_FROM_ABI _CCCL_CONSTEXPR_CXX20 void destroy(_ForwardIterator __first, _ForwardIterator __last) noexcept
@@ -268,8 +262,6 @@ _LIBCUDACXX_HIDE_FROM_ABI _CCCL_CONSTEXPR_CXX20 _ForwardIterator destroy_n(_Forw
   return __first;
 }
 
-#endif // _CCCL_STD_VER >= 2017
-
 _LIBCUDACXX_END_NAMESPACE_STD
 
 #endif // _LIBCUDACXX___MEMORY_CONSTRUCT_AT_H
diff --git a/libcudacxx/include/cuda/std/__memory_ b/libcudacxx/include/cuda/std/__memory_
index 62fb5e000da..648335e96c4 100644
--- a/libcudacxx/include/cuda/std/__memory_
+++ b/libcudacxx/include/cuda/std/__memory_
@@ -8,8 +8,8 @@
 //
 //===----------------------------------------------------------------------===//
 
-#ifndef _CUDA_STD_MEMORY
-#define _CUDA_STD_MEMORY
+#ifndef _CUDA_STD___MEMORY_
+#define _CUDA_STD___MEMORY_
 
 #include <cuda/std/detail/__config>
 
@@ -21,25 +21,14 @@
 #  pragma system_header
 #endif // no system header
 
-#include <cuda/std/__memory/addressof.h>
-#include <cuda/std/__memory/align.h>
 #include <cuda/std/__memory/allocate_at_least.h>
 #include <cuda/std/__memory/allocation_guard.h>
 #include <cuda/std/__memory/allocator.h>
 #include <cuda/std/__memory/allocator_arg_t.h>
 #include <cuda/std/__memory/allocator_traits.h>
-#include <cuda/std/__memory/construct_at.h>
-#include <cuda/std/__memory/pointer_traits.h>
-#include <cuda/std/__memory/uninitialized_algorithms.h>
+#include <cuda/std/__memory/temporary_buffer.h>
 #include <cuda/std/__memory/unique_ptr.h>
 #include <cuda/std/__memory/uses_allocator.h>
+#include <cuda/std/memory>
 
-// standard-mandated includes
-#include <cuda/std/version>
-
-// [memory.syn]
-#ifndef _LIBCUDACXX_HAS_NO_SPACESHIP_OPERATOR
-#  include <cuda/std/compare>
-#endif // !_LIBCUDACXX_HAS_NO_SPACESHIP_OPERATOR
-
-#endif // _CUDA_STD_MEMORY
+#endif // _CUDA_STD___MEMORY_
diff --git a/libcudacxx/include/cuda/std/memory b/libcudacxx/include/cuda/std/memory
new file mode 100644
index 00000000000..e9461d554f5
--- /dev/null
+++ b/libcudacxx/include/cuda/std/memory
@@ -0,0 +1,39 @@
+//===----------------------------------------------------------------------===//
+//
+// Part of libcu++, the C++ Standard Library for your entire system,
+// under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+// SPDX-FileCopyrightText: Copyright (c) 2024 NVIDIA CORPORATION & AFFILIATES.
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef _CUDA_STD_MEMORY
+#define _CUDA_STD_MEMORY
+
+#include <cuda/std/detail/__config>
+
+#if defined(_CCCL_IMPLICIT_SYSTEM_HEADER_GCC)
+#  pragma GCC system_header
+#elif defined(_CCCL_IMPLICIT_SYSTEM_HEADER_CLANG)
+#  pragma clang system_header
+#elif defined(_CCCL_IMPLICIT_SYSTEM_HEADER_MSVC)
+#  pragma system_header
+#endif // no system header
+
+#include <cuda/std/__memory/addressof.h>
+#include <cuda/std/__memory/align.h>
+#include <cuda/std/__memory/assume_aligned.h>
+#include <cuda/std/__memory/construct_at.h>
+#include <cuda/std/__memory/pointer_traits.h>
+#include <cuda/std/__memory/uninitialized_algorithms.h>
+
+// standard-mandated includes
+#include <cuda/std/version>
+
+// [memory.syn]
+#ifndef _LIBCUDACXX_HAS_NO_SPACESHIP_OPERATOR
+#  include <cuda/std/compare>
+#endif // !_LIBCUDACXX_HAS_NO_SPACESHIP_OPERATOR
+
+#endif // _CUDA_STD_MEMORY
diff --git a/libcudacxx/test/libcudacxx/std/containers/sequences/array/size_and_alignment.pass.cpp b/libcudacxx/test/libcudacxx/std/containers/sequences/array/size_and_alignment.pass.cpp
index a5a9ca3813f..d9d0c8211a6 100644
--- a/libcudacxx/test/libcudacxx/std/containers/sequences/array/size_and_alignment.pass.cpp
+++ b/libcudacxx/test/libcudacxx/std/containers/sequences/array/size_and_alignment.pass.cpp
@@ -29,7 +29,7 @@
 TEST_NV_DIAG_SUPPRESS(cuda_demote_unsupported_floating_point)
 
 #if defined(TEST_COMPILER_MSVC)
-#  pragma warning(disable : 4324)
+#  pragma warning(disable : 4324) // structure was padded due to alignment specifier
 #endif // TEST_COMPILER_MSVC
 
 template <class T, cuda::std::size_t Size>
diff --git a/libcudacxx/test/libcudacxx/std/utilities/memory/default.allocator/allocator.members/allocate.pass.cpp b/libcudacxx/test/libcudacxx/std/utilities/memory/default.allocator/allocator.members/allocate.pass.cpp
index bef89c738ad..b30463d75fc 100644
--- a/libcudacxx/test/libcudacxx/std/utilities/memory/default.allocator/allocator.members/allocate.pass.cpp
+++ b/libcudacxx/test/libcudacxx/std/utilities/memory/default.allocator/allocator.members/allocate.pass.cpp
@@ -23,7 +23,7 @@
 #include "test_macros.h"
 
 #if defined(TEST_COMPILER_MSVC)
-#  pragma warning(disable : 4324)
+#  pragma warning(disable : 4324) // structure was padded due to alignment specifier
 #endif // TEST_COMPILER_MSVC
 
 #ifdef TEST_HAS_NO_ALIGNED_ALLOCATION
diff --git a/libcudacxx/test/libcudacxx/std/utilities/memory/ptr.align/align.pass.cpp b/libcudacxx/test/libcudacxx/std/utilities/memory/ptr.align/align.pass.cpp
index 657956cd51f..1916c98b8dc 100644
--- a/libcudacxx/test/libcudacxx/std/utilities/memory/ptr.align/align.pass.cpp
+++ b/libcudacxx/test/libcudacxx/std/utilities/memory/ptr.align/align.pass.cpp
@@ -11,9 +11,9 @@
 
 // void* align(size_t alignment, size_t size, void*& ptr, size_t& space);
 
-#include <cuda/std/__memory_>
 #include <cuda/std/cassert>
 #include <cuda/std/cstddef>
+#include <cuda/std/memory>
 
 #include "test_macros.h"
 
diff --git a/libcudacxx/test/libcudacxx/std/utilities/memory/ptr.align/assume_aligned.nodiscard.fail.cpp b/libcudacxx/test/libcudacxx/std/utilities/memory/ptr.align/assume_aligned.nodiscard.fail.cpp
new file mode 100644
index 00000000000..9d0eebdc4fd
--- /dev/null
+++ b/libcudacxx/test/libcudacxx/std/utilities/memory/ptr.align/assume_aligned.nodiscard.fail.cpp
@@ -0,0 +1,30 @@
+//===----------------------------------------------------------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+// SPDX-FileCopyrightText: Copyright (c) 2024 NVIDIA CORPORATION & AFFILIATES.
+//
+//===----------------------------------------------------------------------===//
+
+// #include <memory>
+
+// template<size_t N, class T>
+// [[nodiscard]] constexpr T* assume_aligned(T* ptr);
+
+// UNSUPPORTED: nvrtc
+// nvrtc currently compiles the test with a warning
+
+#include <cuda/std/memory>
+
+__host__ __device__ void f() // discards the [[nodiscard]] result on purpose so that the compiler diagnoses it
+{
+  int* p = nullptr;
+  // expected-warning@+1 {{ignoring return value of function declared with 'nodiscard' attribute}}
+  cuda::std::assume_aligned<4>(p);
+}
+
+int main(int, char**)
+{
+  return 0;
+}
diff --git a/libcudacxx/test/libcudacxx/std/utilities/memory/ptr.align/assume_aligned.pass.cpp b/libcudacxx/test/libcudacxx/std/utilities/memory/ptr.align/assume_aligned.pass.cpp
new file mode 100644
index 00000000000..36a1f9ede32
--- /dev/null
+++ b/libcudacxx/test/libcudacxx/std/utilities/memory/ptr.align/assume_aligned.pass.cpp
@@ -0,0 +1,117 @@
+//===----------------------------------------------------------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+// SPDX-FileCopyrightText: Copyright (c) 2024 NVIDIA CORPORATION & AFFILIATES.
+//
+//===----------------------------------------------------------------------===//
+
+// #include <memory>
+
+// template<size_t N, class T>
+// [[nodiscard]] constexpr T* assume_aligned(T* ptr);
+
+#include <cuda/std/cassert>
+#include <cuda/std/cstddef>
+#include <cuda/std/memory>
+
+#include "test_macros.h"
+
+#if defined(TEST_COMPILER_MSVC)
+#  pragma warning(disable : 4324) // structure was padded due to alignment specifier
+#endif // TEST_COMPILER_MSVC
+
+template <typename T>
+__host__ __device__ TEST_CONSTEXPR_CXX14 void check(T* p) // exercises assume_aligned<N>(p) for N <= alignof(T)
+{
+  ASSERT_SAME_TYPE(T*, decltype(cuda::std::assume_aligned<1>(p))); // return type must be exactly T*
+  constexpr cuda::std::size_t alignment = alignof(T);
+
+  _CCCL_IF_CONSTEXPR (alignment >= 1) // the guards below walk N = 1, 2, 4, ..., 128
+  {
+    assert(p == cuda::std::assume_aligned<1>(p)); // the call must hand back the same address
+  }
+  _CCCL_IF_CONSTEXPR (alignment >= 2)
+  {
+    assert(p == cuda::std::assume_aligned<2>(p));
+  }
+  _CCCL_IF_CONSTEXPR (alignment >= 4)
+  {
+    assert(p == cuda::std::assume_aligned<4>(p));
+  }
+  _CCCL_IF_CONSTEXPR (alignment >= 8)
+  {
+    assert(p == cuda::std::assume_aligned<8>(p));
+  }
+  _CCCL_IF_CONSTEXPR (alignment >= 16)
+  {
+    assert(p == cuda::std::assume_aligned<16>(p));
+  }
+  _CCCL_IF_CONSTEXPR (alignment >= 32)
+  {
+    assert(p == cuda::std::assume_aligned<32>(p));
+  }
+  _CCCL_IF_CONSTEXPR (alignment >= 64)
+  {
+    assert(p == cuda::std::assume_aligned<64>(p));
+  }
+  _CCCL_IF_CONSTEXPR (alignment >= 128)
+  {
+    assert(p == cuda::std::assume_aligned<128>(p));
+  }
+}
+
+struct S // empty helper type without an alignas specifier
+{};
+struct alignas(4) S4 // S4 ... S128: empty helper types whose alignment is forced to the number in the name
+{};
+struct alignas(8) S8
+{};
+struct alignas(16) S16
+{};
+struct alignas(32) S32
+{};
+struct alignas(64) S64
+{};
+struct alignas(128) S128
+{};
+
+__host__ __device__ TEST_CONSTEXPR_CXX14 bool tests() // runs check() on pointers to local objects of several types
+{
+  char c{}; // fundamental types
+  int i{};
+  long l{};
+  double d{};
+  check(&c);
+  check(&i);
+  check(&l);
+  check(&d);
+
+  S s{}; // user-defined types, with and without alignas
+  S4 s4{};
+  S8 s8{};
+  S16 s16{};
+  S32 s32{};
+  S64 s64{};
+  S128 s128{};
+  check(&s);
+  check(&s4);
+  check(&s8);
+  check(&s16);
+  check(&s32);
+  check(&s64);
+  check(&s128);
+
+  return true; // a bool result lets main() use the call as a static_assert condition
+}
+
+int main(int, char**)
+{
+  tests(); // runtime evaluation
+#if TEST_STD_VER >= 2014
+  static_assert(tests(), ""); // compile-time evaluation of the same checks
+#endif // TEST_STD_VER >= 2014
+
+  return 0;
+}
diff --git a/libcudacxx/test/libcudacxx/std/utilities/memory/sanity.pass.cpp b/libcudacxx/test/libcudacxx/std/utilities/memory/sanity.pass.cpp
index a8c11f98c05..524c1478463 100644
--- a/libcudacxx/test/libcudacxx/std/utilities/memory/sanity.pass.cpp
+++ b/libcudacxx/test/libcudacxx/std/utilities/memory/sanity.pass.cpp
@@ -12,7 +12,7 @@
 
 // template <ObjectType T> T* addressof(T& r);
 
-#include <cuda/std/__memory_>
+#include <cuda/std/memory>
 
 int main(int, char**)
 {
diff --git a/libcudacxx/test/libcudacxx/std/utilities/memory/specialized.algorithms/overload_compare_iterator.h b/libcudacxx/test/libcudacxx/std/utilities/memory/specialized.algorithms/overload_compare_iterator.h
index cfbaf20643c..c72f2732fdf 100644
--- a/libcudacxx/test/libcudacxx/std/utilities/memory/specialized.algorithms/overload_compare_iterator.h
+++ b/libcudacxx/test/libcudacxx/std/utilities/memory/specialized.algorithms/overload_compare_iterator.h
@@ -12,8 +12,8 @@
 #ifndef LIBCUDACXX_TEST_STD_UTILITIES_MEMORY_SPECIALIZED_ALGORITHMS_OVERLOAD_COMPARE_ITERATOR_H
 #define LIBCUDACXX_TEST_STD_UTILITIES_MEMORY_SPECIALIZED_ALGORITHMS_OVERLOAD_COMPARE_ITERATOR_H
 
-#include <cuda/std/__memory_>
 #include <cuda/std/iterator>
+#include <cuda/std/memory>
 
 #include "test_macros.h"
 
diff --git a/libcudacxx/test/libcudacxx/std/utilities/memory/specialized.algorithms/specialized.construct/construct_at.pass.cpp b/libcudacxx/test/libcudacxx/std/utilities/memory/specialized.algorithms/specialized.construct/construct_at.pass.cpp
index 47895fc48f4..fbd34ebde01 100644
--- a/libcudacxx/test/libcudacxx/std/utilities/memory/specialized.algorithms/specialized.construct/construct_at.pass.cpp
+++ b/libcudacxx/test/libcudacxx/std/utilities/memory/specialized.algorithms/specialized.construct/construct_at.pass.cpp
@@ -16,7 +16,7 @@
 
 // #include <cuda/std/memory>
 #include <cuda/std/cassert>
-#include <cuda/std/utility>
+#include <cuda/std/memory>
 
 #include "test_iterators.h"
 #include "test_macros.h"
diff --git a/libcudacxx/test/libcudacxx/std/utilities/memory/specialized.algorithms/specialized.destroy/destroy.pass.cpp b/libcudacxx/test/libcudacxx/std/utilities/memory/specialized.algorithms/specialized.destroy/destroy.pass.cpp
index d1bddf50595..b5313969719 100644
--- a/libcudacxx/test/libcudacxx/std/utilities/memory/specialized.algorithms/specialized.destroy/destroy.pass.cpp
+++ b/libcudacxx/test/libcudacxx/std/utilities/memory/specialized.algorithms/specialized.destroy/destroy.pass.cpp
@@ -7,7 +7,6 @@
 //
 //===----------------------------------------------------------------------===//
 
-// UNSUPPORTED: c++03, c++11, c++14
 // UNSUPPORTED: gcc-6
 
 // <memory>
@@ -17,8 +16,8 @@
 
 // #include <cuda/std/memory>
 #include <cuda/std/cassert>
+#include <cuda/std/memory>
 #include <cuda/std/type_traits>
-#include <cuda/std/utility>
 
 #include "test_iterators.h"
 #include "test_macros.h"
@@ -26,12 +25,12 @@
 struct Counted
 {
   int* counter_ = nullptr;
-  __host__ __device__ TEST_CONSTEXPR Counted(int* counter)
+  __host__ __device__ TEST_CONSTEXPR_CXX14 Counted(int* counter)
       : counter_(counter)
   {
     ++*counter_;
   }
-  __host__ __device__ TEST_CONSTEXPR Counted(Counted const& other)
+  __host__ __device__ TEST_CONSTEXPR_CXX14 Counted(Counted const& other)
       : counter_(other.counter_)
   {
     ++*counter_;
@@ -43,8 +42,7 @@ struct Counted
   __host__ __device__ friend void operator&(Counted) = delete;
 };
 
-#if TEST_STD_VER > 2017
-__host__ __device__ constexpr bool test_arrays()
+__host__ __device__ TEST_CONSTEXPR_CXX20 bool test_arrays()
 {
   {
     int counter     = 0;
@@ -83,7 +81,6 @@ __host__ __device__ constexpr bool test_arrays()
 
   return true;
 }
-#endif
 
 template <class It>
 __host__ __device__ TEST_CONSTEXPR_CXX20 void test()
@@ -113,8 +110,8 @@ __host__ __device__ TEST_CONSTEXPR_CXX20 bool tests()
 int main(int, char**)
 {
   tests();
-#if TEST_STD_VER > 2017
   test_arrays();
+#if TEST_STD_VER > 2017
 #  if !defined(TEST_COMPILER_NVRTC)
 #    if (defined(TEST_COMPILER_CLANG) && __clang_major__ > 10) || (defined(TEST_COMPILER_GCC) && __GNUC__ > 9) \
       || defined(TEST_COMPILER_MSVC_2022) || defined(TEST_COMPILER_NVHPC)
diff --git a/libcudacxx/test/libcudacxx/std/utilities/memory/specialized.algorithms/specialized.destroy/destroy_at.pass.cpp b/libcudacxx/test/libcudacxx/std/utilities/memory/specialized.algorithms/specialized.destroy/destroy_at.pass.cpp
index 8a5c1731688..6426a1dd14a 100644
--- a/libcudacxx/test/libcudacxx/std/utilities/memory/specialized.algorithms/specialized.destroy/destroy_at.pass.cpp
+++ b/libcudacxx/test/libcudacxx/std/utilities/memory/specialized.algorithms/specialized.destroy/destroy_at.pass.cpp
@@ -7,7 +7,6 @@
 //
 //===----------------------------------------------------------------------===//
 
-// UNSUPPORTED: c++03, c++11, c++14
 // UNSUPPORTED: gcc-6
 
 // <memory>
@@ -17,15 +16,15 @@
 
 // #include <cuda/std/memory>
 #include <cuda/std/cassert>
+#include <cuda/std/memory>
 #include <cuda/std/type_traits>
-#include <cuda/std/utility>
 
 #include "test_macros.h"
 
 struct Counted
 {
   int* counter_;
-  __host__ __device__ TEST_CONSTEXPR Counted(int* counter)
+  __host__ __device__ TEST_CONSTEXPR_CXX14 Counted(int* counter)
       : counter_(counter)
   {
     ++*counter_;
@@ -40,7 +39,7 @@ struct Counted
 struct VirtualCounted
 {
   int* counter_;
-  __host__ __device__ TEST_CONSTEXPR VirtualCounted(int* counter)
+  __host__ __device__ TEST_CONSTEXPR_CXX14 VirtualCounted(int* counter)
       : counter_(counter)
   {
     ++*counter_;
@@ -54,14 +53,13 @@ struct VirtualCounted
 
 struct DerivedCounted : VirtualCounted
 {
-  __host__ __device__ TEST_CONSTEXPR DerivedCounted(int* counter)
+  __host__ __device__ TEST_CONSTEXPR_CXX14 DerivedCounted(int* counter)
       : VirtualCounted(counter)
   {}
   __host__ __device__ TEST_CONSTEXPR_CXX20 ~DerivedCounted() override {}
 };
 
-#if TEST_STD_VER > 2017
-__host__ __device__ constexpr bool test_arrays()
+__host__ __device__ TEST_CONSTEXPR_CXX20 bool test_arrays()
 {
   {
     int counter    = 0;
@@ -98,7 +96,6 @@ __host__ __device__ constexpr bool test_arrays()
   }
   return true;
 }
-#endif
 
 __host__ __device__ TEST_CONSTEXPR_CXX20 bool test()
 {
@@ -143,8 +140,8 @@ __host__ __device__ TEST_CONSTEXPR_CXX20 bool test()
 int main(int, char**)
 {
   test();
-#if TEST_STD_VER > 2017
   test_arrays();
+#if TEST_STD_VER > 2017
 #  if !defined(TEST_COMPILER_NVRTC)
 #    if (defined(TEST_COMPILER_CLANG) && __clang_major__ > 10) || (defined(TEST_COMPILER_GCC) && __GNUC__ > 9) \
       || defined(TEST_COMPILER_MSVC_2022) || defined(TEST_COMPILER_NVHPC)
diff --git a/libcudacxx/test/libcudacxx/std/utilities/memory/specialized.algorithms/specialized.destroy/destroy_n.pass.cpp b/libcudacxx/test/libcudacxx/std/utilities/memory/specialized.algorithms/specialized.destroy/destroy_n.pass.cpp
index 18cb94b049c..041cd313565 100644
--- a/libcudacxx/test/libcudacxx/std/utilities/memory/specialized.algorithms/specialized.destroy/destroy_n.pass.cpp
+++ b/libcudacxx/test/libcudacxx/std/utilities/memory/specialized.algorithms/specialized.destroy/destroy_n.pass.cpp
@@ -7,7 +7,6 @@
 //
 //===----------------------------------------------------------------------===//
 
-// UNSUPPORTED: c++03, c++11, c++14
 // UNSUPPORTED: gcc-6
 
 // <memory>
@@ -17,8 +16,8 @@
 
 // #include <cuda/std/memory>
 #include <cuda/std/cassert>
+#include <cuda/std/memory>
 #include <cuda/std/type_traits>
-#include <cuda/std/utility>
 
 #include "test_iterators.h"
 #include "test_macros.h"
@@ -26,12 +25,12 @@
 struct Counted
 {
   int* counter_ = nullptr;
-  __host__ __device__ TEST_CONSTEXPR Counted(int* counter)
+  __host__ __device__ TEST_CONSTEXPR_CXX14 Counted(int* counter)
       : counter_(counter)
   {
     ++*counter_;
   }
-  __host__ __device__ TEST_CONSTEXPR Counted(Counted const& other)
+  __host__ __device__ TEST_CONSTEXPR_CXX14 Counted(Counted const& other)
       : counter_(other.counter_)
   {
     ++*counter_;
@@ -43,8 +42,7 @@ struct Counted
   __host__ __device__ friend void operator&(Counted) = delete;
 };
 
-#if TEST_STD_VER > 2017
-__host__ __device__ constexpr bool test_arrays()
+__host__ __device__ TEST_CONSTEXPR_CXX20 bool test_arrays()
 {
   {
     int counter     = 0;
@@ -70,7 +68,6 @@ __host__ __device__ constexpr bool test_arrays()
 
   return true;
 }
-#endif
 
 template <class It>
 __host__ __device__ TEST_CONSTEXPR_CXX20 void test()
@@ -102,8 +99,8 @@ __host__ __device__ TEST_CONSTEXPR_CXX20 bool tests()
 int main(int, char**)
 {
   tests();
-#if TEST_STD_VER > 2017
   test_arrays();
+#if TEST_STD_VER > 2017
 #  if !defined(TEST_COMPILER_NVRTC)
 #    if (defined(TEST_COMPILER_CLANG) && __clang_major__ > 10) || (defined(TEST_COMPILER_GCC) && __GNUC__ > 9) \
       || defined(TEST_COMPILER_MSVC_2022) || defined(TEST_COMPILER_NVHPC)
diff --git a/libcudacxx/test/libcudacxx/std/utilities/memory/specialized.algorithms/uninitialized.construct.default/uninitialized_default_construct.pass.cpp b/libcudacxx/test/libcudacxx/std/utilities/memory/specialized.algorithms/uninitialized.construct.default/uninitialized_default_construct.pass.cpp
index 865ac56be00..00335d07f67 100644
--- a/libcudacxx/test/libcudacxx/std/utilities/memory/specialized.algorithms/uninitialized.construct.default/uninitialized_default_construct.pass.cpp
+++ b/libcudacxx/test/libcudacxx/std/utilities/memory/specialized.algorithms/uninitialized.construct.default/uninitialized_default_construct.pass.cpp
@@ -13,9 +13,9 @@
 // template <class ForwardIt>
 // void uninitialized_default_construct(ForwardIt, ForwardIt);
 
-#include <cuda/std/__memory_>
 #include <cuda/std/cassert>
 #include <cuda/std/cstdlib>
+#include <cuda/std/memory>
 
 #include "test_iterators.h"
 #include "test_macros.h"
diff --git a/libcudacxx/test/libcudacxx/std/utilities/memory/specialized.algorithms/uninitialized.construct.default/uninitialized_default_construct_n.pass.cpp b/libcudacxx/test/libcudacxx/std/utilities/memory/specialized.algorithms/uninitialized.construct.default/uninitialized_default_construct_n.pass.cpp
index d0e47f0572e..b79de9f3623 100644
--- a/libcudacxx/test/libcudacxx/std/utilities/memory/specialized.algorithms/uninitialized.construct.default/uninitialized_default_construct_n.pass.cpp
+++ b/libcudacxx/test/libcudacxx/std/utilities/memory/specialized.algorithms/uninitialized.construct.default/uninitialized_default_construct_n.pass.cpp
@@ -13,9 +13,9 @@
 // template <class ForwardIt>
 // void uninitialized_default_construct(ForwardIt, ForwardIt);
 
-#include <cuda/std/__memory_>
 #include <cuda/std/cassert>
 #include <cuda/std/cstdlib>
+#include <cuda/std/memory>
 
 #include "test_iterators.h"
 #include "test_macros.h"
diff --git a/libcudacxx/test/libcudacxx/std/utilities/memory/specialized.algorithms/uninitialized.construct.value/uninitialized_value_construct.pass.cpp b/libcudacxx/test/libcudacxx/std/utilities/memory/specialized.algorithms/uninitialized.construct.value/uninitialized_value_construct.pass.cpp
index fe588e4ed44..72bd561d83f 100644
--- a/libcudacxx/test/libcudacxx/std/utilities/memory/specialized.algorithms/uninitialized.construct.value/uninitialized_value_construct.pass.cpp
+++ b/libcudacxx/test/libcudacxx/std/utilities/memory/specialized.algorithms/uninitialized.construct.value/uninitialized_value_construct.pass.cpp
@@ -13,9 +13,9 @@
 // template <class ForwardIt>
 // void uninitialized_value_construct(ForwardIt, ForwardIt);
 
-#include <cuda/std/__memory_>
 #include <cuda/std/cassert>
 #include <cuda/std/cstdlib>
+#include <cuda/std/memory>
 
 #include "test_iterators.h"
 #include "test_macros.h"
diff --git a/libcudacxx/test/libcudacxx/std/utilities/memory/specialized.algorithms/uninitialized.construct.value/uninitialized_value_construct_n.pass.cpp b/libcudacxx/test/libcudacxx/std/utilities/memory/specialized.algorithms/uninitialized.construct.value/uninitialized_value_construct_n.pass.cpp
index 0a585aa8919..9c899a877bc 100644
--- a/libcudacxx/test/libcudacxx/std/utilities/memory/specialized.algorithms/uninitialized.construct.value/uninitialized_value_construct_n.pass.cpp
+++ b/libcudacxx/test/libcudacxx/std/utilities/memory/specialized.algorithms/uninitialized.construct.value/uninitialized_value_construct_n.pass.cpp
@@ -13,9 +13,9 @@
 // template <class ForwardIt>
 // void uninitialized_value_construct(ForwardIt, ForwardIt);
 
-#include <cuda/std/__memory_>
 #include <cuda/std/cassert>
 #include <cuda/std/cstdlib>
+#include <cuda/std/memory>
 
 #include "test_iterators.h"
 #include "test_macros.h"
diff --git a/libcudacxx/test/libcudacxx/std/utilities/memory/specialized.algorithms/uninitialized.copy/uninitialized_copy.pass.cpp b/libcudacxx/test/libcudacxx/std/utilities/memory/specialized.algorithms/uninitialized.copy/uninitialized_copy.pass.cpp
index 8d1f9acc87c..343ed8f45dc 100644
--- a/libcudacxx/test/libcudacxx/std/utilities/memory/specialized.algorithms/uninitialized.copy/uninitialized_copy.pass.cpp
+++ b/libcudacxx/test/libcudacxx/std/utilities/memory/specialized.algorithms/uninitialized.copy/uninitialized_copy.pass.cpp
@@ -15,8 +15,8 @@
 //   uninitialized_copy(InputIterator first, InputIterator last,
 //                      ForwardIterator result);
 
-#include <cuda/std/__memory_>
 #include <cuda/std/cassert>
+#include <cuda/std/memory>
 
 #include "../overload_compare_iterator.h"
 #include "test_macros.h"
diff --git a/libcudacxx/test/libcudacxx/std/utilities/memory/specialized.algorithms/uninitialized.copy/uninitialized_copy_n.pass.cpp b/libcudacxx/test/libcudacxx/std/utilities/memory/specialized.algorithms/uninitialized.copy/uninitialized_copy_n.pass.cpp
index b3d9f11dad9..08b17419908 100644
--- a/libcudacxx/test/libcudacxx/std/utilities/memory/specialized.algorithms/uninitialized.copy/uninitialized_copy_n.pass.cpp
+++ b/libcudacxx/test/libcudacxx/std/utilities/memory/specialized.algorithms/uninitialized.copy/uninitialized_copy_n.pass.cpp
@@ -15,8 +15,8 @@
 //   uninitialized_copy_n(InputIterator first, Size n,
 //                        ForwardIterator result);
 
-#include <cuda/std/__memory_>
 #include <cuda/std/cassert>
+#include <cuda/std/memory>
 
 #include "../overload_compare_iterator.h"
 #include "test_macros.h"
diff --git a/libcudacxx/test/libcudacxx/std/utilities/memory/specialized.algorithms/uninitialized.fill.n/uninitialized_fill_n.pass.cpp b/libcudacxx/test/libcudacxx/std/utilities/memory/specialized.algorithms/uninitialized.fill.n/uninitialized_fill_n.pass.cpp
index b1e3388c3b5..ba31f26e963 100644
--- a/libcudacxx/test/libcudacxx/std/utilities/memory/specialized.algorithms/uninitialized.fill.n/uninitialized_fill_n.pass.cpp
+++ b/libcudacxx/test/libcudacxx/std/utilities/memory/specialized.algorithms/uninitialized.fill.n/uninitialized_fill_n.pass.cpp
@@ -14,8 +14,8 @@
 //   ForwardIterator
 //   uninitialized_fill_n(ForwardIterator first, Size n, const T& x);
 
-#include <cuda/std/__memory_>
 #include <cuda/std/cassert>
+#include <cuda/std/memory>
 
 #include "test_macros.h"
 
diff --git a/libcudacxx/test/libcudacxx/std/utilities/memory/specialized.algorithms/uninitialized.fill/uninitialized_fill.pass.cpp b/libcudacxx/test/libcudacxx/std/utilities/memory/specialized.algorithms/uninitialized.fill/uninitialized_fill.pass.cpp
index d3b081bec12..9a9fd1e3c14 100644
--- a/libcudacxx/test/libcudacxx/std/utilities/memory/specialized.algorithms/uninitialized.fill/uninitialized_fill.pass.cpp
+++ b/libcudacxx/test/libcudacxx/std/utilities/memory/specialized.algorithms/uninitialized.fill/uninitialized_fill.pass.cpp
@@ -15,8 +15,8 @@
 //   uninitialized_fill(ForwardIterator first, ForwardIterator last,
 //                      const T& x);
 
-#include <cuda/std/__memory_>
 #include <cuda/std/cassert>
+#include <cuda/std/memory>
 
 #include "test_macros.h"
 
diff --git a/libcudacxx/test/libcudacxx/std/utilities/memory/specialized.algorithms/uninitialized.move/uninitialized_move.pass.cpp b/libcudacxx/test/libcudacxx/std/utilities/memory/specialized.algorithms/uninitialized.move/uninitialized_move.pass.cpp
index 23a8590efa2..6a1bd2914bd 100644
--- a/libcudacxx/test/libcudacxx/std/utilities/memory/specialized.algorithms/uninitialized.move/uninitialized_move.pass.cpp
+++ b/libcudacxx/test/libcudacxx/std/utilities/memory/specialized.algorithms/uninitialized.move/uninitialized_move.pass.cpp
@@ -13,9 +13,9 @@
 // template <class InputIt, class ForwardIt>
 // ForwardIt uninitialized_move(InputIt, InputIt, ForwardIt);
 
-#include <cuda/std/__memory_>
 #include <cuda/std/cassert>
 #include <cuda/std/cstdlib>
+#include <cuda/std/memory>
 
 #include "../overload_compare_iterator.h"
 #include "test_iterators.h"
diff --git a/libcudacxx/test/libcudacxx/std/utilities/memory/specialized.algorithms/uninitialized.move/uninitialized_move_n.pass.cpp b/libcudacxx/test/libcudacxx/std/utilities/memory/specialized.algorithms/uninitialized.move/uninitialized_move_n.pass.cpp
index 55cae15bf85..d420c1f3b4c 100644
--- a/libcudacxx/test/libcudacxx/std/utilities/memory/specialized.algorithms/uninitialized.move/uninitialized_move_n.pass.cpp
+++ b/libcudacxx/test/libcudacxx/std/utilities/memory/specialized.algorithms/uninitialized.move/uninitialized_move_n.pass.cpp
@@ -13,9 +13,9 @@
 // template <class InputIt, class Size, class ForwardIt>
 // pair<InputIt, ForwardIt> uninitialized_move_n(InputIt, Size, ForwardIt);
 
-#include <cuda/std/__memory_>
 #include <cuda/std/cassert>
 #include <cuda/std/cstdlib>
+#include <cuda/std/memory>
 
 #include "../overload_compare_iterator.h"
 #include "test_iterators.h"