diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml
index 5eb43784063..fff9035a2bb 100644
--- a/.pre-commit-config.yaml
+++ b/.pre-commit-config.yaml
@@ -20,7 +20,20 @@ repos:
     rev: v17.0.6
     hooks:
       - id: clang-format
-        types_or: [c, c++, cuda]
+        types_or: [file]
+        files: |
+          (?x)^(
+            ^.*\.c$|
+            ^.*\.cpp$|
+            ^.*\.cu$|
+            ^.*\.cuh$|
+            ^.*\.cxx$|
+            ^.*\.h$|
+            ^.*\.hpp$|
+            ^.*\.inl$|
+            ^.*\.mm$|
+            ^libcudacxx/include/.*/[^.]*$
+          )
         args: ["-fallback-style=none", "-style=file", "-i"]
 
 default_language_version:
diff --git a/libcudacxx/include/cuda/annotated_ptr b/libcudacxx/include/cuda/annotated_ptr
index bd9f26ad591..f5e04e56623 100644
--- a/libcudacxx/include/cuda/annotated_ptr
+++ b/libcudacxx/include/cuda/annotated_ptr
@@ -3,50 +3,128 @@
  *
  * NVIDIA SOFTWARE LICENSE
  *
- * This license is a legal agreement between you and NVIDIA Corporation ("NVIDIA") and governs your use of the NVIDIA/CUDA C++ Library software and materials provided hereunder (“SOFTWARE”).
+ * This license is a legal agreement between you and NVIDIA Corporation ("NVIDIA") and governs your use of the
+ * NVIDIA/CUDA C++ Library software and materials provided hereunder (“SOFTWARE”).
  *
- * This license can be accepted only by an adult of legal age of majority in the country in which the SOFTWARE is used. If you are under the legal age of majority, you must ask your parent or legal guardian to consent to this license. By taking delivery of the SOFTWARE, you affirm that you have reached the legal age of majority, you accept the terms of this license, and you take legal and financial responsibility for the actions of your permitted users.
+ * This license can be accepted only by an adult of legal age of majority in the country in which the SOFTWARE is used.
+ * If you are under the legal age of majority, you must ask your parent or legal guardian to consent to this license. By
+ * taking delivery of the SOFTWARE, you affirm that you have reached the legal age of majority, you accept the terms of
+ * this license, and you take legal and financial responsibility for the actions of your permitted users.
  *
- * You agree to use the SOFTWARE only for purposes that are permitted by (a) this license, and (b) any applicable law, regulation or generally accepted practices or guidelines in the relevant jurisdictions.
+ * You agree to use the SOFTWARE only for purposes that are permitted by (a) this license, and (b) any applicable law,
+ * regulation or generally accepted practices or guidelines in the relevant jurisdictions.
  *
- * 1. LICENSE. Subject to the terms of this license, NVIDIA grants you a non-exclusive limited license to: (a) install and use the SOFTWARE, and (b) distribute the SOFTWARE subject to the distribution requirements described in this license. NVIDIA reserves all rights, title and interest in and to the SOFTWARE not expressly granted to you under this license.
+ * 1. LICENSE. Subject to the terms of this license, NVIDIA grants you a non-exclusive limited license to: (a) install
+ * and use the SOFTWARE, and (b) distribute the SOFTWARE subject to the distribution requirements described in this
+ * license. NVIDIA reserves all rights, title and interest in and to the SOFTWARE not expressly granted to you under
+ * this license.
  *
  * 2. DISTRIBUTION REQUIREMENTS. These are the distribution requirements for you to exercise the distribution grant:
- *          a.
The terms under which you distribute the SOFTWARE must be consistent with the terms of this license, including (without limitation) terms relating to the license grant and license restrictions and protection of NVIDIA’s intellectual property rights. - * b. You agree to notify NVIDIA in writing of any known or suspected distribution or use of the SOFTWARE not in compliance with the requirements of this license, and to enforce the terms of your agreements with respect to distributed SOFTWARE. + * a. The terms under which you distribute the SOFTWARE must be consistent with the terms of this license, + * including (without limitation) terms relating to the license grant and license restrictions and protection of + * NVIDIA’s intellectual property rights. b. You agree to notify NVIDIA in writing of any known or suspected + * distribution or use of the SOFTWARE not in compliance with the requirements of this license, and to enforce the terms + * of your agreements with respect to distributed SOFTWARE. * * 3. LIMITATIONS. Your license to use the SOFTWARE is restricted as follows: * a. The SOFTWARE is licensed for you to develop applications only for use in systems with NVIDIA GPUs. - * b. You may not reverse engineer, decompile or disassemble, or remove copyright or other proprietary notices from any portion of the SOFTWARE or copies of the SOFTWARE. - * c. You may not modify or create derivative works of any portion of the SOFTWARE. - * d. You may not bypass, disable, or circumvent any technical measure, encryption, security, digital rights management or authentication mechanism in the SOFTWARE. - * e. You may not use the SOFTWARE in any manner that would cause it to become subject to an open source software license. As examples, licenses that require as a condition of use, modification, and/or distribution that the SOFTWARE be (i) disclosed or distributed in source code form; (ii) licensed for the purpose of making derivative works; or (iii) redistributable at no charge. - * f. Unless you have an agreement with NVIDIA for this purpose, you may not use the SOFTWARE with any system or application where the use or failure of the system or application can reasonably be expected to threaten or result in personal injury, death, or catastrophic loss. Examples include use in avionics, navigation, military, medical, life support or other life critical applications. NVIDIA does not design, test or manufacture the SOFTWARE for these critical uses and NVIDIA shall not be liable to you or any third party, in whole or in part, for any claims or damages arising from such uses. - * g. You agree to defend, indemnify and hold harmless NVIDIA and its affiliates, and their respective employees, contractors, agents, officers and directors, from and against any and all claims, damages, obligations, losses, liabilities, costs or debt, fines, restitutions and expenses (including but not limited to attorney’s fees and costs incident to establishing the right of indemnification) arising out of or related to use of the SOFTWARE outside of the scope of this Agreement, or not in compliance with its terms. + * b. You may not reverse engineer, decompile or disassemble, or remove copyright or other proprietary notices from + * any portion of the SOFTWARE or copies of the SOFTWARE. c. You may not modify or create derivative works of any + * portion of the SOFTWARE. d. 
You may not bypass, disable, or circumvent any technical measure, encryption, + * security, digital rights management or authentication mechanism in the SOFTWARE. e. You may not use the SOFTWARE + * in any manner that would cause it to become subject to an open source software license. As examples, licenses that + * require as a condition of use, modification, and/or distribution that the SOFTWARE be (i) disclosed or distributed in + * source code form; (ii) licensed for the purpose of making derivative works; or (iii) redistributable at no charge. f. + * Unless you have an agreement with NVIDIA for this purpose, you may not use the SOFTWARE with any system or + * application where the use or failure of the system or application can reasonably be expected to threaten or result in + * personal injury, death, or catastrophic loss. Examples include use in avionics, navigation, military, medical, life + * support or other life critical applications. NVIDIA does not design, test or manufacture the SOFTWARE for these + * critical uses and NVIDIA shall not be liable to you or any third party, in whole or in part, for any claims or + * damages arising from such uses. g. You agree to defend, indemnify and hold harmless NVIDIA and its affiliates, + * and their respective employees, contractors, agents, officers and directors, from and against any and all claims, + * damages, obligations, losses, liabilities, costs or debt, fines, restitutions and expenses (including but not limited + * to attorney’s fees and costs incident to establishing the right of indemnification) arising out of or related to use + * of the SOFTWARE outside of the scope of this Agreement, or not in compliance with its terms. * - * 4. PRE-RELEASE. SOFTWARE versions identified as alpha, beta, preview, early access or otherwise as pre-release may not be fully functional, may contain errors or design flaws, and may have reduced or different security, privacy, availability, and reliability standards relative to commercial versions of NVIDIA software and materials. You may use a pre-release SOFTWARE version at your own risk, understanding that these versions are not intended for use in production or business-critical systems. + * 4. PRE-RELEASE. SOFTWARE versions identified as alpha, beta, preview, early access or otherwise as pre-release may + * not be fully functional, may contain errors or design flaws, and may have reduced or different security, privacy, + * availability, and reliability standards relative to commercial versions of NVIDIA software and materials. You may use + * a pre-release SOFTWARE version at your own risk, understanding that these versions are not intended for use in + * production or business-critical systems. * - * 5. OWNERSHIP. The SOFTWARE and the related intellectual property rights therein are and will remain the sole and exclusive property of NVIDIA or its licensors. The SOFTWARE is copyrighted and protected by the laws of the United States and other countries, and international treaty provisions. NVIDIA may make changes to the SOFTWARE, at any time without notice, but is not obligated to support or update the SOFTWARE. + * 5. OWNERSHIP. The SOFTWARE and the related intellectual property rights therein are and will remain the sole and + * exclusive property of NVIDIA or its licensors. The SOFTWARE is copyrighted and protected by the laws of the United + * States and other countries, and international treaty provisions. 
NVIDIA may make changes to the SOFTWARE, at any time + * without notice, but is not obligated to support or update the SOFTWARE. * - * 6. COMPONENTS UNDER OTHER LICENSES. The SOFTWARE may include NVIDIA or third-party components with separate legal notices or terms as may be described in proprietary notices accompanying the SOFTWARE. If and to the extent there is a conflict between the terms in this license and the license terms associated with a component, the license terms associated with the components control only to the extent necessary to resolve the conflict. + * 6. COMPONENTS UNDER OTHER LICENSES. The SOFTWARE may include NVIDIA or third-party components with separate legal + * notices or terms as may be described in proprietary notices accompanying the SOFTWARE. If and to the extent there is + * a conflict between the terms in this license and the license terms associated with a component, the license terms + * associated with the components control only to the extent necessary to resolve the conflict. * - * 7. FEEDBACK. You may, but don’t have to, provide to NVIDIA any Feedback. “Feedback” means any suggestions, bug fixes, enhancements, modifications, feature requests or other feedback regarding the SOFTWARE. For any Feedback that you voluntarily provide, you hereby grant NVIDIA and its affiliates a perpetual, non-exclusive, worldwide, irrevocable license to use, reproduce, modify, license, sublicense (through multiple tiers of sublicensees), and distribute (through multiple tiers of distributors) the Feedback without the payment of any royalties or fees to you. NVIDIA will use Feedback at its choice. + * 7. FEEDBACK. You may, but don’t have to, provide to NVIDIA any Feedback. “Feedback” means any suggestions, bug fixes, + * enhancements, modifications, feature requests or other feedback regarding the SOFTWARE. For any Feedback that you + * voluntarily provide, you hereby grant NVIDIA and its affiliates a perpetual, non-exclusive, worldwide, irrevocable + * license to use, reproduce, modify, license, sublicense (through multiple tiers of sublicensees), and distribute + * (through multiple tiers of distributors) the Feedback without the payment of any royalties or fees to you. NVIDIA + * will use Feedback at its choice. * - * 8. NO WARRANTIES. THE SOFTWARE IS PROVIDED "AS IS" WITHOUT ANY EXPRESS OR IMPLIED WARRANTY OF ANY KIND INCLUDING, BUT NOT LIMITED TO, WARRANTIES OF MERCHANTABILITY, NONINFRINGEMENT, OR FITNESS FOR A PARTICULAR PURPOSE. NVIDIA DOES NOT WARRANT THAT THE SOFTWARE WILL MEET YOUR REQUIREMENTS OR THAT THE OPERATION THEREOF WILL BE UNINTERRUPTED OR ERROR-FREE, OR THAT ALL ERRORS WILL BE CORRECTED. + * 8. NO WARRANTIES. THE SOFTWARE IS PROVIDED "AS IS" WITHOUT ANY EXPRESS OR IMPLIED WARRANTY OF ANY KIND INCLUDING, BUT + * NOT LIMITED TO, WARRANTIES OF MERCHANTABILITY, NONINFRINGEMENT, OR FITNESS FOR A PARTICULAR PURPOSE. NVIDIA DOES NOT + * WARRANT THAT THE SOFTWARE WILL MEET YOUR REQUIREMENTS OR THAT THE OPERATION THEREOF WILL BE UNINTERRUPTED OR + * ERROR-FREE, OR THAT ALL ERRORS WILL BE CORRECTED. * - * 9. LIMITATIONS OF LIABILITY. 
TO THE MAXIMUM EXTENT PERMITTED BY LAW, NVIDIA AND ITS AFFILIATES SHALL NOT BE LIABLE FOR ANY SPECIAL, INCIDENTAL, PUNITIVE OR CONSEQUENTIAL DAMAGES, OR ANY LOST PROFITS, PROJECT DELAYS, LOSS OF USE, LOSS OF DATA OR LOSS OF GOODWILL, OR THE COSTS OF PROCURING SUBSTITUTE PRODUCTS, ARISING OUT OF OR IN CONNECTION WITH THIS LICENSE OR THE USE OR PERFORMANCE OF THE SOFTWARE, WHETHER SUCH LIABILITY ARISES FROM ANY CLAIM BASED UPON BREACH OF CONTRACT, BREACH OF WARRANTY, TORT (INCLUDING NEGLIGENCE), PRODUCT LIABILITY OR ANY OTHER CAUSE OF ACTION OR THEORY OF LIABILITY, EVEN IF NVIDIA HAS PREVIOUSLY BEEN ADVISED OF, OR COULD REASONABLY HAVE FORESEEN, THE POSSIBILITY OF SUCH DAMAGES. IN NO EVENT WILL NVIDIA’S AND ITS AFFILIATES TOTAL CUMULATIVE LIABILITY UNDER OR ARISING OUT OF THIS LICENSE EXCEED US$10.00. THE NATURE OF THE LIABILITY OR THE NUMBER OF CLAIMS OR SUITS SHALL NOT ENLARGE OR EXTEND THIS LIMIT. + * 9. LIMITATIONS OF LIABILITY. TO THE MAXIMUM EXTENT PERMITTED BY LAW, NVIDIA AND ITS AFFILIATES SHALL NOT BE LIABLE + * FOR ANY SPECIAL, INCIDENTAL, PUNITIVE OR CONSEQUENTIAL DAMAGES, OR ANY LOST PROFITS, PROJECT DELAYS, LOSS OF USE, + * LOSS OF DATA OR LOSS OF GOODWILL, OR THE COSTS OF PROCURING SUBSTITUTE PRODUCTS, ARISING OUT OF OR IN CONNECTION WITH + * THIS LICENSE OR THE USE OR PERFORMANCE OF THE SOFTWARE, WHETHER SUCH LIABILITY ARISES FROM ANY CLAIM BASED UPON + * BREACH OF CONTRACT, BREACH OF WARRANTY, TORT (INCLUDING NEGLIGENCE), PRODUCT LIABILITY OR ANY OTHER CAUSE OF ACTION + * OR THEORY OF LIABILITY, EVEN IF NVIDIA HAS PREVIOUSLY BEEN ADVISED OF, OR COULD REASONABLY HAVE FORESEEN, THE + * POSSIBILITY OF SUCH DAMAGES. IN NO EVENT WILL NVIDIA’S AND ITS AFFILIATES TOTAL CUMULATIVE LIABILITY UNDER OR ARISING + * OUT OF THIS LICENSE EXCEED US$10.00. THE NATURE OF THE LIABILITY OR THE NUMBER OF CLAIMS OR SUITS SHALL NOT ENLARGE + * OR EXTEND THIS LIMIT. * - * 10. TERMINATION. Your rights under this license will terminate automatically without notice from NVIDIA if you fail to comply with any term and condition of this license or if you commence or participate in any legal proceeding against NVIDIA with respect to the SOFTWARE. NVIDIA may terminate this license with advance written notice to you if NVIDIA decides to no longer provide the SOFTWARE in a country or, in NVIDIA’s sole discretion, the continued use of it is no longer commercially viable. Upon any termination of this license, you agree to promptly discontinue use of the SOFTWARE and destroy all copies in your possession or control. Your prior distributions in accordance with this license are not affected by the termination of this license. All provisions of this license will survive termination, except for the license granted to you. + * 10. TERMINATION. Your rights under this license will terminate automatically without notice from NVIDIA if you fail + * to comply with any term and condition of this license or if you commence or participate in any legal proceeding + * against NVIDIA with respect to the SOFTWARE. NVIDIA may terminate this license with advance written notice to you if + * NVIDIA decides to no longer provide the SOFTWARE in a country or, in NVIDIA’s sole discretion, the continued use of + * it is no longer commercially viable. Upon any termination of this license, you agree to promptly discontinue use of + * the SOFTWARE and destroy all copies in your possession or control. Your prior distributions in accordance with this + * license are not affected by the termination of this license. 
All provisions of this license will survive termination, + * except for the license granted to you. * - * 11. APPLICABLE LAW. This license will be governed in all respects by the laws of the United States and of the State of Delaware as those laws are applied to contracts entered into and performed entirely within Delaware by Delaware residents, without regard to the conflicts of laws principles. The United Nations Convention on Contracts for the International Sale of Goods is specifically disclaimed. You agree to all terms of this Agreement in the English language. The state or federal courts residing in Santa Clara County, California shall have exclusive jurisdiction over any dispute or claim arising out of this license. Notwithstanding this, you agree that NVIDIA shall still be allowed to apply for injunctive remedies or an equivalent type of urgent legal relief in any jurisdiction. + * 11. APPLICABLE LAW. This license will be governed in all respects by the laws of the United States and of the State + * of Delaware as those laws are applied to contracts entered into and performed entirely within Delaware by Delaware + * residents, without regard to the conflicts of laws principles. The United Nations Convention on Contracts for the + * International Sale of Goods is specifically disclaimed. You agree to all terms of this Agreement in the English + * language. The state or federal courts residing in Santa Clara County, California shall have exclusive jurisdiction + * over any dispute or claim arising out of this license. Notwithstanding this, you agree that NVIDIA shall still be + * allowed to apply for injunctive remedies or an equivalent type of urgent legal relief in any jurisdiction. * - * 12. NO ASSIGNMENT. This license and your rights and obligations thereunder may not be assigned by you by any means or operation of law without NVIDIA’s permission. Any attempted assignment not approved by NVIDIA in writing shall be void and of no effect. + * 12. NO ASSIGNMENT. This license and your rights and obligations thereunder may not be assigned by you by any means or + * operation of law without NVIDIA’s permission. Any attempted assignment not approved by NVIDIA in writing shall be + * void and of no effect. * - * 13. EXPORT. The SOFTWARE is subject to United States export laws and regulations. You agree that you will not ship, transfer or export the SOFTWARE into any country, or use the SOFTWARE in any manner, prohibited by the United States Bureau of Industry and Security or economic sanctions regulations administered by the U.S. Department of Treasury’s Office of Foreign Assets Control (OFAC), or any applicable export laws, restrictions or regulations. These laws include restrictions on destinations, end users and end use. By accepting this license, you confirm that you are not a resident or citizen of any country currently embargoed by the U.S. and that you are not otherwise prohibited from receiving the SOFTWARE. + * 13. EXPORT. The SOFTWARE is subject to United States export laws and regulations. You agree that you will not ship, + * transfer or export the SOFTWARE into any country, or use the SOFTWARE in any manner, prohibited by the United States + * Bureau of Industry and Security or economic sanctions regulations administered by the U.S. Department of Treasury’s + * Office of Foreign Assets Control (OFAC), or any applicable export laws, restrictions or regulations. These laws + * include restrictions on destinations, end users and end use. 
By accepting this license, you confirm that you are not + * a resident or citizen of any country currently embargoed by the U.S. and that you are not otherwise prohibited from + * receiving the SOFTWARE. * - * 14. GOVERNMENT USE. The SOFTWARE has been developed entirely at private expense and is “commercial items” consisting of “commercial computer software” and “commercial computer software documentation” provided with RESTRICTED RIGHTS. Use, duplication or disclosure by the U.S. Government or a U.S. Government subcontractor is subject to the restrictions in this license pursuant to DFARS 227.7202-3(a) or as set forth in subparagraphs (b)(1) and (2) of the Commercial Computer Software - Restricted Rights clause at FAR 52.227-19, as applicable. Contractor/manufacturer is NVIDIA, 2788 San Tomas Expressway, Santa Clara, CA 95051. + * 14. GOVERNMENT USE. The SOFTWARE has been developed entirely at private expense and is “commercial items” consisting + * of “commercial computer software” and “commercial computer software documentation” provided with RESTRICTED RIGHTS. + * Use, duplication or disclosure by the U.S. Government or a U.S. Government subcontractor is subject to the + * restrictions in this license pursuant to DFARS 227.7202-3(a) or as set forth in subparagraphs (b)(1) and (2) of the + * Commercial Computer Software - Restricted Rights clause at FAR 52.227-19, as applicable. Contractor/manufacturer is + * NVIDIA, 2788 San Tomas Expressway, Santa Clara, CA 95051. * - * 15. ENTIRE AGREEMENT. This license is the final, complete and exclusive agreement between the parties relating to the subject matter of this license and supersedes all prior or contemporaneous understandings and agreements relating to this subject matter, whether oral or written. If any court of competent jurisdiction determines that any provision of this license is illegal, invalid or unenforceable, the remaining provisions will remain in full force and effect. This license may only be modified in a writing signed by an authorized representative of each party. + * 15. ENTIRE AGREEMENT. This license is the final, complete and exclusive agreement between the parties relating to the + * subject matter of this license and supersedes all prior or contemporaneous understandings and agreements relating to + * this subject matter, whether oral or written. If any court of competent jurisdiction determines that any provision of + * this license is illegal, invalid or unenforceable, the remaining provisions will remain in full force and effect. + * This license may only be modified in a writing signed by an authorized representative of each party. * * (v. 
August 20, 2021) */ @@ -71,56 +149,96 @@ _LIBCUDACXX_BEGIN_NAMESPACE_CUDA -class access_property { - private: - std::uint64_t __descriptor = 0; - - public: - struct shared {}; - struct global {}; - struct persisting { - _CCCL_HOST_DEVICE constexpr operator cudaAccessProperty() const noexcept { - return cudaAccessProperty::cudaAccessPropertyPersisting; - } - }; - struct streaming { - _CCCL_HOST_DEVICE constexpr operator cudaAccessProperty() const noexcept { - return cudaAccessProperty::cudaAccessPropertyStreaming; - } - }; - struct normal { - _CCCL_HOST_DEVICE constexpr operator cudaAccessProperty() const noexcept { - return cudaAccessProperty::cudaAccessPropertyNormal; - } - }; - - _CCCL_HOST_DEVICE constexpr access_property(global) noexcept : __descriptor(__detail_ap::__sm_80::__interleave_normal()) {} - _CCCL_HOST_DEVICE constexpr access_property() noexcept : __descriptor(__detail_ap::__sm_80::__interleave_normal()) {} - constexpr access_property(access_property const&) noexcept = default; - access_property& operator=(const access_property& other) noexcept = default; - - _CCCL_HOST_DEVICE constexpr access_property(normal, float __fraction) : __descriptor(__detail_ap::__interleave(normal{}, __fraction)) {} - _CCCL_HOST_DEVICE constexpr access_property(streaming, float __fraction) : __descriptor(__detail_ap::__interleave(streaming{}, __fraction)) {} - _CCCL_HOST_DEVICE constexpr access_property(persisting, float __fraction) : __descriptor(__detail_ap::__interleave(persisting{}, __fraction)) {} - _CCCL_HOST_DEVICE constexpr access_property(normal, float __fraction, streaming) : __descriptor(__detail_ap::__interleave(normal{}, __fraction, streaming{})) {} - _CCCL_HOST_DEVICE constexpr access_property(persisting, float __fraction, streaming) : __descriptor(__detail_ap::__interleave(persisting{}, __fraction, streaming{})) {} - - _CCCL_HOST_DEVICE constexpr access_property(normal) noexcept : access_property(normal{}, 1.0) {} - _CCCL_HOST_DEVICE constexpr access_property(streaming) noexcept : access_property(streaming{}, 1.0) {} - _CCCL_HOST_DEVICE constexpr access_property(persisting) noexcept : access_property(persisting{}, 1.0) {} - - _CCCL_HOST_DEVICE constexpr access_property(void* __ptr, std::size_t __hit_bytes, std::size_t __total_bytes, normal) - : __descriptor(__detail_ap::__block(__ptr, __hit_bytes, __total_bytes, normal{})) {} - _CCCL_HOST_DEVICE constexpr access_property(void* __ptr, std::size_t __hit_bytes, std::size_t __total_bytes, streaming) - : __descriptor(__detail_ap::__block(__ptr, __hit_bytes, __total_bytes, streaming{})) {} - _CCCL_HOST_DEVICE constexpr access_property(void* __ptr, std::size_t __hit_bytes, std::size_t __total_bytes, persisting) - : __descriptor(__detail_ap::__block(__ptr, __hit_bytes, __total_bytes, persisting{})) {} - _CCCL_HOST_DEVICE constexpr access_property(void* __ptr, std::size_t __hit_bytes, std::size_t __total_bytes, normal, streaming) - : __descriptor(__detail_ap::__block(__ptr, __hit_bytes, __total_bytes, normal{}, streaming{})) {} - _CCCL_HOST_DEVICE constexpr access_property(void* __ptr, std::size_t __hit_bytes, std::size_t __total_bytes, persisting, streaming) - : __descriptor(__detail_ap::__block(__ptr, __hit_bytes, __total_bytes, persisting{}, streaming{})) {} - - _CCCL_HOST_DEVICE constexpr explicit operator std::uint64_t() const noexcept { return __descriptor; } +class access_property +{ +private: + std::uint64_t __descriptor = 0; + +public: + struct shared + {}; + struct global + {}; + struct persisting + { + _CCCL_HOST_DEVICE constexpr 
operator cudaAccessProperty() const noexcept + { + return cudaAccessProperty::cudaAccessPropertyPersisting; + } + }; + struct streaming + { + _CCCL_HOST_DEVICE constexpr operator cudaAccessProperty() const noexcept + { + return cudaAccessProperty::cudaAccessPropertyStreaming; + } + }; + struct normal + { + _CCCL_HOST_DEVICE constexpr operator cudaAccessProperty() const noexcept + { + return cudaAccessProperty::cudaAccessPropertyNormal; + } + }; + + _CCCL_HOST_DEVICE constexpr access_property(global) noexcept + : __descriptor(__detail_ap::__sm_80::__interleave_normal()) + {} + _CCCL_HOST_DEVICE constexpr access_property() noexcept + : __descriptor(__detail_ap::__sm_80::__interleave_normal()) + {} + constexpr access_property(access_property const&) noexcept = default; + access_property& operator=(const access_property& other) noexcept = default; + + _CCCL_HOST_DEVICE constexpr access_property(normal, float __fraction) + : __descriptor(__detail_ap::__interleave(normal{}, __fraction)) + {} + _CCCL_HOST_DEVICE constexpr access_property(streaming, float __fraction) + : __descriptor(__detail_ap::__interleave(streaming{}, __fraction)) + {} + _CCCL_HOST_DEVICE constexpr access_property(persisting, float __fraction) + : __descriptor(__detail_ap::__interleave(persisting{}, __fraction)) + {} + _CCCL_HOST_DEVICE constexpr access_property(normal, float __fraction, streaming) + : __descriptor(__detail_ap::__interleave(normal{}, __fraction, streaming{})) + {} + _CCCL_HOST_DEVICE constexpr access_property(persisting, float __fraction, streaming) + : __descriptor(__detail_ap::__interleave(persisting{}, __fraction, streaming{})) + {} + + _CCCL_HOST_DEVICE constexpr access_property(normal) noexcept + : access_property(normal{}, 1.0) + {} + _CCCL_HOST_DEVICE constexpr access_property(streaming) noexcept + : access_property(streaming{}, 1.0) + {} + _CCCL_HOST_DEVICE constexpr access_property(persisting) noexcept + : access_property(persisting{}, 1.0) + {} + + _CCCL_HOST_DEVICE constexpr access_property(void* __ptr, std::size_t __hit_bytes, std::size_t __total_bytes, normal) + : __descriptor(__detail_ap::__block(__ptr, __hit_bytes, __total_bytes, normal{})) + {} + _CCCL_HOST_DEVICE constexpr access_property(void* __ptr, std::size_t __hit_bytes, std::size_t __total_bytes, streaming) + : __descriptor(__detail_ap::__block(__ptr, __hit_bytes, __total_bytes, streaming{})) + {} + _CCCL_HOST_DEVICE constexpr access_property( + void* __ptr, std::size_t __hit_bytes, std::size_t __total_bytes, persisting) + : __descriptor(__detail_ap::__block(__ptr, __hit_bytes, __total_bytes, persisting{})) + {} + _CCCL_HOST_DEVICE constexpr access_property( + void* __ptr, std::size_t __hit_bytes, std::size_t __total_bytes, normal, streaming) + : __descriptor(__detail_ap::__block(__ptr, __hit_bytes, __total_bytes, normal{}, streaming{})) + {} + _CCCL_HOST_DEVICE constexpr access_property( + void* __ptr, std::size_t __hit_bytes, std::size_t __total_bytes, persisting, streaming) + : __descriptor(__detail_ap::__block(__ptr, __hit_bytes, __total_bytes, persisting{}, streaming{})) + {} + + _CCCL_HOST_DEVICE constexpr explicit operator std::uint64_t() const noexcept + { + return __descriptor; + } }; _LIBCUDACXX_END_NAMESPACE_CUDA @@ -130,195 +248,201 @@ _LIBCUDACXX_END_NAMESPACE_CUDA _LIBCUDACXX_BEGIN_NAMESPACE_CUDA template -_CCCL_HOST_DEVICE -_Tp* associate_access_property(_Tp* __ptr, _Property __prop) { +_CCCL_HOST_DEVICE _Tp* associate_access_property(_Tp* __ptr, _Property __prop) +{ static_assert( - std::is_same<_Property, 
access_property>::value || - std::is_same<_Property, access_property::persisting>::value || - std::is_same<_Property, access_property::streaming>::value || - std::is_same<_Property, access_property::normal>::value || - std::is_same<_Property, access_property::global>::value || - std::is_same<_Property, access_property::shared>::value - , "property is not convertible to cuda::access_property"); + std::is_same<_Property, access_property>::value || std::is_same<_Property, access_property::persisting>::value + || std::is_same<_Property, access_property::streaming>::value + || std::is_same<_Property, access_property::normal>::value + || std::is_same<_Property, access_property::global>::value + || std::is_same<_Property, access_property::shared>::value, + "property is not convertible to cuda::access_property"); return __detail_ap::__associate(__ptr, __prop); } template -_CCCL_HOST_DEVICE -void apply_access_property(const volatile void* __ptr, const _Shape __shape, access_property::persisting __prop) noexcept { - NV_IF_TARGET(NV_PROVIDES_SM_80,( - if (!__isGlobal((void*)__ptr)) return; - - char* __p = reinterpret_cast(const_cast(__ptr)); - static constexpr std::size_t _LINE_SIZE = 128; - std::size_t __nbytes = static_cast(__shape); - std::size_t __end = ((std::uintptr_t)(__p + __nbytes) % _LINE_SIZE) ? __nbytes + _LINE_SIZE : __nbytes; - __end /= _LINE_SIZE; - - //Apply to all 128 bytes aligned cache lines inclusive of __p - for (std::size_t __i = 0; __i < __end; __i += _LINE_SIZE) { - asm volatile ("prefetch.global.L2::evict_last [%0];" ::"l"(__p + (__i * _LINE_SIZE)) :); - } - )) +_CCCL_HOST_DEVICE void +apply_access_property(const volatile void* __ptr, const _Shape __shape, access_property::persisting __prop) noexcept +{ + NV_IF_TARGET( + NV_PROVIDES_SM_80, + (if (!__isGlobal((void*) __ptr)) return; + + char* __p = reinterpret_cast(const_cast(__ptr)); + static constexpr std::size_t _LINE_SIZE = 128; + std::size_t __nbytes = static_cast(__shape); + std::size_t __end = ((std::uintptr_t)(__p + __nbytes) % _LINE_SIZE) ? __nbytes + _LINE_SIZE : __nbytes; + __end /= _LINE_SIZE; + + // Apply to all 128 bytes aligned cache lines inclusive of __p + for (std::size_t __i = 0; __i < __end; __i += _LINE_SIZE) { + asm volatile("prefetch.global.L2::evict_last [%0];" ::"l"(__p + (__i * _LINE_SIZE)) :); + })) } template -_CCCL_HOST_DEVICE -void apply_access_property(const volatile void* __ptr, const _Shape __shape, access_property::normal __prop) noexcept { - NV_IF_TARGET(NV_PROVIDES_SM_80,( - if (!__isGlobal((void*)__ptr)) return; - - char* __p = reinterpret_cast(const_cast(__ptr)); - static constexpr std::size_t _LINE_SIZE = 128; - std::size_t __nbytes = static_cast(__shape); - std::size_t __end = ((std::uintptr_t)(__p + __nbytes) % _LINE_SIZE) ? __nbytes + _LINE_SIZE : __nbytes; - __end /= _LINE_SIZE; - - //Apply to all 128 bytes aligned cache lines inclusive of __p - for (std::size_t __i = 0; __i < __end; __i += _LINE_SIZE) { - asm volatile ("prefetch.global.L2::evict_normal [%0];" ::"l"(__p + (__i * _LINE_SIZE)) :); - } - )) +_CCCL_HOST_DEVICE void +apply_access_property(const volatile void* __ptr, const _Shape __shape, access_property::normal __prop) noexcept +{ + NV_IF_TARGET( + NV_PROVIDES_SM_80, + (if (!__isGlobal((void*) __ptr)) return; + + char* __p = reinterpret_cast(const_cast(__ptr)); + static constexpr std::size_t _LINE_SIZE = 128; + std::size_t __nbytes = static_cast(__shape); + std::size_t __end = ((std::uintptr_t)(__p + __nbytes) % _LINE_SIZE) ? 
__nbytes + _LINE_SIZE : __nbytes; + __end /= _LINE_SIZE; + + // Apply to all 128 bytes aligned cache lines inclusive of __p + for (std::size_t __i = 0; __i < __end; __i += _LINE_SIZE) { + asm volatile("prefetch.global.L2::evict_normal [%0];" ::"l"(__p + (__i * _LINE_SIZE)) :); + })) } -template -class annotated_ptr: public __detail_ap::__annotated_ptr_base<_Property> { - public: - using value_type = _Tp; - using size_type = std::size_t; - using reference = value_type&; - using pointer = value_type*; - using const_pointer = value_type const*; - using difference_type = std::ptrdiff_t; - - private: - using __self = annotated_ptr<_Tp, _Property>; - - // Converting from a 64-bit to 32-bit shared pointer and maybe back just for storage might or might not be profitable. - pointer __repr = (pointer)((size_type)nullptr); - - _CCCL_HOST_DEVICE pointer __get(bool __skip_prop = false, difference_type __n = 0) const { - NV_IF_TARGET(NV_IS_DEVICE,( - if (!__skip_prop) { - return static_cast(this->__apply_prop(const_cast(static_cast(__repr + __n)))); - } - )) - return __repr + __n; - } - _CCCL_HOST_DEVICE pointer __offset(difference_type __n, bool __skip_prop = false) const { - return __get(__skip_prop, __n); - } - - public: - _CCCL_HOST_DEVICE pointer operator->() const { - return __get(); - } - - _CCCL_HOST_DEVICE reference operator*() const { - return *__get(); - } - - _CCCL_HOST_DEVICE reference operator[](difference_type __n) const { - return *__offset(__n); - } - - _CCCL_HOST_DEVICE constexpr difference_type operator-(annotated_ptr o) const { - return __repr - o.__repr; - } - - constexpr annotated_ptr() noexcept = default; - constexpr annotated_ptr(annotated_ptr const&) noexcept = default; - // No constexpr for c11 as the method can't be const - _CCCL_CONSTEXPR_CXX14 annotated_ptr& operator=(annotated_ptr const& other) noexcept = default; - - _CCCL_HOST_DEVICE explicit annotated_ptr(pointer __p) +template +class annotated_ptr : public __detail_ap::__annotated_ptr_base<_Property> +{ +public: + using value_type = _Tp; + using size_type = std::size_t; + using reference = value_type&; + using pointer = value_type*; + using const_pointer = value_type const*; + using difference_type = std::ptrdiff_t; + +private: + using __self = annotated_ptr<_Tp, _Property>; + + // Converting from a 64-bit to 32-bit shared pointer and maybe back just for storage might or might not be profitable. 
+ pointer __repr = (pointer) ((size_type) nullptr); + + _CCCL_HOST_DEVICE pointer __get(bool __skip_prop = false, difference_type __n = 0) const + { + NV_IF_TARGET(NV_IS_DEVICE, (if (!__skip_prop) { + return static_cast( + this->__apply_prop(const_cast(static_cast(__repr + __n)))); + })) + return __repr + __n; + } + _CCCL_HOST_DEVICE pointer __offset(difference_type __n, bool __skip_prop = false) const + { + return __get(__skip_prop, __n); + } + +public: + _CCCL_HOST_DEVICE pointer operator->() const + { + return __get(); + } + + _CCCL_HOST_DEVICE reference operator*() const + { + return *__get(); + } + + _CCCL_HOST_DEVICE reference operator[](difference_type __n) const + { + return *__offset(__n); + } + + _CCCL_HOST_DEVICE constexpr difference_type operator-(annotated_ptr o) const + { + return __repr - o.__repr; + } + + constexpr annotated_ptr() noexcept = default; + constexpr annotated_ptr(annotated_ptr const&) noexcept = default; + // No constexpr for c11 as the method can't be const + _CCCL_CONSTEXPR_CXX14 annotated_ptr& operator=(annotated_ptr const& other) noexcept = default; + + _CCCL_HOST_DEVICE explicit annotated_ptr(pointer __p) : __repr(__p) - { - NV_IF_TARGET(NV_IS_DEVICE,( - _LIBCUDACXX_DEBUG_ASSERT((std::is_same<_Property, shared>::value && __isShared(__p) || __isGlobal(__p)), ""); - )) - } - - template - _CCCL_HOST_DEVICE annotated_ptr(pointer __p, _RuntimeProperty __prop) - : __detail_ap::__annotated_ptr_base<_Property>(static_cast(access_property(__prop))), __repr(__p) - { - static_assert(std::is_same<_Property, access_property>::value, - "This method requires annotated_ptr"); - static_assert(std::is_same<_RuntimeProperty, access_property::global>::value || - std::is_same<_RuntimeProperty, access_property::normal>::value || - std::is_same<_RuntimeProperty, access_property::streaming>::value || - std::is_same<_RuntimeProperty, access_property::persisting>::value || - std::is_same<_RuntimeProperty, access_property>::value, - "This method requires RuntimeProperty=global|normal|streaming|persisting|access_property"); - NV_IF_TARGET(NV_IS_DEVICE,( - _LIBCUDACXX_DEBUG_ASSERT((__isGlobal(__p) == true), ""); - )) - } - - template - _CCCL_HOST_DEVICE annotated_ptr(const annotated_ptr<_TTp,_Prop>& __other); - - _CCCL_HOST_DEVICE constexpr explicit operator bool() const noexcept { - return __repr != nullptr; - } - - _CCCL_HOST_DEVICE pointer get() const noexcept { - constexpr bool __is_shared = std::is_same<_Property, access_property::shared>::value; - return __is_shared ? 
__repr : &(*annotated_ptr(__repr)); - } - - _CCCL_HOST_DEVICE _Property __property() const noexcept { - return this->__get_property(); - } + { + NV_IF_TARGET( + NV_IS_DEVICE, + (_LIBCUDACXX_DEBUG_ASSERT((std::is_same<_Property, shared>::value && __isShared(__p) || __isGlobal(__p)), "");)) + } + + template + _CCCL_HOST_DEVICE annotated_ptr(pointer __p, _RuntimeProperty __prop) + : __detail_ap::__annotated_ptr_base<_Property>(static_cast(access_property(__prop))) + , __repr(__p) + { + static_assert(std::is_same<_Property, access_property>::value, + "This method requires annotated_ptr"); + static_assert( + std::is_same<_RuntimeProperty, access_property::global>::value + || std::is_same<_RuntimeProperty, access_property::normal>::value + || std::is_same<_RuntimeProperty, access_property::streaming>::value + || std::is_same<_RuntimeProperty, access_property::persisting>::value + || std::is_same<_RuntimeProperty, access_property>::value, + "This method requires RuntimeProperty=global|normal|streaming|persisting|access_property"); + NV_IF_TARGET(NV_IS_DEVICE, (_LIBCUDACXX_DEBUG_ASSERT((__isGlobal(__p) == true), "");)) + } + + template + _CCCL_HOST_DEVICE annotated_ptr(const annotated_ptr<_TTp, _Prop>& __other); + + _CCCL_HOST_DEVICE constexpr explicit operator bool() const noexcept + { + return __repr != nullptr; + } + + _CCCL_HOST_DEVICE pointer get() const noexcept + { + constexpr bool __is_shared = std::is_same<_Property, access_property::shared>::value; + return __is_shared ? __repr : &(*annotated_ptr(__repr)); + } + + _CCCL_HOST_DEVICE _Property __property() const noexcept + { + return this->__get_property(); + } }; - -template -template -_CCCL_HOST_DEVICE annotated_ptr<_Tp, _Property>::annotated_ptr(const annotated_ptr<_TTp,_Prop>& __other) - : __detail_ap::__annotated_ptr_base<_Property>(__other.__property()), __repr(__other.get()) +template +template +_CCCL_HOST_DEVICE annotated_ptr<_Tp, _Property>::annotated_ptr(const annotated_ptr<_TTp, _Prop>& __other) + : __detail_ap::__annotated_ptr_base<_Property>(__other.__property()) + , __repr(__other.get()) { static_assert(std::is_assignable::value, "pointer must be assignable from other pointer"); - static_assert((std::is_same<_Property, access_property>::value && !std::is_same<_Prop, access_property::shared>::value) || - std::is_same<_Property, _Prop>::value, "Property must be either access_property or other property, and both properties must have same address space"); + static_assert( + (std::is_same<_Property, access_property>::value && !std::is_same<_Prop, access_property::shared>::value) + || std::is_same<_Property, _Prop>::value, + "Property must be either access_property or other property, and both properties must have same address space"); // note: precondition "__other.__rep must be compatible with _Property" currently always holds } -template -_CCCL_HOST_DEVICE -void memcpy_async(_Dst* __dst, - annotated_ptr<_Src,_SrcProperty> __src, - _Shape __shape, _Sync & __sync) { +template +_CCCL_HOST_DEVICE void memcpy_async(_Dst* __dst, annotated_ptr<_Src, _SrcProperty> __src, _Shape __shape, _Sync& __sync) +{ memcpy_async(__dst, &(*__src), __shape, __sync); } -template -_CCCL_HOST_DEVICE -void memcpy_async(annotated_ptr<_Dst,_DstProperty> __dst, - annotated_ptr<_Src,_SrcProperty> __src, - _Shape __shape, _Sync & __sync){ +template +_CCCL_HOST_DEVICE void memcpy_async( + annotated_ptr<_Dst, _DstProperty> __dst, annotated_ptr<_Src, _SrcProperty> __src, _Shape __shape, _Sync& __sync) +{ memcpy_async(&(*__dst), &(*__src), __shape, __sync); } 
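(Illustrative usage sketch, not part of this diff: the annotated_ptr overloads of memcpy_async above let a hinted global pointer be passed straight to memcpy_async. The kernel, buffer, and tile names below are hypothetical; it assumes sm_80 or newer, that gsrc/gdst point to global memory, and a launch with at most 256 threads per block.)

#include <cuda/annotated_ptr>
#include <cuda/barrier>

__global__ void stage_tile(const float* gsrc, float* gdst)
{
  __shared__ alignas(16) float tile[256];
#pragma nv_diag_suppress static_var_with_dynamic_init
  __shared__ cuda::barrier<cuda::thread_scope_block> bar;
  if (threadIdx.x == 0)
  {
    init(&bar, blockDim.x); // ADL finds cuda::barrier's init()
  }
  __syncthreads();

  // Reads through `src` carry the streaming (evict-first) L2 hint.
  cuda::annotated_ptr<const float, cuda::access_property::streaming> src(gsrc + blockIdx.x * blockDim.x);

  // Uses the memcpy_async(_Dst*, annotated_ptr<_Src, _SrcProperty>, _Shape, _Sync&) overload above.
  cuda::memcpy_async(tile, src, sizeof(float) * blockDim.x, bar);
  bar.arrive_and_wait();

  gdst[blockIdx.x * blockDim.x + threadIdx.x] = 2.0f * tile[threadIdx.x];
}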
-template -_CCCL_HOST_DEVICE -void memcpy_async(const _Group & __group, - _Dst * __dst, - annotated_ptr<_Src,_SrcProperty> __src, - _Shape __shape, _Sync & __sync) { +template +_CCCL_HOST_DEVICE void +memcpy_async(const _Group& __group, _Dst* __dst, annotated_ptr<_Src, _SrcProperty> __src, _Shape __shape, _Sync& __sync) +{ memcpy_async(__group, __dst, &(*__src), __shape, __sync); } -template -_CCCL_HOST_DEVICE -void memcpy_async(const _Group & __group, - annotated_ptr<_Dst,_DstProperty> __dst, - annotated_ptr<_Src,_SrcProperty> __src, - _Shape __shape, _Sync & __sync) { +template +_CCCL_HOST_DEVICE void memcpy_async( + const _Group& __group, + annotated_ptr<_Dst, _DstProperty> __dst, + annotated_ptr<_Src, _SrcProperty> __src, + _Shape __shape, + _Sync& __sync) +{ memcpy_async(__group, &(*__dst), &(*__src), __shape, __sync); } diff --git a/libcudacxx/include/cuda/barrier b/libcudacxx/include/cuda/barrier index e19684cfece..99117dde90b 100644 --- a/libcudacxx/include/cuda/barrier +++ b/libcudacxx/include/cuda/barrier @@ -21,8 +21,8 @@ # pragma system_header #endif // no system header -#include #include +#include // Forward-declare CUtensorMap for use in cp_async_bulk_tensor_* PTX wrapping // functions. These functions take a pointer to CUtensorMap, so do not need to @@ -54,175 +54,185 @@ _LIBCUDACXX_BEGIN_NAMESPACE_CUDA_DEVICE_EXPERIMENTAL #ifdef __cccl_lib_experimental_ctk12_cp_async_exposure // https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#data-movement-and-conversion-instructions-cp-async-bulk -inline _CCCL_DEVICE -void cp_async_bulk_global_to_shared(void *__dest, const void *__src, _CUDA_VSTD::uint32_t __size, ::cuda::barrier<::cuda::thread_scope_block> &__bar) +inline _CCCL_DEVICE void cp_async_bulk_global_to_shared( + void* __dest, const void* __src, _CUDA_VSTD::uint32_t __size, ::cuda::barrier<::cuda::thread_scope_block>& __bar) { - _LIBCUDACXX_DEBUG_ASSERT(__size % 16 == 0, "Size must be multiple of 16."); - _LIBCUDACXX_DEBUG_ASSERT(__isShared(__dest), "Destination must be shared memory address."); - _LIBCUDACXX_DEBUG_ASSERT(__isGlobal(__src), "Source must be global memory address."); - - _CUDA_VPTX::cp_async_bulk( - _CUDA_VPTX::space_cluster, _CUDA_VPTX::space_global, - __dest, __src, __size, - ::cuda::device::barrier_native_handle(__bar)); + _LIBCUDACXX_DEBUG_ASSERT(__size % 16 == 0, "Size must be multiple of 16."); + _LIBCUDACXX_DEBUG_ASSERT(__isShared(__dest), "Destination must be shared memory address."); + _LIBCUDACXX_DEBUG_ASSERT(__isGlobal(__src), "Source must be global memory address."); + + _CUDA_VPTX::cp_async_bulk( + _CUDA_VPTX::space_cluster, + _CUDA_VPTX::space_global, + __dest, + __src, + __size, + ::cuda::device::barrier_native_handle(__bar)); } - // https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#data-movement-and-conversion-instructions-cp-async-bulk -inline _CCCL_DEVICE -void cp_async_bulk_shared_to_global(void *__dest, const void * __src, _CUDA_VSTD::uint32_t __size) +inline _CCCL_DEVICE void cp_async_bulk_shared_to_global(void* __dest, const void* __src, _CUDA_VSTD::uint32_t __size) { - _LIBCUDACXX_DEBUG_ASSERT(__size % 16 == 0, "Size must be multiple of 16."); - _LIBCUDACXX_DEBUG_ASSERT(__isGlobal(__dest), "Destination must be global memory address."); - _LIBCUDACXX_DEBUG_ASSERT(__isShared(__src), "Source must be shared memory address."); + _LIBCUDACXX_DEBUG_ASSERT(__size % 16 == 0, "Size must be multiple of 16."); + _LIBCUDACXX_DEBUG_ASSERT(__isGlobal(__dest), "Destination must be global memory address."); + 
_LIBCUDACXX_DEBUG_ASSERT(__isShared(__src), "Source must be shared memory address."); - _CUDA_VPTX::cp_async_bulk( - _CUDA_VPTX::space_global, _CUDA_VPTX::space_shared, - __dest, __src, __size); + _CUDA_VPTX::cp_async_bulk(_CUDA_VPTX::space_global, _CUDA_VPTX::space_shared, __dest, __src, __size); } // https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#data-movement-and-conversion-instructions-cp-async-bulk-tensor -inline _CCCL_DEVICE -void cp_async_bulk_tensor_1d_global_to_shared( - void *__dest, const CUtensorMap *__tensor_map , int __c0, ::cuda::barrier<::cuda::thread_scope_block> &__bar) +inline _CCCL_DEVICE void cp_async_bulk_tensor_1d_global_to_shared( + void* __dest, const CUtensorMap* __tensor_map, int __c0, ::cuda::barrier<::cuda::thread_scope_block>& __bar) { - const _CUDA_VSTD::int32_t __coords[]{__c0}; - - _CUDA_VPTX::cp_async_bulk_tensor( - _CUDA_VPTX::space_cluster, _CUDA_VPTX::space_global, - __dest, __tensor_map, __coords, - ::cuda::device::barrier_native_handle(__bar)); + const _CUDA_VSTD::int32_t __coords[]{__c0}; + + _CUDA_VPTX::cp_async_bulk_tensor( + _CUDA_VPTX::space_cluster, + _CUDA_VPTX::space_global, + __dest, + __tensor_map, + __coords, + ::cuda::device::barrier_native_handle(__bar)); } // https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#data-movement-and-conversion-instructions-cp-async-bulk-tensor -inline _CCCL_DEVICE -void cp_async_bulk_tensor_2d_global_to_shared( - void *__dest, const CUtensorMap *__tensor_map , int __c0, int __c1, ::cuda::barrier<::cuda::thread_scope_block> &__bar) +inline _CCCL_DEVICE void cp_async_bulk_tensor_2d_global_to_shared( + void* __dest, const CUtensorMap* __tensor_map, int __c0, int __c1, ::cuda::barrier<::cuda::thread_scope_block>& __bar) { - const _CUDA_VSTD::int32_t __coords[]{__c0, __c1}; - - _CUDA_VPTX::cp_async_bulk_tensor( - _CUDA_VPTX::space_cluster, _CUDA_VPTX::space_global, - __dest, __tensor_map, __coords, - ::cuda::device::barrier_native_handle(__bar)); + const _CUDA_VSTD::int32_t __coords[]{__c0, __c1}; + + _CUDA_VPTX::cp_async_bulk_tensor( + _CUDA_VPTX::space_cluster, + _CUDA_VPTX::space_global, + __dest, + __tensor_map, + __coords, + ::cuda::device::barrier_native_handle(__bar)); } // https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#data-movement-and-conversion-instructions-cp-async-bulk-tensor -inline _CCCL_DEVICE -void cp_async_bulk_tensor_3d_global_to_shared( - void *__dest, const CUtensorMap *__tensor_map, int __c0, int __c1, int __c2, ::cuda::barrier<::cuda::thread_scope_block> &__bar) +inline _CCCL_DEVICE void cp_async_bulk_tensor_3d_global_to_shared( + void* __dest, + const CUtensorMap* __tensor_map, + int __c0, + int __c1, + int __c2, + ::cuda::barrier<::cuda::thread_scope_block>& __bar) { - const _CUDA_VSTD::int32_t __coords[]{__c0, __c1, __c2}; - - _CUDA_VPTX::cp_async_bulk_tensor( - _CUDA_VPTX::space_cluster, _CUDA_VPTX::space_global, - __dest, __tensor_map, __coords, - ::cuda::device::barrier_native_handle(__bar)); + const _CUDA_VSTD::int32_t __coords[]{__c0, __c1, __c2}; + + _CUDA_VPTX::cp_async_bulk_tensor( + _CUDA_VPTX::space_cluster, + _CUDA_VPTX::space_global, + __dest, + __tensor_map, + __coords, + ::cuda::device::barrier_native_handle(__bar)); } // https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#data-movement-and-conversion-instructions-cp-async-bulk-tensor -inline _CCCL_DEVICE -void cp_async_bulk_tensor_4d_global_to_shared( - void *__dest, const CUtensorMap *__tensor_map , int __c0, int __c1, int __c2, int __c3, 
::cuda::barrier<::cuda::thread_scope_block> &__bar) +inline _CCCL_DEVICE void cp_async_bulk_tensor_4d_global_to_shared( + void* __dest, + const CUtensorMap* __tensor_map, + int __c0, + int __c1, + int __c2, + int __c3, + ::cuda::barrier<::cuda::thread_scope_block>& __bar) { - const _CUDA_VSTD::int32_t __coords[]{__c0, __c1, __c2, __c3}; - - _CUDA_VPTX::cp_async_bulk_tensor( - _CUDA_VPTX::space_cluster, _CUDA_VPTX::space_global, - __dest, __tensor_map, __coords, - ::cuda::device::barrier_native_handle(__bar)); + const _CUDA_VSTD::int32_t __coords[]{__c0, __c1, __c2, __c3}; + + _CUDA_VPTX::cp_async_bulk_tensor( + _CUDA_VPTX::space_cluster, + _CUDA_VPTX::space_global, + __dest, + __tensor_map, + __coords, + ::cuda::device::barrier_native_handle(__bar)); } // https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#data-movement-and-conversion-instructions-cp-async-bulk-tensor -inline _CCCL_DEVICE -void cp_async_bulk_tensor_5d_global_to_shared( - void *__dest, const CUtensorMap *__tensor_map , int __c0, int __c1, int __c2, int __c3, int __c4, ::cuda::barrier<::cuda::thread_scope_block> &__bar) +inline _CCCL_DEVICE void cp_async_bulk_tensor_5d_global_to_shared( + void* __dest, + const CUtensorMap* __tensor_map, + int __c0, + int __c1, + int __c2, + int __c3, + int __c4, + ::cuda::barrier<::cuda::thread_scope_block>& __bar) { - const _CUDA_VSTD::int32_t __coords[]{__c0, __c1, __c2, __c3, __c4}; - - _CUDA_VPTX::cp_async_bulk_tensor( - _CUDA_VPTX::space_cluster, _CUDA_VPTX::space_global, - __dest, __tensor_map, __coords, - ::cuda::device::barrier_native_handle(__bar)); + const _CUDA_VSTD::int32_t __coords[]{__c0, __c1, __c2, __c3, __c4}; + + _CUDA_VPTX::cp_async_bulk_tensor( + _CUDA_VPTX::space_cluster, + _CUDA_VPTX::space_global, + __dest, + __tensor_map, + __coords, + ::cuda::device::barrier_native_handle(__bar)); } // https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#data-movement-and-conversion-instructions-cp-async-bulk-tensor -inline _CCCL_DEVICE -void cp_async_bulk_tensor_1d_shared_to_global( - const CUtensorMap *__tensor_map, int __c0, const void *__src) +inline _CCCL_DEVICE void +cp_async_bulk_tensor_1d_shared_to_global(const CUtensorMap* __tensor_map, int __c0, const void* __src) { - const _CUDA_VSTD::int32_t __coords[]{__c0}; + const _CUDA_VSTD::int32_t __coords[]{__c0}; - _CUDA_VPTX::cp_async_bulk_tensor( - _CUDA_VPTX::space_global, _CUDA_VPTX::space_shared, - __tensor_map, __coords, __src); + _CUDA_VPTX::cp_async_bulk_tensor(_CUDA_VPTX::space_global, _CUDA_VPTX::space_shared, __tensor_map, __coords, __src); } // https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#data-movement-and-conversion-instructions-cp-async-bulk-tensor -inline _CCCL_DEVICE -void cp_async_bulk_tensor_2d_shared_to_global( - const CUtensorMap *__tensor_map, int __c0, int __c1, const void *__src) +inline _CCCL_DEVICE void +cp_async_bulk_tensor_2d_shared_to_global(const CUtensorMap* __tensor_map, int __c0, int __c1, const void* __src) { - const _CUDA_VSTD::int32_t __coords[]{__c0, __c1}; + const _CUDA_VSTD::int32_t __coords[]{__c0, __c1}; - _CUDA_VPTX::cp_async_bulk_tensor( - _CUDA_VPTX::space_global, _CUDA_VPTX::space_shared, - __tensor_map, __coords, __src); + _CUDA_VPTX::cp_async_bulk_tensor(_CUDA_VPTX::space_global, _CUDA_VPTX::space_shared, __tensor_map, __coords, __src); } // https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#data-movement-and-conversion-instructions-cp-async-bulk-tensor -inline _CCCL_DEVICE -void cp_async_bulk_tensor_3d_shared_to_global( - 
const CUtensorMap *__tensor_map, int __c0, int __c1, int __c2, const void *__src)
+inline _CCCL_DEVICE void cp_async_bulk_tensor_3d_shared_to_global(
+  const CUtensorMap* __tensor_map, int __c0, int __c1, int __c2, const void* __src)
 {
-    const _CUDA_VSTD::int32_t __coords[]{__c0, __c1, __c2};
+  const _CUDA_VSTD::int32_t __coords[]{__c0, __c1, __c2};
 
-    _CUDA_VPTX::cp_async_bulk_tensor(
-        _CUDA_VPTX::space_global, _CUDA_VPTX::space_shared,
-        __tensor_map, __coords, __src);
+  _CUDA_VPTX::cp_async_bulk_tensor(_CUDA_VPTX::space_global, _CUDA_VPTX::space_shared, __tensor_map, __coords, __src);
 }
 
 // https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#data-movement-and-conversion-instructions-cp-async-bulk-tensor
-inline _CCCL_DEVICE
-void cp_async_bulk_tensor_4d_shared_to_global(
-    const CUtensorMap *__tensor_map, int __c0, int __c1, int __c2, int __c3, const void *__src)
+inline _CCCL_DEVICE void cp_async_bulk_tensor_4d_shared_to_global(
+  const CUtensorMap* __tensor_map, int __c0, int __c1, int __c2, int __c3, const void* __src)
 {
-    const _CUDA_VSTD::int32_t __coords[]{__c0, __c1, __c2, __c3};
+  const _CUDA_VSTD::int32_t __coords[]{__c0, __c1, __c2, __c3};
 
-    _CUDA_VPTX::cp_async_bulk_tensor(
-        _CUDA_VPTX::space_global, _CUDA_VPTX::space_shared,
-        __tensor_map, __coords, __src);
+  _CUDA_VPTX::cp_async_bulk_tensor(_CUDA_VPTX::space_global, _CUDA_VPTX::space_shared, __tensor_map, __coords, __src);
 }
 
 // https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#data-movement-and-conversion-instructions-cp-async-bulk-tensor
-inline _CCCL_DEVICE
-void cp_async_bulk_tensor_5d_shared_to_global(
-    const CUtensorMap *__tensor_map, int __c0, int __c1, int __c2, int __c3, int __c4, const void *__src)
+inline _CCCL_DEVICE void cp_async_bulk_tensor_5d_shared_to_global(
+  const CUtensorMap* __tensor_map, int __c0, int __c1, int __c2, int __c3, int __c4, const void* __src)
 {
-    const _CUDA_VSTD::int32_t __coords[]{__c0, __c1, __c2, __c3, __c4};
+  const _CUDA_VSTD::int32_t __coords[]{__c0, __c1, __c2, __c3, __c4};
 
-    _CUDA_VPTX::cp_async_bulk_tensor(
-        _CUDA_VPTX::space_global, _CUDA_VPTX::space_shared,
-        __tensor_map, __coords, __src);
+  _CUDA_VPTX::cp_async_bulk_tensor(_CUDA_VPTX::space_global, _CUDA_VPTX::space_shared, __tensor_map, __coords, __src);
 }
 
 // https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#parallel-synchronization-and-communication-instructions-membar
-inline _CCCL_DEVICE
-void fence_proxy_async_shared_cta() {
-    _CUDA_VPTX::fence_proxy_async(_CUDA_VPTX::space_shared);
+inline _CCCL_DEVICE void fence_proxy_async_shared_cta()
+{
+  _CUDA_VPTX::fence_proxy_async(_CUDA_VPTX::space_shared);
 }
 
 // https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#data-movement-and-conversion-instructions-cp-async-bulk-commit-group
-inline _CCCL_DEVICE
-void cp_async_bulk_commit_group()
+inline _CCCL_DEVICE void cp_async_bulk_commit_group()
 {
-    _CUDA_VPTX::cp_async_bulk_commit_group();
+  _CUDA_VPTX::cp_async_bulk_commit_group();
 }
 
 // https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#data-movement-and-conversion-instructions-cp-async-bulk-wait-group
 template
-inline _CCCL_DEVICE
-void cp_async_bulk_wait_group_read()
+inline _CCCL_DEVICE void cp_async_bulk_wait_group_read()
 {
   static_assert(__n_prior <= 63, "cp_async_bulk_wait_group_read: waiting for more than 63 groups is not supported.");
   _CUDA_VPTX::cp_async_bulk_wait_group_read(_CUDA_VPTX::n32_t<__n_prior>{});
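(Illustrative sketch, not part of this diff: how the experimental cuda::device::experimental bulk-copy helpers in the cuda/barrier hunk above are typically combined with a block-scoped barrier on sm_90, following the pattern documented in the CUDA programming guide. The kernel and buffer names are hypothetical; gdata is assumed to be a 16-byte aligned global buffer with at least 1024 ints per block.)

#include <cuda/barrier>

namespace cde       = cuda::device::experimental;
using block_barrier = cuda::barrier<cuda::thread_scope_block>;

__global__ void bulk_add_one(int* gdata)
{
  __shared__ alignas(16) int smem[1024];
#pragma nv_diag_suppress static_var_with_dynamic_init
  __shared__ block_barrier bar;
  if (threadIdx.x == 0)
  {
    init(&bar, blockDim.x);
    cde::fence_proxy_async_shared_cta(); // make the initialized barrier visible to the async proxy
  }
  __syncthreads();

  block_barrier::arrival_token token;
  if (threadIdx.x == 0)
  {
    // One thread issues the bulk copy and accounts for the expected transaction bytes.
    cde::cp_async_bulk_global_to_shared(smem, gdata + blockIdx.x * 1024, sizeof(smem), bar);
    token = cuda::device::barrier_arrive_tx(bar, 1, sizeof(smem));
  }
  else
  {
    token = bar.arrive();
  }
  bar.wait(cuda::std::move(token));

  for (int i = threadIdx.x; i < 1024; i += blockDim.x)
  {
    smem[i] += 1;
  }

  cde::fence_proxy_async_shared_cta(); // order shared-memory writes before the bulk copy out
  __syncthreads();

  if (threadIdx.x == 0)
  {
    cde::cp_async_bulk_shared_to_global(gdata + blockIdx.x * 1024, smem, sizeof(smem));
    cde::cp_async_bulk_commit_group();
    cde::cp_async_bulk_wait_group_read<0>();
  }
}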
diff --git a/libcudacxx/include/cuda/discard_memory b/libcudacxx/include/cuda/discard_memory
index cc4963874ae..6da2ea209c4 100644
--- a/libcudacxx/include/cuda/discard_memory
+++ b/libcudacxx/include/cuda/discard_memory
@@ -36,14 +36,14 @@ inline _CCCL_HOST_DEVICE void discard_memory(volatile void* __ptr, size_t __nbyt
     NV_PROVIDES_SM_80,
     (if (!__isGlobal((void*) __ptr)) return;
 
-     char* __p = reinterpret_cast<char*>(const_cast<void*>(__ptr));
-     char* const __end_p = __p + __nbytes;
+     char* __p           = reinterpret_cast<char*>(const_cast<void*>(__ptr));
+     char* const __end_p = __p + __nbytes;
 
      static constexpr size_t _LINE_SIZE = 128;
 
      // Trim the first block and last block if they're not 128 bytes aligned
-     size_t __misalignment = reinterpret_cast<uintptr_t>(__p) % _LINE_SIZE;
-     char* __start_aligned = __misalignment == 0 ? __p : __p + (_LINE_SIZE - __misalignment);
-     char* const __end_aligned = __end_p - (reinterpret_cast<uintptr_t>(__end_p) % _LINE_SIZE);
+     size_t __misalignment     = reinterpret_cast<uintptr_t>(__p) % _LINE_SIZE;
+     char* __start_aligned     = __misalignment == 0 ? __p : __p + (_LINE_SIZE - __misalignment);
+     char* const __end_aligned = __end_p - (reinterpret_cast<uintptr_t>(__end_p) % _LINE_SIZE);
 
      while (__start_aligned < __end_aligned) {
        asm volatile("discard.global.L2 [%0], 128;" ::"l"(__start_aligned) :);
diff --git a/libcudacxx/include/cuda/functional b/libcudacxx/include/cuda/functional
index 7820c8352cc..d88472f50a0 100644
--- a/libcudacxx/include/cuda/functional
+++ b/libcudacxx/include/cuda/functional
@@ -4,50 +4,128 @@
  *
  * NVIDIA SOFTWARE LICENSE
  *
- * This license is a legal agreement between you and NVIDIA Corporation ("NVIDIA") and governs your use of the NVIDIA/CUDA C++ Library software and materials provided hereunder (“SOFTWARE”).
+ * This license is a legal agreement between you and NVIDIA Corporation ("NVIDIA") and governs your use of the
+ * NVIDIA/CUDA C++ Library software and materials provided hereunder (“SOFTWARE”).
  *
- * This license can be accepted only by an adult of legal age of majority in the country in which the SOFTWARE is used. If you are under the legal age of majority, you must ask your parent or legal guardian to consent to this license. By taking delivery of the SOFTWARE, you affirm that you have reached the legal age of majority, you accept the terms of this license, and you take legal and financial responsibility for the actions of your permitted users.
+ * This license can be accepted only by an adult of legal age of majority in the country in which the SOFTWARE is used.
+ * If you are under the legal age of majority, you must ask your parent or legal guardian to consent to this license. By
+ * taking delivery of the SOFTWARE, you affirm that you have reached the legal age of majority, you accept the terms of
+ * this license, and you take legal and financial responsibility for the actions of your permitted users.
  *
- * You agree to use the SOFTWARE only for purposes that are permitted by (a) this license, and (b) any applicable law, regulation or generally accepted practices or guidelines in the relevant jurisdictions.
+ * You agree to use the SOFTWARE only for purposes that are permitted by (a) this license, and (b) any applicable law,
+ * regulation or generally accepted practices or guidelines in the relevant jurisdictions.
  *
- * 1. LICENSE. Subject to the terms of this license, NVIDIA grants you a non-exclusive limited license to: (a) install and use the SOFTWARE, and (b) distribute the SOFTWARE subject to the distribution requirements described in this license. NVIDIA reserves all rights, title and interest in and to the SOFTWARE not expressly granted to you under this license.
+ * 1. LICENSE.
Subject to the terms of this license, NVIDIA grants you a non-exclusive limited license to: (a) install + * and use the SOFTWARE, and (b) distribute the SOFTWARE subject to the distribution requirements described in this + * license. NVIDIA reserves all rights, title and interest in and to the SOFTWARE not expressly granted to you under + * this license. * * 2. DISTRIBUTION REQUIREMENTS. These are the distribution requirements for you to exercise the distribution grant: - * a. The terms under which you distribute the SOFTWARE must be consistent with the terms of this license, including (without limitation) terms relating to the license grant and license restrictions and protection of NVIDIA’s intellectual property rights. - * b. You agree to notify NVIDIA in writing of any known or suspected distribution or use of the SOFTWARE not in compliance with the requirements of this license, and to enforce the terms of your agreements with respect to distributed SOFTWARE. + * a. The terms under which you distribute the SOFTWARE must be consistent with the terms of this license, + * including (without limitation) terms relating to the license grant and license restrictions and protection of + * NVIDIA’s intellectual property rights. b. You agree to notify NVIDIA in writing of any known or suspected + * distribution or use of the SOFTWARE not in compliance with the requirements of this license, and to enforce the terms + * of your agreements with respect to distributed SOFTWARE. * * 3. LIMITATIONS. Your license to use the SOFTWARE is restricted as follows: * a. The SOFTWARE is licensed for you to develop applications only for use in systems with NVIDIA GPUs. - * b. You may not reverse engineer, decompile or disassemble, or remove copyright or other proprietary notices from any portion of the SOFTWARE or copies of the SOFTWARE. - * c. You may not modify or create derivative works of any portion of the SOFTWARE. - * d. You may not bypass, disable, or circumvent any technical measure, encryption, security, digital rights management or authentication mechanism in the SOFTWARE. - * e. You may not use the SOFTWARE in any manner that would cause it to become subject to an open source software license. As examples, licenses that require as a condition of use, modification, and/or distribution that the SOFTWARE be (i) disclosed or distributed in source code form; (ii) licensed for the purpose of making derivative works; or (iii) redistributable at no charge. - * f. Unless you have an agreement with NVIDIA for this purpose, you may not use the SOFTWARE with any system or application where the use or failure of the system or application can reasonably be expected to threaten or result in personal injury, death, or catastrophic loss. Examples include use in avionics, navigation, military, medical, life support or other life critical applications. NVIDIA does not design, test or manufacture the SOFTWARE for these critical uses and NVIDIA shall not be liable to you or any third party, in whole or in part, for any claims or damages arising from such uses. - * g. 
You agree to defend, indemnify and hold harmless NVIDIA and its affiliates, and their respective employees, contractors, agents, officers and directors, from and against any and all claims, damages, obligations, losses, liabilities, costs or debt, fines, restitutions and expenses (including but not limited to attorney’s fees and costs incident to establishing the right of indemnification) arising out of or related to use of the SOFTWARE outside of the scope of this Agreement, or not in compliance with its terms. - * - * 4. PRE-RELEASE. SOFTWARE versions identified as alpha, beta, preview, early access or otherwise as pre-release may not be fully functional, may contain errors or design flaws, and may have reduced or different security, privacy, availability, and reliability standards relative to commercial versions of NVIDIA software and materials. You may use a pre-release SOFTWARE version at your own risk, understanding that these versions are not intended for use in production or business-critical systems. - * - * 5. OWNERSHIP. The SOFTWARE and the related intellectual property rights therein are and will remain the sole and exclusive property of NVIDIA or its licensors. The SOFTWARE is copyrighted and protected by the laws of the United States and other countries, and international treaty provisions. NVIDIA may make changes to the SOFTWARE, at any time without notice, but is not obligated to support or update the SOFTWARE. - * - * 6. COMPONENTS UNDER OTHER LICENSES. The SOFTWARE may include NVIDIA or third-party components with separate legal notices or terms as may be described in proprietary notices accompanying the SOFTWARE. If and to the extent there is a conflict between the terms in this license and the license terms associated with a component, the license terms associated with the components control only to the extent necessary to resolve the conflict. - * - * 7. FEEDBACK. You may, but don’t have to, provide to NVIDIA any Feedback. “Feedback” means any suggestions, bug fixes, enhancements, modifications, feature requests or other feedback regarding the SOFTWARE. For any Feedback that you voluntarily provide, you hereby grant NVIDIA and its affiliates a perpetual, non-exclusive, worldwide, irrevocable license to use, reproduce, modify, license, sublicense (through multiple tiers of sublicensees), and distribute (through multiple tiers of distributors) the Feedback without the payment of any royalties or fees to you. NVIDIA will use Feedback at its choice. - * - * 8. NO WARRANTIES. THE SOFTWARE IS PROVIDED "AS IS" WITHOUT ANY EXPRESS OR IMPLIED WARRANTY OF ANY KIND INCLUDING, BUT NOT LIMITED TO, WARRANTIES OF MERCHANTABILITY, NONINFRINGEMENT, OR FITNESS FOR A PARTICULAR PURPOSE. NVIDIA DOES NOT WARRANT THAT THE SOFTWARE WILL MEET YOUR REQUIREMENTS OR THAT THE OPERATION THEREOF WILL BE UNINTERRUPTED OR ERROR-FREE, OR THAT ALL ERRORS WILL BE CORRECTED. - * - * 9. LIMITATIONS OF LIABILITY. 
TO THE MAXIMUM EXTENT PERMITTED BY LAW, NVIDIA AND ITS AFFILIATES SHALL NOT BE LIABLE FOR ANY SPECIAL, INCIDENTAL, PUNITIVE OR CONSEQUENTIAL DAMAGES, OR ANY LOST PROFITS, PROJECT DELAYS, LOSS OF USE, LOSS OF DATA OR LOSS OF GOODWILL, OR THE COSTS OF PROCURING SUBSTITUTE PRODUCTS, ARISING OUT OF OR IN CONNECTION WITH THIS LICENSE OR THE USE OR PERFORMANCE OF THE SOFTWARE, WHETHER SUCH LIABILITY ARISES FROM ANY CLAIM BASED UPON BREACH OF CONTRACT, BREACH OF WARRANTY, TORT (INCLUDING NEGLIGENCE), PRODUCT LIABILITY OR ANY OTHER CAUSE OF ACTION OR THEORY OF LIABILITY, EVEN IF NVIDIA HAS PREVIOUSLY BEEN ADVISED OF, OR COULD REASONABLY HAVE FORESEEN, THE POSSIBILITY OF SUCH DAMAGES. IN NO EVENT WILL NVIDIA’S AND ITS AFFILIATES TOTAL CUMULATIVE LIABILITY UNDER OR ARISING OUT OF THIS LICENSE EXCEED US$10.00. THE NATURE OF THE LIABILITY OR THE NUMBER OF CLAIMS OR SUITS SHALL NOT ENLARGE OR EXTEND THIS LIMIT. - * - * 10. TERMINATION. Your rights under this license will terminate automatically without notice from NVIDIA if you fail to comply with any term and condition of this license or if you commence or participate in any legal proceeding against NVIDIA with respect to the SOFTWARE. NVIDIA may terminate this license with advance written notice to you if NVIDIA decides to no longer provide the SOFTWARE in a country or, in NVIDIA’s sole discretion, the continued use of it is no longer commercially viable. Upon any termination of this license, you agree to promptly discontinue use of the SOFTWARE and destroy all copies in your possession or control. Your prior distributions in accordance with this license are not affected by the termination of this license. All provisions of this license will survive termination, except for the license granted to you. - * - * 11. APPLICABLE LAW. This license will be governed in all respects by the laws of the United States and of the State of Delaware as those laws are applied to contracts entered into and performed entirely within Delaware by Delaware residents, without regard to the conflicts of laws principles. The United Nations Convention on Contracts for the International Sale of Goods is specifically disclaimed. You agree to all terms of this Agreement in the English language. The state or federal courts residing in Santa Clara County, California shall have exclusive jurisdiction over any dispute or claim arising out of this license. Notwithstanding this, you agree that NVIDIA shall still be allowed to apply for injunctive remedies or an equivalent type of urgent legal relief in any jurisdiction. - * - * 12. NO ASSIGNMENT. This license and your rights and obligations thereunder may not be assigned by you by any means or operation of law without NVIDIA’s permission. Any attempted assignment not approved by NVIDIA in writing shall be void and of no effect. - * - * 13. EXPORT. The SOFTWARE is subject to United States export laws and regulations. You agree that you will not ship, transfer or export the SOFTWARE into any country, or use the SOFTWARE in any manner, prohibited by the United States Bureau of Industry and Security or economic sanctions regulations administered by the U.S. Department of Treasury’s Office of Foreign Assets Control (OFAC), or any applicable export laws, restrictions or regulations. These laws include restrictions on destinations, end users and end use. By accepting this license, you confirm that you are not a resident or citizen of any country currently embargoed by the U.S. 
and that you are not otherwise prohibited from receiving the SOFTWARE. - * - * 14. GOVERNMENT USE. The SOFTWARE has been developed entirely at private expense and is “commercial items” consisting of “commercial computer software” and “commercial computer software documentation” provided with RESTRICTED RIGHTS. Use, duplication or disclosure by the U.S. Government or a U.S. Government subcontractor is subject to the restrictions in this license pursuant to DFARS 227.7202-3(a) or as set forth in subparagraphs (b)(1) and (2) of the Commercial Computer Software - Restricted Rights clause at FAR 52.227-19, as applicable. Contractor/manufacturer is NVIDIA, 2788 San Tomas Expressway, Santa Clara, CA 95051. - * - * 15. ENTIRE AGREEMENT. This license is the final, complete and exclusive agreement between the parties relating to the subject matter of this license and supersedes all prior or contemporaneous understandings and agreements relating to this subject matter, whether oral or written. If any court of competent jurisdiction determines that any provision of this license is illegal, invalid or unenforceable, the remaining provisions will remain in full force and effect. This license may only be modified in a writing signed by an authorized representative of each party. + * b. You may not reverse engineer, decompile or disassemble, or remove copyright or other proprietary notices from + * any portion of the SOFTWARE or copies of the SOFTWARE. c. You may not modify or create derivative works of any + * portion of the SOFTWARE. d. You may not bypass, disable, or circumvent any technical measure, encryption, + * security, digital rights management or authentication mechanism in the SOFTWARE. e. You may not use the SOFTWARE + * in any manner that would cause it to become subject to an open source software license. As examples, licenses that + * require as a condition of use, modification, and/or distribution that the SOFTWARE be (i) disclosed or distributed in + * source code form; (ii) licensed for the purpose of making derivative works; or (iii) redistributable at no charge. f. + * Unless you have an agreement with NVIDIA for this purpose, you may not use the SOFTWARE with any system or + * application where the use or failure of the system or application can reasonably be expected to threaten or result in + * personal injury, death, or catastrophic loss. Examples include use in avionics, navigation, military, medical, life + * support or other life critical applications. NVIDIA does not design, test or manufacture the SOFTWARE for these + * critical uses and NVIDIA shall not be liable to you or any third party, in whole or in part, for any claims or + * damages arising from such uses. g. You agree to defend, indemnify and hold harmless NVIDIA and its affiliates, + * and their respective employees, contractors, agents, officers and directors, from and against any and all claims, + * damages, obligations, losses, liabilities, costs or debt, fines, restitutions and expenses (including but not limited + * to attorney’s fees and costs incident to establishing the right of indemnification) arising out of or related to use + * of the SOFTWARE outside of the scope of this Agreement, or not in compliance with its terms. + * + * 4. PRE-RELEASE. 
SOFTWARE versions identified as alpha, beta, preview, early access or otherwise as pre-release may + * not be fully functional, may contain errors or design flaws, and may have reduced or different security, privacy, + * availability, and reliability standards relative to commercial versions of NVIDIA software and materials. You may use + * a pre-release SOFTWARE version at your own risk, understanding that these versions are not intended for use in + * production or business-critical systems. + * + * 5. OWNERSHIP. The SOFTWARE and the related intellectual property rights therein are and will remain the sole and + * exclusive property of NVIDIA or its licensors. The SOFTWARE is copyrighted and protected by the laws of the United + * States and other countries, and international treaty provisions. NVIDIA may make changes to the SOFTWARE, at any time + * without notice, but is not obligated to support or update the SOFTWARE. + * + * 6. COMPONENTS UNDER OTHER LICENSES. The SOFTWARE may include NVIDIA or third-party components with separate legal + * notices or terms as may be described in proprietary notices accompanying the SOFTWARE. If and to the extent there is + * a conflict between the terms in this license and the license terms associated with a component, the license terms + * associated with the components control only to the extent necessary to resolve the conflict. + * + * 7. FEEDBACK. You may, but don’t have to, provide to NVIDIA any Feedback. “Feedback” means any suggestions, bug fixes, + * enhancements, modifications, feature requests or other feedback regarding the SOFTWARE. For any Feedback that you + * voluntarily provide, you hereby grant NVIDIA and its affiliates a perpetual, non-exclusive, worldwide, irrevocable + * license to use, reproduce, modify, license, sublicense (through multiple tiers of sublicensees), and distribute + * (through multiple tiers of distributors) the Feedback without the payment of any royalties or fees to you. NVIDIA + * will use Feedback at its choice. + * + * 8. NO WARRANTIES. THE SOFTWARE IS PROVIDED "AS IS" WITHOUT ANY EXPRESS OR IMPLIED WARRANTY OF ANY KIND INCLUDING, BUT + * NOT LIMITED TO, WARRANTIES OF MERCHANTABILITY, NONINFRINGEMENT, OR FITNESS FOR A PARTICULAR PURPOSE. NVIDIA DOES NOT + * WARRANT THAT THE SOFTWARE WILL MEET YOUR REQUIREMENTS OR THAT THE OPERATION THEREOF WILL BE UNINTERRUPTED OR + * ERROR-FREE, OR THAT ALL ERRORS WILL BE CORRECTED. + * + * 9. LIMITATIONS OF LIABILITY. TO THE MAXIMUM EXTENT PERMITTED BY LAW, NVIDIA AND ITS AFFILIATES SHALL NOT BE LIABLE + * FOR ANY SPECIAL, INCIDENTAL, PUNITIVE OR CONSEQUENTIAL DAMAGES, OR ANY LOST PROFITS, PROJECT DELAYS, LOSS OF USE, + * LOSS OF DATA OR LOSS OF GOODWILL, OR THE COSTS OF PROCURING SUBSTITUTE PRODUCTS, ARISING OUT OF OR IN CONNECTION WITH + * THIS LICENSE OR THE USE OR PERFORMANCE OF THE SOFTWARE, WHETHER SUCH LIABILITY ARISES FROM ANY CLAIM BASED UPON + * BREACH OF CONTRACT, BREACH OF WARRANTY, TORT (INCLUDING NEGLIGENCE), PRODUCT LIABILITY OR ANY OTHER CAUSE OF ACTION + * OR THEORY OF LIABILITY, EVEN IF NVIDIA HAS PREVIOUSLY BEEN ADVISED OF, OR COULD REASONABLY HAVE FORESEEN, THE + * POSSIBILITY OF SUCH DAMAGES. IN NO EVENT WILL NVIDIA’S AND ITS AFFILIATES TOTAL CUMULATIVE LIABILITY UNDER OR ARISING + * OUT OF THIS LICENSE EXCEED US$10.00. THE NATURE OF THE LIABILITY OR THE NUMBER OF CLAIMS OR SUITS SHALL NOT ENLARGE + * OR EXTEND THIS LIMIT. + * + * 10. TERMINATION. 
Your rights under this license will terminate automatically without notice from NVIDIA if you fail + * to comply with any term and condition of this license or if you commence or participate in any legal proceeding + * against NVIDIA with respect to the SOFTWARE. NVIDIA may terminate this license with advance written notice to you if + * NVIDIA decides to no longer provide the SOFTWARE in a country or, in NVIDIA’s sole discretion, the continued use of + * it is no longer commercially viable. Upon any termination of this license, you agree to promptly discontinue use of + * the SOFTWARE and destroy all copies in your possession or control. Your prior distributions in accordance with this + * license are not affected by the termination of this license. All provisions of this license will survive termination, + * except for the license granted to you. + * + * 11. APPLICABLE LAW. This license will be governed in all respects by the laws of the United States and of the State + * of Delaware as those laws are applied to contracts entered into and performed entirely within Delaware by Delaware + * residents, without regard to the conflicts of laws principles. The United Nations Convention on Contracts for the + * International Sale of Goods is specifically disclaimed. You agree to all terms of this Agreement in the English + * language. The state or federal courts residing in Santa Clara County, California shall have exclusive jurisdiction + * over any dispute or claim arising out of this license. Notwithstanding this, you agree that NVIDIA shall still be + * allowed to apply for injunctive remedies or an equivalent type of urgent legal relief in any jurisdiction. + * + * 12. NO ASSIGNMENT. This license and your rights and obligations thereunder may not be assigned by you by any means or + * operation of law without NVIDIA’s permission. Any attempted assignment not approved by NVIDIA in writing shall be + * void and of no effect. + * + * 13. EXPORT. The SOFTWARE is subject to United States export laws and regulations. You agree that you will not ship, + * transfer or export the SOFTWARE into any country, or use the SOFTWARE in any manner, prohibited by the United States + * Bureau of Industry and Security or economic sanctions regulations administered by the U.S. Department of Treasury’s + * Office of Foreign Assets Control (OFAC), or any applicable export laws, restrictions or regulations. These laws + * include restrictions on destinations, end users and end use. By accepting this license, you confirm that you are not + * a resident or citizen of any country currently embargoed by the U.S. and that you are not otherwise prohibited from + * receiving the SOFTWARE. + * + * 14. GOVERNMENT USE. The SOFTWARE has been developed entirely at private expense and is “commercial items” consisting + * of “commercial computer software” and “commercial computer software documentation” provided with RESTRICTED RIGHTS. + * Use, duplication or disclosure by the U.S. Government or a U.S. Government subcontractor is subject to the + * restrictions in this license pursuant to DFARS 227.7202-3(a) or as set forth in subparagraphs (b)(1) and (2) of the + * Commercial Computer Software - Restricted Rights clause at FAR 52.227-19, as applicable. Contractor/manufacturer is + * NVIDIA, 2788 San Tomas Expressway, Santa Clara, CA 95051. + * + * 15. ENTIRE AGREEMENT. 
This license is the final, complete and exclusive agreement between the parties relating to the + * subject matter of this license and supersedes all prior or contemporaneous understandings and agreements relating to + * this subject matter, whether oral or written. If any court of competent jurisdiction determines that any provision of + * this license is illegal, invalid or unenforceable, the remaining provisions will remain in full force and effect. + * This license may only be modified in a writing signed by an authorized representative of each party. * * (v. August 20, 2021) */ @@ -65,101 +143,83 @@ # pragma system_header #endif // no system header -#include +#include +#include #include +#include #include -#include -#include _LIBCUDACXX_BEGIN_NAMESPACE_CUDA namespace __detail { template -class __return_type_wrapper { - private: +class __return_type_wrapper +{ +private: _DecayFn __fn_; - public: +public: __return_type_wrapper() = delete; template , _DecayFn>::value>> - _LIBCUDACXX_INLINE_VISIBILITY _CCCL_CONSTEXPR_CXX14 - explicit __return_type_wrapper(_Fn &&__fn) noexcept - : __fn_(_CUDA_VSTD::forward<_Fn>(__fn)) {} + _LIBCUDACXX_INLINE_VISIBILITY _CCCL_CONSTEXPR_CXX14 explicit __return_type_wrapper(_Fn&& __fn) noexcept + : __fn_(_CUDA_VSTD::forward<_Fn>(__fn)) + {} template - _LIBCUDACXX_INLINE_VISIBILITY _CCCL_CONSTEXPR_CXX14 - _Ret operator()(_As&&... __as) & noexcept { + _LIBCUDACXX_INLINE_VISIBILITY _CCCL_CONSTEXPR_CXX14 _Ret operator()(_As&&... __as) & noexcept + { #if !defined(__NVCC__) || defined(__CUDA_ARCH__) - static_assert( - _CUDA_VSTD::is_same< - _Ret, - typename _CUDA_VSTD::__invoke_of<_DecayFn&, _As...>::type - >::value, - "Return type shall match the proclaimed one exactly"); + static_assert(_CUDA_VSTD::is_same<_Ret, typename _CUDA_VSTD::__invoke_of<_DecayFn&, _As...>::type>::value, + "Return type shall match the proclaimed one exactly"); #endif return _CUDA_VSTD::__invoke(__fn_, _CUDA_VSTD::forward<_As>(__as)...); } template - _LIBCUDACXX_INLINE_VISIBILITY _CCCL_CONSTEXPR_CXX14 - _Ret operator()(_As&&... __as) && noexcept { + _LIBCUDACXX_INLINE_VISIBILITY _CCCL_CONSTEXPR_CXX14 _Ret operator()(_As&&... __as) && noexcept + { #if !defined(__NVCC__) || defined(__CUDA_ARCH__) - static_assert( - _CUDA_VSTD::is_same< - _Ret, - typename _CUDA_VSTD::__invoke_of<_DecayFn, _As...>::type - >::value, - "Return type shall match the proclaimed one exactly"); + static_assert(_CUDA_VSTD::is_same<_Ret, typename _CUDA_VSTD::__invoke_of<_DecayFn, _As...>::type>::value, + "Return type shall match the proclaimed one exactly"); #endif - return _CUDA_VSTD::__invoke(_CUDA_VSTD::move(__fn_), - _CUDA_VSTD::forward<_As>(__as)...); + return _CUDA_VSTD::__invoke(_CUDA_VSTD::move(__fn_), _CUDA_VSTD::forward<_As>(__as)...); } template - _LIBCUDACXX_INLINE_VISIBILITY _CCCL_CONSTEXPR_CXX14 - _Ret operator()(_As&&... __as) const& noexcept { + _LIBCUDACXX_INLINE_VISIBILITY _CCCL_CONSTEXPR_CXX14 _Ret operator()(_As&&... __as) const& noexcept + { #if !defined(__NVCC__) || defined(__CUDA_ARCH__) - static_assert( - _CUDA_VSTD::is_same< - _Ret, - typename _CUDA_VSTD::__invoke_of::type - >::value, - "Return type shall match the proclaimed one exactly"); + static_assert(_CUDA_VSTD::is_same<_Ret, typename _CUDA_VSTD::__invoke_of::type>::value, + "Return type shall match the proclaimed one exactly"); #endif return _CUDA_VSTD::__invoke(__fn_, _CUDA_VSTD::forward<_As>(__as)...); } template - _LIBCUDACXX_INLINE_VISIBILITY _CCCL_CONSTEXPR_CXX14 - _Ret operator()(_As&&... 
__as) const&& noexcept { + _LIBCUDACXX_INLINE_VISIBILITY _CCCL_CONSTEXPR_CXX14 _Ret operator()(_As&&... __as) const&& noexcept + { #if !defined(__NVCC__) || defined(__CUDA_ARCH__) - static_assert( - _CUDA_VSTD::is_same< - _Ret, - typename _CUDA_VSTD::__invoke_of::type - >::value, - "Return type shall match the proclaimed one exactly"); + static_assert(_CUDA_VSTD::is_same<_Ret, typename _CUDA_VSTD::__invoke_of::type>::value, + "Return type shall match the proclaimed one exactly"); #endif - return _CUDA_VSTD::__invoke(_CUDA_VSTD::move(__fn_), - _CUDA_VSTD::forward<_As>(__as)...); + return _CUDA_VSTD::__invoke(_CUDA_VSTD::move(__fn_), _CUDA_VSTD::forward<_As>(__as)...); } }; -} // __detail +} // namespace __detail template -inline _LIBCUDACXX_INLINE_VISIBILITY -__detail::__return_type_wrapper<_Ret, _CUDA_VSTD::__decay_t<_Fn>> -proclaim_return_type(_Fn&& __fn) noexcept { - return __detail::__return_type_wrapper<_Ret, _CUDA_VSTD::__decay_t<_Fn>>( - _CUDA_VSTD::forward<_Fn>(__fn)); +inline _LIBCUDACXX_INLINE_VISIBILITY __detail::__return_type_wrapper<_Ret, _CUDA_VSTD::__decay_t<_Fn>> +proclaim_return_type(_Fn&& __fn) noexcept +{ + return __detail::__return_type_wrapper<_Ret, _CUDA_VSTD::__decay_t<_Fn>>(_CUDA_VSTD::forward<_Fn>(__fn)); } _LIBCUDACXX_END_NAMESPACE_CUDA diff --git a/libcudacxx/include/cuda/memory_resource b/libcudacxx/include/cuda/memory_resource index a138995aa5f..4c23140f8db 100644 --- a/libcudacxx/include/cuda/memory_resource +++ b/libcudacxx/include/cuda/memory_resource @@ -80,17 +80,18 @@ class resource_ref { */ // clang-format on -# include // cuda_runtime_api needs to come first - -# include "__cccl_config" - -# if defined(_CCCL_IMPLICIT_SYSTEM_HEADER_GCC) -# pragma GCC system_header -# elif defined(_CCCL_IMPLICIT_SYSTEM_HEADER_CLANG) -# pragma clang system_header -# elif defined(_CCCL_IMPLICIT_SYSTEM_HEADER_MSVC) -# pragma system_header -# endif // no system header +#include +// cuda_runtime_api needs to come first + +#include + +#if defined(_CCCL_IMPLICIT_SYSTEM_HEADER_GCC) +# pragma GCC system_header +#elif defined(_CCCL_IMPLICIT_SYSTEM_HEADER_CLANG) +# pragma clang system_header +#elif defined(_CCCL_IMPLICIT_SYSTEM_HEADER_MSVC) +# pragma system_header +#endif // no system header #include #include diff --git a/libcudacxx/include/cuda/pipeline b/libcudacxx/include/cuda/pipeline index 509dfd65cbe..583a6fb6c72 100644 --- a/libcudacxx/include/cuda/pipeline +++ b/libcudacxx/include/cuda/pipeline @@ -3,50 +3,128 @@ * * NVIDIA SOFTWARE LICENSE * - * This license is a legal agreement between you and NVIDIA Corporation ("NVIDIA") and governs your use of the NVIDIA/CUDA C++ Library software and materials provided hereunder (“SOFTWARE”). + * This license is a legal agreement between you and NVIDIA Corporation ("NVIDIA") and governs your use of the + * NVIDIA/CUDA C++ Library software and materials provided hereunder (“SOFTWARE”). * - * This license can be accepted only by an adult of legal age of majority in the country in which the SOFTWARE is used. If you are under the legal age of majority, you must ask your parent or legal guardian to consent to this license. By taking delivery of the SOFTWARE, you affirm that you have reached the legal age of majority, you accept the terms of this license, and you take legal and financial responsibility for the actions of your permitted users. + * This license can be accepted only by an adult of legal age of majority in the country in which the SOFTWARE is used. 
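For context, a minimal sketch of how the proclaim_return_type facility from <cuda/functional> (reformatted above) is used; the kernel, lambda, and variable names are illustrative assumptions.

#include <cuda/functional>

__device__ int result;

__global__ void proclaim_sketch()
{
  // Wrap a callable so its return type is stated explicitly; on device, the
  // wrapper static_asserts that the actual invocation result matches exactly.
  auto twice = cuda::proclaim_return_type<int>([](int x) { return x * 2; });
  result = twice(21); // 42
}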
+ * If you are under the legal age of majority, you must ask your parent or legal guardian to consent to this license. By + * taking delivery of the SOFTWARE, you affirm that you have reached the legal age of majority, you accept the terms of + * this license, and you take legal and financial responsibility for the actions of your permitted users. * - * You agree to use the SOFTWARE only for purposes that are permitted by (a) this license, and (b) any applicable law, regulation or generally accepted practices or guidelines in the relevant jurisdictions. + * You agree to use the SOFTWARE only for purposes that are permitted by (a) this license, and (b) any applicable law, + * regulation or generally accepted practices or guidelines in the relevant jurisdictions. * - * 1. LICENSE. Subject to the terms of this license, NVIDIA grants you a non-exclusive limited license to: (a) install and use the SOFTWARE, and (b) distribute the SOFTWARE subject to the distribution requirements described in this license. NVIDIA reserves all rights, title and interest in and to the SOFTWARE not expressly granted to you under this license. + * 1. LICENSE. Subject to the terms of this license, NVIDIA grants you a non-exclusive limited license to: (a) install + * and use the SOFTWARE, and (b) distribute the SOFTWARE subject to the distribution requirements described in this + * license. NVIDIA reserves all rights, title and interest in and to the SOFTWARE not expressly granted to you under + * this license. * * 2. DISTRIBUTION REQUIREMENTS. These are the distribution requirements for you to exercise the distribution grant: - * a. The terms under which you distribute the SOFTWARE must be consistent with the terms of this license, including (without limitation) terms relating to the license grant and license restrictions and protection of NVIDIA’s intellectual property rights. - * b. You agree to notify NVIDIA in writing of any known or suspected distribution or use of the SOFTWARE not in compliance with the requirements of this license, and to enforce the terms of your agreements with respect to distributed SOFTWARE. + * a. The terms under which you distribute the SOFTWARE must be consistent with the terms of this license, + * including (without limitation) terms relating to the license grant and license restrictions and protection of + * NVIDIA’s intellectual property rights. b. You agree to notify NVIDIA in writing of any known or suspected + * distribution or use of the SOFTWARE not in compliance with the requirements of this license, and to enforce the terms + * of your agreements with respect to distributed SOFTWARE. * * 3. LIMITATIONS. Your license to use the SOFTWARE is restricted as follows: * a. The SOFTWARE is licensed for you to develop applications only for use in systems with NVIDIA GPUs. - * b. You may not reverse engineer, decompile or disassemble, or remove copyright or other proprietary notices from any portion of the SOFTWARE or copies of the SOFTWARE. - * c. You may not modify or create derivative works of any portion of the SOFTWARE. - * d. You may not bypass, disable, or circumvent any technical measure, encryption, security, digital rights management or authentication mechanism in the SOFTWARE. - * e. You may not use the SOFTWARE in any manner that would cause it to become subject to an open source software license. 
As examples, licenses that require as a condition of use, modification, and/or distribution that the SOFTWARE be (i) disclosed or distributed in source code form; (ii) licensed for the purpose of making derivative works; or (iii) redistributable at no charge. - * f. Unless you have an agreement with NVIDIA for this purpose, you may not use the SOFTWARE with any system or application where the use or failure of the system or application can reasonably be expected to threaten or result in personal injury, death, or catastrophic loss. Examples include use in avionics, navigation, military, medical, life support or other life critical applications. NVIDIA does not design, test or manufacture the SOFTWARE for these critical uses and NVIDIA shall not be liable to you or any third party, in whole or in part, for any claims or damages arising from such uses. - * g. You agree to defend, indemnify and hold harmless NVIDIA and its affiliates, and their respective employees, contractors, agents, officers and directors, from and against any and all claims, damages, obligations, losses, liabilities, costs or debt, fines, restitutions and expenses (including but not limited to attorney’s fees and costs incident to establishing the right of indemnification) arising out of or related to use of the SOFTWARE outside of the scope of this Agreement, or not in compliance with its terms. + * b. You may not reverse engineer, decompile or disassemble, or remove copyright or other proprietary notices from + * any portion of the SOFTWARE or copies of the SOFTWARE. c. You may not modify or create derivative works of any + * portion of the SOFTWARE. d. You may not bypass, disable, or circumvent any technical measure, encryption, + * security, digital rights management or authentication mechanism in the SOFTWARE. e. You may not use the SOFTWARE + * in any manner that would cause it to become subject to an open source software license. As examples, licenses that + * require as a condition of use, modification, and/or distribution that the SOFTWARE be (i) disclosed or distributed in + * source code form; (ii) licensed for the purpose of making derivative works; or (iii) redistributable at no charge. f. + * Unless you have an agreement with NVIDIA for this purpose, you may not use the SOFTWARE with any system or + * application where the use or failure of the system or application can reasonably be expected to threaten or result in + * personal injury, death, or catastrophic loss. Examples include use in avionics, navigation, military, medical, life + * support or other life critical applications. NVIDIA does not design, test or manufacture the SOFTWARE for these + * critical uses and NVIDIA shall not be liable to you or any third party, in whole or in part, for any claims or + * damages arising from such uses. g. You agree to defend, indemnify and hold harmless NVIDIA and its affiliates, + * and their respective employees, contractors, agents, officers and directors, from and against any and all claims, + * damages, obligations, losses, liabilities, costs or debt, fines, restitutions and expenses (including but not limited + * to attorney’s fees and costs incident to establishing the right of indemnification) arising out of or related to use + * of the SOFTWARE outside of the scope of this Agreement, or not in compliance with its terms. * - * 4. PRE-RELEASE. 
SOFTWARE versions identified as alpha, beta, preview, early access or otherwise as pre-release may not be fully functional, may contain errors or design flaws, and may have reduced or different security, privacy, availability, and reliability standards relative to commercial versions of NVIDIA software and materials. You may use a pre-release SOFTWARE version at your own risk, understanding that these versions are not intended for use in production or business-critical systems. + * 4. PRE-RELEASE. SOFTWARE versions identified as alpha, beta, preview, early access or otherwise as pre-release may + * not be fully functional, may contain errors or design flaws, and may have reduced or different security, privacy, + * availability, and reliability standards relative to commercial versions of NVIDIA software and materials. You may use + * a pre-release SOFTWARE version at your own risk, understanding that these versions are not intended for use in + * production or business-critical systems. * - * 5. OWNERSHIP. The SOFTWARE and the related intellectual property rights therein are and will remain the sole and exclusive property of NVIDIA or its licensors. The SOFTWARE is copyrighted and protected by the laws of the United States and other countries, and international treaty provisions. NVIDIA may make changes to the SOFTWARE, at any time without notice, but is not obligated to support or update the SOFTWARE. + * 5. OWNERSHIP. The SOFTWARE and the related intellectual property rights therein are and will remain the sole and + * exclusive property of NVIDIA or its licensors. The SOFTWARE is copyrighted and protected by the laws of the United + * States and other countries, and international treaty provisions. NVIDIA may make changes to the SOFTWARE, at any time + * without notice, but is not obligated to support or update the SOFTWARE. * - * 6. COMPONENTS UNDER OTHER LICENSES. The SOFTWARE may include NVIDIA or third-party components with separate legal notices or terms as may be described in proprietary notices accompanying the SOFTWARE. If and to the extent there is a conflict between the terms in this license and the license terms associated with a component, the license terms associated with the components control only to the extent necessary to resolve the conflict. + * 6. COMPONENTS UNDER OTHER LICENSES. The SOFTWARE may include NVIDIA or third-party components with separate legal + * notices or terms as may be described in proprietary notices accompanying the SOFTWARE. If and to the extent there is + * a conflict between the terms in this license and the license terms associated with a component, the license terms + * associated with the components control only to the extent necessary to resolve the conflict. * - * 7. FEEDBACK. You may, but don’t have to, provide to NVIDIA any Feedback. “Feedback” means any suggestions, bug fixes, enhancements, modifications, feature requests or other feedback regarding the SOFTWARE. For any Feedback that you voluntarily provide, you hereby grant NVIDIA and its affiliates a perpetual, non-exclusive, worldwide, irrevocable license to use, reproduce, modify, license, sublicense (through multiple tiers of sublicensees), and distribute (through multiple tiers of distributors) the Feedback without the payment of any royalties or fees to you. NVIDIA will use Feedback at its choice. + * 7. FEEDBACK. You may, but don’t have to, provide to NVIDIA any Feedback. 
“Feedback” means any suggestions, bug fixes, + * enhancements, modifications, feature requests or other feedback regarding the SOFTWARE. For any Feedback that you + * voluntarily provide, you hereby grant NVIDIA and its affiliates a perpetual, non-exclusive, worldwide, irrevocable + * license to use, reproduce, modify, license, sublicense (through multiple tiers of sublicensees), and distribute + * (through multiple tiers of distributors) the Feedback without the payment of any royalties or fees to you. NVIDIA + * will use Feedback at its choice. * - * 8. NO WARRANTIES. THE SOFTWARE IS PROVIDED "AS IS" WITHOUT ANY EXPRESS OR IMPLIED WARRANTY OF ANY KIND INCLUDING, BUT NOT LIMITED TO, WARRANTIES OF MERCHANTABILITY, NONINFRINGEMENT, OR FITNESS FOR A PARTICULAR PURPOSE. NVIDIA DOES NOT WARRANT THAT THE SOFTWARE WILL MEET YOUR REQUIREMENTS OR THAT THE OPERATION THEREOF WILL BE UNINTERRUPTED OR ERROR-FREE, OR THAT ALL ERRORS WILL BE CORRECTED. + * 8. NO WARRANTIES. THE SOFTWARE IS PROVIDED "AS IS" WITHOUT ANY EXPRESS OR IMPLIED WARRANTY OF ANY KIND INCLUDING, BUT + * NOT LIMITED TO, WARRANTIES OF MERCHANTABILITY, NONINFRINGEMENT, OR FITNESS FOR A PARTICULAR PURPOSE. NVIDIA DOES NOT + * WARRANT THAT THE SOFTWARE WILL MEET YOUR REQUIREMENTS OR THAT THE OPERATION THEREOF WILL BE UNINTERRUPTED OR + * ERROR-FREE, OR THAT ALL ERRORS WILL BE CORRECTED. * - * 9. LIMITATIONS OF LIABILITY. TO THE MAXIMUM EXTENT PERMITTED BY LAW, NVIDIA AND ITS AFFILIATES SHALL NOT BE LIABLE FOR ANY SPECIAL, INCIDENTAL, PUNITIVE OR CONSEQUENTIAL DAMAGES, OR ANY LOST PROFITS, PROJECT DELAYS, LOSS OF USE, LOSS OF DATA OR LOSS OF GOODWILL, OR THE COSTS OF PROCURING SUBSTITUTE PRODUCTS, ARISING OUT OF OR IN CONNECTION WITH THIS LICENSE OR THE USE OR PERFORMANCE OF THE SOFTWARE, WHETHER SUCH LIABILITY ARISES FROM ANY CLAIM BASED UPON BREACH OF CONTRACT, BREACH OF WARRANTY, TORT (INCLUDING NEGLIGENCE), PRODUCT LIABILITY OR ANY OTHER CAUSE OF ACTION OR THEORY OF LIABILITY, EVEN IF NVIDIA HAS PREVIOUSLY BEEN ADVISED OF, OR COULD REASONABLY HAVE FORESEEN, THE POSSIBILITY OF SUCH DAMAGES. IN NO EVENT WILL NVIDIA’S AND ITS AFFILIATES TOTAL CUMULATIVE LIABILITY UNDER OR ARISING OUT OF THIS LICENSE EXCEED US$10.00. THE NATURE OF THE LIABILITY OR THE NUMBER OF CLAIMS OR SUITS SHALL NOT ENLARGE OR EXTEND THIS LIMIT. + * 9. LIMITATIONS OF LIABILITY. TO THE MAXIMUM EXTENT PERMITTED BY LAW, NVIDIA AND ITS AFFILIATES SHALL NOT BE LIABLE + * FOR ANY SPECIAL, INCIDENTAL, PUNITIVE OR CONSEQUENTIAL DAMAGES, OR ANY LOST PROFITS, PROJECT DELAYS, LOSS OF USE, + * LOSS OF DATA OR LOSS OF GOODWILL, OR THE COSTS OF PROCURING SUBSTITUTE PRODUCTS, ARISING OUT OF OR IN CONNECTION WITH + * THIS LICENSE OR THE USE OR PERFORMANCE OF THE SOFTWARE, WHETHER SUCH LIABILITY ARISES FROM ANY CLAIM BASED UPON + * BREACH OF CONTRACT, BREACH OF WARRANTY, TORT (INCLUDING NEGLIGENCE), PRODUCT LIABILITY OR ANY OTHER CAUSE OF ACTION + * OR THEORY OF LIABILITY, EVEN IF NVIDIA HAS PREVIOUSLY BEEN ADVISED OF, OR COULD REASONABLY HAVE FORESEEN, THE + * POSSIBILITY OF SUCH DAMAGES. IN NO EVENT WILL NVIDIA’S AND ITS AFFILIATES TOTAL CUMULATIVE LIABILITY UNDER OR ARISING + * OUT OF THIS LICENSE EXCEED US$10.00. THE NATURE OF THE LIABILITY OR THE NUMBER OF CLAIMS OR SUITS SHALL NOT ENLARGE + * OR EXTEND THIS LIMIT. * - * 10. TERMINATION. 
Your rights under this license will terminate automatically without notice from NVIDIA if you fail to comply with any term and condition of this license or if you commence or participate in any legal proceeding against NVIDIA with respect to the SOFTWARE. NVIDIA may terminate this license with advance written notice to you if NVIDIA decides to no longer provide the SOFTWARE in a country or, in NVIDIA’s sole discretion, the continued use of it is no longer commercially viable. Upon any termination of this license, you agree to promptly discontinue use of the SOFTWARE and destroy all copies in your possession or control. Your prior distributions in accordance with this license are not affected by the termination of this license. All provisions of this license will survive termination, except for the license granted to you. + * 10. TERMINATION. Your rights under this license will terminate automatically without notice from NVIDIA if you fail + * to comply with any term and condition of this license or if you commence or participate in any legal proceeding + * against NVIDIA with respect to the SOFTWARE. NVIDIA may terminate this license with advance written notice to you if + * NVIDIA decides to no longer provide the SOFTWARE in a country or, in NVIDIA’s sole discretion, the continued use of + * it is no longer commercially viable. Upon any termination of this license, you agree to promptly discontinue use of + * the SOFTWARE and destroy all copies in your possession or control. Your prior distributions in accordance with this + * license are not affected by the termination of this license. All provisions of this license will survive termination, + * except for the license granted to you. * - * 11. APPLICABLE LAW. This license will be governed in all respects by the laws of the United States and of the State of Delaware as those laws are applied to contracts entered into and performed entirely within Delaware by Delaware residents, without regard to the conflicts of laws principles. The United Nations Convention on Contracts for the International Sale of Goods is specifically disclaimed. You agree to all terms of this Agreement in the English language. The state or federal courts residing in Santa Clara County, California shall have exclusive jurisdiction over any dispute or claim arising out of this license. Notwithstanding this, you agree that NVIDIA shall still be allowed to apply for injunctive remedies or an equivalent type of urgent legal relief in any jurisdiction. + * 11. APPLICABLE LAW. This license will be governed in all respects by the laws of the United States and of the State + * of Delaware as those laws are applied to contracts entered into and performed entirely within Delaware by Delaware + * residents, without regard to the conflicts of laws principles. The United Nations Convention on Contracts for the + * International Sale of Goods is specifically disclaimed. You agree to all terms of this Agreement in the English + * language. The state or federal courts residing in Santa Clara County, California shall have exclusive jurisdiction + * over any dispute or claim arising out of this license. Notwithstanding this, you agree that NVIDIA shall still be + * allowed to apply for injunctive remedies or an equivalent type of urgent legal relief in any jurisdiction. * - * 12. NO ASSIGNMENT. This license and your rights and obligations thereunder may not be assigned by you by any means or operation of law without NVIDIA’s permission. 
Any attempted assignment not approved by NVIDIA in writing shall be void and of no effect. + * 12. NO ASSIGNMENT. This license and your rights and obligations thereunder may not be assigned by you by any means or + * operation of law without NVIDIA’s permission. Any attempted assignment not approved by NVIDIA in writing shall be + * void and of no effect. * - * 13. EXPORT. The SOFTWARE is subject to United States export laws and regulations. You agree that you will not ship, transfer or export the SOFTWARE into any country, or use the SOFTWARE in any manner, prohibited by the United States Bureau of Industry and Security or economic sanctions regulations administered by the U.S. Department of Treasury’s Office of Foreign Assets Control (OFAC), or any applicable export laws, restrictions or regulations. These laws include restrictions on destinations, end users and end use. By accepting this license, you confirm that you are not a resident or citizen of any country currently embargoed by the U.S. and that you are not otherwise prohibited from receiving the SOFTWARE. + * 13. EXPORT. The SOFTWARE is subject to United States export laws and regulations. You agree that you will not ship, + * transfer or export the SOFTWARE into any country, or use the SOFTWARE in any manner, prohibited by the United States + * Bureau of Industry and Security or economic sanctions regulations administered by the U.S. Department of Treasury’s + * Office of Foreign Assets Control (OFAC), or any applicable export laws, restrictions or regulations. These laws + * include restrictions on destinations, end users and end use. By accepting this license, you confirm that you are not + * a resident or citizen of any country currently embargoed by the U.S. and that you are not otherwise prohibited from + * receiving the SOFTWARE. * - * 14. GOVERNMENT USE. The SOFTWARE has been developed entirely at private expense and is “commercial items” consisting of “commercial computer software” and “commercial computer software documentation” provided with RESTRICTED RIGHTS. Use, duplication or disclosure by the U.S. Government or a U.S. Government subcontractor is subject to the restrictions in this license pursuant to DFARS 227.7202-3(a) or as set forth in subparagraphs (b)(1) and (2) of the Commercial Computer Software - Restricted Rights clause at FAR 52.227-19, as applicable. Contractor/manufacturer is NVIDIA, 2788 San Tomas Expressway, Santa Clara, CA 95051. + * 14. GOVERNMENT USE. The SOFTWARE has been developed entirely at private expense and is “commercial items” consisting + * of “commercial computer software” and “commercial computer software documentation” provided with RESTRICTED RIGHTS. + * Use, duplication or disclosure by the U.S. Government or a U.S. Government subcontractor is subject to the + * restrictions in this license pursuant to DFARS 227.7202-3(a) or as set forth in subparagraphs (b)(1) and (2) of the + * Commercial Computer Software - Restricted Rights clause at FAR 52.227-19, as applicable. Contractor/manufacturer is + * NVIDIA, 2788 San Tomas Expressway, Santa Clara, CA 95051. * - * 15. ENTIRE AGREEMENT. This license is the final, complete and exclusive agreement between the parties relating to the subject matter of this license and supersedes all prior or contemporaneous understandings and agreements relating to this subject matter, whether oral or written. 
If any court of competent jurisdiction determines that any provision of this license is illegal, invalid or unenforceable, the remaining provisions will remain in full force and effect. This license may only be modified in a writing signed by an authorized representative of each party. + * 15. ENTIRE AGREEMENT. This license is the final, complete and exclusive agreement between the parties relating to the + * subject matter of this license and supersedes all prior or contemporaneous understandings and agreements relating to + * this subject matter, whether oral or written. If any court of competent jurisdiction determines that any provision of + * this license is illegal, invalid or unenforceable, the remaining provisions will remain in full force and effect. + * This license may only be modified in a writing signed by an authorized representative of each party. * * (v. August 20, 2021) */ @@ -63,532 +141,563 @@ # pragma system_header #endif // no system header -#include #include +#include #include _LIBCUDACXX_BEGIN_NAMESPACE_CUDA - // Forward declaration in barrier of pipeline - enum class pipeline_role { - producer, - consumer - }; - - template - struct __pipeline_stage { - barrier<_Scope> __produced; - barrier<_Scope> __consumed; - }; - - template - class pipeline_shared_state { - public: - pipeline_shared_state() = default; - pipeline_shared_state(const pipeline_shared_state &) = delete; - pipeline_shared_state(pipeline_shared_state &&) = delete; - pipeline_shared_state & operator=(pipeline_shared_state &&) = delete; - pipeline_shared_state & operator=(const pipeline_shared_state &) = delete; - - private: - __pipeline_stage<_Scope> __stages[_Stages_count]; - atomic __refcount; - - template - friend class pipeline; - - template - friend _LIBCUDACXX_INLINE_VISIBILITY - pipeline<_Pipeline_scope> make_pipeline(const _Group & __group, pipeline_shared_state<_Pipeline_scope, _Pipeline_stages_count> * __shared_state); - - template - friend _LIBCUDACXX_INLINE_VISIBILITY - pipeline<_Pipeline_scope> make_pipeline(const _Group & __group, pipeline_shared_state<_Pipeline_scope, _Pipeline_stages_count> * __shared_state, size_t __producer_count); - - template - friend _LIBCUDACXX_INLINE_VISIBILITY - pipeline<_Pipeline_scope> make_pipeline(const _Group & __group, pipeline_shared_state<_Pipeline_scope, _Pipeline_stages_count> * __shared_state, pipeline_role __role); - }; - - struct __pipeline_asm_helper { - _CCCL_DEVICE - static inline uint32_t __lane_id() - { - NV_IF_ELSE_TARGET( - NV_IS_DEVICE, - ( - uint32_t __lane_id; - asm volatile ("mov.u32 %0, %%laneid;" : "=r"(__lane_id)); - return __lane_id; - ), - ( - return 0; - ) - ) - } - }; - - template - class pipeline { - public: - pipeline(pipeline &&) = default; - pipeline(const pipeline &) = delete; - pipeline & operator=(pipeline &&) = delete; - pipeline & operator=(const pipeline &) = delete; - - _LIBCUDACXX_INLINE_VISIBILITY - ~pipeline() - { - if (__active) { - (void)quit(); - } - } - - _LIBCUDACXX_INLINE_VISIBILITY - bool quit() - { - bool __elected; - uint32_t __sub_count; -NV_IF_TARGET(NV_IS_DEVICE, - const uint32_t __match_mask = __match_any_sync(__activemask(), reinterpret_cast(__shared_state_get_refcount())); - const uint32_t __elected_id = __ffs(__match_mask) - 1; - __elected = (__pipeline_asm_helper::__lane_id() == __elected_id); - __sub_count = __popc(__match_mask); -, - __elected = true; - __sub_count = 1; -) - bool __released = false; - if (__elected) { - const uint32_t __old = __shared_state_get_refcount()->fetch_sub(__sub_count); - 
const bool __last = (__old == __sub_count); - if (__last) { - for (uint8_t __stage = 0; __stage < __stages_count; ++__stage) { - __shared_state_get_stage(__stage)->__produced.~barrier(); - __shared_state_get_stage(__stage)->__consumed.~barrier(); - } - __released = true; - } - } - __active = false; - return __released; - } - - _LIBCUDACXX_INLINE_VISIBILITY - void producer_acquire() - { - barrier<_Scope> & __stage_barrier = __shared_state_get_stage(__head)->__consumed; - __stage_barrier.wait_parity(__consumed_phase_parity); - } - - _LIBCUDACXX_INLINE_VISIBILITY - void producer_commit() - { - barrier<_Scope> & __stage_barrier = __shared_state_get_stage(__head)->__produced; - (void)__memcpy_completion_impl::__defer(__completion_mechanism::__async_group, __single_thread_group{}, 0, __stage_barrier); - (void)__stage_barrier.arrive(); - if (++__head == __stages_count) { - __head = 0; - __consumed_phase_parity = !__consumed_phase_parity; - } - } - - _LIBCUDACXX_INLINE_VISIBILITY - void consumer_wait() - { - barrier<_Scope> & __stage_barrier = __shared_state_get_stage(__tail)->__produced; - __stage_barrier.wait_parity(__produced_phase_parity); - } - - _LIBCUDACXX_INLINE_VISIBILITY - void consumer_release() - { - (void)__shared_state_get_stage(__tail)->__consumed.arrive(); - if (++__tail == __stages_count) { - __tail = 0; - __produced_phase_parity = !__produced_phase_parity; - } - } - - template - _LIBCUDACXX_INLINE_VISIBILITY - bool consumer_wait_for(const _CUDA_VSTD::chrono::duration<_Rep, _Period> & __duration) - { - barrier<_Scope> & __stage_barrier = __shared_state_get_stage(__tail)->__produced; - return _CUDA_VSTD::__libcpp_thread_poll_with_backoff( - _CUDA_VSTD::__barrier_poll_tester_parity>( - &__stage_barrier, - __produced_phase_parity), - _CUDA_VSTD::chrono::duration_cast<_CUDA_VSTD::chrono::nanoseconds>(__duration) - ); - } - - template - _LIBCUDACXX_INLINE_VISIBILITY - bool consumer_wait_until(const _CUDA_VSTD::chrono::time_point<_Clock, _Duration> & __time_point) - { - return consumer_wait_for(__time_point - _Clock::now()); - } - - private: - uint8_t __head : 8; - uint8_t __tail : 8; - const uint8_t __stages_count : 8; - bool __consumed_phase_parity : 1; - bool __produced_phase_parity : 1; - bool __active : 1; - // TODO: Remove partitioned on next ABI break - const bool __partitioned : 1; - char * const __shared_state; - - - _LIBCUDACXX_INLINE_VISIBILITY - pipeline(char * __shared_state, uint8_t __stages_count, bool __partitioned) - : __head(0) - , __tail(0) - , __stages_count(__stages_count) - , __consumed_phase_parity(true) - , __produced_phase_parity(false) - , __active(true) - , __partitioned(__partitioned) - , __shared_state(__shared_state) - {} - - _LIBCUDACXX_INLINE_VISIBILITY - __pipeline_stage<_Scope> * __shared_state_get_stage(uint8_t __stage) - { - ptrdiff_t __stage_offset = __stage * sizeof(__pipeline_stage<_Scope>); - return reinterpret_cast<__pipeline_stage<_Scope>*>(__shared_state + __stage_offset); - } - - _LIBCUDACXX_INLINE_VISIBILITY - atomic * __shared_state_get_refcount() +// Forward declaration in barrier of pipeline +enum class pipeline_role +{ + producer, + consumer +}; + +template +struct __pipeline_stage +{ + barrier<_Scope> __produced; + barrier<_Scope> __consumed; +}; + +template +class pipeline_shared_state +{ +public: + pipeline_shared_state() = default; + pipeline_shared_state(const pipeline_shared_state&) = delete; + pipeline_shared_state(pipeline_shared_state&&) = delete; + pipeline_shared_state& operator=(pipeline_shared_state&&) = delete; + 
pipeline_shared_state& operator=(const pipeline_shared_state&) = delete; + +private: + __pipeline_stage<_Scope> __stages[_Stages_count]; + atomic __refcount; + + template + friend class pipeline; + + template + friend _LIBCUDACXX_INLINE_VISIBILITY pipeline<_Pipeline_scope> + make_pipeline(const _Group& __group, pipeline_shared_state<_Pipeline_scope, _Pipeline_stages_count>* __shared_state); + + template + friend _LIBCUDACXX_INLINE_VISIBILITY pipeline<_Pipeline_scope> + make_pipeline(const _Group& __group, + pipeline_shared_state<_Pipeline_scope, _Pipeline_stages_count>* __shared_state, + size_t __producer_count); + + template + friend _LIBCUDACXX_INLINE_VISIBILITY pipeline<_Pipeline_scope> + make_pipeline(const _Group& __group, + pipeline_shared_state<_Pipeline_scope, _Pipeline_stages_count>* __shared_state, + pipeline_role __role); +}; + +struct __pipeline_asm_helper +{ + _CCCL_DEVICE static inline uint32_t __lane_id() + { + NV_IF_ELSE_TARGET( + NV_IS_DEVICE, + (uint32_t __lane_id; asm volatile("mov.u32 %0, %%laneid;" + : "=r"(__lane_id)); + return __lane_id;), + (return 0;)) + } +}; + +template +class pipeline +{ +public: + pipeline(pipeline&&) = default; + pipeline(const pipeline&) = delete; + pipeline& operator=(pipeline&&) = delete; + pipeline& operator=(const pipeline&) = delete; + + _LIBCUDACXX_INLINE_VISIBILITY ~pipeline() + { + if (__active) + { + (void) quit(); + } + } + + _LIBCUDACXX_INLINE_VISIBILITY bool quit() + { + bool __elected; + uint32_t __sub_count; + NV_IF_TARGET( + NV_IS_DEVICE, + const uint32_t __match_mask = + __match_any_sync(__activemask(), reinterpret_cast(__shared_state_get_refcount())); + const uint32_t __elected_id = __ffs(__match_mask) - 1; + __elected = (__pipeline_asm_helper::__lane_id() == __elected_id); + __sub_count = __popc(__match_mask); + , __elected = true; + __sub_count = 1;) + bool __released = false; + if (__elected) + { + const uint32_t __old = __shared_state_get_refcount()->fetch_sub(__sub_count); + const bool __last = (__old == __sub_count); + if (__last) + { + for (uint8_t __stage = 0; __stage < __stages_count; ++__stage) { - ptrdiff_t __refcount_offset = __stages_count * sizeof(__pipeline_stage<_Scope>); - return reinterpret_cast*>(__shared_state + __refcount_offset); + __shared_state_get_stage(__stage)->__produced.~barrier(); + __shared_state_get_stage(__stage)->__consumed.~barrier(); } - - template - friend _LIBCUDACXX_INLINE_VISIBILITY - pipeline<_Pipeline_scope> make_pipeline(const _Group & __group, pipeline_shared_state<_Pipeline_scope, _Pipeline_stages_count> * __shared_state); - - template - friend _LIBCUDACXX_INLINE_VISIBILITY - pipeline<_Pipeline_scope> make_pipeline(const _Group & __group, pipeline_shared_state<_Pipeline_scope, _Pipeline_stages_count> * __shared_state, size_t __producer_count); - - template - friend _LIBCUDACXX_INLINE_VISIBILITY - pipeline<_Pipeline_scope> make_pipeline(const _Group & __group, pipeline_shared_state<_Pipeline_scope, _Pipeline_stages_count> * __shared_state, pipeline_role __role); - }; - - template - _LIBCUDACXX_INLINE_VISIBILITY - pipeline<_Scope> make_pipeline(const _Group & __group, pipeline_shared_state<_Scope, _Stages_count> * __shared_state) + __released = true; + } + } + __active = false; + return __released; + } + + _LIBCUDACXX_INLINE_VISIBILITY void producer_acquire() + { + barrier<_Scope>& __stage_barrier = __shared_state_get_stage(__head)->__consumed; + __stage_barrier.wait_parity(__consumed_phase_parity); + } + + _LIBCUDACXX_INLINE_VISIBILITY void producer_commit() + { + barrier<_Scope>& 
__stage_barrier = __shared_state_get_stage(__head)->__produced; + (void) __memcpy_completion_impl::__defer( + __completion_mechanism::__async_group, __single_thread_group{}, 0, __stage_barrier); + (void) __stage_barrier.arrive(); + if (++__head == __stages_count) { - const uint32_t __group_size = static_cast(__group.size()); - const uint32_t __thread_rank = static_cast(__group.thread_rank()); - - if (__thread_rank == 0) { - for (uint8_t __stage = 0; __stage < _Stages_count; ++__stage) { - init(&__shared_state->__stages[__stage].__consumed, __group_size); - init(&__shared_state->__stages[__stage].__produced, __group_size); - } - __shared_state->__refcount.store(__group_size, std::memory_order_relaxed); - } - __group.sync(); - - return pipeline<_Scope>(reinterpret_cast(__shared_state->__stages), _Stages_count, false); + __head = 0; + __consumed_phase_parity = !__consumed_phase_parity; } - - template - _LIBCUDACXX_INLINE_VISIBILITY - pipeline<_Scope> make_pipeline(const _Group & __group, pipeline_shared_state<_Scope, _Stages_count> * __shared_state, size_t __producer_count) + } + + _LIBCUDACXX_INLINE_VISIBILITY void consumer_wait() + { + barrier<_Scope>& __stage_barrier = __shared_state_get_stage(__tail)->__produced; + __stage_barrier.wait_parity(__produced_phase_parity); + } + + _LIBCUDACXX_INLINE_VISIBILITY void consumer_release() + { + (void) __shared_state_get_stage(__tail)->__consumed.arrive(); + if (++__tail == __stages_count) { - const uint32_t __group_size = static_cast(__group.size()); - const uint32_t __thread_rank = static_cast(__group.thread_rank()); - - if (__thread_rank == 0) { - const size_t __consumer_count = __group_size - __producer_count; - for (uint8_t __stage = 0; __stage < _Stages_count; ++__stage) { - init(&__shared_state->__stages[__stage].__consumed, __consumer_count); - init(&__shared_state->__stages[__stage].__produced, __producer_count); - } - __shared_state->__refcount.store(__group_size, std::memory_order_relaxed); - } - __group.sync(); - - return pipeline<_Scope>(reinterpret_cast(__shared_state->__stages), _Stages_count, true); + __tail = 0; + __produced_phase_parity = !__produced_phase_parity; } - - template - _LIBCUDACXX_INLINE_VISIBILITY - pipeline<_Scope> make_pipeline(const _Group & __group, pipeline_shared_state<_Scope, _Stages_count> * __shared_state, pipeline_role __role) + } + + template + _LIBCUDACXX_INLINE_VISIBILITY bool consumer_wait_for(const _CUDA_VSTD::chrono::duration<_Rep, _Period>& __duration) + { + barrier<_Scope>& __stage_barrier = __shared_state_get_stage(__tail)->__produced; + return _CUDA_VSTD::__libcpp_thread_poll_with_backoff( + _CUDA_VSTD::__barrier_poll_tester_parity>(&__stage_barrier, __produced_phase_parity), + _CUDA_VSTD::chrono::duration_cast<_CUDA_VSTD::chrono::nanoseconds>(__duration)); + } + + template + _LIBCUDACXX_INLINE_VISIBILITY bool + consumer_wait_until(const _CUDA_VSTD::chrono::time_point<_Clock, _Duration>& __time_point) + { + return consumer_wait_for(__time_point - _Clock::now()); + } + +private: + uint8_t __head : 8; + uint8_t __tail : 8; + const uint8_t __stages_count : 8; + bool __consumed_phase_parity : 1; + bool __produced_phase_parity : 1; + bool __active : 1; + // TODO: Remove partitioned on next ABI break + const bool __partitioned : 1; + char* const __shared_state; + + _LIBCUDACXX_INLINE_VISIBILITY pipeline(char* __shared_state, uint8_t __stages_count, bool __partitioned) + : __head(0) + , __tail(0) + , __stages_count(__stages_count) + , __consumed_phase_parity(true) + , __produced_phase_parity(false) + , 
__active(true) + , __partitioned(__partitioned) + , __shared_state(__shared_state) + {} + + _LIBCUDACXX_INLINE_VISIBILITY __pipeline_stage<_Scope>* __shared_state_get_stage(uint8_t __stage) + { + ptrdiff_t __stage_offset = __stage * sizeof(__pipeline_stage<_Scope>); + return reinterpret_cast<__pipeline_stage<_Scope>*>(__shared_state + __stage_offset); + } + + _LIBCUDACXX_INLINE_VISIBILITY atomic* __shared_state_get_refcount() + { + ptrdiff_t __refcount_offset = __stages_count * sizeof(__pipeline_stage<_Scope>); + return reinterpret_cast*>(__shared_state + __refcount_offset); + } + + template + friend _LIBCUDACXX_INLINE_VISIBILITY pipeline<_Pipeline_scope> + make_pipeline(const _Group& __group, pipeline_shared_state<_Pipeline_scope, _Pipeline_stages_count>* __shared_state); + + template + friend _LIBCUDACXX_INLINE_VISIBILITY pipeline<_Pipeline_scope> + make_pipeline(const _Group& __group, + pipeline_shared_state<_Pipeline_scope, _Pipeline_stages_count>* __shared_state, + size_t __producer_count); + + template + friend _LIBCUDACXX_INLINE_VISIBILITY pipeline<_Pipeline_scope> + make_pipeline(const _Group& __group, + pipeline_shared_state<_Pipeline_scope, _Pipeline_stages_count>* __shared_state, + pipeline_role __role); +}; + +template +_LIBCUDACXX_INLINE_VISIBILITY pipeline<_Scope> +make_pipeline(const _Group& __group, pipeline_shared_state<_Scope, _Stages_count>* __shared_state) +{ + const uint32_t __group_size = static_cast(__group.size()); + const uint32_t __thread_rank = static_cast(__group.thread_rank()); + + if (__thread_rank == 0) + { + for (uint8_t __stage = 0; __stage < _Stages_count; ++__stage) { - const uint32_t __group_size = static_cast(__group.size()); - const uint32_t __thread_rank = static_cast(__group.thread_rank()); - - if (__thread_rank == 0) { - __shared_state->__refcount.store(0, std::memory_order_relaxed); - } - __group.sync(); - - if (__role == pipeline_role::producer) { - bool __elected; - uint32_t __add_count; -NV_IF_TARGET(NV_IS_DEVICE, - const uint32_t __match_mask = __match_any_sync(__activemask(), reinterpret_cast(&__shared_state->__refcount)); - const uint32_t __elected_id = __ffs(__match_mask) - 1; - __elected = (__pipeline_asm_helper::__lane_id() == __elected_id); - __add_count = __popc(__match_mask); -, - __elected = true; - __add_count = 1; -) - if (__elected) { - (void)__shared_state->__refcount.fetch_add(__add_count, std::memory_order_relaxed); - } - } - __group.sync(); - - if (__thread_rank == 0) { - const uint32_t __producer_count = __shared_state->__refcount.load(std::memory_order_relaxed); - const uint32_t __consumer_count = __group_size - __producer_count; - for (uint8_t __stage = 0; __stage < _Stages_count; ++__stage) { - init(&__shared_state->__stages[__stage].__consumed, __consumer_count); - init(&__shared_state->__stages[__stage].__produced, __producer_count); - } - __shared_state->__refcount.store(__group_size, std::memory_order_relaxed); - } - __group.sync(); - - return pipeline<_Scope>(reinterpret_cast(__shared_state->__stages), _Stages_count, true); + init(&__shared_state->__stages[__stage].__consumed, __group_size); + init(&__shared_state->__stages[__stage].__produced, __group_size); + } + __shared_state->__refcount.store(__group_size, std::memory_order_relaxed); + } + __group.sync(); + + return pipeline<_Scope>(reinterpret_cast(__shared_state->__stages), _Stages_count, false); +} + +template +_LIBCUDACXX_INLINE_VISIBILITY pipeline<_Scope> make_pipeline( + const _Group& __group, pipeline_shared_state<_Scope, _Stages_count>* __shared_state, 
size_t __producer_count) +{ + const uint32_t __group_size = static_cast(__group.size()); + const uint32_t __thread_rank = static_cast(__group.thread_rank()); + + if (__thread_rank == 0) + { + const size_t __consumer_count = __group_size - __producer_count; + for (uint8_t __stage = 0; __stage < _Stages_count; ++__stage) + { + init(&__shared_state->__stages[__stage].__consumed, __consumer_count); + init(&__shared_state->__stages[__stage].__produced, __producer_count); + } + __shared_state->__refcount.store(__group_size, std::memory_order_relaxed); + } + __group.sync(); + + return pipeline<_Scope>(reinterpret_cast(__shared_state->__stages), _Stages_count, true); +} + +template +_LIBCUDACXX_INLINE_VISIBILITY pipeline<_Scope> +make_pipeline(const _Group& __group, pipeline_shared_state<_Scope, _Stages_count>* __shared_state, pipeline_role __role) +{ + const uint32_t __group_size = static_cast(__group.size()); + const uint32_t __thread_rank = static_cast(__group.thread_rank()); + + if (__thread_rank == 0) + { + __shared_state->__refcount.store(0, std::memory_order_relaxed); + } + __group.sync(); + + if (__role == pipeline_role::producer) + { + bool __elected; + uint32_t __add_count; + NV_IF_TARGET( + NV_IS_DEVICE, + const uint32_t __match_mask = + __match_any_sync(__activemask(), reinterpret_cast(&__shared_state->__refcount)); + const uint32_t __elected_id = __ffs(__match_mask) - 1; + __elected = (__pipeline_asm_helper::__lane_id() == __elected_id); + __add_count = __popc(__match_mask); + , __elected = true; + __add_count = 1;) + if (__elected) + { + (void) __shared_state->__refcount.fetch_add(__add_count, std::memory_order_relaxed); } + } + __group.sync(); + + if (__thread_rank == 0) + { + const uint32_t __producer_count = __shared_state->__refcount.load(std::memory_order_relaxed); + const uint32_t __consumer_count = __group_size - __producer_count; + for (uint8_t __stage = 0; __stage < _Stages_count; ++__stage) + { + init(&__shared_state->__stages[__stage].__consumed, __consumer_count); + init(&__shared_state->__stages[__stage].__produced, __producer_count); + } + __shared_state->__refcount.store(__group_size, std::memory_order_relaxed); + } + __group.sync(); + + return pipeline<_Scope>(reinterpret_cast(__shared_state->__stages), _Stages_count, true); +} _LIBCUDACXX_END_NAMESPACE_CUDA _LIBCUDACXX_BEGIN_NAMESPACE_CUDA_DEVICE - template - _CCCL_DEVICE - void __pipeline_consumer_wait(pipeline & __pipeline); +template +_CCCL_DEVICE void __pipeline_consumer_wait(pipeline& __pipeline); - _CCCL_DEVICE - inline void __pipeline_consumer_wait(pipeline & __pipeline, uint8_t __prior); +_CCCL_DEVICE inline void __pipeline_consumer_wait(pipeline& __pipeline, uint8_t __prior); _LIBCUDACXX_END_NAMESPACE_CUDA_DEVICE _LIBCUDACXX_BEGIN_NAMESPACE_CUDA - template<> - class pipeline { - public: - pipeline(pipeline &&) = default; - pipeline(const pipeline &) = delete; - pipeline & operator=(pipeline &&) = delete; - pipeline & operator=(const pipeline &) = delete; - - _LIBCUDACXX_INLINE_VISIBILITY - ~pipeline() {} - - _LIBCUDACXX_INLINE_VISIBILITY - bool quit() - { - return true; - } - - _LIBCUDACXX_INLINE_VISIBILITY - void producer_acquire() {} - - _LIBCUDACXX_INLINE_VISIBILITY - void producer_commit() - { -NV_IF_TARGET(NV_PROVIDES_SM_80, - asm volatile ("cp.async.commit_group;"); - ++__head; -) - } - - _LIBCUDACXX_INLINE_VISIBILITY - void consumer_wait() - { -NV_IF_TARGET(NV_PROVIDES_SM_80, - if (__head == __tail) { - return; - } - - const uint8_t __prior = __head - __tail - 1; - 
device::__pipeline_consumer_wait(*this, __prior); - ++__tail; -) - } - - _LIBCUDACXX_INLINE_VISIBILITY - void consumer_release() {} - - template - _LIBCUDACXX_INLINE_VISIBILITY - bool consumer_wait_for(const _CUDA_VSTD::chrono::duration<_Rep, _Period> & __duration) - { - (void)__duration; - consumer_wait(); - return true; - } - - template - _LIBCUDACXX_INLINE_VISIBILITY - bool consumer_wait_until(const _CUDA_VSTD::chrono::time_point<_Clock, _Duration> & __time_point) - { - (void)__time_point; - consumer_wait(); - return true; - } - - private: - uint8_t __head; - uint8_t __tail; - - _LIBCUDACXX_INLINE_VISIBILITY - pipeline() - : __head(0) - , __tail(0) - {} - - friend _LIBCUDACXX_INLINE_VISIBILITY inline pipeline make_pipeline(); - - template - friend _LIBCUDACXX_INLINE_VISIBILITY - void pipeline_consumer_wait_prior(pipeline & __pipeline); - - template - friend _LIBCUDACXX_INLINE_VISIBILITY - pipeline<_Pipeline_scope> __make_pipeline(const _Group & __group, pipeline_shared_state<_Pipeline_scope, _Pipeline_stages_count> * __shared_state); - }; +template <> +class pipeline +{ +public: + pipeline(pipeline&&) = default; + pipeline(const pipeline&) = delete; + pipeline& operator=(pipeline&&) = delete; + pipeline& operator=(const pipeline&) = delete; + + _LIBCUDACXX_INLINE_VISIBILITY ~pipeline() {} + + _LIBCUDACXX_INLINE_VISIBILITY bool quit() + { + return true; + } + + _LIBCUDACXX_INLINE_VISIBILITY void producer_acquire() {} + + _LIBCUDACXX_INLINE_VISIBILITY void producer_commit() + { + NV_IF_TARGET(NV_PROVIDES_SM_80, asm volatile("cp.async.commit_group;"); ++__head;) + } + + _LIBCUDACXX_INLINE_VISIBILITY void consumer_wait() + { + NV_IF_TARGET( + NV_PROVIDES_SM_80, + if (__head == __tail) { return; } + + const uint8_t __prior = __head - __tail - 1; + device::__pipeline_consumer_wait(*this, __prior); + ++__tail;) + } + + _LIBCUDACXX_INLINE_VISIBILITY void consumer_release() {} + + template + _LIBCUDACXX_INLINE_VISIBILITY bool consumer_wait_for(const _CUDA_VSTD::chrono::duration<_Rep, _Period>& __duration) + { + (void) __duration; + consumer_wait(); + return true; + } + + template + _LIBCUDACXX_INLINE_VISIBILITY bool + consumer_wait_until(const _CUDA_VSTD::chrono::time_point<_Clock, _Duration>& __time_point) + { + (void) __time_point; + consumer_wait(); + return true; + } + +private: + uint8_t __head; + uint8_t __tail; + + _LIBCUDACXX_INLINE_VISIBILITY pipeline() + : __head(0) + , __tail(0) + {} + + friend _LIBCUDACXX_INLINE_VISIBILITY inline pipeline make_pipeline(); + + template + friend _LIBCUDACXX_INLINE_VISIBILITY void pipeline_consumer_wait_prior(pipeline& __pipeline); + + template + friend _LIBCUDACXX_INLINE_VISIBILITY pipeline<_Pipeline_scope> __make_pipeline( + const _Group& __group, pipeline_shared_state<_Pipeline_scope, _Pipeline_stages_count>* __shared_state); +}; _LIBCUDACXX_END_NAMESPACE_CUDA _LIBCUDACXX_BEGIN_NAMESPACE_CUDA_DEVICE - template - _CCCL_DEVICE - void __pipeline_consumer_wait(pipeline & __pipeline) - { - (void)__pipeline; -NV_IF_TARGET(NV_PROVIDES_SM_80, - constexpr uint8_t __max_prior = 8; - - asm volatile ("cp.async.wait_group %0;" - : - : "n"(_Prior < __max_prior ? 
_Prior : __max_prior)); -) - } - - _CCCL_DEVICE - inline void __pipeline_consumer_wait(pipeline & __pipeline, uint8_t __prior) - { - switch (__prior) { - case 0: device::__pipeline_consumer_wait<0>(__pipeline); break; - case 1: device::__pipeline_consumer_wait<1>(__pipeline); break; - case 2: device::__pipeline_consumer_wait<2>(__pipeline); break; - case 3: device::__pipeline_consumer_wait<3>(__pipeline); break; - case 4: device::__pipeline_consumer_wait<4>(__pipeline); break; - case 5: device::__pipeline_consumer_wait<5>(__pipeline); break; - case 6: device::__pipeline_consumer_wait<6>(__pipeline); break; - case 7: device::__pipeline_consumer_wait<7>(__pipeline); break; - default: device::__pipeline_consumer_wait<8>(__pipeline); break; - } - } +template +_CCCL_DEVICE void __pipeline_consumer_wait(pipeline& __pipeline) +{ + (void) __pipeline; + NV_IF_TARGET(NV_PROVIDES_SM_80, constexpr uint8_t __max_prior = 8; + + asm volatile("cp.async.wait_group %0;" + : + : "n"(_Prior < __max_prior ? _Prior : __max_prior));) +} + +_CCCL_DEVICE inline void __pipeline_consumer_wait(pipeline& __pipeline, uint8_t __prior) +{ + switch (__prior) + { + case 0: + device::__pipeline_consumer_wait<0>(__pipeline); + break; + case 1: + device::__pipeline_consumer_wait<1>(__pipeline); + break; + case 2: + device::__pipeline_consumer_wait<2>(__pipeline); + break; + case 3: + device::__pipeline_consumer_wait<3>(__pipeline); + break; + case 4: + device::__pipeline_consumer_wait<4>(__pipeline); + break; + case 5: + device::__pipeline_consumer_wait<5>(__pipeline); + break; + case 6: + device::__pipeline_consumer_wait<6>(__pipeline); + break; + case 7: + device::__pipeline_consumer_wait<7>(__pipeline); + break; + default: + device::__pipeline_consumer_wait<8>(__pipeline); + break; + } +} _LIBCUDACXX_END_NAMESPACE_CUDA_DEVICE _LIBCUDACXX_BEGIN_NAMESPACE_CUDA - _LIBCUDACXX_INLINE_VISIBILITY - inline pipeline make_pipeline() - { - return pipeline(); - } - - template - _LIBCUDACXX_INLINE_VISIBILITY - void pipeline_consumer_wait_prior(pipeline & __pipeline) - { - NV_IF_TARGET(NV_PROVIDES_SM_80, - device::__pipeline_consumer_wait<_Prior>(__pipeline); - __pipeline.__tail = __pipeline.__head - _Prior; - ) - } - - template - _LIBCUDACXX_INLINE_VISIBILITY - void pipeline_producer_commit(pipeline & __pipeline, barrier<_Scope> & __barrier) - { - (void)__pipeline; - NV_IF_TARGET(NV_PROVIDES_SM_80,( - (void)__memcpy_completion_impl::__defer(__completion_mechanism::__async_group, __single_thread_group{}, 0, __barrier); - )); - } - - template - _LIBCUDACXX_INLINE_VISIBILITY - async_contract_fulfillment __memcpy_async_pipeline(_Group const & __group, _Tp * __destination, _Tp const * __source, _Size __size, pipeline<_Scope> & __pipeline) { - // 1. Set the completion mechanisms that can be used. - // - // Do not (yet) allow async_bulk_group completion. Do not allow - // mbarrier_complete_tx completion, even though it may be possible if - // the pipeline has stage barriers in shared memory. - _CUDA_VSTD::uint32_t __allowed_completions = _CUDA_VSTD::uint32_t(__completion_mechanism::__async_group); - - // Alignment: Use the maximum of the alignment of _Tp and that of a possible cuda::aligned_size_t. - constexpr _CUDA_VSTD::size_t __size_align = __get_size_align<_Size>::align; - constexpr _CUDA_VSTD::size_t __align = (alignof(_Tp) < __size_align) ? __size_align : alignof(_Tp); - // Cast to char pointers. We don't need the type for alignment anymore and - // erasing the types reduces the number of instantiations of down-stream - // functions. 
- char * __dest_char = reinterpret_cast(__destination); - char const * __src_char = reinterpret_cast(__source); - - // 2. Issue actual copy instructions. - auto __cm = __dispatch_memcpy_async<__align>(__group, __dest_char, __src_char, __size, __allowed_completions); - - // 3. No need to synchronize with copy instructions. - return __memcpy_completion_impl::__defer(__cm, __group, __size, __pipeline); - } - - template - _LIBCUDACXX_INLINE_VISIBILITY - async_contract_fulfillment memcpy_async(_Group const & __group, _Type * __destination, _Type const * __source, std::size_t __size, pipeline<_Scope> & __pipeline) { - return __memcpy_async_pipeline(__group, __destination, __source, __size, __pipeline); - } - - template _Alignment) ? alignof(_Type) : _Alignment> - _LIBCUDACXX_INLINE_VISIBILITY - async_contract_fulfillment memcpy_async(_Group const & __group, _Type * __destination, _Type const * __source, aligned_size_t<_Alignment> __size, pipeline<_Scope> & __pipeline) { - return __memcpy_async_pipeline(__group, __destination, __source, __size, __pipeline); - } - - template - _LIBCUDACXX_INLINE_VISIBILITY - async_contract_fulfillment memcpy_async(_Type * __destination, _Type const * __source, _Size __size, pipeline<_Scope> & __pipeline) { - return __memcpy_async_pipeline(__single_thread_group{}, __destination, __source, __size, __pipeline); - } - - template - _LIBCUDACXX_INLINE_VISIBILITY - async_contract_fulfillment memcpy_async(_Group const & __group, void * __destination, void const * __source, std::size_t __size, pipeline<_Scope> & __pipeline) { - return __memcpy_async_pipeline(__group, reinterpret_cast(__destination), reinterpret_cast(__source), __size, __pipeline); - } - - template - _LIBCUDACXX_INLINE_VISIBILITY - async_contract_fulfillment memcpy_async(_Group const & __group, void * __destination, void const * __source, aligned_size_t<_Alignment> __size, pipeline<_Scope> & __pipeline) { - return __memcpy_async_pipeline(__group, reinterpret_cast(__destination), reinterpret_cast(__source), __size, __pipeline); - } - - template - _LIBCUDACXX_INLINE_VISIBILITY - async_contract_fulfillment memcpy_async(void * __destination, void const * __source, _Size __size, pipeline<_Scope> & __pipeline) { - return __memcpy_async_pipeline(__single_thread_group{}, reinterpret_cast(__destination), reinterpret_cast(__source), __size, __pipeline); - } +_LIBCUDACXX_INLINE_VISIBILITY inline pipeline make_pipeline() +{ + return pipeline(); +} + +template +_LIBCUDACXX_INLINE_VISIBILITY void pipeline_consumer_wait_prior(pipeline& __pipeline) +{ + NV_IF_TARGET(NV_PROVIDES_SM_80, device::__pipeline_consumer_wait<_Prior>(__pipeline); + __pipeline.__tail = __pipeline.__head - _Prior;) +} + +template +_LIBCUDACXX_INLINE_VISIBILITY void +pipeline_producer_commit(pipeline& __pipeline, barrier<_Scope>& __barrier) +{ + (void) __pipeline; + NV_IF_TARGET(NV_PROVIDES_SM_80, + ((void) __memcpy_completion_impl::__defer( + __completion_mechanism::__async_group, __single_thread_group{}, 0, __barrier);)); +} + +template +_LIBCUDACXX_INLINE_VISIBILITY async_contract_fulfillment __memcpy_async_pipeline( + _Group const& __group, _Tp* __destination, _Tp const* __source, _Size __size, pipeline<_Scope>& __pipeline) +{ + // 1. Set the completion mechanisms that can be used. + // + // Do not (yet) allow async_bulk_group completion. Do not allow + // mbarrier_complete_tx completion, even though it may be possible if + // the pipeline has stage barriers in shared memory. 
+ _CUDA_VSTD::uint32_t __allowed_completions = _CUDA_VSTD::uint32_t(__completion_mechanism::__async_group); + + // Alignment: Use the maximum of the alignment of _Tp and that of a possible cuda::aligned_size_t. + constexpr _CUDA_VSTD::size_t __size_align = __get_size_align<_Size>::align; + constexpr _CUDA_VSTD::size_t __align = (alignof(_Tp) < __size_align) ? __size_align : alignof(_Tp); + // Cast to char pointers. We don't need the type for alignment anymore and + // erasing the types reduces the number of instantiations of down-stream + // functions. + char* __dest_char = reinterpret_cast(__destination); + char const* __src_char = reinterpret_cast(__source); + + // 2. Issue actual copy instructions. + auto __cm = __dispatch_memcpy_async<__align>(__group, __dest_char, __src_char, __size, __allowed_completions); + + // 3. No need to synchronize with copy instructions. + return __memcpy_completion_impl::__defer(__cm, __group, __size, __pipeline); +} + +template +_LIBCUDACXX_INLINE_VISIBILITY async_contract_fulfillment memcpy_async( + _Group const& __group, _Type* __destination, _Type const* __source, std::size_t __size, pipeline<_Scope>& __pipeline) +{ + return __memcpy_async_pipeline(__group, __destination, __source, __size, __pipeline); +} + +template _Alignment) ? alignof(_Type) : _Alignment> +_LIBCUDACXX_INLINE_VISIBILITY async_contract_fulfillment memcpy_async( + _Group const& __group, + _Type* __destination, + _Type const* __source, + aligned_size_t<_Alignment> __size, + pipeline<_Scope>& __pipeline) +{ + return __memcpy_async_pipeline(__group, __destination, __source, __size, __pipeline); +} + +template +_LIBCUDACXX_INLINE_VISIBILITY async_contract_fulfillment +memcpy_async(_Type* __destination, _Type const* __source, _Size __size, pipeline<_Scope>& __pipeline) +{ + return __memcpy_async_pipeline(__single_thread_group{}, __destination, __source, __size, __pipeline); +} + +template +_LIBCUDACXX_INLINE_VISIBILITY async_contract_fulfillment memcpy_async( + _Group const& __group, void* __destination, void const* __source, std::size_t __size, pipeline<_Scope>& __pipeline) +{ + return __memcpy_async_pipeline( + __group, reinterpret_cast(__destination), reinterpret_cast(__source), __size, __pipeline); +} + +template +_LIBCUDACXX_INLINE_VISIBILITY async_contract_fulfillment memcpy_async( + _Group const& __group, + void* __destination, + void const* __source, + aligned_size_t<_Alignment> __size, + pipeline<_Scope>& __pipeline) +{ + return __memcpy_async_pipeline( + __group, reinterpret_cast(__destination), reinterpret_cast(__source), __size, __pipeline); +} + +template +_LIBCUDACXX_INLINE_VISIBILITY async_contract_fulfillment +memcpy_async(void* __destination, void const* __source, _Size __size, pipeline<_Scope>& __pipeline) +{ + return __memcpy_async_pipeline( + __single_thread_group{}, + reinterpret_cast(__destination), + reinterpret_cast(__source), + __size, + __pipeline); +} _LIBCUDACXX_END_NAMESPACE_CUDA diff --git a/libcudacxx/include/cuda/std/__algorithm_ b/libcudacxx/include/cuda/std/__algorithm_ index 91c4160a8b5..2ec4ef668af 100644 --- a/libcudacxx/include/cuda/std/__algorithm_ +++ b/libcudacxx/include/cuda/std/__algorithm_ @@ -13,10 +13,20 @@ #include +#if defined(_CCCL_IMPLICIT_SYSTEM_HEADER_GCC) +# pragma GCC system_header +#elif defined(_CCCL_IMPLICIT_SYSTEM_HEADER_CLANG) +# pragma clang system_header +#elif defined(_CCCL_IMPLICIT_SYSTEM_HEADER_MSVC) +# pragma system_header +#endif // no system header + +// clang-format off #include #include #include +// clang-format on #endif 
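[Illustrative aside, not part of the diff above: a minimal sketch of how the producer/consumer interface reformatted in <cuda/pipeline> (pipeline_shared_state, make_pipeline, producer_acquire/producer_commit, consumer_wait/consumer_release, memcpy_async with a pipeline) is typically driven from a kernel. The kernel name, the single-stage configuration, and the trivial "double each element" compute step are placeholder assumptions; the launch is assumed to supply batch_size * sizeof(int) bytes of dynamic shared memory.]

#include <cooperative_groups.h>
#include <cuda/pipeline>

__global__ void scale_batches(int* out, const int* in, size_t batch_size, size_t batch_count)
{
  extern __shared__ int staging[]; // sized to batch_size * sizeof(int) at launch (assumption)

  auto block = cooperative_groups::this_thread_block();

  // Unified single-stage pipeline: every thread in the block acts as both
  // producer (issues the async copy) and consumer (waits on it, then computes).
  constexpr size_t stages = 1;
  __shared__ cuda::pipeline_shared_state<cuda::thread_scope_block, stages> state;
  auto pipe = cuda::make_pipeline(block, &state);

  for (size_t batch = 0; batch < batch_count; ++batch)
  {
    pipe.producer_acquire();
    cuda::memcpy_async(block, staging, in + batch * batch_size, sizeof(int) * batch_size, pipe);
    pipe.producer_commit();

    pipe.consumer_wait(); // the async copy into `staging` has completed past this point
    for (size_t i = block.thread_rank(); i < batch_size; i += block.size())
    {
      out[batch * batch_size + i] = 2 * staging[i];
    }
    pipe.consumer_release(); // allow `staging` to be overwritten by the next producer_acquire
  }
}

[With stages > 1 the same loop overlaps the copy for batch N+1 with the compute on batch N; the partitioned make_pipeline overloads shown above (producer_count / pipeline_role) split the block into dedicated producer and consumer threads instead.]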
// _CUDA_STD_ALGORITHM diff --git a/libcudacxx/include/cuda/std/__exception_ b/libcudacxx/include/cuda/std/__exception_ index e5aedc1d49d..a54c7a1f77a 100644 --- a/libcudacxx/include/cuda/std/__exception_ +++ b/libcudacxx/include/cuda/std/__exception_ @@ -8,15 +8,25 @@ // //===----------------------------------------------------------------------===// -#ifndef _CUDA_STD_NEW -#define _CUDA_STD_NEW +#ifndef _CUDA_STD_EXCEPTION +#define _CUDA_STD_EXCEPTION -#include "detail/__config" +#include -#include "detail/__pragma_push" +#if defined(_CCCL_IMPLICIT_SYSTEM_HEADER_GCC) +# pragma GCC system_header +#elif defined(_CCCL_IMPLICIT_SYSTEM_HEADER_CLANG) +# pragma clang system_header +#elif defined(_CCCL_IMPLICIT_SYSTEM_HEADER_MSVC) +# pragma system_header +#endif // no system header -#include "detail/libcxx/include/exception" +// clang-format off +#include -#include "detail/__pragma_pop" +#include -#endif // _CUDA_STD_NEW +#include +// clang-format on + +#endif // _CUDA_STD_EXCEPTION diff --git a/libcudacxx/include/cuda/std/__memory_ b/libcudacxx/include/cuda/std/__memory_ index 1bff78d6773..ee2af7fdb97 100644 --- a/libcudacxx/include/cuda/std/__memory_ +++ b/libcudacxx/include/cuda/std/__memory_ @@ -11,12 +11,22 @@ #ifndef _CUDA_STD_MEMORY #define _CUDA_STD_MEMORY -#include "detail/__config" +#include -#include "detail/__pragma_push" +#if defined(_CCCL_IMPLICIT_SYSTEM_HEADER_GCC) +# pragma GCC system_header +#elif defined(_CCCL_IMPLICIT_SYSTEM_HEADER_CLANG) +# pragma clang system_header +#elif defined(_CCCL_IMPLICIT_SYSTEM_HEADER_MSVC) +# pragma system_header +#endif // no system header -#include "detail/libcxx/include/memory" +// clang-format off +#include -#include "detail/__pragma_pop" +#include + +#include +// clang-format on #endif // _CUDA_STD_MEMORY diff --git a/libcudacxx/include/cuda/std/__new_ b/libcudacxx/include/cuda/std/__new_ index 3e8aefcdb6f..39550ac1080 100644 --- a/libcudacxx/include/cuda/std/__new_ +++ b/libcudacxx/include/cuda/std/__new_ @@ -11,12 +11,22 @@ #ifndef _CUDA_STD_NEW #define _CUDA_STD_NEW -#include "detail/__config" +#include -#include "detail/__pragma_push" +#if defined(_CCCL_IMPLICIT_SYSTEM_HEADER_GCC) +# pragma GCC system_header +#elif defined(_CCCL_IMPLICIT_SYSTEM_HEADER_CLANG) +# pragma clang system_header +#elif defined(_CCCL_IMPLICIT_SYSTEM_HEADER_MSVC) +# pragma system_header +#endif // no system header -#include "detail/libcxx/include/new" +// clang-format off +#include -#include "detail/__pragma_pop" +#include + +#include +// clang-format on #endif // _CUDA_STD_NEW diff --git a/libcudacxx/include/cuda/std/array b/libcudacxx/include/cuda/std/array index f0bd5785600..8190edc899d 100644 --- a/libcudacxx/include/cuda/std/array +++ b/libcudacxx/include/cuda/std/array @@ -12,10 +12,20 @@ #include +#if defined(_CCCL_IMPLICIT_SYSTEM_HEADER_GCC) +# pragma GCC system_header +#elif defined(_CCCL_IMPLICIT_SYSTEM_HEADER_CLANG) +# pragma clang system_header +#elif defined(_CCCL_IMPLICIT_SYSTEM_HEADER_MSVC) +# pragma system_header +#endif // no system header + +// clang-format off #include #include #include +// clang-format on #endif // _CUDA_STD_ARRAY diff --git a/libcudacxx/include/cuda/std/atomic b/libcudacxx/include/cuda/std/atomic index 0daab5f2cb5..3ec9392334f 100644 --- a/libcudacxx/include/cuda/std/atomic +++ b/libcudacxx/include/cuda/std/atomic @@ -13,10 +13,20 @@ #include +#if defined(_CCCL_IMPLICIT_SYSTEM_HEADER_GCC) +# pragma GCC system_header +#elif defined(_CCCL_IMPLICIT_SYSTEM_HEADER_CLANG) +# pragma clang system_header +#elif 
defined(_CCCL_IMPLICIT_SYSTEM_HEADER_MSVC) +# pragma system_header +#endif // no system header + +// clang-format off #include #include #include +// clang-format on #endif // _CUDA_STD_ATOMIC diff --git a/libcudacxx/include/cuda/std/barrier b/libcudacxx/include/cuda/std/barrier index 415c3f80acf..6a29770a6fb 100644 --- a/libcudacxx/include/cuda/std/barrier +++ b/libcudacxx/include/cuda/std/barrier @@ -17,10 +17,20 @@ #include +#if defined(_CCCL_IMPLICIT_SYSTEM_HEADER_GCC) +# pragma GCC system_header +#elif defined(_CCCL_IMPLICIT_SYSTEM_HEADER_CLANG) +# pragma clang system_header +#elif defined(_CCCL_IMPLICIT_SYSTEM_HEADER_MSVC) +# pragma system_header +#endif // no system header + +// clang-format off #include #include #include +// clang-format on #endif // _CUDA_STD_BARRIER diff --git a/libcudacxx/include/cuda/std/bit b/libcudacxx/include/cuda/std/bit index 491b346c576..ca3e0ed6470 100644 --- a/libcudacxx/include/cuda/std/bit +++ b/libcudacxx/include/cuda/std/bit @@ -13,10 +13,20 @@ #include +#if defined(_CCCL_IMPLICIT_SYSTEM_HEADER_GCC) +# pragma GCC system_header +#elif defined(_CCCL_IMPLICIT_SYSTEM_HEADER_CLANG) +# pragma clang system_header +#elif defined(_CCCL_IMPLICIT_SYSTEM_HEADER_MSVC) +# pragma system_header +#endif // no system header + +// clang-format off #include #include #include +// clang-format on #endif // _CUDA_STD_BIT diff --git a/libcudacxx/include/cuda/std/cassert b/libcudacxx/include/cuda/std/cassert index af8af80e43d..5270a28dc93 100644 --- a/libcudacxx/include/cuda/std/cassert +++ b/libcudacxx/include/cuda/std/cassert @@ -13,10 +13,20 @@ #include +#if defined(_CCCL_IMPLICIT_SYSTEM_HEADER_GCC) +# pragma GCC system_header +#elif defined(_CCCL_IMPLICIT_SYSTEM_HEADER_CLANG) +# pragma clang system_header +#elif defined(_CCCL_IMPLICIT_SYSTEM_HEADER_MSVC) +# pragma system_header +#endif // no system header + +// clang-format off #include #include #include +// clang-format on #endif // _CUDA_STD_CASSERT diff --git a/libcudacxx/include/cuda/std/cfloat b/libcudacxx/include/cuda/std/cfloat index 31a9f8e4e61..b27bc58c561 100644 --- a/libcudacxx/include/cuda/std/cfloat +++ b/libcudacxx/include/cuda/std/cfloat @@ -13,10 +13,20 @@ #include +#if defined(_CCCL_IMPLICIT_SYSTEM_HEADER_GCC) +# pragma GCC system_header +#elif defined(_CCCL_IMPLICIT_SYSTEM_HEADER_CLANG) +# pragma clang system_header +#elif defined(_CCCL_IMPLICIT_SYSTEM_HEADER_MSVC) +# pragma system_header +#endif // no system header + +// clang-format off #include #include #include +// clang-format on #endif // _CUDA_STD_CFLOAT diff --git a/libcudacxx/include/cuda/std/chrono b/libcudacxx/include/cuda/std/chrono index f8d62efb4f6..5358250f98f 100644 --- a/libcudacxx/include/cuda/std/chrono +++ b/libcudacxx/include/cuda/std/chrono @@ -13,10 +13,20 @@ #include +#if defined(_CCCL_IMPLICIT_SYSTEM_HEADER_GCC) +# pragma GCC system_header +#elif defined(_CCCL_IMPLICIT_SYSTEM_HEADER_CLANG) +# pragma clang system_header +#elif defined(_CCCL_IMPLICIT_SYSTEM_HEADER_MSVC) +# pragma system_header +#endif // no system header + +// clang-format off #include #include #include +// clang-format on #endif // _CUDA_STD_CHRONO diff --git a/libcudacxx/include/cuda/std/climits b/libcudacxx/include/cuda/std/climits index f7934b665a9..1cb0c9625ce 100644 --- a/libcudacxx/include/cuda/std/climits +++ b/libcudacxx/include/cuda/std/climits @@ -13,10 +13,20 @@ #include +#if defined(_CCCL_IMPLICIT_SYSTEM_HEADER_GCC) +# pragma GCC system_header +#elif defined(_CCCL_IMPLICIT_SYSTEM_HEADER_CLANG) +# pragma clang system_header +#elif 
defined(_CCCL_IMPLICIT_SYSTEM_HEADER_MSVC) +# pragma system_header +#endif // no system header + +// clang-format off #include #include #include +// clang-format on #endif // _CUDA_STD_CLIMITS diff --git a/libcudacxx/include/cuda/std/cmath b/libcudacxx/include/cuda/std/cmath index a6a05ef2430..e3022b2a7ac 100644 --- a/libcudacxx/include/cuda/std/cmath +++ b/libcudacxx/include/cuda/std/cmath @@ -12,10 +12,20 @@ #include +#if defined(_CCCL_IMPLICIT_SYSTEM_HEADER_GCC) +# pragma GCC system_header +#elif defined(_CCCL_IMPLICIT_SYSTEM_HEADER_CLANG) +# pragma clang system_header +#elif defined(_CCCL_IMPLICIT_SYSTEM_HEADER_MSVC) +# pragma system_header +#endif // no system header + +// clang-format off #include #include #include +// clang-format on #endif // _CUDA_STD_CMATH diff --git a/libcudacxx/include/cuda/std/complex b/libcudacxx/include/cuda/std/complex index 7c8ea6b5b46..813a47e9afc 100644 --- a/libcudacxx/include/cuda/std/complex +++ b/libcudacxx/include/cuda/std/complex @@ -12,10 +12,20 @@ #include +#if defined(_CCCL_IMPLICIT_SYSTEM_HEADER_GCC) +# pragma GCC system_header +#elif defined(_CCCL_IMPLICIT_SYSTEM_HEADER_CLANG) +# pragma clang system_header +#elif defined(_CCCL_IMPLICIT_SYSTEM_HEADER_MSVC) +# pragma system_header +#endif // no system header + +// clang-format off #include #include #include +// clang-format on #endif // _CUDA_STD_COMPLEX diff --git a/libcudacxx/include/cuda/std/concepts b/libcudacxx/include/cuda/std/concepts index d3f9eb25dde..8807b10c1e5 100644 --- a/libcudacxx/include/cuda/std/concepts +++ b/libcudacxx/include/cuda/std/concepts @@ -12,10 +12,20 @@ #include +#if defined(_CCCL_IMPLICIT_SYSTEM_HEADER_GCC) +# pragma GCC system_header +#elif defined(_CCCL_IMPLICIT_SYSTEM_HEADER_CLANG) +# pragma clang system_header +#elif defined(_CCCL_IMPLICIT_SYSTEM_HEADER_MSVC) +# pragma system_header +#endif // no system header + +// clang-format off #include #include #include +// clang-format on #endif // _CUDA_STD_CONCEPTS diff --git a/libcudacxx/include/cuda/std/cstddef b/libcudacxx/include/cuda/std/cstddef index 95aae77de22..eac158b4763 100644 --- a/libcudacxx/include/cuda/std/cstddef +++ b/libcudacxx/include/cuda/std/cstddef @@ -13,10 +13,20 @@ #include +#if defined(_CCCL_IMPLICIT_SYSTEM_HEADER_GCC) +# pragma GCC system_header +#elif defined(_CCCL_IMPLICIT_SYSTEM_HEADER_CLANG) +# pragma clang system_header +#elif defined(_CCCL_IMPLICIT_SYSTEM_HEADER_MSVC) +# pragma system_header +#endif // no system header + +// clang-format off #include #include #include +// clang-format on #endif // _CUDA_STD_CSTDDEF diff --git a/libcudacxx/include/cuda/std/cstdint b/libcudacxx/include/cuda/std/cstdint index 22c0754e481..78c18424d24 100644 --- a/libcudacxx/include/cuda/std/cstdint +++ b/libcudacxx/include/cuda/std/cstdint @@ -13,10 +13,20 @@ #include +#if defined(_CCCL_IMPLICIT_SYSTEM_HEADER_GCC) +# pragma GCC system_header +#elif defined(_CCCL_IMPLICIT_SYSTEM_HEADER_CLANG) +# pragma clang system_header +#elif defined(_CCCL_IMPLICIT_SYSTEM_HEADER_MSVC) +# pragma system_header +#endif // no system header + +// clang-format off #include #include #include +// clang-format on #endif // _CUDA_STD_CSTDINT diff --git a/libcudacxx/include/cuda/std/cstdlib b/libcudacxx/include/cuda/std/cstdlib index af85815be27..ba11ff9090a 100644 --- a/libcudacxx/include/cuda/std/cstdlib +++ b/libcudacxx/include/cuda/std/cstdlib @@ -11,12 +11,22 @@ #ifndef _CUDA_STD_CSTDLIB #define _CUDA_STD_CSTDLIB -#include "detail/__config" +#include -#include "detail/__pragma_push" +#if 
defined(_CCCL_IMPLICIT_SYSTEM_HEADER_GCC) +# pragma GCC system_header +#elif defined(_CCCL_IMPLICIT_SYSTEM_HEADER_CLANG) +# pragma clang system_header +#elif defined(_CCCL_IMPLICIT_SYSTEM_HEADER_MSVC) +# pragma system_header +#endif // no system header -#include "detail/libcxx/include/cstdlib" +// clang-format off +#include -#include "detail/__pragma_pop" +#include + +#include +// clang-format on #endif // _CUDA_STD_CSTDLIB diff --git a/libcudacxx/include/cuda/std/ctime b/libcudacxx/include/cuda/std/ctime index d610c831077..38c957eba11 100644 --- a/libcudacxx/include/cuda/std/ctime +++ b/libcudacxx/include/cuda/std/ctime @@ -13,10 +13,20 @@ #include +#if defined(_CCCL_IMPLICIT_SYSTEM_HEADER_GCC) +# pragma GCC system_header +#elif defined(_CCCL_IMPLICIT_SYSTEM_HEADER_CLANG) +# pragma clang system_header +#elif defined(_CCCL_IMPLICIT_SYSTEM_HEADER_MSVC) +# pragma system_header +#endif // no system header + +// clang-format off #include #include #include +// clang-format on #endif // _CUDA_STD_CTIME diff --git a/libcudacxx/include/cuda/std/detail/__access_property b/libcudacxx/include/cuda/std/detail/__access_property index 7d9718503e9..c63ec342df9 100644 --- a/libcudacxx/include/cuda/std/detail/__access_property +++ b/libcudacxx/include/cuda/std/detail/__access_property @@ -3,325 +3,445 @@ * * NVIDIA SOFTWARE LICENSE * - * This license is a legal agreement between you and NVIDIA Corporation ("NVIDIA") and governs your use of the NVIDIA/CUDA C++ Library software and materials provided hereunder (“SOFTWARE”). + * This license is a legal agreement between you and NVIDIA Corporation ("NVIDIA") and governs your use of the + * NVIDIA/CUDA C++ Library software and materials provided hereunder (“SOFTWARE”). * - * This license can be accepted only by an adult of legal age of majority in the country in which the SOFTWARE is used. If you are under the legal age of majority, you must ask your parent or legal guardian to consent to this license. By taking delivery of the SOFTWARE, you affirm that you have reached the legal age of majority, you accept the terms of this license, and you take legal and financial responsibility for the actions of your permitted users. + * This license can be accepted only by an adult of legal age of majority in the country in which the SOFTWARE is used. + * If you are under the legal age of majority, you must ask your parent or legal guardian to consent to this license. By + * taking delivery of the SOFTWARE, you affirm that you have reached the legal age of majority, you accept the terms of + * this license, and you take legal and financial responsibility for the actions of your permitted users. * - * You agree to use the SOFTWARE only for purposes that are permitted by (a) this license, and (b) any applicable law, regulation or generally accepted practices or guidelines in the relevant jurisdictions. + * You agree to use the SOFTWARE only for purposes that are permitted by (a) this license, and (b) any applicable law, + * regulation or generally accepted practices or guidelines in the relevant jurisdictions. * - * 1. LICENSE. Subject to the terms of this license, NVIDIA grants you a non-exclusive limited license to: (a) install and use the SOFTWARE, and (b) distribute the SOFTWARE subject to the distribution requirements described in this license. NVIDIA reserves all rights, title and interest in and to the SOFTWARE not expressly granted to you under this license. + * 1. LICENSE. 
 * [NVIDIA SOFTWARE LICENSE text continues here; it is the same per-file license header reproduced earlier in this diff, rewrapped to the new column width.]
You agree that you will not ship, + * transfer or export the SOFTWARE into any country, or use the SOFTWARE in any manner, prohibited by the United States + * Bureau of Industry and Security or economic sanctions regulations administered by the U.S. Department of Treasury’s + * Office of Foreign Assets Control (OFAC), or any applicable export laws, restrictions or regulations. These laws + * include restrictions on destinations, end users and end use. By accepting this license, you confirm that you are not + * a resident or citizen of any country currently embargoed by the U.S. and that you are not otherwise prohibited from + * receiving the SOFTWARE. * - * 14. GOVERNMENT USE. The SOFTWARE has been developed entirely at private expense and is “commercial items” consisting of “commercial computer software” and “commercial computer software documentation” provided with RESTRICTED RIGHTS. Use, duplication or disclosure by the U.S. Government or a U.S. Government subcontractor is subject to the restrictions in this license pursuant to DFARS 227.7202-3(a) or as set forth in subparagraphs (b)(1) and (2) of the Commercial Computer Software - Restricted Rights clause at FAR 52.227-19, as applicable. Contractor/manufacturer is NVIDIA, 2788 San Tomas Expressway, Santa Clara, CA 95051. + * 14. GOVERNMENT USE. The SOFTWARE has been developed entirely at private expense and is “commercial items” consisting + * of “commercial computer software” and “commercial computer software documentation” provided with RESTRICTED RIGHTS. + * Use, duplication or disclosure by the U.S. Government or a U.S. Government subcontractor is subject to the + * restrictions in this license pursuant to DFARS 227.7202-3(a) or as set forth in subparagraphs (b)(1) and (2) of the + * Commercial Computer Software - Restricted Rights clause at FAR 52.227-19, as applicable. Contractor/manufacturer is + * NVIDIA, 2788 San Tomas Expressway, Santa Clara, CA 95051. * - * 15. ENTIRE AGREEMENT. This license is the final, complete and exclusive agreement between the parties relating to the subject matter of this license and supersedes all prior or contemporaneous understandings and agreements relating to this subject matter, whether oral or written. If any court of competent jurisdiction determines that any provision of this license is illegal, invalid or unenforceable, the remaining provisions will remain in full force and effect. This license may only be modified in a writing signed by an authorized representative of each party. + * 15. ENTIRE AGREEMENT. This license is the final, complete and exclusive agreement between the parties relating to the + * subject matter of this license and supersedes all prior or contemporaneous understandings and agreements relating to + * this subject matter, whether oral or written. If any court of competent jurisdiction determines that any provision of + * this license is illegal, invalid or unenforceable, the remaining provisions will remain in full force and effect. + * This license may only be modified in a writing signed by an authorized representative of each party. * * (v. August 20, 2021) */ _LIBCUDACXX_BEGIN_NAMESPACE_CUDA -namespace __detail_ap { +namespace __detail_ap +{ - _CCCL_HOST_DEVICE - constexpr uint32_t __ap_floor_log2(uint32_t __x) { - return (__x == 1 | __x == 0) ? 0 : 1 + __ap_floor_log2(__x >> 1); - } +_CCCL_HOST_DEVICE constexpr uint32_t __ap_floor_log2(uint32_t __x) +{ + return (__x == 1 | __x == 0) ? 
0 : 1 + __ap_floor_log2(__x >> 1); +} - _CCCL_HOST_DEVICE - constexpr uint32_t __ap_ceil_log2(uint32_t __x) { - return (__x == 1 | __x == 0) ? 0 : __ap_floor_log2(__x - 1) + 1; - } +_CCCL_HOST_DEVICE constexpr uint32_t __ap_ceil_log2(uint32_t __x) +{ + return (__x == 1 | __x == 0) ? 0 : __ap_floor_log2(__x - 1) + 1; +} - _CCCL_HOST_DEVICE - constexpr uint32_t __ap_min(uint32_t __a, uint32_t __b) noexcept { - return (__a < __b) ? __a : __b; - } +_CCCL_HOST_DEVICE constexpr uint32_t __ap_min(uint32_t __a, uint32_t __b) noexcept +{ + return (__a < __b) ? __a : __b; +} - _CCCL_HOST_DEVICE - constexpr uint32_t __ap_max(uint32_t __a, uint32_t __b) noexcept { - return (__a > __b) ? __a : __b; - } +_CCCL_HOST_DEVICE constexpr uint32_t __ap_max(uint32_t __a, uint32_t __b) noexcept +{ + return (__a > __b) ? __a : __b; +} // https://gcc.gnu.org/bugzilla/show_bug.cgi?id=61414 // Specifically search for 8.4 and 9.3 and above to guarantee uint64_t enum. -#if defined(_CCCL_COMPILER_GCC) && ( \ - ((_GNUC_VER < 804)) || \ - ((_GNUC_VER < 903)) \ - ) -# define _LIBCUDACXX_AP_ENUM_TYPE_ANNOTATION +#if defined(_CCCL_COMPILER_GCC) && (((_GNUC_VER < 804)) || ((_GNUC_VER < 903))) +# define _LIBCUDACXX_AP_ENUM_TYPE_ANNOTATION #else -# define _LIBCUDACXX_AP_ENUM_TYPE_ANNOTATION : uint64_t +# define _LIBCUDACXX_AP_ENUM_TYPE_ANNOTATION : uint64_t #endif - namespace __sm_80 { - namespace __off { - enum __l2_cop_off_t _LIBCUDACXX_AP_ENUM_TYPE_ANNOTATION { - _L2_EVICT_NORMAL = 0, - _L2_EVICT_FIRST = 1, - }; - } // namespace __off - - namespace __on { - enum __l2_cop_on_t _LIBCUDACXX_AP_ENUM_TYPE_ANNOTATION { - _L2_EVICT_NORMAL = 0, - _L2_EVICT_FIRST = 1, - _L2_EVICT_LAST = 2, - _L2_EVICT_NORMAL_DEMOTE = 3, - }; - } // namespace __on - - enum __l2_descriptor_mode_t _LIBCUDACXX_AP_ENUM_TYPE_ANNOTATION { - _DESC_IMPLICIT = 0, - _DESC_INTERLEAVED = 2, - _DESC_BLOCK_TYPE = 3, - }; - - enum __l2_eviction_max_way_t _LIBCUDACXX_AP_ENUM_TYPE_ANNOTATION { - _CUDA_AMPERE_MAX_L2_WAYS = std::uint32_t{16}, - }; - - enum __block_size_t _LIBCUDACXX_AP_ENUM_TYPE_ANNOTATION { - _BLOCKSIZE_4K = 0, - _BLOCKSIZE_8K = 1, - _BLOCKSIZE_16K = 2, - _BLOCKSIZE_32K = 3, - _BLOCKSIZE_64K = 4, - _BLOCKSIZE_128K = 5, - _BLOCKSIZE_256K = 6, - _BLOCKSIZE_512K = 7, - _BLOCKSIZE_1M = 8, - _BLOCKSIZE_2M = 9, - _BLOCKSIZE_4M = 10, - _BLOCKSIZE_8M = 11, - _BLOCKSIZE_16M = 12, - _BLOCKSIZE_32M = 13, - }; - - struct __block_desc_t { - uint64_t __ap_reserved : 37; - uint64_t __block_count: 7; - uint64_t __block_start: 7; - uint64_t __ap_reserved2 : 1; - __block_size_t __block_size : 4; - __off::__l2_cop_off_t __l2_cop_off : 1; - __on::__l2_cop_on_t __l2_cop_on : 2; - __l2_descriptor_mode_t __l2_descriptor_mode : 2; - uint64_t __l1_inv_dont_allocate : 1; - uint64_t __l2_sector_promote_256B : 1; - uint64_t __ap_reserved3 : 1; - - _CCCL_HOST_DEVICE - constexpr std::uint64_t __get_descriptor_cexpr() const noexcept { - return - std::uint64_t(__ap_reserved) << 0 | - std::uint64_t(__block_count) << 37 | - std::uint64_t(__block_start) << 44 | - std::uint64_t(__ap_reserved2) << 51 | - std::uint64_t(__block_size) << 52 | - std::uint64_t(__l2_cop_off) << 56 | - std::uint64_t(__l2_cop_on) << 57 | - std::uint64_t(__l2_descriptor_mode) << 59 | - std::uint64_t(__l1_inv_dont_allocate) << 61 | - std::uint64_t(__l2_sector_promote_256B) << 62 | - std::uint64_t(__ap_reserved3) << 63; - } - - inline - _CCCL_HOST_DEVICE - std::uint64_t __get_descriptor_non_cexpr() const noexcept { return *reinterpret_cast(this); } - - _CCCL_HOST_DEVICE - constexpr std::uint64_t __get_descriptor() 
const noexcept { +namespace __sm_80 +{ +namespace __off +{ +enum __l2_cop_off_t _LIBCUDACXX_AP_ENUM_TYPE_ANNOTATION +{ + _L2_EVICT_NORMAL = 0, + _L2_EVICT_FIRST = 1, +}; +} // namespace __off + +namespace __on +{ +enum __l2_cop_on_t _LIBCUDACXX_AP_ENUM_TYPE_ANNOTATION +{ + _L2_EVICT_NORMAL = 0, + _L2_EVICT_FIRST = 1, + _L2_EVICT_LAST = 2, + _L2_EVICT_NORMAL_DEMOTE = 3, +}; +} // namespace __on + +enum __l2_descriptor_mode_t _LIBCUDACXX_AP_ENUM_TYPE_ANNOTATION +{ + _DESC_IMPLICIT = 0, + _DESC_INTERLEAVED = 2, + _DESC_BLOCK_TYPE = 3, +}; + +enum __l2_eviction_max_way_t _LIBCUDACXX_AP_ENUM_TYPE_ANNOTATION +{ + _CUDA_AMPERE_MAX_L2_WAYS = std::uint32_t{16}, +}; + +enum __block_size_t _LIBCUDACXX_AP_ENUM_TYPE_ANNOTATION +{ + _BLOCKSIZE_4K = 0, + _BLOCKSIZE_8K = 1, + _BLOCKSIZE_16K = 2, + _BLOCKSIZE_32K = 3, + _BLOCKSIZE_64K = 4, + _BLOCKSIZE_128K = 5, + _BLOCKSIZE_256K = 6, + _BLOCKSIZE_512K = 7, + _BLOCKSIZE_1M = 8, + _BLOCKSIZE_2M = 9, + _BLOCKSIZE_4M = 10, + _BLOCKSIZE_8M = 11, + _BLOCKSIZE_16M = 12, + _BLOCKSIZE_32M = 13, +}; + +struct __block_desc_t +{ + uint64_t __ap_reserved : 37; + uint64_t __block_count : 7; + uint64_t __block_start : 7; + uint64_t __ap_reserved2 : 1; + __block_size_t __block_size : 4; + __off::__l2_cop_off_t __l2_cop_off : 1; + __on::__l2_cop_on_t __l2_cop_on : 2; + __l2_descriptor_mode_t __l2_descriptor_mode : 2; + uint64_t __l1_inv_dont_allocate : 1; + uint64_t __l2_sector_promote_256B : 1; + uint64_t __ap_reserved3 : 1; + + _CCCL_HOST_DEVICE constexpr std::uint64_t __get_descriptor_cexpr() const noexcept + { + return std::uint64_t(__ap_reserved) << 0 | std::uint64_t(__block_count) << 37 | std::uint64_t(__block_start) << 44 + | std::uint64_t(__ap_reserved2) << 51 | std::uint64_t(__block_size) << 52 | std::uint64_t(__l2_cop_off) << 56 + | std::uint64_t(__l2_cop_on) << 57 | std::uint64_t(__l2_descriptor_mode) << 59 + | std::uint64_t(__l1_inv_dont_allocate) << 61 | std::uint64_t(__l2_sector_promote_256B) << 62 + | std::uint64_t(__ap_reserved3) << 63; + } + + inline _CCCL_HOST_DEVICE std::uint64_t __get_descriptor_non_cexpr() const noexcept + { + return *reinterpret_cast(this); + } + + _CCCL_HOST_DEVICE constexpr std::uint64_t __get_descriptor() const noexcept + { #if defined(_LIBCUDACXX_IS_CONSTANT_EVALUATED) - return cuda::std::is_constant_evaluated() ? - __get_descriptor_cexpr() : - __get_descriptor_non_cexpr(); + return cuda::std::is_constant_evaluated() ? 
__get_descriptor_cexpr() : __get_descriptor_non_cexpr(); #else - return __get_descriptor_cexpr(); + return __get_descriptor_cexpr(); #endif - } - }; - static_assert(sizeof(__block_desc_t) == 8, "__block_desc_t should be 8 bytes"); - static_assert(sizeof(__block_desc_t) == sizeof(std::uint64_t), ""); - static_assert( - __block_desc_t{(uint64_t)1, (uint64_t)1, (uint64_t)1, (uint64_t)1, __block_size_t::_BLOCKSIZE_8K, __off::_L2_EVICT_FIRST, __on::_L2_EVICT_FIRST, __l2_descriptor_mode_t::_DESC_INTERLEAVED, (uint64_t)1, (uint64_t)1, (uint64_t)1}.__get_descriptor() - == 0xF318102000000001, ""); - - /* Factory like struct to build a __block_desc_t due to constexpr C++11 - */ - struct __block_descriptor_builder { //variable declaration order matters == usage order - std::uint32_t __offset; - __block_size_t __block_size; - std::uint32_t __block_start, __end_hit; - std::uint32_t __block_count; - __off::__l2_cop_off_t __l2_cop_off; - __on::__l2_cop_on_t __l2_cop_on; - __l2_descriptor_mode_t __l2_descriptor_mode; - bool __l1_inv_dont_allocate, __l2_sector_promote_256B; - - _CCCL_HOST_DEVICE static constexpr std::uint32_t __calc_offset(std::size_t __total_bytes) { - return __ap_max(std::uint32_t{12}, static_cast(__ap_ceil_log2(static_cast(__total_bytes))) - std::uint32_t{7}); - } - - _CCCL_HOST_DEVICE static constexpr std::uint32_t __calc_block_start(std::uintptr_t __ptr, std::size_t __total_bytes) { - return static_cast(__ptr >> __calc_offset(static_cast(__total_bytes))); - } - - _CCCL_HOST_DEVICE static constexpr std::uint32_t __calc_end_hit(std::uintptr_t __ptr, std::size_t __hit_bytes, std::size_t __total_bytes) { - return static_cast((__ptr + __hit_bytes + (std::uintptr_t{1} << (__calc_offset(static_cast(__total_bytes)))) - 1) >> __calc_offset(static_cast(__total_bytes))); - } - - _CCCL_HOST_DEVICE constexpr __block_descriptor_builder(std::uintptr_t __ptr, std::size_t __hit_bytes, std::size_t __total_bytes, __on::__l2_cop_on_t __hit_prop, __off::__l2_cop_off_t __miss_prop) - : __offset(__calc_offset(__total_bytes)) - , __block_size(static_cast<__block_size_t>(__calc_offset(__total_bytes) - std::uint32_t{12})) - , __block_start(__calc_block_start(__ptr, __total_bytes)) - , __end_hit(__calc_end_hit(__ptr, __hit_bytes, __total_bytes)) - , __block_count(__calc_end_hit(__ptr, __hit_bytes, __total_bytes) - __calc_block_start(__ptr, __total_bytes)) - , __l2_cop_off(__miss_prop) - , __l2_cop_on(__hit_prop) - , __l2_descriptor_mode(_DESC_BLOCK_TYPE) - , __l1_inv_dont_allocate(false) - , __l2_sector_promote_256B(false) - {} - - _CCCL_HOST_DEVICE - constexpr __block_desc_t __get_block() const noexcept { - return __block_desc_t { 0, __ap_min(std::uint32_t{0x7f}, __block_count), (__block_start & std::uint32_t{0x7f}), 0, __block_size, __l2_cop_off, __l2_cop_on, _DESC_BLOCK_TYPE, false, false, 0 }; - } - }; - static_assert(sizeof(std::uintptr_t) > 4, "std::uintptr_t needs at least 5 bytes for this code to work"); - - struct __interleave_descriptor_t { - uint64_t __ap_reserved : 52; - uint64_t __fraction : 4; - __off::__l2_cop_off_t __l2_cop_off : 1; - __on::__l2_cop_on_t __l2_cop_on : 2; - __l2_descriptor_mode_t __l2_descriptor_mode : 2; - uint64_t __l1_inv_dont_allocate : 1; - uint64_t __l2_sector_promote_256B : 1; - uint64_t __ap_reserved2 : 1; - - _CCCL_HOST_DEVICE - constexpr __interleave_descriptor_t( - __on::__l2_cop_on_t __hit_prop, - std::uint32_t __hit_ratio, - __off::__l2_cop_off_t __miss_prop) noexcept - : __ap_reserved(0x0), - __fraction(__hit_ratio), - __l2_cop_off(__miss_prop), - 
__l2_cop_on(__hit_prop), - __l2_descriptor_mode(_DESC_INTERLEAVED), - __l1_inv_dont_allocate(0x0), - __l2_sector_promote_256B(0x0), - __ap_reserved2(0x0) {} - - _CCCL_HOST_DEVICE - constexpr std::uint64_t __get_descriptor_cexpr() const { - return - std::uint64_t(__ap_reserved) << 0 | - std::uint64_t(__fraction) << 52 | - std::uint64_t(__l2_cop_off) << 56 | - std::uint64_t(__l2_cop_on) << 57 | - std::uint64_t(__l2_descriptor_mode) << 59 | - std::uint64_t(__l1_inv_dont_allocate) << 61 | - std::uint64_t(__l2_sector_promote_256B) << 62 | - std::uint64_t(__ap_reserved2) << 63; - } - - inline - _CCCL_HOST_DEVICE - std::uint64_t __get_descriptor_non_cexpr() const noexcept { return *reinterpret_cast(this); } - - - _CCCL_HOST_DEVICE - constexpr std::uint64_t __get_descriptor() const noexcept { + } +}; +static_assert(sizeof(__block_desc_t) == 8, "__block_desc_t should be 8 bytes"); +static_assert(sizeof(__block_desc_t) == sizeof(std::uint64_t), ""); +static_assert( + __block_desc_t{ + (uint64_t) 1, + (uint64_t) 1, + (uint64_t) 1, + (uint64_t) 1, + __block_size_t::_BLOCKSIZE_8K, + __off::_L2_EVICT_FIRST, + __on::_L2_EVICT_FIRST, + __l2_descriptor_mode_t::_DESC_INTERLEAVED, + (uint64_t) 1, + (uint64_t) 1, + (uint64_t) 1} + .__get_descriptor() + == 0xF318102000000001, + ""); + +/* Factory like struct to build a __block_desc_t due to constexpr C++11 + */ +struct __block_descriptor_builder +{ // variable declaration order matters == usage order + std::uint32_t __offset; + __block_size_t __block_size; + std::uint32_t __block_start, __end_hit; + std::uint32_t __block_count; + __off::__l2_cop_off_t __l2_cop_off; + __on::__l2_cop_on_t __l2_cop_on; + __l2_descriptor_mode_t __l2_descriptor_mode; + bool __l1_inv_dont_allocate, __l2_sector_promote_256B; + + _CCCL_HOST_DEVICE static constexpr std::uint32_t __calc_offset(std::size_t __total_bytes) + { + return __ap_max( + std::uint32_t{12}, + static_cast(__ap_ceil_log2(static_cast(__total_bytes))) - std::uint32_t{7}); + } + + _CCCL_HOST_DEVICE static constexpr std::uint32_t __calc_block_start(std::uintptr_t __ptr, std::size_t __total_bytes) + { + return static_cast(__ptr >> __calc_offset(static_cast(__total_bytes))); + } + + _CCCL_HOST_DEVICE static constexpr std::uint32_t + __calc_end_hit(std::uintptr_t __ptr, std::size_t __hit_bytes, std::size_t __total_bytes) + { + return static_cast( + (__ptr + __hit_bytes + (std::uintptr_t{1} << (__calc_offset(static_cast(__total_bytes)))) - 1) + >> __calc_offset(static_cast(__total_bytes))); + } + + _CCCL_HOST_DEVICE constexpr __block_descriptor_builder( + std::uintptr_t __ptr, + std::size_t __hit_bytes, + std::size_t __total_bytes, + __on::__l2_cop_on_t __hit_prop, + __off::__l2_cop_off_t __miss_prop) + : __offset(__calc_offset(__total_bytes)) + , __block_size(static_cast<__block_size_t>(__calc_offset(__total_bytes) - std::uint32_t{12})) + , __block_start(__calc_block_start(__ptr, __total_bytes)) + , __end_hit(__calc_end_hit(__ptr, __hit_bytes, __total_bytes)) + , __block_count(__calc_end_hit(__ptr, __hit_bytes, __total_bytes) - __calc_block_start(__ptr, __total_bytes)) + , __l2_cop_off(__miss_prop) + , __l2_cop_on(__hit_prop) + , __l2_descriptor_mode(_DESC_BLOCK_TYPE) + , __l1_inv_dont_allocate(false) + , __l2_sector_promote_256B(false) + {} + + _CCCL_HOST_DEVICE constexpr __block_desc_t __get_block() const noexcept + { + return __block_desc_t{ + 0, + __ap_min(std::uint32_t{0x7f}, __block_count), + (__block_start & std::uint32_t{0x7f}), + 0, + __block_size, + __l2_cop_off, + __l2_cop_on, + _DESC_BLOCK_TYPE, + false, + 
false, + 0}; + } +}; +static_assert(sizeof(std::uintptr_t) > 4, "std::uintptr_t needs at least 5 bytes for this code to work"); + +struct __interleave_descriptor_t +{ + uint64_t __ap_reserved : 52; + uint64_t __fraction : 4; + __off::__l2_cop_off_t __l2_cop_off : 1; + __on::__l2_cop_on_t __l2_cop_on : 2; + __l2_descriptor_mode_t __l2_descriptor_mode : 2; + uint64_t __l1_inv_dont_allocate : 1; + uint64_t __l2_sector_promote_256B : 1; + uint64_t __ap_reserved2 : 1; + + _CCCL_HOST_DEVICE constexpr __interleave_descriptor_t( + __on::__l2_cop_on_t __hit_prop, std::uint32_t __hit_ratio, __off::__l2_cop_off_t __miss_prop) noexcept + : __ap_reserved(0x0) + , __fraction(__hit_ratio) + , __l2_cop_off(__miss_prop) + , __l2_cop_on(__hit_prop) + , __l2_descriptor_mode(_DESC_INTERLEAVED) + , __l1_inv_dont_allocate(0x0) + , __l2_sector_promote_256B(0x0) + , __ap_reserved2(0x0) + {} + + _CCCL_HOST_DEVICE constexpr std::uint64_t __get_descriptor_cexpr() const + { + return std::uint64_t(__ap_reserved) << 0 | std::uint64_t(__fraction) << 52 | std::uint64_t(__l2_cop_off) << 56 + | std::uint64_t(__l2_cop_on) << 57 | std::uint64_t(__l2_descriptor_mode) << 59 + | std::uint64_t(__l1_inv_dont_allocate) << 61 | std::uint64_t(__l2_sector_promote_256B) << 62 + | std::uint64_t(__ap_reserved2) << 63; + } + + inline _CCCL_HOST_DEVICE std::uint64_t __get_descriptor_non_cexpr() const noexcept + { + return *reinterpret_cast(this); + } + + _CCCL_HOST_DEVICE constexpr std::uint64_t __get_descriptor() const noexcept + { #if defined(_LIBCUDACXX_IS_CONSTANT_EVALUATED) - return cuda::std::is_constant_evaluated() ? - __get_descriptor_cexpr() : - __get_descriptor_non_cexpr(); + return cuda::std::is_constant_evaluated() ? __get_descriptor_cexpr() : __get_descriptor_non_cexpr(); #else - return __get_descriptor_cexpr(); + return __get_descriptor_cexpr(); #endif - } - }; - static_assert(sizeof(__interleave_descriptor_t) == 8, "__interleave_descriptor_t should be 8 bytes"); - static_assert(sizeof(__interleave_descriptor_t) == sizeof(std::uint64_t), ""); - - _CCCL_HOST_DEVICE - static constexpr std::uint64_t __interleave_normal() noexcept { - return 0x10F0000000000000; - } - - _CCCL_HOST_DEVICE - static constexpr std::uint64_t __interleave_streaming() noexcept { - return 0x12F0000000000000; - } - - _CCCL_HOST_DEVICE - static constexpr std::uint64_t __interleave_persisting() noexcept { - return 0x14F0000000000000; - } - - _CCCL_HOST_DEVICE - static constexpr std::uint64_t __interleave_normal_demote() noexcept { - return 0x16F0000000000000; - } - - } // namespace __sm_80 - - _CCCL_HOST_DEVICE - constexpr std::uint64_t __interleave(cudaAccessProperty __hit_prop, float __hit_ratio, cudaAccessProperty __miss_prop = cudaAccessPropertyNormal) { - return __sm_80::__interleave_descriptor_t( - ((__hit_prop == cudaAccessPropertyNormal) ? __sm_80::__on::__l2_cop_on_t::_L2_EVICT_NORMAL_DEMOTE : static_cast<__sm_80::__on::__l2_cop_on_t>(__hit_prop)), - __ap_min((static_cast(__hit_ratio) * __sm_80::__l2_eviction_max_way_t::_CUDA_AMPERE_MAX_L2_WAYS), static_cast(__sm_80::__l2_eviction_max_way_t::_CUDA_AMPERE_MAX_L2_WAYS - 1)), - static_cast<__sm_80::__off::__l2_cop_off_t>(__miss_prop) - ).__get_descriptor(); - } - - _CCCL_HOST_DEVICE - constexpr std::uint64_t __block(void* __ptr, std::size_t __hit_bytes, std::size_t __total_bytes, cudaAccessProperty __hit_prop, cudaAccessProperty __miss_prop = cudaAccessPropertyNormal) { - return (__total_bytes <= (size_t{0xFFFFFFFF}) & __total_bytes != 0 & __hit_bytes <= __total_bytes) ? 
__sm_80::__block_descriptor_builder( - reinterpret_cast(__ptr), - __hit_bytes, - __total_bytes, - (__hit_prop == cudaAccessPropertyNormal) ? __sm_80::__on::_L2_EVICT_NORMAL_DEMOTE : static_cast<__sm_80::__on::__l2_cop_on_t>(__hit_prop), - static_cast<__sm_80::__off::__l2_cop_off_t>(__miss_prop) - ).__get_block().__get_descriptor() - : __sm_80::__interleave_normal(); } +}; +static_assert(sizeof(__interleave_descriptor_t) == 8, "__interleave_descriptor_t should be 8 bytes"); +static_assert(sizeof(__interleave_descriptor_t) == sizeof(std::uint64_t), ""); + +_CCCL_HOST_DEVICE static constexpr std::uint64_t __interleave_normal() noexcept +{ + return 0x10F0000000000000; +} + +_CCCL_HOST_DEVICE static constexpr std::uint64_t __interleave_streaming() noexcept +{ + return 0x12F0000000000000; +} + +_CCCL_HOST_DEVICE static constexpr std::uint64_t __interleave_persisting() noexcept +{ + return 0x14F0000000000000; +} + +_CCCL_HOST_DEVICE static constexpr std::uint64_t __interleave_normal_demote() noexcept +{ + return 0x16F0000000000000; +} + +} // namespace __sm_80 + +_CCCL_HOST_DEVICE constexpr std::uint64_t __interleave( + cudaAccessProperty __hit_prop, float __hit_ratio, cudaAccessProperty __miss_prop = cudaAccessPropertyNormal) +{ + return __sm_80::__interleave_descriptor_t( + ((__hit_prop == cudaAccessPropertyNormal) ? __sm_80::__on::__l2_cop_on_t::_L2_EVICT_NORMAL_DEMOTE + : static_cast<__sm_80::__on::__l2_cop_on_t>(__hit_prop)), + __ap_min( + (static_cast(__hit_ratio) * __sm_80::__l2_eviction_max_way_t::_CUDA_AMPERE_MAX_L2_WAYS), + static_cast(__sm_80::__l2_eviction_max_way_t::_CUDA_AMPERE_MAX_L2_WAYS - 1)), + static_cast<__sm_80::__off::__l2_cop_off_t>(__miss_prop)) + .__get_descriptor(); +} + +_CCCL_HOST_DEVICE constexpr std::uint64_t __block( + void* __ptr, + std::size_t __hit_bytes, + std::size_t __total_bytes, + cudaAccessProperty __hit_prop, + cudaAccessProperty __miss_prop = cudaAccessPropertyNormal) +{ + return (__total_bytes <= (size_t{0xFFFFFFFF}) & __total_bytes != 0 & __hit_bytes <= __total_bytes) + ? __sm_80::__block_descriptor_builder( + reinterpret_cast(__ptr), + __hit_bytes, + __total_bytes, + (__hit_prop == cudaAccessPropertyNormal) + ? __sm_80::__on::_L2_EVICT_NORMAL_DEMOTE + : static_cast<__sm_80::__on::__l2_cop_on_t>(__hit_prop), + static_cast<__sm_80::__off::__l2_cop_off_t>(__miss_prop)) + .__get_block() + .__get_descriptor() + : __sm_80::__interleave_normal(); +} } // namespace __detail_ap _LIBCUDACXX_END_NAMESPACE_CUDA diff --git a/libcudacxx/include/cuda/std/detail/__annotated_ptr b/libcudacxx/include/cuda/std/detail/__annotated_ptr index f1d4b166b6e..eb84a309f45 100644 --- a/libcudacxx/include/cuda/std/detail/__annotated_ptr +++ b/libcudacxx/include/cuda/std/detail/__annotated_ptr @@ -3,229 +3,327 @@ * * NVIDIA SOFTWARE LICENSE * - * This license is a legal agreement between you and NVIDIA Corporation ("NVIDIA") and governs your use of the NVIDIA/CUDA C++ Library software and materials provided hereunder (“SOFTWARE”). + * This license is a legal agreement between you and NVIDIA Corporation ("NVIDIA") and governs your use of the + * NVIDIA/CUDA C++ Library software and materials provided hereunder (“SOFTWARE”). * - * This license can be accepted only by an adult of legal age of majority in the country in which the SOFTWARE is used. If you are under the legal age of majority, you must ask your parent or legal guardian to consent to this license. 
By taking delivery of the SOFTWARE, you affirm that you have reached the legal age of majority, you accept the terms of this license, and you take legal and financial responsibility for the actions of your permitted users. + * This license can be accepted only by an adult of legal age of majority in the country in which the SOFTWARE is used. + * If you are under the legal age of majority, you must ask your parent or legal guardian to consent to this license. By + * taking delivery of the SOFTWARE, you affirm that you have reached the legal age of majority, you accept the terms of + * this license, and you take legal and financial responsibility for the actions of your permitted users. * - * You agree to use the SOFTWARE only for purposes that are permitted by (a) this license, and (b) any applicable law, regulation or generally accepted practices or guidelines in the relevant jurisdictions. + * You agree to use the SOFTWARE only for purposes that are permitted by (a) this license, and (b) any applicable law, + * regulation or generally accepted practices or guidelines in the relevant jurisdictions. * - * 1. LICENSE. Subject to the terms of this license, NVIDIA grants you a non-exclusive limited license to: (a) install and use the SOFTWARE, and (b) distribute the SOFTWARE subject to the distribution requirements described in this license. NVIDIA reserves all rights, title and interest in and to the SOFTWARE not expressly granted to you under this license. + * 1. LICENSE. Subject to the terms of this license, NVIDIA grants you a non-exclusive limited license to: (a) install + * and use the SOFTWARE, and (b) distribute the SOFTWARE subject to the distribution requirements described in this + * license. NVIDIA reserves all rights, title and interest in and to the SOFTWARE not expressly granted to you under + * this license. * * 2. DISTRIBUTION REQUIREMENTS. These are the distribution requirements for you to exercise the distribution grant: - * a. The terms under which you distribute the SOFTWARE must be consistent with the terms of this license, including (without limitation) terms relating to the license grant and license restrictions and protection of NVIDIA’s intellectual property rights. - * b. You agree to notify NVIDIA in writing of any known or suspected distribution or use of the SOFTWARE not in compliance with the requirements of this license, and to enforce the terms of your agreements with respect to distributed SOFTWARE. + * a. The terms under which you distribute the SOFTWARE must be consistent with the terms of this license, + * including (without limitation) terms relating to the license grant and license restrictions and protection of + * NVIDIA’s intellectual property rights. b. You agree to notify NVIDIA in writing of any known or suspected + * distribution or use of the SOFTWARE not in compliance with the requirements of this license, and to enforce the terms + * of your agreements with respect to distributed SOFTWARE. * * 3. LIMITATIONS. Your license to use the SOFTWARE is restricted as follows: * a. The SOFTWARE is licensed for you to develop applications only for use in systems with NVIDIA GPUs. - * b. You may not reverse engineer, decompile or disassemble, or remove copyright or other proprietary notices from any portion of the SOFTWARE or copies of the SOFTWARE. - * c. You may not modify or create derivative works of any portion of the SOFTWARE. - * d. 
You may not bypass, disable, or circumvent any technical measure, encryption, security, digital rights management or authentication mechanism in the SOFTWARE. - * e. You may not use the SOFTWARE in any manner that would cause it to become subject to an open source software license. As examples, licenses that require as a condition of use, modification, and/or distribution that the SOFTWARE be (i) disclosed or distributed in source code form; (ii) licensed for the purpose of making derivative works; or (iii) redistributable at no charge. - * f. Unless you have an agreement with NVIDIA for this purpose, you may not use the SOFTWARE with any system or application where the use or failure of the system or application can reasonably be expected to threaten or result in personal injury, death, or catastrophic loss. Examples include use in avionics, navigation, military, medical, life support or other life critical applications. NVIDIA does not design, test or manufacture the SOFTWARE for these critical uses and NVIDIA shall not be liable to you or any third party, in whole or in part, for any claims or damages arising from such uses. - * g. You agree to defend, indemnify and hold harmless NVIDIA and its affiliates, and their respective employees, contractors, agents, officers and directors, from and against any and all claims, damages, obligations, losses, liabilities, costs or debt, fines, restitutions and expenses (including but not limited to attorney’s fees and costs incident to establishing the right of indemnification) arising out of or related to use of the SOFTWARE outside of the scope of this Agreement, or not in compliance with its terms. + * b. You may not reverse engineer, decompile or disassemble, or remove copyright or other proprietary notices from + * any portion of the SOFTWARE or copies of the SOFTWARE. c. You may not modify or create derivative works of any + * portion of the SOFTWARE. d. You may not bypass, disable, or circumvent any technical measure, encryption, + * security, digital rights management or authentication mechanism in the SOFTWARE. e. You may not use the SOFTWARE + * in any manner that would cause it to become subject to an open source software license. As examples, licenses that + * require as a condition of use, modification, and/or distribution that the SOFTWARE be (i) disclosed or distributed in + * source code form; (ii) licensed for the purpose of making derivative works; or (iii) redistributable at no charge. f. + * Unless you have an agreement with NVIDIA for this purpose, you may not use the SOFTWARE with any system or + * application where the use or failure of the system or application can reasonably be expected to threaten or result in + * personal injury, death, or catastrophic loss. Examples include use in avionics, navigation, military, medical, life + * support or other life critical applications. NVIDIA does not design, test or manufacture the SOFTWARE for these + * critical uses and NVIDIA shall not be liable to you or any third party, in whole or in part, for any claims or + * damages arising from such uses. g. 
You agree to defend, indemnify and hold harmless NVIDIA and its affiliates, + * and their respective employees, contractors, agents, officers and directors, from and against any and all claims, + * damages, obligations, losses, liabilities, costs or debt, fines, restitutions and expenses (including but not limited + * to attorney’s fees and costs incident to establishing the right of indemnification) arising out of or related to use + * of the SOFTWARE outside of the scope of this Agreement, or not in compliance with its terms. * - * 4. PRE-RELEASE. SOFTWARE versions identified as alpha, beta, preview, early access or otherwise as pre-release may not be fully functional, may contain errors or design flaws, and may have reduced or different security, privacy, availability, and reliability standards relative to commercial versions of NVIDIA software and materials. You may use a pre-release SOFTWARE version at your own risk, understanding that these versions are not intended for use in production or business-critical systems. + * 4. PRE-RELEASE. SOFTWARE versions identified as alpha, beta, preview, early access or otherwise as pre-release may + * not be fully functional, may contain errors or design flaws, and may have reduced or different security, privacy, + * availability, and reliability standards relative to commercial versions of NVIDIA software and materials. You may use + * a pre-release SOFTWARE version at your own risk, understanding that these versions are not intended for use in + * production or business-critical systems. * - * 5. OWNERSHIP. The SOFTWARE and the related intellectual property rights therein are and will remain the sole and exclusive property of NVIDIA or its licensors. The SOFTWARE is copyrighted and protected by the laws of the United States and other countries, and international treaty provisions. NVIDIA may make changes to the SOFTWARE, at any time without notice, but is not obligated to support or update the SOFTWARE. + * 5. OWNERSHIP. The SOFTWARE and the related intellectual property rights therein are and will remain the sole and + * exclusive property of NVIDIA or its licensors. The SOFTWARE is copyrighted and protected by the laws of the United + * States and other countries, and international treaty provisions. NVIDIA may make changes to the SOFTWARE, at any time + * without notice, but is not obligated to support or update the SOFTWARE. * - * 6. COMPONENTS UNDER OTHER LICENSES. The SOFTWARE may include NVIDIA or third-party components with separate legal notices or terms as may be described in proprietary notices accompanying the SOFTWARE. If and to the extent there is a conflict between the terms in this license and the license terms associated with a component, the license terms associated with the components control only to the extent necessary to resolve the conflict. + * 6. COMPONENTS UNDER OTHER LICENSES. The SOFTWARE may include NVIDIA or third-party components with separate legal + * notices or terms as may be described in proprietary notices accompanying the SOFTWARE. If and to the extent there is + * a conflict between the terms in this license and the license terms associated with a component, the license terms + * associated with the components control only to the extent necessary to resolve the conflict. * - * 7. FEEDBACK. You may, but don’t have to, provide to NVIDIA any Feedback. “Feedback” means any suggestions, bug fixes, enhancements, modifications, feature requests or other feedback regarding the SOFTWARE. 
For any Feedback that you voluntarily provide, you hereby grant NVIDIA and its affiliates a perpetual, non-exclusive, worldwide, irrevocable license to use, reproduce, modify, license, sublicense (through multiple tiers of sublicensees), and distribute (through multiple tiers of distributors) the Feedback without the payment of any royalties or fees to you. NVIDIA will use Feedback at its choice. + * 7. FEEDBACK. You may, but don’t have to, provide to NVIDIA any Feedback. “Feedback” means any suggestions, bug fixes, + * enhancements, modifications, feature requests or other feedback regarding the SOFTWARE. For any Feedback that you + * voluntarily provide, you hereby grant NVIDIA and its affiliates a perpetual, non-exclusive, worldwide, irrevocable + * license to use, reproduce, modify, license, sublicense (through multiple tiers of sublicensees), and distribute + * (through multiple tiers of distributors) the Feedback without the payment of any royalties or fees to you. NVIDIA + * will use Feedback at its choice. * - * 8. NO WARRANTIES. THE SOFTWARE IS PROVIDED "AS IS" WITHOUT ANY EXPRESS OR IMPLIED WARRANTY OF ANY KIND INCLUDING, BUT NOT LIMITED TO, WARRANTIES OF MERCHANTABILITY, NONINFRINGEMENT, OR FITNESS FOR A PARTICULAR PURPOSE. NVIDIA DOES NOT WARRANT THAT THE SOFTWARE WILL MEET YOUR REQUIREMENTS OR THAT THE OPERATION THEREOF WILL BE UNINTERRUPTED OR ERROR-FREE, OR THAT ALL ERRORS WILL BE CORRECTED. + * 8. NO WARRANTIES. THE SOFTWARE IS PROVIDED "AS IS" WITHOUT ANY EXPRESS OR IMPLIED WARRANTY OF ANY KIND INCLUDING, BUT + * NOT LIMITED TO, WARRANTIES OF MERCHANTABILITY, NONINFRINGEMENT, OR FITNESS FOR A PARTICULAR PURPOSE. NVIDIA DOES NOT + * WARRANT THAT THE SOFTWARE WILL MEET YOUR REQUIREMENTS OR THAT THE OPERATION THEREOF WILL BE UNINTERRUPTED OR + * ERROR-FREE, OR THAT ALL ERRORS WILL BE CORRECTED. * - * 9. LIMITATIONS OF LIABILITY. TO THE MAXIMUM EXTENT PERMITTED BY LAW, NVIDIA AND ITS AFFILIATES SHALL NOT BE LIABLE FOR ANY SPECIAL, INCIDENTAL, PUNITIVE OR CONSEQUENTIAL DAMAGES, OR ANY LOST PROFITS, PROJECT DELAYS, LOSS OF USE, LOSS OF DATA OR LOSS OF GOODWILL, OR THE COSTS OF PROCURING SUBSTITUTE PRODUCTS, ARISING OUT OF OR IN CONNECTION WITH THIS LICENSE OR THE USE OR PERFORMANCE OF THE SOFTWARE, WHETHER SUCH LIABILITY ARISES FROM ANY CLAIM BASED UPON BREACH OF CONTRACT, BREACH OF WARRANTY, TORT (INCLUDING NEGLIGENCE), PRODUCT LIABILITY OR ANY OTHER CAUSE OF ACTION OR THEORY OF LIABILITY, EVEN IF NVIDIA HAS PREVIOUSLY BEEN ADVISED OF, OR COULD REASONABLY HAVE FORESEEN, THE POSSIBILITY OF SUCH DAMAGES. IN NO EVENT WILL NVIDIA’S AND ITS AFFILIATES TOTAL CUMULATIVE LIABILITY UNDER OR ARISING OUT OF THIS LICENSE EXCEED US$10.00. THE NATURE OF THE LIABILITY OR THE NUMBER OF CLAIMS OR SUITS SHALL NOT ENLARGE OR EXTEND THIS LIMIT. + * 9. LIMITATIONS OF LIABILITY. TO THE MAXIMUM EXTENT PERMITTED BY LAW, NVIDIA AND ITS AFFILIATES SHALL NOT BE LIABLE + * FOR ANY SPECIAL, INCIDENTAL, PUNITIVE OR CONSEQUENTIAL DAMAGES, OR ANY LOST PROFITS, PROJECT DELAYS, LOSS OF USE, + * LOSS OF DATA OR LOSS OF GOODWILL, OR THE COSTS OF PROCURING SUBSTITUTE PRODUCTS, ARISING OUT OF OR IN CONNECTION WITH + * THIS LICENSE OR THE USE OR PERFORMANCE OF THE SOFTWARE, WHETHER SUCH LIABILITY ARISES FROM ANY CLAIM BASED UPON + * BREACH OF CONTRACT, BREACH OF WARRANTY, TORT (INCLUDING NEGLIGENCE), PRODUCT LIABILITY OR ANY OTHER CAUSE OF ACTION + * OR THEORY OF LIABILITY, EVEN IF NVIDIA HAS PREVIOUSLY BEEN ADVISED OF, OR COULD REASONABLY HAVE FORESEEN, THE + * POSSIBILITY OF SUCH DAMAGES. 
IN NO EVENT WILL NVIDIA’S AND ITS AFFILIATES TOTAL CUMULATIVE LIABILITY UNDER OR ARISING + * OUT OF THIS LICENSE EXCEED US$10.00. THE NATURE OF THE LIABILITY OR THE NUMBER OF CLAIMS OR SUITS SHALL NOT ENLARGE + * OR EXTEND THIS LIMIT. * - * 10. TERMINATION. Your rights under this license will terminate automatically without notice from NVIDIA if you fail to comply with any term and condition of this license or if you commence or participate in any legal proceeding against NVIDIA with respect to the SOFTWARE. NVIDIA may terminate this license with advance written notice to you if NVIDIA decides to no longer provide the SOFTWARE in a country or, in NVIDIA’s sole discretion, the continued use of it is no longer commercially viable. Upon any termination of this license, you agree to promptly discontinue use of the SOFTWARE and destroy all copies in your possession or control. Your prior distributions in accordance with this license are not affected by the termination of this license. All provisions of this license will survive termination, except for the license granted to you. + * 10. TERMINATION. Your rights under this license will terminate automatically without notice from NVIDIA if you fail + * to comply with any term and condition of this license or if you commence or participate in any legal proceeding + * against NVIDIA with respect to the SOFTWARE. NVIDIA may terminate this license with advance written notice to you if + * NVIDIA decides to no longer provide the SOFTWARE in a country or, in NVIDIA’s sole discretion, the continued use of + * it is no longer commercially viable. Upon any termination of this license, you agree to promptly discontinue use of + * the SOFTWARE and destroy all copies in your possession or control. Your prior distributions in accordance with this + * license are not affected by the termination of this license. All provisions of this license will survive termination, + * except for the license granted to you. * - * 11. APPLICABLE LAW. This license will be governed in all respects by the laws of the United States and of the State of Delaware as those laws are applied to contracts entered into and performed entirely within Delaware by Delaware residents, without regard to the conflicts of laws principles. The United Nations Convention on Contracts for the International Sale of Goods is specifically disclaimed. You agree to all terms of this Agreement in the English language. The state or federal courts residing in Santa Clara County, California shall have exclusive jurisdiction over any dispute or claim arising out of this license. Notwithstanding this, you agree that NVIDIA shall still be allowed to apply for injunctive remedies or an equivalent type of urgent legal relief in any jurisdiction. + * 11. APPLICABLE LAW. This license will be governed in all respects by the laws of the United States and of the State + * of Delaware as those laws are applied to contracts entered into and performed entirely within Delaware by Delaware + * residents, without regard to the conflicts of laws principles. The United Nations Convention on Contracts for the + * International Sale of Goods is specifically disclaimed. You agree to all terms of this Agreement in the English + * language. The state or federal courts residing in Santa Clara County, California shall have exclusive jurisdiction + * over any dispute or claim arising out of this license. 
Notwithstanding this, you agree that NVIDIA shall still be + * allowed to apply for injunctive remedies or an equivalent type of urgent legal relief in any jurisdiction. * - * 12. NO ASSIGNMENT. This license and your rights and obligations thereunder may not be assigned by you by any means or operation of law without NVIDIA’s permission. Any attempted assignment not approved by NVIDIA in writing shall be void and of no effect. + * 12. NO ASSIGNMENT. This license and your rights and obligations thereunder may not be assigned by you by any means or + * operation of law without NVIDIA’s permission. Any attempted assignment not approved by NVIDIA in writing shall be + * void and of no effect. * - * 13. EXPORT. The SOFTWARE is subject to United States export laws and regulations. You agree that you will not ship, transfer or export the SOFTWARE into any country, or use the SOFTWARE in any manner, prohibited by the United States Bureau of Industry and Security or economic sanctions regulations administered by the U.S. Department of Treasury’s Office of Foreign Assets Control (OFAC), or any applicable export laws, restrictions or regulations. These laws include restrictions on destinations, end users and end use. By accepting this license, you confirm that you are not a resident or citizen of any country currently embargoed by the U.S. and that you are not otherwise prohibited from receiving the SOFTWARE. + * 13. EXPORT. The SOFTWARE is subject to United States export laws and regulations. You agree that you will not ship, + * transfer or export the SOFTWARE into any country, or use the SOFTWARE in any manner, prohibited by the United States + * Bureau of Industry and Security or economic sanctions regulations administered by the U.S. Department of Treasury’s + * Office of Foreign Assets Control (OFAC), or any applicable export laws, restrictions or regulations. These laws + * include restrictions on destinations, end users and end use. By accepting this license, you confirm that you are not + * a resident or citizen of any country currently embargoed by the U.S. and that you are not otherwise prohibited from + * receiving the SOFTWARE. * - * 14. GOVERNMENT USE. The SOFTWARE has been developed entirely at private expense and is “commercial items” consisting of “commercial computer software” and “commercial computer software documentation” provided with RESTRICTED RIGHTS. Use, duplication or disclosure by the U.S. Government or a U.S. Government subcontractor is subject to the restrictions in this license pursuant to DFARS 227.7202-3(a) or as set forth in subparagraphs (b)(1) and (2) of the Commercial Computer Software - Restricted Rights clause at FAR 52.227-19, as applicable. Contractor/manufacturer is NVIDIA, 2788 San Tomas Expressway, Santa Clara, CA 95051. + * 14. GOVERNMENT USE. The SOFTWARE has been developed entirely at private expense and is “commercial items” consisting + * of “commercial computer software” and “commercial computer software documentation” provided with RESTRICTED RIGHTS. + * Use, duplication or disclosure by the U.S. Government or a U.S. Government subcontractor is subject to the + * restrictions in this license pursuant to DFARS 227.7202-3(a) or as set forth in subparagraphs (b)(1) and (2) of the + * Commercial Computer Software - Restricted Rights clause at FAR 52.227-19, as applicable. Contractor/manufacturer is + * NVIDIA, 2788 San Tomas Expressway, Santa Clara, CA 95051. * - * 15. ENTIRE AGREEMENT. 
This license is the final, complete and exclusive agreement between the parties relating to the subject matter of this license and supersedes all prior or contemporaneous understandings and agreements relating to this subject matter, whether oral or written. If any court of competent jurisdiction determines that any provision of this license is illegal, invalid or unenforceable, the remaining provisions will remain in full force and effect. This license may only be modified in a writing signed by an authorized representative of each party. + * 15. ENTIRE AGREEMENT. This license is the final, complete and exclusive agreement between the parties relating to the + * subject matter of this license and supersedes all prior or contemporaneous understandings and agreements relating to + * this subject matter, whether oral or written. If any court of competent jurisdiction determines that any provision of + * this license is illegal, invalid or unenforceable, the remaining provisions will remain in full force and effect. + * This license may only be modified in a writing signed by an authorized representative of each party. * * (v. August 20, 2021) */ _LIBCUDACXX_BEGIN_NAMESPACE_CUDA -namespace __detail_ap { +namespace __detail_ap +{ - template - _CCCL_DEVICE - void* __associate_address_space(void* __ptr, _Property __prop) { - if (std::is_same<_Property, access_property::shared>::value == true) { - bool __b = __isShared(__ptr); - _LIBCUDACXX_ASSERT(__b, ""); +template +_CCCL_DEVICE void* __associate_address_space(void* __ptr, _Property __prop) +{ + if (std::is_same<_Property, access_property::shared>::value == true) + { + bool __b = __isShared(__ptr); + _LIBCUDACXX_ASSERT(__b, ""); #if !defined(_CCCL_CUDACC_BELOW_11_2) - __builtin_assume(__b); + __builtin_assume(__b); #else // ^^^ !_CCCL_CUDACC_BELOW_11_2 ^^^ / vvv _CCCL_CUDACC_BELOW_11_2 vvv - (void)__b; + (void) __b; #endif // _CCCL_CUDACC_BELOW_11_2 - } else if (std::is_same<_Property, access_property::global>::value == true || - std::is_same<_Property, access_property::normal>::value == true || - std::is_same<_Property, access_property::persisting>::value == true || - std::is_same<_Property, access_property::streaming>::value == true || - std::is_same<_Property, access_property>::value) { - bool __b = __isGlobal(__ptr); - _LIBCUDACXX_ASSERT(__b, ""); + } + else if (std::is_same<_Property, access_property::global>::value == true + || std::is_same<_Property, access_property::normal>::value == true + || std::is_same<_Property, access_property::persisting>::value == true + || std::is_same<_Property, access_property::streaming>::value == true + || std::is_same<_Property, access_property>::value) + { + bool __b = __isGlobal(__ptr); + _LIBCUDACXX_ASSERT(__b, ""); #if !defined(_CCCL_CUDACC_BELOW_11_2) - __builtin_assume(__b); + __builtin_assume(__b); #else // ^^^ !_CCCL_CUDACC_BELOW_11_2 ^^^ / vvv _CCCL_CUDACC_BELOW_11_2 vvv - (void)__b; + (void) __b; #endif // _CCCL_CUDACC_BELOW_11_2 - } + } + + return __ptr; +} + +template +_CCCL_DEVICE void* __associate_descriptor(void* __ptr, __Prop __prop) +{ + return __associate_descriptor(__ptr, static_cast(access_property(__prop))); +} + +template <> +inline _CCCL_DEVICE void* __associate_descriptor(void* __ptr, std::uint64_t __prop) +{ + NV_IF_ELSE_TARGET(NV_PROVIDES_SM_80, (return __nv_associate_access_property(__ptr, __prop);), (return __ptr;)) +} - return __ptr; +template <> +inline _CCCL_DEVICE void* __associate_descriptor(void* __ptr, access_property::shared) +{ + return __ptr; +} + +template 
+_CCCL_HOST_DEVICE _Type* __associate(_Type* __ptr, _Property __prop) +{ + NV_IF_ELSE_TARGET(NV_IS_DEVICE, + (return static_cast<_Type*>(__associate_descriptor( + __associate_address_space(const_cast(static_cast(__ptr)), __prop), __prop));), + (return __ptr;)) +} + +template +class __annotated_ptr_base +{ + using __error = typename _Property::__unknown_access_property_type; +}; + +template <> +class __annotated_ptr_base +{ +protected: + static constexpr std::uint64_t __prop = 0; + + constexpr __annotated_ptr_base() noexcept = default; + constexpr __annotated_ptr_base(__annotated_ptr_base const&) = default; + _CCCL_CONSTEXPR_CXX14 __annotated_ptr_base& operator=(const __annotated_ptr_base&) = default; + _CCCL_HOST_DEVICE constexpr __annotated_ptr_base(access_property::shared) noexcept {} + inline _CCCL_DEVICE void* __apply_prop(void* __p) const + { + return __associate(__p, access_property::shared{}); + } + _CCCL_HOST_DEVICE constexpr access_property::shared __get_property() const noexcept + { + return access_property::shared{}; } +}; - template - _CCCL_DEVICE - void* __associate_descriptor(void* __ptr, __Prop __prop) { - return __associate_descriptor(__ptr, static_cast(access_property(__prop))); +template <> +class __annotated_ptr_base +{ +protected: + static constexpr std::uint64_t __prop = __sm_80::__interleave_normal(); + + constexpr __annotated_ptr_base() noexcept = default; + constexpr __annotated_ptr_base(__annotated_ptr_base const&) = default; + _CCCL_CONSTEXPR_CXX14 __annotated_ptr_base& operator=(const __annotated_ptr_base&) = default; + _CCCL_HOST_DEVICE constexpr __annotated_ptr_base(access_property::global) noexcept {} + inline _CCCL_DEVICE void* __apply_prop(void* __p) const + { + return __associate(__p, access_property::global{}); + } + _CCCL_HOST_DEVICE constexpr access_property::global __get_property() const noexcept + { + return access_property::global{}; } +}; + +template <> +class __annotated_ptr_base +{ +protected: + static constexpr std::uint64_t __prop = __sm_80::__interleave_normal_demote(); - template <> - inline _CCCL_DEVICE - void* __associate_descriptor(void* __ptr, std::uint64_t __prop) { - NV_IF_ELSE_TARGET(NV_PROVIDES_SM_80,( - return __nv_associate_access_property(__ptr, __prop); - ),( - return __ptr; - )) + constexpr __annotated_ptr_base() noexcept = default; + constexpr __annotated_ptr_base(__annotated_ptr_base const&) = default; + _CCCL_CONSTEXPR_CXX14 __annotated_ptr_base& operator=(const __annotated_ptr_base&) = default; + _CCCL_HOST_DEVICE constexpr __annotated_ptr_base(access_property::normal) noexcept {} + inline _CCCL_DEVICE void* __apply_prop(void* __p) const + { + return __associate(__p, access_property::normal{}); } + _CCCL_HOST_DEVICE constexpr access_property::normal __get_property() const noexcept + { + return access_property::normal{}; + } +}; + +template <> +class __annotated_ptr_base +{ +protected: + static constexpr std::uint64_t __prop = __sm_80::__interleave_persisting(); - template<> - inline _CCCL_DEVICE - void* __associate_descriptor(void* __ptr, access_property::shared) { - return __ptr; + constexpr __annotated_ptr_base() noexcept = default; + constexpr __annotated_ptr_base(__annotated_ptr_base const&) = default; + _CCCL_CONSTEXPR_CXX14 __annotated_ptr_base& operator=(const __annotated_ptr_base&) = default; + _CCCL_HOST_DEVICE constexpr __annotated_ptr_base(access_property::persisting) noexcept {} + inline _CCCL_DEVICE void* __apply_prop(void* __p) const + { + return __associate(__p, access_property::persisting{}); } + 
_CCCL_HOST_DEVICE constexpr access_property::persisting __get_property() const noexcept + { + return access_property::persisting{}; + } +}; + +template <> +class __annotated_ptr_base +{ +protected: + static constexpr std::uint64_t __prop = __sm_80::__interleave_streaming(); - template - _CCCL_HOST_DEVICE - _Type* __associate(_Type* __ptr, _Property __prop) { - NV_IF_ELSE_TARGET(NV_IS_DEVICE,( - return static_cast<_Type*>(__associate_descriptor( - __associate_address_space(const_cast(static_cast(__ptr)), __prop), - __prop)); - ),( - return __ptr; - )) + constexpr __annotated_ptr_base() noexcept = default; + constexpr __annotated_ptr_base(__annotated_ptr_base const&) = default; + _CCCL_CONSTEXPR_CXX14 __annotated_ptr_base& operator=(const __annotated_ptr_base&) = default; + _CCCL_HOST_DEVICE constexpr __annotated_ptr_base(access_property::streaming) noexcept {} + inline _CCCL_DEVICE void* __apply_prop(void* __p) const + { + return __associate(__p, access_property::streaming{}); + } + _CCCL_HOST_DEVICE constexpr access_property::streaming __get_property() const noexcept + { + return access_property::streaming{}; } +}; +template <> +class __annotated_ptr_base +{ +protected: + std::uint64_t __prop; - template - class __annotated_ptr_base { - using __error = typename _Property::__unknown_access_property_type; - }; - - template<> - class __annotated_ptr_base { - protected: - static constexpr std::uint64_t __prop = 0; - - constexpr __annotated_ptr_base() noexcept = default; - constexpr __annotated_ptr_base(__annotated_ptr_base const&) = default; - _CCCL_CONSTEXPR_CXX14 __annotated_ptr_base& operator=(const __annotated_ptr_base&) = default; - _CCCL_HOST_DEVICE constexpr __annotated_ptr_base(access_property::shared) noexcept {} - inline _CCCL_DEVICE void* __apply_prop(void* __p) const { - return __associate(__p, access_property::shared{}); - } - _CCCL_HOST_DEVICE constexpr access_property::shared __get_property() const noexcept { - return access_property::shared{}; - } - }; - - template<> - class __annotated_ptr_base { - protected: - static constexpr std::uint64_t __prop = __sm_80::__interleave_normal(); - - constexpr __annotated_ptr_base() noexcept = default; - constexpr __annotated_ptr_base(__annotated_ptr_base const&) = default; - _CCCL_CONSTEXPR_CXX14 __annotated_ptr_base& operator=(const __annotated_ptr_base&) = default; - _CCCL_HOST_DEVICE constexpr __annotated_ptr_base(access_property::global) noexcept {} - inline _CCCL_DEVICE void* __apply_prop(void* __p) const { - return __associate(__p, access_property::global{}); - } - _CCCL_HOST_DEVICE constexpr access_property::global __get_property() const noexcept { - return access_property::global{}; - } - }; - - template<> - class __annotated_ptr_base { - protected: - static constexpr std::uint64_t __prop = __sm_80::__interleave_normal_demote(); - - constexpr __annotated_ptr_base() noexcept = default; - constexpr __annotated_ptr_base(__annotated_ptr_base const&) = default; - _CCCL_CONSTEXPR_CXX14 __annotated_ptr_base& operator=(const __annotated_ptr_base&) = default; - _CCCL_HOST_DEVICE constexpr __annotated_ptr_base(access_property::normal) noexcept {} - inline _CCCL_DEVICE void* __apply_prop(void* __p) const { - return __associate(__p, access_property::normal{}); - } - _CCCL_HOST_DEVICE constexpr access_property::normal __get_property() const noexcept { - return access_property::normal{}; - } - }; - - template<> - class __annotated_ptr_base { - protected: - static constexpr std::uint64_t __prop = __sm_80::__interleave_persisting(); - - constexpr 
__annotated_ptr_base() noexcept = default; - constexpr __annotated_ptr_base(__annotated_ptr_base const&) = default; - _CCCL_CONSTEXPR_CXX14 __annotated_ptr_base& operator=(const __annotated_ptr_base&) = default; - _CCCL_HOST_DEVICE constexpr __annotated_ptr_base(access_property::persisting) noexcept {} - inline _CCCL_DEVICE void* __apply_prop(void* __p) const { - return __associate(__p, access_property::persisting{}); - } - _CCCL_HOST_DEVICE constexpr access_property::persisting __get_property() const noexcept { - return access_property::persisting{}; - } - }; - - template<> - class __annotated_ptr_base { - protected: - static constexpr std::uint64_t __prop = __sm_80::__interleave_streaming(); - - constexpr __annotated_ptr_base() noexcept = default; - constexpr __annotated_ptr_base(__annotated_ptr_base const&) = default; - _CCCL_CONSTEXPR_CXX14 __annotated_ptr_base& operator=(const __annotated_ptr_base&) = default; - _CCCL_HOST_DEVICE constexpr __annotated_ptr_base(access_property::streaming) noexcept {} - inline _CCCL_DEVICE void* __apply_prop(void* __p) const { - return __associate(__p, access_property::streaming{}); - } - _CCCL_HOST_DEVICE constexpr access_property::streaming __get_property() const noexcept { - return access_property::streaming{}; - } - }; - - template<> - class __annotated_ptr_base { - protected: - std::uint64_t __prop; - - _CCCL_HOST_DEVICE constexpr __annotated_ptr_base() noexcept : __prop(access_property()) {} - _CCCL_HOST_DEVICE constexpr __annotated_ptr_base(std::uint64_t __property) noexcept : __prop(__property) {} - _CCCL_HOST_DEVICE constexpr __annotated_ptr_base(access_property __property) noexcept - : __annotated_ptr_base(static_cast(__property)) {} - constexpr __annotated_ptr_base(__annotated_ptr_base const&) = default; - _CCCL_CONSTEXPR_CXX14 __annotated_ptr_base& operator=(const __annotated_ptr_base&) = default; - inline _CCCL_DEVICE void* __apply_prop(void* __p) const { - return __associate(__p, __prop); - } - _CCCL_HOST_DEVICE access_property __get_property() const noexcept { - return reinterpret_cast(const_cast(__prop)); - } - }; + _CCCL_HOST_DEVICE constexpr __annotated_ptr_base() noexcept + : __prop(access_property()) + {} + _CCCL_HOST_DEVICE constexpr __annotated_ptr_base(std::uint64_t __property) noexcept + : __prop(__property) + {} + _CCCL_HOST_DEVICE constexpr __annotated_ptr_base(access_property __property) noexcept + : __annotated_ptr_base(static_cast(__property)) + {} + constexpr __annotated_ptr_base(__annotated_ptr_base const&) = default; + _CCCL_CONSTEXPR_CXX14 __annotated_ptr_base& operator=(const __annotated_ptr_base&) = default; + inline _CCCL_DEVICE void* __apply_prop(void* __p) const + { + return __associate(__p, __prop); + } + _CCCL_HOST_DEVICE access_property __get_property() const noexcept + { + return reinterpret_cast(const_cast(__prop)); + } +}; } // namespace __detail_ap _LIBCUDACXX_END_NAMESPACE_CUDA diff --git a/libcudacxx/include/cuda/std/detail/__config b/libcudacxx/include/cuda/std/detail/__config index f4fba1f24d6..aaa22d7cf6d 100644 --- a/libcudacxx/include/cuda/std/detail/__config +++ b/libcudacxx/include/cuda/std/detail/__config @@ -13,7 +13,7 @@ #include -#define _LIBCUDACXX_CUDA_API_VERSION CCCL_VERSION +#define _LIBCUDACXX_CUDA_API_VERSION CCCL_VERSION #define _LIBCUDACXX_CUDA_API_VERSION_MAJOR CCCL_MAJOR_VERSION #define _LIBCUDACXX_CUDA_API_VERSION_MINOR CCCL_MINOR_VERSION #define _LIBCUDACXX_CUDA_API_VERSION_PATCH CCCL_PATCH_VERSION diff --git a/libcudacxx/include/cuda/std/detail/__pragma_push 
b/libcudacxx/include/cuda/std/detail/__pragma_push index 5042010790d..e1a507cad04 100644 --- a/libcudacxx/include/cuda/std/detail/__pragma_push +++ b/libcudacxx/include/cuda/std/detail/__pragma_push @@ -8,5 +8,7 @@ // //===----------------------------------------------------------------------===// +// clang-format off #include #include +// clang-format on diff --git a/libcudacxx/include/cuda/std/detail/libcxx/include/__assert b/libcudacxx/include/cuda/std/detail/libcxx/include/__assert index ad54f46dfd6..3568b3b746f 100644 --- a/libcudacxx/include/cuda/std/detail/libcxx/include/__assert +++ b/libcudacxx/include/cuda/std/detail/libcxx/include/__assert @@ -27,28 +27,28 @@ // assertions through the Debug mode previously. // TODO: In LLVM 16, make it an error to define _LIBCUDACXX_DEBUG #if defined(_LIBCUDACXX_DEBUG) -# ifndef _LIBCUDACXX_ENABLE_ASSERTIONS -# define _LIBCUDACXX_ENABLE_ASSERTIONS 1 -# endif +# ifndef _LIBCUDACXX_ENABLE_ASSERTIONS +# define _LIBCUDACXX_ENABLE_ASSERTIONS 1 +# endif #endif // Automatically enable assertions when the debug mode is enabled. #if defined(_LIBCUDACXX_ENABLE_DEBUG_MODE) -# ifndef _LIBCUDACXX_ENABLE_ASSERTIONS -# define _LIBCUDACXX_ENABLE_ASSERTIONS 1 -# endif +# ifndef _LIBCUDACXX_ENABLE_ASSERTIONS +# define _LIBCUDACXX_ENABLE_ASSERTIONS 1 +# endif #endif #ifndef _LIBCUDACXX_ENABLE_ASSERTIONS -# define _LIBCUDACXX_ENABLE_ASSERTIONS _LIBCUDACXX_ENABLE_ASSERTIONS_DEFAULT +# define _LIBCUDACXX_ENABLE_ASSERTIONS _LIBCUDACXX_ENABLE_ASSERTIONS_DEFAULT #endif #if _LIBCUDACXX_ENABLE_ASSERTIONS != 0 && _LIBCUDACXX_ENABLE_ASSERTIONS != 1 -# error "_LIBCUDACXX_ENABLE_ASSERTIONS must be set to 0 or 1" +# error "_LIBCUDACXX_ENABLE_ASSERTIONS must be set to 0 or 1" #endif #if _LIBCUDACXX_ENABLE_ASSERTIONS -# define _LIBCUDACXX_ASSERT(expression, message) \ +# define _LIBCUDACXX_ASSERT(expression, message) \ (_CCCL_DIAG_PUSH \ _CCCL_DIAG_SUPPRESS_CLANG("-Wassume") \ __builtin_expect(static_cast(expression), 1) ? \ @@ -56,13 +56,11 @@ ::_CUDA_VSTD::__libcpp_verbose_abort("%s:%d: assertion %s failed: %s", __FILE__, __LINE__, #expression, message) _CCCL_DIAG_POP) #elif 0 // !defined(_LIBCUDACXX_ASSERTIONS_DISABLE_ASSUME) && __has_builtin(__builtin_assume) -# define _LIBCUDACXX_ASSERT(expression, message) \ - (_CCCL_DIAG_PUSH \ - _CCCL_DIAG_SUPPRESS_CLANG("-Wassume") \ - __builtin_assume(static_cast(expression)) \ - _CCCL_DIAG_POP) +# define _LIBCUDACXX_ASSERT(expression, message) \ + (_CCCL_DIAG_PUSH _CCCL_DIAG_SUPPRESS_CLANG("-Wassume") __builtin_assume(static_cast(expression)) \ + _CCCL_DIAG_POP) #else -# define _LIBCUDACXX_ASSERT(expression, message) ((void)0) +# define _LIBCUDACXX_ASSERT(expression, message) ((void) 0) #endif #endif // _LIBCUDACXX___ASSERT diff --git a/libcudacxx/include/cuda/std/detail/libcxx/include/__availability b/libcudacxx/include/cuda/std/detail/libcxx/include/__availability index 37ac58934ea..f89d2abf1a0 100644 --- a/libcudacxx/include/cuda/std/detail/libcxx/include/__availability +++ b/libcudacxx/include/cuda/std/detail/libcxx/include/__availability @@ -63,226 +63,230 @@ // // [1]: https://clang.llvm.org/docs/AttributeReference.html#availability - // For backwards compatibility, allow users to define _LIBCUDACXX_DISABLE_AVAILABILITY // for a while. 
#if defined(_LIBCUDACXX_DISABLE_AVAILABILITY) -# if !defined(_LIBCUDACXX_HAS_NO_VENDOR_AVAILABILITY_ANNOTATIONS) -# define _LIBCUDACXX_HAS_NO_VENDOR_AVAILABILITY_ANNOTATIONS -# endif +# if !defined(_LIBCUDACXX_HAS_NO_VENDOR_AVAILABILITY_ANNOTATIONS) +# define _LIBCUDACXX_HAS_NO_VENDOR_AVAILABILITY_ANNOTATIONS +# endif #endif // Availability markup is disabled when building the library, or when the compiler // doesn't support the proper attributes. -#if defined(_LIBCUDACXX_BUILDING_LIBRARY) || \ - defined(_LIBCXXABI_BUILDING_LIBRARY) || \ - !__has_feature(attribute_availability_with_strict) || \ - !__has_feature(attribute_availability_in_templates) || \ - !__has_extension(pragma_clang_attribute_external_declaration) -# if !defined(_LIBCUDACXX_HAS_NO_VENDOR_AVAILABILITY_ANNOTATIONS) -# define _LIBCUDACXX_HAS_NO_VENDOR_AVAILABILITY_ANNOTATIONS -# endif +#if defined(_LIBCUDACXX_BUILDING_LIBRARY) || defined(_LIBCXXABI_BUILDING_LIBRARY) \ + || !__has_feature(attribute_availability_with_strict) || !__has_feature(attribute_availability_in_templates) \ + || !__has_extension(pragma_clang_attribute_external_declaration) +# if !defined(_LIBCUDACXX_HAS_NO_VENDOR_AVAILABILITY_ANNOTATIONS) +# define _LIBCUDACXX_HAS_NO_VENDOR_AVAILABILITY_ANNOTATIONS +# endif #endif #if defined(_LIBCUDACXX_HAS_NO_VENDOR_AVAILABILITY_ANNOTATIONS) - // This controls the availability of std::shared_mutex and std::shared_timed_mutex, - // which were added to the dylib later. -# define _LIBCUDACXX_AVAILABILITY_SHARED_MUTEX +// This controls the availability of std::shared_mutex and std::shared_timed_mutex, +// which were added to the dylib later. +# define _LIBCUDACXX_AVAILABILITY_SHARED_MUTEX // # define _LIBCUDACXX_AVAILABILITY_DISABLE_FTM___cpp_lib_shared_mutex // # define _LIBCUDACXX_AVAILABILITY_DISABLE_FTM___cpp_lib_shared_timed_mutex - // These macros control the availability of std::bad_optional_access and - // other exception types. These were put in the shared library to prevent - // code bloat from every user program defining the vtable for these exception - // types. - // - // Note that when exceptions are disabled, the methods that normally throw - // these exceptions can be used even on older deployment targets, but those - // methods will abort instead of throwing. -# define _LIBCUDACXX_AVAILABILITY_BAD_OPTIONAL_ACCESS -# define _LIBCUDACXX_AVAILABILITY_BAD_VARIANT_ACCESS -# define _LIBCUDACXX_AVAILABILITY_BAD_ANY_CAST - - // This controls the availability of std::uncaught_exceptions(). -# define _LIBCUDACXX_AVAILABILITY_UNCAUGHT_EXCEPTIONS - - // This controls the availability of the sized version of ::operator delete, - // ::operator delete[], and their align_val_t variants, which were all added - // in C++17, and hence not present in early dylibs. -# define _LIBCUDACXX_AVAILABILITY_SIZED_NEW_DELETE - - // This controls the availability of the std::future_error exception. - // - // Note that when exceptions are disabled, the methods that normally throw - // std::future_error can be used even on older deployment targets, but those - // methods will abort instead of throwing. -# define _LIBCUDACXX_AVAILABILITY_FUTURE_ERROR - - // This controls the availability of std::type_info's vtable. - // I can't imagine how using std::type_info can work at all if - // this isn't supported. -# define _LIBCUDACXX_AVAILABILITY_TYPEINFO_VTABLE - - // This controls the availability of std::locale::category members - // (e.g. std::locale::collate), which are defined in the dylib. 
-# define _LIBCUDACXX_AVAILABILITY_LOCALE_CATEGORY - - // This controls the availability of atomic operations on std::shared_ptr - // (e.g. `std::atomic_store(std::shared_ptr)`), which require a shared - // lock table located in the dylib. -# define _LIBCUDACXX_AVAILABILITY_ATOMIC_SHARED_PTR - - // These macros control the availability of all parts of that - // depend on something in the dylib. -# define _LIBCUDACXX_AVAILABILITY_FILESYSTEM -# define _LIBCUDACXX_AVAILABILITY_FILESYSTEM_PUSH -# define _LIBCUDACXX_AVAILABILITY_FILESYSTEM_POP +// These macros control the availability of std::bad_optional_access and +// other exception types. These were put in the shared library to prevent +// code bloat from every user program defining the vtable for these exception +// types. +// +// Note that when exceptions are disabled, the methods that normally throw +// these exceptions can be used even on older deployment targets, but those +// methods will abort instead of throwing. +# define _LIBCUDACXX_AVAILABILITY_BAD_OPTIONAL_ACCESS +# define _LIBCUDACXX_AVAILABILITY_BAD_VARIANT_ACCESS +# define _LIBCUDACXX_AVAILABILITY_BAD_ANY_CAST + +// This controls the availability of std::uncaught_exceptions(). +# define _LIBCUDACXX_AVAILABILITY_UNCAUGHT_EXCEPTIONS + +// This controls the availability of the sized version of ::operator delete, +// ::operator delete[], and their align_val_t variants, which were all added +// in C++17, and hence not present in early dylibs. +# define _LIBCUDACXX_AVAILABILITY_SIZED_NEW_DELETE + +// This controls the availability of the std::future_error exception. +// +// Note that when exceptions are disabled, the methods that normally throw +// std::future_error can be used even on older deployment targets, but those +// methods will abort instead of throwing. +# define _LIBCUDACXX_AVAILABILITY_FUTURE_ERROR + +// This controls the availability of std::type_info's vtable. +// I can't imagine how using std::type_info can work at all if +// this isn't supported. +# define _LIBCUDACXX_AVAILABILITY_TYPEINFO_VTABLE + +// This controls the availability of std::locale::category members +// (e.g. std::locale::collate), which are defined in the dylib. +# define _LIBCUDACXX_AVAILABILITY_LOCALE_CATEGORY + +// This controls the availability of atomic operations on std::shared_ptr +// (e.g. `std::atomic_store(std::shared_ptr)`), which require a shared +// lock table located in the dylib. +# define _LIBCUDACXX_AVAILABILITY_ATOMIC_SHARED_PTR + +// These macros control the availability of all parts of that +// depend on something in the dylib. +# define _LIBCUDACXX_AVAILABILITY_FILESYSTEM +# define _LIBCUDACXX_AVAILABILITY_FILESYSTEM_PUSH +# define _LIBCUDACXX_AVAILABILITY_FILESYSTEM_POP // # define _LIBCUDACXX_AVAILABILITY_DISABLE_FTM___cpp_lib_filesystem - // This controls the availability of floating-point std::to_chars functions. - // These overloads were added later than the integer overloads. -# define _LIBCUDACXX_AVAILABILITY_TO_CHARS_FLOATING_POINT +// This controls the availability of floating-point std::to_chars functions. +// These overloads were added later than the integer overloads. +# define _LIBCUDACXX_AVAILABILITY_TO_CHARS_FLOATING_POINT - // This controls the availability of the C++20 synchronization library, - // which requires shared library support for various operations - // (see libcxx/src/atomic.cpp). This includes , , - // , and notification functions on std::atomic. 
-# define _LIBCUDACXX_AVAILABILITY_SYNC +// This controls the availability of the C++20 synchronization library, +// which requires shared library support for various operations +// (see libcxx/src/atomic.cpp). This includes , , +// , and notification functions on std::atomic. +# define _LIBCUDACXX_AVAILABILITY_SYNC // # define _LIBCUDACXX_AVAILABILITY_DISABLE_FTM___cpp_lib_atomic_wait // # define _LIBCUDACXX_AVAILABILITY_DISABLE_FTM___cpp_lib_barrier // # define _LIBCUDACXX_AVAILABILITY_DISABLE_FTM___cpp_lib_latch // # define _LIBCUDACXX_AVAILABILITY_DISABLE_FTM___cpp_lib_semaphore - // This controls the availability of the C++20 format library. - // The library is in development and not ABI stable yet. P2216 is - // retroactively accepted in C++20. This paper contains ABI breaking - // changes. -# define _LIBCUDACXX_AVAILABILITY_FORMAT +// This controls the availability of the C++20 format library. +// The library is in development and not ABI stable yet. P2216 is +// retroactively accepted in C++20. This paper contains ABI breaking +// changes. +# define _LIBCUDACXX_AVAILABILITY_FORMAT // # define _LIBCUDACXX_AVAILABILITY_DISABLE_FTM___cpp_lib_format - // This controls whether the default verbose termination function is - // provided by the library. - // - // Note that when users provide their own custom function, it doesn't - // matter whether the dylib provides a default function, and the - // availability markup can actually give a false positive diagnostic - // (it will think that no function is provided, when in reality the - // user has provided their own). - // - // Users can pass -D_LIBCUDACXX_AVAILABILITY_CUSTOM_VERBOSE_ABORT_PROVIDED - // to the compiler to tell the library not to define its own verbose abort. - // Note that defining this macro but failing to define a custom function - // will lead to a load-time error on back-deployment targets, so it should - // be avoided. +// This controls whether the default verbose termination function is +// provided by the library. +// +// Note that when users provide their own custom function, it doesn't +// matter whether the dylib provides a default function, and the +// availability markup can actually give a false positive diagnostic +// (it will think that no function is provided, when in reality the +// user has provided their own). +// +// Users can pass -D_LIBCUDACXX_AVAILABILITY_CUSTOM_VERBOSE_ABORT_PROVIDED +// to the compiler to tell the library not to define its own verbose abort. +// Note that defining this macro but failing to define a custom function +// will lead to a load-time error on back-deployment targets, so it should +// be avoided. 
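The comment block above describes the -D_LIBCUDACXX_AVAILABILITY_CUSTOM_VERBOSE_ABORT_PROVIDED opt-out. As a rough sketch only, assuming a printf-style variadic signature inferred from the call made by _LIBCUDACXX_ASSERT earlier in this patch (namespace qualification omitted here; the call site spells it ::_CUDA_VSTD::__libcpp_verbose_abort), a user providing their own handler might write:

    // Hypothetical user translation unit, built with
    //   -D_LIBCUDACXX_AVAILABILITY_CUSTOM_VERBOSE_ABORT_PROVIDED
    // The exact declaration is an assumption, not taken from this patch.
    #include <cstdarg>
    #include <cstdio>
    #include <cstdlib>

    void __libcpp_verbose_abort(const char* __format, ...)
    {
      std::va_list __args;
      va_start(__args, __format);
      std::vfprintf(stderr, __format, __args);
      va_end(__args);
      std::abort();
    }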
// # define _LIBCUDACXX_HAS_NO_VERBOSE_ABORT_IN_LIBRARY #elif defined(__APPLE__) -# define _LIBCUDACXX_AVAILABILITY_SHARED_MUTEX \ - __attribute__((availability(macos,strict,introduced=10.12))) \ - __attribute__((availability(ios,strict,introduced=10.0))) \ - __attribute__((availability(tvos,strict,introduced=10.0))) \ - __attribute__((availability(watchos,strict,introduced=3.0))) -# if (defined(__ENVIRONMENT_MAC_OS_X_VERSION_MIN_REQUIRED__) && __ENVIRONMENT_MAC_OS_X_VERSION_MIN_REQUIRED__ < 101200) || \ - (defined(__ENVIRONMENT_IPHONE_OS_VERSION_MIN_REQUIRED__) && __ENVIRONMENT_IPHONE_OS_VERSION_MIN_REQUIRED__ < 100000) || \ - (defined(__ENVIRONMENT_TV_OS_VERSION_MIN_REQUIRED__) && __ENVIRONMENT_TV_OS_VERSION_MIN_REQUIRED__ < 100000) || \ - (defined(__ENVIRONMENT_WATCH_OS_VERSION_MIN_REQUIRED__) && __ENVIRONMENT_WATCH_OS_VERSION_MIN_REQUIRED__ < 30000) -# define _LIBCUDACXX_AVAILABILITY_DISABLE_FTM___cpp_lib_shared_mutex -# define _LIBCUDACXX_AVAILABILITY_DISABLE_FTM___cpp_lib_shared_timed_mutex -# endif - - // Note: bad_optional_access & friends were not introduced in the matching - // macOS and iOS versions, so the version mismatch between macOS and others - // is intended. -# define _LIBCUDACXX_AVAILABILITY_BAD_OPTIONAL_ACCESS \ - __attribute__((availability(macos,strict,introduced=10.13))) \ - __attribute__((availability(ios,strict,introduced=12.0))) \ - __attribute__((availability(tvos,strict,introduced=12.0))) \ - __attribute__((availability(watchos,strict,introduced=5.0))) -# define _LIBCUDACXX_AVAILABILITY_BAD_VARIANT_ACCESS \ - _LIBCUDACXX_AVAILABILITY_BAD_OPTIONAL_ACCESS -# define _LIBCUDACXX_AVAILABILITY_BAD_ANY_CAST \ - _LIBCUDACXX_AVAILABILITY_BAD_OPTIONAL_ACCESS - -# define _LIBCUDACXX_AVAILABILITY_UNCAUGHT_EXCEPTIONS \ - __attribute__((availability(macos,strict,introduced=10.12))) \ - __attribute__((availability(ios,strict,introduced=10.0))) \ - __attribute__((availability(tvos,strict,introduced=10.0))) \ - __attribute__((availability(watchos,strict,introduced=3.0))) - -# define _LIBCUDACXX_AVAILABILITY_SIZED_NEW_DELETE \ - __attribute__((availability(macos,strict,introduced=10.12))) \ - __attribute__((availability(ios,strict,introduced=10.0))) \ - __attribute__((availability(tvos,strict,introduced=10.0))) \ - __attribute__((availability(watchos,strict,introduced=3.0))) - -# define _LIBCUDACXX_AVAILABILITY_FUTURE_ERROR \ - __attribute__((availability(ios,strict,introduced=6.0))) - -# define _LIBCUDACXX_AVAILABILITY_TYPEINFO_VTABLE \ - __attribute__((availability(macos,strict,introduced=10.9))) \ - __attribute__((availability(ios,strict,introduced=7.0))) - -# define _LIBCUDACXX_AVAILABILITY_LOCALE_CATEGORY \ - __attribute__((availability(macos,strict,introduced=10.9))) \ - __attribute__((availability(ios,strict,introduced=7.0))) - -# define _LIBCUDACXX_AVAILABILITY_ATOMIC_SHARED_PTR \ - __attribute__((availability(macos,strict,introduced=10.9))) \ - __attribute__((availability(ios,strict,introduced=7.0))) - -# define _LIBCUDACXX_AVAILABILITY_FILESYSTEM \ - __attribute__((availability(macos,strict,introduced=10.15))) \ - __attribute__((availability(ios,strict,introduced=13.0))) \ - __attribute__((availability(tvos,strict,introduced=13.0))) \ - __attribute__((availability(watchos,strict,introduced=6.0))) -# define _LIBCUDACXX_AVAILABILITY_FILESYSTEM_PUSH \ - _Pragma("clang attribute push(__attribute__((availability(macos,strict,introduced=10.15))), apply_to=any(function,record))") \ - _Pragma("clang attribute push(__attribute__((availability(ios,strict,introduced=13.0))), 
apply_to=any(function,record))") \ - _Pragma("clang attribute push(__attribute__((availability(tvos,strict,introduced=13.0))), apply_to=any(function,record))") \ - _Pragma("clang attribute push(__attribute__((availability(watchos,strict,introduced=6.0))), apply_to=any(function,record))") -# define _LIBCUDACXX_AVAILABILITY_FILESYSTEM_POP \ - _Pragma("clang attribute pop") \ - _Pragma("clang attribute pop") \ - _Pragma("clang attribute pop") \ - _Pragma("clang attribute pop") -# if (defined(__ENVIRONMENT_MAC_OS_X_VERSION_MIN_REQUIRED__) && __ENVIRONMENT_MAC_OS_X_VERSION_MIN_REQUIRED__ < 101500) || \ - (defined(__ENVIRONMENT_IPHONE_OS_VERSION_MIN_REQUIRED__) && __ENVIRONMENT_IPHONE_OS_VERSION_MIN_REQUIRED__ < 130000) || \ - (defined(__ENVIRONMENT_TV_OS_VERSION_MIN_REQUIRED__) && __ENVIRONMENT_TV_OS_VERSION_MIN_REQUIRED__ < 130000) || \ - (defined(__ENVIRONMENT_WATCH_OS_VERSION_MIN_REQUIRED__) && __ENVIRONMENT_WATCH_OS_VERSION_MIN_REQUIRED__ < 60000) -# define _LIBCUDACXX_AVAILABILITY_DISABLE_FTM___cpp_lib_filesystem -# endif - -# define _LIBCUDACXX_AVAILABILITY_TO_CHARS_FLOATING_POINT \ - __attribute__((unavailable)) - -# define _LIBCUDACXX_AVAILABILITY_SYNC \ - __attribute__((availability(macos,strict,introduced=11.0))) \ - __attribute__((availability(ios,strict,introduced=14.0))) \ - __attribute__((availability(tvos,strict,introduced=14.0))) \ - __attribute__((availability(watchos,strict,introduced=7.0))) -# if (defined(__ENVIRONMENT_MAC_OS_X_VERSION_MIN_REQUIRED__) && __ENVIRONMENT_MAC_OS_X_VERSION_MIN_REQUIRED__ < 110000) || \ - (defined(__ENVIRONMENT_IPHONE_OS_VERSION_MIN_REQUIRED__) && __ENVIRONMENT_IPHONE_OS_VERSION_MIN_REQUIRED__ < 140000) || \ - (defined(__ENVIRONMENT_TV_OS_VERSION_MIN_REQUIRED__) && __ENVIRONMENT_TV_OS_VERSION_MIN_REQUIRED__ < 140000) || \ - (defined(__ENVIRONMENT_WATCH_OS_VERSION_MIN_REQUIRED__) && __ENVIRONMENT_WATCH_OS_VERSION_MIN_REQUIRED__ < 70000) -# define _LIBCUDACXX_AVAILABILITY_DISABLE_FTM___cpp_lib_atomic_wait -# define _LIBCUDACXX_AVAILABILITY_DISABLE_FTM___cpp_lib_barrier -# define _LIBCUDACXX_AVAILABILITY_DISABLE_FTM___cpp_lib_latch -# define _LIBCUDACXX_AVAILABILITY_DISABLE_FTM___cpp_lib_semaphore -# endif - -# define _LIBCUDACXX_AVAILABILITY_FORMAT \ - __attribute__((unavailable)) -# define _LIBCUDACXX_AVAILABILITY_DISABLE_FTM___cpp_lib_format - -# define _LIBCUDACXX_HAS_NO_VERBOSE_ABORT_IN_LIBRARY +# define _LIBCUDACXX_AVAILABILITY_SHARED_MUTEX \ + __attribute__((availability(macos, strict, introduced = 10.12))) \ + __attribute__((availability(ios, strict, introduced = 10.0))) \ + __attribute__((availability(tvos, strict, introduced = 10.0))) \ + __attribute__((availability(watchos, strict, introduced = 3.0))) +# if (defined(__ENVIRONMENT_MAC_OS_X_VERSION_MIN_REQUIRED__) \ + && __ENVIRONMENT_MAC_OS_X_VERSION_MIN_REQUIRED__ < 101200) \ + || (defined(__ENVIRONMENT_IPHONE_OS_VERSION_MIN_REQUIRED__) \ + && __ENVIRONMENT_IPHONE_OS_VERSION_MIN_REQUIRED__ < 100000) \ + || (defined(__ENVIRONMENT_TV_OS_VERSION_MIN_REQUIRED__) && __ENVIRONMENT_TV_OS_VERSION_MIN_REQUIRED__ < 100000) \ + || (defined(__ENVIRONMENT_WATCH_OS_VERSION_MIN_REQUIRED__) \ + && __ENVIRONMENT_WATCH_OS_VERSION_MIN_REQUIRED__ < 30000) +# define _LIBCUDACXX_AVAILABILITY_DISABLE_FTM___cpp_lib_shared_mutex +# define _LIBCUDACXX_AVAILABILITY_DISABLE_FTM___cpp_lib_shared_timed_mutex +# endif + +// Note: bad_optional_access & friends were not introduced in the matching +// macOS and iOS versions, so the version mismatch between macOS and others +// is intended. 
+# define _LIBCUDACXX_AVAILABILITY_BAD_OPTIONAL_ACCESS \ + __attribute__((availability(macos, strict, introduced = 10.13))) \ + __attribute__((availability(ios, strict, introduced = 12.0))) \ + __attribute__((availability(tvos, strict, introduced = 12.0))) \ + __attribute__((availability(watchos, strict, introduced = 5.0))) +# define _LIBCUDACXX_AVAILABILITY_BAD_VARIANT_ACCESS _LIBCUDACXX_AVAILABILITY_BAD_OPTIONAL_ACCESS +# define _LIBCUDACXX_AVAILABILITY_BAD_ANY_CAST _LIBCUDACXX_AVAILABILITY_BAD_OPTIONAL_ACCESS + +# define _LIBCUDACXX_AVAILABILITY_UNCAUGHT_EXCEPTIONS \ + __attribute__((availability(macos, strict, introduced = 10.12))) \ + __attribute__((availability(ios, strict, introduced = 10.0))) \ + __attribute__((availability(tvos, strict, introduced = 10.0))) \ + __attribute__((availability(watchos, strict, introduced = 3.0))) + +# define _LIBCUDACXX_AVAILABILITY_SIZED_NEW_DELETE \ + __attribute__((availability(macos, strict, introduced = 10.12))) \ + __attribute__((availability(ios, strict, introduced = 10.0))) \ + __attribute__((availability(tvos, strict, introduced = 10.0))) \ + __attribute__((availability(watchos, strict, introduced = 3.0))) + +# define _LIBCUDACXX_AVAILABILITY_FUTURE_ERROR __attribute__((availability(ios, strict, introduced = 6.0))) + +# define _LIBCUDACXX_AVAILABILITY_TYPEINFO_VTABLE \ + __attribute__((availability(macos, strict, introduced = 10.9))) \ + __attribute__((availability(ios, strict, introduced = 7.0))) + +# define _LIBCUDACXX_AVAILABILITY_LOCALE_CATEGORY \ + __attribute__((availability(macos, strict, introduced = 10.9))) \ + __attribute__((availability(ios, strict, introduced = 7.0))) + +# define _LIBCUDACXX_AVAILABILITY_ATOMIC_SHARED_PTR \ + __attribute__((availability(macos, strict, introduced = 10.9))) \ + __attribute__((availability(ios, strict, introduced = 7.0))) + +# define _LIBCUDACXX_AVAILABILITY_FILESYSTEM \ + __attribute__((availability(macos, strict, introduced = 10.15))) \ + __attribute__((availability(ios, strict, introduced = 13.0))) \ + __attribute__((availability(tvos, strict, introduced = 13.0))) \ + __attribute__((availability(watchos, strict, introduced = 6.0))) +# define _LIBCUDACXX_AVAILABILITY_FILESYSTEM_PUSH \ + _Pragma("clang attribute push(__attribute__((availability(macos,strict,introduced=10.15))), " \ + "apply_to=any(function,record))") \ + _Pragma("clang attribute push(__attribute__((availability(ios,strict,introduced=13.0))), " \ + "apply_to=any(function,record))") \ + _Pragma("clang attribute push(__attribute__((availability(tvos,strict,introduced=13.0))), " \ + "apply_to=any(function,record))") \ + _Pragma("clang attribute push(__attribute__((availability(watchos,strict,introduced=6.0))), " \ + "apply_to=any(function,record))") +# define _LIBCUDACXX_AVAILABILITY_FILESYSTEM_POP \ + _Pragma("clang attribute pop") _Pragma("clang attribute pop") _Pragma("clang attribute pop") \ + _Pragma("clang attribute pop") +# if (defined(__ENVIRONMENT_MAC_OS_X_VERSION_MIN_REQUIRED__) \ + && __ENVIRONMENT_MAC_OS_X_VERSION_MIN_REQUIRED__ < 101500) \ + || (defined(__ENVIRONMENT_IPHONE_OS_VERSION_MIN_REQUIRED__) \ + && __ENVIRONMENT_IPHONE_OS_VERSION_MIN_REQUIRED__ < 130000) \ + || (defined(__ENVIRONMENT_TV_OS_VERSION_MIN_REQUIRED__) && __ENVIRONMENT_TV_OS_VERSION_MIN_REQUIRED__ < 130000) \ + || (defined(__ENVIRONMENT_WATCH_OS_VERSION_MIN_REQUIRED__) \ + && __ENVIRONMENT_WATCH_OS_VERSION_MIN_REQUIRED__ < 60000) +# define _LIBCUDACXX_AVAILABILITY_DISABLE_FTM___cpp_lib_filesystem +# endif + +# define 
_LIBCUDACXX_AVAILABILITY_TO_CHARS_FLOATING_POINT __attribute__((unavailable)) + +# define _LIBCUDACXX_AVAILABILITY_SYNC \ + __attribute__((availability(macos, strict, introduced = 11.0))) \ + __attribute__((availability(ios, strict, introduced = 14.0))) \ + __attribute__((availability(tvos, strict, introduced = 14.0))) \ + __attribute__((availability(watchos, strict, introduced = 7.0))) +# if (defined(__ENVIRONMENT_MAC_OS_X_VERSION_MIN_REQUIRED__) \ + && __ENVIRONMENT_MAC_OS_X_VERSION_MIN_REQUIRED__ < 110000) \ + || (defined(__ENVIRONMENT_IPHONE_OS_VERSION_MIN_REQUIRED__) \ + && __ENVIRONMENT_IPHONE_OS_VERSION_MIN_REQUIRED__ < 140000) \ + || (defined(__ENVIRONMENT_TV_OS_VERSION_MIN_REQUIRED__) && __ENVIRONMENT_TV_OS_VERSION_MIN_REQUIRED__ < 140000) \ + || (defined(__ENVIRONMENT_WATCH_OS_VERSION_MIN_REQUIRED__) \ + && __ENVIRONMENT_WATCH_OS_VERSION_MIN_REQUIRED__ < 70000) +# define _LIBCUDACXX_AVAILABILITY_DISABLE_FTM___cpp_lib_atomic_wait +# define _LIBCUDACXX_AVAILABILITY_DISABLE_FTM___cpp_lib_barrier +# define _LIBCUDACXX_AVAILABILITY_DISABLE_FTM___cpp_lib_latch +# define _LIBCUDACXX_AVAILABILITY_DISABLE_FTM___cpp_lib_semaphore +# endif + +# define _LIBCUDACXX_AVAILABILITY_FORMAT __attribute__((unavailable)) +# define _LIBCUDACXX_AVAILABILITY_DISABLE_FTM___cpp_lib_format + +# define _LIBCUDACXX_HAS_NO_VERBOSE_ABORT_IN_LIBRARY #else // ...New vendors can add availability markup here... -# error "It looks like you're trying to enable vendor availability markup, but you haven't defined the corresponding macros yet!" +# error \ + "It looks like you're trying to enable vendor availability markup, but you haven't defined the corresponding macros yet!" #endif @@ -290,15 +294,15 @@ // Those are defined in terms of the availability attributes above, and // should not be vendor-specific. 
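To make the convenience macros just below concrete, here is a minimal sketch of how an availability marker is typically attached to a declaration; the helper shown is illustrative and not quoted from this patch. On Apple targets the marker expands to the strict __attribute__((availability(...))) chain defined above, and to nothing when vendor annotations are disabled.

    // Illustrative only: marking a throwing helper with an availability macro.
    _LIBCUDACXX_AVAILABILITY_THROW_BAD_OPTIONAL_ACCESS
    inline _LIBCUDACXX_INLINE_VISIBILITY void __throw_bad_optional_access()
    {
      // throws bad_optional_access, or terminates when
      // _LIBCUDACXX_NO_EXCEPTIONS is defined
    }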
#if defined(_LIBCUDACXX_NO_EXCEPTIONS) -# define _LIBCUDACXX_AVAILABILITY_FUTURE -# define _LIBCUDACXX_AVAILABILITY_THROW_BAD_ANY_CAST -# define _LIBCUDACXX_AVAILABILITY_THROW_BAD_OPTIONAL_ACCESS -# define _LIBCUDACXX_AVAILABILITY_THROW_BAD_VARIANT_ACCESS +# define _LIBCUDACXX_AVAILABILITY_FUTURE +# define _LIBCUDACXX_AVAILABILITY_THROW_BAD_ANY_CAST +# define _LIBCUDACXX_AVAILABILITY_THROW_BAD_OPTIONAL_ACCESS +# define _LIBCUDACXX_AVAILABILITY_THROW_BAD_VARIANT_ACCESS #else -# define _LIBCUDACXX_AVAILABILITY_FUTURE _LIBCUDACXX_AVAILABILITY_FUTURE_ERROR -# define _LIBCUDACXX_AVAILABILITY_THROW_BAD_ANY_CAST _LIBCUDACXX_AVAILABILITY_BAD_ANY_CAST -# define _LIBCUDACXX_AVAILABILITY_THROW_BAD_OPTIONAL_ACCESS _LIBCUDACXX_AVAILABILITY_BAD_OPTIONAL_ACCESS -# define _LIBCUDACXX_AVAILABILITY_THROW_BAD_VARIANT_ACCESS _LIBCUDACXX_AVAILABILITY_BAD_VARIANT_ACCESS +# define _LIBCUDACXX_AVAILABILITY_FUTURE _LIBCUDACXX_AVAILABILITY_FUTURE_ERROR +# define _LIBCUDACXX_AVAILABILITY_THROW_BAD_ANY_CAST _LIBCUDACXX_AVAILABILITY_BAD_ANY_CAST +# define _LIBCUDACXX_AVAILABILITY_THROW_BAD_OPTIONAL_ACCESS _LIBCUDACXX_AVAILABILITY_BAD_OPTIONAL_ACCESS +# define _LIBCUDACXX_AVAILABILITY_THROW_BAD_VARIANT_ACCESS _LIBCUDACXX_AVAILABILITY_BAD_VARIANT_ACCESS #endif #endif // _LIBCUDACXX___AVAILABILITY diff --git a/libcudacxx/include/cuda/std/detail/libcxx/include/__bit_reference b/libcudacxx/include/cuda/std/detail/libcxx/include/__bit_reference index 4ce42eb4c6a..88325c3d5c9 100644 --- a/libcudacxx/include/cuda/std/detail/libcxx/include/__bit_reference +++ b/libcudacxx/include/cuda/std/detail/libcxx/include/__bit_reference @@ -10,9 +10,9 @@ #ifndef _LIBCUDACXX___BIT_REFERENCE #define _LIBCUDACXX___BIT_REFERENCE -##include -#include +##include #include +#include #if defined(_CCCL_IMPLICIT_SYSTEM_HEADER_GCC) # pragma GCC system_header @@ -22,229 +22,259 @@ # pragma system_header #endif // no system header -_LIBCUDACXX_PUSH_MACROS + _LIBCUDACXX_PUSH_MACROS #include <__undef_macros> + _LIBCUDACXX_BEGIN_NAMESPACE_STD -_LIBCUDACXX_BEGIN_NAMESPACE_STD - -template class __bit_iterator; -template class __bit_const_reference; +template +class __bit_iterator; +template +class __bit_const_reference; template struct __has_storage_type { - static const bool value = false; + static const bool value = false; }; template ::value> class __bit_reference { - typedef typename _Cp::__storage_type __storage_type; - typedef typename _Cp::__storage_pointer __storage_pointer; - - __storage_pointer __seg_; - __storage_type __mask_; + typedef typename _Cp::__storage_type __storage_type; + typedef typename _Cp::__storage_pointer __storage_pointer; - friend typename _Cp::__self; + __storage_pointer __seg_; + __storage_type __mask_; - friend class __bit_const_reference<_Cp>; - friend class __bit_iterator<_Cp, false>; -public: - _LIBCUDACXX_INLINE_VISIBILITY - __bit_reference(const __bit_reference&) = default; + friend typename _Cp::__self; - _LIBCUDACXX_INLINE_VISIBILITY operator bool() const noexcept - {return static_cast(*__seg_ & __mask_);} - _LIBCUDACXX_INLINE_VISIBILITY bool operator ~() const noexcept - {return !static_cast(*this);} + friend class __bit_const_reference<_Cp>; + friend class __bit_iterator<_Cp, false>; - _LIBCUDACXX_INLINE_VISIBILITY - __bit_reference& operator=(bool __x) noexcept +public: + _LIBCUDACXX_INLINE_VISIBILITY __bit_reference(const __bit_reference&) = default; + + _LIBCUDACXX_INLINE_VISIBILITY operator bool() const noexcept + { + return static_cast(*__seg_ & __mask_); + } + _LIBCUDACXX_INLINE_VISIBILITY bool operator~() 
const noexcept + { + return !static_cast(*this); + } + + _LIBCUDACXX_INLINE_VISIBILITY __bit_reference& operator=(bool __x) noexcept + { + if (__x) { - if (__x) - *__seg_ |= __mask_; - else - *__seg_ &= ~__mask_; - return *this; + *__seg_ |= __mask_; } + else + { + *__seg_ &= ~__mask_; + } + return *this; + } + + _LIBCUDACXX_INLINE_VISIBILITY __bit_reference& operator=(const __bit_reference& __x) noexcept + { + return operator=(static_cast(__x)); + } + + _LIBCUDACXX_INLINE_VISIBILITY void flip() noexcept + { + *__seg_ ^= __mask_; + } + _LIBCUDACXX_INLINE_VISIBILITY __bit_iterator<_Cp, false> operator&() const noexcept + { + return __bit_iterator<_Cp, false>(__seg_, static_cast(__libcpp_ctz(__mask_))); + } - _LIBCUDACXX_INLINE_VISIBILITY - __bit_reference& operator=(const __bit_reference& __x) noexcept - {return operator=(static_cast(__x));} - - _LIBCUDACXX_INLINE_VISIBILITY void flip() noexcept {*__seg_ ^= __mask_;} - _LIBCUDACXX_INLINE_VISIBILITY __bit_iterator<_Cp, false> operator&() const noexcept - {return __bit_iterator<_Cp, false>(__seg_, static_cast(__libcpp_ctz(__mask_)));} private: - _LIBCUDACXX_INLINE_VISIBILITY - __bit_reference(__storage_pointer __s, __storage_type __m) noexcept - : __seg_(__s), __mask_(__m) {} + _LIBCUDACXX_INLINE_VISIBILITY __bit_reference(__storage_pointer __s, __storage_type __m) noexcept + : __seg_(__s) + , __mask_(__m) + {} }; template class __bit_reference<_Cp, false> -{ -}; +{}; template -inline _LIBCUDACXX_INLINE_VISIBILITY -void -swap(__bit_reference<_Cp> __x, __bit_reference<_Cp> __y) noexcept +inline _LIBCUDACXX_INLINE_VISIBILITY void swap(__bit_reference<_Cp> __x, __bit_reference<_Cp> __y) noexcept { - bool __t = __x; - __x = __y; - __y = __t; + bool __t = __x; + __x = __y; + __y = __t; } template -inline _LIBCUDACXX_INLINE_VISIBILITY -void -swap(__bit_reference<_Cp> __x, __bit_reference<_Dp> __y) noexcept +inline _LIBCUDACXX_INLINE_VISIBILITY void swap(__bit_reference<_Cp> __x, __bit_reference<_Dp> __y) noexcept { - bool __t = __x; - __x = __y; - __y = __t; + bool __t = __x; + __x = __y; + __y = __t; } template -inline _LIBCUDACXX_INLINE_VISIBILITY -void -swap(__bit_reference<_Cp> __x, bool& __y) noexcept +inline _LIBCUDACXX_INLINE_VISIBILITY void swap(__bit_reference<_Cp> __x, bool& __y) noexcept { - bool __t = __x; - __x = __y; - __y = __t; + bool __t = __x; + __x = __y; + __y = __t; } template -inline _LIBCUDACXX_INLINE_VISIBILITY -void -swap(bool& __x, __bit_reference<_Cp> __y) noexcept +inline _LIBCUDACXX_INLINE_VISIBILITY void swap(bool& __x, __bit_reference<_Cp> __y) noexcept { - bool __t = __x; - __x = __y; - __y = __t; + bool __t = __x; + __x = __y; + __y = __t; } template class __bit_const_reference { - typedef typename _Cp::__storage_type __storage_type; - typedef typename _Cp::__const_storage_pointer __storage_pointer; + typedef typename _Cp::__storage_type __storage_type; + typedef typename _Cp::__const_storage_pointer __storage_pointer; + + __storage_pointer __seg_; + __storage_type __mask_; - __storage_pointer __seg_; - __storage_type __mask_; + friend typename _Cp::__self; + friend class __bit_iterator<_Cp, true>; - friend typename _Cp::__self; - friend class __bit_iterator<_Cp, true>; public: - _LIBCUDACXX_INLINE_VISIBILITY - __bit_const_reference(const __bit_const_reference&) = default; + _LIBCUDACXX_INLINE_VISIBILITY __bit_const_reference(const __bit_const_reference&) = default; + + _LIBCUDACXX_INLINE_VISIBILITY __bit_const_reference(const __bit_reference<_Cp>& __x) noexcept + : __seg_(__x.__seg_) + , __mask_(__x.__mask_) + {} - 
_LIBCUDACXX_INLINE_VISIBILITY - __bit_const_reference(const __bit_reference<_Cp>& __x) noexcept - : __seg_(__x.__seg_), __mask_(__x.__mask_) {} + _LIBCUDACXX_INLINE_VISIBILITY constexpr operator bool() const noexcept + { + return static_cast(*__seg_ & __mask_); + } - _LIBCUDACXX_INLINE_VISIBILITY constexpr operator bool() const noexcept - {return static_cast(*__seg_ & __mask_);} + _LIBCUDACXX_INLINE_VISIBILITY __bit_iterator<_Cp, true> operator&() const noexcept + { + return __bit_iterator<_Cp, true>(__seg_, static_cast(__libcpp_ctz(__mask_))); + } - _LIBCUDACXX_INLINE_VISIBILITY __bit_iterator<_Cp, true> operator&() const noexcept - {return __bit_iterator<_Cp, true>(__seg_, static_cast(__libcpp_ctz(__mask_)));} private: - _LIBCUDACXX_INLINE_VISIBILITY - constexpr - __bit_const_reference(__storage_pointer __s, __storage_type __m) noexcept - : __seg_(__s), __mask_(__m) {} + _LIBCUDACXX_INLINE_VISIBILITY constexpr __bit_const_reference(__storage_pointer __s, __storage_type __m) noexcept + : __seg_(__s) + , __mask_(__m) + {} - __bit_const_reference& operator=(const __bit_const_reference&) = delete; + __bit_const_reference& operator=(const __bit_const_reference&) = delete; }; // find template -__bit_iterator<_Cp, _IsConst> -__find_bool_true(__bit_iterator<_Cp, _IsConst> __first, typename _Cp::size_type __n) +__bit_iterator<_Cp, _IsConst> __find_bool_true(__bit_iterator<_Cp, _IsConst> __first, typename _Cp::size_type __n) { - typedef __bit_iterator<_Cp, _IsConst> _It; - typedef typename _It::__storage_type __storage_type; - static const int __bits_per_word = _It::__bits_per_word; - // do first partial word - if (__first.__ctz_ != 0) + typedef __bit_iterator<_Cp, _IsConst> _It; + typedef typename _It::__storage_type __storage_type; + static const int __bits_per_word = _It::__bits_per_word; + // do first partial word + if (__first.__ctz_ != 0) + { + __storage_type __clz_f = static_cast<__storage_type>(__bits_per_word - __first.__ctz_); + __storage_type __dn = _CUDA_VSTD::min(__clz_f, __n); + __storage_type __m = (~__storage_type(0) << __first.__ctz_) & (~__storage_type(0) >> (__clz_f - __dn)); + __storage_type __b = *__first.__seg_ & __m; + if (__b) { - __storage_type __clz_f = static_cast<__storage_type>(__bits_per_word - __first.__ctz_); - __storage_type __dn = _CUDA_VSTD::min(__clz_f, __n); - __storage_type __m = (~__storage_type(0) << __first.__ctz_) & (~__storage_type(0) >> (__clz_f - __dn)); - __storage_type __b = *__first.__seg_ & __m; - if (__b) - return _It(__first.__seg_, static_cast(_CUDA_VSTD::__libcpp_ctz(__b))); - if (__n == __dn) - return __first + __n; - __n -= __dn; - ++__first.__seg_; + return _It(__first.__seg_, static_cast(_CUDA_VSTD::__libcpp_ctz(__b))); } - // do middle whole words - for (; __n >= __bits_per_word; ++__first.__seg_, __n -= __bits_per_word) - if (*__first.__seg_) - return _It(__first.__seg_, static_cast(_CUDA_VSTD::__libcpp_ctz(*__first.__seg_))); - // do last partial word - if (__n > 0) + if (__n == __dn) { - __storage_type __m = ~__storage_type(0) >> (__bits_per_word - __n); - __storage_type __b = *__first.__seg_ & __m; - if (__b) - return _It(__first.__seg_, static_cast(_CUDA_VSTD::__libcpp_ctz(__b))); + return __first + __n; } - return _It(__first.__seg_, static_cast(__n)); + __n -= __dn; + ++__first.__seg_; + } + // do middle whole words + for (; __n >= __bits_per_word; ++__first.__seg_, __n -= __bits_per_word) + { + if (*__first.__seg_) + { + return _It(__first.__seg_, static_cast(_CUDA_VSTD::__libcpp_ctz(*__first.__seg_))); + } + } + // do last partial 
word + if (__n > 0) + { + __storage_type __m = ~__storage_type(0) >> (__bits_per_word - __n); + __storage_type __b = *__first.__seg_ & __m; + if (__b) + { + return _It(__first.__seg_, static_cast(_CUDA_VSTD::__libcpp_ctz(__b))); + } + } + return _It(__first.__seg_, static_cast(__n)); } template -__bit_iterator<_Cp, _IsConst> -__find_bool_false(__bit_iterator<_Cp, _IsConst> __first, typename _Cp::size_type __n) +__bit_iterator<_Cp, _IsConst> __find_bool_false(__bit_iterator<_Cp, _IsConst> __first, typename _Cp::size_type __n) { - typedef __bit_iterator<_Cp, _IsConst> _It; - typedef typename _It::__storage_type __storage_type; - const int __bits_per_word = _It::__bits_per_word; - // do first partial word - if (__first.__ctz_ != 0) + typedef __bit_iterator<_Cp, _IsConst> _It; + typedef typename _It::__storage_type __storage_type; + const int __bits_per_word = _It::__bits_per_word; + // do first partial word + if (__first.__ctz_ != 0) + { + __storage_type __clz_f = static_cast<__storage_type>(__bits_per_word - __first.__ctz_); + __storage_type __dn = _CUDA_VSTD::min(__clz_f, __n); + __storage_type __m = (~__storage_type(0) << __first.__ctz_) & (~__storage_type(0) >> (__clz_f - __dn)); + __storage_type __b = ~*__first.__seg_ & __m; + if (__b) { - __storage_type __clz_f = static_cast<__storage_type>(__bits_per_word - __first.__ctz_); - __storage_type __dn = _CUDA_VSTD::min(__clz_f, __n); - __storage_type __m = (~__storage_type(0) << __first.__ctz_) & (~__storage_type(0) >> (__clz_f - __dn)); - __storage_type __b = ~*__first.__seg_ & __m; - if (__b) - return _It(__first.__seg_, static_cast(_CUDA_VSTD::__libcpp_ctz(__b))); - if (__n == __dn) - return __first + __n; - __n -= __dn; - ++__first.__seg_; + return _It(__first.__seg_, static_cast(_CUDA_VSTD::__libcpp_ctz(__b))); } - // do middle whole words - for (; __n >= __bits_per_word; ++__first.__seg_, __n -= __bits_per_word) + if (__n == __dn) { - __storage_type __b = ~*__first.__seg_; - if (__b) - return _It(__first.__seg_, static_cast(_CUDA_VSTD::__libcpp_ctz(__b))); + return __first + __n; } - // do last partial word - if (__n > 0) + __n -= __dn; + ++__first.__seg_; + } + // do middle whole words + for (; __n >= __bits_per_word; ++__first.__seg_, __n -= __bits_per_word) + { + __storage_type __b = ~*__first.__seg_; + if (__b) { - __storage_type __m = ~__storage_type(0) >> (__bits_per_word - __n); - __storage_type __b = ~*__first.__seg_ & __m; - if (__b) - return _It(__first.__seg_, static_cast(_CUDA_VSTD::__libcpp_ctz(__b))); + return _It(__first.__seg_, static_cast(_CUDA_VSTD::__libcpp_ctz(__b))); } - return _It(__first.__seg_, static_cast(__n)); + } + // do last partial word + if (__n > 0) + { + __storage_type __m = ~__storage_type(0) >> (__bits_per_word - __n); + __storage_type __b = ~*__first.__seg_ & __m; + if (__b) + { + return _It(__first.__seg_, static_cast(_CUDA_VSTD::__libcpp_ctz(__b))); + } + } + return _It(__first.__seg_, static_cast(__n)); } template -inline _LIBCUDACXX_INLINE_VISIBILITY -__bit_iterator<_Cp, _IsConst> +inline _LIBCUDACXX_INLINE_VISIBILITY __bit_iterator<_Cp, _IsConst> find(__bit_iterator<_Cp, _IsConst> __first, __bit_iterator<_Cp, _IsConst> __last, const _Tp& __value_) { - if (static_cast(__value_)) - return __find_bool_true(__first, static_cast(__last - __first)); - return __find_bool_false(__first, static_cast(__last - __first)); + if (static_cast(__value_)) + { + return __find_bool_true(__first, static_cast(__last - __first)); + } + return __find_bool_false(__first, static_cast(__last - __first)); } // count @@ 
-253,627 +283,633 @@ template typename __bit_iterator<_Cp, _IsConst>::difference_type __count_bool_true(__bit_iterator<_Cp, _IsConst> __first, typename _Cp::size_type __n) { - typedef __bit_iterator<_Cp, _IsConst> _It; - typedef typename _It::__storage_type __storage_type; - typedef typename _It::difference_type difference_type; - const int __bits_per_word = _It::__bits_per_word; - difference_type __r = 0; - // do first partial word - if (__first.__ctz_ != 0) - { - __storage_type __clz_f = static_cast<__storage_type>(__bits_per_word - __first.__ctz_); - __storage_type __dn = _CUDA_VSTD::min(__clz_f, __n); - __storage_type __m = (~__storage_type(0) << __first.__ctz_) & (~__storage_type(0) >> (__clz_f - __dn)); - __r = _CUDA_VSTD::__libcpp_popcount(*__first.__seg_ & __m); - __n -= __dn; - ++__first.__seg_; - } - // do middle whole words - for (; __n >= __bits_per_word; ++__first.__seg_, __n -= __bits_per_word) - __r += _CUDA_VSTD::__libcpp_popcount(*__first.__seg_); - // do last partial word - if (__n > 0) - { - __storage_type __m = ~__storage_type(0) >> (__bits_per_word - __n); - __r += _CUDA_VSTD::__libcpp_popcount(*__first.__seg_ & __m); - } - return __r; + typedef __bit_iterator<_Cp, _IsConst> _It; + typedef typename _It::__storage_type __storage_type; + typedef typename _It::difference_type difference_type; + const int __bits_per_word = _It::__bits_per_word; + difference_type __r = 0; + // do first partial word + if (__first.__ctz_ != 0) + { + __storage_type __clz_f = static_cast<__storage_type>(__bits_per_word - __first.__ctz_); + __storage_type __dn = _CUDA_VSTD::min(__clz_f, __n); + __storage_type __m = (~__storage_type(0) << __first.__ctz_) & (~__storage_type(0) >> (__clz_f - __dn)); + __r = _CUDA_VSTD::__libcpp_popcount(*__first.__seg_ & __m); + __n -= __dn; + ++__first.__seg_; + } + // do middle whole words + for (; __n >= __bits_per_word; ++__first.__seg_, __n -= __bits_per_word) + { + __r += _CUDA_VSTD::__libcpp_popcount(*__first.__seg_); + } + // do last partial word + if (__n > 0) + { + __storage_type __m = ~__storage_type(0) >> (__bits_per_word - __n); + __r += _CUDA_VSTD::__libcpp_popcount(*__first.__seg_ & __m); + } + return __r; } template typename __bit_iterator<_Cp, _IsConst>::difference_type __count_bool_false(__bit_iterator<_Cp, _IsConst> __first, typename _Cp::size_type __n) { - typedef __bit_iterator<_Cp, _IsConst> _It; - typedef typename _It::__storage_type __storage_type; - typedef typename _It::difference_type difference_type; - const int __bits_per_word = _It::__bits_per_word; - difference_type __r = 0; - // do first partial word - if (__first.__ctz_ != 0) - { - __storage_type __clz_f = static_cast<__storage_type>(__bits_per_word - __first.__ctz_); - __storage_type __dn = _CUDA_VSTD::min(__clz_f, __n); - __storage_type __m = (~__storage_type(0) << __first.__ctz_) & (~__storage_type(0) >> (__clz_f - __dn)); - __r = _CUDA_VSTD::__libcpp_popcount(~*__first.__seg_ & __m); - __n -= __dn; - ++__first.__seg_; - } - // do middle whole words - for (; __n >= __bits_per_word; ++__first.__seg_, __n -= __bits_per_word) - __r += _CUDA_VSTD::__libcpp_popcount(~*__first.__seg_); - // do last partial word - if (__n > 0) - { - __storage_type __m = ~__storage_type(0) >> (__bits_per_word - __n); - __r += _CUDA_VSTD::__libcpp_popcount(~*__first.__seg_ & __m); - } - return __r; + typedef __bit_iterator<_Cp, _IsConst> _It; + typedef typename _It::__storage_type __storage_type; + typedef typename _It::difference_type difference_type; + const int __bits_per_word = _It::__bits_per_word; + 
difference_type __r = 0; + // do first partial word + if (__first.__ctz_ != 0) + { + __storage_type __clz_f = static_cast<__storage_type>(__bits_per_word - __first.__ctz_); + __storage_type __dn = _CUDA_VSTD::min(__clz_f, __n); + __storage_type __m = (~__storage_type(0) << __first.__ctz_) & (~__storage_type(0) >> (__clz_f - __dn)); + __r = _CUDA_VSTD::__libcpp_popcount(~*__first.__seg_ & __m); + __n -= __dn; + ++__first.__seg_; + } + // do middle whole words + for (; __n >= __bits_per_word; ++__first.__seg_, __n -= __bits_per_word) + { + __r += _CUDA_VSTD::__libcpp_popcount(~*__first.__seg_); + } + // do last partial word + if (__n > 0) + { + __storage_type __m = ~__storage_type(0) >> (__bits_per_word - __n); + __r += _CUDA_VSTD::__libcpp_popcount(~*__first.__seg_ & __m); + } + return __r; } template -inline _LIBCUDACXX_INLINE_VISIBILITY -typename __bit_iterator<_Cp, _IsConst>::difference_type +inline _LIBCUDACXX_INLINE_VISIBILITY typename __bit_iterator<_Cp, _IsConst>::difference_type count(__bit_iterator<_Cp, _IsConst> __first, __bit_iterator<_Cp, _IsConst> __last, const _Tp& __value_) { - if (static_cast(__value_)) - return __count_bool_true(__first, static_cast(__last - __first)); - return __count_bool_false(__first, static_cast(__last - __first)); + if (static_cast(__value_)) + { + return __count_bool_true(__first, static_cast(__last - __first)); + } + return __count_bool_false(__first, static_cast(__last - __first)); } // fill_n template -void -__fill_n_false(__bit_iterator<_Cp, false> __first, typename _Cp::size_type __n) +void __fill_n_false(__bit_iterator<_Cp, false> __first, typename _Cp::size_type __n) { - typedef __bit_iterator<_Cp, false> _It; - typedef typename _It::__storage_type __storage_type; - const int __bits_per_word = _It::__bits_per_word; - // do first partial word - if (__first.__ctz_ != 0) - { - __storage_type __clz_f = static_cast<__storage_type>(__bits_per_word - __first.__ctz_); - __storage_type __dn = _CUDA_VSTD::min(__clz_f, __n); - __storage_type __m = (~__storage_type(0) << __first.__ctz_) & (~__storage_type(0) >> (__clz_f - __dn)); - *__first.__seg_ &= ~__m; - __n -= __dn; - ++__first.__seg_; - } - // do middle whole words - __storage_type __nw = __n / __bits_per_word; - _CUDA_VSTD::memset(_CUDA_VSTD::__to_raw_pointer(__first.__seg_), 0, __nw * sizeof(__storage_type)); - __n -= __nw * __bits_per_word; - // do last partial word - if (__n > 0) - { - __first.__seg_ += __nw; - __storage_type __m = ~__storage_type(0) >> (__bits_per_word - __n); - *__first.__seg_ &= ~__m; - } + typedef __bit_iterator<_Cp, false> _It; + typedef typename _It::__storage_type __storage_type; + const int __bits_per_word = _It::__bits_per_word; + // do first partial word + if (__first.__ctz_ != 0) + { + __storage_type __clz_f = static_cast<__storage_type>(__bits_per_word - __first.__ctz_); + __storage_type __dn = _CUDA_VSTD::min(__clz_f, __n); + __storage_type __m = (~__storage_type(0) << __first.__ctz_) & (~__storage_type(0) >> (__clz_f - __dn)); + *__first.__seg_ &= ~__m; + __n -= __dn; + ++__first.__seg_; + } + // do middle whole words + __storage_type __nw = __n / __bits_per_word; + _CUDA_VSTD::memset(_CUDA_VSTD::__to_raw_pointer(__first.__seg_), 0, __nw * sizeof(__storage_type)); + __n -= __nw * __bits_per_word; + // do last partial word + if (__n > 0) + { + __first.__seg_ += __nw; + __storage_type __m = ~__storage_type(0) >> (__bits_per_word - __n); + *__first.__seg_ &= ~__m; + } } template -void -__fill_n_true(__bit_iterator<_Cp, false> __first, typename _Cp::size_type __n) 
+void __fill_n_true(__bit_iterator<_Cp, false> __first, typename _Cp::size_type __n) { - typedef __bit_iterator<_Cp, false> _It; - typedef typename _It::__storage_type __storage_type; - const int __bits_per_word = _It::__bits_per_word; - // do first partial word - if (__first.__ctz_ != 0) - { - __storage_type __clz_f = static_cast<__storage_type>(__bits_per_word - __first.__ctz_); - __storage_type __dn = _CUDA_VSTD::min(__clz_f, __n); - __storage_type __m = (~__storage_type(0) << __first.__ctz_) & (~__storage_type(0) >> (__clz_f - __dn)); - *__first.__seg_ |= __m; - __n -= __dn; - ++__first.__seg_; - } - // do middle whole words - __storage_type __nw = __n / __bits_per_word; - _CUDA_VSTD::memset(_CUDA_VSTD::__to_raw_pointer(__first.__seg_), -1, __nw * sizeof(__storage_type)); - __n -= __nw * __bits_per_word; - // do last partial word - if (__n > 0) - { - __first.__seg_ += __nw; - __storage_type __m = ~__storage_type(0) >> (__bits_per_word - __n); - *__first.__seg_ |= __m; - } + typedef __bit_iterator<_Cp, false> _It; + typedef typename _It::__storage_type __storage_type; + const int __bits_per_word = _It::__bits_per_word; + // do first partial word + if (__first.__ctz_ != 0) + { + __storage_type __clz_f = static_cast<__storage_type>(__bits_per_word - __first.__ctz_); + __storage_type __dn = _CUDA_VSTD::min(__clz_f, __n); + __storage_type __m = (~__storage_type(0) << __first.__ctz_) & (~__storage_type(0) >> (__clz_f - __dn)); + *__first.__seg_ |= __m; + __n -= __dn; + ++__first.__seg_; + } + // do middle whole words + __storage_type __nw = __n / __bits_per_word; + _CUDA_VSTD::memset(_CUDA_VSTD::__to_raw_pointer(__first.__seg_), -1, __nw * sizeof(__storage_type)); + __n -= __nw * __bits_per_word; + // do last partial word + if (__n > 0) + { + __first.__seg_ += __nw; + __storage_type __m = ~__storage_type(0) >> (__bits_per_word - __n); + *__first.__seg_ |= __m; + } } template -inline _LIBCUDACXX_INLINE_VISIBILITY -void +inline _LIBCUDACXX_INLINE_VISIBILITY void fill_n(__bit_iterator<_Cp, false> __first, typename _Cp::size_type __n, bool __value_) { - if (__n > 0) + if (__n > 0) + { + if (__value_) { - if (__value_) - __fill_n_true(__first, __n); - else - __fill_n_false(__first, __n); + __fill_n_true(__first, __n); } + else + { + __fill_n_false(__first, __n); + } + } } // fill template -inline _LIBCUDACXX_INLINE_VISIBILITY -void +inline _LIBCUDACXX_INLINE_VISIBILITY void fill(__bit_iterator<_Cp, false> __first, __bit_iterator<_Cp, false> __last, bool __value_) { - _CUDA_VSTD::fill_n(__first, static_cast(__last - __first), __value_); + _CUDA_VSTD::fill_n(__first, static_cast(__last - __first), __value_); } // copy template -__bit_iterator<_Cp, false> -__copy_aligned(__bit_iterator<_Cp, _IsConst> __first, __bit_iterator<_Cp, _IsConst> __last, - __bit_iterator<_Cp, false> __result) +__bit_iterator<_Cp, false> __copy_aligned( + __bit_iterator<_Cp, _IsConst> __first, __bit_iterator<_Cp, _IsConst> __last, __bit_iterator<_Cp, false> __result) { - typedef __bit_iterator<_Cp, _IsConst> _In; - typedef typename _In::difference_type difference_type; - typedef typename _In::__storage_type __storage_type; - const int __bits_per_word = _In::__bits_per_word; - difference_type __n = __last - __first; + typedef __bit_iterator<_Cp, _IsConst> _In; + typedef typename _In::difference_type difference_type; + typedef typename _In::__storage_type __storage_type; + const int __bits_per_word = _In::__bits_per_word; + difference_type __n = __last - __first; + if (__n > 0) + { + // do first word + if (__first.__ctz_ != 0) 
+ { + unsigned __clz = __bits_per_word - __first.__ctz_; + difference_type __dn = _CUDA_VSTD::min(static_cast(__clz), __n); + __n -= __dn; + __storage_type __m = (~__storage_type(0) << __first.__ctz_) & (~__storage_type(0) >> (__clz - __dn)); + __storage_type __b = *__first.__seg_ & __m; + *__result.__seg_ &= ~__m; + *__result.__seg_ |= __b; + __result.__seg_ += (__dn + __result.__ctz_) / __bits_per_word; + __result.__ctz_ = static_cast((__dn + __result.__ctz_) % __bits_per_word); + ++__first.__seg_; + // __first.__ctz_ = 0; + } + // __first.__ctz_ == 0; + // do middle words + __storage_type __nw = __n / __bits_per_word; + _CUDA_VSTD::memmove(_CUDA_VSTD::__to_raw_pointer(__result.__seg_), + _CUDA_VSTD::__to_raw_pointer(__first.__seg_), + __nw * sizeof(__storage_type)); + __n -= __nw * __bits_per_word; + __result.__seg_ += __nw; + // do last word if (__n > 0) { - // do first word - if (__first.__ctz_ != 0) - { - unsigned __clz = __bits_per_word - __first.__ctz_; - difference_type __dn = _CUDA_VSTD::min(static_cast(__clz), __n); - __n -= __dn; - __storage_type __m = (~__storage_type(0) << __first.__ctz_) & (~__storage_type(0) >> (__clz - __dn)); - __storage_type __b = *__first.__seg_ & __m; - *__result.__seg_ &= ~__m; - *__result.__seg_ |= __b; - __result.__seg_ += (__dn + __result.__ctz_) / __bits_per_word; - __result.__ctz_ = static_cast((__dn + __result.__ctz_) % __bits_per_word); - ++__first.__seg_; - // __first.__ctz_ = 0; - } - // __first.__ctz_ == 0; - // do middle words - __storage_type __nw = __n / __bits_per_word; - _CUDA_VSTD::memmove(_CUDA_VSTD::__to_raw_pointer(__result.__seg_), - _CUDA_VSTD::__to_raw_pointer(__first.__seg_), - __nw * sizeof(__storage_type)); - __n -= __nw * __bits_per_word; - __result.__seg_ += __nw; - // do last word - if (__n > 0) - { - __first.__seg_ += __nw; - __storage_type __m = ~__storage_type(0) >> (__bits_per_word - __n); - __storage_type __b = *__first.__seg_ & __m; - *__result.__seg_ &= ~__m; - *__result.__seg_ |= __b; - __result.__ctz_ = static_cast(__n); - } + __first.__seg_ += __nw; + __storage_type __m = ~__storage_type(0) >> (__bits_per_word - __n); + __storage_type __b = *__first.__seg_ & __m; + *__result.__seg_ &= ~__m; + *__result.__seg_ |= __b; + __result.__ctz_ = static_cast(__n); } - return __result; + } + return __result; } template -__bit_iterator<_Cp, false> -__copy_unaligned(__bit_iterator<_Cp, _IsConst> __first, __bit_iterator<_Cp, _IsConst> __last, - __bit_iterator<_Cp, false> __result) +__bit_iterator<_Cp, false> __copy_unaligned( + __bit_iterator<_Cp, _IsConst> __first, __bit_iterator<_Cp, _IsConst> __last, __bit_iterator<_Cp, false> __result) { - typedef __bit_iterator<_Cp, _IsConst> _In; - typedef typename _In::difference_type difference_type; - typedef typename _In::__storage_type __storage_type; - static const int __bits_per_word = _In::__bits_per_word; - difference_type __n = __last - __first; + typedef __bit_iterator<_Cp, _IsConst> _In; + typedef typename _In::difference_type difference_type; + typedef typename _In::__storage_type __storage_type; + static const int __bits_per_word = _In::__bits_per_word; + difference_type __n = __last - __first; + if (__n > 0) + { + // do first word + if (__first.__ctz_ != 0) + { + unsigned __clz_f = __bits_per_word - __first.__ctz_; + difference_type __dn = _CUDA_VSTD::min(static_cast(__clz_f), __n); + __n -= __dn; + __storage_type __m = (~__storage_type(0) << __first.__ctz_) & (~__storage_type(0) >> (__clz_f - __dn)); + __storage_type __b = *__first.__seg_ & __m; + unsigned __clz_r = 
__bits_per_word - __result.__ctz_; + __storage_type __ddn = _CUDA_VSTD::min<__storage_type>(__dn, __clz_r); + __m = (~__storage_type(0) << __result.__ctz_) & (~__storage_type(0) >> (__clz_r - __ddn)); + *__result.__seg_ &= ~__m; + if (__result.__ctz_ > __first.__ctz_) + { + *__result.__seg_ |= __b << (__result.__ctz_ - __first.__ctz_); + } + else + { + *__result.__seg_ |= __b >> (__first.__ctz_ - __result.__ctz_); + } + __result.__seg_ += (__ddn + __result.__ctz_) / __bits_per_word; + __result.__ctz_ = static_cast((__ddn + __result.__ctz_) % __bits_per_word); + __dn -= __ddn; + if (__dn > 0) + { + __m = ~__storage_type(0) >> (__bits_per_word - __dn); + *__result.__seg_ &= ~__m; + *__result.__seg_ |= __b >> (__first.__ctz_ + __ddn); + __result.__ctz_ = static_cast(__dn); + } + ++__first.__seg_; + // __first.__ctz_ = 0; + } + // __first.__ctz_ == 0; + // do middle words + unsigned __clz_r = __bits_per_word - __result.__ctz_; + __storage_type __m = ~__storage_type(0) << __result.__ctz_; + for (; __n >= __bits_per_word; __n -= __bits_per_word, ++__first.__seg_) + { + __storage_type __b = *__first.__seg_; + *__result.__seg_ &= ~__m; + *__result.__seg_ |= __b << __result.__ctz_; + ++__result.__seg_; + *__result.__seg_ &= __m; + *__result.__seg_ |= __b >> __clz_r; + } + // do last word if (__n > 0) { - // do first word - if (__first.__ctz_ != 0) - { - unsigned __clz_f = __bits_per_word - __first.__ctz_; - difference_type __dn = _CUDA_VSTD::min(static_cast(__clz_f), __n); - __n -= __dn; - __storage_type __m = (~__storage_type(0) << __first.__ctz_) & (~__storage_type(0) >> (__clz_f - __dn)); - __storage_type __b = *__first.__seg_ & __m; - unsigned __clz_r = __bits_per_word - __result.__ctz_; - __storage_type __ddn = _CUDA_VSTD::min<__storage_type>(__dn, __clz_r); - __m = (~__storage_type(0) << __result.__ctz_) & (~__storage_type(0) >> (__clz_r - __ddn)); - *__result.__seg_ &= ~__m; - if (__result.__ctz_ > __first.__ctz_) - *__result.__seg_ |= __b << (__result.__ctz_ - __first.__ctz_); - else - *__result.__seg_ |= __b >> (__first.__ctz_ - __result.__ctz_); - __result.__seg_ += (__ddn + __result.__ctz_) / __bits_per_word; - __result.__ctz_ = static_cast((__ddn + __result.__ctz_) % __bits_per_word); - __dn -= __ddn; - if (__dn > 0) - { - __m = ~__storage_type(0) >> (__bits_per_word - __dn); - *__result.__seg_ &= ~__m; - *__result.__seg_ |= __b >> (__first.__ctz_ + __ddn); - __result.__ctz_ = static_cast(__dn); - } - ++__first.__seg_; - // __first.__ctz_ = 0; - } - // __first.__ctz_ == 0; - // do middle words - unsigned __clz_r = __bits_per_word - __result.__ctz_; - __storage_type __m = ~__storage_type(0) << __result.__ctz_; - for (; __n >= __bits_per_word; __n -= __bits_per_word, ++__first.__seg_) - { - __storage_type __b = *__first.__seg_; - *__result.__seg_ &= ~__m; - *__result.__seg_ |= __b << __result.__ctz_; - ++__result.__seg_; - *__result.__seg_ &= __m; - *__result.__seg_ |= __b >> __clz_r; - } - // do last word - if (__n > 0) - { - __m = ~__storage_type(0) >> (__bits_per_word - __n); - __storage_type __b = *__first.__seg_ & __m; - __storage_type __dn = _CUDA_VSTD::min(__n, static_cast(__clz_r)); - __m = (~__storage_type(0) << __result.__ctz_) & (~__storage_type(0) >> (__clz_r - __dn)); - *__result.__seg_ &= ~__m; - *__result.__seg_ |= __b << __result.__ctz_; - __result.__seg_ += (__dn + __result.__ctz_) / __bits_per_word; - __result.__ctz_ = static_cast((__dn + __result.__ctz_) % __bits_per_word); - __n -= __dn; - if (__n > 0) - { - __m = ~__storage_type(0) >> (__bits_per_word - __n); - 
*__result.__seg_ &= ~__m; - *__result.__seg_ |= __b >> __dn; - __result.__ctz_ = static_cast(__n); - } - } + __m = ~__storage_type(0) >> (__bits_per_word - __n); + __storage_type __b = *__first.__seg_ & __m; + __storage_type __dn = _CUDA_VSTD::min(__n, static_cast(__clz_r)); + __m = (~__storage_type(0) << __result.__ctz_) & (~__storage_type(0) >> (__clz_r - __dn)); + *__result.__seg_ &= ~__m; + *__result.__seg_ |= __b << __result.__ctz_; + __result.__seg_ += (__dn + __result.__ctz_) / __bits_per_word; + __result.__ctz_ = static_cast((__dn + __result.__ctz_) % __bits_per_word); + __n -= __dn; + if (__n > 0) + { + __m = ~__storage_type(0) >> (__bits_per_word - __n); + *__result.__seg_ &= ~__m; + *__result.__seg_ |= __b >> __dn; + __result.__ctz_ = static_cast(__n); + } } - return __result; + } + return __result; } template -inline _LIBCUDACXX_INLINE_VISIBILITY -__bit_iterator<_Cp, false> +inline _LIBCUDACXX_INLINE_VISIBILITY __bit_iterator<_Cp, false> copy(__bit_iterator<_Cp, _IsConst> __first, __bit_iterator<_Cp, _IsConst> __last, __bit_iterator<_Cp, false> __result) { - if (__first.__ctz_ == __result.__ctz_) - return __copy_aligned(__first, __last, __result); - return __copy_unaligned(__first, __last, __result); + if (__first.__ctz_ == __result.__ctz_) + { + return __copy_aligned(__first, __last, __result); + } + return __copy_unaligned(__first, __last, __result); } // copy_backward template -__bit_iterator<_Cp, false> -__copy_backward_aligned(__bit_iterator<_Cp, _IsConst> __first, __bit_iterator<_Cp, _IsConst> __last, - __bit_iterator<_Cp, false> __result) +__bit_iterator<_Cp, false> __copy_backward_aligned( + __bit_iterator<_Cp, _IsConst> __first, __bit_iterator<_Cp, _IsConst> __last, __bit_iterator<_Cp, false> __result) { - typedef __bit_iterator<_Cp, _IsConst> _In; - typedef typename _In::difference_type difference_type; - typedef typename _In::__storage_type __storage_type; - const int __bits_per_word = _In::__bits_per_word; - difference_type __n = __last - __first; + typedef __bit_iterator<_Cp, _IsConst> _In; + typedef typename _In::difference_type difference_type; + typedef typename _In::__storage_type __storage_type; + const int __bits_per_word = _In::__bits_per_word; + difference_type __n = __last - __first; + if (__n > 0) + { + // do first word + if (__last.__ctz_ != 0) + { + difference_type __dn = _CUDA_VSTD::min(static_cast(__last.__ctz_), __n); + __n -= __dn; + unsigned __clz = __bits_per_word - __last.__ctz_; + __storage_type __m = (~__storage_type(0) << (__last.__ctz_ - __dn)) & (~__storage_type(0) >> __clz); + __storage_type __b = *__last.__seg_ & __m; + *__result.__seg_ &= ~__m; + *__result.__seg_ |= __b; + __result.__ctz_ = static_cast(((-__dn & (__bits_per_word - 1)) + __result.__ctz_) % __bits_per_word); + // __last.__ctz_ = 0 + } + // __last.__ctz_ == 0 || __n == 0 + // __result.__ctz_ == 0 || __n == 0 + // do middle words + __storage_type __nw = __n / __bits_per_word; + __result.__seg_ -= __nw; + __last.__seg_ -= __nw; + _CUDA_VSTD::memmove(_CUDA_VSTD::__to_raw_pointer(__result.__seg_), + _CUDA_VSTD::__to_raw_pointer(__last.__seg_), + __nw * sizeof(__storage_type)); + __n -= __nw * __bits_per_word; + // do last word if (__n > 0) { - // do first word - if (__last.__ctz_ != 0) - { - difference_type __dn = _CUDA_VSTD::min(static_cast(__last.__ctz_), __n); - __n -= __dn; - unsigned __clz = __bits_per_word - __last.__ctz_; - __storage_type __m = (~__storage_type(0) << (__last.__ctz_ - __dn)) & (~__storage_type(0) >> __clz); - __storage_type __b = *__last.__seg_ & __m; - 
*__result.__seg_ &= ~__m; - *__result.__seg_ |= __b; - __result.__ctz_ = static_cast(((-__dn & (__bits_per_word - 1)) + - __result.__ctz_) % __bits_per_word); - // __last.__ctz_ = 0 - } - // __last.__ctz_ == 0 || __n == 0 - // __result.__ctz_ == 0 || __n == 0 - // do middle words - __storage_type __nw = __n / __bits_per_word; - __result.__seg_ -= __nw; - __last.__seg_ -= __nw; - _CUDA_VSTD::memmove(_CUDA_VSTD::__to_raw_pointer(__result.__seg_), - _CUDA_VSTD::__to_raw_pointer(__last.__seg_), - __nw * sizeof(__storage_type)); - __n -= __nw * __bits_per_word; - // do last word - if (__n > 0) - { - __storage_type __m = ~__storage_type(0) << (__bits_per_word - __n); - __storage_type __b = *--__last.__seg_ & __m; - *--__result.__seg_ &= ~__m; - *__result.__seg_ |= __b; - __result.__ctz_ = static_cast(-__n & (__bits_per_word - 1)); - } + __storage_type __m = ~__storage_type(0) << (__bits_per_word - __n); + __storage_type __b = *--__last.__seg_ & __m; + *--__result.__seg_ &= ~__m; + *__result.__seg_ |= __b; + __result.__ctz_ = static_cast(-__n & (__bits_per_word - 1)); } - return __result; + } + return __result; } template -__bit_iterator<_Cp, false> -__copy_backward_unaligned(__bit_iterator<_Cp, _IsConst> __first, __bit_iterator<_Cp, _IsConst> __last, - __bit_iterator<_Cp, false> __result) +__bit_iterator<_Cp, false> __copy_backward_unaligned( + __bit_iterator<_Cp, _IsConst> __first, __bit_iterator<_Cp, _IsConst> __last, __bit_iterator<_Cp, false> __result) { - typedef __bit_iterator<_Cp, _IsConst> _In; - typedef typename _In::difference_type difference_type; - typedef typename _In::__storage_type __storage_type; - const int __bits_per_word = _In::__bits_per_word; - difference_type __n = __last - __first; - if (__n > 0) + typedef __bit_iterator<_Cp, _IsConst> _In; + typedef typename _In::difference_type difference_type; + typedef typename _In::__storage_type __storage_type; + const int __bits_per_word = _In::__bits_per_word; + difference_type __n = __last - __first; + if (__n > 0) + { + // do first word + if (__last.__ctz_ != 0) { - // do first word - if (__last.__ctz_ != 0) - { - difference_type __dn = _CUDA_VSTD::min(static_cast(__last.__ctz_), __n); - __n -= __dn; - unsigned __clz_l = __bits_per_word - __last.__ctz_; - __storage_type __m = (~__storage_type(0) << (__last.__ctz_ - __dn)) & (~__storage_type(0) >> __clz_l); - __storage_type __b = *__last.__seg_ & __m; - unsigned __clz_r = __bits_per_word - __result.__ctz_; - __storage_type __ddn = _CUDA_VSTD::min(__dn, static_cast(__result.__ctz_)); - if (__ddn > 0) - { - __m = (~__storage_type(0) << (__result.__ctz_ - __ddn)) & (~__storage_type(0) >> __clz_r); - *__result.__seg_ &= ~__m; - if (__result.__ctz_ > __last.__ctz_) - *__result.__seg_ |= __b << (__result.__ctz_ - __last.__ctz_); - else - *__result.__seg_ |= __b >> (__last.__ctz_ - __result.__ctz_); - __result.__ctz_ = static_cast(((-__ddn & (__bits_per_word - 1)) + - __result.__ctz_) % __bits_per_word); - __dn -= __ddn; - } - if (__dn > 0) - { - // __result.__ctz_ == 0 - --__result.__seg_; - __result.__ctz_ = static_cast(-__dn & (__bits_per_word - 1)); - __m = ~__storage_type(0) << __result.__ctz_; - *__result.__seg_ &= ~__m; - __last.__ctz_ -= __dn + __ddn; - *__result.__seg_ |= __b << (__result.__ctz_ - __last.__ctz_); - } - // __last.__ctz_ = 0 - } - // __last.__ctz_ == 0 || __n == 0 - // __result.__ctz_ != 0 || __n == 0 - // do middle words - unsigned __clz_r = __bits_per_word - __result.__ctz_; - __storage_type __m = ~__storage_type(0) >> __clz_r; - for (; __n >= __bits_per_word; 
__n -= __bits_per_word) + difference_type __dn = _CUDA_VSTD::min(static_cast(__last.__ctz_), __n); + __n -= __dn; + unsigned __clz_l = __bits_per_word - __last.__ctz_; + __storage_type __m = (~__storage_type(0) << (__last.__ctz_ - __dn)) & (~__storage_type(0) >> __clz_l); + __storage_type __b = *__last.__seg_ & __m; + unsigned __clz_r = __bits_per_word - __result.__ctz_; + __storage_type __ddn = _CUDA_VSTD::min(__dn, static_cast(__result.__ctz_)); + if (__ddn > 0) + { + __m = (~__storage_type(0) << (__result.__ctz_ - __ddn)) & (~__storage_type(0) >> __clz_r); + *__result.__seg_ &= ~__m; + if (__result.__ctz_ > __last.__ctz_) { - __storage_type __b = *--__last.__seg_; - *__result.__seg_ &= ~__m; - *__result.__seg_ |= __b >> __clz_r; - *--__result.__seg_ &= __m; - *__result.__seg_ |= __b << __result.__ctz_; + *__result.__seg_ |= __b << (__result.__ctz_ - __last.__ctz_); } - // do last word - if (__n > 0) + else { - __m = ~__storage_type(0) << (__bits_per_word - __n); - __storage_type __b = *--__last.__seg_ & __m; - __clz_r = __bits_per_word - __result.__ctz_; - __storage_type __dn = _CUDA_VSTD::min(__n, static_cast(__result.__ctz_)); - __m = (~__storage_type(0) << (__result.__ctz_ - __dn)) & (~__storage_type(0) >> __clz_r); - *__result.__seg_ &= ~__m; - *__result.__seg_ |= __b >> (__bits_per_word - __result.__ctz_); - __result.__ctz_ = static_cast(((-__dn & (__bits_per_word - 1)) + - __result.__ctz_) % __bits_per_word); - __n -= __dn; - if (__n > 0) - { - // __result.__ctz_ == 0 - --__result.__seg_; - __result.__ctz_ = static_cast(-__n & (__bits_per_word - 1)); - __m = ~__storage_type(0) << __result.__ctz_; - *__result.__seg_ &= ~__m; - *__result.__seg_ |= __b << (__result.__ctz_ - (__bits_per_word - __n - __dn)); - } + *__result.__seg_ |= __b >> (__last.__ctz_ - __result.__ctz_); } + __result.__ctz_ = static_cast(((-__ddn & (__bits_per_word - 1)) + __result.__ctz_) % __bits_per_word); + __dn -= __ddn; + } + if (__dn > 0) + { + // __result.__ctz_ == 0 + --__result.__seg_; + __result.__ctz_ = static_cast(-__dn & (__bits_per_word - 1)); + __m = ~__storage_type(0) << __result.__ctz_; + *__result.__seg_ &= ~__m; + __last.__ctz_ -= __dn + __ddn; + *__result.__seg_ |= __b << (__result.__ctz_ - __last.__ctz_); + } + // __last.__ctz_ = 0 + } + // __last.__ctz_ == 0 || __n == 0 + // __result.__ctz_ != 0 || __n == 0 + // do middle words + unsigned __clz_r = __bits_per_word - __result.__ctz_; + __storage_type __m = ~__storage_type(0) >> __clz_r; + for (; __n >= __bits_per_word; __n -= __bits_per_word) + { + __storage_type __b = *--__last.__seg_; + *__result.__seg_ &= ~__m; + *__result.__seg_ |= __b >> __clz_r; + *--__result.__seg_ &= __m; + *__result.__seg_ |= __b << __result.__ctz_; } - return __result; + // do last word + if (__n > 0) + { + __m = ~__storage_type(0) << (__bits_per_word - __n); + __storage_type __b = *--__last.__seg_ & __m; + __clz_r = __bits_per_word - __result.__ctz_; + __storage_type __dn = _CUDA_VSTD::min(__n, static_cast(__result.__ctz_)); + __m = (~__storage_type(0) << (__result.__ctz_ - __dn)) & (~__storage_type(0) >> __clz_r); + *__result.__seg_ &= ~__m; + *__result.__seg_ |= __b >> (__bits_per_word - __result.__ctz_); + __result.__ctz_ = static_cast(((-__dn & (__bits_per_word - 1)) + __result.__ctz_) % __bits_per_word); + __n -= __dn; + if (__n > 0) + { + // __result.__ctz_ == 0 + --__result.__seg_; + __result.__ctz_ = static_cast(-__n & (__bits_per_word - 1)); + __m = ~__storage_type(0) << __result.__ctz_; + *__result.__seg_ &= ~__m; + *__result.__seg_ |= __b << 
(__result.__ctz_ - (__bits_per_word - __n - __dn)); + } + } + } + return __result; } template -inline _LIBCUDACXX_INLINE_VISIBILITY -__bit_iterator<_Cp, false> -copy_backward(__bit_iterator<_Cp, _IsConst> __first, __bit_iterator<_Cp, _IsConst> __last, __bit_iterator<_Cp, false> __result) +inline _LIBCUDACXX_INLINE_VISIBILITY __bit_iterator<_Cp, false> copy_backward( + __bit_iterator<_Cp, _IsConst> __first, __bit_iterator<_Cp, _IsConst> __last, __bit_iterator<_Cp, false> __result) { - if (__last.__ctz_ == __result.__ctz_) - return __copy_backward_aligned(__first, __last, __result); - return __copy_backward_unaligned(__first, __last, __result); + if (__last.__ctz_ == __result.__ctz_) + { + return __copy_backward_aligned(__first, __last, __result); + } + return __copy_backward_unaligned(__first, __last, __result); } // move template -inline _LIBCUDACXX_INLINE_VISIBILITY -__bit_iterator<_Cp, false> +inline _LIBCUDACXX_INLINE_VISIBILITY __bit_iterator<_Cp, false> move(__bit_iterator<_Cp, _IsConst> __first, __bit_iterator<_Cp, _IsConst> __last, __bit_iterator<_Cp, false> __result) { - return _CUDA_VSTD::copy(__first, __last, __result); + return _CUDA_VSTD::copy(__first, __last, __result); } // move_backward template -inline _LIBCUDACXX_INLINE_VISIBILITY -__bit_iterator<_Cp, false> -move_backward(__bit_iterator<_Cp, _IsConst> __first, __bit_iterator<_Cp, _IsConst> __last, __bit_iterator<_Cp, false> __result) +inline _LIBCUDACXX_INLINE_VISIBILITY __bit_iterator<_Cp, false> move_backward( + __bit_iterator<_Cp, _IsConst> __first, __bit_iterator<_Cp, _IsConst> __last, __bit_iterator<_Cp, false> __result) { - return _CUDA_VSTD::copy_backward(__first, __last, __result); + return _CUDA_VSTD::copy_backward(__first, __last, __result); } // swap_ranges template -__bit_iterator<__C2, false> -__swap_ranges_aligned(__bit_iterator<__C1, false> __first, __bit_iterator<__C1, false> __last, - __bit_iterator<__C2, false> __result) +__bit_iterator<__C2, false> __swap_ranges_aligned( + __bit_iterator<__C1, false> __first, __bit_iterator<__C1, false> __last, __bit_iterator<__C2, false> __result) { - typedef __bit_iterator<__C1, false> _I1; - typedef typename _I1::difference_type difference_type; - typedef typename _I1::__storage_type __storage_type; - const int __bits_per_word = _I1::__bits_per_word; - difference_type __n = __last - __first; + typedef __bit_iterator<__C1, false> _I1; + typedef typename _I1::difference_type difference_type; + typedef typename _I1::__storage_type __storage_type; + const int __bits_per_word = _I1::__bits_per_word; + difference_type __n = __last - __first; + if (__n > 0) + { + // do first word + if (__first.__ctz_ != 0) + { + unsigned __clz = __bits_per_word - __first.__ctz_; + difference_type __dn = _CUDA_VSTD::min(static_cast(__clz), __n); + __n -= __dn; + __storage_type __m = (~__storage_type(0) << __first.__ctz_) & (~__storage_type(0) >> (__clz - __dn)); + __storage_type __b1 = *__first.__seg_ & __m; + *__first.__seg_ &= ~__m; + __storage_type __b2 = *__result.__seg_ & __m; + *__result.__seg_ &= ~__m; + *__result.__seg_ |= __b1; + *__first.__seg_ |= __b2; + __result.__seg_ += (__dn + __result.__ctz_) / __bits_per_word; + __result.__ctz_ = static_cast((__dn + __result.__ctz_) % __bits_per_word); + ++__first.__seg_; + // __first.__ctz_ = 0; + } + // __first.__ctz_ == 0; + // do middle words + for (; __n >= __bits_per_word; __n -= __bits_per_word, ++__first.__seg_, ++__result.__seg_) + { + swap(*__first.__seg_, *__result.__seg_); + } + // do last word if (__n > 0) { - // do first word - 
if (__first.__ctz_ != 0) - { - unsigned __clz = __bits_per_word - __first.__ctz_; - difference_type __dn = _CUDA_VSTD::min(static_cast(__clz), __n); - __n -= __dn; - __storage_type __m = (~__storage_type(0) << __first.__ctz_) & (~__storage_type(0) >> (__clz - __dn)); - __storage_type __b1 = *__first.__seg_ & __m; - *__first.__seg_ &= ~__m; - __storage_type __b2 = *__result.__seg_ & __m; - *__result.__seg_ &= ~__m; - *__result.__seg_ |= __b1; - *__first.__seg_ |= __b2; - __result.__seg_ += (__dn + __result.__ctz_) / __bits_per_word; - __result.__ctz_ = static_cast((__dn + __result.__ctz_) % __bits_per_word); - ++__first.__seg_; - // __first.__ctz_ = 0; - } - // __first.__ctz_ == 0; - // do middle words - for (; __n >= __bits_per_word; __n -= __bits_per_word, ++__first.__seg_, ++__result.__seg_) - swap(*__first.__seg_, *__result.__seg_); - // do last word - if (__n > 0) - { - __storage_type __m = ~__storage_type(0) >> (__bits_per_word - __n); - __storage_type __b1 = *__first.__seg_ & __m; - *__first.__seg_ &= ~__m; - __storage_type __b2 = *__result.__seg_ & __m; - *__result.__seg_ &= ~__m; - *__result.__seg_ |= __b1; - *__first.__seg_ |= __b2; - __result.__ctz_ = static_cast(__n); - } + __storage_type __m = ~__storage_type(0) >> (__bits_per_word - __n); + __storage_type __b1 = *__first.__seg_ & __m; + *__first.__seg_ &= ~__m; + __storage_type __b2 = *__result.__seg_ & __m; + *__result.__seg_ &= ~__m; + *__result.__seg_ |= __b1; + *__first.__seg_ |= __b2; + __result.__ctz_ = static_cast(__n); } - return __result; + } + return __result; } template -__bit_iterator<__C2, false> -__swap_ranges_unaligned(__bit_iterator<__C1, false> __first, __bit_iterator<__C1, false> __last, - __bit_iterator<__C2, false> __result) +__bit_iterator<__C2, false> __swap_ranges_unaligned( + __bit_iterator<__C1, false> __first, __bit_iterator<__C1, false> __last, __bit_iterator<__C2, false> __result) { - typedef __bit_iterator<__C1, false> _I1; - typedef typename _I1::difference_type difference_type; - typedef typename _I1::__storage_type __storage_type; - const int __bits_per_word = _I1::__bits_per_word; - difference_type __n = __last - __first; + typedef __bit_iterator<__C1, false> _I1; + typedef typename _I1::difference_type difference_type; + typedef typename _I1::__storage_type __storage_type; + const int __bits_per_word = _I1::__bits_per_word; + difference_type __n = __last - __first; + if (__n > 0) + { + // do first word + if (__first.__ctz_ != 0) + { + unsigned __clz_f = __bits_per_word - __first.__ctz_; + difference_type __dn = _CUDA_VSTD::min(static_cast(__clz_f), __n); + __n -= __dn; + __storage_type __m = (~__storage_type(0) << __first.__ctz_) & (~__storage_type(0) >> (__clz_f - __dn)); + __storage_type __b1 = *__first.__seg_ & __m; + *__first.__seg_ &= ~__m; + unsigned __clz_r = __bits_per_word - __result.__ctz_; + __storage_type __ddn = _CUDA_VSTD::min<__storage_type>(__dn, __clz_r); + __m = (~__storage_type(0) << __result.__ctz_) & (~__storage_type(0) >> (__clz_r - __ddn)); + __storage_type __b2 = *__result.__seg_ & __m; + *__result.__seg_ &= ~__m; + if (__result.__ctz_ > __first.__ctz_) + { + unsigned __s = __result.__ctz_ - __first.__ctz_; + *__result.__seg_ |= __b1 << __s; + *__first.__seg_ |= __b2 >> __s; + } + else + { + unsigned __s = __first.__ctz_ - __result.__ctz_; + *__result.__seg_ |= __b1 >> __s; + *__first.__seg_ |= __b2 << __s; + } + __result.__seg_ += (__ddn + __result.__ctz_) / __bits_per_word; + __result.__ctz_ = static_cast((__ddn + __result.__ctz_) % __bits_per_word); + __dn -= __ddn; 
+ if (__dn > 0) + { + __m = ~__storage_type(0) >> (__bits_per_word - __dn); + __b2 = *__result.__seg_ & __m; + *__result.__seg_ &= ~__m; + unsigned __s = __first.__ctz_ + __ddn; + *__result.__seg_ |= __b1 >> __s; + *__first.__seg_ |= __b2 << __s; + __result.__ctz_ = static_cast(__dn); + } + ++__first.__seg_; + // __first.__ctz_ = 0; + } + // __first.__ctz_ == 0; + // do middle words + __storage_type __m = ~__storage_type(0) << __result.__ctz_; + unsigned __clz_r = __bits_per_word - __result.__ctz_; + for (; __n >= __bits_per_word; __n -= __bits_per_word, ++__first.__seg_) + { + __storage_type __b1 = *__first.__seg_; + __storage_type __b2 = *__result.__seg_ & __m; + *__result.__seg_ &= ~__m; + *__result.__seg_ |= __b1 << __result.__ctz_; + *__first.__seg_ = __b2 >> __result.__ctz_; + ++__result.__seg_; + __b2 = *__result.__seg_ & ~__m; + *__result.__seg_ &= __m; + *__result.__seg_ |= __b1 >> __clz_r; + *__first.__seg_ |= __b2 << __clz_r; + } + // do last word if (__n > 0) { - // do first word - if (__first.__ctz_ != 0) - { - unsigned __clz_f = __bits_per_word - __first.__ctz_; - difference_type __dn = _CUDA_VSTD::min(static_cast(__clz_f), __n); - __n -= __dn; - __storage_type __m = (~__storage_type(0) << __first.__ctz_) & (~__storage_type(0) >> (__clz_f - __dn)); - __storage_type __b1 = *__first.__seg_ & __m; - *__first.__seg_ &= ~__m; - unsigned __clz_r = __bits_per_word - __result.__ctz_; - __storage_type __ddn = _CUDA_VSTD::min<__storage_type>(__dn, __clz_r); - __m = (~__storage_type(0) << __result.__ctz_) & (~__storage_type(0) >> (__clz_r - __ddn)); - __storage_type __b2 = *__result.__seg_ & __m; - *__result.__seg_ &= ~__m; - if (__result.__ctz_ > __first.__ctz_) - { - unsigned __s = __result.__ctz_ - __first.__ctz_; - *__result.__seg_ |= __b1 << __s; - *__first.__seg_ |= __b2 >> __s; - } - else - { - unsigned __s = __first.__ctz_ - __result.__ctz_; - *__result.__seg_ |= __b1 >> __s; - *__first.__seg_ |= __b2 << __s; - } - __result.__seg_ += (__ddn + __result.__ctz_) / __bits_per_word; - __result.__ctz_ = static_cast((__ddn + __result.__ctz_) % __bits_per_word); - __dn -= __ddn; - if (__dn > 0) - { - __m = ~__storage_type(0) >> (__bits_per_word - __dn); - __b2 = *__result.__seg_ & __m; - *__result.__seg_ &= ~__m; - unsigned __s = __first.__ctz_ + __ddn; - *__result.__seg_ |= __b1 >> __s; - *__first.__seg_ |= __b2 << __s; - __result.__ctz_ = static_cast(__dn); - } - ++__first.__seg_; - // __first.__ctz_ = 0; - } - // __first.__ctz_ == 0; - // do middle words - __storage_type __m = ~__storage_type(0) << __result.__ctz_; - unsigned __clz_r = __bits_per_word - __result.__ctz_; - for (; __n >= __bits_per_word; __n -= __bits_per_word, ++__first.__seg_) - { - __storage_type __b1 = *__first.__seg_; - __storage_type __b2 = *__result.__seg_ & __m; - *__result.__seg_ &= ~__m; - *__result.__seg_ |= __b1 << __result.__ctz_; - *__first.__seg_ = __b2 >> __result.__ctz_; - ++__result.__seg_; - __b2 = *__result.__seg_ & ~__m; - *__result.__seg_ &= __m; - *__result.__seg_ |= __b1 >> __clz_r; - *__first.__seg_ |= __b2 << __clz_r; - } - // do last word - if (__n > 0) - { - __m = ~__storage_type(0) >> (__bits_per_word - __n); - __storage_type __b1 = *__first.__seg_ & __m; - *__first.__seg_ &= ~__m; - __storage_type __dn = _CUDA_VSTD::min<__storage_type>(__n, __clz_r); - __m = (~__storage_type(0) << __result.__ctz_) & (~__storage_type(0) >> (__clz_r - __dn)); - __storage_type __b2 = *__result.__seg_ & __m; - *__result.__seg_ &= ~__m; - *__result.__seg_ |= __b1 << __result.__ctz_; - *__first.__seg_ |= __b2 >> 
__result.__ctz_; - __result.__seg_ += (__dn + __result.__ctz_) / __bits_per_word; - __result.__ctz_ = static_cast((__dn + __result.__ctz_) % __bits_per_word); - __n -= __dn; - if (__n > 0) - { - __m = ~__storage_type(0) >> (__bits_per_word - __n); - __b2 = *__result.__seg_ & __m; - *__result.__seg_ &= ~__m; - *__result.__seg_ |= __b1 >> __dn; - *__first.__seg_ |= __b2 << __dn; - __result.__ctz_ = static_cast(__n); - } - } + __m = ~__storage_type(0) >> (__bits_per_word - __n); + __storage_type __b1 = *__first.__seg_ & __m; + *__first.__seg_ &= ~__m; + __storage_type __dn = _CUDA_VSTD::min<__storage_type>(__n, __clz_r); + __m = (~__storage_type(0) << __result.__ctz_) & (~__storage_type(0) >> (__clz_r - __dn)); + __storage_type __b2 = *__result.__seg_ & __m; + *__result.__seg_ &= ~__m; + *__result.__seg_ |= __b1 << __result.__ctz_; + *__first.__seg_ |= __b2 >> __result.__ctz_; + __result.__seg_ += (__dn + __result.__ctz_) / __bits_per_word; + __result.__ctz_ = static_cast((__dn + __result.__ctz_) % __bits_per_word); + __n -= __dn; + if (__n > 0) + { + __m = ~__storage_type(0) >> (__bits_per_word - __n); + __b2 = *__result.__seg_ & __m; + *__result.__seg_ &= ~__m; + *__result.__seg_ |= __b1 >> __dn; + *__first.__seg_ |= __b2 << __dn; + __result.__ctz_ = static_cast(__n); + } } - return __result; + } + return __result; } template -inline _LIBCUDACXX_INLINE_VISIBILITY -__bit_iterator<__C2, false> -swap_ranges(__bit_iterator<__C1, false> __first1, __bit_iterator<__C1, false> __last1, - __bit_iterator<__C2, false> __first2) +inline _LIBCUDACXX_INLINE_VISIBILITY __bit_iterator<__C2, false> swap_ranges( + __bit_iterator<__C1, false> __first1, __bit_iterator<__C1, false> __last1, __bit_iterator<__C2, false> __first2) { - if (__first1.__ctz_ == __first2.__ctz_) - return __swap_ranges_aligned(__first1, __last1, __first2); - return __swap_ranges_unaligned(__first1, __last1, __first2); + if (__first1.__ctz_ == __first2.__ctz_) + { + return __swap_ranges_aligned(__first1, __last1, __first2); + } + return __swap_ranges_unaligned(__first1, __last1, __first2); } // rotate @@ -881,413 +917,469 @@ swap_ranges(__bit_iterator<__C1, false> __first1, __bit_iterator<__C1, false> __ template struct __bit_array { - typedef typename _Cp::difference_type difference_type; - typedef typename _Cp::__storage_type __storage_type; - typedef typename _Cp::__storage_pointer __storage_pointer; - typedef typename _Cp::iterator iterator; - static const unsigned __bits_per_word = _Cp::__bits_per_word; - static const unsigned _Np = 4; - - difference_type __size_; - __storage_type __word_[_Np]; - - _LIBCUDACXX_INLINE_VISIBILITY static difference_type capacity() - {return static_cast(_Np * __bits_per_word);} - _LIBCUDACXX_INLINE_VISIBILITY explicit __bit_array(difference_type __s) : __size_(__s) {} - _LIBCUDACXX_INLINE_VISIBILITY iterator begin() - { - return iterator(pointer_traits<__storage_pointer>::pointer_to(__word_[0]), 0); - } - _LIBCUDACXX_INLINE_VISIBILITY iterator end() - { - return iterator(pointer_traits<__storage_pointer>::pointer_to(__word_[0]) + __size_ / __bits_per_word, - static_cast(__size_ % __bits_per_word)); - } + typedef typename _Cp::difference_type difference_type; + typedef typename _Cp::__storage_type __storage_type; + typedef typename _Cp::__storage_pointer __storage_pointer; + typedef typename _Cp::iterator iterator; + static const unsigned __bits_per_word = _Cp::__bits_per_word; + static const unsigned _Np = 4; + + difference_type __size_; + __storage_type __word_[_Np]; + + _LIBCUDACXX_INLINE_VISIBILITY 
static difference_type capacity() + { + return static_cast(_Np * __bits_per_word); + } + _LIBCUDACXX_INLINE_VISIBILITY explicit __bit_array(difference_type __s) + : __size_(__s) + {} + _LIBCUDACXX_INLINE_VISIBILITY iterator begin() + { + return iterator(pointer_traits<__storage_pointer>::pointer_to(__word_[0]), 0); + } + _LIBCUDACXX_INLINE_VISIBILITY iterator end() + { + return iterator(pointer_traits<__storage_pointer>::pointer_to(__word_[0]) + __size_ / __bits_per_word, + static_cast(__size_ % __bits_per_word)); + } }; template __bit_iterator<_Cp, false> rotate(__bit_iterator<_Cp, false> __first, __bit_iterator<_Cp, false> __middle, __bit_iterator<_Cp, false> __last) { - typedef __bit_iterator<_Cp, false> _I1; - typedef typename _I1::difference_type difference_type; - difference_type __d1 = __middle - __first; - difference_type __d2 = __last - __middle; - _I1 __r = __first + __d2; - while (__d1 != 0 && __d2 != 0) + typedef __bit_iterator<_Cp, false> _I1; + typedef typename _I1::difference_type difference_type; + difference_type __d1 = __middle - __first; + difference_type __d2 = __last - __middle; + _I1 __r = __first + __d2; + while (__d1 != 0 && __d2 != 0) + { + if (__d1 <= __d2) { - if (__d1 <= __d2) - { - if (__d1 <= __bit_array<_Cp>::capacity()) - { - __bit_array<_Cp> __b(__d1); - _CUDA_VSTD::copy(__first, __middle, __b.begin()); - _CUDA_VSTD::copy(__b.begin(), __b.end(), _CUDA_VSTD::copy(__middle, __last, __first)); - break; - } - else - { - __bit_iterator<_Cp, false> __mp = _CUDA_VSTD::swap_ranges(__first, __middle, __middle); - __first = __middle; - __middle = __mp; - __d2 -= __d1; - } - } - else - { - if (__d2 <= __bit_array<_Cp>::capacity()) - { - __bit_array<_Cp> __b(__d2); - _CUDA_VSTD::copy(__middle, __last, __b.begin()); - _CUDA_VSTD::copy_backward(__b.begin(), __b.end(), _CUDA_VSTD::copy_backward(__first, __middle, __last)); - break; - } - else - { - __bit_iterator<_Cp, false> __mp = __first + __d2; - _CUDA_VSTD::swap_ranges(__first, __mp, __middle); - __first = __mp; - __d1 -= __d2; - } - } + if (__d1 <= __bit_array<_Cp>::capacity()) + { + __bit_array<_Cp> __b(__d1); + _CUDA_VSTD::copy(__first, __middle, __b.begin()); + _CUDA_VSTD::copy(__b.begin(), __b.end(), _CUDA_VSTD::copy(__middle, __last, __first)); + break; + } + else + { + __bit_iterator<_Cp, false> __mp = _CUDA_VSTD::swap_ranges(__first, __middle, __middle); + __first = __middle; + __middle = __mp; + __d2 -= __d1; + } + } + else + { + if (__d2 <= __bit_array<_Cp>::capacity()) + { + __bit_array<_Cp> __b(__d2); + _CUDA_VSTD::copy(__middle, __last, __b.begin()); + _CUDA_VSTD::copy_backward(__b.begin(), __b.end(), _CUDA_VSTD::copy_backward(__first, __middle, __last)); + break; + } + else + { + __bit_iterator<_Cp, false> __mp = __first + __d2; + _CUDA_VSTD::swap_ranges(__first, __mp, __middle); + __first = __mp; + __d1 -= __d2; + } } - return __r; + } + return __r; } // equal template -bool -__equal_unaligned(__bit_iterator<_Cp, _IC1> __first1, __bit_iterator<_Cp, _IC1> __last1, - __bit_iterator<_Cp, _IC2> __first2) +bool __equal_unaligned( + __bit_iterator<_Cp, _IC1> __first1, __bit_iterator<_Cp, _IC1> __last1, __bit_iterator<_Cp, _IC2> __first2) { - typedef __bit_iterator<_Cp, _IC1> _It; - typedef typename _It::difference_type difference_type; - typedef typename _It::__storage_type __storage_type; - static const int __bits_per_word = _It::__bits_per_word; - difference_type __n = __last1 - __first1; - if (__n > 0) + typedef __bit_iterator<_Cp, _IC1> _It; + typedef typename _It::difference_type difference_type; + 
typedef typename _It::__storage_type __storage_type; + static const int __bits_per_word = _It::__bits_per_word; + difference_type __n = __last1 - __first1; + if (__n > 0) + { + // do first word + if (__first1.__ctz_ != 0) { - // do first word - if (__first1.__ctz_ != 0) + unsigned __clz_f = __bits_per_word - __first1.__ctz_; + difference_type __dn = _CUDA_VSTD::min(static_cast(__clz_f), __n); + __n -= __dn; + __storage_type __m = (~__storage_type(0) << __first1.__ctz_) & (~__storage_type(0) >> (__clz_f - __dn)); + __storage_type __b = *__first1.__seg_ & __m; + unsigned __clz_r = __bits_per_word - __first2.__ctz_; + __storage_type __ddn = _CUDA_VSTD::min<__storage_type>(__dn, __clz_r); + __m = (~__storage_type(0) << __first2.__ctz_) & (~__storage_type(0) >> (__clz_r - __ddn)); + if (__first2.__ctz_ > __first1.__ctz_) + { + if ((*__first2.__seg_ & __m) != (__b << (__first2.__ctz_ - __first1.__ctz_))) { - unsigned __clz_f = __bits_per_word - __first1.__ctz_; - difference_type __dn = _CUDA_VSTD::min(static_cast(__clz_f), __n); - __n -= __dn; - __storage_type __m = (~__storage_type(0) << __first1.__ctz_) & (~__storage_type(0) >> (__clz_f - __dn)); - __storage_type __b = *__first1.__seg_ & __m; - unsigned __clz_r = __bits_per_word - __first2.__ctz_; - __storage_type __ddn = _CUDA_VSTD::min<__storage_type>(__dn, __clz_r); - __m = (~__storage_type(0) << __first2.__ctz_) & (~__storage_type(0) >> (__clz_r - __ddn)); - if (__first2.__ctz_ > __first1.__ctz_) - { - if ((*__first2.__seg_ & __m) != (__b << (__first2.__ctz_ - __first1.__ctz_))) - return false; - } - else - { - if ((*__first2.__seg_ & __m) != (__b >> (__first1.__ctz_ - __first2.__ctz_))) - return false; - } - __first2.__seg_ += (__ddn + __first2.__ctz_) / __bits_per_word; - __first2.__ctz_ = static_cast((__ddn + __first2.__ctz_) % __bits_per_word); - __dn -= __ddn; - if (__dn > 0) - { - __m = ~__storage_type(0) >> (__bits_per_word - __dn); - if ((*__first2.__seg_ & __m) != (__b >> (__first1.__ctz_ + __ddn))) - return false; - __first2.__ctz_ = static_cast(__dn); - } - ++__first1.__seg_; - // __first1.__ctz_ = 0; + return false; } - // __first1.__ctz_ == 0; - // do middle words - unsigned __clz_r = __bits_per_word - __first2.__ctz_; - __storage_type __m = ~__storage_type(0) << __first2.__ctz_; - for (; __n >= __bits_per_word; __n -= __bits_per_word, ++__first1.__seg_) + } + else + { + if ((*__first2.__seg_ & __m) != (__b >> (__first1.__ctz_ - __first2.__ctz_))) { - __storage_type __b = *__first1.__seg_; - if ((*__first2.__seg_ & __m) != (__b << __first2.__ctz_)) - return false; - ++__first2.__seg_; - if ((*__first2.__seg_ & ~__m) != (__b >> __clz_r)) - return false; + return false; } - // do last word - if (__n > 0) + } + __first2.__seg_ += (__ddn + __first2.__ctz_) / __bits_per_word; + __first2.__ctz_ = static_cast((__ddn + __first2.__ctz_) % __bits_per_word); + __dn -= __ddn; + if (__dn > 0) + { + __m = ~__storage_type(0) >> (__bits_per_word - __dn); + if ((*__first2.__seg_ & __m) != (__b >> (__first1.__ctz_ + __ddn))) { - __m = ~__storage_type(0) >> (__bits_per_word - __n); - __storage_type __b = *__first1.__seg_ & __m; - __storage_type __dn = _CUDA_VSTD::min(__n, static_cast(__clz_r)); - __m = (~__storage_type(0) << __first2.__ctz_) & (~__storage_type(0) >> (__clz_r - __dn)); - if ((*__first2.__seg_ & __m) != (__b << __first2.__ctz_)) - return false; - __first2.__seg_ += (__dn + __first2.__ctz_) / __bits_per_word; - __first2.__ctz_ = static_cast((__dn + __first2.__ctz_) % __bits_per_word); - __n -= __dn; - if (__n > 0) - { - __m = 
~__storage_type(0) >> (__bits_per_word - __n); - if ((*__first2.__seg_ & __m) != (__b >> __dn)) - return false; - } + return false; } + __first2.__ctz_ = static_cast(__dn); + } + ++__first1.__seg_; + // __first1.__ctz_ = 0; } - return true; + // __first1.__ctz_ == 0; + // do middle words + unsigned __clz_r = __bits_per_word - __first2.__ctz_; + __storage_type __m = ~__storage_type(0) << __first2.__ctz_; + for (; __n >= __bits_per_word; __n -= __bits_per_word, ++__first1.__seg_) + { + __storage_type __b = *__first1.__seg_; + if ((*__first2.__seg_ & __m) != (__b << __first2.__ctz_)) + { + return false; + } + ++__first2.__seg_; + if ((*__first2.__seg_ & ~__m) != (__b >> __clz_r)) + { + return false; + } + } + // do last word + if (__n > 0) + { + __m = ~__storage_type(0) >> (__bits_per_word - __n); + __storage_type __b = *__first1.__seg_ & __m; + __storage_type __dn = _CUDA_VSTD::min(__n, static_cast(__clz_r)); + __m = (~__storage_type(0) << __first2.__ctz_) & (~__storage_type(0) >> (__clz_r - __dn)); + if ((*__first2.__seg_ & __m) != (__b << __first2.__ctz_)) + { + return false; + } + __first2.__seg_ += (__dn + __first2.__ctz_) / __bits_per_word; + __first2.__ctz_ = static_cast((__dn + __first2.__ctz_) % __bits_per_word); + __n -= __dn; + if (__n > 0) + { + __m = ~__storage_type(0) >> (__bits_per_word - __n); + if ((*__first2.__seg_ & __m) != (__b >> __dn)) + { + return false; + } + } + } + } + return true; } template -bool -__equal_aligned(__bit_iterator<_Cp, _IC1> __first1, __bit_iterator<_Cp, _IC1> __last1, - __bit_iterator<_Cp, _IC2> __first2) +bool __equal_aligned( + __bit_iterator<_Cp, _IC1> __first1, __bit_iterator<_Cp, _IC1> __last1, __bit_iterator<_Cp, _IC2> __first2) { - typedef __bit_iterator<_Cp, _IC1> _It; - typedef typename _It::difference_type difference_type; - typedef typename _It::__storage_type __storage_type; - static const int __bits_per_word = _It::__bits_per_word; - difference_type __n = __last1 - __first1; + typedef __bit_iterator<_Cp, _IC1> _It; + typedef typename _It::difference_type difference_type; + typedef typename _It::__storage_type __storage_type; + static const int __bits_per_word = _It::__bits_per_word; + difference_type __n = __last1 - __first1; + if (__n > 0) + { + // do first word + if (__first1.__ctz_ != 0) + { + unsigned __clz = __bits_per_word - __first1.__ctz_; + difference_type __dn = _CUDA_VSTD::min(static_cast(__clz), __n); + __n -= __dn; + __storage_type __m = (~__storage_type(0) << __first1.__ctz_) & (~__storage_type(0) >> (__clz - __dn)); + if ((*__first2.__seg_ & __m) != (*__first1.__seg_ & __m)) + { + return false; + } + ++__first2.__seg_; + ++__first1.__seg_; + // __first1.__ctz_ = 0; + // __first2.__ctz_ = 0; + } + // __first1.__ctz_ == 0; + // __first2.__ctz_ == 0; + // do middle words + for (; __n >= __bits_per_word; __n -= __bits_per_word, ++__first1.__seg_, ++__first2.__seg_) + { + if (*__first2.__seg_ != *__first1.__seg_) + { + return false; + } + } + // do last word if (__n > 0) { - // do first word - if (__first1.__ctz_ != 0) - { - unsigned __clz = __bits_per_word - __first1.__ctz_; - difference_type __dn = _CUDA_VSTD::min(static_cast(__clz), __n); - __n -= __dn; - __storage_type __m = (~__storage_type(0) << __first1.__ctz_) & (~__storage_type(0) >> (__clz - __dn)); - if ((*__first2.__seg_ & __m) != (*__first1.__seg_ & __m)) - return false; - ++__first2.__seg_; - ++__first1.__seg_; - // __first1.__ctz_ = 0; - // __first2.__ctz_ = 0; - } - // __first1.__ctz_ == 0; - // __first2.__ctz_ == 0; - // do middle words - for (; __n >= 
__bits_per_word; __n -= __bits_per_word, ++__first1.__seg_, ++__first2.__seg_) - if (*__first2.__seg_ != *__first1.__seg_) - return false; - // do last word - if (__n > 0) - { - __storage_type __m = ~__storage_type(0) >> (__bits_per_word - __n); - if ((*__first2.__seg_ & __m) != (*__first1.__seg_ & __m)) - return false; - } + __storage_type __m = ~__storage_type(0) >> (__bits_per_word - __n); + if ((*__first2.__seg_ & __m) != (*__first1.__seg_ & __m)) + { + return false; + } } - return true; + } + return true; } template -inline _LIBCUDACXX_INLINE_VISIBILITY -bool +inline _LIBCUDACXX_INLINE_VISIBILITY bool equal(__bit_iterator<_Cp, _IC1> __first1, __bit_iterator<_Cp, _IC1> __last1, __bit_iterator<_Cp, _IC2> __first2) { - if (__first1.__ctz_ == __first2.__ctz_) - return __equal_aligned(__first1, __last1, __first2); - return __equal_unaligned(__first1, __last1, __first2); + if (__first1.__ctz_ == __first2.__ctz_) + { + return __equal_aligned(__first1, __last1, __first2); + } + return __equal_unaligned(__first1, __last1, __first2); } -template +template class __bit_iterator { public: - typedef typename _Cp::difference_type difference_type; - typedef bool value_type; - typedef __bit_iterator pointer; - typedef typename conditional<_IsConst, __bit_const_reference<_Cp>, __bit_reference<_Cp> >::type reference; - typedef random_access_iterator_tag iterator_category; + typedef typename _Cp::difference_type difference_type; + typedef bool value_type; + typedef __bit_iterator pointer; + typedef typename conditional<_IsConst, __bit_const_reference<_Cp>, __bit_reference<_Cp>>::type reference; + typedef random_access_iterator_tag iterator_category; private: - typedef typename _Cp::__storage_type __storage_type; - typedef typename conditional<_IsConst, typename _Cp::__const_storage_pointer, - typename _Cp::__storage_pointer>::type __storage_pointer; - static const unsigned __bits_per_word = _Cp::__bits_per_word; + typedef typename _Cp::__storage_type __storage_type; + typedef typename conditional<_IsConst, typename _Cp::__const_storage_pointer, typename _Cp::__storage_pointer>::type + __storage_pointer; + static const unsigned __bits_per_word = _Cp::__bits_per_word; - __storage_pointer __seg_; - unsigned __ctz_; + __storage_pointer __seg_; + unsigned __ctz_; public: - _LIBCUDACXX_INLINE_VISIBILITY __bit_iterator() noexcept + _LIBCUDACXX_INLINE_VISIBILITY __bit_iterator() noexcept #if _CCCL_STD_VER > 2011 - : __seg_(nullptr), __ctz_(0) + : __seg_(nullptr) + , __ctz_(0) #endif - {} - // avoid re-declaring a copy constructor for the non-const version. - using __type_for_copy_to_const = - _If<_IsConst, __bit_iterator<_Cp, false>, struct __private_nat>; - - _LIBCUDACXX_INLINE_VISIBILITY - __bit_iterator(const __type_for_copy_to_const& __it) noexcept - : __seg_(__it.__seg_), __ctz_(__it.__ctz_) {} - - _LIBCUDACXX_INLINE_VISIBILITY reference operator*() const noexcept - {return reference(__seg_, __storage_type(1) << __ctz_);} - - _LIBCUDACXX_INLINE_VISIBILITY __bit_iterator& operator++() - { - if (__ctz_ != __bits_per_word-1) - ++__ctz_; - else - { - __ctz_ = 0; - ++__seg_; - } - return *this; - } - - _LIBCUDACXX_INLINE_VISIBILITY __bit_iterator operator++(int) + {} + // avoid re-declaring a copy constructor for the non-const version. 
+ using __type_for_copy_to_const = _If<_IsConst, __bit_iterator<_Cp, false>, struct __private_nat>; + + _LIBCUDACXX_INLINE_VISIBILITY __bit_iterator(const __type_for_copy_to_const& __it) noexcept + : __seg_(__it.__seg_) + , __ctz_(__it.__ctz_) + {} + + _LIBCUDACXX_INLINE_VISIBILITY reference operator*() const noexcept + { + return reference(__seg_, __storage_type(1) << __ctz_); + } + + _LIBCUDACXX_INLINE_VISIBILITY __bit_iterator& operator++() + { + if (__ctz_ != __bits_per_word - 1) { - __bit_iterator __tmp = *this; - ++(*this); - return __tmp; + ++__ctz_; } - - _LIBCUDACXX_INLINE_VISIBILITY __bit_iterator& operator--() + else { - if (__ctz_ != 0) - --__ctz_; - else - { - __ctz_ = __bits_per_word - 1; - --__seg_; - } - return *this; + __ctz_ = 0; + ++__seg_; } - - _LIBCUDACXX_INLINE_VISIBILITY __bit_iterator operator--(int) + return *this; + } + + _LIBCUDACXX_INLINE_VISIBILITY __bit_iterator operator++(int) + { + __bit_iterator __tmp = *this; + ++(*this); + return __tmp; + } + + _LIBCUDACXX_INLINE_VISIBILITY __bit_iterator& operator--() + { + if (__ctz_ != 0) { - __bit_iterator __tmp = *this; - --(*this); - return __tmp; + --__ctz_; } - - _LIBCUDACXX_INLINE_VISIBILITY __bit_iterator& operator+=(difference_type __n) - { - if (__n >= 0) - __seg_ += (__n + __ctz_) / __bits_per_word; - else - __seg_ += static_cast(__n - __bits_per_word + __ctz_ + 1) - / static_cast(__bits_per_word); - __n &= (__bits_per_word - 1); - __ctz_ = static_cast((__n + __ctz_) % __bits_per_word); - return *this; - } - - _LIBCUDACXX_INLINE_VISIBILITY __bit_iterator& operator-=(difference_type __n) + else { - return *this += -__n; + __ctz_ = __bits_per_word - 1; + --__seg_; } - - _LIBCUDACXX_INLINE_VISIBILITY __bit_iterator operator+(difference_type __n) const + return *this; + } + + _LIBCUDACXX_INLINE_VISIBILITY __bit_iterator operator--(int) + { + __bit_iterator __tmp = *this; + --(*this); + return __tmp; + } + + _LIBCUDACXX_INLINE_VISIBILITY __bit_iterator& operator+=(difference_type __n) + { + if (__n >= 0) { - __bit_iterator __t(*this); - __t += __n; - return __t; + __seg_ += (__n + __ctz_) / __bits_per_word; } - - _LIBCUDACXX_INLINE_VISIBILITY __bit_iterator operator-(difference_type __n) const + else { - __bit_iterator __t(*this); - __t -= __n; - return __t; + __seg_ += static_cast(__n - __bits_per_word + __ctz_ + 1) + / static_cast(__bits_per_word); } - - _LIBCUDACXX_INLINE_VISIBILITY - friend __bit_iterator operator+(difference_type __n, const __bit_iterator& __it) {return __it + __n;} - - _LIBCUDACXX_INLINE_VISIBILITY - friend difference_type operator-(const __bit_iterator& __x, const __bit_iterator& __y) - {return (__x.__seg_ - __y.__seg_) * __bits_per_word + __x.__ctz_ - __y.__ctz_;} - - _LIBCUDACXX_INLINE_VISIBILITY reference operator[](difference_type __n) const {return *(*this + __n);} - - _LIBCUDACXX_INLINE_VISIBILITY friend bool operator==(const __bit_iterator& __x, const __bit_iterator& __y) - {return __x.__seg_ == __y.__seg_ && __x.__ctz_ == __y.__ctz_;} - - _LIBCUDACXX_INLINE_VISIBILITY friend bool operator!=(const __bit_iterator& __x, const __bit_iterator& __y) - {return !(__x == __y);} - - _LIBCUDACXX_INLINE_VISIBILITY friend bool operator<(const __bit_iterator& __x, const __bit_iterator& __y) - {return __x.__seg_ < __y.__seg_ || (__x.__seg_ == __y.__seg_ && __x.__ctz_ < __y.__ctz_);} - - _LIBCUDACXX_INLINE_VISIBILITY friend bool operator>(const __bit_iterator& __x, const __bit_iterator& __y) - {return __y < __x;} - - _LIBCUDACXX_INLINE_VISIBILITY friend bool operator<=(const __bit_iterator& __x, 
const __bit_iterator& __y) - {return !(__y < __x);} - - _LIBCUDACXX_INLINE_VISIBILITY friend bool operator>=(const __bit_iterator& __x, const __bit_iterator& __y) - {return !(__x < __y);} + __n &= (__bits_per_word - 1); + __ctz_ = static_cast((__n + __ctz_) % __bits_per_word); + return *this; + } + + _LIBCUDACXX_INLINE_VISIBILITY __bit_iterator& operator-=(difference_type __n) + { + return *this += -__n; + } + + _LIBCUDACXX_INLINE_VISIBILITY __bit_iterator operator+(difference_type __n) const + { + __bit_iterator __t(*this); + __t += __n; + return __t; + } + + _LIBCUDACXX_INLINE_VISIBILITY __bit_iterator operator-(difference_type __n) const + { + __bit_iterator __t(*this); + __t -= __n; + return __t; + } + + _LIBCUDACXX_INLINE_VISIBILITY friend __bit_iterator operator+(difference_type __n, const __bit_iterator& __it) + { + return __it + __n; + } + + _LIBCUDACXX_INLINE_VISIBILITY friend difference_type operator-(const __bit_iterator& __x, const __bit_iterator& __y) + { + return (__x.__seg_ - __y.__seg_) * __bits_per_word + __x.__ctz_ - __y.__ctz_; + } + + _LIBCUDACXX_INLINE_VISIBILITY reference operator[](difference_type __n) const + { + return *(*this + __n); + } + + _LIBCUDACXX_INLINE_VISIBILITY friend bool operator==(const __bit_iterator& __x, const __bit_iterator& __y) + { + return __x.__seg_ == __y.__seg_ && __x.__ctz_ == __y.__ctz_; + } + + _LIBCUDACXX_INLINE_VISIBILITY friend bool operator!=(const __bit_iterator& __x, const __bit_iterator& __y) + { + return !(__x == __y); + } + + _LIBCUDACXX_INLINE_VISIBILITY friend bool operator<(const __bit_iterator& __x, const __bit_iterator& __y) + { + return __x.__seg_ < __y.__seg_ || (__x.__seg_ == __y.__seg_ && __x.__ctz_ < __y.__ctz_); + } + + _LIBCUDACXX_INLINE_VISIBILITY friend bool operator>(const __bit_iterator& __x, const __bit_iterator& __y) + { + return __y < __x; + } + + _LIBCUDACXX_INLINE_VISIBILITY friend bool operator<=(const __bit_iterator& __x, const __bit_iterator& __y) + { + return !(__y < __x); + } + + _LIBCUDACXX_INLINE_VISIBILITY friend bool operator>=(const __bit_iterator& __x, const __bit_iterator& __y) + { + return !(__x < __y); + } private: - _LIBCUDACXX_INLINE_VISIBILITY - __bit_iterator(__storage_pointer __s, unsigned __ctz) noexcept - : __seg_(__s), __ctz_(__ctz) {} - - friend typename _Cp::__self; - - friend class __bit_reference<_Cp>; - friend class __bit_const_reference<_Cp>; - friend class __bit_iterator<_Cp, true>; - template friend struct __bit_array; - template friend void __fill_n_false(__bit_iterator<_Dp, false> __first, typename _Dp::size_type __n); - template friend void __fill_n_true(__bit_iterator<_Dp, false> __first, typename _Dp::size_type __n); - template friend __bit_iterator<_Dp, false> __copy_aligned(__bit_iterator<_Dp, _IC> __first, - __bit_iterator<_Dp, _IC> __last, - __bit_iterator<_Dp, false> __result); - template friend __bit_iterator<_Dp, false> __copy_unaligned(__bit_iterator<_Dp, _IC> __first, - __bit_iterator<_Dp, _IC> __last, - __bit_iterator<_Dp, false> __result); - template friend __bit_iterator<_Dp, false> copy(__bit_iterator<_Dp, _IC> __first, - __bit_iterator<_Dp, _IC> __last, - __bit_iterator<_Dp, false> __result); - template friend __bit_iterator<_Dp, false> __copy_backward_aligned(__bit_iterator<_Dp, _IC> __first, - __bit_iterator<_Dp, _IC> __last, - __bit_iterator<_Dp, false> __result); - template friend __bit_iterator<_Dp, false> __copy_backward_unaligned(__bit_iterator<_Dp, _IC> __first, - __bit_iterator<_Dp, _IC> __last, - __bit_iterator<_Dp, false> __result); - template friend 
__bit_iterator<_Dp, false> copy_backward(__bit_iterator<_Dp, _IC> __first, - __bit_iterator<_Dp, _IC> __last, - __bit_iterator<_Dp, false> __result); - template friend __bit_iterator<__C2, false> __swap_ranges_aligned(__bit_iterator<__C1, false>, - __bit_iterator<__C1, false>, - __bit_iterator<__C2, false>); - template friend __bit_iterator<__C2, false> __swap_ranges_unaligned(__bit_iterator<__C1, false>, - __bit_iterator<__C1, false>, - __bit_iterator<__C2, false>); - template friend __bit_iterator<__C2, false> swap_ranges(__bit_iterator<__C1, false>, - __bit_iterator<__C1, false>, - __bit_iterator<__C2, false>); - template friend __bit_iterator<_Dp, false> rotate(__bit_iterator<_Dp, false>, - __bit_iterator<_Dp, false>, - __bit_iterator<_Dp, false>); - template friend bool __equal_aligned(__bit_iterator<_Dp, _IC1>, - __bit_iterator<_Dp, _IC1>, - __bit_iterator<_Dp, _IC2>); - template friend bool __equal_unaligned(__bit_iterator<_Dp, _IC1>, - __bit_iterator<_Dp, _IC1>, - __bit_iterator<_Dp, _IC2>); - template friend bool equal(__bit_iterator<_Dp, _IC1>, - __bit_iterator<_Dp, _IC1>, - __bit_iterator<_Dp, _IC2>); - template friend __bit_iterator<_Dp, _IC> __find_bool_true(__bit_iterator<_Dp, _IC>, - typename _Dp::size_type); - template friend __bit_iterator<_Dp, _IC> __find_bool_false(__bit_iterator<_Dp, _IC>, - typename _Dp::size_type); - template friend typename __bit_iterator<_Dp, _IC>::difference_type - __count_bool_true(__bit_iterator<_Dp, _IC>, typename _Dp::size_type); - template friend typename __bit_iterator<_Dp, _IC>::difference_type - __count_bool_false(__bit_iterator<_Dp, _IC>, typename _Dp::size_type); + _LIBCUDACXX_INLINE_VISIBILITY __bit_iterator(__storage_pointer __s, unsigned __ctz) noexcept + : __seg_(__s) + , __ctz_(__ctz) + {} + + friend typename _Cp::__self; + + friend class __bit_reference<_Cp>; + friend class __bit_const_reference<_Cp>; + friend class __bit_iterator<_Cp, true>; + template + friend struct __bit_array; + template + friend void __fill_n_false(__bit_iterator<_Dp, false> __first, typename _Dp::size_type __n); + template + friend void __fill_n_true(__bit_iterator<_Dp, false> __first, typename _Dp::size_type __n); + template + friend __bit_iterator<_Dp, false> __copy_aligned( + __bit_iterator<_Dp, _IC> __first, __bit_iterator<_Dp, _IC> __last, __bit_iterator<_Dp, false> __result); + template + friend __bit_iterator<_Dp, false> __copy_unaligned( + __bit_iterator<_Dp, _IC> __first, __bit_iterator<_Dp, _IC> __last, __bit_iterator<_Dp, false> __result); + template + friend __bit_iterator<_Dp, false> + copy(__bit_iterator<_Dp, _IC> __first, __bit_iterator<_Dp, _IC> __last, __bit_iterator<_Dp, false> __result); + template + friend __bit_iterator<_Dp, false> __copy_backward_aligned( + __bit_iterator<_Dp, _IC> __first, __bit_iterator<_Dp, _IC> __last, __bit_iterator<_Dp, false> __result); + template + friend __bit_iterator<_Dp, false> __copy_backward_unaligned( + __bit_iterator<_Dp, _IC> __first, __bit_iterator<_Dp, _IC> __last, __bit_iterator<_Dp, false> __result); + template + friend __bit_iterator<_Dp, false> + copy_backward(__bit_iterator<_Dp, _IC> __first, __bit_iterator<_Dp, _IC> __last, __bit_iterator<_Dp, false> __result); + template + friend __bit_iterator<__C2, false> + __swap_ranges_aligned(__bit_iterator<__C1, false>, __bit_iterator<__C1, false>, __bit_iterator<__C2, false>); + template + friend __bit_iterator<__C2, false> + __swap_ranges_unaligned(__bit_iterator<__C1, false>, __bit_iterator<__C1, false>, __bit_iterator<__C2, false>); + template + friend 
__bit_iterator<__C2, false> + swap_ranges(__bit_iterator<__C1, false>, __bit_iterator<__C1, false>, __bit_iterator<__C2, false>); + template + friend __bit_iterator<_Dp, false> + rotate(__bit_iterator<_Dp, false>, __bit_iterator<_Dp, false>, __bit_iterator<_Dp, false>); + template + friend bool __equal_aligned(__bit_iterator<_Dp, _IC1>, __bit_iterator<_Dp, _IC1>, __bit_iterator<_Dp, _IC2>); + template + friend bool __equal_unaligned(__bit_iterator<_Dp, _IC1>, __bit_iterator<_Dp, _IC1>, __bit_iterator<_Dp, _IC2>); + template + friend bool equal(__bit_iterator<_Dp, _IC1>, __bit_iterator<_Dp, _IC1>, __bit_iterator<_Dp, _IC2>); + template + friend __bit_iterator<_Dp, _IC> __find_bool_true(__bit_iterator<_Dp, _IC>, typename _Dp::size_type); + template + friend __bit_iterator<_Dp, _IC> __find_bool_false(__bit_iterator<_Dp, _IC>, typename _Dp::size_type); + template + friend typename __bit_iterator<_Dp, _IC>::difference_type + __count_bool_true(__bit_iterator<_Dp, _IC>, typename _Dp::size_type); + template + friend typename __bit_iterator<_Dp, _IC>::difference_type + __count_bool_false(__bit_iterator<_Dp, _IC>, typename _Dp::size_type); }; _LIBCUDACXX_END_NAMESPACE_STD _LIBCUDACXX_POP_MACROS -#endif // _LIBCUDACXX___BIT_REFERENCE +#endif // _LIBCUDACXX___BIT_REFERENCE diff --git a/libcudacxx/include/cuda/std/detail/libcxx/include/__pragma_pop b/libcudacxx/include/cuda/std/detail/libcxx/include/__pragma_pop index 27a9a68b4e6..5bd85a09940 100644 --- a/libcudacxx/include/cuda/std/detail/libcxx/include/__pragma_pop +++ b/libcudacxx/include/cuda/std/detail/libcxx/include/__pragma_pop @@ -8,9 +8,9 @@ //===----------------------------------------------------------------------===// #if defined(_LIBCUDACXX_USE_PRAGMA_MSVC_WARNING) - #pragma warning(pop) +# pragma warning(pop) #endif #if defined(_LIBCUDACXX_POP_MACROS) - _LIBCUDACXX_POP_MACROS +_LIBCUDACXX_POP_MACROS #endif diff --git a/libcudacxx/include/cuda/std/detail/libcxx/include/atomic b/libcudacxx/include/cuda/std/detail/libcxx/include/atomic index 298b69726f9..2d0a2e56af6 100644 --- a/libcudacxx/include/cuda/std/detail/libcxx/include/atomic +++ b/libcudacxx/include/cuda/std/detail/libcxx/include/atomic @@ -556,9 +556,6 @@ void atomic_signal_fence(memory_order m) noexcept; # pragma system_header #endif // no system header -#include // all public C++ headers provide the assertion handler -#include -#include #include #include #include @@ -568,42 +565,42 @@ void atomic_signal_fence(memory_order m) noexcept; #include #include #include -#include #include #include +#include // all public C++ headers provide the assertion handler +#include +#include +#include #include #include #include #ifdef _LIBCUDACXX_HAS_NO_THREADS -# error is not supported on this single threaded system +# error is not supported on this single threaded system #endif #ifdef _LIBCUDACXX_HAS_NO_ATOMIC_HEADER -# error is not implemented +# error is not implemented #endif #ifdef _LIBCUDACXX_UNSUPPORTED_THREAD_API -# error " is not supported on this system" +# error " is not supported on this system" #endif #ifdef kill_dependency -# error C++ standard library is incompatible with +# error C++ standard library is incompatible with #endif -#define _LIBCUDACXX_CHECK_STORE_MEMORY_ORDER(__m) \ - _LIBCUDACXX_DIAGNOSE_WARNING(__m == memory_order_consume || \ - __m == memory_order_acquire || \ - __m == memory_order_acq_rel, \ - "memory order argument to atomic operation is invalid") +#define _LIBCUDACXX_CHECK_STORE_MEMORY_ORDER(__m) \ + _LIBCUDACXX_DIAGNOSE_WARNING( \ + __m == memory_order_consume 
|| __m == memory_order_acquire || __m == memory_order_acq_rel, \ + "memory order argument to atomic operation is invalid") -#define _LIBCUDACXX_CHECK_LOAD_MEMORY_ORDER(__m) \ - _LIBCUDACXX_DIAGNOSE_WARNING(__m == memory_order_release || \ - __m == memory_order_acq_rel, \ - "memory order argument to atomic operation is invalid") +#define _LIBCUDACXX_CHECK_LOAD_MEMORY_ORDER(__m) \ + _LIBCUDACXX_DIAGNOSE_WARNING(__m == memory_order_release || __m == memory_order_acq_rel, \ + "memory order argument to atomic operation is invalid") -#define _LIBCUDACXX_CHECK_EXCHANGE_MEMORY_ORDER(__m, __f) \ - _LIBCUDACXX_DIAGNOSE_WARNING(__f == memory_order_release || \ - __f == memory_order_acq_rel, \ - "memory order argument to atomic operation is invalid") +#define _LIBCUDACXX_CHECK_EXCHANGE_MEMORY_ORDER(__m, __f) \ + _LIBCUDACXX_DIAGNOSE_WARNING(__f == memory_order_release || __f == memory_order_acq_rel, \ + "memory order argument to atomic operation is invalid") #if defined(_LIBCUDACXX_HAS_MSVC_ATOMIC_IMPL) # include @@ -614,25 +611,25 @@ void atomic_signal_fence(memory_order m) noexcept; #endif #if !defined(__CLANG_ATOMIC_BOOL_LOCK_FREE) && !defined(__GCC_ATOMIC_BOOL_LOCK_FREE) -#define ATOMIC_BOOL_LOCK_FREE 2 -#define ATOMIC_CHAR_LOCK_FREE 2 -#define ATOMIC_CHAR16_T_LOCK_FREE 2 -#define ATOMIC_CHAR32_T_LOCK_FREE 2 -#define ATOMIC_WCHAR_T_LOCK_FREE 2 -#define ATOMIC_SHORT_LOCK_FREE 2 -#define ATOMIC_INT_LOCK_FREE 2 -#define ATOMIC_LONG_LOCK_FREE 2 -#define ATOMIC_LLONG_LOCK_FREE 2 -#define ATOMIC_POINTER_LOCK_FREE 2 -#endif //!defined(__CLANG_ATOMIC_BOOL_LOCK_FREE) && !defined(__GCC_ATOMIC_BOOL_LOCK_FREE) +# define ATOMIC_BOOL_LOCK_FREE 2 +# define ATOMIC_CHAR_LOCK_FREE 2 +# define ATOMIC_CHAR16_T_LOCK_FREE 2 +# define ATOMIC_CHAR32_T_LOCK_FREE 2 +# define ATOMIC_WCHAR_T_LOCK_FREE 2 +# define ATOMIC_SHORT_LOCK_FREE 2 +# define ATOMIC_INT_LOCK_FREE 2 +# define ATOMIC_LONG_LOCK_FREE 2 +# define ATOMIC_LLONG_LOCK_FREE 2 +# define ATOMIC_POINTER_LOCK_FREE 2 +#endif //! defined(__CLANG_ATOMIC_BOOL_LOCK_FREE) && !defined(__GCC_ATOMIC_BOOL_LOCK_FREE) #ifndef __ATOMIC_RELAXED -#define __ATOMIC_RELAXED 0 -#define __ATOMIC_CONSUME 1 -#define __ATOMIC_ACQUIRE 2 -#define __ATOMIC_RELEASE 3 -#define __ATOMIC_ACQ_REL 4 -#define __ATOMIC_SEQ_CST 5 +# define __ATOMIC_RELAXED 0 +# define __ATOMIC_CONSUME 1 +# define __ATOMIC_ACQUIRE 2 +# define __ATOMIC_RELEASE 3 +# define __ATOMIC_ACQ_REL 4 +# define __ATOMIC_SEQ_CST 5 #endif //__ATOMIC_RELAXED _LIBCUDACXX_BEGIN_NAMESPACE_STD @@ -640,20 +637,22 @@ _LIBCUDACXX_BEGIN_NAMESPACE_STD // Figure out what the underlying type for `memory_order` would be if it were // declared as an unscoped enum (accounting for -fshort-enums). Use this result // to pin the underlying type in C++20. 
-enum __legacy_memory_order { - __mo_relaxed, - __mo_consume, - __mo_acquire, - __mo_release, - __mo_acq_rel, - __mo_seq_cst +enum __legacy_memory_order +{ + __mo_relaxed, + __mo_consume, + __mo_acquire, + __mo_release, + __mo_acq_rel, + __mo_seq_cst }; typedef underlying_type<__legacy_memory_order>::type __memory_order_underlying_t; #if _CCCL_STD_VER > 2017 -enum class memory_order : __memory_order_underlying_t { +enum class memory_order : __memory_order_underlying_t +{ relaxed = __mo_relaxed, consume = __mo_consume, acquire = __mo_acquire, @@ -671,7 +670,8 @@ inline constexpr auto memory_order_seq_cst = memory_order::seq_cst; #else -typedef enum memory_order { +typedef enum memory_order +{ memory_order_relaxed = __mo_relaxed, memory_order_consume = __mo_consume, memory_order_acquire = __mo_acquire, @@ -682,43 +682,48 @@ typedef enum memory_order { #endif // _CCCL_STD_VER > 2017 -template _LIBCUDACXX_INLINE_VISIBILITY -bool __cxx_nonatomic_compare_equal(_Tp const& __lhs, _Tp const& __rhs) { +template +_LIBCUDACXX_INLINE_VISIBILITY bool __cxx_nonatomic_compare_equal(_Tp const& __lhs, _Tp const& __rhs) +{ #if defined(_CCCL_CUDA_COMPILER) - return __lhs == __rhs; + return __lhs == __rhs; #else - return memcmp(&__lhs, &__rhs, sizeof(_Tp)) == 0; + return memcmp(&__lhs, &__rhs, sizeof(_Tp)) == 0; #endif } static_assert((is_same::type, __memory_order_underlying_t>::value), - "unexpected underlying type for std::memory_order"); + "unexpected underlying type for std::memory_order"); -#if defined(_LIBCUDACXX_HAS_GCC_ATOMIC_IMP) || \ - defined(_LIBCUDACXX_ATOMIC_ONLY_USE_BUILTINS) +#if defined(_LIBCUDACXX_HAS_GCC_ATOMIC_IMP) || defined(_LIBCUDACXX_ATOMIC_ONLY_USE_BUILTINS) // [atomics.types.generic]p1 guarantees _Tp is trivially copyable. Because // the default operator= in an object is not volatile, a byte-by-byte copy // is required. -template _LIBCUDACXX_INLINE_VISIBILITY -__enable_if_t::value> -__cxx_atomic_assign_volatile(_Tp& __a_value, _Tv const& __val) { +template +_LIBCUDACXX_INLINE_VISIBILITY __enable_if_t::value> +__cxx_atomic_assign_volatile(_Tp& __a_value, _Tv const& __val) +{ __a_value = __val; } -template _LIBCUDACXX_INLINE_VISIBILITY -__enable_if_t::value> -__cxx_atomic_assign_volatile(_Tp volatile& __a_value, _Tv volatile const& __val) { - volatile char* __to = reinterpret_cast(&__a_value); - volatile char* __end = __to + sizeof(_Tp); +template +_LIBCUDACXX_INLINE_VISIBILITY __enable_if_t::value> +__cxx_atomic_assign_volatile(_Tp volatile& __a_value, _Tv volatile const& __val) +{ + volatile char* __to = reinterpret_cast(&__a_value); + volatile char* __end = __to + sizeof(_Tp); volatile const char* __from = reinterpret_cast(&__val); while (__to != __end) + { *__to++ = *__from++; + } } #endif // Headers are wrapped like so: (cuda::std::|std::)detail -namespace __detail { +namespace __detail +{ #if defined(_LIBCUDACXX_HAS_CUDA_ATOMIC_EXT) # include #endif @@ -733,91 +738,98 @@ namespace __detail { // TODO: Maybe support C11 atomics? 
// #include #endif // _LIBCUDACXX_HAS_GCC_ATOMIC_IMP, _LIBCUDACXX_HAS_C_ATOMIC_IMP -} +} // namespace __detail using __detail::__cxx_atomic_base_impl; -using __detail::__cxx_atomic_ref_base_impl; -using __detail::__cxx_atomic_thread_fence; -using __detail::__cxx_atomic_signal_fence; -using __detail::__cxx_atomic_load; -using __detail::__cxx_atomic_store; -using __detail::__cxx_atomic_exchange; -using __detail::__cxx_atomic_compare_exchange_weak; using __detail::__cxx_atomic_compare_exchange_strong; +using __detail::__cxx_atomic_compare_exchange_weak; +using __detail::__cxx_atomic_exchange; using __detail::__cxx_atomic_fetch_add; -using __detail::__cxx_atomic_fetch_sub; -using __detail::__cxx_atomic_fetch_or; using __detail::__cxx_atomic_fetch_and; +using __detail::__cxx_atomic_fetch_or; +using __detail::__cxx_atomic_fetch_sub; using __detail::__cxx_atomic_fetch_xor; +using __detail::__cxx_atomic_load; +using __detail::__cxx_atomic_ref_base_impl; +using __detail::__cxx_atomic_signal_fence; +using __detail::__cxx_atomic_store; +using __detail::__cxx_atomic_thread_fence; template -_LIBCUDACXX_INLINE_VISIBILITY -_Tp kill_dependency(_Tp __y) noexcept +_LIBCUDACXX_INLINE_VISIBILITY _Tp kill_dependency(_Tp __y) noexcept { - return __y; + return __y; } #if defined(__CLANG_ATOMIC_BOOL_LOCK_FREE) -# define ATOMIC_BOOL_LOCK_FREE __CLANG_ATOMIC_BOOL_LOCK_FREE -# define ATOMIC_CHAR_LOCK_FREE __CLANG_ATOMIC_CHAR_LOCK_FREE -# define ATOMIC_CHAR16_T_LOCK_FREE __CLANG_ATOMIC_CHAR16_T_LOCK_FREE -# define ATOMIC_CHAR32_T_LOCK_FREE __CLANG_ATOMIC_CHAR32_T_LOCK_FREE -# define ATOMIC_WCHAR_T_LOCK_FREE __CLANG_ATOMIC_WCHAR_T_LOCK_FREE -# define ATOMIC_SHORT_LOCK_FREE __CLANG_ATOMIC_SHORT_LOCK_FREE -# define ATOMIC_INT_LOCK_FREE __CLANG_ATOMIC_INT_LOCK_FREE -# define ATOMIC_LONG_LOCK_FREE __CLANG_ATOMIC_LONG_LOCK_FREE -# define ATOMIC_LLONG_LOCK_FREE __CLANG_ATOMIC_LLONG_LOCK_FREE -# define ATOMIC_POINTER_LOCK_FREE __CLANG_ATOMIC_POINTER_LOCK_FREE +# define ATOMIC_BOOL_LOCK_FREE __CLANG_ATOMIC_BOOL_LOCK_FREE +# define ATOMIC_CHAR_LOCK_FREE __CLANG_ATOMIC_CHAR_LOCK_FREE +# define ATOMIC_CHAR16_T_LOCK_FREE __CLANG_ATOMIC_CHAR16_T_LOCK_FREE +# define ATOMIC_CHAR32_T_LOCK_FREE __CLANG_ATOMIC_CHAR32_T_LOCK_FREE +# define ATOMIC_WCHAR_T_LOCK_FREE __CLANG_ATOMIC_WCHAR_T_LOCK_FREE +# define ATOMIC_SHORT_LOCK_FREE __CLANG_ATOMIC_SHORT_LOCK_FREE +# define ATOMIC_INT_LOCK_FREE __CLANG_ATOMIC_INT_LOCK_FREE +# define ATOMIC_LONG_LOCK_FREE __CLANG_ATOMIC_LONG_LOCK_FREE +# define ATOMIC_LLONG_LOCK_FREE __CLANG_ATOMIC_LLONG_LOCK_FREE +# define ATOMIC_POINTER_LOCK_FREE __CLANG_ATOMIC_POINTER_LOCK_FREE #elif defined(__GCC_ATOMIC_BOOL_LOCK_FREE) -# define ATOMIC_BOOL_LOCK_FREE __GCC_ATOMIC_BOOL_LOCK_FREE -# define ATOMIC_CHAR_LOCK_FREE __GCC_ATOMIC_CHAR_LOCK_FREE -# define ATOMIC_CHAR16_T_LOCK_FREE __GCC_ATOMIC_CHAR16_T_LOCK_FREE -# define ATOMIC_CHAR32_T_LOCK_FREE __GCC_ATOMIC_CHAR32_T_LOCK_FREE -# define ATOMIC_WCHAR_T_LOCK_FREE __GCC_ATOMIC_WCHAR_T_LOCK_FREE -# define ATOMIC_SHORT_LOCK_FREE __GCC_ATOMIC_SHORT_LOCK_FREE -# define ATOMIC_INT_LOCK_FREE __GCC_ATOMIC_INT_LOCK_FREE -# define ATOMIC_LONG_LOCK_FREE __GCC_ATOMIC_LONG_LOCK_FREE -# define ATOMIC_LLONG_LOCK_FREE __GCC_ATOMIC_LLONG_LOCK_FREE -# define ATOMIC_POINTER_LOCK_FREE __GCC_ATOMIC_POINTER_LOCK_FREE +# define ATOMIC_BOOL_LOCK_FREE __GCC_ATOMIC_BOOL_LOCK_FREE +# define ATOMIC_CHAR_LOCK_FREE __GCC_ATOMIC_CHAR_LOCK_FREE +# define ATOMIC_CHAR16_T_LOCK_FREE __GCC_ATOMIC_CHAR16_T_LOCK_FREE +# define ATOMIC_CHAR32_T_LOCK_FREE __GCC_ATOMIC_CHAR32_T_LOCK_FREE +# define 
ATOMIC_WCHAR_T_LOCK_FREE __GCC_ATOMIC_WCHAR_T_LOCK_FREE +# define ATOMIC_SHORT_LOCK_FREE __GCC_ATOMIC_SHORT_LOCK_FREE +# define ATOMIC_INT_LOCK_FREE __GCC_ATOMIC_INT_LOCK_FREE +# define ATOMIC_LONG_LOCK_FREE __GCC_ATOMIC_LONG_LOCK_FREE +# define ATOMIC_LLONG_LOCK_FREE __GCC_ATOMIC_LLONG_LOCK_FREE +# define ATOMIC_POINTER_LOCK_FREE __GCC_ATOMIC_POINTER_LOCK_FREE #endif #ifdef _LIBCUDACXX_ATOMIC_ONLY_USE_BUILTINS -template -struct __cxx_atomic_lock_impl { - - _LIBCUDACXX_INLINE_VISIBILITY - __cxx_atomic_lock_impl() noexcept - : __a_value(), __a_lock(0) {} - _LIBCUDACXX_INLINE_VISIBILITY constexpr explicit - __cxx_atomic_lock_impl(_Tp value) noexcept - : __a_value(value), __a_lock(0) {} +template +struct __cxx_atomic_lock_impl +{ + _LIBCUDACXX_INLINE_VISIBILITY __cxx_atomic_lock_impl() noexcept + : __a_value() + , __a_lock(0) + {} + _LIBCUDACXX_INLINE_VISIBILITY constexpr explicit __cxx_atomic_lock_impl(_Tp value) noexcept + : __a_value(value) + , __a_lock(0) + {} _Tp __a_value; mutable __cxx_atomic_base_impl<_LIBCUDACXX_ATOMIC_FLAG_TYPE, _Sco> __a_lock; - _LIBCUDACXX_INLINE_VISIBILITY void __lock() const volatile { - while(1 == __cxx_atomic_exchange(&__a_lock, _LIBCUDACXX_ATOMIC_FLAG_TYPE(true), memory_order_acquire)) - /*spin*/; + _LIBCUDACXX_INLINE_VISIBILITY void __lock() const volatile + { + while (1 == __cxx_atomic_exchange(&__a_lock, _LIBCUDACXX_ATOMIC_FLAG_TYPE(true), memory_order_acquire)) + /*spin*/; } - _LIBCUDACXX_INLINE_VISIBILITY void __lock() const { - while(1 == __cxx_atomic_exchange(&__a_lock, _LIBCUDACXX_ATOMIC_FLAG_TYPE(true), memory_order_acquire)) - /*spin*/; + _LIBCUDACXX_INLINE_VISIBILITY void __lock() const + { + while (1 == __cxx_atomic_exchange(&__a_lock, _LIBCUDACXX_ATOMIC_FLAG_TYPE(true), memory_order_acquire)) + /*spin*/; } - _LIBCUDACXX_INLINE_VISIBILITY void __unlock() const volatile { + _LIBCUDACXX_INLINE_VISIBILITY void __unlock() const volatile + { __cxx_atomic_store(&__a_lock, _LIBCUDACXX_ATOMIC_FLAG_TYPE(false), memory_order_release); } - _LIBCUDACXX_INLINE_VISIBILITY void __unlock() const { + _LIBCUDACXX_INLINE_VISIBILITY void __unlock() const + { __cxx_atomic_store(&__a_lock, _LIBCUDACXX_ATOMIC_FLAG_TYPE(false), memory_order_release); } - _LIBCUDACXX_INLINE_VISIBILITY _Tp __read() const volatile { + _LIBCUDACXX_INLINE_VISIBILITY _Tp __read() const volatile + { __lock(); _Tp __old; __cxx_atomic_assign_volatile(__old, __a_value); __unlock(); return __old; } - _LIBCUDACXX_INLINE_VISIBILITY _Tp __read() const { + _LIBCUDACXX_INLINE_VISIBILITY _Tp __read() const + { __lock(); _Tp __old = __a_value; __unlock(); @@ -826,45 +838,47 @@ struct __cxx_atomic_lock_impl { }; template -_LIBCUDACXX_INLINE_VISIBILITY -void __cxx_atomic_init(volatile __cxx_atomic_lock_impl<_Tp, _Sco>* __a, _Tp __val) { +_LIBCUDACXX_INLINE_VISIBILITY void __cxx_atomic_init(volatile __cxx_atomic_lock_impl<_Tp, _Sco>* __a, _Tp __val) +{ __cxx_atomic_assign_volatile(__a->__a_value, __val); } template -_LIBCUDACXX_INLINE_VISIBILITY -void __cxx_atomic_init(__cxx_atomic_lock_impl<_Tp, _Sco>* __a, _Tp __val) { +_LIBCUDACXX_INLINE_VISIBILITY void __cxx_atomic_init(__cxx_atomic_lock_impl<_Tp, _Sco>* __a, _Tp __val) +{ __a->__a_value = __val; } template -_LIBCUDACXX_INLINE_VISIBILITY -void __cxx_atomic_store(volatile __cxx_atomic_lock_impl<_Tp, _Sco>* __a, _Tp __val, memory_order) { +_LIBCUDACXX_INLINE_VISIBILITY void +__cxx_atomic_store(volatile __cxx_atomic_lock_impl<_Tp, _Sco>* __a, _Tp __val, memory_order) +{ __a->__lock(); __cxx_atomic_assign_volatile(__a->__a_value, __val); __a->__unlock(); } 
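As an aside on the hunk above: __cxx_atomic_lock_impl emulates atomic access for types that are not lock-free by guarding a plain value with a spinlock built from an atomic flag. A minimal standalone sketch of the same idea follows (illustrative only; LockedAtomic and its members are hypothetical names, not libcu++ API, and it assumes nothing beyond <atomic>):

#include <atomic>

template <class T>
struct LockedAtomic
{
  T value{};
  mutable std::atomic_flag lock = ATOMIC_FLAG_INIT;

  // Spin until the flag is acquired; the release on clear() publishes writes
  // made while the lock was held.
  void acquire_lock() const
  {
    while (lock.test_and_set(std::memory_order_acquire))
    {
      /* spin */
    }
  }
  void release_lock() const
  {
    lock.clear(std::memory_order_release);
  }

  T load() const
  {
    acquire_lock();
    T old = value;
    release_lock();
    return old;
  }
  void store(T v)
  {
    acquire_lock();
    value = v;
    release_lock();
  }
  T exchange(T v)
  {
    acquire_lock();
    T old = value;
    value = v;
    release_lock();
    return old;
  }
};

Every operation goes through the same spinlock, so this emulation is correct but not lock-free; the surrounding code selects the lock-based path only when __cxx_is_always_lock_free reports that the type cannot be handled with native atomic instructions.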
template -_LIBCUDACXX_INLINE_VISIBILITY -void __cxx_atomic_store(__cxx_atomic_lock_impl<_Tp, _Sco>* __a, _Tp __val, memory_order) { +_LIBCUDACXX_INLINE_VISIBILITY void __cxx_atomic_store(__cxx_atomic_lock_impl<_Tp, _Sco>* __a, _Tp __val, memory_order) +{ __a->__lock(); __a->__a_value = __val; __a->__unlock(); } template -_LIBCUDACXX_INLINE_VISIBILITY -_Tp __cxx_atomic_load(const volatile __cxx_atomic_lock_impl<_Tp, _Sco>* __a, memory_order) { +_LIBCUDACXX_INLINE_VISIBILITY _Tp __cxx_atomic_load(const volatile __cxx_atomic_lock_impl<_Tp, _Sco>* __a, memory_order) +{ return __a->__read(); } template -_LIBCUDACXX_INLINE_VISIBILITY -_Tp __cxx_atomic_load(const __cxx_atomic_lock_impl<_Tp, _Sco>* __a, memory_order) { +_LIBCUDACXX_INLINE_VISIBILITY _Tp __cxx_atomic_load(const __cxx_atomic_lock_impl<_Tp, _Sco>* __a, memory_order) +{ return __a->__read(); } template -_LIBCUDACXX_INLINE_VISIBILITY -_Tp __cxx_atomic_exchange(volatile __cxx_atomic_lock_impl<_Tp, _Sco>* __a, _Tp __value, memory_order) { +_LIBCUDACXX_INLINE_VISIBILITY _Tp +__cxx_atomic_exchange(volatile __cxx_atomic_lock_impl<_Tp, _Sco>* __a, _Tp __value, memory_order) +{ __a->__lock(); _Tp __old; __cxx_atomic_assign_volatile(__old, __a->__a_value); @@ -873,77 +887,94 @@ _Tp __cxx_atomic_exchange(volatile __cxx_atomic_lock_impl<_Tp, _Sco>* __a, _Tp _ return __old; } template -_LIBCUDACXX_INLINE_VISIBILITY -_Tp __cxx_atomic_exchange(__cxx_atomic_lock_impl<_Tp, _Sco>* __a, _Tp __value, memory_order) { +_LIBCUDACXX_INLINE_VISIBILITY _Tp +__cxx_atomic_exchange(__cxx_atomic_lock_impl<_Tp, _Sco>* __a, _Tp __value, memory_order) +{ __a->__lock(); - _Tp __old = __a->__a_value; + _Tp __old = __a->__a_value; __a->__a_value = __value; __a->__unlock(); return __old; } template -_LIBCUDACXX_INLINE_VISIBILITY -bool __cxx_atomic_compare_exchange_strong(volatile __cxx_atomic_lock_impl<_Tp, _Sco>* __a, - _Tp* __expected, _Tp __value, memory_order, memory_order) { +_LIBCUDACXX_INLINE_VISIBILITY bool __cxx_atomic_compare_exchange_strong( + volatile __cxx_atomic_lock_impl<_Tp, _Sco>* __a, _Tp* __expected, _Tp __value, memory_order, memory_order) +{ __a->__lock(); _Tp __temp; __cxx_atomic_assign_volatile(__temp, __a->__a_value); bool __ret = __temp == *__expected; - if(__ret) + if (__ret) + { __cxx_atomic_assign_volatile(__a->__a_value, __value); + } else + { __cxx_atomic_assign_volatile(*__expected, __a->__a_value); + } __a->__unlock(); return __ret; } template -_LIBCUDACXX_INLINE_VISIBILITY -bool __cxx_atomic_compare_exchange_strong(__cxx_atomic_lock_impl<_Tp, _Sco>* __a, - _Tp* __expected, _Tp __value, memory_order, memory_order) { +_LIBCUDACXX_INLINE_VISIBILITY bool __cxx_atomic_compare_exchange_strong( + __cxx_atomic_lock_impl<_Tp, _Sco>* __a, _Tp* __expected, _Tp __value, memory_order, memory_order) +{ __a->__lock(); bool __ret = __a->__a_value == *__expected; - if(__ret) + if (__ret) + { __a->__a_value = __value; + } else + { *__expected = __a->__a_value; + } __a->__unlock(); return __ret; } template -_LIBCUDACXX_INLINE_VISIBILITY -bool __cxx_atomic_compare_exchange_weak(volatile __cxx_atomic_lock_impl<_Tp, _Sco>* __a, - _Tp* __expected, _Tp __value, memory_order, memory_order) { +_LIBCUDACXX_INLINE_VISIBILITY bool __cxx_atomic_compare_exchange_weak( + volatile __cxx_atomic_lock_impl<_Tp, _Sco>* __a, _Tp* __expected, _Tp __value, memory_order, memory_order) +{ __a->__lock(); _Tp __temp; __cxx_atomic_assign_volatile(__temp, __a->__a_value); bool __ret = __temp == *__expected; - if(__ret) + if (__ret) + { __cxx_atomic_assign_volatile(__a->__a_value, 
__value); + } else + { __cxx_atomic_assign_volatile(*__expected, __a->__a_value); + } __a->__unlock(); return __ret; } template -_LIBCUDACXX_INLINE_VISIBILITY -bool __cxx_atomic_compare_exchange_weak(__cxx_atomic_lock_impl<_Tp, _Sco>* __a, - _Tp* __expected, _Tp __value, memory_order, memory_order) { +_LIBCUDACXX_INLINE_VISIBILITY bool __cxx_atomic_compare_exchange_weak( + __cxx_atomic_lock_impl<_Tp, _Sco>* __a, _Tp* __expected, _Tp __value, memory_order, memory_order) +{ __a->__lock(); bool __ret = __a->__a_value == *__expected; - if(__ret) + if (__ret) + { __a->__a_value = __value; + } else + { *__expected = __a->__a_value; + } __a->__unlock(); return __ret; } template -_LIBCUDACXX_INLINE_VISIBILITY -_Tp __cxx_atomic_fetch_add(volatile __cxx_atomic_lock_impl<_Tp, _Sco>* __a, - _Td __delta, memory_order) { +_LIBCUDACXX_INLINE_VISIBILITY _Tp +__cxx_atomic_fetch_add(volatile __cxx_atomic_lock_impl<_Tp, _Sco>* __a, _Td __delta, memory_order) +{ __a->__lock(); _Tp __old; __cxx_atomic_assign_volatile(__old, __a->__a_value); @@ -952,9 +983,9 @@ _Tp __cxx_atomic_fetch_add(volatile __cxx_atomic_lock_impl<_Tp, _Sco>* __a, return __old; } template -_LIBCUDACXX_INLINE_VISIBILITY -_Tp __cxx_atomic_fetch_add(__cxx_atomic_lock_impl<_Tp, _Sco>* __a, - _Td __delta, memory_order) { +_LIBCUDACXX_INLINE_VISIBILITY _Tp +__cxx_atomic_fetch_add(__cxx_atomic_lock_impl<_Tp, _Sco>* __a, _Td __delta, memory_order) +{ __a->__lock(); _Tp __old = __a->__a_value; __a->__a_value += __delta; @@ -963,9 +994,9 @@ _Tp __cxx_atomic_fetch_add(__cxx_atomic_lock_impl<_Tp, _Sco>* __a, } template -_LIBCUDACXX_INLINE_VISIBILITY -_Tp* __cxx_atomic_fetch_add(volatile __cxx_atomic_lock_impl<_Tp*, _Sco>* __a, - ptrdiff_t __delta, memory_order) { +_LIBCUDACXX_INLINE_VISIBILITY _Tp* +__cxx_atomic_fetch_add(volatile __cxx_atomic_lock_impl<_Tp*, _Sco>* __a, ptrdiff_t __delta, memory_order) +{ __a->__lock(); _Tp* __old; __cxx_atomic_assign_volatile(__old, __a->__a_value); @@ -974,9 +1005,9 @@ _Tp* __cxx_atomic_fetch_add(volatile __cxx_atomic_lock_impl<_Tp*, _Sco>* __a, return __old; } template -_LIBCUDACXX_INLINE_VISIBILITY -_Tp* __cxx_atomic_fetch_add(__cxx_atomic_lock_impl<_Tp*, _Sco>* __a, - ptrdiff_t __delta, memory_order) { +_LIBCUDACXX_INLINE_VISIBILITY _Tp* +__cxx_atomic_fetch_add(__cxx_atomic_lock_impl<_Tp*, _Sco>* __a, ptrdiff_t __delta, memory_order) +{ __a->__lock(); _Tp* __old = __a->__a_value; __a->__a_value += __delta; @@ -985,9 +1016,9 @@ _Tp* __cxx_atomic_fetch_add(__cxx_atomic_lock_impl<_Tp*, _Sco>* __a, } template -_LIBCUDACXX_INLINE_VISIBILITY -_Tp __cxx_atomic_fetch_sub(volatile __cxx_atomic_lock_impl<_Tp, _Sco>* __a, - _Td __delta, memory_order) { +_LIBCUDACXX_INLINE_VISIBILITY _Tp +__cxx_atomic_fetch_sub(volatile __cxx_atomic_lock_impl<_Tp, _Sco>* __a, _Td __delta, memory_order) +{ __a->__lock(); _Tp __old; __cxx_atomic_assign_volatile(__old, __a->__a_value); @@ -996,9 +1027,9 @@ _Tp __cxx_atomic_fetch_sub(volatile __cxx_atomic_lock_impl<_Tp, _Sco>* __a, return __old; } template -_LIBCUDACXX_INLINE_VISIBILITY -_Tp __cxx_atomic_fetch_sub(__cxx_atomic_lock_impl<_Tp, _Sco>* __a, - _Td __delta, memory_order) { +_LIBCUDACXX_INLINE_VISIBILITY _Tp +__cxx_atomic_fetch_sub(__cxx_atomic_lock_impl<_Tp, _Sco>* __a, _Td __delta, memory_order) +{ __a->__lock(); _Tp __old = __a->__a_value; __a->__a_value -= __delta; @@ -1007,9 +1038,9 @@ _Tp __cxx_atomic_fetch_sub(__cxx_atomic_lock_impl<_Tp, _Sco>* __a, } template -_LIBCUDACXX_INLINE_VISIBILITY -_Tp __cxx_atomic_fetch_and(volatile __cxx_atomic_lock_impl<_Tp, _Sco>* __a, - _Tp 
__pattern, memory_order) { +_LIBCUDACXX_INLINE_VISIBILITY _Tp +__cxx_atomic_fetch_and(volatile __cxx_atomic_lock_impl<_Tp, _Sco>* __a, _Tp __pattern, memory_order) +{ __a->__lock(); _Tp __old; __cxx_atomic_assign_volatile(__old, __a->__a_value); @@ -1018,9 +1049,9 @@ _Tp __cxx_atomic_fetch_and(volatile __cxx_atomic_lock_impl<_Tp, _Sco>* __a, return __old; } template -_LIBCUDACXX_INLINE_VISIBILITY -_Tp __cxx_atomic_fetch_and(__cxx_atomic_lock_impl<_Tp, _Sco>* __a, - _Tp __pattern, memory_order) { +_LIBCUDACXX_INLINE_VISIBILITY _Tp +__cxx_atomic_fetch_and(__cxx_atomic_lock_impl<_Tp, _Sco>* __a, _Tp __pattern, memory_order) +{ __a->__lock(); _Tp __old = __a->__a_value; __a->__a_value &= __pattern; @@ -1029,9 +1060,9 @@ _Tp __cxx_atomic_fetch_and(__cxx_atomic_lock_impl<_Tp, _Sco>* __a, } template -_LIBCUDACXX_INLINE_VISIBILITY -_Tp __cxx_atomic_fetch_or(volatile __cxx_atomic_lock_impl<_Tp, _Sco>* __a, - _Tp __pattern, memory_order) { +_LIBCUDACXX_INLINE_VISIBILITY _Tp +__cxx_atomic_fetch_or(volatile __cxx_atomic_lock_impl<_Tp, _Sco>* __a, _Tp __pattern, memory_order) +{ __a->__lock(); _Tp __old; __cxx_atomic_assign_volatile(__old, __a->__a_value); @@ -1040,9 +1071,9 @@ _Tp __cxx_atomic_fetch_or(volatile __cxx_atomic_lock_impl<_Tp, _Sco>* __a, return __old; } template -_LIBCUDACXX_INLINE_VISIBILITY -_Tp __cxx_atomic_fetch_or(__cxx_atomic_lock_impl<_Tp, _Sco>* __a, - _Tp __pattern, memory_order) { +_LIBCUDACXX_INLINE_VISIBILITY _Tp +__cxx_atomic_fetch_or(__cxx_atomic_lock_impl<_Tp, _Sco>* __a, _Tp __pattern, memory_order) +{ __a->__lock(); _Tp __old = __a->__a_value; __a->__a_value |= __pattern; @@ -1051,9 +1082,9 @@ _Tp __cxx_atomic_fetch_or(__cxx_atomic_lock_impl<_Tp, _Sco>* __a, } template -_LIBCUDACXX_INLINE_VISIBILITY -_Tp __cxx_atomic_fetch_xor(volatile __cxx_atomic_lock_impl<_Tp, _Sco>* __a, - _Tp __pattern, memory_order) { +_LIBCUDACXX_INLINE_VISIBILITY _Tp +__cxx_atomic_fetch_xor(volatile __cxx_atomic_lock_impl<_Tp, _Sco>* __a, _Tp __pattern, memory_order) +{ __a->__lock(); _Tp __old; __cxx_atomic_assign_volatile(__old, __a->__a_value); @@ -1062,9 +1093,9 @@ _Tp __cxx_atomic_fetch_xor(volatile __cxx_atomic_lock_impl<_Tp, _Sco>* __a, return __old; } template -_LIBCUDACXX_INLINE_VISIBILITY -_Tp __cxx_atomic_fetch_xor(__cxx_atomic_lock_impl<_Tp, _Sco>* __a, - _Tp __pattern, memory_order) { +_LIBCUDACXX_INLINE_VISIBILITY _Tp +__cxx_atomic_fetch_xor(__cxx_atomic_lock_impl<_Tp, _Sco>* __a, _Tp __pattern, memory_order) +{ __a->__lock(); _Tp __old = __a->__a_value; __a->__a_value ^= __pattern; @@ -1072,44 +1103,56 @@ _Tp __cxx_atomic_fetch_xor(__cxx_atomic_lock_impl<_Tp, _Sco>* __a, return __old; } -#if defined(_LIBCUDACXX_ATOMIC_ALWAYS_LOCK_FREE) +# if defined(_LIBCUDACXX_ATOMIC_ALWAYS_LOCK_FREE) -template struct __cxx_is_always_lock_free { - enum { __value = _LIBCUDACXX_ATOMIC_ALWAYS_LOCK_FREE(sizeof(_Tp), 0) }; }; +template +struct __cxx_is_always_lock_free +{ + enum + { + __value = _LIBCUDACXX_ATOMIC_ALWAYS_LOCK_FREE(sizeof(_Tp), 0) + }; +}; -#else +# else -template struct __cxx_is_always_lock_free { - enum { __value = sizeof(_Tp) <= 8 }; }; +template +struct __cxx_is_always_lock_free +{ + enum + { + __value = sizeof(_Tp) <= 8 + }; +}; -#endif // defined(_LIBCUDACXX_ATOMIC_ALWAYS_LOCK_FREE) +# endif // defined(_LIBCUDACXX_ATOMIC_ALWAYS_LOCK_FREE) template -struct __cxx_atomic_impl_conditional { - using type = __conditional_t<__cxx_is_always_lock_free<_Tp>::__value, - __cxx_atomic_base_impl<_Tp, _Sco>, - __cxx_atomic_lock_impl<_Tp, _Sco> >; +struct __cxx_atomic_impl_conditional +{ + using 
type = __conditional_t<__cxx_is_always_lock_free<_Tp>::__value, + __cxx_atomic_base_impl<_Tp, _Sco>, + __cxx_atomic_lock_impl<_Tp, _Sco>>; }; -template ::type > +template ::type> #else -template > +template > #endif //_LIBCUDACXX_ATOMIC_ONLY_USE_BUILTINS -struct __cxx_atomic_impl : public _Base { +struct __cxx_atomic_impl : public _Base +{ __cxx_atomic_impl() noexcept = default; _LIBCUDACXX_INLINE_VISIBILITY constexpr explicit __cxx_atomic_impl(_Tp value) noexcept - : _Base(value) {} + : _Base(value) + {} }; - -template -_LIBCUDACXX_INLINE_VISIBILITY -__cxx_atomic_impl<_Tp, _Sco>* __cxx_atomic_rebind(_Tp* __inst) { - static_assert(sizeof(__cxx_atomic_impl<_Tp, _Sco>) == sizeof(_Tp),""); - static_assert(alignof(__cxx_atomic_impl<_Tp, _Sco>) == alignof(_Tp),""); - return (__cxx_atomic_impl<_Tp, _Sco>*)__inst; +template +_LIBCUDACXX_INLINE_VISIBILITY __cxx_atomic_impl<_Tp, _Sco>* __cxx_atomic_rebind(_Tp* __inst) +{ + static_assert(sizeof(__cxx_atomic_impl<_Tp, _Sco>) == sizeof(_Tp), ""); + static_assert(alignof(__cxx_atomic_impl<_Tp, _Sco>) == alignof(_Tp), ""); + return (__cxx_atomic_impl<_Tp, _Sco>*) __inst; } template @@ -1118,25 +1161,29 @@ using __cxx_atomic_ref_impl = __cxx_atomic_ref_base_impl<_Tp, _Sco>; #ifdef _LIBCUDACXX_HAS_NO_THREAD_CONTENTION_TABLE template , int _Sco = _Ty::__sco> -struct __cxx_atomic_poll_tester { - _Ty const volatile* __a; - _Tp __val; - memory_order __order; +struct __cxx_atomic_poll_tester +{ + _Ty const volatile* __a; + _Tp __val; + memory_order __order; - _LIBCUDACXX_INLINE_VISIBILITY __cxx_atomic_poll_tester(_Ty const volatile* __a_, _Tp __val_, memory_order __order_) + _LIBCUDACXX_INLINE_VISIBILITY __cxx_atomic_poll_tester(_Ty const volatile* __a_, _Tp __val_, memory_order __order_) : __a(__a_) , __val(__val_) , __order(__order_) - {} + {} - _LIBCUDACXX_INLINE_VISIBILITY bool operator()() const { - return !(__cxx_atomic_load(__a, __order) == __val); - } + _LIBCUDACXX_INLINE_VISIBILITY bool operator()() const + { + return !(__cxx_atomic_load(__a, __order) == __val); + } }; template , int _Sco = _Ty::__sco> -_LIBCUDACXX_INLINE_VISIBILITY void __cxx_atomic_try_wait_slow_fallback(_Ty const volatile* __a, _Tp __val, memory_order __order) { - __libcpp_thread_poll_with_backoff(__cxx_atomic_poll_tester<_Ty>(__a, __val, __order)); +_LIBCUDACXX_INLINE_VISIBILITY void +__cxx_atomic_try_wait_slow_fallback(_Ty const volatile* __a, _Tp __val, memory_order __order) +{ + __libcpp_thread_poll_with_backoff(__cxx_atomic_poll_tester<_Ty>(__a, __val, __order)); } #endif @@ -1144,632 +1191,888 @@ _LIBCUDACXX_INLINE_VISIBILITY void __cxx_atomic_try_wait_slow_fallback(_Ty const #ifdef _LIBCUDACXX_HAS_PLATFORM_WAIT template ::__value, int> = 1> -_LIBCUDACXX_INLINE_VISIBILITY void __cxx_atomic_notify_all(__cxx_atomic_impl<_Tp, _Sco> const volatile* __a) { -#ifndef _LIBCUDACXX_HAS_NO_THREAD_CONTENTION_TABLE - auto * const __c = __libcpp_contention_state(__a); - __cxx_atomic_fetch_add(__cxx_atomic_rebind<_Sco>(&__c->__version), (__libcpp_platform_wait_t)1, memory_order_relaxed); - __cxx_atomic_thread_fence(memory_order_seq_cst); - if (0 != __cxx_atomic_exchange(__cxx_atomic_rebind<_Sco>(&__c->__waiters), (ptrdiff_t)0, memory_order_relaxed)) - __libcpp_platform_wake(&__c->__version, true); -#endif +_LIBCUDACXX_INLINE_VISIBILITY void __cxx_atomic_notify_all(__cxx_atomic_impl<_Tp, _Sco> const volatile* __a) +{ +# ifndef _LIBCUDACXX_HAS_NO_THREAD_CONTENTION_TABLE + auto* const __c = __libcpp_contention_state(__a); + __cxx_atomic_fetch_add(__cxx_atomic_rebind<_Sco>(&__c->__version), 
(__libcpp_platform_wait_t) 1, memory_order_relaxed); + __cxx_atomic_thread_fence(memory_order_seq_cst); + if (0 != __cxx_atomic_exchange(__cxx_atomic_rebind<_Sco>(&__c->__waiters), (ptrdiff_t) 0, memory_order_relaxed)) + { + __libcpp_platform_wake(&__c->__version, true); + } +# endif } template ::__value, int> = 1> -_LIBCUDACXX_INLINE_VISIBILITY void __cxx_atomic_notify_one(__cxx_atomic_impl<_Tp, _Sco> const volatile* __a) { - __cxx_atomic_notify_all(__a); -} -template , int _Sco = _Ty::__sco, __enable_if_t::__value, int> = 1> -_LIBCUDACXX_INLINE_VISIBILITY void __cxx_atomic_try_wait_slow(_Ty const volatile* __a, _Tp const __val, memory_order __order) { -#ifndef _LIBCUDACXX_HAS_NO_THREAD_CONTENTION_TABLE - auto * const __c = __libcpp_contention_state(__a); - __cxx_atomic_store(__cxx_atomic_rebind<_Sco>(&__c->__waiters), (ptrdiff_t)1, memory_order_relaxed); - __cxx_atomic_thread_fence(memory_order_seq_cst); - auto const __version = __cxx_atomic_load(__cxx_atomic_rebind<_Sco>(&__c->__version), memory_order_relaxed); - if (!__cxx_nonatomic_compare_equal(__cxx_atomic_load(__a, __order), __val)) - return; - if(sizeof(__libcpp_platform_wait_t) < 8) { - constexpr timespec __timeout = { 2, 0 }; // Hedge on rare 'int version' aliasing. - __libcpp_platform_wait(&__c->__version, __version, &__timeout); - } - else - __libcpp_platform_wait(&__c->__version, __version, nullptr); -#else - __cxx_atomic_try_wait_slow_fallback(__a, __val, __order); -#endif // _LIBCUDACXX_HAS_NO_THREAD_CONTENTION_TABLE +_LIBCUDACXX_INLINE_VISIBILITY void __cxx_atomic_notify_one(__cxx_atomic_impl<_Tp, _Sco> const volatile* __a) +{ + __cxx_atomic_notify_all(__a); +} +template , + int _Sco = _Ty::__sco, + __enable_if_t::__value, int> = 1> +_LIBCUDACXX_INLINE_VISIBILITY void +__cxx_atomic_try_wait_slow(_Ty const volatile* __a, _Tp const __val, memory_order __order) +{ +# ifndef _LIBCUDACXX_HAS_NO_THREAD_CONTENTION_TABLE + auto* const __c = __libcpp_contention_state(__a); + __cxx_atomic_store(__cxx_atomic_rebind<_Sco>(&__c->__waiters), (ptrdiff_t) 1, memory_order_relaxed); + __cxx_atomic_thread_fence(memory_order_seq_cst); + auto const __version = __cxx_atomic_load(__cxx_atomic_rebind<_Sco>(&__c->__version), memory_order_relaxed); + if (!__cxx_nonatomic_compare_equal(__cxx_atomic_load(__a, __order), __val)) + { + return; + } + if (sizeof(__libcpp_platform_wait_t) < 8) + { + constexpr timespec __timeout = {2, 0}; // Hedge on rare 'int version' aliasing. 
+ __libcpp_platform_wait(&__c->__version, __version, &__timeout); + } + else + { + __libcpp_platform_wait(&__c->__version, __version, nullptr); + } +# else + __cxx_atomic_try_wait_slow_fallback(__a, __val, __order); +# endif // _LIBCUDACXX_HAS_NO_THREAD_CONTENTION_TABLE } template ::__value, int> = 1> -_LIBCUDACXX_INLINE_VISIBILITY void __cxx_atomic_try_wait_slow(__cxx_atomic_impl<_Tp, _Sco> const volatile* __a, _Tp __val, memory_order) { -#ifndef _LIBCUDACXX_HAS_NO_THREAD_CONTENTION_TABLE - auto * const __c = __libcpp_contention_state(__a); - __cxx_atomic_fetch_add(__cxx_atomic_rebind<_Sco>(&__c->__waiters), (ptrdiff_t)1, memory_order_relaxed); - __cxx_atomic_thread_fence(memory_order_seq_cst); -#endif - __libcpp_platform_wait((_Tp*)__a, __val, nullptr); -#ifndef _LIBCUDACXX_HAS_NO_THREAD_CONTENTION_TABLE - __cxx_atomic_fetch_sub(__cxx_atomic_rebind<_Sco>(&__c->__waiters), (ptrdiff_t)1, memory_order_relaxed); -#endif +_LIBCUDACXX_INLINE_VISIBILITY void +__cxx_atomic_try_wait_slow(__cxx_atomic_impl<_Tp, _Sco> const volatile* __a, _Tp __val, memory_order) +{ +# ifndef _LIBCUDACXX_HAS_NO_THREAD_CONTENTION_TABLE + auto* const __c = __libcpp_contention_state(__a); + __cxx_atomic_fetch_add(__cxx_atomic_rebind<_Sco>(&__c->__waiters), (ptrdiff_t) 1, memory_order_relaxed); + __cxx_atomic_thread_fence(memory_order_seq_cst); +# endif + __libcpp_platform_wait((_Tp*) __a, __val, nullptr); +# ifndef _LIBCUDACXX_HAS_NO_THREAD_CONTENTION_TABLE + __cxx_atomic_fetch_sub(__cxx_atomic_rebind<_Sco>(&__c->__waiters), (ptrdiff_t) 1, memory_order_relaxed); +# endif } template ::__value, int> = 1> -_LIBCUDACXX_INLINE_VISIBILITY void __cxx_atomic_notify_all(__cxx_atomic_impl<_Tp, _Sco> const volatile* __a) { -#ifndef _LIBCUDACXX_HAS_NO_THREAD_CONTENTION_TABLE - auto * const __c = __libcpp_contention_state(__a); - __cxx_atomic_thread_fence(memory_order_seq_cst); - if (0 != __cxx_atomic_load(__cxx_atomic_rebind<_Sco>(&__c->__waiters), memory_order_relaxed)) -#endif - __libcpp_platform_wake((_Tp*)__a, true); +_LIBCUDACXX_INLINE_VISIBILITY void __cxx_atomic_notify_all(__cxx_atomic_impl<_Tp, _Sco> const volatile* __a) +{ +# ifndef _LIBCUDACXX_HAS_NO_THREAD_CONTENTION_TABLE + auto* const __c = __libcpp_contention_state(__a); + __cxx_atomic_thread_fence(memory_order_seq_cst); + if (0 != __cxx_atomic_load(__cxx_atomic_rebind<_Sco>(&__c->__waiters), memory_order_relaxed)) +# endif + __libcpp_platform_wake((_Tp*) __a, true); } template ::__value, int> = 1> -_LIBCUDACXX_INLINE_VISIBILITY void __cxx_atomic_notify_one(__cxx_atomic_impl<_Tp, _Sco> const volatile* __a) { -#ifndef _LIBCUDACXX_HAS_NO_THREAD_CONTENTION_TABLE - auto * const __c = __libcpp_contention_state(__a); - __cxx_atomic_thread_fence(memory_order_seq_cst); - if (0 != __cxx_atomic_load(__cxx_atomic_rebind<_Sco>(&__c->__waiters), memory_order_relaxed)) -#endif - __libcpp_platform_wake((_Tp*)__a, false); +_LIBCUDACXX_INLINE_VISIBILITY void __cxx_atomic_notify_one(__cxx_atomic_impl<_Tp, _Sco> const volatile* __a) +{ +# ifndef _LIBCUDACXX_HAS_NO_THREAD_CONTENTION_TABLE + auto* const __c = __libcpp_contention_state(__a); + __cxx_atomic_thread_fence(memory_order_seq_cst); + if (0 != __cxx_atomic_load(__cxx_atomic_rebind<_Sco>(&__c->__waiters), memory_order_relaxed)) +# endif + __libcpp_platform_wake((_Tp*) __a, false); } #elif !defined(_LIBCUDACXX_HAS_NO_THREAD_CONTENTION_TABLE) template -_LIBCUDACXX_INLINE_VISIBILITY void __cxx_atomic_notify_all(__cxx_atomic_impl<_Tp, _Sco> const volatile* __a) { - auto * const __c = __libcpp_contention_state(__a); - 
__cxx_atomic_thread_fence(memory_order_seq_cst); - if(0 == __cxx_atomic_load(__cxx_atomic_rebind<_Sco>(&__c->__credit), memory_order_relaxed)) - return; - if(0 != __cxx_atomic_exchange(__cxx_atomic_rebind<_Sco>(&__c->__credit), (ptrdiff_t)0, memory_order_relaxed)) { - __libcpp_mutex_lock(&__c->__mutex); - __libcpp_mutex_unlock(&__c->__mutex); - __libcpp_condvar_broadcast(&__c->__condvar); - } +_LIBCUDACXX_INLINE_VISIBILITY void __cxx_atomic_notify_all(__cxx_atomic_impl<_Tp, _Sco> const volatile* __a) +{ + auto* const __c = __libcpp_contention_state(__a); + __cxx_atomic_thread_fence(memory_order_seq_cst); + if (0 == __cxx_atomic_load(__cxx_atomic_rebind<_Sco>(&__c->__credit), memory_order_relaxed)) + { + return; + } + if (0 != __cxx_atomic_exchange(__cxx_atomic_rebind<_Sco>(&__c->__credit), (ptrdiff_t) 0, memory_order_relaxed)) + { + __libcpp_mutex_lock(&__c->__mutex); + __libcpp_mutex_unlock(&__c->__mutex); + __libcpp_condvar_broadcast(&__c->__condvar); + } } template -_LIBCUDACXX_INLINE_VISIBILITY void __cxx_atomic_notify_one(__cxx_atomic_impl<_Tp, _Sco> const volatile* __a) { - __cxx_atomic_notify_all(__a); +_LIBCUDACXX_INLINE_VISIBILITY void __cxx_atomic_notify_one(__cxx_atomic_impl<_Tp, _Sco> const volatile* __a) +{ + __cxx_atomic_notify_all(__a); } template -_LIBCUDACXX_INLINE_VISIBILITY void __cxx_atomic_try_wait_slow(__cxx_atomic_impl<_Tp, _Sco> const volatile* __a, _Tp const __val, memory_order __order) { - auto * const __c = __libcpp_contention_state(__a); - __libcpp_mutex_lock(&__c->__mutex); - __cxx_atomic_store(__cxx_atomic_rebind<_Sco>(&__c->__credit), (ptrdiff_t)1, memory_order_relaxed); - __cxx_atomic_thread_fence(memory_order_seq_cst); - if (__cxx_nonatomic_compare_equal(__cxx_atomic_load(__a, __order), __val)) - __libcpp_condvar_wait(&__c->__condvar, &__c->__mutex); - __libcpp_mutex_unlock(&__c->__mutex); +_LIBCUDACXX_INLINE_VISIBILITY void +__cxx_atomic_try_wait_slow(__cxx_atomic_impl<_Tp, _Sco> const volatile* __a, _Tp const __val, memory_order __order) +{ + auto* const __c = __libcpp_contention_state(__a); + __libcpp_mutex_lock(&__c->__mutex); + __cxx_atomic_store(__cxx_atomic_rebind<_Sco>(&__c->__credit), (ptrdiff_t) 1, memory_order_relaxed); + __cxx_atomic_thread_fence(memory_order_seq_cst); + if (__cxx_nonatomic_compare_equal(__cxx_atomic_load(__a, __order), __val)) + { + __libcpp_condvar_wait(&__c->__condvar, &__c->__mutex); + } + __libcpp_mutex_unlock(&__c->__mutex); } #else -template +template struct __atomic_wait_and_notify_supported -#if defined(__CUDA_MINIMUM_ARCH__) && __CUDA_MINIMUM_ARCH__ < 700 +# if defined(__CUDA_MINIMUM_ARCH__) && __CUDA_MINIMUM_ARCH__ < 700 : false_type -#else +# else : true_type -#endif +# endif {}; template > -_LIBCUDACXX_INLINE_VISIBILITY void __cxx_atomic_try_wait_slow(_Ty const volatile* __a, _Tp __val, memory_order __order) { - static_assert(__atomic_wait_and_notify_supported<_Tp>::value, "atomic wait operations are unsupported on Pascal"); - __cxx_atomic_try_wait_slow_fallback(__a, __val, __order); +_LIBCUDACXX_INLINE_VISIBILITY void __cxx_atomic_try_wait_slow(_Ty const volatile* __a, _Tp __val, memory_order __order) +{ + static_assert(__atomic_wait_and_notify_supported<_Tp>::value, "atomic wait operations are unsupported on Pascal"); + __cxx_atomic_try_wait_slow_fallback(__a, __val, __order); } template > -_LIBCUDACXX_INLINE_VISIBILITY void __cxx_atomic_notify_one(_Ty const volatile*) { - static_assert(__atomic_wait_and_notify_supported<_Tp>::value, "atomic notify-one operations are unsupported on Pascal"); 
+_LIBCUDACXX_INLINE_VISIBILITY void __cxx_atomic_notify_one(_Ty const volatile*) +{ + static_assert(__atomic_wait_and_notify_supported<_Tp>::value, + "atomic notify-one operations are unsupported on Pascal"); } template > -_LIBCUDACXX_INLINE_VISIBILITY void __cxx_atomic_notify_all(_Ty const volatile*) { - static_assert(__atomic_wait_and_notify_supported<_Tp>::value, "atomic notify-all operations are unsupported on Pascal"); +_LIBCUDACXX_INLINE_VISIBILITY void __cxx_atomic_notify_all(_Ty const volatile*) +{ + static_assert(__atomic_wait_and_notify_supported<_Tp>::value, + "atomic notify-all operations are unsupported on Pascal"); } #endif // _LIBCUDACXX_HAS_PLATFORM_WAIT || !defined(_LIBCUDACXX_HAS_NO_THREAD_CONTENTION_TABLE) template > -_LIBCUDACXX_INLINE_VISIBILITY void __cxx_atomic_wait(_Ty const volatile* __a, _Tp const __val, memory_order __order) { - for(int __i = 0; __i < _LIBCUDACXX_POLLING_COUNT; ++__i) { - if(!__cxx_nonatomic_compare_equal(__cxx_atomic_load(__a, __order), __val)) - return; - if(__i < 12) - __libcpp_thread_yield_processor(); - else - __libcpp_thread_yield(); +_LIBCUDACXX_INLINE_VISIBILITY void __cxx_atomic_wait(_Ty const volatile* __a, _Tp const __val, memory_order __order) +{ + for (int __i = 0; __i < _LIBCUDACXX_POLLING_COUNT; ++__i) + { + if (!__cxx_nonatomic_compare_equal(__cxx_atomic_load(__a, __order), __val)) + { + return; + } + if (__i < 12) + { + __libcpp_thread_yield_processor(); } - while(__cxx_nonatomic_compare_equal(__cxx_atomic_load(__a, __order), __val)) - __cxx_atomic_try_wait_slow(__a, __val, __order); + else + { + __libcpp_thread_yield(); + } + } + while (__cxx_nonatomic_compare_equal(__cxx_atomic_load(__a, __order), __val)) + { + __cxx_atomic_try_wait_slow(__a, __val, __order); + } } template -struct __atomic_base_storage { - mutable _Storage __a_; +struct __atomic_base_storage +{ + mutable _Storage __a_; - __atomic_base_storage() = default; - __atomic_base_storage(const __atomic_base_storage&) = default; - __atomic_base_storage(__atomic_base_storage&&) = default; + __atomic_base_storage() = default; + __atomic_base_storage(const __atomic_base_storage&) = default; + __atomic_base_storage(__atomic_base_storage&&) = default; - __atomic_base_storage& operator=(const __atomic_base_storage&) = default; - __atomic_base_storage& operator=(__atomic_base_storage&&) = default; + __atomic_base_storage& operator=(const __atomic_base_storage&) = default; + __atomic_base_storage& operator=(__atomic_base_storage&&) = default; - _LIBCUDACXX_INLINE_VISIBILITY constexpr - __atomic_base_storage(_Storage&& __a) noexcept : __a_(_CUDA_VSTD::forward<_Storage>(__a)) {} + _LIBCUDACXX_INLINE_VISIBILITY constexpr __atomic_base_storage(_Storage&& __a) noexcept + : __a_(_CUDA_VSTD::forward<_Storage>(__a)) + {} }; template -struct __atomic_base_core : public __atomic_base_storage<_Tp, _Storage>{ - __atomic_base_core() = default; - __atomic_base_core(const __atomic_base_core&) = delete; - __atomic_base_core(__atomic_base_core&&) = delete; +struct __atomic_base_core : public __atomic_base_storage<_Tp, _Storage> +{ + __atomic_base_core() = default; + __atomic_base_core(const __atomic_base_core&) = delete; + __atomic_base_core(__atomic_base_core&&) = delete; - __atomic_base_core& operator=(const __atomic_base_core&) = delete; - __atomic_base_core& operator=(__atomic_base_core&&) = delete; + __atomic_base_core& operator=(const __atomic_base_core&) = delete; + __atomic_base_core& operator=(__atomic_base_core&&) = delete; - _LIBCUDACXX_INLINE_VISIBILITY constexpr - 
__atomic_base_core(_Storage&& __a) noexcept : __atomic_base_storage<_Tp, _Storage>(_CUDA_VSTD::forward<_Storage>(__a)) {} + _LIBCUDACXX_INLINE_VISIBILITY constexpr __atomic_base_core(_Storage&& __a) noexcept + : __atomic_base_storage<_Tp, _Storage>(_CUDA_VSTD::forward<_Storage>(__a)) + {} #if defined(_LIBCUDACXX_ATOMIC_ALWAYS_LOCK_FREE) - static constexpr bool is_always_lock_free = _LIBCUDACXX_ATOMIC_ALWAYS_LOCK_FREE(sizeof(_Tp), 0); + static constexpr bool is_always_lock_free = _LIBCUDACXX_ATOMIC_ALWAYS_LOCK_FREE(sizeof(_Tp), 0); #endif // defined(_LIBCUDACXX_ATOMIC_ALWAYS_LOCK_FREE) - _LIBCUDACXX_INLINE_VISIBILITY - bool is_lock_free() const volatile noexcept - {return _LIBCUDACXX_ATOMIC_IS_LOCK_FREE(sizeof(_Tp));} - _LIBCUDACXX_INLINE_VISIBILITY - bool is_lock_free() const noexcept - {return static_cast<__atomic_base_core const volatile*>(this)->is_lock_free();} - _LIBCUDACXX_INLINE_VISIBILITY - - void store(_Tp __d, memory_order __m = memory_order_seq_cst) volatile noexcept - _LIBCUDACXX_CHECK_STORE_MEMORY_ORDER(__m) - {__cxx_atomic_store(&this->__a_, __d, __m);} - _LIBCUDACXX_INLINE_VISIBILITY - void store(_Tp __d, memory_order __m = memory_order_seq_cst) noexcept - _LIBCUDACXX_CHECK_STORE_MEMORY_ORDER(__m) - {__cxx_atomic_store(&this->__a_, __d, __m);} - - _LIBCUDACXX_INLINE_VISIBILITY - _Tp load(memory_order __m = memory_order_seq_cst) const volatile noexcept - _LIBCUDACXX_CHECK_LOAD_MEMORY_ORDER(__m) - {return __cxx_atomic_load(&this->__a_, __m);} - _LIBCUDACXX_INLINE_VISIBILITY - _Tp load(memory_order __m = memory_order_seq_cst) const noexcept - _LIBCUDACXX_CHECK_LOAD_MEMORY_ORDER(__m) - {return __cxx_atomic_load(&this->__a_, __m);} - _LIBCUDACXX_INLINE_VISIBILITY - operator _Tp() const volatile noexcept {return load();} - _LIBCUDACXX_INLINE_VISIBILITY - operator _Tp() const noexcept {return load();} - _LIBCUDACXX_INLINE_VISIBILITY - _Tp exchange(_Tp __d, memory_order __m = memory_order_seq_cst) volatile noexcept - {return __cxx_atomic_exchange(&this->__a_, __d, __m);} - _LIBCUDACXX_INLINE_VISIBILITY - _Tp exchange(_Tp __d, memory_order __m = memory_order_seq_cst) noexcept - {return __cxx_atomic_exchange(&this->__a_, __d, __m);} - _LIBCUDACXX_INLINE_VISIBILITY - bool compare_exchange_weak(_Tp& __e, _Tp __d, - memory_order __s, memory_order __f) volatile noexcept - _LIBCUDACXX_CHECK_EXCHANGE_MEMORY_ORDER(__s, __f) - {return __cxx_atomic_compare_exchange_weak(&this->__a_, &__e, __d, __s, __f);} - _LIBCUDACXX_INLINE_VISIBILITY - bool compare_exchange_weak(_Tp& __e, _Tp __d, - memory_order __s, memory_order __f) noexcept - _LIBCUDACXX_CHECK_EXCHANGE_MEMORY_ORDER(__s, __f) - {return __cxx_atomic_compare_exchange_weak(&this->__a_, &__e, __d, __s, __f);} - _LIBCUDACXX_INLINE_VISIBILITY - bool compare_exchange_strong(_Tp& __e, _Tp __d, - memory_order __s, memory_order __f) volatile noexcept - _LIBCUDACXX_CHECK_EXCHANGE_MEMORY_ORDER(__s, __f) - {return __cxx_atomic_compare_exchange_strong(&this->__a_, &__e, __d, __s, __f);} - _LIBCUDACXX_INLINE_VISIBILITY - bool compare_exchange_strong(_Tp& __e, _Tp __d, - memory_order __s, memory_order __f) noexcept - _LIBCUDACXX_CHECK_EXCHANGE_MEMORY_ORDER(__s, __f) - {return __cxx_atomic_compare_exchange_strong(&this->__a_, &__e, __d, __s, __f);} - _LIBCUDACXX_INLINE_VISIBILITY - bool compare_exchange_weak(_Tp& __e, _Tp __d, - memory_order __m = memory_order_seq_cst) volatile noexcept { - if (memory_order_acq_rel == __m) - return __cxx_atomic_compare_exchange_weak(&this->__a_, &__e, __d, __m, memory_order_acquire); - else if (memory_order_release == 
__m) - return __cxx_atomic_compare_exchange_weak(&this->__a_, &__e, __d, __m, memory_order_relaxed); - else - return __cxx_atomic_compare_exchange_weak(&this->__a_, &__e, __d, __m, __m); + _LIBCUDACXX_INLINE_VISIBILITY bool is_lock_free() const volatile noexcept + { + return _LIBCUDACXX_ATOMIC_IS_LOCK_FREE(sizeof(_Tp)); + } + _LIBCUDACXX_INLINE_VISIBILITY bool is_lock_free() const noexcept + { + return static_cast<__atomic_base_core const volatile*>(this)->is_lock_free(); + } + _LIBCUDACXX_INLINE_VISIBILITY + + void + store(_Tp __d, memory_order __m = memory_order_seq_cst) volatile noexcept _LIBCUDACXX_CHECK_STORE_MEMORY_ORDER(__m) + { + __cxx_atomic_store(&this->__a_, __d, __m); + } + _LIBCUDACXX_INLINE_VISIBILITY void store(_Tp __d, memory_order __m = memory_order_seq_cst) noexcept + _LIBCUDACXX_CHECK_STORE_MEMORY_ORDER(__m) + { + __cxx_atomic_store(&this->__a_, __d, __m); + } + + _LIBCUDACXX_INLINE_VISIBILITY _Tp load(memory_order __m = memory_order_seq_cst) const volatile noexcept + _LIBCUDACXX_CHECK_LOAD_MEMORY_ORDER(__m) + { + return __cxx_atomic_load(&this->__a_, __m); + } + _LIBCUDACXX_INLINE_VISIBILITY _Tp load(memory_order __m = memory_order_seq_cst) const noexcept + _LIBCUDACXX_CHECK_LOAD_MEMORY_ORDER(__m) + { + return __cxx_atomic_load(&this->__a_, __m); + } + _LIBCUDACXX_INLINE_VISIBILITY operator _Tp() const volatile noexcept + { + return load(); + } + _LIBCUDACXX_INLINE_VISIBILITY operator _Tp() const noexcept + { + return load(); + } + _LIBCUDACXX_INLINE_VISIBILITY _Tp exchange(_Tp __d, memory_order __m = memory_order_seq_cst) volatile noexcept + { + return __cxx_atomic_exchange(&this->__a_, __d, __m); + } + _LIBCUDACXX_INLINE_VISIBILITY _Tp exchange(_Tp __d, memory_order __m = memory_order_seq_cst) noexcept + { + return __cxx_atomic_exchange(&this->__a_, __d, __m); + } + _LIBCUDACXX_INLINE_VISIBILITY bool + compare_exchange_weak(_Tp& __e, _Tp __d, memory_order __s, memory_order __f) volatile noexcept + _LIBCUDACXX_CHECK_EXCHANGE_MEMORY_ORDER(__s, __f) + { + return __cxx_atomic_compare_exchange_weak(&this->__a_, &__e, __d, __s, __f); + } + _LIBCUDACXX_INLINE_VISIBILITY bool + compare_exchange_weak(_Tp& __e, _Tp __d, memory_order __s, memory_order __f) noexcept + _LIBCUDACXX_CHECK_EXCHANGE_MEMORY_ORDER(__s, __f) + { + return __cxx_atomic_compare_exchange_weak(&this->__a_, &__e, __d, __s, __f); + } + _LIBCUDACXX_INLINE_VISIBILITY bool + compare_exchange_strong(_Tp& __e, _Tp __d, memory_order __s, memory_order __f) volatile noexcept + _LIBCUDACXX_CHECK_EXCHANGE_MEMORY_ORDER(__s, __f) + { + return __cxx_atomic_compare_exchange_strong(&this->__a_, &__e, __d, __s, __f); + } + _LIBCUDACXX_INLINE_VISIBILITY bool + compare_exchange_strong(_Tp& __e, _Tp __d, memory_order __s, memory_order __f) noexcept + _LIBCUDACXX_CHECK_EXCHANGE_MEMORY_ORDER(__s, __f) + { + return __cxx_atomic_compare_exchange_strong(&this->__a_, &__e, __d, __s, __f); + } + _LIBCUDACXX_INLINE_VISIBILITY bool + compare_exchange_weak(_Tp& __e, _Tp __d, memory_order __m = memory_order_seq_cst) volatile noexcept + { + if (memory_order_acq_rel == __m) + { + return __cxx_atomic_compare_exchange_weak(&this->__a_, &__e, __d, __m, memory_order_acquire); + } + else if (memory_order_release == __m) + { + return __cxx_atomic_compare_exchange_weak(&this->__a_, &__e, __d, __m, memory_order_relaxed); + } + else + { + return __cxx_atomic_compare_exchange_weak(&this->__a_, &__e, __d, __m, __m); + } + } + _LIBCUDACXX_INLINE_VISIBILITY bool + compare_exchange_weak(_Tp& __e, _Tp __d, memory_order __m = memory_order_seq_cst) noexcept + 
{ + if (memory_order_acq_rel == __m) + { + return __cxx_atomic_compare_exchange_weak(&this->__a_, &__e, __d, __m, memory_order_acquire); } - _LIBCUDACXX_INLINE_VISIBILITY - bool compare_exchange_weak(_Tp& __e, _Tp __d, - memory_order __m = memory_order_seq_cst) noexcept { - if(memory_order_acq_rel == __m) - return __cxx_atomic_compare_exchange_weak(&this->__a_, &__e, __d, __m, memory_order_acquire); - else if(memory_order_release == __m) - return __cxx_atomic_compare_exchange_weak(&this->__a_, &__e, __d, __m, memory_order_relaxed); - else - return __cxx_atomic_compare_exchange_weak(&this->__a_, &__e, __d, __m, __m); + else if (memory_order_release == __m) + { + return __cxx_atomic_compare_exchange_weak(&this->__a_, &__e, __d, __m, memory_order_relaxed); } - _LIBCUDACXX_INLINE_VISIBILITY - bool compare_exchange_strong(_Tp& __e, _Tp __d, - memory_order __m = memory_order_seq_cst) volatile noexcept { - if (memory_order_acq_rel == __m) - return __cxx_atomic_compare_exchange_strong(&this->__a_, &__e, __d, __m, memory_order_acquire); - else if (memory_order_release == __m) - return __cxx_atomic_compare_exchange_strong(&this->__a_, &__e, __d, __m, memory_order_relaxed); - else - return __cxx_atomic_compare_exchange_strong(&this->__a_, &__e, __d, __m, __m); + else + { + return __cxx_atomic_compare_exchange_weak(&this->__a_, &__e, __d, __m, __m); + } + } + _LIBCUDACXX_INLINE_VISIBILITY bool + compare_exchange_strong(_Tp& __e, _Tp __d, memory_order __m = memory_order_seq_cst) volatile noexcept + { + if (memory_order_acq_rel == __m) + { + return __cxx_atomic_compare_exchange_strong(&this->__a_, &__e, __d, __m, memory_order_acquire); + } + else if (memory_order_release == __m) + { + return __cxx_atomic_compare_exchange_strong(&this->__a_, &__e, __d, __m, memory_order_relaxed); + } + else + { + return __cxx_atomic_compare_exchange_strong(&this->__a_, &__e, __d, __m, __m); + } + } + _LIBCUDACXX_INLINE_VISIBILITY bool + compare_exchange_strong(_Tp& __e, _Tp __d, memory_order __m = memory_order_seq_cst) noexcept + { + if (memory_order_acq_rel == __m) + { + return __cxx_atomic_compare_exchange_strong(&this->__a_, &__e, __d, __m, memory_order_acquire); } - _LIBCUDACXX_INLINE_VISIBILITY - bool compare_exchange_strong(_Tp& __e, _Tp __d, - memory_order __m = memory_order_seq_cst) noexcept { - if (memory_order_acq_rel == __m) - return __cxx_atomic_compare_exchange_strong(&this->__a_, &__e, __d, __m, memory_order_acquire); - else if (memory_order_release == __m) - return __cxx_atomic_compare_exchange_strong(&this->__a_, &__e, __d, __m, memory_order_relaxed); - else - return __cxx_atomic_compare_exchange_strong(&this->__a_, &__e, __d, __m, __m); + else if (memory_order_release == __m) + { + return __cxx_atomic_compare_exchange_strong(&this->__a_, &__e, __d, __m, memory_order_relaxed); } + else + { + return __cxx_atomic_compare_exchange_strong(&this->__a_, &__e, __d, __m, __m); + } + } - _LIBCUDACXX_INLINE_VISIBILITY void wait(_Tp __v, memory_order __m = memory_order_seq_cst) const volatile noexcept - {__cxx_atomic_wait(&this->__a_, __v, __m);} - _LIBCUDACXX_INLINE_VISIBILITY void wait(_Tp __v, memory_order __m = memory_order_seq_cst) const noexcept - {__cxx_atomic_wait(&this->__a_, __v, __m);} - _LIBCUDACXX_INLINE_VISIBILITY void notify_one() volatile noexcept - {__cxx_atomic_notify_one(&this->__a_);} - _LIBCUDACXX_INLINE_VISIBILITY void notify_one() noexcept - {__cxx_atomic_notify_one(&this->__a_);} - _LIBCUDACXX_INLINE_VISIBILITY void notify_all() volatile noexcept - {__cxx_atomic_notify_all(&this->__a_);} - 
_LIBCUDACXX_INLINE_VISIBILITY void notify_all() noexcept - {__cxx_atomic_notify_all(&this->__a_);} + _LIBCUDACXX_INLINE_VISIBILITY void wait(_Tp __v, memory_order __m = memory_order_seq_cst) const volatile noexcept + { + __cxx_atomic_wait(&this->__a_, __v, __m); + } + _LIBCUDACXX_INLINE_VISIBILITY void wait(_Tp __v, memory_order __m = memory_order_seq_cst) const noexcept + { + __cxx_atomic_wait(&this->__a_, __v, __m); + } + _LIBCUDACXX_INLINE_VISIBILITY void notify_one() volatile noexcept + { + __cxx_atomic_notify_one(&this->__a_); + } + _LIBCUDACXX_INLINE_VISIBILITY void notify_one() noexcept + { + __cxx_atomic_notify_one(&this->__a_); + } + _LIBCUDACXX_INLINE_VISIBILITY void notify_all() volatile noexcept + { + __cxx_atomic_notify_all(&this->__a_); + } + _LIBCUDACXX_INLINE_VISIBILITY void notify_all() noexcept + { + __cxx_atomic_notify_all(&this->__a_); + } }; template -struct __atomic_base_core<_Tp, true, _Storage> : public __atomic_base_storage<_Tp, _Storage>{ - __atomic_base_core() = default; - __atomic_base_core(const __atomic_base_core&) = default; - __atomic_base_core(__atomic_base_core&&) = default; +struct __atomic_base_core<_Tp, true, _Storage> : public __atomic_base_storage<_Tp, _Storage> +{ + __atomic_base_core() = default; + __atomic_base_core(const __atomic_base_core&) = default; + __atomic_base_core(__atomic_base_core&&) = default; - __atomic_base_core& operator=(const __atomic_base_core&) = default; - __atomic_base_core& operator=(__atomic_base_core&&) = default; + __atomic_base_core& operator=(const __atomic_base_core&) = default; + __atomic_base_core& operator=(__atomic_base_core&&) = default; - _LIBCUDACXX_INLINE_VISIBILITY constexpr - __atomic_base_core(_Storage&& __a) noexcept : __atomic_base_storage<_Tp, _Storage>(_CUDA_VSTD::forward<_Storage>(__a)) {} + _LIBCUDACXX_INLINE_VISIBILITY constexpr __atomic_base_core(_Storage&& __a) noexcept + : __atomic_base_storage<_Tp, _Storage>(_CUDA_VSTD::forward<_Storage>(__a)) + {} #if defined(_LIBCUDACXX_ATOMIC_ALWAYS_LOCK_FREE) - static constexpr bool is_always_lock_free = _LIBCUDACXX_ATOMIC_ALWAYS_LOCK_FREE(sizeof(_Tp), 0); + static constexpr bool is_always_lock_free = _LIBCUDACXX_ATOMIC_ALWAYS_LOCK_FREE(sizeof(_Tp), 0); #endif // defined(_LIBCUDACXX_ATOMIC_ALWAYS_LOCK_FREE) - _LIBCUDACXX_INLINE_VISIBILITY - bool is_lock_free() const volatile noexcept - {return _LIBCUDACXX_ATOMIC_IS_LOCK_FREE(sizeof(_Tp));} - _LIBCUDACXX_INLINE_VISIBILITY - bool is_lock_free() const noexcept - {return static_cast<__atomic_base_core const volatile*>(this)->is_lock_free();} - _LIBCUDACXX_INLINE_VISIBILITY - - void store(_Tp __d, memory_order __m = memory_order_seq_cst) const volatile noexcept - _LIBCUDACXX_CHECK_STORE_MEMORY_ORDER(__m) - {__cxx_atomic_store(&this->__a_, __d, __m);} - _LIBCUDACXX_INLINE_VISIBILITY - void store(_Tp __d, memory_order __m = memory_order_seq_cst) const noexcept - _LIBCUDACXX_CHECK_STORE_MEMORY_ORDER(__m) - {__cxx_atomic_store(&this->__a_, __d, __m);} - - _LIBCUDACXX_INLINE_VISIBILITY - _Tp load(memory_order __m = memory_order_seq_cst) const volatile noexcept - _LIBCUDACXX_CHECK_LOAD_MEMORY_ORDER(__m) - {return __cxx_atomic_load(&this->__a_, __m);} - _LIBCUDACXX_INLINE_VISIBILITY - _Tp load(memory_order __m = memory_order_seq_cst) const noexcept - _LIBCUDACXX_CHECK_LOAD_MEMORY_ORDER(__m) - {return __cxx_atomic_load(&this->__a_, __m);} - _LIBCUDACXX_INLINE_VISIBILITY - operator _Tp() const volatile noexcept {return load();} - _LIBCUDACXX_INLINE_VISIBILITY - operator _Tp() const noexcept {return load();} - 
_LIBCUDACXX_INLINE_VISIBILITY - _Tp exchange(_Tp __d, memory_order __m = memory_order_seq_cst) const volatile noexcept - {return __cxx_atomic_exchange(&this->__a_, __d, __m);} - _LIBCUDACXX_INLINE_VISIBILITY - _Tp exchange(_Tp __d, memory_order __m = memory_order_seq_cst) const noexcept - {return __cxx_atomic_exchange(&this->__a_, __d, __m);} - _LIBCUDACXX_INLINE_VISIBILITY - bool compare_exchange_weak(_Tp& __e, _Tp __d, - memory_order __s, memory_order __f) const volatile noexcept - _LIBCUDACXX_CHECK_EXCHANGE_MEMORY_ORDER(__s, __f) - {return __cxx_atomic_compare_exchange_weak(&this->__a_, &__e, __d, __s, __f);} - _LIBCUDACXX_INLINE_VISIBILITY - bool compare_exchange_weak(_Tp& __e, _Tp __d, - memory_order __s, memory_order __f) const noexcept - _LIBCUDACXX_CHECK_EXCHANGE_MEMORY_ORDER(__s, __f) - {return __cxx_atomic_compare_exchange_weak(&this->__a_, &__e, __d, __s, __f);} - _LIBCUDACXX_INLINE_VISIBILITY - bool compare_exchange_strong(_Tp& __e, _Tp __d, - memory_order __s, memory_order __f) const volatile noexcept - _LIBCUDACXX_CHECK_EXCHANGE_MEMORY_ORDER(__s, __f) - {return __cxx_atomic_compare_exchange_strong(&this->__a_, &__e, __d, __s, __f);} - _LIBCUDACXX_INLINE_VISIBILITY - bool compare_exchange_strong(_Tp& __e, _Tp __d, - memory_order __s, memory_order __f) const noexcept - _LIBCUDACXX_CHECK_EXCHANGE_MEMORY_ORDER(__s, __f) - {return __cxx_atomic_compare_exchange_strong(&this->__a_, &__e, __d, __s, __f);} - _LIBCUDACXX_INLINE_VISIBILITY - bool compare_exchange_weak(_Tp& __e, _Tp __d, - memory_order __m = memory_order_seq_cst) const volatile noexcept { - if (memory_order_acq_rel == __m) - return __cxx_atomic_compare_exchange_weak(&this->__a_, &__e, __d, __m, memory_order_acquire); - else if (memory_order_release == __m) - return __cxx_atomic_compare_exchange_weak(&this->__a_, &__e, __d, __m, memory_order_relaxed); - else - return __cxx_atomic_compare_exchange_weak(&this->__a_, &__e, __d, __m, __m); + _LIBCUDACXX_INLINE_VISIBILITY bool is_lock_free() const volatile noexcept + { + return _LIBCUDACXX_ATOMIC_IS_LOCK_FREE(sizeof(_Tp)); + } + _LIBCUDACXX_INLINE_VISIBILITY bool is_lock_free() const noexcept + { + return static_cast<__atomic_base_core const volatile*>(this)->is_lock_free(); + } + _LIBCUDACXX_INLINE_VISIBILITY + + void + store(_Tp __d, memory_order __m = memory_order_seq_cst) const volatile noexcept + _LIBCUDACXX_CHECK_STORE_MEMORY_ORDER(__m) + { + __cxx_atomic_store(&this->__a_, __d, __m); + } + _LIBCUDACXX_INLINE_VISIBILITY void store(_Tp __d, memory_order __m = memory_order_seq_cst) const noexcept + _LIBCUDACXX_CHECK_STORE_MEMORY_ORDER(__m) + { + __cxx_atomic_store(&this->__a_, __d, __m); + } + + _LIBCUDACXX_INLINE_VISIBILITY _Tp load(memory_order __m = memory_order_seq_cst) const volatile noexcept + _LIBCUDACXX_CHECK_LOAD_MEMORY_ORDER(__m) + { + return __cxx_atomic_load(&this->__a_, __m); + } + _LIBCUDACXX_INLINE_VISIBILITY _Tp load(memory_order __m = memory_order_seq_cst) const noexcept + _LIBCUDACXX_CHECK_LOAD_MEMORY_ORDER(__m) + { + return __cxx_atomic_load(&this->__a_, __m); + } + _LIBCUDACXX_INLINE_VISIBILITY operator _Tp() const volatile noexcept + { + return load(); + } + _LIBCUDACXX_INLINE_VISIBILITY operator _Tp() const noexcept + { + return load(); + } + _LIBCUDACXX_INLINE_VISIBILITY _Tp exchange(_Tp __d, memory_order __m = memory_order_seq_cst) const volatile noexcept + { + return __cxx_atomic_exchange(&this->__a_, __d, __m); + } + _LIBCUDACXX_INLINE_VISIBILITY _Tp exchange(_Tp __d, memory_order __m = memory_order_seq_cst) const noexcept + { + return 
__cxx_atomic_exchange(&this->__a_, __d, __m); + } + _LIBCUDACXX_INLINE_VISIBILITY bool compare_exchange_weak(_Tp& __e, _Tp __d, memory_order __s, memory_order __f) const + volatile noexcept _LIBCUDACXX_CHECK_EXCHANGE_MEMORY_ORDER(__s, __f) + { + return __cxx_atomic_compare_exchange_weak(&this->__a_, &__e, __d, __s, __f); + } + _LIBCUDACXX_INLINE_VISIBILITY bool + compare_exchange_weak(_Tp& __e, _Tp __d, memory_order __s, memory_order __f) const noexcept + _LIBCUDACXX_CHECK_EXCHANGE_MEMORY_ORDER(__s, __f) + { + return __cxx_atomic_compare_exchange_weak(&this->__a_, &__e, __d, __s, __f); + } + _LIBCUDACXX_INLINE_VISIBILITY bool compare_exchange_strong(_Tp& __e, _Tp __d, memory_order __s, memory_order __f) const + volatile noexcept _LIBCUDACXX_CHECK_EXCHANGE_MEMORY_ORDER(__s, __f) + { + return __cxx_atomic_compare_exchange_strong(&this->__a_, &__e, __d, __s, __f); + } + _LIBCUDACXX_INLINE_VISIBILITY bool + compare_exchange_strong(_Tp& __e, _Tp __d, memory_order __s, memory_order __f) const noexcept + _LIBCUDACXX_CHECK_EXCHANGE_MEMORY_ORDER(__s, __f) + { + return __cxx_atomic_compare_exchange_strong(&this->__a_, &__e, __d, __s, __f); + } + _LIBCUDACXX_INLINE_VISIBILITY bool + compare_exchange_weak(_Tp& __e, _Tp __d, memory_order __m = memory_order_seq_cst) const volatile noexcept + { + if (memory_order_acq_rel == __m) + { + return __cxx_atomic_compare_exchange_weak(&this->__a_, &__e, __d, __m, memory_order_acquire); + } + else if (memory_order_release == __m) + { + return __cxx_atomic_compare_exchange_weak(&this->__a_, &__e, __d, __m, memory_order_relaxed); + } + else + { + return __cxx_atomic_compare_exchange_weak(&this->__a_, &__e, __d, __m, __m); + } + } + _LIBCUDACXX_INLINE_VISIBILITY bool + compare_exchange_weak(_Tp& __e, _Tp __d, memory_order __m = memory_order_seq_cst) const noexcept + { + if (memory_order_acq_rel == __m) + { + return __cxx_atomic_compare_exchange_weak(&this->__a_, &__e, __d, __m, memory_order_acquire); + } + else if (memory_order_release == __m) + { + return __cxx_atomic_compare_exchange_weak(&this->__a_, &__e, __d, __m, memory_order_relaxed); } - _LIBCUDACXX_INLINE_VISIBILITY - bool compare_exchange_weak(_Tp& __e, _Tp __d, - memory_order __m = memory_order_seq_cst) const noexcept { - if(memory_order_acq_rel == __m) - return __cxx_atomic_compare_exchange_weak(&this->__a_, &__e, __d, __m, memory_order_acquire); - else if(memory_order_release == __m) - return __cxx_atomic_compare_exchange_weak(&this->__a_, &__e, __d, __m, memory_order_relaxed); - else - return __cxx_atomic_compare_exchange_weak(&this->__a_, &__e, __d, __m, __m); + else + { + return __cxx_atomic_compare_exchange_weak(&this->__a_, &__e, __d, __m, __m); + } + } + _LIBCUDACXX_INLINE_VISIBILITY bool + compare_exchange_strong(_Tp& __e, _Tp __d, memory_order __m = memory_order_seq_cst) const volatile noexcept + { + if (memory_order_acq_rel == __m) + { + return __cxx_atomic_compare_exchange_strong(&this->__a_, &__e, __d, __m, memory_order_acquire); } - _LIBCUDACXX_INLINE_VISIBILITY - bool compare_exchange_strong(_Tp& __e, _Tp __d, - memory_order __m = memory_order_seq_cst) const volatile noexcept { - if (memory_order_acq_rel == __m) - return __cxx_atomic_compare_exchange_strong(&this->__a_, &__e, __d, __m, memory_order_acquire); - else if (memory_order_release == __m) - return __cxx_atomic_compare_exchange_strong(&this->__a_, &__e, __d, __m, memory_order_relaxed); - else - return __cxx_atomic_compare_exchange_strong(&this->__a_, &__e, __d, __m, __m); + else if (memory_order_release == __m) + { + return 
__cxx_atomic_compare_exchange_strong(&this->__a_, &__e, __d, __m, memory_order_relaxed); } - _LIBCUDACXX_INLINE_VISIBILITY - bool compare_exchange_strong(_Tp& __e, _Tp __d, - memory_order __m = memory_order_seq_cst) const noexcept { - if (memory_order_acq_rel == __m) - return __cxx_atomic_compare_exchange_strong(&this->__a_, &__e, __d, __m, memory_order_acquire); - else if (memory_order_release == __m) - return __cxx_atomic_compare_exchange_strong(&this->__a_, &__e, __d, __m, memory_order_relaxed); - else - return __cxx_atomic_compare_exchange_strong(&this->__a_, &__e, __d, __m, __m); + else + { + return __cxx_atomic_compare_exchange_strong(&this->__a_, &__e, __d, __m, __m); } + } + _LIBCUDACXX_INLINE_VISIBILITY bool + compare_exchange_strong(_Tp& __e, _Tp __d, memory_order __m = memory_order_seq_cst) const noexcept + { + if (memory_order_acq_rel == __m) + { + return __cxx_atomic_compare_exchange_strong(&this->__a_, &__e, __d, __m, memory_order_acquire); + } + else if (memory_order_release == __m) + { + return __cxx_atomic_compare_exchange_strong(&this->__a_, &__e, __d, __m, memory_order_relaxed); + } + else + { + return __cxx_atomic_compare_exchange_strong(&this->__a_, &__e, __d, __m, __m); + } + } - _LIBCUDACXX_INLINE_VISIBILITY void wait(_Tp __v, memory_order __m = memory_order_seq_cst) const volatile noexcept - {__cxx_atomic_wait(&this->__a_, __v, __m);} - _LIBCUDACXX_INLINE_VISIBILITY void wait(_Tp __v, memory_order __m = memory_order_seq_cst) const noexcept - {__cxx_atomic_wait(&this->__a_, __v, __m);} - _LIBCUDACXX_INLINE_VISIBILITY void notify_one() const volatile noexcept - {__cxx_atomic_notify_one(&this->__a_);} - _LIBCUDACXX_INLINE_VISIBILITY void notify_one() const noexcept - {__cxx_atomic_notify_one(&this->__a_);} - _LIBCUDACXX_INLINE_VISIBILITY void notify_all() const volatile noexcept - {__cxx_atomic_notify_all(&this->__a_);} - _LIBCUDACXX_INLINE_VISIBILITY void notify_all() const noexcept - {__cxx_atomic_notify_all(&this->__a_);} + _LIBCUDACXX_INLINE_VISIBILITY void wait(_Tp __v, memory_order __m = memory_order_seq_cst) const volatile noexcept + { + __cxx_atomic_wait(&this->__a_, __v, __m); + } + _LIBCUDACXX_INLINE_VISIBILITY void wait(_Tp __v, memory_order __m = memory_order_seq_cst) const noexcept + { + __cxx_atomic_wait(&this->__a_, __v, __m); + } + _LIBCUDACXX_INLINE_VISIBILITY void notify_one() const volatile noexcept + { + __cxx_atomic_notify_one(&this->__a_); + } + _LIBCUDACXX_INLINE_VISIBILITY void notify_one() const noexcept + { + __cxx_atomic_notify_one(&this->__a_); + } + _LIBCUDACXX_INLINE_VISIBILITY void notify_all() const volatile noexcept + { + __cxx_atomic_notify_all(&this->__a_); + } + _LIBCUDACXX_INLINE_VISIBILITY void notify_all() const noexcept + { + __cxx_atomic_notify_all(&this->__a_); + } }; template -struct __atomic_base_arithmetic : public __atomic_base_core<_Tp, _Cq, _Storage> { - __atomic_base_arithmetic() = default; - __atomic_base_arithmetic(const __atomic_base_arithmetic&) = delete; - __atomic_base_arithmetic(__atomic_base_arithmetic&&) = delete; - - __atomic_base_arithmetic& operator=(const __atomic_base_arithmetic&) = delete; - __atomic_base_arithmetic& operator=(__atomic_base_arithmetic&&) = delete; - - _LIBCUDACXX_INLINE_VISIBILITY constexpr - __atomic_base_arithmetic(_Storage&& __a) noexcept : __atomic_base_core<_Tp, _Cq, _Storage>(_CUDA_VSTD::forward<_Storage>(__a)) {} - - _LIBCUDACXX_INLINE_VISIBILITY - _Tp fetch_add(_Tp __op, memory_order __m = memory_order_seq_cst) volatile noexcept - {return __cxx_atomic_fetch_add(&this->__a_, __op, 
__m);} - _LIBCUDACXX_INLINE_VISIBILITY - _Tp fetch_add(_Tp __op, memory_order __m = memory_order_seq_cst) noexcept - {return __cxx_atomic_fetch_add(&this->__a_, __op, __m);} - _LIBCUDACXX_INLINE_VISIBILITY - _Tp fetch_sub(_Tp __op, memory_order __m = memory_order_seq_cst) volatile noexcept - {return __cxx_atomic_fetch_sub(&this->__a_, __op, __m);} - _LIBCUDACXX_INLINE_VISIBILITY - _Tp fetch_sub(_Tp __op, memory_order __m = memory_order_seq_cst) noexcept - {return __cxx_atomic_fetch_sub(&this->__a_, __op, __m);} - - _LIBCUDACXX_INLINE_VISIBILITY - _Tp operator++(int) volatile noexcept {return fetch_add(_Tp(1));} - _LIBCUDACXX_INLINE_VISIBILITY - _Tp operator++(int) noexcept {return fetch_add(_Tp(1));} - _LIBCUDACXX_INLINE_VISIBILITY - _Tp operator--(int) volatile noexcept {return fetch_sub(_Tp(1));} - _LIBCUDACXX_INLINE_VISIBILITY - _Tp operator--(int) noexcept {return fetch_sub(_Tp(1));} - _LIBCUDACXX_INLINE_VISIBILITY - _Tp operator++() volatile noexcept {return fetch_add(_Tp(1)) + _Tp(1);} - _LIBCUDACXX_INLINE_VISIBILITY - _Tp operator++() noexcept {return fetch_add(_Tp(1)) + _Tp(1);} - _LIBCUDACXX_INLINE_VISIBILITY - _Tp operator--() volatile noexcept {return fetch_sub(_Tp(1)) - _Tp(1);} - _LIBCUDACXX_INLINE_VISIBILITY - _Tp operator--() noexcept {return fetch_sub(_Tp(1)) - _Tp(1);} - _LIBCUDACXX_INLINE_VISIBILITY - _Tp operator+=(_Tp __op) volatile noexcept {return fetch_add(__op) + __op;} - _LIBCUDACXX_INLINE_VISIBILITY - _Tp operator+=(_Tp __op) noexcept {return fetch_add(__op) + __op;} - _LIBCUDACXX_INLINE_VISIBILITY - _Tp operator-=(_Tp __op) volatile noexcept {return fetch_sub(__op) - __op;} - _LIBCUDACXX_INLINE_VISIBILITY - _Tp operator-=(_Tp __op) noexcept {return fetch_sub(__op) - __op;} +struct __atomic_base_arithmetic : public __atomic_base_core<_Tp, _Cq, _Storage> +{ + __atomic_base_arithmetic() = default; + __atomic_base_arithmetic(const __atomic_base_arithmetic&) = delete; + __atomic_base_arithmetic(__atomic_base_arithmetic&&) = delete; + + __atomic_base_arithmetic& operator=(const __atomic_base_arithmetic&) = delete; + __atomic_base_arithmetic& operator=(__atomic_base_arithmetic&&) = delete; + + _LIBCUDACXX_INLINE_VISIBILITY constexpr __atomic_base_arithmetic(_Storage&& __a) noexcept + : __atomic_base_core<_Tp, _Cq, _Storage>(_CUDA_VSTD::forward<_Storage>(__a)) + {} + + _LIBCUDACXX_INLINE_VISIBILITY _Tp fetch_add(_Tp __op, memory_order __m = memory_order_seq_cst) volatile noexcept + { + return __cxx_atomic_fetch_add(&this->__a_, __op, __m); + } + _LIBCUDACXX_INLINE_VISIBILITY _Tp fetch_add(_Tp __op, memory_order __m = memory_order_seq_cst) noexcept + { + return __cxx_atomic_fetch_add(&this->__a_, __op, __m); + } + _LIBCUDACXX_INLINE_VISIBILITY _Tp fetch_sub(_Tp __op, memory_order __m = memory_order_seq_cst) volatile noexcept + { + return __cxx_atomic_fetch_sub(&this->__a_, __op, __m); + } + _LIBCUDACXX_INLINE_VISIBILITY _Tp fetch_sub(_Tp __op, memory_order __m = memory_order_seq_cst) noexcept + { + return __cxx_atomic_fetch_sub(&this->__a_, __op, __m); + } + + _LIBCUDACXX_INLINE_VISIBILITY _Tp operator++(int) volatile noexcept + { + return fetch_add(_Tp(1)); + } + _LIBCUDACXX_INLINE_VISIBILITY _Tp operator++(int) noexcept + { + return fetch_add(_Tp(1)); + } + _LIBCUDACXX_INLINE_VISIBILITY _Tp operator--(int) volatile noexcept + { + return fetch_sub(_Tp(1)); + } + _LIBCUDACXX_INLINE_VISIBILITY _Tp operator--(int) noexcept + { + return fetch_sub(_Tp(1)); + } + _LIBCUDACXX_INLINE_VISIBILITY _Tp operator++() volatile noexcept + { + return fetch_add(_Tp(1)) + _Tp(1); + } + 
_LIBCUDACXX_INLINE_VISIBILITY _Tp operator++() noexcept + { + return fetch_add(_Tp(1)) + _Tp(1); + } + _LIBCUDACXX_INLINE_VISIBILITY _Tp operator--() volatile noexcept + { + return fetch_sub(_Tp(1)) - _Tp(1); + } + _LIBCUDACXX_INLINE_VISIBILITY _Tp operator--() noexcept + { + return fetch_sub(_Tp(1)) - _Tp(1); + } + _LIBCUDACXX_INLINE_VISIBILITY _Tp operator+=(_Tp __op) volatile noexcept + { + return fetch_add(__op) + __op; + } + _LIBCUDACXX_INLINE_VISIBILITY _Tp operator+=(_Tp __op) noexcept + { + return fetch_add(__op) + __op; + } + _LIBCUDACXX_INLINE_VISIBILITY _Tp operator-=(_Tp __op) volatile noexcept + { + return fetch_sub(__op) - __op; + } + _LIBCUDACXX_INLINE_VISIBILITY _Tp operator-=(_Tp __op) noexcept + { + return fetch_sub(__op) - __op; + } }; template -struct __atomic_base_arithmetic<_Tp, true, _Storage> : public __atomic_base_core<_Tp, true, _Storage> { - __atomic_base_arithmetic() = default; - __atomic_base_arithmetic(const __atomic_base_arithmetic&) = default; - __atomic_base_arithmetic(__atomic_base_arithmetic&&) = default; - - __atomic_base_arithmetic& operator=(const __atomic_base_arithmetic&) = default; - __atomic_base_arithmetic& operator=(__atomic_base_arithmetic&&) = default; - - _LIBCUDACXX_INLINE_VISIBILITY constexpr - __atomic_base_arithmetic(_Storage&& __a) noexcept : __atomic_base_core<_Tp, true, _Storage>(_CUDA_VSTD::forward<_Storage>(__a)) {} - - _LIBCUDACXX_INLINE_VISIBILITY - _Tp fetch_add(_Tp __op, memory_order __m = memory_order_seq_cst) const volatile noexcept - {return __cxx_atomic_fetch_add(&this->__a_, __op, __m);} - _LIBCUDACXX_INLINE_VISIBILITY - _Tp fetch_add(_Tp __op, memory_order __m = memory_order_seq_cst) const noexcept - {return __cxx_atomic_fetch_add(&this->__a_, __op, __m);} - _LIBCUDACXX_INLINE_VISIBILITY - _Tp fetch_sub(_Tp __op, memory_order __m = memory_order_seq_cst) const volatile noexcept - {return __cxx_atomic_fetch_sub(&this->__a_, __op, __m);} - _LIBCUDACXX_INLINE_VISIBILITY - _Tp fetch_sub(_Tp __op, memory_order __m = memory_order_seq_cst) const noexcept - {return __cxx_atomic_fetch_sub(&this->__a_, __op, __m);} - - _LIBCUDACXX_INLINE_VISIBILITY - _Tp operator++(int) const volatile noexcept {return fetch_add(_Tp(1));} - _LIBCUDACXX_INLINE_VISIBILITY - _Tp operator++(int) const noexcept {return fetch_add(_Tp(1));} - _LIBCUDACXX_INLINE_VISIBILITY - _Tp operator--(int) const volatile noexcept {return fetch_sub(_Tp(1));} - _LIBCUDACXX_INLINE_VISIBILITY - _Tp operator--(int) const noexcept {return fetch_sub(_Tp(1));} - _LIBCUDACXX_INLINE_VISIBILITY - _Tp operator++() const volatile noexcept {return fetch_add(_Tp(1)) + _Tp(1);} - _LIBCUDACXX_INLINE_VISIBILITY - _Tp operator++() const noexcept {return fetch_add(_Tp(1)) + _Tp(1);} - _LIBCUDACXX_INLINE_VISIBILITY - _Tp operator--() const volatile noexcept {return fetch_sub(_Tp(1)) - _Tp(1);} - _LIBCUDACXX_INLINE_VISIBILITY - _Tp operator--() const noexcept {return fetch_sub(_Tp(1)) - _Tp(1);} - _LIBCUDACXX_INLINE_VISIBILITY - _Tp operator+=(_Tp __op) const volatile noexcept {return fetch_add(__op) + __op;} - _LIBCUDACXX_INLINE_VISIBILITY - _Tp operator+=(_Tp __op) const noexcept {return fetch_add(__op) + __op;} - _LIBCUDACXX_INLINE_VISIBILITY - _Tp operator-=(_Tp __op) const volatile noexcept {return fetch_sub(__op) - __op;} - _LIBCUDACXX_INLINE_VISIBILITY - _Tp operator-=(_Tp __op) const noexcept {return fetch_sub(__op) - __op;} +struct __atomic_base_arithmetic<_Tp, true, _Storage> : public __atomic_base_core<_Tp, true, _Storage> +{ + __atomic_base_arithmetic() = default; + 
__atomic_base_arithmetic(const __atomic_base_arithmetic&) = default; + __atomic_base_arithmetic(__atomic_base_arithmetic&&) = default; + + __atomic_base_arithmetic& operator=(const __atomic_base_arithmetic&) = default; + __atomic_base_arithmetic& operator=(__atomic_base_arithmetic&&) = default; + + _LIBCUDACXX_INLINE_VISIBILITY constexpr __atomic_base_arithmetic(_Storage&& __a) noexcept + : __atomic_base_core<_Tp, true, _Storage>(_CUDA_VSTD::forward<_Storage>(__a)) + {} + + _LIBCUDACXX_INLINE_VISIBILITY _Tp fetch_add(_Tp __op, memory_order __m = memory_order_seq_cst) const volatile noexcept + { + return __cxx_atomic_fetch_add(&this->__a_, __op, __m); + } + _LIBCUDACXX_INLINE_VISIBILITY _Tp fetch_add(_Tp __op, memory_order __m = memory_order_seq_cst) const noexcept + { + return __cxx_atomic_fetch_add(&this->__a_, __op, __m); + } + _LIBCUDACXX_INLINE_VISIBILITY _Tp fetch_sub(_Tp __op, memory_order __m = memory_order_seq_cst) const volatile noexcept + { + return __cxx_atomic_fetch_sub(&this->__a_, __op, __m); + } + _LIBCUDACXX_INLINE_VISIBILITY _Tp fetch_sub(_Tp __op, memory_order __m = memory_order_seq_cst) const noexcept + { + return __cxx_atomic_fetch_sub(&this->__a_, __op, __m); + } + + _LIBCUDACXX_INLINE_VISIBILITY _Tp operator++(int) const volatile noexcept + { + return fetch_add(_Tp(1)); + } + _LIBCUDACXX_INLINE_VISIBILITY _Tp operator++(int) const noexcept + { + return fetch_add(_Tp(1)); + } + _LIBCUDACXX_INLINE_VISIBILITY _Tp operator--(int) const volatile noexcept + { + return fetch_sub(_Tp(1)); + } + _LIBCUDACXX_INLINE_VISIBILITY _Tp operator--(int) const noexcept + { + return fetch_sub(_Tp(1)); + } + _LIBCUDACXX_INLINE_VISIBILITY _Tp operator++() const volatile noexcept + { + return fetch_add(_Tp(1)) + _Tp(1); + } + _LIBCUDACXX_INLINE_VISIBILITY _Tp operator++() const noexcept + { + return fetch_add(_Tp(1)) + _Tp(1); + } + _LIBCUDACXX_INLINE_VISIBILITY _Tp operator--() const volatile noexcept + { + return fetch_sub(_Tp(1)) - _Tp(1); + } + _LIBCUDACXX_INLINE_VISIBILITY _Tp operator--() const noexcept + { + return fetch_sub(_Tp(1)) - _Tp(1); + } + _LIBCUDACXX_INLINE_VISIBILITY _Tp operator+=(_Tp __op) const volatile noexcept + { + return fetch_add(__op) + __op; + } + _LIBCUDACXX_INLINE_VISIBILITY _Tp operator+=(_Tp __op) const noexcept + { + return fetch_add(__op) + __op; + } + _LIBCUDACXX_INLINE_VISIBILITY _Tp operator-=(_Tp __op) const volatile noexcept + { + return fetch_sub(__op) - __op; + } + _LIBCUDACXX_INLINE_VISIBILITY _Tp operator-=(_Tp __op) const noexcept + { + return fetch_sub(__op) - __op; + } }; template -struct __atomic_base_bitwise : public __atomic_base_arithmetic<_Tp, _Cq, _Storage> { - __atomic_base_bitwise() = default; - __atomic_base_bitwise(const __atomic_base_bitwise&) = delete; - __atomic_base_bitwise(__atomic_base_bitwise&&) = delete; - - __atomic_base_bitwise& operator=(const __atomic_base_bitwise&) = delete; - __atomic_base_bitwise& operator=(__atomic_base_bitwise&&) = delete; - - _LIBCUDACXX_INLINE_VISIBILITY constexpr - __atomic_base_bitwise(_Storage&& __a) noexcept : __atomic_base_arithmetic<_Tp, _Cq, _Storage>(_CUDA_VSTD::forward<_Storage>(__a)) {} - - _LIBCUDACXX_INLINE_VISIBILITY - _Tp fetch_and(_Tp __op, memory_order __m = memory_order_seq_cst) volatile noexcept - {return __cxx_atomic_fetch_and(&this->__a_, __op, __m);} - _LIBCUDACXX_INLINE_VISIBILITY - _Tp fetch_and(_Tp __op, memory_order __m = memory_order_seq_cst) noexcept - {return __cxx_atomic_fetch_and(&this->__a_, __op, __m);} - _LIBCUDACXX_INLINE_VISIBILITY - _Tp fetch_or(_Tp __op, 
memory_order __m = memory_order_seq_cst) volatile noexcept - {return __cxx_atomic_fetch_or(&this->__a_, __op, __m);} - _LIBCUDACXX_INLINE_VISIBILITY - _Tp fetch_or(_Tp __op, memory_order __m = memory_order_seq_cst) noexcept - {return __cxx_atomic_fetch_or(&this->__a_, __op, __m);} - _LIBCUDACXX_INLINE_VISIBILITY - _Tp fetch_xor(_Tp __op, memory_order __m = memory_order_seq_cst) volatile noexcept - {return __cxx_atomic_fetch_xor(&this->__a_, __op, __m);} - _LIBCUDACXX_INLINE_VISIBILITY - _Tp fetch_xor(_Tp __op, memory_order __m = memory_order_seq_cst) noexcept - {return __cxx_atomic_fetch_xor(&this->__a_, __op, __m);} - - _LIBCUDACXX_INLINE_VISIBILITY - _Tp operator&=(_Tp __op) volatile noexcept {return fetch_and(__op) & __op;} - _LIBCUDACXX_INLINE_VISIBILITY - _Tp operator&=(_Tp __op) noexcept {return fetch_and(__op) & __op;} - _LIBCUDACXX_INLINE_VISIBILITY - _Tp operator|=(_Tp __op) volatile noexcept {return fetch_or(__op) | __op;} - _LIBCUDACXX_INLINE_VISIBILITY - _Tp operator|=(_Tp __op) noexcept {return fetch_or(__op) | __op;} - _LIBCUDACXX_INLINE_VISIBILITY - _Tp operator^=(_Tp __op) volatile noexcept {return fetch_xor(__op) ^ __op;} - _LIBCUDACXX_INLINE_VISIBILITY - _Tp operator^=(_Tp __op) noexcept {return fetch_xor(__op) ^ __op;} +struct __atomic_base_bitwise : public __atomic_base_arithmetic<_Tp, _Cq, _Storage> +{ + __atomic_base_bitwise() = default; + __atomic_base_bitwise(const __atomic_base_bitwise&) = delete; + __atomic_base_bitwise(__atomic_base_bitwise&&) = delete; + + __atomic_base_bitwise& operator=(const __atomic_base_bitwise&) = delete; + __atomic_base_bitwise& operator=(__atomic_base_bitwise&&) = delete; + + _LIBCUDACXX_INLINE_VISIBILITY constexpr __atomic_base_bitwise(_Storage&& __a) noexcept + : __atomic_base_arithmetic<_Tp, _Cq, _Storage>(_CUDA_VSTD::forward<_Storage>(__a)) + {} + + _LIBCUDACXX_INLINE_VISIBILITY _Tp fetch_and(_Tp __op, memory_order __m = memory_order_seq_cst) volatile noexcept + { + return __cxx_atomic_fetch_and(&this->__a_, __op, __m); + } + _LIBCUDACXX_INLINE_VISIBILITY _Tp fetch_and(_Tp __op, memory_order __m = memory_order_seq_cst) noexcept + { + return __cxx_atomic_fetch_and(&this->__a_, __op, __m); + } + _LIBCUDACXX_INLINE_VISIBILITY _Tp fetch_or(_Tp __op, memory_order __m = memory_order_seq_cst) volatile noexcept + { + return __cxx_atomic_fetch_or(&this->__a_, __op, __m); + } + _LIBCUDACXX_INLINE_VISIBILITY _Tp fetch_or(_Tp __op, memory_order __m = memory_order_seq_cst) noexcept + { + return __cxx_atomic_fetch_or(&this->__a_, __op, __m); + } + _LIBCUDACXX_INLINE_VISIBILITY _Tp fetch_xor(_Tp __op, memory_order __m = memory_order_seq_cst) volatile noexcept + { + return __cxx_atomic_fetch_xor(&this->__a_, __op, __m); + } + _LIBCUDACXX_INLINE_VISIBILITY _Tp fetch_xor(_Tp __op, memory_order __m = memory_order_seq_cst) noexcept + { + return __cxx_atomic_fetch_xor(&this->__a_, __op, __m); + } + + _LIBCUDACXX_INLINE_VISIBILITY _Tp operator&=(_Tp __op) volatile noexcept + { + return fetch_and(__op) & __op; + } + _LIBCUDACXX_INLINE_VISIBILITY _Tp operator&=(_Tp __op) noexcept + { + return fetch_and(__op) & __op; + } + _LIBCUDACXX_INLINE_VISIBILITY _Tp operator|=(_Tp __op) volatile noexcept + { + return fetch_or(__op) | __op; + } + _LIBCUDACXX_INLINE_VISIBILITY _Tp operator|=(_Tp __op) noexcept + { + return fetch_or(__op) | __op; + } + _LIBCUDACXX_INLINE_VISIBILITY _Tp operator^=(_Tp __op) volatile noexcept + { + return fetch_xor(__op) ^ __op; + } + _LIBCUDACXX_INLINE_VISIBILITY _Tp operator^=(_Tp __op) noexcept + { + return fetch_xor(__op) ^ __op; + } 
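// Illustrative note, not part of the patch: the reformatted __atomic_base_bitwise
// members above are behaviour-preserving. Each compound assignment re-applies the
// operation to the value returned by the corresponding fetch_* call, so it yields
// the *updated* value, while fetch_* itself yields the *previous* one. A minimal,
// hedged sketch of the user-visible semantics through cuda::std::atomic:
//
//   #include <cuda/std/atomic>
//
//   cuda::std::atomic<unsigned> mask{0b0001u};
//   unsigned before = mask.fetch_or(0b0100u); // before == 0b0001, mask now 0b0101
//   unsigned after  = (mask |= 0b1000u);      // after  == 0b1101, the new value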
}; template -struct __atomic_base_bitwise<_Tp, true, _Storage> : public __atomic_base_arithmetic<_Tp, true, _Storage> { - __atomic_base_bitwise() = default; - __atomic_base_bitwise(const __atomic_base_bitwise&) = default; - __atomic_base_bitwise(__atomic_base_bitwise&&) = default; - - __atomic_base_bitwise& operator=(const __atomic_base_bitwise&) = default; - __atomic_base_bitwise& operator=(__atomic_base_bitwise&&) = default; - - _LIBCUDACXX_INLINE_VISIBILITY constexpr - __atomic_base_bitwise(_Storage&& __a) noexcept : __atomic_base_arithmetic<_Tp, true, _Storage>(_CUDA_VSTD::forward<_Storage>(__a)) {} - - _LIBCUDACXX_INLINE_VISIBILITY - _Tp fetch_and(_Tp __op, memory_order __m = memory_order_seq_cst) const volatile noexcept - {return __cxx_atomic_fetch_and(&this->__a_, __op, __m);} - _LIBCUDACXX_INLINE_VISIBILITY - _Tp fetch_and(_Tp __op, memory_order __m = memory_order_seq_cst) const noexcept - {return __cxx_atomic_fetch_and(&this->__a_, __op, __m);} - _LIBCUDACXX_INLINE_VISIBILITY - _Tp fetch_or(_Tp __op, memory_order __m = memory_order_seq_cst) const volatile noexcept - {return __cxx_atomic_fetch_or(&this->__a_, __op, __m);} - _LIBCUDACXX_INLINE_VISIBILITY - _Tp fetch_or(_Tp __op, memory_order __m = memory_order_seq_cst) const noexcept - {return __cxx_atomic_fetch_or(&this->__a_, __op, __m);} - _LIBCUDACXX_INLINE_VISIBILITY - _Tp fetch_xor(_Tp __op, memory_order __m = memory_order_seq_cst) const volatile noexcept - {return __cxx_atomic_fetch_xor(&this->__a_, __op, __m);} - _LIBCUDACXX_INLINE_VISIBILITY - _Tp fetch_xor(_Tp __op, memory_order __m = memory_order_seq_cst) const noexcept - {return __cxx_atomic_fetch_xor(&this->__a_, __op, __m);} - - _LIBCUDACXX_INLINE_VISIBILITY - _Tp operator&=(_Tp __op) const volatile noexcept {return fetch_and(__op) & __op;} - _LIBCUDACXX_INLINE_VISIBILITY - _Tp operator&=(_Tp __op) const noexcept {return fetch_and(__op) & __op;} - _LIBCUDACXX_INLINE_VISIBILITY - _Tp operator|=(_Tp __op) const volatile noexcept {return fetch_or(__op) | __op;} - _LIBCUDACXX_INLINE_VISIBILITY - _Tp operator|=(_Tp __op) const noexcept {return fetch_or(__op) | __op;} - _LIBCUDACXX_INLINE_VISIBILITY - _Tp operator^=(_Tp __op) const volatile noexcept {return fetch_xor(__op) ^ __op;} - _LIBCUDACXX_INLINE_VISIBILITY - _Tp operator^=(_Tp __op) const noexcept {return fetch_xor(__op) ^ __op;} +struct __atomic_base_bitwise<_Tp, true, _Storage> : public __atomic_base_arithmetic<_Tp, true, _Storage> +{ + __atomic_base_bitwise() = default; + __atomic_base_bitwise(const __atomic_base_bitwise&) = default; + __atomic_base_bitwise(__atomic_base_bitwise&&) = default; + + __atomic_base_bitwise& operator=(const __atomic_base_bitwise&) = default; + __atomic_base_bitwise& operator=(__atomic_base_bitwise&&) = default; + + _LIBCUDACXX_INLINE_VISIBILITY constexpr __atomic_base_bitwise(_Storage&& __a) noexcept + : __atomic_base_arithmetic<_Tp, true, _Storage>(_CUDA_VSTD::forward<_Storage>(__a)) + {} + + _LIBCUDACXX_INLINE_VISIBILITY _Tp fetch_and(_Tp __op, memory_order __m = memory_order_seq_cst) const volatile noexcept + { + return __cxx_atomic_fetch_and(&this->__a_, __op, __m); + } + _LIBCUDACXX_INLINE_VISIBILITY _Tp fetch_and(_Tp __op, memory_order __m = memory_order_seq_cst) const noexcept + { + return __cxx_atomic_fetch_and(&this->__a_, __op, __m); + } + _LIBCUDACXX_INLINE_VISIBILITY _Tp fetch_or(_Tp __op, memory_order __m = memory_order_seq_cst) const volatile noexcept + { + return __cxx_atomic_fetch_or(&this->__a_, __op, __m); + } + _LIBCUDACXX_INLINE_VISIBILITY _Tp fetch_or(_Tp __op, 
memory_order __m = memory_order_seq_cst) const noexcept + { + return __cxx_atomic_fetch_or(&this->__a_, __op, __m); + } + _LIBCUDACXX_INLINE_VISIBILITY _Tp fetch_xor(_Tp __op, memory_order __m = memory_order_seq_cst) const volatile noexcept + { + return __cxx_atomic_fetch_xor(&this->__a_, __op, __m); + } + _LIBCUDACXX_INLINE_VISIBILITY _Tp fetch_xor(_Tp __op, memory_order __m = memory_order_seq_cst) const noexcept + { + return __cxx_atomic_fetch_xor(&this->__a_, __op, __m); + } + + _LIBCUDACXX_INLINE_VISIBILITY _Tp operator&=(_Tp __op) const volatile noexcept + { + return fetch_and(__op) & __op; + } + _LIBCUDACXX_INLINE_VISIBILITY _Tp operator&=(_Tp __op) const noexcept + { + return fetch_and(__op) & __op; + } + _LIBCUDACXX_INLINE_VISIBILITY _Tp operator|=(_Tp __op) const volatile noexcept + { + return fetch_or(__op) | __op; + } + _LIBCUDACXX_INLINE_VISIBILITY _Tp operator|=(_Tp __op) const noexcept + { + return fetch_or(__op) | __op; + } + _LIBCUDACXX_INLINE_VISIBILITY _Tp operator^=(_Tp __op) const volatile noexcept + { + return fetch_xor(__op) ^ __op; + } + _LIBCUDACXX_INLINE_VISIBILITY _Tp operator^=(_Tp __op) const noexcept + { + return fetch_xor(__op) ^ __op; + } }; template -using __atomic_select_base = __conditional_t::value, - __atomic_base_arithmetic<_Tp, _Cq, _Storage>, - __conditional_t::value, - __atomic_base_bitwise<_Tp, _Cq, _Storage>, - __atomic_base_core<_Tp, _Cq, _Storage> >>; +using __atomic_select_base = + __conditional_t::value, + __atomic_base_arithmetic<_Tp, _Cq, _Storage>, + __conditional_t::value, + __atomic_base_bitwise<_Tp, _Cq, _Storage>, + __atomic_base_core<_Tp, _Cq, _Storage>>>; template >> -struct __atomic_base : public _Base { - __atomic_base() = default; - __atomic_base(const __atomic_base&) = delete; - __atomic_base(__atomic_base&&) = delete; +struct __atomic_base : public _Base +{ + __atomic_base() = default; + __atomic_base(const __atomic_base&) = delete; + __atomic_base(__atomic_base&&) = delete; - __atomic_base& operator=(const __atomic_base&) = delete; - __atomic_base& operator=(__atomic_base&&) = delete; + __atomic_base& operator=(const __atomic_base&) = delete; + __atomic_base& operator=(__atomic_base&&) = delete; - _LIBCUDACXX_INLINE_VISIBILITY constexpr - __atomic_base(const _Tp& __a) noexcept : - _Base(__cxx_atomic_impl<_Tp, _Sco>(__a)) {} + _LIBCUDACXX_INLINE_VISIBILITY constexpr __atomic_base(const _Tp& __a) noexcept + : _Base(__cxx_atomic_impl<_Tp, _Sco>(__a)) + {} }; template >> -struct __atomic_base_ref : public _Base { - __atomic_base_ref() = default; - __atomic_base_ref(const __atomic_base_ref&) = default; - __atomic_base_ref(__atomic_base_ref&&) = default; +struct __atomic_base_ref : public _Base +{ + __atomic_base_ref() = default; + __atomic_base_ref(const __atomic_base_ref&) = default; + __atomic_base_ref(__atomic_base_ref&&) = default; - __atomic_base_ref& operator=(const __atomic_base_ref&) = default; - __atomic_base_ref& operator=(__atomic_base_ref&&) = default; + __atomic_base_ref& operator=(const __atomic_base_ref&) = default; + __atomic_base_ref& operator=(__atomic_base_ref&&) = default; - _LIBCUDACXX_INLINE_VISIBILITY constexpr - __atomic_base_ref(_Tp& __a) noexcept : - _Base(__cxx_atomic_ref_impl<_Tp, _Sco>(__a)) {} + _LIBCUDACXX_INLINE_VISIBILITY constexpr __atomic_base_ref(_Tp& __a) noexcept + : _Base(__cxx_atomic_ref_impl<_Tp, _Sco>(__a)) + {} }; #if defined(_LIBCUDACXX_ATOMIC_ALWAYS_LOCK_FREE) @@ -1779,1059 +2082,918 @@ constexpr bool __atomic_base_core<_Tp, _Cq, _Storage>::is_always_lock_free; // atomic template -struct 
atomic - : public __atomic_base<_Tp> +struct atomic : public __atomic_base<_Tp> { - typedef __atomic_base<_Tp> __base; - using value_type = _Tp; + typedef __atomic_base<_Tp> __base; + using value_type = _Tp; - atomic() noexcept = default; - _LIBCUDACXX_INLINE_VISIBILITY - constexpr atomic(_Tp __d) noexcept : __base(__d) {} - - _LIBCUDACXX_INLINE_VISIBILITY - _Tp operator=(_Tp __d) volatile noexcept - {__base::store(__d); return __d;} - _LIBCUDACXX_INLINE_VISIBILITY - _Tp operator=(_Tp __d) noexcept - {__base::store(__d); return __d;} + atomic() noexcept = default; + _LIBCUDACXX_INLINE_VISIBILITY constexpr atomic(_Tp __d) noexcept + : __base(__d) + {} + + _LIBCUDACXX_INLINE_VISIBILITY _Tp operator=(_Tp __d) volatile noexcept + { + __base::store(__d); + return __d; + } + _LIBCUDACXX_INLINE_VISIBILITY _Tp operator=(_Tp __d) noexcept + { + __base::store(__d); + return __d; + } }; // atomic template -struct atomic<_Tp*> - : public __atomic_base<_Tp*> +struct atomic<_Tp*> : public __atomic_base<_Tp*> { - typedef __atomic_base<_Tp*> __base; - using value_type = _Tp*; + typedef __atomic_base<_Tp*> __base; + using value_type = _Tp*; - atomic() noexcept = default; - _LIBCUDACXX_INLINE_VISIBILITY - constexpr atomic(_Tp* __d) noexcept : __base(__d) {} - - _LIBCUDACXX_INLINE_VISIBILITY - _Tp* operator=(_Tp* __d) volatile noexcept - {__base::store(__d); return __d;} - _LIBCUDACXX_INLINE_VISIBILITY - _Tp* operator=(_Tp* __d) noexcept - {__base::store(__d); return __d;} - - _LIBCUDACXX_INLINE_VISIBILITY - _Tp* fetch_add(ptrdiff_t __op, memory_order __m = memory_order_seq_cst) - volatile noexcept - {return __cxx_atomic_fetch_add(&this->__a_, __op, __m);} - _LIBCUDACXX_INLINE_VISIBILITY - _Tp* fetch_add(ptrdiff_t __op, memory_order __m = memory_order_seq_cst) - noexcept - {return __cxx_atomic_fetch_add(&this->__a_, __op, __m);} - _LIBCUDACXX_INLINE_VISIBILITY - _Tp* fetch_sub(ptrdiff_t __op, memory_order __m = memory_order_seq_cst) - volatile noexcept - {return __cxx_atomic_fetch_sub(&this->__a_, __op, __m);} - _LIBCUDACXX_INLINE_VISIBILITY - _Tp* fetch_sub(ptrdiff_t __op, memory_order __m = memory_order_seq_cst) - noexcept - {return __cxx_atomic_fetch_sub(&this->__a_, __op, __m);} - - _LIBCUDACXX_INLINE_VISIBILITY - _Tp* operator++(int) volatile noexcept {return fetch_add(1);} - _LIBCUDACXX_INLINE_VISIBILITY - _Tp* operator++(int) noexcept {return fetch_add(1);} - _LIBCUDACXX_INLINE_VISIBILITY - _Tp* operator--(int) volatile noexcept {return fetch_sub(1);} - _LIBCUDACXX_INLINE_VISIBILITY - _Tp* operator--(int) noexcept {return fetch_sub(1);} - _LIBCUDACXX_INLINE_VISIBILITY - _Tp* operator++() volatile noexcept {return fetch_add(1) + 1;} - _LIBCUDACXX_INLINE_VISIBILITY - _Tp* operator++() noexcept {return fetch_add(1) + 1;} - _LIBCUDACXX_INLINE_VISIBILITY - _Tp* operator--() volatile noexcept {return fetch_sub(1) - 1;} - _LIBCUDACXX_INLINE_VISIBILITY - _Tp* operator--() noexcept {return fetch_sub(1) - 1;} - _LIBCUDACXX_INLINE_VISIBILITY - _Tp* operator+=(ptrdiff_t __op) volatile noexcept {return fetch_add(__op) + __op;} - _LIBCUDACXX_INLINE_VISIBILITY - _Tp* operator+=(ptrdiff_t __op) noexcept {return fetch_add(__op) + __op;} - _LIBCUDACXX_INLINE_VISIBILITY - _Tp* operator-=(ptrdiff_t __op) volatile noexcept {return fetch_sub(__op) - __op;} - _LIBCUDACXX_INLINE_VISIBILITY - _Tp* operator-=(ptrdiff_t __op) noexcept {return fetch_sub(__op) - __op;} + atomic() noexcept = default; + _LIBCUDACXX_INLINE_VISIBILITY constexpr atomic(_Tp* __d) noexcept + : __base(__d) + {} + + _LIBCUDACXX_INLINE_VISIBILITY _Tp* 
operator=(_Tp* __d) volatile noexcept + { + __base::store(__d); + return __d; + } + _LIBCUDACXX_INLINE_VISIBILITY _Tp* operator=(_Tp* __d) noexcept + { + __base::store(__d); + return __d; + } + + _LIBCUDACXX_INLINE_VISIBILITY _Tp* fetch_add(ptrdiff_t __op, memory_order __m = memory_order_seq_cst) volatile noexcept + { + return __cxx_atomic_fetch_add(&this->__a_, __op, __m); + } + _LIBCUDACXX_INLINE_VISIBILITY _Tp* fetch_add(ptrdiff_t __op, memory_order __m = memory_order_seq_cst) noexcept + { + return __cxx_atomic_fetch_add(&this->__a_, __op, __m); + } + _LIBCUDACXX_INLINE_VISIBILITY _Tp* fetch_sub(ptrdiff_t __op, memory_order __m = memory_order_seq_cst) volatile noexcept + { + return __cxx_atomic_fetch_sub(&this->__a_, __op, __m); + } + _LIBCUDACXX_INLINE_VISIBILITY _Tp* fetch_sub(ptrdiff_t __op, memory_order __m = memory_order_seq_cst) noexcept + { + return __cxx_atomic_fetch_sub(&this->__a_, __op, __m); + } + + _LIBCUDACXX_INLINE_VISIBILITY _Tp* operator++(int) volatile noexcept + { + return fetch_add(1); + } + _LIBCUDACXX_INLINE_VISIBILITY _Tp* operator++(int) noexcept + { + return fetch_add(1); + } + _LIBCUDACXX_INLINE_VISIBILITY _Tp* operator--(int) volatile noexcept + { + return fetch_sub(1); + } + _LIBCUDACXX_INLINE_VISIBILITY _Tp* operator--(int) noexcept + { + return fetch_sub(1); + } + _LIBCUDACXX_INLINE_VISIBILITY _Tp* operator++() volatile noexcept + { + return fetch_add(1) + 1; + } + _LIBCUDACXX_INLINE_VISIBILITY _Tp* operator++() noexcept + { + return fetch_add(1) + 1; + } + _LIBCUDACXX_INLINE_VISIBILITY _Tp* operator--() volatile noexcept + { + return fetch_sub(1) - 1; + } + _LIBCUDACXX_INLINE_VISIBILITY _Tp* operator--() noexcept + { + return fetch_sub(1) - 1; + } + _LIBCUDACXX_INLINE_VISIBILITY _Tp* operator+=(ptrdiff_t __op) volatile noexcept + { + return fetch_add(__op) + __op; + } + _LIBCUDACXX_INLINE_VISIBILITY _Tp* operator+=(ptrdiff_t __op) noexcept + { + return fetch_add(__op) + __op; + } + _LIBCUDACXX_INLINE_VISIBILITY _Tp* operator-=(ptrdiff_t __op) volatile noexcept + { + return fetch_sub(__op) - __op; + } + _LIBCUDACXX_INLINE_VISIBILITY _Tp* operator-=(ptrdiff_t __op) noexcept + { + return fetch_sub(__op) - __op; + } }; // atomic_ref template - struct atomic_ref - : public __atomic_base_ref<_Tp> +struct atomic_ref : public __atomic_base_ref<_Tp> { - typedef __atomic_base_ref<_Tp> __base; - using value_type = _Tp; + typedef __atomic_base_ref<_Tp> __base; + using value_type = _Tp; - static constexpr size_t required_alignment = sizeof(_Tp); + static constexpr size_t required_alignment = sizeof(_Tp); - static constexpr bool is_always_lock_free = sizeof(_Tp) <= 8; + static constexpr bool is_always_lock_free = sizeof(_Tp) <= 8; - _LIBCUDACXX_INLINE_VISIBILITY - explicit atomic_ref(_Tp& __ref) : __base(__ref) {} + _LIBCUDACXX_INLINE_VISIBILITY explicit atomic_ref(_Tp& __ref) + : __base(__ref) + {} - _LIBCUDACXX_INLINE_VISIBILITY - _Tp operator=(_Tp __v) const volatile noexcept {__base::store(__v); return __v;} + _LIBCUDACXX_INLINE_VISIBILITY _Tp operator=(_Tp __v) const volatile noexcept + { + __base::store(__v); + return __v; + } }; // atomic_ref template - struct atomic_ref<_Tp*> - : public __atomic_base_ref<_Tp*> +struct atomic_ref<_Tp*> : public __atomic_base_ref<_Tp*> { - typedef __atomic_base_ref<_Tp*> __base; - using value_type = _Tp*; + typedef __atomic_base_ref<_Tp*> __base; + using value_type = _Tp*; - static constexpr size_t required_alignment = sizeof(_Tp*); + static constexpr size_t required_alignment = sizeof(_Tp*); - static constexpr bool 
is_always_lock_free = sizeof(_Tp*) <= 8; + static constexpr bool is_always_lock_free = sizeof(_Tp*) <= 8; - _LIBCUDACXX_INLINE_VISIBILITY - explicit atomic_ref(_Tp*& __ref) : __base(__ref) {} + _LIBCUDACXX_INLINE_VISIBILITY explicit atomic_ref(_Tp*& __ref) + : __base(__ref) + {} - _LIBCUDACXX_INLINE_VISIBILITY - _Tp* operator=(_Tp* __v) const noexcept {__base::store(__v); return __v;} + _LIBCUDACXX_INLINE_VISIBILITY _Tp* operator=(_Tp* __v) const noexcept + { + __base::store(__v); + return __v; + } - _LIBCUDACXX_INLINE_VISIBILITY - _Tp* fetch_add(ptrdiff_t __op, memory_order __m = memory_order_seq_cst) - const noexcept - {return __cxx_atomic_fetch_add(&this->__a_, __op, __m);} - _LIBCUDACXX_INLINE_VISIBILITY - _Tp* fetch_sub(ptrdiff_t __op, memory_order __m = memory_order_seq_cst) - const noexcept - {return __cxx_atomic_fetch_sub(&this->__a_, __op, __m);} + _LIBCUDACXX_INLINE_VISIBILITY _Tp* fetch_add(ptrdiff_t __op, memory_order __m = memory_order_seq_cst) const noexcept + { + return __cxx_atomic_fetch_add(&this->__a_, __op, __m); + } + _LIBCUDACXX_INLINE_VISIBILITY _Tp* fetch_sub(ptrdiff_t __op, memory_order __m = memory_order_seq_cst) const noexcept + { + return __cxx_atomic_fetch_sub(&this->__a_, __op, __m); + } - _LIBCUDACXX_INLINE_VISIBILITY - _Tp* operator++(int) const noexcept {return fetch_add(1);} - _LIBCUDACXX_INLINE_VISIBILITY - _Tp* operator--(int) const noexcept {return fetch_sub(1);} - _LIBCUDACXX_INLINE_VISIBILITY - _Tp* operator++() const noexcept {return fetch_add(1) + 1;} - _LIBCUDACXX_INLINE_VISIBILITY - _Tp* operator--() const noexcept {return fetch_sub(1) - 1;} - _LIBCUDACXX_INLINE_VISIBILITY - _Tp* operator+=(ptrdiff_t __op) const noexcept {return fetch_add(__op) + __op;} - _LIBCUDACXX_INLINE_VISIBILITY - _Tp* operator-=(ptrdiff_t __op) const noexcept {return fetch_sub(__op) - __op;} + _LIBCUDACXX_INLINE_VISIBILITY _Tp* operator++(int) const noexcept + { + return fetch_add(1); + } + _LIBCUDACXX_INLINE_VISIBILITY _Tp* operator--(int) const noexcept + { + return fetch_sub(1); + } + _LIBCUDACXX_INLINE_VISIBILITY _Tp* operator++() const noexcept + { + return fetch_add(1) + 1; + } + _LIBCUDACXX_INLINE_VISIBILITY _Tp* operator--() const noexcept + { + return fetch_sub(1) - 1; + } + _LIBCUDACXX_INLINE_VISIBILITY _Tp* operator+=(ptrdiff_t __op) const noexcept + { + return fetch_add(__op) + __op; + } + _LIBCUDACXX_INLINE_VISIBILITY _Tp* operator-=(ptrdiff_t __op) const noexcept + { + return fetch_sub(__op) - __op; + } }; // atomic_is_lock_free template -_LIBCUDACXX_INLINE_VISIBILITY -bool -atomic_is_lock_free(const volatile atomic<_Tp>* __o) noexcept +_LIBCUDACXX_INLINE_VISIBILITY bool atomic_is_lock_free(const volatile atomic<_Tp>* __o) noexcept { - return __o->is_lock_free(); + return __o->is_lock_free(); } template -_LIBCUDACXX_INLINE_VISIBILITY -bool -atomic_is_lock_free(const atomic<_Tp>* __o) noexcept +_LIBCUDACXX_INLINE_VISIBILITY bool atomic_is_lock_free(const atomic<_Tp>* __o) noexcept { - return __o->is_lock_free(); + return __o->is_lock_free(); } // atomic_init template -_LIBCUDACXX_INLINE_VISIBILITY -void -atomic_init(volatile atomic<_Tp>* __o, _Tp __d) noexcept +_LIBCUDACXX_INLINE_VISIBILITY void atomic_init(volatile atomic<_Tp>* __o, _Tp __d) noexcept { - __cxx_atomic_init(&__o->__a_, __d); + __cxx_atomic_init(&__o->__a_, __d); } template -_LIBCUDACXX_INLINE_VISIBILITY -void -atomic_init(atomic<_Tp>* __o, _Tp __d) noexcept +_LIBCUDACXX_INLINE_VISIBILITY void atomic_init(atomic<_Tp>* __o, _Tp __d) noexcept { - __cxx_atomic_init(&__o->__a_, __d); + 
__cxx_atomic_init(&__o->__a_, __d); } // atomic_store template -_LIBCUDACXX_INLINE_VISIBILITY -void -atomic_store(volatile atomic<_Tp>* __o, _Tp __d) noexcept +_LIBCUDACXX_INLINE_VISIBILITY void atomic_store(volatile atomic<_Tp>* __o, _Tp __d) noexcept { - __o->store(__d); + __o->store(__d); } template -_LIBCUDACXX_INLINE_VISIBILITY -void -atomic_store(atomic<_Tp>* __o, _Tp __d) noexcept +_LIBCUDACXX_INLINE_VISIBILITY void atomic_store(atomic<_Tp>* __o, _Tp __d) noexcept { - __o->store(__d); + __o->store(__d); } // atomic_store_explicit template -_LIBCUDACXX_INLINE_VISIBILITY -void -atomic_store_explicit(volatile atomic<_Tp>* __o, _Tp __d, memory_order __m) noexcept +_LIBCUDACXX_INLINE_VISIBILITY void atomic_store_explicit(volatile atomic<_Tp>* __o, _Tp __d, memory_order __m) noexcept _LIBCUDACXX_CHECK_STORE_MEMORY_ORDER(__m) { - __o->store(__d, __m); + __o->store(__d, __m); } template -_LIBCUDACXX_INLINE_VISIBILITY -void -atomic_store_explicit(atomic<_Tp>* __o, _Tp __d, memory_order __m) noexcept +_LIBCUDACXX_INLINE_VISIBILITY void atomic_store_explicit(atomic<_Tp>* __o, _Tp __d, memory_order __m) noexcept _LIBCUDACXX_CHECK_STORE_MEMORY_ORDER(__m) { - __o->store(__d, __m); + __o->store(__d, __m); } // atomic_load template -_LIBCUDACXX_INLINE_VISIBILITY -_Tp -atomic_load(const volatile atomic<_Tp>* __o) noexcept +_LIBCUDACXX_INLINE_VISIBILITY _Tp atomic_load(const volatile atomic<_Tp>* __o) noexcept { - return __o->load(); + return __o->load(); } template -_LIBCUDACXX_INLINE_VISIBILITY -_Tp -atomic_load(const atomic<_Tp>* __o) noexcept +_LIBCUDACXX_INLINE_VISIBILITY _Tp atomic_load(const atomic<_Tp>* __o) noexcept { - return __o->load(); + return __o->load(); } // atomic_load_explicit template -_LIBCUDACXX_INLINE_VISIBILITY -_Tp -atomic_load_explicit(const volatile atomic<_Tp>* __o, memory_order __m) noexcept +_LIBCUDACXX_INLINE_VISIBILITY _Tp atomic_load_explicit(const volatile atomic<_Tp>* __o, memory_order __m) noexcept _LIBCUDACXX_CHECK_LOAD_MEMORY_ORDER(__m) { - return __o->load(__m); + return __o->load(__m); } template -_LIBCUDACXX_INLINE_VISIBILITY -_Tp -atomic_load_explicit(const atomic<_Tp>* __o, memory_order __m) noexcept +_LIBCUDACXX_INLINE_VISIBILITY _Tp atomic_load_explicit(const atomic<_Tp>* __o, memory_order __m) noexcept _LIBCUDACXX_CHECK_LOAD_MEMORY_ORDER(__m) { - return __o->load(__m); + return __o->load(__m); } // atomic_exchange template -_LIBCUDACXX_INLINE_VISIBILITY -_Tp -atomic_exchange(volatile atomic<_Tp>* __o, _Tp __d) noexcept +_LIBCUDACXX_INLINE_VISIBILITY _Tp atomic_exchange(volatile atomic<_Tp>* __o, _Tp __d) noexcept { - return __o->exchange(__d); + return __o->exchange(__d); } template -_LIBCUDACXX_INLINE_VISIBILITY -_Tp -atomic_exchange(atomic<_Tp>* __o, _Tp __d) noexcept +_LIBCUDACXX_INLINE_VISIBILITY _Tp atomic_exchange(atomic<_Tp>* __o, _Tp __d) noexcept { - return __o->exchange(__d); + return __o->exchange(__d); } // atomic_exchange_explicit template -_LIBCUDACXX_INLINE_VISIBILITY -_Tp -atomic_exchange_explicit(volatile atomic<_Tp>* __o, _Tp __d, memory_order __m) noexcept +_LIBCUDACXX_INLINE_VISIBILITY _Tp atomic_exchange_explicit(volatile atomic<_Tp>* __o, _Tp __d, memory_order __m) noexcept { - return __o->exchange(__d, __m); + return __o->exchange(__d, __m); } template -_LIBCUDACXX_INLINE_VISIBILITY -_Tp -atomic_exchange_explicit(atomic<_Tp>* __o, _Tp __d, memory_order __m) noexcept +_LIBCUDACXX_INLINE_VISIBILITY _Tp atomic_exchange_explicit(atomic<_Tp>* __o, _Tp __d, memory_order __m) noexcept { - return __o->exchange(__d, __m); + return 
__o->exchange(__d, __m); } // atomic_compare_exchange_weak template -_LIBCUDACXX_INLINE_VISIBILITY -bool -atomic_compare_exchange_weak(volatile atomic<_Tp>* __o, _Tp* __e, _Tp __d) noexcept +_LIBCUDACXX_INLINE_VISIBILITY bool atomic_compare_exchange_weak(volatile atomic<_Tp>* __o, _Tp* __e, _Tp __d) noexcept { - return __o->compare_exchange_weak(*__e, __d); + return __o->compare_exchange_weak(*__e, __d); } template -_LIBCUDACXX_INLINE_VISIBILITY -bool -atomic_compare_exchange_weak(atomic<_Tp>* __o, _Tp* __e, _Tp __d) noexcept +_LIBCUDACXX_INLINE_VISIBILITY bool atomic_compare_exchange_weak(atomic<_Tp>* __o, _Tp* __e, _Tp __d) noexcept { - return __o->compare_exchange_weak(*__e, __d); + return __o->compare_exchange_weak(*__e, __d); } // atomic_compare_exchange_strong template -_LIBCUDACXX_INLINE_VISIBILITY -bool -atomic_compare_exchange_strong(volatile atomic<_Tp>* __o, _Tp* __e, _Tp __d) noexcept +_LIBCUDACXX_INLINE_VISIBILITY bool atomic_compare_exchange_strong(volatile atomic<_Tp>* __o, _Tp* __e, _Tp __d) noexcept { - return __o->compare_exchange_strong(*__e, __d); + return __o->compare_exchange_strong(*__e, __d); } template -_LIBCUDACXX_INLINE_VISIBILITY -bool -atomic_compare_exchange_strong(atomic<_Tp>* __o, _Tp* __e, _Tp __d) noexcept +_LIBCUDACXX_INLINE_VISIBILITY bool atomic_compare_exchange_strong(atomic<_Tp>* __o, _Tp* __e, _Tp __d) noexcept { - return __o->compare_exchange_strong(*__e, __d); + return __o->compare_exchange_strong(*__e, __d); } // atomic_compare_exchange_weak_explicit template -_LIBCUDACXX_INLINE_VISIBILITY -bool -atomic_compare_exchange_weak_explicit(volatile atomic<_Tp>* __o, _Tp* __e, - _Tp __d, - memory_order __s, memory_order __f) noexcept +_LIBCUDACXX_INLINE_VISIBILITY bool atomic_compare_exchange_weak_explicit( + volatile atomic<_Tp>* __o, _Tp* __e, _Tp __d, memory_order __s, memory_order __f) noexcept _LIBCUDACXX_CHECK_EXCHANGE_MEMORY_ORDER(__s, __f) { - return __o->compare_exchange_weak(*__e, __d, __s, __f); + return __o->compare_exchange_weak(*__e, __d, __s, __f); } template -_LIBCUDACXX_INLINE_VISIBILITY -bool -atomic_compare_exchange_weak_explicit(atomic<_Tp>* __o, _Tp* __e, _Tp __d, - memory_order __s, memory_order __f) noexcept +_LIBCUDACXX_INLINE_VISIBILITY bool +atomic_compare_exchange_weak_explicit(atomic<_Tp>* __o, _Tp* __e, _Tp __d, memory_order __s, memory_order __f) noexcept _LIBCUDACXX_CHECK_EXCHANGE_MEMORY_ORDER(__s, __f) { - return __o->compare_exchange_weak(*__e, __d, __s, __f); + return __o->compare_exchange_weak(*__e, __d, __s, __f); } // atomic_compare_exchange_strong_explicit template -_LIBCUDACXX_INLINE_VISIBILITY -bool -atomic_compare_exchange_strong_explicit(volatile atomic<_Tp>* __o, - _Tp* __e, _Tp __d, - memory_order __s, memory_order __f) noexcept +_LIBCUDACXX_INLINE_VISIBILITY bool atomic_compare_exchange_strong_explicit( + volatile atomic<_Tp>* __o, _Tp* __e, _Tp __d, memory_order __s, memory_order __f) noexcept _LIBCUDACXX_CHECK_EXCHANGE_MEMORY_ORDER(__s, __f) { - return __o->compare_exchange_strong(*__e, __d, __s, __f); + return __o->compare_exchange_strong(*__e, __d, __s, __f); } template -_LIBCUDACXX_INLINE_VISIBILITY -bool -atomic_compare_exchange_strong_explicit(atomic<_Tp>* __o, _Tp* __e, - _Tp __d, - memory_order __s, memory_order __f) noexcept +_LIBCUDACXX_INLINE_VISIBILITY bool atomic_compare_exchange_strong_explicit( + atomic<_Tp>* __o, _Tp* __e, _Tp __d, memory_order __s, memory_order __f) noexcept _LIBCUDACXX_CHECK_EXCHANGE_MEMORY_ORDER(__s, __f) { - return __o->compare_exchange_strong(*__e, __d, __s, __f); + 
return __o->compare_exchange_strong(*__e, __d, __s, __f); } // atomic_wait template -_LIBCUDACXX_INLINE_VISIBILITY -void atomic_wait(const volatile atomic<_Tp>* __o, - typename atomic<_Tp>::value_type __v) noexcept +_LIBCUDACXX_INLINE_VISIBILITY void +atomic_wait(const volatile atomic<_Tp>* __o, typename atomic<_Tp>::value_type __v) noexcept { - return __o->wait(__v); + return __o->wait(__v); } template -_LIBCUDACXX_INLINE_VISIBILITY -void atomic_wait(const atomic<_Tp>* __o, - typename atomic<_Tp>::value_type __v) noexcept +_LIBCUDACXX_INLINE_VISIBILITY void atomic_wait(const atomic<_Tp>* __o, typename atomic<_Tp>::value_type __v) noexcept { - return __o->wait(__v); + return __o->wait(__v); } // atomic_wait_explicit template -_LIBCUDACXX_INLINE_VISIBILITY -void atomic_wait_explicit(const volatile atomic<_Tp>* __o, - typename atomic<_Tp>::value_type __v, - memory_order __m) noexcept +_LIBCUDACXX_INLINE_VISIBILITY void +atomic_wait_explicit(const volatile atomic<_Tp>* __o, typename atomic<_Tp>::value_type __v, memory_order __m) noexcept _LIBCUDACXX_CHECK_LOAD_MEMORY_ORDER(__m) { - return __o->wait(__v, __m); + return __o->wait(__v, __m); } template -_LIBCUDACXX_INLINE_VISIBILITY -void atomic_wait_explicit(const atomic<_Tp>* __o, - typename atomic<_Tp>::value_type __v, - memory_order __m) noexcept +_LIBCUDACXX_INLINE_VISIBILITY void +atomic_wait_explicit(const atomic<_Tp>* __o, typename atomic<_Tp>::value_type __v, memory_order __m) noexcept _LIBCUDACXX_CHECK_LOAD_MEMORY_ORDER(__m) { - return __o->wait(__v, __m); + return __o->wait(__v, __m); } // atomic_notify_one template -_LIBCUDACXX_INLINE_VISIBILITY -void atomic_notify_one(volatile atomic<_Tp>* __o) noexcept +_LIBCUDACXX_INLINE_VISIBILITY void atomic_notify_one(volatile atomic<_Tp>* __o) noexcept { - __o->notify_one(); + __o->notify_one(); } template -_LIBCUDACXX_INLINE_VISIBILITY -void atomic_notify_one(atomic<_Tp>* __o) noexcept +_LIBCUDACXX_INLINE_VISIBILITY void atomic_notify_one(atomic<_Tp>* __o) noexcept { - __o->notify_one(); + __o->notify_one(); } // atomic_notify_one template -_LIBCUDACXX_INLINE_VISIBILITY -void atomic_notify_all(volatile atomic<_Tp>* __o) noexcept +_LIBCUDACXX_INLINE_VISIBILITY void atomic_notify_all(volatile atomic<_Tp>* __o) noexcept { - __o->notify_all(); + __o->notify_all(); } template -_LIBCUDACXX_INLINE_VISIBILITY -void atomic_notify_all(atomic<_Tp>* __o) noexcept +_LIBCUDACXX_INLINE_VISIBILITY void atomic_notify_all(atomic<_Tp>* __o) noexcept { - __o->notify_all(); + __o->notify_all(); } // atomic_fetch_add template _LIBCUDACXX_INLINE_VISIBILITY -__enable_if_t -< - (is_integral<_Tp>::value && !is_same<_Tp, bool>::value) || is_floating_point<_Tp>::value, - _Tp -> -atomic_fetch_add(volatile atomic<_Tp>* __o, _Tp __op) noexcept + __enable_if_t<(is_integral<_Tp>::value && !is_same<_Tp, bool>::value) || is_floating_point<_Tp>::value, _Tp> + atomic_fetch_add(volatile atomic<_Tp>* __o, _Tp __op) noexcept { - return __o->fetch_add(__op); + return __o->fetch_add(__op); } template _LIBCUDACXX_INLINE_VISIBILITY -__enable_if_t -< - (is_integral<_Tp>::value && !is_same<_Tp, bool>::value) || is_floating_point<_Tp>::value, - _Tp -> -atomic_fetch_add(atomic<_Tp>* __o, _Tp __op) noexcept + __enable_if_t<(is_integral<_Tp>::value && !is_same<_Tp, bool>::value) || is_floating_point<_Tp>::value, _Tp> + atomic_fetch_add(atomic<_Tp>* __o, _Tp __op) noexcept { - return __o->fetch_add(__op); + return __o->fetch_add(__op); } template -_LIBCUDACXX_INLINE_VISIBILITY -_Tp* -atomic_fetch_add(volatile atomic<_Tp*>* __o, ptrdiff_t 
__op) noexcept +_LIBCUDACXX_INLINE_VISIBILITY _Tp* atomic_fetch_add(volatile atomic<_Tp*>* __o, ptrdiff_t __op) noexcept { - return __o->fetch_add(__op); + return __o->fetch_add(__op); } template -_LIBCUDACXX_INLINE_VISIBILITY -_Tp* -atomic_fetch_add(atomic<_Tp*>* __o, ptrdiff_t __op) noexcept +_LIBCUDACXX_INLINE_VISIBILITY _Tp* atomic_fetch_add(atomic<_Tp*>* __o, ptrdiff_t __op) noexcept { - return __o->fetch_add(__op); + return __o->fetch_add(__op); } // atomic_fetch_add_explicit template _LIBCUDACXX_INLINE_VISIBILITY -__enable_if_t -< - (is_integral<_Tp>::value && !is_same<_Tp, bool>::value) || is_floating_point<_Tp>::value, - _Tp -> -atomic_fetch_add_explicit(volatile atomic<_Tp>* __o, _Tp __op, memory_order __m) noexcept + __enable_if_t<(is_integral<_Tp>::value && !is_same<_Tp, bool>::value) || is_floating_point<_Tp>::value, _Tp> + atomic_fetch_add_explicit(volatile atomic<_Tp>* __o, _Tp __op, memory_order __m) noexcept { - return __o->fetch_add(__op, __m); + return __o->fetch_add(__op, __m); } template _LIBCUDACXX_INLINE_VISIBILITY -__enable_if_t -< - (is_integral<_Tp>::value && !is_same<_Tp, bool>::value) || is_floating_point<_Tp>::value, - _Tp -> -atomic_fetch_add_explicit(atomic<_Tp>* __o, _Tp __op, memory_order __m) noexcept + __enable_if_t<(is_integral<_Tp>::value && !is_same<_Tp, bool>::value) || is_floating_point<_Tp>::value, _Tp> + atomic_fetch_add_explicit(atomic<_Tp>* __o, _Tp __op, memory_order __m) noexcept { - return __o->fetch_add(__op, __m); + return __o->fetch_add(__op, __m); } template -_LIBCUDACXX_INLINE_VISIBILITY -_Tp* -atomic_fetch_add_explicit(volatile atomic<_Tp*>* __o, ptrdiff_t __op, - memory_order __m) noexcept +_LIBCUDACXX_INLINE_VISIBILITY _Tp* +atomic_fetch_add_explicit(volatile atomic<_Tp*>* __o, ptrdiff_t __op, memory_order __m) noexcept { - return __o->fetch_add(__op, __m); + return __o->fetch_add(__op, __m); } template -_LIBCUDACXX_INLINE_VISIBILITY -_Tp* +_LIBCUDACXX_INLINE_VISIBILITY _Tp* atomic_fetch_add_explicit(atomic<_Tp*>* __o, ptrdiff_t __op, memory_order __m) noexcept { - return __o->fetch_add(__op, __m); + return __o->fetch_add(__op, __m); } // atomic_fetch_sub template _LIBCUDACXX_INLINE_VISIBILITY -__enable_if_t -< - (is_integral<_Tp>::value && !is_same<_Tp, bool>::value) || is_floating_point<_Tp>::value, - _Tp -> -atomic_fetch_sub(volatile atomic<_Tp>* __o, _Tp __op) noexcept + __enable_if_t<(is_integral<_Tp>::value && !is_same<_Tp, bool>::value) || is_floating_point<_Tp>::value, _Tp> + atomic_fetch_sub(volatile atomic<_Tp>* __o, _Tp __op) noexcept { - return __o->fetch_sub(__op); + return __o->fetch_sub(__op); } template _LIBCUDACXX_INLINE_VISIBILITY -__enable_if_t -< - (is_integral<_Tp>::value && !is_same<_Tp, bool>::value) || is_floating_point<_Tp>::value, - _Tp -> -atomic_fetch_sub(atomic<_Tp>* __o, _Tp __op) noexcept + __enable_if_t<(is_integral<_Tp>::value && !is_same<_Tp, bool>::value) || is_floating_point<_Tp>::value, _Tp> + atomic_fetch_sub(atomic<_Tp>* __o, _Tp __op) noexcept { - return __o->fetch_sub(__op); + return __o->fetch_sub(__op); } template -_LIBCUDACXX_INLINE_VISIBILITY -_Tp* -atomic_fetch_sub(volatile atomic<_Tp*>* __o, ptrdiff_t __op) noexcept +_LIBCUDACXX_INLINE_VISIBILITY _Tp* atomic_fetch_sub(volatile atomic<_Tp*>* __o, ptrdiff_t __op) noexcept { - return __o->fetch_sub(__op); + return __o->fetch_sub(__op); } template -_LIBCUDACXX_INLINE_VISIBILITY -_Tp* -atomic_fetch_sub(atomic<_Tp*>* __o, ptrdiff_t __op) noexcept +_LIBCUDACXX_INLINE_VISIBILITY _Tp* atomic_fetch_sub(atomic<_Tp*>* __o, ptrdiff_t __op) noexcept { - 
return __o->fetch_sub(__op); + return __o->fetch_sub(__op); } // atomic_fetch_sub_explicit template _LIBCUDACXX_INLINE_VISIBILITY -__enable_if_t -< - (is_integral<_Tp>::value && !is_same<_Tp, bool>::value) || is_floating_point<_Tp>::value, - _Tp -> -atomic_fetch_sub_explicit(volatile atomic<_Tp>* __o, _Tp __op, memory_order __m) noexcept + __enable_if_t<(is_integral<_Tp>::value && !is_same<_Tp, bool>::value) || is_floating_point<_Tp>::value, _Tp> + atomic_fetch_sub_explicit(volatile atomic<_Tp>* __o, _Tp __op, memory_order __m) noexcept { - return __o->fetch_sub(__op, __m); + return __o->fetch_sub(__op, __m); } template _LIBCUDACXX_INLINE_VISIBILITY -__enable_if_t -< - (is_integral<_Tp>::value && !is_same<_Tp, bool>::value) || is_floating_point<_Tp>::value, - _Tp -> -atomic_fetch_sub_explicit(atomic<_Tp>* __o, _Tp __op, memory_order __m) noexcept + __enable_if_t<(is_integral<_Tp>::value && !is_same<_Tp, bool>::value) || is_floating_point<_Tp>::value, _Tp> + atomic_fetch_sub_explicit(atomic<_Tp>* __o, _Tp __op, memory_order __m) noexcept { - return __o->fetch_sub(__op, __m); + return __o->fetch_sub(__op, __m); } template -_LIBCUDACXX_INLINE_VISIBILITY -_Tp* -atomic_fetch_sub_explicit(volatile atomic<_Tp*>* __o, ptrdiff_t __op, - memory_order __m) noexcept +_LIBCUDACXX_INLINE_VISIBILITY _Tp* +atomic_fetch_sub_explicit(volatile atomic<_Tp*>* __o, ptrdiff_t __op, memory_order __m) noexcept { - return __o->fetch_sub(__op, __m); + return __o->fetch_sub(__op, __m); } template -_LIBCUDACXX_INLINE_VISIBILITY -_Tp* +_LIBCUDACXX_INLINE_VISIBILITY _Tp* atomic_fetch_sub_explicit(atomic<_Tp*>* __o, ptrdiff_t __op, memory_order __m) noexcept { - return __o->fetch_sub(__op, __m); + return __o->fetch_sub(__op, __m); } // atomic_fetch_and template -_LIBCUDACXX_INLINE_VISIBILITY -__enable_if_t -< - is_integral<_Tp>::value && !is_same<_Tp, bool>::value, - _Tp -> +_LIBCUDACXX_INLINE_VISIBILITY __enable_if_t::value && !is_same<_Tp, bool>::value, _Tp> atomic_fetch_and(volatile atomic<_Tp>* __o, _Tp __op) noexcept { - return __o->fetch_and(__op); + return __o->fetch_and(__op); } template -_LIBCUDACXX_INLINE_VISIBILITY -__enable_if_t -< - is_integral<_Tp>::value && !is_same<_Tp, bool>::value, - _Tp -> +_LIBCUDACXX_INLINE_VISIBILITY __enable_if_t::value && !is_same<_Tp, bool>::value, _Tp> atomic_fetch_and(atomic<_Tp>* __o, _Tp __op) noexcept { - return __o->fetch_and(__op); + return __o->fetch_and(__op); } // atomic_fetch_and_explicit template -_LIBCUDACXX_INLINE_VISIBILITY -__enable_if_t -< - is_integral<_Tp>::value && !is_same<_Tp, bool>::value, - _Tp -> +_LIBCUDACXX_INLINE_VISIBILITY __enable_if_t::value && !is_same<_Tp, bool>::value, _Tp> atomic_fetch_and_explicit(volatile atomic<_Tp>* __o, _Tp __op, memory_order __m) noexcept { - return __o->fetch_and(__op, __m); + return __o->fetch_and(__op, __m); } template -_LIBCUDACXX_INLINE_VISIBILITY -__enable_if_t -< - is_integral<_Tp>::value && !is_same<_Tp, bool>::value, - _Tp -> +_LIBCUDACXX_INLINE_VISIBILITY __enable_if_t::value && !is_same<_Tp, bool>::value, _Tp> atomic_fetch_and_explicit(atomic<_Tp>* __o, _Tp __op, memory_order __m) noexcept { - return __o->fetch_and(__op, __m); + return __o->fetch_and(__op, __m); } // atomic_fetch_or template -_LIBCUDACXX_INLINE_VISIBILITY -__enable_if_t -< - is_integral<_Tp>::value && !is_same<_Tp, bool>::value, - _Tp -> +_LIBCUDACXX_INLINE_VISIBILITY __enable_if_t::value && !is_same<_Tp, bool>::value, _Tp> atomic_fetch_or(volatile atomic<_Tp>* __o, _Tp __op) noexcept { - return __o->fetch_or(__op); + return __o->fetch_or(__op); 
} template -_LIBCUDACXX_INLINE_VISIBILITY -__enable_if_t -< - is_integral<_Tp>::value && !is_same<_Tp, bool>::value, - _Tp -> +_LIBCUDACXX_INLINE_VISIBILITY __enable_if_t::value && !is_same<_Tp, bool>::value, _Tp> atomic_fetch_or(atomic<_Tp>* __o, _Tp __op) noexcept { - return __o->fetch_or(__op); + return __o->fetch_or(__op); } // atomic_fetch_or_explicit template -_LIBCUDACXX_INLINE_VISIBILITY -__enable_if_t -< - is_integral<_Tp>::value && !is_same<_Tp, bool>::value, - _Tp -> +_LIBCUDACXX_INLINE_VISIBILITY __enable_if_t::value && !is_same<_Tp, bool>::value, _Tp> atomic_fetch_or_explicit(volatile atomic<_Tp>* __o, _Tp __op, memory_order __m) noexcept { - return __o->fetch_or(__op, __m); + return __o->fetch_or(__op, __m); } template -_LIBCUDACXX_INLINE_VISIBILITY -__enable_if_t -< - is_integral<_Tp>::value && !is_same<_Tp, bool>::value, - _Tp -> +_LIBCUDACXX_INLINE_VISIBILITY __enable_if_t::value && !is_same<_Tp, bool>::value, _Tp> atomic_fetch_or_explicit(atomic<_Tp>* __o, _Tp __op, memory_order __m) noexcept { - return __o->fetch_or(__op, __m); + return __o->fetch_or(__op, __m); } // atomic_fetch_xor template -_LIBCUDACXX_INLINE_VISIBILITY -__enable_if_t -< - is_integral<_Tp>::value && !is_same<_Tp, bool>::value, - _Tp -> +_LIBCUDACXX_INLINE_VISIBILITY __enable_if_t::value && !is_same<_Tp, bool>::value, _Tp> atomic_fetch_xor(volatile atomic<_Tp>* __o, _Tp __op) noexcept { - return __o->fetch_xor(__op); + return __o->fetch_xor(__op); } template -_LIBCUDACXX_INLINE_VISIBILITY -__enable_if_t -< - is_integral<_Tp>::value && !is_same<_Tp, bool>::value, - _Tp -> +_LIBCUDACXX_INLINE_VISIBILITY __enable_if_t::value && !is_same<_Tp, bool>::value, _Tp> atomic_fetch_xor(atomic<_Tp>* __o, _Tp __op) noexcept { - return __o->fetch_xor(__op); + return __o->fetch_xor(__op); } // atomic_fetch_xor_explicit template -_LIBCUDACXX_INLINE_VISIBILITY -__enable_if_t -< - is_integral<_Tp>::value && !is_same<_Tp, bool>::value, - _Tp -> +_LIBCUDACXX_INLINE_VISIBILITY __enable_if_t::value && !is_same<_Tp, bool>::value, _Tp> atomic_fetch_xor_explicit(volatile atomic<_Tp>* __o, _Tp __op, memory_order __m) noexcept { - return __o->fetch_xor(__op, __m); + return __o->fetch_xor(__op, __m); } template -_LIBCUDACXX_INLINE_VISIBILITY -__enable_if_t -< - is_integral<_Tp>::value && !is_same<_Tp, bool>::value, - _Tp -> +_LIBCUDACXX_INLINE_VISIBILITY __enable_if_t::value && !is_same<_Tp, bool>::value, _Tp> atomic_fetch_xor_explicit(atomic<_Tp>* __o, _Tp __op, memory_order __m) noexcept { - return __o->fetch_xor(__op, __m); + return __o->fetch_xor(__op, __m); } // flag type and operations typedef struct atomic_flag { - __cxx_atomic_impl<_LIBCUDACXX_ATOMIC_FLAG_TYPE, 0> __a_; - - _LIBCUDACXX_INLINE_VISIBILITY - bool test(memory_order __m = memory_order_seq_cst) const volatile noexcept - {return _LIBCUDACXX_ATOMIC_FLAG_TYPE(true)==__cxx_atomic_load(&__a_, __m);} - _LIBCUDACXX_INLINE_VISIBILITY - bool test(memory_order __m = memory_order_seq_cst) const noexcept - {return _LIBCUDACXX_ATOMIC_FLAG_TYPE(true)==__cxx_atomic_load(&__a_, __m);} - - _LIBCUDACXX_INLINE_VISIBILITY - bool test_and_set(memory_order __m = memory_order_seq_cst) volatile noexcept - {return __cxx_atomic_exchange(&__a_, _LIBCUDACXX_ATOMIC_FLAG_TYPE(true), __m);} - _LIBCUDACXX_INLINE_VISIBILITY - bool test_and_set(memory_order __m = memory_order_seq_cst) noexcept - {return __cxx_atomic_exchange(&__a_, _LIBCUDACXX_ATOMIC_FLAG_TYPE(true), __m);} - _LIBCUDACXX_INLINE_VISIBILITY - void clear(memory_order __m = memory_order_seq_cst) volatile noexcept - 
{__cxx_atomic_store(&__a_, _LIBCUDACXX_ATOMIC_FLAG_TYPE(false), __m);} - _LIBCUDACXX_INLINE_VISIBILITY - void clear(memory_order __m = memory_order_seq_cst) noexcept - {__cxx_atomic_store(&__a_, _LIBCUDACXX_ATOMIC_FLAG_TYPE(false), __m);} + __cxx_atomic_impl<_LIBCUDACXX_ATOMIC_FLAG_TYPE, 0> __a_; + + _LIBCUDACXX_INLINE_VISIBILITY bool test(memory_order __m = memory_order_seq_cst) const volatile noexcept + { + return _LIBCUDACXX_ATOMIC_FLAG_TYPE(true) == __cxx_atomic_load(&__a_, __m); + } + _LIBCUDACXX_INLINE_VISIBILITY bool test(memory_order __m = memory_order_seq_cst) const noexcept + { + return _LIBCUDACXX_ATOMIC_FLAG_TYPE(true) == __cxx_atomic_load(&__a_, __m); + } + + _LIBCUDACXX_INLINE_VISIBILITY bool test_and_set(memory_order __m = memory_order_seq_cst) volatile noexcept + { + return __cxx_atomic_exchange(&__a_, _LIBCUDACXX_ATOMIC_FLAG_TYPE(true), __m); + } + _LIBCUDACXX_INLINE_VISIBILITY bool test_and_set(memory_order __m = memory_order_seq_cst) noexcept + { + return __cxx_atomic_exchange(&__a_, _LIBCUDACXX_ATOMIC_FLAG_TYPE(true), __m); + } + _LIBCUDACXX_INLINE_VISIBILITY void clear(memory_order __m = memory_order_seq_cst) volatile noexcept + { + __cxx_atomic_store(&__a_, _LIBCUDACXX_ATOMIC_FLAG_TYPE(false), __m); + } + _LIBCUDACXX_INLINE_VISIBILITY void clear(memory_order __m = memory_order_seq_cst) noexcept + { + __cxx_atomic_store(&__a_, _LIBCUDACXX_ATOMIC_FLAG_TYPE(false), __m); + } #if !defined(__CUDA_MINIMUM_ARCH__) || __CUDA_MINIMUM_ARCH__ >= 700 - _LIBCUDACXX_INLINE_VISIBILITY - void wait(bool __v, memory_order __m = memory_order_seq_cst) const volatile noexcept - {__cxx_atomic_wait(&__a_, _LIBCUDACXX_ATOMIC_FLAG_TYPE(__v), __m);} - _LIBCUDACXX_INLINE_VISIBILITY - void wait(bool __v, memory_order __m = memory_order_seq_cst) const noexcept - {__cxx_atomic_wait(&__a_, _LIBCUDACXX_ATOMIC_FLAG_TYPE(__v), __m);} - _LIBCUDACXX_INLINE_VISIBILITY - void notify_one() volatile noexcept - {__cxx_atomic_notify_one(&__a_);} - _LIBCUDACXX_INLINE_VISIBILITY - void notify_one() noexcept - {__cxx_atomic_notify_one(&__a_);} - _LIBCUDACXX_INLINE_VISIBILITY - void notify_all() volatile noexcept - {__cxx_atomic_notify_all(&__a_);} - _LIBCUDACXX_INLINE_VISIBILITY - void notify_all() noexcept - {__cxx_atomic_notify_all(&__a_);} + _LIBCUDACXX_INLINE_VISIBILITY void wait(bool __v, memory_order __m = memory_order_seq_cst) const volatile noexcept + { + __cxx_atomic_wait(&__a_, _LIBCUDACXX_ATOMIC_FLAG_TYPE(__v), __m); + } + _LIBCUDACXX_INLINE_VISIBILITY void wait(bool __v, memory_order __m = memory_order_seq_cst) const noexcept + { + __cxx_atomic_wait(&__a_, _LIBCUDACXX_ATOMIC_FLAG_TYPE(__v), __m); + } + _LIBCUDACXX_INLINE_VISIBILITY void notify_one() volatile noexcept + { + __cxx_atomic_notify_one(&__a_); + } + _LIBCUDACXX_INLINE_VISIBILITY void notify_one() noexcept + { + __cxx_atomic_notify_one(&__a_); + } + _LIBCUDACXX_INLINE_VISIBILITY void notify_all() volatile noexcept + { + __cxx_atomic_notify_all(&__a_); + } + _LIBCUDACXX_INLINE_VISIBILITY void notify_all() noexcept + { + __cxx_atomic_notify_all(&__a_); + } #endif - atomic_flag() noexcept = default; + atomic_flag() noexcept = default; - _LIBCUDACXX_INLINE_VISIBILITY constexpr - atomic_flag(bool __b) noexcept : __a_(__b) {} // EXTENSION + _LIBCUDACXX_INLINE_VISIBILITY constexpr atomic_flag(bool __b) noexcept + : __a_(__b) + {} // EXTENSION - atomic_flag(const atomic_flag&) = delete; - atomic_flag& operator=(const atomic_flag&) = delete; - atomic_flag& operator=(const atomic_flag&) volatile = delete; + atomic_flag(const atomic_flag&) = delete; + 
atomic_flag& operator=(const atomic_flag&) = delete; + atomic_flag& operator=(const atomic_flag&) volatile = delete; } atomic_flag; - -inline _LIBCUDACXX_INLINE_VISIBILITY -bool -atomic_flag_test(const volatile atomic_flag* __o) noexcept +inline _LIBCUDACXX_INLINE_VISIBILITY bool atomic_flag_test(const volatile atomic_flag* __o) noexcept { - return __o->test(); + return __o->test(); } -inline _LIBCUDACXX_INLINE_VISIBILITY -bool -atomic_flag_test(const atomic_flag* __o) noexcept +inline _LIBCUDACXX_INLINE_VISIBILITY bool atomic_flag_test(const atomic_flag* __o) noexcept { - return __o->test(); + return __o->test(); } -inline _LIBCUDACXX_INLINE_VISIBILITY -bool +inline _LIBCUDACXX_INLINE_VISIBILITY bool atomic_flag_test_explicit(const volatile atomic_flag* __o, memory_order __m) noexcept { - return __o->test(__m); + return __o->test(__m); } -inline _LIBCUDACXX_INLINE_VISIBILITY -bool -atomic_flag_test_explicit(const atomic_flag* __o, memory_order __m) noexcept +inline _LIBCUDACXX_INLINE_VISIBILITY bool atomic_flag_test_explicit(const atomic_flag* __o, memory_order __m) noexcept { - return __o->test(__m); + return __o->test(__m); } -inline _LIBCUDACXX_INLINE_VISIBILITY -bool -atomic_flag_test_and_set(volatile atomic_flag* __o) noexcept +inline _LIBCUDACXX_INLINE_VISIBILITY bool atomic_flag_test_and_set(volatile atomic_flag* __o) noexcept { - return __o->test_and_set(); + return __o->test_and_set(); } -inline _LIBCUDACXX_INLINE_VISIBILITY -bool -atomic_flag_test_and_set(atomic_flag* __o) noexcept +inline _LIBCUDACXX_INLINE_VISIBILITY bool atomic_flag_test_and_set(atomic_flag* __o) noexcept { - return __o->test_and_set(); + return __o->test_and_set(); } -inline _LIBCUDACXX_INLINE_VISIBILITY -bool +inline _LIBCUDACXX_INLINE_VISIBILITY bool atomic_flag_test_and_set_explicit(volatile atomic_flag* __o, memory_order __m) noexcept { - return __o->test_and_set(__m); + return __o->test_and_set(__m); } -inline _LIBCUDACXX_INLINE_VISIBILITY -bool -atomic_flag_test_and_set_explicit(atomic_flag* __o, memory_order __m) noexcept +inline _LIBCUDACXX_INLINE_VISIBILITY bool atomic_flag_test_and_set_explicit(atomic_flag* __o, memory_order __m) noexcept { - return __o->test_and_set(__m); + return __o->test_and_set(__m); } -inline _LIBCUDACXX_INLINE_VISIBILITY -void -atomic_flag_clear(volatile atomic_flag* __o) noexcept +inline _LIBCUDACXX_INLINE_VISIBILITY void atomic_flag_clear(volatile atomic_flag* __o) noexcept { - __o->clear(); + __o->clear(); } -inline _LIBCUDACXX_INLINE_VISIBILITY -void -atomic_flag_clear(atomic_flag* __o) noexcept +inline _LIBCUDACXX_INLINE_VISIBILITY void atomic_flag_clear(atomic_flag* __o) noexcept { - __o->clear(); + __o->clear(); } -inline _LIBCUDACXX_INLINE_VISIBILITY -void +inline _LIBCUDACXX_INLINE_VISIBILITY void atomic_flag_clear_explicit(volatile atomic_flag* __o, memory_order __m) noexcept { - __o->clear(__m); + __o->clear(__m); } -inline _LIBCUDACXX_INLINE_VISIBILITY -void -atomic_flag_clear_explicit(atomic_flag* __o, memory_order __m) noexcept +inline _LIBCUDACXX_INLINE_VISIBILITY void atomic_flag_clear_explicit(atomic_flag* __o, memory_order __m) noexcept { - __o->clear(__m); + __o->clear(__m); } #if !defined(__CUDA_MINIMUM_ARCH__) || __CUDA_MINIMUM_ARCH__ >= 700 -inline _LIBCUDACXX_INLINE_VISIBILITY -void -atomic_flag_wait(const volatile atomic_flag* __o, bool __v) noexcept +inline _LIBCUDACXX_INLINE_VISIBILITY void atomic_flag_wait(const volatile atomic_flag* __o, bool __v) noexcept { - __o->wait(__v); + __o->wait(__v); } -inline _LIBCUDACXX_INLINE_VISIBILITY -void 
-atomic_flag_wait(const atomic_flag* __o, bool __v) noexcept +inline _LIBCUDACXX_INLINE_VISIBILITY void atomic_flag_wait(const atomic_flag* __o, bool __v) noexcept { - __o->wait(__v); + __o->wait(__v); } -inline _LIBCUDACXX_INLINE_VISIBILITY -void -atomic_flag_wait_explicit(const volatile atomic_flag* __o, - bool __v, memory_order __m) noexcept +inline _LIBCUDACXX_INLINE_VISIBILITY void +atomic_flag_wait_explicit(const volatile atomic_flag* __o, bool __v, memory_order __m) noexcept { - __o->wait(__v, __m); + __o->wait(__v, __m); } -inline _LIBCUDACXX_INLINE_VISIBILITY -void -atomic_flag_wait_explicit(const atomic_flag* __o, - bool __v, memory_order __m) noexcept +inline _LIBCUDACXX_INLINE_VISIBILITY void +atomic_flag_wait_explicit(const atomic_flag* __o, bool __v, memory_order __m) noexcept { - __o->wait(__v, __m); + __o->wait(__v, __m); } -inline _LIBCUDACXX_INLINE_VISIBILITY -void -atomic_flag_notify_one(volatile atomic_flag* __o) noexcept +inline _LIBCUDACXX_INLINE_VISIBILITY void atomic_flag_notify_one(volatile atomic_flag* __o) noexcept { - __o->notify_one(); + __o->notify_one(); } -inline _LIBCUDACXX_INLINE_VISIBILITY -void -atomic_flag_notify_one(atomic_flag* __o) noexcept +inline _LIBCUDACXX_INLINE_VISIBILITY void atomic_flag_notify_one(atomic_flag* __o) noexcept { - __o->notify_one(); + __o->notify_one(); } -inline _LIBCUDACXX_INLINE_VISIBILITY -void -atomic_flag_notify_all(volatile atomic_flag* __o) noexcept +inline _LIBCUDACXX_INLINE_VISIBILITY void atomic_flag_notify_all(volatile atomic_flag* __o) noexcept { - __o->notify_all(); + __o->notify_all(); } -inline _LIBCUDACXX_INLINE_VISIBILITY -void -atomic_flag_notify_all(atomic_flag* __o) noexcept +inline _LIBCUDACXX_INLINE_VISIBILITY void atomic_flag_notify_all(atomic_flag* __o) noexcept { - __o->notify_all(); + __o->notify_all(); } #endif // fences -inline _LIBCUDACXX_INLINE_VISIBILITY -void -atomic_thread_fence(memory_order __m) noexcept +inline _LIBCUDACXX_INLINE_VISIBILITY void atomic_thread_fence(memory_order __m) noexcept { - __cxx_atomic_thread_fence(__m); + __cxx_atomic_thread_fence(__m); } -inline _LIBCUDACXX_INLINE_VISIBILITY -void -atomic_signal_fence(memory_order __m) noexcept +inline _LIBCUDACXX_INLINE_VISIBILITY void atomic_signal_fence(memory_order __m) noexcept { - __cxx_atomic_signal_fence(__m); + __cxx_atomic_signal_fence(__m); } // Atomics for standard typedef types -typedef atomic atomic_bool; -typedef atomic atomic_char; -typedef atomic atomic_schar; -typedef atomic atomic_uchar; -typedef atomic atomic_short; -typedef atomic atomic_ushort; -typedef atomic atomic_int; -typedef atomic atomic_uint; -typedef atomic atomic_long; -typedef atomic atomic_ulong; -typedef atomic atomic_llong; +typedef atomic atomic_bool; +typedef atomic atomic_char; +typedef atomic atomic_schar; +typedef atomic atomic_uchar; +typedef atomic atomic_short; +typedef atomic atomic_ushort; +typedef atomic atomic_int; +typedef atomic atomic_uint; +typedef atomic atomic_long; +typedef atomic atomic_ulong; +typedef atomic atomic_llong; typedef atomic atomic_ullong; -typedef atomic atomic_char16_t; -typedef atomic atomic_char32_t; -typedef atomic atomic_wchar_t; +typedef atomic atomic_char16_t; +typedef atomic atomic_char32_t; +typedef atomic atomic_wchar_t; -typedef atomic atomic_int_least8_t; -typedef atomic atomic_uint_least8_t; -typedef atomic atomic_int_least16_t; +typedef atomic atomic_int_least8_t; +typedef atomic atomic_uint_least8_t; +typedef atomic atomic_int_least16_t; typedef atomic atomic_uint_least16_t; -typedef atomic 
atomic_int_least32_t; +typedef atomic atomic_int_least32_t; typedef atomic atomic_uint_least32_t; -typedef atomic atomic_int_least64_t; +typedef atomic atomic_int_least64_t; typedef atomic atomic_uint_least64_t; -typedef atomic atomic_int_fast8_t; -typedef atomic atomic_uint_fast8_t; -typedef atomic atomic_int_fast16_t; +typedef atomic atomic_int_fast8_t; +typedef atomic atomic_uint_fast8_t; +typedef atomic atomic_int_fast16_t; typedef atomic atomic_uint_fast16_t; -typedef atomic atomic_int_fast32_t; +typedef atomic atomic_int_fast32_t; typedef atomic atomic_uint_fast32_t; -typedef atomic atomic_int_fast64_t; +typedef atomic atomic_int_fast64_t; typedef atomic atomic_uint_fast64_t; -typedef atomic< int8_t> atomic_int8_t; -typedef atomic atomic_uint8_t; -typedef atomic< int16_t> atomic_int16_t; +typedef atomic atomic_int8_t; +typedef atomic atomic_uint8_t; +typedef atomic atomic_int16_t; typedef atomic atomic_uint16_t; -typedef atomic< int32_t> atomic_int32_t; +typedef atomic atomic_int32_t; typedef atomic atomic_uint32_t; -typedef atomic< int64_t> atomic_int64_t; +typedef atomic atomic_int64_t; typedef atomic atomic_uint64_t; -typedef atomic atomic_intptr_t; +typedef atomic atomic_intptr_t; typedef atomic atomic_uintptr_t; -typedef atomic atomic_size_t; +typedef atomic atomic_size_t; typedef atomic atomic_ptrdiff_t; -typedef atomic atomic_intmax_t; +typedef atomic atomic_intmax_t; typedef atomic atomic_uintmax_t; static_assert(ATOMIC_INT_LOCK_FREE, "This library assumes atomic is lock-free."); -typedef atomic atomic_signed_lock_free; -typedef atomic atomic_unsigned_lock_free; +typedef atomic atomic_signed_lock_free; +typedef atomic atomic_unsigned_lock_free; -#define ATOMIC_FLAG_INIT {false} -#define ATOMIC_VAR_INIT(__v) {__v} +#define ATOMIC_FLAG_INIT \ + { \ + false \ + } +#define ATOMIC_VAR_INIT(__v) \ + { \ + __v \ + } _LIBCUDACXX_END_NAMESPACE_STD #include #include -#endif // _LIBCUDACXX_ATOMIC +#endif // _LIBCUDACXX_ATOMIC diff --git a/libcudacxx/include/cuda/std/detail/libcxx/include/bitset b/libcudacxx/include/cuda/std/detail/libcxx/include/bitset index c475bfb7d9f..ebf17ae02a2 100644 --- a/libcudacxx/include/cuda/std/detail/libcxx/include/bitset +++ b/libcudacxx/include/cuda/std/detail/libcxx/include/bitset @@ -74,14 +74,10 @@ public: template basic_string > to_string(charT zero = charT('0'), charT one = charT('1')) const; template - basic_string, allocator > to_string(charT zero = charT('0'), charT one = charT('1')) const; - basic_string, allocator > to_string(char zero = '0', char one = '1') const; - size_t count() const noexcept; - constexpr size_t size() const noexcept; - bool operator==(const bitset& rhs) const noexcept; - bool operator!=(const bitset& rhs) const noexcept; - bool test(size_t pos) const; - bool all() const noexcept; + basic_string, allocator > to_string(charT zero = charT('0'), charT one = +charT('1')) const; basic_string, allocator > to_string(char zero = '0', char one = '1') +const; size_t count() const noexcept; constexpr size_t size() const noexcept; bool operator==(const bitset& rhs) const +noexcept; bool operator!=(const bitset& rhs) const noexcept; bool test(size_t pos) const; bool all() const noexcept; bool any() const noexcept; bool none() const noexcept; bitset operator<<(size_t pos) const noexcept; @@ -112,14 +108,14 @@ template struct hash>; */ -#include <__config> #include <__bit_reference> -#include +#include <__config> +#include <__functional_base> #include -#include -#include +#include #include -#include <__functional_base> +#include +#include 
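[Editor's aside, not part of the patch] The <atomic> hunk that ends above only reflows declarations; the atomic_flag free functions and the ATOMIC_FLAG_INIT/ATOMIC_VAR_INIT macros behave as before. A minimal sketch of how that interface is consumed, assuming the usual cuda::std namespace spelling (the lock object and the function are illustrative, not taken from the patch):

#include <cuda/std/atomic>

cuda::std::atomic_flag lock = ATOMIC_FLAG_INIT; // macro expands to {false}

void with_lock()
{
  // spin until the previous value was false, i.e. we acquired the flag
  while (cuda::std::atomic_flag_test_and_set_explicit(&lock, cuda::std::memory_order_acquire))
  {
  }
  // ... critical section ...
  cuda::std::atomic_flag_clear_explicit(&lock, cuda::std::memory_order_release);
}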
#if defined(_CCCL_IMPLICIT_SYSTEM_HEADER_GCC) # pragma GCC system_header @@ -132,929 +128,901 @@ template struct hash>; _LIBCUDACXX_PUSH_MACROS #include <__undef_macros> - _LIBCUDACXX_BEGIN_NAMESPACE_STD template class __bitset; template -struct __has_storage_type<__bitset<_N_words, _Size> > +struct __has_storage_type<__bitset<_N_words, _Size>> { - static const bool value = true; + static const bool value = true; }; template class __bitset { public: - typedef ptrdiff_t difference_type; - typedef size_t size_type; - typedef size_type __storage_type; + typedef ptrdiff_t difference_type; + typedef size_t size_type; + typedef size_type __storage_type; + protected: - typedef __bitset __self; - typedef __storage_type* __storage_pointer; - typedef const __storage_type* __const_storage_pointer; - static const unsigned __bits_per_word = static_cast(sizeof(__storage_type) * CHAR_BIT); - - friend class __bit_reference<__bitset>; - friend class __bit_const_reference<__bitset>; - friend class __bit_iterator<__bitset, false>; - friend class __bit_iterator<__bitset, true>; - friend struct __bit_array<__bitset>; - - __storage_type __first_[_N_words]; - - typedef __bit_reference<__bitset> reference; - typedef __bit_const_reference<__bitset> const_reference; - typedef __bit_iterator<__bitset, false> iterator; - typedef __bit_iterator<__bitset, true> const_iterator; - - _LIBCUDACXX_INLINE_VISIBILITY - constexpr __bitset() noexcept; - _LIBCUDACXX_INLINE_VISIBILITY - explicit constexpr __bitset(unsigned long long __v) noexcept; - - _LIBCUDACXX_INLINE_VISIBILITY reference __make_ref(size_t __pos) noexcept - {return reference(__first_ + __pos / __bits_per_word, __storage_type(1) << __pos % __bits_per_word);} - _LIBCUDACXX_INLINE_VISIBILITY constexpr const_reference __make_ref(size_t __pos) const noexcept - {return const_reference(__first_ + __pos / __bits_per_word, __storage_type(1) << __pos % __bits_per_word);} - _LIBCUDACXX_INLINE_VISIBILITY iterator __make_iter(size_t __pos) noexcept - {return iterator(__first_ + __pos / __bits_per_word, __pos % __bits_per_word);} - _LIBCUDACXX_INLINE_VISIBILITY const_iterator __make_iter(size_t __pos) const noexcept - {return const_iterator(__first_ + __pos / __bits_per_word, __pos % __bits_per_word);} - - _LIBCUDACXX_INLINE_VISIBILITY - void operator&=(const __bitset& __v) noexcept; - _LIBCUDACXX_INLINE_VISIBILITY - void operator|=(const __bitset& __v) noexcept; - _LIBCUDACXX_INLINE_VISIBILITY - void operator^=(const __bitset& __v) noexcept; - - void flip() noexcept; - _LIBCUDACXX_INLINE_VISIBILITY unsigned long to_ulong() const - {return to_ulong(integral_constant());} - _LIBCUDACXX_INLINE_VISIBILITY unsigned long long to_ullong() const - {return to_ullong(integral_constant());} - - bool all() const noexcept; - bool any() const noexcept; - _LIBCUDACXX_INLINE_VISIBILITY - size_t __hash_code() const noexcept; + typedef __bitset __self; + typedef __storage_type* __storage_pointer; + typedef const __storage_type* __const_storage_pointer; + static const unsigned __bits_per_word = static_cast(sizeof(__storage_type) * CHAR_BIT); + + friend class __bit_reference<__bitset>; + friend class __bit_const_reference<__bitset>; + friend class __bit_iterator<__bitset, false>; + friend class __bit_iterator<__bitset, true>; + friend struct __bit_array<__bitset>; + + __storage_type __first_[_N_words]; + + typedef __bit_reference<__bitset> reference; + typedef __bit_const_reference<__bitset> const_reference; + typedef __bit_iterator<__bitset, false> iterator; + typedef __bit_iterator<__bitset, 
true> const_iterator; + + _LIBCUDACXX_INLINE_VISIBILITY constexpr __bitset() noexcept; + _LIBCUDACXX_INLINE_VISIBILITY explicit constexpr __bitset(unsigned long long __v) noexcept; + + _LIBCUDACXX_INLINE_VISIBILITY reference __make_ref(size_t __pos) noexcept + { + return reference(__first_ + __pos / __bits_per_word, __storage_type(1) << __pos % __bits_per_word); + } + _LIBCUDACXX_INLINE_VISIBILITY constexpr const_reference __make_ref(size_t __pos) const noexcept + { + return const_reference(__first_ + __pos / __bits_per_word, __storage_type(1) << __pos % __bits_per_word); + } + _LIBCUDACXX_INLINE_VISIBILITY iterator __make_iter(size_t __pos) noexcept + { + return iterator(__first_ + __pos / __bits_per_word, __pos % __bits_per_word); + } + _LIBCUDACXX_INLINE_VISIBILITY const_iterator __make_iter(size_t __pos) const noexcept + { + return const_iterator(__first_ + __pos / __bits_per_word, __pos % __bits_per_word); + } + + _LIBCUDACXX_INLINE_VISIBILITY void operator&=(const __bitset& __v) noexcept; + _LIBCUDACXX_INLINE_VISIBILITY void operator|=(const __bitset& __v) noexcept; + _LIBCUDACXX_INLINE_VISIBILITY void operator^=(const __bitset& __v) noexcept; + + void flip() noexcept; + _LIBCUDACXX_INLINE_VISIBILITY unsigned long to_ulong() const + { + return to_ulong(integral_constant < bool, _Size()); + } + _LIBCUDACXX_INLINE_VISIBILITY unsigned long long to_ullong() const + { + return to_ullong(integral_constant < bool, _Size()); + } + + bool all() const noexcept; + bool any() const noexcept; + _LIBCUDACXX_INLINE_VISIBILITY size_t __hash_code() const noexcept; + private: - unsigned long to_ulong(false_type) const; - _LIBCUDACXX_INLINE_VISIBILITY - unsigned long to_ulong(true_type) const; - unsigned long long to_ullong(false_type) const; - _LIBCUDACXX_INLINE_VISIBILITY - unsigned long long to_ullong(true_type) const; - _LIBCUDACXX_INLINE_VISIBILITY - unsigned long long to_ullong(true_type, false_type) const; - unsigned long long to_ullong(true_type, true_type) const; + unsigned long to_ulong(false_type) const; + _LIBCUDACXX_INLINE_VISIBILITY unsigned long to_ulong(true_type) const; + unsigned long long to_ullong(false_type) const; + _LIBCUDACXX_INLINE_VISIBILITY unsigned long long to_ullong(true_type) const; + _LIBCUDACXX_INLINE_VISIBILITY unsigned long long to_ullong(true_type, false_type) const; + unsigned long long to_ullong(true_type, true_type) const; }; template -inline constexpr -__bitset<_N_words, _Size>::__bitset() noexcept +inline constexpr __bitset<_N_words, _Size>::__bitset() noexcept : __first_{0} {} template -inline -constexpr -__bitset<_N_words, _Size>::__bitset(unsigned long long __v) noexcept +inline constexpr __bitset<_N_words, _Size>::__bitset(unsigned long long __v) noexcept #if __SIZEOF_SIZE_T__ == 8 : __first_{__v} #elif __SIZEOF_SIZE_T__ == 4 - : __first_{static_cast<__storage_type>(__v), - _Size >= 2 * __bits_per_word ? static_cast<__storage_type>(__v >> __bits_per_word) - : static_cast<__storage_type>((__v >> __bits_per_word) & (__storage_type(1) << (_Size - __bits_per_word)) - 1)} + : __first_{ + static_cast<__storage_type>(__v), + _Size >= 2 * __bits_per_word + ? 
static_cast<__storage_type>(__v >> __bits_per_word) + : static_cast<__storage_type>((__v >> __bits_per_word) & (__storage_type(1) << (_Size - __bits_per_word)) - 1)} #else -#error This constructor has not been ported to this platform +# error This constructor has not been ported to this platform #endif {} template -inline -void -__bitset<_N_words, _Size>::operator&=(const __bitset& __v) noexcept +inline void __bitset<_N_words, _Size>::operator&=(const __bitset& __v) noexcept { - for (size_type __i = 0; __i < _N_words; ++__i) - __first_[__i] &= __v.__first_[__i]; + for (size_type __i = 0; __i < _N_words; ++__i) + { + __first_[__i] &= __v.__first_[__i]; + } } template -inline -void -__bitset<_N_words, _Size>::operator|=(const __bitset& __v) noexcept +inline void __bitset<_N_words, _Size>::operator|=(const __bitset& __v) noexcept { - for (size_type __i = 0; __i < _N_words; ++__i) - __first_[__i] |= __v.__first_[__i]; + for (size_type __i = 0; __i < _N_words; ++__i) + { + __first_[__i] |= __v.__first_[__i]; + } } template -inline -void -__bitset<_N_words, _Size>::operator^=(const __bitset& __v) noexcept +inline void __bitset<_N_words, _Size>::operator^=(const __bitset& __v) noexcept { - for (size_type __i = 0; __i < _N_words; ++__i) - __first_[__i] ^= __v.__first_[__i]; + for (size_type __i = 0; __i < _N_words; ++__i) + { + __first_[__i] ^= __v.__first_[__i]; + } } template -void -__bitset<_N_words, _Size>::flip() noexcept -{ - // do middle whole words - size_type __n = _Size; - __storage_pointer __p = __first_; - for (; __n >= __bits_per_word; ++__p, __n -= __bits_per_word) - *__p = ~*__p; - // do last partial word - if (__n > 0) - { - __storage_type __m = ~__storage_type(0) >> (__bits_per_word - __n); - __storage_type __b = *__p & __m; - *__p &= ~__m; - *__p |= ~__b & __m; - } +void __bitset<_N_words, _Size>::flip() noexcept +{ + // do middle whole words + size_type __n = _Size; + __storage_pointer __p = __first_; + for (; __n >= __bits_per_word; ++__p, __n -= __bits_per_word) + { + *__p = ~*__p; + } + // do last partial word + if (__n > 0) + { + __storage_type __m = ~__storage_type(0) >> (__bits_per_word - __n); + __storage_type __b = *__p & __m; + *__p &= ~__m; + *__p |= ~__b & __m; + } } template -unsigned long -__bitset<_N_words, _Size>::to_ulong(false_type) const +unsigned long __bitset<_N_words, _Size>::to_ulong(false_type) const { - const_iterator __e = __make_iter(_Size); - const_iterator __i = _CUDA_VSTD::find(__make_iter(sizeof(unsigned long) * CHAR_BIT), __e, true); - if (__i != __e) - __throw_overflow_error("bitset to_ulong overflow error"); + const_iterator __e = __make_iter(_Size); + const_iterator __i = _CUDA_VSTD::find(__make_iter(sizeof(unsigned long) * CHAR_BIT), __e, true); + if (__i != __e) + { + __throw_overflow_error("bitset to_ulong overflow error"); + } - return __first_[0]; + return __first_[0]; } template -inline -unsigned long -__bitset<_N_words, _Size>::to_ulong(true_type) const +inline unsigned long __bitset<_N_words, _Size>::to_ulong(true_type) const { - return __first_[0]; + return __first_[0]; } template -unsigned long long -__bitset<_N_words, _Size>::to_ullong(false_type) const +unsigned long long __bitset<_N_words, _Size>::to_ullong(false_type) const { - const_iterator __e = __make_iter(_Size); - const_iterator __i = _CUDA_VSTD::find(__make_iter(sizeof(unsigned long long) * CHAR_BIT), __e, true); - if (__i != __e) - __throw_overflow_error("bitset to_ullong overflow error"); + const_iterator __e = __make_iter(_Size); + const_iterator __i = 
_CUDA_VSTD::find(__make_iter(sizeof(unsigned long long) * CHAR_BIT), __e, true); + if (__i != __e) + { + __throw_overflow_error("bitset to_ullong overflow error"); + } - return to_ullong(true_type()); + return to_ullong(true_type()); } template -inline -unsigned long long -__bitset<_N_words, _Size>::to_ullong(true_type) const +inline unsigned long long __bitset<_N_words, _Size>::to_ullong(true_type) const { - return to_ullong(true_type(), integral_constant()); + return to_ullong(true_type(), integral_constant()); } template -inline -unsigned long long -__bitset<_N_words, _Size>::to_ullong(true_type, false_type) const +inline unsigned long long __bitset<_N_words, _Size>::to_ullong(true_type, false_type) const { - return __first_[0]; + return __first_[0]; } template -unsigned long long -__bitset<_N_words, _Size>::to_ullong(true_type, true_type) const -{ - unsigned long long __r = __first_[0]; - for (std::size_t __i = 1; __i < sizeof(unsigned long long) / sizeof(__storage_type); ++__i) - __r |= static_cast(__first_[__i]) << (sizeof(__storage_type) * CHAR_BIT); - return __r; +unsigned long long __bitset<_N_words, _Size>::to_ullong(true_type, true_type) const +{ + unsigned long long __r = __first_[0]; + for (std::size_t __i = 1; __i < sizeof(unsigned long long) / sizeof(__storage_type); ++__i) + { + __r |= static_cast(__first_[__i]) << (sizeof(__storage_type) * CHAR_BIT); + } + return __r; } template -bool -__bitset<_N_words, _Size>::all() const noexcept -{ - // do middle whole words - size_type __n = _Size; - __const_storage_pointer __p = __first_; - for (; __n >= __bits_per_word; ++__p, __n -= __bits_per_word) - if (~*__p) - return false; - // do last partial word - if (__n > 0) +bool __bitset<_N_words, _Size>::all() const noexcept +{ + // do middle whole words + size_type __n = _Size; + __const_storage_pointer __p = __first_; + for (; __n >= __bits_per_word; ++__p, __n -= __bits_per_word) + { + if (~*__p) { - __storage_type __m = ~__storage_type(0) >> (__bits_per_word - __n); - if (~*__p & __m) - return false; + return false; } - return true; + } + // do last partial word + if (__n > 0) + { + __storage_type __m = ~__storage_type(0) >> (__bits_per_word - __n); + if (~*__p & __m) + { + return false; + } + } + return true; } template -bool -__bitset<_N_words, _Size>::any() const noexcept -{ - // do middle whole words - size_type __n = _Size; - __const_storage_pointer __p = __first_; - for (; __n >= __bits_per_word; ++__p, __n -= __bits_per_word) - if (*__p) - return true; - // do last partial word - if (__n > 0) +bool __bitset<_N_words, _Size>::any() const noexcept +{ + // do middle whole words + size_type __n = _Size; + __const_storage_pointer __p = __first_; + for (; __n >= __bits_per_word; ++__p, __n -= __bits_per_word) + { + if (*__p) { - __storage_type __m = ~__storage_type(0) >> (__bits_per_word - __n); - if (*__p & __m) - return true; + return true; } - return false; + } + // do last partial word + if (__n > 0) + { + __storage_type __m = ~__storage_type(0) >> (__bits_per_word - __n); + if (*__p & __m) + { + return true; + } + } + return false; } template -inline -size_t -__bitset<_N_words, _Size>::__hash_code() const noexcept -{ - size_t __h = 0; - for (size_type __i = 0; __i < _N_words; ++__i) - __h ^= __first_[__i]; - return __h; +inline size_t __bitset<_N_words, _Size>::__hash_code() const noexcept +{ + size_t __h = 0; + for (size_type __i = 0; __i < _N_words; ++__i) + { + __h ^= __first_[__i]; + } + return __h; } template class __bitset<1, _Size> { public: - typedef ptrdiff_t 
difference_type; - typedef size_t size_type; - typedef size_type __storage_type; -protected: - typedef __bitset __self; - typedef __storage_type* __storage_pointer; - typedef const __storage_type* __const_storage_pointer; - static const unsigned __bits_per_word = static_cast(sizeof(__storage_type) * CHAR_BIT); - - friend class __bit_reference<__bitset>; - friend class __bit_const_reference<__bitset>; - friend class __bit_iterator<__bitset, false>; - friend class __bit_iterator<__bitset, true>; - friend struct __bit_array<__bitset>; - - __storage_type __first_; - - typedef __bit_reference<__bitset> reference; - typedef __bit_const_reference<__bitset> const_reference; - typedef __bit_iterator<__bitset, false> iterator; - typedef __bit_iterator<__bitset, true> const_iterator; - - _LIBCUDACXX_INLINE_VISIBILITY - constexpr __bitset() noexcept; - _LIBCUDACXX_INLINE_VISIBILITY - explicit constexpr __bitset(unsigned long long __v) noexcept; - - _LIBCUDACXX_INLINE_VISIBILITY reference __make_ref(size_t __pos) noexcept - {return reference(&__first_, __storage_type(1) << __pos);} - _LIBCUDACXX_INLINE_VISIBILITY constexpr const_reference __make_ref(size_t __pos) const noexcept - {return const_reference(&__first_, __storage_type(1) << __pos);} - _LIBCUDACXX_INLINE_VISIBILITY iterator __make_iter(size_t __pos) noexcept - {return iterator(&__first_ + __pos / __bits_per_word, __pos % __bits_per_word);} - _LIBCUDACXX_INLINE_VISIBILITY const_iterator __make_iter(size_t __pos) const noexcept - {return const_iterator(&__first_ + __pos / __bits_per_word, __pos % __bits_per_word);} - - _LIBCUDACXX_INLINE_VISIBILITY - void operator&=(const __bitset& __v) noexcept; - _LIBCUDACXX_INLINE_VISIBILITY - void operator|=(const __bitset& __v) noexcept; - _LIBCUDACXX_INLINE_VISIBILITY - void operator^=(const __bitset& __v) noexcept; - - _LIBCUDACXX_INLINE_VISIBILITY - void flip() noexcept; - - _LIBCUDACXX_INLINE_VISIBILITY - unsigned long to_ulong() const; - _LIBCUDACXX_INLINE_VISIBILITY - unsigned long long to_ullong() const; - - _LIBCUDACXX_INLINE_VISIBILITY - bool all() const noexcept; - _LIBCUDACXX_INLINE_VISIBILITY - bool any() const noexcept; + typedef ptrdiff_t difference_type; + typedef size_t size_type; + typedef size_type __storage_type; - _LIBCUDACXX_INLINE_VISIBILITY - size_t __hash_code() const noexcept; +protected: + typedef __bitset __self; + typedef __storage_type* __storage_pointer; + typedef const __storage_type* __const_storage_pointer; + static const unsigned __bits_per_word = static_cast(sizeof(__storage_type) * CHAR_BIT); + + friend class __bit_reference<__bitset>; + friend class __bit_const_reference<__bitset>; + friend class __bit_iterator<__bitset, false>; + friend class __bit_iterator<__bitset, true>; + friend struct __bit_array<__bitset>; + + __storage_type __first_; + + typedef __bit_reference<__bitset> reference; + typedef __bit_const_reference<__bitset> const_reference; + typedef __bit_iterator<__bitset, false> iterator; + typedef __bit_iterator<__bitset, true> const_iterator; + + _LIBCUDACXX_INLINE_VISIBILITY constexpr __bitset() noexcept; + _LIBCUDACXX_INLINE_VISIBILITY explicit constexpr __bitset(unsigned long long __v) noexcept; + + _LIBCUDACXX_INLINE_VISIBILITY reference __make_ref(size_t __pos) noexcept + { + return reference(&__first_, __storage_type(1) << __pos); + } + _LIBCUDACXX_INLINE_VISIBILITY constexpr const_reference __make_ref(size_t __pos) const noexcept + { + return const_reference(&__first_, __storage_type(1) << __pos); + } + _LIBCUDACXX_INLINE_VISIBILITY iterator 
__make_iter(size_t __pos) noexcept + { + return iterator(&__first_ + __pos / __bits_per_word, __pos % __bits_per_word); + } + _LIBCUDACXX_INLINE_VISIBILITY const_iterator __make_iter(size_t __pos) const noexcept + { + return const_iterator(&__first_ + __pos / __bits_per_word, __pos % __bits_per_word); + } + + _LIBCUDACXX_INLINE_VISIBILITY void operator&=(const __bitset& __v) noexcept; + _LIBCUDACXX_INLINE_VISIBILITY void operator|=(const __bitset& __v) noexcept; + _LIBCUDACXX_INLINE_VISIBILITY void operator^=(const __bitset& __v) noexcept; + + _LIBCUDACXX_INLINE_VISIBILITY void flip() noexcept; + + _LIBCUDACXX_INLINE_VISIBILITY unsigned long to_ulong() const; + _LIBCUDACXX_INLINE_VISIBILITY unsigned long long to_ullong() const; + + _LIBCUDACXX_INLINE_VISIBILITY bool all() const noexcept; + _LIBCUDACXX_INLINE_VISIBILITY bool any() const noexcept; + + _LIBCUDACXX_INLINE_VISIBILITY size_t __hash_code() const noexcept; }; template -inline constexpr -__bitset<1, _Size>::__bitset() noexcept +inline constexpr __bitset<1, _Size>::__bitset() noexcept : __first_(0) -{ -} +{} template -inline constexpr -__bitset<1, _Size>::__bitset(unsigned long long __v) noexcept - : __first_( - _Size == __bits_per_word ? static_cast<__storage_type>(__v) - : static_cast<__storage_type>(__v) & ((__storage_type(1) << _Size) - 1) - ) -{ -} +inline constexpr __bitset<1, _Size>::__bitset(unsigned long long __v) noexcept + : __first_(_Size == __bits_per_word ? static_cast<__storage_type>(__v) + : static_cast<__storage_type>(__v) & ((__storage_type(1) << _Size) - 1)) +{} template -inline -void -__bitset<1, _Size>::operator&=(const __bitset& __v) noexcept +inline void __bitset<1, _Size>::operator&=(const __bitset& __v) noexcept { - __first_ &= __v.__first_; + __first_ &= __v.__first_; } template -inline -void -__bitset<1, _Size>::operator|=(const __bitset& __v) noexcept +inline void __bitset<1, _Size>::operator|=(const __bitset& __v) noexcept { - __first_ |= __v.__first_; + __first_ |= __v.__first_; } template -inline -void -__bitset<1, _Size>::operator^=(const __bitset& __v) noexcept +inline void __bitset<1, _Size>::operator^=(const __bitset& __v) noexcept { - __first_ ^= __v.__first_; + __first_ ^= __v.__first_; } template -inline -void -__bitset<1, _Size>::flip() noexcept +inline void __bitset<1, _Size>::flip() noexcept { - __storage_type __m = ~__storage_type(0) >> (__bits_per_word - _Size); - __first_ = ~__first_; - __first_ &= __m; + __storage_type __m = ~__storage_type(0) >> (__bits_per_word - _Size); + __first_ = ~__first_; + __first_ &= __m; } template -inline -unsigned long -__bitset<1, _Size>::to_ulong() const +inline unsigned long __bitset<1, _Size>::to_ulong() const { - return __first_; + return __first_; } template -inline -unsigned long long -__bitset<1, _Size>::to_ullong() const +inline unsigned long long __bitset<1, _Size>::to_ullong() const { - return __first_; + return __first_; } template -inline -bool -__bitset<1, _Size>::all() const noexcept +inline bool __bitset<1, _Size>::all() const noexcept { - __storage_type __m = ~__storage_type(0) >> (__bits_per_word - _Size); - return !(~__first_ & __m); + __storage_type __m = ~__storage_type(0) >> (__bits_per_word - _Size); + return !(~__first_ & __m); } template -inline -bool -__bitset<1, _Size>::any() const noexcept +inline bool __bitset<1, _Size>::any() const noexcept { - __storage_type __m = ~__storage_type(0) >> (__bits_per_word - _Size); - return __first_ & __m; + __storage_type __m = ~__storage_type(0) >> (__bits_per_word - _Size); + return __first_ & 
__m; } template -inline -size_t -__bitset<1, _Size>::__hash_code() const noexcept +inline size_t __bitset<1, _Size>::__hash_code() const noexcept { - return __first_; + return __first_; } template <> class __bitset<0, 0> { public: - typedef ptrdiff_t difference_type; - typedef size_t size_type; - typedef size_type __storage_type; + typedef ptrdiff_t difference_type; + typedef size_t size_type; + typedef size_type __storage_type; + protected: - typedef __bitset __self; - typedef __storage_type* __storage_pointer; - typedef const __storage_type* __const_storage_pointer; - static const unsigned __bits_per_word = static_cast(sizeof(__storage_type) * CHAR_BIT); - - friend class __bit_reference<__bitset>; - friend class __bit_const_reference<__bitset>; - friend class __bit_iterator<__bitset, false>; - friend class __bit_iterator<__bitset, true>; - friend struct __bit_array<__bitset>; - - typedef __bit_reference<__bitset> reference; - typedef __bit_const_reference<__bitset> const_reference; - typedef __bit_iterator<__bitset, false> iterator; - typedef __bit_iterator<__bitset, true> const_iterator; - - _LIBCUDACXX_INLINE_VISIBILITY - constexpr __bitset() noexcept; - _LIBCUDACXX_INLINE_VISIBILITY - explicit constexpr __bitset(unsigned long long) noexcept; - - _LIBCUDACXX_INLINE_VISIBILITY reference __make_ref(size_t) noexcept - {return reference(0, 1);} - _LIBCUDACXX_INLINE_VISIBILITY constexpr const_reference __make_ref(size_t) const noexcept - {return const_reference(0, 1);} - _LIBCUDACXX_INLINE_VISIBILITY iterator __make_iter(size_t) noexcept - {return iterator(0, 0);} - _LIBCUDACXX_INLINE_VISIBILITY const_iterator __make_iter(size_t) const noexcept - {return const_iterator(0, 0);} - - _LIBCUDACXX_INLINE_VISIBILITY void operator&=(const __bitset&) noexcept {} - _LIBCUDACXX_INLINE_VISIBILITY void operator|=(const __bitset&) noexcept {} - _LIBCUDACXX_INLINE_VISIBILITY void operator^=(const __bitset&) noexcept {} - - _LIBCUDACXX_INLINE_VISIBILITY void flip() noexcept {} - - _LIBCUDACXX_INLINE_VISIBILITY unsigned long to_ulong() const {return 0;} - _LIBCUDACXX_INLINE_VISIBILITY unsigned long long to_ullong() const {return 0;} - - _LIBCUDACXX_INLINE_VISIBILITY bool all() const noexcept {return true;} - _LIBCUDACXX_INLINE_VISIBILITY bool any() const noexcept {return false;} - - _LIBCUDACXX_INLINE_VISIBILITY size_t __hash_code() const noexcept {return 0;} + typedef __bitset __self; + typedef __storage_type* __storage_pointer; + typedef const __storage_type* __const_storage_pointer; + static const unsigned __bits_per_word = static_cast(sizeof(__storage_type) * CHAR_BIT); + + friend class __bit_reference<__bitset>; + friend class __bit_const_reference<__bitset>; + friend class __bit_iterator<__bitset, false>; + friend class __bit_iterator<__bitset, true>; + friend struct __bit_array<__bitset>; + + typedef __bit_reference<__bitset> reference; + typedef __bit_const_reference<__bitset> const_reference; + typedef __bit_iterator<__bitset, false> iterator; + typedef __bit_iterator<__bitset, true> const_iterator; + + _LIBCUDACXX_INLINE_VISIBILITY constexpr __bitset() noexcept; + _LIBCUDACXX_INLINE_VISIBILITY explicit constexpr __bitset(unsigned long long) noexcept; + + _LIBCUDACXX_INLINE_VISIBILITY reference __make_ref(size_t) noexcept + { + return reference(0, 1); + } + _LIBCUDACXX_INLINE_VISIBILITY constexpr const_reference __make_ref(size_t) const noexcept + { + return const_reference(0, 1); + } + _LIBCUDACXX_INLINE_VISIBILITY iterator __make_iter(size_t) noexcept + { + return iterator(0, 0); + } + 
_LIBCUDACXX_INLINE_VISIBILITY const_iterator __make_iter(size_t) const noexcept + { + return const_iterator(0, 0); + } + + _LIBCUDACXX_INLINE_VISIBILITY void operator&=(const __bitset&) noexcept {} + _LIBCUDACXX_INLINE_VISIBILITY void operator|=(const __bitset&) noexcept {} + _LIBCUDACXX_INLINE_VISIBILITY void operator^=(const __bitset&) noexcept {} + + _LIBCUDACXX_INLINE_VISIBILITY void flip() noexcept {} + + _LIBCUDACXX_INLINE_VISIBILITY unsigned long to_ulong() const + { + return 0; + } + _LIBCUDACXX_INLINE_VISIBILITY unsigned long long to_ullong() const + { + return 0; + } + + _LIBCUDACXX_INLINE_VISIBILITY bool all() const noexcept + { + return true; + } + _LIBCUDACXX_INLINE_VISIBILITY bool any() const noexcept + { + return false; + } + + _LIBCUDACXX_INLINE_VISIBILITY size_t __hash_code() const noexcept + { + return 0; + } }; -inline -constexpr -__bitset<0, 0>::__bitset() noexcept -{ -} +inline constexpr __bitset<0, 0>::__bitset() noexcept {} -inline -constexpr -__bitset<0, 0>::__bitset(unsigned long long) noexcept -{ -} +inline constexpr __bitset<0, 0>::__bitset(unsigned long long) noexcept {} -template class _LIBCUDACXX_TEMPLATE_VIS bitset; -template struct hash >; +template +class _LIBCUDACXX_TEMPLATE_VIS bitset; +template +struct hash>; template class _LIBCUDACXX_TEMPLATE_VIS bitset : private __bitset<_Size == 0 ? 0 : (_Size - 1) / (sizeof(size_t) * CHAR_BIT) + 1, _Size> { public: - static const unsigned __n_words = _Size == 0 ? 0 : (_Size - 1) / (sizeof(size_t) * CHAR_BIT) + 1; - typedef __bitset<__n_words, _Size> base; + static const unsigned __n_words = _Size == 0 ? 0 : (_Size - 1) / (sizeof(size_t) * CHAR_BIT) + 1; + typedef __bitset<__n_words, _Size> base; public: - typedef typename base::reference reference; - typedef typename base::const_reference const_reference; - - // 23.3.5.1 constructors: - _LIBCUDACXX_INLINE_VISIBILITY constexpr bitset() noexcept {} - _LIBCUDACXX_INLINE_VISIBILITY constexpr - bitset(unsigned long long __v) noexcept : base(__v) {} - template::value> > - explicit bitset(const _CharT* __str, - typename basic_string<_CharT>::size_type __n = basic_string<_CharT>::npos, - _CharT __zero = _CharT('0'), _CharT __one = _CharT('1')); - template - explicit bitset(const basic_string<_CharT,_Traits,_Allocator>& __str, - typename basic_string<_CharT,_Traits,_Allocator>::size_type __pos = 0, - typename basic_string<_CharT,_Traits,_Allocator>::size_type __n = - (basic_string<_CharT,_Traits,_Allocator>::npos), - _CharT __zero = _CharT('0'), _CharT __one = _CharT('1')); - - // 23.3.5.2 bitset operations: - _LIBCUDACXX_INLINE_VISIBILITY - bitset& operator&=(const bitset& __rhs) noexcept; - _LIBCUDACXX_INLINE_VISIBILITY - bitset& operator|=(const bitset& __rhs) noexcept; - _LIBCUDACXX_INLINE_VISIBILITY - bitset& operator^=(const bitset& __rhs) noexcept; - bitset& operator<<=(size_t __pos) noexcept; - bitset& operator>>=(size_t __pos) noexcept; - _LIBCUDACXX_INLINE_VISIBILITY - bitset& set() noexcept; - bitset& set(size_t __pos, bool __val = true); - _LIBCUDACXX_INLINE_VISIBILITY - bitset& reset() noexcept; - bitset& reset(size_t __pos); - _LIBCUDACXX_INLINE_VISIBILITY - bitset operator~() const noexcept; - _LIBCUDACXX_INLINE_VISIBILITY - bitset& flip() noexcept; - bitset& flip(size_t __pos); - - // element access: - _LIBCUDACXX_INLINE_VISIBILITY constexpr - const_reference operator[](size_t __p) const {return base::__make_ref(__p);} - _LIBCUDACXX_INLINE_VISIBILITY reference operator[](size_t __p) {return base::__make_ref(__p);} - _LIBCUDACXX_INLINE_VISIBILITY - unsigned 
long to_ulong() const; - _LIBCUDACXX_INLINE_VISIBILITY - unsigned long long to_ullong() const; - template - basic_string<_CharT, _Traits, _Allocator> to_string(_CharT __zero = _CharT('0'), - _CharT __one = _CharT('1')) const; - template - _LIBCUDACXX_INLINE_VISIBILITY - basic_string<_CharT, _Traits, allocator<_CharT> > to_string(_CharT __zero = _CharT('0'), - _CharT __one = _CharT('1')) const; - template - _LIBCUDACXX_INLINE_VISIBILITY - basic_string<_CharT, char_traits<_CharT>, allocator<_CharT> > to_string(_CharT __zero = _CharT('0'), - _CharT __one = _CharT('1')) const; - _LIBCUDACXX_INLINE_VISIBILITY - basic_string, allocator > to_string(char __zero = '0', - char __one = '1') const; - _LIBCUDACXX_INLINE_VISIBILITY - size_t count() const noexcept; - _LIBCUDACXX_INLINE_VISIBILITY constexpr size_t size() const noexcept {return _Size;} - _LIBCUDACXX_INLINE_VISIBILITY - bool operator==(const bitset& __rhs) const noexcept; - _LIBCUDACXX_INLINE_VISIBILITY - bool operator!=(const bitset& __rhs) const noexcept; - bool test(size_t __pos) const; - _LIBCUDACXX_INLINE_VISIBILITY - bool all() const noexcept; - _LIBCUDACXX_INLINE_VISIBILITY - bool any() const noexcept; - _LIBCUDACXX_INLINE_VISIBILITY bool none() const noexcept {return !any();} - _LIBCUDACXX_INLINE_VISIBILITY - bitset operator<<(size_t __pos) const noexcept; - _LIBCUDACXX_INLINE_VISIBILITY - bitset operator>>(size_t __pos) const noexcept; + typedef typename base::reference reference; + typedef typename base::const_reference const_reference; + + // 23.3.5.1 constructors: + _LIBCUDACXX_INLINE_VISIBILITY constexpr bitset() noexcept {} + _LIBCUDACXX_INLINE_VISIBILITY constexpr bitset(unsigned long long __v) noexcept + : base(__v) + {} + template ::value>> + explicit bitset(const _CharT* __str, + typename basic_string<_CharT>::size_type __n = basic_string<_CharT>::npos, + _CharT __zero = _CharT('0'), + _CharT __one = _CharT('1')); + template + explicit bitset(const basic_string<_CharT, _Traits, _Allocator>& __str, + typename basic_string<_CharT, _Traits, _Allocator>::size_type __pos = 0, + typename basic_string<_CharT, _Traits, _Allocator>::size_type __n = + (basic_string<_CharT, _Traits, _Allocator>::npos), + _CharT __zero = _CharT('0'), + _CharT __one = _CharT('1')); + + // 23.3.5.2 bitset operations: + _LIBCUDACXX_INLINE_VISIBILITY bitset& operator&=(const bitset& __rhs) noexcept; + _LIBCUDACXX_INLINE_VISIBILITY bitset& operator|=(const bitset& __rhs) noexcept; + _LIBCUDACXX_INLINE_VISIBILITY bitset& operator^=(const bitset& __rhs) noexcept; + bitset& operator<<=(size_t __pos) noexcept; + bitset& operator>>=(size_t __pos) noexcept; + _LIBCUDACXX_INLINE_VISIBILITY bitset& set() noexcept; + bitset& set(size_t __pos, bool __val = true); + _LIBCUDACXX_INLINE_VISIBILITY bitset& reset() noexcept; + bitset& reset(size_t __pos); + _LIBCUDACXX_INLINE_VISIBILITY bitset operator~() const noexcept; + _LIBCUDACXX_INLINE_VISIBILITY bitset& flip() noexcept; + bitset& flip(size_t __pos); + + // element access: + _LIBCUDACXX_INLINE_VISIBILITY constexpr const_reference operator[](size_t __p) const + { + return base::__make_ref(__p); + } + _LIBCUDACXX_INLINE_VISIBILITY reference operator[](size_t __p) + { + return base::__make_ref(__p); + } + _LIBCUDACXX_INLINE_VISIBILITY unsigned long to_ulong() const; + _LIBCUDACXX_INLINE_VISIBILITY unsigned long long to_ullong() const; + template + basic_string<_CharT, _Traits, _Allocator> to_string(_CharT __zero = _CharT('0'), _CharT __one = _CharT('1')) const; + template + _LIBCUDACXX_INLINE_VISIBILITY 
basic_string<_CharT, _Traits, allocator<_CharT>> + to_string(_CharT __zero = _CharT('0'), _CharT __one = _CharT('1')) const; + template + _LIBCUDACXX_INLINE_VISIBILITY basic_string<_CharT, char_traits<_CharT>, allocator<_CharT>> + to_string(_CharT __zero = _CharT('0'), _CharT __one = _CharT('1')) const; + _LIBCUDACXX_INLINE_VISIBILITY basic_string, allocator> + to_string(char __zero = '0', char __one = '1') const; + _LIBCUDACXX_INLINE_VISIBILITY size_t count() const noexcept; + _LIBCUDACXX_INLINE_VISIBILITY constexpr size_t size() const noexcept + { + return _Size; + } + _LIBCUDACXX_INLINE_VISIBILITY bool operator==(const bitset& __rhs) const noexcept; + _LIBCUDACXX_INLINE_VISIBILITY bool operator!=(const bitset& __rhs) const noexcept; + bool test(size_t __pos) const; + _LIBCUDACXX_INLINE_VISIBILITY bool all() const noexcept; + _LIBCUDACXX_INLINE_VISIBILITY bool any() const noexcept; + _LIBCUDACXX_INLINE_VISIBILITY bool none() const noexcept + { + return !any(); + } + _LIBCUDACXX_INLINE_VISIBILITY bitset operator<<(size_t __pos) const noexcept; + _LIBCUDACXX_INLINE_VISIBILITY bitset operator>>(size_t __pos) const noexcept; private: + _LIBCUDACXX_INLINE_VISIBILITY size_t __hash_code() const noexcept + { + return base::__hash_code(); + } - _LIBCUDACXX_INLINE_VISIBILITY - size_t __hash_code() const noexcept {return base::__hash_code();} - - friend struct hash; + friend struct hash; }; template -template -bitset<_Size>::bitset(const _CharT* __str, - typename basic_string<_CharT>::size_type __n, - _CharT __zero, _CharT __one) +template +bitset<_Size>::bitset(const _CharT* __str, typename basic_string<_CharT>::size_type __n, _CharT __zero, _CharT __one) { - size_t __rlen = _CUDA_VSTD::min(__n, char_traits<_CharT>::length(__str)); - for (size_t __i = 0; __i < __rlen; ++__i) - if (__str[__i] != __zero && __str[__i] != __one) - __throw_invalid_argument("bitset string ctor has invalid argument"); - - size_t _Mp = _CUDA_VSTD::min(__rlen, _Size); - size_t __i = 0; - for (; __i < _Mp; ++__i) + size_t __rlen = _CUDA_VSTD::min(__n, char_traits<_CharT>::length(__str)); + for (size_t __i = 0; __i < __rlen; ++__i) + { + if (__str[__i] != __zero && __str[__i] != __one) { - _CharT __c = __str[_Mp - 1 - __i]; - if (__c == __zero) - (*this)[__i] = false; - else - (*this)[__i] = true; + __throw_invalid_argument("bitset string ctor has invalid argument"); } - _CUDA_VSTD::fill(base::__make_iter(__i), base::__make_iter(_Size), false); + } + + size_t _Mp = _CUDA_VSTD::min(__rlen, _Size); + size_t __i = 0; + for (; __i < _Mp; ++__i) + { + _CharT __c = __str[_Mp - 1 - __i]; + if (__c == __zero) + { + (*this)[__i] = false; + } + else + { + (*this)[__i] = true; + } + } + _CUDA_VSTD::fill(base::__make_iter(__i), base::__make_iter(_Size), false); } template -template -bitset<_Size>::bitset(const basic_string<_CharT,_Traits,_Allocator>& __str, - typename basic_string<_CharT,_Traits,_Allocator>::size_type __pos, - typename basic_string<_CharT,_Traits,_Allocator>::size_type __n, - _CharT __zero, _CharT __one) -{ - if (__pos > __str.size()) - __throw_out_of_range("bitset string pos out of range"); - - size_t __rlen = _CUDA_VSTD::min(__n, __str.size() - __pos); - for (size_t __i = __pos; __i < __pos + __rlen; ++__i) - if (!_Traits::eq(__str[__i], __zero) && !_Traits::eq(__str[__i], __one)) - __throw_invalid_argument("bitset string ctor has invalid argument"); - - size_t _Mp = _CUDA_VSTD::min(__rlen, _Size); - size_t __i = 0; - for (; __i < _Mp; ++__i) +template +bitset<_Size>::bitset( + const basic_string<_CharT, _Traits, 
_Allocator>& __str, + typename basic_string<_CharT, _Traits, _Allocator>::size_type __pos, + typename basic_string<_CharT, _Traits, _Allocator>::size_type __n, + _CharT __zero, + _CharT __one) +{ + if (__pos > __str.size()) + { + __throw_out_of_range("bitset string pos out of range"); + } + + size_t __rlen = _CUDA_VSTD::min(__n, __str.size() - __pos); + for (size_t __i = __pos; __i < __pos + __rlen; ++__i) + { + if (!_Traits::eq(__str[__i], __zero) && !_Traits::eq(__str[__i], __one)) + { + __throw_invalid_argument("bitset string ctor has invalid argument"); + } + } + + size_t _Mp = _CUDA_VSTD::min(__rlen, _Size); + size_t __i = 0; + for (; __i < _Mp; ++__i) + { + _CharT __c = __str[__pos + _Mp - 1 - __i]; + if (_Traits::eq(__c, __zero)) { - _CharT __c = __str[__pos + _Mp - 1 - __i]; - if (_Traits::eq(__c, __zero)) - (*this)[__i] = false; - else - (*this)[__i] = true; + (*this)[__i] = false; } - _CUDA_VSTD::fill(base::__make_iter(__i), base::__make_iter(_Size), false); + else + { + (*this)[__i] = true; + } + } + _CUDA_VSTD::fill(base::__make_iter(__i), base::__make_iter(_Size), false); } template -inline -bitset<_Size>& -bitset<_Size>::operator&=(const bitset& __rhs) noexcept +inline bitset<_Size>& bitset<_Size>::operator&=(const bitset& __rhs) noexcept { - base::operator&=(__rhs); - return *this; + base::operator&=(__rhs); + return *this; } template -inline -bitset<_Size>& -bitset<_Size>::operator|=(const bitset& __rhs) noexcept +inline bitset<_Size>& bitset<_Size>::operator|=(const bitset& __rhs) noexcept { - base::operator|=(__rhs); - return *this; + base::operator|=(__rhs); + return *this; } template -inline -bitset<_Size>& -bitset<_Size>::operator^=(const bitset& __rhs) noexcept +inline bitset<_Size>& bitset<_Size>::operator^=(const bitset& __rhs) noexcept { - base::operator^=(__rhs); - return *this; + base::operator^=(__rhs); + return *this; } template -bitset<_Size>& -bitset<_Size>::operator<<=(size_t __pos) noexcept +bitset<_Size>& bitset<_Size>::operator<<=(size_t __pos) noexcept { - __pos = _CUDA_VSTD::min(__pos, _Size); - _CUDA_VSTD::copy_backward(base::__make_iter(0), base::__make_iter(_Size - __pos), base::__make_iter(_Size)); - _CUDA_VSTD::fill_n(base::__make_iter(0), __pos, false); - return *this; + __pos = _CUDA_VSTD::min(__pos, _Size); + _CUDA_VSTD::copy_backward(base::__make_iter(0), base::__make_iter(_Size - __pos), base::__make_iter(_Size)); + _CUDA_VSTD::fill_n(base::__make_iter(0), __pos, false); + return *this; } template -bitset<_Size>& -bitset<_Size>::operator>>=(size_t __pos) noexcept +bitset<_Size>& bitset<_Size>::operator>>=(size_t __pos) noexcept { - __pos = _CUDA_VSTD::min(__pos, _Size); - _CUDA_VSTD::copy(base::__make_iter(__pos), base::__make_iter(_Size), base::__make_iter(0)); - _CUDA_VSTD::fill_n(base::__make_iter(_Size - __pos), __pos, false); - return *this; + __pos = _CUDA_VSTD::min(__pos, _Size); + _CUDA_VSTD::copy(base::__make_iter(__pos), base::__make_iter(_Size), base::__make_iter(0)); + _CUDA_VSTD::fill_n(base::__make_iter(_Size - __pos), __pos, false); + return *this; } template -inline -bitset<_Size>& -bitset<_Size>::set() noexcept +inline bitset<_Size>& bitset<_Size>::set() noexcept { - _CUDA_VSTD::fill_n(base::__make_iter(0), _Size, true); - return *this; + _CUDA_VSTD::fill_n(base::__make_iter(0), _Size, true); + return *this; } template -bitset<_Size>& -bitset<_Size>::set(size_t __pos, bool __val) +bitset<_Size>& bitset<_Size>::set(size_t __pos, bool __val) { - if (__pos >= _Size) - __throw_out_of_range("bitset set argument out of range"); + if 
(__pos >= _Size) + { + __throw_out_of_range("bitset set argument out of range"); + } - (*this)[__pos] = __val; - return *this; + (*this)[__pos] = __val; + return *this; } template -inline -bitset<_Size>& -bitset<_Size>::reset() noexcept +inline bitset<_Size>& bitset<_Size>::reset() noexcept { - _CUDA_VSTD::fill_n(base::__make_iter(0), _Size, false); - return *this; + _CUDA_VSTD::fill_n(base::__make_iter(0), _Size, false); + return *this; } template -bitset<_Size>& -bitset<_Size>::reset(size_t __pos) +bitset<_Size>& bitset<_Size>::reset(size_t __pos) { - if (__pos >= _Size) - __throw_out_of_range("bitset reset argument out of range"); + if (__pos >= _Size) + { + __throw_out_of_range("bitset reset argument out of range"); + } - (*this)[__pos] = false; - return *this; + (*this)[__pos] = false; + return *this; } template -inline -bitset<_Size> -bitset<_Size>::operator~() const noexcept +inline bitset<_Size> bitset<_Size>::operator~() const noexcept { - bitset __x(*this); - __x.flip(); - return __x; + bitset __x(*this); + __x.flip(); + return __x; } template -inline -bitset<_Size>& -bitset<_Size>::flip() noexcept +inline bitset<_Size>& bitset<_Size>::flip() noexcept { - base::flip(); - return *this; + base::flip(); + return *this; } template -bitset<_Size>& -bitset<_Size>::flip(size_t __pos) +bitset<_Size>& bitset<_Size>::flip(size_t __pos) { - if (__pos >= _Size) - __throw_out_of_range("bitset flip argument out of range"); + if (__pos >= _Size) + { + __throw_out_of_range("bitset flip argument out of range"); + } - reference r = base::__make_ref(__pos); - r = ~r; - return *this; + reference r = base::__make_ref(__pos); + r = ~r; + return *this; } template -inline -unsigned long -bitset<_Size>::to_ulong() const +inline unsigned long bitset<_Size>::to_ulong() const { - return base::to_ulong(); + return base::to_ulong(); } template -inline -unsigned long long -bitset<_Size>::to_ullong() const +inline unsigned long long bitset<_Size>::to_ullong() const { - return base::to_ullong(); + return base::to_ullong(); } template template -basic_string<_CharT, _Traits, _Allocator> -bitset<_Size>::to_string(_CharT __zero, _CharT __one) const +basic_string<_CharT, _Traits, _Allocator> bitset<_Size>::to_string(_CharT __zero, _CharT __one) const { - basic_string<_CharT, _Traits, _Allocator> __r(_Size, __zero); - for (size_t __i = 0; __i < _Size; ++__i) + basic_string<_CharT, _Traits, _Allocator> __r(_Size, __zero); + for (size_t __i = 0; __i < _Size; ++__i) + { + if ((*this)[__i]) { - if ((*this)[__i]) - __r[_Size - 1 - __i] = __one; + __r[_Size - 1 - __i] = __one; } - return __r; + } + return __r; } template template -inline -basic_string<_CharT, _Traits, allocator<_CharT> > -bitset<_Size>::to_string(_CharT __zero, _CharT __one) const +inline basic_string<_CharT, _Traits, allocator<_CharT>> bitset<_Size>::to_string(_CharT __zero, _CharT __one) const { - return to_string<_CharT, _Traits, allocator<_CharT> >(__zero, __one); + return to_string<_CharT, _Traits, allocator<_CharT>>(__zero, __one); } template template -inline -basic_string<_CharT, char_traits<_CharT>, allocator<_CharT> > +inline basic_string<_CharT, char_traits<_CharT>, allocator<_CharT>> bitset<_Size>::to_string(_CharT __zero, _CharT __one) const { - return to_string<_CharT, char_traits<_CharT>, allocator<_CharT> >(__zero, __one); + return to_string<_CharT, char_traits<_CharT>, allocator<_CharT>>(__zero, __one); } template -inline -basic_string, allocator > -bitset<_Size>::to_string(char __zero, char __one) const +inline basic_string, allocator> 
bitset<_Size>::to_string(char __zero, char __one) const { - return to_string, allocator >(__zero, __one); + return to_string, allocator>(__zero, __one); } template -inline -size_t -bitset<_Size>::count() const noexcept +inline size_t bitset<_Size>::count() const noexcept { - return static_cast(__count_bool_true(base::__make_iter(0), _Size)); + return static_cast(__count_bool_true(base::__make_iter(0), _Size)); } template -inline -bool -bitset<_Size>::operator==(const bitset& __rhs) const noexcept +inline bool bitset<_Size>::operator==(const bitset& __rhs) const noexcept { - return _CUDA_VSTD::equal(base::__make_iter(0), base::__make_iter(_Size), __rhs.__make_iter(0)); + return _CUDA_VSTD::equal(base::__make_iter(0), base::__make_iter(_Size), __rhs.__make_iter(0)); } template -inline -bool -bitset<_Size>::operator!=(const bitset& __rhs) const noexcept +inline bool bitset<_Size>::operator!=(const bitset& __rhs) const noexcept { - return !(*this == __rhs); + return !(*this == __rhs); } template -bool -bitset<_Size>::test(size_t __pos) const +bool bitset<_Size>::test(size_t __pos) const { - if (__pos >= _Size) - __throw_out_of_range("bitset test argument out of range"); + if (__pos >= _Size) + { + __throw_out_of_range("bitset test argument out of range"); + } - return (*this)[__pos]; + return (*this)[__pos]; } template -inline -bool -bitset<_Size>::all() const noexcept +inline bool bitset<_Size>::all() const noexcept { - return base::all(); + return base::all(); } template -inline -bool -bitset<_Size>::any() const noexcept +inline bool bitset<_Size>::any() const noexcept { - return base::any(); + return base::any(); } template -inline -bitset<_Size> -bitset<_Size>::operator<<(size_t __pos) const noexcept +inline bitset<_Size> bitset<_Size>::operator<<(size_t __pos) const noexcept { - bitset __r = *this; - __r <<= __pos; - return __r; + bitset __r = *this; + __r <<= __pos; + return __r; } template -inline -bitset<_Size> -bitset<_Size>::operator>>(size_t __pos) const noexcept +inline bitset<_Size> bitset<_Size>::operator>>(size_t __pos) const noexcept { - bitset __r = *this; - __r >>= __pos; - return __r; + bitset __r = *this; + __r >>= __pos; + return __r; } template -inline _LIBCUDACXX_INLINE_VISIBILITY -bitset<_Size> -operator&(const bitset<_Size>& __x, const bitset<_Size>& __y) noexcept +inline _LIBCUDACXX_INLINE_VISIBILITY bitset<_Size> operator&(const bitset<_Size>& __x, const bitset<_Size>& __y) noexcept { - bitset<_Size> __r = __x; - __r &= __y; - return __r; + bitset<_Size> __r = __x; + __r &= __y; + return __r; } template -inline _LIBCUDACXX_INLINE_VISIBILITY -bitset<_Size> -operator|(const bitset<_Size>& __x, const bitset<_Size>& __y) noexcept +inline _LIBCUDACXX_INLINE_VISIBILITY bitset<_Size> operator|(const bitset<_Size>& __x, const bitset<_Size>& __y) noexcept { - bitset<_Size> __r = __x; - __r |= __y; - return __r; + bitset<_Size> __r = __x; + __r |= __y; + return __r; } template -inline _LIBCUDACXX_INLINE_VISIBILITY -bitset<_Size> -operator^(const bitset<_Size>& __x, const bitset<_Size>& __y) noexcept +inline _LIBCUDACXX_INLINE_VISIBILITY bitset<_Size> operator^(const bitset<_Size>& __x, const bitset<_Size>& __y) noexcept { - bitset<_Size> __r = __x; - __r ^= __y; - return __r; + bitset<_Size> __r = __x; + __r ^= __y; + return __r; } template -struct _LIBCUDACXX_TEMPLATE_VIS hash > - : public __unary_function, size_t> +struct _LIBCUDACXX_TEMPLATE_VIS hash> : public __unary_function, size_t> { - _LIBCUDACXX_INLINE_VISIBILITY - size_t operator()(const bitset<_Size>& __bs) const 
noexcept - {return __bs.__hash_code();} + _LIBCUDACXX_INLINE_VISIBILITY size_t operator()(const bitset<_Size>& __bs) const noexcept + { + return __bs.__hash_code(); + } }; template -basic_istream<_CharT, _Traits>& -operator>>(basic_istream<_CharT, _Traits>& __is, bitset<_Size>& __x); +basic_istream<_CharT, _Traits>& operator>>(basic_istream<_CharT, _Traits>& __is, bitset<_Size>& __x); template -basic_ostream<_CharT, _Traits>& -operator<<(basic_ostream<_CharT, _Traits>& __os, const bitset<_Size>& __x); +basic_ostream<_CharT, _Traits>& operator<<(basic_ostream<_CharT, _Traits>& __os, const bitset<_Size>& __x); _LIBCUDACXX_END_NAMESPACE_STD _LIBCUDACXX_POP_MACROS -#endif // _LIBCUDACXX_BITSET +#endif // _LIBCUDACXX_BITSET diff --git a/libcudacxx/include/cuda/std/detail/libcxx/include/concepts b/libcudacxx/include/cuda/std/detail/libcxx/include/concepts index 15f041190c0..24995197262 100644 --- a/libcudacxx/include/cuda/std/detail/libcxx/include/concepts +++ b/libcudacxx/include/cuda/std/detail/libcxx/include/concepts @@ -140,7 +140,6 @@ namespace std { # pragma system_header #endif // no system header -#include // all public C++ headers provide the assertion handler #include #include #include @@ -166,7 +165,7 @@ namespace std { #include #include #include - +#include // all public C++ headers provide the assertion handler #include #endif // _LIBCUDACXX_CONCEPTS diff --git a/libcudacxx/include/cuda/std/detail/libcxx/include/version b/libcudacxx/include/cuda/std/detail/libcxx/include/version index 9c81e18dcb2..08f33681920 100644 --- a/libcudacxx/include/cuda/std/detail/libcxx/include/version +++ b/libcudacxx/include/cuda/std/detail/libcxx/include/version @@ -10,7 +10,6 @@ #ifndef _LIBCUDACXX_VERSIONH #define _LIBCUDACXX_VERSIONH - /* version synopsis @@ -212,131 +211,131 @@ __cpp_lib_void_t 201411L // We need to define our own macros to not conflict with the host stl. 
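[Editor's aside, not part of the patch] The remainder of this <version> hunk only re-indents the __cccl_lib_* feature-test macros to clang-format's nested-# style; as far as the hunk shows, no macro name or value changes. Downstream code checks them in the usual way, for example (the macro name and value are copied from the definitions below; the guarded code is a placeholder):

#include <cuda/std/version>

#if defined(__cccl_lib_span) && __cccl_lib_span >= 202002L
// code that relies on cuda::std::span can be compiled here
#endif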
// At the same time we want bring in all feature test macros from host #if __has_include() // should be the smallest include possible -#include +# include #elif !defined(_CCCL_COMPILER_NVRTC) -#include // otherwise go for the smallest possible header +# include // otherwise go for the smallest possible header #endif #if _CCCL_STD_VER > 2011 -# define __cccl_lib_chrono_udls 201304L -# define __cccl_lib_complex_udls 201309L -#ifdef _LIBCUDACXX_IS_CONSTANT_EVALUATED -# define __cccl_lib_constexpr_complex 201711L -#endif -# define __cccl_lib_concepts 202002L -# define __cccl_lib_exchange_function 201304L -# define __cccl_lib_expected 202211L +# define __cccl_lib_chrono_udls 201304L +# define __cccl_lib_complex_udls 201309L +# ifdef _LIBCUDACXX_IS_CONSTANT_EVALUATED +# define __cccl_lib_constexpr_complex 201711L +# endif +# define __cccl_lib_concepts 202002L +# define __cccl_lib_exchange_function 201304L +# define __cccl_lib_expected 202211L // # define __cccl_lib_generic_associative_lookup 201304L -# define __cccl_lib_integer_sequence 201304L -# define __cccl_lib_integral_constant_callable 201304L -# define __cccl_lib_is_final 201402L -# define __cccl_lib_is_null_pointer 201309L -# define __cccl_lib_make_reverse_iterator 201402L +# define __cccl_lib_integer_sequence 201304L +# define __cccl_lib_integral_constant_callable 201304L +# define __cccl_lib_is_final 201402L +# define __cccl_lib_is_null_pointer 201309L +# define __cccl_lib_make_reverse_iterator 201402L // # define __cccl_lib_make_unique 201304L -# define __cccl_lib_null_iterators 201304L -# define __cccl_lib_optional 202110L +# define __cccl_lib_null_iterators 201304L +# define __cccl_lib_optional 202110L // # define __cccl_lib_quoted_string_io 201304L -# define __cccl_lib_result_of_sfinae 201210L -# define __cccl_lib_robust_nonmodifying_seq_ops 201304L -# if !defined(_LIBCUDACXX_HAS_NO_THREADS) +# define __cccl_lib_result_of_sfinae 201210L +# define __cccl_lib_robust_nonmodifying_seq_ops 201304L +# if !defined(_LIBCUDACXX_HAS_NO_THREADS) // # define __cccl_lib_shared_timed_mutex 201402L -# endif -# define __cccl_lib_span 202002L +# endif +# define __cccl_lib_span 202002L // # define __cccl_lib_string_udls 201304L -# define __cccl_lib_transformation_trait_aliases 201304L -# define __cccl_lib_transparent_operators 201210L -# define __cccl_lib_tuple_element_t 201402L -# define __cccl_lib_tuples_by_type 201304L +# define __cccl_lib_transformation_trait_aliases 201304L +# define __cccl_lib_transparent_operators 201210L +# define __cccl_lib_tuple_element_t 201402L +# define __cccl_lib_tuples_by_type 201304L #endif // _CCCL_STD_VER > 2011 #if _CCCL_STD_VER > 2014 -# if defined(_LIBCUDACXX_ADDRESSOF) -# define __cccl_lib_addressof_constexpr 201603L -# endif +# if defined(_LIBCUDACXX_ADDRESSOF) +# define __cccl_lib_addressof_constexpr 201603L +# endif // # define __cccl_lib_allocator_traits_is_always_equal 201411L // # define __cccl_lib_any 201606L -# define __cccl_lib_apply 201603L -# define __cccl_lib_array_constexpr 201603L -# define __cccl_lib_as_const 201510L -# if !defined(_LIBCUDACXX_HAS_NO_THREADS) -# define __cccl_lib_atomic_is_always_lock_free 201603L -# endif -# define __cccl_lib_bind_front 201907L -# define __cccl_lib_bool_constant 201505L +# define __cccl_lib_apply 201603L +# define __cccl_lib_array_constexpr 201603L +# define __cccl_lib_as_const 201510L +# if !defined(_LIBCUDACXX_HAS_NO_THREADS) +# define __cccl_lib_atomic_is_always_lock_free 201603L +# endif +# define __cccl_lib_bind_front 201907L +# define 
__cccl_lib_bool_constant 201505L // # define __cccl_lib_boyer_moore_searcher 201603L -# define __cccl_lib_byte 201603L -# define __cccl_lib_chrono 201611L +# define __cccl_lib_byte 201603L +# define __cccl_lib_chrono 201611L // # define __cccl_lib_clamp 201603L // # define __cccl_lib_enable_shared_from_this 201603L // # define __cccl_lib_execution 201603L // # define __cccl_lib_filesystem 201703L -# define __cccl_lib_gcd_lcm 201606L -# define __cccl_lib_hardware_interference_size 201703L -# if defined(_LIBCUDACXX_HAS_UNIQUE_OBJECT_REPRESENTATIONS) -# define __cccl_lib_has_unique_object_representations 201606L -# endif -# define __cccl_lib_hypot 201603L +# define __cccl_lib_gcd_lcm 201606L +# define __cccl_lib_hardware_interference_size 201703L +# if defined(_LIBCUDACXX_HAS_UNIQUE_OBJECT_REPRESENTATIONS) +# define __cccl_lib_has_unique_object_representations 201606L +# endif +# define __cccl_lib_hypot 201603L // # define __cccl_lib_incomplete_container_elements 201505L -# define __cccl_lib_invoke 201411L -# if !defined(_LIBCUDACXX_HAS_NO_IS_AGGREGATE) -# define __cccl_lib_is_aggregate 201703L -# endif -# define __cccl_lib_is_invocable 201703L -# define __cccl_lib_is_swappable 201603L -# define __cccl_lib_launder 201606L -# define __cccl_lib_logical_traits 201510L -# define __cccl_lib_make_from_tuple 201606L +# define __cccl_lib_invoke 201411L +# if !defined(_LIBCUDACXX_HAS_NO_IS_AGGREGATE) +# define __cccl_lib_is_aggregate 201703L +# endif +# define __cccl_lib_is_invocable 201703L +# define __cccl_lib_is_swappable 201603L +# define __cccl_lib_launder 201606L +# define __cccl_lib_logical_traits 201510L +# define __cccl_lib_make_from_tuple 201606L // # define __cccl_lib_map_try_emplace 201411L // # define __cccl_lib_math_special_functions 201603L // # define __cccl_lib_memory_resource 201603L // # define __cccl_lib_node_extract 201606L // # define __cccl_lib_nonmember_container_access 201411L -# define __cccl_lib_not_fn 201603L +# define __cccl_lib_not_fn 201603L // # define __cccl_lib_parallel_algorithm 201603L // # define __cccl_lib_raw_memory_algorithms 201606L // # define __cccl_lib_sample 201603L // # define __cccl_lib_scoped_lock 201703L -# if !defined(_LIBCUDACXX_HAS_NO_THREADS) +# if !defined(_LIBCUDACXX_HAS_NO_THREADS) // # define __cccl_lib_shared_mutex 201505L -# endif +# endif // # define __cccl_lib_shared_ptr_arrays 201611L // # define __cccl_lib_shared_ptr_weak_type 201606L // # define __cccl_lib_string_view 201606L // # define __cccl_lib_to_chars 201611L -# define __cccl_lib_type_trait_variable_templates 201510L -# define __cccl_lib_uncaught_exceptions 201411L -# define __cccl_lib_unordered_map_try_emplace 201411L -# define __cccl_lib_variant 201606L -# define __cccl_lib_void_t 201411L +# define __cccl_lib_type_trait_variable_templates 201510L +# define __cccl_lib_uncaught_exceptions 201411L +# define __cccl_lib_unordered_map_try_emplace 201411L +# define __cccl_lib_variant 201606L +# define __cccl_lib_void_t 201411L #endif // _CCCL_STD_VER > 2014 #if _CCCL_STD_VER > 2017 -# undef __cccl_lib_array_constexpr -# define __cccl_lib_array_constexpr 201811L +# undef __cccl_lib_array_constexpr +# define __cccl_lib_array_constexpr 201811L // # define __cccl_lib_assume_aligned 201811L -# define __cccl_lib_atomic_flag_test 201907L -# define __cccl_lib_atomic_float 201711L -# define __cccl_lib_atomic_lock_free_type_aliases 201907L -# if !defined(_LIBCUDACXX_HAS_NO_THREADS) -# define __cccl_lib_atomic_ref 201806L -#endif +# define __cccl_lib_atomic_flag_test 201907L +# define 
__cccl_lib_atomic_float 201711L +# define __cccl_lib_atomic_lock_free_type_aliases 201907L +# if !defined(_LIBCUDACXX_HAS_NO_THREADS) +# define __cccl_lib_atomic_ref 201806L +# endif // # define __cccl_lib_atomic_shared_ptr 201711L -# define __cccl_lib_atomic_value_initialization 201911L -# if !defined(_LIBCUDACXX_AVAILABILITY_DISABLE_FTM___cpp_lib_atomic_wait) -# define __cccl_lib_atomic_wait 201907L -# endif -# if !defined(_LIBCUDACXX_HAS_NO_THREADS) && !defined(_LIBCUDACXX_AVAILABILITY_DISABLE_FTM___cpp_lib_barrier) -# define __cccl_lib_barrier 201907L -# endif -# define __cccl_lib_bit_cast 201806L -# define __cccl_lib_bitops 201907L -# define __cccl_lib_bounded_array_traits 201902L -# if !defined(_LIBCUDACXX_NO_HAS_CHAR8_T) -# define __cccl_lib_char8_t 201811L -# endif +# define __cccl_lib_atomic_value_initialization 201911L +# if !defined(_LIBCUDACXX_AVAILABILITY_DISABLE_FTM___cpp_lib_atomic_wait) +# define __cccl_lib_atomic_wait 201907L +# endif +# if !defined(_LIBCUDACXX_HAS_NO_THREADS) && !defined(_LIBCUDACXX_AVAILABILITY_DISABLE_FTM___cpp_lib_barrier) +# define __cccl_lib_barrier 201907L +# endif +# define __cccl_lib_bit_cast 201806L +# define __cccl_lib_bitops 201907L +# define __cccl_lib_bounded_array_traits 201902L +# if !defined(_LIBCUDACXX_NO_HAS_CHAR8_T) +# define __cccl_lib_char8_t 201811L +# endif // # define __cccl_lib_constexpr_algorithms 201806L // # define __cccl_lib_constexpr_dynamic_alloc 201907L -# define __cccl_lib_constexpr_functional 201907L +# define __cccl_lib_constexpr_functional 201907L // # define __cccl_lib_constexpr_iterator 201811L // # define __cccl_lib_constexpr_memory 201811L // # define __cccl_lib_constexpr_misc 201811L @@ -348,40 +347,41 @@ __cpp_lib_void_t 201411L // # define __cccl_lib_constexpr_utility 201811L // # define __cccl_lib_constexpr_vector 201907L // # define __cccl_lib_coroutine 201902L -# if defined(__cpp_impl_destroying_delete) && __cpp_impl_destroying_delete >= 201806L && defined(__cpp_lib_destroying_delete) -# define __cccl_lib_destroying_delete 201806L -# endif +# if defined(__cpp_impl_destroying_delete) && __cpp_impl_destroying_delete >= 201806L \ + && defined(__cpp_lib_destroying_delete) +# define __cccl_lib_destroying_delete 201806L +# endif // # define __cccl_lib_endian 201907L // # define __cccl_lib_erase_if 201811L // # undef __cccl_lib_execution // # define __cccl_lib_execution 201902L -# if !defined(_LIBCUDACXX_AVAILABILITY_DISABLE_FTM___cpp_lib_format) && !defined(_LIBCUDACXX_HAS_NO_INCOMPLETE_FORMAT) +# if !defined(_LIBCUDACXX_AVAILABILITY_DISABLE_FTM___cpp_lib_format) && !defined(_LIBCUDACXX_HAS_NO_INCOMPLETE_FORMAT) // # define __cccl_lib_format 202106L -# endif +# endif // # define __cccl_lib_generic_unordered_lookup 201811L // # define __cccl_lib_int_pow2 202002L // # define __cccl_lib_integer_comparison_functions 202002L // # define __cccl_lib_interpolate 201902L -# if defined(_LIBCUDACXX_IS_CONSTANT_EVALUATED) -# define __cccl_lib_is_constant_evaluated 201811L -# endif +# if defined(_LIBCUDACXX_IS_CONSTANT_EVALUATED) +# define __cccl_lib_is_constant_evaluated 201811L +# endif // # define __cccl_lib_is_layout_compatible 201907L -# define __cccl_lib_is_nothrow_convertible 201806L +# define __cccl_lib_is_nothrow_convertible 201806L // # define __cccl_lib_is_pointer_interconvertible 201907L -# if !defined(_LIBCUDACXX_HAS_NO_THREADS) +# if !defined(_LIBCUDACXX_HAS_NO_THREADS) // # define __cccl_lib_jthread 201911L -# endif -# if !defined(_LIBCUDACXX_HAS_NO_THREADS) && 
!defined(_LIBCUDACXX_AVAILABILITY_DISABLE_FTM___cpp_lib_latch) +# endif +# if !defined(_LIBCUDACXX_HAS_NO_THREADS) && !defined(_LIBCUDACXX_AVAILABILITY_DISABLE_FTM___cpp_lib_latch) // # define __cccl_lib_latch 201907L -# endif +# endif // # define __cccl_lib_list_remove_return_type 201806L // # define __cccl_lib_math_constants 201907L // # define __cccl_lib_polymorphic_allocator 201902L // # define __cccl_lib_ranges 201811L // # define __cccl_lib_remove_cvref 201711L -# if !defined(_LIBCUDACXX_HAS_NO_THREADS) && !defined(_LIBCUDACXX_AVAILABILITY_DISABLE_FTM___cpp_lib_semaphore) +# if !defined(_LIBCUDACXX_HAS_NO_THREADS) && !defined(_LIBCUDACXX_AVAILABILITY_DISABLE_FTM___cpp_lib_semaphore) // # define __cccl_lib_semaphore 201907L -# endif +# endif // # undef __cccl_lib_shared_ptr_arrays // # define __cccl_lib_shared_ptr_arrays 201707L // # define __cccl_lib_shift 201806L @@ -396,7 +396,7 @@ __cpp_lib_void_t 201411L // # define __cccl_lib_to_address 201711L // # define __cccl_lib_to_array 201907L // # define __cccl_lib_type_identity 201806L -# define __cccl_lib_unwrap_ref 201811L +# define __cccl_lib_unwrap_ref 201811L #endif // _CCCL_STD_VER > 2017 #if _CCCL_STD_VER > 2020 @@ -411,9 +411,9 @@ __cpp_lib_void_t 201411L // # undef __cccl_lib_constexpr_memory // # define __cccl_lib_constexpr_memory 202202L // # define __cccl_lib_constexpr_typeinfo 202106L -# define __cccl_lib_forward_like 202207L +# define __cccl_lib_forward_like 202207L // # define __cccl_lib_invoke_r 202106L -# define __cccl_lib_is_scoped_enum 202011L +# define __cccl_lib_is_scoped_enum 202011L // # define __cccl_lib_move_only_function 202110L // # define __cccl_lib_out_ptr 202106L // # define __cccl_lib_ranges_chunk 202202L @@ -430,8 +430,8 @@ __cpp_lib_void_t 201411L // # define __cccl_lib_stdatomic_h 202011L // # define __cccl_lib_string_contains 202011L // # define __cccl_lib_string_resize_and_overwrite 202110L -# define __cccl_lib_to_underlying 202102L -# define __cccl_lib_unreachable 202202L +# define __cccl_lib_to_underlying 202102L +# define __cccl_lib_unreachable 202202L #endif // _CCCL_STD_VER > 2020 diff --git a/libcudacxx/include/cuda/std/expected b/libcudacxx/include/cuda/std/expected index 9469e699f34..1448d03e6f5 100644 --- a/libcudacxx/include/cuda/std/expected +++ b/libcudacxx/include/cuda/std/expected @@ -12,10 +12,20 @@ #include +#if defined(_CCCL_IMPLICIT_SYSTEM_HEADER_GCC) +# pragma GCC system_header +#elif defined(_CCCL_IMPLICIT_SYSTEM_HEADER_CLANG) +# pragma clang system_header +#elif defined(_CCCL_IMPLICIT_SYSTEM_HEADER_MSVC) +# pragma system_header +#endif // no system header + +// clang-format off #include #include #include +// clang-format on #endif //_CUDA_STD_EXPECTED diff --git a/libcudacxx/include/cuda/std/functional b/libcudacxx/include/cuda/std/functional index 042e4b4b072..aa04b2b17ca 100644 --- a/libcudacxx/include/cuda/std/functional +++ b/libcudacxx/include/cuda/std/functional @@ -13,10 +13,20 @@ #include +#if defined(_CCCL_IMPLICIT_SYSTEM_HEADER_GCC) +# pragma GCC system_header +#elif defined(_CCCL_IMPLICIT_SYSTEM_HEADER_CLANG) +# pragma clang system_header +#elif defined(_CCCL_IMPLICIT_SYSTEM_HEADER_MSVC) +# pragma system_header +#endif // no system header + +// clang-format off #include #include #include +// clang-format on #endif // _CUDA_STD_FUNCTIONAL diff --git a/libcudacxx/include/cuda/std/initializer_list b/libcudacxx/include/cuda/std/initializer_list index 24296620e85..5bbda785a96 100644 --- a/libcudacxx/include/cuda/std/initializer_list +++ 
b/libcudacxx/include/cuda/std/initializer_list @@ -12,10 +12,20 @@ #include +#if defined(_CCCL_IMPLICIT_SYSTEM_HEADER_GCC) +# pragma GCC system_header +#elif defined(_CCCL_IMPLICIT_SYSTEM_HEADER_CLANG) +# pragma clang system_header +#elif defined(_CCCL_IMPLICIT_SYSTEM_HEADER_MSVC) +# pragma system_header +#endif // no system header + +// clang-format off #include #include #include +// clang-format on #endif // _CUDA_STD_INITIALIZER_LIST diff --git a/libcudacxx/include/cuda/std/iterator b/libcudacxx/include/cuda/std/iterator index 08568f18628..2bcceb7520e 100644 --- a/libcudacxx/include/cuda/std/iterator +++ b/libcudacxx/include/cuda/std/iterator @@ -12,10 +12,20 @@ #include +#if defined(_CCCL_IMPLICIT_SYSTEM_HEADER_GCC) +# pragma GCC system_header +#elif defined(_CCCL_IMPLICIT_SYSTEM_HEADER_CLANG) +# pragma clang system_header +#elif defined(_CCCL_IMPLICIT_SYSTEM_HEADER_MSVC) +# pragma system_header +#endif // no system header + +// clang-format off #include #include #include +// clang-format on #endif // _CUDA_STD_ITERATOR diff --git a/libcudacxx/include/cuda/std/latch b/libcudacxx/include/cuda/std/latch index fde9078ab4f..a01fd333116 100644 --- a/libcudacxx/include/cuda/std/latch +++ b/libcudacxx/include/cuda/std/latch @@ -8,19 +8,29 @@ // //===----------------------------------------------------------------------===// +#ifndef _CUDA_STD_LATCH +#define _CUDA_STD_LATCH + #if defined(__CUDA_ARCH__) && __CUDA_ARCH__ < 700 # error "CUDA synchronization primitives are only supported for sm_70 and up." #endif -#ifndef _CUDA_STD_LATCH -#define _CUDA_STD_LATCH - #include +#if defined(_CCCL_IMPLICIT_SYSTEM_HEADER_GCC) +# pragma GCC system_header +#elif defined(_CCCL_IMPLICIT_SYSTEM_HEADER_CLANG) +# pragma clang system_header +#elif defined(_CCCL_IMPLICIT_SYSTEM_HEADER_MSVC) +# pragma system_header +#endif // no system header + +// clang-format off #include #include #include +// clang-format on #endif // _CUDA_STD_LATCH diff --git a/libcudacxx/include/cuda/std/limits b/libcudacxx/include/cuda/std/limits index 16531da3da0..2ba9062d357 100644 --- a/libcudacxx/include/cuda/std/limits +++ b/libcudacxx/include/cuda/std/limits @@ -13,10 +13,20 @@ #include +#if defined(_CCCL_IMPLICIT_SYSTEM_HEADER_GCC) +# pragma GCC system_header +#elif defined(_CCCL_IMPLICIT_SYSTEM_HEADER_CLANG) +# pragma clang system_header +#elif defined(_CCCL_IMPLICIT_SYSTEM_HEADER_MSVC) +# pragma system_header +#endif // no system header + +// clang-format off #include #include #include +// clang-format on #endif // _CUDA_STD_LIMITS diff --git a/libcudacxx/include/cuda/std/mdspan b/libcudacxx/include/cuda/std/mdspan index e9522897ca0..1cc958bf22f 100644 --- a/libcudacxx/include/cuda/std/mdspan +++ b/libcudacxx/include/cuda/std/mdspan @@ -13,10 +13,20 @@ #include +#if defined(_CCCL_IMPLICIT_SYSTEM_HEADER_GCC) +# pragma GCC system_header +#elif defined(_CCCL_IMPLICIT_SYSTEM_HEADER_CLANG) +# pragma clang system_header +#elif defined(_CCCL_IMPLICIT_SYSTEM_HEADER_MSVC) +# pragma system_header +#endif // no system header + +// clang-format off #include #include #include +// clang-format on #endif // _CUDA_STD_MDSPAN diff --git a/libcudacxx/include/cuda/std/optional b/libcudacxx/include/cuda/std/optional index 5ecee7594fb..ca79559f6e4 100644 --- a/libcudacxx/include/cuda/std/optional +++ b/libcudacxx/include/cuda/std/optional @@ -12,10 +12,20 @@ #include +#if defined(_CCCL_IMPLICIT_SYSTEM_HEADER_GCC) +# pragma GCC system_header +#elif defined(_CCCL_IMPLICIT_SYSTEM_HEADER_CLANG) +# pragma clang system_header +#elif 
defined(_CCCL_IMPLICIT_SYSTEM_HEADER_MSVC) +# pragma system_header +#endif // no system header + +// clang-format off #include #include #include +// clang-format on #endif // _CUDA_STD_OPTIONAL diff --git a/libcudacxx/include/cuda/std/ranges b/libcudacxx/include/cuda/std/ranges index 56a06f65071..f8ea2dfed26 100644 --- a/libcudacxx/include/cuda/std/ranges +++ b/libcudacxx/include/cuda/std/ranges @@ -12,10 +12,20 @@ #include +#if defined(_CCCL_IMPLICIT_SYSTEM_HEADER_GCC) +# pragma GCC system_header +#elif defined(_CCCL_IMPLICIT_SYSTEM_HEADER_CLANG) +# pragma clang system_header +#elif defined(_CCCL_IMPLICIT_SYSTEM_HEADER_MSVC) +# pragma system_header +#endif // no system header + +// clang-format off #include #include #include +// clang-format on #endif //_CUDA_RANGES diff --git a/libcudacxx/include/cuda/std/ratio b/libcudacxx/include/cuda/std/ratio index 97425f38d1e..b737209b0dc 100644 --- a/libcudacxx/include/cuda/std/ratio +++ b/libcudacxx/include/cuda/std/ratio @@ -13,10 +13,20 @@ #include +#if defined(_CCCL_IMPLICIT_SYSTEM_HEADER_GCC) +# pragma GCC system_header +#elif defined(_CCCL_IMPLICIT_SYSTEM_HEADER_CLANG) +# pragma clang system_header +#elif defined(_CCCL_IMPLICIT_SYSTEM_HEADER_MSVC) +# pragma system_header +#endif // no system header + +// clang-format off #include #include #include +// clang-format on #endif // _CUDA_STD_RATIO diff --git a/libcudacxx/include/cuda/std/semaphore b/libcudacxx/include/cuda/std/semaphore index 5b7efef48a3..453aa36d81d 100644 --- a/libcudacxx/include/cuda/std/semaphore +++ b/libcudacxx/include/cuda/std/semaphore @@ -8,19 +8,29 @@ // //===----------------------------------------------------------------------===// +#ifndef _CUDA_STD_SEMAPHORE +#define _CUDA_STD_SEMAPHORE + #if defined(__CUDA_ARCH__) && __CUDA_ARCH__ < 700 # error "CUDA synchronization primitives are only supported for sm_70 and up." 
#endif -#ifndef _CUDA_STD_SEMAPHORE -#define _CUDA_STD_SEMAPHORE - #include +#if defined(_CCCL_IMPLICIT_SYSTEM_HEADER_GCC) +# pragma GCC system_header +#elif defined(_CCCL_IMPLICIT_SYSTEM_HEADER_CLANG) +# pragma clang system_header +#elif defined(_CCCL_IMPLICIT_SYSTEM_HEADER_MSVC) +# pragma system_header +#endif // no system header + +// clang-format off #include #include #include +// clang-format on #endif // _CUDA_STD_SEMAPHORE diff --git a/libcudacxx/include/cuda/std/span b/libcudacxx/include/cuda/std/span index 0388da66871..c62e5a2e17c 100644 --- a/libcudacxx/include/cuda/std/span +++ b/libcudacxx/include/cuda/std/span @@ -13,10 +13,20 @@ #include +#if defined(_CCCL_IMPLICIT_SYSTEM_HEADER_GCC) +# pragma GCC system_header +#elif defined(_CCCL_IMPLICIT_SYSTEM_HEADER_CLANG) +# pragma clang system_header +#elif defined(_CCCL_IMPLICIT_SYSTEM_HEADER_MSVC) +# pragma system_header +#endif // no system header + +// clang-format off #include #include #include +// clang-format on #endif // _CUDA_STD_SPAN diff --git a/libcudacxx/include/cuda/std/tuple b/libcudacxx/include/cuda/std/tuple index ee870be346c..d0ebc45b85d 100644 --- a/libcudacxx/include/cuda/std/tuple +++ b/libcudacxx/include/cuda/std/tuple @@ -13,10 +13,20 @@ #include +#if defined(_CCCL_IMPLICIT_SYSTEM_HEADER_GCC) +# pragma GCC system_header +#elif defined(_CCCL_IMPLICIT_SYSTEM_HEADER_CLANG) +# pragma clang system_header +#elif defined(_CCCL_IMPLICIT_SYSTEM_HEADER_MSVC) +# pragma system_header +#endif // no system header + +// clang-format off #include #include #include +// clang-format on #endif // _CUDA_STD_TUPLE diff --git a/libcudacxx/include/cuda/std/type_traits b/libcudacxx/include/cuda/std/type_traits index 9eee9b7830a..fe7ddb3d80e 100644 --- a/libcudacxx/include/cuda/std/type_traits +++ b/libcudacxx/include/cuda/std/type_traits @@ -13,10 +13,20 @@ #include +#if defined(_CCCL_IMPLICIT_SYSTEM_HEADER_GCC) +# pragma GCC system_header +#elif defined(_CCCL_IMPLICIT_SYSTEM_HEADER_CLANG) +# pragma clang system_header +#elif defined(_CCCL_IMPLICIT_SYSTEM_HEADER_MSVC) +# pragma system_header +#endif // no system header + +// clang-format off #include #include #include +// clang-format on #endif // _CUDA_STD_TYPE_TRAITS diff --git a/libcudacxx/include/cuda/std/utility b/libcudacxx/include/cuda/std/utility index de2b78ca814..d5e3715dc6f 100644 --- a/libcudacxx/include/cuda/std/utility +++ b/libcudacxx/include/cuda/std/utility @@ -13,10 +13,20 @@ #include +#if defined(_CCCL_IMPLICIT_SYSTEM_HEADER_GCC) +# pragma GCC system_header +#elif defined(_CCCL_IMPLICIT_SYSTEM_HEADER_CLANG) +# pragma clang system_header +#elif defined(_CCCL_IMPLICIT_SYSTEM_HEADER_MSVC) +# pragma system_header +#endif // no system header + +// clang-format off #include #include #include +// clang-format on #endif // _CUDA_STD_UTILITY diff --git a/libcudacxx/include/cuda/std/variant b/libcudacxx/include/cuda/std/variant index 28d59fc012b..039e3ebf6d3 100644 --- a/libcudacxx/include/cuda/std/variant +++ b/libcudacxx/include/cuda/std/variant @@ -12,10 +12,20 @@ #include +#if defined(_CCCL_IMPLICIT_SYSTEM_HEADER_GCC) +# pragma GCC system_header +#elif defined(_CCCL_IMPLICIT_SYSTEM_HEADER_CLANG) +# pragma clang system_header +#elif defined(_CCCL_IMPLICIT_SYSTEM_HEADER_MSVC) +# pragma system_header +#endif // no system header + +// clang-format off #include #include #include +// clang-format on #endif //_CUDA_STD_VARIANT diff --git a/libcudacxx/include/cuda/std/version b/libcudacxx/include/cuda/std/version index 2d0cbbe9aab..9a3627f9bbf 100644 --- 
a/libcudacxx/include/cuda/std/version +++ b/libcudacxx/include/cuda/std/version @@ -13,10 +13,20 @@ #include +#if defined(_CCCL_IMPLICIT_SYSTEM_HEADER_GCC) +# pragma GCC system_header +#elif defined(_CCCL_IMPLICIT_SYSTEM_HEADER_CLANG) +# pragma clang system_header +#elif defined(_CCCL_IMPLICIT_SYSTEM_HEADER_MSVC) +# pragma system_header +#endif // no system header + +// clang-format off #include #include #include +// clang-format on #endif // _CUDA_STD_VERSION diff --git a/libcudacxx/include/cuda/stream_ref b/libcudacxx/include/cuda/stream_ref index 5c2ef3c3d8b..9bb23d3e2ef 100644 --- a/libcudacxx/include/cuda/stream_ref +++ b/libcudacxx/include/cuda/stream_ref @@ -38,7 +38,8 @@ private: } // cuda */ -#include // cuda_runtime_api needs to come first +#include +// cuda_runtime_api needs to come first #include @@ -50,9 +51,9 @@ private: # pragma system_header #endif // no system header -#include #include #include +#include _LIBCUDACXX_BEGIN_NAMESPACE_CUDA @@ -127,7 +128,10 @@ public: } /// Returns the wrapped `cudaStream_t` handle. - _CCCL_NODISCARD constexpr value_type get() const noexcept { return __stream; } + _CCCL_NODISCARD constexpr value_type get() const noexcept + { + return __stream; + } /** * \brief Synchronizes the wrapped stream. @@ -150,7 +154,8 @@ public: _CCCL_NODISCARD bool ready() const { const auto __result = ::cudaStreamQuery(get()); - if (__result == ::cudaErrorNotReady) { + if (__result == ::cudaErrorNotReady) + { return false; } switch (__result) diff --git a/libcudacxx/include/nv/detail/__preprocessor b/libcudacxx/include/nv/detail/__preprocessor index af9382bd13a..15fe84eabd7 100644 --- a/libcudacxx/include/nv/detail/__preprocessor +++ b/libcudacxx/include/nv/detail/__preprocessor @@ -9,7 +9,7 @@ //===----------------------------------------------------------------------===// #if defined(__GNUC__) -#pragma GCC system_header +# pragma GCC system_header #endif // For all compilers and dialects this header defines: @@ -24,95 +24,153 @@ #if defined(_NV_TARGET_CPP11) # define _NV_EVAL1(...) __VA_ARGS__ -# define _NV_EVAL(...) _NV_EVAL1(__VA_ARGS__) +# define _NV_EVAL(...) _NV_EVAL1(__VA_ARGS__) #else # define _NV_EVAL1(x) x -# define _NV_EVAL(x) _NV_EVAL1(x) +# define _NV_EVAL(x) _NV_EVAL1(x) #endif // C++11 -#define _NV_CONCAT_EVAL1(l, r) _NV_EVAL(l ## r) -#define _NV_CONCAT_EVAL(l, r) _NV_CONCAT_EVAL1(l, r) +#define _NV_CONCAT_EVAL1(l, r) _NV_EVAL(l##r) +#define _NV_CONCAT_EVAL(l, r) _NV_CONCAT_EVAL1(l, r) #define _NV_IF_0(t, f) f #define _NV_IF_1(t, f) t -#define _NV_IF_BIT(b) _NV_EVAL(_NV_IF_##b) -#define _NV_IF__EVAL(fn, t, f) _NV_EVAL(fn(t, f)) +#define _NV_IF_BIT(b) _NV_EVAL(_NV_IF_##b) +#define _NV_IF__EVAL(fn, t, f) _NV_EVAL(fn(t, f)) #define _NV_IF_EVAL(cond, t, f) _NV_IF__EVAL(_NV_IF_BIT(cond), t, f) #define _NV_IF1(cond, t, f) _NV_IF_EVAL(cond, t, f) -#define _NV_IF(cond, t, f) _NV_IF1(_NV_EVAL(cond), _NV_EVAL(t), _NV_EVAL(f)) +#define _NV_IF(cond, t, f) _NV_IF1(_NV_EVAL(cond), _NV_EVAL(t), _NV_EVAL(f)) #if defined(_NV_TARGET_CPP11) // The below mechanisms were derived from: https://gustedt.wordpress.com/2010/06/08/detect-empty-macro-arguments/ -#define _NV_ARG32(...) _NV_EVAL(_NV_ARG32_0(__VA_ARGS__)) -#define _NV_ARG32_0( \ - _0, _1, _2, _3, _4, _5, _6, _7, _8, _9, _10, _11, _12, _13, _14, _15, \ - _16, _17, _18, _19, _20, _21, _22, _23, _24, _25, _26, _27, _28, _29, _30, _31, ...) _31 - -#define _NV_HAS_COMMA(...) 
_NV_ARG32(__VA_ARGS__, \ - 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, \ - 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0) - -#define _NV_TRIGGER_PARENTHESIS_(...) , - -#define _NV_ISEMPTY(...) \ - _NV_ISEMPTY0( \ - /* test if there is just one argument, eventually an empty \ - one */ \ - _NV_EVAL(_NV_HAS_COMMA(__VA_ARGS__)), \ - /* test if _TRIGGER_PARENTHESIS_ together with the argument \ - adds a comma */ \ - _NV_EVAL(_NV_HAS_COMMA(_NV_TRIGGER_PARENTHESIS_ __VA_ARGS__)), \ - /* test if the argument together with a parenthesis \ - adds a comma */ \ - _NV_EVAL(_NV_HAS_COMMA(__VA_ARGS__ (/*empty*/))), \ - /* test if placing it between _TRIGGER_PARENTHESIS_ and the \ - parenthesis adds a comma */ \ - _NV_EVAL(_NV_HAS_COMMA(_NV_TRIGGER_PARENTHESIS_ __VA_ARGS__ (/*empty*/))) \ - ) - -#define _NV_PASTE5(_0, _1, _2, _3, _4) _0 ## _1 ## _2 ## _3 ## _4 -#define _NV_ISEMPTY0(_0, _1, _2, _3) _NV_HAS_COMMA(_NV_PASTE5(_NV_IS_EMPTY_CASE_, _0, _1, _2, _3)) -#define _NV_IS_EMPTY_CASE_0001 , - - -#define _NV_REMOVE_PAREN(...) _NV_REMOVE_PAREN1(__VA_ARGS__) -#define _NV_REMOVE_PAREN1(...) _NV_STRIP_PAREN(_NV_IF(_NV_TEST_PAREN(__VA_ARGS__), (_NV_STRIP_PAREN(__VA_ARGS__)), (__VA_ARGS__))) - -#define _NV_STRIP_PAREN2(...) __VA_ARGS__ -#define _NV_STRIP_PAREN1(...) _NV_STRIP_PAREN2 __VA_ARGS__ -#define _NV_STRIP_PAREN(...) _NV_STRIP_PAREN1(__VA_ARGS__) - -#define _NV_TEST_PAREN(...) _NV_TEST_PAREN1(__VA_ARGS__) -#define _NV_TEST_PAREN1(...) _NV_TEST_PAREN2(_NV_TEST_PAREN_DUMMY __VA_ARGS__) -#define _NV_TEST_PAREN2(...) _NV_TEST_PAREN3(_NV_CONCAT_EVAL(_, __VA_ARGS__)) -#define _NV_TEST_PAREN3(...) _NV_EVAL(_NV_FIRST_ARG(__VA_ARGS__)) - -#define __NV_PAREN_YES 1 -#define __NV_PAREN_NO 0 - -#define _NV_TEST_PAREN_DUMMY(...) _NV_PAREN_YES -#define __NV_TEST_PAREN_DUMMY __NV_PAREN_NO, - -#define _NV_FIRST_ARG1(x, ...) x -#define _NV_FIRST_ARG(x, ...) _NV_FIRST_ARG1(x) - -#define _NV_REMOVE_FIRST_ARGS1(...) __VA_ARGS__ -#define _NV_REMOVE_FIRST_ARGS(x, ...) _NV_REMOVE_FIRST_ARGS1(__VA_ARGS__) - -#define _NV_NUM_ARGS(...) _NV_NUM_ARGS0(__VA_ARGS__) -#define _NV_NUM_ARGS0(...) _NV_EVAL(_NV_NUM_ARGS1(__VA_ARGS__)) -#define _NV_NUM_ARGS1(...) _NV_IF(_NV_ISEMPTY(__VA_ARGS__), 0, _NV_NUM_ARGS2(__VA_ARGS__)) -#define _NV_NUM_ARGS2(...) _NV_ARG32(__VA_ARGS__, \ - 31,30,29,28,27,26,25,24,23,22,21,20,19,18,17,16, \ - 15,14,13,12,11,10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0) - -#define _NV_DISPATCH_N_IMPL1(name, ...) _NV_EVAL(name(__VA_ARGS__)) -#define _NV_DISPATCH_N_IMPL0(depth, name, ...) _NV_DISPATCH_N_IMPL1(_NV_CONCAT_EVAL(name, depth), __VA_ARGS__) -#define _NV_DISPATCH_N_IMPL(name, ...) _NV_DISPATCH_N_IMPL0(_NV_NUM_ARGS(__VA_ARGS__), name, __VA_ARGS__) -#define _NV_DISPATCH_N_ARY(name, ...) _NV_DISPATCH_N_IMPL(name, __VA_ARGS__) +# define _NV_ARG32(...) _NV_EVAL(_NV_ARG32_0(__VA_ARGS__)) +# define _NV_ARG32_0( \ + _0, \ + _1, \ + _2, \ + _3, \ + _4, \ + _5, \ + _6, \ + _7, \ + _8, \ + _9, \ + _10, \ + _11, \ + _12, \ + _13, \ + _14, \ + _15, \ + _16, \ + _17, \ + _18, \ + _19, \ + _20, \ + _21, \ + _22, \ + _23, \ + _24, \ + _25, \ + _26, \ + _27, \ + _28, \ + _29, \ + _30, \ + _31, \ + ...) \ + _31 + +# define _NV_HAS_COMMA(...) \ + _NV_ARG32(__VA_ARGS__, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0) + +# define _NV_TRIGGER_PARENTHESIS_(...) , + +# define _NV_ISEMPTY(...) 
\ + _NV_ISEMPTY0(/* test if there is just one argument, eventually an empty \ + one */ \ + _NV_EVAL(_NV_HAS_COMMA(__VA_ARGS__)), /* test if _TRIGGER_PARENTHESIS_ together with the argument \ + adds a comma */ \ + _NV_EVAL(_NV_HAS_COMMA(_NV_TRIGGER_PARENTHESIS_ __VA_ARGS__)), /* test if the argument together with \ + a parenthesis adds a comma */ \ + _NV_EVAL(_NV_HAS_COMMA(__VA_ARGS__(/*empty*/))), /* test if placing it between _TRIGGER_PARENTHESIS_ \ + and the parenthesis adds a comma */ \ + _NV_EVAL(_NV_HAS_COMMA(_NV_TRIGGER_PARENTHESIS_ __VA_ARGS__(/*empty*/)))) + +# define _NV_PASTE5(_0, _1, _2, _3, _4) _0##_1##_2##_3##_4 +# define _NV_ISEMPTY0(_0, _1, _2, _3) _NV_HAS_COMMA(_NV_PASTE5(_NV_IS_EMPTY_CASE_, _0, _1, _2, _3)) +# define _NV_IS_EMPTY_CASE_0001 , + +# define _NV_REMOVE_PAREN(...) _NV_REMOVE_PAREN1(__VA_ARGS__) +# define _NV_REMOVE_PAREN1(...) \ + _NV_STRIP_PAREN(_NV_IF(_NV_TEST_PAREN(__VA_ARGS__), (_NV_STRIP_PAREN(__VA_ARGS__)), (__VA_ARGS__))) + +# define _NV_STRIP_PAREN2(...) __VA_ARGS__ +# define _NV_STRIP_PAREN1(...) _NV_STRIP_PAREN2 __VA_ARGS__ +# define _NV_STRIP_PAREN(...) _NV_STRIP_PAREN1(__VA_ARGS__) + +# define _NV_TEST_PAREN(...) _NV_TEST_PAREN1(__VA_ARGS__) +# define _NV_TEST_PAREN1(...) _NV_TEST_PAREN2(_NV_TEST_PAREN_DUMMY __VA_ARGS__) +# define _NV_TEST_PAREN2(...) _NV_TEST_PAREN3(_NV_CONCAT_EVAL(_, __VA_ARGS__)) +# define _NV_TEST_PAREN3(...) _NV_EVAL(_NV_FIRST_ARG(__VA_ARGS__)) + +# define __NV_PAREN_YES 1 +# define __NV_PAREN_NO 0 + +# define _NV_TEST_PAREN_DUMMY(...) _NV_PAREN_YES +# define __NV_TEST_PAREN_DUMMY __NV_PAREN_NO, + +# define _NV_FIRST_ARG1(x, ...) x +# define _NV_FIRST_ARG(x, ...) _NV_FIRST_ARG1(x) + +# define _NV_REMOVE_FIRST_ARGS1(...) __VA_ARGS__ +# define _NV_REMOVE_FIRST_ARGS(x, ...) _NV_REMOVE_FIRST_ARGS1(__VA_ARGS__) + +# define _NV_NUM_ARGS(...) _NV_NUM_ARGS0(__VA_ARGS__) +# define _NV_NUM_ARGS0(...) _NV_EVAL(_NV_NUM_ARGS1(__VA_ARGS__)) +# define _NV_NUM_ARGS1(...) _NV_IF(_NV_ISEMPTY(__VA_ARGS__), 0, _NV_NUM_ARGS2(__VA_ARGS__)) +# define _NV_NUM_ARGS2(...) \ + _NV_ARG32( \ + __VA_ARGS__, \ + 31, \ + 30, \ + 29, \ + 28, \ + 27, \ + 26, \ + 25, \ + 24, \ + 23, \ + 22, \ + 21, \ + 20, \ + 19, \ + 18, \ + 17, \ + 16, \ + 15, \ + 14, \ + 13, \ + 12, \ + 11, \ + 10, \ + 9, \ + 8, \ + 7, \ + 6, \ + 5, \ + 4, \ + 3, \ + 2, \ + 1, \ + 0) + +# define _NV_DISPATCH_N_IMPL1(name, ...) _NV_EVAL(name(__VA_ARGS__)) +# define _NV_DISPATCH_N_IMPL0(depth, name, ...) _NV_DISPATCH_N_IMPL1(_NV_CONCAT_EVAL(name, depth), __VA_ARGS__) +# define _NV_DISPATCH_N_IMPL(name, ...) _NV_DISPATCH_N_IMPL0(_NV_NUM_ARGS(__VA_ARGS__), name, __VA_ARGS__) +# define _NV_DISPATCH_N_ARY(name, ...) 
_NV_DISPATCH_N_IMPL(name, __VA_ARGS__) #endif // C++11 diff --git a/libcudacxx/include/nv/detail/__target_macros b/libcudacxx/include/nv/detail/__target_macros index 6d108021b41..59df8dfd188 100644 --- a/libcudacxx/include/nv/detail/__target_macros +++ b/libcudacxx/include/nv/detail/__target_macros @@ -14,42 +14,42 @@ #include #if defined(__GNUC__) -#pragma GCC system_header +# pragma GCC system_header #endif -# define _NV_TARGET_ARCH_TO_SELECTOR_350 nv::target::sm_35 -# define _NV_TARGET_ARCH_TO_SELECTOR_370 nv::target::sm_37 -# define _NV_TARGET_ARCH_TO_SELECTOR_500 nv::target::sm_50 -# define _NV_TARGET_ARCH_TO_SELECTOR_520 nv::target::sm_52 -# define _NV_TARGET_ARCH_TO_SELECTOR_530 nv::target::sm_53 -# define _NV_TARGET_ARCH_TO_SELECTOR_600 nv::target::sm_60 -# define _NV_TARGET_ARCH_TO_SELECTOR_610 nv::target::sm_61 -# define _NV_TARGET_ARCH_TO_SELECTOR_620 nv::target::sm_62 -# define _NV_TARGET_ARCH_TO_SELECTOR_700 nv::target::sm_70 -# define _NV_TARGET_ARCH_TO_SELECTOR_720 nv::target::sm_72 -# define _NV_TARGET_ARCH_TO_SELECTOR_750 nv::target::sm_75 -# define _NV_TARGET_ARCH_TO_SELECTOR_800 nv::target::sm_80 -# define _NV_TARGET_ARCH_TO_SELECTOR_860 nv::target::sm_86 -# define _NV_TARGET_ARCH_TO_SELECTOR_870 nv::target::sm_87 -# define _NV_TARGET_ARCH_TO_SELECTOR_890 nv::target::sm_89 -# define _NV_TARGET_ARCH_TO_SELECTOR_900 nv::target::sm_90 - -# define _NV_TARGET_ARCH_TO_SM_350 35 -# define _NV_TARGET_ARCH_TO_SM_370 37 -# define _NV_TARGET_ARCH_TO_SM_500 50 -# define _NV_TARGET_ARCH_TO_SM_520 52 -# define _NV_TARGET_ARCH_TO_SM_530 53 -# define _NV_TARGET_ARCH_TO_SM_600 60 -# define _NV_TARGET_ARCH_TO_SM_610 61 -# define _NV_TARGET_ARCH_TO_SM_620 62 -# define _NV_TARGET_ARCH_TO_SM_700 70 -# define _NV_TARGET_ARCH_TO_SM_720 72 -# define _NV_TARGET_ARCH_TO_SM_750 75 -# define _NV_TARGET_ARCH_TO_SM_800 80 -# define _NV_TARGET_ARCH_TO_SM_860 86 -# define _NV_TARGET_ARCH_TO_SM_870 87 -# define _NV_TARGET_ARCH_TO_SM_890 89 -# define _NV_TARGET_ARCH_TO_SM_900 90 +#define _NV_TARGET_ARCH_TO_SELECTOR_350 nv::target::sm_35 +#define _NV_TARGET_ARCH_TO_SELECTOR_370 nv::target::sm_37 +#define _NV_TARGET_ARCH_TO_SELECTOR_500 nv::target::sm_50 +#define _NV_TARGET_ARCH_TO_SELECTOR_520 nv::target::sm_52 +#define _NV_TARGET_ARCH_TO_SELECTOR_530 nv::target::sm_53 +#define _NV_TARGET_ARCH_TO_SELECTOR_600 nv::target::sm_60 +#define _NV_TARGET_ARCH_TO_SELECTOR_610 nv::target::sm_61 +#define _NV_TARGET_ARCH_TO_SELECTOR_620 nv::target::sm_62 +#define _NV_TARGET_ARCH_TO_SELECTOR_700 nv::target::sm_70 +#define _NV_TARGET_ARCH_TO_SELECTOR_720 nv::target::sm_72 +#define _NV_TARGET_ARCH_TO_SELECTOR_750 nv::target::sm_75 +#define _NV_TARGET_ARCH_TO_SELECTOR_800 nv::target::sm_80 +#define _NV_TARGET_ARCH_TO_SELECTOR_860 nv::target::sm_86 +#define _NV_TARGET_ARCH_TO_SELECTOR_870 nv::target::sm_87 +#define _NV_TARGET_ARCH_TO_SELECTOR_890 nv::target::sm_89 +#define _NV_TARGET_ARCH_TO_SELECTOR_900 nv::target::sm_90 + +#define _NV_TARGET_ARCH_TO_SM_350 35 +#define _NV_TARGET_ARCH_TO_SM_370 37 +#define _NV_TARGET_ARCH_TO_SM_500 50 +#define _NV_TARGET_ARCH_TO_SM_520 52 +#define _NV_TARGET_ARCH_TO_SM_530 53 +#define _NV_TARGET_ARCH_TO_SM_600 60 +#define _NV_TARGET_ARCH_TO_SM_610 61 +#define _NV_TARGET_ARCH_TO_SM_620 62 +#define _NV_TARGET_ARCH_TO_SM_700 70 +#define _NV_TARGET_ARCH_TO_SM_720 72 +#define _NV_TARGET_ARCH_TO_SM_750 75 +#define _NV_TARGET_ARCH_TO_SM_800 80 +#define _NV_TARGET_ARCH_TO_SM_860 86 +#define _NV_TARGET_ARCH_TO_SM_870 87 +#define _NV_TARGET_ARCH_TO_SM_890 89 +#define _NV_TARGET_ARCH_TO_SM_900 90 
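The _NV_TARGET_ARCH_TO_SELECTOR_### and _NV_TARGET_ARCH_TO_SM_### tables above are consumed by token pasting against __CUDA_ARCH__ (see the NV_TARGET_MINIMUM_SM_INTEGER definition in the NVCC branch of the next hunk). Below is a minimal, self-contained sketch of that pasting; it uses local copies of the _NV_EVAL/_NV_CONCAT_EVAL helpers and a hypothetical FAKE_CUDA_ARCH value in place of the real predefined macro, so it is an illustration only, not the header itself.

    // Sketch only: local copies of the helpers from nv/detail/__preprocessor,
    // driven by a stand-in FAKE_CUDA_ARCH instead of the real __CUDA_ARCH__.
    #define _NV_EVAL1(...) __VA_ARGS__
    #define _NV_EVAL(...)  _NV_EVAL1(__VA_ARGS__)
    #define _NV_CONCAT_EVAL1(l, r) _NV_EVAL(l##r)
    #define _NV_CONCAT_EVAL(l, r)  _NV_CONCAT_EVAL1(l, r)

    // Two entries of the mapping table, copied from the hunk above.
    #define _NV_TARGET_ARCH_TO_SM_700 70
    #define _NV_TARGET_ARCH_TO_SM_900 90

    #define FAKE_CUDA_ARCH 900 // pretend this is the sm_90 device pass

    // FAKE_CUDA_ARCH expands to 900 before ## pastes the tokens,
    // selecting _NV_TARGET_ARCH_TO_SM_900, which expands to 90.
    static_assert(_NV_CONCAT_EVAL(_NV_TARGET_ARCH_TO_SM_, FAKE_CUDA_ARCH) == 90, "900 maps to SM 90");

    int main() { return 0; }

The two-level _NV_CONCAT_EVAL/_NV_CONCAT_EVAL1 indirection is what forces the architecture value to expand to its numeric form before the paste happens; a single-level ## would glue the literal token FAKE_CUDA_ARCH (or __CUDA_ARCH__) instead.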
// Only enable when compiling for CUDA/stdpar #if defined(_NV_COMPILER_NVCXX) && defined(_NVHPC_CUDA) @@ -71,22 +71,22 @@ # define _NV_TARGET_VAL_SM_89 nv::target::sm_89 # define _NV_TARGET_VAL_SM_90 nv::target::sm_90 -# define _NV_TARGET___NV_IS_HOST nv::target::is_host +# define _NV_TARGET___NV_IS_HOST nv::target::is_host # define _NV_TARGET___NV_IS_DEVICE nv::target::is_device # define _NV_TARGET___NV_ANY_TARGET (nv::target::any_target) -# define _NV_TARGET___NV_NO_TARGET (nv::target::no_target) +# define _NV_TARGET___NV_NO_TARGET (nv::target::no_target) # if defined(NV_TARGET_SM_INTEGER_LIST) # define NV_TARGET_MINIMUM_SM_SELECTOR _NV_FIRST_ARG(NV_TARGET_SM_SELECTOR_LIST) -# define NV_TARGET_MINIMUM_SM_INTEGER _NV_FIRST_ARG(NV_TARGET_SM_INTEGER_LIST) -# define __CUDA_MINIMUM_ARCH__ _NV_CONCAT_EVAL(_NV_FIRST_ARG(NV_TARGET_SM_INTEGER_LIST), 0) +# define NV_TARGET_MINIMUM_SM_INTEGER _NV_FIRST_ARG(NV_TARGET_SM_INTEGER_LIST) +# define __CUDA_MINIMUM_ARCH__ _NV_CONCAT_EVAL(_NV_FIRST_ARG(NV_TARGET_SM_INTEGER_LIST), 0) # endif # define _NV_TARGET_PROVIDES(q) nv::target::provides(q) # define _NV_TARGET_IS_EXACTLY(q) nv::target::is_exactly(q) -#elif defined(_NV_COMPILER_NVCC) || defined (_NV_COMPILER_CLANG_CUDA) +#elif defined(_NV_COMPILER_NVCC) || defined(_NV_COMPILER_CLANG_CUDA) # define _NV_TARGET_VAL_SM_35 350 # define _NV_TARGET_VAL_SM_37 370 @@ -106,10 +106,10 @@ # define _NV_TARGET_VAL_SM_90 900 # if defined(__CUDA_ARCH__) -# define _NV_TARGET_VAL __CUDA_ARCH__ +# define _NV_TARGET_VAL __CUDA_ARCH__ # define NV_TARGET_MINIMUM_SM_SELECTOR _NV_CONCAT_EVAL(_NV_TARGET_ARCH_TO_SELECTOR_, __CUDA_ARCH__) -# define NV_TARGET_MINIMUM_SM_INTEGER _NV_CONCAT_EVAL(_NV_TARGET_ARCH_TO_SM_, __CUDA_ARCH__) -# define __CUDA_MINIMUM_ARCH__ __CUDA_ARCH__ +# define NV_TARGET_MINIMUM_SM_INTEGER _NV_CONCAT_EVAL(_NV_TARGET_ARCH_TO_SM_, __CUDA_ARCH__) +# define __CUDA_MINIMUM_ARCH__ __CUDA_ARCH__ # endif # if defined(__CUDA_ARCH__) @@ -197,22 +197,22 @@ #define _NV_TARGET___NV_IS_EXACTLY_SM_89 (_NV_TARGET_IS_EXACTLY(_NV_TARGET_VAL_SM_89)) #define _NV_TARGET___NV_IS_EXACTLY_SM_90 (_NV_TARGET_IS_EXACTLY(_NV_TARGET_VAL_SM_90)) -#define NV_PROVIDES_SM_35 __NV_PROVIDES_SM_35 -#define NV_PROVIDES_SM_37 __NV_PROVIDES_SM_37 -#define NV_PROVIDES_SM_50 __NV_PROVIDES_SM_50 -#define NV_PROVIDES_SM_52 __NV_PROVIDES_SM_52 -#define NV_PROVIDES_SM_53 __NV_PROVIDES_SM_53 -#define NV_PROVIDES_SM_60 __NV_PROVIDES_SM_60 -#define NV_PROVIDES_SM_61 __NV_PROVIDES_SM_61 -#define NV_PROVIDES_SM_62 __NV_PROVIDES_SM_62 -#define NV_PROVIDES_SM_70 __NV_PROVIDES_SM_70 -#define NV_PROVIDES_SM_72 __NV_PROVIDES_SM_72 -#define NV_PROVIDES_SM_75 __NV_PROVIDES_SM_75 -#define NV_PROVIDES_SM_80 __NV_PROVIDES_SM_80 -#define NV_PROVIDES_SM_86 __NV_PROVIDES_SM_86 -#define NV_PROVIDES_SM_87 __NV_PROVIDES_SM_87 -#define NV_PROVIDES_SM_89 __NV_PROVIDES_SM_89 -#define NV_PROVIDES_SM_90 __NV_PROVIDES_SM_90 +#define NV_PROVIDES_SM_35 __NV_PROVIDES_SM_35 +#define NV_PROVIDES_SM_37 __NV_PROVIDES_SM_37 +#define NV_PROVIDES_SM_50 __NV_PROVIDES_SM_50 +#define NV_PROVIDES_SM_52 __NV_PROVIDES_SM_52 +#define NV_PROVIDES_SM_53 __NV_PROVIDES_SM_53 +#define NV_PROVIDES_SM_60 __NV_PROVIDES_SM_60 +#define NV_PROVIDES_SM_61 __NV_PROVIDES_SM_61 +#define NV_PROVIDES_SM_62 __NV_PROVIDES_SM_62 +#define NV_PROVIDES_SM_70 __NV_PROVIDES_SM_70 +#define NV_PROVIDES_SM_72 __NV_PROVIDES_SM_72 +#define NV_PROVIDES_SM_75 __NV_PROVIDES_SM_75 +#define NV_PROVIDES_SM_80 __NV_PROVIDES_SM_80 +#define NV_PROVIDES_SM_86 __NV_PROVIDES_SM_86 +#define NV_PROVIDES_SM_87 __NV_PROVIDES_SM_87 
+#define NV_PROVIDES_SM_89 __NV_PROVIDES_SM_89 +#define NV_PROVIDES_SM_90 __NV_PROVIDES_SM_90 #define NV_IS_EXACTLY_SM_35 __NV_IS_EXACTLY_SM_35 #define NV_IS_EXACTLY_SM_37 __NV_IS_EXACTLY_SM_37 @@ -235,11 +235,11 @@ // Will re-enable for nvcc below. #define NV_HAS_FEATURE_SM_90a NV_NO_TARGET -#define NV_IS_HOST __NV_IS_HOST -#define NV_IS_DEVICE __NV_IS_DEVICE +#define NV_IS_HOST __NV_IS_HOST +#define NV_IS_DEVICE __NV_IS_DEVICE -#define NV_ANY_TARGET __NV_ANY_TARGET -#define NV_NO_TARGET __NV_NO_TARGET +#define NV_ANY_TARGET __NV_ANY_TARGET +#define NV_NO_TARGET __NV_NO_TARGET // Platform invoke mechanisms #if defined(_NV_COMPILER_NVCXX) && defined(_NVHPC_CUDA) @@ -249,11 +249,9 @@ # define _NV_BLOCK_EXPAND(...) _NV_REMOVE_PAREN(__VA_ARGS__) # define _NV_TARGET_IF(cond, t, ...) \ - (if target _NV_ARCH_COND(cond) { \ - _NV_BLOCK_EXPAND(t) \ - } else { _NV_BLOCK_EXPAND(__VA_ARGS__) }) + (if target _NV_ARCH_COND(cond) { _NV_BLOCK_EXPAND(t) } else {_NV_BLOCK_EXPAND(__VA_ARGS__)}) -#elif defined(_NV_COMPILER_NVCC) || defined (_NV_COMPILER_CLANG_CUDA) +#elif defined(_NV_COMPILER_NVCC) || defined(_NV_COMPILER_CLANG_CUDA) # if (_NV_TARGET___NV_IS_EXACTLY_SM_35) # define _NV_TARGET_BOOL___NV_IS_EXACTLY_SM_35 1 @@ -353,7 +351,7 @@ // Re-enable sm_90a support in nvcc. # undef NV_HAS_FEATURE_SM_90a -# define NV_HAS_FEATURE_SM_90a __NV_HAS_FEATURE_SM_90a +# define NV_HAS_FEATURE_SM_90a __NV_HAS_FEATURE_SM_90a # if (defined(__CUDA_ARCH__) && (__CUDA_ARCH__ >= 900) && defined(__CUDA_ARCH_FEAT_SM90_ALL)) # define _NV_TARGET_BOOL___NV_HAS_FEATURE_SM_90a 1 # else @@ -369,7 +367,7 @@ # endif # define _NV_TARGET_BOOL___NV_ANY_TARGET 1 -# define _NV_TARGET_BOOL___NV_NO_TARGET 0 +# define _NV_TARGET_BOOL___NV_NO_TARGET 0 // NVCC Greater than stuff @@ -470,18 +468,24 @@ # endif # define _NV_ARCH_COND_CAT1(cond) _NV_TARGET_BOOL_##cond -# define _NV_ARCH_COND_CAT(cond) _NV_EVAL(_NV_ARCH_COND_CAT1(cond)) +# define _NV_ARCH_COND_CAT(cond) _NV_EVAL(_NV_ARCH_COND_CAT1(cond)) -# define _NV_TARGET_EMPTY_PARAM ; +# define _NV_TARGET_EMPTY_PARAM ; # if defined(_NV_TARGET_CPP11) -# define _NV_BLOCK_EXPAND(...) { _NV_REMOVE_PAREN(__VA_ARGS__) } -# define _NV_TARGET_IF(cond, t, ...) _NV_IF( _NV_ARCH_COND_CAT(cond), t, __VA_ARGS__) +# define _NV_BLOCK_EXPAND(...) \ + { \ + _NV_REMOVE_PAREN(__VA_ARGS__) \ + } +# define _NV_TARGET_IF(cond, t, ...) 
_NV_IF(_NV_ARCH_COND_CAT(cond), t, __VA_ARGS__) # else // = 201103L) || \ - (defined(_MSC_VER) && _MSVC_LANG >= 201103L)) +#if (!defined(__ibmxl__)) \ + && ((defined(__cplusplus) && __cplusplus >= 201103L) || (defined(_MSC_VER) && _MSVC_LANG >= 201103L)) # define _NV_TARGET_CPP11 #endif - // Hide `if target` support from NVRTC #if defined(_NV_TARGET_CPP11) && !defined(__CUDACC_RTC__) -#if defined(_NV_COMPILER_NVCXX) -# define _NV_BITSET_ATTRIBUTE [[nv::__target_bitset]] -#else -# define _NV_BITSET_ATTRIBUTE -#endif +# if defined(_NV_COMPILER_NVCXX) +# define _NV_BITSET_ATTRIBUTE [[nv::__target_bitset]] +# else +# define _NV_BITSET_ATTRIBUTE +# endif + +namespace nv +{ +namespace target +{ +namespace detail +{ + +typedef unsigned long long base_int_t; + +// No host specialization +constexpr base_int_t all_hosts = 1; + +// NVIDIA GPUs +constexpr base_int_t sm_35_bit = 1 << 1; +constexpr base_int_t sm_37_bit = 1 << 2; +constexpr base_int_t sm_50_bit = 1 << 3; +constexpr base_int_t sm_52_bit = 1 << 4; +constexpr base_int_t sm_53_bit = 1 << 5; +constexpr base_int_t sm_60_bit = 1 << 6; +constexpr base_int_t sm_61_bit = 1 << 7; +constexpr base_int_t sm_62_bit = 1 << 8; +constexpr base_int_t sm_70_bit = 1 << 9; +constexpr base_int_t sm_72_bit = 1 << 10; +constexpr base_int_t sm_75_bit = 1 << 11; +constexpr base_int_t sm_80_bit = 1 << 12; +constexpr base_int_t sm_86_bit = 1 << 13; +constexpr base_int_t sm_87_bit = 1 << 14; +constexpr base_int_t sm_89_bit = 1 << 15; +constexpr base_int_t sm_90_bit = 1 << 16; +constexpr base_int_t all_devices = + sm_35_bit | sm_37_bit | sm_50_bit | sm_52_bit | sm_53_bit | sm_60_bit | sm_61_bit | sm_62_bit | sm_70_bit | sm_72_bit + | sm_75_bit | sm_80_bit | sm_86_bit | sm_87_bit | sm_89_bit | sm_90_bit; + +// Store a set of targets as a set of bits +struct _NV_BITSET_ATTRIBUTE target_description +{ + base_int_t targets; + + constexpr target_description(base_int_t a) + : targets(a) + {} +}; + +// The type of the user-visible names of the NVIDIA GPU targets +enum class sm_selector : base_int_t +{ + sm_35 = 35, + sm_37 = 37, + sm_50 = 50, + sm_52 = 52, + sm_53 = 53, + sm_60 = 60, + sm_61 = 61, + sm_62 = 62, + sm_70 = 70, + sm_72 = 72, + sm_75 = 75, + sm_80 = 80, + sm_86 = 86, + sm_87 = 87, + sm_89 = 89, + sm_90 = 90, +}; + +constexpr base_int_t toint(sm_selector a) +{ + return static_cast(a); +} + +constexpr base_int_t bitexact(sm_selector a) +{ + return toint(a) == 35 ? sm_35_bit + : toint(a) == 37 ? sm_37_bit + : toint(a) == 50 ? sm_50_bit + : toint(a) == 52 ? sm_52_bit + : toint(a) == 53 ? sm_53_bit + : toint(a) == 60 ? sm_60_bit + : toint(a) == 61 ? sm_61_bit + : toint(a) == 62 ? sm_62_bit + : toint(a) == 70 ? sm_70_bit + : toint(a) == 72 ? sm_72_bit + : toint(a) == 75 ? sm_75_bit + : toint(a) == 80 ? sm_80_bit + : toint(a) == 86 ? sm_86_bit + : toint(a) == 87 ? sm_87_bit + : toint(a) == 89 ? sm_89_bit + : toint(a) == 90 ? sm_90_bit + : 0; +} + +constexpr base_int_t bitrounddown(sm_selector a) +{ + return toint(a) >= 90 ? sm_90_bit + : toint(a) >= 89 ? sm_89_bit + : toint(a) >= 87 ? sm_87_bit + : toint(a) >= 86 ? sm_86_bit + : toint(a) >= 80 ? sm_80_bit + : toint(a) >= 75 ? sm_75_bit + : toint(a) >= 72 ? sm_72_bit + : toint(a) >= 70 ? sm_70_bit + : toint(a) >= 62 ? sm_62_bit + : toint(a) >= 61 ? sm_61_bit + : toint(a) >= 60 ? sm_60_bit + : toint(a) >= 53 ? sm_53_bit + : toint(a) >= 52 ? sm_52_bit + : toint(a) >= 50 ? sm_50_bit + : toint(a) >= 37 ? sm_37_bit + : toint(a) >= 35 ? 
sm_35_bit + : 0; +} + +// Public API for NVIDIA GPUs + +constexpr target_description is_exactly(sm_selector a) +{ + return target_description(bitexact(a)); +} + +constexpr target_description provides(sm_selector a) +{ + return target_description(~(bitrounddown(a) - 1) & all_devices); +} + +// Boolean operations on target sets + +constexpr target_description operator&&(target_description a, target_description b) +{ + return target_description(a.targets & b.targets); +} + +constexpr target_description operator||(target_description a, target_description b) +{ + return target_description(a.targets | b.targets); +} -namespace nv { - namespace target { - namespace detail { - - typedef unsigned long long base_int_t; - - // No host specialization - constexpr base_int_t all_hosts = 1; - - // NVIDIA GPUs - constexpr base_int_t sm_35_bit = 1 << 1; - constexpr base_int_t sm_37_bit = 1 << 2; - constexpr base_int_t sm_50_bit = 1 << 3; - constexpr base_int_t sm_52_bit = 1 << 4; - constexpr base_int_t sm_53_bit = 1 << 5; - constexpr base_int_t sm_60_bit = 1 << 6; - constexpr base_int_t sm_61_bit = 1 << 7; - constexpr base_int_t sm_62_bit = 1 << 8; - constexpr base_int_t sm_70_bit = 1 << 9; - constexpr base_int_t sm_72_bit = 1 << 10; - constexpr base_int_t sm_75_bit = 1 << 11; - constexpr base_int_t sm_80_bit = 1 << 12; - constexpr base_int_t sm_86_bit = 1 << 13; - constexpr base_int_t sm_87_bit = 1 << 14; - constexpr base_int_t sm_89_bit = 1 << 15; - constexpr base_int_t sm_90_bit = 1 << 16; - constexpr base_int_t all_devices = - sm_35_bit | sm_37_bit | - sm_50_bit | sm_52_bit | sm_53_bit | - sm_60_bit | sm_61_bit | sm_62_bit | - sm_70_bit | sm_72_bit | sm_75_bit | - sm_80_bit | sm_86_bit | sm_87_bit | - sm_89_bit | sm_90_bit; - - // Store a set of targets as a set of bits - struct _NV_BITSET_ATTRIBUTE target_description { - base_int_t targets; - - constexpr target_description(base_int_t a) : targets(a) { } - }; - - // The type of the user-visible names of the NVIDIA GPU targets - enum class sm_selector : base_int_t { - sm_35 = 35, sm_37 = 37, - sm_50 = 50, sm_52 = 52, sm_53 = 53, - sm_60 = 60, sm_61 = 61, sm_62 = 62, - sm_70 = 70, sm_72 = 72, sm_75 = 75, - sm_80 = 80, sm_86 = 86, sm_87 = 87, - sm_89 = 89, sm_90 = 90, - }; - - constexpr base_int_t toint(sm_selector a) { - return static_cast(a); - } - - constexpr base_int_t bitexact(sm_selector a) { - return toint(a) == 35 ? sm_35_bit : - toint(a) == 37 ? sm_37_bit : - toint(a) == 50 ? sm_50_bit : - toint(a) == 52 ? sm_52_bit : - toint(a) == 53 ? sm_53_bit : - toint(a) == 60 ? sm_60_bit : - toint(a) == 61 ? sm_61_bit : - toint(a) == 62 ? sm_62_bit : - toint(a) == 70 ? sm_70_bit : - toint(a) == 72 ? sm_72_bit : - toint(a) == 75 ? sm_75_bit : - toint(a) == 80 ? sm_80_bit : - toint(a) == 86 ? sm_86_bit : - toint(a) == 87 ? sm_87_bit : - toint(a) == 89 ? sm_89_bit : - toint(a) == 90 ? sm_90_bit : 0; - } - - constexpr base_int_t bitrounddown(sm_selector a) { - return toint(a) >= 90 ? sm_90_bit : - toint(a) >= 89 ? sm_89_bit : - toint(a) >= 87 ? sm_87_bit : - toint(a) >= 86 ? sm_86_bit : - toint(a) >= 80 ? sm_80_bit : - toint(a) >= 75 ? sm_75_bit : - toint(a) >= 72 ? sm_72_bit : - toint(a) >= 70 ? sm_70_bit : - toint(a) >= 62 ? sm_62_bit : - toint(a) >= 61 ? sm_61_bit : - toint(a) >= 60 ? sm_60_bit : - toint(a) >= 53 ? sm_53_bit : - toint(a) >= 52 ? sm_52_bit : - toint(a) >= 50 ? sm_50_bit : - toint(a) >= 37 ? sm_37_bit : - toint(a) >= 35 ? 
sm_35_bit : 0; - } - - // Public API for NVIDIA GPUs - - constexpr target_description is_exactly(sm_selector a) { - return target_description(bitexact(a)); - } - - constexpr target_description provides(sm_selector a) { - return target_description(~(bitrounddown(a) - 1) & all_devices); - } - - // Boolean operations on target sets - - constexpr target_description operator&&(target_description a, - target_description b) { - return target_description(a.targets & b.targets); - } - - constexpr target_description operator||(target_description a, - target_description b) { - return target_description(a.targets | b.targets); - } - - constexpr target_description operator!(target_description a) { - return target_description(~a.targets & (all_devices | all_hosts)); - } - } - - using detail::target_description; - using detail::sm_selector; - - // The predicates for basic host/device selection - constexpr target_description is_host = - target_description(detail::all_hosts); - constexpr target_description is_device = - target_description(detail::all_devices); - constexpr target_description any_target = - target_description(detail::all_hosts | detail::all_devices); - constexpr target_description no_target = - target_description(0); - - // The public names for NVIDIA GPU architectures - constexpr sm_selector sm_35 = sm_selector::sm_35; - constexpr sm_selector sm_37 = sm_selector::sm_37; - constexpr sm_selector sm_50 = sm_selector::sm_50; - constexpr sm_selector sm_52 = sm_selector::sm_52; - constexpr sm_selector sm_53 = sm_selector::sm_53; - constexpr sm_selector sm_60 = sm_selector::sm_60; - constexpr sm_selector sm_61 = sm_selector::sm_61; - constexpr sm_selector sm_62 = sm_selector::sm_62; - constexpr sm_selector sm_70 = sm_selector::sm_70; - constexpr sm_selector sm_72 = sm_selector::sm_72; - constexpr sm_selector sm_75 = sm_selector::sm_75; - constexpr sm_selector sm_80 = sm_selector::sm_80; - constexpr sm_selector sm_86 = sm_selector::sm_86; - constexpr sm_selector sm_87 = sm_selector::sm_87; - constexpr sm_selector sm_89 = sm_selector::sm_89; - constexpr sm_selector sm_90 = sm_selector::sm_90; - - using detail::is_exactly; - using detail::provides; - } +constexpr target_description operator!(target_description a) +{ + return target_description(~a.targets & (all_devices | all_hosts)); } +} // namespace detail + +using detail::sm_selector; +using detail::target_description; + +// The predicates for basic host/device selection +constexpr target_description is_host = target_description(detail::all_hosts); +constexpr target_description is_device = target_description(detail::all_devices); +constexpr target_description any_target = target_description(detail::all_hosts | detail::all_devices); +constexpr target_description no_target = target_description(0); + +// The public names for NVIDIA GPU architectures +constexpr sm_selector sm_35 = sm_selector::sm_35; +constexpr sm_selector sm_37 = sm_selector::sm_37; +constexpr sm_selector sm_50 = sm_selector::sm_50; +constexpr sm_selector sm_52 = sm_selector::sm_52; +constexpr sm_selector sm_53 = sm_selector::sm_53; +constexpr sm_selector sm_60 = sm_selector::sm_60; +constexpr sm_selector sm_61 = sm_selector::sm_61; +constexpr sm_selector sm_62 = sm_selector::sm_62; +constexpr sm_selector sm_70 = sm_selector::sm_70; +constexpr sm_selector sm_72 = sm_selector::sm_72; +constexpr sm_selector sm_75 = sm_selector::sm_75; +constexpr sm_selector sm_80 = sm_selector::sm_80; +constexpr sm_selector sm_86 = sm_selector::sm_86; +constexpr sm_selector sm_87 = sm_selector::sm_87; 
+constexpr sm_selector sm_89 = sm_selector::sm_89; +constexpr sm_selector sm_90 = sm_selector::sm_90; + +using detail::is_exactly; +using detail::provides; +} // namespace target +} // namespace nv #endif // C++11 && !defined(__CUDACC_RTC__)
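To make the bit arithmetic in target_description easier to verify, the following standalone sketch mirrors is_exactly and provides for a small subset of the architectures above: provides keeps every architecture bit at or above the requested one by clearing the bits below the rounded-down bit, while is_exactly keeps only the single matching bit. The names here are local stand-ins for illustration, not the real <nv/target> definitions.

    // Standalone sketch of the nv::target bitset semantics shown in the hunk above;
    // local stand-ins only, not the real nv/detail/__target_macros header.
    using base_int_t = unsigned long long;

    constexpr base_int_t sm_70_bit = 1 << 9;
    constexpr base_int_t sm_75_bit = 1 << 11;
    constexpr base_int_t sm_80_bit = 1 << 12;
    constexpr base_int_t sm_90_bit = 1 << 16;
    // Subset of all_devices, enough for the sketch.
    constexpr base_int_t all_devices = sm_70_bit | sm_75_bit | sm_80_bit | sm_90_bit;

    // One architecture -> exactly one bit (mirrors bitexact/is_exactly).
    constexpr base_int_t is_exactly(base_int_t bit)
    {
      return bit;
    }

    // Everything at or above `bit`: clear the bits strictly below it (mirrors provides).
    constexpr base_int_t provides(base_int_t bit)
    {
      return ~(bit - 1) & all_devices;
    }

    // provides(sm_80) includes sm_90 but excludes sm_75; is_exactly(sm_80) is just sm_80.
    static_assert((provides(sm_80_bit) & sm_90_bit) != 0, "sm_90 satisfies provides(sm_80)");
    static_assert((provides(sm_80_bit) & sm_75_bit) == 0, "sm_75 does not satisfy provides(sm_80)");
    static_assert(is_exactly(sm_80_bit) == sm_80_bit, "exact match keeps only one bit");

    int main() { return 0; }

In the real header the operators &&, || and ! then compose these masks, and host execution is represented by the separate all_hosts bit, which is how any_target (all_hosts | all_devices) and no_target (0) are expressed.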