NVIDIA · miscco · Feb 5, 2025 · Jan 30, 2025 · Jan 30, 2025 · Jan 30, 2025
@@ -1,4 +1,4 @@
 workflows:
  # If any jobs appear here, they will be executed instead of `pull_request' for PRs.
  # This is useful for limiting resource usage when a full matrix is not needed.
  # The branch protection checks will fail when using this override workflow.
@@ -14,10 +14,13 @@
     - {jobs: ['build'], std: 'all', ctk: '11.1', cxx: ['gcc6', 'gcc7', 'gcc8', 'gcc9', 'clang9', 'msvc2017']}
     - {jobs: ['build'], std: 'all', ctk: '11.8', cxx: ['gcc11'], sm: '60;70;80;90'}
     # Current CTK
-    - {jobs: ['build'], std: 'all', cxx: ['gcc7', 'gcc8', 'gcc9', 'gcc10', 'gcc11', 'gcc12']}
-    - {jobs: ['build'], std: 'all', cxx: ['clang9', 'clang10', 'clang11', 'clang12', 'clang13', 'clang14', 'clang15', 'clang16']}
-    - {jobs: ['build'], std: 'all', cxx: ['intel', 'msvc2019']}
-    - {jobs: ['test'],  std: 'all', cxx: ['gcc13', 'clang17', 'msvc2022']}
+    - {jobs: ['build'], std: [11, 17], cxx: ['gcc7', 'gcc8', 'gcc9']}
+    - {jobs: ['build'], std: [14, 20], cxx: ['gcc10', 'gcc11', 'gcc12']}
+    - {jobs: ['build'], std: [11, 14], cxx: ['clang10', 'clang11']}
+    - {jobs: ['build'], std: [17],  cxx: ['clang12', 'clang13', 'clang14', 'clang15']}
+    - {jobs: ['build'], std: 'all', cxx: ['clang9', 'clang16']}
+    - {jobs: ['build'], std: 17, cxx: ['intel', 'msvc2019']}
+    - {jobs: ['test'],  std: 'all', cxx: ['gcc', 'clang', 'msvc2022']}
     # Modded builds:
     - {jobs: ['build'], std: 'all', cxx: ['gcc', 'clang'], cpu: 'arm64'}
     - {jobs: ['build'], std: 'all', cxx: ['gcc'], sm: '90a'}
@@ -219,13 +222,13 @@
 
 # testing -> Runner with GPU is in a nv-gh-runners testing pool
 gpus:
-  v100:     { sm: 70 }                # 32 GB,  40 runners
-  t4:       { sm: 75, testing: true } # 16 GB,   8 runners
-  rtx2080:  { sm: 75, testing: true } #  8 GB,   8 runners
-  rtxa6000: { sm: 86, testing: true } # 48 GB,  12 runners
-  l4:       { sm: 89, testing: true } # 24 GB,  48 runners
-  rtx4090:  { sm: 89, testing: true } # 24 GB,  10 runners
-  h100:     { sm: 90 }                # 80 GB,  16 runners
+  v100:     { sm: 70 } # 32 GB,  40 runners
+  t4:       { sm: 75 } # 16 GB,  10 runners
+  rtx2080:  { sm: 75 } #  8 GB,  12 runners
+  rtxa6000: { sm: 86 } # 48 GB,  12 runners
+  l4:       { sm: 89 } # 24 GB,  48 runners
+  rtx4090:  { sm: 89 } # 24 GB,  10 runners
+  h100:     { sm: 90 } # 80 GB,  16 runners
 
 # Tags are used to define a `matrix job` in the workflow section.
 #

diff --git a/ci/test_pycuda.sh b/ci/test_pycuda.sh
@@ -8,25 +8,28 @@ print_environment_details
 
 fail_if_no_gpu
 
-readonly prefix="${BUILD_DIR}/python/"
-export PYTHONPATH="${prefix}:${PYTHONPATH:-}"
+begin_group "⚙️ Existing site-packages"
+pip freeze
+end_group "⚙️ Existing site-packages"
 
-pushd ../python/cuda_cooperative >/dev/null
+for module in cuda_parallel cuda_cooperative; do
 
-run_command "⚙️  Pip install cuda_cooperative" pip install --force-reinstall --upgrade --target "${prefix}" .[test]
-run_command "🚀  Pytest cuda_cooperative" python -m pytest -v ./tests
+  pushd "../python/${module}" >/dev/null
 
-popd >/dev/null
+  TEMP_VENV_DIR="/tmp/${module}_venv"
+  rm -rf "${TEMP_VENV_DIR}"
+  python -m venv "${TEMP_VENV_DIR}"
+  . "${TEMP_VENV_DIR}/bin/activate"
+  echo 'cuda-cccl @ file:///home/coder/cccl/python/cuda_cccl' > /tmp/cuda-cccl_constraints.txt
+  run_command "⚙️  Pip install ${module}" pip install -c /tmp/cuda-cccl_constraints.txt .[test]
+  begin_group "⚙️ ${module} site-packages"
+  pip freeze
+  end_group "⚙️ ${module} site-packages"
+  run_command "🚀  Pytest ${module}" python -m pytest -v ./tests
+  deactivate
 
-pushd ../python/cuda_parallel >/dev/null
+  popd >/dev/null
 
-# Temporarily install the package twice to populate include directory as part of the first installation
-# and to let manifest discover these includes during the second installation. Do not forget to remove the
-# second installation after https://github.com/NVIDIA/cccl/issues/2281 is addressed.
-run_command "⚙️  Pip install cuda_parallel once" pip install --force-reinstall --upgrade --target "${prefix}" .[test]
-run_command "⚙️  Pip install cuda_parallel twice" pip install --force-reinstall --upgrade --target "${prefix}" .[test]
-run_command "🚀  Pytest cuda_parallel" python -m pytest -v ./tests
-
-popd >/dev/null
+done
 
 print_time_summary
@@ -103,6 +103,7 @@ update_file "$CUDAX_CMAKE_VERSION_FILE" "set(cudax_VERSION_MAJOR \([0-9]\+\))" "
 update_file "$CUDAX_CMAKE_VERSION_FILE" "set(cudax_VERSION_MINOR \([0-9]\+\))" "set(cudax_VERSION_MINOR $minor)"
 update_file "$CUDAX_CMAKE_VERSION_FILE" "set(cudax_VERSION_PATCH \([0-9]\+\))" "set(cudax_VERSION_PATCH $patch)"
 
+update_file "$CUDA_CCCL_VERSION_FILE" "^__version__ = \"\([0-9.]\+\)\"" "__version__ = \"$major.$minor.$patch\""
 update_file "$CUDA_COOPERATIVE_VERSION_FILE" "^__version__ = \"\([0-9.]\+\)\"" "__version__ = \"$pymajor.$pyminor.$major.$minor.$patch\""
 update_file "$CUDA_PARALLEL_VERSION_FILE" "^__version__ = \"\([0-9.]\+\)\"" "__version__ = \"$pymajor.$pyminor.$major.$minor.$patch\""
 

@@ -25,7 +25,7 @@ endif()
 
 option(cudax_ENABLE_HEADER_TESTING "Test that CUDA Experimental's public headers compile." ON)
 option(cudax_ENABLE_TESTING "Build CUDA Experimental's tests." ON)
-option(cudax_ENABLE_SAMPLES "Build CUDA Experimental's samples." ON)
+option(cudax_ENABLE_SAMPLES "Build CUDA Experimental's samples." OFF)
 
 include(cmake/cudaxBuildCompilerTargets.cmake)
 include(cmake/cudaxBuildTargetList.cmake)

@@ -51,14 +51,6 @@ class bad_expected_access;
 template <>
 class bad_expected_access<void> : public ::std::exception
 {
-protected:
-  _CCCL_HIDE_FROM_ABI bad_expected_access() noexcept                             = default;
-  _CCCL_HIDE_FROM_ABI bad_expected_access(const bad_expected_access&)            = default;
-  _CCCL_HIDE_FROM_ABI bad_expected_access(bad_expected_access&&)                 = default;
-  _CCCL_HIDE_FROM_ABI bad_expected_access& operator=(const bad_expected_access&) = default;
-  _CCCL_HIDE_FROM_ABI bad_expected_access& operator=(bad_expected_access&&)      = default;
-  ~bad_expected_access() noexcept override                                       = default;
-
 public:
   // The way this has been designed (by using a class template below) means that we'll already
   // have a profusion of these vtables in TUs, and the dynamic linker will already have a bunch
@@ -74,10 +66,21 @@ template <class _Err>
 class bad_expected_access : public bad_expected_access<void>
 {
 public:
-  explicit bad_expected_access(_Err __e)
+#      if defined(_CCCL_CUDA_COMPILER_CLANG) // Clang needs this or it breaks with device only types
+  _CCCL_HOST_DEVICE
+#      endif // _CCCL_CUDA_COMPILER_CLANG
+  _CCCL_HIDE_FROM_ABI explicit bad_expected_access(_Err __e)
       : __unex_(_CUDA_VSTD::move(__e))
   {}
 
+#      if defined(_CCCL_CUDA_COMPILER_CLANG) // Clang needs this or it breaks with device only types
+  _CCCL_HOST_DEVICE
+#      endif // _CCCL_CUDA_COMPILER_CLANG
+  _CCCL_HIDE_FROM_ABI ~bad_expected_access() noexcept
+  {
+    __unex_.~_Err();
+  }
+
   _LIBCUDACXX_HIDE_FROM_ABI _Err& error() & noexcept
   {
     return __unex_;

@@ -1077,6 +1077,7 @@ class expected : private __expected_move_assign<_Tp, _Err>
   }
 
   // [expected.object.eq], equality operators
+  _CCCL_EXEC_CHECK_DISABLE
   friend _LIBCUDACXX_HIDE_FROM_ABI constexpr bool operator==(const expected& __x, const expected& __y)
   {
     if (__x.__has_val_ != __y.has_value())
@@ -1097,12 +1098,14 @@ class expected : private __expected_move_assign<_Tp, _Err>
   }
 
 #  if _CCCL_STD_VER < 2020
+  _CCCL_EXEC_CHECK_DISABLE
   friend _LIBCUDACXX_HIDE_FROM_ABI constexpr bool operator!=(const expected& __x, const expected& __y)
   {
     return !(__x == __y);
   }
 #  endif // _CCCL_STD_VER < 2020
 
+  _CCCL_EXEC_CHECK_DISABLE
   _LIBCUDACXX_TEMPLATE(class _T2, class _E2)
   _LIBCUDACXX_REQUIRES((!_CCCL_TRAIT(is_void, _T2)))
   friend _LIBCUDACXX_HIDE_FROM_ABI constexpr bool operator==(const expected& __x, const expected<_T2, _E2>& __y)
@@ -1125,6 +1128,7 @@ class expected : private __expected_move_assign<_Tp, _Err>
   }
 
 #  if _CCCL_STD_VER < 2020
+  _CCCL_EXEC_CHECK_DISABLE
   _LIBCUDACXX_TEMPLATE(class _T2, class _E2)
   _LIBCUDACXX_REQUIRES((!_CCCL_TRAIT(is_void, _T2)))
   friend _LIBCUDACXX_HIDE_FROM_ABI constexpr bool operator!=(const expected& __x, const expected<_T2, _E2>& __y)
@@ -1133,25 +1137,29 @@ class expected : private __expected_move_assign<_Tp, _Err>
   }
 #  endif // _CCCL_STD_VER < 2020
 
+  _CCCL_EXEC_CHECK_DISABLE
   _LIBCUDACXX_TEMPLATE(class _T2)
   _LIBCUDACXX_REQUIRES((!__expected::__is_expected_nonvoid<_T2>) )
   friend _LIBCUDACXX_HIDE_FROM_ABI constexpr bool operator==(const expected& __x, const _T2& __v)
   {
     return __x.__has_val_ && static_cast<bool>(__x.__union_.__val_ == __v);
   }
 #  if _CCCL_STD_VER < 2020
+  _CCCL_EXEC_CHECK_DISABLE
   _LIBCUDACXX_TEMPLATE(class _T2)
   _LIBCUDACXX_REQUIRES((!__expected::__is_expected_nonvoid<_T2>) )
   friend _LIBCUDACXX_HIDE_FROM_ABI constexpr bool operator==(const _T2& __v, const expected& __x)
   {
     return __x.__has_val_ && static_cast<bool>(__x.__union_.__val_ == __v);
   }
+  _CCCL_EXEC_CHECK_DISABLE
   _LIBCUDACXX_TEMPLATE(class _T2)
   _LIBCUDACXX_REQUIRES((!__expected::__is_expected_nonvoid<_T2>) )
   friend _LIBCUDACXX_HIDE_FROM_ABI constexpr bool operator!=(const expected& __x, const _T2& __v)
   {
     return !__x.__has_val_ || static_cast<bool>(__x.__union_.__val_ != __v);
   }
+  _CCCL_EXEC_CHECK_DISABLE
   _LIBCUDACXX_TEMPLATE(class _T2)
   _LIBCUDACXX_REQUIRES((!__expected::__is_expected_nonvoid<_T2>) )
   friend _LIBCUDACXX_HIDE_FROM_ABI constexpr bool operator!=(const _T2& __v, const expected& __x)
@@ -1160,22 +1168,26 @@ class expected : private __expected_move_assign<_Tp, _Err>
   }
 #  endif // _CCCL_STD_VER < 2020
 
+  _CCCL_EXEC_CHECK_DISABLE
   template <class _E2>
   friend _LIBCUDACXX_HIDE_FROM_ABI constexpr bool operator==(const expected& __x, const unexpected<_E2>& __e)
   {
     return !__x.__has_val_ && static_cast<bool>(__x.__union_.__unex_ == __e.error());
   }
 #  if _CCCL_STD_VER < 2020
+  _CCCL_EXEC_CHECK_DISABLE
   template <class _E2>
   friend _LIBCUDACXX_HIDE_FROM_ABI constexpr bool operator==(const unexpected<_E2>& __e, const expected& __x)
   {
     return !__x.__has_val_ && static_cast<bool>(__x.__union_.__unex_ == __e.error());
   }
+  _CCCL_EXEC_CHECK_DISABLE
   template <class _E2>
   friend _LIBCUDACXX_HIDE_FROM_ABI constexpr bool operator!=(const expected& __x, const unexpected<_E2>& __e)
   {
     return __x.__has_val_ || static_cast<bool>(__x.__union_.__unex_ != __e.error());
   }
+  _CCCL_EXEC_CHECK_DISABLE
   template <class _E2>
   friend _LIBCUDACXX_HIDE_FROM_ABI constexpr bool operator!=(const unexpected<_E2>& __e, const expected& __x)
   {
@@ -1916,6 +1928,7 @@ class expected<void, _Err> : private __expected_move_assign<void, _Err>
   }
 
   // [expected.void.eq], equality operators
+  _CCCL_EXEC_CHECK_DISABLE
   friend _LIBCUDACXX_HIDE_FROM_ABI constexpr bool operator==(const expected& __x, const expected& __y) noexcept
   {
     if (__x.__has_val_ != __y.has_value())
@@ -1928,12 +1941,14 @@ class expected<void, _Err> : private __expected_move_assign<void, _Err>
     }
   }
 #  if _CCCL_STD_VER < 2020
+  _CCCL_EXEC_CHECK_DISABLE
   friend _LIBCUDACXX_HIDE_FROM_ABI constexpr bool operator!=(const expected& __x, const expected& __y) noexcept
   {
     return !(__x == __y);
   }
 #  endif
 
+  _CCCL_EXEC_CHECK_DISABLE
   template <class _E2>
   friend _LIBCUDACXX_HIDE_FROM_ABI constexpr bool
   operator==(const expected& __x, const expected<void, _E2>& __y) noexcept
@@ -1948,6 +1963,7 @@ class expected<void, _Err> : private __expected_move_assign<void, _Err>
     }
   }
 #  if _CCCL_STD_VER < 2020
+  _CCCL_EXEC_CHECK_DISABLE
   template <class _E2>
   friend _LIBCUDACXX_HIDE_FROM_ABI constexpr bool
   operator!=(const expected& __x, const expected<void, _E2>& __y) noexcept
@@ -1956,22 +1972,26 @@ class expected<void, _Err> : private __expected_move_assign<void, _Err>
   }
 #  endif
 
+  _CCCL_EXEC_CHECK_DISABLE
   template <class _E2>
   friend _LIBCUDACXX_HIDE_FROM_ABI constexpr bool operator==(const expected& __x, const unexpected<_E2>& __y) noexcept
   {
     return !__x.__has_val_ && static_cast<bool>(__x.__union_.__unex_ == __y.error());
   }
 #  if _CCCL_STD_VER < 2020
+  _CCCL_EXEC_CHECK_DISABLE
   template <class _E2>
   friend _LIBCUDACXX_HIDE_FROM_ABI constexpr bool operator==(const unexpected<_E2>& __y, const expected& __x) noexcept
   {
     return !__x.__has_val_ && static_cast<bool>(__x.__union_.__unex_ == __y.error());
   }
+  _CCCL_EXEC_CHECK_DISABLE
   template <class _E2>
   _LIBCUDACXX_HIDE_FROM_ABI friend constexpr bool operator!=(const expected& __x, const unexpected<_E2>& __y) noexcept
   {
     return __x.__has_val_ || static_cast<bool>(__x.__union_.__unex_ != __y.error());
   }
+  _CCCL_EXEC_CHECK_DISABLE
   template <class _E2>
   _LIBCUDACXX_HIDE_FROM_ABI friend constexpr bool operator!=(const unexpected<_E2>& __y, const expected& __x) noexcept
   {