NVIDIA · bernhardmgruber · Feb 4, 2025 · Jan 30, 2025
@@ -38,60 +38,46 @@
 THRUST_NAMESPACE_BEGIN
 namespace cuda_cub
 {
-
 template <typename DerivedPolicy, typename Pointer1, typename Pointer2>
-inline _CCCL_HOST_DEVICE void
-assign_value(thrust::cuda::execution_policy<DerivedPolicy>& exec, Pointer1 dst, Pointer2 src)
+_CCCL_HOST_DEVICE void assign_value(execution_policy<DerivedPolicy>& exec, Pointer1 dst, Pointer2 src)
 {
-  // XXX war nvbugs/881631
-  struct war_nvbugs_881631
+  // Because of https://docs.nvidia.com/cuda/cuda-c-programming-guide/#cuda-arch point 2., if a call from a __host__
+  // __device__ function leads to the template instantiation of a __global__ function, then this instantiation needs to
+  // happen regardless of whether __CUDA_ARCH__ is defined. Therefore, we make the host path visible outside the
+  // NV_IF_TARGET switch. See also NVBug 881631.
+  struct HostPath
   {
-    _CCCL_HOST inline static void
-    host_path(thrust::cuda::execution_policy<DerivedPolicy>& exec, Pointer1 dst, Pointer2 src)
+    _CCCL_HOST auto operator()(execution_policy<DerivedPolicy>& exec, Pointer1 dst, Pointer2 src)
     {
       cuda_cub::copy(exec, src, src + 1, dst);
     }
-
-    _CCCL_DEVICE inline static void
-    device_path(thrust::cuda::execution_policy<DerivedPolicy>&, Pointer1 dst, Pointer2 src)
-    {
-      *thrust::raw_pointer_cast(dst) = *thrust::raw_pointer_cast(src);
-    }
   };
-
   NV_IF_TARGET(
-    NV_IS_HOST, (war_nvbugs_881631::host_path(exec, dst, src);), (war_nvbugs_881631::device_path(exec, dst, src);));
-
-} // end assign_value()
+    NV_IS_HOST, (HostPath{}(exec, dst, src);), *thrust::raw_pointer_cast(dst) = *thrust::raw_pointer_cast(src););
+}
 
 template <typename System1, typename System2, typename Pointer1, typename Pointer2>
-inline _CCCL_HOST_DEVICE void assign_value(cross_system<System1, System2>& systems, Pointer1 dst, Pointer2 src)
+_CCCL_HOST_DEVICE void assign_value(cross_system<System1, System2>& systems, Pointer1 dst, Pointer2 src)
 {
-  // XXX war nvbugs/881631
-  struct war_nvbugs_881631
+  // Because of https://docs.nvidia.com/cuda/cuda-c-programming-guide/#cuda-arch point 2., if a call from a __host__
+  // __device__ function leads to the template instantiation of a __global__ function, then this instantiation needs to
+  // happen regardless of whether __CUDA_ARCH__ is defined. Therefore, we make the host path visible outside the
+  // NV_IF_TARGET switch. See also NVBug 881631.
+  struct HostPath
   {
-    _CCCL_HOST inline static void host_path(cross_system<System1, System2>& systems, Pointer1 dst, Pointer2 src)
+    _CCCL_HOST auto operator()(cross_system<System1, System2>& systems, Pointer1 dst, Pointer2 src)
     {
-      // rotate the systems so that they are ordered the same as (src, dst)
-      // for the call to thrust::copy
+      // rotate the systems so that they are ordered the same as (src, dst) for the call to thrust::copy
       cross_system<System2, System1> rotated_systems = systems.rotate();
       cuda_cub::copy(rotated_systems, src, src + 1, dst);
     }
-
-    _CCCL_DEVICE inline static void device_path(cross_system<System1, System2>&, Pointer1 dst, Pointer2 src)
-    {
-      // XXX forward the true cuda::execution_policy inside systems here
-      //     instead of materializing a tag
-      thrust::cuda::tag cuda_tag;
-      thrust::cuda_cub::assign_value(cuda_tag, dst, src);
-    }
   };
-
   NV_IF_TARGET(NV_IS_HOST,
-               (war_nvbugs_881631::host_path(systems, dst, src);),
-               (war_nvbugs_881631::device_path(systems, dst, src);));
-} // end assign_value()
-
+               (HostPath{}(systems, dst, src);),
+               (
+                 // XXX forward the true cuda::execution_policy inside systems here instead of materializing a tag
+                 cuda::tag cuda_tag; cuda_cub::assign_value(cuda_tag, dst, src);));
+}
 } // namespace cuda_cub
 THRUST_NAMESPACE_END
 #endif
@@ -38,51 +38,27 @@
 THRUST_NAMESPACE_BEGIN
 namespace cuda_cub
 {
-
-namespace
-{
-
 template <typename DerivedPolicy, typename Pointer>
-inline _CCCL_HOST_DEVICE typename thrust::iterator_value<Pointer>::type
-get_value_msvc2005_war(execution_policy<DerivedPolicy>& exec, Pointer ptr)
+_CCCL_HOST_DEVICE iterator_value_t<Pointer> get_value(execution_policy<DerivedPolicy>& exec, Pointer ptr)
 {
-  using result_type = typename thrust::iterator_value<Pointer>::type;
-
-  // XXX war nvbugs/881631
-  struct war_nvbugs_881631
+  // Because of https://docs.nvidia.com/cuda/cuda-c-programming-guide/#cuda-arch point 2., if a call from a __host__
+  // __device__ function leads to the template instantiation of a __global__ function, then this instantiation needs to
+  // happen regardless of whether __CUDA_ARCH__ is defined. Therefore, we make the host path visible outside the
+  // NV_IF_TARGET switch. See also NVBug 881631.
+  struct HostPath
   {
-    _CCCL_HOST inline static result_type host_path(execution_policy<DerivedPolicy>& exec, Pointer ptr)
+    _CCCL_HOST auto operator()(execution_policy<DerivedPolicy>& exec, Pointer ptr)
     {
-      // when called from host code, implement with assign_value
-      // note that this requires a type with default constructor
-      result_type result;
-
-      thrust::host_system_tag host_tag;
-      cross_system<thrust::host_system_tag, DerivedPolicy> systems(host_tag, exec);
+      // implemented with assign_value, which requires a type with a default constructor
+      iterator_value_t<Pointer> result;
+      host_system_tag host_tag;
+      cross_system<host_system_tag, DerivedPolicy> systems(host_tag, exec);
       assign_value(systems, &result, ptr);
-
       return result;
     }
-
-    _CCCL_DEVICE inline static result_type device_path(execution_policy<DerivedPolicy>&, Pointer ptr)
-    {
-      // when called from device code, just do simple deref
-      return *thrust::raw_pointer_cast(ptr);
-    }
   };
-
-  NV_IF_TARGET(
-    NV_IS_HOST, (return war_nvbugs_881631::host_path(exec, ptr);), (return war_nvbugs_881631::device_path(exec, ptr);))
-} // end get_value_msvc2005_war()
-} // namespace
-
-template <typename DerivedPolicy, typename Pointer>
-inline _CCCL_HOST_DEVICE typename thrust::iterator_value<Pointer>::type
-get_value(execution_policy<DerivedPolicy>& exec, Pointer ptr)
-{
-  return get_value_msvc2005_war(exec, ptr);
-} // end get_value()
-
+  NV_IF_TARGET(NV_IS_DEVICE, return *thrust::raw_pointer_cast(ptr);, (return HostPath{}(exec, ptr);))
+}
 } // namespace cuda_cub
 THRUST_NAMESPACE_END