diff --git a/thrust/thrust/system/cuda/detail/assign_value.h b/thrust/thrust/system/cuda/detail/assign_value.h index 395dce9341f..bc5f02b702c 100644 --- a/thrust/thrust/system/cuda/detail/assign_value.h +++ b/thrust/thrust/system/cuda/detail/assign_value.h @@ -38,60 +38,37 @@ THRUST_NAMESPACE_BEGIN namespace cuda_cub { - template -inline _CCCL_HOST_DEVICE void -assign_value(thrust::cuda::execution_policy& exec, Pointer1 dst, Pointer2 src) +_CCCL_HOST_DEVICE void assign_value(execution_policy& exec, Pointer1 dst, Pointer2 src) { - // XXX war nvbugs/881631 - struct war_nvbugs_881631 - { - _CCCL_HOST inline static void - host_path(thrust::cuda::execution_policy& exec, Pointer1 dst, Pointer2 src) - { - cuda_cub::copy(exec, src, src + 1, dst); - } - - _CCCL_DEVICE inline static void - device_path(thrust::cuda::execution_policy&, Pointer1 dst, Pointer2 src) - { - *thrust::raw_pointer_cast(dst) = *thrust::raw_pointer_cast(src); - } + // Because of https://docs.nvidia.com/cuda/cuda-c-programming-guide/#cuda-arch point 2., if a call from a __host__ + // __device__ function leads to the template instantiation of a __global__ function, then this instantiation needs to + // happen regardless of whether __CUDA_ARCH__ is defined. Therefore, we make the host path visible outside the + // NV_IF_TARGET switch. See also NVBug 881631. + auto host = [&]() { + cuda_cub::copy(exec, src, src + 1, dst); }; - - NV_IF_TARGET( - NV_IS_HOST, (war_nvbugs_881631::host_path(exec, dst, src);), (war_nvbugs_881631::device_path(exec, dst, src);)); - -} // end assign_value() + NV_IF_TARGET(NV_IS_HOST, host();, *thrust::raw_pointer_cast(dst) = *thrust::raw_pointer_cast(src);); +} template -inline _CCCL_HOST_DEVICE void assign_value(cross_system& systems, Pointer1 dst, Pointer2 src) +_CCCL_HOST_DEVICE void assign_value(cross_system& systems, Pointer1 dst, Pointer2 src) { - // XXX war nvbugs/881631 - struct war_nvbugs_881631 - { - _CCCL_HOST inline static void host_path(cross_system& systems, Pointer1 dst, Pointer2 src) - { - // rotate the systems so that they are ordered the same as (src, dst) - // for the call to thrust::copy - cross_system rotated_systems = systems.rotate(); - cuda_cub::copy(rotated_systems, src, src + 1, dst); - } - - _CCCL_DEVICE inline static void device_path(cross_system&, Pointer1 dst, Pointer2 src) - { - // XXX forward the true cuda::execution_policy inside systems here - // instead of materializing a tag - thrust::cuda::tag cuda_tag; - thrust::cuda_cub::assign_value(cuda_tag, dst, src); - } + // Because of https://docs.nvidia.com/cuda/cuda-c-programming-guide/#cuda-arch point 2., if a call from a __host__ + // __device__ function leads to the template instantiation of a __global__ function, then this instantiation needs to + // happen regardless of whether __CUDA_ARCH__ is defined. Therefore, we make the host path visible outside the + // NV_IF_TARGET switch. See also NVBug 881631. + auto host = [&]() { + // rotate the systems so that they are ordered the same as (src, dst) for the call to thrust::copy + cross_system rotated_systems = systems.rotate(); + cuda_cub::copy(rotated_systems, src, src + 1, dst); }; - - NV_IF_TARGET(NV_IS_HOST, - (war_nvbugs_881631::host_path(systems, dst, src);), - (war_nvbugs_881631::device_path(systems, dst, src);)); -} // end assign_value() - + NV_IF_TARGET(NV_IS_HOST, host(); + , + ( + // XXX forward the true cuda::execution_policy inside systems here instead of materializing a tag + cuda::tag cuda_tag; cuda_cub::assign_value(cuda_tag, dst, src);)); +} } // namespace cuda_cub THRUST_NAMESPACE_END #endif diff --git a/thrust/thrust/system/cuda/detail/get_value.h b/thrust/thrust/system/cuda/detail/get_value.h index 5d1bb7e4cb6..fdfa1eb2084 100644 --- a/thrust/thrust/system/cuda/detail/get_value.h +++ b/thrust/thrust/system/cuda/detail/get_value.h @@ -38,51 +38,23 @@ THRUST_NAMESPACE_BEGIN namespace cuda_cub { - -namespace -{ - template -inline _CCCL_HOST_DEVICE typename thrust::iterator_value::type -get_value_msvc2005_war(execution_policy& exec, Pointer ptr) +_CCCL_HOST_DEVICE iterator_value_t get_value(execution_policy& exec, Pointer ptr) { - using result_type = typename thrust::iterator_value::type; - - // XXX war nvbugs/881631 - struct war_nvbugs_881631 - { - _CCCL_HOST inline static result_type host_path(execution_policy& exec, Pointer ptr) - { - // when called from host code, implement with assign_value - // note that this requires a type with default constructor - result_type result; - - thrust::host_system_tag host_tag; - cross_system systems(host_tag, exec); - assign_value(systems, &result, ptr); - - return result; - } - - _CCCL_DEVICE inline static result_type device_path(execution_policy&, Pointer ptr) - { - // when called from device code, just do simple deref - return *thrust::raw_pointer_cast(ptr); - } + // Because of https://docs.nvidia.com/cuda/cuda-c-programming-guide/#cuda-arch point 2., if a call from a __host__ + // __device__ function leads to the template instantiation of a __global__ function, then this instantiation needs to + // happen regardless of whether __CUDA_ARCH__ is defined. Therefore, we make the host path visible outside the + // NV_IF_TARGET switch. See also NVBug 881631. + auto host = [&]() { + // implemented with assign_value, which requires a type with a default constructor + iterator_value_t result; + host_system_tag host_tag; + cross_system systems(host_tag, exec); + assign_value(systems, &result, ptr); + return result; }; - - NV_IF_TARGET( - NV_IS_HOST, (return war_nvbugs_881631::host_path(exec, ptr);), (return war_nvbugs_881631::device_path(exec, ptr);)) -} // end get_value_msvc2005_war() -} // namespace - -template -inline _CCCL_HOST_DEVICE typename thrust::iterator_value::type -get_value(execution_policy& exec, Pointer ptr) -{ - return get_value_msvc2005_war(exec, ptr); -} // end get_value() - + NV_IF_TARGET(NV_IS_DEVICE, return *thrust::raw_pointer_cast(ptr);, return host();) +} } // namespace cuda_cub THRUST_NAMESPACE_END