Add new patch to hide more CCCL APIs (rapidsai#580)

CCCL has more symbols that needed patching. This PR adapts NVIDIA/cccl#1395 to work on 2.2.0 (without the macros that have been defined for this in the current trunk of CCCL) by manually adding the visibility attribute. Authors: - Vyas Ramasubramani (https://github.com/vyasr) Approvers: - Robert Maynard (https://github.com/robertmaynard) URL: rapidsai#580
Jacobfaib · Apr 11, 2024 · 9ee0f73 · 9ee0f73
1 parent 22388bf
commit 9ee0f73
Show file tree

Hide file tree

Showing 2 changed files with 191 additions and 0 deletions.
diff --git a/rapids-cmake/cpm/patches/cccl/kernel_pointer_hiding.diff b/rapids-cmake/cpm/patches/cccl/kernel_pointer_hiding.diff
@@ -0,0 +1,186 @@
+diff --git a/cub/cub/device/dispatch/dispatch_histogram.cuh b/cub/cub/device/dispatch/dispatch_histogram.cuh
+index c059c21f9..f654b09c9 100644
+--- a/cub/cub/device/dispatch/dispatch_histogram.cuh
++++ b/cub/cub/device/dispatch/dispatch_histogram.cuh
+@@ -300,7 +300,7 @@ struct dispatch_histogram
+   template <typename ActivePolicyT,
+             typename DeviceHistogramInitKernelT,
+             typename DeviceHistogramSweepKernelT>
+-  CUB_RUNTIME_FUNCTION __forceinline__ cudaError_t
++  CUB_RUNTIME_FUNCTION __attribute__ ((__visibility__("hidden"))) __forceinline__ cudaError_t
+   Invoke(DeviceHistogramInitKernelT histogram_init_kernel,
+          DeviceHistogramSweepKernelT histogram_sweep_kernel)
+   {
+diff --git a/cub/cub/device/dispatch/dispatch_radix_sort.cuh b/cub/cub/device/dispatch/dispatch_radix_sort.cuh
+index e1ea61e23..6a84292a9 100644
+--- a/cub/cub/device/dispatch/dispatch_radix_sort.cuh
++++ b/cub/cub/device/dispatch/dispatch_radix_sort.cuh
+@@ -1268,7 +1268,7 @@ struct DispatchRadixSort : SelectedPolicy
+     template <
+         typename                ActivePolicyT,          ///< Umbrella policy active for the target device
+         typename                SingleTileKernelT>      ///< Function type of cub::DeviceRadixSortSingleTileKernel
+-    CUB_RUNTIME_FUNCTION __forceinline__
++    CUB_RUNTIME_FUNCTION __attribute__ ((__visibility__("hidden"))) __forceinline__
+     cudaError_t InvokeSingleTile(
+         SingleTileKernelT       single_tile_kernel)     ///< [in] Kernel function pointer to parameterization of cub::DeviceRadixSortSingleTileKernel
+     {
+@@ -1480,7 +1480,7 @@ struct DispatchRadixSort : SelectedPolicy
+             typename UpsweepPolicyT,
+             typename ScanPolicyT,
+             typename DownsweepPolicyT>
+-        CUB_RUNTIME_FUNCTION __forceinline__
++        CUB_RUNTIME_FUNCTION __attribute__ ((__visibility__("hidden"))) __forceinline__
+         cudaError_t InitPassConfig(
+             UpsweepKernelT      upsweep_kernel,
+             ScanKernelT         scan_kernel,
+@@ -1732,7 +1732,7 @@ struct DispatchRadixSort : SelectedPolicy
+         typename            UpsweepKernelT,         ///< Function type of cub::DeviceRadixSortUpsweepKernel
+         typename            ScanKernelT,            ///< Function type of cub::SpineScanKernel
+         typename            DownsweepKernelT>       ///< Function type of cub::DeviceRadixSortDownsweepKernel
+-    CUB_RUNTIME_FUNCTION __forceinline__
++    CUB_RUNTIME_FUNCTION __attribute__ ((__visibility__("hidden"))) __forceinline__
+     cudaError_t InvokePasses(
+         UpsweepKernelT      upsweep_kernel,         ///< [in] Kernel function pointer to parameterization of cub::DeviceRadixSortUpsweepKernel
+         UpsweepKernelT      alt_upsweep_kernel,     ///< [in] Alternate kernel function pointer to parameterization of cub::DeviceRadixSortUpsweepKernel
+@@ -2252,7 +2252,7 @@ struct DispatchSegmentedRadixSort : SelectedPolicy
+
+         /// Initialize pass configuration
+         template <typename SegmentedPolicyT>
+-        CUB_RUNTIME_FUNCTION __forceinline__
++        CUB_RUNTIME_FUNCTION __attribute__ ((__visibility__("hidden"))) __forceinline__
+         cudaError_t InitPassConfig(SegmentedKernelT segmented_kernel)
+         {
+             this->segmented_kernel  = segmented_kernel;
+@@ -2263,12 +2263,11 @@ struct DispatchSegmentedRadixSort : SelectedPolicy
+         }
+     };
+
+-
+     /// Invocation (run multiple digit passes)
+     template <
+         typename                ActivePolicyT,          ///< Umbrella policy active for the target device
+         typename                SegmentedKernelT>       ///< Function type of cub::DeviceSegmentedRadixSortKernel
+-    CUB_RUNTIME_FUNCTION __forceinline__
++    CUB_RUNTIME_FUNCTION __attribute__ ((__visibility__("hidden"))) __forceinline__
+     cudaError_t InvokePasses(
+         SegmentedKernelT     segmented_kernel,          ///< [in] Kernel function pointer to parameterization of cub::DeviceSegmentedRadixSortKernel
+         SegmentedKernelT     alt_segmented_kernel)      ///< [in] Alternate kernel function pointer to parameterization of cub::DeviceSegmentedRadixSortKernel
+diff --git a/cub/cub/device/dispatch/dispatch_reduce.cuh b/cub/cub/device/dispatch/dispatch_reduce.cuh
+index d1fff4141..768c15012 100644
+--- a/cub/cub/device/dispatch/dispatch_reduce.cuh
++++ b/cub/cub/device/dispatch/dispatch_reduce.cuh
+@@ -646,7 +646,7 @@ struct DispatchReduce : SelectedPolicy
+    *   cub::DeviceReduceSingleTileKernel
+    */
+   template <typename ActivePolicyT, typename SingleTileKernelT>
+-  CUB_RUNTIME_FUNCTION __forceinline__ cudaError_t
++  CUB_RUNTIME_FUNCTION __attribute__ ((__visibility__("hidden"))) __forceinline__ cudaError_t
+   InvokeSingleTile(SingleTileKernelT single_tile_kernel)
+   {
+     cudaError error = cudaSuccess;
+@@ -716,7 +716,7 @@ struct DispatchReduce : SelectedPolicy
+   template <typename ActivePolicyT,
+             typename ReduceKernelT,
+             typename SingleTileKernelT>
+-  CUB_RUNTIME_FUNCTION __forceinline__ cudaError_t
++  CUB_RUNTIME_FUNCTION __attribute__ ((__visibility__("hidden"))) __forceinline__ cudaError_t
+   InvokePasses(ReduceKernelT reduce_kernel,
+                SingleTileKernelT single_tile_kernel)
+   {
+@@ -1181,7 +1181,7 @@ struct DispatchSegmentedReduce : SelectedPolicy
+    *   cub::DeviceSegmentedReduceKernel
+    */
+   template <typename ActivePolicyT, typename DeviceSegmentedReduceKernelT>
+-  CUB_RUNTIME_FUNCTION __forceinline__ cudaError_t
++  CUB_RUNTIME_FUNCTION __attribute__ ((__visibility__("hidden"))) __forceinline__ cudaError_t
+   InvokePasses(DeviceSegmentedReduceKernelT segmented_reduce_kernel)
+   {
+     cudaError error = cudaSuccess;
+diff --git a/cub/cub/device/dispatch/dispatch_reduce_by_key.cuh b/cub/cub/device/dispatch/dispatch_reduce_by_key.cuh
+index 146a54547..db675fef7 100644
+--- a/cub/cub/device/dispatch/dispatch_reduce_by_key.cuh
++++ b/cub/cub/device/dispatch/dispatch_reduce_by_key.cuh
+@@ -284,7 +284,7 @@ struct DispatchReduceByKey
+   //---------------------------------------------------------------------
+
+   template <typename ActivePolicyT, typename ScanInitKernelT, typename ReduceByKeyKernelT>
+-  CUB_RUNTIME_FUNCTION __forceinline__ cudaError_t Invoke(ScanInitKernelT init_kernel,
++  CUB_RUNTIME_FUNCTION __attribute__ ((__visibility__("hidden"))) __forceinline__ cudaError_t Invoke(ScanInitKernelT init_kernel,
+                                                           ReduceByKeyKernelT reduce_by_key_kernel)
+   {
+     using AgentReduceByKeyPolicyT = typename ActivePolicyT::ReduceByKeyPolicyT;
+diff --git a/cub/cub/device/dispatch/dispatch_segmented_sort.cuh b/cub/cub/device/dispatch/dispatch_segmented_sort.cuh
+index b13f97b59..8694796ae 100644
+--- a/cub/cub/device/dispatch/dispatch_segmented_sort.cuh
++++ b/cub/cub/device/dispatch/dispatch_segmented_sort.cuh
+@@ -547,7 +547,7 @@ template <typename LargeSegmentPolicyT,
+           typename ValueT,
+           typename BeginOffsetIteratorT,
+           typename EndOffsetIteratorT>
+-CUB_RUNTIME_FUNCTION cudaError_t
++CUB_RUNTIME_FUNCTION __attribute__ ((__visibility__("hidden")))  cudaError_t
+ DeviceSegmentedSortContinuation(
+     LargeKernelT large_kernel,
+     SmallKernelT small_kernel,
+@@ -1590,7 +1590,7 @@ private:
+             typename SmallAndMediumPolicyT,
+             typename LargeKernelT,
+             typename SmallKernelT>
+-  CUB_RUNTIME_FUNCTION __forceinline__ cudaError_t
++  CUB_RUNTIME_FUNCTION __attribute__ ((__visibility__("hidden"))) __forceinline__ cudaError_t
+   SortWithPartitioning(
+     LargeKernelT large_kernel,
+     SmallKernelT small_kernel,
+@@ -1725,7 +1725,7 @@ private:
+
+   template <typename LargeSegmentPolicyT,
+             typename FallbackKernelT>
+-  CUB_RUNTIME_FUNCTION __forceinline__ cudaError_t SortWithoutPartitioning(
++  CUB_RUNTIME_FUNCTION __attribute__ ((__visibility__("hidden"))) __forceinline__ cudaError_t SortWithoutPartitioning(
+     FallbackKernelT fallback_kernel,
+     cub::detail::device_double_buffer<KeyT> &d_keys_double_buffer,
+     cub::detail::device_double_buffer<ValueT> &d_values_double_buffer)
+diff --git a/cub/cub/device/dispatch/dispatch_spmv_orig.cuh b/cub/cub/device/dispatch/dispatch_spmv_orig.cuh
+index 227c2a42c..585e9d2a4 100644
+--- a/cub/cub/device/dispatch/dispatch_spmv_orig.cuh
++++ b/cub/cub/device/dispatch/dispatch_spmv_orig.cuh
+@@ -489,7 +489,7 @@ struct DispatchSpmv
+         typename                SpmvKernelT,                        ///< Function type of cub::AgentSpmvKernel
+         typename                SegmentFixupKernelT,                ///< Function type of cub::DeviceSegmentFixupKernelT
+         typename                SpmvEmptyMatrixKernelT>             ///< Function type of cub::DeviceSpmvEmptyMatrixKernel
+-    CUB_RUNTIME_FUNCTION __forceinline__
++    CUB_RUNTIME_FUNCTION __attribute__ ((__visibility__("hidden"))) __forceinline__
+     static cudaError_t Dispatch(
+         void*                   d_temp_storage,                     ///< [in] Device-accessible allocation of temporary storage.  When NULL, the required allocation size is written to \p temp_storage_bytes and no work is done.
+         size_t&                 temp_storage_bytes,                 ///< [in,out] Reference to size in bytes of \p d_temp_storage allocation
+@@ -780,7 +780,7 @@ struct DispatchSpmv
+               typename SegmentFixupKernelT,
+               typename SpmvEmptyMatrixKernelT>
+     CUB_DETAIL_RUNTIME_DEBUG_SYNC_IS_NOT_SUPPORTED
+-    CUB_RUNTIME_FUNCTION __forceinline__ static cudaError_t
++    CUB_RUNTIME_FUNCTION __attribute__ ((__visibility__("hidden"))) __forceinline__ static cudaError_t
+     Dispatch(void *d_temp_storage,
+              size_t &temp_storage_bytes,
+              SpmvParamsT &spmv_params,
+diff --git a/cub/cub/util_device.cuh b/cub/cub/util_device.cuh
+index 4cd5fddb0..cd44a0ead 100644
+--- a/cub/cub/util_device.cuh
++++ b/cub/cub/util_device.cuh
+@@ -623,7 +623,7 @@ CUB_RUNTIME_FUNCTION inline cudaError_t HasUVA(bool& has_uva)
+  *
+  */
+ template <typename KernelPtr>
+-CUB_RUNTIME_FUNCTION inline
++CUB_RUNTIME_FUNCTION __attribute__ ((__visibility__("hidden"))) inline
+ cudaError_t MaxSmOccupancy(
+     int&                max_sm_occupancy,          ///< [out] maximum number of thread blocks that can reside on a single SM
+     KernelPtr           kernel_ptr,                 ///< [in] Kernel pointer for which to compute SM occupancy
+@@ -656,7 +656,7 @@ struct KernelConfig
+     KernelConfig() : block_threads(0), items_per_thread(0), tile_size(0), sm_occupancy(0) {}
+
+     template <typename AgentPolicyT, typename KernelPtrT>
+-    CUB_RUNTIME_FUNCTION __forceinline__
++    CUB_RUNTIME_FUNCTION __attribute__ ((__visibility__("hidden")))  __forceinline__
+     cudaError_t Init(KernelPtrT kernel_ptr)
+     {
+         block_threads        = AgentPolicyT::BLOCK_THREADS;
diff --git a/rapids-cmake/cpm/versions.json b/rapids-cmake/cpm/versions.json
@@ -24,6 +24,11 @@
           "file": "cccl/revert_pr_211.diff",
           "issue": "thrust::copy introduced a change in behavior that causes failures with cudaErrorInvalidValue.",
           "fixed_in": ""
+        },
+        {
+          "file": "cccl/kernel_pointer_hiding.diff",
+          "issue": "Hide APIs that accept kernel pointers [https://github.com/NVIDIA/cccl/pull/1395]",
+          "fixed_in": "2.4"
         }
       ]
     },