Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Hide API accepting kernel pointers #1395

Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 1 addition & 1 deletion cub/cub/device/dispatch/dispatch_histogram.cuh
Original file line number Diff line number Diff line change
Expand Up @@ -311,7 +311,7 @@ struct dispatch_histogram
template <typename ActivePolicyT,
typename DeviceHistogramInitKernelT,
typename DeviceHistogramSweepKernelT>
CUB_RUNTIME_FUNCTION _CCCL_FORCEINLINE cudaError_t
CUB_RUNTIME_FUNCTION _CCCL_ATTRIBUTE_HIDDEN _CCCL_FORCEINLINE cudaError_t
Copy link
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Looks good to me, maybe we should find a proper name for the combination of
CUB_RUNTIME_FUNCTION _CCCL_ATTRIBUTE_HIDDEN _CCCL_FORCEINLINE

Invoke(DeviceHistogramInitKernelT histogram_init_kernel,
DeviceHistogramSweepKernelT histogram_sweep_kernel)
{
Expand Down
10 changes: 5 additions & 5 deletions cub/cub/device/dispatch/dispatch_radix_sort.cuh
Original file line number Diff line number Diff line change
Expand Up @@ -1495,7 +1495,7 @@ struct DispatchRadixSort : SelectedPolicy
* Kernel function pointer to parameterization of cub::DeviceRadixSortSingleTileKernel
*/
template <typename ActivePolicyT, typename SingleTileKernelT>
CUB_RUNTIME_FUNCTION _CCCL_FORCEINLINE cudaError_t
CUB_RUNTIME_FUNCTION _CCCL_ATTRIBUTE_HIDDEN _CCCL_FORCEINLINE cudaError_t
InvokeSingleTile(SingleTileKernelT single_tile_kernel)
{
cudaError error = cudaSuccess;
Expand Down Expand Up @@ -1710,7 +1710,7 @@ struct DispatchRadixSort : SelectedPolicy
typename UpsweepPolicyT,
typename ScanPolicyT,
typename DownsweepPolicyT>
CUB_RUNTIME_FUNCTION _CCCL_FORCEINLINE
CUB_RUNTIME_FUNCTION _CCCL_ATTRIBUTE_HIDDEN _CCCL_FORCEINLINE
cudaError_t InitPassConfig(
UpsweepKernelT upsweep_kernel,
ScanKernelT scan_kernel,
Expand Down Expand Up @@ -2040,7 +2040,7 @@ struct DispatchRadixSort : SelectedPolicy
typename UpsweepKernelT,
typename ScanKernelT,
typename DownsweepKernelT>
CUB_RUNTIME_FUNCTION _CCCL_FORCEINLINE cudaError_t
CUB_RUNTIME_FUNCTION _CCCL_ATTRIBUTE_HIDDEN _CCCL_FORCEINLINE cudaError_t
InvokePasses(UpsweepKernelT upsweep_kernel,
UpsweepKernelT alt_upsweep_kernel,
ScanKernelT scan_kernel,
Expand Down Expand Up @@ -2694,7 +2694,7 @@ struct DispatchSegmentedRadixSort : SelectedPolicy

/// Initialize pass configuration
template <typename SegmentedPolicyT>
CUB_RUNTIME_FUNCTION _CCCL_FORCEINLINE
CUB_RUNTIME_FUNCTION _CCCL_ATTRIBUTE_HIDDEN _CCCL_FORCEINLINE
cudaError_t InitPassConfig(SegmentedKernelT segmented_kernel)
{
this->segmented_kernel = segmented_kernel;
Expand Down Expand Up @@ -2722,7 +2722,7 @@ struct DispatchSegmentedRadixSort : SelectedPolicy
* cub::DeviceSegmentedRadixSortKernel
*/
template <typename ActivePolicyT, typename SegmentedKernelT>
CUB_RUNTIME_FUNCTION _CCCL_FORCEINLINE cudaError_t
CUB_RUNTIME_FUNCTION _CCCL_ATTRIBUTE_HIDDEN _CCCL_FORCEINLINE cudaError_t
InvokePasses(SegmentedKernelT segmented_kernel, SegmentedKernelT alt_segmented_kernel)
{
cudaError error = cudaSuccess;
Expand Down
6 changes: 3 additions & 3 deletions cub/cub/device/dispatch/dispatch_reduce.cuh
Original file line number Diff line number Diff line change
Expand Up @@ -652,7 +652,7 @@ struct DispatchReduce : SelectedPolicy
* cub::DeviceReduceSingleTileKernel
*/
template <typename ActivePolicyT, typename SingleTileKernelT>
CUB_RUNTIME_FUNCTION _CCCL_FORCEINLINE cudaError_t
CUB_RUNTIME_FUNCTION _CCCL_ATTRIBUTE_HIDDEN _CCCL_FORCEINLINE cudaError_t
InvokeSingleTile(SingleTileKernelT single_tile_kernel)
{
cudaError error = cudaSuccess;
Expand Down Expand Up @@ -724,7 +724,7 @@ struct DispatchReduce : SelectedPolicy
* cub::DeviceReduceSingleTileKernel
*/
template <typename ActivePolicyT, typename ReduceKernelT, typename SingleTileKernelT>
CUB_RUNTIME_FUNCTION _CCCL_FORCEINLINE cudaError_t
CUB_RUNTIME_FUNCTION _CCCL_ATTRIBUTE_HIDDEN _CCCL_FORCEINLINE cudaError_t
InvokePasses(ReduceKernelT reduce_kernel, SingleTileKernelT single_tile_kernel)
{
cudaError error = cudaSuccess;
Expand Down Expand Up @@ -1216,7 +1216,7 @@ struct DispatchSegmentedReduce : SelectedPolicy
* cub::DeviceSegmentedReduceKernel
*/
template <typename ActivePolicyT, typename DeviceSegmentedReduceKernelT>
CUB_RUNTIME_FUNCTION _CCCL_FORCEINLINE cudaError_t
CUB_RUNTIME_FUNCTION _CCCL_ATTRIBUTE_HIDDEN _CCCL_FORCEINLINE cudaError_t
InvokePasses(DeviceSegmentedReduceKernelT segmented_reduce_kernel)
{
cudaError error = cudaSuccess;
Expand Down
4 changes: 2 additions & 2 deletions cub/cub/device/dispatch/dispatch_reduce_by_key.cuh
Original file line number Diff line number Diff line change
Expand Up @@ -293,8 +293,8 @@ struct DispatchReduceByKey
//---------------------------------------------------------------------

template <typename ActivePolicyT, typename ScanInitKernelT, typename ReduceByKeyKernelT>
CUB_RUNTIME_FUNCTION _CCCL_FORCEINLINE cudaError_t Invoke(ScanInitKernelT init_kernel,
ReduceByKeyKernelT reduce_by_key_kernel)
CUB_RUNTIME_FUNCTION _CCCL_ATTRIBUTE_HIDDEN _CCCL_FORCEINLINE cudaError_t
Invoke(ScanInitKernelT init_kernel, ReduceByKeyKernelT reduce_by_key_kernel)
{
using AgentReduceByKeyPolicyT = typename ActivePolicyT::ReduceByKeyPolicyT;
constexpr int block_threads = AgentReduceByKeyPolicyT::BLOCK_THREADS;
Expand Down
6 changes: 3 additions & 3 deletions cub/cub/device/dispatch/dispatch_segmented_sort.cuh
Original file line number Diff line number Diff line change
Expand Up @@ -557,7 +557,7 @@ template <typename LargeSegmentPolicyT,
typename ValueT,
typename BeginOffsetIteratorT,
typename EndOffsetIteratorT>
CUB_RUNTIME_FUNCTION cudaError_t
CUB_RUNTIME_FUNCTION _CCCL_ATTRIBUTE_HIDDEN cudaError_t
elstehle marked this conversation as resolved.
Show resolved Hide resolved
DeviceSegmentedSortContinuation(
LargeKernelT large_kernel,
SmallKernelT small_kernel,
Expand Down Expand Up @@ -1603,7 +1603,7 @@ private:
typename SmallAndMediumPolicyT,
typename LargeKernelT,
typename SmallKernelT>
CUB_RUNTIME_FUNCTION _CCCL_FORCEINLINE cudaError_t
CUB_RUNTIME_FUNCTION _CCCL_ATTRIBUTE_HIDDEN _CCCL_FORCEINLINE cudaError_t
SortWithPartitioning(
LargeKernelT large_kernel,
SmallKernelT small_kernel,
Expand Down Expand Up @@ -1740,7 +1740,7 @@ private:

template <typename LargeSegmentPolicyT,
typename FallbackKernelT>
CUB_RUNTIME_FUNCTION _CCCL_FORCEINLINE cudaError_t SortWithoutPartitioning(
CUB_RUNTIME_FUNCTION _CCCL_ATTRIBUTE_HIDDEN _CCCL_FORCEINLINE cudaError_t SortWithoutPartitioning(
FallbackKernelT fallback_kernel,
cub::detail::device_double_buffer<KeyT> &d_keys_double_buffer,
cub::detail::device_double_buffer<ValueT> &d_values_double_buffer)
Expand Down
4 changes: 2 additions & 2 deletions cub/cub/device/dispatch/dispatch_spmv_orig.cuh
Original file line number Diff line number Diff line change
Expand Up @@ -649,7 +649,7 @@ struct DispatchSpmv
typename SpmvKernelT,
typename SegmentFixupKernelT,
typename SpmvEmptyMatrixKernelT>
CUB_RUNTIME_FUNCTION _CCCL_FORCEINLINE static cudaError_t
CUB_RUNTIME_FUNCTION _CCCL_ATTRIBUTE_HIDDEN _CCCL_FORCEINLINE static cudaError_t
Dispatch(void *d_temp_storage,
size_t &temp_storage_bytes,
SpmvParamsT &spmv_params,
Expand Down Expand Up @@ -939,7 +939,7 @@ struct DispatchSpmv
typename SegmentFixupKernelT,
typename SpmvEmptyMatrixKernelT>
CUB_DETAIL_RUNTIME_DEBUG_SYNC_IS_NOT_SUPPORTED
CUB_RUNTIME_FUNCTION _CCCL_FORCEINLINE static cudaError_t
CUB_RUNTIME_FUNCTION _CCCL_ATTRIBUTE_HIDDEN _CCCL_FORCEINLINE static cudaError_t
Dispatch(void *d_temp_storage,
size_t &temp_storage_bytes,
SpmvParamsT &spmv_params,
Expand Down
8 changes: 3 additions & 5 deletions cub/cub/util_device.cuh
Original file line number Diff line number Diff line change
Expand Up @@ -816,10 +816,8 @@ CUB_RUNTIME_FUNCTION inline cudaError_t HasUVA(bool& has_uva)
* Dynamically allocated shared memory in bytes. Default is 0.
*/
template <typename KernelPtr>
CUB_RUNTIME_FUNCTION inline cudaError_t MaxSmOccupancy(int &max_sm_occupancy,
KernelPtr kernel_ptr,
int block_threads,
int dynamic_smem_bytes = 0)
_CCCL_ATTRIBUTE_HIDDEN CUB_RUNTIME_FUNCTION inline cudaError_t
MaxSmOccupancy(int& max_sm_occupancy, KernelPtr kernel_ptr, int block_threads, int dynamic_smem_bytes = 0)
{
return CubDebug(cudaOccupancyMaxActiveBlocksPerMultiprocessor(
&max_sm_occupancy,
Expand Down Expand Up @@ -847,7 +845,7 @@ struct KernelConfig
KernelConfig() : block_threads(0), items_per_thread(0), tile_size(0), sm_occupancy(0) {}

template <typename AgentPolicyT, typename KernelPtrT>
CUB_RUNTIME_FUNCTION _CCCL_FORCEINLINE
CUB_RUNTIME_FUNCTION _CCCL_ATTRIBUTE_HIDDEN _CCCL_FORCEINLINE
cudaError_t Init(KernelPtrT kernel_ptr)
{
block_threads = AgentPolicyT::BLOCK_THREADS;
Expand Down
6 changes: 3 additions & 3 deletions cub/docs/developer_overview.rst
Original file line number Diff line number Diff line change
Expand Up @@ -725,9 +725,9 @@ This solution has poor discoverability,
since issues present themselves in forms of segmentation faults, hangs, wrong results, etc.
To eliminate the symbol visibility issues on our end, we follow the following rules:

#. Hiding kernel launchers:
it's important that kernel launchers like Thrust ``triple_chevron`` always reside in the same
library as the API using these kernel launchers.
#. Hiding symbols accpeting kernel pointers:
it's important that API accepting kernel pointers (e.g. ``triple_chevron``) always reside in the same
library as the code taking this pointers.

#. Hiding all kernels:
it's important that kernels always reside in the same library as the API using these kernels.
Expand Down
Loading