Skip to content

Commit

Permalink
Move visibility attributes to cccl (#1595)
Browse files Browse the repository at this point in the history
* Move the `[[nodiscard]]` attribute detection to a centralized place

We can also drop the clang extension, as all supported clang based compilers also support nodiscard

* Move `CUDACC` detection logic to centralized header

* Use cccl execution space macros in libcu++

* Move `_LIBCUDACXX_NODISCARD_FRIEND` macro to cccl

* Move `[[no_unique_address]]` detection to cccl

* Move `[[fallthrough]]` detection to cccl

* Move `[[noreturn]]` detection to cccl

* Rename `_CCCL_ATTRIBUTE_HIDDEN` to `_CCCL_VISIBILITY_HIDDEN`

* Move visibility attributes to cccl

* Do not use ALIGNAS as that might be already used by the implementation
  • Loading branch information
miscco authored Apr 15, 2024
1 parent fe66c08 commit f62616b
Show file tree
Hide file tree
Showing 168 changed files with 2,688 additions and 2,788 deletions.
23 changes: 12 additions & 11 deletions .clang-format
Original file line number Diff line number Diff line change
Expand Up @@ -23,23 +23,29 @@ AllowShortLoopsOnASingleLine: false
AlwaysBreakAfterReturnType: None
AlwaysBreakTemplateDeclarations: Yes
AttributeMacros: [
'_CCCL_ALIGNAS_TYPE',
'_CCCL_ALIGNAS',
'_CCCL_CONSTEXPR_CXX14',
'_CCCL_CONSTEXPR_CXX17',
'_CCCL_CONSTEXPR_CXX20',
'_CCCL_CONSTEXPR_CXX23',
'_CCCL_DEVICE',
'_CCCL_FALLTHROUGH',
'_CCCL_FORCEINLINE',
'_CCCL_HOST_DEVICE',
'_CCCL_HOST',
'_CCCL_NO_UNIQUE_ADDRESS',
'_CCCL_NODISCARD_FRIEND',
'_CCCL_NODISCARD',
'_CCCL_NORETURN',
'_CCCL_VISIBILITY_HIDDEN',
'CUB_RUNTIME_FUNCTION',
'CUB_DETAIL_KERNEL_ATTRIBUTES',
'THRUST_RUNTIME_FUNCTION',
'THRUST_DETAIL_KERNEL_ATTRIBUTES',
'_ALIGNAS_TYPE',
'_ALIGNAS',
'_LIBCUDACXX_ALIGNOF',
'_LIBCUDACXX_ALWAYS_INLINE',
'_LIBCUDACXX_AVAILABILITY_THROW_BAD_VARIANT_ACCESS',
'_CCCL_CONSTEXPR_CXX14',
'_CCCL_CONSTEXPR_CXX17',
'_CCCL_CONSTEXPR_CXX20',
'_CCCL_CONSTEXPR_CXX23',
'_LIBCUDACXX_CONSTINIT',
'_LIBCUDACXX_DEPRECATED_IN_CXX11',
'_LIBCUDACXX_DEPRECATED_IN_CXX14',
Expand All @@ -50,7 +56,6 @@ AttributeMacros: [
'_LIBCUDACXX_EXCLUDE_FROM_EXPLICIT_INSTANTIATION',
'_LIBCUDACXX_EXPORTED_FROM_ABI',
'_LIBCUDACXX_EXTERN_TEMPLATE_TYPE_VIS',
'_LIBCUDACXX_FALLTHROUGH',
'_LIBCUDACXX_HIDDEN',
'_LIBCUDACXX_HIDE_FROM_ABI_AFTER_V1',
'_LIBCUDACXX_HIDE_FROM_ABI',
Expand All @@ -59,11 +64,7 @@ AttributeMacros: [
'_LIBCUDACXX_METHOD_TEMPLATE_IMPLICIT_INSTANTIATION_VIS',
'_LIBCUDACXX_NO_DESTROY',
'_LIBCUDACXX_NO_SANITIZE',
'_LIBCUDACXX_NO_UNIQUE_ADDRESS',
'_LIBCUDACXX_NOALIAS',
'_LIBCUDACXX_NODISCARD_EXT',
'_LIBCUDACXX_NODISCARD',
'_LIBCUDACXX_NORETURN',
'_LIBCUDACXX_OVERRIDABLE_FUNC_VIS',
'_LIBCUDACXX_STANDALONE_DEBUG',
'_LIBCUDACXX_TEMPLATE_DATA_VIS',
Expand Down
2 changes: 1 addition & 1 deletion cub/cub/device/dispatch/dispatch_histogram.cuh
Original file line number Diff line number Diff line change
Expand Up @@ -309,7 +309,7 @@ struct dispatch_histogram
{}

template <typename ActivePolicyT, typename DeviceHistogramInitKernelT, typename DeviceHistogramSweepKernelT>
CUB_RUNTIME_FUNCTION _CCCL_ATTRIBUTE_HIDDEN _CCCL_FORCEINLINE cudaError_t
CUB_RUNTIME_FUNCTION _CCCL_VISIBILITY_HIDDEN _CCCL_FORCEINLINE cudaError_t
Invoke(DeviceHistogramInitKernelT histogram_init_kernel, DeviceHistogramSweepKernelT histogram_sweep_kernel)
{
cudaError error = cudaSuccess;
Expand Down
10 changes: 5 additions & 5 deletions cub/cub/device/dispatch/dispatch_radix_sort.cuh
Original file line number Diff line number Diff line change
Expand Up @@ -1849,7 +1849,7 @@ struct DispatchRadixSort : SelectedPolicy
* Kernel function pointer to parameterization of cub::DeviceRadixSortSingleTileKernel
*/
template <typename ActivePolicyT, typename SingleTileKernelT>
CUB_RUNTIME_FUNCTION _CCCL_ATTRIBUTE_HIDDEN _CCCL_FORCEINLINE cudaError_t
CUB_RUNTIME_FUNCTION _CCCL_VISIBILITY_HIDDEN _CCCL_FORCEINLINE cudaError_t
InvokeSingleTile(SingleTileKernelT single_tile_kernel)
{
cudaError error = cudaSuccess;
Expand Down Expand Up @@ -2065,7 +2065,7 @@ struct DispatchRadixSort : SelectedPolicy

/// Initialize pass configuration
template <typename UpsweepPolicyT, typename ScanPolicyT, typename DownsweepPolicyT>
CUB_RUNTIME_FUNCTION _CCCL_ATTRIBUTE_HIDDEN _CCCL_FORCEINLINE cudaError_t InitPassConfig(
CUB_RUNTIME_FUNCTION _CCCL_VISIBILITY_HIDDEN _CCCL_FORCEINLINE cudaError_t InitPassConfig(
UpsweepKernelT upsweep_kernel,
ScanKernelT scan_kernel,
DownsweepKernelT downsweep_kernel,
Expand Down Expand Up @@ -2386,7 +2386,7 @@ struct DispatchRadixSort : SelectedPolicy
* cub::DeviceRadixSortDownsweepKernel
*/
template <typename ActivePolicyT, typename UpsweepKernelT, typename ScanKernelT, typename DownsweepKernelT>
CUB_RUNTIME_FUNCTION _CCCL_ATTRIBUTE_HIDDEN _CCCL_FORCEINLINE cudaError_t InvokePasses(
CUB_RUNTIME_FUNCTION _CCCL_VISIBILITY_HIDDEN _CCCL_FORCEINLINE cudaError_t InvokePasses(
UpsweepKernelT upsweep_kernel,
UpsweepKernelT alt_upsweep_kernel,
ScanKernelT scan_kernel,
Expand Down Expand Up @@ -3013,7 +3013,7 @@ struct DispatchSegmentedRadixSort : SelectedPolicy

/// Initialize pass configuration
template <typename SegmentedPolicyT>
CUB_RUNTIME_FUNCTION _CCCL_ATTRIBUTE_HIDDEN _CCCL_FORCEINLINE cudaError_t
CUB_RUNTIME_FUNCTION _CCCL_VISIBILITY_HIDDEN _CCCL_FORCEINLINE cudaError_t
InitPassConfig(SegmentedKernelT segmented_kernel)
{
this->segmented_kernel = segmented_kernel;
Expand Down Expand Up @@ -3041,7 +3041,7 @@ struct DispatchSegmentedRadixSort : SelectedPolicy
* cub::DeviceSegmentedRadixSortKernel
*/
template <typename ActivePolicyT, typename SegmentedKernelT>
CUB_RUNTIME_FUNCTION _CCCL_ATTRIBUTE_HIDDEN _CCCL_FORCEINLINE cudaError_t
CUB_RUNTIME_FUNCTION _CCCL_VISIBILITY_HIDDEN _CCCL_FORCEINLINE cudaError_t
InvokePasses(SegmentedKernelT segmented_kernel, SegmentedKernelT alt_segmented_kernel)
{
cudaError error = cudaSuccess;
Expand Down
6 changes: 3 additions & 3 deletions cub/cub/device/dispatch/dispatch_reduce.cuh
Original file line number Diff line number Diff line change
Expand Up @@ -652,7 +652,7 @@ struct DispatchReduce : SelectedPolicy
* cub::DeviceReduceSingleTileKernel
*/
template <typename ActivePolicyT, typename SingleTileKernelT>
CUB_RUNTIME_FUNCTION _CCCL_ATTRIBUTE_HIDDEN _CCCL_FORCEINLINE cudaError_t
CUB_RUNTIME_FUNCTION _CCCL_VISIBILITY_HIDDEN _CCCL_FORCEINLINE cudaError_t
InvokeSingleTile(SingleTileKernelT single_tile_kernel)
{
cudaError error = cudaSuccess;
Expand Down Expand Up @@ -721,7 +721,7 @@ struct DispatchReduce : SelectedPolicy
* cub::DeviceReduceSingleTileKernel
*/
template <typename ActivePolicyT, typename ReduceKernelT, typename SingleTileKernelT>
CUB_RUNTIME_FUNCTION _CCCL_ATTRIBUTE_HIDDEN _CCCL_FORCEINLINE cudaError_t
CUB_RUNTIME_FUNCTION _CCCL_VISIBILITY_HIDDEN _CCCL_FORCEINLINE cudaError_t
InvokePasses(ReduceKernelT reduce_kernel, SingleTileKernelT single_tile_kernel)
{
cudaError error = cudaSuccess;
Expand Down Expand Up @@ -1198,7 +1198,7 @@ struct DispatchSegmentedReduce : SelectedPolicy
* cub::DeviceSegmentedReduceKernel
*/
template <typename ActivePolicyT, typename DeviceSegmentedReduceKernelT>
CUB_RUNTIME_FUNCTION _CCCL_ATTRIBUTE_HIDDEN _CCCL_FORCEINLINE cudaError_t
CUB_RUNTIME_FUNCTION _CCCL_VISIBILITY_HIDDEN _CCCL_FORCEINLINE cudaError_t
InvokePasses(DeviceSegmentedReduceKernelT segmented_reduce_kernel)
{
cudaError error = cudaSuccess;
Expand Down
2 changes: 1 addition & 1 deletion cub/cub/device/dispatch/dispatch_reduce_by_key.cuh
Original file line number Diff line number Diff line change
Expand Up @@ -288,7 +288,7 @@ struct DispatchReduceByKey
//---------------------------------------------------------------------

template <typename ActivePolicyT, typename ScanInitKernelT, typename ReduceByKeyKernelT>
CUB_RUNTIME_FUNCTION _CCCL_ATTRIBUTE_HIDDEN _CCCL_FORCEINLINE cudaError_t
CUB_RUNTIME_FUNCTION _CCCL_VISIBILITY_HIDDEN _CCCL_FORCEINLINE cudaError_t
Invoke(ScanInitKernelT init_kernel, ReduceByKeyKernelT reduce_by_key_kernel)
{
using AgentReduceByKeyPolicyT = typename ActivePolicyT::ReduceByKeyPolicyT;
Expand Down
6 changes: 3 additions & 3 deletions cub/cub/device/dispatch/dispatch_segmented_sort.cuh
Original file line number Diff line number Diff line change
Expand Up @@ -514,7 +514,7 @@ template <typename LargeSegmentPolicyT,
typename ValueT,
typename BeginOffsetIteratorT,
typename EndOffsetIteratorT>
CUB_RUNTIME_FUNCTION _CCCL_ATTRIBUTE_HIDDEN cudaError_t DeviceSegmentedSortContinuation(
CUB_RUNTIME_FUNCTION _CCCL_VISIBILITY_HIDDEN cudaError_t DeviceSegmentedSortContinuation(
LargeKernelT large_kernel,
SmallKernelT small_kernel,
int num_segments,
Expand Down Expand Up @@ -1493,7 +1493,7 @@ private:
}

template <typename LargeSegmentPolicyT, typename SmallAndMediumPolicyT, typename LargeKernelT, typename SmallKernelT>
CUB_RUNTIME_FUNCTION _CCCL_ATTRIBUTE_HIDDEN _CCCL_FORCEINLINE cudaError_t SortWithPartitioning(
CUB_RUNTIME_FUNCTION _CCCL_VISIBILITY_HIDDEN _CCCL_FORCEINLINE cudaError_t SortWithPartitioning(
LargeKernelT large_kernel,
SmallKernelT small_kernel,
std::size_t three_way_partition_temp_storage_bytes,
Expand Down Expand Up @@ -1630,7 +1630,7 @@ private:
}

template <typename LargeSegmentPolicyT, typename FallbackKernelT>
CUB_RUNTIME_FUNCTION _CCCL_ATTRIBUTE_HIDDEN _CCCL_FORCEINLINE cudaError_t SortWithoutPartitioning(
CUB_RUNTIME_FUNCTION _CCCL_VISIBILITY_HIDDEN _CCCL_FORCEINLINE cudaError_t SortWithoutPartitioning(
FallbackKernelT fallback_kernel,
cub::detail::device_double_buffer<KeyT>& d_keys_double_buffer,
cub::detail::device_double_buffer<ValueT>& d_values_double_buffer)
Expand Down
29 changes: 14 additions & 15 deletions cub/cub/device/dispatch/dispatch_spmv_orig.cuh
Original file line number Diff line number Diff line change
Expand Up @@ -583,7 +583,7 @@ struct DispatchSpmv
typename SpmvKernelT,
typename SegmentFixupKernelT,
typename SpmvEmptyMatrixKernelT>
CUB_RUNTIME_FUNCTION _CCCL_ATTRIBUTE_HIDDEN _CCCL_FORCEINLINE static cudaError_t Dispatch(
CUB_RUNTIME_FUNCTION _CCCL_VISIBILITY_HIDDEN _CCCL_FORCEINLINE static cudaError_t Dispatch(
void* d_temp_storage,
size_t& temp_storage_bytes,
SpmvParamsT& spmv_params,
Expand Down Expand Up @@ -898,20 +898,19 @@ struct DispatchSpmv
typename SpmvKernelT,
typename SegmentFixupKernelT,
typename SpmvEmptyMatrixKernelT>
CUB_DETAIL_RUNTIME_DEBUG_SYNC_IS_NOT_SUPPORTED CUB_RUNTIME_FUNCTION
_CCCL_ATTRIBUTE_HIDDEN _CCCL_FORCEINLINE static cudaError_t
Dispatch(void* d_temp_storage,
size_t& temp_storage_bytes,
SpmvParamsT& spmv_params,
cudaStream_t stream,
bool debug_synchronous,
Spmv1ColKernelT spmv_1col_kernel,
SpmvSearchKernelT spmv_search_kernel,
SpmvKernelT spmv_kernel,
SegmentFixupKernelT segment_fixup_kernel,
SpmvEmptyMatrixKernelT spmv_empty_matrix_kernel,
KernelConfig spmv_config,
KernelConfig segment_fixup_config)
CUB_DETAIL_RUNTIME_DEBUG_SYNC_IS_NOT_SUPPORTED CUB_RUNTIME_FUNCTION _CCCL_VISIBILITY_HIDDEN _CCCL_FORCEINLINE static cudaError_t
Dispatch(void* d_temp_storage,
size_t& temp_storage_bytes,
SpmvParamsT& spmv_params,
cudaStream_t stream,
bool debug_synchronous,
Spmv1ColKernelT spmv_1col_kernel,
SpmvSearchKernelT spmv_search_kernel,
SpmvKernelT spmv_kernel,
SegmentFixupKernelT segment_fixup_kernel,
SpmvEmptyMatrixKernelT spmv_empty_matrix_kernel,
KernelConfig spmv_config,
KernelConfig segment_fixup_config)
{
CUB_DETAIL_RUNTIME_DEBUG_SYNC_USAGE_LOG

Expand Down
4 changes: 2 additions & 2 deletions cub/cub/util_device.cuh
Original file line number Diff line number Diff line change
Expand Up @@ -825,7 +825,7 @@ CUB_RUNTIME_FUNCTION inline cudaError_t HasUVA(bool& has_uva)
* Dynamically allocated shared memory in bytes. Default is 0.
*/
template <typename KernelPtr>
_CCCL_ATTRIBUTE_HIDDEN CUB_RUNTIME_FUNCTION inline cudaError_t
_CCCL_VISIBILITY_HIDDEN CUB_RUNTIME_FUNCTION inline cudaError_t
MaxSmOccupancy(int& max_sm_occupancy, KernelPtr kernel_ptr, int block_threads, int dynamic_smem_bytes = 0)
{
return CubDebug(
Expand Down Expand Up @@ -854,7 +854,7 @@ struct KernelConfig
{}

template <typename AgentPolicyT, typename KernelPtrT>
CUB_RUNTIME_FUNCTION _CCCL_ATTRIBUTE_HIDDEN _CCCL_FORCEINLINE cudaError_t Init(KernelPtrT kernel_ptr)
CUB_RUNTIME_FUNCTION _CCCL_VISIBILITY_HIDDEN _CCCL_FORCEINLINE cudaError_t Init(KernelPtrT kernel_ptr)
{
block_threads = AgentPolicyT::BLOCK_THREADS;
items_per_thread = AgentPolicyT::ITEMS_PER_THREAD;
Expand Down
6 changes: 3 additions & 3 deletions cub/cub/util_type.cuh
Original file line number Diff line number Diff line change
Expand Up @@ -43,7 +43,7 @@
# pragma system_header
#endif // no system header

#include <cuda/__cccl_config> // _LIBCUDACXX_CUDACC_VER
#include <cuda/__cccl_config> // _CCCL_CUDACC_VER
#include <cuda/std/limits>

#if !_NVHPC_CUDA
Expand All @@ -53,7 +53,7 @@
#if !_NVHPC_CUDA && !defined(CUB_DISABLE_BF16_SUPPORT)
# include <cuda_bf16.h>
// cuda_fp8.h transitively includes cuda_fp16.h, so we have to include the header under !CUB_DISABLE_BF16_SUPPORT
# if _LIBCUDACXX_CUDACC_VER >= 1108000
# if _CCCL_CUDACC_VER >= 1108000
// cuda_fp8.h resets default for C4127, so we have to guard the inclusion
# if defined(_CCCL_COMPILER_MSVC)
# pragma warning(push)
Expand Down Expand Up @@ -85,7 +85,7 @@ CUB_NAMESPACE_BEGIN
# define CUB_IS_INT128_ENABLED 1
# endif // !defined(__CUDACC_RTC_INT128__)
# else // !defined(__CUDACC_RTC__)
# if _LIBCUDACXX_CUDACC_VER >= 1105000
# if _CCCL_CUDACC_VER >= 1105000
# if defined(_CCCL_COMPILER_GCC) || defined(_CCCL_COMPILER_CLANG) || defined(_CCCL_COMPILER_ICC) \
|| defined(_CCCL_COMPILER_NVHPC)
# define CUB_IS_INT128_ENABLED 1
Expand Down
Loading

0 comments on commit f62616b

Please sign in to comment.