Skip to content

Commit

Permalink
Fix tuning benchmark for cub::DeviceTransform (#2970)
Browse files Browse the repository at this point in the history
* Replace CUB_DETAIL_COUNT with _CCCL_PP_COUNT. CUB_DETAIL_COUNT was removed at some point, but not all of its uses were replaced.
* Add missing `#pragma once` to the benchmark's common.h header
* Fix use of _CUB_HAS_TRANSFORM_UBLKCP before it is defined
  • Loading branch information
bernhardmgruber authored Nov 28, 2024
1 parent af0a8bb commit 9beeb26
Show file tree
Hide file tree
Showing 7 changed files with 15 additions and 102 deletions.
17 changes: 0 additions & 17 deletions cub/benchmarks/bench/transform/babelstream1.cu
Original file line number Diff line number Diff line change
Expand Up @@ -4,25 +4,8 @@
// %RANGE% TUNE_THREADS tpb 128:1024:128
// %RANGE% TUNE_ALGORITHM alg 0:1:1

// keep checks at the top so compilation of discarded variants fails really fast
#if !TUNE_BASE
# if TUNE_ALGORITHM == 1 && (__CUDA_ARCH_LIST__) < 900
# error "Cannot compile algorithm 4 (ublkcp) below sm90"
# endif

# if TUNE_ALGORITHM == 1 && !defined(_CUB_HAS_TRANSFORM_UBLKCP)
# error "Cannot tune for ublkcp algorithm, which is not provided by CUB (old CTK?)"
# endif
#endif

#include "common.h"

#if !TUNE_BASE
# if CUB_DETAIL_COUNT(__CUDA_ARCH_LIST__) != 1
# error "This benchmark does not support being compiled for multiple architectures"
# endif
#endif

template <typename T, typename OffsetT>
static void mul(nvbench::state& state, nvbench::type_list<T, OffsetT>)
{
Expand Down
17 changes: 0 additions & 17 deletions cub/benchmarks/bench/transform/babelstream2.cu
Original file line number Diff line number Diff line change
Expand Up @@ -4,25 +4,8 @@
// %RANGE% TUNE_THREADS tpb 128:1024:128
// %RANGE% TUNE_ALGORITHM alg 0:1:1

// keep checks at the top so compilation of discarded variants fails really fast
#if !TUNE_BASE
# if TUNE_ALGORITHM == 1 && (__CUDA_ARCH_LIST__) < 900
# error "Cannot compile algorithm 4 (ublkcp) below sm90"
# endif

# if TUNE_ALGORITHM == 1 && !defined(_CUB_HAS_TRANSFORM_UBLKCP)
# error "Cannot tune for ublkcp algorithm, which is not provided by CUB (old CTK?)"
# endif
#endif

#include "common.h"

#if !TUNE_BASE
# if CUB_DETAIL_COUNT(__CUDA_ARCH_LIST__) != 1
# error "This benchmark does not support being compiled for multiple architectures"
# endif
#endif

template <typename T, typename OffsetT>
static void add(nvbench::state& state, nvbench::type_list<T, OffsetT>)
{
Expand Down
17 changes: 0 additions & 17 deletions cub/benchmarks/bench/transform/babelstream3.cu
Original file line number Diff line number Diff line change
Expand Up @@ -4,25 +4,8 @@
// %RANGE% TUNE_THREADS tpb 128:1024:128
// %RANGE% TUNE_ALGORITHM alg 0:1:1

// keep checks at the top so compilation of discarded variants fails really fast
#if !TUNE_BASE
# if TUNE_ALGORITHM == 1 && (__CUDA_ARCH_LIST__) < 900
# error "Cannot compile algorithm 4 (ublkcp) below sm90"
# endif

# if TUNE_ALGORITHM == 1 && !defined(_CUB_HAS_TRANSFORM_UBLKCP)
# error "Cannot tune for ublkcp algorithm, which is not provided by CUB (old CTK?)"
# endif
#endif

#include "common.h"

#if !TUNE_BASE
# if CUB_DETAIL_COUNT(__CUDA_ARCH_LIST__) != 1
# error "This benchmark does not support being compiled for multiple architectures"
# endif
#endif

template <typename T, typename OffsetT>
static void nstream(nvbench::state& state, nvbench::type_list<T, OffsetT>)
{
Expand Down
15 changes: 15 additions & 0 deletions cub/benchmarks/bench/transform/common.h
Original file line number Diff line number Diff line change
@@ -1,7 +1,22 @@
// SPDX-FileCopyrightText: Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved.
// SPDX-License-Identifier: BSD-3-Clause

#pragma once

// keep checks at the top so compilation of discarded variants fails really fast
//
// The include below deliberately precedes the checks: they rely on _CCCL_PP_COUNT and
// _CUB_HAS_TRANSFORM_UBLKCP, which must be defined before they are tested.
// NOTE(review): presumably both macros become visible through this header -- confirm.
#include <cub/device/dispatch/dispatch_transform.cuh>
// The checks only apply to tuning builds (TUNE_BASE is false) that select TUNE_ALGORITHM == 1.
#if !TUNE_BASE && TUNE_ALGORITHM == 1
// Reject builds whose __CUDA_ARCH_LIST__ does not contain exactly one entry. This runs first because
// the comparison against 900 further down treats __CUDA_ARCH_LIST__ as a single value.
# if _CCCL_PP_COUNT(__CUDA_ARCH_LIST__) != 1
# error "When tuning, this benchmark does not support being compiled for multiple architectures"
# endif
// Reject target architectures below 900 (sm90).
// NOTE(review): the message says "algorithm 4" while the guard tests TUNE_ALGORITHM == 1; presumably
// 4 is the library-internal id of the ublkcp algorithm -- confirm.
# if (__CUDA_ARCH_LIST__) < 900
# error "Cannot compile algorithm 4 (ublkcp) below sm90"
# endif
// Reject builds where _CUB_HAS_TRANSFORM_UBLKCP is not defined after including the header above.
# ifndef _CUB_HAS_TRANSFORM_UBLKCP
# error "Cannot tune for ublkcp algorithm, which is not provided by CUB (old CTK?)"
# endif
#endif

#include <cub/util_namespace.cuh>

#include <cuda/std/type_traits>
Expand Down
17 changes: 0 additions & 17 deletions cub/benchmarks/bench/transform/complex_cmp.cu
Original file line number Diff line number Diff line change
Expand Up @@ -4,25 +4,8 @@
// %RANGE% TUNE_THREADS tpb 128:1024:128
// %RANGE% TUNE_ALGORITHM alg 0:1:1

// keep checks at the top so compilation of discarded variants fails really fast
#if !TUNE_BASE
# if TUNE_ALGORITHM == 1 && (__CUDA_ARCH_LIST__) < 900
# error "Cannot compile algorithm 4 (ublkcp) below sm90"
# endif

# if TUNE_ALGORITHM == 1 && !defined(_CUB_HAS_TRANSFORM_UBLKCP)
# error "Cannot tune for ublkcp algorithm, which is not provided by CUB (old CTK?)"
# endif
#endif

#include "common.h"

#if !TUNE_BASE
# if CUB_DETAIL_COUNT(__CUDA_ARCH_LIST__) != 1
# error "This benchmark does not support being compiled for multiple architectures"
# endif
#endif

// This benchmark tests overlapping memory regions for reading and is compute intensive

template <typename OffsetT>
Expand Down
17 changes: 0 additions & 17 deletions cub/benchmarks/bench/transform/fib.cu
Original file line number Diff line number Diff line change
Expand Up @@ -4,25 +4,8 @@
// %RANGE% TUNE_THREADS tpb 128:1024:128
// %RANGE% TUNE_ALGORITHM alg 0:1:1

// keep checks at the top so compilation of discarded variants fails really fast
#if !TUNE_BASE
# if TUNE_ALGORITHM == 1 && (__CUDA_ARCH_LIST__) < 900
# error "Cannot compile algorithm 4 (ublkcp) below sm90"
# endif

# if TUNE_ALGORITHM == 1 && !defined(_CUB_HAS_TRANSFORM_UBLKCP)
# error "Cannot tune for ublkcp algorithm, which is not provided by CUB (old CTK?)"
# endif
#endif

#include "common.h"

#if !TUNE_BASE
# if CUB_DETAIL_COUNT(__CUDA_ARCH_LIST__) != 1
# error "This benchmark does not support being compiled for multiple architectures"
# endif
#endif

// This benchmark is compute intensive with diverging threads

template <class IndexT, class OutputT>
Expand Down
17 changes: 0 additions & 17 deletions cub/benchmarks/bench/transform/heavy.cu
Original file line number Diff line number Diff line change
Expand Up @@ -4,25 +4,8 @@
// %RANGE% TUNE_THREADS tpb 128:1024:128
// %RANGE% TUNE_ALGORITHM alg 0:1:1

// keep checks at the top so compilation of discarded variants fails really fast
#if !TUNE_BASE
# if TUNE_ALGORITHM == 1 && (__CUDA_ARCH_LIST__) < 900
# error "Cannot compile algorithm 4 (ublkcp) below sm90"
# endif

# if TUNE_ALGORITHM == 1 && !defined(_CUB_HAS_TRANSFORM_UBLKCP)
# error "Cannot tune for ublkcp algorithm, which is not provided by CUB (old CTK?)"
# endif
#endif

#include "common.h"

#if !TUNE_BASE
# if CUB_DETAIL_COUNT(__CUDA_ARCH_LIST__) != 1
# error "This benchmark does not support being compiled for multiple architectures"
# endif
#endif

// This benchmark uses a LOT of registers and is compute intensive.

template <int N>
Expand Down

0 comments on commit 9beeb26

Please sign in to comment.