Skip to content

Commit

Permalink
Dubious perf modifications
Browse files Browse the repository at this point in the history
  • Loading branch information
gonidelis committed Sep 16, 2024
1 parent e308e2d commit ce3b44c
Showing 1 changed file with 22 additions and 10 deletions.
32 changes: 22 additions & 10 deletions cub/cub/device/device_find_if.cuh
Original file line number Diff line number Diff line change
Expand Up @@ -50,7 +50,7 @@
#include <cub/thread/thread_operators.cuh>
#include <cub/util_deprecated.cuh>

#define elements_per_thread 32
#define elements_per_thread 16

CUB_NAMESPACE_BEGIN

Expand All @@ -60,36 +60,48 @@ __global__ void find_if(IterBegin begin, IterEnd end, Pred pred, int* result, st
// int elements_per_thread = 32;
auto tile_size = blockDim.x * elements_per_thread;
__shared__ int sresult;
__shared__ int block_result;

for (int tile_offset = blockIdx.x * tile_size; tile_offset < num_items; tile_offset += tile_size * gridDim.x)
{
// Only one thread reads the current result atomically and propagates it to
// the rest of the block's threads through shared memory
if (threadIdx.x == 0)
{
sresult = atomicAdd(result, 0);
sresult = atomicAdd(result, 0);
block_result = atomicAdd(result, 0); // @giannis: I don't like having the two shared memory variables;
// coalesce them?
}
__syncthreads();

// early exit
if (sresult < tile_offset)
{
return;
}

for (int i = 0; i < elements_per_thread; ++i)
{
auto index = tile_offset + threadIdx.x + i * blockDim.x;

if (index < num_items)
{
// early exit
if (sresult < index)
{
return;
}

if (pred(*(begin + index)))
{
atomicMin(result, index);
return;
atomicMin(&block_result, index);
}
}
}
__syncthreads();

if (block_result < *result)
{
if (threadIdx.x == 0)
{
atomicMin(result, block_result);
}
return; // if block found at least one local minimum, whole block exits
}
}
}

Expand Down

0 comments on commit ce3b44c

Please sign in to comment.