From fd6e83632270596af6385f2323676c43dfe4a436 Mon Sep 17 00:00:00 2001
From: gonidelis
Date: Tue, 27 Aug 2024 18:09:58 -0700
Subject: [PATCH 01/13] cub::DeviceFind::FindIf dummy implementation and test

---
 cub/cub/device/device_find_if.cuh      | 165 +++++++++++++++++++++++++
 cub/test/catch2_test_device_find_if.cu | 123 ++++++++++++++++++
 2 files changed, 288 insertions(+)
 create mode 100644 cub/cub/device/device_find_if.cuh
 create mode 100644 cub/test/catch2_test_device_find_if.cu

diff --git a/cub/cub/device/device_find_if.cuh b/cub/cub/device/device_find_if.cuh
new file mode 100644
index 00000000000..d8f12d36a99
--- /dev/null
+++ b/cub/cub/device/device_find_if.cuh
@@ -0,0 +1,165 @@
+/******************************************************************************
+ * Copyright (c) 2011, Duane Merrill. All rights reserved.
+ * Copyright (c) 2011-2022, NVIDIA CORPORATION. All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *     * Redistributions of source code must retain the above copyright
+ *       notice, this list of conditions and the following disclaimer.
+ *     * Redistributions in binary form must reproduce the above copyright
+ *       notice, this list of conditions and the following disclaimer in the
+ *       documentation and/or other materials provided with the distribution.
+ *     * Neither the name of the NVIDIA CORPORATION nor the
+ *       names of its contributors may be used to endorse or promote products
+ *       derived from this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY
+ * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
+ * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+ * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
+ * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+ * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ *
+ ******************************************************************************/
+
+//! @file
+//! cub::DeviceFind provides device-wide, parallel operations for finding the first item that satisfies a given
+//! predicate within a sequence of data items residing within device-accessible memory.
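+//!
+//! A minimal usage sketch of the two-phase CUB API pattern this file follows (illustrative only; `d_in`, `d_out`,
+//! `pred`, and `num_items` are assumed to be provided by the caller):
+//!
+//! @code
+//! void* d_temp_storage      = nullptr;
+//! size_t temp_storage_bytes = 0;
+//! // First call: no work is done, only the required temporary storage size is reported
+//! cub::DeviceFind::FindIf(d_temp_storage, temp_storage_bytes, d_in, d_out, pred, num_items);
+//! cudaMalloc(&d_temp_storage, temp_storage_bytes);
+//! // Second call: the actual search; writes the index of the first match (or num_items) to d_out
+//! cub::DeviceFind::FindIf(d_temp_storage, temp_storage_bytes, d_in, d_out, pred, num_items);
+//! @endcode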
+
+#pragma once
+
+#include <cub/config.cuh>
+
+#if defined(_CCCL_IMPLICIT_SYSTEM_HEADER_GCC)
+#  pragma GCC system_header
+#elif defined(_CCCL_IMPLICIT_SYSTEM_HEADER_CLANG)
+#  pragma clang system_header
+#elif defined(_CCCL_IMPLICIT_SYSTEM_HEADER_MSVC)
+#  pragma system_header
+#endif // no system header
+
+#include
+#include
+#include
+#include
+#include
+#include
+
+CUB_NAMESPACE_BEGIN
+
+template <typename IterBegin, typename IterEnd, typename Pred>
+__global__ void find_if(IterBegin begin, IterEnd end, Pred pred, int* result)
+{
+  extern __shared__ int sresult[];
+  sresult[0] = INT_MAX;
+  __syncthreads();
+
+  auto global_index = threadIdx.x + blockIdx.x * blockDim.x;
+  int total_threads = gridDim.x * blockDim.x;
+
+  // traverse the sequence
+  for (auto index = global_index; begin + index < end; index += total_threads)
+  {
+    // Only one thread reads atomically and propagates it to the
+    // the rest threads of the block through shared memory
+    if (threadIdx.x == 0)
+    {
+      sresult[0] = atomicAdd(result, 0);
+    }
+    __syncthreads();
+
+    if (sresult[0] < index)
+    { // @georgii early exit!!!
+      // printf("early exit!!!");
+      return; // this returns the whole block
+    }
+
+    if (pred(*(begin + index)))
+    {
+      atomicMin(result,
+                index); // @georgii atomic min per your request makes sense
+      // printf("%d\n", *result);
+      return;
+    }
+  }
+}
+
+template <typename ValueType, typename OutputIteratorT>
+__global__ void write_final_result_in_output_iterator_already(ValueType* d_temp_storage, OutputIteratorT d_out)
+{
+  *d_out = *d_temp_storage;
+}
+
+template <typename ValueType, typename NumItemsT>
+__global__ void cuda_mem_set_async_dtemp_storage(ValueType* d_temp_storage, NumItemsT num_items)
+{
+  *d_temp_storage = num_items;
+}
+
+struct DeviceFind
+{
+  template <typename InputIteratorT, typename OutputIteratorT, typename ScanOpT, typename NumItemsT>
+  CUB_RUNTIME_FUNCTION static void FindIf(
+    void* d_temp_storage,
+    size_t& temp_storage_bytes,
+    InputIteratorT d_in,
+    OutputIteratorT d_out,
+    ScanOpT op,
+    NumItemsT num_items,
+    cudaStream_t stream = 0)
+  {
+    int block_threads = 128;
+    // first cub API call
+    if (d_temp_storage == nullptr)
+    {
+      temp_storage_bytes = sizeof(int);
+      return;
+    }
+    int* int_temp_storage = static_cast<int*>(d_temp_storage);
+
+    // Get device ordinal
+    int device_ordinal;
+    cudaError error = CubDebug(cudaGetDevice(&device_ordinal));
+    if (cudaSuccess != error)
+    {
+      return;
+    }
+
+    // Get SM count
+    int sm_count;
+    error = CubDebug(cudaDeviceGetAttribute(&sm_count, cudaDevAttrMultiProcessorCount, device_ordinal));
+    if (cudaSuccess != error)
+    {
+      return;
+    }
+
+    int find_if_sm_occupancy;
+    error = CubDebug(
+      cub::MaxSmOccupancy(find_if_sm_occupancy, find_if<InputIteratorT, InputIteratorT, ScanOpT>, block_threads));
+    if (cudaSuccess != error)
+    {
+      return;
+    }
+
+    int findif_device_occupancy = find_if_sm_occupancy * sm_count;
+
+    // Even-share work distribution
+    int max_blocks = findif_device_occupancy;
+
+    // use d_temp_storage as the intermediate device result
+    // to read and write from. Then store the final result in the output iterator.
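+    //
+    // The three launches below implement that plan: (1) seed *int_temp_storage
+    // with num_items as the "not found" sentinel, (2) let find_if atomically
+    // lower it to the smallest matching index, and (3) copy the winner into
+    // d_out. Note that the `stream` parameter is not forwarded to these
+    // launches yet, so all three kernels run on the default stream.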
+    cuda_mem_set_async_dtemp_storage<<<1, 1>>>(int_temp_storage, num_items);
+
+    find_if<<<max_blocks, block_threads>>>(d_in, d_in + num_items, op, int_temp_storage);
+
+    write_final_result_in_output_iterator_already<<<1, 1>>>(int_temp_storage, d_out);
+
+    return;
+  }
+};
+
+CUB_NAMESPACE_END
diff --git a/cub/test/catch2_test_device_find_if.cu b/cub/test/catch2_test_device_find_if.cu
new file mode 100644
index 00000000000..2cb16b9d297
--- /dev/null
+++ b/cub/test/catch2_test_device_find_if.cu
@@ -0,0 +1,123 @@
+#include "insert_nested_NVTX_range_guard.h"
+// above header needs to be included first
+
+#include
+#include
+#include
+#include
+
+#include
+#include
+
+#include
+#include
+#include
+
+#include "c2h/custom_type.cuh"
+#include "catch2_test_device_reduce.cuh"
+#include "catch2_test_helper.h"
+#include "catch2_test_launch_helper.h"
+#include
+
+// %PARAM% TEST_LAUNCH lid 0:1
+
+// DECLARE_LAUNCH_WRAPPER(cub::DeviceFind::FindIf, device_findif);
+
+// List of types to test
+using custom_t =
+  c2h::custom_type_t;
+
+using full_type_list = c2h::type_list, type_pair>;
+// clang-format on
+
+enum class gen_data_t : int
+{
+  /// Uniform random data generation
+  GEN_TYPE_RANDOM,
+  /// Constant value as input data
+  GEN_TYPE_CONST
+};
+
+template <typename InputIt, typename OutputIt, typename BinaryOp>
+void compute_find_if_reference(InputIt first, InputIt last, OutputIt& result, BinaryOp op)
+{
+  auto pos = thrust::find_if(first, last, op);
+  result   = pos - first;
+}
+
+template <typename T>
+struct equals_2
+{
+  __device__ __host__ bool operator()(T i)
+  {
+    return i == 2;
+  }
+};
+
+CUB_TEST("Device find if works", "[device]", full_type_list)
+{
+  using params   = params_t<TestType>;
+  using input_t  = typename params::item_t;
+  using output_t = typename params::output_t;
+  using offset_t = int32_t;
+
+  constexpr offset_t min_items = 1;
+  constexpr offset_t max_items = 1000000;
+
+  // Generate the input sizes to test for
+  const offset_t num_items = GENERATE_COPY(
+    take(3, random(min_items, max_items)),
+    values({
+      min_items,
+      max_items,
+    }));
+
+  // Input data generation to test
+  const gen_data_t data_gen_mode = GENERATE_COPY(gen_data_t::GEN_TYPE_RANDOM, gen_data_t::GEN_TYPE_CONST);
+
+  // Generate input data
+  c2h::device_vector<input_t> in_items(num_items);
+  if (data_gen_mode == gen_data_t::GEN_TYPE_RANDOM)
+  {
+    c2h::gen(CUB_SEED(2), in_items);
+  }
+  else
+  {
+    input_t default_constant{};
+    init_default_constant(default_constant);
+    thrust::fill(c2h::device_policy, in_items.begin(), in_items.end(), default_constant);
+  }
+  auto d_in_it = thrust::raw_pointer_cast(in_items.data());
+
+  SECTION("find if")
+  {
+    using op_t = equals_2<input_t>;
+
+    // Prepare verification data
+    c2h::host_vector<input_t> host_items(in_items);
+    c2h::host_vector<output_t> expected_result(1);
+    compute_find_if_reference(host_items.begin(), host_items.end(), expected_result[0], op_t{});
+
+    void* d_temp_storage = nullptr;
+    size_t temp_storage_bytes{};
+
+    // Run test
+    c2h::device_vector<output_t> out_result(1);
+    auto d_out_it = thrust::raw_pointer_cast(out_result.data());
+
+    cub::DeviceFind::FindIf(
+      d_temp_storage, temp_storage_bytes, unwrap_it(d_in_it), unwrap_it(d_out_it), op_t{}, num_items);
+
+    thrust::device_vector<char> temp_storage(temp_storage_bytes);
+    d_temp_storage = thrust::raw_pointer_cast(temp_storage.data());
+
+    cub::DeviceFind::FindIf(
+      d_temp_storage, temp_storage_bytes, unwrap_it(d_in_it), unwrap_it(d_out_it), op_t{}, num_items);
+
+    // Verify result
+    REQUIRE(expected_result == out_result);
+  }
+}
From afd42cbef6ff7c12686d056eacaaa96ec76e66e6 Mon Sep 17 00:00:00 2001
From: gonidelis
Date: Wed, 4 Sep 2024 21:55:45 -0700
Subject: [PATCH
02/13] Calculate d_temp_storage bytes properly aligned and use grid_size according to tile size --- cub/cub/device/device_find_if.cuh | 39 +++++++++++++++++++++++-------- 1 file changed, 29 insertions(+), 10 deletions(-) diff --git a/cub/cub/device/device_find_if.cuh b/cub/cub/device/device_find_if.cuh index d8f12d36a99..f40d048286b 100644 --- a/cub/cub/device/device_find_if.cuh +++ b/cub/cub/device/device_find_if.cuh @@ -34,6 +34,8 @@ #include +#include "cub/util_type.cuh" + #if defined(_CCCL_IMPLICIT_SYSTEM_HEADER_GCC) # pragma GCC system_header #elif defined(_CCCL_IMPLICIT_SYSTEM_HEADER_CLANG) @@ -112,14 +114,10 @@ struct DeviceFind NumItemsT num_items, cudaStream_t stream = 0) { - int block_threads = 128; - // first cub API call - if (d_temp_storage == nullptr) - { - temp_storage_bytes = sizeof(int); - return; - } - int* int_temp_storage = static_cast(d_temp_storage); + int block_threads = 128; + int items_per_thread = 2; + int tile_size = block_threads * items_per_thread; + int num_tiles = static_cast(cub::DivideAndRoundUp(num_items, tile_size)); // Get device ordinal int device_ordinal; @@ -148,13 +146,34 @@ struct DeviceFind int findif_device_occupancy = find_if_sm_occupancy * sm_count; // Even-share work distribution - int max_blocks = findif_device_occupancy; + int max_blocks = findif_device_occupancy; // no * CUB_SUBSCRIPTION_FACTOR(0) because max_blocks gets too big + + int findif_grid_size = CUB_MIN(num_tiles, max_blocks); + + // Temporary storage allocation requirements + void* allocations[1] = {}; + size_t allocation_sizes[1] = {sizeof(int)}; + + // Alias the temporary allocations from the single storage blob (or + // compute the necessary size of the blob) + error = CubDebug(AliasTemporaries(d_temp_storage, temp_storage_bytes, allocations, allocation_sizes)); + if (cudaSuccess != error) + { + return; + } + + int* int_temp_storage = static_cast(allocations[0]); // this shouldn't be just int + + if (d_temp_storage == nullptr) + { + return; + } // use d_temp_storage as the intermediate device result // to read and write from. Then store the final result in the output iterator. cuda_mem_set_async_dtemp_storage<<<1, 1>>>(int_temp_storage, num_items); - find_if<<>>(d_in, d_in + num_items, op, int_temp_storage); + find_if<<>>(d_in, d_in + num_items, op, int_temp_storage); write_final_result_in_output_iterator_already<<<1, 1>>>(int_temp_storage, d_out); From 7610da68ed3f4ef8ac1754dc8226eaafa834fe28 Mon Sep 17 00:00:00 2001 From: gonidelis Date: Wed, 4 Sep 2024 21:56:24 -0700 Subject: [PATCH 03/13] Add cub::Device::FindIf nvbench benchmark --- cub/benchmarks/bench/find_if/base.cu | 127 +++++++++++++++++++++++++++ cub/cub/device/device_find_if.cuh | 13 ++- 2 files changed, 132 insertions(+), 8 deletions(-) create mode 100644 cub/benchmarks/bench/find_if/base.cu diff --git a/cub/benchmarks/bench/find_if/base.cu b/cub/benchmarks/bench/find_if/base.cu new file mode 100644 index 00000000000..25312b158aa --- /dev/null +++ b/cub/benchmarks/bench/find_if/base.cu @@ -0,0 +1,127 @@ +/****************************************************************************** + * Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * * Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. 
+ * * Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * * Neither the name of the NVIDIA CORPORATION nor the + * names of its contributors may be used to endorse or promote products + * derived from this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED + * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE + * DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY + * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES + * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; + * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND + * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS + * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + * + ******************************************************************************/ + +#include + +#include +#include + +#include + +template +struct equals_100 +{ + __device__ bool operator()(T i) + { + return i == 1; + } // @amd you 'll never find out the secret sauce +}; + +template +void find_if(nvbench::state& state, nvbench::type_list) +{ + // set up input + const auto elements = state.get_int64("Elements"); + const auto common_prefix = state.get_float64("CommonPrefixRatio"); + const auto same_elements = elements * common_prefix; + + thrust::device_vector dinput(elements, 0); + thrust::fill(dinput.begin() + same_elements, dinput.end(), 1); + thrust::device_vector d_result(1); + /// + + void* d_temp_storage = nullptr; + size_t temp_storage_bytes{}; + + cub::DeviceFind::FindIf( + d_temp_storage, temp_storage_bytes, dinput.begin(), d_result.begin(), equals_100{}, dinput.size(), 0); + + thrust::device_vector temp_storage(temp_storage_bytes); + d_temp_storage = thrust::raw_pointer_cast(temp_storage.data()); + + state.exec(nvbench::exec_tag::no_batch, [&](nvbench::launch& launch) { + cub::DeviceFind::FindIf( + d_temp_storage, + temp_storage_bytes, + dinput.begin(), + d_result.begin(), + equals_100{}, + dinput.size(), + launch.get_stream()); + }); +} +NVBENCH_BENCH_TYPES(find_if, NVBENCH_TYPE_AXES(nvbench::type_list /*integral_types*/)) + .add_int64_power_of_two_axis("Elements", nvbench::range(16, 28, 4)) + .add_float64_axis("CommonPrefixRatio", std::vector{1.0, 0.5, 0.0}); + +////////////////////////////////////////////////////// +template +void thrust_find_if(nvbench::state& state, nvbench::type_list) +{ + // set up input + const auto elements = state.get_int64("Elements"); + const auto common_prefix = state.get_float64("CommonPrefixRatio"); + const auto same_elements = elements * common_prefix; + + thrust::device_vector dinput(elements, 0); + thrust::fill(dinput.begin() + same_elements, dinput.end(), 1); + /// + + caching_allocator_t alloc; + thrust::find_if(policy(alloc), dinput.begin(), dinput.end(), equals_100{}); + + state.exec(nvbench::exec_tag::no_batch | nvbench::exec_tag::sync, [&](nvbench::launch& launch) { + thrust::find_if(policy(alloc, launch), dinput.begin(), dinput.end(), equals_100{}); + }); +} +NVBENCH_BENCH_TYPES(thrust_find_if, NVBENCH_TYPE_AXES(nvbench::type_list /*integral_types*/)) + 
.add_int64_power_of_two_axis("Elements", nvbench::range(16, 28, 4)) + .add_float64_axis("CommonPrefixRatio", std::vector{1.0, 0.5, 0.0}); + +////////////////////////////////////////////////////// +template +void thrust_count_if(nvbench::state& state, nvbench::type_list) +{ + // set up input + const auto elements = state.get_int64("Elements"); + const auto common_prefix = state.get_float64("CommonPrefixRatio"); + const auto same_elements = elements * common_prefix; + + thrust::device_vector dinput(elements, 0); + thrust::fill(dinput.begin() + same_elements, dinput.end(), 1); + /// + + caching_allocator_t alloc; + thrust::count_if(policy(alloc), dinput.begin(), dinput.end(), equals_100{}); + + state.exec(nvbench::exec_tag::no_batch | nvbench::exec_tag::sync, [&](nvbench::launch& launch) { + thrust::count_if(policy(alloc, launch), dinput.begin(), dinput.end(), equals_100{}); + }); +} +NVBENCH_BENCH_TYPES(thrust_count_if, NVBENCH_TYPE_AXES(nvbench::type_list /*integral_types*/)) + .add_int64_power_of_two_axis("Elements", nvbench::range(16, 28, 4)) + .add_float64_axis("CommonPrefixRatio", std::vector{1.0, 0.5, 0.0}); diff --git a/cub/cub/device/device_find_if.cuh b/cub/cub/device/device_find_if.cuh index f40d048286b..f0663b92ab9 100644 --- a/cub/cub/device/device_find_if.cuh +++ b/cub/cub/device/device_find_if.cuh @@ -56,9 +56,7 @@ CUB_NAMESPACE_BEGIN template __global__ void find_if(IterBegin begin, IterEnd end, Pred pred, int* result) { - extern __shared__ int sresult[]; - sresult[0] = INT_MAX; - __syncthreads(); + __shared__ int sresult; auto global_index = threadIdx.x + blockIdx.x * blockDim.x; int total_threads = gridDim.x * blockDim.x; @@ -70,14 +68,13 @@ __global__ void find_if(IterBegin begin, IterEnd end, Pred pred, int* result) // the rest threads of the block through shared memory if (threadIdx.x == 0) { - sresult[0] = atomicAdd(result, 0); + sresult = atomicAdd(result, 0); } __syncthreads(); - if (sresult[0] < index) - { // @georgii early exit!!! 
- // printf("early exit!!!"); - return; // this returns the whole block + if (sresult < index) + { + return; // early exit } if (pred(*(begin + index))) From e683a3dcbbc47e1398b83f6be21a700902009409 Mon Sep 17 00:00:00 2001 From: gonidelis Date: Mon, 9 Sep 2024 15:40:47 -0700 Subject: [PATCH 04/13] Introduced tile based loop --- cub/cub/device/device_find_if.cuh | 41 +++++++++++++++++-------------- 1 file changed, 23 insertions(+), 18 deletions(-) diff --git a/cub/cub/device/device_find_if.cuh b/cub/cub/device/device_find_if.cuh index f0663b92ab9..08d3d549bbe 100644 --- a/cub/cub/device/device_find_if.cuh +++ b/cub/cub/device/device_find_if.cuh @@ -34,7 +34,7 @@ #include -#include "cub/util_type.cuh" +#include "device_launch_parameters.h" #if defined(_CCCL_IMPLICIT_SYSTEM_HEADER_GCC) # pragma GCC system_header @@ -54,15 +54,13 @@ CUB_NAMESPACE_BEGIN template -__global__ void find_if(IterBegin begin, IterEnd end, Pred pred, int* result) +__global__ void find_if(IterBegin begin, IterEnd end, Pred pred, int* result, std::size_t num_items) { + int elements_per_thread = 2; + auto tile_size = blockDim.x * elements_per_thread; __shared__ int sresult; - auto global_index = threadIdx.x + blockIdx.x * blockDim.x; - int total_threads = gridDim.x * blockDim.x; - - // traverse the sequence - for (auto index = global_index; begin + index < end; index += total_threads) + for (int tile_offset = blockIdx.x * tile_size; tile_offset < num_items; tile_offset += tile_size * gridDim.x) { // Only one thread reads atomically and propagates it to the // the rest threads of the block through shared memory @@ -72,17 +70,24 @@ __global__ void find_if(IterBegin begin, IterEnd end, Pred pred, int* result) } __syncthreads(); - if (sresult < index) + for (int i = 0; i < elements_per_thread; ++i) { - return; // early exit - } - - if (pred(*(begin + index))) - { - atomicMin(result, - index); // @georgii atomic min per your request makes sense - // printf("%d\n", *result); - return; + auto index = tile_offset + threadIdx.x + i * blockDim.x; + + if (index < num_items) + { + // early exit + if (sresult < index) + { + return; + } + + if (pred(*(begin + index))) + { + atomicMin(result, index); + return; + } + } } } } @@ -170,7 +175,7 @@ struct DeviceFind // to read and write from. Then store the final result in the output iterator. cuda_mem_set_async_dtemp_storage<<<1, 1>>>(int_temp_storage, num_items); - find_if<<>>(d_in, d_in + num_items, op, int_temp_storage); + find_if<<>>(d_in, d_in + num_items, op, int_temp_storage, num_items); write_final_result_in_output_iterator_already<<<1, 1>>>(int_temp_storage, d_out); From b7a6db50bcd4a276d9d080419f26daf4fbae2232 Mon Sep 17 00:00:00 2001 From: gonidelis Date: Wed, 11 Sep 2024 08:56:48 -0700 Subject: [PATCH 05/13] Use multiple elements per thread --- cub/cub/device/device_find_if.cuh | 17 +++++++------- cub/test/catch2_test_device_find_if.cu | 31 ++++++++++++++++++++++---- 2 files changed, 36 insertions(+), 12 deletions(-) diff --git a/cub/cub/device/device_find_if.cuh b/cub/cub/device/device_find_if.cuh index 08d3d549bbe..f6e1ce95cd3 100644 --- a/cub/cub/device/device_find_if.cuh +++ b/cub/cub/device/device_find_if.cuh @@ -1,6 +1,5 @@ /****************************************************************************** - * Copyright (c) 2011, Duane Merrill. All rights reserved. - * Copyright (c) 2011-2022, NVIDIA CORPORATION. All rights reserved. + * Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved. 
* * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions are met: @@ -51,13 +50,15 @@ #include #include +#define elements_per_thread 16 + CUB_NAMESPACE_BEGIN template __global__ void find_if(IterBegin begin, IterEnd end, Pred pred, int* result, std::size_t num_items) { - int elements_per_thread = 2; - auto tile_size = blockDim.x * elements_per_thread; + // int elements_per_thread = 32; + auto tile_size = blockDim.x * elements_per_thread; __shared__ int sresult; for (int tile_offset = blockIdx.x * tile_size; tile_offset < num_items; tile_offset += tile_size * gridDim.x) @@ -116,10 +117,10 @@ struct DeviceFind NumItemsT num_items, cudaStream_t stream = 0) { - int block_threads = 128; - int items_per_thread = 2; - int tile_size = block_threads * items_per_thread; - int num_tiles = static_cast(cub::DivideAndRoundUp(num_items, tile_size)); + int block_threads = 128; + // int items_per_thread = 2; + int tile_size = block_threads * elements_per_thread; + int num_tiles = static_cast(cub::DivideAndRoundUp(num_items, tile_size)); // Get device ordinal int device_ordinal; diff --git a/cub/test/catch2_test_device_find_if.cu b/cub/test/catch2_test_device_find_if.cu index 2cb16b9d297..b92030c9eaa 100644 --- a/cub/test/catch2_test_device_find_if.cu +++ b/cub/test/catch2_test_device_find_if.cu @@ -1,3 +1,30 @@ +/****************************************************************************** + * Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * * Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * * Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * * Neither the name of the NVIDIA CORPORATION nor the + * names of its contributors may be used to endorse or promote products + * derived from this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY + * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES + * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; + * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND + * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS + * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+ * + ******************************************************************************/ + #include "insert_nested_NVTX_range_guard.h" // above header needs to be included first @@ -9,10 +36,6 @@ #include #include -#include -#include -#include - #include "c2h/custom_type.cuh" #include "catch2_test_device_reduce.cuh" #include "catch2_test_helper.h" From f69e68846947ceead2bb0ebf793cbca92c15af21 Mon Sep 17 00:00:00 2001 From: gonidelis Date: Tue, 17 Sep 2024 10:41:40 -0700 Subject: [PATCH 06/13] Optimized solution --- cub/cub/device/device_find_if.cuh | 32 ++++++++++++++++++++++++------- 1 file changed, 25 insertions(+), 7 deletions(-) diff --git a/cub/cub/device/device_find_if.cuh b/cub/cub/device/device_find_if.cuh index f6e1ce95cd3..a645680014d 100644 --- a/cub/cub/device/device_find_if.cuh +++ b/cub/cub/device/device_find_if.cuh @@ -60,6 +60,12 @@ __global__ void find_if(IterBegin begin, IterEnd end, Pred pred, int* result, st // int elements_per_thread = 32; auto tile_size = blockDim.x * elements_per_thread; __shared__ int sresult; + __shared__ int block_result; + + if (threadIdx.x == 0) + { + block_result = num_items; + } for (int tile_offset = blockIdx.x * tile_size; tile_offset < num_items; tile_offset += tile_size * gridDim.x) { @@ -71,22 +77,34 @@ __global__ void find_if(IterBegin begin, IterEnd end, Pred pred, int* result, st } __syncthreads(); + // early exit + if (sresult < tile_offset) + { + return; + } + + bool found = false; for (int i = 0; i < elements_per_thread; ++i) { auto index = tile_offset + threadIdx.x + i * blockDim.x; if (index < num_items) { - // early exit - if (sresult < index) + if (pred(*(begin + index))) { - return; + found = true; + atomicMin(&block_result, index); + break; } - - if (pred(*(begin + index))) + } + } + if (syncthreads_or(found)) + { + if (threadIdx.x == 0) + { + if (block_result < num_items) { - atomicMin(result, index); - return; + atomicMin(result, block_result); } } } From 81c676eadf8c821d2432f1198947321803958125 Mon Sep 17 00:00:00 2001 From: gonidelis Date: Thu, 19 Sep 2024 19:27:13 -0700 Subject: [PATCH 07/13] Add vectorized loads --- cub/cub/device/device_find_if.cuh | 38 +++++++++++++++++++++++++++---- 1 file changed, 34 insertions(+), 4 deletions(-) diff --git a/cub/cub/device/device_find_if.cuh b/cub/cub/device/device_find_if.cuh index a645680014d..365e03cd593 100644 --- a/cub/cub/device/device_find_if.cuh +++ b/cub/cub/device/device_find_if.cuh @@ -33,6 +33,7 @@ #include +#include "cub/util_type.cuh" #include "device_launch_parameters.h" #if defined(_CCCL_IMPLICIT_SYSTEM_HEADER_GCC) @@ -50,13 +51,22 @@ #include #include -#define elements_per_thread 16 +static constexpr int elements_per_thread = 16; +static constexpr int _VECTOR_LOAD_LENGTH = 4; +static constexpr int block_threads = 128; CUB_NAMESPACE_BEGIN template __global__ void find_if(IterBegin begin, IterEnd end, Pred pred, int* result, std::size_t num_items) { + using InputT = cub::detail::value_t; + using VectorT = typename CubVector::Type; + enum + { + WORDS = elements_per_thread / _VECTOR_LOAD_LENGTH + }; + // int elements_per_thread = 32; auto tile_size = blockDim.x * elements_per_thread; __shared__ int sresult; @@ -83,14 +93,35 @@ __global__ void find_if(IterBegin begin, IterEnd end, Pred pred, int* result, st return; } + //// vectorized loads begin + InputT* d_in_unqualified = const_cast(begin) + tile_offset + (threadIdx.x * _VECTOR_LOAD_LENGTH); + + cub::CacheModifiedInputIterator d_vec_in( + reinterpret_cast(d_in_unqualified)); + + InputT input_items[elements_per_thread]; + 
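+  // Alias the per-thread staging array as VectorT: each iteration of the
+  // unrolled loop below then issues one vector-wide load instead of
+  // _VECTOR_LOAD_LENGTH scalar loads.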
VectorT* vec_items = reinterpret_cast<VectorT*>(input_items);
+
+#pragma unroll
+  for (int i = 0; i < WORDS; ++i)
+  {
+    vec_items[i] = d_vec_in[block_threads * i];
+  }
+  //// vectorized loads end
+  // what about when input is not divisible by _VECTOR_LOAD_LENGTH? --> case for not full tile to be added
+
   bool found = false;
   for (int i = 0; i < elements_per_thread; ++i)
   {
-    auto index = tile_offset + threadIdx.x + i * blockDim.x;
+    int index = i % WORDS + (i / WORDS) * block_threads * WORDS + threadIdx.x * WORDS + tile_offset;
+    // i % WORDS =                           + 0 1 2 3, 0 1 2 3, 0 1 2 3, ... (static)
+    // (i / WORDS) * block_threads * WORDS = + 0      , 64     , 128, ...     (static)
+    // threadIdx.x * WORDS =                 + 0, 4, 8, ... offset of the thread within working tile
+    // tile_offset =                         + just start at the beginning of the block
 
     if (index < num_items)
     {
-      if (pred(*(begin + index)))
+      if (pred(input_items[i]))
      {
        found = true;
        atomicMin(&block_result, index);
@@ -135,7 +166,6 @@ struct DeviceFind
     NumItemsT num_items,
     cudaStream_t stream = 0)
   {
-    int block_threads = 128;
     // int items_per_thread = 2;
     int tile_size = block_threads * elements_per_thread;
     int num_tiles = static_cast<int>(cub::DivideAndRoundUp(num_items, tile_size));

From b8b86dd08c8ec66f69ae6d56d1538bc3c891a234 Mon Sep 17 00:00:00 2001
From: gonidelis
Date: Fri, 20 Sep 2024 17:13:25 -0700
Subject: [PATCH 08/13] Separate benchmarks into different files

---
 cub/benchmarks/bench/find_if/base.cu      | 87 +++++----------------
 cub/cub/device/device_find_if.cuh         |  5 ++
 thrust/benchmarks/bench/count_if/basic.cu | 71 ++++++++++++++++++
 thrust/benchmarks/bench/find_if/basic.cu  | 71 ++++++++++++++++++
 4 files changed, 173 insertions(+), 61 deletions(-)
 create mode 100644 thrust/benchmarks/bench/count_if/basic.cu
 create mode 100644 thrust/benchmarks/bench/find_if/basic.cu

diff --git a/cub/benchmarks/bench/find_if/base.cu b/cub/benchmarks/bench/find_if/base.cu
index 25312b158aa..8183a04a145 100644
--- a/cub/benchmarks/bench/find_if/base.cu
+++ b/cub/benchmarks/bench/find_if/base.cu
@@ -33,24 +33,30 @@
 #include

 template <typename T>
-struct equals_100
+struct equals
 {
-  __device__ bool operator()(T i)
+  T val;
+  equals(T _val)
+      : val(_val)
+  {}
+
+  __device__ __host__ bool operator()(T i)
   {
-    return i == 1;
-  } // @amd you 'll never find out the secret sauce
+    return i == val;
+  }
 };

 template <typename T>
 void find_if(nvbench::state& state, nvbench::type_list<T>)
 {
+  T val = 1;
   // set up input
-  const auto elements      = state.get_int64("Elements");
-  const auto common_prefix = state.get_float64("CommonPrefixRatio");
-  const auto same_elements = elements * common_prefix;
+  const auto elements       = state.get_int64("Elements");
+  const auto common_prefix  = state.get_float64("MismatchAt");
+  const auto mismatch_point = elements * common_prefix;

   thrust::device_vector<T> dinput(elements, 0);
-  thrust::fill(dinput.begin() + same_elements, dinput.end(), 1);
+  thrust::fill(dinput.begin() + mismatch_point, dinput.end(), val);
   thrust::device_vector<int> d_result(1);
   ///

   void* d_temp_storage = nullptr;
   size_t temp_storage_bytes{};

   cub::DeviceFind::FindIf(
-    d_temp_storage, temp_storage_bytes, dinput.begin(), d_result.begin(), equals_100<T>{}, dinput.size(), 0);
+    d_temp_storage,
+    temp_storage_bytes,
+    thrust::raw_pointer_cast(dinput.data()),
+    thrust::raw_pointer_cast(d_result.data()),
+    equals<T>{val},
+    dinput.size(),
+    0);

   thrust::device_vector<char> temp_storage(temp_storage_bytes);
   d_temp_storage = thrust::raw_pointer_cast(temp_storage.data());

@@ -67,61 +79,14 @@ void find_if(nvbench::state& state,
nvbench::type_list) cub::DeviceFind::FindIf( d_temp_storage, temp_storage_bytes, - dinput.begin(), - d_result.begin(), - equals_100{}, + thrust::raw_pointer_cast(dinput.data()), + thrust::raw_pointer_cast(d_result.data()), + equals{val}, dinput.size(), launch.get_stream()); }); } -NVBENCH_BENCH_TYPES(find_if, NVBENCH_TYPE_AXES(nvbench::type_list /*integral_types*/)) - .add_int64_power_of_two_axis("Elements", nvbench::range(16, 28, 4)) - .add_float64_axis("CommonPrefixRatio", std::vector{1.0, 0.5, 0.0}); - -////////////////////////////////////////////////////// -template -void thrust_find_if(nvbench::state& state, nvbench::type_list) -{ - // set up input - const auto elements = state.get_int64("Elements"); - const auto common_prefix = state.get_float64("CommonPrefixRatio"); - const auto same_elements = elements * common_prefix; - - thrust::device_vector dinput(elements, 0); - thrust::fill(dinput.begin() + same_elements, dinput.end(), 1); - /// - - caching_allocator_t alloc; - thrust::find_if(policy(alloc), dinput.begin(), dinput.end(), equals_100{}); - - state.exec(nvbench::exec_tag::no_batch | nvbench::exec_tag::sync, [&](nvbench::launch& launch) { - thrust::find_if(policy(alloc, launch), dinput.begin(), dinput.end(), equals_100{}); - }); -} -NVBENCH_BENCH_TYPES(thrust_find_if, NVBENCH_TYPE_AXES(nvbench::type_list /*integral_types*/)) - .add_int64_power_of_two_axis("Elements", nvbench::range(16, 28, 4)) - .add_float64_axis("CommonPrefixRatio", std::vector{1.0, 0.5, 0.0}); - -////////////////////////////////////////////////////// -template -void thrust_count_if(nvbench::state& state, nvbench::type_list) -{ - // set up input - const auto elements = state.get_int64("Elements"); - const auto common_prefix = state.get_float64("CommonPrefixRatio"); - const auto same_elements = elements * common_prefix; - thrust::device_vector dinput(elements, 0); - thrust::fill(dinput.begin() + same_elements, dinput.end(), 1); - /// - - caching_allocator_t alloc; - thrust::count_if(policy(alloc), dinput.begin(), dinput.end(), equals_100{}); - - state.exec(nvbench::exec_tag::no_batch | nvbench::exec_tag::sync, [&](nvbench::launch& launch) { - thrust::count_if(policy(alloc, launch), dinput.begin(), dinput.end(), equals_100{}); - }); -} -NVBENCH_BENCH_TYPES(thrust_count_if, NVBENCH_TYPE_AXES(nvbench::type_list /*integral_types*/)) +NVBENCH_BENCH_TYPES(find_if, NVBENCH_TYPE_AXES(fundamental_types)) .add_int64_power_of_two_axis("Elements", nvbench::range(16, 28, 4)) - .add_float64_axis("CommonPrefixRatio", std::vector{1.0, 0.5, 0.0}); + .add_float64_axis("MismatchAt", std::vector{1.0, 0.5, 0.0}); diff --git a/cub/cub/device/device_find_if.cuh b/cub/cub/device/device_find_if.cuh index 365e03cd593..0adf8aeedca 100644 --- a/cub/cub/device/device_find_if.cuh +++ b/cub/cub/device/device_find_if.cuh @@ -51,6 +51,8 @@ #include #include +#include + static constexpr int elements_per_thread = 16; static constexpr int _VECTOR_LOAD_LENGTH = 4; static constexpr int block_threads = 128; @@ -129,6 +131,7 @@ __global__ void find_if(IterBegin begin, IterEnd end, Pred pred, int* result, st } } } + if (syncthreads_or(found)) { if (threadIdx.x == 0) @@ -166,6 +169,8 @@ struct DeviceFind NumItemsT num_items, cudaStream_t stream = 0) { + static_assert(elements_per_thread % _VECTOR_LOAD_LENGTH == 0, "No full tile at the end"); + // int items_per_thread = 2; int tile_size = block_threads * elements_per_thread; int num_tiles = static_cast(cub::DivideAndRoundUp(num_items, tile_size)); diff --git a/thrust/benchmarks/bench/count_if/basic.cu 
b/thrust/benchmarks/bench/count_if/basic.cu new file mode 100644 index 00000000000..b672d789868 --- /dev/null +++ b/thrust/benchmarks/bench/count_if/basic.cu @@ -0,0 +1,71 @@ +/****************************************************************************** + * Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * * Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * * Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * * Neither the name of the NVIDIA CORPORATION nor the + * names of its contributors may be used to endorse or promote products + * derived from this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED + * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE + * DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY + * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES + * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; + * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND + * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS + * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + * + ******************************************************************************/ + +#include +#include + +#include "nvbench_helper.cuh" + +template +struct equals +{ + T val; + equals(T _val) + : val(_val) + {} + + __device__ __host__ bool operator()(T i) + { + return i == val; + } +}; + +template +void count_if(nvbench::state& state, nvbench::type_list) +{ + T val = 1; + // set up input + const auto elements = static_cast(state.get_int64("Elements")); + const auto common_prefix = state.get_float64("MismatchAt"); + const auto mismatch_point = elements * common_prefix; + + thrust::device_vector dinput(elements, 0); + thrust::fill(dinput.begin() + mismatch_point, dinput.end(), val); + /// + + caching_allocator_t alloc; + thrust::count_if(policy(alloc), dinput.begin(), dinput.end(), equals{val}); + + state.exec(nvbench::exec_tag::no_batch | nvbench::exec_tag::sync, [&](nvbench::launch& launch) { + thrust::count_if(policy(alloc, launch), dinput.begin(), dinput.end(), equals{val}); + }); +} + +NVBENCH_BENCH_TYPES(count_if, NVBENCH_TYPE_AXES(fundamental_types)) + .set_name("thrust::count_if") + .add_int64_power_of_two_axis("Elements", nvbench::range(16, 28, 4)) + .add_float64_axis("MismatchAt", std::vector{1.0, 0.5, 0.0}); diff --git a/thrust/benchmarks/bench/find_if/basic.cu b/thrust/benchmarks/bench/find_if/basic.cu new file mode 100644 index 00000000000..8a2618e26d5 --- /dev/null +++ b/thrust/benchmarks/bench/find_if/basic.cu @@ -0,0 +1,71 @@ +/****************************************************************************** + * Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved. 
+ * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * * Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * * Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * * Neither the name of the NVIDIA CORPORATION nor the + * names of its contributors may be used to endorse or promote products + * derived from this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED + * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE + * DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY + * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES + * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; + * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND + * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS + * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + * + ******************************************************************************/ + +#include +#include + +#include "nvbench_helper.cuh" + +template +struct equals +{ + T val; + equals(T _val) + : val(_val) + {} + + __device__ __host__ bool operator()(T i) + { + return i == val; + } +}; + +template +void find_if(nvbench::state& state, nvbench::type_list) +{ + T val = 1; + // set up input + const auto elements = static_cast(state.get_int64("Elements")); + const auto common_prefix = state.get_float64("MismatchAt"); + const auto mismatch_point = elements * common_prefix; + + thrust::device_vector dinput(elements, 0); + thrust::fill(dinput.begin() + mismatch_point, dinput.end(), val); + /// + + caching_allocator_t alloc; + thrust::find_if(policy(alloc), dinput.begin(), dinput.end(), equals{val}); + + state.exec(nvbench::exec_tag::no_batch | nvbench::exec_tag::sync, [&](nvbench::launch& launch) { + thrust::find_if(policy(alloc, launch), dinput.begin(), dinput.end(), equals{val}); + }); +} + +NVBENCH_BENCH_TYPES(find_if, NVBENCH_TYPE_AXES(fundamental_types)) + .set_name("thrust::find_if") + .add_int64_power_of_two_axis("Elements", nvbench::range(16, 28, 4)) + .add_float64_axis("MismatchAt", std::vector{1.0, 0.5, 0.0}); From f93ff9b21a1a5aec28c4c889537261a1847cae0d Mon Sep 17 00:00:00 2001 From: gonidelis Date: Tue, 1 Oct 2024 13:09:24 -0700 Subject: [PATCH 09/13] Add split implementation that works with both vectorized and non vectorized loads --- cub/cub/device/device_find_if.cuh | 199 +++++++++++++++++++++--------- 1 file changed, 144 insertions(+), 55 deletions(-) diff --git a/cub/cub/device/device_find_if.cuh b/cub/cub/device/device_find_if.cuh index 0adf8aeedca..706201a274a 100644 --- a/cub/cub/device/device_find_if.cuh +++ b/cub/cub/device/device_find_if.cuh @@ -34,6 +34,9 @@ #include #include "cub/util_type.cuh" +#include "cuda/std/__cccl/dialect.h" +#include "cuda/std/__memory/pointer_traits.h" +#include "cuda/std/__utility/declval.h" #include "device_launch_parameters.h" #if defined(_CCCL_IMPLICIT_SYSTEM_HEADER_GCC) @@ 
-59,89 +62,178 @@ static constexpr int block_threads = 128; CUB_NAMESPACE_BEGIN -template -__global__ void find_if(IterBegin begin, IterEnd end, Pred pred, int* result, std::size_t num_items) +template +static _CCCL_DEVICE _CCCL_FORCEINLINE bool IsAlignedAndFullTile( + Iterator d_in, int tile_offset, int tile_size, std::size_t num_items, Int2Type /*CAN_VECTORIZE*/) +{ + /// Create an AgentFindIf and extract these two as type member in the encapsulating struct + using InputT = cub::detail::value_t; + using VectorT = typename CubVector::Type; + /// + bool full_tile = (tile_offset + tile_size) <= num_items; + bool is_aligned = (size_t(d_in) & (sizeof(VectorT) - 1)) == 0; + return full_tile && is_aligned; +} + +template +static _CCCL_DEVICE _CCCL_FORCEINLINE bool IsAlignedAndFullTile( + Iterator /*d_in*/, + int /*tile_offset*/, + int /*tile_size*/, + std::size_t /*num_items*/, + Int2Type /*CAN_VECTORIZE*/) { + return false; +} + +template +__device__ void ConsumeRange( + IterBegin begin, int tile_offset, Pred pred, int* result, std::size_t num_items, Int2Type /*CAN_VECTORIZE*/) +{ + __shared__ int block_result; + + if (threadIdx.x == 0) + { + block_result = num_items; + } + using InputT = cub::detail::value_t; using VectorT = typename CubVector::Type; + enum { WORDS = elements_per_thread / _VECTOR_LOAD_LENGTH }; + //// vectorized loads begin + InputT* d_in_unqualified = const_cast(begin) + tile_offset + (threadIdx.x * _VECTOR_LOAD_LENGTH); - // int elements_per_thread = 32; - auto tile_size = blockDim.x * elements_per_thread; - __shared__ int sresult; - __shared__ int block_result; + cub::CacheModifiedInputIterator d_vec_in( + reinterpret_cast(d_in_unqualified)); - if (threadIdx.x == 0) + InputT input_items[elements_per_thread]; + VectorT* vec_items = reinterpret_cast(input_items); + +#pragma unroll + for (int i = 0; i < WORDS; ++i) { - block_result = num_items; + vec_items[i] = d_vec_in[block_threads * i]; } + //// vectorized loads end - for (int tile_offset = blockIdx.x * tile_size; tile_offset < num_items; tile_offset += tile_size * gridDim.x) + bool found = false; + for (int i = 0; i < elements_per_thread; ++i) { - // Only one thread reads atomically and propagates it to the - // the rest threads of the block through shared memory - if (threadIdx.x == 0) + int index = i % WORDS + (i / WORDS) * block_threads * WORDS + threadIdx.x * WORDS + tile_offset; + // i % WORDS = + 0 1 2 3, 0 1 2 3, 0 1 2 3, ... (static) + // (i / WORDS) * block_threads * WORDS = + 0 , 64 , 128, ... (static) + // threadIdx.x * WORDS = + 0, 4, 8, ... offset of the thread within working tile + // tile_offset = + just start at the beginning of the block + + if (index < num_items) { - sresult = atomicAdd(result, 0); + if (pred(input_items[i])) + { + found = true; + atomicMin(&block_result, index); + break; // every thread goes over multiple elements per thread + // for every tile. If a thread finds a local minimum it doesn't + // need to proceed further (inner early exit). 
+      }
     }
-    __syncthreads();
+  }

-    // early exit
-    if (sresult < tile_offset)
+  if (syncthreads_or(found))
+  {
+    if (threadIdx.x == 0)
     {
-      return;
+      if (block_result < num_items)
+      {
+        atomicMin(result, block_result);
+      }
     }
+  }
+}

-    //// vectorized loads begin
-    InputT* d_in_unqualified = const_cast(begin) + tile_offset + (threadIdx.x * _VECTOR_LOAD_LENGTH);
+template <typename IterBegin, typename Pred>
+__device__ void ConsumeRange(
+  IterBegin begin, int tile_offset, Pred pred, int* result, std::size_t num_items, Int2Type<false> /*CAN_VECTORIZE*/)
+{
+  __shared__ int block_result;

-    cub::CacheModifiedInputIterator d_vec_in(
-      reinterpret_cast(d_in_unqualified));
+  if (threadIdx.x == 0)
+  {
+    block_result = num_items;
+  }

-    InputT input_items[elements_per_thread];
-    VectorT* vec_items = reinterpret_cast(input_items);
+  bool found = false;
+  for (int i = 0; i < elements_per_thread; ++i)
+  {
+    auto index = tile_offset + threadIdx.x + i * blockDim.x;

-#pragma unroll
-    for (int i = 0; i < WORDS; ++i)
+    if (index < num_items)
     {
-      vec_items[i] = d_vec_in[block_threads * i];
+      if (pred(*(begin + index)))
+      {
+        found = true;
+        atomicMin(&block_result, index);
+        break;
+      }
     }
-    //// vectorized loads end
-    // what about when input is not divisible by _VECTOR_LOAD_LENGTH? --> case for not full tile to be added
-
-    bool found = false;
-    for (int i = 0; i < elements_per_thread; ++i)
+  }
+  if (syncthreads_or(found))
+  {
+    if (threadIdx.x == 0)
     {
-      int index = i % WORDS + (i / WORDS) * block_threads * WORDS + threadIdx.x * WORDS + tile_offset;
-      // i % WORDS = + 0 1 2 3, 0 1 2 3, 0 1 2 3, ... (static)
-      // (i / WORDS) * block_threads * WORDS = + 0 , 64 , 128, ... (static)
-      // threadIdx.x * WORDS = + 0, 4, 8, ... offset of the thread within working tile
-      // tile_offset = + just start at the beginning of the block
-
-      if (index < num_items)
+      if (block_result < num_items)
       {
-        if (pred(input_items[i]))
-        {
-          found = true;
-          atomicMin(&block_result, index);
-          break;
-        }
+        atomicMin(result, block_result);
       }
     }
+  }
+}
+
+template <typename IterBegin, typename IterEnd, typename Pred>
+__global__ void find_if(IterBegin begin, IterEnd end, Pred pred, int* result, std::size_t num_items)
+{
+  using InputT = cub::detail::value_t<IterBegin>;
+
+  // 1. _VECTOR_LOAD_LENGTH > 1: the number of items per vectorized load should have been determined to be more than 1.
+  //    Now it's hardcoded, but it will be determined at compile time according to the GPU architecture (Policy) later
+  //    on.
+  // 2. elements_per_thread % _VECTOR_LOAD_LENGTH == 0: elements_per_thread is also defined at compile time after tuning
+  //    for specific architectures. There is no point in doing vectorization if a thread cannot vector-load all the
+  //    elements it is going to be working on in a single tile.
+  // 3. ::cuda::std::is_pointer<IterBegin>::value: the iterator is contiguous. If memory is not contiguous, loading
+  //    vectorized memory makes no sense. Defined at compile time.
+  // 4. Traits<InputT>::PRIMITIVE: InputT cannot be an arbitrary type that could be laid out
+  //    in memory in any way. It needs to be known. Compile time.
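+  //
+  // For instance, an int* input with elements_per_thread == 16 and
+  // _VECTOR_LOAD_LENGTH == 4 satisfies all four conditions, while a
+  // thrust::counting_iterator never does (it is not a raw pointer), so the
+  // kernel falls back to the non-vectorized ConsumeRange overload.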
+ + static constexpr bool ATTEMPT_VECTORIZATION = + (_VECTOR_LOAD_LENGTH > 1) && (elements_per_thread % _VECTOR_LOAD_LENGTH == 0) + && (::cuda::std::is_pointer::value) && Traits::PRIMITIVE; + + auto tile_size = blockDim.x * elements_per_thread; + __shared__ int sresult; + + for (int tile_offset = blockIdx.x * tile_size; tile_offset < num_items; tile_offset += tile_size * gridDim.x) + { + // Only one thread reads atomically and propagates it to the + // the rest threads of the block through shared memory + if (threadIdx.x == 0) + { + sresult = atomicAdd(result, 0); + } + __syncthreads(); - if (syncthreads_or(found)) + // early exit + if (sresult < tile_offset) { - if (threadIdx.x == 0) - { - if (block_result < num_items) - { - atomicMin(result, block_result); - } - } + return; } + + IsAlignedAndFullTile(begin, tile_offset, tile_size, num_items, Int2Type()) + ? ConsumeRange(begin, tile_offset, pred, result, num_items, Int2Type < true && ATTEMPT_VECTORIZATION > ()) + : ConsumeRange(begin, tile_offset, pred, result, num_items, Int2Type < false && ATTEMPT_VECTORIZATION > ()); } } @@ -169,9 +261,6 @@ struct DeviceFind NumItemsT num_items, cudaStream_t stream = 0) { - static_assert(elements_per_thread % _VECTOR_LOAD_LENGTH == 0, "No full tile at the end"); - - // int items_per_thread = 2; int tile_size = block_threads * elements_per_thread; int num_tiles = static_cast(cub::DivideAndRoundUp(num_items, tile_size)); From 50dc8f500eded88b1db0cd71cbc2e7908043b492 Mon Sep 17 00:00:00 2001 From: gonidelis Date: Tue, 1 Oct 2024 13:09:41 -0700 Subject: [PATCH 10/13] Add elaborate unit testing --- cub/test/catch2_test_device_find_if.cu | 129 +++++++++++++++++++++++-- 1 file changed, 121 insertions(+), 8 deletions(-) diff --git a/cub/test/catch2_test_device_find_if.cu b/cub/test/catch2_test_device_find_if.cu index b92030c9eaa..2f7a3175049 100644 --- a/cub/test/catch2_test_device_find_if.cu +++ b/cub/test/catch2_test_device_find_if.cu @@ -36,10 +36,14 @@ #include #include +#include + #include "c2h/custom_type.cuh" #include "catch2_test_device_reduce.cuh" #include "catch2_test_helper.h" #include "catch2_test_launch_helper.h" +#include "thrust/detail/raw_pointer_cast.h" +#include #include // %PARAM% TEST_LAUNCH lid 0:1 @@ -72,15 +76,20 @@ void compute_find_if_reference(InputIt first, InputIt last, OutputIt& result, Bi } template -struct equals_2 +struct equals { + T val; + equals(T _val) + : val(_val) + {} + __device__ __host__ bool operator()(T i) { - return i == 2; + return i == val; } }; -CUB_TEST("Device find if works", "[device]", full_type_list) +CUB_TEST("Device find_if works", "[device]", full_type_list) { using params = params_t; using input_t = typename params::item_t; @@ -115,14 +124,15 @@ CUB_TEST("Device find if works", "[device]", full_type_list) } auto d_in_it = thrust::raw_pointer_cast(in_items.data()); - SECTION("find if") + SECTION("Generic find if case") { - using op_t = equals_2; + using op_t = equals; + input_t val_to_find{2}; // Prepare verification data c2h::host_vector host_items(in_items); c2h::host_vector expected_result(1); - compute_find_if_reference(host_items.begin(), host_items.end(), expected_result[0], op_t{}); + compute_find_if_reference(host_items.begin(), host_items.end(), expected_result[0], op_t{val_to_find}); void* d_temp_storage = nullptr; size_t temp_storage_bytes{}; @@ -132,13 +142,116 @@ CUB_TEST("Device find if works", "[device]", full_type_list) auto d_out_it = thrust::raw_pointer_cast(out_result.data()); cub::DeviceFind::FindIf( - d_temp_storage, 
temp_storage_bytes, unwrap_it(d_in_it), unwrap_it(d_out_it), op_t{}, num_items); + d_temp_storage, temp_storage_bytes, unwrap_it(d_in_it), unwrap_it(d_out_it), op_t{val_to_find}, num_items); + + thrust::device_vector temp_storage(temp_storage_bytes); + d_temp_storage = thrust::raw_pointer_cast(temp_storage.data()); + + cub::DeviceFind::FindIf( + d_temp_storage, temp_storage_bytes, unwrap_it(d_in_it), unwrap_it(d_out_it), op_t{val_to_find}, num_items); + + // Verify result + REQUIRE(expected_result == out_result); + } + + SECTION("find_if works with non raw pointers - .begin() iterator") + { + using op_t = equals; + input_t val_to_find{2}; + + // Prepare verification data + c2h::host_vector host_items(in_items); + c2h::host_vector expected_result(1); + compute_find_if_reference(host_items.begin(), host_items.end(), expected_result[0], op_t{val_to_find}); + + void* d_temp_storage = nullptr; + size_t temp_storage_bytes{}; + + // Run test + c2h::device_vector out_result(1); + + cub::DeviceFind::FindIf( + d_temp_storage, temp_storage_bytes, in_items.begin(), out_result.begin(), op_t{val_to_find}, num_items); thrust::device_vector temp_storage(temp_storage_bytes); d_temp_storage = thrust::raw_pointer_cast(temp_storage.data()); cub::DeviceFind::FindIf( - d_temp_storage, temp_storage_bytes, unwrap_it(d_in_it), unwrap_it(d_out_it), op_t{}, num_items); + d_temp_storage, temp_storage_bytes, in_items.begin(), out_result.begin(), op_t{val_to_find}, num_items); + + // Verify result + REQUIRE(expected_result == out_result); + } + + SECTION("find_if works for unaligned input") + { + for (int offset = 1; offset < 4; ++offset) + { + if (num_items - offset > 0) + { + using op_t = equals; + input_t val_to_find{2}; + + // Prepare verification data + c2h::host_vector host_items(in_items); + c2h::host_vector expected_result(1); + compute_find_if_reference(host_items.begin() + offset, host_items.end(), expected_result[0], op_t{val_to_find}); + + void* d_temp_storage = nullptr; + size_t temp_storage_bytes{}; + + // Run test + c2h::device_vector out_result(1); + auto d_out_it = thrust::raw_pointer_cast(out_result.data()); + + cub::DeviceFind::FindIf( + d_temp_storage, + temp_storage_bytes, + unwrap_it(d_in_it + offset), + unwrap_it(d_out_it), + op_t{val_to_find}, + num_items - offset); + + thrust::device_vector temp_storage(temp_storage_bytes); + d_temp_storage = thrust::raw_pointer_cast(temp_storage.data()); + + cub::DeviceFind::FindIf( + d_temp_storage, + temp_storage_bytes, + unwrap_it(d_in_it + offset), + unwrap_it(d_out_it), + op_t{val_to_find}, + num_items - offset); + + // Verify result + REQUIRE(expected_result == out_result); + } + } + } + + SECTION("find_if works with non primitive iterator") + { + using op_t = equals; + input_t val_to_find{2}; + + // Prepare verification data + auto it = thrust::make_counting_iterator(0); // non-primitive iterator + c2h::host_vector expected_result(1); + compute_find_if_reference(it, it + num_items, expected_result[0], op_t{val_to_find}); + + void* d_temp_storage = nullptr; + size_t temp_storage_bytes{}; + + // Run test + c2h::device_vector out_result(1); + auto d_out_it = thrust::raw_pointer_cast(out_result.data()); + + cub::DeviceFind::FindIf(d_temp_storage, temp_storage_bytes, it, unwrap_it(d_out_it), op_t{val_to_find}, num_items); + + thrust::device_vector temp_storage(temp_storage_bytes); + d_temp_storage = thrust::raw_pointer_cast(temp_storage.data()); + + cub::DeviceFind::FindIf(d_temp_storage, temp_storage_bytes, it, unwrap_it(d_out_it), op_t{val_to_find}, 
num_items); // Verify result REQUIRE(expected_result == out_result); From c508b994008c60e5827154fc98326f2403c2b986 Mon Sep 17 00:00:00 2001 From: gonidelis Date: Tue, 29 Oct 2024 13:43:03 -0700 Subject: [PATCH 11/13] Add Dispatch and Agent layers --- cub/benchmarks/bench/find_if/base.cu | 3 - cub/cub/agent/agent_find.cuh | 309 ++++++++++++++++++++ cub/cub/device/device_find_if.cuh | 261 +---------------- cub/cub/device/dispatch/dispatch_find.cuh | 331 ++++++++++++++++++++++ cub/test/catch2_test_device_find_if.cu | 38 +-- 5 files changed, 661 insertions(+), 281 deletions(-) create mode 100644 cub/cub/agent/agent_find.cuh create mode 100644 cub/cub/device/dispatch/dispatch_find.cuh diff --git a/cub/benchmarks/bench/find_if/base.cu b/cub/benchmarks/bench/find_if/base.cu index 8183a04a145..6c51a754b56 100644 --- a/cub/benchmarks/bench/find_if/base.cu +++ b/cub/benchmarks/bench/find_if/base.cu @@ -36,9 +36,6 @@ template struct equals { T val; - equals(T _val) - : val(_val) - {} __device__ __host__ bool operator()(T i) { diff --git a/cub/cub/agent/agent_find.cuh b/cub/cub/agent/agent_find.cuh new file mode 100644 index 00000000000..80b35ea93a0 --- /dev/null +++ b/cub/cub/agent/agent_find.cuh @@ -0,0 +1,309 @@ +/****************************************************************************** + * Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * * Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * * Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * * Neither the name of the NVIDIA CORPORATION nor the + * names of its contributors may be used to endorse or promote products + * derived from this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY + * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES + * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; + * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND + * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS + * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + * + ******************************************************************************/ + +/** + * @file cub::AgentFind implements a stateful abstraction of CUDA thread + * blocks for participating in device-wide search. 
+ */
+
+#pragma once
+#include
+
+#include
+#include
+#include
+
+CUB_NAMESPACE_BEGIN
+
+/******************************************************************************
+ * Tuning policy types
+ ******************************************************************************/
+
+/**
+ * Parameterizable tuning policy type for AgentFind
+ * @tparam NOMINAL_BLOCK_THREADS_4B Threads per thread block
+ * @tparam NOMINAL_ITEMS_PER_THREAD_4B Items per thread (per tile of input)
+ * @tparam _VECTOR_LOAD_LENGTH Number of items per vectorized load
+ * @tparam _LOAD_MODIFIER Cache load modifier for reading input elements
+ */
+template >
+struct AgentFindPolicy : ScalingType
+{
+  /// Number of items per vectorized load
+  static constexpr int VECTOR_LOAD_LENGTH = _VECTOR_LOAD_LENGTH;
+
+  /// Cache load modifier for reading input elements
+  static constexpr CacheLoadModifier LOAD_MODIFIER = _LOAD_MODIFIER;
+};
+
+template  // @giannis OutputIteratorT not needed
+struct AgentFind
+{
+  //---------------------------------------------------------------------
+  // Types and constants
+  //---------------------------------------------------------------------
+
+  /// The input value type
+  using InputT = cub::detail::value_t;
+
+  /// Vector type of InputT for data movement
+  using VectorT = typename CubVector::Type;
+
+  /// Input iterator wrapper type (for applying cache modifier)
+  // Wrap the native input pointer with CacheModifiedInputIterator
+  // or directly use the supplied input iterator type
+  using WrappedInputIteratorT =
+    ::cuda::std::_If<::cuda::std::is_pointer::value,
+                     CacheModifiedInputIterator,
+                     InputIteratorT>;
+
+  /// Constants
+  static constexpr int BLOCK_THREADS      = AgentFindPolicy::BLOCK_THREADS;
+  static constexpr int ITEMS_PER_THREAD   = AgentFindPolicy::ITEMS_PER_THREAD;
+  static constexpr int TILE_ITEMS         = BLOCK_THREADS * ITEMS_PER_THREAD;
+  static constexpr int VECTOR_LOAD_LENGTH = CUB_MIN(ITEMS_PER_THREAD, AgentFindPolicy::VECTOR_LOAD_LENGTH);
+
+  // Can vectorize according to the policy if the input iterator is a native
+  // pointer to a primitive type
+  static constexpr bool ATTEMPT_VECTORIZATION =
+    (VECTOR_LOAD_LENGTH > 1) && (ITEMS_PER_THREAD % VECTOR_LOAD_LENGTH == 0)
+    && (::cuda::std::is_pointer::value) && Traits::PRIMITIVE;
+
+  static constexpr CacheLoadModifier LOAD_MODIFIER = AgentFindPolicy::LOAD_MODIFIER;
+
+  /// Shared memory type required by this thread block
+  using _TempStorage = OffsetT;
+
+  /// Alias wrapper allowing storage to be unioned
+  struct TempStorage : Uninitialized<_TempStorage>
+  {};
+
+  //---------------------------------------------------------------------
+  // Per-thread fields
+  //---------------------------------------------------------------------
+
+  _TempStorage& sresult; ///< Reference to temp_storage
+  InputIteratorT d_in; ///< Input data to find
+  // OutputIteratorT d_out;
+  // OffsetT num_items;
+  // OffsetT* value_temp_storage;
+  // WrappedInputIteratorT d_wrapped_in; ///< Wrapped input data to find
+  ScanOpT scan_op; ///< Unary search functor
+
+  //---------------------------------------------------------------------
+  // Utility
+  //---------------------------------------------------------------------
+
+  template
+  static _CCCL_DEVICE _CCCL_FORCEINLINE bool
+  IsAlignedAndFullTile(T* d_in, int tile_offset, int tile_size, OffsetT num_items, Int2Type /*CAN_VECTORIZE*/)
+  {
+    /// Create an AgentFindIf and extract these two as type member in the encapsulating struct
+    using InputT  = T;
+    using VectorT = typename CubVector::Type;
+    ///
+    const bool full_tile  = (tile_offset + tile_size) <= num_items;
+    const bool is_aligned = reinterpret_cast<::cuda::std::uintptr_t>(d_in) % uintptr_t{sizeof(VectorT)} == 0;
+    return full_tile && is_aligned;
+  }
+
+  template
+  static _CCCL_DEVICE _CCCL_FORCEINLINE bool IsAlignedAndFullTile(
+    Iterator /*d_in*/,
+    int /*tile_offset*/,
+    int /*tile_size*/,
+    std::size_t /*num_items*/,
+    Int2Type /*CAN_VECTORIZE*/)
+  {
+    return false;
+  }
+
+  //---------------------------------------------------------------------
+  // Constructor
+  //---------------------------------------------------------------------
+
+  /**
+   * @brief Constructor
+   * @param sresult Reference to temp_storage
+   * @param d_in Input data to search
+   * @param scan_op Unary search functor
+   */
+  _CCCL_DEVICE _CCCL_FORCEINLINE AgentFind(TempStorage& sresult, InputIteratorT d_in, ScanOpT scan_op)
+      : sresult(sresult.Alias())
+      , d_in(d_in)
+      , scan_op(scan_op)
+  {}
+
+  //---------------------------------------------------------------------
+  // Tile consumption
+  //---------------------------------------------------------------------
+
+  template
+  __device__ void
+  ConsumeTile(int tile_offset, Pred pred, OffsetT* result, OffsetT num_items, Int2Type /*CAN_VECTORIZE*/)
+  {
+    using InputT  = cub::detail::value_t;
+    using VectorT = typename CubVector::Type;
+
+    __shared__ OffsetT block_result;
+
+    if (threadIdx.x == 0)
+    {
+      block_result = num_items;
+    }
+
+    __syncthreads();
+
+    enum
+    {
+      NUMBER_OF_VECTORS = ITEMS_PER_THREAD / VECTOR_LOAD_LENGTH
+    };
+    //// vectorized loads begin
+    InputT* d_in_unqualified = const_cast(d_in) + tile_offset + (threadIdx.x * VECTOR_LOAD_LENGTH);
+
+    cub::CacheModifiedInputIterator d_vec_in(
+      reinterpret_cast(d_in_unqualified));
+
+    InputT input_items[ITEMS_PER_THREAD];
+    VectorT* vec_items = reinterpret_cast(input_items);
+
+#pragma unroll
+    for (int i = 0; i < NUMBER_OF_VECTORS; ++i)
+    {
+      vec_items[i] = d_vec_in[BLOCK_THREADS * i];
+    }
+    //// vectorized loads end
+
+    bool found = false;
+    for (int i = 0; i < ITEMS_PER_THREAD; ++i)
+    {
+      OffsetT nth_vector_of_thread = i / VECTOR_LOAD_LENGTH;
+      OffsetT element_in_word      = i % VECTOR_LOAD_LENGTH;
+      OffsetT vector_of_tile       = nth_vector_of_thread * BLOCK_THREADS + threadIdx.x;
+
+      OffsetT index = tile_offset + vector_of_tile * VECTOR_LOAD_LENGTH + element_in_word;
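+      // Index mapping for the strip-mined vector loads above (mirroring the
+      // worked example in the pre-dispatch implementation this replaces):
+      //   element_in_word      -> element within the thread's current vector (0 1 2 3, ...)
+      //   nth_vector_of_thread -> which round of block-wide vector loads it came from
+      //   vector_of_tile       -> position of that vector within the tile,
+      // so each thread re-visits exactly the elements it loaded into input_items.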
+
+      if (index < num_items)
+      {
+        if (pred(input_items[i]))
+        {
+          found = true;
+          atomicMin(&block_result, index);
+          break; // Each thread covers multiple elements per tile; once it
+                 // finds a local match it doesn't need to proceed further
+                 // (inner early exit).
+        }
+      }
+    }
+
+    if (syncthreads_or(found))
+    {
+      if (threadIdx.x == 0)
+      {
+        if (block_result < num_items)
+        {
+          atomicMin(result, block_result);
+        }
+      }
+    }
+  }
+
+  template
+  __device__ void
+  ConsumeTile(int tile_offset, Pred pred, OffsetT* result, OffsetT num_items, Int2Type /*CAN_VECTORIZE*/)
+  {
+    __shared__ int block_result;
+
+    if (threadIdx.x == 0)
+    {
+      block_result = num_items;
+    }
+
+    __syncthreads();
+
+    bool found = false;
+    for (int i = 0; i < ITEMS_PER_THREAD; ++i)
+    {
+      auto index = tile_offset + threadIdx.x + i * blockDim.x;
+
+      if (index < num_items)
+      {
+        if (pred(*(d_in + index)))
+        {
+          found = true;
+          atomicMin(&block_result, index);
+          break;
+        }
+      }
+    }
+    if (syncthreads_or(found))
+    {
+      if (threadIdx.x == 0)
+      {
+        if (block_result < num_items)
+        {
+          atomicMin(result, block_result);
+        }
+      }
+    }
+  }
+
+  __device__ void Process(OffsetT* value_temp_storage, OffsetT num_items)
+  {
+    for (int tile_offset = blockIdx.x * TILE_ITEMS; tile_offset < num_items; tile_offset += TILE_ITEMS * gridDim.x)
+    {
+      // Only one thread reads atomically and propagates it to the
+      // the rest threads of the block through shared memory
+      if (threadIdx.x == 0)
+      {
+        sresult = atomicAdd(value_temp_storage, 0);
+      }
+      __syncthreads();
+
+      // early exit
+      if (sresult < tile_offset)
+      {
+        return;
+      }
+
+      IsAlignedAndFullTile(d_in, tile_offset, TILE_ITEMS, num_items, Int2Type())
+        ? ConsumeTile(tile_offset, scan_op, value_temp_storage, num_items, Int2Type())
+        : ConsumeTile(tile_offset, scan_op, value_temp_storage, num_items, Int2Type());
+    }
+  }
+};
+
+CUB_NAMESPACE_END
diff --git a/cub/cub/device/device_find_if.cuh b/cub/cub/device/device_find_if.cuh
index 706201a274a..ee30f5c4392 100644
--- a/cub/cub/device/device_find_if.cuh
+++ b/cub/cub/device/device_find_if.cuh
@@ -49,6 +49,7 @@
 #include
 #include
+#include
 #include
 #include
 #include
@@ -56,273 +57,27 @@
 #include
 
-static constexpr int elements_per_thread = 16;
-static constexpr int _VECTOR_LOAD_LENGTH = 4;
-static constexpr int block_threads       = 128;
-
 CUB_NAMESPACE_BEGIN
 
-template
-static _CCCL_DEVICE _CCCL_FORCEINLINE bool IsAlignedAndFullTile(
-  Iterator d_in, int tile_offset, int tile_size, std::size_t num_items, Int2Type /*CAN_VECTORIZE*/)
-{
-  /// Create an AgentFindIf and extract these two as type member in the encapsulating struct
-  using InputT  = cub::detail::value_t;
-  using VectorT = typename CubVector::Type;
-  ///
-  bool full_tile  = (tile_offset + tile_size) <= num_items;
-  bool is_aligned = (size_t(d_in) & (sizeof(VectorT) - 1)) == 0;
-  return full_tile && is_aligned;
-}
-
-template
-static _CCCL_DEVICE _CCCL_FORCEINLINE bool IsAlignedAndFullTile(
-  Iterator /*d_in*/,
-  int /*tile_offset*/,
-  int /*tile_size*/,
-  std::size_t /*num_items*/,
-  Int2Type /*CAN_VECTORIZE*/)
-{
-  return false;
-}
-
-template
-__device__ void ConsumeRange(
-  IterBegin begin, int tile_offset, Pred pred, int* result, std::size_t num_items, Int2Type /*CAN_VECTORIZE*/)
-{
-  __shared__ int block_result;
-
-  if (threadIdx.x == 0)
-  {
-    block_result = num_items;
-  }
-
-  using InputT  = cub::detail::value_t;
-  using VectorT = typename CubVector::Type;
-
-  enum
-  {
-    WORDS = elements_per_thread / _VECTOR_LOAD_LENGTH
-  };
-  //// vectorized loads begin
-  InputT* d_in_unqualified = const_cast(begin) + tile_offset + (threadIdx.x * _VECTOR_LOAD_LENGTH);
-
-  cub::CacheModifiedInputIterator d_vec_in(
-    reinterpret_cast(d_in_unqualified));
-
-  InputT input_items[elements_per_thread];
-  VectorT* vec_items = reinterpret_cast(input_items);
-
-#pragma unroll
-  for (int i = 0; i < WORDS; ++i)
-  {
-    vec_items[i] = d_vec_in[block_threads * i];
-  }
-  //// vectorized loads end
-
-  bool found = false;
-  for (int i = 0; i < elements_per_thread; ++i)
-  {
-    int index = i % WORDS + (i / WORDS) * block_threads * WORDS + threadIdx.x * WORDS + tile_offset;
-    // i % WORDS                           = + 0 1 2 3, 0 1 2 3, 0 1 2 3, ... (static)
-    // (i / WORDS) * block_threads * WORDS = + 0, 64, 128, ... (static)
-    // threadIdx.x * WORDS                 = + 0, 4, 8, ... offset of the thread within working tile
-    // tile_offset                         = + just start at the beginning of the block
-
-    if (index < num_items)
-    {
-      if (pred(input_items[i]))
-      {
-        found = true;
-        atomicMin(&block_result, index);
-        break; // every thread goes over multiple elements per thread
-               // for every tile. If a thread finds a local minimum it doesn't
-               // need to proceed further (inner early exit).
-      }
-    }
-  }
-
-  if (syncthreads_or(found))
-  {
-    if (threadIdx.x == 0)
-    {
-      if (block_result < num_items)
-      {
-        atomicMin(result, block_result);
-      }
-    }
-  }
-}
-
-template
-__device__ void ConsumeRange(
-  IterBegin begin, int tile_offset, Pred pred, int* result, std::size_t num_items, Int2Type /*CAN_VECTORIZE*/)
-{
-  __shared__ int block_result;
-
-  if (threadIdx.x == 0)
-  {
-    block_result = num_items;
-  }
-
-  bool found = false;
-  for (int i = 0; i < elements_per_thread; ++i)
-  {
-    auto index = tile_offset + threadIdx.x + i * blockDim.x;
-
-    if (index < num_items)
-    {
-      if (pred(*(begin + index)))
-      {
-        found = true;
-        atomicMin(&block_result, index);
-        break;
-      }
-    }
-  }
-  if (syncthreads_or(found))
-  {
-    if (threadIdx.x == 0)
-    {
-      if (block_result < num_items)
-      {
-        atomicMin(result, block_result);
-      }
-    }
-  }
-}
-
-template
-__global__ void find_if(IterBegin begin, IterEnd end, Pred pred, int* result, std::size_t num_items)
-{
-  using InputT = cub::detail::value_t;
-
-  // 1. _VECTOR_LOAD_LENGTH > 1: number of items per vectorized load should have been determined to be more than 1.
-  //    Now it's hardcoded but it will be determined at compile time according to the GPU architecture (Policy) later
-  //    on.
-  // 2. elements_per_thread % _VECTOR_LOAD_LENGTH == 0: elements_per_thread is also defined at compile time after tuning
-  //    for specific architectures. There is not point in doing vectorization if a thread cannot vectorize load all the
-  //    elmenets it is going to be working on, on a single tile.
-  // 3. ::cuda::std::is_pointer::value: is contiguous iterator. If memory is not contiguous loading
-  //    vectorized memory makes no sense. Defined at compile time.
-  // 4. Traits>::PRIMITIVE: InputT cannot be an arbitrary type, that could be layed out
-  //    in memory in any way. Needs to be known. Compile time.
-
-  static constexpr bool ATTEMPT_VECTORIZATION =
-    (_VECTOR_LOAD_LENGTH > 1) && (elements_per_thread % _VECTOR_LOAD_LENGTH == 0)
-    && (::cuda::std::is_pointer::value) && Traits::PRIMITIVE;
-
-  auto tile_size = blockDim.x * elements_per_thread;
-  __shared__ int sresult;
-
-  for (int tile_offset = blockIdx.x * tile_size; tile_offset < num_items; tile_offset += tile_size * gridDim.x)
-  {
-    // Only one thread reads atomically and propagates it to the
-    // the rest threads of the block through shared memory
-    if (threadIdx.x == 0)
-    {
-      sresult = atomicAdd(result, 0);
-    }
-    __syncthreads();
-
-    // early exit
-    if (sresult < tile_offset)
-    {
-      return;
-    }
-
-    IsAlignedAndFullTile(begin, tile_offset, tile_size, num_items, Int2Type())
-      ? ConsumeRange(begin, tile_offset, pred, result, num_items, Int2Type < true && ATTEMPT_VECTORIZATION > ())
-      : ConsumeRange(begin, tile_offset, pred, result, num_items, Int2Type < false && ATTEMPT_VECTORIZATION > ());
-  }
-}
-
-template
-__global__ void write_final_result_in_output_iterator_already(ValueType* d_temp_storage, OutputIteratorT d_out)
-{
-  *d_out = *d_temp_storage;
-}
-
-template
-__global__ void cuda_mem_set_async_dtemp_storage(ValueType* d_temp_storage, NumItemsT num_items)
-{
-  *d_temp_storage = num_items;
-}
-
 struct DeviceFind
 {
   template
-  CUB_RUNTIME_FUNCTION static void FindIf(
+  CUB_RUNTIME_FUNCTION static cudaError_t FindIf(
     void* d_temp_storage,
     size_t& temp_storage_bytes,
     InputIteratorT d_in,
     OutputIteratorT d_out,
-    ScanOpT op,
+    ScanOpT scan_op,
     NumItemsT num_items,
     cudaStream_t stream = 0)
   {
-    int tile_size = block_threads * elements_per_thread;
-    int num_tiles = static_cast(cub::DivideAndRoundUp(num_items, tile_size));
-
-    // Get device ordinal
-    int device_ordinal;
-    cudaError error = CubDebug(cudaGetDevice(&device_ordinal));
-    if (cudaSuccess != error)
-    {
-      return;
-    }
-
-    // Get SM count
-    int sm_count;
-    error = CubDebug(cudaDeviceGetAttribute(&sm_count, cudaDevAttrMultiProcessorCount, device_ordinal));
-    if (cudaSuccess != error)
-    {
-      return;
-    }
-
-    int find_if_sm_occupancy;
-    error = CubDebug(
-      cub::MaxSmOccupancy(find_if_sm_occupancy, find_if, block_threads));
-    if (cudaSuccess != error)
-    {
-      return;
-    }
-
-    int findif_device_occupancy = find_if_sm_occupancy * sm_count;
-
-    // Even-share work distribution
-    int max_blocks = findif_device_occupancy; // no * CUB_SUBSCRIPTION_FACTOR(0) because max_blocks gets too big
-
-    int findif_grid_size = CUB_MIN(num_tiles, max_blocks);
-
-    // Temporary storage allocation requirements
-    void* allocations[1]       = {};
-    size_t allocation_sizes[1] = {sizeof(int)};
-
-    // Alias the temporary allocations from the single storage blob (or
-    // compute the necessary size of the blob)
-    error = CubDebug(AliasTemporaries(d_temp_storage, temp_storage_bytes, allocations, allocation_sizes));
-    if (cudaSuccess != error)
-    {
-      return;
-    }
-
-    int* int_temp_storage = static_cast(allocations[0]); // this shouldn't be just int
-
-    if (d_temp_storage == nullptr)
-    {
-      return;
-    }
-
-    // use d_temp_storage as the intermediate device result
-    // to read and write from. Then store the final result in the output iterator.
-    cuda_mem_set_async_dtemp_storage<<<1, 1>>>(int_temp_storage, num_items);
-
-    find_if<<>>(d_in, d_in + num_items, op, int_temp_storage, num_items);
+    // CUB_DETAIL_NVTX_RANGE_SCOPE_IF(d_temp_storage, "cub::DeviceFind::FindIf");
 
-    write_final_result_in_output_iterator_already<<<1, 1>>>(int_temp_storage, d_out);
+    // Signed integer type for global offsets
+    using OffsetT = detail::choose_offset_t;
 
-    return;
+    return DispatchFind::Dispatch(
+      d_temp_storage, temp_storage_bytes, d_in, d_out, static_cast(num_items), scan_op, stream);
   }
 };
 
diff --git a/cub/cub/device/dispatch/dispatch_find.cuh b/cub/cub/device/dispatch/dispatch_find.cuh
new file mode 100644
index 00000000000..52bf5f6efe8
--- /dev/null
+++ b/cub/cub/device/dispatch/dispatch_find.cuh
@@ -0,0 +1,331 @@
+/******************************************************************************
+ * Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *     * Redistributions of source code must retain the above copyright
+ *       notice, this list of conditions and the following disclaimer.
+ *     * Redistributions in binary form must reproduce the above copyright
+ *       notice, this list of conditions and the following disclaimer in the
+ *       documentation and/or other materials provided with the distribution.
+ *     * Neither the name of the NVIDIA CORPORATION nor the
+ *       names of its contributors may be used to endorse or promote products
+ *       derived from this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY
+ * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
+ * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+ * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
+ * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+ * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ *
+ ******************************************************************************/
+
+/**
+ * @file cub::DeviceFind provides device-wide, parallel operations for
+ * searching for items within a sequence of data items residing within
+ * device-accessible memory.
+ */
+
+#pragma once
+
+#include
+#include
+
+#include
+#include
+#include
+
+#include
+
+#include "cub/util_type.cuh"
+
+#if defined(_CCCL_IMPLICIT_SYSTEM_HEADER_GCC)
+#  pragma GCC system_header
+#elif defined(_CCCL_IMPLICIT_SYSTEM_HEADER_CLANG)
+#  pragma clang system_header
+#elif defined(_CCCL_IMPLICIT_SYSTEM_HEADER_MSVC)
+#  pragma system_header
+#endif // no system header
+
+#include
+
+CUB_NAMESPACE_BEGIN
+
+template
+__global__ void write_final_result_in_output_iterator_already(ValueType* d_temp_storage, OutputIteratorT d_out)
+{
+  *d_out = *d_temp_storage;
+}
+
+template
+__global__ void cuda_mem_set_async_dtemp_storage(ValueType* d_temp_storage, NumItemsT num_items)
+{
+  *d_temp_storage = num_items;
+}
+
+/******************************************************************************
+ * Kernel entry points
+ *****************************************************************************/
+
+/**
+ * Kernel entry point for device-wide find-if. Each thread block searches
+ * tiles of the input for the first item satisfying `scan_op` and atomically
+ * lowers `value_temp_storage` to the smallest matching index.
+ */
+template
+__launch_bounds__(int(ChainedPolicyT::ActivePolicy::FindPolicy::BLOCK_THREADS))
+  CUB_DETAIL_KERNEL_ATTRIBUTES void DeviceFindKernel(
+    InputIteratorT d_in, OutputIteratorT d_out, OffsetT num_items, OffsetT* value_temp_storage, ScanOpT scan_op)
+{
+  using AgentFindT =
+    AgentFind;
+
+  __shared__ typename AgentFindT::TempStorage sresult;
+  // __shared__ int temp_storage;
+  // Process tiles
+  AgentFindT agent(sresult, d_in, scan_op); // Seems like sresult can be defined and initialized in agent_find.cuh
+                                            // directly without having to pass it here as an argument.
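+  // Each thread block strides over the input tile by tile; AgentFind::Process
+  // performs the per-tile search and the early-exit check against the global
+  // result in value_temp_storage.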
+
+  agent.Process(value_temp_storage, num_items);
+}
+
+template
+struct DeviceFindPolicy
+{
+  //---------------------------------------------------------------------------
+  // Architecture-specific tuning policies
+  //---------------------------------------------------------------------------
+
+  /// SM30
+  struct Policy300 : ChainedPolicy<300, Policy300, Policy300>
+  {
+    static constexpr int threads_per_block  = 128;
+    static constexpr int items_per_thread   = 16;
+    static constexpr int items_per_vec_load = 4;
+
+    // FindPolicy (GTX670: 154.0 @ 48M 4B items)
+    using FindPolicy =
+      AgentFindPolicy, items_per_vec_load, LOAD_LDG>;
+
+    // // SingleTilePolicy
+    // using SingleTilePolicy = FindPolicy;
+  };
+
+  using MaxPolicy = Policy300;
+};
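+
+// ChainedPolicy resolves, at dispatch time, to the most recent tuning policy
+// whose PTX version does not exceed the target device's; with only Policy300
+// defined, every architecture currently uses the 128-thread,
+// 16-items-per-thread configuration above.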
+
+template >
+struct DispatchFind : SelectedPolicy
+{
+  //---------------------------------------------------------------------------
+  // Problem state
+  //---------------------------------------------------------------------------
+
+  /// Device-accessible allocation of temporary storage. When `nullptr`, the
+  /// required allocation size is written to `temp_storage_bytes` and no work
+  /// is done.
+  void* d_temp_storage;
+
+  /// Reference to size in bytes of `d_temp_storage` allocation
+  size_t& temp_storage_bytes;
+
+  /// Pointer to the input sequence of data items
+  InputIteratorT d_in;
+
+  /// Pointer to the output aggregate
+  OutputIteratorT d_out;
+
+  /// Total number of input items (i.e., length of `d_in`)
+  OffsetT num_items;
+
+  /// Unary search functor
+  ScanOpT scan_op;
+
+  /// CUDA stream to launch kernels within. Default is stream 0.
+  cudaStream_t stream;
+
+  int ptx_version;
+
+  //---------------------------------------------------------------------------
+  // Constructor
+  //---------------------------------------------------------------------------
+
+  CUB_RUNTIME_FUNCTION _CCCL_FORCEINLINE DispatchFind(
+    void* d_temp_storage,
+    size_t& temp_storage_bytes,
+    InputIteratorT d_in,
+    OutputIteratorT d_out,
+    OffsetT num_items,
+    ScanOpT scan_op,
+    cudaStream_t stream,
+    int ptx_version)
+      : d_temp_storage(d_temp_storage)
+      , temp_storage_bytes(temp_storage_bytes)
+      , d_in(d_in)
+      , d_out(d_out)
+      , num_items(num_items)
+      , scan_op(scan_op)
+      , stream(stream)
+      , ptx_version(ptx_version)
+  {}
+
+  //---------------------------------------------------------------------------
+  // Normal problem size invocation
+  //---------------------------------------------------------------------------
+
+  //---------------------------------------------------------------------------
+  // Chained policy invocation
+  //---------------------------------------------------------------------------
+
+  /// Invocation
+  template
+  CUB_RUNTIME_FUNCTION _CCCL_FORCEINLINE cudaError_t Invoke(FindKernel find_kernel)
+  {
+    using Policy = typename ActivePolicyT::FindPolicy;
+
+    cudaError error = cudaSuccess;
+    do
+    {
+      // Number of input tiles
+      int tile_size = Policy::BLOCK_THREADS * Policy::ITEMS_PER_THREAD;
+      int num_tiles = static_cast(::cuda::ceil_div(num_items, tile_size));
+
+      // Get device ordinal
+      int device_ordinal;
+      error = CubDebug(cudaGetDevice(&device_ordinal));
+      if (cudaSuccess != error)
+      {
+        break;
+      }
+
+      // Get SM count
+      int sm_count;
+      error = CubDebug(cudaDeviceGetAttribute(&sm_count, cudaDevAttrMultiProcessorCount, device_ordinal));
+      if (cudaSuccess != error)
+      {
+        break;
+      }
+
+      int find_if_sm_occupancy;
+      error = CubDebug(cub::MaxSmOccupancy(find_if_sm_occupancy, find_kernel, Policy::BLOCK_THREADS));
+      if (cudaSuccess != error)
+      {
+        break;
+      }
+
+      int findif_device_occupancy = find_if_sm_occupancy * sm_count;
+      int max_blocks       = findif_device_occupancy; // no * CUB_SUBSCRIPTION_FACTOR(0) because max_blocks gets too big
+      int findif_grid_size = CUB_MIN(num_tiles, max_blocks);
+
+      // Temporary storage allocation requirements
+      void* allocations[1]       = {};
+      size_t allocation_sizes[1] = {sizeof(int)};
+      // Alias the temporary allocations from the single storage blob (or
+      // compute the necessary size of the blob)
+      error = CubDebug(AliasTemporaries(d_temp_storage, temp_storage_bytes, allocations, allocation_sizes));
+      if (cudaSuccess != error)
+      {
+        break;
+      }
+
+      OffsetT* value_temp_storage = static_cast(allocations[0]);
+
+      if (d_temp_storage == nullptr)
+      {
+        // Return if the caller is simply requesting the size of the storage
+        // allocation
+        return cudaSuccess;
+      }
+
+      // use d_temp_storage as the intermediate device result
+      // to read and write from. Then store the final result in the output iterator.
+
+      cuda_mem_set_async_dtemp_storage<<<1, 1>>>(value_temp_storage, num_items);
+
+      // Invoke FindIfKernel
+      THRUST_NS_QUALIFIER::cuda_cub::launcher::triple_chevron(
+        findif_grid_size, ActivePolicyT::FindPolicy::BLOCK_THREADS, 0, stream)
+        .doit(find_kernel, d_in, d_out, num_items, value_temp_storage, scan_op);
+
+      write_final_result_in_output_iterator_already<<<1, 1>>>(value_temp_storage, d_out);
+
+      // Check for failure to launch
+      error = CubDebug(cudaPeekAtLastError());
+      if (cudaSuccess != error)
+      {
+        break;
+      }
+
+      // Sync the stream if specified to flush runtime errors
+      error = CubDebug(detail::DebugSyncStream(stream));
+      if (cudaSuccess != error)
+      {
+        break;
+      }
+
+    } while (0);
+    return error;
+  }
+
+  template
+  CUB_RUNTIME_FUNCTION _CCCL_FORCEINLINE cudaError_t Invoke()
+  {
+    using MaxPolicyT = typename SelectedPolicy::MaxPolicy;
+    return Invoke(
+      DeviceFindKernel); // include the surrounding two
+                         // init and write back kernels
+                         // here.
+  }
+
+  //---------------------------------------------------------------------------
+  // Dispatch entrypoints
+  //---------------------------------------------------------------------------
+
+  /**
+   * @brief @giannis ENTER NO DOCUMENTATION. DISPATCH LAYER IN NEW ALGOS NOT EXPOSED
+   *
+  // private: ?????
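+   * Internal entry point. On the first call (d_temp_storage == nullptr) it
+   * only computes the required temporary-storage size; on the second call it
+   * launches the init, find-if, and write-back kernels.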
+   */
+
+  CUB_RUNTIME_FUNCTION _CCCL_FORCEINLINE static cudaError_t Dispatch(
+    void* d_temp_storage,
+    size_t& temp_storage_bytes,
+    InputIteratorT d_in,
+    OutputIteratorT d_out,
+    OffsetT num_items,
+    ScanOpT scan_op,
+    cudaStream_t stream)
+  {
+    using MaxPolicyT = typename DispatchFind::MaxPolicy;
+
+    cudaError error = cudaSuccess;
+    do
+    {
+      // Get PTX version
+      int ptx_version = 0;
+      error = CubDebug(PtxVersion(ptx_version));
+      if (cudaSuccess != error)
+      {
+        break;
+      }
+      // Create dispatch functor
+      DispatchFind dispatch(d_temp_storage, temp_storage_bytes, d_in, d_out, num_items, scan_op, stream, ptx_version);
+
+      // Dispatch to chained policy
+      error = CubDebug(MaxPolicyT::Invoke(ptx_version, dispatch)); // @giannis how is Invoke() being called here,
+                                                                   // since it takes no arguments?
+      if (cudaSuccess != error)
+      {
+        break;
+      }
+    } while (0);
+
+    return error;
+  }
+};
+
+CUB_NAMESPACE_END
diff --git a/cub/test/catch2_test_device_find_if.cu b/cub/test/catch2_test_device_find_if.cu
index 2f7a3175049..67918021e6a 100644
--- a/cub/test/catch2_test_device_find_if.cu
+++ b/cub/test/catch2_test_device_find_if.cu
@@ -36,11 +36,8 @@
 #include
 #include
 
-#include
-
 #include "c2h/custom_type.cuh"
 #include "catch2_test_device_reduce.cuh"
-#include "catch2_test_helper.h"
 #include "catch2_test_launch_helper.h"
 #include "thrust/detail/raw_pointer_cast.h"
 #include
@@ -57,7 +54,7 @@ using custom_t =
   c2h::lexicographical_less_comparable_t,
   c2h::lexicographical_greater_comparable_t>;
 
-using full_type_list = c2h::type_list, type_pair>;
+using full_type_list = c2h::type_list, type_pair>;
 // clang-format on
 
 enum class gen_data_t : int
@@ -89,15 +86,15 @@ struct equals
   }
 };
 
-CUB_TEST("Device find_if works", "[device]", full_type_list)
+C2H_TEST("Device find_if works", "[device]", full_type_list)
 {
   using params   = params_t;
   using input_t  = typename params::item_t;
   using output_t = typename params::output_t;
-  using offset_t = int32_t;
+  using offset_t = output_t;
 
   constexpr offset_t min_items = 1;
-  constexpr offset_t max_items = 1000000;
+  constexpr offset_t max_items = std::numeric_limits::max() / 5; // make test run faster
 
   // Generate the input sizes to test for
   const offset_t num_items = GENERATE_COPY(
@@ -114,7 +111,7 @@ CUB_TEST("Device find_if works", "[device]", full_type_list)
   c2h::device_vector in_items(num_items);
   if (data_gen_mode == gen_data_t::GEN_TYPE_RANDOM)
   {
-    c2h::gen(CUB_SEED(2), in_items);
+    c2h::gen(C2H_SEED(2), in_items);
   }
   else
   {
@@ -124,11 +121,11 @@ CUB_TEST("Device find_if works", "[device]", full_type_list)
   }
   auto d_in_it = thrust::raw_pointer_cast(in_items.data());
 
+  using op_t          = equals;
+  input_t val_to_find = GENERATE_COPY(take(1, random(min_items, max_items)));
+
   SECTION("Generic find if case")
   {
-    using op_t = equals;
-    input_t val_to_find{2};
-
     // Prepare verification data
     c2h::host_vector host_items(in_items);
     c2h::host_vector expected_result(1);
@@ -139,12 +136,12 @@ CUB_TEST("Device find_if works", "[device]", full_type_list)
 
     // Run test
     c2h::device_vector out_result(1);
-    auto d_out_it = thrust::raw_pointer_cast(out_result.data());
+    output_t* d_out_it = thrust::raw_pointer_cast(out_result.data());
 
     cub::DeviceFind::FindIf(
       d_temp_storage, temp_storage_bytes, unwrap_it(d_in_it), unwrap_it(d_out_it), op_t{val_to_find}, num_items);
 
-    thrust::device_vector temp_storage(temp_storage_bytes);
+    thrust::device_vector temp_storage(temp_storage_bytes);
     d_temp_storage = thrust::raw_pointer_cast(temp_storage.data());
 
     cub::DeviceFind::FindIf(
@@ -156,9 +153,6 @@ CUB_TEST("Device find_if works", "[device]", full_type_list)
"[device]", full_type_list) SECTION("find_if works with non raw pointers - .begin() iterator") { - using op_t = equals; - input_t val_to_find{2}; - // Prepare verification data c2h::host_vector host_items(in_items); c2h::host_vector expected_result(1); @@ -173,7 +167,7 @@ CUB_TEST("Device find_if works", "[device]", full_type_list) cub::DeviceFind::FindIf( d_temp_storage, temp_storage_bytes, in_items.begin(), out_result.begin(), op_t{val_to_find}, num_items); - thrust::device_vector temp_storage(temp_storage_bytes); + thrust::device_vector temp_storage(temp_storage_bytes); d_temp_storage = thrust::raw_pointer_cast(temp_storage.data()); cub::DeviceFind::FindIf( @@ -189,9 +183,6 @@ CUB_TEST("Device find_if works", "[device]", full_type_list) { if (num_items - offset > 0) { - using op_t = equals; - input_t val_to_find{2}; - // Prepare verification data c2h::host_vector host_items(in_items); c2h::host_vector expected_result(1); @@ -212,7 +203,7 @@ CUB_TEST("Device find_if works", "[device]", full_type_list) op_t{val_to_find}, num_items - offset); - thrust::device_vector temp_storage(temp_storage_bytes); + thrust::device_vector temp_storage(temp_storage_bytes); d_temp_storage = thrust::raw_pointer_cast(temp_storage.data()); cub::DeviceFind::FindIf( @@ -231,9 +222,6 @@ CUB_TEST("Device find_if works", "[device]", full_type_list) SECTION("find_if works with non primitive iterator") { - using op_t = equals; - input_t val_to_find{2}; - // Prepare verification data auto it = thrust::make_counting_iterator(0); // non-primitive iterator c2h::host_vector expected_result(1); @@ -248,7 +236,7 @@ CUB_TEST("Device find_if works", "[device]", full_type_list) cub::DeviceFind::FindIf(d_temp_storage, temp_storage_bytes, it, unwrap_it(d_out_it), op_t{val_to_find}, num_items); - thrust::device_vector temp_storage(temp_storage_bytes); + thrust::device_vector temp_storage(temp_storage_bytes); d_temp_storage = thrust::raw_pointer_cast(temp_storage.data()); cub::DeviceFind::FindIf(d_temp_storage, temp_storage_bytes, it, unwrap_it(d_out_it), op_t{val_to_find}, num_items); From eaa5b754e1eaef028819656f3123ca89cc38aa87 Mon Sep 17 00:00:00 2001 From: gonidelis Date: Fri, 1 Nov 2024 17:28:45 -0700 Subject: [PATCH 12/13] Reviews, cleanup --- cub/cub/agent/agent_find.cuh | 11 ++++------- cub/test/catch2_test_device_find_if.cu | 3 --- thrust/benchmarks/bench/count_if/basic.cu | 3 --- thrust/benchmarks/bench/find_if/basic.cu | 3 --- 4 files changed, 4 insertions(+), 16 deletions(-) diff --git a/cub/cub/agent/agent_find.cuh b/cub/cub/agent/agent_find.cuh index 80b35ea93a0..b3f377fcbdc 100644 --- a/cub/cub/agent/agent_find.cuh +++ b/cub/cub/agent/agent_find.cuh @@ -187,10 +187,7 @@ struct AgentFind __syncthreads(); - enum - { - NUMBER_OF_VECTORS = ITEMS_PER_THREAD / VECTOR_LOAD_LENGTH - }; + constexpr int NUMBER_OF_VECTORS = ITEMS_PER_THREAD / VECTOR_LOAD_LENGTH; //// vectorized loads begin InputT* d_in_unqualified = const_cast(d_in) + tile_offset + (threadIdx.x * VECTOR_LOAD_LENGTH); @@ -211,10 +208,10 @@ struct AgentFind for (int i = 0; i < ITEMS_PER_THREAD; ++i) { OffsetT nth_vector_of_thread = i / VECTOR_LOAD_LENGTH; - OffsetT element_in_word = i % VECTOR_LOAD_LENGTH; + OffsetT element_in_vector = i % VECTOR_LOAD_LENGTH; OffsetT vector_of_tile = nth_vector_of_thread * BLOCK_THREADS + threadIdx.x; - OffsetT index = tile_offset + vector_of_tile * VECTOR_LOAD_LENGTH + element_in_word; + OffsetT index = tile_offset + vector_of_tile * VECTOR_LOAD_LENGTH + element_in_vector; if (index < num_items) { @@ -286,7 +283,7 @@ struct 
     for (int tile_offset = blockIdx.x * TILE_ITEMS; tile_offset < num_items; tile_offset += TILE_ITEMS * gridDim.x)
     {
       // Only one thread reads atomically and propagates it to the
-      // the rest threads of the block
+      // the other threads of the block through shared memory
       if (threadIdx.x == 0)
       {
         sresult = atomicAdd(value_temp_storage, 0);
diff --git a/cub/test/catch2_test_device_find_if.cu b/cub/test/catch2_test_device_find_if.cu
index 67918021e6a..524f7c04c41 100644
--- a/cub/test/catch2_test_device_find_if.cu
+++ b/cub/test/catch2_test_device_find_if.cu
@@ -76,9 +76,6 @@ template
 struct equals
 {
   T val;
-  equals(T _val)
-      : val(_val)
-  {}
 
   __device__ __host__ bool operator()(T i)
   {
diff --git a/thrust/benchmarks/bench/count_if/basic.cu b/thrust/benchmarks/bench/count_if/basic.cu
index b672d789868..981e7c7610e 100644
--- a/thrust/benchmarks/bench/count_if/basic.cu
+++ b/thrust/benchmarks/bench/count_if/basic.cu
@@ -34,9 +34,6 @@ template
 struct equals
 {
   T val;
-  equals(T _val)
-      : val(_val)
-  {}
 
   __device__ __host__ bool operator()(T i)
   {
diff --git a/thrust/benchmarks/bench/find_if/basic.cu b/thrust/benchmarks/bench/find_if/basic.cu
index 8a2618e26d5..362619f29e0 100644
--- a/thrust/benchmarks/bench/find_if/basic.cu
+++ b/thrust/benchmarks/bench/find_if/basic.cu
@@ -34,9 +34,6 @@ template
 struct equals
 {
   T val;
-  equals(T _val)
-      : val(_val)
-  {}
 
   __device__ __host__ bool operator()(T i)
   {

From 7c2f2a87281d79a02c2a886e5c2c20f9870cb583 Mon Sep 17 00:00:00 2001
From: Giannis Gonidelis
Date: Mon, 11 Nov 2024 00:27:46 -0800
Subject: [PATCH 13/13] CacheModifiedInputIterator accepts const

Co-authored-by: Bernhard Manfred Gruber
---
 cub/cub/agent/agent_find.cuh | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/cub/cub/agent/agent_find.cuh b/cub/cub/agent/agent_find.cuh
index b3f377fcbdc..3298ad94cdb 100644
--- a/cub/cub/agent/agent_find.cuh
+++ b/cub/cub/agent/agent_find.cuh
@@ -189,10 +189,10 @@ struct AgentFind
     constexpr int NUMBER_OF_VECTORS = ITEMS_PER_THREAD / VECTOR_LOAD_LENGTH;
     //// vectorized loads begin
-    InputT* d_in_unqualified = const_cast(d_in) + tile_offset + (threadIdx.x * VECTOR_LOAD_LENGTH);
+    const InputT* d_in_unqualified = d_in + tile_offset + (threadIdx.x * VECTOR_LOAD_LENGTH);
 
     cub::CacheModifiedInputIterator d_vec_in(
-      reinterpret_cast(d_in_unqualified));
+      reinterpret_cast(d_in_unqualified));
 
     InputT input_items[ITEMS_PER_THREAD];
     VectorT* vec_items = reinterpret_cast(input_items);
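
Usage note: the sketch below is an illustrative, self-contained host-side
example of the API this series adds, mirroring the two-phase temp-storage
pattern exercised by the tests above. The equals functor is copied from the
benchmarks; the input values and sizes are placeholders, and error handling
is omitted.

#include <cub/device/device_find_if.cuh>

#include <thrust/device_vector.h>

#include <cstddef>

template <class T>
struct equals
{
  T val;

  __device__ __host__ bool operator()(T i)
  {
    return i == val;
  }
};

int main()
{
  thrust::device_vector<int> in(1000, 1);
  in[42] = 2; // first (and only) match
  thrust::device_vector<int> out(1);

  // Phase 1: with d_temp_storage == nullptr, FindIf only writes the required
  // temporary-storage size into temp_storage_bytes.
  void* d_temp_storage           = nullptr;
  std::size_t temp_storage_bytes = 0;
  cub::DeviceFind::FindIf(
    d_temp_storage,
    temp_storage_bytes,
    thrust::raw_pointer_cast(in.data()),
    thrust::raw_pointer_cast(out.data()),
    equals<int>{2},
    in.size());

  // Phase 2: allocate the storage and run the search. Afterwards, out[0]
  // holds the index of the first element satisfying the predicate, or
  // num_items if no element does.
  thrust::device_vector<char> temp_storage(temp_storage_bytes);
  d_temp_storage = thrust::raw_pointer_cast(temp_storage.data());
  cub::DeviceFind::FindIf(
    d_temp_storage,
    temp_storage_bytes,
    thrust::raw_pointer_cast(in.data()),
    thrust::raw_pointer_cast(out.data()),
    equals<int>{2},
    in.size());

  return 0;
}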