Commit
Fix identifier naming convention
bernhardmgruber committed Jun 10, 2024
1 parent 734a3f6 commit 253b8fb
Showing 3 changed files with 178 additions and 197 deletions.
191 changes: 90 additions & 101 deletions cub/cub/agent/agent_merge.cuh
@@ -27,57 +27,47 @@
CUB_NAMESPACE_BEGIN
namespace detail
{
-template <int _BLOCK_THREADS,
-int _ITEMS_PER_THREAD,
-BlockLoadAlgorithm _LOAD_ALGORITHM,
-CacheLoadModifier _LOAD_MODIFIER,
-BlockStoreAlgorithm _STORE_ALGORITHM>
-struct AgentMergePolicy
-{
-static constexpr int BLOCK_THREADS = _BLOCK_THREADS;
-static constexpr int ITEMS_PER_THREAD = _ITEMS_PER_THREAD;
-static constexpr int ITEMS_PER_TILE = BLOCK_THREADS * ITEMS_PER_THREAD;
-
-static constexpr BlockLoadAlgorithm LOAD_ALGORITHM = _LOAD_ALGORITHM;
-static constexpr CacheLoadModifier LOAD_MODIFIER = _LOAD_MODIFIER;
-static constexpr BlockStoreAlgorithm STORE_ALGORITHM = _STORE_ALGORITHM;
-};
-
-/**
- * \brief This agent is responsible for partitioning a merge path into equal segments
- *
- * There are two sorted arrays to be merged into one array. If the first array
- * is partitioned between parallel workers by slicing it into ranges of equal
- * size, there could be a significant workload imbalance. The imbalance is
- * caused by the fact that the distribution of elements from the second array
- * is unknown beforehand. Instead, the MergePath is partitioned between workers.
- * This approach guarantees that an equal amount of work is assigned to each worker.
- *
- * This approach is outlined in the paper:
- * Odeh et al., "Merge Path - Parallel Merging Made Simple"
- * doi:10.1109/IPDPSW.2012.202
- */
-// TODO(bgruber): can we unify this one with AgentPartition in agent_merge_sort.cuh?
-template <typename KeysIt1, typename KeysIt2, typename OffsetT, typename CompareOp>
+// This agent computes the merge path intersections at equally wide intervals. The approach is outlined in the paper:
+// Odeh et al., "Merge Path - Parallel Merging Made Simple", doi:10.1109/IPDPSW.2012.202
+// The algorithm is the same as AgentPartition for merge sort, but that agent handles a lot more.
+template <typename KeysIt1, typename KeysIt2, typename Offset, typename CompareOp, int items_per_tile>
struct AgentPartitionMergePath
{
KeysIt1 keys1;
-OffsetT keys1_count;
+Offset keys1_count;
KeysIt2 keys2;
-OffsetT keys2_count;
-OffsetT* merge_partitions;
+Offset keys2_count;
+Offset* merge_partitions;
CompareOp compare_op;
-int items_per_tile;
-OffsetT partition_idx;
+Offset partition_idx;

-_CCCL_DEVICE _CCCL_FORCEINLINE void Process()
+// TODO(bgruber): I honestly would just inline this agent into the kernel, because it is short and non-public
+_CCCL_DEVICE _CCCL_FORCEINLINE void operator()()
{
-const OffsetT partition_at = (cub::min)(partition_idx * items_per_tile, keys1_count + keys2_count);
-const OffsetT partition_diag = cub::MergePath(keys1, keys2, keys1_count, keys2_count, partition_at, compare_op);
-merge_partitions[partition_idx] = partition_diag;
+const Offset partition_at = (cub::min)(partition_idx * items_per_tile, keys1_count + keys2_count);
+merge_partitions[partition_idx] = cub::MergePath(keys1, keys2, keys1_count, keys2_count, partition_at, compare_op);
}
};
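
For reference, and not part of this commit, the following is a minimal sequential sketch of the diagonal binary search that cub::MergePath performs; the standalone name merge_path and the int bounds are illustrative assumptions. Given a diagonal diag, it returns how many elements the merge takes from the first array, so that merging a[0..i) with b[0..diag-i) yields the first diag elements of the output; equally spaced diagonals therefore give every worker the same amount of output to produce.

// Sketch only: sequential equivalent of the merge path diagonal search.
template <typename T, typename CompareOp>
int merge_path(const T* a, int a_count, const T* b, int b_count, int diag, CompareOp compare_op)
{
  int lo = diag > b_count ? diag - b_count : 0; // at least diag - b_count elements must come from a
  int hi = diag < a_count ? diag : a_count;     // at most min(diag, a_count) elements can come from a
  while (lo < hi)
  {
    const int mid = lo + (hi - lo) / 2;
    if (compare_op(b[diag - 1 - mid], a[mid]))
      hi = mid;     // b[diag - 1 - mid] < a[mid]: taking mid elements from a already overshoots
    else
      lo = mid + 1; // otherwise the split point lies further into a
  }
  return lo;
}
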

+template <int ThreadsPerBlock,
+int ItemsPerThread,
+BlockLoadAlgorithm LoadAlgorithm,
+CacheLoadModifier LoadCacheModifier,
+BlockStoreAlgorithm StoreAlgorithm>
+struct agent_merge_no_sort_policy
+{
+static constexpr int threads_per_block = ThreadsPerBlock;
+static constexpr int items_per_thread = ItemsPerThread;
+static constexpr int items_per_tile = threads_per_block * items_per_thread;
+
+// for BlockLoad
+static constexpr int BLOCK_THREADS = threads_per_block;
+static constexpr int ITEMS_PER_THREAD = items_per_thread;
+static constexpr BlockLoadAlgorithm LOAD_ALGORITHM = LoadAlgorithm;
+static constexpr CacheLoadModifier LOAD_MODIFIER = LoadCacheModifier;
+static constexpr BlockStoreAlgorithm STORE_ALGORITHM = StoreAlgorithm;
+};
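
For illustration only, an instantiation of this policy might look as follows; the tuning values are invented here, not taken from this commit:

using example_merge_policy =
  agent_merge_no_sort_policy<256, // ThreadsPerBlock
                             11, // ItemsPerThread
                             BLOCK_LOAD_WARP_TRANSPOSE,
                             LOAD_DEFAULT,
                             BLOCK_STORE_WARP_TRANSPOSE>;
// example_merge_policy::items_per_tile == 256 * 11 == 2816.
// The duplicated UPPER_CASE members above exist so that the BlockLoadType and
// BlockStoreType helpers, which still read the legacy names, accept this policy.
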

// TODO(bgruber): can we unify this one with AgentMerge in agent_merge_sort.cuh?
template <typename Policy,
typename KeysIt1,
@@ -88,67 +78,65 @@ template <typename Policy,
typename ItemsOutputIt,
typename Offset,
typename CompareOp>
-struct AgentMergeNoSort
+struct agent_merge_no_sort
{
using key_type = typename ::cuda::std::iterator_traits<KeysIt1>::value_type;
using item_type = typename ::cuda::std::iterator_traits<ItemsIt1>::value_type;

-static constexpr bool HaveItems = !std::is_same<item_type, NullType>::value;

-using KeysLoadIt1 = typename THRUST_NS_QUALIFIER::cuda_cub::core::LoadIterator<Policy, KeysIt1>::type;
-using KeysLoadIt2 = typename THRUST_NS_QUALIFIER::cuda_cub::core::LoadIterator<Policy, KeysIt2>::type;
-using ItemsLoadIt1 = typename THRUST_NS_QUALIFIER::cuda_cub::core::LoadIterator<Policy, ItemsIt1>::type;
-using ItemsLoadIt2 = typename THRUST_NS_QUALIFIER::cuda_cub::core::LoadIterator<Policy, ItemsIt2>::type;
+using keys_load_it1 = typename THRUST_NS_QUALIFIER::cuda_cub::core::LoadIterator<Policy, KeysIt1>::type;
+using keys_load_it2 = typename THRUST_NS_QUALIFIER::cuda_cub::core::LoadIterator<Policy, KeysIt2>::type;
+using items_load_it1 = typename THRUST_NS_QUALIFIER::cuda_cub::core::LoadIterator<Policy, ItemsIt1>::type;
+using items_load_it2 = typename THRUST_NS_QUALIFIER::cuda_cub::core::LoadIterator<Policy, ItemsIt2>::type;

-using BlockLoadKeys1 = typename BlockLoadType<Policy, KeysLoadIt1>::type;
-using BlockLoadKeys2 = typename BlockLoadType<Policy, KeysLoadIt2>::type;
-using BlockLoadItems1 = typename BlockLoadType<Policy, ItemsLoadIt1>::type;
-using BlockLoadItems2 = typename BlockLoadType<Policy, ItemsLoadIt2>::type;
+using block_load_keys1 = typename BlockLoadType<Policy, keys_load_it1>::type;
+using block_load_keys2 = typename BlockLoadType<Policy, keys_load_it2>::type;
+using block_load_items1 = typename BlockLoadType<Policy, items_load_it1>::type;
+using block_load_items2 = typename BlockLoadType<Policy, items_load_it2>::type;

-using BlockStoreKeys = typename BlockStoreType<Policy, KeysOutputIt, key_type>::type;
-using BlockStoreItems = typename BlockStoreType<Policy, ItemsOutputIt, item_type>::type;
+using block_store_keys = typename BlockStoreType<Policy, KeysOutputIt, key_type>::type;
+using block_store_items = typename BlockStoreType<Policy, ItemsOutputIt, item_type>::type;

-union _TempStorage
+union temp_storages
{
-typename BlockLoadKeys1::TempStorage load_keys1;
-typename BlockLoadKeys2::TempStorage load_keys2;
-typename BlockLoadItems1::TempStorage load_items1;
-typename BlockLoadItems2::TempStorage load_items2;
-typename BlockStoreKeys::TempStorage store_keys;
-typename BlockStoreItems::TempStorage store_items;
-
-item_type items_shared[Policy::ITEMS_PER_TILE + 1];
-key_type keys_shared[Policy::ITEMS_PER_TILE + 1];
+typename block_load_keys1::TempStorage load_keys1;
+typename block_load_keys2::TempStorage load_keys2;
+typename block_load_items1::TempStorage load_items1;
+typename block_load_items2::TempStorage load_items2;
+typename block_store_keys::TempStorage store_keys;
+typename block_store_items::TempStorage store_items;
+
+item_type items_shared[Policy::items_per_tile + 1];
+key_type keys_shared[Policy::items_per_tile + 1];
};

-struct TempStorage : Uninitialized<_TempStorage>
+struct TempStorage : Uninitialized<temp_storages>
{};

-static constexpr int ITEMS_PER_THREAD = Policy::ITEMS_PER_THREAD;
-static constexpr int BLOCK_THREADS = Policy::BLOCK_THREADS;
-static constexpr Offset ITEMS_PER_TILE = Policy::ITEMS_PER_TILE;
+static constexpr int items_per_thread = Policy::items_per_thread;
+static constexpr int threads_per_block = Policy::threads_per_block;
+static constexpr Offset items_per_tile = Policy::items_per_tile;

// Per thread data
-_TempStorage& storage;
-KeysLoadIt1 keys1_in;
-ItemsLoadIt1 items1_in;
+temp_storages& storage;
+keys_load_it1 keys1_in;
+items_load_it1 items1_in;
Offset keys1_count;
-KeysLoadIt2 keys2_in;
-ItemsLoadIt2 items2_in;
+keys_load_it2 keys2_in;
+items_load_it2 items2_in;
Offset keys2_count;
KeysOutputIt keys_out;
ItemsOutputIt items_out;
CompareOp compare_op;
Offset* merge_partitions;

-template <bool IS_FULL_TILE>
+template <bool IsFullTile>
_CCCL_DEVICE _CCCL_FORCEINLINE void consume_tile(Offset tile_idx, Offset tile_base, int num_remaining)
{
const Offset partition_beg = merge_partitions[tile_idx + 0];
const Offset partition_end = merge_partitions[tile_idx + 1];

-const Offset diag0 = ITEMS_PER_TILE * tile_idx;
-const Offset diag1 = (cub::min)(keys1_count + keys2_count, diag0 + ITEMS_PER_TILE);
+const Offset diag0 = items_per_tile * tile_idx;
+const Offset diag1 = (cub::min)(keys1_count + keys2_count, diag0 + items_per_tile);

// compute bounding box for keys1 & keys2
const Offset keys1_beg = partition_beg;
@@ -160,14 +148,15 @@ struct AgentMergeNoSort
const int num_keys1 = static_cast<int>(keys1_end - keys1_beg);
const int num_keys2 = static_cast<int>(keys2_end - keys2_beg);

-key_type keys_loc[ITEMS_PER_THREAD];
-gmem_to_reg<BLOCK_THREADS, IS_FULL_TILE>(keys_loc, keys1_in + keys1_beg, keys2_in + keys2_beg, num_keys1, num_keys2);
-reg_to_shared<BLOCK_THREADS>(&storage.keys_shared[0], keys_loc);
+key_type keys_loc[items_per_thread];
+gmem_to_reg<threads_per_block, IsFullTile>(
+keys_loc, keys1_in + keys1_beg, keys2_in + keys2_beg, num_keys1, num_keys2);
+reg_to_shared<threads_per_block>(&storage.keys_shared[0], keys_loc);
CTA_SYNC();

// use binary search in shared memory to find the merge path for each thread.
// we can use int type here, because the number of items in shared memory is limited
-const int diag0_loc = min<int>(num_keys1 + num_keys2, ITEMS_PER_THREAD * threadIdx.x);
+const int diag0_loc = min<int>(num_keys1 + num_keys2, items_per_thread * threadIdx.x);

const int keys1_beg_loc =
MergePath(&storage.keys_shared[0], &storage.keys_shared[num_keys1], num_keys1, num_keys2, diag0_loc, compare_op);
@@ -179,7 +168,7 @@
const int num_keys2_loc = keys2_end_loc - keys2_beg_loc;

// perform serial merge
-int indices[ITEMS_PER_THREAD];
+int indices[items_per_thread];
cub::SerialMerge(
&storage.keys_shared[0],
keys1_beg_loc,
@@ -192,62 +181,62 @@
CTA_SYNC();

// write keys
-if (IS_FULL_TILE)
+if (IsFullTile)
{
-BlockStoreKeys(storage.store_keys).Store(keys_out + tile_base, keys_loc);
+block_store_keys{storage.store_keys}.Store(keys_out + tile_base, keys_loc);
}
else
{
-BlockStoreKeys(storage.store_keys).Store(keys_out + tile_base, keys_loc, num_remaining);
+block_store_keys{storage.store_keys}.Store(keys_out + tile_base, keys_loc, num_remaining);
}

// if items are provided, merge them
-_CCCL_IF_CONSTEXPR (HaveItems)
+static constexpr bool have_items = !std::is_same<item_type, NullType>::value;
+_CCCL_IF_CONSTEXPR (have_items)
{
-item_type items_loc[ITEMS_PER_THREAD];
-gmem_to_reg<BLOCK_THREADS, IS_FULL_TILE>(
+item_type items_loc[items_per_thread];
+gmem_to_reg<threads_per_block, IsFullTile>(
items_loc, items1_in + keys1_beg, items2_in + keys2_beg, num_keys1, num_keys2);
CTA_SYNC(); // TODO(bgruber): there is no sync when loading keys. bug or unnecessary?
-reg_to_shared<BLOCK_THREADS>(&storage.items_shared[0], items_loc);
+reg_to_shared<threads_per_block>(&storage.items_shared[0], items_loc);
CTA_SYNC();

// gather items from shared mem
#pragma unroll
-for (int ITEM = 0; ITEM < ITEMS_PER_THREAD; ++ITEM)
+for (int i = 0; i < items_per_thread; ++i)
{
-items_loc[ITEM] = storage.items_shared[indices[ITEM]];
+items_loc[i] = storage.items_shared[indices[i]];
}
CTA_SYNC();

// write from reg to gmem
-if (IS_FULL_TILE)
+if (IsFullTile)
{
-BlockStoreItems(storage.store_items).Store(items_out + tile_base, items_loc);
+block_store_items{storage.store_items}.Store(items_out + tile_base, items_loc);
}
else
{
-BlockStoreItems(storage.store_items).Store(items_out + tile_base, items_loc, num_remaining);
+block_store_items{storage.store_items}.Store(items_out + tile_base, items_loc, num_remaining);
}
}
}

-_CCCL_DEVICE _CCCL_FORCEINLINE void Process()
+_CCCL_DEVICE _CCCL_FORCEINLINE void operator()()
{
// XXX with 8.5 changing type to Offset (or long long) results in error!
// TODO(bgruber): is the above still true?
const int tile_idx = static_cast<int>(blockIdx.x);
-const Offset tile_base = tile_idx * ITEMS_PER_TILE;
+const Offset tile_base = tile_idx * items_per_tile;
// TODO(bgruber): random mixing of int and Offset
const int items_in_tile =
-static_cast<int>(cub::min(static_cast<Offset>(ITEMS_PER_TILE), keys1_count + keys2_count - tile_base));
-if (items_in_tile == ITEMS_PER_TILE)
+static_cast<int>(cub::min(static_cast<Offset>(items_per_tile), keys1_count + keys2_count - tile_base));
+if (items_in_tile == items_per_tile)
{
-// full tile
-consume_tile<true>(tile_idx, tile_base, ITEMS_PER_TILE);
+consume_tile<true>(tile_idx, tile_base, items_per_tile); // full tile
}
else
{
-// partial tile
-consume_tile<false>(tile_idx, tile_base, items_in_tile);
+consume_tile<false>(tile_idx, tile_base, items_in_tile); // partial tile
}
}
};
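
To show how the two agents cooperate, here is a hypothetical launch sketch; the kernel name and launch logic are invented for illustration and the real dispatch layer is not part of this diff. With num_tiles output tiles, num_tiles + 1 split points are computed first (one per equally spaced diagonal, including both ends), and the merge agent then runs as one thread block per tile, reading merge_partitions[tile_idx] and merge_partitions[tile_idx + 1].

// Hypothetical sketch, assuming the surrounding cub::detail namespace:
// one thread per merge path split point.
template <typename KeysIt1, typename KeysIt2, typename Offset, typename CompareOp, int ItemsPerTile>
__global__ void partition_sketch_kernel(
  KeysIt1 keys1, Offset keys1_count, KeysIt2 keys2, Offset keys2_count,
  Offset* merge_partitions, Offset num_partitions, CompareOp compare_op)
{
  const Offset idx = static_cast<Offset>(blockIdx.x) * blockDim.x + threadIdx.x;
  if (idx < num_partitions)
  {
    // aggregate-initialize the agent in member order, then invoke it
    AgentPartitionMergePath<KeysIt1, KeysIt2, Offset, CompareOp, ItemsPerTile>{
      keys1, keys1_count, keys2, keys2_count, merge_partitions, compare_op, idx}();
  }
}
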
