Lookback state fixes (#491)
* Do not call fence in the wait loop

* Use __hip_atomic_load/store instead of atomicExch/atomicAdd

atomicExch is compiled to global_atomic_swap even when the result is not
used. A sketch of the replacement wrappers follows.
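
A minimal sketch of what such wrappers can look like in HIP; this is
illustrative only, and the actual rocprim::detail helpers may differ:

    // Relaxed, agent-scope atomic accesses. Unlike atomicExch/atomicAdd,
    // these lower to plain atomic load/store instructions, so no
    // global_atomic_swap is emitted when only the load or store is needed.
    template<typename T>
    ROCPRIM_DEVICE T atomic_load(const T* address)
    {
        return __hip_atomic_load(address, __ATOMIC_RELAXED, __HIP_MEMORY_SCOPE_AGENT);
    }

    template<typename T>
    ROCPRIM_DEVICE void atomic_store(T* address, T value)
    {
        __hip_atomic_store(address, value, __ATOMIC_RELAXED, __HIP_MEMORY_SCOPE_AGENT);
    }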

* Use faster fences in lookback algorithms on gfx94*

This version is specific to devices with a slow __threadfence (an "agent" fence, which
does L2 cache flushing and invalidation).
Fences with "workgroup" scope are used instead to ensure ordering only, not coherence;
they do not flush or invalidate the cache.
Global coherence of prefixes_*_values is ensured by atomic_load/atomic_store operations
that bypass the cache. A sketch of the fence helpers follows.
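
A sketch of the two fence helpers introduced by this PR, assuming clang's
__builtin_amdgcn_fence and __builtin_amdgcn_s_waitcnt builtins; the exact
rocPRIM implementation may differ:

    // Ordering-only acquire fence: "workgroup" scope orders memory operations
    // without the L2 flush/invalidate an "agent"-scope fence performs on gfx94*.
    ROCPRIM_DEVICE void atomic_fence_acquire_order_only()
    {
        __builtin_amdgcn_fence(__ATOMIC_ACQUIRE, "workgroup");
    }

    // Ordering-only release fence. Assumption: outstanding vector memory
    // operations are waited on first, so earlier stores are visible before the
    // flag is published (conservatively, all counters are waited to zero here).
    ROCPRIM_DEVICE void atomic_fence_release_vmem_order_only()
    {
        __builtin_amdgcn_s_waitcnt(0);
        __builtin_amdgcn_fence(__ATOMIC_RELEASE, "workgroup");
    }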

* Rename ROCPRIM_DETAIL_LOOKBACK_SCAN_STATE_WITHOUT_SLOW_FENCES

from ROCPRIM_LOOKBACK_WITHOUT_SLOW_FENCES.
The more verbose name communicates that it is an implementation detail.

The macro now uses the values 0 and 1 instead of mere presence, and it
won't be overridden if a developer sets it on the command line, as the
example below shows.
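
For example, a build can force either path regardless of the detected
architecture (hypothetical compile commands):

    hipcc -DROCPRIM_DETAIL_LOOKBACK_SCAN_STATE_WITHOUT_SLOW_FENCES=0 ...  # always use fences
    hipcc -DROCPRIM_DETAIL_LOOKBACK_SCAN_STATE_WITHOUT_SLOW_FENCES=1 ...  # always bypass cache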

* Add WITHOUT_SLOW_FENCES version to lookback_scan_state::get_complete_value

* refactor: lookback_scan_state WITHOUT_SLOW_FENCES misc changes

- use sizeof(variable)
- use auto* and const auto* instead of just auto
- use void* instead of char* to avoid yet another cast
- make the atomic order fence a separate function and add docs &
  warning

* fix: Restore removed interfaces of lookback_scan_state

Even though these are in the detail namespace and as such are explicitly
not meant for use by users, some projects have started depending on them.

The interfaces for these are slightly broken and rocPRIM developers
discourage any users from using them (or the newer interfaces for that
matter) because they are implementation details. No further guarantees
are provided for these APIs.

In the future, a public interface is planned for lookback_scan_state,
as we have recognized that this is a useful primitive and that it's
unreasonable to expect users to implement it themselves. A comparison of
the deprecated and supported call styles follows.
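
For illustration, the deprecated and supported call styles side by side;
scan_state_type, number_of_blocks, and stream are placeholder names:

    // Restored but deprecated: assumes the default stream, silently ignores errors.
    size_t storage_size = scan_state_type::get_storage_size(number_of_blocks);

    // Supported: takes an explicit stream and reports errors.
    size_t checked_size = 0;
    hipError_t error = scan_state_type::get_storage_size(number_of_blocks, stream, checked_size);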

* refactor: rename __builtin_amdgcn_fence to atomic_fence_acquire_order_only

---------

Co-authored-by: Anton Gorenko <[email protected]>
Mészáros Gergely and ex-rzr authored Dec 5, 2023
1 parent daab012 commit 57c30d8
Showing 6 changed files with 222 additions and 50 deletions.
4 changes: 2 additions & 2 deletions rocprim/include/rocprim/device/detail/device_radix_sort.hpp
@@ -881,13 +881,13 @@ struct onesweep_lookback_state

ROCPRIM_DEVICE ROCPRIM_INLINE static onesweep_lookback_state load(onesweep_lookback_state* ptr)
{
- underlying_type state = ::rocprim::detail::atomic_add(&ptr->state, 0);
+ underlying_type state = ::rocprim::detail::atomic_load(&ptr->state);
return onesweep_lookback_state(state);
}

ROCPRIM_DEVICE ROCPRIM_INLINE void store(onesweep_lookback_state* ptr) const
{
- ::rocprim::detail::atomic_exch(&ptr->state, this->state);
+ ::rocprim::detail::atomic_store(&ptr->state, this->state);
}
};

199 changes: 159 additions & 40 deletions rocprim/include/rocprim/device/detail/lookback_scan_state.hpp
@@ -38,6 +38,21 @@
#include "../config_types.hpp"
#include "rocprim/config.hpp"

+ // This version is specific to devices with a slow __threadfence (an "agent" fence, which
+ // does L2 cache flushing and invalidation).
+ // Fences with "workgroup" scope are used instead to ensure ordering only, not coherence;
+ // they do not flush or invalidate the cache.
+ // Global coherence of prefixes_*_values is ensured by atomic_load/atomic_store operations
+ // that bypass the cache.
+ #ifndef ROCPRIM_DETAIL_LOOKBACK_SCAN_STATE_WITHOUT_SLOW_FENCES
+ #if defined(__HIP_DEVICE_COMPILE__) \
+ && (defined(__gfx940__) || defined(__gfx941__) || defined(__gfx942__))
+ #define ROCPRIM_DETAIL_LOOKBACK_SCAN_STATE_WITHOUT_SLOW_FENCES 1
+ #else
+ #define ROCPRIM_DETAIL_LOOKBACK_SCAN_STATE_WITHOUT_SLOW_FENCES 0
+ #endif
+ #endif // ROCPRIM_DETAIL_LOOKBACK_SCAN_STATE_WITHOUT_SLOW_FENCES

extern "C"
{
void __builtin_amdgcn_s_sleep(int);
@@ -111,6 +126,16 @@ struct lookback_scan_state<T, UseSleep, true>
return hipSuccess;
}

+ [[deprecated(
+ "Please use the overload that returns an error code; this function assumes the default"
+ " stream and silently ignores errors.")]] ROCPRIM_HOST static inline lookback_scan_state
+ create(void* temp_storage, const unsigned int number_of_blocks)
+ {
+ lookback_scan_state result;
+ (void)create(result, temp_storage, number_of_blocks, /*default stream*/ 0);
+ return result;
+ }

ROCPRIM_HOST static inline hipError_t get_storage_size(const unsigned int number_of_blocks,
const hipStream_t stream,
size_t& storage_size)
@@ -123,6 +148,15 @@ struct lookback_scan_state<T, UseSleep, true>
return error;
}

[[deprecated("Please use the overload returns an error code, this function assumes the default"
" stream and silently ignores errors.")]] ROCPRIM_HOST static inline size_t
get_storage_size(const unsigned int number_of_blocks)
{
size_t result;
(void)get_storage_size(number_of_blocks, /*default stream*/ 0, result);
return result;
}

ROCPRIM_HOST static inline hipError_t
get_temp_storage_layout(const unsigned int number_of_blocks,
const hipStream_t stream,
@@ -134,6 +168,16 @@ struct lookback_scan_state<T, UseSleep, true>
return error;
}

[[deprecated("Please use the overload returns an error code, this function assumes the default"
" stream and silently ignores errors.")]] ROCPRIM_HOST static inline detail::
temp_storage::layout
get_temp_storage_layout(const unsigned int number_of_blocks)
{
detail::temp_storage::layout result;
(void)get_temp_storage_layout(number_of_blocks, /*default stream*/ 0, result);
return result;
}

ROCPRIM_DEVICE ROCPRIM_INLINE
void initialize_prefix(const unsigned int block_id,
const unsigned int number_of_blocks)
@@ -189,7 +233,7 @@ struct lookback_scan_state<T, UseSleep, true>
const unsigned int SLEEP_MAX = 32;
unsigned int times_through = 1;

- prefix_underlying_type p = ::rocprim::detail::atomic_add(&prefixes[padding + block_id], 0);
+ prefix_underlying_type p = ::rocprim::detail::atomic_load(&prefixes[padding + block_id]);
#ifndef __HIP_CPU_RT__
__builtin_memcpy(&prefix, &p, sizeof(prefix_type));
#else
@@ -208,8 +252,8 @@ struct lookback_scan_state<T, UseSleep, true>
if (times_through < SLEEP_MAX)
times_through++;
}
- // atomic_add(..., 0) is used to load values atomically
- prefix_underlying_type p = ::rocprim::detail::atomic_add(&prefixes[padding + block_id], 0);
+ prefix_underlying_type p
+ = ::rocprim::detail::atomic_load(&prefixes[padding + block_id]);
#ifndef __HIP_CPU_RT__
__builtin_memcpy(&prefix, &p, sizeof(prefix_type));
#else
@@ -252,7 +296,7 @@ struct lookback_scan_state<T, UseSleep, true>
#else
std::memcpy(&p, &prefix, sizeof(prefix_type));
#endif
- ::rocprim::detail::atomic_exch(&prefixes[padding + block_id], p);
+ ::rocprim::detail::atomic_store(&prefixes[padding + block_id], p);
}

prefix_underlying_type * prefixes;
@@ -284,13 +328,23 @@ struct lookback_scan_state<T, UseSleep, false>
state.prefixes_flags = reinterpret_cast<flag_type*>(ptr);
ptr += ::rocprim::detail::align_size(n * sizeof(flag_type));

- state.prefixes_partial_values = reinterpret_cast<T*>(ptr);
- ptr += ::rocprim::detail::align_size(n * sizeof(T));
+ state.prefixes_partial_values = ptr;
+ ptr += ::rocprim::detail::align_size(n * sizeof(value_underlying_type));

- state.prefixes_complete_values = reinterpret_cast<T*>(ptr);
+ state.prefixes_complete_values = ptr;
return error;
}

+ [[deprecated(
+ "Please use the overload that returns an error code; this function assumes the default"
+ " stream and silently ignores errors.")]] ROCPRIM_HOST static inline lookback_scan_state
+ create(void* temp_storage, const unsigned int number_of_blocks)
+ {
+ lookback_scan_state result;
+ (void)create(result, temp_storage, number_of_blocks, /*default stream*/ 0);
+ return result;
+ }

ROCPRIM_HOST static inline hipError_t get_storage_size(const unsigned int number_of_blocks,
const hipStream_t stream,
size_t& storage_size)
@@ -299,25 +353,45 @@ struct lookback_scan_state<T, UseSleep, false>
hipError_t error = ::rocprim::host_warp_size(stream, warp_size);
const auto n = warp_size + number_of_blocks;
storage_size = ::rocprim::detail::align_size(n * sizeof(flag_type));
- storage_size += 2 * ::rocprim::detail::align_size(n * sizeof(T));
+ // Always use sizeof(value_underlying_type) instead of sizeof(T) because the storage is
+ // allocated by the host, so it can hold both types no matter what device is used.
+ storage_size += 2 * ::rocprim::detail::align_size(n * sizeof(value_underlying_type));
return error;
}

[[deprecated("Please use the overload returns an error code, this function assumes the default"
" stream and silently ignores errors.")]] ROCPRIM_HOST static inline size_t
get_storage_size(const unsigned int number_of_blocks)
{
size_t result;
(void)get_storage_size(number_of_blocks, /*default stream*/ 0, result);
return result;
}

ROCPRIM_HOST static inline hipError_t
get_temp_storage_layout(const unsigned int number_of_blocks,
const hipStream_t stream,
detail::temp_storage::layout& layout)
{
size_t storage_size = 0;
- size_t alignment = std::max(alignof(flag_type), alignof(T));
+ size_t alignment = std::max({alignof(flag_type), alignof(T), alignof(value_underlying_type)});
hipError_t error = get_storage_size(number_of_blocks, stream, storage_size);
layout = detail::temp_storage::layout{storage_size, alignment};
return error;
}

- ROCPRIM_DEVICE ROCPRIM_INLINE
- void initialize_prefix(const unsigned int block_id,
- const unsigned int number_of_blocks)
+ [[deprecated("Please use the overload that returns an error code; this function assumes the default"
+ " stream and silently ignores errors.")]] ROCPRIM_HOST static inline detail::
+ temp_storage::layout
+ get_temp_storage_layout(const unsigned int number_of_blocks)
+ {
+ detail::temp_storage::layout result;
+ (void)get_temp_storage_layout(number_of_blocks, /*default stream*/ 0, result);
+ return result;
+ }

+ ROCPRIM_DEVICE ROCPRIM_INLINE void initialize_prefix(const unsigned int block_id,
+ const unsigned int number_of_blocks)
{
constexpr unsigned int padding = ::rocprim::device_warp_size();
if(block_id < number_of_blocks)
Expand All @@ -330,38 +404,25 @@ struct lookback_scan_state<T, UseSleep, false>
}
}

- ROCPRIM_DEVICE ROCPRIM_INLINE
- void set_partial(const unsigned int block_id, const T value)
+ ROCPRIM_DEVICE ROCPRIM_INLINE void set_partial(const unsigned int block_id, const T value)
{
- constexpr unsigned int padding = ::rocprim::device_warp_size();
-
- prefixes_partial_values[padding + block_id] = value;
- ::rocprim::detail::memory_fence_device();
- ::rocprim::detail::atomic_exch(&prefixes_flags[padding + block_id], PREFIX_PARTIAL);
+ this->set(block_id, PREFIX_PARTIAL, value);
}

- ROCPRIM_DEVICE ROCPRIM_INLINE
- void set_complete(const unsigned int block_id, const T value)
+ ROCPRIM_DEVICE ROCPRIM_INLINE void set_complete(const unsigned int block_id, const T value)
{
- constexpr unsigned int padding = ::rocprim::device_warp_size();
-
- prefixes_complete_values[padding + block_id] = value;
- ::rocprim::detail::memory_fence_device();
- ::rocprim::detail::atomic_exch(&prefixes_flags[padding + block_id], PREFIX_COMPLETE);
+ this->set(block_id, PREFIX_COMPLETE, value);
}

// block_id must be > 0
- ROCPRIM_DEVICE ROCPRIM_INLINE
- void get(const unsigned int block_id, flag_type& flag, T& value)
+ ROCPRIM_DEVICE ROCPRIM_INLINE void get(const unsigned int block_id, flag_type& flag, T& value)
{
constexpr unsigned int padding = ::rocprim::device_warp_size();

const unsigned int SLEEP_MAX = 32;
unsigned int times_through = 1;

- // atomic_add(..., 0) is used to load values atomically
- flag = ::rocprim::detail::atomic_add(&prefixes_flags[padding + block_id], 0);
- ::rocprim::detail::memory_fence_device();
+ flag = ::rocprim::detail::atomic_load(&prefixes_flags[padding + block_id]);
while(flag == PREFIX_EMPTY)
{
if (UseSleep)
Expand All @@ -376,14 +437,26 @@ struct lookback_scan_state<T, UseSleep, false>
times_through++;
}

- flag = ::rocprim::detail::atomic_add(&prefixes_flags[padding + block_id], 0);
- ::rocprim::detail::memory_fence_device();
+ flag = ::rocprim::detail::atomic_load(&prefixes_flags[padding + block_id]);
}
+ #if ROCPRIM_DETAIL_LOOKBACK_SCAN_STATE_WITHOUT_SLOW_FENCES
+ rocprim::detail::atomic_fence_acquire_order_only();
+
+ const auto* values = static_cast<const value_underlying_type*>(
+ flag == PREFIX_PARTIAL ? prefixes_partial_values : prefixes_complete_values);
+ value_underlying_type v;
+ for(unsigned int i = 0; i < value_underlying_type::words_no; ++i)
+ {
+ v.words[i] = ::rocprim::detail::atomic_load(&values[padding + block_id].words[i]);
+ }
+ __builtin_memcpy(&value, &v, sizeof(value));
+ #else
::rocprim::detail::memory_fence_device();

- if(flag == PREFIX_PARTIAL)
- value = prefixes_partial_values[padding + block_id];
- else
- value = prefixes_complete_values[padding + block_id];
+ const auto* values = static_cast<const T*>(
+ flag == PREFIX_PARTIAL ? prefixes_partial_values : prefixes_complete_values);
+ value = values[padding + block_id];
+ #endif
}

/// \brief Gets the prefix value for a block. Should only be called after all
@@ -392,17 +465,63 @@
{
constexpr unsigned int padding = ::rocprim::device_warp_size();

+ #if ROCPRIM_DETAIL_LOOKBACK_SCAN_STATE_WITHOUT_SLOW_FENCES
+ T value;
+ const auto* values = static_cast<const value_underlying_type*>(prefixes_complete_values);
+ value_underlying_type v;
+ for(unsigned int i = 0; i < value_underlying_type::words_no; ++i)
+ {
+ v.words[i] = ::rocprim::detail::atomic_load(&values[padding + block_id].words[i]);
+ }
+ __builtin_memcpy(&value, &v, sizeof(value));
+ return value;
+ #else
assert(prefixes_flags[padding + block_id] == PREFIX_COMPLETE);
- return prefixes_complete_values[padding + block_id];
+ const auto* values = static_cast<const T*>(prefixes_complete_values);
+ return values[padding + block_id];
+ #endif
}

+ private:
+ ROCPRIM_DEVICE ROCPRIM_INLINE void
+ set(const unsigned int block_id, const flag_type flag, const T value)
+ {
+ constexpr unsigned int padding = ::rocprim::device_warp_size();
+
+ #if ROCPRIM_DETAIL_LOOKBACK_SCAN_STATE_WITHOUT_SLOW_FENCES
+ auto* values = static_cast<value_underlying_type*>(
+ flag == PREFIX_PARTIAL ? prefixes_partial_values : prefixes_complete_values);
+ value_underlying_type v;
+ __builtin_memcpy(&v, &value, sizeof(value));
+ for(unsigned int i = 0; i < value_underlying_type::words_no; ++i)
+ {
+ ::rocprim::detail::atomic_store(&values[padding + block_id].words[i], v.words[i]);
+ }
+ // Wait for all atomic stores of prefixes_*_values before signaling complete / partial state
+ rocprim::detail::atomic_fence_release_vmem_order_only();
+ #else
+ auto* values = static_cast<T*>(flag == PREFIX_PARTIAL ? prefixes_partial_values
+ : prefixes_complete_values);
+ values[padding + block_id] = value;
+ ::rocprim::detail::memory_fence_device();
+ #endif
+
+ ::rocprim::detail::atomic_store(&prefixes_flags[padding + block_id], flag);
+ }
+
+ struct value_underlying_type
+ {
+ static constexpr int words_no = ceiling_div(sizeof(T), sizeof(unsigned int));
+
+ unsigned int words[words_no];
+ };

flag_type * prefixes_flags;
// We need separate arrays for partial and complete prefixes, because a
// value can be overwritten before the flag is changed (the flag and value
// are not stored in a single instruction).
- T * prefixes_partial_values;
- T * prefixes_complete_values;
+ void* prefixes_partial_values;
+ void* prefixes_complete_values;
};

template<class T, class BinaryFunction, class LookbackScanState>