Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

xe: reduction: fixup exceptions on zero dims #2469

Merged
Merged 1 commit into the base branch on Jan 22, 2025
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
6 changes: 5 additions & 1 deletion src/common/reduction_pd.hpp
Original file line number Diff line number Diff line change
@@ -1,5 +1,5 @@
/*******************************************************************************
* Copyright 2020-2024 Intel Corporation
* Copyright 2020-2025 Intel Corporation
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
Expand Down Expand Up @@ -131,6 +131,10 @@ struct reduction_pd_t : public primitive_desc_t {
}
}

bool has_zero_dim_memory() const {
return memory_desc_wrapper(src_md()).has_zero_dim();
}

protected:
reduction_desc_t desc_;

Expand Down
4 changes: 4 additions & 0 deletions src/gpu/intel/ocl/reduction/atomic_reduction.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -101,6 +101,10 @@ atomic_reduction_conf_t::atomic_reduction_conf_t(
conf.src_type = src_type;
conf.dst_type = dst_type;
conf.subgroup_size = device_info.max_subgroup_size();
// Short-circuit if zero-dim is present
gpu_assert(reduction_block.block != 0) << "Reducing over 0 elements";
if (outer_block.block == 0 || inner_block.block == 0) return;

auto arch = device_info.gpu_arch();
const int base_threads_per_eu
= compute::device_info_t::threads_per_eu(arch);
Expand Down
4 changes: 4 additions & 0 deletions src/gpu/intel/ocl/reduction/combined_reduction.cl
Original file line number Diff line number Diff line change
Expand Up @@ -174,7 +174,11 @@ void write_padded_zeros(__global DST_DATA_T *dst) {
}

// SLM_PER_SG: per-subgroup stride used for SLM-based (shared local memory)
// accumulation — presumably sized so each subgroup gets its own slice;
// confirm against the kernel body below.
#if INNER_DIM_SIZE < SUBGROUP_SIZE
// Zero-dim case: INNER_DIM_SIZE == 0 would yield a zero stride, so clamp
// to 1 to keep downstream SLM sizing/indexing arithmetic nonzero.
#if INNER_DIM_SIZE == 0
#define SLM_PER_SG 1
#else
#define SLM_PER_SG INNER_DIM_SIZE
#endif // INNER_DIM_SIZE == 0
#else
#define SLM_PER_SG SUBGROUP_SIZE
#endif
Expand Down
23 changes: 18 additions & 5 deletions src/gpu/intel/ocl/reduction/combined_reduction.cpp
Original file line number Diff line number Diff line change
@@ -1,5 +1,5 @@
/*******************************************************************************
* Copyright 2021-2024 Intel Corporation
* Copyright 2021-2025 Intel Corporation
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
Expand Down Expand Up @@ -80,8 +80,14 @@ reduction_phase_conf_t::reduction_phase_conf_t(
: reduction_subproblem_t(subprb)
, src_type(src_type)
, dst_type(dst_type)
, subgroup_size(compute_engine->device_info()->max_subgroup_size())
, with_block_reads(can_use_block_reads()) {
, subgroup_size(compute_engine->device_info()->max_subgroup_size()) {
// Short-circuit if zero-dim is present
gpu_assert(reduction_block.block != 0) << "Reducing over 0 elements";
if (outer_block.block == 0 || inner_block.block == 0) {
nd_range = compute::nd_range_t({0}, {into<size_t>(subgroup_size)});
return;
}
with_block_reads = can_use_block_reads();

const int num_EU = compute_engine->device_info()->eu_count();
const int max_wg_size = static_cast<int>(
Expand Down Expand Up @@ -182,12 +188,17 @@ status_t split_into_phases(const reduction_subproblem_t &subprb,
const compute::compute_engine_t *compute_engine,
std::vector<reduction_phase_conf_t> &phases, bool large_grf_mode) {
const dim_t reduction_elems = subprb.reduction_block.block;
reduction_phase_conf_t try_phase(subprb, accum_data_type, accum_data_type,
compute_engine, large_grf_mode);
// Zero-dim short circuit
if (try_phase.outer_block.block == 0 || try_phase.inner_block.block == 0) {
phases.emplace_back(try_phase);
return status::success;
}

//Heuristic:
// subsplitting has a high cost due to launching multiple sequential threads,
// so only split when parallelism is low and reductions per thread is large
reduction_phase_conf_t try_phase(subprb, accum_data_type, accum_data_type,
compute_engine, large_grf_mode);
const bool low_parallelism = [&compute_engine, &large_grf_mode,
&try_phase]() {
compute::gpu_arch_t arch = compute_engine->device_info()->gpu_arch();
Expand Down Expand Up @@ -453,6 +464,8 @@ status_t combined_reduction_t::pd_t::init_kernel_ctx(
}

status_t combined_reduction_t::execute_combined(const exec_ctx_t &ctx) const {
if (pd()->has_zero_dim_memory()) return status::success;

auto &src = CTX_IN_STORAGE(DNNL_ARG_SRC);
auto &dst = CTX_OUT_STORAGE(DNNL_ARG_DST);
std::unique_ptr<memory_storage_t> sp_reduce[2]
Expand Down
Loading