From cdd2f7033ae56c81e1bbf8c5ee2e9675ea1bebec Mon Sep 17 00:00:00 2001 From: Simon Ewing <simon.ewing@intel.com> Date: Tue, 21 Jan 2025 13:58:05 -0800 Subject: [PATCH] xe: reduction: fixup exceptions on zero dims --- src/common/reduction_pd.hpp | 6 ++++- .../intel/ocl/reduction/atomic_reduction.cpp | 4 ++++ .../intel/ocl/reduction/combined_reduction.cl | 4 ++++ .../ocl/reduction/combined_reduction.cpp | 23 +++++++++++++++---- 4 files changed, 31 insertions(+), 6 deletions(-) diff --git a/src/common/reduction_pd.hpp b/src/common/reduction_pd.hpp index 0f1aef10011..b0dd46aabb0 100644 --- a/src/common/reduction_pd.hpp +++ b/src/common/reduction_pd.hpp @@ -1,5 +1,5 @@ /******************************************************************************* -* Copyright 2020-2024 Intel Corporation +* Copyright 2020-2025 Intel Corporation * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -131,6 +131,10 @@ struct reduction_pd_t : public primitive_desc_t { } } + bool has_zero_dim_memory() const { + return memory_desc_wrapper(src_md()).has_zero_dim(); + } + protected: reduction_desc_t desc_; diff --git a/src/gpu/intel/ocl/reduction/atomic_reduction.cpp b/src/gpu/intel/ocl/reduction/atomic_reduction.cpp index cc5330c59d4..e59fb768ee7 100644 --- a/src/gpu/intel/ocl/reduction/atomic_reduction.cpp +++ b/src/gpu/intel/ocl/reduction/atomic_reduction.cpp @@ -101,6 +101,10 @@ atomic_reduction_conf_t::atomic_reduction_conf_t( conf.src_type = src_type; conf.dst_type = dst_type; conf.subgroup_size = device_info.max_subgroup_size(); + // Short-circuit if zero-dim is present + gpu_assert(reduction_block.block != 0) << "Reducing over 0 elements"; + if (outer_block.block == 0 || inner_block.block == 0) return; + auto arch = device_info.gpu_arch(); const int base_threads_per_eu = compute::device_info_t::threads_per_eu(arch); diff --git a/src/gpu/intel/ocl/reduction/combined_reduction.cl b/src/gpu/intel/ocl/reduction/combined_reduction.cl index 5497adb1c97..7be5297b1b8 100644 --- a/src/gpu/intel/ocl/reduction/combined_reduction.cl +++ b/src/gpu/intel/ocl/reduction/combined_reduction.cl @@ -174,7 +174,11 @@ void write_padded_zeros(__global DST_DATA_T *dst) { } #if INNER_DIM_SIZE < SUBGROUP_SIZE +#if INNER_DIM_SIZE == 0 +#define SLM_PER_SG 1 +#else #define SLM_PER_SG INNER_DIM_SIZE +#endif // INNER_DIM_SIZE == 0 #else #define SLM_PER_SG SUBGROUP_SIZE #endif diff --git a/src/gpu/intel/ocl/reduction/combined_reduction.cpp b/src/gpu/intel/ocl/reduction/combined_reduction.cpp index 02a786cac80..996a684f406 100644 --- a/src/gpu/intel/ocl/reduction/combined_reduction.cpp +++ b/src/gpu/intel/ocl/reduction/combined_reduction.cpp @@ -1,5 +1,5 @@ /******************************************************************************* -* Copyright 2021-2024 Intel Corporation +* Copyright 2021-2025 Intel Corporation * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -80,8 +80,14 @@ reduction_phase_conf_t::reduction_phase_conf_t( : reduction_subproblem_t(subprb) , src_type(src_type) , dst_type(dst_type) - , subgroup_size(compute_engine->device_info()->max_subgroup_size()) - , with_block_reads(can_use_block_reads()) { + , subgroup_size(compute_engine->device_info()->max_subgroup_size()) { + // Short-circuit if zero-dim is present + gpu_assert(reduction_block.block != 0) << "Reducing over 0 elements"; + if (outer_block.block == 0 || inner_block.block == 0) { + nd_range = compute::nd_range_t({0}, {into<size_t>(subgroup_size)}); + return; + } + with_block_reads = can_use_block_reads(); const int num_EU = compute_engine->device_info()->eu_count(); const int max_wg_size = static_cast<int>( @@ -182,12 +188,17 @@ status_t split_into_phases(const reduction_subproblem_t &subprb, const compute::compute_engine_t *compute_engine, std::vector<reduction_phase_conf_t> &phases, bool large_grf_mode) { const dim_t reduction_elems = subprb.reduction_block.block; + reduction_phase_conf_t try_phase(subprb, accum_data_type, accum_data_type, + compute_engine, large_grf_mode); + // Zero-dim short circuit + if (try_phase.outer_block.block == 0 || try_phase.inner_block.block == 0) { + phases.emplace_back(try_phase); + return status::success; + } //Heuristic: // subsplitting has a high cost due to launching multiple sequential threads, // so only split when parallelism is low and reductions per thread is large - reduction_phase_conf_t try_phase(subprb, accum_data_type, accum_data_type, - compute_engine, large_grf_mode); const bool low_parallelism = [&compute_engine, &large_grf_mode, &try_phase]() { compute::gpu_arch_t arch = compute_engine->device_info()->gpu_arch(); @@ -453,6 +464,8 @@ status_t combined_reduction_t::pd_t::init_kernel_ctx( } status_t combined_reduction_t::execute_combined(const exec_ctx_t &ctx) const { + if (pd()->has_zero_dim_memory()) return status::success; + auto &src = CTX_IN_STORAGE(DNNL_ARG_SRC); auto &dst = CTX_OUT_STORAGE(DNNL_ARG_DST); std::unique_ptr<memory_storage_t> sp_reduce[2]