Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

xe: reduction: fixup exceptions on zero dims #2469

Merged
Merged 1 commit into the base branch on Jan 22, 2025
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
6 changes: 5 additions & 1 deletion src/common/reduction_pd.hpp
Original file line number Diff line number Diff line change
@@ -1,5 +1,5 @@
/*******************************************************************************
* Copyright 2020-2024 Intel Corporation
* Copyright 2020-2025 Intel Corporation
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
Expand Down Expand Up @@ -131,6 +131,10 @@ struct reduction_pd_t : public primitive_desc_t {
}
}

bool has_zero_dim_memory() const {
return memory_desc_wrapper(src_md()).has_zero_dim();
}

protected:
reduction_desc_t desc_;

Expand Down
4 changes: 4 additions & 0 deletions src/gpu/intel/ocl/reduction/atomic_reduction.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -101,6 +101,10 @@ atomic_reduction_conf_t::atomic_reduction_conf_t(
conf.src_type = src_type;
conf.dst_type = dst_type;
conf.subgroup_size = device_info.max_subgroup_size();
// Short-circuit if zero-dim is present
gpu_assert(reduction_block.block != 0) << "Reducing over 0 elements";
if (outer_block.block == 0 || inner_block.block == 0) return;

auto arch = device_info.gpu_arch();
const int base_threads_per_eu
= compute::device_info_t::threads_per_eu(arch);
Expand Down
4 changes: 4 additions & 0 deletions src/gpu/intel/ocl/reduction/combined_reduction.cl
Original file line number Diff line number Diff line change
Expand Up @@ -174,7 +174,11 @@ void write_padded_zeros(__global DST_DATA_T *dst) {
}

// SLM_PER_SG: per-subgroup stride used for SLM-based (shared local memory)
// accumulation — presumably sized so each subgroup gets its own slice;
// confirm against the kernel body below.
#if INNER_DIM_SIZE < SUBGROUP_SIZE
// Zero-dim case: INNER_DIM_SIZE == 0 would yield a zero stride, so clamp
// to 1 to keep downstream SLM sizing/indexing arithmetic nonzero.
#if INNER_DIM_SIZE == 0
#define SLM_PER_SG 1
#else
#define SLM_PER_SG INNER_DIM_SIZE
#endif // INNER_DIM_SIZE == 0
#else
#define SLM_PER_SG SUBGROUP_SIZE
#endif
Expand Down
23 changes: 18 additions & 5 deletions src/gpu/intel/ocl/reduction/combined_reduction.cpp
Original file line number Diff line number Diff line change
@@ -1,5 +1,5 @@
/*******************************************************************************
* Copyright 2021-2024 Intel Corporation
* Copyright 2021-2025 Intel Corporation
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
Expand Down Expand Up @@ -80,8 +80,14 @@ reduction_phase_conf_t::reduction_phase_conf_t(
: reduction_subproblem_t(subprb)
, src_type(src_type)
, dst_type(dst_type)
, subgroup_size(compute_engine->device_info()->max_subgroup_size())
, with_block_reads(can_use_block_reads()) {
, subgroup_size(compute_engine->device_info()->max_subgroup_size()) {
// Short-circuit if zero-dim is present
gpu_assert(reduction_block.block != 0) << "Reducing over 0 elements";
if (outer_block.block == 0 || inner_block.block == 0) {
nd_range = compute::nd_range_t({0}, {into<size_t>(subgroup_size)});
return;
}
with_block_reads = can_use_block_reads();

const int num_EU = compute_engine->device_info()->eu_count();
const int max_wg_size = static_cast<int>(
Expand Down Expand Up @@ -182,12 +188,17 @@ status_t split_into_phases(const reduction_subproblem_t &subprb,
const compute::compute_engine_t *compute_engine,
std::vector<reduction_phase_conf_t> &phases, bool large_grf_mode) {
const dim_t reduction_elems = subprb.reduction_block.block;
reduction_phase_conf_t try_phase(subprb, accum_data_type, accum_data_type,
compute_engine, large_grf_mode);
// Zero-dim short circuit
if (try_phase.outer_block.block == 0 || try_phase.inner_block.block == 0) {
phases.emplace_back(try_phase);
return status::success;
}

//Heuristic:
// subsplitting has a high cost due to launching multiple sequential threads,
// so only split when parallelism is low and reductions per thread is large
reduction_phase_conf_t try_phase(subprb, accum_data_type, accum_data_type,
compute_engine, large_grf_mode);
const bool low_parallelism = [&compute_engine, &large_grf_mode,
&try_phase]() {
compute::gpu_arch_t arch = compute_engine->device_info()->gpu_arch();
Expand Down Expand Up @@ -453,6 +464,8 @@ status_t combined_reduction_t::pd_t::init_kernel_ctx(
}

status_t combined_reduction_t::execute_combined(const exec_ctx_t &ctx) const {
if (pd()->has_zero_dim_memory()) return status::success;

auto &src = CTX_IN_STORAGE(DNNL_ARG_SRC);
auto &dst = CTX_OUT_STORAGE(DNNL_ARG_DST);
std::unique_ptr<memory_storage_t> sp_reduce[2]
Expand Down
Loading