From 7b3f824daa0e2c5485230eb18b1edab3b97d94b8 Mon Sep 17 00:00:00 2001 From: Seppo Ingalsuo Date: Tue, 26 Nov 2024 19:32:47 +0200 Subject: [PATCH] Math: IIR DF1: Optimize IIR core for Xtensa HiFi5 This patch adds iir_df1_hifi5.c that is a modified version of iir_df1_hifi4.c. The coefficient and data loads are 128 bits wide when possible. The data load is fastest with the non-aligning load instruction, so the iir->delay address needs to be 128 bits / 16 bytes aligned. In a sof-testbench4 run the updated version saves 2.1 MCPS, from 10.4 to 8.3 MCPS, for the 10th order filter used. The test run command used for the HiFi5 build of sof-testbench4 was "scripts/sof-testbench-helper.sh -x -m eqiir". Signed-off-by: Seppo Ingalsuo --- src/math/CMakeLists.txt | 2 +- src/math/iir_df1_hifi4.c | 2 +- src/math/iir_df1_hifi5.c | 113 ++++++++++++++++++++ test/cmocka/src/audio/eq_iir/CMakeLists.txt | 1 + zephyr/CMakeLists.txt | 1 + 5 files changed, 117 insertions(+), 2 deletions(-) create mode 100644 src/math/iir_df1_hifi5.c diff --git a/src/math/CMakeLists.txt b/src/math/CMakeLists.txt index b2f266827b60..9b506eb0b85e 100644 --- a/src/math/CMakeLists.txt +++ b/src/math/CMakeLists.txt @@ -39,7 +39,7 @@ add_local_sources_ifdef(CONFIG_MATH_IIR_DF2T sof iir_df2t_generic.c iir_df2t_hifi3.c iir_df2t.c) add_local_sources_ifdef(CONFIG_MATH_IIR_DF1 sof - iir_df1_generic.c iir_df1_hifi3.c iir_df1_hifi4.c iir_df1.c) + iir_df1_generic.c iir_df1_hifi3.c iir_df1_hifi4.c iir_df1_hifi5.c iir_df1.c) if(CONFIG_MATH_WINDOW) add_local_sources(sof window.c) diff --git a/src/math/iir_df1_hifi4.c b/src/math/iir_df1_hifi4.c index ee756fd93e26..5d2fa5e5825a 100644 --- a/src/math/iir_df1_hifi4.c +++ b/src/math/iir_df1_hifi4.c @@ -14,7 +14,7 @@ #include -#if SOF_USE_MIN_HIFI(4, FILTER) +#if SOF_USE_HIFI(4, FILTER) /* * Direct form I second order filter block (biquad) diff --git a/src/math/iir_df1_hifi5.c b/src/math/iir_df1_hifi5.c new file mode 100644 index 000000000000..86dc91ab2fd0 --- /dev/null +++ b/src/math/iir_df1_hifi5.c @@ -0,0 +1,113 @@ +// 
SPDX-License-Identifier: BSD-3-Clause +// +// Copyright(c) 2022-2024 Intel Corporation. +// +// Author: Seppo Ingalsuo + +#include +#include +#include +#include +#include +#include +#include + +#include + +#if SOF_USE_MIN_HIFI(5, FILTER) + +/* + * Direct form I second order filter block (biquad) + * + * +----+ +---+ +-------+ + * X(z) ---o--->| b0 |---> + --+-------------o--->| g |--->| shift |---> Y(z) + * | +----+ ^ ^ | +---+ +-------+ + * | | | | + * +------+ | | +------+ + * | z^-1 | | | | z^-1 | + * +------+ | | +------+ + * | +----+ | | +----+ | + * o--->| b1 |---> + + <---| a1 |---o + * | +----+ ^ ^ +----+ | + * | | | | + * +------+ | | +------+ + * | z^-1 | | | | z^-1 | + * +------+ | | +------+ + * | ^ ^ | + * | +----+ | | +----+ | + * o--->| b2 |---> + +<--- | a2 |---o + * +----+ +----+ + * + * y[n] = b0 * x[n] + b1 * x[n-1] + b2 * x[n-2] + a1 * y[n-1] + a2 * y[n-2] + * The a1 and a2 coefficients are used as already negated, so the feedback terms are added. + */ + +/* Series DF1 IIR with 32 bit data, 32 bit coefficients and 32 bit state variables. + * Returns x as such when iir->biquads is zero. Otherwise every group of + * iir->biquads_in_series cascaded biquads is fed with x and the group outputs are summed. + */ +int32_t iir_df1(struct iir_state_df1 *iir, int32_t x) +{ + ae_valignx2 coef_align; + ae_f64 acc; + ae_int32x2 coef_a2a1; + ae_int32x2 coef_b2b1; + ae_int32x2 coef_b0; + ae_int32x2 gain; + ae_int32x2 delay_y2y1; + ae_int32x2 delay_x2x1; + ae_int32x2 shift; + ae_int32 in; + ae_int32 out = 0; + ae_int32x4 *coefp = (ae_int32x4 *)iir->coef; + ae_int32x4 *delay = (ae_int32x4 *)iir->delay; + int i; + int j; + int nseries = iir->biquads_in_series; + + /* Bypass is set with number of biquads set to zero. 
*/ + if (!iir->biquads) + return x; + + /* Coefficients order in coef[] is {a2, a1, b2, b1, b0, shift, gain} */ + /* Delay order in delay[] is {y(n - 2), y(n - 1), x(n - 2), x(n - 1)} */ + for (j = 0; j < iir->biquads; j += nseries) { + in = x; /* Every group of series biquads starts from x */ + for (i = 0; i < nseries; i++) { + /* Load data, iir->delay needs to be 16 bytes aligned */ + AE_L32X2X2_IP(delay_y2y1, delay_x2x1, delay, 0); + + /* Load coefficients */ + coef_align = AE_LA128_PP(coefp); + AE_LA32X2X2_IP(coef_a2a1, coef_b2b1, coef_align, coefp); + AE_L32_IP(coef_b0, (ae_int32 *)coefp, 4); + AE_L32_IP(shift, (ae_int32 *)coefp, 4); + AE_L32_IP(gain, (ae_int32 *)coefp, 4); + + acc = AE_MULF32RA_HH(coef_b0, in); /* acc = b0 * in */ + AE_MULAAFD32RA_HH_LL(acc, coef_a2a1, delay_y2y1); /* + a2 * y2 + a1 * y1 */ + AE_MULAAFD32RA_HH_LL(acc, coef_b2b1, delay_x2x1); /* + b2 * x2 + b1 * x1 */ + AE_PKSR32(delay_y2y1, acc, 1); /* y2 = y1, y1 = acc(q1.31) */ + delay_x2x1 = AE_SEL32_LL(delay_x2x1, in); /* x2 = x1, x1 = in */ + + /* Store data and advance to the delay line of the next biquad */ + AE_S32X2X2_IP(delay_y2y1, delay_x2x1, delay, sizeof(ae_int32x4)); + + /* Apply gain */ + acc = AE_MULF32R_LL(gain, delay_y2y1); /* acc = gain * y1 */ + acc = AE_SLAI64S(acc, 17); /* Convert to Q17.47 */ + + /* Apply biquad output shift right parameter and then + * round and saturate to 32 bits Q1.31. 
+ */ + acc = AE_SRAA64(acc, shift); + in = AE_ROUND32F48SSYM(acc); + } + /* Output of the last biquad of the group is in variable in, add it to the sum */ + out = AE_F32_ADDS_F32(out, in); + } + return out; +} +EXPORT_SYMBOL(iir_df1); + +#endif diff --git a/test/cmocka/src/audio/eq_iir/CMakeLists.txt b/test/cmocka/src/audio/eq_iir/CMakeLists.txt index 47f4507c78bd..a328590b7106 100644 --- a/test/cmocka/src/audio/eq_iir/CMakeLists.txt +++ b/test/cmocka/src/audio/eq_iir/CMakeLists.txt @@ -19,6 +19,7 @@ add_library(audio_for_eq_iir STATIC ${PROJECT_SOURCE_DIR}/src/math/iir_df1_generic.c ${PROJECT_SOURCE_DIR}/src/math/iir_df1_hifi3.c ${PROJECT_SOURCE_DIR}/src/math/iir_df1_hifi4.c + ${PROJECT_SOURCE_DIR}/src/math/iir_df1_hifi5.c ${PROJECT_SOURCE_DIR}/src/math/iir_df2t.c ${PROJECT_SOURCE_DIR}/src/math/iir_df2t_generic.c ${PROJECT_SOURCE_DIR}/src/math/iir_df2t_hifi3.c diff --git a/zephyr/CMakeLists.txt b/zephyr/CMakeLists.txt index 142c3414bbef..2e96668e119b 100644 --- a/zephyr/CMakeLists.txt +++ b/zephyr/CMakeLists.txt @@ -669,6 +669,7 @@ zephyr_library_sources_ifdef(CONFIG_MATH_IIR_DF1 ${SOF_MATH_PATH}/iir_df1_generic.c ${SOF_MATH_PATH}/iir_df1_hifi3.c ${SOF_MATH_PATH}/iir_df1_hifi4.c + ${SOF_MATH_PATH}/iir_df1_hifi5.c ${SOF_MATH_PATH}/iir_df1.c )