From 170db41db0305663ca805ed26e59075852c05820 Mon Sep 17 00:00:00 2001 From: sekrit-twc Date: Sat, 13 Jan 2018 20:34:20 -0800 Subject: [PATCH] Fix AVX2 3DLUT interpolation. --- timecube/lut_avx2.cpp | 17 +++++++++++------ 1 file changed, 11 insertions(+), 6 deletions(-) diff --git a/timecube/lut_avx2.cpp b/timecube/lut_avx2.cpp index 82bd0de..7194e47 100644 --- a/timecube/lut_avx2.cpp +++ b/timecube/lut_avx2.cpp @@ -211,6 +211,11 @@ static inline FORCE_INLINE __m256 lut3d_trilinear_interp(const void *lut, ptrdif __m256 r, __m256 g, __m256 b) { #define LUT_OFFSET(x) reinterpret_cast<const float *>(static_cast<const unsigned char *>(lut) + (x)) + __m256 g_lo = _mm256_permute2f128_ps(g, g, 0x00); + __m256 b_lo = _mm256_permute2f128_ps(b, b, 0x00); + __m256 g_hi = _mm256_permute2f128_ps(g, g, 0x11); + __m256 b_hi = _mm256_permute2f128_ps(b, b, 0x11); + __m256 g0b0_a, g0b1_a, g1b0_a, g1b1_a; __m256 g0b0_b, g0b1_b, g1b0_b, g1b1_b; @@ -219,20 +224,20 @@ static inline FORCE_INLINE __m256 lut3d_trilinear_interp(const void *lut, ptrdif g0b1_a = _mm256_loadu_ps(LUT_OFFSET(idx_lo + stride_b)); g1b1_a = _mm256_loadu_ps(LUT_OFFSET(idx_lo + stride_b + stride_g)); - g0b0_a = mm256_interp_ps(g0b0_a, g1b0_a, g); - g0b1_a = mm256_interp_ps(g0b1_a, g1b1_a, g); + g0b0_a = mm256_interp_ps(g0b0_a, g1b0_a, g_lo); + g0b1_a = mm256_interp_ps(g0b1_a, g1b1_a, g_lo); - g0b0_a = mm256_interp_ps(g0b0_a, g0b1_a, b); + g0b0_a = mm256_interp_ps(g0b0_a, g0b1_a, b_lo); g0b0_b = _mm256_loadu_ps(LUT_OFFSET(idx_hi)); g1b0_b = _mm256_loadu_ps(LUT_OFFSET(idx_hi + stride_g)); g0b1_b = _mm256_loadu_ps(LUT_OFFSET(idx_hi + stride_b)); g1b1_b = _mm256_loadu_ps(LUT_OFFSET(idx_hi + stride_b + stride_g)); - g0b0_b = mm256_interp_ps(g0b0_b, g1b0_b, g); - g0b1_b = mm256_interp_ps(g0b1_b, g1b1_b, g); + g0b0_b = mm256_interp_ps(g0b0_b, g1b0_b, g_hi); + g0b1_b = mm256_interp_ps(g0b1_b, g1b1_b, g_hi); - g0b0_b = mm256_interp_ps(g0b0_b, g0b1_b, b); + g0b0_b = mm256_interp_ps(g0b0_b, g0b1_b, b_hi); mm256_transpose2_ps128(g0b0_a, g0b0_b); g0b0_a = 
mm256_interp_ps(g0b0_a, g0b0_b, r);