Skip to content

Commit 3b7b6f9

Browse files
authored
Merge pull request #19 from wx257osn2/improve-encoder-performance
Improve encoder performance
2 parents 7c23a9d + f381f6d commit 3b7b6f9

File tree

1 file changed

+111
-71
lines changed

1 file changed

+111
-71
lines changed

include/qoixx.hpp

+111 -71
Original file line number | Diff line number | Diff line change
@@ -248,6 +248,19 @@ class qoi{
248248
};
249249
struct rgb_t{
250250
std::uint8_t r, g, b;
251+
inline std::uint32_t v()const{
252+
static_assert(sizeof(rgb_t) == 3u);
253+
if constexpr(std::endian::native == std::endian::little){
254+
std::uint32_t x = 255u << 24u;
255+
efficient_memcpy<3>(&x, this);
256+
return x;
257+
}
258+
else
259+
return std::uint32_t{r} |
260+
std::uint32_t{g} << 8 |
261+
std::uint32_t{b} << 16 |
262+
255u << 24;
263+
}
251264
inline std::uint_fast32_t hash()const{
252265
static constexpr std::uint64_t constant =
253266
static_cast<std::uint64_t>(3u) << 56 |
@@ -302,12 +315,29 @@ class qoi{
302315
}
303316
}
304317
private:
318+
template<bool Alpha>
319+
using local_rgba_pixel_t = std::conditional_t<Alpha, rgba_t, rgb_t>;
320+
template<bool Alpha>
321+
static constexpr local_rgba_pixel_t<Alpha> default_pixel()noexcept{
322+
if constexpr(Alpha)
323+
return {0, 0, 0, 255};
324+
else
325+
return {};
326+
}
327+
template<bool Alpha>
328+
struct local_pixel{
329+
std::uint8_t rgb = static_cast<std::uint8_t>(chunk_tag::rgb);
330+
local_rgba_pixel_t<Alpha> v;
331+
};
332+
static_assert(std::has_unique_object_representations_v<local_pixel<true>> and std::has_unique_object_representations_v<local_pixel<false>>);
305333
template<std::uint_fast8_t Channels, typename Pusher, typename Puller>
306-
static inline void encode_body(Pusher& p, Puller& pixels, rgba_t (&index)[index_size], std::size_t px_len, rgba_t px_prev = {0, 0, 0, 255}, std::uint8_t prev_hash = static_cast<std::uint8_t>(index_size), std::size_t run = 0){
307-
const auto f = [&run, &index, &p, &prev_hash](rgba_t px, rgba_t px_prev){
308-
if(px == px_prev){
334+
static inline void encode_body(Pusher& p, Puller& pixels, rgba_t (&index)[index_size], std::size_t px_len, local_rgba_pixel_t<Channels == 4u> px_prev = default_pixel<Channels == 4u>(), std::uint8_t prev_hash = static_cast<std::uint8_t>(index_size), std::size_t run = 0){
335+
local_pixel<Channels == 4u> px;
336+
while(px_len--)[[likely]]{
337+
pull<Channels>(&px.v, pixels);
338+
if(px.v.v() == px_prev.v()){
309339
++run;
310-
return;
340+
continue;
311341
}
312342
if(run > 0){
313343
while(run >= 62)[[unlikely]]{
@@ -328,57 +358,55 @@ class qoi{
328358
}
329359
}
330360

331-
const auto index_pos = px.hash() % index_size;
361+
const auto index_pos = px.v.hash() % index_size;
332362
prev_hash = index_pos;
333363

334-
if(index[index_pos] == px){
335-
p.push(chunk_tag::index | index_pos);
336-
return;
337-
}
338-
index[index_pos] = px;
339-
340-
if constexpr(Channels == 4)
341-
if(px.a != px_prev.a){
342-
p.push(chunk_tag::rgba);
343-
push<4>(p, &px);
344-
return;
364+
do{
365+
if(index[index_pos].v() == px.v.v()){
366+
p.push(chunk_tag::index | index_pos);
367+
break;
345368
}
346-
const auto vr = static_cast<int>(px.r) - static_cast<int>(px_prev.r) + 2;
347-
const auto vg = static_cast<int>(px.g) - static_cast<int>(px_prev.g) + 2;
348-
const auto vb = static_cast<int>(px.b) - static_cast<int>(px_prev.b) + 2;
369+
efficient_memcpy<Channels>(index + index_pos, &px.v);
370+
if constexpr(Channels == 3)
371+
index[index_pos].a = 255u;
349372

350-
if(const std::uint8_t v = vr|vg|vb; v < 4){
351-
p.push(chunk_tag::diff | vr << 4 | vg << 2 | vb);
352-
return;
353-
}
354-
const auto vg_r = vr - vg + 8;
355-
const auto vg_b = vb - vg + 8;
356-
if(const int v = vg_r|vg_b, g = vg+30; ((v&0xf0)|(g&0xc0)) == 0){
357-
p.push(chunk_tag::luma | g);
358-
p.push(vg_r << 4 | vg_b);
359-
}
360-
else{
361-
p.push(chunk_tag::rgb);
362-
push<3>(p, &px);
363-
}
364-
};
365-
auto px = px_prev;
366-
while(px_len--)[[likely]]{
367-
px_prev = px;
368-
pull<Channels>(&px, pixels);
369-
f(px, px_prev);
373+
if constexpr(Channels == 4)
374+
if(px.v.a != px_prev.a){
375+
p.push(chunk_tag::rgba);
376+
push<4>(p, &px.v);
377+
break;
378+
}
379+
const auto vg_2 = static_cast<int>(px.v.g) - static_cast<int>(px_prev.g);
380+
if(const std::uint8_t g = vg_2+32; g < 64){
381+
const auto vr = static_cast<int>(px.v.r) - static_cast<int>(px_prev.r) + 2;
382+
const auto vg = vg_2 + 2;
383+
const auto vb = static_cast<int>(px.v.b) - static_cast<int>(px_prev.b) + 2;
384+
385+
if(static_cast<std::uint8_t>(vr|vg|vb) < 4){
386+
p.push(chunk_tag::diff | vr << 4 | vg << 2 | vb);
387+
break;
388+
}
389+
const auto vg_r = vr - vg + 8;
390+
const auto vg_b = vb - vg + 8;
391+
if(static_cast<std::uint8_t>(vg_r|vg_b) < 16){
392+
p.push(chunk_tag::luma | g);
393+
p.push(vg_r << 4 | vg_b);
394+
}
395+
else
396+
push<4>(p, &px);
397+
}
398+
else
399+
push<4>(p, &px);
400+
}while(false);
401+
efficient_memcpy<Channels>(&px_prev, &px.v);
370402
}
371-
if(px == px_prev){
372-
while(run >= 62)[[unlikely]]{
373-
static constexpr std::uint8_t x = chunk_tag::run | 61;
374-
p.push(x);
375-
run -= 62;
376-
}
377-
if(run > 0){
378-
p.push(chunk_tag::run | (run-1));
379-
run = 0;
380-
}
403+
while(run >= 62)[[unlikely]]{
404+
static constexpr std::uint8_t x = chunk_tag::run | 61;
405+
p.push(x);
406+
run -= 62;
381407
}
408+
if(run > 0)
409+
p.push(chunk_tag::run | (run-1));
382410
}
383411
#ifndef QOIXX_NO_SIMD
384412
#if defined(__ARM_FEATURE_SVE)
@@ -488,12 +516,12 @@ class qoi{
488516
hash = svand_n_u8_x(mask, svadd_u8_x(mask, svadd_u8_x(mask, svmul_n_u8_x(mask, get<0>(pxs), 3), svmul_n_u8_x(mask, get<1>(pxs), 5)), svadd_u8_x(mask, svmul_n_u8_x(mask, get<2>(pxs), 7), svmul_n_u8_x(mask, get<3>(pxs), 11))), 63);
489517
else
490518
hash = svand_n_u8_x(mask, svadd_u8_x(mask, svadd_u8_x(mask, svmul_n_u8_x(mask, get<0>(pxs), 3), svmul_n_u8_x(mask, get<1>(pxs), 5)), svadd_n_u8_x(mask, svmul_n_u8_x(mask, get<2>(pxs), 7), static_cast<std::uint8_t>(255*11))), 63);
491-
std::uint8_t runs[SVERegisterSize/8], diffs[SVERegisterSize/8], lus[SVERegisterSize/8], mas[SVERegisterSize/8], hashs[SVERegisterSize/8];
519+
std::uint8_t runs[SVERegisterSize/8], diffs[SVERegisterSize/8], lumas[SVERegisterSize/8*2], hashs[SVERegisterSize/8];
492520
[[maybe_unused]] std::uint8_t alphas[SVERegisterSize/8];
493521
svst1_u8(mask, runs, svadd_n_u8_m(runv, zero, 1));
494522
svst1_u8(mask, diffs, diffv);
495-
svst1_u8(mask, lus, lu);
496-
svst1_u8(mask, mas, ma);
523+
const auto luma = svcreate2_u8(lu, ma);
524+
svst2_u8(mask, lumas, luma);
497525
svst1_u8(mask, hashs, hash);
498526
if constexpr(Alpha)
499527
if(!alpha)
@@ -534,9 +562,9 @@ class qoi{
534562
}
535563
if(diffs[i])
536564
*p++ = diffs[i];
537-
else if(lus[i]){
538-
*p++ = lus[i];
539-
*p++ = mas[i];
565+
else if(lumas[i*2]){
566+
std::memcpy(p, lumas + i*2, 2);
567+
p += 2;
540568
}
541569
else{
542570
*p++ = chunk_tag::rgb;
@@ -655,12 +683,11 @@ class qoi{
655683
hash = vandq_u8(vaddq_u8(vaddq_u8(vmulq_u8(pxs.val[0], vdupq_n_u8(3)), vmulq_u8(pxs.val[1], vdupq_n_u8(5))), vaddq_u8(vmulq_u8(pxs.val[2], vdupq_n_u8(7)), vmulq_u8(pxs.val[3], vdupq_n_u8(11)))), vdupq_n_u8(63));
656684
else
657685
hash = vandq_u8(vaddq_u8(vaddq_u8(vmulq_u8(pxs.val[0], vdupq_n_u8(3)), vmulq_u8(pxs.val[1], vdupq_n_u8(5))), vaddq_u8(vmulq_u8(pxs.val[2], vdupq_n_u8(7)), vdupq_n_u8(static_cast<std::uint8_t>(255*11)))), vdupq_n_u8(63));
658-
std::uint8_t runs[simd_lanes], diffs[simd_lanes], lus[simd_lanes], mas[simd_lanes], hashs[simd_lanes];
686+
std::uint8_t runs[simd_lanes], diffs[simd_lanes], lumas[simd_lanes*2], hashs[simd_lanes];
659687
[[maybe_unused]] std::uint8_t alphas[simd_lanes];
660688
vst1q_u8(runs, runv);
661689
vst1q_u8(diffs, diffv);
662-
vst1q_u8(lus, lu);
663-
vst1q_u8(mas, ma);
690+
vst2q_u8(lumas, (uint8x16x2_t{lu, ma}));
664691
vst1q_u8(hashs, hash);
665692
if constexpr(Alpha)
666693
if(!alpha)
@@ -701,9 +728,9 @@ class qoi{
701728
}
702729
if(diffs[i])
703730
*p++ = diffs[i];
704-
else if(lus[i]){
705-
*p++ = lus[i];
706-
*p++ = mas[i];
731+
else if(lumas[i*2]){
732+
std::memcpy(p, lumas + i*2, 2);
733+
p += 2;
707734
}
708735
else{
709736
*p++ = chunk_tag::rgb;
@@ -715,7 +742,13 @@ class qoi{
715742
}
716743
p_.advance(p-p_.raw_pointer());
717744

718-
encode_body<Channels>(p_, pixels_, index, px_len, px, prev_hash, run);
745+
if constexpr(Alpha)
746+
encode_body<Channels>(p_, pixels_, index, px_len, px, prev_hash, run);
747+
else{
748+
rgb_t px_prev;
749+
efficient_memcpy<3>(&px_prev, &px);
750+
encode_body<Channels>(p_, pixels_, index, px_len, px_prev, prev_hash, run);
751+
}
719752

720753
push<sizeof(padding)>(p_, padding);
721754
}
@@ -920,19 +953,20 @@ class qoi{
920953
diff.val[0] = _mm256_add_epi8(_mm256_sub_epi8(diff.val[0], diff.val[1]), eight);
921954
diff.val[2] = _mm256_add_epi8(_mm256_sub_epi8(diff.val[2], diff.val[1]), eight);
922955
diff.val[1] = _mm256_add_epi8(diff.val[1], _mm256_set1_epi8(30));
923-
const auto lu = _mm256_and_si256(_mm256_or_si256(_mm256_set1_epi8(static_cast<char>(chunk_tag::luma)), diff.val[1]), _mm256_cmpeq_epi8(_mm256_or_si256(_mm256_and_si256(_mm256_or_si256(diff.val[0], diff.val[2]), _mm256_set1_epi8(static_cast<char>(0xf0))), _mm256_and_si256(diff.val[1], _mm256_set1_epi8(static_cast<char>(0xc0)))), zero));
924-
const auto ma = _mm256_or_si256(slli_epi8<4>(diff.val[0]), diff.val[2]);
956+
const auto luma_mask = _mm256_setr_epi32(0, 1, 4, 5, 2, 3, 6, 7);
957+
const auto lu = _mm256_permutevar8x32_epi32(_mm256_and_si256(_mm256_or_si256(_mm256_set1_epi8(static_cast<char>(chunk_tag::luma)), diff.val[1]), _mm256_cmpeq_epi8(_mm256_or_si256(_mm256_and_si256(_mm256_or_si256(diff.val[0], diff.val[2]), _mm256_set1_epi8(static_cast<char>(0xf0))), _mm256_and_si256(diff.val[1], _mm256_set1_epi8(static_cast<char>(0xc0)))), zero)), luma_mask);
958+
const auto ma = _mm256_permutevar8x32_epi32(_mm256_or_si256(slli_epi8<4>(diff.val[0]), diff.val[2]), luma_mask);
925959
__m256i hash;
926960
if constexpr(Alpha)
927961
hash = _mm256_and_si256(_mm256_add_epi8(_mm256_add_epi8(mul_epi8<3>(pxs.val[0]), mul_epi8<5>(pxs.val[1])), _mm256_add_epi8(mul_epi8<7>(pxs.val[2]), mul_epi8<11>(pxs.val[3]))), _mm256_set1_epi8(63));
928962
else
929963
hash = _mm256_and_si256(_mm256_add_epi8(_mm256_add_epi8(mul_epi8<3>(pxs.val[0]), mul_epi8<5>(pxs.val[1])), _mm256_add_epi8(mul_epi8<7>(pxs.val[2]), _mm256_set1_epi8(static_cast<std::uint8_t>(255*11)))), _mm256_set1_epi8(63));
930-
alignas(alignof(__m256i)) std::uint8_t runs[simd_lanes], diffs[simd_lanes], lus[simd_lanes], mas[simd_lanes], hashs[simd_lanes];
964+
alignas(alignof(__m256i)) std::uint8_t runs[simd_lanes], diffs[simd_lanes], lumas[simd_lanes*2], hashs[simd_lanes];
931965
[[maybe_unused]] alignas(alignof(__m256i)) std::uint8_t alphas[simd_lanes];
932966
_mm256_store_si256(reinterpret_cast<__m256i*>(runs), runv);
933967
_mm256_store_si256(reinterpret_cast<__m256i*>(diffs), diffv);
934-
_mm256_store_si256(reinterpret_cast<__m256i*>(lus), lu);
935-
_mm256_store_si256(reinterpret_cast<__m256i*>(mas), ma);
968+
_mm256_store_si256(reinterpret_cast<__m256i*>(lumas), _mm256_unpacklo_epi8(lu, ma));
969+
_mm256_store_si256(reinterpret_cast<__m256i*>(lumas)+1, _mm256_unpackhi_epi8(lu, ma));
936970
_mm256_store_si256(reinterpret_cast<__m256i*>(hashs), hash);
937971
if constexpr(Alpha)
938972
if(!alpha)
@@ -973,9 +1007,9 @@ class qoi{
9731007
}
9741008
if(diffs[i])
9751009
*p++ = diffs[i];
976-
else if(lus[i]){
977-
*p++ = lus[i];
978-
*p++ = mas[i];
1010+
else if(lumas[i*2]){
1011+
std::memcpy(p, lumas + i*2, 2);
1012+
p += 2;
9791013
}
9801014
else{
9811015
*p++ = chunk_tag::rgb;
@@ -987,7 +1021,13 @@ class qoi{
9871021
}
9881022
p_.advance(p-p_.raw_pointer());
9891023

990-
encode_body<Channels>(p_, pixels_, index, px_len, px, prev_hash, run);
1024+
if constexpr(Alpha)
1025+
encode_body<Channels>(p_, pixels_, index, px_len, px, prev_hash, run);
1026+
else{
1027+
rgb_t px_prev;
1028+
efficient_memcpy<3>(&px_prev, &px);
1029+
encode_body<Channels>(p_, pixels_, index, px_len, px_prev, prev_hash, run);
1030+
}
9911031

9921032
push<sizeof(padding)>(p_, padding);
9931033
}

0 commit comments

Comments
 (0)