@@ -248,6 +248,19 @@ class qoi{
248
248
};
249
249
struct rgb_t {
250
250
std::uint8_t r, g, b;
251
+ inline std::uint32_t v ()const {
252
+ static_assert (sizeof (rgb_t ) == 3u );
253
+ if constexpr (std::endian::native == std::endian::little){
254
+ std::uint32_t x = 255u << 24u ;
255
+ efficient_memcpy<3 >(&x, this );
256
+ return x;
257
+ }
258
+ else
259
+ return std::uint32_t {r} |
260
+ std::uint32_t {g} << 8 |
261
+ std::uint32_t {b} << 16 |
262
+ 255u << 24 ;
263
+ }
251
264
inline std::uint_fast32_t hash ()const {
252
265
static constexpr std::uint64_t constant =
253
266
static_cast <std::uint64_t >(3u ) << 56 |
@@ -302,12 +315,29 @@ class qoi{
302
315
}
303
316
}
304
317
private:
318
+ template <bool Alpha>
319
+ using local_rgba_pixel_t = std::conditional_t <Alpha, rgba_t , rgb_t >;
320
+ template <bool Alpha>
321
+ static constexpr local_rgba_pixel_t <Alpha> default_pixel ()noexcept {
322
+ if constexpr (Alpha)
323
+ return {0 , 0 , 0 , 255 };
324
+ else
325
+ return {};
326
+ }
327
+ template <bool Alpha>
328
+ struct local_pixel {
329
+ std::uint8_t rgb = static_cast <std::uint8_t >(chunk_tag::rgb);
330
+ local_rgba_pixel_t <Alpha> v;
331
+ };
332
+ static_assert (std::has_unique_object_representations_v<local_pixel<true >> and std::has_unique_object_representations_v<local_pixel<false >>);
305
333
template <std::uint_fast8_t Channels, typename Pusher, typename Puller>
306
- static inline void encode_body (Pusher& p, Puller& pixels, rgba_t (&index)[index_size], std::size_t px_len, rgba_t px_prev = {0 , 0 , 0 , 255 }, std::uint8_t prev_hash = static_cast <std::uint8_t >(index_size), std::size_t run = 0 ){
307
- const auto f = [&run, &index , &p, &prev_hash](rgba_t px, rgba_t px_prev){
308
- if (px == px_prev){
334
+ static inline void encode_body (Pusher& p, Puller& pixels, rgba_t (&index)[index_size], std::size_t px_len, local_rgba_pixel_t<Channels == 4u> px_prev = default_pixel<Channels == 4u>(), std::uint8_t prev_hash = static_cast<std::uint8_t>(index_size), std::size_t run = 0){
335
+ local_pixel<Channels == 4u > px;
336
+ while (px_len--)[[likely]]{
337
+ pull<Channels>(&px.v , pixels);
338
+ if (px.v .v () == px_prev.v ()){
309
339
++run;
310
- return ;
340
+ continue ;
311
341
}
312
342
if (run > 0 ){
313
343
while (run >= 62 )[[unlikely]]{
@@ -328,57 +358,55 @@ class qoi{
328
358
}
329
359
}
330
360
331
- const auto index_pos = px.hash () % index_size;
361
+ const auto index_pos = px.v . hash () % index_size;
332
362
prev_hash = index_pos;
333
363
334
- if (index [index_pos] == px){
335
- p.push (chunk_tag::index | index_pos);
336
- return ;
337
- }
338
- index [index_pos] = px;
339
-
340
- if constexpr (Channels == 4 )
341
- if (px.a != px_prev.a ){
342
- p.push (chunk_tag::rgba);
343
- push<4 >(p, &px);
344
- return ;
364
+ do {
365
+ if (index [index_pos].v () == px.v .v ()){
366
+ p.push (chunk_tag::index | index_pos);
367
+ break ;
345
368
}
346
- const auto vr = static_cast < int >(px. r ) - static_cast < int >(px_prev. r ) + 2 ;
347
- const auto vg = static_cast < int >(px. g ) - static_cast < int >(px_prev. g ) + 2 ;
348
- const auto vb = static_cast < int >(px. b ) - static_cast < int >(px_prev. b ) + 2 ;
369
+ efficient_memcpy<Channels>( index + index_pos, &px. v ) ;
370
+ if constexpr (Channels == 3 )
371
+ index [index_pos]. a = 255u ;
349
372
350
- if (const std::uint8_t v = vr|vg|vb; v < 4 ){
351
- p.push (chunk_tag::diff | vr << 4 | vg << 2 | vb);
352
- return ;
353
- }
354
- const auto vg_r = vr - vg + 8 ;
355
- const auto vg_b = vb - vg + 8 ;
356
- if (const int v = vg_r|vg_b, g = vg+30 ; ((v&0xf0 )|(g&0xc0 )) == 0 ){
357
- p.push (chunk_tag::luma | g);
358
- p.push (vg_r << 4 | vg_b);
359
- }
360
- else {
361
- p.push (chunk_tag::rgb);
362
- push<3 >(p, &px);
363
- }
364
- };
365
- auto px = px_prev;
366
- while (px_len--)[[likely]]{
367
- px_prev = px;
368
- pull<Channels>(&px, pixels);
369
- f (px, px_prev);
373
+ if constexpr (Channels == 4 )
374
+ if (px.v .a != px_prev.a ){
375
+ p.push (chunk_tag::rgba);
376
+ push<4 >(p, &px.v );
377
+ break ;
378
+ }
379
+ const auto vg_2 = static_cast <int >(px.v .g ) - static_cast <int >(px_prev.g );
380
+ if (const std::uint8_t g = vg_2+32 ; g < 64 ){
381
+ const auto vr = static_cast <int >(px.v .r ) - static_cast <int >(px_prev.r ) + 2 ;
382
+ const auto vg = vg_2 + 2 ;
383
+ const auto vb = static_cast <int >(px.v .b ) - static_cast <int >(px_prev.b ) + 2 ;
384
+
385
+ if (static_cast <std::uint8_t >(vr|vg|vb) < 4 ){
386
+ p.push (chunk_tag::diff | vr << 4 | vg << 2 | vb);
387
+ break ;
388
+ }
389
+ const auto vg_r = vr - vg + 8 ;
390
+ const auto vg_b = vb - vg + 8 ;
391
+ if (static_cast <std::uint8_t >(vg_r|vg_b) < 16 ){
392
+ p.push (chunk_tag::luma | g);
393
+ p.push (vg_r << 4 | vg_b);
394
+ }
395
+ else
396
+ push<4 >(p, &px);
397
+ }
398
+ else
399
+ push<4 >(p, &px);
400
+ }while (false );
401
+ efficient_memcpy<Channels>(&px_prev, &px.v );
370
402
}
371
- if (px == px_prev){
372
- while (run >= 62 )[[unlikely]]{
373
- static constexpr std::uint8_t x = chunk_tag::run | 61 ;
374
- p.push (x);
375
- run -= 62 ;
376
- }
377
- if (run > 0 ){
378
- p.push (chunk_tag::run | (run-1 ));
379
- run = 0 ;
380
- }
403
+ while (run >= 62 )[[unlikely]]{
404
+ static constexpr std::uint8_t x = chunk_tag::run | 61 ;
405
+ p.push (x);
406
+ run -= 62 ;
381
407
}
408
+ if (run > 0 )
409
+ p.push (chunk_tag::run | (run-1 ));
382
410
}
383
411
#ifndef QOIXX_NO_SIMD
384
412
#if defined(__ARM_FEATURE_SVE)
@@ -488,12 +516,12 @@ class qoi{
488
516
hash = svand_n_u8_x (mask, svadd_u8_x (mask, svadd_u8_x (mask, svmul_n_u8_x (mask, get<0 >(pxs), 3 ), svmul_n_u8_x (mask, get<1 >(pxs), 5 )), svadd_u8_x (mask, svmul_n_u8_x (mask, get<2 >(pxs), 7 ), svmul_n_u8_x (mask, get<3 >(pxs), 11 ))), 63 );
489
517
else
490
518
hash = svand_n_u8_x (mask, svadd_u8_x (mask, svadd_u8_x (mask, svmul_n_u8_x (mask, get<0 >(pxs), 3 ), svmul_n_u8_x (mask, get<1 >(pxs), 5 )), svadd_n_u8_x (mask, svmul_n_u8_x (mask, get<2 >(pxs), 7 ), static_cast <std::uint8_t >(255 *11 ))), 63 );
491
- std::uint8_t runs[SVERegisterSize/8 ], diffs[SVERegisterSize/8 ], lus [SVERegisterSize/8 ], mas[SVERegisterSize/ 8 ], hashs[SVERegisterSize/8 ];
519
+ std::uint8_t runs[SVERegisterSize/8 ], diffs[SVERegisterSize/8 ], lumas [SVERegisterSize/8 * 2 ], hashs[SVERegisterSize/8 ];
492
520
[[maybe_unused]] std::uint8_t alphas[SVERegisterSize/8 ];
493
521
svst1_u8 (mask, runs, svadd_n_u8_m (runv, zero, 1 ));
494
522
svst1_u8 (mask, diffs, diffv);
495
- svst1_u8 (mask, lus, lu );
496
- svst1_u8 (mask, mas, ma );
523
+ const auto luma = svcreate2_u8 (lu, ma );
524
+ svst2_u8 (mask, lumas, luma );
497
525
svst1_u8 (mask, hashs, hash);
498
526
if constexpr (Alpha)
499
527
if (!alpha)
@@ -534,9 +562,9 @@ class qoi{
534
562
}
535
563
if (diffs[i])
536
564
*p++ = diffs[i];
537
- else if (lus[i ]){
538
- *p++ = lus[i] ;
539
- *p++ = mas[i] ;
565
+ else if (lumas[i* 2 ]){
566
+ std::memcpy (p, lumas + i* 2 , 2 ) ;
567
+ p += 2 ;
540
568
}
541
569
else {
542
570
*p++ = chunk_tag::rgb;
@@ -655,12 +683,11 @@ class qoi{
655
683
hash = vandq_u8 (vaddq_u8 (vaddq_u8 (vmulq_u8 (pxs.val [0 ], vdupq_n_u8 (3 )), vmulq_u8 (pxs.val [1 ], vdupq_n_u8 (5 ))), vaddq_u8 (vmulq_u8 (pxs.val [2 ], vdupq_n_u8 (7 )), vmulq_u8 (pxs.val [3 ], vdupq_n_u8 (11 )))), vdupq_n_u8 (63 ));
656
684
else
657
685
hash = vandq_u8 (vaddq_u8 (vaddq_u8 (vmulq_u8 (pxs.val [0 ], vdupq_n_u8 (3 )), vmulq_u8 (pxs.val [1 ], vdupq_n_u8 (5 ))), vaddq_u8 (vmulq_u8 (pxs.val [2 ], vdupq_n_u8 (7 )), vdupq_n_u8 (static_cast <std::uint8_t >(255 *11 )))), vdupq_n_u8 (63 ));
658
- std::uint8_t runs[simd_lanes], diffs[simd_lanes], lus [simd_lanes], mas[simd_lanes ], hashs[simd_lanes];
686
+ std::uint8_t runs[simd_lanes], diffs[simd_lanes], lumas [simd_lanes* 2 ], hashs[simd_lanes];
659
687
[[maybe_unused]] std::uint8_t alphas[simd_lanes];
660
688
vst1q_u8 (runs, runv);
661
689
vst1q_u8 (diffs, diffv);
662
- vst1q_u8 (lus, lu);
663
- vst1q_u8 (mas, ma);
690
+ vst2q_u8 (lumas, (uint8x16x2_t {lu, ma}));
664
691
vst1q_u8 (hashs, hash);
665
692
if constexpr (Alpha)
666
693
if (!alpha)
@@ -701,9 +728,9 @@ class qoi{
701
728
}
702
729
if (diffs[i])
703
730
*p++ = diffs[i];
704
- else if (lus[i ]){
705
- *p++ = lus[i] ;
706
- *p++ = mas[i] ;
731
+ else if (lumas[i* 2 ]){
732
+ std::memcpy (p, lumas + i* 2 , 2 ) ;
733
+ p += 2 ;
707
734
}
708
735
else {
709
736
*p++ = chunk_tag::rgb;
@@ -715,7 +742,13 @@ class qoi{
715
742
}
716
743
p_.advance (p-p_.raw_pointer ());
717
744
718
- encode_body<Channels>(p_, pixels_, index , px_len, px, prev_hash, run);
745
+ if constexpr (Alpha)
746
+ encode_body<Channels>(p_, pixels_, index , px_len, px, prev_hash, run);
747
+ else {
748
+ rgb_t px_prev;
749
+ efficient_memcpy<3 >(&px_prev, &px);
750
+ encode_body<Channels>(p_, pixels_, index , px_len, px_prev, prev_hash, run);
751
+ }
719
752
720
753
push<sizeof (padding)>(p_, padding);
721
754
}
@@ -920,19 +953,20 @@ class qoi{
920
953
diff.val [0 ] = _mm256_add_epi8 (_mm256_sub_epi8 (diff.val [0 ], diff.val [1 ]), eight);
921
954
diff.val [2 ] = _mm256_add_epi8 (_mm256_sub_epi8 (diff.val [2 ], diff.val [1 ]), eight);
922
955
diff.val [1 ] = _mm256_add_epi8 (diff.val [1 ], _mm256_set1_epi8 (30 ));
923
- const auto lu = _mm256_and_si256 (_mm256_or_si256 (_mm256_set1_epi8 (static_cast <char >(chunk_tag::luma)), diff.val [1 ]), _mm256_cmpeq_epi8 (_mm256_or_si256 (_mm256_and_si256 (_mm256_or_si256 (diff.val [0 ], diff.val [2 ]), _mm256_set1_epi8 (static_cast <char >(0xf0 ))), _mm256_and_si256 (diff.val [1 ], _mm256_set1_epi8 (static_cast <char >(0xc0 )))), zero));
924
- const auto ma = _mm256_or_si256 (slli_epi8<4 >(diff.val [0 ]), diff.val [2 ]);
956
+ const auto luma_mask = _mm256_setr_epi32 (0 , 1 , 4 , 5 , 2 , 3 , 6 , 7 );
957
+ const auto lu = _mm256_permutevar8x32_epi32 (_mm256_and_si256 (_mm256_or_si256 (_mm256_set1_epi8 (static_cast <char >(chunk_tag::luma)), diff.val [1 ]), _mm256_cmpeq_epi8 (_mm256_or_si256 (_mm256_and_si256 (_mm256_or_si256 (diff.val [0 ], diff.val [2 ]), _mm256_set1_epi8 (static_cast <char >(0xf0 ))), _mm256_and_si256 (diff.val [1 ], _mm256_set1_epi8 (static_cast <char >(0xc0 )))), zero)), luma_mask);
958
+ const auto ma = _mm256_permutevar8x32_epi32 (_mm256_or_si256 (slli_epi8<4 >(diff.val [0 ]), diff.val [2 ]), luma_mask);
925
959
__m256i hash;
926
960
if constexpr (Alpha)
927
961
hash = _mm256_and_si256 (_mm256_add_epi8 (_mm256_add_epi8 (mul_epi8<3 >(pxs.val [0 ]), mul_epi8<5 >(pxs.val [1 ])), _mm256_add_epi8 (mul_epi8<7 >(pxs.val [2 ]), mul_epi8<11 >(pxs.val [3 ]))), _mm256_set1_epi8 (63 ));
928
962
else
929
963
hash = _mm256_and_si256 (_mm256_add_epi8 (_mm256_add_epi8 (mul_epi8<3 >(pxs.val [0 ]), mul_epi8<5 >(pxs.val [1 ])), _mm256_add_epi8 (mul_epi8<7 >(pxs.val [2 ]), _mm256_set1_epi8 (static_cast <std::uint8_t >(255 *11 )))), _mm256_set1_epi8 (63 ));
930
- alignas (alignof (__m256i)) std::uint8_t runs[simd_lanes], diffs[simd_lanes], lus [simd_lanes], mas[simd_lanes ], hashs[simd_lanes];
964
+ alignas (alignof (__m256i)) std::uint8_t runs[simd_lanes], diffs[simd_lanes], lumas [simd_lanes* 2 ], hashs[simd_lanes];
931
965
[[maybe_unused]] alignas (alignof (__m256i)) std::uint8_t alphas[simd_lanes];
932
966
_mm256_store_si256 (reinterpret_cast <__m256i*>(runs), runv);
933
967
_mm256_store_si256 (reinterpret_cast <__m256i*>(diffs), diffv);
934
- _mm256_store_si256 (reinterpret_cast <__m256i*>(lus ), lu );
935
- _mm256_store_si256 (reinterpret_cast <__m256i*>(mas), ma );
968
+ _mm256_store_si256 (reinterpret_cast <__m256i*>(lumas ), _mm256_unpacklo_epi8 (lu, ma) );
969
+ _mm256_store_si256 (reinterpret_cast <__m256i*>(lumas)+ 1 , _mm256_unpackhi_epi8 (lu, ma) );
936
970
_mm256_store_si256 (reinterpret_cast <__m256i*>(hashs), hash);
937
971
if constexpr (Alpha)
938
972
if (!alpha)
@@ -973,9 +1007,9 @@ class qoi{
973
1007
}
974
1008
if (diffs[i])
975
1009
*p++ = diffs[i];
976
- else if (lus[i ]){
977
- *p++ = lus[i] ;
978
- *p++ = mas[i] ;
1010
+ else if (lumas[i* 2 ]){
1011
+ std::memcpy (p, lumas + i* 2 , 2 ) ;
1012
+ p += 2 ;
979
1013
}
980
1014
else {
981
1015
*p++ = chunk_tag::rgb;
@@ -987,7 +1021,13 @@ class qoi{
987
1021
}
988
1022
p_.advance (p-p_.raw_pointer ());
989
1023
990
- encode_body<Channels>(p_, pixels_, index , px_len, px, prev_hash, run);
1024
+ if constexpr (Alpha)
1025
+ encode_body<Channels>(p_, pixels_, index , px_len, px, prev_hash, run);
1026
+ else {
1027
+ rgb_t px_prev;
1028
+ efficient_memcpy<3 >(&px_prev, &px);
1029
+ encode_body<Channels>(p_, pixels_, index , px_len, px_prev, prev_hash, run);
1030
+ }
991
1031
992
1032
push<sizeof (padding)>(p_, padding);
993
1033
}
0 commit comments