Skip to content

optimize has_tabs_or_newline for NEON #639

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Merged
merged 6 commits into from
May 1, 2024
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
8 changes: 4 additions & 4 deletions src/helpers.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -243,7 +243,7 @@ ada_really_inline size_t find_next_host_delimiter_special(
uint8x16_t lowpart = vqtbl1q_u8(low_mask, vandq_u8(word, fmask));
uint8x16_t highpart = vqtbl1q_u8(high_mask, vshrq_n_u8(word, 4));
uint8x16_t classify = vandq_u8(lowpart, highpart);
if (vmaxvq_u8(classify) != 0) {
if (vmaxvq_u32(vreinterpretq_u32_u8(classify)) != 0) {
uint8x16_t is_zero = vceqq_u8(classify, zero);
uint16_t is_non_zero = ~to_bitmask(is_zero);
return i + trailing_zeroes(is_non_zero);
Expand All @@ -256,7 +256,7 @@ ada_really_inline size_t find_next_host_delimiter_special(
uint8x16_t lowpart = vqtbl1q_u8(low_mask, vandq_u8(word, fmask));
uint8x16_t highpart = vqtbl1q_u8(high_mask, vshrq_n_u8(word, 4));
uint8x16_t classify = vandq_u8(lowpart, highpart);
if (vmaxvq_u8(classify) != 0) {
if (vmaxvq_u32(vreinterpretq_u32_u8(classify)) != 0) {
uint8x16_t is_zero = vceqq_u8(classify, zero);
uint16_t is_non_zero = ~to_bitmask(is_zero);
return view.length() - 16 + trailing_zeroes(is_non_zero);
Expand Down Expand Up @@ -381,7 +381,7 @@ ada_really_inline size_t find_next_host_delimiter(std::string_view view,
uint8x16_t lowpart = vqtbl1q_u8(low_mask, vandq_u8(word, fmask));
uint8x16_t highpart = vqtbl1q_u8(high_mask, vshrq_n_u8(word, 4));
uint8x16_t classify = vandq_u8(lowpart, highpart);
if (vmaxvq_u8(classify) != 0) {
if (vmaxvq_u32(vreinterpretq_u32_u8(classify)) != 0) {
uint8x16_t is_zero = vceqq_u8(classify, zero);
uint16_t is_non_zero = ~to_bitmask(is_zero);
return i + trailing_zeroes(is_non_zero);
Expand All @@ -394,7 +394,7 @@ ada_really_inline size_t find_next_host_delimiter(std::string_view view,
uint8x16_t lowpart = vqtbl1q_u8(low_mask, vandq_u8(word, fmask));
uint8x16_t highpart = vqtbl1q_u8(high_mask, vshrq_n_u8(word, 4));
uint8x16_t classify = vandq_u8(lowpart, highpart);
if (vmaxvq_u8(classify) != 0) {
if (vmaxvq_u32(vreinterpretq_u32_u8(classify)) != 0) {
uint8x16_t is_zero = vceqq_u8(classify, zero);
uint16_t is_non_zero = ~to_bitmask(is_zero);
return view.length() - 16 + trailing_zeroes(is_non_zero);
Expand Down
31 changes: 21 additions & 10 deletions src/unicode.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -60,24 +60,34 @@ ada_really_inline bool has_tabs_or_newline(
}
// fast path for long strings (expected to be common)
size_t i = 0;
const uint8x16_t mask1 = vmovq_n_u8('\r');
const uint8x16_t mask2 = vmovq_n_u8('\n');
const uint8x16_t mask3 = vmovq_n_u8('\t');
/**
* The fastest way to check for `\t` (== 9), `\n` (== 10) and `\r` (== 13)
* relies on a table lookup instruction. We notice that these are all distinct
* numbers in the range 0..15. Let's prepare a special register where we put
* `\t` at index 9, `\n` at index 10 and `\r` at index 13. Then we shuffle this
* register by the input register. If the input has `\t` at position X, then
* the shuffled register will also have `\t` at that position. Comparing the
* input with the shuffled register marks all the interesting characters in
* the input.
*
* Index 0 of the table holds a non-zero value (1) so that a NUL input byte
* does not compare equal to its own lookup result. Every other table entry is
* 0, and out-of-range indices (input bytes >= 16) also yield 0, which never
* equals the non-zero input byte that selected it.
*
* credit for algorithmic idea: @aqrit, credit for description:
* @DenisYaroshevskiy
*/
static uint8_t rnt_array[16] = {1, 0, 0, 0, 0, 0, 0, 0,
0, 9, 10, 0, 0, 13, 0, 0};
const uint8x16_t rnt = vld1q_u8(rnt_array);
// m['0xd', '0xa', '0x9']
uint8x16_t running{0};
for (; i + 15 < user_input.size(); i += 16) {
uint8x16_t word = vld1q_u8((const uint8_t*)user_input.data() + i);
running = vorrq_u8(vorrq_u8(running, vorrq_u8(vceqq_u8(word, mask1),
vceqq_u8(word, mask2))),
vceqq_u8(word, mask3));

running = vorrq_u8(running, vceqq_u8(vqtbl1q_u8(rnt, word), word));
}
if (i < user_input.size()) {
uint8x16_t word =
vld1q_u8((const uint8_t*)user_input.data() + user_input.length() - 16);
running = vorrq_u8(vorrq_u8(running, vorrq_u8(vceqq_u8(word, mask1),
vceqq_u8(word, mask2))),
vceqq_u8(word, mask3));
running = vorrq_u8(running, vceqq_u8(vqtbl1q_u8(rnt, word), word));
}
return vmaxvq_u8(running) != 0;
return vmaxvq_u32(vreinterpretq_u32_u8(running)) != 0;
}
#elif ADA_SSE2
ada_really_inline bool has_tabs_or_newline(
Expand All @@ -97,6 +107,7 @@ ada_really_inline bool has_tabs_or_newline(
const __m128i mask1 = _mm_set1_epi8('\r');
const __m128i mask2 = _mm_set1_epi8('\n');
const __m128i mask3 = _mm_set1_epi8('\t');
// If we supported SSSE3, we could use the algorithm that we use for NEON.
__m128i running{0};
for (; i + 15 < user_input.size(); i += 16) {
__m128i word = _mm_loadu_si128((const __m128i*)(user_input.data() + i));
Expand Down