ada-url · anonrig · May 1, 2024 · Apr 30, 2024 · Apr 30, 2024 · Apr 30, 2024
diff --git a/src/helpers.cpp b/src/helpers.cpp
@@ -243,7 +243,7 @@ ada_really_inline size_t find_next_host_delimiter_special(
     uint8x16_t lowpart = vqtbl1q_u8(low_mask, vandq_u8(word, fmask));
     uint8x16_t highpart = vqtbl1q_u8(high_mask, vshrq_n_u8(word, 4));
     uint8x16_t classify = vandq_u8(lowpart, highpart);
-    if (vmaxvq_u8(classify) != 0) {
+    if (vmaxvq_u32(vreinterpretq_u32_u8(classify)) != 0) {
       uint8x16_t is_zero = vceqq_u8(classify, zero);
       uint16_t is_non_zero = ~to_bitmask(is_zero);
       return i + trailing_zeroes(is_non_zero);
@@ -256,7 +256,7 @@ ada_really_inline size_t find_next_host_delimiter_special(
     uint8x16_t lowpart = vqtbl1q_u8(low_mask, vandq_u8(word, fmask));
     uint8x16_t highpart = vqtbl1q_u8(high_mask, vshrq_n_u8(word, 4));
     uint8x16_t classify = vandq_u8(lowpart, highpart);
-    if (vmaxvq_u8(classify) != 0) {
+    if (vmaxvq_u32(vreinterpretq_u32_u8(classify)) != 0) {
       uint8x16_t is_zero = vceqq_u8(classify, zero);
       uint16_t is_non_zero = ~to_bitmask(is_zero);
       return view.length() - 16 + trailing_zeroes(is_non_zero);
@@ -381,7 +381,7 @@ ada_really_inline size_t find_next_host_delimiter(std::string_view view,
     uint8x16_t lowpart = vqtbl1q_u8(low_mask, vandq_u8(word, fmask));
     uint8x16_t highpart = vqtbl1q_u8(high_mask, vshrq_n_u8(word, 4));
     uint8x16_t classify = vandq_u8(lowpart, highpart);
-    if (vmaxvq_u8(classify) != 0) {
+    if (vmaxvq_u32(vreinterpretq_u32_u8(classify)) != 0) {
       uint8x16_t is_zero = vceqq_u8(classify, zero);
       uint16_t is_non_zero = ~to_bitmask(is_zero);
       return i + trailing_zeroes(is_non_zero);
@@ -394,7 +394,7 @@ ada_really_inline size_t find_next_host_delimiter(std::string_view view,
     uint8x16_t lowpart = vqtbl1q_u8(low_mask, vandq_u8(word, fmask));
     uint8x16_t highpart = vqtbl1q_u8(high_mask, vshrq_n_u8(word, 4));
     uint8x16_t classify = vandq_u8(lowpart, highpart);
-    if (vmaxvq_u8(classify) != 0) {
+    if (vmaxvq_u32(vreinterpretq_u32_u8(classify)) != 0) {
       uint8x16_t is_zero = vceqq_u8(classify, zero);
       uint16_t is_non_zero = ~to_bitmask(is_zero);
       return view.length() - 16 + trailing_zeroes(is_non_zero);

diff --git a/src/unicode.cpp b/src/unicode.cpp
@@ -60,24 +60,34 @@ ada_really_inline bool has_tabs_or_newline(
   }
   // fast path for long strings (expected to be common)
   size_t i = 0;
-  const uint8x16_t mask1 = vmovq_n_u8('\r');
-  const uint8x16_t mask2 = vmovq_n_u8('\n');
-  const uint8x16_t mask3 = vmovq_n_u8('\t');
+  /**
+   * The fastest way to check for `\t` (==9), '\n'(== 10) and `\r` (==13) relies
+   * on table lookup instruction. We notice that these are all unique numbers
+   * between 0..15. Let's prepare a special register, where we put '\t' in the
+   * 9th position, '\n' - 10th and '\r' - 13th. Then we shuffle this register by
+   * input register. If the input had `\t` in position X then this shuffled
+   * register will also have '\t' in that position. Comparing input with this
+   * shuffled register will mark us all interesting characters in the input.
+   *
+   * credit for algorithmic idea: @aqrit, credit for description:
+   * @DenisYaroshevskiy
+   */
+  static uint8_t rnt_array[16] = {1, 0, 0,  0, 0, 0,  0, 0,
+                                  0, 9, 10, 0, 0, 13, 0, 0};
+  const uint8x16_t rnt = vld1q_u8(rnt_array);
+  // m['0xd', '0xa', '0x9']
   uint8x16_t running{0};
   for (; i + 15 < user_input.size(); i += 16) {
     uint8x16_t word = vld1q_u8((const uint8_t*)user_input.data() + i);
-    running = vorrq_u8(vorrq_u8(running, vorrq_u8(vceqq_u8(word, mask1),
-                                                  vceqq_u8(word, mask2))),
-                       vceqq_u8(word, mask3));
+
+    running = vorrq_u8(running, vceqq_u8(vqtbl1q_u8(rnt, word), word));
   }
   if (i < user_input.size()) {
     uint8x16_t word =
         vld1q_u8((const uint8_t*)user_input.data() + user_input.length() - 16);
-    running = vorrq_u8(vorrq_u8(running, vorrq_u8(vceqq_u8(word, mask1),
-                                                  vceqq_u8(word, mask2))),
-                       vceqq_u8(word, mask3));
+    running = vorrq_u8(running, vceqq_u8(vqtbl1q_u8(rnt, word), word));
   }
-  return vmaxvq_u8(running) != 0;
+  return vmaxvq_u32(vreinterpretq_u32_u8(running)) != 0;
 }
 #elif ADA_SSE2
 ada_really_inline bool has_tabs_or_newline(
@@ -97,6 +107,7 @@ ada_really_inline bool has_tabs_or_newline(
   const __m128i mask1 = _mm_set1_epi8('\r');
   const __m128i mask2 = _mm_set1_epi8('\n');
   const __m128i mask3 = _mm_set1_epi8('\t');
+  // If we supported SSSE3, we could use the algorithm that we use for NEON.
   __m128i running{0};
   for (; i + 15 < user_input.size(); i += 16) {
     __m128i word = _mm_loadu_si128((const __m128i*)(user_input.data() + i));