diff --git a/src/arch/x86/AVX2.cpp b/src/arch/x86/AVX2.cpp index ab7f03e..cd10e21 100644 --- a/src/arch/x86/AVX2.cpp +++ b/src/arch/x86/AVX2.cpp @@ -31,7 +31,7 @@ namespace hat::detail { const auto e = vec + n; for (; vec != e; vec++) { - const auto cmp = _mm256_cmpeq_epi8(firstByte, *vec); + const auto cmp = _mm256_cmpeq_epi8(firstByte, _mm256_loadu_si256(vec)); auto mask = static_cast(_mm256_movemask_epi8(cmp)); while (mask) { const auto offset = _tzcnt_u32(mask); diff --git a/src/arch/x86/AVX512.cpp b/src/arch/x86/AVX512.cpp index 47e559e..ad1e2f6 100644 --- a/src/arch/x86/AVX512.cpp +++ b/src/arch/x86/AVX512.cpp @@ -31,7 +31,7 @@ namespace hat::detail { const auto e = vec + n; for (; vec != e; vec++) { - auto mask = _mm512_cmpeq_epi8_mask(firstByte, *vec); + auto mask = _mm512_cmpeq_epi8_mask(firstByte, _mm512_loadu_si512(vec)); while (mask) { const auto offset = LIBHAT_TZCNT64(mask); const auto i = reinterpret_cast(vec) + offset; diff --git a/src/arch/x86/SSE.cpp b/src/arch/x86/SSE.cpp index 4c9717c..0500b29 100644 --- a/src/arch/x86/SSE.cpp +++ b/src/arch/x86/SSE.cpp @@ -31,7 +31,7 @@ namespace hat::detail { const auto e = vec + n; for (; vec != e; vec++) { - const auto cmp = _mm_cmpeq_epi8(firstByte, *vec); + const auto cmp = _mm_cmpeq_epi8(firstByte, _mm_loadu_si128(vec)); auto mask = static_cast(_mm_movemask_epi8(cmp)); while (mask) { const auto offset = LIBHAT_BSF32(mask);