diff --git a/src/arch/x86/AVX2.cpp b/src/arch/x86/AVX2.cpp index 3f295ad..5579f1a 100644 --- a/src/arch/x86/AVX2.cpp +++ b/src/arch/x86/AVX2.cpp @@ -53,7 +53,10 @@ namespace hat::detail { const auto cmp = _mm256_cmpeq_epi8(firstByte, _mm256_loadu_si256(vec)); auto mask = static_cast(_mm256_movemask_epi8(cmp)); - if constexpr (cmpeq2) { + if constexpr (alignment != scan_alignment::X1) { + mask &= create_alignment_mask(); + if (!mask) continue; + } else if constexpr (cmpeq2) { const auto cmp2 = _mm256_cmpeq_epi8(secondByte, _mm256_loadu_si256(vec)); auto mask2 = static_cast(_mm256_movemask_epi8(cmp2)); // avoid loading unaligned memory by letting a match of the first signature byte in the last @@ -61,7 +64,6 @@ namespace hat::detail { mask &= (mask2 >> 1) | (0b1u << 31); } - mask &= create_alignment_mask(); while (mask) { const auto offset = _tzcnt_u32(mask); const auto i = reinterpret_cast(vec) + offset; @@ -91,7 +93,7 @@ namespace hat::detail { template scan_result find_pattern_avx2(const std::byte* begin, const std::byte* end, signature_view signature) { - const bool cmpeq2 = signature.size() > 1 && signature[1].has_value(); + const bool cmpeq2 = alignment == scan_alignment::X1 && signature.size() > 1 && signature[1].has_value(); const bool veccmp = signature.size() <= 33; if (cmpeq2 && veccmp) { diff --git a/src/arch/x86/AVX512.cpp b/src/arch/x86/AVX512.cpp index 85f5750..046e43f 100644 --- a/src/arch/x86/AVX512.cpp +++ b/src/arch/x86/AVX512.cpp @@ -53,12 +53,14 @@ namespace hat::detail { for (; vec != e; vec++) { auto mask = _mm512_cmpeq_epi8_mask(firstByte, _mm512_loadu_si512(vec)); - if constexpr (cmpeq2) { + if constexpr (alignment != scan_alignment::X1) { + mask &= create_alignment_mask(); + if (!mask) continue; + } else if constexpr (cmpeq2) { const auto mask2 = _mm512_cmpeq_epi8_mask(secondByte, _mm512_loadu_si512(vec)); mask &= (mask2 >> 1) | (0b1ull << 63); } - mask &= create_alignment_mask(); while (mask) { const auto offset = LIBHAT_TZCNT64(mask); const auto i = reinterpret_cast(vec) + offset; @@ -87,7 +89,7 @@ namespace hat::detail { template scan_result find_pattern_avx512(const std::byte* begin, const std::byte* end, signature_view signature) { - const bool cmpeq2 = signature.size() > 1 && signature[1].has_value(); + const bool cmpeq2 = alignment == scan_alignment::X1 && signature.size() > 1 && signature[1].has_value(); const bool veccmp = signature.size() <= 65; if (cmpeq2 && veccmp) { diff --git a/src/arch/x86/SSE.cpp b/src/arch/x86/SSE.cpp index a3b26a9..b064b08 100644 --- a/src/arch/x86/SSE.cpp +++ b/src/arch/x86/SSE.cpp @@ -53,13 +53,15 @@ namespace hat::detail { const auto cmp = _mm_cmpeq_epi8(firstByte, _mm_loadu_si128(vec)); auto mask = static_cast(_mm_movemask_epi8(cmp)); - if constexpr (cmpeq2) { + if constexpr (alignment != scan_alignment::X1) { + mask &= create_alignment_mask(); + if (!mask) continue; + } else if constexpr (cmpeq2) { const auto cmp2 = _mm_cmpeq_epi8(secondByte, _mm_loadu_si128(vec)); auto mask2 = static_cast(_mm_movemask_epi8(cmp2)); mask &= (mask2 >> 1) | (0b1u << 15); } - mask &= create_alignment_mask(); while (mask) { const auto offset = LIBHAT_BSF32(mask); const auto i = reinterpret_cast(vec) + offset; @@ -89,7 +91,7 @@ namespace hat::detail { template scan_result find_pattern_sse(const std::byte* begin, const std::byte* end, signature_view signature) { - const bool cmpeq2 = signature.size() > 1 && signature[1].has_value(); + const bool cmpeq2 = alignment == scan_alignment::X1 && signature.size() > 1 && signature[1].has_value(); const bool veccmp = signature.size() <= 17; if (cmpeq2 && veccmp) {