Pre-calculate last vector check ptr in compare256 for sse2 and lsx

author Nathan Moinvaziri <nathan@nathanm.com>

Thu, 8 Jan 2026 07:51:08 +0000 (23:51 -0800)

committer Hans Kristian Rosbach <hk-github@circlestorm.org>

Wed, 21 Jan 2026 23:40:02 +0000 (00:40 +0100)
author Nathan Moinvaziri <nathan@nathanm.com>
Thu, 8 Jan 2026 07:51:08 +0000 (23:51 -0800)
committer Hans Kristian Rosbach <hk-github@circlestorm.org>
Wed, 21 Jan 2026 23:40:02 +0000 (00:40 +0100)
diff --git a/arch/loongarch/compare256_lsx.c b/arch/loongarch/compare256_lsx.c

index 72b40cdd47d43c171d53743848945e7a4bd278c2..4d23dee3c874ba50e9c4a5d6fdbececbd27fd04d 100644 (file)
--- a/arch/loongarch/compare256_lsx.c
+++ b/arch/loongarch/compare256_lsx.c
@@ -15,10 +15,6 @@
  #include "lsxintrin_ext.h"
  
  static inline uint32_t compare256_lsx_static(const uint8_t *src0, const uint8_t *src1) {
-    uint32_t len = 0;
-    int align_offset = ((uintptr_t)src0) & 15;
-    const uint8_t *end0 = src0 + 256;
-    const uint8_t *end1 = src1 + 256;
      __m128i xmm_src0, xmm_src1, xmm_cmp;
  
      /* Do the first load unaligned, then all subsequent ones we have at least
@@ -33,18 +29,20 @@ static inline uint32_t compare256_lsx_static(const uint8_t *src0, const uint8_t
       * since a lot of those uops are shared and fused */
      if (mask != 0xFFFF) {
          uint32_t match_byte = (uint32_t)__builtin_ctz(~mask);
-        return len + match_byte;
+        return match_byte;
      }
  
+    const uint8_t *last0 = src0 + 240;
+    const uint8_t *last1 = src1 + 240;
+
+    int align_offset = ((uintptr_t)src0) & 15;
      int align_adv = 16 - align_offset;
-    len += align_adv;
+    uint32_t len = align_adv;
+
      src0 += align_adv;
      src1 += align_adv;
  
-    /* Do a flooring division (should just be a shift right) */
-    int num_iter = (256 - len) / 16;
-
-    for (int i = 0; i < num_iter; ++i) {
+    for (int i = 0; i < 15; i++) {
          xmm_src0 = __lsx_vld(src0, 0);
          xmm_src1 = __lsx_vld(src1, 0);
          xmm_cmp = __lsx_vseq_b(xmm_src0, xmm_src1);
@@ -62,19 +60,15 @@ static inline uint32_t compare256_lsx_static(const uint8_t *src0, const uint8_t
      }
  
      if (align_offset) {
-        src0 = end0 - 16;
-        src1 = end1 - 16;
-        len = 256 - 16;
-
-        xmm_src0 = __lsx_vld(src0, 0);
-        xmm_src1 = __lsx_vld(src1, 0);
+        xmm_src0 = __lsx_vld(last0, 0);
+        xmm_src1 = __lsx_vld(last1, 0);
          xmm_cmp = __lsx_vseq_b(xmm_src0, xmm_src1);
  
          mask = (unsigned)lsx_movemask_b(xmm_cmp);
  
          if (mask != 0xFFFF) {
              uint32_t match_byte = (uint32_t)__builtin_ctz(~mask);
-            return len + match_byte;
+            return 240 + match_byte;
          }
      }
  
diff --git a/arch/x86/compare256_sse2.c b/arch/x86/compare256_sse2.c

index 25b65316a8b8b8cfa2f330631e34af5efbc7fb8f..1d539cb0d5e519667a4c472cbc2b7490e59eedf2 100644 (file)
--- a/arch/x86/compare256_sse2.c
+++ b/arch/x86/compare256_sse2.c
@@ -13,10 +13,6 @@
  #include <emmintrin.h>
  
  static inline uint32_t compare256_sse2_static(const uint8_t *src0, const uint8_t *src1) {
-    uint32_t len = 0;
-    int align_offset = ((uintptr_t)src0) & 15;
-    const uint8_t *end0 = src0 + 256;
-    const uint8_t *end1 = src1 + 256;
      __m128i xmm_src0, xmm_src1, xmm_cmp;
  
      /* Do the first load unaligned, then all subsequent ones we have at least
@@ -31,18 +27,20 @@ static inline uint32_t compare256_sse2_static(const uint8_t *src0, const uint8_t
       * since a lot of those uops are shared and fused */
      if (mask != 0xFFFF) {
          uint32_t match_byte = (uint32_t)__builtin_ctz(~mask);
-        return len + match_byte;
+        return match_byte;
      }
  
+    const uint8_t *last0 = src0 + 240;
+    const uint8_t *last1 = src1 + 240;
+
+    int align_offset = ((uintptr_t)src0) & 15;
      int align_adv = 16 - align_offset;
-    len += align_adv;
+    uint32_t len = align_adv;
+
      src0 += align_adv;
      src1 += align_adv;
  
-    /* Do a flooring division (should just be a shift right) */
-    int num_iter = (256 - len) / 16;
-
-    for (int i = 0; i < num_iter; ++i) {
+    for (int i = 0; i < 15; ++i) {
          xmm_src0 = _mm_load_si128((__m128i*)src0);
          xmm_src1 = _mm_loadu_si128((__m128i*)src1);
          xmm_cmp = _mm_cmpeq_epi8(xmm_src0, xmm_src1);
@@ -60,19 +58,15 @@ static inline uint32_t compare256_sse2_static(const uint8_t *src0, const uint8_t
      }
  
      if (align_offset) {
-        src0 = end0 - 16;
-        src1 = end1 - 16;
-        len = 256 - 16;
-
-        xmm_src0 = _mm_loadu_si128((__m128i*)src0);
-        xmm_src1 = _mm_loadu_si128((__m128i*)src1);
+        xmm_src0 = _mm_loadu_si128((__m128i*)last0);
+        xmm_src1 = _mm_loadu_si128((__m128i*)last1);
          xmm_cmp = _mm_cmpeq_epi8(xmm_src0, xmm_src1);
  
          mask = (unsigned)_mm_movemask_epi8(xmm_cmp);
  
          if (mask != 0xFFFF) {
              uint32_t match_byte = (uint32_t)__builtin_ctz(~mask);
-            return len + match_byte;
+            return 240 + match_byte;
          }
      }
author	Nathan Moinvaziri <nathan@nathanm.com>
	Thu, 8 Jan 2026 07:51:08 +0000 (23:51 -0800)
committer	Hans Kristian Rosbach <hk-github@circlestorm.org>
	Wed, 21 Jan 2026 23:40:02 +0000 (00:40 +0100)
arch/loongarch/compare256_lsx.c		patch \| blob \| blame \| history
arch/x86/compare256_sse2.c		patch \| blob \| blame \| history