Add LoongArch64 compare256, longest_match, longest_match_slow implementation
author    Vladislav Shchapov <vladislav@shchapov.ru>
          Thu, 12 Jun 2025 10:25:23 +0000 (15:25 +0500)
committer Hans Kristian Rosbach <hk-github@circlestorm.org>
          Fri, 11 Jul 2025 14:12:18 +0000 (16:12 +0200)
Signed-off-by: Vladislav Shchapov <vladislav@shchapov.ru>
CMakeLists.txt
arch/loongarch/Makefile.in
arch/loongarch/compare256_lasx.c [new file with mode: 0644]
arch/loongarch/compare256_lsx.c [new file with mode: 0644]
arch/loongarch/lasxintrin_ext.h [new file with mode: 0644]
arch/loongarch/loongarch_functions.h
arch/loongarch/lsxintrin_ext.h [new file with mode: 0644]
configure
functable.c
test/benchmarks/benchmark_compare256.cc
test/test_compare256.cc
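
For readers unfamiliar with the helpers: compare256 returns how many of the first 256 bytes of two buffers are identical, and longest_match/longest_match_slow use it in deflate's match search. A minimal portable sketch of that contract (illustrative only, not part of this commit; compare256_ref is a made-up name):

    #include <stdint.h>

    /* Reference behaviour: count matching leading bytes, capped at 256.
     * The LSX/LASX kernels below compute the same result 16 or 32 bytes at a time. */
    static uint32_t compare256_ref(const uint8_t *src0, const uint8_t *src1) {
        uint32_t len = 0;
        while (len < 256 && src0[len] == src1[len])
            len++;
        return len;
    }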

diff --git a/CMakeLists.txt b/CMakeLists.txt
index ca11c6085b7519abe8470addbc653952b8666fc6..44ad4adfc6d63e20f6cef487eaba187868162cbc 100644 (file)
@@ -1037,7 +1037,7 @@ if(WITH_OPTIM)
             check_lsx_intrinsics()
             if(HAVE_LSX_INTRIN)
                 add_definitions(-DLOONGARCH_LSX)
-                set(LSX_SRCS ${ARCHDIR}/slide_hash_lsx.c)
+                set(LSX_SRCS ${ARCHDIR}/compare256_lsx.c ${ARCHDIR}/slide_hash_lsx.c)
                 list(APPEND ZLIB_ARCH_SRCS ${LSX_SRCS})
                 set_property(SOURCE ${LSX_SRCS} PROPERTY COMPILE_FLAGS "${LSXFLAG} ${NOLTOFLAG}")
             else()
@@ -1049,7 +1049,7 @@ if(WITH_OPTIM)
             check_lasx_intrinsics()
             if(HAVE_LASX_INTRIN AND HAVE_LSX_INTRIN)
                 add_definitions(-DLOONGARCH_LASX)
-                set(LASX_SRCS ${ARCHDIR}/slide_hash_lasx.c)
+                set(LASX_SRCS ${ARCHDIR}/compare256_lasx.c ${ARCHDIR}/slide_hash_lasx.c)
                 list(APPEND ZLIB_ARCH_SRCS ${LASX_SRCS})
                 set_property(SOURCE ${LASX_SRCS} PROPERTY COMPILE_FLAGS "${LASXFLAG} ${NOLTOFLAG}")
             else()
diff --git a/arch/loongarch/Makefile.in b/arch/loongarch/Makefile.in
index 9002c062b08a705aba1b6bca5a753788afc4abdc..c62851b6848cdd55366b1f99fe5e44aa23fd7973 100644 (file)
@@ -20,6 +20,8 @@ TOPDIR=$(SRCTOP)
 all: \
        loongarch_features.o loongarch_features.lo \
        crc32_la.o crc32_la.lo \
+       compare256_lasx.o compare256_lasx.lo \
+       compare256_lsx.o compare256_lsx.lo \
        slide_hash_lasx.o slide_hash_lasx.lo \
        slide_hash_lsx.o slide_hash_lsx.lo
 
@@ -35,6 +37,18 @@ crc32_la.o: $(SRCDIR)/crc32_la.c
 crc32_la.lo: $(SRCDIR)/crc32_la.c
        $(CC) $(SFLAGS) -DPIC $(INCLUDES) -c -o $@ $(SRCDIR)/crc32_la.c
 
+compare256_lasx.o:
+       $(CC) $(CFLAGS) $(LASXFLAG) $(NOLTOFLAG) $(INCLUDES) -c -o $@ $(SRCDIR)/compare256_lasx.c
+
+compare256_lasx.lo:
+       $(CC) $(SFLAGS) $(LASXFLAG) $(NOLTOFLAG) -DPIC $(INCLUDES) -c -o $@ $(SRCDIR)/compare256_lasx.c
+
+compare256_lsx.o:
+       $(CC) $(CFLAGS) $(LSXFLAG) $(NOLTOFLAG) $(INCLUDES) -c -o $@ $(SRCDIR)/compare256_lsx.c
+
+compare256_lsx.lo:
+       $(CC) $(SFLAGS) $(LSXFLAG) $(NOLTOFLAG) -DPIC $(INCLUDES) -c -o $@ $(SRCDIR)/compare256_lsx.c
+
 slide_hash_lasx.o:
        $(CC) $(CFLAGS) $(LASXFLAG) $(NOLTOFLAG) $(INCLUDES) -c -o $@ $(SRCDIR)/slide_hash_lasx.c
 
diff --git a/arch/loongarch/compare256_lasx.c b/arch/loongarch/compare256_lasx.c
new file mode 100644 (file)
index 0000000..7cc05d9
--- /dev/null
@@ -0,0 +1,63 @@
+/* compare256_lasx.c -- LASX version of compare256, based on Intel AVX2 implementation
+ * Copyright Mika T. Lindqvist  <postmaster@raasu.org>
+ * Copyright (C) 2025 Vladislav Shchapov <vladislav@shchapov.ru>
+ * For conditions of distribution and use, see copyright notice in zlib.h
+ */
+
+#include "zbuild.h"
+#include "zmemory.h"
+#include "deflate.h"
+#include "fallback_builtins.h"
+
+#if defined(LOONGARCH_LASX) && defined(HAVE_BUILTIN_CTZ)
+
+#include <lasxintrin.h>
+#include "lasxintrin_ext.h"
+
+static inline uint32_t compare256_lasx_static(const uint8_t *src0, const uint8_t *src1) {
+    uint32_t len = 0;
+
+    do {
+        __m256i ymm_src0, ymm_src1, ymm_cmp;
+        ymm_src0 = __lasx_xvld(src0, 0);
+        ymm_src1 = __lasx_xvld(src1, 0);
+        ymm_cmp = __lasx_xvseq_b(ymm_src0, ymm_src1); /* non-identical bytes = 00, identical bytes = FF */
+        unsigned mask = (unsigned)lasx_movemask_b(ymm_cmp);
+        if (mask != 0xFFFFFFFF) {
+            uint32_t match_byte = (uint32_t)__builtin_ctz(~mask); /* Invert bits so identical = 0 */
+            return len + match_byte;
+        }
+
+        src0 += 32, src1 += 32, len += 32;
+
+        ymm_src0 = __lasx_xvld(src0, 0);
+        ymm_src1 = __lasx_xvld(src1, 0);
+        ymm_cmp = __lasx_xvseq_b(ymm_src0, ymm_src1);
+        mask = (unsigned)lasx_movemask_b(ymm_cmp);
+        if (mask != 0xFFFFFFFF) {
+            uint32_t match_byte = (uint32_t)__builtin_ctz(~mask);
+            return len + match_byte;
+        }
+
+        src0 += 32, src1 += 32, len += 32;
+    } while (len < 256);
+
+    return 256;
+}
+
+Z_INTERNAL uint32_t compare256_lasx(const uint8_t *src0, const uint8_t *src1) {
+    return compare256_lasx_static(src0, src1);
+}
+
+#define LONGEST_MATCH       longest_match_lasx
+#define COMPARE256          compare256_lasx_static
+
+#include "match_tpl.h"
+
+#define LONGEST_MATCH_SLOW
+#define LONGEST_MATCH       longest_match_slow_lasx
+#define COMPARE256          compare256_lasx_static
+
+#include "match_tpl.h"
+
+#endif
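
In both kernels the xvseq_b/vseq_b comparison sets identical bytes to 0xFF, the movemask helper packs one bit per byte, and the first mismatching byte is therefore the lowest clear bit of the mask, found with __builtin_ctz on the inverted mask. A standalone illustration of that step (assumes a compiler providing __builtin_ctz, which is what the HAVE_BUILTIN_CTZ guard checks):

    #include <stdio.h>

    int main(void) {
        /* Example: bytes 0..4 matched, byte 5 differed -> bits 0..4 set, bit 5 clear
         * (16-bit LSX mask shown; the LASX path works the same on a 32-bit mask). */
        unsigned mask = 0x001Fu;
        unsigned first_mismatch = (unsigned)__builtin_ctz(~mask); /* lowest clear bit */
        printf("first mismatching byte: %u\n", first_mismatch);   /* prints 5 */
        return 0;
    }
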
diff --git a/arch/loongarch/compare256_lsx.c b/arch/loongarch/compare256_lsx.c
new file mode 100644 (file)
index 0000000..72b40cd
--- /dev/null
@@ -0,0 +1,99 @@
+/* compare256_lsx.c -- LSX version of compare256, based on Intel SSE implementation
+ * Copyright Adam Stylinski <kungfujesus06@gmail.com>
+ * Copyright (C) 2025 Vladislav Shchapov <vladislav@shchapov.ru>
+ * For conditions of distribution and use, see copyright notice in zlib.h
+ */
+
+#include "zbuild.h"
+#include "zmemory.h"
+#include "deflate.h"
+#include "fallback_builtins.h"
+
+#if defined(LOONGARCH_LSX) && defined(HAVE_BUILTIN_CTZ)
+
+#include <lsxintrin.h>
+#include "lsxintrin_ext.h"
+
+static inline uint32_t compare256_lsx_static(const uint8_t *src0, const uint8_t *src1) {
+    uint32_t len = 0;
+    int align_offset = ((uintptr_t)src0) & 15;
+    const uint8_t *end0 = src0 + 256;
+    const uint8_t *end1 = src1 + 256;
+    __m128i xmm_src0, xmm_src1, xmm_cmp;
+
+    /* Do the first load unaligned; then every subsequent iteration has at least
+     * one aligned load. Sadly, aligning both loads is probably unrealistic. */
+    xmm_src0 = __lsx_vld(src0, 0);
+    xmm_src1 = __lsx_vld(src1, 0);
+    xmm_cmp = __lsx_vseq_b(xmm_src0, xmm_src1);
+
+    unsigned mask = (unsigned)lsx_movemask_b(xmm_cmp);
+
+    /* Compiler _may_ turn this branch into a ptest + movemask,
+     * since a lot of those uops are shared and fused */
+    if (mask != 0xFFFF) {
+        uint32_t match_byte = (uint32_t)__builtin_ctz(~mask);
+        return len + match_byte;
+    }
+
+    int align_adv = 16 - align_offset;
+    len += align_adv;
+    src0 += align_adv;
+    src1 += align_adv;
+
+    /* Do a flooring division (should just be a shift right) */
+    int num_iter = (256 - len) / 16;
+
+    for (int i = 0; i < num_iter; ++i) {
+        xmm_src0 = __lsx_vld(src0, 0);
+        xmm_src1 = __lsx_vld(src1, 0);
+        xmm_cmp = __lsx_vseq_b(xmm_src0, xmm_src1);
+
+        mask = (unsigned)lsx_movemask_b(xmm_cmp);
+
+        /* Compiler _may_ turn this branch into a ptest + movemask,
+         * since a lot of those uops are shared and fused */
+        if (mask != 0xFFFF) {
+            uint32_t match_byte = (uint32_t)__builtin_ctz(~mask);
+            return len + match_byte;
+        }
+
+        len += 16, src0 += 16, src1 += 16;
+    }
+
+    if (align_offset) {
+        src0 = end0 - 16;
+        src1 = end1 - 16;
+        len = 256 - 16;
+
+        xmm_src0 = __lsx_vld(src0, 0);
+        xmm_src1 = __lsx_vld(src1, 0);
+        xmm_cmp = __lsx_vseq_b(xmm_src0, xmm_src1);
+
+        mask = (unsigned)lsx_movemask_b(xmm_cmp);
+
+        if (mask != 0xFFFF) {
+            uint32_t match_byte = (uint32_t)__builtin_ctz(~mask);
+            return len + match_byte;
+        }
+    }
+
+    return 256;
+}
+
+Z_INTERNAL uint32_t compare256_lsx(const uint8_t *src0, const uint8_t *src1) {
+    return compare256_lsx_static(src0, src1);
+}
+
+#define LONGEST_MATCH       longest_match_lsx
+#define COMPARE256          compare256_lsx_static
+
+#include "match_tpl.h"
+
+#define LONGEST_MATCH_SLOW
+#define LONGEST_MATCH       longest_match_slow_lsx
+#define COMPARE256          compare256_lsx_static
+
+#include "match_tpl.h"
+
+#endif
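
The LSX variant trades a little bookkeeping for aligned loads: one unaligned head load, an advance that aligns src0 to 16 bytes, whole aligned 16-byte iterations, and, when the head was misaligned, one final overlapping load at end0 - 16. A quick check of that arithmetic (hypothetical misalignment of 5 bytes):

    #include <stdio.h>

    int main(void) {
        int align_offset = 5;              /* hypothetical (uintptr_t)src0 & 15 */
        int len = 0;

        int align_adv = 16 - align_offset; /* bytes covered by the unaligned head load */
        len += align_adv;                  /* len = 11 */

        int num_iter = (256 - len) / 16;   /* 245 / 16 = 15 aligned iterations */
        len += num_iter * 16;              /* len = 251 */

        /* The remaining 256 - len = 5 bytes are handled by re-reading the last
         * 16 bytes at end0 - 16, with len reset to 240 for that final check. */
        printf("head=%d iters=%d covered=%d tail=%d\n",
               align_adv, num_iter, len, 256 - len);
        return 0;
    }
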
diff --git a/arch/loongarch/lasxintrin_ext.h b/arch/loongarch/lasxintrin_ext.h
new file mode 100644 (file)
index 0000000..920c143
--- /dev/null
@@ -0,0 +1,16 @@
+/* lasxintrin_ext.h
+ * Copyright (C) 2025 Vladislav Shchapov <vladislav@shchapov.ru>
+ * For conditions of distribution and use, see copyright notice in zlib.h
+ */
+#ifndef LASXINTRIN_EXT_H
+#define LASXINTRIN_EXT_H
+
+#include <lasxintrin.h>
+
+
+static inline int lasx_movemask_b(__m256i v) {
+    v = __lasx_xvmskltz_b(v);
+    return __lasx_xvpickve2gr_w(v, 0) | (__lasx_xvpickve2gr_w(v, 4) << 16);
+}
+
+#endif // include guard LASXINTRIN_EXT_H
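
The helper emulates an SSE/AVX2-style byte movemask: __lasx_xvmskltz_b collects the sign bit of each byte into the lowest word of each 128-bit half, and the two 16-bit results are read back from word lanes 0 and 4 and combined into one 32-bit mask. A scalar model of the value the helper is expected to return (reference code, not from this commit):

    #include <stdint.h>

    /* Bit i of the result is the most significant bit of byte i of the vector. */
    static uint32_t movemask_b_ref(const uint8_t bytes[32]) {
        uint32_t mask = 0;
        for (int i = 0; i < 32; i++)
            mask |= (uint32_t)(bytes[i] >> 7) << i;
        return mask;
    }
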
diff --git a/arch/loongarch/loongarch_functions.h b/arch/loongarch/loongarch_functions.h
index e73c8e6648650233585612869823e85fa07e74ef..afdf87e7ac8cde02665554deeda484c9f0e8b5d5 100644 (file)
@@ -16,10 +16,20 @@ void     crc32_fold_loongarch64(crc32_fold *crc, const uint8_t *src, size_t len,
 
 #ifdef LOONGARCH_LSX
 void slide_hash_lsx(deflate_state *s);
+#  ifdef HAVE_BUILTIN_CTZ
+    uint32_t compare256_lsx(const uint8_t *src0, const uint8_t *src1);
+    uint32_t longest_match_lsx(deflate_state *const s, Pos cur_match);
+    uint32_t longest_match_slow_lsx(deflate_state *const s, Pos cur_match);
+#  endif
 #endif
 
 #ifdef LOONGARCH_LASX
 void slide_hash_lasx(deflate_state *s);
+#  ifdef HAVE_BUILTIN_CTZ
+    uint32_t compare256_lasx(const uint8_t *src0, const uint8_t *src1);
+    uint32_t longest_match_lasx(deflate_state *const s, Pos cur_match);
+    uint32_t longest_match_slow_lasx(deflate_state *const s, Pos cur_match);
+#  endif
 #endif
 
 #ifdef DISABLE_RUNTIME_CPU_DETECTION
@@ -35,10 +45,26 @@ void slide_hash_lasx(deflate_state *s);
 #  if defined(LOONGARCH_LSX) && defined(__loongarch_sx)
 #    undef native_slide_hash
 #    define native_slide_hash slide_hash_lsx
+#    ifdef HAVE_BUILTIN_CTZ
+#      undef native_compare256
+#      define native_compare256 compare256_lsx
+#      undef native_longest_match
+#      define native_longest_match longest_match_lsx
+#      undef native_longest_match_slow
+#      define native_longest_match_slow longest_match_slow_lsx
+#    endif
 #  endif
 #  if defined(LOONGARCH_LASX) && defined(__loongarch_asx)
 #    undef native_slide_hash
 #    define native_slide_hash slide_hash_lasx
+#    ifdef HAVE_BUILTIN_CTZ
+#      undef native_compare256
+#      define native_compare256 compare256_lasx
+#      undef native_longest_match
+#      define native_longest_match longest_match_lasx
+#      undef native_longest_match_slow
+#      define native_longest_match_slow longest_match_slow_lasx
+#    endif
 #  endif
 #endif
 
diff --git a/arch/loongarch/lsxintrin_ext.h b/arch/loongarch/lsxintrin_ext.h
new file mode 100644 (file)
index 0000000..d2766fd
--- /dev/null
@@ -0,0 +1,15 @@
+/* lsxintrin_ext.h
+ * Copyright (C) 2025 Vladislav Shchapov <vladislav@shchapov.ru>
+ * For conditions of distribution and use, see copyright notice in zlib.h
+ */
+#ifndef LSXINTRIN_EXT_H
+#define LSXINTRIN_EXT_H
+
+#include <lsxintrin.h>
+
+
+static inline int lsx_movemask_b(__m128i v) {
+    return __lsx_vpickve2gr_w(__lsx_vmskltz_b(v), 0);
+}
+
+#endif // include guard LSXINTRIN_EXT_H
diff --git a/configure b/configure
index f633de784494b994bb00719fe190847e879e76f7..107d864e4a413f79ec94542a5e62602cfea6c785 100755 (executable)
--- a/configure
+++ b/configure
@@ -2316,8 +2316,8 @@ EOF
                 CFLAGS="${CFLAGS} -DLOONGARCH_LSX"
                 SFLAGS="${SFLAGS} -DLOONGARCH_LSX"
 
-                ARCH_STATIC_OBJS="${ARCH_STATIC_OBJS} slide_hash_lsx.o"
-                ARCH_SHARED_OBJS="${ARCH_SHARED_OBJS} slide_hash_lsx.lo"
+                ARCH_STATIC_OBJS="${ARCH_STATIC_OBJS} compare256_lsx.o slide_hash_lsx.o"
+                ARCH_SHARED_OBJS="${ARCH_SHARED_OBJS} compare256_lsx.lo slide_hash_lsx.lo"
             fi
 
             check_lasx_intrinsics
@@ -2325,8 +2325,8 @@ EOF
                 CFLAGS="${CFLAGS} -DLOONGARCH_LASX"
                 SFLAGS="${SFLAGS} -DLOONGARCH_LASX"
 
-                ARCH_STATIC_OBJS="${ARCH_STATIC_OBJS} slide_hash_lasx.o"
-                ARCH_SHARED_OBJS="${ARCH_SHARED_OBJS} slide_hash_lasx.lo"
+                ARCH_STATIC_OBJS="${ARCH_STATIC_OBJS} compare256_lasx.o slide_hash_lasx.o"
+                ARCH_SHARED_OBJS="${ARCH_SHARED_OBJS} compare256_lasx.lo slide_hash_lasx.lo"
             fi
         fi
     ;;
diff --git a/functable.c b/functable.c
index 1903310e7a597cfe940130422862add66ddcc1a5..abb82cabea764a2afa5fb8ee2bc406a3efc7b376 100644 (file)
@@ -280,11 +280,21 @@ static void init_functable(void) {
 #ifdef LOONGARCH_LSX
     if (cf.loongarch.has_lsx) {
         ft.slide_hash = slide_hash_lsx;
+#  ifdef HAVE_BUILTIN_CTZ
+        ft.compare256 = &compare256_lsx;
+        ft.longest_match = &longest_match_lsx;
+        ft.longest_match_slow = &longest_match_slow_lsx;
+#  endif
     }
 #endif
 #ifdef LOONGARCH_LASX
     if (cf.loongarch.has_lasx) {
         ft.slide_hash = slide_hash_lasx;
+#  ifdef HAVE_BUILTIN_CTZ
+        ft.compare256 = &compare256_lasx;
+        ft.longest_match = &longest_match_lasx;
+        ft.longest_match_slow = &longest_match_slow_lasx;
+#  endif
     }
 #endif
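
The functable change follows the existing init_functable pattern: each feature block overwrites the default pointers, and since the LASX block runs after the LSX one, the widest available implementation wins. A condensed, self-contained sketch of that precedence (stub functions and flags are illustrative, not zlib-ng's real definitions):

    #include <stdint.h>
    #include <stdio.h>

    /* Stand-ins for the real kernels, only to show the selection order. */
    static uint32_t compare256_c_stub(const uint8_t *a, const uint8_t *b)    { (void)a; (void)b; return 0; }
    static uint32_t compare256_lsx_stub(const uint8_t *a, const uint8_t *b)  { (void)a; (void)b; return 0; }
    static uint32_t compare256_lasx_stub(const uint8_t *a, const uint8_t *b) { (void)a; (void)b; return 0; }

    int main(void) {
        uint32_t (*compare256)(const uint8_t *, const uint8_t *) = compare256_c_stub;
        int has_lsx = 1, has_lasx = 1;                   /* pretend CPU feature flags */

        if (has_lsx)  compare256 = compare256_lsx_stub;
        if (has_lasx) compare256 = compare256_lasx_stub; /* later block wins, as in init_functable */

        printf("selected: %s\n", compare256 == compare256_lasx_stub ? "lasx" : "lsx/c");
        return 0;
    }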
 
diff --git a/test/benchmarks/benchmark_compare256.cc b/test/benchmarks/benchmark_compare256.cc
index 8ed2d0eb3dbd4e272c3543b21e431437db14ecc7..689aa6e934f9692f08dfef432cc4e9904208eecb 100644 (file)
@@ -92,5 +92,11 @@ BENCHMARK_COMPARE256(power9, compare256_power9, test_cpu_features.power.has_arch
 #ifdef RISCV_RVV
 BENCHMARK_COMPARE256(rvv, compare256_rvv, test_cpu_features.riscv.has_rvv);
 #endif
+#if defined(LOONGARCH_LSX) && defined(HAVE_BUILTIN_CTZ)
+BENCHMARK_COMPARE256(lsx, compare256_lsx, test_cpu_features.loongarch.has_lsx);
+#endif
+#if defined(LOONGARCH_LASX) && defined(HAVE_BUILTIN_CTZ)
+BENCHMARK_COMPARE256(lasx, compare256_lasx, test_cpu_features.loongarch.has_lasx);
+#endif
 
 #endif
diff --git a/test/test_compare256.cc b/test/test_compare256.cc
index f367cd0f4eaa5d107677f5ce23333fa02568f789..1b52082ed032817d1cf132312b6834f1c25e6b33 100644 (file)
@@ -91,5 +91,11 @@ TEST_COMPARE256(power9, compare256_power9, test_cpu_features.power.has_arch_3_00
 #ifdef RISCV_RVV
 TEST_COMPARE256(rvv, compare256_rvv, test_cpu_features.riscv.has_rvv)
 #endif
+#if defined(LOONGARCH_LSX) && defined(HAVE_BUILTIN_CTZ)
+TEST_COMPARE256(lsx, compare256_lsx, test_cpu_features.loongarch.has_lsx)
+#endif
+#if defined(LOONGARCH_LASX) && defined(HAVE_BUILTIN_CTZ)
+TEST_COMPARE256(lasx, compare256_lasx, test_cpu_features.loongarch.has_lasx)
+#endif
 
 #endif