git.ipfire.org Git - thirdparty/zlib-ng.git/commitdiff
Add LoongArch64 slide_hash implementation
author     Vladislav Shchapov <vladislav@shchapov.ru>
           Tue, 10 Jun 2025 15:35:02 +0000 (20:35 +0500)
committer  Hans Kristian Rosbach <hk-github@circlestorm.org>
           Fri, 11 Jul 2025 14:12:18 +0000 (16:12 +0200)
Signed-off-by: Vladislav Shchapov <vladislav@shchapov.ru>
CMakeLists.txt
arch/loongarch/Makefile.in
arch/loongarch/loongarch_functions.h
arch/loongarch/slide_hash_lasx.c [new file with mode: 0644]
arch/loongarch/slide_hash_lsx.c [new file with mode: 0644]
configure
functable.c
test/benchmarks/benchmark_slidehash.cc

diff --git a/CMakeLists.txt b/CMakeLists.txt
index fcb2416caa4cbc96816787605de5a7b51fd1e330..ca11c6085b7519abe8470addbc653952b8666fc6 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -1037,9 +1037,9 @@ if(WITH_OPTIM)
             check_lsx_intrinsics()
             if(HAVE_LSX_INTRIN)
                 add_definitions(-DLOONGARCH_LSX)
-                #set(LSX_SRCS ${ARCHDIR}/)
-                #list(APPEND ZLIB_ARCH_SRCS ${LSX_SRCS})
-                #set_property(SOURCE ${LSX_SRCS} PROPERTY COMPILE_FLAGS "${LSXFLAG} ${NOLTOFLAG}")
+                set(LSX_SRCS ${ARCHDIR}/slide_hash_lsx.c)
+                list(APPEND ZLIB_ARCH_SRCS ${LSX_SRCS})
+                set_property(SOURCE ${LSX_SRCS} PROPERTY COMPILE_FLAGS "${LSXFLAG} ${NOLTOFLAG}")
             else()
                 set(HAVE_LSX_INTRIN OFF)
             endif()
@@ -1049,9 +1049,9 @@ if(WITH_OPTIM)
             check_lasx_intrinsics()
             if(HAVE_LASX_INTRIN AND HAVE_LSX_INTRIN)
                 add_definitions(-DLOONGARCH_LASX)
-                #set(LASX_SRCS ${ARCHDIR}/)
-                #list(APPEND ZLIB_ARCH_SRCS ${LASX_SRCS})
-                #set_property(SOURCE ${LASX_SRCS} PROPERTY COMPILE_FLAGS "${LASXFLAG} ${NOLTOFLAG}")
+                set(LASX_SRCS ${ARCHDIR}/slide_hash_lasx.c)
+                list(APPEND ZLIB_ARCH_SRCS ${LASX_SRCS})
+                set_property(SOURCE ${LASX_SRCS} PROPERTY COMPILE_FLAGS "${LASXFLAG} ${NOLTOFLAG}")
             else()
                 set(HAVE_LASX_INTRIN OFF)
             endif()
diff --git a/arch/loongarch/Makefile.in b/arch/loongarch/Makefile.in
index c4d8252f54fccad64c89b359fd496b9f491c1102..9002c062b08a705aba1b6bca5a753788afc4abdc 100644
--- a/arch/loongarch/Makefile.in
+++ b/arch/loongarch/Makefile.in
@@ -19,7 +19,9 @@ TOPDIR=$(SRCTOP)
 
 all: \
        loongarch_features.o loongarch_features.lo \
-    crc32_la.o crc32_la.lo
+       crc32_la.o crc32_la.lo \
+       slide_hash_lasx.o slide_hash_lasx.lo \
+       slide_hash_lsx.o slide_hash_lsx.lo
 
 loongarch_features.o: $(SRCDIR)/loongarch_features.c
        $(CC) $(CFLAGS) $(INCLUDES) -c -o $@ $(SRCDIR)/loongarch_features.c
@@ -33,6 +35,18 @@ crc32_la.o: $(SRCDIR)/crc32_la.c
 crc32_la.lo: $(SRCDIR)/crc32_la.c
        $(CC) $(SFLAGS) -DPIC $(INCLUDES) -c -o $@ $(SRCDIR)/crc32_la.c
 
+slide_hash_lasx.o:
+       $(CC) $(CFLAGS) $(LASXFLAG) $(NOLTOFLAG) $(INCLUDES) -c -o $@ $(SRCDIR)/slide_hash_lasx.c
+
+slide_hash_lasx.lo:
+       $(CC) $(SFLAGS) $(LASXFLAG) $(NOLTOFLAG) -DPIC $(INCLUDES) -c -o $@ $(SRCDIR)/slide_hash_lasx.c
+
+slide_hash_lsx.o:
+       $(CC) $(CFLAGS) $(LSXFLAG) $(NOLTOFLAG) $(INCLUDES) -c -o $@ $(SRCDIR)/slide_hash_lsx.c
+
+slide_hash_lsx.lo:
+       $(CC) $(SFLAGS) $(LSXFLAG) $(NOLTOFLAG) -DPIC $(INCLUDES) -c -o $@ $(SRCDIR)/slide_hash_lsx.c
+
 mostlyclean: clean
 clean:
        rm -f *.o *.lo *~
diff --git a/arch/loongarch/loongarch_functions.h b/arch/loongarch/loongarch_functions.h
index 9e10ffb10713811ddc889d6325c9085b431b8f39..ce982469a29fd2df2504c06726bd4f11a91ef065 100644
--- a/arch/loongarch/loongarch_functions.h
+++ b/arch/loongarch/loongarch_functions.h
 uint32_t crc32_loongarch64(uint32_t crc, const uint8_t *buf, size_t len);
 #endif
 
+#ifdef LOONGARCH_LSX
+void slide_hash_lsx(deflate_state *s);
+#endif
+
+#ifdef LOONGARCH_LASX
+void slide_hash_lasx(deflate_state *s);
+#endif
+
 #ifdef DISABLE_RUNTIME_CPU_DETECTION
 // LOONGARCH - CRC32 - All known CPUs have crc instructions
 #  if defined(LOONGARCH_CRC)
@@ -19,8 +27,12 @@ uint32_t crc32_loongarch64(uint32_t crc, const uint8_t *buf, size_t len);
 #    define native_crc32 crc32_loongarch64
 #  endif
 #  if defined(LOONGARCH_LSX) && defined(__loongarch_sx)
+#    undef native_slide_hash
+#    define native_slide_hash slide_hash_lsx
 #  endif
 #  if defined(LOONGARCH_LASX) && defined(__loongarch_asx)
+#    undef native_slide_hash
+#    define native_slide_hash slide_hash_lasx
 #  endif
 #endif
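
A note on the DISABLE_RUNTIME_CPU_DETECTION block above: the LSX and LASX sections are evaluated in order, and each one re-#defines native_slide_hash, so when both ISAs are enabled at build time the wider LASX routine wins. A condensed sketch of the resulting override chain (the slide_hash_c baseline is assumed from zlib-ng's generic code and shown only for illustration):

    #define native_slide_hash slide_hash_c        /* generic baseline (assumed) */
    #if defined(LOONGARCH_LSX) && defined(__loongarch_sx)
    #  undef  native_slide_hash
    #  define native_slide_hash slide_hash_lsx    /* 128-bit LSX path */
    #endif
    #if defined(LOONGARCH_LASX) && defined(__loongarch_asx)
    #  undef  native_slide_hash
    #  define native_slide_hash slide_hash_lasx   /* 256-bit LASX path overrides LSX */
    #endif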
 
diff --git a/arch/loongarch/slide_hash_lasx.c b/arch/loongarch/slide_hash_lasx.c
new file mode 100644
index 0000000..0779d9a
--- /dev/null
+++ b/arch/loongarch/slide_hash_lasx.c
@@ -0,0 +1,41 @@
+/*
+ * LASX optimized hash slide, based on Intel AVX2 implementation
+ *
+ * Copyright (C) 2017 Intel Corporation
+ * Copyright (C) 2025 Vladislav Shchapov <vladislav@shchapov.ru>
+ * Authors:
+ *   Arjan van de Ven   <arjan@linux.intel.com>
+ *   Jim Kukunas        <james.t.kukunas@linux.intel.com>
+ *   Mika T. Lindqvist  <postmaster@raasu.org>
+ *
+ * For conditions of distribution and use, see copyright notice in zlib.h
+ */
+#include "zbuild.h"
+#include "deflate.h"
+
+#include <lasxintrin.h>
+
+static inline void slide_hash_chain(Pos *table, uint32_t entries, const __m256i wsize) {
+    table += entries;
+    table -= 16;
+
+    do {
+        __m256i value, result;
+
+        value = __lasx_xvld(table, 0);
+        result = __lasx_xvssub_hu(value, wsize);
+        __lasx_xvst(result, table, 0);
+
+        table -= 16;
+        entries -= 16;
+    } while (entries > 0);
+}
+
+Z_INTERNAL void slide_hash_lasx(deflate_state *s) {
+    Assert(s->w_size <= UINT16_MAX, "w_size should fit in uint16_t");
+    uint16_t wsize = (uint16_t)s->w_size;
+    const __m256i ymm_wsize = __lasx_xvreplgr2vr_h((short)wsize);
+
+    slide_hash_chain(s->head, HASH_SIZE, ymm_wsize);
+    slide_hash_chain(s->prev, wsize, ymm_wsize);
+}
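
For context: deflate calls slide_hash whenever it slides the window forward by w_size bytes, after which every entry in the head and prev hash tables must be reduced by w_size, with entries that would go negative clamped to zero ("no match in the current window"). __lasx_xvssub_hu performs exactly that saturating unsigned subtract on sixteen 16-bit lanes per 256-bit vector, which is why the loop steps 16 entries at a time. A scalar sketch of the same operation (illustration only, not part of the patch):

    static void slide_hash_chain_scalar(Pos *table, uint32_t entries, uint16_t wsize) {
        for (uint32_t i = 0; i < entries; i++) {
            /* Saturating subtract: positions outside the slid window become 0. */
            table[i] = (table[i] >= wsize) ? (Pos)(table[i] - wsize) : 0;
        }
    }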
diff --git a/arch/loongarch/slide_hash_lsx.c b/arch/loongarch/slide_hash_lsx.c
new file mode 100644
index 0000000..ad235c4
--- /dev/null
+++ b/arch/loongarch/slide_hash_lsx.c
@@ -0,0 +1,64 @@
+/*
+ * LSX optimized hash slide, based on Intel SSE implementation
+ *
+ * Copyright (C) 2017 Intel Corporation
+ * Copyright (C) 2025 Vladislav Shchapov <vladislav@shchapov.ru>
+ * Authors:
+ *   Arjan van de Ven   <arjan@linux.intel.com>
+ *   Jim Kukunas        <james.t.kukunas@linux.intel.com>
+ *
+ * For conditions of distribution and use, see copyright notice in zlib.h
+ */
+#include "zbuild.h"
+#include "deflate.h"
+
+#include <lsxintrin.h>
+#include <assert.h>
+
+static inline void slide_hash_chain(Pos *table0, Pos *table1, uint32_t entries0,
+                                    uint32_t entries1, const __m128i wsize) {
+    uint32_t entries;
+    Pos *table;
+    __m128i value0, value1, result0, result1;
+
+    int on_chain = 0;
+
+next_chain:
+    table = (on_chain) ? table1 : table0;
+    entries = (on_chain) ? entries1 : entries0;
+
+    table += entries;
+    table -= 16;
+
+    /* ZALLOC allocates this pointer unless the user chose a custom allocator.
+     * Our alloc function is aligned to 64 byte boundaries */
+    do {
+        value0 = __lsx_vld(table, 0);
+        value1 = __lsx_vld(table, 16);
+        result0 = __lsx_vssub_hu(value0, wsize);
+        result1 = __lsx_vssub_hu(value1, wsize);
+        __lsx_vst(result0, table, 0);
+        __lsx_vst(result1, table, 16);
+
+        table -= 16;
+        entries -= 16;
+    } while (entries > 0);
+
+    ++on_chain;
+    if (on_chain > 1) {
+        return;
+    } else {
+        goto next_chain;
+    }
+}
+
+Z_INTERNAL void slide_hash_lsx(deflate_state *s) {
+    Assert(s->w_size <= UINT16_MAX, "w_size should fit in uint16_t");
+    uint16_t wsize = (uint16_t)s->w_size;
+    const __m128i xmm_wsize = __lsx_vreplgr2vr_h((short)wsize);
+
+    assert(((uintptr_t)s->head & 15) == 0);
+    assert(((uintptr_t)s->prev & 15) == 0);
+
+    slide_hash_chain(s->head, s->prev, HASH_SIZE, wsize, xmm_wsize);
+}
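
Unlike the LASX version, which is invoked once per table, the LSX routine walks both tables in a single call; the goto-driven second pass mirrors the Intel SSE2 code it is based on. Each iteration still covers 16 entries (two 128-bit loads of eight 16-bit positions each), and because entries is unsigned and tested with entries > 0 after subtracting 16, both HASH_SIZE and w_size must be multiples of 16, which holds in zlib-ng since both are powers of two well above 16. Functionally, the goto structure is equivalent to a single-table helper called twice (a sketch, not part of the patch):

    static inline void slide_one_chain(Pos *table, uint32_t entries, const __m128i wsize) {
        table += entries;
        do {
            table -= 16;
            /* Two 128-bit halves per step, covering entries [n-16, n-1]. */
            __lsx_vst(__lsx_vssub_hu(__lsx_vld(table, 0), wsize), table, 0);
            __lsx_vst(__lsx_vssub_hu(__lsx_vld(table, 16), wsize), table, 16);
            entries -= 16;
        } while (entries > 0);
    }

    /* slide_hash_lsx would then reduce to:
     *     slide_one_chain(s->head, HASH_SIZE, xmm_wsize);
     *     slide_one_chain(s->prev, wsize, xmm_wsize);
     */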
diff --git a/configure b/configure
index 48ee775e7b11a987ade17277d3156429831a1928..f633de784494b994bb00719fe190847e879e76f7 100755
--- a/configure
+++ b/configure
@@ -2316,6 +2316,8 @@ EOF
                 CFLAGS="${CFLAGS} -DLOONGARCH_LSX"
                 SFLAGS="${SFLAGS} -DLOONGARCH_LSX"
 
+                ARCH_STATIC_OBJS="${ARCH_STATIC_OBJS} slide_hash_lsx.o"
+                ARCH_SHARED_OBJS="${ARCH_SHARED_OBJS} slide_hash_lsx.lo"
             fi
 
             check_lasx_intrinsics
@@ -2323,6 +2325,8 @@ EOF
                 CFLAGS="${CFLAGS} -DLOONGARCH_LASX"
                 SFLAGS="${SFLAGS} -DLOONGARCH_LASX"
 
+                ARCH_STATIC_OBJS="${ARCH_STATIC_OBJS} slide_hash_lasx.o"
+                ARCH_SHARED_OBJS="${ARCH_SHARED_OBJS} slide_hash_lasx.lo"
             fi
         fi
     ;;
diff --git a/functable.c b/functable.c
index ac25c9151c72300f979d66174cd197fe70202d57..1c7c679b2c8b2a4dc27fe07b07f50738f14d5c88 100644
--- a/functable.c
+++ b/functable.c
@@ -275,6 +275,16 @@ static void init_functable(void) {
         ft.crc32 = crc32_loongarch64;
     }
 #endif
+#ifdef LOONGARCH_LSX
+    if (cf.loongarch.has_lsx) {
+        ft.slide_hash = slide_hash_lsx;
+    }
+#endif
+#ifdef LOONGARCH_LASX
+    if (cf.loongarch.has_lasx) {
+        ft.slide_hash = slide_hash_lasx;
+    }
+#endif
 
     // Assign function pointers individually for atomic operation
     FUNCTABLE_ASSIGN(ft, force_init);
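
The functable is zlib-ng's runtime-dispatch layer: the CPU is probed once and one pointer per operation is published. Since the LASX branch is tested after the LSX branch, ft.slide_hash ends up on the 256-bit routine when the CPU reports both extensions. The overall shape of the mechanism, sketched here as an illustration of the pattern rather than zlib-ng's exact code:

    /* Each entry initially points at a stub; the first call resolves the
     * whole table, then re-dispatches through the freshly assigned pointer. */
    static void slide_hash_stub(deflate_state *s) {
        init_functable();           /* one-time CPU detection and assignment */
        functable.slide_hash(s);    /* call the selected implementation */
    }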
diff --git a/test/benchmarks/benchmark_slidehash.cc b/test/benchmarks/benchmark_slidehash.cc
index 4e9b20ee3fa028b6fe287bb0f8bd59feadd8336d..9d98420b16724b620168496e6a5fa79901ff737c 100644
--- a/test/benchmarks/benchmark_slidehash.cc
+++ b/test/benchmarks/benchmark_slidehash.cc
@@ -95,5 +95,11 @@ BENCHMARK_SLIDEHASH(sse2, slide_hash_sse2, test_cpu_features.x86.has_sse2);
 #ifdef X86_AVX2
 BENCHMARK_SLIDEHASH(avx2, slide_hash_avx2, test_cpu_features.x86.has_avx2);
 #endif
+#ifdef LOONGARCH_LSX
+BENCHMARK_SLIDEHASH(lsx, slide_hash_lsx, test_cpu_features.loongarch.has_lsx);
+#endif
+#ifdef LOONGARCH_LASX
+BENCHMARK_SLIDEHASH(lasx, slide_hash_lasx, test_cpu_features.loongarch.has_lasx);
+#endif
 
 #endif