From: Vladislav Shchapov
Date: Tue, 10 Jun 2025 15:35:02 +0000 (+0500)
Subject: Add LoongArch64 slide_hash implementation
X-Git-Url: http://git.ipfire.org/cgi-bin/gitweb.cgi?a=commitdiff_plain;h=0de04c10d6d89f02845e549882345d22cd6dbcef;p=thirdparty%2Fzlib-ng.git

Add LoongArch64 slide_hash implementation

Signed-off-by: Vladislav Shchapov
---
diff --git a/CMakeLists.txt b/CMakeLists.txt
index fcb2416c..ca11c608 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -1037,9 +1037,9 @@ if(WITH_OPTIM)
         check_lsx_intrinsics()
         if(HAVE_LSX_INTRIN)
             add_definitions(-DLOONGARCH_LSX)
-            #set(LSX_SRCS ${ARCHDIR}/)
-            #list(APPEND ZLIB_ARCH_SRCS ${LSX_SRCS})
-            #set_property(SOURCE ${LSX_SRCS} PROPERTY COMPILE_FLAGS "${LSXFLAG} ${NOLTOFLAG}")
+            set(LSX_SRCS ${ARCHDIR}/slide_hash_lsx.c)
+            list(APPEND ZLIB_ARCH_SRCS ${LSX_SRCS})
+            set_property(SOURCE ${LSX_SRCS} PROPERTY COMPILE_FLAGS "${LSXFLAG} ${NOLTOFLAG}")
         else()
             set(HAVE_LSX_INTRIN OFF)
         endif()
@@ -1049,9 +1049,9 @@ if(WITH_OPTIM)
         check_lasx_intrinsics()
         if(HAVE_LASX_INTRIN AND HAVE_LSX_INTRIN)
             add_definitions(-DLOONGARCH_LASX)
-            #set(LASX_SRCS ${ARCHDIR}/)
-            #list(APPEND ZLIB_ARCH_SRCS ${LASX_SRCS})
-            #set_property(SOURCE ${LASX_SRCS} PROPERTY COMPILE_FLAGS "${LASXFLAG} ${NOLTOFLAG}")
+            set(LASX_SRCS ${ARCHDIR}/slide_hash_lasx.c)
+            list(APPEND ZLIB_ARCH_SRCS ${LASX_SRCS})
+            set_property(SOURCE ${LASX_SRCS} PROPERTY COMPILE_FLAGS "${LASXFLAG} ${NOLTOFLAG}")
         else()
             set(HAVE_LASX_INTRIN OFF)
         endif()
diff --git a/arch/loongarch/Makefile.in b/arch/loongarch/Makefile.in
index c4d8252f..9002c062 100644
--- a/arch/loongarch/Makefile.in
+++ b/arch/loongarch/Makefile.in
@@ -19,7 +19,9 @@ TOPDIR=$(SRCTOP)
 
 all: \
 	loongarch_features.o loongarch_features.lo \
-	crc32_la.o crc32_la.lo
+	crc32_la.o crc32_la.lo \
+	slide_hash_lasx.o slide_hash_lasx.lo \
+	slide_hash_lsx.o slide_hash_lsx.lo
 
 loongarch_features.o: $(SRCDIR)/loongarch_features.c
 	$(CC) $(CFLAGS) $(INCLUDES) -c -o $@ $(SRCDIR)/loongarch_features.c
@@ -33,6 +35,18 @@ crc32_la.o: $(SRCDIR)/crc32_la.c
 crc32_la.lo: $(SRCDIR)/crc32_la.c
 	$(CC) $(SFLAGS) -DPIC $(INCLUDES) -c -o $@ $(SRCDIR)/crc32_la.c
 
+slide_hash_lasx.o:
+	$(CC) $(CFLAGS) $(LASXFLAG) $(NOLTOFLAG) $(INCLUDES) -c -o $@ $(SRCDIR)/slide_hash_lasx.c
+
+slide_hash_lasx.lo:
+	$(CC) $(SFLAGS) $(LASXFLAG) $(NOLTOFLAG) -DPIC $(INCLUDES) -c -o $@ $(SRCDIR)/slide_hash_lasx.c
+
+slide_hash_lsx.o:
+	$(CC) $(CFLAGS) $(LSXFLAG) $(NOLTOFLAG) $(INCLUDES) -c -o $@ $(SRCDIR)/slide_hash_lsx.c
+
+slide_hash_lsx.lo:
+	$(CC) $(SFLAGS) $(LSXFLAG) $(NOLTOFLAG) -DPIC $(INCLUDES) -c -o $@ $(SRCDIR)/slide_hash_lsx.c
+
 mostlyclean: clean
 clean:
 	rm -f *.o *.lo *~
diff --git a/arch/loongarch/loongarch_functions.h b/arch/loongarch/loongarch_functions.h
index 9e10ffb1..ce982469 100644
--- a/arch/loongarch/loongarch_functions.h
+++ b/arch/loongarch/loongarch_functions.h
@@ -12,6 +12,14 @@
 uint32_t crc32_loongarch64(uint32_t crc, const uint8_t *buf, size_t len);
 #endif
 
+#ifdef LOONGARCH_LSX
+void slide_hash_lsx(deflate_state *s);
+#endif
+
+#ifdef LOONGARCH_LASX
+void slide_hash_lasx(deflate_state *s);
+#endif
+
 #ifdef DISABLE_RUNTIME_CPU_DETECTION
 // LOONGARCH - CRC32 - All known CPUs has crc instructions
 # if defined(LOONGARCH_CRC)
@@ -19,8 +27,12 @@ uint32_t crc32_loongarch64(uint32_t crc, const uint8_t *buf, size_t len);
 #  undef native_crc32
 #  define native_crc32 crc32_loongarch64
 # endif
 # if defined(LOONGARCH_LSX) && defined(__loongarch_sx)
+#  undef native_slide_hash
+#  define native_slide_hash slide_hash_lsx
 # endif
 # if defined(LOONGARCH_LASX) && defined(__loongarch_asx)
+#  undef native_slide_hash
+#  define native_slide_hash slide_hash_lasx
 # endif
 #endif
diff --git a/arch/loongarch/slide_hash_lasx.c b/arch/loongarch/slide_hash_lasx.c
new file mode 100644
index 00000000..0779d9ab
--- /dev/null
+++ b/arch/loongarch/slide_hash_lasx.c
@@ -0,0 +1,41 @@
+/*
+ * LASX optimized hash slide, based on Intel AVX2 implementation
+ *
+ * Copyright (C) 2017 Intel Corporation
+ * Copyright (C) 2025 Vladislav Shchapov
+ * Authors:
+ *   Arjan van de Ven
+ *   Jim Kukunas
+ *   Mika T. Lindqvist
+ *
+ * For conditions of distribution and use, see copyright notice in zlib.h
+ */
+#include "zbuild.h"
+#include "deflate.h"
+
+#include <lasxintrin.h>
+
+static inline void slide_hash_chain(Pos *table, uint32_t entries, const __m256i wsize) {
+    table += entries;
+    table -= 16;
+
+    do {
+        __m256i value, result;
+
+        value = __lasx_xvld(table, 0);
+        result = __lasx_xvssub_hu(value, wsize);
+        __lasx_xvst(result, table, 0);
+
+        table -= 16;
+        entries -= 16;
+    } while (entries > 0);
+}
+
+Z_INTERNAL void slide_hash_lasx(deflate_state *s) {
+    Assert(s->w_size <= UINT16_MAX, "w_size should fit in uint16_t");
+    uint16_t wsize = (uint16_t)s->w_size;
+    const __m256i ymm_wsize = __lasx_xvreplgr2vr_h((short)wsize);
+
+    slide_hash_chain(s->head, HASH_SIZE, ymm_wsize);
+    slide_hash_chain(s->prev, wsize, ymm_wsize);
+}
diff --git a/arch/loongarch/slide_hash_lsx.c b/arch/loongarch/slide_hash_lsx.c
new file mode 100644
index 00000000..ad235c47
--- /dev/null
+++ b/arch/loongarch/slide_hash_lsx.c
@@ -0,0 +1,64 @@
+/*
+ * LSX optimized hash slide, based on Intel SSE implementation
+ *
+ * Copyright (C) 2017 Intel Corporation
+ * Copyright (C) 2025 Vladislav Shchapov
+ * Authors:
+ *   Arjan van de Ven
+ *   Jim Kukunas
+ *
+ * For conditions of distribution and use, see copyright notice in zlib.h
+ */
+#include "zbuild.h"
+#include "deflate.h"
+
+#include <lsxintrin.h>
+#include <assert.h>
+
+static inline void slide_hash_chain(Pos *table0, Pos *table1, uint32_t entries0,
+                                    uint32_t entries1, const __m128i wsize) {
+    uint32_t entries;
+    Pos *table;
+    __m128i value0, value1, result0, result1;
+
+    int on_chain = 0;
+
+next_chain:
+    table = (on_chain) ? table1 : table0;
+    entries = (on_chain) ? entries1 : entries0;
+
+    table += entries;
+    table -= 16;
+
+    /* ZALLOC allocates this pointer unless the user chose a custom allocator.
+     * Our alloc function is aligned to 64 byte boundaries */
+    do {
+        value0 = __lsx_vld(table, 0);
+        value1 = __lsx_vld(table, 16);
+        result0 = __lsx_vssub_hu(value0, wsize);
+        result1 = __lsx_vssub_hu(value1, wsize);
+        __lsx_vst(result0, table, 0);
+        __lsx_vst(result1, table, 16);
+
+        table -= 16;
+        entries -= 16;
+    } while (entries > 0);
+
+    ++on_chain;
+    if (on_chain > 1) {
+        return;
+    } else {
+        goto next_chain;
+    }
+}
+
+Z_INTERNAL void slide_hash_lsx(deflate_state *s) {
+    Assert(s->w_size <= UINT16_MAX, "w_size should fit in uint16_t");
+    uint16_t wsize = (uint16_t)s->w_size;
+    const __m128i xmm_wsize = __lsx_vreplgr2vr_h((short)wsize);
+
+    assert(((uintptr_t)s->head & 15) == 0);
+    assert(((uintptr_t)s->prev & 15) == 0);
+
+    slide_hash_chain(s->head, s->prev, HASH_SIZE, wsize, xmm_wsize);
+}
diff --git a/configure b/configure
index 48ee775e..f633de78 100755
--- a/configure
+++ b/configure
@@ -2316,6 +2316,8 @@ EOF
 
             CFLAGS="${CFLAGS} -DLOONGARCH_LSX"
             SFLAGS="${SFLAGS} -DLOONGARCH_LSX"
+            ARCH_STATIC_OBJS="${ARCH_STATIC_OBJS} slide_hash_lsx.o"
+            ARCH_SHARED_OBJS="${ARCH_SHARED_OBJS} slide_hash_lsx.lo"
         fi
 
         check_lasx_intrinsics
@@ -2323,6 +2325,8 @@ EOF
 
             CFLAGS="${CFLAGS} -DLOONGARCH_LASX"
            SFLAGS="${SFLAGS} -DLOONGARCH_LASX"
+            ARCH_STATIC_OBJS="${ARCH_STATIC_OBJS} slide_hash_lasx.o"
+            ARCH_SHARED_OBJS="${ARCH_SHARED_OBJS} slide_hash_lasx.lo"
         fi
     fi
     ;;
diff --git a/functable.c b/functable.c
index ac25c915..1c7c679b 100644
--- a/functable.c
+++ b/functable.c
@@ -275,6 +275,16 @@ static void init_functable(void) {
         ft.crc32 = crc32_loongarch64;
     }
 #endif
+#ifdef LOONGARCH_LSX
+    if (cf.loongarch.has_lsx) {
+        ft.slide_hash = slide_hash_lsx;
+    }
+#endif
+#ifdef LOONGARCH_LASX
+    if (cf.loongarch.has_lasx) {
+        ft.slide_hash = slide_hash_lasx;
+    }
+#endif
 
     // Assign function pointers individually for atomic operation
     FUNCTABLE_ASSIGN(ft, force_init);
diff --git a/test/benchmarks/benchmark_slidehash.cc b/test/benchmarks/benchmark_slidehash.cc
index 4e9b20ee..9d98420b 100644
--- a/test/benchmarks/benchmark_slidehash.cc
+++ b/test/benchmarks/benchmark_slidehash.cc
@@ -95,5 +95,11 @@ BENCHMARK_SLIDEHASH(sse2, slide_hash_sse2, test_cpu_features.x86.has_sse2);
 #ifdef X86_AVX2
 BENCHMARK_SLIDEHASH(avx2, slide_hash_avx2, test_cpu_features.x86.has_avx2);
 #endif
+#ifdef LOONGARCH_LSX
+BENCHMARK_SLIDEHASH(lsx, slide_hash_lsx, test_cpu_features.loongarch.has_lsx);
+#endif
+#ifdef LOONGARCH_LASX
+BENCHMARK_SLIDEHASH(lasx, slide_hash_lasx, test_cpu_features.loongarch.has_lasx);
+#endif
 
 #endif
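
For readers new to this code path: slide_hash runs when deflate's sliding window moves down by w_size bytes, and every entry in the head and prev hash tables must be rebased by the same amount, with entries that now point outside the window clamped to zero. That clamp is the whole trick: a single saturating unsigned halfword subtract (__lsx_vssub_hu / __lasx_xvssub_hu) does rebase-and-clamp for 8 or 16 entries at once. A minimal scalar sketch of the semantics the vector code must reproduce (slide_hash_scalar is a hypothetical name, and Pos is shown here as uint16_t, its width in deflate.h):

#include <stdint.h>

/* Scalar model of the hash slide: rebase each entry by wsize, letting
 * entries that would go negative saturate to zero (zero means "no match
 * in the window"). One vssub.hu instruction performs exactly this for
 * 8 (LSX) or 16 (LASX) entries at a time. */
static void slide_hash_scalar(uint16_t *table, uint32_t entries, uint16_t wsize) {
    for (uint32_t i = 0; i < entries; i++) {
        table[i] = (uint16_t)((table[i] >= wsize) ? (table[i] - wsize) : 0);
    }
}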
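
A hypothetical smoke test, not part of the commit, can exercise the new path through the public API: compressing noticeably more than one 32 KiB window forces deflate to slide its hash tables, so whichever function the functable selected (slide_hash_lsx or slide_hash_lasx on these CPUs) actually runs. This sketch assumes zlib-ng's prefixed native API (zlib-ng.h, zng_stream, zng_deflateInit, zng_deflate, zng_deflateEnd):

#include <stdint.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include "zlib-ng.h"

int main(void) {
    /* Several windows' worth of mildly repetitive input. */
    size_t in_len = 256 * 1024;
    uint8_t *in = (uint8_t *)malloc(in_len);
    uint8_t out[4096];
    if (in == NULL)
        return 1;
    for (size_t i = 0; i < in_len; i++)
        in[i] = (uint8_t)(i % 251);

    zng_stream strm;
    memset(&strm, 0, sizeof(strm));
    if (zng_deflateInit(&strm, 6) != Z_OK)
        return 1;

    strm.next_in = in;
    strm.avail_in = (uint32_t)in_len;

    int ret;
    do {    /* drain output until the stream ends */
        strm.next_out = out;
        strm.avail_out = sizeof(out);
        ret = zng_deflate(&strm, Z_FINISH);
    } while (ret == Z_OK);

    zng_deflateEnd(&strm);
    free(in);
    printf("deflate finished with %d\n", ret);
    return (ret == Z_STREAM_END) ? 0 : 1;
}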
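
One deliberate detail in the functable wiring above: init_functable tests has_lsx before has_lasx, so when a CPU reports both features the LASX pointer is assigned last and wins. The DISABLE_RUNTIME_CPU_DETECTION path in loongarch_functions.h encodes the same precedence by letting the LASX block re-#undef and re-#define native_slide_hash after the LSX block.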