From: Vladislav Shchapov
Date: Thu, 12 Jun 2025 10:25:23 +0000 (+0500)
Subject: Add LoongArch64 compare256, longest_match, longest_match_slow implementation
X-Git-Url: http://git.ipfire.org/gitweb/gitweb.cgi?a=commitdiff_plain;h=ec3f953eaed47166d80168769b4fefbdcf13f8ed;p=thirdparty%2Fzlib-ng.git

Add LoongArch64 compare256, longest_match, longest_match_slow implementation

Signed-off-by: Vladislav Shchapov
---

diff --git a/CMakeLists.txt b/CMakeLists.txt
index ca11c608..44ad4adf 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -1037,7 +1037,7 @@ if(WITH_OPTIM)
         check_lsx_intrinsics()
         if(HAVE_LSX_INTRIN)
             add_definitions(-DLOONGARCH_LSX)
-            set(LSX_SRCS ${ARCHDIR}/slide_hash_lsx.c)
+            set(LSX_SRCS ${ARCHDIR}/compare256_lsx.c ${ARCHDIR}/slide_hash_lsx.c)
             list(APPEND ZLIB_ARCH_SRCS ${LSX_SRCS})
             set_property(SOURCE ${LSX_SRCS} PROPERTY COMPILE_FLAGS "${LSXFLAG} ${NOLTOFLAG}")
         else()
@@ -1049,7 +1049,7 @@ if(WITH_OPTIM)
         check_lasx_intrinsics()
         if(HAVE_LASX_INTRIN AND HAVE_LSX_INTRIN)
             add_definitions(-DLOONGARCH_LASX)
-            set(LASX_SRCS ${ARCHDIR}/slide_hash_lasx.c)
+            set(LASX_SRCS ${ARCHDIR}/compare256_lasx.c ${ARCHDIR}/slide_hash_lasx.c)
             list(APPEND ZLIB_ARCH_SRCS ${LASX_SRCS})
             set_property(SOURCE ${LASX_SRCS} PROPERTY COMPILE_FLAGS "${LASXFLAG} ${NOLTOFLAG}")
         else()
diff --git a/arch/loongarch/Makefile.in b/arch/loongarch/Makefile.in
index 9002c062..c62851b6 100644
--- a/arch/loongarch/Makefile.in
+++ b/arch/loongarch/Makefile.in
@@ -20,6 +20,8 @@ TOPDIR=$(SRCTOP)
 all: \
 	loongarch_features.o loongarch_features.lo \
 	crc32_la.o crc32_la.lo \
+	compare256_lasx.o compare256_lasx.lo \
+	compare256_lsx.o compare256_lsx.lo \
 	slide_hash_lasx.o slide_hash_lasx.lo \
 	slide_hash_lsx.o slide_hash_lsx.lo
@@ -35,6 +37,18 @@ crc32_la.o: $(SRCDIR)/crc32_la.c
 crc32_la.lo: $(SRCDIR)/crc32_la.c
 	$(CC) $(SFLAGS) -DPIC $(INCLUDES) -c -o $@ $(SRCDIR)/crc32_la.c
 
+compare256_lasx.o:
+	$(CC) $(CFLAGS) $(LASXFLAG) $(NOLTOFLAG) $(INCLUDES) -c -o $@ $(SRCDIR)/compare256_lasx.c
+
+compare256_lasx.lo:
+	$(CC) $(SFLAGS) $(LASXFLAG) $(NOLTOFLAG) -DPIC $(INCLUDES) -c -o $@ $(SRCDIR)/compare256_lasx.c
+
+compare256_lsx.o:
+	$(CC) $(CFLAGS) $(LSXFLAG) $(NOLTOFLAG) $(INCLUDES) -c -o $@ $(SRCDIR)/compare256_lsx.c
+
+compare256_lsx.lo:
+	$(CC) $(SFLAGS) $(LSXFLAG) $(NOLTOFLAG) -DPIC $(INCLUDES) -c -o $@ $(SRCDIR)/compare256_lsx.c
+
 slide_hash_lasx.o:
 	$(CC) $(CFLAGS) $(LASXFLAG) $(NOLTOFLAG) $(INCLUDES) -c -o $@ $(SRCDIR)/slide_hash_lasx.c
diff --git a/arch/loongarch/compare256_lasx.c b/arch/loongarch/compare256_lasx.c
new file mode 100644
index 00000000..7cc05d99
--- /dev/null
+++ b/arch/loongarch/compare256_lasx.c
@@ -0,0 +1,63 @@
+/* compare256_lasx.c -- LASX version of compare256, based on Intel AVX2 implementation
+ * Copyright Mika T. Lindqvist
+ * Copyright (C) 2025 Vladislav Shchapov
+ * For conditions of distribution and use, see copyright notice in zlib.h
+ */
+
+#include "zbuild.h"
+#include "zmemory.h"
+#include "deflate.h"
+#include "fallback_builtins.h"
+
+#if defined(LOONGARCH_LASX) && defined(HAVE_BUILTIN_CTZ)
+
+#include <lasxintrin.h>
+#include "lasxintrin_ext.h"
+
+static inline uint32_t compare256_lasx_static(const uint8_t *src0, const uint8_t *src1) {
+    uint32_t len = 0;
+
+    do {
+        __m256i ymm_src0, ymm_src1, ymm_cmp;
+        ymm_src0 = __lasx_xvld(src0, 0);
+        ymm_src1 = __lasx_xvld(src1, 0);
+        ymm_cmp = __lasx_xvseq_b(ymm_src0, ymm_src1); /* non-identical bytes = 00, identical bytes = FF */
+        unsigned mask = (unsigned)lasx_movemask_b(ymm_cmp);
+        if (mask != 0xFFFFFFFF) {
+            uint32_t match_byte = (uint32_t)__builtin_ctz(~mask); /* Invert bits so identical = 0 */
+            return len + match_byte;
+        }
+
+        src0 += 32, src1 += 32, len += 32;
+
+        ymm_src0 = __lasx_xvld(src0, 0);
+        ymm_src1 = __lasx_xvld(src1, 0);
+        ymm_cmp = __lasx_xvseq_b(ymm_src0, ymm_src1);
+        mask = (unsigned)lasx_movemask_b(ymm_cmp);
+        if (mask != 0xFFFFFFFF) {
+            uint32_t match_byte = (uint32_t)__builtin_ctz(~mask);
+            return len + match_byte;
+        }
+
+        src0 += 32, src1 += 32, len += 32;
+    } while (len < 256);
+
+    return 256;
+}
+
+Z_INTERNAL uint32_t compare256_lasx(const uint8_t *src0, const uint8_t *src1) {
+    return compare256_lasx_static(src0, src1);
+}
+
+#define LONGEST_MATCH longest_match_lasx
+#define COMPARE256 compare256_lasx_static
+
+#include "match_tpl.h"
+
+#define LONGEST_MATCH_SLOW
+#define LONGEST_MATCH longest_match_slow_lasx
+#define COMPARE256 compare256_lasx_static
+
+#include "match_tpl.h"
+
+#endif
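For context: compare256() takes two pointers into buffers the caller is expected to keep readable for at least 256 bytes and returns the length of their common prefix (0..256). The LASX loop above checks 32 bytes per vector load: __lasx_xvseq_b() yields 0xFF for every matching byte position, lasx_movemask_b() packs those into a 32-bit mask, and __builtin_ctz(~mask) locates the first mismatch. A minimal scalar sketch of the same contract, for illustration only (it is not part of this commit; zlib-ng's real generic fallback lives elsewhere):

    #include <stdint.h>

    /* Returns the length of the common prefix of two 256-byte buffers. */
    static uint32_t compare256_scalar_sketch(const uint8_t *src0, const uint8_t *src1) {
        uint32_t len = 0;
        while (len < 256 && src0[len] == src1[len])
            len++;
        return len;
    }

The vector version returns exactly the same values; it simply resolves 32 byte comparisons per instruction instead of one.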
diff --git a/arch/loongarch/compare256_lsx.c b/arch/loongarch/compare256_lsx.c
new file mode 100644
index 00000000..72b40cdd
--- /dev/null
+++ b/arch/loongarch/compare256_lsx.c
@@ -0,0 +1,99 @@
+/* compare256_lsx.c -- LSX version of compare256, based on Intel SSE implementation
+ * Copyright Adam Stylinski
+ * Copyright (C) 2025 Vladislav Shchapov
+ * For conditions of distribution and use, see copyright notice in zlib.h
+ */
+
+#include "zbuild.h"
+#include "zmemory.h"
+#include "deflate.h"
+#include "fallback_builtins.h"
+
+#if defined(LOONGARCH_LSX) && defined(HAVE_BUILTIN_CTZ)
+
+#include <lsxintrin.h>
+#include "lsxintrin_ext.h"
+
+static inline uint32_t compare256_lsx_static(const uint8_t *src0, const uint8_t *src1) {
+    uint32_t len = 0;
+    int align_offset = ((uintptr_t)src0) & 15;
+    const uint8_t *end0 = src0 + 256;
+    const uint8_t *end1 = src1 + 256;
+    __m128i xmm_src0, xmm_src1, xmm_cmp;
+
+    /* Do the first load unaligned, then for all subsequent ones we have at least
+     * one aligned load. Sadly, aligning both loads is probably unrealistic. */
+    xmm_src0 = __lsx_vld(src0, 0);
+    xmm_src1 = __lsx_vld(src1, 0);
+    xmm_cmp = __lsx_vseq_b(xmm_src0, xmm_src1);
+
+    unsigned mask = (unsigned)lsx_movemask_b(xmm_cmp);
+
+    /* Compiler _may_ turn this branch into a ptest + movemask,
+     * since a lot of those uops are shared and fused */
+    if (mask != 0xFFFF) {
+        uint32_t match_byte = (uint32_t)__builtin_ctz(~mask);
+        return len + match_byte;
+    }
+
+    int align_adv = 16 - align_offset;
+    len += align_adv;
+    src0 += align_adv;
+    src1 += align_adv;
+
+    /* Do a flooring division (should just be a shift right) */
+    int num_iter = (256 - len) / 16;
+
+    for (int i = 0; i < num_iter; ++i) {
+        xmm_src0 = __lsx_vld(src0, 0);
+        xmm_src1 = __lsx_vld(src1, 0);
+        xmm_cmp = __lsx_vseq_b(xmm_src0, xmm_src1);
+
+        mask = (unsigned)lsx_movemask_b(xmm_cmp);
+
+        /* Compiler _may_ turn this branch into a ptest + movemask,
+         * since a lot of those uops are shared and fused */
+        if (mask != 0xFFFF) {
+            uint32_t match_byte = (uint32_t)__builtin_ctz(~mask);
+            return len + match_byte;
+        }
+
+        len += 16, src0 += 16, src1 += 16;
+    }
+
+    if (align_offset) {
+        src0 = end0 - 16;
+        src1 = end1 - 16;
+        len = 256 - 16;
+
+        xmm_src0 = __lsx_vld(src0, 0);
+        xmm_src1 = __lsx_vld(src1, 0);
+        xmm_cmp = __lsx_vseq_b(xmm_src0, xmm_src1);
+
+        mask = (unsigned)lsx_movemask_b(xmm_cmp);
+
+        if (mask != 0xFFFF) {
+            uint32_t match_byte = (uint32_t)__builtin_ctz(~mask);
+            return len + match_byte;
+        }
+    }
+
+    return 256;
+}
+
+Z_INTERNAL uint32_t compare256_lsx(const uint8_t *src0, const uint8_t *src1) {
+    return compare256_lsx_static(src0, src1);
+}
+
+#define LONGEST_MATCH longest_match_lsx
+#define COMPARE256 compare256_lsx_static
+
+#include "match_tpl.h"
+
+#define LONGEST_MATCH_SLOW
+#define LONGEST_MATCH longest_match_slow_lsx
+#define COMPARE256 compare256_lsx_static
+
+#include "match_tpl.h"
+
+#endif
diff --git a/arch/loongarch/lasxintrin_ext.h b/arch/loongarch/lasxintrin_ext.h
new file mode 100644
index 00000000..920c143a
--- /dev/null
+++ b/arch/loongarch/lasxintrin_ext.h
@@ -0,0 +1,16 @@
+/* lasxintrin_ext.h
+ * Copyright (C) 2025 Vladislav Shchapov
+ * For conditions of distribution and use, see copyright notice in zlib.h
+ */
+#ifndef LASXINTRIN_EXT_H
+#define LASXINTRIN_EXT_H
+
+#include <lasxintrin.h>
+
+
+static inline int lasx_movemask_b(__m256i v) {
+    v = __lasx_xvmskltz_b(v);
+    return __lasx_xvpickve2gr_w(v, 0) | (__lasx_xvpickve2gr_w(v, 4) << 16);
+}
+
+#endif // include guard LASXINTRIN_EXT_H
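LoongArch has no single-instruction equivalent of x86's _mm256_movemask_epi8, so lasxintrin_ext.h assembles one: __lasx_xvmskltz_b() collects the sign bit of each byte, per 128-bit lane, into that lane's low word, and the two __lasx_xvpickve2gr_w() reads (word 0 of the low lane, word 4 = word 0 of the high lane) stitch the halves into one 32-bit mask. The LSX counterpart below needs only a single extraction, since an __m128i has just one lane. A scalar model of the result, for illustration only:

    #include <stdint.h>

    /* Bit i of the result is the most-significant bit of byte i,
     * matching the semantics of x86 _mm256_movemask_epi8. */
    static uint32_t movemask_b_model(const uint8_t bytes[32]) {
        uint32_t mask = 0;
        for (int i = 0; i < 32; i++)
            mask |= (uint32_t)(bytes[i] >> 7) << i;
        return mask;
    }

Because __lasx_xvseq_b() sets matching bytes to 0xFF (sign bit set), a full 32-byte match produces mask == 0xFFFFFFFF, which is exactly what the compare256 loops test for.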
diff --git a/arch/loongarch/loongarch_functions.h b/arch/loongarch/loongarch_functions.h
index e73c8e66..afdf87e7 100644
--- a/arch/loongarch/loongarch_functions.h
+++ b/arch/loongarch/loongarch_functions.h
@@ -16,10 +16,20 @@ void crc32_fold_loongarch64(crc32_fold *crc, const uint8_t *src, size_t len,
 
 #ifdef LOONGARCH_LSX
 void slide_hash_lsx(deflate_state *s);
+#  ifdef HAVE_BUILTIN_CTZ
+    uint32_t compare256_lsx(const uint8_t *src0, const uint8_t *src1);
+    uint32_t longest_match_lsx(deflate_state *const s, Pos cur_match);
+    uint32_t longest_match_slow_lsx(deflate_state *const s, Pos cur_match);
+#  endif
 #endif
 
 #ifdef LOONGARCH_LASX
 void slide_hash_lasx(deflate_state *s);
+#  ifdef HAVE_BUILTIN_CTZ
+    uint32_t compare256_lasx(const uint8_t *src0, const uint8_t *src1);
+    uint32_t longest_match_lasx(deflate_state *const s, Pos cur_match);
+    uint32_t longest_match_slow_lasx(deflate_state *const s, Pos cur_match);
+#  endif
 #endif
 
 #ifdef DISABLE_RUNTIME_CPU_DETECTION
@@ -35,10 +45,26 @@ void slide_hash_lasx(deflate_state *s);
 #  if defined(LOONGARCH_LSX) && defined(__loongarch_sx)
 #    undef native_slide_hash
 #    define native_slide_hash slide_hash_lsx
+#    ifdef HAVE_BUILTIN_CTZ
+#      undef native_compare256
+#      define native_compare256 compare256_lsx
+#      undef native_longest_match
+#      define native_longest_match longest_match_lsx
+#      undef native_longest_match_slow
+#      define native_longest_match_slow longest_match_slow_lsx
+#    endif
 #  endif
 #  if defined(LOONGARCH_LASX) && defined(__loongarch_asx)
 #    undef native_slide_hash
 #    define native_slide_hash slide_hash_lasx
+#    ifdef HAVE_BUILTIN_CTZ
+#      undef native_compare256
+#      define native_compare256 compare256_lasx
+#      undef native_longest_match
+#      define native_longest_match longest_match_lasx
+#      undef native_longest_match_slow
+#      define native_longest_match_slow longest_match_slow_lasx
+#    endif
 #  endif
 #endif
diff --git a/arch/loongarch/lsxintrin_ext.h b/arch/loongarch/lsxintrin_ext.h
new file mode 100644
index 00000000..d2766fdf
--- /dev/null
+++ b/arch/loongarch/lsxintrin_ext.h
@@ -0,0 +1,15 @@
+/* lsxintrin_ext.h
+ * Copyright (C) 2025 Vladislav Shchapov
+ * For conditions of distribution and use, see copyright notice in zlib.h
+ */
+#ifndef LSXINTRIN_EXT_H
+#define LSXINTRIN_EXT_H
+
+#include <lsxintrin.h>
+
+
+static inline int lsx_movemask_b(__m128i v) {
+    return __lsx_vpickve2gr_w(__lsx_vmskltz_b(v), 0);
+}
+
+#endif // include guard LSXINTRIN_EXT_H
diff --git a/configure b/configure
index f633de78..107d864e 100755
--- a/configure
+++ b/configure
@@ -2316,8 +2316,8 @@ EOF
             CFLAGS="${CFLAGS} -DLOONGARCH_LSX"
             SFLAGS="${SFLAGS} -DLOONGARCH_LSX"
 
-            ARCH_STATIC_OBJS="${ARCH_STATIC_OBJS} slide_hash_lsx.o"
-            ARCH_SHARED_OBJS="${ARCH_SHARED_OBJS} slide_hash_lsx.lo"
+            ARCH_STATIC_OBJS="${ARCH_STATIC_OBJS} compare256_lsx.o slide_hash_lsx.o"
+            ARCH_SHARED_OBJS="${ARCH_SHARED_OBJS} compare256_lsx.lo slide_hash_lsx.lo"
         fi
 
         check_lasx_intrinsics
@@ -2325,8 +2325,8 @@ EOF
             CFLAGS="${CFLAGS} -DLOONGARCH_LASX"
             SFLAGS="${SFLAGS} -DLOONGARCH_LASX"
 
-            ARCH_STATIC_OBJS="${ARCH_STATIC_OBJS} slide_hash_lasx.o"
-            ARCH_SHARED_OBJS="${ARCH_SHARED_OBJS} slide_hash_lasx.lo"
+            ARCH_STATIC_OBJS="${ARCH_STATIC_OBJS} compare256_lasx.o slide_hash_lasx.o"
+            ARCH_SHARED_OBJS="${ARCH_SHARED_OBJS} compare256_lasx.lo slide_hash_lasx.lo"
         fi
     fi
     ;;
diff --git a/functable.c b/functable.c
index 1903310e..abb82cab 100644
--- a/functable.c
+++ b/functable.c
@@ -280,11 +280,21 @@ static void init_functable(void) {
 #ifdef LOONGARCH_LSX
     if (cf.loongarch.has_lsx) {
         ft.slide_hash = slide_hash_lsx;
+#  ifdef HAVE_BUILTIN_CTZ
+        ft.compare256 = &compare256_lsx;
+        ft.longest_match = &longest_match_lsx;
+        ft.longest_match_slow = &longest_match_slow_lsx;
+#  endif
     }
 #endif
 #ifdef LOONGARCH_LASX
     if (cf.loongarch.has_lasx) {
         ft.slide_hash = slide_hash_lasx;
+#  ifdef HAVE_BUILTIN_CTZ
+        ft.compare256 = &compare256_lasx;
+        ft.longest_match = &longest_match_lasx;
+        ft.longest_match_slow = &longest_match_slow_lasx;
+#  endif
     }
 #endif
 
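The functable change follows zlib-ng's usual runtime-dispatch pattern: init_functable() starts from generic implementations and overwrites the function pointers once per detected feature, in ascending order of capability, so on a CPU that reports both LSX and LASX the LASX assignments run last and win. A reduced sketch of that shape, for illustration only (select_compare256() is a hypothetical helper; the two prototypes match the declarations added to loongarch_functions.h):

    #include <stddef.h>
    #include <stdint.h>

    uint32_t compare256_lsx(const uint8_t *src0, const uint8_t *src1);
    uint32_t compare256_lasx(const uint8_t *src0, const uint8_t *src1);

    typedef uint32_t (*compare256_fn)(const uint8_t *src0, const uint8_t *src1);

    static compare256_fn select_compare256(int has_lsx, int has_lasx) {
        compare256_fn fn = NULL;      /* zlib-ng would start from its generic C version */
        if (has_lsx)
            fn = compare256_lsx;      /* 16 bytes per vector load */
        if (has_lasx)
            fn = compare256_lasx;     /* 32 bytes per load; assigned last, so it wins */
        return fn;
    }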
diff --git a/test/benchmarks/benchmark_compare256.cc b/test/benchmarks/benchmark_compare256.cc
index 8ed2d0eb..689aa6e9 100644
--- a/test/benchmarks/benchmark_compare256.cc
+++ b/test/benchmarks/benchmark_compare256.cc
@@ -92,5 +92,11 @@ BENCHMARK_COMPARE256(power9, compare256_power9, test_cpu_features.power.has_arch_3_00);
 #ifdef RISCV_RVV
 BENCHMARK_COMPARE256(rvv, compare256_rvv, test_cpu_features.riscv.has_rvv);
 #endif
+#if defined(LOONGARCH_LSX) && defined(HAVE_BUILTIN_CTZ)
+BENCHMARK_COMPARE256(lsx, compare256_lsx, test_cpu_features.loongarch.has_lsx);
+#endif
+#if defined(LOONGARCH_LASX) && defined(HAVE_BUILTIN_CTZ)
+BENCHMARK_COMPARE256(lasx, compare256_lasx, test_cpu_features.loongarch.has_lasx);
+#endif
 
 #endif
diff --git a/test/test_compare256.cc b/test/test_compare256.cc
index f367cd0f..1b52082e 100644
--- a/test/test_compare256.cc
+++ b/test/test_compare256.cc
@@ -91,5 +91,11 @@ TEST_COMPARE256(power9, compare256_power9, test_cpu_features.power.has_arch_3_00)
 #ifdef RISCV_RVV
 TEST_COMPARE256(rvv, compare256_rvv, test_cpu_features.riscv.has_rvv)
 #endif
+#if defined(LOONGARCH_LSX) && defined(HAVE_BUILTIN_CTZ)
+TEST_COMPARE256(lsx, compare256_lsx, test_cpu_features.loongarch.has_lsx)
+#endif
+#if defined(LOONGARCH_LASX) && defined(HAVE_BUILTIN_CTZ)
+TEST_COMPARE256(lasx, compare256_lasx, test_cpu_features.loongarch.has_lasx)
+#endif
 
 #endif
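Each TEST_COMPARE256/BENCHMARK_COMPARE256 instantiation is gated on both the compile-time macro and the runtime feature bit, so the binaries remain runnable on LoongArch hardware without LSX or LASX. For illustration only, a hypothetical standalone check of the core property such a compare256 test exercises: a single mismatch planted at offset i must make the function return exactly i:

    #include <assert.h>
    #include <stdint.h>
    #include <string.h>

    typedef uint32_t (*compare256_fn)(const uint8_t *src0, const uint8_t *src1);

    static void check_compare256(compare256_fn fn) {
        uint8_t a[256], b[256];
        memset(a, 0x55, sizeof(a));
        memcpy(b, a, sizeof(b));
        assert(fn(a, b) == 256);      /* identical buffers: full 256-byte match */

        for (uint32_t i = 0; i < 256; i++) {
            memcpy(b, a, sizeof(b));
            b[i] ^= 0xFF;             /* single mismatch at offset i */
            assert(fn(a, b) == i);
        }
    }

Running such a check over compare256_lsx and compare256_lasx on matching hardware exercises all three code paths above: the unaligned first load, the aligned main loop, and the overlapping tail load.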