From: Vladislav Shchapov Date: Thu, 12 Jun 2025 15:34:26 +0000 (+0500) Subject: Add LoongArch64 (LSX) chunkmemset family of functions implementation X-Git-Url: http://git.ipfire.org/cgi-bin/gitweb.cgi?a=commitdiff_plain;h=3abd56e27f1d2c0f9edcdf3177e9e519846dec07;p=thirdparty%2Fzlib-ng.git Add LoongArch64 (LSX) chunkmemset family of functions implementation Signed-off-by: Vladislav Shchapov --- diff --git a/CMakeLists.txt b/CMakeLists.txt index 44ad4adf..021bc5a0 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -1037,7 +1037,7 @@ if(WITH_OPTIM) check_lsx_intrinsics() if(HAVE_LSX_INTRIN) add_definitions(-DLOONGARCH_LSX) - set(LSX_SRCS ${ARCHDIR}/compare256_lsx.c ${ARCHDIR}/slide_hash_lsx.c) + set(LSX_SRCS ${ARCHDIR}/chunkset_lsx.c ${ARCHDIR}/compare256_lsx.c ${ARCHDIR}/slide_hash_lsx.c) list(APPEND ZLIB_ARCH_SRCS ${LSX_SRCS}) set_property(SOURCE ${LSX_SRCS} PROPERTY COMPILE_FLAGS "${LSXFLAG} ${NOLTOFLAG}") else() diff --git a/arch/arm/chunkset_neon.c b/arch/arm/chunkset_neon.c index 68c9fef6..e99dcf0e 100644 --- a/arch/arm/chunkset_neon.c +++ b/arch/arm/chunkset_neon.c @@ -6,7 +6,7 @@ #include "neon_intrins.h" #include "zbuild.h" #include "zmemory.h" -#include "arch/generic/chunk_permute_table.h" +#include "arch/generic/chunk_128bit_perm_idx_lut.h" typedef uint8x16_t chunk_t; @@ -15,21 +15,6 @@ typedef uint8x16_t chunk_t; #define HAVE_CHUNKMEMSET_8 #define HAVE_CHUNK_MAG -static const lut_rem_pair perm_idx_lut[13] = { - {0, 1}, /* 3 */ - {0, 0}, /* don't care */ - {1 * 32, 1}, /* 5 */ - {2 * 32, 4}, /* 6 */ - {3 * 32, 2}, /* 7 */ - {0 * 32, 0}, /* don't care */ - {4 * 32, 7}, /* 9 */ - {5 * 32, 6}, /* 10 */ - {6 * 32, 5}, /* 11 */ - {7 * 32, 4}, /* 12 */ - {8 * 32, 3}, /* 13 */ - {9 * 32, 2}, /* 14 */ - {10 * 32, 1},/* 15 */ -}; static inline void chunkmemset_2(uint8_t *from, chunk_t *chunk) { *chunk = vreinterpretq_u8_u16(vdupq_n_u16(zng_memread_2(from))); diff --git a/arch/generic/chunk_128bit_perm_idx_lut.h b/arch/generic/chunk_128bit_perm_idx_lut.h new file 
mode 100644 index 00000000..6e5098bf --- /dev/null +++ b/arch/generic/chunk_128bit_perm_idx_lut.h @@ -0,0 +1,26 @@ +/* chunk_128bit_perm_idx_lut.h - shared SSSE3/NEON/LSX permutation idx lut for use with chunkmemset family of functions. + * For conditions of distribution and use, see copyright notice in zlib.h + */ + +#ifndef CHUNK_128BIT_PERM_IDX_LUT_H_ +#define CHUNK_128BIT_PERM_IDX_LUT_H_ + +#include "chunk_permute_table.h" + +static const lut_rem_pair perm_idx_lut[13] = { + {0, 1}, /* 3 */ + {0, 0}, /* don't care */ + {1 * 32, 1}, /* 5 */ + {2 * 32, 4}, /* 6 */ + {3 * 32, 2}, /* 7 */ + {0 * 32, 0}, /* don't care */ + {4 * 32, 7}, /* 9 */ + {5 * 32, 6}, /* 10 */ + {6 * 32, 5}, /* 11 */ + {7 * 32, 4}, /* 12 */ + {8 * 32, 3}, /* 13 */ + {9 * 32, 2}, /* 14 */ + {10 * 32, 1},/* 15 */ +}; + +#endif diff --git a/arch/loongarch/Makefile.in b/arch/loongarch/Makefile.in index c62851b6..7c2d0866 100644 --- a/arch/loongarch/Makefile.in +++ b/arch/loongarch/Makefile.in @@ -20,6 +20,7 @@ TOPDIR=$(SRCTOP) all: \ loongarch_features.o loongarch_features.lo \ crc32_la.o crc32_la.lo \ + chunkset_lsx.o chunkset_lsx.lo \ compare256_lasx.o compare256_lasx.lo \ compare256_lsx.o compare256_lsx.lo \ slide_hash_lasx.o slide_hash_lasx.lo \ @@ -37,6 +38,12 @@ crc32_la.o: $(SRCDIR)/crc32_la.c crc32_la.lo: $(SRCDIR)/crc32_la.c $(CC) $(SFLAGS) -DPIC $(INCLUDES) -c -o $@ $(SRCDIR)/crc32_la.c +chunkset_lsx.o: + $(CC) $(CFLAGS) $(LSXFLAG) $(NOLTOFLAG) $(INCLUDES) -c -o $@ $(SRCDIR)/chunkset_lsx.c + +chunkset_lsx.lo: + $(CC) $(SFLAGS) $(LSXFLAG) $(NOLTOFLAG) -DPIC $(INCLUDES) -c -o $@ $(SRCDIR)/chunkset_lsx.c + compare256_lasx.o: $(CC) $(CFLAGS) $(LASXFLAG) $(NOLTOFLAG) $(INCLUDES) -c -o $@ $(SRCDIR)/compare256_lasx.c diff --git a/arch/loongarch/chunkset_lsx.c b/arch/loongarch/chunkset_lsx.c new file mode 100644 index 00000000..0253de2c --- /dev/null +++ b/arch/loongarch/chunkset_lsx.c @@ -0,0 +1,72 @@ +/* chunkset_lsx.c -- LSX inline functions to copy small data chunks, based on Intel SSSE3 
implementation + * Copyright (C) 2025 Vladislav Shchapov + * For conditions of distribution and use, see copyright notice in zlib.h + */ + +#include "zbuild.h" +#include "zmemory.h" + +#if defined(LOONGARCH_LSX) +#include <lsxintrin.h> +#include "lsxintrin_ext.h" +#include "arch/generic/chunk_128bit_perm_idx_lut.h" + +typedef __m128i chunk_t; + +#define HAVE_CHUNKMEMSET_2 +#define HAVE_CHUNKMEMSET_4 +#define HAVE_CHUNKMEMSET_8 +#define HAVE_CHUNK_MAG + + +static inline void chunkmemset_2(uint8_t *from, chunk_t *chunk) { + *chunk = __lsx_vreplgr2vr_h(zng_memread_2(from)); +} + +static inline void chunkmemset_4(uint8_t *from, chunk_t *chunk) { + *chunk = __lsx_vreplgr2vr_w(zng_memread_4(from)); +} + +static inline void chunkmemset_8(uint8_t *from, chunk_t *chunk) { + *chunk = __lsx_vreplgr2vr_d(zng_memread_8(from)); +} + +static inline void loadchunk(uint8_t const *s, chunk_t *chunk) { + *chunk = __lsx_vld(s, 0); +} + +static inline void storechunk(uint8_t *out, chunk_t *chunk) { + __lsx_vst(*chunk, out, 0); +} + +static inline chunk_t GET_CHUNK_MAG(uint8_t *buf, uint32_t *chunk_rem, uint32_t dist) { + lut_rem_pair lut_rem = perm_idx_lut[dist - 3]; + __m128i perm_vec, ret_vec; + /* Important to note: + * This is _not_ to subvert the memory sanitizer but to instead unpoison some + * bytes we willingly and purposefully load uninitialized that we swizzle over + * in a vector register, anyway.
If what we assume is wrong about what is used, + * the memory sanitizer will still usefully flag it */ + __msan_unpoison(buf + dist, 16 - dist); + ret_vec = __lsx_vld(buf, 0); + *chunk_rem = lut_rem.remval; + + perm_vec = __lsx_vld(permute_table + lut_rem.idx, 0); + ret_vec = lsx_shuffle_b(ret_vec, perm_vec); + + return ret_vec; +} + +#define CHUNKSIZE chunksize_lsx +#define CHUNKMEMSET chunkmemset_lsx +#define CHUNKMEMSET_SAFE chunkmemset_safe_lsx +#define CHUNKCOPY chunkcopy_lsx +#define CHUNKUNROLL chunkunroll_lsx + +#include "chunkset_tpl.h" + +#define INFLATE_FAST inflate_fast_lsx + +#include "inffast_tpl.h" + +#endif diff --git a/arch/loongarch/loongarch_functions.h b/arch/loongarch/loongarch_functions.h index afdf87e7..c3a3db44 100644 --- a/arch/loongarch/loongarch_functions.h +++ b/arch/loongarch/loongarch_functions.h @@ -21,6 +21,9 @@ void slide_hash_lsx(deflate_state *s); uint32_t longest_match_lsx(deflate_state *const s, Pos cur_match); uint32_t longest_match_slow_lsx(deflate_state *const s, Pos cur_match); # endif +uint32_t chunksize_lsx(void); +uint8_t* chunkmemset_safe_lsx(uint8_t *out, uint8_t *from, unsigned len, unsigned left); +void inflate_fast_lsx(PREFIX3(stream) *strm, uint32_t start); #endif #ifdef LOONGARCH_LASX @@ -45,6 +48,12 @@ void slide_hash_lasx(deflate_state *s); # if defined(LOONGARCH_LSX) && defined(__loongarch_sx) # undef native_slide_hash # define native_slide_hash slide_hash_lsx +# undef native_chunksize +# define native_chunksize chunksize_lsx +# undef native_chunkmemset_safe +# define native_chunkmemset_safe chunkmemset_safe_lsx +# undef native_inflate_fast +# define native_inflate_fast inflate_fast_lsx # ifdef HAVE_BUILTIN_CTZ # undef native_compare256 # define native_compare256 compare256_lsx diff --git a/arch/loongarch/lsxintrin_ext.h b/arch/loongarch/lsxintrin_ext.h index d2766fdf..c89105e0 100644 --- a/arch/loongarch/lsxintrin_ext.h +++ b/arch/loongarch/lsxintrin_ext.h @@ -12,4 +12,15 @@ static inline int 
lsx_movemask_b(__m128i v) { return __lsx_vpickve2gr_w(__lsx_vmskltz_b(v), 0); } +static inline __m128i lsx_shuffle_b(__m128i a, __m128i b) { + /* most significant bit is set - negative 8-bit integer */ + __m128i msb_mask = __lsx_vslti_b(b, 0); + + /* shuffle, clear msb in indices vector b */ + __m128i dst = __lsx_vshuf_b(a, a, __lsx_vandi_b(b, 0xF)); + + /* invert and apply mask - clear dst-element if b-msb is set */ + return __lsx_vand_v(dst, __lsx_vnor_v(msb_mask, msb_mask)); +} + #endif // include guard LSXINTRIN_EXT_H diff --git a/arch/x86/chunkset_ssse3.c b/arch/x86/chunkset_ssse3.c index 75b698c6..7778e525 100644 --- a/arch/x86/chunkset_ssse3.c +++ b/arch/x86/chunkset_ssse3.c @@ -7,7 +7,7 @@ #if defined(X86_SSSE3) #include <immintrin.h> -#include "../generic/chunk_permute_table.h" +#include "arch/generic/chunk_128bit_perm_idx_lut.h" typedef __m128i chunk_t; @@ -16,22 +16,6 @@ #define HAVE_CHUNKMEMSET_8 #define HAVE_CHUNK_MAG -static const lut_rem_pair perm_idx_lut[13] = { - {0, 1}, /* 3 */ - {0, 0}, /* don't care */ - {1 * 32, 1}, /* 5 */ - {2 * 32, 4}, /* 6 */ - {3 * 32, 2}, /* 7 */ - {0 * 32, 0}, /* don't care */ - {4 * 32, 7}, /* 9 */ - {5 * 32, 6}, /* 10 */ - {6 * 32, 5}, /* 11 */ - {7 * 32, 4}, /* 12 */ - {8 * 32, 3}, /* 13 */ - {9 * 32, 2}, /* 14 */ - {10 * 32, 1},/* 15 */ -}; - static inline void chunkmemset_2(uint8_t *from, chunk_t *chunk) { *chunk = _mm_set1_epi16(zng_memread_2(from)); diff --git a/configure b/configure index 107d864e..cc1f2edc 100755 --- a/configure +++ b/configure @@ -2316,8 +2316,8 @@ EOF CFLAGS="${CFLAGS} -DLOONGARCH_LSX" SFLAGS="${SFLAGS} -DLOONGARCH_LSX" - ARCH_STATIC_OBJS="${ARCH_STATIC_OBJS} compare256_lsx.o slide_hash_lsx.o" - ARCH_SHARED_OBJS="${ARCH_SHARED_OBJS} compare256_lsx.lo slide_hash_lsx.lo" + ARCH_STATIC_OBJS="${ARCH_STATIC_OBJS} chunkset_lsx.o compare256_lsx.o slide_hash_lsx.o" + ARCH_SHARED_OBJS="${ARCH_SHARED_OBJS} chunkset_lsx.lo compare256_lsx.lo slide_hash_lsx.lo" fi check_lasx_intrinsics diff
--git a/functable.c b/functable.c index abb82cab..8de8b399 100644 --- a/functable.c +++ b/functable.c @@ -285,6 +285,9 @@ static void init_functable(void) { ft.longest_match = &longest_match_lsx; ft.longest_match_slow = &longest_match_slow_lsx; # endif + ft.chunksize = &chunksize_lsx; + ft.chunkmemset_safe = &chunkmemset_safe_lsx; + ft.inflate_fast = &inflate_fast_lsx; } #endif #ifdef LOONGARCH_LASX