From: Vladislav Shchapov
Date: Fri, 13 Jun 2025 15:49:34 +0000 (+0500)
Subject: Add LoongArch64 (LASX) chunkmemset family of functions implementation
X-Git-Url: http://git.ipfire.org/cgi-bin/gitweb.cgi?a=commitdiff_plain;h=38ba3bdc28b9bd665375e36aa16becbfc05cf823;p=thirdparty%2Fzlib-ng.git

Add LoongArch64 (LASX) chunkmemset family of functions implementation

Signed-off-by: Vladislav Shchapov
---

diff --git a/CMakeLists.txt b/CMakeLists.txt
index 021bc5a0..2ed57cd6 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -1049,7 +1049,7 @@ if(WITH_OPTIM)
             check_lasx_intrinsics()
             if(HAVE_LASX_INTRIN AND HAVE_LSX_INTRIN)
                 add_definitions(-DLOONGARCH_LASX)
-                set(LASX_SRCS ${ARCHDIR}/compare256_lasx.c ${ARCHDIR}/slide_hash_lasx.c)
+                set(LASX_SRCS ${ARCHDIR}/chunkset_lasx.c ${ARCHDIR}/compare256_lasx.c ${ARCHDIR}/slide_hash_lasx.c)
                 list(APPEND ZLIB_ARCH_SRCS ${LASX_SRCS})
                 set_property(SOURCE ${LASX_SRCS} PROPERTY COMPILE_FLAGS "${LASXFLAG} ${NOLTOFLAG}")
             else()
diff --git a/arch/x86/avx2_tables.h b/arch/generic/chunk_256bit_perm_idx_lut.h
similarity index 82%
rename from arch/x86/avx2_tables.h
rename to arch/generic/chunk_256bit_perm_idx_lut.h
index 50759993..796a7df1 100644
--- a/arch/x86/avx2_tables.h
+++ b/arch/generic/chunk_256bit_perm_idx_lut.h
@@ -1,7 +1,10 @@
-#ifndef _AVX2_TABLES_H
-#define _AVX2_TABLES_H
+/* chunk_256bit_perm_idx_lut.h - shared AVX512/AVX2/LASX permutation idx lut for use with chunkmemset family of functions.
+ * For conditions of distribution and use, see copyright notice in zlib.h
+ */
+#ifndef CHUNK_256BIT_PERM_IDX_LUT_H_
+#define CHUNK_256BIT_PERM_IDX_LUT_H_
 
-#include "../generic/chunk_permute_table.h"
+#include "chunk_permute_table.h"
 
 /* Populate don't cares so that this is a direct lookup (with some indirection into the permute table), because dist can
  * never be 0 - 2, we'll start with an offset, subtracting 3 from the input */
diff --git a/arch/loongarch/Makefile.in b/arch/loongarch/Makefile.in
index 7c2d0866..36988f60 100644
--- a/arch/loongarch/Makefile.in
+++ b/arch/loongarch/Makefile.in
@@ -20,6 +20,7 @@ TOPDIR=$(SRCTOP)
 all: \
         loongarch_features.o loongarch_features.lo \
         crc32_la.o crc32_la.lo \
+        chunkset_lasx.o chunkset_lasx.lo \
         chunkset_lsx.o chunkset_lsx.lo \
         compare256_lasx.o compare256_lasx.lo \
         compare256_lsx.o compare256_lsx.lo \
@@ -38,6 +39,12 @@ crc32_la.o: $(SRCDIR)/crc32_la.c
 crc32_la.lo: $(SRCDIR)/crc32_la.c
         $(CC) $(SFLAGS) -DPIC $(INCLUDES) -c -o $@ $(SRCDIR)/crc32_la.c
 
+chunkset_lasx.o:
+        $(CC) $(CFLAGS) $(LASXFLAG) $(NOLTOFLAG) $(INCLUDES) -c -o $@ $(SRCDIR)/chunkset_lasx.c
+
+chunkset_lasx.lo:
+        $(CC) $(SFLAGS) $(LASXFLAG) $(NOLTOFLAG) -DPIC $(INCLUDES) -c -o $@ $(SRCDIR)/chunkset_lasx.c
+
 chunkset_lsx.o:
         $(CC) $(CFLAGS) $(LSXFLAG) $(NOLTOFLAG) $(INCLUDES) -c -o $@ $(SRCDIR)/chunkset_lsx.c
diff --git a/arch/loongarch/chunkset_lasx.c b/arch/loongarch/chunkset_lasx.c
new file mode 100644
index 00000000..8b232327
--- /dev/null
+++ b/arch/loongarch/chunkset_lasx.c
@@ -0,0 +1,127 @@
+/* chunkset_lasx.c -- LASX inline functions to copy small data chunks, based on Intel AVX2 implementation
+ * Copyright (C) 2025 Vladislav Shchapov
+ * For conditions of distribution and use, see copyright notice in zlib.h
+ */
+#include "zbuild.h"
+#include "zmemory.h"
+
+#ifdef LOONGARCH_LASX
+
+#include <lasxintrin.h>
+#include "lasxintrin_ext.h"
+#include "lsxintrin_ext.h"
+
+#include "arch/generic/chunk_256bit_perm_idx_lut.h"
+
+typedef __m256i chunk_t;
+typedef __m128i halfchunk_t;
+
+#define HAVE_CHUNKMEMSET_2
+#define HAVE_CHUNKMEMSET_4
+#define HAVE_CHUNKMEMSET_8
+#define HAVE_CHUNKMEMSET_16
+#define HAVE_CHUNK_MAG
+#define HAVE_HALF_CHUNK
+
+static inline void chunkmemset_2(uint8_t *from, chunk_t *chunk) {
+    *chunk = __lasx_xvreplgr2vr_h(zng_memread_2(from));
+}
+
+static inline void chunkmemset_4(uint8_t *from, chunk_t *chunk) {
+    *chunk = __lasx_xvreplgr2vr_w(zng_memread_4(from));
+}
+
+static inline void chunkmemset_8(uint8_t *from, chunk_t *chunk) {
+    *chunk = __lasx_xvreplgr2vr_d(zng_memread_8(from));
+}
+
+static inline void chunkmemset_16(uint8_t *from, chunk_t *chunk) {
+    halfchunk_t half = __lsx_vld(from, 0);
+    *chunk = lasx_inserti128_si256(lasx_castsi128_si256(half), half, 1);
+}
+
+static inline void loadchunk(uint8_t const *s, chunk_t *chunk) {
+    *chunk = __lasx_xvld(s, 0);
+}
+
+static inline void storechunk(uint8_t *out, chunk_t *chunk) {
+    __lasx_xvst(*chunk, out, 0);
+}
+
+static inline chunk_t GET_CHUNK_MAG(uint8_t *buf, uint32_t *chunk_rem, uint32_t dist) {
+    lut_rem_pair lut_rem = perm_idx_lut[dist - 3];
+    __m256i ret_vec;
+    /* While technically we only need to read 4 or 8 bytes into this vector register for a lot of cases, GCC is
+     * compiling this to a shared load for all branches, preferring the simpler code. Given that the buf value isn't in
+     * GPRs to begin with the 256 bit load is _probably_ just as inexpensive */
+    *chunk_rem = lut_rem.remval;
+
+    /* See note in chunkset_ssse3.c for why this is ok */
+    __msan_unpoison(buf + dist, 32 - dist);
+
+    if (dist < 16) {
+        /* This simpler case still requires us to shuffle in 128 bit lanes, so we must apply a static offset after
+         * broadcasting the first vector register to both halves. This is _marginally_ faster than doing two separate
+         * shuffles and combining the halves later */
+        const __m256i permute_xform = lasx_inserti128_si256(__lasx_xvreplgr2vr_b(0), __lsx_vreplgr2vr_b(16), 1);
+        __m256i perm_vec = __lasx_xvld(permute_table+lut_rem.idx, 0);
+        __m128i ret_vec0 = __lsx_vld(buf, 0);
+        perm_vec = __lasx_xvadd_b(perm_vec, permute_xform);
+        ret_vec = lasx_inserti128_si256(lasx_castsi128_si256(ret_vec0), ret_vec0, 1);
+        ret_vec = lasx_shuffle_b(ret_vec, perm_vec);
+    } else {
+        __m128i ret_vec0 = __lsx_vld(buf, 0);
+        __m128i ret_vec1 = __lsx_vld(buf, 16);
+        /* Take advantage of the fact that only the latter half of the 256 bit vector will actually differ */
+        __m128i perm_vec1 = __lsx_vld(permute_table + lut_rem.idx, 0);
+        __m128i xlane_permutes = __lsx_vslt_b(perm_vec1, __lsx_vreplgr2vr_b(16));
+        __m128i xlane_res = lsx_shuffle_b(ret_vec0, perm_vec1);
+        /* Since we can't wrap twice, we can simply keep the later half exactly how it is instead of having to _also_
+         * shuffle those values */
+        __m128i latter_half = __lsx_vbitsel_v(ret_vec1, xlane_res, xlane_permutes);
+        ret_vec = lasx_inserti128_si256(lasx_castsi128_si256(ret_vec0), latter_half, 1);
+    }
+
+    return ret_vec;
+}
+
+static inline void loadhalfchunk(uint8_t const *s, halfchunk_t *chunk) {
+    *chunk = __lsx_vld(s, 0);
+}
+
+static inline void storehalfchunk(uint8_t *out, halfchunk_t *chunk) {
+    __lsx_vst(*chunk, out, 0);
+}
+
+static inline chunk_t halfchunk2whole(halfchunk_t *chunk) {
+    /* We zero extend mostly to appease some memory sanitizers. These bytes are ultimately
+     * unlikely to be actually written or read from */
+    return lasx_zextsi128_si256(*chunk);
+}
+
+static inline halfchunk_t GET_HALFCHUNK_MAG(uint8_t *buf, uint32_t *chunk_rem, uint32_t dist) {
+    lut_rem_pair lut_rem = perm_idx_lut[dist - 3];
+    __m128i perm_vec, ret_vec;
+    __msan_unpoison(buf + dist, 16 - dist);
+    ret_vec = __lsx_vld(buf, 0);
+    *chunk_rem = half_rem_vals[dist - 3];
+
+    perm_vec = __lsx_vld(permute_table + lut_rem.idx, 0);
+    ret_vec = lsx_shuffle_b(ret_vec, perm_vec);
+
+    return ret_vec;
+}
+
+#define CHUNKSIZE chunksize_lasx
+#define CHUNKCOPY chunkcopy_lasx
+#define CHUNKUNROLL chunkunroll_lasx
+#define CHUNKMEMSET chunkmemset_lasx
+#define CHUNKMEMSET_SAFE chunkmemset_safe_lasx
+
+#include "chunkset_tpl.h"
+
+#define INFLATE_FAST inflate_fast_lasx
+
+#include "inffast_tpl.h"
+
+#endif
diff --git a/arch/loongarch/lasxintrin_ext.h b/arch/loongarch/lasxintrin_ext.h
index 920c143a..4ab85751 100644
--- a/arch/loongarch/lasxintrin_ext.h
+++ b/arch/loongarch/lasxintrin_ext.h
@@ -5,6 +5,7 @@
 #ifndef LASXINTRIN_EXT_H
 #define LASXINTRIN_EXT_H
 
+#include <lsxintrin.h>
 #include <lasxintrin.h>
 
@@ -13,4 +14,28 @@ static inline int lasx_movemask_b(__m256i v) {
     return __lasx_xvpickve2gr_w(v, 0) | (__lasx_xvpickve2gr_w(v, 4) << 16);
 }
 
+static inline __m256i lasx_castsi128_si256(__m128i v)
+{
+    return (__m256i) { v[0], v[1], 0, 0 };
+}
+
+static inline __m256i lasx_inserti128_si256(__m256i a, __m128i b, const int imm8) {
+    if (imm8 == 0)
+        return __lasx_xvpermi_q(a, lasx_castsi128_si256(b), 0x30);
+    else
+        return __lasx_xvpermi_q(a, lasx_castsi128_si256(b), 0x02);
+}
+
+static inline __m256i lasx_zextsi128_si256(__m128i v) {
+    return (__m256i) { v[0], v[1], 0, 0 };
+    /* return lasx_inserti128_si256(__lasx_xvreplgr2vr_w(0), v, 0); */
+}
+
+/* See: lsx_shuffle_b */
+static inline __m256i lasx_shuffle_b(__m256i a, __m256i b) {
+    __m256i msb_mask = __lasx_xvslti_b(b, 0);
+    __m256i dst = __lasx_xvshuf_b(a, a, __lasx_xvandi_b(b, 0xF));
+    return __lasx_xvand_v(dst, __lasx_xvnor_v(msb_mask, msb_mask));
+}
+
 #endif // include guard LASXINTRIN_EXT_H
diff --git a/arch/loongarch/loongarch_functions.h b/arch/loongarch/loongarch_functions.h
index c3a3db44..c70d6c13 100644
--- a/arch/loongarch/loongarch_functions.h
+++ b/arch/loongarch/loongarch_functions.h
@@ -33,6 +33,9 @@ void slide_hash_lasx(deflate_state *s);
 uint32_t longest_match_lasx(deflate_state *const s, Pos cur_match);
 uint32_t longest_match_slow_lasx(deflate_state *const s, Pos cur_match);
 # endif
+uint32_t chunksize_lasx(void);
+uint8_t* chunkmemset_safe_lasx(uint8_t *out, uint8_t *from, unsigned len, unsigned left);
+void inflate_fast_lasx(PREFIX3(stream) *strm, uint32_t start);
 #endif
 
 #ifdef DISABLE_RUNTIME_CPU_DETECTION
@@ -66,6 +69,12 @@ void slide_hash_lasx(deflate_state *s);
 # if defined(LOONGARCH_LASX) && defined(__loongarch_asx)
 #  undef native_slide_hash
 #  define native_slide_hash slide_hash_lasx
+#  undef native_chunksize
+#  define native_chunksize chunksize_lasx
+#  undef native_chunkmemset_safe
+#  define native_chunkmemset_safe chunkmemset_safe_lasx
+#  undef native_inflate_fast
+#  define native_inflate_fast inflate_fast_lasx
 #  ifdef HAVE_BUILTIN_CTZ
 #   undef native_compare256
 #   define native_compare256 compare256_lasx
diff --git a/arch/x86/chunkset_avx2.c b/arch/x86/chunkset_avx2.c
index c7f336fd..28deb34e 100644
--- a/arch/x86/chunkset_avx2.c
+++ b/arch/x86/chunkset_avx2.c
@@ -5,7 +5,7 @@
 #include "zmemory.h"
 
 #ifdef X86_AVX2
-#include "avx2_tables.h"
+#include "arch/generic/chunk_256bit_perm_idx_lut.h"
 #include <immintrin.h>
"x86_intrins.h" diff --git a/arch/x86/chunkset_avx512.c b/arch/x86/chunkset_avx512.c index db8c1eb2..fc27a45a 100644 --- a/arch/x86/chunkset_avx512.c +++ b/arch/x86/chunkset_avx512.c @@ -6,7 +6,7 @@ #ifdef X86_AVX512 -#include "avx2_tables.h" +#include "arch/generic/chunk_256bit_perm_idx_lut.h" #include #include "x86_intrins.h" diff --git a/configure b/configure index cc1f2edc..80fd5538 100755 --- a/configure +++ b/configure @@ -2325,8 +2325,8 @@ EOF CFLAGS="${CFLAGS} -DLOONGARCH_LASX" SFLAGS="${SFLAGS} -DLOONGARCH_LASX" - ARCH_STATIC_OBJS="${ARCH_STATIC_OBJS} compare256_lasx.o slide_hash_lasx.o" - ARCH_SHARED_OBJS="${ARCH_SHARED_OBJS} compare256_lasx.lo slide_hash_lasx.lo" + ARCH_STATIC_OBJS="${ARCH_STATIC_OBJS} chunkset_lasx.o compare256_lasx.o slide_hash_lasx.o" + ARCH_SHARED_OBJS="${ARCH_SHARED_OBJS} chunkset_lasx.lo compare256_lasx.lo slide_hash_lasx.lo" fi fi ;; diff --git a/functable.c b/functable.c index 8de8b399..02bd7d3f 100644 --- a/functable.c +++ b/functable.c @@ -298,6 +298,9 @@ static void init_functable(void) { ft.longest_match = &longest_match_lasx; ft.longest_match_slow = &longest_match_slow_lasx; # endif + ft.chunksize = &chunksize_lasx; + ft.chunkmemset_safe = &chunkmemset_safe_lasx; + ft.inflate_fast = &inflate_fast_lasx; } #endif