check_lsx_intrinsics()
if(HAVE_LSX_INTRIN)
add_definitions(-DLOONGARCH_LSX)
- set(LSX_SRCS ${ARCHDIR}/compare256_lsx.c ${ARCHDIR}/slide_hash_lsx.c)
+ set(LSX_SRCS ${ARCHDIR}/chunkset_lsx.c ${ARCHDIR}/compare256_lsx.c ${ARCHDIR}/slide_hash_lsx.c)
list(APPEND ZLIB_ARCH_SRCS ${LSX_SRCS})
set_property(SOURCE ${LSX_SRCS} PROPERTY COMPILE_FLAGS "${LSXFLAG} ${NOLTOFLAG}")
else()
#include "neon_intrins.h"
#include "zbuild.h"
#include "zmemory.h"
-#include "arch/generic/chunk_permute_table.h"
+#include "arch/generic/chunk_128bit_perm_idx_lut.h"
typedef uint8x16_t chunk_t;
#define HAVE_CHUNKMEMSET_8
#define HAVE_CHUNK_MAG
-static const lut_rem_pair perm_idx_lut[13] = {
- {0, 1}, /* 3 */
- {0, 0}, /* don't care */
- {1 * 32, 1}, /* 5 */
- {2 * 32, 4}, /* 6 */
- {3 * 32, 2}, /* 7 */
- {0 * 32, 0}, /* don't care */
- {4 * 32, 7}, /* 9 */
- {5 * 32, 6}, /* 10 */
- {6 * 32, 5}, /* 11 */
- {7 * 32, 4}, /* 12 */
- {8 * 32, 3}, /* 13 */
- {9 * 32, 2}, /* 14 */
- {10 * 32, 1},/* 15 */
-};
static inline void chunkmemset_2(uint8_t *from, chunk_t *chunk) {
*chunk = vreinterpretq_u8_u16(vdupq_n_u16(zng_memread_2(from)));
--- /dev/null
+/* chunk_128bit_perm_idx_lut.h - shared SSSE3/NEON/LSX permutation index LUT for use with the chunkmemset family of functions.
+ * For conditions of distribution and use, see copyright notice in zlib.h
+ */
+
+#ifndef CHUNK_128BIT_PERM_IDX_LUT_H_
+#define CHUNK_128BIT_PERM_IDX_LUT_H_
+
+#include "chunk_permute_table.h"
+
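+/* Each entry, indexed by dist - 3, pairs an offset into permute_table with the
+ * number of leftover bytes (16 % dist) once the dist-byte pattern has been
+ * repeated across a 16-byte chunk. The rows marked "don't care" correspond to
+ * power-of-two distances that are served by the dedicated chunkmemset_* helpers
+ * rather than the permute table. */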
+static const lut_rem_pair perm_idx_lut[13] = {
+ {0, 1}, /* 3 */
+ {0, 0}, /* don't care */
+ {1 * 32, 1}, /* 5 */
+ {2 * 32, 4}, /* 6 */
+ {3 * 32, 2}, /* 7 */
+ {0 * 32, 0}, /* don't care */
+ {4 * 32, 7}, /* 9 */
+ {5 * 32, 6}, /* 10 */
+ {6 * 32, 5}, /* 11 */
+ {7 * 32, 4}, /* 12 */
+ {8 * 32, 3}, /* 13 */
+ {9 * 32, 2}, /* 14 */
+ {10 * 32, 1},/* 15 */
+};
+
+#endif /* CHUNK_128BIT_PERM_IDX_LUT_H_ */
all: \
loongarch_features.o loongarch_features.lo \
crc32_la.o crc32_la.lo \
+ chunkset_lsx.o chunkset_lsx.lo \
compare256_lasx.o compare256_lasx.lo \
compare256_lsx.o compare256_lsx.lo \
slide_hash_lasx.o slide_hash_lasx.lo \
crc32_la.lo: $(SRCDIR)/crc32_la.c
$(CC) $(SFLAGS) -DPIC $(INCLUDES) -c -o $@ $(SRCDIR)/crc32_la.c
+chunkset_lsx.o:
+ $(CC) $(CFLAGS) $(LSXFLAG) $(NOLTOFLAG) $(INCLUDES) -c -o $@ $(SRCDIR)/chunkset_lsx.c
+
+chunkset_lsx.lo:
+ $(CC) $(SFLAGS) $(LSXFLAG) $(NOLTOFLAG) -DPIC $(INCLUDES) -c -o $@ $(SRCDIR)/chunkset_lsx.c
+
compare256_lasx.o:
$(CC) $(CFLAGS) $(LASXFLAG) $(NOLTOFLAG) $(INCLUDES) -c -o $@ $(SRCDIR)/compare256_lasx.c
--- /dev/null
+/* chunkset_lsx.c -- LSX inline functions to copy small data chunks, based on the Intel SSSE3 implementation
+ * Copyright (C) 2025 Vladislav Shchapov <vladislav@shchapov.ru>
+ * For conditions of distribution and use, see copyright notice in zlib.h
+ */
+
+#include "zbuild.h"
+#include "zmemory.h"
+
+#if defined(LOONGARCH_LSX)
+#include <lsxintrin.h>
+#include "lsxintrin_ext.h"
+#include "arch/generic/chunk_128bit_perm_idx_lut.h"
+
+typedef __m128i chunk_t;
+
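+/* Advertise to chunkset_tpl.h that this backend supplies its own
+ * chunkmemset_2/4/8 and GET_CHUNK_MAG implementations. */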
+#define HAVE_CHUNKMEMSET_2
+#define HAVE_CHUNKMEMSET_4
+#define HAVE_CHUNKMEMSET_8
+#define HAVE_CHUNK_MAG
+
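+/* chunkmemset_2/4/8: broadcast a 2-, 4- or 8-byte pattern read from 'from'
+ * into every lane of the chunk. */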
+static inline void chunkmemset_2(uint8_t *from, chunk_t *chunk) {
+ *chunk = __lsx_vreplgr2vr_h(zng_memread_2(from));
+}
+
+static inline void chunkmemset_4(uint8_t *from, chunk_t *chunk) {
+ *chunk = __lsx_vreplgr2vr_w(zng_memread_4(from));
+}
+
+static inline void chunkmemset_8(uint8_t *from, chunk_t *chunk) {
+ *chunk = __lsx_vreplgr2vr_d(zng_memread_8(from));
+}
+
+static inline void loadchunk(uint8_t const *s, chunk_t *chunk) {
+ *chunk = __lsx_vld(s, 0);
+}
+
+static inline void storechunk(uint8_t *out, chunk_t *chunk) {
+ __lsx_vst(*chunk, out, 0);
+}
+
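+/* Build a chunk that holds repeated copies of the dist-byte pattern at buf:
+ * floor(16 / dist) whole copies followed by the first 16 % dist bytes of the
+ * pattern; *chunk_rem receives that leftover count from perm_idx_lut. */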
+static inline chunk_t GET_CHUNK_MAG(uint8_t *buf, uint32_t *chunk_rem, uint32_t dist) {
+ lut_rem_pair lut_rem = perm_idx_lut[dist - 3];
+ __m128i perm_vec, ret_vec;
+ /* Important to note:
+ * This is _not_ to subvert the memory sanitizer but to instead unpoison some
+ * bytes we willingly and purposefully load uninitialized that we swizzle over
+ * in a vector register, anyway. If what we assume is wrong about what is used,
+ * the memory sanitizer will still usefully flag it */
+ __msan_unpoison(buf + dist, 16 - dist);
+ ret_vec = __lsx_vld(buf, 0);
+ *chunk_rem = lut_rem.remval;
+
+ perm_vec = __lsx_vld(permute_table + lut_rem.idx, 0);
+ ret_vec = lsx_shuffle_b(ret_vec, perm_vec);
+
+ return ret_vec;
+}
+
+#define CHUNKSIZE chunksize_lsx
+#define CHUNKMEMSET chunkmemset_lsx
+#define CHUNKMEMSET_SAFE chunkmemset_safe_lsx
+#define CHUNKCOPY chunkcopy_lsx
+#define CHUNKUNROLL chunkunroll_lsx
+
+#include "chunkset_tpl.h"
+
+#define INFLATE_FAST inflate_fast_lsx
+
+#include "inffast_tpl.h"
+
+#endif
uint32_t longest_match_lsx(deflate_state *const s, Pos cur_match);
uint32_t longest_match_slow_lsx(deflate_state *const s, Pos cur_match);
# endif
+uint32_t chunksize_lsx(void);
+uint8_t* chunkmemset_safe_lsx(uint8_t *out, uint8_t *from, unsigned len, unsigned left);
+void inflate_fast_lsx(PREFIX3(stream) *strm, uint32_t start);
#endif
#ifdef LOONGARCH_LASX
# if defined(LOONGARCH_LSX) && defined(__loongarch_sx)
# undef native_slide_hash
# define native_slide_hash slide_hash_lsx
+# undef native_chunksize
+# define native_chunksize chunksize_lsx
+# undef native_chunkmemset_safe
+# define native_chunkmemset_safe chunkmemset_safe_lsx
+# undef native_inflate_fast
+# define native_inflate_fast inflate_fast_lsx
# ifdef HAVE_BUILTIN_CTZ
# undef native_compare256
# define native_compare256 compare256_lsx
return __lsx_vpickve2gr_w(__lsx_vmskltz_b(v), 0);
}
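+/* Byte shuffle with SSSE3 _mm_shuffle_epi8 semantics: each result byte is
+ * a[b[i] & 0x0F], or zero when the most significant bit of the index byte
+ * b[i] is set. Built from vshuf.b plus a mask derived from the index sign bits. */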
+static inline __m128i lsx_shuffle_b(__m128i a, __m128i b) {
+ /* mask of bytes in b whose most significant bit is set (negative as signed 8-bit) */
+ __m128i msb_mask = __lsx_vslti_b(b, 0);
+
+ /* shuffle: use only the low 4 bits of each index in b */
+ __m128i dst = __lsx_vshuf_b(a, a, __lsx_vandi_b(b, 0xF));
+
+ /* invert and apply mask - clear dst-element if b-msb is set */
+ return __lsx_vand_v(dst, __lsx_vnor_v(msb_mask, msb_mask));
+}
+
#endif // include guard LSXINTRIN_EXT_H
#if defined(X86_SSSE3)
#include <immintrin.h>
-#include "../generic/chunk_permute_table.h"
+#include "arch/generic/chunk_128bit_perm_idx_lut.h"
typedef __m128i chunk_t;
#define HAVE_CHUNKMEMSET_8
#define HAVE_CHUNK_MAG
-static const lut_rem_pair perm_idx_lut[13] = {
- {0, 1}, /* 3 */
- {0, 0}, /* don't care */
- {1 * 32, 1}, /* 5 */
- {2 * 32, 4}, /* 6 */
- {3 * 32, 2}, /* 7 */
- {0 * 32, 0}, /* don't care */
- {4 * 32, 7}, /* 9 */
- {5 * 32, 6}, /* 10 */
- {6 * 32, 5}, /* 11 */
- {7 * 32, 4}, /* 12 */
- {8 * 32, 3}, /* 13 */
- {9 * 32, 2}, /* 14 */
- {10 * 32, 1},/* 15 */
-};
-
static inline void chunkmemset_2(uint8_t *from, chunk_t *chunk) {
*chunk = _mm_set1_epi16(zng_memread_2(from));
CFLAGS="${CFLAGS} -DLOONGARCH_LSX"
SFLAGS="${SFLAGS} -DLOONGARCH_LSX"
- ARCH_STATIC_OBJS="${ARCH_STATIC_OBJS} compare256_lsx.o slide_hash_lsx.o"
- ARCH_SHARED_OBJS="${ARCH_SHARED_OBJS} compare256_lsx.lo slide_hash_lsx.lo"
+ ARCH_STATIC_OBJS="${ARCH_STATIC_OBJS} chunkset_lsx.o compare256_lsx.o slide_hash_lsx.o"
+ ARCH_SHARED_OBJS="${ARCH_SHARED_OBJS} chunkset_lsx.lo compare256_lsx.lo slide_hash_lsx.lo"
fi
check_lasx_intrinsics
ft.longest_match = &longest_match_lsx;
ft.longest_match_slow = &longest_match_slow_lsx;
# endif
+ ft.chunksize = &chunksize_lsx;
+ ft.chunkmemset_safe = &chunkmemset_safe_lsx;
+ ft.inflate_fast = &inflate_fast_lsx;
}
#endif
#ifdef LOONGARCH_LASX