check_lsx_intrinsics()
if(HAVE_LSX_INTRIN)
add_definitions(-DLOONGARCH_LSX)
- set(LSX_SRCS ${ARCHDIR}/slide_hash_lsx.c)
+ set(LSX_SRCS ${ARCHDIR}/compare256_lsx.c ${ARCHDIR}/slide_hash_lsx.c)
list(APPEND ZLIB_ARCH_SRCS ${LSX_SRCS})
set_property(SOURCE ${LSX_SRCS} PROPERTY COMPILE_FLAGS "${LSXFLAG} ${NOLTOFLAG}")
else()
check_lasx_intrinsics()
if(HAVE_LASX_INTRIN AND HAVE_LSX_INTRIN)
add_definitions(-DLOONGARCH_LASX)
- set(LASX_SRCS ${ARCHDIR}/slide_hash_lasx.c)
+ set(LASX_SRCS ${ARCHDIR}/compare256_lasx.c ${ARCHDIR}/slide_hash_lasx.c)
list(APPEND ZLIB_ARCH_SRCS ${LASX_SRCS})
set_property(SOURCE ${LASX_SRCS} PROPERTY COMPILE_FLAGS "${LASXFLAG} ${NOLTOFLAG}")
else()
all: \
loongarch_features.o loongarch_features.lo \
crc32_la.o crc32_la.lo \
+ compare256_lasx.o compare256_lasx.lo \
+ compare256_lsx.o compare256_lsx.lo \
slide_hash_lasx.o slide_hash_lasx.lo \
slide_hash_lsx.o slide_hash_lsx.lo
crc32_la.lo: $(SRCDIR)/crc32_la.c
$(CC) $(SFLAGS) -DPIC $(INCLUDES) -c -o $@ $(SRCDIR)/crc32_la.c
+compare256_lasx.o:
+ $(CC) $(CFLAGS) $(LASXFLAG) $(NOLTOFLAG) $(INCLUDES) -c -o $@ $(SRCDIR)/compare256_lasx.c
+
+compare256_lasx.lo:
+ $(CC) $(SFLAGS) $(LASXFLAG) $(NOLTOFLAG) -DPIC $(INCLUDES) -c -o $@ $(SRCDIR)/compare256_lasx.c
+
+compare256_lsx.o:
+ $(CC) $(CFLAGS) $(LSXFLAG) $(NOLTOFLAG) $(INCLUDES) -c -o $@ $(SRCDIR)/compare256_lsx.c
+
+compare256_lsx.lo:
+ $(CC) $(SFLAGS) $(LSXFLAG) $(NOLTOFLAG) -DPIC $(INCLUDES) -c -o $@ $(SRCDIR)/compare256_lsx.c
+
slide_hash_lasx.o:
$(CC) $(CFLAGS) $(LASXFLAG) $(NOLTOFLAG) $(INCLUDES) -c -o $@ $(SRCDIR)/slide_hash_lasx.c
--- /dev/null
+/* compare256_lasx.c -- LASX version of compare256, based on Intel AVX2 implementation
+ * Copyright Mika T. Lindqvist <postmaster@raasu.org>
+ * Copyright (C) 2025 Vladislav Shchapov <vladislav@shchapov.ru>
+ * For conditions of distribution and use, see copyright notice in zlib.h
+ */
+
+#include "zbuild.h"
+#include "zmemory.h"
+#include "deflate.h"
+#include "fallback_builtins.h"
+
+#if defined(LOONGARCH_LASX) && defined(HAVE_BUILTIN_CTZ)
+
+#include <lasxintrin.h>
+#include "lasxintrin_ext.h"
+
+static inline uint32_t compare256_lasx_static(const uint8_t *src0, const uint8_t *src1) {
+ uint32_t len = 0;
+
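+ /* Each loop iteration compares two 32-byte blocks (64 bytes total), so four
+ * iterations cover the full 256 bytes; the index of the first mismatching
+ * byte is recovered by counting trailing zeros of the inverted byte mask. */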
+ do {
+ __m256i ymm_src0, ymm_src1, ymm_cmp;
+ ymm_src0 = __lasx_xvld(src0, 0);
+ ymm_src1 = __lasx_xvld(src1, 0);
+ ymm_cmp = __lasx_xvseq_b(ymm_src0, ymm_src1); /* non-identical bytes = 00, identical bytes = FF */
+ unsigned mask = (unsigned)lasx_movemask_b(ymm_cmp);
+ if (mask != 0xFFFFFFFF) {
+ uint32_t match_byte = (uint32_t)__builtin_ctz(~mask); /* Invert bits so identical = 0 */
+ return len + match_byte;
+ }
+
+ src0 += 32, src1 += 32, len += 32;
+
+ ymm_src0 = __lasx_xvld(src0, 0);
+ ymm_src1 = __lasx_xvld(src1, 0);
+ ymm_cmp = __lasx_xvseq_b(ymm_src0, ymm_src1);
+ mask = (unsigned)lasx_movemask_b(ymm_cmp);
+ if (mask != 0xFFFFFFFF) {
+ uint32_t match_byte = (uint32_t)__builtin_ctz(~mask);
+ return len + match_byte;
+ }
+
+ src0 += 32, src1 += 32, len += 32;
+ } while (len < 256);
+
+ return 256;
+}
+
+Z_INTERNAL uint32_t compare256_lasx(const uint8_t *src0, const uint8_t *src1) {
+ return compare256_lasx_static(src0, src1);
+}
+
+#define LONGEST_MATCH longest_match_lasx
+#define COMPARE256 compare256_lasx_static
+
+#include "match_tpl.h"
+
+#define LONGEST_MATCH_SLOW
+#define LONGEST_MATCH longest_match_slow_lasx
+#define COMPARE256 compare256_lasx_static
+
+#include "match_tpl.h"
+
+#endif
--- /dev/null
+/* compare256_lsx.c -- LSX version of compare256, based on Intel SSE implementation
+ * Copyright Adam Stylinski <kungfujesus06@gmail.com>
+ * Copyright (C) 2025 Vladislav Shchapov <vladislav@shchapov.ru>
+ * For conditions of distribution and use, see copyright notice in zlib.h
+ */
+
+#include "zbuild.h"
+#include "zmemory.h"
+#include "deflate.h"
+#include "fallback_builtins.h"
+
+#if defined(LOONGARCH_LSX) && defined(HAVE_BUILTIN_CTZ)
+
+#include <lsxintrin.h>
+#include "lsxintrin_ext.h"
+
+static inline uint32_t compare256_lsx_static(const uint8_t *src0, const uint8_t *src1) {
+ uint32_t len = 0;
+ int align_offset = ((uintptr_t)src0) & 15;
+ const uint8_t *end0 = src0 + 256;
+ const uint8_t *end1 = src1 + 256;
+ __m128i xmm_src0, xmm_src1, xmm_cmp;
+
+ /* Do the first load unaligned, then for all subsequent iterations at least
+ * one of the two loads is aligned. Sadly, aligning both loads is probably unrealistic */
+ xmm_src0 = __lsx_vld(src0, 0);
+ xmm_src1 = __lsx_vld(src1, 0);
+ xmm_cmp = __lsx_vseq_b(xmm_src0, xmm_src1);
+
+ unsigned mask = (unsigned)lsx_movemask_b(xmm_cmp);
+
+ /* Compiler _may_ turn this branch into a ptest + movemask,
+ * since a lot of those uops are shared and fused */
+ if (mask != 0xFFFF) {
+ uint32_t match_byte = (uint32_t)__builtin_ctz(~mask);
+ return len + match_byte;
+ }
+
+ int align_adv = 16 - align_offset;
+ len += align_adv;
+ src0 += align_adv;
+ src1 += align_adv;
+
+ /* Do a flooring division (should just be a shift right) */
+ int num_iter = (256 - len) / 16;
+
+ for (int i = 0; i < num_iter; ++i) {
+ xmm_src0 = __lsx_vld(src0, 0);
+ xmm_src1 = __lsx_vld(src1, 0);
+ xmm_cmp = __lsx_vseq_b(xmm_src0, xmm_src1);
+
+ mask = (unsigned)lsx_movemask_b(xmm_cmp);
+
+ /* Compiler _may_ turn this branch into a ptest + movemask,
+ * since a lot of those uops are shared and fused */
+ if (mask != 0xFFFF) {
+ uint32_t match_byte = (uint32_t)__builtin_ctz(~mask);
+ return len + match_byte;
+ }
+
+ len += 16, src0 += 16, src1 += 16;
+ }
+
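+ /* If src0 was not 16-byte aligned, the aligned loop above stops short of
+ * 256 bytes; re-check the last 16 bytes with one final, possibly
+ * overlapping unaligned compare. */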
+ if (align_offset) {
+ src0 = end0 - 16;
+ src1 = end1 - 16;
+ len = 256 - 16;
+
+ xmm_src0 = __lsx_vld(src0, 0);
+ xmm_src1 = __lsx_vld(src1, 0);
+ xmm_cmp = __lsx_vseq_b(xmm_src0, xmm_src1);
+
+ mask = (unsigned)lsx_movemask_b(xmm_cmp);
+
+ if (mask != 0xFFFF) {
+ uint32_t match_byte = (uint32_t)__builtin_ctz(~mask);
+ return len + match_byte;
+ }
+ }
+
+ return 256;
+}
+
+Z_INTERNAL uint32_t compare256_lsx(const uint8_t *src0, const uint8_t *src1) {
+ return compare256_lsx_static(src0, src1);
+}
+
+#define LONGEST_MATCH longest_match_lsx
+#define COMPARE256 compare256_lsx_static
+
+#include "match_tpl.h"
+
+#define LONGEST_MATCH_SLOW
+#define LONGEST_MATCH longest_match_slow_lsx
+#define COMPARE256 compare256_lsx_static
+
+#include "match_tpl.h"
+
+#endif
--- /dev/null
+/* lasxintrin_ext.h
+ * Copyright (C) 2025 Vladislav Shchapov <vladislav@shchapov.ru>
+ * For conditions of distribution and use, see copyright notice in zlib.h
+ */
+#ifndef LASXINTRIN_EXT_H
+#define LASXINTRIN_EXT_H
+
+#include <lasxintrin.h>
+
+
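+/* Collect the most-significant bit of every byte into a 32-bit scalar mask,
+ * mirroring x86 _mm256_movemask_epi8: xvmskltz.b packs the per-byte sign bits
+ * of each 128-bit lane into that lane's low word, and the two 16-bit halves
+ * are then combined. */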
+static inline int lasx_movemask_b(__m256i v) {
+ v = __lasx_xvmskltz_b(v);
+ return __lasx_xvpickve2gr_w(v, 0) | (__lasx_xvpickve2gr_w(v, 4) << 16);
+}
+
+#endif // include guard LASXINTRIN_EXT_H
#ifdef LOONGARCH_LSX
void slide_hash_lsx(deflate_state *s);
+# ifdef HAVE_BUILTIN_CTZ
+ uint32_t compare256_lsx(const uint8_t *src0, const uint8_t *src1);
+ uint32_t longest_match_lsx(deflate_state *const s, Pos cur_match);
+ uint32_t longest_match_slow_lsx(deflate_state *const s, Pos cur_match);
+# endif
#endif
#ifdef LOONGARCH_LASX
void slide_hash_lasx(deflate_state *s);
+# ifdef HAVE_BUILTIN_CTZ
+ uint32_t compare256_lasx(const uint8_t *src0, const uint8_t *src1);
+ uint32_t longest_match_lasx(deflate_state *const s, Pos cur_match);
+ uint32_t longest_match_slow_lasx(deflate_state *const s, Pos cur_match);
+# endif
#endif
#ifdef DISABLE_RUNTIME_CPU_DETECTION
# if defined(LOONGARCH_LSX) && defined(__loongarch_sx)
# undef native_slide_hash
# define native_slide_hash slide_hash_lsx
+# ifdef HAVE_BUILTIN_CTZ
+# undef native_compare256
+# define native_compare256 compare256_lsx
+# undef native_longest_match
+# define native_longest_match longest_match_lsx
+# undef native_longest_match_slow
+# define native_longest_match_slow longest_match_slow_lsx
+# endif
# endif
# if defined(LOONGARCH_LASX) && defined(__loongarch_asx)
# undef native_slide_hash
# define native_slide_hash slide_hash_lasx
+# ifdef HAVE_BUILTIN_CTZ
+# undef native_compare256
+# define native_compare256 compare256_lasx
+# undef native_longest_match
+# define native_longest_match longest_match_lasx
+# undef native_longest_match_slow
+# define native_longest_match_slow longest_match_slow_lasx
+# endif
# endif
#endif
--- /dev/null
+/* lsxintrin_ext.h
+ * Copyright (C) 2025 Vladislav Shchapov <vladislav@shchapov.ru>
+ * For conditions of distribution and use, see copyright notice in zlib.h
+ */
+#ifndef LSXINTRIN_EXT_H
+#define LSXINTRIN_EXT_H
+
+#include <lsxintrin.h>
+
+
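+/* LSX counterpart of x86 _mm_movemask_epi8: vmskltz.b gathers the sign bit of
+ * each of the 16 bytes into the low 16 bits of element 0, which is then read
+ * out as a scalar. */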
+static inline int lsx_movemask_b(__m128i v) {
+ return __lsx_vpickve2gr_w(__lsx_vmskltz_b(v), 0);
+}
+
+#endif // include guard LSXINTRIN_EXT_H
CFLAGS="${CFLAGS} -DLOONGARCH_LSX"
SFLAGS="${SFLAGS} -DLOONGARCH_LSX"
- ARCH_STATIC_OBJS="${ARCH_STATIC_OBJS} slide_hash_lsx.o"
- ARCH_SHARED_OBJS="${ARCH_SHARED_OBJS} slide_hash_lsx.lo"
+ ARCH_STATIC_OBJS="${ARCH_STATIC_OBJS} compare256_lsx.o slide_hash_lsx.o"
+ ARCH_SHARED_OBJS="${ARCH_SHARED_OBJS} compare256_lsx.lo slide_hash_lsx.lo"
fi
check_lasx_intrinsics
CFLAGS="${CFLAGS} -DLOONGARCH_LASX"
SFLAGS="${SFLAGS} -DLOONGARCH_LASX"
- ARCH_STATIC_OBJS="${ARCH_STATIC_OBJS} slide_hash_lasx.o"
- ARCH_SHARED_OBJS="${ARCH_SHARED_OBJS} slide_hash_lasx.lo"
+ ARCH_STATIC_OBJS="${ARCH_STATIC_OBJS} compare256_lasx.o slide_hash_lasx.o"
+ ARCH_SHARED_OBJS="${ARCH_SHARED_OBJS} compare256_lasx.lo slide_hash_lasx.lo"
fi
fi
;;
#ifdef LOONGARCH_LSX
if (cf.loongarch.has_lsx) {
ft.slide_hash = slide_hash_lsx;
+# ifdef HAVE_BUILTIN_CTZ
+ ft.compare256 = &compare256_lsx;
+ ft.longest_match = &longest_match_lsx;
+ ft.longest_match_slow = &longest_match_slow_lsx;
+# endif
}
#endif
#ifdef LOONGARCH_LASX
if (cf.loongarch.has_lasx) {
ft.slide_hash = slide_hash_lasx;
+# ifdef HAVE_BUILTIN_CTZ
+ ft.compare256 = &compare256_lasx;
+ ft.longest_match = &longest_match_lasx;
+ ft.longest_match_slow = &longest_match_slow_lasx;
+# endif
}
#endif
#ifdef RISCV_RVV
BENCHMARK_COMPARE256(rvv, compare256_rvv, test_cpu_features.riscv.has_rvv);
#endif
+#if defined(LOONGARCH_LSX) && defined(HAVE_BUILTIN_CTZ)
+BENCHMARK_COMPARE256(lsx, compare256_lsx, test_cpu_features.loongarch.has_lsx);
+#endif
+#if defined(LOONGARCH_LASX) && defined(HAVE_BUILTIN_CTZ)
+BENCHMARK_COMPARE256(lasx, compare256_lasx, test_cpu_features.loongarch.has_lasx);
+#endif
#endif
#ifdef RISCV_RVV
TEST_COMPARE256(rvv, compare256_rvv, test_cpu_features.riscv.has_rvv)
#endif
+#if defined(LOONGARCH_LSX) && defined(HAVE_BUILTIN_CTZ)
+TEST_COMPARE256(lsx, compare256_lsx, test_cpu_features.loongarch.has_lsx)
+#endif
+#if defined(LOONGARCH_LASX) && defined(HAVE_BUILTIN_CTZ)
+TEST_COMPARE256(lasx, compare256_lasx, test_cpu_features.loongarch.has_lasx)
+#endif
#endif