check_lsx_intrinsics()
if(HAVE_LSX_INTRIN)
add_definitions(-DLOONGARCH_LSX)
- #set(LSX_SRCS ${ARCHDIR}/)
- #list(APPEND ZLIB_ARCH_SRCS ${LSX_SRCS})
- #set_property(SOURCE ${LSX_SRCS} PROPERTY COMPILE_FLAGS "${LSXFLAG} ${NOLTOFLAG}")
+ set(LSX_SRCS ${ARCHDIR}/slide_hash_lsx.c)
+ list(APPEND ZLIB_ARCH_SRCS ${LSX_SRCS})
+ set_property(SOURCE ${LSX_SRCS} PROPERTY COMPILE_FLAGS "${LSXFLAG} ${NOLTOFLAG}")
else()
set(HAVE_LSX_INTRIN OFF)
endif()
check_lasx_intrinsics()
if(HAVE_LASX_INTRIN AND HAVE_LSX_INTRIN)
add_definitions(-DLOONGARCH_LASX)
- #set(LASX_SRCS ${ARCHDIR}/)
- #list(APPEND ZLIB_ARCH_SRCS ${LASX_SRCS})
- #set_property(SOURCE ${LASX_SRCS} PROPERTY COMPILE_FLAGS "${LASXFLAG} ${NOLTOFLAG}")
+ set(LASX_SRCS ${ARCHDIR}/slide_hash_lasx.c)
+ list(APPEND ZLIB_ARCH_SRCS ${LASX_SRCS})
+ set_property(SOURCE ${LASX_SRCS} PROPERTY COMPILE_FLAGS "${LASXFLAG} ${NOLTOFLAG}")
else()
set(HAVE_LASX_INTRIN OFF)
endif()
all: \
loongarch_features.o loongarch_features.lo \
- crc32_la.o crc32_la.lo
+ crc32_la.o crc32_la.lo \
+ slide_hash_lasx.o slide_hash_lasx.lo \
+ slide_hash_lsx.o slide_hash_lsx.lo
loongarch_features.o: $(SRCDIR)/loongarch_features.c
$(CC) $(CFLAGS) $(INCLUDES) -c -o $@ $(SRCDIR)/loongarch_features.c
crc32_la.lo: $(SRCDIR)/crc32_la.c
$(CC) $(SFLAGS) -DPIC $(INCLUDES) -c -o $@ $(SRCDIR)/crc32_la.c
+slide_hash_lasx.o: $(SRCDIR)/slide_hash_lasx.c
+ $(CC) $(CFLAGS) $(LASXFLAG) $(NOLTOFLAG) $(INCLUDES) -c -o $@ $(SRCDIR)/slide_hash_lasx.c
+
+slide_hash_lasx.lo: $(SRCDIR)/slide_hash_lasx.c
+ $(CC) $(SFLAGS) $(LASXFLAG) $(NOLTOFLAG) -DPIC $(INCLUDES) -c -o $@ $(SRCDIR)/slide_hash_lasx.c
+
+slide_hash_lsx.o: $(SRCDIR)/slide_hash_lsx.c
+ $(CC) $(CFLAGS) $(LSXFLAG) $(NOLTOFLAG) $(INCLUDES) -c -o $@ $(SRCDIR)/slide_hash_lsx.c
+
+slide_hash_lsx.lo: $(SRCDIR)/slide_hash_lsx.c
+ $(CC) $(SFLAGS) $(LSXFLAG) $(NOLTOFLAG) -DPIC $(INCLUDES) -c -o $@ $(SRCDIR)/slide_hash_lsx.c
+
mostlyclean: clean
clean:
rm -f *.o *.lo *~
uint32_t crc32_loongarch64(uint32_t crc, const uint8_t *buf, size_t len);
#endif
+#ifdef LOONGARCH_LSX
+void slide_hash_lsx(deflate_state *s);
+#endif
+
+#ifdef LOONGARCH_LASX
+void slide_hash_lasx(deflate_state *s);
+#endif
+
#ifdef DISABLE_RUNTIME_CPU_DETECTION
// LOONGARCH - CRC32 - All known CPUs have crc instructions
# if defined(LOONGARCH_CRC)
# define native_crc32 crc32_loongarch64
# endif
# if defined(LOONGARCH_LSX) && defined(__loongarch_sx)
+# undef native_slide_hash
+# define native_slide_hash slide_hash_lsx
# endif
# if defined(LOONGARCH_LASX) && defined(__loongarch_asx)
+# undef native_slide_hash
+# define native_slide_hash slide_hash_lasx
# endif
#endif
--- /dev/null
+/*
+ * LASX optimized hash slide, based on Intel AVX2 implementation
+ *
+ * Copyright (C) 2017 Intel Corporation
+ * Copyright (C) 2025 Vladislav Shchapov <vladislav@shchapov.ru>
+ * Authors:
+ * Arjan van de Ven <arjan@linux.intel.com>
+ * Jim Kukunas <james.t.kukunas@linux.intel.com>
+ * Mika T. Lindqvist <postmaster@raasu.org>
+ *
+ * For conditions of distribution and use, see copyright notice in zlib.h
+ */
+#include "zbuild.h"
+#include "deflate.h"
+
+#include <lasxintrin.h>
+
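+/* Walk the table backwards from its end, rebasing 16 uint16_t entries
+ * (one 256-bit vector) per iteration. The saturating subtraction clamps
+ * entries smaller than wsize to zero (NIL), dropping positions that fell
+ * out of the slid window. entries must be a non-zero multiple of 16. */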
+static inline void slide_hash_chain(Pos *table, uint32_t entries, const __m256i wsize) {
+ table += entries;
+ table -= 16;
+
+ do {
+ __m256i value, result;
+
+ value = __lasx_xvld(table, 0);
+ result = __lasx_xvssub_hu(value, wsize);
+ __lasx_xvst(result, table, 0);
+
+ table -= 16;
+ entries -= 16;
+ } while (entries > 0);
+}
+
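+/* Slide the head and prev hash tables after the deflate window has moved
+ * forward by w_size bytes, keeping stored positions relative to the window. */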
+Z_INTERNAL void slide_hash_lasx(deflate_state *s) {
+ Assert(s->w_size <= UINT16_MAX, "w_size should fit in uint16_t");
+ uint16_t wsize = (uint16_t)s->w_size;
+ const __m256i ymm_wsize = __lasx_xvreplgr2vr_h((short)wsize);
+
+ slide_hash_chain(s->head, HASH_SIZE, ymm_wsize);
+ slide_hash_chain(s->prev, wsize, ymm_wsize);
+}
--- /dev/null
+/*
+ * LSX optimized hash slide, based on Intel SSE implementation
+ *
+ * Copyright (C) 2017 Intel Corporation
+ * Copyright (C) 2025 Vladislav Shchapov <vladislav@shchapov.ru>
+ * Authors:
+ * Arjan van de Ven <arjan@linux.intel.com>
+ * Jim Kukunas <james.t.kukunas@linux.intel.com>
+ *
+ * For conditions of distribution and use, see copyright notice in zlib.h
+ */
+#include "zbuild.h"
+#include "deflate.h"
+
+#include <lsxintrin.h>
+#include <assert.h>
+
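+/* Rebase both hash tables in a single call: the first pass handles
+ * table0/entries0, the second table1/entries1. Each iteration processes
+ * 16 uint16_t entries via two 128-bit vectors; the saturating subtraction
+ * clamps entries smaller than wsize to zero (NIL). Both entry counts must
+ * be non-zero multiples of 16. */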
+static inline void slide_hash_chain(Pos *table0, Pos *table1, uint32_t entries0,
+ uint32_t entries1, const __m128i wsize) {
+ uint32_t entries;
+ Pos *table;
+ __m128i value0, value1, result0, result1;
+
+ int on_chain = 0;
+
+next_chain:
+ table = (on_chain) ? table1 : table0;
+ entries = (on_chain) ? entries1 : entries0;
+
+ table += entries;
+ table -= 16;
+
+ /* ZALLOC allocates this pointer unless the user chose a custom allocator.
+ * Our alloc function is aligned to 64 byte boundaries */
+ do {
+ value0 = __lsx_vld(table, 0);
+ value1 = __lsx_vld(table, 16);
+ result0 = __lsx_vssub_hu(value0, wsize);
+ result1 = __lsx_vssub_hu(value1, wsize);
+ __lsx_vst(result0, table, 0);
+ __lsx_vst(result1, table, 16);
+
+ table -= 16;
+ entries -= 16;
+ } while (entries > 0);
+
+ ++on_chain;
+ if (on_chain > 1) {
+ return;
+ } else {
+ goto next_chain;
+ }
+}
+
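+/* Slide the head and prev hash tables after the window moves forward by
+ * w_size bytes. The asserts below check the 16-byte alignment the vector
+ * loop expects. */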
+Z_INTERNAL void slide_hash_lsx(deflate_state *s) {
+ Assert(s->w_size <= UINT16_MAX, "w_size should fit in uint16_t");
+ uint16_t wsize = (uint16_t)s->w_size;
+ const __m128i xmm_wsize = __lsx_vreplgr2vr_h((short)wsize);
+
+ assert(((uintptr_t)s->head & 15) == 0);
+ assert(((uintptr_t)s->prev & 15) == 0);
+
+ slide_hash_chain(s->head, s->prev, HASH_SIZE, wsize, xmm_wsize);
+}
CFLAGS="${CFLAGS} -DLOONGARCH_LSX"
SFLAGS="${SFLAGS} -DLOONGARCH_LSX"
+ ARCH_STATIC_OBJS="${ARCH_STATIC_OBJS} slide_hash_lsx.o"
+ ARCH_SHARED_OBJS="${ARCH_SHARED_OBJS} slide_hash_lsx.lo"
fi
check_lasx_intrinsics
CFLAGS="${CFLAGS} -DLOONGARCH_LASX"
SFLAGS="${SFLAGS} -DLOONGARCH_LASX"
+ ARCH_STATIC_OBJS="${ARCH_STATIC_OBJS} slide_hash_lasx.o"
+ ARCH_SHARED_OBJS="${ARCH_SHARED_OBJS} slide_hash_lasx.lo"
fi
fi
;;
ft.crc32 = crc32_loongarch64;
}
#endif
+#ifdef LOONGARCH_LSX
+ if (cf.loongarch.has_lsx) {
+ ft.slide_hash = slide_hash_lsx;
+ }
+#endif
+#ifdef LOONGARCH_LASX
+ if (cf.loongarch.has_lasx) {
+ ft.slide_hash = slide_hash_lasx;
+ }
+#endif
// Assign function pointers individually for atomic operation
FUNCTABLE_ASSIGN(ft, force_init);
#ifdef X86_AVX2
BENCHMARK_SLIDEHASH(avx2, slide_hash_avx2, test_cpu_features.x86.has_avx2);
#endif
+#ifdef LOONGARCH_LSX
+BENCHMARK_SLIDEHASH(lsx, slide_hash_lsx, test_cpu_features.loongarch.has_lsx);
+#endif
+#ifdef LOONGARCH_LASX
+BENCHMARK_SLIDEHASH(lasx, slide_hash_lasx, test_cpu_features.loongarch.has_lasx);
+#endif
#endif