]> git.ipfire.org Git - thirdparty/zlib-ng.git/commitdiff
Add slide_hash to functable, and enable the sse2-optimized version.
authorHans Kristian Rosbach <hk-git@circlestorm.org>
Fri, 23 Aug 2019 18:25:26 +0000 (20:25 +0200)
committerHans Kristian Rosbach <hk-github@circlestorm.org>
Wed, 4 Sep 2019 06:53:36 +0000 (08:53 +0200)
Add necessary code to cmake and configure.
Fix slide_hash_sse2 to compile with zlib-ng.

CMakeLists.txt
arch/x86/slide_sse.c
configure
deflate.c
deflate.h
functable.c
functable.h

index 49a5f92703eb5cbd41252306cb9a026ee91038f3..7f709f92ceb6eacc66473c7f5dba2bf8e444c010 100644 (file)
@@ -592,6 +592,7 @@ if(WITH_OPTIM)
         if(HAVE_SSE2_INTRIN)
             add_definitions(-DX86_SSE2)
             set(ZLIB_ARCH_SRCS ${ZLIB_ARCH_SRCS} ${ARCHDIR}/fill_window_sse.c)
+            set(ZLIB_ARCH_SRCS ${ZLIB_ARCH_SRCS} ${ARCHDIR}/slide_sse.c)
             if(NOT ${ARCH} MATCHES "x86_64")
                 add_intrinsics_option("${SSE2FLAG}")
                 add_feature_info(FORCE_SSE2 FORCE_SSE2 "Assume CPU is SSE2 capable")
index 342fd562dd11521c73075f5a707acddd138d2fba..9d2ca2aa21ead6681f63e2b7aeb2b21cd8ec6894 100644 (file)
@@ -8,45 +8,39 @@
  *
  * For conditions of distribution and use, see copyright notice in zlib.h
  */
-#include "deflate.h"
+#include "../../zbuild.h"
+#include "../../deflate.h"
 
-#ifdef USE_SSE_SLIDE
 #include <immintrin.h>
 
-void slide_hash_sse(deflate_state *s)
-{
+ZLIB_INTERNAL void slide_hash_sse2(deflate_state *s) {
+    Pos *p;
     unsigned n;
-    Posf *p;
-    uInt wsize = s->w_size;
-    z_const __m128i xmm_wsize = _mm_set1_epi16(s->w_size);
+    unsigned wsize = s->w_size;
+    const __m128i xmm_wsize = _mm_set1_epi16(s->w_size);
 
     n = s->hash_size;
     p = &s->head[n] - 8;
     do {
         __m128i value, result;
 
-       value = _mm_loadu_si128((__m128i *)p);
-       result= _mm_subs_epu16(value, xmm_wsize);
-       _mm_storeu_si128((__m128i *)p, result);
-       p -= 8;
-       n -= 8;
+        value = _mm_loadu_si128((__m128i *)p);
+        result= _mm_subs_epu16(value, xmm_wsize);
+        _mm_storeu_si128((__m128i *)p, result);
+        p -= 8;
+        n -= 8;
     } while (n > 0);
 
-#ifndef FASTEST
     n = wsize;
     p = &s->prev[n] - 8;
     do {
         __m128i value, result;
 
-       value = _mm_loadu_si128((__m128i *)p);
-       result= _mm_subs_epu16(value, xmm_wsize);
-       _mm_storeu_si128((__m128i *)p, result);
+        value = _mm_loadu_si128((__m128i *)p);
+        result= _mm_subs_epu16(value, xmm_wsize);
+        _mm_storeu_si128((__m128i *)p, result);
 
-       p -= 8;
-       n -= 8;
+        p -= 8;
+        n -= 8;
     } while (n > 0);
-#endif
 }
-
-#endif
-
index a6c7aa07924e599db1050e701727610b87301304..a00e7cf12657c7aa24fe99cbb9d1d36e3d8c2492 100755 (executable)
--- a/configure
+++ b/configure
@@ -994,8 +994,8 @@ case "${ARCH}" in
             if test ${HAVE_SSE2_INTRIN} -eq 1; then
                 CFLAGS="${CFLAGS} -DX86_SSE2"
                 SFLAGS="${SFLAGS} -DX86_SSE2"
-                ARCH_STATIC_OBJS="${ARCH_STATIC_OBJS} fill_window_sse.o"
-                ARCH_SHARED_OBJS="${ARCH_SHARED_OBJS} fill_window_sse.lo"
+                ARCH_STATIC_OBJS="${ARCH_STATIC_OBJS} fill_window_sse.o slide_sse.o"
+                ARCH_SHARED_OBJS="${ARCH_SHARED_OBJS} fill_window_sse.lo slide_sse.lo"
 
                 if test $forcesse2 -eq 1; then
                     CFLAGS="${CFLAGS} -DX86_NOCHECK_SSE2"
@@ -1045,8 +1045,8 @@ case "${ARCH}" in
             CFLAGS="${CFLAGS} -DX86_CPUID -DX86_SSE2 -DX86_SSE42_CRC_HASH"
             SFLAGS="${SFLAGS} -DX86_CPUID -DX86_SSE2 -DX86_SSE42_CRC_HASH"
 
-            ARCH_STATIC_OBJS="${ARCH_STATIC_OBJS} x86.o fill_window_sse.o insert_string_sse.o"
-            ARCH_SHARED_OBJS="${ARCH_SHARED_OBJS} x86.lo fill_window_sse.lo insert_string_sse.lo"
+            ARCH_STATIC_OBJS="${ARCH_STATIC_OBJS} x86.o fill_window_sse.o insert_string_sse.o slide_sse.o"
+            ARCH_SHARED_OBJS="${ARCH_SHARED_OBJS} x86.lo fill_window_sse.lo insert_string_sse.lo slide_sse.lo"
 
             if test ${HAVE_SSE42CRC_INTRIN} -eq 1; then
                 CFLAGS="${CFLAGS} -DX86_SSE42_CRC_INTRIN"
index 5380b962134242dd9a7618d657063c829f7e06ea..8ee4f1be9a1542b4c86c2a2dc22cded5e7d846e2 100644 (file)
--- a/deflate.c
+++ b/deflate.c
@@ -105,7 +105,6 @@ typedef block_state (*compress_func) (deflate_state *s, int flush);
 /* Compression function. Returns the block state after the call. */
 
 static int deflateStateCheck      (PREFIX3(stream) *strm);
-static void slide_hash            (deflate_state *s);
 static block_state deflate_stored (deflate_state *s, int flush);
 ZLIB_INTERNAL block_state deflate_fast         (deflate_state *s, int flush);
 ZLIB_INTERNAL block_state deflate_quick        (deflate_state *s, int flush);
@@ -196,7 +195,7 @@ static const config configuration_table[10] = {
  * bit values at the expense of memory usage). We slide even when level == 0 to
  * keep the hash table consistent if we switch back to level > 0 later.
  */
-static void slide_hash(deflate_state *s) {
+ZLIB_INTERNAL void slide_hash_c(deflate_state *s) {
     unsigned n;
     Pos *p;
     unsigned int wsize = s->w_size;
@@ -639,7 +638,7 @@ int ZEXPORT PREFIX(deflateParams)(PREFIX3(stream) *strm, int level, int strategy
     if (s->level != level) {
         if (s->level == 0 && s->matches != 0) {
             if (s->matches == 1) {
-                slide_hash(s);
+                functable.slide_hash(s);
             } else {
                 CLEAR_HASH(s);
             }
@@ -1297,7 +1296,7 @@ void ZLIB_INTERNAL fill_window_c(deflate_state *s) {
             s->block_start -= (long) wsize;
             if (s->insert > s->strstart)
                 s->insert = s->strstart;
-            slide_hash(s);
+            functable.slide_hash(s);
             more += wsize;
         }
         if (s->strm->avail_in == 0)
index 99a4f5ca14606353f98cb61aae7973d6cea7ae30..a47cb72b059fe275b76d5fb88da9c6359ea5468b 100644 (file)
--- a/deflate.h
+++ b/deflate.h
@@ -330,6 +330,7 @@ static inline void put_short(deflate_state *s, uint16_t w) {
 
 
 void ZLIB_INTERNAL fill_window_c(deflate_state *s);
+void ZLIB_INTERNAL slide_hash_c(deflate_state *s);
 
         /* in trees.c */
 void ZLIB_INTERNAL zng_tr_init(deflate_state *s);
index b3020e000fce105a5aac9bb395a06388cc107fea..8ae6960529e9f4dc1d6892b8c0831897b77dfaed 100644 (file)
@@ -23,12 +23,18 @@ extern void fill_window_sse(deflate_state *s);
 extern void fill_window_arm(deflate_state *s);
 #endif
 
+/* slide_hash */
+#ifdef X86_SSE2
+void slide_hash_sse2(deflate_state *s);
+#endif
+
 /* adler32 */
 extern uint32_t adler32_c(uint32_t adler, const unsigned char *buf, size_t len);
 #if (defined(__ARM_NEON__) || defined(__ARM_NEON)) && defined(ARM_NEON_ADLER32)
 extern uint32_t adler32_neon(uint32_t adler, const unsigned char *buf, size_t len);
 #endif
 
+/* CRC32 */
 ZLIB_INTERNAL uint32_t crc32_generic(uint32_t, const unsigned char *, uint64_t);
 
 #ifdef DYNAMIC_CRC_TABLE
@@ -46,14 +52,22 @@ extern uint32_t crc32_little(uint32_t, const unsigned char *, uint64_t);
 extern uint32_t crc32_big(uint32_t, const unsigned char *, uint64_t);
 #endif
 
+
 /* stub definitions */
 ZLIB_INTERNAL Pos insert_string_stub(deflate_state *const s, const Pos str, unsigned int count);
 ZLIB_INTERNAL void fill_window_stub(deflate_state *s);
 ZLIB_INTERNAL uint32_t adler32_stub(uint32_t adler, const unsigned char *buf, size_t len);
 ZLIB_INTERNAL uint32_t crc32_stub(uint32_t crc, const unsigned char *buf, uint64_t len);
+ZLIB_INTERNAL void slide_hash_stub(deflate_state *s);
 
 /* functable init */
-ZLIB_INTERNAL __thread struct functable_s functable = {fill_window_stub,insert_string_stub,adler32_stub,crc32_stub};
+ZLIB_INTERNAL __thread struct functable_s functable = {
+                                            fill_window_stub,
+                                            insert_string_stub,
+                                            adler32_stub,
+                                            crc32_stub,
+                                            slide_hash_stub
+                                          };
 
 
 /* stub functions */
@@ -88,6 +102,20 @@ ZLIB_INTERNAL void fill_window_stub(deflate_state *s) {
     functable.fill_window(s);
 }
 
+ZLIB_INTERNAL void slide_hash_stub(deflate_state *s) {
+    // Initialize default
+    functable.slide_hash=&slide_hash_c;
+
+    #ifdef X86_SSE2
+    # if !defined(__x86_64__) && !defined(_M_X64) && !defined(X86_NOCHECK_SSE2)
+    if (x86_cpu_has_sse2)
+    # endif
+        functable.slide_hash=&slide_hash_sse2;
+    #endif
+
+    functable.slide_hash(s);
+}
+
 ZLIB_INTERNAL uint32_t adler32_stub(uint32_t adler, const unsigned char *buf, size_t len) {
     // Initialize default
     functable.adler32=&adler32_c;
index 280651c32944ebab055ee6aeb4ac39876d4ecac2..a9c8e9b53192691901172b9e41bed0cc0caaa86c 100644 (file)
@@ -13,6 +13,7 @@ struct functable_s {
     Pos      (* insert_string)  (deflate_state *const s, const Pos str, unsigned int count);
     uint32_t (* adler32)        (uint32_t adler, const unsigned char *buf, size_t len);
     uint32_t (* crc32)          (uint32_t crc, const unsigned char *buf, uint64_t len);
+    void     (* slide_hash)     (deflate_state *s);
 };
 
 ZLIB_INTERNAL extern __thread struct functable_s functable;