]> git.ipfire.org Git - thirdparty/zlib-ng.git/commitdiff
Rewrite the loops such that gcc can vectorize them using saturated-sub
authorshuxinyang <syang@shuxinyangs-mbp.gateway.2wire.net>
Mon, 10 Mar 2014 00:20:02 +0000 (17:20 -0700)
committerhansr <hansr@hk.drivdigital.no>
Tue, 7 Oct 2014 12:45:00 +0000 (14:45 +0200)
on x86-64 architecture. This speeds up performance by some 7% on my Linux box
with the Core i7 architecture.

  The original loop is legal to vectorize; gcc 4.7.* and 4.8.*
somehow fail to catch this case. There is still room to squeeze more
out of the vectorized code. However, since these loops now account for about
1.5% of execution time, it is not worthwhile to squeeze out more performance
by hand-writing assembly.

  The original loops are guarded with "#ifdef NOT_TWEAK_COMPILER". By
default, the modified version is picked up unless the code is compiled
explicitly with -DNOT_TWEAK_COMPILER.

deflate.c

index 78e32f7ae27b06fa960a3a717ed294cb0170d576..185682daeaf921c35f9b485c33191618f067c437 100644 (file)
--- a/deflate.c
+++ b/deflate.c
@@ -1313,15 +1313,44 @@ local void fill_window_c(s)
              */
             n = s->hash_size;
             p = &s->head[n];
+#ifdef NOT_TWEAK_COMPILER
             do {
                 unsigned m;
                 m = *--p;
                 *p = (Pos)(m >= wsize ? m-wsize : NIL);
             } while (--n);
-
+#else
+            /* At the time of this change, gcc (4.8.*) isn't able to vectorize
+             * this hot loop using saturated-subtraction on x86-64 architecture.
+             * To avoid this defect, we can change the loop such that
+             *    o. the pointer advances forward, and
+             *    o. the variable 'm' is demoted to be local to the loop, and
+             *       given type "Pos" (instead of 'unsigned int') to avoid
+             *       unnecessary zero-extension.
+             */
+            {
+                int i; 
+                typeof(p) q = p - n;
+                for (i = 0; i < n; i++) {
+                    Pos m = *q;
+                    Pos t = wsize;
+                    *q++ = (Pos)(m >= t ? m-t: NIL);
+                }
+            }
+            
+            /* The following three assignments are unnecessary as the variables
+             * p, n and m are dead at this point. The rationale for these
+             * statements is to make it easier for the reader to verify that
+             * the two loops are equivalent.
+             */
+            p = p - n;
+            n = 0;
+            m = *p;
+#endif /* NOT_TWEAK_COMPILER */
             n = wsize;
 #ifndef FASTEST
             p = &s->prev[n];
+#ifdef NOT_TWEAK_COMPILER
             do {
                 unsigned m;
                 m = *--p;
@@ -1330,6 +1359,20 @@ local void fill_window_c(s)
                  * its value will never be used.
                  */
             } while (--n);
+#else
+            {
+                int i; 
+                typeof(p) q = p - n;
+                for (i = 0; i < n; i++) {
+                    Pos m = *q;
+                    Pos t = wsize;
+                    *q++ = (Pos)(m >= t ? m-t: NIL);
+                }
+                p = p - n;
+                m = *p;
+                n = 0;
+            }
+#endif /* NOT_TWEAK_COMPILER */
 #endif
             more += wsize;
         }