From 5545321e6144c36842e4bdb717000c0098f5698c Mon Sep 17 00:00:00 2001 From: shuxinyang Date: Sun, 9 Mar 2014 17:20:02 -0700 Subject: [PATCH] Rewrite the loops such that gcc can vectorize them using saturated-sub on x86-64 architecture. This speeds up performance by some 7% on my Linux box with a Core i7 architecture. The original loop is legal to vectorize; gcc 4.7.* and 4.8.* somehow fail to catch this case. There is still room to squeeze more out of the vectorized code. However, since these loops now account for about 1.5% of execution time, it is not worthwhile to squeeze out more performance by hand-writing assembly. The original loops are guarded with "#ifdef NOT_TWEAK_COMPILER". By default, the modified version is picked up unless the code is compiled explicitly with -DNOT_TWEAK_COMPILER. --- deflate.c | 45 ++++++++++++++++++++++++++++++++++++++++++++- 1 file changed, 44 insertions(+), 1 deletion(-) diff --git a/deflate.c b/deflate.c index 78e32f7ae..185682dae 100644 --- a/deflate.c +++ b/deflate.c @@ -1313,15 +1313,44 @@ local void fill_window_c(s) */ n = s->hash_size; p = &s->head[n]; +#ifdef NOT_TWEAK_COMPILER do { unsigned m; m = *--p; *p = (Pos)(m >= wsize ? m-wsize : NIL); } while (--n); - +#else + /* As of this change, gcc (4.8.*) isn't able to vectorize + * this hot loop using saturated-subtraction on x86-64 architecture. + * To avoid this defect, we can change the loop such that + * o. the pointer advances forward, and + * o. demote the variable 'm' to be local to the loop, and + * choose type "Pos" (instead of 'unsigned int') for the + * variable to avoid unnecessary zero-extension. + */ + { + int i; + typeof(p) q = p - n; + for (i = 0; i < n; i++) { + Pos m = *q; + Pos t = wsize; + *q++ = (Pos)(m >= t ? m-t: NIL); + } + } + + /* The following three assignments are unnecessary as the variables + * p, n and m are dead at this point. The rationale for these + * statements is to make it easier for the reader to verify the two loops are + * equivalent. 
+ */ + p = p - n; + n = 0; + m = *p; +#endif /* NOT_TWEAK_COMPILER */ n = wsize; #ifndef FASTEST p = &s->prev[n]; +#ifdef NOT_TWEAK_COMPILER do { unsigned m; m = *--p; @@ -1330,6 +1359,20 @@ local void fill_window_c(s) * its value will never be used. */ } while (--n); +#else + { + int i; + typeof(p) q = p - n; + for (i = 0; i < n; i++) { + Pos m = *q; + Pos t = wsize; + *q++ = (Pos)(m >= t ? m-t: NIL); + } + p = p - n; + m = *p; + n = 0; + } +#endif /* NOT_TWEAK_COMPILER */ #endif more += wsize; } -- 2.47.3