Excessive loop unrolling is detrimental to performance. This patch
adds a preprocessor define, ADLER32_UNROLL_LESS, that reduces the
unrolling factor from 16 to 8.
It also updates the configure script to set ADLER32_UNROLL_LESS by default on x86.
/* do length NMAX blocks -- requires just one modulo operation */
while (len >= NMAX) {
len -= NMAX;
+#ifndef ADLER32_UNROLL_LESS
n = NMAX / 16; /* NMAX is divisible by 16 */
+#else
+ n = NMAX / 8; /* NMAX is divisible by 8 */
+#endif
do {
+#ifndef ADLER32_UNROLL_LESS
DO16(buf); /* 16 sums unrolled */
buf += 16;
+#else
+ DO8(buf,0); /* 8 sums unrolled */
+ buf += 8;
+#endif
} while (--n);
MOD(adler);
MOD(sum2);
/* do remaining bytes (less than NMAX, still just one modulo) */
if (len) { /* avoid modulos if none remaining */
+#ifndef ADLER32_UNROLL_LESS
while (len >= 16) {
len -= 16;
DO16(buf);
buf += 16;
+#else
+ while (len >= 8) {
+ len -= 8;
+ DO8(buf, 0);
+ buf += 8;
+#endif
}
while (len--) {
adler += *buf++;
CFLAGS="${CFLAGS} -DUNALIGNED_OK"
SFLAGS="${SFLAGS} -DUNALIGNED_OK"
+
+ CFLAGS="${CFLAGS} -DADLER32_UNROLL_LESS"
+ SFLAGS="${SFLAGS} -DADLER32_UNROLL_LESS"
;;
i386 | i486 | i586 | i686)
OBJC="${OBJC} x86.o"
CFLAGS="${CFLAGS} -DUNALIGNED_OK"
SFLAGS="${SFLAGS} -DUNALIGNED_OK"
+
+ CFLAGS="${CFLAGS} -DADLER32_UNROLL_LESS"
+ SFLAGS="${SFLAGS} -DADLER32_UNROLL_LESS"
;;
esac