From: Ilya Tokar
Date: Wed, 18 Sep 2024 21:36:37 +0000 (-0400)
Subject: Optimize compression by avoiding unpredictable branches
X-Git-Tag: v1.5.7^2~76^2~9
X-Git-Url: http://git.ipfire.org/cgi-bin/gitweb.cgi?a=commitdiff_plain;h=e8fce38954efd6bc58a5b0ec72dd26f41c8365e6;p=thirdparty%2Fzstd.git

Optimize compression by avoiding unpredictable branches

Avoid an unpredictable branch. Use a conditional move to generate an address
that is guaranteed to be safe, and compare unconditionally.

Instead of

    if (idx < limit && x[idx] == val)   // mispredicted idx < limit branch

do

    addr = cmov(safe, x + idx)
    if (*addr == val && idx < limit)    // almost always false, so well predicted

Using microbenchmarks from https://github.com/google/fleetbench,
I get a ~10% speed-up:

name                                                       old cpu/op   new cpu/op   delta
BM_ZSTD_COMPRESS_Fleet/compression_level:-7/window_log:15  1.46ns ± 3%  1.31ns ± 7%   -9.88%  (p=0.000 n=35+38)
BM_ZSTD_COMPRESS_Fleet/compression_level:-7/window_log:16  1.41ns ± 3%  1.28ns ± 3%   -9.56%  (p=0.000 n=36+39)
BM_ZSTD_COMPRESS_Fleet/compression_level:-5/window_log:15  1.61ns ± 1%  1.43ns ± 3%  -10.70%  (p=0.000 n=30+39)
BM_ZSTD_COMPRESS_Fleet/compression_level:-5/window_log:16  1.54ns ± 2%  1.39ns ± 3%   -9.21%  (p=0.000 n=37+39)
BM_ZSTD_COMPRESS_Fleet/compression_level:-3/window_log:15  1.82ns ± 2%  1.61ns ± 3%  -11.31%  (p=0.000 n=37+40)
BM_ZSTD_COMPRESS_Fleet/compression_level:-3/window_log:16  1.73ns ± 3%  1.56ns ± 3%   -9.50%  (p=0.000 n=38+39)
BM_ZSTD_COMPRESS_Fleet/compression_level:-1/window_log:15  2.12ns ± 2%  1.79ns ± 3%  -15.55%  (p=0.000 n=34+39)
BM_ZSTD_COMPRESS_Fleet/compression_level:-1/window_log:16  1.99ns ± 3%  1.72ns ± 3%  -13.70%  (p=0.000 n=38+38)
BM_ZSTD_COMPRESS_Fleet/compression_level:0/window_log:15   3.22ns ± 3%  2.94ns ± 3%   -8.67%  (p=0.000 n=38+40)
BM_ZSTD_COMPRESS_Fleet/compression_level:0/window_log:16   3.19ns ± 4%  2.86ns ± 4%  -10.55%  (p=0.000 n=40+38)
BM_ZSTD_COMPRESS_Fleet/compression_level:1/window_log:15   2.60ns ± 3%  2.22ns ± 3%  -14.53%  (p=0.000 n=40+39)
BM_ZSTD_COMPRESS_Fleet/compression_level:1/window_log:16   2.46ns ± 3%  2.13ns ± 2%  -13.67%  (p=0.000 n=39+36)
BM_ZSTD_COMPRESS_Fleet/compression_level:2/window_log:15   2.69ns ± 3%  2.46ns ± 3%   -8.63%  (p=0.000 n=37+39)
BM_ZSTD_COMPRESS_Fleet/compression_level:2/window_log:16   2.63ns ± 3%  2.36ns ± 3%  -10.47%  (p=0.000 n=40+40)
BM_ZSTD_COMPRESS_Fleet/compression_level:3/window_log:15   3.20ns ± 2%  2.95ns ± 3%   -7.94%  (p=0.000 n=35+40)
BM_ZSTD_COMPRESS_Fleet/compression_level:3/window_log:16   3.20ns ± 4%  2.87ns ± 4%  -10.33%  (p=0.000 n=40+40)

I've also measured the impact on internal workloads and saw a similar ~10%
improvement in performance, measured by CPU usage per byte of data.
---

diff --git a/lib/compress/zstd_compress_internal.h b/lib/compress/zstd_compress_internal.h
index ba1450852..cbcded28d 100644
--- a/lib/compress/zstd_compress_internal.h
+++ b/lib/compress/zstd_compress_internal.h
@@ -557,6 +557,23 @@ MEM_STATIC int ZSTD_cParam_withinBounds(ZSTD_cParameter cParam, int value)
     return 1;
 }
 
+/* ZSTD_selectAddr:
+ * @return a >= b ? trueAddr : falseAddr,
+ * tries to force branchless codegen. */
+MEM_STATIC const BYTE* ZSTD_selectAddr(U32 a, U32 b, const BYTE* trueAddr, const BYTE* falseAddr) {
+#if defined(__GNUC__) && defined(__x86_64__)
+    __asm__ (
+        "cmp %1, %2\n"
+        "cmova %3, %0\n"
+        : "+r"(trueAddr)
+        : "r"(a), "r"(b), "r"(falseAddr)
+        );
+    return trueAddr;
+#else
+    return a >= b ? trueAddr : falseAddr;
+#endif
+}
+
 /* ZSTD_noCompressBlock() :
  * Writes uncompressed block to dst buffer from given src.
 * Returns the size of the block */

diff --git a/lib/compress/zstd_double_fast.c b/lib/compress/zstd_double_fast.c
index 1b5f64f20..819658caf 100644
--- a/lib/compress/zstd_double_fast.c
+++ b/lib/compress/zstd_double_fast.c
@@ -140,11 +140,17 @@ size_t ZSTD_compressBlock_doubleFast_noDict_generic(
     U32 idxl1;                 /* the long match index for ip1 */
 
     const BYTE* matchl0;       /* the long match for ip */
+    const BYTE* matchl0_safe;  /* matchl0 or safe address */
     const BYTE* matchs0;       /* the short match for ip */
     const BYTE* matchl1;       /* the long match for ip1 */
+    const BYTE* matchs0_safe;  /* matchs0 or safe address */
 
     const BYTE* ip = istart;   /* the current position */
     const BYTE* ip1;           /* the next position */
+    /* Array of ~random data, should have low probability of matching data
+     * we load from here instead of from tables, if matchl0/matchl1 are
+     * invalid indices. Used to avoid unpredictable branches. */
+    const BYTE dummy[] = {0x12,0x34,0x56,0x78,0x9a,0xbc,0xde,0xf0,0xe2,0xb4};
 
     DEBUGLOG(5, "ZSTD_compressBlock_doubleFast_noDict_generic");
 
@@ -191,24 +197,29 @@ size_t ZSTD_compressBlock_doubleFast_noDict_generic(
         hl1 = ZSTD_hashPtr(ip1, hBitsL, 8);
 
-        if (idxl0 > prefixLowestIndex) {
-            /* check prefix long match */
-            if (MEM_read64(matchl0) == MEM_read64(ip)) {
-                mLength = ZSTD_count(ip+8, matchl0+8, iend) + 8;
-                offset = (U32)(ip-matchl0);
-                while (((ip>anchor) & (matchl0>prefixLowest)) && (ip[-1] == matchl0[-1])) { ip--; matchl0--; mLength++; } /* catch up */
-                goto _match_found;
-            }
+        /* idxl0 > prefixLowestIndex is a (somewhat) unpredictable branch.
+         * However expression below compiles into conditional move. Since
+         * match is unlikely and we only *branch* on idxl0 > prefixLowestIndex
+         * if there is a match, all branches become predictable. */
+        matchl0_safe = ZSTD_selectAddr(prefixLowestIndex, idxl0, &dummy[0], matchl0);
+
+        /* check prefix long match */
+        if (MEM_read64(matchl0_safe) == MEM_read64(ip) && matchl0_safe == matchl0) {
+            mLength = ZSTD_count(ip+8, matchl0+8, iend) + 8;
+            offset = (U32)(ip-matchl0);
+            while (((ip>anchor) & (matchl0>prefixLowest)) && (ip[-1] == matchl0[-1])) { ip--; matchl0--; mLength++; } /* catch up */
+            goto _match_found;
         }
 
         idxl1 = hashLong[hl1];
         matchl1 = base + idxl1;
 
-        if (idxs0 > prefixLowestIndex) {
-            /* check prefix short match */
-            if (MEM_read32(matchs0) == MEM_read32(ip)) {
-                goto _search_next_long;
-            }
+        /* Same optimization as matchl0 above */
+        matchs0_safe = ZSTD_selectAddr(prefixLowestIndex, idxs0, &dummy[0], matchs0);
+
+        /* check prefix short match */
+        if(MEM_read32(matchs0_safe) == MEM_read32(ip) && matchs0_safe == matchs0) {
+            goto _search_next_long;
         }
 
         if (ip1 >= nextStep) {
diff --git a/lib/compress/zstd_fast.c b/lib/compress/zstd_fast.c
index c6baa49d4..838a18ee5 100644
--- a/lib/compress/zstd_fast.c
+++ b/lib/compress/zstd_fast.c
@@ -162,6 +162,11 @@ size_t ZSTD_compressBlock_fast_noDict_generic(
     const BYTE* const prefixStart = base + prefixStartIndex;
     const BYTE* const iend = istart + srcSize;
    const BYTE* const ilimit = iend - HASH_READ_SIZE;
+    /* Array of ~random data, should have low probability of matching data
+     * we load from here instead of from tables, if the index is invalid.
+     * Used to avoid unpredictable branches. */
+    const BYTE dummy[] = {0x12,0x34,0x56,0x78,0x9a,0xbc,0xde,0xf0,0xe2,0xb4};
+    const BYTE *mvalAddr;
 
     const BYTE* anchor = istart;
     const BYTE* ip0 = istart;
@@ -246,15 +251,18 @@ _start: /* Requires: ip0 */
             goto _match;
         }
 
+        /* idx >= prefixStartIndex is a (somewhat) unpredictable branch.
+         * However expression below compiles into conditional move. Since
+         * match is unlikely and we only *branch* on idx >= prefixStartIndex
+         * if there is a match, all branches become predictable. */
+        mvalAddr = base + idx;
+        mvalAddr = ZSTD_selectAddr(idx, prefixStartIndex, mvalAddr, &dummy[0]);
+
         /* load match for ip[0] */
-        if (idx >= prefixStartIndex) {
-            mval = MEM_read32(base + idx);
-        } else {
-            mval = MEM_read32(ip0) ^ 1; /* guaranteed to not match. */
-        }
+        mval = MEM_read32(mvalAddr);
 
         /* check match at ip[0] */
-        if (MEM_read32(ip0) == mval) {
+        if (MEM_read32(ip0) == mval && idx >= prefixStartIndex) {
             /* found a match! */
 
             /* First write next hash table entry; we've already calculated it.
@@ -281,15 +289,15 @@ _start: /* Requires: ip0 */
         current0 = (U32)(ip0 - base);
         hashTable[hash0] = current0;
 
+        mvalAddr = base + idx;
+        mvalAddr = ZSTD_selectAddr(idx, prefixStartIndex, mvalAddr, &dummy[0]);
+
         /* load match for ip[0] */
-        if (idx >= prefixStartIndex) {
-            mval = MEM_read32(base + idx);
-        } else {
-            mval = MEM_read32(ip0) ^ 1; /* guaranteed to not match. */
-        }
+        mval = MEM_read32(mvalAddr);
+
         /* check match at ip[0] */
-        if (MEM_read32(ip0) == mval) {
+        if (MEM_read32(ip0) == mval && idx >= prefixStartIndex) {
             /* found a match! */
 
             /* first write next hash table entry; we've already calculated it */
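
For readers who want to try the pattern outside of zstd, below is a minimal,
self-contained sketch. The helper mirrors the ZSTD_selectAddr added above;
everything else (select_addr, dummy, read32, probe, the toy table in main) is
illustrative and not part of the patch. The key point is that the load always
comes from a valid address, so the bounds check folds into the rarely-taken
match branch instead of being a branch of its own. Compiling with
optimizations (e.g. cc -O2) on x86-64 should show a cmov in place of the
bounds-check branch.

#include <stdint.h>
#include <stdio.h>
#include <string.h>

/* Return a >= b ? trueAddr : falseAddr, trying to force a conditional move.
 * Mirrors the ZSTD_selectAddr helper; the plain ternary is the portable fallback. */
static const uint8_t* select_addr(uint32_t a, uint32_t b,
                                  const uint8_t* trueAddr, const uint8_t* falseAddr)
{
#if defined(__GNUC__) && defined(__x86_64__)
    __asm__ (
        "cmp %1, %2\n"
        "cmova %3, %0\n"
        : "+r"(trueAddr)
        : "r"(a), "r"(b), "r"(falseAddr)
        );
    return trueAddr;
#else
    return a >= b ? trueAddr : falseAddr;
#endif
}

/* Safe fallback bytes: always readable, unlikely to equal the value probed for. */
static const uint8_t dummy[8] = { 0x12, 0x34, 0x56, 0x78, 0x9a, 0xbc, 0xde, 0xf0 };

/* Unaligned 32-bit load, standing in for MEM_read32(). */
static uint32_t read32(const void* p) { uint32_t v; memcpy(&v, p, sizeof(v)); return v; }

/* Before: if (idx >= limit && read32(base + idx) == val)  -- idx >= limit mispredicts.
 * After:  always load from a safe address, branch only on the (rare) value match. */
static int probe(const uint8_t* base, uint32_t idx, uint32_t limit, uint32_t val)
{
    const uint8_t* addr = select_addr(idx, limit, base + idx, &dummy[0]);
    return (read32(addr) == val) && (idx >= limit);  /* almost always false, so well predicted */
}

int main(void)
{
    uint8_t table[64] = { 0 };
    const uint32_t val = read32("abcd");
    memcpy(table + 32, "abcd", 4);
    printf("%d\n", probe(table, 32, 16, val));  /* 1: index valid, bytes match */
    printf("%d\n", probe(table,  8, 16, val));  /* 0: index below limit, dummy load can't match */
    return 0;
}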