git.ipfire.org Git - thirdparty/vectorscan.git/commitdiff
Reinforced Teddy with 1-byte approach, based on "shift-or" and AVX2.
author: Chang, Harry <harry.chang@intel.com>
Sun, 22 Jan 2017 20:23:25 +0000 (12:23 -0800)
committer: Matthew Barr <matthew.barr@intel.com>
Mon, 21 Aug 2017 01:10:11 +0000 (11:10 +1000)
src/fdr/fdr_confirm.h
src/fdr/fdr_confirm_compile.cpp
src/fdr/fdr_confirm_runtime.h
src/fdr/teddy.c
src/fdr/teddy_avx2.c
src/fdr/teddy_compile.cpp
src/fdr/teddy_engine_description.cpp
src/fdr/teddy_engine_description.h
src/fdr/teddy_internal.h
src/fdr/teddy_runtime_common.h

index e160b96d149ca3085d16270607bdfa7cc876f918..d975747e61d95af53e6c6a5d83f0b8a034737e52 100644 (file)
@@ -78,12 +78,8 @@ struct LitInfo {
 struct FDRConfirm {
     CONF_TYPE andmsk;
     CONF_TYPE mult;
-    u32 nBitsOrSoleID; // if flags is NO_CONFIRM then this is soleID
-    u32 flags;  // sole meaning is 'non-zero means no-confirm' (that is all)
+    u32 nBits;
     hwlm_group_t groups;
-    u32 soleLitSize;
-    u32 soleLitCmp;
-    u32 soleLitMsk;
 };
 
 static really_inline
index 616ff86ed057e8fbc83b5dd3fd7b120380019228..a6eee4cfa21727648a457a9b45bec0349dfa9159 100644 (file)
@@ -130,7 +130,7 @@ void fillLitInfo(const vector<hwlmLiteral> &lits, vector<LitInfo> &tmpLitInfo,
 
 static
 bytecode_ptr<FDRConfirm> getFDRConfirm(const vector<hwlmLiteral> &lits,
-                                       bool make_small, bool make_confirm) {
+                                       bool make_small) {
     // Every literal must fit within CONF_TYPE.
     assert(all_of_in(lits, [](const hwlmLiteral &lit) {
         return lit.s.size() <= sizeof(CONF_TYPE);
@@ -153,42 +153,6 @@ bytecode_ptr<FDRConfirm> getFDRConfirm(const vector<hwlmLiteral> &lits,
     }
 
     CONF_TYPE mult = (CONF_TYPE)0x0b4e0ef37bc32127ULL;
-    u32 flags = 0;
-    // we use next three variables for 'confirmless' case to speed-up
-    // confirmation process
-    u32 soleLitSize = 0;
-    u32 soleLitCmp = 0;
-    u32 soleLitMsk = 0;
-
-    if (!make_confirm) {
-        flags = FDRC_FLAG_NO_CONFIRM;
-        if (lits[0].noruns) {
-            // messy - need to clean this up later as flags is sorta kinda
-            // obsoleted
-            flags |= FDRC_FLAG_NOREPEAT;
-        }
-        mult = 0;
-        soleLitSize = lits[0].s.size() - 1;
-        // we can get to this point only in confirmless case;
-        // it means that we have only one literal per FDRConfirm (no packing),
-        // with no literal mask and size of literal is less or equal
-        // to the number of masks of Teddy engine;
-        // maximum number of masks for Teddy is 4, so the size of
-        // literal is definitely less or equal to size of u32
-        assert(lits[0].s.size() <= sizeof(u32));
-        for (u32 i = 0; i < lits[0].s.size(); i++) {
-            u32 shiftLoc = (sizeof(u32) - i - 1) * 8;
-            u8 c = lits[0].s[lits[0].s.size() - i - 1];
-            if (lits[0].nocase && ourisalpha(c)) {
-                soleLitCmp |= (u32)(c & CASE_CLEAR) << shiftLoc;
-                soleLitMsk |= (u32)CASE_CLEAR << shiftLoc;
-            }
-            else {
-                soleLitCmp |= (u32)c << shiftLoc;
-                soleLitMsk |= (u32)0xff << shiftLoc;
-            }
-        }
-    }
 
     // we can walk the vector and assign elements from the vectors to a
     // map by hash value
@@ -276,11 +240,7 @@ bytecode_ptr<FDRConfirm> getFDRConfirm(const vector<hwlmLiteral> &lits,
 
     fdrc->andmsk = andmsk;
     fdrc->mult = mult;
-    fdrc->nBitsOrSoleID = (flags & FDRC_FLAG_NO_CONFIRM) ? lits[0].id : nBits;
-    fdrc->flags = flags;
-    fdrc->soleLitSize = soleLitSize;
-    fdrc->soleLitCmp = soleLitCmp;
-    fdrc->soleLitMsk = soleLitMsk;
+    fdrc->nBits = nBits;
 
     fdrc->groups = gm;
 
@@ -334,12 +294,8 @@ setupFullConfs(const vector<hwlmLiteral> &lits,
                const EngineDescription &eng,
                map<BucketIndex, vector<LiteralIndex>> &bucketToLits,
                bool make_small) {
-    bool makeConfirm = true;
     unique_ptr<TeddyEngineDescription> teddyDescr =
         getTeddyDescription(eng.getID());
-    if (teddyDescr) {
-        makeConfirm = teddyDescr->needConfirm(lits);
-    }
 
     BC2CONF bc2Conf;
     u32 totalConfirmSize = 0;
@@ -351,7 +307,7 @@ setupFullConfs(const vector<hwlmLiteral> &lits,
             }
 
             DEBUG_PRINTF("b %d sz %zu\n", b, vl.size());
-            auto fc = getFDRConfirm(vl, make_small, makeConfirm);
+            auto fc = getFDRConfirm(vl, make_small);
             totalConfirmSize += fc.size();
             bc2Conf.emplace(b, move(fc));
         }
index ea644bfb694746a748d525e5f214088bcaf70362..d75408f43469b7a52134e3456b7bbc6f4a25c1ee 100644 (file)
@@ -43,11 +43,12 @@ void confWithBit(const struct FDRConfirm *fdrc, const struct FDR_Runtime_Args *a
                  size_t i, hwlmcb_rv_t *control, u32 *last_match,
                  u64a conf_key) {
     assert(i < a->len);
+    assert(i >= a->start_offset);
     assert(ISALIGNED(fdrc));
 
     const u8 * buf = a->buf;
     u32 c = CONF_HASH_CALL(conf_key, fdrc->andmsk, fdrc->mult,
-                           fdrc->nBitsOrSoleID);
+                           fdrc->nBits);
     u32 start = getConfirmLitIndex(fdrc)[c];
     if (likely(!start)) {
         return;
@@ -94,80 +95,4 @@ void confWithBit(const struct FDRConfirm *fdrc, const struct FDR_Runtime_Args *a
     } while (oldNext);
 }
 
-// 'light-weight' confirmation function which is used by 1-mask Teddy;
-// in the 'confirmless' case it simply calls callback function,
-// otherwise it calls 'confWithBit' function for the full confirmation procedure
-static really_inline
-void confWithBit1(const struct FDRConfirm *fdrc,
-                  const struct FDR_Runtime_Args *a, size_t i,
-                  hwlmcb_rv_t *control, u32 *last_match, u64a conf_key) {
-    assert(i < a->len);
-    assert(ISALIGNED(fdrc));
-
-    if (unlikely(fdrc->mult)) {
-        confWithBit(fdrc, a, i, control, last_match, conf_key);
-        return;
-    } else {
-        u32 id = fdrc->nBitsOrSoleID;
-
-        if ((*last_match == id) && (fdrc->flags & FDRC_FLAG_NOREPEAT)) {
-            return;
-        }
-        *last_match = id;
-        *control = a->cb(i, i, id, a->ctxt);
-    }
-}
-
-// This is 'light-weight' confirmation function which is used by 2-3-4-mask Teddy
-// In the 'confirmless' case it makes fast 32-bit comparison,
-// otherwise it calls 'confWithBit' function for the full confirmation procedure
-static really_inline
-void confWithBitMany(const struct FDRConfirm *fdrc,
-                     const struct FDR_Runtime_Args *a, size_t i, CautionReason r,
-                     hwlmcb_rv_t *control, u32 *last_match, u64a conf_key) {
-    assert(i < a->len);
-    assert(ISALIGNED(fdrc));
-
-    if (i < a->start_offset) {
-        return;
-    }
-
-    if (unlikely(fdrc->mult)) {
-        confWithBit(fdrc, a, i, control, last_match, conf_key);
-        return;
-    } else {
-        const u32 id = fdrc->nBitsOrSoleID;
-        const u32 len = fdrc->soleLitSize;
-
-        if ((*last_match == id) && (fdrc->flags & FDRC_FLAG_NOREPEAT)) {
-            return;
-        }
-
-        if (r == VECTORING && len > i - a->start_offset) {
-            if (len > i + a->len_history) {
-                return;
-            }
-
-            u32 cmp = (u32)a->buf[i] << 24;
-
-            if (len <= i) {
-                for (u32 j = 1; j <= len; j++) {
-                    cmp |= (u32)a->buf[i - j] << (24 - (j * 8));
-                }
-            } else {
-                for (u32 j = 1; j <= i; j++) {
-                    cmp |= (u32)a->buf[i - j] << (24 - (j * 8));
-                }
-                cmp |= (u32)(a->histBytes >> (40 + i * 8));
-            }
-
-            if ((fdrc->soleLitMsk & cmp) != fdrc->soleLitCmp) {
-               return;
-            }
-        }
-        *last_match = id;
-        *control = a->cb(i - len, i, id, a->ctxt);
-    }
-}
-
 #endif
index 636c741b445ba187afe43ff598abe64b44c57a90..da5096a00330060130d4bd04d68b4333bddebc36 100644 (file)
 #include "util/simd_utils.h"
 
 const u8 ALIGN_DIRECTIVE p_mask_arr[17][32] = {
-    {0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
-     0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00},
-    {0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
-     0xff, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00},
-    {0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
-     0xff, 0xff, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00},
-    {0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
-     0xff, 0xff, 0xff, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00},
-    {0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
-     0xff, 0xff, 0xff, 0xff, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00},
-    {0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
-     0xff, 0xff, 0xff, 0xff, 0xff, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00},
-    {0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
-     0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00},
-    {0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
-     0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00},
-    {0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
-     0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00},
-    {0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
-     0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00},
-    {0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
-     0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00},
-    {0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
-     0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0x00, 0x00, 0x00, 0x00, 0x00},
-    {0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
-     0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0x00, 0x00, 0x00, 0x00},
-    {0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
-     0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0x00, 0x00, 0x00},
-    {0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
-     0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0x00, 0x00},
-    {0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
-     0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0x00},
-    {0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
-     0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff}
+    {0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
+     0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff},
+    {0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
+     0x00, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff},
+    {0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
+     0x00, 0x00, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff},
+    {0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
+     0x00, 0x00, 0x00, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff},
+    {0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
+     0x00, 0x00, 0x00, 0x00, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff},
+    {0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
+     0x00, 0x00, 0x00, 0x00, 0x00, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff},
+    {0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
+     0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff},
+    {0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
+     0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff},
+    {0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
+     0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff},
+    {0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
+     0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff},
+    {0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
+     0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff},
+    {0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
+     0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xff, 0xff, 0xff, 0xff, 0xff},
+    {0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
+     0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xff, 0xff, 0xff, 0xff},
+    {0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
+     0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xff, 0xff, 0xff},
+    {0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
+     0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xff, 0xff},
+    {0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
+     0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xff},
+    {0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
+     0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00}
 };
 
+#if defined(__AVX2__) // reinforced teddy
+
+#ifdef ARCH_64_BIT
+#define CONFIRM_TEDDY(var, bucket, offset, reason, conf_fn)                 \
+do {                                                                        \
+    if (unlikely(diff256(var, ones256()))) {                                \
+        m128 lo = movdq_lo(var);                                            \
+        m128 hi = movdq_hi(var);                                            \
+        u64a part1 = movq(lo);                                              \
+        u64a part2 = movq(rshiftbyte_m128(lo, 8));                          \
+        u64a part3 = movq(hi);                                              \
+        u64a part4 = movq(rshiftbyte_m128(hi, 8));                          \
+        if (unlikely(part1 != ones_u64a)) {                                 \
+            part1 = ~part1;                                                 \
+            conf_fn(&part1, bucket, offset, confBase, reason, a, ptr,       \
+                    &control, &last_match);                                 \
+            CHECK_HWLM_TERMINATE_MATCHING;                                  \
+        }                                                                   \
+        if (unlikely(part2 != ones_u64a)) {                                 \
+            part2 = ~part2;                                                 \
+            conf_fn(&part2, bucket, offset + 8, confBase, reason, a, ptr,   \
+                    &control, &last_match);                                 \
+            CHECK_HWLM_TERMINATE_MATCHING;                                  \
+        }                                                                   \
+        if (unlikely(part3 != ones_u64a)) {                                 \
+            part3 = ~part3;                                                 \
+            conf_fn(&part3, bucket, offset + 16, confBase, reason, a, ptr,  \
+                    &control, &last_match);                                 \
+            CHECK_HWLM_TERMINATE_MATCHING;                                  \
+        }                                                                   \
+        if (unlikely(part4 != ones_u64a)) {                                 \
+            part4 = ~part4;                                                 \
+            conf_fn(&part4, bucket, offset + 24, confBase, reason, a, ptr,  \
+                    &control, &last_match);                                 \
+            CHECK_HWLM_TERMINATE_MATCHING;                                  \
+        }                                                                   \
+    }                                                                       \
+} while(0)
+#else
+#define CONFIRM_TEDDY(var, bucket, offset, reason, conf_fn)                 \
+do {                                                                        \
+    if (unlikely(diff256(var, ones256()))) {                                \
+        m128 lo = movdq_lo(var);                                            \
+        m128 hi = movdq_hi(var);                                            \
+        u32 part1 = movd(lo);                                               \
+        u32 part2 = movd(rshiftbyte_m128(lo, 4));                           \
+        u32 part3 = movd(rshiftbyte_m128(lo, 8));                           \
+        u32 part4 = movd(rshiftbyte_m128(lo, 12));                          \
+        u32 part5 = movd(hi);                                               \
+        u32 part6 = movd(rshiftbyte_m128(hi, 4));                           \
+        u32 part7 = movd(rshiftbyte_m128(hi, 8));                           \
+        u32 part8 = movd(rshiftbyte_m128(hi, 12));                          \
+        if (unlikely(part1 != ones_u32)) {                                  \
+            part1 = ~part1;                                                 \
+            conf_fn(&part1, bucket, offset, confBase, reason, a, ptr,       \
+                    &control, &last_match);                                 \
+            CHECK_HWLM_TERMINATE_MATCHING;                                  \
+        }                                                                   \
+        if (unlikely(part2 != ones_u32)) {                                  \
+            part2 = ~part2;                                                 \
+            conf_fn(&part2, bucket, offset + 4, confBase, reason, a, ptr,   \
+                    &control, &last_match);                                 \
+            CHECK_HWLM_TERMINATE_MATCHING;                                  \
+        }                                                                   \
+        if (unlikely(part3 != ones_u32)) {                                  \
+            part3 = ~part3;                                                 \
+            conf_fn(&part3, bucket, offset + 8, confBase, reason, a, ptr,   \
+                    &control, &last_match);                                 \
+            CHECK_HWLM_TERMINATE_MATCHING;                                  \
+        }                                                                   \
+        if (unlikely(part4 != ones_u32)) {                                  \
+            part4 = ~part4;                                                 \
+            conf_fn(&part4, bucket, offset + 12, confBase, reason, a, ptr,  \
+                    &control, &last_match);                                 \
+            CHECK_HWLM_TERMINATE_MATCHING;                                  \
+        }                                                                   \
+        if (unlikely(part5 != ones_u32)) {                                  \
+            part5 = ~part5;                                                 \
+            conf_fn(&part5, bucket, offset + 16, confBase, reason, a, ptr,  \
+                    &control, &last_match);                                 \
+            CHECK_HWLM_TERMINATE_MATCHING;                                  \
+        }                                                                   \
+        if (unlikely(part6 != ones_u32)) {                                  \
+            part6 = ~part6;                                                 \
+            conf_fn(&part6, bucket, offset + 20, confBase, reason, a, ptr,  \
+                    &control, &last_match);                                 \
+            CHECK_HWLM_TERMINATE_MATCHING;                                  \
+        }                                                                   \
+        if (unlikely(part7 != ones_u32)) {                                  \
+            part7 = ~part7;                                                 \
+            conf_fn(&part7, bucket, offset + 24, confBase, reason, a, ptr,  \
+                    &control, &last_match);                                 \
+            CHECK_HWLM_TERMINATE_MATCHING;                                  \
+        }                                                                   \
+        if (unlikely(part8 != ones_u32)) {                                  \
+            part8 = ~part8;                                                 \
+            conf_fn(&part8, bucket, offset + 28, confBase, reason, a, ptr,  \
+                    &control, &last_match);                                 \
+            CHECK_HWLM_TERMINATE_MATCHING;                                  \
+        }                                                                   \
+    }                                                                       \
+} while(0)
+#endif
+
+#define PREP_SHUF_MASK_NO_REINFORCEMENT(val)                                \
+    m256 lo = and256(val, *lo_mask);                                        \
+    m256 hi = and256(rshift64_m256(val, 4), *lo_mask)
+
+#define PREP_SHUF_MASK                                                      \
+    PREP_SHUF_MASK_NO_REINFORCEMENT(load256(ptr));                          \
+    *c_128 = *(ptr + 15);                                                   \
+    m256 r_msk = set64x4(0ULL, r_msk_base[*c_128], 0ULL, r_msk_base[*c_0]); \
+    *c_0 = *(ptr + 31)
+
+#define SHIFT_OR_M1                                                         \
+    or256(pshufb_m256(dup_mask[0], lo), pshufb_m256(dup_mask[1], hi))
+
+#define SHIFT_OR_M2                                                         \
+    or256(lshift128_m256(or256(pshufb_m256(dup_mask[2], lo),                \
+                               pshufb_m256(dup_mask[3], hi)),               \
+                         1), SHIFT_OR_M1)
+
+#define SHIFT_OR_M3                                                         \
+    or256(lshift128_m256(or256(pshufb_m256(dup_mask[4], lo),                \
+                               pshufb_m256(dup_mask[5], hi)),               \
+                         2), SHIFT_OR_M2)
+
+#define SHIFT_OR_M4                                                         \
+    or256(lshift128_m256(or256(pshufb_m256(dup_mask[6], lo),                \
+                               pshufb_m256(dup_mask[7], hi)),               \
+                         3), SHIFT_OR_M3)
+
+static really_inline
+m256 prep_conf_teddy_no_reinforcement_m1(const m256 *lo_mask,
+                                         const m256 *dup_mask,
+                                         const m256 val) {
+    PREP_SHUF_MASK_NO_REINFORCEMENT(val);
+    return SHIFT_OR_M1;
+}
+
+static really_inline
+m256 prep_conf_teddy_no_reinforcement_m2(const m256 *lo_mask,
+                                         const m256 *dup_mask,
+                                         const m256 val) {
+    PREP_SHUF_MASK_NO_REINFORCEMENT(val);
+    return SHIFT_OR_M2;
+}
+
+static really_inline
+m256 prep_conf_teddy_no_reinforcement_m3(const m256 *lo_mask,
+                                         const m256 *dup_mask,
+                                         const m256 val) {
+    PREP_SHUF_MASK_NO_REINFORCEMENT(val);
+    return SHIFT_OR_M3;
+}
+
+static really_inline
+m256 prep_conf_teddy_no_reinforcement_m4(const m256 *lo_mask,
+                                         const m256 *dup_mask,
+                                         const m256 val) {
+    PREP_SHUF_MASK_NO_REINFORCEMENT(val);
+    return SHIFT_OR_M4;
+}
+
+static really_inline
+m256 prep_conf_teddy_m1(const m256 *lo_mask, const m256 *dup_mask,
+                        const u8 *ptr, const u64a *r_msk_base,
+                        u32 *c_0, u32 *c_128) {
+    PREP_SHUF_MASK;
+    return or256(SHIFT_OR_M1, r_msk);
+}
+
+static really_inline
+m256 prep_conf_teddy_m2(const m256 *lo_mask, const m256 *dup_mask,
+                        const u8 *ptr, const u64a *r_msk_base,
+                        u32 *c_0, u32 *c_128) {
+    PREP_SHUF_MASK;
+    return or256(SHIFT_OR_M2, r_msk);
+}
+
+static really_inline
+m256 prep_conf_teddy_m3(const m256 *lo_mask, const m256 *dup_mask,
+                        const u8 *ptr, const u64a *r_msk_base,
+                        u32 *c_0, u32 *c_128) {
+    PREP_SHUF_MASK;
+    return or256(SHIFT_OR_M3, r_msk);
+}
+
+static really_inline
+m256 prep_conf_teddy_m4(const m256 *lo_mask, const m256 *dup_mask,
+                        const u8 *ptr, const u64a *r_msk_base,
+                        u32 *c_0, u32 *c_128) {
+    PREP_SHUF_MASK;
+    return or256(SHIFT_OR_M4, r_msk);
+}
+
+#else // not defined __AVX2__
+
 #ifdef ARCH_64_BIT
 #define CONFIRM_TEDDY(var, bucket, offset, reason, conf_fn)                 \
 do {                                                                        \
-    if (unlikely(isnonzero128(var))) {                                      \
+    if (unlikely(diff128(var, ones128()))) {                                \
         u64a lo = movq(var);                                                \
         u64a hi = movq(rshiftbyte_m128(var, 8));                            \
-        if (unlikely(lo)) {                                                 \
+        if (unlikely(lo != ones_u64a)) {                                    \
+            lo = ~lo;                                                       \
             conf_fn(&lo, bucket, offset, confBase, reason, a, ptr,          \
                     &control, &last_match);                                 \
             CHECK_HWLM_TERMINATE_MATCHING;                                  \
         }                                                                   \
-        if (unlikely(hi)) {                                                 \
+        if (unlikely(hi != ones_u64a)) {                                    \
+            hi = ~hi;                                                       \
             conf_fn(&hi, bucket, offset + 8, confBase, reason, a, ptr,      \
                     &control, &last_match);                                 \
             CHECK_HWLM_TERMINATE_MATCHING;                                  \
         }                                                                   \
     }                                                                       \
-} while (0);
+} while(0)
 #else
 #define CONFIRM_TEDDY(var, bucket, offset, reason, conf_fn)                 \
 do {                                                                        \
-    if (unlikely(isnonzero128(var))) {                                      \
+    if (unlikely(diff128(var, ones128()))) {                                \
         u32 part1 = movd(var);                                              \
         u32 part2 = movd(rshiftbyte_m128(var, 4));                          \
         u32 part3 = movd(rshiftbyte_m128(var, 8));                          \
         u32 part4 = movd(rshiftbyte_m128(var, 12));                         \
-        if (unlikely(part1)) {                                              \
+        if (unlikely(part1 != ones_u32)) {                                  \
+            part1 = ~part1;                                                 \
             conf_fn(&part1, bucket, offset, confBase, reason, a, ptr,       \
                     &control, &last_match);                                 \
             CHECK_HWLM_TERMINATE_MATCHING;                                  \
         }                                                                   \
-        if (unlikely(part2)) {                                              \
+        if (unlikely(part2 != ones_u32)) {                                  \
+            part2 = ~part2;                                                 \
             conf_fn(&part2, bucket, offset + 4, confBase, reason, a, ptr,   \
                     &control, &last_match);                                 \
             CHECK_HWLM_TERMINATE_MATCHING;                                  \
         }                                                                   \
-        if (unlikely(part3)) {                                              \
+        if (unlikely(part3 != ones_u32)) {                                  \
+            part3 = ~part3;                                                 \
             conf_fn(&part3, bucket, offset + 8, confBase, reason, a, ptr,   \
                     &control, &last_match);                                 \
             CHECK_HWLM_TERMINATE_MATCHING;                                  \
         }                                                                   \
-        if (unlikely(part4)) {                                              \
+        if (unlikely(part4 != ones_u32)) {                                  \
+            part4 = ~part4;                                                 \
             conf_fn(&part4, bucket, offset + 12, confBase, reason, a, ptr,  \
                     &control, &last_match);                                 \
             CHECK_HWLM_TERMINATE_MATCHING;                                  \
         }                                                                   \
     }                                                                       \
-} while (0);
+} while(0)
 #endif
 
 static really_inline
@@ -129,8 +333,8 @@ m128 prep_conf_teddy_m1(const m128 *maskBase, m128 val) {
     m128 mask = set16x8(0xf);
     m128 lo = and128(val, mask);
     m128 hi = and128(rshift64_m128(val, 4), mask);
-    return and128(pshufb_m128(maskBase[0 * 2], lo),
-                  pshufb_m128(maskBase[0 * 2 + 1], hi));
+    return or128(pshufb_m128(maskBase[0 * 2], lo),
+                 pshufb_m128(maskBase[0 * 2 + 1], hi));
 }
 
 static really_inline
@@ -140,11 +344,11 @@ m128 prep_conf_teddy_m2(const m128 *maskBase, m128 *old_1, m128 val) {
     m128 hi = and128(rshift64_m128(val, 4), mask);
     m128 r = prep_conf_teddy_m1(maskBase, val);
 
-    m128 res_1 = and128(pshufb_m128(maskBase[1*2], lo),
-                        pshufb_m128(maskBase[1*2+1], hi));
-    m128 res_shifted_1 = palignr(res_1, *old_1, 16-1);
+    m128 res_1 = or128(pshufb_m128(maskBase[1 * 2], lo),
+                       pshufb_m128(maskBase[1 * 2 + 1], hi)); // second mask pair: nibble lookups OR-ed (was and128)
+    m128 res_shifted_1 = palignr(res_1, *old_1, 16 - 1); // spacing-only change; same palignr arguments as before
     *old_1 = res_1;
-    return and128(r, res_shifted_1);
+    return or128(r, res_shifted_1); // folded into the m1 result with OR (was and128)
 }
 
 static really_inline
@@ -155,11 +359,11 @@ m128 prep_conf_teddy_m3(const m128 *maskBase, m128 *old_1, m128 *old_2,
     m128 hi = and128(rshift64_m128(val, 4), mask);
     m128 r = prep_conf_teddy_m2(maskBase, old_1, val);
 
-    m128 res_2 = and128(pshufb_m128(maskBase[2*2], lo),
-                        pshufb_m128(maskBase[2*2+1], hi));
-    m128 res_shifted_2 = palignr(res_2, *old_2, 16-2);
+    m128 res_2 = or128(pshufb_m128(maskBase[2 * 2], lo),
+                       pshufb_m128(maskBase[2 * 2 + 1], hi)); // third mask pair: nibble lookups OR-ed (was and128)
+    m128 res_shifted_2 = palignr(res_2, *old_2, 16 - 2); // spacing-only change; same palignr arguments as before
     *old_2 = res_2;
-    return and128(r, res_shifted_2);
+    return or128(r, res_shifted_2); // folded into the m2 result with OR (was and128)
 }
 
 static really_inline
@@ -170,487 +374,260 @@ m128 prep_conf_teddy_m4(const m128 *maskBase, m128 *old_1, m128 *old_2,
     m128 hi = and128(rshift64_m128(val, 4), mask);
     m128 r = prep_conf_teddy_m3(maskBase, old_1, old_2, val);
 
-    m128 res_3 = and128(pshufb_m128(maskBase[3*2], lo),
-                        pshufb_m128(maskBase[3*2+1], hi));
-    m128 res_shifted_3 = palignr(res_3, *old_3, 16-3);
+    m128 res_3 = or128(pshufb_m128(maskBase[3 * 2], lo),
+                       pshufb_m128(maskBase[3 * 2 + 1], hi)); // fourth mask pair: nibble lookups OR-ed (was and128)
+    m128 res_shifted_3 = palignr(res_3, *old_3, 16 - 3); // spacing-only change; same palignr arguments as before
     *old_3 = res_3;
-    return and128(r, res_shifted_3);
+    return or128(r, res_shifted_3); // folded into the m3 result with OR (was and128)
 }
 
+#endif // __AVX2__
+
+#if defined(__AVX2__) // reinforced teddy
+
+#define PREP_CONF_FN_NO_REINFORCEMENT(val, n) /* uses lo_mask, dup_mask */    \
+    prep_conf_teddy_no_reinforcement_m##n(&lo_mask, dup_mask, val)
+
+#define PREP_CONF_FN(ptr, n) /* also uses r_msk_base, c_0, c_128 */           \
+    prep_conf_teddy_m##n(&lo_mask, dup_mask, ptr, r_msk_base, &c_0, &c_128)
+
+#define PREPARE_MASKS_1 /* fills dup_mask[0..1] from maskBase */              \
+    dup_mask[0] = set2x128(maskBase[0]);                                      \
+    dup_mask[1] = set2x128(maskBase[1]);
+
+#define PREPARE_MASKS_2 /* chains _1, then adds [2..3] */                     \
+    PREPARE_MASKS_1                                                           \
+    dup_mask[2] = set2x128(maskBase[2]);                                      \
+    dup_mask[3] = set2x128(maskBase[3]);
+
+#define PREPARE_MASKS_3 /* chains _2, then adds [4..5] */                     \
+    PREPARE_MASKS_2                                                           \
+    dup_mask[4] = set2x128(maskBase[4]);                                      \
+    dup_mask[5] = set2x128(maskBase[5]);
+
+#define PREPARE_MASKS_4 /* chains _3, then adds [6..7] */                     \
+    PREPARE_MASKS_3                                                           \
+    dup_mask[6] = set2x128(maskBase[6]);                                      \
+    dup_mask[7] = set2x128(maskBase[7]);
+
+#define PREPARE_MASKS(n) /* declares lo_mask and dup_mask[2n] */              \
+    m256 lo_mask = set32x8(0xf);                                              \
+    m256 dup_mask[n * 2];                                                     \
+    PREPARE_MASKS_##n
+
+#else // not defined __AVX2__
+
+#define FDR_EXEC_TEDDY_RES_OLD_1 /* one mask: nothing to declare */
+
+#define FDR_EXEC_TEDDY_RES_OLD_2 /* was ones128() under AND */                \
+    m128 res_old_1 = zeroes128();
+
+#define FDR_EXEC_TEDDY_RES_OLD_3                                              \
+    m128 res_old_1 = zeroes128();                                             \
+    m128 res_old_2 = zeroes128();
+
+#define FDR_EXEC_TEDDY_RES_OLD_4                                              \
+    m128 res_old_1 = zeroes128();                                             \
+    m128 res_old_2 = zeroes128();                                             \
+    m128 res_old_3 = zeroes128();
+
+#define FDR_EXEC_TEDDY_RES_OLD(n) FDR_EXEC_TEDDY_RES_OLD_##n /* n-1 locals */
+
+#define PREP_CONF_FN_1(mask_base, val)                                        \
+    prep_conf_teddy_m1(mask_base, val)
+
+#define PREP_CONF_FN_2(mask_base, val) /* uses res_old_1 local */             \
+    prep_conf_teddy_m2(mask_base, &res_old_1, val)
+
+#define PREP_CONF_FN_3(mask_base, val) /* uses res_old_1, res_old_2 */        \
+    prep_conf_teddy_m3(mask_base, &res_old_1, &res_old_2, val)
+
+#define PREP_CONF_FN_4(mask_base, val) /* uses res_old_1..res_old_3 */        \
+    prep_conf_teddy_m4(mask_base, &res_old_1, &res_old_2, &res_old_3, val)
+
+#define PREP_CONF_FN(mask_base, val, n) /* 3 args here, 2 under AVX2 */       \
+    PREP_CONF_FN_##n(mask_base, val)
+#endif // __AVX2__
+
+
+#if defined(__AVX2__) // reinforced teddy
+#define FDR_EXEC_TEDDY(fdr, a, control, n_msk, conf_fn) /* whole fn body */   \
+do {                                                                          \
+    const u8 *buf_end = a->buf + a->len;                                      \
+    const u8 *ptr = a->buf + a->start_offset;                                 \
+    u32 floodBackoff = FLOOD_BACKOFF_START;                                   \
+    const u8 *tryFloodDetect = a->firstFloodDetect;                           \
+    u32 last_match = (u32)-1;                                                 \
+    const struct Teddy *teddy = (const struct Teddy *)fdr;                    \
+    const size_t iterBytes = 64; /* two 32-byte blocks per loop */            \
+    DEBUG_PRINTF("params: buf %p len %zu start_offset %zu\n",                 \
+                 a->buf, a->len, a->start_offset);                            \
+                                                                              \
+    const m128 *maskBase = getMaskBase(teddy);                                \
+    PREPARE_MASKS(n_msk);                                                     \
+    const u32 *confBase = getConfBase(teddy);                                 \
+                                                                              \
+    const u64a *r_msk_base = getReinforcedMaskBase(teddy, n_msk);             \
+    u32 c_0 = 0x100; /* NOTE(review): 0x100 sentinel? confirm */              \
+    u32 c_128 = 0x100;                                                        \
+    const u8 *mainStart = ROUNDUP_PTR(ptr, 32);                               \
+    DEBUG_PRINTF("derive: ptr: %p mainstart %p\n", ptr, mainStart);           \
+    if (ptr < mainStart) {                                                    \
+        ptr = mainStart - 32;                                                 \
+        m256 p_mask;                                                          \
+        m256 val_0 = vectoredLoad256(&p_mask, ptr, a->start_offset,           \
+                                     a->buf, buf_end,                         \
+                                     a->buf_history, a->len_history, n_msk);  \
+        m256 r_0 = PREP_CONF_FN_NO_REINFORCEMENT(val_0, n_msk);               \
+        c_0 = *(ptr + 31); /* mainStart[-1]; NOTE(review): past buf_end? */   \
+        r_0 = or256(r_0, p_mask); /* OR sets bits: drops candidates */        \
+        CONFIRM_TEDDY(r_0, 8, 0, VECTORING, conf_fn);                         \
+        ptr += 32;                                                            \
+    }                                                                         \
+                                                                              \
+    if (ptr + 32 <= buf_end) {                                                \
+        m256 r_0 = PREP_CONF_FN(ptr, n_msk);                                  \
+        CONFIRM_TEDDY(r_0, 8, 0, VECTORING, conf_fn);                         \
+        ptr += 32;                                                            \
+    }                                                                         \
+                                                                              \
+    for (; ptr + iterBytes <= buf_end; ptr += iterBytes) {                    \
+        __builtin_prefetch(ptr + (iterBytes * 4));                            \
+        CHECK_FLOOD;                                                          \
+        m256 r_0 = PREP_CONF_FN(ptr, n_msk);                                  \
+        CONFIRM_TEDDY(r_0, 8, 0, NOT_CAUTIOUS, conf_fn);                      \
+        m256 r_1 = PREP_CONF_FN(ptr + 32, n_msk);                             \
+        CONFIRM_TEDDY(r_1, 8, 32, NOT_CAUTIOUS, conf_fn);                     \
+    }                                                                         \
+                                                                              \
+    if (ptr + 32 <= buf_end) { /* at most one full block left */              \
+        m256 r_0 = PREP_CONF_FN(ptr, n_msk);                                  \
+        CONFIRM_TEDDY(r_0, 8, 0, NOT_CAUTIOUS, conf_fn);                      \
+        ptr += 32;                                                            \
+    }                                                                         \
+                                                                              \
+    assert(ptr + 32 > buf_end);                                               \
+    if (ptr < buf_end) { /* partial tail, under 32 bytes */                   \
+        m256 p_mask;                                                          \
+        m256 val_0 = vectoredLoad256(&p_mask, ptr, 0, ptr, buf_end,           \
+                                     a->buf_history, a->len_history, n_msk);  \
+        m256 r_0 = PREP_CONF_FN_NO_REINFORCEMENT(val_0, n_msk);               \
+        r_0 = or256(r_0, p_mask);                                             \
+        CONFIRM_TEDDY(r_0, 8, 0, VECTORING, conf_fn);                         \
+    }                                                                         \
+                                                                              \
+    return HWLM_SUCCESS; /* returns from the calling function */              \
+} while(0)
+#else // not defined __AVX2__
+#define FDR_EXEC_TEDDY(fdr, a, control, n_msk, conf_fn) /* whole fn body */   \
+do {                                                                          \
+    const u8 *buf_end = a->buf + a->len;                                      \
+    const u8 *ptr = a->buf + a->start_offset;                                 \
+    u32 floodBackoff = FLOOD_BACKOFF_START;                                   \
+    const u8 *tryFloodDetect = a->firstFloodDetect;                           \
+    u32 last_match = (u32)-1;                                                 \
+    const struct Teddy *teddy = (const struct Teddy *)fdr;                    \
+    const size_t iterBytes = 32; /* two 16-byte blocks per loop */            \
+    DEBUG_PRINTF("params: buf %p len %zu start_offset %zu\n",                 \
+                 a->buf, a->len, a->start_offset);                            \
+                                                                              \
+    const m128 *maskBase = getMaskBase(teddy);                                \
+    const u32 *confBase = getConfBase(teddy);                                 \
+                                                                              \
+    FDR_EXEC_TEDDY_RES_OLD(n_msk); /* res_old_* locals, if any */             \
+    const u8 *mainStart = ROUNDUP_PTR(ptr, 16);                               \
+    DEBUG_PRINTF("derive: ptr: %p mainstart %p\n", ptr, mainStart);           \
+    if (ptr < mainStart) {                                                    \
+        ptr = mainStart - 16;                                                 \
+        m128 p_mask;                                                          \
+        m128 val_0 = vectoredLoad128(&p_mask, ptr, a->start_offset,           \
+                                     a->buf, buf_end,                         \
+                                     a->buf_history, a->len_history, n_msk);  \
+        m128 r_0 = PREP_CONF_FN(maskBase, val_0, n_msk);                      \
+        r_0 = or128(r_0, p_mask); /* OR sets bits: drops candidates */        \
+        CONFIRM_TEDDY(r_0, 8, 0, VECTORING, conf_fn);                         \
+        ptr += 16;                                                            \
+    }                                                                         \
+                                                                              \
+    if (ptr + 16 <= buf_end) { /* was '<' in the removed code */              \
+        m128 r_0 = PREP_CONF_FN(maskBase, load128(ptr), n_msk);               \
+        CONFIRM_TEDDY(r_0, 8, 0, VECTORING, conf_fn);                         \
+        ptr += 16;                                                            \
+    }                                                                         \
+                                                                              \
+    for (; ptr + iterBytes <= buf_end; ptr += iterBytes) {                    \
+        __builtin_prefetch(ptr + (iterBytes * 4));                            \
+        CHECK_FLOOD;                                                          \
+        m128 r_0 = PREP_CONF_FN(maskBase, load128(ptr), n_msk);               \
+        CONFIRM_TEDDY(r_0, 8, 0, NOT_CAUTIOUS, conf_fn);                      \
+        m128 r_1 = PREP_CONF_FN(maskBase, load128(ptr + 16), n_msk);          \
+        CONFIRM_TEDDY(r_1, 8, 16, NOT_CAUTIOUS, conf_fn);                     \
+    }                                                                         \
+                                                                              \
+    if (ptr + 16 <= buf_end) { /* at most one full block left */              \
+        m128 r_0 = PREP_CONF_FN(maskBase, load128(ptr), n_msk);               \
+        CONFIRM_TEDDY(r_0, 8, 0, NOT_CAUTIOUS, conf_fn);                      \
+        ptr += 16;                                                            \
+    }                                                                         \
+                                                                              \
+    assert(ptr + 16 > buf_end);                                               \
+    if (ptr < buf_end) { /* partial tail, under 16 bytes */                   \
+        m128 p_mask;                                                          \
+        m128 val_0 = vectoredLoad128(&p_mask, ptr, 0, ptr, buf_end,           \
+                                     a->buf_history, a->len_history, n_msk);  \
+        m128 r_0 = PREP_CONF_FN(maskBase, val_0, n_msk);                      \
+        r_0 = or128(r_0, p_mask);                                             \
+        CONFIRM_TEDDY(r_0, 8, 0, VECTORING, conf_fn);                         \
+    }                                                                         \
+                                                                              \
+    return HWLM_SUCCESS; /* returns from the calling function */              \
+} while(0)
+#endif // __AVX2__
+
 hwlm_error_t fdr_exec_teddy_msks1(const struct FDR *fdr,
                                   const struct FDR_Runtime_Args *a,
                                   hwlm_group_t control) {
-    const u8 *buf_end = a->buf + a->len;
-    const u8 *ptr = a->buf + a->start_offset;
-    u32 floodBackoff = FLOOD_BACKOFF_START;
-    const u8 *tryFloodDetect = a->firstFloodDetect;
-    u32 last_match = (u32)-1;
-    const struct Teddy *teddy = (const struct Teddy *)fdr;
-    const size_t iterBytes = 32;
-    DEBUG_PRINTF("params: buf %p len %zu start_offset %zu\n",
-                 a->buf, a->len, a->start_offset);
-
-    const m128 *maskBase = getMaskBase(teddy);
-    const u32 *confBase = getConfBase(teddy);
-
-    const u8 *mainStart = ROUNDUP_PTR(ptr, 16);
-    DEBUG_PRINTF("derive: ptr: %p mainstart %p\n", ptr, mainStart);
-    if (ptr < mainStart) {
-        ptr = mainStart - 16;
-        m128 p_mask;
-        m128 val_0 = vectoredLoad128(&p_mask, ptr, a->buf, buf_end,
-                                     a->buf_history, a->len_history, 1);
-        m128 r_0 = prep_conf_teddy_m1(maskBase, val_0);
-        r_0 = and128(r_0, p_mask);
-        CONFIRM_TEDDY(r_0, 8, 0, VECTORING, do_confWithBit1_teddy);
-        ptr += 16;
-    }
-
-    if (ptr + 16 < buf_end) {
-        m128 r_0 = prep_conf_teddy_m1(maskBase, load128(ptr));
-        CONFIRM_TEDDY(r_0, 8, 0, VECTORING, do_confWithBit1_teddy);
-        ptr += 16;
-    }
-
-    for (; ptr + iterBytes <= buf_end; ptr += iterBytes) {
-        __builtin_prefetch(ptr + (iterBytes*4));
-        CHECK_FLOOD;
-        m128 r_0 = prep_conf_teddy_m1(maskBase, load128(ptr));
-        CONFIRM_TEDDY(r_0, 8, 0, NOT_CAUTIOUS, do_confWithBit1_teddy);
-        m128 r_1 = prep_conf_teddy_m1(maskBase, load128(ptr + 16));
-        CONFIRM_TEDDY(r_1, 8, 16, NOT_CAUTIOUS, do_confWithBit1_teddy);
-    }
-
-    for (; ptr < buf_end; ptr += 16) {
-        m128 p_mask;
-        m128 val_0 = vectoredLoad128(&p_mask, ptr, a->buf, buf_end,
-                                     a->buf_history, a->len_history, 1);
-        m128 r_0 = prep_conf_teddy_m1(maskBase, val_0);
-        r_0 = and128(r_0, p_mask);
-        CONFIRM_TEDDY(r_0, 8, 0, VECTORING, do_confWithBit1_teddy);
-    }
-
-    return HWLM_SUCCESS;
+    FDR_EXEC_TEDDY(fdr, a, control, 1, do_confWithBit_teddy); // macro supplies the whole body incl. return; was do_confWithBit1_teddy
 }
 
 hwlm_error_t fdr_exec_teddy_msks1_pck(const struct FDR *fdr,
                                       const struct FDR_Runtime_Args *a,
                                       hwlm_group_t control) {
-    const u8 *buf_end = a->buf + a->len;
-    const u8 *ptr = a->buf + a->start_offset;
-    u32 floodBackoff = FLOOD_BACKOFF_START;
-    const u8 *tryFloodDetect = a->firstFloodDetect;
-    u32 last_match = (u32)-1;
-    const struct Teddy *teddy = (const struct Teddy *)fdr;
-    const size_t iterBytes = 32;
-    DEBUG_PRINTF("params: buf %p len %zu start_offset %zu\n",
-                 a->buf, a->len, a->start_offset);
-
-    const m128 *maskBase = getMaskBase(teddy);
-    const u32 *confBase = getConfBase(teddy);
-
-    const u8 *mainStart = ROUNDUP_PTR(ptr, 16);
-    DEBUG_PRINTF("derive: ptr: %p mainstart %p\n", ptr, mainStart);
-    if (ptr < mainStart) {
-        ptr = mainStart - 16;
-        m128 p_mask;
-        m128 val_0 = vectoredLoad128(&p_mask, ptr, a->buf, buf_end,
-                                     a->buf_history, a->len_history, 1);
-        m128 r_0 = prep_conf_teddy_m1(maskBase, val_0);
-        r_0 = and128(r_0, p_mask);
-        CONFIRM_TEDDY(r_0, 8, 0, VECTORING, do_confWithBit_teddy);
-        ptr += 16;
-    }
-
-    if (ptr + 16 < buf_end) {
-        m128 r_0 = prep_conf_teddy_m1(maskBase, load128(ptr));
-        CONFIRM_TEDDY(r_0, 8, 0, VECTORING, do_confWithBit_teddy);
-        ptr += 16;
-    }
-
-    for (; ptr + iterBytes <= buf_end; ptr += iterBytes) {
-        __builtin_prefetch(ptr + (iterBytes*4));
-        CHECK_FLOOD;
-        m128 r_0 = prep_conf_teddy_m1(maskBase, load128(ptr));
-        CONFIRM_TEDDY(r_0, 8, 0, NOT_CAUTIOUS, do_confWithBit_teddy);
-        m128 r_1 = prep_conf_teddy_m1(maskBase, load128(ptr + 16));
-        CONFIRM_TEDDY(r_1, 8, 16, NOT_CAUTIOUS, do_confWithBit_teddy);
-    }
-
-    for (; ptr < buf_end; ptr += 16) {
-        m128 p_mask;
-        m128 val_0 = vectoredLoad128(&p_mask, ptr, a->buf, buf_end,
-                                     a->buf_history, a->len_history, 1);
-        m128 r_0 = prep_conf_teddy_m1(maskBase, val_0);
-        r_0 = and128(r_0, p_mask);
-        CONFIRM_TEDDY(r_0, 8, 0, VECTORING, do_confWithBit_teddy);
-    }
-
-    return HWLM_SUCCESS;
+    FDR_EXEC_TEDDY(fdr, a, control, 1, do_confWithBit_teddy); // macro supplies the whole body incl. return; same expansion as fdr_exec_teddy_msks1
 }
 
 hwlm_error_t fdr_exec_teddy_msks2(const struct FDR *fdr,
                                   const struct FDR_Runtime_Args *a,
                                   hwlm_group_t control) {
-    const u8 *buf_end = a->buf + a->len;
-    const u8 *ptr = a->buf + a->start_offset;
-    u32 floodBackoff = FLOOD_BACKOFF_START;
-    const u8 *tryFloodDetect = a->firstFloodDetect;
-    u32 last_match = (u32)-1;
-    const struct Teddy *teddy = (const struct Teddy *)fdr;
-    const size_t iterBytes = 32;
-    DEBUG_PRINTF("params: buf %p len %zu start_offset %zu\n",
-                 a->buf, a->len, a->start_offset);
-
-    const m128 *maskBase = getMaskBase(teddy);
-    const u32 *confBase = getConfBase(teddy);
-
-    m128 res_old_1 = ones128();
-    const u8 *mainStart = ROUNDUP_PTR(ptr, 16);
-    DEBUG_PRINTF("derive: ptr: %p mainstart %p\n", ptr, mainStart);
-    if (ptr < mainStart) {
-        ptr = mainStart - 16;
-        m128 p_mask;
-        m128 val_0 = vectoredLoad128(&p_mask, ptr, a->buf, buf_end,
-                                     a->buf_history, a->len_history, 2);
-        m128 r_0 = prep_conf_teddy_m2(maskBase, &res_old_1, val_0);
-        r_0 = and128(r_0, p_mask);
-        CONFIRM_TEDDY(r_0, 8, 0, VECTORING, do_confWithBitMany_teddy);
-        ptr += 16;
-    }
-
-    if (ptr + 16 < buf_end) {
-        m128 r_0 = prep_conf_teddy_m2(maskBase, &res_old_1, load128(ptr));
-        CONFIRM_TEDDY(r_0, 8, 0, VECTORING, do_confWithBitMany_teddy);
-        ptr += 16;
-    }
-
-    for (; ptr + iterBytes <= buf_end; ptr += iterBytes) {
-        __builtin_prefetch(ptr + (iterBytes*4));
-        CHECK_FLOOD;
-        m128 r_0 = prep_conf_teddy_m2(maskBase, &res_old_1, load128(ptr));
-        CONFIRM_TEDDY(r_0, 8, 0, NOT_CAUTIOUS, do_confWithBitMany_teddy);
-        m128 r_1 = prep_conf_teddy_m2(maskBase, &res_old_1, load128(ptr + 16));
-        CONFIRM_TEDDY(r_1, 8, 16, NOT_CAUTIOUS, do_confWithBitMany_teddy);
-    }
-
-    for (; ptr < buf_end; ptr += 16) {
-        m128 p_mask;
-        m128 val_0 = vectoredLoad128(&p_mask, ptr, a->buf, buf_end,
-                                     a->buf_history, a->len_history, 2);
-        m128 r_0 = prep_conf_teddy_m2(maskBase, &res_old_1, val_0);
-        r_0 = and128(r_0, p_mask);
-        CONFIRM_TEDDY(r_0, 8, 0, VECTORING, do_confWithBitMany_teddy);
-    }
-
-    return HWLM_SUCCESS;
+    FDR_EXEC_TEDDY(fdr, a, control, 2, do_confWithBit_teddy); // macro supplies the whole body incl. return; was do_confWithBitMany_teddy
 }
 
 hwlm_error_t fdr_exec_teddy_msks2_pck(const struct FDR *fdr,
                                       const struct FDR_Runtime_Args *a,
                                       hwlm_group_t control) {
-    const u8 *buf_end = a->buf + a->len;
-    const u8 *ptr = a->buf + a->start_offset;
-    u32 floodBackoff = FLOOD_BACKOFF_START;
-    const u8 *tryFloodDetect = a->firstFloodDetect;
-    u32 last_match = (u32)-1;
-    const struct Teddy *teddy = (const struct Teddy *)fdr;
-    const size_t iterBytes = 32;
-    DEBUG_PRINTF("params: buf %p len %zu start_offset %zu\n",
-                 a->buf, a->len, a->start_offset);
-
-    const m128 *maskBase = getMaskBase(teddy);
-    const u32 *confBase = getConfBase(teddy);
-
-    m128 res_old_1 = ones128();
-    const u8 *mainStart = ROUNDUP_PTR(ptr, 16);
-    DEBUG_PRINTF("derive: ptr: %p mainstart %p\n", ptr, mainStart);
-    if (ptr < mainStart) {
-        ptr = mainStart - 16;
-        m128 p_mask;
-        m128 val_0 = vectoredLoad128(&p_mask, ptr, a->buf, buf_end,
-                                     a->buf_history, a->len_history, 2);
-        m128 r_0 = prep_conf_teddy_m2(maskBase, &res_old_1, val_0);
-        r_0 = and128(r_0, p_mask);
-        CONFIRM_TEDDY(r_0, 8, 0, VECTORING, do_confWithBit_teddy);
-        ptr += 16;
-    }
-
-    if (ptr + 16 < buf_end) {
-        m128 r_0 = prep_conf_teddy_m2(maskBase, &res_old_1, load128(ptr));
-        CONFIRM_TEDDY(r_0, 8, 0, VECTORING, do_confWithBit_teddy);
-        ptr += 16;
-    }
-
-    for (; ptr + iterBytes <= buf_end; ptr += iterBytes) {
-        __builtin_prefetch(ptr + (iterBytes*4));
-        CHECK_FLOOD;
-        m128 r_0 = prep_conf_teddy_m2(maskBase, &res_old_1, load128(ptr));
-        CONFIRM_TEDDY(r_0, 8, 0, NOT_CAUTIOUS, do_confWithBit_teddy);
-        m128 r_1 = prep_conf_teddy_m2(maskBase, &res_old_1, load128(ptr + 16));
-        CONFIRM_TEDDY(r_1, 8, 16, NOT_CAUTIOUS, do_confWithBit_teddy);
-    }
-
-    for (; ptr < buf_end; ptr += 16) {
-        m128 p_mask;
-        m128 val_0 = vectoredLoad128(&p_mask, ptr, a->buf, buf_end,
-                                   a->buf_history, a->len_history, 2);
-        m128 r_0 = prep_conf_teddy_m2(maskBase, &res_old_1, val_0);
-        r_0 = and128(r_0, p_mask);
-        CONFIRM_TEDDY(r_0, 8, 0, VECTORING, do_confWithBit_teddy);
-    }
-
-    return HWLM_SUCCESS;
+    FDR_EXEC_TEDDY(fdr, a, control, 2, do_confWithBit_teddy); // macro supplies the whole body incl. return; same expansion as fdr_exec_teddy_msks2
 }
 
 hwlm_error_t fdr_exec_teddy_msks3(const struct FDR *fdr,
                                   const struct FDR_Runtime_Args *a,
                                   hwlm_group_t control) {
-    const u8 *buf_end = a->buf + a->len;
-    const u8 *ptr = a->buf + a->start_offset;
-    u32 floodBackoff = FLOOD_BACKOFF_START;
-    const u8 *tryFloodDetect = a->firstFloodDetect;
-    u32 last_match = (u32)-1;
-    const struct Teddy *teddy = (const struct Teddy *)fdr;
-    const size_t iterBytes = 32;
-    DEBUG_PRINTF("params: buf %p len %zu start_offset %zu\n",
-                 a->buf, a->len, a->start_offset);
-
-    const m128 *maskBase = getMaskBase(teddy);
-    const u32 *confBase = getConfBase(teddy);
-
-    m128 res_old_1 = ones128();
-    m128 res_old_2 = ones128();
-    const u8 *mainStart = ROUNDUP_PTR(ptr, 16);
-    DEBUG_PRINTF("derive: ptr: %p mainstart %p\n", ptr, mainStart);
-    if (ptr < mainStart) {
-        ptr = mainStart - 16;
-        m128 p_mask;
-        m128 val_0 = vectoredLoad128(&p_mask, ptr, a->buf, buf_end,
-                                     a->buf_history, a->len_history, 3);
-        m128 r_0 = prep_conf_teddy_m3(maskBase, &res_old_1, &res_old_2,
-                                      val_0);
-        r_0 = and128(r_0, p_mask);
-        CONFIRM_TEDDY(r_0, 8, 0, VECTORING, do_confWithBitMany_teddy);
-        ptr += 16;
-    }
-
-    if (ptr + 16 < buf_end) {
-        m128 r_0 = prep_conf_teddy_m3(maskBase, &res_old_1, &res_old_2,
-                                      load128(ptr));
-        CONFIRM_TEDDY(r_0, 8, 0, VECTORING, do_confWithBitMany_teddy);
-        ptr += 16;
-    }
-
-    for (; ptr + iterBytes <= buf_end; ptr += iterBytes) {
-        __builtin_prefetch(ptr + (iterBytes*4));
-        CHECK_FLOOD;
-        m128 r_0 = prep_conf_teddy_m3(maskBase, &res_old_1, &res_old_2,
-                                      load128(ptr));
-        CONFIRM_TEDDY(r_0, 8, 0, NOT_CAUTIOUS, do_confWithBitMany_teddy);
-        m128 r_1 = prep_conf_teddy_m3(maskBase, &res_old_1, &res_old_2,
-                                      load128(ptr + 16));
-        CONFIRM_TEDDY(r_1, 8, 16, NOT_CAUTIOUS, do_confWithBitMany_teddy);
-    }
-
-    for (; ptr < buf_end; ptr += 16) {
-        m128 p_mask;
-        m128 val_0 = vectoredLoad128(&p_mask, ptr, a->buf, buf_end,
-                                     a->buf_history, a->len_history, 3);
-        m128 r_0 = prep_conf_teddy_m3(maskBase, &res_old_1, &res_old_2, val_0);
-        r_0 = and128(r_0, p_mask);
-        CONFIRM_TEDDY(r_0, 8, 0, VECTORING, do_confWithBitMany_teddy);
-    }
-
-    return HWLM_SUCCESS;
+    FDR_EXEC_TEDDY(fdr, a, control, 3, do_confWithBit_teddy); // macro supplies the whole body incl. return; was do_confWithBitMany_teddy
 }
 
 hwlm_error_t fdr_exec_teddy_msks3_pck(const struct FDR *fdr,
                                       const struct FDR_Runtime_Args *a,
                                       hwlm_group_t control) {
-    const u8 *buf_end = a->buf + a->len;
-    const u8 *ptr = a->buf + a->start_offset;
-    u32 floodBackoff = FLOOD_BACKOFF_START;
-    const u8 *tryFloodDetect = a->firstFloodDetect;
-    u32 last_match = (u32)-1;
-    const struct Teddy *teddy = (const struct Teddy *)fdr;
-    const size_t iterBytes = 32;
-    DEBUG_PRINTF("params: buf %p len %zu start_offset %zu\n",
-                 a->buf, a->len, a->start_offset);
-
-    const m128 *maskBase = getMaskBase(teddy);
-    const u32 *confBase = getConfBase(teddy);
-
-    m128 res_old_1 = ones128();
-    m128 res_old_2 = ones128();
-    const u8 *mainStart = ROUNDUP_PTR(ptr, 16);
-    DEBUG_PRINTF("derive: ptr: %p mainstart %p\n", ptr, mainStart);
-    if (ptr < mainStart) {
-        ptr = mainStart - 16;
-        m128 p_mask;
-        m128 val_0 = vectoredLoad128(&p_mask, ptr, a->buf, buf_end,
-                                     a->buf_history, a->len_history, 3);
-        m128 r_0 = prep_conf_teddy_m3(maskBase, &res_old_1, &res_old_2,
-                                      val_0);
-        r_0 = and128(r_0, p_mask);
-        CONFIRM_TEDDY(r_0, 8, 0, VECTORING, do_confWithBit_teddy);
-        ptr += 16;
-    }
-
-    if (ptr + 16 < buf_end) {
-        m128 r_0 = prep_conf_teddy_m3(maskBase, &res_old_1, &res_old_2,
-                                      load128(ptr));
-        CONFIRM_TEDDY(r_0, 8, 0, VECTORING, do_confWithBit_teddy);
-        ptr += 16;
-    }
-
-    for (; ptr + iterBytes <= buf_end; ptr += iterBytes) {
-        __builtin_prefetch(ptr + (iterBytes*4));
-        CHECK_FLOOD;
-        m128 r_0 = prep_conf_teddy_m3(maskBase, &res_old_1, &res_old_2,
-                                      load128(ptr));
-        CONFIRM_TEDDY(r_0, 8, 0, NOT_CAUTIOUS, do_confWithBit_teddy);
-        m128 r_1 = prep_conf_teddy_m3(maskBase, &res_old_1, &res_old_2,
-                                      load128(ptr + 16));
-        CONFIRM_TEDDY(r_1, 8, 16, NOT_CAUTIOUS, do_confWithBit_teddy);
-    }
-
-    for (; ptr < buf_end; ptr += 16) {
-        m128 p_mask;
-        m128 val_0 = vectoredLoad128(&p_mask, ptr, a->buf, buf_end,
-                                     a->buf_history, a->len_history, 3);
-        m128 r_0 = prep_conf_teddy_m3(maskBase, &res_old_1, &res_old_2, val_0);
-        r_0 = and128(r_0, p_mask);
-        CONFIRM_TEDDY(r_0, 8, 0, VECTORING, do_confWithBit_teddy);
-    }
-
-    return HWLM_SUCCESS;
+    FDR_EXEC_TEDDY(fdr, a, control, 3, do_confWithBit_teddy); // macro supplies the whole body incl. return; same expansion as fdr_exec_teddy_msks3
 }
 
 hwlm_error_t fdr_exec_teddy_msks4(const struct FDR *fdr,
                                   const struct FDR_Runtime_Args *a,
                                   hwlm_group_t control) {
-    const u8 *buf_end = a->buf + a->len;
-    const u8 *ptr = a->buf + a->start_offset;
-    u32 floodBackoff = FLOOD_BACKOFF_START;
-    const u8 *tryFloodDetect = a->firstFloodDetect;
-    u32 last_match = (u32)-1;
-    const struct Teddy *teddy = (const struct Teddy *)fdr;
-    const size_t iterBytes = 32;
-    DEBUG_PRINTF("params: buf %p len %zu start_offset %zu\n",
-                 a->buf, a->len, a->start_offset);
-
-    const m128 *maskBase = getMaskBase(teddy);
-    const u32 *confBase = getConfBase(teddy);
-
-    m128 res_old_1 = ones128();
-    m128 res_old_2 = ones128();
-    m128 res_old_3 = ones128();
-    const u8 *mainStart = ROUNDUP_PTR(ptr, 16);
-    DEBUG_PRINTF("derive: ptr: %p mainstart %p\n", ptr, mainStart);
-    if (ptr < mainStart) {
-        ptr = mainStart - 16;
-        m128 p_mask;
-        m128 val_0 = vectoredLoad128(&p_mask, ptr, a->buf, buf_end,
-                                     a->buf_history, a->len_history, 4);
-        m128 r_0 = prep_conf_teddy_m4(maskBase, &res_old_1, &res_old_2,
-                                      &res_old_3, val_0);
-        r_0 = and128(r_0, p_mask);
-        CONFIRM_TEDDY(r_0, 8, 0, VECTORING, do_confWithBitMany_teddy);
-        ptr += 16;
-    }
-
-    if (ptr + 16 < buf_end) {
-        m128 r_0 = prep_conf_teddy_m4(maskBase, &res_old_1, &res_old_2,
-                                      &res_old_3, load128(ptr));
-        CONFIRM_TEDDY(r_0, 8, 0, VECTORING, do_confWithBitMany_teddy);
-        ptr += 16;
-    }
-
-    for (; ptr + iterBytes <= buf_end; ptr += iterBytes) {
-        __builtin_prefetch(ptr + (iterBytes*4));
-        CHECK_FLOOD;
-        m128 r_0 = prep_conf_teddy_m4(maskBase, &res_old_1, &res_old_2,
-                                      &res_old_3, load128(ptr));
-        CONFIRM_TEDDY(r_0, 8, 0, NOT_CAUTIOUS, do_confWithBitMany_teddy);
-        m128 r_1 = prep_conf_teddy_m4(maskBase, &res_old_1, &res_old_2,
-                                      &res_old_3, load128(ptr + 16));
-        CONFIRM_TEDDY(r_1, 8, 16, NOT_CAUTIOUS, do_confWithBitMany_teddy);
-    }
-
-    for (; ptr < buf_end; ptr += 16) {
-        m128 p_mask;
-        m128 val_0 = vectoredLoad128(&p_mask, ptr, a->buf, buf_end,
-                                     a->buf_history, a->len_history, 4);
-        m128 r_0 = prep_conf_teddy_m4(maskBase, &res_old_1, &res_old_2,
-                                      &res_old_3, val_0);
-        r_0 = and128(r_0, p_mask);
-        CONFIRM_TEDDY(r_0, 8, 0, VECTORING, do_confWithBitMany_teddy);
-    }
-
-    return HWLM_SUCCESS;
+    FDR_EXEC_TEDDY(fdr, a, control, 4, do_confWithBit_teddy); // macro supplies the whole body incl. return; was do_confWithBitMany_teddy
 }
 
 hwlm_error_t fdr_exec_teddy_msks4_pck(const struct FDR *fdr,
                                       const struct FDR_Runtime_Args *a,
                                       hwlm_group_t control) {
-    const u8 *buf_end = a->buf + a->len;
-    const u8 *ptr = a->buf + a->start_offset;
-    u32 floodBackoff = FLOOD_BACKOFF_START;
-    const u8 *tryFloodDetect = a->firstFloodDetect;
-    u32 last_match = (u32)-1;
-    const struct Teddy *teddy = (const struct Teddy *)fdr;
-    const size_t iterBytes = 32;
-    DEBUG_PRINTF("params: buf %p len %zu start_offset %zu\n",
-                 a->buf, a->len, a->start_offset);
-
-    const m128 *maskBase = getMaskBase(teddy);
-    const u32 *confBase = getConfBase(teddy);
-
-    m128 res_old_1 = ones128();
-    m128 res_old_2 = ones128();
-    m128 res_old_3 = ones128();
-    const u8 *mainStart = ROUNDUP_PTR(ptr, 16);
-    DEBUG_PRINTF("derive: ptr: %p mainstart %p\n", ptr, mainStart);
-    if (ptr < mainStart) {
-        ptr = mainStart - 16;
-        m128 p_mask;
-        m128 val_0 = vectoredLoad128(&p_mask, ptr, a->buf, buf_end,
-                                     a->buf_history, a->len_history, 4);
-        m128 r_0 = prep_conf_teddy_m4(maskBase, &res_old_1, &res_old_2,
-                                      &res_old_3, val_0);
-        r_0 = and128(r_0, p_mask);
-        CONFIRM_TEDDY(r_0, 8, 0, VECTORING, do_confWithBit_teddy);
-        ptr += 16;
-    }
-
-    if (ptr + 16 < buf_end) {
-        m128 r_0 = prep_conf_teddy_m4(maskBase, &res_old_1, &res_old_2,
-                                      &res_old_3, load128(ptr));
-        CONFIRM_TEDDY(r_0, 8, 0, VECTORING, do_confWithBit_teddy);
-        ptr += 16;
-    }
-
-    for (; ptr + iterBytes <= buf_end; ptr += iterBytes) {
-        __builtin_prefetch(ptr + (iterBytes*4));
-        CHECK_FLOOD;
-        m128 r_0 = prep_conf_teddy_m4(maskBase, &res_old_1, &res_old_2,
-                                      &res_old_3, load128(ptr));
-        CONFIRM_TEDDY(r_0, 8, 0, NOT_CAUTIOUS, do_confWithBit_teddy);
-        m128 r_1 = prep_conf_teddy_m4(maskBase, &res_old_1, &res_old_2,
-                                      &res_old_3, load128(ptr + 16));
-        CONFIRM_TEDDY(r_1, 8, 16, NOT_CAUTIOUS, do_confWithBit_teddy);
-    }
-
-    for (; ptr < buf_end; ptr += 16) {
-        m128 p_mask;
-        m128 val_0 = vectoredLoad128(&p_mask, ptr, a->buf, buf_end,
-                                     a->buf_history, a->len_history, 4);
-        m128 r_0 = prep_conf_teddy_m4(maskBase, &res_old_1, &res_old_2,
-                                      &res_old_3, val_0);
-        r_0 = and128(r_0, p_mask);
-        CONFIRM_TEDDY(r_0, 8, 0, VECTORING, do_confWithBit_teddy);
-    }
-
-    return HWLM_SUCCESS;
+    FDR_EXEC_TEDDY(fdr, a, control, 4, do_confWithBit_teddy);
 }
index 89117b0b28465e31bc5bf7fec8eb30dda15305fe..11ea0f8e74ceaf643ab03c2c7fb7e7c0d14db858 100644 (file)
 
 #if defined(HAVE_AVX2)
 
+const u8 ALIGN_AVX_DIRECTIVE p_mask_arr256[33][64] = {
+    {0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
+     0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff},
+    {0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
+     0x00, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff},
+    {0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
+     0x00, 0x00, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff},
+    {0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
+     0x00, 0x00, 0x00, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff},
+    {0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
+     0x00, 0x00, 0x00, 0x00, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff},
+    {0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
+     0x00, 0x00, 0x00, 0x00, 0x00, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff},
+    {0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
+     0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff},
+    {0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
+     0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff},
+    {0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
+     0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff},
+    {0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
+     0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff},
+    {0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
+     0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff},
+    {0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
+     0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff},
+    {0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
+     0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff},
+    {0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
+     0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff},
+    {0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
+     0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff},
+    {0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
+     0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff},
+    {0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
+     0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff},
+    {0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
+     0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff},
+    {0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
+     0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff},
+    {0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
+     0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff},
+    {0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
+     0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff},
+    {0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
+     0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff},
+    {0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
+     0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff},
+    {0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
+     0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff},
+    {0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
+     0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff},
+    {0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
+     0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff},
+    {0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
+     0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff},
+    {0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
+     0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xff, 0xff, 0xff, 0xff, 0xff},
+    {0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
+     0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xff, 0xff, 0xff, 0xff},
+    {0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
+     0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xff, 0xff, 0xff},
+    {0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
+     0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xff, 0xff},
+    {0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
+     0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xff},
+    {0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
+     0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00}
+};
+
 #ifdef ARCH_64_BIT
 #define CONFIRM_FAT_TEDDY(var, bucket, offset, reason, conf_fn)             \
 do {                                                                        \
-    if (unlikely(isnonzero256(var))) {                                      \
+    if (unlikely(diff256(var, ones256()))) {                                \
         m256 swap = swap128in256(var);                                      \
         m256 r = interleave256lo(var, swap);                                \
         u64a part1 = extractlow64from256(r);                                \
@@ -51,32 +120,36 @@ do {                                                                        \
         r = interleave256hi(var, swap);                                     \
         u64a part3 = extractlow64from256(r);                                \
         u64a part4 = extract64from256(r, 1);                                \
-        if (unlikely(part1)) {                                              \
+        if (unlikely(part1 != ones_u64a)) {                                 \
+            part1 = ~part1;                                                 \
             conf_fn(&part1, bucket, offset, confBase, reason, a, ptr,       \
                     &control, &last_match);                                 \
             CHECK_HWLM_TERMINATE_MATCHING;                                  \
         }                                                                   \
-        if (unlikely(part2)) {                                              \
+        if (unlikely(part2 != ones_u64a)) {                                 \
+            part2 = ~part2;                                                 \
             conf_fn(&part2, bucket, offset + 4, confBase, reason, a, ptr,   \
                     &control, &last_match);                                 \
             CHECK_HWLM_TERMINATE_MATCHING;                                  \
         }                                                                   \
-        if (unlikely(part3)) {                                              \
+        if (unlikely(part3 != ones_u64a)) {                                 \
+            part3 = ~part3;                                                 \
             conf_fn(&part3, bucket, offset + 8, confBase, reason, a, ptr,   \
                     &control, &last_match);                                 \
             CHECK_HWLM_TERMINATE_MATCHING;                                  \
         }                                                                   \
-        if (unlikely(part4)) {                                              \
+        if (unlikely(part4 != ones_u64a)) {                                 \
+            part4 = ~part4;                                                 \
             conf_fn(&part4, bucket, offset + 12, confBase, reason, a, ptr,  \
                     &control, &last_match);                                 \
             CHECK_HWLM_TERMINATE_MATCHING;                                  \
         }                                                                   \
     }                                                                       \
-} while (0);
+} while(0)
 #else
 #define CONFIRM_FAT_TEDDY(var, bucket, offset, reason, conf_fn)             \
 do {                                                                        \
-    if (unlikely(isnonzero256(var))) {                                      \
+    if (unlikely(diff256(var, ones256()))) {                                \
         m256 swap = swap128in256(var);                                      \
         m256 r = interleave256lo(var, swap);                                \
         u32 part1 = extractlow32from256(r);                                 \
@@ -88,56 +161,65 @@ do {                                                                        \
         u32 part6 = extract32from256(r, 1);                                 \
         u32 part7 = extract32from256(r, 2);                                 \
         u32 part8 = extract32from256(r, 3);                                 \
-        if (unlikely(part1)) {                                              \
+        if (unlikely(part1 != ones_u32)) {                                  \
+            part1 = ~part1;                                                 \
             conf_fn(&part1, bucket, offset, confBase, reason, a, ptr,       \
                     &control, &last_match);                                 \
             CHECK_HWLM_TERMINATE_MATCHING;                                  \
         }                                                                   \
-        if (unlikely(part2)) {                                              \
+        if (unlikely(part2 != ones_u32)) {                                  \
+            part2 = ~part2;                                                 \
             conf_fn(&part2, bucket, offset + 2, confBase, reason, a, ptr,   \
                     &control, &last_match);                                 \
         }                                                                   \
-        if (unlikely(part3)) {                                              \
+        if (unlikely(part3 != ones_u32)) {                                  \
+            part3 = ~part3;                                                 \
             conf_fn(&part3, bucket, offset + 4, confBase, reason, a, ptr,   \
                     &control, &last_match);                                 \
             CHECK_HWLM_TERMINATE_MATCHING;                                  \
         }                                                                   \
-        if (unlikely(part4)) {                                              \
+        if (unlikely(part4 != ones_u32)) {                                  \
+            part4 = ~part4;                                                 \
             conf_fn(&part4, bucket, offset + 6, confBase, reason, a, ptr,   \
                     &control, &last_match);                                 \
             CHECK_HWLM_TERMINATE_MATCHING;                                  \
         }                                                                   \
-        if (unlikely(part5)) {                                              \
+        if (unlikely(part5 != ones_u32)) {                                  \
+            part5 = ~part5;                                                 \
             conf_fn(&part5, bucket, offset + 8, confBase, reason, a, ptr,   \
                     &control, &last_match);                                 \
             CHECK_HWLM_TERMINATE_MATCHING;                                  \
         }                                                                   \
-        if (unlikely(part6)) {                                              \
+        if (unlikely(part6 != ones_u32)) {                                  \
+            part6 = ~part6;                                                 \
             conf_fn(&part6, bucket, offset + 10, confBase, reason, a, ptr,  \
                     &control, &last_match);                                 \
             CHECK_HWLM_TERMINATE_MATCHING;                                  \
         }                                                                   \
-        if (unlikely(part7)) {                                              \
+        if (unlikely(part7 != ones_u32)) {                                  \
+            part7 = ~part7;                                                 \
             conf_fn(&part7, bucket, offset + 12, confBase, reason, a, ptr,  \
                     &control, &last_match);                                 \
             CHECK_HWLM_TERMINATE_MATCHING;                                  \
         }                                                                   \
-        if (unlikely(part8)) {                                              \
+        if (unlikely(part8 != ones_u32)) {                                  \
+            part8 = ~part8;                                                 \
             conf_fn(&part8, bucket, offset + 14, confBase, reason, a, ptr,  \
                     &control, &last_match);                                 \
             CHECK_HWLM_TERMINATE_MATCHING;                                  \
         }                                                                   \
     }                                                                       \
-} while (0);
+} while(0)
 #endif
 
 static really_inline
-m256 vectoredLoad2x128(m256 *p_mask, const u8 *ptr, const u8 *lo, const u8 *hi,
+m256 vectoredLoad2x128(m256 *p_mask, const u8 *ptr, const size_t start_offset,
+                       const u8 *lo, const u8 *hi,
                        const u8 *buf_history, size_t len_history,
                        const u32 nMasks) {
     m128 p_mask128;
-    m256 ret = set2x128(vectoredLoad128(&p_mask128, ptr, lo, hi, buf_history,
-                                        len_history, nMasks));
+    m256 ret = set2x128(vectoredLoad128(&p_mask128, ptr, start_offset, lo, hi,
+                                        buf_history, len_history, nMasks));
     *p_mask = set2x128(p_mask128);
     return ret;
 }
@@ -147,8 +229,8 @@ m256 prep_conf_fat_teddy_m1(const m256 *maskBase, m256 val) {
     m256 mask = set32x8(0xf);
     m256 lo = and256(val, mask);
     m256 hi = and256(rshift64_m256(val, 4), mask);
-    return and256(pshufb_m256(maskBase[0*2], lo),
-                  pshufb_m256(maskBase[0*2+1], hi));
+    return or256(pshufb_m256(maskBase[0 * 2], lo),
+                 pshufb_m256(maskBase[0 * 2 + 1], hi));
 }
 
 static really_inline
@@ -158,11 +240,11 @@ m256 prep_conf_fat_teddy_m2(const m256 *maskBase, m256 *old_1, m256 val) {
     m256 hi = and256(rshift64_m256(val, 4), mask);
     m256 r = prep_conf_fat_teddy_m1(maskBase, val);
 
-    m256 res_1 = and256(pshufb_m256(maskBase[1*2], lo),
-                        pshufb_m256(maskBase[1*2+1], hi));
-    m256 res_shifted_1 = vpalignr(res_1, *old_1, 16-1);
+    m256 res_1 = or256(pshufb_m256(maskBase[1 * 2], lo),
+                       pshufb_m256(maskBase[1 * 2 + 1], hi));
+    m256 res_shifted_1 = vpalignr(res_1, *old_1, 16 - 1);
     *old_1 = res_1;
-    return and256(r, res_shifted_1);
+    return or256(r, res_shifted_1);
 }
 
 static really_inline
@@ -173,11 +255,11 @@ m256 prep_conf_fat_teddy_m3(const m256 *maskBase, m256 *old_1, m256 *old_2,
     m256 hi = and256(rshift64_m256(val, 4), mask);
     m256 r = prep_conf_fat_teddy_m2(maskBase, old_1, val);
 
-    m256 res_2 = and256(pshufb_m256(maskBase[2*2], lo),
-                        pshufb_m256(maskBase[2*2+1], hi));
-    m256 res_shifted_2 = vpalignr(res_2, *old_2, 16-2);
+    m256 res_2 = or256(pshufb_m256(maskBase[2 * 2], lo),
+                       pshufb_m256(maskBase[2 * 2 + 1], hi));
+    m256 res_shifted_2 = vpalignr(res_2, *old_2, 16 - 2);
     *old_2 = res_2;
-    return and256(r, res_shifted_2);
+    return or256(r, res_shifted_2);
 }
 
 static really_inline
@@ -188,11 +270,11 @@ m256 prep_conf_fat_teddy_m4(const m256 *maskBase, m256 *old_1, m256 *old_2,
     m256 hi = and256(rshift64_m256(val, 4), mask);
     m256 r = prep_conf_fat_teddy_m3(maskBase, old_1, old_2, val);
 
-    m256 res_3 = and256(pshufb_m256(maskBase[3*2], lo),
-                        pshufb_m256(maskBase[3*2+1], hi));
-    m256 res_shifted_3 = vpalignr(res_3, *old_3, 16-3);
+    m256 res_3 = or256(pshufb_m256(maskBase[3 * 2], lo),
+                       pshufb_m256(maskBase[3 * 2 + 1], hi));
+    m256 res_shifted_3 = vpalignr(res_3, *old_3, 16 - 3);
     *old_3 = res_3;
-    return and256(r, res_shifted_3);
+    return or256(r, res_shifted_3);
 }
 
 static really_inline
@@ -200,486 +282,151 @@ const m256 *getMaskBase_avx2(const struct Teddy *teddy) {
     return (const m256 *)((const u8 *)teddy + ROUNDUP_CL(sizeof(struct Teddy)));
 }
 
+#define FDR_EXEC_FAT_TEDDY_RES_OLD_1                                        \
+do {                                                                        \
+} while(0) // one mask: PREP_CONF_FAT_FN_1 takes no res_old_* state
+
+#define FDR_EXEC_FAT_TEDDY_RES_OLD_2                                        \
+    m256 res_old_1 = zeroes256(); // passed by address in PREP_CONF_FAT_FN_2
+
+#define FDR_EXEC_FAT_TEDDY_RES_OLD_3                                        \
+    m256 res_old_1 = zeroes256();                                           \
+    m256 res_old_2 = zeroes256(); // both passed by address in PREP_CONF_FAT_FN_3
+
+#define FDR_EXEC_FAT_TEDDY_RES_OLD_4                                        \
+    m256 res_old_1 = zeroes256();                                           \
+    m256 res_old_2 = zeroes256();                                           \
+    m256 res_old_3 = zeroes256(); // all three passed in PREP_CONF_FAT_FN_4
+
+#define FDR_EXEC_FAT_TEDDY_RES_OLD(n) FDR_EXEC_FAT_TEDDY_RES_OLD_##n // n: 1..4
+// Per-mask-count call wrappers; 2..4 use res_old_* from the expansion scope.
+#define PREP_CONF_FAT_FN_1(mask_base, val)                                  \
+    prep_conf_fat_teddy_m1(mask_base, val)
+
+#define PREP_CONF_FAT_FN_2(mask_base, val)                                  \
+    prep_conf_fat_teddy_m2(mask_base, &res_old_1, val)
+
+#define PREP_CONF_FAT_FN_3(mask_base, val)                                  \
+    prep_conf_fat_teddy_m3(mask_base, &res_old_1, &res_old_2, val)
+
+#define PREP_CONF_FAT_FN_4(mask_base, val)                                  \
+    prep_conf_fat_teddy_m4(mask_base, &res_old_1, &res_old_2, &res_old_3, val)
+
+#define PREP_CONF_FAT_FN(mask_base, val, n)                                 \
+    PREP_CONF_FAT_FN_##n(mask_base, val) // picks _1.._4 by token pasting
+// Shared body of every fdr_exec_teddy_avx2_msksN[_pck]_fat function below.
+#define FDR_EXEC_FAT_TEDDY(fdr, a, control, n_msk, conf_fn)                 \
+do {                                                                        \
+    const u8 *buf_end = a->buf + a->len;                                    \
+    const u8 *ptr = a->buf + a->start_offset;                               \
+    u32 floodBackoff = FLOOD_BACKOFF_START;                                 \
+    const u8 *tryFloodDetect = a->firstFloodDetect;                         \
+    u32 last_match = (u32)-1;                                               \
+    const struct Teddy *teddy = (const struct Teddy *)fdr;                  \
+    const size_t iterBytes = 32;                                            \
+    DEBUG_PRINTF("params: buf %p len %zu start_offset %zu\n",               \
+                 a->buf, a->len, a->start_offset);                          \
+                                                                            \
+    const m256 *maskBase = getMaskBase_avx2(teddy);                         \
+    const u32 *confBase = getConfBase(teddy);                               \
+                                                                            \
+    FDR_EXEC_FAT_TEDDY_RES_OLD(n_msk);                                      \
+    const u8 *mainStart = ROUNDUP_PTR(ptr, 16);                             \
+    DEBUG_PRINTF("derive: ptr: %p mainstart %p\n", ptr, mainStart);         \
+    if (ptr < mainStart) {                                                  \
+        ptr = mainStart - 16;                                               \
+        m256 p_mask;                                                        \
+        m256 val_0 = vectoredLoad2x128(&p_mask, ptr, a->start_offset,       \
+                                       a->buf, buf_end,                     \
+                                       a->buf_history, a->len_history,      \
+                                       n_msk);                              \
+        m256 r_0 = PREP_CONF_FAT_FN(maskBase, val_0, n_msk);                \
+        r_0 = or256(r_0, p_mask);                                           \
+        CONFIRM_FAT_TEDDY(r_0, 16, 0, VECTORING, conf_fn);                  \
+        ptr += 16;                                                          \
+    }                                                                       \
+                                                                            \
+    if (ptr + 16 <= buf_end) {                                              \
+        m256 r_0 = PREP_CONF_FAT_FN(maskBase, load2x128(ptr), n_msk);       \
+        CONFIRM_FAT_TEDDY(r_0, 16, 0, VECTORING, conf_fn);                  \
+        ptr += 16;                                                          \
+    }                                                                       \
+                                                                            \
+    for ( ; ptr + iterBytes <= buf_end; ptr += iterBytes) {                 \
+        __builtin_prefetch(ptr + (iterBytes * 4));                          \
+        CHECK_FLOOD;                                                        \
+        m256 r_0 = PREP_CONF_FAT_FN(maskBase, load2x128(ptr), n_msk);       \
+        CONFIRM_FAT_TEDDY(r_0, 16, 0, NOT_CAUTIOUS, conf_fn);               \
+        m256 r_1 = PREP_CONF_FAT_FN(maskBase, load2x128(ptr + 16), n_msk);  \
+        CONFIRM_FAT_TEDDY(r_1, 16, 16, NOT_CAUTIOUS, conf_fn);              \
+    }                                                                       \
+                                                                            \
+    if (ptr + 16 <= buf_end) {                                              \
+        m256 r_0 = PREP_CONF_FAT_FN(maskBase, load2x128(ptr), n_msk);       \
+        CONFIRM_FAT_TEDDY(r_0, 16, 0, NOT_CAUTIOUS, conf_fn);               \
+        ptr += 16;                                                          \
+    }                                                                       \
+                                                                            \
+    assert(ptr + 16 > buf_end);                                             \
+    if (ptr < buf_end) {                                                    \
+        m256 p_mask;                                                        \
+        m256 val_0 = vectoredLoad2x128(&p_mask, ptr, 0, ptr, buf_end,       \
+                                       a->buf_history, a->len_history,      \
+                                       n_msk);                              \
+        m256 r_0 = PREP_CONF_FAT_FN(maskBase, val_0, n_msk);                \
+        r_0 = or256(r_0, p_mask);                                           \
+        CONFIRM_FAT_TEDDY(r_0, 16, 0, VECTORING, conf_fn);                  \
+    }                                                                       \
+                                                                            \
+    return HWLM_SUCCESS;                                                    \
+} while(0) // note: the body above returns from the enclosing function
+
 hwlm_error_t fdr_exec_teddy_avx2_msks1_fat(const struct FDR *fdr,
                                            const struct FDR_Runtime_Args *a,
                                            hwlm_group_t control) {
-    const u8 *buf_end = a->buf + a->len;
-    const u8 *ptr = a->buf + a->start_offset;
-    u32 floodBackoff = FLOOD_BACKOFF_START;
-    const u8 *tryFloodDetect = a->firstFloodDetect;
-    u32 last_match = (u32)-1;
-    const struct Teddy *teddy = (const struct Teddy *)fdr;
-    const size_t iterBytes = 32;
-    DEBUG_PRINTF("params: buf %p len %zu start_offset %zu\n",
-                 a->buf, a->len, a->start_offset);
-
-    const m256 *maskBase = getMaskBase_avx2(teddy);
-    const u32 *confBase = getConfBase(teddy);
-
-    const u8 *mainStart = ROUNDUP_PTR(ptr, 16);
-    DEBUG_PRINTF("derive: ptr: %p mainstart %p\n", ptr, mainStart);
-    if (ptr < mainStart) {
-        ptr = mainStart - 16;
-        m256 p_mask;
-        m256 val_0 = vectoredLoad2x128(&p_mask, ptr, a->buf, buf_end,
-                                       a->buf_history, a->len_history, 1);
-        m256 r_0 = prep_conf_fat_teddy_m1(maskBase, val_0);
-        r_0 = and256(r_0, p_mask);
-        CONFIRM_FAT_TEDDY(r_0, 16, 0, VECTORING, do_confWithBit1_teddy);
-        ptr += 16;
-    }
-
-    if (ptr + 16 < buf_end) {
-        m256 r_0 = prep_conf_fat_teddy_m1(maskBase, load2x128(ptr));
-        CONFIRM_FAT_TEDDY(r_0, 16, 0, VECTORING, do_confWithBit1_teddy);
-        ptr += 16;
-    }
-
-    for ( ; ptr + iterBytes <= buf_end; ptr += iterBytes) {
-        __builtin_prefetch(ptr + (iterBytes*4));
-        CHECK_FLOOD;
-        m256 r_0 = prep_conf_fat_teddy_m1(maskBase, load2x128(ptr));
-        CONFIRM_FAT_TEDDY(r_0, 16, 0, NOT_CAUTIOUS, do_confWithBit1_teddy);
-        m256 r_1 = prep_conf_fat_teddy_m1(maskBase, load2x128(ptr + 16));
-        CONFIRM_FAT_TEDDY(r_1, 16, 16, NOT_CAUTIOUS, do_confWithBit1_teddy);
-    }
-
-    for (; ptr < buf_end; ptr += 16) {
-        m256 p_mask;
-        m256 val_0 = vectoredLoad2x128(&p_mask, ptr, a->buf, buf_end,
-                                       a->buf_history, a->len_history, 1);
-        m256 r_0 = prep_conf_fat_teddy_m1(maskBase, val_0);
-        r_0 = and256(r_0, p_mask);
-        CONFIRM_FAT_TEDDY(r_0, 16, 0, VECTORING, do_confWithBit1_teddy);
-    }
-
-    return HWLM_SUCCESS;
+    FDR_EXEC_FAT_TEDDY(fdr, a, control, 1, do_confWithBit_teddy);
 }
 
 hwlm_error_t fdr_exec_teddy_avx2_msks1_pck_fat(const struct FDR *fdr,
                                                const struct FDR_Runtime_Args *a,
                                                hwlm_group_t control) {
-    const u8 *buf_end = a->buf + a->len;
-    const u8 *ptr = a->buf + a->start_offset;
-    u32 floodBackoff = FLOOD_BACKOFF_START;
-    const u8 *tryFloodDetect = a->firstFloodDetect;
-    u32 last_match = (u32)-1;
-    const struct Teddy *teddy = (const struct Teddy *)fdr;
-    const size_t iterBytes = 32;
-    DEBUG_PRINTF("params: buf %p len %zu start_offset %zu\n",
-                 a->buf, a->len, a->start_offset);
-
-    const m256 *maskBase = getMaskBase_avx2(teddy);
-    const u32 *confBase = getConfBase(teddy);
-
-    const u8 *mainStart = ROUNDUP_PTR(ptr, 16);
-    DEBUG_PRINTF("derive: ptr: %p mainstart %p\n", ptr, mainStart);
-    if (ptr < mainStart) {
-        ptr = mainStart - 16;
-        m256 p_mask;
-        m256 val_0 = vectoredLoad2x128(&p_mask, ptr, a->buf, buf_end,
-                                       a->buf_history, a->len_history, 1);
-        m256 r_0 = prep_conf_fat_teddy_m1(maskBase, val_0);
-        r_0 = and256(r_0, p_mask);
-        CONFIRM_FAT_TEDDY(r_0, 16, 0, VECTORING, do_confWithBit_teddy);
-        ptr += 16;
-    }
-
-    if (ptr + 16 < buf_end) {
-        m256 r_0 = prep_conf_fat_teddy_m1(maskBase, load2x128(ptr));
-        CONFIRM_FAT_TEDDY(r_0, 16, 0, VECTORING, do_confWithBit_teddy);
-        ptr += 16;
-    }
-
-    for ( ; ptr + iterBytes <= buf_end; ptr += iterBytes) {
-        __builtin_prefetch(ptr + (iterBytes*4));
-        CHECK_FLOOD;
-        m256 r_0 = prep_conf_fat_teddy_m1(maskBase, load2x128(ptr));
-        CONFIRM_FAT_TEDDY(r_0, 16, 0, NOT_CAUTIOUS, do_confWithBit_teddy);
-        m256 r_1 = prep_conf_fat_teddy_m1(maskBase, load2x128(ptr + 16));
-        CONFIRM_FAT_TEDDY(r_1, 16, 16, NOT_CAUTIOUS, do_confWithBit_teddy);
-    }
-
-    for (; ptr < buf_end; ptr += 16) {
-        m256 p_mask;
-        m256 val_0 = vectoredLoad2x128(&p_mask, ptr, a->buf, buf_end,
-                                       a->buf_history, a->len_history, 1);
-        m256 r_0 = prep_conf_fat_teddy_m1(maskBase, val_0);
-        r_0 = and256(r_0, p_mask);
-        CONFIRM_FAT_TEDDY(r_0, 16, 0, VECTORING, do_confWithBit_teddy);
-    }
-
-    return HWLM_SUCCESS;
+    FDR_EXEC_FAT_TEDDY(fdr, a, control, 1, do_confWithBit_teddy);
 }
 
 hwlm_error_t fdr_exec_teddy_avx2_msks2_fat(const struct FDR *fdr,
                                            const struct FDR_Runtime_Args *a,
                                            hwlm_group_t control) {
-    const u8 *buf_end = a->buf + a->len;
-    const u8 *ptr = a->buf + a->start_offset;
-    u32 floodBackoff = FLOOD_BACKOFF_START;
-    const u8 *tryFloodDetect = a->firstFloodDetect;
-    u32 last_match = (u32)-1;
-    const struct Teddy *teddy = (const struct Teddy *)fdr;
-    const size_t iterBytes = 32;
-    DEBUG_PRINTF("params: buf %p len %zu start_offset %zu\n",
-                 a->buf, a->len, a->start_offset);
-
-    const m256 *maskBase = getMaskBase_avx2(teddy);
-    const u32 *confBase = getConfBase(teddy);
-
-    m256 res_old_1 = ones256();
-    const u8 *mainStart = ROUNDUP_PTR(ptr, 16);
-    DEBUG_PRINTF("derive: ptr: %p mainstart %p\n", ptr, mainStart);
-    if (ptr < mainStart) {
-        ptr = mainStart - 16;
-        m256 p_mask;
-        m256 val_0 = vectoredLoad2x128(&p_mask, ptr, a->buf, buf_end,
-                                       a->buf_history, a->len_history, 2);
-        m256 r_0 = prep_conf_fat_teddy_m2(maskBase, &res_old_1, val_0);
-        r_0 = and256(r_0, p_mask);
-        CONFIRM_FAT_TEDDY(r_0, 16, 0, VECTORING, do_confWithBitMany_teddy);
-        ptr += 16;
-    }
-
-    if (ptr + 16 < buf_end) {
-        m256 r_0 = prep_conf_fat_teddy_m2(maskBase, &res_old_1, load2x128(ptr));
-        CONFIRM_FAT_TEDDY(r_0, 16, 0, VECTORING, do_confWithBitMany_teddy);
-        ptr += 16;
-    }
-
-    for ( ; ptr + iterBytes <= buf_end; ptr += iterBytes) {
-        __builtin_prefetch(ptr + (iterBytes*4));
-        CHECK_FLOOD;
-        m256 r_0 = prep_conf_fat_teddy_m2(maskBase, &res_old_1, load2x128(ptr));
-        CONFIRM_FAT_TEDDY(r_0, 16, 0, NOT_CAUTIOUS, do_confWithBitMany_teddy);
-        m256 r_1 = prep_conf_fat_teddy_m2(maskBase, &res_old_1,
-                                          load2x128(ptr + 16));
-        CONFIRM_FAT_TEDDY(r_1, 16, 16, NOT_CAUTIOUS, do_confWithBitMany_teddy);
-    }
-
-    for (; ptr < buf_end; ptr += 16) {
-        m256 p_mask;
-        m256 val_0 = vectoredLoad2x128(&p_mask, ptr, a->buf, buf_end,
-                                       a->buf_history, a->len_history, 2);
-        m256 r_0 = prep_conf_fat_teddy_m2(maskBase, &res_old_1, val_0);
-        r_0 = and256(r_0, p_mask);
-        CONFIRM_FAT_TEDDY(r_0, 16, 0, VECTORING, do_confWithBitMany_teddy);
-    }
-
-    return HWLM_SUCCESS;
+    FDR_EXEC_FAT_TEDDY(fdr, a, control, 2, do_confWithBit_teddy);
 }
 
 hwlm_error_t fdr_exec_teddy_avx2_msks2_pck_fat(const struct FDR *fdr,
                                                const struct FDR_Runtime_Args *a,
                                                hwlm_group_t control) {
-    const u8 *buf_end = a->buf + a->len;
-    const u8 *ptr = a->buf + a->start_offset;
-    u32 floodBackoff = FLOOD_BACKOFF_START;
-    const u8 *tryFloodDetect = a->firstFloodDetect;
-    u32 last_match = (u32)-1;
-    const struct Teddy *teddy = (const struct Teddy *)fdr;
-    const size_t iterBytes = 32;
-    DEBUG_PRINTF("params: buf %p len %zu start_offset %zu\n",
-                 a->buf, a->len, a->start_offset);
-
-    const m256 *maskBase = getMaskBase_avx2(teddy);
-    const u32 *confBase = getConfBase(teddy);
-
-    m256 res_old_1 = ones256();
-    const u8 *mainStart = ROUNDUP_PTR(ptr, 16);
-    DEBUG_PRINTF("derive: ptr: %p mainstart %p\n", ptr, mainStart);
-    if (ptr < mainStart) {
-        ptr = mainStart - 16;
-        m256 p_mask;
-        m256 val_0 = vectoredLoad2x128(&p_mask, ptr, a->buf, buf_end,
-                                       a->buf_history, a->len_history, 2);
-        m256 r_0 = prep_conf_fat_teddy_m2(maskBase, &res_old_1, val_0);
-        r_0 = and256(r_0, p_mask);
-        CONFIRM_FAT_TEDDY(r_0, 16, 0, VECTORING, do_confWithBit_teddy);
-        ptr += 16;
-    }
-
-    if (ptr + 16 < buf_end) {
-        m256 r_0 = prep_conf_fat_teddy_m2(maskBase, &res_old_1, load2x128(ptr));
-        CONFIRM_FAT_TEDDY(r_0, 16, 0, VECTORING, do_confWithBit_teddy);
-        ptr += 16;
-    }
-
-    for ( ; ptr + iterBytes <= buf_end; ptr += iterBytes) {
-        __builtin_prefetch(ptr + (iterBytes*4));
-        CHECK_FLOOD;
-        m256 r_0 = prep_conf_fat_teddy_m2(maskBase, &res_old_1, load2x128(ptr));
-        CONFIRM_FAT_TEDDY(r_0, 16, 0, NOT_CAUTIOUS, do_confWithBit_teddy);
-        m256 r_1 = prep_conf_fat_teddy_m2(maskBase, &res_old_1,
-                                          load2x128(ptr + 16));
-        CONFIRM_FAT_TEDDY(r_1, 16, 16, NOT_CAUTIOUS, do_confWithBit_teddy);
-    }
-
-    for (; ptr < buf_end; ptr += 16) {
-        m256 p_mask;
-        m256 val_0 = vectoredLoad2x128(&p_mask, ptr, a->buf, buf_end,
-                                       a->buf_history, a->len_history, 2);
-        m256 r_0 = prep_conf_fat_teddy_m2(maskBase, &res_old_1, val_0);
-        r_0 = and256(r_0, p_mask);
-        CONFIRM_FAT_TEDDY(r_0, 16, 0, VECTORING, do_confWithBit_teddy);
-    }
-
-    return HWLM_SUCCESS;
+    FDR_EXEC_FAT_TEDDY(fdr, a, control, 2, do_confWithBit_teddy);
 }
 
 hwlm_error_t fdr_exec_teddy_avx2_msks3_fat(const struct FDR *fdr,
                                            const struct FDR_Runtime_Args *a,
                                            hwlm_group_t control) {
-    const u8 *buf_end = a->buf + a->len;
-    const u8 *ptr = a->buf + a->start_offset;
-    u32 floodBackoff = FLOOD_BACKOFF_START;
-    const u8 *tryFloodDetect = a->firstFloodDetect;
-    u32 last_match = (u32)-1;
-    const struct Teddy *teddy = (const struct Teddy *)fdr;
-    const size_t iterBytes = 32;
-    DEBUG_PRINTF("params: buf %p len %zu start_offset %zu\n",
-                 a->buf, a->len, a->start_offset);
-
-    const m256 *maskBase = getMaskBase_avx2(teddy);
-    const u32 *confBase = getConfBase(teddy);
-
-    m256 res_old_1 = ones256();
-    m256 res_old_2 = ones256();
-    const u8 *mainStart = ROUNDUP_PTR(ptr, 16);
-    DEBUG_PRINTF("derive: ptr: %p mainstart %p\n", ptr, mainStart);
-    if (ptr < mainStart) {
-        ptr = mainStart - 16;
-        m256 p_mask;
-        m256 val_0 = vectoredLoad2x128(&p_mask, ptr, a->buf, buf_end,
-                                       a->buf_history, a->len_history, 3);
-        m256 r_0 = prep_conf_fat_teddy_m3(maskBase, &res_old_1, &res_old_2,
-                                          val_0);
-        r_0 = and256(r_0, p_mask);
-        CONFIRM_FAT_TEDDY(r_0, 16, 0, VECTORING, do_confWithBitMany_teddy);
-        ptr += 16;
-    }
-
-    if (ptr + 16 < buf_end) {
-        m256 r_0 = prep_conf_fat_teddy_m3(maskBase, &res_old_1, &res_old_2,
-                                          load2x128(ptr));
-        CONFIRM_FAT_TEDDY(r_0, 16, 0, VECTORING, do_confWithBitMany_teddy);
-        ptr += 16;
-    }
-
-    for ( ; ptr + iterBytes <= buf_end; ptr += iterBytes) {
-        __builtin_prefetch(ptr + (iterBytes*4));
-        CHECK_FLOOD;
-        m256 r_0 = prep_conf_fat_teddy_m3(maskBase, &res_old_1, &res_old_2,
-                                          load2x128(ptr));
-        CONFIRM_FAT_TEDDY(r_0, 16, 0, NOT_CAUTIOUS, do_confWithBitMany_teddy);
-        m256 r_1 = prep_conf_fat_teddy_m3(maskBase, &res_old_1, &res_old_2,
-                                          load2x128(ptr + 16));
-        CONFIRM_FAT_TEDDY(r_1, 16, 16, NOT_CAUTIOUS, do_confWithBitMany_teddy);
-    }
-
-    for (; ptr < buf_end; ptr += 16) {
-        m256 p_mask;
-        m256 val_0 = vectoredLoad2x128(&p_mask, ptr, a->buf, buf_end,
-                                       a->buf_history, a->len_history, 3);
-        m256 r_0 = prep_conf_fat_teddy_m3(maskBase, &res_old_1, &res_old_2,
-                                          val_0);
-        r_0 = and256(r_0, p_mask);
-        CONFIRM_FAT_TEDDY(r_0, 16, 0, VECTORING, do_confWithBitMany_teddy);
-    }
-
-    return HWLM_SUCCESS;
+    FDR_EXEC_FAT_TEDDY(fdr, a, control, 3, do_confWithBit_teddy);
 }
 
 hwlm_error_t fdr_exec_teddy_avx2_msks3_pck_fat(const struct FDR *fdr,
                                                const struct FDR_Runtime_Args *a,
                                                hwlm_group_t control) {
-    const u8 *buf_end = a->buf + a->len;
-    const u8 *ptr = a->buf + a->start_offset;
-    u32 floodBackoff = FLOOD_BACKOFF_START;
-    const u8 *tryFloodDetect = a->firstFloodDetect;
-    u32 last_match = (u32)-1;
-    const struct Teddy *teddy = (const struct Teddy *)fdr;
-    const size_t iterBytes = 32;
-    DEBUG_PRINTF("params: buf %p len %zu start_offset %zu\n",
-                 a->buf, a->len, a->start_offset);
-
-    const m256 *maskBase = getMaskBase_avx2(teddy);
-    const u32 *confBase = getConfBase(teddy);
-
-    m256 res_old_1 = ones256();
-    m256 res_old_2 = ones256();
-    const u8 *mainStart = ROUNDUP_PTR(ptr, 16);
-    DEBUG_PRINTF("derive: ptr: %p mainstart %p\n", ptr, mainStart);
-    if (ptr < mainStart) {
-        ptr = mainStart - 16;
-        m256 p_mask;
-        m256 val_0 = vectoredLoad2x128(&p_mask, ptr, a->buf, buf_end,
-                                       a->buf_history, a->len_history, 3);
-        m256 r_0 = prep_conf_fat_teddy_m3(maskBase, &res_old_1, &res_old_2,
-                                          val_0);
-        r_0 = and256(r_0, p_mask);
-        CONFIRM_FAT_TEDDY(r_0, 16, 0, VECTORING, do_confWithBit_teddy);
-        ptr += 16;
-    }
-
-    if (ptr + 16 < buf_end) {
-        m256 r_0 = prep_conf_fat_teddy_m3(maskBase, &res_old_1, &res_old_2,
-                                          load2x128(ptr));
-        CONFIRM_FAT_TEDDY(r_0, 16, 0, VECTORING, do_confWithBit_teddy);
-        ptr += 16;
-    }
-
-    for ( ; ptr + iterBytes <= buf_end; ptr += iterBytes) {
-        __builtin_prefetch(ptr + (iterBytes*4));
-        CHECK_FLOOD;
-        m256 r_0 = prep_conf_fat_teddy_m3(maskBase, &res_old_1, &res_old_2,
-                                          load2x128(ptr));
-        CONFIRM_FAT_TEDDY(r_0, 16, 0, NOT_CAUTIOUS, do_confWithBit_teddy);
-        m256 r_1 = prep_conf_fat_teddy_m3(maskBase, &res_old_1, &res_old_2,
-                                          load2x128(ptr + 16));
-        CONFIRM_FAT_TEDDY(r_1, 16, 16, NOT_CAUTIOUS, do_confWithBit_teddy);
-    }
-
-    for (; ptr < buf_end; ptr += 16) {
-        m256 p_mask;
-        m256 val_0 = vectoredLoad2x128(&p_mask, ptr, a->buf, buf_end,
-                                       a->buf_history, a->len_history, 3);
-        m256 r_0 = prep_conf_fat_teddy_m3(maskBase, &res_old_1, &res_old_2,
-                                          val_0);
-        r_0 = and256(r_0, p_mask);
-        CONFIRM_FAT_TEDDY(r_0, 16, 0, VECTORING, do_confWithBit_teddy);
-    }
-
-    return HWLM_SUCCESS;
+    FDR_EXEC_FAT_TEDDY(fdr, a, control, 3, do_confWithBit_teddy);
 }
 
 hwlm_error_t fdr_exec_teddy_avx2_msks4_fat(const struct FDR *fdr,
                                            const struct FDR_Runtime_Args *a,
                                            hwlm_group_t control) {
-    const u8 *buf_end = a->buf + a->len;
-    const u8 *ptr = a->buf + a->start_offset;
-    u32 floodBackoff = FLOOD_BACKOFF_START;
-    const u8 *tryFloodDetect = a->firstFloodDetect;
-    u32 last_match = (u32)-1;
-    const struct Teddy *teddy = (const struct Teddy *)fdr;
-    const size_t iterBytes = 32;
-    DEBUG_PRINTF("params: buf %p len %zu start_offset %zu\n",
-                 a->buf, a->len, a->start_offset);
-
-    const m256 *maskBase = getMaskBase_avx2(teddy);
-    const u32 *confBase = getConfBase(teddy);
-
-    m256 res_old_1 = ones256();
-    m256 res_old_2 = ones256();
-    m256 res_old_3 = ones256();
-    const u8 *mainStart = ROUNDUP_PTR(ptr, 16);
-    DEBUG_PRINTF("derive: ptr: %p mainstart %p\n", ptr, mainStart);
-    if (ptr < mainStart) {
-        ptr = mainStart - 16;
-        m256 p_mask;
-        m256 val_0 = vectoredLoad2x128(&p_mask, ptr, a->buf, buf_end,
-                                       a->buf_history, a->len_history, 4);
-        m256 r_0 = prep_conf_fat_teddy_m4(maskBase, &res_old_1, &res_old_2,
-                                          &res_old_3, val_0);
-        r_0 = and256(r_0, p_mask);
-        CONFIRM_FAT_TEDDY(r_0, 16, 0, VECTORING, do_confWithBitMany_teddy);
-        ptr += 16;
-    }
-
-    if (ptr + 16 < buf_end) {
-        m256 r_0 = prep_conf_fat_teddy_m4(maskBase, &res_old_1, &res_old_2,
-                                          &res_old_3, load2x128(ptr));
-        CONFIRM_FAT_TEDDY(r_0, 16, 0, VECTORING, do_confWithBitMany_teddy);
-        ptr += 16;
-    }
-
-    for ( ; ptr + iterBytes <= buf_end; ptr += iterBytes) {
-        __builtin_prefetch(ptr + (iterBytes*4));
-        CHECK_FLOOD;
-        m256 r_0 = prep_conf_fat_teddy_m4(maskBase, &res_old_1, &res_old_2,
-                                          &res_old_3, load2x128(ptr));
-        CONFIRM_FAT_TEDDY(r_0, 16, 0, NOT_CAUTIOUS, do_confWithBitMany_teddy);
-        m256 r_1 = prep_conf_fat_teddy_m4(maskBase, &res_old_1, &res_old_2,
-                                          &res_old_3, load2x128(ptr + 16));
-        CONFIRM_FAT_TEDDY(r_1, 16, 16, NOT_CAUTIOUS, do_confWithBitMany_teddy);
-    }
-
-    for (; ptr < buf_end; ptr += 16) {
-        m256 p_mask;
-        m256 val_0 = vectoredLoad2x128(&p_mask, ptr, a->buf, buf_end,
-                                       a->buf_history, a->len_history, 4);
-        m256 r_0 = prep_conf_fat_teddy_m4(maskBase, &res_old_1, &res_old_2,
-                                          &res_old_3, val_0);
-        r_0 = and256(r_0, p_mask);
-        CONFIRM_FAT_TEDDY(r_0, 16, 0, VECTORING, do_confWithBitMany_teddy);
-    }
-
-    return HWLM_SUCCESS;
+    FDR_EXEC_FAT_TEDDY(fdr, a, control, 4, do_confWithBit_teddy);
 }
 
 hwlm_error_t fdr_exec_teddy_avx2_msks4_pck_fat(const struct FDR *fdr,
                                                const struct FDR_Runtime_Args *a,
                                                hwlm_group_t control) {
-    const u8 *buf_end = a->buf + a->len;
-    const u8 *ptr = a->buf + a->start_offset;
-    u32 floodBackoff = FLOOD_BACKOFF_START;
-    const u8 *tryFloodDetect = a->firstFloodDetect;
-    u32 last_match = (u32)-1;
-    const struct Teddy *teddy = (const struct Teddy *)fdr;
-    const size_t iterBytes = 32;
-    DEBUG_PRINTF("params: buf %p len %zu start_offset %zu\n",
-                 a->buf, a->len, a->start_offset);
-
-    const m256 *maskBase = getMaskBase_avx2(teddy);
-    const u32 *confBase = getConfBase(teddy);
-
-    m256 res_old_1 = ones256();
-    m256 res_old_2 = ones256();
-    m256 res_old_3 = ones256();
-    const u8 *mainStart = ROUNDUP_PTR(ptr, 16);
-    DEBUG_PRINTF("derive: ptr: %p mainstart %p\n", ptr, mainStart);
-    if (ptr < mainStart) {
-        ptr = mainStart - 16;
-        m256 p_mask;
-        m256 val_0 = vectoredLoad2x128(&p_mask, ptr, a->buf, buf_end,
-                                       a->buf_history, a->len_history, 4);
-        m256 r_0 = prep_conf_fat_teddy_m4(maskBase, &res_old_1, &res_old_2,
-                                          &res_old_3, val_0);
-        r_0 = and256(r_0, p_mask);
-        CONFIRM_FAT_TEDDY(r_0, 16, 0, VECTORING, do_confWithBit_teddy);
-        ptr += 16;
-    }
-
-    if (ptr + 16 < buf_end) {
-        m256 r_0 = prep_conf_fat_teddy_m4(maskBase, &res_old_1, &res_old_2,
-                                          &res_old_3, load2x128(ptr));
-        CONFIRM_FAT_TEDDY(r_0, 16, 0, VECTORING, do_confWithBit_teddy);
-        ptr += 16;
-    }
-
-    for ( ; ptr + iterBytes <= buf_end; ptr += iterBytes) {
-        __builtin_prefetch(ptr + (iterBytes*4));
-        CHECK_FLOOD;
-        m256 r_0 = prep_conf_fat_teddy_m4(maskBase, &res_old_1, &res_old_2,
-                                          &res_old_3, load2x128(ptr));
-        CONFIRM_FAT_TEDDY(r_0, 16, 0, NOT_CAUTIOUS, do_confWithBit_teddy);
-        m256 r_1 = prep_conf_fat_teddy_m4(maskBase, &res_old_1, &res_old_2,
-                                          &res_old_3, load2x128(ptr + 16));
-        CONFIRM_FAT_TEDDY(r_1, 16, 16, NOT_CAUTIOUS, do_confWithBit_teddy);
-    }
-
-    for (; ptr < buf_end; ptr += 16) {
-        m256 p_mask;
-        m256 val_0 = vectoredLoad2x128(&p_mask, ptr, a->buf, buf_end,
-                                       a->buf_history, a->len_history, 4);
-        m256 r_0 = prep_conf_fat_teddy_m4(maskBase, &res_old_1, &res_old_2,
-                                          &res_old_3, val_0);
-        r_0 = and256(r_0, p_mask);
-        CONFIRM_FAT_TEDDY(r_0, 16, 0, VECTORING, do_confWithBit_teddy);
-    }
-
-    return HWLM_SUCCESS;
+    FDR_EXEC_FAT_TEDDY(fdr, a, control, 4, do_confWithBit_teddy);
 }
 
 #endif // HAVE_AVX2
index 663d048378fd571aadb1b6a6a14c0eadf7ac3b30..14f19354d088cff01bbc0accc4f1d03362163238 100644 (file)
@@ -309,74 +309,65 @@ bool TeddyCompiler::pack(map<BucketIndex,
     return true;
 }
 
-bytecode_ptr<FDR> TeddyCompiler::build() {
-    assert(eng.numMasks <= MAX_NUM_MASKS);
+// index of the extra, all-zero table entry that is used to skip reinforcement
+#define NO_REINFORCEMENT N_CHARS
 
-    if (lits.size() > eng.getNumBuckets() * TEDDY_BUCKET_LOAD) {
-        DEBUG_PRINTF("too many literals: %zu\n", lits.size());
-        return nullptr;
+// sentinel value of 'c' meaning "every entry in the reinforcement table"
+#define ALL_CHAR_SET N_CHARS
+
+// each item's reinforcement mask has REINFORCED_MSK_LEN bytes
+#define REINFORCED_MSK_LEN 8
+// Preset the first N_CHARS entries of the reinforcement table.
+static
+void initReinforcedTable(u8 *reinforcedMsk) {
+    u64a *mask = (u64a *)reinforcedMsk; // assumes sizeof(u64a) == REINFORCED_MSK_LEN
+    fill_n(mask, N_CHARS, 0x00ffffffffffffffULL); // entry N_CHARS is not written
+}
+// Zero the NO_REINFORCEMENT entry, i.e. the entry at index N_CHARS.
+static
+void fillReinforcedMskZero(u8 *reinforcedMsk) {
+    u8 *mc = reinforcedMsk + NO_REINFORCEMENT * REINFORCED_MSK_LEN; // entry start
+    fill_n(mc, REINFORCED_MSK_LEN, 0x00);
+}
+// Clear the bmsk bits in byte (j - 1) of entry c, or of every entry.
+static
+void fillReinforcedMsk(u8 *reinforcedMsk, u16 c, u32 j, u8 bmsk) {
+    assert(j > 0); // j is 1-based: byte j - 1 is the one updated below
+    if (c == ALL_CHAR_SET) { // sentinel: update entries 0 .. N_CHARS - 1
+        for (size_t i = 0; i < N_CHARS; i++) {
+            u8 *mc = reinforcedMsk + i * REINFORCED_MSK_LEN;
+            mc[j - 1] &= ~bmsk;
+        }
+    } else {
+        u8 *mc = reinforcedMsk + c * REINFORCED_MSK_LEN; // c is not range-checked
+        mc[j - 1] &= ~bmsk; // j is not checked against REINFORCED_MSK_LEN
     }
+}
 
 #ifdef TEDDY_DEBUG
-    for (size_t i = 0; i < lits.size(); i++) {
-        printf("lit %zu (len = %zu, %s) is ", i, lits[i].s.size(),
-               lits[i].nocase ? "caseless" : "caseful");
-        for (size_t j = 0; j < lits[i].s.size(); j++) {
-            printf("%02x", ((u32)lits[i].s[j]) & 0xff);
+static // compiled only when TEDDY_DEBUG is defined
+void dumpReinforcedMaskTable(const u8 *msks) { // one output row per table entry
+    for (u32 i = 0; i <= N_CHARS; i++) { // <=: includes the extra entry N_CHARS
+        printf("0x%02x: ", i);
+        for (u32 j = 0; j < REINFORCED_MSK_LEN; j++) {
+            u8 val = msks[i * REINFORCED_MSK_LEN + j];
+            for (u32 k = 0; k < 8; k++) { // bits printed least-significant first
+                printf("%s", ((val >> k) & 0x1) ? "1" : "0");
+            }
+            printf(" ");
         }
         printf("\n");
     }
+}
 #endif
 
-    map<BucketIndex, std::vector<LiteralIndex>> bucketToLits;
-    if (eng.needConfirm(lits)) {
-        if (!pack(bucketToLits)) {
-            DEBUG_PRINTF("more lits (%zu) than buckets (%u), can't pack.\n",
-                         lits.size(), eng.getNumBuckets());
-            return nullptr;
-        }
-    } else {
-        for (u32 i = 0; i < lits.size(); i++) {
-            bucketToLits[i].push_back(i);
-        }
-    }
-    u32 maskWidth = eng.getNumBuckets() / 8;
-
-    size_t headerSize = sizeof(Teddy);
-    size_t maskLen = eng.numMasks * 16 * 2 * maskWidth;
-
-    auto floodTable = setupFDRFloodControl(lits, eng, grey);
-    auto confirmTable = setupFullConfs(lits, eng, bucketToLits, make_small);
-
-    // Note: we place each major structure here on a cacheline boundary.
-    size_t size = ROUNDUP_CL(headerSize) + ROUNDUP_CL(maskLen) +
-                  ROUNDUP_CL(confirmTable.size()) + floodTable.size();
-
-    auto fdr = make_zeroed_bytecode_ptr<FDR>(size, 64);
-    assert(fdr); // otherwise would have thrown std::bad_alloc
-    Teddy *teddy = (Teddy *)fdr.get(); // ugly
-    u8 *teddy_base = (u8 *)teddy;
-
-    // Write header.
-    teddy->size = size;
-    teddy->engineID = eng.getID();
-    teddy->maxStringLen = verify_u32(maxLen(lits));
-
-    // Write confirm structures.
-    u8 *ptr = teddy_base + ROUNDUP_CL(headerSize) + ROUNDUP_CL(maskLen);
-    assert(ISALIGNED_CL(ptr));
-    teddy->confOffset = verify_u32(ptr - teddy_base);
-    memcpy(ptr, confirmTable.get(), confirmTable.size());
-    ptr += ROUNDUP_CL(confirmTable.size());
-
-    // Write flood control structures.
-    assert(ISALIGNED_CL(ptr));
-    teddy->floodOffset = verify_u32(ptr - teddy_base);
-    memcpy(ptr, floodTable.get(), floodTable.size());
-    ptr += floodTable.size();
-
-    // Write teddy masks.
-    u8 *baseMsk = teddy_base + ROUNDUP_CL(headerSize);
+static
+void fillNibbleMasks(const map<BucketIndex,
+                               vector<LiteralIndex>> &bucketToLits,
+                     const vector<hwlmLiteral> &lits,
+                     u32 numMasks, u32 maskWidth, size_t maskLen,
+                     u8 *baseMsk) {
+    memset(baseMsk, 0xff, maskLen);
 
     for (const auto &b2l : bucketToLits) {
         const u32 &bucket_id = b2l.first;
@@ -389,7 +380,7 @@ bytecode_ptr<FDR> TeddyCompiler::build() {
             const u32 sz = verify_u32(l.s.size());
 
             // fill in masks
-            for (u32 j = 0; j < eng.numMasks; j++) {
+            for (u32 j = 0; j < numMasks; j++) {
                 const u32 msk_id_lo = j * 2 * maskWidth + (bucket_id / 8);
                 const u32 msk_id_hi = (j * 2 + 1) * maskWidth + (bucket_id / 8);
                 const u32 lo_base = msk_id_lo * 16;
@@ -399,8 +390,8 @@ bytecode_ptr<FDR> TeddyCompiler::build() {
                 // locations in these masks with '1'
                 if (j >= sz) {
                     for (u32 n = 0; n < 16; n++) {
-                        baseMsk[lo_base + n] |= bmsk;
-                        baseMsk[hi_base + n] |= bmsk;
+                        baseMsk[lo_base + n] &= ~bmsk;
+                        baseMsk[hi_base + n] &= ~bmsk;
                     }
                 } else {
                     u8 c = l.s[sz - 1 - j];
@@ -419,27 +410,139 @@ bytecode_ptr<FDR> TeddyCompiler::build() {
 
                         for (u8 cm = 0; cm < 0x10; cm++) {
                             if ((cm & m_lo) == (cmp_lo & m_lo)) {
-                                baseMsk[lo_base + cm] |= bmsk;
+                                baseMsk[lo_base + cm] &= ~bmsk;
                             }
                             if ((cm & m_hi) == (cmp_hi & m_hi)) {
-                                baseMsk[hi_base + cm] |= bmsk;
+                                baseMsk[hi_base + cm] &= ~bmsk;
                             }
                         }
                     } else {
                         if (l.nocase && ourisalpha(c)) {
                             u32 cmHalfClear = (0xdf >> hiShift) & 0xf;
                             u32 cmHalfSet = (0x20 >> hiShift) & 0xf;
-                            baseMsk[hi_base + (n_hi & cmHalfClear)] |= bmsk;
-                            baseMsk[hi_base + (n_hi | cmHalfSet)] |= bmsk;
+                            baseMsk[hi_base + (n_hi & cmHalfClear)] &= ~bmsk;
+                            baseMsk[hi_base + (n_hi | cmHalfSet)] &= ~bmsk;
                         } else {
-                            baseMsk[hi_base + n_hi] |= bmsk;
+                            baseMsk[hi_base + n_hi] &= ~bmsk;
                         }
-                        baseMsk[lo_base + n_lo] |= bmsk;
+                        baseMsk[lo_base + n_lo] &= ~bmsk;
                     }
                 }
             }
         }
     }
+}
+
+static // Fills the reinforced mask table from every bucket's literals.
+void fillReinforcedTable(const map<BucketIndex,
+                                   vector<LiteralIndex>> &bucketToLits,
+                         const vector<hwlmLiteral> &lits,
+                         u8 *reinforcedMsk) { // out: table to populate
+    initReinforcedTable(reinforcedMsk); // put table in its initial state
+
+    for (const auto &b2l : bucketToLits) {
+        const u32 &bucket_id = b2l.first;
+        const vector<LiteralIndex> &ids = b2l.second;
+        const u8 bmsk = 1U << (bucket_id % 8); // bucket's bit within a byte
+
+        for (const LiteralIndex &lit_id : ids) {
+            const hwlmLiteral &l = lits[lit_id];
+            DEBUG_PRINTF("putting lit %u into bucket %u\n", lit_id, bucket_id);
+            const u32 sz = verify_u32(l.s.size()); // NOTE(review): sz - 1 below wraps if sz == 0
+
+            // fill in reinforced masks for positions 1..REINFORCED_MSK_LEN-1
+            for (u32 j = 1; j < REINFORCED_MSK_LEN; j++) {
+                if (sz - 1 < j) { // literal has no byte j places before its end
+                    fillReinforcedMsk(reinforcedMsk, ALL_CHAR_SET, j, bmsk);
+                } else {
+                    u8 c = l.s[sz - 1 - j]; // byte j places before the last
+                    if (l.nocase && ourisalpha(c)) { // caseless: add both forms
+                        u8 c_up = c & 0xdf; // bit 0x20 cleared
+                        fillReinforcedMsk(reinforcedMsk, c_up, j, bmsk);
+                        u8 c_lo = c | 0x20; // bit 0x20 set
+                        fillReinforcedMsk(reinforcedMsk, c_lo, j, bmsk);
+                    } else {
+                        fillReinforcedMsk(reinforcedMsk, c, j, bmsk);
+                    }
+                }
+            }
+        }
+    }
+
+    fillReinforcedMskZero(reinforcedMsk); // NOTE(review): presumably fills the j == 0 entries skipped above - confirm
+}
+
+bytecode_ptr<FDR> TeddyCompiler::build() {
+    assert(eng.numMasks <= MAX_NUM_MASKS);
+
+    if (lits.size() > eng.getNumBuckets() * TEDDY_BUCKET_LOAD) {
+        DEBUG_PRINTF("too many literals: %zu\n", lits.size());
+        return nullptr;
+    }
+
+#ifdef TEDDY_DEBUG
+    for (size_t i = 0; i < lits.size(); i++) {
+        printf("lit %zu (len = %zu, %s) is ", i, lits[i].s.size(),
+               lits[i].nocase ? "caseless" : "caseful");
+        for (size_t j = 0; j < lits[i].s.size(); j++) {
+            printf("%02x", ((u32)lits[i].s[j])&0xff);
+        }
+        printf("\n");
+    }
+#endif
+
+    map<BucketIndex, std::vector<LiteralIndex>> bucketToLits;
+    if (!pack(bucketToLits)) {
+        DEBUG_PRINTF("more lits (%zu) than buckets (%u), can't pack.\n",
+                     lits.size(), eng.getNumBuckets());
+        return nullptr;
+    }
+    u32 maskWidth = eng.getNumBuckets() / 8;
+
+    size_t headerSize = sizeof(Teddy);
+    size_t maskLen = eng.numMasks * 16 * 2 * maskWidth;
+    size_t reinforcedMaskLen = (N_CHARS + 1) * REINFORCED_MSK_LEN;
+
+    auto floodTable = setupFDRFloodControl(lits, eng, grey);
+    auto confirmTable = setupFullConfs(lits, eng, bucketToLits, make_small);
+
+    // Note: we place each major structure here on a cacheline boundary.
+    size_t size = ROUNDUP_CL(headerSize) + ROUNDUP_CL(maskLen) +
+                  ROUNDUP_CL(reinforcedMaskLen) +
+                  ROUNDUP_CL(confirmTable.size()) + floodTable.size();
+
+    auto fdr = make_zeroed_bytecode_ptr<FDR>(size, 64);
+    assert(fdr); // otherwise would have thrown std::bad_alloc
+    Teddy *teddy = (Teddy *)fdr.get(); // ugly
+    u8 *teddy_base = (u8 *)teddy;
+
+    // Write header.
+    teddy->size = size;
+    teddy->engineID = eng.getID();
+    teddy->maxStringLen = verify_u32(maxLen(lits));
+
+    // Write confirm structures.
+    u8 *ptr = teddy_base + ROUNDUP_CL(headerSize) + ROUNDUP_CL(maskLen) +
+              ROUNDUP_CL(reinforcedMaskLen);
+    assert(ISALIGNED_CL(ptr));
+    teddy->confOffset = verify_u32(ptr - teddy_base);
+    memcpy(ptr, confirmTable.get(), confirmTable.size());
+    ptr += ROUNDUP_CL(confirmTable.size());
+
+    // Write flood control structures.
+    assert(ISALIGNED_CL(ptr));
+    teddy->floodOffset = verify_u32(ptr - teddy_base);
+    memcpy(ptr, floodTable.get(), floodTable.size());
+    ptr += floodTable.size();
+
+    // Write teddy masks.
+    u8 *baseMsk = teddy_base + ROUNDUP_CL(headerSize);
+    fillNibbleMasks(bucketToLits, lits, eng.numMasks, maskWidth, maskLen,
+                    baseMsk);
+
+    // Write reinforcement masks.
+    u8 *reinforcedMsk = baseMsk + ROUNDUP_CL(maskLen);
+    fillReinforcedTable(bucketToLits, lits, reinforcedMsk);
 
 #ifdef TEDDY_DEBUG
     for (u32 i = 0; i < eng.numMasks * 2; i++) {
@@ -452,6 +555,10 @@ bytecode_ptr<FDR> TeddyCompiler::build() {
         }
         printf("\n");
     }
+
+    printf("\n===============================================\n"
+           "reinforced mask table for low boundary (original)\n\n");
+    dumpReinforcedMaskTable(reinforcedMsk);
 #endif
 
     return fdr;
index f7559b13f15385b0154409446879bca570d3bda2..88ae0f53821aec8baf9af56b7f1d66cc11c89fbc 100644 (file)
@@ -51,18 +51,6 @@ u32 TeddyEngineDescription::getDefaultFloodSuffixLength() const {
     return numMasks;
 }
 
-bool TeddyEngineDescription::needConfirm(const vector<hwlmLiteral> &lits) const {
-    if (packed || lits.size() > getNumBuckets()) {
-        return true;
-    }
-    for (const auto &lit : lits) {
-        if (lit.s.size() > numMasks || !lit.msk.empty()) {
-            return true;
-        }
-    }
-    return false;
-}
-
 void getTeddyDescriptions(vector<TeddyEngineDescription> *out) {
     static const TeddyEngineDef defns[] = {
         { 3, 0 | HS_CPU_FEATURES_AVX2, 1, 16, false },
index 3979a5d3223a14dae34d2a3b91c3291bba562ff1..95931613840c44ed726c8c45e73a866597f32903 100644 (file)
@@ -55,7 +55,6 @@ public:
     explicit TeddyEngineDescription(const TeddyEngineDef &def);
 
     u32 getDefaultFloodSuffixLength() const override;
-    bool needConfirm(const std::vector<hwlmLiteral> &lits) const;
 };
 
 std::unique_ptr<TeddyEngineDescription>
index 359d1e135b1405f233872026b1ac57abb542212a..d175245244a311d6d33084f711722785f99e2a66 100644 (file)
  * POSSIBILITY OF SUCH DAMAGE.
  */
 
+/* Teddy bytecode layout:
+ * * |-----|
+ * * |     | struct Teddy
+ * * |-----|
+ * * |     | teddy masks
+ * * |     |
+ * * |-----|
+ * * |     | reinforcement mask table
+ * * |     |
+ * * |-----|
+ * * |     | confirm
+ * * |     |
+ * * |     |
+ * * |-----|
+ * * |     | flood control
+ * * |     |
+ * * |-----|
+ */
+
 #ifndef TEDDY_INTERNAL_H
 #define TEDDY_INTERNAL_H
 
index f63df7240648bc6dac41fe2ea2ea0f48e957d64b..c1333964b0cb127aa03f99cbd402388987842c11 100644 (file)
 #include "ue2common.h"
 #include "util/bitutils.h"
 #include "util/simd_utils.h"
+#include "util/uniform_ops.h"
 
 extern const u8 ALIGN_DIRECTIVE p_mask_arr[17][32];
+#if defined(__AVX2__)
+extern const u8 ALIGN_DIRECTIVE p_mask_arr256[33][64];
+#endif
 
 #ifdef ARCH_64_BIT
 #define TEDDY_CONF_TYPE u64a
@@ -110,8 +114,27 @@ void copyRuntBlock128(u8 *dst, const u8 *src, size_t len) {
 }
 
 // Note: p_mask is an output param that initialises a poison mask.
+//       *p_mask = loadu128(p_mask_arr[n] + 16 - m) means:
+//       m bytes of 0xff at the beginning, followed by n bytes of 0x00,
+//       with all remaining bytes set to 0xff.
+// ptr >= lo:
+//     no history.
+//     for end/short zone, ptr==lo and start_offset==0
+//     for start zone, see below
+//          lo         ptr                      hi           hi
+//          |----------|-------|----------------|............|
+//          start      0       start+offset     end(<=16)
+// p_mask              ffff..ff0000...........00ffff..........
+// ptr < lo:
+//     only start zone.
+//             history
+//          ptr        lo                       hi           hi
+//          |----------|-------|----------------|............|
+//          0          start   start+offset     end(<=16)
+// p_mask   ffff.....ffffff..ff0000...........00ffff..........
 static really_inline
-m128 vectoredLoad128(m128 *p_mask, const u8 *ptr, const u8 *lo, const u8 *hi,
+m128 vectoredLoad128(m128 *p_mask, const u8 *ptr, const size_t start_offset,
+                     const u8 *lo, const u8 *hi,
                      const u8 *buf_history, size_t len_history,
                      const u32 nMasks) {
     union {
@@ -123,27 +146,34 @@ m128 vectoredLoad128(m128 *p_mask, const u8 *ptr, const u8 *lo, const u8 *hi,
     uintptr_t copy_start;
     uintptr_t copy_len;
 
-    if (ptr >= lo) {
+    if (ptr >= lo) { // short/end/start zone
+        uintptr_t start = (uintptr_t)(ptr - lo);
         uintptr_t avail = (uintptr_t)(hi - ptr);
         if (avail >= 16) {
-            *p_mask = load128(p_mask_arr[16] + 16);
+            assert(start_offset - start <= 16);
+            *p_mask = loadu128(p_mask_arr[16 - start_offset + start]
+                               + 16 - start_offset + start);
             return loadu128(ptr);
         }
-        *p_mask = load128(p_mask_arr[avail] + 16);
+        assert(start_offset - start <= avail);
+        *p_mask = loadu128(p_mask_arr[avail - start_offset + start]
+                           + 16 - start_offset + start);
         copy_start = 0;
         copy_len = avail;
-    } else {
+    } else { // start zone
         uintptr_t need = MIN((uintptr_t)(lo - ptr),
                              MIN(len_history, nMasks - 1));
         uintptr_t start = (uintptr_t)(lo - ptr);
         uintptr_t i;
-        for (i = start - need; ptr + i < lo; i++) {
-            u.val8[i] = buf_history[len_history - (lo - (ptr + i))];
+        for (i = start - need; i < start; i++) {
+            u.val8[i] = buf_history[len_history - (start - i)];
         }
         uintptr_t end = MIN(16, (uintptr_t)(hi - ptr));
-        *p_mask = loadu128(p_mask_arr[end - start] + 16 - start);
-        copy_start = i;
-        copy_len = end - i;
+        assert(start + start_offset <= end);
+        *p_mask = loadu128(p_mask_arr[end - start - start_offset]
+                           + 16 - start - start_offset);
+        copy_start = start;
+        copy_len = end - start;
     }
 
     // Runt block from the buffer.
@@ -152,6 +182,135 @@ m128 vectoredLoad128(m128 *p_mask, const u8 *ptr, const u8 *lo, const u8 *hi,
     return u.val128;
 }
 
+#if defined(__AVX2__)
+/*
+ * \brief Copy a block of [0,31] bytes from src to dst without calling memcpy.
+ *
+ * This function is a workaround intended to stop some compilers from
+ * synthesizing a memcpy function call out of the copy of a small number of
+ * bytes that we do in vectoredLoad256. The default case asserts len < 32.
+ */
+static really_inline
+void copyRuntBlock256(u8 *dst, const u8 *src, size_t len) {
+    switch (len) { // one straight-line copy sequence per length class
+    case 0:
+        break; // nothing to copy
+    case 1:
+        *dst = *src;
+        break;
+    case 2:
+        unaligned_store_u16(dst, unaligned_load_u16(src));
+        break;
+    case 3:
+        unaligned_store_u16(dst, unaligned_load_u16(src));
+        dst[2] = src[2]; // third byte copied on its own
+        break;
+    case 4:
+        unaligned_store_u32(dst, unaligned_load_u32(src));
+        break;
+    case 5:
+    case 6:
+    case 7:
+        /* Perform copy with two overlapping 4-byte chunks: the last four
+         * bytes first, then the first four. */
+        unaligned_store_u32(dst + len - 4, unaligned_load_u32(src + len - 4));
+        unaligned_store_u32(dst, unaligned_load_u32(src));
+        break;
+    case 8:
+        unaligned_store_u64a(dst, unaligned_load_u64a(src));
+        break;
+    case 9:
+    case 10:
+    case 11:
+    case 12:
+    case 13:
+    case 14:
+    case 15:
+        /* Perform copy with two overlapping 8-byte chunks: the last eight
+         * bytes first, then the first eight. */
+        unaligned_store_u64a(dst + len - 8, unaligned_load_u64a(src + len - 8));
+        unaligned_store_u64a(dst, unaligned_load_u64a(src));
+        break;
+    case 16:
+        storeu128(dst, loadu128(src));
+        break;
+    default: // cases 0..16 are handled above, so this is len >= 17
+        /* Perform copy with two overlapping 16-byte chunks: the last
+         * sixteen bytes first, then the first sixteen. */
+        assert(len < 32);
+        storeu128(dst + len - 16, loadu128(src + len - 16));
+        storeu128(dst, loadu128(src));
+        break;
+    }
+}
+
+// Note: p_mask is an output param that initialises a poison mask.
+//       *p_mask = loadu256(p_mask_arr256[n] + 32 - m) means:
+//       m bytes of 0xff at the beginning, followed by n bytes of 0x00,
+//       with all remaining bytes set to 0xff.
+// ptr >= lo:
+//     no history.
+//     for end/short zone, ptr==lo and start_offset==0
+//     for start zone, see below
+//          lo         ptr                      hi           hi
+//          |----------|-------|----------------|............|
+//          start      0       start+offset     end(<=32)
+// p_mask              ffff..ff0000...........00ffff..........
+// ptr < lo:
+//     only start zone.
+//             history
+//          ptr        lo                       hi           hi
+//          |----------|-------|----------------|............|
+//          0          start   start+offset     end(<=32)
+// p_mask   ffff.....ffffff..ff0000...........00ffff..........
+static really_inline
+m256 vectoredLoad256(m256 *p_mask, const u8 *ptr, const size_t start_offset,
+                     const u8 *lo, const u8 *hi,
+                     const u8 *buf_history, size_t len_history,
+                     const u32 nMasks) {
+    union { // byte-addressable view of the 256-bit value being assembled
+        u8 val8[32];
+        m256 val256;
+    } u;
+    u.val256 = zeroes256(); // bytes not filled in below stay zero
+
+    uintptr_t copy_start; // first index of u.val8 taken from the buffer
+    uintptr_t copy_len; // number of buffer bytes copied by the runt copy
+
+    if (ptr >= lo) { // short/end/start zone
+        uintptr_t start = (uintptr_t)(ptr - lo); // how far ptr is past lo
+        uintptr_t avail = (uintptr_t)(hi - ptr); // bytes from ptr up to hi
+        if (avail >= 32) { // whole block is readable: load it directly
+            assert(start_offset - start <= 32);
+            *p_mask = loadu256(p_mask_arr256[32 - start_offset + start]
+                               + 32 - start_offset + start);
+            return loadu256(ptr);
+        }
+        assert(start_offset - start <= avail);
+        *p_mask = loadu256(p_mask_arr256[avail - start_offset + start]
+                           + 32 - start_offset + start);
+        copy_start = 0; // fewer than 32 bytes left: copy only [ptr, hi)
+        copy_len = avail;
+    } else { // start zone
+        uintptr_t need = MIN((uintptr_t)(lo - ptr),
+                             MIN(len_history, nMasks - 1)); // history bytes used
+        uintptr_t start = (uintptr_t)(lo - ptr); // index in u.val8 matching lo
+        uintptr_t i;
+        for (i = start - need; i < start; i++) { // fill from tail of history
+            u.val8[i] = buf_history[len_history - (start - i)];
+        }
+        uintptr_t end = MIN(32, (uintptr_t)(hi - ptr)); // block end, capped at hi
+        assert(start + start_offset <= end);
+        *p_mask = loadu256(p_mask_arr256[end - start - start_offset]
+                           + 32 - start - start_offset);
+        copy_start = start; // buffer bytes begin where lo falls in the block
+        copy_len = end - start;
+    }
+
+    // Runt block from the buffer.
+    copyRuntBlock256(&u.val8[copy_start], &ptr[copy_start], copy_len);
+
+    return u.val256;
+}
+#endif // __AVX2__
+
 static really_inline
 u64a getConfVal(const struct FDR_Runtime_Args *a, const u8 *ptr, u32 byte,
                 CautionReason reason) {
@@ -197,50 +356,14 @@ void do_confWithBit_teddy(TEDDY_CONF_TYPE *conf, u8 bucket, u8 offset,
 }
 
 static really_inline
-void do_confWithBit1_teddy(TEDDY_CONF_TYPE *conf, u8 bucket, u8 offset,
-                           const u32 *confBase, CautionReason reason,
-                           const struct FDR_Runtime_Args *a, const u8 *ptr,
-                           hwlmcb_rv_t *control, u32 *last_match) {
-    do {
-        u32 bit = TEDDY_FIND_AND_CLEAR_LSB(conf);
-        u32 byte = bit / bucket + offset;
-        u32 idx  = bit % bucket;
-        u32 cf = confBase[idx];
-        const struct FDRConfirm *fdrc = (const struct FDRConfirm *)
-                                        ((const u8 *)confBase + cf);
-        if (!(fdrc->groups & *control)) {
-            continue;
-        }
-        u64a confVal = getConfVal(a, ptr, byte, reason);
-        confWithBit1(fdrc, a, ptr - a->buf + byte, control, last_match,
-                     confVal);
-    } while (unlikely(*conf));
-}
-
-static really_inline
-void do_confWithBitMany_teddy(TEDDY_CONF_TYPE *conf, u8 bucket, u8 offset,
-                              const u32 *confBase, CautionReason reason,
-                              const struct FDR_Runtime_Args *a, const u8 *ptr,
-                              hwlmcb_rv_t *control, u32 *last_match) {
-    do {
-        u32 bit = TEDDY_FIND_AND_CLEAR_LSB(conf);
-        u32 byte = bit / bucket + offset;
-        u32 idx = bit % bucket;
-        u32 cf = confBase[idx];
-        const struct FDRConfirm *fdrc = (const struct FDRConfirm *)
-                                        ((const u8 *)confBase + cf);
-        if (!(fdrc->groups & *control)) {
-            continue;
-        }
-        u64a confVal = getConfVal(a, ptr, byte, reason);
-        confWithBitMany(fdrc, a, ptr - a->buf + byte, reason, control,
-                        last_match, confVal);
-    } while (unlikely(*conf));
+const m128 *getMaskBase(const struct Teddy *teddy) {
+    return (const m128 *)((const u8 *)teddy + ROUNDUP_CL(sizeof(struct Teddy)));
 }
 
 static really_inline
-const m128 *getMaskBase(const struct Teddy *teddy) {
-    return (const m128 *)((const u8 *)teddy + ROUNDUP_CL(sizeof(struct Teddy)));
+const u64a *getReinforcedMaskBase(const struct Teddy *teddy, u8 numMask) {
+    return (const u64a *)((const u8 *)getMaskBase(teddy)
+                          + ROUNDUP_CL(2 * numMask * sizeof(m128)));
 }
 
 static really_inline