git.ipfire.org Git - thirdparty/vectorscan.git/commitdiff
Teddy macros unrolling - initial PR to test in CI (#294)
author g. economou <8426352+isildur-g@users.noreply.github.com>
Wed, 26 Jun 2024 19:35:33 +0000 (22:35 +0300)
committer GitHub <noreply@github.com>
Wed, 26 Jun 2024 19:35:33 +0000 (22:35 +0300)
Major refactoring of teddy and teddy_avx2, unrolling macros to C++ templated functions

---------

Co-authored-by: G.E <gregory.economou@vectorcamp.gr>
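
For a sense of the refactoring direction, here is a minimal C++ sketch (hypothetical names, not code from this commit) of how an unrolled macro pair such as CONF_CHUNK_64/CONF_CHUNK_32 collapses into one function template: the chunk width becomes a type parameter, and every unrolled step becomes an ordinary, type-checked call instead of a textual expansion.

    // Sketch only: conf_fn and its signature are assumptions for illustration.
    #include <cstdint>
    #include <limits>

    template <typename Chunk, typename ConfFn>
    static inline void conf_chunk(Chunk chunk, std::uint8_t bucket,
                                  std::uint64_t offset, ConfFn &&conf_fn) {
        // An all-ones chunk means "no candidate match"; otherwise invert the
        // bits and hand the candidate positions to the confirm callback.
        if (chunk != std::numeric_limits<Chunk>::max()) {
            conf_fn(~chunk, bucket, offset);
        }
    }

One definition then serves both the u64a and u32 paths that previously needed two near-identical macro bodies.
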
CMakeLists.txt
src/fdr/fdr_confirm_runtime.h
src/fdr/flood_runtime.h
src/fdr/teddy.c [deleted file]
src/fdr/teddy.cpp [new file with mode: 0644]
src/fdr/teddy.h
src/fdr/teddy_avx2.c [deleted file]
src/fdr/teddy_fat.cpp [new file with mode: 0644]
src/fdr/teddy_runtime_common.h
src/util/arch/arm/simd_utils.h
src/util/arch/x86/simd_utils.h

index c6952f41b52dd677d5a8acdc115bbb4e3eec78cf..b1daee702ba2421ab138c6e60d509627dd80b3a4 100644
@@ -297,7 +297,7 @@ set (hs_exec_SRCS
     src/fdr/fdr_confirm_runtime.h
     src/fdr/flood_runtime.h
     src/fdr/fdr_loadval.h
-    src/fdr/teddy.c
+    src/fdr/teddy.cpp
     src/fdr/teddy.h
     src/fdr/teddy_internal.h
     src/fdr/teddy_runtime_common.h
@@ -441,9 +441,11 @@ set (hs_exec_SRCS
     src/util/supervector/arch/ppc64el/impl.cpp)
 endif()
 
+
 if (ARCH_IA32 OR ARCH_X86_64)
     set (hs_exec_avx2_SRCS
-        src/fdr/teddy_avx2.c
+        src/fdr/teddy.cpp
+        src/fdr/teddy_fat.cpp
         src/util/arch/x86/masked_move.c
         src/util/arch/x86/masked_move.h
     )
index 5a2164952c9295964cad10db9bc5fb3e3e528052..e2674ffccf814943760e299142ec3568b41fbeaa 100644
@@ -1,5 +1,6 @@
 /*
  * Copyright (c) 2015-2019, Intel Corporation
+ * Copyright (c) 2024, VectorCamp PC
  *
  * Redistribution and use in source and binary forms, with or without
  * modification, are permitted provided that the following conditions are met:
@@ -54,9 +55,14 @@ void confWithBit(const struct FDRConfirm *fdrc, const struct FDR_Runtime_Args *a
     if (likely(!start)) {
         return;
     }
-
+    // These __cplusplus guards are needed because this header is included in both fdr.c and teddy.cpp.
+#ifdef __cplusplus
+    const struct LitInfo *li
+        = reinterpret_cast<const struct LitInfo *>(reinterpret_cast<const u8 *>(fdrc) + start);
+#else
     const struct LitInfo *li
         = (const struct LitInfo *)((const u8 *)fdrc + start);
+#endif
 
     struct hs_scratch *scratch = a->scratch;
     assert(!scratch->fdr_conf);
@@ -74,18 +80,20 @@ void confWithBit(const struct FDRConfirm *fdrc, const struct FDR_Runtime_Args *a
             goto out;
         }
 
-        const u8 *loc = buf + i - li->size + 1;
+        do { // Scope 'loc' in its own block so 'goto out' cannot jump over its initialization (ill-formed in C++).
+            const u8 *loc = buf + i - li->size + 1;
+
+            if (loc < buf) {
+                u32 full_overhang = buf - loc;
+                size_t len_history = a->len_history;
 
-        if (loc < buf) {
-            u32 full_overhang = buf - loc;
-            size_t len_history = a->len_history;
-
-            // can't do a vectored confirm either if we don't have
-            // the bytes
-            if (full_overhang > len_history) {
-                goto out;
+                // can't do a vectored confirm either if we don't have
+                // the bytes
+                if (full_overhang > len_history) {
+                    goto out;
+                }
             }
-        }
+        } while (0);
         assert(li->size <= sizeof(CONF_TYPE));
 
         if (unlikely(!(li->groups & *control))) {
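
The #ifdef __cplusplus branches above are the recurring pattern of this commit: headers previously compiled only as C are now also included from teddy.cpp, and the C++ side spells the byte-offset pointer math with reinterpret_cast (a C-style cast would still compile, but reinterpret_cast is the idiomatic, warning-clean form in C++). A standalone sketch of the shared-header idiom, with a hypothetical macro name:

    /* Byte-offset pointer math that compiles as both C and C++. */
    #ifdef __cplusplus
    #define BYTE_OFFSET_CAST(T, base, off) \
        (reinterpret_cast<const T *>(reinterpret_cast<const unsigned char *>(base) + (off)))
    #else
    #define BYTE_OFFSET_CAST(T, base, off) \
        ((const T *)((const unsigned char *)(base) + (off)))
    #endif

With such a macro the LitInfo lookup above would read BYTE_OFFSET_CAST(struct LitInfo, fdrc, start) in both languages; the commit instead writes the two casts out inline here and introduces CU64A_P_CAST for the same purpose in flood_runtime.h below.
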
index 2d5a32d92add75475c5f794d0f2708cec6bb1f54..e7529c07855d2705be936356bb76d65134bf409c 100644
@@ -1,5 +1,6 @@
 /*
  * Copyright (c) 2015-2017, Intel Corporation
+ * Copyright (c) 2024, VectorCamp PC
  *
  * Redistribution and use in source and binary forms, with or without
  * modification, are permitted provided that the following conditions are met:
 #define FLOOD_MINIMUM_SIZE 256
 #define FLOOD_BACKOFF_START 32
 
+// Needed because this file is included in both fdr.c and teddy.cpp.
+#ifdef __cplusplus
+#define CU64A_P_CAST(X) reinterpret_cast<const u64a*>(X)
+#else
+#define CU64A_P_CAST(X) (const u64a *)(X)
+#endif
+
 static really_inline
 const u8 * nextFloodDetect(const u8 * buf, size_t len, u32 floodBackoff) {
     // if we don't have a flood at either the start or end,
@@ -47,18 +55,18 @@ const u8 * nextFloodDetect(const u8 * buf, size_t len, u32 floodBackoff) {
 
     /* entry points in runtime.c prefetch relevant data */
 #ifndef FLOOD_32
-    u64a x11 = *(const u64a *)ROUNDUP_PTR(buf, 8);
-    u64a x12 = *(const u64a *)ROUNDUP_PTR(buf+8, 8);
+    u64a x11 = *CU64A_P_CAST(ROUNDUP_PTR(buf, 8));
+    u64a x12 = *CU64A_P_CAST(ROUNDUP_PTR(buf+8, 8));
     if (x11 == x12) {
         return buf + floodBackoff;
     }
-    u64a x21 = *(const u64a *)ROUNDUP_PTR(buf + len/2, 8);
-    u64a x22 = *(const u64a *)ROUNDUP_PTR(buf + len/2 + 8, 8);
+    u64a x21 = *CU64A_P_CAST(ROUNDUP_PTR(buf + len/2, 8));
+    u64a x22 = *CU64A_P_CAST(ROUNDUP_PTR(buf + len/2 + 8, 8));
     if (x21 == x22) {
         return buf + floodBackoff;
     }
-    u64a x31 = *(const u64a *)ROUNDUP_PTR(buf + len - 24, 8);
-    u64a x32 = *(const u64a *)ROUNDUP_PTR(buf + len - 16, 8);
+    u64a x31 = *CU64A_P_CAST(ROUNDUP_PTR(buf + len - 24, 8));
+    u64a x32 = *CU64A_P_CAST(ROUNDUP_PTR(buf + len - 16, 8));
     if (x31 == x32) {
         return buf + floodBackoff;
     }
@@ -106,9 +114,15 @@ const u8 * floodDetect(const struct FDR * fdr,
 
     // go from c to our FDRFlood structure
     u8 c = buf[i];
+#ifdef __cplusplus
+    const u8 * fBase = (reinterpret_cast<const u8 *>(fdr)) + fdr->floodOffset;
+    u32 fIdx = (reinterpret_cast<const u32 *>(fBase))[c];
+    const struct FDRFlood * fsb = reinterpret_cast<const struct FDRFlood *>(fBase + sizeof(u32) * 256);
+#else
     const u8 * fBase = ((const u8 *)fdr) + fdr->floodOffset;
     u32 fIdx = ((const u32 *)fBase)[c];
     const struct FDRFlood * fsb = (const struct FDRFlood *)(fBase + sizeof(u32) * 256);
+#endif
     const struct FDRFlood * fl = &fsb[fIdx];
 
 #ifndef FLOOD_32
@@ -116,7 +130,7 @@ const u8 * floodDetect(const struct FDR * fdr,
     cmpVal |= cmpVal << 8;
     cmpVal |= cmpVal << 16;
     cmpVal |= cmpVal << 32;
-    u64a probe = *(const u64a *)ROUNDUP_PTR(buf+i, 8);
+    u64a probe = *CU64A_P_CAST(ROUNDUP_PTR(buf+i, 8));
 #else
     u32 cmpVal = c;
     cmpVal |= cmpVal << 8;
@@ -139,16 +153,16 @@ const u8 * floodDetect(const struct FDR * fdr,
 #ifndef FLOOD_32
     j -= (u32)((uintptr_t)buf + j) & 0x7; // push j back to yield 8-aligned addrs
     for (; j + 32 < mainLoopLen; j += 32) {
-        u64a v = *(const u64a *)(buf + j);
-        u64a v2 = *(const u64a *)(buf + j + 8);
-        u64a v3 = *(const u64a *)(buf + j + 16);
-        u64a v4 = *(const u64a *)(buf + j + 24);
+        u64a v = *CU64A_P_CAST(buf + j);
+        u64a v2 = *CU64A_P_CAST(buf + j + 8);
+        u64a v3 = *CU64A_P_CAST(buf + j + 16);
+        u64a v4 = *CU64A_P_CAST(buf + j + 24);
         if ((v4 != cmpVal) || (v3 != cmpVal) || (v2 != cmpVal) || (v != cmpVal)) {
             break;
         }
     }
     for (; j + 8 < mainLoopLen; j += 8) {
-        u64a v = *(const u64a *)(buf + j);
+        u64a v = *CU64A_P_CAST(buf + j);
         if (v != cmpVal) {
             break;
         }
@@ -172,7 +186,11 @@ const u8 * floodDetect(const struct FDR * fdr,
     }
 #endif
     for (; j < mainLoopLen; j++) {
+#ifdef __cplusplus
+        u8 v = *(reinterpret_cast<const u8 *>(buf + j));
+#else
         u8 v = *(const u8 *)(buf + j);
+#endif
         if (v != c) {
             break;
         }
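
As background for the probes above: nextFloodDetect samples pairs of adjacent 8-byte-aligned words at the start, middle, and end of the buffer, and if any pair compares equal it treats the region as a likely run of a single repeated byte (a "flood") and backs scanning off. A hedged standalone sketch of that heuristic (hypothetical name; memcpy stands in for the aligned u64a loads):

    #include <cstdint>
    #include <cstring>

    // Sketch only: true when two adjacent 64-bit words at p compare equal,
    // the same cheap "probably a flood" signal the probes above rely on.
    static bool looks_like_flood(const unsigned char *p) {
        std::uint64_t a, b;
        std::memcpy(&a, p, sizeof(a));      // first 8 bytes
        std::memcpy(&b, p + 8, sizeof(b));  // next 8 bytes
        return a == b;
    }
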
diff --git a/src/fdr/teddy.c b/src/fdr/teddy.c
deleted file mode 100644
index 65db3df..0000000
--- a/src/fdr/teddy.c
+++ /dev/null
@@ -1,1116 +0,0 @@
-/*
- * Copyright (c) 2015-2020, Intel Corporation
- *
- * Redistribution and use in source and binary forms, with or without
- * modification, are permitted provided that the following conditions are met:
- *
- *  * Redistributions of source code must retain the above copyright notice,
- *    this list of conditions and the following disclaimer.
- *  * Redistributions in binary form must reproduce the above copyright
- *    notice, this list of conditions and the following disclaimer in the
- *    documentation and/or other materials provided with the distribution.
- *  * Neither the name of Intel Corporation nor the names of its contributors
- *    may be used to endorse or promote products derived from this software
- *    without specific prior written permission.
- *
- * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
- * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
- * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
- * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
- * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
- * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
- * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
- * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
- * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
- * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
- * POSSIBILITY OF SUCH DAMAGE.
- */
-
-/** \file
- * \brief Teddy literal matcher: SSSE3 engine runtime.
- */
-
-#include "fdr_internal.h"
-#include "flood_runtime.h"
-#include "teddy.h"
-#include "teddy_internal.h"
-#include "teddy_runtime_common.h"
-#include "util/simd_utils.h"
-
-const u8 ALIGN_DIRECTIVE p_mask_arr[17][32] = {
-    {0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
-     0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff},
-    {0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
-     0x00, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff},
-    {0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
-     0x00, 0x00, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff},
-    {0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
-     0x00, 0x00, 0x00, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff},
-    {0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
-     0x00, 0x00, 0x00, 0x00, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff},
-    {0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
-     0x00, 0x00, 0x00, 0x00, 0x00, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff},
-    {0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
-     0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff},
-    {0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
-     0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff},
-    {0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
-     0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff},
-    {0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
-     0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff},
-    {0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
-     0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff},
-    {0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
-     0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xff, 0xff, 0xff, 0xff, 0xff},
-    {0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
-     0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xff, 0xff, 0xff, 0xff},
-    {0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
-     0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xff, 0xff, 0xff},
-    {0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
-     0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xff, 0xff},
-    {0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
-     0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xff},
-    {0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
-     0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00}
-};
-
-#if defined(HAVE_AVX512VBMI) // VBMI strong teddy
-
-#define CONF_CHUNK_64(chunk, bucket, off, reason, pt, conf_fn)              \
-do {                                                                        \
-    if (unlikely(chunk != ones_u64a)) {                                     \
-        chunk = ~chunk;                                                     \
-        conf_fn(&chunk, bucket, off, confBase, reason, a, pt,               \
-                &control, &last_match);                                     \
-        CHECK_HWLM_TERMINATE_MATCHING;                                      \
-    }                                                                       \
-} while(0)
-
-#define CONF_CHUNK_32(chunk, bucket, off, reason, pt, conf_fn)              \
-do {                                                                        \
-    if (unlikely(chunk != ones_u32)) {                                      \
-        chunk = ~chunk;                                                     \
-        conf_fn(&chunk, bucket, off, confBase, reason, a, pt,               \
-                &control, &last_match);                                     \
-        CHECK_HWLM_TERMINATE_MATCHING;                                      \
-    }                                                                       \
-} while(0)
-
-#else
-
-#define CONF_CHUNK_64(chunk, bucket, off, reason, conf_fn)                  \
-do {                                                                        \
-    if (unlikely(chunk != ones_u64a)) {                                     \
-        chunk = ~chunk;                                                     \
-        conf_fn(&chunk, bucket, off, confBase, reason, a, ptr,              \
-                &control, &last_match);                                     \
-        CHECK_HWLM_TERMINATE_MATCHING;                                      \
-    }                                                                       \
-} while(0)
-
-#define CONF_CHUNK_32(chunk, bucket, off, reason, conf_fn)                  \
-do {                                                                        \
-    if (unlikely(chunk != ones_u32)) {                                      \
-        chunk = ~chunk;                                                     \
-        conf_fn(&chunk, bucket, off, confBase, reason, a, ptr,              \
-                &control, &last_match);                                     \
-        CHECK_HWLM_TERMINATE_MATCHING;                                      \
-    }                                                                       \
-} while(0)
-
-#endif
-
-#if defined(HAVE_AVX512VBMI) // VBMI strong teddy
-
-#ifdef ARCH_64_BIT
-#define CONFIRM_TEDDY(var, bucket, offset, reason, pt, conf_fn)             \
-do {                                                                        \
-    if (unlikely(diff512(var, ones512()))) {                                \
-        m128 p128_0 = extract128from512(var, 0);                            \
-        m128 p128_1 = extract128from512(var, 1);                            \
-        m128 p128_2 = extract128from512(var, 2);                            \
-        m128 p128_3 = extract128from512(var, 3);                            \
-        u64a part1 = movq(p128_0);                                          \
-        u64a part2 = movq(rshiftbyte_m128(p128_0, 8));                      \
-        u64a part3 = movq(p128_1);                                          \
-        u64a part4 = movq(rshiftbyte_m128(p128_1, 8));                      \
-        u64a part5 = movq(p128_2);                                          \
-        u64a part6 = movq(rshiftbyte_m128(p128_2, 8));                      \
-        u64a part7 = movq(p128_3);                                          \
-        u64a part8 = movq(rshiftbyte_m128(p128_3, 8));                      \
-        CONF_CHUNK_64(part1, bucket, offset, reason, pt, conf_fn);          \
-        CONF_CHUNK_64(part2, bucket, offset + 8, reason, pt, conf_fn);      \
-        CONF_CHUNK_64(part3, bucket, offset + 16, reason, pt, conf_fn);     \
-        CONF_CHUNK_64(part4, bucket, offset + 24, reason, pt, conf_fn);     \
-        CONF_CHUNK_64(part5, bucket, offset + 32, reason, pt, conf_fn);     \
-        CONF_CHUNK_64(part6, bucket, offset + 40, reason, pt, conf_fn);     \
-        CONF_CHUNK_64(part7, bucket, offset + 48, reason, pt, conf_fn);     \
-        CONF_CHUNK_64(part8, bucket, offset + 56, reason, pt, conf_fn);     \
-    }                                                                       \
-} while(0)
-#else
-#define CONFIRM_TEDDY(var, bucket, offset, reason, pt, conf_fn)             \
-do {                                                                        \
-    if (unlikely(diff512(var, ones512()))) {                                \
-        m128 p128_0 = extract128from512(var, 0);                            \
-        m128 p128_1 = extract128from512(var, 1);                            \
-        m128 p128_2 = extract128from512(var, 2);                            \
-        m128 p128_3 = extract128from512(var, 3);                            \
-        u32 part1 = movd(p128_0);                                           \
-        u32 part2 = movd(rshiftbyte_m128(p128_0, 4));                       \
-        u32 part3 = movd(rshiftbyte_m128(p128_0, 8));                       \
-        u32 part4 = movd(rshiftbyte_m128(p128_0, 12));                      \
-        u32 part5 = movd(p128_1);                                           \
-        u32 part6 = movd(rshiftbyte_m128(p128_1, 4));                       \
-        u32 part7 = movd(rshiftbyte_m128(p128_1, 8));                       \
-        u32 part8 = movd(rshiftbyte_m128(p128_1, 12));                      \
-        u32 part9 = movd(p128_2);                                           \
-        u32 part10 = movd(rshiftbyte_m128(p128_2, 4));                      \
-        u32 part11 = movd(rshiftbyte_m128(p128_2, 8));                      \
-        u32 part12 = movd(rshiftbyte_m128(p128_2, 12));                     \
-        u32 part13 = movd(p128_3);                                          \
-        u32 part14 = movd(rshiftbyte_m128(p128_3, 4));                      \
-        u32 part15 = movd(rshiftbyte_m128(p128_3, 8));                      \
-        u32 part16 = movd(rshiftbyte_m128(p128_3, 12));                     \
-        CONF_CHUNK_32(part1, bucket, offset, reason, pt, conf_fn);          \
-        CONF_CHUNK_32(part2, bucket, offset + 4, reason, pt, conf_fn);      \
-        CONF_CHUNK_32(part3, bucket, offset + 8, reason, pt, conf_fn);      \
-        CONF_CHUNK_32(part4, bucket, offset + 12, reason, pt, conf_fn);     \
-        CONF_CHUNK_32(part5, bucket, offset + 16, reason, pt, conf_fn);     \
-        CONF_CHUNK_32(part6, bucket, offset + 20, reason, pt, conf_fn);     \
-        CONF_CHUNK_32(part7, bucket, offset + 24, reason, pt, conf_fn);     \
-        CONF_CHUNK_32(part8, bucket, offset + 28, reason, pt, conf_fn);     \
-        CONF_CHUNK_32(part9, bucket, offset + 32, reason, pt, conf_fn);     \
-        CONF_CHUNK_32(part10, bucket, offset + 36, reason, pt, conf_fn);    \
-        CONF_CHUNK_32(part11, bucket, offset + 40, reason, pt, conf_fn);    \
-        CONF_CHUNK_32(part12, bucket, offset + 44, reason, pt, conf_fn);    \
-        CONF_CHUNK_32(part13, bucket, offset + 48, reason, pt, conf_fn);    \
-        CONF_CHUNK_32(part14, bucket, offset + 52, reason, pt, conf_fn);    \
-        CONF_CHUNK_32(part15, bucket, offset + 56, reason, pt, conf_fn);    \
-        CONF_CHUNK_32(part16, bucket, offset + 60, reason, pt, conf_fn);    \
-    }                                                                       \
-} while(0)
-#endif
-
-#define PREP_SHUF_MASK                                                      \
-    m512 lo = and512(val, *lo_mask);                                        \
-    m512 hi = and512(rshift64_m512(val, 4), *lo_mask)
-
-#define TEDDY_VBMI_PSHUFB_OR_M1                              \
-    m512 shuf_or_b0 = or512(pshufb_m512(dup_mask[0], lo),    \
-                            pshufb_m512(dup_mask[1], hi));
-
-#define TEDDY_VBMI_PSHUFB_OR_M2                              \
-    TEDDY_VBMI_PSHUFB_OR_M1                                  \
-    m512 shuf_or_b1 = or512(pshufb_m512(dup_mask[2], lo),    \
-                            pshufb_m512(dup_mask[3], hi));
-
-#define TEDDY_VBMI_PSHUFB_OR_M3                              \
-    TEDDY_VBMI_PSHUFB_OR_M2                                  \
-    m512 shuf_or_b2 = or512(pshufb_m512(dup_mask[4], lo),    \
-                            pshufb_m512(dup_mask[5], hi));
-
-#define TEDDY_VBMI_PSHUFB_OR_M4                              \
-    TEDDY_VBMI_PSHUFB_OR_M3                                  \
-    m512 shuf_or_b3 = or512(pshufb_m512(dup_mask[6], lo),    \
-                            pshufb_m512(dup_mask[7], hi));
-
-#define TEDDY_VBMI_SL1_MASK   0xfffffffffffffffeULL
-#define TEDDY_VBMI_SL2_MASK   0xfffffffffffffffcULL
-#define TEDDY_VBMI_SL3_MASK   0xfffffffffffffff8ULL
-
-#define TEDDY_VBMI_SHIFT_M1
-
-#define TEDDY_VBMI_SHIFT_M2                      \
-    TEDDY_VBMI_SHIFT_M1                          \
-    m512 sl1 = maskz_vpermb512(TEDDY_VBMI_SL1_MASK, sl_msk[0], shuf_or_b1);
-
-#define TEDDY_VBMI_SHIFT_M3                      \
-    TEDDY_VBMI_SHIFT_M2                          \
-    m512 sl2 = maskz_vpermb512(TEDDY_VBMI_SL2_MASK, sl_msk[1], shuf_or_b2);
-
-#define TEDDY_VBMI_SHIFT_M4                      \
-    TEDDY_VBMI_SHIFT_M3                          \
-    m512 sl3 = maskz_vpermb512(TEDDY_VBMI_SL3_MASK, sl_msk[2], shuf_or_b3);
-
-#define SHIFT_OR_M1            \
-    shuf_or_b0
-
-#define SHIFT_OR_M2            \
-    or512(sl1, SHIFT_OR_M1)
-
-#define SHIFT_OR_M3            \
-    or512(sl2, SHIFT_OR_M2)
-
-#define SHIFT_OR_M4            \
-    or512(sl3, SHIFT_OR_M3)
-
-static really_inline
-m512 prep_conf_teddy_m1(const m512 *lo_mask, const m512 *dup_mask,
-                        UNUSED const m512 *sl_msk, const m512 val) {
-    PREP_SHUF_MASK;
-    TEDDY_VBMI_PSHUFB_OR_M1;
-    TEDDY_VBMI_SHIFT_M1;
-    return SHIFT_OR_M1;
-}
-
-static really_inline
-m512 prep_conf_teddy_m2(const m512 *lo_mask, const m512 *dup_mask,
-                        const m512 *sl_msk, const m512 val) {
-    PREP_SHUF_MASK;
-    TEDDY_VBMI_PSHUFB_OR_M2;
-    TEDDY_VBMI_SHIFT_M2;
-    return SHIFT_OR_M2;
-}
-
-static really_inline
-m512 prep_conf_teddy_m3(const m512 *lo_mask, const m512 *dup_mask,
-                        const m512 *sl_msk, const m512 val) {
-    PREP_SHUF_MASK;
-    TEDDY_VBMI_PSHUFB_OR_M3;
-    TEDDY_VBMI_SHIFT_M3;
-    return SHIFT_OR_M3;
-}
-
-static really_inline
-m512 prep_conf_teddy_m4(const m512 *lo_mask, const m512 *dup_mask,
-                        const m512 *sl_msk, const m512 val) {
-    PREP_SHUF_MASK;
-    TEDDY_VBMI_PSHUFB_OR_M4;
-    TEDDY_VBMI_SHIFT_M4;
-    return SHIFT_OR_M4;
-}
-
-#define PREP_CONF_FN(val, n)                                                  \
-    prep_conf_teddy_m##n(&lo_mask, dup_mask, sl_msk, val)
-
-#define TEDDY_VBMI_SL1_POS    15
-#define TEDDY_VBMI_SL2_POS    14
-#define TEDDY_VBMI_SL3_POS    13
-
-#define TEDDY_VBMI_LOAD_SHIFT_MASK_M1
-
-#define TEDDY_VBMI_LOAD_SHIFT_MASK_M2    \
-    TEDDY_VBMI_LOAD_SHIFT_MASK_M1        \
-    sl_msk[0] = loadu512(p_sh_mask_arr + TEDDY_VBMI_SL1_POS);
-
-#define TEDDY_VBMI_LOAD_SHIFT_MASK_M3    \
-    TEDDY_VBMI_LOAD_SHIFT_MASK_M2        \
-    sl_msk[1] = loadu512(p_sh_mask_arr + TEDDY_VBMI_SL2_POS);
-
-#define TEDDY_VBMI_LOAD_SHIFT_MASK_M4    \
-    TEDDY_VBMI_LOAD_SHIFT_MASK_M3        \
-    sl_msk[2] = loadu512(p_sh_mask_arr + TEDDY_VBMI_SL3_POS);
-
-#define PREPARE_MASKS_1                                                       \
-    dup_mask[0] = set1_4x128(maskBase[0]);                                      \
-    dup_mask[1] = set1_4x128(maskBase[1]);
-
-#define PREPARE_MASKS_2                                                       \
-    PREPARE_MASKS_1                                                           \
-    dup_mask[2] = set1_4x128(maskBase[2]);                                      \
-    dup_mask[3] = set1_4x128(maskBase[3]);
-
-#define PREPARE_MASKS_3                                                       \
-    PREPARE_MASKS_2                                                           \
-    dup_mask[4] = set1_4x128(maskBase[4]);                                      \
-    dup_mask[5] = set1_4x128(maskBase[5]);
-
-#define PREPARE_MASKS_4                                                       \
-    PREPARE_MASKS_3                                                           \
-    dup_mask[6] = set1_4x128(maskBase[6]);                                      \
-    dup_mask[7] = set1_4x128(maskBase[7]);
-
-#define PREPARE_MASKS(n)                                                      \
-    m512 lo_mask = set1_64x8(0xf);                                              \
-    m512 dup_mask[n * 2];                                                     \
-    m512 sl_msk[n - 1];                                                       \
-    PREPARE_MASKS_##n                                                         \
-    TEDDY_VBMI_LOAD_SHIFT_MASK_M##n
-
-#define TEDDY_VBMI_CONF_MASK_HEAD   (0xffffffffffffffffULL >> n_sh)
-#define TEDDY_VBMI_CONF_MASK_FULL   (0xffffffffffffffffULL << n_sh)
-#define TEDDY_VBMI_CONF_MASK_VAR(n) (0xffffffffffffffffULL >> (64 - n) << overlap)
-#define TEDDY_VBMI_LOAD_MASK_PATCH  (0xffffffffffffffffULL >> (64 - n_sh))
-
-#define FDR_EXEC_TEDDY(fdr, a, control, n_msk, conf_fn)                       \
-do {                                                                          \
-    const u8 *buf_end = a->buf + a->len;                                      \
-    const u8 *ptr = a->buf + a->start_offset;                                 \
-    u32 floodBackoff = FLOOD_BACKOFF_START;                                   \
-    const u8 *tryFloodDetect = a->firstFloodDetect;                           \
-    u32 last_match = ones_u32;                                                \
-    const struct Teddy *teddy = (const struct Teddy *)fdr;                    \
-    const size_t iterBytes = 64;                                              \
-    u32 n_sh = n_msk - 1;                                                     \
-    const size_t loopBytes = 64 - n_sh;                                       \
-    DEBUG_PRINTF("params: buf %p len %zu start_offset %zu\n",                 \
-                 a->buf, a->len, a->start_offset);                            \
-                                                                              \
-    const m128 *maskBase = getMaskBase(teddy);                                \
-    PREPARE_MASKS(n_msk);                                                     \
-    const u32 *confBase = getConfBase(teddy);                                 \
-                                                                              \
-    u64a k = TEDDY_VBMI_CONF_MASK_FULL;                                       \
-    m512 p_mask = set_mask_m512(~k);                                          \
-    u32 overlap = 0;                                                          \
-    u64a patch = 0;                                                           \
-    if (likely(ptr + loopBytes <= buf_end)) {                                 \
-        m512 p_mask0 = set_mask_m512(~TEDDY_VBMI_CONF_MASK_HEAD);             \
-        m512 r_0 = PREP_CONF_FN(loadu512(ptr), n_msk);                        \
-        r_0 = or512(r_0, p_mask0);                                            \
-        CONFIRM_TEDDY(r_0, 8, 0, VECTORING, ptr, conf_fn);                    \
-        ptr += loopBytes;                                                     \
-        overlap = n_sh;                                                       \
-        patch = TEDDY_VBMI_LOAD_MASK_PATCH;                                   \
-    }                                                                         \
-                                                                              \
-    for (; ptr + loopBytes <= buf_end; ptr += loopBytes) {                    \
-        __builtin_prefetch(ptr - n_sh + (64 * 2));                            \
-        CHECK_FLOOD;                                                          \
-        m512 r_0 = PREP_CONF_FN(loadu512(ptr - n_sh), n_msk);                 \
-        r_0 = or512(r_0, p_mask);                                             \
-        CONFIRM_TEDDY(r_0, 8, 0, NOT_CAUTIOUS, ptr - n_sh, conf_fn);          \
-    }                                                                         \
-                                                                              \
-    assert(ptr + loopBytes > buf_end);                                        \
-    if (ptr < buf_end) {                                                      \
-        u32 left = (u32)(buf_end - ptr);                                      \
-        u64a k1 = TEDDY_VBMI_CONF_MASK_VAR(left);                             \
-        m512 p_mask1 = set_mask_m512(~k1);                                    \
-        m512 val_0 = loadu_maskz_m512(k1 | patch, ptr - overlap);             \
-        m512 r_0 = PREP_CONF_FN(val_0, n_msk);                                \
-        r_0 = or512(r_0, p_mask1);                                            \
-        CONFIRM_TEDDY(r_0, 8, 0, VECTORING, ptr - overlap, conf_fn);          \
-    }                                                                         \
-                                                                              \
-    return HWLM_SUCCESS;                                                      \
-} while(0)
-
-#elif defined(HAVE_AVX512) // AVX512 reinforced teddy
-
-#ifdef ARCH_64_BIT
-#define CONFIRM_TEDDY(var, bucket, offset, reason, conf_fn)                 \
-do {                                                                        \
-    if (unlikely(diff512(var, ones512()))) {                                \
-        m128 p128_0 = extract128from512(var, 0);                            \
-        m128 p128_1 = extract128from512(var, 1);                            \
-        m128 p128_2 = extract128from512(var, 2);                            \
-        m128 p128_3 = extract128from512(var, 3);                            \
-        u64a part1 = movq(p128_0);                                          \
-        u64a part2 = movq(rshiftbyte_m128(p128_0, 8));                      \
-        u64a part3 = movq(p128_1);                                          \
-        u64a part4 = movq(rshiftbyte_m128(p128_1, 8));                      \
-        u64a part5 = movq(p128_2);                                          \
-        u64a part6 = movq(rshiftbyte_m128(p128_2, 8));                      \
-        u64a part7 = movq(p128_3);                                          \
-        u64a part8 = movq(rshiftbyte_m128(p128_3, 8));                      \
-        CONF_CHUNK_64(part1, bucket, offset, reason, conf_fn);              \
-        CONF_CHUNK_64(part2, bucket, offset + 8, reason, conf_fn);          \
-        CONF_CHUNK_64(part3, bucket, offset + 16, reason, conf_fn);         \
-        CONF_CHUNK_64(part4, bucket, offset + 24, reason, conf_fn);         \
-        CONF_CHUNK_64(part5, bucket, offset + 32, reason, conf_fn);         \
-        CONF_CHUNK_64(part6, bucket, offset + 40, reason, conf_fn);         \
-        CONF_CHUNK_64(part7, bucket, offset + 48, reason, conf_fn);         \
-        CONF_CHUNK_64(part8, bucket, offset + 56, reason, conf_fn);         \
-    }                                                                       \
-} while(0)
-#else
-#define CONFIRM_TEDDY(var, bucket, offset, reason, conf_fn)                 \
-do {                                                                        \
-    if (unlikely(diff512(var, ones512()))) {                                \
-        m128 p128_0 = extract128from512(var, 0);                            \
-        m128 p128_1 = extract128from512(var, 1);                            \
-        m128 p128_2 = extract128from512(var, 2);                            \
-        m128 p128_3 = extract128from512(var, 3);                            \
-        u32 part1 = movd(p128_0);                                           \
-        u32 part2 = movd(rshiftbyte_m128(p128_0, 4));                       \
-        u32 part3 = movd(rshiftbyte_m128(p128_0, 8));                       \
-        u32 part4 = movd(rshiftbyte_m128(p128_0, 12));                      \
-        u32 part5 = movd(p128_1);                                           \
-        u32 part6 = movd(rshiftbyte_m128(p128_1, 4));                       \
-        u32 part7 = movd(rshiftbyte_m128(p128_1, 8));                       \
-        u32 part8 = movd(rshiftbyte_m128(p128_1, 12));                      \
-        u32 part9 = movd(p128_2);                                           \
-        u32 part10 = movd(rshiftbyte_m128(p128_2, 4));                      \
-        u32 part11 = movd(rshiftbyte_m128(p128_2, 8));                      \
-        u32 part12 = movd(rshiftbyte_m128(p128_2, 12));                     \
-        u32 part13 = movd(p128_3);                                          \
-        u32 part14 = movd(rshiftbyte_m128(p128_3, 4));                      \
-        u32 part15 = movd(rshiftbyte_m128(p128_3, 8));                      \
-        u32 part16 = movd(rshiftbyte_m128(p128_3, 12));                     \
-        CONF_CHUNK_32(part1, bucket, offset, reason, conf_fn);              \
-        CONF_CHUNK_32(part2, bucket, offset + 4, reason, conf_fn);          \
-        CONF_CHUNK_32(part3, bucket, offset + 8, reason, conf_fn);          \
-        CONF_CHUNK_32(part4, bucket, offset + 12, reason, conf_fn);         \
-        CONF_CHUNK_32(part5, bucket, offset + 16, reason, conf_fn);         \
-        CONF_CHUNK_32(part6, bucket, offset + 20, reason, conf_fn);         \
-        CONF_CHUNK_32(part7, bucket, offset + 24, reason, conf_fn);         \
-        CONF_CHUNK_32(part8, bucket, offset + 28, reason, conf_fn);         \
-        CONF_CHUNK_32(part9, bucket, offset + 32, reason, conf_fn);         \
-        CONF_CHUNK_32(part10, bucket, offset + 36, reason, conf_fn);        \
-        CONF_CHUNK_32(part11, bucket, offset + 40, reason, conf_fn);        \
-        CONF_CHUNK_32(part12, bucket, offset + 44, reason, conf_fn);        \
-        CONF_CHUNK_32(part13, bucket, offset + 48, reason, conf_fn);        \
-        CONF_CHUNK_32(part14, bucket, offset + 52, reason, conf_fn);        \
-        CONF_CHUNK_32(part15, bucket, offset + 56, reason, conf_fn);        \
-        CONF_CHUNK_32(part16, bucket, offset + 60, reason, conf_fn);        \
-    }                                                                       \
-} while(0)
-#endif
-
-#define PREP_SHUF_MASK_NO_REINFORCEMENT(val)                                \
-    m512 lo = and512(val, *lo_mask);                                        \
-    m512 hi = and512(rshift64_m512(val, 4), *lo_mask)
-
-#define PREP_SHUF_MASK                                                      \
-    PREP_SHUF_MASK_NO_REINFORCEMENT(load512(ptr));                          \
-    *c_16 = *(ptr + 15);                                                    \
-    *c_32 = *(ptr + 31);                                                    \
-    *c_48 = *(ptr + 47);                                                    \
-    m512 r_msk = set8x64(0ULL, r_msk_base[*c_48], 0ULL, r_msk_base[*c_32],\
-                           0ULL, r_msk_base[*c_16], 0ULL, r_msk_base[*c_0]);\
-    *c_0 = *(ptr + 63)
-
-#define SHIFT_OR_M1                                                         \
-    or512(pshufb_m512(dup_mask[0], lo), pshufb_m512(dup_mask[1], hi))
-
-#define SHIFT_OR_M2                                                         \
-    or512(lshift128_m512(or512(pshufb_m512(dup_mask[2], lo),                \
-                               pshufb_m512(dup_mask[3], hi)),               \
-                         1), SHIFT_OR_M1)
-
-#define SHIFT_OR_M3                                                         \
-    or512(lshift128_m512(or512(pshufb_m512(dup_mask[4], lo),                \
-                               pshufb_m512(dup_mask[5], hi)),               \
-                         2), SHIFT_OR_M2)
-
-#define SHIFT_OR_M4                                                         \
-    or512(lshift128_m512(or512(pshufb_m512(dup_mask[6], lo),                \
-                               pshufb_m512(dup_mask[7], hi)),               \
-                         3), SHIFT_OR_M3)
-
-static really_inline
-m512 prep_conf_teddy_no_reinforcement_m1(const m512 *lo_mask,
-                                         const m512 *dup_mask,
-                                         const m512 val) {
-    PREP_SHUF_MASK_NO_REINFORCEMENT(val);
-    return SHIFT_OR_M1;
-}
-
-static really_inline
-m512 prep_conf_teddy_no_reinforcement_m2(const m512 *lo_mask,
-                                         const m512 *dup_mask,
-                                         const m512 val) {
-    PREP_SHUF_MASK_NO_REINFORCEMENT(val);
-    return SHIFT_OR_M2;
-}
-
-static really_inline
-m512 prep_conf_teddy_no_reinforcement_m3(const m512 *lo_mask,
-                                         const m512 *dup_mask,
-                                         const m512 val) {
-    PREP_SHUF_MASK_NO_REINFORCEMENT(val);
-    return SHIFT_OR_M3;
-}
-
-static really_inline
-m512 prep_conf_teddy_no_reinforcement_m4(const m512 *lo_mask,
-                                         const m512 *dup_mask,
-                                         const m512 val) {
-    PREP_SHUF_MASK_NO_REINFORCEMENT(val);
-    return SHIFT_OR_M4;
-}
-
-static really_inline
-m512 prep_conf_teddy_m1(const m512 *lo_mask, const m512 *dup_mask,
-                        const u8 *ptr, const u64a *r_msk_base,
-                        u32 *c_0, u32 *c_16, u32 *c_32, u32 *c_48) {
-    PREP_SHUF_MASK;
-    return or512(SHIFT_OR_M1, r_msk);
-}
-
-static really_inline
-m512 prep_conf_teddy_m2(const m512 *lo_mask, const m512 *dup_mask,
-                        const u8 *ptr, const u64a *r_msk_base,
-                        u32 *c_0, u32 *c_16, u32 *c_32, u32 *c_48) {
-    PREP_SHUF_MASK;
-    return or512(SHIFT_OR_M2, r_msk);
-}
-
-static really_inline
-m512 prep_conf_teddy_m3(const m512 *lo_mask, const m512 *dup_mask,
-                        const u8 *ptr, const u64a *r_msk_base,
-                        u32 *c_0, u32 *c_16, u32 *c_32, u32 *c_48) {
-    PREP_SHUF_MASK;
-    return or512(SHIFT_OR_M3, r_msk);
-}
-
-static really_inline
-m512 prep_conf_teddy_m4(const m512 *lo_mask, const m512 *dup_mask,
-                        const u8 *ptr, const u64a *r_msk_base,
-                        u32 *c_0, u32 *c_16, u32 *c_32, u32 *c_48) {
-    PREP_SHUF_MASK;
-    return or512(SHIFT_OR_M4, r_msk);
-}
-
-#define PREP_CONF_FN_NO_REINFORCEMENT(val, n)                                 \
-    prep_conf_teddy_no_reinforcement_m##n(&lo_mask, dup_mask, val)
-
-#define PREP_CONF_FN(ptr, n)                                                  \
-    prep_conf_teddy_m##n(&lo_mask, dup_mask, ptr, r_msk_base,                 \
-                         &c_0, &c_16, &c_32, &c_48)
-
-#define PREPARE_MASKS_1                                                       \
-    dup_mask[0] = set1_4x128(maskBase[0]);                                      \
-    dup_mask[1] = set1_4x128(maskBase[1]);
-
-#define PREPARE_MASKS_2                                                       \
-    PREPARE_MASKS_1                                                           \
-    dup_mask[2] = set1_4x128(maskBase[2]);                                      \
-    dup_mask[3] = set1_4x128(maskBase[3]);
-
-#define PREPARE_MASKS_3                                                       \
-    PREPARE_MASKS_2                                                           \
-    dup_mask[4] = set1_4x128(maskBase[4]);                                      \
-    dup_mask[5] = set1_4x128(maskBase[5]);
-
-#define PREPARE_MASKS_4                                                       \
-    PREPARE_MASKS_3                                                           \
-    dup_mask[6] = set1_4x128(maskBase[6]);                                      \
-    dup_mask[7] = set1_4x128(maskBase[7]);
-
-#define PREPARE_MASKS(n)                                                      \
-    m512 lo_mask = set1_64x8(0xf);                                              \
-    m512 dup_mask[n * 2];                                                     \
-    PREPARE_MASKS_##n
-
-#define FDR_EXEC_TEDDY(fdr, a, control, n_msk, conf_fn)                       \
-do {                                                                          \
-    const u8 *buf_end = a->buf + a->len;                                      \
-    const u8 *ptr = a->buf + a->start_offset;                                 \
-    u32 floodBackoff = FLOOD_BACKOFF_START;                                   \
-    const u8 *tryFloodDetect = a->firstFloodDetect;                           \
-    u32 last_match = ones_u32;                                                \
-    const struct Teddy *teddy = (const struct Teddy *)fdr;                    \
-    const size_t iterBytes = 128;                                             \
-    DEBUG_PRINTF("params: buf %p len %zu start_offset %zu\n",                 \
-                 a->buf, a->len, a->start_offset);                            \
-                                                                              \
-    const m128 *maskBase = getMaskBase(teddy);                                \
-    PREPARE_MASKS(n_msk);                                                     \
-    const u32 *confBase = getConfBase(teddy);                                 \
-                                                                              \
-    const u64a *r_msk_base = getReinforcedMaskBase(teddy, n_msk);             \
-    u32 c_0 = 0x100;                                                          \
-    u32 c_16 = 0x100;                                                         \
-    u32 c_32 = 0x100;                                                         \
-    u32 c_48 = 0x100;                                                         \
-    const u8 *mainStart = ROUNDUP_PTR(ptr, 64);                               \
-    DEBUG_PRINTF("derive: ptr: %p mainstart %p\n", ptr, mainStart);           \
-    if (ptr < mainStart) {                                                    \
-        ptr = mainStart - 64;                                                 \
-        m512 p_mask;                                                          \
-        m512 val_0 = vectoredLoad512(&p_mask, ptr, a->start_offset,           \
-                                     a->buf, buf_end,                         \
-                                     a->buf_history, a->len_history, n_msk);  \
-        m512 r_0 = PREP_CONF_FN_NO_REINFORCEMENT(val_0, n_msk);               \
-        r_0 = or512(r_0, p_mask);                                             \
-        CONFIRM_TEDDY(r_0, 8, 0, VECTORING, conf_fn);                         \
-        ptr += 64;                                                            \
-    }                                                                         \
-                                                                              \
-    if (ptr + 64 <= buf_end) {                                                \
-        m512 r_0 = PREP_CONF_FN(ptr, n_msk);                                  \
-        CONFIRM_TEDDY(r_0, 8, 0, VECTORING, conf_fn);                         \
-        ptr += 64;                                                            \
-    }                                                                         \
-                                                                              \
-    for (; ptr + iterBytes <= buf_end; ptr += iterBytes) {                    \
-        __builtin_prefetch(ptr + (iterBytes * 4));                            \
-        CHECK_FLOOD;                                                          \
-        m512 r_0 = PREP_CONF_FN(ptr, n_msk);                                  \
-        CONFIRM_TEDDY(r_0, 8, 0, NOT_CAUTIOUS, conf_fn);                      \
-        m512 r_1 = PREP_CONF_FN(ptr + 64, n_msk);                             \
-        CONFIRM_TEDDY(r_1, 8, 64, NOT_CAUTIOUS, conf_fn);                     \
-    }                                                                         \
-                                                                              \
-    if (ptr + 64 <= buf_end) {                                                \
-        m512 r_0 = PREP_CONF_FN(ptr, n_msk);                                  \
-        CONFIRM_TEDDY(r_0, 8, 0, NOT_CAUTIOUS, conf_fn);                      \
-        ptr += 64;                                                            \
-    }                                                                         \
-                                                                              \
-    assert(ptr + 64 > buf_end);                                               \
-    if (ptr < buf_end) {                                                      \
-        m512 p_mask;                                                          \
-        m512 val_0 = vectoredLoad512(&p_mask, ptr, 0, ptr, buf_end,           \
-                                     a->buf_history, a->len_history, n_msk);  \
-        m512 r_0 = PREP_CONF_FN_NO_REINFORCEMENT(val_0, n_msk);               \
-        r_0 = or512(r_0, p_mask);                                             \
-        CONFIRM_TEDDY(r_0, 8, 0, VECTORING, conf_fn);                         \
-    }                                                                         \
-                                                                              \
-    return HWLM_SUCCESS;                                                      \
-} while(0)
-
-#elif defined(HAVE_AVX2) // not HAVE_AVX512 but HAVE_AVX2 reinforced teddy
-
-#ifdef ARCH_64_BIT
-#define CONFIRM_TEDDY(var, bucket, offset, reason, conf_fn)                 \
-do {                                                                        \
-    if (unlikely(diff256(var, ones256()))) {                                \
-        m128 lo = movdq_lo(var);                                            \
-        m128 hi = movdq_hi(var);                                            \
-        u64a part1 = movq(lo);                                              \
-        u64a part2 = movq(rshiftbyte_m128(lo, 8));                          \
-        u64a part3 = movq(hi);                                              \
-        u64a part4 = movq(rshiftbyte_m128(hi, 8));                          \
-        CONF_CHUNK_64(part1, bucket, offset, reason, conf_fn);              \
-        CONF_CHUNK_64(part2, bucket, offset + 8, reason, conf_fn);          \
-        CONF_CHUNK_64(part3, bucket, offset + 16, reason, conf_fn);         \
-        CONF_CHUNK_64(part4, bucket, offset + 24, reason, conf_fn);         \
-    }                                                                       \
-} while(0)
-#else
-#define CONFIRM_TEDDY(var, bucket, offset, reason, conf_fn)                 \
-do {                                                                        \
-    if (unlikely(diff256(var, ones256()))) {                                \
-        m128 lo = movdq_lo(var);                                            \
-        m128 hi = movdq_hi(var);                                            \
-        u32 part1 = movd(lo);                                               \
-        u32 part2 = movd(rshiftbyte_m128(lo, 4));                           \
-        u32 part3 = movd(rshiftbyte_m128(lo, 8));                           \
-        u32 part4 = movd(rshiftbyte_m128(lo, 12));                          \
-        u32 part5 = movd(hi);                                               \
-        u32 part6 = movd(rshiftbyte_m128(hi, 4));                           \
-        u32 part7 = movd(rshiftbyte_m128(hi, 8));                           \
-        u32 part8 = movd(rshiftbyte_m128(hi, 12));                          \
-        CONF_CHUNK_32(part1, bucket, offset, reason, conf_fn);              \
-        CONF_CHUNK_32(part2, bucket, offset + 4, reason, conf_fn);          \
-        CONF_CHUNK_32(part3, bucket, offset + 8, reason, conf_fn);          \
-        CONF_CHUNK_32(part4, bucket, offset + 12, reason, conf_fn);         \
-        CONF_CHUNK_32(part5, bucket, offset + 16, reason, conf_fn);         \
-        CONF_CHUNK_32(part6, bucket, offset + 20, reason, conf_fn);         \
-        CONF_CHUNK_32(part7, bucket, offset + 24, reason, conf_fn);         \
-        CONF_CHUNK_32(part8, bucket, offset + 28, reason, conf_fn);         \
-    }                                                                       \
-} while(0)
-#endif
-
-#define PREP_SHUF_MASK_NO_REINFORCEMENT(val)                                \
-    m256 lo = and256(val, *lo_mask);                                        \
-    m256 hi = and256(rshift64_m256(val, 4), *lo_mask)
-
-#define PREP_SHUF_MASK                                                      \
-    PREP_SHUF_MASK_NO_REINFORCEMENT(load256(ptr));                          \
-    *c_128 = *(ptr + 15);                                                   \
-    m256 r_msk = set4x64(0ULL, r_msk_base[*c_128], 0ULL, r_msk_base[*c_0]); \
-    *c_0 = *(ptr + 31)
-
-#define SHIFT_OR_M1                                                         \
-    or256(pshufb_m256(dup_mask[0], lo), pshufb_m256(dup_mask[1], hi))
-
-#define SHIFT_OR_M2                                                         \
-    or256(lshift128_m256(or256(pshufb_m256(dup_mask[2], lo),                \
-                               pshufb_m256(dup_mask[3], hi)),               \
-                         1), SHIFT_OR_M1)
-
-#define SHIFT_OR_M3                                                         \
-    or256(lshift128_m256(or256(pshufb_m256(dup_mask[4], lo),                \
-                               pshufb_m256(dup_mask[5], hi)),               \
-                         2), SHIFT_OR_M2)
-
-#define SHIFT_OR_M4                                                         \
-    or256(lshift128_m256(or256(pshufb_m256(dup_mask[6], lo),                \
-                               pshufb_m256(dup_mask[7], hi)),               \
-                         3), SHIFT_OR_M3)
-
-static really_inline
-m256 prep_conf_teddy_no_reinforcement_m1(const m256 *lo_mask,
-                                         const m256 *dup_mask,
-                                         const m256 val) {
-    PREP_SHUF_MASK_NO_REINFORCEMENT(val);
-    return SHIFT_OR_M1;
-}
-
-static really_inline
-m256 prep_conf_teddy_no_reinforcement_m2(const m256 *lo_mask,
-                                         const m256 *dup_mask,
-                                         const m256 val) {
-    PREP_SHUF_MASK_NO_REINFORCEMENT(val);
-    return SHIFT_OR_M2;
-}
-
-static really_inline
-m256 prep_conf_teddy_no_reinforcement_m3(const m256 *lo_mask,
-                                         const m256 *dup_mask,
-                                         const m256 val) {
-    PREP_SHUF_MASK_NO_REINFORCEMENT(val);
-    return SHIFT_OR_M3;
-}
-
-static really_inline
-m256 prep_conf_teddy_no_reinforcement_m4(const m256 *lo_mask,
-                                         const m256 *dup_mask,
-                                         const m256 val) {
-    PREP_SHUF_MASK_NO_REINFORCEMENT(val);
-    return SHIFT_OR_M4;
-}
-
-static really_inline
-m256 prep_conf_teddy_m1(const m256 *lo_mask, const m256 *dup_mask,
-                        const u8 *ptr, const u64a *r_msk_base,
-                        u32 *c_0, u32 *c_128) {
-    PREP_SHUF_MASK;
-    return or256(SHIFT_OR_M1, r_msk);
-}
-
-static really_inline
-m256 prep_conf_teddy_m2(const m256 *lo_mask, const m256 *dup_mask,
-                        const u8 *ptr, const u64a *r_msk_base,
-                        u32 *c_0, u32 *c_128) {
-    PREP_SHUF_MASK;
-    return or256(SHIFT_OR_M2, r_msk);
-}
-
-static really_inline
-m256 prep_conf_teddy_m3(const m256 *lo_mask, const m256 *dup_mask,
-                        const u8 *ptr, const u64a *r_msk_base,
-                        u32 *c_0, u32 *c_128) {
-    PREP_SHUF_MASK;
-    return or256(SHIFT_OR_M3, r_msk);
-}
-
-static really_inline
-m256 prep_conf_teddy_m4(const m256 *lo_mask, const m256 *dup_mask,
-                        const u8 *ptr, const u64a *r_msk_base,
-                        u32 *c_0, u32 *c_128) {
-    PREP_SHUF_MASK;
-    return or256(SHIFT_OR_M4, r_msk);
-}
-
-#define PREP_CONF_FN_NO_REINFORCEMENT(val, n)                                 \
-    prep_conf_teddy_no_reinforcement_m##n(&lo_mask, dup_mask, val)
-
-#define PREP_CONF_FN(ptr, n)                                                  \
-    prep_conf_teddy_m##n(&lo_mask, dup_mask, ptr, r_msk_base, &c_0, &c_128)
-
-#define PREPARE_MASKS_1                                                       \
-    dup_mask[0] = set1_2x128(maskBase[0]);                                      \
-    dup_mask[1] = set1_2x128(maskBase[1]);
-
-#define PREPARE_MASKS_2                                                       \
-    PREPARE_MASKS_1                                                           \
-    dup_mask[2] = set1_2x128(maskBase[2]);                                      \
-    dup_mask[3] = set1_2x128(maskBase[3]);
-
-#define PREPARE_MASKS_3                                                       \
-    PREPARE_MASKS_2                                                           \
-    dup_mask[4] = set1_2x128(maskBase[4]);                                      \
-    dup_mask[5] = set1_2x128(maskBase[5]);
-
-#define PREPARE_MASKS_4                                                       \
-    PREPARE_MASKS_3                                                           \
-    dup_mask[6] = set1_2x128(maskBase[6]);                                      \
-    dup_mask[7] = set1_2x128(maskBase[7]);
-
-#define PREPARE_MASKS(n)                                                      \
-    m256 lo_mask = set1_32x8(0xf);                                              \
-    m256 dup_mask[n * 2];                                                     \
-    PREPARE_MASKS_##n
-
-#define FDR_EXEC_TEDDY(fdr, a, control, n_msk, conf_fn)                       \
-do {                                                                          \
-    const u8 *buf_end = a->buf + a->len;                                      \
-    const u8 *ptr = a->buf + a->start_offset;                                 \
-    u32 floodBackoff = FLOOD_BACKOFF_START;                                   \
-    const u8 *tryFloodDetect = a->firstFloodDetect;                           \
-    u32 last_match = ones_u32;                                                \
-    const struct Teddy *teddy = (const struct Teddy *)fdr;                    \
-    const size_t iterBytes = 64;                                              \
-    DEBUG_PRINTF("params: buf %p len %zu start_offset %zu\n",                 \
-                 a->buf, a->len, a->start_offset);                            \
-                                                                              \
-    const m128 *maskBase = getMaskBase(teddy);                                \
-    PREPARE_MASKS(n_msk);                                                     \
-    const u32 *confBase = getConfBase(teddy);                                 \
-                                                                              \
-    const u64a *r_msk_base = getReinforcedMaskBase(teddy, n_msk);             \
-    u32 c_0 = 0x100;                                                          \
-    u32 c_128 = 0x100;                                                        \
-    const u8 *mainStart = ROUNDUP_PTR(ptr, 32);                               \
-    DEBUG_PRINTF("derive: ptr: %p mainstart %p\n", ptr, mainStart);           \
-    if (ptr < mainStart) {                                                    \
-        ptr = mainStart - 32;                                                 \
-        m256 p_mask;                                                          \
-        m256 val_0 = vectoredLoad256(&p_mask, ptr, a->start_offset,           \
-                                     a->buf, buf_end,                         \
-                                     a->buf_history, a->len_history, n_msk);  \
-        m256 r_0 = PREP_CONF_FN_NO_REINFORCEMENT(val_0, n_msk);               \
-        r_0 = or256(r_0, p_mask);                                             \
-        CONFIRM_TEDDY(r_0, 8, 0, VECTORING, conf_fn);                         \
-        ptr += 32;                                                            \
-    }                                                                         \
-                                                                              \
-    if (ptr + 32 <= buf_end) {                                                \
-        m256 r_0 = PREP_CONF_FN(ptr, n_msk);                                  \
-        CONFIRM_TEDDY(r_0, 8, 0, VECTORING, conf_fn);                         \
-        ptr += 32;                                                            \
-    }                                                                         \
-                                                                              \
-    for (; ptr + iterBytes <= buf_end; ptr += iterBytes) {                    \
-        __builtin_prefetch(ptr + (iterBytes * 4));                            \
-        CHECK_FLOOD;                                                          \
-        m256 r_0 = PREP_CONF_FN(ptr, n_msk);                                  \
-        CONFIRM_TEDDY(r_0, 8, 0, NOT_CAUTIOUS, conf_fn);                      \
-        m256 r_1 = PREP_CONF_FN(ptr + 32, n_msk);                             \
-        CONFIRM_TEDDY(r_1, 8, 32, NOT_CAUTIOUS, conf_fn);                     \
-    }                                                                         \
-                                                                              \
-    if (ptr + 32 <= buf_end) {                                                \
-        m256 r_0 = PREP_CONF_FN(ptr, n_msk);                                  \
-        CONFIRM_TEDDY(r_0, 8, 0, NOT_CAUTIOUS, conf_fn);                      \
-        ptr += 32;                                                            \
-    }                                                                         \
-                                                                              \
-    assert(ptr + 32 > buf_end);                                               \
-    if (ptr < buf_end) {                                                      \
-        m256 p_mask;                                                          \
-        m256 val_0 = vectoredLoad256(&p_mask, ptr, 0, ptr, buf_end,           \
-                                     a->buf_history, a->len_history, n_msk);  \
-        m256 r_0 = PREP_CONF_FN_NO_REINFORCEMENT(val_0, n_msk);               \
-        r_0 = or256(r_0, p_mask);                                             \
-        CONFIRM_TEDDY(r_0, 8, 0, VECTORING, conf_fn);                         \
-    }                                                                         \
-                                                                              \
-    return HWLM_SUCCESS;                                                      \
-} while(0)
-
-#else // not defined HAVE_AVX2
-
-#ifdef ARCH_64_BIT
-#define CONFIRM_TEDDY(var, bucket, offset, reason, conf_fn)                 \
-do {                                                                        \
-    if (unlikely(diff128(var, ones128()))) {                                \
-        u64a __attribute__((aligned(16))) vec[2];                           \
-        store128(vec, var);                                                 \
-        u64a lo = vec[0];                                                   \
-        u64a hi = vec[1];                                                   \
-        CONF_CHUNK_64(lo, bucket, offset, reason, conf_fn);                 \
-        CONF_CHUNK_64(hi, bucket, offset + 8, reason, conf_fn);             \
-    }                                                                       \
-} while(0)
-#else
-#define CONFIRM_TEDDY(var, bucket, offset, reason, conf_fn)                 \
-do {                                                                        \
-    if (unlikely(diff128(var, ones128()))) {                                \
-        u32 part1 = movd(var);                                              \
-        u32 part2 = movd(rshiftbyte_m128(var, 4));                          \
-        u32 part3 = movd(rshiftbyte_m128(var, 8));                          \
-        u32 part4 = movd(rshiftbyte_m128(var, 12));                         \
-        CONF_CHUNK_32(part1, bucket, offset, reason, conf_fn);              \
-        CONF_CHUNK_32(part2, bucket, offset + 4, reason, conf_fn);          \
-        CONF_CHUNK_32(part3, bucket, offset + 8, reason, conf_fn);          \
-        CONF_CHUNK_32(part4, bucket, offset + 12, reason, conf_fn);         \
-    }                                                                       \
-} while(0)
-#endif
-
-static really_inline
-m128 prep_conf_teddy_m1(const m128 *maskBase, m128 val) {
-    m128 mask = set1_16x8(0xf);
-    m128 lo = and128(val, mask);
-    m128 hi = and128(rshift64_m128(val, 4), mask);
-    return or128(pshufb_m128(maskBase[0 * 2], lo),
-                 pshufb_m128(maskBase[0 * 2 + 1], hi));
-}
-
-static really_inline
-m128 prep_conf_teddy_m2(const m128 *maskBase, m128 *old_1, m128 val) {
-    m128 mask = set1_16x8(0xf);
-    m128 lo = and128(val, mask);
-    m128 hi = and128(rshift64_m128(val, 4), mask);
-    m128 r = prep_conf_teddy_m1(maskBase, val);
-
-    m128 res_1 = or128(pshufb_m128(maskBase[1 * 2], lo),
-                       pshufb_m128(maskBase[1 * 2 + 1], hi));
-    m128 res_shifted_1 = palignr(res_1, *old_1, 16 - 1);
-    *old_1 = res_1;
-    return or128(r, res_shifted_1);
-}
-
-static really_inline
-m128 prep_conf_teddy_m3(const m128 *maskBase, m128 *old_1, m128 *old_2,
-                        m128 val) {
-    m128 mask = set1_16x8(0xf);
-    m128 lo = and128(val, mask);
-    m128 hi = and128(rshift64_m128(val, 4), mask);
-    m128 r = prep_conf_teddy_m2(maskBase, old_1, val);
-
-    m128 res_2 = or128(pshufb_m128(maskBase[2 * 2], lo),
-                       pshufb_m128(maskBase[2 * 2 + 1], hi));
-    m128 res_shifted_2 = palignr(res_2, *old_2, 16 - 2);
-    *old_2 = res_2;
-    return or128(r, res_shifted_2);
-}
-
-static really_inline
-m128 prep_conf_teddy_m4(const m128 *maskBase, m128 *old_1, m128 *old_2,
-                        m128 *old_3, m128 val) {
-    m128 mask = set1_16x8(0xf);
-    m128 lo = and128(val, mask);
-    m128 hi = and128(rshift64_m128(val, 4), mask);
-    m128 r = prep_conf_teddy_m3(maskBase, old_1, old_2, val);
-
-    m128 res_3 = or128(pshufb_m128(maskBase[3 * 2], lo),
-                       pshufb_m128(maskBase[3 * 2 + 1], hi));
-    m128 res_shifted_3 = palignr(res_3, *old_3, 16 - 3);
-    *old_3 = res_3;
-    return or128(r, res_shifted_3);
-}
-
-#define FDR_EXEC_TEDDY_RES_OLD_1
-
-#define FDR_EXEC_TEDDY_RES_OLD_2                                              \
-    m128 res_old_1 = zeroes128();
-
-#define FDR_EXEC_TEDDY_RES_OLD_3                                              \
-    m128 res_old_1 = zeroes128();                                             \
-    m128 res_old_2 = zeroes128();
-
-#define FDR_EXEC_TEDDY_RES_OLD_4                                              \
-    m128 res_old_1 = zeroes128();                                             \
-    m128 res_old_2 = zeroes128();                                             \
-    m128 res_old_3 = zeroes128();
-
-#define FDR_EXEC_TEDDY_RES_OLD(n) FDR_EXEC_TEDDY_RES_OLD_##n
-
-#define PREP_CONF_FN_1(mask_base, val)                                        \
-    prep_conf_teddy_m1(mask_base, val)
-
-#define PREP_CONF_FN_2(mask_base, val)                                        \
-    prep_conf_teddy_m2(mask_base, &res_old_1, val)
-
-#define PREP_CONF_FN_3(mask_base, val)                                        \
-    prep_conf_teddy_m3(mask_base, &res_old_1, &res_old_2, val)
-
-#define PREP_CONF_FN_4(mask_base, val)                                        \
-    prep_conf_teddy_m4(mask_base, &res_old_1, &res_old_2, &res_old_3, val)
-
-#define PREP_CONF_FN(mask_base, val, n)                                       \
-    PREP_CONF_FN_##n(mask_base, val)
-
-#define FDR_EXEC_TEDDY(fdr, a, control, n_msk, conf_fn)                       \
-do {                                                                          \
-    const u8 *buf_end = a->buf + a->len;                                      \
-    const u8 *ptr = a->buf + a->start_offset;                                 \
-    u32 floodBackoff = FLOOD_BACKOFF_START;                                   \
-    const u8 *tryFloodDetect = a->firstFloodDetect;                           \
-    u32 last_match = ones_u32;                                                \
-    const struct Teddy *teddy = (const struct Teddy *)fdr;                    \
-    const size_t iterBytes = 32;                                              \
-    DEBUG_PRINTF("params: buf %p len %zu start_offset %zu\n",                 \
-                 a->buf, a->len, a->start_offset);                            \
-                                                                              \
-    const m128 *maskBase = getMaskBase(teddy);                                \
-    const u32 *confBase = getConfBase(teddy);                                 \
-                                                                              \
-    FDR_EXEC_TEDDY_RES_OLD(n_msk);                                            \
-    const u8 *mainStart = ROUNDUP_PTR(ptr, 16);                               \
-    DEBUG_PRINTF("derive: ptr: %p mainstart %p\n", ptr, mainStart);           \
-    if (ptr < mainStart) {                                                    \
-        ptr = mainStart - 16;                                                 \
-        m128 p_mask;                                                          \
-        m128 val_0 = vectoredLoad128(&p_mask, ptr, a->start_offset,           \
-                                     a->buf, buf_end,                         \
-                                     a->buf_history, a->len_history, n_msk);  \
-        m128 r_0 = PREP_CONF_FN(maskBase, val_0, n_msk);                      \
-        r_0 = or128(r_0, p_mask);                                             \
-        CONFIRM_TEDDY(r_0, 8, 0, VECTORING, conf_fn);                         \
-        ptr += 16;                                                            \
-    }                                                                         \
-                                                                              \
-    if (ptr + 16 <= buf_end) {                                                \
-        m128 r_0 = PREP_CONF_FN(maskBase, load128(ptr), n_msk);               \
-        CONFIRM_TEDDY(r_0, 8, 0, VECTORING, conf_fn);                         \
-        ptr += 16;                                                            \
-    }                                                                         \
-                                                                              \
-    for (; ptr + iterBytes <= buf_end; ptr += iterBytes) {                    \
-        __builtin_prefetch(ptr + (iterBytes * 4));                            \
-        CHECK_FLOOD;                                                          \
-        m128 r_0 = PREP_CONF_FN(maskBase, load128(ptr), n_msk);               \
-        CONFIRM_TEDDY(r_0, 8, 0, NOT_CAUTIOUS, conf_fn);                      \
-        m128 r_1 = PREP_CONF_FN(maskBase, load128(ptr + 16), n_msk);          \
-        CONFIRM_TEDDY(r_1, 8, 16, NOT_CAUTIOUS, conf_fn);                     \
-    }                                                                         \
-                                                                              \
-    if (ptr + 16 <= buf_end) {                                                \
-        m128 r_0 = PREP_CONF_FN(maskBase, load128(ptr), n_msk);               \
-        CONFIRM_TEDDY(r_0, 8, 0, NOT_CAUTIOUS, conf_fn);                      \
-        ptr += 16;                                                            \
-    }                                                                         \
-                                                                              \
-    assert(ptr + 16 > buf_end);                                               \
-    if (ptr < buf_end) {                                                      \
-        m128 p_mask;                                                          \
-        m128 val_0 = vectoredLoad128(&p_mask, ptr, 0, ptr, buf_end,           \
-                                     a->buf_history, a->len_history, n_msk);  \
-        m128 r_0 = PREP_CONF_FN(maskBase, val_0, n_msk);                      \
-        r_0 = or128(r_0, p_mask);                                             \
-        CONFIRM_TEDDY(r_0, 8, 0, VECTORING, conf_fn);                         \
-    }                                                                         \
-                                                                              \
-    return HWLM_SUCCESS;                                                      \
-} while(0)
-
-#endif // HAVE_AVX2 HAVE_AVX512
-
-hwlm_error_t fdr_exec_teddy_msks1(const struct FDR *fdr,
-                                  const struct FDR_Runtime_Args *a,
-                                  hwlm_group_t control) {
-    FDR_EXEC_TEDDY(fdr, a, control, 1, do_confWithBit_teddy);
-}
-
-hwlm_error_t fdr_exec_teddy_msks1_pck(const struct FDR *fdr,
-                                      const struct FDR_Runtime_Args *a,
-                                      hwlm_group_t control) {
-    FDR_EXEC_TEDDY(fdr, a, control, 1, do_confWithBit_teddy);
-}
-
-hwlm_error_t fdr_exec_teddy_msks2(const struct FDR *fdr,
-                                  const struct FDR_Runtime_Args *a,
-                                  hwlm_group_t control) {
-    FDR_EXEC_TEDDY(fdr, a, control, 2, do_confWithBit_teddy);
-}
-
-hwlm_error_t fdr_exec_teddy_msks2_pck(const struct FDR *fdr,
-                                      const struct FDR_Runtime_Args *a,
-                                      hwlm_group_t control) {
-    FDR_EXEC_TEDDY(fdr, a, control, 2, do_confWithBit_teddy);
-}
-
-hwlm_error_t fdr_exec_teddy_msks3(const struct FDR *fdr,
-                                  const struct FDR_Runtime_Args *a,
-                                  hwlm_group_t control) {
-    FDR_EXEC_TEDDY(fdr, a, control, 3, do_confWithBit_teddy);
-}
-
-hwlm_error_t fdr_exec_teddy_msks3_pck(const struct FDR *fdr,
-                                      const struct FDR_Runtime_Args *a,
-                                      hwlm_group_t control) {
-    FDR_EXEC_TEDDY(fdr, a, control, 3, do_confWithBit_teddy);
-}
-
-hwlm_error_t fdr_exec_teddy_msks4(const struct FDR *fdr,
-                                  const struct FDR_Runtime_Args *a,
-                                  hwlm_group_t control) {
-    FDR_EXEC_TEDDY(fdr, a, control, 4, do_confWithBit_teddy);
-}
-
-hwlm_error_t fdr_exec_teddy_msks4_pck(const struct FDR *fdr,
-                                      const struct FDR_Runtime_Args *a,
-                                      hwlm_group_t control) {
-    FDR_EXEC_TEDDY(fdr, a, control, 4, do_confWithBit_teddy);
-}
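
The hunk above deletes the last of the old macro-built scanners, in which `FDR_EXEC_TEDDY` expanded an entire function body, `return` statements included, at each call site. To make the motivation for the rewrite concrete, here is a minimal sketch (hypothetical names, not code from this commit) contrasting a statement macro that hides a `return` with the templated shape the new file adopts:

    #include <cstdio>

    // The old shape: a statement macro that contains a `return`, so invoking
    // it can silently exit the enclosing function.
    #define SCAN_ALL(n)                          \
        do {                                     \
            for (int i = 0; i < (n); ++i) {      \
                if (i == 3) return 1;            \
            }                                    \
        } while (0)

    int scan_with_macro(int n) {
        SCAN_ALL(n);   // may return from scan_with_macro; invisible here
        return 0;
    }

    // The refactor's shape: the unroll count becomes a template parameter and
    // the control flow is ordinary, steppable C++.
    template <int N>
    int scan_templ(int n) {
        for (int i = 0; i < n; ++i) {
            if (i == N) return 1;
        }
        return 0;
    }

    int main() {
        std::printf("%d %d\n", scan_with_macro(5), scan_templ<3>(5)); // 1 1
        return 0;
    }

With the template, each of `<1>`..`<4>` is a separate, named, debuggable instantiation instead of an opaque macro expansion.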
diff --git a/src/fdr/teddy.cpp b/src/fdr/teddy.cpp
new file mode 100644 (file)
index 0000000..d2993f5
--- /dev/null
@@ -0,0 +1,862 @@
+/*
+ * Copyright (c) 2015-2020, Intel Corporation
+ * Copyright (c) 2024, VectorCamp PC
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *
+ *  * Redistributions of source code must retain the above copyright notice,
+ *    this list of conditions and the following disclaimer.
+ *  * Redistributions in binary form must reproduce the above copyright
+ *    notice, this list of conditions and the following disclaimer in the
+ *    documentation and/or other materials provided with the distribution.
+ *  * Neither the name of Intel Corporation nor the names of its contributors
+ *    may be used to endorse or promote products derived from this software
+ *    without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
+ * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+ * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+ * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
+ * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
+ * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+ * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
+ * POSSIBILITY OF SUCH DAMAGE.
+ */
+
+/** \file
+ * \brief Teddy literal matcher: SSSE3/AVX2/AVX512 engine runtime.
+ */
+
+#include "fdr_internal.h"
+#include "flood_runtime.h"
+#include "teddy.h"
+#include "teddy_internal.h"
+#include "teddy_runtime_common.h"
+#include "util/arch.h"
+#include "util/simd_utils.h"
+
+
+#ifdef ARCH_64_BIT
+static really_inline
+hwlm_error_t conf_chunk_64(u64a chunk, u8 bucket, u8 offset,
+                           CautionReason reason, const u8 *pt,
+                           const u32* confBase,
+                           const struct FDR_Runtime_Args *a,
+                           hwlm_group_t *control,
+                           u32 *last_match) {
+    if (unlikely(chunk != ones_u64a)) {
+        chunk = ~chunk;
+        do_confWithBit_teddy(&chunk, bucket, offset, confBase, reason, a, pt,
+                control, last_match);
+        // adapted from CHECK_HWLM_TERMINATE_MATCHING
+        if (unlikely(*control == HWLM_TERMINATE_MATCHING)) {
+            return HWLM_TERMINATED;
+        }
+    }
+    return HWLM_SUCCESS;
+}
+
+#define CONF_CHUNK_64(chunk, bucket, off, reason, pt, confBase, a, control, last_match) \
+    if (conf_chunk_64(chunk, bucket, off, reason, pt, confBase, a, control,   \
+                      last_match) == HWLM_TERMINATED) return HWLM_TERMINATED;
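
The helper-plus-thin-macro pattern above keeps the old early-exit behaviour while moving the logic into a real function that reports termination through its return value. A minimal sketch of the same pattern, with entirely hypothetical names:

    #include <cstdio>

    enum Status { OK = 0, TERMINATED = 1 };

    // The confirm logic lives in an ordinary inline function that signals
    // termination via its return value instead of returning for its caller.
    static inline Status check_chunk(unsigned chunk) {
        return chunk == 0xFFFFFFFFu ? OK : TERMINATED;
    }

    // A one-line macro restores the "bail out of the enclosing scanner"
    // semantics that the old all-macro code had.
    #define CHECK_CHUNK(c)                                   \
        if (check_chunk(c) == TERMINATED) return TERMINATED;

    Status scan(const unsigned *chunks, int n) {
        for (int i = 0; i < n; ++i) {
            CHECK_CHUNK(chunks[i]);
        }
        return OK;
    }

    int main() {
        unsigned v[] = {0xFFFFFFFFu, 0x0u, 0xFFFFFFFFu};
        std::printf("%d\n", scan(v, 3)); // 1: terminated at the second chunk
        return 0;
    }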
+
+#else // 32/64
+
+static really_inline
+hwlm_error_t conf_chunk_32(u32 chunk, u8 bucket, u8 offset,
+                           CautionReason reason, const u8 *pt,
+                           const u32* confBase,
+                           const struct FDR_Runtime_Args *a,
+                           hwlm_group_t *control,
+                           u32 *last_match) {
+    if (unlikely(chunk != ones_u32)) {
+        chunk = ~chunk;
+        do_confWithBit_teddy(&chunk, bucket, offset, confBase, reason, a, pt,
+                control, last_match);
+        // adapted from CHECK_HWLM_TERMINATE_MATCHING
+        if (unlikely(*control == HWLM_TERMINATE_MATCHING)) {
+            return HWLM_TERMINATED;
+        }
+    }
+    return HWLM_SUCCESS;
+}
+
+#define CONF_CHUNK_32(chunk, bucket, off, reason, pt, confBase, a, control, last_match) \
+    if (conf_chunk_32(chunk, bucket, off, reason, pt, confBase, a, control,   \
+                      last_match) == HWLM_TERMINATED) return HWLM_TERMINATED;
+
+#endif
+
+#if defined(HAVE_AVX512VBMI) || defined(HAVE_AVX512) // common to both 512-bit variants
+
+static really_inline
+const m512 *getDupMaskBase(const struct Teddy *teddy, u8 numMask) {
+    return (const m512 *)((const u8 *)teddy + ROUNDUP_CL(sizeof(struct Teddy))
+                          + ROUNDUP_CL(2 * numMask * sizeof(m256)));
+}
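
getDupMaskBase is plain offset arithmetic: skip the cache-line-rounded Teddy header, then the cache-line-rounded block of 2 * numMask m256 masks. A small sketch of that arithmetic, assuming a 64-byte cache line and illustrative sizes (roundup_cl is a hypothetical stand-in for the library's ROUNDUP_CL):

    #include <cstddef>
    #include <cstdio>

    // Round a size up to a 64-byte cache-line boundary.
    constexpr std::size_t roundup_cl(std::size_t n) {
        return (n + 63) & ~std::size_t{63};
    }

    int main() {
        // e.g. a 48-byte header followed by 2 * numMask 32-byte masks,
        // with numMask = 3:
        std::size_t header = roundup_cl(48);          // -> 64
        std::size_t masks  = roundup_cl(2 * 3 * 32);  // -> 192
        std::printf("dup masks start at byte offset %zu\n", header + masks);
        return 0;
    }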
+
+
+#ifdef ARCH_64_BIT
+
+static really_inline
+hwlm_error_t confirm_teddy_64_512(m512 var, u8 bucket, u8 offset,
+                                  CautionReason reason, const u8 *ptr,
+                                  const struct FDR_Runtime_Args *a,
+                                  const u32* confBase, hwlm_group_t *control,
+                                  u32 *last_match) {
+    if (unlikely(diff512(var, ones512()))) {
+        m128 p128_0 = extract128from512(var, 0);
+        m128 p128_1 = extract128from512(var, 1);
+        m128 p128_2 = extract128from512(var, 2);
+        m128 p128_3 = extract128from512(var, 3);
+        u64a part1 = movq(p128_0);
+        u64a part2 = movq(rshiftbyte_m128(p128_0, 8));
+        u64a part3 = movq(p128_1);
+        u64a part4 = movq(rshiftbyte_m128(p128_1, 8));
+        u64a part5 = movq(p128_2);
+        u64a part6 = movq(rshiftbyte_m128(p128_2, 8));
+        u64a part7 = movq(p128_3);
+        u64a part8 = movq(rshiftbyte_m128(p128_3, 8));
+        CONF_CHUNK_64(part1, bucket, offset, reason, ptr, confBase, a, control, last_match);
+        CONF_CHUNK_64(part2, bucket, offset + 8, reason, ptr, confBase, a, control, last_match);
+        CONF_CHUNK_64(part3, bucket, offset + 16, reason, ptr, confBase, a, control, last_match);
+        CONF_CHUNK_64(part4, bucket, offset + 24, reason, ptr, confBase, a, control, last_match);
+        CONF_CHUNK_64(part5, bucket, offset + 32, reason, ptr, confBase, a, control, last_match);
+        CONF_CHUNK_64(part6, bucket, offset + 40, reason, ptr, confBase, a, control, last_match);
+        CONF_CHUNK_64(part7, bucket, offset + 48, reason, ptr, confBase, a, control, last_match);
+        CONF_CHUNK_64(part8, bucket, offset + 56, reason, ptr, confBase, a, control, last_match);
+    }
+    return HWLM_SUCCESS;
+}
+
+#define confirm_teddy_512_f confirm_teddy_64_512
+
+#else // 32/64
+
+static really_inline
+hwlm_error_t confirm_teddy_32_512(m512 var, u8 bucket, u8 offset,
+                                  CautionReason reason, const u8 *ptr,
+                                  const struct FDR_Runtime_Args *a,
+                                  const u32* confBase, hwlm_group_t *control,
+                                  u32 *last_match) {
+    if (unlikely(diff512(var, ones512()))) {
+        m128 p128_0 = extract128from512(var, 0);
+        m128 p128_1 = extract128from512(var, 1);
+        m128 p128_2 = extract128from512(var, 2);
+        m128 p128_3 = extract128from512(var, 3);
+        u32 part1 = movd(p128_0);
+        u32 part2 = movd(rshiftbyte_m128(p128_0, 4));
+        u32 part3 = movd(rshiftbyte_m128(p128_0, 8));
+        u32 part4 = movd(rshiftbyte_m128(p128_0, 12));
+        u32 part5 = movd(p128_1);
+        u32 part6 = movd(rshiftbyte_m128(p128_1, 4));
+        u32 part7 = movd(rshiftbyte_m128(p128_1, 8));
+        u32 part8 = movd(rshiftbyte_m128(p128_1, 12));
+        u32 part9 = movd(p128_2);
+        u32 part10 = movd(rshiftbyte_m128(p128_2, 4));
+        u32 part11 = movd(rshiftbyte_m128(p128_2, 8));
+        u32 part12 = movd(rshiftbyte_m128(p128_2, 12));
+        u32 part13 = movd(p128_3);
+        u32 part14 = movd(rshiftbyte_m128(p128_3, 4));
+        u32 part15 = movd(rshiftbyte_m128(p128_3, 8));
+        u32 part16 = movd(rshiftbyte_m128(p128_3, 12));
+        CONF_CHUNK_32(part1, bucket, offset, reason, ptr, confBase, a, control, last_match);
+        CONF_CHUNK_32(part2, bucket, offset + 4, reason, ptr, confBase, a, control, last_match);
+        CONF_CHUNK_32(part3, bucket, offset + 8, reason, ptr, confBase, a, control, last_match);
+        CONF_CHUNK_32(part4, bucket, offset + 12, reason, ptr, confBase, a, control, last_match);
+        CONF_CHUNK_32(part5, bucket, offset + 16, reason, ptr, confBase, a, control, last_match);
+        CONF_CHUNK_32(part6, bucket, offset + 20, reason, ptr, confBase, a, control, last_match);
+        CONF_CHUNK_32(part7, bucket, offset + 24, reason, ptr, confBase, a, control, last_match);
+        CONF_CHUNK_32(part8, bucket, offset + 28, reason, ptr, confBase, a, control, last_match);
+        CONF_CHUNK_32(part9, bucket, offset + 32, reason, ptr, confBase, a, control, last_match);
+        CONF_CHUNK_32(part10, bucket, offset + 36, reason, ptr, confBase, a, control, last_match);
+        CONF_CHUNK_32(part11, bucket, offset + 40, reason, ptr, confBase, a, control, last_match);
+        CONF_CHUNK_32(part12, bucket, offset + 44, reason, ptr, confBase, a, control, last_match);
+        CONF_CHUNK_32(part13, bucket, offset + 48, reason, ptr, confBase, a, control, last_match);
+        CONF_CHUNK_32(part14, bucket, offset + 52, reason, ptr, confBase, a, control, last_match);
+        CONF_CHUNK_32(part15, bucket, offset + 56, reason, ptr, confBase, a, control, last_match);
+        CONF_CHUNK_32(part16, bucket, offset + 60, reason, ptr, confBase, a, control, last_match);
+    }
+    return HWLM_SUCCESS;
+}
+
+#define confirm_teddy_512_f confirm_teddy_32_512
+
+
+#endif // 32/64
+
+#define CONFIRM_TEDDY_512(...)                                                \
+    if (confirm_teddy_512_f(__VA_ARGS__, a, confBase, &control,               \
+                            &last_match) == HWLM_TERMINATED)                  \
+        return HWLM_TERMINATED;
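
CONFIRM_TEDDY_512 forwards whatever the call site passes and appends the locals (a, confBase, control, last_match) that every scanner below has in scope, so the call sites stay short. A toy version of that variadic-forwarding trick, all names hypothetical:

    #include <cstdio>

    static int confirm(int value, int offset, const int *table, int *control) {
        // stand-in for the real confirm step; flags termination on a zero value
        (void)offset; (void)table;
        if (value == 0) { *control = 0; return 1; }
        return 0;
    }

    // The macro forwards the per-call arguments and appends the locals that
    // every call site would otherwise repeat.
    #define CONFIRM(...)                                           \
        if (confirm(__VA_ARGS__, table, &control) == 1) return 1;

    static int scan(const int *vals, int n) {
        static const int table[4] = {0, 1, 2, 3};
        int control = 1;
        for (int i = 0; i < n; ++i) {
            CONFIRM(vals[i], i);  // expands to confirm(vals[i], i, table, &control)
        }
        return 0;
    }

    int main() {
        int v[] = {5, 0, 7};
        std::printf("%d\n", scan(v, 3)); // 1: confirm() flagged termination
        return 0;
    }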
+
+#endif // AVX512VBMI or AVX512
+
+
+#if defined(HAVE_AVX512VBMI) // VBMI strong teddy
+
+#define TEDDY_VBMI_SL1_MASK   0xfffffffffffffffeULL
+#define TEDDY_VBMI_SL2_MASK   0xfffffffffffffffcULL
+#define TEDDY_VBMI_SL3_MASK   0xfffffffffffffff8ULL
+
+template<int NMSK>
+static really_inline
+m512 prep_conf_teddy_512vbmi_templ(const m512 *lo_mask, const m512 *dup_mask,
+                                   const m512 *sl_msk, const m512 val) {
+    m512 lo = and512(val, *lo_mask);
+    m512 hi = and512(rshift64_m512(val, 4), *lo_mask);
+    m512 shuf_or_b0 = or512(pshufb_m512(dup_mask[0], lo),
+                            pshufb_m512(dup_mask[1], hi));
+
+    if constexpr (NMSK == 1) return shuf_or_b0;
+    m512 shuf_or_b1 = or512(pshufb_m512(dup_mask[2], lo),
+                            pshufb_m512(dup_mask[3], hi));
+    m512 sl1 = maskz_vpermb512(TEDDY_VBMI_SL1_MASK, sl_msk[0], shuf_or_b1);
+    if constexpr (NMSK == 2) return (or512(sl1, shuf_or_b0));
+    m512 shuf_or_b2 = or512(pshufb_m512(dup_mask[4], lo),
+                            pshufb_m512(dup_mask[5], hi));
+    m512 sl2 = maskz_vpermb512(TEDDY_VBMI_SL2_MASK, sl_msk[1], shuf_or_b2);
+    if constexpr (NMSK == 3) return (or512(sl2, or512(sl1, shuf_or_b0)));
+    m512 shuf_or_b3 = or512(pshufb_m512(dup_mask[6], lo),
+                            pshufb_m512(dup_mask[7], hi));
+    m512 sl3 = maskz_vpermb512(TEDDY_VBMI_SL3_MASK, sl_msk[2], shuf_or_b3);
+    return (or512(sl3, or512(sl2, or512(sl1, shuf_or_b0))));
+}
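
prep_conf_teddy_512vbmi_templ replaces the old SHIFT_OR macro family with one function whose `if constexpr` exits prune the unused tail at compile time, so `<1>`..`<4>` each compile to straight-line code. The same shape on scalars, as a sketch:

    #include <cstdio>

    // Each extra "mask" adds one term; if constexpr discards the rest of the
    // function for smaller NMSK at compile time.
    template <int NMSK>
    int combine(const int *terms) {
        int acc = terms[0];
        if constexpr (NMSK == 1) return acc;
        acc |= terms[1];
        if constexpr (NMSK == 2) return acc;
        acc |= terms[2];
        if constexpr (NMSK == 3) return acc;
        return acc | terms[3];
    }

    int main() {
        int t[4] = {1, 2, 4, 8};
        std::printf("%d %d\n", combine<2>(t), combine<4>(t)); // 3 15
        return 0;
    }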
+
+
+#define TEDDY_VBMI_SL1_POS    15
+#define TEDDY_VBMI_SL2_POS    14
+#define TEDDY_VBMI_SL3_POS    13
+
+#define TEDDY_VBMI_CONF_MASK_HEAD   (0xffffffffffffffffULL >> n_sh)
+#define TEDDY_VBMI_CONF_MASK_FULL   (0xffffffffffffffffULL << n_sh)
+#define TEDDY_VBMI_CONF_MASK_VAR(n) (0xffffffffffffffffULL >> (64 - n) << overlap)
+#define TEDDY_VBMI_LOAD_MASK_PATCH  (0xffffffffffffffffULL >> (64 - n_sh))
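
These masks are ordinary shift arithmetic over a 64-lane bitmask. Evaluating the same expressions for an assumed NMSK = 3 (so n_sh = 2) and a 10-byte tail, purely for illustration:

    #include <cstdint>
    #include <cstdio>

    int main() {
        unsigned n_sh = 2;
        uint64_t head = 0xffffffffffffffffULL >> n_sh;  // top 2 lanes cleared
        uint64_t full = 0xffffffffffffffffULL << n_sh;  // bottom 2 lanes cleared
        unsigned left = 10, overlap = n_sh;             // a 10-byte tail
        uint64_t var  = 0xffffffffffffffffULL >> (64 - left) << overlap;
        std::printf("%016llx\n%016llx\n%016llx\n",
                    (unsigned long long)head, (unsigned long long)full,
                    (unsigned long long)var);  // 3fff..., ...fffc, ...0ffc
        return 0;
    }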
+
+template<int NMSK>
+hwlm_error_t fdr_exec_teddy_512vbmi_templ(const struct FDR *fdr,
+                                      const struct FDR_Runtime_Args *a,
+                                      hwlm_group_t control) {
+    const u8 *buf_end = a->buf + a->len;
+    const u8 *ptr = a->buf + a->start_offset;
+    u32 floodBackoff = FLOOD_BACKOFF_START;
+    const u8 *tryFloodDetect = a->firstFloodDetect;
+    u32 last_match = ones_u32;
+    const struct Teddy *teddy = (const struct Teddy *)fdr;
+    const size_t iterBytes = 64;
+    u32 n_sh = NMSK - 1;
+    const size_t loopBytes = 64 - n_sh;
+    DEBUG_PRINTF("params: buf %p len %zu start_offset %zu\n",
+                 a->buf, a->len, a->start_offset);
+
+    const m128 *maskBase = getMaskBase(teddy);
+
+    m512 lo_mask = set1_64x8(0xf);
+    m512 dup_mask[NMSK * 2];
+    m512 sl_msk[NMSK - 1];
+    dup_mask[0] = set1_4x128(maskBase[0]);
+    dup_mask[1] = set1_4x128(maskBase[1]);
+    if constexpr (NMSK > 1) {
+        dup_mask[2] = set1_4x128(maskBase[2]);
+        dup_mask[3] = set1_4x128(maskBase[3]);
+        sl_msk[0] = loadu512(p_sh_mask_arr + TEDDY_VBMI_SL1_POS);
+    }
+    if constexpr (NMSK > 2) {
+        dup_mask[4] = set1_4x128(maskBase[4]);
+        dup_mask[5] = set1_4x128(maskBase[5]);
+        sl_msk[1] = loadu512(p_sh_mask_arr + TEDDY_VBMI_SL2_POS);
+    }
+    if constexpr (NMSK > 3) {
+        dup_mask[6] = set1_4x128(maskBase[6]);
+        dup_mask[7] = set1_4x128(maskBase[7]);
+        sl_msk[2] = loadu512(p_sh_mask_arr + TEDDY_VBMI_SL3_POS);
+    }
+    const u32 *confBase = getConfBase(teddy);
+
+    u64a k = TEDDY_VBMI_CONF_MASK_FULL;
+    m512 p_mask = set_mask_m512(~k);
+    u32 overlap = 0;
+    u64a patch = 0;
+    if (likely(ptr + loopBytes <= buf_end)) {
+        m512 p_mask0 = set_mask_m512(~TEDDY_VBMI_CONF_MASK_HEAD);
+        m512 r_0 = prep_conf_teddy_512vbmi_templ<NMSK>(&lo_mask, dup_mask, sl_msk, loadu512(ptr));
+        r_0 = or512(r_0, p_mask0);
+        CONFIRM_TEDDY_512(r_0, 8, 0, VECTORING, ptr);
+        ptr += loopBytes;
+        overlap = n_sh;
+        patch = TEDDY_VBMI_LOAD_MASK_PATCH;
+    }
+
+    for (; ptr + loopBytes <= buf_end; ptr += loopBytes) {
+        __builtin_prefetch(ptr - n_sh + (64 * 2));
+        CHECK_FLOOD;
+        m512 r_0 = prep_conf_teddy_512vbmi_templ<NMSK>(&lo_mask, dup_mask, sl_msk, loadu512(ptr - n_sh));
+        r_0 = or512(r_0, p_mask);
+        CONFIRM_TEDDY_512(r_0, 8, 0, NOT_CAUTIOUS, ptr - n_sh);
+    }
+
+    assert(ptr + loopBytes > buf_end);
+    if (ptr < buf_end) {
+        u32 left = (u32)(buf_end - ptr);
+        u64a k1 = TEDDY_VBMI_CONF_MASK_VAR(left);
+        m512 p_mask1 = set_mask_m512(~k1);
+        m512 val_0 = loadu_maskz_m512(k1 | patch, ptr - overlap);
+        m512 r_0 = prep_conf_teddy_512vbmi_templ<NMSK>(&lo_mask, dup_mask, sl_msk, val_0);
+        r_0 = or512(r_0, p_mask1);
+        CONFIRM_TEDDY_512(r_0, 8, 0, VECTORING, ptr - overlap);
+    }
+
+    return HWLM_SUCCESS;
+}
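
The function above scans one full-width head block, then strides of 64 - n_sh bytes so each later block re-reads n_sh bytes of context, then one masked tail. A scalar model of that loop shape, with illustrative sizes rather than Teddy's:

    #include <cstdio>
    #include <cstring>

    static void process(const char *p, unsigned len, const char *what) {
        std::printf("%-4s %.*s\n", what, (int)len, p);
    }

    int main() {
        const char buf[] = "abcdefghijklmnopqrstuvwxyz";
        const unsigned width = 8, n_sh = 2, stride = width - n_sh;
        const char *ptr = buf;
        const char *end = buf + std::strlen(buf);

        if (ptr + stride <= end) {          // head: no history to overlap yet
            process(ptr, width, "head");
            ptr += stride;
        }
        for (; ptr + stride <= end; ptr += stride) {
            process(ptr - n_sh, width, "body"); // re-reads n_sh context bytes
        }
        if (ptr < end) {                    // tail: leftover bytes plus overlap
            unsigned left = (unsigned)(end - ptr);
            process(ptr - n_sh, left + n_sh, "tail");
        }
        return 0;
    }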
+
+#define FDR_EXEC_TEDDY_FN fdr_exec_teddy_512vbmi_templ
+
+#elif defined(HAVE_AVX512) // AVX512 reinforced teddy
+
+/* both 512-bit variants share the confirm functions defined above */
+
+template <int NMSK>
+static inline
+m512 shift_or_512_templ(const m512 *dup_mask, m512 lo, m512 hi) {
+    return or512(lshift128_m512(or512(pshufb_m512(dup_mask[(NMSK - 1) * 2], lo),
+                                pshufb_m512(dup_mask[(NMSK * 2) - 1], hi)),
+                                NMSK - 1), shift_or_512_templ<NMSK - 1>(dup_mask, lo, hi));
+}
+
+template <>
+m512 shift_or_512_templ<1>(const m512 *dup_mask, m512 lo, m512 hi){
+    return or512(pshufb_m512(dup_mask[0], lo), pshufb_m512(dup_mask[1], hi));
+}
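
shift_or_512_templ unrolls by recursion: the primary template peels one mask layer and recurses, and the full specialization for 1 terminates the chain, so the compiler flattens the whole thing inline. The same idiom on integers, as a sketch:

    #include <cstdio>

    // General case: fold in layer N-1, shifted by its index, then recurse.
    template <int N>
    int layered_or(const int *layers) {
        return (layers[N - 1] << (N - 1)) | layered_or<N - 1>(layers);
    }

    // Base case: the single-layer specialization ends the recursion.
    template <>
    int layered_or<1>(const int *layers) {
        return layers[0];
    }

    int main() {
        int layers[4] = {1, 1, 1, 1};
        std::printf("%d\n", layered_or<4>(layers)); // 1|2|4|8 = 15
        return 0;
    }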
+
+template <int NMSK>
+static really_inline
+m512 prep_conf_teddy_no_reinforcement_512_templ(const m512 *lo_mask,
+                                                const m512 *dup_mask,
+                                                const m512 val) {
+    m512 lo = and512(val, *lo_mask);
+    m512 hi = and512(rshift64_m512(val, 4), *lo_mask);
+    return shift_or_512_templ<NMSK>(dup_mask, lo, hi);
+}
+
+template <int NMSK>
+static really_inline
+m512 prep_conf_teddy_512_templ(const m512 *lo_mask, const m512 *dup_mask,
+                               const u8 *ptr, const u64a *r_msk_base,
+                               u32 *c_0, u32 *c_16, u32 *c_32, u32 *c_48) {
+    m512 lo = and512(load512(ptr), *lo_mask);
+    m512 hi = and512(rshift64_m512(load512(ptr), 4), *lo_mask);
+    *c_16 = *(ptr + 15);
+    *c_32 = *(ptr + 31);
+    *c_48 = *(ptr + 47);
+    m512 r_msk = set8x64(0ULL, r_msk_base[*c_48], 0ULL, r_msk_base[*c_32],
+                           0ULL, r_msk_base[*c_16], 0ULL, r_msk_base[*c_0]);
+    *c_0 = *(ptr + 63);
+    return or512(shift_or_512_templ<NMSK>(dup_mask, lo, hi), r_msk);
+}
+
+
+#define PREP_CONF_FN_512(ptr, n)                                              \
+    prep_conf_teddy_512_templ<n>(&lo_mask, dup_mask, ptr, r_msk_base,         \
+                                 &c_0, &c_16, &c_32, &c_48)
+
+template <int NMSK>
+hwlm_error_t fdr_exec_teddy_512_templ(const struct FDR *fdr,
+                                      const struct FDR_Runtime_Args *a,
+                                      hwlm_group_t control) {
+    const u8 *buf_end = a->buf + a->len;
+    const u8 *ptr = a->buf + a->start_offset;
+    u32 floodBackoff = FLOOD_BACKOFF_START;
+    const u8 *tryFloodDetect = a->firstFloodDetect;
+    u32 last_match = ones_u32;
+    const struct Teddy *teddy = (const struct Teddy *)fdr;
+    const size_t iterBytes = 128;
+    DEBUG_PRINTF("params: buf %p len %zu start_offset %zu\n",
+                 a->buf, a->len, a->start_offset);
+
+    const m128 *maskBase = getMaskBase(teddy);
+
+    m512 lo_mask = set1_64x8(0xf);
+    m512 dup_mask[NMSK * 2];
+
+    dup_mask[0] = set1_4x128(maskBase[0]);
+    dup_mask[1] = set1_4x128(maskBase[1]);
+    if constexpr (NMSK > 1) {
+        dup_mask[2] = set1_4x128(maskBase[2]);
+        dup_mask[3] = set1_4x128(maskBase[3]);
+    }
+    if constexpr (NMSK > 2) {
+        dup_mask[4] = set1_4x128(maskBase[4]);
+        dup_mask[5] = set1_4x128(maskBase[5]);
+    }
+    if constexpr (NMSK > 3) {
+        dup_mask[6] = set1_4x128(maskBase[6]);
+        dup_mask[7] = set1_4x128(maskBase[7]);
+    }
+    const u32 *confBase = getConfBase(teddy);
+
+    const u64a *r_msk_base = getReinforcedMaskBase(teddy, NMSK);
+    u32 c_0 = 0x100;
+    u32 c_16 = 0x100;
+    u32 c_32 = 0x100;
+    u32 c_48 = 0x100;
+    const u8 *mainStart = ROUNDUP_PTR(ptr, 64);
+    DEBUG_PRINTF("derive: ptr: %p mainstart %p\n", ptr, mainStart);
+    if (ptr < mainStart) {
+        ptr = mainStart - 64;
+        m512 p_mask;
+        m512 val_0 = vectoredLoad512(&p_mask, ptr, a->start_offset,
+                                     a->buf, buf_end,
+                                     a->buf_history, a->len_history, NMSK);
+        m512 r_0 = prep_conf_teddy_no_reinforcement_512_templ<NMSK>(&lo_mask, dup_mask, val_0);
+        r_0 = or512(r_0, p_mask);
+        CONFIRM_TEDDY_512(r_0, 8, 0, VECTORING, ptr);
+        ptr += 64;
+    }
+
+    if (ptr + 64 <= buf_end) {
+        m512 r_0 = PREP_CONF_FN_512(ptr, NMSK);
+        CONFIRM_TEDDY_512(r_0, 8, 0, VECTORING, ptr);
+        ptr += 64;
+    }
+
+    for (; ptr + iterBytes <= buf_end; ptr += iterBytes) {
+        __builtin_prefetch(ptr + (iterBytes * 4));
+        CHECK_FLOOD;
+        m512 r_0 = PREP_CONF_FN_512(ptr, NMSK);
+        CONFIRM_TEDDY_512(r_0, 8, 0, NOT_CAUTIOUS, ptr);
+        m512 r_1 = PREP_CONF_FN_512(ptr + 64, NMSK);
+        CONFIRM_TEDDY_512(r_1, 8, 64, NOT_CAUTIOUS, ptr);
+    }
+
+    if (ptr + 64 <= buf_end) {
+        m512 r_0 = PREP_CONF_FN_512(ptr, NMSK);
+        CONFIRM_TEDDY_512(r_0, 8, 0, NOT_CAUTIOUS, ptr);
+        ptr += 64;
+    }
+
+    assert(ptr + 64 > buf_end);
+    if (ptr < buf_end) {
+        m512 p_mask;
+        m512 val_0 = vectoredLoad512(&p_mask, ptr, 0, ptr, buf_end,
+                                     a->buf_history, a->len_history, NMSK);
+        m512 r_0 = prep_conf_teddy_no_reinforcement_512_templ<NMSK>(&lo_mask, dup_mask,val_0);
+        r_0 = or512(r_0, p_mask);
+        CONFIRM_TEDDY_512(r_0, 8, 0, VECTORING, ptr);
+    }
+
+    return HWLM_SUCCESS;
+}
+
+
+#define FDR_EXEC_TEDDY_FN fdr_exec_teddy_512_templ
+
+/* AVX512 and AVX512VBMI remain fully exclusive code paths, as in the original */
+
+#elif defined(HAVE_AVX2) // not HAVE_AVX512 but HAVE_AVX2 reinforced teddy
+
+#ifdef ARCH_64_BIT
+
+static really_inline
+hwlm_error_t confirm_teddy_64_256(m256 var, u8 bucket, u8 offset,
+                                  CautionReason reason, const u8 *ptr,
+                                  const struct FDR_Runtime_Args *a,
+                                  const u32* confBase, hwlm_group_t *control,
+                                  u32 *last_match) {
+    if (unlikely(diff256(var, ones256()))) {
+        m128 lo = movdq_lo(var);
+        m128 hi = movdq_hi(var);
+        u64a part1 = movq(lo);
+        u64a part2 = movq(rshiftbyte_m128(lo, 8));
+        u64a part3 = movq(hi);
+        u64a part4 = movq(rshiftbyte_m128(hi, 8));
+        CONF_CHUNK_64(part1, bucket, offset, reason, ptr, confBase, a, control, last_match);
+        CONF_CHUNK_64(part2, bucket, offset + 8, reason, ptr, confBase, a, control, last_match);
+        CONF_CHUNK_64(part3, bucket, offset + 16, reason, ptr, confBase, a, control, last_match);
+        CONF_CHUNK_64(part4, bucket, offset + 24, reason, ptr, confBase, a, control, last_match);
+    }
+    return HWLM_SUCCESS;
+}
+
+#define confirm_teddy_256_f confirm_teddy_64_256
+
+#else
+
+static really_inline
+hwlm_error_t confirm_teddy_32_256(m256 var, u8 bucket, u8 offset,
+                                  CautionReason reason, const u8 *ptr,
+                                  const struct FDR_Runtime_Args *a,
+                                  const u32* confBase, hwlm_group_t *control,
+                                  u32 *last_match) {
+    if (unlikely(diff256(var, ones256()))) {
+        m128 lo = movdq_lo(var);
+        m128 hi = movdq_hi(var);
+        u32 part1 = movd(lo);
+        u32 part2 = movd(rshiftbyte_m128(lo, 4));
+        u32 part3 = movd(rshiftbyte_m128(lo, 8));
+        u32 part4 = movd(rshiftbyte_m128(lo, 12));
+        u32 part5 = movd(hi);
+        u32 part6 = movd(rshiftbyte_m128(hi, 4));
+        u32 part7 = movd(rshiftbyte_m128(hi, 8));
+        u32 part8 = movd(rshiftbyte_m128(hi, 12));
+        CONF_CHUNK_32(part1, bucket, offset, reason, ptr, confBase, a, control, last_match);
+        CONF_CHUNK_32(part2, bucket, offset + 4, reason, ptr, confBase, a, control, last_match);
+        CONF_CHUNK_32(part3, bucket, offset + 8, reason, ptr, confBase, a, control, last_match);
+        CONF_CHUNK_32(part4, bucket, offset + 12, reason, ptr, confBase, a, control, last_match);
+        CONF_CHUNK_32(part5, bucket, offset + 16, reason, ptr, confBase, a, control, last_match);
+        CONF_CHUNK_32(part6, bucket, offset + 20, reason, ptr, confBase, a, control, last_match);
+        CONF_CHUNK_32(part7, bucket, offset + 24, reason, ptr, confBase, a, control, last_match);
+        CONF_CHUNK_32(part8, bucket, offset + 28, reason, ptr, confBase, a, control, last_match);
+    }
+    return HWLM_SUCCESS;
+}
+
+#define confirm_teddy_256_f confirm_teddy_32_256
+
+#endif
+
+#define CONFIRM_TEDDY_256(...)                                                \
+    if (confirm_teddy_256_f(__VA_ARGS__, a, confBase, &control,               \
+                            &last_match) == HWLM_TERMINATED)                  \
+        return HWLM_TERMINATED;
+
+
+template <int NMSK>
+static inline
+m256 shift_or_256_templ(const m256 *dup_mask, m256 lo, m256 hi) {
+    return or256(lshift128_m256(or256(pshufb_m256(dup_mask[(NMSK - 1) * 2], lo),
+                                pshufb_m256(dup_mask[(NMSK * 2) - 1], hi)),
+                                NMSK - 1), shift_or_256_templ<NMSK - 1>(dup_mask, lo, hi));
+}
+
+template <>
+m256 shift_or_256_templ<1>(const m256 *dup_mask, m256 lo, m256 hi) {
+    return or256(pshufb_m256(dup_mask[0], lo), pshufb_m256(dup_mask[1], hi));
+}
+
+template <int NMSK>
+static really_inline
+m256 prep_conf_teddy_no_reinforcement_256_templ(const m256 *lo_mask,
+                                         const m256 *dup_mask,
+                                         const m256 val) {
+    m256 lo = and256(val, *lo_mask);
+    m256 hi = and256(rshift64_m256(val, 4), *lo_mask);
+    return shift_or_256_templ<NMSK>(dup_mask, lo, hi);
+}
+
+template <int NMSK>
+static really_inline
+m256 prep_conf_teddy_256_templ(const m256 *lo_mask, const m256 *dup_mask,
+                        const u8 *ptr, const u64a *r_msk_base,
+                        u32 *c_0, u32 *c_128) {
+    m256 lo = and256(load256(ptr), *lo_mask);
+    m256 hi = and256(rshift64_m256(load256(ptr), 4), *lo_mask);
+    *c_128 = *(ptr + 15);
+    m256 r_msk = set4x64(0ULL, r_msk_base[*c_128], 0ULL, r_msk_base[*c_0]);
+    *c_0 = *(ptr + 31);
+    return or256(shift_or_256_templ<NMSK>(dup_mask, lo, hi), r_msk);
+}
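
The c_0/c_128 out-parameters carry block-boundary bytes between calls: each call consumes the byte recorded by the previous one (as an index into the reinforcement table) and stores this block's last byte for the next. A scalar model of just that carry, using the same 0x100 "no previous block" sentinel:

    #include <cstdio>

    // Each call returns the boundary byte carried in from the previous block
    // and records this block's last byte for the next call.
    static unsigned process_block(const unsigned char *ptr, unsigned *c_0) {
        unsigned prev = *c_0;
        *c_0 = ptr[31];
        return prev;
    }

    int main() {
        unsigned char buf[64];
        for (int i = 0; i < 64; ++i) buf[i] = (unsigned char)i;
        unsigned c_0 = 0x100;                                // sentinel row
        std::printf("%u\n", process_block(buf, &c_0));       // 256: sentinel
        std::printf("%u\n", process_block(buf + 32, &c_0));  // 31: carried byte
        return 0;
    }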
+
+#define PREP_CONF_FN_256_NO_REINFORCEMENT(val, n)                                 \
+    prep_conf_teddy_no_reinforcement_256_templ<n>(&lo_mask, dup_mask, val)
+
+#define PREP_CONF_FN_256(ptr, n)                                                  \
+    prep_conf_teddy_256_templ<n>(&lo_mask, dup_mask, ptr, r_msk_base, &c_0, &c_128)
+
+template <int NMSK>
+hwlm_error_t fdr_exec_teddy_256_templ(const struct FDR *fdr,
+                                  const struct FDR_Runtime_Args *a,
+                                  hwlm_group_t control) {
+    const u8 *buf_end = a->buf + a->len;
+    const u8 *ptr = a->buf + a->start_offset;
+    u32 floodBackoff = FLOOD_BACKOFF_START;
+    const u8 *tryFloodDetect = a->firstFloodDetect;
+    u32 last_match = ones_u32;
+    const struct Teddy *teddy = (const struct Teddy *)fdr;
+    const size_t iterBytes = 64;
+    DEBUG_PRINTF("params: buf %p len %zu start_offset %zu\n",
+                 a->buf, a->len, a->start_offset);
+
+    const m128 *maskBase = getMaskBase(teddy);
+
+    m256 lo_mask = set1_32x8(0xf);
+    m256 dup_mask[NMSK * 2];
+    dup_mask[0] = set1_2x128(maskBase[0]);
+    dup_mask[1] = set1_2x128(maskBase[1]);
+    if constexpr (NMSK > 1) {
+        dup_mask[2] = set1_2x128(maskBase[2]);
+        dup_mask[3] = set1_2x128(maskBase[3]);
+    }
+    if constexpr (NMSK > 2) {
+        dup_mask[4] = set1_2x128(maskBase[4]);
+        dup_mask[5] = set1_2x128(maskBase[5]);
+    }
+    if constexpr (NMSK > 3) {
+        dup_mask[6] = set1_2x128(maskBase[6]);
+        dup_mask[7] = set1_2x128(maskBase[7]);
+    }
+    const u32 *confBase = getConfBase(teddy);
+
+    const u64a *r_msk_base = getReinforcedMaskBase(teddy, NMSK);
+    u32 c_0 = 0x100;
+    u32 c_128 = 0x100;
+    const u8 *mainStart = ROUNDUP_PTR(ptr, 32);
+    DEBUG_PRINTF("derive: ptr: %p mainstart %p\n", ptr, mainStart);
+    if (ptr < mainStart) {
+        ptr = mainStart - 32;
+        m256 p_mask;
+        m256 val_0 = vectoredLoad256(&p_mask, ptr, a->start_offset,
+                                     a->buf, buf_end,
+                                     a->buf_history, a->len_history, NMSK);
+        m256 r_0 = PREP_CONF_FN_256_NO_REINFORCEMENT(val_0, NMSK);
+        r_0 = or256(r_0, p_mask);
+        CONFIRM_TEDDY_256(r_0, 8, 0, VECTORING, ptr);
+        ptr += 32;
+    }
+
+    if (ptr + 32 <= buf_end) {
+        m256 r_0 = PREP_CONF_FN_256(ptr, NMSK);
+        CONFIRM_TEDDY_256(r_0, 8, 0, VECTORING, ptr);
+        ptr += 32;
+    }
+
+    for (; ptr + iterBytes <= buf_end; ptr += iterBytes) {
+        __builtin_prefetch(ptr + (iterBytes * 4));
+        CHECK_FLOOD;
+        m256 r_0 = PREP_CONF_FN_256(ptr, NMSK);
+        CONFIRM_TEDDY_256(r_0, 8, 0, NOT_CAUTIOUS, ptr);
+        m256 r_1 = PREP_CONF_FN_256(ptr + 32, NMSK);
+        CONFIRM_TEDDY_256(r_1, 8, 32, NOT_CAUTIOUS, ptr);
+    }
+
+    if (ptr + 32 <= buf_end) {
+        m256 r_0 = PREP_CONF_FN_256(ptr, NMSK);
+        CONFIRM_TEDDY_256(r_0, 8, 0, NOT_CAUTIOUS, ptr);
+        ptr += 32;
+    }
+
+    assert(ptr + 32 > buf_end);
+    if (ptr < buf_end) {
+        m256 p_mask;
+        m256 val_0 = vectoredLoad256(&p_mask, ptr, 0, ptr, buf_end,
+                                     a->buf_history, a->len_history, NMSK);
+        m256 r_0 = PREP_CONF_FN_256_NO_REINFORCEMENT(val_0, NMSK);
+        r_0 = or256(r_0, p_mask);
+        CONFIRM_TEDDY_256(r_0, 8, 0, VECTORING, ptr);
+    }
+
+    return HWLM_SUCCESS;
+}
+
+#define FDR_EXEC_TEDDY_FN fdr_exec_teddy_256_templ
+
+#else // not defined HAVE_AVX2
+
+#ifdef ARCH_64_BIT
+static really_inline
+hwlm_error_t confirm_teddy_64_128(m128 var, u8 bucket, u8 offset,
+                                  CautionReason reason, const u8 *ptr,
+                                  const struct FDR_Runtime_Args *a,
+                                  const u32* confBase, hwlm_group_t *control,
+                                  u32 *last_match) {
+    if (unlikely(diff128(var, ones128()))) {
+        u64a __attribute__((aligned(16))) vec[2];
+        store128(vec, var);
+        u64a lo = vec[0];
+        u64a hi = vec[1];
+        CONF_CHUNK_64(lo, bucket, offset, reason, ptr, confBase, a, control, last_match);
+        CONF_CHUNK_64(hi, bucket, offset + 8, reason, ptr, confBase, a, control, last_match);
+    }
+    return HWLM_SUCCESS;
+}
+
+#define confirm_teddy_128_f confirm_teddy_64_128
+
+#else // 32/64
+
+static really_inline
+hwlm_error_t confirm_teddy_32_128(m128 var, u8 bucket, u8 offset,
+                                  CautionReason reason, const u8 *ptr,
+                                  const struct FDR_Runtime_Args *a,
+                                  const u32* confBase, hwlm_group_t *control,
+                                  u32 *last_match) {
+    if (unlikely(diff128(var, ones128()))) {
+        u32 part1 = movd(var);
+        u32 part2 = movd(rshiftbyte_m128(var, 4));
+        u32 part3 = movd(rshiftbyte_m128(var, 8));
+        u32 part4 = movd(rshiftbyte_m128(var, 12));
+        CONF_CHUNK_32(part1, bucket, offset, reason, ptr, confBase, a, control, last_match);
+        CONF_CHUNK_32(part2, bucket, offset + 4, reason, ptr, confBase, a, control, last_match);
+        CONF_CHUNK_32(part3, bucket, offset + 8, reason, ptr, confBase, a, control, last_match);
+        CONF_CHUNK_32(part4, bucket, offset + 12, reason, ptr, confBase, a, control, last_match);
+    }
+    return HWLM_SUCCESS;
+}
+#define confirm_teddy_128_f confirm_teddy_32_128
+
+#endif  // 32/64
+
+
+#define CONFIRM_TEDDY_128(...)                                                \
+    if (confirm_teddy_128_f(__VA_ARGS__, a, confBase, &control,               \
+                            &last_match) == HWLM_TERMINATED)                  \
+        return HWLM_TERMINATED;
+
+template <int NMSK>
+static really_inline
+m128 prep_conf_teddy_128_templ(const m128 *maskBase, m128 val) {
+    m128 mask = set1_16x8(0xf);
+    m128 lo = and128(val, mask);
+    m128 hi = and128(rshift64_m128(val, 4), mask);
+    m128 r1 = or128(pshufb_m128(maskBase[0 * 2], lo),
+                             pshufb_m128(maskBase[0 * 2 + 1], hi));
+    if constexpr (NMSK == 1) return r1;
+    m128 res_1 = or128(pshufb_m128(maskBase[1 * 2], lo),
+                       pshufb_m128(maskBase[1 * 2 + 1], hi));
+
+    m128 old_1 = zeroes128();
+    m128 res_shifted_1 = palignr(res_1, old_1, 16 - 1);
+    m128 r2 = or128(r1, res_shifted_1);
+    if constexpr (NMSK == 2) return r2;
+    m128 res_2 = or128(pshufb_m128(maskBase[2 * 2], lo),
+                       pshufb_m128(maskBase[2 * 2 + 1], hi));
+    m128 res_shifted_2 = palignr(res_2, old_1, 16 - 2);
+    m128 r3 = or128(r2, res_shifted_2);
+    if constexpr (NMSK == 3) return r3;
+    m128 res_3 = or128(pshufb_m128(maskBase[3 * 2], lo),
+                       pshufb_m128(maskBase[3 * 2 + 1], hi));
+    m128 res_shifted_3 = palignr(res_3, old_1, 16 - 3);
+    return or128(r3, res_shifted_3);
+}
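
The palignr calls stagger each mask's result by k bytes before OR-ing; note that this version shifts in bytes from a zero vector, where the deleted code carried the previous block's result through old_1/old_2/old_3. A scalar model of palignr itself, for reference:

    #include <array>
    #include <cstdio>

    // palignr(a, b, n): take 16 consecutive bytes of the 32-byte
    // concatenation [b | a], starting at byte n. With n = 16 - k, the result
    // is a's bytes moved up by k positions with b's top k bytes shifted in.
    using V16 = std::array<unsigned char, 16>;

    static V16 palignr_model(const V16 &a, const V16 &b, int n) {
        unsigned char cat[32];
        for (int i = 0; i < 16; ++i) { cat[i] = b[i]; cat[16 + i] = a[i]; }
        V16 out{};
        for (int i = 0; i < 16; ++i) out[i] = cat[n + i];
        return out;
    }

    int main() {
        V16 a{}, b{};
        for (int i = 0; i < 16; ++i) { a[i] = (unsigned char)(i + 1); b[i] = 0; }
        V16 r = palignr_model(a, b, 16 - 1);   // stagger a by one byte
        for (unsigned char c : r) std::printf("%d ", c);
        std::printf("\n");                     // 0 1 2 ... 15
        return 0;
    }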
+
+template <int NMSK>
+hwlm_error_t fdr_exec_teddy_128_templ(const struct FDR *fdr,
+                             const struct FDR_Runtime_Args *a,
+                             hwlm_group_t control) {
+    const u8 *buf_end = a->buf + a->len;
+    const u8 *ptr = a->buf + a->start_offset;
+    u32 floodBackoff = FLOOD_BACKOFF_START;
+    const u8 *tryFloodDetect = a->firstFloodDetect;
+    u32 last_match = ones_u32;
+    const struct Teddy *teddy = reinterpret_cast<const struct Teddy *>(fdr);
+    const size_t iterBytes = 32;
+    DEBUG_PRINTF("params: buf %p len %zu start_offset %zu\n",
+                 a->buf, a->len, a->start_offset);
+
+    const m128 *maskBase = getMaskBase(teddy);
+    const u32 *confBase = getConfBase(teddy);
+
+    const u8 *mainStart = ROUNDUP_PTR(ptr, 16);
+    DEBUG_PRINTF("derive: ptr: %p mainstart %p\n", ptr, mainStart);
+    if (ptr < mainStart) {
+        ptr = mainStart - 16;
+        m128 p_mask;
+        m128 val_0 = vectoredLoad128(&p_mask, ptr, a->start_offset,
+                                     a->buf, buf_end,
+                                     a->buf_history, a->len_history, NMSK);
+        m128 r_0 = prep_conf_teddy_128_templ<NMSK>(maskBase, val_0);
+        r_0 = or128(r_0, p_mask);
+        CONFIRM_TEDDY_128(r_0, 8, 0, VECTORING, ptr);
+        ptr += 16;
+    }
+
+    if (ptr + 16 <= buf_end) {
+        m128 r_0 = prep_conf_teddy_128_templ<NMSK>(maskBase, load128(ptr));
+        CONFIRM_TEDDY_128(r_0, 8, 0, VECTORING, ptr);
+        ptr += 16;
+    }
+
+    for (; ptr + iterBytes <= buf_end; ptr += iterBytes) {
+        __builtin_prefetch(ptr + (iterBytes * 4));
+        CHECK_FLOOD;
+        m128 r_0 = prep_conf_teddy_128_templ<NMSK>(maskBase, load128(ptr));
+        CONFIRM_TEDDY_128(r_0, 8, 0, NOT_CAUTIOUS, ptr);
+        m128 r_1 = prep_conf_teddy_128_templ<NMSK>(maskBase, load128(ptr + 16));
+        CONFIRM_TEDDY_128(r_1, 8, 16, NOT_CAUTIOUS, ptr);
+    }
+
+    if (ptr + 16 <= buf_end) {
+        m128 r_0 = prep_conf_teddy_128_templ<NMSK>(maskBase, load128(ptr));
+        CONFIRM_TEDDY_128(r_0, 8, 0, NOT_CAUTIOUS, ptr);
+        ptr += 16;
+    }
+
+    assert(ptr + 16 > buf_end);
+    if (ptr < buf_end) {
+        m128 p_mask;
+        m128 val_0 = vectoredLoad128(&p_mask, ptr, 0, ptr, buf_end,
+                                     a->buf_history, a->len_history, NMSK);
+        m128 r_0 = prep_conf_teddy_128_templ<NMSK>(maskBase, val_0);
+        r_0 = or128(r_0, p_mask);
+        CONFIRM_TEDDY_128(r_0, 8, 0, VECTORING, ptr);
+    }
+
+    return HWLM_SUCCESS;
+}
+
+#define FDR_EXEC_TEDDY_FN fdr_exec_teddy_128_templ
+
+
+#endif // HAVE_AVX2 HAVE_AVX512
+
+
+
+extern "C" {
+
+hwlm_error_t fdr_exec_teddy_msks1(const struct FDR *fdr,
+                                  const struct FDR_Runtime_Args *a,
+                                  hwlm_group_t control) {
+    return FDR_EXEC_TEDDY_FN<1>(fdr, a, control);
+}
+
+hwlm_error_t fdr_exec_teddy_msks1_pck(const struct FDR *fdr,
+                                      const struct FDR_Runtime_Args *a,
+                                      hwlm_group_t control) {
+    return FDR_EXEC_TEDDY_FN<1>(fdr, a, control);
+}
+
+hwlm_error_t fdr_exec_teddy_msks2(const struct FDR *fdr,
+                                  const struct FDR_Runtime_Args *a,
+                                  hwlm_group_t control) {
+    return FDR_EXEC_TEDDY_FN<2>(fdr, a, control);
+}
+
+hwlm_error_t fdr_exec_teddy_msks2_pck(const struct FDR *fdr,
+                                      const struct FDR_Runtime_Args *a,
+                                      hwlm_group_t control) {
+    return FDR_EXEC_TEDDY_FN<2>(fdr, a, control);
+}
+
+hwlm_error_t fdr_exec_teddy_msks3(const struct FDR *fdr,
+                                  const struct FDR_Runtime_Args *a,
+                                  hwlm_group_t control) {
+    return FDR_EXEC_TEDDY_FN<3>(fdr, a, control);
+}
+
+hwlm_error_t fdr_exec_teddy_msks3_pck(const struct FDR *fdr,
+                                      const struct FDR_Runtime_Args *a,
+                                      hwlm_group_t control) {
+    return FDR_EXEC_TEDDY_FN<3>(fdr, a, control);
+}
+
+hwlm_error_t fdr_exec_teddy_msks4(const struct FDR *fdr,
+                                  const struct FDR_Runtime_Args *a,
+                                  hwlm_group_t control) {
+    return FDR_EXEC_TEDDY_FN<4>(fdr, a, control);
+}
+
+hwlm_error_t fdr_exec_teddy_msks4_pck(const struct FDR *fdr,
+                                      const struct FDR_Runtime_Args *a,
+                                      hwlm_group_t control) {
+    return FDR_EXEC_TEDDY_FN<4>(fdr, a, control);
+}
+
+} // extern "C"
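
The extern "C" block pins the eight symbol names that the C side of FDR links against, while each body just selects one template instantiation. The dispatch pattern in miniature, names illustrative:

    #include <cstdio>

    // One template carries the whole implementation...
    template <int NMSK>
    static int exec_templ(int len) {
        return len * NMSK;   // stand-in for the real scanner body
    }

    // ...and fixed C-linkage wrappers expose the instantiations that C code
    // (which cannot name a template) calls through function pointers.
    extern "C" {

    int exec_msks1(int len) { return exec_templ<1>(len); }
    int exec_msks2(int len) { return exec_templ<2>(len); }

    } // extern "C"

    int main() {
        std::printf("%d %d\n", exec_msks1(10), exec_msks2(10)); // 10 20
        return 0;
    }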
+
index 40ae075626e0e34b8dea91e3c8ab86cb1270d6e5..f5e258d15b7c3c3909bd27a266a9a3e2d4573090 100644 (file)
@@ -1,5 +1,6 @@
 /*
  * Copyright (c) 2016-2017, Intel Corporation
+ * Copyright (c) 2024, VectorCamp PC
  *
  * Redistribution and use in source and binary forms, with or without
  * modification, are permitted provided that the following conditions are met:
 struct FDR; // forward declaration from fdr_internal.h
 struct FDR_Runtime_Args;
 
+#ifdef __cplusplus
+extern "C" {
+#endif
+
 hwlm_error_t fdr_exec_teddy_msks1(const struct FDR *fdr,
                                   const struct FDR_Runtime_Args *a,
                                   hwlm_group_t control);
@@ -106,5 +111,8 @@ hwlm_error_t fdr_exec_fat_teddy_msks4_pck(const struct FDR *fdr,
                                           hwlm_group_t control);
 
 #endif /* HAVE_AVX2 */
+#ifdef __cplusplus
+}
+#endif
 
 #endif /* TEDDY_H_ */
diff --git a/src/fdr/teddy_avx2.c b/src/fdr/teddy_avx2.c
deleted file mode 100644 (file)
index e17e787..0000000
+++ /dev/null
@@ -1,709 +0,0 @@
-/*
- * Copyright (c) 2016-2020, Intel Corporation
- *
- * Redistribution and use in source and binary forms, with or without
- * modification, are permitted provided that the following conditions are met:
- *
- *  * Redistributions of source code must retain the above copyright notice,
- *    this list of conditions and the following disclaimer.
- *  * Redistributions in binary form must reproduce the above copyright
- *    notice, this list of conditions and the following disclaimer in the
- *    documentation and/or other materials provided with the distribution.
- *  * Neither the name of Intel Corporation nor the names of its contributors
- *    may be used to endorse or promote products derived from this software
- *    without specific prior written permission.
- *
- * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
- * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
- * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
- * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
- * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
- * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
- * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
- * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
- * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
- * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
- * POSSIBILITY OF SUCH DAMAGE.
- */
-
-/** \file
- * \brief Teddy literal matcher: AVX2 engine runtime.
- */
-
-#include "fdr_internal.h"
-#include "flood_runtime.h"
-#include "teddy.h"
-#include "teddy_internal.h"
-#include "teddy_runtime_common.h"
-#include "util/arch.h"
-#include "util/simd_utils.h"
-
-#if defined(HAVE_AVX2)
-
-const u8 ALIGN_AVX_DIRECTIVE p_mask_arr256[33][64] = {
-    {0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
-     0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff},
-    {0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
-     0x00, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff},
-    {0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
-     0x00, 0x00, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff},
-    {0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
-     0x00, 0x00, 0x00, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff},
-    {0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
-     0x00, 0x00, 0x00, 0x00, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff},
-    {0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
-     0x00, 0x00, 0x00, 0x00, 0x00, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff},
-    {0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
-     0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff},
-    {0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
-     0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff},
-    {0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
-     0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff},
-    {0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
-     0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff},
-    {0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
-     0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff},
-    {0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
-     0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff},
-    {0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
-     0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff},
-    {0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
-     0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff},
-    {0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
-     0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff},
-    {0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
-     0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff},
-    {0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
-     0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff},
-    {0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
-     0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff},
-    {0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
-     0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff},
-    {0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
-     0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff},
-    {0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
-     0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff},
-    {0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
-     0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff},
-    {0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
-     0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff},
-    {0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
-     0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff},
-    {0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
-     0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff},
-    {0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
-     0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff},
-    {0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
-     0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff},
-    {0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
-     0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xff, 0xff, 0xff, 0xff, 0xff},
-    {0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
-     0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xff, 0xff, 0xff, 0xff},
-    {0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
-     0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xff, 0xff, 0xff},
-    {0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
-     0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xff, 0xff},
-    {0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
-     0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xff},
-    {0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
-     0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00}
-};
-
-#if defined(HAVE_AVX512VBMI) // VBMI strong fat teddy
-
-#define CONF_FAT_CHUNK_64(chunk, bucket, off, reason, pt, conf_fn)          \
-do {                                                                        \
-    if (unlikely(chunk != ones_u64a)) {                                     \
-        chunk = ~chunk;                                                     \
-        conf_fn(&chunk, bucket, off, confBase, reason, a, pt,               \
-                &control, &last_match);                                     \
-        CHECK_HWLM_TERMINATE_MATCHING;                                      \
-    }                                                                       \
-} while(0)
-
-#define CONF_FAT_CHUNK_32(chunk, bucket, off, reason, pt, conf_fn)          \
-do {                                                                        \
-    if (unlikely(chunk != ones_u32)) {                                      \
-        chunk = ~chunk;                                                     \
-        conf_fn(&chunk, bucket, off, confBase, reason, a, pt,               \
-                &control, &last_match);                                     \
-        CHECK_HWLM_TERMINATE_MATCHING;                                      \
-    }                                                                       \
-} while(0)
-
-static really_inline
-const m512 *getDupMaskBase(const struct Teddy *teddy, u8 numMask) {
-    return (const m512 *)((const u8 *)teddy + ROUNDUP_CL(sizeof(struct Teddy))
-                          + ROUNDUP_CL(2 * numMask * sizeof(m256)));
-}
-
-#else
-
-#define CONF_FAT_CHUNK_64(chunk, bucket, off, reason, conf_fn)              \
-do {                                                                        \
-    if (unlikely(chunk != ones_u64a)) {                                     \
-        chunk = ~chunk;                                                     \
-        conf_fn(&chunk, bucket, off, confBase, reason, a, ptr,              \
-                &control, &last_match);                                     \
-        CHECK_HWLM_TERMINATE_MATCHING;                                      \
-    }                                                                       \
-} while(0)
-
-#define CONF_FAT_CHUNK_32(chunk, bucket, off, reason, conf_fn)              \
-do {                                                                        \
-    if (unlikely(chunk != ones_u32)) {                                      \
-        chunk = ~chunk;                                                     \
-        conf_fn(&chunk, bucket, off, confBase, reason, a, ptr,              \
-                &control, &last_match);                                     \
-        CHECK_HWLM_TERMINATE_MATCHING;                                      \
-    }                                                                       \
-} while(0)
-
-static really_inline
-const m256 *getMaskBase_fat(const struct Teddy *teddy) {
-    return (const m256 *)((const u8 *)teddy + ROUNDUP_CL(sizeof(struct Teddy)));
-}
-
-#endif
-
-#if defined(HAVE_AVX512VBMI) // VBMI strong fat teddy
-
-const u8 ALIGN_AVX_DIRECTIVE p_mask_interleave[64] = {
-    0, 32, 1, 33, 2, 34, 3, 35, 4, 36, 5, 37, 6, 38, 7, 39,
-    8, 40, 9, 41, 10, 42, 11, 43, 12, 44, 13, 45, 14, 46, 15, 47,
-    16, 48, 17, 49, 18, 50, 19, 51, 20, 52, 21, 53, 22, 54, 23, 55,
-    24, 56, 25, 57, 26, 58, 27, 59, 28, 60, 29, 61, 30, 62, 31, 63
-};
-
-#ifdef ARCH_64_BIT
-#define CONFIRM_FAT_TEDDY(var, bucket, offset, reason, pt, conf_fn)         \
-do {                                                                        \
-    if (unlikely(diff512(var, ones512()))) {                                \
-        m512 msk_interleave = load512(p_mask_interleave);                   \
-        m512 r = vpermb512(msk_interleave, var);                            \
-        m128 r0 = extract128from512(r, 0);                                  \
-        m128 r1 = extract128from512(r, 1);                                  \
-        m128 r2 = extract128from512(r, 2);                                  \
-        m128 r3 = extract128from512(r, 3);                                  \
-        u64a part1 = movq(r0);                                              \
-        u64a part2 = extract64from128(r0, 1);                               \
-        u64a part3 = movq(r1);                                              \
-        u64a part4 = extract64from128(r1, 1);                               \
-        u64a part5 = movq(r2);                                              \
-        u64a part6 = extract64from128(r2, 1);                               \
-        u64a part7 = movq(r3);                                              \
-        u64a part8 = extract64from128(r3, 1);                               \
-        CONF_FAT_CHUNK_64(part1, bucket, offset, reason, pt, conf_fn);      \
-        CONF_FAT_CHUNK_64(part2, bucket, offset + 4, reason, pt, conf_fn);  \
-        CONF_FAT_CHUNK_64(part3, bucket, offset + 8, reason, pt, conf_fn);  \
-        CONF_FAT_CHUNK_64(part4, bucket, offset + 12, reason, pt, conf_fn); \
-        CONF_FAT_CHUNK_64(part5, bucket, offset + 16, reason, pt, conf_fn); \
-        CONF_FAT_CHUNK_64(part6, bucket, offset + 20, reason, pt, conf_fn); \
-        CONF_FAT_CHUNK_64(part7, bucket, offset + 24, reason, pt, conf_fn); \
-        CONF_FAT_CHUNK_64(part8, bucket, offset + 28, reason, pt, conf_fn); \
-    }                                                                       \
-} while(0)
-#else
-#define CONFIRM_FAT_TEDDY(var, bucket, offset, reason, pt, conf_fn)         \
-do {                                                                        \
-    if (unlikely(diff512(var, ones512()))) {                                \
-        m512 msk_interleave = load512(p_mask_interleave);                   \
-        m512 r = vpermb512(msk_interleave, var);                            \
-        m128 r0 = extract128from512(r, 0);                                  \
-        m128 r1 = extract128from512(r, 1);                                  \
-        m128 r2 = extract128from512(r, 2);                                  \
-        m128 r3 = extract128from512(r, 3);                                  \
-        u32 part1 = movd(r0);                                               \
-        u32 part2 = extract32from128(r0, 1);                                \
-        u32 part3 = extract32from128(r0, 2);                                \
-        u32 part4 = extract32from128(r0, 3);                                \
-        u32 part5 = movd(r1);                                               \
-        u32 part6 = extract32from128(r1, 1);                                \
-        u32 part7 = extract32from128(r1, 2);                                \
-        u32 part8 = extract32from128(r1, 3);                                \
-        u32 part9 = movd(r2);                                               \
-        u32 part10 = extract32from128(r2, 1);                               \
-        u32 part11 = extract32from128(r2, 2);                               \
-        u32 part12 = extract32from128(r2, 3);                               \
-        u32 part13 = movd(r3);                                              \
-        u32 part14 = extract32from128(r3, 1);                               \
-        u32 part15 = extract32from128(r3, 2);                               \
-        u32 part16 = extract32from128(r3, 3);                               \
-        CONF_FAT_CHUNK_32(part1, bucket, offset, reason, pt, conf_fn);      \
-        CONF_FAT_CHUNK_32(part2, bucket, offset + 2, reason, pt, conf_fn);  \
-        CONF_FAT_CHUNK_32(part3, bucket, offset + 4, reason, pt, conf_fn);  \
-        CONF_FAT_CHUNK_32(part4, bucket, offset + 6, reason, pt, conf_fn);  \
-        CONF_FAT_CHUNK_32(part5, bucket, offset + 8, reason, pt, conf_fn);  \
-        CONF_FAT_CHUNK_32(part6, bucket, offset + 10, reason, pt, conf_fn); \
-        CONF_FAT_CHUNK_32(part7, bucket, offset + 12, reason, pt, conf_fn); \
-        CONF_FAT_CHUNK_32(part8, bucket, offset + 14, reason, pt, conf_fn); \
-        CONF_FAT_CHUNK_32(part9, bucket, offset + 16, reason, pt, conf_fn); \
-        CONF_FAT_CHUNK_32(part10, bucket, offset + 18, reason, pt, conf_fn);\
-        CONF_FAT_CHUNK_32(part11, bucket, offset + 20, reason, pt, conf_fn);\
-        CONF_FAT_CHUNK_32(part12, bucket, offset + 22, reason, pt, conf_fn);\
-        CONF_FAT_CHUNK_32(part13, bucket, offset + 24, reason, pt, conf_fn);\
-        CONF_FAT_CHUNK_32(part14, bucket, offset + 26, reason, pt, conf_fn);\
-        CONF_FAT_CHUNK_32(part15, bucket, offset + 28, reason, pt, conf_fn);\
-        CONF_FAT_CHUNK_32(part16, bucket, offset + 30, reason, pt, conf_fn);\
-    }                                                                       \
-} while(0)
-#endif
-
-#define PREP_FAT_SHUF_MASK                                                  \
-    m512 lo = and512(val, *lo_mask);                                        \
-    m512 hi = and512(rshift64_m512(val, 4), *lo_mask)
-
-#define FAT_TEDDY_VBMI_PSHUFB_OR_M1                          \
-    m512 shuf_or_b0 = or512(pshufb_m512(dup_mask[0], lo),    \
-                            pshufb_m512(dup_mask[1], hi));
-
-#define FAT_TEDDY_VBMI_PSHUFB_OR_M2                          \
-    FAT_TEDDY_VBMI_PSHUFB_OR_M1                              \
-    m512 shuf_or_b1 = or512(pshufb_m512(dup_mask[2], lo),    \
-                            pshufb_m512(dup_mask[3], hi));
-
-#define FAT_TEDDY_VBMI_PSHUFB_OR_M3                          \
-    FAT_TEDDY_VBMI_PSHUFB_OR_M2                              \
-    m512 shuf_or_b2 = or512(pshufb_m512(dup_mask[4], lo),    \
-                            pshufb_m512(dup_mask[5], hi));
-
-#define FAT_TEDDY_VBMI_PSHUFB_OR_M4                          \
-    FAT_TEDDY_VBMI_PSHUFB_OR_M3                              \
-    m512 shuf_or_b3 = or512(pshufb_m512(dup_mask[6], lo),    \
-                            pshufb_m512(dup_mask[7], hi));
-
-#define FAT_TEDDY_VBMI_SL1_MASK   0xfffffffefffffffeULL
-#define FAT_TEDDY_VBMI_SL2_MASK   0xfffffffcfffffffcULL
-#define FAT_TEDDY_VBMI_SL3_MASK   0xfffffff8fffffff8ULL
-
-#define FAT_TEDDY_VBMI_SHIFT_M1
-
-#define FAT_TEDDY_VBMI_SHIFT_M2                      \
-    FAT_TEDDY_VBMI_SHIFT_M1                          \
-    m512 sl1 = maskz_vpermb512(FAT_TEDDY_VBMI_SL1_MASK, sl_msk[0], shuf_or_b1);
-
-#define FAT_TEDDY_VBMI_SHIFT_M3                      \
-    FAT_TEDDY_VBMI_SHIFT_M2                          \
-    m512 sl2 = maskz_vpermb512(FAT_TEDDY_VBMI_SL2_MASK, sl_msk[1], shuf_or_b2);
-
-#define FAT_TEDDY_VBMI_SHIFT_M4                      \
-    FAT_TEDDY_VBMI_SHIFT_M3                          \
-    m512 sl3 = maskz_vpermb512(FAT_TEDDY_VBMI_SL3_MASK, sl_msk[2], shuf_or_b3);
-
-#define FAT_SHIFT_OR_M1            \
-    shuf_or_b0
-
-#define FAT_SHIFT_OR_M2            \
-    or512(sl1, FAT_SHIFT_OR_M1)
-
-#define FAT_SHIFT_OR_M3            \
-    or512(sl2, FAT_SHIFT_OR_M2)
-
-#define FAT_SHIFT_OR_M4            \
-    or512(sl3, FAT_SHIFT_OR_M3)
-
-static really_inline
-m512 prep_conf_fat_teddy_m1(const m512 *lo_mask, const m512 *dup_mask,
-                            UNUSED const m512 *sl_msk, const m512 val) {
-    PREP_FAT_SHUF_MASK;
-    FAT_TEDDY_VBMI_PSHUFB_OR_M1;
-    FAT_TEDDY_VBMI_SHIFT_M1;
-    return FAT_SHIFT_OR_M1;
-}
-
-static really_inline
-m512 prep_conf_fat_teddy_m2(const m512 *lo_mask, const m512 *dup_mask,
-                            const m512 *sl_msk, const m512 val) {
-    PREP_FAT_SHUF_MASK;
-    FAT_TEDDY_VBMI_PSHUFB_OR_M2;
-    FAT_TEDDY_VBMI_SHIFT_M2;
-    return FAT_SHIFT_OR_M2;
-}
-
-static really_inline
-m512 prep_conf_fat_teddy_m3(const m512 *lo_mask, const m512 *dup_mask,
-                            const m512 *sl_msk, const m512 val) {
-    PREP_FAT_SHUF_MASK;
-    FAT_TEDDY_VBMI_PSHUFB_OR_M3;
-    FAT_TEDDY_VBMI_SHIFT_M3;
-    return FAT_SHIFT_OR_M3;
-}
-
-static really_inline
-m512 prep_conf_fat_teddy_m4(const m512 *lo_mask, const m512 *dup_mask,
-                            const m512 *sl_msk, const m512 val) {
-    PREP_FAT_SHUF_MASK;
-    FAT_TEDDY_VBMI_PSHUFB_OR_M4;
-    FAT_TEDDY_VBMI_SHIFT_M4;
-    return FAT_SHIFT_OR_M4;
-}
-
-#define PREP_CONF_FAT_FN(val, n)    \
-    prep_conf_fat_teddy_m##n(&lo_mask, dup_mask, sl_msk, val)
-
-#define FAT_TEDDY_VBMI_SL1_POS    15
-#define FAT_TEDDY_VBMI_SL2_POS    14
-#define FAT_TEDDY_VBMI_SL3_POS    13
-
-#define FAT_TEDDY_VBMI_LOAD_SHIFT_MASK_M1
-
-#define FAT_TEDDY_VBMI_LOAD_SHIFT_MASK_M2    \
-    FAT_TEDDY_VBMI_LOAD_SHIFT_MASK_M1        \
-    sl_msk[0] = loadu512(p_sh_mask_arr + FAT_TEDDY_VBMI_SL1_POS);
-
-#define FAT_TEDDY_VBMI_LOAD_SHIFT_MASK_M3    \
-    FAT_TEDDY_VBMI_LOAD_SHIFT_MASK_M2        \
-    sl_msk[1] = loadu512(p_sh_mask_arr + FAT_TEDDY_VBMI_SL2_POS);
-
-#define FAT_TEDDY_VBMI_LOAD_SHIFT_MASK_M4    \
-    FAT_TEDDY_VBMI_LOAD_SHIFT_MASK_M3        \
-    sl_msk[2] = loadu512(p_sh_mask_arr + FAT_TEDDY_VBMI_SL3_POS);
-
-/*
- * In FAT teddy, it needs 2 bytes to represent result of each position,
- * so each nibble's(for example, lo nibble of last byte) FAT teddy mask
- * has 16x2 bytes:
- *   |----------------------------------|----------------------------------|
- *   16bytes (bucket 0..7 in each byte) 16bytes (bucket 8..15 in each byte)
- *                     A                                  B
- * at runtime FAT teddy reads 16 bytes once and duplicate them to 32 bytes:
- *   |----------------------------------|----------------------------------|
- *   16bytes input data (lo nibbles)    16bytes duplicated data (lo nibbles)
- *                     X                                  X
- * then do pshufb_m256(AB, XX).
- *
- * In AVX512 reinforced FAT teddy, it reads 32 bytes once and duplicate them
- * to 64 bytes:
- *   |----------------|----------------|----------------|----------------|
- *            X                Y                X                Y
- * in this case we need DUP_FAT_MASK to construct AABB:
- *   |----------------|----------------|----------------|----------------|
- *            A                A                B                B
- * then do pshufb_m512(AABB, XYXY).
- */
-
-#define PREPARE_FAT_MASKS(n)                                                  \
-    m512 lo_mask = set1_64x8(0xf);                                              \
-    m512 sl_msk[n - 1];                                                       \
-    FAT_TEDDY_VBMI_LOAD_SHIFT_MASK_M##n
-
-#define FAT_TEDDY_VBMI_CONF_MASK_HEAD   (0xffffffffULL >> n_sh)
-#define FAT_TEDDY_VBMI_CONF_MASK_FULL   ((0xffffffffULL << n_sh) & 0xffffffffULL)
-#define FAT_TEDDY_VBMI_CONF_MASK_VAR(n) (0xffffffffULL >> (32 - n) << overlap)
-#define FAT_TEDDY_VBMI_LOAD_MASK_PATCH  (0xffffffffULL >> (32 - n_sh))
-
-#define FDR_EXEC_FAT_TEDDY(fdr, a, control, n_msk, conf_fn)                   \
-do {                                                                          \
-    const u8 *buf_end = a->buf + a->len;                                      \
-    const u8 *ptr = a->buf + a->start_offset;                                 \
-    u32 floodBackoff = FLOOD_BACKOFF_START;                                   \
-    const u8 *tryFloodDetect = a->firstFloodDetect;                           \
-    u32 last_match = ones_u32;                                                \
-    const struct Teddy *teddy = (const struct Teddy *)fdr;                    \
-    const size_t iterBytes = 32;                                              \
-    u32 n_sh = n_msk - 1;                                                     \
-    const size_t loopBytes = 32 - n_sh;                                       \
-    DEBUG_PRINTF("params: buf %p len %zu start_offset %zu\n",                 \
-                 a->buf, a->len, a->start_offset);                            \
-                                                                              \
-    const m512 *dup_mask = getDupMaskBase(teddy, n_msk);                      \
-    PREPARE_FAT_MASKS(n_msk);                                                 \
-    const u32 *confBase = getConfBase(teddy);                                 \
-                                                                              \
-    u64a k = FAT_TEDDY_VBMI_CONF_MASK_FULL;                                   \
-    m512 p_mask = set_mask_m512(~((k << 32) | k));                            \
-    u32 overlap = 0;                                                          \
-    u64a patch = 0;                                                           \
-    if (likely(ptr + loopBytes <= buf_end)) {                                 \
-        u64a k0 = FAT_TEDDY_VBMI_CONF_MASK_HEAD;                              \
-        m512 p_mask0 = set_mask_m512(~((k0 << 32) | k0));                     \
-        m512 r_0 = PREP_CONF_FAT_FN(set2x256(loadu256(ptr)), n_msk);          \
-        r_0 = or512(r_0, p_mask0);                                            \
-        CONFIRM_FAT_TEDDY(r_0, 16, 0, VECTORING, ptr, conf_fn);               \
-        ptr += loopBytes;                                                     \
-        overlap = n_sh;                                                       \
-        patch = FAT_TEDDY_VBMI_LOAD_MASK_PATCH;                               \
-    }                                                                         \
-                                                                              \
-    for (; ptr + loopBytes <= buf_end; ptr += loopBytes) {                    \
-        CHECK_FLOOD;                                                          \
-        m512 r_0 = PREP_CONF_FAT_FN(set2x256(loadu256(ptr - n_sh)), n_msk);   \
-        r_0 = or512(r_0, p_mask);                                             \
-        CONFIRM_FAT_TEDDY(r_0, 16, 0, NOT_CAUTIOUS, ptr - n_sh, conf_fn);     \
-    }                                                                         \
-                                                                              \
-    assert(ptr + loopBytes > buf_end);                                        \
-    if (ptr < buf_end) {                                                      \
-        u32 left = (u32)(buf_end - ptr);                                      \
-        u64a k1 = FAT_TEDDY_VBMI_CONF_MASK_VAR(left);                         \
-        m512 p_mask1 = set_mask_m512(~((k1 << 32) | k1));                     \
-        m512 val_0 = set2x256(loadu_maskz_m256(k1 | patch, ptr - overlap));   \
-        m512 r_0 = PREP_CONF_FAT_FN(val_0, n_msk);                            \
-        r_0 = or512(r_0, p_mask1);                                            \
-        CONFIRM_FAT_TEDDY(r_0, 16, 0, VECTORING, ptr - overlap, conf_fn);     \
-    }                                                                         \
-                                                                              \
-    return HWLM_SUCCESS;                                                      \
-} while(0)
-
-#else // !HAVE_AVX512VBMI, AVX2 normal fat teddy
-
-#ifdef ARCH_64_BIT
-#define CONFIRM_FAT_TEDDY(var, bucket, offset, reason, conf_fn)             \
-do {                                                                        \
-    if (unlikely(diff256(var, ones256()))) {                                \
-        m256 swap = swap128in256(var);                                      \
-        m256 r = interleave256lo(var, swap);                                \
-        u64a part1 = extractlow64from256(r);                                \
-        u64a part2 = extract64from256(r, 1);                                \
-        r = interleave256hi(var, swap);                                     \
-        u64a part3 = extractlow64from256(r);                                \
-        u64a part4 = extract64from256(r, 1);                                \
-        CONF_FAT_CHUNK_64(part1, bucket, offset, reason, conf_fn);          \
-        CONF_FAT_CHUNK_64(part2, bucket, offset + 4, reason, conf_fn);      \
-        CONF_FAT_CHUNK_64(part3, bucket, offset + 8, reason, conf_fn);      \
-        CONF_FAT_CHUNK_64(part4, bucket, offset + 12, reason, conf_fn);     \
-    }                                                                       \
-} while(0)
-#else
-#define CONFIRM_FAT_TEDDY(var, bucket, offset, reason, conf_fn)             \
-do {                                                                        \
-    if (unlikely(diff256(var, ones256()))) {                                \
-        m256 swap = swap128in256(var);                                      \
-        m256 r = interleave256lo(var, swap);                                \
-        u32 part1 = extractlow32from256(r);                                 \
-        u32 part2 = extract32from256(r, 1);                                 \
-        u32 part3 = extract32from256(r, 2);                                 \
-        u32 part4 = extract32from256(r, 3);                                 \
-        r = interleave256hi(var, swap);                                     \
-        u32 part5 = extractlow32from256(r);                                 \
-        u32 part6 = extract32from256(r, 1);                                 \
-        u32 part7 = extract32from256(r, 2);                                 \
-        u32 part8 = extract32from256(r, 3);                                 \
-        CONF_FAT_CHUNK_32(part1, bucket, offset, reason, conf_fn);          \
-        CONF_FAT_CHUNK_32(part2, bucket, offset + 2, reason, conf_fn);      \
-        CONF_FAT_CHUNK_32(part3, bucket, offset + 4, reason, conf_fn);      \
-        CONF_FAT_CHUNK_32(part4, bucket, offset + 6, reason, conf_fn);      \
-        CONF_FAT_CHUNK_32(part5, bucket, offset + 8, reason, conf_fn);      \
-        CONF_FAT_CHUNK_32(part6, bucket, offset + 10, reason, conf_fn);     \
-        CONF_FAT_CHUNK_32(part7, bucket, offset + 12, reason, conf_fn);     \
-        CONF_FAT_CHUNK_32(part8, bucket, offset + 14, reason, conf_fn);     \
-    }                                                                       \
-} while(0)
-#endif
-
-static really_inline
-m256 vectoredLoad2x128(m256 *p_mask, const u8 *ptr, const size_t start_offset,
-                       const u8 *lo, const u8 *hi,
-                       const u8 *buf_history, size_t len_history,
-                       const u32 nMasks) {
-    m128 p_mask128;
-    m256 ret = set1_2x128(vectoredLoad128(&p_mask128, ptr, start_offset, lo, hi,
-                                        buf_history, len_history, nMasks));
-    *p_mask = set1_2x128(p_mask128);
-    return ret;
-}
-
-static really_inline
-m256 prep_conf_fat_teddy_m1(const m256 *maskBase, m256 val) {
-    m256 mask = set1_32x8(0xf);
-    m256 lo = and256(val, mask);
-    m256 hi = and256(rshift64_m256(val, 4), mask);
-    return or256(pshufb_m256(maskBase[0 * 2], lo),
-                 pshufb_m256(maskBase[0 * 2 + 1], hi));
-}
-
-static really_inline
-m256 prep_conf_fat_teddy_m2(const m256 *maskBase, m256 *old_1, m256 val) {
-    m256 mask = set1_32x8(0xf);
-    m256 lo = and256(val, mask);
-    m256 hi = and256(rshift64_m256(val, 4), mask);
-    m256 r = prep_conf_fat_teddy_m1(maskBase, val);
-
-    m256 res_1 = or256(pshufb_m256(maskBase[1 * 2], lo),
-                       pshufb_m256(maskBase[1 * 2 + 1], hi));
-    m256 res_shifted_1 = vpalignr(res_1, *old_1, 16 - 1);
-    *old_1 = res_1;
-    return or256(r, res_shifted_1);
-}
-
-static really_inline
-m256 prep_conf_fat_teddy_m3(const m256 *maskBase, m256 *old_1, m256 *old_2,
-                            m256 val) {
-    m256 mask = set1_32x8(0xf);
-    m256 lo = and256(val, mask);
-    m256 hi = and256(rshift64_m256(val, 4), mask);
-    m256 r = prep_conf_fat_teddy_m2(maskBase, old_1, val);
-
-    m256 res_2 = or256(pshufb_m256(maskBase[2 * 2], lo),
-                       pshufb_m256(maskBase[2 * 2 + 1], hi));
-    m256 res_shifted_2 = vpalignr(res_2, *old_2, 16 - 2);
-    *old_2 = res_2;
-    return or256(r, res_shifted_2);
-}
-
-static really_inline
-m256 prep_conf_fat_teddy_m4(const m256 *maskBase, m256 *old_1, m256 *old_2,
-                            m256 *old_3, m256 val) {
-    m256 mask = set1_32x8(0xf);
-    m256 lo = and256(val, mask);
-    m256 hi = and256(rshift64_m256(val, 4), mask);
-    m256 r = prep_conf_fat_teddy_m3(maskBase, old_1, old_2, val);
-
-    m256 res_3 = or256(pshufb_m256(maskBase[3 * 2], lo),
-                       pshufb_m256(maskBase[3 * 2 + 1], hi));
-    m256 res_shifted_3 = vpalignr(res_3, *old_3, 16 - 3);
-    *old_3 = res_3;
-    return or256(r, res_shifted_3);
-}
-
-#define FDR_EXEC_FAT_TEDDY_RES_OLD_1                                        \
-do {                                                                        \
-} while(0)
-
-#define FDR_EXEC_FAT_TEDDY_RES_OLD_2                                        \
-    m256 res_old_1 = zeroes256();
-
-#define FDR_EXEC_FAT_TEDDY_RES_OLD_3                                        \
-    m256 res_old_1 = zeroes256();                                           \
-    m256 res_old_2 = zeroes256();
-
-#define FDR_EXEC_FAT_TEDDY_RES_OLD_4                                        \
-    m256 res_old_1 = zeroes256();                                           \
-    m256 res_old_2 = zeroes256();                                           \
-    m256 res_old_3 = zeroes256();
-
-#define FDR_EXEC_FAT_TEDDY_RES_OLD(n) FDR_EXEC_FAT_TEDDY_RES_OLD_##n
-
-#define PREP_CONF_FAT_FN_1(mask_base, val)                                  \
-    prep_conf_fat_teddy_m1(mask_base, val)
-
-#define PREP_CONF_FAT_FN_2(mask_base, val)                                  \
-    prep_conf_fat_teddy_m2(mask_base, &res_old_1, val)
-
-#define PREP_CONF_FAT_FN_3(mask_base, val)                                  \
-    prep_conf_fat_teddy_m3(mask_base, &res_old_1, &res_old_2, val)
-
-#define PREP_CONF_FAT_FN_4(mask_base, val)                                  \
-    prep_conf_fat_teddy_m4(mask_base, &res_old_1, &res_old_2, &res_old_3, val)
-
-#define PREP_CONF_FAT_FN(mask_base, val, n)                                 \
-    PREP_CONF_FAT_FN_##n(mask_base, val)
-
-#define FDR_EXEC_FAT_TEDDY(fdr, a, control, n_msk, conf_fn)                 \
-do {                                                                        \
-    const u8 *buf_end = a->buf + a->len;                                    \
-    const u8 *ptr = a->buf + a->start_offset;                               \
-    u32 floodBackoff = FLOOD_BACKOFF_START;                                 \
-    const u8 *tryFloodDetect = a->firstFloodDetect;                         \
-    u32 last_match = ones_u32;                                              \
-    const struct Teddy *teddy = (const struct Teddy *)fdr;                  \
-    const size_t iterBytes = 32;                                            \
-    DEBUG_PRINTF("params: buf %p len %zu start_offset %zu\n",               \
-                 a->buf, a->len, a->start_offset);                          \
-                                                                            \
-    const m256 *maskBase = getMaskBase_fat(teddy);                          \
-    const u32 *confBase = getConfBase(teddy);                               \
-                                                                            \
-    FDR_EXEC_FAT_TEDDY_RES_OLD(n_msk);                                      \
-    const u8 *mainStart = ROUNDUP_PTR(ptr, 16);                             \
-    DEBUG_PRINTF("derive: ptr: %p mainstart %p\n", ptr, mainStart);         \
-    if (ptr < mainStart) {                                                  \
-        ptr = mainStart - 16;                                               \
-        m256 p_mask;                                                        \
-        m256 val_0 = vectoredLoad2x128(&p_mask, ptr, a->start_offset,       \
-                                       a->buf, buf_end,                     \
-                                       a->buf_history, a->len_history,      \
-                                       n_msk);                              \
-        m256 r_0 = PREP_CONF_FAT_FN(maskBase, val_0, n_msk);                \
-        r_0 = or256(r_0, p_mask);                                           \
-        CONFIRM_FAT_TEDDY(r_0, 16, 0, VECTORING, conf_fn);                  \
-        ptr += 16;                                                          \
-    }                                                                       \
-                                                                            \
-    if (ptr + 16 <= buf_end) {                                              \
-        m256 r_0 = PREP_CONF_FAT_FN(maskBase, load2x128(ptr), n_msk);       \
-        CONFIRM_FAT_TEDDY(r_0, 16, 0, VECTORING, conf_fn);                  \
-        ptr += 16;                                                          \
-    }                                                                       \
-                                                                            \
-    for ( ; ptr + iterBytes <= buf_end; ptr += iterBytes) {                 \
-        __builtin_prefetch(ptr + (iterBytes * 4));                          \
-        CHECK_FLOOD;                                                        \
-        m256 r_0 = PREP_CONF_FAT_FN(maskBase, load2x128(ptr), n_msk);       \
-        CONFIRM_FAT_TEDDY(r_0, 16, 0, NOT_CAUTIOUS, conf_fn);               \
-        m256 r_1 = PREP_CONF_FAT_FN(maskBase, load2x128(ptr + 16), n_msk);  \
-        CONFIRM_FAT_TEDDY(r_1, 16, 16, NOT_CAUTIOUS, conf_fn);              \
-    }                                                                       \
-                                                                            \
-    if (ptr + 16 <= buf_end) {                                              \
-        m256 r_0 = PREP_CONF_FAT_FN(maskBase, load2x128(ptr), n_msk);       \
-        CONFIRM_FAT_TEDDY(r_0, 16, 0, NOT_CAUTIOUS, conf_fn);               \
-        ptr += 16;                                                          \
-    }                                                                       \
-                                                                            \
-    assert(ptr + 16 > buf_end);                                             \
-    if (ptr < buf_end) {                                                    \
-        m256 p_mask;                                                        \
-        m256 val_0 = vectoredLoad2x128(&p_mask, ptr, 0, ptr, buf_end,       \
-                                       a->buf_history, a->len_history,      \
-                                       n_msk);                              \
-        m256 r_0 = PREP_CONF_FAT_FN(maskBase, val_0, n_msk);                \
-        r_0 = or256(r_0, p_mask);                                           \
-        CONFIRM_FAT_TEDDY(r_0, 16, 0, VECTORING, conf_fn);                  \
-    }                                                                       \
-                                                                            \
-    return HWLM_SUCCESS;                                                    \
-} while(0)
-
-#endif // HAVE_AVX512VBMI
-
-hwlm_error_t fdr_exec_fat_teddy_msks1(const struct FDR *fdr,
-                                      const struct FDR_Runtime_Args *a,
-                                      hwlm_group_t control) {
-    FDR_EXEC_FAT_TEDDY(fdr, a, control, 1, do_confWithBit_teddy);
-}
-
-hwlm_error_t fdr_exec_fat_teddy_msks1_pck(const struct FDR *fdr,
-                                          const struct FDR_Runtime_Args *a,
-                                          hwlm_group_t control) {
-    FDR_EXEC_FAT_TEDDY(fdr, a, control, 1, do_confWithBit_teddy);
-}
-
-hwlm_error_t fdr_exec_fat_teddy_msks2(const struct FDR *fdr,
-                                      const struct FDR_Runtime_Args *a,
-                                      hwlm_group_t control) {
-    FDR_EXEC_FAT_TEDDY(fdr, a, control, 2, do_confWithBit_teddy);
-}
-
-hwlm_error_t fdr_exec_fat_teddy_msks2_pck(const struct FDR *fdr,
-                                          const struct FDR_Runtime_Args *a,
-                                          hwlm_group_t control) {
-    FDR_EXEC_FAT_TEDDY(fdr, a, control, 2, do_confWithBit_teddy);
-}
-
-hwlm_error_t fdr_exec_fat_teddy_msks3(const struct FDR *fdr,
-                                      const struct FDR_Runtime_Args *a,
-                                      hwlm_group_t control) {
-    FDR_EXEC_FAT_TEDDY(fdr, a, control, 3, do_confWithBit_teddy);
-}
-
-hwlm_error_t fdr_exec_fat_teddy_msks3_pck(const struct FDR *fdr,
-                                          const struct FDR_Runtime_Args *a,
-                                          hwlm_group_t control) {
-    FDR_EXEC_FAT_TEDDY(fdr, a, control, 3, do_confWithBit_teddy);
-}
-
-hwlm_error_t fdr_exec_fat_teddy_msks4(const struct FDR *fdr,
-                                      const struct FDR_Runtime_Args *a,
-                                      hwlm_group_t control) {
-    FDR_EXEC_FAT_TEDDY(fdr, a, control, 4, do_confWithBit_teddy);
-}
-
-hwlm_error_t fdr_exec_fat_teddy_msks4_pck(const struct FDR *fdr,
-                                          const struct FDR_Runtime_Args *a,
-                                          hwlm_group_t control) {
-    FDR_EXEC_FAT_TEDDY(fdr, a, control, 4, do_confWithBit_teddy);
-}
-
-#endif // HAVE_AVX2
diff --git a/src/fdr/teddy_fat.cpp b/src/fdr/teddy_fat.cpp
new file mode 100644 (file)
index 0000000..29e4352
--- /dev/null
@@ -0,0 +1,570 @@
+/*
+ * Copyright (c) 2015-2020, Intel Corporation
+ * Copyright (c) 2024, VectorCamp PC
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *
+ *  * Redistributions of source code must retain the above copyright notice,
+ *    this list of conditions and the following disclaimer.
+ *  * Redistributions in binary form must reproduce the above copyright
+ *    notice, this list of conditions and the following disclaimer in the
+ *    documentation and/or other materials provided with the distribution.
+ *  * Neither the name of Intel Corporation nor the names of its contributors
+ *    may be used to endorse or promote products derived from this software
+ *    without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
+ * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+ * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+ * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
+ * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
+ * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+ * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
+ * POSSIBILITY OF SUCH DAMAGE.
+ */
+
+/* fat teddy for AVX2 and AVX512VBMI */
+
+#include "fdr_internal.h"
+#include "flood_runtime.h"
+#include "teddy.h"
+#include "teddy_internal.h"
+#include "teddy_runtime_common.h"
+#include "util/arch.h"
+#include "util/simd_utils.h"
+
+#if defined(HAVE_AVX2)
+
+#ifdef ARCH_64_BIT
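+// conf_chunk_64() carries the body of the old CONF_FAT_CHUNK_64 macro. A
+// function cannot expand CHECK_HWLM_TERMINATE_MATCHING's bare `return` in
+// its caller's frame, so termination is reported through the return value
+// and the thin wrapper macro below performs the early return at each call
+// site.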
+static really_inline
+hwlm_error_t conf_chunk_64(u64a chunk, u8 bucket, u8 offset,
+                           CautionReason reason, const u8 *pt,
+                           const u32 *confBase,
+                           const struct FDR_Runtime_Args *a,
+                           hwlm_group_t *control,
+                           u32 *last_match) {
+    if (unlikely(chunk != ones_u64a)) {
+        chunk = ~chunk;
+        do_confWithBit_teddy(&chunk, bucket, offset, confBase, reason, a, pt,
+                             control, last_match);
+        // adapted from CHECK_HWLM_TERMINATE_MATCHING
+        if (unlikely(*control == HWLM_TERMINATE_MATCHING)) {
+            return HWLM_TERMINATED;
+        }
+    }
+    return HWLM_SUCCESS;
+}
+
+#define CONF_FAT_CHUNK_64(chunk, bucket, off, reason, pt, confBase, a, control, last_match) \
+    if (conf_chunk_64(chunk, bucket, off, reason, pt, confBase, a, control,  \
+                      last_match) == HWLM_TERMINATED) {                      \
+        return HWLM_TERMINATED;                                              \
+    }
+#else
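+// 32-bit counterpart of conf_chunk_64, confirming 32-bit chunks.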
+static really_inline
+hwlm_error_t conf_chunk_32(u32 chunk, u8 bucket, u8 offset,
+                           CautionReason reason, const u8 *pt,
+                           const u32* confBase,
+                           const struct FDR_Runtime_Args *a,
+                           hwlm_group_t *control,
+                           u32 *last_match) {
+    if (unlikely(chunk != ones_u32)) {
+        chunk = ~chunk;
+        do_confWithBit_teddy(&chunk, bucket, offset, confBase, reason, a, pt,
+                control, last_match);
+        // adapted from CHECK_HWLM_TERMINATE_MATCHING
+        if (unlikely(*control == HWLM_TERMINATE_MATCHING)) {
+            return HWLM_TERMINATED;
+        }
+    }
+    return HWLM_SUCCESS;
+}
+
+#define CONF_FAT_CHUNK_32(chunk, bucket, off, reason, pt, confBase, a, control, last_match) \
+    if (conf_chunk_32(chunk, bucket, off, reason, pt, confBase, a, control,  \
+                      last_match) == HWLM_TERMINATED) {                      \
+        return HWLM_TERMINATED;                                              \
+    }
+
+#endif
+
+#if defined(HAVE_AVX512VBMI) // VBMI strong fat teddy
+
+// the 512-bit fat teddy path is only built when AVX512VBMI is available
+
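+// The duplicated (AABB) fat masks are laid out after the Teddy header and
+// the 2 * numMask m256 fat masks, each region rounded up to a cache line.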
+static really_inline
+const m512 *getDupMaskBase(const struct Teddy *teddy, u8 numMask) {
+    return (const m512 *)((const u8 *)teddy + ROUNDUP_CL(sizeof(struct Teddy))
+                          + ROUNDUP_CL(2 * numMask * sizeof(m256)));
+}
+
+const u8 ALIGN_AVX_DIRECTIVE p_mask_interleave[64] = {
+    0, 32, 1, 33, 2, 34, 3, 35, 4, 36, 5, 37, 6, 38, 7, 39,
+    8, 40, 9, 41, 10, 42, 11, 43, 12, 44, 13, 45, 14, 46, 15, 47,
+    16, 48, 17, 49, 18, 50, 19, 51, 20, 52, 21, 53, 22, 54, 23, 55,
+    24, 56, 25, 57, 26, 58, 27, 59, 28, 60, 29, 61, 30, 62, 31, 63
+};
+
+#ifdef ARCH_64_BIT
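+// vpermb with p_mask_interleave pairs each byte of the low 256-bit lane
+// (buckets 0..7) with the corresponding byte of the high lane (buckets
+// 8..15), so the result can be confirmed in plain 64-bit chunks.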
+hwlm_error_t confirm_fat_teddy_64_512(m512 var, u8 bucket, u8 offset,
+                                  CautionReason reason, const u8 *ptr,
+                                  const struct FDR_Runtime_Args *a,
+                                  const u32* confBase, hwlm_group_t *control,
+                                  u32 *last_match) {
+    if (unlikely(diff512(var, ones512()))) {
+        m512 msk_interleave = load512(p_mask_interleave);
+        m512 r = vpermb512(msk_interleave, var);
+        m128 r0 = extract128from512(r, 0);
+        m128 r1 = extract128from512(r, 1);
+        m128 r2 = extract128from512(r, 2);
+        m128 r3 = extract128from512(r, 3);
+        u64a part1 = movq(r0);
+        u64a part2 = extract64from128(r0, 1);
+        u64a part3 = movq(r1);
+        u64a part4 = extract64from128(r1, 1);
+        u64a part5 = movq(r2);
+        u64a part6 = extract64from128(r2, 1);
+        u64a part7 = movq(r3);
+        u64a part8 = extract64from128(r3, 1);
+        CONF_FAT_CHUNK_64(part1, bucket, offset, reason, ptr, confBase, a, control, last_match);
+        CONF_FAT_CHUNK_64(part2, bucket, offset + 4, reason, ptr, confBase, a, control, last_match);
+        CONF_FAT_CHUNK_64(part3, bucket, offset + 8, reason, ptr, confBase, a, control, last_match);
+        CONF_FAT_CHUNK_64(part4, bucket, offset + 12, reason, ptr, confBase, a, control, last_match);
+        CONF_FAT_CHUNK_64(part5, bucket, offset + 16, reason, ptr, confBase, a, control, last_match);
+        CONF_FAT_CHUNK_64(part6, bucket, offset + 20, reason, ptr, confBase, a, control, last_match);
+        CONF_FAT_CHUNK_64(part7, bucket, offset + 24, reason, ptr, confBase, a, control, last_match);
+        CONF_FAT_CHUNK_64(part8, bucket, offset + 28, reason, ptr, confBase, a, control, last_match);
+    }
+    return HWLM_SUCCESS;
+}
+#define confirm_fat_teddy_512_f confirm_fat_teddy_64_512
+#else // !ARCH_64_BIT, 32-bit
+
+hwlm_error_t confirm_fat_teddy_32_512(m512 var, u8 bucket, u8 offset,
+                                  CautionReason reason, const u8 *ptr,
+                                  const struct FDR_Runtime_Args *a,
+                                  const u32* confBase, hwlm_group_t *control,
+                                  u32 *last_match) {
+    if (unlikely(diff512(var, ones512()))) {
+        m512 msk_interleave = load512(p_mask_interleave);
+        m512 r = vpermb512(msk_interleave, var);
+        m128 r0 = extract128from512(r, 0);
+        m128 r1 = extract128from512(r, 1);
+        m128 r2 = extract128from512(r, 2);
+        m128 r3 = extract128from512(r, 3);
+        u32 part1 = movd(r0);
+        u32 part2 = extract32from128(r0, 1);
+        u32 part3 = extract32from128(r0, 2);
+        u32 part4 = extract32from128(r0, 3);
+        u32 part5 = movd(r1);
+        u32 part6 = extract32from128(r1, 1);
+        u32 part7 = extract32from128(r1, 2);
+        u32 part8 = extract32from128(r1, 3);
+        u32 part9 = movd(r2);
+        u32 part10 = extract32from128(r2, 1);
+        u32 part11 = extract32from128(r2, 2);
+        u32 part12 = extract32from128(r2, 3);
+        u32 part13 = movd(r3);
+        u32 part14 = extract32from128(r3, 1);
+        u32 part15 = extract32from128(r3, 2);
+        u32 part16 = extract32from128(r3, 3);
+        CONF_FAT_CHUNK_32(part1, bucket, offset, reason, ptr, confBase, a, control, last_match);
+        CONF_FAT_CHUNK_32(part2, bucket, offset + 2, reason, ptr, confBase, a, control, last_match);
+        CONF_FAT_CHUNK_32(part3, bucket, offset + 4, reason, ptr, confBase, a, control, last_match);
+        CONF_FAT_CHUNK_32(part4, bucket, offset + 6, reason, ptr, confBase, a, control, last_match);
+        CONF_FAT_CHUNK_32(part5, bucket, offset + 8, reason, ptr, confBase, a, control, last_match);
+        CONF_FAT_CHUNK_32(part6, bucket, offset + 10, reason, ptr, confBase, a, control, last_match);
+        CONF_FAT_CHUNK_32(part7, bucket, offset + 12, reason, ptr, confBase, a, control, last_match);
+        CONF_FAT_CHUNK_32(part8, bucket, offset + 14, reason, ptr, confBase, a, control, last_match);
+        CONF_FAT_CHUNK_32(part9, bucket, offset + 16, reason, ptr, confBase, a, control, last_match);
+        CONF_FAT_CHUNK_32(part10, bucket, offset + 18, reason, ptr, confBase, a, control, last_match);
+        CONF_FAT_CHUNK_32(part11, bucket, offset + 20, reason, ptr, confBase, a, control, last_match);
+        CONF_FAT_CHUNK_32(part12, bucket, offset + 22, reason, ptr, confBase, a, control, last_match);
+        CONF_FAT_CHUNK_32(part13, bucket, offset + 24, reason, ptr, confBase, a, control, last_match);
+        CONF_FAT_CHUNK_32(part14, bucket, offset + 26, reason, ptr, confBase, a, control, last_match);
+        CONF_FAT_CHUNK_32(part15, bucket, offset + 28, reason, ptr, confBase, a, control, last_match);
+        CONF_FAT_CHUNK_32(part16, bucket, offset + 30, reason, ptr, confBase, a, control, last_match);
+    }
+    return HWLM_SUCCESS;
+}
+#define confirm_fat_teddy_512_f confirm_fat_teddy_32_512
+#endif // ARCH_64_BIT
+
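+// width-agnostic wrapper: confirm_fat_teddy_512_f resolves to the 64-bit or
+// 32-bit variant selected above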
+#define CONFIRM_FAT_TEDDY_512(...)                                           \
+    if (confirm_fat_teddy_512_f(__VA_ARGS__, a, confBase, &control,          \
+                                &last_match) == HWLM_TERMINATED) {           \
+        return HWLM_TERMINATED;                                              \
+    }
+
+#define TEDDY_VBMI_SL1_MASK   0xfffffffffffffffeULL
+#define TEDDY_VBMI_SL2_MASK   0xfffffffffffffffcULL
+#define TEDDY_VBMI_SL3_MASK   0xfffffffffffffff8ULL
+
+#define FAT_TEDDY_VBMI_SL1_MASK   0xfffffffefffffffeULL
+#define FAT_TEDDY_VBMI_SL2_MASK   0xfffffffcfffffffcULL
+#define FAT_TEDDY_VBMI_SL3_MASK   0xfffffff8fffffff8ULL
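+// the fat shift masks clear the low bits of *both* 32-bit halves, since the
+// 512-bit vector holds the 32-byte input duplicated across its two 256-bit
+// lanes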
+
+#define FAT_TEDDY_VBMI_SL1_POS    15
+#define FAT_TEDDY_VBMI_SL2_POS    14
+#define FAT_TEDDY_VBMI_SL3_POS    13
+
+#define FAT_TEDDY_VBMI_CONF_MASK_HEAD   (0xffffffffULL >> n_sh)
+#define FAT_TEDDY_VBMI_CONF_MASK_FULL   ((0xffffffffULL << n_sh) & 0xffffffffULL)
+#define FAT_TEDDY_VBMI_CONF_MASK_VAR(n) (0xffffffffULL >> (32 - n) << overlap)
+#define FAT_TEDDY_VBMI_LOAD_MASK_PATCH  (0xffffffffULL >> (32 - n_sh))
+
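+// Templated replacement for the old PREP_CONF_FAT_FN / FAT_TEDDY_VBMI_*
+// macro chain: each extra mask ORs in one more shifted shuffle result, and
+// `if constexpr` discards the stages that NMSK does not need at compile
+// time.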
+template<int NMSK>
+static really_inline
+m512 prep_conf_fat_teddy_512vbmi_templ(const m512 *lo_mask, const m512 *dup_mask,
+                                       const m512 *sl_msk, const m512 val) {
+    m512 lo = and512(val, *lo_mask);
+    m512 hi = and512(rshift64_m512(val, 4), *lo_mask);
+    m512 shuf_or_b0 = or512(pshufb_m512(dup_mask[0], lo),
+                            pshufb_m512(dup_mask[1], hi));
+
+    if constexpr (NMSK == 1) return shuf_or_b0;
+    m512 shuf_or_b1 = or512(pshufb_m512(dup_mask[2], lo),
+                            pshufb_m512(dup_mask[3], hi));
+    m512 sl1 = maskz_vpermb512(FAT_TEDDY_VBMI_SL1_MASK, sl_msk[0], shuf_or_b1);
+    if constexpr (NMSK == 2) return (or512(sl1, shuf_or_b0));
+    m512 shuf_or_b2 = or512(pshufb_m512(dup_mask[4], lo),
+                            pshufb_m512(dup_mask[5], hi));
+    m512 sl2 = maskz_vpermb512(FAT_TEDDY_VBMI_SL2_MASK, sl_msk[1], shuf_or_b2);
+    if constexpr (NMSK == 3) return (or512(sl2, or512(sl1, shuf_or_b0)));
+    m512 shuf_or_b3 = or512(pshufb_m512(dup_mask[6], lo),
+                            pshufb_m512(dup_mask[7], hi));
+    m512 sl3 = maskz_vpermb512(FAT_TEDDY_VBMI_SL3_MASK, sl_msk[2], shuf_or_b3);
+    return (or512(sl3, or512(sl2, or512(sl1, shuf_or_b0))));
+}
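+
+// For reference, the NMSK == 2 instantiation above folds down to roughly:
+//   m512 sl1 = maskz_vpermb512(FAT_TEDDY_VBMI_SL1_MASK, sl_msk[0],
+//                              shuf_or_b1);
+//   return or512(sl1, shuf_or_b0);
+// i.e. the same shuffle/shift/or sequence the unrolled macros used to emit.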
+
+template<int NMSK>
+hwlm_error_t fdr_exec_fat_teddy_512vbmi_templ(const struct FDR *fdr,
+                                              const struct FDR_Runtime_Args *a,
+                                              hwlm_group_t control) {
+    const u8 *buf_end = a->buf + a->len;
+    const u8 *ptr = a->buf + a->start_offset;
+    u32 floodBackoff = FLOOD_BACKOFF_START;
+    const u8 *tryFloodDetect = a->firstFloodDetect;
+    u32 last_match = ones_u32;
+    const struct Teddy *teddy = (const struct Teddy *)fdr;
+    const size_t iterBytes = 32;
+    u32 n_sh = NMSK - 1;
+    const size_t loopBytes = 32 - n_sh;
+    DEBUG_PRINTF("params: buf %p len %zu start_offset %zu\n",
+                 a->buf, a->len, a->start_offset);
+
+    const m512 *dup_mask = getDupMaskBase(teddy, NMSK);
+    m512 lo_mask = set1_64x8(0xf);
+    // NB: sized to avoid a zero-length array when NMSK == 1
+    m512 sl_msk[NMSK > 1 ? NMSK - 1 : 1];
+    if constexpr (NMSK > 1) {
+        sl_msk[0] = loadu512(p_sh_mask_arr + FAT_TEDDY_VBMI_SL1_POS);
+    }
+    if constexpr (NMSK > 2) {
+        sl_msk[1] = loadu512(p_sh_mask_arr + FAT_TEDDY_VBMI_SL2_POS);
+    }
+    if constexpr (NMSK > 3) {
+        sl_msk[2] = loadu512(p_sh_mask_arr + FAT_TEDDY_VBMI_SL3_POS);
+    }
+
+    const u32 *confBase = getConfBase(teddy);
+
+    u64a k = FAT_TEDDY_VBMI_CONF_MASK_FULL;
+    m512 p_mask = set_mask_m512(~((k << 32) | k));
+    u32 overlap = 0;
+    u64a patch = 0;
+    if (likely(ptr + loopBytes <= buf_end)) {
+        u64a k0 = FAT_TEDDY_VBMI_CONF_MASK_HEAD;
+        m512 p_mask0 = set_mask_m512(~((k0 << 32) | k0));
+        m512 r_0 = prep_conf_fat_teddy_512vbmi_templ<NMSK>(&lo_mask, dup_mask, sl_msk, set2x256(loadu256(ptr)));
+
+        r_0 = or512(r_0, p_mask0);
+        CONFIRM_FAT_TEDDY_512(r_0, 16, 0, VECTORING, ptr);
+        ptr += loopBytes;
+        overlap = n_sh;
+        patch = FAT_TEDDY_VBMI_LOAD_MASK_PATCH;
+    }
+
+    for (; ptr + loopBytes <= buf_end; ptr += loopBytes) {
+        CHECK_FLOOD;
+        m512 r_0 = prep_conf_fat_teddy_512vbmi_templ<NMSK>(&lo_mask, dup_mask, sl_msk, set2x256(loadu256(ptr - n_sh)));
+        r_0 = or512(r_0, p_mask);
+        CONFIRM_FAT_TEDDY_512(r_0, 16, 0, NOT_CAUTIOUS, ptr - n_sh);
+    }
+
+    assert(ptr + loopBytes > buf_end);
+    if (ptr < buf_end) {
+        u32 left = (u32)(buf_end - ptr);
+        u64a k1 = FAT_TEDDY_VBMI_CONF_MASK_VAR(left);
+        m512 p_mask1 = set_mask_m512(~((k1 << 32) | k1));
+        m512 val_0 = set2x256(loadu_maskz_m256(k1 | patch, ptr - overlap));
+        m512 r_0 = prep_conf_fat_teddy_512vbmi_templ<NMSK>(&lo_mask, dup_mask, sl_msk, val_0);
+
+        r_0 = or512(r_0, p_mask1);
+        CONFIRM_FAT_TEDDY_512(r_0, 16, 0, VECTORING, ptr - overlap);
+    }
+
+    return HWLM_SUCCESS;
+}
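+
+// Scan structure of the function above: a cautious (VECTORING) head block,
+// a main loop that can assume a full block of valid input (NOT_CAUTIOUS),
+// and a masked-load tail for the final partial block. Each loop iteration
+// re-reads n_sh bytes of overlap; the conf masks suppress those positions
+// so overlapping bytes are only confirmed once.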
+
+#define FDR_EXEC_FAT_TEDDY_FN fdr_exec_fat_teddy_512vbmi_templ
+
+
+#elif defined(HAVE_AVX2) // not HAVE_AVX512 but HAVE_AVX2 reinforced teddy
+
+
+#ifdef ARCH_64_BIT
+extern "C" {
+hwlm_error_t confirm_fat_teddy_64_256(m256 var, u8 bucket, u8 offset,
+                                      CautionReason reason, const u8 *ptr,
+                                      const struct FDR_Runtime_Args *a,
+                                      const u32* confBase, hwlm_group_t *control,
+                                      u32 *last_match) {
+    if (unlikely(diff256(var, ones256()))) {
+        m256 swap = swap128in256(var);
+        m256 r = interleave256lo(var, swap);
+        u64a part1 = extractlow64from256(r);
+        u64a part2 = extract64from256(r, 1);
+        r = interleave256hi(var, swap);
+        u64a part3 = extractlow64from256(r);
+        u64a part4 = extract64from256(r, 1);
+        CONF_FAT_CHUNK_64(part1, bucket, offset, reason, ptr, confBase, a, control, last_match);
+        CONF_FAT_CHUNK_64(part2, bucket, offset + 4, reason, ptr, confBase, a, control, last_match);
+        CONF_FAT_CHUNK_64(part3, bucket, offset + 8, reason, ptr, confBase, a, control, last_match);
+        CONF_FAT_CHUNK_64(part4, bucket, offset + 12, reason, ptr, confBase, a, control, last_match);
+    }
+    return HWLM_SUCCESS;
+}
+} // extern C
+
+#define confirm_fat_teddy_256_f confirm_fat_teddy_64_256
+
+#else
+extern "C" {
+hwlm_error_t confirm_fat_teddy_32_256(m256 var, u8 bucket, u8 offset,
+                                      CautionReason reason, const u8 *ptr,
+                                      const struct FDR_Runtime_Args *a,
+                                      const u32* confBase, hwlm_group_t *control,
+                                      u32 *last_match) {
+    if (unlikely(diff256(var, ones256()))) {
+        m256 swap = swap128in256(var);
+        m256 r = interleave256lo(var, swap);
+        u32 part1 = extractlow32from256(r);
+        u32 part2 = extract32from256(r, 1);
+        u32 part3 = extract32from256(r, 2);
+        u32 part4 = extract32from256(r, 3);
+        r = interleave256hi(var, swap);
+        u32 part5 = extractlow32from256(r);
+        u32 part6 = extract32from256(r, 1);
+        u32 part7 = extract32from256(r, 2);
+        u32 part8 = extract32from256(r, 3);
+        CONF_FAT_CHUNK_32(part1, bucket, offset, reason, ptr, confBase, a, control, last_match);
+        CONF_FAT_CHUNK_32(part2, bucket, offset + 2, reason, ptr, confBase, a, control, last_match);
+        CONF_FAT_CHUNK_32(part3, bucket, offset + 4, reason, ptr, confBase, a, control, last_match);
+        CONF_FAT_CHUNK_32(part4, bucket, offset + 6, reason, ptr, confBase, a, control, last_match);
+        CONF_FAT_CHUNK_32(part5, bucket, offset + 8, reason, ptr, confBase, a, control, last_match);
+        CONF_FAT_CHUNK_32(part6, bucket, offset + 10, reason, ptr, confBase, a, control, last_match);
+        CONF_FAT_CHUNK_32(part7, bucket, offset + 12, reason, ptr, confBase, a, control, last_match);
+        CONF_FAT_CHUNK_32(part8, bucket, offset + 14, reason, ptr, confBase, a, control, last_match);
+    }
+    return HWLM_SUCCESS;
+}
+
+} // extern C
+
+#define confirm_fat_teddy_256_f confirm_fat_teddy_32_256
+
+#endif
+
+#define CONFIRM_FAT_TEDDY_256(...) do { if (confirm_fat_teddy_256_f(__VA_ARGS__, a, confBase, &control, &last_match) == HWLM_TERMINATED) return HWLM_TERMINATED; } while (0)
+
+static really_inline
+const m256 *getMaskBase_fat(const struct Teddy *teddy) {
+    return (const m256 *)((const u8 *)teddy + ROUNDUP_CL(sizeof(struct Teddy)));
+}
+
+
+static really_inline
+m256 vectoredLoad2x128(m256 *p_mask, const u8 *ptr, const size_t start_offset,
+                       const u8 *lo, const u8 *hi,
+                       const u8 *buf_history, size_t len_history,
+                       const u32 nMasks) {
+    m128 p_mask128;
+    m256 ret = set1_2x128(vectoredLoad128(&p_mask128, ptr, start_offset, lo, hi,
+                                          buf_history, len_history, nMasks));
+    *p_mask = set1_2x128(p_mask128);
+    return ret;
+}
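+
+// Note: fat teddy processes 16 bytes of input per step duplicated across
+// both 128-bit lanes (hence the set1_2x128 above), which is what lets it
+// track 16 buckets instead of 8.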
+
+template<int NMSK>
+static really_inline
+m256 prep_conf_fat_teddy_256_templ(const m256 *maskBase, m256 val,
+                                   m256 *old_1, m256 *old_2, m256 *old_3) {
+    m256 mask = set1_32x8(0xf);
+    m256 lo = and256(val, mask);
+    m256 hi = and256(rshift64_m256(val, 4), mask);
+    m256 r = or256(pshufb_m256(maskBase[0 * 2], lo),
+                   pshufb_m256(maskBase[0 * 2 + 1], hi));
+    if constexpr (NMSK == 1) return r;
+    // each mask stage uses its own mask pair and its own byte shift
+    m256 res_1 = or256(pshufb_m256(maskBase[1 * 2], lo),
+                       pshufb_m256(maskBase[1 * 2 + 1], hi));
+    m256 res_shifted_1 = vpalignr(res_1, *old_1, 16 - 1);
+    *old_1 = res_1;
+    r = or256(r, res_shifted_1);
+    if constexpr (NMSK == 2) return r;
+    m256 res_2 = or256(pshufb_m256(maskBase[2 * 2], lo),
+                       pshufb_m256(maskBase[2 * 2 + 1], hi));
+    m256 res_shifted_2 = vpalignr(res_2, *old_2, 16 - 2);
+    *old_2 = res_2;
+    r = or256(r, res_shifted_2);
+    if constexpr (NMSK == 3) return r;
+    m256 res_3 = or256(pshufb_m256(maskBase[3 * 2], lo),
+                       pshufb_m256(maskBase[3 * 2 + 1], hi));
+    m256 res_shifted_3 = vpalignr(res_3, *old_3, 16 - 3);
+    *old_3 = res_3;
+    return or256(r, res_shifted_3);
+}
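+
+// res_shifted_i aligns each additional mask's result with bytes carried
+// over from the previous 16-byte block (old_i), so matches that straddle
+// a block boundary are still detected.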
+
+template<int NMSK>
+hwlm_error_t fdr_exec_fat_teddy_256_templ(const struct FDR *fdr,
+                                          const struct FDR_Runtime_Args *a,
+                                          hwlm_group_t control) {
+    const u8 *buf_end = a->buf + a->len;
+    const u8 *ptr = a->buf + a->start_offset;
+    u32 floodBackoff = FLOOD_BACKOFF_START;
+    const u8 *tryFloodDetect = a->firstFloodDetect;
+    u32 last_match = ones_u32;
+    const struct Teddy *teddy = (const struct Teddy *)fdr;
+    const size_t iterBytes = 32;
+    DEBUG_PRINTF("params: buf %p len %zu start_offset %zu\n",
+                 a->buf, a->len, a->start_offset);
+
+    const m256 *maskBase = getMaskBase_fat(teddy);
+    const u32 *confBase = getConfBase(teddy);
+
+    m256 res_old_1 = zeroes256();
+    m256 res_old_2 = zeroes256();
+    m256 res_old_3 = zeroes256();
+    const u8 *mainStart = ROUNDUP_PTR(ptr, 16);
+    DEBUG_PRINTF("derive: ptr: %p mainstart %p\n", ptr, mainStart);
+    if (ptr < mainStart) {
+        ptr = mainStart - 16;
+        m256 p_mask;
+        m256 val_0 = vectoredLoad2x128(&p_mask, ptr, a->start_offset,
+                                       a->buf, buf_end,
+                                       a->buf_history, a->len_history,
+                                       NMSK);
+        m256 r_0 = prep_conf_fat_teddy_256_templ<NMSK>(maskBase, val_0, &res_old_1, &res_old_2, &res_old_3);
+        r_0 = or256(r_0, p_mask);
+        CONFIRM_FAT_TEDDY_256(r_0, 16, 0, VECTORING, ptr);
+        ptr += 16;
+    }
+
+    if (ptr + 16 <= buf_end) {
+        m256 r_0 = prep_conf_fat_teddy_256_templ<NMSK>(maskBase, load2x128(ptr), &res_old_1, &res_old_2, &res_old_3);
+        CONFIRM_FAT_TEDDY_256(r_0, 16, 0, VECTORING, ptr);
+        ptr += 16;
+    }
+
+    for ( ; ptr + iterBytes <= buf_end; ptr += iterBytes) {
+        __builtin_prefetch(ptr + (iterBytes * 4));
+        CHECK_FLOOD;
+        m256 r_0 = prep_conf_fat_teddy_256_templ<NMSK>(maskBase, load2x128(ptr), &res_old_1, &res_old_2, &res_old_3);
+        CONFIRM_FAT_TEDDY_256(r_0, 16, 0, NOT_CAUTIOUS, ptr);
+        m256 r_1 = prep_conf_fat_teddy_256_templ<NMSK>(maskBase, load2x128(ptr + 16), &res_old_1, &res_old_2, &res_old_3);
+        CONFIRM_FAT_TEDDY_256(r_1, 16, 16, NOT_CAUTIOUS, ptr);
+    }
+
+    if (ptr + 16 <= buf_end) {
+        m256 r_0 = prep_conf_fat_teddy_256_templ<NMSK>(maskBase, load2x128(ptr), &res_old_1, &res_old_2, &res_old_3);
+        CONFIRM_FAT_TEDDY_256(r_0, 16, 0, NOT_CAUTIOUS, ptr);
+        ptr += 16;
+    }
+
+    assert(ptr + 16 > buf_end);
+    if (ptr < buf_end) {
+        m256 p_mask;
+        m256 val_0 = vectoredLoad2x128(&p_mask, ptr, 0, ptr, buf_end,
+                                       a->buf_history, a->len_history,
+                                       NMSK);
+        m256 r_0 = prep_conf_fat_teddy_256_templ<NMSK>(maskBase, val_0, &res_old_1, &res_old_2, &res_old_3);
+        r_0 = or256(r_0, p_mask);
+        CONFIRM_FAT_TEDDY_256(r_0, 16, 0, VECTORING, ptr);
+    }
+    return HWLM_SUCCESS;
+}
+
+// It is possible to build with both AVX512VBMI and AVX2 defined; to
+// replicate the original flow of control we give preference to the former.
+// When building for both, this file is compiled once per variant, with
+// only the desired variant defined each time.
+#ifndef FDR_EXEC_FAT_TEDDY_FN
+#define FDR_EXEC_FAT_TEDDY_FN fdr_exec_fat_teddy_256_templ
+#endif
+
+#endif // HAVE_AVX2 for fat teddy
+
+/* we only have fat teddy in the two modes handled above (AVX512VBMI and
+ * AVX2), so FDR_EXEC_FAT_TEDDY_FN is always defined by this point */
+
+extern "C" {
+hwlm_error_t fdr_exec_fat_teddy_msks1(const struct FDR *fdr,
+                                      const struct FDR_Runtime_Args *a,
+                                      hwlm_group_t control) {
+    return FDR_EXEC_FAT_TEDDY_FN<1>(fdr, a, control);
+}
+
+hwlm_error_t fdr_exec_fat_teddy_msks1_pck(const struct FDR *fdr,
+                                          const struct FDR_Runtime_Args *a,
+                                          hwlm_group_t control) {
+    return FDR_EXEC_FAT_TEDDY_FN<1>(fdr, a, control);
+}
+
+hwlm_error_t fdr_exec_fat_teddy_msks2(const struct FDR *fdr,
+                                      const struct FDR_Runtime_Args *a,
+                                      hwlm_group_t control) {
+    return FDR_EXEC_FAT_TEDDY_FN<2>(fdr, a, control);
+}
+
+hwlm_error_t fdr_exec_fat_teddy_msks2_pck(const struct FDR *fdr,
+                                          const struct FDR_Runtime_Args *a,
+                                          hwlm_group_t control) {
+    return FDR_EXEC_FAT_TEDDY_FN<2>(fdr, a, control);
+}
+
+hwlm_error_t fdr_exec_fat_teddy_msks3(const struct FDR *fdr,
+                                      const struct FDR_Runtime_Args *a,
+                                      hwlm_group_t control) {
+    return FDR_EXEC_FAT_TEDDY_FN<3>(fdr, a, control);
+}
+
+hwlm_error_t fdr_exec_fat_teddy_msks3_pck(const struct FDR *fdr,
+                                          const struct FDR_Runtime_Args *a,
+                                          hwlm_group_t control) {
+    return FDR_EXEC_FAT_TEDDY_FN<3>(fdr, a, control);
+}
+
+hwlm_error_t fdr_exec_fat_teddy_msks4(const struct FDR *fdr,
+                                      const struct FDR_Runtime_Args *a,
+                                      hwlm_group_t control) {
+    return FDR_EXEC_FAT_TEDDY_FN<4>(fdr, a, control);
+}
+
+hwlm_error_t fdr_exec_fat_teddy_msks4_pck(const struct FDR *fdr,
+                                          const struct FDR_Runtime_Args *a,
+                                          hwlm_group_t control) {
+    return FDR_EXEC_FAT_TEDDY_FN<4>(fdr, a, control);
+}
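+
+// NB: the *_pck (packed-confirm) entry points currently dispatch to the
+// same templates as their unpacked counterparts.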
+
+} // extern C
+
+#endif // HAVE_AVX2 (guard opened at the top of this file)
+
index d27be994ee8c64e7e1d0f51d9215ad568e3e393c..bd0dba8fd60afa28dc90e63ecdfbad1d403d7d35 100644 (file)
@@ -1,5 +1,6 @@
 /*
  * Copyright (c) 2016-2020, Intel Corporation
+ * Copyright (c) 2024, VectorCamp PC
  *
  * Redistribution and use in source and binary forms, with or without
  * modification, are permitted provided that the following conditions are met:
 #include "util/simd_utils.h"
 #include "util/uniform_ops.h"
 
-extern const u8 ALIGN_DIRECTIVE p_mask_arr[17][32];
-#if defined(HAVE_AVX2)
-extern const u8 ALIGN_AVX_DIRECTIVE p_mask_arr256[33][64];
-#endif
 
 #if defined(HAVE_AVX512VBMI)
 static const u8 ALIGN_DIRECTIVE p_sh_mask_arr[80] = {
@@ -142,6 +139,37 @@ void copyRuntBlock128(u8 *dst, const u8 *src, size_t len) {
 //          |----------|-------|----------------|............|
 //          0          start   start+offset     end(<=16)
 // p_mask   ffff.....ffffff..ff0000...........00ffff..........
+
+
+// Replaces the p_mask_arr table.
+// The generated mask has three zones, in memory order (low byte first):
+//   16-n bytes of 0xff, then m bytes of 0x00, then 0xff up to byte 15.
+// i.e. m is the length of the zone of bytes==0, and there are 16-n bytes
+// of 1's before that zone begins. Examples:
+// m,n = 4,7 - 4 bytes of 0s, with 16-7 = 9 bytes of 1's before them:
+// ff ff ff ff ff ff ff ff ff 00 00 00 00 ff ff ff
+// m,n = 15,15 - 15 bytes of 0s, with 16-15 = 1 byte of 1's before them,
+// which pushes the trailing ff off the high end, leaving
+// ff 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00
+// m,n = 15,16 - 15 bytes of 0s, with 16-16 = 0 bytes of 1's before them:
+// 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 ff
+// To build the high ones zone we start with all-1's and shift them up
+// (towards the high end) by (m + 16 - n) mod 17 bytes.
+// To build the low ones zone, which must be 16-n bytes long, we shift
+// all-1's down by n bytes.
+// Or-ing the two parts together gives the mask.
+static really_inline
+m128 p_mask_gen(u8 m, u8 n) {
+    m128 a = ones128();
+    m128 b = ones128();
+    m %= 17; n %= 17;
+    m += (16 - n); m %= 17;
+
+    a = rshiftbyte_m128(a, n);
+    b = lshiftbyte_m128(b, m);
+    return or128(a, b);
+}
+
 static really_inline
 m128 vectoredLoad128(m128 *p_mask, const u8 *ptr, const size_t start_offset,
                      const u8 *lo, const u8 *hi,
@@ -161,13 +189,11 @@ m128 vectoredLoad128(m128 *p_mask, const u8 *ptr, const size_t start_offset,
         uintptr_t avail = (uintptr_t)(hi - ptr);
         if (avail >= 16) {
             assert(start_offset - start <= 16);
-            *p_mask = loadu128(p_mask_arr[16 - start_offset + start]
-                               + 16 - start_offset + start);
+            *p_mask = p_mask_gen(16 - start_offset + start, 16 - start_offset + start);
             return loadu128(ptr);
         }
         assert(start_offset - start <= avail);
-        *p_mask = loadu128(p_mask_arr[avail - start_offset + start]
-                           + 16 - start_offset + start);
+        *p_mask = p_mask_gen(avail - start_offset + start, 16 - start_offset + start);
         copy_start = 0;
         copy_len = avail;
     } else { // start zone
@@ -180,8 +206,7 @@ m128 vectoredLoad128(m128 *p_mask, const u8 *ptr, const size_t start_offset,
         }
         uintptr_t end = MIN(16, (uintptr_t)(hi - ptr));
         assert(start + start_offset <= end);
-        *p_mask = loadu128(p_mask_arr[end - start - start_offset]
-                           + 16 - start - start_offset);
+        *p_mask = p_mask_gen(end - start - start_offset, 16 - start - start_offset);
         copy_start = start;
         copy_len = end - start;
     }
@@ -270,6 +295,20 @@ void copyRuntBlock256(u8 *dst, const u8 *src, size_t len) {
 //          |----------|-------|----------------|............|
 //          0          start   start+offset     end(<=32)
 // p_mask   ffff.....ffffff..ff0000...........00ffff..........
+
+// Like p_mask_gen above, this replaces the large p_mask_arr256 table.
+static really_inline
+m256 fat_pmask_gen(u8 m, u8 n) {
+    m256 a = ones256();
+    m256 b = ones256();
+    m %= 33; n %= 33;
+    m += (32 - n); m %= 33;
+
+    a = rshift_byte_m256(a, n);
+    b = lshift_byte_m256(b, m);
+    return or256(a, b);
+}
+
 static really_inline
 m256 vectoredLoad256(m256 *p_mask, const u8 *ptr, const size_t start_offset,
                      const u8 *lo, const u8 *hi,
@@ -289,13 +328,11 @@ m256 vectoredLoad256(m256 *p_mask, const u8 *ptr, const size_t start_offset,
         uintptr_t avail = (uintptr_t)(hi - ptr);
         if (avail >= 32) {
             assert(start_offset - start <= 32);
-            *p_mask = loadu256(p_mask_arr256[32 - start_offset + start]
-                               + 32 - start_offset + start);
+            *p_mask = fat_pmask_gen(32 - start_offset + start, 32 - start_offset + start);
             return loadu256(ptr);
         }
         assert(start_offset - start <= avail);
-        *p_mask = loadu256(p_mask_arr256[avail - start_offset + start]
-                           + 32 - start_offset + start);
+        *p_mask = fat_pmask_gen(avail - start_offset + start, 32 - start_offset + start);
         copy_start = 0;
         copy_len = avail;
     } else { //start zone
@@ -308,8 +345,7 @@ m256 vectoredLoad256(m256 *p_mask, const u8 *ptr, const size_t start_offset,
         }
         uintptr_t end = MIN(32, (uintptr_t)(hi - ptr));
         assert(start + start_offset <= end);
-        *p_mask = loadu256(p_mask_arr256[end - start - start_offset]
-                           + 32 - start - start_offset);
+        *p_mask = fat_pmask_gen(end - start - start_offset, 32 - start - start_offset);
         copy_start = start;
         copy_len = end - start;
     }
@@ -428,8 +464,13 @@ void do_confWithBit_teddy(TEDDY_CONF_TYPE *conf, u8 bucket, u8 offset,
         if (!cf) {
             continue;
         }
+#ifdef __cplusplus
+        const struct FDRConfirm *fdrc = reinterpret_cast<const struct FDRConfirm *>
+                                        (reinterpret_cast<const u8 *>(confBase) + cf);
+#else
         const struct FDRConfirm *fdrc = (const struct FDRConfirm *)
                                         ((const u8 *)confBase + cf);
+#endif
         if (!(fdrc->groups & *control)) {
             continue;
         }
@@ -442,18 +483,31 @@ void do_confWithBit_teddy(TEDDY_CONF_TYPE *conf, u8 bucket, u8 offset,
 
 static really_inline
 const m128 *getMaskBase(const struct Teddy *teddy) {
+#ifdef __cplusplus
+    return reinterpret_cast<const m128 *>(reinterpret_cast<const u8 *>(teddy) + ROUNDUP_CL(sizeof(struct Teddy)));
+#else
     return (const m128 *)((const u8 *)teddy + ROUNDUP_CL(sizeof(struct Teddy)));
+#endif
 }
 
 static really_inline
 const u64a *getReinforcedMaskBase(const struct Teddy *teddy, u8 numMask) {
+#ifdef __cplusplus
+    return reinterpret_cast<const u64a *>(reinterpret_cast<const u8 *>(getMaskBase(teddy))
+                          + ROUNDUP_CL(2 * numMask * sizeof(m128)));
+#else
     return (const u64a *)((const u8 *)getMaskBase(teddy)
                           + ROUNDUP_CL(2 * numMask * sizeof(m128)));
+#endif
 }
 
 static really_inline
 const u32 *getConfBase(const struct Teddy *teddy) {
+#ifdef __cplusplus
+    return reinterpret_cast<const u32 *>(reinterpret_cast<const u8 *>(teddy) + teddy->confOffset);
+#else
     return (const u32 *)((const u8 *)teddy + teddy->confOffset);
+#endif
 }
 
 #endif /* TEDDY_RUNTIME_COMMON_H_ */
index e1b605586cd76813fd63a668014310a4f47976c6..c301f09f2300123ad8f5a842bea27270d058a4cf 100644 (file)
@@ -352,16 +352,20 @@ static really_really_inline
 m128 rshiftbyte_m128(m128 a, unsigned b) {
     if (b == 0) {
         return a;
+    } else if (b > 15) {
+        return zeroes128();
     }
     return palignr(zeroes128(), a, b);
 }
 
 static really_really_inline
 m128 lshiftbyte_m128(m128 a, unsigned b) {
     if (b == 0) {
         return a;
+    } else if (b > 15) {
+        return zeroes128();
     }
     return palignr(a, zeroes128(), 16 - b);
 }
 
 static really_inline
index 1fc158009339588dad53292a913a8416d4da6db2..e39c791be819ef7172f385b020ffa368f22ba3cb 100644 (file)
@@ -42,6 +42,7 @@
 
 #include <string.h> // for memcpy
 
+
 #define ZEROES_8 0, 0, 0, 0, 0, 0, 0, 0
 #define ZEROES_31 ZEROES_8, ZEROES_8, ZEROES_8, 0, 0, 0, 0, 0, 0, 0
 #define ZEROES_32 ZEROES_8, ZEROES_8, ZEROES_8, ZEROES_8
@@ -178,13 +179,11 @@ m128 load_m128_from_u64a(const u64a *p) {
 
 #define CASE_RSHIFT_VECTOR(a, count)  case count: return _mm_srli_si128((m128)(a), (count)); break;
 
+// We encounter cases where an argument slips past __builtin_constant_p but
+// still fails to meet the (stricter) immediate-operand criteria demanded by
+// the underlying intrinsic; these *_nim fallbacks avoid that path entirely.
 static really_inline
-m128 rshiftbyte_m128(const m128 a, int count_immed) {
-#if defined(HAVE__BUILTIN_CONSTANT_P) && !defined(VS_SIMDE_BACKEND)
-    if (__builtin_constant_p(count_immed)) {
-        return _mm_srli_si128(a, count_immed);
-    }
-#endif
+m128 rshiftbyte_m128_nim(const m128 a, int count_immed) {
     switch (count_immed) {
     case 0: return a; break;
     CASE_RSHIFT_VECTOR(a, 1);
@@ -205,17 +204,26 @@ m128 rshiftbyte_m128(const m128 a, int count_immed) {
     default: return zeroes128(); break;
     }
 }
-#undef CASE_RSHIFT_VECTOR
-
-#define CASE_LSHIFT_VECTOR(a, count)  case count: return _mm_slli_si128((m128)(a), (count)); break;
 
 static really_inline
-m128 lshiftbyte_m128(const m128 a, int count_immed) {
+m128 rshiftbyte_m128(const m128 a, int count_immed) {
 #if defined(HAVE__BUILTIN_CONSTANT_P) && !defined(VS_SIMDE_BACKEND)
     if (__builtin_constant_p(count_immed)) {
-        return _mm_slli_si128(a, count_immed);
+        return _mm_srli_si128(a, count_immed);
     }
 #endif
+    return rshiftbyte_m128_nim(a, count_immed);
+}
+
+#undef CASE_RSHIFT_VECTOR
+
+#define CASE_LSHIFT_VECTOR(a, count)  case count: return _mm_slli_si128((m128)(a), (count)); break;
+
+// We encounter cases where an argument slips past __builtin_constant_p but
+// still fails to meet the (stricter) immediate-operand criteria demanded by
+// the underlying intrinsic; these *_nim fallbacks avoid that path entirely.
+static really_inline
+m128 lshiftbyte_m128_nim(const m128 a, int count_immed) {
     switch (count_immed) {
     case 0: return a; break;
     CASE_LSHIFT_VECTOR(a, 1);
@@ -236,6 +244,16 @@ m128 lshiftbyte_m128(const m128 a, int count_immed) {
     default: return zeroes128(); break;
     }
 }
+
+static really_inline
+m128 lshiftbyte_m128(const m128 a, int count_immed) {
+#if defined(HAVE__BUILTIN_CONSTANT_P) && !defined(VS_SIMDE_BACKEND)
+    if (__builtin_constant_p(count_immed)) {
+        return _mm_slli_si128(a, count_immed);
+    }
+#endif
+    return lshiftbyte_m128_nim(a, count_immed);
+}
 #undef CASE_LSHIFT_VECTOR
 
 #if defined(HAVE_SSE41)
@@ -500,6 +518,59 @@
     return rv;
 }
 
+// byte-granularity shifts of the whole 256 bits as a single chunk;
+// these call the *_nim variants directly since the shift count is a
+// runtime value here
+static really_inline m256 lshift_byte_m256(m256 v, u8 n) {
+    if (n == 0) return v;
+    union {
+        m128 val128[2];
+        m256 val256;
+    } u;
+    u.val256 = v;
+    if (n < 16) {
+        // carry the top bytes of the low lane into the high lane
+        m128 c = rshiftbyte_m128_nim(u.val128[0], 16 - n);
+        u.val128[1] = or128(c, lshiftbyte_m128_nim(u.val128[1], n));
+        u.val128[0] = lshiftbyte_m128_nim(u.val128[0], n);
+        return u.val256;
+    } else if (n == 16) {
+        u.val128[1] = u.val128[0];
+        u.val128[0] = zeroes128();
+        return u.val256;
+    } else if (n < 32) {
+        u.val128[1] = lshiftbyte_m128_nim(u.val128[0], n - 16);
+        u.val128[0] = zeroes128();
+        return u.val256;
+    }
+    return zeroes256();
+}
+
+static really_inline m256 rshift_byte_m256(m256 v, u8 n) {
+    if (n == 0) return v;
+    union {
+        m128 val128[2];
+        m256 val256;
+    } u;
+    u.val256 = v;
+    if (n < 16) {
+        // carry the bottom bytes of the high lane into the low lane
+        m128 c = lshiftbyte_m128_nim(u.val128[1], 16 - n);
+        u.val128[0] = or128(c, rshiftbyte_m128_nim(u.val128[0], n));
+        u.val128[1] = rshiftbyte_m128_nim(u.val128[1], n);
+        return u.val256;
+    } else if (n == 16) {
+        u.val128[0] = u.val128[1];
+        u.val128[1] = zeroes128();
+        return u.val256;
+    } else if (n < 32) {
+        u.val128[0] = rshiftbyte_m128_nim(u.val128[1], n - 16);
+        u.val128[1] = zeroes128();
+        return u.val256;
+    }
+    return zeroes256();
+}
+
 static really_inline m256 add256(m256 a, m256 b) {
     return _mm256_add_epi64(a, b);
 }