remove the use of macros for critical loops, easier to debug

author Konstantinos Margaritis <konstantinos@vectorcamp.gr>

Tue, 12 Dec 2023 16:16:13 +0000 (18:16 +0200)

committer Konstantinos Margaritis <konstantinos@vectorcamp.gr>

Wed, 29 Oct 2025 22:05:40 +0000 (00:05 +0200)
author Konstantinos Margaritis <konstantinos@vectorcamp.gr>
Tue, 12 Dec 2023 16:16:13 +0000 (18:16 +0200)
committer Konstantinos Margaritis <konstantinos@vectorcamp.gr>
Wed, 29 Oct 2025 22:05:40 +0000 (00:05 +0200)
diff --git a/src/fdr/arm/fdr_impl.h b/src/fdr/arm/fdr_impl.h

new file mode 100644 (file)

index 0000000..44f95d6
--- /dev/null
+++ b/src/fdr/arm/fdr_impl.h
@@ -0,0 +1,196 @@
+/*
+ * Copyright (c) 2015-2017, Intel Corporation
+ * Copyright (c) 2020-2025, VectorCamp PC
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *
+ *  * Redistributions of source code must retain the above copyright notice,
+ *    this list of conditions and the following disclaimer.
+ *  * Redistributions in binary form must reproduce the above copyright
+ *    notice, this list of conditions and the following disclaimer in the
+ *    documentation and/or other materials provided with the distribution.
+ *  * Neither the name of Intel Corporation nor the names of its contributors
+ *    may be used to endorse or promote products derived from this software
+ *    without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
+ * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+ * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+ * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
+ * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
+ * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+ * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
+ * POSSIBILITY OF SUCH DAMAGE.
+ */
+
+#ifndef FDR_IMPL_ARM_H
+#define FDR_IMPL_ARM_H
+
+static really_inline
+void get_conf_stride(const u8 *itPtr, UNUSED const u8 *start_ptr,
+                       UNUSED const u8 *end_ptr, u32 domain_mask, u8 stride,
+                       const u64a *ft, u64a *conf0, u64a *conf8, m128 *s) {
+    assert(itPtr >= start_ptr && itPtr + ITER_BYTES <= end_ptr);
+    // Fills *conf0 and *conf8 for the ITER_BYTES window at itPtr and updates the state *s in place.
+    // get_conf_stride_4: lookups keyed by reach0/4/8/12; this part runs for every stride value
+    u64a it_hi = *(const u64a *)itPtr; // 8 bytes at itPtr (NOTE(review): name says "hi" but this is the first half)
+    u64a it_lo = *(const u64a *)(itPtr + 8); // 8 bytes at itPtr + 8
+    u64a reach0  = domain_mask & it_hi;
+    u64a reach4  = domain_mask & (it_hi >> 32); // NOTE(review): >> selects later bytes only for little-endian loads - confirm
+    u64a reach8  = domain_mask & it_lo;
+    u64a reach12 = domain_mask & (it_lo >> 32);
+
+    m128 st0 = load_m128_from_u64a(ft + reach0); // each reachN is used as an index into ft
+    m128 st4 = load_m128_from_u64a(ft + reach4);
+    m128 st8 = load_m128_from_u64a(ft + reach8);
+    m128 st12 = load_m128_from_u64a(ft + reach12);
+
+    st4 = lshiftbyte_m128(st4, 4); // byte shift = reach index modulo 8 (same rule for every stN below)
+    st12 = lshiftbyte_m128(st12, 4);
+
+    *s = or128(*s, st0); // fold into the state passed in by the caller
+    *s = or128(*s, st4);
+
+    if (stride == 4) { // only reach0/4/8/12 contribute
+        *conf0 = movq(*s); // presumably the low 64 bits of *s - confirm movq semantics
+        *s = rshiftbyte_m128(*s, 8); // shift the state by 8 bytes before adding the second half
+        *conf0 ^= ~0ULL; // invert all 64 bits
+
+        *s = or128(*s, st8);
+        *s = or128(*s, st12);
+        *conf8 = movq(*s);
+        *s = rshiftbyte_m128(*s, 8);
+        *conf8 ^= ~0ULL;
+        return;
+    }
+
+    // get_conf_stride_2: additionally use reach2/6/10/14
+    u64a reach2  = domain_mask & (it_hi >> 16);
+    u64a reach6  = domain_mask & (it_hi >> 48);
+    u64a reach10 = domain_mask & (it_lo >> 16);
+    u64a reach14 = domain_mask & (it_lo >> 48);
+    
+    m128 st2 = load_m128_from_u64a(ft + reach2);
+    m128 st6 = load_m128_from_u64a(ft + reach6);
+    m128 st10 = load_m128_from_u64a(ft + reach10);
+    m128 st14 = load_m128_from_u64a(ft + reach14);
+
+    st2  = lshiftbyte_m128(st2, 2);
+    st6  = lshiftbyte_m128(st6, 6);
+    st10 = lshiftbyte_m128(st10, 2);
+    st14 = lshiftbyte_m128(st14, 6);
+
+    *s = or128(*s, st2);
+    *s = or128(*s, st6);
+
+    if (stride == 2) { // even reach indices only
+        *conf0 = movq(*s);
+        *s = rshiftbyte_m128(*s, 8);
+        *conf0 ^= ~0ULL;
+
+        *s = or128(*s, st8);
+        *s = or128(*s, st10);
+        *s = or128(*s, st12);
+        *s = or128(*s, st14);
+
+        *conf8 = movq(*s);
+        *s = rshiftbyte_m128(*s, 8);
+        *conf8 ^= ~0ULL;
+        return;
+    }
+
+    // get_conf_stride_1: reached for any stride other than 4 or 2; adds every odd reach index
+    u64a reach1  = domain_mask & (it_hi >> 8);
+    u64a reach3  = domain_mask & (it_hi >> 24);
+    u64a reach5  = domain_mask & (it_hi >> 40);
+    u64a reach7  = domain_mask & ((it_hi >> 56) | (it_lo << 8)); // top byte of it_hi joined with the low bytes of it_lo
+    u64a reach9  = domain_mask & (it_lo >> 8);
+    u64a reach11 = domain_mask & (it_lo >> 24);
+    u64a reach13 = domain_mask & (it_lo >> 40);
+    u64a reach15 = domain_mask & unaligned_load_u32(itPtr + 15); // NOTE(review): load starts at the window's last byte, so it presumably reads past itPtr + ITER_BYTES, which the assert above does not cover - confirm
+
+    m128 st1 = load_m128_from_u64a(ft + reach1);
+    m128 st3 = load_m128_from_u64a(ft + reach3);
+    m128 st5 = load_m128_from_u64a(ft + reach5);
+    m128 st7 = load_m128_from_u64a(ft + reach7);
+    m128 st9 = load_m128_from_u64a(ft + reach9);
+    m128 st11 = load_m128_from_u64a(ft + reach11);
+    m128 st13 = load_m128_from_u64a(ft + reach13);
+    m128 st15 = load_m128_from_u64a(ft + reach15);
+
+    st1 = lshiftbyte_m128(st1, 1);
+    st3 = lshiftbyte_m128(st3, 3);
+    st5 = lshiftbyte_m128(st5, 5);
+    st7 = lshiftbyte_m128(st7, 7);
+    st9 = lshiftbyte_m128(st9, 1);
+    st11 = lshiftbyte_m128(st11, 3);
+    st13 = lshiftbyte_m128(st13, 5);
+    st15 = lshiftbyte_m128(st15, 7);
+
+    st0 = or128(st0, st1); // pairwise OR-reduction of st0..st7 into st0
+    st2 = or128(st2, st3);
+    st4 = or128(st4, st5);
+    st6 = or128(st6, st7);
+    st0 = or128(st0, st2);
+    st4 = or128(st4, st6);
+    st0 = or128(st0, st4);
+
+    st8 = or128(st8, st9); // pairwise OR-reduction of st8..st15 into st8
+    st10 = or128(st10, st11);
+    st12 = or128(st12, st13);
+    st14 = or128(st14, st15);
+    st8 = or128(st8, st10);
+    st12 = or128(st12, st14);
+    st8 = or128(st8, st12);
+
+    m128 st = or128(*s, st0); // *s already holds st0, st2, st4 and st6 from above; OR-ing them again changes nothing
+    *conf0 = movq(st) ^ ~0ULL;
+    st = rshiftbyte_m128(st, 8);
+    st = or128(st, st8);
+
+    *conf8 = movq(st) ^ ~0ULL;
+    *s = rshiftbyte_m128(st, 8); // state handed back to the caller through *s
+}
+
+static really_inline
+void do_confirm_fdr(u64a *conf, u8 offset, hwlmcb_rv_t *control,
+                    const u32 *confBase, const struct FDR_Runtime_Args *a,
+                    const u8 *ptr, u32 *last_match_id, const struct zone *z) {
+    const u8 bucket = 8; // conf bit / bucket selects the byte, bit % bucket the confBase slot
+
+    if (likely(!*conf)) { // no bits set in *conf: nothing to confirm
+        return;
+    }
+
+    /* ptr is currently referring to a location in the zone's buffer, we also
+     * need a pointer in the original, main buffer for the final string compare.
+     */
+    const u8 *ptr_main = (const u8 *)((uintptr_t)ptr + z->zone_pointer_adjust); //NOLINT (performance-no-int-to-ptr)
+
+    const u8 *confLoc = ptr;
+
+    do  {
+        u32 bit = findAndClearLSB_64(conf); // presumably returns the lowest set bit of *conf and clears it - confirm
+        u32 byte = bit / bucket + offset;
+        u32 bitRem = bit % bucket;
+        u32 idx = bitRem;
+        u32 cf = confBase[idx]; // byte offset from confBase to this slot's FDRConfirm; 0 means none
+        if (!cf) {
+            continue; // in a do-while this jumps to the loop condition below
+        }
+        const struct FDRConfirm *fdrc = (const struct FDRConfirm *)
+                                        ((const u8 *)confBase + cf);
+        if (!(fdrc->groups & *control)) { // no overlap between fdrc->groups and *control: skip
+            continue;
+        }
+        u64a confVal = unaligned_load_u64a(confLoc + byte - sizeof(u64a) + 1); // load begins sizeof(u64a) - 1 bytes before confLoc + byte
+        confWithBit(fdrc, a, ptr_main - a->buf + byte, control,
+                    last_match_id, confVal, conf, bit); // position argument is expressed relative to a->buf
+    } while (unlikely(!!*conf)); // repeat until *conf is zero
+}
+
+#endif // FDR_IMPL_ARM_H
+\ No newline at end of file
diff --git a/src/fdr/fdr.c b/src/fdr/fdr.c

index 76c4a74317f972cfbab0e4421ac17792a81e92d9..fad561fe65348fc80d34d1f39f1b455994cf61b6 100644 (file)
--- a/src/fdr/fdr.c
+++ b/src/fdr/fdr.c
@@ -1,6 +1,6 @@
  /*
   * Copyright (c) 2015-2017, Intel Corporation
- * Copyright (c) 2020-2024, VectorCamp PC
+ * Copyright (c) 2020-2025, VectorCamp PC
   *
   * Redistribution and use in source and binary forms, with or without
   * modification, are permitted provided that the following conditions are met:
@@ -32,6 +32,7 @@
  #include "fdr_confirm_runtime.h"
  #include "fdr_internal.h"
  #include "fdr_loadval.h"
+#include "fdr_impl.h"
  #include "flood_runtime.h"
  #include "scratch.h"
  #include "teddy.h"
@@ -41,86 +42,6 @@
  #include "util/simd_utils.h"
  #include "util/uniform_ops.h"
  
-/** \brief number of bytes processed in each iteration */
-#define ITER_BYTES          16
-
-/** \brief total zone buffer size */
-#define ZONE_TOTAL_SIZE     64
-
-/** \brief maximum number of allowed zones */
-#define ZONE_MAX            3
-
-/** \brief zone information.
- *
- * Zone represents a region of data to scan in FDR.
- *
- * The incoming buffer is to split in multiple zones to ensure two properties:
- * 1: that we can read 8? bytes behind to generate a hash safely
- * 2: that we can read the 3 byte after the current byte (domain > 8)
- */
-struct zone {
-    /** \brief copied buffer, used only when it is a boundary zone. */
-    u8 ALIGN_CL_DIRECTIVE buf[ZONE_TOTAL_SIZE];
-
-    /** \brief shift amount for fdr state to avoid unwanted match. */
-    u8 shift;
-
-    /** \brief if boundary zone, start points into the zone buffer after the
-     * pre-padding. Otherwise, points to the main buffer, appropriately. */
-    const u8 *start;
-
-    /** \brief if boundary zone, end points to the end of zone. Otherwise,
-     * pointer to the main buffer, appropriately. */
-    const u8 *end;
-
-    /** \brief the amount to adjust to go from a pointer in the zones region
-     * (between start and end) to a pointer in the original data buffer. */
-    ptrdiff_t zone_pointer_adjust;
-
-    /** \brief firstFloodDetect from FDR_Runtime_Args for non-boundary zones,
-     * otherwise end of the zone buf. floodPtr always points inside the same
-     * buffer as the start pointe. */
-    const u8 *floodPtr;
-};
-
-static
-const ALIGN_CL_DIRECTIVE u8 zone_or_mask[ITER_BYTES+1][ITER_BYTES] = {
-    { 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
-      0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00 },
-    { 0xff, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
-      0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00 },
-    { 0xff, 0xff, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
-      0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00 },
-    { 0xff, 0xff, 0xff, 0x00, 0x00, 0x00, 0x00, 0x00,
-      0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00 },
-    { 0xff, 0xff, 0xff, 0xff, 0x00, 0x00, 0x00, 0x00,
-      0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00 },
-    { 0xff, 0xff, 0xff, 0xff, 0xff, 0x00, 0x00, 0x00,
-      0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00 },
-    { 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0x00, 0x00,
-      0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00 },
-    { 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0x00,
-      0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00 },
-    { 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
-      0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00 },
-    { 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
-      0xff, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00 },
-    { 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
-      0xff, 0xff, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00 },
-    { 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
-      0xff, 0xff, 0xff, 0x00, 0x00, 0x00, 0x00, 0x00 },
-    { 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
-      0xff, 0xff, 0xff, 0xff, 0x00, 0x00, 0x00, 0x00 },
-    { 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
-      0xff, 0xff, 0xff, 0xff, 0xff, 0x00, 0x00, 0x00 },
-    { 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
-      0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0x00, 0x00 },
-    { 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
-      0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0x00 },
-    { 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
-      0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00 }
-};
-
  /* generates an initial state mask based on the last byte-ish of history rather
   * than being all accepting. If there is no history to consider, the state is
   * generated based on the minimum length of each bucket in order to prevent
@@ -142,169 +63,6 @@ m128 getInitState(const struct FDR *fdr, u8 len_history, const u64a *ft,
      return s;
  }
  
-static really_inline
-void get_conf_stride(const u8 *itPtr, UNUSED const u8 *start_ptr,
-                       UNUSED const u8 *end_ptr, u32 domain_mask, u8 stride,
-                       const u64a *ft, u64a *conf0, u64a *conf8, m128 *s) {
-    assert(itPtr >= start_ptr && itPtr + ITER_BYTES <= end_ptr);
-
-    // get_conf_stride_4
-    u64a it_hi = *(const u64a *)itPtr;
-    u64a it_lo = *(const u64a *)(itPtr + 8);
-    u64a reach0  = domain_mask & it_hi;
-    u64a reach4  = domain_mask & (it_hi >> 32);
-    u64a reach8  = domain_mask & it_lo;
-    u64a reach12 = domain_mask & (it_lo >> 32);
-
-    m128 st0 = load_m128_from_u64a(ft + reach0);
-    m128 st4 = load_m128_from_u64a(ft + reach4);
-    m128 st8 = load_m128_from_u64a(ft + reach8);
-    m128 st12 = load_m128_from_u64a(ft + reach12);
-
-    st4 = lshiftbyte_m128(st4, 4);
-    st12 = lshiftbyte_m128(st12, 4);
-
-    *s = or128(*s, st0);
-    *s = or128(*s, st4);
-
-    if (stride == 4) {
-        *conf0 = movq(*s);
-        *s = rshiftbyte_m128(*s, 8);
-        *conf0 ^= ~0ULL;
-
-        *s = or128(*s, st8);
-        *s = or128(*s, st12);
-        *conf8 = movq(*s);
-        *s = rshiftbyte_m128(*s, 8);
-        *conf8 ^= ~0ULL;
-        return;
-    }
-
-    // get_conf_stride_2
-    u64a reach2  = domain_mask & (it_hi >> 16);
-    u64a reach6  = domain_mask & (it_hi >> 48);
-    u64a reach10 = domain_mask & (it_lo >> 16);
-    u64a reach14 = domain_mask & (it_lo >> 48);
-    
-    m128 st2 = load_m128_from_u64a(ft + reach2);
-    m128 st6 = load_m128_from_u64a(ft + reach6);
-    m128 st10 = load_m128_from_u64a(ft + reach10);
-    m128 st14 = load_m128_from_u64a(ft + reach14);
-
-    st2  = lshiftbyte_m128(st2, 2);
-    st6  = lshiftbyte_m128(st6, 6);
-    st10 = lshiftbyte_m128(st10, 2);
-    st14 = lshiftbyte_m128(st14, 6);
-
-    *s = or128(*s, st2);
-    *s = or128(*s, st6);
-
-    if (stride == 2) {
-        *conf0 = movq(*s);
-        *s = rshiftbyte_m128(*s, 8);
-        *conf0 ^= ~0ULL;
-
-        *s = or128(*s, st8);
-        *s = or128(*s, st10);
-        *s = or128(*s, st12);
-        *s = or128(*s, st14);
-
-        *conf8 = movq(*s);
-        *s = rshiftbyte_m128(*s, 8);
-        *conf8 ^= ~0ULL;
-        return;
-    }
-
-    // get_conf_stride_1
-    u64a reach1  = domain_mask & (it_hi >> 8);
-    u64a reach3  = domain_mask & (it_hi >> 24);
-    u64a reach5  = domain_mask & (it_hi >> 40);
-    u64a reach7  = domain_mask & ((it_hi >> 56) | (it_lo << 8));
-    u64a reach9  = domain_mask & (it_lo >> 8);
-    u64a reach11 = domain_mask & (it_lo >> 24);
-    u64a reach13 = domain_mask & (it_lo >> 40);
-    u64a reach15 = domain_mask & unaligned_load_u32(itPtr + 15);
-
-    m128 st1 = load_m128_from_u64a(ft + reach1);
-    m128 st3 = load_m128_from_u64a(ft + reach3);
-    m128 st5 = load_m128_from_u64a(ft + reach5);
-    m128 st7 = load_m128_from_u64a(ft + reach7);
-    m128 st9 = load_m128_from_u64a(ft + reach9);
-    m128 st11 = load_m128_from_u64a(ft + reach11);
-    m128 st13 = load_m128_from_u64a(ft + reach13);
-    m128 st15 = load_m128_from_u64a(ft + reach15);
-
-    st1 = lshiftbyte_m128(st1, 1);
-    st3 = lshiftbyte_m128(st3, 3);
-    st5 = lshiftbyte_m128(st5, 5);
-    st7 = lshiftbyte_m128(st7, 7);
-    st9 = lshiftbyte_m128(st9, 1);
-    st11 = lshiftbyte_m128(st11, 3);
-    st13 = lshiftbyte_m128(st13, 5);
-    st15 = lshiftbyte_m128(st15, 7);
-
-    st0 = or128(st0, st1);
-    st2 = or128(st2, st3);
-    st4 = or128(st4, st5);
-    st6 = or128(st6, st7);
-    st0 = or128(st0, st2);
-    st4 = or128(st4, st6);
-    st0 = or128(st0, st4);
-
-    st8 = or128(st8, st9);
-    st10 = or128(st10, st11);
-    st12 = or128(st12, st13);
-    st14 = or128(st14, st15);
-    st8 = or128(st8, st10);
-    st12 = or128(st12, st14);
-    st8 = or128(st8, st12);
-
-    m128 st = or128(*s, st0);
-    *conf0 = movq(st) ^ ~0ULL;
-    st = rshiftbyte_m128(st, 8);
-    st = or128(st, st8);
-
-    *conf8 = movq(st) ^ ~0ULL;
-    *s = rshiftbyte_m128(st, 8);
-}
-
-static really_inline
-void do_confirm_fdr(u64a *conf, u8 offset, hwlmcb_rv_t *control,
-                    const u32 *confBase, const struct FDR_Runtime_Args *a,
-                    const u8 *ptr, u32 *last_match_id, const struct zone *z) {
-    const u8 bucket = 8;
-
-    if (likely(!*conf)) {
-        return;
-    }
-
-    /* ptr is currently referring to a location in the zone's buffer, we also
-     * need a pointer in the original, main buffer for the final string compare.
-     */
-    const u8 *ptr_main = (const u8 *)((uintptr_t)ptr + z->zone_pointer_adjust); //NOLINT (performance-no-int-to-ptr)
-
-    const u8 *confLoc = ptr;
-
-    do  {
-        u32 bit = findAndClearLSB_64(conf);
-        u32 byte = bit / bucket + offset;
-        u32 bitRem = bit % bucket;
-        u32 idx = bitRem;
-        u32 cf = confBase[idx];
-        if (!cf) {
-            continue;
-        }
-        const struct FDRConfirm *fdrc = (const struct FDRConfirm *)
-                                        ((const u8 *)confBase + cf);
-        if (!(fdrc->groups & *control)) {
-            continue;
-        }
-        u64a confVal = unaligned_load_u64a(confLoc + byte - sizeof(u64a) + 1);
-        confWithBit(fdrc, a, ptr_main - a->buf + byte, control,
-                    last_match_id, confVal, conf, bit);
-    } while (unlikely(!!*conf));
-}
-
  static really_inline
  void dumpZoneInfo(UNUSED const struct zone *z, UNUSED size_t zone_id) {
  #ifdef DEBUG
diff --git a/src/fdr/fdr.h b/src/fdr/fdr.h

index 4dcef851d62dec8c9188325d3ea31f92fc0fd50e..f6cf8f90e39370ced47e7abd9022788086d8303c 100644 (file)
--- a/src/fdr/fdr.h
+++ b/src/fdr/fdr.h
@@ -1,5 +1,6 @@
  /*
   * Copyright (c) 2015-2017, Intel Corporation
+ * Copyright (c) 2020-2025, VectorCamp PC
   *
   * Redistribution and use in source and binary forms, with or without
   * modification, are permitted provided that the following conditions are met:
diff --git a/src/fdr/fdr_impl.h b/src/fdr/fdr_impl.h

new file mode 100644 (file)

index 0000000..f7b755b
--- /dev/null
+++ b/src/fdr/fdr_impl.h
@@ -0,0 +1,119 @@
+/*
+ * Copyright (c) 2020-2025, VectorCamp PC
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *
+ *  * Redistributions of source code must retain the above copyright notice,
+ *    this list of conditions and the following disclaimer.
+ *  * Redistributions in binary form must reproduce the above copyright
+ *    notice, this list of conditions and the following disclaimer in the
+ *    documentation and/or other materials provided with the distribution.
+ *  * Neither the name of Intel Corporation nor the names of its contributors
+ *    may be used to endorse or promote products derived from this software
+ *    without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
+ * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+ * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+ * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
+ * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
+ * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+ * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
+ * POSSIBILITY OF SUCH DAMAGE.
+ */
+
+/** \brief number of bytes processed in each iteration */
+#define ITER_BYTES          16
+
+/** \brief total zone buffer size */
+#define ZONE_TOTAL_SIZE     64
+
+/** \brief maximum number of allowed zones */
+#define ZONE_MAX            3
+
+/** \brief zone information.
+ *
+ * Zone represents a region of data to scan in FDR.
+ *
+ * The incoming buffer is split into multiple zones to ensure two properties:
+ * 1: that we can read 8? bytes behind to generate a hash safely
+ * 2: that we can read the 3 bytes after the current byte (domain > 8)
+ */
+struct zone {
+    /** \brief copied buffer, used only when it is a boundary zone. */
+    u8 ALIGN_CL_DIRECTIVE buf[ZONE_TOTAL_SIZE];
+
+    /** \brief shift amount for fdr state to avoid unwanted match. */
+    u8 shift;
+
+    /** \brief if boundary zone, start points into the zone buffer after the
+     * pre-padding. Otherwise, points to the main buffer, appropriately. */
+    const u8 *start;
+
+    /** \brief if boundary zone, end points to the end of zone. Otherwise,
+     * pointer to the main buffer, appropriately. */
+    const u8 *end;
+
+    /** \brief the amount to adjust to go from a pointer in the zones region
+     * (between start and end) to a pointer in the original data buffer. */
+    ptrdiff_t zone_pointer_adjust;
+
+    /** \brief firstFloodDetect from FDR_Runtime_Args for non-boundary zones,
+     * otherwise end of the zone buf. floodPtr always points inside the same
+     * buffer as the start pointer. */
+    const u8 *floodPtr;
+};
+
+static
+const ALIGN_CL_DIRECTIVE u8 zone_or_mask[ITER_BYTES+1][ITER_BYTES] = { // row i (i = 0..15): first i bytes 0xff, rest 0x00; final row 16: all 0x00
+    { 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
+      0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00 },
+    { 0xff, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
+      0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00 },
+    { 0xff, 0xff, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
+      0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00 },
+    { 0xff, 0xff, 0xff, 0x00, 0x00, 0x00, 0x00, 0x00,
+      0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00 },
+    { 0xff, 0xff, 0xff, 0xff, 0x00, 0x00, 0x00, 0x00,
+      0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00 },
+    { 0xff, 0xff, 0xff, 0xff, 0xff, 0x00, 0x00, 0x00,
+      0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00 },
+    { 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0x00, 0x00,
+      0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00 },
+    { 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0x00,
+      0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00 },
+    { 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
+      0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00 },
+    { 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
+      0xff, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00 },
+    { 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
+      0xff, 0xff, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00 },
+    { 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
+      0xff, 0xff, 0xff, 0x00, 0x00, 0x00, 0x00, 0x00 },
+    { 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
+      0xff, 0xff, 0xff, 0xff, 0x00, 0x00, 0x00, 0x00 },
+    { 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
+      0xff, 0xff, 0xff, 0xff, 0xff, 0x00, 0x00, 0x00 },
+    { 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
+      0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0x00, 0x00 },
+    { 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
+      0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0x00 },
+    { 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
+      0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00 }
+};
+
+#if defined(VS_SIMDE_BACKEND) // SIMDe builds take the x86 implementation regardless of ARCH_*
+#include "x86/fdr_impl.h"
+#else
+#if defined(ARCH_IA32) || defined(ARCH_X86_64)
+#include "x86/fdr_impl.h"
+#elif defined(ARCH_ARM32) || defined(ARCH_AARCH64)
+#include "arm/fdr_impl.h"
+#elif defined(ARCH_PPC64EL)
+#include "ppc64le/fdr_impl.h"
+#endif // NOTE(review): no #else branch - when no ARCH_* macro matches, nothing is included here
+#endif // VS_SIMDE_BACKEND
+\ No newline at end of file
diff --git a/src/fdr/ppc64le/fdr_impl.h b/src/fdr/ppc64le/fdr_impl.h

new file mode 100644 (file)

index 0000000..a6aedb5
--- /dev/null
+++ b/src/fdr/ppc64le/fdr_impl.h
@@ -0,0 +1,196 @@
+/*
+ * Copyright (c) 2015-2017, Intel Corporation
+ * Copyright (c) 2020-2025, VectorCamp PC
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *
+ *  * Redistributions of source code must retain the above copyright notice,
+ *    this list of conditions and the following disclaimer.
+ *  * Redistributions in binary form must reproduce the above copyright
+ *    notice, this list of conditions and the following disclaimer in the
+ *    documentation and/or other materials provided with the distribution.
+ *  * Neither the name of Intel Corporation nor the names of its contributors
+ *    may be used to endorse or promote products derived from this software
+ *    without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
+ * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+ * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+ * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
+ * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
+ * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+ * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
+ * POSSIBILITY OF SUCH DAMAGE.
+ */
+
+#ifndef FDR_IMPL_PPC64LE_H
+#define FDR_IMPL_PPC64LE_H
+
+static really_inline
+void get_conf_stride(const u8 *itPtr, UNUSED const u8 *start_ptr,
+                       UNUSED const u8 *end_ptr, u32 domain_mask, u8 stride,
+                       const u64a *ft, u64a *conf0, u64a *conf8, m128 *s) {
+    assert(itPtr >= start_ptr && itPtr + ITER_BYTES <= end_ptr);
+    // Fills *conf0 and *conf8 for the ITER_BYTES window at itPtr and updates the state *s in place.
+    // get_conf_stride_4: lookups keyed by reach0/4/8/12; this part runs for every stride value
+    u64a it_hi = *(const u64a *)itPtr; // 8 bytes at itPtr (NOTE(review): name says "hi" but this is the first half)
+    u64a it_lo = *(const u64a *)(itPtr + 8); // 8 bytes at itPtr + 8
+    u64a reach0  = domain_mask & it_hi;
+    u64a reach4  = domain_mask & (it_hi >> 32); // NOTE(review): >> selects later bytes only for little-endian loads - confirm
+    u64a reach8  = domain_mask & it_lo;
+    u64a reach12 = domain_mask & (it_lo >> 32);
+
+    m128 st0 = load_m128_from_u64a(ft + reach0); // each reachN is used as an index into ft
+    m128 st4 = load_m128_from_u64a(ft + reach4);
+    m128 st8 = load_m128_from_u64a(ft + reach8);
+    m128 st12 = load_m128_from_u64a(ft + reach12);
+
+    st4 = lshiftbyte_m128(st4, 4); // byte shift = reach index modulo 8 (same rule for every stN below)
+    st12 = lshiftbyte_m128(st12, 4);
+
+    *s = or128(*s, st0); // fold into the state passed in by the caller
+    *s = or128(*s, st4);
+
+    if (stride == 4) { // only reach0/4/8/12 contribute
+        *conf0 = movq(*s); // presumably the low 64 bits of *s - confirm movq semantics
+        *s = rshiftbyte_m128(*s, 8); // shift the state by 8 bytes before adding the second half
+        *conf0 ^= ~0ULL; // invert all 64 bits
+
+        *s = or128(*s, st8);
+        *s = or128(*s, st12);
+        *conf8 = movq(*s);
+        *s = rshiftbyte_m128(*s, 8);
+        *conf8 ^= ~0ULL;
+        return;
+    }
+
+    // get_conf_stride_2: additionally use reach2/6/10/14
+    u64a reach2  = domain_mask & (it_hi >> 16);
+    u64a reach6  = domain_mask & (it_hi >> 48);
+    u64a reach10 = domain_mask & (it_lo >> 16);
+    u64a reach14 = domain_mask & (it_lo >> 48);
+    
+    m128 st2 = load_m128_from_u64a(ft + reach2);
+    m128 st6 = load_m128_from_u64a(ft + reach6);
+    m128 st10 = load_m128_from_u64a(ft + reach10);
+    m128 st14 = load_m128_from_u64a(ft + reach14);
+
+    st2  = lshiftbyte_m128(st2, 2);
+    st6  = lshiftbyte_m128(st6, 6);
+    st10 = lshiftbyte_m128(st10, 2);
+    st14 = lshiftbyte_m128(st14, 6);
+
+    *s = or128(*s, st2);
+    *s = or128(*s, st6);
+
+    if (stride == 2) { // even reach indices only
+        *conf0 = movq(*s);
+        *s = rshiftbyte_m128(*s, 8);
+        *conf0 ^= ~0ULL;
+
+        *s = or128(*s, st8);
+        *s = or128(*s, st10);
+        *s = or128(*s, st12);
+        *s = or128(*s, st14);
+
+        *conf8 = movq(*s);
+        *s = rshiftbyte_m128(*s, 8);
+        *conf8 ^= ~0ULL;
+        return;
+    }
+
+    // get_conf_stride_1: reached for any stride other than 4 or 2; adds every odd reach index
+    u64a reach1  = domain_mask & (it_hi >> 8);
+    u64a reach3  = domain_mask & (it_hi >> 24);
+    u64a reach5  = domain_mask & (it_hi >> 40);
+    u64a reach7  = domain_mask & ((it_hi >> 56) | (it_lo << 8)); // top byte of it_hi joined with the low bytes of it_lo
+    u64a reach9  = domain_mask & (it_lo >> 8);
+    u64a reach11 = domain_mask & (it_lo >> 24);
+    u64a reach13 = domain_mask & (it_lo >> 40);
+    u64a reach15 = domain_mask & unaligned_load_u32(itPtr + 15); // NOTE(review): load starts at the window's last byte, so it presumably reads past itPtr + ITER_BYTES, which the assert above does not cover - confirm
+
+    m128 st1 = load_m128_from_u64a(ft + reach1);
+    m128 st3 = load_m128_from_u64a(ft + reach3);
+    m128 st5 = load_m128_from_u64a(ft + reach5);
+    m128 st7 = load_m128_from_u64a(ft + reach7);
+    m128 st9 = load_m128_from_u64a(ft + reach9);
+    m128 st11 = load_m128_from_u64a(ft + reach11);
+    m128 st13 = load_m128_from_u64a(ft + reach13);
+    m128 st15 = load_m128_from_u64a(ft + reach15);
+
+    st1 = lshiftbyte_m128(st1, 1);
+    st3 = lshiftbyte_m128(st3, 3);
+    st5 = lshiftbyte_m128(st5, 5);
+    st7 = lshiftbyte_m128(st7, 7);
+    st9 = lshiftbyte_m128(st9, 1);
+    st11 = lshiftbyte_m128(st11, 3);
+    st13 = lshiftbyte_m128(st13, 5);
+    st15 = lshiftbyte_m128(st15, 7);
+
+    st0 = or128(st0, st1); // pairwise OR-reduction of st0..st7 into st0
+    st2 = or128(st2, st3);
+    st4 = or128(st4, st5);
+    st6 = or128(st6, st7);
+    st0 = or128(st0, st2);
+    st4 = or128(st4, st6);
+    st0 = or128(st0, st4);
+
+    st8 = or128(st8, st9); // pairwise OR-reduction of st8..st15 into st8
+    st10 = or128(st10, st11);
+    st12 = or128(st12, st13);
+    st14 = or128(st14, st15);
+    st8 = or128(st8, st10);
+    st12 = or128(st12, st14);
+    st8 = or128(st8, st12);
+
+    m128 st = or128(*s, st0); // *s already holds st0, st2, st4 and st6 from above; OR-ing them again changes nothing
+    *conf0 = movq(st) ^ ~0ULL;
+    st = rshiftbyte_m128(st, 8);
+    st = or128(st, st8);
+
+    *conf8 = movq(st) ^ ~0ULL;
+    *s = rshiftbyte_m128(st, 8); // state handed back to the caller through *s
+}
+
+static really_inline
+void do_confirm_fdr(u64a *conf, u8 offset, hwlmcb_rv_t *control,
+                    const u32 *confBase, const struct FDR_Runtime_Args *a,
+                    const u8 *ptr, u32 *last_match_id, const struct zone *z) {
+    const u8 bucket = 8; // conf bit / bucket selects the byte, bit % bucket the confBase slot
+
+    if (likely(!*conf)) { // no bits set in *conf: nothing to confirm
+        return;
+    }
+
+    /* ptr is currently referring to a location in the zone's buffer, we also
+     * need a pointer in the original, main buffer for the final string compare.
+     */
+    const u8 *ptr_main = (const u8 *)((uintptr_t)ptr + z->zone_pointer_adjust); //NOLINT (performance-no-int-to-ptr)
+
+    const u8 *confLoc = ptr;
+
+    do  {
+        u32 bit = findAndClearLSB_64(conf); // presumably returns the lowest set bit of *conf and clears it - confirm
+        u32 byte = bit / bucket + offset;
+        u32 bitRem = bit % bucket;
+        u32 idx = bitRem;
+        u32 cf = confBase[idx]; // byte offset from confBase to this slot's FDRConfirm; 0 means none
+        if (!cf) {
+            continue; // in a do-while this jumps to the loop condition below
+        }
+        const struct FDRConfirm *fdrc = (const struct FDRConfirm *)
+                                        ((const u8 *)confBase + cf);
+        if (!(fdrc->groups & *control)) { // no overlap between fdrc->groups and *control: skip
+            continue;
+        }
+        u64a confVal = unaligned_load_u64a(confLoc + byte - sizeof(u64a) + 1); // load begins sizeof(u64a) - 1 bytes before confLoc + byte
+        confWithBit(fdrc, a, ptr_main - a->buf + byte, control,
+                    last_match_id, confVal, conf, bit); // position argument is expressed relative to a->buf
+    } while (unlikely(!!*conf)); // repeat until *conf is zero
+}
+
+#endif // FDR_IMPL_PPC64LE_H
+\ No newline at end of file
diff --git a/src/fdr/x86/fdr_impl.h b/src/fdr/x86/fdr_impl.h

new file mode 100644 (file)

index 0000000..bfca631
--- /dev/null
+++ b/src/fdr/x86/fdr_impl.h
@@ -0,0 +1,196 @@
+/*
+ * Copyright (c) 2015-2017, Intel Corporation
+ * Copyright (c) 2020-2025, VectorCamp PC
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *
+ *  * Redistributions of source code must retain the above copyright notice,
+ *    this list of conditions and the following disclaimer.
+ *  * Redistributions in binary form must reproduce the above copyright
+ *    notice, this list of conditions and the following disclaimer in the
+ *    documentation and/or other materials provided with the distribution.
+ *  * Neither the name of Intel Corporation nor the names of its contributors
+ *    may be used to endorse or promote products derived from this software
+ *    without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
+ * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+ * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+ * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
+ * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
+ * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+ * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
+ * POSSIBILITY OF SUCH DAMAGE.
+ */
+
+#ifndef FDR_IMPL_X86_H
+#define FDR_IMPL_X86_H
+
+/*
+ * Compute the two 64-bit confirm words for the 16 input bytes at itPtr.
+ *
+ * The input is read as two 64-bit words. For every sampled byte position p a
+ * table index ("reachP") is formed by masking the input window that starts at
+ * byte p with domain_mask; the table entry at ft + reachP is loaded, shifted
+ * left by (p % 8) bytes and ORed into the running state. *conf0 covers
+ * positions 0-7 and *conf8 covers positions 8-15; each is the bitwise
+ * complement of movq() of the state at that point (presumably its low 64
+ * bits), and the state is shifted right by 8 bytes after each one so that the
+ * remainder carries over into the next word and, through *s, the next call.
+ *
+ * stride == 4 samples positions 0, 4, 8, 12; stride == 2 additionally samples
+ * 2, 6, 10, 14; any other value falls through to the path that samples all
+ * 16 positions.
+ *
+ * itPtr        start of the 16 input bytes.
+ * start_ptr,
+ * end_ptr      only used by the bounds assertion.
+ * domain_mask  mask applied to every input window to obtain a table index.
+ * stride       sampling stride (see above).
+ * ft           table indexed in units of u64a.
+ * conf0, conf8 outputs.
+ * s            in/out running state carried between calls.
+ */
+static really_inline
+void get_conf_stride(const u8 *itPtr, UNUSED const u8 *start_ptr,
+                       UNUSED const u8 *end_ptr, u32 domain_mask, u8 stride,
+                       const u64a *ft, u64a *conf0, u64a *conf8, m128 *s) {
+    assert(itPtr >= start_ptr && itPtr + ITER_BYTES <= end_ptr);
+
+    // get_conf_stride_4
+    /* itPtr is a byte pointer with no visible alignment guarantee, so go
+     * through the unaligned-load helper instead of dereferencing a cast
+     * pointer. Note that, despite its name, it_hi holds the lower-addressed
+     * eight bytes (0-7) and it_lo holds bytes 8-15.
+     */
+    u64a it_hi = unaligned_load_u64a(itPtr);
+    u64a it_lo = unaligned_load_u64a(itPtr + 8);
+    u64a reach0  = domain_mask & it_hi;
+    u64a reach4  = domain_mask & (it_hi >> 32);
+    u64a reach8  = domain_mask & it_lo;
+    u64a reach12 = domain_mask & (it_lo >> 32);
+
+    m128 st0 = load_m128_from_u64a(ft + reach0);
+    m128 st4 = load_m128_from_u64a(ft + reach4);
+    m128 st8 = load_m128_from_u64a(ft + reach8);
+    m128 st12 = load_m128_from_u64a(ft + reach12);
+
+    /* Positions 4 and 12 sit four bytes into their respective 8-byte half. */
+    st4 = lshiftbyte_m128(st4, 4);
+    st12 = lshiftbyte_m128(st12, 4);
+
+    *s = or128(*s, st0);
+    *s = or128(*s, st4);
+
+    if (stride == 4) {
+        *conf0 = movq(*s);
+        *s = rshiftbyte_m128(*s, 8);
+        *conf0 ^= ~0ULL;
+
+        *s = or128(*s, st8);
+        *s = or128(*s, st12);
+        *conf8 = movq(*s);
+        *s = rshiftbyte_m128(*s, 8);
+        *conf8 ^= ~0ULL;
+        return;
+    }
+
+    // get_conf_stride_2
+    u64a reach2  = domain_mask & (it_hi >> 16);
+    u64a reach6  = domain_mask & (it_hi >> 48);
+    u64a reach10 = domain_mask & (it_lo >> 16);
+    u64a reach14 = domain_mask & (it_lo >> 48);
+
+    m128 st2 = load_m128_from_u64a(ft + reach2);
+    m128 st6 = load_m128_from_u64a(ft + reach6);
+    m128 st10 = load_m128_from_u64a(ft + reach10);
+    m128 st14 = load_m128_from_u64a(ft + reach14);
+
+    st2  = lshiftbyte_m128(st2, 2);
+    st6  = lshiftbyte_m128(st6, 6);
+    st10 = lshiftbyte_m128(st10, 2);
+    st14 = lshiftbyte_m128(st14, 6);
+
+    *s = or128(*s, st2);
+    *s = or128(*s, st6);
+
+    if (stride == 2) {
+        *conf0 = movq(*s);
+        *s = rshiftbyte_m128(*s, 8);
+        *conf0 ^= ~0ULL;
+
+        *s = or128(*s, st8);
+        *s = or128(*s, st10);
+        *s = or128(*s, st12);
+        *s = or128(*s, st14);
+
+        *conf8 = movq(*s);
+        *s = rshiftbyte_m128(*s, 8);
+        *conf8 ^= ~0ULL;
+        return;
+    }
+
+    // get_conf_stride_1
+    u64a reach1  = domain_mask & (it_hi >> 8);
+    u64a reach3  = domain_mask & (it_hi >> 24);
+    u64a reach5  = domain_mask & (it_hi >> 40);
+    /* The window starting at byte 7 straddles both words: take the top byte
+     * of it_hi and append the leading bytes of it_lo above it. */
+    u64a reach7  = domain_mask & ((it_hi >> 56) | (it_lo << 8));
+    u64a reach9  = domain_mask & (it_lo >> 8);
+    u64a reach11 = domain_mask & (it_lo >> 24);
+    u64a reach13 = domain_mask & (it_lo >> 40);
+    /* The window starting at byte 15 extends past the two words loaded above,
+     * so it is read straight from memory (bytes 15..18).
+     * NOTE(review): the assertion only covers itPtr + ITER_BYTES; this relies
+     * on the caller keeping the following bytes readable - confirm. */
+    u64a reach15 = domain_mask & unaligned_load_u32(itPtr + 15);
+
+    m128 st1 = load_m128_from_u64a(ft + reach1);
+    m128 st3 = load_m128_from_u64a(ft + reach3);
+    m128 st5 = load_m128_from_u64a(ft + reach5);
+    m128 st7 = load_m128_from_u64a(ft + reach7);
+    m128 st9 = load_m128_from_u64a(ft + reach9);
+    m128 st11 = load_m128_from_u64a(ft + reach11);
+    m128 st13 = load_m128_from_u64a(ft + reach13);
+    m128 st15 = load_m128_from_u64a(ft + reach15);
+
+    st1 = lshiftbyte_m128(st1, 1);
+    st3 = lshiftbyte_m128(st3, 3);
+    st5 = lshiftbyte_m128(st5, 5);
+    st7 = lshiftbyte_m128(st7, 7);
+    st9 = lshiftbyte_m128(st9, 1);
+    st11 = lshiftbyte_m128(st11, 3);
+    st13 = lshiftbyte_m128(st13, 5);
+    st15 = lshiftbyte_m128(st15, 7);
+
+    /* OR-reduce positions 0-7 into st0. st0, st2, st4 and st6 were already
+     * folded into *s above; including them again is harmless because OR is
+     * idempotent. */
+    st0 = or128(st0, st1);
+    st2 = or128(st2, st3);
+    st4 = or128(st4, st5);
+    st6 = or128(st6, st7);
+    st0 = or128(st0, st2);
+    st4 = or128(st4, st6);
+    st0 = or128(st0, st4);
+
+    /* OR-reduce positions 8-15 into st8. */
+    st8 = or128(st8, st9);
+    st10 = or128(st10, st11);
+    st12 = or128(st12, st13);
+    st14 = or128(st14, st15);
+    st8 = or128(st8, st10);
+    st12 = or128(st12, st14);
+    st8 = or128(st8, st12);
+
+    m128 st = or128(*s, st0);
+    *conf0 = movq(st) ^ ~0ULL;
+    st = rshiftbyte_m128(st, 8);
+    st = or128(st, st8);
+
+    *conf8 = movq(st) ^ ~0ULL;
+    *s = rshiftbyte_m128(st, 8);
+}
+
+/*
+ * Run the confirm stage for every bit that is set in *conf.
+ *
+ * conf          in/out candidate bitmap; bits are consumed one at a time via
+ *               findAndClearLSB_64() until the word is zero. It is also
+ *               handed to confWithBit().
+ * offset        byte offset added to the position derived from each bit.
+ * control       in/out callback control word; its current value is ANDed
+ *               with each confirm structure's groups to decide whether that
+ *               structure is worth confirming.
+ * confBase      table of eight u32 byte offsets (one per bucket), each
+ *               relative to confBase itself; a zero entry means there is
+ *               nothing to confirm for that bucket.
+ * a             runtime arguments; a->buf is used to turn the main-buffer
+ *               pointer into an offset.
+ * ptr           current position inside the zone's buffer.
+ * last_match_id forwarded untouched to confWithBit().
+ * z             current zone; supplies zone_pointer_adjust.
+ */
+static really_inline
+void do_confirm_fdr(u64a *conf, u8 offset, hwlmcb_rv_t *control,
+                    const u32 *confBase, const struct FDR_Runtime_Args *a,
+                    const u8 *ptr, u32 *last_match_id, const struct zone *z) {
+    /* Common case: no candidate bits at all. */
+    if (likely(*conf == 0)) {
+        return;
+    }
+
+    /* Each input byte contributes eight bits (one per bucket) to *conf. */
+    const u8 bits_per_byte = 8;
+
+    /* ptr walks the zone's buffer; the final string compare works on the
+     * original main buffer, so translate the pointer once up front.
+     */
+    const u8 *main_ptr = (const u8 *)((uintptr_t)ptr + z->zone_pointer_adjust); //NOLINT (performance-no-int-to-ptr)
+
+    do {
+        const u32 bit = findAndClearLSB_64(conf);
+        const u32 pos = bit / bits_per_byte + offset;
+        const u32 conf_off = confBase[bit % bits_per_byte];
+
+        if (conf_off) {
+            const struct FDRConfirm *fdrc =
+                (const struct FDRConfirm *)((const u8 *)confBase + conf_off);
+
+            /* Only confirm when at least one of this structure's groups is
+             * still enabled in the control word. */
+            if (fdrc->groups & *control) {
+                /* The eight bytes that end at (and include) ptr[pos]. */
+                u64a confVal = unaligned_load_u64a(ptr + pos - sizeof(u64a) + 1);
+                confWithBit(fdrc, a, main_ptr - a->buf + pos, control,
+                            last_match_id, confVal, conf, bit);
+            }
+        }
+    } while (unlikely(*conf != 0));
+}
+
+#endif // FDR_IMPL_X86_H
+\ No newline at end of file
diff --git a/src/util/arch/arm/simd_utils.h b/src/util/arch/arm/simd_utils.h

index eaf8b98f95dd67a2e2b7f2c07ece1b082022fe9d..45c00a2c51dfd8a27c81f929809c9e4f430ef29f 100644 (file)
--- a/src/util/arch/arm/simd_utils.h
+++ b/src/util/arch/arm/simd_utils.h
@@ -181,9 +181,7 @@ static really_inline m128 set1_2x64(u64a c) {
      return (m128) vdupq_n_u64(c);
  }
  
-static really_inline m128 insert32_m128(m128 in, u32 val, const int imm) {
-    return (m128) vsetq_lane_u32(val, (uint32x4_t)in, imm);
-}
+/* Replace 32-bit lane `imm` of `in` with `val`; presumably a macro so that `imm` reaches the intrinsic as a constant. */
+#define insert32_m128(in, val, imm) ((m128) vsetq_lane_u32((val), (uint32x4_t)(in), (imm)))
  
  static really_inline u32 movd(const m128 in) {
      return vgetq_lane_u32((uint32x4_t) in, 0);
diff --git a/src/util/arch/ppc64el/simd_utils.h b/src/util/arch/ppc64el/simd_utils.h

index 9b6f7539bdd037d18ed0bf6f39354e05587a545b..6d2c5d9ebd69f4126ba78fb52f60abc1edea5e32 100644 (file)
--- a/src/util/arch/ppc64el/simd_utils.h
+++ b/src/util/arch/ppc64el/simd_utils.h
@@ -429,6 +429,16 @@ m128 set2x64(u64a hi, u64a lo) {
      return (m128) v;
  }
  
+/*
+ * Interleave the elements of x with zeroes using vec_mergel().
+ * NOTE(review): the x86 implementation of widenlo128 uses
+ * _mm_unpacklo_epi32; confirm that vec_mergel picks the matching half of x
+ * on little-endian targets.
+ */
+static really_inline
+m128 widenlo128(m128 x) {
+    const m128 zero = zeroes128();
+    return (m128) vec_mergel(x, zero);
+}
+
+/*
+ * Interleave the elements of x with zeroes using vec_mergeh().
+ * NOTE(review): confirm that vec_mergeh picks the intended half of x on
+ * little-endian targets (counterpart of widenlo128).
+ */
+static really_inline
+m128 widenhi128(m128 x) {
+    const m128 zero = zeroes128();
+    return (m128) vec_mergeh(x, zero);
+}
+
  #if defined(__clang__) && (__clang_major__ == 15)
  #pragma clang diagnostic pop
  #endif // defined(__clang__) && (__clang_major__ == 15)
diff --git a/src/util/arch/x86/simd_utils.h b/src/util/arch/x86/simd_utils.h

index 1e67668f1a2d7ce8a949f31f3f2a13505b84f4a5..4df740086fcf454050be068da2449dd73a6f00df 100644 (file)
--- a/src/util/arch/x86/simd_utils.h
+++ b/src/util/arch/x86/simd_utils.h
@@ -169,9 +169,7 @@ static really_inline m128 set1_2x64(u64a c) {
      return _mm_set1_epi64x(c);
  }
  
-static really_inline m128 insert32_m128(m128 in, u32 val, const int imm) {
-    return _mm_insert_epi32(in, val, imm);
-}
+/* Replace 32-bit lane `imm` of `in` with the scalar `val`; `val` and `imm` are integers and must not be cast to m128. */
+#define insert32_m128(in, val, imm) (_mm_insert_epi32((in), (val), (imm)))
  
  static really_inline u32 movd(const m128 in) {
      return _mm_cvtsi128_si32(in);
@@ -489,8 +487,6 @@ m128 set2x64(u64a hi, u64a lo) {
      return _mm_set_epi64x(hi, lo);
  }
  
-#include "../print_simd.h"
-
  static really_inline
  m128 widenlo128(m128 x) {
      return _mm_unpacklo_epi32(x, zeroes128());
author	Konstantinos Margaritis <konstantinos@vectorcamp.gr>
	Tue, 12 Dec 2023 16:16:13 +0000 (18:16 +0200)
committer	Konstantinos Margaritis <konstantinos@vectorcamp.gr>
	Wed, 29 Oct 2025 22:05:40 +0000 (00:05 +0200)
src/fdr/arm/fdr_impl.h	[new file with mode: 0644]	patch \| blob
src/fdr/fdr.c		patch \| blob \| blame \| history
src/fdr/fdr.h		patch \| blob \| blame \| history
src/fdr/fdr_impl.h	[new file with mode: 0644]	patch \| blob
src/fdr/ppc64le/fdr_impl.h	[new file with mode: 0644]	patch \| blob
src/fdr/x86/fdr_impl.h	[new file with mode: 0644]	patch \| blob
src/util/arch/arm/simd_utils.h		patch \| blob \| blame \| history
src/util/arch/ppc64el/simd_utils.h		patch \| blob \| blame \| history
src/util/arch/x86/simd_utils.h		patch \| blob \| blame \| history