--- /dev/null
+/*
+ * Copyright (c) 2015-2017, Intel Corporation
+ * Copyright (c) 2020-2025, VectorCamp PC
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *
+ * * Redistributions of source code must retain the above copyright notice,
+ * this list of conditions and the following disclaimer.
+ * * Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ * * Neither the name of Intel Corporation nor the names of its contributors
+ * may be used to endorse or promote products derived from this software
+ * without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
+ * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+ * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+ * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
+ * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
+ * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+ * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
+ * POSSIBILITY OF SUCH DAMAGE.
+ */
+
+#ifndef FDR_IMPL_ARM_H
+#define FDR_IMPL_ARM_H
+
+static really_inline
+void get_conf_stride(const u8 *itPtr, UNUSED const u8 *start_ptr,
+ UNUSED const u8 *end_ptr, u32 domain_mask, u8 stride,
+ const u64a *ft, u64a *conf0, u64a *conf8, m128 *s) {
+ assert(itPtr >= start_ptr && itPtr + ITER_BYTES <= end_ptr);
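+ /* Merged form of the former get_conf_stride_{1,2,4} functions: every
+ * stride starts with the stride-4 lookups below, and smaller strides fall
+ * through to accumulate their extra per-position lookups on top of the
+ * shared state. */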
+
+ // get_conf_stride_4
+ u64a it_hi = *(const u64a *)itPtr;
+ u64a it_lo = *(const u64a *)(itPtr + 8);
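+ /* Each reachN below is the domain-masked table offset into ft for the
+ * bytes starting at offset N of the 16 input bytes held in it_hi/it_lo. */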
+ u64a reach0 = domain_mask & it_hi;
+ u64a reach4 = domain_mask & (it_hi >> 32);
+ u64a reach8 = domain_mask & it_lo;
+ u64a reach12 = domain_mask & (it_lo >> 32);
+
+ m128 st0 = load_m128_from_u64a(ft + reach0);
+ m128 st4 = load_m128_from_u64a(ft + reach4);
+ m128 st8 = load_m128_from_u64a(ft + reach8);
+ m128 st12 = load_m128_from_u64a(ft + reach12);
+
+ st4 = lshiftbyte_m128(st4, 4);
+ st12 = lshiftbyte_m128(st12, 4);
+
+ *s = or128(*s, st0);
+ *s = or128(*s, st4);
+
+ if (stride == 4) {
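+ /* movq takes the low 64 bits of the accumulated state; XOR with ~0ULL
+ * inverts it so that set bits in conf0/conf8 mark candidate positions
+ * for the confirm stage. */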
+ *conf0 = movq(*s);
+ *s = rshiftbyte_m128(*s, 8);
+ *conf0 ^= ~0ULL;
+
+ *s = or128(*s, st8);
+ *s = or128(*s, st12);
+ *conf8 = movq(*s);
+ *s = rshiftbyte_m128(*s, 8);
+ *conf8 ^= ~0ULL;
+ return;
+ }
+
+ // get_conf_stride_2
+ u64a reach2 = domain_mask & (it_hi >> 16);
+ u64a reach6 = domain_mask & (it_hi >> 48);
+ u64a reach10 = domain_mask & (it_lo >> 16);
+ u64a reach14 = domain_mask & (it_lo >> 48);
+
+ m128 st2 = load_m128_from_u64a(ft + reach2);
+ m128 st6 = load_m128_from_u64a(ft + reach6);
+ m128 st10 = load_m128_from_u64a(ft + reach10);
+ m128 st14 = load_m128_from_u64a(ft + reach14);
+
+ st2 = lshiftbyte_m128(st2, 2);
+ st6 = lshiftbyte_m128(st6, 6);
+ st10 = lshiftbyte_m128(st10, 2);
+ st14 = lshiftbyte_m128(st14, 6);
+
+ *s = or128(*s, st2);
+ *s = or128(*s, st6);
+
+ if (stride == 2) {
+ *conf0 = movq(*s);
+ *s = rshiftbyte_m128(*s, 8);
+ *conf0 ^= ~0ULL;
+
+ *s = or128(*s, st8);
+ *s = or128(*s, st10);
+ *s = or128(*s, st12);
+ *s = or128(*s, st14);
+
+ *conf8 = movq(*s);
+ *s = rshiftbyte_m128(*s, 8);
+ *conf8 ^= ~0ULL;
+ return;
+ }
+
+ // get_conf_stride_1
+ u64a reach1 = domain_mask & (it_hi >> 8);
+ u64a reach3 = domain_mask & (it_hi >> 24);
+ u64a reach5 = domain_mask & (it_hi >> 40);
+ u64a reach7 = domain_mask & ((it_hi >> 56) | (it_lo << 8));
+ u64a reach9 = domain_mask & (it_lo >> 8);
+ u64a reach11 = domain_mask & (it_lo >> 24);
+ u64a reach13 = domain_mask & (it_lo >> 40);
+ u64a reach15 = domain_mask & unaligned_load_u32(itPtr + 15);
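+ /* Offset 15 needs bytes beyond those held in it_lo, so it is re-read
+ * directly from the buffer; the zone layout guarantees this trailing
+ * read stays within readable memory. */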
+
+ m128 st1 = load_m128_from_u64a(ft + reach1);
+ m128 st3 = load_m128_from_u64a(ft + reach3);
+ m128 st5 = load_m128_from_u64a(ft + reach5);
+ m128 st7 = load_m128_from_u64a(ft + reach7);
+ m128 st9 = load_m128_from_u64a(ft + reach9);
+ m128 st11 = load_m128_from_u64a(ft + reach11);
+ m128 st13 = load_m128_from_u64a(ft + reach13);
+ m128 st15 = load_m128_from_u64a(ft + reach15);
+
+ st1 = lshiftbyte_m128(st1, 1);
+ st3 = lshiftbyte_m128(st3, 3);
+ st5 = lshiftbyte_m128(st5, 5);
+ st7 = lshiftbyte_m128(st7, 7);
+ st9 = lshiftbyte_m128(st9, 1);
+ st11 = lshiftbyte_m128(st11, 3);
+ st13 = lshiftbyte_m128(st13, 5);
+ st15 = lshiftbyte_m128(st15, 7);
+
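+ /* Pairwise OR trees fold the per-position states into st0 (offsets 0-7)
+ * and st8 (offsets 8-15). */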
+ st0 = or128(st0, st1);
+ st2 = or128(st2, st3);
+ st4 = or128(st4, st5);
+ st6 = or128(st6, st7);
+ st0 = or128(st0, st2);
+ st4 = or128(st4, st6);
+ st0 = or128(st0, st4);
+
+ st8 = or128(st8, st9);
+ st10 = or128(st10, st11);
+ st12 = or128(st12, st13);
+ st14 = or128(st14, st15);
+ st8 = or128(st8, st10);
+ st12 = or128(st12, st14);
+ st8 = or128(st8, st12);
+
+ m128 st = or128(*s, st0);
+ *conf0 = movq(st) ^ ~0ULL;
+ st = rshiftbyte_m128(st, 8);
+ st = or128(st, st8);
+
+ *conf8 = movq(st) ^ ~0ULL;
+ *s = rshiftbyte_m128(st, 8);
+}
+
+static really_inline
+void do_confirm_fdr(u64a *conf, u8 offset, hwlmcb_rv_t *control,
+ const u32 *confBase, const struct FDR_Runtime_Args *a,
+ const u8 *ptr, u32 *last_match_id, const struct zone *z) {
+ const u8 bucket = 8;
+
+ if (likely(!*conf)) {
+ return;
+ }
+
+ /* ptr currently refers to a location in the zone's buffer; we also need
+ * a pointer into the original, main buffer for the final string compare.
+ */
+ const u8 *ptr_main = (const u8 *)((uintptr_t)ptr + z->zone_pointer_adjust); //NOLINT(performance-no-int-to-ptr)
+
+ const u8 *confLoc = ptr;
+
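+ /* Each set bit in conf encodes one candidate: bit / 8 is the byte offset
+ * within this half of the iteration and bit % 8 is the bucket, which
+ * selects the confirm table entry to walk. */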
+ do {
+ u32 bit = findAndClearLSB_64(conf);
+ u32 byte = bit / bucket + offset;
+ u32 bitRem = bit % bucket;
+ u32 idx = bitRem;
+ u32 cf = confBase[idx];
+ if (!cf) {
+ continue;
+ }
+ const struct FDRConfirm *fdrc = (const struct FDRConfirm *)
+ ((const u8 *)confBase + cf);
+ if (!(fdrc->groups & *control)) {
+ continue;
+ }
+ u64a confVal = unaligned_load_u64a(confLoc + byte - sizeof(u64a) + 1);
+ confWithBit(fdrc, a, ptr_main - a->buf + byte, control,
+ last_match_id, confVal, conf, bit);
+ } while (unlikely(!!*conf));
+}
+
+#endif // FDR_IMPL_ARM_H
/*
* Copyright (c) 2015-2017, Intel Corporation
+ * Copyright (c) 2020-2025, VectorCamp PC
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions are met:
#include "fdr_confirm_runtime.h"
#include "fdr_internal.h"
#include "fdr_loadval.h"
+#include "fdr_impl.h"
#include "flood_runtime.h"
#include "scratch.h"
#include "teddy.h"
#include "util/simd_utils.h"
#include "util/uniform_ops.h"
-/** \brief number of bytes processed in each iteration */
-#define ITER_BYTES 16
-
-/** \brief total zone buffer size */
-#define ZONE_TOTAL_SIZE 64
-
-/** \brief maximum number of allowed zones */
-#define ZONE_MAX 3
-
-/** \brief zone information.
- *
- * Zone represents a region of data to scan in FDR.
- *
- * The incoming buffer is to split in multiple zones to ensure two properties:
- * 1: that we can read 8? bytes behind to generate a hash safely
- * 2: that we can read the 3 byte after the current byte (domain > 8)
- */
-struct zone {
- /** \brief copied buffer, used only when it is a boundary zone. */
- u8 ALIGN_CL_DIRECTIVE buf[ZONE_TOTAL_SIZE];
-
- /** \brief shift amount for fdr state to avoid unwanted match. */
- u8 shift;
-
- /** \brief if boundary zone, start points into the zone buffer after the
- * pre-padding. Otherwise, points to the main buffer, appropriately. */
- const u8 *start;
-
- /** \brief if boundary zone, end points to the end of zone. Otherwise,
- * pointer to the main buffer, appropriately. */
- const u8 *end;
-
- /** \brief the amount to adjust to go from a pointer in the zones region
- * (between start and end) to a pointer in the original data buffer. */
- ptrdiff_t zone_pointer_adjust;
-
- /** \brief firstFloodDetect from FDR_Runtime_Args for non-boundary zones,
- * otherwise end of the zone buf. floodPtr always points inside the same
- * buffer as the start pointe. */
- const u8 *floodPtr;
-};
-
-static
-const ALIGN_CL_DIRECTIVE u8 zone_or_mask[ITER_BYTES+1][ITER_BYTES] = {
- { 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
- 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00 },
- { 0xff, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
- 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00 },
- { 0xff, 0xff, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
- 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00 },
- { 0xff, 0xff, 0xff, 0x00, 0x00, 0x00, 0x00, 0x00,
- 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00 },
- { 0xff, 0xff, 0xff, 0xff, 0x00, 0x00, 0x00, 0x00,
- 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00 },
- { 0xff, 0xff, 0xff, 0xff, 0xff, 0x00, 0x00, 0x00,
- 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00 },
- { 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0x00, 0x00,
- 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00 },
- { 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0x00,
- 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00 },
- { 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
- 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00 },
- { 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
- 0xff, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00 },
- { 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
- 0xff, 0xff, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00 },
- { 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
- 0xff, 0xff, 0xff, 0x00, 0x00, 0x00, 0x00, 0x00 },
- { 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
- 0xff, 0xff, 0xff, 0xff, 0x00, 0x00, 0x00, 0x00 },
- { 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
- 0xff, 0xff, 0xff, 0xff, 0xff, 0x00, 0x00, 0x00 },
- { 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
- 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0x00, 0x00 },
- { 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
- 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0x00 },
- { 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
- 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00 }
-};
-
/* generates an initial state mask based on the last byte-ish of history rather
* than being all accepting. If there is no history to consider, the state is
* generated based on the minimum length of each bucket in order to prevent
return s;
}
-static really_inline
-void get_conf_stride_1(const u8 *itPtr, UNUSED const u8 *start_ptr,
- UNUSED const u8 *end_ptr, u32 domain_mask_flipped,
- const u64a *ft, u64a *conf0, u64a *conf8, m128 *s) {
- /* +1: the zones ensure that we can read the byte at z->end */
- assert(itPtr >= start_ptr && itPtr + ITER_BYTES <= end_ptr);
- u64a domain_mask = ~domain_mask_flipped;
-
- u64a it_hi = *(const u64a *)itPtr;
- u64a it_lo = *(const u64a *)(itPtr + 8);
- u64a reach0 = domain_mask & it_hi;
- u64a reach1 = domain_mask & (it_hi >> 8);
- u64a reach2 = domain_mask & (it_hi >> 16);
- u64a reach3 = domain_mask & (it_hi >> 24);
- u64a reach4 = domain_mask & (it_hi >> 32);
- u64a reach5 = domain_mask & (it_hi >> 40);
- u64a reach6 = domain_mask & (it_hi >> 48);
- u64a reach7 = domain_mask & ((it_hi >> 56) | (it_lo << 8));
- u64a reach8 = domain_mask & it_lo;
- u64a reach9 = domain_mask & (it_lo >> 8);
- u64a reach10 = domain_mask & (it_lo >> 16);
- u64a reach11 = domain_mask & (it_lo >> 24);
- u64a reach12 = domain_mask & (it_lo >> 32);
- u64a reach13 = domain_mask & (it_lo >> 40);
- u64a reach14 = domain_mask & (it_lo >> 48);
- u64a reach15 = domain_mask & unaligned_load_u32(itPtr + 15);
-
- m128 st0 = load_m128_from_u64a(ft + reach0);
- m128 st1 = lshiftbyte_m128(load_m128_from_u64a(ft + reach1), 1);
- m128 st2 = lshiftbyte_m128(load_m128_from_u64a(ft + reach2), 2);
- m128 st3 = lshiftbyte_m128(load_m128_from_u64a(ft + reach3), 3);
- m128 st4 = lshiftbyte_m128(load_m128_from_u64a(ft + reach4), 4);
- m128 st5 = lshiftbyte_m128(load_m128_from_u64a(ft + reach5), 5);
- m128 st6 = lshiftbyte_m128(load_m128_from_u64a(ft + reach6), 6);
- m128 st7 = lshiftbyte_m128(load_m128_from_u64a(ft + reach7), 7);
- m128 st8 = load_m128_from_u64a(ft + reach8);
- m128 st9 = lshiftbyte_m128(load_m128_from_u64a(ft + reach9), 1);
- m128 st10 = lshiftbyte_m128(load_m128_from_u64a(ft + reach10), 2);
- m128 st11 = lshiftbyte_m128(load_m128_from_u64a(ft + reach11), 3);
- m128 st12 = lshiftbyte_m128(load_m128_from_u64a(ft + reach12), 4);
- m128 st13 = lshiftbyte_m128(load_m128_from_u64a(ft + reach13), 5);
- m128 st14 = lshiftbyte_m128(load_m128_from_u64a(ft + reach14), 6);
- m128 st15 = lshiftbyte_m128(load_m128_from_u64a(ft + reach15), 7);
-
- st0 = or128(st0, st1);
- st2 = or128(st2, st3);
- st4 = or128(st4, st5);
- st6 = or128(st6, st7);
- st0 = or128(st0, st2);
- st4 = or128(st4, st6);
- st0 = or128(st0, st4);
-
- st8 = or128(st8, st9);
- st10 = or128(st10, st11);
- st12 = or128(st12, st13);
- st14 = or128(st14, st15);
- st8 = or128(st8, st10);
- st12 = or128(st12, st14);
- st8 = or128(st8, st12);
-
- m128 st = or128(*s, st0);
- *conf0 = movq(st) ^ ~0ULL;
- st = rshiftbyte_m128(st, 8);
- st = or128(st, st8);
-
- *conf8 = movq(st) ^ ~0ULL;
- *s = rshiftbyte_m128(st, 8);
-}
-
-static really_inline
-void get_conf_stride_2(const u8 *itPtr, UNUSED const u8 *start_ptr,
- UNUSED const u8 *end_ptr, u32 domain_mask_flipped,
- const u64a *ft, u64a *conf0, u64a *conf8, m128 *s) {
- assert(itPtr >= start_ptr && itPtr + ITER_BYTES <= end_ptr);
-
- u64a reach0 = andn(domain_mask_flipped, itPtr);
- u64a reach2 = andn(domain_mask_flipped, itPtr + 2);
- u64a reach4 = andn(domain_mask_flipped, itPtr + 4);
- u64a reach6 = andn(domain_mask_flipped, itPtr + 6);
-
- m128 st0 = load_m128_from_u64a(ft + reach0);
- m128 st2 = load_m128_from_u64a(ft + reach2);
- m128 st4 = load_m128_from_u64a(ft + reach4);
- m128 st6 = load_m128_from_u64a(ft + reach6);
-
- u64a reach8 = andn(domain_mask_flipped, itPtr + 8);
- u64a reach10 = andn(domain_mask_flipped, itPtr + 10);
- u64a reach12 = andn(domain_mask_flipped, itPtr + 12);
- u64a reach14 = andn(domain_mask_flipped, itPtr + 14);
-
- m128 st8 = load_m128_from_u64a(ft + reach8);
- m128 st10 = load_m128_from_u64a(ft + reach10);
- m128 st12 = load_m128_from_u64a(ft + reach12);
- m128 st14 = load_m128_from_u64a(ft + reach14);
-
- st2 = lshiftbyte_m128(st2, 2);
- st4 = lshiftbyte_m128(st4, 4);
- st6 = lshiftbyte_m128(st6, 6);
-
- *s = or128(*s, st0);
- *s = or128(*s, st2);
- *s = or128(*s, st4);
- *s = or128(*s, st6);
-
- *conf0 = movq(*s);
- *s = rshiftbyte_m128(*s, 8);
- *conf0 ^= ~0ULL;
-
- st10 = lshiftbyte_m128(st10, 2);
- st12 = lshiftbyte_m128(st12, 4);
- st14 = lshiftbyte_m128(st14, 6);
-
- *s = or128(*s, st8);
- *s = or128(*s, st10);
- *s = or128(*s, st12);
- *s = or128(*s, st14);
-
- *conf8 = movq(*s);
- *s = rshiftbyte_m128(*s, 8);
- *conf8 ^= ~0ULL;
-}
-
-static really_inline
-void get_conf_stride_4(const u8 *itPtr, UNUSED const u8 *start_ptr,
- UNUSED const u8 *end_ptr, u32 domain_mask_flipped,
- const u64a *ft, u64a *conf0, u64a *conf8, m128 *s) {
- assert(itPtr >= start_ptr && itPtr + ITER_BYTES <= end_ptr);
-
- u64a reach0 = andn(domain_mask_flipped, itPtr);
- u64a reach4 = andn(domain_mask_flipped, itPtr + 4);
- u64a reach8 = andn(domain_mask_flipped, itPtr + 8);
- u64a reach12 = andn(domain_mask_flipped, itPtr + 12);
-
- m128 st0 = load_m128_from_u64a(ft + reach0);
- m128 st4 = load_m128_from_u64a(ft + reach4);
- m128 st8 = load_m128_from_u64a(ft + reach8);
- m128 st12 = load_m128_from_u64a(ft + reach12);
-
- st4 = lshiftbyte_m128(st4, 4);
- st12 = lshiftbyte_m128(st12, 4);
-
- *s = or128(*s, st0);
- *s = or128(*s, st4);
- *conf0 = movq(*s);
- *s = rshiftbyte_m128(*s, 8);
- *conf0 ^= ~0ULL;
-
- *s = or128(*s, st8);
- *s = or128(*s, st12);
- *conf8 = movq(*s);
- *s = rshiftbyte_m128(*s, 8);
- *conf8 ^= ~0ULL;
-}
-
-static really_inline
-void do_confirm_fdr(u64a *conf, u8 offset, hwlmcb_rv_t *control,
- const u32 *confBase, const struct FDR_Runtime_Args *a,
- const u8 *ptr, u32 *last_match_id, const struct zone *z) {
- const u8 bucket = 8;
-
- if (likely(!*conf)) {
- return;
- }
-
- /* ptr is currently referring to a location in the zone's buffer, we also
- * need a pointer in the original, main buffer for the final string compare.
- */
- const u8 *ptr_main = (const u8 *)((uintptr_t)ptr + z->zone_pointer_adjust); //NOLINT (performance-no-int-to-ptr)
-
- const u8 *confLoc = ptr;
-
- do {
- u32 bit = findAndClearLSB_64(conf);
- u32 byte = bit / bucket + offset;
- u32 bitRem = bit % bucket;
- u32 idx = bitRem;
- u32 cf = confBase[idx];
- if (!cf) {
- continue;
- }
- const struct FDRConfirm *fdrc = (const struct FDRConfirm *)
- ((const u8 *)confBase + cf);
- if (!(fdrc->groups & *control)) {
- continue;
- }
- u64a confVal = unaligned_load_u64a(confLoc + byte - sizeof(u64a) + 1);
- confWithBit(fdrc, a, ptr_main - a->buf + byte, control,
- last_match_id, confVal, conf, bit);
- } while (unlikely(!!*conf));
-}
-
static really_inline
void dumpZoneInfo(UNUSED const struct zone *z, UNUSED size_t zone_id) {
#ifdef DEBUG
#define INVALID_MATCH_ID (~0U)
-#define FDR_MAIN_LOOP(zz, s, get_conf_fn) \
- do { \
- const u8 *tryFloodDetect = zz->floodPtr; \
- const u8 *start_ptr = zz->start; \
- const u8 *end_ptr = zz->end; \
- for (const u8 *itPtr = ROUNDDOWN_PTR(start_ptr, 64); itPtr + 4*ITER_BYTES <= end_ptr; \
- itPtr += 4*ITER_BYTES) { \
- __builtin_prefetch(itPtr); \
- } \
- \
- for (const u8 *itPtr = start_ptr; itPtr + ITER_BYTES <= end_ptr; \
- itPtr += ITER_BYTES) { \
- if (unlikely(itPtr > tryFloodDetect)) { \
- tryFloodDetect = floodDetect(fdr, a, &itPtr, tryFloodDetect,\
- &floodBackoff, &control, \
- ITER_BYTES); \
- if (unlikely(control == HWLM_TERMINATE_MATCHING)) { \
- return HWLM_TERMINATED; \
- } \
- } \
- __builtin_prefetch(itPtr + ITER_BYTES); \
- u64a conf0; \
- u64a conf8; \
- get_conf_fn(itPtr, start_ptr, end_ptr, domain_mask_flipped, \
- ft, &conf0, &conf8, &s); \
- do_confirm_fdr(&conf0, 0, &control, confBase, a, itPtr, \
- &last_match_id, zz); \
- do_confirm_fdr(&conf8, 8, &control, confBase, a, itPtr, \
- &last_match_id, zz); \
- if (unlikely(control == HWLM_TERMINATE_MATCHING)) { \
- return HWLM_TERMINATED; \
- } \
- } /* end for loop */ \
- } while (0) \
-
static never_inline
hwlm_error_t fdr_engine_exec(const struct FDR *fdr,
const struct FDR_Runtime_Args *a,
u32 floodBackoff = FLOOD_BACKOFF_START;
u32 last_match_id = INVALID_MATCH_ID;
- u32 domain_mask_flipped = ~fdr->domainMask;
- u8 stride = fdr->stride;
+
const u64a *ft =
(const u64a *)((const u8 *)fdr + ROUNDUP_CL(sizeof(struct FDR)));
assert(ISALIGNED_CL(ft));
for (size_t curZone = 0; curZone < numZone; curZone++) {
struct zone *z = &zones[curZone];
- dumpZoneInfo(z, curZone);
-
- /* When a zone contains less data than is processed in an iteration
- * of FDR_MAIN_LOOP(), we need to scan over some extra data.
- *
- * We have chosen to scan this extra data at the start of the
- * iteration. The extra data is either data we have already scanned or
- * garbage (if it is earlier than offset 0),
- *
- * As a result we need to shift the incoming state back so that it will
- * properly line up with the data being scanned.
- *
- * We also need to forbid reporting any matches in the data being
- * rescanned as they have already been reported (or are over garbage but
- * later stages should also provide that safety guarantee).
- */
-
- u8 shift = z->shift;
-
- state = variable_byte_shift_m128(state, shift);
-
- state = or128(state, load128(zone_or_mask[shift]));
-
- switch (stride) {
- case 1:
- FDR_MAIN_LOOP(z, state, get_conf_stride_1);
- break;
- case 2:
- FDR_MAIN_LOOP(z, state, get_conf_stride_2);
- break;
- case 4:
- FDR_MAIN_LOOP(z, state, get_conf_stride_4);
- break;
- default:
- break;
- }
+ m128 zone_mask = load128(zone_or_mask[z->shift]);
+
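+ /* Prefetch the cache line holding the start of the zone; the scan loop
+ * below advances this pointer one line per iteration to stay ahead of
+ * the data being read. */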
+ const u8 *cacheline = ROUNDDOWN_PTR(z->start, 64);
+ __builtin_prefetch(cacheline);
+
+ const u8 *tryFloodDetect = z->floodPtr;
+
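+ /* A zone shorter than an iteration rescans some bytes at its start, so
+ * shift the state back to line up with that data and OR in the mask to
+ * forbid reporting matches over it (they were already reported or are
+ * garbage). */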
+ state = variable_byte_shift_m128(state, z->shift);
+ state = or128(state, zone_mask);
+
+ for (const u8 *itPtr = z->start; itPtr + ITER_BYTES <= z->end;
+ itPtr += ITER_BYTES) {
+ if (unlikely(itPtr > tryFloodDetect)) {
+ tryFloodDetect = floodDetect(fdr, a, &itPtr, tryFloodDetect,
+ &floodBackoff, &control,
+ ITER_BYTES);
+ if (unlikely(control == HWLM_TERMINATE_MATCHING)) {
+ return HWLM_TERMINATED;
+ }
+ }
+ u64a conf0;
+ u64a conf8;
+
+ cacheline += 64;
+ __builtin_prefetch(cacheline);
+
+ get_conf_stride(itPtr, z->start, z->end, fdr->domainMask, fdr->stride,
+ ft, &conf0, &conf8, &state);
+
+ do_confirm_fdr(&conf0, 0, &control, confBase, a, itPtr, &last_match_id, z);
+ do_confirm_fdr(&conf8, 8, &control, confBase, a, itPtr, &last_match_id, z);
+ if (unlikely(control == HWLM_TERMINATE_MATCHING)) {
+ return HWLM_TERMINATED;
+ }
+ } /* end for loop */
}
return HWLM_SUCCESS;
/*
* Copyright (c) 2015-2017, Intel Corporation
+ * Copyright (c) 2020-2025, VectorCamp PC
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions are met:
--- /dev/null
+/*
+ * Copyright (c) 2020-2025, VectorCamp PC
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *
+ * * Redistributions of source code must retain the above copyright notice,
+ * this list of conditions and the following disclaimer.
+ * * Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ * * Neither the name of Intel Corporation nor the names of its contributors
+ * may be used to endorse or promote products derived from this software
+ * without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
+ * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+ * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+ * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
+ * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
+ * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+ * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
+ * POSSIBILITY OF SUCH DAMAGE.
+ */
+
+/** \brief number of bytes processed in each iteration */
+#define ITER_BYTES 16
+
+/** \brief total zone buffer size */
+#define ZONE_TOTAL_SIZE 64
+
+/** \brief maximum number of allowed zones */
+#define ZONE_MAX 3
+
+/** \brief zone information.
+ *
+ * Zone represents a region of data to scan in FDR.
+ *
+ * The incoming buffer is split into multiple zones to ensure two properties:
+ * 1: that we can read 8? bytes behind to generate a hash safely
+ * 2: that we can read the 3 bytes after the current byte (domain > 8)
+ */
+struct zone {
+ /** \brief copied buffer, used only when it is a boundary zone. */
+ u8 ALIGN_CL_DIRECTIVE buf[ZONE_TOTAL_SIZE];
+
+ /** \brief shift amount for fdr state to avoid unwanted match. */
+ u8 shift;
+
+ /** \brief if boundary zone, start points into the zone buffer after the
+ * pre-padding. Otherwise, points to the main buffer, appropriately. */
+ const u8 *start;
+
+ /** \brief if boundary zone, end points to the end of zone. Otherwise,
+ * pointer to the main buffer, appropriately. */
+ const u8 *end;
+
+ /** \brief the amount to adjust to go from a pointer in the zones region
+ * (between start and end) to a pointer in the original data buffer. */
+ ptrdiff_t zone_pointer_adjust;
+
+ /** \brief firstFloodDetect from FDR_Runtime_Args for non-boundary zones,
+ * otherwise end of the zone buf. floodPtr always points inside the same
+ * buffer as the start pointer. */
+ const u8 *floodPtr;
+};
+
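+/* OR-ing zone_or_mask[z->shift] into the scan state poisons its leading
+ * bytes, so that no matches are reported over the first `shift` positions
+ * of a zone, which cover already-scanned or garbage data. */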
+static
+const ALIGN_CL_DIRECTIVE u8 zone_or_mask[ITER_BYTES+1][ITER_BYTES] = {
+ { 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
+ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00 },
+ { 0xff, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
+ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00 },
+ { 0xff, 0xff, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
+ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00 },
+ { 0xff, 0xff, 0xff, 0x00, 0x00, 0x00, 0x00, 0x00,
+ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00 },
+ { 0xff, 0xff, 0xff, 0xff, 0x00, 0x00, 0x00, 0x00,
+ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00 },
+ { 0xff, 0xff, 0xff, 0xff, 0xff, 0x00, 0x00, 0x00,
+ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00 },
+ { 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0x00, 0x00,
+ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00 },
+ { 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0x00,
+ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00 },
+ { 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
+ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00 },
+ { 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
+ 0xff, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00 },
+ { 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
+ 0xff, 0xff, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00 },
+ { 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
+ 0xff, 0xff, 0xff, 0x00, 0x00, 0x00, 0x00, 0x00 },
+ { 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
+ 0xff, 0xff, 0xff, 0xff, 0x00, 0x00, 0x00, 0x00 },
+ { 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
+ 0xff, 0xff, 0xff, 0xff, 0xff, 0x00, 0x00, 0x00 },
+ { 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
+ 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0x00, 0x00 },
+ { 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
+ 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0x00 },
+ { 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
+ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00 }
+};
+
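+/* Pick the per-architecture get_conf_stride()/do_confirm_fdr()
+ * implementation; the SIMDe backend reuses the x86 version through
+ * emulated intrinsics. */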
+#if defined(VS_SIMDE_BACKEND)
+#include "x86/fdr_impl.h"
+#else
+#if defined(ARCH_IA32) || defined(ARCH_X86_64)
+#include "x86/fdr_impl.h"
+#elif defined(ARCH_ARM32) || defined(ARCH_AARCH64)
+#include "arm/fdr_impl.h"
+#elif defined(ARCH_PPC64EL)
+#include "ppc64le/fdr_impl.h"
+#endif
+#endif
--- /dev/null
+/*
+ * Copyright (c) 2015-2017, Intel Corporation
+ * Copyright (c) 2020-2025, VectorCamp PC
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *
+ * * Redistributions of source code must retain the above copyright notice,
+ * this list of conditions and the following disclaimer.
+ * * Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ * * Neither the name of Intel Corporation nor the names of its contributors
+ * may be used to endorse or promote products derived from this software
+ * without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
+ * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+ * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+ * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
+ * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
+ * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+ * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
+ * POSSIBILITY OF SUCH DAMAGE.
+ */
+
+#ifndef FDR_IMPL_PPC64LE_H
+#define FDR_IMPL_PPC64LE_H
+
+static really_inline
+void get_conf_stride(const u8 *itPtr, UNUSED const u8 *start_ptr,
+ UNUSED const u8 *end_ptr, u32 domain_mask, u8 stride,
+ const u64a *ft, u64a *conf0, u64a *conf8, m128 *s) {
+ assert(itPtr >= start_ptr && itPtr + ITER_BYTES <= end_ptr);
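+ /* Merged get_conf_stride_{1,2,4}: all strides share the stride-4 lookups,
+ * and strides 2 and 1 fall through to add their extra per-position work. */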
+
+ // get_conf_stride_4
+ u64a it_hi = *(const u64a *)itPtr;
+ u64a it_lo = *(const u64a *)(itPtr + 8);
+ u64a reach0 = domain_mask & it_hi;
+ u64a reach4 = domain_mask & (it_hi >> 32);
+ u64a reach8 = domain_mask & it_lo;
+ u64a reach12 = domain_mask & (it_lo >> 32);
+
+ m128 st0 = load_m128_from_u64a(ft + reach0);
+ m128 st4 = load_m128_from_u64a(ft + reach4);
+ m128 st8 = load_m128_from_u64a(ft + reach8);
+ m128 st12 = load_m128_from_u64a(ft + reach12);
+
+ st4 = lshiftbyte_m128(st4, 4);
+ st12 = lshiftbyte_m128(st12, 4);
+
+ *s = or128(*s, st0);
+ *s = or128(*s, st4);
+
+ if (stride == 4) {
+ *conf0 = movq(*s);
+ *s = rshiftbyte_m128(*s, 8);
+ *conf0 ^= ~0ULL;
+
+ *s = or128(*s, st8);
+ *s = or128(*s, st12);
+ *conf8 = movq(*s);
+ *s = rshiftbyte_m128(*s, 8);
+ *conf8 ^= ~0ULL;
+ return;
+ }
+
+ // get_conf_stride_2
+ u64a reach2 = domain_mask & (it_hi >> 16);
+ u64a reach6 = domain_mask & (it_hi >> 48);
+ u64a reach10 = domain_mask & (it_lo >> 16);
+ u64a reach14 = domain_mask & (it_lo >> 48);
+
+ m128 st2 = load_m128_from_u64a(ft + reach2);
+ m128 st6 = load_m128_from_u64a(ft + reach6);
+ m128 st10 = load_m128_from_u64a(ft + reach10);
+ m128 st14 = load_m128_from_u64a(ft + reach14);
+
+ st2 = lshiftbyte_m128(st2, 2);
+ st6 = lshiftbyte_m128(st6, 6);
+ st10 = lshiftbyte_m128(st10, 2);
+ st14 = lshiftbyte_m128(st14, 6);
+
+ *s = or128(*s, st2);
+ *s = or128(*s, st6);
+
+ if (stride == 2) {
+ *conf0 = movq(*s);
+ *s = rshiftbyte_m128(*s, 8);
+ *conf0 ^= ~0ULL;
+
+ *s = or128(*s, st8);
+ *s = or128(*s, st10);
+ *s = or128(*s, st12);
+ *s = or128(*s, st14);
+
+ *conf8 = movq(*s);
+ *s = rshiftbyte_m128(*s, 8);
+ *conf8 ^= ~0ULL;
+ return;
+ }
+
+ // get_conf_stride_1
+ u64a reach1 = domain_mask & (it_hi >> 8);
+ u64a reach3 = domain_mask & (it_hi >> 24);
+ u64a reach5 = domain_mask & (it_hi >> 40);
+ u64a reach7 = domain_mask & ((it_hi >> 56) | (it_lo << 8));
+ u64a reach9 = domain_mask & (it_lo >> 8);
+ u64a reach11 = domain_mask & (it_lo >> 24);
+ u64a reach13 = domain_mask & (it_lo >> 40);
+ u64a reach15 = domain_mask & unaligned_load_u32(itPtr + 15);
+
+ m128 st1 = load_m128_from_u64a(ft + reach1);
+ m128 st3 = load_m128_from_u64a(ft + reach3);
+ m128 st5 = load_m128_from_u64a(ft + reach5);
+ m128 st7 = load_m128_from_u64a(ft + reach7);
+ m128 st9 = load_m128_from_u64a(ft + reach9);
+ m128 st11 = load_m128_from_u64a(ft + reach11);
+ m128 st13 = load_m128_from_u64a(ft + reach13);
+ m128 st15 = load_m128_from_u64a(ft + reach15);
+
+ st1 = lshiftbyte_m128(st1, 1);
+ st3 = lshiftbyte_m128(st3, 3);
+ st5 = lshiftbyte_m128(st5, 5);
+ st7 = lshiftbyte_m128(st7, 7);
+ st9 = lshiftbyte_m128(st9, 1);
+ st11 = lshiftbyte_m128(st11, 3);
+ st13 = lshiftbyte_m128(st13, 5);
+ st15 = lshiftbyte_m128(st15, 7);
+
+ st0 = or128(st0, st1);
+ st2 = or128(st2, st3);
+ st4 = or128(st4, st5);
+ st6 = or128(st6, st7);
+ st0 = or128(st0, st2);
+ st4 = or128(st4, st6);
+ st0 = or128(st0, st4);
+
+ st8 = or128(st8, st9);
+ st10 = or128(st10, st11);
+ st12 = or128(st12, st13);
+ st14 = or128(st14, st15);
+ st8 = or128(st8, st10);
+ st12 = or128(st12, st14);
+ st8 = or128(st8, st12);
+
+ m128 st = or128(*s, st0);
+ *conf0 = movq(st) ^ ~0ULL;
+ st = rshiftbyte_m128(st, 8);
+ st = or128(st, st8);
+
+ *conf8 = movq(st) ^ ~0ULL;
+ *s = rshiftbyte_m128(st, 8);
+}
+
+static really_inline
+void do_confirm_fdr(u64a *conf, u8 offset, hwlmcb_rv_t *control,
+ const u32 *confBase, const struct FDR_Runtime_Args *a,
+ const u8 *ptr, u32 *last_match_id, const struct zone *z) {
+ const u8 bucket = 8;
+
+ if (likely(!*conf)) {
+ return;
+ }
+
+ /* ptr currently refers to a location in the zone's buffer; we also need
+ * a pointer into the original, main buffer for the final string compare.
+ */
+ const u8 *ptr_main = (const u8 *)((uintptr_t)ptr + z->zone_pointer_adjust); //NOLINT(performance-no-int-to-ptr)
+
+ const u8 *confLoc = ptr;
+
+ do {
+ u32 bit = findAndClearLSB_64(conf);
+ u32 byte = bit / bucket + offset;
+ u32 bitRem = bit % bucket;
+ u32 idx = bitRem;
+ u32 cf = confBase[idx];
+ if (!cf) {
+ continue;
+ }
+ const struct FDRConfirm *fdrc = (const struct FDRConfirm *)
+ ((const u8 *)confBase + cf);
+ if (!(fdrc->groups & *control)) {
+ continue;
+ }
+ u64a confVal = unaligned_load_u64a(confLoc + byte - sizeof(u64a) + 1);
+ confWithBit(fdrc, a, ptr_main - a->buf + byte, control,
+ last_match_id, confVal, conf, bit);
+ } while (unlikely(!!*conf));
+}
+
+#endif // FDR_IMPL_PPC64LE_H
--- /dev/null
+/*
+ * Copyright (c) 2015-2017, Intel Corporation
+ * Copyright (c) 2020-2025, VectorCamp PC
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *
+ * * Redistributions of source code must retain the above copyright notice,
+ * this list of conditions and the following disclaimer.
+ * * Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ * * Neither the name of Intel Corporation nor the names of its contributors
+ * may be used to endorse or promote products derived from this software
+ * without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
+ * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+ * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+ * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
+ * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
+ * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+ * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
+ * POSSIBILITY OF SUCH DAMAGE.
+ */
+
+#ifndef FDR_IMPL_X86_H
+#define FDR_IMPL_X86_H
+
+static really_inline
+void get_conf_stride(const u8 *itPtr, UNUSED const u8 *start_ptr,
+ UNUSED const u8 *end_ptr, u32 domain_mask, u8 stride,
+ const u64a *ft, u64a *conf0, u64a *conf8, m128 *s) {
+ assert(itPtr >= start_ptr && itPtr + ITER_BYTES <= end_ptr);
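+ /* As in the other backends, all strides share the stride-4 lookups and
+ * strides 2 and 1 fall through to add their extra per-position work. */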
+
+ // get_conf_stride_4
+ u64a it_hi = *(const u64a *)itPtr;
+ u64a it_lo = *(const u64a *)(itPtr + 8);
+ u64a reach0 = domain_mask & it_hi;
+ u64a reach4 = domain_mask & (it_hi >> 32);
+ u64a reach8 = domain_mask & it_lo;
+ u64a reach12 = domain_mask & (it_lo >> 32);
+
+ m128 st0 = load_m128_from_u64a(ft + reach0);
+ m128 st4 = load_m128_from_u64a(ft + reach4);
+ m128 st8 = load_m128_from_u64a(ft + reach8);
+ m128 st12 = load_m128_from_u64a(ft + reach12);
+
+ st4 = lshiftbyte_m128(st4, 4);
+ st12 = lshiftbyte_m128(st12, 4);
+
+ *s = or128(*s, st0);
+ *s = or128(*s, st4);
+
+ if (stride == 4) {
+ *conf0 = movq(*s);
+ *s = rshiftbyte_m128(*s, 8);
+ *conf0 ^= ~0ULL;
+
+ *s = or128(*s, st8);
+ *s = or128(*s, st12);
+ *conf8 = movq(*s);
+ *s = rshiftbyte_m128(*s, 8);
+ *conf8 ^= ~0ULL;
+ return;
+ }
+
+ // get_conf_stride_2
+ u64a reach2 = domain_mask & (it_hi >> 16);
+ u64a reach6 = domain_mask & (it_hi >> 48);
+ u64a reach10 = domain_mask & (it_lo >> 16);
+ u64a reach14 = domain_mask & (it_lo >> 48);
+
+ m128 st2 = load_m128_from_u64a(ft + reach2);
+ m128 st6 = load_m128_from_u64a(ft + reach6);
+ m128 st10 = load_m128_from_u64a(ft + reach10);
+ m128 st14 = load_m128_from_u64a(ft + reach14);
+
+ st2 = lshiftbyte_m128(st2, 2);
+ st6 = lshiftbyte_m128(st6, 6);
+ st10 = lshiftbyte_m128(st10, 2);
+ st14 = lshiftbyte_m128(st14, 6);
+
+ *s = or128(*s, st2);
+ *s = or128(*s, st6);
+
+ if (stride == 2) {
+ *conf0 = movq(*s);
+ *s = rshiftbyte_m128(*s, 8);
+ *conf0 ^= ~0ULL;
+
+ *s = or128(*s, st8);
+ *s = or128(*s, st10);
+ *s = or128(*s, st12);
+ *s = or128(*s, st14);
+
+ *conf8 = movq(*s);
+ *s = rshiftbyte_m128(*s, 8);
+ *conf8 ^= ~0ULL;
+ return;
+ }
+
+ // get_conf_stride_1
+ u64a reach1 = domain_mask & (it_hi >> 8);
+ u64a reach3 = domain_mask & (it_hi >> 24);
+ u64a reach5 = domain_mask & (it_hi >> 40);
+ u64a reach7 = domain_mask & ((it_hi >> 56) | (it_lo << 8));
+ u64a reach9 = domain_mask & (it_lo >> 8);
+ u64a reach11 = domain_mask & (it_lo >> 24);
+ u64a reach13 = domain_mask & (it_lo >> 40);
+ u64a reach15 = domain_mask & unaligned_load_u32(itPtr + 15);
+
+ m128 st1 = load_m128_from_u64a(ft + reach1);
+ m128 st3 = load_m128_from_u64a(ft + reach3);
+ m128 st5 = load_m128_from_u64a(ft + reach5);
+ m128 st7 = load_m128_from_u64a(ft + reach7);
+ m128 st9 = load_m128_from_u64a(ft + reach9);
+ m128 st11 = load_m128_from_u64a(ft + reach11);
+ m128 st13 = load_m128_from_u64a(ft + reach13);
+ m128 st15 = load_m128_from_u64a(ft + reach15);
+
+ st1 = lshiftbyte_m128(st1, 1);
+ st3 = lshiftbyte_m128(st3, 3);
+ st5 = lshiftbyte_m128(st5, 5);
+ st7 = lshiftbyte_m128(st7, 7);
+ st9 = lshiftbyte_m128(st9, 1);
+ st11 = lshiftbyte_m128(st11, 3);
+ st13 = lshiftbyte_m128(st13, 5);
+ st15 = lshiftbyte_m128(st15, 7);
+
+ st0 = or128(st0, st1);
+ st2 = or128(st2, st3);
+ st4 = or128(st4, st5);
+ st6 = or128(st6, st7);
+ st0 = or128(st0, st2);
+ st4 = or128(st4, st6);
+ st0 = or128(st0, st4);
+
+ st8 = or128(st8, st9);
+ st10 = or128(st10, st11);
+ st12 = or128(st12, st13);
+ st14 = or128(st14, st15);
+ st8 = or128(st8, st10);
+ st12 = or128(st12, st14);
+ st8 = or128(st8, st12);
+
+ m128 st = or128(*s, st0);
+ *conf0 = movq(st) ^ ~0ULL;
+ st = rshiftbyte_m128(st, 8);
+ st = or128(st, st8);
+
+ *conf8 = movq(st) ^ ~0ULL;
+ *s = rshiftbyte_m128(st, 8);
+}
+
+static really_inline
+void do_confirm_fdr(u64a *conf, u8 offset, hwlmcb_rv_t *control,
+ const u32 *confBase, const struct FDR_Runtime_Args *a,
+ const u8 *ptr, u32 *last_match_id, const struct zone *z) {
+ const u8 bucket = 8;
+
+ if (likely(!*conf)) {
+ return;
+ }
+
+ /* ptr currently refers to a location in the zone's buffer; we also need
+ * a pointer into the original, main buffer for the final string compare.
+ */
+ const u8 *ptr_main = (const u8 *)((uintptr_t)ptr + z->zone_pointer_adjust); //NOLINT(performance-no-int-to-ptr)
+
+ const u8 *confLoc = ptr;
+
+ do {
+ u32 bit = findAndClearLSB_64(conf);
+ u32 byte = bit / bucket + offset;
+ u32 bitRem = bit % bucket;
+ u32 idx = bitRem;
+ u32 cf = confBase[idx];
+ if (!cf) {
+ continue;
+ }
+ const struct FDRConfirm *fdrc = (const struct FDRConfirm *)
+ ((const u8 *)confBase + cf);
+ if (!(fdrc->groups & *control)) {
+ continue;
+ }
+ u64a confVal = unaligned_load_u64a(confLoc + byte - sizeof(u64a) + 1);
+ confWithBit(fdrc, a, ptr_main - a->buf + byte, control,
+ last_match_id, confVal, conf, bit);
+ } while (unlikely(!!*conf));
+}
+
+#endif // FDR_IMPL_X86_H
return (m128) vdupq_n_u64(c);
}
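+/* Set 32-bit lane `imm` of `in` to `val`; a macro because NEON requires the
+ * lane index to be a compile-time constant. */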
+#define insert32_m128(in, val, imm) ((m128) vsetq_lane_u32(val, (uint32x4_t)in, imm))
+
static really_inline u32 movd(const m128 in) {
return vgetq_lane_u32((uint32x4_t) in, 0);
}
return (m128) vld1q_u64((uint64_t *) data);
}
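+/* Zero-extend the two low (widenlo128) or two high (widenhi128) 32-bit
+ * lanes of x into 64-bit lanes. */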
+static really_inline
+m128 widenlo128(m128 x) {
+ return (m128) vmovl_u32(vget_low_u32((uint32x4_t)x));
+}
+
+static really_inline
+m128 widenhi128(m128 x) {
+ return (m128) vmovl_u32(vget_high_u32((uint32x4_t)x));
+}
+
#endif // ARCH_ARM_SIMD_UTILS_H
return rv;
}
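+/* Zero-extend the four 32-bit lanes of x into the four 64-bit lanes of an
+ * m256: lo receives lanes 0-1, hi receives lanes 2-3. */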
+static really_inline
+m256 widen128(m128 x) {
+ m256 rv;
+ rv.lo = widenlo128(x);
+ rv.hi = widenhi128(x);
+ return rv;
+}
+
#endif // HAVE_SIMD_256_BITS
/****
return (m128) v;
}
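+/* Zero-extend 32-bit lanes of x to 64-bit lanes by interleaving with a
+ * zero vector; vec_mergel/vec_mergeh select the halves in little-endian
+ * element order. */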
+static really_inline
+m128 widenlo128(m128 x) {
+ return (m128) vec_mergel((m128)x, zeroes128());
+}
+
+static really_inline
+m128 widenhi128(m128 x) {
+ return (m128) vec_mergeh((m128)x, zeroes128());
+}
+
#if defined(__clang__) && (__clang_major__ == 15)
#pragma clang diagnostic pop
#endif // defined(__clang__) && (__clang_major__ == 15)
return (m128) _mm_sub_epi64(a, b);
}
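+/* Per-lane 32-bit left shift; the immediate form is used when the shift
+ * count is a compile-time constant, otherwise the variable-count form. */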
+static really_really_inline
+m128 lshift32_m128(m128 a, unsigned b) {
+#if defined(HAVE__BUILTIN_CONSTANT_P)
+ if (__builtin_constant_p(b)) {
+ return _mm_slli_epi32(a, b);
+ }
+#endif
+ m128 x = _mm_cvtsi32_si128(b);
+ return _mm_sll_epi32(a, x);
+}
+
static really_really_inline
m128 lshift64_m128(m128 a, unsigned b) {
#if defined(HAVE__BUILTIN_CONSTANT_P)
return _mm_set1_epi64x(c);
}
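+/* Set 32-bit lane `imm` of `in` to `val`; a macro because _mm_insert_epi32
+ * requires an immediate lane index. */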
+#define insert32_m128(in, val, imm) ((m128) _mm_insert_epi32((in), (val), (imm)))
+
static really_inline u32 movd(const m128 in) {
return _mm_cvtsi128_si32(in);
}
return _mm_set_epi64x(hi, lo);
}
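+/* Zero-extend the two low (widenlo128) or two high (widenhi128) 32-bit
+ * lanes of x to 64-bit lanes by interleaving with zero. */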
+static really_inline
+m128 widenlo128(m128 x) {
+ return _mm_unpacklo_epi32(x, zeroes128());
+}
+
+static really_inline
+m128 widenhi128(m128 x) {
+ return _mm_unpackhi_epi32(x, zeroes128());
+}
+
/****
**** 256-bit Primitives
****/
return insert128to256(cast128to256(lo), hi, 1);
#endif
}
+
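+/* AVX2 zero-extends all four 32-bit lanes to 64-bit in a single
+ * instruction. */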
+static really_inline
+m256 widen128(m128 x) {
+ return (m256) _mm256_cvtepu32_epi64(x);
+}
+
#endif //AVX2
/****