src/config.h
src/config.h.in
src/hs_version.h
-src/fdr/fdr_autogen.c
-src/fdr/fdr_autogen_compiler.cpp
-src/fdr/teddy_autogen.c
-src/fdr/teddy_autogen_compiler.cpp
src/parser/Parser.cpp
# Generated PCRE files
set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} ${EXTRA_CXX_FLAGS}")
-# include the autogen targets
-add_subdirectory(src/fdr)
-
-include_directories(${PROJECT_BINARY_DIR}/src/fdr)
-
if(NOT WIN32)
set(RAGEL_C_FLAGS "-Wno-unused")
endif()
)
install(FILES ${hs_HEADERS} DESTINATION include/hs)
-set(fdr_autogen_targets autogen_runtime autogen_teddy_runtime)
-
set (hs_exec_SRCS
${hs_HEADERS}
src/hs_version.h
src/fdr/flood_runtime.h
src/fdr/fdr_loadval.h
src/fdr/teddy.c
+ src/fdr/teddy_avx2.c
+ src/fdr/teddy.h
src/fdr/teddy_internal.h
+ src/fdr/teddy_runtime_common.h
src/hwlm/hwlm.c
src/hwlm/hwlm.h
src/hwlm/hwlm_internal.h
set (LIB_SOVERSION ${HS_MAJOR_VERSION}.${HS_MINOR_VERSION})
add_library(hs_exec OBJECT ${hs_exec_SRCS})
-add_dependencies(hs_exec ${fdr_autogen_targets})
if (BUILD_STATIC_AND_SHARED OR BUILD_SHARED_LIBS)
add_library(hs_exec_shared OBJECT ${hs_exec_SRCS})
-add_dependencies(hs_exec_shared ${fdr_autogen_targets})
set_target_properties(hs_exec_shared PROPERTIES
POSITION_INDEPENDENT_CODE TRUE)
endif()
add_library(hs STATIC ${hs_SRCS} $<TARGET_OBJECTS:hs_exec>)
add_dependencies(hs ragel_Parser)
-add_dependencies(hs autogen_teddy_compiler)
if (NOT BUILD_SHARED_LIBS)
install(TARGETS hs DESTINATION lib)
if (BUILD_STATIC_AND_SHARED OR BUILD_SHARED_LIBS)
add_library(hs_shared SHARED ${hs_SRCS} $<TARGET_OBJECTS:hs_exec_shared>)
add_dependencies(hs_shared ragel_Parser)
- add_dependencies(hs_shared autogen_compiler autogen_teddy_compiler)
set_target_properties(hs_shared PROPERTIES
OUTPUT_NAME hs
VERSION ${LIB_VERSION}
+++ /dev/null
-# The set of rules and other nastiness for generating FDR/Teddy source
-
-# we need to add these as explicit dependencies
-set(AUTOGEN_PY_FILES
- arch.py
- autogen.py
- autogen_utils.py
- teddy_autogen.py
-)
-
-function(fdr_autogen type out)
- add_custom_command (
- COMMENT "AUTOGEN ${out}"
- OUTPUT "${CMAKE_CURRENT_BINARY_DIR}/${out}"
- COMMAND ${PYTHON} "${CMAKE_CURRENT_SOURCE_DIR}/autogen.py" ${type} > "${CMAKE_CURRENT_BINARY_DIR}/${out}"
- DEPENDS ${AUTOGEN_PY_FILES}
- )
- add_custom_target(autogen_${type} DEPENDS "${CMAKE_CURRENT_BINARY_DIR}/${out}")
-endfunction(fdr_autogen)
-
-#now build the functions
-fdr_autogen(runtime fdr_autogen.c)
-fdr_autogen(teddy_runtime teddy_autogen.c)
-fdr_autogen(teddy_compiler teddy_autogen_compiler.cpp)
-
-set(fdr_GENERATED_SRC
- ${PROJECT_BINARY_DIR}/src/fdr/fdr_autogen.c
- ${PROJECT_BINARY_DIR}/src/fdr/teddy_autogen.c
- ${PROJECT_BINARY_DIR}/src/fdr/teddy_autogen_compiler.cpp
- PARENT_SCOPE)
-
-set_source_files_properties(${fdr_GENERATED_SRC} PROPERTIES GENERATED TRUE)
-include_directories(${CMAKE_CURRENT_BINARY_DIR})
+++ /dev/null
-#!/usr/bin/python
-
-# Copyright (c) 2015, Intel Corporation
-#
-# Redistribution and use in source and binary forms, with or without
-# modification, are permitted provided that the following conditions are met:
-#
-# * Redistributions of source code must retain the above copyright notice,
-# this list of conditions and the following disclaimer.
-# * Redistributions in binary form must reproduce the above copyright
-# notice, this list of conditions and the following disclaimer in the
-# documentation and/or other materials provided with the distribution.
-# * Neither the name of Intel Corporation nor the names of its contributors
-# may be used to endorse or promote products derived from this software
-# without specific prior written permission.
-#
-# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
-# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
-# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
-# DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE
-# FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
-# DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
-# SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
-# CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
-# OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
-# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
-
-import autogen_utils
-
-# wrapper for architectures
-
-class Arch:
- def __init__(self, name, extensions = []):
- self.name = name
- self.extensions = extensions
- self.target = None
-
- def get_guard(self):
- # these defines definitely fall into the "belt-and-suspenders"
- # category of paranoia
- if (self.guard_list == []):
- return "#if 1"
-
- return "#if " + " && ".join(self.guard_list)
-
-class X86Arch(Arch):
- def __init__(self, name, extensions = []):
- Arch.__init__(self, name, extensions)
- self.guard_list = [ ]
- self.target = "0"
-
- if "AVX2" in extensions:
- self.target += " | HS_CPU_FEATURES_AVX2"
- self.guard_list += [ "defined(__AVX2__)" ]
-
-
-arch_x86_64 = X86Arch("x86_64", extensions = [ ])
-arch_x86_64_avx2 = X86Arch("x86_64_avx2", extensions = [ "AVX2" ])
+++ /dev/null
-#!/usr/bin/python
-
-# Copyright (c) 2015-2016, Intel Corporation
-#
-# Redistribution and use in source and binary forms, with or without
-# modification, are permitted provided that the following conditions are met:
-#
-# * Redistributions of source code must retain the above copyright notice,
-# this list of conditions and the following disclaimer.
-# * Redistributions in binary form must reproduce the above copyright
-# notice, this list of conditions and the following disclaimer in the
-# documentation and/or other materials provided with the distribution.
-# * Neither the name of Intel Corporation nor the names of its contributors
-# may be used to endorse or promote products derived from this software
-# without specific prior written permission.
-#
-# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
-# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
-# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
-# DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE
-# FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
-# DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
-# SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
-# CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
-# OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
-# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
-
-import sys
-from autogen_utils import *
-from teddy_autogen import *
-from arch import *
-
-# teddy setup
-
-def build_teddy_matchers():
- all_matchers = [ ]
-
- # AVX2
- all_matchers += [ MTFast(arch = arch_x86_64_avx2, packed = False) ]
- all_matchers += [ MTFast(arch = arch_x86_64_avx2, packed = True) ]
- for n_msk in range(1, 5):
- all_matchers += [ MTFat(arch = arch_x86_64_avx2, packed = False, num_masks = n_msk, num_buckets = 16) ]
- all_matchers += [ MTFat(arch = arch_x86_64_avx2, packed = True, num_masks = n_msk, num_buckets = 16) ]
-
- # SSE/SSE2/SSSE3
- for n_msk in range(1, 5):
- all_matchers += [ MT(arch = arch_x86_64, packed = False, num_masks = n_msk, num_buckets = 8) ]
- all_matchers += [ MT(arch = arch_x86_64, packed = True, num_masks = n_msk, num_buckets = 8) ]
-
- return all_matchers
-
-def produce_teddy_compiles(l):
- print "void getTeddyDescriptions(vector<TeddyEngineDescription> *out) {"
- print " static const TeddyEngineDef defns[] = {"
- for m in l:
- m.produce_compile_call()
- print " };"
- print " out->clear();"
- print " for (size_t i = 0; i < ARRAY_LENGTH(defns); i++) {"
- print " out->push_back(TeddyEngineDescription(defns[i]));"
- print " }"
- print "}"
-
-# see below - we don't produce our 'zeros' at the point of the teddy runtimes as they
-# are linked. So we either generate the function or we don't - then at the point of the
-# header in fdr_autogen.c we either generate the header or we #define the zero.
-
-def produce_teddy_runtimes(l):
- # Since we're using -Wmissing-prototypes, we need headers first.
- for m in l:
- m.produce_guard()
- print m.produce_header(visible = True, header_only = True)
- m.close_guard()
-
- for m in l:
- m.produce_guard()
- m.produce_code()
- m.close_guard()
-
-# see produce_teddy_runtimes() comment for the rationale
-
-def produce_teddy_headers(l):
- for m in l:
- m.produce_guard()
- print m.produce_header(visible = True, header_only = True)
- m.produce_zero_alternative()
-
-# general utilities
-
-def make_fdr_function_pointers(matcher_list):
- print """
-typedef hwlm_error_t (*FDRFUNCTYPE)(const struct FDR *fdr, const struct FDR_Runtime_Args *a);
-static FDRFUNCTYPE funcs[] = {
-"""
- all_funcs = " fdr_engine_exec,\n"
- all_funcs += ",\n".join([ " %s" % m.get_name() for m in matcher_list ])
- print all_funcs
- print """
-};
-"""
-
-def assign_ids(matcher_list, next_id):
- for m in matcher_list:
- m.id = next_id
- next_id += 1
- return next_id
-
-# Main entry point
-
-tm = build_teddy_matchers()
-next_id = assign_ids(tm, 1)
-if sys.argv[1] == "runtime":
- produce_teddy_headers(tm)
- make_fdr_function_pointers(tm)
-elif sys.argv[1] == "teddy_runtime":
- produce_teddy_runtimes(tm)
-elif sys.argv[1] == "teddy_compiler":
- produce_teddy_compiles(tm)
+++ /dev/null
-#!/usr/bin/python
-
-# Copyright (c) 2015-2016, Intel Corporation
-#
-# Redistribution and use in source and binary forms, with or without
-# modification, are permitted provided that the following conditions are met:
-#
-# * Redistributions of source code must retain the above copyright notice,
-# this list of conditions and the following disclaimer.
-# * Redistributions in binary form must reproduce the above copyright
-# notice, this list of conditions and the following disclaimer in the
-# documentation and/or other materials provided with the distribution.
-# * Neither the name of Intel Corporation nor the names of its contributors
-# may be used to endorse or promote products derived from this software
-# without specific prior written permission.
-#
-# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
-# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
-# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
-# DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE
-# FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
-# DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
-# SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
-# CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
-# OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
-# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
-
-import sys
-
-def fail_out(msg = ""):
- print >>sys.stderr, "Internal failure in autogen.py: " + msg
- sys.exit(1)
-
-class IntegerType:
- def __init__(self, size):
- self.size = size
-
- def get_name(self):
- return { 256: "m256", 128 : "m128", 64 : "u64a", 32 : "u32" , 16 : "u16", 8 : "u8"}[self.size]
-
- def size_in_bytes(self):
- return self.size / 8
-
- def zero_expression(self):
- return "0"
-
- def constant_to_string(self, n):
- if self.size == 64:
- suffix = "ULL"
- else:
- suffix = ""
- return "0x%x%s" % (n & ((1 << self.size) - 1), suffix)
-
- def lowbits(self, n):
- return (1 << n) - 1
-
- def highbits(self, n):
- return ~(self.lowbits(self.size - n))
-
- def lowbit_mask(self, n):
- return self.constant_to_string(self.lowbits(n))
-
- def lowbit_extract_expr(self, expr_string, n):
- return "(%s & %s)" % ( expr_string, self.lowbit_mask(n))
-
- def flip_lowbits_expr(self, expr_string, n):
- return "(%s ^ %s)" % ( expr_string, self.lowbit_mask(n))
-
- def bit_extract_expr(self, expr_string, low, high):
- lbm = self.lowbit_mask(high - low)
- return "((%s >> %d) & %s)" % (expr_string, low, lbm)
-
- # shifts are +ve if left and -ve if right
- def shift_expr(self, expr_string, n):
- if n <= -self.size or n >= self.size:
- return self.zero_expression()
- elif (n > 0):
- return "(%s << %d)" % (expr_string, n)
- elif (n < 0):
- return "(%s >> %d)" % (expr_string, -n)
- else:
- return "(%s)" % (expr_string)
-
-class SIMDIntegerType(IntegerType):
- def __init__(self, size):
- IntegerType.__init__(self, size)
-
- def zero_expression(self):
- return "zeroes128()"
-
- def lowbit_extract_expr(self, expr_string, n):
- if (n <= 32):
- tmpType = IntegerType(32)
- tmpExpr = "movd(%s)" % expr_string
- elif (32 < n <= 64):
- tmpType = IntegerType(64)
- tmpExpr = "movq(%s)" % expr_string
- return tmpType.lowbit_extract_expr(tmpExpr, n)
-
- def bit_extract_expr(self, expr_string, low, high, flip):
- fail_out("Unimplemented bit extract on m128")
-
- def shift_expr(self, expr_string, n):
- if n % 8 != 0:
- fail_out("Trying to shift a m128 by a bit granular value")
-
- # should check that n is divisible by 8
- if n <= -self.size or n >= self.size:
- return self.zero_expression()
- elif (n > 0):
- return "byteShiftLeft128(%s, %s)" % (expr_string, n / 8)
- elif (n < 0):
- return "byteShiftRight128(%s, %s)" % (expr_string, -n / 8)
- else:
- return "(%s)" % (expr_string)
-
- def lowbit_mask(self, n):
- if n % 8 != 0:
- fail_out("Trying to make a lowbit mask in a m128 by a bit granular value")
- return self.shift_expr("ones128()", -(128 - n))
/*
- * Copyright (c) 2015, Intel Corporation
+ * Copyright (c) 2015-2016, Intel Corporation
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions are met:
u32 getNumBuckets() const { return numBuckets; }
u32 getConfirmPullBackDistance() const { return confirmPullBackDistance; }
u32 getConfirmTopLevelSplit() const { return confirmTopLevelSplit; }
+ void setConfirmTopLevelSplit(u32 split) { confirmTopLevelSplit = split; } // overwrites the value that getConfirmTopLevelSplit() returns
bool isValidOnTarget(const target_t &target_in) const;
virtual u32 getDefaultFloodSuffixLength() const = 0;
#include "fdr_loadval.h"
#include "fdr_streaming_runtime.h"
#include "flood_runtime.h"
+#include "teddy.h"
#include "teddy_internal.h"
#include "util/simd_utils.h"
#include "util/simd_utils_ssse3.h"
return HWLM_SUCCESS;
}
-#include "fdr_autogen.c"
+#if defined(__AVX2__) /* AVX2 build: ONLY_AVX2(f) is just f */
+#define ONLY_AVX2(func) func
+#else /* non-AVX2 build: the table slot is kept but holds NULL */
+#define ONLY_AVX2(func) NULL
+#endif /* defined(__AVX2__) */
+
+typedef hwlm_error_t (*FDRFUNCTYPE)(const struct FDR *fdr, const struct FDR_Runtime_Args *a); /* signature shared by every entry in funcs[] */
+static const FDRFUNCTYPE funcs[] = { /* NOTE(review): presumably indexed by engine ID, so entry order and count must not change - confirm at the dispatch site */
+ fdr_engine_exec, /* slot 0 */
+ ONLY_AVX2(fdr_exec_teddy_avx2_msks1_fast), /* slots 1-10: AVX2 Teddy variants, NULL without __AVX2__ */
+ ONLY_AVX2(fdr_exec_teddy_avx2_msks1_pck_fast),
+ ONLY_AVX2(fdr_exec_teddy_avx2_msks1_fat),
+ ONLY_AVX2(fdr_exec_teddy_avx2_msks1_pck_fat),
+ ONLY_AVX2(fdr_exec_teddy_avx2_msks2_fat),
+ ONLY_AVX2(fdr_exec_teddy_avx2_msks2_pck_fat),
+ ONLY_AVX2(fdr_exec_teddy_avx2_msks3_fat),
+ ONLY_AVX2(fdr_exec_teddy_avx2_msks3_pck_fat),
+ ONLY_AVX2(fdr_exec_teddy_avx2_msks4_fat),
+ ONLY_AVX2(fdr_exec_teddy_avx2_msks4_pck_fat), /* NOTE(review): a NULL slot must never be called - verify no AVX2 engine is selected for a non-AVX2 build */
+ fdr_exec_teddy_msks1, /* slots 11-18: Teddy variants listed unconditionally */
+ fdr_exec_teddy_msks1_pck,
+ fdr_exec_teddy_msks2,
+ fdr_exec_teddy_msks2_pck,
+ fdr_exec_teddy_msks3,
+ fdr_exec_teddy_msks3_pck,
+ fdr_exec_teddy_msks4,
+ fdr_exec_teddy_msks4_pck,
+};
#define FAKE_HISTORY_SIZE 16
static const u8 fake_history[FAKE_HISTORY_SIZE];
void getFdrDescriptions(vector<FDREngineDescription> *out) {
static const FDREngineDef def = {0, 128, 8, 0, 1, 256};
out->clear();
- out->push_back(FDREngineDescription(def));
+ out->emplace_back(def); // builds the one FDREngineDescription from def directly inside the vector
}
static
/*
- * Copyright (c) 2015, Intel Corporation
+ * Copyright (c) 2015-2016, Intel Corporation
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions are met:
* POSSIBILITY OF SUCH DAMAGE.
*/
-#include "config.h"
+/** \file
+ * \brief Teddy literal matcher: SSSE3 engine runtime.
+ */
+
+#include "fdr_internal.h"
+#include "flood_runtime.h"
+#include "teddy.h"
+#include "teddy_internal.h"
+#include "teddy_runtime_common.h"
#include "util/simd_utils.h"
#include "util/simd_utils_ssse3.h"
-static const u8 ALIGN_DIRECTIVE p_mask_arr[17][32] = {
+const u8 ALIGN_DIRECTIVE p_mask_arr[17][32] = {
{0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00},
{0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff}
};
-// Note: p_mask is an output param that initialises a poison mask.
-UNUSED static really_inline
-m128 vectoredLoad128(m128 *p_mask, const u8 *ptr, const u8 *lo, const u8 *hi,
- const u8 *buf_history, size_t len_history,
- const u32 nMasks) {
- union {
- u8 val8[16];
- m128 val128;
- } u;
- u.val128 = zeroes128();
-
- if (ptr >= lo) {
- u32 avail = (u32)(hi - ptr);
- if (avail >= 16) {
- *p_mask = load128((const void*)(p_mask_arr[16] + 16));
- return loadu128(ptr);
- }
- *p_mask = load128((const void*)(p_mask_arr[avail] + 16));
- for (u32 i = 0; i < avail; i++) {
- u.val8[i] = ptr[i];
- }
- } else {
- u32 need = MIN((u32)(lo - ptr), MIN(len_history, nMasks - 1));
- u32 start = (u32)(lo - ptr);
- u32 i;
- for (i = start - need; ptr + i < lo; i++) {
- u.val8[i] = buf_history[len_history - (lo - (ptr + i))];
- }
- u32 end = MIN(16, (u32)(hi - ptr));
- *p_mask = loadu128((const void*)(p_mask_arr[end - start] + 16 - start));
- for (; i < end; i++) {
- u.val8[i] = ptr[i];
- }
- }
-
- return u.val128;
+#ifdef ARCH_64_BIT /* ARCH_64_BIT defined: confirm var as two u64a halves */
+#define CONFIRM_TEDDY(var, bucket, offset, reason, conf_fn) /* uses confBase, a, ptr, control, last_match from the expansion site */ \
+do { \
+ if (unlikely(isnonzero128(var))) { /* nothing to confirm when var is all zero */ \
+ u64a lo = movq(var); /* presumably bytes 0-7 of var */ \
+ u64a hi = movq(byteShiftRight128(var, 8)); /* presumably bytes 8-15 of var */ \
+ if (unlikely(lo)) { \
+ conf_fn(&lo, bucket, offset, confBase, reason, a, ptr, \
+ control, &last_match); \
+ CHECK_HWLM_TERMINATE_MATCHING; /* presumably leaves the caller when matching must stop - confirm at its definition */ \
+ } \
+ if (unlikely(hi)) { \
+ conf_fn(&hi, bucket, offset + 8, confBase, reason, a, ptr, /* second half is reported 8 further on */ \
+ control, &last_match); \
+ CHECK_HWLM_TERMINATE_MATCHING; \
+ } \
+ } \
+} while (0); /* NOTE(review): this ';' is part of the macro, so "CONFIRM_TEDDY(...);" yields an extra empty statement and breaks if/else use - confirm all callers before dropping it */
+#else /* ARCH_64_BIT not defined: confirm var as four u32 parts */
+#define CONFIRM_TEDDY(var, bucket, offset, reason, conf_fn) /* uses confBase, a, ptr, control, last_match from the expansion site */ \
+do { \
+ if (unlikely(isnonzero128(var))) { /* nothing to confirm when var is all zero */ \
+ u32 part1 = movd(var); /* presumably bytes 0-3 of var */ \
+ u32 part2 = movd(byteShiftRight128(var, 4)); /* presumably bytes 4-7 */ \
+ u32 part3 = movd(byteShiftRight128(var, 8)); /* presumably bytes 8-11 */ \
+ u32 part4 = movd(byteShiftRight128(var, 12)); /* presumably bytes 12-15 */ \
+ if (unlikely(part1)) { \
+ conf_fn(&part1, bucket, offset, confBase, reason, a, ptr, \
+ control, &last_match); \
+ CHECK_HWLM_TERMINATE_MATCHING; /* presumably leaves the caller when matching must stop - confirm at its definition */ \
+ } \
+ if (unlikely(part2)) { \
+ conf_fn(&part2, bucket, offset + 4, confBase, reason, a, ptr, /* each later part is reported 4 further on */ \
+ control, &last_match); \
+ CHECK_HWLM_TERMINATE_MATCHING; \
+ } \
+ if (unlikely(part3)) { \
+ conf_fn(&part3, bucket, offset + 8, confBase, reason, a, ptr, \
+ control, &last_match); \
+ CHECK_HWLM_TERMINATE_MATCHING; \
+ } \
+ if (unlikely(part4)) { \
+ conf_fn(&part4, bucket, offset + 12, confBase, reason, a, ptr, \
+ control, &last_match); \
+ CHECK_HWLM_TERMINATE_MATCHING; \
+ } \
+ } \
+} while (0); /* NOTE(review): same trailing ';' caveat as the ARCH_64_BIT variant */
+#endif /* ARCH_64_BIT */
+
+/*
+ * One-mask Teddy prep: split val into its low and high 4-bit parts, look
+ * each part up in the first mask pair (maskBase[0] for low, maskBase[1] for
+ * high), AND the two lookups together and finally AND in p_mask.
+ */
+static really_inline
+m128 prep_conf_teddy_m1(const m128 *maskBase, m128 p_mask, m128 val) {
+    m128 nibble_mask = set16x8(0xf);
+    m128 lo_nibbles = and128(val, nibble_mask);
+    m128 hi_nibbles = and128(rshift2x64(val, 4), nibble_mask);
+    m128 lo_hits = pshufb(maskBase[0], lo_nibbles);
+    m128 hi_hits = pshufb(maskBase[1], hi_nibbles);
+    m128 both_hits = and128(lo_hits, hi_hits);
+    return and128(both_hits, p_mask);
}
+/*
+ * Two-mask Teddy prep: combines the one-mask result for val with the lookup
+ * in the second mask pair (maskBase[2] / maskBase[3]).
+ *
+ * old_1 is in/out state: it supplies the second-pair lookup from the
+ * previous call and is overwritten with the lookup from this call.
+ * p_mask is applied exactly once, inside prep_conf_teddy_m1().
+ */
+static really_inline
+m128 prep_conf_teddy_m2(const m128 *maskBase, m128 *old_1, m128 p_mask,
+                        m128 val) {
+    m128 mask = set16x8(0xf);
+    m128 lo = and128(val, mask);
+    m128 hi = and128(rshift2x64(val, 4), mask);
+    m128 r = prep_conf_teddy_m1(maskBase, p_mask, val); /* already ANDed with p_mask */
-#if defined(__AVX2__)
+    m128 res_1 = and128(pshufb(maskBase[1*2], lo),
+                        pshufb(maskBase[1*2+1], hi));
+    /* presumably splices the previous and current lookups so the second-pair
+     * result lines up one byte behind the first - confirm against palignr() */
+    m128 res_shifted_1 = palignr(res_1, *old_1, 16-1);
+    *old_1 = res_1;
+    /* r needs no second AND with p_mask: (x & p) & p == x & p */
+    return and128(r, res_shifted_1);
+}
+
+static really_inline /* three-mask Teddy prep: two-mask result ANDed with the third mask pair's lookup */
+m128 prep_conf_teddy_m3(const m128 *maskBase, m128 *old_1, m128 *old_2, /* old_1, old_2: in/out state carried between calls */
+ m128 p_mask, m128 val) {
+ m128 mask = set16x8(0xf);
+ m128 lo = and128(val, mask); /* low 4-bit parts of val */
+ m128 hi = and128(rshift2x64(val, 4), mask); /* high 4-bit parts of val */
+ m128 r = prep_conf_teddy_m2(maskBase, old_1, p_mask, val); /* also updates *old_1 */
-UNUSED static really_inline
-m256 vectoredLoad2x128(m256 *p_mask, const u8 *ptr, const u8 *lo, const u8 *hi,
- const u8 *buf_history, size_t len_history,
- const u32 nMasks) {
- m128 p_mask128;
- m256 ret = set2x128(vectoredLoad128(&p_mask128, ptr, lo, hi, buf_history, len_history, nMasks));
- *p_mask = set2x128(p_mask128);
- return ret;
+ m128 res_2 = and128(pshufb(maskBase[2*2], lo), /* third mask pair: maskBase[4] for lo, maskBase[5] for hi */
+ pshufb(maskBase[2*2+1], hi));
+ m128 res_shifted_2 = palignr(res_2, *old_2, 16-2); /* presumably lines the third-pair lookup up two bytes behind - confirm against palignr() */
+ *old_2 = res_2; /* keep this call's lookup for the next call */
+ return and128(r, res_shifted_2);
}
-static const u8 ALIGN_AVX_DIRECTIVE p_mask_arr256[33][64] = {
- {0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
- 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00},
- {0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
- 0xff, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00},
- {0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
- 0xff, 0xff, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00},
- {0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
- 0xff, 0xff, 0xff, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00},
- {0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
- 0xff, 0xff, 0xff, 0xff, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00},
- {0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
- 0xff, 0xff, 0xff, 0xff, 0xff, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00},
- {0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
- 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00},
- {0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
- 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00},
- {0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
- 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00},
- {0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
- 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00},
- {0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
- 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00},
- {0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
- 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00},
- {0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
- 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00},
- {0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
- 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00},
- {0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
- 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00},
- {0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
- 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00},
- {0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
- 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00},
- {0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
- 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00},
- {0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
- 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00},
- {0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
- 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00},
- {0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
- 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00},
- {0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
- 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00},
- {0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
- 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00},
- {0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
- 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00},
- {0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
- 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00},
- {0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
- 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00},
- {0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
- 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00},
- {0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
- 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0x00, 0x00, 0x00, 0x00, 0x00},
- {0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
- 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0x00, 0x00, 0x00, 0x00},
- {0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
- 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0x00, 0x00, 0x00},
- {0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
- 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0x00, 0x00},
- {0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
- 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0x00},
- {0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
- 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff}
-};
+/*
+ * Four-mask Teddy preparation step for one 16-byte block.
+ *
+ * Masks 0..2 are delegated to prep_conf_teddy_m3(); this function adds the
+ * lookup for the fourth mask pair (maskBase[6] and maskBase[7]), aligns it
+ * against the previous block's fourth-mask result held in *old_3, stores the
+ * current fourth-mask result back into *old_3 for the next call, and returns
+ * the AND of both parts.
+ */
+static really_inline
+m128 prep_conf_teddy_m4(const m128 *maskBase, m128 *old_1, m128 *old_2,
+                        m128 *old_3, m128 p_mask, m128 val) {
+    /* Low four bits of each byte, and the bits shifted down by four. */
+    const m128 nibble_mask = set16x8(0xf);
+    const m128 val_lo = and128(val, nibble_mask);
+    const m128 val_hi = and128(rshift2x64(val, 4), nibble_mask);
+
+    /* Result for the first three masks (p_mask is only forwarded here). */
+    const m128 first_three = prep_conf_teddy_m3(maskBase, old_1, old_2,
+                                                p_mask, val);
+
+    /* Fourth mask: table lookups for each nibble half, then combine. */
+    const m128 lo_lookup = pshufb(maskBase[3 * 2], val_lo);
+    const m128 hi_lookup = pshufb(maskBase[3 * 2 + 1], val_hi);
+    const m128 cur_3 = and128(lo_lookup, hi_lookup);
+
+    /* Combine with the previous block's value before overwriting it. */
+    const m128 aligned_3 = palignr(cur_3, *old_3, 16 - 3);
+    *old_3 = cur_3;
+
+    return and128(first_three, aligned_3);
+}
+
+hwlm_error_t fdr_exec_teddy_msks1(const struct FDR *fdr, // Teddy scan over a->buf, one-mask variant (prep_conf_teddy_m1)
+ const struct FDR_Runtime_Args *a) { // confirms candidates with do_confWithBit1_teddy
+ const u8 *buf_end = a->buf + a->len;
+ const u8 *ptr = a->buf + a->start_offset; // first byte to scan
+ hwlmcb_rv_t controlVal = *a->groups; // local copy; written back to *a->groups before the final return
+ hwlmcb_rv_t *control = &controlVal; // NOTE(review): not named below; presumably read by CONFIRM_TEDDY - confirm
+ u32 floodBackoff = FLOOD_BACKOFF_START; // NOTE(review): presumably used by CHECK_FLOOD - confirm
+ const u8 *tryFloodDetect = a->firstFloodDetect; // NOTE(review): presumably used by CHECK_FLOOD - confirm
+ u32 last_match = (u32)-1; // NOTE(review): presumably used by CONFIRM_TEDDY - confirm
+ const struct Teddy *teddy = (const struct Teddy *)fdr; // the FDR pointer is reinterpreted as a Teddy structure
+ const size_t iterBytes = 32; // the main loop consumes two 16-byte blocks per pass
+ DEBUG_PRINTF("params: buf %p len %zu start_offset %zu\n",
+ a->buf, a->len, a->start_offset);
+ const m128 *maskBase = getMaskBase(teddy);
+ const u32 *confBase = getConfBase(teddy, 1); // NOTE(review): presumably used by CONFIRM_TEDDY - confirm
-UNUSED static really_inline
-m256 vectoredLoad256(m256 *p_mask, const u8 *ptr, const u8 *lo, const u8 *hi,
- const u8 *buf_history, size_t len_history) {
- union {
- u8 val8[32];
- m256 val256;
- } u;
-
- if (ptr >= lo) {
- u32 avail = (u32)(hi - ptr);
- if (avail >= 32) {
- *p_mask = load256((const void*)(p_mask_arr256[32] + 32));
- return loadu256(ptr);
- }
- *p_mask = load256((const void*)(p_mask_arr256[avail] + 32));
- for (u32 i = 0; i < avail; i++) {
- u.val8[i] = ptr[i];
- }
- } else {
- // need contains "how many chars to pull from history"
- // calculate based on what we need, what we have in the buffer
- // and only what we need to make primary confirm work
- u32 start = (u32)(lo - ptr);
- u32 i;
- for (i = start; ptr + i < lo; i++) {
- u.val8[i] = buf_history[len_history - (lo - (ptr + i))];
- }
- u32 end = MIN(32, (u32)(hi - ptr));
- *p_mask = loadu256((const void*)(p_mask_arr256[end - start] + 32 - start));
- for (; i < end; i++) {
- u.val8[i] = ptr[i];
- }
- }
-
- return u.val256;
+ const u8 *mainStart = ROUNDUP_PTR(ptr, 16); // NOTE(review): presumably ptr rounded up to a 16-byte boundary - confirm
+ DEBUG_PRINTF("derive: ptr: %p mainstart %p\n", ptr, mainStart);
+ if (ptr < mainStart) { // head: the start pointer lies before mainStart
+ ptr = mainStart - 16; // step back one full block; vectoredLoad128 supplies the data and p_mask
+ m128 p_mask;
+ m128 val_0 = vectoredLoad128(&p_mask, ptr, a->buf, buf_end,
+ a->buf_history, a->len_history, 1);
+ m128 r_0 = prep_conf_teddy_m1(maskBase, p_mask, val_0);
+ CONFIRM_TEDDY(r_0, 8, 0, VECTORING, do_confWithBit1_teddy);
+ ptr += 16;
+ }
+
+ if (ptr + 16 < buf_end) { // one single 16-byte block before the 32-byte main loop
+ m128 r_0 = prep_conf_teddy_m1(maskBase, ones128(), load128(ptr));
+ CONFIRM_TEDDY(r_0, 8, 0, VECTORING, do_confWithBit1_teddy);
+ ptr += 16;
+ }
+
+ for (; ptr + iterBytes <= buf_end; ptr += iterBytes) { // main loop: whole 32-byte chunks, direct load128
+ __builtin_prefetch(ptr + (iterBytes*4));
+ CHECK_FLOOD;
+ m128 r_0 = prep_conf_teddy_m1(maskBase, ones128(), load128(ptr));
+ CONFIRM_TEDDY(r_0, 8, 0, NOT_CAUTIOUS, do_confWithBit1_teddy);
+ m128 r_1 = prep_conf_teddy_m1(maskBase, ones128(), load128(ptr + 16));
+ CONFIRM_TEDDY(r_1, 8, 16, NOT_CAUTIOUS, do_confWithBit1_teddy);
+ }
+
+ for (; ptr < buf_end; ptr += 16) { // tail: whatever remains, loaded through vectoredLoad128
+ m128 p_mask;
+ m128 val_0 = vectoredLoad128(&p_mask, ptr, a->buf, buf_end,
+ a->buf_history, a->len_history, 1);
+ m128 r_0 = prep_conf_teddy_m1(maskBase, p_mask, val_0);
+ CONFIRM_TEDDY(r_0, 8, 0, VECTORING, do_confWithBit1_teddy);
+ }
+ *a->groups = controlVal; // publish the final control value to the caller
+ return HWLM_SUCCESS;
}
+hwlm_error_t fdr_exec_teddy_msks1_pck(const struct FDR *fdr, // Teddy scan over a->buf, one-mask variant (prep_conf_teddy_m1)
+ const struct FDR_Runtime_Args *a) { // same flow as fdr_exec_teddy_msks1 but confirms with do_confWithBit_teddy
+ const u8 *buf_end = a->buf + a->len;
+ const u8 *ptr = a->buf + a->start_offset; // first byte to scan
+ hwlmcb_rv_t controlVal = *a->groups; // local copy; written back to *a->groups before the final return
+ hwlmcb_rv_t *control = &controlVal; // NOTE(review): not named below; presumably read by CONFIRM_TEDDY - confirm
+ u32 floodBackoff = FLOOD_BACKOFF_START; // NOTE(review): presumably used by CHECK_FLOOD - confirm
+ const u8 *tryFloodDetect = a->firstFloodDetect; // NOTE(review): presumably used by CHECK_FLOOD - confirm
+ u32 last_match = (u32)-1; // NOTE(review): presumably used by CONFIRM_TEDDY - confirm
+ const struct Teddy *teddy = (const struct Teddy *)fdr; // the FDR pointer is reinterpreted as a Teddy structure
+ const size_t iterBytes = 32; // the main loop consumes two 16-byte blocks per pass
+ DEBUG_PRINTF("params: buf %p len %zu start_offset %zu\n",
+ a->buf, a->len, a->start_offset);
-#endif // __AVX2__
+ const m128 *maskBase = getMaskBase(teddy);
+ const u32 *confBase = getConfBase(teddy, 1); // NOTE(review): presumably used by CONFIRM_TEDDY - confirm
-#define P0(cnd) unlikely(cnd)
+ const u8 *mainStart = ROUNDUP_PTR(ptr, 16); // NOTE(review): presumably ptr rounded up to a 16-byte boundary - confirm
+ DEBUG_PRINTF("derive: ptr: %p mainstart %p\n", ptr, mainStart);
+ if (ptr < mainStart) { // head: the start pointer lies before mainStart
+ ptr = mainStart - 16; // step back one full block; vectoredLoad128 supplies the data and p_mask
+ m128 p_mask;
+ m128 val_0 = vectoredLoad128(&p_mask, ptr, a->buf, buf_end,
+ a->buf_history, a->len_history, 1);
+ m128 r_0 = prep_conf_teddy_m1(maskBase, p_mask, val_0);
+ CONFIRM_TEDDY(r_0, 8, 0, VECTORING, do_confWithBit_teddy);
+ ptr += 16;
+ }
-#include "fdr.h"
-#include "fdr_internal.h"
-#include "flood_runtime.h"
+ if (ptr + 16 < buf_end) { // one single 16-byte block before the 32-byte main loop
+ m128 r_0 = prep_conf_teddy_m1(maskBase, ones128(), load128(ptr));
+ CONFIRM_TEDDY(r_0, 8, 0, VECTORING, do_confWithBit_teddy);
+ ptr += 16;
+ }
-#include "fdr_confirm.h"
-#include "fdr_confirm_runtime.h"
+ for (; ptr + iterBytes <= buf_end; ptr += iterBytes) { // main loop: whole 32-byte chunks, direct load128
+ __builtin_prefetch(ptr + (iterBytes*4));
+ CHECK_FLOOD;
+ m128 r_0 = prep_conf_teddy_m1(maskBase, ones128(), load128(ptr));
+ CONFIRM_TEDDY(r_0, 8, 0, NOT_CAUTIOUS, do_confWithBit_teddy);
+ m128 r_1 = prep_conf_teddy_m1(maskBase, ones128(), load128(ptr + 16));
+ CONFIRM_TEDDY(r_1, 8, 16, NOT_CAUTIOUS, do_confWithBit_teddy);
+ }
-#include "fdr_loadval.h"
-#include "util/bitutils.h"
-#include "teddy_internal.h"
+ for (; ptr < buf_end; ptr += 16) { // tail: whatever remains, loaded through vectoredLoad128
+ m128 p_mask;
+ m128 val_0 = vectoredLoad128(&p_mask, ptr, a->buf, buf_end,
+ a->buf_history, a->len_history, 1);
+ m128 r_0 = prep_conf_teddy_m1(maskBase, p_mask, val_0);
+ CONFIRM_TEDDY(r_0, 8, 0, VECTORING, do_confWithBit_teddy);
+ }
+ *a->groups = controlVal; // publish the final control value to the caller
+ return HWLM_SUCCESS;
+}
+
+hwlm_error_t fdr_exec_teddy_msks2(const struct FDR *fdr, // Teddy scan over a->buf, two-mask variant (prep_conf_teddy_m2)
+ const struct FDR_Runtime_Args *a) { // confirms candidates with do_confWithBitMany_teddy
+ const u8 *buf_end = a->buf + a->len;
+ const u8 *ptr = a->buf + a->start_offset; // first byte to scan
+ hwlmcb_rv_t controlVal = *a->groups; // local copy; written back to *a->groups before the final return
+ hwlmcb_rv_t *control = &controlVal; // NOTE(review): not named below; presumably read by CONFIRM_TEDDY - confirm
+ u32 floodBackoff = FLOOD_BACKOFF_START; // NOTE(review): presumably used by CHECK_FLOOD - confirm
+ const u8 *tryFloodDetect = a->firstFloodDetect; // NOTE(review): presumably used by CHECK_FLOOD - confirm
+ u32 last_match = (u32)-1; // NOTE(review): presumably used by CONFIRM_TEDDY - confirm
+ const struct Teddy *teddy = (const struct Teddy *)fdr; // the FDR pointer is reinterpreted as a Teddy structure
+ const size_t iterBytes = 32; // the main loop consumes two 16-byte blocks per pass
+ DEBUG_PRINTF("params: buf %p len %zu start_offset %zu\n",
+ a->buf, a->len, a->start_offset);
+
+ const m128 *maskBase = getMaskBase(teddy);
+ const u32 *confBase = getConfBase(teddy, 2); // NOTE(review): presumably used by CONFIRM_TEDDY - confirm
+
+ m128 res_old_1 = ones128(); // state handed by pointer to every prep_conf_teddy_m2 call, starts as all ones
+ const u8 *mainStart = ROUNDUP_PTR(ptr, 16); // NOTE(review): presumably ptr rounded up to a 16-byte boundary - confirm
+ DEBUG_PRINTF("derive: ptr: %p mainstart %p\n", ptr, mainStart);
+ if (ptr < mainStart) { // head: the start pointer lies before mainStart
+ ptr = mainStart - 16; // step back one full block; vectoredLoad128 supplies the data and p_mask
+ m128 p_mask;
+ m128 val_0 = vectoredLoad128(&p_mask, ptr, a->buf, buf_end,
+ a->buf_history, a->len_history, 2);
+ m128 r_0 = prep_conf_teddy_m2(maskBase, &res_old_1, p_mask, val_0);
+ CONFIRM_TEDDY(r_0, 8, 0, VECTORING, do_confWithBitMany_teddy);
+ ptr += 16;
+ }
+
+ if (ptr + 16 < buf_end) { // one single 16-byte block before the 32-byte main loop
+ m128 r_0 = prep_conf_teddy_m2(maskBase, &res_old_1, ones128(),
+ load128(ptr));
+ CONFIRM_TEDDY(r_0, 8, 0, VECTORING, do_confWithBitMany_teddy);
+ ptr += 16;
+ }
+
+ for (; ptr + iterBytes <= buf_end; ptr += iterBytes) { // main loop: whole 32-byte chunks, direct load128
+ __builtin_prefetch(ptr + (iterBytes*4));
+ CHECK_FLOOD;
+ m128 r_0 = prep_conf_teddy_m2(maskBase, &res_old_1, ones128(),
+ load128(ptr));
+ CONFIRM_TEDDY(r_0, 8, 0, NOT_CAUTIOUS, do_confWithBitMany_teddy);
+ m128 r_1 = prep_conf_teddy_m2(maskBase, &res_old_1, ones128(),
+ load128(ptr + 16));
+ CONFIRM_TEDDY(r_1, 8, 16, NOT_CAUTIOUS, do_confWithBitMany_teddy);
+ }
+
+ for (; ptr < buf_end; ptr += 16) { // tail: whatever remains, loaded through vectoredLoad128
+ m128 p_mask;
+ m128 val_0 = vectoredLoad128(&p_mask, ptr, a->buf, buf_end,
+ a->buf_history, a->len_history, 2);
+ m128 r_0 = prep_conf_teddy_m2(maskBase, &res_old_1, p_mask, val_0);
+ CONFIRM_TEDDY(r_0, 8, 0, VECTORING, do_confWithBitMany_teddy);
+ }
+ *a->groups = controlVal; // publish the final control value to the caller
+ return HWLM_SUCCESS;
+}
+
+hwlm_error_t fdr_exec_teddy_msks2_pck(const struct FDR *fdr, // Teddy scan over a->buf, two-mask variant (prep_conf_teddy_m2)
+ const struct FDR_Runtime_Args *a) { // same flow as fdr_exec_teddy_msks2 but confirms with do_confWithBit_teddy
+ const u8 *buf_end = a->buf + a->len;
+ const u8 *ptr = a->buf + a->start_offset; // first byte to scan
+ hwlmcb_rv_t controlVal = *a->groups; // local copy; written back to *a->groups before the final return
+ hwlmcb_rv_t *control = &controlVal; // NOTE(review): not named below; presumably read by CONFIRM_TEDDY - confirm
+ u32 floodBackoff = FLOOD_BACKOFF_START; // NOTE(review): presumably used by CHECK_FLOOD - confirm
+ const u8 *tryFloodDetect = a->firstFloodDetect; // NOTE(review): presumably used by CHECK_FLOOD - confirm
+ u32 last_match = (u32)-1; // NOTE(review): presumably used by CONFIRM_TEDDY - confirm
+ const struct Teddy *teddy = (const struct Teddy *)fdr; // the FDR pointer is reinterpreted as a Teddy structure
+ const size_t iterBytes = 32; // the main loop consumes two 16-byte blocks per pass
+ DEBUG_PRINTF("params: buf %p len %zu start_offset %zu\n",
+ a->buf, a->len, a->start_offset);
+
+ const m128 *maskBase = getMaskBase(teddy);
+ const u32 *confBase = getConfBase(teddy, 2); // NOTE(review): presumably used by CONFIRM_TEDDY - confirm
+
+ m128 res_old_1 = ones128(); // state handed by pointer to every prep_conf_teddy_m2 call, starts as all ones
+ const u8 *mainStart = ROUNDUP_PTR(ptr, 16); // NOTE(review): presumably ptr rounded up to a 16-byte boundary - confirm
+ DEBUG_PRINTF("derive: ptr: %p mainstart %p\n", ptr, mainStart);
+ if (ptr < mainStart) { // head: the start pointer lies before mainStart
+ ptr = mainStart - 16; // step back one full block; vectoredLoad128 supplies the data and p_mask
+ m128 p_mask;
+ m128 val_0 = vectoredLoad128(&p_mask, ptr, a->buf, buf_end,
+ a->buf_history, a->len_history, 2);
+ m128 r_0 = prep_conf_teddy_m2(maskBase, &res_old_1, p_mask, val_0);
+ CONFIRM_TEDDY(r_0, 8, 0, VECTORING, do_confWithBit_teddy);
+ ptr += 16;
+ }
+
+ if (ptr + 16 < buf_end) { // one single 16-byte block before the 32-byte main loop
+ m128 r_0 = prep_conf_teddy_m2(maskBase, &res_old_1, ones128(),
+ load128(ptr));
+ CONFIRM_TEDDY(r_0, 8, 0, VECTORING, do_confWithBit_teddy);
+ ptr += 16;
+ }
+
+ for (; ptr + iterBytes <= buf_end; ptr += iterBytes) { // main loop: whole 32-byte chunks, direct load128
+ __builtin_prefetch(ptr + (iterBytes*4));
+ CHECK_FLOOD;
+ m128 r_0 = prep_conf_teddy_m2(maskBase, &res_old_1, ones128(),
+ load128(ptr));
+ CONFIRM_TEDDY(r_0, 8, 0, NOT_CAUTIOUS, do_confWithBit_teddy);
+ m128 r_1 = prep_conf_teddy_m2(maskBase, &res_old_1, ones128(),
+ load128(ptr + 16));
+ CONFIRM_TEDDY(r_1, 8, 16, NOT_CAUTIOUS, do_confWithBit_teddy);
+ }
-#include "teddy_autogen.c"
+ for (; ptr < buf_end; ptr += 16) { // tail: whatever remains, loaded through vectoredLoad128
+ m128 p_mask;
+ m128 val_0 = vectoredLoad128(&p_mask, ptr, a->buf, buf_end,
+ a->buf_history, a->len_history, 2);
+ m128 r_0 = prep_conf_teddy_m2(maskBase, &res_old_1, p_mask, val_0);
+ CONFIRM_TEDDY(r_0, 8, 0, VECTORING, do_confWithBit_teddy);
+ }
+ *a->groups = controlVal; // publish the final control value to the caller
+ return HWLM_SUCCESS;
+}
+
+hwlm_error_t fdr_exec_teddy_msks3(const struct FDR *fdr, // Teddy scan over a->buf, three-mask variant (prep_conf_teddy_m3)
+ const struct FDR_Runtime_Args *a) { // confirms candidates with do_confWithBitMany_teddy
+ const u8 *buf_end = a->buf + a->len;
+ const u8 *ptr = a->buf + a->start_offset; // first byte to scan
+ hwlmcb_rv_t controlVal = *a->groups; // local copy; written back to *a->groups before the final return
+ hwlmcb_rv_t *control = &controlVal; // NOTE(review): not named below; presumably read by CONFIRM_TEDDY - confirm
+ u32 floodBackoff = FLOOD_BACKOFF_START; // NOTE(review): presumably used by CHECK_FLOOD - confirm
+ const u8 *tryFloodDetect = a->firstFloodDetect; // NOTE(review): presumably used by CHECK_FLOOD - confirm
+ u32 last_match = (u32)-1; // NOTE(review): presumably used by CONFIRM_TEDDY - confirm
+ const struct Teddy *teddy = (const struct Teddy *)fdr; // the FDR pointer is reinterpreted as a Teddy structure
+ const size_t iterBytes = 32; // the main loop consumes two 16-byte blocks per pass
+ DEBUG_PRINTF("params: buf %p len %zu start_offset %zu\n",
+ a->buf, a->len, a->start_offset);
+
+ const m128 *maskBase = getMaskBase(teddy);
+ const u32 *confBase = getConfBase(teddy, 3); // NOTE(review): presumably used by CONFIRM_TEDDY - confirm
+
+ m128 res_old_1 = ones128(); // state handed by pointer to every prep_conf_teddy_m3 call, starts as all ones
+ m128 res_old_2 = ones128(); // second piece of that carried state
+ const u8 *mainStart = ROUNDUP_PTR(ptr, 16); // NOTE(review): presumably ptr rounded up to a 16-byte boundary - confirm
+ DEBUG_PRINTF("derive: ptr: %p mainstart %p\n", ptr, mainStart);
+ if (ptr < mainStart) { // head: the start pointer lies before mainStart
+ ptr = mainStart - 16; // step back one full block; vectoredLoad128 supplies the data and p_mask
+ m128 p_mask;
+ m128 val_0 = vectoredLoad128(&p_mask, ptr, a->buf, buf_end,
+ a->buf_history, a->len_history, 3);
+ m128 r_0 = prep_conf_teddy_m3(maskBase, &res_old_1, &res_old_2,
+ p_mask, val_0);
+ CONFIRM_TEDDY(r_0, 8, 0, VECTORING, do_confWithBitMany_teddy);
+ ptr += 16;
+ }
+
+ if (ptr + 16 < buf_end) { // one single 16-byte block before the 32-byte main loop
+ m128 r_0 = prep_conf_teddy_m3(maskBase, &res_old_1, &res_old_2,
+ ones128(), load128(ptr));
+ CONFIRM_TEDDY(r_0, 8, 0, VECTORING, do_confWithBitMany_teddy);
+ ptr += 16;
+ }
+
+ for (; ptr + iterBytes <= buf_end; ptr += iterBytes) { // main loop: whole 32-byte chunks, direct load128
+ __builtin_prefetch(ptr + (iterBytes*4));
+ CHECK_FLOOD;
+ m128 r_0 = prep_conf_teddy_m3(maskBase, &res_old_1, &res_old_2,
+ ones128(), load128(ptr));
+ CONFIRM_TEDDY(r_0, 8, 0, NOT_CAUTIOUS, do_confWithBitMany_teddy);
+ m128 r_1 = prep_conf_teddy_m3(maskBase, &res_old_1, &res_old_2,
+ ones128(), load128(ptr + 16));
+ CONFIRM_TEDDY(r_1, 8, 16, NOT_CAUTIOUS, do_confWithBitMany_teddy);
+ }
+
+ for (; ptr < buf_end; ptr += 16) { // tail: whatever remains, loaded through vectoredLoad128
+ m128 p_mask;
+ m128 val_0 = vectoredLoad128(&p_mask, ptr, a->buf, buf_end,
+ a->buf_history, a->len_history, 3);
+ m128 r_0 = prep_conf_teddy_m3(maskBase, &res_old_1, &res_old_2,
+ p_mask, val_0);
+ CONFIRM_TEDDY(r_0, 8, 0, VECTORING, do_confWithBitMany_teddy);
+ }
+ *a->groups = controlVal; // publish the final control value to the caller
+ return HWLM_SUCCESS;
+}
+
+hwlm_error_t fdr_exec_teddy_msks3_pck(const struct FDR *fdr, // Teddy scan over a->buf, three-mask variant (prep_conf_teddy_m3)
+ const struct FDR_Runtime_Args *a) { // same flow as fdr_exec_teddy_msks3 but confirms with do_confWithBit_teddy
+ const u8 *buf_end = a->buf + a->len;
+ const u8 *ptr = a->buf + a->start_offset; // first byte to scan
+ hwlmcb_rv_t controlVal = *a->groups; // local copy; written back to *a->groups before the final return
+ hwlmcb_rv_t *control = &controlVal; // NOTE(review): not named below; presumably read by CONFIRM_TEDDY - confirm
+ u32 floodBackoff = FLOOD_BACKOFF_START; // NOTE(review): presumably used by CHECK_FLOOD - confirm
+ const u8 *tryFloodDetect = a->firstFloodDetect; // NOTE(review): presumably used by CHECK_FLOOD - confirm
+ u32 last_match = (u32)-1; // NOTE(review): presumably used by CONFIRM_TEDDY - confirm
+ const struct Teddy *teddy = (const struct Teddy *)fdr; // the FDR pointer is reinterpreted as a Teddy structure
+ const size_t iterBytes = 32; // the main loop consumes two 16-byte blocks per pass
+ DEBUG_PRINTF("params: buf %p len %zu start_offset %zu\n",
+ a->buf, a->len, a->start_offset);
+
+ const m128 *maskBase = getMaskBase(teddy);
+ const u32 *confBase = getConfBase(teddy, 3); // NOTE(review): presumably used by CONFIRM_TEDDY - confirm
+
+ m128 res_old_1 = ones128(); // state handed by pointer to every prep_conf_teddy_m3 call, starts as all ones
+ m128 res_old_2 = ones128(); // second piece of that carried state
+ const u8 *mainStart = ROUNDUP_PTR(ptr, 16); // NOTE(review): presumably ptr rounded up to a 16-byte boundary - confirm
+ DEBUG_PRINTF("derive: ptr: %p mainstart %p\n", ptr, mainStart);
+ if (ptr < mainStart) { // head: the start pointer lies before mainStart
+ ptr = mainStart - 16; // step back one full block; vectoredLoad128 supplies the data and p_mask
+ m128 p_mask;
+ m128 val_0 = vectoredLoad128(&p_mask, ptr, a->buf, buf_end,
+ a->buf_history, a->len_history, 3);
+ m128 r_0 = prep_conf_teddy_m3(maskBase, &res_old_1, &res_old_2,
+ p_mask, val_0);
+ CONFIRM_TEDDY(r_0, 8, 0, VECTORING, do_confWithBit_teddy);
+ ptr += 16;
+ }
+
+ if (ptr + 16 < buf_end) { // one single 16-byte block before the 32-byte main loop
+ m128 r_0 = prep_conf_teddy_m3(maskBase, &res_old_1, &res_old_2,
+ ones128(), load128(ptr));
+ CONFIRM_TEDDY(r_0, 8, 0, VECTORING, do_confWithBit_teddy);
+ ptr += 16;
+ }
+
+ for (; ptr + iterBytes <= buf_end; ptr += iterBytes) { // main loop: whole 32-byte chunks, direct load128
+ __builtin_prefetch(ptr + (iterBytes*4));
+ CHECK_FLOOD;
+ m128 r_0 = prep_conf_teddy_m3(maskBase, &res_old_1, &res_old_2,
+ ones128(), load128(ptr));
+ CONFIRM_TEDDY(r_0, 8, 0, NOT_CAUTIOUS, do_confWithBit_teddy);
+ m128 r_1 = prep_conf_teddy_m3(maskBase, &res_old_1, &res_old_2,
+ ones128(), load128(ptr + 16));
+ CONFIRM_TEDDY(r_1, 8, 16, NOT_CAUTIOUS, do_confWithBit_teddy);
+ }
+
+ for (; ptr < buf_end; ptr += 16) { // tail: whatever remains, loaded through vectoredLoad128
+ m128 p_mask;
+ m128 val_0 = vectoredLoad128(&p_mask, ptr, a->buf, buf_end,
+ a->buf_history, a->len_history, 3);
+ m128 r_0 = prep_conf_teddy_m3(maskBase, &res_old_1, &res_old_2,
+ p_mask, val_0);
+ CONFIRM_TEDDY(r_0, 8, 0, VECTORING, do_confWithBit_teddy);
+ }
+ *a->groups = controlVal; // publish the final control value to the caller
+ return HWLM_SUCCESS;
+}
+
+hwlm_error_t fdr_exec_teddy_msks4(const struct FDR *fdr, // Teddy scan over a->buf, four-mask variant (prep_conf_teddy_m4)
+ const struct FDR_Runtime_Args *a) { // confirms candidates with do_confWithBitMany_teddy
+ const u8 *buf_end = a->buf + a->len;
+ const u8 *ptr = a->buf + a->start_offset; // first byte to scan
+ hwlmcb_rv_t controlVal = *a->groups; // local copy; written back to *a->groups before the final return
+ hwlmcb_rv_t *control = &controlVal; // NOTE(review): not named below; presumably read by CONFIRM_TEDDY - confirm
+ u32 floodBackoff = FLOOD_BACKOFF_START; // NOTE(review): presumably used by CHECK_FLOOD - confirm
+ const u8 *tryFloodDetect = a->firstFloodDetect; // NOTE(review): presumably used by CHECK_FLOOD - confirm
+ u32 last_match = (u32)-1; // NOTE(review): presumably used by CONFIRM_TEDDY - confirm
+ const struct Teddy *teddy = (const struct Teddy *)fdr; // the FDR pointer is reinterpreted as a Teddy structure
+ const size_t iterBytes = 32; // the main loop consumes two 16-byte blocks per pass
+ DEBUG_PRINTF("params: buf %p len %zu start_offset %zu\n",
+ a->buf, a->len, a->start_offset);
+
+ const m128 *maskBase = getMaskBase(teddy);
+ const u32 *confBase = getConfBase(teddy, 4); // NOTE(review): presumably used by CONFIRM_TEDDY - confirm
+
+ m128 res_old_1 = ones128(); // state handed by pointer to every prep_conf_teddy_m4 call, starts as all ones
+ m128 res_old_2 = ones128(); // second piece of that carried state
+ m128 res_old_3 = ones128(); // third piece; prep_conf_teddy_m4 overwrites it with its current res_3
+ const u8 *mainStart = ROUNDUP_PTR(ptr, 16); // NOTE(review): presumably ptr rounded up to a 16-byte boundary - confirm
+ DEBUG_PRINTF("derive: ptr: %p mainstart %p\n", ptr, mainStart);
+ if (ptr < mainStart) { // head: the start pointer lies before mainStart
+ ptr = mainStart - 16; // step back one full block; vectoredLoad128 supplies the data and p_mask
+ m128 p_mask;
+ m128 val_0 = vectoredLoad128(&p_mask, ptr, a->buf, buf_end,
+ a->buf_history, a->len_history, 4);
+ m128 r_0 = prep_conf_teddy_m4(maskBase, &res_old_1, &res_old_2,
+ &res_old_3, p_mask, val_0);
+ CONFIRM_TEDDY(r_0, 8, 0, VECTORING, do_confWithBitMany_teddy);
+ ptr += 16;
+ }
+
+ if (ptr + 16 < buf_end) { // one single 16-byte block before the 32-byte main loop
+ m128 r_0 = prep_conf_teddy_m4(maskBase, &res_old_1, &res_old_2,
+ &res_old_3, ones128(), load128(ptr));
+ CONFIRM_TEDDY(r_0, 8, 0, VECTORING, do_confWithBitMany_teddy);
+ ptr += 16;
+ }
+
+ for (; ptr + iterBytes <= buf_end; ptr += iterBytes) { // main loop: whole 32-byte chunks, direct load128
+ __builtin_prefetch(ptr + (iterBytes*4));
+ CHECK_FLOOD;
+ m128 r_0 = prep_conf_teddy_m4(maskBase, &res_old_1, &res_old_2,
+ &res_old_3, ones128(), load128(ptr));
+ CONFIRM_TEDDY(r_0, 8, 0, NOT_CAUTIOUS, do_confWithBitMany_teddy);
+ m128 r_1 = prep_conf_teddy_m4(maskBase, &res_old_1, &res_old_2,
+ &res_old_3, ones128(), load128(ptr + 16));
+ CONFIRM_TEDDY(r_1, 8, 16, NOT_CAUTIOUS, do_confWithBitMany_teddy);
+ }
+
+ for (; ptr < buf_end; ptr += 16) { // tail: whatever remains, loaded through vectoredLoad128
+ m128 p_mask;
+ m128 val_0 = vectoredLoad128(&p_mask, ptr, a->buf, buf_end,
+ a->buf_history, a->len_history, 4);
+ m128 r_0 = prep_conf_teddy_m4(maskBase, &res_old_1, &res_old_2,
+ &res_old_3, p_mask, val_0);
+ CONFIRM_TEDDY(r_0, 8, 0, VECTORING, do_confWithBitMany_teddy);
+ }
+ *a->groups = controlVal; // publish the final control value to the caller
+ return HWLM_SUCCESS;
+}
+
+hwlm_error_t fdr_exec_teddy_msks4_pck(const struct FDR *fdr, // Teddy scan over a->buf, four-mask variant (prep_conf_teddy_m4)
+ const struct FDR_Runtime_Args *a) { // same flow as fdr_exec_teddy_msks4 but confirms with do_confWithBit_teddy
+ const u8 *buf_end = a->buf + a->len;
+ const u8 *ptr = a->buf + a->start_offset; // first byte to scan
+ hwlmcb_rv_t controlVal = *a->groups; // local copy; written back to *a->groups before the final return
+ hwlmcb_rv_t *control = &controlVal; // NOTE(review): not named below; presumably read by CONFIRM_TEDDY - confirm
+ u32 floodBackoff = FLOOD_BACKOFF_START; // NOTE(review): presumably used by CHECK_FLOOD - confirm
+ const u8 *tryFloodDetect = a->firstFloodDetect; // NOTE(review): presumably used by CHECK_FLOOD - confirm
+ u32 last_match = (u32)-1; // NOTE(review): presumably used by CONFIRM_TEDDY - confirm
+ const struct Teddy *teddy = (const struct Teddy *)fdr; // the FDR pointer is reinterpreted as a Teddy structure
+ const size_t iterBytes = 32; // the main loop consumes two 16-byte blocks per pass
+ DEBUG_PRINTF("params: buf %p len %zu start_offset %zu\n",
+ a->buf, a->len, a->start_offset);
+
+ const m128 *maskBase = getMaskBase(teddy);
+ const u32 *confBase = getConfBase(teddy, 4); // NOTE(review): presumably used by CONFIRM_TEDDY - confirm
+
+ m128 res_old_1 = ones128(); // state handed by pointer to every prep_conf_teddy_m4 call, starts as all ones
+ m128 res_old_2 = ones128(); // second piece of that carried state
+ m128 res_old_3 = ones128(); // third piece; prep_conf_teddy_m4 overwrites it with its current res_3
+ const u8 *mainStart = ROUNDUP_PTR(ptr, 16); // NOTE(review): presumably ptr rounded up to a 16-byte boundary - confirm
+ DEBUG_PRINTF("derive: ptr: %p mainstart %p\n", ptr, mainStart);
+ if (ptr < mainStart) { // head: the start pointer lies before mainStart
+ ptr = mainStart - 16; // step back one full block; vectoredLoad128 supplies the data and p_mask
+ m128 p_mask;
+ m128 val_0 = vectoredLoad128(&p_mask, ptr, a->buf, buf_end,
+ a->buf_history, a->len_history, 4);
+ m128 r_0 = prep_conf_teddy_m4(maskBase, &res_old_1, &res_old_2,
+ &res_old_3, p_mask, val_0);
+ CONFIRM_TEDDY(r_0, 8, 0, VECTORING, do_confWithBit_teddy);
+ ptr += 16;
+ }
+
+ if (ptr + 16 < buf_end) { // one single 16-byte block before the 32-byte main loop
+ m128 r_0 = prep_conf_teddy_m4(maskBase, &res_old_1, &res_old_2,
+ &res_old_3, ones128(), load128(ptr));
+ CONFIRM_TEDDY(r_0, 8, 0, VECTORING, do_confWithBit_teddy);
+ ptr += 16;
+ }
+
+ for (; ptr + iterBytes <= buf_end; ptr += iterBytes) { // main loop: whole 32-byte chunks, direct load128
+ __builtin_prefetch(ptr + (iterBytes*4));
+ CHECK_FLOOD;
+ m128 r_0 = prep_conf_teddy_m4(maskBase, &res_old_1, &res_old_2,
+ &res_old_3, ones128(), load128(ptr));
+ CONFIRM_TEDDY(r_0, 8, 0, NOT_CAUTIOUS, do_confWithBit_teddy);
+ m128 r_1 = prep_conf_teddy_m4(maskBase, &res_old_1, &res_old_2,
+ &res_old_3, ones128(), load128(ptr + 16));
+ CONFIRM_TEDDY(r_1, 8, 16, NOT_CAUTIOUS, do_confWithBit_teddy);
+ }
+
+ for (; ptr < buf_end; ptr += 16) { // tail: whatever remains, loaded through vectoredLoad128
+ m128 p_mask;
+ m128 val_0 = vectoredLoad128(&p_mask, ptr, a->buf, buf_end,
+ a->buf_history, a->len_history, 4);
+ m128 r_0 = prep_conf_teddy_m4(maskBase, &res_old_1, &res_old_2,
+ &res_old_3, p_mask, val_0);
+ CONFIRM_TEDDY(r_0, 8, 0, VECTORING, do_confWithBit_teddy);
+ }
+ *a->groups = controlVal; // publish the final control value to the caller
+ return HWLM_SUCCESS;
+}
--- /dev/null
+/*
+ * Copyright (c) 2016, Intel Corporation
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *
+ * * Redistributions of source code must retain the above copyright notice,
+ * this list of conditions and the following disclaimer.
+ * * Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ * * Neither the name of Intel Corporation nor the names of its contributors
+ * may be used to endorse or promote products derived from this software
+ * without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
+ * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+ * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+ * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
+ * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
+ * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+ * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
+ * POSSIBILITY OF SUCH DAMAGE.
+ */
+
+/** \file
+ * \brief Teddy literal matcher: function declarations.
+ */
+
+#ifndef TEDDY_H_
+#define TEDDY_H_
+
+#include "hwlm/hwlm.h" // NOTE(review): presumably where hwlm_error_t is declared - confirm
+
+struct FDR; // forward declaration from fdr_internal.h
+struct FDR_Runtime_Args; // forward declaration; only used through pointers in this header
+
+hwlm_error_t fdr_exec_s1_w128(const struct FDR *fdr, // every entry point here shares this (fdr, runtime args) signature
+ const struct FDR_Runtime_Args *a);
+
+hwlm_error_t fdr_exec_s2_w128(const struct FDR *fdr,
+ const struct FDR_Runtime_Args *a);
+
+hwlm_error_t fdr_exec_s4_w128(const struct FDR *fdr,
+ const struct FDR_Runtime_Args *a);
+
+hwlm_error_t fdr_exec_teddy_msks1(const struct FDR *fdr, // 128-bit Teddy, one mask
+ const struct FDR_Runtime_Args *a);
+
+hwlm_error_t fdr_exec_teddy_msks1_pck(const struct FDR *fdr, // one mask, "_pck" confirm variant
+ const struct FDR_Runtime_Args *a);
+
+hwlm_error_t fdr_exec_teddy_msks2(const struct FDR *fdr, // two masks
+ const struct FDR_Runtime_Args *a);
+
+hwlm_error_t fdr_exec_teddy_msks2_pck(const struct FDR *fdr, // two masks, "_pck" confirm variant
+ const struct FDR_Runtime_Args *a);
+
+hwlm_error_t fdr_exec_teddy_msks3(const struct FDR *fdr, // three masks
+ const struct FDR_Runtime_Args *a);
+
+hwlm_error_t fdr_exec_teddy_msks3_pck(const struct FDR *fdr, // three masks, "_pck" confirm variant
+ const struct FDR_Runtime_Args *a);
+
+hwlm_error_t fdr_exec_teddy_msks4(const struct FDR *fdr, // four masks
+ const struct FDR_Runtime_Args *a);
+
+hwlm_error_t fdr_exec_teddy_msks4_pck(const struct FDR *fdr, // four masks, "_pck" confirm variant
+ const struct FDR_Runtime_Args *a);
+
+#if defined(__AVX2__) // the declarations below exist only when the compiler targets AVX2
+
+hwlm_error_t fdr_exec_teddy_avx2_msks1_fat(const struct FDR *fdr,
+ const struct FDR_Runtime_Args *a);
+
+hwlm_error_t fdr_exec_teddy_avx2_msks1_pck_fat(const struct FDR *fdr,
+ const struct FDR_Runtime_Args *a);
+
+hwlm_error_t fdr_exec_teddy_avx2_msks2_fat(const struct FDR *fdr,
+ const struct FDR_Runtime_Args *a);
+
+hwlm_error_t fdr_exec_teddy_avx2_msks2_pck_fat(const struct FDR *fdr,
+ const struct FDR_Runtime_Args *a);
+
+hwlm_error_t fdr_exec_teddy_avx2_msks3_fat(const struct FDR *fdr,
+ const struct FDR_Runtime_Args *a);
+
+hwlm_error_t fdr_exec_teddy_avx2_msks3_pck_fat(const struct FDR *fdr,
+ const struct FDR_Runtime_Args *a);
+
+hwlm_error_t fdr_exec_teddy_avx2_msks4_fat(const struct FDR *fdr,
+ const struct FDR_Runtime_Args *a);
+
+hwlm_error_t fdr_exec_teddy_avx2_msks4_pck_fat(const struct FDR *fdr,
+ const struct FDR_Runtime_Args *a);
+
+hwlm_error_t fdr_exec_teddy_avx2_msks1_fast(const struct FDR *fdr,
+ const struct FDR_Runtime_Args *a);
+
+hwlm_error_t fdr_exec_teddy_avx2_msks1_pck_fast(const struct FDR *fdr,
+ const struct FDR_Runtime_Args *a);
+
+#endif /* __AVX2__ */
+
+#endif /* TEDDY_H_ */
+++ /dev/null
-#!/usr/bin/python
-
-# Copyright (c) 2015-2016, Intel Corporation
-#
-# Redistribution and use in source and binary forms, with or without
-# modification, are permitted provided that the following conditions are met:
-#
-# * Redistributions of source code must retain the above copyright notice,
-# this list of conditions and the following disclaimer.
-# * Redistributions in binary form must reproduce the above copyright
-# notice, this list of conditions and the following disclaimer in the
-# documentation and/or other materials provided with the distribution.
-# * Neither the name of Intel Corporation nor the names of its contributors
-# may be used to endorse or promote products derived from this software
-# without specific prior written permission.
-#
-# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
-# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
-# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
-# DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE
-# FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
-# DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
-# SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
-# CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
-# OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
-# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
-
-import sys
-from autogen_utils import *
-from string import Template
-
-class MT:
- def produce_header(self, visible, header_only = False):
- s = ""
- if not visible:
- s += "static never_inline"
- s += """
-hwlm_error_t %s(UNUSED const struct FDR *fdr,
- UNUSED const struct FDR_Runtime_Args * a)""" % self.get_name()
- if header_only:
- s += ";"
- else:
- s += "{"
- s += "\n"
- return s
-
- def produce_guard(self):
- print self.arch.get_guard()
-
- def produce_zero_alternative(self):
- print """
-#else
-#define %s 0
-#endif
-""" % self.get_name()
-
- def close_guard(self):
- print "#endif"
-
- def produce_confirm_base(self, conf_var_name, conf_var_size, offset, cautious, enable_confirmless, do_bailout = False):
- if cautious:
- caution_string = "VECTORING"
- else:
- caution_string = "NOT_CAUTIOUS"
- conf_split_mask = IntegerType(32).constant_to_string(
- self.conf_top_level_split - 1)
- if enable_confirmless:
- quick_check_string = """
- if (!fdrc->mult) {
- u32 id = fdrc->nBitsOrSoleID;
- if ((last_match == id) && (fdrc->flags & NoRepeat))
- continue;
- last_match = id;
- controlVal = a->cb(ptr+byte-buf, ptr+byte-buf, id, a->ctxt);
- continue;
- } """
- else:
- quick_check_string = ""
- if do_bailout:
- bailout_string = """
- if ((ptr + byte < buf + a->start_offset) || (ptr + byte >= buf + len)) continue;"""
- else:
- bailout_string = ""
-
- return Template("""
-if (P0(!!$CONFVAR)) {
- do {
- u32 bit = findAndClearLSB_$CONFVAR_SIZE(&$CONFVAR);
- u32 byte = bit / $NUM_BUCKETS + $OFFSET;
- u32 bitRem = bit % $NUM_BUCKETS;
- $BAILOUT_STRING
- u32 confSplit = *(ptr+byte) & $SPLIT_MASK;
- u32 idx = confSplit * $NUM_BUCKETS + bitRem;
- u32 cf = confBase[idx];
- if (!cf)
- continue;
- fdrc = (const struct FDRConfirm *)((const u8 *)confBase + cf);
- if (!(fdrc->groups & *control))
- continue;
- $QUICK_CHECK_STRING
- CautionReason reason = $CAUTION_STRING;
- CONF_TYPE v;
- const u8 * confirm_loc = ptr + byte - $CONF_PULL_BACK - 7;
- if (likely(reason == NOT_CAUTIOUS || confirm_loc >= buf)) {
- v = lv_u64a(confirm_loc, buf, buf + len);
- } else { // r == VECTORING, confirm_loc < buf
- u64a histBytes = a->histBytes;
- v = lv_u64a_ce(confirm_loc, buf, buf + len);
- // stitch together v (which doesn't move) and history (which does)
- u32 overhang = buf - confirm_loc;
- histBytes >>= 64 - (overhang * 8);
- v |= histBytes;
- }
- confWithBit(fdrc, a, ptr - buf + byte, $CONF_PULL_BACK, control, &last_match, v);
- } while(P0(!!$CONFVAR));
- if (P0(controlVal == HWLM_TERMINATE_MATCHING)) {
- *a->groups = controlVal;
- return HWLM_TERMINATED;
- }
-}""").substitute(CONFVAR = conf_var_name,
- CONFVAR_SIZE = conf_var_size,
- NUM_BUCKETS = self.num_buckets,
- OFFSET = offset,
- SPLIT_MASK = conf_split_mask,
- QUICK_CHECK_STRING = quick_check_string,
- BAILOUT_STRING = bailout_string,
- CAUTION_STRING = caution_string,
- CONF_PULL_BACK = self.conf_pull_back)
-
- def produce_confirm(self, iter, var_name, offset, bits, cautious = True):
- if self.packed:
- print self.produce_confirm_base(var_name, bits, iter*16 + offset, cautious, enable_confirmless = False, do_bailout = False)
- else:
- if cautious:
- caution_string = "VECTORING"
- else:
- caution_string = "NOT_CAUTIOUS"
-
- print " if (P0(!!%s)) {" % var_name
- print " do {"
- if bits == 64:
- print " u32 bit = findAndClearLSB_64(&%s);" % (var_name)
- else:
- print " u32 bit = findAndClearLSB_32(&%s);" % (var_name)
- print " u32 byte = bit / %d + %d;" % (self.num_buckets, iter*16 + offset)
- print " u32 idx = bit %% %d;" % self.num_buckets
- print " u32 cf = confBase[idx];"
- print " fdrc = (const struct FDRConfirm *)((const u8 *)confBase + cf);"
- print " if (!(fdrc->groups & *control))"
- print " continue;"
- print """
- CautionReason reason = %s;
- CONF_TYPE v;
- const u8 * confirm_loc = ptr + byte - 7;
- if (likely(reason == NOT_CAUTIOUS || confirm_loc >= buf)) {
- v = lv_u64a(confirm_loc, buf, buf + len);
- } else { // r == VECTORING, confirm_loc < buf
- u64a histBytes = a->histBytes;
- v = lv_u64a_ce(confirm_loc, buf, buf + len);
- // stitch together v (which doesn't move) and history (which does)
- u32 overhang = buf - confirm_loc;
- histBytes >>= 64 - (overhang * 8);
- v |= histBytes;
- }""" % (caution_string)
- if self.num_masks == 1:
- print " confWithBit1(fdrc, a, ptr - buf + byte, control, &last_match, v);"
- else:
- print " confWithBitMany(fdrc, a, ptr - buf + byte, %s, control, &last_match, v);" % (caution_string)
- print " } while(P0(!!%s));" % var_name
- print " if (P0(controlVal == HWLM_TERMINATE_MATCHING)) {"
- print " *a->groups = controlVal;"
- print " return HWLM_TERMINATED;"
- print " }"
- print " }"
-
- def produce_needed_temporaries(self, max_iterations):
- print " m128 p_mask;"
- for iter in range(0, max_iterations):
- print " m128 val_%d;" % iter
- print " m128 val_%d_lo;" % iter
- print " m128 val_%d_hi;" % iter
- for x in range(self.num_masks):
- print " m128 res_%d_%d;" % (iter, x)
- if x != 0:
- print " m128 res_shifted_%d_%d;" % (iter, x)
- print " m128 r_%d;" % iter
- print "#ifdef ARCH_64_BIT"
- print " u64a r_%d_lopart;" % iter
- print " u64a r_%d_hipart;" % iter
- print "#else"
- print " u32 r_%d_part1;" % iter
- print " u32 r_%d_part2;" % iter
- print " u32 r_%d_part3;" % iter
- print " u32 r_%d_part4;" % iter
- print "#endif"
-
- def produce_one_iteration_state_calc(self, iter, effective_num_iterations,
- cautious, save_old):
- if cautious:
- print " val_%d = vectoredLoad128(&p_mask, ptr + %d, buf, buf+len, a->buf_history, a->len_history, %d);" % (iter, iter*16, self.num_masks)
- else:
- print " val_%d = load128(ptr + %d);" % (iter, iter*16)
- print " val_%d_lo = and128(val_%d, lomask);" % (iter, iter)
- print " val_%d_hi = rshift2x64(val_%d, 4);" % (iter, iter)
- print " val_%d_hi = and128(val_%d_hi, lomask);" % (iter, iter)
- print
- for x in range(self.num_masks):
- print Template("""
- res_${ITER}_${X} = and128(pshufb(maskBase[${X}*2] , val_${ITER}_lo),
- pshufb(maskBase[${X}*2+1], val_${ITER}_hi));""").substitute(ITER = iter, X = x)
- if x != 0:
- if iter == 0:
- print " res_shifted_%d_%d = palignr(res_%d_%d, res_old_%d, 16-%d);" % (iter, x, iter, x, x, x)
- else:
- print " res_shifted_%d_%d = palignr(res_%d_%d, res_%d_%d, 16-%d);" % (iter, x, iter, x, iter-1, x, x)
- if x != 0 and iter == effective_num_iterations - 1 and save_old:
- print " res_old_%d = res_%d_%d;" % (x, iter, x)
- print
- if cautious:
- print " r_%d = and128(res_%d_0, p_mask);" % (iter, iter)
- else:
- print " r_%d = res_%d_0;" % (iter, iter)
- for x in range(1, self.num_masks):
- print " r_%d = and128(r_%d, res_shifted_%d_%d);" % (iter, iter, iter, x)
- print
-
- def produce_one_iteration_confirm(self, iter, confirmCautious):
- setup64 = [ (0, "r_%d_lopart" % iter, "movq(r_%d)" % iter),
- (8, "r_%d_hipart" % iter, "movq(byteShiftRight128(r_%d, 8))" % iter) ]
-
- setup32 = [ (0, "r_%d_part1" % iter, "movd(r_%d)" % iter),
- (4, "r_%d_part2" % iter, "movd(byteShiftRight128(r_%d, 4))" % iter),
- (8, "r_%d_part3" % iter, "movd(byteShiftRight128(r_%d, 8))" % iter),
- (12, "r_%d_part4" % iter, "movd(byteShiftRight128(r_%d, 12))" % iter) ]
-
- print " if (P0(isnonzero128(r_%d))) {" % (iter)
- print "#ifdef ARCH_64_BIT"
- for (off, val, init) in setup64:
- print " %s = %s;" % (val, init)
- for (off, val, init) in setup64:
- self.produce_confirm(iter, val, off, 64, cautious = confirmCautious)
- print "#else"
- for (off, val, init) in setup32:
- print " %s = %s;" % (val, init)
- for (off, val, init) in setup32:
- self.produce_confirm(iter, val, off, 32, cautious = confirmCautious)
- print "#endif"
- print " }"
-
- def produce_one_iteration(self, iter, effective_num_iterations, cautious = False,
- confirmCautious = True, save_old = True):
- self.produce_one_iteration_state_calc(iter, effective_num_iterations, cautious, save_old)
- self.produce_one_iteration_confirm(iter, confirmCautious)
-
- def produce_code(self):
- print self.produce_header(visible = True, header_only = False)
- print """
- const u8 * buf = a->buf;
- const size_t len = a->len;
- const u8 * ptr = buf + a->start_offset;
- hwlmcb_rv_t controlVal = *a->groups;
- hwlmcb_rv_t * control = &controlVal;
- u32 floodBackoff = FLOOD_BACKOFF_START;
- const u8 * tryFloodDetect = a->firstFloodDetect;
- const struct FDRConfirm *fdrc;
- u32 last_match = (u32)-1;
-"""
- print
-
- self.produce_needed_temporaries(self.num_iterations)
- print
-
- print " const struct Teddy * teddy = (const struct Teddy *)fdr;"
- print " const m128 * maskBase = (const m128 *)((const u8 *)fdr + sizeof(struct Teddy));"
- print " const u32 * confBase = (const u32 *)((const u8 *)teddy + sizeof(struct Teddy) + (%d*32));" % self.num_masks
- print " const u8 * mainStart = ROUNDUP_PTR(ptr, 16);"
- print " const size_t iterBytes = %d;" % (self.num_iterations * 16)
-
- print ' DEBUG_PRINTF("params: buf %p len %zu start_offset %zu\\n",' \
- ' buf, len, a->start_offset);'
- print ' DEBUG_PRINTF("derive: ptr: %p mainstart %p\\n", ptr,' \
- ' mainStart);'
-
- for x in range(self.num_masks):
- if (x != 0):
- print " m128 res_old_%d = ones128();" % x
- print " m128 lomask = set16x8(0xf);"
-
- print " if (ptr < mainStart) {"
- print " ptr = mainStart - 16;"
- self.produce_one_iteration(0, 1, cautious = True, confirmCautious = True, save_old = True)
- print " ptr += 16;"
- print " }"
-
- print " if (ptr + 16 < buf + len) {"
- self.produce_one_iteration(0, 1, cautious = False, confirmCautious = True, save_old = True)
- print " ptr += 16;"
- print " }"
-
- print """
- for ( ; ptr + iterBytes <= buf + len; ptr += iterBytes) {
- __builtin_prefetch(ptr + (iterBytes*4));
- if (P0(ptr > tryFloodDetect)) {
- tryFloodDetect = floodDetect(fdr, a, &ptr, tryFloodDetect, &floodBackoff, &controlVal, iterBytes);
- if (P0(controlVal == HWLM_TERMINATE_MATCHING)) {
- *a->groups = controlVal;
- return HWLM_TERMINATED;
- }
- }
-"""
- for iter in range(self.num_iterations):
- self.produce_one_iteration(iter, self.num_iterations, cautious = False, confirmCautious = False)
-
- print " }"
-
- print " for (; ptr < buf + len; ptr += 16) {"
- self.produce_one_iteration(0, 1, cautious = True, confirmCautious = True, save_old = True)
- print " }"
-
- print """
- *a->groups = controlVal;
- return HWLM_SUCCESS;
-}
-"""
-
- def produce_compile_call(self):
- packed_str = { False : "false", True : "true"}[self.packed]
- print " { %d, %s, %d, %d, %s, %d, %d }," % (
- self.id, self.arch.target, self.num_masks, self.num_buckets, packed_str,
- self.conf_pull_back, self.conf_top_level_split)
-
- def get_name(self):
- if self.packed:
- pck_string = "_pck"
- else:
- pck_string = ""
-
- if self.num_buckets == 16:
- type_string = "_fat"
- else:
- type_string = ""
-
- return "fdr_exec_teddy_%s_msks%d%s%s" % (self.arch.name, self.num_masks, pck_string, type_string)
-
- def __init__(self, arch, packed = False, num_masks = 1, num_buckets = 8):
- self.arch = arch
- self.packed = packed
- self.num_masks = num_masks
- self.num_buckets = num_buckets
- self.num_iterations = 2
-
- if packed:
- self.conf_top_level_split = 32
- else:
- self.conf_top_level_split = 1
- self.conf_pull_back = 0
-
-class MTFat(MT):
- def produce_needed_temporaries(self, max_iterations):
- print " m256 p_mask;"
- for iter in range(0, max_iterations):
- print " m256 val_%d;" % iter
- print " m256 val_%d_lo;" % iter
- print " m256 val_%d_hi;" % iter
- for x in range(self.num_masks):
- print " m256 res_%d_%d;" % (iter, x)
- if x != 0:
- print " m256 res_shifted_%d_%d;" % (iter, x)
- print " m256 r_%d;" % iter
- print "#ifdef ARCH_64_BIT"
- print " u64a r_%d_part1;" % iter
- print " u64a r_%d_part2;" % iter
- print " u64a r_%d_part3;" % iter
- print " u64a r_%d_part4;" % iter
- print "#else"
- print " u32 r_%d_part1;" % iter
- print " u32 r_%d_part2;" % iter
- print " u32 r_%d_part3;" % iter
- print " u32 r_%d_part4;" % iter
- print " u32 r_%d_part5;" % iter
- print " u32 r_%d_part6;" % iter
- print " u32 r_%d_part7;" % iter
- print " u32 r_%d_part8;" % iter
- print "#endif"
-
- def produce_code(self):
- print self.produce_header(visible = True, header_only = False)
- print """
- const u8 * buf = a->buf;
- const size_t len = a->len;
- const u8 * ptr = buf + a->start_offset;
- hwlmcb_rv_t controlVal = *a->groups;
- hwlmcb_rv_t * control = &controlVal;
- u32 floodBackoff = FLOOD_BACKOFF_START;
- const u8 * tryFloodDetect = a->firstFloodDetect;
- const struct FDRConfirm *fdrc;
- u32 last_match = (u32)-1;
-"""
- print
-
- self.produce_needed_temporaries(self.num_iterations)
- print
-
- print " const struct Teddy * teddy = (const struct Teddy *)fdr;"
- print " const m256 * maskBase = (const m256 *)((const u8 *)fdr + sizeof(struct Teddy));"
- print " const u32 * confBase = (const u32 *)((const u8 *)teddy + sizeof(struct Teddy) + (%d*32*2));" % self.num_masks
- print " const u8 * mainStart = ROUNDUP_PTR(ptr, 16);"
- print " const size_t iterBytes = %d;" % (self.num_iterations * 16)
-
- print ' DEBUG_PRINTF("params: buf %p len %zu start_offset %zu\\n",' \
- ' buf, len, a->start_offset);'
- print ' DEBUG_PRINTF("derive: ptr: %p mainstart %p\\n", ptr,' \
- ' mainStart);'
-
- for x in range(self.num_masks):
- if (x != 0):
- print " m256 res_old_%d = ones256();" % x
- print " m256 lomask = set32x8(0xf);"
-
- print " if (ptr < mainStart) {"
- print " ptr = mainStart - 16;"
- self.produce_one_iteration(0, 1, cautious = True, confirmCautious = True, save_old = True)
- print " ptr += 16;"
- print " }"
-
- print " if (ptr + 16 < buf + len) {"
- self.produce_one_iteration(0, 1, cautious = False, confirmCautious = True, save_old = True)
- print " ptr += 16;"
- print " }"
-
- print """
- for ( ; ptr + iterBytes <= buf + len; ptr += iterBytes) {
- __builtin_prefetch(ptr + (iterBytes*4));
- if (P0(ptr > tryFloodDetect)) {
- tryFloodDetect = floodDetect(fdr, a, &ptr, tryFloodDetect, &floodBackoff, &controlVal, iterBytes);
- if (P0(controlVal == HWLM_TERMINATE_MATCHING)) {
- *a->groups = controlVal;
- return HWLM_TERMINATED;
- }
- }
-"""
-
- for iter in range(self.num_iterations):
- self.produce_one_iteration(iter, self.num_iterations, False, confirmCautious = False)
-
- print " }"
-
- print " for (; ptr < buf + len; ptr += 16) {"
- self.produce_one_iteration(0, 1, cautious = True, confirmCautious = True, save_old = True)
- print " }"
-
- print """
- *a->groups = controlVal;
- return HWLM_SUCCESS;
-}
-"""
-
- def produce_one_iteration_state_calc(self, iter, effective_num_iterations,
- cautious, save_old):
- if cautious:
- print " val_%d = vectoredLoad2x128(&p_mask, ptr + %d, buf, buf+len, a->buf_history, a->len_history, %d);" % (iter, iter*16, self.num_masks)
- else:
- print " val_%d = load2x128(ptr + %d);" % (iter, iter*16)
- print " val_%d_lo = and256(val_%d, lomask);" % (iter, iter)
- print " val_%d_hi = rshift4x64(val_%d, 4);" % (iter, iter)
- print " val_%d_hi = and256(val_%d_hi, lomask);" % (iter, iter)
- print
- for x in range(self.num_masks):
- print Template("""
- res_${ITER}_${X} = and256(vpshufb(maskBase[${X}*2] , val_${ITER}_lo),
- vpshufb(maskBase[${X}*2+1], val_${ITER}_hi));""").substitute(ITER = iter, X = x)
- if x != 0:
- if iter == 0:
- print " res_shifted_%d_%d = vpalignr(res_%d_%d, res_old_%d, 16-%d);" % (iter, x, iter, x, x, x)
- else:
- print " res_shifted_%d_%d = vpalignr(res_%d_%d, res_%d_%d, 16-%d);" % (iter, x, iter, x, iter-1, x, x)
- if x != 0 and iter == effective_num_iterations - 1 and save_old:
- print " res_old_%d = res_%d_%d;" % (x, iter, x)
- print
- if cautious:
- print " r_%d = and256(res_%d_0, p_mask);" % (iter, iter)
- else:
- print " r_%d = res_%d_0;" % (iter, iter)
- for x in range(1, self.num_masks):
- print " r_%d = and256(r_%d, res_shifted_%d_%d);" % (iter, iter, iter, x)
- print
-
- def produce_one_iteration_confirm(self, iter, confirmCautious):
- setup64 = [ (0, "r_%d_part1" % iter, "extractlow64from256(r)"),
- (4, "r_%d_part2" % iter, "extract64from256(r, 1);\n r = interleave256hi(r_%d, r_swap)" % (iter)),
- (8, "r_%d_part3" % iter, "extractlow64from256(r)"),
- (12, "r_%d_part4" % iter, "extract64from256(r, 1)") ]
-
- setup32 = [ (0, "r_%d_part1" % iter, "extractlow32from256(r)"),
- (2, "r_%d_part2" % iter, "extract32from256(r, 1)"),
- (4, "r_%d_part3" % iter, "extract32from256(r, 2)"),
- (6, "r_%d_part4" % iter, "extract32from256(r, 3);\n r = interleave256hi(r_%d, r_swap)" % (iter)),
- (8, "r_%d_part5" % iter, "extractlow32from256(r)"),
- (10, "r_%d_part6" % iter, "extract32from256(r, 1)"),
- (12, "r_%d_part7" % iter, "extract32from256(r, 2)"),
- (14, "r_%d_part8" % iter, "extract32from256(r, 3)") ]
-
- print " if (P0(isnonzero256(r_%d))) {" % (iter)
- print " m256 r_swap = swap128in256(r_%d);" % (iter)
- print " m256 r = interleave256lo(r_%d, r_swap);" % (iter)
- print "#ifdef ARCH_64_BIT"
- for (off, val, init) in setup64:
- print " %s = %s;" % (val, init)
-
- for (off, val, init) in setup64:
- self.produce_confirm(iter, val, off, 64, cautious = confirmCautious)
- print "#else"
- for (off, val, init) in setup32:
- print " %s = %s;" % (val, init)
-
- for (off, val, init) in setup32:
- self.produce_confirm(iter, val, off, 32, cautious = confirmCautious)
- print "#endif"
- print " }"
-
-class MTFast:
- def produce_header(self, visible, header_only = False):
- s = ""
- if not visible:
- s += "static never_inline"
- s += """
-hwlm_error_t %s(UNUSED const struct FDR *fdr,
- UNUSED const struct FDR_Runtime_Args * a)""" % self.get_name()
- if header_only:
- s += ";"
- else:
- s += "{"
- s += "\n"
- return s
-
- def produce_guard(self):
- print self.arch.get_guard()
-
- def produce_zero_alternative(self):
- print """
-#else
-#define %s 0
-#endif
-""" % self.get_name()
-
- def close_guard(self):
- print "#endif"
-
- def produce_confirm(self, cautious):
- if cautious:
- cautious_str = "VECTORING"
- else:
- cautious_str = "NOT_CAUTIOUS"
-
- print " for (u32 i = 0; i < arrCnt; i++) {"
- print " u32 byte = bitArr[i] / 8;"
- if self.packed:
- conf_split_mask = IntegerType(32).constant_to_string(
- self.conf_top_level_split - 1)
- print " u32 bitRem = bitArr[i] % 8;"
- print " u32 confSplit = *(ptr+byte) & 0x1f;"
- print " u32 idx = confSplit * %d + bitRem;" % self.num_buckets
- print " u32 cf = confBase[idx];"
- print " if (!cf)"
- print " continue;"
- print " fdrc = (const struct FDRConfirm *)((const u8 *)confBase + cf);"
- print " if (!(fdrc->groups & *control))"
- print " continue;"
- print """
- CautionReason reason = %s;
- CONF_TYPE v;
- const u8 * confirm_loc = ptr + byte - 7;
- if (likely(reason == NOT_CAUTIOUS || confirm_loc >= buf)) {
- v = lv_u64a(confirm_loc, buf, buf + len);
- } else { // r == VECTORING, confirm_loc < buf
- u64a histBytes = a->histBytes;
- v = lv_u64a_ce(confirm_loc, buf, buf + len);
- // stitch together v (which doesn't move) and history (which does)
- u32 overhang = buf - confirm_loc;
- histBytes >>= 64 - (overhang * 8);
- v |= histBytes;
- }""" % (cautious_str)
- print " confWithBit(fdrc, a, ptr - buf + byte, 0, control, &last_match, v);"
- else:
- print " u32 cf = confBase[bitArr[i] % 8];"
- print " fdrc = (const struct FDRConfirm *)((const u8 *)confBase + cf);"
- print """
- CautionReason reason = %s;
- CONF_TYPE v;
- const u8 * confirm_loc = ptr + byte - 7;
- if (likely(reason == NOT_CAUTIOUS || confirm_loc >= buf)) {
- v = lv_u64a(confirm_loc, buf, buf + len);
- } else { // r == VECTORING, confirm_loc < buf
- u64a histBytes = a->histBytes;
- v = lv_u64a_ce(confirm_loc, buf, buf + len);
- // stitch together v (which doesn't move) and history (which does)
- u32 overhang = buf - confirm_loc;
- histBytes >>= 64 - (overhang * 8);
- v |= histBytes;
- }""" % (cautious_str)
- print " confWithBit1(fdrc, a, ptr - buf + byte, control, &last_match, v);"
- print " if (P0(controlVal == HWLM_TERMINATE_MATCHING)) {"
- print " *a->groups = controlVal;"
- print " return HWLM_TERMINATED;"
- print " }"
- print " }"
-
- def produce_needed_temporaries(self, max_iterations):
- print " u32 arrCnt;"
- print " u16 bitArr[512];"
- print " m256 p_mask;"
- print " m256 val_0;"
- print " m256 val_0_lo;"
- print " m256 val_0_hi;"
- print " m256 res_0;"
- print " m256 res_1;"
- print " m128 lo_part;"
- print " m128 hi_part;"
- print "#ifdef ARCH_64_BIT"
- print " u64a r_0_part;"
- print "#else"
- print " u32 r_0_part;"
- print "#endif"
-
- def produce_bit_scan(self, offset, bits):
- print " while (P0(!!r_0_part)) {"
- if bits == 64:
- print " bitArr[arrCnt++] = (u16)findAndClearLSB_64(&r_0_part) + 64 * %d;" % (offset)
- else:
- print " bitArr[arrCnt++] = (u16)findAndClearLSB_32(&r_0_part) + 32 * %d;" % (offset)
- print " }"
-
- def produce_bit_check_128(self, var_name, offset):
- print " if (P0(isnonzero128(%s))) {" % (var_name)
- print "#ifdef ARCH_64_BIT"
- print " r_0_part = movq(%s);" % (var_name)
- self.produce_bit_scan(offset, 64)
- print " r_0_part = movq(byteShiftRight128(%s, 8));" % (var_name)
- self.produce_bit_scan(offset + 1, 64)
- print "#else"
- print " r_0_part = movd(%s);" % (var_name)
- self.produce_bit_scan(offset * 2, 32)
- for step in range(1, 4):
- print " r_0_part = movd(byteShiftRight128(%s, %d));" % (var_name, step * 4)
- self.produce_bit_scan(offset * 2 + step, 32)
- print "#endif"
- print " }"
-
- def produce_bit_check_256(self, iter, single_iter, cautious):
- print " if (P0(isnonzero256(res_%d))) {" % (iter)
- if single_iter:
- print " arrCnt = 0;"
- print " lo_part = cast256to128(res_%d);" % (iter)
- print " hi_part = cast256to128(swap128in256(res_%d));" % (iter)
- self.produce_bit_check_128("lo_part", iter * 4)
- self.produce_bit_check_128("hi_part", iter * 4 + 2)
- if single_iter:
- self.produce_confirm(cautious)
- print " }"
-
- def produce_one_iteration_state_calc(self, iter, cautious):
- if cautious:
- print " val_0 = vectoredLoad256(&p_mask, ptr + %d, buf+a->start_offset, buf+len, a->buf_history, a->len_history);" % (iter * 32)
- else:
- print " val_0 = load256(ptr + %d);" % (iter * 32)
- print " val_0_lo = and256(val_0, lomask);"
- print " val_0_hi = rshift4x64(val_0, 4);"
- print " val_0_hi = and256(val_0_hi, lomask);"
- print " res_%d = and256(vpshufb(maskLo , val_0_lo), vpshufb(maskHi, val_0_hi));" % (iter)
- if cautious:
- print " res_%d = and256(res_%d, p_mask);" % (iter, iter)
-
- def produce_code(self):
- print self.produce_header(visible = True, header_only = False)
- print """
- const u8 * buf = a->buf;
- const size_t len = a->len;
- const u8 * ptr = buf + a->start_offset;
- hwlmcb_rv_t controlVal = *a->groups;
- hwlmcb_rv_t * control = &controlVal;
- u32 floodBackoff = FLOOD_BACKOFF_START;
- const u8 * tryFloodDetect = a->firstFloodDetect;
- const struct FDRConfirm *fdrc;
- u32 last_match = (u32)-1;
-"""
- print
-
- self.produce_needed_temporaries(self.num_iterations)
-
- print " const struct Teddy * teddy = (const struct Teddy *)fdr;"
- print " const m128 * maskBase = (const m128 *)((const u8 *)fdr + sizeof(struct Teddy));"
- print " const m256 maskLo = set2x128(maskBase[0]);"
- print " const m256 maskHi = set2x128(maskBase[1]);"
- print " const u32 * confBase = (const u32 *)((const u8 *)teddy + sizeof(struct Teddy) + 32);"
- print " const u8 * mainStart = ROUNDUP_PTR(ptr, 32);"
- print " const size_t iterBytes = %d;" % (self.num_iterations * 32)
-
- print ' DEBUG_PRINTF("params: buf %p len %zu start_offset %zu\\n",' \
- ' buf, len, a->start_offset);'
- print ' DEBUG_PRINTF("derive: ptr: %p mainstart %p\\n", ptr,' \
- ' mainStart);'
- print " const m256 lomask = set32x8(0xf);"
-
- print " if (ptr < mainStart) {"
- print " ptr = mainStart - 32;"
- self.produce_one_iteration_state_calc(iter = 0, cautious = True)
- self.produce_bit_check_256(iter = 0, single_iter = True, cautious = True)
- print " ptr += 32;"
- print " }"
-
- print " if (ptr + 32 < buf + len) {"
- self.produce_one_iteration_state_calc(iter = 0, cautious = False)
- self.produce_bit_check_256(iter = 0, single_iter = True, cautious = True)
- print " ptr += 32;"
- print " }"
- print """
- for ( ; ptr + iterBytes <= buf + len; ptr += iterBytes) {
- __builtin_prefetch(ptr + (iterBytes*4));
- if (P0(ptr > tryFloodDetect)) {
- tryFloodDetect = floodDetect(fdr, a, &ptr, tryFloodDetect, &floodBackoff, &controlVal, iterBytes);
- if (P0(controlVal == HWLM_TERMINATE_MATCHING)) {
- *a->groups = controlVal;
- return HWLM_TERMINATED;
- }
- }
-"""
-
- for iter in range (0, self.num_iterations):
- self.produce_one_iteration_state_calc(iter = iter, cautious = False)
- print " arrCnt = 0;"
- for iter in range (0, self.num_iterations):
- self.produce_bit_check_256(iter = iter, single_iter = False, cautious = False)
- self.produce_confirm(cautious = False)
- print " }"
-
- print " for (; ptr < buf + len; ptr += 32) {"
- self.produce_one_iteration_state_calc(iter = 0, cautious = True)
- self.produce_bit_check_256(iter = 0, single_iter = True, cautious = True)
- print " }"
-
- print """
- *a->groups = controlVal;
- return HWLM_SUCCESS;
-}
-"""
-
- def get_name(self):
- if self.packed:
- pck_string = "_pck"
- else:
- pck_string = ""
- return "fdr_exec_teddy_%s_msks%d%s_fast" % (self.arch.name, self.num_masks, pck_string)
-
- def produce_compile_call(self):
- packed_str = { False : "false", True : "true"}[self.packed]
- print " { %d, %s, %d, %d, %s, %d, %d }," % (
- self.id, self.arch.target, self.num_masks, self.num_buckets, packed_str,
- self.conf_pull_back, self.conf_top_level_split)
-
- def __init__(self, arch, packed = False):
- self.arch = arch
- self.packed = packed
- self.num_masks = 1
- self.num_buckets = 8
- self.num_iterations = 2
-
- self.conf_top_level_split = 1
- self.conf_pull_back = 0
- if packed:
- self.conf_top_level_split = 32
- else:
- self.conf_top_level_split = 1
- self.conf_pull_back = 0
--- /dev/null
+/*
+ * Copyright (c) 2016, Intel Corporation
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *
+ * * Redistributions of source code must retain the above copyright notice,
+ * this list of conditions and the following disclaimer.
+ * * Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ * * Neither the name of Intel Corporation nor the names of its contributors
+ * may be used to endorse or promote products derived from this software
+ * without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
+ * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+ * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+ * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
+ * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
+ * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+ * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
+ * POSSIBILITY OF SUCH DAMAGE.
+ */
+
+/** \file
+ * \brief Teddy literal matcher: AVX2 engine runtime.
+ */
+
+#include "fdr_internal.h"
+#include "flood_runtime.h"
+#include "teddy.h"
+#include "teddy_internal.h"
+#include "teddy_runtime_common.h"
+#include "util/simd_utils.h"
+#include "util/simd_utils_ssse3.h"
+
+#if defined(__AVX2__)
+
+static const u8 ALIGN_AVX_DIRECTIVE p_mask_arr256[33][64] = {
+ {0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
+ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00},
+ {0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
+ 0xff, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00},
+ {0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
+ 0xff, 0xff, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00},
+ {0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
+ 0xff, 0xff, 0xff, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00},
+ {0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
+ 0xff, 0xff, 0xff, 0xff, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00},
+ {0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
+ 0xff, 0xff, 0xff, 0xff, 0xff, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00},
+ {0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
+ 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00},
+ {0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
+ 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00},
+ {0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
+ 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00},
+ {0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
+ 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00},
+ {0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
+ 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00},
+ {0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
+ 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00},
+ {0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
+ 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00},
+ {0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
+ 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00},
+ {0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
+ 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00},
+ {0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
+ 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00},
+ {0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
+ 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00},
+ {0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
+ 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00},
+ {0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
+ 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00},
+ {0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
+ 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00},
+ {0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
+ 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00},
+ {0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
+ 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00},
+ {0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
+ 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00},
+ {0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
+ 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00},
+ {0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
+ 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00},
+ {0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
+ 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00},
+ {0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
+ 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00},
+ {0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
+ 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0x00, 0x00, 0x00, 0x00, 0x00},
+ {0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
+ 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0x00, 0x00, 0x00, 0x00},
+ {0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
+ 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0x00, 0x00, 0x00},
+ {0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
+ 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0x00, 0x00},
+ {0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
+ 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0x00},
+ {0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
+ 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff}
+};
+
+/*
+ * CONFIRM_FAT_TEDDY(var, bucket, offset, reason, conf_fn)
+ *
+ * When the 256-bit vector `var` is non-zero, combines it with its
+ * lane-swapped copy through interleave256lo / interleave256hi and splits the
+ * two results into machine-word sized parts: four u64a parts (passed with
+ * offset + 0, 4, 8, 12) on ARCH_64_BIT builds, eight u32 parts (passed with
+ * offset + 0, 2, ..., 14) otherwise. Every non-zero part is handed to
+ * conf_fn, and CHECK_HWLM_TERMINATE_MATCHING is evaluated after each of
+ * those calls so that no further part is confirmed once it fires.
+ *
+ * The macro expands in the caller's scope and uses the caller's confBase,
+ * a, ptr, control and last_match variables by name.
+ */
+#ifdef ARCH_64_BIT
+#define CONFIRM_FAT_TEDDY(var, bucket, offset, reason, conf_fn) \
+do { \
+    if (unlikely(isnonzero256(var))) { \
+        m256 swap = swap128in256(var); \
+        m256 r = interleave256lo(var, swap); \
+        u64a part1 = extractlow64from256(r); \
+        u64a part2 = extract64from256(r, 1); \
+        r = interleave256hi(var, swap); \
+        u64a part3 = extractlow64from256(r); \
+        u64a part4 = extract64from256(r, 1); \
+        if (unlikely(part1)) { \
+            conf_fn(&part1, bucket, offset, confBase, reason, a, ptr, \
+                    control, &last_match); \
+            CHECK_HWLM_TERMINATE_MATCHING; \
+        } \
+        if (unlikely(part2)) { \
+            conf_fn(&part2, bucket, offset + 4, confBase, reason, a, ptr, \
+                    control, &last_match); \
+            CHECK_HWLM_TERMINATE_MATCHING; \
+        } \
+        if (unlikely(part3)) { \
+            conf_fn(&part3, bucket, offset + 8, confBase, reason, a, ptr, \
+                    control, &last_match); \
+            CHECK_HWLM_TERMINATE_MATCHING; \
+        } \
+        if (unlikely(part4)) { \
+            conf_fn(&part4, bucket, offset + 12, confBase, reason, a, ptr, \
+                    control, &last_match); \
+            CHECK_HWLM_TERMINATE_MATCHING; \
+        } \
+    } \
+} while (0);
+#else
+#define CONFIRM_FAT_TEDDY(var, bucket, offset, reason, conf_fn) \
+do { \
+    if (unlikely(isnonzero256(var))) { \
+        m256 swap = swap128in256(var); \
+        m256 r = interleave256lo(var, swap); \
+        u32 part1 = extractlow32from256(r); \
+        u32 part2 = extract32from256(r, 1); \
+        u32 part3 = extract32from256(r, 2); \
+        u32 part4 = extract32from256(r, 3); \
+        r = interleave256hi(var, swap); \
+        u32 part5 = extractlow32from256(r); \
+        u32 part6 = extract32from256(r, 1); \
+        u32 part7 = extract32from256(r, 2); \
+        u32 part8 = extract32from256(r, 3); \
+        if (unlikely(part1)) { \
+            conf_fn(&part1, bucket, offset, confBase, reason, a, ptr, \
+                    control, &last_match); \
+            CHECK_HWLM_TERMINATE_MATCHING; \
+        } \
+        if (unlikely(part2)) { \
+            conf_fn(&part2, bucket, offset + 2, confBase, reason, a, ptr, \
+                    control, &last_match); \
+            /* This check was missing for part2 only; without it a */ \
+            /* termination raised here was ignored until a later part. */ \
+            CHECK_HWLM_TERMINATE_MATCHING; \
+        } \
+        if (unlikely(part3)) { \
+            conf_fn(&part3, bucket, offset + 4, confBase, reason, a, ptr, \
+                    control, &last_match); \
+            CHECK_HWLM_TERMINATE_MATCHING; \
+        } \
+        if (unlikely(part4)) { \
+            conf_fn(&part4, bucket, offset + 6, confBase, reason, a, ptr, \
+                    control, &last_match); \
+            CHECK_HWLM_TERMINATE_MATCHING; \
+        } \
+        if (unlikely(part5)) { \
+            conf_fn(&part5, bucket, offset + 8, confBase, reason, a, ptr, \
+                    control, &last_match); \
+            CHECK_HWLM_TERMINATE_MATCHING; \
+        } \
+        if (unlikely(part6)) { \
+            conf_fn(&part6, bucket, offset + 10, confBase, reason, a, ptr, \
+                    control, &last_match); \
+            CHECK_HWLM_TERMINATE_MATCHING; \
+        } \
+        if (unlikely(part7)) { \
+            conf_fn(&part7, bucket, offset + 12, confBase, reason, a, ptr, \
+                    control, &last_match); \
+            CHECK_HWLM_TERMINATE_MATCHING; \
+        } \
+        if (unlikely(part8)) { \
+            conf_fn(&part8, bucket, offset + 14, confBase, reason, a, ptr, \
+                    control, &last_match); \
+            CHECK_HWLM_TERMINATE_MATCHING; \
+        } \
+    } \
+} while (0);
+#endif
+
+/*
+ * CONFIRM_FAST_TEDDY(var, offset, reason, conf_fn)
+ *
+ * When `var` is non-zero, feeds its two 128-bit halves to
+ * bit_array_fast_teddy (with offset and offset + 2), which records the set
+ * bit positions in the caller's bitArr, and then calls conf_fn once per
+ * recorded entry, evaluating CHECK_HWLM_TERMINATE_MATCHING after every call.
+ *
+ * Expands in the caller's scope and uses its bitArr, confBase, a, ptr,
+ * control and last_match variables by name.
+ */
+#define CONFIRM_FAST_TEDDY(var, offset, reason, conf_fn) \
+do { \
+    if (unlikely(isnonzero256(var))) { \
+        u32 hitCount = 0; \
+        u32 hitIdx; \
+        const m128 lowerHalf = cast256to128(var); \
+        const m128 upperHalf = cast256to128(swap128in256(var)); \
+        bit_array_fast_teddy(lowerHalf, bitArr, &hitCount, offset); \
+        bit_array_fast_teddy(upperHalf, bitArr, &hitCount, offset + 2); \
+        for (hitIdx = 0; hitIdx < hitCount; hitIdx++) { \
+            conf_fn(bitArr[hitIdx], confBase, reason, a, ptr, control, \
+                    &last_match); \
+            CHECK_HWLM_TERMINATE_MATCHING; \
+        } \
+    } \
+} while (0);
+
+/*
+ * Thin 256-bit wrapper around vectoredLoad128: forwards all arguments, then
+ * widens both the returned vector and the mask it reports to m256 with
+ * set2x128. The widened mask is stored through p_mask.
+ */
+static really_inline
+m256 vectoredLoad2x128(m256 *p_mask, const u8 *ptr, const u8 *lo, const u8 *hi,
+                       const u8 *buf_history, size_t len_history,
+                       const u32 nMasks) {
+    m128 mask128;
+    const m128 data128 = vectoredLoad128(&mask128, ptr, lo, hi, buf_history,
+                                         len_history, nMasks);
+    const m256 data256 = set2x128(data128);
+    *p_mask = set2x128(mask128);
+    return data256;
+}
+
+/*
+ * \brief Copy a runt block of [0,31] bytes using explicit loads and stores.
+ *
+ * This is a workaround intended to stop some compilers from synthesizing a
+ * memcpy function call out of the short copy that vectoredLoad256 performs.
+ * Lengths that are not an exact chunk size are covered by two overlapping
+ * chunks, the trailing one being written first.
+ */
+static really_inline
+void copyRuntBlock256(u8 *dst, const u8 *src, size_t len) {
+    if (len == 0) {
+        return;
+    }
+
+    if (len < 4) {
+        /* 1, 2 or 3 bytes. */
+        if (len == 1) {
+            *dst = *src;
+            return;
+        }
+        unaligned_store_u16(dst, unaligned_load_u16(src));
+        if (len == 3) {
+            dst[2] = src[2];
+        }
+        return;
+    }
+
+    if (len == 4) {
+        unaligned_store_u32(dst, unaligned_load_u32(src));
+        return;
+    }
+
+    if (len < 8) {
+        /* 5..7 bytes: two overlapping 4-byte chunks. */
+        const size_t tail4 = len - 4;
+        unaligned_store_u32(dst + tail4, unaligned_load_u32(src + tail4));
+        unaligned_store_u32(dst, unaligned_load_u32(src));
+        return;
+    }
+
+    if (len == 8) {
+        unaligned_store_u64a(dst, unaligned_load_u64a(src));
+        return;
+    }
+
+    if (len < 16) {
+        /* 9..15 bytes: two overlapping 8-byte chunks. */
+        const size_t tail8 = len - 8;
+        unaligned_store_u64a(dst + tail8, unaligned_load_u64a(src + tail8));
+        unaligned_store_u64a(dst, unaligned_load_u64a(src));
+        return;
+    }
+
+    if (len == 16) {
+        storeu128(dst, loadu128(src));
+        return;
+    }
+
+    /* Anything longer: two overlapping 16-byte chunks. */
+    assert(len < 32);
+    storeu128(dst + len - 16, loadu128(src + len - 16));
+    storeu128(dst, loadu128(src));
+}
+/*
+ * Build a 32-byte vector for position ptr when a plain 32-byte load may not
+ * be possible, and report through *p_mask which lanes are valid.
+ *
+ *  - ptr >= lo and at least 32 bytes remain before hi: returns loadu256(ptr)
+ *    with the mask taken from p_mask_arr256[32] + 32.
+ *  - ptr >= lo with fewer bytes left: copies the (hi - ptr) available bytes
+ *    into the start of a local union and uses p_mask_arr256[avail] + 32.
+ *  - ptr < lo: copies only the bytes from lo up to MIN(32, hi - ptr) and
+ *    loads the mask from p_mask_arr256[end - start] + 32 - start.
+ *
+ * Union bytes outside the copied range are never written; presumably callers
+ * rely on *p_mask to discard them - TODO confirm. Judging by the rows of
+ * p_mask_arr256 visible in this file, row n is 32 zero bytes followed by n
+ * 0xff bytes, which is what makes the "+ 32" / "+ 32 - start" offsets select
+ * the valid lanes.
+ */
+static really_inline
+m256 vectoredLoad256(m256 *p_mask, const u8 *ptr, const u8 *lo, const u8 *hi,
+ const u8 *buf_history, size_t len_history) {
+ union {
+ u8 val8[32];
+ m256 val256;
+ } u;
+
+ uintptr_t copy_start;
+ uintptr_t copy_len;
+
+ if (ptr >= lo) {
+ uintptr_t avail = (uintptr_t)(hi - ptr);
+ if (avail >= 32) {
+ *p_mask = load256(p_mask_arr256[32] + 32);
+ return loadu256(ptr); /* full block: no staging copy needed */
+ }
+ *p_mask = load256(p_mask_arr256[avail] + 32);
+ copy_start = 0;
+ copy_len = avail;
+ } else {
+ // ptr lies before lo: the first (lo - ptr) lanes have no buffer data.
+ // NOTE(review): start == lo - ptr, so "ptr + i < lo" is already false
+ // on entry and the history loop below copies nothing as written.
+ uintptr_t start = (uintptr_t)(lo - ptr);
+ uintptr_t i;
+ for (i = start; ptr + i < lo; i++) {
+ u.val8[i] = buf_history[len_history - (lo - (ptr + i))];
+ }
+ uintptr_t end = MIN(32, (uintptr_t)(hi - ptr)); /* clamp to 32 lanes */
+ *p_mask = loadu256(p_mask_arr256[end - start] + 32 - start);
+ copy_start = i;
+ copy_len = end - i;
+ }
+
+ // Copy the partial (runt) block of buffer bytes into the staging union.
+ copyRuntBlock256(&u.val8[copy_start], &ptr[copy_start], copy_len);
+
+ return u.val256;
+}
+
+/*
+ * Confirm one fast-Teddy hit through confWithBit1.
+ *
+ * `bits` is split into a byte offset (bits / 8) and a low three-bit index
+ * (bits % 8). The index selects an entry of confBase, which is used as a
+ * byte offset from confBase to the FDRConfirm structure. The confirm value
+ * comes from getConfVal for that byte offset.
+ */
+static really_inline
+void do_confWithBit1_fast_teddy(u16 bits, const u32 *confBase,
+                                CautionReason reason,
+                                const struct FDR_Runtime_Args *a,
+                                const u8 *ptr, hwlmcb_rv_t *control,
+                                u32 *last_match) {
+    const u32 byteOff = bits >> 3;
+    const u32 confIdx = bits & 7;
+    const u8 *confBytes = (const u8 *)confBase;
+    const struct FDRConfirm *fdrc =
+        (const struct FDRConfirm *)(confBytes + confBase[confIdx]);
+    const u64a confVal = getConfVal(a, ptr, byteOff, reason);
+
+    confWithBit1(fdrc, a, ptr - a->buf + byteOff, control, last_match,
+                 confVal);
+}
+
+/*
+ * Confirm one fast-Teddy hit through confWithBit.
+ *
+ * The confBase index is built from the low five bits of the input byte at
+ * ptr[bits / 8] (times 8) plus bits % 8. Nothing is confirmed when that
+ * entry is zero or when the FDRConfirm's groups do not intersect *control.
+ */
+static really_inline
+void do_confWithBit_fast_teddy(u16 bits, const u32 *confBase,
+                               CautionReason reason,
+                               const struct FDR_Runtime_Args *a, const u8 *ptr,
+                               hwlmcb_rv_t *control, u32 *last_match) {
+    const u32 byteOff = bits >> 3;
+    const u32 splitIdx = (ptr[byteOff] & 0x1f) * 8 + (bits & 7);
+    const u32 confOff = confBase[splitIdx];
+
+    if (confOff) {
+        const struct FDRConfirm *fdrc =
+            (const struct FDRConfirm *)((const u8 *)confBase + confOff);
+        if (fdrc->groups & *control) {
+            const u64a confVal = getConfVal(a, ptr, byteOff, reason);
+            confWithBit(fdrc, a, ptr - a->buf + byteOff, 0, control,
+                        last_match, confVal);
+        }
+    }
+}
+
+/*
+ * Append the position of every set bit of the 128-bit `var` to bitArr.
+ *
+ * The vector is drained one machine word at a time (two 64-bit words on
+ * ARCH_64_BIT builds, four 32-bit words otherwise) with
+ * TEDDY_FIND_AND_CLEAR_LSB. Each stored value is the bit index within the
+ * word plus the word's base, where word k of the vector has base
+ * 64 * offset + k * (word size in bits). *arrCnt is the running number of
+ * entries in bitArr and is advanced once per stored value.
+ */
+static really_inline
+void bit_array_fast_teddy(m128 var, u16 *bitArr, u32 *arrCnt, u32 offset) {
+    if (!unlikely(isnonzero128(var))) {
+        return;
+    }
+
+    const u32 base = 64 * (offset);
+#ifdef ARCH_64_BIT
+    u64a word = movq(var);
+    while (unlikely(word)) {
+        bitArr[*arrCnt] = (u16) TEDDY_FIND_AND_CLEAR_LSB(&word) + base;
+        *arrCnt += 1;
+    }
+    word = movq(byteShiftRight128(var, 8));
+    while (unlikely(word)) {
+        bitArr[*arrCnt] = (u16) TEDDY_FIND_AND_CLEAR_LSB(&word) + (base + 64);
+        *arrCnt += 1;
+    }
+#else
+    u32 word = movd(var);
+    while (unlikely(word)) {
+        bitArr[*arrCnt] = (u16) TEDDY_FIND_AND_CLEAR_LSB(&word) + base;
+        *arrCnt += 1;
+    }
+    word = movd(byteShiftRight128(var, 4));
+    while (unlikely(word)) {
+        bitArr[*arrCnt] = (u16) TEDDY_FIND_AND_CLEAR_LSB(&word) + (base + 32);
+        *arrCnt += 1;
+    }
+    word = movd(byteShiftRight128(var, 8));
+    while (unlikely(word)) {
+        bitArr[*arrCnt] = (u16) TEDDY_FIND_AND_CLEAR_LSB(&word) + (base + 64);
+        *arrCnt += 1;
+    }
+    word = movd(byteShiftRight128(var, 12));
+    while (unlikely(word)) {
+        bitArr[*arrCnt] = (u16) TEDDY_FIND_AND_CLEAR_LSB(&word) + (base + 96);
+        *arrCnt += 1;
+    }
+#endif
+}
+
+/*
+ * One-mask candidate computation for fat Teddy.
+ *
+ * Splits every byte of val into its low and high nibble, uses them as
+ * vpshufb indices into maskBase[0] and maskBase[1] respectively, and ANDs
+ * the two lookups together with p_mask.
+ */
+static really_inline
+m256 prep_conf_fat_teddy_m1(const m256 *maskBase, m256 p_mask, m256 val) {
+    const m256 nibbleMask = set32x8(0xf);
+    const m256 loNibbles = and256(val, nibbleMask);
+    const m256 hiNibbles = and256(rshift4x64(val, 4), nibbleMask);
+    const m256 loLookup = vpshufb(maskBase[0], loNibbles);
+    const m256 hiLookup = vpshufb(maskBase[1], hiNibbles);
+    return and256(and256(loLookup, hiLookup), p_mask);
+}
+
+/*
+ * Two-mask candidate computation for fat Teddy.
+ *
+ * Starts from the one-mask result, then looks the nibbles of val up in
+ * maskBase[2] / maskBase[3]. That second lookup is combined with the one
+ * saved from the previous call (*old_1) via vpalignr(..., 16-1) before being
+ * ANDed in. *old_1 is updated with the current second-mask lookup.
+ */
+static really_inline
+m256 prep_conf_fat_teddy_m2(const m256 *maskBase, m256 *old_1, m256 p_mask,
+                            m256 val) {
+    const m256 nibbleMask = set32x8(0xf);
+    const m256 loNibbles = and256(val, nibbleMask);
+    const m256 hiNibbles = and256(rshift4x64(val, 4), nibbleMask);
+    const m256 firstMask = prep_conf_fat_teddy_m1(maskBase, p_mask, val);
+
+    const m256 secondMask = and256(vpshufb(maskBase[2], loNibbles),
+                                   vpshufb(maskBase[3], hiNibbles));
+    const m256 secondShifted = vpalignr(secondMask, *old_1, 16-1);
+    *old_1 = secondMask;
+
+    const m256 masked = and256(firstMask, p_mask);
+    return and256(masked, secondShifted);
+}
+
+/*
+ * Three-mask candidate computation for fat Teddy.
+ *
+ * Extends the two-mask result with a lookup in maskBase[4] / maskBase[5],
+ * combined with the previous call's value (*old_2) via vpalignr(..., 16-2).
+ * *old_2 is updated with the current third-mask lookup; *old_1 is updated
+ * by the nested two-mask step.
+ */
+static really_inline
+m256 prep_conf_fat_teddy_m3(const m256 *maskBase, m256 *old_1, m256 *old_2,
+                            m256 p_mask, m256 val) {
+    const m256 nibbleMask = set32x8(0xf);
+    const m256 loNibbles = and256(val, nibbleMask);
+    const m256 hiNibbles = and256(rshift4x64(val, 4), nibbleMask);
+    const m256 firstTwo = prep_conf_fat_teddy_m2(maskBase, old_1, p_mask, val);
+
+    const m256 thirdMask = and256(vpshufb(maskBase[4], loNibbles),
+                                  vpshufb(maskBase[5], hiNibbles));
+    const m256 thirdShifted = vpalignr(thirdMask, *old_2, 16-2);
+    *old_2 = thirdMask;
+
+    return and256(firstTwo, thirdShifted);
+}
+
+/*
+ * Four-mask candidate computation for fat Teddy.
+ *
+ * Extends the three-mask result with a lookup in maskBase[6] / maskBase[7],
+ * combined with the previous call's value (*old_3) via vpalignr(..., 16-3).
+ * *old_3 is updated with the current fourth-mask lookup; *old_1 and *old_2
+ * are updated by the nested steps.
+ */
+static really_inline
+m256 prep_conf_fat_teddy_m4(const m256 *maskBase, m256 *old_1, m256 *old_2,
+                            m256 *old_3, m256 p_mask, m256 val) {
+    const m256 nibbleMask = set32x8(0xf);
+    const m256 loNibbles = and256(val, nibbleMask);
+    const m256 hiNibbles = and256(rshift4x64(val, 4), nibbleMask);
+    const m256 firstThree = prep_conf_fat_teddy_m3(maskBase, old_1, old_2,
+                                                   p_mask, val);
+
+    const m256 fourthMask = and256(vpshufb(maskBase[6], loNibbles),
+                                   vpshufb(maskBase[7], hiNibbles));
+    const m256 fourthShifted = vpalignr(fourthMask, *old_3, 16-3);
+    *old_3 = fourthMask;
+
+    return and256(firstThree, fourthShifted);
+}
+
+/*
+ * One-mask candidate computation for fast Teddy.
+ *
+ * The caller supplies the nibble mask and the two lookup vectors. The low
+ * and high nibbles of val index maskLo and maskHi through vpshufb, and the
+ * ANDed lookups are finally restricted by p_mask.
+ */
+static really_inline
+m256 prep_conf_fast_teddy_m1(m256 val, m256 mask, m256 maskLo, m256 maskHi,
+                             m256 p_mask) {
+    const m256 loNibbles = and256(val, mask);
+    const m256 hiNibbles = and256(rshift4x64(val, 4), mask);
+    const m256 loLookup = vpshufb(maskLo, loNibbles);
+    const m256 hiLookup = vpshufb(maskHi, hiNibbles);
+    return and256(and256(loLookup, hiLookup), p_mask);
+}
+
+/*
+ * Returns the address immediately after the Teddy header, viewed as an
+ * array of m256 masks.
+ */
+static really_inline
+const m256 * getMaskBase_avx2(const struct Teddy *teddy) {
+    const u8 *afterHeader = (const u8 *)teddy + sizeof(struct Teddy);
+    return (const m256 *)afterHeader;
+}
+
+/*
+ * Returns the address that follows the Teddy header and numMask * 2 blocks
+ * of 32 bytes, viewed as an array of u32.
+ */
+static really_inline
+const u32 * getConfBase_avx2(const struct Teddy *teddy, u8 numMask) {
+    const u8 *afterHeader = (const u8 *)teddy + sizeof(struct Teddy);
+    return (const u32 *)(afterHeader + (numMask*32*2));
+}
+/*
+ * Fat Teddy runtime, one mask, confirming hits with do_confWithBit1_teddy.
+ *
+ * Scans from a->buf + a->start_offset up to a->buf + a->len in four steps:
+ * a masked head block (taken when ptr is below ROUNDUP_PTR(ptr, 16)), at
+ * most one unmasked 16-byte block, a main loop consuming iterBytes (32)
+ * bytes per pass as two 16-byte blocks, and a masked tail loop.
+ *
+ * confBase, a, ptr, control and last_match are referenced by name from the
+ * CONFIRM_FAT_TEDDY expansion. floodBackoff and tryFloodDetect are not used
+ * directly here; presumably CHECK_FLOOD (defined elsewhere) reads them -
+ * TODO confirm. On the fall-through path controlVal is stored back into
+ * *a->groups and HWLM_SUCCESS is returned.
+ */
+hwlm_error_t fdr_exec_teddy_avx2_msks1_fat(const struct FDR *fdr,
+ const struct FDR_Runtime_Args *a) {
+ const u8 *buf_end = a->buf + a->len;
+ const u8 *ptr = a->buf + a->start_offset;
+ hwlmcb_rv_t controlVal = *a->groups;
+ hwlmcb_rv_t *control = &controlVal;
+ u32 floodBackoff = FLOOD_BACKOFF_START;
+ const u8 *tryFloodDetect = a->firstFloodDetect;
+ u32 last_match = (u32)-1;
+ const struct Teddy *teddy = (const struct Teddy *)fdr;
+ const size_t iterBytes = 32;
+ DEBUG_PRINTF("params: buf %p len %zu start_offset %zu\n",
+ a->buf, a->len, a->start_offset);
+
+ const m256 *maskBase = getMaskBase_avx2(teddy);
+ const u32 *confBase = getConfBase_avx2(teddy, 1);
+
+ const u8 *mainStart = ROUNDUP_PTR(ptr, 16);
+ DEBUG_PRINTF("derive: ptr: %p mainstart %p\n", ptr, mainStart);
+ if (ptr < mainStart) { /* head: masked block ending at mainStart */
+ ptr = mainStart - 16;
+ m256 p_mask;
+ m256 val_0 = vectoredLoad2x128(&p_mask, ptr, a->buf, buf_end,
+ a->buf_history, a->len_history, 1);
+ m256 r_0 = prep_conf_fat_teddy_m1(maskBase, p_mask, val_0);
+ CONFIRM_FAT_TEDDY(r_0, 16, 0, VECTORING, do_confWithBit1_teddy);
+ ptr += 16;
+ }
+
+ if (ptr + 16 < buf_end) { /* one unmasked 16-byte block */
+ m256 r_0 = prep_conf_fat_teddy_m1(maskBase, ones256(), load2x128(ptr));
+ CONFIRM_FAT_TEDDY(r_0, 16, 0, VECTORING, do_confWithBit1_teddy);
+ ptr += 16;
+ }
+
+ for ( ; ptr + iterBytes <= buf_end; ptr += iterBytes) { /* main loop */
+ __builtin_prefetch(ptr + (iterBytes*4));
+ CHECK_FLOOD;
+ m256 r_0 = prep_conf_fat_teddy_m1(maskBase, ones256(), load2x128(ptr));
+ CONFIRM_FAT_TEDDY(r_0, 16, 0, NOT_CAUTIOUS, do_confWithBit1_teddy);
+ m256 r_1 = prep_conf_fat_teddy_m1(maskBase, ones256(),
+ load2x128(ptr + 16));
+ CONFIRM_FAT_TEDDY(r_1, 16, 16, NOT_CAUTIOUS, do_confWithBit1_teddy);
+ }
+
+ for (; ptr < buf_end; ptr += 16) { /* masked tail */
+ m256 p_mask;
+ m256 val_0 = vectoredLoad2x128(&p_mask, ptr, a->buf, buf_end,
+ a->buf_history, a->len_history, 1);
+ m256 r_0 = prep_conf_fat_teddy_m1(maskBase, p_mask, val_0);
+ CONFIRM_FAT_TEDDY(r_0, 16, 0, VECTORING, do_confWithBit1_teddy);
+ }
+ *a->groups = controlVal; /* write the local control value back */
+ return HWLM_SUCCESS;
+}
+/*
+ * Fat Teddy runtime, one mask, confirming hits with do_confWithBit_teddy
+ * (the "pck" variant).
+ *
+ * Scans from a->buf + a->start_offset up to a->buf + a->len in four steps:
+ * a masked head block (taken when ptr is below ROUNDUP_PTR(ptr, 16)), at
+ * most one unmasked 16-byte block, a main loop consuming iterBytes (32)
+ * bytes per pass as two 16-byte blocks, and a masked tail loop.
+ *
+ * confBase, a, ptr, control and last_match are referenced by name from the
+ * CONFIRM_FAT_TEDDY expansion. floodBackoff and tryFloodDetect are not used
+ * directly here; presumably CHECK_FLOOD (defined elsewhere) reads them -
+ * TODO confirm. On the fall-through path controlVal is stored back into
+ * *a->groups and HWLM_SUCCESS is returned.
+ */
+hwlm_error_t fdr_exec_teddy_avx2_msks1_pck_fat(const struct FDR *fdr,
+ const struct FDR_Runtime_Args *a) {
+ const u8 *buf_end = a->buf + a->len;
+ const u8 *ptr = a->buf + a->start_offset;
+ hwlmcb_rv_t controlVal = *a->groups;
+ hwlmcb_rv_t *control = &controlVal;
+ u32 floodBackoff = FLOOD_BACKOFF_START;
+ const u8 *tryFloodDetect = a->firstFloodDetect;
+ u32 last_match = (u32)-1;
+ const struct Teddy *teddy = (const struct Teddy *)fdr;
+ const size_t iterBytes = 32;
+ DEBUG_PRINTF("params: buf %p len %zu start_offset %zu\n",
+ a->buf, a->len, a->start_offset);
+
+ const m256 *maskBase = getMaskBase_avx2(teddy);
+ const u32 *confBase = getConfBase_avx2(teddy, 1);
+
+ const u8 *mainStart = ROUNDUP_PTR(ptr, 16);
+ DEBUG_PRINTF("derive: ptr: %p mainstart %p\n", ptr, mainStart);
+ if (ptr < mainStart) { /* head: masked block ending at mainStart */
+ ptr = mainStart - 16;
+ m256 p_mask;
+ m256 val_0 = vectoredLoad2x128(&p_mask, ptr, a->buf, buf_end,
+ a->buf_history, a->len_history, 1);
+ m256 r_0 = prep_conf_fat_teddy_m1(maskBase, p_mask, val_0);
+ CONFIRM_FAT_TEDDY(r_0, 16, 0, VECTORING, do_confWithBit_teddy);
+ ptr += 16;
+ }
+
+ if (ptr + 16 < buf_end) { /* one unmasked 16-byte block */
+ m256 r_0 = prep_conf_fat_teddy_m1(maskBase, ones256(), load2x128(ptr));
+ CONFIRM_FAT_TEDDY(r_0, 16, 0, VECTORING, do_confWithBit_teddy);
+ ptr += 16;
+ }
+
+ for ( ; ptr + iterBytes <= buf_end; ptr += iterBytes) { /* main loop */
+ __builtin_prefetch(ptr + (iterBytes*4));
+ CHECK_FLOOD;
+ m256 r_0 = prep_conf_fat_teddy_m1(maskBase, ones256(), load2x128(ptr));
+ CONFIRM_FAT_TEDDY(r_0, 16, 0, NOT_CAUTIOUS, do_confWithBit_teddy);
+ m256 r_1 = prep_conf_fat_teddy_m1(maskBase, ones256(),
+ load2x128(ptr + 16));
+ CONFIRM_FAT_TEDDY(r_1, 16, 16, NOT_CAUTIOUS, do_confWithBit_teddy);
+ }
+
+ for (; ptr < buf_end; ptr += 16) { /* masked tail */
+ m256 p_mask;
+ m256 val_0 = vectoredLoad2x128(&p_mask, ptr, a->buf, buf_end,
+ a->buf_history, a->len_history, 1);
+ m256 r_0 = prep_conf_fat_teddy_m1(maskBase, p_mask, val_0);
+ CONFIRM_FAT_TEDDY(r_0, 16, 0, VECTORING, do_confWithBit_teddy);
+ }
+ *a->groups = controlVal; /* write the local control value back */
+ return HWLM_SUCCESS;
+}
+/*
+ * Fat Teddy runtime, two masks, confirming hits with
+ * do_confWithBitMany_teddy.
+ *
+ * Scans from a->buf + a->start_offset up to a->buf + a->len in four steps:
+ * a masked head block (taken when ptr is below ROUNDUP_PTR(ptr, 16)), at
+ * most one unmasked 16-byte block, a main loop consuming iterBytes (32)
+ * bytes per pass as two 16-byte blocks, and a masked tail loop.
+ *
+ * res_old_1 starts as ones256() and is passed to every
+ * prep_conf_fat_teddy_m2 call, which reads and then overwrites it.
+ *
+ * confBase, a, ptr, control and last_match are referenced by name from the
+ * CONFIRM_FAT_TEDDY expansion. floodBackoff and tryFloodDetect are not used
+ * directly here; presumably CHECK_FLOOD (defined elsewhere) reads them -
+ * TODO confirm. On the fall-through path controlVal is stored back into
+ * *a->groups and HWLM_SUCCESS is returned.
+ */
+hwlm_error_t fdr_exec_teddy_avx2_msks2_fat(const struct FDR *fdr,
+ const struct FDR_Runtime_Args *a) {
+ const u8 *buf_end = a->buf + a->len;
+ const u8 *ptr = a->buf + a->start_offset;
+ hwlmcb_rv_t controlVal = *a->groups;
+ hwlmcb_rv_t *control = &controlVal;
+ u32 floodBackoff = FLOOD_BACKOFF_START;
+ const u8 *tryFloodDetect = a->firstFloodDetect;
+ u32 last_match = (u32)-1;
+ const struct Teddy *teddy = (const struct Teddy *)fdr;
+ const size_t iterBytes = 32;
+ DEBUG_PRINTF("params: buf %p len %zu start_offset %zu\n",
+ a->buf, a->len, a->start_offset);
+
+ const m256 *maskBase = getMaskBase_avx2(teddy);
+ const u32 *confBase = getConfBase_avx2(teddy, 2);
+
+ m256 res_old_1 = ones256();
+ const u8 *mainStart = ROUNDUP_PTR(ptr, 16);
+ DEBUG_PRINTF("derive: ptr: %p mainstart %p\n", ptr, mainStart);
+ if (ptr < mainStart) { /* head: masked block ending at mainStart */
+ ptr = mainStart - 16;
+ m256 p_mask;
+ m256 val_0 = vectoredLoad2x128(&p_mask, ptr, a->buf, buf_end,
+ a->buf_history, a->len_history, 2);
+ m256 r_0 = prep_conf_fat_teddy_m2(maskBase, &res_old_1, p_mask, val_0);
+ CONFIRM_FAT_TEDDY(r_0, 16, 0, VECTORING, do_confWithBitMany_teddy);
+ ptr += 16;
+ }
+
+ if (ptr + 16 < buf_end) { /* one unmasked 16-byte block */
+ m256 r_0 = prep_conf_fat_teddy_m2(maskBase, &res_old_1, ones256(),
+ load2x128(ptr));
+ CONFIRM_FAT_TEDDY(r_0, 16, 0, VECTORING, do_confWithBitMany_teddy);
+ ptr += 16;
+ }
+
+ for ( ; ptr + iterBytes <= buf_end; ptr += iterBytes) { /* main loop */
+ __builtin_prefetch(ptr + (iterBytes*4));
+ CHECK_FLOOD;
+ m256 r_0 = prep_conf_fat_teddy_m2(maskBase, &res_old_1, ones256(),
+ load2x128(ptr));
+ CONFIRM_FAT_TEDDY(r_0, 16, 0, NOT_CAUTIOUS, do_confWithBitMany_teddy);
+ m256 r_1 = prep_conf_fat_teddy_m2(maskBase, &res_old_1, ones256(),
+ load2x128(ptr + 16));
+ CONFIRM_FAT_TEDDY(r_1, 16, 16, NOT_CAUTIOUS, do_confWithBitMany_teddy);
+ }
+
+ for (; ptr < buf_end; ptr += 16) { /* masked tail */
+ m256 p_mask;
+ m256 val_0 = vectoredLoad2x128(&p_mask, ptr, a->buf, buf_end,
+ a->buf_history, a->len_history, 2);
+ m256 r_0 = prep_conf_fat_teddy_m2(maskBase, &res_old_1, p_mask, val_0);
+ CONFIRM_FAT_TEDDY(r_0, 16, 0, VECTORING, do_confWithBitMany_teddy);
+ }
+ *a->groups = controlVal; /* write the local control value back */
+ return HWLM_SUCCESS;
+}
+/*
+ * Fat Teddy runtime, two masks, confirming hits with do_confWithBit_teddy
+ * (the "pck" variant).
+ *
+ * Scans from a->buf + a->start_offset up to a->buf + a->len in four steps:
+ * a masked head block (taken when ptr is below ROUNDUP_PTR(ptr, 16)), at
+ * most one unmasked 16-byte block, a main loop consuming iterBytes (32)
+ * bytes per pass as two 16-byte blocks, and a masked tail loop.
+ *
+ * res_old_1 starts as ones256() and is passed to every
+ * prep_conf_fat_teddy_m2 call, which reads and then overwrites it.
+ *
+ * confBase, a, ptr, control and last_match are referenced by name from the
+ * CONFIRM_FAT_TEDDY expansion. floodBackoff and tryFloodDetect are not used
+ * directly here; presumably CHECK_FLOOD (defined elsewhere) reads them -
+ * TODO confirm. On the fall-through path controlVal is stored back into
+ * *a->groups and HWLM_SUCCESS is returned.
+ */
+hwlm_error_t fdr_exec_teddy_avx2_msks2_pck_fat(const struct FDR *fdr,
+ const struct FDR_Runtime_Args *a) {
+ const u8 *buf_end = a->buf + a->len;
+ const u8 *ptr = a->buf + a->start_offset;
+ hwlmcb_rv_t controlVal = *a->groups;
+ hwlmcb_rv_t *control = &controlVal;
+ u32 floodBackoff = FLOOD_BACKOFF_START;
+ const u8 *tryFloodDetect = a->firstFloodDetect;
+ u32 last_match = (u32)-1;
+ const struct Teddy *teddy = (const struct Teddy *)fdr;
+ const size_t iterBytes = 32;
+ DEBUG_PRINTF("params: buf %p len %zu start_offset %zu\n",
+ a->buf, a->len, a->start_offset);
+
+ const m256 *maskBase = getMaskBase_avx2(teddy);
+ const u32 *confBase = getConfBase_avx2(teddy, 2);
+
+ m256 res_old_1 = ones256();
+ const u8 *mainStart = ROUNDUP_PTR(ptr, 16);
+ DEBUG_PRINTF("derive: ptr: %p mainstart %p\n", ptr, mainStart);
+ if (ptr < mainStart) { /* head: masked block ending at mainStart */
+ ptr = mainStart - 16;
+ m256 p_mask;
+ m256 val_0 = vectoredLoad2x128(&p_mask, ptr, a->buf, buf_end,
+ a->buf_history, a->len_history, 2);
+ m256 r_0 = prep_conf_fat_teddy_m2(maskBase, &res_old_1, p_mask, val_0);
+ CONFIRM_FAT_TEDDY(r_0, 16, 0, VECTORING, do_confWithBit_teddy);
+ ptr += 16;
+ }
+
+ if (ptr + 16 < buf_end) { /* one unmasked 16-byte block */
+ m256 r_0 = prep_conf_fat_teddy_m2(maskBase, &res_old_1, ones256(),
+ load2x128(ptr));
+ CONFIRM_FAT_TEDDY(r_0, 16, 0, VECTORING, do_confWithBit_teddy);
+ ptr += 16;
+ }
+
+ for ( ; ptr + iterBytes <= buf_end; ptr += iterBytes) { /* main loop */
+ __builtin_prefetch(ptr + (iterBytes*4));
+ CHECK_FLOOD;
+ m256 r_0 = prep_conf_fat_teddy_m2(maskBase, &res_old_1, ones256(),
+ load2x128(ptr));
+ CONFIRM_FAT_TEDDY(r_0, 16, 0, NOT_CAUTIOUS, do_confWithBit_teddy);
+ m256 r_1 = prep_conf_fat_teddy_m2(maskBase, &res_old_1, ones256(),
+ load2x128(ptr + 16));
+ CONFIRM_FAT_TEDDY(r_1, 16, 16, NOT_CAUTIOUS, do_confWithBit_teddy);
+ }
+
+ for (; ptr < buf_end; ptr += 16) { /* masked tail */
+ m256 p_mask;
+ m256 val_0 = vectoredLoad2x128(&p_mask, ptr, a->buf, buf_end,
+ a->buf_history, a->len_history, 2);
+ m256 r_0 = prep_conf_fat_teddy_m2(maskBase, &res_old_1, p_mask, val_0);
+ CONFIRM_FAT_TEDDY(r_0, 16, 0, VECTORING, do_confWithBit_teddy);
+ }
+ *a->groups = controlVal; /* write the local control value back */
+ return HWLM_SUCCESS;
+}
+/*
+ * Fat Teddy runtime, three masks, confirming hits with
+ * do_confWithBitMany_teddy.
+ *
+ * Scans from a->buf + a->start_offset up to a->buf + a->len in four steps:
+ * a masked head block (taken when ptr is below ROUNDUP_PTR(ptr, 16)), at
+ * most one unmasked 16-byte block, a main loop consuming iterBytes (32)
+ * bytes per pass as two 16-byte blocks, and a masked tail loop.
+ *
+ * res_old_1 and res_old_2 start as ones256() and are passed to every
+ * prep_conf_fat_teddy_m3 call, which reads and then overwrites them.
+ *
+ * confBase, a, ptr, control and last_match are referenced by name from the
+ * CONFIRM_FAT_TEDDY expansion. floodBackoff and tryFloodDetect are not used
+ * directly here; presumably CHECK_FLOOD (defined elsewhere) reads them -
+ * TODO confirm. On the fall-through path controlVal is stored back into
+ * *a->groups and HWLM_SUCCESS is returned.
+ */
+hwlm_error_t fdr_exec_teddy_avx2_msks3_fat(const struct FDR *fdr,
+ const struct FDR_Runtime_Args *a) {
+ const u8 *buf_end = a->buf + a->len;
+ const u8 *ptr = a->buf + a->start_offset;
+ hwlmcb_rv_t controlVal = *a->groups;
+ hwlmcb_rv_t *control = &controlVal;
+ u32 floodBackoff = FLOOD_BACKOFF_START;
+ const u8 *tryFloodDetect = a->firstFloodDetect;
+ u32 last_match = (u32)-1;
+ const struct Teddy *teddy = (const struct Teddy *)fdr;
+ const size_t iterBytes = 32;
+ DEBUG_PRINTF("params: buf %p len %zu start_offset %zu\n",
+ a->buf, a->len, a->start_offset);
+
+ const m256 *maskBase = getMaskBase_avx2(teddy);
+ const u32 *confBase = getConfBase_avx2(teddy, 3);
+
+ m256 res_old_1 = ones256();
+ m256 res_old_2 = ones256();
+ const u8 *mainStart = ROUNDUP_PTR(ptr, 16);
+ DEBUG_PRINTF("derive: ptr: %p mainstart %p\n", ptr, mainStart);
+ if (ptr < mainStart) { /* head: masked block ending at mainStart */
+ ptr = mainStart - 16;
+ m256 p_mask;
+ m256 val_0 = vectoredLoad2x128(&p_mask, ptr, a->buf, buf_end,
+ a->buf_history, a->len_history, 3);
+ m256 r_0 = prep_conf_fat_teddy_m3(maskBase, &res_old_1, &res_old_2,
+ p_mask, val_0);
+ CONFIRM_FAT_TEDDY(r_0, 16, 0, VECTORING, do_confWithBitMany_teddy);
+ ptr += 16;
+ }
+
+ if (ptr + 16 < buf_end) { /* one unmasked 16-byte block */
+ m256 r_0 = prep_conf_fat_teddy_m3(maskBase, &res_old_1, &res_old_2,
+ ones256(), load2x128(ptr));
+ CONFIRM_FAT_TEDDY(r_0, 16, 0, VECTORING, do_confWithBitMany_teddy);
+ ptr += 16;
+ }
+
+ for ( ; ptr + iterBytes <= buf_end; ptr += iterBytes) { /* main loop */
+ __builtin_prefetch(ptr + (iterBytes*4));
+ CHECK_FLOOD;
+ m256 r_0 = prep_conf_fat_teddy_m3(maskBase, &res_old_1, &res_old_2,
+ ones256(), load2x128(ptr));
+ CONFIRM_FAT_TEDDY(r_0, 16, 0, NOT_CAUTIOUS, do_confWithBitMany_teddy);
+ m256 r_1 = prep_conf_fat_teddy_m3(maskBase, &res_old_1, &res_old_2,
+ ones256(), load2x128(ptr + 16));
+ CONFIRM_FAT_TEDDY(r_1, 16, 16, NOT_CAUTIOUS, do_confWithBitMany_teddy);
+ }
+
+ for (; ptr < buf_end; ptr += 16) { /* masked tail */
+ m256 p_mask;
+ m256 val_0 = vectoredLoad2x128(&p_mask, ptr, a->buf, buf_end,
+ a->buf_history, a->len_history, 3);
+ m256 r_0 = prep_conf_fat_teddy_m3(maskBase, &res_old_1, &res_old_2,
+ p_mask, val_0);
+ CONFIRM_FAT_TEDDY(r_0, 16, 0, VECTORING, do_confWithBitMany_teddy);
+ }
+ *a->groups = controlVal; /* write the local control value back */
+ return HWLM_SUCCESS;
+}
+/*
+ * Fat Teddy runtime, three masks, confirming hits with do_confWithBit_teddy
+ * (the "pck" variant).
+ *
+ * Scans from a->buf + a->start_offset up to a->buf + a->len in four steps:
+ * a masked head block (taken when ptr is below ROUNDUP_PTR(ptr, 16)), at
+ * most one unmasked 16-byte block, a main loop consuming iterBytes (32)
+ * bytes per pass as two 16-byte blocks, and a masked tail loop.
+ *
+ * res_old_1 and res_old_2 start as ones256() and are passed to every
+ * prep_conf_fat_teddy_m3 call, which reads and then overwrites them.
+ *
+ * confBase, a, ptr, control and last_match are referenced by name from the
+ * CONFIRM_FAT_TEDDY expansion. floodBackoff and tryFloodDetect are not used
+ * directly here; presumably CHECK_FLOOD (defined elsewhere) reads them -
+ * TODO confirm. On the fall-through path controlVal is stored back into
+ * *a->groups and HWLM_SUCCESS is returned.
+ */
+hwlm_error_t fdr_exec_teddy_avx2_msks3_pck_fat(const struct FDR *fdr,
+ const struct FDR_Runtime_Args *a) {
+ const u8 *buf_end = a->buf + a->len;
+ const u8 *ptr = a->buf + a->start_offset;
+ hwlmcb_rv_t controlVal = *a->groups;
+ hwlmcb_rv_t *control = &controlVal;
+ u32 floodBackoff = FLOOD_BACKOFF_START;
+ const u8 *tryFloodDetect = a->firstFloodDetect;
+ u32 last_match = (u32)-1;
+ const struct Teddy *teddy = (const struct Teddy *)fdr;
+ const size_t iterBytes = 32;
+ DEBUG_PRINTF("params: buf %p len %zu start_offset %zu\n",
+ a->buf, a->len, a->start_offset);
+
+ const m256 *maskBase = getMaskBase_avx2(teddy);
+ const u32 *confBase = getConfBase_avx2(teddy, 3);
+
+ m256 res_old_1 = ones256();
+ m256 res_old_2 = ones256();
+ const u8 *mainStart = ROUNDUP_PTR(ptr, 16);
+ DEBUG_PRINTF("derive: ptr: %p mainstart %p\n", ptr, mainStart);
+ if (ptr < mainStart) { /* head: masked block ending at mainStart */
+ ptr = mainStart - 16;
+ m256 p_mask;
+ m256 val_0 = vectoredLoad2x128(&p_mask, ptr, a->buf, buf_end,
+ a->buf_history, a->len_history, 3);
+ m256 r_0 = prep_conf_fat_teddy_m3(maskBase, &res_old_1, &res_old_2,
+ p_mask, val_0);
+ CONFIRM_FAT_TEDDY(r_0, 16, 0, VECTORING, do_confWithBit_teddy);
+ ptr += 16;
+ }
+
+ if (ptr + 16 < buf_end) { /* one unmasked 16-byte block */
+ m256 r_0 = prep_conf_fat_teddy_m3(maskBase, &res_old_1, &res_old_2,
+ ones256(), load2x128(ptr));
+ CONFIRM_FAT_TEDDY(r_0, 16, 0, VECTORING, do_confWithBit_teddy);
+ ptr += 16;
+ }
+
+ for ( ; ptr + iterBytes <= buf_end; ptr += iterBytes) { /* main loop */
+ __builtin_prefetch(ptr + (iterBytes*4));
+ CHECK_FLOOD;
+ m256 r_0 = prep_conf_fat_teddy_m3(maskBase, &res_old_1, &res_old_2,
+ ones256(), load2x128(ptr));
+ CONFIRM_FAT_TEDDY(r_0, 16, 0, NOT_CAUTIOUS, do_confWithBit_teddy);
+ m256 r_1 = prep_conf_fat_teddy_m3(maskBase, &res_old_1, &res_old_2,
+ ones256(), load2x128(ptr + 16));
+ CONFIRM_FAT_TEDDY(r_1, 16, 16, NOT_CAUTIOUS, do_confWithBit_teddy);
+ }
+
+ for (; ptr < buf_end; ptr += 16) { /* masked tail */
+ m256 p_mask;
+ m256 val_0 = vectoredLoad2x128(&p_mask, ptr, a->buf, buf_end,
+ a->buf_history, a->len_history, 3);
+ m256 r_0 = prep_conf_fat_teddy_m3(maskBase, &res_old_1, &res_old_2,
+ p_mask, val_0);
+ CONFIRM_FAT_TEDDY(r_0, 16, 0, VECTORING, do_confWithBit_teddy);
+ }
+ *a->groups = controlVal; /* write the local control value back */
+ return HWLM_SUCCESS;
+}
+/*
+ * Fat Teddy runtime, four masks, confirming hits with
+ * do_confWithBitMany_teddy.
+ *
+ * Scans from a->buf + a->start_offset up to a->buf + a->len in four steps:
+ * a masked head block (taken when ptr is below ROUNDUP_PTR(ptr, 16)), at
+ * most one unmasked 16-byte block, a main loop consuming iterBytes (32)
+ * bytes per pass as two 16-byte blocks, and a masked tail loop.
+ *
+ * res_old_1, res_old_2 and res_old_3 start as ones256() and are passed to
+ * every prep_conf_fat_teddy_m4 call, which reads and then overwrites them.
+ *
+ * confBase, a, ptr, control and last_match are referenced by name from the
+ * CONFIRM_FAT_TEDDY expansion. floodBackoff and tryFloodDetect are not used
+ * directly here; presumably CHECK_FLOOD (defined elsewhere) reads them -
+ * TODO confirm. On the fall-through path controlVal is stored back into
+ * *a->groups and HWLM_SUCCESS is returned.
+ */
+hwlm_error_t fdr_exec_teddy_avx2_msks4_fat(const struct FDR *fdr,
+ const struct FDR_Runtime_Args *a) {
+ const u8 *buf_end = a->buf + a->len;
+ const u8 *ptr = a->buf + a->start_offset;
+ hwlmcb_rv_t controlVal = *a->groups;
+ hwlmcb_rv_t *control = &controlVal;
+ u32 floodBackoff = FLOOD_BACKOFF_START;
+ const u8 *tryFloodDetect = a->firstFloodDetect;
+ u32 last_match = (u32)-1;
+ const struct Teddy *teddy = (const struct Teddy *)fdr;
+ const size_t iterBytes = 32;
+ DEBUG_PRINTF("params: buf %p len %zu start_offset %zu\n",
+ a->buf, a->len, a->start_offset);
+
+ const m256 *maskBase = getMaskBase_avx2(teddy);
+ const u32 *confBase = getConfBase_avx2(teddy, 4);
+
+ m256 res_old_1 = ones256();
+ m256 res_old_2 = ones256();
+ m256 res_old_3 = ones256();
+ const u8 *mainStart = ROUNDUP_PTR(ptr, 16);
+ DEBUG_PRINTF("derive: ptr: %p mainstart %p\n", ptr, mainStart);
+ if (ptr < mainStart) { /* head: masked block ending at mainStart */
+ ptr = mainStart - 16;
+ m256 p_mask;
+ m256 val_0 = vectoredLoad2x128(&p_mask, ptr, a->buf, buf_end,
+ a->buf_history, a->len_history, 4);
+ m256 r_0 = prep_conf_fat_teddy_m4(maskBase, &res_old_1, &res_old_2,
+ &res_old_3, p_mask, val_0);
+ CONFIRM_FAT_TEDDY(r_0, 16, 0, VECTORING, do_confWithBitMany_teddy);
+ ptr += 16;
+ }
+
+ if (ptr + 16 < buf_end) { /* one unmasked 16-byte block */
+ m256 r_0 = prep_conf_fat_teddy_m4(maskBase, &res_old_1, &res_old_2,
+ &res_old_3, ones256(),
+ load2x128(ptr));
+ CONFIRM_FAT_TEDDY(r_0, 16, 0, VECTORING, do_confWithBitMany_teddy);
+ ptr += 16;
+ }
+
+ for ( ; ptr + iterBytes <= buf_end; ptr += iterBytes) { /* main loop */
+ __builtin_prefetch(ptr + (iterBytes*4));
+ CHECK_FLOOD;
+ m256 r_0 = prep_conf_fat_teddy_m4(maskBase, &res_old_1, &res_old_2,
+ &res_old_3, ones256(),
+ load2x128(ptr));
+ CONFIRM_FAT_TEDDY(r_0, 16, 0, NOT_CAUTIOUS, do_confWithBitMany_teddy);
+ m256 r_1 = prep_conf_fat_teddy_m4(maskBase, &res_old_1, &res_old_2,
+ &res_old_3, ones256(),
+ load2x128(ptr + 16));
+ CONFIRM_FAT_TEDDY(r_1, 16, 16, NOT_CAUTIOUS, do_confWithBitMany_teddy);
+ }
+
+ for (; ptr < buf_end; ptr += 16) { /* masked tail */
+ m256 p_mask;
+ m256 val_0 = vectoredLoad2x128(&p_mask, ptr, a->buf, buf_end,
+ a->buf_history, a->len_history, 4);
+ m256 r_0 = prep_conf_fat_teddy_m4(maskBase, &res_old_1, &res_old_2,
+ &res_old_3, p_mask, val_0);
+ CONFIRM_FAT_TEDDY(r_0, 16, 0, VECTORING, do_confWithBitMany_teddy);
+ }
+ *a->groups = controlVal; /* write the local control value back */
+ return HWLM_SUCCESS;
+}
+/*
+ * Fat Teddy runtime, four masks, confirming hits with do_confWithBit_teddy
+ * (the "pck" variant).
+ *
+ * Scans from a->buf + a->start_offset up to a->buf + a->len in four steps:
+ * a masked head block (taken when ptr is below ROUNDUP_PTR(ptr, 16)), at
+ * most one unmasked 16-byte block, a main loop consuming iterBytes (32)
+ * bytes per pass as two 16-byte blocks, and a masked tail loop.
+ *
+ * res_old_1, res_old_2 and res_old_3 start as ones256() and are passed to
+ * every prep_conf_fat_teddy_m4 call, which reads and then overwrites them.
+ *
+ * confBase, a, ptr, control and last_match are referenced by name from the
+ * CONFIRM_FAT_TEDDY expansion. floodBackoff and tryFloodDetect are not used
+ * directly here; presumably CHECK_FLOOD (defined elsewhere) reads them -
+ * TODO confirm. On the fall-through path controlVal is stored back into
+ * *a->groups and HWLM_SUCCESS is returned.
+ */
+hwlm_error_t fdr_exec_teddy_avx2_msks4_pck_fat(const struct FDR *fdr,
+ const struct FDR_Runtime_Args *a) {
+ const u8 *buf_end = a->buf + a->len;
+ const u8 *ptr = a->buf + a->start_offset;
+ hwlmcb_rv_t controlVal = *a->groups;
+ hwlmcb_rv_t *control = &controlVal;
+ u32 floodBackoff = FLOOD_BACKOFF_START;
+ const u8 *tryFloodDetect = a->firstFloodDetect;
+ u32 last_match = (u32)-1;
+ const struct Teddy *teddy = (const struct Teddy *)fdr;
+ const size_t iterBytes = 32;
+ DEBUG_PRINTF("params: buf %p len %zu start_offset %zu\n",
+ a->buf, a->len, a->start_offset);
+
+ const m256 *maskBase = getMaskBase_avx2(teddy);
+ const u32 *confBase = getConfBase_avx2(teddy, 4);
+
+ m256 res_old_1 = ones256();
+ m256 res_old_2 = ones256();
+ m256 res_old_3 = ones256();
+ const u8 *mainStart = ROUNDUP_PTR(ptr, 16);
+ DEBUG_PRINTF("derive: ptr: %p mainstart %p\n", ptr, mainStart);
+ if (ptr < mainStart) { /* head: masked block ending at mainStart */
+ ptr = mainStart - 16;
+ m256 p_mask;
+ m256 val_0 = vectoredLoad2x128(&p_mask, ptr, a->buf, buf_end,
+ a->buf_history, a->len_history, 4);
+ m256 r_0 = prep_conf_fat_teddy_m4(maskBase, &res_old_1, &res_old_2,
+ &res_old_3, p_mask, val_0);
+ CONFIRM_FAT_TEDDY(r_0, 16, 0, VECTORING, do_confWithBit_teddy);
+ ptr += 16;
+ }
+
+ if (ptr + 16 < buf_end) { /* one unmasked 16-byte block */
+ m256 r_0 = prep_conf_fat_teddy_m4(maskBase, &res_old_1, &res_old_2,
+ &res_old_3, ones256(),
+ load2x128(ptr));
+ CONFIRM_FAT_TEDDY(r_0, 16, 0, VECTORING, do_confWithBit_teddy);
+ ptr += 16;
+ }
+
+ for ( ; ptr + iterBytes <= buf_end; ptr += iterBytes) { /* main loop */
+ __builtin_prefetch(ptr + (iterBytes*4));
+ CHECK_FLOOD;
+ m256 r_0 = prep_conf_fat_teddy_m4(maskBase, &res_old_1, &res_old_2,
+ &res_old_3, ones256(),
+ load2x128(ptr));
+ CONFIRM_FAT_TEDDY(r_0, 16, 0, NOT_CAUTIOUS, do_confWithBit_teddy);
+ m256 r_1 = prep_conf_fat_teddy_m4(maskBase, &res_old_1, &res_old_2,
+ &res_old_3, ones256(),
+ load2x128(ptr + 16));
+ CONFIRM_FAT_TEDDY(r_1, 16, 16, NOT_CAUTIOUS, do_confWithBit_teddy);
+ }
+
+ for (; ptr < buf_end; ptr += 16) { /* masked tail */
+ m256 p_mask;
+ m256 val_0 = vectoredLoad2x128(&p_mask, ptr, a->buf, buf_end,
+ a->buf_history, a->len_history, 4);
+ m256 r_0 = prep_conf_fat_teddy_m4(maskBase, &res_old_1, &res_old_2,
+ &res_old_3, p_mask, val_0);
+ CONFIRM_FAT_TEDDY(r_0, 16, 0, VECTORING, do_confWithBit_teddy);
+ }
+ *a->groups = controlVal; /* write the local control value back */
+ return HWLM_SUCCESS;
+}
+/*
+ * Fast Teddy runtime, one mask, confirming hits with
+ * do_confWithBit1_fast_teddy.
+ *
+ * Unlike the fat variants this one reads the m128 masks through
+ * getMaskBase / getConfBase and widens maskBase[0] and maskBase[1] to m256
+ * with set2x128. It works on 32-byte blocks: a masked head block (taken
+ * when ptr is below ROUNDUP_PTR(ptr, 32)), at most one unmasked 32-byte
+ * block, a main loop consuming iterBytes (64) bytes per pass as two 32-byte
+ * loads, and a masked tail loop. The masked loads pass
+ * a->buf + a->start_offset as the lower bound to vectoredLoad256.
+ *
+ * bitArr, confBase, a, ptr, control and last_match are referenced by name
+ * from the CONFIRM_FAST_TEDDY expansion. floodBackoff and tryFloodDetect
+ * are not used directly here; presumably CHECK_FLOOD (defined elsewhere)
+ * reads them - TODO confirm. On the fall-through path controlVal is stored
+ * back into *a->groups and HWLM_SUCCESS is returned.
+ */
+hwlm_error_t fdr_exec_teddy_avx2_msks1_fast(const struct FDR *fdr,
+ const struct FDR_Runtime_Args *a) {
+ const u8 *buf_end = a->buf + a->len;
+ const u8 *ptr = a->buf + a->start_offset;
+ hwlmcb_rv_t controlVal = *a->groups;
+ hwlmcb_rv_t *control = &controlVal;
+ u32 floodBackoff = FLOOD_BACKOFF_START;
+ const u8 *tryFloodDetect = a->firstFloodDetect;
+ u32 last_match = (u32)-1;
+ const struct Teddy *teddy = (const struct Teddy *)fdr;
+ const size_t iterBytes = 64;
+ DEBUG_PRINTF("params: buf %p len %zu start_offset %zu\n",
+ a->buf, a->len, a->start_offset);
+
+ const m128 *maskBase = getMaskBase(teddy);
+ const u32 *confBase = getConfBase(teddy, 1);
+
+ const m256 maskLo = set2x128(maskBase[0]);
+ const m256 maskHi = set2x128(maskBase[1]);
+ const m256 mask = set32x8(0xf);
+ u16 bitArr[512]; /* scratch list filled inside CONFIRM_FAST_TEDDY */
+
+ const u8 *mainStart = ROUNDUP_PTR(ptr, 32);
+ DEBUG_PRINTF("derive: ptr: %p mainstart %p\n", ptr, mainStart);
+ if (ptr < mainStart) { /* head: masked block ending at mainStart */
+ ptr = mainStart - 32;
+ m256 p_mask;
+ m256 val_0 = vectoredLoad256(&p_mask, ptr, a->buf + a->start_offset,
+ buf_end, a->buf_history, a->len_history);
+ m256 res_0 = prep_conf_fast_teddy_m1(val_0, mask, maskLo, maskHi,
+ p_mask);
+ CONFIRM_FAST_TEDDY(res_0, 0, VECTORING, do_confWithBit1_fast_teddy);
+ ptr += 32;
+ }
+
+ if (ptr + 32 < buf_end) { /* one unmasked 32-byte block */
+ m256 val_0 = load256(ptr + 0);
+ m256 res_0 = prep_conf_fast_teddy_m1(val_0, mask, maskLo, maskHi,
+ ones256());
+ CONFIRM_FAST_TEDDY(res_0, 0, VECTORING, do_confWithBit1_fast_teddy);
+ ptr += 32;
+ }
+
+ for ( ; ptr + iterBytes <= buf_end; ptr += iterBytes) { /* main loop */
+ __builtin_prefetch(ptr + (iterBytes*4));
+ CHECK_FLOOD;
+
+ m256 val_0 = load256(ptr + 0);
+ m256 res_0 = prep_conf_fast_teddy_m1(val_0, mask, maskLo, maskHi,
+ ones256());
+ CONFIRM_FAST_TEDDY(res_0, 0, NOT_CAUTIOUS, do_confWithBit1_fast_teddy);
+
+ m256 val_1 = load256(ptr + 32);
+ m256 res_1 = prep_conf_fast_teddy_m1(val_1, mask, maskLo, maskHi,
+ ones256());
+ CONFIRM_FAST_TEDDY(res_1, 4, NOT_CAUTIOUS, do_confWithBit1_fast_teddy);
+ }
+
+ for (; ptr < buf_end; ptr += 32) { /* masked tail */
+ m256 p_mask;
+ m256 val_0 = vectoredLoad256(&p_mask, ptr, a->buf + a->start_offset,
+ buf_end, a->buf_history, a->len_history);
+ m256 res_0 = prep_conf_fast_teddy_m1(val_0, mask, maskLo, maskHi,
+ p_mask);
+ CONFIRM_FAST_TEDDY(res_0, 0, VECTORING, do_confWithBit1_fast_teddy);
+ }
+ *a->groups = controlVal; /* write the local control value back */
+ return HWLM_SUCCESS;
+}
+
+/**
+ * \brief AVX2 "fast" Teddy scan with a single mask, confirming candidates
+ * through do_confWithBit_fast_teddy.
+ *
+ * The scanning structure matches fdr_exec_teddy_avx2_msks1_fast; only the
+ * confirm function handed to CONFIRM_FAST_TEDDY differs.
+ *
+ * Returns HWLM_SUCCESS after writing the final group mask back to *a->groups.
+ * CHECK_FLOOD can return HWLM_TERMINATED early (it also stores the groups).
+ * NOTE(review): CONFIRM_FAST_TEDDY is defined outside this view and may
+ * contain its own early return - confirm.
+ */
+hwlm_error_t fdr_exec_teddy_avx2_msks1_pck_fast(const struct FDR *fdr,
+ const struct FDR_Runtime_Args *a) {
+ const u8 *buf_end = a->buf + a->len;
+ const u8 *ptr = a->buf + a->start_offset;
+ // Work on a local copy of the group mask; it is stored back on exit.
+ hwlmcb_rv_t controlVal = *a->groups;
+ hwlmcb_rv_t *control = &controlVal;
+ // floodBackoff, tryFloodDetect and iterBytes are consumed by CHECK_FLOOD.
+ u32 floodBackoff = FLOOD_BACKOFF_START;
+ const u8 *tryFloodDetect = a->firstFloodDetect;
+ u32 last_match = (u32)-1;
+ const struct Teddy *teddy = (const struct Teddy *)fdr;
+ const size_t iterBytes = 64;
+ DEBUG_PRINTF("params: buf %p len %zu start_offset %zu\n",
+ a->buf, a->len, a->start_offset);
+
+ const m128 *maskBase = getMaskBase(teddy);
+ const u32 *confBase = getConfBase(teddy, 1);
+
+ // maskBase[0] and maskBase[1] are widened to 256 bits via set2x128.
+ const m256 maskLo = set2x128(maskBase[0]);
+ const m256 maskHi = set2x128(maskBase[1]);
+ const m256 mask = set32x8(0xf);
+ // Not referenced directly in this function.
+ // NOTE(review): presumably scratch space for CONFIRM_FAST_TEDDY - confirm.
+ u16 bitArr[512];
+
+ // Head: if ptr is not 32-byte aligned, step back to the preceding aligned
+ // address and use vectoredLoad256, which also produces the mask p_mask.
+ const u8 *mainStart = ROUNDUP_PTR(ptr, 32);
+ DEBUG_PRINTF("derive: ptr: %p mainstart %p\n", ptr, mainStart);
+ if (ptr < mainStart) {
+ ptr = mainStart - 32;
+ m256 p_mask;
+ m256 val_0 = vectoredLoad256(&p_mask, ptr, a->buf + a->start_offset,
+ buf_end, a->buf_history, a->len_history);
+ m256 res_0 = prep_conf_fast_teddy_m1(val_0, mask, maskLo, maskHi,
+ p_mask);
+ CONFIRM_FAST_TEDDY(res_0, 0, VECTORING, do_confWithBit_fast_teddy);
+ ptr += 32;
+ }
+
+ // One full 32-byte block loaded directly (all-ones mask) but still
+ // confirmed with the VECTORING reason.
+ if (ptr + 32 < buf_end) {
+ m256 val_0 = load256(ptr + 0);
+ m256 res_0 = prep_conf_fast_teddy_m1(val_0, mask, maskLo, maskHi,
+ ones256());
+ CONFIRM_FAST_TEDDY(res_0, 0, VECTORING, do_confWithBit_fast_teddy);
+ ptr += 32;
+ }
+
+ // Main loop: two 32-byte blocks (iterBytes == 64) per iteration, with a
+ // prefetch four iterations ahead and a flood check at the top.
+ for ( ; ptr + iterBytes <= buf_end; ptr += iterBytes) {
+ __builtin_prefetch(ptr + (iterBytes*4));
+ CHECK_FLOOD;
+
+ m256 val_0 = load256(ptr + 0);
+ m256 res_0 = prep_conf_fast_teddy_m1(val_0, mask, maskLo, maskHi,
+ ones256());
+ CONFIRM_FAST_TEDDY(res_0, 0, NOT_CAUTIOUS, do_confWithBit_fast_teddy);
+
+ // The second block is confirmed with offset argument 4; its unit is
+ // defined by CONFIRM_FAST_TEDDY, which is outside this view.
+ m256 val_1 = load256(ptr + 32);
+ m256 res_1 = prep_conf_fast_teddy_m1(val_1, mask, maskLo, maskHi,
+ ones256());
+ CONFIRM_FAST_TEDDY(res_1, 4, NOT_CAUTIOUS, do_confWithBit_fast_teddy);
+ }
+
+ // Tail: remaining data, 32 bytes at a time, through vectoredLoad256.
+ for (; ptr < buf_end; ptr += 32) {
+ m256 p_mask;
+ m256 val_0 = vectoredLoad256(&p_mask, ptr, a->buf + a->start_offset,
+ buf_end, a->buf_history, a->len_history);
+ m256 res_0 = prep_conf_fast_teddy_m1(val_0, mask, maskLo, maskHi,
+ p_mask);
+ CONFIRM_FAST_TEDDY(res_0, 0, VECTORING, do_confWithBit_fast_teddy);
+ }
+ *a->groups = controlVal;
+ return HWLM_SUCCESS;
+}
+
+#endif // __AVX2__
return false;
}
-#include "teddy_autogen_compiler.cpp"
+/**
+ * \brief Replace the contents of \p out with one TeddyEngineDescription per
+ * entry of the built-in engine definition table.
+ *
+ * Entries 1-10 carry the HS_CPU_FEATURES_AVX2 flag; entries 11-18 carry no
+ * CPU feature flags.
+ * NOTE(review): the meaning of each column is fixed by TeddyEngineDef, which
+ * is declared outside this view - confirm against that struct.
+ */
+void getTeddyDescriptions(vector<TeddyEngineDescription> *out) {
+    static const TeddyEngineDef defns[] = {
+        {  1, HS_CPU_FEATURES_AVX2, 1,  8, false, 0,  1 },
+        {  2, HS_CPU_FEATURES_AVX2, 1,  8, true,  0, 32 },
+        {  3, HS_CPU_FEATURES_AVX2, 1, 16, false, 0,  1 },
+        {  4, HS_CPU_FEATURES_AVX2, 1, 16, true,  0, 32 },
+        {  5, HS_CPU_FEATURES_AVX2, 2, 16, false, 0,  1 },
+        {  6, HS_CPU_FEATURES_AVX2, 2, 16, true,  0, 32 },
+        {  7, HS_CPU_FEATURES_AVX2, 3, 16, false, 0,  1 },
+        {  8, HS_CPU_FEATURES_AVX2, 3, 16, true,  0, 32 },
+        {  9, HS_CPU_FEATURES_AVX2, 4, 16, false, 0,  1 },
+        { 10, HS_CPU_FEATURES_AVX2, 4, 16, true,  0, 32 },
+        { 11, 0,                    1,  8, false, 0,  1 },
+        { 12, 0,                    1,  8, true,  0, 32 },
+        { 13, 0,                    2,  8, false, 0,  1 },
+        { 14, 0,                    2,  8, true,  0, 32 },
+        { 15, 0,                    3,  8, false, 0,  1 },
+        { 16, 0,                    3,  8, true,  0, 32 },
+        { 17, 0,                    4,  8, false, 0,  1 },
+        { 18, 0,                    4,  8, true,  0, 32 },
+    };
+    const size_t numDefns = sizeof(defns) / sizeof(defns[0]);
+
+    out->clear();
+    for (size_t i = 0; i < numDefns; i++) {
+        out->emplace_back(defns[i]);
+    }
+}
static
size_t maxFloodTailLen(const vector<hwlmLiteral> &vl) {
--- /dev/null
+/*
+ * Copyright (c) 2016, Intel Corporation
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *
+ * * Redistributions of source code must retain the above copyright notice,
+ * this list of conditions and the following disclaimer.
+ * * Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ * * Neither the name of Intel Corporation nor the names of its contributors
+ * may be used to endorse or promote products derived from this software
+ * without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
+ * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+ * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+ * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
+ * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
+ * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+ * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
+ * POSSIBILITY OF SUCH DAMAGE.
+ */
+
+/** \file
+ * \brief Teddy literal matcher: common runtime procedures.
+ */
+
+#ifndef TEDDY_RUNTIME_COMMON_H_
+#define TEDDY_RUNTIME_COMMON_H_
+
+#include "fdr_confirm.h"
+#include "fdr_confirm_runtime.h"
+#include "ue2common.h"
+#include "util/bitutils.h"
+#include "util/simd_utils.h"
+
+// Mask lookup table, defined elsewhere. vectoredLoad128 below selects a row
+// by the number of valid buffer bytes (0..16) and loads 16 bytes starting at
+// a column offset inside that 32-byte row.
+extern const u8 ALIGN_DIRECTIVE p_mask_arr[17][32];
+
+// Word type handed to the do_confWith*_teddy helpers, together with the
+// matching find-and-clear-lowest-set-bit routine: 64-bit when ARCH_64_BIT is
+// defined, 32-bit otherwise.
+#ifdef ARCH_64_BIT
+#define TEDDY_CONF_TYPE u64a
+#define TEDDY_FIND_AND_CLEAR_LSB(conf) findAndClearLSB_64(conf)
+#else
+#define TEDDY_CONF_TYPE u32
+#define TEDDY_FIND_AND_CLEAR_LSB(conf) findAndClearLSB_32(conf)
+#endif
+
+/*
+ * Leave the enclosing scan function once a callback has asked matching to
+ * stop: store the local group mask back to *a->groups and return
+ * HWLM_TERMINATED.
+ *
+ * Expects `controlVal` and `a` to be in scope at the expansion site.
+ *
+ * The definition deliberately has no trailing semicolon, so that
+ * `CHECK_HWLM_TERMINATE_MATCHING;` expands to exactly one statement and is
+ * safe inside an unbraced if/else. Call sites supply the semicolon.
+ */
+#define CHECK_HWLM_TERMINATE_MATCHING                                       \
+do {                                                                        \
+    if (unlikely(controlVal == HWLM_TERMINATE_MATCHING)) {                  \
+        *a->groups = controlVal;                                            \
+        return HWLM_TERMINATED;                                             \
+    }                                                                       \
+} while (0)
+
+/*
+ * Run flood detection once the scan pointer has passed the next checkpoint.
+ *
+ * When ptr > tryFloodDetect, floodDetect() is called; it receives &ptr,
+ * &floodBackoff and &controlVal and returns the next checkpoint. Afterwards
+ * the enclosing function returns early if matching was terminated.
+ *
+ * Expects `ptr`, `tryFloodDetect`, `fdr`, `a`, `floodBackoff`, `controlVal`
+ * and `iterBytes` to be in scope at the expansion site.
+ *
+ * The definition deliberately has no trailing semicolon, so that
+ * `CHECK_FLOOD;` expands to exactly one statement and is safe inside an
+ * unbraced if/else. Call sites supply the semicolon.
+ */
+#define CHECK_FLOOD                                                         \
+do {                                                                        \
+    if (unlikely(ptr > tryFloodDetect)) {                                   \
+        tryFloodDetect = floodDetect(fdr, a, &ptr, tryFloodDetect,          \
+                                     &floodBackoff, &controlVal,            \
+                                     iterBytes);                            \
+        CHECK_HWLM_TERMINATE_MATCHING;                                      \
+    }                                                                       \
+} while (0)
+
+/**
+ * \brief Copy \p len bytes (0 to 15) from \p src to \p dst using fixed-width
+ * unaligned loads and stores.
+ *
+ * Exists so that compilers do not turn the small copy performed by
+ * vectoredLoad128 into a call to memcpy. Lengths that are not an exact
+ * load width are covered by two overlapping accesses, tail first.
+ */
+static really_inline
+void copyRuntBlock128(u8 *dst, const u8 *src, size_t len) {
+    if (len == 0) {
+        return;
+    }
+
+    if (len == 1) {
+        *dst = *src;
+        return;
+    }
+
+    if (len < 4) {
+        /* Two bytes, plus a single trailing byte when len is 3. */
+        unaligned_store_u16(dst, unaligned_load_u16(src));
+        if (len == 3) {
+            dst[2] = src[2];
+        }
+        return;
+    }
+
+    if (len == 4) {
+        unaligned_store_u32(dst, unaligned_load_u32(src));
+        return;
+    }
+
+    if (len < 8) {
+        /* 5 to 7 bytes: two overlapping 4-byte chunks. */
+        const size_t tail = len - 4;
+        unaligned_store_u32(dst + tail, unaligned_load_u32(src + tail));
+        unaligned_store_u32(dst, unaligned_load_u32(src));
+        return;
+    }
+
+    if (len == 8) {
+        unaligned_store_u64a(dst, unaligned_load_u64a(src));
+        return;
+    }
+
+    /* 9 to 15 bytes: two overlapping 8-byte chunks. */
+    assert(len < 16);
+    const size_t tail = len - 8;
+    unaligned_store_u64a(dst + tail, unaligned_load_u64a(src + tail));
+    unaligned_store_u64a(dst, unaligned_load_u64a(src));
+}
+
+/**
+ * \brief Load 16 bytes at \p ptr when part of that window may lie outside
+ * the buffer [\p lo, \p hi).
+ *
+ * \p p_mask is an output parameter: it is always written with a mask taken
+ * from p_mask_arr. Bytes that are filled from neither the buffer nor the
+ * history stay zero in the returned value.
+ *
+ * - ptr >= lo with at least 16 bytes left: a direct unaligned load.
+ * - ptr >= lo with fewer than 16 bytes left: only the available bytes are
+ *   copied.
+ * - ptr < lo: up to MIN(len_history, nMasks - 1) bytes just before \p lo are
+ *   taken from the end of \p buf_history, followed by the buffer bytes that
+ *   fit in the window.
+ */
+static really_inline
+m128 vectoredLoad128(m128 *p_mask, const u8 *ptr, const u8 *lo, const u8 *hi,
+                     const u8 *buf_history, size_t len_history,
+                     const u32 nMasks) {
+    union {
+        u8 val8[16];
+        m128 val128;
+    } block;
+    block.val128 = zeroes128();
+
+    uintptr_t runt_off;
+    uintptr_t runt_len;
+
+    if (ptr < lo) {
+        // Window starts before the buffer: `gap` leading bytes are outside.
+        uintptr_t gap = (uintptr_t)(lo - ptr);
+        uintptr_t from_hist = MIN(gap, MIN(len_history, nMasks - 1));
+        uintptr_t pos = gap - from_hist;
+        while (ptr + pos < lo) {
+            block.val8[pos] = buf_history[len_history - (lo - (ptr + pos))];
+            pos++;
+        }
+        uintptr_t stop = MIN(16, (uintptr_t)(hi - ptr));
+        *p_mask = loadu128(p_mask_arr[stop - gap] + 16 - gap);
+        runt_off = pos;
+        runt_len = stop - pos;
+    } else {
+        uintptr_t remaining = (uintptr_t)(hi - ptr);
+        if (remaining >= 16) {
+            // Whole window is inside the buffer.
+            *p_mask = load128(p_mask_arr[16] + 16);
+            return loadu128(ptr);
+        }
+        *p_mask = load128(p_mask_arr[remaining] + 16);
+        runt_off = 0;
+        runt_len = remaining;
+    }
+
+    // Partial block taken from the buffer itself.
+    copyRuntBlock128(&block.val8[runt_off], &ptr[runt_off], runt_len);
+
+    return block.val128;
+}
+
+/**
+ * \brief Build the 64-bit confirm value for a candidate at \p ptr + \p byte.
+ *
+ * The value is read starting 7 bytes before the candidate position. When
+ * that start lies before a->buf and \p reason is not NOT_CAUTIOUS, the bytes
+ * read by lv_u64a_ce are OR-ed with a->histBytes shifted right by
+ * 64 - 8 * (number of bytes before the buffer).
+ */
+static really_inline
+u64a getConfVal(const struct FDR_Runtime_Args *a, const u8 *ptr, u32 byte,
+                CautionReason reason) {
+    const u8 *bufStart = a->buf;
+    const u8 *bufEnd = bufStart + a->len;
+    const u8 *confirm_loc = ptr + byte - 7;
+
+    if (likely(reason == NOT_CAUTIOUS || confirm_loc >= bufStart)) {
+        return lv_u64a(confirm_loc, bufStart, bufEnd);
+    }
+
+    // reason == VECTORING and confirm_loc < buf: stitch in the history.
+    u64a history = a->histBytes;
+    u64a fromBuf = lv_u64a_ce(confirm_loc, bufStart, bufEnd);
+    u32 overhang = bufStart - confirm_loc;
+    history >>= 64 - (overhang * 8);
+    return fromBuf | history;
+}
+
+/**
+ * \brief Drain the set bits of \p *conf, confirming each via confWithBit.
+ *
+ * Each bit gives a byte position (bit / bucket + offset) and a bucket index
+ * (bit % bucket). The confBase entry is selected by the low five bits of the
+ * byte at that position together with the bucket index. Entries that are
+ * zero, or whose groups do not overlap \p *control, are skipped.
+ *
+ * \p *conf must be non-zero on entry: the first bit is extracted before the
+ * loop condition is tested.
+ */
+static really_inline
+void do_confWithBit_teddy(TEDDY_CONF_TYPE *conf, u8 bucket, u8 offset,
+                          const u32 *confBase, CautionReason reason,
+                          const struct FDR_Runtime_Args *a, const u8 *ptr,
+                          hwlmcb_rv_t *control, u32 *last_match) {
+    do {
+        const u32 hit = TEDDY_FIND_AND_CLEAR_LSB(conf);
+        const u32 pos = hit / bucket + offset;
+        const u32 slot = hit % bucket;
+        const u32 split = ptr[pos] & 0x1f;
+        const u32 confOffset = confBase[split * bucket + slot];
+        if (confOffset) {
+            const struct FDRConfirm *fdrc =
+                (const struct FDRConfirm *)((const u8 *)confBase + confOffset);
+            if (fdrc->groups & *control) {
+                u64a confVal = getConfVal(a, ptr, pos, reason);
+                confWithBit(fdrc, a, ptr - a->buf + pos, 0, control,
+                            last_match, confVal);
+            }
+        }
+    } while (unlikely(*conf));
+}
+
+/**
+ * \brief Drain the set bits of \p *conf, confirming each via confWithBit1.
+ *
+ * Each bit gives a byte position (bit / bucket + offset); the confBase entry
+ * is selected by bit % bucket alone. The entry is not tested for zero before
+ * use. Confirms whose groups do not overlap \p *control are skipped.
+ *
+ * \p *conf must be non-zero on entry: the first bit is extracted before the
+ * loop condition is tested.
+ */
+static really_inline
+void do_confWithBit1_teddy(TEDDY_CONF_TYPE *conf, u8 bucket, u8 offset,
+                           const u32 *confBase, CautionReason reason,
+                           const struct FDR_Runtime_Args *a, const u8 *ptr,
+                           hwlmcb_rv_t *control, u32 *last_match) {
+    do {
+        const u32 hit = TEDDY_FIND_AND_CLEAR_LSB(conf);
+        const u32 pos = hit / bucket + offset;
+        const u32 confOffset = confBase[hit % bucket];
+        const struct FDRConfirm *fdrc =
+            (const struct FDRConfirm *)((const u8 *)confBase + confOffset);
+        if (fdrc->groups & *control) {
+            u64a confVal = getConfVal(a, ptr, pos, reason);
+            confWithBit1(fdrc, a, ptr - a->buf + pos, control, last_match,
+                         confVal);
+        }
+    } while (unlikely(*conf));
+}
+
+/**
+ * \brief Drain the set bits of \p *conf, confirming each via confWithBitMany.
+ *
+ * Each bit gives a byte position (bit / bucket + offset); the confBase entry
+ * is selected by bit % bucket alone. The entry is not tested for zero before
+ * use. Confirms whose groups do not overlap \p *control are skipped. Unlike
+ * the other helpers, \p reason is also forwarded to the confirm call.
+ *
+ * \p *conf must be non-zero on entry: the first bit is extracted before the
+ * loop condition is tested.
+ */
+static really_inline
+void do_confWithBitMany_teddy(TEDDY_CONF_TYPE *conf, u8 bucket, u8 offset,
+                              const u32 *confBase, CautionReason reason,
+                              const struct FDR_Runtime_Args *a, const u8 *ptr,
+                              hwlmcb_rv_t *control, u32 *last_match) {
+    do {
+        const u32 hit = TEDDY_FIND_AND_CLEAR_LSB(conf);
+        const u32 pos = hit / bucket + offset;
+        const u32 confOffset = confBase[hit % bucket];
+        const struct FDRConfirm *fdrc =
+            (const struct FDRConfirm *)((const u8 *)confBase + confOffset);
+        if (fdrc->groups & *control) {
+            u64a confVal = getConfVal(a, ptr, pos, reason);
+            confWithBitMany(fdrc, a, ptr - a->buf + pos, reason, control,
+                            last_match, confVal);
+        }
+    } while (unlikely(*conf));
+}
+
+/** \brief Address of the m128 data placed directly after the Teddy header. */
+static really_inline
+const m128 * getMaskBase(const struct Teddy *teddy) {
+    const u8 *afterHeader = (const u8 *)teddy + sizeof(struct Teddy);
+    return (const m128 *)afterHeader;
+}
+
+/**
+ * \brief Address of the u32 confirm table, located 32 * \p numMask bytes
+ * past the end of the Teddy header.
+ */
+static really_inline
+const u32 * getConfBase(const struct Teddy *teddy, u8 numMask) {
+    const u8 *afterHeader = (const u8 *)teddy + sizeof(struct Teddy);
+    return (const u32 *)(afterHeader + (numMask * 32));
+}
+
+#endif /* TEDDY_RUNTIME_COMMON_H_ */