// FDRConfirm: per-bucket confirm header — multiplicative-hash parameters
// (andmsk/mult), hash width and group mask. This hunk deletes the old
// 'confirmless' sole-literal machinery: nBitsOrSoleID/flags and the three
// soleLit* fields are removed, and the hash width is stored directly.
struct FDRConfirm {
CONF_TYPE andmsk;
CONF_TYPE mult;
- u32 nBitsOrSoleID; // if flags is NO_CONFIRM then this is soleID
- u32 flags; // sole meaning is 'non-zero means no-confirm' (that is all)
+ u32 nBits; // number of bits used by the confirm hash
hwlm_group_t groups;
- u32 soleLitSize;
- u32 soleLitCmp;
- u32 soleLitMsk;
};
// getFDRConfirm: builds the confirm structure for one bucket's literals.
// This hunk drops the make_confirm parameter and the whole 'confirmless'
// fast path (FDRC_FLAG_NO_CONFIRM + soleLit* precomputation): with those
// fields gone from FDRConfirm, every bucket now goes through the full
// hashed confirmation procedure.
// NOTE(review): excerpt only — fdrc allocation and the computation of
// andmsk/nBits/gm between these hunks are elided from this view.
static really_inline
static
bytecode_ptr<FDRConfirm> getFDRConfirm(const vector<hwlmLiteral> &lits,
- bool make_small, bool make_confirm) {
+ bool make_small) {
// Every literal must fit within CONF_TYPE.
assert(all_of_in(lits, [](const hwlmLiteral &lit) {
return lit.s.size() <= sizeof(CONF_TYPE);
}
CONF_TYPE mult = (CONF_TYPE)0x0b4e0ef37bc32127ULL;
- u32 flags = 0;
- // we use next three variables for 'confirmless' case to speed-up
- // confirmation process
- u32 soleLitSize = 0;
- u32 soleLitCmp = 0;
- u32 soleLitMsk = 0;
-
- if (!make_confirm) {
- flags = FDRC_FLAG_NO_CONFIRM;
- if (lits[0].noruns) {
- // messy - need to clean this up later as flags is sorta kinda
- // obsoleted
- flags |= FDRC_FLAG_NOREPEAT;
- }
- mult = 0;
- soleLitSize = lits[0].s.size() - 1;
- // we can get to this point only in confirmless case;
- // it means that we have only one literal per FDRConfirm (no packing),
- // with no literal mask and size of literal is less or equal
- // to the number of masks of Teddy engine;
- // maximum number of masks for Teddy is 4, so the size of
- // literal is definitely less or equal to size of u32
- assert(lits[0].s.size() <= sizeof(u32));
- for (u32 i = 0; i < lits[0].s.size(); i++) {
- u32 shiftLoc = (sizeof(u32) - i - 1) * 8;
- u8 c = lits[0].s[lits[0].s.size() - i - 1];
- if (lits[0].nocase && ourisalpha(c)) {
- soleLitCmp |= (u32)(c & CASE_CLEAR) << shiftLoc;
- soleLitMsk |= (u32)CASE_CLEAR << shiftLoc;
- }
- else {
- soleLitCmp |= (u32)c << shiftLoc;
- soleLitMsk |= (u32)0xff << shiftLoc;
- }
- }
- }
// we can walk the vector and assign elements from the vectors to a
// map by hash value
fdrc->andmsk = andmsk;
fdrc->mult = mult;
- fdrc->nBitsOrSoleID = (flags & FDRC_FLAG_NO_CONFIRM) ? lits[0].id : nBits;
- fdrc->flags = flags;
- fdrc->soleLitSize = soleLitSize;
- fdrc->soleLitCmp = soleLitCmp;
- fdrc->soleLitMsk = soleLitMsk;
+ fdrc->nBits = nBits;
fdrc->groups = gm;
// Fragment of the per-bucket confirm-building driver (signature truncated
// above): builds an FDRConfirm per bucket and accumulates total size.
// This hunk removes the Teddy needConfirm() probe — make_confirm no
// longer exists, so getFDRConfirm takes only (lits, make_small).
// NOTE(review): teddyDescr may now be unused in this function if
// needConfirm() was its only consumer — confirm against the full file.
const EngineDescription &eng,
map<BucketIndex, vector<LiteralIndex>> &bucketToLits,
bool make_small) {
- bool makeConfirm = true;
unique_ptr<TeddyEngineDescription> teddyDescr =
getTeddyDescription(eng.getID());
- if (teddyDescr) {
- makeConfirm = teddyDescr->needConfirm(lits);
- }
BC2CONF bc2Conf;
u32 totalConfirmSize = 0;
}
DEBUG_PRINTF("b %d sz %zu\n", b, vl.size());
- auto fc = getFDRConfirm(vl, make_small, makeConfirm);
+ auto fc = getFDRConfirm(vl, make_small);
totalConfirmSize += fc.size();
bc2Conf.emplace(b, move(fc));
}
// Fragment of confWithBit (signature truncated above): hashes conf_key
// into the confirm literal-index table and walks the candidate chain.
// This hunk adds an i >= start_offset assertion (callers must now filter
// out-of-range positions themselves — previously confWithBitMany did
// that) and follows the nBitsOrSoleID -> nBits rename.
size_t i, hwlmcb_rv_t *control, u32 *last_match,
u64a conf_key) {
assert(i < a->len);
+ assert(i >= a->start_offset);
assert(ISALIGNED(fdrc));
const u8 * buf = a->buf;
u32 c = CONF_HASH_CALL(conf_key, fdrc->andmsk, fdrc->mult,
- fdrc->nBitsOrSoleID);
+ fdrc->nBits);
u32 start = getConfirmLitIndex(fdrc)[c];
if (likely(!start)) {
return;
} while (oldNext);
}
// The two 'light-weight' Teddy confirm entry points below are deleted
// wholesale: with the confirmless fields (flags/soleLit*) removed from
// FDRConfirm, both degenerate to plain confWithBit, which all Teddy
// variants now invoke directly via do_confWithBit_teddy.
-// 'light-weight' confirmation function which is used by 1-mask Teddy;
-// in the 'confirmless' case it simply calls callback function,
-// otherwise it calls 'confWithBit' function for the full confirmation procedure
-static really_inline
-void confWithBit1(const struct FDRConfirm *fdrc,
-                  const struct FDR_Runtime_Args *a, size_t i,
-                  hwlmcb_rv_t *control, u32 *last_match, u64a conf_key) {
-    assert(i < a->len);
-    assert(ISALIGNED(fdrc));
-
-    if (unlikely(fdrc->mult)) {
-        confWithBit(fdrc, a, i, control, last_match, conf_key);
-        return;
-    } else {
-        u32 id = fdrc->nBitsOrSoleID;
-
-        if ((*last_match == id) && (fdrc->flags & FDRC_FLAG_NOREPEAT)) {
-            return;
-        }
-        *last_match = id;
-        *control = a->cb(i, i, id, a->ctxt);
-    }
-}
-
-// This is 'light-weight' confirmation function which is used by 2-3-4-mask Teddy
-// In the 'confirmless' case it makes fast 32-bit comparison,
-// otherwise it calls 'confWithBit' function for the full confirmation procedure
-static really_inline
-void confWithBitMany(const struct FDRConfirm *fdrc,
-                     const struct FDR_Runtime_Args *a, size_t i, CautionReason r,
-                     hwlmcb_rv_t *control, u32 *last_match, u64a conf_key) {
-    assert(i < a->len);
-    assert(ISALIGNED(fdrc));
-
-    if (i < a->start_offset) {
-        return;
-    }
-
-    if (unlikely(fdrc->mult)) {
-        confWithBit(fdrc, a, i, control, last_match, conf_key);
-        return;
-    } else {
-        const u32 id = fdrc->nBitsOrSoleID;
-        const u32 len = fdrc->soleLitSize;
-
-        if ((*last_match == id) && (fdrc->flags & FDRC_FLAG_NOREPEAT)) {
-            return;
-        }
-
-        if (r == VECTORING && len > i - a->start_offset) {
-            if (len > i + a->len_history) {
-                return;
-            }
-
-            u32 cmp = (u32)a->buf[i] << 24;
-
-            if (len <= i) {
-                for (u32 j = 1; j <= len; j++) {
-                    cmp |= (u32)a->buf[i - j] << (24 - (j * 8));
-                }
-            } else {
-                for (u32 j = 1; j <= i; j++) {
-                    cmp |= (u32)a->buf[i - j] << (24 - (j * 8));
-                }
-                cmp |= (u32)(a->histBytes >> (40 + i * 8));
-            }
-
-            if ((fdrc->soleLitMsk & cmp) != fdrc->soleLitCmp) {
-                return;
-            }
-        }
-        *last_match = id;
-        *control = a->cb(i - len, i, id, a->ctxt);
-    }
-}
-
-
#endif
#include "util/simd_utils.h"
// p_mask_arr[n]: partial-load mask selecting the n valid bytes of a
// vectored (boundary) 16-byte load; rows are read at a variable offset so
// in-bounds lanes line up. This hunk inverts the table's polarity
// (0x00 <-> 0xff): Teddy results are now 'ones-based' (a ZERO bit marks a
// candidate match), so the mask is OR'd into the result rather than
// AND'd — out-of-bounds lanes saturate to 0xff ('no match').
const u8 ALIGN_DIRECTIVE p_mask_arr[17][32] = {
-    {0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
-     0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00},
-    {0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
-     0xff, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00},
-    {0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
-     0xff, 0xff, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00},
-    {0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
-     0xff, 0xff, 0xff, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00},
-    {0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
-     0xff, 0xff, 0xff, 0xff, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00},
-    {0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
-     0xff, 0xff, 0xff, 0xff, 0xff, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00},
-    {0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
-     0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00},
-    {0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
-     0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00},
-    {0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
-     0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00},
-    {0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
-     0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00},
-    {0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
-     0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00},
-    {0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
-     0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0x00, 0x00, 0x00, 0x00, 0x00},
-    {0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
-     0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0x00, 0x00, 0x00, 0x00},
-    {0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
-     0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0x00, 0x00, 0x00},
-    {0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
-     0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0x00, 0x00},
-    {0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
-     0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0x00},
-    {0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
-     0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff}
+    {0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
+     0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff},
+    {0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
+     0x00, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff},
+    {0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
+     0x00, 0x00, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff},
+    {0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
+     0x00, 0x00, 0x00, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff},
+    {0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
+     0x00, 0x00, 0x00, 0x00, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff},
+    {0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
+     0x00, 0x00, 0x00, 0x00, 0x00, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff},
+    {0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
+     0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff},
+    {0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
+     0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff},
+    {0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
+     0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff},
+    {0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
+     0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff},
+    {0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
+     0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff},
+    {0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
+     0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xff, 0xff, 0xff, 0xff, 0xff},
+    {0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
+     0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xff, 0xff, 0xff, 0xff},
+    {0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
+     0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xff, 0xff, 0xff},
+    {0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
+     0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xff, 0xff},
+    {0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
+     0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xff},
+    {0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
+     0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00}
};
+#if defined(__AVX2__) // reinforced teddy
+
+#ifdef ARCH_64_BIT
+// Confirm step for a 256-bit ones-based Teddy result: a vector equal to
+// all-ones has no candidates. Each 64-bit quarter that differs is
+// complemented (back to match-bits-set form) before conf_fn consumes it.
+// Expands in a scope providing confBase, a, ptr, control and last_match.
+#define CONFIRM_TEDDY(var, bucket, offset, reason, conf_fn)                 \
+do {                                                                        \
+    if (unlikely(diff256(var, ones256()))) {                                \
+        m128 lo = movdq_lo(var);                                            \
+        m128 hi = movdq_hi(var);                                            \
+        u64a part1 = movq(lo);                                              \
+        u64a part2 = movq(rshiftbyte_m128(lo, 8));                          \
+        u64a part3 = movq(hi);                                              \
+        u64a part4 = movq(rshiftbyte_m128(hi, 8));                          \
+        if (unlikely(part1 != ones_u64a)) {                                 \
+            part1 = ~part1;                                                 \
+            conf_fn(&part1, bucket, offset, confBase, reason, a, ptr,       \
+                    &control, &last_match);                                 \
+            CHECK_HWLM_TERMINATE_MATCHING;                                  \
+        }                                                                   \
+        if (unlikely(part2 != ones_u64a)) {                                 \
+            part2 = ~part2;                                                 \
+            conf_fn(&part2, bucket, offset + 8, confBase, reason, a, ptr,   \
+                    &control, &last_match);                                 \
+            CHECK_HWLM_TERMINATE_MATCHING;                                  \
+        }                                                                   \
+        if (unlikely(part3 != ones_u64a)) {                                 \
+            part3 = ~part3;                                                 \
+            conf_fn(&part3, bucket, offset + 16, confBase, reason, a, ptr,  \
+                    &control, &last_match);                                 \
+            CHECK_HWLM_TERMINATE_MATCHING;                                  \
+        }                                                                   \
+        if (unlikely(part4 != ones_u64a)) {                                 \
+            part4 = ~part4;                                                 \
+            conf_fn(&part4, bucket, offset + 24, confBase, reason, a, ptr,  \
+                    &control, &last_match);                                 \
+            CHECK_HWLM_TERMINATE_MATCHING;                                  \
+        }                                                                   \
+    }                                                                       \
+} while(0)
+#else
+// 32-bit variant of the above: the 256-bit result is consumed in eight
+// 32-bit parts instead of four 64-bit ones.
+#define CONFIRM_TEDDY(var, bucket, offset, reason, conf_fn)                 \
+do {                                                                        \
+    if (unlikely(diff256(var, ones256()))) {                                \
+        m128 lo = movdq_lo(var);                                            \
+        m128 hi = movdq_hi(var);                                            \
+        u32 part1 = movd(lo);                                               \
+        u32 part2 = movd(rshiftbyte_m128(lo, 4));                           \
+        u32 part3 = movd(rshiftbyte_m128(lo, 8));                           \
+        u32 part4 = movd(rshiftbyte_m128(lo, 12));                          \
+        u32 part5 = movd(hi);                                               \
+        u32 part6 = movd(rshiftbyte_m128(hi, 4));                           \
+        u32 part7 = movd(rshiftbyte_m128(hi, 8));                           \
+        u32 part8 = movd(rshiftbyte_m128(hi, 12));                          \
+        if (unlikely(part1 != ones_u32)) {                                  \
+            part1 = ~part1;                                                 \
+            conf_fn(&part1, bucket, offset, confBase, reason, a, ptr,       \
+                    &control, &last_match);                                 \
+            CHECK_HWLM_TERMINATE_MATCHING;                                  \
+        }                                                                   \
+        if (unlikely(part2 != ones_u32)) {                                  \
+            part2 = ~part2;                                                 \
+            conf_fn(&part2, bucket, offset + 4, confBase, reason, a, ptr,   \
+                    &control, &last_match);                                 \
+            CHECK_HWLM_TERMINATE_MATCHING;                                  \
+        }                                                                   \
+        if (unlikely(part3 != ones_u32)) {                                  \
+            part3 = ~part3;                                                 \
+            conf_fn(&part3, bucket, offset + 8, confBase, reason, a, ptr,   \
+                    &control, &last_match);                                 \
+            CHECK_HWLM_TERMINATE_MATCHING;                                  \
+        }                                                                   \
+        if (unlikely(part4 != ones_u32)) {                                  \
+            part4 = ~part4;                                                 \
+            conf_fn(&part4, bucket, offset + 12, confBase, reason, a, ptr,  \
+                    &control, &last_match);                                 \
+            CHECK_HWLM_TERMINATE_MATCHING;                                  \
+        }                                                                   \
+        if (unlikely(part5 != ones_u32)) {                                  \
+            part5 = ~part5;                                                 \
+            conf_fn(&part5, bucket, offset + 16, confBase, reason, a, ptr,  \
+                    &control, &last_match);                                 \
+            CHECK_HWLM_TERMINATE_MATCHING;                                  \
+        }                                                                   \
+        if (unlikely(part6 != ones_u32)) {                                  \
+            part6 = ~part6;                                                 \
+            conf_fn(&part6, bucket, offset + 20, confBase, reason, a, ptr,  \
+                    &control, &last_match);                                 \
+            CHECK_HWLM_TERMINATE_MATCHING;                                  \
+        }                                                                   \
+        if (unlikely(part7 != ones_u32)) {                                  \
+            part7 = ~part7;                                                 \
+            conf_fn(&part7, bucket, offset + 24, confBase, reason, a, ptr,  \
+                    &control, &last_match);                                 \
+            CHECK_HWLM_TERMINATE_MATCHING;                                  \
+        }                                                                   \
+        if (unlikely(part8 != ones_u32)) {                                  \
+            part8 = ~part8;                                                 \
+            conf_fn(&part8, bucket, offset + 28, confBase, reason, a, ptr,  \
+                    &control, &last_match);                                 \
+            CHECK_HWLM_TERMINATE_MATCHING;                                  \
+        }                                                                   \
+    }                                                                       \
+} while(0)
+#endif
+
+// PREP_SHUF_MASK*: split each input byte into low/high nibbles (pshufb
+// table indices). The reinforced variant additionally builds r_msk from
+// the reinforcement table keyed by the byte preceding each 128-bit lane;
+// c_0/c_128 carry those boundary bytes across loop iterations.
+// NOTE(review): 0x100 appears to be the 'no previous byte' sentinel index
+// into r_msk_base — confirm against the table builder.
+#define PREP_SHUF_MASK_NO_REINFORCEMENT(val)                                \
+    m256 lo = and256(val, *lo_mask);                                        \
+    m256 hi = and256(rshift64_m256(val, 4), *lo_mask)
+
+#define PREP_SHUF_MASK                                                      \
+    PREP_SHUF_MASK_NO_REINFORCEMENT(load256(ptr));                          \
+    *c_128 = *(ptr + 15);                                                   \
+    m256 r_msk = set64x4(0ULL, r_msk_base[*c_128], 0ULL, r_msk_base[*c_0]); \
+    *c_0 = *(ptr + 31)
+
+// SHIFT_OR_Mn: accumulate the ones-based per-mask shuffle results,
+// shifting mask k's contribution left by k bytes and OR-ing into the
+// running result (zero bits survive only where all masks agree).
+#define SHIFT_OR_M1                                                         \
+    or256(pshufb_m256(dup_mask[0], lo), pshufb_m256(dup_mask[1], hi))
+
+#define SHIFT_OR_M2                                                         \
+    or256(lshift128_m256(or256(pshufb_m256(dup_mask[2], lo),                \
+                               pshufb_m256(dup_mask[3], hi)),               \
+                         1), SHIFT_OR_M1)
+
+#define SHIFT_OR_M3                                                         \
+    or256(lshift128_m256(or256(pshufb_m256(dup_mask[4], lo),                \
+                               pshufb_m256(dup_mask[5], hi)),               \
+                         2), SHIFT_OR_M2)
+
+#define SHIFT_OR_M4                                                         \
+    or256(lshift128_m256(or256(pshufb_m256(dup_mask[6], lo),                \
+                               pshufb_m256(dup_mask[7], hi)),               \
+                         3), SHIFT_OR_M3)
+
+// prep_conf_teddy_[no_reinforcement_]mN: compute the ones-based
+// shuffle-OR result for N masks over one 256-bit block. The plain
+// variants also OR in the reinforcement mask r_msk derived from the
+// boundary bytes; the no-reinforcement variants are used on cautious
+// (vectored-load) iterations where those bytes are not safely readable.
+static really_inline
+m256 prep_conf_teddy_no_reinforcement_m1(const m256 *lo_mask,
+                                         const m256 *dup_mask,
+                                         const m256 val) {
+    PREP_SHUF_MASK_NO_REINFORCEMENT(val);
+    return SHIFT_OR_M1;
+}
+
+static really_inline
+m256 prep_conf_teddy_no_reinforcement_m2(const m256 *lo_mask,
+                                         const m256 *dup_mask,
+                                         const m256 val) {
+    PREP_SHUF_MASK_NO_REINFORCEMENT(val);
+    return SHIFT_OR_M2;
+}
+
+static really_inline
+m256 prep_conf_teddy_no_reinforcement_m3(const m256 *lo_mask,
+                                         const m256 *dup_mask,
+                                         const m256 val) {
+    PREP_SHUF_MASK_NO_REINFORCEMENT(val);
+    return SHIFT_OR_M3;
+}
+
+static really_inline
+m256 prep_conf_teddy_no_reinforcement_m4(const m256 *lo_mask,
+                                         const m256 *dup_mask,
+                                         const m256 val) {
+    PREP_SHUF_MASK_NO_REINFORCEMENT(val);
+    return SHIFT_OR_M4;
+}
+
+static really_inline
+m256 prep_conf_teddy_m1(const m256 *lo_mask, const m256 *dup_mask,
+                        const u8 *ptr, const u64a *r_msk_base,
+                        u32 *c_0, u32 *c_128) {
+    PREP_SHUF_MASK;
+    return or256(SHIFT_OR_M1, r_msk);
+}
+
+static really_inline
+m256 prep_conf_teddy_m2(const m256 *lo_mask, const m256 *dup_mask,
+                        const u8 *ptr, const u64a *r_msk_base,
+                        u32 *c_0, u32 *c_128) {
+    PREP_SHUF_MASK;
+    return or256(SHIFT_OR_M2, r_msk);
+}
+
+static really_inline
+m256 prep_conf_teddy_m3(const m256 *lo_mask, const m256 *dup_mask,
+                        const u8 *ptr, const u64a *r_msk_base,
+                        u32 *c_0, u32 *c_128) {
+    PREP_SHUF_MASK;
+    return or256(SHIFT_OR_M3, r_msk);
+}
+
+static really_inline
+m256 prep_conf_teddy_m4(const m256 *lo_mask, const m256 *dup_mask,
+                        const u8 *ptr, const u64a *r_msk_base,
+                        u32 *c_0, u32 *c_128) {
+    PREP_SHUF_MASK;
+    return or256(SHIFT_OR_M4, r_msk);
+}
+
+#else // not defined __AVX2__
+
#ifdef ARCH_64_BIT
// SSE CONFIRM_TEDDY, converted to ones-based results: the zero test
// (isnonzero128) becomes an all-ones test (diff128 vs ones128), and each
// part is complemented before conf_fn. The trailing semicolon after
// 'while (0)' is also dropped so the macro composes cleanly in if/else.
#define CONFIRM_TEDDY(var, bucket, offset, reason, conf_fn)                 \
do {                                                                        \
-    if (unlikely(isnonzero128(var))) {                                      \
+    if (unlikely(diff128(var, ones128()))) {                                \
        u64a lo = movq(var);                                                \
        u64a hi = movq(rshiftbyte_m128(var, 8));                            \
-        if (unlikely(lo)) {                                                 \
+        if (unlikely(lo != ones_u64a)) {                                    \
+            lo = ~lo;                                                       \
            conf_fn(&lo, bucket, offset, confBase, reason, a, ptr,          \
                    &control, &last_match);                                 \
            CHECK_HWLM_TERMINATE_MATCHING;                                  \
        }                                                                   \
-        if (unlikely(hi)) {                                                 \
+        if (unlikely(hi != ones_u64a)) {                                    \
+            hi = ~hi;                                                       \
            conf_fn(&hi, bucket, offset + 8, confBase, reason, a, ptr,      \
                    &control, &last_match);                                 \
            CHECK_HWLM_TERMINATE_MATCHING;                                  \
        }                                                                   \
    }                                                                       \
-} while (0);
+} while(0)
#else
// 32-bit SSE variant: same ones-based conversion, four 32-bit parts.
#define CONFIRM_TEDDY(var, bucket, offset, reason, conf_fn)                 \
do {                                                                        \
-    if (unlikely(isnonzero128(var))) {                                      \
+    if (unlikely(diff128(var, ones128()))) {                                \
        u32 part1 = movd(var);                                              \
        u32 part2 = movd(rshiftbyte_m128(var, 4));                          \
        u32 part3 = movd(rshiftbyte_m128(var, 8));                          \
        u32 part4 = movd(rshiftbyte_m128(var, 12));                         \
-        if (unlikely(part1)) {                                              \
+        if (unlikely(part1 != ones_u32)) {                                  \
+            part1 = ~part1;                                                 \
            conf_fn(&part1, bucket, offset, confBase, reason, a, ptr,       \
                    &control, &last_match);                                 \
            CHECK_HWLM_TERMINATE_MATCHING;                                  \
        }                                                                   \
-        if (unlikely(part2)) {                                              \
+        if (unlikely(part2 != ones_u32)) {                                  \
+            part2 = ~part2;                                                 \
            conf_fn(&part2, bucket, offset + 4, confBase, reason, a, ptr,   \
                    &control, &last_match);                                 \
            CHECK_HWLM_TERMINATE_MATCHING;                                  \
        }                                                                   \
-        if (unlikely(part3)) {                                              \
+        if (unlikely(part3 != ones_u32)) {                                  \
+            part3 = ~part3;                                                 \
            conf_fn(&part3, bucket, offset + 8, confBase, reason, a, ptr,   \
                    &control, &last_match);                                 \
            CHECK_HWLM_TERMINATE_MATCHING;                                  \
        }                                                                   \
-        if (unlikely(part4)) {                                              \
+        if (unlikely(part4 != ones_u32)) {                                  \
+            part4 = ~part4;                                                 \
            conf_fn(&part4, bucket, offset + 12, confBase, reason, a, ptr,  \
                    &control, &last_match);                                 \
            CHECK_HWLM_TERMINATE_MATCHING;                                  \
        }                                                                   \
    }                                                                       \
-} while (0);
+} while(0)
#endif
// prep_conf_teddy_m1..m4 (SSE) — signatures elided in these hunks.
// and128 -> or128 throughout: bucket masks are now stored complemented,
// so candidate positions accumulate as ZERO bits and non-matches saturate
// to ones, matching the inverted p_mask_arr and the ones-based
// CONFIRM_TEDDY above. palignr shifts mask k's result by k bytes so all
// masks vote on the same end-of-literal position.
static really_inline
m128 mask = set16x8(0xf);
m128 lo = and128(val, mask);
m128 hi = and128(rshift64_m128(val, 4), mask);
-    return and128(pshufb_m128(maskBase[0 * 2], lo),
-                  pshufb_m128(maskBase[0 * 2 + 1], hi));
+    return or128(pshufb_m128(maskBase[0 * 2], lo),
+                 pshufb_m128(maskBase[0 * 2 + 1], hi));
}
static really_inline
m128 hi = and128(rshift64_m128(val, 4), mask);
m128 r = prep_conf_teddy_m1(maskBase, val);
-    m128 res_1 = and128(pshufb_m128(maskBase[1*2], lo),
-                        pshufb_m128(maskBase[1*2+1], hi));
-    m128 res_shifted_1 = palignr(res_1, *old_1, 16-1);
+    m128 res_1 = or128(pshufb_m128(maskBase[1 * 2], lo),
+                       pshufb_m128(maskBase[1 * 2 + 1], hi));
+    m128 res_shifted_1 = palignr(res_1, *old_1, 16 - 1);
*old_1 = res_1;
-    return and128(r, res_shifted_1);
+    return or128(r, res_shifted_1);
}
static really_inline
m128 hi = and128(rshift64_m128(val, 4), mask);
m128 r = prep_conf_teddy_m2(maskBase, old_1, val);
-    m128 res_2 = and128(pshufb_m128(maskBase[2*2], lo),
-                        pshufb_m128(maskBase[2*2+1], hi));
-    m128 res_shifted_2 = palignr(res_2, *old_2, 16-2);
+    m128 res_2 = or128(pshufb_m128(maskBase[2 * 2], lo),
+                       pshufb_m128(maskBase[2 * 2 + 1], hi));
+    m128 res_shifted_2 = palignr(res_2, *old_2, 16 - 2);
*old_2 = res_2;
-    return and128(r, res_shifted_2);
+    return or128(r, res_shifted_2);
}
static really_inline
m128 hi = and128(rshift64_m128(val, 4), mask);
m128 r = prep_conf_teddy_m3(maskBase, old_1, old_2, val);
-    m128 res_3 = and128(pshufb_m128(maskBase[3*2], lo),
-                        pshufb_m128(maskBase[3*2+1], hi));
-    m128 res_shifted_3 = palignr(res_3, *old_3, 16-3);
+    m128 res_3 = or128(pshufb_m128(maskBase[3 * 2], lo),
+                       pshufb_m128(maskBase[3 * 2 + 1], hi));
+    m128 res_shifted_3 = palignr(res_3, *old_3, 16 - 3);
*old_3 = res_3;
-    return and128(r, res_shifted_3);
+    return or128(r, res_shifted_3);
}
+#endif // __AVX2__
+
+#if defined(__AVX2__) // reinforced teddy
+
+// Dispatch helpers shared by FDR_EXEC_TEDDY: select the N-mask prep
+// function by token pasting, and duplicate the N*2 128-bit nibble masks
+// into 256-bit registers for the AVX2 path.
+#define PREP_CONF_FN_NO_REINFORCEMENT(val, n)                               \
+    prep_conf_teddy_no_reinforcement_m##n(&lo_mask, dup_mask, val)
+
+#define PREP_CONF_FN(ptr, n)                                                \
+    prep_conf_teddy_m##n(&lo_mask, dup_mask, ptr, r_msk_base, &c_0, &c_128)
+
+#define PREPARE_MASKS_1                                                     \
+    dup_mask[0] = set2x128(maskBase[0]);                                    \
+    dup_mask[1] = set2x128(maskBase[1]);
+
+#define PREPARE_MASKS_2                                                     \
+    PREPARE_MASKS_1                                                         \
+    dup_mask[2] = set2x128(maskBase[2]);                                    \
+    dup_mask[3] = set2x128(maskBase[3]);
+
+#define PREPARE_MASKS_3                                                     \
+    PREPARE_MASKS_2                                                         \
+    dup_mask[4] = set2x128(maskBase[4]);                                    \
+    dup_mask[5] = set2x128(maskBase[5]);
+
+#define PREPARE_MASKS_4                                                     \
+    PREPARE_MASKS_3                                                         \
+    dup_mask[6] = set2x128(maskBase[6]);                                    \
+    dup_mask[7] = set2x128(maskBase[7]);
+
+#define PREPARE_MASKS(n)                                                    \
+    m256 lo_mask = set32x8(0xf);                                            \
+    m256 dup_mask[n * 2];                                                   \
+    PREPARE_MASKS_##n
+
+#else // not defined __AVX2__
+
+// SSE path: declare the 'old result' carry registers needed for N masks
+// (mask k needs the previous block's result shifted in by palignr) and
+// select the N-mask prep function.
+#define FDR_EXEC_TEDDY_RES_OLD_1
+
+#define FDR_EXEC_TEDDY_RES_OLD_2                                            \
+    m128 res_old_1 = zeroes128();
+
+#define FDR_EXEC_TEDDY_RES_OLD_3                                            \
+    m128 res_old_1 = zeroes128();                                           \
+    m128 res_old_2 = zeroes128();
+
+#define FDR_EXEC_TEDDY_RES_OLD_4                                            \
+    m128 res_old_1 = zeroes128();                                           \
+    m128 res_old_2 = zeroes128();                                           \
+    m128 res_old_3 = zeroes128();
+
+#define FDR_EXEC_TEDDY_RES_OLD(n) FDR_EXEC_TEDDY_RES_OLD_##n
+
+#define PREP_CONF_FN_1(mask_base, val)                                      \
+    prep_conf_teddy_m1(mask_base, val)
+
+#define PREP_CONF_FN_2(mask_base, val)                                      \
+    prep_conf_teddy_m2(mask_base, &res_old_1, val)
+
+#define PREP_CONF_FN_3(mask_base, val)                                      \
+    prep_conf_teddy_m3(mask_base, &res_old_1, &res_old_2, val)
+
+#define PREP_CONF_FN_4(mask_base, val)                                      \
+    prep_conf_teddy_m4(mask_base, &res_old_1, &res_old_2, &res_old_3, val)
+
+#define PREP_CONF_FN(mask_base, val, n)                                     \
+    PREP_CONF_FN_##n(mask_base, val)
+#endif // __AVX2__
+
+
+#if defined(__AVX2__) // reinforced teddy
+#define FDR_EXEC_TEDDY(fdr, a, control, n_msk, conf_fn) \
+do { \
+ const u8 *buf_end = a->buf + a->len; \
+ const u8 *ptr = a->buf + a->start_offset; \
+ u32 floodBackoff = FLOOD_BACKOFF_START; \
+ const u8 *tryFloodDetect = a->firstFloodDetect; \
+ u32 last_match = (u32)-1; \
+ const struct Teddy *teddy = (const struct Teddy *)fdr; \
+ const size_t iterBytes = 64; \
+ DEBUG_PRINTF("params: buf %p len %zu start_offset %zu\n", \
+ a->buf, a->len, a->start_offset); \
+ \
+ const m128 *maskBase = getMaskBase(teddy); \
+ PREPARE_MASKS(n_msk); \
+ const u32 *confBase = getConfBase(teddy); \
+ \
+ const u64a *r_msk_base = getReinforcedMaskBase(teddy, n_msk); \
+ u32 c_0 = 0x100; \
+ u32 c_128 = 0x100; \
+ const u8 *mainStart = ROUNDUP_PTR(ptr, 32); \
+ DEBUG_PRINTF("derive: ptr: %p mainstart %p\n", ptr, mainStart); \
+ if (ptr < mainStart) { \
+ ptr = mainStart - 32; \
+ m256 p_mask; \
+ m256 val_0 = vectoredLoad256(&p_mask, ptr, a->start_offset, \
+ a->buf, buf_end, \
+ a->buf_history, a->len_history, n_msk); \
+ m256 r_0 = PREP_CONF_FN_NO_REINFORCEMENT(val_0, n_msk); \
+ c_0 = *(ptr + 31); \
+ r_0 = or256(r_0, p_mask); \
+ CONFIRM_TEDDY(r_0, 8, 0, VECTORING, conf_fn); \
+ ptr += 32; \
+ } \
+ \
+ if (ptr + 32 <= buf_end) { \
+ m256 r_0 = PREP_CONF_FN(ptr, n_msk); \
+ CONFIRM_TEDDY(r_0, 8, 0, VECTORING, conf_fn); \
+ ptr += 32; \
+ } \
+ \
+ for (; ptr + iterBytes <= buf_end; ptr += iterBytes) { \
+ __builtin_prefetch(ptr + (iterBytes * 4)); \
+ CHECK_FLOOD; \
+ m256 r_0 = PREP_CONF_FN(ptr, n_msk); \
+ CONFIRM_TEDDY(r_0, 8, 0, NOT_CAUTIOUS, conf_fn); \
+ m256 r_1 = PREP_CONF_FN(ptr + 32, n_msk); \
+ CONFIRM_TEDDY(r_1, 8, 32, NOT_CAUTIOUS, conf_fn); \
+ } \
+ \
+ if (ptr + 32 <= buf_end) { \
+ m256 r_0 = PREP_CONF_FN(ptr, n_msk); \
+ CONFIRM_TEDDY(r_0, 8, 0, NOT_CAUTIOUS, conf_fn); \
+ ptr += 32; \
+ } \
+ \
+ assert(ptr + 32 > buf_end); \
+ if (ptr < buf_end) { \
+ m256 p_mask; \
+ m256 val_0 = vectoredLoad256(&p_mask, ptr, 0, ptr, buf_end, \
+ a->buf_history, a->len_history, n_msk); \
+ m256 r_0 = PREP_CONF_FN_NO_REINFORCEMENT(val_0, n_msk); \
+ r_0 = or256(r_0, p_mask); \
+ CONFIRM_TEDDY(r_0, 8, 0, VECTORING, conf_fn); \
+ } \
+ \
+ return HWLM_SUCCESS; \
+} while(0)
+#else // not defined __AVX2__
+#define FDR_EXEC_TEDDY(fdr, a, control, n_msk, conf_fn) \
+do { \
+ const u8 *buf_end = a->buf + a->len; \
+ const u8 *ptr = a->buf + a->start_offset; \
+ u32 floodBackoff = FLOOD_BACKOFF_START; \
+ const u8 *tryFloodDetect = a->firstFloodDetect; \
+ u32 last_match = (u32)-1; \
+ const struct Teddy *teddy = (const struct Teddy *)fdr; \
+ const size_t iterBytes = 32; \
+ DEBUG_PRINTF("params: buf %p len %zu start_offset %zu\n", \
+ a->buf, a->len, a->start_offset); \
+ \
+ const m128 *maskBase = getMaskBase(teddy); \
+ const u32 *confBase = getConfBase(teddy); \
+ \
+ FDR_EXEC_TEDDY_RES_OLD(n_msk); \
+ const u8 *mainStart = ROUNDUP_PTR(ptr, 16); \
+ DEBUG_PRINTF("derive: ptr: %p mainstart %p\n", ptr, mainStart); \
+ if (ptr < mainStart) { \
+ ptr = mainStart - 16; \
+ m128 p_mask; \
+ m128 val_0 = vectoredLoad128(&p_mask, ptr, a->start_offset, \
+ a->buf, buf_end, \
+ a->buf_history, a->len_history, n_msk); \
+ m128 r_0 = PREP_CONF_FN(maskBase, val_0, n_msk); \
+ r_0 = or128(r_0, p_mask); \
+ CONFIRM_TEDDY(r_0, 8, 0, VECTORING, conf_fn); \
+ ptr += 16; \
+ } \
+ \
+ if (ptr + 16 <= buf_end) { \
+ m128 r_0 = PREP_CONF_FN(maskBase, load128(ptr), n_msk); \
+ CONFIRM_TEDDY(r_0, 8, 0, VECTORING, conf_fn); \
+ ptr += 16; \
+ } \
+ \
+ for (; ptr + iterBytes <= buf_end; ptr += iterBytes) { \
+ __builtin_prefetch(ptr + (iterBytes * 4)); \
+ CHECK_FLOOD; \
+ m128 r_0 = PREP_CONF_FN(maskBase, load128(ptr), n_msk); \
+ CONFIRM_TEDDY(r_0, 8, 0, NOT_CAUTIOUS, conf_fn); \
+ m128 r_1 = PREP_CONF_FN(maskBase, load128(ptr + 16), n_msk); \
+ CONFIRM_TEDDY(r_1, 8, 16, NOT_CAUTIOUS, conf_fn); \
+ } \
+ \
+ if (ptr + 16 <= buf_end) { \
+ m128 r_0 = PREP_CONF_FN(maskBase, load128(ptr), n_msk); \
+ CONFIRM_TEDDY(r_0, 8, 0, NOT_CAUTIOUS, conf_fn); \
+ ptr += 16; \
+ } \
+ \
+ assert(ptr + 16 > buf_end); \
+ if (ptr < buf_end) { \
+ m128 p_mask; \
+ m128 val_0 = vectoredLoad128(&p_mask, ptr, 0, ptr, buf_end, \
+ a->buf_history, a->len_history, n_msk); \
+ m128 r_0 = PREP_CONF_FN(maskBase, val_0, n_msk); \
+ r_0 = or128(r_0, p_mask); \
+ CONFIRM_TEDDY(r_0, 8, 0, VECTORING, conf_fn); \
+ } \
+ \
+ return HWLM_SUCCESS; \
+} while(0)
+#endif // __AVX2__
+
// fdr_exec_teddy_msks1: 1-mask Teddy entry point. The hand-written scan
// loop is replaced by the shared FDR_EXEC_TEDDY macro (n_msk = 1); the
// deleted code also fixes the old 'ptr + 16 < buf_end' off-by-one (now
// '<=' inside the macro).
// NOTE(review): the confirm callback changes from do_confWithBit1_teddy
// to do_confWithBit_teddy — consistent with confWithBit1's deletion
// above, but verify against the matching teddy runtime hunks.
hwlm_error_t fdr_exec_teddy_msks1(const struct FDR *fdr,
const struct FDR_Runtime_Args *a,
hwlm_group_t control) {
-    const u8 *buf_end = a->buf + a->len;
-    const u8 *ptr = a->buf + a->start_offset;
-    u32 floodBackoff = FLOOD_BACKOFF_START;
-    const u8 *tryFloodDetect = a->firstFloodDetect;
-    u32 last_match = (u32)-1;
-    const struct Teddy *teddy = (const struct Teddy *)fdr;
-    const size_t iterBytes = 32;
-    DEBUG_PRINTF("params: buf %p len %zu start_offset %zu\n",
-                 a->buf, a->len, a->start_offset);
-
-    const m128 *maskBase = getMaskBase(teddy);
-    const u32 *confBase = getConfBase(teddy);
-
-    const u8 *mainStart = ROUNDUP_PTR(ptr, 16);
-    DEBUG_PRINTF("derive: ptr: %p mainstart %p\n", ptr, mainStart);
-    if (ptr < mainStart) {
-        ptr = mainStart - 16;
-        m128 p_mask;
-        m128 val_0 = vectoredLoad128(&p_mask, ptr, a->buf, buf_end,
-                                     a->buf_history, a->len_history, 1);
-        m128 r_0 = prep_conf_teddy_m1(maskBase, val_0);
-        r_0 = and128(r_0, p_mask);
-        CONFIRM_TEDDY(r_0, 8, 0, VECTORING, do_confWithBit1_teddy);
-        ptr += 16;
-    }
-
-    if (ptr + 16 < buf_end) {
-        m128 r_0 = prep_conf_teddy_m1(maskBase, load128(ptr));
-        CONFIRM_TEDDY(r_0, 8, 0, VECTORING, do_confWithBit1_teddy);
-        ptr += 16;
-    }
-
-    for (; ptr + iterBytes <= buf_end; ptr += iterBytes) {
-        __builtin_prefetch(ptr + (iterBytes*4));
-        CHECK_FLOOD;
-        m128 r_0 = prep_conf_teddy_m1(maskBase, load128(ptr));
-        CONFIRM_TEDDY(r_0, 8, 0, NOT_CAUTIOUS, do_confWithBit1_teddy);
-        m128 r_1 = prep_conf_teddy_m1(maskBase, load128(ptr + 16));
-        CONFIRM_TEDDY(r_1, 8, 16, NOT_CAUTIOUS, do_confWithBit1_teddy);
-    }
-
-    for (; ptr < buf_end; ptr += 16) {
-        m128 p_mask;
-        m128 val_0 = vectoredLoad128(&p_mask, ptr, a->buf, buf_end,
-                                     a->buf_history, a->len_history, 1);
-        m128 r_0 = prep_conf_teddy_m1(maskBase, val_0);
-        r_0 = and128(r_0, p_mask);
-        CONFIRM_TEDDY(r_0, 8, 0, VECTORING, do_confWithBit1_teddy);
-    }
-
-    return HWLM_SUCCESS;
+    FDR_EXEC_TEDDY(fdr, a, control, 1, do_confWithBit_teddy);
}
// fdr_exec_teddy_msks1_pck: packed 1-mask Teddy entry point; the manual
// scan loop collapses to the shared FDR_EXEC_TEDDY macro (n_msk = 1,
// full confirm via do_confWithBit_teddy, unchanged here).
hwlm_error_t fdr_exec_teddy_msks1_pck(const struct FDR *fdr,
const struct FDR_Runtime_Args *a,
hwlm_group_t control) {
-    const u8 *buf_end = a->buf + a->len;
-    const u8 *ptr = a->buf + a->start_offset;
-    u32 floodBackoff = FLOOD_BACKOFF_START;
-    const u8 *tryFloodDetect = a->firstFloodDetect;
-    u32 last_match = (u32)-1;
-    const struct Teddy *teddy = (const struct Teddy *)fdr;
-    const size_t iterBytes = 32;
-    DEBUG_PRINTF("params: buf %p len %zu start_offset %zu\n",
-                 a->buf, a->len, a->start_offset);
-
-    const m128 *maskBase = getMaskBase(teddy);
-    const u32 *confBase = getConfBase(teddy);
-
-    const u8 *mainStart = ROUNDUP_PTR(ptr, 16);
-    DEBUG_PRINTF("derive: ptr: %p mainstart %p\n", ptr, mainStart);
-    if (ptr < mainStart) {
-        ptr = mainStart - 16;
-        m128 p_mask;
-        m128 val_0 = vectoredLoad128(&p_mask, ptr, a->buf, buf_end,
-                                     a->buf_history, a->len_history, 1);
-        m128 r_0 = prep_conf_teddy_m1(maskBase, val_0);
-        r_0 = and128(r_0, p_mask);
-        CONFIRM_TEDDY(r_0, 8, 0, VECTORING, do_confWithBit_teddy);
-        ptr += 16;
-    }
-
-    if (ptr + 16 < buf_end) {
-        m128 r_0 = prep_conf_teddy_m1(maskBase, load128(ptr));
-        CONFIRM_TEDDY(r_0, 8, 0, VECTORING, do_confWithBit_teddy);
-        ptr += 16;
-    }
-
-    for (; ptr + iterBytes <= buf_end; ptr += iterBytes) {
-        __builtin_prefetch(ptr + (iterBytes*4));
-        CHECK_FLOOD;
-        m128 r_0 = prep_conf_teddy_m1(maskBase, load128(ptr));
-        CONFIRM_TEDDY(r_0, 8, 0, NOT_CAUTIOUS, do_confWithBit_teddy);
-        m128 r_1 = prep_conf_teddy_m1(maskBase, load128(ptr + 16));
-        CONFIRM_TEDDY(r_1, 8, 16, NOT_CAUTIOUS, do_confWithBit_teddy);
-    }
-
-    for (; ptr < buf_end; ptr += 16) {
-        m128 p_mask;
-        m128 val_0 = vectoredLoad128(&p_mask, ptr, a->buf, buf_end,
-                                     a->buf_history, a->len_history, 1);
-        m128 r_0 = prep_conf_teddy_m1(maskBase, val_0);
-        r_0 = and128(r_0, p_mask);
-        CONFIRM_TEDDY(r_0, 8, 0, VECTORING, do_confWithBit_teddy);
-    }
-
-    return HWLM_SUCCESS;
+    FDR_EXEC_TEDDY(fdr, a, control, 1, do_confWithBit_teddy);
}
// fdr_exec_teddy_msks2: 2-mask Teddy entry point, collapsed to the shared
// FDR_EXEC_TEDDY macro (n_msk = 2). The confirm callback moves from the
// deleted do_confWithBitMany_teddy to do_confWithBit_teddy, matching the
// removal of the confirmless path. Note the old code seeded res_old_1
// with ones128() vs zeroes128() in FDR_EXEC_TEDDY_RES_OLD — consistent
// with the and-based -> or-based result polarity flip.
hwlm_error_t fdr_exec_teddy_msks2(const struct FDR *fdr,
const struct FDR_Runtime_Args *a,
hwlm_group_t control) {
-    const u8 *buf_end = a->buf + a->len;
-    const u8 *ptr = a->buf + a->start_offset;
-    u32 floodBackoff = FLOOD_BACKOFF_START;
-    const u8 *tryFloodDetect = a->firstFloodDetect;
-    u32 last_match = (u32)-1;
-    const struct Teddy *teddy = (const struct Teddy *)fdr;
-    const size_t iterBytes = 32;
-    DEBUG_PRINTF("params: buf %p len %zu start_offset %zu\n",
-                 a->buf, a->len, a->start_offset);
-
-    const m128 *maskBase = getMaskBase(teddy);
-    const u32 *confBase = getConfBase(teddy);
-
-    m128 res_old_1 = ones128();
-    const u8 *mainStart = ROUNDUP_PTR(ptr, 16);
-    DEBUG_PRINTF("derive: ptr: %p mainstart %p\n", ptr, mainStart);
-    if (ptr < mainStart) {
-        ptr = mainStart - 16;
-        m128 p_mask;
-        m128 val_0 = vectoredLoad128(&p_mask, ptr, a->buf, buf_end,
-                                     a->buf_history, a->len_history, 2);
-        m128 r_0 = prep_conf_teddy_m2(maskBase, &res_old_1, val_0);
-        r_0 = and128(r_0, p_mask);
-        CONFIRM_TEDDY(r_0, 8, 0, VECTORING, do_confWithBitMany_teddy);
-        ptr += 16;
-    }
-
-    if (ptr + 16 < buf_end) {
-        m128 r_0 = prep_conf_teddy_m2(maskBase, &res_old_1, load128(ptr));
-        CONFIRM_TEDDY(r_0, 8, 0, VECTORING, do_confWithBitMany_teddy);
-        ptr += 16;
-    }
-
-    for (; ptr + iterBytes <= buf_end; ptr += iterBytes) {
-        __builtin_prefetch(ptr + (iterBytes*4));
-        CHECK_FLOOD;
-        m128 r_0 = prep_conf_teddy_m2(maskBase, &res_old_1, load128(ptr));
-        CONFIRM_TEDDY(r_0, 8, 0, NOT_CAUTIOUS, do_confWithBitMany_teddy);
-        m128 r_1 = prep_conf_teddy_m2(maskBase, &res_old_1, load128(ptr + 16));
-        CONFIRM_TEDDY(r_1, 8, 16, NOT_CAUTIOUS, do_confWithBitMany_teddy);
-    }
-
-    for (; ptr < buf_end; ptr += 16) {
-        m128 p_mask;
-        m128 val_0 = vectoredLoad128(&p_mask, ptr, a->buf, buf_end,
-                                     a->buf_history, a->len_history, 2);
-        m128 r_0 = prep_conf_teddy_m2(maskBase, &res_old_1, val_0);
-        r_0 = and128(r_0, p_mask);
-        CONFIRM_TEDDY(r_0, 8, 0, VECTORING, do_confWithBitMany_teddy);
-    }
-
-    return HWLM_SUCCESS;
+    FDR_EXEC_TEDDY(fdr, a, control, 2, do_confWithBit_teddy);
}
hwlm_error_t fdr_exec_teddy_msks2_pck(const struct FDR *fdr,
const struct FDR_Runtime_Args *a,
hwlm_group_t control) {
- const u8 *buf_end = a->buf + a->len;
- const u8 *ptr = a->buf + a->start_offset;
- u32 floodBackoff = FLOOD_BACKOFF_START;
- const u8 *tryFloodDetect = a->firstFloodDetect;
- u32 last_match = (u32)-1;
- const struct Teddy *teddy = (const struct Teddy *)fdr;
- const size_t iterBytes = 32;
- DEBUG_PRINTF("params: buf %p len %zu start_offset %zu\n",
- a->buf, a->len, a->start_offset);
-
- const m128 *maskBase = getMaskBase(teddy);
- const u32 *confBase = getConfBase(teddy);
-
- m128 res_old_1 = ones128();
- const u8 *mainStart = ROUNDUP_PTR(ptr, 16);
- DEBUG_PRINTF("derive: ptr: %p mainstart %p\n", ptr, mainStart);
- if (ptr < mainStart) {
- ptr = mainStart - 16;
- m128 p_mask;
- m128 val_0 = vectoredLoad128(&p_mask, ptr, a->buf, buf_end,
- a->buf_history, a->len_history, 2);
- m128 r_0 = prep_conf_teddy_m2(maskBase, &res_old_1, val_0);
- r_0 = and128(r_0, p_mask);
- CONFIRM_TEDDY(r_0, 8, 0, VECTORING, do_confWithBit_teddy);
- ptr += 16;
- }
-
- if (ptr + 16 < buf_end) {
- m128 r_0 = prep_conf_teddy_m2(maskBase, &res_old_1, load128(ptr));
- CONFIRM_TEDDY(r_0, 8, 0, VECTORING, do_confWithBit_teddy);
- ptr += 16;
- }
-
- for (; ptr + iterBytes <= buf_end; ptr += iterBytes) {
- __builtin_prefetch(ptr + (iterBytes*4));
- CHECK_FLOOD;
- m128 r_0 = prep_conf_teddy_m2(maskBase, &res_old_1, load128(ptr));
- CONFIRM_TEDDY(r_0, 8, 0, NOT_CAUTIOUS, do_confWithBit_teddy);
- m128 r_1 = prep_conf_teddy_m2(maskBase, &res_old_1, load128(ptr + 16));
- CONFIRM_TEDDY(r_1, 8, 16, NOT_CAUTIOUS, do_confWithBit_teddy);
- }
-
- for (; ptr < buf_end; ptr += 16) {
- m128 p_mask;
- m128 val_0 = vectoredLoad128(&p_mask, ptr, a->buf, buf_end,
- a->buf_history, a->len_history, 2);
- m128 r_0 = prep_conf_teddy_m2(maskBase, &res_old_1, val_0);
- r_0 = and128(r_0, p_mask);
- CONFIRM_TEDDY(r_0, 8, 0, VECTORING, do_confWithBit_teddy);
- }
-
- return HWLM_SUCCESS;
+ FDR_EXEC_TEDDY(fdr, a, control, 2, do_confWithBit_teddy);
}
hwlm_error_t fdr_exec_teddy_msks3(const struct FDR *fdr,
const struct FDR_Runtime_Args *a,
hwlm_group_t control) {
- const u8 *buf_end = a->buf + a->len;
- const u8 *ptr = a->buf + a->start_offset;
- u32 floodBackoff = FLOOD_BACKOFF_START;
- const u8 *tryFloodDetect = a->firstFloodDetect;
- u32 last_match = (u32)-1;
- const struct Teddy *teddy = (const struct Teddy *)fdr;
- const size_t iterBytes = 32;
- DEBUG_PRINTF("params: buf %p len %zu start_offset %zu\n",
- a->buf, a->len, a->start_offset);
-
- const m128 *maskBase = getMaskBase(teddy);
- const u32 *confBase = getConfBase(teddy);
-
- m128 res_old_1 = ones128();
- m128 res_old_2 = ones128();
- const u8 *mainStart = ROUNDUP_PTR(ptr, 16);
- DEBUG_PRINTF("derive: ptr: %p mainstart %p\n", ptr, mainStart);
- if (ptr < mainStart) {
- ptr = mainStart - 16;
- m128 p_mask;
- m128 val_0 = vectoredLoad128(&p_mask, ptr, a->buf, buf_end,
- a->buf_history, a->len_history, 3);
- m128 r_0 = prep_conf_teddy_m3(maskBase, &res_old_1, &res_old_2,
- val_0);
- r_0 = and128(r_0, p_mask);
- CONFIRM_TEDDY(r_0, 8, 0, VECTORING, do_confWithBitMany_teddy);
- ptr += 16;
- }
-
- if (ptr + 16 < buf_end) {
- m128 r_0 = prep_conf_teddy_m3(maskBase, &res_old_1, &res_old_2,
- load128(ptr));
- CONFIRM_TEDDY(r_0, 8, 0, VECTORING, do_confWithBitMany_teddy);
- ptr += 16;
- }
-
- for (; ptr + iterBytes <= buf_end; ptr += iterBytes) {
- __builtin_prefetch(ptr + (iterBytes*4));
- CHECK_FLOOD;
- m128 r_0 = prep_conf_teddy_m3(maskBase, &res_old_1, &res_old_2,
- load128(ptr));
- CONFIRM_TEDDY(r_0, 8, 0, NOT_CAUTIOUS, do_confWithBitMany_teddy);
- m128 r_1 = prep_conf_teddy_m3(maskBase, &res_old_1, &res_old_2,
- load128(ptr + 16));
- CONFIRM_TEDDY(r_1, 8, 16, NOT_CAUTIOUS, do_confWithBitMany_teddy);
- }
-
- for (; ptr < buf_end; ptr += 16) {
- m128 p_mask;
- m128 val_0 = vectoredLoad128(&p_mask, ptr, a->buf, buf_end,
- a->buf_history, a->len_history, 3);
- m128 r_0 = prep_conf_teddy_m3(maskBase, &res_old_1, &res_old_2, val_0);
- r_0 = and128(r_0, p_mask);
- CONFIRM_TEDDY(r_0, 8, 0, VECTORING, do_confWithBitMany_teddy);
- }
-
- return HWLM_SUCCESS;
+ FDR_EXEC_TEDDY(fdr, a, control, 3, do_confWithBit_teddy);
}
hwlm_error_t fdr_exec_teddy_msks3_pck(const struct FDR *fdr,
const struct FDR_Runtime_Args *a,
hwlm_group_t control) {
- const u8 *buf_end = a->buf + a->len;
- const u8 *ptr = a->buf + a->start_offset;
- u32 floodBackoff = FLOOD_BACKOFF_START;
- const u8 *tryFloodDetect = a->firstFloodDetect;
- u32 last_match = (u32)-1;
- const struct Teddy *teddy = (const struct Teddy *)fdr;
- const size_t iterBytes = 32;
- DEBUG_PRINTF("params: buf %p len %zu start_offset %zu\n",
- a->buf, a->len, a->start_offset);
-
- const m128 *maskBase = getMaskBase(teddy);
- const u32 *confBase = getConfBase(teddy);
-
- m128 res_old_1 = ones128();
- m128 res_old_2 = ones128();
- const u8 *mainStart = ROUNDUP_PTR(ptr, 16);
- DEBUG_PRINTF("derive: ptr: %p mainstart %p\n", ptr, mainStart);
- if (ptr < mainStart) {
- ptr = mainStart - 16;
- m128 p_mask;
- m128 val_0 = vectoredLoad128(&p_mask, ptr, a->buf, buf_end,
- a->buf_history, a->len_history, 3);
- m128 r_0 = prep_conf_teddy_m3(maskBase, &res_old_1, &res_old_2,
- val_0);
- r_0 = and128(r_0, p_mask);
- CONFIRM_TEDDY(r_0, 8, 0, VECTORING, do_confWithBit_teddy);
- ptr += 16;
- }
-
- if (ptr + 16 < buf_end) {
- m128 r_0 = prep_conf_teddy_m3(maskBase, &res_old_1, &res_old_2,
- load128(ptr));
- CONFIRM_TEDDY(r_0, 8, 0, VECTORING, do_confWithBit_teddy);
- ptr += 16;
- }
-
- for (; ptr + iterBytes <= buf_end; ptr += iterBytes) {
- __builtin_prefetch(ptr + (iterBytes*4));
- CHECK_FLOOD;
- m128 r_0 = prep_conf_teddy_m3(maskBase, &res_old_1, &res_old_2,
- load128(ptr));
- CONFIRM_TEDDY(r_0, 8, 0, NOT_CAUTIOUS, do_confWithBit_teddy);
- m128 r_1 = prep_conf_teddy_m3(maskBase, &res_old_1, &res_old_2,
- load128(ptr + 16));
- CONFIRM_TEDDY(r_1, 8, 16, NOT_CAUTIOUS, do_confWithBit_teddy);
- }
-
- for (; ptr < buf_end; ptr += 16) {
- m128 p_mask;
- m128 val_0 = vectoredLoad128(&p_mask, ptr, a->buf, buf_end,
- a->buf_history, a->len_history, 3);
- m128 r_0 = prep_conf_teddy_m3(maskBase, &res_old_1, &res_old_2, val_0);
- r_0 = and128(r_0, p_mask);
- CONFIRM_TEDDY(r_0, 8, 0, VECTORING, do_confWithBit_teddy);
- }
-
- return HWLM_SUCCESS;
+ FDR_EXEC_TEDDY(fdr, a, control, 3, do_confWithBit_teddy);
}
hwlm_error_t fdr_exec_teddy_msks4(const struct FDR *fdr,
const struct FDR_Runtime_Args *a,
hwlm_group_t control) {
- const u8 *buf_end = a->buf + a->len;
- const u8 *ptr = a->buf + a->start_offset;
- u32 floodBackoff = FLOOD_BACKOFF_START;
- const u8 *tryFloodDetect = a->firstFloodDetect;
- u32 last_match = (u32)-1;
- const struct Teddy *teddy = (const struct Teddy *)fdr;
- const size_t iterBytes = 32;
- DEBUG_PRINTF("params: buf %p len %zu start_offset %zu\n",
- a->buf, a->len, a->start_offset);
-
- const m128 *maskBase = getMaskBase(teddy);
- const u32 *confBase = getConfBase(teddy);
-
- m128 res_old_1 = ones128();
- m128 res_old_2 = ones128();
- m128 res_old_3 = ones128();
- const u8 *mainStart = ROUNDUP_PTR(ptr, 16);
- DEBUG_PRINTF("derive: ptr: %p mainstart %p\n", ptr, mainStart);
- if (ptr < mainStart) {
- ptr = mainStart - 16;
- m128 p_mask;
- m128 val_0 = vectoredLoad128(&p_mask, ptr, a->buf, buf_end,
- a->buf_history, a->len_history, 4);
- m128 r_0 = prep_conf_teddy_m4(maskBase, &res_old_1, &res_old_2,
- &res_old_3, val_0);
- r_0 = and128(r_0, p_mask);
- CONFIRM_TEDDY(r_0, 8, 0, VECTORING, do_confWithBitMany_teddy);
- ptr += 16;
- }
-
- if (ptr + 16 < buf_end) {
- m128 r_0 = prep_conf_teddy_m4(maskBase, &res_old_1, &res_old_2,
- &res_old_3, load128(ptr));
- CONFIRM_TEDDY(r_0, 8, 0, VECTORING, do_confWithBitMany_teddy);
- ptr += 16;
- }
-
- for (; ptr + iterBytes <= buf_end; ptr += iterBytes) {
- __builtin_prefetch(ptr + (iterBytes*4));
- CHECK_FLOOD;
- m128 r_0 = prep_conf_teddy_m4(maskBase, &res_old_1, &res_old_2,
- &res_old_3, load128(ptr));
- CONFIRM_TEDDY(r_0, 8, 0, NOT_CAUTIOUS, do_confWithBitMany_teddy);
- m128 r_1 = prep_conf_teddy_m4(maskBase, &res_old_1, &res_old_2,
- &res_old_3, load128(ptr + 16));
- CONFIRM_TEDDY(r_1, 8, 16, NOT_CAUTIOUS, do_confWithBitMany_teddy);
- }
-
- for (; ptr < buf_end; ptr += 16) {
- m128 p_mask;
- m128 val_0 = vectoredLoad128(&p_mask, ptr, a->buf, buf_end,
- a->buf_history, a->len_history, 4);
- m128 r_0 = prep_conf_teddy_m4(maskBase, &res_old_1, &res_old_2,
- &res_old_3, val_0);
- r_0 = and128(r_0, p_mask);
- CONFIRM_TEDDY(r_0, 8, 0, VECTORING, do_confWithBitMany_teddy);
- }
-
- return HWLM_SUCCESS;
+ FDR_EXEC_TEDDY(fdr, a, control, 4, do_confWithBit_teddy);
}
hwlm_error_t fdr_exec_teddy_msks4_pck(const struct FDR *fdr,
const struct FDR_Runtime_Args *a,
hwlm_group_t control) {
- const u8 *buf_end = a->buf + a->len;
- const u8 *ptr = a->buf + a->start_offset;
- u32 floodBackoff = FLOOD_BACKOFF_START;
- const u8 *tryFloodDetect = a->firstFloodDetect;
- u32 last_match = (u32)-1;
- const struct Teddy *teddy = (const struct Teddy *)fdr;
- const size_t iterBytes = 32;
- DEBUG_PRINTF("params: buf %p len %zu start_offset %zu\n",
- a->buf, a->len, a->start_offset);
-
- const m128 *maskBase = getMaskBase(teddy);
- const u32 *confBase = getConfBase(teddy);
-
- m128 res_old_1 = ones128();
- m128 res_old_2 = ones128();
- m128 res_old_3 = ones128();
- const u8 *mainStart = ROUNDUP_PTR(ptr, 16);
- DEBUG_PRINTF("derive: ptr: %p mainstart %p\n", ptr, mainStart);
- if (ptr < mainStart) {
- ptr = mainStart - 16;
- m128 p_mask;
- m128 val_0 = vectoredLoad128(&p_mask, ptr, a->buf, buf_end,
- a->buf_history, a->len_history, 4);
- m128 r_0 = prep_conf_teddy_m4(maskBase, &res_old_1, &res_old_2,
- &res_old_3, val_0);
- r_0 = and128(r_0, p_mask);
- CONFIRM_TEDDY(r_0, 8, 0, VECTORING, do_confWithBit_teddy);
- ptr += 16;
- }
-
- if (ptr + 16 < buf_end) {
- m128 r_0 = prep_conf_teddy_m4(maskBase, &res_old_1, &res_old_2,
- &res_old_3, load128(ptr));
- CONFIRM_TEDDY(r_0, 8, 0, VECTORING, do_confWithBit_teddy);
- ptr += 16;
- }
-
- for (; ptr + iterBytes <= buf_end; ptr += iterBytes) {
- __builtin_prefetch(ptr + (iterBytes*4));
- CHECK_FLOOD;
- m128 r_0 = prep_conf_teddy_m4(maskBase, &res_old_1, &res_old_2,
- &res_old_3, load128(ptr));
- CONFIRM_TEDDY(r_0, 8, 0, NOT_CAUTIOUS, do_confWithBit_teddy);
- m128 r_1 = prep_conf_teddy_m4(maskBase, &res_old_1, &res_old_2,
- &res_old_3, load128(ptr + 16));
- CONFIRM_TEDDY(r_1, 8, 16, NOT_CAUTIOUS, do_confWithBit_teddy);
- }
-
- for (; ptr < buf_end; ptr += 16) {
- m128 p_mask;
- m128 val_0 = vectoredLoad128(&p_mask, ptr, a->buf, buf_end,
- a->buf_history, a->len_history, 4);
- m128 r_0 = prep_conf_teddy_m4(maskBase, &res_old_1, &res_old_2,
- &res_old_3, val_0);
- r_0 = and128(r_0, p_mask);
- CONFIRM_TEDDY(r_0, 8, 0, VECTORING, do_confWithBit_teddy);
- }
-
- return HWLM_SUCCESS;
+ FDR_EXEC_TEDDY(fdr, a, control, 4, do_confWithBit_teddy);
}
#if defined(HAVE_AVX2)
+const u8 ALIGN_AVX_DIRECTIVE p_mask_arr256[33][64] = {
+ {0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
+ 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff},
+ {0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
+ 0x00, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff},
+ {0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
+ 0x00, 0x00, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff},
+ {0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
+ 0x00, 0x00, 0x00, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff},
+ {0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
+ 0x00, 0x00, 0x00, 0x00, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff},
+ {0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
+ 0x00, 0x00, 0x00, 0x00, 0x00, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff},
+ {0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
+ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff},
+ {0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
+ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff},
+ {0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
+ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff},
+ {0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
+ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff},
+ {0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
+ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff},
+ {0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
+ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff},
+ {0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
+ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff},
+ {0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
+ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff},
+ {0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
+ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff},
+ {0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
+ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff},
+ {0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
+ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff},
+ {0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
+ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff},
+ {0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
+ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff},
+ {0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
+ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff},
+ {0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
+ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff},
+ {0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
+ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff},
+ {0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
+ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff},
+ {0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
+ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff},
+ {0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
+ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff},
+ {0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
+ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff},
+ {0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
+ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff},
+ {0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
+ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xff, 0xff, 0xff, 0xff, 0xff},
+ {0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
+ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xff, 0xff, 0xff, 0xff},
+ {0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
+ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xff, 0xff, 0xff},
+ {0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
+ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xff, 0xff},
+ {0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
+ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xff},
+ {0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
+ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00}
+};
+
#ifdef ARCH_64_BIT
#define CONFIRM_FAT_TEDDY(var, bucket, offset, reason, conf_fn) \
do { \
- if (unlikely(isnonzero256(var))) { \
+ if (unlikely(diff256(var, ones256()))) { \
m256 swap = swap128in256(var); \
m256 r = interleave256lo(var, swap); \
u64a part1 = extractlow64from256(r); \
r = interleave256hi(var, swap); \
u64a part3 = extractlow64from256(r); \
u64a part4 = extract64from256(r, 1); \
- if (unlikely(part1)) { \
+ if (unlikely(part1 != ones_u64a)) { \
+ part1 = ~part1; \
conf_fn(&part1, bucket, offset, confBase, reason, a, ptr, \
&control, &last_match); \
CHECK_HWLM_TERMINATE_MATCHING; \
} \
- if (unlikely(part2)) { \
+ if (unlikely(part2 != ones_u64a)) { \
+ part2 = ~part2; \
conf_fn(&part2, bucket, offset + 4, confBase, reason, a, ptr, \
&control, &last_match); \
CHECK_HWLM_TERMINATE_MATCHING; \
} \
- if (unlikely(part3)) { \
+ if (unlikely(part3 != ones_u64a)) { \
+ part3 = ~part3; \
conf_fn(&part3, bucket, offset + 8, confBase, reason, a, ptr, \
&control, &last_match); \
CHECK_HWLM_TERMINATE_MATCHING; \
} \
- if (unlikely(part4)) { \
+ if (unlikely(part4 != ones_u64a)) { \
+ part4 = ~part4; \
conf_fn(&part4, bucket, offset + 12, confBase, reason, a, ptr, \
&control, &last_match); \
CHECK_HWLM_TERMINATE_MATCHING; \
} \
} \
-} while (0);
+} while(0)
#else
#define CONFIRM_FAT_TEDDY(var, bucket, offset, reason, conf_fn) \
do { \
- if (unlikely(isnonzero256(var))) { \
+ if (unlikely(diff256(var, ones256()))) { \
m256 swap = swap128in256(var); \
m256 r = interleave256lo(var, swap); \
u32 part1 = extractlow32from256(r); \
u32 part6 = extract32from256(r, 1); \
u32 part7 = extract32from256(r, 2); \
u32 part8 = extract32from256(r, 3); \
- if (unlikely(part1)) { \
+ if (unlikely(part1 != ones_u32)) { \
+ part1 = ~part1; \
conf_fn(&part1, bucket, offset, confBase, reason, a, ptr, \
&control, &last_match); \
CHECK_HWLM_TERMINATE_MATCHING; \
} \
- if (unlikely(part2)) { \
+ if (unlikely(part2 != ones_u32)) { \
+ part2 = ~part2; \
conf_fn(&part2, bucket, offset + 2, confBase, reason, a, ptr, \
&control, &last_match); \
} \
- if (unlikely(part3)) { \
+ if (unlikely(part3 != ones_u32)) { \
+ part3 = ~part3; \
conf_fn(&part3, bucket, offset + 4, confBase, reason, a, ptr, \
&control, &last_match); \
CHECK_HWLM_TERMINATE_MATCHING; \
} \
- if (unlikely(part4)) { \
+ if (unlikely(part4 != ones_u32)) { \
+ part4 = ~part4; \
conf_fn(&part4, bucket, offset + 6, confBase, reason, a, ptr, \
&control, &last_match); \
CHECK_HWLM_TERMINATE_MATCHING; \
} \
- if (unlikely(part5)) { \
+ if (unlikely(part5 != ones_u32)) { \
+ part5 = ~part5; \
conf_fn(&part5, bucket, offset + 8, confBase, reason, a, ptr, \
&control, &last_match); \
CHECK_HWLM_TERMINATE_MATCHING; \
} \
- if (unlikely(part6)) { \
+ if (unlikely(part6 != ones_u32)) { \
+ part6 = ~part6; \
conf_fn(&part6, bucket, offset + 10, confBase, reason, a, ptr, \
&control, &last_match); \
CHECK_HWLM_TERMINATE_MATCHING; \
} \
- if (unlikely(part7)) { \
+ if (unlikely(part7 != ones_u32)) { \
+ part7 = ~part7; \
conf_fn(&part7, bucket, offset + 12, confBase, reason, a, ptr, \
&control, &last_match); \
CHECK_HWLM_TERMINATE_MATCHING; \
} \
- if (unlikely(part8)) { \
+ if (unlikely(part8 != ones_u32)) { \
+ part8 = ~part8; \
conf_fn(&part8, bucket, offset + 14, confBase, reason, a, ptr, \
&control, &last_match); \
CHECK_HWLM_TERMINATE_MATCHING; \
} \
} \
-} while (0);
+} while(0)
#endif
static really_inline
-m256 vectoredLoad2x128(m256 *p_mask, const u8 *ptr, const u8 *lo, const u8 *hi,
+m256 vectoredLoad2x128(m256 *p_mask, const u8 *ptr, const size_t start_offset,
+ const u8 *lo, const u8 *hi,
const u8 *buf_history, size_t len_history,
const u32 nMasks) {
m128 p_mask128;
- m256 ret = set2x128(vectoredLoad128(&p_mask128, ptr, lo, hi, buf_history,
- len_history, nMasks));
+ m256 ret = set2x128(vectoredLoad128(&p_mask128, ptr, start_offset, lo, hi,
+ buf_history, len_history, nMasks));
*p_mask = set2x128(p_mask128);
return ret;
}
m256 mask = set32x8(0xf);
m256 lo = and256(val, mask);
m256 hi = and256(rshift64_m256(val, 4), mask);
- return and256(pshufb_m256(maskBase[0*2], lo),
- pshufb_m256(maskBase[0*2+1], hi));
+ return or256(pshufb_m256(maskBase[0 * 2], lo),
+ pshufb_m256(maskBase[0 * 2 + 1], hi));
}
static really_inline
m256 hi = and256(rshift64_m256(val, 4), mask);
m256 r = prep_conf_fat_teddy_m1(maskBase, val);
- m256 res_1 = and256(pshufb_m256(maskBase[1*2], lo),
- pshufb_m256(maskBase[1*2+1], hi));
- m256 res_shifted_1 = vpalignr(res_1, *old_1, 16-1);
+ m256 res_1 = or256(pshufb_m256(maskBase[1 * 2], lo),
+ pshufb_m256(maskBase[1 * 2 + 1], hi));
+ m256 res_shifted_1 = vpalignr(res_1, *old_1, 16 - 1);
*old_1 = res_1;
- return and256(r, res_shifted_1);
+ return or256(r, res_shifted_1);
}
static really_inline
m256 hi = and256(rshift64_m256(val, 4), mask);
m256 r = prep_conf_fat_teddy_m2(maskBase, old_1, val);
- m256 res_2 = and256(pshufb_m256(maskBase[2*2], lo),
- pshufb_m256(maskBase[2*2+1], hi));
- m256 res_shifted_2 = vpalignr(res_2, *old_2, 16-2);
+ m256 res_2 = or256(pshufb_m256(maskBase[2 * 2], lo),
+ pshufb_m256(maskBase[2 * 2 + 1], hi));
+ m256 res_shifted_2 = vpalignr(res_2, *old_2, 16 - 2);
*old_2 = res_2;
- return and256(r, res_shifted_2);
+ return or256(r, res_shifted_2);
}
static really_inline
m256 hi = and256(rshift64_m256(val, 4), mask);
m256 r = prep_conf_fat_teddy_m3(maskBase, old_1, old_2, val);
- m256 res_3 = and256(pshufb_m256(maskBase[3*2], lo),
- pshufb_m256(maskBase[3*2+1], hi));
- m256 res_shifted_3 = vpalignr(res_3, *old_3, 16-3);
+ m256 res_3 = or256(pshufb_m256(maskBase[3 * 2], lo),
+ pshufb_m256(maskBase[3 * 2 + 1], hi));
+ m256 res_shifted_3 = vpalignr(res_3, *old_3, 16 - 3);
*old_3 = res_3;
- return and256(r, res_shifted_3);
+ return or256(r, res_shifted_3);
}
static really_inline
return (const m256 *)((const u8 *)teddy + ROUNDUP_CL(sizeof(struct Teddy)));
}
+#define FDR_EXEC_FAT_TEDDY_RES_OLD_1 \
+do { \
+} while(0)
+
+#define FDR_EXEC_FAT_TEDDY_RES_OLD_2 \
+ m256 res_old_1 = zeroes256();
+
+#define FDR_EXEC_FAT_TEDDY_RES_OLD_3 \
+ m256 res_old_1 = zeroes256(); \
+ m256 res_old_2 = zeroes256();
+
+#define FDR_EXEC_FAT_TEDDY_RES_OLD_4 \
+ m256 res_old_1 = zeroes256(); \
+ m256 res_old_2 = zeroes256(); \
+ m256 res_old_3 = zeroes256();
+
+#define FDR_EXEC_FAT_TEDDY_RES_OLD(n) FDR_EXEC_FAT_TEDDY_RES_OLD_##n
+
+#define PREP_CONF_FAT_FN_1(mask_base, val) \
+ prep_conf_fat_teddy_m1(mask_base, val)
+
+#define PREP_CONF_FAT_FN_2(mask_base, val) \
+ prep_conf_fat_teddy_m2(mask_base, &res_old_1, val)
+
+#define PREP_CONF_FAT_FN_3(mask_base, val) \
+ prep_conf_fat_teddy_m3(mask_base, &res_old_1, &res_old_2, val)
+
+#define PREP_CONF_FAT_FN_4(mask_base, val) \
+ prep_conf_fat_teddy_m4(mask_base, &res_old_1, &res_old_2, &res_old_3, val)
+
+#define PREP_CONF_FAT_FN(mask_base, val, n) \
+ PREP_CONF_FAT_FN_##n(mask_base, val)
+
+#define FDR_EXEC_FAT_TEDDY(fdr, a, control, n_msk, conf_fn) \
+do { \
+ const u8 *buf_end = a->buf + a->len; \
+ const u8 *ptr = a->buf + a->start_offset; \
+ u32 floodBackoff = FLOOD_BACKOFF_START; \
+ const u8 *tryFloodDetect = a->firstFloodDetect; \
+ u32 last_match = (u32)-1; \
+ const struct Teddy *teddy = (const struct Teddy *)fdr; \
+ const size_t iterBytes = 32; \
+ DEBUG_PRINTF("params: buf %p len %zu start_offset %zu\n", \
+ a->buf, a->len, a->start_offset); \
+ \
+ const m256 *maskBase = getMaskBase_avx2(teddy); \
+ const u32 *confBase = getConfBase(teddy); \
+ \
+ FDR_EXEC_FAT_TEDDY_RES_OLD(n_msk); \
+ const u8 *mainStart = ROUNDUP_PTR(ptr, 16); \
+ DEBUG_PRINTF("derive: ptr: %p mainstart %p\n", ptr, mainStart); \
+ if (ptr < mainStart) { \
+ ptr = mainStart - 16; \
+ m256 p_mask; \
+ m256 val_0 = vectoredLoad2x128(&p_mask, ptr, a->start_offset, \
+ a->buf, buf_end, \
+ a->buf_history, a->len_history, \
+ n_msk); \
+ m256 r_0 = PREP_CONF_FAT_FN(maskBase, val_0, n_msk); \
+ r_0 = or256(r_0, p_mask); \
+ CONFIRM_FAT_TEDDY(r_0, 16, 0, VECTORING, conf_fn); \
+ ptr += 16; \
+ } \
+ \
+ if (ptr + 16 <= buf_end) { \
+ m256 r_0 = PREP_CONF_FAT_FN(maskBase, load2x128(ptr), n_msk); \
+ CONFIRM_FAT_TEDDY(r_0, 16, 0, VECTORING, conf_fn); \
+ ptr += 16; \
+ } \
+ \
+ for ( ; ptr + iterBytes <= buf_end; ptr += iterBytes) { \
+ __builtin_prefetch(ptr + (iterBytes * 4)); \
+ CHECK_FLOOD; \
+ m256 r_0 = PREP_CONF_FAT_FN(maskBase, load2x128(ptr), n_msk); \
+ CONFIRM_FAT_TEDDY(r_0, 16, 0, NOT_CAUTIOUS, conf_fn); \
+ m256 r_1 = PREP_CONF_FAT_FN(maskBase, load2x128(ptr + 16), n_msk); \
+ CONFIRM_FAT_TEDDY(r_1, 16, 16, NOT_CAUTIOUS, conf_fn); \
+ } \
+ \
+ if (ptr + 16 <= buf_end) { \
+ m256 r_0 = PREP_CONF_FAT_FN(maskBase, load2x128(ptr), n_msk); \
+ CONFIRM_FAT_TEDDY(r_0, 16, 0, NOT_CAUTIOUS, conf_fn); \
+ ptr += 16; \
+ } \
+ \
+ assert(ptr + 16 > buf_end); \
+ if (ptr < buf_end) { \
+ m256 p_mask; \
+ m256 val_0 = vectoredLoad2x128(&p_mask, ptr, 0, ptr, buf_end, \
+ a->buf_history, a->len_history, \
+ n_msk); \
+ m256 r_0 = PREP_CONF_FAT_FN(maskBase, val_0, n_msk); \
+ r_0 = or256(r_0, p_mask); \
+ CONFIRM_FAT_TEDDY(r_0, 16, 0, VECTORING, conf_fn); \
+ } \
+ \
+ return HWLM_SUCCESS; \
+} while(0)
+
hwlm_error_t fdr_exec_teddy_avx2_msks1_fat(const struct FDR *fdr,
const struct FDR_Runtime_Args *a,
hwlm_group_t control) {
- const u8 *buf_end = a->buf + a->len;
- const u8 *ptr = a->buf + a->start_offset;
- u32 floodBackoff = FLOOD_BACKOFF_START;
- const u8 *tryFloodDetect = a->firstFloodDetect;
- u32 last_match = (u32)-1;
- const struct Teddy *teddy = (const struct Teddy *)fdr;
- const size_t iterBytes = 32;
- DEBUG_PRINTF("params: buf %p len %zu start_offset %zu\n",
- a->buf, a->len, a->start_offset);
-
- const m256 *maskBase = getMaskBase_avx2(teddy);
- const u32 *confBase = getConfBase(teddy);
-
- const u8 *mainStart = ROUNDUP_PTR(ptr, 16);
- DEBUG_PRINTF("derive: ptr: %p mainstart %p\n", ptr, mainStart);
- if (ptr < mainStart) {
- ptr = mainStart - 16;
- m256 p_mask;
- m256 val_0 = vectoredLoad2x128(&p_mask, ptr, a->buf, buf_end,
- a->buf_history, a->len_history, 1);
- m256 r_0 = prep_conf_fat_teddy_m1(maskBase, val_0);
- r_0 = and256(r_0, p_mask);
- CONFIRM_FAT_TEDDY(r_0, 16, 0, VECTORING, do_confWithBit1_teddy);
- ptr += 16;
- }
-
- if (ptr + 16 < buf_end) {
- m256 r_0 = prep_conf_fat_teddy_m1(maskBase, load2x128(ptr));
- CONFIRM_FAT_TEDDY(r_0, 16, 0, VECTORING, do_confWithBit1_teddy);
- ptr += 16;
- }
-
- for ( ; ptr + iterBytes <= buf_end; ptr += iterBytes) {
- __builtin_prefetch(ptr + (iterBytes*4));
- CHECK_FLOOD;
- m256 r_0 = prep_conf_fat_teddy_m1(maskBase, load2x128(ptr));
- CONFIRM_FAT_TEDDY(r_0, 16, 0, NOT_CAUTIOUS, do_confWithBit1_teddy);
- m256 r_1 = prep_conf_fat_teddy_m1(maskBase, load2x128(ptr + 16));
- CONFIRM_FAT_TEDDY(r_1, 16, 16, NOT_CAUTIOUS, do_confWithBit1_teddy);
- }
-
- for (; ptr < buf_end; ptr += 16) {
- m256 p_mask;
- m256 val_0 = vectoredLoad2x128(&p_mask, ptr, a->buf, buf_end,
- a->buf_history, a->len_history, 1);
- m256 r_0 = prep_conf_fat_teddy_m1(maskBase, val_0);
- r_0 = and256(r_0, p_mask);
- CONFIRM_FAT_TEDDY(r_0, 16, 0, VECTORING, do_confWithBit1_teddy);
- }
-
- return HWLM_SUCCESS;
+ FDR_EXEC_FAT_TEDDY(fdr, a, control, 1, do_confWithBit_teddy);
}
hwlm_error_t fdr_exec_teddy_avx2_msks1_pck_fat(const struct FDR *fdr,
const struct FDR_Runtime_Args *a,
hwlm_group_t control) {
- const u8 *buf_end = a->buf + a->len;
- const u8 *ptr = a->buf + a->start_offset;
- u32 floodBackoff = FLOOD_BACKOFF_START;
- const u8 *tryFloodDetect = a->firstFloodDetect;
- u32 last_match = (u32)-1;
- const struct Teddy *teddy = (const struct Teddy *)fdr;
- const size_t iterBytes = 32;
- DEBUG_PRINTF("params: buf %p len %zu start_offset %zu\n",
- a->buf, a->len, a->start_offset);
-
- const m256 *maskBase = getMaskBase_avx2(teddy);
- const u32 *confBase = getConfBase(teddy);
-
- const u8 *mainStart = ROUNDUP_PTR(ptr, 16);
- DEBUG_PRINTF("derive: ptr: %p mainstart %p\n", ptr, mainStart);
- if (ptr < mainStart) {
- ptr = mainStart - 16;
- m256 p_mask;
- m256 val_0 = vectoredLoad2x128(&p_mask, ptr, a->buf, buf_end,
- a->buf_history, a->len_history, 1);
- m256 r_0 = prep_conf_fat_teddy_m1(maskBase, val_0);
- r_0 = and256(r_0, p_mask);
- CONFIRM_FAT_TEDDY(r_0, 16, 0, VECTORING, do_confWithBit_teddy);
- ptr += 16;
- }
-
- if (ptr + 16 < buf_end) {
- m256 r_0 = prep_conf_fat_teddy_m1(maskBase, load2x128(ptr));
- CONFIRM_FAT_TEDDY(r_0, 16, 0, VECTORING, do_confWithBit_teddy);
- ptr += 16;
- }
-
- for ( ; ptr + iterBytes <= buf_end; ptr += iterBytes) {
- __builtin_prefetch(ptr + (iterBytes*4));
- CHECK_FLOOD;
- m256 r_0 = prep_conf_fat_teddy_m1(maskBase, load2x128(ptr));
- CONFIRM_FAT_TEDDY(r_0, 16, 0, NOT_CAUTIOUS, do_confWithBit_teddy);
- m256 r_1 = prep_conf_fat_teddy_m1(maskBase, load2x128(ptr + 16));
- CONFIRM_FAT_TEDDY(r_1, 16, 16, NOT_CAUTIOUS, do_confWithBit_teddy);
- }
-
- for (; ptr < buf_end; ptr += 16) {
- m256 p_mask;
- m256 val_0 = vectoredLoad2x128(&p_mask, ptr, a->buf, buf_end,
- a->buf_history, a->len_history, 1);
- m256 r_0 = prep_conf_fat_teddy_m1(maskBase, val_0);
- r_0 = and256(r_0, p_mask);
- CONFIRM_FAT_TEDDY(r_0, 16, 0, VECTORING, do_confWithBit_teddy);
- }
-
- return HWLM_SUCCESS;
+ FDR_EXEC_FAT_TEDDY(fdr, a, control, 1, do_confWithBit_teddy);
}
hwlm_error_t fdr_exec_teddy_avx2_msks2_fat(const struct FDR *fdr,
const struct FDR_Runtime_Args *a,
hwlm_group_t control) {
- const u8 *buf_end = a->buf + a->len;
- const u8 *ptr = a->buf + a->start_offset;
- u32 floodBackoff = FLOOD_BACKOFF_START;
- const u8 *tryFloodDetect = a->firstFloodDetect;
- u32 last_match = (u32)-1;
- const struct Teddy *teddy = (const struct Teddy *)fdr;
- const size_t iterBytes = 32;
- DEBUG_PRINTF("params: buf %p len %zu start_offset %zu\n",
- a->buf, a->len, a->start_offset);
-
- const m256 *maskBase = getMaskBase_avx2(teddy);
- const u32 *confBase = getConfBase(teddy);
-
- m256 res_old_1 = ones256();
- const u8 *mainStart = ROUNDUP_PTR(ptr, 16);
- DEBUG_PRINTF("derive: ptr: %p mainstart %p\n", ptr, mainStart);
- if (ptr < mainStart) {
- ptr = mainStart - 16;
- m256 p_mask;
- m256 val_0 = vectoredLoad2x128(&p_mask, ptr, a->buf, buf_end,
- a->buf_history, a->len_history, 2);
- m256 r_0 = prep_conf_fat_teddy_m2(maskBase, &res_old_1, val_0);
- r_0 = and256(r_0, p_mask);
- CONFIRM_FAT_TEDDY(r_0, 16, 0, VECTORING, do_confWithBitMany_teddy);
- ptr += 16;
- }
-
- if (ptr + 16 < buf_end) {
- m256 r_0 = prep_conf_fat_teddy_m2(maskBase, &res_old_1, load2x128(ptr));
- CONFIRM_FAT_TEDDY(r_0, 16, 0, VECTORING, do_confWithBitMany_teddy);
- ptr += 16;
- }
-
- for ( ; ptr + iterBytes <= buf_end; ptr += iterBytes) {
- __builtin_prefetch(ptr + (iterBytes*4));
- CHECK_FLOOD;
- m256 r_0 = prep_conf_fat_teddy_m2(maskBase, &res_old_1, load2x128(ptr));
- CONFIRM_FAT_TEDDY(r_0, 16, 0, NOT_CAUTIOUS, do_confWithBitMany_teddy);
- m256 r_1 = prep_conf_fat_teddy_m2(maskBase, &res_old_1,
- load2x128(ptr + 16));
- CONFIRM_FAT_TEDDY(r_1, 16, 16, NOT_CAUTIOUS, do_confWithBitMany_teddy);
- }
-
- for (; ptr < buf_end; ptr += 16) {
- m256 p_mask;
- m256 val_0 = vectoredLoad2x128(&p_mask, ptr, a->buf, buf_end,
- a->buf_history, a->len_history, 2);
- m256 r_0 = prep_conf_fat_teddy_m2(maskBase, &res_old_1, val_0);
- r_0 = and256(r_0, p_mask);
- CONFIRM_FAT_TEDDY(r_0, 16, 0, VECTORING, do_confWithBitMany_teddy);
- }
-
- return HWLM_SUCCESS;
+ FDR_EXEC_FAT_TEDDY(fdr, a, control, 2, do_confWithBit_teddy);
}
hwlm_error_t fdr_exec_teddy_avx2_msks2_pck_fat(const struct FDR *fdr,
const struct FDR_Runtime_Args *a,
hwlm_group_t control) {
- const u8 *buf_end = a->buf + a->len;
- const u8 *ptr = a->buf + a->start_offset;
- u32 floodBackoff = FLOOD_BACKOFF_START;
- const u8 *tryFloodDetect = a->firstFloodDetect;
- u32 last_match = (u32)-1;
- const struct Teddy *teddy = (const struct Teddy *)fdr;
- const size_t iterBytes = 32;
- DEBUG_PRINTF("params: buf %p len %zu start_offset %zu\n",
- a->buf, a->len, a->start_offset);
-
- const m256 *maskBase = getMaskBase_avx2(teddy);
- const u32 *confBase = getConfBase(teddy);
-
- m256 res_old_1 = ones256();
- const u8 *mainStart = ROUNDUP_PTR(ptr, 16);
- DEBUG_PRINTF("derive: ptr: %p mainstart %p\n", ptr, mainStart);
- if (ptr < mainStart) {
- ptr = mainStart - 16;
- m256 p_mask;
- m256 val_0 = vectoredLoad2x128(&p_mask, ptr, a->buf, buf_end,
- a->buf_history, a->len_history, 2);
- m256 r_0 = prep_conf_fat_teddy_m2(maskBase, &res_old_1, val_0);
- r_0 = and256(r_0, p_mask);
- CONFIRM_FAT_TEDDY(r_0, 16, 0, VECTORING, do_confWithBit_teddy);
- ptr += 16;
- }
-
- if (ptr + 16 < buf_end) {
- m256 r_0 = prep_conf_fat_teddy_m2(maskBase, &res_old_1, load2x128(ptr));
- CONFIRM_FAT_TEDDY(r_0, 16, 0, VECTORING, do_confWithBit_teddy);
- ptr += 16;
- }
-
- for ( ; ptr + iterBytes <= buf_end; ptr += iterBytes) {
- __builtin_prefetch(ptr + (iterBytes*4));
- CHECK_FLOOD;
- m256 r_0 = prep_conf_fat_teddy_m2(maskBase, &res_old_1, load2x128(ptr));
- CONFIRM_FAT_TEDDY(r_0, 16, 0, NOT_CAUTIOUS, do_confWithBit_teddy);
- m256 r_1 = prep_conf_fat_teddy_m2(maskBase, &res_old_1,
- load2x128(ptr + 16));
- CONFIRM_FAT_TEDDY(r_1, 16, 16, NOT_CAUTIOUS, do_confWithBit_teddy);
- }
-
- for (; ptr < buf_end; ptr += 16) {
- m256 p_mask;
- m256 val_0 = vectoredLoad2x128(&p_mask, ptr, a->buf, buf_end,
- a->buf_history, a->len_history, 2);
- m256 r_0 = prep_conf_fat_teddy_m2(maskBase, &res_old_1, val_0);
- r_0 = and256(r_0, p_mask);
- CONFIRM_FAT_TEDDY(r_0, 16, 0, VECTORING, do_confWithBit_teddy);
- }
-
- return HWLM_SUCCESS;
+ FDR_EXEC_FAT_TEDDY(fdr, a, control, 2, do_confWithBit_teddy);
}
hwlm_error_t fdr_exec_teddy_avx2_msks3_fat(const struct FDR *fdr,
const struct FDR_Runtime_Args *a,
hwlm_group_t control) {
- const u8 *buf_end = a->buf + a->len;
- const u8 *ptr = a->buf + a->start_offset;
- u32 floodBackoff = FLOOD_BACKOFF_START;
- const u8 *tryFloodDetect = a->firstFloodDetect;
- u32 last_match = (u32)-1;
- const struct Teddy *teddy = (const struct Teddy *)fdr;
- const size_t iterBytes = 32;
- DEBUG_PRINTF("params: buf %p len %zu start_offset %zu\n",
- a->buf, a->len, a->start_offset);
-
- const m256 *maskBase = getMaskBase_avx2(teddy);
- const u32 *confBase = getConfBase(teddy);
-
- m256 res_old_1 = ones256();
- m256 res_old_2 = ones256();
- const u8 *mainStart = ROUNDUP_PTR(ptr, 16);
- DEBUG_PRINTF("derive: ptr: %p mainstart %p\n", ptr, mainStart);
- if (ptr < mainStart) {
- ptr = mainStart - 16;
- m256 p_mask;
- m256 val_0 = vectoredLoad2x128(&p_mask, ptr, a->buf, buf_end,
- a->buf_history, a->len_history, 3);
- m256 r_0 = prep_conf_fat_teddy_m3(maskBase, &res_old_1, &res_old_2,
- val_0);
- r_0 = and256(r_0, p_mask);
- CONFIRM_FAT_TEDDY(r_0, 16, 0, VECTORING, do_confWithBitMany_teddy);
- ptr += 16;
- }
-
- if (ptr + 16 < buf_end) {
- m256 r_0 = prep_conf_fat_teddy_m3(maskBase, &res_old_1, &res_old_2,
- load2x128(ptr));
- CONFIRM_FAT_TEDDY(r_0, 16, 0, VECTORING, do_confWithBitMany_teddy);
- ptr += 16;
- }
-
- for ( ; ptr + iterBytes <= buf_end; ptr += iterBytes) {
- __builtin_prefetch(ptr + (iterBytes*4));
- CHECK_FLOOD;
- m256 r_0 = prep_conf_fat_teddy_m3(maskBase, &res_old_1, &res_old_2,
- load2x128(ptr));
- CONFIRM_FAT_TEDDY(r_0, 16, 0, NOT_CAUTIOUS, do_confWithBitMany_teddy);
- m256 r_1 = prep_conf_fat_teddy_m3(maskBase, &res_old_1, &res_old_2,
- load2x128(ptr + 16));
- CONFIRM_FAT_TEDDY(r_1, 16, 16, NOT_CAUTIOUS, do_confWithBitMany_teddy);
- }
-
- for (; ptr < buf_end; ptr += 16) {
- m256 p_mask;
- m256 val_0 = vectoredLoad2x128(&p_mask, ptr, a->buf, buf_end,
- a->buf_history, a->len_history, 3);
- m256 r_0 = prep_conf_fat_teddy_m3(maskBase, &res_old_1, &res_old_2,
- val_0);
- r_0 = and256(r_0, p_mask);
- CONFIRM_FAT_TEDDY(r_0, 16, 0, VECTORING, do_confWithBitMany_teddy);
- }
-
- return HWLM_SUCCESS;
+ FDR_EXEC_FAT_TEDDY(fdr, a, control, 3, do_confWithBit_teddy);
}
hwlm_error_t fdr_exec_teddy_avx2_msks3_pck_fat(const struct FDR *fdr,
const struct FDR_Runtime_Args *a,
hwlm_group_t control) {
- const u8 *buf_end = a->buf + a->len;
- const u8 *ptr = a->buf + a->start_offset;
- u32 floodBackoff = FLOOD_BACKOFF_START;
- const u8 *tryFloodDetect = a->firstFloodDetect;
- u32 last_match = (u32)-1;
- const struct Teddy *teddy = (const struct Teddy *)fdr;
- const size_t iterBytes = 32;
- DEBUG_PRINTF("params: buf %p len %zu start_offset %zu\n",
- a->buf, a->len, a->start_offset);
-
- const m256 *maskBase = getMaskBase_avx2(teddy);
- const u32 *confBase = getConfBase(teddy);
-
- m256 res_old_1 = ones256();
- m256 res_old_2 = ones256();
- const u8 *mainStart = ROUNDUP_PTR(ptr, 16);
- DEBUG_PRINTF("derive: ptr: %p mainstart %p\n", ptr, mainStart);
- if (ptr < mainStart) {
- ptr = mainStart - 16;
- m256 p_mask;
- m256 val_0 = vectoredLoad2x128(&p_mask, ptr, a->buf, buf_end,
- a->buf_history, a->len_history, 3);
- m256 r_0 = prep_conf_fat_teddy_m3(maskBase, &res_old_1, &res_old_2,
- val_0);
- r_0 = and256(r_0, p_mask);
- CONFIRM_FAT_TEDDY(r_0, 16, 0, VECTORING, do_confWithBit_teddy);
- ptr += 16;
- }
-
- if (ptr + 16 < buf_end) {
- m256 r_0 = prep_conf_fat_teddy_m3(maskBase, &res_old_1, &res_old_2,
- load2x128(ptr));
- CONFIRM_FAT_TEDDY(r_0, 16, 0, VECTORING, do_confWithBit_teddy);
- ptr += 16;
- }
-
- for ( ; ptr + iterBytes <= buf_end; ptr += iterBytes) {
- __builtin_prefetch(ptr + (iterBytes*4));
- CHECK_FLOOD;
- m256 r_0 = prep_conf_fat_teddy_m3(maskBase, &res_old_1, &res_old_2,
- load2x128(ptr));
- CONFIRM_FAT_TEDDY(r_0, 16, 0, NOT_CAUTIOUS, do_confWithBit_teddy);
- m256 r_1 = prep_conf_fat_teddy_m3(maskBase, &res_old_1, &res_old_2,
- load2x128(ptr + 16));
- CONFIRM_FAT_TEDDY(r_1, 16, 16, NOT_CAUTIOUS, do_confWithBit_teddy);
- }
-
- for (; ptr < buf_end; ptr += 16) {
- m256 p_mask;
- m256 val_0 = vectoredLoad2x128(&p_mask, ptr, a->buf, buf_end,
- a->buf_history, a->len_history, 3);
- m256 r_0 = prep_conf_fat_teddy_m3(maskBase, &res_old_1, &res_old_2,
- val_0);
- r_0 = and256(r_0, p_mask);
- CONFIRM_FAT_TEDDY(r_0, 16, 0, VECTORING, do_confWithBit_teddy);
- }
-
- return HWLM_SUCCESS;
+ FDR_EXEC_FAT_TEDDY(fdr, a, control, 3, do_confWithBit_teddy);
}
hwlm_error_t fdr_exec_teddy_avx2_msks4_fat(const struct FDR *fdr,
const struct FDR_Runtime_Args *a,
hwlm_group_t control) {
- const u8 *buf_end = a->buf + a->len;
- const u8 *ptr = a->buf + a->start_offset;
- u32 floodBackoff = FLOOD_BACKOFF_START;
- const u8 *tryFloodDetect = a->firstFloodDetect;
- u32 last_match = (u32)-1;
- const struct Teddy *teddy = (const struct Teddy *)fdr;
- const size_t iterBytes = 32;
- DEBUG_PRINTF("params: buf %p len %zu start_offset %zu\n",
- a->buf, a->len, a->start_offset);
-
- const m256 *maskBase = getMaskBase_avx2(teddy);
- const u32 *confBase = getConfBase(teddy);
-
- m256 res_old_1 = ones256();
- m256 res_old_2 = ones256();
- m256 res_old_3 = ones256();
- const u8 *mainStart = ROUNDUP_PTR(ptr, 16);
- DEBUG_PRINTF("derive: ptr: %p mainstart %p\n", ptr, mainStart);
- if (ptr < mainStart) {
- ptr = mainStart - 16;
- m256 p_mask;
- m256 val_0 = vectoredLoad2x128(&p_mask, ptr, a->buf, buf_end,
- a->buf_history, a->len_history, 4);
- m256 r_0 = prep_conf_fat_teddy_m4(maskBase, &res_old_1, &res_old_2,
- &res_old_3, val_0);
- r_0 = and256(r_0, p_mask);
- CONFIRM_FAT_TEDDY(r_0, 16, 0, VECTORING, do_confWithBitMany_teddy);
- ptr += 16;
- }
-
- if (ptr + 16 < buf_end) {
- m256 r_0 = prep_conf_fat_teddy_m4(maskBase, &res_old_1, &res_old_2,
- &res_old_3, load2x128(ptr));
- CONFIRM_FAT_TEDDY(r_0, 16, 0, VECTORING, do_confWithBitMany_teddy);
- ptr += 16;
- }
-
- for ( ; ptr + iterBytes <= buf_end; ptr += iterBytes) {
- __builtin_prefetch(ptr + (iterBytes*4));
- CHECK_FLOOD;
- m256 r_0 = prep_conf_fat_teddy_m4(maskBase, &res_old_1, &res_old_2,
- &res_old_3, load2x128(ptr));
- CONFIRM_FAT_TEDDY(r_0, 16, 0, NOT_CAUTIOUS, do_confWithBitMany_teddy);
- m256 r_1 = prep_conf_fat_teddy_m4(maskBase, &res_old_1, &res_old_2,
- &res_old_3, load2x128(ptr + 16));
- CONFIRM_FAT_TEDDY(r_1, 16, 16, NOT_CAUTIOUS, do_confWithBitMany_teddy);
- }
-
- for (; ptr < buf_end; ptr += 16) {
- m256 p_mask;
- m256 val_0 = vectoredLoad2x128(&p_mask, ptr, a->buf, buf_end,
- a->buf_history, a->len_history, 4);
- m256 r_0 = prep_conf_fat_teddy_m4(maskBase, &res_old_1, &res_old_2,
- &res_old_3, val_0);
- r_0 = and256(r_0, p_mask);
- CONFIRM_FAT_TEDDY(r_0, 16, 0, VECTORING, do_confWithBitMany_teddy);
- }
-
- return HWLM_SUCCESS;
+ FDR_EXEC_FAT_TEDDY(fdr, a, control, 4, do_confWithBit_teddy);
}
hwlm_error_t fdr_exec_teddy_avx2_msks4_pck_fat(const struct FDR *fdr,
const struct FDR_Runtime_Args *a,
hwlm_group_t control) {
- const u8 *buf_end = a->buf + a->len;
- const u8 *ptr = a->buf + a->start_offset;
- u32 floodBackoff = FLOOD_BACKOFF_START;
- const u8 *tryFloodDetect = a->firstFloodDetect;
- u32 last_match = (u32)-1;
- const struct Teddy *teddy = (const struct Teddy *)fdr;
- const size_t iterBytes = 32;
- DEBUG_PRINTF("params: buf %p len %zu start_offset %zu\n",
- a->buf, a->len, a->start_offset);
-
- const m256 *maskBase = getMaskBase_avx2(teddy);
- const u32 *confBase = getConfBase(teddy);
-
- m256 res_old_1 = ones256();
- m256 res_old_2 = ones256();
- m256 res_old_3 = ones256();
- const u8 *mainStart = ROUNDUP_PTR(ptr, 16);
- DEBUG_PRINTF("derive: ptr: %p mainstart %p\n", ptr, mainStart);
- if (ptr < mainStart) {
- ptr = mainStart - 16;
- m256 p_mask;
- m256 val_0 = vectoredLoad2x128(&p_mask, ptr, a->buf, buf_end,
- a->buf_history, a->len_history, 4);
- m256 r_0 = prep_conf_fat_teddy_m4(maskBase, &res_old_1, &res_old_2,
- &res_old_3, val_0);
- r_0 = and256(r_0, p_mask);
- CONFIRM_FAT_TEDDY(r_0, 16, 0, VECTORING, do_confWithBit_teddy);
- ptr += 16;
- }
-
- if (ptr + 16 < buf_end) {
- m256 r_0 = prep_conf_fat_teddy_m4(maskBase, &res_old_1, &res_old_2,
- &res_old_3, load2x128(ptr));
- CONFIRM_FAT_TEDDY(r_0, 16, 0, VECTORING, do_confWithBit_teddy);
- ptr += 16;
- }
-
- for ( ; ptr + iterBytes <= buf_end; ptr += iterBytes) {
- __builtin_prefetch(ptr + (iterBytes*4));
- CHECK_FLOOD;
- m256 r_0 = prep_conf_fat_teddy_m4(maskBase, &res_old_1, &res_old_2,
- &res_old_3, load2x128(ptr));
- CONFIRM_FAT_TEDDY(r_0, 16, 0, NOT_CAUTIOUS, do_confWithBit_teddy);
- m256 r_1 = prep_conf_fat_teddy_m4(maskBase, &res_old_1, &res_old_2,
- &res_old_3, load2x128(ptr + 16));
- CONFIRM_FAT_TEDDY(r_1, 16, 16, NOT_CAUTIOUS, do_confWithBit_teddy);
- }
-
- for (; ptr < buf_end; ptr += 16) {
- m256 p_mask;
- m256 val_0 = vectoredLoad2x128(&p_mask, ptr, a->buf, buf_end,
- a->buf_history, a->len_history, 4);
- m256 r_0 = prep_conf_fat_teddy_m4(maskBase, &res_old_1, &res_old_2,
- &res_old_3, val_0);
- r_0 = and256(r_0, p_mask);
- CONFIRM_FAT_TEDDY(r_0, 16, 0, VECTORING, do_confWithBit_teddy);
- }
-
- return HWLM_SUCCESS;
+ FDR_EXEC_FAT_TEDDY(fdr, a, control, 4, do_confWithBit_teddy);
}
#endif // HAVE_AVX2
return true;
}
-bytecode_ptr<FDR> TeddyCompiler::build() {
- assert(eng.numMasks <= MAX_NUM_MASKS);
+// this entry has all-zero mask to skip reinforcement
+#define NO_REINFORCEMENT N_CHARS
- if (lits.size() > eng.getNumBuckets() * TEDDY_BUCKET_LOAD) {
- DEBUG_PRINTF("too many literals: %zu\n", lits.size());
- return nullptr;
+// this means every entry in the reinforcement table
+#define ALL_CHAR_SET N_CHARS
+
+// each item's reinforcement mask has REINFORCED_MSK_LEN bytes
+#define REINFORCED_MSK_LEN 8
+
+static
+void initReinforcedTable(u8 *reinforcedMsk) {
+ u64a *mask = (u64a *)reinforcedMsk;
+ fill_n(mask, N_CHARS, 0x00ffffffffffffffULL);
+}
+
+static
+void fillReinforcedMskZero(u8 *reinforcedMsk) {
+ u8 *mc = reinforcedMsk + NO_REINFORCEMENT * REINFORCED_MSK_LEN;
+ fill_n(mc, REINFORCED_MSK_LEN, 0x00);
+}
+
+static
+void fillReinforcedMsk(u8 *reinforcedMsk, u16 c, u32 j, u8 bmsk) {
+ assert(j > 0);
+ if (c == ALL_CHAR_SET) {
+ for (size_t i = 0; i < N_CHARS; i++) {
+ u8 *mc = reinforcedMsk + i * REINFORCED_MSK_LEN;
+ mc[j - 1] &= ~bmsk;
+ }
+ } else {
+ u8 *mc = reinforcedMsk + c * REINFORCED_MSK_LEN;
+ mc[j - 1] &= ~bmsk;
}
+}
#ifdef TEDDY_DEBUG
- for (size_t i = 0; i < lits.size(); i++) {
- printf("lit %zu (len = %zu, %s) is ", i, lits[i].s.size(),
- lits[i].nocase ? "caseless" : "caseful");
- for (size_t j = 0; j < lits[i].s.size(); j++) {
- printf("%02x", ((u32)lits[i].s[j]) & 0xff);
+static
+void dumpReinforcedMaskTable(const u8 *msks) {
+ for (u32 i = 0; i <= N_CHARS; i++) {
+ printf("0x%02x: ", i);
+ for (u32 j = 0; j < REINFORCED_MSK_LEN; j++) {
+ u8 val = msks[i * REINFORCED_MSK_LEN + j];
+ for (u32 k = 0; k < 8; k++) {
+ printf("%s", ((val >> k) & 0x1) ? "1" : "0");
+ }
+ printf(" ");
}
printf("\n");
}
+}
#endif
- map<BucketIndex, std::vector<LiteralIndex>> bucketToLits;
- if (eng.needConfirm(lits)) {
- if (!pack(bucketToLits)) {
- DEBUG_PRINTF("more lits (%zu) than buckets (%u), can't pack.\n",
- lits.size(), eng.getNumBuckets());
- return nullptr;
- }
- } else {
- for (u32 i = 0; i < lits.size(); i++) {
- bucketToLits[i].push_back(i);
- }
- }
- u32 maskWidth = eng.getNumBuckets() / 8;
-
- size_t headerSize = sizeof(Teddy);
- size_t maskLen = eng.numMasks * 16 * 2 * maskWidth;
-
- auto floodTable = setupFDRFloodControl(lits, eng, grey);
- auto confirmTable = setupFullConfs(lits, eng, bucketToLits, make_small);
-
- // Note: we place each major structure here on a cacheline boundary.
- size_t size = ROUNDUP_CL(headerSize) + ROUNDUP_CL(maskLen) +
- ROUNDUP_CL(confirmTable.size()) + floodTable.size();
-
- auto fdr = make_zeroed_bytecode_ptr<FDR>(size, 64);
- assert(fdr); // otherwise would have thrown std::bad_alloc
- Teddy *teddy = (Teddy *)fdr.get(); // ugly
- u8 *teddy_base = (u8 *)teddy;
-
- // Write header.
- teddy->size = size;
- teddy->engineID = eng.getID();
- teddy->maxStringLen = verify_u32(maxLen(lits));
-
- // Write confirm structures.
- u8 *ptr = teddy_base + ROUNDUP_CL(headerSize) + ROUNDUP_CL(maskLen);
- assert(ISALIGNED_CL(ptr));
- teddy->confOffset = verify_u32(ptr - teddy_base);
- memcpy(ptr, confirmTable.get(), confirmTable.size());
- ptr += ROUNDUP_CL(confirmTable.size());
-
- // Write flood control structures.
- assert(ISALIGNED_CL(ptr));
- teddy->floodOffset = verify_u32(ptr - teddy_base);
- memcpy(ptr, floodTable.get(), floodTable.size());
- ptr += floodTable.size();
-
- // Write teddy masks.
- u8 *baseMsk = teddy_base + ROUNDUP_CL(headerSize);
+static
+void fillNibbleMasks(const map<BucketIndex,
+ vector<LiteralIndex>> &bucketToLits,
+ const vector<hwlmLiteral> &lits,
+ u32 numMasks, u32 maskWidth, size_t maskLen,
+ u8 *baseMsk) {
+ memset(baseMsk, 0xff, maskLen);
for (const auto &b2l : bucketToLits) {
const u32 &bucket_id = b2l.first;
const u32 sz = verify_u32(l.s.size());
// fill in masks
- for (u32 j = 0; j < eng.numMasks; j++) {
+ for (u32 j = 0; j < numMasks; j++) {
const u32 msk_id_lo = j * 2 * maskWidth + (bucket_id / 8);
const u32 msk_id_hi = (j * 2 + 1) * maskWidth + (bucket_id / 8);
const u32 lo_base = msk_id_lo * 16;
// locations in these masks with '1'
if (j >= sz) {
for (u32 n = 0; n < 16; n++) {
- baseMsk[lo_base + n] |= bmsk;
- baseMsk[hi_base + n] |= bmsk;
+ baseMsk[lo_base + n] &= ~bmsk;
+ baseMsk[hi_base + n] &= ~bmsk;
}
} else {
u8 c = l.s[sz - 1 - j];
for (u8 cm = 0; cm < 0x10; cm++) {
if ((cm & m_lo) == (cmp_lo & m_lo)) {
- baseMsk[lo_base + cm] |= bmsk;
+ baseMsk[lo_base + cm] &= ~bmsk;
}
if ((cm & m_hi) == (cmp_hi & m_hi)) {
- baseMsk[hi_base + cm] |= bmsk;
+ baseMsk[hi_base + cm] &= ~bmsk;
}
}
} else {
if (l.nocase && ourisalpha(c)) {
u32 cmHalfClear = (0xdf >> hiShift) & 0xf;
u32 cmHalfSet = (0x20 >> hiShift) & 0xf;
- baseMsk[hi_base + (n_hi & cmHalfClear)] |= bmsk;
- baseMsk[hi_base + (n_hi | cmHalfSet)] |= bmsk;
+ baseMsk[hi_base + (n_hi & cmHalfClear)] &= ~bmsk;
+ baseMsk[hi_base + (n_hi | cmHalfSet)] &= ~bmsk;
} else {
- baseMsk[hi_base + n_hi] |= bmsk;
+ baseMsk[hi_base + n_hi] &= ~bmsk;
}
- baseMsk[lo_base + n_lo] |= bmsk;
+ baseMsk[lo_base + n_lo] &= ~bmsk;
}
}
}
}
}
+}
+
+static
+void fillReinforcedTable(const map<BucketIndex,
+ vector<LiteralIndex>> &bucketToLits,
+ const vector<hwlmLiteral> &lits,
+ u8 *reinforcedMsk) {
+ initReinforcedTable(reinforcedMsk);
+
+ for (const auto &b2l : bucketToLits) {
+ const u32 &bucket_id = b2l.first;
+ const vector<LiteralIndex> &ids = b2l.second;
+ const u8 bmsk = 1U << (bucket_id % 8);
+
+ for (const LiteralIndex &lit_id : ids) {
+ const hwlmLiteral &l = lits[lit_id];
+ DEBUG_PRINTF("putting lit %u into bucket %u\n", lit_id, bucket_id);
+ const u32 sz = verify_u32(l.s.size());
+
+ // fill in reinforced masks
+ for (u32 j = 1; j < REINFORCED_MSK_LEN; j++) {
+ if (sz - 1 < j) {
+ fillReinforcedMsk(reinforcedMsk, ALL_CHAR_SET, j, bmsk);
+ } else {
+ u8 c = l.s[sz - 1 - j];
+ if (l.nocase && ourisalpha(c)) {
+ u8 c_up = c & 0xdf;
+ fillReinforcedMsk(reinforcedMsk, c_up, j, bmsk);
+ u8 c_lo = c | 0x20;
+ fillReinforcedMsk(reinforcedMsk, c_lo, j, bmsk);
+ } else {
+ fillReinforcedMsk(reinforcedMsk, c, j, bmsk);
+ }
+ }
+ }
+ }
+ }
+
+ fillReinforcedMskZero(reinforcedMsk);
+}
+
+bytecode_ptr<FDR> TeddyCompiler::build() {
+ assert(eng.numMasks <= MAX_NUM_MASKS);
+
+ if (lits.size() > eng.getNumBuckets() * TEDDY_BUCKET_LOAD) {
+ DEBUG_PRINTF("too many literals: %zu\n", lits.size());
+ return nullptr;
+ }
+
+#ifdef TEDDY_DEBUG
+ for (size_t i = 0; i < lits.size(); i++) {
+ printf("lit %zu (len = %zu, %s) is ", i, lits[i].s.size(),
+ lits[i].nocase ? "caseless" : "caseful");
+ for (size_t j = 0; j < lits[i].s.size(); j++) {
+ printf("%02x", ((u32)lits[i].s[j])&0xff);
+ }
+ printf("\n");
+ }
+#endif
+
+ map<BucketIndex, std::vector<LiteralIndex>> bucketToLits;
+ if (!pack(bucketToLits)) {
+ DEBUG_PRINTF("more lits (%zu) than buckets (%u), can't pack.\n",
+ lits.size(), eng.getNumBuckets());
+ return nullptr;
+ }
+ u32 maskWidth = eng.getNumBuckets() / 8;
+
+ size_t headerSize = sizeof(Teddy);
+ size_t maskLen = eng.numMasks * 16 * 2 * maskWidth;
+ size_t reinforcedMaskLen = (N_CHARS + 1) * REINFORCED_MSK_LEN;
+
+ auto floodTable = setupFDRFloodControl(lits, eng, grey);
+ auto confirmTable = setupFullConfs(lits, eng, bucketToLits, make_small);
+
+ // Note: we place each major structure here on a cacheline boundary.
+ size_t size = ROUNDUP_CL(headerSize) + ROUNDUP_CL(maskLen) +
+ ROUNDUP_CL(reinforcedMaskLen) +
+ ROUNDUP_CL(confirmTable.size()) + floodTable.size();
+
+ auto fdr = make_zeroed_bytecode_ptr<FDR>(size, 64);
+ assert(fdr); // otherwise would have thrown std::bad_alloc
+ Teddy *teddy = (Teddy *)fdr.get(); // ugly
+ u8 *teddy_base = (u8 *)teddy;
+
+ // Write header.
+ teddy->size = size;
+ teddy->engineID = eng.getID();
+ teddy->maxStringLen = verify_u32(maxLen(lits));
+
+ // Write confirm structures.
+ u8 *ptr = teddy_base + ROUNDUP_CL(headerSize) + ROUNDUP_CL(maskLen) +
+ ROUNDUP_CL(reinforcedMaskLen);
+ assert(ISALIGNED_CL(ptr));
+ teddy->confOffset = verify_u32(ptr - teddy_base);
+ memcpy(ptr, confirmTable.get(), confirmTable.size());
+ ptr += ROUNDUP_CL(confirmTable.size());
+
+ // Write flood control structures.
+ assert(ISALIGNED_CL(ptr));
+ teddy->floodOffset = verify_u32(ptr - teddy_base);
+ memcpy(ptr, floodTable.get(), floodTable.size());
+ ptr += floodTable.size();
+
+ // Write teddy masks.
+ u8 *baseMsk = teddy_base + ROUNDUP_CL(headerSize);
+ fillNibbleMasks(bucketToLits, lits, eng.numMasks, maskWidth, maskLen,
+ baseMsk);
+
+ // Write reinforcement masks.
+ u8 *reinforcedMsk = baseMsk + ROUNDUP_CL(maskLen);
+ fillReinforcedTable(bucketToLits, lits, reinforcedMsk);
#ifdef TEDDY_DEBUG
for (u32 i = 0; i < eng.numMasks * 2; i++) {
}
printf("\n");
}
+
+ printf("\n===============================================\n"
+ "reinforced mask table for low boundary (original)\n\n");
+ dumpReinforcedMaskTable(reinforcedMsk);
#endif
return fdr;
return numMasks;
}
-bool TeddyEngineDescription::needConfirm(const vector<hwlmLiteral> &lits) const {
- if (packed || lits.size() > getNumBuckets()) {
- return true;
- }
- for (const auto &lit : lits) {
- if (lit.s.size() > numMasks || !lit.msk.empty()) {
- return true;
- }
- }
- return false;
-}
-
void getTeddyDescriptions(vector<TeddyEngineDescription> *out) {
static const TeddyEngineDef defns[] = {
{ 3, 0 | HS_CPU_FEATURES_AVX2, 1, 16, false },
explicit TeddyEngineDescription(const TeddyEngineDef &def);
u32 getDefaultFloodSuffixLength() const override;
- bool needConfirm(const std::vector<hwlmLiteral> &lits) const;
};
std::unique_ptr<TeddyEngineDescription>
* POSSIBILITY OF SUCH DAMAGE.
*/
+/* Teddy bytecode layout:
+ * * |-----|
+ * * | | struct Teddy
+ * * |-----|
+ * * | | teddy masks
+ * * | |
+ * * |-----|
+ * * | | reinforcement mask table
+ * * | |
+ * * |-----|
+ * * | | confirm
+ * * | |
+ * * | |
+ * * |-----|
+ * * | | flood control
+ * * | |
+ * * |-----|
+ */
+
#ifndef TEDDY_INTERNAL_H
#define TEDDY_INTERNAL_H
#include "ue2common.h"
#include "util/bitutils.h"
#include "util/simd_utils.h"
+#include "util/uniform_ops.h"
extern const u8 ALIGN_DIRECTIVE p_mask_arr[17][32];
+#if defined(__AVX2__)
+extern const u8 ALIGN_DIRECTIVE p_mask_arr256[33][64];
+#endif
#ifdef ARCH_64_BIT
#define TEDDY_CONF_TYPE u64a
}
// Note: p_mask is an output param that initialises a poison mask.
+// *p_mask = load128(p_mask_arr[n] + 16 - m) means:
+// m bytes of 0xff at the beginning, followed by n bytes of 0x00,
+// then followed by 0xff for the remaining bytes.
+// ptr >= lo:
+// no history.
+// for end/short zone, ptr==lo and start_offset==0
+// for start zone, see below
+// lo ptr hi hi
+// |----------|-------|----------------|............|
+// start 0 start+offset end(<=16)
+// p_mask ffff..ff0000...........00ffff..........
+// ptr < lo:
+// only start zone.
+// history
+// ptr lo hi hi
+// |----------|-------|----------------|............|
+// 0 start start+offset end(<=16)
+// p_mask ffff.....ffffff..ff0000...........00ffff..........
static really_inline
-m128 vectoredLoad128(m128 *p_mask, const u8 *ptr, const u8 *lo, const u8 *hi,
+m128 vectoredLoad128(m128 *p_mask, const u8 *ptr, const size_t start_offset,
+                     const u8 *lo, const u8 *hi,
                     const u8 *buf_history, size_t len_history,
                     const u32 nMasks) {
    union {
    uintptr_t copy_start;
    uintptr_t copy_len;
-    if (ptr >= lo) {
+    if (ptr >= lo) { // short/end/start zone
+        uintptr_t start = (uintptr_t)(ptr - lo);
        uintptr_t avail = (uintptr_t)(hi - ptr);
        if (avail >= 16) {
-            *p_mask = load128(p_mask_arr[16] + 16);
+            assert(start_offset - start <= 16);
+            // Full 16 bytes available: poison only the start_offset prefix.
+            *p_mask = loadu128(p_mask_arr[16 - start_offset + start]
+                               + 16 - start_offset + start);
            return loadu128(ptr);
        }
-        *p_mask = load128(p_mask_arr[avail] + 16);
+        assert(start_offset - start <= avail);
+        // Short read: poison bytes past 'avail' as well as the prefix.
+        *p_mask = loadu128(p_mask_arr[avail - start_offset + start]
+                           + 16 - start_offset + start);
        copy_start = 0;
        copy_len = avail;
-    } else {
+    } else { // start zone
        uintptr_t need = MIN((uintptr_t)(lo - ptr),
                             MIN(len_history, nMasks - 1));
        uintptr_t start = (uintptr_t)(lo - ptr);
        uintptr_t i;
+        // Prefill up to nMasks-1 leading bytes from the stream history so
+        // multi-byte masks can match across the block boundary.
-        for (i = start - need; ptr + i < lo; i++) {
-            u.val8[i] = buf_history[len_history - (lo - (ptr + i))];
+        for (i = start - need; i < start; i++) {
+            u.val8[i] = buf_history[len_history - (start - i)];
        }
        uintptr_t end = MIN(16, (uintptr_t)(hi - ptr));
-        *p_mask = loadu128(p_mask_arr[end - start] + 16 - start);
-        copy_start = i;
-        copy_len = end - i;
+        assert(start + start_offset <= end);
+        *p_mask = loadu128(p_mask_arr[end - start - start_offset]
+                           + 16 - start - start_offset);
+        copy_start = start;
+        copy_len = end - start;
    }
    // Runt block from the buffer.
    return u.val128;
}
+#if defined(__AVX2__)
+/*
+ * \brief Copy a block of [0,31] bytes efficiently.
+ *
+ * This function is a workaround intended to stop some compilers from
+ * synthesizing a memcpy function call out of the copy of a small number of
+ * bytes that we do in vectoredLoad256.
+ */
+static really_inline
+void copyRuntBlock256(u8 *dst, const u8 *src, size_t len) {
+    /* Dispatch on the exact byte count so the compiler emits straight-line
+     * loads/stores for each case rather than synthesizing a memcpy call. */
+    switch (len) {
+    case 0:
+        /* Nothing to copy. */
+        break;
+    case 1:
+        *dst = *src;
+        break;
+    case 2:
+        unaligned_store_u16(dst, unaligned_load_u16(src));
+        break;
+    case 3:
+        unaligned_store_u16(dst, unaligned_load_u16(src));
+        dst[2] = src[2];
+        break;
+    case 4:
+        unaligned_store_u32(dst, unaligned_load_u32(src));
+        break;
+    case 5:
+    case 6:
+    case 7:
+        /* Perform copy with two overlapping 4-byte chunks. */
+        unaligned_store_u32(dst + len - 4, unaligned_load_u32(src + len - 4));
+        unaligned_store_u32(dst, unaligned_load_u32(src));
+        break;
+    case 8:
+        unaligned_store_u64a(dst, unaligned_load_u64a(src));
+        break;
+    case 9:
+    case 10:
+    case 11:
+    case 12:
+    case 13:
+    case 14:
+    case 15:
+        /* Perform copy with two overlapping 8-byte chunks. */
+        unaligned_store_u64a(dst + len - 8, unaligned_load_u64a(src + len - 8));
+        unaligned_store_u64a(dst, unaligned_load_u64a(src));
+        break;
+    case 16:
+        storeu128(dst, loadu128(src));
+        break;
+    default:
+        /* Perform copy with two overlapping 16-byte chunks. */
+        assert(len < 32);
+        storeu128(dst + len - 16, loadu128(src + len - 16));
+        storeu128(dst, loadu128(src));
+        break;
+    }
+}
+
+// Note: p_mask is an output param that initialises a poison mask.
+// *p_mask = load256(p_mask_arr256[n] + 32 - m) means:
+// m byte 0xff in the beginning, followed by n byte 0x00,
+// then followed by the rest bytes 0xff.
+// ptr >= lo:
+// no history.
+// for end/short zone, ptr==lo and start_offset==0
+// for start zone, see below
+// lo ptr hi hi
+// |----------|-------|----------------|............|
+// start 0 start+offset end(<=32)
+// p_mask ffff..ff0000...........00ffff..........
+// ptr < lo:
+// only start zone.
+// history
+// ptr lo hi hi
+// |----------|-------|----------------|............|
+// 0 start start+offset end(<=32)
+// p_mask ffff.....ffffff..ff0000...........00ffff..........
+static really_inline
+m256 vectoredLoad256(m256 *p_mask, const u8 *ptr, const size_t start_offset,
+                     const u8 *lo, const u8 *hi,
+                     const u8 *buf_history, size_t len_history,
+                     const u32 nMasks) {
+    union {
+        u8 val8[32];
+        m256 val256;
+    } u;
+    u.val256 = zeroes256();
+
+    /* Range of bytes within u.val8[] that must be filled from the buffer. */
+    uintptr_t copy_start;
+    uintptr_t copy_len;
+
+    if (ptr >= lo) { // short/end/start zone
+        uintptr_t start = (uintptr_t)(ptr - lo);
+        uintptr_t avail = (uintptr_t)(hi - ptr);
+        if (avail >= 32) {
+            assert(start_offset - start <= 32);
+            // Full 32 bytes available: poison only the start_offset prefix.
+            *p_mask = loadu256(p_mask_arr256[32 - start_offset + start]
+                               + 32 - start_offset + start);
+            return loadu256(ptr);
+        }
+        assert(start_offset - start <= avail);
+        // Short read: poison bytes past 'avail' as well as the prefix.
+        *p_mask = loadu256(p_mask_arr256[avail - start_offset + start]
+                           + 32 - start_offset + start);
+        copy_start = 0;
+        copy_len = avail;
+    } else { // start zone
+        uintptr_t need = MIN((uintptr_t)(lo - ptr),
+                             MIN(len_history, nMasks - 1));
+        uintptr_t start = (uintptr_t)(lo - ptr);
+        uintptr_t i;
+        // Prefill up to nMasks-1 leading bytes from the stream history so
+        // multi-byte masks can match across the block boundary.
+        for (i = start - need; i < start; i++) {
+            u.val8[i] = buf_history[len_history - (start - i)];
+        }
+        uintptr_t end = MIN(32, (uintptr_t)(hi - ptr));
+        assert(start + start_offset <= end);
+        *p_mask = loadu256(p_mask_arr256[end - start - start_offset]
+                           + 32 - start - start_offset);
+        copy_start = start;
+        copy_len = end - start;
+    }
+
+    // Runt block from the buffer.
+    copyRuntBlock256(&u.val8[copy_start], &ptr[copy_start], copy_len);
+
+    return u.val256;
+}
+#endif // __AVX2__
+
static really_inline
u64a getConfVal(const struct FDR_Runtime_Args *a, const u8 *ptr, u32 byte,
CautionReason reason) {
}
static really_inline
-void do_confWithBit1_teddy(TEDDY_CONF_TYPE *conf, u8 bucket, u8 offset,
- const u32 *confBase, CautionReason reason,
- const struct FDR_Runtime_Args *a, const u8 *ptr,
- hwlmcb_rv_t *control, u32 *last_match) {
- do {
- u32 bit = TEDDY_FIND_AND_CLEAR_LSB(conf);
- u32 byte = bit / bucket + offset;
- u32 idx = bit % bucket;
- u32 cf = confBase[idx];
- const struct FDRConfirm *fdrc = (const struct FDRConfirm *)
- ((const u8 *)confBase + cf);
- if (!(fdrc->groups & *control)) {
- continue;
- }
- u64a confVal = getConfVal(a, ptr, byte, reason);
- confWithBit1(fdrc, a, ptr - a->buf + byte, control, last_match,
- confVal);
- } while (unlikely(*conf));
-}
-
-static really_inline
-void do_confWithBitMany_teddy(TEDDY_CONF_TYPE *conf, u8 bucket, u8 offset,
- const u32 *confBase, CautionReason reason,
- const struct FDR_Runtime_Args *a, const u8 *ptr,
- hwlmcb_rv_t *control, u32 *last_match) {
- do {
- u32 bit = TEDDY_FIND_AND_CLEAR_LSB(conf);
- u32 byte = bit / bucket + offset;
- u32 idx = bit % bucket;
- u32 cf = confBase[idx];
- const struct FDRConfirm *fdrc = (const struct FDRConfirm *)
- ((const u8 *)confBase + cf);
- if (!(fdrc->groups & *control)) {
- continue;
- }
- u64a confVal = getConfVal(a, ptr, byte, reason);
- confWithBitMany(fdrc, a, ptr - a->buf + byte, reason, control,
- last_match, confVal);
- } while (unlikely(*conf));
+const m128 *getMaskBase(const struct Teddy *teddy) {
+ return (const m128 *)((const u8 *)teddy + ROUNDUP_CL(sizeof(struct Teddy)));
}
static really_inline
-const m128 *getMaskBase(const struct Teddy *teddy) {
- return (const m128 *)((const u8 *)teddy + ROUNDUP_CL(sizeof(struct Teddy)));
+const u64a *getReinforcedMaskBase(const struct Teddy *teddy, u8 numMask) {
+ return (const u64a *)((const u8 *)getMaskBase(teddy)
+ + ROUNDUP_CL(2 * numMask * sizeof(m128)));
}
static really_inline