0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00}
};
-#if defined(__AVX2__) // reinforced teddy
+#define CONF_CHUNK_64(chunk, bucket, off, reason, conf_fn) \
+do { \
+ if (unlikely(chunk != ones_u64a)) { \
+ chunk = ~chunk; \
+ conf_fn(&chunk, bucket, off, confBase, reason, a, ptr, \
+ &control, &last_match); \
+ CHECK_HWLM_TERMINATE_MATCHING; \
+ } \
+} while(0)
+
+#define CONF_CHUNK_32(chunk, bucket, off, reason, conf_fn) \
+do { \
+ if (unlikely(chunk != ones_u32a)) { \
+ chunk = ~chunk; \
+ conf_fn(&chunk, bucket, off, confBase, reason, a, ptr, \
+ &control, &last_match); \
+ CHECK_HWLM_TERMINATE_MATCHING; \
+ } \
+} while(0)
+
+#if defined(HAVE_AVX512) // AVX512 reinforced teddy
+
+#ifdef ARCH_64_BIT
+#define CONFIRM_TEDDY(var, bucket, offset, reason, conf_fn) \
+do { \
+ if (unlikely(diff512(var, ones512()))) { \
+ m128 p128_0 = extract128from512(var, 0); \
+ m128 p128_1 = extract128from512(var, 1); \
+ m128 p128_2 = extract128from512(var, 2); \
+ m128 p128_3 = extract128from512(var, 3); \
+ u64a part1 = movq(p128_0); \
+ u64a part2 = movq(rshiftbyte_m128(p128_0, 8)); \
+ u64a part3 = movq(p128_1); \
+ u64a part4 = movq(rshiftbyte_m128(p128_1, 8)); \
+ u64a part5 = movq(p128_2); \
+ u64a part6 = movq(rshiftbyte_m128(p128_2, 8)); \
+ u64a part7 = movq(p128_3); \
+ u64a part8 = movq(rshiftbyte_m128(p128_3, 8)); \
+ CONF_CHUNK_64(part1, bucket, offset, reason, conf_fn); \
+ CONF_CHUNK_64(part2, bucket, offset + 8, reason, conf_fn); \
+ CONF_CHUNK_64(part3, bucket, offset + 16, reason, conf_fn); \
+ CONF_CHUNK_64(part4, bucket, offset + 24, reason, conf_fn); \
+ CONF_CHUNK_64(part5, bucket, offset + 32, reason, conf_fn); \
+ CONF_CHUNK_64(part6, bucket, offset + 40, reason, conf_fn); \
+ CONF_CHUNK_64(part7, bucket, offset + 48, reason, conf_fn); \
+ CONF_CHUNK_64(part8, bucket, offset + 56, reason, conf_fn); \
+ } \
+} while(0)
+#else
+#define CONFIRM_TEDDY(var, bucket, offset, reason, conf_fn) \
+do { \
+ if (unlikely(diff512(var, ones512()))) { \
+ m128 p128_0 = extract128from512(var, 0); \
+ m128 p128_1 = extract128from512(var, 1); \
+ m128 p128_2 = extract128from512(var, 2); \
+ m128 p128_3 = extract128from512(var, 3); \
+ u32 part1 = movd(p128_0); \
+ u32 part2 = movd(rshiftbyte_m128(p128_0, 4)); \
+ u32 part3 = movd(rshiftbyte_m128(p128_0, 8)); \
+ u32 part4 = movd(rshiftbyte_m128(p128_0, 12)); \
+ u32 part5 = movd(p128_1); \
+ u32 part6 = movd(rshiftbyte_m128(p128_1, 4)); \
+ u32 part7 = movd(rshiftbyte_m128(p128_1, 8)); \
+ u32 part8 = movd(rshiftbyte_m128(p128_1, 12)); \
+ u32 part9 = movd(p128_2); \
+ u32 part10 = movd(rshiftbyte_m128(p128_2, 4)); \
+ u32 part11 = movd(rshiftbyte_m128(p128_2, 8)); \
+ u32 part12 = movd(rshiftbyte_m128(p128_2, 12)); \
+ u32 part13 = movd(p128_3); \
+ u32 part14 = movd(rshiftbyte_m128(p128_3, 4)); \
+ u32 part15 = movd(rshiftbyte_m128(p128_3, 8)); \
+ u32 part16 = movd(rshiftbyte_m128(p128_3, 12)); \
+ CONF_CHUNK_32(part1, bucket, offset, reason, conf_fn); \
+ CONF_CHUNK_32(part2, bucket, offset + 4, reason, conf_fn); \
+ CONF_CHUNK_32(part3, bucket, offset + 8, reason, conf_fn); \
+ CONF_CHUNK_32(part4, bucket, offset + 12, reason, conf_fn); \
+ CONF_CHUNK_32(part5, bucket, offset + 16, reason, conf_fn); \
+ CONF_CHUNK_32(part6, bucket, offset + 20, reason, conf_fn); \
+ CONF_CHUNK_32(part7, bucket, offset + 24, reason, conf_fn); \
+ CONF_CHUNK_32(part8, bucket, offset + 28, reason, conf_fn); \
+ CONF_CHUNK_32(part9, bucket, offset + 32, reason, conf_fn); \
+ CONF_CHUNK_32(part10, bucket, offset + 36, reason, conf_fn); \
+ CONF_CHUNK_32(part11, bucket, offset + 40, reason, conf_fn); \
+ CONF_CHUNK_32(part12, bucket, offset + 44, reason, conf_fn); \
+ CONF_CHUNK_32(part13, bucket, offset + 48, reason, conf_fn); \
+ CONF_CHUNK_32(part14, bucket, offset + 52, reason, conf_fn); \
+ CONF_CHUNK_32(part15, bucket, offset + 56, reason, conf_fn); \
+ CONF_CHUNK_32(part16, bucket, offset + 60, reason, conf_fn); \
+ } \
+} while(0)
+#endif
+
+#define PREP_SHUF_MASK_NO_REINFORCEMENT(val) \
+ m512 lo = and512(val, *lo_mask); \
+ m512 hi = and512(rshift64_m512(val, 4), *lo_mask)
+
+#define PREP_SHUF_MASK \
+ PREP_SHUF_MASK_NO_REINFORCEMENT(load512(ptr)); \
+ *c_16 = *(ptr + 15); \
+ *c_32 = *(ptr + 31); \
+ *c_48 = *(ptr + 47); \
+ m512 r_msk = set512_64(0ULL, r_msk_base[*c_48], 0ULL, r_msk_base[*c_32],\
+ 0ULL, r_msk_base[*c_16], 0ULL, r_msk_base[*c_0]);\
+ *c_0 = *(ptr + 63)
+
+#define SHIFT_OR_M1 \
+ or512(pshufb_m512(dup_mask[0], lo), pshufb_m512(dup_mask[1], hi))
+
+#define SHIFT_OR_M2 \
+ or512(lshift128_m512(or512(pshufb_m512(dup_mask[2], lo), \
+ pshufb_m512(dup_mask[3], hi)), \
+ 1), SHIFT_OR_M1)
+
+#define SHIFT_OR_M3 \
+ or512(lshift128_m512(or512(pshufb_m512(dup_mask[4], lo), \
+ pshufb_m512(dup_mask[5], hi)), \
+ 2), SHIFT_OR_M2)
+
+#define SHIFT_OR_M4 \
+ or512(lshift128_m512(or512(pshufb_m512(dup_mask[6], lo), \
+ pshufb_m512(dup_mask[7], hi)), \
+ 3), SHIFT_OR_M3)
+
+static really_inline
+m512 prep_conf_teddy_no_reinforcement_m1(const m512 *lo_mask,
+ const m512 *dup_mask,
+ const m512 val) {
+ PREP_SHUF_MASK_NO_REINFORCEMENT(val);
+ return SHIFT_OR_M1;
+}
+
+static really_inline
+m512 prep_conf_teddy_no_reinforcement_m2(const m512 *lo_mask,
+ const m512 *dup_mask,
+ const m512 val) {
+ PREP_SHUF_MASK_NO_REINFORCEMENT(val);
+ return SHIFT_OR_M2;
+}
+
+static really_inline
+m512 prep_conf_teddy_no_reinforcement_m3(const m512 *lo_mask,
+ const m512 *dup_mask,
+ const m512 val) {
+ PREP_SHUF_MASK_NO_REINFORCEMENT(val);
+ return SHIFT_OR_M3;
+}
+
+static really_inline
+m512 prep_conf_teddy_no_reinforcement_m4(const m512 *lo_mask,
+ const m512 *dup_mask,
+ const m512 val) {
+ PREP_SHUF_MASK_NO_REINFORCEMENT(val);
+ return SHIFT_OR_M4;
+}
+
+static really_inline
+m512 prep_conf_teddy_m1(const m512 *lo_mask, const m512 *dup_mask,
+ const u8 *ptr, const u64a *r_msk_base,
+ u32 *c_0, u32 *c_16, u32 *c_32, u32 *c_48) {
+ PREP_SHUF_MASK;
+ return or512(SHIFT_OR_M1, r_msk);
+}
+
+static really_inline
+m512 prep_conf_teddy_m2(const m512 *lo_mask, const m512 *dup_mask,
+ const u8 *ptr, const u64a *r_msk_base,
+ u32 *c_0, u32 *c_16, u32 *c_32, u32 *c_48) {
+ PREP_SHUF_MASK;
+ return or512(SHIFT_OR_M2, r_msk);
+}
+
+static really_inline
+m512 prep_conf_teddy_m3(const m512 *lo_mask, const m512 *dup_mask,
+ const u8 *ptr, const u64a *r_msk_base,
+ u32 *c_0, u32 *c_16, u32 *c_32, u32 *c_48) {
+ PREP_SHUF_MASK;
+ return or512(SHIFT_OR_M3, r_msk);
+}
+
+static really_inline
+m512 prep_conf_teddy_m4(const m512 *lo_mask, const m512 *dup_mask,
+ const u8 *ptr, const u64a *r_msk_base,
+ u32 *c_0, u32 *c_16, u32 *c_32, u32 *c_48) {
+ PREP_SHUF_MASK;
+ return or512(SHIFT_OR_M4, r_msk);
+}
+
+#define PREP_CONF_FN_NO_REINFORCEMENT(val, n) \
+ prep_conf_teddy_no_reinforcement_m##n(&lo_mask, dup_mask, val)
+
+#define PREP_CONF_FN(ptr, n) \
+ prep_conf_teddy_m##n(&lo_mask, dup_mask, ptr, r_msk_base, \
+ &c_0, &c_16, &c_32, &c_48)
+
+#define PREPARE_MASKS_1 \
+ dup_mask[0] = set4x128(maskBase[0]); \
+ dup_mask[1] = set4x128(maskBase[1]);
+
+#define PREPARE_MASKS_2 \
+ PREPARE_MASKS_1 \
+ dup_mask[2] = set4x128(maskBase[2]); \
+ dup_mask[3] = set4x128(maskBase[3]);
+
+#define PREPARE_MASKS_3 \
+ PREPARE_MASKS_2 \
+ dup_mask[4] = set4x128(maskBase[4]); \
+ dup_mask[5] = set4x128(maskBase[5]);
+
+#define PREPARE_MASKS_4 \
+ PREPARE_MASKS_3 \
+ dup_mask[6] = set4x128(maskBase[6]); \
+ dup_mask[7] = set4x128(maskBase[7]);
+
+#define PREPARE_MASKS(n) \
+ m512 lo_mask = set64x8(0xf); \
+ m512 dup_mask[n * 2]; \
+ PREPARE_MASKS_##n
+
+#define FDR_EXEC_TEDDY(fdr, a, control, n_msk, conf_fn) \
+do { \
+ const u8 *buf_end = a->buf + a->len; \
+ const u8 *ptr = a->buf + a->start_offset; \
+ u32 floodBackoff = FLOOD_BACKOFF_START; \
+ const u8 *tryFloodDetect = a->firstFloodDetect; \
+ u32 last_match = (u32)-1; \
+ const struct Teddy *teddy = (const struct Teddy *)fdr; \
+ const size_t iterBytes = 128; \
+ DEBUG_PRINTF("params: buf %p len %zu start_offset %zu\n", \
+ a->buf, a->len, a->start_offset); \
+ \
+ const m128 *maskBase = getMaskBase(teddy); \
+ PREPARE_MASKS(n_msk); \
+ const u32 *confBase = getConfBase(teddy); \
+ \
+ const u64a *r_msk_base = getReinforcedMaskBase(teddy, n_msk); \
+ u32 c_0 = 0x100; \
+ u32 c_16 = 0x100; \
+ u32 c_32 = 0x100; \
+ u32 c_48 = 0x100; \
+ const u8 *mainStart = ROUNDUP_PTR(ptr, 64); \
+ DEBUG_PRINTF("derive: ptr: %p mainstart %p\n", ptr, mainStart); \
+ if (ptr < mainStart) { \
+ ptr = mainStart - 64; \
+ m512 p_mask; \
+ m512 val_0 = vectoredLoad512(&p_mask, ptr, a->start_offset, \
+ a->buf, buf_end, \
+ a->buf_history, a->len_history, n_msk); \
+ m512 r_0 = PREP_CONF_FN_NO_REINFORCEMENT(val_0, n_msk); \
+ r_0 = or512(r_0, p_mask); \
+ CONFIRM_TEDDY(r_0, 8, 0, VECTORING, conf_fn); \
+ ptr += 64; \
+ } \
+ \
+ if (ptr + 64 <= buf_end) { \
+ m512 r_0 = PREP_CONF_FN(ptr, n_msk); \
+ CONFIRM_TEDDY(r_0, 8, 0, VECTORING, conf_fn); \
+ ptr += 64; \
+ } \
+ \
+ for (; ptr + iterBytes <= buf_end; ptr += iterBytes) { \
+ __builtin_prefetch(ptr + (iterBytes * 4)); \
+ CHECK_FLOOD; \
+ m512 r_0 = PREP_CONF_FN(ptr, n_msk); \
+ CONFIRM_TEDDY(r_0, 8, 0, NOT_CAUTIOUS, conf_fn); \
+ m512 r_1 = PREP_CONF_FN(ptr + 64, n_msk); \
+ CONFIRM_TEDDY(r_1, 8, 64, NOT_CAUTIOUS, conf_fn); \
+ } \
+ \
+ if (ptr + 64 <= buf_end) { \
+ m512 r_0 = PREP_CONF_FN(ptr, n_msk); \
+ CONFIRM_TEDDY(r_0, 8, 0, NOT_CAUTIOUS, conf_fn); \
+ ptr += 64; \
+ } \
+ \
+ assert(ptr + 64 > buf_end); \
+ if (ptr < buf_end) { \
+ m512 p_mask; \
+ m512 val_0 = vectoredLoad512(&p_mask, ptr, 0, ptr, buf_end, \
+ a->buf_history, a->len_history, n_msk); \
+ m512 r_0 = PREP_CONF_FN_NO_REINFORCEMENT(val_0, n_msk); \
+ r_0 = or512(r_0, p_mask); \
+ CONFIRM_TEDDY(r_0, 8, 0, VECTORING, conf_fn); \
+ } \
+ \
+ return HWLM_SUCCESS; \
+} while(0)
+
+#elif defined(HAVE_AVX2) // not HAVE_AVX512 but HAVE_AVX2 reinforced teddy
#ifdef ARCH_64_BIT
#define CONFIRM_TEDDY(var, bucket, offset, reason, conf_fn) \
u64a part2 = movq(rshiftbyte_m128(lo, 8)); \
u64a part3 = movq(hi); \
u64a part4 = movq(rshiftbyte_m128(hi, 8)); \
- if (unlikely(part1 != ones_u64a)) { \
- part1 = ~part1; \
- conf_fn(&part1, bucket, offset, confBase, reason, a, ptr, \
- &control, &last_match); \
- CHECK_HWLM_TERMINATE_MATCHING; \
- } \
- if (unlikely(part2 != ones_u64a)) { \
- part2 = ~part2; \
- conf_fn(&part2, bucket, offset + 8, confBase, reason, a, ptr, \
- &control, &last_match); \
- CHECK_HWLM_TERMINATE_MATCHING; \
- } \
- if (unlikely(part3 != ones_u64a)) { \
- part3 = ~part3; \
- conf_fn(&part3, bucket, offset + 16, confBase, reason, a, ptr, \
- &control, &last_match); \
- CHECK_HWLM_TERMINATE_MATCHING; \
- } \
- if (unlikely(part4 != ones_u64a)) { \
- part4 = ~part4; \
- conf_fn(&part4, bucket, offset + 24, confBase, reason, a, ptr, \
- &control, &last_match); \
- CHECK_HWLM_TERMINATE_MATCHING; \
- } \
+ CONF_CHUNK_64(part1, bucket, offset, reason, conf_fn); \
+ CONF_CHUNK_64(part2, bucket, offset + 8, reason, conf_fn); \
+ CONF_CHUNK_64(part3, bucket, offset + 16, reason, conf_fn); \
+ CONF_CHUNK_64(part4, bucket, offset + 24, reason, conf_fn); \
} \
} while(0)
#else
u32 part6 = movd(rshiftbyte_m128(hi, 4)); \
u32 part7 = movd(rshiftbyte_m128(hi, 8)); \
u32 part8 = movd(rshiftbyte_m128(hi, 12)); \
- if (unlikely(part1 != ones_u32)) { \
- part1 = ~part1; \
- conf_fn(&part1, bucket, offset, confBase, reason, a, ptr, \
- &control, &last_match); \
- CHECK_HWLM_TERMINATE_MATCHING; \
- } \
- if (unlikely(part2 != ones_u32)) { \
- part2 = ~part2; \
- conf_fn(&part2, bucket, offset + 4, confBase, reason, a, ptr, \
- &control, &last_match); \
- CHECK_HWLM_TERMINATE_MATCHING; \
- } \
- if (unlikely(part3 != ones_u32)) { \
- part3 = ~part3; \
- conf_fn(&part3, bucket, offset + 8, confBase, reason, a, ptr, \
- &control, &last_match); \
- CHECK_HWLM_TERMINATE_MATCHING; \
- } \
- if (unlikely(part4 != ones_u32)) { \
- part4 = ~part4; \
- conf_fn(&part4, bucket, offset + 12, confBase, reason, a, ptr, \
- &control, &last_match); \
- CHECK_HWLM_TERMINATE_MATCHING; \
- } \
- if (unlikely(part5 != ones_u32)) { \
- part5 = ~part5; \
- conf_fn(&part5, bucket, offset + 16, confBase, reason, a, ptr, \
- &control, &last_match); \
- CHECK_HWLM_TERMINATE_MATCHING; \
- } \
- if (unlikely(part6 != ones_u32)) { \
- part6 = ~part6; \
- conf_fn(&part6, bucket, offset + 20, confBase, reason, a, ptr, \
- &control, &last_match); \
- CHECK_HWLM_TERMINATE_MATCHING; \
- } \
- if (unlikely(part7 != ones_u32)) { \
- part7 = ~part7; \
- conf_fn(&part7, bucket, offset + 24, confBase, reason, a, ptr, \
- &control, &last_match); \
- CHECK_HWLM_TERMINATE_MATCHING; \
- } \
- if (unlikely(part8 != ones_u32)) { \
- part8 = ~part8; \
- conf_fn(&part8, bucket, offset + 28, confBase, reason, a, ptr, \
- &control, &last_match); \
- CHECK_HWLM_TERMINATE_MATCHING; \
- } \
+ CONF_CHUNK_32(part1, bucket, offset, reason, conf_fn); \
+ CONF_CHUNK_32(part2, bucket, offset + 4, reason, conf_fn); \
+ CONF_CHUNK_32(part3, bucket, offset + 8, reason, conf_fn); \
+ CONF_CHUNK_32(part4, bucket, offset + 12, reason, conf_fn); \
+ CONF_CHUNK_32(part5, bucket, offset + 16, reason, conf_fn); \
+ CONF_CHUNK_32(part6, bucket, offset + 20, reason, conf_fn); \
+ CONF_CHUNK_32(part7, bucket, offset + 24, reason, conf_fn); \
+ CONF_CHUNK_32(part8, bucket, offset + 28, reason, conf_fn); \
} \
} while(0)
#endif
return or256(SHIFT_OR_M4, r_msk);
}
-#else // not defined __AVX2__
+#define PREP_CONF_FN_NO_REINFORCEMENT(val, n) \
+ prep_conf_teddy_no_reinforcement_m##n(&lo_mask, dup_mask, val)
+
+#define PREP_CONF_FN(ptr, n) \
+ prep_conf_teddy_m##n(&lo_mask, dup_mask, ptr, r_msk_base, &c_0, &c_128)
+
+#define PREPARE_MASKS_1 \
+ dup_mask[0] = set2x128(maskBase[0]); \
+ dup_mask[1] = set2x128(maskBase[1]);
+
+#define PREPARE_MASKS_2 \
+ PREPARE_MASKS_1 \
+ dup_mask[2] = set2x128(maskBase[2]); \
+ dup_mask[3] = set2x128(maskBase[3]);
+
+#define PREPARE_MASKS_3 \
+ PREPARE_MASKS_2 \
+ dup_mask[4] = set2x128(maskBase[4]); \
+ dup_mask[5] = set2x128(maskBase[5]);
+
+#define PREPARE_MASKS_4 \
+ PREPARE_MASKS_3 \
+ dup_mask[6] = set2x128(maskBase[6]); \
+ dup_mask[7] = set2x128(maskBase[7]);
+
+#define PREPARE_MASKS(n) \
+ m256 lo_mask = set32x8(0xf); \
+ m256 dup_mask[n * 2]; \
+ PREPARE_MASKS_##n
+
+#define FDR_EXEC_TEDDY(fdr, a, control, n_msk, conf_fn) \
+do { \
+ const u8 *buf_end = a->buf + a->len; \
+ const u8 *ptr = a->buf + a->start_offset; \
+ u32 floodBackoff = FLOOD_BACKOFF_START; \
+ const u8 *tryFloodDetect = a->firstFloodDetect; \
+ u32 last_match = (u32)-1; \
+ const struct Teddy *teddy = (const struct Teddy *)fdr; \
+ const size_t iterBytes = 64; \
+ DEBUG_PRINTF("params: buf %p len %zu start_offset %zu\n", \
+ a->buf, a->len, a->start_offset); \
+ \
+ const m128 *maskBase = getMaskBase(teddy); \
+ PREPARE_MASKS(n_msk); \
+ const u32 *confBase = getConfBase(teddy); \
+ \
+ const u64a *r_msk_base = getReinforcedMaskBase(teddy, n_msk); \
+ u32 c_0 = 0x100; \
+ u32 c_128 = 0x100; \
+ const u8 *mainStart = ROUNDUP_PTR(ptr, 32); \
+ DEBUG_PRINTF("derive: ptr: %p mainstart %p\n", ptr, mainStart); \
+ if (ptr < mainStart) { \
+ ptr = mainStart - 32; \
+ m256 p_mask; \
+ m256 val_0 = vectoredLoad256(&p_mask, ptr, a->start_offset, \
+ a->buf, buf_end, \
+ a->buf_history, a->len_history, n_msk); \
+ m256 r_0 = PREP_CONF_FN_NO_REINFORCEMENT(val_0, n_msk); \
+ r_0 = or256(r_0, p_mask); \
+ CONFIRM_TEDDY(r_0, 8, 0, VECTORING, conf_fn); \
+ ptr += 32; \
+ } \
+ \
+ if (ptr + 32 <= buf_end) { \
+ m256 r_0 = PREP_CONF_FN(ptr, n_msk); \
+ CONFIRM_TEDDY(r_0, 8, 0, VECTORING, conf_fn); \
+ ptr += 32; \
+ } \
+ \
+ for (; ptr + iterBytes <= buf_end; ptr += iterBytes) { \
+ __builtin_prefetch(ptr + (iterBytes * 4)); \
+ CHECK_FLOOD; \
+ m256 r_0 = PREP_CONF_FN(ptr, n_msk); \
+ CONFIRM_TEDDY(r_0, 8, 0, NOT_CAUTIOUS, conf_fn); \
+ m256 r_1 = PREP_CONF_FN(ptr + 32, n_msk); \
+ CONFIRM_TEDDY(r_1, 8, 32, NOT_CAUTIOUS, conf_fn); \
+ } \
+ \
+ if (ptr + 32 <= buf_end) { \
+ m256 r_0 = PREP_CONF_FN(ptr, n_msk); \
+ CONFIRM_TEDDY(r_0, 8, 0, NOT_CAUTIOUS, conf_fn); \
+ ptr += 32; \
+ } \
+ \
+ assert(ptr + 32 > buf_end); \
+ if (ptr < buf_end) { \
+ m256 p_mask; \
+ m256 val_0 = vectoredLoad256(&p_mask, ptr, 0, ptr, buf_end, \
+ a->buf_history, a->len_history, n_msk); \
+ m256 r_0 = PREP_CONF_FN_NO_REINFORCEMENT(val_0, n_msk); \
+ r_0 = or256(r_0, p_mask); \
+ CONFIRM_TEDDY(r_0, 8, 0, VECTORING, conf_fn); \
+ } \
+ \
+ return HWLM_SUCCESS; \
+} while(0)
+
+#else // not defined HAVE_AVX2
#ifdef ARCH_64_BIT
#define CONFIRM_TEDDY(var, bucket, offset, reason, conf_fn) \
if (unlikely(diff128(var, ones128()))) { \
u64a lo = movq(var); \
u64a hi = movq(rshiftbyte_m128(var, 8)); \
- if (unlikely(lo != ones_u64a)) { \
- lo = ~lo; \
- conf_fn(&lo, bucket, offset, confBase, reason, a, ptr, \
- &control, &last_match); \
- CHECK_HWLM_TERMINATE_MATCHING; \
- } \
- if (unlikely(hi != ones_u64a)) { \
- hi = ~hi; \
- conf_fn(&hi, bucket, offset + 8, confBase, reason, a, ptr, \
- &control, &last_match); \
- CHECK_HWLM_TERMINATE_MATCHING; \
- } \
+ CONF_CHUNK_64(lo, bucket, offset, reason, conf_fn); \
+ CONF_CHUNK_64(hi, bucket, offset + 8, reason, conf_fn); \
} \
} while(0)
#else
u32 part2 = movd(rshiftbyte_m128(var, 4)); \
u32 part3 = movd(rshiftbyte_m128(var, 8)); \
u32 part4 = movd(rshiftbyte_m128(var, 12)); \
- if (unlikely(part1 != ones_u32)) { \
- part1 = ~part1; \
- conf_fn(&part1, bucket, offset, confBase, reason, a, ptr, \
- &control, &last_match); \
- CHECK_HWLM_TERMINATE_MATCHING; \
- } \
- if (unlikely(part2 != ones_u32)) { \
- part2 = ~part2; \
- conf_fn(&part2, bucket, offset + 4, confBase, reason, a, ptr, \
- &control, &last_match); \
- CHECK_HWLM_TERMINATE_MATCHING; \
- } \
- if (unlikely(part3 != ones_u32)) { \
- part3 = ~part3; \
- conf_fn(&part3, bucket, offset + 8, confBase, reason, a, ptr, \
- &control, &last_match); \
- CHECK_HWLM_TERMINATE_MATCHING; \
- } \
- if (unlikely(part4 != ones_u32)) { \
- part4 = ~part4; \
- conf_fn(&part4, bucket, offset + 12, confBase, reason, a, ptr, \
- &control, &last_match); \
- CHECK_HWLM_TERMINATE_MATCHING; \
- } \
+ CONF_CHUNK_32(part1, bucket, offset, reason, conf_fn); \
+ CONF_CHUNK_32(part2, bucket, offset + 4, reason, conf_fn); \
+ CONF_CHUNK_32(part3, bucket, offset + 8, reason, conf_fn); \
+ CONF_CHUNK_32(part4, bucket, offset + 12, reason, conf_fn); \
} \
} while(0)
#endif
return or128(r, res_shifted_3);
}
-#endif // __AVX2__
-
-#if defined(__AVX2__) // reinforced teddy
-
-#define PREP_CONF_FN_NO_REINFORCEMENT(val, n) \
- prep_conf_teddy_no_reinforcement_m##n(&lo_mask, dup_mask, val)
-
-#define PREP_CONF_FN(ptr, n) \
- prep_conf_teddy_m##n(&lo_mask, dup_mask, ptr, r_msk_base, &c_0, &c_128)
-
-#define PREPARE_MASKS_1 \
- dup_mask[0] = set2x128(maskBase[0]); \
- dup_mask[1] = set2x128(maskBase[1]);
-
-#define PREPARE_MASKS_2 \
- PREPARE_MASKS_1 \
- dup_mask[2] = set2x128(maskBase[2]); \
- dup_mask[3] = set2x128(maskBase[3]);
-
-#define PREPARE_MASKS_3 \
- PREPARE_MASKS_2 \
- dup_mask[4] = set2x128(maskBase[4]); \
- dup_mask[5] = set2x128(maskBase[5]);
-
-#define PREPARE_MASKS_4 \
- PREPARE_MASKS_3 \
- dup_mask[6] = set2x128(maskBase[6]); \
- dup_mask[7] = set2x128(maskBase[7]);
-
-#define PREPARE_MASKS(n) \
- m256 lo_mask = set32x8(0xf); \
- m256 dup_mask[n * 2]; \
- PREPARE_MASKS_##n
-
-#else // not defined __AVX2__
-
#define FDR_EXEC_TEDDY_RES_OLD_1
#define FDR_EXEC_TEDDY_RES_OLD_2 \
#define PREP_CONF_FN(mask_base, val, n) \
PREP_CONF_FN_##n(mask_base, val)
-#endif // __AVX2__
-
-#if defined(__AVX2__) // reinforced teddy
-#define FDR_EXEC_TEDDY(fdr, a, control, n_msk, conf_fn) \
-do { \
- const u8 *buf_end = a->buf + a->len; \
- const u8 *ptr = a->buf + a->start_offset; \
- u32 floodBackoff = FLOOD_BACKOFF_START; \
- const u8 *tryFloodDetect = a->firstFloodDetect; \
- u32 last_match = (u32)-1; \
- const struct Teddy *teddy = (const struct Teddy *)fdr; \
- const size_t iterBytes = 64; \
- DEBUG_PRINTF("params: buf %p len %zu start_offset %zu\n", \
- a->buf, a->len, a->start_offset); \
- \
- const m128 *maskBase = getMaskBase(teddy); \
- PREPARE_MASKS(n_msk); \
- const u32 *confBase = getConfBase(teddy); \
- \
- const u64a *r_msk_base = getReinforcedMaskBase(teddy, n_msk); \
- u32 c_0 = 0x100; \
- u32 c_128 = 0x100; \
- const u8 *mainStart = ROUNDUP_PTR(ptr, 32); \
- DEBUG_PRINTF("derive: ptr: %p mainstart %p\n", ptr, mainStart); \
- if (ptr < mainStart) { \
- ptr = mainStart - 32; \
- m256 p_mask; \
- m256 val_0 = vectoredLoad256(&p_mask, ptr, a->start_offset, \
- a->buf, buf_end, \
- a->buf_history, a->len_history, n_msk); \
- m256 r_0 = PREP_CONF_FN_NO_REINFORCEMENT(val_0, n_msk); \
- r_0 = or256(r_0, p_mask); \
- CONFIRM_TEDDY(r_0, 8, 0, VECTORING, conf_fn); \
- ptr += 32; \
- } \
- \
- if (ptr + 32 <= buf_end) { \
- m256 r_0 = PREP_CONF_FN(ptr, n_msk); \
- CONFIRM_TEDDY(r_0, 8, 0, VECTORING, conf_fn); \
- ptr += 32; \
- } \
- \
- for (; ptr + iterBytes <= buf_end; ptr += iterBytes) { \
- __builtin_prefetch(ptr + (iterBytes * 4)); \
- CHECK_FLOOD; \
- m256 r_0 = PREP_CONF_FN(ptr, n_msk); \
- CONFIRM_TEDDY(r_0, 8, 0, NOT_CAUTIOUS, conf_fn); \
- m256 r_1 = PREP_CONF_FN(ptr + 32, n_msk); \
- CONFIRM_TEDDY(r_1, 8, 32, NOT_CAUTIOUS, conf_fn); \
- } \
- \
- if (ptr + 32 <= buf_end) { \
- m256 r_0 = PREP_CONF_FN(ptr, n_msk); \
- CONFIRM_TEDDY(r_0, 8, 0, NOT_CAUTIOUS, conf_fn); \
- ptr += 32; \
- } \
- \
- assert(ptr + 32 > buf_end); \
- if (ptr < buf_end) { \
- m256 p_mask; \
- m256 val_0 = vectoredLoad256(&p_mask, ptr, 0, ptr, buf_end, \
- a->buf_history, a->len_history, n_msk); \
- m256 r_0 = PREP_CONF_FN_NO_REINFORCEMENT(val_0, n_msk); \
- r_0 = or256(r_0, p_mask); \
- CONFIRM_TEDDY(r_0, 8, 0, VECTORING, conf_fn); \
- } \
- \
- return HWLM_SUCCESS; \
-} while(0)
-#else // not defined __AVX2__
#define FDR_EXEC_TEDDY(fdr, a, control, n_msk, conf_fn) \
do { \
const u8 *buf_end = a->buf + a->len; \
\
return HWLM_SUCCESS; \
} while(0)
-#endif // __AVX2__
+
+#endif // HAVE_AVX2 HAVE_AVX512
hwlm_error_t fdr_exec_teddy_msks1(const struct FDR *fdr,
const struct FDR_Runtime_Args *a,
0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00}
};
+#define CONF_FAT_CHUNK_64(chunk, bucket, off, reason, conf_fn) \
+do { \
+ if (unlikely(chunk != ones_u64a)) { \
+ chunk = ~chunk; \
+ conf_fn(&chunk, bucket, off, confBase, reason, a, ptr, \
+ &control, &last_match); \
+ CHECK_HWLM_TERMINATE_MATCHING; \
+ } \
+} while(0)
+
+#define CONF_FAT_CHUNK_32(chunk, bucket, off, reason, conf_fn) \
+do { \
+ if (unlikely(chunk != ones_u32a)) { \
+ chunk = ~chunk; \
+ conf_fn(&chunk, bucket, off, confBase, reason, a, ptr, \
+ &control, &last_match); \
+ CHECK_HWLM_TERMINATE_MATCHING; \
+ } \
+} while(0)
+
+static really_inline
+const m256 *getMaskBase_avx2(const struct Teddy *teddy) {
+ return (const m256 *)((const u8 *)teddy + ROUNDUP_CL(sizeof(struct Teddy)));
+}
+
#ifdef ARCH_64_BIT
#define CONFIRM_FAT_TEDDY(var, bucket, offset, reason, conf_fn) \
do { \
r = interleave256hi(var, swap); \
u64a part3 = extractlow64from256(r); \
u64a part4 = extract64from256(r, 1); \
- if (unlikely(part1 != ones_u64a)) { \
- part1 = ~part1; \
- conf_fn(&part1, bucket, offset, confBase, reason, a, ptr, \
- &control, &last_match); \
- CHECK_HWLM_TERMINATE_MATCHING; \
- } \
- if (unlikely(part2 != ones_u64a)) { \
- part2 = ~part2; \
- conf_fn(&part2, bucket, offset + 4, confBase, reason, a, ptr, \
- &control, &last_match); \
- CHECK_HWLM_TERMINATE_MATCHING; \
- } \
- if (unlikely(part3 != ones_u64a)) { \
- part3 = ~part3; \
- conf_fn(&part3, bucket, offset + 8, confBase, reason, a, ptr, \
- &control, &last_match); \
- CHECK_HWLM_TERMINATE_MATCHING; \
- } \
- if (unlikely(part4 != ones_u64a)) { \
- part4 = ~part4; \
- conf_fn(&part4, bucket, offset + 12, confBase, reason, a, ptr, \
- &control, &last_match); \
- CHECK_HWLM_TERMINATE_MATCHING; \
- } \
+ CONF_FAT_CHUNK_64(part1, bucket, offset, reason, conf_fn); \
+ CONF_FAT_CHUNK_64(part2, bucket, offset + 4, reason, conf_fn); \
+ CONF_FAT_CHUNK_64(part3, bucket, offset + 8, reason, conf_fn); \
+ CONF_FAT_CHUNK_64(part4, bucket, offset + 12, reason, conf_fn); \
} \
} while(0)
#else
u32 part6 = extract32from256(r, 1); \
u32 part7 = extract32from256(r, 2); \
u32 part8 = extract32from256(r, 3); \
- if (unlikely(part1 != ones_u32)) { \
- part1 = ~part1; \
- conf_fn(&part1, bucket, offset, confBase, reason, a, ptr, \
- &control, &last_match); \
- CHECK_HWLM_TERMINATE_MATCHING; \
- } \
- if (unlikely(part2 != ones_u32)) { \
- part2 = ~part2; \
- conf_fn(&part2, bucket, offset + 2, confBase, reason, a, ptr, \
- &control, &last_match); \
- } \
- if (unlikely(part3 != ones_u32)) { \
- part3 = ~part3; \
- conf_fn(&part3, bucket, offset + 4, confBase, reason, a, ptr, \
- &control, &last_match); \
- CHECK_HWLM_TERMINATE_MATCHING; \
- } \
- if (unlikely(part4 != ones_u32)) { \
- part4 = ~part4; \
- conf_fn(&part4, bucket, offset + 6, confBase, reason, a, ptr, \
- &control, &last_match); \
- CHECK_HWLM_TERMINATE_MATCHING; \
- } \
- if (unlikely(part5 != ones_u32)) { \
- part5 = ~part5; \
- conf_fn(&part5, bucket, offset + 8, confBase, reason, a, ptr, \
- &control, &last_match); \
- CHECK_HWLM_TERMINATE_MATCHING; \
- } \
- if (unlikely(part6 != ones_u32)) { \
- part6 = ~part6; \
- conf_fn(&part6, bucket, offset + 10, confBase, reason, a, ptr, \
- &control, &last_match); \
- CHECK_HWLM_TERMINATE_MATCHING; \
- } \
- if (unlikely(part7 != ones_u32)) { \
- part7 = ~part7; \
- conf_fn(&part7, bucket, offset + 12, confBase, reason, a, ptr, \
- &control, &last_match); \
- CHECK_HWLM_TERMINATE_MATCHING; \
- } \
- if (unlikely(part8 != ones_u32)) { \
- part8 = ~part8; \
- conf_fn(&part8, bucket, offset + 14, confBase, reason, a, ptr, \
- &control, &last_match); \
- CHECK_HWLM_TERMINATE_MATCHING; \
- } \
+ CONF_FAT_CHUNK_32(part1, bucket, offset, reason, conf_fn); \
+ CONF_FAT_CHUNK_32(part2, bucket, offset + 2, reason, conf_fn); \
+ CONF_FAT_CHUNK_32(part3, bucket, offset + 4, reason, conf_fn); \
+ CONF_FAT_CHUNK_32(part4, bucket, offset + 6, reason, conf_fn); \
+ CONF_FAT_CHUNK_32(part5, bucket, offset + 8, reason, conf_fn); \
+ CONF_FAT_CHUNK_32(part6, bucket, offset + 10, reason, conf_fn); \
+ CONF_FAT_CHUNK_32(part7, bucket, offset + 12, reason, conf_fn); \
+ CONF_FAT_CHUNK_32(part8, bucket, offset + 14, reason, conf_fn); \
} \
} while(0)
#endif
return or256(r, res_shifted_3);
}
-static really_inline
-const m256 *getMaskBase_avx2(const struct Teddy *teddy) {
- return (const m256 *)((const u8 *)teddy + ROUNDUP_CL(sizeof(struct Teddy)));
-}
-
#define FDR_EXEC_FAT_TEDDY_RES_OLD_1 \
do { \
} while(0)