git.ipfire.org Git - thirdparty/vectorscan.git/commitdiff
prefetch works best when addresses are 64-byte aligned
author Konstantinos Margaritis <konstantinos@vectorcamp.gr>
Fri, 12 Mar 2021 08:10:53 +0000 (10:10 +0200)
committer Konstantinos Margaritis <konstantinos@vectorcamp.gr>
Tue, 12 Oct 2021 08:50:32 +0000 (11:50 +0300)
src/fdr/fdr.c
src/hwlm/noodle_engine_avx2.c
src/hwlm/noodle_engine_sse.c
src/nfa/mcclellan.c
src/nfa/mcsheng.c
src/nfa/shufti.c

index 715ab68469c018ec2fd216150d18aa958faa9def..561e8f98662561619620ef5768bf114ec8803aac 100644 (file)
@@ -665,7 +665,7 @@ size_t prepareZones(const u8 *buf, size_t len, const u8 *hend,
         const u8 *tryFloodDetect = zz->floodPtr;                            \
         const u8 *start_ptr = zz->start;                                    \
         const u8 *end_ptr = zz->end;                                        \
-        for (const u8 *itPtr = start_ptr; itPtr + 4*ITER_BYTES <= end_ptr;  \
+        for (const u8 *itPtr = ROUNDDOWN_PTR(start_ptr, 64); itPtr + 4*ITER_BYTES <= end_ptr;      \
             itPtr += 4*ITER_BYTES) {                                        \
             __builtin_prefetch(itPtr);                                      \
         }                                                                   \
index 05c40cd22a346eada1ee9677eec4858ee7a71798..0aebdc673d781b6100418c1def66090b23a2f15b 100644 (file)
@@ -95,7 +95,7 @@ hwlm_error_t scanSingleFast(const struct noodTable *n, const u8 *buf,
         u32 z = movemask256(eq256(mask1, v));
 
         // On large packet buffers, this prefetch appears to get us about 2%.
-        __builtin_prefetch(d + 128);
+        __builtin_prefetch(ROUNDDOWN_PTR(d + 128, 64));
 
         hwlm_error_t result = single_zscan(n, d, buf, z, len, cbi);
         if (unlikely(result != HWLM_SUCCESS))
@@ -126,7 +126,7 @@ hwlm_error_t scanDoubleFast(const struct noodTable *n, const u8 *buf,
         lastz0 = z0 >> 31;
 
         // On large packet buffers, this prefetch appears to get us about 2%.
-        __builtin_prefetch(d + 128);
+        __builtin_prefetch(ROUNDDOWN_PTR(d + 128, 64));
 
         hwlm_error_t result = double_zscan(n, d, buf, z, len, cbi);
         if (unlikely(result != HWLM_SUCCESS))
index 78033a472fdc022926f2653dcdb647cec5b65344..501aea859d5f98a83d28dc18963c8d100f31a4e1 100644 (file)
@@ -91,7 +91,7 @@ hwlm_error_t scanSingleFast(const struct noodTable *n, const u8 *buf,
         u32 z = movemask128(eq128(mask1, v));
 
         // On large packet buffers, this prefetch appears to get us about 2%.
-        __builtin_prefetch(d + 128);
+        __builtin_prefetch(ROUNDDOWN_PTR(d + 128, 64));
         DEBUG_PRINTF("z 0x%08x\n", z);
 
         hwlm_error_t result = single_zscan(n, d, buf, z, len, cbi);
@@ -118,7 +118,7 @@ hwlm_error_t scanDoubleFast(const struct noodTable *n, const u8 *buf,
         lastz1 = z1;
 
         // On large packet buffers, this prefetch appears to get us about 2%.
-        __builtin_prefetch(d + 128);
+        __builtin_prefetch(ROUNDDOWN_PTR(d + 128, 64));
         DEBUG_PRINTF("z 0x%08x\n", z);
 
         hwlm_error_t result = double_zscan(n, d, buf, z, len, cbi);
index 5ac0615ad6e8bf0e2d9aba2b77fc1098f889f0d3..a7fcb06a4c1d31528a63396db0e8206dfc549aa1 100644 (file)
@@ -634,10 +634,11 @@ char nfaExecMcClellan16_Q2i(const struct NFA *n, u64a offset, const u8 *buffer,
     assert(ISALIGNED_N(q->state, 2));
     u32 s = *(u16 *)q->state;
 
-    __builtin_prefetch(&m->remap[0]);
-    __builtin_prefetch(&m->remap[64]);
-    __builtin_prefetch(&m->remap[128]);
-    __builtin_prefetch(&m->remap[192]);
+    const u8 *base = ROUNDDOWN_PTR(&m->remap[0], 64);
+    __builtin_prefetch(base);
+    __builtin_prefetch(base + 64);
+    __builtin_prefetch(base + 128);
+    __builtin_prefetch(base + 192);
 
     if (q->report_current) {
         assert(s);
@@ -795,10 +796,11 @@ char nfaExecMcClellan8_Q2i(const struct NFA *n, u64a offset, const u8 *buffer,
 
     u32 s = *(u8 *)q->state;
 
-    __builtin_prefetch(&m->remap[0]);
-    __builtin_prefetch(&m->remap[64]);
-    __builtin_prefetch(&m->remap[128]);
-    __builtin_prefetch(&m->remap[192]);
+    const u8 *base = ROUNDDOWN_PTR(&m->remap[0], 64);
+    __builtin_prefetch(base);
+    __builtin_prefetch(base + 64);
+    __builtin_prefetch(base + 128);
+    __builtin_prefetch(base + 192);
 
     if (q->report_current) {
         assert(s);
index c52bf31c22525785f106530ed1f0bf83b6d1c83b..5c97d73a4eead009ab3bceda455091051baa9ed4 100644 (file)
@@ -889,10 +889,11 @@ char nfaExecMcSheng16_Q2i(const struct NFA *n, u64a offset, const u8 *buffer,
         return MO_ALIVE;
     }
 
-    __builtin_prefetch(&m->remap[0]);
-    __builtin_prefetch(&m->remap[64]);
-    __builtin_prefetch(&m->remap[128]);
-    __builtin_prefetch(&m->remap[192]);
+    const u8 *base = ROUNDDOWN_PTR(&m->remap[0], 64);
+    __builtin_prefetch(base);
+    __builtin_prefetch(base + 64);
+    __builtin_prefetch(base + 128);
+    __builtin_prefetch(base + 192);
 
     while (1) {
         assert(q->cur < q->end);
@@ -1022,10 +1023,11 @@ char nfaExecMcSheng8_Q2i(const struct NFA *n, u64a offset, const u8 *buffer,
         return MO_ALIVE;
     }
 
-    __builtin_prefetch(&m->remap[0]);
-    __builtin_prefetch(&m->remap[64]);
-    __builtin_prefetch(&m->remap[128]);
-    __builtin_prefetch(&m->remap[192]);
+    const u8 *base = ROUNDDOWN_PTR(&m->remap[0], 64);
+    __builtin_prefetch(base);
+    __builtin_prefetch(base + 64);
+    __builtin_prefetch(base + 128);
+    __builtin_prefetch(base + 192);
 
     while (1) {
         DEBUG_PRINTF("%s @ %llu\n", q->items[q->cur].type == MQE_TOP ? "TOP" :
index 4f7cae2e1e0fe5eb5e30c46683e8705db4b4b234..2c30ce5c6ecda0947cb4f3eeb05fd0f3862c1e8a 100644 (file)
@@ -179,7 +179,7 @@ const u8 *shuftiExec(m128 mask_lo, m128 mask_hi, const u8 *buf,
 
     const u8 *last_block = buf_end - 16;
 
-    for (const u8 *itPtr = buf; itPtr + 4*16 <= last_block; itPtr += 4*16) {
+    for (const u8 *itPtr = ROUNDDOWN_PTR(buf, 64); itPtr + 4*16 <= last_block; itPtr += 4*16) {
         __builtin_prefetch(itPtr);
     }
     while (buf < last_block) {