const u8 *tryFloodDetect = zz->floodPtr; \
const u8 *start_ptr = zz->start; \
const u8 *end_ptr = zz->end; \
- for (const u8 *itPtr = start_ptr; itPtr + 4*ITER_BYTES <= end_ptr; \
+ for (const u8 *itPtr = ROUNDDOWN_PTR(start_ptr, 64); itPtr + 4*ITER_BYTES <= end_ptr; \
itPtr += 4*ITER_BYTES) { \
__builtin_prefetch(itPtr); \
} \
u32 z = movemask256(eq256(mask1, v));
// On large packet buffers, this prefetch appears to get us about 2%.
- __builtin_prefetch(d + 128);
+ __builtin_prefetch(ROUNDDOWN_PTR(d + 128, 64));
hwlm_error_t result = single_zscan(n, d, buf, z, len, cbi);
if (unlikely(result != HWLM_SUCCESS))
lastz0 = z0 >> 31;
// On large packet buffers, this prefetch appears to get us about 2%.
- __builtin_prefetch(d + 128);
+ __builtin_prefetch(ROUNDDOWN_PTR(d + 128, 64));
hwlm_error_t result = double_zscan(n, d, buf, z, len, cbi);
if (unlikely(result != HWLM_SUCCESS))
u32 z = movemask128(eq128(mask1, v));
// On large packet buffers, this prefetch appears to get us about 2%.
- __builtin_prefetch(d + 128);
+ __builtin_prefetch(ROUNDDOWN_PTR(d + 128, 64));
DEBUG_PRINTF("z 0x%08x\n", z);
hwlm_error_t result = single_zscan(n, d, buf, z, len, cbi);
lastz1 = z1;
// On large packet buffers, this prefetch appears to get us about 2%.
- __builtin_prefetch(d + 128);
+ __builtin_prefetch(ROUNDDOWN_PTR(d + 128, 64));
DEBUG_PRINTF("z 0x%08x\n", z);
hwlm_error_t result = double_zscan(n, d, buf, z, len, cbi);
assert(ISALIGNED_N(q->state, 2));
u32 s = *(u16 *)q->state;
- __builtin_prefetch(&m->remap[0]);
- __builtin_prefetch(&m->remap[64]);
- __builtin_prefetch(&m->remap[128]);
- __builtin_prefetch(&m->remap[192]);
+ const u8 *base = ROUNDDOWN_PTR(&m->remap[0], 64);
+ __builtin_prefetch(base);
+ __builtin_prefetch(base + 64);
+ __builtin_prefetch(base + 128);
+ __builtin_prefetch(base + 192);
if (q->report_current) {
assert(s);
u32 s = *(u8 *)q->state;
- __builtin_prefetch(&m->remap[0]);
- __builtin_prefetch(&m->remap[64]);
- __builtin_prefetch(&m->remap[128]);
- __builtin_prefetch(&m->remap[192]);
+ const u8 *base = ROUNDDOWN_PTR(&m->remap[0], 64);
+ __builtin_prefetch(base);
+ __builtin_prefetch(base + 64);
+ __builtin_prefetch(base + 128);
+ __builtin_prefetch(base + 192);
if (q->report_current) {
assert(s);
return MO_ALIVE;
}
- __builtin_prefetch(&m->remap[0]);
- __builtin_prefetch(&m->remap[64]);
- __builtin_prefetch(&m->remap[128]);
- __builtin_prefetch(&m->remap[192]);
+ const u8 *base = ROUNDDOWN_PTR(&m->remap[0], 64);
+ __builtin_prefetch(base);
+ __builtin_prefetch(base + 64);
+ __builtin_prefetch(base + 128);
+ __builtin_prefetch(base + 192);
while (1) {
assert(q->cur < q->end);
return MO_ALIVE;
}
- __builtin_prefetch(&m->remap[0]);
- __builtin_prefetch(&m->remap[64]);
- __builtin_prefetch(&m->remap[128]);
- __builtin_prefetch(&m->remap[192]);
+ const u8 *base = ROUNDDOWN_PTR(&m->remap[0], 64);
+ __builtin_prefetch(base);
+ __builtin_prefetch(base + 64);
+ __builtin_prefetch(base + 128);
+ __builtin_prefetch(base + 192);
while (1) {
DEBUG_PRINTF("%s @ %llu\n", q->items[q->cur].type == MQE_TOP ? "TOP" :
const u8 *last_block = buf_end - 16;
- for (const u8 *itPtr = buf; itPtr + 4*16 <= last_block; itPtr += 4*16) {
+ for (const u8 *itPtr = ROUNDDOWN_PTR(buf, 64); itPtr + 4*16 <= last_block; itPtr += 4*16) {
__builtin_prefetch(itPtr);
}
while (buf < last_block) {