return scanSingleLoop(n, buf, len, cbi, chars, d1, e);
}
-static really_inline
-svuint16_t getCharMaskDouble(const struct noodTable *n, bool noCase) {
- if (noCase) {
- const uint64_t lowerFirst = n->key0 & 0xdf;
- const uint64_t upperFirst = n->key0 | 0x20;
- const uint64_t lowerSecond = n->key1 & 0xdf;
- const uint64_t upperSecond = n->key1 | 0x20;
- const uint64_t chars = lowerFirst | (lowerSecond << 8)
- | (lowerFirst << 16) | (upperSecond) << 24
- | (upperFirst << 32) | (lowerSecond) << 40
- | (upperFirst << 48) | (upperSecond) << 56;
- return svreinterpret_u16(svdup_u64(chars));
- } else {
- uint16_t chars_u16 = n->key0 | (n->key1 << 8);
- return svdup_u16(chars_u16);
- }
-}
-
static really_inline
hwlm_error_t doubleCheckMatched(const struct noodTable *n, const u8 *buf,
size_t len, const struct cb_info *cbi,
}
++d;
- svuint16_t chars = getCharMaskDouble(n, noCase);
+ svuint16_t chars = getCharMaskDouble(n->key0, n->key1, noCase);
if (scan_len <= svcntb()) {
return scanDoubleOnce(n, buf, len, cbi, chars, d, e);
#include "vermicelli_sse.h"
#endif
-static really_inline
-const u8 *vermicelliDoubleExec(char c1, char c2, char nocase, const u8 *buf,
- const u8 *buf_end) {
- DEBUG_PRINTF("double verm scan %s\\x%02hhx%02hhx over %zu bytes\n",
- nocase ? "nocase " : "", c1, c2, (size_t)(buf_end - buf));
- assert(buf < buf_end);
-
- VERM_TYPE chars1 = VERM_SET_FN(c1); /* nocase already uppercase */
- VERM_TYPE chars2 = VERM_SET_FN(c2); /* nocase already uppercase */
-
-#ifdef HAVE_AVX512
- if (buf_end - buf <= VERM_BOUNDARY) {
- const u8 *ptr = nocase
- ? dvermMiniNocase(chars1, chars2, buf, buf_end)
- : dvermMini(chars1, chars2, buf, buf_end);
- if (ptr) {
- return ptr;
- }
-
- /* check for partial match at end */
- u8 mask = nocase ? CASE_CLEAR : 0xff;
- if ((buf_end[-1] & mask) == (u8)c1) {
- DEBUG_PRINTF("partial!!!\n");
- return buf_end - 1;
- }
-
- return buf_end;
- }
-#endif
-
- assert((buf_end - buf) >= VERM_BOUNDARY);
- uintptr_t min = (uintptr_t)buf % VERM_BOUNDARY;
- if (min) {
- // Input isn't aligned, so we need to run one iteration with an
- // unaligned load, then skip buf forward to the next aligned address.
- // There's some small overlap here, but we don't mind scanning it twice
- // if we can do it quickly, do we?
- const u8 *ptr = nocase
- ? dvermPreconditionNocase(chars1, chars2, buf)
- : dvermPrecondition(chars1, chars2, buf);
- if (ptr) {
- return ptr;
- }
-
- buf += VERM_BOUNDARY - min;
- assert(buf < buf_end);
- }
-
- // Aligned loops from here on in
- const u8 *ptr = nocase ? dvermSearchAlignedNocase(chars1, chars2, c1, c2,
- buf, buf_end)
- : dvermSearchAligned(chars1, chars2, c1, c2, buf,
- buf_end);
- if (ptr) {
- return ptr;
- }
-
- // Tidy up the mess at the end
- ptr = nocase ? dvermPreconditionNocase(chars1, chars2,
- buf_end - VERM_BOUNDARY)
- : dvermPrecondition(chars1, chars2, buf_end - VERM_BOUNDARY);
-
- if (ptr) {
- return ptr;
- }
-
- /* check for partial match at end */
- u8 mask = nocase ? CASE_CLEAR : 0xff;
- if ((buf_end[-1] & mask) == (u8)c1) {
- DEBUG_PRINTF("partial!!!\n");
- return buf_end - 1;
- }
-
- return buf_end;
-}
-
static really_inline
const u8 *vermicelliDoubleMaskedExec(char c1, char c2, char m1, char m2,
const u8 *buf, const u8 *buf_end) {
return buf_end;
}
-/* returns highest offset of c2 (NOTE: not c1) */
-static really_inline
-const u8 *rvermicelliDoubleExec(char c1, char c2, char nocase, const u8 *buf,
- const u8 *buf_end) {
- DEBUG_PRINTF("rev double verm scan %s\\x%02hhx%02hhx over %zu bytes\n",
- nocase ? "nocase " : "", c1, c2, (size_t)(buf_end - buf));
- assert(buf < buf_end);
-
- VERM_TYPE chars1 = VERM_SET_FN(c1); /* nocase already uppercase */
- VERM_TYPE chars2 = VERM_SET_FN(c2); /* nocase already uppercase */
-
-#ifdef HAVE_AVX512
- if (buf_end - buf <= VERM_BOUNDARY) {
- const u8 *ptr = nocase
- ? rdvermMiniNocase(chars1, chars2, buf, buf_end)
- : rdvermMini(chars1, chars2, buf, buf_end);
-
- if (ptr) {
- return ptr;
- }
-
- // check for partial match at end ???
- return buf - 1;
- }
-#endif
-
- assert((buf_end - buf) >= VERM_BOUNDARY);
- size_t min = (size_t)buf_end % VERM_BOUNDARY;
- if (min) {
- // input not aligned, so we need to run one iteration with an unaligned
- // load, then skip buf forward to the next aligned address. There's
- // some small overlap here, but we don't mind scanning it twice if we
- // can do it quickly, do we?
- const u8 *ptr = nocase ? rdvermPreconditionNocase(chars1, chars2,
- buf_end - VERM_BOUNDARY)
- : rdvermPrecondition(chars1, chars2,
- buf_end - VERM_BOUNDARY);
-
- if (ptr) {
- return ptr;
- }
-
- buf_end -= min;
- if (buf >= buf_end) {
- return buf_end;
- }
- }
-
- // Aligned loops from here on in
- if (nocase) {
- return rdvermSearchAlignedNocase(chars1, chars2, c1, c2, buf, buf_end);
- } else {
- return rdvermSearchAligned(chars1, chars2, c1, c2, buf, buf_end);
- }
-}
-
#endif /* VERMICELLI_H */
#define VERM_TYPE m128
#define VERM_SET_FN set1_16x8
+// returns NULL if not found
static really_inline
-const u8 *lastMatchOffset(const u8 *buf_end, u32 z) {
- assert(z);
- return buf_end - 16 + 31 - clz32(z);
-}
-
-static really_inline
-const u8 *dvermSearchAligned(m128 chars1, m128 chars2, u8 c1, u8 c2,
- const u8 *buf, const u8 *buf_end) {
- for (; buf + 16 < buf_end; buf += 16) {
- m128 data = load128(buf);
- u32 z = movemask128(and128(eq128(chars1, data),
- rshiftbyte_m128(eq128(chars2, data), 1)));
- if (buf[15] == c1 && buf[16] == c2) {
- z |= (1 << 15);
- }
- if (unlikely(z)) {
- u32 pos = ctz32(z);
- return buf + pos;
- }
- }
-
- return NULL;
-}
-
-static really_inline
-const u8 *dvermSearchAlignedNocase(m128 chars1, m128 chars2, u8 c1, u8 c2,
- const u8 *buf, const u8 *buf_end) {
- assert((size_t)buf % 16 == 0);
- m128 casemask = set1_16x8(CASE_CLEAR);
+const u8 *dvermPreconditionMasked(m128 chars1, m128 chars2,
+ m128 mask1, m128 mask2, const u8 *buf) {
+ m128 data = loadu128(buf); // unaligned
+ m128 v1 = eq128(chars1, and128(data, mask1));
+ m128 v2 = eq128(chars2, and128(data, mask2));
+ u32 z = movemask128(and128(v1, rshiftbyte_m128(v2, 1)));
- for (; buf + 16 < buf_end; buf += 16) {
- m128 data = load128(buf);
- m128 v = and128(casemask, data);
- u32 z = movemask128(and128(eq128(chars1, v),
- rshiftbyte_m128(eq128(chars2, v), 1)));
- if ((buf[15] & CASE_CLEAR) == c1 && (buf[16] & CASE_CLEAR) == c2) {
- z |= (1 << 15);
- }
- if (unlikely(z)) {
- u32 pos = ctz32(z);
- return buf + pos;
- }
+ /* no fixup of the boundary required - the aligned run will pick it up */
+ if (unlikely(z)) {
+ u32 pos = ctz32(z);
+ return buf + pos;
}
-
return NULL;
}
}
}
- return NULL;
-}
-
-// returns NULL if not found
-static really_inline
-const u8 *dvermPrecondition(m128 chars1, m128 chars2, const u8 *buf) {
- m128 data = loadu128(buf); // unaligned
- u32 z = movemask128(and128(eq128(chars1, data),
- rshiftbyte_m128(eq128(chars2, data), 1)));
-
- /* no fixup of the boundary required - the aligned run will pick it up */
- if (unlikely(z)) {
- u32 pos = ctz32(z);
- return buf + pos;
- }
- return NULL;
-}
-
-// returns NULL if not found
-static really_inline
-const u8 *dvermPreconditionNocase(m128 chars1, m128 chars2, const u8 *buf) {
- /* due to laziness, nonalphas and nocase having interesting behaviour */
- m128 casemask = set1_16x8(CASE_CLEAR);
- m128 data = loadu128(buf); // unaligned
- m128 v = and128(casemask, data);
- u32 z = movemask128(and128(eq128(chars1, v),
- rshiftbyte_m128(eq128(chars2, v), 1)));
-
- /* no fixup of the boundary required - the aligned run will pick it up */
- if (unlikely(z)) {
- u32 pos = ctz32(z);
- return buf + pos;
- }
- return NULL;
-}
-
-// returns NULL if not found
-static really_inline
-const u8 *dvermPreconditionMasked(m128 chars1, m128 chars2,
- m128 mask1, m128 mask2, const u8 *buf) {
- m128 data = loadu128(buf); // unaligned
- m128 v1 = eq128(chars1, and128(data, mask1));
- m128 v2 = eq128(chars2, and128(data, mask2));
- u32 z = movemask128(and128(v1, rshiftbyte_m128(v2, 1)));
-
- /* no fixup of the boundary required - the aligned run will pick it up */
- if (unlikely(z)) {
- u32 pos = ctz32(z);
- return buf + pos;
- }
- return NULL;
-}
-
-static really_inline
-const u8 *rdvermSearchAligned(m128 chars1, m128 chars2, u8 c1, u8 c2,
- const u8 *buf, const u8 *buf_end) {
- assert((size_t)buf_end % 16 == 0);
-
- for (; buf + 16 < buf_end; buf_end -= 16) {
- m128 data = load128(buf_end - 16);
- u32 z = movemask128(and128(eq128(chars2, data),
- lshiftbyte_m128(eq128(chars1, data), 1)));
- if (buf_end[-17] == c1 && buf_end[-16] == c2) {
- z |= 1;
- }
- if (unlikely(z)) {
- return lastMatchOffset(buf_end, z);
- }
- }
- return buf_end;
-}
-
-static really_inline
-const u8 *rdvermSearchAlignedNocase(m128 chars1, m128 chars2, u8 c1, u8 c2,
- const u8 *buf, const u8 *buf_end) {
- assert((size_t)buf_end % 16 == 0);
- m128 casemask = set1_16x8(CASE_CLEAR);
-
- for (; buf + 16 < buf_end; buf_end -= 16) {
- m128 data = load128(buf_end - 16);
- m128 v = and128(casemask, data);
- u32 z = movemask128(and128(eq128(chars2, v),
- lshiftbyte_m128(eq128(chars1, v), 1)));
- if ((buf_end[-17] & CASE_CLEAR) == c1
- && (buf_end[-16] & CASE_CLEAR) == c2) {
- z |= 1;
- }
- if (unlikely(z)) {
- return lastMatchOffset(buf_end, z);
- }
- }
- return buf_end;
-}
-
-// returns NULL if not found
-static really_inline
-const u8 *rdvermPrecondition(m128 chars1, m128 chars2, const u8 *buf) {
- m128 data = loadu128(buf);
- u32 z = movemask128(and128(eq128(chars2, data),
- lshiftbyte_m128(eq128(chars1, data), 1)));
-
- /* no fixup of the boundary required - the aligned run will pick it up */
- if (unlikely(z)) {
- return lastMatchOffset(buf + 16, z);
- }
-
- return NULL;
-}
-
-// returns NULL if not found
-static really_inline
-const u8 *rdvermPreconditionNocase(m128 chars1, m128 chars2, const u8 *buf) {
- /* due to laziness, nonalphas and nocase having interesting behaviour */
- m128 casemask = set1_16x8(CASE_CLEAR);
- m128 data = loadu128(buf);
- m128 v = and128(casemask, data);
- u32 z = movemask128(and128(eq128(chars2, v),
- lshiftbyte_m128(eq128(chars1, v), 1)));
- /* no fixup of the boundary required - the aligned run will pick it up */
- if (unlikely(z)) {
- return lastMatchOffset(buf + 16, z);
- }
-
return NULL;
}
\ No newline at end of file
return NULL;
}
+/* Map the highest set bit of z to an address: bit k of z corresponds to
+ * (buf_end - 16) + k. z must be non-zero. */
+static really_inline
+const u8 *lastMatchOffset(const u8 *buf_end, u32 z) {
+    assert(z);
+    const u32 leading_zeroes = clz32(z);
+    return buf_end - 16 + 31 - leading_zeroes;
+}
+
static really_inline
const u8 *rvermSearchAligned(m128 chars, const u8 *buf, const u8 *buf_end,
char negate) {
return NULL;
}
+/* Forward scan for the byte pair (c1, c2) in 16-byte steps.
+ *
+ * Each step compares one 16-byte load against chars1 and chars2 and ANDs the
+ * chars1 result with the chars2 result shifted by one byte, so a mask bit is
+ * set where a chars1 hit is directly followed by a chars2 hit. The pair that
+ * straddles the step boundary (buf[15], buf[16]) is tested with scalar
+ * compares against c1 and c2.
+ *
+ * Returns the address of the first byte of the earliest pair found, or NULL
+ * once 16 or fewer bytes remain before buf_end.
+ */
+static really_inline
+const u8 *dvermSearchAligned(m128 chars1, m128 chars2, u8 c1, u8 c2,
+                             const u8 *buf, const u8 *buf_end) {
+    /* same precondition the nocase variant checks before using load128 */
+    assert((size_t)buf % 16 == 0);
+
+    for (; buf + 16 < buf_end; buf += 16) {
+        m128 data = load128(buf);
+        u32 z = movemask128(and128(eq128(chars1, data),
+                                   rshiftbyte_m128(eq128(chars2, data), 1)));
+        /* pair split across this load and the next one */
+        if (buf[15] == c1 && buf[16] == c2) {
+            z |= (1 << 15);
+        }
+        if (unlikely(z)) {
+            const u8 *matchPos = buf + ctz32(z);
+            DEBUG_PRINTF("match pos %p\n", matchPos);
+            return matchPos;
+        }
+    }
+
+    return NULL;
+}
+
+/* Case-folding forward scan for the byte pair (c1, c2) in 16-byte steps.
+ * Every loaded byte is ANDed with CASE_CLEAR before comparison; the pair
+ * split across two steps is checked with scalar compares. Returns the address
+ * of the first byte of the earliest pair, or NULL once 16 or fewer bytes
+ * remain before buf_end. */
+static really_inline
+const u8 *dvermSearchAlignedNocase(m128 chars1, m128 chars2, u8 c1, u8 c2,
+                                   const u8 *buf, const u8 *buf_end) {
+    assert((size_t)buf % 16 == 0);
+    const m128 fold = set1_16x8(CASE_CLEAR);
+
+    while (buf + 16 < buf_end) {
+        const m128 folded = and128(fold, load128(buf));
+        const m128 first = eq128(chars1, folded);
+        const m128 second = rshiftbyte_m128(eq128(chars2, folded), 1);
+        u32 hits = movemask128(and128(first, second));
+
+        /* pair straddling this step and the next */
+        const int tail_is_c1 = (buf[15] & CASE_CLEAR) == c1;
+        if (tail_is_c1 && (buf[16] & CASE_CLEAR) == c2) {
+            hits |= (1 << 15);
+        }
+
+        if (unlikely(hits)) {
+            const u8 *matchPos = buf + ctz32(hits);
+            DEBUG_PRINTF("match pos %p\n", matchPos);
+            return matchPos;
+        }
+        buf += 16;
+    }
+
+    return NULL;
+}
+
+// Single unaligned 16-byte probe for a chars1 byte directly followed by a
+// chars2 byte. Returns the address of the first such pair, or NULL if the
+// probe finds none.
+static really_inline
+const u8 *dvermPrecondition(m128 chars1, m128 chars2, const u8 *buf) {
+    const m128 window = loadu128(buf); // unaligned
+    const m128 first = eq128(chars1, window);
+    const m128 second = rshiftbyte_m128(eq128(chars2, window), 1);
+    const u32 hits = movemask128(and128(first, second));
+
+    /* no fixup of the boundary required - the aligned run will pick it up */
+    if (!unlikely(hits)) {
+        return NULL;
+    }
+
+    const u8 *matchPos = buf + ctz32(hits);
+    DEBUG_PRINTF("match pos %p\n", matchPos);
+    return matchPos;
+}
+
+// Case-folding variant of the single unaligned 16-byte probe: returns the
+// address of the first chars1/chars2 pair, or NULL if the probe finds none.
+static really_inline
+const u8 *dvermPreconditionNocase(m128 chars1, m128 chars2, const u8 *buf) {
+    /* CASE_CLEAR is applied to every byte, alphabetic or not, so
+     * non-alphabetic bytes are altered by the fold as well */
+    const m128 fold = set1_16x8(CASE_CLEAR);
+    const m128 folded = and128(fold, loadu128(buf)); // unaligned load
+    const m128 first = eq128(chars1, folded);
+    const m128 second = rshiftbyte_m128(eq128(chars2, folded), 1);
+    const u32 hits = movemask128(and128(first, second));
+
+    /* no fixup of the boundary required - the aligned run will pick it up */
+    if (!unlikely(hits)) {
+        return NULL;
+    }
+
+    const u8 *matchPos = buf + ctz32(hits);
+    DEBUG_PRINTF("match pos %p\n", matchPos);
+    return matchPos;
+}
+
+
+/* Reverse scan for the byte pair (c1, c2), stepping buf_end down 16 bytes at
+ * a time. A mask bit is set where a chars2 hit is directly preceded by a
+ * chars1 hit; the pair split across the lower edge of the window
+ * (buf_end[-17], buf_end[-16]) is checked with scalar compares. On a hit the
+ * result comes from lastMatchOffset(); otherwise the function returns the
+ * value buf_end has when the loop stops. */
+static really_inline
+const u8 *rdvermSearchAligned(m128 chars1, m128 chars2, u8 c1, u8 c2,
+                              const u8 *buf, const u8 *buf_end) {
+    assert((size_t)buf_end % 16 == 0);
+
+    while (buf + 16 < buf_end) {
+        const m128 window = load128(buf_end - 16);
+        const m128 second = eq128(chars2, window);
+        const m128 first = lshiftbyte_m128(eq128(chars1, window), 1);
+        u32 hits = movemask128(and128(second, first));
+
+        /* pair straddling this window and the one below it */
+        if (buf_end[-17] == c1 && buf_end[-16] == c2) {
+            hits |= 1;
+        }
+
+        if (unlikely(hits)) {
+            const u8 *matchPos = lastMatchOffset(buf_end, hits);
+            DEBUG_PRINTF("match pos %p\n", matchPos);
+            return matchPos;
+        }
+        buf_end -= 16;
+    }
+    return buf_end;
+}
+
+/* Case-folding reverse scan for the byte pair (c1, c2), stepping buf_end down
+ * 16 bytes at a time. Bytes are ANDed with CASE_CLEAR before comparison. On a
+ * hit the result comes from lastMatchOffset(); otherwise the function returns
+ * the value buf_end has when the loop stops. */
+static really_inline
+const u8 *rdvermSearchAlignedNocase(m128 chars1, m128 chars2, u8 c1, u8 c2,
+                                    const u8 *buf, const u8 *buf_end) {
+    assert((size_t)buf_end % 16 == 0);
+    const m128 fold = set1_16x8(CASE_CLEAR);
+
+    while (buf + 16 < buf_end) {
+        const m128 folded = and128(fold, load128(buf_end - 16));
+        const m128 second = eq128(chars2, folded);
+        const m128 first = lshiftbyte_m128(eq128(chars1, folded), 1);
+        u32 hits = movemask128(and128(second, first));
+
+        /* pair straddling this window and the one below it */
+        const int below_is_c1 = (buf_end[-17] & CASE_CLEAR) == c1;
+        if (below_is_c1 && (buf_end[-16] & CASE_CLEAR) == c2) {
+            hits |= 1;
+        }
+
+        if (unlikely(hits)) {
+            const u8 *matchPos = lastMatchOffset(buf_end, hits);
+            DEBUG_PRINTF("match pos %p\n", matchPos);
+            return matchPos;
+        }
+        buf_end -= 16;
+    }
+    return buf_end;
+}
+
+// Single unaligned 16-byte probe used by the reverse scan: looks for a chars2
+// byte directly preceded by a chars1 byte. Returns the position reported by
+// lastMatchOffset() for the highest hit, or NULL if the probe finds none.
+static really_inline
+const u8 *rdvermPrecondition(m128 chars1, m128 chars2, const u8 *buf) {
+    const m128 window = loadu128(buf);
+    const m128 second = eq128(chars2, window);
+    const m128 first = lshiftbyte_m128(eq128(chars1, window), 1);
+    const u32 hits = movemask128(and128(second, first));
+
+    /* no fixup of the boundary required - the aligned run will pick it up */
+    if (!unlikely(hits)) {
+        return NULL;
+    }
+
+    const u8 *matchPos = lastMatchOffset(buf + 16, hits);
+    DEBUG_PRINTF("match pos %p\n", matchPos);
+    return matchPos;
+}
+
+// Case-folding variant of the reverse-scan probe: one unaligned 16-byte load,
+// ANDed with CASE_CLEAR, searched for a chars2 byte directly preceded by a
+// chars1 byte. Returns the position reported by lastMatchOffset() for the
+// highest hit, or NULL if the probe finds none.
+static really_inline
+const u8 *rdvermPreconditionNocase(m128 chars1, m128 chars2, const u8 *buf) {
+    /* CASE_CLEAR is applied to every byte, alphabetic or not, so
+     * non-alphabetic bytes are altered by the fold as well */
+    const m128 fold = set1_16x8(CASE_CLEAR);
+    const m128 folded = and128(fold, loadu128(buf));
+    const m128 second = eq128(chars2, folded);
+    const m128 first = lshiftbyte_m128(eq128(chars1, folded), 1);
+    const u32 hits = movemask128(and128(second, first));
+
+    /* no fixup of the boundary required - the aligned run will pick it up */
+    if (!unlikely(hits)) {
+        return NULL;
+    }
+
+    const u8 *matchPos = lastMatchOffset(buf + 16, hits);
+    DEBUG_PRINTF("match pos %p\n", matchPos);
+    return matchPos;
+}
+
#else // HAVE_AVX512
#define VERM_BOUNDARY 64
ptr = nocase ? rvermUnalignNocase(chars, buf, 1)
: rvermUnalign(chars, buf, 1);
return ptr ? ptr : buf - 1;
+}
+
+/* Forward scan of [buf, buf_end) for the byte pair (c1, c2), optionally with
+ * case folding.
+ *
+ * Returns the address of the first pair found. If no pair is found, returns
+ * buf_end - 1 when the final byte alone equals c1 (after masking), which
+ * flags a possible partial match, and buf_end otherwise.
+ */
+static really_inline
+const u8 *vermicelliDoubleExec(char c1, char c2, char nocase, const u8 *buf,
+                               const u8 *buf_end) {
+    DEBUG_PRINTF("double verm scan %s\\x%02hhx%02hhx over %zu bytes\n",
+                 nocase ? "nocase " : "", c1, c2, (size_t)(buf_end - buf));
+    assert(buf < buf_end);
+
+    VERM_TYPE chars1 = VERM_SET_FN(c1); /* nocase already uppercase */
+    VERM_TYPE chars2 = VERM_SET_FN(c2); /* nocase already uppercase */
+    const u8 *hit = NULL;
+
+#ifdef HAVE_AVX512
+    if (buf_end - buf <= VERM_BOUNDARY) {
+        /* short input: handled entirely by the "mini" helpers */
+        if (nocase) {
+            hit = dvermMiniNocase(chars1, chars2, buf, buf_end);
+        } else {
+            hit = dvermMini(chars1, chars2, buf, buf_end);
+        }
+        if (hit) {
+            return hit;
+        }
+
+        /* check for partial match at end */
+        u8 short_mask = nocase ? CASE_CLEAR : 0xff;
+        if ((buf_end[-1] & short_mask) == (u8)c1) {
+            DEBUG_PRINTF("partial!!!\n");
+            return buf_end - 1;
+        }
+
+        return buf_end;
+    }
+#endif
+
+    assert((buf_end - buf) >= VERM_BOUNDARY);
+    uintptr_t misalign = (uintptr_t)buf % VERM_BOUNDARY;
+    if (misalign) {
+        // Start is not on a VERM_BOUNDARY multiple: probe once with an
+        // unaligned load, then advance buf to the next boundary. The probe
+        // and the aligned loop overlap slightly; rescanning those bytes is
+        // accepted.
+        if (nocase) {
+            hit = dvermPreconditionNocase(chars1, chars2, buf);
+        } else {
+            hit = dvermPrecondition(chars1, chars2, buf);
+        }
+        if (hit) {
+            return hit;
+        }
+
+        buf += VERM_BOUNDARY - misalign;
+        assert(buf < buf_end);
+    }
+
+    // Aligned loops from here on in
+    if (nocase) {
+        hit = dvermSearchAlignedNocase(chars1, chars2, c1, c2, buf, buf_end);
+    } else {
+        hit = dvermSearchAligned(chars1, chars2, c1, c2, buf, buf_end);
+    }
+    if (hit) {
+        return hit;
+    }
+
+    // Final unaligned probe over the last VERM_BOUNDARY bytes
+    const u8 *last_window = buf_end - VERM_BOUNDARY;
+    if (nocase) {
+        hit = dvermPreconditionNocase(chars1, chars2, last_window);
+    } else {
+        hit = dvermPrecondition(chars1, chars2, last_window);
+    }
+    if (hit) {
+        return hit;
+    }
+
+    /* check for partial match at end */
+    u8 mask = nocase ? CASE_CLEAR : 0xff;
+    if ((buf_end[-1] & mask) == (u8)c1) {
+        DEBUG_PRINTF("partial!!!\n");
+        return buf_end - 1;
+    }
+
+    return buf_end;
+}
+
+/* Reverse scan of [buf, buf_end) for the byte pair (c1, c2), optionally with
+ * case folding.
+ *
+ * returns highest offset of c2 (NOTE: not c1). When nothing is found the
+ * result is whatever the selected helper path yields: buf - 1 on the short
+ * AVX512 path, otherwise the (possibly reduced) buf_end reached by the
+ * aligned search.
+ */
+static really_inline
+const u8 *rvermicelliDoubleExec(char c1, char c2, char nocase, const u8 *buf,
+                                const u8 *buf_end) {
+    DEBUG_PRINTF("rev double verm scan %s\\x%02hhx%02hhx over %zu bytes\n",
+                 nocase ? "nocase " : "", c1, c2, (size_t)(buf_end - buf));
+    assert(buf < buf_end);
+
+    VERM_TYPE chars1 = VERM_SET_FN(c1); /* nocase already uppercase */
+    VERM_TYPE chars2 = VERM_SET_FN(c2); /* nocase already uppercase */
+    const u8 *hit = NULL;
+
+#ifdef HAVE_AVX512
+    if (buf_end - buf <= VERM_BOUNDARY) {
+        /* short input: handled entirely by the "mini" helpers */
+        if (nocase) {
+            hit = rdvermMiniNocase(chars1, chars2, buf, buf_end);
+        } else {
+            hit = rdvermMini(chars1, chars2, buf, buf_end);
+        }
+
+        // check for partial match at end ???
+        return hit ? hit : buf - 1;
+    }
+#endif
+
+    assert((buf_end - buf) >= VERM_BOUNDARY);
+    size_t overhang = (size_t)buf_end % VERM_BOUNDARY;
+    if (overhang) {
+        // End is not on a VERM_BOUNDARY multiple: probe the last
+        // VERM_BOUNDARY bytes once with an unaligned load, then pull buf_end
+        // back to the boundary below it. The probe and the aligned loop
+        // overlap slightly; rescanning those bytes is accepted.
+        const u8 *last_window = buf_end - VERM_BOUNDARY;
+        if (nocase) {
+            hit = rdvermPreconditionNocase(chars1, chars2, last_window);
+        } else {
+            hit = rdvermPrecondition(chars1, chars2, last_window);
+        }
+        if (hit) {
+            return hit;
+        }
+
+        buf_end -= overhang;
+        if (buf >= buf_end) {
+            return buf_end;
+        }
+    }
+
+    // Aligned loops from here on in
+    return nocase
+        ? rdvermSearchAlignedNocase(chars1, chars2, c1, c2, buf, buf_end)
+        : rdvermSearchAligned(chars1, chars2, c1, c2, buf, buf_end);
}
\ No newline at end of file
*/
static really_inline
-int vermSearchGetOffset(svbool_t matched) {
+uint64_t vermSearchGetOffset(svbool_t matched) {
return svcntp_b8(svptrue_b8(), svbrkb_z(svptrue_b8(), matched));
}
+/* Pick the smaller of the two first-match offsets. The offset derived from
+ * matched_rot is reduced by one before the comparison, so the result can be
+ * -1 (callers in this file load the rotated vector from buf - 1). */
+static really_inline
+int dvermSearchGetOffset(svbool_t matched, svbool_t matched_rot) {
+    const int plain = vermSearchGetOffset(matched);
+    const int rotated = vermSearchGetOffset(matched_rot) - 1;
+    if (rotated < plain) {
+        return rotated;
+    }
+    return plain;
+}
+
+/* Count the predicate lanes that precede the first active lane of the
+ * reversed predicate, i.e. the distance of the last match from the top of
+ * the vector. */
+static really_inline
+uint64_t rdvermSearchGetSingleOffset(svbool_t matched) {
+    svbool_t reversed = svrev_b8(matched);
+    svbool_t before_first = svbrkb_z(svptrue_b8(), reversed);
+    return svcntp_b8(svptrue_b8(), before_first);
+}
+
+/* Reverse-search counterpart of dvermSearchGetOffset: take the smaller of the
+ * two from-the-top offsets, with the one derived from matched_rot reduced by
+ * one first. All arithmetic is unsigned 64-bit. */
+static really_inline
+uint64_t rdvermSearchGetOffset(svbool_t matched, svbool_t matched_rot) {
+    const uint64_t plain = rdvermSearchGetSingleOffset(matched);
+    const uint64_t rotated = rdvermSearchGetSingleOffset(matched_rot) - 1;
+    if (rotated < plain) {
+        return rotated;
+    }
+    return plain;
+}
+
static really_inline
const u8 *vermSearchCheckMatched(const u8 *buf, svbool_t matched) {
if (unlikely(svptest_any(svptrue_b8(), matched))) {
return NULL;
}
+/* If the combined predicate `any` has an active lane, convert the two match
+ * predicates into the address of the earliest match relative to buf;
+ * otherwise return NULL. */
+static really_inline
+const u8 *dvermSearchCheckMatched(const u8 *buf, svbool_t matched,
+                                  svbool_t matched_rot, svbool_t any) {
+    if (!unlikely(svptest_any(svptrue_b8(), any))) {
+        return NULL;
+    }
+
+    const int first = dvermSearchGetOffset(matched, matched_rot);
+    const u8 *matchPos = buf + first;
+    DEBUG_PRINTF("match pos %p\n", matchPos);
+    return matchPos;
+}
+
+/* If the combined predicate `any` has an active lane, convert the two match
+ * predicates into an address: the vector length in bytes minus the
+ * from-the-top offset, added to buf. Otherwise return NULL. */
+static really_inline
+const u8 *rdvermSearchCheckMatched(const u8 *buf, svbool_t matched,
+                                   svbool_t matched_rot, svbool_t any) {
+    if (!unlikely(svptest_any(svptrue_b8(), any))) {
+        return NULL;
+    }
+
+    const uint64_t from_top = rdvermSearchGetOffset(matched, matched_rot);
+    const u8 *matchPos = buf + (svcntb() - from_top);
+    DEBUG_PRINTF("match pos %p\n", matchPos);
+    return matchPos;
+}
+
static really_inline
svbool_t singleMatched(svuint8_t chars, const u8 *buf, svbool_t pg,
bool negate, const int64_t vnum) {
}
}
+/* Load bytes from buf (under pg) and from buf_rot (under pg_rot), view each
+ * load as 16-bit elements and match them against chars. The two match
+ * predicates are stored through matched / matched_rot; the return value is
+ * their OR. */
+static really_inline
+svbool_t doubleMatched(svuint16_t chars, const u8 *buf, const u8 *buf_rot,
+                       svbool_t pg, svbool_t pg_rot, svbool_t * const matched,
+                       svbool_t * const matched_rot) {
+    svuint16_t pairs = svreinterpret_u16(svld1_u8(pg, buf));
+    svuint16_t pairs_rot = svreinterpret_u16(svld1_u8(pg_rot, buf_rot));
+
+    svbool_t hit = svmatch(pg, pairs, chars);
+    svbool_t hit_rot = svmatch(pg_rot, pairs_rot, chars);
+
+    *matched = hit;
+    *matched_rot = hit_rot;
+    return svorr_z(svptrue_b8(), hit, hit_rot);
+}
+
static really_inline
const u8 *vermSearchOnce(svuint8_t chars, const u8 *buf, const u8 *buf_end,
bool negate) {
return rvermSearchCheckMatched(buf, matched);
}
+/* Forward double-byte search over a span handled by one predicated pass.
+ * The plain load covers buf_end - buf bytes starting at buf; the rotated load
+ * starts one byte earlier and covers one byte more. Returns the match
+ * address or NULL. */
+static really_inline
+const u8 *dvermSearchOnce(svuint16_t chars, const u8 *buf, const u8 *buf_end) {
+    DEBUG_PRINTF("start %p end %p\n", buf, buf_end);
+    assert(buf < buf_end);
+    DEBUG_PRINTF("l = %td\n", buf_end - buf);
+
+    const int64_t span = buf_end - buf;
+    svbool_t lanes = svwhilelt_b8_s64(0, span);
+    svbool_t lanes_rot = svwhilele_b8_s64(0, span);
+
+    // Reading buf - 1 is in bounds: the caller has already stepped past the
+    // first byte of the buffer, so buf - 1 still lies inside it.
+    svbool_t hit, hit_rot;
+    svbool_t any = doubleMatched(chars, buf, buf - 1, lanes, lanes_rot,
+                                 &hit, &hit_rot);
+    return dvermSearchCheckMatched(buf, hit, hit_rot, any);
+}
+
+/* One full-vector step of the forward double-byte search: both the plain
+ * load (from buf) and the rotated load (from buf - 1) use an all-true
+ * predicate. Returns the match address or NULL. */
+static really_inline
+const u8 *dvermSearchLoopBody(svuint16_t chars, const u8 *buf) {
+    DEBUG_PRINTF("start %p end %p\n", buf, buf + svcntb());
+
+    // Reading buf - 1 is in bounds: the caller has already stepped past the
+    // first byte of the buffer, so buf - 1 still lies inside it.
+    const u8 *rot_start = buf - 1;
+    svbool_t hit, hit_rot;
+    svbool_t any = doubleMatched(chars, buf, rot_start, svptrue_b8(),
+                                 svptrue_b8(), &hit, &hit_rot);
+    return dvermSearchCheckMatched(buf, hit, hit_rot, any);
+}
+
+/* Reverse double-byte search over a span handled by one predicated pass.
+ * The plain load covers buf .. buf_end inclusive; the rotated load starts at
+ * buf + 1 and covers buf_end - buf bytes. Returns the match address or
+ * NULL. */
+static really_inline
+const u8 *rdvermSearchOnce(svuint16_t chars, const u8 *buf, const u8 *buf_end) {
+    DEBUG_PRINTF("start %p end %p\n", buf, buf_end);
+    assert(buf < buf_end);
+
+    DEBUG_PRINTF("l = %td\n", buf_end - buf);
+
+    // The byte at buf_end is readable: the caller has already pulled buf_end
+    // back from the last byte of the buffer. Both loads must include it,
+    // because it has to be able to form the upper 8 bits of a 16-bit element
+    // that is matched.
+    const int64_t span = buf_end - buf;
+    svbool_t lanes = svwhilele_b8_s64(0, span);
+    svbool_t lanes_rot = svwhilelt_b8_s64(0, span);
+
+    svbool_t hit, hit_rot;
+    svbool_t any = doubleMatched(chars, buf, buf + 1, lanes, lanes_rot,
+                                 &hit, &hit_rot);
+    return rdvermSearchCheckMatched(buf, hit, hit_rot, any);
+}
+
+/* One full-vector step of the reverse double-byte search: both the plain
+ * load (from buf) and the rotated load (from buf + 1) use an all-true
+ * predicate. Returns the match address or NULL. */
+static really_inline
+const u8 *rdvermSearchLoopBody(svuint16_t chars, const u8 *buf) {
+    DEBUG_PRINTF("start %p end %p\n", buf, buf + svcntb());
+
+    // The rotated load reads the byte at buf + svcntb(). That is in bounds:
+    // the caller has already pulled the end back from the last byte of the
+    // buffer, so buf + svcntb() still lies inside it.
+    const u8 *rot_start = buf + 1;
+    svbool_t hit, hit_rot;
+    svbool_t any = doubleMatched(chars, buf, rot_start, svptrue_b8(),
+                                 svptrue_b8(), &hit, &hit_rot);
+    return rdvermSearchCheckMatched(buf, hit, hit_rot, any);
+}
+
static really_inline
const u8 *vermSearch(char c, bool nocase, const u8 *buf, const u8 *buf_end,
bool negate) {
return buf == buf_end ? NULL : rvermSearchLoopBody(chars, buf, negate);
}
+/* Forward search of [buf, buf_end) for the byte pair (c1, c2).
+ *
+ * Spans of at most one vector are handled by a single predicated pass.
+ * Longer spans get an unaligned head step (when buf is not already on the
+ * rounding boundary), a run of whole-vector steps, and - if bytes remain - a
+ * final step over the last svcntb() bytes. Returns the match address or
+ * NULL.
+ */
+static really_inline
+const u8 *dvermSearch(char c1, char c2, bool nocase, const u8 *buf,
+                      const u8 *buf_end) {
+    svuint16_t chars = getCharMaskDouble(c1, c2, nocase);
+    const uint64_t vec_bytes = svcntb();
+    size_t len = buf_end - buf;
+    if (len <= vec_bytes) {
+        return dvermSearchOnce(chars, buf, buf_end);
+    }
+
+    // head: one step from the original start when it is not on the boundary
+    const u8 *aligned_buf = ROUNDUP_PTR(buf, svcntb_pat(SV_POW2));
+    assert(aligned_buf < buf_end);
+    if (buf != aligned_buf) {
+        const u8 *head_hit = dvermSearchLoopBody(chars, buf);
+        if (head_hit) {
+            return head_hit;
+        }
+    }
+    buf = aligned_buf;
+
+    // middle: whole vectors
+    size_t loops = (buf_end - buf) / vec_bytes;
+    DEBUG_PRINTF("loops %zu \n", loops);
+    const u8 *loop_stop = buf + loops * vec_bytes;
+    while (buf != loop_stop) {
+        const u8 *step_hit = dvermSearchLoopBody(chars, buf);
+        if (step_hit) {
+            return step_hit;
+        }
+        buf += vec_bytes;
+    }
+    DEBUG_PRINTF("buf %p buf_end %p \n", buf, buf_end);
+
+    // tail: re-scan the last full vector of the span if anything is left
+    if (buf == buf_end) {
+        return NULL;
+    }
+    return dvermSearchLoopBody(chars, buf_end - vec_bytes);
+}
+
+/* Reverse search of [buf, buf_end) for the byte pair (c1, c2).
+ *
+ * Spans of at most one vector are handled by a single predicated pass.
+ * Longer spans get an unaligned step over the last svcntb() bytes (when
+ * buf_end is not already on the rounding boundary), a run of whole-vector
+ * steps moving downwards, and - if bytes remain - a final step starting at
+ * buf. Returns the match address or NULL.
+ */
+static really_inline
+const u8 *rdvermSearch(char c1, char c2, bool nocase, const u8 *buf,
+                       const u8 *buf_end) {
+    svuint16_t chars = getCharMaskDouble(c1, c2, nocase);
+    const uint64_t vec_bytes = svcntb();
+    size_t len = buf_end - buf;
+    if (len <= vec_bytes) {
+        return rdvermSearchOnce(chars, buf, buf_end);
+    }
+
+    // tail: one step over the trailing bytes when the end is not on the
+    // boundary
+    const u8 *aligned_buf_end = ROUNDDOWN_PTR(buf_end, svcntb_pat(SV_POW2));
+    assert(buf < aligned_buf_end);
+    if (buf_end != aligned_buf_end) {
+        const u8 *tail_hit = rdvermSearchLoopBody(chars, buf_end - vec_bytes);
+        if (tail_hit) {
+            return tail_hit;
+        }
+    }
+    buf_end = aligned_buf_end;
+
+    // middle: whole vectors, highest first
+    size_t loops = (buf_end - buf) / vec_bytes;
+    DEBUG_PRINTF("loops %zu \n", loops);
+    for (size_t left = loops; left > 0; left--) {
+        buf_end -= vec_bytes;
+        const u8 *step_hit = rdvermSearchLoopBody(chars, buf_end);
+        if (step_hit) {
+            return step_hit;
+        }
+    }
+    DEBUG_PRINTF("buf %p buf_end %p \n", buf, buf_end);
+
+    // head: scan from the very start of the span if anything is left
+    if (buf == buf_end) {
+        return NULL;
+    }
+    return rdvermSearchLoopBody(chars, buf);
+}
+
static really_inline
const u8 *vermicelliExec(char c, bool nocase, const u8 *buf,
const u8 *buf_end) {
nocase ? "nocase " : "", c, buf_end - buf);
const u8 *ptr = rvermSearch(c, nocase, buf, buf_end, true);
return ptr ? ptr : buf - 1;
+}
+
+/* Forward scan of [buf, buf_end) for the byte pair (c1, c2), optionally with
+ * case folding.
+ *
+ * Returns the address reported by dvermSearch() when it finds a pair. If it
+ * finds none (or the buffer holds a single byte), returns buf_end - 1 when
+ * the final byte alone equals c1 after masking, and buf_end otherwise.
+ */
+static really_inline
+const u8 *vermicelliDoubleExec(char c1, char c2, bool nocase, const u8 *buf,
+                               const u8 *buf_end) {
+    DEBUG_PRINTF("double verm scan %s\\x%02hhx%02hhx over %td bytes\n",
+                 nocase ? "nocase " : "", c1, c2, buf_end - buf);
+    assert(buf < buf_end);
+
+    if (buf_end - buf > 1) {
+        /* the search starts one byte in; its helpers read the byte before
+         * their start pointer */
+        const u8 *hit = dvermSearch(c1, c2, nocase, buf + 1, buf_end);
+        if (hit) {
+            return hit;
+        }
+    }
+
+    /* check for partial match at end */
+    const u8 mask = nocase ? CASE_CLEAR : 0xff;
+    const u8 last = buf_end[-1] & mask;
+    if (last == (u8)c1) {
+        DEBUG_PRINTF("partial!!!\n");
+        return buf_end - 1;
+    }
+    return buf_end;
+}
+
+/* Reverse scan of [buf, buf_end) for the byte pair (c1, c2), optionally with
+ * case folding.
+ *
+ * returns highest offset of c2 (NOTE: not c1), or buf - 1 when rdvermSearch()
+ * finds nothing or the buffer holds a single byte.
+ */
+static really_inline
+const u8 *rvermicelliDoubleExec(char c1, char c2, bool nocase, const u8 *buf,
+                                const u8 *buf_end) {
+    DEBUG_PRINTF("rev double verm scan %s\\x%02hhx%02hhx over %td bytes\n",
+                 nocase ? "nocase " : "", c1, c2, buf_end - buf);
+    assert(buf < buf_end);
+
+    if (buf_end - buf > 1) {
+        /* the search ends one byte early; its helpers read the byte at
+         * their end pointer */
+        const u8 *hit = rdvermSearch(c1, c2, nocase, buf, buf_end - 1);
+        if (hit) {
+            return hit;
+        }
+    }
+    return buf - 1;
}
\ No newline at end of file
}
}
+/* Build the vector of 16-bit patterns to match for the byte pair (c0, c1),
+ * with c0 in the low byte of each pattern.
+ *
+ * Case-sensitive: every 16-bit element holds c0 | c1 << 8.
+ * Case-insensitive: each 64-bit element holds the four combinations of the
+ * two bytes with bit 0x20 cleared or set.
+ */
+static really_inline
+svuint16_t getCharMaskDouble(const u8 c0, const u8 c1, bool noCase) {
+    if (!noCase) {
+        return svdup_u16((uint16_t)(c0 | (c1 << 8)));
+    }
+
+    /* "clr" = bit 0x20 cleared, "set" = bit 0x20 set */
+    const uint64_t first_clr = c0 & 0xdf;
+    const uint64_t first_set = c0 | 0x20;
+    const uint64_t second_clr = c1 & 0xdf;
+    const uint64_t second_set = c1 | 0x20;
+
+    const uint64_t pair_clr_clr = first_clr | (second_clr << 8);
+    const uint64_t pair_clr_set = first_clr | (second_set << 8);
+    const uint64_t pair_set_clr = first_set | (second_clr << 8);
+    const uint64_t pair_set_set = first_set | (second_set << 8);
+
+    const uint64_t chars = pair_clr_clr
+                         | (pair_clr_set << 16)
+                         | (pair_set_clr << 32)
+                         | (pair_set_set << 48);
+    return svreinterpret_u16(svdup_u64(chars));
+}
+
#endif
#include <string.h> // for memcpy