Refactor Noodle to use the same loop as Shufti/Truffle; it is now at least 2x as fast

author Konstantinos Margaritis <konstantinos@vectorcamp.gr>

Thu, 14 Dec 2023 21:06:40 +0000 (23:06 +0200)

committer Konstantinos Margaritis <konma@vectorcamp.gr>

Thu, 21 Dec 2023 23:24:15 +0000 (23:24 +0000)
author Konstantinos Margaritis <konstantinos@vectorcamp.gr>
Thu, 14 Dec 2023 21:06:40 +0000 (23:06 +0200)
committer Konstantinos Margaritis <konma@vectorcamp.gr>
Thu, 21 Dec 2023 23:24:15 +0000 (23:24 +0000)
diff --git a/src/hwlm/noodle_engine_simd.hpp b/src/hwlm/noodle_engine_simd.hpp

index 8006bd79ff9d3cb16d9afdc2d602c03be35f68c4..91c72840d56a9ed60be0d7811e1df48319b88596 100644 (file)
--- a/src/hwlm/noodle_engine_simd.hpp
+++ b/src/hwlm/noodle_engine_simd.hpp
@@ -1,6 +1,6 @@
  /*
   * Copyright (c) 2017, Intel Corporation
- * Copyright (c) 2020-2021, VectorCamp PC
+ * Copyright (c) 2020-2023, VectorCamp PC
   *
   * Redistribution and use in source and binary forms, with or without
   * modification, are permitted provided that the following conditions are met:
@@ -34,7 +34,7 @@
  
  static really_really_inline
  hwlm_error_t single_zscan(const struct noodTable *n,const u8 *d, const u8 *buf,
-               Z_TYPE z, size_t len, const struct cb_info *cbi) {
+                          Z_TYPE z, size_t len, const struct cb_info *cbi) {
      while (unlikely(z)) {
          Z_TYPE pos = JOIN(findAndClearLSB_, Z_BITS)(&z) >> Z_POSSHIFT;
          size_t matchPos = d - buf + pos;
@@ -47,9 +47,10 @@ hwlm_error_t single_zscan(const struct noodTable *n,const u8 *d, const u8 *buf,
  
  static really_really_inline
  hwlm_error_t double_zscan(const struct noodTable *n,const u8 *d, const u8 *buf,
-               Z_TYPE z, size_t len, const struct cb_info *cbi) {
+                          Z_TYPE z, size_t len, const struct cb_info *cbi) {
      while (unlikely(z)) {
          Z_TYPE pos = JOIN(findAndClearLSB_, Z_BITS)(&z) >> Z_POSSHIFT;
+        DEBUG_PRINTF("pos %u\n", pos);
          size_t matchPos = d - buf + pos - 1;
          DEBUG_PRINTF("match pos %zu\n", matchPos);
          hwlmcb_rv_t rv = final(n, buf, len, true, cbi, matchPos);
@@ -58,116 +59,6 @@ hwlm_error_t double_zscan(const struct noodTable *n,const u8 *d, const u8 *buf,
      return HWLM_SUCCESS;
  }
  
-
-template<uint16_t S>
-static really_inline
-hwlm_error_t scanSingleShort(const struct noodTable *n, const u8 *buf,
-                                 SuperVector<S> caseMask, SuperVector<S> mask1,
-                                 const struct cb_info *cbi, size_t len, size_t start,
-                                 size_t end) {
-    const u8 *d = buf + start;
-    DEBUG_PRINTF("start %zu end %zu\n", start, end);
-    const size_t l = end - start;
-    DEBUG_PRINTF("l = %ld\n", l);
-    //assert(l <= 64);
-    if (!l) {
-        return HWLM_SUCCESS;
-    }
-
-    SuperVector<S> v = SuperVector<S>::Zeroes();
-    memcpy(&v.u, d, l);
-
-    typename SuperVector<S>::comparemask_type mask =
-        SINGLE_LOAD_MASK(l * SuperVector<S>::mask_width());
-    v = v & caseMask;
-    typename SuperVector<S>::comparemask_type z = mask & mask1.eqmask(v);
-    z = SuperVector<S>::iteration_mask(z);
-
-    return single_zscan(n, d, buf, z, len, cbi);
-}
-
-// The short scan routine. It is used both to scan data up to an
-// alignment boundary if needed and to finish off data that the aligned scan
-// function can't handle (due to small/unaligned chunk at end)
-template<uint16_t S>
-static really_inline
-hwlm_error_t scanSingleUnaligned(const struct noodTable *n, const u8 *buf,
-                                 SuperVector<S> caseMask, SuperVector<S> mask1,
-                                 const struct cb_info *cbi, size_t len, size_t offset,
-                                     size_t start,
-                                 size_t end) {
-    const u8 *d = buf + offset;
-    DEBUG_PRINTF("start %zu end %zu offset %zu\n", start, end, offset);
-    const size_t l = end - start;
-    DEBUG_PRINTF("l = %ld\n", l);
-    assert(l <= 64);
-    if (!l) {
-        return HWLM_SUCCESS;
-    }
-    size_t buf_off = start - offset;
-    typename SuperVector<S>::comparemask_type mask =
-        SINGLE_LOAD_MASK(l * SuperVector<S>::mask_width())
-        << (buf_off * SuperVector<S>::mask_width());
-    SuperVector<S> v = SuperVector<S>::loadu(d) & caseMask;
-    typename SuperVector<S>::comparemask_type z = mask & mask1.eqmask(v);
-    z = SuperVector<S>::iteration_mask(z);
-
-    return single_zscan(n, d, buf, z, len, cbi);
-}
-
-template<uint16_t S>
-static really_inline
-hwlm_error_t scanDoubleShort(const struct noodTable *n, const u8 *buf,
-                                 SuperVector<S> caseMask, SuperVector<S> mask1, SuperVector<S> mask2,
-                                 const struct cb_info *cbi, size_t len, size_t start, size_t end) {
-    const u8 *d = buf + start;
-    DEBUG_PRINTF("start %zu end %zu\n", start, end);
-    const size_t l = end - start;
-    assert(l <= S);
-    if (!l) {
-        return HWLM_SUCCESS;
-    }
-    SuperVector<S> v = SuperVector<S>::Zeroes();
-    memcpy(&v.u, d, l);
-    v = v & caseMask;
-
-    typename SuperVector<S>::comparemask_type mask =
-        DOUBLE_LOAD_MASK(l * SuperVector<S>::mask_width());
-    typename SuperVector<S>::comparemask_type z1 = mask1.eqmask(v);
-    typename SuperVector<S>::comparemask_type z2 = mask2.eqmask(v);
-    typename SuperVector<S>::comparemask_type z =
-        mask & (z1 << (SuperVector<S>::mask_width())) & z2;
-    z = SuperVector<S>::iteration_mask(z);
-
-    return double_zscan(n, d, buf, z, len, cbi);
-}
-
-template<uint16_t S>
-static really_inline
-hwlm_error_t scanDoubleUnaligned(const struct noodTable *n, const u8 *buf,
-                                 SuperVector<S> caseMask, SuperVector<S> mask1, SuperVector<S> mask2,
-                                 const struct cb_info *cbi, size_t len, size_t offset, size_t start, size_t end) {
-    const u8 *d = buf + offset;
-    DEBUG_PRINTF("start %zu end %zu offset %zu\n", start, end, offset);
-    const size_t l = end - start;
-    assert(l <= S);
-    if (!l) {
-        return HWLM_SUCCESS;
-    }
-    SuperVector<S> v = SuperVector<S>::loadu(d) & caseMask;
-    size_t buf_off = start - offset;
-    typename SuperVector<S>::comparemask_type mask =
-        DOUBLE_LOAD_MASK(l * SuperVector<S>::mask_width())
-        << (buf_off * SuperVector<S>::mask_width());
-    typename SuperVector<S>::comparemask_type z1 = mask1.eqmask(v);
-    typename SuperVector<S>::comparemask_type z2 = mask2.eqmask(v);
-    typename SuperVector<S>::comparemask_type z =
-        mask & (z1 << SuperVector<S>::mask_width()) & z2;
-    z = SuperVector<S>::iteration_mask(z);
-
-    return double_zscan(n, d, buf, z, len, cbi);
-}
-
  template <uint16_t S>
  static really_inline
  hwlm_error_t scanSingleMain(const struct noodTable *n, const u8 *buf,
@@ -175,32 +66,36 @@ hwlm_error_t scanSingleMain(const struct noodTable *n, const u8 *buf,
                              SuperVector<S> caseMask, SuperVector<S> mask1,
                              const struct cb_info *cbi) {
      size_t start = offset + n->msk_len - 1;
-    size_t end = len;
  
      const u8 *d = buf + start;
-    const u8 *e = buf + end;
-    DEBUG_PRINTF("start %p end %p \n", d, e);
-    assert(d < e);
-    if (e - d < S) {
-      return scanSingleShort(n, buf, caseMask, mask1, cbi, len, start, end);
-    }
-    if (d + S <= e) {
-        // peel off first part to cacheline boundary
-        const u8 *d1 = ROUNDUP_PTR(d, S);
-        DEBUG_PRINTF("until aligned %p \n", d1);
-        if (scanSingleUnaligned(n, buf, caseMask, mask1, cbi, len, start, start, d1 - buf) == HWLM_TERMINATED) {
-            return HWLM_TERMINATED;
-        }
-        d = d1;
+    const u8 *buf_end = buf + len;
+    assert(d < buf_end);
+
+    DEBUG_PRINTF("noodle %p start %zu len %zu\n", buf, start, buf_end - buf);
+    DEBUG_PRINTF("b %s\n", buf);
+    DEBUG_PRINTF("start %p end %p \n", d, buf_end);
+
+    __builtin_prefetch(d + 16*64);
+    assert(d < buf_end);
+    if (d + S <= buf_end) {
+        // Reach vector aligned boundaries
+        DEBUG_PRINTF("until aligned %p \n", ROUNDUP_PTR(d, S));
+        if (!ISALIGNED_N(d, S)) {
+            const u8 *d1 = ROUNDUP_PTR(d, S);
+            DEBUG_PRINTF("d1 - d: %ld \n", d1 - d);
+            size_t l = d1 - d;
+            SuperVector<S> chars = SuperVector<S>::loadu(d) & caseMask;
+            typename SuperVector<S>::comparemask_type mask = SINGLE_LOAD_MASK(l * SuperVector<S>::mask_width());
+            typename SuperVector<S>::comparemask_type z = mask & mask1.eqmask(chars);
  
-        size_t loops = (end - (d - buf)) / S;
-        DEBUG_PRINTF("loops %ld \n", loops);
+            hwlm_error_t rv = single_zscan(n, d, buf, z, len, cbi);
+            RETURN_IF_TERMINATED(rv);
+            d = d1;
+        }
  
-        for (size_t i = 0; i < loops; i++, d+= S) {
+        while(d + S <= buf_end) {
+            __builtin_prefetch(d + 16*64);
              DEBUG_PRINTF("d %p \n", d);
-            const u8 *base = ROUNDUP_PTR(d, 64);
-            // On large packet buffers, this prefetch appears to get us about 2%.
-            __builtin_prefetch(base + 256);
  
              SuperVector<S> v = SuperVector<S>::load(d) & caseMask;
              typename SuperVector<S>::comparemask_type z = mask1.eqmask(v);
@@ -208,17 +103,23 @@ hwlm_error_t scanSingleMain(const struct noodTable *n, const u8 *buf,
  
              hwlm_error_t rv = single_zscan(n, d, buf, z, len, cbi);
              RETURN_IF_TERMINATED(rv);
+            d += S;
          }
      }
  
-    DEBUG_PRINTF("d %p e %p \n", d, e);
+    DEBUG_PRINTF("d %p e %p \n", d, buf_end);
      // finish off tail
-    size_t s2End = ROUNDDOWN_PTR(e, S) - buf;
-    if (s2End == end) {
-      return HWLM_SUCCESS;
+
+    if (d != buf_end) {
+        SuperVector<S> chars = SuperVector<S>::loadu(d) & caseMask;
+        size_t l = buf_end - d;
+        typename SuperVector<S>::comparemask_type mask = SINGLE_LOAD_MASK(l * SuperVector<S>::mask_width());
+        typename SuperVector<S>::comparemask_type z = mask & mask1.eqmask(chars);
+        hwlm_error_t rv = single_zscan(n, d, buf, z, len, cbi);
+        RETURN_IF_TERMINATED(rv);
      }
  
-    return scanSingleUnaligned(n, buf, caseMask, mask1, cbi, len, end - S, s2End, len);
+    return HWLM_SUCCESS;
  }
  
  template <uint16_t S>
@@ -227,66 +128,84 @@ hwlm_error_t scanDoubleMain(const struct noodTable *n, const u8 *buf,
                              size_t len, size_t offset,
                              SuperVector<S> caseMask, SuperVector<S> mask1, SuperVector<S> mask2,
                              const struct cb_info *cbi) {
-    // we stop scanning for the key-fragment when the rest of the key can't
-    // possibly fit in the remaining buffer
      size_t end = len - n->key_offset + 2;
-
      size_t start = offset + n->msk_len - n->key_offset;
  
+    const u8 *d = buf + start;
+    const u8 *buf_end = buf + end;
+    assert(d < buf_end);
+
+    DEBUG_PRINTF("noodle %p start %zu len %zu\n", buf, start, buf_end - buf);
+    DEBUG_PRINTF("b %s\n", buf);
+    DEBUG_PRINTF("start %p end %p \n", d, buf_end);
+
      typename SuperVector<S>::comparemask_type lastz1{0};
  
-    const u8 *d = buf + start;
-    const u8 *e = buf + end;
-    DEBUG_PRINTF("start %p end %p \n", d, e);
-    assert(d < e);
-    if (e - d < S) {
-      return scanDoubleShort(n, buf, caseMask, mask1, mask2, cbi, len, d - buf, end);
-    }
-    if (d + S <= e) {
-        // peel off first part to cacheline boundary
-        const u8 *d1 = ROUNDUP_PTR(d, S) + 1;
-        DEBUG_PRINTF("until aligned %p \n", d1);
-        if (scanDoubleUnaligned(n, buf, caseMask, mask1, mask2, cbi, len, start, start, d1 - buf) == HWLM_TERMINATED) {
-            return HWLM_TERMINATED;
-        }
-        d = d1 - 1;
+    __builtin_prefetch(d + 16*64);
+    assert(d < buf_end);
+    if (d + S <= buf_end) {
+        // Reach vector aligned boundaries
+        DEBUG_PRINTF("until aligned %p \n", ROUNDUP_PTR(d, S));
+        if (!ISALIGNED_N(d, S)) {
+            const u8 *d1 = ROUNDUP_PTR(d, S);
+            size_t l = d1 - d;
+            SuperVector<S> chars = SuperVector<S>::loadu(d) & caseMask;
+            typename SuperVector<S>::comparemask_type mask = DOUBLE_LOAD_MASK(l * SuperVector<S>::mask_width());
+            typename SuperVector<S>::comparemask_type z1 = mask1.eqmask(chars);
+            typename SuperVector<S>::comparemask_type z2 = mask2.eqmask(chars);
+            typename SuperVector<S>::comparemask_type z = mask & (z1 << SuperVector<S>::mask_width()) & z2;
+            lastz1 = z1 >> (Z_SHIFT * SuperVector<S>::mask_width());
+            z = SuperVector<S>::iteration_mask(z);
  
-        size_t loops = (end - (d - buf)) / S;
-        DEBUG_PRINTF("loops %ld \n", loops);
+            hwlm_error_t rv = double_zscan(n, d, buf, z, len, cbi);
+            RETURN_IF_TERMINATED(rv);
+            d = d1;
+        }
  
-        for (size_t i = 0; i < loops; i++, d+= S) {
+        while(d + S <= buf_end) {
+            __builtin_prefetch(d + 16*64);
              DEBUG_PRINTF("d %p \n", d);
-            const u8 *base = ROUNDUP_PTR(d, 64);
-            // On large packet buffers, this prefetch appears to get us about 2%.
-            __builtin_prefetch(base + 256);
  
-            SuperVector<S> v = SuperVector<S>::load(d) & caseMask;
-            typename SuperVector<S>::comparemask_type z1 = mask1.eqmask(v);
-            typename SuperVector<S>::comparemask_type z2 = mask2.eqmask(v);
-            typename SuperVector<S>::comparemask_type z =
-                (z1 << SuperVector<S>::mask_width() | lastz1) & z2;
+            SuperVector<S> chars = SuperVector<S>::load(d) & caseMask;
+            typename SuperVector<S>::comparemask_type z1 = mask1.eqmask(chars);
+            typename SuperVector<S>::comparemask_type z2 = mask2.eqmask(chars);
+            typename SuperVector<S>::comparemask_type z = (z1 << SuperVector<S>::mask_width() | lastz1) & z2;
              lastz1 = z1 >> (Z_SHIFT * SuperVector<S>::mask_width());
              z = SuperVector<S>::iteration_mask(z);
  
              hwlm_error_t rv = double_zscan(n, d, buf, z, len, cbi);
              RETURN_IF_TERMINATED(rv);
-        }
-        if (loops == 0) {
-          d = d1;
+            d += S;
          }
      }
+
+    DEBUG_PRINTF("d %p e %p \n", d, buf_end);
      // finish off tail
-    size_t s2End = ROUNDDOWN_PTR(e, S) - buf;
-    if (s2End == end) {
-      return HWLM_SUCCESS;
+
+    if (d != buf_end) {
+        size_t l = buf_end - d;
+        SuperVector<S> chars = SuperVector<S>::loadu(d) & caseMask;
+        typename SuperVector<S>::comparemask_type mask = DOUBLE_LOAD_MASK(l * SuperVector<S>::mask_width());
+        typename SuperVector<S>::comparemask_type z1 = mask1.eqmask(chars);
+        typename SuperVector<S>::comparemask_type z2 = mask2.eqmask(chars);
+        typename SuperVector<S>::comparemask_type z = mask & (z1 << SuperVector<S>::mask_width() | lastz1) & z2;
+        z = SuperVector<S>::iteration_mask(z);
+
+        hwlm_error_t rv = double_zscan(n, d, buf, z, len, cbi);
+        RETURN_IF_TERMINATED(rv);
      }
-    return scanDoubleUnaligned(n, buf, caseMask, mask1, mask2, cbi, len, end - S, d - buf, end);
+
+    return HWLM_SUCCESS;
  }
  
  // Single-character specialisation, used when keyLen = 1
  static really_inline
  hwlm_error_t scanSingle(const struct noodTable *n, const u8 *buf, size_t len,
                          size_t start, bool noCase, const struct cb_info *cbi) {
+/*    if (len < VECTORSIZE) {
+      return scanSingleSlow(n, buf, len, start, noCase, n->key0, cbi);
+    }*/
+
      if (!ourisalpha(n->key0)) {
          noCase = 0; // force noCase off if we don't have an alphabetic char
      }
author	Konstantinos Margaritis <konstantinos@vectorcamp.gr>
	Thu, 14 Dec 2023 21:06:40 +0000 (23:06 +0200)
committer	Konstantinos Margaritis <konma@vectorcamp.gr>
	Thu, 21 Dec 2023 23:24:15 +0000 (23:24 +0000)