]> git.ipfire.org Git - thirdparty/vectorscan.git/commitdiff
rose: add multi-path shufti 16x8, 32x8, 32x16, 64x8 and multi-path lookaround instruc...
authorXu, Chi <chi.xu@intel.com>
Thu, 30 Mar 2017 20:37:33 +0000 (04:37 +0800)
committerMatthew Barr <matthew.barr@intel.com>
Wed, 26 Apr 2017 05:18:56 +0000 (15:18 +1000)
12 files changed:
src/rose/program_runtime.h
src/rose/rose_build_bytecode.cpp
src/rose/rose_build_dump.cpp
src/rose/rose_build_lookaround.cpp
src/rose/rose_build_lookaround.h
src/rose/rose_build_program.cpp
src/rose/rose_build_program.h
src/rose/rose_common.h
src/rose/rose_internal.h
src/rose/rose_program.h
src/rose/validate_shufti.h
src/util/simd_utils.h

index 30ff85270d50056f67414e23409e20be53514e61..88c312d2dc2121f210679348276749950502a0e2 100644 (file)
@@ -857,13 +857,13 @@ u32 getBufferDataComplex(const struct core_info *ci, const s64a loc,
 }
 
 static rose_inline
-m128 getData128(const struct core_info *ci, s64a offset, u16 *valid_data_mask) {
+m128 getData128(const struct core_info *ci, s64a offset, u32 *valid_data_mask) {
     if (offset > 0 && offset + sizeof(m128) <= ci->len) {
         *valid_data_mask = 0xffff;
         return loadu128(ci->buf + offset);
     }
     ALIGN_DIRECTIVE u8 data[sizeof(m128)];
-    *valid_data_mask = (u16)getBufferDataComplex(ci, offset, data, 16);
+    *valid_data_mask = getBufferDataComplex(ci, offset, data, 16);
     return *(m128 *)data;
 }
 
@@ -892,7 +892,7 @@ int roseCheckShufti16x8(const struct core_info *ci, const u8 *nib_mask,
         return 0;
     }
 
-    u16 valid_data_mask = 0;
+    u32 valid_data_mask = 0;
     m128 data = getData128(ci, offset, &valid_data_mask);
     if (unlikely(!valid_data_mask)) {
         return 1;
@@ -924,7 +924,7 @@ int roseCheckShufti16x16(const struct core_info *ci, const u8 *hi_mask,
         return 0;
     }
 
-    u16 valid_data_mask = 0;
+    u32 valid_data_mask = 0;
     m128 data = getData128(ci, offset, &valid_data_mask);
     if (unlikely(!valid_data_mask)) {
         return 1;
@@ -1020,8 +1020,9 @@ int roseCheckShufti32x16(const struct core_info *ci, const u8 *hi_mask,
 static rose_inline
 int roseCheckSingleLookaround(const struct RoseEngine *t,
                               const struct hs_scratch *scratch,
-                              s8 checkOffset, u32 lookaroundIndex, u64a end) {
-    assert(lookaroundIndex != MO_INVALID_IDX);
+                              s8 checkOffset, u32 lookaroundReachIndex,
+                              u64a end) {
+    assert(lookaroundReachIndex != MO_INVALID_IDX);
     const struct core_info *ci = &scratch->core_info;
     DEBUG_PRINTF("end=%llu, buf_offset=%llu, buf_end=%llu\n", end,
                  ci->buf_offset, ci->buf_offset + ci->len);
@@ -1037,7 +1038,7 @@ int roseCheckSingleLookaround(const struct RoseEngine *t,
     }
 
     const u8 *reach_base = (const u8 *)t + t->lookaroundReachOffset;
-    const u8 *reach = reach_base + lookaroundIndex * REACH_BITVECTOR_LEN;
+    const u8 *reach = reach_base + lookaroundReachIndex;
 
     u8 c;
     if (offset >= 0 && offset < (s64a)ci->len) {
@@ -1063,9 +1064,11 @@ int roseCheckSingleLookaround(const struct RoseEngine *t,
  */
 static rose_inline
 int roseCheckLookaround(const struct RoseEngine *t,
-                        const struct hs_scratch *scratch, u32 lookaroundIndex,
+                        const struct hs_scratch *scratch,
+                        u32 lookaroundLookIndex, u32 lookaroundReachIndex,
                         u32 lookaroundCount, u64a end) {
-    assert(lookaroundIndex != MO_INVALID_IDX);
+    assert(lookaroundLookIndex != MO_INVALID_IDX);
+    assert(lookaroundReachIndex != MO_INVALID_IDX);
     assert(lookaroundCount > 0);
 
     const struct core_info *ci = &scratch->core_info;
@@ -1074,12 +1077,12 @@ int roseCheckLookaround(const struct RoseEngine *t,
 
     const u8 *base = (const u8 *)t;
     const s8 *look_base = (const s8 *)(base + t->lookaroundTableOffset);
-    const s8 *look = look_base + lookaroundIndex;
+    const s8 *look = look_base + lookaroundLookIndex;
     const s8 *look_end = look + lookaroundCount;
     assert(look < look_end);
 
     const u8 *reach_base = base + t->lookaroundReachOffset;
-    const u8 *reach = reach_base + lookaroundIndex * REACH_BITVECTOR_LEN;
+    const u8 *reach = reach_base + lookaroundReachIndex;
 
     // The following code assumes that the lookaround structures are ordered by
     // increasing offset.
@@ -1151,6 +1154,359 @@ int roseCheckLookaround(const struct RoseEngine *t,
     return 1;
 }
 
+/**
+ * \brief Trying to find a matching path by the corresponding path mask of
+ * every lookaround location.
+ */
+static rose_inline
+int roseMultipathLookaround(const struct RoseEngine *t,
+                            const struct hs_scratch *scratch,
+                            u32 multipathLookaroundLookIndex,
+                            u32 multipathLookaroundReachIndex,
+                            u32 multipathLookaroundCount,
+                            s32 last_start, const u8 *start_mask,
+                            u64a end) {
+    assert(multipathLookaroundCount > 0);
+
+    const struct core_info *ci = &scratch->core_info;
+    DEBUG_PRINTF("end=%llu, buf_offset=%llu, buf_end=%llu\n", end,
+                 ci->buf_offset, ci->buf_offset + ci->len);
+
+    const s8 *look_base = getByOffset(t, t->lookaroundTableOffset);
+    const s8 *look = look_base + multipathLookaroundLookIndex;
+    const s8 *look_end = look + multipathLookaroundCount;
+    assert(look < look_end);
+
+    const u8 *reach_base = getByOffset(t, t->lookaroundReachOffset);
+    const u8 *reach = reach_base + multipathLookaroundReachIndex;
+
+    const s64a base_offset = (s64a)end - ci->buf_offset;
+    DEBUG_PRINTF("base_offset=%lld\n", base_offset);
+
+    u8 path = 0xff;
+
+    assert(last_start < 0);
+
+    if (unlikely((u64a)(0 - last_start) > end)) {
+        DEBUG_PRINTF("too early, fail\n");
+        return 0;
+    }
+
+    u32 start_offset = 0;
+    do {
+        s64a offset = base_offset + *look;
+        DEBUG_PRINTF("start_mask[%u] = %x\n", start_offset,
+                     start_mask[start_offset]);
+        path = start_mask[start_offset];
+        if (offset >= -(s64a)ci->hlen) {
+            break;
+        }
+        DEBUG_PRINTF("look=%d before history\n", *look);
+        start_offset++;
+        look++;
+        reach += MULTI_REACH_BITVECTOR_LEN;
+    } while (look < look_end);
+
+    DEBUG_PRINTF("scan history (%zu looks left)\n", look_end - look);
+    for (; look < look_end; ++look, reach += MULTI_REACH_BITVECTOR_LEN) {
+        s64a offset = base_offset + *look;
+        DEBUG_PRINTF("reach=%p, rel offset=%lld\n", reach, offset);
+
+        if (offset >= 0) {
+            DEBUG_PRINTF("in buffer\n");
+            break;
+        }
+
+        assert(offset >= -(s64a)ci->hlen && offset < 0);
+        u8 c = ci->hbuf[ci->hlen + offset];
+        path &= reach[c];
+        DEBUG_PRINTF("reach[%x] = %02x path = %0xx\n", c, reach[c],  path);
+        if (!path) {
+            DEBUG_PRINTF("char 0x%02x failed reach check\n", c);
+            return 0;
+        }
+    }
+
+    DEBUG_PRINTF("scan buffer (%zu looks left)\n", look_end - look);
+    for(; look < look_end; ++look, reach += MULTI_REACH_BITVECTOR_LEN) {
+        s64a offset = base_offset + *look;
+        DEBUG_PRINTF("reach=%p, rel offset=%lld\n", reach, offset);
+
+        if (offset >= (s64a)ci->len) {
+            DEBUG_PRINTF("in the future\n");
+            break;
+        }
+
+        assert(offset >= 0 && offset < (s64a)ci->len);
+        u8 c = ci->buf[offset];
+        path &= reach[c];
+        DEBUG_PRINTF("reach[%x] = %02x path = %0xx\n", c, reach[c],  path);
+        if (!path) {
+            DEBUG_PRINTF("char 0x%02x failed reach check\n", c);
+            return 0;
+        }
+    }
+
+    DEBUG_PRINTF("OK :)\n");
+    return 1;
+}
+
+static never_inline
+int roseCheckMultipathShufti16x8(const struct hs_scratch *scratch,
+                       const struct ROSE_STRUCT_CHECK_MULTIPATH_SHUFTI_16x8 *ri,
+                                 u64a end) {
+    const struct core_info *ci = &scratch->core_info;
+    s32 checkOffset = ri->base_offset;
+    const s64a base_offset = (s64a)end - ci->buf_offset;
+    s64a offset = base_offset + checkOffset;
+    DEBUG_PRINTF("end %lld base_offset %lld\n", end, base_offset);
+    DEBUG_PRINTF("checkOffset %d offset %lld\n", checkOffset, offset);
+
+    assert(ri->last_start <= 0);
+    if (unlikely(checkOffset < 0 && (u64a)(0 - checkOffset) > end)) {
+        if ((u64a)(0 - ri->last_start) > end) {
+            DEBUG_PRINTF("too early, fail\n");
+            return 0;
+        }
+    }
+
+    u32 valid_data_mask;
+    m128 data_init = getData128(ci, offset, &valid_data_mask);
+    m128 data_select_mask = loadu128(ri->data_select_mask);
+
+    u32 valid_path_mask = 0;
+    if (unlikely(!(valid_data_mask & 1))) {
+        DEBUG_PRINTF("lose part of backward data\n");
+        DEBUG_PRINTF("valid_data_mask %x\n", valid_data_mask);
+
+        m128 expand_valid;
+        u64a expand_mask = 0x8080808080808080ULL;
+        u64a valid_lo = expand64(valid_data_mask & 0xff, expand_mask);
+        u64a valid_hi = expand64(valid_data_mask >> 8, expand_mask);
+        DEBUG_PRINTF("expand_hi %llx\n", valid_hi);
+        DEBUG_PRINTF("expand_lo %llx\n", valid_lo);
+        expand_valid = set64x2(valid_hi, valid_lo);
+        valid_path_mask = ~movemask128(pshufb(expand_valid,
+                                               data_select_mask));
+    }
+
+    m128 data = pshufb(data_init, data_select_mask);
+    m256 nib_mask = loadu256(ri->nib_mask);
+    m128 bucket_select_mask = loadu128(ri->bucket_select_mask);
+
+    u32 hi_bits_mask = ri->hi_bits_mask;
+    u32 lo_bits_mask = ri->lo_bits_mask;
+    u32 neg_mask = ri->neg_mask;
+
+    if (validateMultipathShuftiMask16x8(data, nib_mask,
+                                        bucket_select_mask,
+                                        hi_bits_mask, lo_bits_mask,
+                                        neg_mask, valid_path_mask)) {
+        DEBUG_PRINTF("check multi-path shufti-16x8 successfully\n");
+        return 1;
+    } else {
+        return 0;
+    }
+}
+
+static never_inline
+int roseCheckMultipathShufti32x8(const struct hs_scratch *scratch,
+                       const struct ROSE_STRUCT_CHECK_MULTIPATH_SHUFTI_32x8 *ri,
+                                 u64a end) {
+    const struct core_info *ci = &scratch->core_info;
+    s32 checkOffset = ri->base_offset;
+    const s64a base_offset = (s64a)end - ci->buf_offset;
+    s64a offset = base_offset + checkOffset;
+    DEBUG_PRINTF("end %lld base_offset %lld\n", end, base_offset);
+    DEBUG_PRINTF("checkOffset %d offset %lld\n", checkOffset, offset);
+
+    assert(ri->last_start <= 0);
+    if (unlikely(checkOffset < 0 && (u64a)(0 - checkOffset) > end)) {
+        if ((u64a)(0 - ri->last_start) > end) {
+            DEBUG_PRINTF("too early, fail\n");
+            return 0;
+        }
+    }
+
+    u32 valid_data_mask;
+    m128 data_m128 = getData128(ci, offset, &valid_data_mask);
+    m256 data_double = set2x128(data_m128);
+    m256 data_select_mask = loadu256(ri->data_select_mask);
+
+    u32 valid_path_mask = 0;
+    m256 expand_valid;
+    if (unlikely(!(valid_data_mask & 1))) {
+        DEBUG_PRINTF("lose part of backward data\n");
+        DEBUG_PRINTF("valid_data_mask %x\n", valid_data_mask);
+
+        u64a expand_mask = 0x8080808080808080ULL;
+        u64a valid_lo = expand64(valid_data_mask & 0xff, expand_mask);
+        u64a valid_hi = expand64(valid_data_mask >> 8, expand_mask);
+        DEBUG_PRINTF("expand_hi %llx\n", valid_hi);
+        DEBUG_PRINTF("expand_lo %llx\n", valid_lo);
+        expand_valid = set64x4(valid_hi, valid_lo, valid_hi,
+                                         valid_lo);
+        valid_path_mask = ~movemask256(vpshufb(expand_valid,
+                                               data_select_mask));
+    }
+
+    m256 data = vpshufb(data_double, data_select_mask);
+    m256 hi_mask = loadu2x128(ri->hi_mask);
+    m256 lo_mask = loadu2x128(ri->lo_mask);
+    m256 bucket_select_mask = loadu256(ri->bucket_select_mask);
+
+    u32 hi_bits_mask = ri->hi_bits_mask;
+    u32 lo_bits_mask = ri->lo_bits_mask;
+    u32 neg_mask = ri->neg_mask;
+
+    if (validateMultipathShuftiMask32x8(data, hi_mask, lo_mask,
+                                        bucket_select_mask,
+                                        hi_bits_mask, lo_bits_mask,
+                                        neg_mask, valid_path_mask)) {
+        DEBUG_PRINTF("check multi-path shufti-32x8 successfully\n");
+        return 1;
+    } else {
+        return 0;
+    }
+}
+
+static never_inline
+int roseCheckMultipathShufti32x16(const struct hs_scratch *scratch,
+                      const struct ROSE_STRUCT_CHECK_MULTIPATH_SHUFTI_32x16 *ri,
+                                  u64a end) {
+    const struct core_info *ci = &scratch->core_info;
+    const s64a base_offset = (s64a)end - ci->buf_offset;
+    s32 checkOffset = ri->base_offset;
+    s64a offset = base_offset + checkOffset;
+    DEBUG_PRINTF("end %lld base_offset %lld\n", end, base_offset);
+    DEBUG_PRINTF("checkOffset %d offset %lld\n", checkOffset, offset);
+
+    assert(ri->last_start <= 0);
+    if (unlikely(checkOffset < 0 && (u64a)(0 - checkOffset) > end)) {
+        if ((u64a)(0 - ri->last_start) > end) {
+            DEBUG_PRINTF("too early, fail\n");
+            return 0;
+        }
+    }
+
+    u32 valid_data_mask;
+    m128 data_m128 = getData128(ci, offset, &valid_data_mask);
+    m256 data_double = set2x128(data_m128);
+    m256 data_select_mask = loadu256(ri->data_select_mask);
+
+    u32 valid_path_mask = 0;
+    m256 expand_valid;
+    if (unlikely(!(valid_data_mask & 1))) {
+        DEBUG_PRINTF("lose part of backward data\n");
+        DEBUG_PRINTF("valid_data_mask %x\n", valid_data_mask);
+
+        u64a expand_mask = 0x8080808080808080ULL;
+        u64a valid_lo = expand64(valid_data_mask & 0xff, expand_mask);
+        u64a valid_hi = expand64(valid_data_mask >> 8, expand_mask);
+        DEBUG_PRINTF("expand_hi %llx\n", valid_hi);
+        DEBUG_PRINTF("expand_lo %llx\n", valid_lo);
+        expand_valid = set64x4(valid_hi, valid_lo, valid_hi,
+                                         valid_lo);
+        valid_path_mask = ~movemask256(vpshufb(expand_valid,
+                                               data_select_mask));
+    }
+
+    m256 data = vpshufb(data_double, data_select_mask);
+
+    m256 hi_mask_1 = loadu2x128(ri->hi_mask);
+    m256 hi_mask_2 = loadu2x128(ri->hi_mask + 16);
+    m256 lo_mask_1 = loadu2x128(ri->lo_mask);
+    m256 lo_mask_2 = loadu2x128(ri->lo_mask + 16);
+
+    m256 bucket_select_mask_hi = loadu256(ri->bucket_select_mask_hi);
+    m256 bucket_select_mask_lo = loadu256(ri->bucket_select_mask_lo);
+
+    u32 hi_bits_mask = ri->hi_bits_mask;
+    u32 lo_bits_mask = ri->lo_bits_mask;
+    u32 neg_mask = ri->neg_mask;
+
+    if (validateMultipathShuftiMask32x16(data, hi_mask_1, hi_mask_2,
+                                         lo_mask_1, lo_mask_2,
+                                         bucket_select_mask_hi,
+                                         bucket_select_mask_lo,
+                                         hi_bits_mask, lo_bits_mask,
+                                         neg_mask, valid_path_mask)) {
+        DEBUG_PRINTF("check multi-path shufti-32x16 successfully\n");
+        return 1;
+    } else {
+        return 0;
+    }
+}
+
+static never_inline
+int roseCheckMultipathShufti64(const struct hs_scratch *scratch,
+                         const struct ROSE_STRUCT_CHECK_MULTIPATH_SHUFTI_64 *ri,
+                               u64a end) {
+    const struct core_info *ci = &scratch->core_info;
+    const s64a base_offset = (s64a)end - ci->buf_offset;
+    s32 checkOffset = ri->base_offset;
+    s64a offset = base_offset + checkOffset;
+    DEBUG_PRINTF("end %lld base_offset %lld\n", end, base_offset);
+    DEBUG_PRINTF("checkOffset %d offset %lld\n", checkOffset, offset);
+
+    if (unlikely(checkOffset < 0 && (u64a)(0 - checkOffset) > end)) {
+        if ((u64a)(0 - ri->last_start) > end) {
+            DEBUG_PRINTF("too early, fail\n");
+            return 0;
+        }
+    }
+
+    u32 valid_data_mask;
+    m128 data_m128 = getData128(ci, offset, &valid_data_mask);
+    m256 data_m256 = set2x128(data_m128);
+    m256 data_select_mask_1 = loadu256(ri->data_select_mask);
+    m256 data_select_mask_2 = loadu256(ri->data_select_mask + 32);
+
+    u64a valid_path_mask = 0;
+    m256 expand_valid;
+    if (unlikely(!(valid_data_mask & 1))) {
+        DEBUG_PRINTF("lose part of backward data\n");
+        DEBUG_PRINTF("valid_data_mask %x\n", valid_data_mask);
+
+        u64a expand_mask = 0x8080808080808080ULL;
+        u64a valid_lo = expand64(valid_data_mask & 0xff, expand_mask);
+        u64a valid_hi = expand64(valid_data_mask >> 8, expand_mask);
+        DEBUG_PRINTF("expand_hi %llx\n", valid_hi);
+        DEBUG_PRINTF("expand_lo %llx\n", valid_lo);
+        expand_valid = set64x4(valid_hi, valid_lo, valid_hi,
+                                         valid_lo);
+        u32 valid_path_1 = movemask256(vpshufb(expand_valid,
+                                               data_select_mask_1));
+        u32 valid_path_2 = movemask256(vpshufb(expand_valid,
+                                               data_select_mask_2));
+        valid_path_mask = ~((u64a)valid_path_1 | (u64a)valid_path_2 << 32);
+    }
+
+    m256 data_1 = vpshufb(data_m256, data_select_mask_1);
+    m256 data_2 = vpshufb(data_m256, data_select_mask_2);
+
+    m256 hi_mask = loadu2x128(ri->hi_mask);
+    m256 lo_mask = loadu2x128(ri->lo_mask);
+
+    m256 bucket_select_mask_1 = loadu256(ri->bucket_select_mask);
+    m256 bucket_select_mask_2 = loadu256(ri->bucket_select_mask + 32);
+
+    u64a hi_bits_mask = ri->hi_bits_mask;
+    u64a lo_bits_mask = ri->lo_bits_mask;
+    u64a neg_mask = ri->neg_mask;
+
+    if (validateMultipathShuftiMask64(data_1, data_2, hi_mask, lo_mask,
+                                      bucket_select_mask_1,
+                                      bucket_select_mask_2, hi_bits_mask,
+                                      lo_bits_mask, neg_mask,
+                                      valid_path_mask)) {
+        DEBUG_PRINTF("check multi-path shufti-64 successfully\n");
+        return 1;
+    } else {
+        return 0;
+    }
+}
+
 int roseNfaEarliestSom(u64a start, u64a end, ReportID id, void *context);
 
 static rose_inline
@@ -1614,8 +1970,8 @@ hwlmcb_rv_t roseRunProgram_i(const struct RoseEngine *t,
             PROGRAM_NEXT_INSTRUCTION
 
             PROGRAM_CASE(CHECK_LOOKAROUND) {
-                if (!roseCheckLookaround(t, scratch, ri->index, ri->count,
-                                         end)) {
+                if (!roseCheckLookaround(t, scratch, ri->look_index,
+                                         ri->reach_index, ri->count, end)) {
                     DEBUG_PRINTF("failed lookaround check\n");
                     assert(ri->fail_jump); // must progress
                     pc += ri->fail_jump;
@@ -2172,6 +2528,59 @@ hwlmcb_rv_t roseRunProgram_i(const struct RoseEngine *t,
                 work_done = 0;
             }
             PROGRAM_NEXT_INSTRUCTION
+
+            PROGRAM_CASE(MULTIPATH_LOOKAROUND) {
+                if (!roseMultipathLookaround(t, scratch, ri->look_index,
+                                             ri->reach_index, ri->count,
+                                             ri->last_start, ri->start_mask,
+                                             end)) {
+                    DEBUG_PRINTF("failed multi-path lookaround check\n");
+                    assert(ri->fail_jump); // must progress
+                    pc += ri->fail_jump;
+                    continue;
+                }
+            }
+            PROGRAM_NEXT_INSTRUCTION
+
+            PROGRAM_CASE(CHECK_MULTIPATH_SHUFTI_16x8) {
+                if (!roseCheckMultipathShufti16x8(scratch, ri, end)) {
+                    DEBUG_PRINTF("failed multi-path shufti 16x8 check\n");
+                    assert(ri->fail_jump); // must progress
+                    pc += ri->fail_jump;
+                    continue;
+                }
+            }
+            PROGRAM_NEXT_INSTRUCTION
+
+            PROGRAM_CASE(CHECK_MULTIPATH_SHUFTI_32x8) {
+                if (!roseCheckMultipathShufti32x8(scratch, ri, end)) {
+                    DEBUG_PRINTF("failed multi-path shufti 32x8 check\n");
+                    assert(ri->fail_jump); // must progress
+                    pc += ri->fail_jump;
+                    continue;
+                }
+            }
+            PROGRAM_NEXT_INSTRUCTION
+
+            PROGRAM_CASE(CHECK_MULTIPATH_SHUFTI_32x16) {
+                if (!roseCheckMultipathShufti32x16(scratch, ri, end)) {
+                    DEBUG_PRINTF("failed multi-path shufti 32x16 check\n");
+                    assert(ri->fail_jump); // must progress
+                    pc += ri->fail_jump;
+                    continue;
+                }
+            }
+            PROGRAM_NEXT_INSTRUCTION
+
+            PROGRAM_CASE(CHECK_MULTIPATH_SHUFTI_64) {
+                if (!roseCheckMultipathShufti64(scratch, ri, end)) {
+                    DEBUG_PRINTF("failed multi-path shufti 64 check\n");
+                    assert(ri->fail_jump); // must progress
+                    pc += ri->fail_jump;
+                    continue;
+                }
+            }
+            PROGRAM_NEXT_INSTRUCTION
         }
     }
 
index 6ee08952e5d67fa396e65bf91fd659df643303f6..a0edc7118ffcf16dc1734bc18b1afb9702d44bc0 100644 (file)
@@ -82,6 +82,7 @@
 #include "util/compile_context.h"
 #include "util/compile_error.h"
 #include "util/container.h"
+#include "util/dump_charclass.h"
 #include "util/fatbit_build.h"
 #include "util/graph_range.h"
 #include "util/make_unique.h"
 #include <map>
 #include <queue>
 #include <set>
+#include <sstream>
 #include <string>
 #include <vector>
 #include <utility>
@@ -141,8 +143,8 @@ struct left_build_info {
           countingMiracleReach(cm_cr) {}
 
     // Constructor for a lookaround implementation.
-    explicit left_build_info(const vector<LookEntry> &look)
-        : has_lookaround(true), lookaround(look) {}
+    explicit left_build_info(const vector<vector<LookEntry>> &looks)
+        : has_lookaround(true), lookaround(looks) {}
 
     u32 queue = 0; /* uniquely idents the left_build_info */
     u32 lag = 0;
@@ -154,7 +156,7 @@ struct left_build_info {
     CharReach countingMiracleReach;
     u32 countingMiracleOffset = 0; /* populated later when laying out bytecode */
     bool has_lookaround = false;
-    vector<LookEntry> lookaround; // alternative implementation to the NFA
+    vector<vector<LookEntry>> lookaround; // alternative implementation to the NFA
 };
 
 /**
@@ -197,12 +199,22 @@ struct build_context : noncopyable {
     ue2::unordered_map<RoseProgram, u32, RoseProgramHash,
                        RoseProgramEquivalence> program_cache;
 
-    /** \brief LookEntry list cache, so that we don't have to go scanning
-     * through the full list to find cases we've used already. */
-    ue2::unordered_map<vector<LookEntry>, size_t> lookaround_cache;
+    /** \brief LookEntry list cache, so that we can reuse the look index and
+     * reach index for the same lookaround. */
+    ue2::unordered_map<vector<vector<LookEntry>>,
+                       pair<size_t, size_t>> lookaround_cache;
 
     /** \brief Lookaround table for Rose roles. */
-    vector<LookEntry> lookaround;
+    vector<vector<vector<LookEntry>>> lookaround;
+
+    /** \brief Lookaround look table size. */
+    size_t lookTableSize = 0;
+
+    /** \brief Lookaround reach table size.
+     * since single path lookaround and multi-path lookaround have different
+     * bitvectors range (32 and 256), we need to maintain both look table size
+     * and reach table size. */
+    size_t reachTableSize = 0;
 
     /** \brief State indices, for those roles that have them. */
     ue2::unordered_map<RoseVertex, u32> roleStateIndices;
@@ -1582,7 +1594,7 @@ bool buildLeftfixes(RoseBuildImpl &tbi, build_context &bc,
         // TODO: Handle SOM-tracking cases as well.
         if (cc.grey.roseLookaroundMasks && is_transient &&
             !g[v].left.tracksSom()) {
-            vector<LookEntry> lookaround;
+            vector<vector<LookEntry>> lookaround;
             if (makeLeftfixLookaround(tbi, v, lookaround)) {
                 DEBUG_PRINTF("implementing as lookaround!\n");
                 bc.leftfix_info.emplace(v, left_build_info(lookaround));
@@ -2651,15 +2663,7 @@ bool hasEodAnchors(const RoseBuildImpl &build, const build_context &bc,
 }
 
 static
-void writeLookaroundTables(build_context &bc, RoseEngine &proto) {
-    const auto &look_vec = bc.lookaround;
-    DEBUG_PRINTF("%zu lookaround table entries\n", look_vec.size());
-
-    vector<s8> look_table(look_vec.size(), 0);
-    vector<u8> reach_table(REACH_BITVECTOR_LEN * look_vec.size(), 0);
-
-    s8 *look = look_table.data();
-    u8 *reach = reach_table.data();
+void writeLookaround(const vector<LookEntry> &look_vec, s8 *&look, u8 *&reach) {
     for (const auto &le : look_vec) {
         *look = verify_s8(le.offset);
         const CharReach &cr = le.reach;
@@ -2670,6 +2674,52 @@ void writeLookaroundTables(build_context &bc, RoseEngine &proto) {
         ++look;
         reach += REACH_BITVECTOR_LEN;
     }
+}
+
+static
+void writeMultipathLookaround(const vector<vector<LookEntry>> &multi_look,
+                              s8 *&look, u8 *&reach) {
+    for (const auto &m : multi_look) {
+        u8 u = 0;
+        assert(m.size() == MAX_LOOKAROUND_PATHS);
+        for (size_t i = 0; i < m.size(); i++) {
+            if (m[i].reach.none()) {
+                u |= (u8)1U << i;
+            }
+        }
+        std::fill_n(reach, MULTI_REACH_BITVECTOR_LEN, u);
+
+        for (size_t i = 0; i < m.size(); i++) {
+            const CharReach &cr = m[i].reach;
+            if (cr.none()) {
+                continue;
+            }
+            *look = m[i].offset;
+
+            for (size_t c = cr.find_first(); c != cr.npos;
+                 c = cr.find_next(c)) {
+                reach[c] |= (u8)1U << i;
+            }
+        }
+
+        ++look;
+        reach += MULTI_REACH_BITVECTOR_LEN;
+    }
+}
+
+static
+void writeLookaroundTables(build_context &bc, RoseEngine &proto) {
+    vector<s8> look_table(bc.lookTableSize, 0);
+    vector<u8> reach_table(bc.reachTableSize, 0);
+    s8 *look = look_table.data();
+    u8 *reach = reach_table.data();
+    for (const auto &l : bc.lookaround) {
+        if (l.size() == 1) {
+            writeLookaround(l.front(), look, reach);
+        } else {
+            writeMultipathLookaround(l, look, reach);
+        }
+    }
 
     proto.lookaroundTableOffset = bc.engine_blob.add_range(look_table);
     proto.lookaroundReachOffset = bc.engine_blob.add_range(reach_table);
@@ -2804,30 +2854,37 @@ bool onlyAtEod(const RoseBuildImpl &tbi, RoseVertex v) {
 }
 
 static
-u32 addLookaround(build_context &bc, const vector<LookEntry> &look) {
+void addLookaround(build_context &bc,
+                   const vector<vector<LookEntry>> &look,
+                   u32 &look_index, u32 &reach_index) {
     // Check the cache.
     auto it = bc.lookaround_cache.find(look);
     if (it != bc.lookaround_cache.end()) {
-        DEBUG_PRINTF("reusing look at idx %zu\n", it->second);
-        return verify_u32(it->second);
+        look_index = verify_u32(it->second.first);
+        reach_index = verify_u32(it->second.second);
+        DEBUG_PRINTF("reusing look at idx %u\n", look_index);
+        DEBUG_PRINTF("reusing reach at idx %u\n", reach_index);
+        return;
     }
 
-    // Linear scan for sequence.
-    auto seq_it = search(begin(bc.lookaround), end(bc.lookaround), begin(look),
-                         end(look));
-    if (seq_it != end(bc.lookaround)) {
-        size_t idx = distance(begin(bc.lookaround), seq_it);
-        DEBUG_PRINTF("linear scan found look at idx %zu\n", idx);
-        bc.lookaround_cache.emplace(look, idx);
-        return verify_u32(idx);
+    size_t look_idx = bc.lookTableSize;
+    size_t reach_idx = bc.reachTableSize;
+
+    if (look.size() == 1) {
+        bc.lookTableSize += look.front().size();
+        bc.reachTableSize += look.front().size() * REACH_BITVECTOR_LEN;
+    } else {
+        bc.lookTableSize += look.size();
+        bc.reachTableSize += look.size() * MULTI_REACH_BITVECTOR_LEN;
     }
 
-    // New sequence.
-    size_t idx = bc.lookaround.size();
-    bc.lookaround_cache.emplace(look, idx);
-    insert(&bc.lookaround, bc.lookaround.end(), look);
-    DEBUG_PRINTF("adding look at idx %zu\n", idx);
-    return verify_u32(idx);
+    bc.lookaround_cache.emplace(look, make_pair(look_idx, reach_idx));
+    bc.lookaround.emplace_back(look);
+
+    DEBUG_PRINTF("adding look at idx %zu\n", look_idx);
+    DEBUG_PRINTF("adding reach at idx %zu\n", reach_idx);
+    look_index =  verify_u32(look_idx);
+    reach_index = verify_u32(reach_idx);
 }
 
 static
@@ -2977,7 +3034,7 @@ struct cmpNibble {
 // Insert all pairs of bucket and offset into buckets.
 static really_inline
 void getAllBuckets(const vector<LookEntry> &look,
-                 map<u32, vector<s8>, cmpNibble> &buckets, u32 &neg_mask) {
+                   map<u32, vector<s8>, cmpNibble> &buckets, u64a &neg_mask) {
     s32 base_offset = verify_s32(look.front().offset);
     for (const auto &entry : look) {
         CharReach cr = entry.reach;
@@ -2985,7 +3042,7 @@ void getAllBuckets(const vector<LookEntry> &look,
         if (cr.count() > 128 ) {
             cr.flip();
         } else {
-            neg_mask ^= 1 << (entry.offset - base_offset);
+            neg_mask ^= 1ULL << (entry.offset - base_offset);
         }
         map <u16, u16> lo2hi;
         // We treat Ascii Table as a 16x16 grid.
@@ -3037,23 +3094,16 @@ void nibMaskUpdate(array<u8, 32> &mask, u32 data, u8 bit_index) {
 }
 
 static
-bool makeRoleShufti(const vector<LookEntry> &look,
-                    RoseProgram &program) {
-
-    s32 base_offset = verify_s32(look.front().offset);
-    if (look.back().offset >= base_offset + 32) {
-        return false;
-    }
-    array<u8, 32> hi_mask, lo_mask;
-    hi_mask.fill(0);
-    lo_mask.fill(0);
-    array<u8, 32> bucket_select_hi, bucket_select_lo;
-    bucket_select_hi.fill(0); // will not be used in 16x8 and 32x8.
-    bucket_select_lo.fill(0);
-    u8 bit_index = 0; // number of buckets
+bool getShuftiMasks(const vector<LookEntry> &look, array<u8, 32> &hi_mask,
+                    array<u8, 32> &lo_mask, u8 *bucket_select_hi,
+                    u8 *bucket_select_lo, u64a &neg_mask,
+                    u8 &bit_idx, size_t len) {
     map<u32, u16> nib; // map every bucket to its bucket number.
     map<u32, vector<s8>, cmpNibble> bucket2offsets;
-    u32 neg_mask = ~0u;
+    s32 base_offset = look.front().offset;
+
+    bit_idx = 0;
+    neg_mask = ~0ULL;
 
     getAllBuckets(look, bucket2offsets, neg_mask);
 
@@ -3061,15 +3111,15 @@ bool makeRoleShufti(const vector<LookEntry> &look,
         u32 hi_lo = it.first;
         // New bucket.
         if (!nib[hi_lo]) {
-            if (bit_index >= 16) {
+            if ((bit_idx >= 8 && len == 64) || bit_idx >= 16) {
                 return false;
             }
-            nib[hi_lo] = 1 << bit_index;
+            nib[hi_lo] = 1 << bit_idx;
 
             nibUpdate(nib, hi_lo);
-            nibMaskUpdate(hi_mask, hi_lo >> 16, bit_index);
-            nibMaskUpdate(lo_mask, hi_lo & 0xffff, bit_index);
-            bit_index++;
+            nibMaskUpdate(hi_mask, hi_lo >> 16, bit_idx);
+            nibMaskUpdate(lo_mask, hi_lo & 0xffff, bit_idx);
+            bit_idx++;
         }
 
         DEBUG_PRINTF("hi_lo %x bucket %x\n", hi_lo, nib[hi_lo]);
@@ -3082,6 +3132,113 @@ bool makeRoleShufti(const vector<LookEntry> &look,
             bucket_select_lo[offset - base_offset] |= nib_lo;
         }
     }
+    return true;
+}
+
+static
+unique_ptr<RoseInstruction>
+makeCheckShufti16x8(u32 offset_range, u8 bucket_idx,
+                    const array<u8, 32> &hi_mask, const array<u8, 32> &lo_mask,
+                    const array<u8, 32> &bucket_select_mask,
+                    u32 neg_mask, s32 base_offset,
+                    const RoseInstruction *end_inst) {
+    if (offset_range > 16 || bucket_idx > 8) {
+        return nullptr;
+    }
+    array<u8, 32> nib_mask;
+    array<u8, 16> bucket_select_mask_16;
+    copy(lo_mask.begin(), lo_mask.begin() + 16, nib_mask.begin());
+    copy(hi_mask.begin(), hi_mask.begin() + 16, nib_mask.begin() + 16);
+    copy(bucket_select_mask.begin(), bucket_select_mask.begin() + 16,
+         bucket_select_mask_16.begin());
+    return make_unique<RoseInstrCheckShufti16x8>
+           (nib_mask, bucket_select_mask_16,
+            neg_mask & 0xffff, base_offset, end_inst);
+}
+
+static
+unique_ptr<RoseInstruction>
+makeCheckShufti32x8(u32 offset_range, u8 bucket_idx,
+                    const array<u8, 32> &hi_mask, const array<u8, 32> &lo_mask,
+                    const array<u8, 32> &bucket_select_mask,
+                    u32 neg_mask, s32 base_offset,
+                    const RoseInstruction *end_inst) {
+    if (offset_range > 32 || bucket_idx > 8) {
+        return nullptr;
+    }
+
+    array<u8, 16> hi_mask_16;
+    array<u8, 16> lo_mask_16;
+    copy(hi_mask.begin(), hi_mask.begin() + 16, hi_mask_16.begin());
+    copy(lo_mask.begin(), lo_mask.begin() + 16, lo_mask_16.begin());
+    return make_unique<RoseInstrCheckShufti32x8>
+           (hi_mask_16, lo_mask_16, bucket_select_mask,
+            neg_mask, base_offset, end_inst);
+}
+
+static
+unique_ptr<RoseInstruction>
+makeCheckShufti16x16(u32 offset_range, u8 bucket_idx,
+                     const array<u8, 32> &hi_mask, const array<u8, 32> &lo_mask,
+                     const array<u8, 32> &bucket_select_mask_lo,
+                     const array<u8, 32> &bucket_select_mask_hi,
+                     u32 neg_mask, s32 base_offset,
+                     const RoseInstruction *end_inst) {
+    if (offset_range > 16 || bucket_idx > 16) {
+        return nullptr;
+    }
+
+    array<u8, 32> bucket_select_mask_32;
+    copy(bucket_select_mask_lo.begin(), bucket_select_mask_lo.begin() + 16,
+         bucket_select_mask_32.begin());
+    copy(bucket_select_mask_hi.begin(), bucket_select_mask_hi.begin() + 16,
+         bucket_select_mask_32.begin() + 16);
+    return make_unique<RoseInstrCheckShufti16x16>
+           (hi_mask, lo_mask, bucket_select_mask_32,
+            neg_mask & 0xffff, base_offset, end_inst);
+}
+static
+unique_ptr<RoseInstruction>
+makeCheckShufti32x16(u32 offset_range, u8 bucket_idx,
+                     const array<u8, 32> &hi_mask, const array<u8, 32> &lo_mask,
+                     const array<u8, 32> &bucket_select_mask_lo,
+                     const array<u8, 32> &bucket_select_mask_hi,
+                     u32 neg_mask, s32 base_offset,
+                     const RoseInstruction *end_inst) {
+    if (offset_range > 32 || bucket_idx > 16) {
+        return nullptr;
+    }
+
+    return make_unique<RoseInstrCheckShufti32x16>
+           (hi_mask, lo_mask, bucket_select_mask_hi,
+            bucket_select_mask_lo, neg_mask, base_offset, end_inst);
+}
+
+static
+bool makeRoleShufti(const vector<LookEntry> &look,
+                    RoseProgram &program) {
+
+    s32 base_offset = verify_s32(look.front().offset);
+    if (look.back().offset >= base_offset + 32) {
+        return false;
+    }
+
+    u8 bucket_idx = 0; // number of buckets
+    u64a neg_mask_64;
+    array<u8, 32> hi_mask;
+    array<u8, 32> lo_mask;
+    array<u8, 32> bucket_select_hi;
+    array<u8, 32> bucket_select_lo;
+    hi_mask.fill(0);
+    lo_mask.fill(0);
+    bucket_select_hi.fill(0); // will not be used in 16x8 and 32x8.
+    bucket_select_lo.fill(0);
+
+    if (!getShuftiMasks(look, hi_mask, lo_mask, bucket_select_hi.data(),
+                        bucket_select_lo.data(), neg_mask_64, bucket_idx, 32)) {
+        return false;
+    }
+    u32 neg_mask = (u32)neg_mask_64;
 
     DEBUG_PRINTF("hi_mask %s\n",
                  convertMaskstoString(hi_mask.data(), 32).c_str());
@@ -3093,48 +3250,29 @@ bool makeRoleShufti(const vector<LookEntry> &look,
                  convertMaskstoString(bucket_select_lo.data(), 32).c_str());
 
     const auto *end_inst = program.end_instruction();
-    if (bit_index < 8) {
-        if (look.back().offset < base_offset + 16) {
-            neg_mask &= 0xffff;
-            array<u8, 32> nib_mask;
-            array<u8, 16> bucket_select_mask_16;
-            copy(lo_mask.begin(), lo_mask.begin() + 16, nib_mask.begin());
-            copy(hi_mask.begin(), hi_mask.begin() + 16, nib_mask.begin() + 16);
-            copy(bucket_select_lo.begin(), bucket_select_lo.begin() + 16,
-                 bucket_select_mask_16.begin());
-            auto ri = make_unique<RoseInstrCheckShufti16x8>
-                      (nib_mask, bucket_select_mask_16,
-                       neg_mask, base_offset, end_inst);
-            program.add_before_end(move(ri));
-        } else {
-            array<u8, 16> hi_mask_16;
-            array<u8, 16> lo_mask_16;
-            copy(hi_mask.begin(), hi_mask.begin() + 16, hi_mask_16.begin());
-            copy(lo_mask.begin(), lo_mask.begin() + 16, lo_mask_16.begin());
-            auto ri = make_unique<RoseInstrCheckShufti32x8>
-                      (hi_mask_16, lo_mask_16, bucket_select_lo,
-                       neg_mask, base_offset, end_inst);
-            program.add_before_end(move(ri));
-        }
-    } else {
-        if (look.back().offset < base_offset + 16) {
-            neg_mask &= 0xffff;
-            array<u8, 32> bucket_select_mask_32;
-            copy(bucket_select_lo.begin(), bucket_select_lo.begin() + 16,
-                 bucket_select_mask_32.begin());
-            copy(bucket_select_hi.begin(), bucket_select_hi.begin() + 16,
-                 bucket_select_mask_32.begin() + 16);
-            auto ri = make_unique<RoseInstrCheckShufti16x16>
-                      (hi_mask, lo_mask, bucket_select_mask_32,
-                       neg_mask, base_offset, end_inst);
-            program.add_before_end(move(ri));
-        } else {
-            auto ri = make_unique<RoseInstrCheckShufti32x16>
-                      (hi_mask, lo_mask, bucket_select_hi, bucket_select_lo,
-                       neg_mask, base_offset, end_inst);
-            program.add_before_end(move(ri));
-        }
-    }
+    s32 offset_range = look.back().offset - base_offset + 1;
+
+    auto ri = makeCheckShufti16x8(offset_range, bucket_idx, hi_mask, lo_mask,
+                                  bucket_select_lo, neg_mask, base_offset,
+                                  end_inst);
+    if (!ri) {
+        ri = makeCheckShufti32x8(offset_range, bucket_idx, hi_mask, lo_mask,
+                                 bucket_select_lo, neg_mask, base_offset,
+                                 end_inst);
+    }
+    if (!ri) {
+        ri = makeCheckShufti16x16(offset_range, bucket_idx, hi_mask, lo_mask,
+                                  bucket_select_lo, bucket_select_hi,
+                                  neg_mask, base_offset, end_inst);
+    }
+    if (!ri) {
+        ri = makeCheckShufti32x16(offset_range, bucket_idx, hi_mask, lo_mask,
+                                  bucket_select_lo, bucket_select_hi,
+                                  neg_mask, base_offset, end_inst);
+    }
+    assert(ri);
+    program.add_before_end(move(ri));
+
     return true;
 }
 
@@ -3153,9 +3291,13 @@ void makeLookaroundInstruction(build_context &bc, const vector<LookEntry> &look,
 
     if (look.size() == 1) {
         s8 offset = look.begin()->offset;
-        u32 look_idx = addLookaround(bc, look);
-        auto ri = make_unique<RoseInstrCheckSingleLookaround>(offset, look_idx,
-                                                    program.end_instruction());
+        u32 look_idx, reach_idx;
+        vector<vector<LookEntry>> lookaround;
+        lookaround.emplace_back(look);
+        addLookaround(bc, lookaround, look_idx, reach_idx);
+        // We don't need look_idx here.
+        auto ri = make_unique<RoseInstrCheckSingleLookaround>(offset, reach_idx,
+                                                     program.end_instruction());
         program.add_before_end(move(ri));
         return;
     }
@@ -3172,10 +3314,242 @@ void makeLookaroundInstruction(build_context &bc, const vector<LookEntry> &look,
         return;
     }
 
-    u32 look_idx = addLookaround(bc, look);
+    u32 look_idx, reach_idx;
+    vector<vector<LookEntry>> lookaround;
+    lookaround.emplace_back(look);
+    addLookaround(bc, lookaround, look_idx, reach_idx);
     u32 look_count = verify_u32(look.size());
 
-    auto ri = make_unique<RoseInstrCheckLookaround>(look_idx, look_count,
+    auto ri = make_unique<RoseInstrCheckLookaround>(look_idx, reach_idx,
+                                                    look_count,
+                                                    program.end_instruction());
+    program.add_before_end(move(ri));
+}
+
+#if defined(DEBUG) || defined(DUMP_SUPPORT)
+static UNUSED
+string dumpMultiLook(const vector<LookEntry> &looks) {
+    ostringstream oss;
+    for (auto it = looks.begin(); it != looks.end(); ++it) {
+        if (it != looks.begin()) {
+            oss << ", ";
+        }
+        oss << "{" << int(it->offset) << ": " << describeClass(it->reach) << "}";
+    }
+    return oss.str();
+}
+#endif
+
+static
+bool makeRoleMultipathShufti(const vector<vector<LookEntry>> &multi_look,
+                             RoseProgram &program) {
+    if (multi_look.empty()) {
+        return false;
+    }
+
+    // find the base offset
+    assert(!multi_look[0].empty());
+    s32 base_offset = multi_look[0].front().offset;
+    s32 last_start = base_offset;
+    s32 end_offset = multi_look[0].back().offset;
+    size_t multi_len = 0;
+
+    for (const auto &look : multi_look) {
+        assert(look.size() > 0);
+        multi_len += look.size();
+
+        LIMIT_TO_AT_MOST(&base_offset, look.front().offset);
+        ENSURE_AT_LEAST(&last_start, look.front().offset);
+        ENSURE_AT_LEAST(&end_offset, look.back().offset);
+    }
+
+    assert(last_start < 0);
+
+    if (end_offset - base_offset >= MULTIPATH_MAX_LEN) {
+        return false;
+    }
+
+    if (multi_len <= 16) {
+        multi_len = 16;
+    } else if (multi_len <= 32) {
+        multi_len = 32;
+    } else if (multi_len <= 64) {
+        multi_len = 64;
+    } else {
+        DEBUG_PRINTF("too long for multi-path\n");
+        return false;
+    }
+
+    vector<LookEntry> linear_look;
+    array<u8, 64> data_select_mask;
+    data_select_mask.fill(0);
+    u64a hi_bits_mask = 0;
+    u64a lo_bits_mask = 0;
+
+    for (const auto &look : multi_look) {
+        assert(linear_look.size() < 64);
+        lo_bits_mask |= 1LLU << linear_look.size();
+        for (const auto &entry : look) {
+            assert(entry.offset - base_offset < MULTIPATH_MAX_LEN);
+            data_select_mask[linear_look.size()] =
+                                          verify_u8(entry.offset - base_offset);
+            linear_look.emplace_back(verify_s8(linear_look.size()), entry.reach);
+        }
+        hi_bits_mask |= 1LLU << (linear_look.size() - 1);
+    }
+
+    u8 bit_index = 0; // number of buckets
+    u64a neg_mask;
+    array<u8, 32> hi_mask;
+    array<u8, 32> lo_mask;
+    array<u8, 64> bucket_select_hi;
+    array<u8, 64> bucket_select_lo;
+    hi_mask.fill(0);
+    lo_mask.fill(0);
+    bucket_select_hi.fill(0);
+    bucket_select_lo.fill(0);
+
+    if (!getShuftiMasks(linear_look, hi_mask, lo_mask, bucket_select_hi.data(),
+                        bucket_select_lo.data(), neg_mask, bit_index,
+                        multi_len)) {
+        return false;
+    }
+
+    DEBUG_PRINTF("hi_mask %s\n",
+                 convertMaskstoString(hi_mask.data(), 16).c_str());
+    DEBUG_PRINTF("lo_mask %s\n",
+                 convertMaskstoString(lo_mask.data(), 16).c_str());
+    DEBUG_PRINTF("bucket_select_hi %s\n",
+                 convertMaskstoString(bucket_select_hi.data(), 64).c_str());
+    DEBUG_PRINTF("bucket_select_lo %s\n",
+                 convertMaskstoString(bucket_select_lo.data(), 64).c_str());
+    DEBUG_PRINTF("data_select_mask %s\n",
+                 convertMaskstoString(data_select_mask.data(), 64).c_str());
+    DEBUG_PRINTF("hi_bits_mask %llx\n", hi_bits_mask);
+    DEBUG_PRINTF("lo_bits_mask %llx\n", lo_bits_mask);
+    DEBUG_PRINTF("neg_mask %llx\n", neg_mask);
+    DEBUG_PRINTF("base_offset %d\n", base_offset);
+    DEBUG_PRINTF("last_start %d\n", last_start);
+
+    // Since we don't have 16x16 now, just call 32x16 instead.
+    if (bit_index > 8) {
+        assert(multi_len <= 32);
+        multi_len = 32;
+    }
+
+    const auto *end_inst = program.end_instruction();
+    assert(multi_len == 16 || multi_len == 32 || multi_len == 64);
+    if (multi_len == 16) {
+        neg_mask &= 0xffff;
+        assert(!(hi_bits_mask & ~0xffffULL));
+        assert(!(lo_bits_mask & ~0xffffULL));
+        assert(bit_index <=8);
+        array<u8, 32> nib_mask;
+        copy(begin(lo_mask), begin(lo_mask) + 16, nib_mask.begin());
+        copy(begin(hi_mask), begin(hi_mask) + 16, nib_mask.begin() + 16);
+
+        auto ri = make_unique<RoseInstrCheckMultipathShufti16x8>
+                  (nib_mask, bucket_select_lo, data_select_mask, hi_bits_mask,
+                   lo_bits_mask, neg_mask, base_offset, last_start, end_inst);
+        program.add_before_end(move(ri));
+    } else if (multi_len == 32) {
+        neg_mask &= 0xffffffff;
+        assert(!(hi_bits_mask & ~0xffffffffULL));
+        assert(!(lo_bits_mask & ~0xffffffffULL));
+        if (bit_index <= 8) {
+            auto ri = make_unique<RoseInstrCheckMultipathShufti32x8>
+                      (hi_mask, lo_mask, bucket_select_lo, data_select_mask,
+                       hi_bits_mask, lo_bits_mask, neg_mask, base_offset,
+                       last_start, end_inst);
+            program.add_before_end(move(ri));
+        } else {
+            auto ri = make_unique<RoseInstrCheckMultipathShufti32x16>
+                      (hi_mask, lo_mask, bucket_select_hi, bucket_select_lo,
+                       data_select_mask, hi_bits_mask, lo_bits_mask, neg_mask,
+                       base_offset, last_start, end_inst);
+            program.add_before_end(move(ri));
+        }
+    } else {
+        auto ri = make_unique<RoseInstrCheckMultipathShufti64>
+                  (hi_mask, lo_mask, bucket_select_lo, data_select_mask,
+                   hi_bits_mask, lo_bits_mask, neg_mask, base_offset,
+                   last_start, end_inst);
+        program.add_before_end(move(ri));
+    }
+    return true;
+}
+
+static
+void makeRoleMultipathLookaround(build_context &bc,
+                                 const vector<vector<LookEntry>> &multi_look,
+                                 RoseProgram &program) {
+    assert(!multi_look.empty());
+    assert(multi_look.size() <= MAX_LOOKAROUND_PATHS);
+    vector<vector<LookEntry>> ordered_look;
+    set<s32> look_offset;
+
+    assert(!multi_look[0].empty());
+    s32 last_start = multi_look[0][0].offset;
+
+    // build offset table.
+    for (const auto &look : multi_look) {
+        assert(look.size() > 0);
+        last_start = max(last_start, (s32)look.begin()->offset);
+
+        for (const auto &t : look) {
+            look_offset.insert(t.offset);
+        }
+    }
+
+    array<u8, MULTIPATH_MAX_LEN> start_mask;
+    if (multi_look.size() < MAX_LOOKAROUND_PATHS) {
+        start_mask.fill((1 << multi_look.size()) - 1);
+    } else {
+        start_mask.fill(0xff);
+    }
+
+    u32 path_idx = 0;
+    for (const auto &look : multi_look) {
+        for (const auto &t : look) {
+            assert(t.offset >= (int)*look_offset.begin());
+            size_t update_offset = t.offset - *look_offset.begin() + 1;
+            if (update_offset < start_mask.size()) {
+                start_mask[update_offset] &= ~(1 << path_idx);
+            }
+        }
+        path_idx++;
+    }
+
+    for (u32 i = 1; i < MULTIPATH_MAX_LEN; i++) {
+        start_mask[i] &= start_mask[i - 1];
+        DEBUG_PRINTF("start_mask[%u] = %x\n", i, start_mask[i]);
+    }
+
+    assert(look_offset.size() <= MULTIPATH_MAX_LEN);
+
+    assert(last_start < 0);
+
+    for (const auto &offset : look_offset) {
+        vector<LookEntry> multi_entry;
+        multi_entry.resize(MAX_LOOKAROUND_PATHS);
+
+        for (size_t i = 0; i < multi_look.size(); i++) {
+            for (const auto &t : multi_look[i]) {
+                if (t.offset == offset) {
+                    multi_entry[i] = t;
+                }
+            }
+        }
+        ordered_look.emplace_back(multi_entry);
+    }
+
+    u32 look_idx, reach_idx;
+    addLookaround(bc, ordered_look, look_idx, reach_idx);
+    u32 look_count = verify_u32(ordered_look.size());
+
+    auto ri = make_unique<RoseInstrMultipathLookaround>(look_idx, reach_idx,
+                                                        look_count, last_start,
+                                                        start_mask,
                                                     program.end_instruction());
     program.add_before_end(move(ri));
 }
@@ -3187,25 +3561,34 @@ void makeRoleLookaround(const RoseBuildImpl &build, build_context &bc,
         return;
     }
 
-    vector<LookEntry> look;
+    vector<vector<LookEntry>> looks;
 
     // Lookaround from leftfix (mandatory).
     if (contains(bc.leftfix_info, v) && bc.leftfix_info.at(v).has_lookaround) {
         DEBUG_PRINTF("using leftfix lookaround\n");
-        look = bc.leftfix_info.at(v).lookaround;
+        looks = bc.leftfix_info.at(v).lookaround;
     }
 
     // We may be able to find more lookaround info (advisory) and merge it
     // in.
-    vector<LookEntry> look_more;
-    findLookaroundMasks(build, v, look_more);
-    mergeLookaround(look, look_more);
-
-    if (look.empty()) {
+    if (looks.size() <= 1) {
+        vector<LookEntry> look;
+        vector<LookEntry> look_more;
+        if (!looks.empty()) {
+            look = move(looks.front());
+        }
+        findLookaroundMasks(build, v, look_more);
+        mergeLookaround(look, look_more);
+        if (!look.empty()) {
+            makeLookaroundInstruction(bc, look, program);
+        }
         return;
     }
 
-    makeLookaroundInstruction(bc, look, program);
+    if (!makeRoleMultipathShufti(looks, program)) {
+        assert(looks.size() <= 8);
+        makeRoleMultipathLookaround(bc, looks, program);
+    }
 }
 
 static
index 0e53d59da097d88eba73bca138686f2e4926582a..30dccb1a393789672ecbf6d7dbf41135ea29b9ca 100644 (file)
@@ -569,10 +569,20 @@ static
 CharReach bitvectorToReach(const u8 *reach) {
     CharReach cr;
 
-    for (size_t i = 0; i < 256; i++) {
+    for (size_t i = 0; i < N_CHARS; i++) {
         if (reach[i / 8] & (1U << (i % 8))) {
             cr.set(i);
+        }
+    }
+    return cr;
+}
 
+static
+CharReach multiBitvectorToReach(const u8 *reach, u8 path_mask) {
+    CharReach cr;
+    for (size_t i = 0; i < N_CHARS; i++) {
+        if (reach[i] & path_mask) {
+            cr.set(i);
         }
     }
     return cr;
@@ -587,9 +597,9 @@ void dumpLookaround(ofstream &os, const RoseEngine *t,
     const s8 *look_base = (const s8 *)(base + t->lookaroundTableOffset);
     const u8 *reach_base = base + t->lookaroundReachOffset;
 
-    const s8 *look = look_base + ri->index;
+    const s8 *look = look_base + ri->look_index;
     const s8 *look_end = look + ri->count;
-    const u8 *reach = reach_base + ri->index * REACH_BITVECTOR_LEN;
+    const u8 *reach = reach_base + ri->reach_index;
 
     os << "    contents:" << endl;
 
@@ -601,6 +611,41 @@ void dumpLookaround(ofstream &os, const RoseEngine *t,
     }
 }
 
+static
+void dumpMultipathLookaround(ofstream &os, const RoseEngine *t,
+                             const ROSE_STRUCT_MULTIPATH_LOOKAROUND *ri) {
+    assert(ri);
+
+    const u8 *base = (const u8 *)t;
+    const s8 *look_base = (const s8 *)(base + t->lookaroundTableOffset);
+    const u8 *reach_base = base + t->lookaroundReachOffset;
+
+    const s8 *look_begin = look_base + ri->look_index;
+    const s8 *look_end = look_begin + ri->count;
+    const u8 *reach_begin = reach_base + ri->reach_index;
+
+    os << "    contents:" << endl;
+
+    u32 path_mask = ri->start_mask[0];
+    while (path_mask) {
+        u32 path = findAndClearLSB_32(&path_mask);
+        os << "    Path #" << path << ":" << endl;
+        os << "      ";
+
+        const s8 *look = look_begin;
+        const u8 *reach = reach_begin;
+        for (; look < look_end; look++, reach += MULTI_REACH_BITVECTOR_LEN) {
+            CharReach cr = multiBitvectorToReach(reach, 1U << path);
+            if (cr.any() && !cr.all()) {
+                os << "<" << int(*look) << ": ";
+                describeClass(os, cr, 1000, CC_OUT_TEXT);
+                os << "> ";
+            }
+        }
+        os << endl;
+    }
+}
+
 static
 vector<u32> sparseIterValues(const mmbit_sparse_iter *it, u32 num_bits) {
     vector<u32> keys;
@@ -666,7 +711,126 @@ string dumpStrMask(const u8 *mask, size_t len) {
     return oss.str();
 }
 
-#define PROGRAM_CASE(name)                                                     \
+static
+CharReach shufti2cr(const u8 *lo, const u8 *hi, u8 bucket_mask) {
+    CharReach cr;
+    for (u32 i = 0; i < N_CHARS; i++) {
+        if(lo[i & 0xf] & hi[i >> 4] & bucket_mask) {
+            cr.set(i);
+        }
+    }
+    return cr;
+}
+
+static
+void dumpLookaroundShufti(ofstream &os, u32 len, const u8 *lo, const u8 *hi,
+                          const u8 *bucket_mask, u32 neg_mask, s32 offset) {
+    assert(len == 16 || len == 32);
+    os << "    contents:" << endl;
+    for (u32 idx = 0; idx < len; idx++) {
+        CharReach cr = shufti2cr(lo, hi, bucket_mask[idx]);
+
+        if (neg_mask & (1U << idx)) {
+            cr.flip();
+        }
+
+        if (cr.any() && !cr.all()) {
+            os << "      " << std::setw(4) << std::setfill(' ')
+               << int(offset + idx) << ": ";
+            describeClass(os, cr, 1000, CC_OUT_TEXT);
+            os << endl;
+        }
+    }
+}
+
+static
+void dumpLookaroundShufti(ofstream &os, u32 len, const u8 *lo, const u8 *hi,
+                          const u8 *lo_2, const u8 *hi_2, const u8 *bucket_mask,
+                          const u8 *bucket_mask_2, u32 neg_mask, s32 offset) {
+    assert(len == 16 || len == 32);
+    os << "    contents:" << endl;
+    for (u32 idx = 0; idx < len; idx++) {
+        CharReach cr = shufti2cr(lo, hi, bucket_mask[idx]);
+        cr |= shufti2cr(lo_2, hi_2, bucket_mask_2[idx]);
+
+        if (neg_mask & (1U << idx)) {
+            cr.flip();
+        }
+
+        if (cr.any() && !cr.all()) {
+            os << "      " << std::setw(4) << std::setfill(' ')
+               << int(offset + idx) << ": ";
+            describeClass(os, cr, 1000, CC_OUT_TEXT);
+            os << endl;
+        }
+    }
+}
+
+static
+void dumpMultipathShufti(ofstream &os, u32 len, const u8 *lo, const u8 *hi,
+                         const u8 *bucket_mask, const u8 *data_offset,
+                         u64a neg_mask, s32 base_offset) {
+    assert(len == 16 || len == 32 || len == 64);
+    os << "    contents:" << endl;
+    u32 path = 0;
+    for (u32 idx = 0; idx < len; idx++) {
+        CharReach cr = shufti2cr(lo, hi, bucket_mask[idx]);
+
+        if (neg_mask & (1ULL << idx)) {
+            cr.flip();
+        }
+
+        if (cr.any() && !cr.all()) {
+            if (idx == 0 || data_offset[idx - 1] > data_offset[idx]) {
+                path++;
+                if (idx) {
+                    os << endl;
+                }
+                os << "    Path #" << path << ":" << endl;
+                os << "      ";
+            }
+
+            os << "<" << int(base_offset + data_offset[idx]) << ": ";
+            describeClass(os, cr, 1000, CC_OUT_TEXT);
+            os << "> ";
+        }
+    }
+    os << endl;
+}
+
+static
+void dumpMultipathShufti(ofstream &os, u32 len, const u8 *lo, const u8 *hi,
+                         const u8 *lo_2, const u8 *hi_2, const u8 *bucket_mask,
+                         const u8 *bucket_mask_2, const u8 *data_offset,
+                         u32 neg_mask, s32 base_offset) {
+    assert(len == 16 || len == 32 || len == 64);
+    os << "    contents:";
+    u32 path = 0;
+    for (u32 idx = 0; idx < len; idx++) {
+        CharReach cr = shufti2cr(lo, hi, bucket_mask[idx]);
+        cr |= shufti2cr(lo_2, hi_2, bucket_mask_2[idx]);
+
+        if (neg_mask & (1ULL << idx)) {
+            cr.flip();
+        }
+
+        if (cr.any() && !cr.all()) {
+            if (idx == 0 || data_offset[idx - 1] > data_offset[idx]) {
+                path++;
+                os << endl;
+                os << "    Path #" << path << ":" << endl;
+                os << "      ";
+            }
+
+            os << "<" << int(base_offset + data_offset[idx]) << ": ";
+            describeClass(os, cr, 1000, CC_OUT_TEXT);
+            os << "> ";
+        }
+    }
+    os << endl;
+}
+
+           #define PROGRAM_CASE(name)                                                     \
     case ROSE_INSTR_##name: {                                                  \
         os << "  " << std::setw(4) << std::setfill('0') << (pc - pc_base)      \
            << ": " #name " (" << (int)ROSE_INSTR_##name << ")" << endl;        \
@@ -741,7 +905,8 @@ void dumpProgram(ofstream &os, const RoseEngine *t, const char *pc) {
             PROGRAM_NEXT_INSTRUCTION
 
             PROGRAM_CASE(CHECK_LOOKAROUND) {
-                os << "    index " << ri->index << endl;
+                os << "    look_index " << ri->look_index << endl;
+                os << "    reach_index " << ri->reach_index << endl;
                 os << "    count " << ri->count << endl;
                 os << "    fail_jump " << offset + ri->fail_jump << endl;
                 dumpLookaround(os, t, ri);
@@ -795,8 +960,13 @@ void dumpProgram(ofstream &os, const RoseEngine *t, const char *pc) {
                    << dumpStrMask(ri->bucket_select_mask,
                                   sizeof(ri->bucket_select_mask))
                    << endl;
+                os << "    neg_mask 0x" << std::hex << std::setw(8)
+                   << std::setfill('0') << ri->neg_mask << std::dec << endl;
                 os << "    offset " << ri->offset << endl;
                 os << "    fail_jump " << offset + ri->fail_jump << endl;
+                dumpLookaroundShufti(os, 16, ri->nib_mask, ri->nib_mask + 16,
+                                     ri->bucket_select_mask, ri->neg_mask,
+                                     ri->offset);
             }
             PROGRAM_NEXT_INSTRUCTION
 
@@ -811,8 +981,13 @@ void dumpProgram(ofstream &os, const RoseEngine *t, const char *pc) {
                    << dumpStrMask(ri->bucket_select_mask,
                                   sizeof(ri->bucket_select_mask))
                    << endl;
+                os << "    neg_mask 0x" << std::hex << std::setw(8)
+                   << std::setfill('0') << ri->neg_mask << std::dec << endl;
                 os << "    offset " << ri->offset << endl;
                 os << "    fail_jump " << offset + ri->fail_jump << endl;
+                dumpLookaroundShufti(os, 32, ri->lo_mask, ri->hi_mask,
+                                     ri->bucket_select_mask, ri->neg_mask,
+                                     ri->offset);
             }
             PROGRAM_NEXT_INSTRUCTION
 
@@ -827,8 +1002,15 @@ void dumpProgram(ofstream &os, const RoseEngine *t, const char *pc) {
                    << dumpStrMask(ri->bucket_select_mask,
                                   sizeof(ri->bucket_select_mask))
                    << endl;
+                os << "    neg_mask 0x" << std::hex << std::setw(8)
+                   << std::setfill('0') << ri->neg_mask << std::dec << endl;
                 os << "    offset " << ri->offset << endl;
                 os << "    fail_jump " << offset + ri->fail_jump << endl;
+                dumpLookaroundShufti(os, 16, ri->lo_mask, ri->hi_mask,
+                                     ri->lo_mask + 16, ri->hi_mask + 16,
+                                     ri->bucket_select_mask,
+                                     ri->bucket_select_mask + 16,
+                                     ri->neg_mask, ri->offset);
             }
             PROGRAM_NEXT_INSTRUCTION
 
@@ -847,8 +1029,15 @@ void dumpProgram(ofstream &os, const RoseEngine *t, const char *pc) {
                    << dumpStrMask(ri->bucket_select_mask_lo,
                                   sizeof(ri->bucket_select_mask_lo))
                    << endl;
+                os << "    neg_mask 0x" << std::hex << std::setw(8)
+                   << std::setfill('0') << ri->neg_mask << std::dec << endl;
                 os << "    offset " << ri->offset << endl;
                 os << "    fail_jump " << offset + ri->fail_jump << endl;
+                dumpLookaroundShufti(os, 32, ri->lo_mask, ri->hi_mask,
+                                     ri->lo_mask + 16, ri->hi_mask + 16,
+                                     ri->bucket_select_mask_lo,
+                                     ri->bucket_select_mask_hi,
+                                     ri->neg_mask, ri->offset);
             }
             PROGRAM_NEXT_INSTRUCTION
 
@@ -1103,6 +1292,146 @@ void dumpProgram(ofstream &os, const RoseEngine *t, const char *pc) {
             PROGRAM_CASE(CLEAR_WORK_DONE) {}
             PROGRAM_NEXT_INSTRUCTION
 
+            PROGRAM_CASE(MULTIPATH_LOOKAROUND) {
+                os << "    look_index " << ri->look_index << endl;
+                os << "    reach_index " << ri->reach_index << endl;
+                os << "    count " << ri->count << endl;
+                os << "    last_start " << ri->last_start << endl;
+                os << "    start_mask "
+                   << dumpStrMask(ri->start_mask, sizeof(ri->start_mask))
+                   << endl;
+                os << "    fail_jump " << offset + ri->fail_jump << endl;
+                dumpMultipathLookaround(os, t, ri);
+            }
+            PROGRAM_NEXT_INSTRUCTION
+
+            PROGRAM_CASE(CHECK_MULTIPATH_SHUFTI_16x8) {
+                os << "    nib_mask "
+                   << dumpStrMask(ri->nib_mask, sizeof(ri->nib_mask))
+                   << endl;
+                os << "    bucket_select_mask "
+                   << dumpStrMask(ri->bucket_select_mask,
+                                  sizeof(ri->bucket_select_mask))
+                   << endl;
+                os << "    data_select_mask "
+                   << dumpStrMask(ri->data_select_mask,
+                                  sizeof(ri->data_select_mask))
+                   << endl;
+                os << "    hi_bits_mask 0x" << std::hex << std::setw(4)
+                   << std::setfill('0') << ri->hi_bits_mask << std::dec << endl;
+                os << "    lo_bits_mask 0x" << std::hex << std::setw(4)
+                   << std::setfill('0') << ri->lo_bits_mask << std::dec << endl;
+                os << "    neg_mask 0x" << std::hex << std::setw(4)
+                   << std::setfill('0') << ri->neg_mask << std::dec << endl;
+                os << "    base_offset " << ri->base_offset << endl;
+                os << "    last_start " << ri->last_start << endl;
+                os << "    fail_jump " << offset + ri->fail_jump << endl;
+                dumpMultipathShufti(os, 16, ri->nib_mask, ri->nib_mask + 16,
+                                    ri->bucket_select_mask,
+                                    ri->data_select_mask,
+                                    ri->neg_mask, ri->base_offset);
+            }
+            PROGRAM_NEXT_INSTRUCTION
+
+            PROGRAM_CASE(CHECK_MULTIPATH_SHUFTI_32x8) {
+                os << "    hi_mask "
+                   << dumpStrMask(ri->hi_mask, sizeof(ri->hi_mask))
+                   << endl;
+                os << "    lo_mask "
+                   << dumpStrMask(ri->lo_mask, sizeof(ri->lo_mask))
+                   << endl;
+                os << "    bucket_select_mask "
+                   << dumpStrMask(ri->bucket_select_mask,
+                                  sizeof(ri->bucket_select_mask))
+                   << endl;
+                os << "    data_select_mask "
+                   << dumpStrMask(ri->data_select_mask,
+                                  sizeof(ri->data_select_mask))
+                   << endl;
+                os << "    hi_bits_mask 0x" << std::hex << std::setw(8)
+                   << std::setfill('0') << ri->hi_bits_mask << std::dec << endl;
+                os << "    lo_bits_mask 0x" << std::hex << std::setw(8)
+                   << std::setfill('0') << ri->lo_bits_mask << std::dec << endl;
+                os << "    neg_mask 0x" << std::hex << std::setw(8)
+                   << std::setfill('0') << ri->neg_mask << std::dec << endl;
+                os << "    base_offset " << ri->base_offset << endl;
+                os << "    last_start " << ri->last_start << endl;
+                os << "    fail_jump " << offset + ri->fail_jump << endl;
+                dumpMultipathShufti(os, 32, ri->lo_mask, ri->hi_mask,
+                                    ri->bucket_select_mask,
+                                    ri->data_select_mask,
+                                    ri->neg_mask, ri->base_offset);
+            }
+            PROGRAM_NEXT_INSTRUCTION
+
+            PROGRAM_CASE(CHECK_MULTIPATH_SHUFTI_32x16) {
+                os << "    hi_mask "
+                   << dumpStrMask(ri->hi_mask, sizeof(ri->hi_mask))
+                   << endl;
+                os << "    lo_mask "
+                   << dumpStrMask(ri->lo_mask, sizeof(ri->lo_mask))
+                   << endl;
+                os << "    bucket_select_mask_hi "
+                   << dumpStrMask(ri->bucket_select_mask_hi,
+                                  sizeof(ri->bucket_select_mask_hi))
+                   << endl;
+                os << "    bucket_select_mask_lo "
+                   << dumpStrMask(ri->bucket_select_mask_lo,
+                                  sizeof(ri->bucket_select_mask_lo))
+                   << endl;
+                os << "    data_select_mask "
+                   << dumpStrMask(ri->data_select_mask,
+                                  sizeof(ri->data_select_mask))
+                   << endl;
+                os << "    hi_bits_mask 0x" << std::hex << std::setw(8)
+                   << std::setfill('0') << ri->hi_bits_mask << std::dec << endl;
+                os << "    lo_bits_mask 0x" << std::hex << std::setw(8)
+                   << std::setfill('0') << ri->lo_bits_mask << std::dec << endl;
+                os << "    neg_mask 0x" << std::hex << std::setw(8)
+                   << std::setfill('0') << ri->neg_mask << std::dec << endl;
+                os << "    base_offset " << ri->base_offset << endl;
+                os << "    last_start " << ri->last_start << endl;
+                os << "    fail_jump " << offset + ri->fail_jump << endl;
+                dumpMultipathShufti(os, 32, ri->lo_mask, ri->hi_mask,
+                                    ri->lo_mask + 16, ri->hi_mask + 16,
+                                    ri->bucket_select_mask_lo,
+                                    ri->bucket_select_mask_hi,
+                                    ri->data_select_mask,
+                                    ri->neg_mask, ri->base_offset);
+            }
+            PROGRAM_NEXT_INSTRUCTION
+
+            PROGRAM_CASE(CHECK_MULTIPATH_SHUFTI_64) {
+                os << "    hi_mask "
+                   << dumpStrMask(ri->hi_mask, sizeof(ri->hi_mask))
+                   << endl;
+                os << "    lo_mask "
+                   << dumpStrMask(ri->lo_mask, sizeof(ri->lo_mask))
+                   << endl;
+                os << "    bucket_select_mask "
+                   << dumpStrMask(ri->bucket_select_mask,
+                                  sizeof(ri->bucket_select_mask))
+                   << endl;
+                os << "    data_select_mask "
+                   << dumpStrMask(ri->data_select_mask,
+                                  sizeof(ri->data_select_mask))
+                   << endl;
+                os << "    hi_bits_mask 0x" << std::hex << std::setw(16)
+                   << std::setfill('0') << ri->hi_bits_mask << std::dec << endl;
+                os << "    lo_bits_mask 0x" << std::hex << std::setw(16)
+                   << std::setfill('0') << ri->lo_bits_mask << std::dec << endl;
+                os << "    neg_mask 0x" << std::hex << std::setw(16)
+                   << std::setfill('0') << ri->neg_mask << std::dec << endl;
+                os << "    base_offset " << ri->base_offset << endl;
+                os << "    last_start " << ri->last_start << endl;
+                os << "    fail_jump " << offset + ri->fail_jump << endl;
+                dumpMultipathShufti(os, 64, ri->lo_mask, ri->hi_mask,
+                                    ri->bucket_select_mask,
+                                    ri->data_select_mask,
+                                    ri->neg_mask, ri->base_offset);
+            }
+            PROGRAM_NEXT_INSTRUCTION
+
         default:
             os << "  UNKNOWN (code " << int{code} << ")" << endl;
             os << "  <stopping>" << endl;
index ae990f7f45a24672c72bc27ca9e9de6c242ab1f6..07ab7c59b39ad30b7c47f7982be89987124555bc 100644 (file)
@@ -45,6 +45,7 @@
 
 #include <cstdlib>
 #include <queue>
+#include <sstream>
 
 using namespace std;
 
@@ -62,6 +63,20 @@ static const u32 MAX_LOOKAROUND_ENTRIES = 16;
 /** \brief We would rather have lookarounds with smaller reach than this. */
 static const u32 LOOKAROUND_WIDE_REACH = 200;
 
+#if defined(DEBUG) || defined(DUMP_SUPPORT)
+static UNUSED
+string dump(const map<s32, CharReach> &look) {
+    ostringstream oss;
+    for (auto it = look.begin(), ite = look.end(); it != ite; ++it) {
+        if (it != look.begin()) {
+            oss << ", ";
+        }
+        oss << "{" << it->first << ": " << describeClass(it->second) << "}";
+    }
+    return oss.str();
+}
+#endif
+
 static
 void getForwardReach(const NGHolder &g, u32 top, map<s32, CharReach> &look) {
     ue2::flat_set<NFAVertex> curr, next;
@@ -298,21 +313,6 @@ void findBackwardReach(const RoseGraph &g, const RoseVertex v,
     // TODO: implement DFA variants if necessary.
 }
 
-#if defined(DEBUG) || defined(DUMP_SUPPORT)
-#include <sstream>
-static UNUSED
-string dump(const map<s32, CharReach> &look) {
-    ostringstream oss;
-    for (auto it = look.begin(), ite = look.end(); it != ite; ++it) {
-        if (it != look.begin()) {
-            oss << ", ";
-        }
-        oss << "{" << it->first << ": " << describeClass(it->second) << "}";
-    }
-    return oss.str();
-}
-#endif
-
 static
 void normalise(map<s32, CharReach> &look) {
     // We can erase entries where the reach is "all characters".
@@ -554,6 +554,76 @@ void trimLiterals(const RoseBuildImpl &build, const RoseVertex v,
     DEBUG_PRINTF("post-trim lookaround: %s\n", dump(look).c_str());
 }
 
+static
+void normaliseLeftfix(map<s32, CharReach> &look) {
+    // We can erase entries where the reach is "all characters", except for the
+    // very first one -- this might be required to establish a minimum bound on
+    // the literal's match offset.
+
+    // TODO: It would be cleaner to use a literal program instruction to check
+    // the minimum bound explicitly.
+
+    if (look.empty()) {
+        return;
+    }
+
+    const auto earliest = begin(look)->first;
+
+    vector<s32> dead;
+    for (const auto &m : look) {
+        if (m.second.all() && m.first != earliest) {
+            dead.push_back(m.first);
+        }
+    }
+    erase_all(&look, dead);
+}
+
+static
+bool trimMultipathLeftfix(const RoseBuildImpl &build, const RoseVertex v,
+                          vector<map<s32, CharReach>> &looks) {
+    size_t path_count = 0;
+    for (auto &look : looks) {
+        ++path_count;
+        DEBUG_PRINTF("Path #%ld\n", path_count);
+
+        assert(!look.empty());
+        trimLiterals(build, v, look);
+
+        if (look.empty()) {
+            return false;
+        }
+
+        // Could be optimized here, just keep the empty byte of the longest path
+        normaliseLeftfix(look);
+
+        if (look.size() > MAX_LOOKAROUND_ENTRIES) {
+            DEBUG_PRINTF("lookaround too big (%zu entries)\n", look.size());
+            return false;
+        }
+    }
+    return true;
+}
+
+static
+void transToLookaround(const vector<map<s32, CharReach>> &looks,
+                       vector<vector<LookEntry>> &lookarounds) {
+    for (const auto &look : looks) {
+        vector<LookEntry> lookaround;
+        DEBUG_PRINTF("lookaround: %s\n", dump(look).c_str());
+        lookaround.reserve(look.size());
+        for (const auto &m : look) {
+            if (m.first < -128 || m.first > 127) {
+                DEBUG_PRINTF("range too big\n");
+                lookarounds.clear();
+                return;
+            }
+            s8 offset = verify_s8(m.first);
+            lookaround.emplace_back(offset, m.second);
+        }
+        lookarounds.push_back(lookaround);
+    }
+}
+
 void findLookaroundMasks(const RoseBuildImpl &tbi, const RoseVertex v,
                          vector<LookEntry> &lookaround) {
     lookaround.clear();
@@ -592,115 +662,155 @@ void findLookaroundMasks(const RoseBuildImpl &tbi, const RoseVertex v,
 }
 
 static
-bool hasSingleFloatingStart(const NGHolder &g) {
-    NFAVertex initial = NGHolder::null_vertex();
-    for (auto v : adjacent_vertices_range(g.startDs, g)) {
-        if (v == g.startDs) {
-            continue;
-        }
-        if (initial != NGHolder::null_vertex()) {
-            DEBUG_PRINTF("more than one start\n");
-            return false;
-        }
-        initial = v;
-    }
-
-    if (initial == NGHolder::null_vertex()) {
-        DEBUG_PRINTF("no floating starts\n");
-        return false;
-    }
+bool checkShuftiBuckets(const vector<map<s32, CharReach>> &looks,
+                        u32 bucket_size) {
+    set<u32> bucket;
+    for (const auto &look : looks) {
+        for (const auto &l : look) {
+            CharReach cr = l.second;
+            if (cr.count() > 128) {
+                cr.flip();
+            }
+            map <u16, u16> lo2hi;
+
+            for (size_t i = cr.find_first(); i != CharReach::npos;) {
+                u8 it_hi = i >> 4;
+                u16 low_encode = 0;
+                while (i != CharReach::npos && (i >> 4) == it_hi) {
+                    low_encode |= 1 << (i &0xf);
+                    i = cr.find_next(i);
+                }
+                lo2hi[low_encode] |= 1 << it_hi;
+            }
 
-    // Anchored start must have no successors other than startDs and initial.
-    for (auto v : adjacent_vertices_range(g.start, g)) {
-        if (v != initial && v != g.startDs) {
-            DEBUG_PRINTF("anchored start\n");
-            return false;
+            for (const auto &it : lo2hi) {
+                u32 hi_lo = (it.second << 16) | it.first;
+                bucket.insert(hi_lo);
+            }
         }
     }
-
-    return true;
+    DEBUG_PRINTF("shufti has %lu bucket(s)\n", bucket.size());
+    return bucket.size() <= bucket_size;
 }
 
 static
-bool getTransientPrefixReach(const NGHolder &g, u32 lag,
-                             map<s32, CharReach> &look) {
-    if (in_degree(g.accept, g) != 1) {
-        DEBUG_PRINTF("more than one accept\n");
+bool getTransientPrefixReach(const NGHolder &g, ReportID report, u32 lag,
+                             vector<map<s32, CharReach>> &looks) {
+    if (!isAcyclic(g)) {
+        DEBUG_PRINTF("contains back-edge\n");
         return false;
     }
 
-    // Must be a floating chain wired to startDs.
-    if (!hasSingleFloatingStart(g)) {
-        DEBUG_PRINTF("not a single floating start\n");
+    // Must be floating chains wired to startDs.
+    if (!isFloating(g)) {
+        DEBUG_PRINTF("not a floating start\n");
         return false;
     }
 
-    NFAVertex v = *(inv_adjacent_vertices(g.accept, g).first);
-    u32 i = lag + 1;
-    while (v != g.startDs) {
-        DEBUG_PRINTF("i=%u, v=%zu\n", i, g[v].index);
-        if (is_special(v, g)) {
-            DEBUG_PRINTF("special\n");
-            return false;
+    vector<NFAVertex> curr;
+    for (auto v : inv_adjacent_vertices_range(g.accept, g)) {
+        if (v == g.start || v == g.startDs) {
+            DEBUG_PRINTF("empty graph\n");
+            return true;
+        }
+        if (contains(g[v].reports, report)) {
+            curr.push_back(v);
         }
+    }
 
-        look[0 - i] = g[v].char_reach;
+    assert(!curr.empty());
 
-        NFAVertex next = NGHolder::null_vertex();
-        for (auto u : inv_adjacent_vertices_range(v, g)) {
-            if (u == g.start) {
-                continue; // Benign, checked by hasSingleFloatingStart
-            }
-            if (next == NGHolder::null_vertex()) {
-                next = u;
-                continue;
-            }
-            DEBUG_PRINTF("branch\n");
-            return false;
-        }
+    u32 total_len = curr.size();
+
+    for (const auto &v : curr) {
+        looks.emplace_back(map<s32, CharReach>());
+        looks.back()[0 - (lag + 1)] = g[v].char_reach;
+    }
 
-        if (next == NGHolder::null_vertex() || next == v) {
-            DEBUG_PRINTF("no predecessor or only self-loop\n");
-            // This graph is malformed -- all vertices in a graph that makes it
-            // to this analysis should have predecessors.
-            assert(0);
+    bool curr_active = false;
+
+    /* For each offset -i, we backwardly trace the path by vertices in curr.
+     * Once there are more than 8 paths and more than 64 bits total_len,
+     * which means that neither MULTIPATH_LOOKAROUND nor MULTIPATH_SHUFTI
+     * could be successfully built, we will give up the path finding.
+     * Otherwise, the loop will halt when all vertices in curr are startDs.
+     */
+    for (u32 i = lag + 2; i < (lag + 2) + MAX_BACK_LEN; i++) {
+        curr_active = false;
+        size_t curr_size = curr.size();
+        if (curr.size() > 1 && i > lag + MULTIPATH_MAX_LEN) {
+            DEBUG_PRINTF("range is larger than 16 in multi-path\n");
             return false;
         }
 
-        v = next;
-        i++;
-    }
+        for (size_t idx = 0; idx < curr_size; idx++) {
+            NFAVertex v = curr[idx];
+            if (v == g.startDs) {
+                continue;
+            }
+            assert(!is_special(v, g));
 
-    DEBUG_PRINTF("done\n");
-    return true;
-}
+            for (auto u : inv_adjacent_vertices_range(v, g)) {
+                if (u == g.start || u == g.startDs) {
+                    curr[idx] = g.startDs;
+                    break;
+                }
+            }
 
-static
-void normaliseLeftfix(map<s32, CharReach> &look) {
-    // We can erase entries where the reach is "all characters", except for the
-    // very first one -- this might be required to establish a minimum bound on
-    // the literal's match offset.
+            if (is_special(curr[idx], g)) {
+                continue;
+            }
 
-    // TODO: It would be cleaner to use a literal program instruction to check
-    // the minimum bound explicitly.
+            for (auto u : inv_adjacent_vertices_range(v, g)) {
+                curr_active = true;
+                if (curr[idx] == v) {
+                    curr[idx] = u;
+                    looks[idx][0 - i] = g[u].char_reach;
+                    total_len++;
+                } else {
+                    curr.push_back(u);
+                    looks.push_back(looks[idx]);
+                    (looks.back())[0 - i] = g[u].char_reach;
+                    total_len += looks.back().size();
+                }
 
-    if (look.empty()) {
-        return;
+                if (curr.size() > MAX_LOOKAROUND_PATHS && total_len > 64) {
+                    DEBUG_PRINTF("too many branches\n");
+                    return false;
+                }
+            }
+        }
+        if (!curr_active) {
+            break;
+        }
     }
 
-    const auto earliest = begin(look)->first;
+    if (curr_active) {
+        DEBUG_PRINTF("single path too long\n");
+        return false;
+    }
 
-    vector<s32> dead;
-    for (const auto &m : look) {
-        if (m.second.all() && m.first != earliest) {
-            dead.push_back(m.first);
+    // More than 8 paths, check multi-path shufti.
+    if (curr.size() > MAX_LOOKAROUND_PATHS) {
+        u32 bucket_size = total_len > 32 ? 8 : 16;
+        if (!checkShuftiBuckets(looks, bucket_size)) {
+            DEBUG_PRINTF("shufti has too many buckets\n");
+            return false;
         }
     }
-    erase_all(&look, dead);
+
+    assert(!looks.empty());
+    if (looks.size() == 1) {
+        DEBUG_PRINTF("single lookaround\n");
+    } else {
+        DEBUG_PRINTF("multi-path lookaround\n");
+    }
+    DEBUG_PRINTF("done\n");
+    return true;
 }
 
 bool makeLeftfixLookaround(const RoseBuildImpl &build, const RoseVertex v,
-                           vector<LookEntry> &lookaround) {
+                           vector<vector<LookEntry>> &lookaround) {
     lookaround.clear();
 
     const RoseGraph &g = build.g;
@@ -716,36 +826,19 @@ bool makeLeftfixLookaround(const RoseBuildImpl &build, const RoseVertex v,
         return false;
     }
 
-    map<s32, CharReach> look;
-    if (!getTransientPrefixReach(*leftfix.graph(), g[v].left.lag, look)) {
-        DEBUG_PRINTF("not a chain\n");
-        return false;
-    }
-
-    trimLiterals(build, v, look);
-    normaliseLeftfix(look);
-
-    if (look.size() > MAX_LOOKAROUND_ENTRIES) {
-        DEBUG_PRINTF("lookaround too big (%zu entries)\n", look.size());
+    vector<map<s32, CharReach>> looks;
+    if (!getTransientPrefixReach(*leftfix.graph(), g[v].left.leftfix_report,
+                                 g[v].left.lag, looks)) {
+        DEBUG_PRINTF("graph has loop or too large\n");
         return false;
     }
 
-    if (look.empty()) {
-        DEBUG_PRINTF("lookaround empty; this is weird\n");
+    if (!trimMultipathLeftfix(build, v, looks)) {
         return false;
     }
+    transToLookaround(looks, lookaround);
 
-    lookaround.reserve(look.size());
-    for (const auto &m : look) {
-        if (m.first < -128 || m.first > 127) {
-            DEBUG_PRINTF("range too big\n");
-            return false;
-        }
-        s8 offset = verify_s8(m.first);
-        lookaround.emplace_back(offset, m.second);
-    }
-
-    return true;
+    return !lookaround.empty();
 }
 
 void mergeLookaround(vector<LookEntry> &lookaround,
index 993bd2291b48cec77bcdf64254a28e7858fac6cb..aea87ccf860c0f4b4e1d2c1e59ffec7346d83164 100644 (file)
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2015, Intel Corporation
+ * Copyright (c) 2015-2017, Intel Corporation
  *
  * Redistribution and use in source and binary forms, with or without
  * modification, are permitted provided that the following conditions are met:
@@ -36,6 +36,9 @@
 
 #include <vector>
 
+/** \brief Max path number for multi-path lookaround. */
+#define MAX_LOOKAROUND_PATHS 8
+
 namespace ue2 {
 
 class CharReach;
@@ -44,6 +47,7 @@ class RoseBuildImpl;
 /** \brief Lookaround entry prototype, describing the reachability at a given
  * distance from the end of a role match. */
 struct LookEntry {
+    LookEntry() : offset(0) {}
     LookEntry(s8 offset_in, const CharReach &reach_in)
         : offset(offset_in), reach(reach_in) {}
     s8 offset; //!< offset from role match location.
@@ -63,7 +67,7 @@ size_t hash_value(const LookEntry &l) {
 }
 
 void findLookaroundMasks(const RoseBuildImpl &tbi, const RoseVertex v,
-                         std::vector<LookEntry> &lookaround);
+                         std::vector<LookEntry> &look_more);
 
 /**
  * \brief If possible, render the prefix of the given vertex as a lookaround.
@@ -72,7 +76,7 @@ void findLookaroundMasks(const RoseBuildImpl &tbi, const RoseVertex v,
  * it can be satisfied with a lookaround alone.
  */
 bool makeLeftfixLookaround(const RoseBuildImpl &build, const RoseVertex v,
-                           std::vector<LookEntry> &lookaround);
+                           std::vector<std::vector<LookEntry>> &lookaround);
 
 void mergeLookaround(std::vector<LookEntry> &lookaround,
                      const std::vector<LookEntry> &more_lookaround);
index 1c0fd2ab8d94875813a55762c482c20fad482e39..9e030e8efd0db696c3cc8b083f2be0749e108218 100644 (file)
@@ -127,7 +127,8 @@ void RoseInstrCheckLookaround::write(void *dest, RoseEngineBlob &blob,
                                      const OffsetMap &offset_map) const {
     RoseInstrBase::write(dest, blob, offset_map);
     auto *inst = static_cast<impl_type *>(dest);
-    inst->index = index;
+    inst->look_index = look_index;
+    inst->reach_index = reach_index;
     inst->count = count;
     inst->fail_jump = calc_jump(offset_map, this, target);
 }
@@ -537,6 +538,93 @@ void RoseInstrCheckMedLitNocase::write(void *dest, RoseEngineBlob &blob,
     inst->fail_jump = calc_jump(offset_map, this, target);
 }
 
+void RoseInstrMultipathLookaround::write(void *dest, RoseEngineBlob &blob,
+                                         const OffsetMap &offset_map) const {
+    RoseInstrBase::write(dest, blob, offset_map);
+    auto *inst = static_cast<impl_type *>(dest);
+    inst->look_index = look_index;
+    inst->reach_index = reach_index;
+    inst->count = count;
+    inst->last_start = last_start;
+    copy(begin(start_mask), end(start_mask), inst->start_mask);
+    inst->fail_jump = calc_jump(offset_map, this, target);
+}
+
+void RoseInstrCheckMultipathShufti16x8::write(void *dest, RoseEngineBlob &blob,
+                                              const OffsetMap &offset_map)
+     const {
+    RoseInstrBase::write(dest, blob, offset_map);
+    auto *inst = static_cast<impl_type *>(dest);
+    copy(begin(nib_mask), end(nib_mask), inst->nib_mask);
+    copy(begin(bucket_select_mask), begin(bucket_select_mask) + 16,
+         inst->bucket_select_mask);
+    copy(begin(data_select_mask), begin(data_select_mask) + 16,
+         inst->data_select_mask);
+    inst->hi_bits_mask = hi_bits_mask;
+    inst->lo_bits_mask = lo_bits_mask;
+    inst->neg_mask = neg_mask;
+    inst->base_offset = base_offset;
+    inst->last_start = last_start;
+    inst->fail_jump = calc_jump(offset_map, this, target);
+}
+
+void RoseInstrCheckMultipathShufti32x8::write(void *dest, RoseEngineBlob &blob,
+                                              const OffsetMap &offset_map)
+     const {
+    RoseInstrBase::write(dest, blob, offset_map);
+    auto *inst = static_cast<impl_type *>(dest);
+    copy(begin(hi_mask), begin(hi_mask) + 16, inst->hi_mask);
+    copy(begin(lo_mask), begin(lo_mask) + 16, inst->lo_mask);
+    copy(begin(bucket_select_mask), begin(bucket_select_mask) + 32,
+         inst->bucket_select_mask);
+    copy(begin(data_select_mask), begin(data_select_mask) + 32,
+         inst->data_select_mask);
+    inst->hi_bits_mask = hi_bits_mask;
+    inst->lo_bits_mask = lo_bits_mask;
+    inst->neg_mask = neg_mask;
+    inst->base_offset = base_offset;
+    inst->last_start = last_start;
+    inst->fail_jump = calc_jump(offset_map, this, target);
+}
+
+void RoseInstrCheckMultipathShufti32x16::write(void *dest, RoseEngineBlob &blob,
+                                           const OffsetMap &offset_map) const {
+    RoseInstrBase::write(dest, blob, offset_map);
+    auto *inst = static_cast<impl_type *>(dest);
+    copy(begin(hi_mask), end(hi_mask), inst->hi_mask);
+    copy(begin(lo_mask), end(lo_mask), inst->lo_mask);
+    copy(begin(bucket_select_mask_hi), begin(bucket_select_mask_hi) + 32,
+         inst->bucket_select_mask_hi);
+    copy(begin(bucket_select_mask_lo), begin(bucket_select_mask_lo) + 32,
+         inst->bucket_select_mask_lo);
+    copy(begin(data_select_mask), begin(data_select_mask) + 32,
+         inst->data_select_mask);
+    inst->hi_bits_mask = hi_bits_mask;
+    inst->lo_bits_mask = lo_bits_mask;
+    inst->neg_mask = neg_mask;
+    inst->base_offset = base_offset;
+    inst->last_start = last_start;
+    inst->fail_jump = calc_jump(offset_map, this, target);
+}
+
+void RoseInstrCheckMultipathShufti64::write(void *dest, RoseEngineBlob &blob,
+                                            const OffsetMap &offset_map) const {
+    RoseInstrBase::write(dest, blob, offset_map);
+    auto *inst = static_cast<impl_type *>(dest);
+    copy(begin(hi_mask), begin(hi_mask) + 16, inst->hi_mask);
+    copy(begin(lo_mask), begin(lo_mask) + 16, inst->lo_mask);
+    copy(begin(bucket_select_mask), end(bucket_select_mask),
+         inst->bucket_select_mask);
+    copy(begin(data_select_mask), end(data_select_mask),
+         inst->data_select_mask);
+    inst->hi_bits_mask = hi_bits_mask;
+    inst->lo_bits_mask = lo_bits_mask;
+    inst->neg_mask = neg_mask;
+    inst->base_offset = base_offset;
+    inst->last_start = last_start;
+    inst->fail_jump = calc_jump(offset_map, this, target);
+}
+
 static
 OffsetMap makeOffsetMap(const RoseProgram &program, u32 *total_len) {
     OffsetMap offset_map;
index a63f03c8f715edaec1e7a3575148ab4d9adf700d..3931f095e0bb90b592b6a556be2b6e564a85830f 100644 (file)
@@ -420,20 +420,24 @@ class RoseInstrCheckLookaround
                                     ROSE_STRUCT_CHECK_LOOKAROUND,
                                     RoseInstrCheckLookaround> {
 public:
-    u32 index;
+    u32 look_index;
+    u32 reach_index;
     u32 count;
     const RoseInstruction *target;
 
-    RoseInstrCheckLookaround(u32 index_in, u32 count_in,
-                             const RoseInstruction *target_in)
-        : index(index_in), count(count_in), target(target_in) {}
+    RoseInstrCheckLookaround(u32 look_index_in, u32 reach_index_in,
+                             u32 count_in, const RoseInstruction *target_in)
+        : look_index(look_index_in), reach_index(reach_index_in),
+          count(count_in), target(target_in) {}
 
     bool operator==(const RoseInstrCheckLookaround &ri) const {
-        return index == ri.index && count == ri.count && target == ri.target;
+        return look_index == ri.look_index && reach_index == ri.reach_index &&
+               count == ri.count && target == ri.target;
     }
 
     size_t hash() const override {
-        return hash_all(static_cast<int>(opcode), index, count);
+        return hash_all(static_cast<int>(opcode), look_index, reach_index,
+                        count);
     }
 
     void write(void *dest, RoseEngineBlob &blob,
@@ -441,7 +445,8 @@ public:
 
     bool equiv_to(const RoseInstrCheckLookaround &ri, const OffsetMap &offsets,
                   const OffsetMap &other_offsets) const {
-        return index == ri.index && count == ri.count &&
+        return look_index == ri.look_index && reach_index == ri.reach_index &&
+               count == ri.count &&
                offsets.at(target) == other_offsets.at(ri.target);
     }
 };
@@ -498,7 +503,7 @@ public:
     RoseInstrCheckMask32(std::array<u8, 32> and_mask_in,
                          std::array<u8, 32> cmp_mask_in, u32 neg_mask_in,
                          s32 offset_in, const RoseInstruction *target_in)
-        : and_mask(move(and_mask_in)), cmp_mask(move(cmp_mask_in)),
+        : and_mask(std::move(and_mask_in)), cmp_mask(std::move(cmp_mask_in)),
           neg_mask(neg_mask_in), offset(offset_in), target(target_in) {}
 
     bool operator==(const RoseInstrCheckMask32 &ri) const {
@@ -576,8 +581,8 @@ public:
                              std::array<u8, 16> bucket_select_mask_in,
                              u32 neg_mask_in, s32 offset_in,
                              const RoseInstruction *target_in)
-        : nib_mask(move(nib_mask_in)),
-          bucket_select_mask(move(bucket_select_mask_in)),
+        : nib_mask(std::move(nib_mask_in)),
+          bucket_select_mask(std::move(bucket_select_mask_in)),
           neg_mask(neg_mask_in), offset(offset_in), target(target_in) {}
 
     bool operator==(const RoseInstrCheckShufti16x8 &ri) const {
@@ -621,8 +626,8 @@ public:
                              std::array<u8, 32> bucket_select_mask_in,
                              u32 neg_mask_in, s32 offset_in,
                              const RoseInstruction *target_in)
-        : hi_mask(move(hi_mask_in)), lo_mask(move(lo_mask_in)),
-          bucket_select_mask(move(bucket_select_mask_in)),
+        : hi_mask(std::move(hi_mask_in)), lo_mask(std::move(lo_mask_in)),
+          bucket_select_mask(std::move(bucket_select_mask_in)),
           neg_mask(neg_mask_in), offset(offset_in), target(target_in) {}
 
     bool operator==(const RoseInstrCheckShufti32x8 &ri) const {
@@ -666,8 +671,8 @@ public:
                               std::array<u8, 32> bucket_select_mask_in,
                               u32 neg_mask_in, s32 offset_in,
                               const RoseInstruction *target_in)
-        : hi_mask(move(hi_mask_in)), lo_mask(move(lo_mask_in)),
-          bucket_select_mask(move(bucket_select_mask_in)),
+        : hi_mask(std::move(hi_mask_in)), lo_mask(std::move(lo_mask_in)),
+          bucket_select_mask(std::move(bucket_select_mask_in)),
           neg_mask(neg_mask_in), offset(offset_in), target(target_in) {}
 
     bool operator==(const RoseInstrCheckShufti16x16 &ri) const {
@@ -713,9 +718,9 @@ public:
                               std::array<u8, 32> bucket_select_mask_lo_in,
                               u32 neg_mask_in, s32 offset_in,
                               const RoseInstruction *target_in)
-        : hi_mask(move(hi_mask_in)), lo_mask(move(lo_mask_in)),
-          bucket_select_mask_hi(move(bucket_select_mask_hi_in)),
-          bucket_select_mask_lo(move(bucket_select_mask_lo_in)),
+        : hi_mask(std::move(hi_mask_in)), lo_mask(std::move(lo_mask_in)),
+          bucket_select_mask_hi(std::move(bucket_select_mask_hi_in)),
+          bucket_select_mask_lo(std::move(bucket_select_mask_lo_in)),
           neg_mask(neg_mask_in), offset(offset_in), target(target_in) {}
 
     bool operator==(const RoseInstrCheckShufti32x16 &ri) const {
@@ -1859,6 +1864,306 @@ public:
     ~RoseInstrClearWorkDone() override;
 };
 
+class RoseInstrMultipathLookaround
+    : public RoseInstrBaseOneTarget<ROSE_INSTR_MULTIPATH_LOOKAROUND,
+                                    ROSE_STRUCT_MULTIPATH_LOOKAROUND,
+                                    RoseInstrMultipathLookaround> {
+public:
+    u32 look_index;
+    u32 reach_index;
+    u32 count;
+    s32 last_start;
+    std::array<u8, 16> start_mask;
+    const RoseInstruction *target;
+
+    RoseInstrMultipathLookaround(u32 look_index_in, u32 reach_index_in,
+                                 u32 count_in, s32 last_start_in,
+                                 std::array<u8, 16> start_mask_in,
+                                 const RoseInstruction *target_in)
+        : look_index(look_index_in), reach_index(reach_index_in),
+          count(count_in), last_start(last_start_in),
+          start_mask(std::move(start_mask_in)), target(target_in) {}
+
+    bool operator==(const RoseInstrMultipathLookaround &ri) const {
+        return look_index == ri.look_index && reach_index == ri.reach_index &&
+               count == ri.count && last_start == ri.last_start &&
+               start_mask == ri.start_mask && target == ri.target;
+    }
+
+    size_t hash() const override {
+        return hash_all(static_cast<int>(opcode), look_index, reach_index,
+                        count, last_start, start_mask);
+    }
+
+    void write(void *dest, RoseEngineBlob &blob,
+               const OffsetMap &offset_map) const override;
+
+    bool equiv_to(const RoseInstrMultipathLookaround &ri,
+                  const OffsetMap &offsets,
+                  const OffsetMap &other_offsets) const {
+        return look_index == ri.look_index && reach_index == ri.reach_index &&
+               count == ri.count && last_start == ri.last_start &&
+               start_mask == ri.start_mask &&
+               offsets.at(target) == other_offsets.at(ri.target);
+    }
+};
+
+class RoseInstrCheckMultipathShufti16x8
+    : public RoseInstrBaseOneTarget<ROSE_INSTR_CHECK_MULTIPATH_SHUFTI_16x8,
+                                    ROSE_STRUCT_CHECK_MULTIPATH_SHUFTI_16x8,
+                                    RoseInstrCheckMultipathShufti16x8> {
+public:
+    std::array<u8, 32> nib_mask;
+    std::array<u8, 64> bucket_select_mask;
+    std::array<u8, 64> data_select_mask;
+    u16 hi_bits_mask;
+    u16 lo_bits_mask;
+    u16 neg_mask;
+    s32 base_offset;
+    s32 last_start;
+    const RoseInstruction *target;
+
+    RoseInstrCheckMultipathShufti16x8(std::array<u8, 32> nib_mask_in,
+                                      std::array<u8, 64> bucket_select_mask_in,
+                                      std::array<u8, 64> data_select_mask_in,
+                                      u16 hi_bits_mask_in, u16 lo_bits_mask_in,
+                                      u16 neg_mask_in, s32 base_offset_in,
+                                      s32 last_start_in,
+                                      const RoseInstruction *target_in)
+        : nib_mask(std::move(nib_mask_in)),
+          bucket_select_mask(std::move(bucket_select_mask_in)),
+          data_select_mask(std::move(data_select_mask_in)),
+          hi_bits_mask(hi_bits_mask_in), lo_bits_mask(lo_bits_mask_in),
+          neg_mask(neg_mask_in), base_offset(base_offset_in),
+          last_start(last_start_in), target(target_in) {}
+
+    bool operator==(const RoseInstrCheckMultipathShufti16x8 &ri) const {
+        return nib_mask == ri.nib_mask &&
+               bucket_select_mask == ri.bucket_select_mask &&
+               data_select_mask == ri.data_select_mask &&
+               hi_bits_mask == ri.hi_bits_mask &&
+               lo_bits_mask == ri.lo_bits_mask &&
+               neg_mask == ri.neg_mask && base_offset == ri.base_offset &&
+               last_start == ri.last_start && target == ri.target;
+    }
+
+    size_t hash() const override {
+        return hash_all(static_cast<int>(opcode), nib_mask,
+                        bucket_select_mask, data_select_mask, hi_bits_mask,
+                        lo_bits_mask, neg_mask, base_offset, last_start);
+    }
+
+    void write(void *dest, RoseEngineBlob &blob,
+               const OffsetMap &offset_map) const override;
+
+    bool equiv_to(const RoseInstrCheckMultipathShufti16x8 &ri,
+                  const OffsetMap &offsets,
+                  const OffsetMap &other_offsets) const {
+        return nib_mask == ri.nib_mask &&
+               bucket_select_mask == ri.bucket_select_mask &&
+               data_select_mask == ri.data_select_mask &&
+               hi_bits_mask == ri.hi_bits_mask &&
+               lo_bits_mask == ri.lo_bits_mask && neg_mask == ri.neg_mask &&
+               base_offset == ri.base_offset && last_start == ri.last_start &&
+               offsets.at(target) == other_offsets.at(ri.target);
+    }
+};
+
+class RoseInstrCheckMultipathShufti32x8
+    : public RoseInstrBaseOneTarget<ROSE_INSTR_CHECK_MULTIPATH_SHUFTI_32x8,
+                                    ROSE_STRUCT_CHECK_MULTIPATH_SHUFTI_32x8,
+                                    RoseInstrCheckMultipathShufti32x8> {
+public:
+    std::array<u8, 32> hi_mask;
+    std::array<u8, 32> lo_mask;
+    std::array<u8, 64> bucket_select_mask;
+    std::array<u8, 64> data_select_mask;
+    u32 hi_bits_mask;
+    u32 lo_bits_mask;
+    u32 neg_mask;
+    s32 base_offset;
+    s32 last_start;
+    const RoseInstruction *target;
+
+    RoseInstrCheckMultipathShufti32x8(std::array<u8, 32> hi_mask_in,
+                                      std::array<u8, 32> lo_mask_in,
+                                      std::array<u8, 64> bucket_select_mask_in,
+                                      std::array<u8, 64> data_select_mask_in,
+                                      u32 hi_bits_mask_in, u32 lo_bits_mask_in,
+                                      u32 neg_mask_in, s32 base_offset_in,
+                                      s32 last_start_in,
+                                      const RoseInstruction *target_in)
+        : hi_mask(std::move(hi_mask_in)), lo_mask(std::move(lo_mask_in)),
+          bucket_select_mask(std::move(bucket_select_mask_in)),
+          data_select_mask(std::move(data_select_mask_in)),
+          hi_bits_mask(hi_bits_mask_in), lo_bits_mask(lo_bits_mask_in),
+          neg_mask(neg_mask_in), base_offset(base_offset_in),
+          last_start(last_start_in), target(target_in) {}
+
+    bool operator==(const RoseInstrCheckMultipathShufti32x8 &ri) const {
+        return hi_mask == ri.hi_mask && lo_mask == ri.lo_mask &&
+               bucket_select_mask == ri.bucket_select_mask &&
+               data_select_mask == ri.data_select_mask &&
+               hi_bits_mask == ri.hi_bits_mask &&
+               lo_bits_mask == ri.lo_bits_mask &&
+               neg_mask == ri.neg_mask && base_offset == ri.base_offset &&
+               last_start == ri.last_start && target == ri.target;
+    }
+
+    size_t hash() const override {
+        return hash_all(static_cast<int>(opcode), hi_mask, lo_mask,
+                        bucket_select_mask, data_select_mask, hi_bits_mask,
+                        lo_bits_mask, neg_mask, base_offset, last_start);
+    }
+
+    void write(void *dest, RoseEngineBlob &blob,
+               const OffsetMap &offset_map) const override;
+
+    bool equiv_to(const RoseInstrCheckMultipathShufti32x8 &ri,
+                  const OffsetMap &offsets,
+                  const OffsetMap &other_offsets) const {
+        return hi_mask == ri.hi_mask && lo_mask == ri.lo_mask &&
+               bucket_select_mask == ri.bucket_select_mask &&
+               data_select_mask == ri.data_select_mask &&
+               hi_bits_mask == ri.hi_bits_mask &&
+               lo_bits_mask == ri.lo_bits_mask && neg_mask == ri.neg_mask &&
+               base_offset == ri.base_offset && last_start == ri.last_start &&
+               offsets.at(target) == other_offsets.at(ri.target);
+    }
+};
+
+class RoseInstrCheckMultipathShufti32x16
+    : public RoseInstrBaseOneTarget<ROSE_INSTR_CHECK_MULTIPATH_SHUFTI_32x16,
+                                    ROSE_STRUCT_CHECK_MULTIPATH_SHUFTI_32x16,
+                                    RoseInstrCheckMultipathShufti32x16> {
+public:
+    std::array<u8, 32> hi_mask;
+    std::array<u8, 32> lo_mask;
+    std::array<u8, 64> bucket_select_mask_hi;
+    std::array<u8, 64> bucket_select_mask_lo;
+    std::array<u8, 64> data_select_mask;
+    u32 hi_bits_mask;
+    u32 lo_bits_mask;
+    u32 neg_mask;
+    s32 base_offset;
+    s32 last_start;
+    const RoseInstruction *target;
+
+    RoseInstrCheckMultipathShufti32x16(std::array<u8, 32> hi_mask_in,
+                                       std::array<u8, 32> lo_mask_in,
+                                   std::array<u8, 64> bucket_select_mask_hi_in,
+                                   std::array<u8, 64> bucket_select_mask_lo_in,
+                                       std::array<u8, 64> data_select_mask_in,
+                                       u32 hi_bits_mask_in, u32 lo_bits_mask_in,
+                                       u32 neg_mask_in, s32 base_offset_in,
+                                       s32 last_start_in,
+                                       const RoseInstruction *target_in)
+        : hi_mask(std::move(hi_mask_in)), lo_mask(std::move(lo_mask_in)),
+          bucket_select_mask_hi(std::move(bucket_select_mask_hi_in)),
+          bucket_select_mask_lo(std::move(bucket_select_mask_lo_in)),
+          data_select_mask(std::move(data_select_mask_in)),
+          hi_bits_mask(hi_bits_mask_in), lo_bits_mask(lo_bits_mask_in),
+          neg_mask(neg_mask_in), base_offset(base_offset_in),
+          last_start(last_start_in), target(target_in) {}
+
+    bool operator==(const RoseInstrCheckMultipathShufti32x16 &ri) const {
+        return hi_mask == ri.hi_mask && lo_mask == ri.lo_mask &&
+               bucket_select_mask_hi == ri.bucket_select_mask_hi &&
+               bucket_select_mask_lo == ri.bucket_select_mask_lo &&
+               data_select_mask == ri.data_select_mask &&
+               hi_bits_mask == ri.hi_bits_mask &&
+               lo_bits_mask == ri.lo_bits_mask &&
+               neg_mask == ri.neg_mask && base_offset == ri.base_offset &&
+               last_start == ri.last_start && target == ri.target;
+    }
+
+    size_t hash() const override {
+        return hash_all(static_cast<int>(opcode), hi_mask, lo_mask,
+                        bucket_select_mask_hi, bucket_select_mask_lo,
+                        data_select_mask, hi_bits_mask, lo_bits_mask, neg_mask,
+                        base_offset, last_start);
+    }
+
+    void write(void *dest, RoseEngineBlob &blob,
+               const OffsetMap &offset_map) const override;
+
+    bool equiv_to(const RoseInstrCheckMultipathShufti32x16 &ri,
+                  const OffsetMap &offsets,
+                  const OffsetMap &other_offsets) const {
+        return hi_mask == ri.hi_mask && lo_mask == ri.lo_mask &&
+               bucket_select_mask_hi == ri.bucket_select_mask_hi &&
+               bucket_select_mask_lo == ri.bucket_select_mask_lo &&
+               data_select_mask == ri.data_select_mask &&
+               hi_bits_mask == ri.hi_bits_mask &&
+               lo_bits_mask == ri.lo_bits_mask && neg_mask == ri.neg_mask &&
+               base_offset == ri.base_offset && last_start == ri.last_start &&
+               offsets.at(target) == other_offsets.at(ri.target);
+    }
+};
+
+class RoseInstrCheckMultipathShufti64
+    : public RoseInstrBaseOneTarget<ROSE_INSTR_CHECK_MULTIPATH_SHUFTI_64,
+                                    ROSE_STRUCT_CHECK_MULTIPATH_SHUFTI_64,
+                                    RoseInstrCheckMultipathShufti64> {
+public:
+    std::array<u8, 32> hi_mask;
+    std::array<u8, 32> lo_mask;
+    std::array<u8, 64> bucket_select_mask;
+    std::array<u8, 64> data_select_mask;
+    u64a hi_bits_mask;
+    u64a lo_bits_mask;
+    u64a neg_mask;
+    s32 base_offset;
+    s32 last_start;
+    const RoseInstruction *target;
+
+    RoseInstrCheckMultipathShufti64(std::array<u8, 32> hi_mask_in,
+                                    std::array<u8, 32> lo_mask_in,
+                                    std::array<u8, 64> bucket_select_mask_in,
+                                    std::array<u8, 64> data_select_mask_in,
+                                    u64a hi_bits_mask_in, u64a lo_bits_mask_in,
+                                    u64a neg_mask_in, s32 base_offset_in,
+                                    s32 last_start_in,
+                                    const RoseInstruction *target_in)
+        : hi_mask(std::move(hi_mask_in)), lo_mask(std::move(lo_mask_in)),
+          bucket_select_mask(std::move(bucket_select_mask_in)),
+          data_select_mask(std::move(data_select_mask_in)),
+          hi_bits_mask(hi_bits_mask_in), lo_bits_mask(lo_bits_mask_in),
+          neg_mask(neg_mask_in), base_offset(base_offset_in),
+          last_start(last_start_in), target(target_in) {}
+
+    bool operator==(const RoseInstrCheckMultipathShufti64 &ri) const {
+        return hi_mask == ri.hi_mask && lo_mask == ri.lo_mask &&
+               bucket_select_mask == ri.bucket_select_mask &&
+               data_select_mask == ri.data_select_mask &&
+               hi_bits_mask == ri.hi_bits_mask &&
+               lo_bits_mask == ri.lo_bits_mask &&
+               neg_mask == ri.neg_mask && base_offset == ri.base_offset &&
+               last_start == ri.last_start && target == ri.target;
+    }
+
+    size_t hash() const override {
+        return hash_all(static_cast<int>(opcode), hi_mask, lo_mask,
+                        bucket_select_mask, data_select_mask, hi_bits_mask,
+                        lo_bits_mask, neg_mask, base_offset, last_start);
+    }
+
+    void write(void *dest, RoseEngineBlob &blob,
+               const OffsetMap &offset_map) const override;
+
+    bool equiv_to(const RoseInstrCheckMultipathShufti64 &ri,
+                  const OffsetMap &offsets,
+                  const OffsetMap &other_offsets) const {
+        return hi_mask == ri.hi_mask && lo_mask == ri.lo_mask &&
+               bucket_select_mask == ri.bucket_select_mask &&
+               data_select_mask == ri.data_select_mask &&
+               hi_bits_mask == ri.hi_bits_mask &&
+               lo_bits_mask == ri.lo_bits_mask && neg_mask == ri.neg_mask &&
+               base_offset == ri.base_offset && last_start == ri.last_start &&
+               offsets.at(target) == other_offsets.at(ri.target);
+    }
+};
+
 class RoseInstrEnd
     : public RoseInstrBaseTrivial<ROSE_INSTR_END, ROSE_STRUCT_END,
                                   RoseInstrEnd> {
index c0250aa5e0889d15da7bd0dbd9cad1de8d7ab8f5..34678b8fcc4e1ec82e05470c4ee39d1d43206b1c 100644 (file)
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2015, Intel Corporation
+ * Copyright (c) 2015-2017, Intel Corporation
  *
  * Redistribution and use in source and binary forms, with or without
  * modification, are permitted provided that the following conditions are met:
 /** \brief Length in bytes of a reach bitvector, used by the lookaround code. */
 #define REACH_BITVECTOR_LEN 32
 
+/** \brief Length in bytes of a reach bitvector for multi-path lookaround. */
+#define MULTI_REACH_BITVECTOR_LEN 256
+
+/**
+ * \brief The max offset from the leftmost byte to the rightmost byte in
+ * multi-path lookaround.
+ */
+#define MULTIPATH_MAX_LEN 16
+
 /** \brief Value used to represent an invalid Rose program offset. */
 #define ROSE_INVALID_PROG_OFFSET 0
 
index 48f15ff002623eb88757e894c58951c4d3f41164..06a9b0692b2eb63c1a0bf85d2d161b6366b98acc 100644 (file)
@@ -386,7 +386,8 @@ struct RoseEngine {
     u32 roseCount;
     u32 lookaroundTableOffset; //!< base of lookaround offset list (of s8 values)
     u32 lookaroundReachOffset; /**< base of lookaround reach bitvectors (32
-                                * bytes each) */
+                                * bytes for single-path lookaround and 256 bytes
+                                * for multi-path lookaround) */
 
     u32 eodProgramOffset; //!< EOD program, otherwise 0.
 
index cf1a9eb6803e7a99f8b8ff7842ccf21436ad2a49..ebda679a0febd4ef9220ae3912bd04778b8f411e 100644 (file)
@@ -36,6 +36,7 @@
 #include "som/som_operation.h"
 #include "rose_internal.h"
 #include "ue2common.h"
+#include "util/simd_types.h"
 
 /** \brief Minimum alignment for each instruction in memory. */
 #define ROSE_INSTR_MIN_ALIGN 8U
@@ -146,7 +147,38 @@ enum RoseInstructionCode {
      */
     ROSE_INSTR_CLEAR_WORK_DONE,
 
-    LAST_ROSE_INSTRUCTION = ROSE_INSTR_CLEAR_WORK_DONE //!< Sentinel.
+    /** \brief Check lookaround if it has multiple paths. */
+    ROSE_INSTR_MULTIPATH_LOOKAROUND,
+
+    /**
+     * \brief Use shufti to check lookaround with multiple paths. The total
+     * length of the paths is 16 bytes at most and shufti has 8 buckets.
+     * All paths can be at most 16 bytes long.
+     */
+    ROSE_INSTR_CHECK_MULTIPATH_SHUFTI_16x8,
+
+    /**
+     * \brief Use shufti to check lookaround with multiple paths. The total
+     * length of the paths is 32 bytes at most and shufti has 8 buckets.
+     * All paths can be at most 16 bytes long.
+     */
+    ROSE_INSTR_CHECK_MULTIPATH_SHUFTI_32x8,
+
+    /**
+     * \brief Use shufti to check lookaround with multiple paths. The total
+     * length of the paths is 32 bytes at most and shufti has 16 buckets.
+     * All paths can be at most 16 bytes long.
+     */
+    ROSE_INSTR_CHECK_MULTIPATH_SHUFTI_32x16,
+
+    /**
+     * \brief Use shufti to check multiple paths lookaround. The total
+     * length of the paths is 64 bytes at most and shufti has 8 buckets.
+     * All paths can be at most 16 bytes long.
+     */
+    ROSE_INSTR_CHECK_MULTIPATH_SHUFTI_64,
+
+    LAST_ROSE_INSTRUCTION = ROSE_INSTR_CHECK_MULTIPATH_SHUFTI_64 //!< Sentinel.
 };
 
 struct ROSE_STRUCT_END {
@@ -192,14 +224,15 @@ struct ROSE_STRUCT_CHECK_NOT_HANDLED {
 struct ROSE_STRUCT_CHECK_SINGLE_LOOKAROUND {
     u8 code; //!< From enum RoseInstructionCode.
     s8 offset; //!< The offset of the byte to examine.
-    u32 reach_index; //!< The index of the reach table entry to use.
+    u32 reach_index; //!< Index for lookaround reach bitvectors.
     u32 fail_jump; //!< Jump forward this many bytes on failure.
 };
 
 struct ROSE_STRUCT_CHECK_LOOKAROUND {
     u8 code; //!< From enum RoseInstructionCode.
-    u32 index;
-    u32 count;
+    u32 look_index; //!< Index for lookaround offset list.
+    u32 reach_index; //!< Index for lookaround reach bitvectors.
+    u32 count; //!< The count of lookaround entries in one instruction.
     u32 fail_jump; //!< Jump forward this many bytes on failure.
 };
 
@@ -526,4 +559,70 @@ struct ROSE_STRUCT_CLEAR_WORK_DONE {
     u8 code; //!< From enum RoseInstructionCode.
 };
 
+struct ROSE_STRUCT_MULTIPATH_LOOKAROUND {
+    u8 code; //!< From enum RoseInstructionCode.
+    u32 look_index; //!< Index for lookaround offset list.
+    u32 reach_index; //!< Index for lookaround reach bitvectors.
+    u32 count; //!< The lookaround byte numbers for each path.
+    s32 last_start; //!< The latest start offset among 8 paths.
+    u8 start_mask[MULTIPATH_MAX_LEN]; /*!< Used to initialize path if left-most
+                                       * data is missed. */
+    u32 fail_jump; //!< Jump forward this many bytes on failure.
+};
+
+struct ROSE_STRUCT_CHECK_MULTIPATH_SHUFTI_16x8 {
+    u8 code; //!< From enum RoseInstructionCode.
+    u8 nib_mask[2 * sizeof(m128)]; //!< High and low nibble mask in shufti.
+    u8 bucket_select_mask[sizeof(m128)]; //!< Mask for bucket assigning.
+    u8 data_select_mask[sizeof(m128)]; //!< Shuffle mask for data ordering.
+    u32 hi_bits_mask; //!< High-bits used in multi-path validation.
+    u32 lo_bits_mask; //!< Low-bits used in multi-path validation.
+    u32 neg_mask; //!< 64 bits negation mask.
+    s32 base_offset; //!< Relative offset of the first byte.
+    s32 last_start; //!< The latest start offset among 8 paths.
+    u32 fail_jump; //!< Jump forward this many bytes on failure.
+};
+
+struct ROSE_STRUCT_CHECK_MULTIPATH_SHUFTI_32x8 {
+    u8 code; //!< From enum RoseInstructionCode.
+    u8 hi_mask[sizeof(m128)]; //!< High nibble mask in shufti.
+    u8 lo_mask[sizeof(m128)]; //!< Low nibble mask in shufti.
+    u8 bucket_select_mask[sizeof(m256)]; //!< Mask for bucket assigning.
+    u8 data_select_mask[sizeof(m256)]; //!< Shuffle mask for data ordering.
+    u32 hi_bits_mask; //!< High-bits used in multi-path validation.
+    u32 lo_bits_mask; //!< Low-bits used in multi-path validation.
+    u32 neg_mask; //!< 64 bits negation mask.
+    s32 base_offset; //!< Relative offset of the first byte.
+    s32 last_start; //!< The latest start offset among 8 paths.
+    u32 fail_jump; //!< Jump forward this many bytes on failure.
+};
+
+struct ROSE_STRUCT_CHECK_MULTIPATH_SHUFTI_32x16 {
+    u8 code; //!< From enum RoseInstructionCode.
+    u8 hi_mask[sizeof(m256)]; //!< High nibble mask in shufti.
+    u8 lo_mask[sizeof(m256)]; //!< Low nibble mask in shufti.
+    u8 bucket_select_mask_hi[sizeof(m256)]; //!< Mask for bucket assigning.
+    u8 bucket_select_mask_lo[sizeof(m256)]; //!< Mask for bucket assigning.
+    u8 data_select_mask[sizeof(m256)]; //!< Shuffle mask for data ordering.
+    u32 hi_bits_mask; //!< High-bits used in multi-path validation.
+    u32 lo_bits_mask; //!< Low-bits used in multi-path validation.
+    u32 neg_mask; //!< 64 bits negation mask.
+    s32 base_offset; //!< Relative offset of the first byte.
+    s32 last_start; //!< The latest start offset among 8 paths.
+    u32 fail_jump; //!< Jump forward this many bytes on failure.
+};
+
+struct ROSE_STRUCT_CHECK_MULTIPATH_SHUFTI_64 {
+    u8 code; //!< From enum RoseInstructionCode.
+    u8 hi_mask[sizeof(m128)]; //!< High nibble mask in shufti.
+    u8 lo_mask[sizeof(m128)]; //!< Low nibble mask in shufti.
+    u8 bucket_select_mask[2 * sizeof(m256)]; //!< Mask for bucket assigning.
+    u8 data_select_mask[2 * sizeof(m256)]; //!< Shuffle mask for data ordering.
+    u64a hi_bits_mask; //!< High-bits used in multi-path validation.
+    u64a lo_bits_mask; //!< Low-bits used in multi-path validation.
+    u64a neg_mask; //!< 64 bits negation mask.
+    s32 base_offset; //!< Relative offset of the first byte.
+    s32 last_start; //!< The latest start offset among 8 paths.
+    u32 fail_jump; //!< Jump forward this many bytes on failure.
+};
 #endif // ROSE_ROSE_PROGRAM_H
index 49d2c2fe6672f2af51d2323e2826312fee68d71e..e26d6c2bfdc8cc3dc4f6756570846ee858a6bcf8 100644 (file)
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2016, Intel Corporation
+ * Copyright (c) 2016-2017, Intel Corporation
  *
  * Redistribution and use in source and binary forms, with or without
  * modification, are permitted provided that the following conditions are met:
@@ -46,7 +46,7 @@ void dumpMask(const void *mask, int len) {
 static really_inline
 int validateShuftiMask16x16(const m256 data, const m256 hi_mask,
                             const m256 lo_mask, const m256 and_mask,
-                            const u32 neg_mask, const u16 valid_data_mask) {
+                            const u32 neg_mask, const u32 valid_data_mask) {
     m256 low4bits = set32x8(0xf);
     m256 c_lo = vpshufb(lo_mask, and256(data, low4bits));
     m256 c_hi = vpshufb(hi_mask, rshift64_m256(andnot256(low4bits, data), 4));
@@ -75,7 +75,7 @@ int validateShuftiMask16x16(const m256 data, const m256 hi_mask,
 static really_inline
 int validateShuftiMask16x8(const m128 data, const m256 nib_mask,
                            const m128 and_mask, const u32 neg_mask,
-                           const u16 valid_data_mask) {
+                           const u32 valid_data_mask) {
     m256 data_m256 = combine2x128(rshift64_m128(data, 4), data);
     m256 low4bits = set32x8(0xf);
     m256 c_nib = vpshufb(nib_mask, and256(data_m256, low4bits));
@@ -172,4 +172,121 @@ int validateShuftiMask32x16(const m256 data,
     u32 cmp_result = (nresult ^ neg_mask) & valid_data_mask;
     return !cmp_result;
 }
+
+static really_inline
+int checkMultipath32(u32 data, u32 hi_bits, u32 lo_bits) {
+    u32 t = ~(data | hi_bits);
+    t += lo_bits;
+    t &= (~data) & hi_bits;
+    DEBUG_PRINTF("t %x\n", t);
+    return !!t;
+}
+
+static really_inline
+int checkMultipath64(u64a data, u64a hi_bits, u64a lo_bits) {
+    u64a t = ~(data | hi_bits);
+    t += lo_bits;
+    t &= (~data) & hi_bits;
+    DEBUG_PRINTF("t %llx\n", t);
+    return !!t;
+}
+
+static really_inline
+int validateMultipathShuftiMask16x8(const m128 data,
+                                    const m256 nib_mask,
+                                    const m128 bucket_select_mask,
+                                    const u32 hi_bits, const u32 lo_bits,
+                                    const u32 neg_mask,
+                                    const u32 valid_path_mask) {
+    m256 data_256 = combine2x128(rshift64_m128(data, 4), data);
+    m256 low4bits = set32x8(0xf);
+    m256 c_nib = vpshufb(nib_mask, and256(data_256, low4bits));
+    m128 t = and128(movdq_hi(c_nib), movdq_lo(c_nib));
+    m128 result = and128(t, bucket_select_mask);
+    u32 nresult = movemask128(eq128(result, zeroes128()));
+    u32 cmp_result = (nresult ^ neg_mask) | valid_path_mask;
+
+    DEBUG_PRINTF("cmp_result %x\n", cmp_result);
+
+    return checkMultipath32(cmp_result, hi_bits, lo_bits);
+}
+
+static really_inline
+int validateMultipathShuftiMask32x8(const m256 data,
+                                    const m256 hi_mask, const m256 lo_mask,
+                                    const m256 bucket_select_mask,
+                                    const u32 hi_bits, const u32 lo_bits,
+                                    const u32 neg_mask,
+                                    const u32 valid_path_mask) {
+    m256 low4bits = set32x8(0xf);
+    m256 data_lo = and256(data, low4bits);
+    m256 data_hi = and256(rshift64_m256(data, 4), low4bits);
+    m256 c_lo = vpshufb(lo_mask, data_lo);
+    m256 c_hi = vpshufb(hi_mask, data_hi);
+    m256 c = and256(c_lo, c_hi);
+    m256 result = and256(c, bucket_select_mask);
+    u32 nresult = movemask256(eq256(result, zeroes256()));
+    u32 cmp_result = (nresult ^ neg_mask) | valid_path_mask;
+
+    DEBUG_PRINTF("cmp_result %x\n", cmp_result);
+
+    return checkMultipath32(cmp_result, hi_bits, lo_bits);
+}
+
+static really_inline
+int validateMultipathShuftiMask32x16(const m256 data,
+                                     const m256 hi_mask_1, const m256 hi_mask_2,
+                                     const m256 lo_mask_1, const m256 lo_mask_2,
+                                     const m256 bucket_select_mask_hi,
+                                     const m256 bucket_select_mask_lo,
+                                     const u32 hi_bits, const u32 lo_bits,
+                                     const u32 neg_mask,
+                                     const u32 valid_path_mask) {
+    m256 low4bits = set32x8(0xf);
+    m256 data_lo = and256(data, low4bits);
+    m256 data_hi = and256(rshift64_m256(data, 4), low4bits);
+    m256 c_lo_1 = vpshufb(lo_mask_1, data_lo);
+    m256 c_lo_2 = vpshufb(lo_mask_2, data_lo);
+    m256 c_hi_1 = vpshufb(hi_mask_1, data_hi);
+    m256 c_hi_2 = vpshufb(hi_mask_2, data_hi);
+    m256 t1 = and256(c_lo_1, c_hi_1);
+    m256 t2 = and256(c_lo_2, c_hi_2);
+    m256 result = or256(and256(t1, bucket_select_mask_lo),
+                        and256(t2, bucket_select_mask_hi));
+    u32 nresult = movemask256(eq256(result, zeroes256()));
+    u32 cmp_result = (nresult ^ neg_mask) | valid_path_mask;
+
+    DEBUG_PRINTF("cmp_result %x\n", cmp_result);
+
+    return checkMultipath32(cmp_result, hi_bits, lo_bits);
+}
+
+static really_inline
+int validateMultipathShuftiMask64(const m256 data_1, const m256 data_2,
+                                  const m256 hi_mask, const m256 lo_mask,
+                                  const m256 bucket_select_mask_1,
+                                  const m256 bucket_select_mask_2,
+                                  const u64a hi_bits, const u64a lo_bits,
+                                  const u64a neg_mask,
+                                  const u64a valid_path_mask) {
+    m256 low4bits = set32x8(0xf);
+    m256 c_lo_1 = vpshufb(lo_mask, and256(data_1, low4bits));
+    m256 c_lo_2 = vpshufb(lo_mask, and256(data_2, low4bits));
+    m256 c_hi_1 = vpshufb(hi_mask,
+                          rshift64_m256(andnot256(low4bits, data_1), 4));
+    m256 c_hi_2 = vpshufb(hi_mask,
+                          rshift64_m256(andnot256(low4bits, data_2), 4));
+    m256 t1 = and256(c_lo_1, c_hi_1);
+    m256 t2 = and256(c_lo_2, c_hi_2);
+    m256 nresult_1 = eq256(and256(t1, bucket_select_mask_1), zeroes256());
+    m256 nresult_2 = eq256(and256(t2, bucket_select_mask_2), zeroes256());
+    u64a nresult = (u64a)movemask256(nresult_1) |
+                   (u64a)movemask256(nresult_2) << 32;
+    u64a cmp_result = (nresult ^ neg_mask) | valid_path_mask;
+
+    DEBUG_PRINTF("cmp_result %llx\n", cmp_result);
+
+    return checkMultipath64(cmp_result, hi_bits, lo_bits);
+}
+
 #endif
index bc49a04661b35414855f7768dce981bad6b88659..1f8848430d7f4a8afd7fc2d9c3756cac3846779f 100644 (file)
@@ -317,6 +317,11 @@ m128 sub_u8_m128(m128 a, m128 b) {
     return _mm_sub_epi8(a, b);
 }
 
+static really_inline
+m128 set64x2(u64a hi, u64a lo) {
+    return _mm_set_epi64x(hi, lo);
+}
+
 /****
  **** 256-bit Primitives
  ****/
@@ -592,6 +597,18 @@ m256 mask1bit256(unsigned int n) {
     return loadu256(&simd_onebit_masks[mask_idx]);
 }
 
+static really_inline
+m256 set64x4(u64a hi_1, u64a hi_0, u64a lo_1, u64a lo_0) {
+#if defined(HAVE_AVX2)
+    return _mm256_set_epi64x(hi_1, hi_0, lo_1, lo_0);
+#else
+    m256 rv;
+    rv.hi = set64x2(hi_1, hi_0);
+    rv.lo = set64x2(lo_1, lo_0);
+    return rv;
+#endif
+}
+
 #if !defined(HAVE_AVX2)
 // switches on bit N in the given vector.
 static really_inline