u32 tmp = lv_u16(z->start + z->shift - 1, z->buf, z->end + 1);
tmp &= fdr->domainMask;
s = *((const m128 *)ft + tmp);
- s = shiftRight8Bits(s);
+ s = rshiftbyte_m128(s, 1);
} else {
s = fdr->start;
}
m128 st14 = *(const m128 *)(ft + v14*8);
m128 st15 = *(const m128 *)(ft + v15*8);
- st1 = byteShiftLeft128(st1, 1);
- st2 = byteShiftLeft128(st2, 2);
- st3 = byteShiftLeft128(st3, 3);
- st4 = byteShiftLeft128(st4, 4);
- st5 = byteShiftLeft128(st5, 5);
- st6 = byteShiftLeft128(st6, 6);
- st7 = byteShiftLeft128(st7, 7);
- st9 = byteShiftLeft128(st9, 1);
- st10 = byteShiftLeft128(st10, 2);
- st11 = byteShiftLeft128(st11, 3);
- st12 = byteShiftLeft128(st12, 4);
- st13 = byteShiftLeft128(st13, 5);
- st14 = byteShiftLeft128(st14, 6);
- st15 = byteShiftLeft128(st15, 7);
+ st1 = lshiftbyte_m128(st1, 1);
+ st2 = lshiftbyte_m128(st2, 2);
+ st3 = lshiftbyte_m128(st3, 3);
+ st4 = lshiftbyte_m128(st4, 4);
+ st5 = lshiftbyte_m128(st5, 5);
+ st6 = lshiftbyte_m128(st6, 6);
+ st7 = lshiftbyte_m128(st7, 7);
+ st9 = lshiftbyte_m128(st9, 1);
+ st10 = lshiftbyte_m128(st10, 2);
+ st11 = lshiftbyte_m128(st11, 3);
+ st12 = lshiftbyte_m128(st12, 4);
+ st13 = lshiftbyte_m128(st13, 5);
+ st14 = lshiftbyte_m128(st14, 6);
+ st15 = lshiftbyte_m128(st15, 7);
*s = or128(*s, st0);
*s = or128(*s, st1);
*s = or128(*s, st6);
*s = or128(*s, st7);
*conf0 = movq(*s);
- *s = byteShiftRight128(*s, 8);
+ *s = rshiftbyte_m128(*s, 8);
*conf0 ^= ~0ULL;
*s = or128(*s, st8);
*s = or128(*s, st14);
*s = or128(*s, st15);
*conf8 = movq(*s);
- *s = byteShiftRight128(*s, 8);
+ *s = rshiftbyte_m128(*s, 8);
*conf8 ^= ~0ULL;
}
m128 st12 = *(const m128 *)(ft + v12*8);
m128 st14 = *(const m128 *)(ft + v14*8);
- st2 = byteShiftLeft128(st2, 2);
- st4 = byteShiftLeft128(st4, 4);
- st6 = byteShiftLeft128(st6, 6);
- st10 = byteShiftLeft128(st10, 2);
- st12 = byteShiftLeft128(st12, 4);
- st14 = byteShiftLeft128(st14, 6);
+ st2 = lshiftbyte_m128(st2, 2);
+ st4 = lshiftbyte_m128(st4, 4);
+ st6 = lshiftbyte_m128(st6, 6);
+ st10 = lshiftbyte_m128(st10, 2);
+ st12 = lshiftbyte_m128(st12, 4);
+ st14 = lshiftbyte_m128(st14, 6);
*s = or128(*s, st0);
*s = or128(*s, st2);
*s = or128(*s, st4);
*s = or128(*s, st6);
*conf0 = movq(*s);
- *s = byteShiftRight128(*s, 8);
+ *s = rshiftbyte_m128(*s, 8);
*conf0 ^= ~0ULL;
*s = or128(*s, st8);
*s = or128(*s, st12);
*s = or128(*s, st14);
*conf8 = movq(*s);
- *s = byteShiftRight128(*s, 8);
+ *s = rshiftbyte_m128(*s, 8);
*conf8 ^= ~0ULL;
}
m128 st8 = *(const m128 *)(ft + v8*8);
m128 st12 = *(const m128 *)(ft + v12*8);
- st4 = byteShiftLeft128(st4, 4);
- st12 = byteShiftLeft128(st12, 4);
+ st4 = lshiftbyte_m128(st4, 4);
+ st12 = lshiftbyte_m128(st12, 4);
*s = or128(*s, st0);
*s = or128(*s, st4);
*conf0 = movq(*s);
- *s = byteShiftRight128(*s, 8);
+ *s = rshiftbyte_m128(*s, 8);
*conf0 ^= ~0ULL;
*s = or128(*s, st8);
*s = or128(*s, st12);
*conf8 = movq(*s);
- *s = byteShiftRight128(*s, 8);
+ *s = rshiftbyte_m128(*s, 8);
*conf8 ^= ~0ULL;
}
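/*
 * Sketch of the confirm-extraction pattern shared by the variants above
 * (illustrative, not part of the change): the per-position state vectors st*
 * are byte-shifted into alignment and OR'd into *s, then the two halves of
 * the 128-bit state are pulled out 64 bits at a time:
 *
 *     *conf0 = movq(*s);            // low 8 bytes -> positions 0..7
 *     *s = rshiftbyte_m128(*s, 8);  // move the high 8 bytes down
 *     *conf0 ^= ~0ULL;              // flip all bits before confirmation
 *     *conf8 = movq(*s);            // remaining 8 bytes -> positions 8..15
 *     *conf8 ^= ~0ULL;
 */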
do { \
if (unlikely(isnonzero128(var))) { \
u64a lo = movq(var); \
- u64a hi = movq(byteShiftRight128(var, 8)); \
+ u64a hi = movq(rshiftbyte_m128(var, 8)); \
if (unlikely(lo)) { \
conf_fn(&lo, bucket, offset, confBase, reason, a, ptr, \
control, &last_match); \
do { \
if (unlikely(isnonzero128(var))) { \
u32 part1 = movd(var); \
- u32 part2 = movd(byteShiftRight128(var, 4)); \
- u32 part3 = movd(byteShiftRight128(var, 8)); \
- u32 part4 = movd(byteShiftRight128(var, 12)); \
+ u32 part2 = movd(rshiftbyte_m128(var, 4)); \
+ u32 part3 = movd(rshiftbyte_m128(var, 8)); \
+ u32 part4 = movd(rshiftbyte_m128(var, 12)); \
if (unlikely(part1)) { \
conf_fn(&part1, bucket, offset, confBase, reason, a, ptr, \
control, &last_match); \
m128 prep_conf_teddy_m1(const m128 *maskBase, m128 p_mask, m128 val) {
m128 mask = set16x8(0xf);
m128 lo = and128(val, mask);
- m128 hi = and128(rshift2x64(val, 4), mask);
+ m128 hi = and128(rshift64_m128(val, 4), mask);
return and128(and128(pshufb(maskBase[0*2], lo),
pshufb(maskBase[0*2+1], hi)), p_mask);
}
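/*
 * Nibble-split sketch (scalar equivalent for a single byte b of 'val'; not
 * part of the change):
 *     lo_nibble = b & 0xf;         // indexes pshufb(maskBase[0], ...)
 *     hi_nibble = (b >> 4) & 0xf;  // indexes pshufb(maskBase[1], ...)
 * rshift64_m128(val, 4) shifts whole 64-bit lanes, so any bits carried in
 * from the neighbouring byte are discarded by the following and128 with 0xf.
 */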
m128 val) {
m128 mask = set16x8(0xf);
m128 lo = and128(val, mask);
- m128 hi = and128(rshift2x64(val, 4), mask);
+ m128 hi = and128(rshift64_m128(val, 4), mask);
m128 r = prep_conf_teddy_m1(maskBase, p_mask, val);
m128 res_1 = and128(pshufb(maskBase[1*2], lo),
m128 p_mask, m128 val) {
m128 mask = set16x8(0xf);
m128 lo = and128(val, mask);
- m128 hi = and128(rshift2x64(val, 4), mask);
+ m128 hi = and128(rshift64_m128(val, 4), mask);
m128 r = prep_conf_teddy_m2(maskBase, old_1, p_mask, val);
m128 res_2 = and128(pshufb(maskBase[2*2], lo),
m128 *old_3, m128 p_mask, m128 val) {
m128 mask = set16x8(0xf);
m128 lo = and128(val, mask);
- m128 hi = and128(rshift2x64(val, 4), mask);
+ m128 hi = and128(rshift64_m128(val, 4), mask);
m128 r = prep_conf_teddy_m3(maskBase, old_1, old_2, p_mask, val);
m128 res_3 = and128(pshufb(maskBase[3*2], lo),
64 * (offset);
*arrCnt += 1;
}
- u64a part_1 = movq(byteShiftRight128(var, 8));
+ u64a part_1 = movq(rshiftbyte_m128(var, 8));
while (unlikely(part_1)) {
bitArr[*arrCnt] = (u16) TEDDY_FIND_AND_CLEAR_LSB(&part_1) +
64 * (offset + 1);
32 * (offset * 2);
*arrCnt += 1;
}
- u32 part_1 = movd(byteShiftRight128(var, 4));
+ u32 part_1 = movd(rshiftbyte_m128(var, 4));
while (unlikely(part_1)) {
bitArr[*arrCnt] = (u16) TEDDY_FIND_AND_CLEAR_LSB(&part_1) +
32 * (offset * 2 + 1);
*arrCnt += 1;
}
- u32 part_2 = movd(byteShiftRight128(var, 8));
+ u32 part_2 = movd(rshiftbyte_m128(var, 8));
while (unlikely(part_2)) {
bitArr[*arrCnt] = (u16) TEDDY_FIND_AND_CLEAR_LSB(&part_2) +
32 * (offset * 2 + 2);
*arrCnt += 1;
}
- u32 part_3 = movd(byteShiftRight128(var, 12));
+ u32 part_3 = movd(rshiftbyte_m128(var, 12));
while (unlikely(part_3)) {
bitArr[*arrCnt] = (u16) TEDDY_FIND_AND_CLEAR_LSB(&part_3) +
32 * (offset * 2 + 3);
m256 prep_conf_fat_teddy_m1(const m256 *maskBase, m256 p_mask, m256 val) {
m256 mask = set32x8(0xf);
m256 lo = and256(val, mask);
- m256 hi = and256(rshift4x64(val, 4), mask);
+ m256 hi = and256(rshift64_m256(val, 4), mask);
return and256(and256(vpshufb(maskBase[0*2], lo),
vpshufb(maskBase[0*2+1], hi)), p_mask);
}
m256 val) {
m256 mask = set32x8(0xf);
m256 lo = and256(val, mask);
- m256 hi = and256(rshift4x64(val, 4), mask);
+ m256 hi = and256(rshift64_m256(val, 4), mask);
m256 r = prep_conf_fat_teddy_m1(maskBase, p_mask, val);
m256 res_1 = and256(vpshufb(maskBase[1*2], lo),
m256 p_mask, m256 val) {
m256 mask = set32x8(0xf);
m256 lo = and256(val, mask);
- m256 hi = and256(rshift4x64(val, 4), mask);
+ m256 hi = and256(rshift64_m256(val, 4), mask);
m256 r = prep_conf_fat_teddy_m2(maskBase, old_1, p_mask, val);
m256 res_2 = and256(vpshufb(maskBase[2*2], lo),
m256 *old_3, m256 p_mask, m256 val) {
m256 mask = set32x8(0xf);
m256 lo = and256(val, mask);
- m256 hi = and256(rshift4x64(val, 4), mask);
+ m256 hi = and256(rshift64_m256(val, 4), mask);
m256 r = prep_conf_fat_teddy_m3(maskBase, old_1, old_2, p_mask, val);
m256 res_3 = and256(vpshufb(maskBase[3*2], lo),
m256 prep_conf_fast_teddy_m1(m256 val, m256 mask, m256 maskLo, m256 maskHi,
m256 p_mask) {
m256 lo = and256(val, mask);
- m256 hi = and256(rshift4x64(val, 4), mask);
+ m256 hi = and256(rshift64_m256(val, 4), mask);
m256 res = and256(vpshufb(maskLo, lo), vpshufb(maskHi, hi));
return and256(res, p_mask);
}
v = and128(v, caseMask);
}
- u32 z = movemask128(and128(shiftLeft8Bits(eq128(mask1, v)), eq128(mask2, v)));
+ u32 z = movemask128(and128(lshiftbyte_m128(eq128(mask1, v), 1),
+ eq128(mask2, v)));
// mask out where we can't match
u32 mask = (0xFFFF >> (16 - l));
v = and128(v, caseMask);
}
- u32 z = movemask128(and128(shiftLeft8Bits(eq128(mask1, v)), eq128(mask2, v)));
+ u32 z = movemask128(and128(lshiftbyte_m128(eq128(mask1, v), 1),
+ eq128(mask2, v)));
// mask out where we can't match
u32 buf_off = start - offset;
// Shift macros for Limited NFAs. Defined in terms of uniform ops.
// LimExNFAxxx ptr in 'limex' and the current state in 's'
#define NFA_EXEC_LIM_SHIFT(nels_type, nels_i) \
- (JOIN(shift_, nels_type)( \
+ (JOIN(lshift_, nels_type)( \
JOIN(and_, nels_type)(s, \
JOIN(load_, nels_type)(&limex->shift[nels_i])), \
limex->shiftAmount[nels_i]))
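/*
 * Expansion sketch (illustrative): with nels_type == m128 and nels_i == 0,
 * NFA_EXEC_LIM_SHIFT pastes together
 *     lshift_m128(and_m128(s, load_m128(&limex->shift[0])),
 *                 limex->shiftAmount[0])
 * i.e. a per-64-bit-lane left shift (lshift64_m128 under the uniform macros
 * renamed below) of the masked state.
 */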
#include "shufti_common.h"
-
/** \brief Naive byte-by-byte implementation. */
static really_inline
const u8 *shuftiRevSlow(const u8 *lo, const u8 *hi, const u8 *buf,
m128 c2_lo = pshufb(mask2_lo, chars_lo);
m128 c2_hi = pshufb(mask2_hi, chars_hi);
- m128 t2 = or128(t, shiftRight8Bits(or128(c2_lo, c2_hi)));
+ m128 t2 = or128(t, rshiftbyte_m128(or128(c2_lo, c2_hi), 1));
#ifdef DEBUG
DEBUG_PRINTF(" c2_lo: "); dumpMsk128(c2_lo); printf("\n");
m256 c2_lo = vpshufb(mask2_lo, chars_lo);
m256 c2_hi = vpshufb(mask2_hi, chars_hi);
- m256 t2 = or256(t, shift256Right8Bits(or256(c2_lo, c2_hi)));
+ m256 t2 = or256(t, rshift128_m256(or256(c2_lo, c2_hi), 1));
#ifdef DEBUG
DEBUG_PRINTF(" c2_lo: "); dumpMsk256(c2_lo); printf("\n");
#endif
#define GET_LO_4(chars) and128(chars, low4bits)
-#define GET_HI_4(chars) rshift2x64(andnot128(low4bits, chars), 4)
+#define GET_HI_4(chars) rshift64_m128(andnot128(low4bits, chars), 4)
static really_inline
u32 block(m128 mask_lo, m128 mask_hi, m128 chars, const m128 low4bits,
#endif
#define GET_LO_4(chars) and256(chars, low4bits)
-#define GET_HI_4(chars) rshift4x64(andnot256(low4bits, chars), 4)
+#define GET_HI_4(chars) rshift64_m256(andnot256(low4bits, chars), 4)
static really_inline
u32 block(m256 mask_lo, m256 mask_hi, m256 chars, const m256 low4bits,
return NULL; // no match
}
-#define shift128r(a, b) _mm_srli_epi64((a), (b))
static really_inline
u32 block(m128 shuf_mask_lo_highclear, m128 shuf_mask_lo_highset, m128 v) {
m128 shuf1 = pshufb(shuf_mask_lo_highclear, v);
m128 t1 = xor128(v, highconst);
m128 shuf2 = pshufb(shuf_mask_lo_highset, t1);
- m128 t2 = andnot128(highconst, shift128r(v, 4));
+ m128 t2 = andnot128(highconst, rshift64_m128(v, 4));
m128 shuf3 = pshufb(shuf_mask_hi, t2);
m128 tmp = and128(or128(shuf1, shuf2), shuf3);
m128 tmp2 = eq128(tmp, zeroes128());
return NULL; // no match
}
-#define shift256r(a, b) _mm256_srli_epi64((a), (b))
static really_inline
u32 block(m256 shuf_mask_lo_highclear, m256 shuf_mask_lo_highset, m256 v) {
m256 shuf1 = vpshufb(shuf_mask_lo_highclear, v);
m256 t1 = xor256(v, highconst);
m256 shuf2 = vpshufb(shuf_mask_lo_highset, t1);
- m256 t2 = andnot256(highconst, shift256r(v, 4));
+ m256 t2 = andnot256(highconst, rshift64_m256(v, 4));
m256 shuf3 = vpshufb(shuf_mask_hi, t2);
m256 tmp = and256(or256(shuf1, shuf2), shuf3);
m256 tmp2 = eq256(tmp, zeroes256());
for (; buf + 16 < buf_end; buf += 16) {
m128 data = load128(buf);
u32 z = movemask128(and128(eq128(chars1, data),
- shiftRight8Bits(eq128(chars2, data))));
+ rshiftbyte_m128(eq128(chars2, data), 1)));
if (buf[15] == c1 && buf[16] == c2) {
z |= (1 << 15);
}
m128 data = load128(buf);
m128 v = and128(casemask, data);
u32 z = movemask128(and128(eq128(chars1, v),
- shiftRight8Bits(eq128(chars2, v))));
+ rshiftbyte_m128(eq128(chars2, v), 1)));
if ((buf[15] & CASE_CLEAR) == c1 && (buf[16] & CASE_CLEAR) == c2) {
z |= (1 << 15);
}
for (; buf + 16 < buf_end; buf += 16) {
m128 data = load128(buf);
- u32 z = movemask128(and128(eq128(chars1, and128(data, mask1)),
- shiftRight8Bits(eq128(chars2, and128(data, mask2)))));
+ m128 v1 = eq128(chars1, and128(data, mask1));
+ m128 v2 = eq128(chars2, and128(data, mask2));
+ u32 z = movemask128(and128(v1, rshiftbyte_m128(v2, 1)));
+
if ((buf[15] & m1) == c1 && (buf[16] & m2) == c2) {
z |= (1 << 15);
}
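/*
 * Pair-match sketch: eq128(chars1, ...) flags bytes equal to the first
 * character and eq128(chars2, ...) flags bytes equal to the second;
 * rshiftbyte_m128(..., 1) moves the "second char at i+1" flags down to lane
 * i, so each set bit of z marks a position where the two-byte pair starts.
 * Byte 15's partner lives in the next block, hence the scalar buf[15]/buf[16]
 * fixup above. The reverse scanners below use lshiftbyte_m128 for the same
 * reason in the other direction.
 */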
const u8 *dvermPrecondition(m128 chars1, m128 chars2, const u8 *buf) {
m128 data = loadu128(buf); // unaligned
u32 z = movemask128(and128(eq128(chars1, data),
- shiftRight8Bits(eq128(chars2, data))));
+ rshiftbyte_m128(eq128(chars2, data), 1)));
/* no fixup of the boundary required - the aligned run will pick it up */
if (unlikely(z)) {
m128 data = loadu128(buf); // unaligned
m128 v = and128(casemask, data);
u32 z = movemask128(and128(eq128(chars1, v),
- shiftRight8Bits(eq128(chars2, v))));
+ rshiftbyte_m128(eq128(chars2, v), 1)));
/* no fixup of the boundary required - the aligned run will pick it up */
if (unlikely(z)) {
const u8 *dvermPreconditionMasked(m128 chars1, m128 chars2,
m128 mask1, m128 mask2, const u8 *buf) {
m128 data = loadu128(buf); // unaligned
- u32 z = movemask128(and128(eq128(chars1, and128(data, mask1)),
- shiftRight8Bits(eq128(chars2, and128(data, mask2)))));
+ m128 v1 = eq128(chars1, and128(data, mask1));
+ m128 v2 = eq128(chars2, and128(data, mask2));
+ u32 z = movemask128(and128(v1, rshiftbyte_m128(v2, 1)));
/* no fixup of the boundary required - the aligned run will pick it up */
if (unlikely(z)) {
for (; buf + 16 < buf_end; buf_end -= 16) {
m128 data = load128(buf_end - 16);
u32 z = movemask128(and128(eq128(chars2, data),
- shiftLeft8Bits(eq128(chars1, data))));
+ lshiftbyte_m128(eq128(chars1, data), 1)));
if (buf_end[-17] == c1 && buf_end[-16] == c2) {
z |= 1;
}
m128 data = load128(buf_end - 16);
m128 v = and128(casemask, data);
u32 z = movemask128(and128(eq128(chars2, v),
- shiftLeft8Bits(eq128(chars1, v))));
+ lshiftbyte_m128(eq128(chars1, v), 1)));
if ((buf_end[-17] & CASE_CLEAR) == c1
&& (buf_end[-16] & CASE_CLEAR) == c2) {
z |= 1;
const u8 *rdvermPrecondition(m128 chars1, m128 chars2, const u8 *buf) {
m128 data = loadu128(buf);
u32 z = movemask128(and128(eq128(chars2, data),
- shiftLeft8Bits(eq128(chars1, data))));
+ lshiftbyte_m128(eq128(chars1, data), 1)));
/* no fixup of the boundary required - the aligned run will pick it up */
if (unlikely(z)) {
m128 data = loadu128(buf);
m128 v = and128(casemask, data);
u32 z = movemask128(and128(eq128(chars2, v),
- shiftLeft8Bits(eq128(chars1, v))));
+ lshiftbyte_m128(eq128(chars1, v), 1)));
/* no fixup of the boundary required - the aligned run will pick it up */
if (unlikely(z)) {
return lastMatchOffset(buf + 16, z);
}
#define GET_LO_4(chars) and128(chars, low4bits)
-#define GET_HI_4(chars) rshift2x64(andnot128(low4bits, chars), 4)
+#define GET_HI_4(chars) rshift64_m128(andnot128(low4bits, chars), 4)
static really_inline
u32 roseCountingMiracleScanShufti(m128 mask_lo, m128 mask_hi, u8 poison,
#endif
}
-#define shift2x64(a, b) _mm_slli_epi64((a), (b))
-#define rshift2x64(a, b) _mm_srli_epi64((a), (b))
+#define lshift64_m128(a, b) _mm_slli_epi64((a), (b))
+#define rshift64_m128(a, b) _mm_srli_epi64((a), (b))
#define eq128(a, b) _mm_cmpeq_epi8((a), (b))
#define movemask128(a) ((u32)_mm_movemask_epi8((a)))
#endif
}
-static really_inline m128 shiftRight8Bits(m128 a) {
- return _mm_srli_si128(a,1);
-}
-
-static really_inline m128 shiftLeft8Bits(m128 a) {
- return _mm_slli_si128(a,1);
-}
-
-#define byteShiftRight128(a, count_immed) _mm_srli_si128(a, count_immed)
-#define byteShiftLeft128(a, count_immed) _mm_slli_si128(a, count_immed)
+#define rshiftbyte_m128(a, count_immed) _mm_srli_si128(a, count_immed)
+#define lshiftbyte_m128(a, count_immed) _mm_slli_si128(a, count_immed)
#if !defined(__AVX2__)
// TODO: this entire file needs restructuring - this carveout is awful
#define extract32from256(a, imm) _mm_extract_epi32((imm >> 2) ? a.hi : a.lo, imm % 4)
#define extract64from256(a, imm) _mm_extract_epi64((imm >> 2) ? a.hi : a.lo, imm % 2)
#else
-#define extract32from256(a, imm) movd(byteShiftRight128((imm >> 2) ? a.hi : a.lo, (imm % 4) * 8))
-#define extract64from256(a, imm) movq(byteShiftRight128((imm >> 2) ? a.hi : a.lo, (imm % 2) * 8))
+#define extract32from256(a, imm) movd(_mm_srli_si128((imm >> 2) ? a.hi : a.lo, (imm % 4) * 8))
+#define extract64from256(a, imm) movq(_mm_srli_si128((imm >> 2) ? a.hi : a.lo, (imm % 2) * 8))
#endif
#endif // !AVX2
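/*
 * Naming sketch after the rename (illustrative values, not part of the
 * change): rshiftbyte_m128/lshiftbyte_m128 shift the whole 128-bit register
 * by an immediate number of bytes, while rshift64_m128/lshift64_m128 shift
 * bits within each 64-bit lane.
 *
 *     m128 x = set16x8(0x80);    // every byte 0x80
 *     rshift64_m128(x, 4);       // lane-wise bit shift  -> every byte 0x08
 *     rshiftbyte_m128(x, 1);     // register byte shift  -> top byte becomes 0
 */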
return _mm_andnot_si128(a, b);
}
-// The shift amount is an immediate, so we define these operations as macros on
-// Intel SIMD.
-#define shift128(a, b) _mm_slli_epi64((a), (b))
-
// aligned load
static really_inline m128 load128(const void *ptr) {
assert(ISALIGNED_N(ptr, alignof(m128)));
****/
#if defined(__AVX2__)
-#define shift4x64(a, b) _mm256_slli_epi64((a), (b))
-#define rshift4x64(a, b) _mm256_srli_epi64((a), (b))
+#define lshift64_m256(a, b) _mm256_slli_epi64((a), (b))
+#define rshift64_m256(a, b) _mm256_srli_epi64((a), (b))
static really_inline
m256 set32x8(u32 in) {
#else
static really_inline
-m256 shift4x64(m256 a, int b) {
+m256 lshift64_m256(m256 a, int b) {
m256 rv = a;
- rv.lo = shift2x64(rv.lo, b);
- rv.hi = shift2x64(rv.hi, b);
+ rv.lo = lshift64_m128(rv.lo, b);
+ rv.hi = lshift64_m128(rv.hi, b);
return rv;
}
static really_inline
-m256 rshift4x64(m256 a, int b) {
+m256 rshift64_m256(m256 a, int b) {
m256 rv = a;
- rv.lo = rshift2x64(rv.lo, b);
- rv.hi = rshift2x64(rv.hi, b);
+ rv.lo = rshift64_m128(rv.lo, b);
+ rv.hi = rshift64_m128(rv.hi, b);
return rv;
}
static really_inline
}
#endif
-// The shift amount is an immediate
-#if defined(__AVX2__)
-#define shift256(a, b) _mm256_slli_epi64((a), (b))
-#else
-static really_really_inline m256 shift256(m256 a, unsigned b) {
- m256 rv;
- rv.lo = shift128(a.lo, b);
- rv.hi = shift128(a.hi, b);
- return rv;
-}
-#endif
-
static really_inline int diff256(m256 a, m256 b) {
#if defined(__AVX2__)
return !!(_mm256_movemask_epi8(_mm256_cmpeq_epi8(a, b)) ^ (int)-1);
return _mm256_extracti128_si256(x, 0);
}
-static really_inline
-m256 shift256Right8Bits(m256 a) {
- return _mm256_srli_si256(a, 1);
-}
-
-static really_inline
-m256 shift256Left8Bits(m256 a) {
- return _mm256_slli_si256(a, 1);
-}
#define cast256to128(a) _mm256_castsi256_si128(a)
#define cast128to256(a) _mm256_castsi128_si256(a)
#define swap128in256(a) _mm256_permute4x64_epi64(a, 0x4E)
#define insert128to256(a, b, imm) _mm256_inserti128_si256(a, b, imm)
-#define byteShiftRight256(a, count_immed) _mm256_srli_si256(a, count_immed)
-#define byteShiftLeft256(a, count_immed) _mm256_slli_si256(a, count_immed)
+#define rshift128_m256(a, count_immed) _mm256_srli_si256(a, count_immed)
+#define lshift128_m256(a, count_immed) _mm256_slli_si256(a, count_immed)
#define extract64from256(a, imm) _mm_extract_epi64(_mm256_extracti128_si256(a, imm >> 1), imm % 2)
#define extract32from256(a, imm) _mm_extract_epi32(_mm256_extracti128_si256(a, imm >> 2), imm % 4)
#define extractlow64from256(a) _mm_cvtsi128_si64(cast256to128(a))
}
// The shift amount is an immediate
-static really_really_inline m384 shift384(m384 a, unsigned b) {
+static really_really_inline
+m384 lshift64_m384(m384 a, unsigned b) {
m384 rv;
- rv.lo = shift128(a.lo, b);
- rv.mid = shift128(a.mid, b);
- rv.hi = shift128(a.hi, b);
+ rv.lo = lshift64_m128(a.lo, b);
+ rv.mid = lshift64_m128(a.mid, b);
+ rv.hi = lshift64_m128(a.hi, b);
return rv;
}
}
// The shift amount is an immediate
-static really_really_inline m512 shift512(m512 a, unsigned b) {
+static really_really_inline
+m512 lshift64_m512(m512 a, unsigned b) {
m512 rv;
- rv.lo = shift256(a.lo, b);
- rv.hi = shift256(a.hi, b);
+ rv.lo = lshift64_m256(a.lo, b);
+ rv.hi = lshift64_m256(a.hi, b);
return rv;
}
/*
- * Copyright (c) 2015, Intel Corporation
+ * Copyright (c) 2015-2016, Intel Corporation
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions are met:
#define andnot_m384(a, b) (andnot384(a, b))
#define andnot_m512(a, b) (andnot512(a, b))
-#define shift_u32(a, b) ((a) << (b))
-#define shift_u64a(a, b) ((a) << (b))
-#define shift_m128(a, b) (shift128(a, b))
-#define shift_m256(a, b) (shift256(a, b))
-#define shift_m384(a, b) (shift384(a, b))
-#define shift_m512(a, b) (shift512(a, b))
+#define lshift_u32(a, b) ((a) << (b))
+#define lshift_u64a(a, b) ((a) << (b))
+#define lshift_m128(a, b) (lshift64_m128(a, b))
+#define lshift_m256(a, b) (lshift64_m256(a, b))
+#define lshift_m384(a, b) (lshift64_m384(a, b))
+#define lshift_m512(a, b) (lshift64_m512(a, b))
#define isZero_u8(a) ((a) == 0)
#define isZero_u32(a) ((a) == 0)
char base[] = "0123456789ABCDEF";
m128 in = loadu128(base);
- EXPECT_TRUE(!diff128(byteShiftRight128(in, 0),
+ EXPECT_TRUE(!diff128(rshiftbyte_m128(in, 0),
variable_byte_shift_m128(in, 0)));
- EXPECT_TRUE(!diff128(byteShiftRight128(in, 1),
+ EXPECT_TRUE(!diff128(rshiftbyte_m128(in, 1),
variable_byte_shift_m128(in, -1)));
- EXPECT_TRUE(!diff128(byteShiftRight128(in, 2),
+ EXPECT_TRUE(!diff128(rshiftbyte_m128(in, 2),
variable_byte_shift_m128(in, -2)));
- EXPECT_TRUE(!diff128(byteShiftRight128(in, 3),
+ EXPECT_TRUE(!diff128(rshiftbyte_m128(in, 3),
variable_byte_shift_m128(in, -3)));
- EXPECT_TRUE(!diff128(byteShiftRight128(in, 4),
+ EXPECT_TRUE(!diff128(rshiftbyte_m128(in, 4),
variable_byte_shift_m128(in, -4)));
- EXPECT_TRUE(!diff128(byteShiftRight128(in, 5),
+ EXPECT_TRUE(!diff128(rshiftbyte_m128(in, 5),
variable_byte_shift_m128(in, -5)));
- EXPECT_TRUE(!diff128(byteShiftRight128(in, 6),
+ EXPECT_TRUE(!diff128(rshiftbyte_m128(in, 6),
variable_byte_shift_m128(in, -6)));
- EXPECT_TRUE(!diff128(byteShiftRight128(in, 7),
+ EXPECT_TRUE(!diff128(rshiftbyte_m128(in, 7),
variable_byte_shift_m128(in, -7)));
- EXPECT_TRUE(!diff128(byteShiftRight128(in, 8),
+ EXPECT_TRUE(!diff128(rshiftbyte_m128(in, 8),
variable_byte_shift_m128(in, -8)));
- EXPECT_TRUE(!diff128(byteShiftRight128(in, 9),
+ EXPECT_TRUE(!diff128(rshiftbyte_m128(in, 9),
variable_byte_shift_m128(in, -9)));
- EXPECT_TRUE(!diff128(byteShiftRight128(in, 10),
+ EXPECT_TRUE(!diff128(rshiftbyte_m128(in, 10),
variable_byte_shift_m128(in, -10)));
- EXPECT_TRUE(!diff128(byteShiftLeft128(in, 0),
+ EXPECT_TRUE(!diff128(lshiftbyte_m128(in, 0),
variable_byte_shift_m128(in, 0)));
- EXPECT_TRUE(!diff128(byteShiftLeft128(in, 1),
+ EXPECT_TRUE(!diff128(lshiftbyte_m128(in, 1),
variable_byte_shift_m128(in, 1)));
- EXPECT_TRUE(!diff128(byteShiftLeft128(in, 2),
+ EXPECT_TRUE(!diff128(lshiftbyte_m128(in, 2),
variable_byte_shift_m128(in, 2)));
- EXPECT_TRUE(!diff128(byteShiftLeft128(in, 3),
+ EXPECT_TRUE(!diff128(lshiftbyte_m128(in, 3),
variable_byte_shift_m128(in, 3)));
- EXPECT_TRUE(!diff128(byteShiftLeft128(in, 4),
+ EXPECT_TRUE(!diff128(lshiftbyte_m128(in, 4),
variable_byte_shift_m128(in, 4)));
- EXPECT_TRUE(!diff128(byteShiftLeft128(in, 5),
+ EXPECT_TRUE(!diff128(lshiftbyte_m128(in, 5),
variable_byte_shift_m128(in, 5)));
- EXPECT_TRUE(!diff128(byteShiftLeft128(in, 6),
+ EXPECT_TRUE(!diff128(lshiftbyte_m128(in, 6),
variable_byte_shift_m128(in, 6)));
- EXPECT_TRUE(!diff128(byteShiftLeft128(in, 7),
+ EXPECT_TRUE(!diff128(lshiftbyte_m128(in, 7),
variable_byte_shift_m128(in, 7)));
- EXPECT_TRUE(!diff128(byteShiftLeft128(in, 8),
+ EXPECT_TRUE(!diff128(lshiftbyte_m128(in, 8),
variable_byte_shift_m128(in, 8)));
- EXPECT_TRUE(!diff128(byteShiftLeft128(in, 9),
+ EXPECT_TRUE(!diff128(lshiftbyte_m128(in, 9),
variable_byte_shift_m128(in, 9)));
- EXPECT_TRUE(!diff128(byteShiftLeft128(in, 10),
+ EXPECT_TRUE(!diff128(lshiftbyte_m128(in, 10),
variable_byte_shift_m128(in, 10)));
EXPECT_TRUE(!diff128(zeroes128(), variable_byte_shift_m128(in, 16)));
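// Taken together, these checks document the semantics relied on above:
// variable_byte_shift_m128(in, n) matches lshiftbyte_m128(in, n) for n >= 0,
// matches rshiftbyte_m128(in, -n) for n < 0, and a shift of 16 bytes yields
// zeroes128().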