(void)_mm256_xor_si256(z, z);
}" HAVE_AVX2)
+if (NOT HAVE_AVX2)
+ message(STATUS "Building without AVX2 support")
+endif ()
+
+# and now for AVX512
+CHECK_C_SOURCE_COMPILES("#include <${INTRIN_INC_H}>
+#if !defined(__AVX512BW__)
+#error no avx512bw
+#endif
+
+int main(){
+ __m512i z = _mm512_setzero_si512();
+ (void)_mm512_abs_epi8(z);
+}" HAVE_AVX512)
+
+if (NOT HAVE_AVX512)
+ message(STATUS "Building without AVX512 support")
+endif ()
+
unset (CMAKE_REQUIRED_FLAGS)
unset (INTRIN_INC_H)
/* "Define if building for EM64T" */
#cmakedefine ARCH_X86_64
+/* Define if AVX-512BW available */
+#cmakedefine HAVE_AVX512
+
/* internal build, switch on dump support. */
#cmakedefine DUMP_SUPPORT
DEBUG_PRINTF("using PSHUFB for 512-bit shuffle\n");
m512 accelPerm = limex->accelPermute;
m512 accelComp = limex->accelCompare;
-#if !defined(HAVE_AVX2)
+#if defined(HAVE_AVX512)
+ idx = packedExtract512(s, accelPerm, accelComp);
+#elif defined(HAVE_AVX2)
+ u32 idx1 = packedExtract256(s.lo, accelPerm.lo, accelComp.lo);
+ u32 idx2 = packedExtract256(s.hi, accelPerm.hi, accelComp.hi);
+ assert((idx1 & idx2) == 0); // should be no shared bits
+ idx = idx1 | idx2;
+#else
u32 idx1 = packedExtract128(s.lo.lo, accelPerm.lo.lo, accelComp.lo.lo);
u32 idx2 = packedExtract128(s.lo.hi, accelPerm.lo.hi, accelComp.lo.hi);
u32 idx3 = packedExtract128(s.hi.lo, accelPerm.hi.lo, accelComp.hi.lo);
u32 idx4 = packedExtract128(s.hi.hi, accelPerm.hi.hi, accelComp.hi.hi);
assert((idx1 & idx2 & idx3 & idx4) == 0); // should be no shared bits
idx = idx1 | idx2 | idx3 | idx4;
-#else
- u32 idx1 = packedExtract256(s.lo, accelPerm.lo, accelComp.lo);
- u32 idx2 = packedExtract256(s.hi, accelPerm.hi, accelComp.hi);
- assert((idx1 & idx2) == 0); // should be no shared bits
- idx = idx1 | idx2;
#endif
return accelScanWrapper(accelTable, aux, input, idx, i, end);
}
}
#endif // AVX2
+#if defined(HAVE_AVX512)
+static really_inline
+u32 packedExtract512(m512 s, const m512 permute, const m512 compare) {
+ // vpshufb doesn't cross lanes, so this is a bit of a cheat
+ m512 shuffled = pshufb_m512(s, permute);
+ m512 compared = and512(shuffled, compare);
+ u64a rv = ~eq512mask(compared, shuffled);
+ // stitch the lane-wise results back together
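+    // each 128-bit lane contributes one 16-bit slice of the 64-bit mask
+    // (bits 0-15, 16-31, 32-47, 48-63); OR-folding by 32 and then 16 packs
+    // all four slices into the low 16 bits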
+ rv = rv >> 32 | rv;
+ return (u32)(((rv >> 16) | rv) & 0xffffU);
+}
+#endif // AVX512
+
#endif // LIMEX_SHUFFLE_H
/*
- * Copyright (c) 2015-2016, Intel Corporation
+ * Copyright (c) 2015-2017, Intel Corporation
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions are met:
template<> struct NFATraits<TAMARAMA_NFA> {
UNUSED static const char *name;
static const NFACategory category = NFA_OTHER;
- static const u32 stateAlign = 32;
+ static const u32 stateAlign = 64;
static const bool fast = true;
static const nfa_dispatch_fn has_accel;
static const nfa_dispatch_fn has_repeats;
typedef struct ALIGN_AVX_DIRECTIVE {m128 lo; m128 hi;} m256;
#endif
-// these should align to 16 and 32 respectively
typedef struct {m128 lo; m128 mid; m128 hi;} m384;
-typedef struct {m256 lo; m256 hi;} m512;
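+// with AVX-512 available, m512 is the native 64-byte vector type; otherwise it
+// is a pair of m256 forced to 64-byte alignment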
+#if defined(HAVE_AVX512)
+typedef __m512i m512;
+#else
+typedef struct ALIGN_ATTR(64) {m256 lo; m256 hi;} m512;
+#endif
#endif /* SIMD_TYPES_H */
/*
- * Copyright (c) 2016, Intel Corporation
+ * Copyright (c) 2016-2017, Intel Corporation
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions are met:
/** \brief LUT for the mask1bit functions. */
ALIGN_CL_DIRECTIVE const u8 simd_onebit_masks[] = {
+ ZEROES_32, ZEROES_32,
ZEROES_31, 0x01, ZEROES_32,
ZEROES_31, 0x02, ZEROES_32,
ZEROES_31, 0x04, ZEROES_32,
ZEROES_31, 0x20, ZEROES_32,
ZEROES_31, 0x40, ZEROES_32,
ZEROES_31, 0x80, ZEROES_32,
+ ZEROES_32, ZEROES_32,
};
static really_inline
m128 mask1bit128(unsigned int n) {
assert(n < sizeof(m128) * 8);
- u32 mask_idx = ((n % 8) * 64) + 31;
+ u32 mask_idx = ((n % 8) * 64) + 95;
mask_idx -= n / 8;
return loadu128(&simd_onebit_masks[mask_idx]);
}
#endif
}
+#if defined(HAVE_AVX512)
+static really_inline
+m512 pshufb_m512(m512 a, m512 b) {
+ return _mm512_shuffle_epi8(a, b);
+}
+
+static really_inline
+m512 maskz_pshufb_m512(__mmask64 k, m512 a, m512 b) {
+ return _mm512_maskz_shuffle_epi8(k, a, b);
+}
+#endif
+
static really_inline
m128 variable_byte_shift_m128(m128 in, s32 amount) {
assert(amount >= -16 && amount <= 16);
static really_inline
m256 mask1bit256(unsigned int n) {
assert(n < sizeof(m256) * 8);
- u32 mask_idx = ((n % 8) * 64) + 31;
+ u32 mask_idx = ((n % 8) * 64) + 95;
mask_idx -= n / 8;
return loadu256(&simd_onebit_masks[mask_idx]);
}
**** 512-bit Primitives
****/
-static really_inline m512 and512(m512 a, m512 b) {
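+// AVX-512BW byte-wise compares: each yields a 64-bit mask with one bit per byte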
+#define eq512mask(a, b) _mm512_cmpeq_epi8_mask((a), (b))
+#define masked_eq512mask(k, a, b) _mm512_mask_cmpeq_epi8_mask((k), (a), (b))
+
+static really_inline
+m512 zeroes512(void) {
+#if defined(HAVE_AVX512)
+ return _mm512_setzero_si512();
+#else
+ m512 rv = {zeroes256(), zeroes256()};
+ return rv;
+#endif
+}
+
+static really_inline
+m512 ones512(void) {
+#if defined(HAVE_AVX512)
+ return _mm512_set1_epi8(0xFF);
+#else
+ m512 rv = {ones256(), ones256()};
+ return rv;
+#endif
+}
+
+#if defined(HAVE_AVX512)
+static really_inline
+m512 set64x8(u8 a) {
+ return _mm512_set1_epi8(a);
+}
+
+static really_inline
+m512 set8x64(u64a a) {
+ return _mm512_set1_epi64(a);
+}
+
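+// broadcast a 128-bit value into all four 128-bit lanes of an m512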
+static really_inline
+m512 set4x128(m128 a) {
+ return _mm512_broadcast_i32x4(a);
+}
+#endif
+
+static really_inline
+m512 and512(m512 a, m512 b) {
+#if defined(HAVE_AVX512)
+ return _mm512_and_si512(a, b);
+#else
m512 rv;
rv.lo = and256(a.lo, b.lo);
rv.hi = and256(a.hi, b.hi);
return rv;
+#endif
}
-static really_inline m512 or512(m512 a, m512 b) {
+static really_inline
+m512 or512(m512 a, m512 b) {
+#if defined(HAVE_AVX512)
+ return _mm512_or_si512(a, b);
+#else
m512 rv;
rv.lo = or256(a.lo, b.lo);
rv.hi = or256(a.hi, b.hi);
return rv;
+#endif
}
-static really_inline m512 xor512(m512 a, m512 b) {
+static really_inline
+m512 xor512(m512 a, m512 b) {
+#if defined(HAVE_AVX512)
+ return _mm512_xor_si512(a, b);
+#else
m512 rv;
rv.lo = xor256(a.lo, b.lo);
rv.hi = xor256(a.hi, b.hi);
return rv;
+#endif
}
-static really_inline m512 not512(m512 a) {
+static really_inline
+m512 not512(m512 a) {
+#if defined(HAVE_AVX512)
+ return _mm512_xor_si512(a, ones512());
+#else
m512 rv;
rv.lo = not256(a.lo);
rv.hi = not256(a.hi);
return rv;
+#endif
}
-static really_inline m512 andnot512(m512 a, m512 b) {
+static really_inline
+m512 andnot512(m512 a, m512 b) {
+#if defined(HAVE_AVX512)
+ return _mm512_andnot_si512(a, b);
+#else
m512 rv;
rv.lo = andnot256(a.lo, b.lo);
rv.hi = andnot256(a.hi, b.hi);
return rv;
+#endif
}
+#if defined(HAVE_AVX512)
+#define lshift64_m512(a, b) _mm512_slli_epi64((a), (b))
+#else
// The shift amount is an immediate
static really_really_inline
m512 lshift64_m512(m512 a, unsigned b) {
rv.hi = lshift64_m256(a.hi, b);
return rv;
}
+#endif
-static really_inline m512 zeroes512(void) {
- m512 rv = {zeroes256(), zeroes256()};
- return rv;
-}
+#if defined(HAVE_AVX512)
+#define rshift64_m512(a, b) _mm512_srli_epi64((a), (b))
+#define rshift128_m512(a, count_immed) _mm512_bsrli_epi128((a), (count_immed))
+#endif
-static really_inline m512 ones512(void) {
- m512 rv = {ones256(), ones256()};
- return rv;
-}
+#if !defined(_MM_CMPINT_NE)
+#define _MM_CMPINT_NE 0x4
+#endif
-static really_inline int diff512(m512 a, m512 b) {
+static really_inline
+int diff512(m512 a, m512 b) {
+#if defined(HAVE_AVX512)
+ return !!_mm512_cmp_epi8_mask(a, b, _MM_CMPINT_NE);
+#else
return diff256(a.lo, b.lo) || diff256(a.hi, b.hi);
+#endif
}
-static really_inline int isnonzero512(m512 a) {
-#if !defined(HAVE_AVX2)
+static really_inline
+int isnonzero512(m512 a) {
+#if defined(HAVE_AVX512)
+ return diff512(a, zeroes512());
+#elif defined(HAVE_AVX2)
+ m256 x = or256(a.lo, a.hi);
+ return !!diff256(x, zeroes256());
+#else
m128 x = or128(a.lo.lo, a.lo.hi);
m128 y = or128(a.hi.lo, a.hi.hi);
return isnonzero128(or128(x, y));
-#else
- m256 x = or256(a.lo, a.hi);
- return !!diff256(x, zeroes256());
#endif
}
* "Rich" version of diff512(). Takes two vectors a and b and returns a 16-bit
* mask indicating which 32-bit words contain differences.
*/
-static really_inline u32 diffrich512(m512 a, m512 b) {
-#if defined(HAVE_AVX2)
+static really_inline
+u32 diffrich512(m512 a, m512 b) {
+#if defined(HAVE_AVX512)
+ return _mm512_cmp_epi32_mask(a, b, _MM_CMPINT_NE);
+#elif defined(HAVE_AVX2)
return diffrich256(a.lo, b.lo) | (diffrich256(a.hi, b.hi) << 8);
#else
a.lo.lo = _mm_cmpeq_epi32(a.lo.lo, b.lo.lo);
* "Rich" version of diffrich(), 64-bit variant. Takes two vectors a and b and
* returns a 16-bit mask indicating which 64-bit words contain differences.
*/
-static really_inline u32 diffrich64_512(m512 a, m512 b) {
+static really_inline
+u32 diffrich64_512(m512 a, m512 b) {
+ //TODO: cmp_epi64?
u32 d = diffrich512(a, b);
return (d | (d >> 1)) & 0x55555555;
}
// aligned load
-static really_inline m512 load512(const void *ptr) {
+static really_inline
+m512 load512(const void *ptr) {
+#if defined(HAVE_AVX512)
+ return _mm512_load_si512(ptr);
+#else
assert(ISALIGNED_N(ptr, alignof(m256)));
m512 rv = { load256(ptr), load256((const char *)ptr + 32) };
return rv;
+#endif
}
// aligned store
-static really_inline void store512(void *ptr, m512 a) {
- assert(ISALIGNED_N(ptr, alignof(m256)));
-#if defined(HAVE_AVX2)
+static really_inline
+void store512(void *ptr, m512 a) {
+ assert(ISALIGNED_N(ptr, alignof(m512)));
+#if defined(HAVE_AVX512)
+    _mm512_store_si512(ptr, a);
+#elif defined(HAVE_AVX2)
m512 *x = (m512 *)ptr;
store256(&x->lo, a.lo);
store256(&x->hi, a.hi);
}
// unaligned load
-static really_inline m512 loadu512(const void *ptr) {
+static really_inline
+m512 loadu512(const void *ptr) {
+#if defined(HAVE_AVX512)
+ return _mm512_loadu_si512(ptr);
+#else
m512 rv = { loadu256(ptr), loadu256((const char *)ptr + 32) };
return rv;
+#endif
}
+#if defined(HAVE_AVX512)
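+// masked unaligned loads: only bytes whose mask bit is set are read; the rest
+// are zeroed (maskz) or taken from src (mask)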
+static really_inline
+m512 loadu_maskz_m512(__mmask64 k, const void *ptr) {
+ return _mm512_maskz_loadu_epi8(k, ptr);
+}
+
+static really_inline
+m512 loadu_mask_m512(m512 src, __mmask64 k, const void *ptr) {
+ return _mm512_mask_loadu_epi8(src, k, ptr);
+}
+#endif
+
// packed unaligned store of first N bytes
static really_inline
void storebytes512(void *ptr, m512 a, unsigned int n) {
return a;
}
+static really_inline
+m512 mask1bit512(unsigned int n) {
+ assert(n < sizeof(m512) * 8);
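+    // simd_onebit_masks has a 64-byte zero prefix and a 64-byte row per bit
+    // position, with the set byte at offset 31 of its row (64 + 31 = 95);
+    // stepping back n / 8 bytes lands bit n at byte n / 8 of the loaded vector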
+ u32 mask_idx = ((n % 8) * 64) + 95;
+ mask_idx -= n / 8;
+ return loadu512(&simd_onebit_masks[mask_idx]);
+}
+
// switches on bit N in the given vector.
static really_inline
void setbit512(m512 *ptr, unsigned int n) {
sub = &ptr->hi.hi;
}
setbit128(sub, n % 128);
+#elif defined(HAVE_AVX512)
+ *ptr = or512(mask1bit512(n), *ptr);
#else
m256 *sub;
if (n < 256) {
sub = &ptr->hi.hi;
}
clearbit128(sub, n % 128);
+#elif defined(HAVE_AVX512)
+ *ptr = andnot512(mask1bit512(n), *ptr);
#else
m256 *sub;
if (n < 256) {
sub = val.hi.hi;
}
return testbit128(sub, n % 128);
+#elif defined(HAVE_AVX512)
+ const m512 mask = mask1bit512(n);
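+    // test_epi8_mask ANDs the two vectors and reports non-zero bytes in a 64-bit mask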
+ return !!_mm512_test_epi8_mask(mask, val);
#else
m256 sub;
if (n < 256) {
expand32(v[14], m[14]), expand32(v[15], m[15]) };
m512 xvec;
-#if !defined(HAVE_AVX2)
- xvec.lo.lo = _mm_set_epi32(x[3], x[2], x[1], x[0]);
- xvec.lo.hi = _mm_set_epi32(x[7], x[6], x[5], x[4]);
- xvec.hi.lo = _mm_set_epi32(x[11], x[10], x[9], x[8]);
- xvec.hi.hi = _mm_set_epi32(x[15], x[14], x[13], x[12]);
-#else
+#if defined(HAVE_AVX512)
+ xvec = _mm512_set_epi32(x[15], x[14], x[13], x[12],
+ x[11], x[10], x[9], x[8],
+ x[7], x[6], x[5], x[4],
+ x[3], x[2], x[1], x[0]);
+#elif defined(HAVE_AVX2)
xvec.lo = _mm256_set_epi32(x[7], x[6], x[5], x[4],
x[3], x[2], x[1], x[0]);
xvec.hi = _mm256_set_epi32(x[15], x[14], x[13], x[12],
x[11], x[10], x[9], x[8]);
+#else
+ xvec.lo.lo = _mm_set_epi32(x[3], x[2], x[1], x[0]);
+ xvec.lo.hi = _mm_set_epi32(x[7], x[6], x[5], x[4]);
+ xvec.hi.lo = _mm_set_epi32(x[11], x[10], x[9], x[8]);
+ xvec.hi.hi = _mm_set_epi32(x[15], x[14], x[13], x[12]);
#endif
return xvec;
}
expand64(v[4], m[4]), expand64(v[5], m[5]),
expand64(v[6], m[6]), expand64(v[7], m[7]) };
-#if !defined(HAVE_AVX2)
+#if defined(HAVE_AVX512)
+ m512 xvec = _mm512_set_epi64(x[7], x[6], x[5], x[4],
+ x[3], x[2], x[1], x[0]);
+#elif defined(HAVE_AVX2)
+ m512 xvec = { .lo = _mm256_set_epi64x(x[3], x[2], x[1], x[0]),
+ .hi = _mm256_set_epi64x(x[7], x[6], x[5], x[4])};
+#else
m512 xvec = { .lo = { _mm_set_epi64x(x[1], x[0]),
_mm_set_epi64x(x[3], x[2]) },
.hi = { _mm_set_epi64x(x[5], x[4]),
_mm_set_epi64x(x[7], x[6]) } };
-#else
- m512 xvec = { .lo = _mm256_set_epi64x(x[3], x[2], x[1], x[0]),
- .hi = _mm256_set_epi64x(x[7], x[6], x[5], x[4])};
#endif
return xvec;
}
template<typename T>
static
void build_pshufb_masks_onebit(unsigned int bit, T *permute, T *compare) {
- static_assert(sizeof(T) == sizeof(m128) || sizeof(T) == sizeof(m256),
+ static_assert(sizeof(T) == sizeof(m128) || sizeof(T) == sizeof(m256) ||
+ sizeof(T) == sizeof(m512),
"should be valid type");
// permute mask has 0x80 in all bytes except the one we care about
memset(permute, 0x80, sizeof(*permute));
memset(compare, 0, sizeof(*compare));
char *pmsk = (char *)permute;
char *cmsk = (char *)compare;
- u8 off = (bit >= 128) ? 0x10 : 0;
+    u8 off = (bit / 128) * 0x10;
pmsk[off] = bit/8;
cmsk[off] = ~(1 << (bit % 8));
}
}
}
#endif
+
+#if defined(HAVE_AVX512)
+TEST(Shuffle, PackedExtract512_1) {
+ // Try all possible one-bit masks
+ for (unsigned int i = 0; i < 512; i++) {
+ // shuffle a single 1 bit to the front
+ m512 permute, compare;
+ build_pshufb_masks_onebit(i, &permute, &compare);
+ EXPECT_EQ(1U, packedExtract512(setbit<m512>(i), permute, compare));
+ EXPECT_EQ(1U, packedExtract512(ones512(), permute, compare));
+ // we should get zero out of these cases
+ EXPECT_EQ(0U, packedExtract512(zeroes512(), permute, compare));
+ EXPECT_EQ(0U, packedExtract512(not512(setbit<m512>(i)), permute, compare));
+ // we should get zero out of all the other bit positions
+        for (unsigned int j = 0; j < 512; j++) {
+            if (j == i) {
+                continue;
+            }
+            EXPECT_EQ(0U, packedExtract512(setbit<m512>(j), permute, compare));
+        }
+ }
+}
+#endif
} // namespace
ASSERT_EQ(16, alignof(m128));
ASSERT_EQ(32, alignof(m256));
ASSERT_EQ(16, alignof(m384));
- ASSERT_EQ(32, alignof(m512));
+ ASSERT_EQ(64, alignof(m512));
}
TEST(SimdUtilsTest, movq) {