simd: simplify the set-all-ones util funcs

author Matthew Barr <matthew.barr@intel.com>

Mon, 6 Jun 2016 06:24:55 +0000 (16:24 +1000)

committer Matthew Barr <matthew.barr@intel.com>

Fri, 8 Jul 2016 01:07:50 +0000 (11:07 +1000)
author Matthew Barr <matthew.barr@intel.com>
Mon, 6 Jun 2016 06:24:55 +0000 (16:24 +1000)
committer Matthew Barr <matthew.barr@intel.com>
Fri, 8 Jul 2016 01:07:50 +0000 (11:07 +1000)
diff --git a/src/util/simd_utils.h b/src/util/simd_utils.h

index 4ac7b176f4a4c3a084540e3f0ebbe5b8295b1621..90f3893d48ca67b26ef899250f09a3ecd24bd47c 100644 (file)
--- a/src/util/simd_utils.h
+++ b/src/util/simd_utils.h
@@ -88,10 +88,6 @@
  #  endif
  #endif
  
-#ifdef _WIN32
-#define NO_ASM
-#endif
-
  // Fallback to identity case.
  #ifndef assume_aligned
  #define assume_aligned(x, y) (x)
@@ -106,13 +102,12 @@ extern const char vbs_mask_data[];
  #endif
  
  static really_inline m128 ones128(void) {
-#if !defined(NO_ASM)
-    // trick from Intel's optimization guide to generate all-ones. We have to
-    // use inline asm, as there's no intrinsic for this yet.
-    m128 ret;
-    __asm__ ("pcmpeqb %0,%0" : "=x"(ret));
-    return ret;
+#if defined(__GNUC__) || defined(__INTEL_COMPILER)
+    /* gcc gets this right */
+    return _mm_set1_epi8(0xFF);
  #else
+    /* trick from Intel's optimization guide to generate all-ones.
+     * ICC converts this to the single cmpeq instruction */
      return _mm_cmpeq_epi8(_mm_setzero_si128(), _mm_setzero_si128());
  #endif
  }
@@ -172,19 +167,8 @@ static really_inline unsigned short cmpmsk8(m128 a, m128 b) {
  #define eq128(a, b)      _mm_cmpeq_epi8((a), (b))
  #define movemask128(a)  ((u32)_mm_movemask_epi8((a)))
  
-// We found that this generated better code with gcc-4.1 and with the default
-// tuning settings on gcc-4.4 than just using the _mm_set1_epi8() instrinsic.
  static really_inline m128 set16x8(u8 c) {
-#if !defined(__AVX2__)
-    m128 a = _mm_cvtsi32_si128((int)c);
-    a = _mm_unpacklo_epi8(a, a);
-    a = _mm_unpacklo_epi8(a, a);
-    a = _mm_shuffle_epi32(a, 0);
-    return a;
-#else
-    // uses a broadcast for much win
      return _mm_set1_epi8(c);
-#endif
  }
  
  static really_inline u32 movd(const m128 in) {
@@ -369,8 +353,7 @@ m128 variable_byte_shift_m128(m128 in, s32 amount) {
  
  static really_inline
  m256 set32x8(u32 in) {
-    m128 a = _mm_cvtsi32_si128(in);
-    return _mm256_broadcastb_epi8(a);
+    return _mm256_set1_epi8(in);
  }
  
  #define eq256(a, b)     _mm256_cmpeq_epi8((a), (b))
@@ -423,12 +406,7 @@ static really_inline m256 zeroes256(void) {
  
  static really_inline m256 ones256(void) {
  #if defined(__AVX2__)
-    m256 rv;
-#if defined(NO_ASM)
-    rv = eq256(zeroes256(), zeroes256());
-#else
-    __asm__ ("vpcmpeqb %0,%0,%0" : "=x"(rv));
-#endif
+    m256 rv = _mm256_set1_epi8(0xFF);
  #else
      m256 rv = {ones128(), ones128()};
  #endif
author	Matthew Barr <matthew.barr@intel.com>
	Mon, 6 Jun 2016 06:24:55 +0000 (16:24 +1000)
committer	Matthew Barr <matthew.barr@intel.com>
	Fri, 8 Jul 2016 01:07:50 +0000 (11:07 +1000)