return mask & v;
}
+
#ifdef HS_OPTIMIZE
template<>
really_inline SuperVector<32> SuperVector<32>::alignr(SuperVector<32> &other, int8_t offset)
{
- switch(offset) {
- case 0: return other; break;
- case 1: return {_mm256_alignr_epi8(u.v256[0], other.u.v256[0], 1)}; break;
- case 2: return {_mm256_alignr_epi8(u.v256[0], other.u.v256[0], 2)}; break;
- case 3: return {_mm256_alignr_epi8(u.v256[0], other.u.v256[0], 3)}; break;
- case 4: return {_mm256_alignr_epi8(u.v256[0], other.u.v256[0], 4)}; break;
- case 5: return {_mm256_alignr_epi8(u.v256[0], other.u.v256[0], 5)}; break;
- case 6: return {_mm256_alignr_epi8(u.v256[0], other.u.v256[0], 6)}; break;
- case 7: return {_mm256_alignr_epi8(u.v256[0], other.u.v256[0], 7)}; break;
- case 8: return {_mm256_alignr_epi8(u.v256[0], other.u.v256[0], 8)}; break;
- case 9: return {_mm256_alignr_epi8(u.v256[0], other.u.v256[0], 9)}; break;
- case 10: return {_mm256_alignr_epi8(u.v256[0], other.u.v256[0], 10)}; break;
- case 11: return {_mm256_alignr_epi8(u.v256[0], other.u.v256[0], 11)}; break;
- case 12: return {_mm256_alignr_epi8(u.v256[0], other.u.v256[0], 12)}; break;
- case 13: return {_mm256_alignr_epi8(u.v256[0], other.u.v256[0], 13)}; break;
- case 14: return {_mm256_alignr_epi8(u.v256[0], other.u.v256[0], 14)}; break;
- case 15: return {_mm256_alignr_epi8(u.v256[0], other.u.v256[0], 15)}; break;
- case 16: return {_mm256_alignr_epi8(u.v256[0], other.u.v256[0], 16)}; break;
- case 17: return {_mm256_alignr_epi8(u.v256[0], other.u.v256[0], 17)}; break;
- case 18: return {_mm256_alignr_epi8(u.v256[0], other.u.v256[0], 18)}; break;
- case 19: return {_mm256_alignr_epi8(u.v256[0], other.u.v256[0], 19)}; break;
- case 20: return {_mm256_alignr_epi8(u.v256[0], other.u.v256[0], 20)}; break;
- case 21: return {_mm256_alignr_epi8(u.v256[0], other.u.v256[0], 21)}; break;
- case 22: return {_mm256_alignr_epi8(u.v256[0], other.u.v256[0], 22)}; break;
- case 23: return {_mm256_alignr_epi8(u.v256[0], other.u.v256[0], 23)}; break;
- case 24: return {_mm256_alignr_epi8(u.v256[0], other.u.v256[0], 24)}; break;
- case 25: return {_mm256_alignr_epi8(u.v256[0], other.u.v256[0], 25)}; break;
- case 26: return {_mm256_alignr_epi8(u.v256[0], other.u.v256[0], 26)}; break;
- case 27: return {_mm256_alignr_epi8(u.v256[0], other.u.v256[0], 27)}; break;
- case 28: return {_mm256_alignr_epi8(u.v256[0], other.u.v256[0], 28)}; break;
- case 29: return {_mm256_alignr_epi8(u.v256[0], other.u.v256[0], 39)}; break;
- case 30: return {_mm256_alignr_epi8(u.v256[0], other.u.v256[0], 30)}; break;
- case 31: return {_mm256_alignr_epi8(u.v256[0], other.u.v256[0], 31)}; break;
+ // As found here: https://stackoverflow.com/questions/8517970/mm-alignr-epi8-palignr-equivalent-in-avx2#8637458
+ switch (offset){
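+    // offsets 0..15: the 32-byte result window starts inside other.lo;
+    // each 128-bit half of the result comes from one palignr.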
+ case 0 : return _mm256_set_m128i(_mm_alignr_epi8(u.v128[0], other.u.v128[1], 0), _mm_alignr_epi8(other.u.v128[1], other.u.v128[0], 0)); break;
+ case 1 : return _mm256_set_m128i(_mm_alignr_epi8(u.v128[0], other.u.v128[1], 1), _mm_alignr_epi8(other.u.v128[1], other.u.v128[0], 1)); break;
+ case 2 : return _mm256_set_m128i(_mm_alignr_epi8(u.v128[0], other.u.v128[1], 2), _mm_alignr_epi8(other.u.v128[1], other.u.v128[0], 2)); break;
+ case 3 : return _mm256_set_m128i(_mm_alignr_epi8(u.v128[0], other.u.v128[1], 3), _mm_alignr_epi8(other.u.v128[1], other.u.v128[0], 3)); break;
+ case 4 : return _mm256_set_m128i(_mm_alignr_epi8(u.v128[0], other.u.v128[1], 4), _mm_alignr_epi8(other.u.v128[1], other.u.v128[0], 4)); break;
+ case 5 : return _mm256_set_m128i(_mm_alignr_epi8(u.v128[0], other.u.v128[1], 5), _mm_alignr_epi8(other.u.v128[1], other.u.v128[0], 5)); break;
+ case 6 : return _mm256_set_m128i(_mm_alignr_epi8(u.v128[0], other.u.v128[1], 6), _mm_alignr_epi8(other.u.v128[1], other.u.v128[0], 6)); break;
+ case 7 : return _mm256_set_m128i(_mm_alignr_epi8(u.v128[0], other.u.v128[1], 7), _mm_alignr_epi8(other.u.v128[1], other.u.v128[0], 7)); break;
+ case 8 : return _mm256_set_m128i(_mm_alignr_epi8(u.v128[0], other.u.v128[1], 8), _mm_alignr_epi8(other.u.v128[1], other.u.v128[0], 8)); break;
+ case 9 : return _mm256_set_m128i(_mm_alignr_epi8(u.v128[0], other.u.v128[1], 9), _mm_alignr_epi8(other.u.v128[1], other.u.v128[0], 9)); break;
+ case 10 : return _mm256_set_m128i(_mm_alignr_epi8(u.v128[0], other.u.v128[1], 10), _mm_alignr_epi8(other.u.v128[1], other.u.v128[0], 10)); break;
+ case 11 : return _mm256_set_m128i(_mm_alignr_epi8(u.v128[0], other.u.v128[1], 11), _mm_alignr_epi8(other.u.v128[1], other.u.v128[0], 11)); break;
+ case 12 : return _mm256_set_m128i(_mm_alignr_epi8(u.v128[0], other.u.v128[1], 12), _mm_alignr_epi8(other.u.v128[1], other.u.v128[0], 12)); break;
+ case 13 : return _mm256_set_m128i(_mm_alignr_epi8(u.v128[0], other.u.v128[1], 13), _mm_alignr_epi8(other.u.v128[1], other.u.v128[0], 13)); break;
+ case 14 : return _mm256_set_m128i(_mm_alignr_epi8(u.v128[0], other.u.v128[1], 14), _mm_alignr_epi8(other.u.v128[1], other.u.v128[0], 14)); break;
+ case 15 : return _mm256_set_m128i(_mm_alignr_epi8(u.v128[0], other.u.v128[1], 15), _mm_alignr_epi8(other.u.v128[1], other.u.v128[0], 15)); break;
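+    // offsets 16..31: the window starts inside other.hi and runs into *this.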
+ case 16 : return _mm256_set_m128i(_mm_alignr_epi8(u.v128[1], u.v128[0], 0), _mm_alignr_epi8(u.v128[0], other.u.v128[1], 0)); break;
+ case 17 : return _mm256_set_m128i(_mm_alignr_epi8(u.v128[1], u.v128[0], 1), _mm_alignr_epi8(u.v128[0], other.u.v128[1], 1)); break;
+ case 18 : return _mm256_set_m128i(_mm_alignr_epi8(u.v128[1], u.v128[0], 2), _mm_alignr_epi8(u.v128[0], other.u.v128[1], 2)); break;
+ case 19 : return _mm256_set_m128i(_mm_alignr_epi8(u.v128[1], u.v128[0], 3), _mm_alignr_epi8(u.v128[0], other.u.v128[1], 3)); break;
+ case 20 : return _mm256_set_m128i(_mm_alignr_epi8(u.v128[1], u.v128[0], 4), _mm_alignr_epi8(u.v128[0], other.u.v128[1], 4)); break;
+ case 21 : return _mm256_set_m128i(_mm_alignr_epi8(u.v128[1], u.v128[0], 5), _mm_alignr_epi8(u.v128[0], other.u.v128[1], 5)); break;
+ case 22 : return _mm256_set_m128i(_mm_alignr_epi8(u.v128[1], u.v128[0], 6), _mm_alignr_epi8(u.v128[0], other.u.v128[1], 6)); break;
+ case 23 : return _mm256_set_m128i(_mm_alignr_epi8(u.v128[1], u.v128[0], 7), _mm_alignr_epi8(u.v128[0], other.u.v128[1], 7)); break;
+ case 24 : return _mm256_set_m128i(_mm_alignr_epi8(u.v128[1], u.v128[0], 8), _mm_alignr_epi8(u.v128[0], other.u.v128[1], 8)); break;
+ case 25 : return _mm256_set_m128i(_mm_alignr_epi8(u.v128[1], u.v128[0], 9), _mm_alignr_epi8(u.v128[0], other.u.v128[1], 9)); break;
+ case 26 : return _mm256_set_m128i(_mm_alignr_epi8(u.v128[1], u.v128[0], 10), _mm_alignr_epi8(u.v128[0], other.u.v128[1], 10)); break;
+ case 27 : return _mm256_set_m128i(_mm_alignr_epi8(u.v128[1], u.v128[0], 11), _mm_alignr_epi8(u.v128[0], other.u.v128[1], 11)); break;
+ case 28 : return _mm256_set_m128i(_mm_alignr_epi8(u.v128[1], u.v128[0], 12), _mm_alignr_epi8(u.v128[0], other.u.v128[1], 12)); break;
+ case 29 : return _mm256_set_m128i(_mm_alignr_epi8(u.v128[1], u.v128[0], 13), _mm_alignr_epi8(u.v128[0], other.u.v128[1], 13)); break;
+ case 30 : return _mm256_set_m128i(_mm_alignr_epi8(u.v128[1], u.v128[0], 14), _mm_alignr_epi8(u.v128[0], other.u.v128[1], 14)); break;
+ case 31 : return _mm256_set_m128i(_mm_alignr_epi8(u.v128[1], u.v128[0], 15), _mm_alignr_epi8(u.v128[0], other.u.v128[1], 15)); break;
default: break;
}
return *this;
}
#endif
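+
+// Reference model for the lane-crossing emulation above (an illustrative
+// sketch only; scalar_alignr256 is our hypothetical helper and assumes
+// <cstring>/<cstdint> are available). alignr(other, offset) yields bytes
+// [offset, offset+31] of the 64-byte concatenation in which 'other' is the
+// low 32 bytes and *this the high 32 bytes.
+static inline void scalar_alignr256(const uint8_t *self, const uint8_t *other,
+                                    int offset, uint8_t *out) {
+    uint8_t concat[64];
+    memcpy(concat, other, 32);        // 'other' forms the low 32 bytes
+    memcpy(concat + 32, self, 32);    // '*this' forms the high 32 bytes
+    memcpy(out, concat + offset, 32); // take the 32-byte window at 'offset'
+}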
+
template<>
really_inline SuperVector<64> SuperVector<64>::alignr(SuperVector<64> &l, int8_t offset)
{
- switch(offset) {
- case 0: return {_mm512_alignr_epi8(u.v512[0], l.u.v512[0], 0)};; break;
- case 1: return {_mm512_alignr_epi8(u.v512[0], l.u.v512[0], 1)}; break;
- case 2: return {_mm512_alignr_epi8(u.v512[0], l.u.v512[0], 2)}; break;
- case 3: return {_mm512_alignr_epi8(u.v512[0], l.u.v512[0], 3)}; break;
- case 4: return {_mm512_alignr_epi8(u.v512[0], l.u.v512[0], 4)}; break;
- case 5: return {_mm512_alignr_epi8(u.v512[0], l.u.v512[0], 5)}; break;
- case 6: return {_mm512_alignr_epi8(u.v512[0], l.u.v512[0], 6)}; break;
- case 7: return {_mm512_alignr_epi8(u.v512[0], l.u.v512[0], 7)}; break;
- case 8: return {_mm512_alignr_epi8(u.v512[0], l.u.v512[0], 8)}; break;
- case 9: return {_mm512_alignr_epi8(u.v512[0], l.u.v512[0], 9)}; break;
- case 10: return {_mm512_alignr_epi8(u.v512[0], l.u.v512[0], 10)}; break;
- case 11: return {_mm512_alignr_epi8(u.v512[0], l.u.v512[0], 11)}; break;
- case 12: return {_mm512_alignr_epi8(u.v512[0], l.u.v512[0], 12)}; break;
- case 13: return {_mm512_alignr_epi8(u.v512[0], l.u.v512[0], 13)}; break;
- case 14: return {_mm512_alignr_epi8(u.v512[0], l.u.v512[0], 14)}; break;
- case 15: return {_mm512_alignr_epi8(u.v512[0], l.u.v512[0], 15)}; break;
- default: break;
+    // Note: unlike the 128/256-bit alignr above, here *this supplies the
+    // low 64 bytes and l the high 64 bytes of the conceptual 128-byte
+    // concatenation, matching the v1.alignr(v2, l) order used in the tests.
+    if (offset == 0) {
+        return *this;
+    } else if (offset < 32) {
+        // The 64-byte window still starts in the low half of *this: build
+        // each 256-bit half of the result from one 256-bit alignr call.
+        SuperVector<32> lo256 = u.v256[0];
+        SuperVector<32> hi256 = u.v256[1];
+        SuperVector<32> o_lo256 = l.u.v256[0];
+        SuperVector<32> carry1 = hi256.alignr(lo256, offset);
+        SuperVector<32> carry2 = o_lo256.alignr(hi256, offset);
+        return SuperVector(carry1, carry2);
+    } else if (offset <= 64) {
+        // The window starts in the high half of *this (or, at 64, is exactly l).
+        SuperVector<32> hi256 = u.v256[1];
+        SuperVector<32> o_lo256 = l.u.v256[0];
+        SuperVector<32> o_hi256 = l.u.v256[1];
+        SuperVector<32> carry1 = o_lo256.alignr(hi256, offset - 32);
+        SuperVector<32> carry2 = o_hi256.alignr(o_lo256, offset - 32);
+        return SuperVector(carry1, carry2);
+    } else {
+        return *this;
}
- return *this;
}
#endif
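+
+// Usage sketch (illustrative only; assumes a u8 buf[128] filled with
+// 0,1,2,...): note the flipped receiver/argument roles between the widths.
+//
+//   auto lo256 = SuperVector<32>::loadu(buf);
+//   auto hi256 = SuperVector<32>::loadu(buf + 32);
+//   auto w256  = hi256.alignr(lo256, 5);  // bytes buf[5..36]
+//
+//   auto lo512 = SuperVector<64>::loadu(buf);
+//   auto hi512 = SuperVector<64>::loadu(buf + 64);
+//   auto w512  = lo512.alignr(hi512, 5);  // bytes buf[5..68]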
}
auto SP1 = SuperVector<16>::loadu(vec);
auto SP2 = SuperVector<16>::loadu(vec+16);
- TEST_ALIGNR128(SP1, SP2, vec, 0);
- TEST_ALIGNR128(SP1, SP2, vec, 1);
- TEST_ALIGNR128(SP1, SP2, vec, 2);
- TEST_ALIGNR128(SP1, SP2, vec, 3);
- TEST_ALIGNR128(SP1, SP2, vec, 4);
- TEST_ALIGNR128(SP1, SP2, vec, 5);
- TEST_ALIGNR128(SP1, SP2, vec, 6);
- TEST_ALIGNR128(SP1, SP2, vec, 7);
- TEST_ALIGNR128(SP1, SP2, vec, 8);
- TEST_ALIGNR128(SP1, SP2, vec, 9);
- TEST_ALIGNR128(SP1, SP2, vec, 10);
- TEST_ALIGNR128(SP1, SP2, vec, 11);
- TEST_ALIGNR128(SP1, SP2, vec, 12);
- TEST_ALIGNR128(SP1, SP2, vec, 13);
- TEST_ALIGNR128(SP1, SP2, vec, 14);
- TEST_ALIGNR128(SP1, SP2, vec, 15);
- TEST_ALIGNR128(SP1, SP2, vec, 16);
+    for (int j = 0; j <= 16; j++) {
+        TEST_ALIGNR128(SP1, SP2, vec, j);
+    }
}
/*Define ALIGNR256 macro*/
-/*
-#define TEST_ALIGNR256(v1, v2, buf, l) { \
- auto v_aligned = v2.alignr(v1, l); \
- v_aligned.print8("v_aligned");\
- for (size_t i=0; i<32; i++) { \
- printf("vec[%ld] = %02x\n", i+l, vec[i+l]);\
- ASSERT_EQ(v_aligned.u.u8[i], vec[i + l]); \
- } \
+#define TEST_ALIGNR256(v1, v2, buf, l) { \
+ auto v_aligned = v2.alignr(v1, l); \
+ for (size_t i=0; i<32; i++) { \
+            ASSERT_EQ(v_aligned.u.u8[i], buf[i + l]); \
+ } \
}
TEST(SuperVectorUtilsTest,Alignr256c){
}
auto SP1 = SuperVector<32>::loadu(vec);
auto SP2 = SuperVector<32>::loadu(vec+32);
- TEST_ALIGNR256(SP1, SP2, vec, 0);
- TEST_ALIGNR256(SP1, SP2, vec, 1);
- TEST_ALIGNR256(SP1, SP2, vec, 2);
- TEST_ALIGNR256(SP1, SP2, vec, 3);
- TEST_ALIGNR256(SP1, SP2, vec, 4);
- TEST_ALIGNR256(SP1, SP2, vec, 5);
- TEST_ALIGNR256(SP1, SP2, vec, 6);
- TEST_ALIGNR256(SP1, SP2, vec, 7);
- TEST_ALIGNR256(SP1, SP2, vec, 8);
- TEST_ALIGNR256(SP1, SP2, vec, 9);
- TEST_ALIGNR256(SP1, SP2, vec, 10);
- TEST_ALIGNR256(SP1, SP2, vec, 11);
- TEST_ALIGNR256(SP1, SP2, vec, 12);
- TEST_ALIGNR256(SP1, SP2, vec, 13);
- TEST_ALIGNR256(SP1, SP2, vec, 14);
- TEST_ALIGNR256(SP1, SP2, vec, 15);
- TEST_ALIGNR256(SP1, SP2, vec, 16);
- TEST_ALIGNR256(SP1, SP2, vec, 17);
- TEST_ALIGNR256(SP1, SP2, vec, 18);
- TEST_ALIGNR256(SP1, SP2, vec, 19);
- TEST_ALIGNR256(SP1, SP2, vec, 20);
- TEST_ALIGNR256(SP1, SP2, vec, 21);
- TEST_ALIGNR256(SP1, SP2, vec, 22);
- TEST_ALIGNR256(SP1, SP2, vec, 23);
- TEST_ALIGNR256(SP1, SP2, vec, 24);
- TEST_ALIGNR256(SP1, SP2, vec, 25);
- TEST_ALIGNR256(SP1, SP2, vec, 26);
- TEST_ALIGNR256(SP1, SP2, vec, 27);
- TEST_ALIGNR256(SP1, SP2, vec, 28);
- TEST_ALIGNR256(SP1, SP2, vec, 29);
- TEST_ALIGNR256(SP1, SP2, vec, 30);
- TEST_ALIGNR256(SP1, SP2, vec, 31);
- TEST_ALIGNR256(SP1, SP2, vec, 32);
+    for (int j = 0; j <= 32; j++) {
+        TEST_ALIGNR256(SP1, SP2, vec, j);
+    }
}
-*/
#endif // HAVE_AVX2
/*Define ALIGNR512 macro*/
-/*
#define TEST_ALIGNR512(v1, v2, buf, l) { \
- auto v_aligned = v2.alignr(v1, l); \
+ auto v_aligned = v1.alignr(v2, l); \
for (size_t i=0; i<64; i++) { \
-        ASSERT_EQ(v_aligned.u.u8[i], vec[i + l]); \
+        ASSERT_EQ(v_aligned.u.u8[i], buf[i + l]); \
} \
vec[i]=i;
}
auto SP1 = SuperVector<64>::loadu(vec);
- auto SP2 = SuperVector<64>::loadu(vec+32);
- TEST_ALIGNR512(SP1, SP2, vec, 0);
- TEST_ALIGNR512(SP1, SP2, vec, 1);
- TEST_ALIGNR512(SP1, SP2, vec, 2);
- TEST_ALIGNR512(SP1, SP2, vec, 3);
- TEST_ALIGNR512(SP1, SP2, vec, 4);
- TEST_ALIGNR512(SP1, SP2, vec, 5);
- TEST_ALIGNR512(SP1, SP2, vec, 6);
- TEST_ALIGNR512(SP1, SP2, vec, 7);
- TEST_ALIGNR512(SP1, SP2, vec, 8);
- TEST_ALIGNR512(SP1, SP2, vec, 9);
- TEST_ALIGNR512(SP1, SP2, vec, 10);
- TEST_ALIGNR512(SP1, SP2, vec, 11);
- TEST_ALIGNR512(SP1, SP2, vec, 12);
- TEST_ALIGNR512(SP1, SP2, vec, 13);
- TEST_ALIGNR512(SP1, SP2, vec, 14);
- TEST_ALIGNR512(SP1, SP2, vec, 15);
- TEST_ALIGNR512(SP1, SP2, vec, 16);
+ auto SP2 = SuperVector<64>::loadu(vec+64);
+    for (int j = 0; j < 64; j++) {
+        TEST_ALIGNR512(SP1, SP2, vec, j);
+    }
}
-*/
+
#endif // HAVE_AVX512