Changes/Additions to SuperVector class * added ==,!=,>=,>,<=,< operators * reworked...

author Konstantinos Margaritis <konstantinos@vectorcamp.gr>

Sun, 3 Oct 2021 10:43:13 +0000 (10:43 +0000)

committer Konstantinos Margaritis <konstantinos@vectorcamp.gr>

Tue, 12 Oct 2021 08:51:34 +0000 (11:51 +0300)
author Konstantinos Margaritis <konstantinos@vectorcamp.gr>
Sun, 3 Oct 2021 10:43:13 +0000 (10:43 +0000)
committer Konstantinos Margaritis <konstantinos@vectorcamp.gr>
Tue, 12 Oct 2021 08:51:34 +0000 (11:51 +0300)
diff --git a/src/util/supervector/arch/arm/impl.cpp b/src/util/supervector/arch/arm/impl.cpp

index 65d0faa576a894610145e3b3e670431fadfc64c1..34e5486d94144e9c94ac5bf4accee5dfac562087 100644 (file)
--- a/src/util/supervector/arch/arm/impl.cpp
+++ b/src/util/supervector/arch/arm/impl.cpp
@@ -37,86 +37,80 @@
  
  // 128-bit NEON implementation
  
-template<>
-really_inline SuperVector<16>::SuperVector(SuperVector const &other)
-{
-  u.v128[0] = other.u.v128[0];
-}
-
  template<>
  really_inline SuperVector<16>::SuperVector(typename base_type::type const v)
  {
-  u.v128[0] = v;
-};
+    u.v128[0] = v;
+}
  
  template<>
  template<>
  really_inline SuperVector<16>::SuperVector<int8x16_t>(int8x16_t const other)
  {
-  u.v128[0] = static_cast<int32x4_t>(other);
+    u.v128[0] = static_cast<m128>(other);
  }
  
  template<>
  template<>
  really_inline SuperVector<16>::SuperVector<uint8x16_t>(uint8x16_t const other)
  {
-  u.v128[0] = static_cast<int32x4_t>(other);
+    u.v128[0] = static_cast<m128>(other);
  }
  
  template<>
  template<>
  really_inline SuperVector<16>::SuperVector<int8_t>(int8_t const other)
  {
-  u.v128[0] = vdupq_n_s8(other);
+    u.v128[0] = vdupq_n_s8(other);
  }
  
  template<>
  template<>
  really_inline SuperVector<16>::SuperVector<uint8_t>(uint8_t const other)
  {
-  u.v128[0] = vdupq_n_u8(other);
+    u.v128[0] = vdupq_n_u8(other);
  }
  
  template<>
  template<>
  really_inline SuperVector<16>::SuperVector<int16_t>(int16_t const other)
  {
-  u.v128[0] = vdupq_n_s16(other);
+    u.v128[0] = vdupq_n_s16(other);
  }
  
  template<>
  template<>
  really_inline SuperVector<16>::SuperVector<uint16_t>(uint16_t const other)
  {
-  u.v128[0] = vdupq_n_u16(other);
+    u.v128[0] = vdupq_n_u16(other);
  }
  
  template<>
  template<>
  really_inline SuperVector<16>::SuperVector<int32_t>(int32_t const other)
  {
-  u.v128[0] = vdupq_n_s32(other);
+    u.v128[0] = vdupq_n_s32(other);
  }
  
  template<>
  template<>
  really_inline SuperVector<16>::SuperVector<uint32_t>(uint32_t const other)
  {
-  u.v128[0] = vdupq_n_u32(other);
+    u.v128[0] = vdupq_n_u32(other);
  }
  
  template<>
  template<>
  really_inline SuperVector<16>::SuperVector<int64_t>(int64_t const other)
  {
-  u.v128[0] = vdupq_n_s64(other);
+    u.v128[0] = vdupq_n_s64(other);
  }
  
  template<>
  template<>
  really_inline SuperVector<16>::SuperVector<uint64_t>(uint64_t const other)
  {
-  u.v128[0] = vdupq_n_u64(other);
+    u.v128[0] = vdupq_n_u64(other);
  }
  
  // Constants
@@ -159,9 +153,9 @@ really_inline SuperVector<16> SuperVector<16>::operator^(SuperVector<16> const &
  }
  
  template <>
-really_inline SuperVector<16> SuperVector<16>::opand(SuperVector<16> const &b) const
+really_inline SuperVector<16> SuperVector<16>::operator!() const
  {
-    return {vandq_s8(u.v128[0], b.u.v128[0])};
+    return {vmvnq_s8(u.v128[0])};
  }
  
  template <>
@@ -171,56 +165,279 @@ really_inline SuperVector<16> SuperVector<16>::opandnot(SuperVector<16> const &b
  }
  
  template <>
-really_inline SuperVector<16> SuperVector<16>::eq(SuperVector<16> const &b) const
+really_inline SuperVector<16> SuperVector<16>::operator==(SuperVector<16> const &b) const
  {
      return {vceqq_s8((int16x8_t)u.v128[0], (int16x8_t)b.u.v128[0])};
  }
  
+template <>
+really_inline SuperVector<16> SuperVector<16>::operator!=(SuperVector<16> const &b) const
+{ // Per-lane "not equal" mask: applies operator! (a bitwise NOT via vmvnq_s8) to the operator== result.
+    return !(*this == b);
+}
+
+template <>
+really_inline SuperVector<16> SuperVector<16>::operator>(SuperVector<16> const &b) const
+{ // Per-lane signed 8-bit "greater than" of *this against b, computed with vcgtq_s8.
+    return {vcgtq_s8((int16x8_t)u.v128[0], (int16x8_t)b.u.v128[0])};
+}
+
+template <>
+really_inline SuperVector<16> SuperVector<16>::operator>=(SuperVector<16> const &b) const
+{ // Per-lane signed 8-bit "greater than or equal" of *this against b, computed with vcgeq_s8.
+    return {vcgeq_s8((int16x8_t)u.v128[0], (int16x8_t)b.u.v128[0])};
+}
+
+template <>
+really_inline SuperVector<16> SuperVector<16>::operator<(SuperVector<16> const &b) const
+{ // Per-lane signed 8-bit "less than" of *this against b, computed with vcltq_s8.
+    return {vcltq_s8((int16x8_t)u.v128[0], (int16x8_t)b.u.v128[0])};
+}
+
+template <>
+really_inline SuperVector<16> SuperVector<16>::operator<=(SuperVector<16> const &b) const
+{ // Per-lane signed 8-bit "less than or equal" of *this against b.
+    return {vcleq_s8((int16x8_t)u.v128[0], (int16x8_t)b.u.v128[0])}; // vcleq, not vcgeq: the latter computes >=
+}
+
+template <>
+really_inline SuperVector<16> SuperVector<16>::eq(SuperVector<16> const &b) const
+{ // Named form of operator==; eqmask() goes through this.
+    return (*this == b);
+}
+
  template <>
  really_inline typename SuperVector<16>::movemask_type SuperVector<16>::movemask(void) const
  {
-    static const uint8x16_t powers{ 1, 2, 4, 8, 16, 32, 64, 128, 1, 2, 4, 8, 16, 32, 64, 128 };
+    SuperVector powers{0x8040201008040201UL};
  
      // Compute the mask from the input
-    uint64x2_t mask  = vpaddlq_u32(vpaddlq_u16(vpaddlq_u8(vandq_u8((uint16x8_t)u.v128[0], powers))));
+    uint64x2_t mask  = vpaddlq_u32(vpaddlq_u16(vpaddlq_u8(vandq_u8((uint16x8_t)u.v128[0], powers.u.v128[0]))));
      uint64x2_t mask1 = (m128)vextq_s8(mask, vdupq_n_u8(0), 7);
      mask = vorrq_u8(mask, mask1);
  
      // Get the resulting bytes
      uint16_t output;
-    vst1q_lane_u16((uint16_t*)&output, (uint16x8_t)mask, 0);
+    vst1q_lane_u16(&output, (uint16x8_t)mask, 0);
      return static_cast<typename SuperVector<16>::movemask_type>(output);
  }
  
  template <>
  really_inline typename SuperVector<16>::movemask_type SuperVector<16>::eqmask(SuperVector<16> const b) const
  {
-  return eq(b).movemask();
-}
-
-template <>
-really_inline SuperVector<16> SuperVector<16>::rshift128_var(uint8_t const N) const
-{
-    switch(N) {
-    case 1: return {vextq_s8((int16x8_t)u.v128[0], vdupq_n_u8(0), 1)}; break;
-    case 2: return {vextq_s8((int16x8_t)u.v128[0], vdupq_n_u8(0), 2)}; break;
-    case 3: return {vextq_s8((int16x8_t)u.v128[0], vdupq_n_u8(0), 3)}; break;
-    case 4: return {vextq_s8((int16x8_t)u.v128[0], vdupq_n_u8(0), 4)}; break;
-    case 5: return {vextq_s8((int16x8_t)u.v128[0], vdupq_n_u8(0), 5)}; break;
-    case 6: return {vextq_s8((int16x8_t)u.v128[0], vdupq_n_u8(0), 6)}; break;
-    case 7: return {vextq_s8((int16x8_t)u.v128[0], vdupq_n_u8(0), 7)}; break;
-    case 8: return {vextq_s8((int16x8_t)u.v128[0], vdupq_n_u8(0), 8)}; break;
-    case 9: return {vextq_s8((int16x8_t)u.v128[0], vdupq_n_u8(0), 9)}; break;
-    case 10: return {vextq_s8((int16x8_t)u.v128[0], vdupq_n_u8(0), 10)}; break;
-    case 11: return {vextq_s8((int16x8_t)u.v128[0], vdupq_n_u8(0), 11)}; break;
-    case 12: return {vextq_s8((int16x8_t)u.v128[0], vdupq_n_u8(0), 12)}; break;
-    case 13: return {vextq_s8((int16x8_t)u.v128[0], vdupq_n_u8(0), 13)}; break;
-    case 14: return {vextq_s8((int16x8_t)u.v128[0], vdupq_n_u8(0), 14)}; break;
-    case 15: return {vextq_s8((int16x8_t)u.v128[0], vdupq_n_u8(0), 15)}; break;
-    case 16: return Zeroes(); break;
-    default: break;
-    }
-    return *this;
+    return eq(b).movemask();
+}
+
+template <>
+template<uint8_t N>
+really_inline SuperVector<16> SuperVector<16>::vshl_8_imm() const
+{ // Shift every 8-bit lane left by the compile-time bit count N (vshlq_n_s8).
+    return {(m128)vshlq_n_s8(u.v128[0], N)};
+}
+
+template <>
+template<uint8_t N>
+really_inline SuperVector<16> SuperVector<16>::vshl_16_imm() const
+{ // Shift every 16-bit lane left by the compile-time bit count N (vshlq_n_s16).
+    return {(m128)vshlq_n_s16(u.v128[0], N)};
+}
+
+template <>
+template<uint8_t N>
+really_inline SuperVector<16> SuperVector<16>::vshl_32_imm() const
+{ // Shift every 32-bit lane left by the compile-time bit count N (vshlq_n_s32).
+    return {(m128)vshlq_n_s32(u.v128[0], N)};
+}
+
+template <>
+template<uint8_t N>
+really_inline SuperVector<16> SuperVector<16>::vshl_64_imm() const
+{ // Shift every 64-bit lane left by the compile-time bit count N (vshlq_n_s64).
+    return {(m128)vshlq_n_s64(u.v128[0], N)};
+}
+
+template <>
+template<uint8_t N>
+really_inline SuperVector<16> SuperVector<16>::vshl_128_imm() const
+{ // Whole-vector shift by N bytes: each byte moves N lanes up and the low N lanes become zero.
+    return {vextq_s8(vdupq_n_u8(0), (int16x8_t)u.v128[0], 16 - N)}; // NOTE(review): N == 0 yields index 16, outside vextq_s8's 0..15 - confirm it is never instantiated
+}
+
+template <>
+template<uint8_t N>
+really_inline SuperVector<16> SuperVector<16>::vshl_imm() const
+{ // Default compile-time left shift: forwards to the whole-vector byte shift vshl_128_imm<N>().
+    return vshl_128_imm<N>();
+}
+
+template <>
+template<uint8_t N>
+really_inline SuperVector<16> SuperVector<16>::vshr_8_imm() const
+{ // Shift every 8-bit lane right by the compile-time bit count N; vshrq_n_s8 is the signed (sign-filling) form.
+    return {(m128)vshrq_n_s8(u.v128[0], N)};
+}
+
+template <>
+template<uint8_t N>
+really_inline SuperVector<16> SuperVector<16>::vshr_16_imm() const
+{ // Shift every 16-bit lane right by the compile-time bit count N; vshrq_n_s16 is the signed (sign-filling) form.
+    return {(m128)vshrq_n_s16(u.v128[0], N)};
+}
+
+template <>
+template<uint8_t N>
+really_inline SuperVector<16> SuperVector<16>::vshr_32_imm() const
+{ // Shift every 32-bit lane right by the compile-time bit count N; vshrq_n_s32 is the signed (sign-filling) form.
+    return {(m128)vshrq_n_s32(u.v128[0], N)};
+}
+
+template <>
+template<uint8_t N>
+really_inline SuperVector<16> SuperVector<16>::vshr_64_imm() const
+{ // Shift every 64-bit lane right by the compile-time bit count N; vshrq_n_s64 is the signed (sign-filling) form.
+    return {(m128)vshrq_n_s64(u.v128[0], N)};
+}
+
+template <>
+template<uint8_t N>
+really_inline SuperVector<16> SuperVector<16>::vshr_128_imm() const
+{ // Whole-vector shift by N bytes: each byte moves N lanes down and the top N lanes become zero.
+    return {vextq_s8((int16x8_t)u.v128[0], vdupq_n_u8(0), N)}; // NOTE(review): N == 16 is outside vextq_s8's 0..15 index range - confirm it is never instantiated
+}
+
+template <>
+template<uint8_t N>
+really_inline SuperVector<16> SuperVector<16>::vshr_imm() const
+{ // Default compile-time right shift: forwards to the whole-vector byte shift vshr_128_imm<N>().
+    return vshr_128_imm<N>();
+}
+
+#if !defined(HS_OPTIMIZE)
+template SuperVector<16> SuperVector<16>::vshl_8_imm<4>() const;
+template SuperVector<16> SuperVector<16>::vshl_16_imm<1>() const;
+template SuperVector<16> SuperVector<16>::vshl_64_imm<1>() const;
+template SuperVector<16> SuperVector<16>::vshl_64_imm<4>() const;
+template SuperVector<16> SuperVector<16>::vshl_128_imm<1>() const;
+template SuperVector<16> SuperVector<16>::vshl_128_imm<4>() const;
+template SuperVector<16> SuperVector<16>::vshr_8_imm<1>() const;
+template SuperVector<16> SuperVector<16>::vshr_8_imm<4>() const;
+template SuperVector<16> SuperVector<16>::vshr_16_imm<1>() const;
+template SuperVector<16> SuperVector<16>::vshr_64_imm<1>() const;
+template SuperVector<16> SuperVector<16>::vshr_64_imm<4>() const;
+template SuperVector<16> SuperVector<16>::vshr_128_imm<1>() const;
+template SuperVector<16> SuperVector<16>::vshr_128_imm<4>() const;
+#endif
+
+template <>
+really_inline SuperVector<16> SuperVector<16>::vshl_8  (uint8_t const N) const
+{ // Shift every 8-bit lane left by a runtime bit count N, dispatching to the immediate-only vshlq_n_s8.
+    if (N == 0) return *this; // a zero-bit shift is the identity
+    if (N >= 8) return Zeroes(); // every bit leaves its lane; also guarantees `result` is never returned unset
+    SuperVector result; // always assigned below: Unroller<1, 8> is assumed to visit n = 1..7, the only non-zero immediates vshlq_n_s8 accepts
+    Unroller<1, 8>::iterator([&,v=this](auto const i) { constexpr uint8_t n = i.value; if (N == n) result = {(m128)vshlq_n_s8(v->u.v128[0], n)}; });
+    return result;
+}
+
+template <>
+really_inline SuperVector<16> SuperVector<16>::vshl_16 (uint8_t const N) const
+{ // Shift every 16-bit lane left by a runtime bit count N, dispatching to the immediate-only vshlq_n_s16.
+    if (N == 0) return *this; // a zero-bit shift is the identity
+    if (N >= 16) return Zeroes(); // every bit leaves its lane; also guarantees `result` is never returned unset
+    SuperVector result; // always assigned below: Unroller<1, 16> is assumed to visit n = 1..15
+    Unroller<1, 16>::iterator([&,v=this](auto const i) { constexpr uint8_t n = i.value; if (N == n) result = {(m128)vshlq_n_s16(v->u.v128[0], n)}; });
+    return result;
+}
+
+template <>
+really_inline SuperVector<16> SuperVector<16>::vshl_32 (uint8_t const N) const
+{ // Shift every 32-bit lane left by a runtime bit count N, dispatching to the immediate-only vshlq_n_s32.
+    if (N == 0) return *this; // a zero-bit shift is the identity
+    if (N >= 32) return Zeroes(); // only now is the lane empty (a 16-bit shift is not zero); also keeps `result` from being returned unset
+    SuperVector result; // always assigned below: Unroller<1, 32> is assumed to visit n = 1..31
+    Unroller<1, 32>::iterator([&,v=this](auto const i) { constexpr uint8_t n = i.value; if (N == n) result = {(m128)vshlq_n_s32(v->u.v128[0], n)}; });
+    return result;
+}
+
+template <>
+really_inline SuperVector<16> SuperVector<16>::vshl_64 (uint8_t const N) const
+{ // Shift every 64-bit lane left by a runtime bit count N, dispatching to the immediate-only vshlq_n_s64.
+    if (N == 0) return *this; // a zero-bit shift is the identity
+    if (N >= 64) return Zeroes(); // only now is the lane empty (a 16-bit shift is not zero); also keeps `result` from being returned unset
+    SuperVector result; // always assigned below: Unroller<1, 64> is assumed to visit n = 1..63
+    Unroller<1, 64>::iterator([&,v=this](auto const i) { constexpr uint8_t n = i.value; if (N == n) result = {(m128)vshlq_n_s64(v->u.v128[0], n)}; });
+    return result;
+}
+
+template <>
+really_inline SuperVector<16> SuperVector<16>::vshl_128(uint8_t const N) const
+{ // Whole-vector shift by a runtime count of N bytes: bytes move N lanes up, low lanes are zero-filled.
+    if (N == 0) return *this; // a zero-byte shift is the identity
+    if (N >= 16) return Zeroes(); // nothing of the input remains; also guarantees `result` is never returned unset
+    SuperVector result; // always assigned below: Unroller<1, 16> is assumed to visit n = 1..15
+    Unroller<1, 16>::iterator([&,v=this](auto const i) { constexpr uint8_t n = i.value; if (N == n) result = {vextq_s8(vdupq_n_u8(0), (int16x8_t)v->u.v128[0], 16 - n)}; });
+    return result;
+}
+
+template <>
+really_inline SuperVector<16> SuperVector<16>::vshl(uint8_t const N) const
+{ // Default runtime left shift: forwards to the whole-vector byte shift vshl_128(N).
+    return vshl_128(N);
+}
+
+template <>
+really_inline SuperVector<16> SuperVector<16>::vshr_8  (uint8_t const N) const
+{ // Shift every 8-bit lane right by a runtime bit count N, dispatching to the immediate-only vshrq_n_s8.
+    if (N == 0) return *this; // a zero-bit shift is the identity
+    if (N >= 8) return Zeroes(); // NOTE(review): zero matches the original full-shift convention, but vshrq_n_s8 sign-fills - confirm which is intended
+    SuperVector result; // always assigned below: Unroller<1, 8> is assumed to visit n = 1..7, inside vshrq_n_s8's 1..8 immediate range
+    Unroller<1, 8>::iterator([&,v=this](auto const i) { constexpr uint8_t n = i.value; if (N == n) result = {(m128)vshrq_n_s8(v->u.v128[0], n)}; });
+    return result;
+}
+
+template <>
+really_inline SuperVector<16> SuperVector<16>::vshr_16 (uint8_t const N) const
+{ // Shift every 16-bit lane right by a runtime bit count N, dispatching to the immediate-only vshrq_n_s16.
+    if (N == 0) return *this; // a zero-bit shift is the identity
+    if (N >= 16) return Zeroes(); // NOTE(review): zero matches the original N == 16 result, but vshrq_n_s16 sign-fills - confirm which is intended
+    SuperVector result; // always assigned below: Unroller<1, 16> is assumed to visit n = 1..15
+    Unroller<1, 16>::iterator([&,v=this](auto const i) { constexpr uint8_t n = i.value; if (N == n) result = {(m128)vshrq_n_s16(v->u.v128[0], n)}; });
+    return result;
+}
+
+template <>
+really_inline SuperVector<16> SuperVector<16>::vshr_32 (uint8_t const N) const
+{ // Shift every 32-bit lane right by a runtime bit count N, dispatching to the immediate-only vshrq_n_s32.
+    if (N == 0) return *this; // a zero-bit shift is the identity
+    if (N >= 32) return Zeroes(); // a 16-bit shift is not zero for 32-bit lanes; NOTE(review): vshrq_n_s32 sign-fills - confirm zero is the intended full-shift result
+    SuperVector result; // always assigned below: Unroller<1, 32> is assumed to visit n = 1..31
+    Unroller<1, 32>::iterator([&,v=this](auto const i) { constexpr uint8_t n = i.value; if (N == n) result = {(m128)vshrq_n_s32(v->u.v128[0], n)}; });
+    return result;
+}
+
+template <>
+really_inline SuperVector<16> SuperVector<16>::vshr_64 (uint8_t const N) const
+{ // Shift every 64-bit lane right by a runtime bit count N, dispatching to the immediate-only vshrq_n_s64.
+    if (N == 0) return *this; // a zero-bit shift is the identity
+    if (N >= 64) return Zeroes(); // a 16-bit shift is not zero for 64-bit lanes; NOTE(review): vshrq_n_s64 sign-fills - confirm zero is the intended full-shift result
+    SuperVector result; // always assigned below: Unroller<1, 64> is assumed to visit n = 1..63
+    Unroller<1, 64>::iterator([&,v=this](auto const i) { constexpr uint8_t n = i.value; if (N == n) result = {(m128)vshrq_n_s64(v->u.v128[0], n)}; });
+    return result;
+}
+
+template <>
+really_inline SuperVector<16> SuperVector<16>::vshr_128(uint8_t const N) const
+{ // Whole-vector shift by a runtime count of N bytes: bytes move N lanes down, top lanes are zero-filled.
+    if (N == 0) return *this; // a zero-byte shift is the identity
+    if (N >= 16) return Zeroes(); // nothing of the input remains; also guarantees `result` is never returned unset
+    SuperVector result; // always assigned below: Unroller<1, 16> is assumed to visit n = 1..15
+    Unroller<1, 16>::iterator([&,v=this](auto const i) { constexpr uint8_t n = i.value; if (N == n) result = {vextq_s8((int16x8_t)v->u.v128[0], vdupq_n_u8(0), n)}; });
+    return result;
+}
+
+template <>
+really_inline SuperVector<16> SuperVector<16>::vshr(uint8_t const N) const
+{ // Default runtime right shift: forwards to the whole-vector byte shift vshr_128(N).
+    return vshr_128(N);
  }
  
  #ifdef HS_OPTIMIZE
@@ -233,35 +450,10 @@ really_inline SuperVector<16> SuperVector<16>::operator>>(uint8_t const N) const
  template <>
  really_inline SuperVector<16> SuperVector<16>::operator>>(uint8_t const N) const
  {
-    return rshift128_var(N);
+    return vshr_128(N);
  }
  #endif
  
-template <>
-really_inline SuperVector<16> SuperVector<16>::lshift128_var(uint8_t const N) const
-{
-    switch(N) {
-    case 1: return {vextq_s8(vdupq_n_u8(0), (int16x8_t)u.v128[0], 15)}; break;
-    case 2: return {vextq_s8(vdupq_n_u8(0), (int16x8_t)u.v128[0], 14)}; break;
-    case 3: return {vextq_s8(vdupq_n_u8(0), (int16x8_t)u.v128[0], 13)}; break;
-    case 4: return {vextq_s8(vdupq_n_u8(0), (int16x8_t)u.v128[0], 12)}; break;
-    case 5: return {vextq_s8(vdupq_n_u8(0), (int16x8_t)u.v128[0], 11)}; break;
-    case 6: return {vextq_s8(vdupq_n_u8(0), (int16x8_t)u.v128[0], 10)}; break;
-    case 7: return {vextq_s8(vdupq_n_u8(0), (int16x8_t)u.v128[0], 9)}; break;
-    case 8: return {vextq_s8(vdupq_n_u8(0), (int16x8_t)u.v128[0], 8)}; break;
-    case 9: return {vextq_s8(vdupq_n_u8(0), (int16x8_t)u.v128[0], 7)}; break;
-    case 10: return {vextq_s8(vdupq_n_u8(0), (int16x8_t)u.v128[0], 6)}; break;
-    case 11: return {vextq_s8(vdupq_n_u8(0), (int16x8_t)u.v128[0], 5)}; break;
-    case 12: return {vextq_s8(vdupq_n_u8(0), (int16x8_t)u.v128[0], 4)}; break;
-    case 13: return {vextq_s8(vdupq_n_u8(0), (int16x8_t)u.v128[0], 3)}; break;
-    case 14: return {vextq_s8(vdupq_n_u8(0), (int16x8_t)u.v128[0], 2)}; break;
-    case 15: return {vextq_s8(vdupq_n_u8(0), (int16x8_t)u.v128[0], 1)}; break;
-    case 16: return Zeroes(); break;
-    default: break;
-    }
-    return *this;
-}
-
  #ifdef HS_OPTIMIZE
  template <>
  really_inline SuperVector<16> SuperVector<16>::operator<<(uint8_t const N) const
@@ -272,10 +464,23 @@ really_inline SuperVector<16> SuperVector<16>::operator<<(uint8_t const N) const
  template <>
  really_inline SuperVector<16> SuperVector<16>::operator<<(uint8_t const N) const
  {
-    return lshift128_var(N);
+    return vshl_128(N);
  }
  #endif
  
+
+template<>
+really_inline SuperVector<16> SuperVector<16>::Ones_vshr(uint8_t const N)
+{ // Returns Ones() passed through vshr_128(N); loadu_maskz and pshufb_maskz use it to build their masks.
+    return Ones().vshr_128(N);
+}
+
+template<>
+really_inline SuperVector<16> SuperVector<16>::Ones_vshl(uint8_t const N)
+{ // Returns Ones() passed through vshl_128(N), the counterpart of Ones_vshr.
+    return Ones().vshl_128(N);
+}
+
  template <>
  really_inline SuperVector<16> SuperVector<16>::loadu(void const *ptr)
  {
@@ -293,10 +498,10 @@ really_inline SuperVector<16> SuperVector<16>::load(void const *ptr)
  template <>
  really_inline SuperVector<16> SuperVector<16>::loadu_maskz(void const *ptr, uint8_t const len)
  {
-    SuperVector<16> mask = Ones().rshift128_var(16 -len);
-    mask.print8("mask");
+    SuperVector mask = Ones_vshr(16 -len);
+    //mask.print8("mask");
      SuperVector<16> v = loadu(ptr);
-    v.print8("v");
+    //v.print8("v");
      return mask & v;
  }
  
@@ -314,124 +519,53 @@ really_inline SuperVector<16> SuperVector<16>::alignr(SuperVector<16> &other, in
  template<>
  really_inline SuperVector<16> SuperVector<16>::alignr(SuperVector<16> &other, int8_t offset)
  {
-  switch(offset) {
-  case 0: return other; break;
-  case 1: return {vextq_s8((int16x8_t) other.u.v128[0], (int16x8_t) u.v128[0], 1)}; break;
-  case 2: return {vextq_s8((int16x8_t) other.u.v128[0], (int16x8_t) u.v128[0], 2)}; break;
-  case 3: return {vextq_s8((int16x8_t) other.u.v128[0], (int16x8_t) u.v128[0], 3)}; break;
-  case 4: return {vextq_s8((int16x8_t) other.u.v128[0], (int16x8_t) u.v128[0], 4)}; break;
-  case 5: return {vextq_s8((int16x8_t) other.u.v128[0], (int16x8_t) u.v128[0], 5)}; break;
-  case 6: return {vextq_s8((int16x8_t) other.u.v128[0], (int16x8_t) u.v128[0], 6)}; break;
-  case 7: return {vextq_s8((int16x8_t) other.u.v128[0], (int16x8_t) u.v128[0], 7)}; break;
-  case 8: return {vextq_s8((int16x8_t) other.u.v128[0], (int16x8_t) u.v128[0], 8)}; break;
-  case 9: return {vextq_s8((int16x8_t) other.u.v128[0], (int16x8_t) u.v128[0], 9)}; break;
-  case 10: return {vextq_s8((int16x8_t) other.u.v128[0], (int16x8_t) u.v128[0], 10)}; break;
-  case 11: return {vextq_s8((int16x8_t) other.u.v128[0], (int16x8_t) u.v128[0], 11)}; break;
-  case 12: return {vextq_s8((int16x8_t) other.u.v128[0], (int16x8_t) u.v128[0], 12)}; break;
-  case 13: return {vextq_s8((int16x8_t) other.u.v128[0], (int16x8_t) u.v128[0], 13)}; break;
-  case 14: return {vextq_s8((int16x8_t) other.u.v128[0], (int16x8_t) u.v128[0], 14)}; break;
-  case 15: return {vextq_s8((int16x8_t) other.u.v128[0], (int16x8_t) u.v128[0], 15)}; break;
-  case 16: return *this; break;
-  default: break;
-  }
-  return *this;
+    switch(offset) {
+    case 0: return other; break;
+    case 1: return {vextq_s8((int16x8_t) other.u.v128[0], (int16x8_t) u.v128[0], 1)}; break;
+    case 2: return {vextq_s8((int16x8_t) other.u.v128[0], (int16x8_t) u.v128[0], 2)}; break;
+    case 3: return {vextq_s8((int16x8_t) other.u.v128[0], (int16x8_t) u.v128[0], 3)}; break;
+    case 4: return {vextq_s8((int16x8_t) other.u.v128[0], (int16x8_t) u.v128[0], 4)}; break;
+    case 5: return {vextq_s8((int16x8_t) other.u.v128[0], (int16x8_t) u.v128[0], 5)}; break;
+    case 6: return {vextq_s8((int16x8_t) other.u.v128[0], (int16x8_t) u.v128[0], 6)}; break;
+    case 7: return {vextq_s8((int16x8_t) other.u.v128[0], (int16x8_t) u.v128[0], 7)}; break;
+    case 8: return {vextq_s8((int16x8_t) other.u.v128[0], (int16x8_t) u.v128[0], 8)}; break;
+    case 9: return {vextq_s8((int16x8_t) other.u.v128[0], (int16x8_t) u.v128[0], 9)}; break;
+    case 10: return {vextq_s8((int16x8_t) other.u.v128[0], (int16x8_t) u.v128[0], 10)}; break;
+    case 11: return {vextq_s8((int16x8_t) other.u.v128[0], (int16x8_t) u.v128[0], 11)}; break;
+    case 12: return {vextq_s8((int16x8_t) other.u.v128[0], (int16x8_t) u.v128[0], 12)}; break;
+    case 13: return {vextq_s8((int16x8_t) other.u.v128[0], (int16x8_t) u.v128[0], 13)}; break;
+    case 14: return {vextq_s8((int16x8_t) other.u.v128[0], (int16x8_t) u.v128[0], 14)}; break;
+    case 15: return {vextq_s8((int16x8_t) other.u.v128[0], (int16x8_t) u.v128[0], 15)}; break;
+    case 16: return *this; break;
+    default: break;
+    }
+    return *this;
  }
  #endif
  
  template<>
-really_inline SuperVector<16> SuperVector<16>::pshufb(SuperVector<16> b)
-{
-    /* On Intel, if bit 0x80 is set, then result is zero, otherwise which the lane it is &0xf.
-       In NEON, if >=16, then the result is zero, otherwise it is that lane.
-       btranslated is the version that is converted from Intel to NEON.  */
-    int8x16_t btranslated = vandq_s8((int8x16_t)b.u.v128[0], vdupq_n_s8(0x8f));
-    return {vqtbl1q_s8((int8x16_t)u.v128[0], (uint8x16_t)btranslated)};
-}
-
-template<>
-really_inline SuperVector<16> SuperVector<16>::pshufb_maskz(SuperVector<16> b, uint8_t const len)
-{
-    SuperVector<16> mask = Ones().rshift128_var(16 -len);
-    return mask & pshufb(b);
-}
-
-#ifdef HS_OPTIMIZE
  template<>
-really_inline SuperVector<16> SuperVector<16>::lshift64(uint8_t const N)
+really_inline SuperVector<16> SuperVector<16>::pshufb<false>(SuperVector<16> b)
  {
-  return {(m128)vshlq_n_s64(u.v128[0], N)};
+    return {vqtbl1q_s8((int8x16_t)u.v128[0], (uint8x16_t)b.u.v128[0])};
  }
-#else
-template<>
-really_inline SuperVector<16> SuperVector<16>::lshift64(uint8_t const N)
-{
-  switch(N) {
-  case 0: return {(m128)vshlq_n_s64(u.v128[0], 0)}; break;
-  case 1: return {(m128)vshlq_n_s64(u.v128[0], 1)}; break;
-  case 2: return {(m128)vshlq_n_s64(u.v128[0], 2)}; break;
-  case 3: return {(m128)vshlq_n_s64(u.v128[0], 3)}; break;
-  case 4: return {(m128)vshlq_n_s64(u.v128[0], 4)}; break;
-  case 5: return {(m128)vshlq_n_s64(u.v128[0], 5)}; break;
-  case 6: return {(m128)vshlq_n_s64(u.v128[0], 6)}; break;
-  case 7: return {(m128)vshlq_n_s64(u.v128[0], 7)}; break;
-  case 8: return {(m128)vshlq_n_s64(u.v128[0], 8)}; break;
-  case 9: return {(m128)vshlq_n_s64(u.v128[0], 9)}; break;
-  case 10: return {(m128)vshlq_n_s64(u.v128[0], 10)}; break;
-  case 11: return {(m128)vshlq_n_s64(u.v128[0], 11)}; break;
-  case 12: return {(m128)vshlq_n_s64(u.v128[0], 12)}; break;
-  case 13: return {(m128)vshlq_n_s64(u.v128[0], 13)}; break;
-  case 14: return {(m128)vshlq_n_s64(u.v128[0], 14)}; break;
-  case 15: return {(m128)vshlq_n_s64(u.v128[0], 15)}; break;
-  default: break;
-  }
-  return *this;
-}
-#endif
  
-#ifdef HS_OPTIMIZE
-template<>
-really_inline SuperVector<16> SuperVector<16>::rshift64(uint8_t const N)
-{
-  return {(m128)vshrq_n_s64(u.v128[0], N)};
-}
-#else
  template<>
-really_inline SuperVector<16> SuperVector<16>::rshift64(uint8_t const N)
-{
-  switch(N) {
-  case 0: return {(m128)vshrq_n_s64(u.v128[0], 0)}; break;
-  case 1: return {(m128)vshrq_n_s64(u.v128[0], 1)}; break;
-  case 2: return {(m128)vshrq_n_s64(u.v128[0], 2)}; break;
-  case 3: return {(m128)vshrq_n_s64(u.v128[0], 3)}; break;
-  case 4: return {(m128)vshrq_n_s64(u.v128[0], 4)}; break;
-  case 5: return {(m128)vshrq_n_s64(u.v128[0], 5)}; break;
-  case 6: return {(m128)vshrq_n_s64(u.v128[0], 6)}; break;
-  case 7: return {(m128)vshrq_n_s64(u.v128[0], 7)}; break;
-  case 8: return {(m128)vshrq_n_s64(u.v128[0], 8)}; break;
-  case 9: return {(m128)vshrq_n_s64(u.v128[0], 9)}; break;
-  case 10: return {(m128)vshrq_n_s64(u.v128[0], 10)}; break;
-  case 11: return {(m128)vshrq_n_s64(u.v128[0], 11)}; break;
-  case 12: return {(m128)vshrq_n_s64(u.v128[0], 12)}; break;
-  case 13: return {(m128)vshrq_n_s64(u.v128[0], 13)}; break;
-  case 14: return {(m128)vshrq_n_s64(u.v128[0], 14)}; break;
-  case 15: return {(m128)vshrq_n_s64(u.v128[0], 15)}; break;
-  default: break;
-  }
-  return *this;
-}
-#endif
-
  template<>
-really_inline SuperVector<16> SuperVector<16>::lshift128(uint8_t const N)
+really_inline SuperVector<16> SuperVector<16>::pshufb<true>(SuperVector<16> b)
  {
-  return *this << N;
+    /* On Intel, if bit 0x80 is set, then result is zero, otherwise which the lane it is &0xf.
+       In NEON, if >=16, then the result is zero, otherwise it is that lane.
+       btranslated is the version that is converted from Intel to NEON.  */
+    SuperVector<16> btranslated = b & SuperVector<16>::dup_s8(0x8f);
+    return pshufb<false>(btranslated);
  }
  
  template<>
-really_inline SuperVector<16> SuperVector<16>::rshift128(uint8_t const N)
+really_inline SuperVector<16> SuperVector<16>::pshufb_maskz(SuperVector<16> b, uint8_t const len)
  {
-  return *this >> N;
+    SuperVector mask = Ones_vshr(16 -len);
+    return mask & pshufb<true>(b);
  }
  
-
  #endif // SIMD_IMPL_HPP
diff --git a/src/util/supervector/supervector.hpp b/src/util/supervector/supervector.hpp

index 718cd0f6a9ecd78370e1d72ebd02d1d059c7a967..200783e19abfabc2e6943443c72859e170225357 100644 (file)
--- a/src/util/supervector/supervector.hpp
+++ b/src/util/supervector/supervector.hpp
@@ -174,8 +174,9 @@ public:
      double   f64[SIZE / sizeof(double)];
    } u;
  
-  SuperVector() {};
-  SuperVector(SuperVector const &other);
+  constexpr SuperVector() {};
+  constexpr SuperVector(SuperVector const &other)
+  :u(other.u) {};
    SuperVector(typename base_type::type const v);
  
    template<typename T>
@@ -198,11 +199,20 @@ public:
    SuperVector operator&(SuperVector const &b) const;
    SuperVector operator|(SuperVector const &b) const;
    SuperVector operator^(SuperVector const &b) const;
+  SuperVector operator!() const;
+
+  SuperVector operator==(SuperVector const &b) const;
+  SuperVector operator!=(SuperVector const &b) const;
+  SuperVector operator>(SuperVector const &b) const;
+  SuperVector operator>=(SuperVector const &b) const;
+  SuperVector operator<(SuperVector const &b) const;
+  SuperVector operator<=(SuperVector const &b) const;
  
    SuperVector opand(SuperVector const &b) const { return *this & b; }
    SuperVector opor (SuperVector const &b) const { return *this | b; }
    SuperVector opxor(SuperVector const &b) const { return *this ^ b; }
    SuperVector opandnot(SuperVector const &b) const;
+  SuperVector opnot() const { return !(*this); }
  
    SuperVector eq(SuperVector const &b) const;
    SuperVector operator<<(uint8_t const N) const;
@@ -215,6 +225,7 @@ public:
    static SuperVector loadu_maskz(void const *ptr, uint8_t const len);
    SuperVector alignr(SuperVector &other, int8_t offset);
  
+  template<bool emulateIntel>
    SuperVector pshufb(SuperVector b);
    SuperVector pshufb_maskz(SuperVector b, uint8_t const len);
  
diff --git a/unit/internal/supervector.cpp b/unit/internal/supervector.cpp

index 8b6830f01dc8bb4f4c3f5876e016779a629dc349..16a590469146590f9bc11ac9bc059ed9598b8620 100644 (file)
--- a/unit/internal/supervector.cpp
+++ b/unit/internal/supervector.cpp
@@ -284,7 +284,7 @@ TEST(SuperVectorUtilsTest,pshufb128c) {
      }
      auto SP1 = SuperVector<16>::loadu(vec);
      auto SP2 = SuperVector<16>::loadu(vec2);
-    auto SResult = SP1.pshufb(SP2);
+    auto SResult = SP1.template pshufb<true>(SP2);
      for (int i=0; i<16; i++) {
          ASSERT_EQ(vec[vec2[i]],SResult.u.u8[i]);
      }
author	Konstantinos Margaritis <konstantinos@vectorcamp.gr>
	Sun, 3 Oct 2021 10:43:13 +0000 (10:43 +0000)
committer	Konstantinos Margaritis <konstantinos@vectorcamp.gr>
	Tue, 12 Oct 2021 08:51:34 +0000 (11:51 +0300)
src/util/supervector/arch/arm/impl.cpp		patch \| blob \| blame \| history
src/util/supervector/supervector.hpp		patch \| blob \| blame \| history
unit/internal/supervector.cpp		patch \| blob \| blame \| history