optimize comparemask implementation, clean up code, use union types instead of casts

author Konstantinos Margaritis <konstantinos@vectorcamp.gr>

Tue, 6 Sep 2022 23:02:11 +0000 (02:02 +0300)

committer Konstantinos Margaritis <konstantinos@vectorcamp.gr>

Tue, 6 Sep 2022 23:02:11 +0000 (02:02 +0300)
author Konstantinos Margaritis <konstantinos@vectorcamp.gr>
Tue, 6 Sep 2022 23:02:11 +0000 (02:02 +0300)
committer Konstantinos Margaritis <konstantinos@vectorcamp.gr>
Tue, 6 Sep 2022 23:02:11 +0000 (02:02 +0300)
diff --git a/src/util/supervector/arch/ppc64el/impl.cpp b/src/util/supervector/arch/ppc64el/impl.cpp

index 5becb8f8151eb02a7231549801074dcdb161da52..7903bee29cd3a1775c6651977987e7bce68a6393 100644 (file)
--- a/src/util/supervector/arch/ppc64el/impl.cpp
+++ b/src/util/supervector/arch/ppc64el/impl.cpp
@@ -39,7 +39,7 @@
  #include "util/supervector/supervector.hpp"
  #include <iostream>
  
-// 128-bit Powerpc64le implementation
+// 128-bit IBM Power VSX implementation
  
  template<>
  really_inline SuperVector<16>::SuperVector(SuperVector const &other)
@@ -47,6 +47,69 @@ really_inline SuperVector<16>::SuperVector(SuperVector const &other)
      u.v128[0] = other.u.v128[0];
  }
  
+template<>
+template<>
+really_inline SuperVector<16>::SuperVector(char __bool __vector v)
+{
+    u.u8x16[0] = (uint8x16_t) v;
+};
+
+template<>
+template<>
+really_inline SuperVector<16>::SuperVector(int8x16_t const v)
+{
+    u.s8x16[0] = v;
+};
+
+template<>
+template<>
+really_inline SuperVector<16>::SuperVector(uint8x16_t const v)
+{
+    u.u8x16[0] = v;
+};
+
+template<>
+template<>
+really_inline SuperVector<16>::SuperVector(int16x8_t const v)
+{
+    u.s16x8[0] = v;
+};
+
+template<>
+template<>
+really_inline SuperVector<16>::SuperVector(uint16x8_t const v)
+{
+    u.u16x8[0] = v;
+};
+
+template<>
+template<>
+really_inline SuperVector<16>::SuperVector(int32x4_t const v)
+{
+    u.s32x4[0] = v;
+};
+
+template<>
+template<>
+really_inline SuperVector<16>::SuperVector(uint32x4_t const v)
+{
+    u.u32x4[0] = v;
+};
+
+template<>
+template<>
+really_inline SuperVector<16>::SuperVector(int64x2_t const v)
+{
+    u.s64x2[0] = v;
+};
+
+template<>
+template<>
+really_inline SuperVector<16>::SuperVector(uint64x2_t const v)
+{
+    u.u64x2[0] = v;
+};
+
  template<>
  really_inline SuperVector<16>::SuperVector(typename base_type::type const v)
  {
@@ -57,69 +120,69 @@ template<>
  template<>
  really_inline SuperVector<16>::SuperVector(int8_t const other)
  {
-    u.v128[0] = (m128) vec_splats(other);
+    u.s8x16[0] = vec_splats(other);
  }
  
  template<>
  template<>
  really_inline SuperVector<16>::SuperVector(uint8_t const other)
  {
-    u.v128[0] = (m128) vec_splats(static_cast<uint8_t>(other));
+    u.u8x16[0] = vec_splats(static_cast<uint8_t>(other));
  }
  
  template<>
  template<>
  really_inline SuperVector<16>::SuperVector(int16_t const other)
  {
-    u.v128[0] = (m128) vec_splats(other);
+    u.s16x8[0] = vec_splats(other);
  }
  
  template<>
  template<>
  really_inline SuperVector<16>::SuperVector(uint16_t const other)
  {
-    u.v128[0] = (m128) vec_splats(static_cast<uint16_t>(other));
+    u.u16x8[0] = vec_splats(static_cast<uint16_t>(other));
  }
  
  template<>
  template<>
  really_inline SuperVector<16>::SuperVector(int32_t const other)
  {
-    u.v128[0] = (m128) vec_splats(other);
+    u.s32x4[0] = vec_splats(other);
  }
  
  template<>
  template<>
  really_inline SuperVector<16>::SuperVector(uint32_t const other)
  {
-    u.v128[0] = (m128) vec_splats(static_cast<uint32_t>(other));
+    u.u32x4[0] = vec_splats(static_cast<uint32_t>(other));
  }
  
  template<>
  template<>
  really_inline SuperVector<16>::SuperVector(int64_t const other)
  {
-    u.v128[0] = (m128) vec_splats(static_cast<ulong64_t>(other));
+    u.s64x2[0] = (int64x2_t) vec_splats(static_cast<ulong64_t>(other));
  }
  
  template<>
  template<>
  really_inline SuperVector<16>::SuperVector(uint64_t const other)
  {
-    u.v128[0] = (m128) vec_splats(static_cast<ulong64_t>(other));
+    u.u64x2[0] = (uint64x2_t) vec_splats(static_cast<ulong64_t>(other));
  }
  
  // Constants
  template<>
  really_inline SuperVector<16> SuperVector<16>::Ones(void)
  {
-    return  {(m128) vec_splat_s8(-1)};
+    return  { vec_splat_s8(-1)};
  }
  
  template<>
  really_inline SuperVector<16> SuperVector<16>::Zeroes(void)
  {
-    return  {(m128) vec_splat_s8(0)};
+    return  { vec_splat_s8(0) };
  }
  
  // Methods
@@ -133,39 +196,38 @@ really_inline void SuperVector<16>::operator=(SuperVector<16> const &other)
  template <>
  really_inline SuperVector<16> SuperVector<16>::operator&(SuperVector<16> const &b) const
  {
-    return {vec_and(u.v128[0], b.u.v128[0])};
+    return { vec_and(u.v128[0], b.u.v128[0]) };
  }
  
  template <>
  really_inline SuperVector<16> SuperVector<16>::operator|(SuperVector<16> const &b) const
  {
-    return  {vec_or(u.v128[0], b.u.v128[0])};
+    return  { vec_or(u.v128[0], b.u.v128[0]) };
  }
  
  template <>
  really_inline SuperVector<16> SuperVector<16>::operator^(SuperVector<16> const &b) const
  {
-    return  {(m128) vec_xor(u.v128[0], b.u.v128[0])};
+    return  { vec_xor(u.v128[0], b.u.v128[0]) };
  }
  
  template <>
  really_inline SuperVector<16> SuperVector<16>::operator!() const
  {
-    return  {(m128) vec_xor(u.v128[0], u.v128[0])};
+    return  { vec_xor(u.v128[0], u.v128[0]) };
  }
  
  template <>
  really_inline SuperVector<16> SuperVector<16>::opandnot(SuperVector<16> const &b) const
  {
-   m128 not_res = vec_xor(u.v128[0], (m128)vec_splat_s8(-1));
-   return {(m128) vec_and(not_res, (m128)b.u.v128[0]) };
+   int8x16_t not_res = vec_xor(u.s8x16[0], vec_splat_s8(-1));
+   return { vec_and(not_res, b.u.s8x16[0]) };
  }
  
-
  template <>
  really_inline SuperVector<16> SuperVector<16>::operator==(SuperVector<16> const &b) const
  {
-    return {(m128) vec_cmpeq(u.s8x16[0], b.u.s8x16[0])};
+    return { vec_cmpeq(u.s8x16[0], b.u.s8x16[0])};
  }
  
  template <>
@@ -177,28 +239,27 @@ really_inline SuperVector<16> SuperVector<16>::operator!=(SuperVector<16> const
  template <>
  really_inline SuperVector<16> SuperVector<16>::operator>(SuperVector<16> const &b) const
  { 
-    return {(m128) vec_cmpgt(u.v128[0], b.u.v128[0])}; 
+    return { vec_cmpgt(u.s8x16[0], b.u.s8x16[0])};
  }
  
  template <>
  really_inline SuperVector<16> SuperVector<16>::operator>=(SuperVector<16> const &b) const
  {
-    return {(m128) vec_cmpge(u.v128[0], b.u.v128[0])};  
+    return { vec_cmpge(u.s8x16[0], b.u.s8x16[0])};
  }
  
  template <>
  really_inline SuperVector<16> SuperVector<16>::operator<(SuperVector<16> const &b) const
  {
-    return {(m128) vec_cmpgt(b.u.v128[0], u.v128[0])};  
+    return { vec_cmpgt(b.u.s8x16[0], u.s8x16[0])};
  }
  
  template <>
  really_inline SuperVector<16> SuperVector<16>::operator<=(SuperVector<16> const &b) const
  {   
-    return {(m128) vec_cmpge(b.u.v128[0], u.v128[0])};   
+    return { vec_cmpge(b.u.s8x16[0], u.s8x16[0])};
  }
  
-
  template <>
  really_inline SuperVector<16> SuperVector<16>::eq(SuperVector<16> const &b) const
  {
@@ -208,25 +269,12 @@ really_inline SuperVector<16> SuperVector<16>::eq(SuperVector<16> const &b) cons
  template <>
  really_inline typename SuperVector<16>::comparemask_type
  SuperVector<16>::comparemask(void) const {
-    uint8x16_t s1 = vec_sr((uint8x16_t)u.v128[0], vec_splat_u8(7));
-    
-    uint16x8_t ss = vec_sr((uint16x8_t)s1, vec_splat_u16(7));
-    uint16x8_t res_and = vec_and((uint16x8_t)s1, vec_splats((uint16_t)0xff));
-    uint16x8_t s2 = vec_or((uint16x8_t)ss, res_and);
-    
-    uint32x4_t ss2 = vec_sr((uint32x4_t)s2 , vec_splat_u32(14));
-    uint32x4_t res_and2 = vec_and((uint32x4_t)s2, vec_splats((uint32_t)0xff));
-    uint32x4_t s3 = vec_or((uint32x4_t)ss2, res_and2);
-
-    uint64x2_t ss3 = vec_sr((uint64x2_t)s3, (uint64x2_t)vec_splats(28));
-    uint64x2_t res_and3 = vec_and((uint64x2_t)s3, vec_splats((ulong64_t)0xff));
-    uint64x2_t s4 = vec_or((uint64x2_t)ss3, res_and3);
-
-    uint64x2_t ss4 = vec_sld((uint64x2_t) vec_splats(0), s4, 9);
-    uint64x2_t res_and4 = vec_and((uint64x2_t)s4, vec_splats((ulong64_t)0xff));
-    uint64x2_t s5 = vec_or((uint64x2_t)ss4, res_and4);
-    
-    return s5[0];
+    uint8x16_t bitmask = vec_gb( u.u8x16[0]);
+    static uint8x16_t perm = { 16, 24, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 };
+    bitmask = (uint8x16_t) vec_perm(vec_splat_u8(0), bitmask, perm);
+    u32 movemask;
+    vec_ste((uint32x4_t) bitmask, 0, &movemask);
+    return movemask;
  }
  
  template <>
@@ -248,35 +296,35 @@ template <>
  template<uint8_t N>
  really_inline SuperVector<16> SuperVector<16>::vshl_8_imm() const
  {
-    return { (m128) vec_sl(u.s8x16[0], vec_splats((uint8_t)N)) }; 
+    return { vec_sl(u.s8x16[0], vec_splat_u8(N)) };
  }
  
  template <>
  template<uint8_t N>
  really_inline SuperVector<16> SuperVector<16>::vshl_16_imm() const
  {
-    return { (m128) vec_sl(u.s16x8[0], vec_splats((uint16_t)N)) };
+    return { vec_sl(u.s16x8[0], vec_splat_u16(N)) };
  }
  
  template <>
  template<uint8_t N>
  really_inline SuperVector<16> SuperVector<16>::vshl_32_imm() const
  {
-    return { (m128) vec_sl(u.s32x4[0], vec_splats((uint32_t)N)) };
+    return { vec_sl(u.s32x4[0], vec_splat_u32(N)) };
  }
  
  template <>
  template<uint8_t N>
  really_inline SuperVector<16> SuperVector<16>::vshl_64_imm() const
  {
-    return { (m128) vec_sl(u.s64x2[0], vec_splats((ulong64_t)N)) };
+    return { vec_sl(u.s64x2[0], vec_splats((ulong64_t) N)) };
  }
  
  template <>
  template<uint8_t N>
  really_inline SuperVector<16> SuperVector<16>::vshl_128_imm() const
  {
-    return { (m128) vec_sld(u.s8x16[0], (int8x16_t)vec_splat_s8(0), N)}; 
+    return { vec_sld(u.s8x16[0], vec_splat_s8(0), N)};
  }
  
  template <>
@@ -290,35 +338,35 @@ template <>
  template<uint8_t N>
  really_inline SuperVector<16> SuperVector<16>::vshr_8_imm() const
  {
-    return { (m128) vec_sr(u.s8x16[0], vec_splats((uint8_t)N)) };
+    return { vec_sr(u.s8x16[0], vec_splat_u8(N)) };
  }
  
  template <>
  template<uint8_t N>
  really_inline SuperVector<16> SuperVector<16>::vshr_16_imm() const
  {
-    return { (m128) vec_sr(u.s16x8[0], vec_splats((uint16_t)N)) }; 
+    return { vec_sr(u.s16x8[0], vec_splat_u16(N)) };
  }
  
  template <>
  template<uint8_t N>
  really_inline SuperVector<16> SuperVector<16>::vshr_32_imm() const
  {
-    return { (m128) vec_sr(u.s32x4[0], vec_splats((uint32_t)N)) };
+    return { vec_sr(u.s32x4[0], vec_splat_u32(N)) };
  }
  
  template <>
  template<uint8_t N>
  really_inline SuperVector<16> SuperVector<16>::vshr_64_imm() const
  {               
-   return { (m128) vec_sr(u.s64x2[0], vec_splats((ulong64_t)N)) }; 
+   return { vec_sr(u.s64x2[0], vec_splats((ulong64_t)N)) };
  }
  
  template <>
  template<uint8_t N>
  really_inline SuperVector<16> SuperVector<16>::vshr_128_imm() const
  {   
-    return { (m128) vec_sld((int8x16_t)vec_splat_s8(0), u.s8x16[0], 16 - N) }; 
+    return { vec_sld(vec_splat_s8(0), u.s8x16[0], 16 - N) };
  }
  
  template <>
@@ -535,9 +583,7 @@ template <>
  really_inline SuperVector<16> SuperVector<16>::loadu_maskz(void const *ptr, uint8_t const len)
  {
      SuperVector<16> mask = Ones_vshr(16 -len);
-    mask.print8("mask");
      SuperVector<16> v = loadu(ptr);
-    v.print8("v");
      return mask & v;
  }
  
@@ -574,9 +620,9 @@ really_inline SuperVector<16> SuperVector<16>::pshufb<false>(SuperVector<16> b)
      /* On Intel, if bit 0x80 is set, then result is zero, otherwise which the lane it is &0xf.
         In NEON or PPC, if >=16, then the result is zero, otherwise it is that lane.
         below is the version that is converted from Intel to PPC.  */
-    uint8x16_t mask =(uint8x16_t)vec_cmpge(b.u.u8x16[0], (uint8x16_t)vec_splats((uint8_t)0x80));
+    uint8x16_t mask =(uint8x16_t)vec_cmpge(b.u.u8x16[0], vec_splats((uint8_t)0x80));
      uint8x16_t res = vec_perm (u.u8x16[0], u.u8x16[0], b.u.u8x16[0]);
-    return (m128) vec_sel(res, (uint8x16_t)vec_splat_s8(0), mask);
+    return { vec_sel(res, vec_splat_u8(0), mask) };
  }
  
  template<>
diff --git a/src/util/supervector/supervector.hpp b/src/util/supervector/supervector.hpp

index 5d066c1ab679b8ce87fcb700d898d9cf21501e09..fef5f09f6de35f6d77b1f5fe8a39d5eb9b224dae 100644 (file)
--- a/src/util/supervector/supervector.hpp
+++ b/src/util/supervector/supervector.hpp
@@ -177,13 +177,13 @@ public:
  
  #if defined(ARCH_ARM32) || defined(ARCH_AARCH64) || defined(ARCH_PPC64EL)
      uint64x2_t ALIGN_ATTR(BaseVector<16>::size) u64x2[SIZE / BaseVector<16>::size];
-    int64x2_t  ALIGN_ATTR(BaseVector<16>::size) s64x2[SIZE / BaseVector<16>::size];
+    int64x2_t ALIGN_ATTR(BaseVector<16>::size) s64x2[SIZE / BaseVector<16>::size];
      uint32x4_t ALIGN_ATTR(BaseVector<16>::size) u32x4[SIZE / BaseVector<16>::size];
-    int32x4_t  ALIGN_ATTR(BaseVector<16>::size) s32x4[SIZE / BaseVector<16>::size];
+    int32x4_t ALIGN_ATTR(BaseVector<16>::size) s32x4[SIZE / BaseVector<16>::size];
      uint16x8_t ALIGN_ATTR(BaseVector<16>::size) u16x8[SIZE / BaseVector<16>::size];
-    int16x8_t  ALIGN_ATTR(BaseVector<16>::size) s16x8[SIZE / BaseVector<16>::size];
+    int16x8_t ALIGN_ATTR(BaseVector<16>::size) s16x8[SIZE / BaseVector<16>::size];
      uint8x16_t ALIGN_ATTR(BaseVector<16>::size) u8x16[SIZE / BaseVector<16>::size];
-    int8x16_t  ALIGN_ATTR(BaseVector<16>::size) s8x16[SIZE / BaseVector<16>::size];
+    int8x16_t ALIGN_ATTR(BaseVector<16>::size) s8x16[SIZE / BaseVector<16>::size];
  #endif
  
      uint64_t u64[SIZE / sizeof(uint64_t)];
@@ -204,7 +204,7 @@ public:
    SuperVector(typename base_type::type const v);
  
    template<typename T>
-  SuperVector(T other);
+  SuperVector(T const other);
  
    SuperVector(SuperVector<SIZE/2> const lo, SuperVector<SIZE/2> const hi);
    SuperVector(previous_type const lo, previous_type const hi);
author	Konstantinos Margaritis <konstantinos@vectorcamp.gr>
	Tue, 6 Sep 2022 23:02:11 +0000 (02:02 +0300)
committer	Konstantinos Margaritis <konstantinos@vectorcamp.gr>
	Tue, 6 Sep 2022 23:02:11 +0000 (02:02 +0300)
src/util/supervector/arch/ppc64el/impl.cpp		patch | blob | blame | history
src/util/supervector/supervector.hpp		patch | blob | blame | history