return {vdupq_n_u8(0)};
}
+// Methods
+
template <>
really_inline void SuperVector<16>::operator=(SuperVector<16> const &o)
{
u.v128[0] = o.u.v128[0];
}
+template <>
+really_inline SuperVector<16> SuperVector<16>::operator|(SuperVector<16> const b) const
+{
+ return {vorrq_s8(u.v128[0], b.u.v128[0])};
+}
+
+template <>
+really_inline SuperVector<16> SuperVector<16>::opand(SuperVector<16> const b) const
+{
+ return {vandq_s8(u.v128[0], b.u.v128[0])};
+}
+
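+// opandnot computes (~*this) & b, mirroring the x86 ANDNOT convention.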
+template <>
+really_inline SuperVector<16> SuperVector<16>::opandnot(SuperVector<16> const b) const
+{
+ return {vandq_s8(vmvnq_s8(u.v128[0]), b.u.v128[0])};
+}
+
template <>
really_inline SuperVector<16> SuperVector<16>::eq(SuperVector<16> const b) const
{
return {vceqq_s8((int8x16_t)u.v128[0], (int8x16_t)b.u.v128[0])};
}
-#ifndef DEBUG
+#ifndef HS_OPTIMIZE
template <>
really_inline SuperVector<16> SuperVector<16>::operator<<(uint8_t const N) const
{
/* Shift each 32-bit lane left by N bits; vshlq_s32 takes a runtime shift
   count, so no immediate (and hence no switch over constants) is needed. */
return {vshlq_s32(u.v128[0], vdupq_n_s32(N))};
}
#endif
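+/* The vshrq_n/vshlq_n intrinsics take a compile-time immediate shift count.
+   With HS_OPTIMIZE, really_inline plus constant propagation is assumed to make
+   N a constant at each call site; without it, the switch below spells out each
+   legal immediate explicitly. */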
+#ifdef HS_OPTIMIZE
+template <>
+really_inline SuperVector<16> SuperVector<16>::operator>>(uint8_t const N) const
+{
+ return {vshrq_n_s32(u.v128[0], N)};
+}
+#else
+template <>
+really_inline SuperVector<16> SuperVector<16>::operator>>(uint8_t const N) const
+{
+ switch(N) {
+ case 0: return *this; break;
+ case 1: return {vshrq_n_s32(u.v128[0], 1)}; break;
+ case 2: return {vshrq_n_s32(u.v128[0], 2)}; break;
+ case 3: return {vshrq_n_s32(u.v128[0], 3)}; break;
+ case 4: return {vshrq_n_s32(u.v128[0], 4)}; break;
+ case 5: return {vshrq_n_s32(u.v128[0], 5)}; break;
+ case 6: return {vshrq_n_s32(u.v128[0], 6)}; break;
+ case 7: return {vshrq_n_s32(u.v128[0], 7)}; break;
+ case 8: return {vshrq_n_s32(u.v128[0], 8)}; break;
+ case 9: return {vshrq_n_s32(u.v128[0], 9)}; break;
+ case 10: return {vshrq_n_s32(u.v128[0], 10)}; break;
+ case 11: return {vshrq_n_s32(u.v128[0], 11)}; break;
+ case 12: return {vshrq_n_s32(u.v128[0], 12)}; break;
+ case 13: return {vshrq_n_s32(u.v128[0], 13)}; break;
+ case 14: return {vshrq_n_s32(u.v128[0], 14)}; break;
+ case 15: return {vshrq_n_s32(u.v128[0], 15)}; break;
+ default: break;
+ }
+ return *this;
+}
+#endif
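+// loadu: unaligned 128-bit load; no alignment requirement is placed on ptr.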
template <>
really_inline SuperVector<16> SuperVector<16>::loadu(void const *ptr)
{
- assert(ISALIGNED_N(ptr, alignof(SuperVector::size)));
- ptr = assume_aligned(ptr, SuperVector::size);
- return vld1q_s32((const int32_t *)ptr);
+ return {vld1q_s32((const int32_t *)ptr)};
+}
+
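+/* loadu_maskz: unaligned load that keeps at most len bytes of data, zeroing the
+   remaining lanes via masks derived from the pointer's low bits. */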
+template <>
+really_inline SuperVector<16> SuperVector<16>::loadu_maskz(void const *ptr, uint8_t const len)
+{
+ uint8_t alignment = (uintptr_t)(ptr) & 15;
+ SuperVector<16> maskb = Ones() << alignment;
+ SuperVector<16> maske = Ones() >> (16 - len - alignment);
+ SuperVector<16> v = SuperVector<16>::loadu((const m128 *)ptr);
+ return {maskb.u.v128[0] & maske.u.v128[0] & v.u.v128[0]};
}
-#ifndef DEBUG
+#ifndef HS_OPTIMIZE
template<>
really_inline SuperVector<16> SuperVector<16>::alignr(SuperVector<16> r, int8_t offset)
{
/* Concatenate {r (low half), *this (high half)} in memory and reload 16 bytes
   at the requested byte offset, since vextq_s8 needs a compile-time immediate. */
uint8_t data[32];
vst1q_u8(data, (uint8x16_t)r.u.v128[0]);
vst1q_u8(data + 16, (uint8x16_t)u.v128[0]);
return {vld1q_u8(data + offset)};
}
#endif
+template<>
+really_inline SuperVector<16> SuperVector<16>::pshufb(SuperVector<16> b)
+{
+ /* On Intel, if bit 0x80 is set in an index byte the result byte is zero;
+    otherwise the low nibble (&0xf) selects the source lane. On NEON, any
+    index >= 16 yields zero; otherwise the index selects that lane.
+    btranslated converts the Intel convention into the NEON one. */
+ int8x16_t btranslated = vandq_s8((int8x16_t)b.u.v128[0], vdupq_n_s8(0x8f));
+ return {vqtbl1q_s8((int8x16_t)u.v128[0], (uint8x16_t)btranslated)};
+}
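+/* Example: an index byte of 0x83 stays 0x83 after the 0x8f mask, is >= 16 as an
+   unsigned index, and so selects zero in vqtbl1q_s8 (matching PSHUFB's zeroing
+   rule), while 0x03 stays 0x03 and selects lane 3 under both conventions. */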
+
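+/* lshift64/rshift64 shift each 64-bit lane independently, so bits never move
+   between the two halves of the 128-bit vector. The same immediate-versus-switch
+   pattern as operator>> applies below. */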
+#ifdef HS_OPTIMIZE
+template<>
+really_inline SuperVector<16> SuperVector<16>::lshift64(uint8_t const l)
+{
+ return {(m128)vshlq_n_s64(u.v128[0], l)};
+}
+#else
+template<>
+really_inline SuperVector<16> SuperVector<16>::lshift64(uint8_t const l)
+{
+ switch(l) {
+ case 0: return {vshlq_n_s64(u.v128[0], 0)}; break;
+ case 1: return {vshlq_n_s64(u.v128[0], 1)}; break;
+ case 2: return {vshlq_n_s64(u.v128[0], 2)}; break;
+ case 3: return {vshlq_n_s64(u.v128[0], 3)}; break;
+ case 4: return {vshlq_n_s64(u.v128[0], 4)}; break;
+ case 5: return {vshlq_n_s64(u.v128[0], 5)}; break;
+ case 6: return {vshlq_n_s64(u.v128[0], 6)}; break;
+ case 7: return {vshlq_n_s64(u.v128[0], 7)}; break;
+ case 8: return {vshlq_n_s64(u.v128[0], 8)}; break;
+ case 9: return {vshlq_n_s64(u.v128[0], 9)}; break;
+ case 10: return {vshlq_n_s64(u.v128[0], 10)}; break;
+ case 11: return {vshlq_n_s64(u.v128[0], 11)}; break;
+ case 12: return {vshlq_n_s64(u.v128[0], 12)}; break;
+ case 13: return {vshlq_n_s64(u.v128[0], 13)}; break;
+ case 14: return {vshlq_n_s64(u.v128[0], 14)}; break;
+ case 15: return {vshlq_n_s64(u.v128[0], 15)}; break;
+ default: break;
+ }
+ return *this;
+}
+#endif
+
+#ifdef HS_OPTIMIZE
+template<>
+really_inline SuperVector<16> SuperVector<16>::rshift64(uint8_t const l)
+{
+ return {(m128)vshrq_n_s64(u.v128[0], l)};
+}
+#else
+template<>
+really_inline SuperVector<16> SuperVector<16>::rshift64(uint8_t const l)
+{
+ switch(l) {
+ case 0: return *this; break;
+ case 1: return {vshrq_n_s64(u.v128[0], 1)}; break;
+ case 2: return {vshrq_n_s64(u.v128[0], 2)}; break;
+ case 3: return {vshrq_n_s64(u.v128[0], 3)}; break;
+ case 4: return {vshrq_n_s64(u.v128[0], 4)}; break;
+ case 5: return {vshrq_n_s64(u.v128[0], 5)}; break;
+ case 6: return {vshrq_n_s64(u.v128[0], 6)}; break;
+ case 7: return {vshrq_n_s64(u.v128[0], 7)}; break;
+ case 8: return {vshrq_n_s64(u.v128[0], 8)}; break;
+ case 9: return {vshrq_n_s64(u.v128[0], 9)}; break;
+ case 10: return {vshrq_n_s64(u.v128[0], 10)}; break;
+ case 11: return {vshrq_n_s64(u.v128[0], 11)}; break;
+ case 12: return {vshrq_n_s64(u.v128[0], 12)}; break;
+ case 13: return {vshrq_n_s64(u.v128[0], 13)}; break;
+ case 14: return {vshrq_n_s64(u.v128[0], 14)}; break;
+ case 15: return {vshrq_n_s64(u.v128[0], 15)}; break;
+ default: break;
+ }
+ return *this;
+}
+#endif
#endif // SIMD_IMPL_HPP