return (m128) vceqq_s8((int8x16_t)a, (int8x16_t)b);
}
-#define movemask128(a) ((u32)_mm_movemask_epi8((a)))
+static really_inline u32 movemask128(m128 a) {
+ static const uint8x16_t powers = { 1, 2, 4, 8, 16, 32, 64, 128, 1, 2, 4, 8, 16, 32, 64, 128 };
+
+ // AND each byte with its per-lane power of two, then collapse each 64-bit half
+ // into a single byte of mask bits using widening pairwise adds
+ uint64x2_t mask = vpaddlq_u32(vpaddlq_u16(vpaddlq_u8(vandq_u8((uint8x16_t)a, powers))));
+
+ // Store the low byte of each 64-bit half into the two bytes of the result
+ uint16_t output;
+ vst1q_lane_u8((uint8_t*)&output + 0, (uint8x16_t)mask, 0);
+ vst1q_lane_u8((uint8_t*)&output + 1, (uint8x16_t)mask, 8);
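+ // e.g. if only bytes 0 and 9 of a are 0xff, output is now 0x0201,
+ // matching _mm_movemask_epi8 for 0x00/0xff input bytes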
+ return output;
+}
static really_inline m128 set1_16x8(u8 c) {
return (m128) vdupq_n_u8(c);
}
static really_inline
char testbit128(m128 val, unsigned int n) {
const m128 mask = mask1bit128(n);
-#if defined(HAVE_SSE41)
- return !_mm_testz_si128(mask, val);
-#else
+
return isnonzero128(and128(mask, val));
-#endif
}
-// offset must be an immediate
-#define palignr(r, l, offset) _mm_alignr_epi8(r, l, offset)
+static really_inline
+m128 palignr(m128 r, m128 l, int offset) {
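+ // As with _mm_alignr_epi8, offset must be a compile-time constant:
+ // vextq_s8 requires an immediate in [0, 15];
+ // e.g. offset == 1 yields { l[1], ..., l[15], r[0] }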
+ return (m128)vextq_s8((int8x16_t)l, (int8x16_t)r, offset);
+}
static really_inline
m128 pshufb_m128(m128 a, m128 b) {
- m128 result;
- result = _mm_shuffle_epi8(a, b);
- return result;
+ /* On Intel, if bit 0x80 of a control byte in b is set, the result byte is zero;
+    otherwise the low four bits (& 0xf) select the source lane.
+    On NEON, any index >= 16 yields zero; otherwise that lane is selected.
+    btranslated converts the Intel control bytes to NEON semantics: keeping
+    bit 0x80 turns "zero this byte" into an out-of-range index. */
+ int8x16_t btranslated = vandq_s8((int8x16_t)b, vdupq_n_s8(0x8f));
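+ // e.g. control byte 0x03 selects lane 3 of a, while 0x83 becomes index 0x83 (>= 16),
+ // which vqtbl1q_s8 zeroes, matching the Intel 0x80 behaviour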
+ return (m128)vqtbl1q_s8((int8x16_t)a, (uint8x16_t)btranslated);
}
static really_inline