return (m128) vceqq_u64((uint64x2_t)a, (uint64x2_t)b);
}
+
static really_inline u32 movemask128(m128 a) {
static const uint8x16_t powers = { 1, 2, 4, 8, 16, 32, 64, 128, 1, 2, 4, 8, 16, 32, 64, 128 };
// Compute the mask from the input
- uint64x2_t mask= vpaddlq_u32(vpaddlq_u16(vpaddlq_u8(vandq_u8((uint8x16_t)a, powers))));
+ // AND with powers keeps bit (i % 8) of byte i (comparison results are 0x00
+ // or 0xff per byte), then three widening pairwise adds fold the eight bytes
+ // of each 64-bit half into a single mask byte.
+ uint64x2_t mask = vpaddlq_u32(vpaddlq_u16(vpaddlq_u8(vandq_u8((uint8x16_t)a, powers))));
+ // Byte 0 now holds the low-half mask and byte 8 the high-half mask; shift
+ // right by 7 bytes and OR so both masks land in bytes 0-1.
+ uint64x2_t mask1 = (uint64x2_t)vextq_u8((uint8x16_t)mask, (uint8x16_t)zeroes128(), 7);
+ mask = vorrq_u64(mask, mask1);
// Get the resulting bytes
uint16_t output;
- vst1q_lane_u8((uint8_t*)&output + 0, (uint8x16_t)mask, 0);
- vst1q_lane_u8((uint8_t*)&output + 1, (uint8x16_t)mask, 8);
+ vst1q_lane_u16(&output, (uint16x8_t)mask, 0);
return output;
}
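+
+// Usage sketch: for comparison-result input, bit i of movemask128() is set
+// iff byte i of a is 0xff, e.g. movemask128(eq128(x, x)) == 0xffff (assuming
+// eq128() from this header).
+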
// aligned load
static really_inline m128 load128(const void *ptr) {
assert(ISALIGNED_N(ptr, alignof(m128)));
- ptr = assume_aligned(ptr, 16);
return (m128) vld1q_s32((const int32_t *)ptr);
}
// aligned store
static really_inline void store128(void *ptr, m128 a) {
assert(ISALIGNED_N(ptr, alignof(m128)));
- ptr = assume_aligned(ptr, 16);
vst1q_s32((int32_t *)ptr, a);
}
return a;
}
-static really_inline
-m128 variable_byte_shift_m128(m128 in, s32 amount) {
- assert(amount >= -16 && amount <= 16);
- m128 shift_mask = loadu128(vbs_mask_data + 16 - amount);
- return vqtbl1q_s8(in, shift_mask);
-}
#define CASE_ALIGN_VECTORS(a, b, offset) case offset: return (m128)vextq_s8((int8x16_t)(a), (int8x16_t)(b), (offset)); break;
-static really_inline
-m128 palignr(m128 r, m128 l, int offset) {
-#if defined(HS_OPTIMIZE)
- return (m128)vextq_s8((int8x16_t)l, (int8x16_t)r, offset);
-#else
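+// vextq_s8 requires its byte offset to be a compile-time constant, so a
+// runtime offset has to be dispatched through a switch that hands the
+// intrinsic a literal in every case.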
+static really_really_inline
+m128 palignr_imm(m128 r, m128 l, int offset) {
switch (offset) {
- CASE_ALIGN_VECTORS(l, r, 0);
+ case 0: return l; break;
CASE_ALIGN_VECTORS(l, r, 1);
CASE_ALIGN_VECTORS(l, r, 2);
CASE_ALIGN_VECTORS(l, r, 3);
CASE_ALIGN_VECTORS(l, r, 4);
CASE_ALIGN_VECTORS(l, r, 5);
CASE_ALIGN_VECTORS(l, r, 6);
CASE_ALIGN_VECTORS(l, r, 7);
CASE_ALIGN_VECTORS(l, r, 8);
CASE_ALIGN_VECTORS(l, r, 9);
CASE_ALIGN_VECTORS(l, r, 10);
CASE_ALIGN_VECTORS(l, r, 11);
CASE_ALIGN_VECTORS(l, r, 12);
CASE_ALIGN_VECTORS(l, r, 13);
CASE_ALIGN_VECTORS(l, r, 14);
CASE_ALIGN_VECTORS(l, r, 15);
+ case 16: return r; break;
default:
return zeroes128();
break;
}
+}
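+
+// palignr_imm(r, l, n) mirrors x86 PALIGNR: it returns bytes n..n+15 of the
+// 32-byte concatenation l:r (l in the low half), so n == 0 yields l and
+// n == 16 yields r.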
+
+static really_really_inline
+m128 palignr(m128 r, m128 l, int offset) {
+#if defined(HS_OPTIMIZE)
+ // Under optimization, constant propagation gives vextq_s8 the immediate it
+ // requires; 16 is not an encodable immediate, so return r directly for it.
+ if (offset == 16) {
+ return r;
+ }
+ return (m128)vextq_s8((int8x16_t)l, (int8x16_t)r, offset);
+#else
+ return palignr_imm(r, l, offset);
#endif
}
#undef CASE_ALIGN_VECTORS
static really_really_inline
m128 rshiftbyte_m128(m128 a, unsigned b) {
- if (b)
- return palignr(zeroes128(), a, b);
- else
- return a;
+ return palignr(zeroes128(), a, b);
}
static really_really_inline
m128 lshiftbyte_m128(m128 a, unsigned b) {
- if (b)
- return palignr(a, zeroes128(), 16 - b);
- else
- return a;
+ return palignr(a, zeroes128(), 16 - b);
}
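+
+// e.g. lshiftbyte_m128(x, 1) moves byte i of x into byte i + 1 and zero-fills
+// byte 0; rshiftbyte_m128 is the mirror image. Shifts of 0 and 16 fall out of
+// palignr's 0 and 16 cases, so no branch is needed here.
+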
+static really_inline
+m128 variable_byte_shift_m128(m128 in, s32 amount) {
+ assert(amount >= -16 && amount <= 16);
+ // Table-lookup mask: indices 0x00-0x0f select the corresponding byte of in,
+ // while 0xf0 is out of range, for which vqtbl1q_s8 yields zero.
+ static const uint8x16_t vbs_mask = { 0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07, 0x08, 0x09, 0x0a, 0x0b, 0x0c, 0x0d, 0x0e, 0x0f };
+ const uint8x16_t outside_mask = set1_16x8(0xf0);
+
+ // Slide the identity indices by amount: left shifts pull 0xf0 in from below,
+ // right (negative) shifts pull it in from above.
+ m128 shift_mask = amount >= 0
+     ? palignr_imm((m128)vbs_mask, (m128)outside_mask, 16 - amount)
+     : palignr_imm((m128)outside_mask, (m128)vbs_mask, -amount);
+ return (m128)vqtbl1q_s8((int8x16_t)in, (uint8x16_t)shift_mask);
+}
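+
+// Usage sketch: variable_byte_shift_m128(x, 2) shifts x left by two bytes
+// (bytes 0-1 become zero); variable_byte_shift_m128(x, -2) shifts right,
+// zero-filling from the top.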
#ifdef __cplusplus
extern "C" {