WIP: rework fdr to use fewer instructions, gives about 10% performance increase on...

author Konstantinos Margaritis <markos@freevec.org>

Mon, 19 Feb 2024 11:09:02 +0000 (13:09 +0200)

committer Konstantinos Margaritis <konstantinos@vectorcamp.gr>

Mon, 19 Feb 2024 11:51:53 +0000 (19:51 +0800)
author Konstantinos Margaritis <markos@freevec.org>
Mon, 19 Feb 2024 11:09:02 +0000 (13:09 +0200)
committer Konstantinos Margaritis <konstantinos@vectorcamp.gr>
Mon, 19 Feb 2024 11:51:53 +0000 (19:51 +0800)
diff --git a/src/fdr/fdr.c b/src/fdr/fdr.c

index d67e271991257409aa1cf0031dbdaac96a5ef26f..62a08e4e4a1d354836cb8089ff80675d864d975e 100644 (file)
--- a/src/fdr/fdr.c
+++ b/src/fdr/fdr.c
@@ -1,5 +1,6 @@
  /*
   * Copyright (c) 2015-2017, Intel Corporation
+ * Copyright (c) 2020-2024, VectorCamp PC
   *
   * Redistribution and use in source and binary forms, with or without
   * modification, are permitted provided that the following conditions are met:
@@ -103,6 +104,7 @@ m128 getInitState(const struct FDR *fdr, u8 len_history, const u64a *ft,
      return s;
  }
  
+#include "../print_simd.h"
  
  static really_inline
  void get_conf_stride_1(const u8 *itPtr, UNUSED const u8 *start_ptr,
@@ -111,41 +113,97 @@ void get_conf_stride_1(const u8 *itPtr, UNUSED const u8 *start_ptr,
      /* +1: the zones ensure that we can read the byte at z->end */
      assert(itPtr >= start_ptr && itPtr + ITER_BYTES <= end_ptr);
  
-    u64a it_hi = *(const u64a *)itPtr;
-    u64a it_lo = *(const u64a *)(itPtr + 8);
-    u64a reach0  = domain_mask & it_hi;
-    u64a reach1  = domain_mask & (it_hi >> 8);
-    u64a reach2  = domain_mask & (it_hi >> 16);
-    u64a reach3  = domain_mask & (it_hi >> 24);
-    u64a reach4  = domain_mask & (it_hi >> 32);
-    u64a reach5  = domain_mask & (it_hi >> 40);
-    u64a reach6  = domain_mask & (it_hi >> 48);
-    u64a reach7  = domain_mask & ((it_hi >> 56) | (it_lo << 8));
-    u64a reach8  = domain_mask & it_lo;
-    u64a reach9  = domain_mask & (it_lo >> 8);
-    u64a reach10 = domain_mask & (it_lo >> 16);
-    u64a reach11 = domain_mask & (it_lo >> 24);
-    u64a reach12 = domain_mask & (it_lo >> 32);
-    u64a reach13 = domain_mask & (it_lo >> 40);
-    u64a reach14 = domain_mask & (it_lo >> 48);
-    u64a reach15 = domain_mask & unaligned_load_u32(itPtr + 15);
-
-    m128 st0  = load_m128_from_u64a(ft + reach0);
-    m128 st1  = lshiftbyte_m128(load_m128_from_u64a(ft + reach1), 1);
-    m128 st2  = lshiftbyte_m128(load_m128_from_u64a(ft + reach2), 2);
-    m128 st3  = lshiftbyte_m128(load_m128_from_u64a(ft + reach3), 3);
-    m128 st4  = lshiftbyte_m128(load_m128_from_u64a(ft + reach4), 4);
-    m128 st5  = lshiftbyte_m128(load_m128_from_u64a(ft + reach5), 5);
-    m128 st6  = lshiftbyte_m128(load_m128_from_u64a(ft + reach6), 6);
-    m128 st7  = lshiftbyte_m128(load_m128_from_u64a(ft + reach7), 7);
-    m128 st8  = load_m128_from_u64a(ft + reach8);
-    m128 st9  = lshiftbyte_m128(load_m128_from_u64a(ft + reach9), 1);
-    m128 st10 = lshiftbyte_m128(load_m128_from_u64a(ft + reach10), 2);
-    m128 st11 = lshiftbyte_m128(load_m128_from_u64a(ft + reach11), 3);
-    m128 st12 = lshiftbyte_m128(load_m128_from_u64a(ft + reach12), 4);
-    m128 st13 = lshiftbyte_m128(load_m128_from_u64a(ft + reach13), 5);
-    m128 st14 = lshiftbyte_m128(load_m128_from_u64a(ft + reach14), 6);
-    m128 st15 = lshiftbyte_m128(load_m128_from_u64a(ft + reach15), 7);
+    // u64a ALIGN_ATTR(16) reach[16];
+    u32 ALIGN_ATTR(16) reach[16];
+
+    m128 domain_mask_v = set1_4x32(domain_mask);
+    // m256 ft_v = set1_4x64((ptrdiff_t)ft);
+    
+    m128 it_v = loadu128(itPtr);
+    m128 it_shifted8_v = rshiftbyte_m128(it_v, 1);
+    m128 it_shifted16_v = rshiftbyte_m128(it_v, 2);
+    m128 it_shifted24_v = rshiftbyte_m128(it_v, 3);
+    it_shifted24_v = insert32_m128(it_shifted24_v, unaligned_load_u32(itPtr + 15), 3);
+
+    m128 reach_v[4];
+    // m256 reach64_v[4];
+
+    reach_v[0] = and128(domain_mask_v, it_v);
+    reach_v[1] = and128(domain_mask_v, it_shifted8_v);
+    reach_v[2] = and128(domain_mask_v, it_shifted16_v);
+    reach_v[3] = and128(domain_mask_v, it_shifted24_v);
+
+    // reach_v[0] = lshift32_m128(reach_v[0], 3);
+    // reach_v[1] = lshift32_m128(reach_v[1], 3);
+    // reach_v[2] = lshift32_m128(reach_v[2], 3);
+    // reach_v[3] = lshift32_m128(reach_v[3], 3);
+
+    // reach64_v[0] = widen128(reach_v[0]);
+    // reach64_v[1] = widen128(reach_v[1]);
+    // reach64_v[2] = widen128(reach_v[2]);
+    // reach64_v[3] = widen128(reach_v[3]);
+
+    // reach64_v[0] = add256(reach64_v[0], ft_v);
+    // reach64_v[1] = add256(reach64_v[1], ft_v);
+    // reach64_v[2] = add256(reach64_v[2], ft_v);
+    // reach64_v[3] = add256(reach64_v[3], ft_v);
+
+    // store256(&reach[0], reach64_v[0]);
+    // store256(&reach[4], reach64_v[1]);
+    // store256(&reach[8], reach64_v[2]);
+    // store256(&reach[12], reach64_v[3]);
+    store128(&reach[0], reach_v[0]);
+    store128(&reach[4], reach_v[1]);
+    store128(&reach[8], reach_v[2]);
+    store128(&reach[12], reach_v[3]);
+
+    m128 st0  = load_m128_from_u64a(ft + reach[0]);
+    m128 st4  = load_m128_from_u64a(ft + reach[1]);
+    m128 st8  = load_m128_from_u64a(ft + reach[2]);
+    m128 st12 = load_m128_from_u64a(ft + reach[3]);
+    m128 st1  = load_m128_from_u64a(ft + reach[4]);
+    m128 st5  = load_m128_from_u64a(ft + reach[5]);
+    m128 st9  = load_m128_from_u64a(ft + reach[6]);
+    m128 st13 = load_m128_from_u64a(ft + reach[7]);
+    m128 st2  = load_m128_from_u64a(ft + reach[8]);
+    m128 st6  = load_m128_from_u64a(ft + reach[9]);
+    m128 st10 = load_m128_from_u64a(ft + reach[10]);
+    m128 st14 = load_m128_from_u64a(ft + reach[11]);
+    m128 st3  = load_m128_from_u64a(ft + reach[12]);
+    m128 st7  = load_m128_from_u64a(ft + reach[13]);
+    m128 st11 = load_m128_from_u64a(ft + reach[14]);
+    m128 st15 = load_m128_from_u64a(ft + reach[15]);
+    // m128 st0  = load_m128_from_u64a((u64a *)reach[0]);
+    // m128 st4  = load_m128_from_u64a((u64a *)reach[1]);
+    // m128 st8  = load_m128_from_u64a((u64a *)reach[2]);
+    // m128 st12 = load_m128_from_u64a((u64a *)reach[3]);
+    // m128 st1  = load_m128_from_u64a((u64a *)reach[4]);
+    // m128 st5  = load_m128_from_u64a((u64a *)reach[5]);
+    // m128 st9  = load_m128_from_u64a((u64a *)reach[6]);
+    // m128 st13 = load_m128_from_u64a((u64a *)reach[7]);
+    // m128 st2  = load_m128_from_u64a((u64a *)reach[8]);
+    // m128 st6  = load_m128_from_u64a((u64a *)reach[9]);
+    // m128 st10 = load_m128_from_u64a((u64a *)reach[10]);
+    // m128 st14 = load_m128_from_u64a((u64a *)reach[11]);
+    // m128 st3  = load_m128_from_u64a((u64a *)reach[12]);
+    // m128 st7  = load_m128_from_u64a((u64a *)reach[13]);
+    // m128 st11 = load_m128_from_u64a((u64a *)reach[14]);
+    // m128 st15 = load_m128_from_u64a((u64a *)reach[15]);
+    
+    st1  = lshiftbyte_m128(st1, 1);
+    st2  = lshiftbyte_m128(st2, 2);
+    st3  = lshiftbyte_m128(st3, 3);
+    st4  = lshiftbyte_m128(st4, 4);
+    st5  = lshiftbyte_m128(st5, 5);
+    st6  = lshiftbyte_m128(st6, 6);
+    st7  = lshiftbyte_m128(st7, 7);
+    st9  = lshiftbyte_m128(st9, 1);
+    st10 = lshiftbyte_m128(st10, 2);
+    st11 = lshiftbyte_m128(st11, 3);
+    st12 = lshiftbyte_m128(st12, 4);
+    st13 = lshiftbyte_m128(st13, 5);
+    st14 = lshiftbyte_m128(st14, 6);
+    st15 = lshiftbyte_m128(st15, 7);
  
      st0 = or128(st0, st1);
      st2 = or128(st2, st3);
diff --git a/src/util/arch/arm/simd_utils.h b/src/util/arch/arm/simd_utils.h

index 7f8539b090777e34262263b1493eb337bb028628..858866d77b08a48c1c626af36610d5797db11b29 100644 (file)
--- a/src/util/arch/arm/simd_utils.h
+++ b/src/util/arch/arm/simd_utils.h
@@ -181,6 +181,10 @@ static really_inline m128 set1_2x64(u64a c) {
      return (m128) vdupq_n_u64(c);
  }
  
+static really_inline m128 insert32_m128(m128 in, u32 val, const int imm) {
+    return vsetq_lane_u32((uint32x4_t)in, val, imm);
+}
+
  static really_inline u32 movd(const m128 in) {
      return vgetq_lane_u32((uint32x4_t) in, 0);
  }
@@ -195,6 +199,12 @@ m128 load_m128_from_u64a(const u64a *p) {
      return (m128) vsetq_lane_u64(*p, (uint64x2_t) zeroes128(), 0);
  }
  
+/* another form of movq */
+static really_inline
+m128 load_m128_from_u64a(const u64a *p) {
+    return (m128) vsetq_lane_u64(*p, (uint64x2_t) zeroes128(), 0);
+}
+
  static really_inline u32 extract32from128(const m128 in, unsigned imm) {
  #if defined(HAVE__BUILTIN_CONSTANT_P)
      if (__builtin_constant_p(imm)) {
diff --git a/src/util/arch/common/simd_utils.h b/src/util/arch/common/simd_utils.h

index 24331b1037c85de3da5be4406699ec136d432cc4..4ac92ab3a54f7d320165a2b82c8046b27d17bcd2 100644 (file)
--- a/src/util/arch/common/simd_utils.h
+++ b/src/util/arch/common/simd_utils.h
@@ -384,6 +384,14 @@ m256 pshufb_m256(m256 a, m256 b) {
      return rv;
  }
  
+static really_inline
+m256 widen128(m128 x) {
+    m256 rv;
+    rv.lo = widenlo128(x);
+    rv.hi = widenhi128(x);
+    return rv;
+}
+
  #endif // HAVE_SIMD_256_BITS
  
  /****
diff --git a/src/util/arch/x86/simd_utils.h b/src/util/arch/x86/simd_utils.h

index 49797ababa91ded8a574d8bd1315b6b9b45ffab1..9c2984c0e412050a6f880dc0637174999ae792b4 100644 (file)
--- a/src/util/arch/x86/simd_utils.h
+++ b/src/util/arch/x86/simd_utils.h
@@ -122,6 +122,17 @@ m128 sub_2x64(m128 a, m128 b) {
      return (m128) _mm_sub_epi64(a, b);
  }
  
+static really_really_inline
+m128 lshift32_m128(m128 a, unsigned b) {
+#if defined(HAVE__BUILTIN_CONSTANT_P)
+    if (__builtin_constant_p(b)) {
+        return _mm_slli_epi32(a, b);
+    }
+#endif
+    m128 x = _mm_cvtsi32_si128(b);
+    return _mm_sll_epi32(a, x);
+}
+
  static really_really_inline
  m128 lshift64_m128(m128 a, unsigned b) {
  #if defined(HAVE__BUILTIN_CONSTANT_P)
@@ -156,6 +167,10 @@ static really_inline m128 set1_2x64(u64a c) {
      return _mm_set1_epi64x(c);
  }
  
+static really_inline m128 insert32_m128(m128 in, u32 val, const int imm) {
+    return _mm_insert_epi32(in, val, imm);
+}
+
  static really_inline u32 movd(const m128 in) {
      return _mm_cvtsi128_si32(in);
  }
@@ -451,6 +466,18 @@ m128 set2x64(u64a hi, u64a lo) {
      return _mm_set_epi64x(hi, lo);
  }
  
+#include "../print_simd.h"
+
+static really_inline
+m128 widenlo128(m128 x) {
+    return _mm_unpacklo_epi32(x, zeroes128());
+}
+
+static really_inline
+m128 widenhi128(m128 x) {
+    return _mm_unpackhi_epi32(x, zeroes128());
+}
+
  /****
   **** 256-bit Primitives
   ****/
@@ -677,6 +704,12 @@ m256 combine2x128(m128 hi, m128 lo) {
      return insert128to256(cast128to256(lo), hi, 1);
  #endif
  }
+
+static really_inline
+m256 widen128(m128 x) {
+    return (m256) _mm256_cvtepu32_epi64(x);
+}
+
  #endif //AVX2
  
  /****
author	Konstantinos Margaritis <markos@freevec.org>
	Mon, 19 Feb 2024 11:09:02 +0000 (13:09 +0200)
committer	Konstantinos Margaritis <konstantinos@vectorcamp.gr>
	Mon, 19 Feb 2024 11:51:53 +0000 (19:51 +0800)
src/fdr/fdr.c		patch \| blob \| blame \| history
src/util/arch/arm/simd_utils.h		patch \| blob \| blame \| history
src/util/arch/common/simd_utils.h		patch \| blob \| blame \| history
src/util/arch/x86/simd_utils.h		patch \| blob \| blame \| history