aarch64: convert vector shift + bitwise and + multiply to vector compare

author mtsamis <manolis.tsamis@vrull.eu>

Mon, 1 Aug 2022 12:11:02 +0000 (14:11 +0200)

committer Philipp Tomsich <philipp.tomsich@vrull.eu>

Thu, 11 May 2023 19:59:09 +0000 (21:59 +0200)
author mtsamis <manolis.tsamis@vrull.eu>
Mon, 1 Aug 2022 12:11:02 +0000 (14:11 +0200)
committer Philipp Tomsich <philipp.tomsich@vrull.eu>
Thu, 11 May 2023 19:59:09 +0000 (21:59 +0200)
diff --git a/gcc/match.pd b/gcc/match.pd

index dde95766e8b26cc4a9994e8cc9faae5b31176aaa..b7f28ab074c2107d9799ad2ddf795e42f14e5ea1 100644 (file)
--- a/gcc/match.pd
+++ b/gcc/match.pd
@@ -303,6 +303,67 @@ DEFINE_INT_AND_FLOAT_ROUND_FN (RINT)
      (view_convert (bit_and:itype (view_convert @0)
                                  (ne @1 { build_zero_cst (type); })))))))
  
+/* In SWAR (SIMD within a register) code a signed comparison of packed data
+   can be constructed with a particular combination of shift, bitwise and,
+   and multiplication by constants.  If that code is vectorized we can
+   convert this pattern into a more efficient vector comparison.  */
+(simplify
+ (mult (bit_and (rshift @0 uniform_integer_cst_p@1)
+           uniform_integer_cst_p@2)
+    uniform_integer_cst_p@3)
+ (with {
+   tree rshift_cst = uniform_integer_cst_p (@1);
+   tree bit_and_cst = uniform_integer_cst_p (@2);
+   tree mult_cst = uniform_integer_cst_p (@3);
+  }
+  /* Make sure we're working with vectors and uniform vector constants.  */
+  (if (VECTOR_TYPE_P (type)
+       && tree_fits_uhwi_p (rshift_cst)
+       && tree_fits_uhwi_p (mult_cst)
+       && tree_fits_uhwi_p (bit_and_cst))
+   /* Compute what constants would be needed for this to represent a packed
+      comparison based on the shift amount denoted by RSHIFT_CST.  */
+   (with {
+     HOST_WIDE_INT vec_elem_bits = vector_element_bits (type);
+     poly_int64 vec_nelts = TYPE_VECTOR_SUBPARTS (type);
+     poly_int64 vec_bits = vec_elem_bits * vec_nelts;
+     unsigned HOST_WIDE_INT cmp_bits_i = tree_to_uhwi (rshift_cst) + 1;
+     unsigned HOST_WIDE_INT mult_i = tree_to_uhwi (mult_cst);
+     unsigned HOST_WIDE_INT bit_and_i = tree_to_uhwi (bit_and_cst);
+     unsigned HOST_WIDE_INT target_mult_i = 0, target_bit_and_i = 0;
+     /* CMP_BITS_I can wrap to 0 or reach the host word width (shift by 63 of
+       64-bit lanes); guard the division and shifts below against both.  */
+     bool cmp_bits_ok = (exact_log2 (cmp_bits_i) >= 0
+                        && cmp_bits_i < HOST_BITS_PER_WIDE_INT
+                        && vec_elem_bits <= HOST_BITS_PER_WIDE_INT);
+     if (cmp_bits_ok)
+       target_mult_i = (HOST_WIDE_INT_1U << cmp_bits_i) - 1;
+     /* The bit pattern in BIT_AND_I should be a mask for the least
+       significant bit of each packed element that is CMP_BITS wide.  */
+     for (unsigned i = 0; cmp_bits_ok && i < vec_elem_bits / cmp_bits_i; i++)
+       target_bit_and_i = (target_bit_and_i << cmp_bits_i) | 1U;
+    }
+    (if (cmp_bits_ok
+        && multiple_p (vec_bits, cmp_bits_i)
+        && target_mult_i == mult_i && target_bit_and_i == bit_and_i)
+     /* Compute the vector shape for the comparison and check if the target is
+       able to expand the comparison with that type.  */
+     (with {
+       /* We're doing a signed comparison.  */
+       tree cmp_type = build_nonstandard_integer_type (cmp_bits_i, 0);
+       poly_int64 vector_type_nelts = exact_div (vec_bits, cmp_bits_i);
+       tree vec_cmp_type = build_vector_type (cmp_type, vector_type_nelts);
+       tree vec_truth_type = truth_type_for (vec_cmp_type);
+       tree zeros = build_zero_cst (vec_cmp_type);
+       tree ones = build_all_ones_cst (vec_cmp_type);
+      }
+      (if (expand_vec_cmp_expr_p (vec_cmp_type, vec_truth_type, LT_EXPR)
+          && expand_vec_cond_expr_p (vec_cmp_type, vec_truth_type, LT_EXPR))
+       (view_convert:type (vec_cond (lt:vec_truth_type
+                                    (view_convert:vec_cmp_type @0)
+                                    { zeros; })
+                          { ones; } { zeros; })))))))))
+
  (for cmp (gt ge lt le)
       outp (convert convert negate negate)
       outn (negate negate convert convert)
diff --git a/gcc/testsuite/gcc.target/aarch64/swar_to_vec_cmp.c b/gcc/testsuite/gcc.target/aarch64/swar_to_vec_cmp.c

new file mode 100644 (file)

index 0000000..26f9ad9
--- /dev/null
+++ b/gcc/testsuite/gcc.target/aarch64/swar_to_vec_cmp.c
@@ -0,0 +1,72 @@
+/* { dg-do compile } */
+/* { dg-options "-O2 -ftree-vectorize" } */
+
+typedef unsigned char uint8_t;  /* Local aliases: no header is included.  */
+typedef unsigned short uint16_t;
+typedef unsigned int uint32_t;
+
+/* 8-bit SWAR tests.  */
+
+static uint8_t packed_cmp_8_8(uint8_t a)
+{
+  return ((a >> 7) & 0x1U) * 0xffU;  /* 0xff if bit 7 is set, else 0.  */
+}
+
+/* 16-bit SWAR tests.  */
+
+static uint16_t packed_cmp_8_16(uint16_t a)
+{
+  return ((a >> 7) & 0x101U) * 0xffU;  /* 0xff per byte whose top bit is set.  */
+}
+
+static uint16_t packed_cmp_16_16(uint16_t a)
+{
+  return ((a >> 15) & 0x1U) * 0xffffU;  /* 0xffff if bit 15 is set, else 0.  */
+}
+
+/* 32-bit SWAR tests.  */
+
+static uint32_t packed_cmp_8_32(uint32_t a)
+{
+  return ((a >> 7) & 0x1010101U) * 0xffU;  /* 0xff per byte whose top bit is set.  */
+}
+
+static uint32_t packed_cmp_16_32(uint32_t a)
+{
+  return ((a >> 15) & 0x10001U) * 0xffffU;  /* 0xffff per half whose top bit is set.  */
+}
+
+static uint32_t packed_cmp_32_32(uint32_t a)
+{
+  return ((a >> 31) & 0x1U) * 0xffffffffU;  /* All ones if bit 31 is set, else 0.  */
+}
+
+/* Driver to test the vectorized code: defines vectorized_cmp_FUNC, applying
+   FUNC in place to the first N (rounded down to a multiple of 32) elements.  */
+
+#define VECTORIZED_PACKED_CMP(T, FUNC)         \
+  void vectorized_cmp_##FUNC(T* a, int n)      \
+  {                                            \
+    n = (n / 32) * 32;                         \
+    for(int i = 0; i < n; i += 4)              \
+    {                                          \
+      a[i + 0] = FUNC(a[i + 0]);               \
+      a[i + 1] = FUNC(a[i + 1]);               \
+      a[i + 2] = FUNC(a[i + 2]);               \
+      a[i + 3] = FUNC(a[i + 3]);               \
+    }                                          \
+  }
+
+VECTORIZED_PACKED_CMP(uint8_t, packed_cmp_8_8);  /* vectorized_cmp_packed_cmp_8_8 */
+
+VECTORIZED_PACKED_CMP(uint16_t, packed_cmp_8_16);
+VECTORIZED_PACKED_CMP(uint16_t, packed_cmp_16_16);
+
+VECTORIZED_PACKED_CMP(uint32_t, packed_cmp_8_32);
+VECTORIZED_PACKED_CMP(uint32_t, packed_cmp_16_32);
+VECTORIZED_PACKED_CMP(uint32_t, packed_cmp_32_32);
+
+/* { dg-final { scan-assembler {\tcmlt\t} } } */
+/* { dg-final { scan-assembler-not {\tushr\t} } } */
+/* { dg-final { scan-assembler-not {\tshl\t} } } */
+/* { dg-final { scan-assembler-not {\tmul\t} } } */
author	mtsamis <manolis.tsamis@vrull.eu>
	Mon, 1 Aug 2022 12:11:02 +0000 (14:11 +0200)
committer	Philipp Tomsich <philipp.tomsich@vrull.eu>
	Thu, 11 May 2023 19:59:09 +0000 (21:59 +0200)
gcc/match.pd		patch \| blob \| blame \| history
gcc/testsuite/gcc.target/aarch64/swar_to_vec_cmp.c	[new file with mode: 0644]	patch \| blob