}
)
+;; div optimizations using narrowings
+;; We can do the division of e.g. shorts by 255 faster by calculating it as
+;; (x + ((x + 257) >> 8)) >> 8, assuming the operation is done in
+;; double the precision of x.
+;;
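+;; For example, dividing x = 1000 by 255 using 32-bit intermediates:
+;;   (1000 + 257) >> 8 = 1257 >> 8 = 4
+;;   (1000 + 4) >> 8   = 1004 >> 8 = 3, which matches 1000 / 255 = 3.
+;;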
+;; If we imagine a short as being composed of two blocks of bytes then
+;; adding 257 or 0b0000_0001_0000_0001 to the number is equivalent to
+;; adding 1 to each sub-component:
+;;
+;;      short value of 16-bits
+;; ┌──────────────┬────────────────┐
+;; │              │                │
+;; └──────────────┴────────────────┘
+;;   8-bit part1  ▲   8-bit part2  ▲
+;;                │                │
+;;                │                │
+;;               +1               +1
+;;
+;; After the first addition, we have to shift right by 8, and narrow the
+;; result back to a byte. Remember that the addition must be done in
+;; double the precision of the input. Since 8 is half the size of a short,
+;; we can use a narrowing halving instruction in AArch64, addhn, which also
+;; does the addition in a wider precision and narrows back to a byte. The
+;; shift itself is implicit in the operation as it writes back only the top
+;; half of the result, i.e. bits 2*esize-1:esize.
+;;
+;; Since we have narrowed the result of the first part back to a byte, for
+;; the second addition we can use a widening addition, uaddw.
+;;
+;; For the final shift, since it's unsigned arithmetic we emit an ushr by 8.
+;;
+;; The shift is later optimized by combine to a uzp2 with movi #0.
+(define_expand "@aarch64_bitmask_udiv<mode>3"
+  [(match_operand:VQN 0 "register_operand")
+   (match_operand:VQN 1 "register_operand")
+   (match_operand:VQN 2 "immediate_operand")]
+  "TARGET_SIMD"
+{
+  unsigned HOST_WIDE_INT size
+    = (1ULL << GET_MODE_UNIT_BITSIZE (<VNARROWQ>mode)) - 1;
+  rtx elt = unwrap_const_vec_duplicate (operands[2]);
+  if (!CONST_INT_P (elt) || UINTVAL (elt) != size)
+    FAIL;
+
+  rtx addend = gen_reg_rtx (<MODE>mode);
+  rtx val = aarch64_simd_gen_const_vector_dup (<VNARROWQ2>mode, 1);
+  emit_move_insn (addend, lowpart_subreg (<MODE>mode, val, <VNARROWQ2>mode));
+  rtx tmp1 = gen_reg_rtx (<VNARROWQ>mode);
+  rtx tmp2 = gen_reg_rtx (<MODE>mode);
+  emit_insn (gen_aarch64_addhn<mode> (tmp1, operands[1], addend));
+  unsigned bitsize = GET_MODE_UNIT_BITSIZE (<VNARROWQ>mode);
+  rtx shift_vector = aarch64_simd_gen_const_vector_dup (<MODE>mode, bitsize);
+  emit_insn (gen_aarch64_uaddw<Vnarrowq> (tmp2, operands[1], tmp1));
+  emit_insn (gen_aarch64_simd_lshr<mode> (operands[0], tmp2, shift_vector));
+  DONE;
+})
+
;; pmul.
(define_insn "aarch64_pmul<mode>"
return ret;
}
+/* Implement TARGET_VECTORIZE_CAN_SPECIAL_DIV_BY_CONST. */
+
+bool
+aarch64_vectorize_can_special_div_by_constant (enum tree_code code,
+                                               tree vectype, wide_int cst,
+                                               rtx *output, rtx in0, rtx in1)
+{
+  if (code != TRUNC_DIV_EXPR
+      || !TYPE_UNSIGNED (vectype))
+    return false;
+
+  unsigned int flags = aarch64_classify_vector_mode (TYPE_MODE (vectype));
+  if ((flags & VEC_ANY_SVE) && !TARGET_SVE2)
+    return false;
+
+  if (in0 == NULL_RTX && in1 == NULL_RTX)
+    {
+      /* The divisor must be all ones in the narrowed element, i.e.
+         2^(precision/2) - 1 (e.g. 0xff when dividing 16-bit elements).  */
+      wide_int val = wi::add (cst, 1);
+      int pow = wi::exact_log2 (val);
+      return pow == (int) (element_precision (vectype) / 2);
+    }
+
+  if (!VECTOR_TYPE_P (vectype))
+    return false;
+
+  gcc_assert (output);
+
+  if (!*output)
+    *output = gen_reg_rtx (TYPE_MODE (vectype));
+
+  emit_insn (gen_aarch64_bitmask_udiv3 (TYPE_MODE (vectype), *output, in0,
+                                        in1));
+  return true;
+}
+
/* Generate a byte permute mask for a register of mode MODE,
which has NUNITS units. */
#undef TARGET_VECTOR_ALIGNMENT
#define TARGET_VECTOR_ALIGNMENT aarch64_simd_vector_alignment
+#undef TARGET_VECTORIZE_CAN_SPECIAL_DIV_BY_CONST
+#define TARGET_VECTORIZE_CAN_SPECIAL_DIV_BY_CONST \
+  aarch64_vectorize_can_special_div_by_constant
+
#undef TARGET_VECTORIZE_PREFERRED_VECTOR_ALIGNMENT
#define TARGET_VECTORIZE_PREFERRED_VECTOR_ALIGNMENT \
aarch64_vectorize_preferred_vector_alignment
--- /dev/null
+/* { dg-do compile } */
+/* { dg-additional-options "-O3 -std=c99" } */
+/* { dg-final { check-function-bodies "**" "" "" { target { le } } } } */
+
+#include <stdint.h>
+
+#pragma GCC target "+nosve"
+
+/*
+** draw_bitmap1:
+** ...
+** addhn v[0-9]+.8b, v[0-9]+.8h, v[0-9]+.8h
+** addhn v[0-9]+.8b, v[0-9]+.8h, v[0-9]+.8h
+** uaddw v[0-9]+.8h, v[0-9]+.8h, v[0-9]+.8b
+** uaddw v[0-9]+.8h, v[0-9]+.8h, v[0-9]+.8b
+** uzp2 v[0-9]+.16b, v[0-9]+.16b, v[0-9]+.16b
+** ...
+*/
+void draw_bitmap1(uint8_t* restrict pixel, uint8_t level, int n)
+{
+  for (int i = 0; i < (n & -16); i+=1)
+    pixel[i] = (pixel[i] * level) / 0xff;
+}
+
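+/* 0xfe + 1 is not a power of two, so this divisor is rejected by the hook
+   and no special code generation is expected (hence no body check).  */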
+void draw_bitmap2(uint8_t* restrict pixel, uint8_t level, int n)
+{
+  for (int i = 0; i < (n & -16); i+=1)
+    pixel[i] = (pixel[i] * level) / 0xfe;
+}
+
+/*
+** draw_bitmap3:
+** ...
+** addhn v[0-9]+.4h, v[0-9]+.4s, v[0-9]+.4s
+** addhn v[0-9]+.4h, v[0-9]+.4s, v[0-9]+.4s
+** uaddw v[0-9]+.4s, v[0-9]+.4s, v[0-9]+.4h
+** uaddw v[0-9]+.4s, v[0-9]+.4s, v[0-9]+.4h
+** uzp2 v[0-9]+.8h, v[0-9]+.8h, v[0-9]+.8h
+** ...
+*/
+void draw_bitmap3(uint16_t* restrict pixel, uint16_t level, int n)
+{
+  for (int i = 0; i < (n & -16); i+=1)
+    pixel[i] = (pixel[i] * level) / 0xffffU;
+}
+
+/*
+** draw_bitmap4:
+** ...
+** addhn v[0-9]+.2s, v[0-9]+.2d, v[0-9]+.2d
+** addhn v[0-9]+.2s, v[0-9]+.2d, v[0-9]+.2d
+** uaddw v[0-9]+.2d, v[0-9]+.2d, v[0-9]+.2s
+** uaddw v[0-9]+.2d, v[0-9]+.2d, v[0-9]+.2s
+** uzp2 v[0-9]+.4s, v[0-9]+.4s, v[0-9]+.4s
+** ...
+*/
+void draw_bitmap4(uint32_t* restrict pixel, uint32_t level, int n)
+{
+  for (int i = 0; i < (n & -16); i+=1)
+    pixel[i] = (pixel[i] * (uint64_t)level) / 0xffffffffUL;
+}
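
The expander comment above claims that (x + ((x + 257) >> 8)) >> 8 equals
x / 255 when the additions are done in double the precision of x. As a
standalone sanity check, not part of the patch, that identity can be
verified exhaustively for 16-bit inputs with a few lines of scalar C:

#include <assert.h>
#include <stdint.h>
#include <stdio.h>

int
main (void)
{
  /* Check (x + ((x + 257) >> 8)) >> 8 == x / 255 for every 16-bit x,
     holding the intermediate sums in 32 bits, i.e. double the precision
     of the 16-bit input, as the aarch64-simd.md comment requires.  */
  for (uint32_t x = 0; x <= UINT16_MAX; x++)
    {
      uint32_t q = (x + ((x + 257) >> 8)) >> 8;
      assert (q == x / 255);
    }
  printf ("x / 255 identity holds for all 16-bit x\n");
  return 0;
}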