}
)
-;; div optimizations using narrowings
-;; we can do the division e.g. shorts by 255 faster by calculating it as
-;; (x + ((x + 257) >> 8)) >> 8 assuming the operation is done in
-;; double the precision of x.
-;;
-;; If we imagine a short as being composed of two blocks of bytes then
-;; adding 257 or 0b0000_0001_0000_0001 to the number is equivalent to
-;; adding 1 to each sub component:
-;;
-;;             short value of 16-bits
-;; ┌──────────────┬────────────────┐
-;; │              │                │
-;; └──────────────┴────────────────┘
-;;   8-bit part1 ▲  8-bit part2   ▲
-;;               │                │
-;;               │                │
-;;              +1               +1
-;;
-;; after the first addition, we have to shift right by 8, and narrow the
-;; results back to a byte. Remember that the addition must be done in
-;; double the precision of the input. Since 8 is half the size of a short
-;; we can use a narrowing halfing instruction in AArch64, addhn which also
-;; does the addition in a wider precision and narrows back to a byte. The
-;; shift itself is implicit in the operation as it writes back only the top
-;; half of the result. i.e. bits 2*esize-1:esize.
-;;
-;; Since we have narrowed the result of the first part back to a byte, for
-;; the second addition we can use a widening addition, uaddw.
-;;
-;; For the final shift, since it's unsigned arithmetic we emit an ushr by 8.
-;;
-;; The shift is later optimized by combine to a uzp2 with movi #0.
-(define_expand "@aarch64_bitmask_udiv<mode>3"
-  [(match_operand:VQN 0 "register_operand")
-   (match_operand:VQN 1 "register_operand")
-   (match_operand:VQN 2 "immediate_operand")]
+;; Optimize ((a + b) >> n) + c where n is half the bitsize of the vector
+;; element.
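+;; This shape comes from the middle end expanding an unsigned division by
+;; an all-ones constant such as 0xff as shifts and adds instead of a
+;; highpart multiply.  The split implements it as an ADDHN, which performs
+;; the inner addition at full width and writes back only the top half of
+;; each element (making the shift implicit), followed by a UADDW that
+;; widens the narrowed result and adds c.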
+(define_insn_and_split "*bitmask_shift_plus<mode>"
+  [(set (match_operand:VQN 0 "register_operand" "=&w")
+        (plus:VQN
+          (lshiftrt:VQN
+            (plus:VQN (match_operand:VQN 1 "register_operand" "w")
+                      (match_operand:VQN 2 "register_operand" "w"))
+            (match_operand:VQN 3 "aarch64_simd_shift_imm_vec_exact_top" ""))
+          (match_operand:VQN 4 "register_operand" "w")))]
  "TARGET_SIMD"
+  "#"
+  "&& true"
+  [(const_int 0)]
{
-  unsigned HOST_WIDE_INT size
-    = (1ULL << GET_MODE_UNIT_BITSIZE (<VNARROWQ>mode)) - 1;
-  rtx elt = unwrap_const_vec_duplicate (operands[2]);
-  if (!CONST_INT_P (elt) || UINTVAL (elt) != size)
-    FAIL;
-
-  rtx addend = gen_reg_rtx (<MODE>mode);
-  rtx val = aarch64_simd_gen_const_vector_dup (<VNARROWQ2>mode, 1);
-  emit_move_insn (addend, lowpart_subreg (<MODE>mode, val, <VNARROWQ2>mode));
-  rtx tmp1 = gen_reg_rtx (<VNARROWQ>mode);
-  rtx tmp2 = gen_reg_rtx (<MODE>mode);
-  emit_insn (gen_aarch64_addhn<mode> (tmp1, operands[1], addend));
-  unsigned bitsize = GET_MODE_UNIT_BITSIZE (<VNARROWQ>mode);
-  rtx shift_vector = aarch64_simd_gen_const_vector_dup (<MODE>mode, bitsize);
-  emit_insn (gen_aarch64_uaddw<Vnarrowq> (tmp2, operands[1], tmp1));
-  emit_insn (gen_aarch64_simd_lshr<mode> (operands[0], tmp2, shift_vector));
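+  /* Before reload use a fresh pseudo for the narrow temporary; after
+     reload reuse operand 0 in the narrow mode, which cannot overlap the
+     inputs because of the "=&w" earlyclobber.  */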
+  rtx tmp;
+  if (can_create_pseudo_p ())
+    tmp = gen_reg_rtx (<VNARROWQ>mode);
+  else
+    tmp = gen_rtx_REG (<VNARROWQ>mode, REGNO (operands[0]));
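+  /* ADDHN adds at full element width and writes back only the top half of
+     each result, so the shift by half the element size is implicit; UADDW
+     then widens the narrow temporary and adds operand 4.  */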
+  emit_insn (gen_aarch64_addhn<mode> (tmp, operands[1], operands[2]));
+  emit_insn (gen_aarch64_uaddw<Vnarrowq> (operands[0], operands[4], tmp));
  DONE;
})
;; ---- [INT] Reciprocal approximation
;; ---- [INT<-FP] Base-2 logarithm
;; ---- [INT] Polynomial multiplication
-;; ---- [INT] Misc optab implementations
;;
;; == Permutation
;; ---- [INT,FP] General permutes
"<sve_int_op>\t%0.<Ventype>, %2.<Vetype>, %3.<Vetype>"
)
+;; Optimize ((a + b) >> n) where n is half the bitsize of the vector
+;; element.
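+;; ADDHNB adds at full element width and places the top half of each sum in
+;; the even (bottom) narrow elements while zeroing the odd ones, which viewed
+;; in the wide mode is exactly (a + b) >> n.  Together with the vectorizer's
+;; shift-based division expansion this replaces the
+;; @aarch64_bitmask_udiv<mode>3 expander removed below.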
+(define_insn "*bitmask_shift_plus<mode>"
+  [(set (match_operand:SVE_FULL_HSDI 0 "register_operand" "=w")
+        (unspec:SVE_FULL_HSDI
+          [(match_operand:<VPRED> 1)
+           (lshiftrt:SVE_FULL_HSDI
+             (plus:SVE_FULL_HSDI
+               (match_operand:SVE_FULL_HSDI 2 "register_operand" "w")
+               (match_operand:SVE_FULL_HSDI 3 "register_operand" "w"))
+             (match_operand:SVE_FULL_HSDI 4
+                "aarch64_simd_shift_imm_vec_exact_top" ""))]
+          UNSPEC_PRED_X))]
+  "TARGET_SVE2"
+  "addhnb\t%0.<Ventype>, %2.<Vetype>, %3.<Vetype>"
+)
+
;; -------------------------------------------------------------------------
;; ---- [INT] Narrowing right shifts
;; -------------------------------------------------------------------------
"<sve_int_op>\t%0.<Vewtype>, %1.<Vetype>, %2.<Vetype>"
)
-;; -------------------------------------------------------------------------
-;; ---- [INT] Misc optab implementations
-;; -------------------------------------------------------------------------
-;; Includes:
-;; - aarch64_bitmask_udiv
-;; -------------------------------------------------------------------------
-
-;; div optimizations using narrowings
-;; we can do the division e.g. shorts by 255 faster by calculating it as
-;; (x + ((x + 257) >> 8)) >> 8 assuming the operation is done in
-;; double the precision of x.
-;;
-;; See aarch64-simd.md for bigger explanation.
-(define_expand "@aarch64_bitmask_udiv<mode>3"
-  [(match_operand:SVE_FULL_HSDI 0 "register_operand")
-   (match_operand:SVE_FULL_HSDI 1 "register_operand")
-   (match_operand:SVE_FULL_HSDI 2 "immediate_operand")]
-  "TARGET_SVE2"
-{
-  unsigned HOST_WIDE_INT size
-    = (1ULL << GET_MODE_UNIT_BITSIZE (<VNARROW>mode)) - 1;
-  rtx elt = unwrap_const_vec_duplicate (operands[2]);
-  if (!CONST_INT_P (elt) || UINTVAL (elt) != size)
-    FAIL;
-
-  rtx addend = gen_reg_rtx (<MODE>mode);
-  rtx tmp1 = gen_reg_rtx (<VNARROW>mode);
-  rtx tmp2 = gen_reg_rtx (<VNARROW>mode);
-  rtx val = aarch64_simd_gen_const_vector_dup (<VNARROW>mode, 1);
-  emit_move_insn (addend, lowpart_subreg (<MODE>mode, val, <VNARROW>mode));
-  emit_insn (gen_aarch64_sve (UNSPEC_ADDHNB, <MODE>mode, tmp1, operands[1],
-                              addend));
-  emit_insn (gen_aarch64_sve (UNSPEC_ADDHNB, <MODE>mode, tmp2, operands[1],
-                              lowpart_subreg (<MODE>mode, tmp1,
-                                              <VNARROW>mode)));
-  emit_move_insn (operands[0],
-                  lowpart_subreg (<MODE>mode, tmp2, <VNARROW>mode));
-  DONE;
-})
-
;; =========================================================================
;; == Permutation
;; =========================================================================
return default_vectorize_related_mode (vector_mode, element_mode, nunits);
}
+/* Implement TARGET_VECTORIZE_PREFERRED_DIV_AS_SHIFTS_OVER_MULT. */
+
+static bool
+aarch64_vectorize_preferred_div_as_shifts_over_mult (const_tree type)
+{
+  machine_mode mode = TYPE_MODE (type);
+  unsigned int vec_flags = aarch64_classify_vector_mode (mode);
+  bool sve_p = (vec_flags & VEC_ANY_SVE);
+  bool simd_p = (vec_flags & VEC_ADVSIMD);
+
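+  /* The shift-based expansion relies on the ADDHN (Advanced SIMD) and
+     ADDHNB (SVE2) narrowing-add patterns, hence the ISA requirements.  */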
+  return (sve_p && TARGET_SVE2) || (simd_p && TARGET_SIMD);
+}
+
/* Implement TARGET_PREFERRED_ELSE_VALUE. For binary operations,
prefer to use the first arithmetic operand as the else value if
the else value doesn't matter, since that exactly matches the SVE
return ret;
}
-
-/* Implement TARGET_VECTORIZE_CAN_SPECIAL_DIV_BY_CONST. */
-
-bool
-aarch64_vectorize_can_special_div_by_constant (enum tree_code code,
-                                               tree vectype, wide_int cst,
-                                               rtx *output, rtx in0, rtx in1)
-{
-  if (code != TRUNC_DIV_EXPR
-      || !TYPE_UNSIGNED (vectype))
-    return false;
-
-  machine_mode mode = TYPE_MODE (vectype);
-  unsigned int flags = aarch64_classify_vector_mode (mode);
-  if ((flags & VEC_ANY_SVE) && !TARGET_SVE2)
-    return false;
-
-  int pow = wi::exact_log2 (cst + 1);
-  auto insn_code = maybe_code_for_aarch64_bitmask_udiv3 (TYPE_MODE (vectype));
-  /* SVE actually has a div operator, we may have gotten here through
-     that route.  */
-  if (pow != (int) (element_precision (vectype) / 2)
-      || insn_code == CODE_FOR_nothing)
-    return false;
-
-  /* We can use the optimized pattern.  */
-  if (in0 == NULL_RTX && in1 == NULL_RTX)
-    return true;
-
-  gcc_assert (output);
-
-  expand_operand ops[3];
-  create_output_operand (&ops[0], *output, mode);
-  create_input_operand (&ops[1], in0, mode);
-  create_fixed_operand (&ops[2], in1);
-  expand_insn (insn_code, 3, ops);
-  *output = ops[0].value;
-  return true;
-}
-
/* Generate a byte permute mask for a register of mode MODE,
which has NUNITS units. */
#undef TARGET_MAX_ANCHOR_OFFSET
#define TARGET_MAX_ANCHOR_OFFSET 4095
+#undef TARGET_VECTORIZE_PREFERRED_DIV_AS_SHIFTS_OVER_MULT
+#define TARGET_VECTORIZE_PREFERRED_DIV_AS_SHIFTS_OVER_MULT \
+ aarch64_vectorize_preferred_div_as_shifts_over_mult
+
#undef TARGET_VECTOR_ALIGNMENT
#define TARGET_VECTOR_ALIGNMENT aarch64_simd_vector_alignment
-#undef TARGET_VECTORIZE_CAN_SPECIAL_DIV_BY_CONST
-#define TARGET_VECTORIZE_CAN_SPECIAL_DIV_BY_CONST \
- aarch64_vectorize_can_special_div_by_constant
-
#undef TARGET_VECTORIZE_PREFERRED_VECTOR_ALIGNMENT
#define TARGET_VECTORIZE_PREFERRED_VECTOR_ALIGNMENT \
aarch64_vectorize_preferred_vector_alignment