Like r16-105-g599bca27dc37b3, the patch handles redundant cleanup of
upper bits of the mask for maskload, i.e.
Successfully matched this instruction:
(set (reg:V4DF 175)
(vec_merge:V4DF (unspec:V4DF [
(mem:V4DF (plus:DI (reg/v/f:DI 155 [ b ])
(reg:DI 143 [ ivtmp.56 ])) [1 S32 A64])
] UNSPEC_MASKLOAD)
(const_vector:V4DF [
(const_double:DF 0.0 [0x0.0p+0]) repeated x4
])
(and:QI (reg:QI 125 [ mask__29.16 ])
(const_int 15 [0xf]))))
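With the new define_insn_and_split, the (and ... (const_int 15)) on the mask
is dropped and the insn splits back to the plain masked load, roughly (a
sketch derived from the matched insn above and the split pattern below, not
actual dump output):

(set (reg:V4DF 175)
(vec_merge:V4DF (unspec:V4DF [
(mem:V4DF (plus:DI (reg/v/f:DI 155 [ b ])
(reg:DI 143 [ ivtmp.56 ])) [1 S32 A64])
] UNSPEC_MASKLOAD)
(const_vector:V4DF [
(const_double:DF 0.0 [0x0.0p+0]) repeated x4
])
(reg:QI 125 [ mask__29.16 ])))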
For maskstore, the generated code already looks optimal (at least I couldn't
construct a testcase that shows the problem), so the patch only handles
maskload.
gcc/ChangeLog:
PR target/103750
* config/i386/i386.cc (ix86_rtx_costs): Adjust rtx_cost for
maskload.
* config/i386/sse.md (*<avx512>_load<mode>mask_and15): New
define_insn_and_split.
(*<avx512>_load<mode>mask_and3): Ditto.
gcc/testsuite/ChangeLog:
* gcc.target/i386/avx512f-pr103750-3.c: New test.
}
/* This is masked instruction, assume the same cost,
as nonmasked variant. */
- else if (TARGET_AVX512F && register_operand (mask, GET_MODE (mask)))
+ else if (TARGET_AVX512F
+ && (register_operand (mask, GET_MODE (mask))
+ /* Redundant cleanup of high bits for kmask with VL=2/4,
+ i.e. (vec_merge op0, op1, (and op3 15)). */
+ || (GET_CODE (mask) == AND
+ && register_operand (XEXP (mask, 0), GET_MODE (mask))
+ && CONST_INT_P (XEXP (mask, 1))
+ && ((INTVAL (XEXP (mask, 1)) == 3
+ && GET_MODE_NUNITS (mode) == 2)
+ || (INTVAL (XEXP (mask, 1)) == 15
+ && GET_MODE_NUNITS (mode) == 4)))))
{
*total = rtx_cost (XEXP (x, 0), mode, outer_code, opno, speed)
+ rtx_cost (XEXP (x, 1), mode, outer_code, opno, speed);
"&& 1"
[(set (match_dup 0) (match_dup 1))])
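+;; Drop a redundant cleanup of the kmask: for a 4-element vector the
+;; vec_merge only consults the low 4 bits of the QImode mask, so
+;; (and mask 15) is equivalent to mask and the AND can be removed when
+;; splitting back to the plain masked load.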
+(define_insn_and_split "*<avx512>_load<mode>mask_and15"
+ [(set (match_operand:V48_AVX512VL_4 0 "register_operand" "=v")
+ (vec_merge:V48_AVX512VL_4
+ (unspec:V48_AVX512VL_4
+ [(match_operand:V48_AVX512VL_4 1 "memory_operand" "m")]
+ UNSPEC_MASKLOAD)
+ (match_operand:V48_AVX512VL_4 2 "nonimm_or_0_operand" "0C")
+ (and:QI
+ (match_operand:QI 3 "register_operand" "Yk")
+ (const_int 15))))]
+ "TARGET_AVX512F"
+ "#"
+ "&& 1"
+ [(set (match_dup 0)
+ (vec_merge:V48_AVX512VL_4
+ (unspec:V48_AVX512VL_4 [(match_dup 1)] UNSPEC_MASKLOAD)
+ (match_dup 2)
+ (match_dup 3)))])
+
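+;; Likewise for 2-element vectors, where only the low 2 bits of the
+;; mask matter and (and mask 3) is equivalent to mask.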
+(define_insn_and_split "*<avx512>_load<mode>mask_and3"
+ [(set (match_operand:V8_AVX512VL_2 0 "register_operand" "=v")
+ (vec_merge:V8_AVX512VL_2
+ (unspec:V8_AVX512VL_2
+ [(match_operand:V8_AVX512VL_2 1 "memory_operand" "m")]
+ UNSPEC_MASKLOAD)
+ (match_operand:V8_AVX512VL_2 2 "nonimm_or_0_operand" "0C")
+ (and:QI
+ (match_operand:QI 3 "register_operand" "Yk")
+ (const_int 3))))]
+ "TARGET_AVX512F"
+ "#"
+ "&& 1"
+ [(set (match_dup 0)
+ (vec_merge:V8_AVX512VL_2
+ (unspec:V8_AVX512VL_2 [(match_dup 1)] UNSPEC_MASKLOAD)
+ (match_dup 2)
+ (match_dup 3)))])
+
(define_expand "<avx512>_load<mode>_mask"
[(set (match_operand:VI12_AVX512VL 0 "register_operand")
(vec_merge:VI12_AVX512VL
--- /dev/null
+++ b/gcc/testsuite/gcc.target/i386/avx512f-pr103750-3.c
+/* { dg-do compile } */
+/* { dg-options "-march=x86-64-v4 -mprefer-vector-width=256 -Ofast" } */
+/* { dg-final { scan-assembler-not "kmov" } } */
+
+void
+foo (double* a, double* __restrict b, double* c, double* d, int n)
+{
+ for (int i = 0; i != n; i++)
+ {
+ double tmp = 0.0;
+ if (c[i] > d[i])
+ tmp = b[i];
+ a[i] = tmp;
+ }
+}
+
+void
+foo1 (double* a, double* __restrict b, double* c, double* d, int n)
+{
+ for (int i = 0; i != n; i++)
+ {
+ double tmp = 0.0;
+ if (c[i] > d[i])
+ a[i] = b[i];
+ }
+}