return new pass_insert_endbr_and_patchable_area (ctxt);
}
+/* Return true if TARGET_AVX, TARGET_SSE_PARTIAL_REG_DEPENDENCY,
+   TARGET_SSE_MATH and optimize are all enabled and the current function
+   (cfun) is being optimized for speed.  Called from an opt_pass gate
+   and from ix86_insn_cost so that both test the same condition.  */
+
+bool
+ix86_rpad_gate ()
+{
+ return (TARGET_AVX
+ && TARGET_SSE_PARTIAL_REG_DEPENDENCY
+ && TARGET_SSE_MATH
+ && optimize
+ && optimize_function_for_speed_p (cfun));
+}
+
/* At entry of the nearest common dominator for basic blocks with
conversions/rcp/sqrt/rsqrt/round, generate a single
vxorps %xmmN, %xmmN, %xmmN
/* opt_pass methods: */
bool gate (function *) final override
{
- return (TARGET_AVX
- && TARGET_SSE_PARTIAL_REG_DEPENDENCY
- && TARGET_SSE_MATH
- && optimize
- && optimize_function_for_speed_p (cfun));
+ return ix86_rpad_gate ();
}
unsigned int execute (function *) final override
flag_cunroll_grow_size = flag_peel_loops || optimize >= 3;
}
- /* Late combine tends to undo some of the effects of STV and RPAD,
- by combining instructions back to their original form. */
- if (!OPTION_SET_P (flag_late_combine_instructions))
- flag_late_combine_instructions = 0;
}
/* Clear stack slot assignments remembered from previous functions.
*/
INSERT_PASS_AFTER (pass_postreload_cse, 1, pass_insert_vzeroupper);
- INSERT_PASS_AFTER (pass_combine, 1, pass_stv, false /* timode_p */);
+ INSERT_PASS_AFTER (pass_late_combine, 1, pass_stv, false /* timode_p */);
/* Run the 64-bit STV pass before the CSE pass so that CONST0_RTX and
CONSTM1_RTX generated by the STV pass can be CSEed. */
INSERT_PASS_BEFORE (pass_cse2, 1, pass_stv, true /* timode_p */);
INSERT_PASS_BEFORE (pass_shorten_branches, 1, pass_insert_endbr_and_patchable_area);
- INSERT_PASS_AFTER (pass_combine, 1, pass_remove_partial_avx_dependency);
+ INSERT_PASS_AFTER (pass_late_combine, 1, pass_remove_partial_avx_dependency);
(gcc::context *);
extern bool ix86_has_no_direct_extern_access;
+extern bool ix86_rpad_gate ();
/* In i386-expand.cc. */
bool ix86_check_builtin_isa_match (unsigned int, HOST_WIDE_INT*,
}
}
+/* Return the cost of INSN: the pattern_cost of its pattern plus, in one
+   case described below, a fixed surcharge.  SPEED is forwarded unchanged
+   to pattern_cost.  Installed as the TARGET_INSN_COST hook.  */
+
+static int
+ix86_insn_cost (rtx_insn *insn, bool speed)
+{
+ int insn_cost = 0;
+ /* Add extra cost so that the post-reload late_combine pass does not
+ revert the optimization done in pass_rpad.  The surcharge applies only
+ after reload, when ix86_rpad_gate holds, and only to insns that are
+ recognized (checked before the attribute is queried) and whose
+ avx_partial_xmm_update attribute is true.  */
+ if (reload_completed
+ && ix86_rpad_gate ()
+ && recog_memoized (insn) >= 0
+ && get_attr_avx_partial_xmm_update (insn)
+ == AVX_PARTIAL_XMM_UPDATE_TRUE)
+ insn_cost += COSTS_N_INSNS (3);
+
+ return insn_cost + pattern_cost (PATTERN (insn), speed);
+}
+
/* Compute a (partial) cost for rtx X. Return true if the complete
cost has been computed, and false if subexpressions should be
scanned. In either case, *TOTAL contains the cost result. */
#define TARGET_MEMORY_MOVE_COST ix86_memory_move_cost
#undef TARGET_RTX_COSTS
#define TARGET_RTX_COSTS ix86_rtx_costs
+#undef TARGET_INSN_COST
+#define TARGET_INSN_COST ix86_insn_cost
#undef TARGET_ADDRESS_COST
#define TARGET_ADDRESS_COST ix86_address_cost
/* { dg-options "-O2 -mavx512f -mavx512dq" } */
/* { dg-additional-options "-fno-PIE" { target ia32 } } */
/* { dg-additional-options "-mdynamic-no-pic" { target { *-*-darwin* && ia32 } } }
-/* { dg-final { scan-assembler-times "\[^\n\]*\\\{1to8\\\}" 2 } } */
-/* { dg-final { scan-assembler-times "\[^\n\]*\\\{1to16\\\}" 2 } } */
+/* { dg-final { scan-assembler-times "\[^\n\]*\\\{1to8\\\}" 2 { target { ! ia32 } } } } */
+/* { dg-final { scan-assembler-times "\[^\n\]*\\\{1to16\\\}" 2 } } */
/* { dg-final { scan-assembler-times "vpbroadcastd\[\\t \]+%(?:r|e)\[^\n\]*, %zmm\[0-9\]+" 3 } } */
/* { dg-final { scan-assembler-times "vpbroadcastq\[\\t \]+%r\[^\n\]*, %zmm\[0-9\]+" 3 { target { ! ia32 } } } } */
/* { dg-options "-O2 -mavx512f" } */
/* { dg-additional-options "-fno-PIE" { target ia32 } } */
/* { dg-additional-options "-mdynamic-no-pic" { target { *-*-darwin* && ia32 } } }
-/* { dg-final { scan-assembler-not "\[^\n\]*\\\{1to8\\\}" { target ia32 } } } */
/* { dg-final { scan-assembler-times "vpbroadcastd\[\\t \]+%(?:r|e)\[^\n\]*, %zmm\[0-9\]+" 4 } } */
/* { dg-final { scan-assembler-times "vpbroadcastq\[\\t \]+%r\[^\n\]*, %zmm\[0-9\]+" 4 { target { ! ia32 } } } } */
/* { dg-do compile } */
/* { dg-options "-mavx512f -O2" } */
-/* { dg-final { scan-assembler-times "vbroadcastss\[^\n\]*%zmm\[0-9\]+" 1 } } */
+/* { dg-final { scan-assembler-times "vbroadcastss\[^\n\]*%zmm\[0-9\]+" 1 { target { ! ia32 } } } } */
/* { dg-final { scan-assembler-times "vfmadd...ps\[^\n\]*%zmm\[0-9\]+" 1 } } */
#define type __m512
/* { dg-do compile } */
/* { dg-options "-mavx512f -O2" } */
-/* { dg-final { scan-assembler-times "vbroadcastss\[^\n\]*%zmm\[0-9\]+" 1 } } */
+/* { dg-final { scan-assembler-times "vbroadcastss\[^\n\]*%zmm\[0-9\]+" 1 { target { ! ia32 } } } } */
/* { dg-final { scan-assembler-times "vfmsub...ps\[^\n\]*%zmm\[0-9\]+" 1 } } */
#define type __m512
/* { dg-do compile } */
/* { dg-options "-mavx512f -O2" } */
-/* { dg-final { scan-assembler-times "vbroadcastss\[^\n\]*%zmm\[0-9\]+" 1 } } */
+/* { dg-final { scan-assembler-times "vbroadcastss\[^\n\]*%zmm\[0-9\]+" 1 { target { ! ia32 } } } } */
/* { dg-final { scan-assembler-times "vfnmadd...ps\[^\n\]*%zmm\[0-9\]+" 1 } } */
#define type __m512
/* { dg-do compile } */
/* { dg-options "-mavx512f -O2" } */
-/* { dg-final { scan-assembler-times "vbroadcastss\[^\n\]*%zmm\[0-9\]+" 1 } } */
+/* { dg-final { scan-assembler-times "vbroadcastss\[^\n\]*%zmm\[0-9\]+" 1 { target { ! ia32 } } } } */
/* { dg-final { scan-assembler-times "vfnmsub...ps\[^\n\]*%zmm\[0-9\]+" 1 } } */
#define type __m512
/* { dg-options "-O2 -mavx512f -mavx512vl -mavx512dq" } */
/* { dg-additional-options "-fno-PIE" { target ia32 } } */
/* { dg-additional-options "-mdynamic-no-pic" { target { *-*-darwin* && ia32 } } }
-/* { dg-final { scan-assembler-times "\[^\n\]*\\\{1to2\\\}" 2 } } */
-/* { dg-final { scan-assembler-times "\[^\n\]*\\\{1to4\\\}" 4 } } */
+/* { dg-final { scan-assembler-times "\[^\n\]*\\\{1to2\\\}" 2 { target { ! ia32 } } } } */
+/* { dg-final { scan-assembler-times "\[^\n\]*\\\{1to4\\\}" 4 { target { ! ia32 } } } } */
/* { dg-final { scan-assembler-times "\[^\n\]*\\\{1to8\\\}" 2 } } */
/* { dg-final { scan-assembler-times "vpbroadcastd\[\\t \]+%(?:r|e)\[^\n\]*, %xmm\[0-9\]+" 3 } } */
/* { dg-final { scan-assembler-times "vpbroadcastd\[\\t \]+%(?:r|e)\[^\n\]*, %ymm\[0-9\]+" 3 } } */
/* { dg-options "-O2 -mavx512f -mavx512vl" } */
/* { dg-additional-options "-fno-PIE" { target ia32 } } */
/* { dg-additional-options "-mdynamic-no-pic" { target { *-*-darwin* && ia32 } } }
-/* { dg-final { scan-assembler-not "\[^\n\]*\\\{1to2\\\}" { target ia32 } } } */
-/* { dg-final { scan-assembler-not "\[^\n\]*\\\{1to4\\\}" { target ia32 } } } */
/* { dg-final { scan-assembler-times "vpbroadcastd\[\\t \]+%(?:r|e)\[^\n\]*, %xmm\[0-9\]+" 4 } } */
/* { dg-final { scan-assembler-times "vpbroadcastd\[\\t \]+%(?:r|e)\[^\n\]*, %ymm\[0-9\]+" 4 } } */
/* { dg-final { scan-assembler-times "vpbroadcastq\[\\t \]+%r\[^\n\]*, %xmm\[0-9\]+" 4 { target { ! ia32 } } } } */
/* { dg-do compile { target { ! ia32 } } } */
/* { dg-options "-O2 -mavx" } */
-/* { dg-final { scan-assembler-times "vmovapd|vmovsd" 3 } } */
+/* { dg-final { scan-assembler-times "vmovapd|vmovsd" 2 } } */
static inline double g (double x){
asm volatile ("" : "+x" (x));
/* Vectorization factor two, two two-element stores to a using movq
and two two-element stores to b via pextrq/movhps of the high part. */
-/* { dg-final { scan-assembler-times "movq" 2 } } */
+/* { dg-final { scan-assembler-times "movq\[\t ]+%xmm\[0-9]" 2 } } */
/* { dg-final { scan-assembler-times "pextrq" 2 { target { ! ia32 } } } } */
/* { dg-final { scan-assembler-times "movhps" 2 { target { ia32 } } } } */