--- /dev/null
+/* PR tree-optimization/116139 */
+/* { dg-do compile } */
+/* { dg-options "-Ofast --param fully-pipelined-fma=1 -mcpu=neoverse-n3" } */
+
+#define LOOP_COUNT 800000000
+typedef double data_e;
+
+data_e
+foo (data_e in)
+{
+ data_e a1, a2, a3, a4;
+ data_e tmp, result = 0;
+ a1 = in + 0.1;
+ a2 = in * 0.1;
+ a3 = in + 0.01;
+ a4 = in * 0.59;
+
+ data_e result2 = 0;
+
+ for (int ic = 0; ic < LOOP_COUNT; ic++)
+ {
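+ /* A complete FMA chain candidate; with --param fully-pipelined-fma=1
+    this exercises the reassociation width computation patched below.  */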
+ tmp = a1 + a2 * a2 + a3 * a3 + a4 * a4;
+ result += tmp - ic;
+ result2 = result2 / 2 - tmp;
+
+ a1 += 0.91;
+ a2 += 0.1;
+ a3 -= 0.01;
+ a4 -= 0.89;
+
+ }
+
+ return result + result2;
+}
+
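
(Commentary, not part of the patch.) The summation in the loop above is the FMA-chain shape that the width computation below partitions. As a minimal sketch, assuming a reassociation width of 2 and fully pipelined FMAs, the pass aims for something like the following; the function name and the particular chain split are illustrative only, not taken from the pass:

/* Split the sum into two independent chains so the multiplies can fold
   into FMAs: chain1 becomes FMUL + FMA, chain2 a single FMA, and one
   final FADD joins the chains, shortening the dependent-add path.  */
static double
width2_fma_sketch (double a1, double a2, double a3, double a4)
{
  double chain1 = a2 * a2 + a3 * a3;  /* FMUL, then FMA  */
  double chain2 = a1 + a4 * a4;       /* a single FMA    */
  return chain1 + chain2;             /* final FADD      */
}
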
, it is latency(MULT)*2 + latency(ADD)*2. Assuming latency(MULT) >=
latency(ADD), the first variant is preferred.
- Find out if we can get a smaller width considering FMA. */
- if (width > 1 && mult_num && param_fully_pipelined_fma)
+ Find out if we can get a smaller width considering FMA.
+ Assume FMUL and FMA use the same units that can also do FADD.
+ For other scenarios, such as when FMUL and FADD use separate units,
+ the following code may not apply. */
+
+ int width_mult = targetm.sched.reassociation_width (MULT_EXPR, mode);
+ if (width > 1 && mult_num && param_fully_pipelined_fma
+ && width_mult <= width)
{
- /* When param_fully_pipelined_fma is set, assume FMUL and FMA use the
- same units that can also do FADD. For other scenarios, such as when
- FMUL and FADD are using separated units, the following code may not
- appy. */
- int width_mult = targetm.sched.reassociation_width (MULT_EXPR, mode);
- gcc_checking_assert (width_mult <= width);
-
/* Latency of MULT_EXPRs. */
int lat_mul
= get_mult_latency_consider_fma (ops_num, mult_num, width_mult);
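
(Commentary, not part of the patch.) get_mult_latency_consider_fma estimates how much MULT latency stays exposed once the multiplies are folded into fully pipelined FMAs. The sketch below is an illustration of that idea under the same-units assumption stated in the comment above, not GCC's exact formula; the function name and parameters are assumptions:

/* Illustrative only -- not GCC's exact code.  With fully pipelined FMAs,
   a folded multiply starts as soon as its operands are ready, so roughly
   one multiply latency per chain stays on the critical path; when every
   operand is a multiply, the chain starts with a plain FMUL feeding the
   first FMA, exposing about two.  */
static int
mult_latency_sketch (int ops_num, int mult_num, int mult_latency)
{
  int exposed = (mult_num == ops_num) ? 2 : 1;
  return exposed * mult_latency;
}

The result presumably feeds the comparison that decides whether a width smaller than the FADD reassociation width already keeps the pipes busy.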