AArch64: Add fma_reassoc_width [PR107413]

author Wilco Dijkstra <wilco.dijkstra@arm.com>

Wed, 23 Nov 2022 17:27:19 +0000 (17:27 +0000)

committer Wilco Dijkstra <wilco.dijkstra@arm.com>

Thu, 24 Nov 2022 13:30:04 +0000 (13:30 +0000)
author Wilco Dijkstra <wilco.dijkstra@arm.com>
Wed, 23 Nov 2022 17:27:19 +0000 (17:27 +0000)
committer Wilco Dijkstra <wilco.dijkstra@arm.com>
Thu, 24 Nov 2022 13:30:04 +0000 (13:30 +0000)
diff --git a/gcc/config/aarch64/aarch64-protos.h b/gcc/config/aarch64/aarch64-protos.h

index 238820581c5ee7617f8eed1df2cf5418b1127e19..4be93c93c26e091f878bc8e4cf06e90888405fb2 100644 (file)
--- a/gcc/config/aarch64/aarch64-protos.h
+++ b/gcc/config/aarch64/aarch64-protos.h
@@ -540,6 +540,7 @@ struct tune_params
    const char *loop_align;
    int int_reassoc_width;
    int fp_reassoc_width;
+  int fma_reassoc_width;
    int vec_reassoc_width;
    int min_div_recip_mul_sf;
    int min_div_recip_mul_df;
diff --git a/gcc/config/aarch64/aarch64.cc b/gcc/config/aarch64/aarch64.cc

index c91df6f5006c257690aafb75398933d628a970e1..15d478c77ceb2d6c52a70b6ffd8fdadcfa8deba0 100644 (file)
--- a/gcc/config/aarch64/aarch64.cc
+++ b/gcc/config/aarch64/aarch64.cc
@@ -1346,6 +1346,7 @@ static const struct tune_params generic_tunings =
    "8", /* loop_align.  */
    2,   /* int_reassoc_width.  */
    4,   /* fp_reassoc_width.  */
+  1,   /* fma_reassoc_width.  */
    1,   /* vec_reassoc_width.  */
    2,   /* min_div_recip_mul_sf.  */
    2,   /* min_div_recip_mul_df.  */
@@ -1382,6 +1383,7 @@ static const struct tune_params cortexa35_tunings =
    "8", /* loop_align.  */
    2,   /* int_reassoc_width.  */
    4,   /* fp_reassoc_width.  */
+  1,   /* fma_reassoc_width.  */
    1,   /* vec_reassoc_width.  */
    2,   /* min_div_recip_mul_sf.  */
    2,   /* min_div_recip_mul_df.  */
@@ -1415,6 +1417,7 @@ static const struct tune_params cortexa53_tunings =
    "8", /* loop_align.  */
    2,   /* int_reassoc_width.  */
    4,   /* fp_reassoc_width.  */
+  1,   /* fma_reassoc_width.  */
    1,   /* vec_reassoc_width.  */
    2,   /* min_div_recip_mul_sf.  */
    2,   /* min_div_recip_mul_df.  */
@@ -1448,6 +1451,7 @@ static const struct tune_params cortexa57_tunings =
    "8", /* loop_align.  */
    2,   /* int_reassoc_width.  */
    4,   /* fp_reassoc_width.  */
+  1,   /* fma_reassoc_width.  */
    1,   /* vec_reassoc_width.  */
    2,   /* min_div_recip_mul_sf.  */
    2,   /* min_div_recip_mul_df.  */
@@ -1481,6 +1485,7 @@ static const struct tune_params cortexa72_tunings =
    "8", /* loop_align.  */
    2,   /* int_reassoc_width.  */
    4,   /* fp_reassoc_width.  */
+  1,   /* fma_reassoc_width.  */
    1,   /* vec_reassoc_width.  */
    2,   /* min_div_recip_mul_sf.  */
    2,   /* min_div_recip_mul_df.  */
@@ -1514,6 +1519,7 @@ static const struct tune_params cortexa73_tunings =
    "8", /* loop_align.  */
    2,   /* int_reassoc_width.  */
    4,   /* fp_reassoc_width.  */
+  1,   /* fma_reassoc_width.  */
    1,   /* vec_reassoc_width.  */
    2,   /* min_div_recip_mul_sf.  */
    2,   /* min_div_recip_mul_df.  */
@@ -1548,6 +1554,7 @@ static const struct tune_params exynosm1_tunings =
    "4", /* loop_align.  */
    2,   /* int_reassoc_width.  */
    4,   /* fp_reassoc_width.  */
+  1,   /* fma_reassoc_width.  */
    1,   /* vec_reassoc_width.  */
    2,   /* min_div_recip_mul_sf.  */
    2,   /* min_div_recip_mul_df.  */
@@ -1580,6 +1587,7 @@ static const struct tune_params thunderxt88_tunings =
    "8", /* loop_align.  */
    2,   /* int_reassoc_width.  */
    4,   /* fp_reassoc_width.  */
+  1,   /* fma_reassoc_width.  */
    1,   /* vec_reassoc_width.  */
    2,   /* min_div_recip_mul_sf.  */
    2,   /* min_div_recip_mul_df.  */
@@ -1612,6 +1620,7 @@ static const struct tune_params thunderx_tunings =
    "8", /* loop_align.  */
    2,   /* int_reassoc_width.  */
    4,   /* fp_reassoc_width.  */
+  1,   /* fma_reassoc_width.  */
    1,   /* vec_reassoc_width.  */
    2,   /* min_div_recip_mul_sf.  */
    2,   /* min_div_recip_mul_df.  */
@@ -1646,6 +1655,7 @@ static const struct tune_params tsv110_tunings =
    "8",  /* loop_align.  */
    2,    /* int_reassoc_width.  */
    4,    /* fp_reassoc_width.  */
+  1,   /* fma_reassoc_width.  */
    1,    /* vec_reassoc_width.  */
    2,    /* min_div_recip_mul_sf.  */
    2,    /* min_div_recip_mul_df.  */
@@ -1678,6 +1688,7 @@ static const struct tune_params xgene1_tunings =
    "16",        /* loop_align.  */
    2,   /* int_reassoc_width.  */
    4,   /* fp_reassoc_width.  */
+  1,   /* fma_reassoc_width.  */
    1,   /* vec_reassoc_width.  */
    2,   /* min_div_recip_mul_sf.  */
    2,   /* min_div_recip_mul_df.  */
@@ -1710,6 +1721,7 @@ static const struct tune_params emag_tunings =
    "16",        /* loop_align.  */
    2,   /* int_reassoc_width.  */
    4,   /* fp_reassoc_width.  */
+  1,   /* fma_reassoc_width.  */
    1,   /* vec_reassoc_width.  */
    2,   /* min_div_recip_mul_sf.  */
    2,   /* min_div_recip_mul_df.  */
@@ -1743,6 +1755,7 @@ static const struct tune_params qdf24xx_tunings =
    "16",        /* loop_align.  */
    2,   /* int_reassoc_width.  */
    4,   /* fp_reassoc_width.  */
+  1,   /* fma_reassoc_width.  */
    1,   /* vec_reassoc_width.  */
    2,   /* min_div_recip_mul_sf.  */
    2,   /* min_div_recip_mul_df.  */
@@ -1778,6 +1791,7 @@ static const struct tune_params saphira_tunings =
    "16",        /* loop_align.  */
    2,   /* int_reassoc_width.  */
    4,   /* fp_reassoc_width.  */
+  1,   /* fma_reassoc_width.  */
    1,   /* vec_reassoc_width.  */
    2,   /* min_div_recip_mul_sf.  */
    2,   /* min_div_recip_mul_df.  */
@@ -1811,6 +1825,7 @@ static const struct tune_params thunderx2t99_tunings =
    "16",        /* loop_align.  */
    3,   /* int_reassoc_width.  */
    2,   /* fp_reassoc_width.  */
+  1,   /* fma_reassoc_width.  */
    2,   /* vec_reassoc_width.  */
    2,   /* min_div_recip_mul_sf.  */
    2,   /* min_div_recip_mul_df.  */
@@ -1844,6 +1859,7 @@ static const struct tune_params thunderx3t110_tunings =
    "16",        /* loop_align.  */
    3,   /* int_reassoc_width.  */
    2,   /* fp_reassoc_width.  */
+  1,   /* fma_reassoc_width.  */
    2,   /* vec_reassoc_width.  */
    2,   /* min_div_recip_mul_sf.  */
    2,   /* min_div_recip_mul_df.  */
@@ -1876,6 +1892,7 @@ static const struct tune_params neoversen1_tunings =
    "32:16",     /* loop_align.  */
    2,   /* int_reassoc_width.  */
    4,   /* fp_reassoc_width.  */
+  1,   /* fma_reassoc_width.  */
    2,   /* vec_reassoc_width.  */
    2,   /* min_div_recip_mul_sf.  */
    2,   /* min_div_recip_mul_df.  */
@@ -1912,6 +1929,7 @@ static const struct tune_params ampere1_tunings =
    "32:16",     /* loop_align.  */
    2,   /* int_reassoc_width.  */
    4,   /* fp_reassoc_width.  */
+  1,   /* fma_reassoc_width.  */
    2,   /* vec_reassoc_width.  */
    2,   /* min_div_recip_mul_sf.  */
    2,   /* min_div_recip_mul_df.  */
@@ -1949,6 +1967,7 @@ static const struct tune_params ampere1a_tunings =
    "32:16",     /* loop_align.  */
    2,   /* int_reassoc_width.  */
    4,   /* fp_reassoc_width.  */
+  1,   /* fma_reassoc_width.  */
    2,   /* vec_reassoc_width.  */
    2,   /* min_div_recip_mul_sf.  */
    2,   /* min_div_recip_mul_df.  */
@@ -2126,6 +2145,7 @@ static const struct tune_params neoversev1_tunings =
    "32:16",     /* loop_align.  */
    2,   /* int_reassoc_width.  */
    4,   /* fp_reassoc_width.  */
+  4,   /* fma_reassoc_width.  */
    2,   /* vec_reassoc_width.  */
    2,   /* min_div_recip_mul_sf.  */
    2,   /* min_div_recip_mul_df.  */
@@ -2263,6 +2283,7 @@ static const struct tune_params neoverse512tvb_tunings =
    "32:16",     /* loop_align.  */
    2,   /* int_reassoc_width.  */
    4,   /* fp_reassoc_width.  */
+  4,   /* fma_reassoc_width.  */
    2,   /* vec_reassoc_width.  */
    2,   /* min_div_recip_mul_sf.  */
    2,   /* min_div_recip_mul_df.  */
@@ -2451,6 +2472,7 @@ static const struct tune_params neoversen2_tunings =
    "32:16",     /* loop_align.  */
    2,   /* int_reassoc_width.  */
    4,   /* fp_reassoc_width.  */
+  1,   /* fma_reassoc_width.  */
    2,   /* vec_reassoc_width.  */
    2,   /* min_div_recip_mul_sf.  */
    2,   /* min_div_recip_mul_df.  */
@@ -2640,6 +2662,7 @@ static const struct tune_params neoversev2_tunings =
    "32:16",     /* loop_align.  */
    3,   /* int_reassoc_width.  */
    6,   /* fp_reassoc_width.  */
+  4,   /* fma_reassoc_width.  */
    3,   /* vec_reassoc_width.  */
    2,   /* min_div_recip_mul_sf.  */
    2,   /* min_div_recip_mul_df.  */
@@ -2675,6 +2698,7 @@ static const struct tune_params a64fx_tunings =
    "32",        /* loop_align.  */
    4,   /* int_reassoc_width.  */
    2,   /* fp_reassoc_width.  */
+  1,   /* fma_reassoc_width.  */
    2,   /* vec_reassoc_width.  */
    2,   /* min_div_recip_mul_sf.  */
    2,   /* min_div_recip_mul_df.  */
@@ -3387,9 +3411,15 @@ aarch64_reassociation_width (unsigned opc, machine_mode mode)
      return aarch64_tune_params.vec_reassoc_width;
    if (INTEGRAL_MODE_P (mode))
      return aarch64_tune_params.int_reassoc_width;
-  /* Avoid reassociating floating point addition so we emit more FMAs.  */
-  if (FLOAT_MODE_P (mode) && opc != PLUS_EXPR)
-    return aarch64_tune_params.fp_reassoc_width;
+  /* Reassociation reduces the number of FMAs which may result in worse
+     performance.  Use a per-CPU setting for FMA reassociation which allows
+     narrow CPUs with few FP pipes to switch it off (value of 1), and wider
+     CPUs with many FP pipes to enable reassociation.
+     Since the reassociation pass doesn't understand FMA at all, assume
+     that any FP addition might turn into FMA.  */
+  if (FLOAT_MODE_P (mode))
+    return opc == PLUS_EXPR ? aarch64_tune_params.fma_reassoc_width
+                           : aarch64_tune_params.fp_reassoc_width;
    return 1;
  }
author	Wilco Dijkstra <wilco.dijkstra@arm.com>
	Wed, 23 Nov 2022 17:27:19 +0000 (17:27 +0000)
committer	Wilco Dijkstra <wilco.dijkstra@arm.com>
	Thu, 24 Nov 2022 13:30:04 +0000 (13:30 +0000)
gcc/config/aarch64/aarch64-protos.h		patch | blob | blame | history
gcc/config/aarch64/aarch64.cc		patch | blob | blame | history