git.ipfire.org Git - thirdparty/gcc.git/commitdiff
aarch64: modify Ampere CPU tunings on reassociation/FMA
author Di Zhao <dizhao@os.amperecomputing.com>
Fri, 1 Dec 2023 08:16:40 +0000 (16:16 +0800)
committer Di Zhao <dizhao@os.amperecomputing.com>
Fri, 1 Dec 2023 09:02:46 +0000 (17:02 +0800)
1. Allow reassociation on FP additions.

2. Avoid generating loop-dependent FMA chains. Added a tuning
option 'AARCH64_EXTRA_TUNE_AVOID_CROSS_LOOP_FMA' for this.

gcc/ChangeLog:

* config/aarch64/aarch64-tuning-flags.def
(AARCH64_EXTRA_TUNING_OPTION): New tuning option to avoid
cross-loop FMA.
* config/aarch64/aarch64.cc
(aarch64_override_options_internal): Set
param_avoid_fma_max_bits according to tuning option.
* config/aarch64/tuning_models/ampere1.h (ampere1_tunings):
Modify tunings related to FMA.
* config/aarch64/tuning_models/ampere1a.h (ampere1a_tunings):
Likewise.
* config/aarch64/tuning_models/ampere1b.h (ampere1b_tunings):
Likewise.

gcc/config/aarch64/aarch64-tuning-flags.def
gcc/config/aarch64/aarch64.cc
gcc/config/aarch64/tuning_models/ampere1.h
gcc/config/aarch64/tuning_models/ampere1a.h
gcc/config/aarch64/tuning_models/ampere1b.h

index 774568e91060b3ae545cb3ab48cb1bd6f7dc01b1..f28a73839a63540e4ca7186c956fe7fb61d765ae 100644 (file)
@@ -47,4 +47,6 @@ AARCH64_EXTRA_TUNING_OPTION ("use_new_vector_costs", USE_NEW_VECTOR_COSTS)
 
 AARCH64_EXTRA_TUNING_OPTION ("matched_vector_throughput", MATCHED_VECTOR_THROUGHPUT)
 
+AARCH64_EXTRA_TUNING_OPTION ("avoid_cross_loop_fma", AVOID_CROSS_LOOP_FMA)
+
 #undef AARCH64_EXTRA_TUNING_OPTION
index bde21f7f7f5e5572d705c8397836904f6ca47814..0f83ec0d09d976d3f549d3ff112f8bad1eed0ceb 100644 (file)
@@ -16083,6 +16083,12 @@ aarch64_override_options_internal (struct gcc_options *opts)
       && opts->x_optimize >= aarch64_tune_params.prefetch->default_opt_level)
     opts->x_flag_prefetch_loop_arrays = 1;
 
+  /* Avoid loop-dependent FMA chains.  */
+  if (aarch64_tune_params.extra_tuning_flags
+      & AARCH64_EXTRA_TUNE_AVOID_CROSS_LOOP_FMA)
+    SET_OPTION_IF_UNSET (opts, &global_options_set, param_avoid_fma_max_bits,
+                        512);
+
   aarch64_override_options_after_change_1 (opts);
 }
 
index 8d2a1c696103259f23cf73df26cef9d4fa05ac73..a144e8f94b305c2e2e5682dab5b7e8789ad8b6ce 100644 (file)
@@ -104,7 +104,7 @@ static const struct tune_params ampere1_tunings =
   2,   /* min_div_recip_mul_df.  */
   0,   /* max_case_values.  */
   tune_params::AUTOPREFETCHER_WEAK,    /* autoprefetcher_model.  */
-  (AARCH64_EXTRA_TUNE_NONE),   /* tune_flags.  */
+  (AARCH64_EXTRA_TUNE_AVOID_CROSS_LOOP_FMA),   /* tune_flags.  */
   &ampere1_prefetch_tune,
   AARCH64_LDP_STP_POLICY_ALIGNED,   /* ldp_policy_model.  */
   AARCH64_LDP_STP_POLICY_ALIGNED    /* stp_policy_model.  */
index c419ffb3c1a936a01690ad157c6c71dc645273c8..f688ed08a792d1e6c18730bc0fe67c33fc20f3a7 100644 (file)
@@ -50,13 +50,13 @@ static const struct tune_params ampere1a_tunings =
   "32:16",     /* loop_align.  */
   2,   /* int_reassoc_width.  */
   4,   /* fp_reassoc_width.  */
-  1,   /* fma_reassoc_width.  */
+  4,   /* fma_reassoc_width.  */
   2,   /* vec_reassoc_width.  */
   2,   /* min_div_recip_mul_sf.  */
   2,   /* min_div_recip_mul_df.  */
   0,   /* max_case_values.  */
   tune_params::AUTOPREFETCHER_WEAK,    /* autoprefetcher_model.  */
-  (AARCH64_EXTRA_TUNE_NONE),   /* tune_flags.  */
+  (AARCH64_EXTRA_TUNE_AVOID_CROSS_LOOP_FMA),   /* tune_flags.  */
   &ampere1_prefetch_tune,
   AARCH64_LDP_STP_POLICY_ALIGNED,   /* ldp_policy_model.  */
   AARCH64_LDP_STP_POLICY_ALIGNED    /* stp_policy_model.  */
index c4928f50d2907dbc93dc755ad187e776dfdb7241..a98b6a980f70b7f40964733ae531452ca4ff9626 100644 (file)
@@ -99,13 +99,14 @@ static const struct tune_params ampere1b_tunings =
   "32:16",     /* loop_align.  */
   2,   /* int_reassoc_width.  */
   4,   /* fp_reassoc_width.  */
-  1,   /* fma_reassoc_width.  */
+  4,   /* fma_reassoc_width.  */
   2,   /* vec_reassoc_width.  */
   2,   /* min_div_recip_mul_sf.  */
   2,   /* min_div_recip_mul_df.  */
   0,   /* max_case_values.  */
   tune_params::AUTOPREFETCHER_STRONG,  /* autoprefetcher_model.  */
-  (AARCH64_EXTRA_TUNE_CHEAP_SHIFT_EXTEND),     /* tune_flags.  */
+  (AARCH64_EXTRA_TUNE_CHEAP_SHIFT_EXTEND |
+   AARCH64_EXTRA_TUNE_AVOID_CROSS_LOOP_FMA),   /* tune_flags.  */
   &ampere1b_prefetch_tune,
   AARCH64_LDP_STP_POLICY_ALIGNED,   /* ldp_policy_model.  */
   AARCH64_LDP_STP_POLICY_ALIGNED    /* stp_policy_model.  */