AArch64: Update Neoverse N2 cost model to release costs

author Tamar Christina <tamar.christina@arm.com>
Thu, 1 Aug 2024 15:54:49 +0000 (16:54 +0100)

committer Tamar Christina <tamar.christina@arm.com>
Thu, 1 Aug 2024 15:54:49 +0000 (16:54 +0100)
diff --git a/gcc/config/aarch64/tuning_models/neoversen2.h b/gcc/config/aarch64/tuning_models/neoversen2.h

index be9a48ac3adc097f967c217fe09dcac194d7d14f..d41e714aa045266ecae62a36ed02dfbfb7597c3a 100644 (file)
--- a/gcc/config/aarch64/tuning_models/neoversen2.h
+++ b/gcc/config/aarch64/tuning_models/neoversen2.h
@@ -57,7 +57,7 @@ static const advsimd_vec_cost neoversen2_advsimd_vector_cost =
    2, /* ld2_st2_permute_cost */
    2, /* ld3_st3_permute_cost  */
    3, /* ld4_st4_permute_cost  */
-  3, /* permute_cost  */
+  2, /* permute_cost  */
    4, /* reduc_i8_cost  */
    4, /* reduc_i16_cost  */
    2, /* reduc_i32_cost  */
@@ -86,27 +86,27 @@ static const sve_vec_cost neoversen2_sve_vector_cost =
    {
      2, /* int_stmt_cost  */
      2, /* fp_stmt_cost  */
-    3, /* ld2_st2_permute_cost  */
-    4, /* ld3_st3_permute_cost  */
-    4, /* ld4_st4_permute_cost  */
-    3, /* permute_cost  */
+    2, /* ld2_st2_permute_cost  */
+    3, /* ld3_st3_permute_cost  */
+    3, /* ld4_st4_permute_cost  */
+    2, /* permute_cost  */
      /* Theoretically, a reduction involving 15 scalar ADDs could
         complete in ~5 cycles and would have a cost of 15.  [SU]ADDV
-       completes in 11 cycles, so give it a cost of 15 + 6.  */
-    21, /* reduc_i8_cost  */
-    /* Likewise for 7 scalar ADDs (~3 cycles) vs. 9: 7 + 6.  */
-    13, /* reduc_i16_cost  */
-    /* Likewise for 3 scalar ADDs (~2 cycles) vs. 8: 3 + 6.  */
-    9, /* reduc_i32_cost  */
-    /* Likewise for 1 scalar ADD (~1 cycles) vs. 2: 1 + 1.  */
-    2, /* reduc_i64_cost  */
+       completes in 9 cycles, so give it a cost of 15 + 4.  */
+    19, /* reduc_i8_cost  */
+    /* Likewise for 7 scalar ADDs (~3 cycles) vs. 8: 7 + 5.  */
+    12, /* reduc_i16_cost  */
+    /* Likewise for 3 scalar ADDs (~2 cycles) vs. 6: 3 + 4.  */
+    7, /* reduc_i32_cost  */
+    /* Likewise for 1 scalar ADDs (~1 cycles) vs. 4: 1 + 3.  */
+    4, /* reduc_i64_cost  */
      /* Theoretically, a reduction involving 7 scalar FADDs could
-       complete in ~8 cycles and would have a cost of 14.  FADDV
-       completes in 6 cycles, so give it a cost of 14 - 2.  */
+       complete in ~8 cycles and would have a cost of  14.  FADDV
+       completes in 6 cycles, so give it a cost of 14 + -2.  */
      12, /* reduc_f16_cost  */
-    /* Likewise for 3 scalar FADDs (~4 cycles) vs. 4: 6 - 0.  */
+    /* Likewise for 3 scalar FADDs (~4 cycles) vs. 4: 6 + 0.  */
      6, /* reduc_f32_cost  */
-    /* Likewise for 1 scalar FADD (~2 cycles) vs. 2: 2 - 0.  */
+    /* Likewise for 1 scalar FADD (~2 cycles) vs. 2: 2 + 0.  */
      2, /* reduc_f64_cost  */
      2, /* store_elt_extra_cost  */
      /* This value is just inherited from the Cortex-A57 table.  */
@@ -127,7 +127,7 @@ static const sve_vec_cost neoversen2_sve_vector_cost =
    /* A strided Advanced SIMD x64 load would take two parallel FP loads
       (8 cycles) plus an insertion (2 cycles).  Assume a 64-bit SVE gather
       is 1 cycle more.  The Advanced SIMD version is costed as 2 scalar loads
-     (cost 8) and a vec_construct (cost 2).  Add a full vector operation
+     (cost 8) and a vec_construct (cost 4).  Add a full vector operation
       (cost 2) to that, to avoid the difference being lost in rounding.
  
       There is no easy comparison between a strided Advanced SIMD x32 load
@@ -165,14 +165,14 @@ static const aarch64_sve_vec_issue_info neoversen2_sve_issue_info =
  {
    {
      {
-      3, /* loads_per_cycle  */
+      3, /* loads_stores_per_cycle  */
        2, /* stores_per_cycle  */
        2, /* general_ops_per_cycle  */
        0, /* fp_simd_load_general_ops  */
        1 /* fp_simd_store_general_ops  */
      },
      2, /* ld2_st2_general_ops  */
-    3, /* ld3_st3_general_ops  */
+    2, /* ld3_st3_general_ops  */
      3 /* ld4_st4_general_ops  */
    },
    2, /* pred_ops_per_cycle  */
@@ -190,7 +190,7 @@ static const aarch64_vec_issue_info neoversen2_vec_issue_info =
    &neoversen2_sve_issue_info
  };
  
-/* Neoverse N2 costs for vector insn classes.  */
+/* Neoversen2 costs for vector insn classes.  */
  static const struct cpu_vector_cost neoversen2_vector_cost =
  {
    1, /* scalar_int_stmt_cost  */
@@ -220,7 +220,7 @@ static const struct tune_params neoversen2_tunings =
      6, /* load_pred.  */
      1 /* store_pred.  */
    }, /* memmov_cost.  */
-  3, /* issue_rate  */
+  5, /* issue_rate  */
    (AARCH64_FUSE_AES_AESMC | AARCH64_FUSE_CMP_BRANCH), /* fusible_ops  */
    "32:16",     /* function_align.  */
    "4",         /* jump_align.  */
@@ -243,4 +243,4 @@ static const struct tune_params neoversen2_tunings =
    AARCH64_LDP_STP_POLICY_ALWAYS           /* stp_policy_model.  */
  };
  
-#endif /* GCC_AARCH64_H_NEOVERSEN2.  */
+#endif /* GCC_AARCH64_H_NEOVERSEN2.  */
\ No newline at end of file
author	Tamar Christina <tamar.christina@arm.com>
	Thu, 1 Aug 2024 15:54:49 +0000 (16:54 +0100)
committer	Tamar Christina <tamar.christina@arm.com>
	Thu, 1 Aug 2024 15:54:49 +0000 (16:54 +0100)