AARCH64_CORE("cortex-x2", cortexx2, cortexa57, V9A, (SVE2_BITPERM, MEMTAG, I8MM, BF16), neoversen2, 0x41, 0xd48, -1)
-AARCH64_CORE("cortex-x3", cortexx3, cortexa57, V9A, (SVE2_BITPERM, MEMTAG, I8MM, BF16), neoversen2, 0x41, 0xd4e, -1)
+AARCH64_CORE("cortex-x3", cortexx3, cortexa57, V9A, (SVE2_BITPERM, MEMTAG, I8MM, BF16), neoversev2, 0x41, 0xd4e, -1)
AARCH64_CORE("cortex-x4", cortexx4, cortexa57, V9_2A, (SVE2_BITPERM, MEMTAG, PROFILE), neoversen2, 0x41, 0xd81, -1)
2, /* ld2_st2_permute_cost */
2, /* ld3_st3_permute_cost */
3, /* ld4_st4_permute_cost */
- 3, /* permute_cost */
+ 2, /* permute_cost */
4, /* reduc_i8_cost */
4, /* reduc_i16_cost */
2, /* reduc_i32_cost */
2, /* reduc_i64_cost */
6, /* reduc_f16_cost */
- 3, /* reduc_f32_cost */
+ 4, /* reduc_f32_cost */
2, /* reduc_f64_cost */
2, /* store_elt_extra_cost */
/* This value is just inherited from the Cortex-A57 table. */
{
2, /* int_stmt_cost */
2, /* fp_stmt_cost */
- 3, /* ld2_st2_permute_cost */
+ 2, /* ld2_st2_permute_cost */
3, /* ld3_st3_permute_cost */
- 4, /* ld4_st4_permute_cost */
- 3, /* permute_cost */
+ 3, /* ld4_st4_permute_cost */
+ 2, /* permute_cost */
/* Theoretically, a reduction involving 15 scalar ADDs could
- complete in ~3 cycles and would have a cost of 15. [SU]ADDV
- completes in 11 cycles, so give it a cost of 15 + 8. */
- 21, /* reduc_i8_cost */
- /* Likewise for 7 scalar ADDs (~2 cycles) vs. 9: 7 + 7. */
- 14, /* reduc_i16_cost */
- /* Likewise for 3 scalar ADDs (~2 cycles) vs. 8: 3 + 4. */
+ complete in ~5 cycles and would have a cost of 15. [SU]ADDV
+ completes in 9 cycles, so give it a cost of 15 + 4. */
+ 19, /* reduc_i8_cost */
+ /* Likewise for 7 scalar ADDs (~3 cycles) vs. 8: 7 + 5. */
+ 12, /* reduc_i16_cost */
+ /* Likewise for 3 scalar ADDs (~2 cycles) vs. 6: 3 + 4. */
7, /* reduc_i32_cost */
- /* Likewise for 1 scalar ADD (~1 cycles) vs. 2: 1 + 1. */
- 2, /* reduc_i64_cost */
+ /* Likewise for 1 scalar ADD (~1 cycle) vs. 4: 1 + 3. */
+ 4, /* reduc_i64_cost */
/* Theoretically, a reduction involving 7 scalar FADDs could
- complete in ~6 cycles and would have a cost of 14. FADDV
+ complete in ~6 cycles and would have a cost of 14. FADDV
completes in 8 cycles, so give it a cost of 14 + 2. */
16, /* reduc_f16_cost */
/* Likewise for 3 scalar FADDs (~4 cycles) vs. 6: 6 + 2. */
/* A strided Advanced SIMD x64 load would take two parallel FP loads
(8 cycles) plus an insertion (2 cycles). Assume a 64-bit SVE gather
is 1 cycle more. The Advanced SIMD version is costed as 2 scalar loads
- (cost 8) and a vec_construct (cost 2). Add a full vector operation
+ (cost 8) and a vec_construct (cost 4). Add a full vector operation
(cost 2) to that, to avoid the difference being lost in rounding.
There is no easy comparison between a strided Advanced SIMD x32 load
{
{
{
- 3, /* loads_per_cycle */
+ 3, /* loads_stores_per_cycle */
2, /* stores_per_cycle */
4, /* general_ops_per_cycle */
0, /* fp_simd_load_general_ops */
1 /* fp_simd_store_general_ops */
},
2, /* ld2_st2_general_ops */
- 3, /* ld3_st3_general_ops */
+ 2, /* ld3_st3_general_ops */
3 /* ld4_st4_general_ops */
},
2, /* pred_ops_per_cycle */
&neoversev2_sve_issue_info
};
-/* Demeter costs for vector insn classes. */
+/* Neoverse V2 costs for vector insn classes. */
static const struct cpu_vector_cost neoversev2_vector_cost =
{
1, /* scalar_int_stmt_cost */
AARCH64_LDP_STP_POLICY_ALWAYS /* stp_policy_model. */
};
-#endif /* GCC_AARCH64_H_NEOVERSEV2. */
+#endif /* GCC_AARCH64_H_NEOVERSEV2. */
\ No newline at end of file