From: liuhongt Date: Fri, 19 Sep 2025 02:13:22 +0000 (-0700) Subject: Disable vect unroll for znver2/Znver1. X-Git-Url: http://git.ipfire.org/cgi-bin/gitweb.cgi?a=commitdiff_plain;h=753e5c8a3b04320ae183a7546fb8b926a4678bdb;p=thirdparty%2Fgcc.git Disable vect unroll for znver2/Znver1. Vectorizer unrolling regressed SPEC performance on these targets (refer to PR121994). I guess the regression is related to register pressure and could be tuned by adjusting reduc_lat_mult_thr. I don't have a Zen2 machine, so for simplicity I'll just disable unrolling in the vectorizer for Zen2. Also adjust the count for {AVX256,AVX512}_SPLIT_REGS. gcc/ChangeLog: PR target/121994 * config/i386/x86-tune-costs.h (znver2_cost): Set vect_unroll_limit to 1. (znver1_cost): Ditto. * config/i386/i386.cc (ix86_vector_costs::add_stmt_cost): Adjust count number for {AVX256,AVX512}_SPLIT_REGS. --- diff --git a/gcc/config/i386/i386.cc b/gcc/config/i386/i386.cc index 5ef7c315091..6eb26cd7b82 100644 --- a/gcc/config/i386/i386.cc +++ b/gcc/config/i386/i386.cc @@ -26144,6 +26144,14 @@ ix86_vector_costs::add_stmt_cost (int count, vect_cost_for_stmt kind, /* Record number of load/store/gather/scatter in vectorized body. */ if (where == vect_body && !m_costing_for_scalar) { + int scale = 1; + if (vectype + && ((GET_MODE_SIZE (TYPE_MODE (vectype)) == 64 + && TARGET_AVX512_SPLIT_REGS) + || (GET_MODE_SIZE (TYPE_MODE (vectype)) == 32 + && TARGET_AVX256_SPLIT_REGS))) + scale = 2; + switch (kind) { /* Emulated gather/scatter or any scalarization. */ @@ -26166,7 +26174,7 @@ ix86_vector_costs::add_stmt_cost (int count, vect_cost_for_stmt kind, /* Handle __builtin_fma. 
*/ if (gimple_call_combined_fn (stmt_info->stmt) == CFN_FMA) { - m_num_reduc[X86_REDUC_FMA] += count; + m_num_reduc[X86_REDUC_FMA] += count * scale; break; } @@ -26203,12 +26211,12 @@ ix86_vector_costs::add_stmt_cost (int count, vect_cost_for_stmt kind, && (def = SSA_NAME_DEF_STMT (rhs1), true) && is_gimple_assign (def) && gimple_assign_rhs_code (def) == MULT_EXPR) - m_num_reduc[X86_REDUC_FMA] += count; + m_num_reduc[X86_REDUC_FMA] += count * scale; else if (TREE_CODE (rhs2) == SSA_NAME && (def = SSA_NAME_DEF_STMT (rhs2), true) && is_gimple_assign (def) && gimple_assign_rhs_code (def) == MULT_EXPR) - m_num_reduc[X86_REDUC_FMA] += count; + m_num_reduc[X86_REDUC_FMA] += count * scale; break; /* Vectorizer lane_reducing_op_p supports DOT_PROX_EXPR, @@ -26237,7 +26245,7 @@ ix86_vector_costs::add_stmt_cost (int count, vect_cost_for_stmt kind, ? TARGET_AVX10_2 : (TARGET_AVXVNNIINT8 || TARGET_AVX10_2)); } - m_num_reduc[X86_REDUC_DOT_PROD] += count; + m_num_reduc[X86_REDUC_DOT_PROD] += count * scale; /* Dislike to do unroll and partial sum for emulated DOT_PROD_EXPR. */ @@ -26246,7 +26254,7 @@ ix86_vector_costs::add_stmt_cost (int count, vect_cost_for_stmt kind, break; case SAD_EXPR: - m_num_reduc[X86_REDUC_SAD] += count; + m_num_reduc[X86_REDUC_SAD] += count * scale; break; default: diff --git a/gcc/config/i386/x86-tune-costs.h b/gcc/config/i386/x86-tune-costs.h index 1649ea2fe3e..c7a0f6805ca 100644 --- a/gcc/config/i386/x86-tune-costs.h +++ b/gcc/config/i386/x86-tune-costs.h @@ -1744,7 +1744,7 @@ struct processor_costs znver1_cost = { FMA/DOT_PROD_EXPR/SAD_EXPR, it's used to determine unroll factor in the vectorizer. */ - 4, /* Limit how much the autovectorizer + 1, /* Limit how much the autovectorizer may unroll a loop. */ znver1_memcpy, znver1_memset, @@ -1918,7 +1918,7 @@ struct processor_costs znver2_cost = { FMA/DOT_PROD_EXPR/SAD_EXPR, it's used to determine unroll factor in the vectorizer. 
*/ - 4, /* Limit how much the autovectorizer + 1, /* Limit how much the autovectorizer may unroll a loop. */ znver2_memcpy, znver2_memset,