From: Jakub Jelinek
Date: Tue, 17 Feb 2026 10:43:43 +0000 (+0100)
Subject: bswap: Handle VEC_PACK_TRUNC_EXPR [PR120233]
X-Git-Url: http://git.ipfire.org/cgi-bin/gitweb.cgi?a=commitdiff_plain;h=4b71cafc8447e09ee41aff02acb5b26e8b112466;p=thirdparty%2Fgcc.git

bswap: Handle VEC_PACK_TRUNC_EXPR [PR120233]

With r16-531 we've regressed
FAIL: gcc.target/i386/pr108938-3.c scan-assembler-times bswap[\t ]+ 3
on ia32 and also introduced 2 separate regressions in the same testcase on
x86_64-linux (which cancel out in scan-assembler-times; previously we were
generating one 64-bit bswap + rotate in one function and one 32-bit bswap
+ rotate in another function, now we emit 2 32-bit bswaps in the first one
and really horrible code in the second one).

The following patch fixes the latter function by emitting a 32-bit bswap
+ 32-bit rotate on both ia32 and x86_64.  This fixes the above FAIL (and
introduces
FAIL: gcc.target/i386/pr108938-3.c scan-assembler-times bswap[\t ]+ 2
on x86_64).

The problem is that the vectorizer now uses VEC_PACK_TRUNC_EXPR and
bswap/store_merging was only able to handle vectors built by a CONSTRUCTOR.
The patch adds handling of VEC_PACK_TRUNC_EXPR if its operands are
CONSTRUCTORs.

Without a testcase, I wasn't confident enough to write BYTES_BIG_ENDIAN
support; for CONSTRUCTOR { A, B, C, D } we make DCBA out of it on little
endian but ABCD on big endian, and that would need to be combined with
picking up the most significant halves of each element.

I'll look incrementally at the other function.

2026-02-17  Jakub Jelinek

        PR target/120233
        * gimple-ssa-store-merging.cc (find_bswap_or_nop_2): New function.
        (find_bswap_or_nop): Move CONSTRUCTOR handling to the above function,
        call it instead of find_bswap_or_nop_1.
        (bswap_replace): Handle VEC_PACK_TRUNC_EXPR like CONSTRUCTOR.
        (maybe_optimize_vector_constructor): Likewise.
        (pass_optimize_bswap::execute): Likewise.
        (get_status_for_store_merging): Likewise.
        (pass_store_merging::execute): Likewise.
---

diff --git a/gcc/gimple-ssa-store-merging.cc b/gcc/gimple-ssa-store-merging.cc
index 2c22ec3baf2..b7ebbb85ed2 100644
--- a/gcc/gimple-ssa-store-merging.cc
+++ b/gcc/gimple-ssa-store-merging.cc
@@ -814,6 +814,145 @@ find_bswap_or_nop_1 (gimple *stmt, struct symbolic_number *n, int limit)
   return NULL;
 }
 
+/* Like find_bswap_or_nop_1, but also handles CONSTRUCTOR and
+   VEC_PACK_TRUNC_EXPR.  */
+
+gimple *
+find_bswap_or_nop_2 (gimple *stmt, struct symbolic_number *n, int limit)
+{
+  if (gimple *ret = find_bswap_or_nop_1 (stmt, n, limit))
+    return ret;
+
+  if (!limit
+      || !is_gimple_assign (stmt)
+      || BYTES_BIG_ENDIAN != WORDS_BIG_ENDIAN
+      || stmt_can_throw_internal (cfun, stmt))
+    return NULL;
+
+  tree rhs1 = gimple_assign_rhs1 (stmt);
+  if (gimple_assign_rhs_code (stmt) == VEC_PACK_TRUNC_EXPR
+      && TREE_CODE (rhs1) == SSA_NAME)
+    {
+      if (BYTES_BIG_ENDIAN)
+        return NULL; /* For now...  */
+      gimple *rhs1_stmt = SSA_NAME_DEF_STMT (rhs1);
+      tree rhs2 = gimple_assign_rhs2 (stmt);
+      if (TREE_CODE (rhs2) != SSA_NAME)
+        return NULL;
+      gimple *rhs2_stmt = SSA_NAME_DEF_STMT (rhs2);
+      struct symbolic_number n1, n2;
+
+      gimple *source_stmt1 = find_bswap_or_nop_2 (rhs1_stmt, &n1, limit - 1);
+      if (!source_stmt1)
+        return NULL;
+      gimple *source_stmt2 = find_bswap_or_nop_2 (rhs2_stmt, &n2, limit - 1);
+      if (!source_stmt2)
+        return NULL;
+
+      if (TYPE_PRECISION (n1.type) != TYPE_PRECISION (n2.type))
+        return NULL;
+      if (n1.vuse != n2.vuse)
+        return NULL;
+      auto nn2 = n2.n;
+      n2.n = 0;
+      /* We need to take into account the number of elements of each vector,
+         which perform_symbolic_merge doesn't know.  So, handle it as
+         two separate BIT_IOR_EXPR merges, each time with one operand's
+         mask changed to all 0s, and then merge here.  */
+      gimple *source_stmt
+        = perform_symbolic_merge (source_stmt1, &n1, source_stmt2, &n2, n,
+                                  BIT_IOR_EXPR);
+      if (!source_stmt)
+        return NULL;
+      n2.n = nn2;
+      auto nn1 = n->n;
+      n1.n = 0;
+      gimple *source_stmt3
+        = perform_symbolic_merge (source_stmt1, &n1, source_stmt2, &n2, n,
+                                  BIT_IOR_EXPR);
+      gcc_assert (source_stmt == source_stmt3);
+      nn2 = n->n;
+      tree lhs = gimple_assign_lhs (stmt);
+      int eltsize
+        = TYPE_PRECISION (TREE_TYPE (TREE_TYPE (lhs))) / BITS_PER_UNIT;
+      int nelts = TYPE_VECTOR_SUBPARTS (TREE_TYPE (lhs)).to_constant ();
+      int size = eltsize * nelts;
+      int hsize = size / 2;
+      n->n = 0;
+      if (!BYTES_BIG_ENDIAN)
+        for (int i = 0; i < size; i++)
+          n->n |= ((((i < hsize ? nn1 : nn2)
+                     >> (((i % hsize) / eltsize * 2 * eltsize
+                          + (i % eltsize))) * BITS_PER_MARKER) & MARKER_MASK)
+                   << (i * BITS_PER_MARKER));
+      else
+        gcc_unreachable ();
+      return source_stmt;
+    }
+
+  if (gimple_assign_rhs_code (stmt) != CONSTRUCTOR)
+    return NULL;
+
+  tree type_size = TYPE_SIZE_UNIT (TREE_TYPE (gimple_get_lhs (stmt)));
+  unsigned HOST_WIDE_INT sz = tree_to_uhwi (type_size) * BITS_PER_UNIT;
+  if (sz != 16 && sz != 32 && sz != 64)
+    return NULL;
+  if (CONSTRUCTOR_NELTS (rhs1) == 0)
+    return NULL;
+  tree eltype = TREE_TYPE (TREE_TYPE (rhs1));
+  unsigned HOST_WIDE_INT eltsz = int_size_in_bytes (eltype) * BITS_PER_UNIT;
+  if (TYPE_PRECISION (eltype) != eltsz)
+    return NULL;
+  constructor_elt *elt;
+  unsigned int i;
+  tree type = build_nonstandard_integer_type (sz, 1);
+  gimple *ins_stmt = NULL;
+  FOR_EACH_VEC_SAFE_ELT (CONSTRUCTOR_ELTS (rhs1), i, elt)
+    {
+      if (TREE_CODE (elt->value) != SSA_NAME
+          || !INTEGRAL_TYPE_P (TREE_TYPE (elt->value)))
+        return NULL;
+      struct symbolic_number n1;
+      gimple *source_stmt
+        = find_bswap_or_nop_1 (SSA_NAME_DEF_STMT (elt->value), &n1, limit - 1);
+
+      if (!source_stmt)
+        return NULL;
+
+      n1.type = type;
+      if (!n1.base_addr)
+        n1.range = sz / BITS_PER_UNIT;
+
+      if (i == 0)
+        {
+          ins_stmt = source_stmt;
+          *n = n1;
+        }
+      else
+        {
+          if (n->vuse != n1.vuse)
+            return NULL;
+
+          struct symbolic_number n0 = *n;
+
+          if (!BYTES_BIG_ENDIAN)
+            {
+              if (!do_shift_rotate (LSHIFT_EXPR, &n1, i * eltsz))
+                return NULL;
+            }
+          else if (!do_shift_rotate (LSHIFT_EXPR, &n0, eltsz))
+            return NULL;
+          ins_stmt
+            = perform_symbolic_merge (ins_stmt, &n0, source_stmt,
+                                      &n1, n, BIT_IOR_EXPR);
+
+          if (!ins_stmt)
+            return NULL;
+        }
+    }
+  return ins_stmt;
+}
+
 /* Helper for find_bswap_or_nop and try_coalesce_bswap to compute
    *CMPXCHG, *CMPNOP and adjust *N.  */
 
@@ -950,72 +1089,9 @@ find_bswap_or_nop (gimple *stmt, struct symbolic_number *n, bool *bswap,
      in libgcc, and for initial shift/and operation of the src operand.  */
   int limit = tree_to_uhwi (type_size);
   limit += 2 * (1 + (int) ceil_log2 ((unsigned HOST_WIDE_INT) limit));
-  gimple *ins_stmt = find_bswap_or_nop_1 (stmt, n, limit);
-
+  gimple *ins_stmt = find_bswap_or_nop_2 (stmt, n, limit);
   if (!ins_stmt)
-    {
-      if (gimple_assign_rhs_code (stmt) != CONSTRUCTOR
-          || BYTES_BIG_ENDIAN != WORDS_BIG_ENDIAN)
-        return NULL;
-      unsigned HOST_WIDE_INT sz = tree_to_uhwi (type_size) * BITS_PER_UNIT;
-      if (sz != 16 && sz != 32 && sz != 64)
-        return NULL;
-      tree rhs = gimple_assign_rhs1 (stmt);
-      if (CONSTRUCTOR_NELTS (rhs) == 0)
-        return NULL;
-      tree eltype = TREE_TYPE (TREE_TYPE (rhs));
-      unsigned HOST_WIDE_INT eltsz
-        = int_size_in_bytes (eltype) * BITS_PER_UNIT;
-      if (TYPE_PRECISION (eltype) != eltsz)
-        return NULL;
-      constructor_elt *elt;
-      unsigned int i;
-      tree type = build_nonstandard_integer_type (sz, 1);
-      FOR_EACH_VEC_SAFE_ELT (CONSTRUCTOR_ELTS (rhs), i, elt)
-        {
-          if (TREE_CODE (elt->value) != SSA_NAME
-              || !INTEGRAL_TYPE_P (TREE_TYPE (elt->value)))
-            return NULL;
-          struct symbolic_number n1;
-          gimple *source_stmt
-            = find_bswap_or_nop_1 (SSA_NAME_DEF_STMT (elt->value), &n1,
-                                   limit - 1);
-
-          if (!source_stmt)
-            return NULL;
-
-          n1.type = type;
-          if (!n1.base_addr)
-            n1.range = sz / BITS_PER_UNIT;
-
-          if (i == 0)
-            {
-              ins_stmt = source_stmt;
-              *n = n1;
-            }
-          else
-            {
-              if (n->vuse != n1.vuse)
-                return NULL;
-
-              struct symbolic_number n0 = *n;
-
-              if (!BYTES_BIG_ENDIAN)
-                {
-                  if (!do_shift_rotate (LSHIFT_EXPR, &n1, i * eltsz))
-                    return NULL;
-                }
-              else if (!do_shift_rotate (LSHIFT_EXPR, &n0, eltsz))
-                return NULL;
-              ins_stmt
-                = perform_symbolic_merge (ins_stmt, &n0, source_stmt, &n1, n,
-                                          BIT_IOR_EXPR);
-
-              if (!ins_stmt)
-                return NULL;
-            }
-        }
-    }
+    return NULL;
 
   uint64_t cmpxchg, cmpnop;
   uint64_t orig_range = n->range * BITS_PER_UNIT;
@@ -1177,7 +1253,8 @@ bswap_replace (gimple_stmt_iterator gsi, gimple *ins_stmt, tree fndecl,
   if (cur_stmt)
     {
       tgt = gimple_assign_lhs (cur_stmt);
-      if (gimple_assign_rhs_code (cur_stmt) == CONSTRUCTOR
+      if ((gimple_assign_rhs_code (cur_stmt) == CONSTRUCTOR
+           || gimple_assign_rhs_code (cur_stmt) == VEC_PACK_TRUNC_EXPR)
          && tgt
          && VECTOR_TYPE_P (TREE_TYPE (tgt)))
        conv_code = VIEW_CONVERT_EXPR;
@@ -1457,8 +1534,8 @@ bswap_replace (gimple_stmt_iterator gsi, gimple *ins_stmt, tree fndecl,
   return tgt;
 }
 
-/* Try to optimize an assignment CUR_STMT with CONSTRUCTOR on the rhs
-   using bswap optimizations.  CDI_DOMINATORS need to be
+/* Try to optimize an assignment CUR_STMT with CONSTRUCTOR/VEC_PACK_TRUNC_EXPR
+   on the rhs using bswap optimizations.  CDI_DOMINATORS need to be
    computed on entry.  Return true if it has been optimized and
    TODO_update_ssa is needed.  */
@@ -1469,8 +1546,15 @@ maybe_optimize_vector_constructor (gimple *cur_stmt)
   struct symbolic_number n;
   bool bswap;
 
-  gcc_assert (is_gimple_assign (cur_stmt)
-              && gimple_assign_rhs_code (cur_stmt) == CONSTRUCTOR);
+  gcc_assert (is_gimple_assign (cur_stmt));
+  switch (gimple_assign_rhs_code (cur_stmt))
+    {
+    case CONSTRUCTOR:
+    case VEC_PACK_TRUNC_EXPR:
+      break;
+    default:
+      gcc_unreachable ();
+    }
 
   tree rhs = gimple_assign_rhs1 (cur_stmt);
   if (!VECTOR_TYPE_P (TREE_TYPE (rhs))
@@ -1613,6 +1697,7 @@ pass_optimize_bswap::execute (function *fun)
            case PLUS_EXPR:
              break;
            case CONSTRUCTOR:
+           case VEC_PACK_TRUNC_EXPR:
              {
                tree rhs = gimple_assign_rhs1 (cur_stmt);
                if (VECTOR_TYPE_P (TREE_TYPE (rhs))
@@ -5491,7 +5576,8 @@ get_status_for_store_merging (basic_block bb)
        break;
 
       if (is_gimple_assign (stmt)
-         && gimple_assign_rhs_code (stmt) == CONSTRUCTOR)
+         && (gimple_assign_rhs_code (stmt) == CONSTRUCTOR
+             || gimple_assign_rhs_code (stmt) == VEC_PACK_TRUNC_EXPR))
        {
          tree rhs = gimple_assign_rhs1 (stmt);
          if (VECTOR_TYPE_P (TREE_TYPE (rhs))
@@ -5578,7 +5664,8 @@ pass_store_merging::execute (function *fun)
            }
 
          if (is_gimple_assign (stmt)
-             && gimple_assign_rhs_code (stmt) == CONSTRUCTOR
+             && (gimple_assign_rhs_code (stmt) == CONSTRUCTOR
+                 || gimple_assign_rhs_code (stmt) == VEC_PACK_TRUNC_EXPR)
              && maybe_optimize_vector_constructor (stmt))
            continue;
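
For background, the kind of source pattern this recognition is about is a
group of narrow stores that together spell out a byte-swapped (and possibly
rotated) value.  The reduced sketch below is hypothetical and is not the
actual contents of gcc.target/i386/pr108938-3.c: its eight byte stores are
equivalent to a single 64-bit store of ror64 (bswap64 (value), 8).  When such
a group is vectorized and the stored vector of bytes is built as a
VEC_PACK_TRUNC_EXPR of two CONSTRUCTORs rather than as one CONSTRUCTOR, the
patch above lets the bswap pass still see through it.

/* Hypothetical reduced example, not the pr108938-3.c testcase: the eight
   narrow stores below are together equivalent to storing the 64-bit
   byte swap of VALUE rotated right by 8 bits.  */
void
store_bswap64_ror8 (unsigned long long value, unsigned char *p)
{
  p[0] = value >> 48;
  p[1] = value >> 40;
  p[2] = value >> 32;
  p[3] = value >> 24;
  p[4] = value >> 16;
  p[5] = value >> 8;
  p[6] = value;
  p[7] = value >> 56;
}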
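
The trickiest part of find_bswap_or_nop_2 is the little-endian re-numbering
of the byte markers for VEC_PACK_TRUNC_EXPR: packing truncates each
2*eltsize-byte input element to its low eltsize bytes, so byte I of the
packed vector takes its marker from byte
(I % hsize) / eltsize * 2 * eltsize + (I % eltsize) of the first operand's
symbolic number when I < hsize, and of the second operand's otherwise.  The
standalone sketch below (plain C, independent of GCC's symbolic_number
machinery; the eltsize/nelts values are just an example) prints that mapping.

/* Illustration only, not GCC code: show which input byte each byte of the
   little-endian VEC_PACK_TRUNC_EXPR result comes from.  */
#include <stdio.h>

int
main (void)
{
  const int eltsize = 2;        /* Bytes per packed (narrow) element.  */
  const int nelts = 4;          /* Elements in the packed result.  */
  const int size = eltsize * nelts;
  const int hsize = size / 2;

  for (int i = 0; i < size; i++)
    {
      /* Same index computation as the re-numbering loop in the patch:
         the source byte sits in the low (kept) half of a 2*eltsize-wide
         element of the corresponding input vector.  */
      int src_byte = (i % hsize) / eltsize * 2 * eltsize + (i % eltsize);
      printf ("packed byte %d <- operand %d, byte %d\n",
              i, i < hsize ? 1 : 2, src_byte);
    }
  return 0;
}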