From: Richard Biener Date: Sun, 23 Nov 2025 13:01:03 +0000 (+0100) Subject: Select both inbranch and notinbranch clone during SIMD call analysis X-Git-Url: http://git.ipfire.org/cgi-bin/gitweb.cgi?a=commitdiff_plain;h=cea34ac07e3bd74086f854dd43afb29b4940d8de;p=thirdparty%2Fgcc.git Select both inbranch and notinbranch clone during SIMD call analysis The following recors both a possibly notinbranch and an inbranch SIMD clone during analysis so that we can properly handle the late decision on loop masking. Recording of linear-clause data from analysis is extended to cover linear-clause arguments from both clones. This also fixes AVX512 masked loop code generation in line with the previous fixes. PR tree-optimization/122776 * tree-vectorizer.h (vect_simd_clone_data::clone, vect_simd_clone_data::clone_inbranch): New fields for the two selected clones. * tree-vect-stmts.cc (vectorizable_simd_clone_call): Record both a possibly notinbranch and a inbranch clone. Delay the choice between both to code generation based on LOOP_VINFO_FULLY_MASKED_P. * gcc.dg/vect/vect-simd-clone-24.c: New testcase. * gcc.dg/gomp/pr110485.c: Adjust. --- diff --git a/gcc/testsuite/gcc.dg/gomp/pr110485.c b/gcc/testsuite/gcc.dg/gomp/pr110485.c index ba6817a127f..5183f3f403c 100644 --- a/gcc/testsuite/gcc.dg/gomp/pr110485.c +++ b/gcc/testsuite/gcc.dg/gomp/pr110485.c @@ -16,4 +16,4 @@ void foo (int n) } /* { dg-final { scan-tree-dump-not "MASK_LOAD" "vect" } } */ -/* { dg-final { scan-tree-dump "can't use a fully-masked loop because a non-masked simd clone was selected." "vect" { target x86_64-*-* } } } */ +/* { dg-final { scan-tree-dump "can't use a fully-masked loop because no masked simd clone was available" "vect" { target x86_64-*-* } } } */ diff --git a/gcc/testsuite/gcc.dg/vect/vect-simd-clone-24.c b/gcc/testsuite/gcc.dg/vect/vect-simd-clone-24.c new file mode 100644 index 00000000000..35459d5d5b9 --- /dev/null +++ b/gcc/testsuite/gcc.dg/vect/vect-simd-clone-24.c @@ -0,0 +1,22 @@ +/* { dg-do compile } */ +/* { dg-skip-if "" { *-*-* } { "-flto" } { "" } } */ +/* { dg-require-effective-target vect_simd_clones } */ +/* { dg-additional-options "-fopenmp-simd --param vect-partial-vector-usage=1 -fdump-tree-dce6 -w" } */ +/* { dg-additional-options "-mavx512f" { target avx512f } } */ + +#pragma omp declare simd simdlen(16) +int __attribute__((const)) baz (int x); + +int a[1024]; + +void foo (int n, int * __restrict b) +{ + for (int i = 0; i < n; ++i) + if (baz (a[i])) + b[i] = baz (b[i]); +} + +/* One notinbranch SIMD call, one inbranch in the main vector loop and two + inbranch in the masked epilog. */ +/* { dg-final { scan-tree-dump-times "simdclone\.\[0-9\] \\\(\[^,\]\+\\\)" 1 "dce6" { target avx512f } } } */ +/* { dg-final { scan-tree-dump-times "simdclone\.\[0-9\] \\\(\[^,\]\+,\[^,\]\+\\\)" 3 "dce6" { target avx512f } } } */ diff --git a/gcc/tree-vect-stmts.cc b/gcc/tree-vect-stmts.cc index 641b2835693..12eb5ea5b5e 100644 --- a/gcc/tree-vect-stmts.cc +++ b/gcc/tree-vect-stmts.cc @@ -4201,9 +4201,12 @@ vectorizable_simd_clone_call (vec_info *vinfo, stmt_vec_info stmt_info, poly_uint64 vf = loop_vinfo ? LOOP_VINFO_VECT_FACTOR (loop_vinfo) : 1; unsigned group_size = SLP_TREE_LANES (slp_node); unsigned int badness = 0; + unsigned int badness_inbranch = 0; struct cgraph_node *bestn = NULL; + struct cgraph_node *bestn_inbranch = NULL; if (!cost_vec) - bestn = cgraph_node::get (simd_clone_info[0]); + bestn = ((loop_vinfo && LOOP_VINFO_FULLY_MASKED_P (loop_vinfo)) + ? data.clone_inbranch : data.clone); else for (struct cgraph_node *n = node->simd_clones; n != NULL; n = n->simdclone->next_clone) @@ -4334,14 +4337,19 @@ vectorizable_simd_clone_call (vec_info *vinfo, stmt_vec_info stmt_info, SIMD_CLONE_ARG_TYPE_MASK); /* Penalize using a masked SIMD clone in a non-masked loop, that is not in a branch, as we'd have to construct an all-true mask. */ - if (!loop_vinfo || !LOOP_VINFO_FULLY_MASKED_P (loop_vinfo)) - this_badness += 64; + this_badness += 64; } if (bestn == NULL || this_badness < badness) { bestn = n; badness = this_badness; } + if (n->simdclone->inbranch + && (bestn_inbranch == NULL || this_badness < badness_inbranch)) + { + bestn_inbranch = n; + badness_inbranch = this_badness; + } } if (bestn == NULL) @@ -4377,6 +4385,17 @@ vectorizable_simd_clone_call (vec_info *vinfo, stmt_vec_info stmt_info, "incompatible vector types for invariants\n"); return false; } + + if (!bestn_inbranch && loop_vinfo) + { + if (dump_enabled_p () + && LOOP_VINFO_CAN_USE_PARTIAL_VECTORS_P (loop_vinfo)) + dump_printf_loc (MSG_NOTE, vect_location, + "can't use a fully-masked loop because no" + " masked simd clone was available.\n"); + LOOP_VINFO_CAN_USE_PARTIAL_VECTORS_P (loop_vinfo) = false; + } + /* When the original call is pure or const but the SIMD ABI dictates an aggregate return we will have to use a virtual definition and in a loop eventually even need to add a virtual PHI. That's @@ -4390,75 +4409,71 @@ vectorizable_simd_clone_call (vec_info *vinfo, stmt_vec_info stmt_info, so automagic virtual operand updating doesn't work. */ if (gimple_vuse (stmt)) vinfo->any_known_not_updated_vssa = true; - simd_clone_info.safe_push (bestn->decl); - for (i = 0; i < bestn->simdclone->nargs; i++) + + data.clone = bestn; + data.clone_inbranch = bestn_inbranch; + + simd_clone_info.safe_push (NULL_TREE); + for (i = 0; + i < (bestn_inbranch ? bestn_inbranch : bestn)->simdclone->nargs; i++) { - switch (bestn->simdclone->args[i].arg_type) + if (loop_vinfo + && LOOP_VINFO_CAN_USE_PARTIAL_VECTORS_P (loop_vinfo) + && (bestn_inbranch->simdclone->args[i].arg_type + == SIMD_CLONE_ARG_TYPE_MASK)) { - default: - continue; - case SIMD_CLONE_ARG_TYPE_LINEAR_CONSTANT_STEP: - case SIMD_CLONE_ARG_TYPE_LINEAR_REF_CONSTANT_STEP: - { - simd_clone_info.safe_grow_cleared (i * 3 + 1, true); - simd_clone_info.safe_push (arginfo[i].op); - tree lst = POINTER_TYPE_P (TREE_TYPE (arginfo[i].op)) - ? size_type_node : TREE_TYPE (arginfo[i].op); - tree ls = build_int_cst (lst, arginfo[i].linear_step); - simd_clone_info.safe_push (ls); - tree sll = arginfo[i].simd_lane_linear - ? boolean_true_node : boolean_false_node; - simd_clone_info.safe_push (sll); - } - break; - case SIMD_CLONE_ARG_TYPE_MASK: - if (loop_vinfo - && LOOP_VINFO_CAN_USE_PARTIAL_VECTORS_P (loop_vinfo)) + if (masked_call_offset) + /* When there is an explicit mask we require the + number of elements to match up. */ + vect_record_loop_mask (loop_vinfo, + &LOOP_VINFO_MASKS (loop_vinfo), + ncopies_in, vectype, NULL_TREE); + else { - if (masked_call_offset) - /* When there is an explicit mask we require the - number of elements to match up. */ - vect_record_loop_mask (loop_vinfo, - &LOOP_VINFO_MASKS (loop_vinfo), - ncopies_in, vectype, NULL_TREE); + /* When there is no explicit mask on the call we have + more relaxed requirements. */ + tree masktype; + poly_uint64 callee_nelements; + if (SCALAR_INT_MODE_P (bestn_inbranch->simdclone->mask_mode)) + { + callee_nelements + = exact_div (bestn_inbranch->simdclone->simdlen, + bestn_inbranch->simdclone->args[i].linear_step); + masktype = get_related_vectype_for_scalar_type + (vinfo->vector_mode, TREE_TYPE (vectype), + callee_nelements); + } else { - /* When there is no explicit mask on the call we have - more relaxed requirements. */ - tree masktype; - poly_uint64 callee_nelements; - if (SCALAR_INT_MODE_P (bestn->simdclone->mask_mode)) - { - callee_nelements - = exact_div (bestn->simdclone->simdlen, - bestn->simdclone->args[i].linear_step); - masktype = get_related_vectype_for_scalar_type - (vinfo->vector_mode, TREE_TYPE (vectype), - callee_nelements); - } - else - { - masktype = bestn->simdclone->args[i].vector_type; - callee_nelements = TYPE_VECTOR_SUBPARTS (masktype); - } - auto o = vector_unroll_factor (nunits, callee_nelements); - vect_record_loop_mask (loop_vinfo, - &LOOP_VINFO_MASKS (loop_vinfo), - ncopies * o, masktype, NULL_TREE); + masktype = bestn_inbranch->simdclone->args[i].vector_type; + callee_nelements = TYPE_VECTOR_SUBPARTS (masktype); } + auto o = vector_unroll_factor (nunits, callee_nelements); + vect_record_loop_mask (loop_vinfo, + &LOOP_VINFO_MASKS (loop_vinfo), + ncopies * o, masktype, NULL_TREE); } - break; } - } - - if (!bestn->simdclone->inbranch && loop_vinfo) - { - if (dump_enabled_p () - && LOOP_VINFO_CAN_USE_PARTIAL_VECTORS_P (loop_vinfo)) - dump_printf_loc (MSG_NOTE, vect_location, - "can't use a fully-masked loop because a" - " non-masked simd clone was selected.\n"); - LOOP_VINFO_CAN_USE_PARTIAL_VECTORS_P (loop_vinfo) = false; + else if ((bestn->simdclone->args[i].arg_type + == SIMD_CLONE_ARG_TYPE_LINEAR_CONSTANT_STEP) + || (bestn->simdclone->args[i].arg_type + == SIMD_CLONE_ARG_TYPE_LINEAR_REF_CONSTANT_STEP) + || (bestn_inbranch + && ((bestn_inbranch->simdclone->args[i].arg_type + == SIMD_CLONE_ARG_TYPE_LINEAR_CONSTANT_STEP) + || (bestn_inbranch->simdclone->args[i].arg_type + == SIMD_CLONE_ARG_TYPE_LINEAR_REF_CONSTANT_STEP)))) + { + simd_clone_info.safe_grow_cleared (i * 3 + 1, true); + simd_clone_info.safe_push (arginfo[i].op); + tree lst = (POINTER_TYPE_P (TREE_TYPE (arginfo[i].op)) + ? size_type_node : TREE_TYPE (arginfo[i].op)); + tree ls = build_int_cst (lst, arginfo[i].linear_step); + simd_clone_info.safe_push (ls); + tree sll = (arginfo[i].simd_lane_linear + ? boolean_true_node : boolean_false_node); + simd_clone_info.safe_push (sll); + } } SLP_TREE_TYPE (slp_node) = call_simd_clone_vec_info_type; diff --git a/gcc/tree-vectorizer.h b/gcc/tree-vectorizer.h index 0356b129e36..606133f9172 100644 --- a/gcc/tree-vectorizer.h +++ b/gcc/tree-vectorizer.h @@ -265,8 +265,12 @@ struct vect_simd_clone_data : vect_data { vect_simd_clone_data () = default; vect_simd_clone_data (vect_simd_clone_data &&other) = default; + /* Selected SIMD clone and clone for in-branch. */ + cgraph_node *clone; + cgraph_node *clone_inbranch; + /* Selected SIMD clone's function info. First vector element - is SIMD clone's function decl, followed by a pair of trees (base + step) + is NULL_TREE, followed by a pair of trees (base + step) for linear arguments (pair of NULLs for other arguments). */ auto_vec simd_clone_info; };