From: Andrew Stubbs
Date: Mon, 28 Jul 2025 13:58:03 +0000 (+0000)
Subject: vect: Add target hook to prefer gather/scatter instructions
X-Git-Url: http://git.ipfire.org/gitweb/gitweb.cgi?a=commitdiff_plain;h=49cb093be7f293bbceb9c822ca3034c55bd95e4d;p=thirdparty%2Fgcc.git

vect: Add target hook to prefer gather/scatter instructions

For AMD GCN, the instructions available for loading/storing vectors are
always scatter/gather operations (i.e. there are separate addresses for
each vector lane), so the current heuristic to avoid gather/scatter
operations with too many elements in get_group_load_store_type is
counterproductive.

Avoiding such operations in that function can subsequently lead to a
missed vectorization opportunity whereby later analyses in the
vectorizer try to use a very wide array type which is not available on
this target, so the vectorizer bails out.

This patch adds a target hook to override the "single_element_p"
heuristic in that function, and activates it for GCN.  This allows much
better code to be generated for affected loops.

Co-authored-by: Julian Brown

gcc/
	* doc/tm.texi.in (TARGET_VECTORIZE_PREFER_GATHER_SCATTER): Add
	documentation hook.
	* doc/tm.texi: Regenerate.
	* target.def (prefer_gather_scatter): Add target hook under
	vectorizer.
	* hooks.cc (hook_bool_mode_int_unsigned_false): New function.
	* hooks.h (hook_bool_mode_int_unsigned_false): New prototype.
	* tree-vect-stmts.cc (vect_use_strided_gather_scatters_p): Add
	parameters group_size and single_element_p, and rework to use
	targetm.vectorize.prefer_gather_scatter.
	(get_group_load_store_type): Move some of the condition into
	vect_use_strided_gather_scatters_p.
	* config/gcn/gcn.cc (gcn_prefer_gather_scatter): New function.
	(TARGET_VECTORIZE_PREFER_GATHER_SCATTER): Define hook.

(cherry picked from commit 36c5a7aa9a6dbaed07e3a2482c66743ddcb3e776)
---

diff --git a/gcc/config/gcn/gcn.cc b/gcc/config/gcn/gcn.cc
index d21d1f7d8f5..a6f37318ebe 100644
--- a/gcc/config/gcn/gcn.cc
+++ b/gcc/config/gcn/gcn.cc
@@ -5795,6 +5795,16 @@ gcn_libc_has_function (enum function_class fn_class,
   return bsd_libc_has_function (fn_class, type);
 }
 
+/* Implement TARGET_VECTORIZE_PREFER_GATHER_SCATTER. */
+
+static bool
+gcn_prefer_gather_scatter (machine_mode ARG_UNUSED (mode),
+			   int ARG_UNUSED (scale),
+			   unsigned int ARG_UNUSED (group_size))
+{
+  return true;
+}
+
 /* }}} */
 /* {{{ md_reorg pass. */
 
@@ -7994,6 +8004,8 @@ gcn_dwarf_register_span (rtx rtl)
   gcn_vectorize_builtin_vectorized_function
 #undef TARGET_VECTORIZE_GET_MASK_MODE
 #define TARGET_VECTORIZE_GET_MASK_MODE gcn_vectorize_get_mask_mode
+#undef TARGET_VECTORIZE_PREFER_GATHER_SCATTER
+#define TARGET_VECTORIZE_PREFER_GATHER_SCATTER gcn_prefer_gather_scatter
 #undef TARGET_VECTORIZE_PREFERRED_SIMD_MODE
 #define TARGET_VECTORIZE_PREFERRED_SIMD_MODE gcn_vectorize_preferred_simd_mode
 #undef TARGET_VECTORIZE_PREFERRED_VECTOR_ALIGNMENT
diff --git a/gcc/doc/tm.texi b/gcc/doc/tm.texi
index a96700c0d38..ff52c4f0bcf 100644
--- a/gcc/doc/tm.texi
+++ b/gcc/doc/tm.texi
@@ -6515,6 +6515,15 @@ The default is @code{NULL_TREE} which means to not vectorize scatter
 stores.
 @end deftypefn
 
+@deftypefn {Target Hook} bool TARGET_VECTORIZE_PREFER_GATHER_SCATTER (machine_mode @var{mode}, int @var{scale}, unsigned int @var{group_size})
+This hook returns TRUE if gather loads or scatter stores are cheaper on
+this target than a sequence of elementwise loads or stores.  The @var{mode}
+and @var{scale} correspond to the @code{gather_load} and
+@code{scatter_store} instruction patterns.  The @var{group_size} is the
+number of scalar elements in each scalar loop iteration that are to be
+combined into the vector.
+@end deftypefn
+
 @deftypefn {Target Hook} int TARGET_SIMD_CLONE_COMPUTE_VECSIZE_AND_SIMDLEN (struct cgraph_node *@var{}, struct cgraph_simd_clone *@var{}, @var{tree}, @var{int}, @var{bool})
 This hook should set @var{vecsize_mangle}, @var{vecsize_int}, @var{vecsize_float}
 fields in @var{simd_clone} structure pointed by @var{clone_info} argument and also
diff --git a/gcc/doc/tm.texi.in b/gcc/doc/tm.texi.in
index eccc4d88493..b03ad4c97c6 100644
--- a/gcc/doc/tm.texi.in
+++ b/gcc/doc/tm.texi.in
@@ -4311,6 +4311,8 @@ address; but often a machine-dependent strategy can generate better code.
 
 @hook TARGET_VECTORIZE_BUILTIN_SCATTER
 
+@hook TARGET_VECTORIZE_PREFER_GATHER_SCATTER
+
 @hook TARGET_SIMD_CLONE_COMPUTE_VECSIZE_AND_SIMDLEN
 
 @hook TARGET_SIMD_CLONE_ADJUST
diff --git a/gcc/hooks.cc b/gcc/hooks.cc
index 951825d4cf6..76cb593103d 100644
--- a/gcc/hooks.cc
+++ b/gcc/hooks.cc
@@ -117,6 +117,13 @@ hook_bool_mode_const_rtx_true (machine_mode, const_rtx)
   return true;
 }
 
+/* Generic hook that takes (machine_mode, int, unsigned) and returns false. */
+bool
+hook_bool_mode_int_unsigned_false (machine_mode, int, unsigned)
+{
+  return false;
+}
+
 /* Generic hook that takes (machine_mode, rtx) and returns false. */
 bool
 hook_bool_mode_rtx_false (machine_mode, rtx)
diff --git a/gcc/hooks.h b/gcc/hooks.h
index c0663bf4455..e95bd11aca8 100644
--- a/gcc/hooks.h
+++ b/gcc/hooks.h
@@ -36,6 +36,7 @@ extern bool hook_bool_mode_true (machine_mode);
 extern bool hook_bool_mode_mode_true (machine_mode, machine_mode);
 extern bool hook_bool_mode_const_rtx_false (machine_mode, const_rtx);
 extern bool hook_bool_mode_const_rtx_true (machine_mode, const_rtx);
+extern bool hook_bool_mode_int_unsigned_false (machine_mode, int, unsigned);
 extern bool hook_bool_mode_rtx_false (machine_mode, rtx);
 extern bool hook_bool_mode_rtx_true (machine_mode, rtx);
 extern bool hook_bool_const_rtx_insn_const_rtx_insn_true (const rtx_insn *,
diff --git a/gcc/target.def b/gcc/target.def
index 6c7cdc8126b..c63113149d7 100644
--- a/gcc/target.def
+++ b/gcc/target.def
@@ -2056,6 +2056,20 @@ all zeros. GCC can then try to branch around the instruction instead.",
  (unsigned ifn),
  default_empty_mask_is_expensive)
 
+/* Prefer gather/scatter loads/stores to e.g. elementwise accesses if\n\
+we cannot use a contiguous access. */
+DEFHOOK
+(prefer_gather_scatter,
+ "This hook returns TRUE if gather loads or scatter stores are cheaper on\n\
+this target than a sequence of elementwise loads or stores.  The @var{mode}\n\
+and @var{scale} correspond to the @code{gather_load} and\n\
+@code{scatter_store} instruction patterns.  The @var{group_size} is the\n\
+number of scalar elements in each scalar loop iteration that are to be\n\
+combined into the vector.",
+ bool,
+ (machine_mode mode, int scale, unsigned int group_size),
+ hook_bool_mode_int_unsigned_false)
+
 /* Target builtin that implements vector gather operation. */
 DEFHOOK
 (builtin_gather,
diff --git a/gcc/tree-vect-stmts.cc b/gcc/tree-vect-stmts.cc
index 978a4626b35..f09e06b3ae8 100644
--- a/gcc/tree-vect-stmts.cc
+++ b/gcc/tree-vect-stmts.cc
@@ -1822,21 +1822,35 @@ vect_truncate_gather_scatter_offset (stmt_vec_info stmt_info,
 
 static bool
 vect_use_strided_gather_scatters_p (stmt_vec_info stmt_info,
+				    tree vectype,
 				    loop_vec_info loop_vinfo, bool masked_p,
 				    gather_scatter_info *gs_info,
-				    vec<int> *elsvals)
+				    vec<int> *elsvals,
+				    unsigned int group_size,
+				    bool single_element_p)
 {
   if (!vect_check_gather_scatter (stmt_info, loop_vinfo, gs_info, elsvals)
       || gs_info->ifn == IFN_LAST)
-    return vect_truncate_gather_scatter_offset (stmt_info, loop_vinfo,
-						masked_p, gs_info, elsvals);
+    {
+      if (!vect_truncate_gather_scatter_offset (stmt_info, loop_vinfo,
+						masked_p, gs_info, elsvals))
+	return false;
+    }
+  else
+    {
+      tree old_offset_type = TREE_TYPE (gs_info->offset);
+      tree new_offset_type = TREE_TYPE (gs_info->offset_vectype);
 
-  tree old_offset_type = TREE_TYPE (gs_info->offset);
-  tree new_offset_type = TREE_TYPE (gs_info->offset_vectype);
+      gcc_assert (TYPE_PRECISION (new_offset_type)
+		  >= TYPE_PRECISION (old_offset_type));
+      gs_info->offset = fold_convert (new_offset_type, gs_info->offset);
+    }
 
-  gcc_assert (TYPE_PRECISION (new_offset_type)
-	      >= TYPE_PRECISION (old_offset_type));
-  gs_info->offset = fold_convert (new_offset_type, gs_info->offset);
+  if (!single_element_p
+      && !targetm.vectorize.prefer_gather_scatter (TYPE_MODE (vectype),
+						   gs_info->scale,
+						   group_size))
+    return false;
 
   if (dump_enabled_p ())
     dump_printf_loc (MSG_NOTE, vect_location,
@@ -2397,11 +2411,11 @@ get_group_load_store_type (vec_info *vinfo, stmt_vec_info stmt_info,
      allows us to use contiguous accesses. */
   if ((*memory_access_type == VMAT_ELEMENTWISE
        || *memory_access_type == VMAT_STRIDED_SLP)
-      && single_element_p
       && (!slp_node || SLP_TREE_LANES (slp_node) == 1)
       && loop_vinfo
-      && vect_use_strided_gather_scatters_p (stmt_info, loop_vinfo,
-					     masked_p, gs_info, elsvals))
+      && vect_use_strided_gather_scatters_p (stmt_info, vectype, loop_vinfo,
+					     masked_p, gs_info, elsvals,
+					     group_size, single_element_p))
     *memory_access_type = VMAT_GATHER_SCATTER;
 
   if (*memory_access_type == VMAT_CONTIGUOUS_DOWN
@@ -2558,8 +2572,9 @@ get_load_store_type (vec_info *vinfo, stmt_vec_info stmt_info,
     {
       gcc_assert (!slp_node);
       if (loop_vinfo
-	  && vect_use_strided_gather_scatters_p (stmt_info, loop_vinfo,
-						 masked_p, gs_info, elsvals))
+	  && vect_use_strided_gather_scatters_p (stmt_info, vectype, loop_vinfo,
+						 masked_p, gs_info, elsvals,
+						 1, false))
 	*memory_access_type = VMAT_GATHER_SCATTER;
       else
 	*memory_access_type = VMAT_ELEMENTWISE;
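
For reference, here is a minimal sketch, not part of the patch, of how another
backend might wire up the new hook.  The hook signature and the #undef/#define
idiom are taken from the target.def and gcn.cc hunks above; the
example_prefer_gather_scatter name and the group-size threshold are
hypothetical, chosen only to illustrate using the group_size argument rather
than returning true unconditionally as GCN does.

/* Hypothetical backend sketch, to be placed in a target's backend .cc file:
   prefer gather/scatter only when each scalar iteration contributes more
   than two elements.  The threshold is an arbitrary example value; a real
   port would base it on its own cost model.  */

static bool
example_prefer_gather_scatter (machine_mode ARG_UNUSED (mode),
			       int ARG_UNUSED (scale),
			       unsigned int group_size)
{
  return group_size > 2;
}

#undef TARGET_VECTORIZE_PREFER_GATHER_SCATTER
#define TARGET_VECTORIZE_PREFER_GATHER_SCATTER example_prefer_gather_scatter

A target whose vector memory accesses are always gathers/scatters, like GCN
above, simply returns true for every mode, scale and group size.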