/* Data References Analysis and Manipulation Utilities for Vectorization.
   Copyright (C) 2003-2020 Free Software Foundation, Inc.
   Contributed by Dorit Naishlos <dorit@il.ibm.com>
   and Ira Rosen <irar@il.ibm.com>

This file is part of GCC.

GCC is free software; you can redistribute it and/or modify it under
the terms of the GNU General Public License as published by the Free
Software Foundation; either version 3, or (at your option) any later
version.

GCC is distributed in the hope that it will be useful, but WITHOUT ANY
WARRANTY; without even the implied warranty of MERCHANTABILITY or
FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
for more details.

You should have received a copy of the GNU General Public License
along with GCC; see the file COPYING3.  If not see
<http://www.gnu.org/licenses/>.  */

#include "config.h"
#include "system.h"
#include "coretypes.h"
#include "backend.h"
#include "target.h"
#include "rtl.h"
#include "tree.h"
#include "gimple.h"
#include "predict.h"
#include "memmodel.h"
#include "tm_p.h"
#include "ssa.h"
#include "optabs-tree.h"
#include "cgraph.h"
#include "dumpfile.h"
#include "alias.h"
#include "fold-const.h"
#include "stor-layout.h"
#include "tree-eh.h"
#include "gimplify.h"
#include "gimple-iterator.h"
#include "gimplify-me.h"
#include "tree-ssa-loop-ivopts.h"
#include "tree-ssa-loop-manip.h"
#include "tree-ssa-loop.h"
#include "cfgloop.h"
#include "tree-scalar-evolution.h"
#include "tree-vectorizer.h"
#include "expr.h"
#include "builtins.h"
#include "tree-cfg.h"
#include "tree-hash-traits.h"
#include "vec-perm-indices.h"
#include "internal-fn.h"

/* Return true if load- or store-lanes optab OPTAB is implemented for
   COUNT vectors of type VECTYPE.  NAME is the name of OPTAB.  */

static bool
vect_lanes_optab_supported_p (const char *name, convert_optab optab,
                              tree vectype, unsigned HOST_WIDE_INT count)
{
  machine_mode mode, array_mode;
  bool limit_p;

  mode = TYPE_MODE (vectype);
  if (!targetm.array_mode (mode, count).exists (&array_mode))
    {
      poly_uint64 bits = count * GET_MODE_BITSIZE (mode);
      limit_p = !targetm.array_mode_supported_p (mode, count);
      if (!int_mode_for_size (bits, limit_p).exists (&array_mode))
        {
          if (dump_enabled_p ())
            dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
                             "no array mode for %s[%wu]\n",
                             GET_MODE_NAME (mode), count);
          return false;
        }
    }

  if (convert_optab_handler (optab, array_mode, mode) == CODE_FOR_nothing)
    {
      if (dump_enabled_p ())
        dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
                         "cannot use %s<%s><%s>\n", name,
                         GET_MODE_NAME (array_mode), GET_MODE_NAME (mode));
      return false;
    }

  if (dump_enabled_p ())
    dump_printf_loc (MSG_NOTE, vect_location,
                     "can use %s<%s><%s>\n", name, GET_MODE_NAME (array_mode),
                     GET_MODE_NAME (mode));

  return true;
}

/* Return the smallest scalar part of STMT_INFO.
   This is used to determine the vectype of the stmt.  We generally set the
   vectype according to the type of the result (lhs).  For stmts whose
   result-type is different than the type of the arguments (e.g., demotion,
   promotion), vectype will be reset appropriately (later).  Note that we have
   to visit the smallest datatype in this function, because that determines the
   VF.  If the smallest datatype in the loop is present only as the rhs of a
   promotion operation - we'd miss it.
   Such a case, where a variable of this datatype does not appear in the lhs
   anywhere in the loop, can only occur if it's an invariant: e.g.:
   'int_x = (int) short_inv', which we'd expect to have been optimized away by
   invariant motion.  However, we cannot rely on invariant motion to always
   take invariants out of the loop, and so in the case of promotion we also
   have to check the rhs.
   LHS_SIZE_UNIT and RHS_SIZE_UNIT contain the sizes of the corresponding
   types.  */

tree
vect_get_smallest_scalar_type (stmt_vec_info stmt_info,
                               HOST_WIDE_INT *lhs_size_unit,
                               HOST_WIDE_INT *rhs_size_unit)
{
  tree scalar_type = gimple_expr_type (stmt_info->stmt);
  HOST_WIDE_INT lhs, rhs;

  /* During the analysis phase, this function is called on arbitrary
     statements that might not have scalar results.  */
  if (!tree_fits_uhwi_p (TYPE_SIZE_UNIT (scalar_type)))
    return scalar_type;

  lhs = rhs = TREE_INT_CST_LOW (TYPE_SIZE_UNIT (scalar_type));

  gassign *assign = dyn_cast <gassign *> (stmt_info->stmt);
  if (assign
      && (gimple_assign_cast_p (assign)
          || gimple_assign_rhs_code (assign) == DOT_PROD_EXPR
          || gimple_assign_rhs_code (assign) == WIDEN_SUM_EXPR
          || gimple_assign_rhs_code (assign) == WIDEN_MULT_EXPR
          || gimple_assign_rhs_code (assign) == WIDEN_LSHIFT_EXPR
          || gimple_assign_rhs_code (assign) == FLOAT_EXPR))
    {
      tree rhs_type = TREE_TYPE (gimple_assign_rhs1 (assign));

      rhs = TREE_INT_CST_LOW (TYPE_SIZE_UNIT (rhs_type));
      if (rhs < lhs)
        scalar_type = rhs_type;
    }
  else if (gcall *call = dyn_cast <gcall *> (stmt_info->stmt))
    {
      unsigned int i = 0;
      if (gimple_call_internal_p (call))
        {
          internal_fn ifn = gimple_call_internal_fn (call);
          if (internal_load_fn_p (ifn) || internal_store_fn_p (ifn))
            /* gimple_expr_type already picked the type of the loaded
               or stored data.  */
            i = ~0U;
          else if (internal_fn_mask_index (ifn) == 0)
            i = 1;
        }
      if (i < gimple_call_num_args (call))
        {
          tree rhs_type = TREE_TYPE (gimple_call_arg (call, i));
          if (tree_fits_uhwi_p (TYPE_SIZE_UNIT (rhs_type)))
            {
              rhs = TREE_INT_CST_LOW (TYPE_SIZE_UNIT (rhs_type));
              if (rhs < lhs)
                scalar_type = rhs_type;
            }
        }
    }

  *lhs_size_unit = lhs;
  *rhs_size_unit = rhs;
  return scalar_type;
}

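/* For illustration (not part of the upstream sources): in a loop such as

     short *s;
     int *x;
     for (int i = 0; i < n; i++)
       x[i] = (int) s[i] * 2;

   the smallest scalar type (short) occurs only as the rhs of the
   promotion, so looking at the lhs types alone would over-estimate the
   vectorization factor.  This is the situation the rhs checks above
   handle.  */
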
/* Insert DDR into LOOP_VINFO list of ddrs that may alias and need to be
   tested at run-time.  Return TRUE if DDR was successfully inserted.
   Return false if versioning is not supported.  */

static opt_result
vect_mark_for_runtime_alias_test (ddr_p ddr, loop_vec_info loop_vinfo)
{
  class loop *loop = LOOP_VINFO_LOOP (loop_vinfo);

  if ((unsigned) param_vect_max_version_for_alias_checks == 0)
    return opt_result::failure_at (vect_location,
                                   "will not create alias checks, as"
                                   " --param vect-max-version-for-alias-checks"
                                   " == 0\n");

  opt_result res
    = runtime_alias_check_p (ddr, loop,
                             optimize_loop_nest_for_speed_p (loop));
  if (!res)
    return res;

  LOOP_VINFO_MAY_ALIAS_DDRS (loop_vinfo).safe_push (ddr);
  return opt_result::success ();
}

/* Record that loop LOOP_VINFO needs to check that VALUE is nonzero.  */

static void
vect_check_nonzero_value (loop_vec_info loop_vinfo, tree value)
{
  vec<tree> checks = LOOP_VINFO_CHECK_NONZERO (loop_vinfo);
  for (unsigned int i = 0; i < checks.length (); ++i)
    if (checks[i] == value)
      return;

  if (dump_enabled_p ())
    dump_printf_loc (MSG_NOTE, vect_location,
                     "need run-time check that %T is nonzero\n",
                     value);
  LOOP_VINFO_CHECK_NONZERO (loop_vinfo).safe_push (value);
}

/* Return true if we know that the order of vectorized DR_INFO_A and
   vectorized DR_INFO_B will be the same as the order of DR_INFO_A and
   DR_INFO_B.  At least one of the accesses is a write.  */

static bool
vect_preserves_scalar_order_p (dr_vec_info *dr_info_a, dr_vec_info *dr_info_b)
{
  stmt_vec_info stmtinfo_a = dr_info_a->stmt;
  stmt_vec_info stmtinfo_b = dr_info_b->stmt;

  /* Single statements are always kept in their original order.  */
  if (!STMT_VINFO_GROUPED_ACCESS (stmtinfo_a)
      && !STMT_VINFO_GROUPED_ACCESS (stmtinfo_b))
    return true;

  /* STMT_A and STMT_B belong to overlapping groups.  All loads are
     emitted at the position of the first scalar load.
     Stores in a group are emitted at the position of the last scalar store.
     Compute that position and check whether the resulting order matches
     the current one.  */
  stmt_vec_info il_a = DR_GROUP_FIRST_ELEMENT (stmtinfo_a);
  if (il_a)
    {
      if (DR_IS_WRITE (STMT_VINFO_DATA_REF (stmtinfo_a)))
        for (stmt_vec_info s = DR_GROUP_NEXT_ELEMENT (il_a); s;
             s = DR_GROUP_NEXT_ELEMENT (s))
          il_a = get_later_stmt (il_a, s);
      else /* DR_IS_READ */
        for (stmt_vec_info s = DR_GROUP_NEXT_ELEMENT (il_a); s;
             s = DR_GROUP_NEXT_ELEMENT (s))
          if (get_later_stmt (il_a, s) == il_a)
            il_a = s;
    }
  else
    il_a = stmtinfo_a;
  stmt_vec_info il_b = DR_GROUP_FIRST_ELEMENT (stmtinfo_b);
  if (il_b)
    {
      if (DR_IS_WRITE (STMT_VINFO_DATA_REF (stmtinfo_b)))
        for (stmt_vec_info s = DR_GROUP_NEXT_ELEMENT (il_b); s;
             s = DR_GROUP_NEXT_ELEMENT (s))
          il_b = get_later_stmt (il_b, s);
      else /* DR_IS_READ */
        for (stmt_vec_info s = DR_GROUP_NEXT_ELEMENT (il_b); s;
             s = DR_GROUP_NEXT_ELEMENT (s))
          if (get_later_stmt (il_b, s) == il_b)
            il_b = s;
    }
  else
    il_b = stmtinfo_b;
  bool a_after_b = (get_later_stmt (stmtinfo_a, stmtinfo_b) == stmtinfo_a);
  return (get_later_stmt (il_a, il_b) == il_a) == a_after_b;
}

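/* Worked example (illustrative only): with a store group { a[i], a[i+1] }
   interleaved with a load group { a[i], a[i+1] } as in

     a[i] = ...;      <-- vector store will be emitted at the last store
     ... = a[i];      <-- vector load will be emitted here (first load)
     a[i+1] = ...;
     ... = a[i+1];

   the scalar load of a[i+1] follows the store to a[i+1], but the vector
   load is emitted before that store, so for that pair the function
   returns false.  */
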
/* A subroutine of vect_analyze_data_ref_dependence.  Handle
   DDR_COULD_BE_INDEPENDENT_P ddr DDR that has a known set of dependence
   distances.  These distances are conservatively correct but they don't
   reflect a guaranteed dependence.

   Return true if this function does all the work necessary to avoid
   an alias or false if the caller should use the dependence distances
   to limit the vectorization factor in the usual way.  LOOP_DEPTH is
   the depth of the loop described by LOOP_VINFO and the other arguments
   are as for vect_analyze_data_ref_dependence.  */

static bool
vect_analyze_possibly_independent_ddr (data_dependence_relation *ddr,
                                       loop_vec_info loop_vinfo,
                                       int loop_depth, unsigned int *max_vf)
{
  class loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
  lambda_vector dist_v;
  unsigned int i;
  FOR_EACH_VEC_ELT (DDR_DIST_VECTS (ddr), i, dist_v)
    {
      int dist = dist_v[loop_depth];
      if (dist != 0 && !(dist > 0 && DDR_REVERSED_P (ddr)))
        {
          /* If the user asserted safelen >= DIST consecutive iterations
             can be executed concurrently, assume independence.

             ??? An alternative would be to add the alias check even
             in this case, and vectorize the fallback loop with the
             maximum VF set to safelen.  However, if the user has
             explicitly given a length, it's less likely that that
             would be a win.  */
          if (loop->safelen >= 2 && abs_hwi (dist) <= loop->safelen)
            {
              if ((unsigned int) loop->safelen < *max_vf)
                *max_vf = loop->safelen;
              LOOP_VINFO_NO_DATA_DEPENDENCIES (loop_vinfo) = false;
              continue;
            }

          /* For dependence distances of 2 or more, we have the option
             of limiting VF or checking for an alias at runtime.
             Prefer to check at runtime if we can, to avoid limiting
             the VF unnecessarily when the bases are in fact independent.

             Note that the alias checks will be removed if the VF ends up
             being small enough.  */
          dr_vec_info *dr_info_a = loop_vinfo->lookup_dr (DDR_A (ddr));
          dr_vec_info *dr_info_b = loop_vinfo->lookup_dr (DDR_B (ddr));
          return (!STMT_VINFO_GATHER_SCATTER_P (dr_info_a->stmt)
                  && !STMT_VINFO_GATHER_SCATTER_P (dr_info_b->stmt)
                  && vect_mark_for_runtime_alias_test (ddr, loop_vinfo));
        }
    }
  return true;
}

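/* For example (illustrative only): with

     #pragma omp simd safelen(8)
     for (i = 0; i < n; i++)
       p[i] = q[i] + 1;

   a conservative dependence distance of up to 8 between the P and Q
   accesses is ignored by the safelen branch above, and *MAX_VF is merely
   clamped to 8.  Without the safelen guarantee the function instead
   prefers to queue a runtime alias check, so the VF need not be limited
   when the bases turn out to be independent.  */
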
/* Function vect_analyze_data_ref_dependence.

   FIXME: I needed to change the sense of the returned flag.

   Return FALSE if there (might) exist a dependence between a memory-reference
   DRA and a memory-reference DRB.  When versioning for alias may check a
   dependence at run-time, return TRUE.  Adjust *MAX_VF according to
   the data dependence.  */

static opt_result
vect_analyze_data_ref_dependence (struct data_dependence_relation *ddr,
                                  loop_vec_info loop_vinfo,
                                  unsigned int *max_vf)
{
  unsigned int i;
  class loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
  struct data_reference *dra = DDR_A (ddr);
  struct data_reference *drb = DDR_B (ddr);
  dr_vec_info *dr_info_a = loop_vinfo->lookup_dr (dra);
  dr_vec_info *dr_info_b = loop_vinfo->lookup_dr (drb);
  stmt_vec_info stmtinfo_a = dr_info_a->stmt;
  stmt_vec_info stmtinfo_b = dr_info_b->stmt;
  lambda_vector dist_v;
  unsigned int loop_depth;

  /* In loop analysis all data references should be vectorizable.  */
  if (!STMT_VINFO_VECTORIZABLE (stmtinfo_a)
      || !STMT_VINFO_VECTORIZABLE (stmtinfo_b))
    gcc_unreachable ();

  /* Independent data accesses.  */
  if (DDR_ARE_DEPENDENT (ddr) == chrec_known)
    return opt_result::success ();

  if (dra == drb
      || (DR_IS_READ (dra) && DR_IS_READ (drb)))
    return opt_result::success ();

  /* We do not have to consider dependences between accesses that belong
     to the same group, unless the stride could be smaller than the
     group size.  */
  if (DR_GROUP_FIRST_ELEMENT (stmtinfo_a)
      && (DR_GROUP_FIRST_ELEMENT (stmtinfo_a)
          == DR_GROUP_FIRST_ELEMENT (stmtinfo_b))
      && !STMT_VINFO_STRIDED_P (stmtinfo_a))
    return opt_result::success ();

  /* Even if we have an anti-dependence then, as the vectorized loop covers at
     least two scalar iterations, there is always also a true dependence.
     As the vectorizer does not re-order loads and stores we can ignore
     the anti-dependence if TBAA can disambiguate both DRs similar to the
     case with known negative distance anti-dependences (positive
     distance anti-dependences would violate TBAA constraints).  */
  if (((DR_IS_READ (dra) && DR_IS_WRITE (drb))
       || (DR_IS_WRITE (dra) && DR_IS_READ (drb)))
      && !alias_sets_conflict_p (get_alias_set (DR_REF (dra)),
                                 get_alias_set (DR_REF (drb))))
    return opt_result::success ();

  /* Unknown data dependence.  */
  if (DDR_ARE_DEPENDENT (ddr) == chrec_dont_know)
    {
      /* If user asserted safelen consecutive iterations can be
         executed concurrently, assume independence.  */
      if (loop->safelen >= 2)
        {
          if ((unsigned int) loop->safelen < *max_vf)
            *max_vf = loop->safelen;
          LOOP_VINFO_NO_DATA_DEPENDENCIES (loop_vinfo) = false;
          return opt_result::success ();
        }

      if (STMT_VINFO_GATHER_SCATTER_P (stmtinfo_a)
          || STMT_VINFO_GATHER_SCATTER_P (stmtinfo_b))
        return opt_result::failure_at
          (stmtinfo_a->stmt,
           "versioning for alias not supported for: "
           "can't determine dependence between %T and %T\n",
           DR_REF (dra), DR_REF (drb));

      if (dump_enabled_p ())
        dump_printf_loc (MSG_MISSED_OPTIMIZATION, stmtinfo_a->stmt,
                         "versioning for alias required: "
                         "can't determine dependence between %T and %T\n",
                         DR_REF (dra), DR_REF (drb));

      /* Add to list of ddrs that need to be tested at run-time.  */
      return vect_mark_for_runtime_alias_test (ddr, loop_vinfo);
    }

  /* Known data dependence.  */
  if (DDR_NUM_DIST_VECTS (ddr) == 0)
    {
      /* If user asserted safelen consecutive iterations can be
         executed concurrently, assume independence.  */
      if (loop->safelen >= 2)
        {
          if ((unsigned int) loop->safelen < *max_vf)
            *max_vf = loop->safelen;
          LOOP_VINFO_NO_DATA_DEPENDENCIES (loop_vinfo) = false;
          return opt_result::success ();
        }

      if (STMT_VINFO_GATHER_SCATTER_P (stmtinfo_a)
          || STMT_VINFO_GATHER_SCATTER_P (stmtinfo_b))
        return opt_result::failure_at
          (stmtinfo_a->stmt,
           "versioning for alias not supported for: "
           "bad dist vector for %T and %T\n",
           DR_REF (dra), DR_REF (drb));

      if (dump_enabled_p ())
        dump_printf_loc (MSG_MISSED_OPTIMIZATION, stmtinfo_a->stmt,
                         "versioning for alias required: "
                         "bad dist vector for %T and %T\n",
                         DR_REF (dra), DR_REF (drb));
      /* Add to list of ddrs that need to be tested at run-time.  */
      return vect_mark_for_runtime_alias_test (ddr, loop_vinfo);
    }

  loop_depth = index_in_loop_nest (loop->num, DDR_LOOP_NEST (ddr));

  if (DDR_COULD_BE_INDEPENDENT_P (ddr)
      && vect_analyze_possibly_independent_ddr (ddr, loop_vinfo,
                                                loop_depth, max_vf))
    return opt_result::success ();

  FOR_EACH_VEC_ELT (DDR_DIST_VECTS (ddr), i, dist_v)
    {
      int dist = dist_v[loop_depth];

      if (dump_enabled_p ())
        dump_printf_loc (MSG_NOTE, vect_location,
                         "dependence distance = %d.\n", dist);

      if (dist == 0)
        {
          if (dump_enabled_p ())
            dump_printf_loc (MSG_NOTE, vect_location,
                             "dependence distance == 0 between %T and %T\n",
                             DR_REF (dra), DR_REF (drb));

          /* When we perform grouped accesses and perform implicit CSE
             by detecting equal accesses and doing disambiguation with
             runtime alias tests like for
                .. = a[i];
                .. = a[i+1];
                a[i] = ..;
                a[i+1] = ..;
                *p = ..;
                .. = a[i];
                .. = a[i+1];
             where we will end up loading { a[i], a[i+1] } once, make
             sure that inserting group loads before the first load and
             stores after the last store will do the right thing.
             Similar for groups like
                a[i] = ...;
                ... = a[i];
                a[i+1] = ...;
             where loads from the group interleave with the store.  */
          if (!vect_preserves_scalar_order_p (dr_info_a, dr_info_b))
            return opt_result::failure_at (stmtinfo_a->stmt,
                                           "READ_WRITE dependence"
                                           " in interleaving.\n");

          if (loop->safelen < 2)
            {
              tree indicator = dr_zero_step_indicator (dra);
              if (!indicator || integer_zerop (indicator))
                return opt_result::failure_at (stmtinfo_a->stmt,
                                               "access also has a zero step\n");
              else if (TREE_CODE (indicator) != INTEGER_CST)
                vect_check_nonzero_value (loop_vinfo, indicator);
            }
          continue;
        }

      if (dist > 0 && DDR_REVERSED_P (ddr))
        {
          /* If DDR_REVERSED_P the order of the data-refs in DDR was
             reversed (to make distance vector positive), and the actual
             distance is negative.  */
          if (dump_enabled_p ())
            dump_printf_loc (MSG_NOTE, vect_location,
                             "dependence distance negative.\n");
          /* When doing outer loop vectorization, we need to check if there is
             a backward dependence at the inner loop level if the dependence
             at the outer loop is reversed.  See PR81740.  */
          if (nested_in_vect_loop_p (loop, stmtinfo_a)
              || nested_in_vect_loop_p (loop, stmtinfo_b))
            {
              unsigned inner_depth = index_in_loop_nest (loop->inner->num,
                                                         DDR_LOOP_NEST (ddr));
              if (dist_v[inner_depth] < 0)
                return opt_result::failure_at (stmtinfo_a->stmt,
                                               "not vectorized, dependence "
                                               "between data-refs %T and %T\n",
                                               DR_REF (dra), DR_REF (drb));
            }
          /* Record a negative dependence distance to later limit the
             amount of stmt copying / unrolling we can perform.
             Only need to handle read-after-write dependence.  */
          if (DR_IS_READ (drb)
              && (STMT_VINFO_MIN_NEG_DIST (stmtinfo_b) == 0
                  || STMT_VINFO_MIN_NEG_DIST (stmtinfo_b) > (unsigned)dist))
            STMT_VINFO_MIN_NEG_DIST (stmtinfo_b) = dist;
          continue;
        }

      unsigned int abs_dist = abs (dist);
      if (abs_dist >= 2 && abs_dist < *max_vf)
        {
          /* The dependence distance requires reduction of the maximal
             vectorization factor.  */
          *max_vf = abs_dist;
          if (dump_enabled_p ())
            dump_printf_loc (MSG_NOTE, vect_location,
                             "adjusting maximal vectorization factor to %i\n",
                             *max_vf);
        }

      if (abs_dist >= *max_vf)
        {
          /* Dependence distance does not create dependence, as far as
             vectorization is concerned, in this case.  */
          if (dump_enabled_p ())
            dump_printf_loc (MSG_NOTE, vect_location,
                             "dependence distance >= VF.\n");
          continue;
        }

      return opt_result::failure_at (stmtinfo_a->stmt,
                                     "not vectorized, possible dependence "
                                     "between data-refs %T and %T\n",
                                     DR_REF (dra), DR_REF (drb));
    }

  return opt_result::success ();
}

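/* As an illustration of the distance handling above (not from the
   upstream sources): for

     for (i = 0; i < n; i++)
       a[i + 3] = a[i] * c;

   the dependence distance is 3, so *MAX_VF is reduced to 3; with a
   distance of 1 no vector of two or more elements is possible and the
   loop is rejected, while a distance of 0 (a[i] read and written in the
   same iteration) does not in itself prevent vectorization.  */
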
/* Function vect_analyze_data_ref_dependences.

   Examine all the data references in the loop, and make sure there do not
   exist any data dependences between them.  Set *MAX_VF according to
   the maximum vectorization factor the data dependences allow.  */

opt_result
vect_analyze_data_ref_dependences (loop_vec_info loop_vinfo,
                                   unsigned int *max_vf)
{
  unsigned int i;
  struct data_dependence_relation *ddr;

  DUMP_VECT_SCOPE ("vect_analyze_data_ref_dependences");

  if (!LOOP_VINFO_DDRS (loop_vinfo).exists ())
    {
      LOOP_VINFO_DDRS (loop_vinfo)
        .create (LOOP_VINFO_DATAREFS (loop_vinfo).length ()
                 * LOOP_VINFO_DATAREFS (loop_vinfo).length ());
      /* We need read-read dependences to compute
         STMT_VINFO_SAME_ALIGN_REFS.  */
      bool res = compute_all_dependences (LOOP_VINFO_DATAREFS (loop_vinfo),
                                          &LOOP_VINFO_DDRS (loop_vinfo),
                                          LOOP_VINFO_LOOP_NEST (loop_vinfo),
                                          true);
      gcc_assert (res);
    }

  LOOP_VINFO_NO_DATA_DEPENDENCIES (loop_vinfo) = true;

  /* For epilogues we either have no aliases or alias versioning
     was applied to original loop.  Therefore we may just get max_vf
     using VF of original loop.  */
  if (LOOP_VINFO_EPILOGUE_P (loop_vinfo))
    *max_vf = LOOP_VINFO_ORIG_MAX_VECT_FACTOR (loop_vinfo);
  else
    FOR_EACH_VEC_ELT (LOOP_VINFO_DDRS (loop_vinfo), i, ddr)
      {
        opt_result res
          = vect_analyze_data_ref_dependence (ddr, loop_vinfo, max_vf);
        if (!res)
          return res;
      }

  return opt_result::success ();
}

/* Function vect_slp_analyze_data_ref_dependence.

   Return TRUE if there (might) exist a dependence between a memory-reference
   DRA and a memory-reference DRB for VINFO.  When versioning for alias
   may check a dependence at run-time, return FALSE.  Adjust *MAX_VF
   according to the data dependence.  */

static bool
vect_slp_analyze_data_ref_dependence (vec_info *vinfo,
                                      struct data_dependence_relation *ddr)
{
  struct data_reference *dra = DDR_A (ddr);
  struct data_reference *drb = DDR_B (ddr);
  dr_vec_info *dr_info_a = vinfo->lookup_dr (dra);
  dr_vec_info *dr_info_b = vinfo->lookup_dr (drb);

  /* We need to check dependences of statements marked as unvectorizable
     as well, they still can prohibit vectorization.  */

  /* Independent data accesses.  */
  if (DDR_ARE_DEPENDENT (ddr) == chrec_known)
    return false;

  if (dra == drb)
    return false;

  /* Read-read is OK.  */
  if (DR_IS_READ (dra) && DR_IS_READ (drb))
    return false;

  /* If dra and drb are part of the same interleaving chain consider
     them independent.  */
  if (STMT_VINFO_GROUPED_ACCESS (dr_info_a->stmt)
      && (DR_GROUP_FIRST_ELEMENT (dr_info_a->stmt)
          == DR_GROUP_FIRST_ELEMENT (dr_info_b->stmt)))
    return false;

  /* Unknown data dependence.  */
  if (DDR_ARE_DEPENDENT (ddr) == chrec_dont_know)
    {
      if (dump_enabled_p ())
        dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
                         "can't determine dependence between %T and %T\n",
                         DR_REF (dra), DR_REF (drb));
    }
  else if (dump_enabled_p ())
    dump_printf_loc (MSG_NOTE, vect_location,
                     "determined dependence between %T and %T\n",
                     DR_REF (dra), DR_REF (drb));

  return true;
}

/* Analyze dependences involved in the transform of SLP NODE.  STORES
   contain the vector of scalar stores of this instance if we are
   disambiguating the loads.  */

static bool
vect_slp_analyze_node_dependences (vec_info *vinfo, slp_tree node,
                                   vec<stmt_vec_info> stores,
                                   stmt_vec_info last_store_info)
{
  /* This walks over all stmts involved in the SLP load/store done
     in NODE verifying we can sink them up to the last stmt in the
     group.  */
  if (DR_IS_WRITE (STMT_VINFO_DATA_REF (SLP_TREE_REPRESENTATIVE (node))))
    {
      stmt_vec_info last_access_info = vect_find_last_scalar_stmt_in_slp (node);
      for (unsigned k = 0; k < SLP_TREE_SCALAR_STMTS (node).length (); ++k)
        {
          stmt_vec_info access_info = SLP_TREE_SCALAR_STMTS (node)[k];
          if (access_info == last_access_info)
            continue;
          data_reference *dr_a = STMT_VINFO_DATA_REF (access_info);
          ao_ref ref;
          bool ref_initialized_p = false;
          for (gimple_stmt_iterator gsi = gsi_for_stmt (access_info->stmt);
               gsi_stmt (gsi) != last_access_info->stmt; gsi_next (&gsi))
            {
              gimple *stmt = gsi_stmt (gsi);
              if (! gimple_vuse (stmt))
                continue;

              /* If we couldn't record a (single) data reference for this
                 stmt we have to resort to the alias oracle.  */
              stmt_vec_info stmt_info = vinfo->lookup_stmt (stmt);
              data_reference *dr_b = STMT_VINFO_DATA_REF (stmt_info);
              if (!dr_b)
                {
                  /* We are moving a store - this means
                     we cannot use TBAA for disambiguation.  */
                  if (!ref_initialized_p)
                    ao_ref_init (&ref, DR_REF (dr_a));
                  if (stmt_may_clobber_ref_p_1 (stmt, &ref, false)
                      || ref_maybe_used_by_stmt_p (stmt, &ref, false))
                    return false;
                  continue;
                }

              bool dependent = false;
              /* If we run into a store of this same instance (we've just
                 marked those) then delay dependence checking until we run
                 into the last store because this is where it will have
                 been sunk to (and we verify if we can do that as well).  */
              if (gimple_visited_p (stmt))
                {
                  if (stmt_info != last_store_info)
                    continue;
                  unsigned i;
                  stmt_vec_info store_info;
                  FOR_EACH_VEC_ELT (stores, i, store_info)
                    {
                      data_reference *store_dr
                        = STMT_VINFO_DATA_REF (store_info);
                      ddr_p ddr = initialize_data_dependence_relation
                                    (dr_a, store_dr, vNULL);
                      dependent
                        = vect_slp_analyze_data_ref_dependence (vinfo, ddr);
                      free_dependence_relation (ddr);
                      if (dependent)
                        break;
                    }
                }
              else
                {
                  ddr_p ddr = initialize_data_dependence_relation (dr_a,
                                                                   dr_b, vNULL);
                  dependent = vect_slp_analyze_data_ref_dependence (vinfo, ddr);
                  free_dependence_relation (ddr);
                }
              if (dependent)
                return false;
            }
        }
    }
  else /* DR_IS_READ */
    {
      stmt_vec_info first_access_info
        = vect_find_first_scalar_stmt_in_slp (node);
      for (unsigned k = 0; k < SLP_TREE_SCALAR_STMTS (node).length (); ++k)
        {
          stmt_vec_info access_info = SLP_TREE_SCALAR_STMTS (node)[k];
          if (access_info == first_access_info)
            continue;
          data_reference *dr_a = STMT_VINFO_DATA_REF (access_info);
          ao_ref ref;
          bool ref_initialized_p = false;
          for (gimple_stmt_iterator gsi = gsi_for_stmt (access_info->stmt);
               gsi_stmt (gsi) != first_access_info->stmt; gsi_prev (&gsi))
            {
              gimple *stmt = gsi_stmt (gsi);
              if (! gimple_vdef (stmt))
                continue;

              /* If we couldn't record a (single) data reference for this
                 stmt we have to resort to the alias oracle.  */
              stmt_vec_info stmt_info = vinfo->lookup_stmt (stmt);
              data_reference *dr_b = STMT_VINFO_DATA_REF (stmt_info);
              if (!dr_b)
                {
                  /* We are hoisting a load - this means we can use
                     TBAA for disambiguation.  */
                  if (!ref_initialized_p)
                    ao_ref_init (&ref, DR_REF (dr_a));
                  if (stmt_may_clobber_ref_p_1 (stmt, &ref, true))
                    return false;
                  continue;
                }

              bool dependent = false;
              /* If we run into a store of this same instance (we've just
                 marked those) then delay dependence checking until we run
                 into the last store because this is where it will have
                 been sunk to (and we verify if we can do that as well).  */
              if (gimple_visited_p (stmt))
                {
                  if (stmt_info != last_store_info)
                    continue;
                  unsigned i;
                  stmt_vec_info store_info;
                  FOR_EACH_VEC_ELT (stores, i, store_info)
                    {
                      data_reference *store_dr
                        = STMT_VINFO_DATA_REF (store_info);
                      ddr_p ddr = initialize_data_dependence_relation
                                    (dr_a, store_dr, vNULL);
                      dependent
                        = vect_slp_analyze_data_ref_dependence (vinfo, ddr);
                      free_dependence_relation (ddr);
                      if (dependent)
                        break;
                    }
                }
              else
                {
                  ddr_p ddr = initialize_data_dependence_relation (dr_a,
                                                                   dr_b, vNULL);
                  dependent = vect_slp_analyze_data_ref_dependence (vinfo, ddr);
                  free_dependence_relation (ddr);
                }
              if (dependent)
                return false;
            }
        }
    }
  return true;
}

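/* Illustrative example (not from the upstream sources): for an SLP store
   group

     b[0] = x0;
     t = *p;          <-- intervening load
     b[1] = x1;

   the vector store will be emitted at the position of the store to b[1],
   so the store to b[0] has to be sunk across the load of *p; the walk
   above therefore checks, via the data-ref dependence machinery or the
   alias oracle, that *p cannot alias b[0] before allowing the
   transform.  */
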
/* Function vect_slp_analyze_instance_dependence.

   Examine all the data references in the SLP instance, and make sure there
   do not exist any data dependences between them.  Return true if so.  */

bool
vect_slp_analyze_instance_dependence (vec_info *vinfo, slp_instance instance)
{
  DUMP_VECT_SCOPE ("vect_slp_analyze_instance_dependence");

  /* The stores of this instance are at the root of the SLP tree.  */
  slp_tree store = SLP_INSTANCE_TREE (instance);
  if (! STMT_VINFO_DATA_REF (SLP_TREE_SCALAR_STMTS (store)[0]))
    store = NULL;

  /* Verify we can sink stores to the vectorized stmt insert location.  */
  stmt_vec_info last_store_info = NULL;
  if (store)
    {
      if (! vect_slp_analyze_node_dependences (vinfo, store, vNULL, NULL))
        return false;

      /* Mark stores in this instance and remember the last one.  */
      last_store_info = vect_find_last_scalar_stmt_in_slp (store);
      for (unsigned k = 0; k < SLP_TREE_SCALAR_STMTS (store).length (); ++k)
        gimple_set_visited (SLP_TREE_SCALAR_STMTS (store)[k]->stmt, true);
    }

  bool res = true;

  /* Verify we can sink loads to the vectorized stmt insert location,
     special-casing stores of this instance.  */
  slp_tree load;
  unsigned int i;
  FOR_EACH_VEC_ELT (SLP_INSTANCE_LOADS (instance), i, load)
    if (! vect_slp_analyze_node_dependences (vinfo, load,
                                             store
                                             ? SLP_TREE_SCALAR_STMTS (store)
                                             : vNULL, last_store_info))
      {
        res = false;
        break;
      }

  /* Unset the visited flag.  */
  if (store)
    for (unsigned k = 0; k < SLP_TREE_SCALAR_STMTS (store).length (); ++k)
      gimple_set_visited (SLP_TREE_SCALAR_STMTS (store)[k]->stmt, false);

  return res;
}

/* Record the base alignment guarantee given by DRB, which occurs
   in STMT_INFO.  */

static void
vect_record_base_alignment (vec_info *vinfo, stmt_vec_info stmt_info,
                            innermost_loop_behavior *drb)
{
  bool existed;
  innermost_loop_behavior *&entry
    = vinfo->base_alignments.get_or_insert (drb->base_address, &existed);
  if (!existed || entry->base_alignment < drb->base_alignment)
    {
      entry = drb;
      if (dump_enabled_p ())
        dump_printf_loc (MSG_NOTE, vect_location,
                         "recording new base alignment for %T\n"
                         "  alignment:    %d\n"
                         "  misalignment: %d\n"
                         "  based on:     %G",
                         drb->base_address,
                         drb->base_alignment,
                         drb->base_misalignment,
                         stmt_info->stmt);
    }
}

/* If the region we're going to vectorize is reached, all unconditional
   data references occur at least once.  We can therefore pool the base
   alignment guarantees from each unconditional reference.  Do this by
   going through all the data references in VINFO and checking whether
   the containing statement makes the reference unconditionally.  If so,
   record the alignment of the base address in VINFO so that it can be
   used for all other references with the same base.  */

void
vect_record_base_alignments (vec_info *vinfo)
{
  loop_vec_info loop_vinfo = dyn_cast <loop_vec_info> (vinfo);
  class loop *loop = loop_vinfo ? LOOP_VINFO_LOOP (loop_vinfo) : NULL;
  data_reference *dr;
  unsigned int i;
  FOR_EACH_VEC_ELT (vinfo->shared->datarefs, i, dr)
    {
      dr_vec_info *dr_info = vinfo->lookup_dr (dr);
      stmt_vec_info stmt_info = dr_info->stmt;
      if (!DR_IS_CONDITIONAL_IN_STMT (dr)
          && STMT_VINFO_VECTORIZABLE (stmt_info)
          && !STMT_VINFO_GATHER_SCATTER_P (stmt_info))
        {
          vect_record_base_alignment (vinfo, stmt_info, &DR_INNERMOST (dr));

          /* If DR is nested in the loop that is being vectorized, we can also
             record the alignment of the base wrt the outer loop.  */
          if (loop && nested_in_vect_loop_p (loop, stmt_info))
            vect_record_base_alignment
              (vinfo, stmt_info, &STMT_VINFO_DR_WRT_VEC_LOOP (stmt_info));
        }
    }
}

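/* For example (illustrative only): if one unconditional reference to base
   P can prove a 16-byte base alignment while another reference to the
   same base can only prove 8 bytes on its own, pooling the guarantees by
   base address lets the second reference use the stronger 16-byte figure,
   since both references see the same P in the vectorized region.  */
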
/* Return the target alignment for the vectorized form of DR_INFO.  */

static poly_uint64
vect_calculate_target_alignment (dr_vec_info *dr_info)
{
  tree vectype = STMT_VINFO_VECTYPE (dr_info->stmt);
  return targetm.vectorize.preferred_vector_alignment (vectype);
}

/* Function vect_compute_data_ref_alignment

   Compute the misalignment of the data reference DR_INFO.

   Output:
   1. DR_MISALIGNMENT (DR_INFO) is defined.

   FOR NOW: No analysis is actually performed.  Misalignment is calculated
   only for trivial cases.  TODO.  */

static void
vect_compute_data_ref_alignment (vec_info *vinfo, dr_vec_info *dr_info)
{
  stmt_vec_info stmt_info = dr_info->stmt;
  vec_base_alignments *base_alignments = &vinfo->base_alignments;
  loop_vec_info loop_vinfo = dyn_cast <loop_vec_info> (vinfo);
  class loop *loop = NULL;
  tree ref = DR_REF (dr_info->dr);
  tree vectype = STMT_VINFO_VECTYPE (stmt_info);

  if (dump_enabled_p ())
    dump_printf_loc (MSG_NOTE, vect_location,
                     "vect_compute_data_ref_alignment:\n");

  if (loop_vinfo)
    loop = LOOP_VINFO_LOOP (loop_vinfo);

  /* Initialize misalignment to unknown.  */
  SET_DR_MISALIGNMENT (dr_info, DR_MISALIGNMENT_UNKNOWN);

  if (STMT_VINFO_GATHER_SCATTER_P (stmt_info))
    return;

  innermost_loop_behavior *drb = vect_dr_behavior (vinfo, dr_info);
  bool step_preserves_misalignment_p;

  poly_uint64 vector_alignment
    = exact_div (vect_calculate_target_alignment (dr_info), BITS_PER_UNIT);
  DR_TARGET_ALIGNMENT (dr_info) = vector_alignment;

  /* If the main loop has peeled for alignment we have no way of knowing
     whether the data accesses in the epilogues are aligned.  We can't at
     compile time answer the question whether we have entered the main loop or
     not.  Fixes PR 92351.  */
  if (loop_vinfo)
    {
      loop_vec_info orig_loop_vinfo = LOOP_VINFO_ORIG_LOOP_INFO (loop_vinfo);
      if (orig_loop_vinfo
          && LOOP_VINFO_PEELING_FOR_ALIGNMENT (orig_loop_vinfo) != 0)
        return;
    }

  unsigned HOST_WIDE_INT vect_align_c;
  if (!vector_alignment.is_constant (&vect_align_c))
    return;

  /* No step for BB vectorization.  */
  if (!loop)
    {
      gcc_assert (integer_zerop (drb->step));
      step_preserves_misalignment_p = true;
    }

  /* In case the dataref is in an inner-loop of the loop that is being
     vectorized (LOOP), we use the base and misalignment information
     relative to the outer-loop (LOOP).  This is ok only if the misalignment
     stays the same throughout the execution of the inner-loop, which is why
     we have to check that the stride of the dataref in the inner-loop evenly
     divides by the vector alignment.  */
  else if (nested_in_vect_loop_p (loop, stmt_info))
    {
      step_preserves_misalignment_p
        = (DR_STEP_ALIGNMENT (dr_info->dr) % vect_align_c) == 0;

      if (dump_enabled_p ())
        {
          if (step_preserves_misalignment_p)
            dump_printf_loc (MSG_NOTE, vect_location,
                             "inner step divides the vector alignment.\n");
          else
            dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
                             "inner step doesn't divide the vector"
                             " alignment.\n");
        }
    }

  /* Similarly we can only use base and misalignment information relative to
     an innermost loop if the misalignment stays the same throughout the
     execution of the loop.  As above, this is the case if the stride of
     the dataref evenly divides by the alignment.  */
  else
    {
      poly_uint64 vf = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
      step_preserves_misalignment_p
        = multiple_p (DR_STEP_ALIGNMENT (dr_info->dr) * vf, vect_align_c);

      if (!step_preserves_misalignment_p && dump_enabled_p ())
        dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
                         "step doesn't divide the vector alignment.\n");
    }

  unsigned int base_alignment = drb->base_alignment;
  unsigned int base_misalignment = drb->base_misalignment;

  /* Calculate the maximum of the pooled base address alignment and the
     alignment that we can compute for DR itself.  */
  innermost_loop_behavior **entry = base_alignments->get (drb->base_address);
  if (entry && base_alignment < (*entry)->base_alignment)
    {
      base_alignment = (*entry)->base_alignment;
      base_misalignment = (*entry)->base_misalignment;
    }

  if (drb->offset_alignment < vect_align_c
      || !step_preserves_misalignment_p
      /* We need to know whether the step wrt the vectorized loop is
         negative when computing the starting misalignment below.  */
      || TREE_CODE (drb->step) != INTEGER_CST)
    {
      if (dump_enabled_p ())
        dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
                         "Unknown alignment for access: %T\n", ref);
      return;
    }

  if (base_alignment < vect_align_c)
    {
      unsigned int max_alignment;
      tree base = get_base_for_alignment (drb->base_address, &max_alignment);
      if (max_alignment < vect_align_c
          || !vect_can_force_dr_alignment_p (base,
                                             vect_align_c * BITS_PER_UNIT))
        {
          if (dump_enabled_p ())
            dump_printf_loc (MSG_NOTE, vect_location,
                             "can't force alignment of ref: %T\n", ref);
          return;
        }

      /* Force the alignment of the decl.
         NOTE: This is the only change to the code we make during
         the analysis phase, before deciding to vectorize the loop.  */
      if (dump_enabled_p ())
        dump_printf_loc (MSG_NOTE, vect_location,
                         "force alignment of %T\n", ref);

      dr_info->base_decl = base;
      dr_info->base_misaligned = true;
      base_misalignment = 0;
    }
  poly_int64 misalignment
    = base_misalignment + wi::to_poly_offset (drb->init).force_shwi ();

  /* If this is a backward running DR then first access in the larger
     vectype actually is N-1 elements before the address in the DR.
     Adjust misalign accordingly.  */
  if (tree_int_cst_sgn (drb->step) < 0)
    /* PLUS because STEP is negative.  */
    misalignment += ((TYPE_VECTOR_SUBPARTS (vectype) - 1)
                     * TREE_INT_CST_LOW (drb->step));

  unsigned int const_misalignment;
  if (!known_misalignment (misalignment, vect_align_c, &const_misalignment))
    {
      if (dump_enabled_p ())
        dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
                         "Non-constant misalignment for access: %T\n", ref);
      return;
    }

  SET_DR_MISALIGNMENT (dr_info, const_misalignment);

  if (dump_enabled_p ())
    dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
                     "misalign = %d bytes of ref %T\n",
                     DR_MISALIGNMENT (dr_info), ref);

  return;
}

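/* Worked example (illustrative only): with a 16-byte target vector
   alignment, a base that is 16-byte aligned with base_misalignment 0,
   and a DR_INIT of 20 bytes, the computed misalignment is
   (0 + 20) mod 16 = 4 bytes.  If the base could only be proven 8-byte
   aligned, the function above either forces the alignment of the base
   decl (when that is possible) or records the misalignment as
   unknown.  */
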
/* Function vect_update_misalignment_for_peel.
   Sets DR_INFO's misalignment
   - to 0 if it has the same alignment as DR_PEEL_INFO,
   - to the misalignment computed using NPEEL if DR_INFO's misalignment is
     known,
   - to -1 (unknown) otherwise.

   DR_INFO - the data reference whose misalignment is to be adjusted.
   DR_PEEL_INFO - the data reference whose misalignment is being made
                  zero in the vector loop by the peel.
   NPEEL - the number of iterations in the peel loop if the misalignment
           of DR_PEEL_INFO is known at compile time.  */

static void
vect_update_misalignment_for_peel (dr_vec_info *dr_info,
                                   dr_vec_info *dr_peel_info, int npeel)
{
  unsigned int i;
  vec<dr_p> same_aligned_drs;
  struct data_reference *current_dr;
  stmt_vec_info peel_stmt_info = dr_peel_info->stmt;

  /* It can be assumed that if dr_info has the same alignment as dr_peel,
     it is aligned in the vector loop.  */
  same_aligned_drs = STMT_VINFO_SAME_ALIGN_REFS (peel_stmt_info);
  FOR_EACH_VEC_ELT (same_aligned_drs, i, current_dr)
    {
      if (current_dr != dr_info->dr)
        continue;
      gcc_assert (!known_alignment_for_access_p (dr_info)
                  || !known_alignment_for_access_p (dr_peel_info)
                  || (DR_MISALIGNMENT (dr_info)
                      == DR_MISALIGNMENT (dr_peel_info)));
      SET_DR_MISALIGNMENT (dr_info, 0);
      return;
    }

  unsigned HOST_WIDE_INT alignment;
  if (DR_TARGET_ALIGNMENT (dr_info).is_constant (&alignment)
      && known_alignment_for_access_p (dr_info)
      && known_alignment_for_access_p (dr_peel_info))
    {
      int misal = DR_MISALIGNMENT (dr_info);
      misal += npeel * TREE_INT_CST_LOW (DR_STEP (dr_info->dr));
      misal &= alignment - 1;
      SET_DR_MISALIGNMENT (dr_info, misal);
      return;
    }

  if (dump_enabled_p ())
    dump_printf_loc (MSG_NOTE, vect_location, "Setting misalignment "
                     "to unknown (-1).\n");
  SET_DR_MISALIGNMENT (dr_info, DR_MISALIGNMENT_UNKNOWN);
}

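/* Worked example (illustrative only): assume a 16-byte target alignment,
   a DR_STEP of 4 bytes and a current misalignment of 8 bytes.  Peeling
   NPEEL = 3 iterations advances the access by 3 * 4 = 12 bytes, so the
   new misalignment is (8 + 12) & (16 - 1) = 4 bytes, which is what the
   update above computes when the alignments are known.  */
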
/* Return true if alignment is relevant for DR_INFO.  */

static bool
vect_relevant_for_alignment_p (dr_vec_info *dr_info)
{
  stmt_vec_info stmt_info = dr_info->stmt;

  if (!STMT_VINFO_RELEVANT_P (stmt_info))
    return false;

  /* For interleaving, only the alignment of the first access matters.  */
  if (STMT_VINFO_GROUPED_ACCESS (stmt_info)
      && DR_GROUP_FIRST_ELEMENT (stmt_info) != stmt_info)
    return false;

  /* Scatter-gather and invariant accesses continue to address individual
     scalars, so vector-level alignment is irrelevant.  */
  if (STMT_VINFO_GATHER_SCATTER_P (stmt_info)
      || integer_zerop (DR_STEP (dr_info->dr)))
    return false;

  /* Strided accesses perform only component accesses, alignment is
     irrelevant for them.  */
  if (STMT_VINFO_STRIDED_P (stmt_info)
      && !STMT_VINFO_GROUPED_ACCESS (stmt_info))
    return false;

  return true;
}

/* Function verify_data_ref_alignment

   Return TRUE if DR_INFO can be handled with respect to alignment.  */

static opt_result
verify_data_ref_alignment (vec_info *vinfo, dr_vec_info *dr_info)
{
  enum dr_alignment_support supportable_dr_alignment
    = vect_supportable_dr_alignment (vinfo, dr_info, false);
  if (!supportable_dr_alignment)
    return opt_result::failure_at
      (dr_info->stmt->stmt,
       DR_IS_READ (dr_info->dr)
       ? "not vectorized: unsupported unaligned load: %T\n"
       : "not vectorized: unsupported unaligned store: %T\n",
       DR_REF (dr_info->dr));

  if (supportable_dr_alignment != dr_aligned && dump_enabled_p ())
    dump_printf_loc (MSG_NOTE, vect_location,
                     "Vectorizing an unaligned access.\n");

  return opt_result::success ();
}

/* Function vect_verify_datarefs_alignment

   Return TRUE if all data references in the loop can be
   handled with respect to alignment.  */

opt_result
vect_verify_datarefs_alignment (loop_vec_info loop_vinfo)
{
  vec<data_reference_p> datarefs = LOOP_VINFO_DATAREFS (loop_vinfo);
  struct data_reference *dr;
  unsigned int i;

  FOR_EACH_VEC_ELT (datarefs, i, dr)
    {
      dr_vec_info *dr_info = loop_vinfo->lookup_dr (dr);
      if (!vect_relevant_for_alignment_p (dr_info))
        continue;

      opt_result res = verify_data_ref_alignment (loop_vinfo, dr_info);
      if (!res)
        return res;
    }

  return opt_result::success ();
}

/* Given a memory reference EXP return whether its alignment is less
   than its size.  */

static bool
not_size_aligned (tree exp)
{
  if (!tree_fits_uhwi_p (TYPE_SIZE (TREE_TYPE (exp))))
    return true;

  return (tree_to_uhwi (TYPE_SIZE (TREE_TYPE (exp)))
          > get_object_alignment (exp));
}

/* Function vector_alignment_reachable_p

   Return true if vector alignment for DR_INFO is reachable by peeling
   a few loop iterations.  Return false otherwise.  */

static bool
vector_alignment_reachable_p (dr_vec_info *dr_info)
{
  stmt_vec_info stmt_info = dr_info->stmt;
  tree vectype = STMT_VINFO_VECTYPE (stmt_info);

  if (STMT_VINFO_GROUPED_ACCESS (stmt_info))
    {
      /* For interleaved access we peel only if number of iterations in
         the prolog loop ({VF - misalignment}), is a multiple of the
         number of the interleaved accesses.  */
      int elem_size, mis_in_elements;

      /* FORNOW: handle only known alignment.  */
      if (!known_alignment_for_access_p (dr_info))
        return false;

      poly_uint64 nelements = TYPE_VECTOR_SUBPARTS (vectype);
      poly_uint64 vector_size = GET_MODE_SIZE (TYPE_MODE (vectype));
      elem_size = vector_element_size (vector_size, nelements);
      mis_in_elements = DR_MISALIGNMENT (dr_info) / elem_size;

      if (!multiple_p (nelements - mis_in_elements, DR_GROUP_SIZE (stmt_info)))
        return false;
    }

  /* If misalignment is known at the compile time then allow peeling
     only if natural alignment is reachable through peeling.  */
  if (known_alignment_for_access_p (dr_info) && !aligned_access_p (dr_info))
    {
      HOST_WIDE_INT elmsize =
        int_cst_value (TYPE_SIZE_UNIT (TREE_TYPE (vectype)));
      if (dump_enabled_p ())
        {
          dump_printf_loc (MSG_NOTE, vect_location,
                           "data size = %wd. misalignment = %d.\n", elmsize,
                           DR_MISALIGNMENT (dr_info));
        }
      if (DR_MISALIGNMENT (dr_info) % elmsize)
        {
          if (dump_enabled_p ())
            dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
                             "data size does not divide the misalignment.\n");
          return false;
        }
    }

  if (!known_alignment_for_access_p (dr_info))
    {
      tree type = TREE_TYPE (DR_REF (dr_info->dr));
      bool is_packed = not_size_aligned (DR_REF (dr_info->dr));
      if (dump_enabled_p ())
        dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
                         "Unknown misalignment, %snaturally aligned\n",
                         is_packed ? "not " : "");
      return targetm.vectorize.vector_alignment_reachable (type, is_packed);
    }

  return true;
}

/* Calculate the cost of the memory access represented by DR_INFO.  */

static void
vect_get_data_access_cost (vec_info *vinfo, dr_vec_info *dr_info,
                           unsigned int *inside_cost,
                           unsigned int *outside_cost,
                           stmt_vector_for_cost *body_cost_vec,
                           stmt_vector_for_cost *prologue_cost_vec)
{
  stmt_vec_info stmt_info = dr_info->stmt;
  loop_vec_info loop_vinfo = dyn_cast <loop_vec_info> (vinfo);
  int ncopies;

  if (PURE_SLP_STMT (stmt_info))
    ncopies = 1;
  else
    ncopies = vect_get_num_copies (loop_vinfo, STMT_VINFO_VECTYPE (stmt_info));

  if (DR_IS_READ (dr_info->dr))
    vect_get_load_cost (vinfo, stmt_info, ncopies, true, inside_cost,
                        outside_cost, prologue_cost_vec, body_cost_vec, false);
  else
    vect_get_store_cost (vinfo, stmt_info, ncopies, inside_cost,
                         body_cost_vec);

  if (dump_enabled_p ())
    dump_printf_loc (MSG_NOTE, vect_location,
                     "vect_get_data_access_cost: inside_cost = %d, "
                     "outside_cost = %d.\n", *inside_cost, *outside_cost);
}

typedef struct _vect_peel_info
{
  dr_vec_info *dr_info;
  int npeel;
  unsigned int count;
} *vect_peel_info;

typedef struct _vect_peel_extended_info
{
  vec_info *vinfo;
  struct _vect_peel_info peel_info;
  unsigned int inside_cost;
  unsigned int outside_cost;
} *vect_peel_extended_info;


/* Peeling hashtable helpers.  */

struct peel_info_hasher : free_ptr_hash <_vect_peel_info>
{
  static inline hashval_t hash (const _vect_peel_info *);
  static inline bool equal (const _vect_peel_info *, const _vect_peel_info *);
};

inline hashval_t
peel_info_hasher::hash (const _vect_peel_info *peel_info)
{
  return (hashval_t) peel_info->npeel;
}

inline bool
peel_info_hasher::equal (const _vect_peel_info *a, const _vect_peel_info *b)
{
  return (a->npeel == b->npeel);
}


/* Insert DR_INFO into peeling hash table with NPEEL as key.  */

static void
vect_peeling_hash_insert (hash_table<peel_info_hasher> *peeling_htab,
                          loop_vec_info loop_vinfo, dr_vec_info *dr_info,
                          int npeel)
{
  struct _vect_peel_info elem, *slot;
  _vect_peel_info **new_slot;
  bool supportable_dr_alignment
    = vect_supportable_dr_alignment (loop_vinfo, dr_info, true);

  elem.npeel = npeel;
  slot = peeling_htab->find (&elem);
  if (slot)
    slot->count++;
  else
    {
      slot = XNEW (struct _vect_peel_info);
      slot->npeel = npeel;
      slot->dr_info = dr_info;
      slot->count = 1;
      new_slot = peeling_htab->find_slot (slot, INSERT);
      *new_slot = slot;
    }

  if (!supportable_dr_alignment
      && unlimited_cost_model (LOOP_VINFO_LOOP (loop_vinfo)))
    slot->count += VECT_MAX_COST;
}


/* Traverse peeling hash table to find peeling option that aligns maximum
   number of data accesses.  */

int
vect_peeling_hash_get_most_frequent (_vect_peel_info **slot,
                                     _vect_peel_extended_info *max)
{
  vect_peel_info elem = *slot;

  if (elem->count > max->peel_info.count
      || (elem->count == max->peel_info.count
          && max->peel_info.npeel > elem->npeel))
    {
      max->peel_info.npeel = elem->npeel;
      max->peel_info.count = elem->count;
      max->peel_info.dr_info = elem->dr_info;
    }

  return 1;
}

/* Get the costs of peeling NPEEL iterations for LOOP_VINFO, checking
   data access costs for all data refs.  If UNKNOWN_MISALIGNMENT is true,
   we assume DR0_INFO's misalignment will be zero after peeling.  */

static void
vect_get_peeling_costs_all_drs (loop_vec_info loop_vinfo,
                                dr_vec_info *dr0_info,
                                unsigned int *inside_cost,
                                unsigned int *outside_cost,
                                stmt_vector_for_cost *body_cost_vec,
                                stmt_vector_for_cost *prologue_cost_vec,
                                unsigned int npeel,
                                bool unknown_misalignment)
{
  vec<data_reference_p> datarefs = LOOP_VINFO_DATAREFS (loop_vinfo);
  unsigned i;
  data_reference *dr;

  FOR_EACH_VEC_ELT (datarefs, i, dr)
    {
      dr_vec_info *dr_info = loop_vinfo->lookup_dr (dr);
      if (!vect_relevant_for_alignment_p (dr_info))
        continue;

      int save_misalignment;
      save_misalignment = DR_MISALIGNMENT (dr_info);
      if (npeel == 0)
        ;
      else if (unknown_misalignment && dr_info == dr0_info)
        SET_DR_MISALIGNMENT (dr_info, 0);
      else
        vect_update_misalignment_for_peel (dr_info, dr0_info, npeel);
      vect_get_data_access_cost (loop_vinfo, dr_info, inside_cost,
                                 outside_cost, body_cost_vec,
                                 prologue_cost_vec);
      SET_DR_MISALIGNMENT (dr_info, save_misalignment);
    }
}

/* Traverse peeling hash table and calculate cost for each peeling option.
   Find the one with the lowest cost.  */

int
vect_peeling_hash_get_lowest_cost (_vect_peel_info **slot,
                                   _vect_peel_extended_info *min)
{
  vect_peel_info elem = *slot;
  int dummy;
  unsigned int inside_cost = 0, outside_cost = 0;
  loop_vec_info loop_vinfo = dyn_cast <loop_vec_info> (min->vinfo);
  stmt_vector_for_cost prologue_cost_vec, body_cost_vec,
                       epilogue_cost_vec;

  prologue_cost_vec.create (2);
  body_cost_vec.create (2);
  epilogue_cost_vec.create (2);

  vect_get_peeling_costs_all_drs (loop_vinfo, elem->dr_info, &inside_cost,
                                  &outside_cost, &body_cost_vec,
                                  &prologue_cost_vec, elem->npeel, false);

  body_cost_vec.release ();

  outside_cost += vect_get_known_peeling_cost
    (loop_vinfo, elem->npeel, &dummy,
     &LOOP_VINFO_SCALAR_ITERATION_COST (loop_vinfo),
     &prologue_cost_vec, &epilogue_cost_vec);

  /* Prologue and epilogue costs are added to the target model later.
     These costs depend only on the scalar iteration cost, the
     number of peeling iterations finally chosen, and the number of
     misaligned statements.  So discard the information found here.  */
  prologue_cost_vec.release ();
  epilogue_cost_vec.release ();

  if (inside_cost < min->inside_cost
      || (inside_cost == min->inside_cost
          && outside_cost < min->outside_cost))
    {
      min->inside_cost = inside_cost;
      min->outside_cost = outside_cost;
      min->peel_info.dr_info = elem->dr_info;
      min->peel_info.npeel = elem->npeel;
      min->peel_info.count = elem->count;
    }

  return 1;
}

1556
1557/* Choose best peeling option by traversing peeling hash table and either
1558 choosing an option with the lowest cost (if cost model is enabled) or the
1559 option that aligns as many accesses as possible. */
1560
1e69cc8f 1561static struct _vect_peel_extended_info
b939ea86 1562vect_peeling_hash_choose_best_peeling (hash_table<peel_info_hasher> *peeling_htab,
ec15a152 1563 loop_vec_info loop_vinfo)
720f5239
IR
1564{
1565 struct _vect_peel_extended_info res;
1566
89fa689a 1567 res.peel_info.dr_info = NULL;
308bc496 1568 res.vinfo = loop_vinfo;
720f5239 1569
8b5e1202 1570 if (!unlimited_cost_model (LOOP_VINFO_LOOP (loop_vinfo)))
720f5239
IR
1571 {
1572 res.inside_cost = INT_MAX;
1573 res.outside_cost = INT_MAX;
b939ea86
RB
1574 peeling_htab->traverse <_vect_peel_extended_info *,
1575 vect_peeling_hash_get_lowest_cost> (&res);
720f5239
IR
1576 }
1577 else
1578 {
1579 res.peel_info.count = 0;
b939ea86
RB
1580 peeling_htab->traverse <_vect_peel_extended_info *,
1581 vect_peeling_hash_get_most_frequent> (&res);
1e69cc8f
RD
1582 res.inside_cost = 0;
1583 res.outside_cost = 0;
720f5239
IR
1584 }
1585
1e69cc8f 1586 return res;
720f5239
IR
1587}
1588
71595748
RD
1589/* Return true if peeling by NPEEL iterations keeps all data refs supportable. */
1590
1591static bool
89fa689a 1592vect_peeling_supportable (loop_vec_info loop_vinfo, dr_vec_info *dr0_info,
71595748
RD
1593 unsigned npeel)
1594{
1595 unsigned i;
1596 struct data_reference *dr = NULL;
1597 vec<data_reference_p> datarefs = LOOP_VINFO_DATAREFS (loop_vinfo);
71595748
RD
1598 enum dr_alignment_support supportable_dr_alignment;
1599
1600 /* Ensure that all data refs can be vectorized after the peel. */
1601 FOR_EACH_VEC_ELT (datarefs, i, dr)
1602 {
1603 int save_misalignment;
1604
89fa689a 1605 if (dr == dr0_info->dr)
71595748
RD
1606 continue;
1607
f5ae2856 1608 dr_vec_info *dr_info = loop_vinfo->lookup_dr (dr);
d30846a0 1609 if (!vect_relevant_for_alignment_p (dr_info))
71595748
RD
1610 continue;
1611
89fa689a
RS
1612 save_misalignment = DR_MISALIGNMENT (dr_info);
1613 vect_update_misalignment_for_peel (dr_info, dr0_info, npeel);
1614 supportable_dr_alignment
308bc496 1615 = vect_supportable_dr_alignment (loop_vinfo, dr_info, false);
89fa689a 1616 SET_DR_MISALIGNMENT (dr_info, save_misalignment);
71595748
RD
1617
1618 if (!supportable_dr_alignment)
1619 return false;
1620 }
1621
1622 return true;
1623}
720f5239 1624
ebfd146a
IR
1625/* Function vect_enhance_data_refs_alignment
1626
1627 This pass will use loop versioning and loop peeling in order to enhance
1628 the alignment of data references in the loop.
1629
1630 FOR NOW: we assume that whatever versioning/peeling takes place, only the
ff802fa1 1631 original loop is to be vectorized. Any other loops that are created by
ebfd146a 1632 the transformations performed in this pass are not supposed to be
ff802fa1 1633 vectorized. This restriction will be relaxed.
ebfd146a
IR
1634
1635 This pass will require a cost model to guide it whether to apply peeling
ff802fa1 1636 or versioning or a combination of the two. For example, the scheme that
ebfd146a
IR
1637 Intel uses when given a loop with several memory accesses is as follows:
1638 choose one memory access ('p') whose alignment you want to force by doing
ff802fa1 1639 peeling. Then, either (1) generate a loop in which 'p' is aligned and all
ebfd146a
IR
1640 other accesses are not necessarily aligned, or (2) use loop versioning to
1641 generate one loop in which all accesses are aligned, and another loop in
1642 which only 'p' is necessarily aligned.
1643
1644 ("Automatic Intra-Register Vectorization for the Intel Architecture",
1645 Aart J.C. Bik, Milind Girkar, Paul M. Grey and Xinmin Tian, International
1646 Journal of Parallel Programming, Vol. 30, No. 2, April 2002.)
1647
ff802fa1 1648 Devising a cost model is the most critical aspect of this work. It will
ebfd146a 1649 guide us on which access to peel for, whether to use loop versioning, how
ff802fa1 1650 many versions to create, etc. The cost model will probably consist of
ebfd146a
IR
1651 generic considerations as well as target specific considerations (on
1652 powerpc for example, misaligned stores are more painful than misaligned
1653 loads).
1654
1655 Here are the general steps involved in alignment enhancements:
1656
1657 -- original loop, before alignment analysis:
1658 for (i=0; i<N; i++){
1659 x = q[i]; # DR_MISALIGNMENT(q) = unknown
1660 p[i] = y; # DR_MISALIGNMENT(p) = unknown
1661 }
1662
1663 -- After vect_compute_data_refs_alignment:
1664 for (i=0; i<N; i++){
1665 x = q[i]; # DR_MISALIGNMENT(q) = 3
1666 p[i] = y; # DR_MISALIGNMENT(p) = unknown
1667 }
1668
1669 -- Possibility 1: we do loop versioning:
1670 if (p is aligned) {
1671 for (i=0; i<N; i++){ # loop 1A
1672 x = q[i]; # DR_MISALIGNMENT(q) = 3
1673 p[i] = y; # DR_MISALIGNMENT(p) = 0
1674 }
1675 }
1676 else {
1677 for (i=0; i<N; i++){ # loop 1B
1678 x = q[i]; # DR_MISALIGNMENT(q) = 3
1679 p[i] = y; # DR_MISALIGNMENT(p) = unaligned
1680 }
1681 }
1682
1683 -- Possibility 2: we do loop peeling:
1684 for (i = 0; i < 3; i++){ # (scalar loop, not to be vectorized).
1685 x = q[i];
1686 p[i] = y;
1687 }
1688 for (i = 3; i < N; i++){ # loop 2A
1689 x = q[i]; # DR_MISALIGNMENT(q) = 0
1690 p[i] = y; # DR_MISALIGNMENT(p) = unknown
1691 }
1692
1693 -- Possibility 3: combination of loop peeling and versioning:
1694 for (i = 0; i < 3; i++){ # (scalar loop, not to be vectorized).
1695 x = q[i];
1696 p[i] = y;
1697 }
1698 if (p is aligned) {
1699 for (i = 3; i<N; i++){ # loop 3A
1700 x = q[i]; # DR_MISALIGNMENT(q) = 0
1701 p[i] = y; # DR_MISALIGNMENT(p) = 0
1702 }
1703 }
1704 else {
1705 for (i = 3; i<N; i++){ # loop 3B
1706 x = q[i]; # DR_MISALIGNMENT(q) = 0
1707 p[i] = y; # DR_MISALIGNMENT(p) = unaligned
1708 }
1709 }
1710
ff802fa1 1711 These loops are later passed to loop_transform to be vectorized. The
ebfd146a
IR
1712 vectorizer will use the alignment information to guide the transformation
1713 (whether to generate regular loads/stores, or with special handling for
1714 misalignment). */
1715
f4ebbd24 1716opt_result
ebfd146a
IR
1717vect_enhance_data_refs_alignment (loop_vec_info loop_vinfo)
1718{
9771b263 1719 vec<data_reference_p> datarefs = LOOP_VINFO_DATAREFS (loop_vinfo);
99b1c316 1720 class loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
4d3d23fb 1721 enum dr_alignment_support supportable_dr_alignment;
89fa689a
RS
1722 dr_vec_info *first_store = NULL;
1723 dr_vec_info *dr0_info = NULL;
ebfd146a 1724 struct data_reference *dr;
720f5239 1725 unsigned int i, j;
ebfd146a
IR
1726 bool do_peeling = false;
1727 bool do_versioning = false;
720f5239 1728 unsigned int npeel = 0;
1e69cc8f
RD
1729 bool one_misalignment_known = false;
1730 bool one_misalignment_unknown = false;
4d3d23fb 1731 bool one_dr_unsupportable = false;
89fa689a 1732 dr_vec_info *unsupportable_dr_info = NULL;
d9f21f6a 1733 poly_uint64 vf = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
720f5239
IR
1734 unsigned possible_npeel_number = 1;
1735 tree vectype;
d9f21f6a 1736 unsigned int mis, same_align_drs_max = 0;
b939ea86 1737 hash_table<peel_info_hasher> peeling_htab (1);
ebfd146a 1738
adac3a68 1739 DUMP_VECT_SCOPE ("vect_enhance_data_refs_alignment");
ebfd146a 1740
ddf56386
RB
1741 /* Reset data so we can safely be called multiple times. */
1742 LOOP_VINFO_MAY_MISALIGN_STMTS (loop_vinfo).truncate (0);
1743 LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo) = 0;
1744
ebfd146a
IR
1745 /* While cost model enhancements are expected in the future, the high level
1746 view of the code at this time is as follows:
1747
673beced
RE
1748 A) If there is a misaligned access then see if peeling to align
1749 this access can make all data references satisfy
8f439681
RE
1750 vect_supportable_dr_alignment. If so, update data structures
1751 as needed and return true.
ebfd146a
IR
1752
1753 B) If peeling wasn't possible and there is a data reference with an
1754 unknown misalignment that does not satisfy vect_supportable_dr_alignment
1755 then see if loop versioning checks can be used to make all data
1756 references satisfy vect_supportable_dr_alignment. If so, update
1757 data structures as needed and return true.
1758
1759 C) If neither peeling nor versioning were successful then return false if
1760 any data reference does not satisfy vect_supportable_dr_alignment.
1761
1762 D) Return true (all data references satisfy vect_supportable_dr_alignment).
1763
1764 Note, Possibility 3 above (which is peeling and versioning together) is not
1765 being done at this time. */
1766
1767 /* (1) Peeling to force alignment. */
1768
1769 /* (1.1) Decide whether to perform peeling, and how many iterations to peel:
1770 Considerations:
1771 + How many accesses will become aligned due to the peeling
1772 - How many accesses will become unaligned due to the peeling,
1773 and the cost of misaligned accesses.
b8698a0f 1774 - The cost of peeling (the extra runtime checks, the increase
720f5239 1775 in code size). */
ebfd146a 1776
9771b263 1777 FOR_EACH_VEC_ELT (datarefs, i, dr)
ebfd146a 1778 {
f5ae2856 1779 dr_vec_info *dr_info = loop_vinfo->lookup_dr (dr);
e54353a7 1780 if (!vect_relevant_for_alignment_p (dr_info))
319e6439
RG
1781 continue;
1782
e54353a7 1783 stmt_vec_info stmt_info = dr_info->stmt;
308bc496
RB
1784 supportable_dr_alignment
1785 = vect_supportable_dr_alignment (loop_vinfo, dr_info, true);
89fa689a 1786 do_peeling = vector_alignment_reachable_p (dr_info);
720f5239 1787 if (do_peeling)
ebfd146a 1788 {
89fa689a 1789 if (known_alignment_for_access_p (dr_info))
720f5239 1790 {
f702e7d4 1791 unsigned int npeel_tmp = 0;
d8ba5b19
RG
1792 bool negative = tree_int_cst_compare (DR_STEP (dr),
1793 size_zero_node) < 0;
720f5239 1794
f702e7d4 1795 vectype = STMT_VINFO_VECTYPE (stmt_info);
ca31798e
AV
1796 /* If known_alignment_for_access_p then we have set
1797 DR_MISALIGNMENT, which is only done if we know it at compile
1798 time, so it is safe to assume the target alignment is constant.
1799 */
1800 unsigned int target_align =
1801 DR_TARGET_ALIGNMENT (dr_info).to_constant ();
89fa689a
RS
1802 unsigned int dr_size = vect_get_scalar_dr_size (dr_info);
1803 mis = (negative
1804 ? DR_MISALIGNMENT (dr_info)
1805 : -DR_MISALIGNMENT (dr_info));
1806 if (DR_MISALIGNMENT (dr_info) != 0)
f702e7d4 1807 npeel_tmp = (mis & (target_align - 1)) / dr_size;
720f5239
IR
1808
1809 /* For multiple types, it is possible that the bigger type access
ff802fa1 1810 will have more than one peeling option. E.g., a loop with two
720f5239 1811 types: one of size (vector size / 4), and the other one of
ff802fa1 1812 size (vector size / 8). The vectorization factor will be 8. If both
8d21ff9f 1813 accesses are misaligned by 3, the first one needs one scalar
ff802fa1 1814 iteration to be aligned, and the second one needs 5. But the
6af801f5 1815 first one will also be aligned by peeling 5 scalar
720f5239
IR
1816 iterations, and in that case both accesses will be aligned.
1817 Hence, except for the immediate peeling amount, we also want
1818 to try to add full vector size, while we don't exceed
1819 vectorization factor.
8d21ff9f
RD
1820 We do this automatically for cost model, since we calculate
1821 cost for every peeling option. */
8b5e1202 1822 if (unlimited_cost_model (LOOP_VINFO_LOOP (loop_vinfo)))
91ff1504 1823 {
d9f21f6a 1824 poly_uint64 nscalars = (STMT_SLP_TYPE (stmt_info)
2c53b149 1825 ? vf * DR_GROUP_SIZE (stmt_info) : vf);
d9f21f6a
RS
1826 possible_npeel_number
1827 = vect_get_num_vectors (nscalars, vectype);
720f5239 1828
4d3d23fb
RD
1829 /* NPEEL_TMP is 0 when there is no misalignment, but also
1830 allow peeling NELEMENTS. */
89fa689a 1831 if (DR_MISALIGNMENT (dr_info) == 0)
8d21ff9f
RD
1832 possible_npeel_number++;
1833 }
720f5239 1834
8d21ff9f
RD
1835 /* Save info about DR in the hash table. Also include peeling
1836 amounts according to the explanation above. */
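/* For example (illustrative numbers): with a 16-byte target alignment,
   4-byte elements and a known misalignment of 12 bytes, NPEEL_TMP
   starts at 1; when several peeling options are considered (unlimited
   cost model), the loop below also records 5, 9, ..., each step adding
   a full vector's worth of elements (16 / 4 = 4).  */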
720f5239
IR
1837 for (j = 0; j < possible_npeel_number; j++)
1838 {
b939ea86 1839 vect_peeling_hash_insert (&peeling_htab, loop_vinfo,
89fa689a 1840 dr_info, npeel_tmp);
f702e7d4 1841 npeel_tmp += target_align / dr_size;
720f5239
IR
1842 }
1843
1e69cc8f 1844 one_misalignment_known = true;
720f5239
IR
1845 }
1846 else
1847 {
4ba5ea11
RB
1848 /* If we don't know any misalignment values, we prefer
1849 peeling for the data-ref that has the maximum number of data-refs
720f5239
IR
1850 with the same alignment, unless the target prefers to align
1851 stores over loads. */
1e69cc8f
RD
1852 unsigned same_align_drs
1853 = STMT_VINFO_SAME_ALIGN_REFS (stmt_info).length ();
89fa689a 1854 if (!dr0_info
1e69cc8f
RD
1855 || same_align_drs_max < same_align_drs)
1856 {
1857 same_align_drs_max = same_align_drs;
89fa689a 1858 dr0_info = dr_info;
1e69cc8f
RD
1859 }
1860 /* For data-refs with the same number of related
1861 accesses prefer the one where the misalign
1862 computation will be invariant in the outermost loop. */
1863 else if (same_align_drs_max == same_align_drs)
1864 {
99b1c316 1865 class loop *ivloop0, *ivloop;
1e69cc8f 1866 ivloop0 = outermost_invariant_loop_for_expr
89fa689a 1867 (loop, DR_BASE_ADDRESS (dr0_info->dr));
1e69cc8f
RD
1868 ivloop = outermost_invariant_loop_for_expr
1869 (loop, DR_BASE_ADDRESS (dr));
1870 if ((ivloop && !ivloop0)
1871 || (ivloop && ivloop0
1872 && flow_loop_nested_p (ivloop, ivloop0)))
89fa689a 1873 dr0_info = dr_info;
1e69cc8f 1874 }
720f5239 1875
4d3d23fb
RD
1876 one_misalignment_unknown = true;
1877
1878 /* Check for data refs with unsupportable alignment that
1879 can be peeled. */
1880 if (!supportable_dr_alignment)
1881 {
1882 one_dr_unsupportable = true;
89fa689a 1883 unsupportable_dr_info = dr_info;
4d3d23fb
RD
1884 }
1885
1e69cc8f 1886 if (!first_store && DR_IS_WRITE (dr))
89fa689a 1887 first_store = dr_info;
720f5239
IR
1888 }
1889 }
1890 else
1891 {
89fa689a 1892 if (!aligned_access_p (dr_info))
720f5239 1893 {
73fbfcad 1894 if (dump_enabled_p ())
e645e942
TJ
1895 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1896 "vector alignment may not be reachable\n");
720f5239
IR
1897 break;
1898 }
1899 }
ebfd146a
IR
1900 }
1901
afb119be
RB
1902 /* Check if we can possibly peel the loop. */
1903 if (!vect_can_advance_ivs_p (loop_vinfo)
a6c51a12
YR
1904 || !slpeel_can_duplicate_loop_p (loop, single_exit (loop))
1905 || loop->inner)
ebfd146a
IR
1906 do_peeling = false;
1907
64812d33
RD
1908 struct _vect_peel_extended_info peel_for_known_alignment;
1909 struct _vect_peel_extended_info peel_for_unknown_alignment;
1910 struct _vect_peel_extended_info best_peel;
1911
1912 peel_for_unknown_alignment.inside_cost = INT_MAX;
1913 peel_for_unknown_alignment.outside_cost = INT_MAX;
1914 peel_for_unknown_alignment.peel_info.count = 0;
1e69cc8f 1915
b1aef01e 1916 if (do_peeling
64812d33 1917 && one_misalignment_unknown)
720f5239 1918 {
720f5239
IR
1919 /* Check if the target prefers stores over loads, i.e., if
1920 misaligned stores are more expensive than misaligned loads (taking
1921 drs with same alignment into account). */
64812d33
RD
1922 unsigned int load_inside_cost = 0;
1923 unsigned int load_outside_cost = 0;
1924 unsigned int store_inside_cost = 0;
1925 unsigned int store_outside_cost = 0;
d9f21f6a 1926 unsigned int estimated_npeels = vect_vf_for_cost (loop_vinfo) / 2;
64812d33
RD
1927
1928 stmt_vector_for_cost dummy;
1929 dummy.create (2);
f5ae2856 1930 vect_get_peeling_costs_all_drs (loop_vinfo, dr0_info,
64812d33
RD
1931 &load_inside_cost,
1932 &load_outside_cost,
c76d9edb 1933 &dummy, &dummy, estimated_npeels, true);
64812d33
RD
1934 dummy.release ();
1935
1936 if (first_store)
1937 {
1e69cc8f 1938 dummy.create (2);
f5ae2856 1939 vect_get_peeling_costs_all_drs (loop_vinfo, first_store,
1e69cc8f
RD
1940 &store_inside_cost,
1941 &store_outside_cost,
c76d9edb
RB
1942 &dummy, &dummy,
1943 estimated_npeels, true);
9771b263 1944 dummy.release ();
64812d33
RD
1945 }
1946 else
1947 {
1948 store_inside_cost = INT_MAX;
1949 store_outside_cost = INT_MAX;
1950 }
720f5239 1951
64812d33
RD
1952 if (load_inside_cost > store_inside_cost
1953 || (load_inside_cost == store_inside_cost
1954 && load_outside_cost > store_outside_cost))
1955 {
89fa689a 1956 dr0_info = first_store;
64812d33
RD
1957 peel_for_unknown_alignment.inside_cost = store_inside_cost;
1958 peel_for_unknown_alignment.outside_cost = store_outside_cost;
1959 }
1960 else
1961 {
1962 peel_for_unknown_alignment.inside_cost = load_inside_cost;
1963 peel_for_unknown_alignment.outside_cost = load_outside_cost;
1964 }
1e69cc8f 1965
64812d33
RD
1966 stmt_vector_for_cost prologue_cost_vec, epilogue_cost_vec;
1967 prologue_cost_vec.create (2);
1968 epilogue_cost_vec.create (2);
1e69cc8f 1969
64812d33
RD
1970 int dummy2;
1971 peel_for_unknown_alignment.outside_cost += vect_get_known_peeling_cost
d9f21f6a 1972 (loop_vinfo, estimated_npeels, &dummy2,
64812d33
RD
1973 &LOOP_VINFO_SCALAR_ITERATION_COST (loop_vinfo),
1974 &prologue_cost_vec, &epilogue_cost_vec);
1e69cc8f 1975
64812d33
RD
1976 prologue_cost_vec.release ();
1977 epilogue_cost_vec.release ();
720f5239 1978
64812d33 1979 peel_for_unknown_alignment.peel_info.count = 1
89fa689a 1980 + STMT_VINFO_SAME_ALIGN_REFS (dr0_info->stmt).length ();
720f5239
IR
1981 }
1982
64812d33 1983 peel_for_unknown_alignment.peel_info.npeel = 0;
89fa689a 1984 peel_for_unknown_alignment.peel_info.dr_info = dr0_info;
64812d33
RD
1985
1986 best_peel = peel_for_unknown_alignment;
1987
1e69cc8f
RD
1988 peel_for_known_alignment.inside_cost = INT_MAX;
1989 peel_for_known_alignment.outside_cost = INT_MAX;
1990 peel_for_known_alignment.peel_info.count = 0;
89fa689a 1991 peel_for_known_alignment.peel_info.dr_info = NULL;
1e69cc8f
RD
1992
1993 if (do_peeling && one_misalignment_known)
720f5239
IR
1994 {
1995 /* Peeling is possible, but there is no data access that is not supported
64812d33
RD
1996 unless aligned. So we try to choose the best possible peeling from
1997 the hash table. */
1e69cc8f 1998 peel_for_known_alignment = vect_peeling_hash_choose_best_peeling
ec15a152 1999 (&peeling_htab, loop_vinfo);
720f5239
IR
2000 }
2001
1e69cc8f 2002 /* Compare costs of peeling for known and unknown alignment. */
89fa689a 2003 if (peel_for_known_alignment.peel_info.dr_info != NULL
64812d33
RD
2004 && peel_for_unknown_alignment.inside_cost
2005 >= peel_for_known_alignment.inside_cost)
4d3d23fb
RD
2006 {
2007 best_peel = peel_for_known_alignment;
64812d33 2008
4d3d23fb
RD
2009 /* If the best peeling for known alignment has NPEEL == 0, perform no
2010 peeling at all except if there is an unsupportable dr that we can
2011 align. */
2012 if (best_peel.peel_info.npeel == 0 && !one_dr_unsupportable)
2013 do_peeling = false;
2014 }
64812d33 2015
4d3d23fb
RD
2016 /* If there is an unsupportable data ref, prefer this over all choices so far
2017 since we'd have to discard a chosen peeling except when it accidentally
2018 aligned the unsupportable data ref. */
2019 if (one_dr_unsupportable)
89fa689a 2020 dr0_info = unsupportable_dr_info;
4d3d23fb
RD
2021 else if (do_peeling)
2022 {
d629ab44 2023 /* Calculate the penalty for no peeling, i.e. leaving everything as-is.
ec15a152 2024 TODO: Use nopeel_outside_cost or get rid of it? */
4d3d23fb
RD
2025 unsigned nopeel_inside_cost = 0;
2026 unsigned nopeel_outside_cost = 0;
64812d33 2027
4d3d23fb
RD
2028 stmt_vector_for_cost dummy;
2029 dummy.create (2);
f5ae2856 2030 vect_get_peeling_costs_all_drs (loop_vinfo, NULL, &nopeel_inside_cost,
c76d9edb
RB
2031 &nopeel_outside_cost, &dummy, &dummy,
2032 0, false);
4d3d23fb 2033 dummy.release ();
64812d33 2034
4d3d23fb
RD
2035 /* Add epilogue costs. As we do not peel for alignment here, no prologue
2036 costs will be recorded. */
2037 stmt_vector_for_cost prologue_cost_vec, epilogue_cost_vec;
2038 prologue_cost_vec.create (2);
2039 epilogue_cost_vec.create (2);
64812d33 2040
4d3d23fb
RD
2041 int dummy2;
2042 nopeel_outside_cost += vect_get_known_peeling_cost
2043 (loop_vinfo, 0, &dummy2,
2044 &LOOP_VINFO_SCALAR_ITERATION_COST (loop_vinfo),
2045 &prologue_cost_vec, &epilogue_cost_vec);
2046
2047 prologue_cost_vec.release ();
2048 epilogue_cost_vec.release ();
64812d33 2049
4d3d23fb 2050 npeel = best_peel.peel_info.npeel;
89fa689a 2051 dr0_info = best_peel.peel_info.dr_info;
1e69cc8f 2052
4d3d23fb
RD
2053 /* If not peeling is no more expensive than the best peeling found
2054 so far, don't perform any peeling. */
2055 if (nopeel_inside_cost <= best_peel.inside_cost)
2056 do_peeling = false;
2057 }
1e69cc8f 2058
ebfd146a
IR
2059 if (do_peeling)
2060 {
89fa689a 2061 stmt_vec_info stmt_info = dr0_info->stmt;
720f5239 2062 vectype = STMT_VINFO_VECTYPE (stmt_info);
ebfd146a 2063
89fa689a 2064 if (known_alignment_for_access_p (dr0_info))
ebfd146a 2065 {
89fa689a 2066 bool negative = tree_int_cst_compare (DR_STEP (dr0_info->dr),
d8ba5b19 2067 size_zero_node) < 0;
720f5239
IR
2068 if (!npeel)
2069 {
2070 /* Since it's known at compile time, compute the number of
2071 iterations in the peeled loop (the peeling factor) for use in
2072 updating DR_MISALIGNMENT values. The peeling factor is the
2073 vectorization factor minus the misalignment as an element
2074 count. */
89fa689a
RS
2075 mis = (negative
2076 ? DR_MISALIGNMENT (dr0_info)
2077 : -DR_MISALIGNMENT (dr0_info));
ca31798e
AV
2078 /* If known_alignment_for_access_p then we have set
2079 DR_MISALIGNMENT, which is only done if we know it at compile
2080 time, so it is safe to assume the target alignment is constant.
2081 */
2082 unsigned int target_align =
2083 DR_TARGET_ALIGNMENT (dr0_info).to_constant ();
f702e7d4 2084 npeel = ((mis & (target_align - 1))
89fa689a 2085 / vect_get_scalar_dr_size (dr0_info));
720f5239 2086 }
ebfd146a 2087
b8698a0f 2088 /* For interleaved data access every iteration accesses all the
ebfd146a
IR
2089 members of the group, therefore we divide the number of iterations
2090 by the group size. */
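/* E.g. a peel amount of 8 elements for a group of size 4 corresponds
   to 2 scalar iterations, since each scalar iteration accesses all 4
   group members.  */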
0d0293ac 2091 if (STMT_VINFO_GROUPED_ACCESS (stmt_info))
2c53b149 2092 npeel /= DR_GROUP_SIZE (stmt_info);
ebfd146a 2093
73fbfcad 2094 if (dump_enabled_p ())
78c60e3d 2095 dump_printf_loc (MSG_NOTE, vect_location,
e645e942 2096 "Try peeling by %d\n", npeel);
ebfd146a
IR
2097 }
2098
71595748 2099 /* Ensure that all datarefs can be vectorized after the peel. */
89fa689a 2100 if (!vect_peeling_supportable (loop_vinfo, dr0_info, npeel))
71595748 2101 do_peeling = false;
ebfd146a 2102
71595748 2103 /* Check if all datarefs are supportable and log. */
89fa689a 2104 if (do_peeling && known_alignment_for_access_p (dr0_info) && npeel == 0)
720f5239 2105 {
f4ebbd24 2106 opt_result stat = vect_verify_datarefs_alignment (loop_vinfo);
720f5239
IR
2107 if (!stat)
2108 do_peeling = false;
2109 else
ec15a152 2110 return stat;
720f5239
IR
2111 }
2112
476c1280 2113 /* Cost model #1 - honor --param vect-max-peeling-for-alignment. */
4f17aa0b
XDL
2114 if (do_peeling)
2115 {
2116 unsigned max_allowed_peel
028d4092 2117 = param_vect_max_peeling_for_alignment;
247afa98
RB
2118 if (flag_vect_cost_model == VECT_COST_MODEL_CHEAP)
2119 max_allowed_peel = 0;
4f17aa0b
XDL
2120 if (max_allowed_peel != (unsigned)-1)
2121 {
2122 unsigned max_peel = npeel;
2123 if (max_peel == 0)
2124 {
ca31798e
AV
2125 poly_uint64 target_align = DR_TARGET_ALIGNMENT (dr0_info);
2126 unsigned HOST_WIDE_INT target_align_c;
2127 if (target_align.is_constant (&target_align_c))
2128 max_peel =
2129 target_align_c / vect_get_scalar_dr_size (dr0_info) - 1;
2130 else
2131 {
2132 do_peeling = false;
2133 if (dump_enabled_p ())
2134 dump_printf_loc (MSG_NOTE, vect_location,
2135 "Disable peeling, max peels set and vector"
2136 " alignment unknown\n");
2137 }
4f17aa0b
XDL
2138 }
2139 if (max_peel > max_allowed_peel)
2140 {
2141 do_peeling = false;
2142 if (dump_enabled_p ())
2143 dump_printf_loc (MSG_NOTE, vect_location,
2144 "Disable peeling, max peels reached: %d\n", max_peel);
2145 }
2146 }
2147 }
2148
476c1280 2149 /* Cost model #2 - if peeling may result in a remaining loop not
d9f21f6a
RS
2150 iterating enough to be vectorized then do not peel. Since this
2151 is a cost heuristic rather than a correctness decision, use the
2152 most likely runtime value for variable vectorization factors. */
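/* E.g. with 10 known iterations, an assumed VF of 8 and up to 7 peeled
   iterations, fewer than 8 iterations could be left for the vector
   loop (10 < 8 + 7), so peeling is rejected.  */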
476c1280
RB
2153 if (do_peeling
2154 && LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo))
2155 {
d9f21f6a
RS
2156 unsigned int assumed_vf = vect_vf_for_cost (loop_vinfo);
2157 unsigned int max_peel = npeel == 0 ? assumed_vf - 1 : npeel;
2158 if ((unsigned HOST_WIDE_INT) LOOP_VINFO_INT_NITERS (loop_vinfo)
2159 < assumed_vf + max_peel)
476c1280
RB
2160 do_peeling = false;
2161 }
2162
ebfd146a
IR
2163 if (do_peeling)
2164 {
2165 /* (1.2) Update the DR_MISALIGNMENT of each data reference DR_i.
2166 If the misalignment of DR_i is identical to that of dr0 then set
2167 DR_MISALIGNMENT (DR_i) to zero. If the misalignment of DR_i and
2168 dr0 are known at compile time then increment DR_MISALIGNMENT (DR_i)
2169 by the peeling factor times the element size of DR_i (MOD the
2170 vectorization factor times the size). Otherwise, the
2171 misalignment of DR_i must be set to unknown. */
9771b263 2172 FOR_EACH_VEC_ELT (datarefs, i, dr)
89fa689a 2173 if (dr != dr0_info->dr)
ccbd7103 2174 {
f5ae2856 2175 dr_vec_info *dr_info = loop_vinfo->lookup_dr (dr);
e54353a7 2176 if (!vect_relevant_for_alignment_p (dr_info))
ccbd7103
RB
2177 continue;
2178
89fa689a 2179 vect_update_misalignment_for_peel (dr_info, dr0_info, npeel);
ccbd7103 2180 }
ebfd146a 2181
1e5e6ff5 2182 LOOP_VINFO_UNALIGNED_DR (loop_vinfo) = dr0_info;
720f5239 2183 if (npeel)
15e693cc 2184 LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo) = npeel;
720f5239 2185 else
15e693cc 2186 LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo)
89fa689a
RS
2187 = DR_MISALIGNMENT (dr0_info);
2188 SET_DR_MISALIGNMENT (dr0_info, 0);
73fbfcad 2189 if (dump_enabled_p ())
78c60e3d
SS
2190 {
2191 dump_printf_loc (MSG_NOTE, vect_location,
e645e942 2192 "Alignment of access forced using peeling.\n");
78c60e3d 2193 dump_printf_loc (MSG_NOTE, vect_location,
e645e942 2194 "Peeling for alignment will be applied.\n");
78c60e3d 2195 }
ec15a152 2196
62c00445
RB
2197 /* The inside-loop cost will be accounted for in vectorizable_load
2198 and vectorizable_store correctly with adjusted alignments.
2199 Drop the body_cost_vec on the floor here. */
f4ebbd24 2200 opt_result stat = vect_verify_datarefs_alignment (loop_vinfo);
ebfd146a
IR
2201 gcc_assert (stat);
2202 return stat;
2203 }
2204 }
2205
ebfd146a
IR
2206 /* (2) Versioning to force alignment. */
2207
2208 /* Try versioning if:
247afa98 2209 1) the loop is optimized for speed and the cost model is not cheap,
d6d11272 2210 2) there is at least one unsupported misaligned data ref with an unknown
ebfd146a 2211 misalignment, and
d6d11272
XDL
2212 3) all misaligned data refs with a known misalignment are supported, and
2213 4) the number of runtime alignment checks is within reason. */
ebfd146a 2214
247afa98
RB
2215 do_versioning
2216 = (optimize_loop_nest_for_speed_p (loop)
2217 && !loop->inner /* FORNOW */
9d99596e 2218 && flag_vect_cost_model != VECT_COST_MODEL_CHEAP);
ebfd146a
IR
2219
2220 if (do_versioning)
2221 {
9771b263 2222 FOR_EACH_VEC_ELT (datarefs, i, dr)
ebfd146a 2223 {
f5ae2856 2224 dr_vec_info *dr_info = loop_vinfo->lookup_dr (dr);
89fa689a 2225 if (aligned_access_p (dr_info)
d30846a0 2226 || !vect_relevant_for_alignment_p (dr_info))
ebfd146a
IR
2227 continue;
2228
d30846a0 2229 stmt_vec_info stmt_info = dr_info->stmt;
f2e2a985 2230 if (STMT_VINFO_STRIDED_P (stmt_info))
7b5fc413 2231 {
7b5fc413
RB
2232 do_versioning = false;
2233 break;
2234 }
319e6439 2235
89fa689a 2236 supportable_dr_alignment
308bc496 2237 = vect_supportable_dr_alignment (loop_vinfo, dr_info, false);
ebfd146a
IR
2238
2239 if (!supportable_dr_alignment)
2240 {
ebfd146a
IR
2241 int mask;
2242 tree vectype;
2243
89fa689a 2244 if (known_alignment_for_access_p (dr_info)
9771b263 2245 || LOOP_VINFO_MAY_MISALIGN_STMTS (loop_vinfo).length ()
028d4092 2246 >= (unsigned) param_vect_max_version_for_alignment_checks)
ebfd146a
IR
2247 {
2248 do_versioning = false;
2249 break;
2250 }
2251
78e02b3b
RS
2252 vectype = STMT_VINFO_VECTYPE (stmt_info);
2253 gcc_assert (vectype);
b8698a0f 2254
cf098191
RS
2255 /* At present we don't support versioning for alignment
2256 with variable VF, since there's no guarantee that the
2257 VF is a power of two. We could relax this if we added
2258 a way of enforcing a power-of-two size. */
2259 unsigned HOST_WIDE_INT size;
2260 if (!GET_MODE_SIZE (TYPE_MODE (vectype)).is_constant (&size))
2261 {
2262 do_versioning = false;
2263 break;
2264 }
2265
c9aa9108
JR
2266 /* Forcing alignment in the first iteration is no good if
2267 we don't keep it across iterations. For now, just disable
2268 versioning in this case.
6647c1e8
JJ
2269 ?? We could actually unroll the loop to achieve the required
2270 overall step alignment, and forcing the alignment could be
c9aa9108
JR
2271 done by doing some iterations of the non-vectorized loop. */
2272 if (!multiple_p (LOOP_VINFO_VECT_FACTOR (loop_vinfo)
2273 * DR_STEP_ALIGNMENT (dr),
6647c1e8 2274 DR_TARGET_ALIGNMENT (dr_info)))
c9aa9108
JR
2275 {
2276 do_versioning = false;
2277 break;
2278 }
2279
ebfd146a
IR
2280 /* The rightmost bits of an aligned address must be zeros.
2281 Construct the mask needed for this test. For example,
2282 GET_MODE_SIZE for the vector mode V4SI is 16 bytes so the
2283 mask must be 15 = 0xf. */
cf098191 2284 mask = size - 1;
ebfd146a 2285
557532d1
RS
2286 /* FORNOW: use the same mask to test all potentially unaligned
2287 references in the loop. */
2288 if (LOOP_VINFO_PTR_MASK (loop_vinfo)
2289 && LOOP_VINFO_PTR_MASK (loop_vinfo) != mask)
2290 {
2291 do_versioning = false;
2292 break;
2293 }
2294
ebfd146a 2295 LOOP_VINFO_PTR_MASK (loop_vinfo) = mask;
78e02b3b 2296 LOOP_VINFO_MAY_MISALIGN_STMTS (loop_vinfo).safe_push (stmt_info);
ebfd146a
IR
2297 }
2298 }
b8698a0f 2299
ebfd146a 2300 /* Versioning requires at least one misaligned data reference. */
e9dbe7bb 2301 if (!LOOP_REQUIRES_VERSIONING_FOR_ALIGNMENT (loop_vinfo))
ebfd146a
IR
2302 do_versioning = false;
2303 else if (!do_versioning)
9771b263 2304 LOOP_VINFO_MAY_MISALIGN_STMTS (loop_vinfo).truncate (0);
ebfd146a
IR
2305 }
2306
2307 if (do_versioning)
2308 {
7bcbf2d8 2309 vec<stmt_vec_info> may_misalign_stmts
ebfd146a 2310 = LOOP_VINFO_MAY_MISALIGN_STMTS (loop_vinfo);
7bcbf2d8 2311 stmt_vec_info stmt_info;
ebfd146a
IR
2312
2313 /* It can now be assumed that the data references in the statements
2314 in LOOP_VINFO_MAY_MISALIGN_STMTS will be aligned in the version
2315 of the loop being vectorized. */
7bcbf2d8 2316 FOR_EACH_VEC_ELT (may_misalign_stmts, i, stmt_info)
ebfd146a 2317 {
89fa689a
RS
2318 dr_vec_info *dr_info = STMT_VINFO_DR_INFO (stmt_info);
2319 SET_DR_MISALIGNMENT (dr_info, 0);
73fbfcad 2320 if (dump_enabled_p ())
e645e942
TJ
2321 dump_printf_loc (MSG_NOTE, vect_location,
2322 "Alignment of access forced using versioning.\n");
ebfd146a
IR
2323 }
2324
73fbfcad 2325 if (dump_enabled_p ())
e645e942
TJ
2326 dump_printf_loc (MSG_NOTE, vect_location,
2327 "Versioning for alignment will be applied.\n");
ebfd146a
IR
2328
2329 /* Peeling and versioning can't be done together at this time. */
2330 gcc_assert (! (do_peeling && do_versioning));
2331
f4ebbd24 2332 opt_result stat = vect_verify_datarefs_alignment (loop_vinfo);
ebfd146a
IR
2333 gcc_assert (stat);
2334 return stat;
2335 }
2336
2337 /* This point is reached if neither peeling nor versioning is being done. */
2338 gcc_assert (! (do_peeling || do_versioning));
2339
f4ebbd24 2340 opt_result stat = vect_verify_datarefs_alignment (loop_vinfo);
ebfd146a
IR
2341 return stat;
2342}
2343
2344
777e1f09
RG
2345/* Function vect_find_same_alignment_drs.
2346
f5ae2856 2347 Update group and alignment relations in VINFO according to the chosen
777e1f09
RG
2348 vectorization factor. */
2349
2350static void
f5ae2856 2351vect_find_same_alignment_drs (vec_info *vinfo, data_dependence_relation *ddr)
777e1f09 2352{
777e1f09
RG
2353 struct data_reference *dra = DDR_A (ddr);
2354 struct data_reference *drb = DDR_B (ddr);
f5ae2856
RS
2355 dr_vec_info *dr_info_a = vinfo->lookup_dr (dra);
2356 dr_vec_info *dr_info_b = vinfo->lookup_dr (drb);
89fa689a
RS
2357 stmt_vec_info stmtinfo_a = dr_info_a->stmt;
2358 stmt_vec_info stmtinfo_b = dr_info_b->stmt;
777e1f09
RG
2359
2360 if (DDR_ARE_DEPENDENT (ddr) == chrec_known)
2361 return;
2362
720f5239 2363 if (dra == drb)
777e1f09
RG
2364 return;
2365
5fa23466
RB
2366 if (STMT_VINFO_GATHER_SCATTER_P (stmtinfo_a)
2367 || STMT_VINFO_GATHER_SCATTER_P (stmtinfo_b))
2368 return;
2369
62c8a2cf 2370 if (!operand_equal_p (DR_BASE_ADDRESS (dra), DR_BASE_ADDRESS (drb), 0)
748bbe72
RS
2371 || !operand_equal_p (DR_OFFSET (dra), DR_OFFSET (drb), 0)
2372 || !operand_equal_p (DR_STEP (dra), DR_STEP (drb), 0))
777e1f09
RG
2373 return;
2374
748bbe72 2375 /* Two references with distance zero have the same alignment. */
c0a46545
RS
2376 poly_offset_int diff = (wi::to_poly_offset (DR_INIT (dra))
2377 - wi::to_poly_offset (DR_INIT (drb)));
2378 if (maybe_ne (diff, 0))
777e1f09 2379 {
748bbe72 2380 /* Get the wider of the two alignments. */
ca31798e
AV
2381 poly_uint64 align_a =
2382 exact_div (vect_calculate_target_alignment (dr_info_a),
2383 BITS_PER_UNIT);
2384 poly_uint64 align_b =
2385 exact_div (vect_calculate_target_alignment (dr_info_b),
2386 BITS_PER_UNIT);
2387 unsigned HOST_WIDE_INT align_a_c, align_b_c;
2388 if (!align_a.is_constant (&align_a_c)
2389 || !align_b.is_constant (&align_b_c))
2390 return;
2391
2392 unsigned HOST_WIDE_INT max_align = MAX (align_a_c, align_b_c);
748bbe72
RS
2393
2394 /* Require the gap to be a multiple of the larger vector alignment. */
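/* E.g. accesses whose DR_INITs differ by 64 bytes, with target
   alignments of 16 and 32 bytes, are recorded as having the same
   alignment (64 is a multiple of 32); a 40-byte gap would not be.  */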
c0a46545 2395 if (!multiple_p (diff, max_align))
748bbe72
RS
2396 return;
2397 }
777e1f09 2398
748bbe72
RS
2399 STMT_VINFO_SAME_ALIGN_REFS (stmtinfo_a).safe_push (drb);
2400 STMT_VINFO_SAME_ALIGN_REFS (stmtinfo_b).safe_push (dra);
2401 if (dump_enabled_p ())
3c2a8ed0
DM
2402 dump_printf_loc (MSG_NOTE, vect_location,
2403 "accesses have the same alignment: %T and %T\n",
2404 DR_REF (dra), DR_REF (drb));
777e1f09
RG
2405}
2406
2407
ebfd146a
IR
2408/* Function vect_analyze_data_refs_alignment
2409
2410 Analyze the alignment of the data-references in the loop.
2411 Return FALSE if a data reference is found that cannot be vectorized. */
2412
f4ebbd24 2413opt_result
8df82de2 2414vect_analyze_data_refs_alignment (loop_vec_info loop_vinfo)
ebfd146a 2415{
adac3a68 2416 DUMP_VECT_SCOPE ("vect_analyze_data_refs_alignment");
ebfd146a 2417
777e1f09
RG
2418 /* Mark groups of data references with same alignment using
2419 data dependence information. */
8df82de2 2420 vec<ddr_p> ddrs = LOOP_VINFO_DDRS (loop_vinfo);
a5b50aa1
RB
2421 struct data_dependence_relation *ddr;
2422 unsigned int i;
2423
2424 FOR_EACH_VEC_ELT (ddrs, i, ddr)
8df82de2 2425 vect_find_same_alignment_drs (loop_vinfo, ddr);
a5b50aa1 2426
8df82de2 2427 vec<data_reference_p> datarefs = LOOP_VINFO_DATAREFS (loop_vinfo);
a5b50aa1
RB
2428 struct data_reference *dr;
2429
8df82de2 2430 vect_record_base_alignments (loop_vinfo);
a5b50aa1 2431 FOR_EACH_VEC_ELT (datarefs, i, dr)
777e1f09 2432 {
8df82de2 2433 dr_vec_info *dr_info = loop_vinfo->lookup_dr (dr);
89fa689a 2434 if (STMT_VINFO_VECTORIZABLE (dr_info->stmt))
8df82de2 2435 vect_compute_data_ref_alignment (loop_vinfo, dr_info);
777e1f09
RG
2436 }
2437
f4ebbd24 2438 return opt_result::success ();
a5b50aa1
RB
2439}
2440
2441
2442/* Analyze alignment of DRs of stmts in NODE. */
2443
2444static bool
308bc496 2445vect_slp_analyze_and_verify_node_alignment (vec_info *vinfo, slp_tree node)
a5b50aa1 2446{
52eab378
RB
2447 /* We vectorize from the first scalar stmt in the node unless
2448 the node is permuted in which case we start from the first
2449 element in the group. */
b9787581 2450 stmt_vec_info first_stmt_info = SLP_TREE_SCALAR_STMTS (node)[0];
89fa689a 2451 dr_vec_info *first_dr_info = STMT_VINFO_DR_INFO (first_stmt_info);
52eab378 2452 if (SLP_TREE_LOAD_PERMUTATION (node).exists ())
bffb8014 2453 first_stmt_info = DR_GROUP_FIRST_ELEMENT (first_stmt_info);
52eab378 2454
89fa689a 2455 dr_vec_info *dr_info = STMT_VINFO_DR_INFO (first_stmt_info);
308bc496 2456 vect_compute_data_ref_alignment (vinfo, dr_info);
6924b5e6 2457 /* In several places we need alignment of the first element anyway. */
89fa689a 2458 if (dr_info != first_dr_info)
308bc496 2459 vect_compute_data_ref_alignment (vinfo, first_dr_info);
6924b5e6
RB
2460
2461 /* For creating the data-ref pointer we need alignment of the
2462 first element as well. */
2463 first_stmt_info = vect_find_first_scalar_stmt_in_slp (node);
2464 if (first_stmt_info != SLP_TREE_SCALAR_STMTS (node)[0])
2465 {
2466 first_dr_info = STMT_VINFO_DR_INFO (first_stmt_info);
2467 if (dr_info != first_dr_info)
2468 vect_compute_data_ref_alignment (vinfo, first_dr_info);
2469 }
2470
308bc496 2471 if (! verify_data_ref_alignment (vinfo, dr_info))
ebfd146a 2472 {
52eab378
RB
2473 if (dump_enabled_p ())
2474 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2475 "not vectorized: bad data alignment in basic "
2476 "block.\n");
2477 return false;
ebfd146a
IR
2478 }
2479
2480 return true;
2481}
2482
a5b50aa1
RB
2483/* Function vect_slp_analyze_instance_alignment
2484
2485 Analyze the alignment of the data-references in the SLP instance.
2486 Return FALSE if a data reference is found that cannot be vectorized. */
2487
2488bool
308bc496
RB
2489vect_slp_analyze_and_verify_instance_alignment (vec_info *vinfo,
2490 slp_instance instance)
a5b50aa1 2491{
adac3a68 2492 DUMP_VECT_SCOPE ("vect_slp_analyze_and_verify_instance_alignment");
a5b50aa1
RB
2493
2494 slp_tree node;
2495 unsigned i;
2496 FOR_EACH_VEC_ELT (SLP_INSTANCE_LOADS (instance), i, node)
308bc496 2497 if (! vect_slp_analyze_and_verify_node_alignment (vinfo, node))
a5b50aa1
RB
2498 return false;
2499
2500 node = SLP_INSTANCE_TREE (instance);
9758d196 2501 if (STMT_VINFO_DATA_REF (SLP_TREE_REPRESENTATIVE (node))
a5b50aa1 2502 && ! vect_slp_analyze_and_verify_node_alignment
308bc496 2503 (vinfo, SLP_INSTANCE_TREE (instance)))
a5b50aa1
RB
2504 return false;
2505
2506 return true;
2507}
2508
ebfd146a 2509
89fa689a 2510/* Analyze groups of accesses: check that DR_INFO belongs to a group of
0d0293ac
MM
2511 accesses of legal size, step, etc. Detect gaps, single element
2512 interleaving, and other special cases. Set grouped access info.
97af59b2
RB
2513 Collect groups of strided stores for further use in SLP analysis.
2514 Worker for vect_analyze_group_access. */
ebfd146a
IR
2515
2516static bool
308bc496 2517vect_analyze_group_access_1 (vec_info *vinfo, dr_vec_info *dr_info)
ebfd146a 2518{
89fa689a 2519 data_reference *dr = dr_info->dr;
ebfd146a
IR
2520 tree step = DR_STEP (dr);
2521 tree scalar_type = TREE_TYPE (DR_REF (dr));
2522 HOST_WIDE_INT type_size = TREE_INT_CST_LOW (TYPE_SIZE_UNIT (scalar_type));
89fa689a 2523 stmt_vec_info stmt_info = dr_info->stmt;
308bc496
RB
2524 loop_vec_info loop_vinfo = dyn_cast <loop_vec_info> (vinfo);
2525 bb_vec_info bb_vinfo = dyn_cast <bb_vec_info> (vinfo);
7b5fc413 2526 HOST_WIDE_INT dr_step = -1;
0d0293ac 2527 HOST_WIDE_INT groupsize, last_accessed_element = 1;
ebfd146a
IR
2528 bool slp_impossible = false;
2529
0d0293ac
MM
2530 /* For interleaving, GROUPSIZE is STEP counted in elements, i.e., the
2531 size of the interleaving group (including gaps). */
7b5fc413
RB
2532 if (tree_fits_shwi_p (step))
2533 {
2534 dr_step = tree_to_shwi (step);
993a6bd9
RB
2535 /* Check that STEP is a multiple of type size. Otherwise there is
2536 a non-element-sized gap at the end of the group which we
2c53b149 2537 cannot represent in DR_GROUP_GAP or DR_GROUP_SIZE.
993a6bd9 2538 ??? As we can handle non-constant step fine here we should
2c53b149
RB
2539 simply remove uses of DR_GROUP_GAP between the last and first
2540 element and instead rely on DR_STEP. DR_GROUP_SIZE then would
993a6bd9
RB
2541 simply not include that gap. */
2542 if ((dr_step % type_size) != 0)
2543 {
2544 if (dump_enabled_p ())
3c2a8ed0
DM
2545 dump_printf_loc (MSG_NOTE, vect_location,
2546 "Step %T is not a multiple of the element size"
2547 " for %T\n",
2548 step, DR_REF (dr));
993a6bd9
RB
2549 return false;
2550 }
7b5fc413
RB
2551 groupsize = absu_hwi (dr_step) / type_size;
2552 }
2553 else
2554 groupsize = 0;
ebfd146a
IR
2555
2556 /* A non-consecutive access is possible only if it is a part of interleaving. */
78e02b3b 2557 if (!DR_GROUP_FIRST_ELEMENT (stmt_info))
ebfd146a
IR
2558 {
2559 /* Check if this DR is a part of interleaving, and is a single
2560 element of the group that is accessed in the loop. */
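/* E.g. a load of a[3*i] with 4-byte elements has a 12-byte step,
   giving a single-element group of size 3 with a gap of 2 elements
   after the accessed one.  */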
b8698a0f 2561
ebfd146a 2562 /* Gaps are supported only for loads. STEP must be a multiple of the type
4aa157e8 2563 size. */
ebfd146a
IR
2564 if (DR_IS_READ (dr)
2565 && (dr_step % type_size) == 0
4aa157e8 2566 && groupsize > 0)
ebfd146a 2567 {
78e02b3b
RS
2568 DR_GROUP_FIRST_ELEMENT (stmt_info) = stmt_info;
2569 DR_GROUP_SIZE (stmt_info) = groupsize;
2c53b149 2570 DR_GROUP_GAP (stmt_info) = groupsize - 1;
73fbfcad 2571 if (dump_enabled_p ())
3c2a8ed0
DM
2572 dump_printf_loc (MSG_NOTE, vect_location,
2573 "Detected single element interleaving %T"
2574 " step %T\n",
2575 DR_REF (dr), step);
48df3fa6 2576
ebfd146a
IR
2577 return true;
2578 }
4b5caab7 2579
73fbfcad 2580 if (dump_enabled_p ())
3c2a8ed0
DM
2581 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2582 "not consecutive access %G", stmt_info->stmt);
4b5caab7
IR
2583
2584 if (bb_vinfo)
78e02b3b
RS
2585 {
2586 /* Mark the statement as unvectorizable. */
89fa689a 2587 STMT_VINFO_VECTORIZABLE (stmt_info) = false;
78e02b3b
RS
2588 return true;
2589 }
78c60e3d 2590
bbeeac91
DM
2591 if (dump_enabled_p ())
2592 dump_printf_loc (MSG_NOTE, vect_location, "using strided accesses\n");
90a7a1b5
RB
2593 STMT_VINFO_STRIDED_P (stmt_info) = true;
2594 return true;
ebfd146a
IR
2595 }
2596
78e02b3b 2597 if (DR_GROUP_FIRST_ELEMENT (stmt_info) == stmt_info)
ebfd146a
IR
2598 {
2599 /* First stmt in the interleaving chain. Check the chain. */
bffb8014 2600 stmt_vec_info next = DR_GROUP_NEXT_ELEMENT (stmt_info);
ebfd146a 2601 struct data_reference *data_ref = dr;
df398a37 2602 unsigned int count = 1;
ebfd146a 2603 tree prev_init = DR_INIT (data_ref);
08940f33 2604 HOST_WIDE_INT diff, gaps = 0;
ebfd146a 2605
c0a46545 2606 /* By construction, all group members have INTEGER_CST DR_INITs. */
ebfd146a
IR
2607 while (next)
2608 {
f95b7597
RB
2609 /* We never have the same DR multiple times. */
2610 gcc_assert (tree_int_cst_compare (DR_INIT (data_ref),
2611 DR_INIT (STMT_VINFO_DATA_REF (next))) != 0);
48df3fa6 2612
bffb8014 2613 data_ref = STMT_VINFO_DATA_REF (next);
ebfd146a 2614
08940f33
RB
2615 /* All group members have the same STEP by construction. */
2616 gcc_checking_assert (operand_equal_p (DR_STEP (data_ref), step, 0));
ebfd146a 2617
ebfd146a
IR
2618 /* Check that the distance between two accesses is equal to the type
2619 size. Otherwise, we have gaps. */
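/* E.g. for loads of a[4*i], a[4*i+1] and a[4*i+3] the diff between
   the last two members is 2 elements, so DR_GROUP_GAP of the last
   member becomes 2 and a[4*i+2] is never read.  */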
2620 diff = (TREE_INT_CST_LOW (DR_INIT (data_ref))
2621 - TREE_INT_CST_LOW (prev_init)) / type_size;
2622 if (diff != 1)
2623 {
2624 /* FORNOW: SLP of accesses with gaps is not supported. */
2625 slp_impossible = true;
b0af49c4 2626 if (DR_IS_WRITE (data_ref))
ebfd146a 2627 {
73fbfcad 2628 if (dump_enabled_p ())
e645e942
TJ
2629 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2630 "interleaved store with gaps\n");
ebfd146a
IR
2631 return false;
2632 }
4da39468
IR
2633
2634 gaps += diff - 1;
ebfd146a
IR
2635 }
2636
48df3fa6
IR
2637 last_accessed_element += diff;
2638
ebfd146a 2639 /* Store the gap from the previous member of the group. If there is no
2c53b149 2640 gap in the access, DR_GROUP_GAP is always 1. */
bffb8014 2641 DR_GROUP_GAP (next) = diff;
ebfd146a 2642
bffb8014
RS
2643 prev_init = DR_INIT (data_ref);
2644 next = DR_GROUP_NEXT_ELEMENT (next);
2645 /* Count the number of data-refs in the chain. */
2646 count++;
ebfd146a
IR
2647 }
2648
7b5fc413
RB
2649 if (groupsize == 0)
2650 groupsize = count + gaps;
ebfd146a 2651
30fec2f9
RB
2652 /* This could be UINT_MAX but as we are generating code in a very
2653 inefficient way we have to cap earlier. See PR78699 for example. */
2654 if (groupsize > 4096)
97af59b2
RB
2655 {
2656 if (dump_enabled_p ())
2657 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2658 "group is too large\n");
2659 return false;
2660 }
2661
7b5fc413 2662 /* Check that the size of the interleaving is equal to count for stores,
ebfd146a 2663 i.e., that there are no gaps. */
e004aa11
RB
2664 if (groupsize != count
2665 && !DR_IS_READ (dr))
ebfd146a 2666 {
203942b8
RS
2667 groupsize = count;
2668 STMT_VINFO_STRIDED_P (stmt_info) = true;
e004aa11
RB
2669 }
2670
2671 /* If there is a gap after the last load in the group it is the
2672 difference between the groupsize and the last accessed
2673 element.
2674 When there is no gap, this difference should be 0. */
78e02b3b 2675 DR_GROUP_GAP (stmt_info) = groupsize - last_accessed_element;
ebfd146a 2676
78e02b3b 2677 DR_GROUP_SIZE (stmt_info) = groupsize;
73fbfcad 2678 if (dump_enabled_p ())
e004aa11
RB
2679 {
2680 dump_printf_loc (MSG_NOTE, vect_location,
97af59b2
RB
2681 "Detected interleaving ");
2682 if (DR_IS_READ (dr))
2683 dump_printf (MSG_NOTE, "load ");
203942b8
RS
2684 else if (STMT_VINFO_STRIDED_P (stmt_info))
2685 dump_printf (MSG_NOTE, "strided store ");
97af59b2
RB
2686 else
2687 dump_printf (MSG_NOTE, "store ");
7ea4b8ed
RB
2688 dump_printf (MSG_NOTE, "of size %u\n",
2689 (unsigned)groupsize);
2690 dump_printf_loc (MSG_NOTE, vect_location, "\t%G", stmt_info->stmt);
2691 next = DR_GROUP_NEXT_ELEMENT (stmt_info);
2692 while (next)
2693 {
2694 if (DR_GROUP_GAP (next) != 1)
2695 dump_printf_loc (MSG_NOTE, vect_location,
2696 "\t<gap of %d elements>\n",
2697 DR_GROUP_GAP (next) - 1);
2698 dump_printf_loc (MSG_NOTE, vect_location, "\t%G", next->stmt);
2699 next = DR_GROUP_NEXT_ELEMENT (next);
2700 }
78e02b3b 2701 if (DR_GROUP_GAP (stmt_info) != 0)
e004aa11 2702 dump_printf_loc (MSG_NOTE, vect_location,
7ea4b8ed 2703 "\t<gap of %d elements>\n",
78e02b3b 2704 DR_GROUP_GAP (stmt_info));
e004aa11 2705 }
ebfd146a 2706
b8698a0f 2707 /* SLP: create an SLP data structure for every interleaving group of
ebfd146a 2708 stores for further analysis in vect_analyze_slp. */
b0af49c4 2709 if (DR_IS_WRITE (dr) && !slp_impossible)
78e02b3b
RS
2710 {
2711 if (loop_vinfo)
2712 LOOP_VINFO_GROUPED_STORES (loop_vinfo).safe_push (stmt_info);
2713 if (bb_vinfo)
2714 BB_VINFO_GROUPED_STORES (bb_vinfo).safe_push (stmt_info);
2715 }
ebfd146a
IR
2716 }
2717
2718 return true;
2719}
2720
89fa689a 2721/* Analyze groups of accesses: check that DR_INFO belongs to a group of
97af59b2
RB
2722 accesses of legal size, step, etc. Detect gaps, single element
2723 interleaving, and other special cases. Set grouped access info.
2724 Collect groups of strided stores for further use in SLP analysis. */
2725
2726static bool
308bc496 2727vect_analyze_group_access (vec_info *vinfo, dr_vec_info *dr_info)
97af59b2 2728{
308bc496 2729 if (!vect_analyze_group_access_1 (vinfo, dr_info))
97af59b2
RB
2730 {
2731 /* Dissolve the group if present. */
89fa689a 2732 stmt_vec_info stmt_info = DR_GROUP_FIRST_ELEMENT (dr_info->stmt);
bffb8014 2733 while (stmt_info)
97af59b2 2734 {
bffb8014
RS
2735 stmt_vec_info next = DR_GROUP_NEXT_ELEMENT (stmt_info);
2736 DR_GROUP_FIRST_ELEMENT (stmt_info) = NULL;
2737 DR_GROUP_NEXT_ELEMENT (stmt_info) = NULL;
2738 stmt_info = next;
97af59b2
RB
2739 }
2740 return false;
2741 }
2742 return true;
2743}
ebfd146a 2744
89fa689a 2745/* Analyze the access pattern of the data-reference DR_INFO.
ebfd146a 2746 In case of non-consecutive accesses call vect_analyze_group_access() to
0d0293ac 2747 analyze groups of accesses. */
ebfd146a
IR
2748
2749static bool
308bc496 2750vect_analyze_data_ref_access (vec_info *vinfo, dr_vec_info *dr_info)
ebfd146a 2751{
89fa689a 2752 data_reference *dr = dr_info->dr;
ebfd146a
IR
2753 tree step = DR_STEP (dr);
2754 tree scalar_type = TREE_TYPE (DR_REF (dr));
89fa689a 2755 stmt_vec_info stmt_info = dr_info->stmt;
308bc496 2756 loop_vec_info loop_vinfo = dyn_cast <loop_vec_info> (vinfo);
99b1c316 2757 class loop *loop = NULL;
ebfd146a 2758
f307441a
RS
2759 if (STMT_VINFO_GATHER_SCATTER_P (stmt_info))
2760 return true;
2761
a70d6342
IR
2762 if (loop_vinfo)
2763 loop = LOOP_VINFO_LOOP (loop_vinfo);
b8698a0f 2764
a70d6342 2765 if (loop_vinfo && !step)
ebfd146a 2766 {
73fbfcad 2767 if (dump_enabled_p ())
e645e942
TJ
2768 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2769 "bad data-ref access in loop\n");
ebfd146a
IR
2770 return false;
2771 }
2772
c134cf2a 2773 /* Allow loads with zero step in inner-loop vectorization. */
319e6439 2774 if (loop_vinfo && integer_zerop (step))
39becbac 2775 {
78e02b3b
RS
2776 DR_GROUP_FIRST_ELEMENT (stmt_info) = NULL;
2777 if (!nested_in_vect_loop_p (loop, stmt_info))
c134cf2a
YR
2778 return DR_IS_READ (dr);
2779 /* Allow references with zero step for outer loops marked
2780 with pragma omp simd only - it guarantees absence of
2781 loop-carried dependencies between inner loop iterations. */
962e91fc 2782 if (loop->safelen < 2)
6e8dad05
RB
2783 {
2784 if (dump_enabled_p ())
2785 dump_printf_loc (MSG_NOTE, vect_location,
e645e942 2786 "zero step in inner loop of nest\n");
6e8dad05
RB
2787 return false;
2788 }
39becbac 2789 }
ebfd146a 2790
78e02b3b 2791 if (loop && nested_in_vect_loop_p (loop, stmt_info))
ebfd146a
IR
2792 {
2793 /* Interleaved accesses are not yet supported within outer-loop
2794 vectorization for references in the inner-loop. */
78e02b3b 2795 DR_GROUP_FIRST_ELEMENT (stmt_info) = NULL;
ebfd146a
IR
2796
2797 /* For the rest of the analysis we use the outer-loop step. */
2798 step = STMT_VINFO_DR_STEP (stmt_info);
319e6439 2799 if (integer_zerop (step))
ebfd146a 2800 {
73fbfcad 2801 if (dump_enabled_p ())
78c60e3d 2802 dump_printf_loc (MSG_NOTE, vect_location,
e645e942 2803 "zero step in outer loop.\n");
3bab6342 2804 return DR_IS_READ (dr);
ebfd146a
IR
2805 }
2806 }
2807
2808 /* Consecutive? */
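/* E.g. a 4-byte access with DR_STEP 4 (or -4) is consecutive and is
   not part of an interleaving group.  */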
319e6439 2809 if (TREE_CODE (step) == INTEGER_CST)
ebfd146a 2810 {
319e6439
RG
2811 HOST_WIDE_INT dr_step = TREE_INT_CST_LOW (step);
2812 if (!tree_int_cst_compare (step, TYPE_SIZE_UNIT (scalar_type))
2813 || (dr_step < 0
2814 && !compare_tree_int (TYPE_SIZE_UNIT (scalar_type), -dr_step)))
2815 {
2816 /* Mark that it is not interleaving. */
78e02b3b 2817 DR_GROUP_FIRST_ELEMENT (stmt_info) = NULL;
319e6439
RG
2818 return true;
2819 }
ebfd146a
IR
2820 }
2821
78e02b3b 2822 if (loop && nested_in_vect_loop_p (loop, stmt_info))
ebfd146a 2823 {
73fbfcad 2824 if (dump_enabled_p ())
78c60e3d 2825 dump_printf_loc (MSG_NOTE, vect_location,
e645e942 2826 "grouped access in outer loop.\n");
ebfd146a
IR
2827 return false;
2828 }
2829
7b5fc413 2830
319e6439
RG
2831 /* Assume this is a DR handled by non-constant strided load case. */
2832 if (TREE_CODE (step) != INTEGER_CST)
f2e2a985 2833 return (STMT_VINFO_STRIDED_P (stmt_info)
7b5fc413 2834 && (!STMT_VINFO_GROUPED_ACCESS (stmt_info)
308bc496 2835 || vect_analyze_group_access (vinfo, dr_info)));
319e6439 2836
ebfd146a 2837 /* Not consecutive access - check if it's a part of an interleaving group. */
308bc496 2838 return vect_analyze_group_access (vinfo, dr_info);
ebfd146a
IR
2839}
2840
5abe1e05
RB
2841/* Compare two data-references DRA and DRB to group them into chunks
2842 suitable for grouping. */
2843
2844static int
2845dr_group_sort_cmp (const void *dra_, const void *drb_)
2846{
2847 data_reference_p dra = *(data_reference_p *)const_cast<void *>(dra_);
2848 data_reference_p drb = *(data_reference_p *)const_cast<void *>(drb_);
5abe1e05
RB
2849 int cmp;
2850
2851 /* Stabilize sort. */
2852 if (dra == drb)
2853 return 0;
2854
8349b024
RB
2855 /* DRs in different loops never belong to the same group. */
2856 loop_p loopa = gimple_bb (DR_STMT (dra))->loop_father;
2857 loop_p loopb = gimple_bb (DR_STMT (drb))->loop_father;
2858 if (loopa != loopb)
2859 return loopa->num < loopb->num ? -1 : 1;
2860
5abe1e05 2861 /* Ordering of DRs according to base. */
d20eac1b
RB
2862 cmp = data_ref_compare_tree (DR_BASE_ADDRESS (dra),
2863 DR_BASE_ADDRESS (drb));
2864 if (cmp != 0)
2865 return cmp;
5abe1e05
RB
2866
2867 /* And according to DR_OFFSET. */
d20eac1b
RB
2868 cmp = data_ref_compare_tree (DR_OFFSET (dra), DR_OFFSET (drb));
2869 if (cmp != 0)
2870 return cmp;
5abe1e05
RB
2871
2872 /* Put reads before writes. */
2873 if (DR_IS_READ (dra) != DR_IS_READ (drb))
2874 return DR_IS_READ (dra) ? -1 : 1;
2875
2876 /* Then sort after access size. */
d20eac1b
RB
2877 cmp = data_ref_compare_tree (TYPE_SIZE_UNIT (TREE_TYPE (DR_REF (dra))),
2878 TYPE_SIZE_UNIT (TREE_TYPE (DR_REF (drb))));
2879 if (cmp != 0)
2880 return cmp;
5abe1e05
RB
2881
2882 /* And after step. */
d20eac1b
RB
2883 cmp = data_ref_compare_tree (DR_STEP (dra), DR_STEP (drb));
2884 if (cmp != 0)
2885 return cmp;
5abe1e05
RB
2886
2887 /* Then sort after DR_INIT. In case of identical DRs sort after stmt UID. */
36fd6408 2888 cmp = data_ref_compare_tree (DR_INIT (dra), DR_INIT (drb));
5abe1e05
RB
2889 if (cmp == 0)
2890 return gimple_uid (DR_STMT (dra)) < gimple_uid (DR_STMT (drb)) ? -1 : 1;
2891 return cmp;
2892}
ebfd146a 2893
7e11fc7f
RS
2894/* If OP is the result of a conversion, return the unconverted value,
2895 otherwise return null. */
2896
2897static tree
2898strip_conversion (tree op)
2899{
2900 if (TREE_CODE (op) != SSA_NAME)
2901 return NULL_TREE;
2902 gimple *stmt = SSA_NAME_DEF_STMT (op);
2903 if (!is_gimple_assign (stmt)
2904 || !CONVERT_EXPR_CODE_P (gimple_assign_rhs_code (stmt)))
2905 return NULL_TREE;
2906 return gimple_assign_rhs1 (stmt);
2907}
2908
32e8e429 2909/* Return true if vectorizable_* routines can handle statements STMT1_INFO
99763671
AM
2910 and STMT2_INFO being in a single group. When ALLOW_SLP_P, masked loads can
2911 be grouped in SLP mode. */
7e11fc7f
RS
2912
2913static bool
99763671
AM
2914can_group_stmts_p (stmt_vec_info stmt1_info, stmt_vec_info stmt2_info,
2915 bool allow_slp_p)
7e11fc7f 2916{
32e8e429
RS
2917 if (gimple_assign_single_p (stmt1_info->stmt))
2918 return gimple_assign_single_p (stmt2_info->stmt);
7e11fc7f 2919
32e8e429 2920 gcall *call1 = dyn_cast <gcall *> (stmt1_info->stmt);
beb456c3 2921 if (call1 && gimple_call_internal_p (call1))
7e11fc7f
RS
2922 {
2923 /* Check for two masked loads or two masked stores. */
32e8e429 2924 gcall *call2 = dyn_cast <gcall *> (stmt2_info->stmt);
beb456c3 2925 if (!call2 || !gimple_call_internal_p (call2))
7e11fc7f 2926 return false;
beb456c3 2927 internal_fn ifn = gimple_call_internal_fn (call1);
7e11fc7f
RS
2928 if (ifn != IFN_MASK_LOAD && ifn != IFN_MASK_STORE)
2929 return false;
beb456c3 2930 if (ifn != gimple_call_internal_fn (call2))
7e11fc7f
RS
2931 return false;
2932
2933 /* Check that the masks are the same. Cope with casts of masks,
2934 like those created by build_mask_conversion. */
beb456c3
RS
2935 tree mask1 = gimple_call_arg (call1, 2);
2936 tree mask2 = gimple_call_arg (call2, 2);
99763671
AM
2937 if (!operand_equal_p (mask1, mask2, 0)
2938 && (ifn == IFN_MASK_STORE || !allow_slp_p))
7e11fc7f
RS
2939 {
2940 mask1 = strip_conversion (mask1);
2941 if (!mask1)
2942 return false;
2943 mask2 = strip_conversion (mask2);
2944 if (!mask2)
2945 return false;
2946 if (!operand_equal_p (mask1, mask2, 0))
2947 return false;
2948 }
2949 return true;
2950 }
2951
2952 return false;
2953}
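/* Illustrative sketch, not part of the original source: two masked loads
   such as

     vec1 = .MASK_LOAD (ptr1, align, mask_1);
     vec2 = .MASK_LOAD (ptr2, align, mask_1);

   share their mask operand (the third call argument) and are accepted as a
   group; with ALLOW_SLP_P the masks of two masked loads may even differ, in
   which case the resulting group is usable for SLP vectorization only (see
   STMT_VINFO_SLP_VECT_ONLY below).  */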
2954
ebfd146a
IR
2955/* Function vect_analyze_data_ref_accesses.
2956
2957 Analyze the access pattern of all the data references in the loop.
2958
2959 FORNOW: the only access pattern that is considered vectorizable is a
2960 simple step 1 (consecutive) access.
2961
2962 FORNOW: handle only arrays and pointer accesses. */
2963
f4ebbd24 2964opt_result
310213d4 2965vect_analyze_data_ref_accesses (vec_info *vinfo)
ebfd146a
IR
2966{
2967 unsigned int i;
ca823c85 2968 vec<data_reference_p> datarefs = vinfo->shared->datarefs;
ebfd146a
IR
2969 struct data_reference *dr;
2970
adac3a68 2971 DUMP_VECT_SCOPE ("vect_analyze_data_ref_accesses");
ebfd146a 2972
5abe1e05 2973 if (datarefs.is_empty ())
f4ebbd24 2974 return opt_result::success ();
5abe1e05
RB
2975
2976 /* Sort the array of datarefs to make building the interleaving chains
3d54b29d
JJ
2977 linear. Don't modify the original vector's order, it is needed for
2978 determining what dependencies are reversed. */
2979 vec<data_reference_p> datarefs_copy = datarefs.copy ();
75509ba2 2980 datarefs_copy.qsort (dr_group_sort_cmp);
be43a887 2981 hash_set<stmt_vec_info> to_fixup;
5abe1e05
RB
2982
2983 /* Build the interleaving chains. */
3d54b29d 2984 for (i = 0; i < datarefs_copy.length () - 1;)
5abe1e05 2985 {
3d54b29d 2986 data_reference_p dra = datarefs_copy[i];
f5ae2856 2987 dr_vec_info *dr_info_a = vinfo->lookup_dr (dra);
89fa689a 2988 stmt_vec_info stmtinfo_a = dr_info_a->stmt;
5abe1e05 2989 stmt_vec_info lastinfo = NULL;
82279a51
RS
2990 if (!STMT_VINFO_VECTORIZABLE (stmtinfo_a)
2991 || STMT_VINFO_GATHER_SCATTER_P (stmtinfo_a))
f955c4c4
RB
2992 {
2993 ++i;
2994 continue;
2995 }
3d54b29d 2996 for (i = i + 1; i < datarefs_copy.length (); ++i)
5abe1e05 2997 {
3d54b29d 2998 data_reference_p drb = datarefs_copy[i];
f5ae2856 2999 dr_vec_info *dr_info_b = vinfo->lookup_dr (drb);
89fa689a 3000 stmt_vec_info stmtinfo_b = dr_info_b->stmt;
82279a51
RS
3001 if (!STMT_VINFO_VECTORIZABLE (stmtinfo_b)
3002 || STMT_VINFO_GATHER_SCATTER_P (stmtinfo_b))
f955c4c4 3003 break;
5abe1e05
RB
3004
3005 /* ??? Imperfect sorting (non-compatible types, non-modulo
3006 accesses, same accesses) can lead to a group being artificially
3007 split here as we don't just skip over those. If it really
3008 matters we can push those to a worklist and re-iterate
3009 over them. Then we can just skip ahead to the next DR here. */
3010
8349b024
RB
3011 /* DRs in a different loop should not be put into the same
3012 interleaving group. */
3013 if (gimple_bb (DR_STMT (dra))->loop_father
3014 != gimple_bb (DR_STMT (drb))->loop_father)
3015 break;
3016
5abe1e05 3017 /* Check that the data-refs have same first location (except init)
61331c48
JJ
3018 and they are both either store or load (not load and store,
3019 not masked loads or stores). */
5abe1e05 3020 if (DR_IS_READ (dra) != DR_IS_READ (drb)
d20eac1b
RB
3021 || data_ref_compare_tree (DR_BASE_ADDRESS (dra),
3022 DR_BASE_ADDRESS (drb)) != 0
3023 || data_ref_compare_tree (DR_OFFSET (dra), DR_OFFSET (drb)) != 0
99763671 3024 || !can_group_stmts_p (stmtinfo_a, stmtinfo_b, true))
5abe1e05
RB
3025 break;
3026
7b5fc413 3027 /* Check that the data-refs have the same constant size. */
5abe1e05
RB
3028 tree sza = TYPE_SIZE_UNIT (TREE_TYPE (DR_REF (dra)));
3029 tree szb = TYPE_SIZE_UNIT (TREE_TYPE (DR_REF (drb)));
cc269bb6
RS
3030 if (!tree_fits_uhwi_p (sza)
3031 || !tree_fits_uhwi_p (szb)
7b5fc413
RB
3032 || !tree_int_cst_equal (sza, szb))
3033 break;
3034
3035 /* Check that the data-refs have the same step. */
d20eac1b 3036 if (data_ref_compare_tree (DR_STEP (dra), DR_STEP (drb)) != 0)
5abe1e05
RB
3037 break;
3038
5abe1e05
RB
3039 /* Check the types are compatible.
3040 ??? We don't distinguish this during sorting. */
3041 if (!types_compatible_p (TREE_TYPE (DR_REF (dra)),
3042 TREE_TYPE (DR_REF (drb))))
3043 break;
3044
c0a46545
RS
3045 /* Check that the DR_INITs are compile-time constants. */
3046 if (TREE_CODE (DR_INIT (dra)) != INTEGER_CST
3047 || TREE_CODE (DR_INIT (drb)) != INTEGER_CST)
3048 break;
3049
0356aab8
JJ
3050 /* Different .GOMP_SIMD_LANE calls still give the same lane,
3051 just hold extra information. */
3052 if (STMT_VINFO_SIMD_LANE_ACCESS_P (stmtinfo_a)
3053 && STMT_VINFO_SIMD_LANE_ACCESS_P (stmtinfo_b)
3054 && data_ref_compare_tree (DR_INIT (dra), DR_INIT (drb)) == 0)
3055 break;
3056
5abe1e05
RB
3057 /* Sorting has ensured that DR_INIT (dra) <= DR_INIT (drb). */
3058 HOST_WIDE_INT init_a = TREE_INT_CST_LOW (DR_INIT (dra));
3059 HOST_WIDE_INT init_b = TREE_INT_CST_LOW (DR_INIT (drb));
fb607032
RB
3060 HOST_WIDE_INT init_prev
3061 = TREE_INT_CST_LOW (DR_INIT (datarefs_copy[i-1]));
3062 gcc_assert (init_a <= init_b
3063 && init_a <= init_prev
3064 && init_prev <= init_b);
3065
3066 /* Do not place the same access in the interleaving chain twice. */
3067 if (init_b == init_prev)
3068 {
3069 gcc_assert (gimple_uid (DR_STMT (datarefs_copy[i-1]))
3070 < gimple_uid (DR_STMT (drb)));
be43a887 3071 /* Simply link in duplicates and fix up the chain below. */
fb607032 3072 }
be43a887 3073 else
7b5fc413 3074 {
be43a887
RB
3075 /* If init_b == init_a + the size of the type * k, we have an
3076 interleaving, and DRA is accessed before DRB. */
3077 HOST_WIDE_INT type_size_a = tree_to_uhwi (sza);
3078 if (type_size_a == 0
3079 || (init_b - init_a) % type_size_a != 0)
7b5fc413 3080 break;
be43a887
RB
3081
3082 /* If we have a store, the accesses are adjacent. This splits
3083 groups into chunks we support (we don't support vectorization
3084 of stores with gaps). */
3085 if (!DR_IS_READ (dra) && init_b - init_prev != type_size_a)
3086 break;
3087
3088 /* If the step (when constant and nonzero) is greater than the
3089 difference between the data-refs' inits, this splits groups into
3090 suitable sizes. */
3091 if (tree_fits_shwi_p (DR_STEP (dra)))
3092 {
3093 HOST_WIDE_INT step = tree_to_shwi (DR_STEP (dra));
3094 if (step != 0 && step <= (init_b - init_a))
3095 break;
3096 }
7b5fc413 3097 }
5abe1e05
RB
3098
3099 if (dump_enabled_p ())
3c2a8ed0
DM
3100 dump_printf_loc (MSG_NOTE, vect_location,
3101 DR_IS_READ (dra)
3102 ? "Detected interleaving load %T and %T\n"
3103 : "Detected interleaving store %T and %T\n",
3104 DR_REF (dra), DR_REF (drb));
5abe1e05
RB
3105
3106 /* Link the found element into the group list. */
2c53b149 3107 if (!DR_GROUP_FIRST_ELEMENT (stmtinfo_a))
5abe1e05 3108 {
91987857 3109 DR_GROUP_FIRST_ELEMENT (stmtinfo_a) = stmtinfo_a;
5abe1e05
RB
3110 lastinfo = stmtinfo_a;
3111 }
91987857
RS
3112 DR_GROUP_FIRST_ELEMENT (stmtinfo_b) = stmtinfo_a;
3113 DR_GROUP_NEXT_ELEMENT (lastinfo) = stmtinfo_b;
5abe1e05 3114 lastinfo = stmtinfo_b;
be43a887 3115
99763671
AM
3116 STMT_VINFO_SLP_VECT_ONLY (stmtinfo_a)
3117 = !can_group_stmts_p (stmtinfo_a, stmtinfo_b, false);
3118
3119 if (dump_enabled_p () && STMT_VINFO_SLP_VECT_ONLY (stmtinfo_a))
3120 dump_printf_loc (MSG_NOTE, vect_location,
3121 "Load suitable for SLP vectorization only.\n");
3122
be43a887
RB
3123 if (init_b == init_prev
3124 && !to_fixup.add (DR_GROUP_FIRST_ELEMENT (stmtinfo_a))
3125 && dump_enabled_p ())
3126 dump_printf_loc (MSG_NOTE, vect_location,
3127 "Queuing group with duplicate access for fixup\n");
5abe1e05
RB
3128 }
3129 }
3130
be43a887
RB
3131 /* Fix up groups with duplicate entries by splitting them. */
3132 while (1)
3133 {
3134 hash_set<stmt_vec_info>::iterator it = to_fixup.begin ();
3135 if (!(it != to_fixup.end ()))
3136 break;
3137 stmt_vec_info grp = *it;
3138 to_fixup.remove (grp);
3139
3140 /* Find the earliest duplicate group member. */
3141 unsigned first_duplicate = -1u;
3142 stmt_vec_info next, g = grp;
3143 while ((next = DR_GROUP_NEXT_ELEMENT (g)))
3144 {
f95b7597
RB
3145 if (tree_int_cst_equal (DR_INIT (STMT_VINFO_DR_INFO (next)->dr),
3146 DR_INIT (STMT_VINFO_DR_INFO (g)->dr))
be43a887
RB
3147 && gimple_uid (STMT_VINFO_STMT (next)) < first_duplicate)
3148 first_duplicate = gimple_uid (STMT_VINFO_STMT (next));
3149 g = next;
3150 }
3151 if (first_duplicate == -1U)
3152 continue;
3153
3154 /* Then move all stmts after the first duplicate to a new group.
3155 Note this is a heuristic but one with the property that *it
3156 is fixed up completely. */
3157 g = grp;
303d8f77 3158 stmt_vec_info newgroup = NULL, ng = grp;
be43a887
RB
3159 while ((next = DR_GROUP_NEXT_ELEMENT (g)))
3160 {
3161 if (gimple_uid (STMT_VINFO_STMT (next)) >= first_duplicate)
3162 {
3163 DR_GROUP_NEXT_ELEMENT (g) = DR_GROUP_NEXT_ELEMENT (next);
3164 if (!newgroup)
3165 newgroup = next;
3166 else
3167 DR_GROUP_NEXT_ELEMENT (ng) = next;
3168 ng = next;
3169 DR_GROUP_FIRST_ELEMENT (ng) = newgroup;
3170 }
3171 else
3172 g = DR_GROUP_NEXT_ELEMENT (g);
3173 }
3174 DR_GROUP_NEXT_ELEMENT (ng) = NULL;
3175
3176 /* Fixup the new group which still may contain duplicates. */
3177 to_fixup.add (newgroup);
3178 }
3179
3d54b29d 3180 FOR_EACH_VEC_ELT (datarefs_copy, i, dr)
89fa689a 3181 {
f5ae2856 3182 dr_vec_info *dr_info = vinfo->lookup_dr (dr);
89fa689a 3183 if (STMT_VINFO_VECTORIZABLE (dr_info->stmt)
308bc496 3184 && !vect_analyze_data_ref_access (vinfo, dr_info))
89fa689a
RS
3185 {
3186 if (dump_enabled_p ())
3187 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
3188 "not vectorized: complicated access pattern.\n");
4b5caab7 3189
89fa689a
RS
3190 if (is_a <bb_vec_info> (vinfo))
3191 {
3192 /* Mark the statement as not vectorizable. */
3193 STMT_VINFO_VECTORIZABLE (dr_info->stmt) = false;
3194 continue;
3195 }
3196 else
3197 {
3198 datarefs_copy.release ();
f4ebbd24
DM
3199 return opt_result::failure_at (dr_info->stmt->stmt,
3200 "not vectorized:"
3201 " complicated access pattern.\n");
89fa689a
RS
3202 }
3203 }
3204 }
ebfd146a 3205
3d54b29d 3206 datarefs_copy.release ();
f4ebbd24 3207 return opt_result::success ();
ebfd146a
IR
3208}
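/* Illustrative sketch, not part of the original source: assuming 4-byte
   ints, the loop

     for (i = 0; i < n; ++i)
       {
         x = in[2*i] + in[2*i+1];
         out[2*i] = x;
         out[2*i+1] = x;
       }

   gives each pair the same base, offset, size and step with DR_INITs 0 and
   4, so the walk above links in[2*i]/in[2*i+1] and out[2*i]/out[2*i+1] into
   two interleaving chains; a pair of stores whose inits were not adjacent
   would instead have been split at the init_b - init_prev != type_size_a
   check, since store groups with gaps are not supported.  */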
3209
a05a89fa
CH
3210/* Function vect_vfa_segment_size.
3211
a05a89fa 3212 Input:
89fa689a 3213 DR_INFO: The data reference.
a05a89fa
CH
3214 LENGTH_FACTOR: segment length to consider.
3215
a57776a1
RS
3216 Return a value suitable for the dr_with_seg_len::seg_len field.
3217 This is the "distance travelled" by the pointer from the first
3218 iteration in the segment to the last. Note that it does not include
3219 the size of the access; in effect it only describes the first byte. */
a05a89fa
CH
3220
3221static tree
89fa689a 3222vect_vfa_segment_size (dr_vec_info *dr_info, tree length_factor)
a05a89fa 3223{
a57776a1
RS
3224 length_factor = size_binop (MINUS_EXPR,
3225 fold_convert (sizetype, length_factor),
3226 size_one_node);
89fa689a 3227 return size_binop (MULT_EXPR, fold_convert (sizetype, DR_STEP (dr_info->dr)),
a57776a1
RS
3228 length_factor);
3229}
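/* Illustrative sketch, not part of the original source: for a data-ref with
   DR_STEP 4 and a LENGTH_FACTOR of 8 (say, the vectorization factor), the
   segment length is (8 - 1) * 4 = 28 bytes - the distance the pointer moves
   from the first to the last iteration of the segment, deliberately
   excluding the size of the final access itself.  */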
a05a89fa 3230
89fa689a 3231/* Return a value that, when added to abs (vect_vfa_segment_size (DR_INFO)),
a57776a1 3232 gives the worst-case number of bytes covered by the segment. */
a05a89fa 3233
a57776a1 3234static unsigned HOST_WIDE_INT
308bc496 3235vect_vfa_access_size (vec_info *vinfo, dr_vec_info *dr_info)
a57776a1 3236{
89fa689a
RS
3237 stmt_vec_info stmt_vinfo = dr_info->stmt;
3238 tree ref_type = TREE_TYPE (DR_REF (dr_info->dr));
a57776a1
RS
3239 unsigned HOST_WIDE_INT ref_size = tree_to_uhwi (TYPE_SIZE_UNIT (ref_type));
3240 unsigned HOST_WIDE_INT access_size = ref_size;
2c53b149 3241 if (DR_GROUP_FIRST_ELEMENT (stmt_vinfo))
a05a89fa 3242 {
89fa689a 3243 gcc_assert (DR_GROUP_FIRST_ELEMENT (stmt_vinfo) == stmt_vinfo);
2c53b149 3244 access_size *= DR_GROUP_SIZE (stmt_vinfo) - DR_GROUP_GAP (stmt_vinfo);
a57776a1 3245 }
b05d5563 3246 if (STMT_VINFO_VEC_STMTS (stmt_vinfo).exists ()
308bc496 3247 && (vect_supportable_dr_alignment (vinfo, dr_info, false)
a57776a1
RS
3248 == dr_explicit_realign_optimized))
3249 {
3250 /* We might access a full vector's worth. */
3251 tree vectype = STMT_VINFO_VECTYPE (stmt_vinfo);
3252 access_size += tree_to_uhwi (TYPE_SIZE_UNIT (vectype)) - ref_size;
a05a89fa 3253 }
a57776a1
RS
3254 return access_size;
3255}
3256
89fa689a
RS
3257/* Get the minimum alignment for all the scalar accesses that DR_INFO
3258 describes. */
a57776a1
RS
3259
3260static unsigned int
89fa689a 3261vect_vfa_align (dr_vec_info *dr_info)
a57776a1 3262{
89fa689a 3263 return TYPE_ALIGN_UNIT (TREE_TYPE (DR_REF (dr_info->dr)));
a05a89fa
CH
3264}
3265
6fa3d4b4
BC
3266/* Function vect_no_alias_p.
3267
b064d4f9
RS
3268 Given data references A and B with equal base and offset, see whether
3269 the alias relation can be decided at compilation time. Return 1 if
3270 it can and the references alias, 0 if it can and the references do
a57776a1
RS
3271 not alias, and -1 if we cannot decide at compile time. SEGMENT_LENGTH_A,
3272 SEGMENT_LENGTH_B, ACCESS_SIZE_A and ACCESS_SIZE_B are the equivalent
3273 of dr_with_seg_len::{seg_len,access_size} for A and B. */
6fa3d4b4 3274
b064d4f9 3275static int
89fa689a 3276vect_compile_time_alias (dr_vec_info *a, dr_vec_info *b,
a57776a1
RS
3277 tree segment_length_a, tree segment_length_b,
3278 unsigned HOST_WIDE_INT access_size_a,
3279 unsigned HOST_WIDE_INT access_size_b)
6fa3d4b4 3280{
89fa689a
RS
3281 poly_offset_int offset_a = wi::to_poly_offset (DR_INIT (a->dr));
3282 poly_offset_int offset_b = wi::to_poly_offset (DR_INIT (b->dr));
b064d4f9
RS
3283 poly_uint64 const_length_a;
3284 poly_uint64 const_length_b;
6fa3d4b4 3285
6fa3d4b4
BC
3286 /* For negative step, we need to adjust address range by TYPE_SIZE_UNIT
3287 bytes, e.g., int a[3] -> a[1] range is [a+4, a+16) instead of
3288 [a, a+12) */
89fa689a 3289 if (tree_int_cst_compare (DR_STEP (a->dr), size_zero_node) < 0)
6fa3d4b4 3290 {
b064d4f9 3291 const_length_a = (-wi::to_poly_wide (segment_length_a)).force_uhwi ();
f91aa3e6 3292 offset_a -= const_length_a;
6fa3d4b4 3293 }
b064d4f9
RS
3294 else
3295 const_length_a = tree_to_poly_uint64 (segment_length_a);
89fa689a 3296 if (tree_int_cst_compare (DR_STEP (b->dr), size_zero_node) < 0)
6fa3d4b4 3297 {
b064d4f9 3298 const_length_b = (-wi::to_poly_wide (segment_length_b)).force_uhwi ();
f91aa3e6 3299 offset_b -= const_length_b;
6fa3d4b4 3300 }
b064d4f9
RS
3301 else
3302 const_length_b = tree_to_poly_uint64 (segment_length_b);
6fa3d4b4 3303
a57776a1
RS
3304 const_length_a += access_size_a;
3305 const_length_b += access_size_b;
3306
b064d4f9
RS
3307 if (ranges_known_overlap_p (offset_a, const_length_a,
3308 offset_b, const_length_b))
3309 return 1;
6fa3d4b4 3310
b064d4f9
RS
3311 if (!ranges_maybe_overlap_p (offset_a, const_length_a,
3312 offset_b, const_length_b))
3313 return 0;
3314
3315 return -1;
6fa3d4b4
BC
3316}
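/* Illustrative sketch, not part of the original source: with equal bases
   and offsets, DR_INITs 0 and 32, segment lengths of 16 bytes and access
   sizes of 4 bytes, the covered ranges are [0, 20) and [32, 52); they
   cannot overlap, so the function returns 0 and no runtime check is needed.
   Had the second DR_INIT been 8, the ranges [0, 20) and [8, 28) would be
   known to overlap and the result would be 1.  */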
3317
dfbddbeb
RS
3318/* Return true if the minimum nonzero dependence distance for loop LOOP_DEPTH
3319 in DDR is >= VF. */
3320
3321static bool
3322dependence_distance_ge_vf (data_dependence_relation *ddr,
d9f21f6a 3323 unsigned int loop_depth, poly_uint64 vf)
dfbddbeb
RS
3324{
3325 if (DDR_ARE_DEPENDENT (ddr) != NULL_TREE
3326 || DDR_NUM_DIST_VECTS (ddr) == 0)
3327 return false;
3328
3329 /* If the dependence is exact, we should have limited the VF instead. */
3330 gcc_checking_assert (DDR_COULD_BE_INDEPENDENT_P (ddr));
3331
3332 unsigned int i;
3333 lambda_vector dist_v;
3334 FOR_EACH_VEC_ELT (DDR_DIST_VECTS (ddr), i, dist_v)
3335 {
3336 HOST_WIDE_INT dist = dist_v[loop_depth];
3337 if (dist != 0
3338 && !(dist > 0 && DDR_REVERSED_P (ddr))
d9f21f6a 3339 && maybe_lt ((unsigned HOST_WIDE_INT) abs_hwi (dist), vf))
dfbddbeb
RS
3340 return false;
3341 }
3342
3343 if (dump_enabled_p ())
3c2a8ed0
DM
3344 dump_printf_loc (MSG_NOTE, vect_location,
3345 "dependence distance between %T and %T is >= VF\n",
3346 DR_REF (DDR_A (ddr)), DR_REF (DDR_B (ddr)));
dfbddbeb
RS
3347
3348 return true;
3349}
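/* Illustrative sketch, not part of the original source: if the chosen VF is
   8 and the only nonzero dependence distance at LOOP_DEPTH is 16, the
   function returns true and the caller drops the corresponding runtime
   alias check; a distance of 3 would fail the maybe_lt test above and the
   check would be kept.  */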
3350
a57776a1
RS
3351/* Dump LOWER_BOUND using flags DUMP_KIND. Dumps are known to be enabled. */
3352
3353static void
4af78ef8 3354dump_lower_bound (dump_flags_t dump_kind, const vec_lower_bound &lower_bound)
a57776a1 3355{
3c2a8ed0
DM
3356 dump_printf (dump_kind, "%s (%T) >= ",
3357 lower_bound.unsigned_p ? "unsigned" : "abs",
3358 lower_bound.expr);
a57776a1
RS
3359 dump_dec (dump_kind, lower_bound.min_value);
3360}
3361
3362/* Record that the vectorized loop requires the vec_lower_bound described
3363 by EXPR, UNSIGNED_P and MIN_VALUE. */
3364
3365static void
3366vect_check_lower_bound (loop_vec_info loop_vinfo, tree expr, bool unsigned_p,
3367 poly_uint64 min_value)
3368{
3369 vec<vec_lower_bound> lower_bounds = LOOP_VINFO_LOWER_BOUNDS (loop_vinfo);
3370 for (unsigned int i = 0; i < lower_bounds.length (); ++i)
3371 if (operand_equal_p (lower_bounds[i].expr, expr, 0))
3372 {
3373 unsigned_p &= lower_bounds[i].unsigned_p;
3374 min_value = upper_bound (lower_bounds[i].min_value, min_value);
3375 if (lower_bounds[i].unsigned_p != unsigned_p
3376 || maybe_lt (lower_bounds[i].min_value, min_value))
3377 {
3378 lower_bounds[i].unsigned_p = unsigned_p;
3379 lower_bounds[i].min_value = min_value;
3380 if (dump_enabled_p ())
3381 {
3382 dump_printf_loc (MSG_NOTE, vect_location,
3383 "updating run-time check to ");
3384 dump_lower_bound (MSG_NOTE, lower_bounds[i]);
3385 dump_printf (MSG_NOTE, "\n");
3386 }
3387 }
3388 return;
3389 }
3390
3391 vec_lower_bound lower_bound (expr, unsigned_p, min_value);
3392 if (dump_enabled_p ())
3393 {
3394 dump_printf_loc (MSG_NOTE, vect_location, "need a run-time check that ");
3395 dump_lower_bound (MSG_NOTE, lower_bound);
3396 dump_printf (MSG_NOTE, "\n");
3397 }
3398 LOOP_VINFO_LOWER_BOUNDS (loop_vinfo).safe_push (lower_bound);
3399}
3400
89fa689a 3401/* Return true if it's unlikely that the step of the vectorized form of DR_INFO
a57776a1
RS
3402 will span fewer than GAP bytes. */
3403
3404static bool
89fa689a
RS
3405vect_small_gap_p (loop_vec_info loop_vinfo, dr_vec_info *dr_info,
3406 poly_int64 gap)
a57776a1 3407{
89fa689a 3408 stmt_vec_info stmt_info = dr_info->stmt;
a57776a1
RS
3409 HOST_WIDE_INT count
3410 = estimated_poly_value (LOOP_VINFO_VECT_FACTOR (loop_vinfo));
2c53b149 3411 if (DR_GROUP_FIRST_ELEMENT (stmt_info))
bffb8014 3412 count *= DR_GROUP_SIZE (DR_GROUP_FIRST_ELEMENT (stmt_info));
89fa689a
RS
3413 return (estimated_poly_value (gap)
3414 <= count * vect_get_scalar_dr_size (dr_info));
a57776a1
RS
3415}
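/* Illustrative sketch, not part of the original source: with an estimated
   VF of 4 and a group of 2 scalar accesses of 4 bytes each, the vectorized
   step is expected to span about 4 * 2 * 4 = 32 bytes per iteration, so any
   GAP of at most 32 bytes counts as "small" and the caller prefers the
   cheaper step lower-bound test over a full segment-overlap check.  */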
3416
89fa689a
RS
3417/* Return true if we know that there is no alias between DR_INFO_A and
3418 DR_INFO_B when abs (DR_STEP (DR_INFO_A->dr)) >= N for some N.
3419 When returning true, set *LOWER_BOUND_OUT to this N. */
a57776a1
RS
3420
3421static bool
89fa689a 3422vectorizable_with_step_bound_p (dr_vec_info *dr_info_a, dr_vec_info *dr_info_b,
a57776a1
RS
3423 poly_uint64 *lower_bound_out)
3424{
3425 /* Check that there is a constant gap of known sign between DR_A
3426 and DR_B. */
89fa689a
RS
3427 data_reference *dr_a = dr_info_a->dr;
3428 data_reference *dr_b = dr_info_b->dr;
a57776a1
RS
3429 poly_int64 init_a, init_b;
3430 if (!operand_equal_p (DR_BASE_ADDRESS (dr_a), DR_BASE_ADDRESS (dr_b), 0)
3431 || !operand_equal_p (DR_OFFSET (dr_a), DR_OFFSET (dr_b), 0)
3432 || !operand_equal_p (DR_STEP (dr_a), DR_STEP (dr_b), 0)
3433 || !poly_int_tree_p (DR_INIT (dr_a), &init_a)
3434 || !poly_int_tree_p (DR_INIT (dr_b), &init_b)
3435 || !ordered_p (init_a, init_b))
3436 return false;
3437
3438 /* Sort DR_A and DR_B by the address they access. */
3439 if (maybe_lt (init_b, init_a))
3440 {
3441 std::swap (init_a, init_b);
89fa689a 3442 std::swap (dr_info_a, dr_info_b);
a57776a1
RS
3443 std::swap (dr_a, dr_b);
3444 }
3445
3446 /* If the two accesses could be dependent within a scalar iteration,
3447 make sure that we'd retain their order. */
89fa689a
RS
3448 if (maybe_gt (init_a + vect_get_scalar_dr_size (dr_info_a), init_b)
3449 && !vect_preserves_scalar_order_p (dr_info_a, dr_info_b))
a57776a1
RS
3450 return false;
3451
3452 /* There is no alias if abs (DR_STEP) is greater than or equal to
3453 the bytes spanned by the combination of the two accesses. */
89fa689a 3454 *lower_bound_out = init_b + vect_get_scalar_dr_size (dr_info_b) - init_a;
a57776a1
RS
3455 return true;
3456}
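/* Illustrative sketch, not part of the original source: for two accesses
   with identical base, offset and step, DR_INITs 0 and 4 and a 4-byte
   scalar size for the second access, *LOWER_BOUND_OUT becomes 4 + 4 - 0 = 8:
   once abs (DR_STEP) is at least 8, the pair of per-iteration accesses fits
   inside a single step, so accesses from different iterations cannot
   overlap.  */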
3457
ebfd146a
IR
3458/* Function vect_prune_runtime_alias_test_list.
3459
3460 Prune a list of ddrs to be tested at run-time by versioning for alias.
a05a89fa 3461 Merge several alias checks into one if possible.
ebfd146a
IR
3462 Return FALSE if the resulting list of ddrs is longer than allowed by
3463 PARAM_VECT_MAX_VERSION_FOR_ALIAS_CHECKS, otherwise return TRUE. */
3464
f4ebbd24 3465opt_result
ebfd146a
IR
3466vect_prune_runtime_alias_test_list (loop_vec_info loop_vinfo)
3467{
9adee305
RS
3468 typedef pair_hash <tree_operand_hash, tree_operand_hash> tree_pair_hash;
3469 hash_set <tree_pair_hash> compared_objects;
3470
3471 vec<ddr_p> may_alias_ddrs = LOOP_VINFO_MAY_ALIAS_DDRS (loop_vinfo);
3472 vec<dr_with_seg_len_pair_t> &comp_alias_ddrs
3473 = LOOP_VINFO_COMP_ALIAS_DDRS (loop_vinfo);
3474 vec<vec_object_pair> &check_unequal_addrs
3475 = LOOP_VINFO_CHECK_UNEQUAL_ADDRS (loop_vinfo);
d9f21f6a 3476 poly_uint64 vect_factor = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
a05a89fa
CH
3477 tree scalar_loop_iters = LOOP_VINFO_NITERS (loop_vinfo);
3478
3479 ddr_p ddr;
3480 unsigned int i;
3481 tree length_factor;
ebfd146a 3482
adac3a68 3483 DUMP_VECT_SCOPE ("vect_prune_runtime_alias_test_list");
ebfd146a 3484
a57776a1
RS
3485 /* Step values are irrelevant for aliasing if the number of vector
3486 iterations is equal to the number of scalar iterations (which can
3487 happen for fully-SLP loops). */
3488 bool ignore_step_p = known_eq (LOOP_VINFO_VECT_FACTOR (loop_vinfo), 1U);
3489
3490 if (!ignore_step_p)
3491 {
3492 /* Convert the checks for nonzero steps into bound tests. */
3493 tree value;
3494 FOR_EACH_VEC_ELT (LOOP_VINFO_CHECK_NONZERO (loop_vinfo), i, value)
3495 vect_check_lower_bound (loop_vinfo, value, true, 1);
3496 }
3497
a05a89fa 3498 if (may_alias_ddrs.is_empty ())
f4ebbd24 3499 return opt_result::success ();
a05a89fa 3500
a05a89fa
CH
3501 comp_alias_ddrs.create (may_alias_ddrs.length ());
3502
dfbddbeb
RS
3503 unsigned int loop_depth
3504 = index_in_loop_nest (LOOP_VINFO_LOOP (loop_vinfo)->num,
3505 LOOP_VINFO_LOOP_NEST (loop_vinfo));
3506
a05a89fa
CH
3507 /* First, we collect all data ref pairs for aliasing checks. */
3508 FOR_EACH_VEC_ELT (may_alias_ddrs, i, ddr)
ebfd146a 3509 {
a57776a1 3510 poly_uint64 lower_bound;
a05a89fa 3511 tree segment_length_a, segment_length_b;
a57776a1
RS
3512 unsigned HOST_WIDE_INT access_size_a, access_size_b;
3513 unsigned int align_a, align_b;
a05a89fa 3514
dfbddbeb
RS
3515 /* Ignore the alias if the VF we chose ended up being no greater
3516 than the dependence distance. */
3517 if (dependence_distance_ge_vf (ddr, loop_depth, vect_factor))
3518 continue;
3519
9adee305
RS
3520 if (DDR_OBJECT_A (ddr))
3521 {
3522 vec_object_pair new_pair (DDR_OBJECT_A (ddr), DDR_OBJECT_B (ddr));
3523 if (!compared_objects.add (new_pair))
3524 {
3525 if (dump_enabled_p ())
3c2a8ed0
DM
3526 dump_printf_loc (MSG_NOTE, vect_location,
3527 "checking that %T and %T"
3528 " have different addresses\n",
3529 new_pair.first, new_pair.second);
9adee305
RS
3530 LOOP_VINFO_CHECK_UNEQUAL_ADDRS (loop_vinfo).safe_push (new_pair);
3531 }
3532 continue;
3533 }
3534
f5ae2856 3535 dr_vec_info *dr_info_a = loop_vinfo->lookup_dr (DDR_A (ddr));
89fa689a 3536 stmt_vec_info stmt_info_a = dr_info_a->stmt;
a57776a1 3537
f5ae2856 3538 dr_vec_info *dr_info_b = loop_vinfo->lookup_dr (DDR_B (ddr));
89fa689a 3539 stmt_vec_info stmt_info_b = dr_info_b->stmt;
a57776a1 3540
e9acf80c
RS
3541 bool preserves_scalar_order_p
3542 = vect_preserves_scalar_order_p (dr_info_a, dr_info_b);
3543
a57776a1
RS
3544 /* Skip the pair if inter-iteration dependencies are irrelevant
3545 and intra-iteration dependencies are guaranteed to be honored. */
3546 if (ignore_step_p
e9acf80c 3547 && (preserves_scalar_order_p
89fa689a
RS
3548 || vectorizable_with_step_bound_p (dr_info_a, dr_info_b,
3549 &lower_bound)))
a57776a1
RS
3550 {
3551 if (dump_enabled_p ())
3c2a8ed0
DM
3552 dump_printf_loc (MSG_NOTE, vect_location,
3553 "no need for alias check between "
3554 "%T and %T when VF is 1\n",
3555 DR_REF (dr_info_a->dr), DR_REF (dr_info_b->dr));
a57776a1
RS
3556 continue;
3557 }
3558
3559 /* See whether we can handle the alias using a bounds check on
3560 the step, and whether that's likely to be the best approach.
3561 (It might not be, for example, if the minimum step is much larger
3562 than the number of bytes handled by one vector iteration.) */
3563 if (!ignore_step_p
89fa689a
RS
3564 && TREE_CODE (DR_STEP (dr_info_a->dr)) != INTEGER_CST
3565 && vectorizable_with_step_bound_p (dr_info_a, dr_info_b,
3566 &lower_bound)
3567 && (vect_small_gap_p (loop_vinfo, dr_info_a, lower_bound)
3568 || vect_small_gap_p (loop_vinfo, dr_info_b, lower_bound)))
a57776a1 3569 {
89fa689a 3570 bool unsigned_p = dr_known_forward_stride_p (dr_info_a->dr);
a57776a1
RS
3571 if (dump_enabled_p ())
3572 {
3c2a8ed0
DM
3573 dump_printf_loc (MSG_NOTE, vect_location, "no alias between "
3574 "%T and %T when the step %T is outside ",
3575 DR_REF (dr_info_a->dr),
3576 DR_REF (dr_info_b->dr),
3577 DR_STEP (dr_info_a->dr));
a57776a1
RS
3578 if (unsigned_p)
3579 dump_printf (MSG_NOTE, "[0");
3580 else
3581 {
3582 dump_printf (MSG_NOTE, "(");
3583 dump_dec (MSG_NOTE, poly_int64 (-lower_bound));
3584 }
3585 dump_printf (MSG_NOTE, ", ");
3586 dump_dec (MSG_NOTE, lower_bound);
3587 dump_printf (MSG_NOTE, ")\n");
3588 }
89fa689a
RS
3589 vect_check_lower_bound (loop_vinfo, DR_STEP (dr_info_a->dr),
3590 unsigned_p, lower_bound);
a57776a1
RS
3591 continue;
3592 }
3593
bffb8014 3594 stmt_vec_info dr_group_first_a = DR_GROUP_FIRST_ELEMENT (stmt_info_a);
a05a89fa
CH
3595 if (dr_group_first_a)
3596 {
bffb8014 3597 stmt_info_a = dr_group_first_a;
89fa689a 3598 dr_info_a = STMT_VINFO_DR_INFO (stmt_info_a);
a05a89fa 3599 }
ebfd146a 3600
bffb8014 3601 stmt_vec_info dr_group_first_b = DR_GROUP_FIRST_ELEMENT (stmt_info_b);
a05a89fa
CH
3602 if (dr_group_first_b)
3603 {
bffb8014 3604 stmt_info_b = dr_group_first_b;
89fa689a 3605 dr_info_b = STMT_VINFO_DR_INFO (stmt_info_b);
a05a89fa 3606 }
ebfd146a 3607
a57776a1
RS
3608 if (ignore_step_p)
3609 {
3610 segment_length_a = size_zero_node;
3611 segment_length_b = size_zero_node;
3612 }
a05a89fa 3613 else
a57776a1 3614 {
89fa689a
RS
3615 if (!operand_equal_p (DR_STEP (dr_info_a->dr),
3616 DR_STEP (dr_info_b->dr), 0))
a57776a1
RS
3617 length_factor = scalar_loop_iters;
3618 else
3619 length_factor = size_int (vect_factor);
89fa689a
RS
3620 segment_length_a = vect_vfa_segment_size (dr_info_a, length_factor);
3621 segment_length_b = vect_vfa_segment_size (dr_info_b, length_factor);
a57776a1 3622 }
308bc496
RB
3623 access_size_a = vect_vfa_access_size (loop_vinfo, dr_info_a);
3624 access_size_b = vect_vfa_access_size (loop_vinfo, dr_info_b);
89fa689a
RS
3625 align_a = vect_vfa_align (dr_info_a);
3626 align_b = vect_vfa_align (dr_info_b);
a05a89fa 3627
b064d4f9 3628 /* See whether the alias is known at compilation time. */
1fb2b0f6
RS
3629 if (operand_equal_p (DR_BASE_ADDRESS (dr_info_a->dr),
3630 DR_BASE_ADDRESS (dr_info_b->dr), 0)
3631 && operand_equal_p (DR_OFFSET (dr_info_a->dr),
3632 DR_OFFSET (dr_info_b->dr), 0)
89fa689a
RS
3633 && TREE_CODE (DR_STEP (dr_info_a->dr)) == INTEGER_CST
3634 && TREE_CODE (DR_STEP (dr_info_b->dr)) == INTEGER_CST
b064d4f9
RS
3635 && poly_int_tree_p (segment_length_a)
3636 && poly_int_tree_p (segment_length_b))
6fa3d4b4 3637 {
89fa689a 3638 int res = vect_compile_time_alias (dr_info_a, dr_info_b,
b064d4f9 3639 segment_length_a,
a57776a1
RS
3640 segment_length_b,
3641 access_size_a,
3642 access_size_b);
3643 if (res >= 0 && dump_enabled_p ())
3644 {
3645 dump_printf_loc (MSG_NOTE, vect_location,
3c2a8ed0
DM
3646 "can tell at compile time that %T and %T",
3647 DR_REF (dr_info_a->dr), DR_REF (dr_info_b->dr));
a57776a1
RS
3648 if (res == 0)
3649 dump_printf (MSG_NOTE, " do not alias\n");
3650 else
3651 dump_printf (MSG_NOTE, " alias\n");
3652 }
3653
b064d4f9 3654 if (res == 0)
6fa3d4b4
BC
3655 continue;
3656
b064d4f9 3657 if (res == 1)
f4ebbd24
DM
3658 return opt_result::failure_at (stmt_info_b->stmt,
3659 "not vectorized:"
3660 " compilation time alias: %G%G",
3661 stmt_info_a->stmt,
3662 stmt_info_b->stmt);
6fa3d4b4
BC
3663 }
3664
e9acf80c
RS
3665 dr_with_seg_len dr_a (dr_info_a->dr, segment_length_a,
3666 access_size_a, align_a);
3667 dr_with_seg_len dr_b (dr_info_b->dr, segment_length_b,
3668 access_size_b, align_b);
3669 /* Canonicalize the order to be the one that's needed for accurate
3670 RAW, WAR and WAW flags, in cases where the data references are
3671 well-ordered. The order doesn't really matter otherwise,
3672 but we might as well be consistent. */
3673 if (get_later_stmt (stmt_info_a, stmt_info_b) == stmt_info_a)
3674 std::swap (dr_a, dr_b);
3675
93bdc3ed 3676 dr_with_seg_len_pair_t dr_with_seg_len_pair
e9acf80c
RS
3677 (dr_a, dr_b, (preserves_scalar_order_p
3678 ? dr_with_seg_len_pair_t::WELL_ORDERED
3679 : dr_with_seg_len_pair_t::REORDERED));
93bdc3ed 3680
a05a89fa
CH
3681 comp_alias_ddrs.safe_push (dr_with_seg_len_pair);
3682 }
3683
d9f21f6a 3684 prune_runtime_alias_test_list (&comp_alias_ddrs, vect_factor);
9adee305
RS
3685
3686 unsigned int count = (comp_alias_ddrs.length ()
3687 + check_unequal_addrs.length ());
a57776a1 3688
bbeeac91
DM
3689 if (dump_enabled_p ())
3690 dump_printf_loc (MSG_NOTE, vect_location,
3691 "improved number of alias checks from %d to %d\n",
3692 may_alias_ddrs.length (), count);
028d4092 3693 unsigned limit = param_vect_max_version_for_alias_checks;
247afa98 3694 if (flag_simd_cost_model == VECT_COST_MODEL_CHEAP)
028d4092 3695 limit = param_vect_max_version_for_alias_checks * 6 / 10;
247afa98 3696 if (count > limit)
f4ebbd24
DM
3697 return opt_result::failure_at
3698 (vect_location,
247afa98
RB
3699 "number of versioning for alias run-time tests exceeds %d "
3700 "(--param vect-max-version-for-alias-checks)\n", limit);
f4ebbd24
DM
3701
3702 return opt_result::success ();
ebfd146a
IR
3703}
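/* Illustrative sketch, not part of the original source: in

     void f (int *p, int *q, int n)
     {
       for (int i = 0; i < n; ++i)
         p[i] = q[i] + 1;
     }

   the p/q pair cannot be disambiguated at compile time, so a single
   segment-overlap test spanning roughly VF scalar iterations is kept in
   COMP_ALIAS_DDRS and later emitted as the loop-versioning condition; if
   the number of such tests exceeded the vect-max-version-for-alias-checks
   limit, vectorization would be given up instead.  */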
3704
bfaa08b7
RS
3705/* Check whether we can use an internal function for a gather load
3706 or scatter store. READ_P is true for loads and false for stores.
3707 MASKED_P is true if the load or store is conditional. MEMORY_TYPE is
09eb042a
RS
3708 the type of the memory elements being loaded or stored. OFFSET_TYPE
3709 is the type of the offset that is being applied to the invariant
3710 base address. SCALE is the amount by which the offset should
bfaa08b7
RS
3711 be multiplied *after* it has been converted to address width.
3712
09eb042a
RS
3713 Return true if the function is supported, storing the function id in
3714 *IFN_OUT and the vector type for the offset in *OFFSET_VECTYPE_OUT. */
bfaa08b7 3715
429ef523 3716bool
09eb042a
RS
3717vect_gather_scatter_fn_p (vec_info *vinfo, bool read_p, bool masked_p,
3718 tree vectype, tree memory_type, tree offset_type,
3719 int scale, internal_fn *ifn_out,
3720 tree *offset_vectype_out)
bfaa08b7
RS
3721{
3722 unsigned int memory_bits = tree_to_uhwi (TYPE_SIZE (memory_type));
d17a896d 3723 unsigned int element_bits = vector_element_bits (vectype);
bfaa08b7
RS
3724 if (element_bits != memory_bits)
3725 /* For now the vector elements must be the same width as the
3726 memory elements. */
3727 return false;
3728
3729 /* Work out which function we need. */
3730 internal_fn ifn;
3731 if (read_p)
3732 ifn = masked_p ? IFN_MASK_GATHER_LOAD : IFN_GATHER_LOAD;
3733 else
f307441a 3734 ifn = masked_p ? IFN_MASK_SCATTER_STORE : IFN_SCATTER_STORE;
bfaa08b7 3735
09eb042a
RS
3736 for (;;)
3737 {
3738 tree offset_vectype = get_vectype_for_scalar_type (vinfo, offset_type);
3739 if (!offset_vectype)
3740 return false;
bfaa08b7 3741
09eb042a
RS
3742 /* Test whether the target supports this combination. */
3743 if (internal_gather_scatter_fn_supported_p (ifn, vectype, memory_type,
3744 offset_vectype, scale))
3745 {
3746 *ifn_out = ifn;
3747 *offset_vectype_out = offset_vectype;
3748 return true;
3749 }
3750
3751 if (TYPE_PRECISION (offset_type) >= POINTER_SIZE
3752 && TYPE_PRECISION (offset_type) >= element_bits)
3753 return false;
3754
3755 offset_type = build_nonstandard_integer_type
3756 (TYPE_PRECISION (offset_type) * 2, TYPE_UNSIGNED (offset_type));
3757 }
bfaa08b7
RS
3758}
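/* Illustrative sketch, not part of the original source: if the target only
   supports gathers whose offset vector has 64-bit elements, a query that
   starts from a 16-bit OFFSET_TYPE is retried with the precision doubled
   each time (keeping the original signedness) until
   internal_gather_scatter_fn_supported_p accepts a combination, or the
   offset precision reaches both the pointer width and the data element
   width, at which point the function gives up.  */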
3759
82570274 3760/* STMT_INFO is a call to an internal gather load or scatter store function.
bfaa08b7
RS
3761 Describe the operation in INFO. */
3762
3763static void
82570274
RS
3764vect_describe_gather_scatter_call (stmt_vec_info stmt_info,
3765 gather_scatter_info *info)
bfaa08b7 3766{
82570274 3767 gcall *call = as_a <gcall *> (stmt_info->stmt);
bfaa08b7
RS
3768 tree vectype = STMT_VINFO_VECTYPE (stmt_info);
3769 data_reference *dr = STMT_VINFO_DATA_REF (stmt_info);
3770
3771 info->ifn = gimple_call_internal_fn (call);
3772 info->decl = NULL_TREE;
3773 info->base = gimple_call_arg (call, 0);
3774 info->offset = gimple_call_arg (call, 1);
3775 info->offset_dt = vect_unknown_def_type;
3776 info->offset_vectype = NULL_TREE;
3777 info->scale = TREE_INT_CST_LOW (gimple_call_arg (call, 2));
3778 info->element_type = TREE_TYPE (vectype);
3779 info->memory_type = TREE_TYPE (DR_REF (dr));
3780}
3781
32e8e429 3782/* Return true if a non-affine read or write in STMT_INFO is suitable for a
134c85ca 3783 gather load or scatter store. Describe the operation in *INFO if so. */
aec7ae7d 3784
134c85ca 3785bool
32e8e429 3786vect_check_gather_scatter (stmt_vec_info stmt_info, loop_vec_info loop_vinfo,
134c85ca 3787 gather_scatter_info *info)
aec7ae7d 3788{
f37fac2b
RS
3789 HOST_WIDE_INT scale = 1;
3790 poly_int64 pbitpos, pbitsize;
99b1c316 3791 class loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
aec7ae7d
JJ
3792 struct data_reference *dr = STMT_VINFO_DATA_REF (stmt_info);
3793 tree offtype = NULL_TREE;
bfaa08b7
RS
3794 tree decl = NULL_TREE, base, off;
3795 tree vectype = STMT_VINFO_VECTYPE (stmt_info);
3796 tree memory_type = TREE_TYPE (DR_REF (dr));
ef4bddc2 3797 machine_mode pmode;
ee45a32d 3798 int punsignedp, reversep, pvolatilep = 0;
bfaa08b7 3799 internal_fn ifn;
09eb042a 3800 tree offset_vectype;
bfaa08b7
RS
3801 bool masked_p = false;
3802
3803 /* See whether this is already a call to a gather/scatter internal function.
3804 If not, see whether it's a masked load or store. */
86a91c0a 3805 gcall *call = dyn_cast <gcall *> (stmt_info->stmt);
bfaa08b7
RS
3806 if (call && gimple_call_internal_p (call))
3807 {
beb456c3 3808 ifn = gimple_call_internal_fn (call);
bfaa08b7
RS
3809 if (internal_gather_scatter_fn_p (ifn))
3810 {
82570274 3811 vect_describe_gather_scatter_call (stmt_info, info);
bfaa08b7
RS
3812 return true;
3813 }
3814 masked_p = (ifn == IFN_MASK_LOAD || ifn == IFN_MASK_STORE);
3815 }
3816
3817 /* True if we should aim to use internal functions rather than
3818 built-in functions. */
3819 bool use_ifn_p = (DR_IS_READ (dr)
f307441a
RS
3820 ? supports_vec_gather_load_p ()
3821 : supports_vec_scatter_store_p ());
aec7ae7d 3822
5ce9450f
JJ
3823 base = DR_REF (dr);
3824 /* For masked loads/stores, DR_REF (dr) is an artificial MEM_REF,
3825 see if we can use the def stmt of the address. */
bfaa08b7 3826 if (masked_p
5ce9450f
JJ
3827 && TREE_CODE (base) == MEM_REF
3828 && TREE_CODE (TREE_OPERAND (base, 0)) == SSA_NAME
3829 && integer_zerop (TREE_OPERAND (base, 1))
3830 && !expr_invariant_in_loop_p (loop, TREE_OPERAND (base, 0)))
3831 {
355fe088 3832 gimple *def_stmt = SSA_NAME_DEF_STMT (TREE_OPERAND (base, 0));
5ce9450f
JJ
3833 if (is_gimple_assign (def_stmt)
3834 && gimple_assign_rhs_code (def_stmt) == ADDR_EXPR)
3835 base = TREE_OPERAND (gimple_assign_rhs1 (def_stmt), 0);
3836 }
3837
3bab6342 3838 /* The gather and scatter builtins need address of the form
aec7ae7d
JJ
3839 loop_invariant + vector * {1, 2, 4, 8}
3840 or
3841 loop_invariant + sign_extend (vector) * { 1, 2, 4, 8 }.
3842 Unfortunately DR_BASE_ADDRESS/DR_OFFSET can be a mixture
3843 of loop invariants/SSA_NAMEs defined in the loop, with casts,
3844 multiplications and additions in it. To get a vector, we need
3845 a single SSA_NAME that will be defined in the loop and will
3846 contain everything that is not loop invariant and that can be
3847 vectorized. The following code attempts to find such a preexisting
3848 SSA_NAME OFF and put the loop invariants into a tree BASE
3849 that can be gimplified before the loop. */
ee45a32d 3850 base = get_inner_reference (base, &pbitsize, &pbitpos, &off, &pmode,
25b75a48 3851 &punsignedp, &reversep, &pvolatilep);
8c963290
RB
3852 if (reversep)
3853 return false;
3854
f37fac2b 3855 poly_int64 pbytepos = exact_div (pbitpos, BITS_PER_UNIT);
aec7ae7d
JJ
3856
3857 if (TREE_CODE (base) == MEM_REF)
3858 {
3859 if (!integer_zerop (TREE_OPERAND (base, 1)))
3860 {
3861 if (off == NULL_TREE)
aca52e6f 3862 off = wide_int_to_tree (sizetype, mem_ref_offset (base));
aec7ae7d
JJ
3863 else
3864 off = size_binop (PLUS_EXPR, off,
3865 fold_convert (sizetype, TREE_OPERAND (base, 1)));
3866 }
3867 base = TREE_OPERAND (base, 0);
3868 }
3869 else
3870 base = build_fold_addr_expr (base);
3871
3872 if (off == NULL_TREE)
3873 off = size_zero_node;
3874
3875 /* If base is not loop invariant, then if off is 0 we start with just
3876 the constant offset in the loop invariant BASE and continue with base
3877 as OFF; otherwise give up.
3878 We could handle that case by gimplifying the addition of base + off
3879 into some SSA_NAME and use that as off, but for now punt. */
3880 if (!expr_invariant_in_loop_p (loop, base))
3881 {
3882 if (!integer_zerop (off))
134c85ca 3883 return false;
aec7ae7d 3884 off = base;
f37fac2b 3885 base = size_int (pbytepos);
aec7ae7d
JJ
3886 }
3887 /* Otherwise put base + constant offset into the loop invariant BASE
3888 and continue with OFF. */
3889 else
3890 {
3891 base = fold_convert (sizetype, base);
f37fac2b 3892 base = size_binop (PLUS_EXPR, base, size_int (pbytepos));
aec7ae7d
JJ
3893 }
3894
3895 /* OFF at this point may be either a SSA_NAME or some tree expression
3896 from get_inner_reference. Try to peel off loop invariants from it
3897 into BASE as long as possible. */
3898 STRIP_NOPS (off);
3899 while (offtype == NULL_TREE)
3900 {
3901 enum tree_code code;
3902 tree op0, op1, add = NULL_TREE;
3903
3904 if (TREE_CODE (off) == SSA_NAME)
3905 {
355fe088 3906 gimple *def_stmt = SSA_NAME_DEF_STMT (off);
aec7ae7d
JJ
3907
3908 if (expr_invariant_in_loop_p (loop, off))
134c85ca 3909 return false;
aec7ae7d
JJ
3910
3911 if (gimple_code (def_stmt) != GIMPLE_ASSIGN)
3912 break;
3913
3914 op0 = gimple_assign_rhs1 (def_stmt);
3915 code = gimple_assign_rhs_code (def_stmt);
3916 op1 = gimple_assign_rhs2 (def_stmt);
3917 }
3918 else
3919 {
3920 if (get_gimple_rhs_class (TREE_CODE (off)) == GIMPLE_TERNARY_RHS)
134c85ca 3921 return false;
aec7ae7d
JJ
3922 code = TREE_CODE (off);
3923 extract_ops_from_tree (off, &code, &op0, &op1);
3924 }
3925 switch (code)
3926 {
3927 case POINTER_PLUS_EXPR:
3928 case PLUS_EXPR:
3929 if (expr_invariant_in_loop_p (loop, op0))
3930 {
3931 add = op0;
3932 off = op1;
3933 do_add:
3934 add = fold_convert (sizetype, add);
3935 if (scale != 1)
3936 add = size_binop (MULT_EXPR, add, size_int (scale));
3937 base = size_binop (PLUS_EXPR, base, add);
3938 continue;
3939 }
3940 if (expr_invariant_in_loop_p (loop, op1))
3941 {
3942 add = op1;
3943 off = op0;
3944 goto do_add;
3945 }
3946 break;
3947 case MINUS_EXPR:
3948 if (expr_invariant_in_loop_p (loop, op1))
3949 {
3950 add = fold_convert (sizetype, op1);
3951 add = size_binop (MINUS_EXPR, size_zero_node, add);
3952 off = op0;
3953 goto do_add;
3954 }
3955 break;
3956 case MULT_EXPR:
9541ffee 3957 if (scale == 1 && tree_fits_shwi_p (op1))
aec7ae7d 3958 {
bfaa08b7
RS
3959 int new_scale = tree_to_shwi (op1);
3960 /* Only treat this as a scaling operation if the target
09eb042a 3961 supports it for at least some offset type. */
bfaa08b7 3962 if (use_ifn_p
09eb042a
RS
3963 && !vect_gather_scatter_fn_p (loop_vinfo, DR_IS_READ (dr),
3964 masked_p, vectype, memory_type,
3965 signed_char_type_node,
3966 new_scale, &ifn,
3967 &offset_vectype)
3968 && !vect_gather_scatter_fn_p (loop_vinfo, DR_IS_READ (dr),
3969 masked_p, vectype, memory_type,
3970 unsigned_char_type_node,
bfaa08b7 3971 new_scale, &ifn,
09eb042a 3972 &offset_vectype))
bfaa08b7
RS
3973 break;
3974 scale = new_scale;
aec7ae7d
JJ
3975 off = op0;
3976 continue;
3977 }
3978 break;
3979 case SSA_NAME:
3980 off = op0;
3981 continue;
3982 CASE_CONVERT:
3983 if (!POINTER_TYPE_P (TREE_TYPE (op0))
3984 && !INTEGRAL_TYPE_P (TREE_TYPE (op0)))
3985 break;
09eb042a
RS
3986
3987 /* Don't include the conversion if the target is happy with
3988 the current offset type. */
3989 if (use_ifn_p
3990 && vect_gather_scatter_fn_p (loop_vinfo, DR_IS_READ (dr),
3991 masked_p, vectype, memory_type,
3992 TREE_TYPE (off), scale, &ifn,
3993 &offset_vectype))
3994 break;
3995
aec7ae7d
JJ
3996 if (TYPE_PRECISION (TREE_TYPE (op0))
3997 == TYPE_PRECISION (TREE_TYPE (off)))
3998 {
3999 off = op0;
4000 continue;
4001 }
bfaa08b7 4002
aec7ae7d
JJ
4003 if (TYPE_PRECISION (TREE_TYPE (op0))
4004 < TYPE_PRECISION (TREE_TYPE (off)))
4005 {
4006 off = op0;
4007 offtype = TREE_TYPE (off);
4008 STRIP_NOPS (off);
4009 continue;
4010 }
4011 break;
4012 default:
4013 break;
4014 }
4015 break;
4016 }
4017
4018 /* If at the end OFF still isn't a SSA_NAME or isn't
4019 defined in the loop, punt. */
4020 if (TREE_CODE (off) != SSA_NAME
4021 || expr_invariant_in_loop_p (loop, off))
134c85ca 4022 return false;
aec7ae7d
JJ
4023
4024 if (offtype == NULL_TREE)
4025 offtype = TREE_TYPE (off);
4026
bfaa08b7
RS
4027 if (use_ifn_p)
4028 {
09eb042a
RS
4029 if (!vect_gather_scatter_fn_p (loop_vinfo, DR_IS_READ (dr), masked_p,
4030 vectype, memory_type, offtype, scale,
4031 &ifn, &offset_vectype))
bfaa08b7
RS
4032 return false;
4033 }
3bab6342 4034 else
bfaa08b7
RS
4035 {
4036 if (DR_IS_READ (dr))
ab2fc782
RS
4037 {
4038 if (targetm.vectorize.builtin_gather)
4039 decl = targetm.vectorize.builtin_gather (vectype, offtype, scale);
4040 }
bfaa08b7 4041 else
ab2fc782
RS
4042 {
4043 if (targetm.vectorize.builtin_scatter)
4044 decl = targetm.vectorize.builtin_scatter (vectype, offtype, scale);
4045 }
3bab6342 4046
bfaa08b7
RS
4047 if (!decl)
4048 return false;
4049
4050 ifn = IFN_LAST;
09eb042a
RS
4051 /* The offset vector type will be read from DECL when needed. */
4052 offset_vectype = NULL_TREE;
bfaa08b7 4053 }
134c85ca 4054
bfaa08b7 4055 info->ifn = ifn;
134c85ca
RS
4056 info->decl = decl;
4057 info->base = base;
4058 info->offset = off;
4059 info->offset_dt = vect_unknown_def_type;
09eb042a 4060 info->offset_vectype = offset_vectype;
134c85ca 4061 info->scale = scale;
09eb042a 4062 info->element_type = TREE_TYPE (vectype);
bfaa08b7 4063 info->memory_type = memory_type;
134c85ca 4064 return true;
aec7ae7d
JJ
4065}
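/* Illustrative sketch, not part of the original source: for a gather such as

     for (i = 0; i < n; ++i)
       sum += data[idx[i]];

   the address of data[idx[i]] decomposes into the loop-invariant BASE
   &data, the loop-varying SSA name OFF produced by the load of idx[i]
   (possibly behind a widening conversion), and, when the target supports
   it, a SCALE equal to sizeof (*data); these are the pieces recorded in
   INFO for later gather code generation.  */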
4066
8e846c66
RB
4067/* Find the data references in STMT, analyze them with respect to LOOP and
4068 append them to DATAREFS. Return false if datarefs in this stmt cannot
4069 be handled. */
4070
f4ebbd24 4071opt_result
8e846c66
RB
4072vect_find_stmt_data_reference (loop_p loop, gimple *stmt,
4073 vec<data_reference_p> *datarefs)
4074{
4075 /* We can ignore clobbers for dataref analysis - they are removed during
4076 loop vectorization and BB vectorization checks dependences with a
4077 stmt walk. */
4078 if (gimple_clobber_p (stmt))
f4ebbd24 4079 return opt_result::success ();
8e846c66
RB
4080
4081 if (gimple_has_volatile_ops (stmt))
f4ebbd24
DM
4082 return opt_result::failure_at (stmt, "not vectorized: volatile type: %G",
4083 stmt);
8e846c66 4084
36bbc05d 4085 if (stmt_can_throw_internal (cfun, stmt))
f4ebbd24
DM
4086 return opt_result::failure_at (stmt,
4087 "not vectorized:"
4088 " statement can throw an exception: %G",
4089 stmt);
8e846c66
RB
4090
4091 auto_vec<data_reference_p, 2> refs;
f4ebbd24
DM
4092 opt_result res = find_data_references_in_stmt (loop, stmt, &refs);
4093 if (!res)
4094 return res;
8e846c66
RB
4095
4096 if (refs.is_empty ())
f4ebbd24 4097 return opt_result::success ();
8e846c66
RB
4098
4099 if (refs.length () > 1)
f4ebbd24
DM
4100 return opt_result::failure_at (stmt,
4101 "not vectorized:"
4102 " more than one data ref in stmt: %G", stmt);
8e846c66
RB
4103
4104 if (gcall *call = dyn_cast <gcall *> (stmt))
4105 if (!gimple_call_internal_p (call)
4106 || (gimple_call_internal_fn (call) != IFN_MASK_LOAD
4107 && gimple_call_internal_fn (call) != IFN_MASK_STORE))
f4ebbd24
DM
4108 return opt_result::failure_at (stmt,
4109 "not vectorized: dr in a call %G", stmt);
8e846c66
RB
4110
4111 data_reference_p dr = refs.pop ();
4112 if (TREE_CODE (DR_REF (dr)) == COMPONENT_REF
4113 && DECL_BIT_FIELD (TREE_OPERAND (DR_REF (dr), 1)))
f4ebbd24
DM
4114 return opt_result::failure_at (stmt,
4115 "not vectorized:"
4116 " statement is bitfield access %G", stmt);
8e846c66
RB
4117
4118 if (DR_BASE_ADDRESS (dr)
4119 && TREE_CODE (DR_BASE_ADDRESS (dr)) == INTEGER_CST)
f4ebbd24
DM
4120 return opt_result::failure_at (stmt,
4121 "not vectorized:"
4122 " base addr of dr is a constant\n");
8e846c66 4123
f2227a66
RB
4124 /* Check whether this may be a SIMD lane access and adjust the
4125 DR to make it easier for us to handle it. */
4126 if (loop
4127 && loop->simduid
4128 && (!DR_BASE_ADDRESS (dr)
4129 || !DR_OFFSET (dr)
4130 || !DR_INIT (dr)
4131 || !DR_STEP (dr)))
4132 {
4133 struct data_reference *newdr
4134 = create_data_ref (NULL, loop_containing_stmt (stmt), DR_REF (dr), stmt,
4135 DR_IS_READ (dr), DR_IS_CONDITIONAL_IN_STMT (dr));
4136 if (DR_BASE_ADDRESS (newdr)
4137 && DR_OFFSET (newdr)
4138 && DR_INIT (newdr)
4139 && DR_STEP (newdr)
c13c129f 4140 && TREE_CODE (DR_INIT (newdr)) == INTEGER_CST
f2227a66
RB
4141 && integer_zerop (DR_STEP (newdr)))
4142 {
c13c129f 4143 tree base_address = DR_BASE_ADDRESS (newdr);
f2227a66 4144 tree off = DR_OFFSET (newdr);
080c269b 4145 tree step = ssize_int (1);
c13c129f
JJ
4146 if (integer_zerop (off)
4147 && TREE_CODE (base_address) == POINTER_PLUS_EXPR)
4148 {
4149 off = TREE_OPERAND (base_address, 1);
4150 base_address = TREE_OPERAND (base_address, 0);
4151 }
f2227a66 4152 STRIP_NOPS (off);
080c269b 4153 if (TREE_CODE (off) == MULT_EXPR
f2227a66
RB
4154 && tree_fits_uhwi_p (TREE_OPERAND (off, 1)))
4155 {
080c269b 4156 step = TREE_OPERAND (off, 1);
f2227a66
RB
4157 off = TREE_OPERAND (off, 0);
4158 STRIP_NOPS (off);
080c269b 4159 }
c13c129f
JJ
4160 if (CONVERT_EXPR_P (off)
4161 && (TYPE_PRECISION (TREE_TYPE (TREE_OPERAND (off, 0)))
4162 < TYPE_PRECISION (TREE_TYPE (off))))
4163 off = TREE_OPERAND (off, 0);
4164 if (TREE_CODE (off) == SSA_NAME)
080c269b 4165 {
c13c129f
JJ
4166 gimple *def = SSA_NAME_DEF_STMT (off);
4167 /* Look through widening conversion. */
4168 if (is_gimple_assign (def)
4169 && CONVERT_EXPR_CODE_P (gimple_assign_rhs_code (def)))
4170 {
4171 tree rhs1 = gimple_assign_rhs1 (def);
4172 if (TREE_CODE (rhs1) == SSA_NAME
4173 && INTEGRAL_TYPE_P (TREE_TYPE (rhs1))
4174 && (TYPE_PRECISION (TREE_TYPE (off))
4175 > TYPE_PRECISION (TREE_TYPE (rhs1))))
4176 def = SSA_NAME_DEF_STMT (rhs1);
4177 }
4178 if (is_gimple_call (def)
4179 && gimple_call_internal_p (def)
4180 && (gimple_call_internal_fn (def) == IFN_GOMP_SIMD_LANE))
f2227a66 4181 {
c13c129f 4182 tree arg = gimple_call_arg (def, 0);
f2227a66 4183 tree reft = TREE_TYPE (DR_REF (newdr));
c13c129f
JJ
4184 gcc_assert (TREE_CODE (arg) == SSA_NAME);
4185 arg = SSA_NAME_VAR (arg);
4186 if (arg == loop->simduid
4187 /* For now. */
4188 && tree_int_cst_equal (TYPE_SIZE_UNIT (reft), step))
f2227a66 4189 {
c13c129f
JJ
4190 DR_BASE_ADDRESS (newdr) = base_address;
4191 DR_OFFSET (newdr) = ssize_int (0);
4192 DR_STEP (newdr) = step;
4193 DR_OFFSET_ALIGNMENT (newdr) = BIGGEST_ALIGNMENT;
4194 DR_STEP_ALIGNMENT (newdr) = highest_pow2_factor (step);
4195 /* Mark as simd-lane access. */
4196 tree arg2 = gimple_call_arg (def, 1);
4197 newdr->aux = (void *) (-1 - tree_to_uhwi (arg2));
4198 free_data_ref (dr);
4199 datarefs->safe_push (newdr);
4200 return opt_result::success ();
f2227a66
RB
4201 }
4202 }
4203 }
4204 }
4205 free_data_ref (newdr);
4206 }
4207
8e846c66 4208 datarefs->safe_push (dr);
f4ebbd24 4209 return opt_result::success ();
8e846c66
RB
4210}
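/* Illustrative sketch, not part of the original source: a store like

     struct S { unsigned int b : 3; };
     void g (struct S *p, int i, unsigned int x) { p[i].b = x; }

   reaches the DECL_BIT_FIELD test above through its COMPONENT_REF and is
   rejected with "not vectorized: statement is bitfield access"; volatile
   accesses and statements with more than one data reference are rejected
   earlier in the same function.  */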
4211
ebfd146a
IR
4212/* Function vect_analyze_data_refs.
4213
a70d6342 4214 Find all the data references in the loop or basic block.
ebfd146a
IR
4215
4216 The general structure of the analysis of data refs in the vectorizer is as
4217 follows:
b8698a0f 4218 1- vect_analyze_data_refs(loop/bb): call
a70d6342
IR
4219 compute_data_dependences_for_loop/bb to find and analyze all data-refs
4220 in the loop/bb and their dependences.
ebfd146a
IR
4221 2- vect_analyze_dependences(): apply dependence testing using ddrs.
4222 3- vect_analyze_drs_alignment(): check that ref_stmt.alignment is ok.
4223 4- vect_analyze_drs_access(): check that ref_stmt.step is ok.
4224
4225*/
4226
f4ebbd24 4227opt_result
a7b3509e 4228vect_analyze_data_refs (vec_info *vinfo, poly_uint64 *min_vf, bool *fatal)
ebfd146a 4229{
99b1c316 4230 class loop *loop = NULL;
ebfd146a 4231 unsigned int i;
ebfd146a
IR
4232 struct data_reference *dr;
4233 tree scalar_type;
4234
adac3a68 4235 DUMP_VECT_SCOPE ("vect_analyze_data_refs");
b8698a0f 4236
310213d4 4237 if (loop_vec_info loop_vinfo = dyn_cast <loop_vec_info> (vinfo))
428db0ba 4238 loop = LOOP_VINFO_LOOP (loop_vinfo);
ebfd146a 4239
ff802fa1
IR
4240 /* Go through the data-refs, check that the analysis succeeded. Update
4241 pointer from stmt_vec_info struct to DR and vectype. */
ebfd146a 4242
ca823c85 4243 vec<data_reference_p> datarefs = vinfo->shared->datarefs;
9771b263 4244 FOR_EACH_VEC_ELT (datarefs, i, dr)
ebfd146a 4245 {
3bab6342 4246 enum { SG_NONE, GATHER, SCATTER } gatherscatter = SG_NONE;
d9f21f6a 4247 poly_uint64 vf;
b8698a0f 4248
8e846c66 4249 gcc_assert (DR_REF (dr));
f44fb7aa
RS
4250 stmt_vec_info stmt_info = vinfo->lookup_stmt (DR_STMT (dr));
4251 gcc_assert (!stmt_info->dr_aux.dr);
4252 stmt_info->dr_aux.dr = dr;
4253 stmt_info->dr_aux.stmt = stmt_info;
ebfd146a
IR
4254
4255 /* Check that analysis of the data-ref succeeded. */
4256 if (!DR_BASE_ADDRESS (dr) || !DR_OFFSET (dr) || !DR_INIT (dr)
aec7ae7d 4257 || !DR_STEP (dr))
ebfd146a 4258 {
74bf76ed
JJ
4259 bool maybe_gather
4260 = DR_IS_READ (dr)
aec7ae7d 4261 && !TREE_THIS_VOLATILE (DR_REF (dr))
bfaa08b7
RS
4262 && (targetm.vectorize.builtin_gather != NULL
4263 || supports_vec_gather_load_p ());
3bab6342
AT
4264 bool maybe_scatter
4265 = DR_IS_WRITE (dr)
4266 && !TREE_THIS_VOLATILE (DR_REF (dr))
f307441a
RS
4267 && (targetm.vectorize.builtin_scatter != NULL
4268 || supports_vec_scatter_store_p ());
74bf76ed 4269
f2227a66
RB
4270 /* If target supports vector gather loads or scatter stores,
4271 see if they can be used. */
310213d4 4272 if (is_a <loop_vec_info> (vinfo)
78e02b3b 4273 && !nested_in_vect_loop_p (loop, stmt_info))
aec7ae7d 4274 {
f2227a66 4275 if (maybe_gather || maybe_scatter)
5fa23466
RB
4276 {
4277 if (maybe_gather)
4278 gatherscatter = GATHER;
4279 else
4280 gatherscatter = SCATTER;
aec7ae7d 4281 }
aec7ae7d 4282 }
4b5caab7 4283
f2227a66 4284 if (gatherscatter == SG_NONE)
aec7ae7d 4285 {
73fbfcad 4286 if (dump_enabled_p ())
3c2a8ed0
DM
4287 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
4288 "not vectorized: data ref analysis "
4289 "failed %G", stmt_info->stmt);
310213d4 4290 if (is_a <bb_vec_info> (vinfo))
27312bf2
RB
4291 {
4292 /* In BB vectorization the ref can still participate
4293 in dependence analysis, we just can't vectorize it. */
4294 STMT_VINFO_VECTORIZABLE (stmt_info) = false;
4295 continue;
4296 }
f4ebbd24
DM
4297 return opt_result::failure_at (stmt_info->stmt,
4298 "not vectorized:"
4299 " data ref analysis failed: %G",
4300 stmt_info->stmt);
aec7ae7d 4301 }
ebfd146a
IR
4302 }
4303
f2227a66 4304 /* See if this was detected as SIMD lane access. */
0356aab8
JJ
4305 if (dr->aux == (void *)-1
4306 || dr->aux == (void *)-2
1612b1fe
JJ
4307 || dr->aux == (void *)-3
4308 || dr->aux == (void *)-4)
f2227a66 4309 {
78e02b3b 4310 if (nested_in_vect_loop_p (loop, stmt_info))
f4ebbd24
DM
4311 return opt_result::failure_at (stmt_info->stmt,
4312 "not vectorized:"
4313 " data ref analysis failed: %G",
4314 stmt_info->stmt);
0356aab8
JJ
4315 STMT_VINFO_SIMD_LANE_ACCESS_P (stmt_info)
4316 = -(uintptr_t) dr->aux;
f2227a66
RB
4317 }
4318
5fa23466
RB
4319 tree base = get_base_address (DR_REF (dr));
4320 if (base && VAR_P (base) && DECL_NONALIASED (base))
508ef0c6 4321 {
73fbfcad 4322 if (dump_enabled_p ())
3c2a8ed0
DM
4323 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
4324 "not vectorized: base object not addressable "
4325 "for stmt: %G", stmt_info->stmt);
310213d4 4326 if (is_a <bb_vec_info> (vinfo))
8e846c66
RB
4327 {
4328 /* In BB vectorization the ref can still participate
4329 in dependence analysis, we just can't vectorize it. */
4330 STMT_VINFO_VECTORIZABLE (stmt_info) = false;
4331 continue;
4332 }
f4ebbd24
DM
4333 return opt_result::failure_at (stmt_info->stmt,
4334 "not vectorized: base object not"
4335 " addressable for stmt: %G",
4336 stmt_info->stmt);
508ef0c6
RG
4337 }
4338
8e846c66 4339 if (is_a <loop_vec_info> (vinfo)
5fa23466 4340 && DR_STEP (dr)
8e846c66 4341 && TREE_CODE (DR_STEP (dr)) != INTEGER_CST)
9c239085 4342 {
78e02b3b 4343 if (nested_in_vect_loop_p (loop, stmt_info))
f4ebbd24 4344 return opt_result::failure_at (stmt_info->stmt,
f8cb8bcd 4345 "not vectorized: "
f4ebbd24
DM
4346 "not suitable for strided load %G",
4347 stmt_info->stmt);
8e846c66 4348 STMT_VINFO_STRIDED_P (stmt_info) = true;
9c239085
JJ
4349 }
4350
ebfd146a 4351 /* Update DR field in stmt_vec_info struct. */
ebfd146a
IR
4352
4353 /* If the dataref is in an inner-loop of the loop that is considered
4354 for vectorization, we also want to analyze the access relative to
b8698a0f 4355 the outer-loop (DR contains information only relative to the
ebfd146a
IR
4356 inner-most enclosing loop). We do that by building a reference to the
4357 first location accessed by the inner-loop, and analyze it relative to
b8698a0f 4358 the outer-loop. */
78e02b3b 4359 if (loop && nested_in_vect_loop_p (loop, stmt_info))
ebfd146a 4360 {
b8698a0f 4361 /* Build a reference to the first location accessed by the
bb642979
RS
4362 inner loop: *(BASE + INIT + OFFSET). By construction,
4363 this address must be invariant in the inner loop, so we
4364 can consider it as being used in the outer loop. */
8e846c66
RB
4365 tree base = unshare_expr (DR_BASE_ADDRESS (dr));
4366 tree offset = unshare_expr (DR_OFFSET (dr));
4367 tree init = unshare_expr (DR_INIT (dr));
bb642979
RS
4368 tree init_offset = fold_build2 (PLUS_EXPR, TREE_TYPE (offset),
4369 init, offset);
4370 tree init_addr = fold_build_pointer_plus (base, init_offset);
4371 tree init_ref = build_fold_indirect_ref (init_addr);
ebfd146a 4372
73fbfcad 4373 if (dump_enabled_p ())
3c2a8ed0
DM
4374 dump_printf_loc (MSG_NOTE, vect_location,
4375 "analyze in outer loop: %T\n", init_ref);
ebfd146a 4376
f4ebbd24
DM
4377 opt_result res
4378 = dr_analyze_innermost (&STMT_VINFO_DR_WRT_VEC_LOOP (stmt_info),
4379 init_ref, loop, stmt_info->stmt);
4380 if (!res)
bb642979 4381 /* dr_analyze_innermost already explained the failure. */
f4ebbd24 4382 return res;
ebfd146a 4383
73fbfcad 4384 if (dump_enabled_p ())
3c2a8ed0
DM
4385 dump_printf_loc (MSG_NOTE, vect_location,
4386 "\touter base_address: %T\n"
4387 "\touter offset from base address: %T\n"
4388 "\touter constant offset from base address: %T\n"
4389 "\touter step: %T\n"
4390 "\touter base alignment: %d\n\n"
4391 "\touter base misalignment: %d\n"
4392 "\touter offset alignment: %d\n"
4393 "\touter step alignment: %d\n",
4394 STMT_VINFO_DR_BASE_ADDRESS (stmt_info),
4395 STMT_VINFO_DR_OFFSET (stmt_info),
4396 STMT_VINFO_DR_INIT (stmt_info),
4397 STMT_VINFO_DR_STEP (stmt_info),
4398 STMT_VINFO_DR_BASE_ALIGNMENT (stmt_info),
4399 STMT_VINFO_DR_BASE_MISALIGNMENT (stmt_info),
4400 STMT_VINFO_DR_OFFSET_ALIGNMENT (stmt_info),
4401 STMT_VINFO_DR_STEP_ALIGNMENT (stmt_info));
ebfd146a
IR
4402 }
4403
ebfd146a
IR
4404 /* Set vectype for STMT. */
4405 scalar_type = TREE_TYPE (DR_REF (dr));
9b75f56d
RS
4406 tree vectype = get_vectype_for_scalar_type (vinfo, scalar_type);
4407 if (!vectype)
ebfd146a 4408 {
73fbfcad 4409 if (dump_enabled_p ())
ebfd146a 4410 {
e645e942 4411 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
3c2a8ed0
DM
4412 "not vectorized: no vectype for stmt: %G",
4413 stmt_info->stmt);
78c60e3d
SS
4414 dump_printf (MSG_MISSED_OPTIMIZATION, " scalar_type: ");
4415 dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_DETAILS,
4416 scalar_type);
e645e942 4417 dump_printf (MSG_MISSED_OPTIMIZATION, "\n");
ebfd146a 4418 }
4b5caab7 4419
310213d4 4420 if (is_a <bb_vec_info> (vinfo))
64900538
RB
4421 {
4422 /* No vector type is fine, the ref can still participate
4423 in dependence analysis, we just can't vectorize it. */
4424 STMT_VINFO_VECTORIZABLE (stmt_info) = false;
4425 continue;
4426 }
1f88cc26
RB
4427 if (fatal)
4428 *fatal = false;
f4ebbd24
DM
4429 return opt_result::failure_at (stmt_info->stmt,
4430 "not vectorized:"
4431 " no vectype for stmt: %G"
4432 " scalar_type: %T\n",
4433 stmt_info->stmt, scalar_type);
ebfd146a 4434 }
451dabda
RB
4435 else
4436 {
4437 if (dump_enabled_p ())
3c2a8ed0
DM
4438 dump_printf_loc (MSG_NOTE, vect_location,
4439 "got vectype for stmt: %G%T\n",
9b75f56d 4440 stmt_info->stmt, vectype);
451dabda 4441 }
777e1f09
RG
4442
4443 /* Adjust the minimal vectorization factor according to the
4444 vector type. */
9b75f56d 4445 vf = TYPE_VECTOR_SUBPARTS (vectype);
d9f21f6a 4446 *min_vf = upper_bound (*min_vf, vf);
aec7ae7d 4447
9b75f56d
RS
4448 /* Leave the BB vectorizer to pick the vector type later, based on
4449 the final dataref group size and SLP node size. */
4450 if (is_a <loop_vec_info> (vinfo))
4451 STMT_VINFO_VECTYPE (stmt_info) = vectype;
4452
3bab6342 4453 if (gatherscatter != SG_NONE)
aec7ae7d 4454 {
134c85ca 4455 gather_scatter_info gs_info;
78e02b3b
RS
4456 if (!vect_check_gather_scatter (stmt_info,
4457 as_a <loop_vec_info> (vinfo),
134c85ca 4458 &gs_info)
7ed54790
RS
4459 || !get_vectype_for_scalar_type (vinfo,
4460 TREE_TYPE (gs_info.offset)))
a7b3509e
JJ
4461 {
4462 if (fatal)
4463 *fatal = false;
4464 return opt_result::failure_at
4465 (stmt_info->stmt,
4466 (gatherscatter == GATHER)
4467 ? "not vectorized: not suitable for gather load %G"
4468 : "not vectorized: not suitable for scatter store %G",
4469 stmt_info->stmt);
4470 }
3bab6342 4471 STMT_VINFO_GATHER_SCATTER_P (stmt_info) = gatherscatter;
319e6439 4472 }
ebfd146a 4473 }
b8698a0f 4474
27312bf2
RB
4475 /* We used to stop processing and prune the list here. Verify we no
4476 longer need to. */
4477 gcc_assert (i == datarefs.length ());
fcac74a1 4478
f4ebbd24 4479 return opt_result::success ();
ebfd146a
IR
4480}
4481
4482
4483/* Function vect_get_new_vect_var.
4484
ff802fa1 4485 Returns a name for a new variable. The current naming scheme uses the
b8698a0f
L
4486 prefix "vect", "stmp", "mask" or "vectp" (depending on the value of VAR_KIND)
 4487 for vectorizer generated variables, followed by an underscore and NAME if
ebfd146a
IR
4488 provided. */
4489
4490tree
4491vect_get_new_vect_var (tree type, enum vect_var_kind var_kind, const char *name)
4492{
4493 const char *prefix;
4494 tree new_vect_var;
4495
4496 switch (var_kind)
4497 {
4498 case vect_simple_var:
451dabda 4499 prefix = "vect";
ebfd146a
IR
4500 break;
4501 case vect_scalar_var:
451dabda 4502 prefix = "stmp";
ebfd146a 4503 break;
42fd8198
IE
4504 case vect_mask_var:
4505 prefix = "mask";
4506 break;
ebfd146a 4507 case vect_pointer_var:
451dabda 4508 prefix = "vectp";
ebfd146a
IR
4509 break;
4510 default:
4511 gcc_unreachable ();
4512 }
4513
4514 if (name)
4515 {
451dabda 4516 char* tmp = concat (prefix, "_", name, NULL);
65876d24 4517 new_vect_var = create_tmp_reg (type, tmp);
ebfd146a
IR
4518 free (tmp);
4519 }
4520 else
65876d24 4521 new_vect_var = create_tmp_reg (type, prefix);
ebfd146a
IR
4522
4523 return new_vect_var;
4524}
4525
0e22bb5a
RB
4526/* Like vect_get_new_vect_var but return an SSA name. */
4527
4528tree
4529vect_get_new_ssa_name (tree type, enum vect_var_kind var_kind, const char *name)
4530{
4531 const char *prefix;
4532 tree new_vect_var;
4533
4534 switch (var_kind)
4535 {
4536 case vect_simple_var:
4537 prefix = "vect";
4538 break;
4539 case vect_scalar_var:
4540 prefix = "stmp";
4541 break;
4542 case vect_pointer_var:
4543 prefix = "vectp";
4544 break;
4545 default:
4546 gcc_unreachable ();
4547 }
4548
4549 if (name)
4550 {
4551 char* tmp = concat (prefix, "_", name, NULL);
4552 new_vect_var = make_temp_ssa_name (type, NULL, tmp);
4553 free (tmp);
4554 }
4555 else
4556 new_vect_var = make_temp_ssa_name (type, NULL, prefix);
4557
4558 return new_vect_var;
4559}
4560
89fa689a 4561/* Duplicate ptr info and set alignment/misaligment on NAME from DR_INFO. */
faf4220c
JJ
4562
4563static void
89fa689a 4564vect_duplicate_ssa_name_ptr_info (tree name, dr_vec_info *dr_info)
faf4220c 4565{
89fa689a
RS
4566 duplicate_ssa_name_ptr_info (name, DR_PTR_INFO (dr_info->dr));
4567 int misalign = DR_MISALIGNMENT (dr_info);
8d21ff9f 4568 if (misalign == DR_MISALIGNMENT_UNKNOWN)
faf4220c
JJ
4569 mark_ptr_info_alignment_unknown (SSA_NAME_PTR_INFO (name));
4570 else
f702e7d4 4571 set_ptr_info_alignment (SSA_NAME_PTR_INFO (name),
ca31798e
AV
4572 known_alignment (DR_TARGET_ALIGNMENT (dr_info)),
4573 misalign);
faf4220c 4574}
ebfd146a
IR
4575
4576/* Function vect_create_addr_base_for_vector_ref.
4577
4578 Create an expression that computes the address of the first memory location
4579 that will be accessed for a data reference.
4580
4581 Input:
32e8e429 4582 STMT_INFO: The statement containing the data reference.
ebfd146a
IR
4583 NEW_STMT_LIST: Must be initialized to NULL_TREE or a statement list.
4584 OFFSET: Optional. If supplied, it is added to the initial address.
4585 LOOP: Specify relative to which loop-nest should the address be computed.
4586 For example, when the dataref is in an inner-loop nested in an
4587 outer-loop that is now being vectorized, LOOP can be either the
ff802fa1 4588 outer-loop, or the inner-loop. The first memory location accessed
ebfd146a
IR
4589 by the following dataref ('in' points to short):
4590
4591 for (i=0; i<N; i++)
4592 for (j=0; j<M; j++)
4593 s += in[i+j]
4594
4595 is as follows:
4596 if LOOP=i_loop: &in (relative to i_loop)
4597 if LOOP=j_loop: &in+i*2B (relative to j_loop)
356bbc4c
JJ
4598 BYTE_OFFSET: Optional, defaulted to NULL. If supplied, it is added to the
4599 initial address. Unlike OFFSET, which is the number of elements to
4600 be added, BYTE_OFFSET is measured in bytes.
ebfd146a
IR
4601
4602 Output:
b8698a0f 4603 1. Return an SSA_NAME whose value is the address of the memory location of
ebfd146a
IR
4604 the first vector of the data reference.
4605 2. If new_stmt_list is not NULL_TREE after return then the caller must insert
4606 these statement(s) which define the returned SSA_NAME.
4607
4608 FORNOW: We are only handling array accesses with step 1. */
4609
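/* A rough standalone illustration of the address arithmetic described
   above; this sketch is not part of GCC, and the function and parameter
   names are made up for the example.  INIT and BYTE_OFFSET are byte
   quantities, while OFFSET counts elements and is scaled by the element
   size before everything is added to the base address.  */

#include <stddef.h>

static char *
first_access_addr (char *base, size_t init, size_t offset,
		   size_t elem_size, size_t byte_offset)
{
  /* Fold the constant and variable parts into a single byte offset.  */
  size_t base_offset = init + offset * elem_size + byte_offset;
  return base + base_offset;
}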
4610tree
308bc496 4611vect_create_addr_base_for_vector_ref (vec_info *vinfo, stmt_vec_info stmt_info,
ebfd146a
IR
4612 gimple_seq *new_stmt_list,
4613 tree offset,
356bbc4c 4614 tree byte_offset)
ebfd146a 4615{
89fa689a
RS
4616 dr_vec_info *dr_info = STMT_VINFO_DR_INFO (stmt_info);
4617 struct data_reference *dr = dr_info->dr;
595c2679 4618 const char *base_name;
4bdd44c4 4619 tree addr_base;
ebfd146a
IR
4620 tree dest;
4621 gimple_seq seq = NULL;
8644a673 4622 tree vect_ptr_type;
ebfd146a 4623 tree step = TYPE_SIZE_UNIT (TREE_TYPE (DR_REF (dr)));
308bc496
RB
4624 loop_vec_info loop_vinfo = dyn_cast <loop_vec_info> (vinfo);
4625 innermost_loop_behavior *drb = vect_dr_behavior (vinfo, dr_info);
ebfd146a 4626
3f5e8a76 4627 tree data_ref_base = unshare_expr (drb->base_address);
308bc496 4628 tree base_offset = unshare_expr (get_dr_vinfo_offset (vinfo, dr_info, true));
3f5e8a76 4629 tree init = unshare_expr (drb->init);
ebfd146a 4630
a70d6342 4631 if (loop_vinfo)
595c2679 4632 base_name = get_name (data_ref_base);
a70d6342
IR
4633 else
4634 {
4635 base_offset = ssize_int (0);
4636 init = ssize_int (0);
595c2679 4637 base_name = get_name (DR_REF (dr));
b8698a0f 4638 }
a70d6342 4639
ebfd146a
IR
4640 /* Create base_offset */
4641 base_offset = size_binop (PLUS_EXPR,
4642 fold_convert (sizetype, base_offset),
4643 fold_convert (sizetype, init));
ebfd146a
IR
4644
4645 if (offset)
4646 {
ebfd146a
IR
4647 offset = fold_build2 (MULT_EXPR, sizetype,
4648 fold_convert (sizetype, offset), step);
4649 base_offset = fold_build2 (PLUS_EXPR, sizetype,
4650 base_offset, offset);
ebfd146a 4651 }
356bbc4c
JJ
4652 if (byte_offset)
4653 {
4654 byte_offset = fold_convert (sizetype, byte_offset);
4655 base_offset = fold_build2 (PLUS_EXPR, sizetype,
4656 base_offset, byte_offset);
4657 }
ebfd146a
IR
4658
4659 /* base + base_offset */
a70d6342 4660 if (loop_vinfo)
5d49b6a7 4661 addr_base = fold_build_pointer_plus (data_ref_base, base_offset);
a70d6342
IR
4662 else
4663 {
70f34814
RG
4664 addr_base = build1 (ADDR_EXPR,
4665 build_pointer_type (TREE_TYPE (DR_REF (dr))),
4666 unshare_expr (DR_REF (dr)));
a70d6342 4667 }
b8698a0f 4668
ebfd146a 4669 vect_ptr_type = build_pointer_type (STMT_VINFO_VECTYPE (stmt_info));
4bdd44c4 4670 dest = vect_get_new_vect_var (vect_ptr_type, vect_pointer_var, base_name);
aed93b23 4671 addr_base = force_gimple_operand (addr_base, &seq, true, dest);
ebfd146a
IR
4672 gimple_seq_add_seq (new_stmt_list, seq);
4673
17fc049f 4674 if (DR_PTR_INFO (dr)
aed93b23
RB
4675 && TREE_CODE (addr_base) == SSA_NAME
4676 && !SSA_NAME_PTR_INFO (addr_base))
128aaeed 4677 {
89fa689a 4678 vect_duplicate_ssa_name_ptr_info (addr_base, dr_info);
faf4220c 4679 if (offset || byte_offset)
4bdd44c4 4680 mark_ptr_info_alignment_unknown (SSA_NAME_PTR_INFO (addr_base));
128aaeed 4681 }
17fc049f 4682
73fbfcad 4683 if (dump_enabled_p ())
3c2a8ed0 4684 dump_printf_loc (MSG_NOTE, vect_location, "created %T\n", addr_base);
8644a673 4685
4bdd44c4 4686 return addr_base;
ebfd146a
IR
4687}
4688
4689
4690/* Function vect_create_data_ref_ptr.
4691
920e8172 4692 Create a new pointer-to-AGGR_TYPE variable (ap), that points to the first
32e8e429 4693 location accessed in the loop by STMT_INFO, along with the def-use update
920e8172
RS
4694 chain to appropriately advance the pointer through the loop iterations.
4695 Also set aliasing information for the pointer. This pointer is used by
4696 the callers to this function to create a memory reference expression for
4697 vector load/store access.
ebfd146a
IR
4698
4699 Input:
32e8e429 4700 1. STMT_INFO: a stmt that references memory. Expected to be of the form
ebfd146a
IR
4701 GIMPLE_ASSIGN <name, data-ref> or
4702 GIMPLE_ASSIGN <data-ref, name>.
920e8172
RS
4703 2. AGGR_TYPE: the type of the reference, which should be either a vector
4704 or an array.
4705 3. AT_LOOP: the loop where the vector memref is to be created.
4706 4. OFFSET (optional): an offset to be added to the initial address accessed
32e8e429 4707 by the data-ref in STMT_INFO.
920e8172
RS
4708 5. BSI: location where the new stmts are to be placed if there is no loop
4709 6. ONLY_INIT: indicate if ap is to be updated in the loop, or remain
ebfd146a 4710 pointing to the initial address.
356bbc4c 4711 7. BYTE_OFFSET (optional, defaults to NULL): a byte offset to be added
32e8e429 4712 to the initial address accessed by the data-ref in STMT_INFO. This is
356bbc4c
JJ
4713 similar to OFFSET, but OFFSET is counted in elements, while BYTE_OFFSET
4714 in bytes.
ab2fc782
RS
4715 8. IV_STEP (optional, defaults to NULL): the amount that should be added
4716 to the IV during each iteration of the loop. NULL says to move
4717 by one copy of AGGR_TYPE up or down, depending on the step of the
4718 data reference.
ebfd146a
IR
4719
4720 Output:
4721 1. Declare a new ptr to vector_type, and have it point to the base of the
4722 data reference (initial address accessed by the data reference).
4723 For example, for vector of type V8HI, the following code is generated:
4724
920e8172
RS
4725 v8hi *ap;
4726 ap = (v8hi *)initial_address;
ebfd146a
IR
4727
4728 if OFFSET is not supplied:
4729 initial_address = &a[init];
4730 if OFFSET is supplied:
4731 initial_address = &a[init + OFFSET];
356bbc4c
JJ
4732 if BYTE_OFFSET is supplied:
4733 initial_address = &a[init] + BYTE_OFFSET;
ebfd146a
IR
4734
4735 Return the initial_address in INITIAL_ADDRESS.
4736
4737 2. If ONLY_INIT is true, just return the initial pointer. Otherwise, also
b8698a0f 4738 update the pointer in each iteration of the loop.
ebfd146a
IR
4739
4740 Return the increment stmt that updates the pointer in PTR_INCR.
4741
2d4bca81 4742 3. Return the pointer. */
ebfd146a
IR
4743
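/* A standalone sketch, not part of GCC, of the pointer scheme this function
   sets up, written as plain C for a copy loop vectorized with eight-element
   vectors.  The v8hi typedef and all other names are illustrative only.  */

#include <string.h>

typedef short v8hi[8];	/* stand-in for the real vector type */

static void
copy_loop (short *dst, const short *src, int n)
{
  /* (2) The initial addresses, computed once before the loop.  */
  const v8hi *sp = (const v8hi *) src;
  v8hi *dp = (v8hi *) dst;

  /* (3) Each aggregate pointer advances by one vector per iteration;
     N is assumed to be a multiple of eight.  */
  for (int i = 0; i < n / 8; i++, sp++, dp++)
    memcpy (*dp, *sp, sizeof (v8hi));
}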
4744tree
308bc496
RB
4745vect_create_data_ref_ptr (vec_info *vinfo, stmt_vec_info stmt_info,
4746 tree aggr_type, class loop *at_loop, tree offset,
32e8e429 4747 tree *initial_address, gimple_stmt_iterator *gsi,
2d4bca81 4748 gimple **ptr_incr, bool only_init,
32e8e429 4749 tree byte_offset, tree iv_step)
ebfd146a 4750{
595c2679 4751 const char *base_name;
308bc496 4752 loop_vec_info loop_vinfo = dyn_cast <loop_vec_info> (vinfo);
99b1c316 4753 class loop *loop = NULL;
a70d6342 4754 bool nested_in_vect_loop = false;
99b1c316 4755 class loop *containing_loop = NULL;
920e8172
RS
4756 tree aggr_ptr_type;
4757 tree aggr_ptr;
ebfd146a 4758 tree new_temp;
ebfd146a 4759 gimple_seq new_stmt_list = NULL;
a70d6342 4760 edge pe = NULL;
ebfd146a 4761 basic_block new_bb;
920e8172 4762 tree aggr_ptr_init;
89fa689a
RS
4763 dr_vec_info *dr_info = STMT_VINFO_DR_INFO (stmt_info);
4764 struct data_reference *dr = dr_info->dr;
920e8172 4765 tree aptr;
ebfd146a
IR
4766 gimple_stmt_iterator incr_gsi;
4767 bool insert_after;
4768 tree indx_before_incr, indx_after_incr;
355fe088 4769 gimple *incr;
308bc496 4770 bb_vec_info bb_vinfo = dyn_cast <bb_vec_info> (vinfo);
b8698a0f 4771
ab2fc782
RS
4772 gcc_assert (iv_step != NULL_TREE
4773 || TREE_CODE (aggr_type) == ARRAY_TYPE
920e8172
RS
4774 || TREE_CODE (aggr_type) == VECTOR_TYPE);
4775
a70d6342
IR
4776 if (loop_vinfo)
4777 {
4778 loop = LOOP_VINFO_LOOP (loop_vinfo);
86a91c0a
RS
4779 nested_in_vect_loop = nested_in_vect_loop_p (loop, stmt_info);
4780 containing_loop = (gimple_bb (stmt_info->stmt))->loop_father;
a70d6342
IR
4781 pe = loop_preheader_edge (loop);
4782 }
4783 else
4784 {
4785 gcc_assert (bb_vinfo);
4786 only_init = true;
4787 *ptr_incr = NULL;
4788 }
b8698a0f 4789
ebfd146a 4790 /* Create an expression for the first address accessed by this load
b8698a0f 4791 in LOOP. */
595c2679 4792 base_name = get_name (DR_BASE_ADDRESS (dr));
ebfd146a 4793
73fbfcad 4794 if (dump_enabled_p ())
ebfd146a 4795 {
595c2679 4796 tree dr_base_type = TREE_TYPE (DR_BASE_OBJECT (dr));
78c60e3d 4797 dump_printf_loc (MSG_NOTE, vect_location,
3c2a8ed0
DM
4798 "create %s-pointer variable to type: %T",
4799 get_tree_code_name (TREE_CODE (aggr_type)),
4800 aggr_type);
595c2679 4801 if (TREE_CODE (dr_base_type) == ARRAY_TYPE)
78c60e3d 4802 dump_printf (MSG_NOTE, " vectorizing an array ref: ");
38000232
MG
4803 else if (TREE_CODE (dr_base_type) == VECTOR_TYPE)
4804 dump_printf (MSG_NOTE, " vectorizing a vector ref: ");
595c2679 4805 else if (TREE_CODE (dr_base_type) == RECORD_TYPE)
78c60e3d 4806 dump_printf (MSG_NOTE, " vectorizing a record based array ref: ");
595c2679 4807 else
78c60e3d 4808 dump_printf (MSG_NOTE, " vectorizing a pointer ref: ");
3c2a8ed0 4809 dump_printf (MSG_NOTE, "%T\n", DR_BASE_OBJECT (dr));
ebfd146a
IR
4810 }
4811
4bdd44c4
RB
4812 /* (1) Create the new aggregate-pointer variable.
4813 Vector and array types inherit the alias set of their component
920e8172
RS
4814 type by default so we need to use a ref-all pointer if the data
4815 reference does not conflict with the created aggregated data
4816 reference because it is not addressable. */
4bdd44c4
RB
4817 bool need_ref_all = false;
4818 if (!alias_sets_conflict_p (get_alias_set (aggr_type),
3f49ba3f 4819 get_alias_set (DR_REF (dr))))
4bdd44c4 4820 need_ref_all = true;
3f49ba3f 4821 /* Likewise for any of the data references in the stmt group. */
2c53b149 4822 else if (DR_GROUP_SIZE (stmt_info) > 1)
ebfd146a 4823 {
bffb8014 4824 stmt_vec_info sinfo = DR_GROUP_FIRST_ELEMENT (stmt_info);
5006671f
RG
4825 do
4826 {
4bdd44c4
RB
4827 struct data_reference *sdr = STMT_VINFO_DATA_REF (sinfo);
4828 if (!alias_sets_conflict_p (get_alias_set (aggr_type),
4829 get_alias_set (DR_REF (sdr))))
5006671f 4830 {
4bdd44c4 4831 need_ref_all = true;
5006671f
RG
4832 break;
4833 }
bffb8014 4834 sinfo = DR_GROUP_NEXT_ELEMENT (sinfo);
5006671f 4835 }
bffb8014 4836 while (sinfo);
ebfd146a 4837 }
4bdd44c4
RB
4838 aggr_ptr_type = build_pointer_type_for_mode (aggr_type, ptr_mode,
4839 need_ref_all);
4840 aggr_ptr = vect_get_new_vect_var (aggr_ptr_type, vect_pointer_var, base_name);
4841
ebfd146a 4842
ff802fa1
IR
4843 /* Note: If the dataref is in an inner-loop nested in LOOP, and we are
4844 vectorizing LOOP (i.e., outer-loop vectorization), we need to create two
4845 def-use update cycles for the pointer: one relative to the outer-loop
4846 (LOOP), which is what steps (3) and (4) below do. The other is relative
4847 to the inner-loop (which is the inner-most loop containing the dataref),
4848 and this is done by step (5) below.
ebfd146a 4849
ff802fa1
IR
4850 When vectorizing inner-most loops, the vectorized loop (LOOP) is also the
4851 inner-most loop, and so steps (3),(4) work the same, and step (5) is
4852 redundant. Steps (3),(4) create the following:
ebfd146a
IR
4853
4854 vp0 = &base_addr;
4855 LOOP: vp1 = phi(vp0,vp2)
b8698a0f 4856 ...
ebfd146a
IR
4857 ...
4858 vp2 = vp1 + step
4859 goto LOOP
b8698a0f 4860
ff802fa1
IR
4861 If there is an inner-loop nested in loop, then step (5) will also be
4862 applied, and an additional update in the inner-loop will be created:
ebfd146a
IR
4863
4864 vp0 = &base_addr;
4865 LOOP: vp1 = phi(vp0,vp2)
4866 ...
4867 inner: vp3 = phi(vp1,vp4)
4868 vp4 = vp3 + inner_step
4869 if () goto inner
4870 ...
4871 vp2 = vp1 + step
4872 if () goto LOOP */
4873
920e8172
RS
4874 /* (2) Calculate the initial address of the aggregate-pointer, and set
4875 the aggregate-pointer to point to it before the loop. */
ebfd146a 4876
356bbc4c 4877 /* Create: (&(base[init_val+offset]+byte_offset) in the loop preheader. */
ebfd146a 4878
308bc496
RB
4879 new_temp = vect_create_addr_base_for_vector_ref (vinfo,
4880 stmt_info, &new_stmt_list,
3f5e8a76 4881 offset, byte_offset);
ebfd146a
IR
4882 if (new_stmt_list)
4883 {
a70d6342
IR
4884 if (pe)
4885 {
4886 new_bb = gsi_insert_seq_on_edge_immediate (pe, new_stmt_list);
4887 gcc_assert (!new_bb);
4888 }
4889 else
1b29f05e 4890 gsi_insert_seq_before (gsi, new_stmt_list, GSI_SAME_STMT);
ebfd146a
IR
4891 }
4892
4893 *initial_address = new_temp;
aed93b23 4894 aggr_ptr_init = new_temp;
ebfd146a 4895
920e8172 4896 /* (3) Handle the updating of the aggregate-pointer inside the loop.
ff802fa1
IR
4897 This is needed when ONLY_INIT is false, and also when AT_LOOP is the
4898 inner-loop nested in LOOP (during outer-loop vectorization). */
ebfd146a 4899
a70d6342 4900 /* No update in loop is required. */
b8698a0f 4901 if (only_init && (!loop_vinfo || at_loop == loop))
920e8172 4902 aptr = aggr_ptr_init;
ebfd146a
IR
4903 else
4904 {
2d4bca81
RS
4905 /* Accesses to invariant addresses should be handled specially
4906 by the caller. */
308bc496 4907 tree step = vect_dr_behavior (vinfo, dr_info)->step;
2d4bca81
RS
4908 gcc_assert (!integer_zerop (step));
4909
ab2fc782
RS
4910 if (iv_step == NULL_TREE)
4911 {
2d4bca81
RS
4912 /* The step of the aggregate pointer is the type size,
4913 negated for downward accesses. */
ab2fc782 4914 iv_step = TYPE_SIZE_UNIT (aggr_type);
2d4bca81 4915 if (tree_int_cst_sgn (step) == -1)
ab2fc782
RS
4916 iv_step = fold_build1 (NEGATE_EXPR, TREE_TYPE (iv_step), iv_step);
4917 }
ebfd146a
IR
4918
4919 standard_iv_increment_position (loop, &incr_gsi, &insert_after);
4920
920e8172 4921 create_iv (aggr_ptr_init,
08940f33 4922 fold_convert (aggr_ptr_type, iv_step),
920e8172 4923 aggr_ptr, loop, &incr_gsi, insert_after,
ebfd146a
IR
4924 &indx_before_incr, &indx_after_incr);
4925 incr = gsi_stmt (incr_gsi);
ebfd146a
IR
4926
4927 /* Copy the points-to information if it exists. */
4928 if (DR_PTR_INFO (dr))
4929 {
89fa689a
RS
4930 vect_duplicate_ssa_name_ptr_info (indx_before_incr, dr_info);
4931 vect_duplicate_ssa_name_ptr_info (indx_after_incr, dr_info);
ebfd146a 4932 }
ebfd146a
IR
4933 if (ptr_incr)
4934 *ptr_incr = incr;
4935
920e8172 4936 aptr = indx_before_incr;
ebfd146a
IR
4937 }
4938
4939 if (!nested_in_vect_loop || only_init)
920e8172 4940 return aptr;
ebfd146a
IR
4941
4942
920e8172 4943 /* (4) Handle the updating of the aggregate-pointer inside the inner-loop
ff802fa1 4944 nested in LOOP, if it exists. */
ebfd146a
IR
4945
4946 gcc_assert (nested_in_vect_loop);
4947 if (!only_init)
4948 {
4949 standard_iv_increment_position (containing_loop, &incr_gsi,
4950 &insert_after);
920e8172 4951 create_iv (aptr, fold_convert (aggr_ptr_type, DR_STEP (dr)), aggr_ptr,
ebfd146a
IR
4952 containing_loop, &incr_gsi, insert_after, &indx_before_incr,
4953 &indx_after_incr);
4954 incr = gsi_stmt (incr_gsi);
ebfd146a
IR
4955
4956 /* Copy the points-to information if it exists. */
4957 if (DR_PTR_INFO (dr))
4958 {
89fa689a
RS
4959 vect_duplicate_ssa_name_ptr_info (indx_before_incr, dr_info);
4960 vect_duplicate_ssa_name_ptr_info (indx_after_incr, dr_info);
ebfd146a 4961 }
ebfd146a
IR
4962 if (ptr_incr)
4963 *ptr_incr = incr;
4964
b8698a0f 4965 return indx_before_incr;
ebfd146a
IR
4966 }
4967 else
4968 gcc_unreachable ();
4969}
4970
4971
4972/* Function bump_vector_ptr
4973
4974 Increment a pointer (to a vector type) by vector-size. If requested,
b8698a0f 4975 i.e. if PTR-INCR is given, then also connect the new increment stmt
ebfd146a
IR
4976 to the existing def-use update-chain of the pointer, by modifying
4977 the PTR_INCR as illustrated below:
4978
4979 The pointer def-use update-chain before this function:
4980 DATAREF_PTR = phi (p_0, p_2)
4981 ....
b8698a0f 4982 PTR_INCR: p_2 = DATAREF_PTR + step
ebfd146a
IR
4983
4984 The pointer def-use update-chain after this function:
4985 DATAREF_PTR = phi (p_0, p_2)
4986 ....
4987 NEW_DATAREF_PTR = DATAREF_PTR + BUMP
4988 ....
4989 PTR_INCR: p_2 = NEW_DATAREF_PTR + step
4990
4991 Input:
b8698a0f 4992 DATAREF_PTR - ssa_name of a pointer (to vector type) that is being updated
ebfd146a 4993 in the loop.
b8698a0f 4994 PTR_INCR - optional. The stmt that updates the pointer in each iteration of
ebfd146a 4995 the loop. The increment amount across iterations is expected
b8698a0f 4996 to be vector_size.
ebfd146a 4997 BSI - location where the new update stmt is to be placed.
32e8e429 4998 STMT_INFO - the original scalar memory-access stmt that is being vectorized.
ebfd146a
IR
4999 BUMP - optional. The offset by which to bump the pointer. If not given,
5000 the offset is assumed to be vector_size.
5001
5002 Output: Return NEW_DATAREF_PTR as illustrated above.
b8698a0f 5003
ebfd146a
IR
5004*/
5005
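/* A standalone sketch, not part of GCC, showing the effect of the bump at
   the source level: a second access in the same iteration goes through
   NEW_DATAREF_PTR = DATAREF_PTR + BUMP, and the cross-iteration increment
   is rewired to start from the bumped pointer.  All names are illustrative
   only.  */

typedef int v4si[4];	/* stand-in for the real vector type */

static int
sum_first_lanes (const int *a, int n)
{
  const v4si *dataref_ptr = (const v4si *) a;
  int sum = 0;
  /* Each iteration consumes two vectors (eight ints), so N is assumed
     to be a multiple of eight.  */
  for (int i = 0; i < n / 8; i++)
    {
      sum += (*dataref_ptr)[0];			      /* use of DATAREF_PTR */
      const v4si *new_dataref_ptr = dataref_ptr + 1;  /* DATAREF_PTR + BUMP */
      sum += (*new_dataref_ptr)[0];
      dataref_ptr = new_dataref_ptr + 1;   /* the increment now starts from
					      the bumped pointer */
    }
  return sum;
}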
5006tree
308bc496
RB
5007bump_vector_ptr (vec_info *vinfo,
5008 tree dataref_ptr, gimple *ptr_incr, gimple_stmt_iterator *gsi,
32e8e429 5009 stmt_vec_info stmt_info, tree bump)
ebfd146a 5010{
ebfd146a
IR
5011 struct data_reference *dr = STMT_VINFO_DATA_REF (stmt_info);
5012 tree vectype = STMT_VINFO_VECTYPE (stmt_info);
ebfd146a 5013 tree update = TYPE_SIZE_UNIT (vectype);
538dd0b7 5014 gassign *incr_stmt;
ebfd146a
IR
5015 ssa_op_iter iter;
5016 use_operand_p use_p;
5017 tree new_dataref_ptr;
5018
5019 if (bump)
5020 update = bump;
b8698a0f 5021
aed93b23
RB
5022 if (TREE_CODE (dataref_ptr) == SSA_NAME)
5023 new_dataref_ptr = copy_ssa_name (dataref_ptr);
5024 else
5025 new_dataref_ptr = make_ssa_name (TREE_TYPE (dataref_ptr));
0d0e4a03
JJ
5026 incr_stmt = gimple_build_assign (new_dataref_ptr, POINTER_PLUS_EXPR,
5027 dataref_ptr, update);
308bc496 5028 vect_finish_stmt_generation (vinfo, stmt_info, incr_stmt, gsi);
ebfd146a
IR
5029
5030 /* Copy the points-to information if it exists. */
5031 if (DR_PTR_INFO (dr))
128aaeed
RB
5032 {
5033 duplicate_ssa_name_ptr_info (new_dataref_ptr, DR_PTR_INFO (dr));
644ffefd 5034 mark_ptr_info_alignment_unknown (SSA_NAME_PTR_INFO (new_dataref_ptr));
128aaeed 5035 }
ebfd146a
IR
5036
5037 if (!ptr_incr)
5038 return new_dataref_ptr;
5039
5040 /* Update the vector-pointer's cross-iteration increment. */
5041 FOR_EACH_SSA_USE_OPERAND (use_p, ptr_incr, iter, SSA_OP_USE)
5042 {
5043 tree use = USE_FROM_PTR (use_p);
5044
5045 if (use == dataref_ptr)
5046 SET_USE (use_p, new_dataref_ptr);
5047 else
ab2fc782 5048 gcc_assert (operand_equal_p (use, update, 0));
ebfd146a
IR
5049 }
5050
5051 return new_dataref_ptr;
5052}
5053
5054
19986382
RB
5055/* Copy memory reference info such as base/clique from the SRC reference
5056 to the DEST MEM_REF. */
5057
5058void
5059vect_copy_ref_info (tree dest, tree src)
5060{
5061 if (TREE_CODE (dest) != MEM_REF)
5062 return;
5063
5064 tree src_base = src;
5065 while (handled_component_p (src_base))
5066 src_base = TREE_OPERAND (src_base, 0);
5067 if (TREE_CODE (src_base) != MEM_REF
5068 && TREE_CODE (src_base) != TARGET_MEM_REF)
5069 return;
5070
5071 MR_DEPENDENCE_CLIQUE (dest) = MR_DEPENDENCE_CLIQUE (src_base);
5072 MR_DEPENDENCE_BASE (dest) = MR_DEPENDENCE_BASE (src_base);
5073}
5074
5075
ebfd146a
IR
5076/* Function vect_create_destination_var.
5077
5078 Create a new temporary of type VECTYPE. */
5079
5080tree
5081vect_create_destination_var (tree scalar_dest, tree vectype)
5082{
5083 tree vec_dest;
451dabda
RB
5084 const char *name;
5085 char *new_name;
ebfd146a
IR
5086 tree type;
5087 enum vect_var_kind kind;
5088
42fd8198
IE
5089 kind = vectype
5090 ? VECTOR_BOOLEAN_TYPE_P (vectype)
5091 ? vect_mask_var
5092 : vect_simple_var
5093 : vect_scalar_var;
ebfd146a
IR
5094 type = vectype ? vectype : TREE_TYPE (scalar_dest);
5095
5096 gcc_assert (TREE_CODE (scalar_dest) == SSA_NAME);
5097
451dabda
RB
5098 name = get_name (scalar_dest);
5099 if (name)
378b2932 5100 new_name = xasprintf ("%s_%u", name, SSA_NAME_VERSION (scalar_dest));
451dabda 5101 else
378b2932 5102 new_name = xasprintf ("_%u", SSA_NAME_VERSION (scalar_dest));
ebfd146a 5103 vec_dest = vect_get_new_vect_var (type, kind, new_name);
451dabda 5104 free (new_name);
ebfd146a
IR
5105
5106 return vec_dest;
5107}
5108
0d0293ac 5109/* Function vect_grouped_store_supported.
ebfd146a 5110
e2c83630
RH
5111 Returns TRUE if interleave high and interleave low permutations
5112 are supported, and FALSE otherwise. */
ebfd146a
IR
5113
5114bool
0d0293ac 5115vect_grouped_store_supported (tree vectype, unsigned HOST_WIDE_INT count)
ebfd146a 5116{
ef4bddc2 5117 machine_mode mode = TYPE_MODE (vectype);
b8698a0f 5118
e1377713
ES
5119 /* vect_permute_store_chain requires the group size to be equal to 3 or
5120 be a power of two. */
5121 if (count != 3 && exact_log2 (count) == -1)
b602d918 5122 {
73fbfcad 5123 if (dump_enabled_p ())
78c60e3d 5124 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
e1377713
ES
5125 "the size of the group of accesses"
5126 " is not a power of 2 or not equal to 3\n");
b602d918
RS
5127 return false;
5128 }
5129
e2c83630 5130 /* Check that the permutation is supported. */
3fcc1b55
JJ
5131 if (VECTOR_MODE_P (mode))
5132 {
7b777afa 5133 unsigned int i;
e1377713 5134 if (count == 3)
3fcc1b55 5135 {
e1377713
ES
5136 unsigned int j0 = 0, j1 = 0, j2 = 0;
5137 unsigned int i, j;
5138
7b777afa
RS
5139 unsigned int nelt;
5140 if (!GET_MODE_NUNITS (mode).is_constant (&nelt))
5141 {
5142 if (dump_enabled_p ())
5143 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
5144 "cannot handle groups of 3 stores for"
5145 " variable-length vectors\n");
5146 return false;
5147 }
5148
d980067b
RS
5149 vec_perm_builder sel (nelt, nelt, 1);
5150 sel.quick_grow (nelt);
e3342de4 5151 vec_perm_indices indices;
e1377713
ES
5152 for (j = 0; j < 3; j++)
5153 {
5154 int nelt0 = ((3 - j) * nelt) % 3;
5155 int nelt1 = ((3 - j) * nelt + 1) % 3;
5156 int nelt2 = ((3 - j) * nelt + 2) % 3;
5157 for (i = 0; i < nelt; i++)
5158 {
5159 if (3 * i + nelt0 < nelt)
5160 sel[3 * i + nelt0] = j0++;
5161 if (3 * i + nelt1 < nelt)
5162 sel[3 * i + nelt1] = nelt + j1++;
5163 if (3 * i + nelt2 < nelt)
5164 sel[3 * i + nelt2] = 0;
5165 }
e3342de4
RS
5166 indices.new_vector (sel, 2, nelt);
5167 if (!can_vec_perm_const_p (mode, indices))
e1377713
ES
5168 {
5169 if (dump_enabled_p ())
5170 dump_printf (MSG_MISSED_OPTIMIZATION,
7ac7e286 5171 "permutation op not supported by target.\n");
e1377713
ES
5172 return false;
5173 }
5174
5175 for (i = 0; i < nelt; i++)
5176 {
5177 if (3 * i + nelt0 < nelt)
5178 sel[3 * i + nelt0] = 3 * i + nelt0;
5179 if (3 * i + nelt1 < nelt)
5180 sel[3 * i + nelt1] = 3 * i + nelt1;
5181 if (3 * i + nelt2 < nelt)
5182 sel[3 * i + nelt2] = nelt + j2++;
5183 }
e3342de4
RS
5184 indices.new_vector (sel, 2, nelt);
5185 if (!can_vec_perm_const_p (mode, indices))
e1377713
ES
5186 {
5187 if (dump_enabled_p ())
5188 dump_printf (MSG_MISSED_OPTIMIZATION,
7ac7e286 5189 "permutation op not supported by target.\n");
e1377713
ES
5190 return false;
5191 }
5192 }
5193 return true;
3fcc1b55 5194 }
e1377713 5195 else
3fcc1b55 5196 {
e1377713 5197 /* If length is not equal to 3 then only power of 2 is supported. */
146ec50f 5198 gcc_assert (pow2p_hwi (count));
7b777afa 5199 poly_uint64 nelt = GET_MODE_NUNITS (mode);
e1377713 5200
d980067b
RS
5201 /* The encoding has 2 interleaved stepped patterns. */
5202 vec_perm_builder sel (nelt, 2, 3);
5203 sel.quick_grow (6);
5204 for (i = 0; i < 3; i++)
e1377713
ES
5205 {
5206 sel[i * 2] = i;
5207 sel[i * 2 + 1] = i + nelt;
5208 }
e3342de4
RS
5209 vec_perm_indices indices (sel, 2, nelt);
5210 if (can_vec_perm_const_p (mode, indices))
908a1a16 5211 {
d980067b 5212 for (i = 0; i < 6; i++)
7b777afa 5213 sel[i] += exact_div (nelt, 2);
e3342de4
RS
5214 indices.new_vector (sel, 2, nelt);
5215 if (can_vec_perm_const_p (mode, indices))
908a1a16
RS
5216 return true;
5217 }
3fcc1b55
JJ
5218 }
5219 }
ebfd146a 5220
73fbfcad 5221 if (dump_enabled_p ())
78c60e3d 5222 dump_printf (MSG_MISSED_OPTIMIZATION,
429ca5b4 5223 "permutation op not supported by target.\n");
a6b3dfde 5224 return false;
ebfd146a
IR
5225}
5226
5227
7e11fc7f
RS
5228/* Return TRUE if vec_{mask_}store_lanes is available for COUNT vectors of
5229 type VECTYPE. MASKED_P says whether the masked form is needed. */
272c6793
RS
5230
5231bool
7e11fc7f
RS
5232vect_store_lanes_supported (tree vectype, unsigned HOST_WIDE_INT count,
5233 bool masked_p)
272c6793 5234{
7e11fc7f
RS
5235 if (masked_p)
5236 return vect_lanes_optab_supported_p ("vec_mask_store_lanes",
5237 vec_mask_store_lanes_optab,
5238 vectype, count);
5239 else
5240 return vect_lanes_optab_supported_p ("vec_store_lanes",
5241 vec_store_lanes_optab,
5242 vectype, count);
272c6793
RS
5243}
5244
5245
ebfd146a
IR
5246/* Function vect_permute_store_chain.
5247
5248 Given a chain of interleaved stores in DR_CHAIN of LENGTH that must be
e1377713
ES
5249 a power of 2 or equal to 3, generate interleave_high/low stmts to reorder
5250 the data correctly for the stores. Return the final references for stores
5251 in RESULT_CHAIN.
ebfd146a
IR
5252
5253 E.g., LENGTH is 4 and the scalar type is short, i.e., VF is 8.
ff802fa1
IR
5254 The input is 4 vectors each containing 8 elements. We assign a number to
5255 each element, the input sequence is:
ebfd146a
IR
5256
5257 1st vec: 0 1 2 3 4 5 6 7
5258 2nd vec: 8 9 10 11 12 13 14 15
b8698a0f 5259 3rd vec: 16 17 18 19 20 21 22 23
ebfd146a
IR
5260 4th vec: 24 25 26 27 28 29 30 31
5261
5262 The output sequence should be:
5263
5264 1st vec: 0 8 16 24 1 9 17 25
5265 2nd vec: 2 10 18 26 3 11 19 27
5266 3rd vec: 4 12 20 28 5 13 21 29
5267 4th vec: 6 14 22 30 7 15 23 31
5268
5269 i.e., we interleave the contents of the four vectors in their order.
5270
ff802fa1 5271 We use interleave_high/low instructions to create such output. The input of
ebfd146a 5272 each interleave_high/low operation is two vectors:
b8698a0f
L
5273 1st vec 2nd vec
5274 0 1 2 3 4 5 6 7
5275 the even elements of the result vector are obtained left-to-right from the
ff802fa1 5276 high/low elements of the first vector. The odd elements of the result are
ebfd146a
IR
5277 obtained left-to-right from the high/low elements of the second vector.
5278 The output of interleave_high will be: 0 4 1 5
5279 and of interleave_low: 2 6 3 7
5280
b8698a0f 5281
ff802fa1 5282 The permutation is done in log LENGTH stages. In each stage interleave_high
b8698a0f
L
5283 and interleave_low stmts are created for each pair of vectors in DR_CHAIN,
5284 where the first argument is taken from the first half of DR_CHAIN and the
5285 second argument from its second half.
5286 In our example,
ebfd146a
IR
5287
5288 I1: interleave_high (1st vec, 3rd vec)
5289 I2: interleave_low (1st vec, 3rd vec)
5290 I3: interleave_high (2nd vec, 4th vec)
5291 I4: interleave_low (2nd vec, 4th vec)
5292
5293 The output for the first stage is:
5294
5295 I1: 0 16 1 17 2 18 3 19
5296 I2: 4 20 5 21 6 22 7 23
5297 I3: 8 24 9 25 10 26 11 27
5298 I4: 12 28 13 29 14 30 15 31
5299
5300 The output of the second stage, i.e. the final result is:
5301
5302 I1: 0 8 16 24 1 9 17 25
5303 I2: 2 10 18 26 3 11 19 27
5304 I3: 4 12 20 28 5 13 21 29
5305 I4: 6 14 22 30 7 15 23 31. */
b8698a0f 5306
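/* A standalone sketch, not part of GCC, that models the interleave_high/low
   reordering described above for LENGTH = 4 chains of 8 elements; plain
   arrays stand in for the vector registers and all names are illustrative
   only.  Running it prints the final output sequence shown above.  */

#include <stdio.h>
#include <string.h>

#define NELT 8
#define LEN 4

/* HIGH receives the first halves of A and B interleaved, LOW the
   second halves.  */
static void
interleave (const int *a, const int *b, int *high, int *low)
{
  for (int i = 0; i < NELT / 2; i++)
    {
      high[2 * i] = a[i];
      high[2 * i + 1] = b[i];
      low[2 * i] = a[i + NELT / 2];
      low[2 * i + 1] = b[i + NELT / 2];
    }
}

int
main (void)
{
  int chain[LEN][NELT], result[LEN][NELT];
  for (int i = 0; i < LEN * NELT; i++)
    chain[i / NELT][i % NELT] = i;	/* 0 1 2 ... 31 */

  /* log2 (LEN) stages; vector J is paired with vector J + LEN/2, the
     high result goes to slot 2*J and the low result to slot 2*J + 1.  */
  for (int stage = 0; stage < 2; stage++)
    {
      for (int j = 0; j < LEN / 2; j++)
	interleave (chain[j], chain[j + LEN / 2],
		    result[2 * j], result[2 * j + 1]);
      memcpy (chain, result, sizeof (chain));
    }

  for (int j = 0; j < LEN; j++)
    {
      for (int i = 0; i < NELT; i++)
	printf (" %2d", chain[j][i]);
      printf ("\n");	/* 0 8 16 24 1 9 17 25, then 2 10 18 26 ..., etc.  */
    }
  return 0;
}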
b602d918 5307void
308bc496 5308vect_permute_store_chain (vec_info *vinfo, vec<tree> dr_chain,
b8698a0f 5309 unsigned int length,
32e8e429 5310 stmt_vec_info stmt_info,
ebfd146a 5311 gimple_stmt_iterator *gsi,
9771b263 5312 vec<tree> *result_chain)
ebfd146a 5313{
83d5977e 5314 tree vect1, vect2, high, low;
355fe088 5315 gimple *perm_stmt;
91987857 5316 tree vectype = STMT_VINFO_VECTYPE (stmt_info);
3fcc1b55 5317 tree perm_mask_low, perm_mask_high;
e1377713
ES
5318 tree data_ref;
5319 tree perm3_mask_low, perm3_mask_high;
edab8e10 5320 unsigned int i, j, n, log_length = exact_log2 (length);
908a1a16 5321
b6b9227d
JJ
5322 result_chain->quick_grow (length);
5323 memcpy (result_chain->address (), dr_chain.address (),
5324 length * sizeof (tree));
ebfd146a 5325
e1377713 5326 if (length == 3)
3fcc1b55 5327 {
edab8e10 5328 /* vect_grouped_store_supported ensures that this is constant. */
928686b1 5329 unsigned int nelt = TYPE_VECTOR_SUBPARTS (vectype).to_constant ();
e1377713 5330 unsigned int j0 = 0, j1 = 0, j2 = 0;
e2c83630 5331
d980067b
RS
5332 vec_perm_builder sel (nelt, nelt, 1);
5333 sel.quick_grow (nelt);
e3342de4 5334 vec_perm_indices indices;
e1377713
ES
5335 for (j = 0; j < 3; j++)
5336 {
5337 int nelt0 = ((3 - j) * nelt) % 3;
5338 int nelt1 = ((3 - j) * nelt + 1) % 3;
5339 int nelt2 = ((3 - j) * nelt + 2) % 3;
3fcc1b55 5340
e1377713
ES
5341 for (i = 0; i < nelt; i++)
5342 {
5343 if (3 * i + nelt0 < nelt)
5344 sel[3 * i + nelt0] = j0++;
5345 if (3 * i + nelt1 < nelt)
5346 sel[3 * i + nelt1] = nelt + j1++;
5347 if (3 * i + nelt2 < nelt)
5348 sel[3 * i + nelt2] = 0;
5349 }
e3342de4
RS
5350 indices.new_vector (sel, 2, nelt);
5351 perm3_mask_low = vect_gen_perm_mask_checked (vectype, indices);
e1377713
ES
5352
5353 for (i = 0; i < nelt; i++)
5354 {
5355 if (3 * i + nelt0 < nelt)
5356 sel[3 * i + nelt0] = 3 * i + nelt0;
5357 if (3 * i + nelt1 < nelt)
5358 sel[3 * i + nelt1] = 3 * i + nelt1;
5359 if (3 * i + nelt2 < nelt)
5360 sel[3 * i + nelt2] = nelt + j2++;
5361 }
e3342de4
RS
5362 indices.new_vector (sel, 2, nelt);
5363 perm3_mask_high = vect_gen_perm_mask_checked (vectype, indices);
e1377713
ES
5364
5365 vect1 = dr_chain[0];
5366 vect2 = dr_chain[1];
ebfd146a
IR
5367
5368 /* Create interleaving stmt:
e1377713
ES
5369 low = VEC_PERM_EXPR <vect1, vect2,
5370 {j, nelt, *, j + 1, nelt + j + 1, *,
5371 j + 2, nelt + j + 2, *, ...}> */
5372 data_ref = make_temp_ssa_name (vectype, NULL, "vect_shuffle3_low");
0d0e4a03
JJ
5373 perm_stmt = gimple_build_assign (data_ref, VEC_PERM_EXPR, vect1,
5374 vect2, perm3_mask_low);
308bc496 5375 vect_finish_stmt_generation (vinfo, stmt_info, perm_stmt, gsi);
ebfd146a 5376
e1377713
ES
5377 vect1 = data_ref;
5378 vect2 = dr_chain[2];
ebfd146a 5379 /* Create interleaving stmt:
e1377713
ES
5380 low = VEC_PERM_EXPR <vect1, vect2,
5381 {0, 1, nelt + j, 3, 4, nelt + j + 1,
5382 6, 7, nelt + j + 2, ...}> */
5383 data_ref = make_temp_ssa_name (vectype, NULL, "vect_shuffle3_high");
0d0e4a03
JJ
5384 perm_stmt = gimple_build_assign (data_ref, VEC_PERM_EXPR, vect1,
5385 vect2, perm3_mask_high);
308bc496 5386 vect_finish_stmt_generation (vinfo, stmt_info, perm_stmt, gsi);
e1377713 5387 (*result_chain)[j] = data_ref;
ebfd146a 5388 }
e1377713
ES
5389 }
5390 else
5391 {
5392 /* If length is not equal to 3 then only power of 2 is supported. */
146ec50f 5393 gcc_assert (pow2p_hwi (length));
e1377713 5394
d980067b 5395 /* The encoding has 2 interleaved stepped patterns. */
928686b1 5396 poly_uint64 nelt = TYPE_VECTOR_SUBPARTS (vectype);
d980067b
RS
5397 vec_perm_builder sel (nelt, 2, 3);
5398 sel.quick_grow (6);
5399 for (i = 0; i < 3; i++)
e1377713
ES
5400 {
5401 sel[i * 2] = i;
5402 sel[i * 2 + 1] = i + nelt;
5403 }
e3342de4
RS
5404 vec_perm_indices indices (sel, 2, nelt);
5405 perm_mask_high = vect_gen_perm_mask_checked (vectype, indices);
e1377713 5406
d980067b 5407 for (i = 0; i < 6; i++)
928686b1 5408 sel[i] += exact_div (nelt, 2);
e3342de4
RS
5409 indices.new_vector (sel, 2, nelt);
5410 perm_mask_low = vect_gen_perm_mask_checked (vectype, indices);
e1377713
ES
5411
5412 for (i = 0, n = log_length; i < n; i++)
5413 {
5414 for (j = 0; j < length/2; j++)
5415 {
5416 vect1 = dr_chain[j];
5417 vect2 = dr_chain[j+length/2];
5418
5419 /* Create interleaving stmt:
5420 high = VEC_PERM_EXPR <vect1, vect2, {0, nelt, 1, nelt+1,
5421 ...}> */
5422 high = make_temp_ssa_name (vectype, NULL, "vect_inter_high");
0d0e4a03
JJ
5423 perm_stmt = gimple_build_assign (high, VEC_PERM_EXPR, vect1,
5424 vect2, perm_mask_high);
308bc496 5425 vect_finish_stmt_generation (vinfo, stmt_info, perm_stmt, gsi);
e1377713
ES
5426 (*result_chain)[2*j] = high;
5427
5428 /* Create interleaving stmt:
5429 low = VEC_PERM_EXPR <vect1, vect2,
5430 {nelt/2, nelt*3/2, nelt/2+1, nelt*3/2+1,
5431 ...}> */
5432 low = make_temp_ssa_name (vectype, NULL, "vect_inter_low");
0d0e4a03
JJ
5433 perm_stmt = gimple_build_assign (low, VEC_PERM_EXPR, vect1,
5434 vect2, perm_mask_low);
308bc496 5435 vect_finish_stmt_generation (vinfo, stmt_info, perm_stmt, gsi);
e1377713
ES
5436 (*result_chain)[2*j+1] = low;
5437 }
5438 memcpy (dr_chain.address (), result_chain->address (),
5439 length * sizeof (tree));
5440 }
ebfd146a 5441 }
ebfd146a
IR
5442}
5443
5444/* Function vect_setup_realignment
b8698a0f 5445
ebfd146a
IR
5446 This function is called when vectorizing an unaligned load using
5447 the dr_explicit_realign[_optimized] scheme.
5448 This function generates the following code at the loop prolog:
5449
5450 p = initial_addr;
5451 x msq_init = *(floor(p)); # prolog load
b8698a0f 5452 realignment_token = call target_builtin;
ebfd146a
IR
5453 loop:
5454 x msq = phi (msq_init, ---)
5455
b8698a0f 5456 The stmts marked with x are generated only for the case of
ebfd146a
IR
5457 dr_explicit_realign_optimized.
5458
b8698a0f 5459 The code above sets up a new (vector) pointer, pointing to the first
32e8e429
RS
5460 location accessed by STMT_INFO, and a "floor-aligned" load using that
5461 pointer. It also generates code to compute the "realignment-token"
5462 (if the relevant target hook was defined), and creates a phi-node at the
5463 loop-header bb whose arguments are the result of the prolog-load (created
5464 by this function) and the result of a load that takes place in the loop
5465 (to be created by the caller to this function).
ebfd146a
IR
5466
5467 For the case of dr_explicit_realign_optimized:
b8698a0f 5468 The caller to this function uses the phi-result (msq) to create the
ebfd146a
IR
5469 realignment code inside the loop, and sets up the missing phi argument,
5470 as follows:
b8698a0f 5471 loop:
ebfd146a
IR
5472 msq = phi (msq_init, lsq)
5473 lsq = *(floor(p')); # load in loop
5474 result = realign_load (msq, lsq, realignment_token);
5475
5476 For the case of dr_explicit_realign:
5477 loop:
5478 msq = *(floor(p)); # load in loop
5479 p' = p + (VS-1);
5480 lsq = *(floor(p')); # load in loop
5481 result = realign_load (msq, lsq, realignment_token);
5482
5483 Input:
32e8e429
RS
5484 STMT_INFO - (scalar) load stmt to be vectorized. This load accesses
5485 a memory location that may be unaligned.
ebfd146a
IR
5486 BSI - place where new code is to be inserted.
5487 ALIGNMENT_SUPPORT_SCHEME - which of the two misalignment handling schemes
b8698a0f
L
5488 is used.
5489
ebfd146a
IR
5490 Output:
5491 REALIGNMENT_TOKEN - the result of a call to the builtin_mask_for_load
5492 target hook, if defined.
5493 Return value - the result of the loop-header phi node. */
5494
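/* A standalone sketch, not part of GCC, of the dr_explicit_realign idea for
   vectors of four ints: two aligned loads bracket the unaligned address and
   the wanted elements are picked out of that pair, which is the job of
   REALIGN_LOAD and the realignment token.  The scalar modelling and all
   names are illustrative only.  */

#include <stdint.h>

#define VS 4	/* elements per vector */

static void
realign_load (const int *p, int result[VS])
{
  /* floor (p): round the address down to the vector alignment.  */
  const int *aligned
    = (const int *) ((uintptr_t) p & -(uintptr_t) (VS * sizeof (int)));
  unsigned int shift = p - aligned;	/* plays the role of the token */

  const int *msq = aligned;		/* first aligned vector */
  const int *lsq = aligned + VS;	/* second aligned vector */

  for (unsigned int i = 0; i < VS; i++)
    result[i] = shift + i < VS ? msq[shift + i] : lsq[shift + i - VS];
}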
5495tree
308bc496
RB
5496vect_setup_realignment (vec_info *vinfo, stmt_vec_info stmt_info,
5497 gimple_stmt_iterator *gsi, tree *realignment_token,
ebfd146a
IR
5498 enum dr_alignment_support alignment_support_scheme,
5499 tree init_addr,
99b1c316 5500 class loop **at_loop)
ebfd146a 5501{
ebfd146a 5502 tree vectype = STMT_VINFO_VECTYPE (stmt_info);
308bc496 5503 loop_vec_info loop_vinfo = dyn_cast <loop_vec_info> (vinfo);
89fa689a
RS
5504 dr_vec_info *dr_info = STMT_VINFO_DR_INFO (stmt_info);
5505 struct data_reference *dr = dr_info->dr;
99b1c316 5506 class loop *loop = NULL;
69f11a13 5507 edge pe = NULL;
86a91c0a 5508 tree scalar_dest = gimple_assign_lhs (stmt_info->stmt);
ebfd146a 5509 tree vec_dest;
355fe088 5510 gimple *inc;
ebfd146a
IR
5511 tree ptr;
5512 tree data_ref;
ebfd146a
IR
5513 basic_block new_bb;
5514 tree msq_init = NULL_TREE;
5515 tree new_temp;
538dd0b7 5516 gphi *phi_stmt;
ebfd146a
IR
5517 tree msq = NULL_TREE;
5518 gimple_seq stmts = NULL;
ebfd146a 5519 bool compute_in_loop = false;
69f11a13 5520 bool nested_in_vect_loop = false;
99b1c316
MS
5521 class loop *containing_loop = (gimple_bb (stmt_info->stmt))->loop_father;
5522 class loop *loop_for_initial_load = NULL;
69f11a13
IR
5523
5524 if (loop_vinfo)
5525 {
5526 loop = LOOP_VINFO_LOOP (loop_vinfo);
86a91c0a 5527 nested_in_vect_loop = nested_in_vect_loop_p (loop, stmt_info);
69f11a13 5528 }
ebfd146a
IR
5529
5530 gcc_assert (alignment_support_scheme == dr_explicit_realign
5531 || alignment_support_scheme == dr_explicit_realign_optimized);
5532
5533 /* We need to generate three things:
5534 1. the misalignment computation
5535 2. the extra vector load (for the optimized realignment scheme).
5536 3. the phi node for the two vectors from which the realignment is
ff802fa1 5537 done (for the optimized realignment scheme). */
ebfd146a
IR
5538
5539 /* 1. Determine where to generate the misalignment computation.
5540
5541 If INIT_ADDR is NULL_TREE, this indicates that the misalignment
5542 calculation will be generated by this function, outside the loop (in the
5543 preheader). Otherwise, INIT_ADDR had already been computed for us by the
5544 caller, inside the loop.
5545
5546 Background: If the misalignment remains fixed throughout the iterations of
5547 the loop, then both realignment schemes are applicable, and also the
5548 misalignment computation can be done outside LOOP. This is because we are
5549 vectorizing LOOP, and so the memory accesses in LOOP advance in steps that
5550 are a multiple of VS (the Vector Size), and therefore the misalignment in
5551 different vectorized LOOP iterations is always the same.
5552 The problem arises only if the memory access is in an inner-loop nested
5553 inside LOOP, which is now being vectorized using outer-loop vectorization.
5554 This is the only case when the misalignment of the memory access may not
5555 remain fixed throughout the iterations of the inner-loop (as explained in
5556 detail in vect_supportable_dr_alignment). In this case, not only is the
5557 optimized realignment scheme not applicable, but also the misalignment
5558 computation (and generation of the realignment token that is passed to
5559 REALIGN_LOAD) has to be done inside the loop.
5560
5561 In short, INIT_ADDR indicates whether we are in a COMPUTE_IN_LOOP mode
5562 or not, which in turn determines if the misalignment is computed inside
5563 the inner-loop, or outside LOOP. */
5564
69f11a13 5565 if (init_addr != NULL_TREE || !loop_vinfo)
ebfd146a
IR
5566 {
5567 compute_in_loop = true;
5568 gcc_assert (alignment_support_scheme == dr_explicit_realign);
5569 }
5570
5571
5572 /* 2. Determine where to generate the extra vector load.
5573
5574 For the optimized realignment scheme, instead of generating two vector
5575 loads in each iteration, we generate a single extra vector load in the
5576 preheader of the loop, and in each iteration reuse the result of the
5577 vector load from the previous iteration. In case the memory access is in
5578 an inner-loop nested inside LOOP, which is now being vectorized using
5579 outer-loop vectorization, we need to determine whether this initial vector
5580 load should be generated at the preheader of the inner-loop, or can be
5581 generated at the preheader of LOOP. If the memory access has no evolution
5582 in LOOP, it can be generated in the preheader of LOOP. Otherwise, it has
5583 to be generated inside LOOP (in the preheader of the inner-loop). */
5584
5585 if (nested_in_vect_loop)
5586 {
5587 tree outerloop_step = STMT_VINFO_DR_STEP (stmt_info);
5588 bool invariant_in_outerloop =
5589 (tree_int_cst_compare (outerloop_step, size_zero_node) == 0);
5590 loop_for_initial_load = (invariant_in_outerloop ? loop : loop->inner);
5591 }
5592 else
5593 loop_for_initial_load = loop;
5594 if (at_loop)
5595 *at_loop = loop_for_initial_load;
5596
69f11a13
IR
5597 if (loop_for_initial_load)
5598 pe = loop_preheader_edge (loop_for_initial_load);
5599
ebfd146a
IR
5600 /* 3. For the case of the optimized realignment, create the first vector
5601 load at the loop preheader. */
5602
5603 if (alignment_support_scheme == dr_explicit_realign_optimized)
5604 {
5605 /* Create msq_init = *(floor(p1)) in the loop preheader */
538dd0b7 5606 gassign *new_stmt;
ebfd146a
IR
5607
5608 gcc_assert (!compute_in_loop);
ebfd146a 5609 vec_dest = vect_create_destination_var (scalar_dest, vectype);
308bc496 5610 ptr = vect_create_data_ref_ptr (vinfo, stmt_info, vectype,
86a91c0a 5611 loop_for_initial_load, NULL_TREE,
2d4bca81 5612 &init_addr, NULL, &inc, true);
b89dfa17
RB
5613 if (TREE_CODE (ptr) == SSA_NAME)
5614 new_temp = copy_ssa_name (ptr);
5615 else
5616 new_temp = make_ssa_name (TREE_TYPE (ptr));
ca31798e
AV
5617 poly_uint64 align = DR_TARGET_ALIGNMENT (dr_info);
5618 tree type = TREE_TYPE (ptr);
0d0e4a03
JJ
5619 new_stmt = gimple_build_assign
5620 (new_temp, BIT_AND_EXPR, ptr,
ca31798e
AV
5621 fold_build2 (MINUS_EXPR, type,
5622 build_int_cst (type, 0),
5623 build_int_cst (type, align)));
75421dcd
RG
5624 new_bb = gsi_insert_on_edge_immediate (pe, new_stmt);
5625 gcc_assert (!new_bb);
20ede5c6
RG
5626 data_ref
5627 = build2 (MEM_REF, TREE_TYPE (vec_dest), new_temp,
5628 build_int_cst (reference_alias_ptr_type (DR_REF (dr)), 0));
19986382 5629 vect_copy_ref_info (data_ref, DR_REF (dr));
ebfd146a
IR
5630 new_stmt = gimple_build_assign (vec_dest, data_ref);
5631 new_temp = make_ssa_name (vec_dest, new_stmt);
5632 gimple_assign_set_lhs (new_stmt, new_temp);
69f11a13
IR
5633 if (pe)
5634 {
5635 new_bb = gsi_insert_on_edge_immediate (pe, new_stmt);
5636 gcc_assert (!new_bb);
5637 }
5638 else
5639 gsi_insert_before (gsi, new_stmt, GSI_SAME_STMT);
5640
ebfd146a
IR
5641 msq_init = gimple_assign_lhs (new_stmt);
5642 }
5643
5644 /* 4. Create realignment token using a target builtin, if available.
5645 It is done either inside the containing loop, or before LOOP (as
5646 determined above). */
5647
5648 if (targetm.vectorize.builtin_mask_for_load)
5649 {
538dd0b7 5650 gcall *new_stmt;
ebfd146a
IR
5651 tree builtin_decl;
5652
5653 /* Compute INIT_ADDR - the initial address accessed by this memref. */
69f11a13 5654 if (!init_addr)
ebfd146a
IR
5655 {
5656 /* Generate the INIT_ADDR computation outside LOOP. */
308bc496
RB
5657 init_addr = vect_create_addr_base_for_vector_ref (vinfo,
5658 stmt_info, &stmts,
3f5e8a76 5659 NULL_TREE);
69f11a13
IR
5660 if (loop)
5661 {
5662 pe = loop_preheader_edge (loop);
5663 new_bb = gsi_insert_seq_on_edge_immediate (pe, stmts);
5664 gcc_assert (!new_bb);
5665 }
5666 else
5667 gsi_insert_seq_before (gsi, stmts, GSI_SAME_STMT);
ebfd146a
IR
5668 }
5669
5670 builtin_decl = targetm.vectorize.builtin_mask_for_load ();
5671 new_stmt = gimple_build_call (builtin_decl, 1, init_addr);
5672 vec_dest =
5673 vect_create_destination_var (scalar_dest,
5674 gimple_call_return_type (new_stmt));
5675 new_temp = make_ssa_name (vec_dest, new_stmt);
5676 gimple_call_set_lhs (new_stmt, new_temp);
5677
5678 if (compute_in_loop)
5679 gsi_insert_before (gsi, new_stmt, GSI_SAME_STMT);
5680 else
5681 {
5682 /* Generate the misalignment computation outside LOOP. */
5683 pe = loop_preheader_edge (loop);
5684 new_bb = gsi_insert_on_edge_immediate (pe, new_stmt);
5685 gcc_assert (!new_bb);
5686 }
5687
5688 *realignment_token = gimple_call_lhs (new_stmt);
5689
5690 /* The result of the CALL_EXPR to this builtin is determined from
5691 the value of the parameter and no global variables are touched
5692 which makes the builtin a "const" function. Requiring the
5693 builtin to have the "const" attribute makes it unnecessary
5694 to call mark_call_clobbered. */
5695 gcc_assert (TREE_READONLY (builtin_decl));
5696 }
5697
5698 if (alignment_support_scheme == dr_explicit_realign)
5699 return msq;
5700
5701 gcc_assert (!compute_in_loop);
5702 gcc_assert (alignment_support_scheme == dr_explicit_realign_optimized);
5703
5704
5705 /* 5. Create msq = phi <msq_init, lsq> in loop */
5706
5707 pe = loop_preheader_edge (containing_loop);
5708 vec_dest = vect_create_destination_var (scalar_dest, vectype);
b731b390 5709 msq = make_ssa_name (vec_dest);
ebfd146a 5710 phi_stmt = create_phi_node (msq, containing_loop->header);
9e227d60 5711 add_phi_arg (phi_stmt, msq_init, pe, UNKNOWN_LOCATION);
ebfd146a
IR
5712
5713 return msq;
5714}
5715
5716
0d0293ac 5717/* Function vect_grouped_load_supported.
ebfd146a 5718
4fb8ba9d
RS
5719 COUNT is the size of the load group (the number of statements plus the
5720 number of gaps). SINGLE_ELEMENT_P is true if there is actually
5721 only one statement, with a gap of COUNT - 1.
5722
5723 Returns true if a suitable permute exists. */
ebfd146a
IR
5724
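/* A standalone sketch, not part of GCC, of what COUNT and SINGLE_ELEMENT_P
   describe: the scalar loop below reads one element out of every group of
   four, so the load group has COUNT = 4 (one statement plus a gap of three)
   and SINGLE_ELEMENT_P is true.  The names are illustrative only, and A is
   assumed to provide at least 4 * N - 3 elements.  */

static int
sum_every_fourth (const int *a, int n)
{
  int sum = 0;
  for (int i = 0; i < n; i++)
    sum += a[4 * i];	/* group of size 4, gap of 3 */
  return sum;
}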
5725bool
4fb8ba9d
RS
5726vect_grouped_load_supported (tree vectype, bool single_element_p,
5727 unsigned HOST_WIDE_INT count)
ebfd146a 5728{
ef4bddc2 5729 machine_mode mode = TYPE_MODE (vectype);
ebfd146a 5730
4fb8ba9d
RS
5731 /* If this is single-element interleaving with an element distance
5732 that leaves unused vector loads around, punt - we at least create
5733 very sub-optimal code in that case (and blow up memory,
5734 see PR65518). */
928686b1 5735 if (single_element_p && maybe_gt (count, TYPE_VECTOR_SUBPARTS (vectype)))
4fb8ba9d
RS
5736 {
5737 if (dump_enabled_p ())
5738 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
5739 "single-element interleaving not supported "
5740 "for non-adjacent vector loads\n");
5741 return false;
5742 }
5743
2c23db6d
ES
5744 /* vect_permute_load_chain requires the group size to be equal to 3 or
5745 be a power of two. */
5746 if (count != 3 && exact_log2 (count) == -1)
b602d918 5747 {
73fbfcad 5748 if (dump_enabled_p ())
78c60e3d 5749 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2c23db6d
ES
5750 "the size of the group of accesses"
5751 " is not a power of 2 or not equal to 3\n");
b602d918
RS
5752 return false;
5753 }
5754
e2c83630
RH
5755 /* Check that the permutation is supported. */
5756 if (VECTOR_MODE_P (mode))
5757 {
7b777afa 5758 unsigned int i, j;
2c23db6d 5759 if (count == 3)
e2c83630 5760 {
7b777afa
RS
5761 unsigned int nelt;
5762 if (!GET_MODE_NUNITS (mode).is_constant (&nelt))
5763 {
5764 if (dump_enabled_p ())
5765 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
5766 "cannot handle groups of 3 loads for"
5767 " variable-length vectors\n");
5768 return false;
5769 }
5770
d980067b
RS
5771 vec_perm_builder sel (nelt, nelt, 1);
5772 sel.quick_grow (nelt);
e3342de4 5773 vec_perm_indices indices;
2c23db6d
ES
5774 unsigned int k;
5775 for (k = 0; k < 3; k++)
5776 {
5777 for (i = 0; i < nelt; i++)
5778 if (3 * i + k < 2 * nelt)
5779 sel[i] = 3 * i + k;
5780 else
5781 sel[i] = 0;
e3342de4
RS
5782 indices.new_vector (sel, 2, nelt);
5783 if (!can_vec_perm_const_p (mode, indices))
2c23db6d
ES
5784 {
5785 if (dump_enabled_p ())
5786 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
5787 "shuffle of 3 loads is not supported by"
5788 " target\n");
21c0a521 5789 return false;
2c23db6d
ES
5790 }
5791 for (i = 0, j = 0; i < nelt; i++)
5792 if (3 * i + k < 2 * nelt)
5793 sel[i] = i;
5794 else
5795 sel[i] = nelt + ((nelt + k) % 3) + 3 * (j++);
e3342de4
RS
5796 indices.new_vector (sel, 2, nelt);
5797 if (!can_vec_perm_const_p (mode, indices))
2c23db6d
ES
5798 {
5799 if (dump_enabled_p ())
5800 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
5801 "shuffle of 3 loads is not supported by"
5802 " target\n");
5803 return false;
5804 }
5805 }
5806 return true;
5807 }
5808 else
5809 {
5810 /* If length is not equal to 3 then only power of 2 is supported. */
146ec50f 5811 gcc_assert (pow2p_hwi (count));
7b777afa 5812 poly_uint64 nelt = GET_MODE_NUNITS (mode);
e3342de4 5813
d980067b
RS
5814 /* The encoding has a single stepped pattern. */
5815 vec_perm_builder sel (nelt, 1, 3);
5816 sel.quick_grow (3);
5817 for (i = 0; i < 3; i++)
2c23db6d 5818 sel[i] = i * 2;
e3342de4
RS
5819 vec_perm_indices indices (sel, 2, nelt);
5820 if (can_vec_perm_const_p (mode, indices))
2c23db6d 5821 {
d980067b 5822 for (i = 0; i < 3; i++)
2c23db6d 5823 sel[i] = i * 2 + 1;
e3342de4
RS
5824 indices.new_vector (sel, 2, nelt);
5825 if (can_vec_perm_const_p (mode, indices))
2c23db6d
ES
5826 return true;
5827 }
5828 }
e2c83630 5829 }
ebfd146a 5830
73fbfcad 5831 if (dump_enabled_p ())
78c60e3d 5832 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2c23db6d 5833 "extract even/odd not supported by target\n");
a6b3dfde 5834 return false;
ebfd146a
IR
5835}
5836
7e11fc7f
RS
5837/* Return TRUE if vec_{masked_}load_lanes is available for COUNT vectors of
5838 type VECTYPE. MASKED_P says whether the masked form is needed. */
272c6793
RS
5839
5840bool
7e11fc7f
RS
5841vect_load_lanes_supported (tree vectype, unsigned HOST_WIDE_INT count,
5842 bool masked_p)
272c6793 5843{
7e11fc7f
RS
5844 if (masked_p)
5845 return vect_lanes_optab_supported_p ("vec_mask_load_lanes",
5846 vec_mask_load_lanes_optab,
5847 vectype, count);
5848 else
5849 return vect_lanes_optab_supported_p ("vec_load_lanes",
5850 vec_load_lanes_optab,
5851 vectype, count);
272c6793 5852}
ebfd146a
IR
5853
5854/* Function vect_permute_load_chain.
5855
5856 Given a chain of interleaved loads in DR_CHAIN of LENGTH that must be
2c23db6d
ES
5857 a power of 2 or equal to 3, generate extract_even/odd stmts to reorder
5858 the input data correctly. Return the final references for loads in
5859 RESULT_CHAIN.
ebfd146a
IR
5860
5861 E.g., LENGTH is 4 and the scalar type is short, i.e., VF is 8.
5862 The input is 4 vectors each containing 8 elements. We assign a number to each
5863 element, the input sequence is:
5864
5865 1st vec: 0 1 2 3 4 5 6 7
5866 2nd vec: 8 9 10 11 12 13 14 15
b8698a0f 5867 3rd vec: 16 17 18 19 20 21 22 23
ebfd146a
IR
5868 4th vec: 24 25 26 27 28 29 30 31
5869
5870 The output sequence should be:
5871
5872 1st vec: 0 4 8 12 16 20 24 28
5873 2nd vec: 1 5 9 13 17 21 25 29
b8698a0f 5874 3rd vec: 2 6 10 14 18 22 26 30
ebfd146a
IR
5875 4th vec: 3 7 11 15 19 23 27 31
5876
5877 i.e., the first output vector should contain the first elements of each
5878 interleaving group, etc.
5879
ff802fa1
IR
5880 We use extract_even/odd instructions to create such output. The input of
5881 each extract_even/odd operation is two vectors
b8698a0f
L
5882 1st vec 2nd vec
5883 0 1 2 3 4 5 6 7
ebfd146a 5884
ff802fa1 5885 and the output is the vector of extracted even/odd elements. The output of
ebfd146a
IR
5886 extract_even will be: 0 2 4 6
5887 and of extract_odd: 1 3 5 7
5888
b8698a0f 5889
ff802fa1
IR
5890 The permutation is done in log LENGTH stages. In each stage extract_even
5891 and extract_odd stmts are created for each pair of vectors in DR_CHAIN in
5892 their order. In our example,
ebfd146a
IR
5893
5894 E1: extract_even (1st vec, 2nd vec)
5895 E2: extract_odd (1st vec, 2nd vec)
5896 E3: extract_even (3rd vec, 4th vec)
5897 E4: extract_odd (3rd vec, 4th vec)
5898
5899 The output for the first stage will be:
5900
5901 E1: 0 2 4 6 8 10 12 14
5902 E2: 1 3 5 7 9 11 13 15
b8698a0f 5903 E3: 16 18 20 22 24 26 28 30
ebfd146a
IR
5904 E4: 17 19 21 23 25 27 29 31
5905
5906 In order to proceed and create the correct sequence for the next stage (or
b8698a0f
L
5907 for the correct output, if the second stage is the last one, as in our
5908 example), we first put the output of extract_even operation and then the
ebfd146a
IR
5909 output of extract_odd in RESULT_CHAIN (which is then copied to DR_CHAIN).
5910 The input for the second stage is:
5911
5912 1st vec (E1): 0 2 4 6 8 10 12 14
b8698a0f
L
5913 2nd vec (E3): 16 18 20 22 24 26 28 30
5914 3rd vec (E2): 1 3 5 7 9 11 13 15
ebfd146a
IR
5915 4th vec (E4): 17 19 21 23 25 27 29 31
5916
5917 The output of the second stage:
5918
5919 E1: 0 4 8 12 16 20 24 28
5920 E2: 2 6 10 14 18 22 26 30
5921 E3: 1 5 9 13 17 21 25 29
5922 E4: 3 7 11 15 19 23 27 31
5923
5924 And RESULT_CHAIN after reordering:
5925
5926 1st vec (E1): 0 4 8 12 16 20 24 28
5927 2nd vec (E3): 1 5 9 13 17 21 25 29
b8698a0f 5928 3rd vec (E2): 2 6 10 14 18 22 26 30
ebfd146a
IR
5929 4th vec (E4): 3 7 11 15 19 23 27 31. */
5930
b602d918 5931static void
308bc496 5932vect_permute_load_chain (vec_info *vinfo, vec<tree> dr_chain,
b8698a0f 5933 unsigned int length,
32e8e429 5934 stmt_vec_info stmt_info,
ebfd146a 5935 gimple_stmt_iterator *gsi,
9771b263 5936 vec<tree> *result_chain)
ebfd146a 5937{
83d5977e 5938 tree data_ref, first_vect, second_vect;
e2c83630 5939 tree perm_mask_even, perm_mask_odd;
2c23db6d 5940 tree perm3_mask_low, perm3_mask_high;
355fe088 5941 gimple *perm_stmt;
91987857 5942 tree vectype = STMT_VINFO_VECTYPE (stmt_info);
e2c83630 5943 unsigned int i, j, log_length = exact_log2 (length);
908a1a16 5944
3f292312
JJ
5945 result_chain->quick_grow (length);
5946 memcpy (result_chain->address (), dr_chain.address (),
5947 length * sizeof (tree));
e2c83630 5948
2c23db6d 5949 if (length == 3)
ebfd146a 5950 {
edab8e10 5951 /* vect_grouped_load_supported ensures that this is constant. */
928686b1 5952 unsigned nelt = TYPE_VECTOR_SUBPARTS (vectype).to_constant ();
2c23db6d 5953 unsigned int k;
ebfd146a 5954
d980067b
RS
5955 vec_perm_builder sel (nelt, nelt, 1);
5956 sel.quick_grow (nelt);
e3342de4 5957 vec_perm_indices indices;
2c23db6d
ES
5958 for (k = 0; k < 3; k++)
5959 {
5960 for (i = 0; i < nelt; i++)
5961 if (3 * i + k < 2 * nelt)
5962 sel[i] = 3 * i + k;
5963 else
5964 sel[i] = 0;
e3342de4
RS
5965 indices.new_vector (sel, 2, nelt);
5966 perm3_mask_low = vect_gen_perm_mask_checked (vectype, indices);
2c23db6d
ES
5967
5968 for (i = 0, j = 0; i < nelt; i++)
5969 if (3 * i + k < 2 * nelt)
5970 sel[i] = i;
5971 else
5972 sel[i] = nelt + ((nelt + k) % 3) + 3 * (j++);
e3342de4
RS
5973 indices.new_vector (sel, 2, nelt);
5974 perm3_mask_high = vect_gen_perm_mask_checked (vectype, indices);
2c23db6d
ES
5975
5976 first_vect = dr_chain[0];
5977 second_vect = dr_chain[1];
5978
5979 /* Create interleaving stmt (low part of):
5980 low = VEC_PERM_EXPR <first_vect, second_vect, {k, 3 + k, 6 + k,
5981 ...}> */
f598c55c 5982 data_ref = make_temp_ssa_name (vectype, NULL, "vect_shuffle3_low");
0d0e4a03
JJ
5983 perm_stmt = gimple_build_assign (data_ref, VEC_PERM_EXPR, first_vect,
5984 second_vect, perm3_mask_low);
308bc496 5985 vect_finish_stmt_generation (vinfo, stmt_info, perm_stmt, gsi);
b8698a0f 5986
2c23db6d
ES
5987 /* Create interleaving stmt (high part of):
5988 high = VEC_PERM_EXPR <first_vect, second_vect, {k, 3 + k, 6 + k,
5989 ...}> */
5990 first_vect = data_ref;
5991 second_vect = dr_chain[2];
f598c55c 5992 data_ref = make_temp_ssa_name (vectype, NULL, "vect_shuffle3_high");
0d0e4a03
JJ
5993 perm_stmt = gimple_build_assign (data_ref, VEC_PERM_EXPR, first_vect,
5994 second_vect, perm3_mask_high);
308bc496 5995 vect_finish_stmt_generation (vinfo, stmt_info, perm_stmt, gsi);
2c23db6d 5996 (*result_chain)[k] = data_ref;
ebfd146a 5997 }
ebfd146a 5998 }
2c23db6d
ES
5999 else
6000 {
6001 /* If length is not equal to 3 then only a power of 2 is supported. */
146ec50f 6002 gcc_assert (pow2p_hwi (length));
2c23db6d 6003
d980067b 6004 /* The encoding has a single stepped pattern. */
928686b1 6005 poly_uint64 nelt = TYPE_VECTOR_SUBPARTS (vectype);
d980067b
RS
6006 vec_perm_builder sel (nelt, 1, 3);
6007 sel.quick_grow (3);
6008 for (i = 0; i < 3; ++i)
2c23db6d 6009 sel[i] = i * 2;
e3342de4
RS
6010 vec_perm_indices indices (sel, 2, nelt);
6011 perm_mask_even = vect_gen_perm_mask_checked (vectype, indices);
2c23db6d 6012
d980067b 6013 for (i = 0; i < 3; ++i)
2c23db6d 6014 sel[i] = i * 2 + 1;
e3342de4
RS
6015 indices.new_vector (sel, 2, nelt);
6016 perm_mask_odd = vect_gen_perm_mask_checked (vectype, indices);
ebfd146a 6017
2c23db6d
ES
6018 for (i = 0; i < log_length; i++)
6019 {
6020 for (j = 0; j < length; j += 2)
6021 {
6022 first_vect = dr_chain[j];
6023 second_vect = dr_chain[j+1];
6024
6025 /* data_ref = permute_even (first_data_ref, second_data_ref); */
6026 data_ref = make_temp_ssa_name (vectype, NULL, "vect_perm_even");
0d0e4a03
JJ
6027 perm_stmt = gimple_build_assign (data_ref, VEC_PERM_EXPR,
6028 first_vect, second_vect,
6029 perm_mask_even);
308bc496 6030 vect_finish_stmt_generation (vinfo, stmt_info, perm_stmt, gsi);
2c23db6d
ES
6031 (*result_chain)[j/2] = data_ref;
6032
6033 /* data_ref = permute_odd (first_data_ref, second_data_ref); */
6034 data_ref = make_temp_ssa_name (vectype, NULL, "vect_perm_odd");
0d0e4a03
JJ
6035 perm_stmt = gimple_build_assign (data_ref, VEC_PERM_EXPR,
6036 first_vect, second_vect,
6037 perm_mask_odd);
308bc496 6038 vect_finish_stmt_generation (vinfo, stmt_info, perm_stmt, gsi);
2c23db6d
ES
6039 (*result_chain)[j/2+length/2] = data_ref;
6040 }
6041 memcpy (dr_chain.address (), result_chain->address (),
6042 length * sizeof (tree));
6043 }
6044 }
6045}
ebfd146a 6046
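/* Illustrative sketch only (not part of the original file): a plain C model
   of the power-of-2 branch above, assuming LENGTH == 4 and NELT == 8.  Each
   stage replaces the chain with all even extracts followed by all odd
   extracts, mirroring how (*result_chain)[j/2] and
   (*result_chain)[j/2 + length/2] are filled in.  After log2 (4) == 2 stages
   chain[0] holds 0 4 8 12 16 20 24 28, as in the function comment.  */

static void
model_permute_load_chain (int chain[4][8])
{
  int result[4][8];

  for (unsigned stage = 0; stage < 2; stage++)
    {
      for (unsigned j = 0; j < 4; j += 2)
	for (unsigned i = 0; i < 8; i++)
	  {
	    /* extract_even of (chain[j], chain[j + 1]).  */
	    result[j / 2][i] = (i < 4
				? chain[j][2 * i]
				: chain[j + 1][2 * (i - 4)]);
	    /* extract_odd of (chain[j], chain[j + 1]).  */
	    result[j / 2 + 2][i] = (i < 4
				    ? chain[j][2 * i + 1]
				    : chain[j + 1][2 * (i - 4) + 1]);
	  }
      /* Copy the reordered chain back, as the memcpy above does.  */
      for (unsigned j = 0; j < 4; j++)
	for (unsigned i = 0; i < 8; i++)
	  chain[j][i] = result[j][i];
    }
}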
f7917029
ES
6047/* Function vect_shift_permute_load_chain.
6048
6049 Given a chain of loads in DR_CHAIN of LENGTH 2 or 3, generate
6050 a sequence of stmts to reorder the input data accordingly.
6051 Return the final references for loads in RESULT_CHAIN.
6052 Return true if successful, false otherwise.
6053
6054 E.g., LENGTH is 3 and the scalar type is short, i.e., VF is 8.
6055 The input is 3 vectors each containing 8 elements. We assign a
6056 number to each element, the input sequence is:
6057
6058 1st vec: 0 1 2 3 4 5 6 7
6059 2nd vec: 8 9 10 11 12 13 14 15
6060 3rd vec: 16 17 18 19 20 21 22 23
6061
6062 The output sequence should be:
6063
6064 1st vec: 0 3 6 9 12 15 18 21
6065 2nd vec: 1 4 7 10 13 16 19 22
6066 3rd vec: 2 5 8 11 14 17 20 23
6067
6068 We use 3 shuffle instructions and 3 * 3 - 1 shifts to create such output.
6069
6070 First we shuffle all 3 vectors to get correct elements order:
6071
6072 1st vec: ( 0 3 6) ( 1 4 7) ( 2 5)
6073 2nd vec: ( 8 11 14) ( 9 12 15) (10 13)
6074 3rd vec: (16 19 22) (17 20 23) (18 21)
6075
6076 Next we unite and shift the vectors 3 times:
6077
6078 1st step:
6079 shift right by 6 the concatenation of:
6080 "1st vec" and "2nd vec"
6081 ( 0 3 6) ( 1 4 7) |( 2 5) _ ( 8 11 14) ( 9 12 15)| (10 13)
6082 "2nd vec" and "3rd vec"
6083 ( 8 11 14) ( 9 12 15) |(10 13) _ (16 19 22) (17 20 23)| (18 21)
6084 "3rd vec" and "1st vec"
6085 (16 19 22) (17 20 23) |(18 21) _ ( 0 3 6) ( 1 4 7)| ( 2 5)
6086 | New vectors |
6087
6088 So that now new vectors are:
6089
6090 1st vec: ( 2 5) ( 8 11 14) ( 9 12 15)
6091 2nd vec: (10 13) (16 19 22) (17 20 23)
6092 3rd vec: (18 21) ( 0 3 6) ( 1 4 7)
6093
6094 2nd step:
6095 shift right by 5 the concatenation of:
6096 "1st vec" and "3rd vec"
6097 ( 2 5) ( 8 11 14) |( 9 12 15) _ (18 21) ( 0 3 6)| ( 1 4 7)
6098 "2nd vec" and "1st vec"
6099 (10 13) (16 19 22) |(17 20 23) _ ( 2 5) ( 8 11 14)| ( 9 12 15)
6100 "3rd vec" and "2nd vec"
6101 (18 21) ( 0 3 6) |( 1 4 7) _ (10 13) (16 19 22)| (17 20 23)
6102 | New vectors |
6103
6104 So that now new vectors are:
6105
6106 1st vec: ( 9 12 15) (18 21) ( 0 3 6)
6107 2nd vec: (17 20 23) ( 2 5) ( 8 11 14)
6108 3rd vec: ( 1 4 7) (10 13) (16 19 22) READY
6109
6110 3rd step:
6111 shift right by 5 the concatenation of:
6112 "1st vec" and "1st vec"
6113 ( 9 12 15) (18 21) |( 0 3 6) _ ( 9 12 15) (18 21)| ( 0 3 6)
6114 shift right by 3 the concatenation of:
6115 "2nd vec" and "2nd vec"
6116 (17 20 23) |( 2 5) ( 8 11 14) _ (17 20 23)| ( 2 5) ( 8 11 14)
6117 | New vectors |
6118
6119 So that now all vectors are READY:
6120 1st vec: ( 0 3 6) ( 9 12 15) (18 21)
6121 2nd vec: ( 2 5) ( 8 11 14) (17 20 23)
6122 3rd vec: ( 1 4 7) (10 13) (16 19 22)
6123
6124 This algorithm is faster than the one in vect_permute_load_chain if:
6125 1. "shift of a concatenation" is faster than a general permutation.
6126 This is usually so.
6127 2. The TARGET machine can't execute vector instructions in parallel.
6128 This is because each step of the algorithm depends on previous.
6129 The algorithm in vect_permute_load_chain is much more parallel.
6130
6131 The algorithm is applicable only for LOAD CHAIN LENGTH less than VF.
6132*/
6133
6134static bool
308bc496 6135vect_shift_permute_load_chain (vec_info *vinfo, vec<tree> dr_chain,
f7917029 6136 unsigned int length,
32e8e429 6137 stmt_vec_info stmt_info,
f7917029
ES
6138 gimple_stmt_iterator *gsi,
6139 vec<tree> *result_chain)
6140{
6141 tree vect[3], vect_shift[3], data_ref, first_vect, second_vect;
6142 tree perm2_mask1, perm2_mask2, perm3_mask;
6143 tree select_mask, shift1_mask, shift2_mask, shift3_mask, shift4_mask;
355fe088 6144 gimple *perm_stmt;
f7917029 6145
91987857 6146 tree vectype = STMT_VINFO_VECTYPE (stmt_info);
f7917029 6147 unsigned int i;
308bc496 6148 loop_vec_info loop_vinfo = dyn_cast <loop_vec_info> (vinfo);
f7917029 6149
928686b1
RS
6150 unsigned HOST_WIDE_INT nelt, vf;
6151 if (!TYPE_VECTOR_SUBPARTS (vectype).is_constant (&nelt)
6152 || !LOOP_VINFO_VECT_FACTOR (loop_vinfo).is_constant (&vf))
d9f21f6a
RS
6153 /* Not supported for variable-length vectors. */
6154 return false;
6155
e3342de4 6156 vec_perm_builder sel (nelt, nelt, 1);
908a1a16
RS
6157 sel.quick_grow (nelt);
6158
f7917029
ES
6159 result_chain->quick_grow (length);
6160 memcpy (result_chain->address (), dr_chain.address (),
6161 length * sizeof (tree));
6162
d9f21f6a 6163 if (pow2p_hwi (length) && vf > 4)
f7917029 6164 {
af4c011e 6165 unsigned int j, log_length = exact_log2 (length);
f7917029
ES
6166 for (i = 0; i < nelt / 2; ++i)
6167 sel[i] = i * 2;
6168 for (i = 0; i < nelt / 2; ++i)
6169 sel[nelt / 2 + i] = i * 2 + 1;
e3342de4
RS
6170 vec_perm_indices indices (sel, 2, nelt);
6171 if (!can_vec_perm_const_p (TYPE_MODE (vectype), indices))
f7917029
ES
6172 {
6173 if (dump_enabled_p ())
6174 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
6175 "shuffle of 2 fields structure is not \
6176 supported by target\n");
6177 return false;
6178 }
e3342de4 6179 perm2_mask1 = vect_gen_perm_mask_checked (vectype, indices);
f7917029
ES
6180
6181 for (i = 0; i < nelt / 2; ++i)
6182 sel[i] = i * 2 + 1;
6183 for (i = 0; i < nelt / 2; ++i)
6184 sel[nelt / 2 + i] = i * 2;
e3342de4
RS
6185 indices.new_vector (sel, 2, nelt);
6186 if (!can_vec_perm_const_p (TYPE_MODE (vectype), indices))
f7917029
ES
6187 {
6188 if (dump_enabled_p ())
6189 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
6190 "shuffle of 2 fields structure is not \
6191 supported by target\n");
6192 return false;
6193 }
e3342de4 6194 perm2_mask2 = vect_gen_perm_mask_checked (vectype, indices);
f7917029
ES
6195
6196 /* Generating permutation constant to shift all elements.
6197 For vector length 8 it is {4 5 6 7 8 9 10 11}. */
6198 for (i = 0; i < nelt; i++)
6199 sel[i] = nelt / 2 + i;
e3342de4
RS
6200 indices.new_vector (sel, 2, nelt);
6201 if (!can_vec_perm_const_p (TYPE_MODE (vectype), indices))
f7917029
ES
6202 {
6203 if (dump_enabled_p ())
6204 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
6205 "shift permutation is not supported by target\n");
6206 return false;
6207 }
e3342de4 6208 shift1_mask = vect_gen_perm_mask_checked (vectype, indices);
f7917029
ES
6209
6210 /* Generating permutation constant to select vector from 2.
6211 For vector length 8 it is {0 1 2 3 12 13 14 15}. */
6212 for (i = 0; i < nelt / 2; i++)
6213 sel[i] = i;
6214 for (i = nelt / 2; i < nelt; i++)
6215 sel[i] = nelt + i;
e3342de4
RS
6216 indices.new_vector (sel, 2, nelt);
6217 if (!can_vec_perm_const_p (TYPE_MODE (vectype), indices))
f7917029
ES
6218 {
6219 if (dump_enabled_p ())
6220 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
6221 "select is not supported by target\n");
6222 return false;
6223 }
e3342de4 6224 select_mask = vect_gen_perm_mask_checked (vectype, indices);
f7917029 6225
af4c011e
ES
6226 for (i = 0; i < log_length; i++)
6227 {
6228 for (j = 0; j < length; j += 2)
6229 {
6230 first_vect = dr_chain[j];
6231 second_vect = dr_chain[j + 1];
6232
6233 data_ref = make_temp_ssa_name (vectype, NULL, "vect_shuffle2");
0d0e4a03
JJ
6234 perm_stmt = gimple_build_assign (data_ref, VEC_PERM_EXPR,
6235 first_vect, first_vect,
6236 perm2_mask1);
308bc496 6237 vect_finish_stmt_generation (vinfo, stmt_info, perm_stmt, gsi);
af4c011e
ES
6238 vect[0] = data_ref;
6239
6240 data_ref = make_temp_ssa_name (vectype, NULL, "vect_shuffle2");
0d0e4a03
JJ
6241 perm_stmt = gimple_build_assign (data_ref, VEC_PERM_EXPR,
6242 second_vect, second_vect,
6243 perm2_mask2);
308bc496 6244 vect_finish_stmt_generation (vinfo, stmt_info, perm_stmt, gsi);
af4c011e 6245 vect[1] = data_ref;
f7917029 6246
af4c011e 6247 data_ref = make_temp_ssa_name (vectype, NULL, "vect_shift");
0d0e4a03
JJ
6248 perm_stmt = gimple_build_assign (data_ref, VEC_PERM_EXPR,
6249 vect[0], vect[1], shift1_mask);
308bc496 6250 vect_finish_stmt_generation (vinfo, stmt_info, perm_stmt, gsi);
af4c011e
ES
6251 (*result_chain)[j/2 + length/2] = data_ref;
6252
6253 data_ref = make_temp_ssa_name (vectype, NULL, "vect_select");
0d0e4a03
JJ
6254 perm_stmt = gimple_build_assign (data_ref, VEC_PERM_EXPR,
6255 vect[0], vect[1], select_mask);
308bc496 6256 vect_finish_stmt_generation (vinfo, stmt_info, perm_stmt, gsi);
af4c011e
ES
6257 (*result_chain)[j/2] = data_ref;
6258 }
6259 memcpy (dr_chain.address (), result_chain->address (),
6260 length * sizeof (tree));
6261 }
f7917029
ES
6262 return true;
6263 }
d9f21f6a 6264 if (length == 3 && vf > 2)
f7917029
ES
6265 {
6266 unsigned int k = 0, l = 0;
6267
6268 /* Generating permutation constant to get all elements in right order.
6269 For vector length 8 it is {0 3 6 1 4 7 2 5}. */
6270 for (i = 0; i < nelt; i++)
6271 {
6272 if (3 * k + (l % 3) >= nelt)
6273 {
6274 k = 0;
6275 l += (3 - (nelt % 3));
6276 }
6277 sel[i] = 3 * k + (l % 3);
6278 k++;
6279 }
e3342de4
RS
6280 vec_perm_indices indices (sel, 2, nelt);
6281 if (!can_vec_perm_const_p (TYPE_MODE (vectype), indices))
f7917029
ES
6282 {
6283 if (dump_enabled_p ())
6284 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
6285 "shuffle of 3 fields structure is not \
6286 supported by target\n");
6287 return false;
6288 }
e3342de4 6289 perm3_mask = vect_gen_perm_mask_checked (vectype, indices);
f7917029
ES
6290
6291 /* Generating permutation constant to shift all elements.
6292 For vector length 8 it is {6 7 8 9 10 11 12 13}. */
6293 for (i = 0; i < nelt; i++)
6294 sel[i] = 2 * (nelt / 3) + (nelt % 3) + i;
e3342de4
RS
6295 indices.new_vector (sel, 2, nelt);
6296 if (!can_vec_perm_const_p (TYPE_MODE (vectype), indices))
f7917029
ES
6297 {
6298 if (dump_enabled_p ())
6299 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
6300 "shift permutation is not supported by target\n");
6301 return false;
6302 }
e3342de4 6303 shift1_mask = vect_gen_perm_mask_checked (vectype, indices);
f7917029
ES
6304
6305 /* Generating permutation constant to shift all elements.
6306 For vector length 8 it is {5 6 7 8 9 10 11 12}. */
6307 for (i = 0; i < nelt; i++)
6308 sel[i] = 2 * (nelt / 3) + 1 + i;
e3342de4
RS
6309 indices.new_vector (sel, 2, nelt);
6310 if (!can_vec_perm_const_p (TYPE_MODE (vectype), indices))
f7917029
ES
6311 {
6312 if (dump_enabled_p ())
6313 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
6314 "shift permutation is not supported by target\n");
6315 return false;
6316 }
e3342de4 6317 shift2_mask = vect_gen_perm_mask_checked (vectype, indices);
f7917029
ES
6318
6319 /* Generating permutation constant to shift all elements.
6320 For vector length 8 it is {3 4 5 6 7 8 9 10}. */
6321 for (i = 0; i < nelt; i++)
6322 sel[i] = (nelt / 3) + (nelt % 3) / 2 + i;
e3342de4
RS
6323 indices.new_vector (sel, 2, nelt);
6324 if (!can_vec_perm_const_p (TYPE_MODE (vectype), indices))
f7917029
ES
6325 {
6326 if (dump_enabled_p ())
6327 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
6328 "shift permutation is not supported by target\n");
6329 return false;
6330 }
e3342de4 6331 shift3_mask = vect_gen_perm_mask_checked (vectype, indices);
f7917029
ES
6332
6333 /* Generating permutation constant to shift all elements.
6334 For vector length 8 it is {5 6 7 8 9 10 11 12}. */
6335 for (i = 0; i < nelt; i++)
6336 sel[i] = 2 * (nelt / 3) + (nelt % 3) / 2 + i;
e3342de4
RS
6337 indices.new_vector (sel, 2, nelt);
6338 if (!can_vec_perm_const_p (TYPE_MODE (vectype), indices))
f7917029
ES
6339 {
6340 if (dump_enabled_p ())
6341 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
6342 "shift permutation is not supported by target\n");
6343 return false;
6344 }
e3342de4 6345 shift4_mask = vect_gen_perm_mask_checked (vectype, indices);
f7917029
ES
6346
6347 for (k = 0; k < 3; k++)
6348 {
f598c55c 6349 data_ref = make_temp_ssa_name (vectype, NULL, "vect_shuffle3");
0d0e4a03
JJ
6350 perm_stmt = gimple_build_assign (data_ref, VEC_PERM_EXPR,
6351 dr_chain[k], dr_chain[k],
6352 perm3_mask);
308bc496 6353 vect_finish_stmt_generation (vinfo, stmt_info, perm_stmt, gsi);
f7917029
ES
6354 vect[k] = data_ref;
6355 }
6356
6357 for (k = 0; k < 3; k++)
6358 {
6359 data_ref = make_temp_ssa_name (vectype, NULL, "vect_shift1");
0d0e4a03
JJ
6360 perm_stmt = gimple_build_assign (data_ref, VEC_PERM_EXPR,
6361 vect[k % 3], vect[(k + 1) % 3],
6362 shift1_mask);
308bc496 6363 vect_finish_stmt_generation (vinfo, stmt_info, perm_stmt, gsi);
f7917029
ES
6364 vect_shift[k] = data_ref;
6365 }
6366
6367 for (k = 0; k < 3; k++)
6368 {
6369 data_ref = make_temp_ssa_name (vectype, NULL, "vect_shift2");
0d0e4a03
JJ
6370 perm_stmt = gimple_build_assign (data_ref, VEC_PERM_EXPR,
6371 vect_shift[(4 - k) % 3],
6372 vect_shift[(3 - k) % 3],
6373 shift2_mask);
308bc496 6374 vect_finish_stmt_generation (vinfo, stmt_info, perm_stmt, gsi);
f7917029
ES
6375 vect[k] = data_ref;
6376 }
6377
6378 (*result_chain)[3 - (nelt % 3)] = vect[2];
6379
6380 data_ref = make_temp_ssa_name (vectype, NULL, "vect_shift3");
0d0e4a03
JJ
6381 perm_stmt = gimple_build_assign (data_ref, VEC_PERM_EXPR, vect[0],
6382 vect[0], shift3_mask);
308bc496 6383 vect_finish_stmt_generation (vinfo, stmt_info, perm_stmt, gsi);
f7917029
ES
6384 (*result_chain)[nelt % 3] = data_ref;
6385
6386 data_ref = make_temp_ssa_name (vectype, NULL, "vect_shift4");
0d0e4a03
JJ
6387 perm_stmt = gimple_build_assign (data_ref, VEC_PERM_EXPR, vect[1],
6388 vect[1], shift4_mask);
308bc496 6389 vect_finish_stmt_generation (vinfo, stmt_info, perm_stmt, gsi);
f7917029
ES
6390 (*result_chain)[0] = data_ref;
6391 return true;
6392 }
6393 return false;
6394}
6395
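/* Illustrative sketch only (not part of the original file): a standalone
   program that evaluates the index formulas used in the LENGTH == 3 branch
   above for NELT == 8, reproducing the masks quoted in the comments:
   perm3 {0 3 6 1 4 7 2 5}, shift1 {6..13}, shift2 {5..12}, shift3 {3..10}
   and shift4 {5..12}.  */

#include <stdio.h>

int
main (void)
{
  unsigned nelt = 8, k = 0, l = 0, sel[8], i;

  /* perm3_mask.  */
  for (i = 0; i < nelt; i++)
    {
      if (3 * k + (l % 3) >= nelt)
	{
	  k = 0;
	  l += 3 - (nelt % 3);
	}
      sel[i] = 3 * k + (l % 3);
      k++;
    }
  for (i = 0; i < nelt; i++)
    printf ("%u ", sel[i]);				/* 0 3 6 1 4 7 2 5 */
  printf ("\n");

  /* shift1_mask and shift2_mask.  */
  for (i = 0; i < nelt; i++)
    printf ("%u ", 2 * (nelt / 3) + (nelt % 3) + i);	/* 6 .. 13 */
  printf ("\n");
  for (i = 0; i < nelt; i++)
    printf ("%u ", 2 * (nelt / 3) + 1 + i);		/* 5 .. 12 */
  printf ("\n");

  /* shift3_mask and shift4_mask.  */
  for (i = 0; i < nelt; i++)
    printf ("%u ", nelt / 3 + (nelt % 3) / 2 + i);	/* 3 .. 10 */
  printf ("\n");
  for (i = 0; i < nelt; i++)
    printf ("%u ", 2 * (nelt / 3) + (nelt % 3) / 2 + i);	/* 5 .. 12 */
  printf ("\n");
  return 0;
}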
0d0293ac 6396/* Function vect_transform_grouped_load.
ebfd146a
IR
6397
6398 Given a chain of input interleaved data-refs (in DR_CHAIN), build statements
6399 to perform their permutation and ascribe the resulting vectorized statements to
6400 the scalar statements.
6401*/
6402
b602d918 6403void
308bc496
RB
6404vect_transform_grouped_load (vec_info *vinfo, stmt_vec_info stmt_info,
6405 vec<tree> dr_chain,
32e8e429 6406 int size, gimple_stmt_iterator *gsi)
ebfd146a 6407{
ef4bddc2 6408 machine_mode mode;
6e1aa848 6409 vec<tree> result_chain = vNULL;
ebfd146a 6410
b8698a0f
L
6411 /* DR_CHAIN contains input data-refs that are a part of the interleaving.
6412 RESULT_CHAIN is the output of vect_permute_load_chain, it contains permuted
ebfd146a 6413 vectors, that are ready for vector computation. */
9771b263 6414 result_chain.create (size);
f7917029
ES
6415
6416 /* If the reassociation width for the vector type is 2 or greater, the target
6417 machine can execute 2 or more vector instructions in parallel.  Otherwise
6418 try to get the chain for the load group using vect_shift_permute_load_chain. */
91987857 6419 mode = TYPE_MODE (STMT_VINFO_VECTYPE (stmt_info));
f7917029 6420 if (targetm.sched.reassociation_width (VEC_PERM_EXPR, mode) > 1
146ec50f 6421 || pow2p_hwi (size)
308bc496 6422 || !vect_shift_permute_load_chain (vinfo, dr_chain, size, stmt_info,
f7917029 6423 gsi, &result_chain))
308bc496
RB
6424 vect_permute_load_chain (vinfo, dr_chain,
6425 size, stmt_info, gsi, &result_chain);
6426 vect_record_grouped_load_vectors (vinfo, stmt_info, result_chain);
9771b263 6427 result_chain.release ();
272c6793
RS
6428}
6429
0d0293ac 6430/* RESULT_CHAIN contains the output of a group of grouped loads that were
32e8e429 6431 generated as part of the vectorization of STMT_INFO. Assign the statement
272c6793
RS
6432 for each vector to the associated scalar statement. */
6433
6434void
f25161bd 6435vect_record_grouped_load_vectors (vec_info *, stmt_vec_info stmt_info,
32e8e429 6436 vec<tree> result_chain)
272c6793 6437{
bffb8014 6438 stmt_vec_info first_stmt_info = DR_GROUP_FIRST_ELEMENT (stmt_info);
272c6793
RS
6439 unsigned int i, gap_count;
6440 tree tmp_data_ref;
ebfd146a 6441
b8698a0f
L
6442 /* Put a permuted data-ref in the VECTORIZED_STMT field.
6443 Since we scan the chain starting from its first node, their order
ebfd146a 6444 corresponds to the order of data-refs in RESULT_CHAIN. */
bffb8014 6445 stmt_vec_info next_stmt_info = first_stmt_info;
ebfd146a 6446 gap_count = 1;
9771b263 6447 FOR_EACH_VEC_ELT (result_chain, i, tmp_data_ref)
ebfd146a 6448 {
bffb8014 6449 if (!next_stmt_info)
ebfd146a
IR
6450 break;
6451
ff802fa1
IR
6452 /* Skip the gaps. Loads created for the gaps will be removed by the dead
6453 code elimination pass later. No need to check for the first stmt in
ebfd146a 6454 the group, since it always exists.
2c53b149
RB
6455 DR_GROUP_GAP is the number of steps in elements from the previous
6456 access (if there is no gap DR_GROUP_GAP is 1). We skip loads that
ff802fa1 6457 correspond to the gaps. */
bffb8014
RS
6458 if (next_stmt_info != first_stmt_info
6459 && gap_count < DR_GROUP_GAP (next_stmt_info))
f95b7597
RB
6460 {
6461 gap_count++;
6462 continue;
6463 }
ebfd146a 6464
f95b7597
RB
6465 /* ??? The following needs cleanup after the removal of
6466 DR_GROUP_SAME_DR_STMT. */
6467 if (next_stmt_info)
ebfd146a 6468 {
f25161bd 6469 gimple *new_stmt = SSA_NAME_DEF_STMT (tmp_data_ref);
ebfd146a 6470 /* We assume that if VEC_STMT is not NULL, this is a case of multiple
b05d5563 6471 copies, and we put the new vector statement last. */
f25161bd 6472 STMT_VINFO_VEC_STMTS (next_stmt_info).safe_push (new_stmt);
ebfd146a 6473
bffb8014 6474 next_stmt_info = DR_GROUP_NEXT_ELEMENT (next_stmt_info);
ebfd146a 6475 gap_count = 1;
ebfd146a
IR
6476 }
6477 }
ebfd146a
IR
6478}
6479
6480/* Function vect_can_force_dr_alignment_p.
6481
6482 Returns whether the alignment of a DECL can be forced to an ALIGNMENT-bit
6483 boundary. */
6484
b8698a0f 6485bool
ca31798e 6486vect_can_force_dr_alignment_p (const_tree decl, poly_uint64 alignment)
ebfd146a 6487{
8813a647 6488 if (!VAR_P (decl))
ebfd146a
IR
6489 return false;
6490
428f0c67
JH
6491 if (decl_in_symtab_p (decl)
6492 && !symtab_node::get (decl)->can_increase_alignment_p ())
6192fa79
JH
6493 return false;
6494
ebfd146a 6495 if (TREE_STATIC (decl))
b2581735
IS
6496 return (known_le (alignment,
6497 (unsigned HOST_WIDE_INT) MAX_OFILE_ALIGNMENT));
ebfd146a 6498 else
ca31798e 6499 return (known_le (alignment, (unsigned HOST_WIDE_INT) MAX_STACK_ALIGNMENT));
ebfd146a
IR
6500}
6501
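/* Usage sketch (illustrative only, not part of the original file): when the
   vectorizer wants to peel or version for alignment, it only bumps the
   alignment of a data reference's base declaration if this predicate allows
   it, e.g.

     if (vect_can_force_dr_alignment_p (base_decl, TYPE_ALIGN (vectype)))
       ... raise DECL_ALIGN (base_decl) to TYPE_ALIGN (vectype) ...

   where base_decl is a hypothetical name for the base VAR_DECL.  ALIGNMENT
   is in bits; static decls are bounded by MAX_OFILE_ALIGNMENT and automatic
   ones by MAX_STACK_ALIGNMENT, as checked above.  */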
ebfd146a 6502
89fa689a 6503/* Return whether the data reference DR_INFO is supported with respect to its
720f5239
IR
6504 alignment.
6505 If CHECK_ALIGNED_ACCESSES is TRUE, check if the access is supported even
6506 it is aligned, i.e., check if it is possible to vectorize it with different
ebfd146a
IR
6507 alignment. */
6508
6509enum dr_alignment_support
308bc496 6510vect_supportable_dr_alignment (vec_info *vinfo, dr_vec_info *dr_info,
720f5239 6511 bool check_aligned_accesses)
ebfd146a 6512{
89fa689a
RS
6513 data_reference *dr = dr_info->dr;
6514 stmt_vec_info stmt_info = dr_info->stmt;
ebfd146a 6515 tree vectype = STMT_VINFO_VECTYPE (stmt_info);
ef4bddc2 6516 machine_mode mode = TYPE_MODE (vectype);
308bc496 6517 loop_vec_info loop_vinfo = dyn_cast <loop_vec_info> (vinfo);
99b1c316 6518 class loop *vect_loop = NULL;
a70d6342 6519 bool nested_in_vect_loop = false;
ebfd146a 6520
89fa689a 6521 if (aligned_access_p (dr_info) && !check_aligned_accesses)
ebfd146a
IR
6522 return dr_aligned;
6523
5ce9450f
JJ
6524 /* For now assume all conditional loads/stores support unaligned
6525 access without any special code. */
78e02b3b
RS
6526 if (gcall *stmt = dyn_cast <gcall *> (stmt_info->stmt))
6527 if (gimple_call_internal_p (stmt)
6528 && (gimple_call_internal_fn (stmt) == IFN_MASK_LOAD
6529 || gimple_call_internal_fn (stmt) == IFN_MASK_STORE))
6530 return dr_unaligned_supported;
5ce9450f 6531
69f11a13
IR
6532 if (loop_vinfo)
6533 {
6534 vect_loop = LOOP_VINFO_LOOP (loop_vinfo);
78e02b3b 6535 nested_in_vect_loop = nested_in_vect_loop_p (vect_loop, stmt_info);
69f11a13 6536 }
a70d6342 6537
ebfd146a
IR
6538 /* Possibly unaligned access. */
6539
6540 /* We can choose between using the implicit realignment scheme (generating
6541 a misaligned_move stmt) and the explicit realignment scheme (generating
ff802fa1
IR
6542 aligned loads with a REALIGN_LOAD). There are two variants to the
6543 explicit realignment scheme: optimized, and unoptimized.
ebfd146a
IR
6544 We can optimize the realignment only if the step between consecutive
6545 vector loads is equal to the vector size. Since the vector memory
6546 accesses advance in steps of VS (Vector Size) in the vectorized loop, it
6547 is guaranteed that the misalignment amount remains the same throughout the
6548 execution of the vectorized loop. Therefore, we can create the
6549 "realignment token" (the permutation mask that is passed to REALIGN_LOAD)
6550 at the loop preheader.
6551
6552 However, in the case of outer-loop vectorization, when vectorizing a
6553 memory access in the inner-loop nested within the LOOP that is now being
6554 vectorized, while it is guaranteed that the misalignment of the
6555 vectorized memory access will remain the same in different outer-loop
6556 iterations, it is *not* guaranteed that it will remain the same throughout
6557 the execution of the inner-loop. This is because the inner-loop advances
6558 with the original scalar step (and not in steps of VS). If the inner-loop
6559 step happens to be a multiple of VS, then the misalignment remains fixed
6560 and we can use the optimized realignment scheme. For example:
6561
6562 for (i=0; i<N; i++)
6563 for (j=0; j<M; j++)
6564 s += a[i+j];
6565
6566 When vectorizing the i-loop in the above example, the step between
6567 consecutive vector loads is 1, and so the misalignment does not remain
6568 fixed across the execution of the inner-loop, and the realignment cannot
6569 be optimized (as illustrated in the following pseudo vectorized loop):
6570
6571 for (i=0; i<N; i+=4)
6572 for (j=0; j<M; j++){
6573 vs += vp[i+j]; // misalignment of &vp[i+j] is {0,1,2,3,0,1,2,3,...}
6574 // when j is {0,1,2,3,4,5,6,7,...} respectively.
6575 // (assuming that we start from an aligned address).
6576 }
6577
6578 We therefore have to use the unoptimized realignment scheme:
6579
6580 for (i=0; i<N; i+=4)
6581 for (j=k; j<M; j+=4)
6582 vs += vp[i+j]; // misalignment of &vp[i+j] is always k (assuming
6583 // that the misalignment of the initial address is
6584 // 0).
6585
6586 The loop can then be vectorized as follows:
6587
6588 for (k=0; k<4; k++){
6589 rt = get_realignment_token (&vp[k]);
6590 for (i=0; i<N; i+=4){
6591 v1 = vp[i+k];
6592 for (j=k; j<M; j+=4){
6593 v2 = vp[i+j+VS-1];
6594 va = REALIGN_LOAD <v1,v2,rt>;
6595 vs += va;
6596 v1 = v2;
6597 }
6598 }
6599 } */
6600
6601 if (DR_IS_READ (dr))
6602 {
0601d0cf
RE
6603 bool is_packed = false;
6604 tree type = (TREE_TYPE (DR_REF (dr)));
6605
947131ba 6606 if (optab_handler (vec_realign_load_optab, mode) != CODE_FOR_nothing
ebfd146a
IR
6607 && (!targetm.vectorize.builtin_mask_for_load
6608 || targetm.vectorize.builtin_mask_for_load ()))
6609 {
6610 tree vectype = STMT_VINFO_VECTYPE (stmt_info);
0ab34b9e
RB
6611
6612 /* If we are doing SLP then the accesses need not have the
6613 same alignment, instead it depends on the SLP group size. */
6614 if (loop_vinfo
6615 && STMT_SLP_TYPE (stmt_info)
d9f21f6a 6616 && !multiple_p (LOOP_VINFO_VECT_FACTOR (loop_vinfo)
bffb8014
RS
6617 * (DR_GROUP_SIZE
6618 (DR_GROUP_FIRST_ELEMENT (stmt_info))),
d9f21f6a 6619 TYPE_VECTOR_SUBPARTS (vectype)))
0ab34b9e
RB
6620 ;
6621 else if (!loop_vinfo
6622 || (nested_in_vect_loop
cf098191
RS
6623 && maybe_ne (TREE_INT_CST_LOW (DR_STEP (dr)),
6624 GET_MODE_SIZE (TYPE_MODE (vectype)))))
ebfd146a
IR
6625 return dr_explicit_realign;
6626 else
6627 return dr_explicit_realign_optimized;
6628 }
89fa689a 6629 if (!known_alignment_for_access_p (dr_info))
4c9bcf89 6630 is_packed = not_size_aligned (DR_REF (dr));
b8698a0f 6631
c2873892 6632 if (targetm.vectorize.support_vector_misalignment
89fa689a 6633 (mode, type, DR_MISALIGNMENT (dr_info), is_packed))
ebfd146a
IR
6634 /* Can't software pipeline the loads, but can at least do them. */
6635 return dr_unaligned_supported;
6636 }
0601d0cf
RE
6637 else
6638 {
6639 bool is_packed = false;
6640 tree type = (TREE_TYPE (DR_REF (dr)));
ebfd146a 6641
89fa689a 6642 if (!known_alignment_for_access_p (dr_info))
4c9bcf89 6643 is_packed = not_size_aligned (DR_REF (dr));
b8698a0f 6644
c2873892 6645 if (targetm.vectorize.support_vector_misalignment
89fa689a 6646 (mode, type, DR_MISALIGNMENT (dr_info), is_packed))
0601d0cf
RE
6647 return dr_unaligned_supported;
6648 }
b8698a0f 6649
ebfd146a
IR
6650 /* Unsupported. */
6651 return dr_unaligned_unsupported;
6652}
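/* Summary sketch (illustrative only, not part of the original file) of the
   possible outcomes of vect_supportable_dr_alignment:

     dr_aligned - the access is known to be aligned and needs no special
       handling.
     dr_explicit_realign / dr_explicit_realign_optimized - a misaligned read
       is handled with aligned loads plus REALIGN_LOAD; the optimized form
       computes the realignment token once in the loop preheader, which is
       only valid when the misalignment stays fixed across iterations.
     dr_unaligned_supported - the target can perform the misaligned access
       directly (targetm.vectorize.support_vector_misalignment).
     dr_unaligned_unsupported - the access cannot be vectorized with this
       alignment.  */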