/* Data References Analysis and Manipulation Utilities for Vectorization.
   Copyright (C) 2003-2019 Free Software Foundation, Inc.
   Contributed by Dorit Naishlos <dorit@il.ibm.com>
   and Ira Rosen <irar@il.ibm.com>

This file is part of GCC.

GCC is free software; you can redistribute it and/or modify it under
the terms of the GNU General Public License as published by the Free
Software Foundation; either version 3, or (at your option) any later
version.

GCC is distributed in the hope that it will be useful, but WITHOUT ANY
WARRANTY; without even the implied warranty of MERCHANTABILITY or
FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
for more details.

You should have received a copy of the GNU General Public License
along with GCC; see the file COPYING3.  If not see
<http://www.gnu.org/licenses/>.  */

#include "config.h"
#include "system.h"
#include "coretypes.h"
#include "backend.h"
#include "target.h"
#include "rtl.h"
#include "tree.h"
#include "gimple.h"
#include "predict.h"
#include "memmodel.h"
#include "tm_p.h"
#include "ssa.h"
#include "optabs-tree.h"
#include "cgraph.h"
#include "dumpfile.h"
#include "alias.h"
#include "fold-const.h"
#include "stor-layout.h"
#include "tree-eh.h"
#include "gimplify.h"
#include "gimple-iterator.h"
#include "gimplify-me.h"
#include "tree-ssa-loop-ivopts.h"
#include "tree-ssa-loop-manip.h"
#include "tree-ssa-loop.h"
#include "cfgloop.h"
#include "tree-scalar-evolution.h"
#include "tree-vectorizer.h"
#include "expr.h"
#include "builtins.h"
#include "params.h"
#include "tree-cfg.h"
#include "tree-hash-traits.h"
#include "vec-perm-indices.h"
#include "internal-fn.h"

/* Return true if load- or store-lanes optab OPTAB is implemented for
   COUNT vectors of type VECTYPE.  NAME is the name of OPTAB.  */

static bool
vect_lanes_optab_supported_p (const char *name, convert_optab optab,
                              tree vectype, unsigned HOST_WIDE_INT count)
{
  machine_mode mode, array_mode;
  bool limit_p;

  mode = TYPE_MODE (vectype);
  if (!targetm.array_mode (mode, count).exists (&array_mode))
    {
      poly_uint64 bits = count * GET_MODE_BITSIZE (mode);
      limit_p = !targetm.array_mode_supported_p (mode, count);
      if (!int_mode_for_size (bits, limit_p).exists (&array_mode))
        {
          if (dump_enabled_p ())
            dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
                             "no array mode for %s[%wu]\n",
                             GET_MODE_NAME (mode), count);
          return false;
        }
    }

  if (convert_optab_handler (optab, array_mode, mode) == CODE_FOR_nothing)
    {
      if (dump_enabled_p ())
        dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
                         "cannot use %s<%s><%s>\n", name,
                         GET_MODE_NAME (array_mode), GET_MODE_NAME (mode));
      return false;
    }

  if (dump_enabled_p ())
    dump_printf_loc (MSG_NOTE, vect_location,
                     "can use %s<%s><%s>\n", name, GET_MODE_NAME (array_mode),
                     GET_MODE_NAME (mode));

  return true;
}
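
/* Illustrative usage (a sketch, not from the upstream sources): a
   load-lanes capability query built on the helper above would look like

     vect_lanes_optab_supported_p ("vec_load_lanes", vec_load_lanes_optab,
                                   vectype, count);

   i.e. it answers whether COUNT vectors of VECTYPE can be loaded by a
   single lanes-style instruction, falling back to asking for a plain
   integer mode of the right size when the target defines no array mode.  */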


/* Return the smallest scalar part of STMT_INFO.
   This is used to determine the vectype of the stmt.  We generally set the
   vectype according to the type of the result (lhs).  For stmts whose
   result-type is different than the type of the arguments (e.g., demotion,
   promotion), vectype will be reset appropriately (later).  Note that we have
   to visit the smallest datatype in this function, because that determines the
   VF.  If the smallest datatype in the loop is present only as the rhs of a
   promotion operation - we'd miss it.
   Such a case, where a variable of this datatype does not appear in the lhs
   anywhere in the loop, can only occur if it's an invariant: e.g.:
   'int_x = (int) short_inv', which we'd expect to have been optimized away by
   invariant motion.  However, we cannot rely on invariant motion to always
   take invariants out of the loop, and so in the case of promotion we also
   have to check the rhs.
   LHS_SIZE_UNIT and RHS_SIZE_UNIT contain the sizes of the corresponding
   types.  */

tree
vect_get_smallest_scalar_type (stmt_vec_info stmt_info,
                               HOST_WIDE_INT *lhs_size_unit,
                               HOST_WIDE_INT *rhs_size_unit)
{
  tree scalar_type = gimple_expr_type (stmt_info->stmt);
  HOST_WIDE_INT lhs, rhs;

  /* During the analysis phase, this function is called on arbitrary
     statements that might not have scalar results.  */
  if (!tree_fits_uhwi_p (TYPE_SIZE_UNIT (scalar_type)))
    return scalar_type;

  lhs = rhs = TREE_INT_CST_LOW (TYPE_SIZE_UNIT (scalar_type));

  gassign *assign = dyn_cast <gassign *> (stmt_info->stmt);
  if (assign
      && (gimple_assign_cast_p (assign)
          || gimple_assign_rhs_code (assign) == DOT_PROD_EXPR
          || gimple_assign_rhs_code (assign) == WIDEN_SUM_EXPR
          || gimple_assign_rhs_code (assign) == WIDEN_MULT_EXPR
          || gimple_assign_rhs_code (assign) == WIDEN_LSHIFT_EXPR
          || gimple_assign_rhs_code (assign) == FLOAT_EXPR))
    {
      tree rhs_type = TREE_TYPE (gimple_assign_rhs1 (assign));

      rhs = TREE_INT_CST_LOW (TYPE_SIZE_UNIT (rhs_type));
      if (rhs < lhs)
        scalar_type = rhs_type;
    }
  else if (gcall *call = dyn_cast <gcall *> (stmt_info->stmt))
    {
      unsigned int i = 0;
      if (gimple_call_internal_p (call))
        {
          internal_fn ifn = gimple_call_internal_fn (call);
          if (internal_load_fn_p (ifn) || internal_store_fn_p (ifn))
            /* gimple_expr_type already picked the type of the loaded
               or stored data.  */
            i = ~0U;
          else if (internal_fn_mask_index (ifn) == 0)
            i = 1;
        }
      if (i < gimple_call_num_args (call))
        {
          tree rhs_type = TREE_TYPE (gimple_call_arg (call, i));
          if (tree_fits_uhwi_p (TYPE_SIZE_UNIT (rhs_type)))
            {
              rhs = TREE_INT_CST_LOW (TYPE_SIZE_UNIT (rhs_type));
              if (rhs < lhs)
                scalar_type = rhs_type;
            }
        }
    }

  *lhs_size_unit = lhs;
  *rhs_size_unit = rhs;
  return scalar_type;
}
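
/* Illustrative example: in a loop such as

     short *s; int *d;
     for (int i = 0; i < n; i++)
       d[i] = (int) s[i];

   the result type of the assignment is "int" but the smallest scalar type
   is "short", found on the rhs of the promotion; with 128-bit vectors the
   VF must cover eight "short" elements rather than four "int"s, which is
   why the rhs is inspected above.  */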


/* Insert DDR into LOOP_VINFO list of ddrs that may alias and need to be
   tested at run-time.  Return TRUE if DDR was successfully inserted.
   Return false if versioning is not supported.  */

static opt_result
vect_mark_for_runtime_alias_test (ddr_p ddr, loop_vec_info loop_vinfo)
{
  class loop *loop = LOOP_VINFO_LOOP (loop_vinfo);

  if ((unsigned) PARAM_VALUE (PARAM_VECT_MAX_VERSION_FOR_ALIAS_CHECKS) == 0)
    return opt_result::failure_at (vect_location,
                                   "will not create alias checks, as"
                                   " --param vect-max-version-for-alias-checks"
                                   " == 0\n");

  opt_result res
    = runtime_alias_check_p (ddr, loop,
                             optimize_loop_nest_for_speed_p (loop));
  if (!res)
    return res;

  LOOP_VINFO_MAY_ALIAS_DDRS (loop_vinfo).safe_push (ddr);
  return opt_result::success ();
}
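
/* Usage note (illustrative): the early return above means that compiling
   with

     gcc -O3 --param vect-max-version-for-alias-checks=0

   disables versioning for alias altogether, so any ddr that would need a
   run-time alias check makes the loop non-vectorizable instead.  */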

/* Record that loop LOOP_VINFO needs to check that VALUE is nonzero.  */

static void
vect_check_nonzero_value (loop_vec_info loop_vinfo, tree value)
{
  vec<tree> checks = LOOP_VINFO_CHECK_NONZERO (loop_vinfo);
  for (unsigned int i = 0; i < checks.length (); ++i)
    if (checks[i] == value)
      return;

  if (dump_enabled_p ())
    dump_printf_loc (MSG_NOTE, vect_location,
                     "need run-time check that %T is nonzero\n",
                     value);
  LOOP_VINFO_CHECK_NONZERO (loop_vinfo).safe_push (value);
}

/* Return true if we know that the order of vectorized DR_INFO_A and
   vectorized DR_INFO_B will be the same as the order of DR_INFO_A and
   DR_INFO_B.  At least one of the accesses is a write.  */

static bool
vect_preserves_scalar_order_p (dr_vec_info *dr_info_a, dr_vec_info *dr_info_b)
{
  stmt_vec_info stmtinfo_a = dr_info_a->stmt;
  stmt_vec_info stmtinfo_b = dr_info_b->stmt;

  /* Single statements are always kept in their original order.  */
  if (!STMT_VINFO_GROUPED_ACCESS (stmtinfo_a)
      && !STMT_VINFO_GROUPED_ACCESS (stmtinfo_b))
    return true;

  /* STMT_A and STMT_B belong to overlapping groups.  All loads in a
     SLP group are emitted at the position of the last scalar load and
     all loads in an interleaving group are emitted at the position
     of the first scalar load.
     Stores in a group are emitted at the position of the last scalar store.
     Compute that position and check whether the resulting order matches
     the current one.
     We have not yet decided between SLP and interleaving so we have
     to conservatively assume both.  */
  stmt_vec_info il_a;
  stmt_vec_info last_a = il_a = DR_GROUP_FIRST_ELEMENT (stmtinfo_a);
  if (last_a)
    {
      for (stmt_vec_info s = DR_GROUP_NEXT_ELEMENT (last_a); s;
           s = DR_GROUP_NEXT_ELEMENT (s))
        last_a = get_later_stmt (last_a, s);
      if (!DR_IS_WRITE (STMT_VINFO_DATA_REF (stmtinfo_a)))
        {
          for (stmt_vec_info s = DR_GROUP_NEXT_ELEMENT (il_a); s;
               s = DR_GROUP_NEXT_ELEMENT (s))
            if (get_later_stmt (il_a, s) == il_a)
              il_a = s;
        }
      else
        il_a = last_a;
    }
  else
    last_a = il_a = stmtinfo_a;
  stmt_vec_info il_b;
  stmt_vec_info last_b = il_b = DR_GROUP_FIRST_ELEMENT (stmtinfo_b);
  if (last_b)
    {
      for (stmt_vec_info s = DR_GROUP_NEXT_ELEMENT (last_b); s;
           s = DR_GROUP_NEXT_ELEMENT (s))
        last_b = get_later_stmt (last_b, s);
      if (!DR_IS_WRITE (STMT_VINFO_DATA_REF (stmtinfo_b)))
        {
          for (stmt_vec_info s = DR_GROUP_NEXT_ELEMENT (il_b); s;
               s = DR_GROUP_NEXT_ELEMENT (s))
            if (get_later_stmt (il_b, s) == il_b)
              il_b = s;
        }
      else
        il_b = last_b;
    }
  else
    last_b = il_b = stmtinfo_b;
  bool a_after_b = (get_later_stmt (stmtinfo_a, stmtinfo_b) == stmtinfo_a);
  return (/* SLP */
          (get_later_stmt (last_a, last_b) == last_a) == a_after_b
          /* Interleaving */
          && (get_later_stmt (il_a, il_b) == il_a) == a_after_b
          /* Mixed */
          && (get_later_stmt (il_a, last_b) == il_a) == a_after_b
          && (get_later_stmt (last_a, il_b) == last_a) == a_after_b);
}
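
/* Illustrative example (assuming the two accesses may alias): given

     x0 = a[i];   x1 = a[i+1];    <- load group
     b[i] = x0;   b[i+1] = x1;    <- store group

   SLP emits the vector load at the position of the last scalar load
   "x1 = a[i+1]" and the vector store at the last scalar store
   "b[i+1] = x1", so the load still precedes the store; the four checks
   above verify this for every combination of SLP and interleaving
   placements of the two groups.  */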

/* A subroutine of vect_analyze_data_ref_dependence.  Handle
   DDR_COULD_BE_INDEPENDENT_P ddr DDR that has a known set of dependence
   distances.  These distances are conservatively correct but they don't
   reflect a guaranteed dependence.

   Return true if this function does all the work necessary to avoid
   an alias or false if the caller should use the dependence distances
   to limit the vectorization factor in the usual way.  LOOP_DEPTH is
   the depth of the loop described by LOOP_VINFO and the other arguments
   are as for vect_analyze_data_ref_dependence.  */

static bool
vect_analyze_possibly_independent_ddr (data_dependence_relation *ddr,
                                       loop_vec_info loop_vinfo,
                                       int loop_depth, unsigned int *max_vf)
{
  class loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
  lambda_vector dist_v;
  unsigned int i;
  FOR_EACH_VEC_ELT (DDR_DIST_VECTS (ddr), i, dist_v)
    {
      int dist = dist_v[loop_depth];
      if (dist != 0 && !(dist > 0 && DDR_REVERSED_P (ddr)))
        {
          /* If the user asserted safelen >= DIST consecutive iterations
             can be executed concurrently, assume independence.

             ??? An alternative would be to add the alias check even
             in this case, and vectorize the fallback loop with the
             maximum VF set to safelen.  However, if the user has
             explicitly given a length, it's less likely that that
             would be a win.  */
          if (loop->safelen >= 2 && abs_hwi (dist) <= loop->safelen)
            {
              if ((unsigned int) loop->safelen < *max_vf)
                *max_vf = loop->safelen;
              LOOP_VINFO_NO_DATA_DEPENDENCIES (loop_vinfo) = false;
              continue;
            }

          /* For dependence distances of 2 or more, we have the option
             of limiting VF or checking for an alias at runtime.
             Prefer to check at runtime if we can, to avoid limiting
             the VF unnecessarily when the bases are in fact independent.

             Note that the alias checks will be removed if the VF ends up
             being small enough.  */
          dr_vec_info *dr_info_a = loop_vinfo->lookup_dr (DDR_A (ddr));
          dr_vec_info *dr_info_b = loop_vinfo->lookup_dr (DDR_B (ddr));
          return (!STMT_VINFO_GATHER_SCATTER_P (dr_info_a->stmt)
                  && !STMT_VINFO_GATHER_SCATTER_P (dr_info_b->stmt)
                  && vect_mark_for_runtime_alias_test (ddr, loop_vinfo));
        }
    }
  return true;
}
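
/* Usage note (illustrative): loop->safelen is typically set from user
   annotations such as

     #pragma omp simd safelen(8)
     for (int i = 0; i < n; i++)
       a[i + 8] = a[i] + 1;

   which assert that iterations up to distance 8 may execute concurrently;
   the known dependence distance of 8 can then be treated as harmless while
   capping *max_vf at 8.  */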


/* Function vect_analyze_data_ref_dependence.

   FIXME: I needed to change the sense of the returned flag.

   Return FALSE if there (might) exist a dependence between a memory-reference
   DRA and a memory-reference DRB.  When versioning for alias may check a
   dependence at run-time, return TRUE.  Adjust *MAX_VF according to
   the data dependence.  */

static opt_result
vect_analyze_data_ref_dependence (struct data_dependence_relation *ddr,
                                  loop_vec_info loop_vinfo,
                                  unsigned int *max_vf)
{
  unsigned int i;
  class loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
  struct data_reference *dra = DDR_A (ddr);
  struct data_reference *drb = DDR_B (ddr);
  dr_vec_info *dr_info_a = loop_vinfo->lookup_dr (dra);
  dr_vec_info *dr_info_b = loop_vinfo->lookup_dr (drb);
  stmt_vec_info stmtinfo_a = dr_info_a->stmt;
  stmt_vec_info stmtinfo_b = dr_info_b->stmt;
  lambda_vector dist_v;
  unsigned int loop_depth;

  /* In loop analysis all data references should be vectorizable.  */
  if (!STMT_VINFO_VECTORIZABLE (stmtinfo_a)
      || !STMT_VINFO_VECTORIZABLE (stmtinfo_b))
    gcc_unreachable ();

  /* Independent data accesses.  */
  if (DDR_ARE_DEPENDENT (ddr) == chrec_known)
    return opt_result::success ();

  if (dra == drb
      || (DR_IS_READ (dra) && DR_IS_READ (drb)))
    return opt_result::success ();

  /* We do not have to consider dependences between accesses that belong
     to the same group, unless the stride could be smaller than the
     group size.  */
  if (DR_GROUP_FIRST_ELEMENT (stmtinfo_a)
      && (DR_GROUP_FIRST_ELEMENT (stmtinfo_a)
          == DR_GROUP_FIRST_ELEMENT (stmtinfo_b))
      && !STMT_VINFO_STRIDED_P (stmtinfo_a))
    return opt_result::success ();

  /* Even if we have an anti-dependence then, as the vectorized loop covers at
     least two scalar iterations, there is always also a true dependence.
     As the vectorizer does not re-order loads and stores we can ignore
     the anti-dependence if TBAA can disambiguate both DRs similar to the
     case with known negative distance anti-dependences (positive
     distance anti-dependences would violate TBAA constraints).  */
  if (((DR_IS_READ (dra) && DR_IS_WRITE (drb))
       || (DR_IS_WRITE (dra) && DR_IS_READ (drb)))
      && !alias_sets_conflict_p (get_alias_set (DR_REF (dra)),
                                 get_alias_set (DR_REF (drb))))
    return opt_result::success ();

  /* Unknown data dependence.  */
  if (DDR_ARE_DEPENDENT (ddr) == chrec_dont_know)
    {
      /* If user asserted safelen consecutive iterations can be
         executed concurrently, assume independence.  */
      if (loop->safelen >= 2)
        {
          if ((unsigned int) loop->safelen < *max_vf)
            *max_vf = loop->safelen;
          LOOP_VINFO_NO_DATA_DEPENDENCIES (loop_vinfo) = false;
          return opt_result::success ();
        }

      if (STMT_VINFO_GATHER_SCATTER_P (stmtinfo_a)
          || STMT_VINFO_GATHER_SCATTER_P (stmtinfo_b))
        return opt_result::failure_at
          (stmtinfo_a->stmt,
           "versioning for alias not supported for: "
           "can't determine dependence between %T and %T\n",
           DR_REF (dra), DR_REF (drb));

      if (dump_enabled_p ())
        dump_printf_loc (MSG_MISSED_OPTIMIZATION, stmtinfo_a->stmt,
                         "versioning for alias required: "
                         "can't determine dependence between %T and %T\n",
                         DR_REF (dra), DR_REF (drb));

      /* Add to list of ddrs that need to be tested at run-time.  */
      return vect_mark_for_runtime_alias_test (ddr, loop_vinfo);
    }

  /* Known data dependence.  */
  if (DDR_NUM_DIST_VECTS (ddr) == 0)
    {
      /* If user asserted safelen consecutive iterations can be
         executed concurrently, assume independence.  */
      if (loop->safelen >= 2)
        {
          if ((unsigned int) loop->safelen < *max_vf)
            *max_vf = loop->safelen;
          LOOP_VINFO_NO_DATA_DEPENDENCIES (loop_vinfo) = false;
          return opt_result::success ();
        }

      if (STMT_VINFO_GATHER_SCATTER_P (stmtinfo_a)
          || STMT_VINFO_GATHER_SCATTER_P (stmtinfo_b))
        return opt_result::failure_at
          (stmtinfo_a->stmt,
           "versioning for alias not supported for: "
           "bad dist vector for %T and %T\n",
           DR_REF (dra), DR_REF (drb));

      if (dump_enabled_p ())
        dump_printf_loc (MSG_MISSED_OPTIMIZATION, stmtinfo_a->stmt,
                         "versioning for alias required: "
                         "bad dist vector for %T and %T\n",
                         DR_REF (dra), DR_REF (drb));
      /* Add to list of ddrs that need to be tested at run-time.  */
      return vect_mark_for_runtime_alias_test (ddr, loop_vinfo);
    }

  loop_depth = index_in_loop_nest (loop->num, DDR_LOOP_NEST (ddr));

  if (DDR_COULD_BE_INDEPENDENT_P (ddr)
      && vect_analyze_possibly_independent_ddr (ddr, loop_vinfo,
                                                loop_depth, max_vf))
    return opt_result::success ();

  FOR_EACH_VEC_ELT (DDR_DIST_VECTS (ddr), i, dist_v)
    {
      int dist = dist_v[loop_depth];

      if (dump_enabled_p ())
        dump_printf_loc (MSG_NOTE, vect_location,
                         "dependence distance = %d.\n", dist);

      if (dist == 0)
        {
          if (dump_enabled_p ())
            dump_printf_loc (MSG_NOTE, vect_location,
                             "dependence distance == 0 between %T and %T\n",
                             DR_REF (dra), DR_REF (drb));

          /* When we perform grouped accesses and perform implicit CSE
             by detecting equal accesses and doing disambiguation with
             runtime alias tests like for
                .. = a[i];
                .. = a[i+1];
                a[i] = ..;
                a[i+1] = ..;
                *p = ..;
                .. = a[i];
                .. = a[i+1];
             where we will end up loading { a[i], a[i+1] } once, make
             sure that inserting group loads before the first load and
             stores after the last store will do the right thing.
             Similar for groups like
                a[i] = ...;
                ... = a[i];
                a[i+1] = ...;
             where loads from the group interleave with the store.  */
          if (!vect_preserves_scalar_order_p (dr_info_a, dr_info_b))
            return opt_result::failure_at (stmtinfo_a->stmt,
                                           "READ_WRITE dependence"
                                           " in interleaving.\n");

          if (loop->safelen < 2)
            {
              tree indicator = dr_zero_step_indicator (dra);
              if (!indicator || integer_zerop (indicator))
                return opt_result::failure_at (stmtinfo_a->stmt,
                                               "access also has a zero step\n");
              else if (TREE_CODE (indicator) != INTEGER_CST)
                vect_check_nonzero_value (loop_vinfo, indicator);
            }
          continue;
        }

      if (dist > 0 && DDR_REVERSED_P (ddr))
        {
          /* If DDR_REVERSED_P the order of the data-refs in DDR was
             reversed (to make distance vector positive), and the actual
             distance is negative.  */
          if (dump_enabled_p ())
            dump_printf_loc (MSG_NOTE, vect_location,
                             "dependence distance negative.\n");
          /* When doing outer loop vectorization, we need to check if there is
             a backward dependence at the inner loop level if the dependence
             at the outer loop is reversed.  See PR81740.  */
          if (nested_in_vect_loop_p (loop, stmtinfo_a)
              || nested_in_vect_loop_p (loop, stmtinfo_b))
            {
              unsigned inner_depth = index_in_loop_nest (loop->inner->num,
                                                         DDR_LOOP_NEST (ddr));
              if (dist_v[inner_depth] < 0)
                return opt_result::failure_at (stmtinfo_a->stmt,
                                               "not vectorized, dependence "
                                               "between data-refs %T and %T\n",
                                               DR_REF (dra), DR_REF (drb));
            }
          /* Record a negative dependence distance to later limit the
             amount of stmt copying / unrolling we can perform.
             Only need to handle read-after-write dependence.  */
          if (DR_IS_READ (drb)
              && (STMT_VINFO_MIN_NEG_DIST (stmtinfo_b) == 0
                  || STMT_VINFO_MIN_NEG_DIST (stmtinfo_b) > (unsigned)dist))
            STMT_VINFO_MIN_NEG_DIST (stmtinfo_b) = dist;
          continue;
        }

      unsigned int abs_dist = abs (dist);
      if (abs_dist >= 2 && abs_dist < *max_vf)
        {
          /* The dependence distance requires reduction of the maximal
             vectorization factor.  */
          *max_vf = abs_dist;
          if (dump_enabled_p ())
            dump_printf_loc (MSG_NOTE, vect_location,
                             "adjusting maximal vectorization factor to %i\n",
                             *max_vf);
        }

      if (abs_dist >= *max_vf)
        {
          /* Dependence distance does not create dependence, as far as
             vectorization is concerned, in this case.  */
          if (dump_enabled_p ())
            dump_printf_loc (MSG_NOTE, vect_location,
                             "dependence distance >= VF.\n");
          continue;
        }

      return opt_result::failure_at (stmtinfo_a->stmt,
                                     "not vectorized, possible dependence "
                                     "between data-refs %T and %T\n",
                                     DR_REF (dra), DR_REF (drb));
    }

  return opt_result::success ();
}
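
/* Worked example (illustrative): in

     for (int i = 0; i < n; i++)
       a[i + 4] = a[i];

   the read-after-write dependence has distance 4, so the loop remains
   vectorizable for any VF <= 4; the loop above implements this by lowering
   *max_vf to the distance when the distance is the smaller of the two.  */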

/* Function vect_analyze_data_ref_dependences.

   Examine all the data references in the loop, and make sure there do not
   exist any data dependences between them.  Set *MAX_VF according to
   the maximum vectorization factor the data dependences allow.  */

opt_result
vect_analyze_data_ref_dependences (loop_vec_info loop_vinfo,
                                   unsigned int *max_vf)
{
  unsigned int i;
  struct data_dependence_relation *ddr;

  DUMP_VECT_SCOPE ("vect_analyze_data_ref_dependences");

  if (!LOOP_VINFO_DDRS (loop_vinfo).exists ())
    {
      LOOP_VINFO_DDRS (loop_vinfo)
        .create (LOOP_VINFO_DATAREFS (loop_vinfo).length ()
                 * LOOP_VINFO_DATAREFS (loop_vinfo).length ());
      /* We need read-read dependences to compute
         STMT_VINFO_SAME_ALIGN_REFS.  */
      bool res = compute_all_dependences (LOOP_VINFO_DATAREFS (loop_vinfo),
                                          &LOOP_VINFO_DDRS (loop_vinfo),
                                          LOOP_VINFO_LOOP_NEST (loop_vinfo),
                                          true);
      gcc_assert (res);
    }

  LOOP_VINFO_NO_DATA_DEPENDENCIES (loop_vinfo) = true;

  /* For epilogues we either have no aliases or alias versioning
     was applied to original loop.  Therefore we may just get max_vf
     using VF of original loop.  */
  if (LOOP_VINFO_EPILOGUE_P (loop_vinfo))
    *max_vf = LOOP_VINFO_ORIG_MAX_VECT_FACTOR (loop_vinfo);
  else
    FOR_EACH_VEC_ELT (LOOP_VINFO_DDRS (loop_vinfo), i, ddr)
      {
        opt_result res
          = vect_analyze_data_ref_dependence (ddr, loop_vinfo, max_vf);
        if (!res)
          return res;
      }

  return opt_result::success ();
}


/* Function vect_slp_analyze_data_ref_dependence.

   Return TRUE if there (might) exist a dependence between a memory-reference
   DRA and a memory-reference DRB for VINFO.  When versioning for alias
   may check a dependence at run-time, return FALSE.  */

static bool
vect_slp_analyze_data_ref_dependence (vec_info *vinfo,
                                      struct data_dependence_relation *ddr)
{
  struct data_reference *dra = DDR_A (ddr);
  struct data_reference *drb = DDR_B (ddr);
  dr_vec_info *dr_info_a = vinfo->lookup_dr (dra);
  dr_vec_info *dr_info_b = vinfo->lookup_dr (drb);

  /* We need to check dependences of statements marked as unvectorizable
     as well, they still can prohibit vectorization.  */

  /* Independent data accesses.  */
  if (DDR_ARE_DEPENDENT (ddr) == chrec_known)
    return false;

  if (dra == drb)
    return false;

  /* Read-read is OK.  */
  if (DR_IS_READ (dra) && DR_IS_READ (drb))
    return false;

  /* If dra and drb are part of the same interleaving chain consider
     them independent.  */
  if (STMT_VINFO_GROUPED_ACCESS (dr_info_a->stmt)
      && (DR_GROUP_FIRST_ELEMENT (dr_info_a->stmt)
          == DR_GROUP_FIRST_ELEMENT (dr_info_b->stmt)))
    return false;

  /* Unknown data dependence.  */
  if (DDR_ARE_DEPENDENT (ddr) == chrec_dont_know)
    {
      if (dump_enabled_p ())
        dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
                         "can't determine dependence between %T and %T\n",
                         DR_REF (dra), DR_REF (drb));
    }
  else if (dump_enabled_p ())
    dump_printf_loc (MSG_NOTE, vect_location,
                     "determined dependence between %T and %T\n",
                     DR_REF (dra), DR_REF (drb));

  return true;
}


/* Analyze dependences involved in the transform of SLP NODE.  STORES
   contain the vector of scalar stores of this instance if we are
   disambiguating the loads.  */

static bool
vect_slp_analyze_node_dependences (slp_instance instance, slp_tree node,
                                   vec<stmt_vec_info> stores,
                                   stmt_vec_info last_store_info)
{
  /* This walks over all stmts involved in the SLP load/store done
     in NODE verifying we can sink them up to the last stmt in the
     group.  */
  stmt_vec_info last_access_info = vect_find_last_scalar_stmt_in_slp (node);
  vec_info *vinfo = last_access_info->vinfo;
  for (unsigned k = 0; k < SLP_INSTANCE_GROUP_SIZE (instance); ++k)
    {
      stmt_vec_info access_info = SLP_TREE_SCALAR_STMTS (node)[k];
      if (access_info == last_access_info)
        continue;
      data_reference *dr_a = STMT_VINFO_DATA_REF (access_info);
      ao_ref ref;
      bool ref_initialized_p = false;
      for (gimple_stmt_iterator gsi = gsi_for_stmt (access_info->stmt);
           gsi_stmt (gsi) != last_access_info->stmt; gsi_next (&gsi))
        {
          gimple *stmt = gsi_stmt (gsi);
          if (! gimple_vuse (stmt)
              || (DR_IS_READ (dr_a) && ! gimple_vdef (stmt)))
            continue;

          /* If we couldn't record a (single) data reference for this
             stmt we have to resort to the alias oracle.  */
          stmt_vec_info stmt_info = vinfo->lookup_stmt (stmt);
          data_reference *dr_b = STMT_VINFO_DATA_REF (stmt_info);
          if (!dr_b)
            {
              /* We are moving a store or sinking a load - this means
                 we cannot use TBAA for disambiguation.  */
              if (!ref_initialized_p)
                ao_ref_init (&ref, DR_REF (dr_a));
              if (stmt_may_clobber_ref_p_1 (stmt, &ref, false)
                  || ref_maybe_used_by_stmt_p (stmt, &ref, false))
                return false;
              continue;
            }

          bool dependent = false;
          /* If we run into a store of this same instance (we've just
             marked those) then delay dependence checking until we run
             into the last store because this is where it will have
             been sunk to (and we verify if we can do that as well).  */
          if (gimple_visited_p (stmt))
            {
              if (stmt_info != last_store_info)
                continue;
              unsigned i;
              stmt_vec_info store_info;
              FOR_EACH_VEC_ELT (stores, i, store_info)
                {
                  data_reference *store_dr = STMT_VINFO_DATA_REF (store_info);
                  ddr_p ddr = initialize_data_dependence_relation
                                (dr_a, store_dr, vNULL);
                  dependent
                    = vect_slp_analyze_data_ref_dependence (vinfo, ddr);
                  free_dependence_relation (ddr);
                  if (dependent)
                    break;
                }
            }
          else
            {
              ddr_p ddr = initialize_data_dependence_relation (dr_a,
                                                               dr_b, vNULL);
              dependent = vect_slp_analyze_data_ref_dependence (vinfo, ddr);
              free_dependence_relation (ddr);
            }
          if (dependent)
            return false;
        }
    }
  return true;
}


/* Function vect_slp_analyze_instance_dependence.

   Examine all the data references in the SLP instance, and make sure there
   do not exist any data dependences between them that would prevent the
   transform.  */

bool
vect_slp_analyze_instance_dependence (slp_instance instance)
{
  DUMP_VECT_SCOPE ("vect_slp_analyze_instance_dependence");

  /* The stores of this instance are at the root of the SLP tree.  */
  slp_tree store = SLP_INSTANCE_TREE (instance);
  if (! STMT_VINFO_DATA_REF (SLP_TREE_SCALAR_STMTS (store)[0]))
    store = NULL;

  /* Verify we can sink stores to the vectorized stmt insert location.  */
  stmt_vec_info last_store_info = NULL;
  if (store)
    {
      if (! vect_slp_analyze_node_dependences (instance, store, vNULL, NULL))
        return false;

      /* Mark stores in this instance and remember the last one.  */
      last_store_info = vect_find_last_scalar_stmt_in_slp (store);
      for (unsigned k = 0; k < SLP_INSTANCE_GROUP_SIZE (instance); ++k)
        gimple_set_visited (SLP_TREE_SCALAR_STMTS (store)[k]->stmt, true);
    }

  bool res = true;

  /* Verify we can sink loads to the vectorized stmt insert location,
     special-casing stores of this instance.  */
  slp_tree load;
  unsigned int i;
  FOR_EACH_VEC_ELT (SLP_INSTANCE_LOADS (instance), i, load)
    if (! vect_slp_analyze_node_dependences (instance, load,
                                             store
                                             ? SLP_TREE_SCALAR_STMTS (store)
                                             : vNULL, last_store_info))
      {
        res = false;
        break;
      }

  /* Unset the visited flag.  */
  if (store)
    for (unsigned k = 0; k < SLP_INSTANCE_GROUP_SIZE (instance); ++k)
      gimple_set_visited (SLP_TREE_SCALAR_STMTS (store)[k]->stmt, false);

  return res;
}

/* Record the base alignment guarantee given by DRB, which occurs
   in STMT_INFO.  */

static void
vect_record_base_alignment (stmt_vec_info stmt_info,
                            innermost_loop_behavior *drb)
{
  vec_info *vinfo = stmt_info->vinfo;
  bool existed;
  innermost_loop_behavior *&entry
    = vinfo->base_alignments.get_or_insert (drb->base_address, &existed);
  if (!existed || entry->base_alignment < drb->base_alignment)
    {
      entry = drb;
      if (dump_enabled_p ())
        dump_printf_loc (MSG_NOTE, vect_location,
                         "recording new base alignment for %T\n"
                         "  alignment:    %d\n"
                         "  misalignment: %d\n"
                         "  based on:     %G",
                         drb->base_address,
                         drb->base_alignment,
                         drb->base_misalignment,
                         stmt_info->stmt);
    }
}

/* If the region we're going to vectorize is reached, all unconditional
   data references occur at least once.  We can therefore pool the base
   alignment guarantees from each unconditional reference.  Do this by
   going through all the data references in VINFO and checking whether
   the containing statement makes the reference unconditionally.  If so,
   record the alignment of the base address in VINFO so that it can be
   used for all other references with the same base.  */

void
vect_record_base_alignments (vec_info *vinfo)
{
  loop_vec_info loop_vinfo = dyn_cast <loop_vec_info> (vinfo);
  class loop *loop = loop_vinfo ? LOOP_VINFO_LOOP (loop_vinfo) : NULL;
  data_reference *dr;
  unsigned int i;
  FOR_EACH_VEC_ELT (vinfo->shared->datarefs, i, dr)
    {
      dr_vec_info *dr_info = vinfo->lookup_dr (dr);
      stmt_vec_info stmt_info = dr_info->stmt;
      if (!DR_IS_CONDITIONAL_IN_STMT (dr)
          && STMT_VINFO_VECTORIZABLE (stmt_info)
          && !STMT_VINFO_GATHER_SCATTER_P (stmt_info))
        {
          vect_record_base_alignment (stmt_info, &DR_INNERMOST (dr));

          /* If DR is nested in the loop that is being vectorized, we can also
             record the alignment of the base wrt the outer loop.  */
          if (loop && nested_in_vect_loop_p (loop, stmt_info))
            vect_record_base_alignment
              (stmt_info, &STMT_VINFO_DR_WRT_VEC_LOOP (stmt_info));
        }
    }
}
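
/* Illustrative note: if one unconditional statement accesses a base
   address with a recorded 32-byte alignment guarantee while another
   reference to the same base only derives 8-byte alignment locally, the
   pool built above lets the weaker reference reuse the 32-byte guarantee,
   because reaching the vectorized region proves that the stronger access
   executes as well.  */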

/* Return the target alignment for the vectorized form of DR_INFO.  */

static poly_uint64
vect_calculate_target_alignment (dr_vec_info *dr_info)
{
  tree vectype = STMT_VINFO_VECTYPE (dr_info->stmt);
  return targetm.vectorize.preferred_vector_alignment (vectype);
}
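
/* Note (illustrative): the default implementation of this target hook
   simply returns TYPE_ALIGN (vectype); targets may prefer a different
   alignment, and for variable-length vector types the result need not be
   a compile-time constant, hence the poly_uint64 return type.  */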

/* Function vect_compute_data_ref_alignment

   Compute the misalignment of the data reference DR_INFO.

   Output:
   1. DR_MISALIGNMENT (DR_INFO) is defined.

   FOR NOW: No analysis is actually performed.  Misalignment is calculated
   only for trivial cases.  TODO.  */

static void
vect_compute_data_ref_alignment (dr_vec_info *dr_info)
{
  stmt_vec_info stmt_info = dr_info->stmt;
  vec_base_alignments *base_alignments = &stmt_info->vinfo->base_alignments;
  loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
  class loop *loop = NULL;
  tree ref = DR_REF (dr_info->dr);
  tree vectype = STMT_VINFO_VECTYPE (stmt_info);

  if (dump_enabled_p ())
    dump_printf_loc (MSG_NOTE, vect_location,
                     "vect_compute_data_ref_alignment:\n");

  if (loop_vinfo)
    loop = LOOP_VINFO_LOOP (loop_vinfo);

  /* Initialize misalignment to unknown.  */
  SET_DR_MISALIGNMENT (dr_info, DR_MISALIGNMENT_UNKNOWN);

  if (STMT_VINFO_GATHER_SCATTER_P (stmt_info))
    return;

  innermost_loop_behavior *drb = vect_dr_behavior (dr_info);
  bool step_preserves_misalignment_p;

  poly_uint64 vector_alignment
    = exact_div (vect_calculate_target_alignment (dr_info), BITS_PER_UNIT);
  DR_TARGET_ALIGNMENT (dr_info) = vector_alignment;

  unsigned HOST_WIDE_INT vect_align_c;
  if (!vector_alignment.is_constant (&vect_align_c))
    return;

  /* No step for BB vectorization.  */
  if (!loop)
    {
      gcc_assert (integer_zerop (drb->step));
      step_preserves_misalignment_p = true;
    }

  /* In case the dataref is in an inner-loop of the loop that is being
     vectorized (LOOP), we use the base and misalignment information
     relative to the outer-loop (LOOP).  This is ok only if the misalignment
     stays the same throughout the execution of the inner-loop, which is why
     we have to check that the stride of the dataref in the inner-loop evenly
     divides by the vector alignment.  */
  else if (nested_in_vect_loop_p (loop, stmt_info))
    {
      step_preserves_misalignment_p
        = (DR_STEP_ALIGNMENT (dr_info->dr) % vect_align_c) == 0;

      if (dump_enabled_p ())
        {
          if (step_preserves_misalignment_p)
            dump_printf_loc (MSG_NOTE, vect_location,
                             "inner step divides the vector alignment.\n");
          else
            dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
                             "inner step doesn't divide the vector"
                             " alignment.\n");
        }
    }

  /* Similarly we can only use base and misalignment information relative to
     an innermost loop if the misalignment stays the same throughout the
     execution of the loop.  As above, this is the case if the stride of
     the dataref evenly divides by the alignment.  */
  else
    {
      poly_uint64 vf = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
      step_preserves_misalignment_p
        = multiple_p (DR_STEP_ALIGNMENT (dr_info->dr) * vf, vect_align_c);

      if (!step_preserves_misalignment_p && dump_enabled_p ())
        dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
                         "step doesn't divide the vector alignment.\n");
    }

  unsigned int base_alignment = drb->base_alignment;
  unsigned int base_misalignment = drb->base_misalignment;

  /* Calculate the maximum of the pooled base address alignment and the
     alignment that we can compute for DR itself.  */
  innermost_loop_behavior **entry = base_alignments->get (drb->base_address);
  if (entry && base_alignment < (*entry)->base_alignment)
    {
      base_alignment = (*entry)->base_alignment;
      base_misalignment = (*entry)->base_misalignment;
    }

  if (drb->offset_alignment < vect_align_c
      || !step_preserves_misalignment_p
      /* We need to know whether the step wrt the vectorized loop is
         negative when computing the starting misalignment below.  */
      || TREE_CODE (drb->step) != INTEGER_CST)
    {
      if (dump_enabled_p ())
        dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
                         "Unknown alignment for access: %T\n", ref);
      return;
    }

  if (base_alignment < vect_align_c)
    {
      unsigned int max_alignment;
      tree base = get_base_for_alignment (drb->base_address, &max_alignment);
      if (max_alignment < vect_align_c
          || !vect_can_force_dr_alignment_p (base,
                                             vect_align_c * BITS_PER_UNIT))
        {
          if (dump_enabled_p ())
            dump_printf_loc (MSG_NOTE, vect_location,
                             "can't force alignment of ref: %T\n", ref);
          return;
        }

      /* Force the alignment of the decl.
         NOTE: This is the only change to the code we make during
         the analysis phase, before deciding to vectorize the loop.  */
      if (dump_enabled_p ())
        dump_printf_loc (MSG_NOTE, vect_location,
                         "force alignment of %T\n", ref);

      dr_info->base_decl = base;
      dr_info->base_misaligned = true;
      base_misalignment = 0;
    }
  poly_int64 misalignment
    = base_misalignment + wi::to_poly_offset (drb->init).force_shwi ();

  /* If this is a backward running DR then first access in the larger
     vectype actually is N-1 elements before the address in the DR.
     Adjust misalign accordingly.  */
  if (tree_int_cst_sgn (drb->step) < 0)
    /* PLUS because STEP is negative.  */
    misalignment += ((TYPE_VECTOR_SUBPARTS (vectype) - 1)
                     * TREE_INT_CST_LOW (drb->step));

  unsigned int const_misalignment;
  if (!known_misalignment (misalignment, vect_align_c, &const_misalignment))
    {
      if (dump_enabled_p ())
        dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
                         "Non-constant misalignment for access: %T\n", ref);
      return;
    }

  SET_DR_MISALIGNMENT (dr_info, const_misalignment);

  if (dump_enabled_p ())
    dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
                     "misalign = %d bytes of ref %T\n",
                     DR_MISALIGNMENT (dr_info), ref);

  return;
}
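
/* Worked example (illustrative): for a data-ref a[i+1] with 4-byte "int"
   elements, a base known to be 16-byte aligned and a 16-byte vector
   alignment, base_misalignment is 0 and drb->init is 4, so the recorded
   misalignment is (0 + 4) mod 16 = 4 bytes.  */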

/* Function vect_update_misalignment_for_peel.
   Sets DR_INFO's misalignment
   - to 0 if it has the same alignment as DR_PEEL_INFO,
   - to the misalignment computed using NPEEL if DR_INFO's misalignment
     is known,
   - to -1 (unknown) otherwise.

   DR_INFO - the data reference whose misalignment is to be adjusted.
   DR_PEEL_INFO - the data reference whose misalignment is being made
                  zero in the vector loop by the peel.
   NPEEL - the number of iterations in the peel loop if the misalignment
           of DR_PEEL_INFO is known at compile time.  */

static void
vect_update_misalignment_for_peel (dr_vec_info *dr_info,
                                   dr_vec_info *dr_peel_info, int npeel)
{
  unsigned int i;
  vec<dr_p> same_aligned_drs;
  struct data_reference *current_dr;
  stmt_vec_info peel_stmt_info = dr_peel_info->stmt;

  /* It can be assumed that if dr_info has the same alignment as dr_peel,
     it is aligned in the vector loop.  */
  same_aligned_drs = STMT_VINFO_SAME_ALIGN_REFS (peel_stmt_info);
  FOR_EACH_VEC_ELT (same_aligned_drs, i, current_dr)
    {
      if (current_dr != dr_info->dr)
        continue;
      gcc_assert (!known_alignment_for_access_p (dr_info)
                  || !known_alignment_for_access_p (dr_peel_info)
                  || (DR_MISALIGNMENT (dr_info)
                      == DR_MISALIGNMENT (dr_peel_info)));
      SET_DR_MISALIGNMENT (dr_info, 0);
      return;
    }

  unsigned HOST_WIDE_INT alignment;
  if (DR_TARGET_ALIGNMENT (dr_info).is_constant (&alignment)
      && known_alignment_for_access_p (dr_info)
      && known_alignment_for_access_p (dr_peel_info))
    {
      int misal = DR_MISALIGNMENT (dr_info);
      misal += npeel * TREE_INT_CST_LOW (DR_STEP (dr_info->dr));
      misal &= alignment - 1;
      SET_DR_MISALIGNMENT (dr_info, misal);
      return;
    }

  if (dump_enabled_p ())
    dump_printf_loc (MSG_NOTE, vect_location, "Setting misalignment " \
                     "to unknown (-1).\n");
  SET_DR_MISALIGNMENT (dr_info, DR_MISALIGNMENT_UNKNOWN);
}
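
/* Worked example (illustrative): with a 16-byte target alignment, an "int"
   access (step 4) whose known misalignment is 4 becomes aligned after
   peeling npeel = 3 iterations: (4 + 3 * 4) & (16 - 1) = 0.  */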


/* Function verify_data_ref_alignment

   Return TRUE if DR_INFO can be handled with respect to alignment.  */

static opt_result
verify_data_ref_alignment (dr_vec_info *dr_info)
{
  enum dr_alignment_support supportable_dr_alignment
    = vect_supportable_dr_alignment (dr_info, false);
  if (!supportable_dr_alignment)
    return opt_result::failure_at
      (dr_info->stmt->stmt,
       DR_IS_READ (dr_info->dr)
       ? "not vectorized: unsupported unaligned load: %T\n"
       : "not vectorized: unsupported unaligned store: %T\n",
       DR_REF (dr_info->dr));

  if (supportable_dr_alignment != dr_aligned && dump_enabled_p ())
    dump_printf_loc (MSG_NOTE, vect_location,
                     "Vectorizing an unaligned access.\n");

  return opt_result::success ();
}

/* Function vect_verify_datarefs_alignment

   Return TRUE if all data references in the loop can be
   handled with respect to alignment.  */

opt_result
vect_verify_datarefs_alignment (loop_vec_info vinfo)
{
  vec<data_reference_p> datarefs = vinfo->shared->datarefs;
  struct data_reference *dr;
  unsigned int i;

  FOR_EACH_VEC_ELT (datarefs, i, dr)
    {
      dr_vec_info *dr_info = vinfo->lookup_dr (dr);
      stmt_vec_info stmt_info = dr_info->stmt;

      if (!STMT_VINFO_RELEVANT_P (stmt_info))
        continue;

      /* For interleaving, only the alignment of the first access matters.  */
      if (STMT_VINFO_GROUPED_ACCESS (stmt_info)
          && DR_GROUP_FIRST_ELEMENT (stmt_info) != stmt_info)
        continue;

      /* Strided accesses perform only component accesses, alignment is
         irrelevant for them.  */
      if (STMT_VINFO_STRIDED_P (stmt_info)
          && !STMT_VINFO_GROUPED_ACCESS (stmt_info))
        continue;

      opt_result res = verify_data_ref_alignment (dr_info);
      if (!res)
        return res;
    }

  return opt_result::success ();
}

/* Given a memory reference EXP return whether its alignment is less
   than its size.  */

static bool
not_size_aligned (tree exp)
{
  if (!tree_fits_uhwi_p (TYPE_SIZE (TREE_TYPE (exp))))
    return true;

  return (tree_to_uhwi (TYPE_SIZE (TREE_TYPE (exp)))
          > get_object_alignment (exp));
}

/* Function vector_alignment_reachable_p

   Return true if vector alignment for DR_INFO is reachable by peeling
   a few loop iterations.  Return false otherwise.  */

static bool
vector_alignment_reachable_p (dr_vec_info *dr_info)
{
  stmt_vec_info stmt_info = dr_info->stmt;
  tree vectype = STMT_VINFO_VECTYPE (stmt_info);

  if (STMT_VINFO_GROUPED_ACCESS (stmt_info))
    {
      /* For interleaved access we peel only if number of iterations in
         the prolog loop ({VF - misalignment}), is a multiple of the
         number of the interleaved accesses.  */
      int elem_size, mis_in_elements;

      /* FORNOW: handle only known alignment.  */
      if (!known_alignment_for_access_p (dr_info))
        return false;

      poly_uint64 nelements = TYPE_VECTOR_SUBPARTS (vectype);
      poly_uint64 vector_size = GET_MODE_SIZE (TYPE_MODE (vectype));
      elem_size = vector_element_size (vector_size, nelements);
      mis_in_elements = DR_MISALIGNMENT (dr_info) / elem_size;

      if (!multiple_p (nelements - mis_in_elements, DR_GROUP_SIZE (stmt_info)))
        return false;
    }

  /* If misalignment is known at the compile time then allow peeling
     only if natural alignment is reachable through peeling.  */
  if (known_alignment_for_access_p (dr_info) && !aligned_access_p (dr_info))
    {
      HOST_WIDE_INT elmsize =
                int_cst_value (TYPE_SIZE_UNIT (TREE_TYPE (vectype)));
      if (dump_enabled_p ())
        {
          dump_printf_loc (MSG_NOTE, vect_location,
                           "data size = %wd. misalignment = %d.\n", elmsize,
                           DR_MISALIGNMENT (dr_info));
        }
      if (DR_MISALIGNMENT (dr_info) % elmsize)
        {
          if (dump_enabled_p ())
            dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
                             "data size does not divide the misalignment.\n");
          return false;
        }
    }

  if (!known_alignment_for_access_p (dr_info))
    {
      tree type = TREE_TYPE (DR_REF (dr_info->dr));
      bool is_packed = not_size_aligned (DR_REF (dr_info->dr));
      if (dump_enabled_p ())
        dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
                         "Unknown misalignment, %snaturally aligned\n",
                         is_packed ? "not " : "");
      return targetm.vectorize.vector_alignment_reachable (type, is_packed);
    }

  return true;
}
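
/* Worked example (illustrative): with 4-element vectors, a misalignment of
   one element and an interleaving group of size 2, peeling would need
   4 - 1 = 3 prolog iterations; 3 is not a multiple of the group size 2,
   so the grouped-access check above rejects peeling for that group.  */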
1265 | ||
0822b158 | 1266 | |
abc9513d | 1267 | /* Calculate the cost of the memory access represented by DR_INFO. */ |
0822b158 | 1268 | |
f97dec81 | 1269 | static void |
abc9513d | 1270 | vect_get_data_access_cost (dr_vec_info *dr_info, |
0822b158 | 1271 | unsigned int *inside_cost, |
f97dec81 | 1272 | unsigned int *outside_cost, |
28d0cd4a | 1273 | stmt_vector_for_cost *body_cost_vec, |
1274 | stmt_vector_for_cost *prologue_cost_vec) | |
0822b158 | 1275 | { |
abc9513d | 1276 | stmt_vec_info stmt_info = dr_info->stmt; |
0822b158 | 1277 | loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info); |
4eb17cb6 | 1278 | int ncopies; |
1279 | ||
1280 | if (PURE_SLP_STMT (stmt_info)) | |
1281 | ncopies = 1; | |
1282 | else | |
1283 | ncopies = vect_get_num_copies (loop_vinfo, STMT_VINFO_VECTYPE (stmt_info)); | |
0822b158 | 1284 | |
abc9513d | 1285 | if (DR_IS_READ (dr_info->dr)) |
1ce0a2db | 1286 | vect_get_load_cost (stmt_info, ncopies, true, inside_cost, outside_cost, |
28d0cd4a | 1287 | prologue_cost_vec, body_cost_vec, false); |
0822b158 | 1288 | else |
1ce0a2db | 1289 | vect_get_store_cost (stmt_info, ncopies, inside_cost, body_cost_vec); |
0822b158 | 1290 | |
6d8fb6cf | 1291 | if (dump_enabled_p ()) |
7bd765d4 | 1292 | dump_printf_loc (MSG_NOTE, vect_location, |
1293 | "vect_get_data_access_cost: inside_cost = %d, " | |
78bb46f5 | 1294 | "outside_cost = %d.\n", *inside_cost, *outside_cost); |
0822b158 | 1295 | } |
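
/* Editorial sketch (not part of the original file): NCOPIES above is the
   number of vector statements a single scalar statement expands to -- the
   vectorization factor divided by the number of elements per vector --
   except for pure SLP statements, where the group itself supplies the
   parallelism and one copy suffices.  A plain-integer rendition, assuming
   VF is a multiple of NUNITS:  */

static int
example_ncopies (int vf, int nunits, bool pure_slp)
{
  return pure_slp ? 1 : vf / nunits;
}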
1296 | ||
1297 | ||
41500e78 | 1298 | typedef struct _vect_peel_info |
1299 | { | |
abc9513d | 1300 | dr_vec_info *dr_info; |
487798e2 | 1301 | int npeel; |
41500e78 | 1302 | unsigned int count; |
1303 | } *vect_peel_info; | |
1304 | ||
1305 | typedef struct _vect_peel_extended_info | |
1306 | { | |
1307 | struct _vect_peel_info peel_info; | |
1308 | unsigned int inside_cost; | |
1309 | unsigned int outside_cost; | |
41500e78 | 1310 | } *vect_peel_extended_info; |
1311 | ||
1312 | ||
1313 | /* Peeling hashtable helpers. */ | |
1314 | ||
1315 | struct peel_info_hasher : free_ptr_hash <_vect_peel_info> | |
1316 | { | |
1317 | static inline hashval_t hash (const _vect_peel_info *); | |
1318 | static inline bool equal (const _vect_peel_info *, const _vect_peel_info *); | |
1319 | }; | |
1320 | ||
1321 | inline hashval_t | |
1322 | peel_info_hasher::hash (const _vect_peel_info *peel_info) | |
1323 | { | |
1324 | return (hashval_t) peel_info->npeel; | |
1325 | } | |
1326 | ||
1327 | inline bool | |
1328 | peel_info_hasher::equal (const _vect_peel_info *a, const _vect_peel_info *b) | |
1329 | { | |
1330 | return (a->npeel == b->npeel); | |
1331 | } | |
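
/* Editorial sketch (not part of the original file): what the hasher above
   gives us, restated with the standard library.  Keying on the peeling
   amount NPEEL means every data ref that a given amount would align lands
   in the same slot, and COUNT tallies them.  */

#include <unordered_map>

struct example_peel_info { int npeel; unsigned count; };

static void
example_record_npeel (std::unordered_map<int, example_peel_info> &htab,
		      int npeel)
{
  auto it = htab.find (npeel);
  if (it != htab.end ())
    it->second.count++;		/* The same amount aligns one more ref.  */
  else
    htab.emplace (npeel, example_peel_info { npeel, 1 });
}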
1332 | ||
1333 | ||
abc9513d | 1334 | /* Insert DR_INFO into peeling hash table with NPEEL as key. */ |
0822b158 | 1335 | |
1336 | static void | |
41500e78 | 1337 | vect_peeling_hash_insert (hash_table<peel_info_hasher> *peeling_htab, |
abc9513d | 1338 | loop_vec_info loop_vinfo, dr_vec_info *dr_info, |
0822b158 | 1339 | int npeel) |
1340 | { | |
1341 | struct _vect_peel_info elem, *slot; | |
3e871d4d | 1342 | _vect_peel_info **new_slot; |
abc9513d | 1343 | bool supportable_dr_alignment |
1344 | = vect_supportable_dr_alignment (dr_info, true); | |
0822b158 | 1345 | |
1346 | elem.npeel = npeel; | |
41500e78 | 1347 | slot = peeling_htab->find (&elem); |
0822b158 | 1348 | if (slot) |
1349 | slot->count++; | |
1350 | else | |
1351 | { | |
1352 | slot = XNEW (struct _vect_peel_info); | |
1353 | slot->npeel = npeel; | |
abc9513d | 1354 | slot->dr_info = dr_info; |
0822b158 | 1355 | slot->count = 1; |
41500e78 | 1356 | new_slot = peeling_htab->find_slot (slot, INSERT); |
0822b158 | 1357 | *new_slot = slot; |
1358 | } | |
1359 | ||
3e398f5b | 1360 | if (!supportable_dr_alignment |
1361 | && unlimited_cost_model (LOOP_VINFO_LOOP (loop_vinfo))) | |
0822b158 | 1362 | slot->count += VECT_MAX_COST; |
1363 | } | |
1364 | ||
1365 | ||
1366 | /* Traverse the peeling hash table to find the peeling option that | |
1367 | aligns the maximum number of data accesses. */ | |
1368 | ||
3e871d4d | 1369 | int |
1370 | vect_peeling_hash_get_most_frequent (_vect_peel_info **slot, | |
1371 | _vect_peel_extended_info *max) | |
0822b158 | 1372 | { |
3e871d4d | 1373 | vect_peel_info elem = *slot; |
0822b158 | 1374 | |
593fa4d1 | 1375 | if (elem->count > max->peel_info.count |
1376 | || (elem->count == max->peel_info.count | |
1377 | && max->peel_info.npeel > elem->npeel)) | |
0822b158 | 1378 | { |
1379 | max->peel_info.npeel = elem->npeel; | |
1380 | max->peel_info.count = elem->count; | |
abc9513d | 1381 | max->peel_info.dr_info = elem->dr_info; |
0822b158 | 1382 | } |
1383 | ||
1384 | return 1; | |
1385 | } | |
1386 | ||
db72d3bf | 1387 | /* Get the costs of peeling NPEEL iterations for LOOP_VINFO, checking |
1388 | data access costs for all data refs. If UNKNOWN_MISALIGNMENT is true, | |
1389 | we assume DR0_INFO's misalignment will be zero after peeling. */ | |
0822b158 | 1390 | |
cd8306bf | 1391 | static void |
db72d3bf | 1392 | vect_get_peeling_costs_all_drs (loop_vec_info loop_vinfo, |
abc9513d | 1393 | dr_vec_info *dr0_info, |
cd8306bf | 1394 | unsigned int *inside_cost, |
1395 | unsigned int *outside_cost, | |
1396 | stmt_vector_for_cost *body_cost_vec, | |
28d0cd4a | 1397 | stmt_vector_for_cost *prologue_cost_vec, |
5081fac8 | 1398 | unsigned int npeel, |
1399 | bool unknown_misalignment) | |
0822b158 | 1400 | { |
db72d3bf | 1401 | vec<data_reference_p> datarefs = LOOP_VINFO_DATAREFS (loop_vinfo); |
cd8306bf | 1402 | unsigned i; |
1403 | data_reference *dr; | |
0822b158 | 1404 | |
f1f41a6c | 1405 | FOR_EACH_VEC_ELT (datarefs, i, dr) |
0822b158 | 1406 | { |
db72d3bf | 1407 | dr_vec_info *dr_info = loop_vinfo->lookup_dr (dr); |
abc9513d | 1408 | stmt_vec_info stmt_info = dr_info->stmt; |
3bbc3f79 | 1409 | if (!STMT_VINFO_RELEVANT_P (stmt_info)) |
1410 | continue; | |
1411 | ||
0822b158 | 1412 | /* For interleaving, only the alignment of the first access |
1413 | matters. */ | |
ee612634 | 1414 | if (STMT_VINFO_GROUPED_ACCESS (stmt_info) |
0219dc42 | 1415 | && DR_GROUP_FIRST_ELEMENT (stmt_info) != stmt_info) |
1416 | continue; | |
0822b158 | 1417 | |
d84b8514 | 1418 | /* Strided accesses perform only component accesses, alignment is |
1419 | irrelevant for them. */ | |
1420 | if (STMT_VINFO_STRIDED_P (stmt_info) | |
1421 | && !STMT_VINFO_GROUPED_ACCESS (stmt_info)) | |
1422 | continue; | |
1423 | ||
cd8306bf | 1424 | int save_misalignment; |
abc9513d | 1425 | save_misalignment = DR_MISALIGNMENT (dr_info); |
db755b03 | 1426 | if (npeel == 0) |
1427 | ; | |
abc9513d | 1428 | else if (unknown_misalignment && dr_info == dr0_info) |
1429 | SET_DR_MISALIGNMENT (dr_info, 0); | |
cd8306bf | 1430 | else |
abc9513d | 1431 | vect_update_misalignment_for_peel (dr_info, dr0_info, npeel); |
1432 | vect_get_data_access_cost (dr_info, inside_cost, outside_cost, | |
28d0cd4a | 1433 | body_cost_vec, prologue_cost_vec); |
abc9513d | 1434 | SET_DR_MISALIGNMENT (dr_info, save_misalignment); |
0822b158 | 1435 | } |
cd8306bf | 1436 | } |
1437 | ||
1438 | /* Traverse the peeling hash table and calculate the cost for each peeling | |
1439 | option. Find the one with the lowest cost. */ | |
1440 | ||
1441 | int | |
1442 | vect_peeling_hash_get_lowest_cost (_vect_peel_info **slot, | |
1443 | _vect_peel_extended_info *min) | |
1444 | { | |
1445 | vect_peel_info elem = *slot; | |
1446 | int dummy; | |
1447 | unsigned int inside_cost = 0, outside_cost = 0; | |
abc9513d | 1448 | stmt_vec_info stmt_info = elem->dr_info->stmt; |
cd8306bf | 1449 | loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info); |
1450 | stmt_vector_for_cost prologue_cost_vec, body_cost_vec, | |
1451 | epilogue_cost_vec; | |
1452 | ||
1453 | prologue_cost_vec.create (2); | |
1454 | body_cost_vec.create (2); | |
1455 | epilogue_cost_vec.create (2); | |
1456 | ||
db72d3bf | 1457 | vect_get_peeling_costs_all_drs (loop_vinfo, elem->dr_info, &inside_cost, |
1458 | &outside_cost, &body_cost_vec, | |
1459 | &prologue_cost_vec, elem->npeel, false); | |
0822b158 | 1460 | |
f0f51716 | 1461 | body_cost_vec.release (); |
1462 | ||
41ae9eb4 | 1463 | outside_cost += vect_get_known_peeling_cost |
1464 | (loop_vinfo, elem->npeel, &dummy, | |
2a9a3444 | 1465 | &LOOP_VINFO_SCALAR_ITERATION_COST (loop_vinfo), |
1466 | &prologue_cost_vec, &epilogue_cost_vec); | |
f97dec81 | 1467 | |
1468 | /* Prologue and epilogue costs are added to the target model later. | |
1469 | These costs depend only on the scalar iteration cost, the | |
1470 | number of peeling iterations finally chosen, and the number of | |
1471 | misaligned statements. So discard the information found here. */ | |
f1f41a6c | 1472 | prologue_cost_vec.release (); |
1473 | epilogue_cost_vec.release (); | |
0822b158 | 1474 | |
1475 | if (inside_cost < min->inside_cost | |
cd8306bf | 1476 | || (inside_cost == min->inside_cost |
1477 | && outside_cost < min->outside_cost)) | |
0822b158 | 1478 | { |
1479 | min->inside_cost = inside_cost; | |
1480 | min->outside_cost = outside_cost; | |
abc9513d | 1481 | min->peel_info.dr_info = elem->dr_info; |
0822b158 | 1482 | min->peel_info.npeel = elem->npeel; |
cd8306bf | 1483 | min->peel_info.count = elem->count; |
0822b158 | 1484 | } |
1485 | ||
1486 | return 1; | |
1487 | } | |
1488 | ||
1489 | ||
1490 | /* Choose the best peeling option by traversing the peeling hash table and | |
1491 | either choosing the option with the lowest cost (if the cost model is | |
1492 | enabled) or the option that aligns as many accesses as possible. */ | |
1493 | ||
83786d5e | 1494 | static struct _vect_peel_extended_info |
41500e78 | 1495 | vect_peeling_hash_choose_best_peeling (hash_table<peel_info_hasher> *peeling_htab, |
f0f51716 | 1496 | loop_vec_info loop_vinfo) |
0822b158 | 1497 | { |
1498 | struct _vect_peel_extended_info res; | |
1499 | ||
abc9513d | 1500 | res.peel_info.dr_info = NULL; |
0822b158 | 1501 | |
3e398f5b | 1502 | if (!unlimited_cost_model (LOOP_VINFO_LOOP (loop_vinfo))) |
0822b158 | 1503 | { |
1504 | res.inside_cost = INT_MAX; | |
1505 | res.outside_cost = INT_MAX; | |
41500e78 | 1506 | peeling_htab->traverse <_vect_peel_extended_info *, |
1507 | vect_peeling_hash_get_lowest_cost> (&res); | |
0822b158 | 1508 | } |
1509 | else | |
1510 | { | |
1511 | res.peel_info.count = 0; | |
41500e78 | 1512 | peeling_htab->traverse <_vect_peel_extended_info *, |
1513 | vect_peeling_hash_get_most_frequent> (&res); | |
83786d5e | 1514 | res.inside_cost = 0; |
1515 | res.outside_cost = 0; | |
0822b158 | 1516 | } |
1517 | ||
83786d5e | 1518 | return res; |
0822b158 | 1519 | } |
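
/* Editorial sketch (not part of the original file): the two selection
   policies above over a plain candidate array.  With the cost model
   enabled we minimize (inside_cost, outside_cost) lexicographically; with
   the unlimited model we maximize the number of refs aligned, preferring
   fewer peeled iterations on a tie.  */

struct example_candidate
{
  int npeel;
  unsigned count, inside_cost, outside_cost;
};

static example_candidate
example_choose_best (const example_candidate *c, int n, bool use_costs)
{
  example_candidate best = c[0];
  for (int i = 1; i < n; i++)
    {
      bool better;
      if (use_costs)
	better = (c[i].inside_cost < best.inside_cost
		  || (c[i].inside_cost == best.inside_cost
		      && c[i].outside_cost < best.outside_cost));
      else
	better = (c[i].count > best.count
		  || (c[i].count == best.count && c[i].npeel < best.npeel));
      if (better)
	best = c[i];
    }
  return best;
}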
1520 | ||
cd8306bf | 1521 | /* Return true if the new peeling NPEEL is supported. */ |
1522 | ||
1523 | static bool | |
abc9513d | 1524 | vect_peeling_supportable (loop_vec_info loop_vinfo, dr_vec_info *dr0_info, |
cd8306bf | 1525 | unsigned npeel) |
1526 | { | |
1527 | unsigned i; | |
1528 | struct data_reference *dr = NULL; | |
1529 | vec<data_reference_p> datarefs = LOOP_VINFO_DATAREFS (loop_vinfo); | |
cd8306bf | 1530 | enum dr_alignment_support supportable_dr_alignment; |
1531 | ||
1532 | /* Ensure that all data refs can be vectorized after the peel. */ | |
1533 | FOR_EACH_VEC_ELT (datarefs, i, dr) | |
1534 | { | |
1535 | int save_misalignment; | |
1536 | ||
abc9513d | 1537 | if (dr == dr0_info->dr) |
cd8306bf | 1538 | continue; |
1539 | ||
db72d3bf | 1540 | dr_vec_info *dr_info = loop_vinfo->lookup_dr (dr); |
abc9513d | 1541 | stmt_vec_info stmt_info = dr_info->stmt; |
cd8306bf | 1542 | /* For interleaving, only the alignment of the first access |
1543 | matters. */ | |
1544 | if (STMT_VINFO_GROUPED_ACCESS (stmt_info) | |
0219dc42 | 1545 | && DR_GROUP_FIRST_ELEMENT (stmt_info) != stmt_info) |
cd8306bf | 1546 | continue; |
1547 | ||
1548 | /* Strided accesses perform only component accesses, alignment is | |
1549 | irrelevant for them. */ | |
1550 | if (STMT_VINFO_STRIDED_P (stmt_info) | |
1551 | && !STMT_VINFO_GROUPED_ACCESS (stmt_info)) | |
1552 | continue; | |
1553 | ||
abc9513d | 1554 | save_misalignment = DR_MISALIGNMENT (dr_info); |
1555 | vect_update_misalignment_for_peel (dr_info, dr0_info, npeel); | |
1556 | supportable_dr_alignment | |
1557 | = vect_supportable_dr_alignment (dr_info, false); | |
1558 | SET_DR_MISALIGNMENT (dr_info, save_misalignment); | |
cd8306bf | 1559 | |
1560 | if (!supportable_dr_alignment) | |
1561 | return false; | |
1562 | } | |
1563 | ||
1564 | return true; | |
1565 | } | |
0822b158 | 1566 | |
fb85abff | 1567 | /* Function vect_enhance_data_refs_alignment |
1568 | ||
1569 | This pass will use loop versioning and loop peeling in order to enhance | |
1570 | the alignment of data references in the loop. | |
1571 | ||
1572 | FOR NOW: we assume that whatever versioning/peeling takes place, only the | |
282bf14c | 1573 | original loop is to be vectorized. Any other loops that are created by |
fb85abff | 1574 | the transformations performed in this pass are not supposed to be |
282bf14c | 1575 | vectorized. This restriction will be relaxed. |
fb85abff | 1576 | |
1577 | This pass will require a cost model to guide it whether to apply peeling | |
282bf14c | 1578 | or versioning or a combination of the two. For example, the scheme that |
fb85abff | 1579 | Intel uses when given a loop with several memory accesses is as follows: |
1580 | choose one memory access ('p') whose alignment you want to force by doing | |
282bf14c | 1581 | peeling. Then, either (1) generate a loop in which 'p' is aligned and all |
fb85abff | 1582 | other accesses are not necessarily aligned, or (2) use loop versioning to |
1583 | generate one loop in which all accesses are aligned, and another loop in | |
1584 | which only 'p' is necessarily aligned. | |
1585 | ||
1586 | ("Automatic Intra-Register Vectorization for the Intel Architecture", | |
1587 | Aart J.C. Bik, Milind Girkar, Paul M. Grey and Ximmin Tian, International | |
1588 | Journal of Parallel Programming, Vol. 30, No. 2, April 2002.) | |
1589 | ||
282bf14c | 1590 | Devising a cost model is the most critical aspect of this work. It will |
fb85abff | 1591 | guide us on which access to peel for, whether to use loop versioning, how |
282bf14c | 1592 | many versions to create, etc. The cost model will probably consist of |
fb85abff | 1593 | generic considerations as well as target specific considerations (on |
1594 | powerpc for example, misaligned stores are more painful than misaligned | |
1595 | loads). | |
1596 | ||
1597 | Here are the general steps involved in alignment enhancements: | |
1598 | ||
1599 | -- original loop, before alignment analysis: | |
1600 | for (i=0; i<N; i++){ | |
1601 | x = q[i]; # DR_MISALIGNMENT(q) = unknown | |
1602 | p[i] = y; # DR_MISALIGNMENT(p) = unknown | |
1603 | } | |
1604 | ||
1605 | -- After vect_compute_data_refs_alignment: | |
1606 | for (i=0; i<N; i++){ | |
1607 | x = q[i]; # DR_MISALIGNMENT(q) = 3 | |
1608 | p[i] = y; # DR_MISALIGNMENT(p) = unknown | |
1609 | } | |
1610 | ||
1611 | -- Possibility 1: we do loop versioning: | |
1612 | if (p is aligned) { | |
1613 | for (i=0; i<N; i++){ # loop 1A | |
1614 | x = q[i]; # DR_MISALIGNMENT(q) = 3 | |
1615 | p[i] = y; # DR_MISALIGNMENT(p) = 0 | |
1616 | } | |
1617 | } | |
1618 | else { | |
1619 | for (i=0; i<N; i++){ # loop 1B | |
1620 | x = q[i]; # DR_MISALIGNMENT(q) = 3 | |
1621 | p[i] = y; # DR_MISALIGNMENT(p) = unaligned | |
1622 | } | |
1623 | } | |
1624 | ||
1625 | -- Possibility 2: we do loop peeling: | |
1626 | for (i = 0; i < 3; i++){ # (scalar loop, not to be vectorized). | |
1627 | x = q[i]; | |
1628 | p[i] = y; | |
1629 | } | |
1630 | for (i = 3; i < N; i++){ # loop 2A | |
1631 | x = q[i]; # DR_MISALIGNMENT(q) = 0 | |
1632 | p[i] = y; # DR_MISALIGNMENT(p) = unknown | |
1633 | } | |
1634 | ||
1635 | -- Possibility 3: combination of loop peeling and versioning: | |
1636 | for (i = 0; i < 3; i++){ # (scalar loop, not to be vectorized). | |
1637 | x = q[i]; | |
1638 | p[i] = y; | |
1639 | } | |
1640 | if (p is aligned) { | |
1641 | for (i = 3; i<N; i++){ # loop 3A | |
1642 | x = q[i]; # DR_MISALIGNMENT(q) = 0 | |
1643 | p[i] = y; # DR_MISALIGNMENT(p) = 0 | |
1644 | } | |
1645 | } | |
1646 | else { | |
1647 | for (i = 3; i<N; i++){ # loop 3B | |
1648 | x = q[i]; # DR_MISALIGNMENT(q) = 0 | |
1649 | p[i] = y; # DR_MISALIGNMENT(p) = unaligned | |
1650 | } | |
1651 | } | |
1652 | ||
282bf14c | 1653 | These loops are later passed to loop_transform to be vectorized. The |
fb85abff | 1654 | vectorizer will use the alignment information to guide the transformation |
1655 | (whether to generate regular loads/stores, or with special handling for | |
1656 | misalignment). */ | |
1657 | ||
ed9370cc | 1658 | opt_result |
fb85abff | 1659 | vect_enhance_data_refs_alignment (loop_vec_info loop_vinfo) |
1660 | { | |
f1f41a6c | 1661 | vec<data_reference_p> datarefs = LOOP_VINFO_DATAREFS (loop_vinfo); |
2e966e2a | 1662 | class loop *loop = LOOP_VINFO_LOOP (loop_vinfo); |
5081fac8 | 1663 | enum dr_alignment_support supportable_dr_alignment; |
abc9513d | 1664 | dr_vec_info *first_store = NULL; |
1665 | dr_vec_info *dr0_info = NULL; | |
fb85abff | 1666 | struct data_reference *dr; |
0822b158 | 1667 | unsigned int i, j; |
fb85abff | 1668 | bool do_peeling = false; |
1669 | bool do_versioning = false; | |
0822b158 | 1670 | unsigned int npeel = 0; |
83786d5e | 1671 | bool one_misalignment_known = false; |
1672 | bool one_misalignment_unknown = false; | |
5081fac8 | 1673 | bool one_dr_unsupportable = false; |
abc9513d | 1674 | dr_vec_info *unsupportable_dr_info = NULL; |
d75596cd | 1675 | poly_uint64 vf = LOOP_VINFO_VECT_FACTOR (loop_vinfo); |
0822b158 | 1676 | unsigned possible_npeel_number = 1; |
1677 | tree vectype; | |
d75596cd | 1678 | unsigned int mis, same_align_drs_max = 0; |
41500e78 | 1679 | hash_table<peel_info_hasher> peeling_htab (1); |
fb85abff | 1680 | |
88f6eb8f | 1681 | DUMP_VECT_SCOPE ("vect_enhance_data_refs_alignment"); |
fb85abff | 1682 | |
00ecf4da | 1683 | /* Reset data so we can safely be called multiple times. */ |
1684 | LOOP_VINFO_MAY_MISALIGN_STMTS (loop_vinfo).truncate (0); | |
1685 | LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo) = 0; | |
1686 | ||
fb85abff | 1687 | /* While cost model enhancements are expected in the future, the high level |
1688 | view of the code at this time is as follows: | |
1689 | ||
ec2886ed | 1690 | A) If there is a misaligned access then see if peeling to align |
1691 | this access can make all data references satisfy | |
454f25be | 1692 | vect_supportable_dr_alignment. If so, update data structures |
1693 | as needed and return true. | |
fb85abff | 1694 | |
1695 | B) If peeling wasn't possible and there is a data reference with an | |
1696 | unknown misalignment that does not satisfy vect_supportable_dr_alignment | |
1697 | then see if loop versioning checks can be used to make all data | |
1698 | references satisfy vect_supportable_dr_alignment. If so, update | |
1699 | data structures as needed and return true. | |
1700 | ||
1701 | C) If neither peeling nor versioning were successful then return false if | |
1702 | any data reference does not satisfy vect_supportable_dr_alignment. | |
1703 | ||
1704 | D) Return true (all data references satisfy vect_supportable_dr_alignment). | |
1705 | ||
1706 | Note, Possibility 3 above (which is peeling and versioning together) is not | |
1707 | being done at this time. */ | |
1708 | ||
1709 | /* (1) Peeling to force alignment. */ | |
1710 | ||
1711 | /* (1.1) Decide whether to perform peeling, and how many iterations to peel: | |
1712 | Considerations: | |
1713 | + How many accesses will become aligned due to the peeling | |
1714 | - How many accesses will become unaligned due to the peeling, | |
1715 | and the cost of misaligned accesses. | |
48e1416a | 1716 | - The cost of peeling (the extra runtime checks, the increase |
0822b158 | 1717 | in code size). */ |
fb85abff | 1718 | |
f1f41a6c | 1719 | FOR_EACH_VEC_ELT (datarefs, i, dr) |
fb85abff | 1720 | { |
db72d3bf | 1721 | dr_vec_info *dr_info = loop_vinfo->lookup_dr (dr); |
abc9513d | 1722 | stmt_vec_info stmt_info = dr_info->stmt; |
fb85abff | 1723 | |
1ad41595 | 1724 | if (!STMT_VINFO_RELEVANT_P (stmt_info)) |
b04940e7 | 1725 | continue; |
1726 | ||
fb85abff | 1727 | /* For interleaving, only the alignment of the first access |
1728 | matters. */ | |
ee612634 | 1729 | if (STMT_VINFO_GROUPED_ACCESS (stmt_info) |
0219dc42 | 1730 | && DR_GROUP_FIRST_ELEMENT (stmt_info) != stmt_info) |
1731 | continue; | |
fb85abff | 1732 | |
fa681b45 | 1733 | /* For scatter-gather or invariant accesses there is nothing |
1734 | to enhance. */ | |
1735 | if (STMT_VINFO_GATHER_SCATTER_P (stmt_info) | |
1736 | || integer_zerop (DR_STEP (dr))) | |
b04940e7 | 1737 | continue; |
1738 | ||
e1c75243 | 1739 | /* Strided accesses perform only component accesses, alignment is |
f634c3e9 | 1740 | irrelevant for them. */ |
e1c75243 | 1741 | if (STMT_VINFO_STRIDED_P (stmt_info) |
994be998 | 1742 | && !STMT_VINFO_GROUPED_ACCESS (stmt_info)) |
f634c3e9 | 1743 | continue; |
1744 | ||
abc9513d | 1745 | supportable_dr_alignment = vect_supportable_dr_alignment (dr_info, true); |
1746 | do_peeling = vector_alignment_reachable_p (dr_info); | |
0822b158 | 1747 | if (do_peeling) |
fb85abff | 1748 | { |
abc9513d | 1749 | if (known_alignment_for_access_p (dr_info)) |
0822b158 | 1750 | { |
aec313e5 | 1751 | unsigned int npeel_tmp = 0; |
f1b8c740 | 1752 | bool negative = tree_int_cst_compare (DR_STEP (dr), |
1753 | size_zero_node) < 0; | |
0822b158 | 1754 | |
aec313e5 | 1755 | vectype = STMT_VINFO_VECTYPE (stmt_info); |
e092c20e | 1756 | /* If known_alignment_for_access_p then we have set |
1757 | DR_MISALIGNMENT, which is only done if we know it at compile | |
1758 | time, so it is safe to assume the target alignment is constant. | |
1759 | */ | |
1760 | unsigned int target_align = | |
1761 | DR_TARGET_ALIGNMENT (dr_info).to_constant (); | |
abc9513d | 1762 | unsigned int dr_size = vect_get_scalar_dr_size (dr_info); |
1763 | mis = (negative | |
1764 | ? DR_MISALIGNMENT (dr_info) | |
1765 | : -DR_MISALIGNMENT (dr_info)); | |
1766 | if (DR_MISALIGNMENT (dr_info) != 0) | |
aec313e5 | 1767 | npeel_tmp = (mis & (target_align - 1)) / dr_size; |
0822b158 | 1768 | |
1769 | /* For multiple types, it is possible that the bigger type access | |
282bf14c | 1770 | will have more than one peeling option. E.g., a loop with two |
0822b158 | 1771 | types: one of size (vector size / 4), and the other one of |
282bf14c | 1772 | size (vector size / 8). The vectorization factor will be 8. If both |
df8e9f7a | 1773 | accesses are misaligned by 3, the first one needs one scalar |
282bf14c | 1774 | iteration to be aligned, and the second one needs 5. But the |
4bec4fee | 1775 | first one will also be aligned by peeling 5 scalar |
0822b158 | 1776 | iterations, and in that case both accesses will be aligned. |
1777 | Hence, except for the immediate peeling amount, we also want | |
1778 | to try adding a full vector size, as long as we don't exceed | |
1779 | the vectorization factor. | |
df8e9f7a | 1780 | We do this automatically when using the cost model, since we |
1781 | calculate the cost for every peeling option. */ | |
3e398f5b | 1782 | if (unlimited_cost_model (LOOP_VINFO_LOOP (loop_vinfo))) |
c1bee668 | 1783 | { |
d75596cd | 1784 | poly_uint64 nscalars = (STMT_SLP_TYPE (stmt_info) |
e1009321 | 1785 | ? vf * DR_GROUP_SIZE (stmt_info) : vf); |
d75596cd | 1786 | possible_npeel_number |
1787 | = vect_get_num_vectors (nscalars, vectype); | |
0822b158 | 1788 | |
5081fac8 | 1789 | /* NPEEL_TMP is 0 when there is no misalignment, but also |
1790 | allow peeling NELEMENTS. */ | |
abc9513d | 1791 | if (DR_MISALIGNMENT (dr_info) == 0) |
df8e9f7a | 1792 | possible_npeel_number++; |
1793 | } | |
0822b158 | 1794 | |
df8e9f7a | 1795 | /* Save info about DR in the hash table. Also include peeling |
1796 | amounts according to the explanation above. */ | |
0822b158 | 1797 | for (j = 0; j < possible_npeel_number; j++) |
1798 | { | |
41500e78 | 1799 | vect_peeling_hash_insert (&peeling_htab, loop_vinfo, |
abc9513d | 1800 | dr_info, npeel_tmp); |
aec313e5 | 1801 | npeel_tmp += target_align / dr_size; |
0822b158 | 1802 | } |
1803 | ||
83786d5e | 1804 | one_misalignment_known = true; |
0822b158 | 1805 | } |
1806 | else | |
1807 | { | |
6046367e | 1808 | /* If we don't know any misalignment values, we prefer |
1809 | peeling for the data-ref that has the maximum number of data-refs | |
0822b158 | 1810 | with the same alignment, unless the target prefers to align |
1811 | stores over loads. */ | |
83786d5e | 1812 | unsigned same_align_drs |
1813 | = STMT_VINFO_SAME_ALIGN_REFS (stmt_info).length (); | |
abc9513d | 1814 | if (!dr0_info |
83786d5e | 1815 | || same_align_drs_max < same_align_drs) |
1816 | { | |
1817 | same_align_drs_max = same_align_drs; | |
abc9513d | 1818 | dr0_info = dr_info; |
83786d5e | 1819 | } |
1820 | /* For data-refs with the same number of related | |
1821 | accesses prefer the one where the misalign | |
1822 | computation will be invariant in the outermost loop. */ | |
1823 | else if (same_align_drs_max == same_align_drs) | |
1824 | { | |
2e966e2a | 1825 | class loop *ivloop0, *ivloop; |
83786d5e | 1826 | ivloop0 = outermost_invariant_loop_for_expr |
abc9513d | 1827 | (loop, DR_BASE_ADDRESS (dr0_info->dr)); |
83786d5e | 1828 | ivloop = outermost_invariant_loop_for_expr |
1829 | (loop, DR_BASE_ADDRESS (dr)); | |
1830 | if ((ivloop && !ivloop0) | |
1831 | || (ivloop && ivloop0 | |
1832 | && flow_loop_nested_p (ivloop, ivloop0))) | |
abc9513d | 1833 | dr0_info = dr_info; |
83786d5e | 1834 | } |
0822b158 | 1835 | |
5081fac8 | 1836 | one_misalignment_unknown = true; |
1837 | ||
1838 | /* Check for data refs with unsupportable alignment that | |
1839 | can be peeled. */ | |
1840 | if (!supportable_dr_alignment) | |
1841 | { | |
1842 | one_dr_unsupportable = true; | |
abc9513d | 1843 | unsupportable_dr_info = dr_info; |
5081fac8 | 1844 | } |
1845 | ||
83786d5e | 1846 | if (!first_store && DR_IS_WRITE (dr)) |
abc9513d | 1847 | first_store = dr_info; |
0822b158 | 1848 | } |
1849 | } | |
1850 | else | |
1851 | { | |
abc9513d | 1852 | if (!aligned_access_p (dr_info)) |
0822b158 | 1853 | { |
6d8fb6cf | 1854 | if (dump_enabled_p ()) |
78bb46f5 | 1855 | dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location, |
1856 | "vector alignment may not be reachable\n"); | |
0822b158 | 1857 | break; |
1858 | } | |
1859 | } | |
fb85abff | 1860 | } |
1861 | ||
2cd0995e | 1862 | /* Check if we can possibly peel the loop. */ |
1863 | if (!vect_can_advance_ivs_p (loop_vinfo) | |
5ee742c4 | 1864 | || !slpeel_can_duplicate_loop_p (loop, single_exit (loop)) |
1865 | || loop->inner) | |
fb85abff | 1866 | do_peeling = false; |
1867 | ||
b565a9ba | 1868 | struct _vect_peel_extended_info peel_for_known_alignment; |
1869 | struct _vect_peel_extended_info peel_for_unknown_alignment; | |
1870 | struct _vect_peel_extended_info best_peel; | |
1871 | ||
1872 | peel_for_unknown_alignment.inside_cost = INT_MAX; | |
1873 | peel_for_unknown_alignment.outside_cost = INT_MAX; | |
1874 | peel_for_unknown_alignment.peel_info.count = 0; | |
83786d5e | 1875 | |
192f7876 | 1876 | if (do_peeling |
b565a9ba | 1877 | && one_misalignment_unknown) |
0822b158 | 1878 | { |
0822b158 | 1879 | /* Check if the target requires to prefer stores over loads, i.e., if |
1880 | misaligned stores are more expensive than misaligned loads (taking | |
1881 | drs with same alignment into account). */ | |
b565a9ba | 1882 | unsigned int load_inside_cost = 0; |
1883 | unsigned int load_outside_cost = 0; | |
1884 | unsigned int store_inside_cost = 0; | |
1885 | unsigned int store_outside_cost = 0; | |
d75596cd | 1886 | unsigned int estimated_npeels = vect_vf_for_cost (loop_vinfo) / 2; |
b565a9ba | 1887 | |
1888 | stmt_vector_for_cost dummy; | |
1889 | dummy.create (2); | |
db72d3bf | 1890 | vect_get_peeling_costs_all_drs (loop_vinfo, dr0_info, |
b565a9ba | 1891 | &load_inside_cost, |
1892 | &load_outside_cost, | |
28d0cd4a | 1893 | &dummy, &dummy, estimated_npeels, true); |
b565a9ba | 1894 | dummy.release (); |
1895 | ||
1896 | if (first_store) | |
1897 | { | |
83786d5e | 1898 | dummy.create (2); |
db72d3bf | 1899 | vect_get_peeling_costs_all_drs (loop_vinfo, first_store, |
83786d5e | 1900 | &store_inside_cost, |
1901 | &store_outside_cost, | |
28d0cd4a | 1902 | &dummy, &dummy, |
1903 | estimated_npeels, true); | |
f1f41a6c | 1904 | dummy.release (); |
b565a9ba | 1905 | } |
1906 | else | |
1907 | { | |
1908 | store_inside_cost = INT_MAX; | |
1909 | store_outside_cost = INT_MAX; | |
1910 | } | |
0822b158 | 1911 | |
b565a9ba | 1912 | if (load_inside_cost > store_inside_cost |
1913 | || (load_inside_cost == store_inside_cost | |
1914 | && load_outside_cost > store_outside_cost)) | |
1915 | { | |
abc9513d | 1916 | dr0_info = first_store; |
b565a9ba | 1917 | peel_for_unknown_alignment.inside_cost = store_inside_cost; |
1918 | peel_for_unknown_alignment.outside_cost = store_outside_cost; | |
1919 | } | |
1920 | else | |
1921 | { | |
1922 | peel_for_unknown_alignment.inside_cost = load_inside_cost; | |
1923 | peel_for_unknown_alignment.outside_cost = load_outside_cost; | |
1924 | } | |
83786d5e | 1925 | |
b565a9ba | 1926 | stmt_vector_for_cost prologue_cost_vec, epilogue_cost_vec; |
1927 | prologue_cost_vec.create (2); | |
1928 | epilogue_cost_vec.create (2); | |
83786d5e | 1929 | |
b565a9ba | 1930 | int dummy2; |
1931 | peel_for_unknown_alignment.outside_cost += vect_get_known_peeling_cost | |
d75596cd | 1932 | (loop_vinfo, estimated_npeels, &dummy2, |
b565a9ba | 1933 | &LOOP_VINFO_SCALAR_ITERATION_COST (loop_vinfo), |
1934 | &prologue_cost_vec, &epilogue_cost_vec); | |
83786d5e | 1935 | |
b565a9ba | 1936 | prologue_cost_vec.release (); |
1937 | epilogue_cost_vec.release (); | |
0822b158 | 1938 | |
b565a9ba | 1939 | peel_for_unknown_alignment.peel_info.count = 1 |
abc9513d | 1940 | + STMT_VINFO_SAME_ALIGN_REFS (dr0_info->stmt).length (); |
0822b158 | 1941 | } |
1942 | ||
b565a9ba | 1943 | peel_for_unknown_alignment.peel_info.npeel = 0; |
abc9513d | 1944 | peel_for_unknown_alignment.peel_info.dr_info = dr0_info; |
b565a9ba | 1945 | |
1946 | best_peel = peel_for_unknown_alignment; | |
1947 | ||
83786d5e | 1948 | peel_for_known_alignment.inside_cost = INT_MAX; |
1949 | peel_for_known_alignment.outside_cost = INT_MAX; | |
1950 | peel_for_known_alignment.peel_info.count = 0; | |
abc9513d | 1951 | peel_for_known_alignment.peel_info.dr_info = NULL; |
83786d5e | 1952 | |
1953 | if (do_peeling && one_misalignment_known) | |
0822b158 | 1954 | { |
1955 | /* Peeling is possible, but no data access strictly requires alignment | |
b565a9ba | 1956 | to be supported. So we try to choose the best possible peeling from |
1957 | the hash table. */ | |
83786d5e | 1958 | peel_for_known_alignment = vect_peeling_hash_choose_best_peeling |
f0f51716 | 1959 | (&peeling_htab, loop_vinfo); |
0822b158 | 1960 | } |
1961 | ||
83786d5e | 1962 | /* Compare costs of peeling for known and unknown alignment. */ |
abc9513d | 1963 | if (peel_for_known_alignment.peel_info.dr_info != NULL |
b565a9ba | 1964 | && peel_for_unknown_alignment.inside_cost |
1965 | >= peel_for_known_alignment.inside_cost) | |
5081fac8 | 1966 | { |
1967 | best_peel = peel_for_known_alignment; | |
b565a9ba | 1968 | |
5081fac8 | 1969 | /* If the best peeling for known alignment has NPEEL == 0, perform no |
1970 | peeling at all except if there is an unsupportable dr that we can | |
1971 | align. */ | |
1972 | if (best_peel.peel_info.npeel == 0 && !one_dr_unsupportable) | |
1973 | do_peeling = false; | |
1974 | } | |
b565a9ba | 1975 | |
5081fac8 | 1976 | /* If there is an unsupportable data ref, prefer this over all choices so far |
1977 | since we'd have to discard a chosen peeling except when it accidentally | |
1978 | aligned the unsupportable data ref. */ | |
1979 | if (one_dr_unsupportable) | |
abc9513d | 1980 | dr0_info = unsupportable_dr_info; |
5081fac8 | 1981 | else if (do_peeling) |
1982 | { | |
db755b03 | 1983 | /* Calculate the penalty for no peeling, i.e. leaving everything as-is. |
f0f51716 | 1984 | TODO: Use nopeel_outside_cost or get rid of it? */ |
5081fac8 | 1985 | unsigned nopeel_inside_cost = 0; |
1986 | unsigned nopeel_outside_cost = 0; | |
b565a9ba | 1987 | |
5081fac8 | 1988 | stmt_vector_for_cost dummy; |
1989 | dummy.create (2); | |
db72d3bf | 1990 | vect_get_peeling_costs_all_drs (loop_vinfo, NULL, &nopeel_inside_cost, |
28d0cd4a | 1991 | &nopeel_outside_cost, &dummy, &dummy, |
1992 | 0, false); | |
5081fac8 | 1993 | dummy.release (); |
b565a9ba | 1994 | |
5081fac8 | 1995 | /* Add epilogue costs. As we do not peel for alignment here, no prologue |
1996 | costs will be recorded. */ | |
1997 | stmt_vector_for_cost prologue_cost_vec, epilogue_cost_vec; | |
1998 | prologue_cost_vec.create (2); | |
1999 | epilogue_cost_vec.create (2); | |
b565a9ba | 2000 | |
5081fac8 | 2001 | int dummy2; |
2002 | nopeel_outside_cost += vect_get_known_peeling_cost | |
2003 | (loop_vinfo, 0, &dummy2, | |
2004 | &LOOP_VINFO_SCALAR_ITERATION_COST (loop_vinfo), | |
2005 | &prologue_cost_vec, &epilogue_cost_vec); | |
2006 | ||
2007 | prologue_cost_vec.release (); | |
2008 | epilogue_cost_vec.release (); | |
b565a9ba | 2009 | |
5081fac8 | 2010 | npeel = best_peel.peel_info.npeel; |
abc9513d | 2011 | dr0_info = best_peel.peel_info.dr_info; |
83786d5e | 2012 | |
5081fac8 | 2013 | /* If not peeling is no more expensive than the best peeling we |
2014 | have found so far, don't perform any peeling. */ | |
2015 | if (nopeel_inside_cost <= best_peel.inside_cost) | |
2016 | do_peeling = false; | |
2017 | } | |
83786d5e | 2018 | |
fb85abff | 2019 | if (do_peeling) |
2020 | { | |
abc9513d | 2021 | stmt_vec_info stmt_info = dr0_info->stmt; |
0822b158 | 2022 | vectype = STMT_VINFO_VECTYPE (stmt_info); |
fb85abff | 2023 | |
abc9513d | 2024 | if (known_alignment_for_access_p (dr0_info)) |
fb85abff | 2025 | { |
abc9513d | 2026 | bool negative = tree_int_cst_compare (DR_STEP (dr0_info->dr), |
f1b8c740 | 2027 | size_zero_node) < 0; |
0822b158 | 2028 | if (!npeel) |
2029 | { | |
2030 | /* Since it's known at compile time, compute the number of | |
2031 | iterations in the peeled loop (the peeling factor) for use in | |
2032 | updating DR_MISALIGNMENT values. The peeling factor is the | |
2033 | vectorization factor minus the misalignment as an element | |
2034 | count. */ | |
abc9513d | 2035 | mis = (negative |
2036 | ? DR_MISALIGNMENT (dr0_info) | |
2037 | : -DR_MISALIGNMENT (dr0_info)); | |
e092c20e | 2038 | /* If known_alignment_for_access_p then we have set |
2039 | DR_MISALIGNMENT, which is only done if we know it at compile | |
2040 | time, so it is safe to assume the target alignment is constant. | |
2041 | */ | |
2042 | unsigned int target_align = | |
2043 | DR_TARGET_ALIGNMENT (dr0_info).to_constant (); | |
aec313e5 | 2044 | npeel = ((mis & (target_align - 1)) |
abc9513d | 2045 | / vect_get_scalar_dr_size (dr0_info)); |
0822b158 | 2046 | } |
fb85abff | 2047 | |
48e1416a | 2048 | /* For interleaved data access every iteration accesses all the |
fb85abff | 2049 | members of the group, therefore we divide the number of iterations |
2050 | by the group size. */ | |
ee612634 | 2051 | if (STMT_VINFO_GROUPED_ACCESS (stmt_info)) |
e1009321 | 2052 | npeel /= DR_GROUP_SIZE (stmt_info); |
fb85abff | 2053 | |
6d8fb6cf | 2054 | if (dump_enabled_p ()) |
7bd765d4 | 2055 | dump_printf_loc (MSG_NOTE, vect_location, |
78bb46f5 | 2056 | "Try peeling by %d\n", npeel); |
fb85abff | 2057 | } |
2058 | ||
cd8306bf | 2059 | /* Ensure that all datarefs can be vectorized after the peel. */ |
abc9513d | 2060 | if (!vect_peeling_supportable (loop_vinfo, dr0_info, npeel)) |
cd8306bf | 2061 | do_peeling = false; |
fb85abff | 2062 | |
cd8306bf | 2063 | /* Check if all datarefs are supportable and log. */ |
abc9513d | 2064 | if (do_peeling && known_alignment_for_access_p (dr0_info) && npeel == 0) |
0822b158 | 2065 | { |
ed9370cc | 2066 | opt_result stat = vect_verify_datarefs_alignment (loop_vinfo); |
0822b158 | 2067 | if (!stat) |
2068 | do_peeling = false; | |
2069 | else | |
f0f51716 | 2070 | return stat; |
0822b158 | 2071 | } |
2072 | ||
eb10b471 | 2073 | /* Cost model #1 - honor --param vect-max-peeling-for-alignment. */ |
d7d7032a | 2074 | if (do_peeling) |
2075 | { | |
2076 | unsigned max_allowed_peel | |
2077 | = PARAM_VALUE (PARAM_VECT_MAX_PEELING_FOR_ALIGNMENT); | |
2078 | if (max_allowed_peel != (unsigned)-1) | |
2079 | { | |
2080 | unsigned max_peel = npeel; | |
2081 | if (max_peel == 0) | |
2082 | { | |
e092c20e | 2083 | poly_uint64 target_align = DR_TARGET_ALIGNMENT (dr0_info); |
2084 | unsigned HOST_WIDE_INT target_align_c; | |
2085 | if (target_align.is_constant (&target_align_c)) | |
2086 | max_peel = | |
2087 | target_align_c / vect_get_scalar_dr_size (dr0_info) - 1; | |
2088 | else | |
2089 | { | |
2090 | do_peeling = false; | |
2091 | if (dump_enabled_p ()) | |
2092 | dump_printf_loc (MSG_NOTE, vect_location, | |
2093 | "Disable peeling, max peels set and vector" | |
2094 | " alignment unknown\n"); | |
2095 | } | |
d7d7032a | 2096 | } |
2097 | if (max_peel > max_allowed_peel) | |
2098 | { | |
2099 | do_peeling = false; | |
2100 | if (dump_enabled_p ()) | |
2101 | dump_printf_loc (MSG_NOTE, vect_location, | |
2102 | "Disable peeling, max peels reached: %d\n", max_peel); | |
2103 | } | |
2104 | } | |
2105 | } | |
2106 | ||
eb10b471 | 2107 | /* Cost model #2 - if peeling may result in a remaining loop not |
d75596cd | 2108 | iterating enough to be vectorized then do not peel. Since this |
2109 | is a cost heuristic rather than a correctness decision, use the | |
2110 | most likely runtime value for variable vectorization factors. */ | |
eb10b471 | 2111 | if (do_peeling |
2112 | && LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo)) | |
2113 | { | |
d75596cd | 2114 | unsigned int assumed_vf = vect_vf_for_cost (loop_vinfo); |
2115 | unsigned int max_peel = npeel == 0 ? assumed_vf - 1 : npeel; | |
2116 | if ((unsigned HOST_WIDE_INT) LOOP_VINFO_INT_NITERS (loop_vinfo) | |
2117 | < assumed_vf + max_peel) | |
eb10b471 | 2118 | do_peeling = false; |
2119 | } | |
2120 | ||
fb85abff | 2121 | if (do_peeling) |
2122 | { | |
2123 | /* (1.2) Update the DR_MISALIGNMENT of each data reference DR_i. | |
2124 | If the misalignment of DR_i is identical to that of dr0 then set | |
2125 | DR_MISALIGNMENT (DR_i) to zero. If the misalignment of DR_i and | |
2126 | dr0 are known at compile time then increment DR_MISALIGNMENT (DR_i) | |
2127 | by the peeling factor times the element size of DR_i (MOD the | |
2128 | vectorization factor times the size). Otherwise, the | |
2129 | misalignment of DR_i must be set to unknown. */ | |
f1f41a6c | 2130 | FOR_EACH_VEC_ELT (datarefs, i, dr) |
abc9513d | 2131 | if (dr != dr0_info->dr) |
1ca1d9b2 | 2132 | { |
2133 | /* Strided accesses perform only component accesses, alignment | |
2134 | is irrelevant for them. */ | |
db72d3bf | 2135 | dr_vec_info *dr_info = loop_vinfo->lookup_dr (dr); |
abc9513d | 2136 | stmt_info = dr_info->stmt; |
1ca1d9b2 | 2137 | if (STMT_VINFO_STRIDED_P (stmt_info) |
2138 | && !STMT_VINFO_GROUPED_ACCESS (stmt_info)) | |
2139 | continue; | |
2140 | ||
abc9513d | 2141 | vect_update_misalignment_for_peel (dr_info, dr0_info, npeel); |
1ca1d9b2 | 2142 | } |
fb85abff | 2143 | |
ec5bf0fb | 2144 | LOOP_VINFO_UNALIGNED_DR (loop_vinfo) = dr0_info; |
0822b158 | 2145 | if (npeel) |
313a5120 | 2146 | LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo) = npeel; |
0822b158 | 2147 | else |
313a5120 | 2148 | LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo) |
abc9513d | 2149 | = DR_MISALIGNMENT (dr0_info); |
2150 | SET_DR_MISALIGNMENT (dr0_info, 0); | |
6d8fb6cf | 2151 | if (dump_enabled_p ()) |
7bd765d4 | 2152 | { |
2153 | dump_printf_loc (MSG_NOTE, vect_location, | |
78bb46f5 | 2154 | "Alignment of access forced using peeling.\n"); |
7bd765d4 | 2155 | dump_printf_loc (MSG_NOTE, vect_location, |
78bb46f5 | 2156 | "Peeling for alignment will be applied.\n"); |
7bd765d4 | 2157 | } |
f0f51716 | 2158 | |
e4eca2de | 2159 | /* The inside-loop cost will be accounted for in vectorizable_load |
2160 | and vectorizable_store correctly with adjusted alignments. | |
2161 | Drop the body_cost_vec on the floor here. */ | |
ed9370cc | 2162 | opt_result stat = vect_verify_datarefs_alignment (loop_vinfo); |
fb85abff | 2163 | gcc_assert (stat); |
2164 | return stat; | |
2165 | } | |
2166 | } | |
2167 | ||
fb85abff | 2168 | /* (2) Versioning to force alignment. */ |
2169 | ||
2170 | /* Try versioning if: | |
1dbf9bd1 | 2171 | 1) optimize loop for speed |
2172 | 2) there is at least one unsupported misaligned data ref with an unknown | |
fb85abff | 2173 | misalignment, and |
1dbf9bd1 | 2174 | 3) all misaligned data refs with a known misalignment are supported, and |
2175 | 4) the number of runtime alignment checks is within reason. */ | |
fb85abff | 2176 | |
48e1416a | 2177 | do_versioning = |
1dbf9bd1 | 2178 | optimize_loop_nest_for_speed_p (loop) |
fb85abff | 2179 | && (!loop->inner); /* FORNOW */ |
2180 | ||
2181 | if (do_versioning) | |
2182 | { | |
f1f41a6c | 2183 | FOR_EACH_VEC_ELT (datarefs, i, dr) |
fb85abff | 2184 | { |
db72d3bf | 2185 | dr_vec_info *dr_info = loop_vinfo->lookup_dr (dr); |
abc9513d | 2186 | stmt_vec_info stmt_info = dr_info->stmt; |
fb85abff | 2187 | |
2188 | /* For interleaving, only the alignment of the first access | |
2189 | matters. */ | |
abc9513d | 2190 | if (aligned_access_p (dr_info) |
ee612634 | 2191 | || (STMT_VINFO_GROUPED_ACCESS (stmt_info) |
0219dc42 | 2192 | && DR_GROUP_FIRST_ELEMENT (stmt_info) != stmt_info)) |
fb85abff | 2193 | continue; |
2194 | ||
e1c75243 | 2195 | if (STMT_VINFO_STRIDED_P (stmt_info)) |
994be998 | 2196 | { |
2197 | /* Strided loads perform only component accesses, alignment is | |
2198 | irrelevant for them. */ | |
2199 | if (!STMT_VINFO_GROUPED_ACCESS (stmt_info)) | |
2200 | continue; | |
2201 | do_versioning = false; | |
2202 | break; | |
2203 | } | |
f634c3e9 | 2204 | |
abc9513d | 2205 | supportable_dr_alignment |
2206 | = vect_supportable_dr_alignment (dr_info, false); | |
fb85abff | 2207 | |
2208 | if (!supportable_dr_alignment) | |
2209 | { | |
fb85abff | 2210 | int mask; |
2211 | tree vectype; | |
2212 | ||
abc9513d | 2213 | if (known_alignment_for_access_p (dr_info) |
f1f41a6c | 2214 | || LOOP_VINFO_MAY_MISALIGN_STMTS (loop_vinfo).length () |
fb85abff | 2215 | >= (unsigned) PARAM_VALUE (PARAM_VECT_MAX_VERSION_FOR_ALIGNMENT_CHECKS)) |
2216 | { | |
2217 | do_versioning = false; | |
2218 | break; | |
2219 | } | |
2220 | ||
0219dc42 | 2221 | vectype = STMT_VINFO_VECTYPE (stmt_info); |
2222 | gcc_assert (vectype); | |
48e1416a | 2223 | |
52acb7ae | 2224 | /* At present we don't support versioning for alignment |
2225 | with variable VF, since there's no guarantee that the | |
2226 | VF is a power of two. We could relax this if we added | |
2227 | a way of enforcing a power-of-two size. */ | |
2228 | unsigned HOST_WIDE_INT size; | |
2229 | if (!GET_MODE_SIZE (TYPE_MODE (vectype)).is_constant (&size)) | |
2230 | { | |
2231 | do_versioning = false; | |
2232 | break; | |
2233 | } | |
2234 | ||
dff96e64 | 2235 | /* Forcing alignment in the first iteration is no good if |
2236 | we don't keep it across iterations. For now, just disable | |
2237 | versioning in this case. | |
2588e836 | 2238 | ?? We could actually unroll the loop to achieve the required |
2239 | overall step alignment, and forcing the alignment could be | |
dff96e64 | 2240 | done by doing some iterations of the non-vectorized loop. */ |
2241 | if (!multiple_p (LOOP_VINFO_VECT_FACTOR (loop_vinfo) | |
2242 | * DR_STEP_ALIGNMENT (dr), | |
2588e836 | 2243 | DR_TARGET_ALIGNMENT (dr_info))) |
dff96e64 | 2244 | { |
2245 | do_versioning = false; | |
2246 | break; | |
2247 | } | |
2248 | ||
fb85abff | 2249 | /* The rightmost bits of an aligned address must be zeros. |
2250 | Construct the mask needed for this test. For example, | |
2251 | GET_MODE_SIZE for the vector mode V4SI is 16 bytes so the | |
2252 | mask must be 15 = 0xf. */ | |
52acb7ae | 2253 | mask = size - 1; |
fb85abff | 2254 | |
2255 | /* FORNOW: use the same mask to test all potentially unaligned | |
2256 | references in the loop. The vectorizer currently supports | |
2257 | a single vector size, see the reference to | |
2258 | GET_MODE_NUNITS (TYPE_MODE (vectype)) where the | |
2259 | vectorization factor is computed. */ | |
2260 | gcc_assert (!LOOP_VINFO_PTR_MASK (loop_vinfo) | |
2261 | || LOOP_VINFO_PTR_MASK (loop_vinfo) == mask); | |
2262 | LOOP_VINFO_PTR_MASK (loop_vinfo) = mask; | |
0219dc42 | 2263 | LOOP_VINFO_MAY_MISALIGN_STMTS (loop_vinfo).safe_push (stmt_info); |
fb85abff | 2264 | } |
2265 | } | |
48e1416a | 2266 | |
fb85abff | 2267 | /* Versioning requires at least one misaligned data reference. */ |
10095225 | 2268 | if (!LOOP_REQUIRES_VERSIONING_FOR_ALIGNMENT (loop_vinfo)) |
fb85abff | 2269 | do_versioning = false; |
2270 | else if (!do_versioning) | |
f1f41a6c | 2271 | LOOP_VINFO_MAY_MISALIGN_STMTS (loop_vinfo).truncate (0); |
fb85abff | 2272 | } |
2273 | ||
2274 | if (do_versioning) | |
2275 | { | |
ab98e625 | 2276 | vec<stmt_vec_info> may_misalign_stmts |
fb85abff | 2277 | = LOOP_VINFO_MAY_MISALIGN_STMTS (loop_vinfo); |
ab98e625 | 2278 | stmt_vec_info stmt_info; |
fb85abff | 2279 | |
2280 | /* It can now be assumed that the data references in the statements | |
2281 | in LOOP_VINFO_MAY_MISALIGN_STMTS will be aligned in the version | |
2282 | of the loop being vectorized. */ | |
ab98e625 | 2283 | FOR_EACH_VEC_ELT (may_misalign_stmts, i, stmt_info) |
fb85abff | 2284 | { |
abc9513d | 2285 | dr_vec_info *dr_info = STMT_VINFO_DR_INFO (stmt_info); |
2286 | SET_DR_MISALIGNMENT (dr_info, 0); | |
6d8fb6cf | 2287 | if (dump_enabled_p ()) |
78bb46f5 | 2288 | dump_printf_loc (MSG_NOTE, vect_location, |
2289 | "Alignment of access forced using versioning.\n"); | |
fb85abff | 2290 | } |
2291 | ||
6d8fb6cf | 2292 | if (dump_enabled_p ()) |
78bb46f5 | 2293 | dump_printf_loc (MSG_NOTE, vect_location, |
2294 | "Versioning for alignment will be applied.\n"); | |
fb85abff | 2295 | |
2296 | /* Peeling and versioning can't be done together at this time. */ | |
2297 | gcc_assert (! (do_peeling && do_versioning)); | |
2298 | ||
ed9370cc | 2299 | opt_result stat = vect_verify_datarefs_alignment (loop_vinfo); |
fb85abff | 2300 | gcc_assert (stat); |
2301 | return stat; | |
2302 | } | |
2303 | ||
2304 | /* This point is reached if neither peeling nor versioning is being done. */ | |
2305 | gcc_assert (! (do_peeling || do_versioning)); | |
2306 | ||
ed9370cc | 2307 | opt_result stat = vect_verify_datarefs_alignment (loop_vinfo); |
fb85abff | 2308 | return stat; |
2309 | } | |
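
/* Editorial sketch (not part of the original file): the two core
   computations of the function above, on plain integers.  Both assume
   power-of-two alignments and sizes, as the code itself requires.  */

#include <stdint.h>

/* Scalar iterations to peel so a forward-stepping access whose first
   element is MISALIGN_BYTES past an aligned boundary reaches the next
   TARGET_ALIGN boundary (cf. npeel = (mis & (target_align - 1)) / size
   above, with mis = -DR_MISALIGNMENT for a positive step).  */
static unsigned
example_npeel (unsigned misalign_bytes, unsigned target_align,
	       unsigned dr_size)
{
  return ((- misalign_bytes) & (target_align - 1)) / dr_size;
}

/* The versioning test: an address is aligned when its low bits are zero,
   e.g. mask 0xf for a 16-byte V4SI vector.  */
static bool
example_runtime_aligned_p (const void *p, unsigned vector_size)
{
  return ((uintptr_t) p & (uintptr_t) (vector_size - 1)) == 0;
}

/* example_npeel (4, 16, 4) == 3: peeling three 4-byte elements reaches
   the next 16-byte boundary.  */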
2310 | ||
2311 | ||
91a74fc6 | 2312 | /* Function vect_find_same_alignment_drs. |
2313 | ||
db72d3bf | 2314 | Update group and alignment relations in VINFO according to the chosen |
91a74fc6 | 2315 | vectorization factor. */ |
2316 | ||
2317 | static void | |
db72d3bf | 2318 | vect_find_same_alignment_drs (vec_info *vinfo, data_dependence_relation *ddr) |
91a74fc6 | 2319 | { |
91a74fc6 | 2320 | struct data_reference *dra = DDR_A (ddr); |
2321 | struct data_reference *drb = DDR_B (ddr); | |
db72d3bf | 2322 | dr_vec_info *dr_info_a = vinfo->lookup_dr (dra); |
2323 | dr_vec_info *dr_info_b = vinfo->lookup_dr (drb); | |
abc9513d | 2324 | stmt_vec_info stmtinfo_a = dr_info_a->stmt; |
2325 | stmt_vec_info stmtinfo_b = dr_info_b->stmt; | |
91a74fc6 | 2326 | |
2327 | if (DDR_ARE_DEPENDENT (ddr) == chrec_known) | |
2328 | return; | |
2329 | ||
0822b158 | 2330 | if (dra == drb) |
91a74fc6 | 2331 | return; |
2332 | ||
fa681b45 | 2333 | if (STMT_VINFO_GATHER_SCATTER_P (stmtinfo_a) |
2334 | || STMT_VINFO_GATHER_SCATTER_P (stmtinfo_b)) | |
2335 | return; | |
2336 | ||
4f372c2c | 2337 | if (!operand_equal_p (DR_BASE_ADDRESS (dra), DR_BASE_ADDRESS (drb), 0) |
7d4e73a6 | 2338 | || !operand_equal_p (DR_OFFSET (dra), DR_OFFSET (drb), 0) |
2339 | || !operand_equal_p (DR_STEP (dra), DR_STEP (drb), 0)) | |
91a74fc6 | 2340 | return; |
2341 | ||
7d4e73a6 | 2342 | /* Two references with distance zero have the same alignment. */ |
c4d25d8a | 2343 | poly_offset_int diff = (wi::to_poly_offset (DR_INIT (dra)) |
2344 | - wi::to_poly_offset (DR_INIT (drb))); | |
2345 | if (maybe_ne (diff, 0)) | |
91a74fc6 | 2346 | { |
7d4e73a6 | 2347 | /* Get the wider of the two alignments. */ |
e092c20e | 2348 | poly_uint64 align_a = |
2349 | exact_div (vect_calculate_target_alignment (dr_info_a), | |
2350 | BITS_PER_UNIT); | |
2351 | poly_uint64 align_b = | |
2352 | exact_div (vect_calculate_target_alignment (dr_info_b), | |
2353 | BITS_PER_UNIT); | |
2354 | unsigned HOST_WIDE_INT align_a_c, align_b_c; | |
2355 | if (!align_a.is_constant (&align_a_c) | |
2356 | || !align_b.is_constant (&align_b_c)) | |
2357 | return; | |
2358 | ||
2359 | unsigned HOST_WIDE_INT max_align = MAX (align_a_c, align_b_c); | |
7d4e73a6 | 2360 | |
2361 | /* Require the gap to be a multiple of the larger vector alignment. */ | |
c4d25d8a | 2362 | if (!multiple_p (diff, max_align)) |
7d4e73a6 | 2363 | return; |
2364 | } | |
91a74fc6 | 2365 | |
7d4e73a6 | 2366 | STMT_VINFO_SAME_ALIGN_REFS (stmtinfo_a).safe_push (drb); |
2367 | STMT_VINFO_SAME_ALIGN_REFS (stmtinfo_b).safe_push (dra); | |
2368 | if (dump_enabled_p ()) | |
a4e972e3 | 2369 | dump_printf_loc (MSG_NOTE, vect_location, |
2370 | "accesses have the same alignment: %T and %T\n", | |
2371 | DR_REF (dra), DR_REF (drb)); | |
91a74fc6 | 2372 | } |
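
/* Editorial sketch (not part of the original file): the distance test
   above on plain integers.  Two refs with identical base, offset and step
   have the same misalignment whenever their constant start offsets differ
   by a multiple of the wider of the two target alignments (in bytes).  */

static bool
example_same_alignment_p (long init_a, long init_b,
			  unsigned long align_a, unsigned long align_b)
{
  long diff = init_a - init_b;
  long max_align = (long) (align_a > align_b ? align_a : align_b);
  return diff % max_align == 0;	/* diff == 0 trivially qualifies.  */
}

/* example_same_alignment_p (0, 32, 16, 8) is true: a 32-byte gap
   preserves 16-byte alignment.  A 24-byte gap would not.  */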
2373 | ||
2374 | ||
fb85abff | 2375 | /* Function vect_analyze_data_refs_alignment |
2376 | ||
2377 | Analyze the alignment of the data-references in the loop. | |
2378 | Return FALSE if a data reference is found that cannot be vectorized. */ | |
2379 | ||
ed9370cc | 2380 | opt_result |
2f6fec15 | 2381 | vect_analyze_data_refs_alignment (loop_vec_info vinfo) |
fb85abff | 2382 | { |
88f6eb8f | 2383 | DUMP_VECT_SCOPE ("vect_analyze_data_refs_alignment"); |
fb85abff | 2384 | |
91a74fc6 | 2385 | /* Mark groups of data references with same alignment using |
2386 | data dependence information. */ | |
a99aba41 | 2387 | vec<ddr_p> ddrs = vinfo->shared->ddrs; |
2f6fec15 | 2388 | struct data_dependence_relation *ddr; |
2389 | unsigned int i; | |
2390 | ||
2391 | FOR_EACH_VEC_ELT (ddrs, i, ddr) | |
db72d3bf | 2392 | vect_find_same_alignment_drs (vinfo, ddr); |
2f6fec15 | 2393 | |
a99aba41 | 2394 | vec<data_reference_p> datarefs = vinfo->shared->datarefs; |
2f6fec15 | 2395 | struct data_reference *dr; |
2396 | ||
4f372c2c | 2397 | vect_record_base_alignments (vinfo); |
2f6fec15 | 2398 | FOR_EACH_VEC_ELT (datarefs, i, dr) |
91a74fc6 | 2399 | { |
db72d3bf | 2400 | dr_vec_info *dr_info = vinfo->lookup_dr (dr); |
abc9513d | 2401 | if (STMT_VINFO_VECTORIZABLE (dr_info->stmt)) |
2402 | vect_compute_data_ref_alignment (dr_info); | |
91a74fc6 | 2403 | } |
2404 | ||
ed9370cc | 2405 | return opt_result::success (); |
2f6fec15 | 2406 | } |
2407 | ||
2408 | ||
2409 | /* Analyze alignment of DRs of stmts in NODE. */ | |
2410 | ||
2411 | static bool | |
2412 | vect_slp_analyze_and_verify_node_alignment (slp_tree node) | |
2413 | { | |
f6593f36 | 2414 | /* We vectorize from the first scalar stmt in the node unless |
2415 | the node is permuted, in which case we start from the first | |
2416 | element in the group. */ | |
06bb64b8 | 2417 | stmt_vec_info first_stmt_info = SLP_TREE_SCALAR_STMTS (node)[0]; |
abc9513d | 2418 | dr_vec_info *first_dr_info = STMT_VINFO_DR_INFO (first_stmt_info); |
f6593f36 | 2419 | if (SLP_TREE_LOAD_PERMUTATION (node).exists ()) |
cd24aa3c | 2420 | first_stmt_info = DR_GROUP_FIRST_ELEMENT (first_stmt_info); |
f6593f36 | 2421 | |
abc9513d | 2422 | dr_vec_info *dr_info = STMT_VINFO_DR_INFO (first_stmt_info); |
2423 | vect_compute_data_ref_alignment (dr_info); | |
fa681b45 | 2424 | /* For creating the data-ref pointer we need alignment of the |
2425 | first element anyway. */ | |
abc9513d | 2426 | if (dr_info != first_dr_info) |
2427 | vect_compute_data_ref_alignment (first_dr_info); | |
2428 | if (! verify_data_ref_alignment (dr_info)) | |
fb85abff | 2429 | { |
f6593f36 | 2430 | if (dump_enabled_p ()) |
2431 | dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location, | |
2432 | "not vectorized: bad data alignment in basic " | |
2433 | "block.\n"); | |
2434 | return false; | |
fb85abff | 2435 | } |
2436 | ||
2437 | return true; | |
2438 | } | |
2439 | ||
2f6fec15 | 2440 | /* Function vect_slp_analyze_and_verify_instance_alignment |
2441 | ||
2442 | Analyze the alignment of the data-references in the SLP instance. | |
2443 | Return FALSE if a data reference is found that cannot be vectorized. */ | |
2444 | ||
2445 | bool | |
2446 | vect_slp_analyze_and_verify_instance_alignment (slp_instance instance) | |
2447 | { | |
88f6eb8f | 2448 | DUMP_VECT_SCOPE ("vect_slp_analyze_and_verify_instance_alignment"); |
2f6fec15 | 2449 | |
2450 | slp_tree node; | |
2451 | unsigned i; | |
2452 | FOR_EACH_VEC_ELT (SLP_INSTANCE_LOADS (instance), i, node) | |
2453 | if (! vect_slp_analyze_and_verify_node_alignment (node)) | |
2454 | return false; | |
2455 | ||
2456 | node = SLP_INSTANCE_TREE (instance); | |
06bb64b8 | 2457 | if (STMT_VINFO_DATA_REF (SLP_TREE_SCALAR_STMTS (node)[0]) |
2f6fec15 | 2458 | && ! vect_slp_analyze_and_verify_node_alignment |
2459 | (SLP_INSTANCE_TREE (instance))) | |
2460 | return false; | |
2461 | ||
2462 | return true; | |
2463 | } | |
2464 | ||
fb85abff | 2465 | |
abc9513d | 2466 | /* Analyze groups of accesses: check that DR_INFO belongs to a group of |
ee612634 | 2467 | accesses of legal size, step, etc. Detect gaps, single element |
2468 | interleaving, and other special cases. Set grouped access info. | |
39e23eaa | 2469 | Collect groups of strided stores for further use in SLP analysis. |
2470 | Worker for vect_analyze_group_access. */ | |
fb85abff | 2471 | |
2472 | static bool | |
abc9513d | 2473 | vect_analyze_group_access_1 (dr_vec_info *dr_info) |
fb85abff | 2474 | { |
abc9513d | 2475 | data_reference *dr = dr_info->dr; |
fb85abff | 2476 | tree step = DR_STEP (dr); |
2477 | tree scalar_type = TREE_TYPE (DR_REF (dr)); | |
f9ae6f95 | 2478 | HOST_WIDE_INT type_size = TREE_INT_CST_LOW (TYPE_SIZE_UNIT (scalar_type)); |
abc9513d | 2479 | stmt_vec_info stmt_info = dr_info->stmt; |
fb85abff | 2480 | loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info); |
37545e54 | 2481 | bb_vec_info bb_vinfo = STMT_VINFO_BB_VINFO (stmt_info); |
994be998 | 2482 | HOST_WIDE_INT dr_step = -1; |
ee612634 | 2483 | HOST_WIDE_INT groupsize, last_accessed_element = 1; |
fb85abff | 2484 | bool slp_impossible = false; |
2485 | ||
ee612634 | 2486 | /* For interleaving, GROUPSIZE is STEP counted in elements, i.e., the |
2487 | size of the interleaving group (including gaps). */ | |
994be998 | 2488 | if (tree_fits_shwi_p (step)) |
2489 | { | |
2490 | dr_step = tree_to_shwi (step); | |
0d77042c | 2491 | /* Check that STEP is a multiple of type size. Otherwise there is |
2492 | a non-element-sized gap at the end of the group which we | |
e1009321 | 2493 | cannot represent in DR_GROUP_GAP or DR_GROUP_SIZE. |
0d77042c | 2494 | ??? As we can handle non-constant step fine here we should |
e1009321 | 2495 | simply remove uses of DR_GROUP_GAP between the last and first |
2496 | element and instead rely on DR_STEP. DR_GROUP_SIZE then would | |
0d77042c | 2497 | simply not include that gap. */ |
2498 | if ((dr_step % type_size) != 0) | |
2499 | { | |
2500 | if (dump_enabled_p ()) | |
a4e972e3 | 2501 | dump_printf_loc (MSG_NOTE, vect_location, |
2502 | "Step %T is not a multiple of the element size" | |
2503 | " for %T\n", | |
2504 | step, DR_REF (dr)); | |
0d77042c | 2505 | return false; |
2506 | } | |
994be998 | 2507 | groupsize = absu_hwi (dr_step) / type_size; |
2508 | } | |
2509 | else | |
2510 | groupsize = 0; | |
fb85abff | 2511 | |
2512 | /* A non-consecutive access is possible only if it is part of interleaving. */ | |
0219dc42 | 2513 | if (!DR_GROUP_FIRST_ELEMENT (stmt_info)) |
fb85abff | 2514 | { |
2515 | /* Check if this DR is a part of interleaving, and is a single | |
2516 | element of the group that is accessed in the loop. */ | |
48e1416a | 2517 | |
fb85abff | 2518 | /* Gaps are supported only for loads. STEP must be a multiple of the type |
f5d5e8fa | 2519 | size. */ |
fb85abff | 2520 | if (DR_IS_READ (dr) |
2521 | && (dr_step % type_size) == 0 | |
f5d5e8fa | 2522 | && groupsize > 0) |
fb85abff | 2523 | { |
0219dc42 | 2524 | DR_GROUP_FIRST_ELEMENT (stmt_info) = stmt_info; |
2525 | DR_GROUP_SIZE (stmt_info) = groupsize; | |
e1009321 | 2526 | DR_GROUP_GAP (stmt_info) = groupsize - 1; |
6d8fb6cf | 2527 | if (dump_enabled_p ()) |
a4e972e3 | 2528 | dump_printf_loc (MSG_NOTE, vect_location, |
2529 | "Detected single element interleaving %T" | |
2530 | " step %T\n", | |
2531 | DR_REF (dr), step); | |
a4ee7fac | 2532 | |
fb85abff | 2533 | return true; |
2534 | } | |
6ea6a380 | 2535 | |
6d8fb6cf | 2536 | if (dump_enabled_p ()) |
a4e972e3 | 2537 | dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location, |
2538 | "not consecutive access %G", stmt_info->stmt); | |
6ea6a380 | 2539 | |
2540 | if (bb_vinfo) | |
0219dc42 | 2541 | { |
2542 | /* Mark the statement as unvectorizable. */ | |
abc9513d | 2543 | STMT_VINFO_VECTORIZABLE (stmt_info) = false; |
0219dc42 | 2544 | return true; |
2545 | } | |
7bd765d4 | 2546 | |
91f42adc | 2547 | if (dump_enabled_p ()) |
2548 | dump_printf_loc (MSG_NOTE, vect_location, "using strided accesses\n"); | |
71de77d8 | 2549 | STMT_VINFO_STRIDED_P (stmt_info) = true; |
2550 | return true; | |
fb85abff | 2551 | } |
2552 | ||
0219dc42 | 2553 | if (DR_GROUP_FIRST_ELEMENT (stmt_info) == stmt_info) |
fb85abff | 2554 | { |
2555 | /* First stmt in the interleaving chain. Check the chain. */ | |
cd24aa3c | 2556 | stmt_vec_info next = DR_GROUP_NEXT_ELEMENT (stmt_info); |
fb85abff | 2557 | struct data_reference *data_ref = dr; |
1a0e7d51 | 2558 | unsigned int count = 1; |
fb85abff | 2559 | tree prev_init = DR_INIT (data_ref); |
8bbe6b75 | 2560 | HOST_WIDE_INT diff, gaps = 0; |
fb85abff | 2561 | |
c4d25d8a | 2562 | /* By construction, all group members have INTEGER_CST DR_INITs. */ |
fb85abff | 2563 | while (next) |
2564 | { | |
6883ce83 | 2565 | /* We never have the same DR multiple times. */ |
2566 | gcc_assert (tree_int_cst_compare (DR_INIT (data_ref), | |
2567 | DR_INIT (STMT_VINFO_DATA_REF (next))) != 0); | |
a4ee7fac | 2568 | |
cd24aa3c | 2569 | data_ref = STMT_VINFO_DATA_REF (next); |
fb85abff | 2570 | |
8bbe6b75 | 2571 | /* All group members have the same STEP by construction. */ |
2572 | gcc_checking_assert (operand_equal_p (DR_STEP (data_ref), step, 0)); | |
fb85abff | 2573 | |
fb85abff | 2574 | /* Check that the distance between two accesses is equal to the type |
2575 | size. Otherwise, we have gaps. */ | |
f9ae6f95 | 2576 | diff = (TREE_INT_CST_LOW (DR_INIT (data_ref)) |
2577 | - TREE_INT_CST_LOW (prev_init)) / type_size; | |
fb85abff | 2578 | if (diff != 1) |
2579 | { | |
2580 | /* FORNOW: SLP of accesses with gaps is not supported. */ | |
2581 | slp_impossible = true; | |
9ff25603 | 2582 | if (DR_IS_WRITE (data_ref)) |
fb85abff | 2583 | { |
6d8fb6cf | 2584 | if (dump_enabled_p ()) |
78bb46f5 | 2585 | dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location, |
2586 | "interleaved store with gaps\n"); | |
fb85abff | 2587 | return false; |
2588 | } | |
b11576bf | 2589 | |
2590 | gaps += diff - 1; | |
fb85abff | 2591 | } |
2592 | ||
a4ee7fac | 2593 | last_accessed_element += diff; |
2594 | ||
fb85abff | 2595 | /* Store the gap from the previous member of the group. If there is no |
e1009321 | 2596 | gap in the access, DR_GROUP_GAP is always 1. */ |
cd24aa3c | 2597 | DR_GROUP_GAP (next) = diff; |
fb85abff | 2598 | |
cd24aa3c | 2599 | prev_init = DR_INIT (data_ref); |
2600 | next = DR_GROUP_NEXT_ELEMENT (next); | |
2601 | /* Count the number of data-refs in the chain. */ | |
2602 | count++; | |
fb85abff | 2603 | } |
2604 | ||
994be998 | 2605 | if (groupsize == 0) |
2606 | groupsize = count + gaps; | |
fb85abff | 2607 | |
26aad5fc | 2608 | /* This could be UINT_MAX, but as we generate code in a very
2609 | inefficient way we have to cap it earlier. See PR78699 for an example. */ | |
2610 | if (groupsize > 4096) | |
39e23eaa | 2611 | { |
2612 | if (dump_enabled_p ()) | |
2613 | dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location, | |
2614 | "group is too large\n"); | |
2615 | return false; | |
2616 | } | |
2617 | ||
994be998 | 2618 | /* Check that the size of the interleaving is equal to count for stores, |
fb85abff | 2619 | i.e., that there are no gaps. */ |
904bd865 | 2620 | if (groupsize != count |
2621 | && !DR_IS_READ (dr)) | |
fb85abff | 2622 | { |
05b97b35 | 2623 | groupsize = count; |
2624 | STMT_VINFO_STRIDED_P (stmt_info) = true; | |
904bd865 | 2625 | } |
2626 | ||
2627 | /* If there is a gap after the last load in the group, it is the | |
2628 | difference between the groupsize and the last accessed | |
2629 | element. | |
2630 | When there is no gap, this difference should be 0. */ | |
0219dc42 | 2631 | DR_GROUP_GAP (stmt_info) = groupsize - last_accessed_element; |
fb85abff | 2632 | |
0219dc42 | 2633 | DR_GROUP_SIZE (stmt_info) = groupsize; |
6d8fb6cf | 2634 | if (dump_enabled_p ()) |
904bd865 | 2635 | { |
2636 | dump_printf_loc (MSG_NOTE, vect_location, | |
39e23eaa | 2637 | "Detected interleaving "); |
2638 | if (DR_IS_READ (dr)) | |
2639 | dump_printf (MSG_NOTE, "load "); | |
05b97b35 | 2640 | else if (STMT_VINFO_STRIDED_P (stmt_info)) |
2641 | dump_printf (MSG_NOTE, "strided store "); | |
39e23eaa | 2642 | else |
2643 | dump_printf (MSG_NOTE, "store "); | |
b4d2979c | 2644 | dump_printf (MSG_NOTE, "of size %u\n", |
2645 | (unsigned)groupsize); | |
2646 | dump_printf_loc (MSG_NOTE, vect_location, "\t%G", stmt_info->stmt); | |
2647 | next = DR_GROUP_NEXT_ELEMENT (stmt_info); | |
2648 | while (next) | |
2649 | { | |
2650 | if (DR_GROUP_GAP (next) != 1) | |
2651 | dump_printf_loc (MSG_NOTE, vect_location, | |
2652 | "\t<gap of %d elements>\n", | |
2653 | DR_GROUP_GAP (next) - 1); | |
2654 | dump_printf_loc (MSG_NOTE, vect_location, "\t%G", next->stmt); | |
2655 | next = DR_GROUP_NEXT_ELEMENT (next); | |
2656 | } | |
0219dc42 | 2657 | if (DR_GROUP_GAP (stmt_info) != 0) |
904bd865 | 2658 | dump_printf_loc (MSG_NOTE, vect_location, |
b4d2979c | 2659 | "\t<gap of %d elements>\n", |
0219dc42 | 2660 | DR_GROUP_GAP (stmt_info)); |
904bd865 | 2661 | } |
fb85abff | 2662 | |
48e1416a | 2663 | /* SLP: create an SLP data structure for every interleaving group of |
fb85abff | 2664 | stores for further analysis in vect_analyze_slp. */
9ff25603 | 2665 | if (DR_IS_WRITE (dr) && !slp_impossible) |
0219dc42 | 2666 | { |
2667 | if (loop_vinfo) | |
2668 | LOOP_VINFO_GROUPED_STORES (loop_vinfo).safe_push (stmt_info); | |
2669 | if (bb_vinfo) | |
2670 | BB_VINFO_GROUPED_STORES (bb_vinfo).safe_push (stmt_info); | |
2671 | } | |
fb85abff | 2672 | } |
2673 | ||
2674 | return true; | |
2675 | } | |
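
The groupsize/gap arithmetic above is easiest to check on a concrete pattern. Below is a standalone C sketch, not GCC code: the loop shape (reads of a[4*i] and a[4*i+2] with int elements), the variable names and the values are all invented for illustration, but the computations mirror GROUPSIZE, the per-member DIFF and the trailing DR_GROUP_GAP.

#include <stdio.h>
#include <stdlib.h>

int
main (void)
{
  long step = 16;               /* DR_STEP in bytes for a[4*i].  */
  long type_size = 4;           /* sizeof (int).  */
  long inits[] = { 0, 8 };      /* DR_INITs of a[4*i] and a[4*i+2].  */

  /* GROUPSIZE is the step counted in elements, including gaps.  */
  long groupsize = labs (step) / type_size;

  long last_accessed_element = 1;
  for (int i = 1; i < 2; i++)
    {
      /* Element distance to the previous member; 1 means no gap.  */
      long diff = (inits[i] - inits[i - 1]) / type_size;
      last_accessed_element += diff;
    }

  /* The gap at the end of the group, as stored in DR_GROUP_GAP of
     the first statement.  */
  long tail_gap = groupsize - last_accessed_element;

  /* Prints groupsize=4 last=3 tail_gap=1: element 1 is a gap between
     the members, element 3 is the gap after the last one.  */
  printf ("groupsize=%ld last=%ld tail_gap=%ld\n",
          groupsize, last_accessed_element, tail_gap);
  return 0;
}
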
2676 | ||
abc9513d | 2677 | /* Analyze groups of accesses: check that DR_INFO belongs to a group of |
39e23eaa | 2678 | accesses of legal size, step, etc. Detect gaps, single element |
2679 | interleaving, and other special cases. Set grouped access info. | |
2680 | Collect groups of strided stores for further use in SLP analysis. */ | |
2681 | ||
2682 | static bool | |
abc9513d | 2683 | vect_analyze_group_access (dr_vec_info *dr_info) |
39e23eaa | 2684 | { |
abc9513d | 2685 | if (!vect_analyze_group_access_1 (dr_info)) |
39e23eaa | 2686 | { |
2687 | /* Dissolve the group if present. */ | |
abc9513d | 2688 | stmt_vec_info stmt_info = DR_GROUP_FIRST_ELEMENT (dr_info->stmt); |
cd24aa3c | 2689 | while (stmt_info) |
39e23eaa | 2690 | { |
cd24aa3c | 2691 | stmt_vec_info next = DR_GROUP_NEXT_ELEMENT (stmt_info); |
2692 | DR_GROUP_FIRST_ELEMENT (stmt_info) = NULL; | |
2693 | DR_GROUP_NEXT_ELEMENT (stmt_info) = NULL; | |
2694 | stmt_info = next; | |
39e23eaa | 2695 | } |
2696 | return false; | |
2697 | } | |
2698 | return true; | |
2699 | } | |
fb85abff | 2700 | |
abc9513d | 2701 | /* Analyze the access pattern of the data-reference DR_INFO. |
fb85abff | 2702 | In case of non-consecutive accesses call vect_analyze_group_access() to |
ee612634 | 2703 | analyze groups of accesses. */ |
fb85abff | 2704 | |
2705 | static bool | |
abc9513d | 2706 | vect_analyze_data_ref_access (dr_vec_info *dr_info) |
fb85abff | 2707 | { |
abc9513d | 2708 | data_reference *dr = dr_info->dr; |
fb85abff | 2709 | tree step = DR_STEP (dr); |
2710 | tree scalar_type = TREE_TYPE (DR_REF (dr)); | |
abc9513d | 2711 | stmt_vec_info stmt_info = dr_info->stmt; |
fb85abff | 2712 | loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info); |
2e966e2a | 2713 | class loop *loop = NULL; |
fb85abff | 2714 | |
0bf8b382 | 2715 | if (STMT_VINFO_GATHER_SCATTER_P (stmt_info)) |
2716 | return true; | |
2717 | ||
37545e54 | 2718 | if (loop_vinfo) |
2719 | loop = LOOP_VINFO_LOOP (loop_vinfo); | |
48e1416a | 2720 | |
37545e54 | 2721 | if (loop_vinfo && !step) |
fb85abff | 2722 | { |
6d8fb6cf | 2723 | if (dump_enabled_p ()) |
78bb46f5 | 2724 | dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location, |
2725 | "bad data-ref access in loop\n"); | |
fb85abff | 2726 | return false; |
2727 | } | |
2728 | ||
9b0be19c | 2729 | /* Allow loads with zero step in inner-loop vectorization. */ |
f634c3e9 | 2730 | if (loop_vinfo && integer_zerop (step)) |
b04940e7 | 2731 | { |
0219dc42 | 2732 | DR_GROUP_FIRST_ELEMENT (stmt_info) = NULL; |
2733 | if (!nested_in_vect_loop_p (loop, stmt_info)) | |
9b0be19c | 2734 | return DR_IS_READ (dr); |
2735 | /* Allow references with zero step for outer loops marked | |
2736 | with pragma omp simd only - it guarantees absence of | |
2737 | loop-carried dependencies between inner loop iterations. */ | |
84017e0e | 2738 | if (loop->safelen < 2) |
afa60cb4 | 2739 | { |
2740 | if (dump_enabled_p ()) | |
2741 | dump_printf_loc (MSG_NOTE, vect_location, | |
78bb46f5 | 2742 | "zero step in inner loop of nest\n"); |
afa60cb4 | 2743 | return false; |
2744 | } | |
b04940e7 | 2745 | } |
fb85abff | 2746 | |
0219dc42 | 2747 | if (loop && nested_in_vect_loop_p (loop, stmt_info)) |
fb85abff | 2748 | { |
2749 | /* Interleaved accesses are not yet supported within outer-loop | |
2750 | vectorization for references in the inner-loop. */ | |
0219dc42 | 2751 | DR_GROUP_FIRST_ELEMENT (stmt_info) = NULL; |
fb85abff | 2752 | |
2753 | /* For the rest of the analysis we use the outer-loop step. */ | |
2754 | step = STMT_VINFO_DR_STEP (stmt_info); | |
f634c3e9 | 2755 | if (integer_zerop (step)) |
fb85abff | 2756 | { |
6d8fb6cf | 2757 | if (dump_enabled_p ()) |
7bd765d4 | 2758 | dump_printf_loc (MSG_NOTE, vect_location, |
78bb46f5 | 2759 | "zero step in outer loop.\n"); |
0bd6d857 | 2760 | return DR_IS_READ (dr); |
fb85abff | 2761 | } |
2762 | } | |
2763 | ||
2764 | /* Consecutive? */ | |
f634c3e9 | 2765 | if (TREE_CODE (step) == INTEGER_CST) |
fb85abff | 2766 | { |
f9ae6f95 | 2767 | HOST_WIDE_INT dr_step = TREE_INT_CST_LOW (step); |
f634c3e9 | 2768 | if (!tree_int_cst_compare (step, TYPE_SIZE_UNIT (scalar_type)) |
2769 | || (dr_step < 0 | |
2770 | && !compare_tree_int (TYPE_SIZE_UNIT (scalar_type), -dr_step))) | |
2771 | { | |
2772 | /* Mark that it is not interleaving. */ | |
0219dc42 | 2773 | DR_GROUP_FIRST_ELEMENT (stmt_info) = NULL; |
f634c3e9 | 2774 | return true; |
2775 | } | |
fb85abff | 2776 | } |
2777 | ||
0219dc42 | 2778 | if (loop && nested_in_vect_loop_p (loop, stmt_info)) |
fb85abff | 2779 | { |
6d8fb6cf | 2780 | if (dump_enabled_p ()) |
7bd765d4 | 2781 | dump_printf_loc (MSG_NOTE, vect_location, |
78bb46f5 | 2782 | "grouped access in outer loop.\n"); |
fb85abff | 2783 | return false; |
2784 | } | |
2785 | ||
994be998 | 2786 | |
f634c3e9 | 2787 | /* Assume this is a DR handled by the non-constant strided load case. */
2788 | if (TREE_CODE (step) != INTEGER_CST) | |
e1c75243 | 2789 | return (STMT_VINFO_STRIDED_P (stmt_info) |
994be998 | 2790 | && (!STMT_VINFO_GROUPED_ACCESS (stmt_info) |
abc9513d | 2791 | || vect_analyze_group_access (dr_info))); |
f634c3e9 | 2792 | |
fb85abff | 2793 | /* Not consecutive access - check if it's a part of interleaving group. */ |
abc9513d | 2794 | return vect_analyze_group_access (dr_info); |
fb85abff | 2795 | } |
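
As a rough summary of the dispatch above: a zero step is acceptable only for loads, a step equal to the scalar type size is a plain consecutive access, and everything else is deferred to the group analysis. A minimal standalone sketch of that classification follows; the enum and the classify_access helper are invented for illustration, not a GCC API, and the sketch ignores the outer-loop and gather/scatter special cases.

#include <stdio.h>
#include <stdlib.h>

enum access_kind { CONSECUTIVE, ZERO_STEP, MAYBE_GROUPED };

static enum access_kind
classify_access (long step, long type_size)
{
  if (step == 0)
    return ZERO_STEP;           /* Only OK for loads, as above.  */
  if (labs (step) == type_size)
    return CONSECUTIVE;         /* Simple step-1 access.  */
  return MAYBE_GROUPED;         /* Needs vect_analyze_group_access.  */
}

int
main (void)
{
  printf ("%d %d %d\n",
          classify_access (4, 4),    /* a[i], int: consecutive.  */
          classify_access (0, 4),    /* a[0]: zero step.  */
          classify_access (16, 4));  /* a[4*i]: grouped/strided.  */
  return 0;
}
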
2796 | ||
68f15e9d | 2797 | /* Compare two data-references DRA and DRB, ordering them so that
2798 | references that may form a group end up adjacent. */ | |
2799 | ||
2800 | static int | |
2801 | dr_group_sort_cmp (const void *dra_, const void *drb_) | |
2802 | { | |
2803 | data_reference_p dra = *(data_reference_p *)const_cast<void *>(dra_); | |
2804 | data_reference_p drb = *(data_reference_p *)const_cast<void *>(drb_); | |
68f15e9d | 2805 | int cmp; |
2806 | ||
2807 | /* Stabilize sort. */ | |
2808 | if (dra == drb) | |
2809 | return 0; | |
2810 | ||
8167d6ad | 2811 | /* DRs in different loops never belong to the same group. */ |
2812 | loop_p loopa = gimple_bb (DR_STMT (dra))->loop_father; | |
2813 | loop_p loopb = gimple_bb (DR_STMT (drb))->loop_father; | |
2814 | if (loopa != loopb) | |
2815 | return loopa->num < loopb->num ? -1 : 1; | |
2816 | ||
68f15e9d | 2817 | /* Ordering of DRs according to base. */ |
ce55060f | 2818 | cmp = data_ref_compare_tree (DR_BASE_ADDRESS (dra), |
2819 | DR_BASE_ADDRESS (drb)); | |
2820 | if (cmp != 0) | |
2821 | return cmp; | |
68f15e9d | 2822 | |
2823 | /* And according to DR_OFFSET. */ | |
ce55060f | 2824 | cmp = data_ref_compare_tree (DR_OFFSET (dra), DR_OFFSET (drb)); |
2825 | if (cmp != 0) | |
2826 | return cmp; | |
68f15e9d | 2827 | |
2828 | /* Put reads before writes. */ | |
2829 | if (DR_IS_READ (dra) != DR_IS_READ (drb)) | |
2830 | return DR_IS_READ (dra) ? -1 : 1; | |
2831 | ||
2832 | /* Then sort by access size. */ | |
ce55060f | 2833 | cmp = data_ref_compare_tree (TYPE_SIZE_UNIT (TREE_TYPE (DR_REF (dra))), |
2834 | TYPE_SIZE_UNIT (TREE_TYPE (DR_REF (drb)))); | |
2835 | if (cmp != 0) | |
2836 | return cmp; | |
68f15e9d | 2837 | |
2838 | /* And by step. */ | |
ce55060f | 2839 | cmp = data_ref_compare_tree (DR_STEP (dra), DR_STEP (drb)); |
2840 | if (cmp != 0) | |
2841 | return cmp; | |
68f15e9d | 2842 | |
2843 | /* Then sort by DR_INIT. In case of identical DRs sort by stmt UID. */ | |
8672ee56 | 2844 | cmp = data_ref_compare_tree (DR_INIT (dra), DR_INIT (drb)); |
68f15e9d | 2845 | if (cmp == 0) |
2846 | return gimple_uid (DR_STMT (dra)) < gimple_uid (DR_STMT (drb)) ? -1 : 1; | |
2847 | return cmp; | |
2848 | } | |
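
The comparator above must induce a total order, which is why identical keys fall back to the statement UID. Here is a compilable miniature of the same sort-key idea; struct fake_dr is an invented stand-in for data_reference, and only a few of the keys (base, offset, read-before-write, init) are kept.

#include <stdio.h>
#include <stdlib.h>

struct fake_dr { long base, offset, init; int is_read; unsigned uid; };

static int
cmp_long (long a, long b)
{
  return (a > b) - (a < b);
}

static int
fake_dr_cmp (const void *pa, const void *pb)
{
  const struct fake_dr *a = pa;
  const struct fake_dr *b = pb;
  int c;

  if (a == b)
    return 0;                     /* Stabilize sort.  */
  if ((c = cmp_long (a->base, b->base)) != 0)
    return c;
  if ((c = cmp_long (a->offset, b->offset)) != 0)
    return c;
  if (a->is_read != b->is_read)
    return a->is_read ? -1 : 1;   /* Reads before writes.  */
  if ((c = cmp_long (a->init, b->init)) != 0)
    return c;
  /* Identical keys: fall back to the unique statement uid so the
     order is total.  */
  return a->uid < b->uid ? -1 : 1;
}

int
main (void)
{
  struct fake_dr drs[] = { { 0, 0, 8, 1, 2 }, { 0, 0, 0, 1, 1 } };
  qsort (drs, 2, sizeof (drs[0]), fake_dr_cmp);
  printf ("first init=%ld\n", drs[0].init);   /* Prints 0.  */
  return 0;
}
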
fb85abff | 2849 | |
2dd8e84c | 2850 | /* If OP is the result of a conversion, return the unconverted value, |
2851 | otherwise return null. */ | |
2852 | ||
2853 | static tree | |
2854 | strip_conversion (tree op) | |
2855 | { | |
2856 | if (TREE_CODE (op) != SSA_NAME) | |
2857 | return NULL_TREE; | |
2858 | gimple *stmt = SSA_NAME_DEF_STMT (op); | |
2859 | if (!is_gimple_assign (stmt) | |
2860 | || !CONVERT_EXPR_CODE_P (gimple_assign_rhs_code (stmt))) | |
2861 | return NULL_TREE; | |
2862 | return gimple_assign_rhs1 (stmt); | |
2863 | } | |
2864 | ||
ecc42a77 | 2865 | /* Return true if vectorizable_* routines can handle statements STMT1_INFO |
f92474f8 | 2866 | and STMT2_INFO being in a single group. When ALLOW_SLP_P, masked loads can |
2867 | be grouped in SLP mode. */ | |
2dd8e84c | 2868 | |
2869 | static bool | |
f92474f8 | 2870 | can_group_stmts_p (stmt_vec_info stmt1_info, stmt_vec_info stmt2_info, |
2871 | bool allow_slp_p) | |
2dd8e84c | 2872 | { |
ecc42a77 | 2873 | if (gimple_assign_single_p (stmt1_info->stmt)) |
2874 | return gimple_assign_single_p (stmt2_info->stmt); | |
2dd8e84c | 2875 | |
ecc42a77 | 2876 | gcall *call1 = dyn_cast <gcall *> (stmt1_info->stmt); |
5b4b7bcc | 2877 | if (call1 && gimple_call_internal_p (call1)) |
2dd8e84c | 2878 | { |
2879 | /* Check for two masked loads or two masked stores. */ | |
ecc42a77 | 2880 | gcall *call2 = dyn_cast <gcall *> (stmt2_info->stmt); |
5b4b7bcc | 2881 | if (!call2 || !gimple_call_internal_p (call2)) |
2dd8e84c | 2882 | return false; |
5b4b7bcc | 2883 | internal_fn ifn = gimple_call_internal_fn (call1); |
2dd8e84c | 2884 | if (ifn != IFN_MASK_LOAD && ifn != IFN_MASK_STORE) |
2885 | return false; | |
5b4b7bcc | 2886 | if (ifn != gimple_call_internal_fn (call2)) |
2dd8e84c | 2887 | return false; |
2888 | ||
2889 | /* Check that the masks are the same. Cope with casts of masks, | |
2890 | like those created by build_mask_conversion. */ | |
5b4b7bcc | 2891 | tree mask1 = gimple_call_arg (call1, 2); |
2892 | tree mask2 = gimple_call_arg (call2, 2); | |
f92474f8 | 2893 | if (!operand_equal_p (mask1, mask2, 0) |
2894 | && (ifn == IFN_MASK_STORE || !allow_slp_p)) | |
2dd8e84c | 2895 | { |
2896 | mask1 = strip_conversion (mask1); | |
2897 | if (!mask1) | |
2898 | return false; | |
2899 | mask2 = strip_conversion (mask2); | |
2900 | if (!mask2) | |
2901 | return false; | |
2902 | if (!operand_equal_p (mask1, mask2, 0)) | |
2903 | return false; | |
2904 | } | |
2905 | return true; | |
2906 | } | |
2907 | ||
2908 | return false; | |
2909 | } | |
2910 | ||
fb85abff | 2911 | /* Function vect_analyze_data_ref_accesses. |
2912 | ||
2913 | Analyze the access pattern of all the data references in the loop. | |
2914 | ||
2915 | FORNOW: the only access pattern that is considered vectorizable is a | |
2916 | simple step 1 (consecutive) access. | |
2917 | ||
2918 | FORNOW: handle only arrays and pointer accesses. */ | |
2919 | ||
ed9370cc | 2920 | opt_result |
e2c5c678 | 2921 | vect_analyze_data_ref_accesses (vec_info *vinfo) |
fb85abff | 2922 | { |
2923 | unsigned int i; | |
a99aba41 | 2924 | vec<data_reference_p> datarefs = vinfo->shared->datarefs; |
fb85abff | 2925 | struct data_reference *dr; |
2926 | ||
88f6eb8f | 2927 | DUMP_VECT_SCOPE ("vect_analyze_data_ref_accesses"); |
fb85abff | 2928 | |
68f15e9d | 2929 | if (datarefs.is_empty ()) |
ed9370cc | 2930 | return opt_result::success (); |
68f15e9d | 2931 | |
2932 | /* Sort the array of datarefs to make building the interleaving chains | |
863a3781 | 2933 | linear. Don't modify the original vector's order; it is needed for
2934 | determining what dependencies are reversed. */ | |
2935 | vec<data_reference_p> datarefs_copy = datarefs.copy (); | |
90a2d741 | 2936 | datarefs_copy.qsort (dr_group_sort_cmp); |
e0599ca4 | 2937 | hash_set<stmt_vec_info> to_fixup; |
68f15e9d | 2938 | |
2939 | /* Build the interleaving chains. */ | |
863a3781 | 2940 | for (i = 0; i < datarefs_copy.length () - 1;) |
68f15e9d | 2941 | { |
863a3781 | 2942 | data_reference_p dra = datarefs_copy[i]; |
db72d3bf | 2943 | dr_vec_info *dr_info_a = vinfo->lookup_dr (dra); |
abc9513d | 2944 | stmt_vec_info stmtinfo_a = dr_info_a->stmt; |
68f15e9d | 2945 | stmt_vec_info lastinfo = NULL; |
ab053afe | 2946 | if (!STMT_VINFO_VECTORIZABLE (stmtinfo_a) |
2947 | || STMT_VINFO_GATHER_SCATTER_P (stmtinfo_a)) | |
f6aeb966 | 2948 | { |
2949 | ++i; | |
2950 | continue; | |
2951 | } | |
863a3781 | 2952 | for (i = i + 1; i < datarefs_copy.length (); ++i) |
68f15e9d | 2953 | { |
863a3781 | 2954 | data_reference_p drb = datarefs_copy[i]; |
db72d3bf | 2955 | dr_vec_info *dr_info_b = vinfo->lookup_dr (drb); |
abc9513d | 2956 | stmt_vec_info stmtinfo_b = dr_info_b->stmt; |
ab053afe | 2957 | if (!STMT_VINFO_VECTORIZABLE (stmtinfo_b) |
2958 | || STMT_VINFO_GATHER_SCATTER_P (stmtinfo_b)) | |
f6aeb966 | 2959 | break; |
68f15e9d | 2960 | |
2961 | /* ??? Imperfect sorting (non-compatible types, non-modulo | |
2962 | accesses, same accesses) can lead to a group being artificially | |
2963 | split here as we don't just skip over those. If it really | |
2964 | matters we can push those to a worklist and re-iterate | |
2965 | over them. Then we can just skip ahead to the next DR here. */ | |
2966 | ||
8167d6ad | 2967 | /* DRs in a different loop should not be put into the same |
2968 | interleaving group. */ | |
2969 | if (gimple_bb (DR_STMT (dra))->loop_father | |
2970 | != gimple_bb (DR_STMT (drb))->loop_father) | |
2971 | break; | |
2972 | ||
68f15e9d | 2973 | /* Check that the data-refs have the same first location (except init)
5c0fac99 | 2974 | and that they are both either store or load (not load and store,
2975 | not masked loads or stores). */ | |
68f15e9d | 2976 | if (DR_IS_READ (dra) != DR_IS_READ (drb) |
ce55060f | 2977 | || data_ref_compare_tree (DR_BASE_ADDRESS (dra), |
2978 | DR_BASE_ADDRESS (drb)) != 0 | |
2979 | || data_ref_compare_tree (DR_OFFSET (dra), DR_OFFSET (drb)) != 0 | |
f92474f8 | 2980 | || !can_group_stmts_p (stmtinfo_a, stmtinfo_b, true)) |
68f15e9d | 2981 | break; |
2982 | ||
994be998 | 2983 | /* Check that the data-refs have the same constant size. */ |
68f15e9d | 2984 | tree sza = TYPE_SIZE_UNIT (TREE_TYPE (DR_REF (dra))); |
2985 | tree szb = TYPE_SIZE_UNIT (TREE_TYPE (DR_REF (drb))); | |
e913b5cd | 2986 | if (!tree_fits_uhwi_p (sza) |
2987 | || !tree_fits_uhwi_p (szb) | |
994be998 | 2988 | || !tree_int_cst_equal (sza, szb)) |
2989 | break; | |
2990 | ||
2991 | /* Check that the data-refs have the same step. */ | |
ce55060f | 2992 | if (data_ref_compare_tree (DR_STEP (dra), DR_STEP (drb)) != 0) |
68f15e9d | 2993 | break; |
2994 | ||
68f15e9d | 2995 | /* Check the types are compatible. |
2996 | ??? We don't distinguish this during sorting. */ | |
2997 | if (!types_compatible_p (TREE_TYPE (DR_REF (dra)), | |
2998 | TREE_TYPE (DR_REF (drb)))) | |
2999 | break; | |
3000 | ||
c4d25d8a | 3001 | /* Check that the DR_INITs are compile-time constants. */ |
3002 | if (TREE_CODE (DR_INIT (dra)) != INTEGER_CST | |
3003 | || TREE_CODE (DR_INIT (drb)) != INTEGER_CST) | |
3004 | break; | |
3005 | ||
da008d72 | 3006 | /* Different .GOMP_SIMD_LANE calls still give the same lane, |
3007 | just hold extra information. */ | |
3008 | if (STMT_VINFO_SIMD_LANE_ACCESS_P (stmtinfo_a) | |
3009 | && STMT_VINFO_SIMD_LANE_ACCESS_P (stmtinfo_b) | |
3010 | && data_ref_compare_tree (DR_INIT (dra), DR_INIT (drb)) == 0) | |
3011 | break; | |
3012 | ||
68f15e9d | 3013 | /* Sorting has ensured that DR_INIT (dra) <= DR_INIT (drb). */ |
f9ae6f95 | 3014 | HOST_WIDE_INT init_a = TREE_INT_CST_LOW (DR_INIT (dra)); |
3015 | HOST_WIDE_INT init_b = TREE_INT_CST_LOW (DR_INIT (drb)); | |
9c9cb9cf | 3016 | HOST_WIDE_INT init_prev |
3017 | = TREE_INT_CST_LOW (DR_INIT (datarefs_copy[i-1])); | |
3018 | gcc_assert (init_a <= init_b | |
3019 | && init_a <= init_prev | |
3020 | && init_prev <= init_b); | |
3021 | ||
3022 | /* Do not place the same access in the interleaving chain twice. */ | |
3023 | if (init_b == init_prev) | |
3024 | { | |
3025 | gcc_assert (gimple_uid (DR_STMT (datarefs_copy[i-1])) | |
3026 | < gimple_uid (DR_STMT (drb))); | |
e0599ca4 | 3027 | /* Simply link in duplicates and fix up the chain below. */ |
9c9cb9cf | 3028 | } |
e0599ca4 | 3029 | else |
994be998 | 3030 | { |
e0599ca4 | 3031 | /* If init_b == init_a + the size of the type * k, we have an |
3032 | interleaving, and DRA is accessed before DRB. */ | |
3033 | HOST_WIDE_INT type_size_a = tree_to_uhwi (sza); | |
3034 | if (type_size_a == 0 | |
3035 | || (init_b - init_a) % type_size_a != 0) | |
994be998 | 3036 | break; |
e0599ca4 | 3037 | |
3038 | /* If we have a store, the accesses are adjacent. This splits | |
3039 | groups into chunks we support (we don't support vectorization | |
3040 | of stores with gaps). */ | |
3041 | if (!DR_IS_READ (dra) && init_b - init_prev != type_size_a) | |
3042 | break; | |
3043 | ||
3044 | /* If the step (when constant and nonzero) is greater than the | |
3045 | difference between the data-refs' inits, this splits groups into | |
3046 | suitable sizes. */ | |
3047 | if (tree_fits_shwi_p (DR_STEP (dra))) | |
3048 | { | |
3049 | HOST_WIDE_INT step = tree_to_shwi (DR_STEP (dra)); | |
3050 | if (step != 0 && step <= (init_b - init_a)) | |
3051 | break; | |
3052 | } | |
994be998 | 3053 | } |
68f15e9d | 3054 | |
3055 | if (dump_enabled_p ()) | |
a4e972e3 | 3056 | dump_printf_loc (MSG_NOTE, vect_location, |
3057 | DR_IS_READ (dra) | |
3058 | ? "Detected interleaving load %T and %T\n" | |
3059 | : "Detected interleaving store %T and %T\n", | |
3060 | DR_REF (dra), DR_REF (drb)); | |
68f15e9d | 3061 | |
3062 | /* Link the found element into the group list. */ | |
e1009321 | 3063 | if (!DR_GROUP_FIRST_ELEMENT (stmtinfo_a)) |
68f15e9d | 3064 | { |
1c2fef9a | 3065 | DR_GROUP_FIRST_ELEMENT (stmtinfo_a) = stmtinfo_a; |
68f15e9d | 3066 | lastinfo = stmtinfo_a; |
3067 | } | |
1c2fef9a | 3068 | DR_GROUP_FIRST_ELEMENT (stmtinfo_b) = stmtinfo_a; |
3069 | DR_GROUP_NEXT_ELEMENT (lastinfo) = stmtinfo_b; | |
68f15e9d | 3070 | lastinfo = stmtinfo_b; |
e0599ca4 | 3071 | |
f92474f8 | 3072 | STMT_VINFO_SLP_VECT_ONLY (stmtinfo_a) |
3073 | = !can_group_stmts_p (stmtinfo_a, stmtinfo_b, false); | |
3074 | ||
3075 | if (dump_enabled_p () && STMT_VINFO_SLP_VECT_ONLY (stmtinfo_a)) | |
3076 | dump_printf_loc (MSG_NOTE, vect_location, | |
3077 | "Load suitable for SLP vectorization only.\n"); | |
3078 | ||
e0599ca4 | 3079 | if (init_b == init_prev |
3080 | && !to_fixup.add (DR_GROUP_FIRST_ELEMENT (stmtinfo_a)) | |
3081 | && dump_enabled_p ()) | |
3082 | dump_printf_loc (MSG_NOTE, vect_location, | |
3083 | "Queuing group with duplicate access for fixup\n"); | |
68f15e9d | 3084 | } |
3085 | } | |
3086 | ||
e0599ca4 | 3087 | /* Fix up groups with duplicate entries by splitting them. */ | |
3088 | while (1) | |
3089 | { | |
3090 | hash_set<stmt_vec_info>::iterator it = to_fixup.begin (); | |
3091 | if (!(it != to_fixup.end ())) | |
3092 | break; | |
3093 | stmt_vec_info grp = *it; | |
3094 | to_fixup.remove (grp); | |
3095 | ||
3096 | /* Find the earliest duplicate group member. */ | |
3097 | unsigned first_duplicate = -1u; | |
3098 | stmt_vec_info next, g = grp; | |
3099 | while ((next = DR_GROUP_NEXT_ELEMENT (g))) | |
3100 | { | |
6883ce83 | 3101 | if (tree_int_cst_equal (DR_INIT (STMT_VINFO_DR_INFO (next)->dr), |
3102 | DR_INIT (STMT_VINFO_DR_INFO (g)->dr)) | |
e0599ca4 | 3103 | && gimple_uid (STMT_VINFO_STMT (next)) < first_duplicate) |
3104 | first_duplicate = gimple_uid (STMT_VINFO_STMT (next)); | |
3105 | g = next; | |
3106 | } | |
3107 | if (first_duplicate == -1U) | |
3108 | continue; | |
3109 | ||
3110 | /* Then move all stmts after the first duplicate to a new group. | |
3111 | Note this is a heuristic, but one with the property that the | |
3112 | group GRP picked above is fixed up completely. */ | |
3113 | g = grp; | |
bbe43331 | 3114 | stmt_vec_info newgroup = NULL, ng = grp; |
e0599ca4 | 3115 | while ((next = DR_GROUP_NEXT_ELEMENT (g))) |
3116 | { | |
3117 | if (gimple_uid (STMT_VINFO_STMT (next)) >= first_duplicate) | |
3118 | { | |
3119 | DR_GROUP_NEXT_ELEMENT (g) = DR_GROUP_NEXT_ELEMENT (next); | |
3120 | if (!newgroup) | |
3121 | newgroup = next; | |
3122 | else | |
3123 | DR_GROUP_NEXT_ELEMENT (ng) = next; | |
3124 | ng = next; | |
3125 | DR_GROUP_FIRST_ELEMENT (ng) = newgroup; | |
3126 | } | |
3127 | else | |
3128 | g = DR_GROUP_NEXT_ELEMENT (g); | |
3129 | } | |
3130 | DR_GROUP_NEXT_ELEMENT (ng) = NULL; | |
3131 | ||
3132 | /* Fixup the new group which still may contain duplicates. */ | |
3133 | to_fixup.add (newgroup); | |
3134 | } | |
3135 | ||
863a3781 | 3136 | FOR_EACH_VEC_ELT (datarefs_copy, i, dr) |
abc9513d | 3137 | { |
db72d3bf | 3138 | dr_vec_info *dr_info = vinfo->lookup_dr (dr); |
abc9513d | 3139 | if (STMT_VINFO_VECTORIZABLE (dr_info->stmt) |
3140 | && !vect_analyze_data_ref_access (dr_info)) | |
3141 | { | |
3142 | if (dump_enabled_p ()) | |
3143 | dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location, | |
3144 | "not vectorized: complicated access pattern.\n"); | |
6ea6a380 | 3145 | |
abc9513d | 3146 | if (is_a <bb_vec_info> (vinfo)) |
3147 | { | |
3148 | /* Mark the statement as not vectorizable. */ | |
3149 | STMT_VINFO_VECTORIZABLE (dr_info->stmt) = false; | |
3150 | continue; | |
3151 | } | |
3152 | else | |
3153 | { | |
3154 | datarefs_copy.release (); | |
ed9370cc | 3155 | return opt_result::failure_at (dr_info->stmt->stmt, |
3156 | "not vectorized:" | |
3157 | " complicated access pattern.\n"); | |
abc9513d | 3158 | } |
3159 | } | |
3160 | } | |
fb85abff | 3161 | |
863a3781 | 3162 | datarefs_copy.release (); |
ed9370cc | 3163 | return opt_result::success (); |
fb85abff | 3164 | } |
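
The chain building above relies on the sort: compatible references sit next to each other, so one linear walk can link them. Below is a standalone model of the FIRST/NEXT linking using plain arrays; the indices stand in for stmt_vec_infos and the base/init values are invented, so this is a sketch of the data structure, not GCC code.

#include <stdio.h>

#define N 4

int
main (void)
{
  /* DR_BASE_ADDRESS and DR_INIT stand-ins, already sorted the way
     dr_group_sort_cmp would order them.  */
  long base[N] = { 0, 0, 0, 100 };
  long init[N] = { 0, 4, 8, 0 };
  int first[N], next[N];

  for (int i = 0; i < N; i++)
    {
      first[i] = -1;
      next[i] = -1;
    }

  /* One linear walk: compatible neighbours join the current chain,
     which is headed by its first member.  */
  for (int i = 0; i + 1 < N; i++)
    {
      if (base[i] != base[i + 1])
        continue;                /* Different object: no group.  */
      if (first[i] == -1)
        first[i] = i;            /* i heads a new chain.  */
      first[i + 1] = first[i];
      next[i] = i + 1;
    }

  /* drs 0..2 form one chain headed by 0; dr 3 stays ungrouped.  */
  for (int i = 0; i < N; i++)
    printf ("dr%d (init %ld): first=%d next=%d\n",
            i, init[i], first[i], next[i]);
  return 0;
}
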
3165 | ||
8a7b0f48 | 3166 | /* Function vect_vfa_segment_size. |
3167 | ||
8a7b0f48 | 3168 | Input: |
abc9513d | 3169 | DR_INFO: The data reference. |
8a7b0f48 | 3170 | LENGTH_FACTOR: segment length to consider. |
3171 | ||
e85b4a5e | 3172 | Return a value suitable for the dr_with_seg_len::seg_len field. |
3173 | This is the "distance travelled" by the pointer from the first | |
3174 | iteration in the segment to the last. Note that it does not include | |
3175 | the size of the access; in effect it only describes the first byte. */ | |
8a7b0f48 | 3176 | |
3177 | static tree | |
abc9513d | 3178 | vect_vfa_segment_size (dr_vec_info *dr_info, tree length_factor) |
8a7b0f48 | 3179 | { |
e85b4a5e | 3180 | length_factor = size_binop (MINUS_EXPR, |
3181 | fold_convert (sizetype, length_factor), | |
3182 | size_one_node); | |
abc9513d | 3183 | return size_binop (MULT_EXPR, fold_convert (sizetype, DR_STEP (dr_info->dr)), |
e85b4a5e | 3184 | length_factor); |
3185 | } | |
8a7b0f48 | 3186 | |
abc9513d | 3187 | /* Return a value that, when added to abs (vect_vfa_segment_size (DR_INFO)), |
e85b4a5e | 3188 | gives the worst-case number of bytes covered by the segment. */ |
8a7b0f48 | 3189 | |
e85b4a5e | 3190 | static unsigned HOST_WIDE_INT |
abc9513d | 3191 | vect_vfa_access_size (dr_vec_info *dr_info) |
e85b4a5e | 3192 | { |
abc9513d | 3193 | stmt_vec_info stmt_vinfo = dr_info->stmt; |
3194 | tree ref_type = TREE_TYPE (DR_REF (dr_info->dr)); | |
e85b4a5e | 3195 | unsigned HOST_WIDE_INT ref_size = tree_to_uhwi (TYPE_SIZE_UNIT (ref_type)); |
3196 | unsigned HOST_WIDE_INT access_size = ref_size; | |
e1009321 | 3197 | if (DR_GROUP_FIRST_ELEMENT (stmt_vinfo)) |
8a7b0f48 | 3198 | { |
abc9513d | 3199 | gcc_assert (DR_GROUP_FIRST_ELEMENT (stmt_vinfo) == stmt_vinfo); |
e1009321 | 3200 | access_size *= DR_GROUP_SIZE (stmt_vinfo) - DR_GROUP_GAP (stmt_vinfo); |
e85b4a5e | 3201 | } |
3202 | if (STMT_VINFO_VEC_STMT (stmt_vinfo) | |
abc9513d | 3203 | && (vect_supportable_dr_alignment (dr_info, false) |
e85b4a5e | 3204 | == dr_explicit_realign_optimized)) |
3205 | { | |
3206 | /* We might access a full vector's worth. */ | |
3207 | tree vectype = STMT_VINFO_VECTYPE (stmt_vinfo); | |
3208 | access_size += tree_to_uhwi (TYPE_SIZE_UNIT (vectype)) - ref_size; | |
8a7b0f48 | 3209 | } |
e85b4a5e | 3210 | return access_size; |
3211 | } | |
3212 | ||
abc9513d | 3213 | /* Get the minimum alignment for all the scalar accesses that DR_INFO |
3214 | describes. */ | |
e85b4a5e | 3215 | |
3216 | static unsigned int | |
abc9513d | 3217 | vect_vfa_align (dr_vec_info *dr_info) |
e85b4a5e | 3218 | { |
abc9513d | 3219 | return TYPE_ALIGN_UNIT (TREE_TYPE (DR_REF (dr_info->dr))); |
8a7b0f48 | 3220 | } |
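
Numerically, the three helpers above describe each reference as a segment: seg_len is the pointer travel across the segment, access_size the bytes touched at each position, and their combination bounds the bytes covered. A standalone sketch with invented values (int accesses with step 4 over an 8-iteration segment); it ignores the group-size and realign adjustments of vect_vfa_access_size.

#include <stdio.h>

int
main (void)
{
  long step = 4;                 /* DR_STEP in bytes.  */
  long length_factor = 8;        /* Scalar iterations in the segment.  */
  long ref_size = 4;             /* Scalar access size.  */

  long seg_len = step * (length_factor - 1); /* vect_vfa_segment_size.  */
  long access_size = ref_size;               /* vect_vfa_access_size.  */

  /* Worst-case number of bytes covered by the segment.  */
  long span = (seg_len < 0 ? -seg_len : seg_len) + access_size;
  printf ("seg_len=%ld access_size=%ld span=%ld\n",
          seg_len, access_size, span);       /* 28, 4, 32.  */
  return 0;
}
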
3221 | ||
a5af7a75 | 3222 | /* Function vect_compile_time_alias.
3223 | ||
63bc418d | 3224 | Given data references A and B with equal base and offset, see whether |
3225 | the alias relation can be decided at compilation time. Return 1 if | |
3226 | it can and the references alias, 0 if it can and the references do | |
e85b4a5e | 3227 | not alias, and -1 if we cannot decide at compile time. SEGMENT_LENGTH_A, |
3228 | SEGMENT_LENGTH_B, ACCESS_SIZE_A and ACCESS_SIZE_B are the equivalent | |
3229 | of dr_with_seg_len::{seg_len,access_size} for A and B. */ | |
a5af7a75 | 3230 | |
63bc418d | 3231 | static int |
abc9513d | 3232 | vect_compile_time_alias (dr_vec_info *a, dr_vec_info *b, |
e85b4a5e | 3233 | tree segment_length_a, tree segment_length_b, |
3234 | unsigned HOST_WIDE_INT access_size_a, | |
3235 | unsigned HOST_WIDE_INT access_size_b) | |
a5af7a75 | 3236 | { |
abc9513d | 3237 | poly_offset_int offset_a = wi::to_poly_offset (DR_INIT (a->dr)); |
3238 | poly_offset_int offset_b = wi::to_poly_offset (DR_INIT (b->dr)); | |
63bc418d | 3239 | poly_uint64 const_length_a; |
3240 | poly_uint64 const_length_b; | |
a5af7a75 | 3241 | |
a5af7a75 | 3242 | /* For negative step, we need to adjust address range by TYPE_SIZE_UNIT |
3243 | bytes, e.g., int a[3] -> a[1] range is [a+4, a+16) instead of | |
3244 | [a, a+12) */ | |
abc9513d | 3245 | if (tree_int_cst_compare (DR_STEP (a->dr), size_zero_node) < 0) |
a5af7a75 | 3246 | { |
63bc418d | 3247 | const_length_a = (-wi::to_poly_wide (segment_length_a)).force_uhwi (); |
e85b4a5e | 3248 | offset_a = (offset_a + access_size_a) - const_length_a; |
a5af7a75 | 3249 | } |
63bc418d | 3250 | else |
3251 | const_length_a = tree_to_poly_uint64 (segment_length_a); | |
abc9513d | 3252 | if (tree_int_cst_compare (DR_STEP (b->dr), size_zero_node) < 0) |
a5af7a75 | 3253 | { |
63bc418d | 3254 | const_length_b = (-wi::to_poly_wide (segment_length_b)).force_uhwi (); |
e85b4a5e | 3255 | offset_b = (offset_b + access_size_b) - const_length_b; |
a5af7a75 | 3256 | } |
63bc418d | 3257 | else |
3258 | const_length_b = tree_to_poly_uint64 (segment_length_b); | |
a5af7a75 | 3259 | |
e85b4a5e | 3260 | const_length_a += access_size_a; |
3261 | const_length_b += access_size_b; | |
3262 | ||
63bc418d | 3263 | if (ranges_known_overlap_p (offset_a, const_length_a, |
3264 | offset_b, const_length_b)) | |
3265 | return 1; | |
a5af7a75 | 3266 | |
63bc418d | 3267 | if (!ranges_maybe_overlap_p (offset_a, const_length_a, |
3268 | offset_b, const_length_b)) | |
3269 | return 0; | |
3270 | ||
3271 | return -1; | |
a5af7a75 | 3272 | } |
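
At its core, the compile-time test above reduces each reference to a byte range and asks whether the two ranges overlap. A standalone sketch using plain longs in place of poly_ints; with constant values the known-overlap and maybe-overlap tests coincide, so one half-open-interval check is enough. The ranges_overlap helper and the numbers are invented for illustration.

#include <stdio.h>

static int
ranges_overlap (long off_a, long len_a, long off_b, long len_b)
{
  /* Half-open intervals [off, off + len) overlap iff each one starts
     before the other ends.  */
  return off_a < off_b + len_b && off_b < off_a + len_a;
}

int
main (void)
{
  /* int accesses a[i] and a[i+4] over a 4-iteration segment: byte
     ranges [0,16) and [16,32) -> no alias.  */
  printf ("alias=%d\n", ranges_overlap (0, 16, 16, 16));
  /* a[i] and a[i+2]: [0,16) and [8,24) -> alias.  */
  printf ("alias=%d\n", ranges_overlap (0, 16, 8, 16));
  return 0;
}
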
3273 | ||
403965f7 | 3274 | /* Return true if the minimum nonzero dependence distance for loop LOOP_DEPTH |
3275 | in DDR is >= VF. */ | |
3276 | ||
3277 | static bool | |
3278 | dependence_distance_ge_vf (data_dependence_relation *ddr, | |
d75596cd | 3279 | unsigned int loop_depth, poly_uint64 vf) |
403965f7 | 3280 | { |
3281 | if (DDR_ARE_DEPENDENT (ddr) != NULL_TREE | |
3282 | || DDR_NUM_DIST_VECTS (ddr) == 0) | |
3283 | return false; | |
3284 | ||
3285 | /* If the dependence is exact, we should have limited the VF instead. */ | |
3286 | gcc_checking_assert (DDR_COULD_BE_INDEPENDENT_P (ddr)); | |
3287 | ||
3288 | unsigned int i; | |
3289 | lambda_vector dist_v; | |
3290 | FOR_EACH_VEC_ELT (DDR_DIST_VECTS (ddr), i, dist_v) | |
3291 | { | |
3292 | HOST_WIDE_INT dist = dist_v[loop_depth]; | |
3293 | if (dist != 0 | |
3294 | && !(dist > 0 && DDR_REVERSED_P (ddr)) | |
d75596cd | 3295 | && maybe_lt ((unsigned HOST_WIDE_INT) abs_hwi (dist), vf)) |
403965f7 | 3296 | return false; |
3297 | } | |
3298 | ||
3299 | if (dump_enabled_p ()) | |
a4e972e3 | 3300 | dump_printf_loc (MSG_NOTE, vect_location, |
3301 | "dependence distance between %T and %T is >= VF\n", | |
3302 | DR_REF (DDR_A (ddr)), DR_REF (DDR_B (ddr))); | |
403965f7 | 3303 | |
3304 | return true; | |
3305 | } | |
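
The test above lets a runtime alias check be dropped when every nonzero dependence distance at the given loop depth is at least VF, since one vector iteration then never touches dependent elements. A simplified standalone version (it deliberately omits the DDR_REVERSED_P special case, and the helper name is invented):

#include <stdio.h>
#include <stdlib.h>

static int
distance_ge_vf (const long *dists, int n, long vf)
{
  for (int i = 0; i < n; i++)
    if (dists[i] != 0 && labs (dists[i]) < vf)
      return 0;                 /* Distance too small: keep the check.  */
  return 1;                     /* All distances >= VF: drop the check.  */
}

int
main (void)
{
  long dists[] = { 0, 8, -8 };
  printf ("VF=4: %d\n", distance_ge_vf (dists, 3, 4));    /* 1.  */
  printf ("VF=16: %d\n", distance_ge_vf (dists, 3, 16));  /* 0.  */
  return 0;
}
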
3306 | ||
e85b4a5e | 3307 | /* Dump LOWER_BOUND using flags DUMP_KIND. Dumps are known to be enabled. */ |
3308 | ||
3309 | static void | |
54e7de93 | 3310 | dump_lower_bound (dump_flags_t dump_kind, const vec_lower_bound &lower_bound) |
e85b4a5e | 3311 | { |
a4e972e3 | 3312 | dump_printf (dump_kind, "%s (%T) >= ", |
3313 | lower_bound.unsigned_p ? "unsigned" : "abs", | |
3314 | lower_bound.expr); | |
e85b4a5e | 3315 | dump_dec (dump_kind, lower_bound.min_value); |
3316 | } | |
3317 | ||
3318 | /* Record that the vectorized loop requires the vec_lower_bound described | |
3319 | by EXPR, UNSIGNED_P and MIN_VALUE. */ | |
3320 | ||
3321 | static void | |
3322 | vect_check_lower_bound (loop_vec_info loop_vinfo, tree expr, bool unsigned_p, | |
3323 | poly_uint64 min_value) | |
3324 | { | |
3325 | vec<vec_lower_bound> lower_bounds = LOOP_VINFO_LOWER_BOUNDS (loop_vinfo); | |
3326 | for (unsigned int i = 0; i < lower_bounds.length (); ++i) | |
3327 | if (operand_equal_p (lower_bounds[i].expr, expr, 0)) | |
3328 | { | |
3329 | unsigned_p &= lower_bounds[i].unsigned_p; | |
3330 | min_value = upper_bound (lower_bounds[i].min_value, min_value); | |
3331 | if (lower_bounds[i].unsigned_p != unsigned_p | |
3332 | || maybe_lt (lower_bounds[i].min_value, min_value)) | |
3333 | { | |
3334 | lower_bounds[i].unsigned_p = unsigned_p; | |
3335 | lower_bounds[i].min_value = min_value; | |
3336 | if (dump_enabled_p ()) | |
3337 | { | |
3338 | dump_printf_loc (MSG_NOTE, vect_location, | |
3339 | "updating run-time check to "); | |
3340 | dump_lower_bound (MSG_NOTE, lower_bounds[i]); | |
3341 | dump_printf (MSG_NOTE, "\n"); | |
3342 | } | |
3343 | } | |
3344 | return; | |
3345 | } | |
3346 | ||
3347 | vec_lower_bound lower_bound (expr, unsigned_p, min_value); | |
3348 | if (dump_enabled_p ()) | |
3349 | { | |
3350 | dump_printf_loc (MSG_NOTE, vect_location, "need a run-time check that "); | |
3351 | dump_lower_bound (MSG_NOTE, lower_bound); | |
3352 | dump_printf (MSG_NOTE, "\n"); | |
3353 | } | |
3354 | LOOP_VINFO_LOWER_BOUNDS (loop_vinfo).safe_push (lower_bound); | |
3355 | } | |
3356 | ||
abc9513d | 3357 | /* Return true if it's unlikely that the step of the vectorized form of DR_INFO |
e85b4a5e | 3358 | will span fewer than GAP bytes. */ |
3359 | ||
3360 | static bool | |
abc9513d | 3361 | vect_small_gap_p (loop_vec_info loop_vinfo, dr_vec_info *dr_info, |
3362 | poly_int64 gap) | |
e85b4a5e | 3363 | { |
abc9513d | 3364 | stmt_vec_info stmt_info = dr_info->stmt; |
e85b4a5e | 3365 | HOST_WIDE_INT count |
3366 | = estimated_poly_value (LOOP_VINFO_VECT_FACTOR (loop_vinfo)); | |
e1009321 | 3367 | if (DR_GROUP_FIRST_ELEMENT (stmt_info)) |
cd24aa3c | 3368 | count *= DR_GROUP_SIZE (DR_GROUP_FIRST_ELEMENT (stmt_info)); |
abc9513d | 3369 | return (estimated_poly_value (gap) |
3370 | <= count * vect_get_scalar_dr_size (dr_info)); | |
e85b4a5e | 3371 | } |
3372 | ||
abc9513d | 3373 | /* Return true if we know that there is no alias between DR_INFO_A and |
3374 | DR_INFO_B when abs (DR_STEP (DR_INFO_A->dr)) >= N for some N. | |
3375 | When returning true, set *LOWER_BOUND_OUT to this N. */ | |
e85b4a5e | 3376 | |
3377 | static bool | |
abc9513d | 3378 | vectorizable_with_step_bound_p (dr_vec_info *dr_info_a, dr_vec_info *dr_info_b, |
e85b4a5e | 3379 | poly_uint64 *lower_bound_out) |
3380 | { | |
3381 | /* Check that there is a constant gap of known sign between DR_A | |
3382 | and DR_B. */ | |
abc9513d | 3383 | data_reference *dr_a = dr_info_a->dr; |
3384 | data_reference *dr_b = dr_info_b->dr; | |
e85b4a5e | 3385 | poly_int64 init_a, init_b; |
3386 | if (!operand_equal_p (DR_BASE_ADDRESS (dr_a), DR_BASE_ADDRESS (dr_b), 0) | |
3387 | || !operand_equal_p (DR_OFFSET (dr_a), DR_OFFSET (dr_b), 0) | |
3388 | || !operand_equal_p (DR_STEP (dr_a), DR_STEP (dr_b), 0) | |
3389 | || !poly_int_tree_p (DR_INIT (dr_a), &init_a) | |
3390 | || !poly_int_tree_p (DR_INIT (dr_b), &init_b) | |
3391 | || !ordered_p (init_a, init_b)) | |
3392 | return false; | |
3393 | ||
3394 | /* Sort DR_A and DR_B by the address they access. */ | |
3395 | if (maybe_lt (init_b, init_a)) | |
3396 | { | |
3397 | std::swap (init_a, init_b); | |
abc9513d | 3398 | std::swap (dr_info_a, dr_info_b); |
e85b4a5e | 3399 | std::swap (dr_a, dr_b); |
3400 | } | |
3401 | ||
3402 | /* If the two accesses could be dependent within a scalar iteration, | |
3403 | make sure that we'd retain their order. */ | |
abc9513d | 3404 | if (maybe_gt (init_a + vect_get_scalar_dr_size (dr_info_a), init_b) |
3405 | && !vect_preserves_scalar_order_p (dr_info_a, dr_info_b)) | |
e85b4a5e | 3406 | return false; |
3407 | ||
3408 | /* There is no alias if abs (DR_STEP) is greater than or equal to | |
3409 | the bytes spanned by the combination of the two accesses. */ | |
abc9513d | 3410 | *lower_bound_out = init_b + vect_get_scalar_dr_size (dr_info_b) - init_a; |
e85b4a5e | 3411 | return true; |
3412 | } | |
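
For a concrete feel of the lower bound computed above: two int accesses off the same base at byte offsets 0 and 4 cannot alias once abs (step) >= 8, the bytes spanned by the combination of the two accesses. The computation, as a standalone snippet with invented values:

#include <stdio.h>

int
main (void)
{
  long init_a = 0, init_b = 4;  /* DR_INITs, already sorted by address.  */
  long size_b = 4;              /* Scalar size of the later access.  */
  long lower_bound = init_b + size_b - init_a;
  printf ("no alias when abs(step) >= %ld\n", lower_bound);  /* 8.  */
  return 0;
}
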
3413 | ||
fb85abff | 3414 | /* Function vect_prune_runtime_alias_test_list. |
3415 | ||
3416 | Prune a list of ddrs to be tested at run-time by versioning for alias. | |
8a7b0f48 | 3417 | Merge several alias checks into one if possible. |
fb85abff | 3418 | Return FALSE if the resulting list of ddrs is longer than allowed by
3419 | PARAM_VECT_MAX_VERSION_FOR_ALIAS_CHECKS, otherwise return TRUE. */ | |
3420 | ||
ed9370cc | 3421 | opt_result |
fb85abff | 3422 | vect_prune_runtime_alias_test_list (loop_vec_info loop_vinfo) |
3423 | { | |
f68a7726 | 3424 | typedef pair_hash <tree_operand_hash, tree_operand_hash> tree_pair_hash; |
3425 | hash_set <tree_pair_hash> compared_objects; | |
3426 | ||
3427 | vec<ddr_p> may_alias_ddrs = LOOP_VINFO_MAY_ALIAS_DDRS (loop_vinfo); | |
3428 | vec<dr_with_seg_len_pair_t> &comp_alias_ddrs | |
3429 | = LOOP_VINFO_COMP_ALIAS_DDRS (loop_vinfo); | |
3430 | vec<vec_object_pair> &check_unequal_addrs | |
3431 | = LOOP_VINFO_CHECK_UNEQUAL_ADDRS (loop_vinfo); | |
d75596cd | 3432 | poly_uint64 vect_factor = LOOP_VINFO_VECT_FACTOR (loop_vinfo); |
8a7b0f48 | 3433 | tree scalar_loop_iters = LOOP_VINFO_NITERS (loop_vinfo); |
3434 | ||
3435 | ddr_p ddr; | |
3436 | unsigned int i; | |
3437 | tree length_factor; | |
fb85abff | 3438 | |
88f6eb8f | 3439 | DUMP_VECT_SCOPE ("vect_prune_runtime_alias_test_list"); |
fb85abff | 3440 | |
e85b4a5e | 3441 | /* Step values are irrelevant for aliasing if the number of vector |
3442 | iterations is equal to the number of scalar iterations (which can | |
3443 | happen for fully-SLP loops). */ | |
3444 | bool ignore_step_p = known_eq (LOOP_VINFO_VECT_FACTOR (loop_vinfo), 1U); | |
3445 | ||
3446 | if (!ignore_step_p) | |
3447 | { | |
3448 | /* Convert the checks for nonzero steps into bound tests. */ | |
3449 | tree value; | |
3450 | FOR_EACH_VEC_ELT (LOOP_VINFO_CHECK_NONZERO (loop_vinfo), i, value) | |
3451 | vect_check_lower_bound (loop_vinfo, value, true, 1); | |
3452 | } | |
3453 | ||
8a7b0f48 | 3454 | if (may_alias_ddrs.is_empty ()) |
ed9370cc | 3455 | return opt_result::success (); |
8a7b0f48 | 3456 | |
8a7b0f48 | 3457 | comp_alias_ddrs.create (may_alias_ddrs.length ()); |
3458 | ||
403965f7 | 3459 | unsigned int loop_depth |
3460 | = index_in_loop_nest (LOOP_VINFO_LOOP (loop_vinfo)->num, | |
3461 | LOOP_VINFO_LOOP_NEST (loop_vinfo)); | |
3462 | ||
8a7b0f48 | 3463 | /* First, we collect all data ref pairs for aliasing checks. */ |
3464 | FOR_EACH_VEC_ELT (may_alias_ddrs, i, ddr) | |
fb85abff | 3465 | { |
c1e75671 | 3466 | int comp_res; |
e85b4a5e | 3467 | poly_uint64 lower_bound; |
8a7b0f48 | 3468 | tree segment_length_a, segment_length_b; |
e85b4a5e | 3469 | unsigned HOST_WIDE_INT access_size_a, access_size_b; |
3470 | unsigned int align_a, align_b; | |
8a7b0f48 | 3471 | |
403965f7 | 3472 | /* Ignore the alias if the VF we chose ended up being no greater |
3473 | than the dependence distance. */ | |
3474 | if (dependence_distance_ge_vf (ddr, loop_depth, vect_factor)) | |
3475 | continue; | |
3476 | ||
f68a7726 | 3477 | if (DDR_OBJECT_A (ddr)) |
3478 | { | |
3479 | vec_object_pair new_pair (DDR_OBJECT_A (ddr), DDR_OBJECT_B (ddr)); | |
3480 | if (!compared_objects.add (new_pair)) | |
3481 | { | |
3482 | if (dump_enabled_p ()) | |
a4e972e3 | 3483 | dump_printf_loc (MSG_NOTE, vect_location, |
3484 | "checking that %T and %T" | |
3485 | " have different addresses\n", | |
3486 | new_pair.first, new_pair.second); | |
f68a7726 | 3487 | LOOP_VINFO_CHECK_UNEQUAL_ADDRS (loop_vinfo).safe_push (new_pair); |
3488 | } | |
3489 | continue; | |
3490 | } | |
3491 | ||
db72d3bf | 3492 | dr_vec_info *dr_info_a = loop_vinfo->lookup_dr (DDR_A (ddr)); |
abc9513d | 3493 | stmt_vec_info stmt_info_a = dr_info_a->stmt; |
e85b4a5e | 3494 | |
db72d3bf | 3495 | dr_vec_info *dr_info_b = loop_vinfo->lookup_dr (DDR_B (ddr)); |
abc9513d | 3496 | stmt_vec_info stmt_info_b = dr_info_b->stmt; |
e85b4a5e | 3497 | |
3498 | /* Skip the pair if inter-iteration dependencies are irrelevant | |
3499 | and intra-iteration dependencies are guaranteed to be honored. */ | |
3500 | if (ignore_step_p | |
abc9513d | 3501 | && (vect_preserves_scalar_order_p (dr_info_a, dr_info_b) |
3502 | || vectorizable_with_step_bound_p (dr_info_a, dr_info_b, | |
3503 | &lower_bound))) | |
e85b4a5e | 3504 | { |
3505 | if (dump_enabled_p ()) | |
a4e972e3 | 3506 | dump_printf_loc (MSG_NOTE, vect_location, |
3507 | "no need for alias check between " | |
3508 | "%T and %T when VF is 1\n", | |
3509 | DR_REF (dr_info_a->dr), DR_REF (dr_info_b->dr)); | |
e85b4a5e | 3510 | continue; |
3511 | } | |
3512 | ||
3513 | /* See whether we can handle the alias using a bounds check on | |
3514 | the step, and whether that's likely to be the best approach. | |
3515 | (It might not be, for example, if the minimum step is much larger | |
3516 | than the number of bytes handled by one vector iteration.) */ | |
3517 | if (!ignore_step_p | |
abc9513d | 3518 | && TREE_CODE (DR_STEP (dr_info_a->dr)) != INTEGER_CST |
3519 | && vectorizable_with_step_bound_p (dr_info_a, dr_info_b, | |
3520 | &lower_bound) | |
3521 | && (vect_small_gap_p (loop_vinfo, dr_info_a, lower_bound) | |
3522 | || vect_small_gap_p (loop_vinfo, dr_info_b, lower_bound))) | |
e85b4a5e | 3523 | { |
abc9513d | 3524 | bool unsigned_p = dr_known_forward_stride_p (dr_info_a->dr); |
e85b4a5e | 3525 | if (dump_enabled_p ()) |
3526 | { | |
a4e972e3 | 3527 | dump_printf_loc (MSG_NOTE, vect_location, "no alias between " |
3528 | "%T and %T when the step %T is outside ", | |
3529 | DR_REF (dr_info_a->dr), | |
3530 | DR_REF (dr_info_b->dr), | |
3531 | DR_STEP (dr_info_a->dr)); | |
e85b4a5e | 3532 | if (unsigned_p) |
3533 | dump_printf (MSG_NOTE, "[0"); | |
3534 | else | |
3535 | { | |
3536 | dump_printf (MSG_NOTE, "("); | |
3537 | dump_dec (MSG_NOTE, poly_int64 (-lower_bound)); | |
3538 | } | |
3539 | dump_printf (MSG_NOTE, ", "); | |
3540 | dump_dec (MSG_NOTE, lower_bound); | |
3541 | dump_printf (MSG_NOTE, ")\n"); | |
3542 | } | |
abc9513d | 3543 | vect_check_lower_bound (loop_vinfo, DR_STEP (dr_info_a->dr), |
3544 | unsigned_p, lower_bound); | |
e85b4a5e | 3545 | continue; |
3546 | } | |
3547 | ||
cd24aa3c | 3548 | stmt_vec_info dr_group_first_a = DR_GROUP_FIRST_ELEMENT (stmt_info_a); |
8a7b0f48 | 3549 | if (dr_group_first_a) |
3550 | { | |
cd24aa3c | 3551 | stmt_info_a = dr_group_first_a; |
abc9513d | 3552 | dr_info_a = STMT_VINFO_DR_INFO (stmt_info_a); |
8a7b0f48 | 3553 | } |
fb85abff | 3554 | |
cd24aa3c | 3555 | stmt_vec_info dr_group_first_b = DR_GROUP_FIRST_ELEMENT (stmt_info_b); |
8a7b0f48 | 3556 | if (dr_group_first_b) |
3557 | { | |
cd24aa3c | 3558 | stmt_info_b = dr_group_first_b; |
abc9513d | 3559 | dr_info_b = STMT_VINFO_DR_INFO (stmt_info_b); |
8a7b0f48 | 3560 | } |
fb85abff | 3561 | |
e85b4a5e | 3562 | if (ignore_step_p) |
3563 | { | |
3564 | segment_length_a = size_zero_node; | |
3565 | segment_length_b = size_zero_node; | |
3566 | } | |
8a7b0f48 | 3567 | else |
e85b4a5e | 3568 | { |
abc9513d | 3569 | if (!operand_equal_p (DR_STEP (dr_info_a->dr), |
3570 | DR_STEP (dr_info_b->dr), 0)) | |
e85b4a5e | 3571 | length_factor = scalar_loop_iters; |
3572 | else | |
3573 | length_factor = size_int (vect_factor); | |
abc9513d | 3574 | segment_length_a = vect_vfa_segment_size (dr_info_a, length_factor); |
3575 | segment_length_b = vect_vfa_segment_size (dr_info_b, length_factor); | |
e85b4a5e | 3576 | } |
abc9513d | 3577 | access_size_a = vect_vfa_access_size (dr_info_a); |
3578 | access_size_b = vect_vfa_access_size (dr_info_b); | |
3579 | align_a = vect_vfa_align (dr_info_a); | |
3580 | align_b = vect_vfa_align (dr_info_b); | |
8a7b0f48 | 3581 | |
abc9513d | 3582 | comp_res = data_ref_compare_tree (DR_BASE_ADDRESS (dr_info_a->dr), |
3583 | DR_BASE_ADDRESS (dr_info_b->dr)); | |
a5af7a75 | 3584 | if (comp_res == 0) |
abc9513d | 3585 | comp_res = data_ref_compare_tree (DR_OFFSET (dr_info_a->dr), |
3586 | DR_OFFSET (dr_info_b->dr)); | |
a5af7a75 | 3587 | |
63bc418d | 3588 | /* See whether the alias is known at compilation time. */ |
a5af7a75 | 3589 | if (comp_res == 0 |
abc9513d | 3590 | && TREE_CODE (DR_STEP (dr_info_a->dr)) == INTEGER_CST |
3591 | && TREE_CODE (DR_STEP (dr_info_b->dr)) == INTEGER_CST | |
63bc418d | 3592 | && poly_int_tree_p (segment_length_a) |
3593 | && poly_int_tree_p (segment_length_b)) | |
a5af7a75 | 3594 | { |
abc9513d | 3595 | int res = vect_compile_time_alias (dr_info_a, dr_info_b, |
63bc418d | 3596 | segment_length_a, |
e85b4a5e | 3597 | segment_length_b, |
3598 | access_size_a, | |
3599 | access_size_b); | |
3600 | if (res >= 0 && dump_enabled_p ()) | |
3601 | { | |
3602 | dump_printf_loc (MSG_NOTE, vect_location, | |
a4e972e3 | 3603 | "can tell at compile time that %T and %T", |
3604 | DR_REF (dr_info_a->dr), DR_REF (dr_info_b->dr)); | |
e85b4a5e | 3605 | if (res == 0) |
3606 | dump_printf (MSG_NOTE, " do not alias\n"); | |
3607 | else | |
3608 | dump_printf (MSG_NOTE, " alias\n"); | |
3609 | } | |
3610 | ||
63bc418d | 3611 | if (res == 0) |
a5af7a75 | 3612 | continue; |
3613 | ||
63bc418d | 3614 | if (res == 1) |
ed9370cc | 3615 | return opt_result::failure_at (stmt_info_b->stmt, |
3616 | "not vectorized:" | |
3617 | " compilation time alias: %G%G", | |
3618 | stmt_info_a->stmt, | |
3619 | stmt_info_b->stmt); | |
a5af7a75 | 3620 | } |
3621 | ||
43d14b66 | 3622 | dr_with_seg_len_pair_t dr_with_seg_len_pair |
abc9513d | 3623 | (dr_with_seg_len (dr_info_a->dr, segment_length_a, |
3624 | access_size_a, align_a), | |
3625 | dr_with_seg_len (dr_info_b->dr, segment_length_b, | |
3626 | access_size_b, align_b)); | |
43d14b66 | 3627 | |
c1e75671 | 3628 | /* Canonicalize pairs by sorting the two DR members. */ |
a5af7a75 | 3629 | if (comp_res > 0) |
3d4d7ad1 | 3630 | std::swap (dr_with_seg_len_pair.first, dr_with_seg_len_pair.second); |
8a7b0f48 | 3631 | |
3632 | comp_alias_ddrs.safe_push (dr_with_seg_len_pair); | |
3633 | } | |
3634 | ||
d75596cd | 3635 | prune_runtime_alias_test_list (&comp_alias_ddrs, vect_factor); |
f68a7726 | 3636 | |
3637 | unsigned int count = (comp_alias_ddrs.length () | |
3638 | + check_unequal_addrs.length ()); | |
e85b4a5e | 3639 | |
91f42adc | 3640 | if (dump_enabled_p ()) |
3641 | dump_printf_loc (MSG_NOTE, vect_location, | |
3642 | "improved number of alias checks from %d to %d\n", | |
3643 | may_alias_ddrs.length (), count); | |
f68a7726 | 3644 | if ((int) count > PARAM_VALUE (PARAM_VECT_MAX_VERSION_FOR_ALIAS_CHECKS)) |
ed9370cc | 3645 | return opt_result::failure_at |
3646 | (vect_location, | |
3647 | "number of versioning for alias " | |
3648 | "run-time tests exceeds %d " | |
3649 | "(--param vect-max-version-for-alias-checks)\n", | |
3650 | PARAM_VALUE (PARAM_VECT_MAX_VERSION_FOR_ALIAS_CHECKS)); | |
3651 | ||
3652 | return opt_result::success (); | |
fb85abff | 3653 | } |
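
Each pair that survives the pruning above ultimately becomes a runtime condition of the classic "the two segments do not overlap" shape, on which the loop is versioned. A standalone model of that final test; the segments_disjoint helper and the segment lengths are invented, and real checks operate on the computed seg_len/access_size values rather than fixed byte counts.

#include <stdio.h>

static int
segments_disjoint (const char *a, long seg_a,
                   const char *b, long seg_b)
{
  /* One segment must end at or before the other begins.  */
  return a + seg_a <= b || b + seg_b <= a;
}

int
main (void)
{
  char buf[64];
  /* Disjoint halves of one buffer: the vectorized version is safe.  */
  printf ("%d\n", segments_disjoint (buf, 32, buf + 32, 32));
  /* Overlapping segments: fall back to the scalar loop.  */
  printf ("%d\n", segments_disjoint (buf, 40, buf + 16, 32));
  return 0;
}
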
3654 | ||
1619606c | 3655 | /* Check whether we can use an internal function for a gather load |
3656 | or scatter store. READ_P is true for loads and false for stores. | |
3657 | MASKED_P is true if the load or store is conditional. MEMORY_TYPE is | |
3658 | the type of the memory elements being loaded or stored. OFFSET_BITS | |
3659 | is the number of bits in each scalar offset and OFFSET_SIGN is the | |
3660 | sign of the offset. SCALE is the amount by which the offset should | |
3661 | be multiplied *after* it has been converted to address width. | |
3662 | ||
3663 | Return true if the function is supported, storing the function | |
3664 | id in *IFN_OUT and the type of a vector element in *ELEMENT_TYPE_OUT. */ | |
3665 | ||
1d2c127d | 3666 | bool |
1619606c | 3667 | vect_gather_scatter_fn_p (bool read_p, bool masked_p, tree vectype, |
3668 | tree memory_type, unsigned int offset_bits, | |
3669 | signop offset_sign, int scale, | |
3670 | internal_fn *ifn_out, tree *element_type_out) | |
3671 | { | |
3672 | unsigned int memory_bits = tree_to_uhwi (TYPE_SIZE (memory_type)); | |
3673 | unsigned int element_bits = tree_to_uhwi (TYPE_SIZE (TREE_TYPE (vectype))); | |
3674 | if (offset_bits > element_bits) | |
3675 | /* Internal functions require the offset to be the same width as | |
3676 | the vector elements. We can extend narrower offsets, but it isn't | |
3677 | safe to truncate wider offsets. */ | |
3678 | return false; | |
3679 | ||
3680 | if (element_bits != memory_bits) | |
3681 | /* For now the vector elements must be the same width as the | |
3682 | memory elements. */ | |
3683 | return false; | |
3684 | ||
3685 | /* Work out which function we need. */ | |
3686 | internal_fn ifn; | |
3687 | if (read_p) | |
3688 | ifn = masked_p ? IFN_MASK_GATHER_LOAD : IFN_GATHER_LOAD; | |
3689 | else | |
0bf8b382 | 3690 | ifn = masked_p ? IFN_MASK_SCATTER_STORE : IFN_SCATTER_STORE; |
1619606c | 3691 | |
3692 | /* Test whether the target supports this combination. */ | |
3693 | if (!internal_gather_scatter_fn_supported_p (ifn, vectype, memory_type, | |
3694 | offset_sign, scale)) | |
3695 | return false; | |
3696 | ||
3697 | *ifn_out = ifn; | |
3698 | *element_type_out = TREE_TYPE (vectype); | |
3699 | return true; | |
3700 | } | |
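
The addressing form these internal functions implement is base + extend (offset) * scale, with the per-lane offset widened to address width before scaling; that widening is why narrower offsets can be extended but wider ones cannot safely be truncated. A scalar standalone model of a gather with that addressing (all names and values invented; one loop iteration plays the role of one vector lane):

#include <stdio.h>
#include <stdint.h>

int
main (void)
{
  int data[32];
  for (int i = 0; i < 32; i++)
    data[i] = i;

  char *base = (char *) data;
  int32_t offset[4] = { 0, 2, 4, 6 };  /* Per-lane scalar offsets.  */
  long scale = sizeof (int);

  int gathered[4];
  for (int lane = 0; lane < 4; lane++)
    /* Extend the offset to address width *before* multiplying.  */
    gathered[lane] = *(int *) (base + (int64_t) offset[lane] * scale);

  printf ("%d %d %d %d\n",                  /* Prints 0 2 4 6.  */
          gathered[0], gathered[1], gathered[2], gathered[3]);
  return 0;
}
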
3701 | ||
e068828a | 3702 | /* STMT_INFO is a call to an internal gather load or scatter store function. |
1619606c | 3703 | Describe the operation in INFO. */ |
3704 | ||
3705 | static void | |
e068828a | 3706 | vect_describe_gather_scatter_call (stmt_vec_info stmt_info, |
3707 | gather_scatter_info *info) | |
1619606c | 3708 | { |
e068828a | 3709 | gcall *call = as_a <gcall *> (stmt_info->stmt); |
1619606c | 3710 | tree vectype = STMT_VINFO_VECTYPE (stmt_info); |
3711 | data_reference *dr = STMT_VINFO_DATA_REF (stmt_info); | |
3712 | ||
3713 | info->ifn = gimple_call_internal_fn (call); | |
3714 | info->decl = NULL_TREE; | |
3715 | info->base = gimple_call_arg (call, 0); | |
3716 | info->offset = gimple_call_arg (call, 1); | |
3717 | info->offset_dt = vect_unknown_def_type; | |
3718 | info->offset_vectype = NULL_TREE; | |
3719 | info->scale = TREE_INT_CST_LOW (gimple_call_arg (call, 2)); | |
3720 | info->element_type = TREE_TYPE (vectype); | |
3721 | info->memory_type = TREE_TYPE (DR_REF (dr)); | |
3722 | } | |
3723 | ||
ecc42a77 | 3724 | /* Return true if a non-affine read or write in STMT_INFO is suitable for a |
cf60da07 | 3725 | gather load or scatter store. Describe the operation in *INFO if so. */ |
16dfb112 | 3726 | |
cf60da07 | 3727 | bool |
ecc42a77 | 3728 | vect_check_gather_scatter (stmt_vec_info stmt_info, loop_vec_info loop_vinfo, |
cf60da07 | 3729 | gather_scatter_info *info) |
16dfb112 | 3730 | { |
81bc0f0f | 3731 | HOST_WIDE_INT scale = 1; |
3732 | poly_int64 pbitpos, pbitsize; | |
2e966e2a | 3733 | class loop *loop = LOOP_VINFO_LOOP (loop_vinfo); |
16dfb112 | 3734 | struct data_reference *dr = STMT_VINFO_DATA_REF (stmt_info); |
3735 | tree offtype = NULL_TREE; | |
1619606c | 3736 | tree decl = NULL_TREE, base, off; |
3737 | tree vectype = STMT_VINFO_VECTYPE (stmt_info); | |
3738 | tree memory_type = TREE_TYPE (DR_REF (dr)); | |
3754d046 | 3739 | machine_mode pmode; |
292237f3 | 3740 | int punsignedp, reversep, pvolatilep = 0; |
1619606c | 3741 | internal_fn ifn; |
3742 | tree element_type; | |
3743 | bool masked_p = false; | |
3744 | ||
3745 | /* See whether this is already a call to a gather/scatter internal function. | |
3746 | If not, see whether it's a masked load or store. */ | |
a73182ff | 3747 | gcall *call = dyn_cast <gcall *> (stmt_info->stmt); |
1619606c | 3748 | if (call && gimple_call_internal_p (call)) |
3749 | { | |
5b4b7bcc | 3750 | ifn = gimple_call_internal_fn (call); |
1619606c | 3751 | if (internal_gather_scatter_fn_p (ifn)) |
3752 | { | |
e068828a | 3753 | vect_describe_gather_scatter_call (stmt_info, info); |
1619606c | 3754 | return true; |
3755 | } | |
3756 | masked_p = (ifn == IFN_MASK_LOAD || ifn == IFN_MASK_STORE); | |
3757 | } | |
3758 | ||
3759 | /* True if we should aim to use internal functions rather than | |
3760 | built-in functions. */ | |
3761 | bool use_ifn_p = (DR_IS_READ (dr) | |
0bf8b382 | 3762 | ? supports_vec_gather_load_p () |
3763 | : supports_vec_scatter_store_p ()); | |
16dfb112 | 3764 | |
c71d3c24 | 3765 | base = DR_REF (dr); |
3766 | /* For masked loads/stores, DR_REF (dr) is an artificial MEM_REF; | |
3767 | see if we can use the def stmt of the address. */ | |
1619606c | 3768 | if (masked_p |
c71d3c24 | 3769 | && TREE_CODE (base) == MEM_REF |
3770 | && TREE_CODE (TREE_OPERAND (base, 0)) == SSA_NAME | |
3771 | && integer_zerop (TREE_OPERAND (base, 1)) | |
3772 | && !expr_invariant_in_loop_p (loop, TREE_OPERAND (base, 0))) | |
3773 | { | |
42acab1c | 3774 | gimple *def_stmt = SSA_NAME_DEF_STMT (TREE_OPERAND (base, 0)); |
c71d3c24 | 3775 | if (is_gimple_assign (def_stmt) |
3776 | && gimple_assign_rhs_code (def_stmt) == ADDR_EXPR) | |
3777 | base = TREE_OPERAND (gimple_assign_rhs1 (def_stmt), 0); | |
3778 | } | |
3779 | ||
0bd6d857 | 3780 | /* The gather and scatter builtins need an address of the form
16dfb112 | 3781 | loop_invariant + vector * {1, 2, 4, 8} |
3782 | or | |
3783 | loop_invariant + sign_extend (vector) * { 1, 2, 4, 8 }. | |
3784 | Unfortunately DR_BASE_ADDRESS/DR_OFFSET can be a mixture | |
3785 | of loop invariants/SSA_NAMEs defined in the loop, with casts, | |
3786 | multiplications and additions in it. To get a vector, we need | |
3787 | a single SSA_NAME that will be defined in the loop and will | |
3788 | contain everything that is not loop invariant and that can be | |
3789 | vectorized. The following code attempts to find such a preexistng | |
3790 | SSA_NAME OFF and put the loop invariants into a tree BASE | |
3791 | that can be gimplified before the loop. */ | |
292237f3 | 3792 | base = get_inner_reference (base, &pbitsize, &pbitpos, &off, &pmode, |
b3b6e4b5 | 3793 | &punsignedp, &reversep, &pvolatilep); |
8a51585f | 3794 | if (reversep) |
3795 | return false; | |
3796 | ||
81bc0f0f | 3797 | poly_int64 pbytepos = exact_div (pbitpos, BITS_PER_UNIT); |
16dfb112 | 3798 | |
3799 | if (TREE_CODE (base) == MEM_REF) | |
3800 | { | |
3801 | if (!integer_zerop (TREE_OPERAND (base, 1))) | |
3802 | { | |
3803 | if (off == NULL_TREE) | |
90ca1268 | 3804 | off = wide_int_to_tree (sizetype, mem_ref_offset (base)); |
16dfb112 | 3805 | else |
3806 | off = size_binop (PLUS_EXPR, off, | |
3807 | fold_convert (sizetype, TREE_OPERAND (base, 1))); | |
3808 | } | |
3809 | base = TREE_OPERAND (base, 0); | |
3810 | } | |
3811 | else | |
3812 | base = build_fold_addr_expr (base); | |
3813 | ||
3814 | if (off == NULL_TREE) | |
3815 | off = size_zero_node; | |
3816 | ||
3817 | /* If base is not loop invariant, then if off is 0 we start with just | |
3818 | the constant offset in the loop invariant BASE and continue with base | |
3819 | as OFF; otherwise we give up. | |
3820 | We could handle that case by gimplifying the addition of base + off | |
3821 | into some SSA_NAME and use that as off, but for now punt. */ | |
3822 | if (!expr_invariant_in_loop_p (loop, base)) | |
3823 | { | |
3824 | if (!integer_zerop (off)) | |
cf60da07 | 3825 | return false; |
16dfb112 | 3826 | off = base; |
81bc0f0f | 3827 | base = size_int (pbytepos); |
16dfb112 | 3828 | } |
3829 | /* Otherwise put base + constant offset into the loop invariant BASE | |
3830 | and continue with OFF. */ | |
3831 | else | |
3832 | { | |
3833 | base = fold_convert (sizetype, base); | |
81bc0f0f | 3834 | base = size_binop (PLUS_EXPR, base, size_int (pbytepos)); |
16dfb112 | 3835 | } |
3836 | ||
3837 | /* OFF at this point may be either an SSA_NAME or some tree expression | |
3838 | from get_inner_reference. Try to peel off loop invariants from it | |
3839 | into BASE as long as possible. */ | |
3840 | STRIP_NOPS (off); | |
3841 | while (offtype == NULL_TREE) | |
3842 | { | |
3843 | enum tree_code code; | |
3844 | tree op0, op1, add = NULL_TREE; | |
3845 | ||
3846 | if (TREE_CODE (off) == SSA_NAME) | |
3847 | { | |
42acab1c | 3848 | gimple *def_stmt = SSA_NAME_DEF_STMT (off); |
16dfb112 | 3849 | |
3850 | if (expr_invariant_in_loop_p (loop, off)) | |
cf60da07 | 3851 | return false; |
16dfb112 | 3852 | |
3853 | if (gimple_code (def_stmt) != GIMPLE_ASSIGN) | |
3854 | break; | |
3855 | ||
3856 | op0 = gimple_assign_rhs1 (def_stmt); | |
3857 | code = gimple_assign_rhs_code (def_stmt); | |
3858 | op1 = gimple_assign_rhs2 (def_stmt); | |
3859 | } | |
3860 | else | |
3861 | { | |
3862 | if (get_gimple_rhs_class (TREE_CODE (off)) == GIMPLE_TERNARY_RHS) | |
cf60da07 | 3863 | return false; |
16dfb112 | 3864 | code = TREE_CODE (off); |
3865 | extract_ops_from_tree (off, &code, &op0, &op1); | |
3866 | } | |
3867 | switch (code) | |
3868 | { | |
3869 | case POINTER_PLUS_EXPR: | |
3870 | case PLUS_EXPR: | |
3871 | if (expr_invariant_in_loop_p (loop, op0)) | |
3872 | { | |
3873 | add = op0; | |
3874 | off = op1; | |
3875 | do_add: | |
3876 | add = fold_convert (sizetype, add); | |
3877 | if (scale != 1) | |
3878 | add = size_binop (MULT_EXPR, add, size_int (scale)); | |
3879 | base = size_binop (PLUS_EXPR, base, add); | |
3880 | continue; | |
3881 | } | |
3882 | if (expr_invariant_in_loop_p (loop, op1)) | |
3883 | { | |
3884 | add = op1; | |
3885 | off = op0; | |
3886 | goto do_add; | |
3887 | } | |
3888 | break; | |
3889 | case MINUS_EXPR: | |
3890 | if (expr_invariant_in_loop_p (loop, op1)) | |
3891 | { | |
3892 | add = fold_convert (sizetype, op1); | |
3893 | add = size_binop (MINUS_EXPR, size_zero_node, add); | |
3894 | off = op0; | |
3895 | goto do_add; | |
3896 | } | |
3897 | break; | |
3898 | case MULT_EXPR: | |
e913b5cd | 3899 | if (scale == 1 && tree_fits_shwi_p (op1)) |
16dfb112 | 3900 | { |
1619606c | 3901 | int new_scale = tree_to_shwi (op1); |
3902 | /* Only treat this as a scaling operation if the target | |
3903 | supports it. */ | |
3904 | if (use_ifn_p | |
3905 | && !vect_gather_scatter_fn_p (DR_IS_READ (dr), masked_p, | |
3906 | vectype, memory_type, 1, | |
3907 | TYPE_SIGN (TREE_TYPE (op0)), | |
3908 | new_scale, &ifn, | |
3909 | &element_type)) | |
3910 | break; | |
3911 | scale = new_scale; | |
16dfb112 | 3912 | off = op0; |
3913 | continue; | |
3914 | } | |
3915 | break; | |
3916 | case SSA_NAME: | |
3917 | off = op0; | |
3918 | continue; | |
3919 | CASE_CONVERT: | |
3920 | if (!POINTER_TYPE_P (TREE_TYPE (op0)) | |
3921 | && !INTEGRAL_TYPE_P (TREE_TYPE (op0))) | |
3922 | break; | |
3923 | if (TYPE_PRECISION (TREE_TYPE (op0)) | |
3924 | == TYPE_PRECISION (TREE_TYPE (off))) | |
3925 | { | |
3926 | off = op0; | |
3927 | continue; | |
3928 | } | |
1619606c | 3929 | |
3930 | /* The internal functions need the offset to be the same width | |
3931 | as the elements of VECTYPE. Don't include operations that | |
3932 | cast the offset from that width to a different width. */ | |
3933 | if (use_ifn_p | |
3934 | && (int_size_in_bytes (TREE_TYPE (vectype)) | |
3935 | == int_size_in_bytes (TREE_TYPE (off)))) | |
3936 | break; | |
3937 | ||
16dfb112 | 3938 | if (TYPE_PRECISION (TREE_TYPE (op0)) |
3939 | < TYPE_PRECISION (TREE_TYPE (off))) | |
3940 | { | |
3941 | off = op0; | |
3942 | offtype = TREE_TYPE (off); | |
3943 | STRIP_NOPS (off); | |
3944 | continue; | |
3945 | } | |
3946 | break; | |
3947 | default: | |
3948 | break; | |
3949 | } | |
3950 | break; | |
3951 | } | |
3952 | ||
3953 | /* If at the end OFF still isn't an SSA_NAME or isn't | |
3954 | defined in the loop, punt. */ | |
3955 | if (TREE_CODE (off) != SSA_NAME | |
3956 | || expr_invariant_in_loop_p (loop, off)) | |
cf60da07 | 3957 | return false; |
16dfb112 | 3958 | |
3959 | if (offtype == NULL_TREE) | |
3960 | offtype = TREE_TYPE (off); | |
3961 | ||
1619606c | 3962 | if (use_ifn_p) |
3963 | { | |
3964 | if (!vect_gather_scatter_fn_p (DR_IS_READ (dr), masked_p, vectype, | |
3965 | memory_type, TYPE_PRECISION (offtype), | |
3966 | TYPE_SIGN (offtype), scale, &ifn, | |
3967 | &element_type)) | |
3968 | return false; | |
3969 | } | |
0bd6d857 | 3970 | else |
1619606c | 3971 | { |
3972 | if (DR_IS_READ (dr)) | |
1f9a3b5c | 3973 | { |
3974 | if (targetm.vectorize.builtin_gather) | |
3975 | decl = targetm.vectorize.builtin_gather (vectype, offtype, scale); | |
3976 | } | |
1619606c | 3977 | else |
1f9a3b5c | 3978 | { |
3979 | if (targetm.vectorize.builtin_scatter) | |
3980 | decl = targetm.vectorize.builtin_scatter (vectype, offtype, scale); | |
3981 | } | |
0bd6d857 | 3982 | |
1619606c | 3983 | if (!decl) |
3984 | return false; | |
3985 | ||
3986 | ifn = IFN_LAST; | |
3987 | element_type = TREE_TYPE (vectype); | |
3988 | } | |
cf60da07 | 3989 | |
1619606c | 3990 | info->ifn = ifn; |
cf60da07 | 3991 | info->decl = decl; |
3992 | info->base = base; | |
3993 | info->offset = off; | |
3994 | info->offset_dt = vect_unknown_def_type; | |
3995 | info->offset_vectype = NULL_TREE; | |
3996 | info->scale = scale; | |
1619606c | 3997 | info->element_type = element_type; |
3998 | info->memory_type = memory_type; | |
cf60da07 | 3999 | return true; |
16dfb112 | 4000 | } |
4001 | ||
ed9d8730 | 4002 | /* Find the data references in STMT, analyze them with respect to LOOP and |
4003 | append them to DATAREFS. Return false if datarefs in this stmt cannot | |
4004 | be handled. */ | |
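| /* Illustrative examples of stmts rejected below: an aggregate copy | |
| such as "*p_1 = *q_2" produces two data references and fails with | |
| "more than one data ref in stmt", and a volatile access fails the | |
| gimple_has_volatile_ops check up front. */ | |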
4005 | ||
ed9370cc | 4006 | opt_result |
ed9d8730 | 4007 | vect_find_stmt_data_reference (loop_p loop, gimple *stmt, |
4008 | vec<data_reference_p> *datarefs) | |
4009 | { | |
4010 | /* We can ignore clobbers for dataref analysis - they are removed during | |
4011 | loop vectorization and BB vectorization checks dependences with a | |
4012 | stmt walk. */ | |
4013 | if (gimple_clobber_p (stmt)) | |
ed9370cc | 4014 | return opt_result::success (); |
ed9d8730 | 4015 | |
4016 | if (gimple_has_volatile_ops (stmt)) | |
ed9370cc | 4017 | return opt_result::failure_at (stmt, "not vectorized: volatile type: %G", |
4018 | stmt); | |
ed9d8730 | 4019 | |
aac19106 | 4020 | if (stmt_can_throw_internal (cfun, stmt)) |
ed9370cc | 4021 | return opt_result::failure_at (stmt, |
4022 | "not vectorized:" | |
4023 | " statement can throw an exception: %G", | |
4024 | stmt); | |
ed9d8730 | 4025 | |
4026 | auto_vec<data_reference_p, 2> refs; | |
ed9370cc | 4027 | opt_result res = find_data_references_in_stmt (loop, stmt, &refs); |
4028 | if (!res) | |
4029 | return res; | |
ed9d8730 | 4030 | |
4031 | if (refs.is_empty ()) | |
ed9370cc | 4032 | return opt_result::success (); |
ed9d8730 | 4033 | |
4034 | if (refs.length () > 1) | |
ed9370cc | 4035 | return opt_result::failure_at (stmt, |
4036 | "not vectorized:" | |
4037 | " more than one data ref in stmt: %G", stmt); | |
ed9d8730 | 4038 | |
4039 | if (gcall *call = dyn_cast <gcall *> (stmt)) | |
4040 | if (!gimple_call_internal_p (call) | |
4041 | || (gimple_call_internal_fn (call) != IFN_MASK_LOAD | |
4042 | && gimple_call_internal_fn (call) != IFN_MASK_STORE)) | |
ed9370cc | 4043 | return opt_result::failure_at (stmt, |
4044 | "not vectorized: dr in a call %G", stmt); | |
ed9d8730 | 4045 | |
4046 | data_reference_p dr = refs.pop (); | |
4047 | if (TREE_CODE (DR_REF (dr)) == COMPONENT_REF | |
4048 | && DECL_BIT_FIELD (TREE_OPERAND (DR_REF (dr), 1))) | |
ed9370cc | 4049 | return opt_result::failure_at (stmt, |
4050 | "not vectorized:" | |
4051 | " statement is bitfield access %G", stmt); | |
ed9d8730 | 4052 | |
4053 | if (DR_BASE_ADDRESS (dr) | |
4054 | && TREE_CODE (DR_BASE_ADDRESS (dr)) == INTEGER_CST) | |
ed9370cc | 4055 | return opt_result::failure_at (stmt, |
4056 | "not vectorized:" | |
4057 | " base addr of dr is a constant\n"); | |
ed9d8730 | 4058 | |
369a4f17 | 4059 | /* Check whether this may be a SIMD lane access and adjust the |
4060 | DR to make it easier for us to handle it. */ | |
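| /* A sketch of the shape being matched (illustrative SSA names): in a | |
| "#pragma omp simd" loop the access typically looks like | |
| _2 = GOMP_SIMD_LANE (simduid.0_1, 0); | |
| ... = a[_2]; | |
| whose DR initially has no analyzable base/offset/step; the code below | |
| rewrites it to a zero offset with a step equal to the element size | |
| and marks it as a simd-lane access. */ | |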
4061 | if (loop | |
4062 | && loop->simduid | |
4063 | && (!DR_BASE_ADDRESS (dr) | |
4064 | || !DR_OFFSET (dr) | |
4065 | || !DR_INIT (dr) | |
4066 | || !DR_STEP (dr))) | |
4067 | { | |
4068 | struct data_reference *newdr | |
4069 | = create_data_ref (NULL, loop_containing_stmt (stmt), DR_REF (dr), stmt, | |
4070 | DR_IS_READ (dr), DR_IS_CONDITIONAL_IN_STMT (dr)); | |
4071 | if (DR_BASE_ADDRESS (newdr) | |
4072 | && DR_OFFSET (newdr) | |
4073 | && DR_INIT (newdr) | |
4074 | && DR_STEP (newdr) | |
1da67136 | 4075 | && TREE_CODE (DR_INIT (newdr)) == INTEGER_CST |
369a4f17 | 4076 | && integer_zerop (DR_STEP (newdr))) |
4077 | { | |
1da67136 | 4078 | tree base_address = DR_BASE_ADDRESS (newdr); |
369a4f17 | 4079 | tree off = DR_OFFSET (newdr); |
4738cd0d | 4080 | tree step = ssize_int (1); |
1da67136 | 4081 | if (integer_zerop (off) |
4082 | && TREE_CODE (base_address) == POINTER_PLUS_EXPR) | |
4083 | { | |
4084 | off = TREE_OPERAND (base_address, 1); | |
4085 | base_address = TREE_OPERAND (base_address, 0); | |
4086 | } | |
369a4f17 | 4087 | STRIP_NOPS (off); |
4738cd0d | 4088 | if (TREE_CODE (off) == MULT_EXPR |
369a4f17 | 4089 | && tree_fits_uhwi_p (TREE_OPERAND (off, 1))) |
4090 | { | |
4738cd0d | 4091 | step = TREE_OPERAND (off, 1); |
369a4f17 | 4092 | off = TREE_OPERAND (off, 0); |
4093 | STRIP_NOPS (off); | |
4738cd0d | 4094 | } |
1da67136 | 4095 | if (CONVERT_EXPR_P (off) |
4096 | && (TYPE_PRECISION (TREE_TYPE (TREE_OPERAND (off, 0))) | |
4097 | < TYPE_PRECISION (TREE_TYPE (off)))) | |
4098 | off = TREE_OPERAND (off, 0); | |
4099 | if (TREE_CODE (off) == SSA_NAME) | |
4738cd0d | 4100 | { |
1da67136 | 4101 | gimple *def = SSA_NAME_DEF_STMT (off); |
4102 | /* Look through widening conversion. */ | |
4103 | if (is_gimple_assign (def) | |
4104 | && CONVERT_EXPR_CODE_P (gimple_assign_rhs_code (def))) | |
4105 | { | |
4106 | tree rhs1 = gimple_assign_rhs1 (def); | |
4107 | if (TREE_CODE (rhs1) == SSA_NAME | |
4108 | && INTEGRAL_TYPE_P (TREE_TYPE (rhs1)) | |
4109 | && (TYPE_PRECISION (TREE_TYPE (off)) | |
4110 | > TYPE_PRECISION (TREE_TYPE (rhs1)))) | |
4111 | def = SSA_NAME_DEF_STMT (rhs1); | |
4112 | } | |
4113 | if (is_gimple_call (def) | |
4114 | && gimple_call_internal_p (def) | |
4115 | && (gimple_call_internal_fn (def) == IFN_GOMP_SIMD_LANE)) | |
369a4f17 | 4116 | { |
1da67136 | 4117 | tree arg = gimple_call_arg (def, 0); |
369a4f17 | 4118 | tree reft = TREE_TYPE (DR_REF (newdr)); |
1da67136 | 4119 | gcc_assert (TREE_CODE (arg) == SSA_NAME); |
4120 | arg = SSA_NAME_VAR (arg); | |
4121 | if (arg == loop->simduid | |
4122 | /* For now. */ | |
4123 | && tree_int_cst_equal (TYPE_SIZE_UNIT (reft), step)) | |
369a4f17 | 4124 | { |
1da67136 | 4125 | DR_BASE_ADDRESS (newdr) = base_address; |
4126 | DR_OFFSET (newdr) = ssize_int (0); | |
4127 | DR_STEP (newdr) = step; | |
4128 | DR_OFFSET_ALIGNMENT (newdr) = BIGGEST_ALIGNMENT; | |
4129 | DR_STEP_ALIGNMENT (newdr) = highest_pow2_factor (step); | |
4130 | /* Mark as simd-lane access. */ | |
4131 | tree arg2 = gimple_call_arg (def, 1); | |
4132 | newdr->aux = (void *) (-1 - tree_to_uhwi (arg2)); | |
4133 | free_data_ref (dr); | |
4134 | datarefs->safe_push (newdr); | |
4135 | return opt_result::success (); | |
369a4f17 | 4136 | } |
4137 | } | |
4138 | } | |
4139 | } | |
4140 | free_data_ref (newdr); | |
4141 | } | |
4142 | ||
ed9d8730 | 4143 | datarefs->safe_push (dr); |
ed9370cc | 4144 | return opt_result::success (); |
ed9d8730 | 4145 | } |
4146 | ||
fb85abff | 4147 | /* Function vect_analyze_data_refs. |
4148 | ||
37545e54 | 4149 | Find all the data references in the loop or basic block. |
fb85abff | 4150 | |
4151 | The general structure of the analysis of data refs in the vectorizer is as | |
4152 | follows: | |
48e1416a | 4153 | 1- vect_analyze_data_refs(loop/bb): call |
37545e54 | 4154 | compute_data_dependences_for_loop/bb to find and analyze all data-refs |
4155 | in the loop/bb and their dependences. | |
fb85abff | 4156 | 2- vect_analyze_dependences(): apply dependence testing using ddrs. |
4157 | 3- vect_analyze_drs_alignment(): check that ref_stmt.alignment is ok. | |
4158 | 4- vect_analyze_drs_access(): check that ref_stmt.step is ok. | |
4159 | ||
4160 | */ | |
4161 | ||
ed9370cc | 4162 | opt_result |
2403338f | 4163 | vect_analyze_data_refs (vec_info *vinfo, poly_uint64 *min_vf, bool *fatal) |
fb85abff | 4164 | { |
2e966e2a | 4165 | class loop *loop = NULL; |
fb85abff | 4166 | unsigned int i; |
fb85abff | 4167 | struct data_reference *dr; |
4168 | tree scalar_type; | |
4169 | ||
88f6eb8f | 4170 | DUMP_VECT_SCOPE ("vect_analyze_data_refs"); |
48e1416a | 4171 | |
e2c5c678 | 4172 | if (loop_vec_info loop_vinfo = dyn_cast <loop_vec_info> (vinfo)) |
0a08c1bc | 4173 | loop = LOOP_VINFO_LOOP (loop_vinfo); |
fb85abff | 4174 | |
282bf14c | 4175 | /* Go through the data-refs, check that the analysis succeeded. Update |
4176 | pointer from stmt_vec_info struct to DR and vectype. */ | |
fb85abff | 4177 | |
a99aba41 | 4178 | vec<data_reference_p> datarefs = vinfo->shared->datarefs; |
f1f41a6c | 4179 | FOR_EACH_VEC_ELT (datarefs, i, dr) |
fb85abff | 4180 | { |
0bd6d857 | 4181 | enum { SG_NONE, GATHER, SCATTER } gatherscatter = SG_NONE; |
d75596cd | 4182 | poly_uint64 vf; |
48e1416a | 4183 | |
ed9d8730 | 4184 | gcc_assert (DR_REF (dr)); |
5f02ee72 | 4185 | stmt_vec_info stmt_info = vinfo->lookup_stmt (DR_STMT (dr)); |
4186 | gcc_assert (!stmt_info->dr_aux.dr); | |
4187 | stmt_info->dr_aux.dr = dr; | |
4188 | stmt_info->dr_aux.stmt = stmt_info; | |
fb85abff | 4189 | |
4190 | /* Check that analysis of the data-ref succeeded. */ | |
4191 | if (!DR_BASE_ADDRESS (dr) || !DR_OFFSET (dr) || !DR_INIT (dr) | |
16dfb112 | 4192 | || !DR_STEP (dr)) |
fb85abff | 4193 | { |
3d483a94 | 4194 | bool maybe_gather |
4195 | = DR_IS_READ (dr) | |
16dfb112 | 4196 | && !TREE_THIS_VOLATILE (DR_REF (dr)) |
1619606c | 4197 | && (targetm.vectorize.builtin_gather != NULL |
4198 | || supports_vec_gather_load_p ()); | |
0bd6d857 | 4199 | bool maybe_scatter |
4200 | = DR_IS_WRITE (dr) | |
4201 | && !TREE_THIS_VOLATILE (DR_REF (dr)) | |
0bf8b382 | 4202 | && (targetm.vectorize.builtin_scatter != NULL |
4203 | || supports_vec_scatter_store_p ()); | |
3d483a94 | 4204 | |
369a4f17 | 4205 | /* If target supports vector gather loads or scatter stores, |
4206 | see if they can be used. */ | |
e2c5c678 | 4207 | if (is_a <loop_vec_info> (vinfo) |
0219dc42 | 4208 | && !nested_in_vect_loop_p (loop, stmt_info)) |
16dfb112 | 4209 | { |
369a4f17 | 4210 | if (maybe_gather || maybe_scatter) |
fa681b45 | 4211 | { |
4212 | if (maybe_gather) | |
4213 | gatherscatter = GATHER; | |
4214 | else | |
4215 | gatherscatter = SCATTER; | |
16dfb112 | 4216 | } |
16dfb112 | 4217 | } |
6ea6a380 | 4218 | |
369a4f17 | 4219 | if (gatherscatter == SG_NONE) |
16dfb112 | 4220 | { |
6d8fb6cf | 4221 | if (dump_enabled_p ()) |
a4e972e3 | 4222 | dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location, |
4223 | "not vectorized: data ref analysis " | |
4224 | "failed %G", stmt_info->stmt); | |
e2c5c678 | 4225 | if (is_a <bb_vec_info> (vinfo)) |
58cfef6b | 4226 | { |
4227 | /* In BB vectorization the ref can still participate | |
4228 | in dependence analysis; we just can't vectorize it. */ | |
4229 | STMT_VINFO_VECTORIZABLE (stmt_info) = false; | |
4230 | continue; | |
4231 | } | |
ed9370cc | 4232 | return opt_result::failure_at (stmt_info->stmt, |
4233 | "not vectorized:" | |
4234 | " data ref analysis failed: %G", | |
4235 | stmt_info->stmt); | |
16dfb112 | 4236 | } |
fb85abff | 4237 | } |
4238 | ||
369a4f17 | 4239 | /* See if this was detected as a SIMD lane access. */ |
da008d72 | 4240 | if (dr->aux == (void *)-1 |
4241 | || dr->aux == (void *)-2 | |
b05c7e43 | 4242 | || dr->aux == (void *)-3 |
4243 | || dr->aux == (void *)-4) | |
369a4f17 | 4244 | { |
0219dc42 | 4245 | if (nested_in_vect_loop_p (loop, stmt_info)) |
ed9370cc | 4246 | return opt_result::failure_at (stmt_info->stmt, |
4247 | "not vectorized:" | |
4248 | " data ref analysis failed: %G", | |
4249 | stmt_info->stmt); | |
da008d72 | 4250 | STMT_VINFO_SIMD_LANE_ACCESS_P (stmt_info) |
4251 | = -(uintptr_t) dr->aux; | |
369a4f17 | 4252 | } |
4253 | ||
fa681b45 | 4254 | tree base = get_base_address (DR_REF (dr)); |
4255 | if (base && VAR_P (base) && DECL_NONALIASED (base)) | |
87c952b8 | 4256 | { |
6d8fb6cf | 4257 | if (dump_enabled_p ()) |
a4e972e3 | 4258 | dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location, |
4259 | "not vectorized: base object not addressable " | |
4260 | "for stmt: %G", stmt_info->stmt); | |
e2c5c678 | 4261 | if (is_a <bb_vec_info> (vinfo)) |
ed9d8730 | 4262 | { |
4263 | /* In BB vectorization the ref can still participate | |
4264 | in dependence analysis; we just can't vectorize it. */ | |
4265 | STMT_VINFO_VECTORIZABLE (stmt_info) = false; | |
4266 | continue; | |
4267 | } | |
ed9370cc | 4268 | return opt_result::failure_at (stmt_info->stmt, |
4269 | "not vectorized: base object not" | |
4270 | " addressable for stmt: %G", | |
4271 | stmt_info->stmt); | |
87c952b8 | 4272 | } |
4273 | ||
ed9d8730 | 4274 | if (is_a <loop_vec_info> (vinfo) |
fa681b45 | 4275 | && DR_STEP (dr) |
ed9d8730 | 4276 | && TREE_CODE (DR_STEP (dr)) != INTEGER_CST) |
635bf3aa | 4277 | { |
0219dc42 | 4278 | if (nested_in_vect_loop_p (loop, stmt_info)) |
ed9370cc | 4279 | return opt_result::failure_at (stmt_info->stmt, |
4280 | "not vectorized:" | |
4281 | "not suitable for strided load %G", | |
4282 | stmt_info->stmt); | |
ed9d8730 | 4283 | STMT_VINFO_STRIDED_P (stmt_info) = true; |
635bf3aa | 4284 | } |
4285 | ||
fb85abff | 4286 | /* Update DR field in stmt_vec_info struct. */ |
fb85abff | 4287 | |
4288 | /* If the dataref is in an inner-loop of the loop that is considered for | |
4289 | vectorization, we also want to analyze the access relative to | |
48e1416a | 4290 | the outer-loop (DR contains information only relative to the |
fb85abff | 4291 | inner-most enclosing loop). We do that by building a reference to the |
4292 | first location accessed by the inner-loop, and analyze it relative to | |
48e1416a | 4293 | the outer-loop. */ |
0219dc42 | 4294 | if (loop && nested_in_vect_loop_p (loop, stmt_info)) |
fb85abff | 4295 | { |
48e1416a | 4296 | /* Build a reference to the first location accessed by the |
a5456a6d | 4297 | inner loop: *(BASE + INIT + OFFSET). By construction, |
4298 | this address must be invariant in the inner loop, so we | |
4299 | can consider it as being used in the outer loop. */ | |
ed9d8730 | 4300 | tree base = unshare_expr (DR_BASE_ADDRESS (dr)); |
4301 | tree offset = unshare_expr (DR_OFFSET (dr)); | |
4302 | tree init = unshare_expr (DR_INIT (dr)); | |
a5456a6d | 4303 | tree init_offset = fold_build2 (PLUS_EXPR, TREE_TYPE (offset), |
4304 | init, offset); | |
4305 | tree init_addr = fold_build_pointer_plus (base, init_offset); | |
4306 | tree init_ref = build_fold_indirect_ref (init_addr); | |
fb85abff | 4307 | |
6d8fb6cf | 4308 | if (dump_enabled_p ()) |
a4e972e3 | 4309 | dump_printf_loc (MSG_NOTE, vect_location, |
4310 | "analyze in outer loop: %T\n", init_ref); | |
fb85abff | 4311 | |
ed9370cc | 4312 | opt_result res |
4313 | = dr_analyze_innermost (&STMT_VINFO_DR_WRT_VEC_LOOP (stmt_info), | |
4314 | init_ref, loop, stmt_info->stmt); | |
4315 | if (!res) | |
a5456a6d | 4316 | /* dr_analyze_innermost already explained the failure. */ |
ed9370cc | 4317 | return res; |
fb85abff | 4318 | |
6d8fb6cf | 4319 | if (dump_enabled_p ()) |
a4e972e3 | 4320 | dump_printf_loc (MSG_NOTE, vect_location, |
4321 | "\touter base_address: %T\n" | |
4322 | "\touter offset from base address: %T\n" | |
4323 | "\touter constant offset from base address: %T\n" | |
4324 | "\touter step: %T\n" | |
4325 | "\touter base alignment: %d\n\n" | |
4326 | "\touter base misalignment: %d\n" | |
4327 | "\touter offset alignment: %d\n" | |
4328 | "\touter step alignment: %d\n", | |
4329 | STMT_VINFO_DR_BASE_ADDRESS (stmt_info), | |
4330 | STMT_VINFO_DR_OFFSET (stmt_info), | |
4331 | STMT_VINFO_DR_INIT (stmt_info), | |
4332 | STMT_VINFO_DR_STEP (stmt_info), | |
4333 | STMT_VINFO_DR_BASE_ALIGNMENT (stmt_info), | |
4334 | STMT_VINFO_DR_BASE_MISALIGNMENT (stmt_info), | |
4335 | STMT_VINFO_DR_OFFSET_ALIGNMENT (stmt_info), | |
4336 | STMT_VINFO_DR_STEP_ALIGNMENT (stmt_info)); | |
fb85abff | 4337 | } |
4338 | ||
fb85abff | 4339 | /* Set vectype for STMT. */ |
4340 | scalar_type = TREE_TYPE (DR_REF (dr)); | |
53c3c39b | 4341 | STMT_VINFO_VECTYPE (stmt_info) |
4342 | = get_vectype_for_scalar_type (scalar_type); | |
48e1416a | 4343 | if (!STMT_VINFO_VECTYPE (stmt_info)) |
fb85abff | 4344 | { |
6d8fb6cf | 4345 | if (dump_enabled_p ()) |
fb85abff | 4346 | { |
78bb46f5 | 4347 | dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location, |
a4e972e3 | 4348 | "not vectorized: no vectype for stmt: %G", |
4349 | stmt_info->stmt); | |
7bd765d4 | 4350 | dump_printf (MSG_MISSED_OPTIMIZATION, " scalar_type: "); |
4351 | dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_DETAILS, | |
4352 | scalar_type); | |
78bb46f5 | 4353 | dump_printf (MSG_MISSED_OPTIMIZATION, "\n"); |
fb85abff | 4354 | } |
6ea6a380 | 4355 | |
e2c5c678 | 4356 | if (is_a <bb_vec_info> (vinfo)) |
77d241ed | 4357 | { |
4358 | /* No vector type is fine, the ref can still participate | |
4359 | in dependence analysis; we just can't vectorize it. */ | |
4360 | STMT_VINFO_VECTORIZABLE (stmt_info) = false; | |
4361 | continue; | |
4362 | } | |
36bcaa6e | 4363 | if (fatal) |
4364 | *fatal = false; | |
ed9370cc | 4365 | return opt_result::failure_at (stmt_info->stmt, |
4366 | "not vectorized:" | |
4367 | " no vectype for stmt: %G" | |
4368 | " scalar_type: %T\n", | |
4369 | stmt_info->stmt, scalar_type); | |
fb85abff | 4370 | } |
0bf5f81b | 4371 | else |
4372 | { | |
4373 | if (dump_enabled_p ()) | |
a4e972e3 | 4374 | dump_printf_loc (MSG_NOTE, vect_location, |
4375 | "got vectype for stmt: %G%T\n", | |
4376 | stmt_info->stmt, STMT_VINFO_VECTYPE (stmt_info)); | |
0bf5f81b | 4377 | } |
91a74fc6 | 4378 | |
4379 | /* Adjust the minimal vectorization factor according to the | |
4380 | vector type. */ | |
4381 | vf = TYPE_VECTOR_SUBPARTS (STMT_VINFO_VECTYPE (stmt_info)); | |
d75596cd | 4382 | *min_vf = upper_bound (*min_vf, vf); |
16dfb112 | 4383 | |
0bd6d857 | 4384 | if (gatherscatter != SG_NONE) |
16dfb112 | 4385 | { |
cf60da07 | 4386 | gather_scatter_info gs_info; |
0219dc42 | 4387 | if (!vect_check_gather_scatter (stmt_info, |
4388 | as_a <loop_vec_info> (vinfo), | |
cf60da07 | 4389 | &gs_info) |
4390 | || !get_vectype_for_scalar_type (TREE_TYPE (gs_info.offset))) | |
2403338f | 4391 | { |
4392 | if (fatal) | |
4393 | *fatal = false; | |
4394 | return opt_result::failure_at | |
4395 | (stmt_info->stmt, | |
4396 | (gatherscatter == GATHER) | |
4397 | ? "not vectorized: not suitable for gather load %G" | |
4398 | : "not vectorized: not suitable for scatter store %G", | |
4399 | stmt_info->stmt); | |
4400 | } | |
0bd6d857 | 4401 | STMT_VINFO_GATHER_SCATTER_P (stmt_info) = gatherscatter; |
f634c3e9 | 4402 | } |
fb85abff | 4403 | } |
48e1416a | 4404 | |
58cfef6b | 4405 | /* We used to stop processing and prune the list here. Verify we no |
4406 | longer need to. */ | |
4407 | gcc_assert (i == datarefs.length ()); | |
07e3bcbf | 4408 | |
ed9370cc | 4409 | return opt_result::success (); |
fb85abff | 4410 | } |
4411 | ||
4412 | ||
4413 | /* Function vect_get_new_vect_var. | |
4414 | ||
282bf14c | 4415 | Returns a new variable for use by the vectorizer. The current naming | |
48e1416a | 4416 | scheme prepends a prefix ("vect_", "stmp_", "mask_" or "vectp_", | |
4417 | depending on the value of VAR_KIND) to NAME if provided, and otherwise | |
fb85abff | 4418 | uses the prefix alone. */ | |
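| /* For example (hypothetical NAME): calling this with vect_pointer_var | |
| and NAME "x" creates a temporary named after "vectp_x", while a NULL | |
| NAME yields just the "vectp" prefix. */ | |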
4419 | ||
4420 | tree | |
4421 | vect_get_new_vect_var (tree type, enum vect_var_kind var_kind, const char *name) | |
4422 | { | |
4423 | const char *prefix; | |
4424 | tree new_vect_var; | |
4425 | ||
4426 | switch (var_kind) | |
4427 | { | |
4428 | case vect_simple_var: | |
0bf5f81b | 4429 | prefix = "vect"; |
fb85abff | 4430 | break; |
4431 | case vect_scalar_var: | |
0bf5f81b | 4432 | prefix = "stmp"; |
fb85abff | 4433 | break; |
dab48979 | 4434 | case vect_mask_var: |
4435 | prefix = "mask"; | |
4436 | break; | |
fb85abff | 4437 | case vect_pointer_var: |
0bf5f81b | 4438 | prefix = "vectp"; |
fb85abff | 4439 | break; |
4440 | default: | |
4441 | gcc_unreachable (); | |
4442 | } | |
4443 | ||
4444 | if (name) | |
4445 | { | |
0bf5f81b | 4446 | char* tmp = concat (prefix, "_", name, NULL); |
35244493 | 4447 | new_vect_var = create_tmp_reg (type, tmp); |
fb85abff | 4448 | free (tmp); |
4449 | } | |
4450 | else | |
35244493 | 4451 | new_vect_var = create_tmp_reg (type, prefix); |
fb85abff | 4452 | |
4453 | return new_vect_var; | |
4454 | } | |
4455 | ||
23ffec42 | 4456 | /* Like vect_get_new_vect_var but return an SSA name. */ |
4457 | ||
4458 | tree | |
4459 | vect_get_new_ssa_name (tree type, enum vect_var_kind var_kind, const char *name) | |
4460 | { | |
4461 | const char *prefix; | |
4462 | tree new_vect_var; | |
4463 | ||
4464 | switch (var_kind) | |
4465 | { | |
4466 | case vect_simple_var: | |
4467 | prefix = "vect"; | |
4468 | break; | |
4469 | case vect_scalar_var: | |
4470 | prefix = "stmp"; | |
4471 | break; | |
4472 | case vect_pointer_var: | |
4473 | prefix = "vectp"; | |
4474 | break; | |
4475 | default: | |
4476 | gcc_unreachable (); | |
4477 | } | |
4478 | ||
4479 | if (name) | |
4480 | { | |
4481 | char* tmp = concat (prefix, "_", name, NULL); | |
4482 | new_vect_var = make_temp_ssa_name (type, NULL, tmp); | |
4483 | free (tmp); | |
4484 | } | |
4485 | else | |
4486 | new_vect_var = make_temp_ssa_name (type, NULL, prefix); | |
4487 | ||
4488 | return new_vect_var; | |
4489 | } | |
4490 | ||
abc9513d | 4491 | /* Duplicate ptr info and set alignment/misalignment on NAME from DR_INFO. */ |
4a2edd22 | 4492 | |
4493 | static void | |
abc9513d | 4494 | vect_duplicate_ssa_name_ptr_info (tree name, dr_vec_info *dr_info) |
4a2edd22 | 4495 | { |
abc9513d | 4496 | duplicate_ssa_name_ptr_info (name, DR_PTR_INFO (dr_info->dr)); |
4497 | int misalign = DR_MISALIGNMENT (dr_info); | |
df8e9f7a | 4498 | if (misalign == DR_MISALIGNMENT_UNKNOWN) |
4a2edd22 | 4499 | mark_ptr_info_alignment_unknown (SSA_NAME_PTR_INFO (name)); |
4500 | else | |
aec313e5 | 4501 | set_ptr_info_alignment (SSA_NAME_PTR_INFO (name), |
e092c20e | 4502 | known_alignment (DR_TARGET_ALIGNMENT (dr_info)), |
4503 | misalign); | |
4a2edd22 | 4504 | } |
fb85abff | 4505 | |
4506 | /* Function vect_create_addr_base_for_vector_ref. | |
4507 | ||
4508 | Create an expression that computes the address of the first memory location | |
4509 | that will be accessed for a data reference. | |
4510 | ||
4511 | Input: | |
ecc42a77 | 4512 | STMT_INFO: The statement containing the data reference. |
fb85abff | 4513 | NEW_STMT_LIST: Must be initialized to NULL_TREE or a statement list. |
4514 | OFFSET: Optional. If supplied, it is added to the initial address. | |
4515 | LOOP: Specify relative to which loop-nest the address should be computed. | |
4516 | For example, when the dataref is in an inner-loop nested in an | |
4517 | outer-loop that is now being vectorized, LOOP can be either the | |
282bf14c | 4518 | outer-loop, or the inner-loop. The first memory location accessed |
fb85abff | 4519 | by the following dataref ('in' points to short): |
4520 | ||
4521 | for (i=0; i<N; i++) | |
4522 | for (j=0; j<M; j++) | |
4523 | s += in[i+j] | |
4524 | ||
4525 | is as follows: | |
4526 | if LOOP=i_loop: &in (relative to i_loop) | |
4527 | if LOOP=j_loop: &in+i*2B (relative to j_loop) | |
1ec61bbd | 4528 | BYTE_OFFSET: Optional, defaulted to NULL. If supplied, it is added to the |
4529 | initial address. Unlike OFFSET, which is the number of elements to | |
4530 | be added, BYTE_OFFSET is measured in bytes. | |
fb85abff | 4531 | |
4532 | Output: | |
48e1416a | 4533 | 1. Return an SSA_NAME whose value is the address of the memory location of |
fb85abff | 4534 | the first vector of the data reference. |
4535 | 2. If new_stmt_list is not NULL_TREE after return then the caller must insert | |
4536 | these statement(s) which define the returned SSA_NAME. | |
4537 | ||
4538 | FORNOW: We are only handling array accesses with step 1. */ | |
4539 | ||
4540 | tree | |
ecc42a77 | 4541 | vect_create_addr_base_for_vector_ref (stmt_vec_info stmt_info, |
fb85abff | 4542 | gimple_seq *new_stmt_list, |
4543 | tree offset, | |
1ec61bbd | 4544 | tree byte_offset) |
fb85abff | 4545 | { |
abc9513d | 4546 | dr_vec_info *dr_info = STMT_VINFO_DR_INFO (stmt_info); |
4547 | struct data_reference *dr = dr_info->dr; | |
3c18ea71 | 4548 | const char *base_name; |
90d4c4af | 4549 | tree addr_base; |
fb85abff | 4550 | tree dest; |
4551 | gimple_seq seq = NULL; | |
f083cd24 | 4552 | tree vect_ptr_type; |
fb85abff | 4553 | tree step = TYPE_SIZE_UNIT (TREE_TYPE (DR_REF (dr))); |
37545e54 | 4554 | loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info); |
abc9513d | 4555 | innermost_loop_behavior *drb = vect_dr_behavior (dr_info); |
fb85abff | 4556 | |
9e879814 | 4557 | tree data_ref_base = unshare_expr (drb->base_address); |
4558 | tree base_offset = unshare_expr (drb->offset); | |
4559 | tree init = unshare_expr (drb->init); | |
fb85abff | 4560 | |
37545e54 | 4561 | if (loop_vinfo) |
3c18ea71 | 4562 | base_name = get_name (data_ref_base); |
37545e54 | 4563 | else |
4564 | { | |
4565 | base_offset = ssize_int (0); | |
4566 | init = ssize_int (0); | |
3c18ea71 | 4567 | base_name = get_name (DR_REF (dr)); |
48e1416a | 4568 | } |
37545e54 | 4569 | |
fb85abff | 4570 | /* Create base_offset */ |
4571 | base_offset = size_binop (PLUS_EXPR, | |
4572 | fold_convert (sizetype, base_offset), | |
4573 | fold_convert (sizetype, init)); | |
fb85abff | 4574 | |
4575 | if (offset) | |
4576 | { | |
fb85abff | 4577 | offset = fold_build2 (MULT_EXPR, sizetype, |
4578 | fold_convert (sizetype, offset), step); | |
4579 | base_offset = fold_build2 (PLUS_EXPR, sizetype, | |
4580 | base_offset, offset); | |
fb85abff | 4581 | } |
1ec61bbd | 4582 | if (byte_offset) |
4583 | { | |
4584 | byte_offset = fold_convert (sizetype, byte_offset); | |
4585 | base_offset = fold_build2 (PLUS_EXPR, sizetype, | |
4586 | base_offset, byte_offset); | |
4587 | } | |
fb85abff | 4588 | |
4589 | /* base + base_offset */ | |
37545e54 | 4590 | if (loop_vinfo) |
2cc66f2a | 4591 | addr_base = fold_build_pointer_plus (data_ref_base, base_offset); |
37545e54 | 4592 | else |
4593 | { | |
182cf5a9 | 4594 | addr_base = build1 (ADDR_EXPR, |
4595 | build_pointer_type (TREE_TYPE (DR_REF (dr))), | |
4596 | unshare_expr (DR_REF (dr))); | |
37545e54 | 4597 | } |
48e1416a | 4598 | |
fb85abff | 4599 | vect_ptr_type = build_pointer_type (STMT_VINFO_VECTYPE (stmt_info)); |
90d4c4af | 4600 | dest = vect_get_new_vect_var (vect_ptr_type, vect_pointer_var, base_name); |
8ee959f8 | 4601 | addr_base = force_gimple_operand (addr_base, &seq, true, dest); |
fb85abff | 4602 | gimple_seq_add_seq (new_stmt_list, seq); |
4603 | ||
f544b9a4 | 4604 | if (DR_PTR_INFO (dr) |
8ee959f8 | 4605 | && TREE_CODE (addr_base) == SSA_NAME |
4606 | && !SSA_NAME_PTR_INFO (addr_base)) | |
1259ab70 | 4607 | { |
abc9513d | 4608 | vect_duplicate_ssa_name_ptr_info (addr_base, dr_info); |
4a2edd22 | 4609 | if (offset || byte_offset) |
90d4c4af | 4610 | mark_ptr_info_alignment_unknown (SSA_NAME_PTR_INFO (addr_base)); |
1259ab70 | 4611 | } |
f544b9a4 | 4612 | |
6d8fb6cf | 4613 | if (dump_enabled_p ()) |
a4e972e3 | 4614 | dump_printf_loc (MSG_NOTE, vect_location, "created %T\n", addr_base); |
f083cd24 | 4615 | |
90d4c4af | 4616 | return addr_base; |
fb85abff | 4617 | } |
4618 | ||
4619 | ||
4620 | /* Function vect_create_data_ref_ptr. | |
4621 | ||
bd5ba09f | 4622 | Create a new pointer-to-AGGR_TYPE variable (ap) that points to the first |
ecc42a77 | 4623 | location accessed in the loop by STMT_INFO, along with the def-use update |
bd5ba09f | 4624 | chain to appropriately advance the pointer through the loop iterations. |
4625 | Also set aliasing information for the pointer. This pointer is used by | |
4626 | the callers to this function to create a memory reference expression for | |
4627 | vector load/store access. | |
fb85abff | 4628 | |
4629 | Input: | |
ecc42a77 | 4630 | 1. STMT_INFO: a stmt that references memory. Expected to be of the form |
fb85abff | 4631 | GIMPLE_ASSIGN <name, data-ref> or |
4632 | GIMPLE_ASSIGN <data-ref, name>. | |
bd5ba09f | 4633 | 2. AGGR_TYPE: the type of the reference, which should be either a vector |
4634 | or an array. | |
4635 | 3. AT_LOOP: the loop where the vector memref is to be created. | |
4636 | 4. OFFSET (optional): an offset to be added to the initial address accessed | |
ecc42a77 | 4637 | by the data-ref in STMT_INFO. |
bd5ba09f | 4638 | 5. BSI: location where the new stmts are to be placed if there is no loop |
4639 | 6. ONLY_INIT: indicate if ap is to be updated in the loop, or remain | |
fb85abff | 4640 | pointing to the initial address. |
1ec61bbd | 4641 | 7. BYTE_OFFSET (optional, defaults to NULL): a byte offset to be added |
ecc42a77 | 4642 | to the initial address accessed by the data-ref in STMT_INFO. This is |
1ec61bbd | 4643 | similar to OFFSET, but OFFSET is counted in elements, while BYTE_OFFSET |
4644 | in bytes. | |
1f9a3b5c | 4645 | 8. IV_STEP (optional, defaults to NULL): the amount that should be added |
4646 | to the IV during each iteration of the loop. NULL says to move | |
4647 | by one copy of AGGR_TYPE up or down, depending on the step of the | |
4648 | data reference. | |
fb85abff | 4649 | |
4650 | Output: | |
4651 | 1. Declare a new ptr to vector_type, and have it point to the base of the | |
4652 | data reference (the initial address accessed by the data reference). | |
4653 | For example, for vector of type V8HI, the following code is generated: | |
4654 | ||
bd5ba09f | 4655 | v8hi *ap; |
4656 | ap = (v8hi *)initial_address; | |
fb85abff | 4657 | |
4658 | if OFFSET is not supplied: | |
4659 | initial_address = &a[init]; | |
4660 | if OFFSET is supplied: | |
4661 | initial_address = &a[init + OFFSET]; | |
1ec61bbd | 4662 | if BYTE_OFFSET is supplied: |
4663 | initial_address = &a[init] + BYTE_OFFSET; | |
fb85abff | 4664 | |
4665 | Return the initial_address in INITIAL_ADDRESS. | |
4666 | ||
4667 | 2. If ONLY_INIT is true, just return the initial pointer. Otherwise, also | |
48e1416a | 4668 | update the pointer in each iteration of the loop. |
fb85abff | 4669 | |
4670 | Return the increment stmt that updates the pointer in PTR_INCR. | |
4671 | ||
3c8b7bc7 | 4672 | 3. Return the pointer. */ |
fb85abff | 4673 | |
4674 | tree | |
ecc42a77 | 4675 | vect_create_data_ref_ptr (stmt_vec_info stmt_info, tree aggr_type, |
2e966e2a | 4676 | class loop *at_loop, tree offset, |
ecc42a77 | 4677 | tree *initial_address, gimple_stmt_iterator *gsi, |
3c8b7bc7 | 4678 | gimple **ptr_incr, bool only_init, |
ecc42a77 | 4679 | tree byte_offset, tree iv_step) |
fb85abff | 4680 | { |
3c18ea71 | 4681 | const char *base_name; |
fb85abff | 4682 | loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info); |
2e966e2a | 4683 | class loop *loop = NULL; |
37545e54 | 4684 | bool nested_in_vect_loop = false; |
2e966e2a | 4685 | class loop *containing_loop = NULL; |
bd5ba09f | 4686 | tree aggr_ptr_type; |
4687 | tree aggr_ptr; | |
fb85abff | 4688 | tree new_temp; |
fb85abff | 4689 | gimple_seq new_stmt_list = NULL; |
37545e54 | 4690 | edge pe = NULL; |
fb85abff | 4691 | basic_block new_bb; |
bd5ba09f | 4692 | tree aggr_ptr_init; |
abc9513d | 4693 | dr_vec_info *dr_info = STMT_VINFO_DR_INFO (stmt_info); |
4694 | struct data_reference *dr = dr_info->dr; | |
bd5ba09f | 4695 | tree aptr; |
fb85abff | 4696 | gimple_stmt_iterator incr_gsi; |
4697 | bool insert_after; | |
4698 | tree indx_before_incr, indx_after_incr; | |
42acab1c | 4699 | gimple *incr; |
37545e54 | 4700 | bb_vec_info bb_vinfo = STMT_VINFO_BB_VINFO (stmt_info); |
48e1416a | 4701 | |
1f9a3b5c | 4702 | gcc_assert (iv_step != NULL_TREE |
4703 | || TREE_CODE (aggr_type) == ARRAY_TYPE | |
bd5ba09f | 4704 | || TREE_CODE (aggr_type) == VECTOR_TYPE); |
4705 | ||
37545e54 | 4706 | if (loop_vinfo) |
4707 | { | |
4708 | loop = LOOP_VINFO_LOOP (loop_vinfo); | |
a73182ff | 4709 | nested_in_vect_loop = nested_in_vect_loop_p (loop, stmt_info); |
4710 | containing_loop = (gimple_bb (stmt_info->stmt))->loop_father; | |
37545e54 | 4711 | pe = loop_preheader_edge (loop); |
4712 | } | |
4713 | else | |
4714 | { | |
4715 | gcc_assert (bb_vinfo); | |
4716 | only_init = true; | |
4717 | *ptr_incr = NULL; | |
4718 | } | |
48e1416a | 4719 | |
fb85abff | 4720 | /* Create an expression for the first address accessed by this load |
48e1416a | 4721 | in LOOP. */ |
3c18ea71 | 4722 | base_name = get_name (DR_BASE_ADDRESS (dr)); |
fb85abff | 4723 | |
6d8fb6cf | 4724 | if (dump_enabled_p ()) |
fb85abff | 4725 | { |
3c18ea71 | 4726 | tree dr_base_type = TREE_TYPE (DR_BASE_OBJECT (dr)); |
7bd765d4 | 4727 | dump_printf_loc (MSG_NOTE, vect_location, |
a4e972e3 | 4728 | "create %s-pointer variable to type: %T", |
4729 | get_tree_code_name (TREE_CODE (aggr_type)), | |
4730 | aggr_type); | |
3c18ea71 | 4731 | if (TREE_CODE (dr_base_type) == ARRAY_TYPE) |
7bd765d4 | 4732 | dump_printf (MSG_NOTE, " vectorizing an array ref: "); |
19bacd59 | 4733 | else if (TREE_CODE (dr_base_type) == VECTOR_TYPE) |
4734 | dump_printf (MSG_NOTE, " vectorizing a vector ref: "); | |
3c18ea71 | 4735 | else if (TREE_CODE (dr_base_type) == RECORD_TYPE) |
7bd765d4 | 4736 | dump_printf (MSG_NOTE, " vectorizing a record based array ref: "); |
3c18ea71 | 4737 | else |
7bd765d4 | 4738 | dump_printf (MSG_NOTE, " vectorizing a pointer ref: "); |
a4e972e3 | 4739 | dump_printf (MSG_NOTE, "%T\n", DR_BASE_OBJECT (dr)); |
fb85abff | 4740 | } |
4741 | ||
90d4c4af | 4742 | /* (1) Create the new aggregate-pointer variable. |
4743 | Vector and array types inherit the alias set of their component | |
bd5ba09f | 4744 | type by default, so we need to use a ref-all pointer if the data |
4745 | reference does not conflict with the created aggregate data | |
4746 | reference because it is not addressable. */ | |
90d4c4af | 4747 | bool need_ref_all = false; |
4748 | if (!alias_sets_conflict_p (get_alias_set (aggr_type), | |
a34701c9 | 4749 | get_alias_set (DR_REF (dr)))) |
90d4c4af | 4750 | need_ref_all = true; |
a34701c9 | 4751 | /* Likewise for any of the data references in the stmt group. */ |
e1009321 | 4752 | else if (DR_GROUP_SIZE (stmt_info) > 1) |
fb85abff | 4753 | { |
cd24aa3c | 4754 | stmt_vec_info sinfo = DR_GROUP_FIRST_ELEMENT (stmt_info); |
dd277d48 | 4755 | do |
4756 | { | |
90d4c4af | 4757 | struct data_reference *sdr = STMT_VINFO_DATA_REF (sinfo); |
4758 | if (!alias_sets_conflict_p (get_alias_set (aggr_type), | |
4759 | get_alias_set (DR_REF (sdr)))) | |
dd277d48 | 4760 | { |
90d4c4af | 4761 | need_ref_all = true; |
dd277d48 | 4762 | break; |
4763 | } | |
cd24aa3c | 4764 | sinfo = DR_GROUP_NEXT_ELEMENT (sinfo); |
dd277d48 | 4765 | } |
cd24aa3c | 4766 | while (sinfo); |
fb85abff | 4767 | } |
90d4c4af | 4768 | aggr_ptr_type = build_pointer_type_for_mode (aggr_type, ptr_mode, |
4769 | need_ref_all); | |
4770 | aggr_ptr = vect_get_new_vect_var (aggr_ptr_type, vect_pointer_var, base_name); | |
4771 | ||
fb85abff | 4772 | |
282bf14c | 4773 | /* Note: If the dataref is in an inner-loop nested in LOOP, and we are |
4774 | vectorizing LOOP (i.e., outer-loop vectorization), we need to create two | |
4775 | def-use update cycles for the pointer: one relative to the outer-loop | |
4776 | (LOOP), which is what steps (3) and (4) below do. The other is relative | |
4777 | to the inner-loop (which is the inner-most loop containing the dataref), | |
4778 | and this is done by step (5) below. | |
fb85abff | 4779 | |
282bf14c | 4780 | When vectorizing inner-most loops, the vectorized loop (LOOP) is also the |
4781 | inner-most loop, and so steps (3),(4) work the same, and step (5) is | |
4782 | redundant. Steps (3),(4) create the following: | |
fb85abff | 4783 | |
4784 | vp0 = &base_addr; | |
4785 | LOOP: vp1 = phi(vp0,vp2) | |
48e1416a | 4786 | ... |
fb85abff | 4787 | ... |
4788 | vp2 = vp1 + step | |
4789 | goto LOOP | |
48e1416a | 4790 | |
282bf14c | 4791 | If there is an inner-loop nested in loop, then step (5) will also be |
4792 | applied, and an additional update in the inner-loop will be created: | |
fb85abff | 4793 | |
4794 | vp0 = &base_addr; | |
4795 | LOOP: vp1 = phi(vp0,vp2) | |
4796 | ... | |
4797 | inner: vp3 = phi(vp1,vp4) | |
4798 | vp4 = vp3 + inner_step | |
4799 | if () goto inner | |
4800 | ... | |
4801 | vp2 = vp1 + step | |
4802 | if () goto LOOP */ | |
4803 | ||
bd5ba09f | 4804 | /* (2) Calculate the initial address of the aggregate-pointer, and set |
4805 | the aggregate-pointer to point to it before the loop. */ | |
fb85abff | 4806 | |
1ec61bbd | 4807 | /* Create &(base[init_val+offset]) + byte_offset in the loop preheader. */ |
fb85abff | 4808 | |
a73182ff | 4809 | new_temp = vect_create_addr_base_for_vector_ref (stmt_info, &new_stmt_list, |
9e879814 | 4810 | offset, byte_offset); |
fb85abff | 4811 | if (new_stmt_list) |
4812 | { | |
37545e54 | 4813 | if (pe) |
4814 | { | |
4815 | new_bb = gsi_insert_seq_on_edge_immediate (pe, new_stmt_list); | |
4816 | gcc_assert (!new_bb); | |
4817 | } | |
4818 | else | |
bee862b6 | 4819 | gsi_insert_seq_before (gsi, new_stmt_list, GSI_SAME_STMT); |
fb85abff | 4820 | } |
4821 | ||
4822 | *initial_address = new_temp; | |
8ee959f8 | 4823 | aggr_ptr_init = new_temp; |
fb85abff | 4824 | |
bd5ba09f | 4825 | /* (3) Handle the updating of the aggregate-pointer inside the loop. |
282bf14c | 4826 | This is needed when ONLY_INIT is false, and also when AT_LOOP is the |
4827 | inner-loop nested in LOOP (during outer-loop vectorization). */ | |
fb85abff | 4828 | |
37545e54 | 4829 | /* No update in loop is required. */ |
48e1416a | 4830 | if (only_init && (!loop_vinfo || at_loop == loop)) |
bd5ba09f | 4831 | aptr = aggr_ptr_init; |
fb85abff | 4832 | else |
4833 | { | |
3c8b7bc7 | 4834 | /* Accesses to invariant addresses should be handled specially |
4835 | by the caller. */ | |
4836 | tree step = vect_dr_behavior (dr_info)->step; | |
4837 | gcc_assert (!integer_zerop (step)); | |
4838 | ||
1f9a3b5c | 4839 | if (iv_step == NULL_TREE) |
4840 | { | |
3c8b7bc7 | 4841 | /* The step of the aggregate pointer is the type size, |
4842 | negated for downward accesses. */ | |
1f9a3b5c | 4843 | iv_step = TYPE_SIZE_UNIT (aggr_type); |
3c8b7bc7 | 4844 | if (tree_int_cst_sgn (step) == -1) |
1f9a3b5c | 4845 | iv_step = fold_build1 (NEGATE_EXPR, TREE_TYPE (iv_step), iv_step); |
4846 | } | |
fb85abff | 4847 | |
4848 | standard_iv_increment_position (loop, &incr_gsi, &insert_after); | |
4849 | ||
bd5ba09f | 4850 | create_iv (aggr_ptr_init, |
8bbe6b75 | 4851 | fold_convert (aggr_ptr_type, iv_step), |
bd5ba09f | 4852 | aggr_ptr, loop, &incr_gsi, insert_after, |
fb85abff | 4853 | &indx_before_incr, &indx_after_incr); |
4854 | incr = gsi_stmt (incr_gsi); | |
04b2391d | 4855 | loop_vinfo->add_stmt (incr); |
fb85abff | 4856 | |
4857 | /* Copy the points-to information if it exists. */ | |
4858 | if (DR_PTR_INFO (dr)) | |
4859 | { | |
abc9513d | 4860 | vect_duplicate_ssa_name_ptr_info (indx_before_incr, dr_info); |
4861 | vect_duplicate_ssa_name_ptr_info (indx_after_incr, dr_info); | |
fb85abff | 4862 | } |
fb85abff | 4863 | if (ptr_incr) |
4864 | *ptr_incr = incr; | |
4865 | ||
bd5ba09f | 4866 | aptr = indx_before_incr; |
fb85abff | 4867 | } |
4868 | ||
4869 | if (!nested_in_vect_loop || only_init) | |
bd5ba09f | 4870 | return aptr; |
fb85abff | 4871 | |
4872 | ||
bd5ba09f | 4873 | /* (4) Handle the updating of the aggregate-pointer inside the inner-loop |
282bf14c | 4874 | nested in LOOP, if it exists. */ |
fb85abff | 4875 | |
4876 | gcc_assert (nested_in_vect_loop); | |
4877 | if (!only_init) | |
4878 | { | |
4879 | standard_iv_increment_position (containing_loop, &incr_gsi, | |
4880 | &insert_after); | |
bd5ba09f | 4881 | create_iv (aptr, fold_convert (aggr_ptr_type, DR_STEP (dr)), aggr_ptr, |
fb85abff | 4882 | containing_loop, &incr_gsi, insert_after, &indx_before_incr, |
4883 | &indx_after_incr); | |
4884 | incr = gsi_stmt (incr_gsi); | |
04b2391d | 4885 | loop_vinfo->add_stmt (incr); |
fb85abff | 4886 | |
4887 | /* Copy the points-to information if it exists. */ | |
4888 | if (DR_PTR_INFO (dr)) | |
4889 | { | |
abc9513d | 4890 | vect_duplicate_ssa_name_ptr_info (indx_before_incr, dr_info); |
4891 | vect_duplicate_ssa_name_ptr_info (indx_after_incr, dr_info); | |
fb85abff | 4892 | } |
fb85abff | 4893 | if (ptr_incr) |
4894 | *ptr_incr = incr; | |
4895 | ||
48e1416a | 4896 | return indx_before_incr; |
fb85abff | 4897 | } |
4898 | else | |
4899 | gcc_unreachable (); | |
4900 | } | |
4901 | ||
4902 | ||
4903 | /* Function bump_vector_ptr | |
4904 | ||
4905 | Increment a pointer (to a vector type) by vector-size. If requested, | |
48e1416a | 4906 | i.e. if PTR_INCR is given, then also connect the new increment stmt |
fb85abff | 4907 | to the existing def-use update-chain of the pointer, by modifying |
4908 | the PTR_INCR as illustrated below: | |
4909 | ||
4910 | The pointer def-use update-chain before this function: | |
4911 | DATAREF_PTR = phi (p_0, p_2) | |
4912 | .... | |
48e1416a | 4913 | PTR_INCR: p_2 = DATAREF_PTR + step |
fb85abff | 4914 | |
4915 | The pointer def-use update-chain after this function: | |
4916 | DATAREF_PTR = phi (p_0, p_2) | |
4917 | .... | |
4918 | NEW_DATAREF_PTR = DATAREF_PTR + BUMP | |
4919 | .... | |
4920 | PTR_INCR: p_2 = NEW_DATAREF_PTR + step | |
4921 | ||
4922 | Input: | |
48e1416a | 4923 | DATAREF_PTR - ssa_name of a pointer (to vector type) that is being updated |
fb85abff | 4924 | in the loop. |
48e1416a | 4925 | PTR_INCR - optional. The stmt that updates the pointer in each iteration of |
fb85abff | 4926 | the loop. The increment amount across iterations is expected |
48e1416a | 4927 | to be vector_size. |
fb85abff | 4928 | BSI - location where the new update stmt is to be placed. |
ecc42a77 | 4929 | STMT_INFO - the original scalar memory-access stmt that is being vectorized. |
fb85abff | 4930 | BUMP - optional. The offset by which to bump the pointer. If not given, |
4931 | the offset is assumed to be vector_size. | |
4932 | ||
4933 | Output: Return NEW_DATAREF_PTR as illustrated above. | |
48e1416a | 4934 | |
fb85abff | 4935 | */ |
4936 | ||
4937 | tree | |
42acab1c | 4938 | bump_vector_ptr (tree dataref_ptr, gimple *ptr_incr, gimple_stmt_iterator *gsi, |
ecc42a77 | 4939 | stmt_vec_info stmt_info, tree bump) |
fb85abff | 4940 | { |
fb85abff | 4941 | struct data_reference *dr = STMT_VINFO_DATA_REF (stmt_info); |
4942 | tree vectype = STMT_VINFO_VECTYPE (stmt_info); | |
fb85abff | 4943 | tree update = TYPE_SIZE_UNIT (vectype); |
1a91d914 | 4944 | gassign *incr_stmt; |
fb85abff | 4945 | ssa_op_iter iter; |
4946 | use_operand_p use_p; | |
4947 | tree new_dataref_ptr; | |
4948 | ||
4949 | if (bump) | |
4950 | update = bump; | |
48e1416a | 4951 | |
8ee959f8 | 4952 | if (TREE_CODE (dataref_ptr) == SSA_NAME) |
4953 | new_dataref_ptr = copy_ssa_name (dataref_ptr); | |
4954 | else | |
4955 | new_dataref_ptr = make_ssa_name (TREE_TYPE (dataref_ptr)); | |
e9cf809e | 4956 | incr_stmt = gimple_build_assign (new_dataref_ptr, POINTER_PLUS_EXPR, |
4957 | dataref_ptr, update); | |
a73182ff | 4958 | vect_finish_stmt_generation (stmt_info, incr_stmt, gsi); |
fb85abff | 4959 | |
4960 | /* Copy the points-to information if it exists. */ | |
4961 | if (DR_PTR_INFO (dr)) | |
1259ab70 | 4962 | { |
4963 | duplicate_ssa_name_ptr_info (new_dataref_ptr, DR_PTR_INFO (dr)); | |
ceea063b | 4964 | mark_ptr_info_alignment_unknown (SSA_NAME_PTR_INFO (new_dataref_ptr)); |
1259ab70 | 4965 | } |
fb85abff | 4966 | |
4967 | if (!ptr_incr) | |
4968 | return new_dataref_ptr; | |
4969 | ||
4970 | /* Update the vector-pointer's cross-iteration increment. */ | |
4971 | FOR_EACH_SSA_USE_OPERAND (use_p, ptr_incr, iter, SSA_OP_USE) | |
4972 | { | |
4973 | tree use = USE_FROM_PTR (use_p); | |
4974 | ||
4975 | if (use == dataref_ptr) | |
4976 | SET_USE (use_p, new_dataref_ptr); | |
4977 | else | |
1f9a3b5c | 4978 | gcc_assert (operand_equal_p (use, update, 0)); |
fb85abff | 4979 | } |
4980 | ||
4981 | return new_dataref_ptr; | |
4982 | } | |
4983 | ||
4984 | ||
1c4c7e32 | 4985 | /* Copy memory reference info such as base/clique from the SRC reference |
4986 | to the DEST MEM_REF. */ | |
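| /* E.g. when a scalar access through a "restrict" pointer is replaced | |
| by a vector MEM_REF, copying the dependence clique/base preserves the | |
| restrict-derived disambiguation that the original reference carried. */ | |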
4987 | ||
4988 | void | |
4989 | vect_copy_ref_info (tree dest, tree src) | |
4990 | { | |
4991 | if (TREE_CODE (dest) != MEM_REF) | |
4992 | return; | |
4993 | ||
4994 | tree src_base = src; | |
4995 | while (handled_component_p (src_base)) | |
4996 | src_base = TREE_OPERAND (src_base, 0); | |
4997 | if (TREE_CODE (src_base) != MEM_REF | |
4998 | && TREE_CODE (src_base) != TARGET_MEM_REF) | |
4999 | return; | |
5000 | ||
5001 | MR_DEPENDENCE_CLIQUE (dest) = MR_DEPENDENCE_CLIQUE (src_base); | |
5002 | MR_DEPENDENCE_BASE (dest) = MR_DEPENDENCE_BASE (src_base); | |
5003 | } | |
5004 | ||
5005 | ||
fb85abff | 5006 | /* Function vect_create_destination_var. |
5007 | ||
5008 | Create a new temporary of type VECTYPE. */ | |
5009 | ||
5010 | tree | |
5011 | vect_create_destination_var (tree scalar_dest, tree vectype) | |
5012 | { | |
5013 | tree vec_dest; | |
0bf5f81b | 5014 | const char *name; |
5015 | char *new_name; | |
fb85abff | 5016 | tree type; |
5017 | enum vect_var_kind kind; | |
5018 | ||
dab48979 | 5019 | kind = vectype |
5020 | ? VECTOR_BOOLEAN_TYPE_P (vectype) | |
5021 | ? vect_mask_var | |
5022 | : vect_simple_var | |
5023 | : vect_scalar_var; | |
fb85abff | 5024 | type = vectype ? vectype : TREE_TYPE (scalar_dest); |
5025 | ||
5026 | gcc_assert (TREE_CODE (scalar_dest) == SSA_NAME); | |
5027 | ||
0bf5f81b | 5028 | name = get_name (scalar_dest); |
5029 | if (name) | |
b33b6e58 | 5030 | new_name = xasprintf ("%s_%u", name, SSA_NAME_VERSION (scalar_dest)); |
0bf5f81b | 5031 | else |
b33b6e58 | 5032 | new_name = xasprintf ("_%u", SSA_NAME_VERSION (scalar_dest)); |
fb85abff | 5033 | vec_dest = vect_get_new_vect_var (type, kind, new_name); |
0bf5f81b | 5034 | free (new_name); |
fb85abff | 5035 | |
5036 | return vec_dest; | |
5037 | } | |
5038 | ||
ee612634 | 5039 | /* Function vect_grouped_store_supported. |
fb85abff | 5040 | |
42f6a6e8 | 5041 | Returns TRUE if interleave high and interleave low permutations |
5042 | are supported, and FALSE otherwise. */ | |
fb85abff | 5043 | |
5044 | bool | |
ee612634 | 5045 | vect_grouped_store_supported (tree vectype, unsigned HOST_WIDE_INT count) |
fb85abff | 5046 | { |
3754d046 | 5047 | machine_mode mode = TYPE_MODE (vectype); |
48e1416a | 5048 | |
d53391a8 | 5049 | /* vect_permute_store_chain requires the group size to be equal to 3 or |
5050 | be a power of two. */ | |
5051 | if (count != 3 && exact_log2 (count) == -1) | |
481fc474 | 5052 | { |
6d8fb6cf | 5053 | if (dump_enabled_p ()) |
7bd765d4 | 5054 | dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location, |
d53391a8 | 5055 | "the size of the group of accesses" |
5056 | " is not a power of 2 or not eqaul to 3\n"); | |
481fc474 | 5057 | return false; |
5058 | } | |
5059 | ||
42f6a6e8 | 5060 | /* Check that the permutation is supported. */ |
8bec2124 | 5061 | if (VECTOR_MODE_P (mode)) |
5062 | { | |
ba7efd65 | 5063 | unsigned int i; |
d53391a8 | 5064 | if (count == 3) |
8bec2124 | 5065 | { |
d53391a8 | 5066 | unsigned int j0 = 0, j1 = 0, j2 = 0; |
5067 | unsigned int i, j; | |
5068 | ||
ba7efd65 | 5069 | unsigned int nelt; |
5070 | if (!GET_MODE_NUNITS (mode).is_constant (&nelt)) | |
5071 | { | |
5072 | if (dump_enabled_p ()) | |
5073 | dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location, | |
5074 | "cannot handle groups of 3 stores for" | |
5075 | " variable-length vectors\n"); | |
5076 | return false; | |
5077 | } | |
5078 | ||
c3fa7fe9 | 5079 | vec_perm_builder sel (nelt, nelt, 1); |
5080 | sel.quick_grow (nelt); | |
1957c019 | 5081 | vec_perm_indices indices; |
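| /* Worked example (illustrative): for nelt == 4 and input vectors | |
| A, B, C, the j == 0 iteration first permutes {A,B} with the mask | |
| {0, 4, 0, 1} (element 2 is a don't-care, filled from C next) and then | |
| merges C in with the mask {0, 1, 4, 3}, giving { A0, B0, C0, A1 }; | |
| j == 1 and j == 2 build the remaining thirds of the 3-way | |
| interleaving { A0 B0 C0 A1 B1 C1 A2 B2 C2 A3 B3 C3 }. */ | |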
d53391a8 | 5082 | for (j = 0; j < 3; j++) |
5083 | { | |
5084 | int nelt0 = ((3 - j) * nelt) % 3; | |
5085 | int nelt1 = ((3 - j) * nelt + 1) % 3; | |
5086 | int nelt2 = ((3 - j) * nelt + 2) % 3; | |
5087 | for (i = 0; i < nelt; i++) | |
5088 | { | |
5089 | if (3 * i + nelt0 < nelt) | |
5090 | sel[3 * i + nelt0] = j0++; | |
5091 | if (3 * i + nelt1 < nelt) | |
5092 | sel[3 * i + nelt1] = nelt + j1++; | |
5093 | if (3 * i + nelt2 < nelt) | |
5094 | sel[3 * i + nelt2] = 0; | |
5095 | } | |
1957c019 | 5096 | indices.new_vector (sel, 2, nelt); |
5097 | if (!can_vec_perm_const_p (mode, indices)) | |
d53391a8 | 5098 | { |
5099 | if (dump_enabled_p ()) | |
5100 | dump_printf (MSG_MISSED_OPTIMIZATION, | |
97f7d65e | 5101 | "permutation op not supported by target.\n"); |
d53391a8 | 5102 | return false; |
5103 | } | |
5104 | ||
5105 | for (i = 0; i < nelt; i++) | |
5106 | { | |
5107 | if (3 * i + nelt0 < nelt) | |
5108 | sel[3 * i + nelt0] = 3 * i + nelt0; | |
5109 | if (3 * i + nelt1 < nelt) | |
5110 | sel[3 * i + nelt1] = 3 * i + nelt1; | |
5111 | if (3 * i + nelt2 < nelt) | |
5112 | sel[3 * i + nelt2] = nelt + j2++; | |
5113 | } | |
1957c019 | 5114 | indices.new_vector (sel, 2, nelt); |
5115 | if (!can_vec_perm_const_p (mode, indices)) | |
d53391a8 | 5116 | { |
5117 | if (dump_enabled_p ()) | |
5118 | dump_printf (MSG_MISSED_OPTIMIZATION, | |
97f7d65e | 5119 | "permutation op not supported by target.\n"); |
d53391a8 | 5120 | return false; |
5121 | } | |
5122 | } | |
5123 | return true; | |
8bec2124 | 5124 | } |
d53391a8 | 5125 | else |
8bec2124 | 5126 | { |
d53391a8 | 5127 | /* If length is not equal to 3 then only a power of 2 is supported. */ |
ac29ece2 | 5128 | gcc_assert (pow2p_hwi (count)); |
ba7efd65 | 5129 | poly_uint64 nelt = GET_MODE_NUNITS (mode); |
d53391a8 | 5130 | |
c3fa7fe9 | 5131 | /* The encoding has 2 interleaved stepped patterns. */ |
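| /* For example, with nelt == 8 the six encoded elements are | |
| {0, 8, 1, 9, 2, 10}, which the stepped encoding extends to the full | |
| interleave mask {0, 8, 1, 9, 2, 10, 3, 11}; adding nelt/2 below then | |
| yields the complementary mask {4, 12, 5, 13, 6, 14, 7, 15}. */ | |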
5132 | vec_perm_builder sel (nelt, 2, 3); | |
5133 | sel.quick_grow (6); | |
5134 | for (i = 0; i < 3; i++) | |
d53391a8 | 5135 | { |
5136 | sel[i * 2] = i; | |
5137 | sel[i * 2 + 1] = i + nelt; | |
5138 | } | |
1957c019 | 5139 | vec_perm_indices indices (sel, 2, nelt); |
5140 | if (can_vec_perm_const_p (mode, indices)) | |
282dc861 | 5141 | { |
c3fa7fe9 | 5142 | for (i = 0; i < 6; i++) |
ba7efd65 | 5143 | sel[i] += exact_div (nelt, 2); |
1957c019 | 5144 | indices.new_vector (sel, 2, nelt); |
5145 | if (can_vec_perm_const_p (mode, indices)) | |
282dc861 | 5146 | return true; |
5147 | } | |
8bec2124 | 5148 | } |
5149 | } | |
fb85abff | 5150 | |
6d8fb6cf | 5151 | if (dump_enabled_p ()) |
7bd765d4 | 5152 | dump_printf (MSG_MISSED_OPTIMIZATION, |
12554a62 | 5153 | "permutation op not supported by target.\n"); |
6620d7d7 | 5154 | return false; |
fb85abff | 5155 | } |
5156 | ||
5157 | ||
2dd8e84c | 5158 | /* Return TRUE if vec_{mask_}store_lanes is available for COUNT vectors of |
5159 | type VECTYPE. MASKED_P says whether the masked form is needed. */ | |
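| /* For example, a group of three V16QI stores is supported only if the | |
| target implements the vec_store_lanes (or vec_mask_store_lanes) optab | |
| for an array mode holding three V16QI vectors; see | |
| vect_lanes_optab_supported_p for the array-mode check. */ | |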
94b7b4dd | 5160 | |
5161 | bool | |
2dd8e84c | 5162 | vect_store_lanes_supported (tree vectype, unsigned HOST_WIDE_INT count, |
5163 | bool masked_p) | |
94b7b4dd | 5164 | { |
2dd8e84c | 5165 | if (masked_p) |
5166 | return vect_lanes_optab_supported_p ("vec_mask_store_lanes", | |
5167 | vec_mask_store_lanes_optab, | |
5168 | vectype, count); | |
5169 | else | |
5170 | return vect_lanes_optab_supported_p ("vec_store_lanes", | |
5171 | vec_store_lanes_optab, | |
5172 | vectype, count); | |
94b7b4dd | 5173 | } |
5174 | ||
5175 | ||
fb85abff | 5176 | /* Function vect_permute_store_chain. |
5177 | ||
5178 | Given a chain of interleaved stores in DR_CHAIN of LENGTH that must be | |
d53391a8 | 5179 | a power of 2 or equal to 3, generate interleave_high/low stmts to reorder |
5180 | the data correctly for the stores. Return the final references for stores | |
5181 | in RESULT_CHAIN. | |
fb85abff | 5182 | |
5183 | E.g., LENGTH is 4 and the scalar type is short, i.e., VF is 8. | |
282bf14c | 5184 | The input is 4 vectors each containing 8 elements. We assign a number to |
5185 | each element, the input sequence is: | |
fb85abff | 5186 | |
5187 | 1st vec: 0 1 2 3 4 5 6 7 | |
5188 | 2nd vec: 8 9 10 11 12 13 14 15 | |
48e1416a | 5189 | 3rd vec: 16 17 18 19 20 21 22 23 |
fb85abff | 5190 | 4th vec: 24 25 26 27 28 29 30 31 |
5191 | ||
5192 | The output sequence should be: | |
5193 | ||
5194 | 1st vec: 0 8 16 24 1 9 17 25 | |
5195 | 2nd vec: 2 10 18 26 3 11 19 27 | |
5196 | 3rd vec: 4 12 20 28 5 13 21 29 |
5197 | 4th vec: 6 14 22 30 7 15 23 31 | |
5198 | ||
5199 | i.e., we interleave the contents of the four vectors in their order. | |
5200 | ||
282bf14c | 5201 | We use interleave_high/low instructions to create such output. The input of |
fb85abff | 5202 | each interleave_high/low operation is two vectors: |
48e1416a | 5203 | 1st vec 2nd vec |
5204 | 0 1 2 3 4 5 6 7 | |
5205 | the even elements of the result vector are obtained left-to-right from the | |
282bf14c | 5206 | high/low elements of the first vector. The odd elements of the result are |
fb85abff | 5207 | obtained left-to-right from the high/low elements of the second vector. |
5208 | The output of interleave_high will be: 0 4 1 5 | |
5209 | and of interleave_low: 2 6 3 7 | |
5210 | ||
48e1416a | 5211 | |
282bf14c | 5212 | The permutation is done in log LENGTH stages. In each stage interleave_high |
48e1416a | 5213 | and interleave_low stmts are created for each pair of vectors in DR_CHAIN, |
5214 | where the first argument is taken from the first half of DR_CHAIN and the | |
5215 | second argument from its second half. |
5216 | In our example, | |
fb85abff | 5217 | |
5218 | I1: interleave_high (1st vec, 3rd vec) | |
5219 | I2: interleave_low (1st vec, 3rd vec) | |
5220 | I3: interleave_high (2nd vec, 4th vec) | |
5221 | I4: interleave_low (2nd vec, 4th vec) | |
5222 | ||
5223 | The output for the first stage is: | |
5224 | ||
5225 | I1: 0 16 1 17 2 18 3 19 | |
5226 | I2: 4 20 5 21 6 22 7 23 | |
5227 | I3: 8 24 9 25 10 26 11 27 | |
5228 | I4: 12 28 13 29 14 30 15 31 | |
5229 | ||
5230 | The output of the second stage, i.e. the final result is: | |
5231 | ||
5232 | I1: 0 8 16 24 1 9 17 25 | |
5233 | I2: 2 10 18 26 3 11 19 27 | |
5234 | I3: 4 12 20 28 5 13 21 29 |
5235 | I4: 6 14 22 30 7 15 23 31. */ | |
48e1416a | 5236 | |
481fc474 | 5237 | void |
f1f41a6c | 5238 | vect_permute_store_chain (vec<tree> dr_chain, |
48e1416a | 5239 | unsigned int length, |
ecc42a77 | 5240 | stmt_vec_info stmt_info, |
fb85abff | 5241 | gimple_stmt_iterator *gsi, |
f1f41a6c | 5242 | vec<tree> *result_chain) |
fb85abff | 5243 | { |
03d37e4e | 5244 | tree vect1, vect2, high, low; |
42acab1c | 5245 | gimple *perm_stmt; |
1c2fef9a | 5246 | tree vectype = STMT_VINFO_VECTYPE (stmt_info); |
8bec2124 | 5247 | tree perm_mask_low, perm_mask_high; |
d53391a8 | 5248 | tree data_ref; |
5249 | tree perm3_mask_low, perm3_mask_high; | |
8b221927 | 5250 | unsigned int i, j, n, log_length = exact_log2 (length); |
282dc861 | 5251 | |
f40aaf2d | 5252 | result_chain->quick_grow (length); |
5253 | memcpy (result_chain->address (), dr_chain.address (), | |
5254 | length * sizeof (tree)); | |
fb85abff | 5255 | |
d53391a8 | 5256 | if (length == 3) |
8bec2124 | 5257 | { |
8b221927 | 5258 | /* vect_grouped_store_supported ensures that this is constant. */ |
f08ee65f | 5259 | unsigned int nelt = TYPE_VECTOR_SUBPARTS (vectype).to_constant (); |
d53391a8 | 5260 | unsigned int j0 = 0, j1 = 0, j2 = 0; |
42f6a6e8 | 5261 | |
c3fa7fe9 | 5262 | vec_perm_builder sel (nelt, nelt, 1); |
5263 | sel.quick_grow (nelt); | |
1957c019 | 5264 | vec_perm_indices indices; |
d53391a8 | 5265 | for (j = 0; j < 3; j++) |
5266 | { | |
5267 | int nelt0 = ((3 - j) * nelt) % 3; | |
5268 | int nelt1 = ((3 - j) * nelt + 1) % 3; | |
5269 | int nelt2 = ((3 - j) * nelt + 2) % 3; | |
8bec2124 | 5270 | |
d53391a8 | 5271 | for (i = 0; i < nelt; i++) |
5272 | { | |
5273 | if (3 * i + nelt0 < nelt) | |
5274 | sel[3 * i + nelt0] = j0++; | |
5275 | if (3 * i + nelt1 < nelt) | |
5276 | sel[3 * i + nelt1] = nelt + j1++; | |
5277 | if (3 * i + nelt2 < nelt) | |
5278 | sel[3 * i + nelt2] = 0; | |
5279 | } | |
1957c019 | 5280 | indices.new_vector (sel, 2, nelt); |
5281 | perm3_mask_low = vect_gen_perm_mask_checked (vectype, indices); | |
d53391a8 | 5282 | |
5283 | for (i = 0; i < nelt; i++) | |
5284 | { | |
5285 | if (3 * i + nelt0 < nelt) | |
5286 | sel[3 * i + nelt0] = 3 * i + nelt0; | |
5287 | if (3 * i + nelt1 < nelt) | |
5288 | sel[3 * i + nelt1] = 3 * i + nelt1; | |
5289 | if (3 * i + nelt2 < nelt) | |
5290 | sel[3 * i + nelt2] = nelt + j2++; | |
5291 | } | |
1957c019 | 5292 | indices.new_vector (sel, 2, nelt); |
5293 | perm3_mask_high = vect_gen_perm_mask_checked (vectype, indices); | |
d53391a8 | 5294 | |
5295 | vect1 = dr_chain[0]; | |
5296 | vect2 = dr_chain[1]; | |
fb85abff | 5297 | |
5298 | /* Create interleaving stmt: | |
d53391a8 | 5299 | low = VEC_PERM_EXPR <vect1, vect2, |
5300 | {j, nelt, *, j + 1, nelt + j + 1, *, | |
5301 | j + 2, nelt + j + 2, *, ...}> */ | |
5302 | data_ref = make_temp_ssa_name (vectype, NULL, "vect_shuffle3_low"); | |
e9cf809e | 5303 | perm_stmt = gimple_build_assign (data_ref, VEC_PERM_EXPR, vect1, |
5304 | vect2, perm3_mask_low); | |
a73182ff | 5305 | vect_finish_stmt_generation (stmt_info, perm_stmt, gsi); |
fb85abff | 5306 | |
d53391a8 | 5307 | vect1 = data_ref; |
5308 | vect2 = dr_chain[2]; | |
fb85abff | 5309 | /* Create interleaving stmt: |
d53391a8 | 5310 | low = VEC_PERM_EXPR <vect1, vect2, |
5311 | {0, 1, nelt + j, 3, 4, nelt + j + 1, | |
5312 | 6, 7, nelt + j + 2, ...}> */ | |
5313 | data_ref = make_temp_ssa_name (vectype, NULL, "vect_shuffle3_high"); | |
e9cf809e | 5314 | perm_stmt = gimple_build_assign (data_ref, VEC_PERM_EXPR, vect1, |
5315 | vect2, perm3_mask_high); | |
a73182ff | 5316 | vect_finish_stmt_generation (stmt_info, perm_stmt, gsi); |
d53391a8 | 5317 | (*result_chain)[j] = data_ref; |
fb85abff | 5318 | } |
d53391a8 | 5319 | } |
5320 | else | |
5321 | { | |
5322 | /* If length is not equal to 3, then only a power of 2 is supported. */ |
ac29ece2 | 5323 | gcc_assert (pow2p_hwi (length)); |
d53391a8 | 5324 | |
c3fa7fe9 | 5325 | /* The encoding has 2 interleaved stepped patterns. */ |
f08ee65f | 5326 | poly_uint64 nelt = TYPE_VECTOR_SUBPARTS (vectype); |
c3fa7fe9 | 5327 | vec_perm_builder sel (nelt, 2, 3); |
5328 | sel.quick_grow (6); | |
5329 | for (i = 0; i < 3; i++) | |
d53391a8 | 5330 | { |
5331 | sel[i * 2] = i; | |
5332 | sel[i * 2 + 1] = i + nelt; | |
5333 | } | |
1957c019 | 5334 | vec_perm_indices indices (sel, 2, nelt); |
5335 | perm_mask_high = vect_gen_perm_mask_checked (vectype, indices); | |
d53391a8 | 5336 | |
c3fa7fe9 | 5337 | for (i = 0; i < 6; i++) |
f08ee65f | 5338 | sel[i] += exact_div (nelt, 2); |
1957c019 | 5339 | indices.new_vector (sel, 2, nelt); |
5340 | perm_mask_low = vect_gen_perm_mask_checked (vectype, indices); | |
d53391a8 | 5341 | |
5342 | for (i = 0, n = log_length; i < n; i++) | |
5343 | { | |
5344 | for (j = 0; j < length/2; j++) | |
5345 | { | |
5346 | vect1 = dr_chain[j]; | |
5347 | vect2 = dr_chain[j+length/2]; | |
5348 | ||
5349 | /* Create interleaving stmt: | |
5350 | high = VEC_PERM_EXPR <vect1, vect2, {0, nelt, 1, nelt+1, | |
5351 | ...}> */ | |
5352 | high = make_temp_ssa_name (vectype, NULL, "vect_inter_high"); | |
e9cf809e | 5353 | perm_stmt = gimple_build_assign (high, VEC_PERM_EXPR, vect1, |
5354 | vect2, perm_mask_high); | |
a73182ff | 5355 | vect_finish_stmt_generation (stmt_info, perm_stmt, gsi); |
d53391a8 | 5356 | (*result_chain)[2*j] = high; |
5357 | ||
5358 | /* Create interleaving stmt: | |
5359 | low = VEC_PERM_EXPR <vect1, vect2, | |
5360 | {nelt/2, nelt*3/2, nelt/2+1, nelt*3/2+1, | |
5361 | ...}> */ | |
5362 | low = make_temp_ssa_name (vectype, NULL, "vect_inter_low"); | |
e9cf809e | 5363 | perm_stmt = gimple_build_assign (low, VEC_PERM_EXPR, vect1, |
5364 | vect2, perm_mask_low); | |
a73182ff | 5365 | vect_finish_stmt_generation (stmt_info, perm_stmt, gsi); |
d53391a8 | 5366 | (*result_chain)[2*j+1] = low; |
5367 | } | |
5368 | memcpy (dr_chain.address (), result_chain->address (), | |
5369 | length * sizeof (tree)); | |
5370 | } | |
fb85abff | 5371 | } |
fb85abff | 5372 | } |
5373 | ||
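/* A minimal standalone sketch (plain C, compiled separately; not part of
   this file) of the staged interleave_high/low permutation documented
   above, for LENGTH == 4 vectors of NELT == 8 elements.  Running it
   reproduces the worked example, ending with 0 8 16 24 1 9 17 25 in the
   first vector.  */

#include <stdio.h>
#include <string.h>

#define NELT 8
#define LENGTH 4

/* Model interleave_high/low of two NELT-element vectors.  */
static void
interleave (const int *v1, const int *v2, int *high, int *low)
{
  for (int i = 0; i < NELT / 2; i++)
    {
      high[2 * i] = v1[i];		/* Mask {0, nelt, 1, nelt + 1, ...}  */
      high[2 * i + 1] = v2[i];
      low[2 * i] = v1[NELT / 2 + i];	/* Mask {nelt/2, nelt*3/2, ...}  */
      low[2 * i + 1] = v2[NELT / 2 + i];
    }
}

int
main (void)
{
  int chain[LENGTH][NELT], out[LENGTH][NELT];
  for (int j = 0; j < LENGTH; j++)
    for (int i = 0; i < NELT; i++)
      chain[j][i] = j * NELT + i;

  for (int stage = 0; stage < 2; stage++)	/* log2 (LENGTH) stages.  */
    {
      for (int j = 0; j < LENGTH / 2; j++)
	interleave (chain[j], chain[j + LENGTH / 2],
		    out[2 * j], out[2 * j + 1]);
      memcpy (chain, out, sizeof (chain));
    }

  for (int j = 0; j < LENGTH; j++, printf ("\n"))
    for (int i = 0; i < NELT; i++)
      printf ("%3d", chain[j][i]);
  return 0;
}
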
5374 | /* Function vect_setup_realignment | |
48e1416a | 5375 | |
fb85abff | 5376 | This function is called when vectorizing an unaligned load using |
5377 | the dr_explicit_realign[_optimized] scheme. | |
5378 | This function generates the following code at the loop prolog: | |
5379 | ||
5380 | p = initial_addr; | |
5381 | x msq_init = *(floor(p)); # prolog load | |
48e1416a | 5382 | realignment_token = call target_builtin; |
fb85abff | 5383 | loop: |
5384 | x msq = phi (msq_init, ---) | |
5385 | ||
48e1416a | 5386 | The stmts marked with x are generated only for the case of |
fb85abff | 5387 | dr_explicit_realign_optimized. |
5388 | ||
48e1416a | 5389 | The code above sets up a new (vector) pointer, pointing to the first |
ecc42a77 | 5390 | location accessed by STMT_INFO, and a "floor-aligned" load using that |
5391 | pointer. It also generates code to compute the "realignment-token" | |
5392 | (if the relevant target hook was defined), and creates a phi-node at the | |
5393 | loop-header bb whose arguments are the result of the prolog-load (created | |
5394 | by this function) and the result of a load that takes place in the loop | |
5395 | (to be created by the caller to this function). | |
fb85abff | 5396 | |
5397 | For the case of dr_explicit_realign_optimized: | |
48e1416a | 5398 | The caller to this function uses the phi-result (msq) to create the |
fb85abff | 5399 | realignment code inside the loop, and sets up the missing phi argument, |
5400 | as follows: | |
48e1416a | 5401 | loop: |
fb85abff | 5402 | msq = phi (msq_init, lsq) |
5403 | lsq = *(floor(p')); # load in loop | |
5404 | result = realign_load (msq, lsq, realignment_token); | |
5405 | ||
5406 | For the case of dr_explicit_realign: | |
5407 | loop: | |
5408 | msq = *(floor(p)); # load in loop | |
5409 | p' = p + (VS-1); | |
5410 | lsq = *(floor(p')); # load in loop | |
5411 | result = realign_load (msq, lsq, realignment_token); | |
5412 | ||
5413 | Input: | |
ecc42a77 | 5414 | STMT_INFO - (scalar) load stmt to be vectorized. This load accesses |
5415 | a memory location that may be unaligned. | |
fb85abff | 5416 | GSI - place where new code is to be inserted. |
5417 | ALIGNMENT_SUPPORT_SCHEME - which of the two misalignment handling schemes | |
48e1416a | 5418 | is used. |
5419 | ||
fb85abff | 5420 | Output: |
5421 | REALIGNMENT_TOKEN - the result of a call to the builtin_mask_for_load | |
5422 | target hook, if defined. | |
5423 | Return value - the result of the loop-header phi node. */ | |
5424 | ||
5425 | tree | |
ecc42a77 | 5426 | vect_setup_realignment (stmt_vec_info stmt_info, gimple_stmt_iterator *gsi, |
fb85abff | 5427 | tree *realignment_token, |
5428 | enum dr_alignment_support alignment_support_scheme, | |
5429 | tree init_addr, | |
2e966e2a | 5430 | class loop **at_loop) |
fb85abff | 5431 | { |
fb85abff | 5432 | tree vectype = STMT_VINFO_VECTYPE (stmt_info); |
5433 | loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info); | |
abc9513d | 5434 | dr_vec_info *dr_info = STMT_VINFO_DR_INFO (stmt_info); |
5435 | struct data_reference *dr = dr_info->dr; | |
2e966e2a | 5436 | class loop *loop = NULL; |
ad074595 | 5437 | edge pe = NULL; |
a73182ff | 5438 | tree scalar_dest = gimple_assign_lhs (stmt_info->stmt); |
fb85abff | 5439 | tree vec_dest; |
42acab1c | 5440 | gimple *inc; |
fb85abff | 5441 | tree ptr; |
5442 | tree data_ref; | |
fb85abff | 5443 | basic_block new_bb; |
5444 | tree msq_init = NULL_TREE; | |
5445 | tree new_temp; | |
1a91d914 | 5446 | gphi *phi_stmt; |
fb85abff | 5447 | tree msq = NULL_TREE; |
5448 | gimple_seq stmts = NULL; | |
fb85abff | 5449 | bool compute_in_loop = false; |
ad074595 | 5450 | bool nested_in_vect_loop = false; |
2e966e2a | 5451 | class loop *containing_loop = (gimple_bb (stmt_info->stmt))->loop_father; |
5452 | class loop *loop_for_initial_load = NULL; | |
ad074595 | 5453 | |
5454 | if (loop_vinfo) | |
5455 | { | |
5456 | loop = LOOP_VINFO_LOOP (loop_vinfo); | |
a73182ff | 5457 | nested_in_vect_loop = nested_in_vect_loop_p (loop, stmt_info); |
ad074595 | 5458 | } |
fb85abff | 5459 | |
5460 | gcc_assert (alignment_support_scheme == dr_explicit_realign | |
5461 | || alignment_support_scheme == dr_explicit_realign_optimized); | |
5462 | ||
5463 | /* We need to generate three things: | |
5464 | 1. the misalignment computation | |
5465 | 2. the extra vector load (for the optimized realignment scheme). | |
5466 | 3. the phi node for the two vectors from which the realignment is | |
282bf14c | 5467 | done (for the optimized realignment scheme). */ |
fb85abff | 5468 | |
5469 | /* 1. Determine where to generate the misalignment computation. | |
5470 | ||
5471 | If INIT_ADDR is NULL_TREE, this indicates that the misalignment | |
5472 | calculation will be generated by this function, outside the loop (in the | |
5473 | preheader). Otherwise, INIT_ADDR had already been computed for us by the | |
5474 | caller, inside the loop. | |
5475 | ||
5476 | Background: If the misalignment remains fixed throughout the iterations of | |
5477 | the loop, then both realignment schemes are applicable, and also the | |
5478 | misalignment computation can be done outside LOOP. This is because we are | |
5479 | vectorizing LOOP, and so the memory accesses in LOOP advance in steps that | |
5480 | are a multiple of VS (the Vector Size), and therefore the misalignment in | |
5481 | different vectorized LOOP iterations is always the same. | |
5482 | The problem arises only if the memory access is in an inner-loop nested | |
5483 | inside LOOP, which is now being vectorized using outer-loop vectorization. | |
5484 | This is the only case when the misalignment of the memory access may not | |
5485 | remain fixed throughout the iterations of the inner-loop (as explained in | |
5486 | detail in vect_supportable_dr_alignment). In this case, not only is the | |
5487 | optimized realignment scheme not applicable, but also the misalignment | |
5488 | computation (and generation of the realignment token that is passed to | |
5489 | REALIGN_LOAD) have to be done inside the loop. | |
5490 | ||
5491 | In short, INIT_ADDR indicates whether we are in a COMPUTE_IN_LOOP mode | |
5492 | or not, which in turn determines if the misalignment is computed inside | |
5493 | the inner-loop, or outside LOOP. */ | |
5494 | ||
ad074595 | 5495 | if (init_addr != NULL_TREE || !loop_vinfo) |
fb85abff | 5496 | { |
5497 | compute_in_loop = true; | |
5498 | gcc_assert (alignment_support_scheme == dr_explicit_realign); | |
5499 | } | |
5500 | ||
5501 | ||
5502 | /* 2. Determine where to generate the extra vector load. | |
5503 | ||
5504 | For the optimized realignment scheme, instead of generating two vector | |
5505 | loads in each iteration, we generate a single extra vector load in the | |
5506 | preheader of the loop, and in each iteration reuse the result of the | |
5507 | vector load from the previous iteration. In case the memory access is in | |
5508 | an inner-loop nested inside LOOP, which is now being vectorized using | |
5509 | outer-loop vectorization, we need to determine whether this initial vector | |
5510 | load should be generated at the preheader of the inner-loop, or can be | |
5511 | generated at the preheader of LOOP. If the memory access has no evolution | |
5512 | in LOOP, it can be generated in the preheader of LOOP. Otherwise, it has | |
5513 | to be generated inside LOOP (in the preheader of the inner-loop). */ | |
5514 | ||
5515 | if (nested_in_vect_loop) | |
5516 | { | |
5517 | tree outerloop_step = STMT_VINFO_DR_STEP (stmt_info); | |
5518 | bool invariant_in_outerloop = | |
5519 | (tree_int_cst_compare (outerloop_step, size_zero_node) == 0); | |
5520 | loop_for_initial_load = (invariant_in_outerloop ? loop : loop->inner); | |
5521 | } | |
5522 | else | |
5523 | loop_for_initial_load = loop; | |
5524 | if (at_loop) | |
5525 | *at_loop = loop_for_initial_load; | |
5526 | ||
ad074595 | 5527 | if (loop_for_initial_load) |
5528 | pe = loop_preheader_edge (loop_for_initial_load); | |
5529 | ||
fb85abff | 5530 | /* 3. For the case of the optimized realignment, create the first vector |
5531 | load at the loop preheader. */ | |
5532 | ||
5533 | if (alignment_support_scheme == dr_explicit_realign_optimized) | |
5534 | { | |
5535 | /* Create msq_init = *(floor(p1)) in the loop preheader */ | |
1a91d914 | 5536 | gassign *new_stmt; |
fb85abff | 5537 | |
5538 | gcc_assert (!compute_in_loop); | |
fb85abff | 5539 | vec_dest = vect_create_destination_var (scalar_dest, vectype); |
a73182ff | 5540 | ptr = vect_create_data_ref_ptr (stmt_info, vectype, |
5541 | loop_for_initial_load, NULL_TREE, | |
3c8b7bc7 | 5542 | &init_addr, NULL, &inc, true); |
23bab442 | 5543 | if (TREE_CODE (ptr) == SSA_NAME) |
5544 | new_temp = copy_ssa_name (ptr); | |
5545 | else | |
5546 | new_temp = make_ssa_name (TREE_TYPE (ptr)); | |
e092c20e | 5547 | poly_uint64 align = DR_TARGET_ALIGNMENT (dr_info); |
5548 | tree type = TREE_TYPE (ptr); | |
e9cf809e | 5549 | new_stmt = gimple_build_assign |
5550 | (new_temp, BIT_AND_EXPR, ptr, | |
e092c20e | 5551 | fold_build2 (MINUS_EXPR, type, |
5552 | build_int_cst (type, 0), | |
5553 | build_int_cst (type, align))); | |
86638c2e | 5554 | new_bb = gsi_insert_on_edge_immediate (pe, new_stmt); |
5555 | gcc_assert (!new_bb); | |
2cb9ef39 | 5556 | data_ref |
5557 | = build2 (MEM_REF, TREE_TYPE (vec_dest), new_temp, | |
5558 | build_int_cst (reference_alias_ptr_type (DR_REF (dr)), 0)); | |
1c4c7e32 | 5559 | vect_copy_ref_info (data_ref, DR_REF (dr)); |
fb85abff | 5560 | new_stmt = gimple_build_assign (vec_dest, data_ref); |
5561 | new_temp = make_ssa_name (vec_dest, new_stmt); | |
5562 | gimple_assign_set_lhs (new_stmt, new_temp); | |
ad074595 | 5563 | if (pe) |
5564 | { | |
5565 | new_bb = gsi_insert_on_edge_immediate (pe, new_stmt); | |
5566 | gcc_assert (!new_bb); | |
5567 | } | |
5568 | else | |
5569 | gsi_insert_before (gsi, new_stmt, GSI_SAME_STMT); | |
5570 | ||
fb85abff | 5571 | msq_init = gimple_assign_lhs (new_stmt); |
5572 | } | |
5573 | ||
5574 | /* 4. Create realignment token using a target builtin, if available. | |
5575 | It is done either inside the containing loop, or before LOOP (as | |
5576 | determined above). */ | |
5577 | ||
5578 | if (targetm.vectorize.builtin_mask_for_load) | |
5579 | { | |
1a91d914 | 5580 | gcall *new_stmt; |
fb85abff | 5581 | tree builtin_decl; |
5582 | ||
5583 | /* Compute INIT_ADDR - the initial address accessed by this memref. */ |
ad074595 | 5584 | if (!init_addr) |
fb85abff | 5585 | { |
5586 | /* Generate the INIT_ADDR computation outside LOOP. */ | |
a73182ff | 5587 | init_addr = vect_create_addr_base_for_vector_ref (stmt_info, &stmts, |
9e879814 | 5588 | NULL_TREE); |
ad074595 | 5589 | if (loop) |
5590 | { | |
5591 | pe = loop_preheader_edge (loop); | |
5592 | new_bb = gsi_insert_seq_on_edge_immediate (pe, stmts); | |
5593 | gcc_assert (!new_bb); | |
5594 | } | |
5595 | else | |
5596 | gsi_insert_seq_before (gsi, stmts, GSI_SAME_STMT); | |
fb85abff | 5597 | } |
5598 | ||
5599 | builtin_decl = targetm.vectorize.builtin_mask_for_load (); | |
5600 | new_stmt = gimple_build_call (builtin_decl, 1, init_addr); | |
5601 | vec_dest = | |
5602 | vect_create_destination_var (scalar_dest, | |
5603 | gimple_call_return_type (new_stmt)); | |
5604 | new_temp = make_ssa_name (vec_dest, new_stmt); | |
5605 | gimple_call_set_lhs (new_stmt, new_temp); | |
5606 | ||
5607 | if (compute_in_loop) | |
5608 | gsi_insert_before (gsi, new_stmt, GSI_SAME_STMT); | |
5609 | else | |
5610 | { | |
5611 | /* Generate the misalignment computation outside LOOP. */ | |
5612 | pe = loop_preheader_edge (loop); | |
5613 | new_bb = gsi_insert_on_edge_immediate (pe, new_stmt); | |
5614 | gcc_assert (!new_bb); | |
5615 | } | |
5616 | ||
5617 | *realignment_token = gimple_call_lhs (new_stmt); | |
5618 | ||
5619 | /* The result of the CALL_EXPR to this builtin is determined from | |
5620 | the value of the parameter and no global variables are touched | |
5621 | which makes the builtin a "const" function. Requiring the | |
5622 | builtin to have the "const" attribute makes it unnecessary | |
5623 | to call mark_call_clobbered. */ | |
5624 | gcc_assert (TREE_READONLY (builtin_decl)); | |
5625 | } | |
5626 | ||
5627 | if (alignment_support_scheme == dr_explicit_realign) | |
5628 | return msq; | |
5629 | ||
5630 | gcc_assert (!compute_in_loop); | |
5631 | gcc_assert (alignment_support_scheme == dr_explicit_realign_optimized); | |
5632 | ||
5633 | ||
5634 | /* 5. Create msq = phi <msq_init, lsq> in loop */ | |
5635 | ||
5636 | pe = loop_preheader_edge (containing_loop); | |
5637 | vec_dest = vect_create_destination_var (scalar_dest, vectype); | |
f9e245b2 | 5638 | msq = make_ssa_name (vec_dest); |
fb85abff | 5639 | phi_stmt = create_phi_node (msq, containing_loop->header); |
60d535d2 | 5640 | add_phi_arg (phi_stmt, msq_init, pe, UNKNOWN_LOCATION); |
fb85abff | 5641 | |
5642 | return msq; | |
5643 | } | |
5644 | ||
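/* A minimal standalone sketch (plain C, compiled separately; not part of
   this file) of the arithmetic behind dr_explicit_realign: two
   floor-aligned loads plus an element-wise shift of their concatenation
   yield the misaligned access.  VS and the start index 3 are arbitrary
   example values; the sketch assumes a nonzero misalignment, in which
   case floor (p + VS - 1) == floor (p) + VS.  */

#include <stdio.h>

#define VS 4	/* Elements per vector.  */

static void
load_vec (const int *p, int *dst)	/* Models an aligned vector load.  */
{
  for (int i = 0; i < VS; i++)
    dst[i] = p[i];
}

int
main (void)
{
  int data[16];
  for (int i = 0; i < 16; i++)
    data[i] = i;

  int idx = 3;				/* Misaligned element index.  */
  int mis = idx % VS;			/* Realignment token, in elements.  */
  const int *floor_p = data + idx - mis;  /* Models ptr & -(VS * size).  */

  int msq[VS], lsq[VS], result[VS];
  load_vec (floor_p, msq);		/* msq = *(floor (p))  */
  load_vec (floor_p + VS, lsq);		/* lsq = *(floor (p + VS - 1))  */
  for (int i = 0; i < VS; i++)		/* realign_load (msq, lsq, token)  */
    result[i] = i + mis < VS ? msq[i + mis] : lsq[i + mis - VS];

  for (int i = 0; i < VS; i++)
    printf ("%d ", result[i]);		/* Prints: 3 4 5 6  */
  printf ("\n");
  return 0;
}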
5645 | ||
ee612634 | 5646 | /* Function vect_grouped_load_supported. |
fb85abff | 5647 | |
bc691ae4 | 5648 | COUNT is the size of the load group (the number of statements plus the |
5649 | number of gaps). SINGLE_ELEMENT_P is true if there is actually | |
5650 | only one statement, with a gap of COUNT - 1. | |
5651 | ||
5652 | Returns true if a suitable permute exists. */ | |
fb85abff | 5653 | |
5654 | bool | |
bc691ae4 | 5655 | vect_grouped_load_supported (tree vectype, bool single_element_p, |
5656 | unsigned HOST_WIDE_INT count) | |
fb85abff | 5657 | { |
3754d046 | 5658 | machine_mode mode = TYPE_MODE (vectype); |
fb85abff | 5659 | |
bc691ae4 | 5660 | /* If this is single-element interleaving with an element distance |
5661 | that leaves unused vector loads around punt - we at least create | |
5662 | very sub-optimal code in that case (and blow up memory, | |
5663 | see PR65518). */ | |
f08ee65f | 5664 | if (single_element_p && maybe_gt (count, TYPE_VECTOR_SUBPARTS (vectype))) |
bc691ae4 | 5665 | { |
5666 | if (dump_enabled_p ()) | |
5667 | dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location, | |
5668 | "single-element interleaving not supported " | |
5669 | "for not adjacent vector loads\n"); | |
5670 | return false; | |
5671 | } | |
5672 | ||
1e1bca71 | 5673 | /* vect_permute_load_chain requires the group size to be equal to 3 or |
5674 | be a power of two. */ | |
5675 | if (count != 3 && exact_log2 (count) == -1) | |
481fc474 | 5676 | { |
6d8fb6cf | 5677 | if (dump_enabled_p ()) |
7bd765d4 | 5678 | dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location, |
1e1bca71 | 5679 | "the size of the group of accesses" |
5680 | " is not a power of 2 or not equal to 3\n"); | |
481fc474 | 5681 | return false; |
5682 | } | |
5683 | ||
42f6a6e8 | 5684 | /* Check that the permutation is supported. */ |
5685 | if (VECTOR_MODE_P (mode)) | |
5686 | { | |
ba7efd65 | 5687 | unsigned int i, j; |
1e1bca71 | 5688 | if (count == 3) |
42f6a6e8 | 5689 | { |
ba7efd65 | 5690 | unsigned int nelt; |
5691 | if (!GET_MODE_NUNITS (mode).is_constant (&nelt)) | |
5692 | { | |
5693 | if (dump_enabled_p ()) | |
5694 | dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location, | |
5695 | "cannot handle groups of 3 loads for" | |
5696 | " variable-length vectors\n"); | |
5697 | return false; | |
5698 | } | |
5699 | ||
c3fa7fe9 | 5700 | vec_perm_builder sel (nelt, nelt, 1); |
5701 | sel.quick_grow (nelt); | |
1957c019 | 5702 | vec_perm_indices indices; |
1e1bca71 | 5703 | unsigned int k; |
5704 | for (k = 0; k < 3; k++) | |
5705 | { | |
5706 | for (i = 0; i < nelt; i++) | |
5707 | if (3 * i + k < 2 * nelt) | |
5708 | sel[i] = 3 * i + k; | |
5709 | else | |
5710 | sel[i] = 0; | |
1957c019 | 5711 | indices.new_vector (sel, 2, nelt); |
5712 | if (!can_vec_perm_const_p (mode, indices)) | |
1e1bca71 | 5713 | { |
5714 | if (dump_enabled_p ()) | |
5715 | dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location, | |
5716 | "shuffle of 3 loads is not supported by" | |
5717 | " target\n"); | |
5c6f6a61 | 5718 | return false; |
1e1bca71 | 5719 | } |
5720 | for (i = 0, j = 0; i < nelt; i++) | |
5721 | if (3 * i + k < 2 * nelt) | |
5722 | sel[i] = i; | |
5723 | else | |
5724 | sel[i] = nelt + ((nelt + k) % 3) + 3 * (j++); | |
1957c019 | 5725 | indices.new_vector (sel, 2, nelt); |
5726 | if (!can_vec_perm_const_p (mode, indices)) | |
1e1bca71 | 5727 | { |
5728 | if (dump_enabled_p ()) | |
5729 | dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location, | |
5730 | "shuffle of 3 loads is not supported by" | |
5731 | " target\n"); | |
5732 | return false; | |
5733 | } | |
5734 | } | |
5735 | return true; | |
5736 | } | |
5737 | else | |
5738 | { | |
5739 | /* If length is not equal to 3, then only a power of 2 is supported. */ |
ac29ece2 | 5740 | gcc_assert (pow2p_hwi (count)); |
ba7efd65 | 5741 | poly_uint64 nelt = GET_MODE_NUNITS (mode); |
1957c019 | 5742 | |
c3fa7fe9 | 5743 | /* The encoding has a single stepped pattern. */ |
5744 | vec_perm_builder sel (nelt, 1, 3); | |
5745 | sel.quick_grow (3); | |
5746 | for (i = 0; i < 3; i++) | |
1e1bca71 | 5747 | sel[i] = i * 2; |
1957c019 | 5748 | vec_perm_indices indices (sel, 2, nelt); |
5749 | if (can_vec_perm_const_p (mode, indices)) | |
1e1bca71 | 5750 | { |
c3fa7fe9 | 5751 | for (i = 0; i < 3; i++) |
1e1bca71 | 5752 | sel[i] = i * 2 + 1; |
1957c019 | 5753 | indices.new_vector (sel, 2, nelt); |
5754 | if (can_vec_perm_const_p (mode, indices)) | |
1e1bca71 | 5755 | return true; |
5756 | } | |
5757 | } | |
42f6a6e8 | 5758 | } |
fb85abff | 5759 | |
6d8fb6cf | 5760 | if (dump_enabled_p ()) |
7bd765d4 | 5761 | dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location, |
1e1bca71 | 5762 | "extract even/odd not supported by target\n"); |
6620d7d7 | 5763 | return false; |
fb85abff | 5764 | } |
5765 | ||
2dd8e84c | 5766 | /* Return TRUE if vec_{mask_}load_lanes is available for COUNT vectors of |
5767 | type VECTYPE. MASKED_P says whether the masked form is needed. */ | |
94b7b4dd | 5768 | |
5769 | bool | |
2dd8e84c | 5770 | vect_load_lanes_supported (tree vectype, unsigned HOST_WIDE_INT count, |
5771 | bool masked_p) | |
94b7b4dd | 5772 | { |
2dd8e84c | 5773 | if (masked_p) |
5774 | return vect_lanes_optab_supported_p ("vec_mask_load_lanes", | |
5775 | vec_mask_load_lanes_optab, | |
5776 | vectype, count); | |
5777 | else | |
5778 | return vect_lanes_optab_supported_p ("vec_load_lanes", | |
5779 | vec_load_lanes_optab, | |
5780 | vectype, count); | |
94b7b4dd | 5781 | } |
fb85abff | 5782 | |
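/* A minimal standalone sketch (plain C, compiled separately; not part of
   this file) of what a load-lanes operation computes for COUNT == 3: it
   reads COUNT * NELT consecutive elements and de-interleaves them in a
   single operation, which is why no separate permutation statements are
   needed when the optab is available.  */

#include <stdio.h>

#define NELT 4
#define COUNT 3

int
main (void)
{
  int mem[COUNT * NELT], lanes[COUNT][NELT];
  for (int i = 0; i < COUNT * NELT; i++)
    mem[i] = i;

  for (int j = 0; j < COUNT; j++)	/* Models vec_load_lanes.  */
    for (int i = 0; i < NELT; i++)
      lanes[j][i] = mem[COUNT * i + j];

  for (int j = 0; j < COUNT; j++, printf ("\n"))
    for (int i = 0; i < NELT; i++)
      printf ("%3d", lanes[j][i]);	/* 0 3 6 9 / 1 4 7 10 / 2 5 8 11  */
  return 0;
}
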
5783 | /* Function vect_permute_load_chain. | |
5784 | ||
5785 | Given a chain of interleaved loads in DR_CHAIN of LENGTH that must be | |
1e1bca71 | 5786 | a power of 2 or equal to 3, generate extract_even/odd stmts to reorder |
5787 | the input data correctly. Return the final references for loads in | |
5788 | RESULT_CHAIN. | |
fb85abff | 5789 | |
5790 | E.g., LENGTH is 4 and the scalar type is short, i.e., VF is 8. | |
5791 | The input is 4 vectors each containing 8 elements. We assign a number to each | |
5792 | element, the input sequence is: | |
5793 | ||
5794 | 1st vec: 0 1 2 3 4 5 6 7 | |
5795 | 2nd vec: 8 9 10 11 12 13 14 15 | |
48e1416a | 5796 | 3rd vec: 16 17 18 19 20 21 22 23 |
fb85abff | 5797 | 4th vec: 24 25 26 27 28 29 30 31 |
5798 | ||
5799 | The output sequence should be: | |
5800 | ||
5801 | 1st vec: 0 4 8 12 16 20 24 28 | |
5802 | 2nd vec: 1 5 9 13 17 21 25 29 | |
48e1416a | 5803 | 3rd vec: 2 6 10 14 18 22 26 30 |
fb85abff | 5804 | 4th vec: 3 7 11 15 19 23 27 31 |
5805 | ||
5806 | i.e., the first output vector should contain the first elements of each | |
5807 | interleaving group, etc. | |
5808 | ||
282bf14c | 5809 | We use extract_even/odd instructions to create such output. The input of |
5810 | each extract_even/odd operation is two vectors | |
48e1416a | 5811 | 1st vec 2nd vec |
5812 | 0 1 2 3 4 5 6 7 | |
fb85abff | 5813 | |
282bf14c | 5814 | and the output is the vector of extracted even/odd elements. The output of |
fb85abff | 5815 | extract_even will be: 0 2 4 6 |
5816 | and of extract_odd: 1 3 5 7 | |
5817 | ||
48e1416a | 5818 | |
282bf14c | 5819 | The permutation is done in log LENGTH stages. In each stage extract_even |
5820 | and extract_odd stmts are created for each pair of vectors in DR_CHAIN in | |
5821 | their order. In our example, | |
fb85abff | 5822 | |
5823 | E1: extract_even (1st vec, 2nd vec) | |
5824 | E2: extract_odd (1st vec, 2nd vec) | |
5825 | E3: extract_even (3rd vec, 4th vec) | |
5826 | E4: extract_odd (3rd vec, 4th vec) | |
5827 | ||
5828 | The output for the first stage will be: | |
5829 | ||
5830 | E1: 0 2 4 6 8 10 12 14 | |
5831 | E2: 1 3 5 7 9 11 13 15 | |
48e1416a | 5832 | E3: 16 18 20 22 24 26 28 30 |
fb85abff | 5833 | E4: 17 19 21 23 25 27 29 31 |
5834 | ||
5835 | In order to proceed and create the correct sequence for the next stage (or | |
48e1416a | 5836 | for the correct output, if the second stage is the last one, as in our |
5837 | example), we first put the output of extract_even operation and then the | |
fb85abff | 5838 | output of extract_odd in RESULT_CHAIN (which is then copied to DR_CHAIN). |
5839 | The input for the second stage is: | |
5840 | ||
5841 | 1st vec (E1): 0 2 4 6 8 10 12 14 | |
48e1416a | 5842 | 2nd vec (E3): 16 18 20 22 24 26 28 30 |
5843 | 3rd vec (E2): 1 3 5 7 9 11 13 15 | |
fb85abff | 5844 | 4th vec (E4): 17 19 21 23 25 27 29 31 |
5845 | ||
5846 | The output of the second stage: | |
5847 | ||
5848 | E1: 0 4 8 12 16 20 24 28 | |
5849 | E2: 2 6 10 14 18 22 26 30 | |
5850 | E3: 1 5 9 13 17 21 25 29 | |
5851 | E4: 3 7 11 15 19 23 27 31 | |
5852 | ||
5853 | And RESULT_CHAIN after reordering: | |
5854 | ||
5855 | 1st vec (E1): 0 4 8 12 16 20 24 28 | |
5856 | 2nd vec (E3): 1 5 9 13 17 21 25 29 | |
48e1416a | 5857 | 3rd vec (E2): 2 6 10 14 18 22 26 30 |
fb85abff | 5858 | 4th vec (E4): 3 7 11 15 19 23 27 31. */ |
5859 | ||
481fc474 | 5860 | static void |
f1f41a6c | 5861 | vect_permute_load_chain (vec<tree> dr_chain, |
48e1416a | 5862 | unsigned int length, |
ecc42a77 | 5863 | stmt_vec_info stmt_info, |
fb85abff | 5864 | gimple_stmt_iterator *gsi, |
f1f41a6c | 5865 | vec<tree> *result_chain) |
fb85abff | 5866 | { |
03d37e4e | 5867 | tree data_ref, first_vect, second_vect; |
42f6a6e8 | 5868 | tree perm_mask_even, perm_mask_odd; |
1e1bca71 | 5869 | tree perm3_mask_low, perm3_mask_high; |
42acab1c | 5870 | gimple *perm_stmt; |
1c2fef9a | 5871 | tree vectype = STMT_VINFO_VECTYPE (stmt_info); |
42f6a6e8 | 5872 | unsigned int i, j, log_length = exact_log2 (length); |
282dc861 | 5873 | |
1648f21f | 5874 | result_chain->quick_grow (length); |
5875 | memcpy (result_chain->address (), dr_chain.address (), | |
5876 | length * sizeof (tree)); | |
42f6a6e8 | 5877 | |
1e1bca71 | 5878 | if (length == 3) |
fb85abff | 5879 | { |
8b221927 | 5880 | /* vect_grouped_load_supported ensures that this is constant. */ |
f08ee65f | 5881 | unsigned nelt = TYPE_VECTOR_SUBPARTS (vectype).to_constant (); |
1e1bca71 | 5882 | unsigned int k; |
fb85abff | 5883 | |
c3fa7fe9 | 5884 | vec_perm_builder sel (nelt, nelt, 1); |
5885 | sel.quick_grow (nelt); | |
1957c019 | 5886 | vec_perm_indices indices; |
1e1bca71 | 5887 | for (k = 0; k < 3; k++) |
5888 | { | |
5889 | for (i = 0; i < nelt; i++) | |
5890 | if (3 * i + k < 2 * nelt) | |
5891 | sel[i] = 3 * i + k; | |
5892 | else | |
5893 | sel[i] = 0; | |
1957c019 | 5894 | indices.new_vector (sel, 2, nelt); |
5895 | perm3_mask_low = vect_gen_perm_mask_checked (vectype, indices); | |
1e1bca71 | 5896 | |
5897 | for (i = 0, j = 0; i < nelt; i++) | |
5898 | if (3 * i + k < 2 * nelt) | |
5899 | sel[i] = i; | |
5900 | else | |
5901 | sel[i] = nelt + ((nelt + k) % 3) + 3 * (j++); | |
1957c019 | 5902 | indices.new_vector (sel, 2, nelt); |
5903 | perm3_mask_high = vect_gen_perm_mask_checked (vectype, indices); | |
1e1bca71 | 5904 | |
5905 | first_vect = dr_chain[0]; | |
5906 | second_vect = dr_chain[1]; | |
5907 | ||
5908 | /* Create interleaving stmt (low part of): | |
5909 | low = VEC_PERM_EXPR <first_vect, second_vect, {k, 3 + k, 6 + k, |
5910 | ...}> */ | |
321d85d9 | 5911 | data_ref = make_temp_ssa_name (vectype, NULL, "vect_shuffle3_low"); |
e9cf809e | 5912 | perm_stmt = gimple_build_assign (data_ref, VEC_PERM_EXPR, first_vect, |
5913 | second_vect, perm3_mask_low); | |
a73182ff | 5914 | vect_finish_stmt_generation (stmt_info, perm_stmt, gsi); |
48e1416a | 5915 | |
1e1bca71 | 5916 | 	  /* Create interleaving stmt (high part of), filling the gaps |
5917 | 	     from the third vector: high = VEC_PERM_EXPR <first_vect, |
5918 | 	     second_vect, {0, 1, ..., nelt + ((nelt + k) % 3) + 3 * j, ...}> */ |
5919 | first_vect = data_ref; | |
5920 | second_vect = dr_chain[2]; | |
321d85d9 | 5921 | data_ref = make_temp_ssa_name (vectype, NULL, "vect_shuffle3_high"); |
e9cf809e | 5922 | perm_stmt = gimple_build_assign (data_ref, VEC_PERM_EXPR, first_vect, |
5923 | second_vect, perm3_mask_high); | |
a73182ff | 5924 | vect_finish_stmt_generation (stmt_info, perm_stmt, gsi); |
1e1bca71 | 5925 | (*result_chain)[k] = data_ref; |
fb85abff | 5926 | } |
fb85abff | 5927 | } |
1e1bca71 | 5928 | else |
5929 | { | |
5930 | /* If length is not equal to 3, then only a power of 2 is supported. */ |
ac29ece2 | 5931 | gcc_assert (pow2p_hwi (length)); |
1e1bca71 | 5932 | |
c3fa7fe9 | 5933 | /* The encoding has a single stepped pattern. */ |
f08ee65f | 5934 | poly_uint64 nelt = TYPE_VECTOR_SUBPARTS (vectype); |
c3fa7fe9 | 5935 | vec_perm_builder sel (nelt, 1, 3); |
5936 | sel.quick_grow (3); | |
5937 | for (i = 0; i < 3; ++i) | |
1e1bca71 | 5938 | sel[i] = i * 2; |
1957c019 | 5939 | vec_perm_indices indices (sel, 2, nelt); |
5940 | perm_mask_even = vect_gen_perm_mask_checked (vectype, indices); | |
1e1bca71 | 5941 | |
c3fa7fe9 | 5942 | for (i = 0; i < 3; ++i) |
1e1bca71 | 5943 | sel[i] = i * 2 + 1; |
1957c019 | 5944 | indices.new_vector (sel, 2, nelt); |
5945 | perm_mask_odd = vect_gen_perm_mask_checked (vectype, indices); | |
fb85abff | 5946 | |
1e1bca71 | 5947 | for (i = 0; i < log_length; i++) |
5948 | { | |
5949 | for (j = 0; j < length; j += 2) | |
5950 | { | |
5951 | first_vect = dr_chain[j]; | |
5952 | second_vect = dr_chain[j+1]; | |
5953 | ||
5954 | /* data_ref = permute_even (first_data_ref, second_data_ref); */ | |
5955 | data_ref = make_temp_ssa_name (vectype, NULL, "vect_perm_even"); | |
e9cf809e | 5956 | perm_stmt = gimple_build_assign (data_ref, VEC_PERM_EXPR, |
5957 | first_vect, second_vect, | |
5958 | perm_mask_even); | |
a73182ff | 5959 | vect_finish_stmt_generation (stmt_info, perm_stmt, gsi); |
1e1bca71 | 5960 | (*result_chain)[j/2] = data_ref; |
5961 | ||
5962 | /* data_ref = permute_odd (first_data_ref, second_data_ref); */ | |
5963 | data_ref = make_temp_ssa_name (vectype, NULL, "vect_perm_odd"); | |
e9cf809e | 5964 | perm_stmt = gimple_build_assign (data_ref, VEC_PERM_EXPR, |
5965 | first_vect, second_vect, | |
5966 | perm_mask_odd); | |
a73182ff | 5967 | vect_finish_stmt_generation (stmt_info, perm_stmt, gsi); |
1e1bca71 | 5968 | (*result_chain)[j/2+length/2] = data_ref; |
5969 | } | |
5970 | memcpy (dr_chain.address (), result_chain->address (), | |
5971 | length * sizeof (tree)); | |
5972 | } | |
5973 | } | |
5974 | } | |
fb85abff | 5975 | |
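/* A minimal standalone sketch (plain C, compiled separately; not part of
   this file) of the two-step shuffle used above for LENGTH == 3 and
   NELT == 8.  permute () models VEC_PERM_EXPR on the concatenation of
   its two operands; the mask computations mirror perm3_mask_low and
   perm3_mask_high.  */

#include <stdio.h>

#define NELT 8

static void
permute (const int *v1, const int *v2, const int *sel, int *dst)
{
  for (int i = 0; i < NELT; i++)
    dst[i] = sel[i] < NELT ? v1[sel[i]] : v2[sel[i] - NELT];
}

int
main (void)
{
  int dr[3][NELT], out[3][NELT], sel[NELT], tmp[NELT];
  for (int k = 0; k < 3; k++)
    for (int i = 0; i < NELT; i++)
      dr[k][i] = k * NELT + i;

  for (int k = 0; k < 3; k++)
    {
      for (int i = 0; i < NELT; i++)		/* perm3_mask_low  */
	sel[i] = 3 * i + k < 2 * NELT ? 3 * i + k : 0;
      permute (dr[0], dr[1], sel, tmp);

      for (int i = 0, j = 0; i < NELT; i++)	/* perm3_mask_high  */
	sel[i] = 3 * i + k < 2 * NELT ? i : NELT + (NELT + k) % 3 + 3 * j++;
      permute (tmp, dr[2], sel, out[k]);
    }

  for (int k = 0; k < 3; k++, printf ("\n"))
    for (int i = 0; i < NELT; i++)
      printf ("%3d", out[k][i]);  /* 0 3 6 ... / 1 4 7 ... / 2 5 8 ...  */
  return 0;
}
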
926f7a02 | 5976 | /* Function vect_shift_permute_load_chain. |
5977 | ||
5978 | Given a chain of loads in DR_CHAIN of LENGTH 2 or 3, generate | |
5979 | sequence of stmts to reorder the input data accordingly. | |
5980 | Return the final references for loads in RESULT_CHAIN. | |
5981 | Return true if successed, false otherwise. | |
5982 | ||
5983 | E.g., LENGTH is 3 and the scalar type is short, i.e., VF is 8. | |
5984 | The input is 3 vectors each containing 8 elements. We assign a | |
5985 | number to each element, the input sequence is: | |
5986 | ||
5987 | 1st vec: 0 1 2 3 4 5 6 7 | |
5988 | 2nd vec: 8 9 10 11 12 13 14 15 | |
5989 | 3rd vec: 16 17 18 19 20 21 22 23 | |
5990 | ||
5991 | The output sequence should be: | |
5992 | ||
5993 | 1st vec: 0 3 6 9 12 15 18 21 | |
5994 | 2nd vec: 1 4 7 10 13 16 19 22 | |
5995 | 3rd vec: 2 5 8 11 14 17 20 23 | |
5996 | ||
5997 | We use 3 shuffle instructions and 3 * 3 - 1 shifts to create such output. | |
5998 | ||
5999 | First we shuffle all 3 vectors to get the correct element order: |
6000 | ||
6001 | 1st vec: ( 0 3 6) ( 1 4 7) ( 2 5) | |
6002 | 2nd vec: ( 8 11 14) ( 9 12 15) (10 13) | |
6003 | 3rd vec: (16 19 22) (17 20 23) (18 21) | |
6004 | ||
6005 | Next we unite and shift the vectors 3 times: |
6006 | ||
6007 | 1st step: | |
6008 | shift right by 6 the concatenation of: | |
6009 | "1st vec" and "2nd vec" | |
6010 | ( 0 3 6) ( 1 4 7) |( 2 5) _ ( 8 11 14) ( 9 12 15)| (10 13) | |
6011 | "2nd vec" and "3rd vec" | |
6012 | ( 8 11 14) ( 9 12 15) |(10 13) _ (16 19 22) (17 20 23)| (18 21) | |
6013 | "3rd vec" and "1st vec" | |
6014 | (16 19 22) (17 20 23) |(18 21) _ ( 0 3 6) ( 1 4 7)| ( 2 5) | |
6015 | | New vectors | | |
6016 | ||
6017 | So that now new vectors are: | |
6018 | ||
6019 | 1st vec: ( 2 5) ( 8 11 14) ( 9 12 15) | |
6020 | 2nd vec: (10 13) (16 19 22) (17 20 23) | |
6021 | 3rd vec: (18 21) ( 0 3 6) ( 1 4 7) | |
6022 | ||
6023 | 2nd step: | |
6024 | shift right by 5 the concatenation of: | |
6025 | "1st vec" and "3rd vec" | |
6026 | ( 2 5) ( 8 11 14) |( 9 12 15) _ (18 21) ( 0 3 6)| ( 1 4 7) | |
6027 | "2nd vec" and "1st vec" | |
6028 | (10 13) (16 19 22) |(17 20 23) _ ( 2 5) ( 8 11 14)| ( 9 12 15) | |
6029 | "3rd vec" and "2nd vec" | |
6030 | (18 21) ( 0 3 6) |( 1 4 7) _ (10 13) (16 19 22)| (17 20 23) | |
6031 | | New vectors | | |
6032 | ||
6033 | So that now new vectors are: | |
6034 | ||
6035 | 1st vec: ( 9 12 15) (18 21) ( 0 3 6) | |
6036 | 2nd vec: (17 20 23) ( 2 5) ( 8 11 14) | |
6037 | 3rd vec: ( 1 4 7) (10 13) (16 19 22) READY | |
6038 | ||
6039 | 3rd step: | |
6040 | shift right by 5 the concatenation of: | |
6041 | "1st vec" and "1st vec" | |
6042 | ( 9 12 15) (18 21) |( 0 3 6) _ ( 9 12 15) (18 21)| ( 0 3 6) | |
6043 | shift right by 3 the concatenation of: | |
6044 | "2nd vec" and "2nd vec" | |
6045 | (17 20 23) |( 2 5) ( 8 11 14) _ (17 20 23)| ( 2 5) ( 8 11 14) | |
6046 | | New vectors | | |
6047 | ||
6048 | So that now all vectors are READY: | |
6049 | 1st vec: ( 0 3 6) ( 9 12 15) (18 21) | |
6050 | 2nd vec: ( 2 5) ( 8 11 14) (17 20 23) | |
6051 | 3rd vec: ( 1 4 7) (10 13) (16 19 22) | |
6052 | ||
6053 | This algorithm is faster than one in vect_permute_load_chain if: | |
6054 | 1. "shift of a concatenation" is faster than general permutation. |
6055 | This is usually so. | |
6056 | 2. The TARGET machine can't execute vector instructions in parallel. | |
6057 | This is because each step of the algorithm depends on the previous one. |
6058 | The algorithm in vect_permute_load_chain is much more parallel. | |
6059 | ||
6060 | The algorithm is applicable only for LOAD CHAIN LENGTH less than VF. | |
6061 | */ | |
6062 | ||
6063 | static bool | |
6064 | vect_shift_permute_load_chain (vec<tree> dr_chain, | |
6065 | unsigned int length, | |
ecc42a77 | 6066 | stmt_vec_info stmt_info, |
926f7a02 | 6067 | gimple_stmt_iterator *gsi, |
6068 | vec<tree> *result_chain) | |
6069 | { | |
6070 | tree vect[3], vect_shift[3], data_ref, first_vect, second_vect; | |
6071 | tree perm2_mask1, perm2_mask2, perm3_mask; | |
6072 | tree select_mask, shift1_mask, shift2_mask, shift3_mask, shift4_mask; | |
42acab1c | 6073 | gimple *perm_stmt; |
926f7a02 | 6074 | |
1c2fef9a | 6075 | tree vectype = STMT_VINFO_VECTYPE (stmt_info); |
926f7a02 | 6076 | unsigned int i; |
926f7a02 | 6077 | loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info); |
6078 | ||
f08ee65f | 6079 | unsigned HOST_WIDE_INT nelt, vf; |
6080 | if (!TYPE_VECTOR_SUBPARTS (vectype).is_constant (&nelt) | |
6081 | || !LOOP_VINFO_VECT_FACTOR (loop_vinfo).is_constant (&vf)) | |
d75596cd | 6082 | /* Not supported for variable-length vectors. */ |
6083 | return false; | |
6084 | ||
1957c019 | 6085 | vec_perm_builder sel (nelt, nelt, 1); |
282dc861 | 6086 | sel.quick_grow (nelt); |
6087 | ||
926f7a02 | 6088 | result_chain->quick_grow (length); |
6089 | memcpy (result_chain->address (), dr_chain.address (), | |
6090 | length * sizeof (tree)); | |
6091 | ||
d75596cd | 6092 | if (pow2p_hwi (length) && vf > 4) |
926f7a02 | 6093 | { |
2cc1223e | 6094 | unsigned int j, log_length = exact_log2 (length); |
926f7a02 | 6095 | for (i = 0; i < nelt / 2; ++i) |
6096 | sel[i] = i * 2; | |
6097 | for (i = 0; i < nelt / 2; ++i) | |
6098 | sel[nelt / 2 + i] = i * 2 + 1; | |
1957c019 | 6099 | vec_perm_indices indices (sel, 2, nelt); |
6100 | if (!can_vec_perm_const_p (TYPE_MODE (vectype), indices)) | |
926f7a02 | 6101 | { |
6102 | if (dump_enabled_p ()) | |
6103 | dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location, | |
6104 | "shuffle of 2 fields structure is not \ | |
6105 | supported by target\n"); | |
6106 | return false; | |
6107 | } | |
1957c019 | 6108 | perm2_mask1 = vect_gen_perm_mask_checked (vectype, indices); |
926f7a02 | 6109 | |
6110 | for (i = 0; i < nelt / 2; ++i) | |
6111 | sel[i] = i * 2 + 1; | |
6112 | for (i = 0; i < nelt / 2; ++i) | |
6113 | sel[nelt / 2 + i] = i * 2; | |
1957c019 | 6114 | indices.new_vector (sel, 2, nelt); |
6115 | if (!can_vec_perm_const_p (TYPE_MODE (vectype), indices)) | |
926f7a02 | 6116 | { |
6117 | if (dump_enabled_p ()) | |
6118 | dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location, | |
6119 | "shuffle of 2 fields structure is not \ | |
6120 | supported by target\n"); | |
6121 | return false; | |
6122 | } | |
1957c019 | 6123 | perm2_mask2 = vect_gen_perm_mask_checked (vectype, indices); |
926f7a02 | 6124 | |
6125 | /* Generating permutation constant to shift all elements. | |
6126 | For vector length 8 it is {4 5 6 7 8 9 10 11}. */ | |
6127 | for (i = 0; i < nelt; i++) | |
6128 | sel[i] = nelt / 2 + i; | |
1957c019 | 6129 | indices.new_vector (sel, 2, nelt); |
6130 | if (!can_vec_perm_const_p (TYPE_MODE (vectype), indices)) | |
926f7a02 | 6131 | { |
6132 | if (dump_enabled_p ()) | |
6133 | dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location, | |
6134 | "shift permutation is not supported by target\n"); | |
6135 | return false; | |
6136 | } | |
1957c019 | 6137 | shift1_mask = vect_gen_perm_mask_checked (vectype, indices); |
926f7a02 | 6138 | |
6139 | /* Generating permutation constant to select vector from 2. | |
6140 | For vector length 8 it is {0 1 2 3 12 13 14 15}. */ | |
6141 | for (i = 0; i < nelt / 2; i++) | |
6142 | sel[i] = i; | |
6143 | for (i = nelt / 2; i < nelt; i++) | |
6144 | sel[i] = nelt + i; | |
1957c019 | 6145 | indices.new_vector (sel, 2, nelt); |
6146 | if (!can_vec_perm_const_p (TYPE_MODE (vectype), indices)) | |
926f7a02 | 6147 | { |
6148 | if (dump_enabled_p ()) | |
6149 | dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location, | |
6150 | "select is not supported by target\n"); | |
6151 | return false; | |
6152 | } | |
1957c019 | 6153 | select_mask = vect_gen_perm_mask_checked (vectype, indices); |
926f7a02 | 6154 | |
2cc1223e | 6155 | for (i = 0; i < log_length; i++) |
6156 | { | |
6157 | for (j = 0; j < length; j += 2) | |
6158 | { | |
6159 | first_vect = dr_chain[j]; | |
6160 | second_vect = dr_chain[j + 1]; | |
6161 | ||
6162 | data_ref = make_temp_ssa_name (vectype, NULL, "vect_shuffle2"); | |
e9cf809e | 6163 | perm_stmt = gimple_build_assign (data_ref, VEC_PERM_EXPR, |
6164 | first_vect, first_vect, | |
6165 | perm2_mask1); | |
a73182ff | 6166 | vect_finish_stmt_generation (stmt_info, perm_stmt, gsi); |
2cc1223e | 6167 | vect[0] = data_ref; |
6168 | ||
6169 | data_ref = make_temp_ssa_name (vectype, NULL, "vect_shuffle2"); | |
e9cf809e | 6170 | perm_stmt = gimple_build_assign (data_ref, VEC_PERM_EXPR, |
6171 | second_vect, second_vect, | |
6172 | perm2_mask2); | |
a73182ff | 6173 | vect_finish_stmt_generation (stmt_info, perm_stmt, gsi); |
2cc1223e | 6174 | vect[1] = data_ref; |
926f7a02 | 6175 | |
2cc1223e | 6176 | data_ref = make_temp_ssa_name (vectype, NULL, "vect_shift"); |
e9cf809e | 6177 | perm_stmt = gimple_build_assign (data_ref, VEC_PERM_EXPR, |
6178 | vect[0], vect[1], shift1_mask); | |
a73182ff | 6179 | vect_finish_stmt_generation (stmt_info, perm_stmt, gsi); |
2cc1223e | 6180 | (*result_chain)[j/2 + length/2] = data_ref; |
6181 | ||
6182 | data_ref = make_temp_ssa_name (vectype, NULL, "vect_select"); | |
e9cf809e | 6183 | perm_stmt = gimple_build_assign (data_ref, VEC_PERM_EXPR, |
6184 | vect[0], vect[1], select_mask); | |
a73182ff | 6185 | vect_finish_stmt_generation (stmt_info, perm_stmt, gsi); |
2cc1223e | 6186 | (*result_chain)[j/2] = data_ref; |
6187 | } | |
6188 | memcpy (dr_chain.address (), result_chain->address (), | |
6189 | length * sizeof (tree)); | |
6190 | } | |
926f7a02 | 6191 | return true; |
6192 | } | |
d75596cd | 6193 | if (length == 3 && vf > 2) |
926f7a02 | 6194 | { |
6195 | unsigned int k = 0, l = 0; | |
6196 | ||
6197 | /* Generating permutation constant to get all elements in right order. |
6198 | For vector length 8 it is {0 3 6 1 4 7 2 5}. */ | |
6199 | for (i = 0; i < nelt; i++) | |
6200 | { | |
6201 | if (3 * k + (l % 3) >= nelt) | |
6202 | { | |
6203 | k = 0; | |
6204 | l += (3 - (nelt % 3)); | |
6205 | } | |
6206 | sel[i] = 3 * k + (l % 3); | |
6207 | k++; | |
6208 | } | |
1957c019 | 6209 | vec_perm_indices indices (sel, 2, nelt); |
6210 | if (!can_vec_perm_const_p (TYPE_MODE (vectype), indices)) | |
926f7a02 | 6211 | { |
6212 | if (dump_enabled_p ()) | |
6213 | dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location, | |
6214 | "shuffle of 3 fields structure is not \ | |
6215 | supported by target\n"); | |
6216 | return false; | |
6217 | } | |
1957c019 | 6218 | perm3_mask = vect_gen_perm_mask_checked (vectype, indices); |
926f7a02 | 6219 | |
6220 | /* Generating permutation constant to shift all elements. | |
6221 | For vector length 8 it is {6 7 8 9 10 11 12 13}. */ | |
6222 | for (i = 0; i < nelt; i++) | |
6223 | sel[i] = 2 * (nelt / 3) + (nelt % 3) + i; | |
1957c019 | 6224 | indices.new_vector (sel, 2, nelt); |
6225 | if (!can_vec_perm_const_p (TYPE_MODE (vectype), indices)) | |
926f7a02 | 6226 | { |
6227 | if (dump_enabled_p ()) | |
6228 | dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location, | |
6229 | "shift permutation is not supported by target\n"); | |
6230 | return false; | |
6231 | } | |
1957c019 | 6232 | shift1_mask = vect_gen_perm_mask_checked (vectype, indices); |
926f7a02 | 6233 | |
6234 | /* Generating permutation constant to shift all elements. | |
6235 | For vector length 8 it is {5 6 7 8 9 10 11 12}. */ | |
6236 | for (i = 0; i < nelt; i++) | |
6237 | sel[i] = 2 * (nelt / 3) + 1 + i; | |
1957c019 | 6238 | indices.new_vector (sel, 2, nelt); |
6239 | if (!can_vec_perm_const_p (TYPE_MODE (vectype), indices)) | |
926f7a02 | 6240 | { |
6241 | if (dump_enabled_p ()) | |
6242 | dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location, | |
6243 | "shift permutation is not supported by target\n"); | |
6244 | return false; | |
6245 | } | |
1957c019 | 6246 | shift2_mask = vect_gen_perm_mask_checked (vectype, indices); |
926f7a02 | 6247 | |
6248 | /* Generating permutation constant to shift all elements. | |
6249 | For vector length 8 it is {3 4 5 6 7 8 9 10}. */ | |
6250 | for (i = 0; i < nelt; i++) | |
6251 | sel[i] = (nelt / 3) + (nelt % 3) / 2 + i; | |
1957c019 | 6252 | indices.new_vector (sel, 2, nelt); |
6253 | if (!can_vec_perm_const_p (TYPE_MODE (vectype), indices)) | |
926f7a02 | 6254 | { |
6255 | if (dump_enabled_p ()) | |
6256 | dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location, | |
6257 | "shift permutation is not supported by target\n"); | |
6258 | return false; | |
6259 | } | |
1957c019 | 6260 | shift3_mask = vect_gen_perm_mask_checked (vectype, indices); |
926f7a02 | 6261 | |
6262 | /* Generating permutation constant to shift all elements. | |
6263 | For vector length 8 it is {5 6 7 8 9 10 11 12}. */ | |
6264 | for (i = 0; i < nelt; i++) | |
6265 | sel[i] = 2 * (nelt / 3) + (nelt % 3) / 2 + i; | |
1957c019 | 6266 | indices.new_vector (sel, 2, nelt); |
6267 | if (!can_vec_perm_const_p (TYPE_MODE (vectype), indices)) | |
926f7a02 | 6268 | { |
6269 | if (dump_enabled_p ()) | |
6270 | dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location, | |
6271 | "shift permutation is not supported by target\n"); | |
6272 | return false; | |
6273 | } | |
1957c019 | 6274 | shift4_mask = vect_gen_perm_mask_checked (vectype, indices); |
926f7a02 | 6275 | |
6276 | for (k = 0; k < 3; k++) | |
6277 | { | |
321d85d9 | 6278 | data_ref = make_temp_ssa_name (vectype, NULL, "vect_shuffle3"); |
e9cf809e | 6279 | perm_stmt = gimple_build_assign (data_ref, VEC_PERM_EXPR, |
6280 | dr_chain[k], dr_chain[k], | |
6281 | perm3_mask); | |
a73182ff | 6282 | vect_finish_stmt_generation (stmt_info, perm_stmt, gsi); |
926f7a02 | 6283 | vect[k] = data_ref; |
6284 | } | |
6285 | ||
6286 | for (k = 0; k < 3; k++) | |
6287 | { | |
6288 | data_ref = make_temp_ssa_name (vectype, NULL, "vect_shift1"); | |
e9cf809e | 6289 | perm_stmt = gimple_build_assign (data_ref, VEC_PERM_EXPR, |
6290 | vect[k % 3], vect[(k + 1) % 3], | |
6291 | shift1_mask); | |
a73182ff | 6292 | vect_finish_stmt_generation (stmt_info, perm_stmt, gsi); |
926f7a02 | 6293 | vect_shift[k] = data_ref; |
6294 | } | |
6295 | ||
6296 | for (k = 0; k < 3; k++) | |
6297 | { | |
6298 | data_ref = make_temp_ssa_name (vectype, NULL, "vect_shift2"); | |
e9cf809e | 6299 | perm_stmt = gimple_build_assign (data_ref, VEC_PERM_EXPR, |
6300 | vect_shift[(4 - k) % 3], | |
6301 | vect_shift[(3 - k) % 3], | |
6302 | shift2_mask); | |
a73182ff | 6303 | vect_finish_stmt_generation (stmt_info, perm_stmt, gsi); |
926f7a02 | 6304 | vect[k] = data_ref; |
6305 | } | |
6306 | ||
6307 | (*result_chain)[3 - (nelt % 3)] = vect[2]; | |
6308 | ||
6309 | data_ref = make_temp_ssa_name (vectype, NULL, "vect_shift3"); | |
e9cf809e | 6310 | perm_stmt = gimple_build_assign (data_ref, VEC_PERM_EXPR, vect[0], |
6311 | vect[0], shift3_mask); | |
a73182ff | 6312 | vect_finish_stmt_generation (stmt_info, perm_stmt, gsi); |
926f7a02 | 6313 | (*result_chain)[nelt % 3] = data_ref; |
6314 | ||
6315 | data_ref = make_temp_ssa_name (vectype, NULL, "vect_shift4"); | |
e9cf809e | 6316 | perm_stmt = gimple_build_assign (data_ref, VEC_PERM_EXPR, vect[1], |
6317 | vect[1], shift4_mask); | |
a73182ff | 6318 | vect_finish_stmt_generation (stmt_info, perm_stmt, gsi); |
926f7a02 | 6319 | (*result_chain)[0] = data_ref; |
6320 | return true; | |
6321 | } | |
6322 | return false; | |
6323 | } | |
6324 | ||
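/* A minimal standalone sketch (plain C, compiled separately; not part of
   this file) of the shift-based variant for the power-of-2 case with
   LENGTH == 2 and NELT == 8: each input is shuffled into (evens | odds)
   resp. (odds | evens), after which one "shift of a concatenation" and
   one select produce the even and odd output vectors, mirroring
   perm2_mask1, perm2_mask2, shift1_mask and select_mask above.  */

#include <stdio.h>

#define NELT 8

static void
permute (const int *v1, const int *v2, const int *sel, int *dst)
{
  for (int i = 0; i < NELT; i++)
    dst[i] = sel[i] < NELT ? v1[sel[i]] : v2[sel[i] - NELT];
}

int
main (void)
{
  int v1[NELT], v2[NELT], t1[NELT], t2[NELT], even[NELT], odd[NELT];
  int mask1[NELT], mask2[NELT], shift1[NELT], select1[NELT];

  for (int i = 0; i < NELT; i++)
    {
      v1[i] = i;
      v2[i] = NELT + i;
      mask1[i] = i < NELT / 2 ? 2 * i : 2 * (i - NELT / 2) + 1;
      mask2[i] = i < NELT / 2 ? 2 * i + 1 : 2 * (i - NELT / 2);
      shift1[i] = NELT / 2 + i;			/* {4 5 6 7 8 9 10 11}  */
      select1[i] = i < NELT / 2 ? i : NELT + i;	/* {0 1 2 3 12 13 14 15}  */
    }

  permute (v1, v1, mask1, t1);		/* Evens of v1, then odds.  */
  permute (v2, v2, mask2, t2);		/* Odds of v2, then evens.  */
  permute (t1, t2, shift1, odd);	/* Middle of <t1, t2>.  */
  permute (t1, t2, select1, even);	/* Outer halves of <t1, t2>.  */

  for (int i = 0; i < NELT; i++)
    printf ("%d %d\n", even[i], odd[i]);
  return 0;
}
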
ee612634 | 6325 | /* Function vect_transform_grouped_load. |
fb85abff | 6326 | |
6327 | Given a chain of input interleaved data-refs (in DR_CHAIN), build statements | |
6328 | to perform their permutation and ascribe the result vectorized statements to | |
6329 | the scalar statements. | |
6330 | */ | |
6331 | ||
481fc474 | 6332 | void |
ecc42a77 | 6333 | vect_transform_grouped_load (stmt_vec_info stmt_info, vec<tree> dr_chain, |
6334 | int size, gimple_stmt_iterator *gsi) | |
fb85abff | 6335 | { |
3754d046 | 6336 | machine_mode mode; |
1e094109 | 6337 | vec<tree> result_chain = vNULL; |
fb85abff | 6338 | |
48e1416a | 6339 | /* DR_CHAIN contains input data-refs that are a part of the interleaving. |
6340 | RESULT_CHAIN is the output of vect_permute_load_chain; it contains permuted |
fb85abff | 6341 | vectors that are ready for vector computation. */ |
f1f41a6c | 6342 | result_chain.create (size); |
926f7a02 | 6343 | |
6344 | /* If the reassociation width for the vector type is 2 or greater, the target |
6345 | machine can execute 2 or more vector instructions in parallel. Otherwise |
6346 | try to get the chain for the load group using vect_shift_permute_load_chain. */ |
1c2fef9a | 6347 | mode = TYPE_MODE (STMT_VINFO_VECTYPE (stmt_info)); |
926f7a02 | 6348 | if (targetm.sched.reassociation_width (VEC_PERM_EXPR, mode) > 1 |
ac29ece2 | 6349 | || pow2p_hwi (size) |
a73182ff | 6350 | || !vect_shift_permute_load_chain (dr_chain, size, stmt_info, |
926f7a02 | 6351 | gsi, &result_chain)) |
a73182ff | 6352 | vect_permute_load_chain (dr_chain, size, stmt_info, gsi, &result_chain); |
6353 | vect_record_grouped_load_vectors (stmt_info, result_chain); | |
f1f41a6c | 6354 | result_chain.release (); |
94b7b4dd | 6355 | } |
6356 | ||
ee612634 | 6357 | /* RESULT_CHAIN contains the output of a group of grouped loads that were |
ecc42a77 | 6358 | generated as part of the vectorization of STMT_INFO. Assign the statement |
94b7b4dd | 6359 | for each vector to the associated scalar statement. */ |
6360 | ||
6361 | void | |
ecc42a77 | 6362 | vect_record_grouped_load_vectors (stmt_vec_info stmt_info, |
6363 | vec<tree> result_chain) | |
94b7b4dd | 6364 | { |
aebdbd31 | 6365 | vec_info *vinfo = stmt_info->vinfo; |
cd24aa3c | 6366 | stmt_vec_info first_stmt_info = DR_GROUP_FIRST_ELEMENT (stmt_info); |
94b7b4dd | 6367 | unsigned int i, gap_count; |
6368 | tree tmp_data_ref; | |
fb85abff | 6369 | |
48e1416a | 6370 | /* Put a permuted data-ref in the VECTORIZED_STMT field. |
6371 | Since we scan the chain starting from its first node, their order |
fb85abff | 6372 | corresponds to the order of data-refs in RESULT_CHAIN. */ |
cd24aa3c | 6373 | stmt_vec_info next_stmt_info = first_stmt_info; |
fb85abff | 6374 | gap_count = 1; |
f1f41a6c | 6375 | FOR_EACH_VEC_ELT (result_chain, i, tmp_data_ref) |
fb85abff | 6376 | { |
cd24aa3c | 6377 | if (!next_stmt_info) |
fb85abff | 6378 | break; |
6379 | ||
282bf14c | 6380 | /* Skip the gaps. Loads created for the gaps will be removed by the dead |
6381 | code elimination pass later. No need to check for the first stmt in | |
fb85abff | 6382 | the group, since it always exists. |
e1009321 | 6383 | DR_GROUP_GAP is the number of steps in elements from the previous |
6384 | access (if there is no gap DR_GROUP_GAP is 1). We skip loads that | |
282bf14c | 6385 | correspond to the gaps. */ |
cd24aa3c | 6386 | if (next_stmt_info != first_stmt_info |
6387 | && gap_count < DR_GROUP_GAP (next_stmt_info)) | |
6883ce83 | 6388 | { |
6389 | gap_count++; | |
6390 | continue; | |
6391 | } | |
fb85abff | 6392 | |
6883ce83 | 6393 | /* ??? The following needs cleanup after the removal of |
6394 | DR_GROUP_SAME_DR_STMT. */ | |
6395 | if (next_stmt_info) | |
fb85abff | 6396 | { |
aebdbd31 | 6397 | stmt_vec_info new_stmt_info = vinfo->lookup_def (tmp_data_ref); |
fb85abff | 6398 | /* We assume that if VEC_STMT is not NULL, this is a case of multiple |
6399 | copies, and we put the new vector statement in the first available | |
6400 | RELATED_STMT. */ | |
cd24aa3c | 6401 | if (!STMT_VINFO_VEC_STMT (next_stmt_info)) |
6402 | STMT_VINFO_VEC_STMT (next_stmt_info) = new_stmt_info; | |
fb85abff | 6403 | else |
6404 | { | |
6883ce83 | 6405 | stmt_vec_info prev_stmt_info |
6406 | = STMT_VINFO_VEC_STMT (next_stmt_info); | |
6407 | stmt_vec_info rel_stmt_info | |
6408 | = STMT_VINFO_RELATED_STMT (prev_stmt_info); | |
6409 | while (rel_stmt_info) | |
6410 | { | |
6411 | prev_stmt_info = rel_stmt_info; | |
6412 | rel_stmt_info = STMT_VINFO_RELATED_STMT (rel_stmt_info); | |
6413 | } | |
fb85abff | 6414 | |
6883ce83 | 6415 | STMT_VINFO_RELATED_STMT (prev_stmt_info) = new_stmt_info; |
fb85abff | 6416 | } |
6417 | ||
cd24aa3c | 6418 | next_stmt_info = DR_GROUP_NEXT_ELEMENT (next_stmt_info); |
fb85abff | 6419 | gap_count = 1; |
fb85abff | 6420 | } |
6421 | } | |
fb85abff | 6422 | } |
6423 | ||
6424 | /* Function vect_can_force_dr_alignment_p. |
6425 | ||
6426 | Returns whether the alignment of a DECL can be forced to be aligned | |
6427 | on an ALIGNMENT-bit boundary. */ |
6428 | ||
48e1416a | 6429 | bool |
e092c20e | 6430 | vect_can_force_dr_alignment_p (const_tree decl, poly_uint64 alignment) |
fb85abff | 6431 | { |
53e9c5c4 | 6432 | if (!VAR_P (decl)) |
fb85abff | 6433 | return false; |
6434 | ||
331d5983 | 6435 | if (decl_in_symtab_p (decl) |
6436 | && !symtab_node::get (decl)->can_increase_alignment_p ()) | |
8cab13cf | 6437 | return false; |
6438 | ||
fb85abff | 6439 | if (TREE_STATIC (decl)) |
c34f18f1 | 6440 | return (known_le (alignment, |
6441 | (unsigned HOST_WIDE_INT) MAX_OFILE_ALIGNMENT)); | |
fb85abff | 6442 | else |
e092c20e | 6443 | return (known_le (alignment, (unsigned HOST_WIDE_INT) MAX_STACK_ALIGNMENT)); |
fb85abff | 6444 | } |
6445 | ||
fb85abff | 6446 | |
abc9513d | 6447 | /* Return whether the data reference DR_INFO is supported with respect to its |
0822b158 | 6448 | alignment. |
6449 | If CHECK_ALIGNED_ACCESSES is TRUE, check if the access is supported even | |
6450 | if it is aligned, i.e., check if it is possible to vectorize it with a different |
fb85abff | 6451 | alignment. */ |
6452 | ||
6453 | enum dr_alignment_support | |
abc9513d | 6454 | vect_supportable_dr_alignment (dr_vec_info *dr_info, |
0822b158 | 6455 | bool check_aligned_accesses) |
fb85abff | 6456 | { |
abc9513d | 6457 | data_reference *dr = dr_info->dr; |
6458 | stmt_vec_info stmt_info = dr_info->stmt; | |
fb85abff | 6459 | tree vectype = STMT_VINFO_VECTYPE (stmt_info); |
3754d046 | 6460 | machine_mode mode = TYPE_MODE (vectype); |
37545e54 | 6461 | loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info); |
2e966e2a | 6462 | class loop *vect_loop = NULL; |
37545e54 | 6463 | bool nested_in_vect_loop = false; |
fb85abff | 6464 | |
abc9513d | 6465 | if (aligned_access_p (dr_info) && !check_aligned_accesses) |
fb85abff | 6466 | return dr_aligned; |
6467 | ||
c71d3c24 | 6468 | /* For now assume all conditional loads/stores support unaligned |
6469 | access without any special code. */ | |
0219dc42 | 6470 | if (gcall *stmt = dyn_cast <gcall *> (stmt_info->stmt)) |
6471 | if (gimple_call_internal_p (stmt) | |
6472 | && (gimple_call_internal_fn (stmt) == IFN_MASK_LOAD | |
6473 | || gimple_call_internal_fn (stmt) == IFN_MASK_STORE)) | |
6474 | return dr_unaligned_supported; | |
c71d3c24 | 6475 | |
ad074595 | 6476 | if (loop_vinfo) |
6477 | { | |
6478 | vect_loop = LOOP_VINFO_LOOP (loop_vinfo); | |
0219dc42 | 6479 | nested_in_vect_loop = nested_in_vect_loop_p (vect_loop, stmt_info); |
ad074595 | 6480 | } |
37545e54 | 6481 | |
fb85abff | 6482 | /* Possibly unaligned access. */ |
6483 | ||
6484 | /* We can choose between using the implicit realignment scheme (generating | |
6485 | a misaligned_move stmt) and the explicit realignment scheme (generating | |
282bf14c | 6486 | aligned loads with a REALIGN_LOAD). There are two variants of the |
6487 | explicit realignment scheme: optimized and unoptimized. | |
fb85abff | 6488 | We can optimize the realignment only if the step between consecutive |
6489 | vector loads is equal to the vector size. Since the vector memory | |
6490 | accesses advance in steps of VS (Vector Size) in the vectorized loop, it | |
6491 | is guaranteed that the misalignment amount remains the same throughout the | |
6492 | execution of the vectorized loop. Therefore, we can create the | |
6493 | "realignment token" (the permutation mask that is passed to REALIGN_LOAD) | |
6494 | at the loop preheader. | |
6495 | ||
6496 | However, in the case of outer-loop vectorization, when vectorizing a | |
6497 | memory access in the inner-loop nested within the LOOP that is now being | |
6498 | vectorized, while it is guaranteed that the misalignment of the | |
6499 | vectorized memory access will remain the same in different outer-loop | |
6500 | iterations, it is *not* guaranteed that is will remain the same throughout | |
6501 | the execution of the inner-loop. This is because the inner-loop advances | |
6502 | with the original scalar step (and not in steps of VS). If the inner-loop | |
6503 | step happens to be a multiple of VS, then the misalignment remains fixed | |
6504 | and we can use the optimized realignment scheme. For example: | |
6505 | ||
6506 | for (i=0; i<N; i++) | |
6507 | for (j=0; j<M; j++) | |
6508 | s += a[i+j]; | |
6509 | ||
6510 | When vectorizing the i-loop in the above example, the step between | |
6511 | consecutive vector loads is 1, and so the misalignment does not remain | |
6512 | fixed across the execution of the inner-loop, and the realignment cannot | |
6513 | be optimized (as illustrated in the following pseudo vectorized loop): | |
6514 | ||
6515 | for (i=0; i<N; i+=4) | |
6516 | for (j=0; j<M; j++){ | |
6517 | vs += vp[i+j]; // misalignment of &vp[i+j] is {0,1,2,3,0,1,2,3,...} | |
6518 | // when j is {0,1,2,3,4,5,6,7,...} respectively. | |
6519 | // (assuming that we start from an aligned address). | |
6520 | } | |
6521 | ||
6522 | We therefore have to use the unoptimized realignment scheme: | |
6523 | ||
6524 | for (i=0; i<N; i+=4) | |
6525 | for (j=k; j<M; j+=4) | |
6526 | vs += vp[i+j]; // misalignment of &vp[i+j] is always k (assuming | |
6527 | // that the misalignment of the initial address is | |
6528 | // 0). | |
6529 | ||
6530 | The loop can then be vectorized as follows: | |
6531 | ||
6532 | for (k=0; k<4; k++){ | |
6533 | rt = get_realignment_token (&vp[k]); | |
6534 | for (i=0; i<N; i+=4){ | |
6535 | v1 = vp[i+k]; | |
6536 | for (j=k; j<M; j+=4){ | |
6537 | v2 = vp[i+j+VS-1]; | |
6538 | va = REALIGN_LOAD <v1,v2,rt>; | |
6539 | vs += va; | |
6540 | v1 = v2; | |
6541 | } | |
6542 | } | |
6543 | } */ | |
6544 | ||
6545 | if (DR_IS_READ (dr)) | |
6546 | { | |
c6b19c5f | 6547 | bool is_packed = false; |
6548 | tree type = (TREE_TYPE (DR_REF (dr))); | |
6549 | ||
d6bf3b14 | 6550 | if (optab_handler (vec_realign_load_optab, mode) != CODE_FOR_nothing |
fb85abff | 6551 | && (!targetm.vectorize.builtin_mask_for_load |
6552 | || targetm.vectorize.builtin_mask_for_load ())) | |
6553 | { | |
6554 | tree vectype = STMT_VINFO_VECTYPE (stmt_info); | |
ca1a4077 | 6555 | |
6556 | /* If we are doing SLP then the accesses need not have the | |
6557 | same alignment, instead it depends on the SLP group size. */ | |
6558 | if (loop_vinfo | |
6559 | && STMT_SLP_TYPE (stmt_info) | |
d75596cd | 6560 | && !multiple_p (LOOP_VINFO_VECT_FACTOR (loop_vinfo) |
cd24aa3c | 6561 | * (DR_GROUP_SIZE |
6562 | (DR_GROUP_FIRST_ELEMENT (stmt_info))), | |
d75596cd | 6563 | TYPE_VECTOR_SUBPARTS (vectype))) |
ca1a4077 | 6564 | ; |
6565 | else if (!loop_vinfo | |
6566 | || (nested_in_vect_loop | |
52acb7ae | 6567 | && maybe_ne (TREE_INT_CST_LOW (DR_STEP (dr)), |
6568 | GET_MODE_SIZE (TYPE_MODE (vectype))))) | |
fb85abff | 6569 | return dr_explicit_realign; |
6570 | else | |
6571 | return dr_explicit_realign_optimized; | |
6572 | } | |
abc9513d | 6573 | if (!known_alignment_for_access_p (dr_info)) |
cfa724cf | 6574 | is_packed = not_size_aligned (DR_REF (dr)); |
48e1416a | 6575 | |
33a82fb9 | 6576 | if (targetm.vectorize.support_vector_misalignment |
abc9513d | 6577 | (mode, type, DR_MISALIGNMENT (dr_info), is_packed)) |
fb85abff | 6578 | /* Can't software pipeline the loads, but can at least do them. */ |
6579 | return dr_unaligned_supported; | |
6580 | } | |
c6b19c5f | 6581 | else |
6582 | { | |
6583 | bool is_packed = false; | |
6584 | tree type = (TREE_TYPE (DR_REF (dr))); | |
fb85abff | 6585 | |
abc9513d | 6586 | if (!known_alignment_for_access_p (dr_info)) |
cfa724cf | 6587 | is_packed = not_size_aligned (DR_REF (dr)); |
48e1416a | 6588 | |
33a82fb9 | 6589 | if (targetm.vectorize.support_vector_misalignment |
abc9513d | 6590 | (mode, type, DR_MISALIGNMENT (dr_info), is_packed)) |
c6b19c5f | 6591 | return dr_unaligned_supported; |
6592 | } | |
48e1416a | 6593 | |
fb85abff | 6594 | /* Unsupported. */ |
6595 | return dr_unaligned_unsupported; | |
6596 | } |
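
/* Illustrative sketch (hypothetical helper, not verbatim GCC code):
   callers typically dispatch on the classification returned above.  A
   minimal "is this access vectorizable at all?" wrapper could be:  */

static bool
vect_dr_access_vectorizable_p (dr_vec_info *dr_info)
{
  switch (vect_supportable_dr_alignment (dr_info, false))
    {
    case dr_aligned:			/* No special handling needed.  */
    case dr_unaligned_supported:	/* Target copes with misalignment.  */
    case dr_explicit_realign:		/* REALIGN_LOAD, token per access.  */
    case dr_explicit_realign_optimized:	/* REALIGN_LOAD, token hoisted.  */
      return true;
    default:				/* dr_unaligned_unsupported.  */
      return false;
    }
}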