]>
Commit | Line | Data |
---|---|---|
b8698a0f | 1 | /* Data References Analysis and Manipulation Utilities for Vectorization. |
8d9254fc | 2 | Copyright (C) 2003-2020 Free Software Foundation, Inc. |
b8698a0f | 3 | Contributed by Dorit Naishlos <dorit@il.ibm.com> |
ebfd146a IR |
4 | and Ira Rosen <irar@il.ibm.com> |
5 | ||
6 | This file is part of GCC. | |
7 | ||
8 | GCC is free software; you can redistribute it and/or modify it under | |
9 | the terms of the GNU General Public License as published by the Free | |
10 | Software Foundation; either version 3, or (at your option) any later | |
11 | version. | |
12 | ||
13 | GCC is distributed in the hope that it will be useful, but WITHOUT ANY | |
14 | WARRANTY; without even the implied warranty of MERCHANTABILITY or | |
15 | FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License | |
16 | for more details. | |
17 | ||
18 | You should have received a copy of the GNU General Public License | |
19 | along with GCC; see the file COPYING3. If not see | |
20 | <http://www.gnu.org/licenses/>. */ | |
21 | ||
22 | #include "config.h" | |
23 | #include "system.h" | |
24 | #include "coretypes.h" | |
c7131fb2 | 25 | #include "backend.h" |
957060b5 AM |
26 | #include "target.h" |
27 | #include "rtl.h" | |
ebfd146a | 28 | #include "tree.h" |
c7131fb2 | 29 | #include "gimple.h" |
957060b5 | 30 | #include "predict.h" |
4d0cdd0c | 31 | #include "memmodel.h" |
957060b5 | 32 | #include "tm_p.h" |
c7131fb2 | 33 | #include "ssa.h" |
957060b5 AM |
34 | #include "optabs-tree.h" |
35 | #include "cgraph.h" | |
957060b5 | 36 | #include "dumpfile.h" |
c7131fb2 | 37 | #include "alias.h" |
40e23961 | 38 | #include "fold-const.h" |
d8a2d370 | 39 | #include "stor-layout.h" |
2fb9a547 | 40 | #include "tree-eh.h" |
45b0be94 | 41 | #include "gimplify.h" |
5be5c238 | 42 | #include "gimple-iterator.h" |
18f429e2 | 43 | #include "gimplify-me.h" |
e28030cf AM |
44 | #include "tree-ssa-loop-ivopts.h" |
45 | #include "tree-ssa-loop-manip.h" | |
442b4905 | 46 | #include "tree-ssa-loop.h" |
ebfd146a | 47 | #include "cfgloop.h" |
ebfd146a IR |
48 | #include "tree-scalar-evolution.h" |
49 | #include "tree-vectorizer.h" | |
2eb79bbb | 50 | #include "expr.h" |
9b2b7279 | 51 | #include "builtins.h" |
bb642979 | 52 | #include "tree-cfg.h" |
9adee305 | 53 | #include "tree-hash-traits.h" |
f151c9e1 | 54 | #include "vec-perm-indices.h" |
bfaa08b7 | 55 | #include "internal-fn.h" |
ebfd146a | 56 | |
272c6793 RS |
57 | /* Return true if load- or store-lanes optab OPTAB is implemented for |
58 | COUNT vectors of type VECTYPE. NAME is the name of OPTAB. */ | |
59 | ||
60 | static bool | |
61 | vect_lanes_optab_supported_p (const char *name, convert_optab optab, | |
62 | tree vectype, unsigned HOST_WIDE_INT count) | |
63 | { | |
695da534 | 64 | machine_mode mode, array_mode; |
272c6793 RS |
65 | bool limit_p; |
66 | ||
67 | mode = TYPE_MODE (vectype); | |
695da534 | 68 | if (!targetm.array_mode (mode, count).exists (&array_mode)) |
272c6793 | 69 | { |
695da534 RS |
70 | poly_uint64 bits = count * GET_MODE_BITSIZE (mode); |
71 | limit_p = !targetm.array_mode_supported_p (mode, count); | |
72 | if (!int_mode_for_size (bits, limit_p).exists (&array_mode)) | |
73 | { | |
74 | if (dump_enabled_p ()) | |
75 | dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location, | |
6f795a92 | 76 | "no array mode for %s[%wu]\n", |
695da534 RS |
77 | GET_MODE_NAME (mode), count); |
78 | return false; | |
79 | } | |
272c6793 RS |
80 | } |
81 | ||
82 | if (convert_optab_handler (optab, array_mode, mode) == CODE_FOR_nothing) | |
83 | { | |
73fbfcad | 84 | if (dump_enabled_p ()) |
78c60e3d | 85 | dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location, |
e645e942 | 86 | "cannot use %s<%s><%s>\n", name, |
78c60e3d | 87 | GET_MODE_NAME (array_mode), GET_MODE_NAME (mode)); |
272c6793 RS |
88 | return false; |
89 | } | |
90 | ||
73fbfcad | 91 | if (dump_enabled_p ()) |
78c60e3d | 92 | dump_printf_loc (MSG_NOTE, vect_location, |
e645e942 | 93 | "can use %s<%s><%s>\n", name, GET_MODE_NAME (array_mode), |
78c60e3d | 94 | GET_MODE_NAME (mode)); |
272c6793 RS |
95 | |
96 | return true; | |
97 | } | |
98 | ||
99 | ||
/* Return the smallest scalar part of STMT_INFO.
   This is used to determine the vectype of the stmt.  We generally set the
   vectype according to the type of the result (lhs).  For stmts whose
   result-type is different than the type of the arguments (e.g., demotion,
   promotion), vectype will be reset appropriately (later).  Note that we have
   to visit the smallest datatype in this function, because that determines the
   VF.  If the smallest datatype in the loop is present only as the rhs of a
   promotion operation - we'd miss it.
   Such a case, where a variable of this datatype does not appear in the lhs
   anywhere in the loop, can only occur if it's an invariant: e.g.:
   'int_x = (int) short_inv', which we'd expect to have been optimized away by
   invariant motion.  However, we cannot rely on invariant motion to always
   take invariants out of the loop, and so in the case of promotion we also
   have to check the rhs.
   LHS_SIZE_UNIT and RHS_SIZE_UNIT contain the sizes of the corresponding
   types.  */

tree
vect_get_smallest_scalar_type (stmt_vec_info stmt_info,
			       HOST_WIDE_INT *lhs_size_unit,
			       HOST_WIDE_INT *rhs_size_unit)
{
  tree scalar_type = gimple_expr_type (stmt_info->stmt);
  HOST_WIDE_INT lhs, rhs;

  /* During the analysis phase, this function is called on arbitrary
     statements that might not have scalar results.  */
  if (!tree_fits_uhwi_p (TYPE_SIZE_UNIT (scalar_type)))
    return scalar_type;

  /* Start by assuming both sides have the size of the result type.  */
  lhs = rhs = TREE_INT_CST_LOW (TYPE_SIZE_UNIT (scalar_type));

  /* For conversions and widening operations the rhs operand may have a
     smaller scalar type than the result; check the first operand.  */
  gassign *assign = dyn_cast <gassign *> (stmt_info->stmt);
  if (assign
      && (gimple_assign_cast_p (assign)
	  || gimple_assign_rhs_code (assign) == DOT_PROD_EXPR
	  || gimple_assign_rhs_code (assign) == WIDEN_SUM_EXPR
	  || gimple_assign_rhs_code (assign) == WIDEN_MULT_EXPR
	  || gimple_assign_rhs_code (assign) == WIDEN_LSHIFT_EXPR
	  || gimple_assign_rhs_code (assign) == FLOAT_EXPR))
    {
      tree rhs_type = TREE_TYPE (gimple_assign_rhs1 (assign));

      rhs = TREE_INT_CST_LOW (TYPE_SIZE_UNIT (rhs_type));
      if (rhs < lhs)
	scalar_type = rhs_type;
    }
  else if (gcall *call = dyn_cast <gcall *> (stmt_info->stmt))
    {
      /* I selects which call argument (if any) supplies the rhs type;
	 ~0U means "none".  */
      unsigned int i = 0;
      if (gimple_call_internal_p (call))
	{
	  internal_fn ifn = gimple_call_internal_fn (call);
	  if (internal_load_fn_p (ifn) || internal_store_fn_p (ifn))
	    /* gimple_expr_type already picked the type of the loaded
	       or stored data.  */
	    i = ~0U;
	  else if (internal_fn_mask_index (ifn) == 0)
	    i = 1;
	}
      if (i < gimple_call_num_args (call))
	{
	  tree rhs_type = TREE_TYPE (gimple_call_arg (call, i));
	  if (tree_fits_uhwi_p (TYPE_SIZE_UNIT (rhs_type)))
	    {
	      rhs = TREE_INT_CST_LOW (TYPE_SIZE_UNIT (rhs_type));
	      if (rhs < lhs)
		scalar_type = rhs_type;
	    }
	}
    }

  *lhs_size_unit = lhs;
  *rhs_size_unit = rhs;
  return scalar_type;
}
176 | ||
177 | ||
ebfd146a IR |
178 | /* Insert DDR into LOOP_VINFO list of ddrs that may alias and need to be |
179 | tested at run-time. Return TRUE if DDR was successfully inserted. | |
180 | Return false if versioning is not supported. */ | |
181 | ||
f4ebbd24 | 182 | static opt_result |
ebfd146a IR |
183 | vect_mark_for_runtime_alias_test (ddr_p ddr, loop_vec_info loop_vinfo) |
184 | { | |
99b1c316 | 185 | class loop *loop = LOOP_VINFO_LOOP (loop_vinfo); |
ebfd146a | 186 | |
028d4092 | 187 | if ((unsigned) param_vect_max_version_for_alias_checks == 0) |
f4ebbd24 DM |
188 | return opt_result::failure_at (vect_location, |
189 | "will not create alias checks, as" | |
190 | " --param vect-max-version-for-alias-checks" | |
191 | " == 0\n"); | |
ebfd146a | 192 | |
f4ebbd24 DM |
193 | opt_result res |
194 | = runtime_alias_check_p (ddr, loop, | |
195 | optimize_loop_nest_for_speed_p (loop)); | |
196 | if (!res) | |
197 | return res; | |
319e6439 | 198 | |
9771b263 | 199 | LOOP_VINFO_MAY_ALIAS_DDRS (loop_vinfo).safe_push (ddr); |
f4ebbd24 | 200 | return opt_result::success (); |
ebfd146a IR |
201 | } |
202 | ||
a57776a1 RS |
203 | /* Record that loop LOOP_VINFO needs to check that VALUE is nonzero. */ |
204 | ||
205 | static void | |
206 | vect_check_nonzero_value (loop_vec_info loop_vinfo, tree value) | |
207 | { | |
208 | vec<tree> checks = LOOP_VINFO_CHECK_NONZERO (loop_vinfo); | |
209 | for (unsigned int i = 0; i < checks.length(); ++i) | |
210 | if (checks[i] == value) | |
211 | return; | |
212 | ||
213 | if (dump_enabled_p ()) | |
3c2a8ed0 DM |
214 | dump_printf_loc (MSG_NOTE, vect_location, |
215 | "need run-time check that %T is nonzero\n", | |
216 | value); | |
a57776a1 RS |
217 | LOOP_VINFO_CHECK_NONZERO (loop_vinfo).safe_push (value); |
218 | } | |
219 | ||
89fa689a RS |
/* Return true if we know that the order of vectorized DR_INFO_A and
   vectorized DR_INFO_B will be the same as the order of DR_INFO_A and
   DR_INFO_B.  At least one of the accesses is a write.  */

static bool
vect_preserves_scalar_order_p (dr_vec_info *dr_info_a, dr_vec_info *dr_info_b)
{
  stmt_vec_info stmtinfo_a = dr_info_a->stmt;
  stmt_vec_info stmtinfo_b = dr_info_b->stmt;

  /* Single statements are always kept in their original order.  */
  if (!STMT_VINFO_GROUPED_ACCESS (stmtinfo_a)
      && !STMT_VINFO_GROUPED_ACCESS (stmtinfo_b))
    return true;

  /* STMT_A and STMT_B belong to overlapping groups.  All loads are
     emitted at the position of the first scalar load.
     Stores in a group are emitted at the position of the last scalar store.
     Compute that position and check whether the resulting order matches
     the current one.  */
  stmt_vec_info il_a = DR_GROUP_FIRST_ELEMENT (stmtinfo_a);
  if (il_a)
    {
      /* For stores, find the latest group member; for loads, find the
	 earliest one (the loop keeps S whenever IL_A is the later).  */
      if (DR_IS_WRITE (STMT_VINFO_DATA_REF (stmtinfo_a)))
	for (stmt_vec_info s = DR_GROUP_NEXT_ELEMENT (il_a); s;
	     s = DR_GROUP_NEXT_ELEMENT (s))
	  il_a = get_later_stmt (il_a, s);
      else /* DR_IS_READ */
	for (stmt_vec_info s = DR_GROUP_NEXT_ELEMENT (il_a); s;
	     s = DR_GROUP_NEXT_ELEMENT (s))
	  if (get_later_stmt (il_a, s) == il_a)
	    il_a = s;
    }
  else
    /* Non-grouped access: it is emitted where it stands.  */
    il_a = stmtinfo_a;
  stmt_vec_info il_b = DR_GROUP_FIRST_ELEMENT (stmtinfo_b);
  if (il_b)
    {
      /* Same computation for the second access.  */
      if (DR_IS_WRITE (STMT_VINFO_DATA_REF (stmtinfo_b)))
	for (stmt_vec_info s = DR_GROUP_NEXT_ELEMENT (il_b); s;
	     s = DR_GROUP_NEXT_ELEMENT (s))
	  il_b = get_later_stmt (il_b, s);
      else /* DR_IS_READ */
	for (stmt_vec_info s = DR_GROUP_NEXT_ELEMENT (il_b); s;
	     s = DR_GROUP_NEXT_ELEMENT (s))
	  if (get_later_stmt (il_b, s) == il_b)
	    il_b = s;
    }
  else
    il_b = stmtinfo_b;
  /* The order is preserved iff the emission positions compare the same
     way as the original statements.  */
  bool a_after_b = (get_later_stmt (stmtinfo_a, stmtinfo_b) == stmtinfo_a);
  return (get_later_stmt (il_a, il_b) == il_a) == a_after_b;
}
a70d6342 | 273 | |
dfbddbeb RS |
/* A subroutine of vect_analyze_data_ref_dependence.  Handle
   DDR_COULD_BE_INDEPENDENT_P ddr DDR that has a known set of dependence
   distances.  These distances are conservatively correct but they don't
   reflect a guaranteed dependence.

   Return true if this function does all the work necessary to avoid
   an alias or false if the caller should use the dependence distances
   to limit the vectorization factor in the usual way.  LOOP_DEPTH is
   the depth of the loop described by LOOP_VINFO and the other arguments
   are as for vect_analyze_data_ref_dependence.  */

static bool
vect_analyze_possibly_independent_ddr (data_dependence_relation *ddr,
				       loop_vec_info loop_vinfo,
				       int loop_depth, unsigned int *max_vf)
{
  class loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
  lambda_vector dist_v;
  unsigned int i;
  FOR_EACH_VEC_ELT (DDR_DIST_VECTS (ddr), i, dist_v)
    {
      int dist = dist_v[loop_depth];
      /* Distances of zero, or positive distances in a reversed DDR
	 (i.e. actually negative distances), need no handling here.  */
      if (dist != 0 && !(dist > 0 && DDR_REVERSED_P (ddr)))
	{
	  /* If the user asserted safelen >= DIST consecutive iterations
	     can be executed concurrently, assume independence.

	     ??? An alternative would be to add the alias check even
	     in this case, and vectorize the fallback loop with the
	     maximum VF set to safelen.  However, if the user has
	     explicitly given a length, it's less likely that that
	     would be a win.  */
	  if (loop->safelen >= 2 && abs_hwi (dist) <= loop->safelen)
	    {
	      if ((unsigned int) loop->safelen < *max_vf)
		*max_vf = loop->safelen;
	      LOOP_VINFO_NO_DATA_DEPENDENCIES (loop_vinfo) = false;
	      continue;
	    }

	  /* For dependence distances of 2 or more, we have the option
	     of limiting VF or checking for an alias at runtime.
	     Prefer to check at runtime if we can, to avoid limiting
	     the VF unnecessarily when the bases are in fact independent.

	     Note that the alias checks will be removed if the VF ends up
	     being small enough.  Gather/scatter accesses cannot use the
	     runtime alias machinery, so fall back to the caller for them.  */
	  dr_vec_info *dr_info_a = loop_vinfo->lookup_dr (DDR_A (ddr));
	  dr_vec_info *dr_info_b = loop_vinfo->lookup_dr (DDR_B (ddr));
	  return (!STMT_VINFO_GATHER_SCATTER_P (dr_info_a->stmt)
		  && !STMT_VINFO_GATHER_SCATTER_P (dr_info_b->stmt)
		  && vect_mark_for_runtime_alias_test (ddr, loop_vinfo));
	}
    }
  return true;
}
330 | ||
331 | ||
ebfd146a IR |
/* Function vect_analyze_data_ref_dependence.

   FIXME: I needed to change the sense of the returned flag.

   Return FALSE if there (might) exist a dependence between a memory-reference
   DRA and a memory-reference DRB.  When versioning for alias may check a
   dependence at run-time, return TRUE.  Adjust *MAX_VF according to
   the data dependence.  */

static opt_result
vect_analyze_data_ref_dependence (struct data_dependence_relation *ddr,
				  loop_vec_info loop_vinfo,
				  unsigned int *max_vf)
{
  unsigned int i;
  class loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
  struct data_reference *dra = DDR_A (ddr);
  struct data_reference *drb = DDR_B (ddr);
  dr_vec_info *dr_info_a = loop_vinfo->lookup_dr (dra);
  dr_vec_info *dr_info_b = loop_vinfo->lookup_dr (drb);
  stmt_vec_info stmtinfo_a = dr_info_a->stmt;
  stmt_vec_info stmtinfo_b = dr_info_b->stmt;
  lambda_vector dist_v;
  unsigned int loop_depth;

  /* In loop analysis all data references should be vectorizable.  */
  if (!STMT_VINFO_VECTORIZABLE (stmtinfo_a)
      || !STMT_VINFO_VECTORIZABLE (stmtinfo_b))
    gcc_unreachable ();

  /* Independent data accesses.  */
  if (DDR_ARE_DEPENDENT (ddr) == chrec_known)
    return opt_result::success ();

  /* The same ref, or two reads, can never conflict.  */
  if (dra == drb
      || (DR_IS_READ (dra) && DR_IS_READ (drb)))
    return opt_result::success ();

  /* We do not have to consider dependences between accesses that belong
     to the same group, unless the stride could be smaller than the
     group size.  */
  if (DR_GROUP_FIRST_ELEMENT (stmtinfo_a)
      && (DR_GROUP_FIRST_ELEMENT (stmtinfo_a)
	  == DR_GROUP_FIRST_ELEMENT (stmtinfo_b))
      && !STMT_VINFO_STRIDED_P (stmtinfo_a))
    return opt_result::success ();

  /* Even if we have an anti-dependence then, as the vectorized loop covers at
     least two scalar iterations, there is always also a true dependence.
     As the vectorizer does not re-order loads and stores we can ignore
     the anti-dependence if TBAA can disambiguate both DRs similar to the
     case with known negative distance anti-dependences (positive
     distance anti-dependences would violate TBAA constraints).  */
  if (((DR_IS_READ (dra) && DR_IS_WRITE (drb))
       || (DR_IS_WRITE (dra) && DR_IS_READ (drb)))
      && !alias_sets_conflict_p (get_alias_set (DR_REF (dra)),
				 get_alias_set (DR_REF (drb))))
    return opt_result::success ();

  /* Unknown data dependence.  */
  if (DDR_ARE_DEPENDENT (ddr) == chrec_dont_know)
    {
      /* If user asserted safelen consecutive iterations can be
	 executed concurrently, assume independence.  */
      if (loop->safelen >= 2)
	{
	  if ((unsigned int) loop->safelen < *max_vf)
	    *max_vf = loop->safelen;
	  LOOP_VINFO_NO_DATA_DEPENDENCIES (loop_vinfo) = false;
	  return opt_result::success ();
	}

      /* Gather/scatter accesses cannot be versioned for alias.  */
      if (STMT_VINFO_GATHER_SCATTER_P (stmtinfo_a)
	  || STMT_VINFO_GATHER_SCATTER_P (stmtinfo_b))
	return opt_result::failure_at
	  (stmtinfo_a->stmt,
	   "versioning for alias not supported for: "
	   "can't determine dependence between %T and %T\n",
	   DR_REF (dra), DR_REF (drb));

      if (dump_enabled_p ())
	dump_printf_loc (MSG_MISSED_OPTIMIZATION, stmtinfo_a->stmt,
			 "versioning for alias required: "
			 "can't determine dependence between %T and %T\n",
			 DR_REF (dra), DR_REF (drb));

      /* Add to list of ddrs that need to be tested at run-time.  */
      return vect_mark_for_runtime_alias_test (ddr, loop_vinfo);
    }

  /* Known data dependence.  */
  if (DDR_NUM_DIST_VECTS (ddr) == 0)
    {
      /* If user asserted safelen consecutive iterations can be
	 executed concurrently, assume independence.  */
      if (loop->safelen >= 2)
	{
	  if ((unsigned int) loop->safelen < *max_vf)
	    *max_vf = loop->safelen;
	  LOOP_VINFO_NO_DATA_DEPENDENCIES (loop_vinfo) = false;
	  return opt_result::success ();
	}

      if (STMT_VINFO_GATHER_SCATTER_P (stmtinfo_a)
	  || STMT_VINFO_GATHER_SCATTER_P (stmtinfo_b))
	return opt_result::failure_at
	  (stmtinfo_a->stmt,
	   "versioning for alias not supported for: "
	   "bad dist vector for %T and %T\n",
	   DR_REF (dra), DR_REF (drb));

      if (dump_enabled_p ())
	dump_printf_loc (MSG_MISSED_OPTIMIZATION, stmtinfo_a->stmt,
			 "versioning for alias required: "
			 "bad dist vector for %T and %T\n",
			 DR_REF (dra), DR_REF (drb));
      /* Add to list of ddrs that need to be tested at run-time.  */
      return vect_mark_for_runtime_alias_test (ddr, loop_vinfo);
    }

  /* Examine each known dependence distance at the depth of the loop
     being vectorized.  */
  loop_depth = index_in_loop_nest (loop->num, DDR_LOOP_NEST (ddr));

  if (DDR_COULD_BE_INDEPENDENT_P (ddr)
      && vect_analyze_possibly_independent_ddr (ddr, loop_vinfo,
						loop_depth, max_vf))
    return opt_result::success ();

  FOR_EACH_VEC_ELT (DDR_DIST_VECTS (ddr), i, dist_v)
    {
      int dist = dist_v[loop_depth];

      if (dump_enabled_p ())
	dump_printf_loc (MSG_NOTE, vect_location,
			 "dependence distance  = %d.\n", dist);

      if (dist == 0)
	{
	  if (dump_enabled_p ())
	    dump_printf_loc (MSG_NOTE, vect_location,
			     "dependence distance == 0 between %T and %T\n",
			     DR_REF (dra), DR_REF (drb));

	  /* When we perform grouped accesses and perform implicit CSE
	     by detecting equal accesses and doing disambiguation with
	     runtime alias tests like for
	        .. = a[i];
	        .. = a[i+1];
	        a[i] = ..;
	        a[i+1] = ..;
	        *p = ..;
	        .. = a[i];
	        .. = a[i+1];
	     where we will end up loading { a[i], a[i+1] } once, make
	     sure that inserting group loads before the first load and
	     stores after the last store will do the right thing.
	     Similar for groups like
	        a[i] = ...;
	        ... = a[i];
	        a[i+1] = ...;
	     where loads from the group interleave with the store.  */
	  if (!vect_preserves_scalar_order_p (dr_info_a, dr_info_b))
	    return opt_result::failure_at (stmtinfo_a->stmt,
					   "READ_WRITE dependence"
					   " in interleaving.\n");

	  /* With distance zero a zero-step access would touch the same
	     location in every iteration: fail for a known zero step and
	     otherwise record a runtime nonzero check, unless the user
	     guaranteed independence via safelen.  */
	  if (loop->safelen < 2)
	    {
	      tree indicator = dr_zero_step_indicator (dra);
	      if (!indicator || integer_zerop (indicator))
		return opt_result::failure_at (stmtinfo_a->stmt,
					       "access also has a zero step\n");
	      else if (TREE_CODE (indicator) != INTEGER_CST)
		vect_check_nonzero_value (loop_vinfo, indicator);
	    }
	  continue;
	}

      if (dist > 0 && DDR_REVERSED_P (ddr))
	{
	  /* If DDR_REVERSED_P the order of the data-refs in DDR was
	     reversed (to make distance vector positive), and the actual
	     distance is negative.  */
	  if (dump_enabled_p ())
	    dump_printf_loc (MSG_NOTE, vect_location,
			     "dependence distance negative.\n");
	  /* When doing outer loop vectorization, we need to check if there is
	     a backward dependence at the inner loop level if the dependence
	     at the outer loop is reversed.  See PR81740.  */
	  if (nested_in_vect_loop_p (loop, stmtinfo_a)
	      || nested_in_vect_loop_p (loop, stmtinfo_b))
	    {
	      unsigned inner_depth = index_in_loop_nest (loop->inner->num,
							 DDR_LOOP_NEST (ddr));
	      if (dist_v[inner_depth] < 0)
		return opt_result::failure_at (stmtinfo_a->stmt,
					       "not vectorized, dependence "
					       "between data-refs %T and %T\n",
					       DR_REF (dra), DR_REF (drb));
	    }
	  /* Record a negative dependence distance to later limit the
	     amount of stmt copying / unrolling we can perform.
	     Only need to handle read-after-write dependence.  */
	  if (DR_IS_READ (drb)
	      && (STMT_VINFO_MIN_NEG_DIST (stmtinfo_b) == 0
		  || STMT_VINFO_MIN_NEG_DIST (stmtinfo_b) > (unsigned)dist))
	    STMT_VINFO_MIN_NEG_DIST (stmtinfo_b) = dist;
	  continue;
	}

      unsigned int abs_dist = abs (dist);
      if (abs_dist >= 2 && abs_dist < *max_vf)
	{
	  /* The dependence distance requires reduction of the maximal
	     vectorization factor.  */
	  *max_vf = abs_dist;
	  if (dump_enabled_p ())
	    dump_printf_loc (MSG_NOTE, vect_location,
			     "adjusting maximal vectorization factor to %i\n",
			     *max_vf);
	}

      if (abs_dist >= *max_vf)
	{
	  /* Dependence distance does not create dependence, as far as
	     vectorization is concerned, in this case.  */
	  if (dump_enabled_p ())
	    dump_printf_loc (MSG_NOTE, vect_location,
			     "dependence distance >= VF.\n");
	  continue;
	}

      return opt_result::failure_at (stmtinfo_a->stmt,
				     "not vectorized, possible dependence "
				     "between data-refs %T and %T\n",
				     DR_REF (dra), DR_REF (drb));
    }

  return opt_result::success ();
}
571 | ||
/* Function vect_analyze_data_ref_dependences.

   Examine all the data references in the loop, and make sure there do not
   exist any data dependences between them.  Set *MAX_VF according to
   the maximum vectorization factor the data dependences allow.  */

opt_result
vect_analyze_data_ref_dependences (loop_vec_info loop_vinfo,
				   unsigned int *max_vf)
{
  unsigned int i;
  struct data_dependence_relation *ddr;

  DUMP_VECT_SCOPE ("vect_analyze_data_ref_dependences");

  /* Compute the dependence relations lazily on first use.  */
  if (!LOOP_VINFO_DDRS (loop_vinfo).exists ())
    {
      LOOP_VINFO_DDRS (loop_vinfo)
	.create (LOOP_VINFO_DATAREFS (loop_vinfo).length ()
		 * LOOP_VINFO_DATAREFS (loop_vinfo).length ());
      /* We need read-read dependences to compute
	 STMT_VINFO_SAME_ALIGN_REFS.  */
      bool res = compute_all_dependences (LOOP_VINFO_DATAREFS (loop_vinfo),
					  &LOOP_VINFO_DDRS (loop_vinfo),
					  LOOP_VINFO_LOOP_NEST (loop_vinfo),
					  true);
      gcc_assert (res);
    }

  /* Start optimistic; vect_analyze_data_ref_dependence clears this when
     it has to rely on safelen assertions.  */
  LOOP_VINFO_NO_DATA_DEPENDENCIES (loop_vinfo) = true;

  /* For epilogues we either have no aliases or alias versioning
     was applied to original loop.  Therefore we may just get max_vf
     using VF of original loop.  */
  if (LOOP_VINFO_EPILOGUE_P (loop_vinfo))
    *max_vf = LOOP_VINFO_ORIG_MAX_VECT_FACTOR (loop_vinfo);
  else
    FOR_EACH_VEC_ELT (LOOP_VINFO_DDRS (loop_vinfo), i, ddr)
      {
	opt_result res
	  = vect_analyze_data_ref_dependence (ddr, loop_vinfo, max_vf);
	if (!res)
	  return res;
      }

  return opt_result::success ();
}
619 | ||
620 | ||
621 | /* Function vect_slp_analyze_data_ref_dependence. | |
622 | ||
623 | Return TRUE if there (might) exist a dependence between a memory-reference | |
f5ae2856 RS |
624 | DRA and a memory-reference DRB for VINFO. When versioning for alias |
625 | may check a dependence at run-time, return FALSE. Adjust *MAX_VF | |
626 | according to the data dependence. */ | |
5abe1e05 RB |
627 | |
628 | static bool | |
f5ae2856 RS |
629 | vect_slp_analyze_data_ref_dependence (vec_info *vinfo, |
630 | struct data_dependence_relation *ddr) | |
5abe1e05 RB |
631 | { |
632 | struct data_reference *dra = DDR_A (ddr); | |
633 | struct data_reference *drb = DDR_B (ddr); | |
f5ae2856 RS |
634 | dr_vec_info *dr_info_a = vinfo->lookup_dr (dra); |
635 | dr_vec_info *dr_info_b = vinfo->lookup_dr (drb); | |
5abe1e05 RB |
636 | |
637 | /* We need to check dependences of statements marked as unvectorizable | |
638 | as well, they still can prohibit vectorization. */ | |
639 | ||
640 | /* Independent data accesses. */ | |
641 | if (DDR_ARE_DEPENDENT (ddr) == chrec_known) | |
642 | return false; | |
643 | ||
644 | if (dra == drb) | |
645 | return false; | |
646 | ||
647 | /* Read-read is OK. */ | |
648 | if (DR_IS_READ (dra) && DR_IS_READ (drb)) | |
649 | return false; | |
650 | ||
e6c9d234 RB |
651 | /* If dra and drb are part of the same interleaving chain consider |
652 | them independent. */ | |
89fa689a RS |
653 | if (STMT_VINFO_GROUPED_ACCESS (dr_info_a->stmt) |
654 | && (DR_GROUP_FIRST_ELEMENT (dr_info_a->stmt) | |
655 | == DR_GROUP_FIRST_ELEMENT (dr_info_b->stmt))) | |
e6c9d234 RB |
656 | return false; |
657 | ||
5abe1e05 RB |
658 | /* Unknown data dependence. */ |
659 | if (DDR_ARE_DEPENDENT (ddr) == chrec_dont_know) | |
fcac74a1 | 660 | { |
649d196d | 661 | if (dump_enabled_p ()) |
3c2a8ed0 DM |
662 | dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location, |
663 | "can't determine dependence between %T and %T\n", | |
664 | DR_REF (dra), DR_REF (drb)); | |
fcac74a1 | 665 | } |
649d196d | 666 | else if (dump_enabled_p ()) |
3c2a8ed0 DM |
667 | dump_printf_loc (MSG_NOTE, vect_location, |
668 | "determined dependence between %T and %T\n", | |
669 | DR_REF (dra), DR_REF (drb)); | |
b8698a0f | 670 | |
5abe1e05 RB |
671 | return true; |
672 | } | |
673 | ||
674 | ||
c2a12ca0 RB |
675 | /* Analyze dependences involved in the transform of SLP NODE. STORES |
676 | contain the vector of scalar stores of this instance if we are | |
677 | disambiguating the loads. */ | |
64900538 RB |
678 | |
679 | static bool | |
4e849a74 | 680 | vect_slp_analyze_node_dependences (vec_info *vinfo, slp_tree node, |
b9787581 | 681 | vec<stmt_vec_info> stores, |
32e8e429 | 682 | stmt_vec_info last_store_info) |
64900538 RB |
683 | { |
684 | /* This walks over all stmts involved in the SLP load/store done | |
685 | in NODE verifying we can sink them up to the last stmt in the | |
686 | group. */ | |
6924b5e6 | 687 | if (DR_IS_WRITE (STMT_VINFO_DATA_REF (SLP_TREE_REPRESENTATIVE (node)))) |
64900538 | 688 | { |
6924b5e6 RB |
689 | stmt_vec_info last_access_info = vect_find_last_scalar_stmt_in_slp (node); |
690 | for (unsigned k = 0; k < SLP_TREE_SCALAR_STMTS (node).length (); ++k) | |
64900538 | 691 | { |
6924b5e6 RB |
692 | stmt_vec_info access_info = SLP_TREE_SCALAR_STMTS (node)[k]; |
693 | if (access_info == last_access_info) | |
64900538 | 694 | continue; |
6924b5e6 RB |
695 | data_reference *dr_a = STMT_VINFO_DATA_REF (access_info); |
696 | ao_ref ref; | |
697 | bool ref_initialized_p = false; | |
698 | for (gimple_stmt_iterator gsi = gsi_for_stmt (access_info->stmt); | |
699 | gsi_stmt (gsi) != last_access_info->stmt; gsi_next (&gsi)) | |
27312bf2 | 700 | { |
6924b5e6 RB |
701 | gimple *stmt = gsi_stmt (gsi); |
702 | if (! gimple_vuse (stmt)) | |
703 | continue; | |
704 | ||
705 | /* If we couldn't record a (single) data reference for this | |
706 | stmt we have to resort to the alias oracle. */ | |
707 | stmt_vec_info stmt_info = vinfo->lookup_stmt (stmt); | |
708 | data_reference *dr_b = STMT_VINFO_DATA_REF (stmt_info); | |
709 | if (!dr_b) | |
710 | { | |
711 | /* We are moving a store - this means | |
712 | we cannot use TBAA for disambiguation. */ | |
713 | if (!ref_initialized_p) | |
714 | ao_ref_init (&ref, DR_REF (dr_a)); | |
715 | if (stmt_may_clobber_ref_p_1 (stmt, &ref, false) | |
716 | || ref_maybe_used_by_stmt_p (stmt, &ref, false)) | |
717 | return false; | |
718 | continue; | |
719 | } | |
720 | ||
721 | bool dependent = false; | |
722 | /* If we run into a store of this same instance (we've just | |
723 | marked those) then delay dependence checking until we run | |
724 | into the last store because this is where it will have | |
725 | been sunk to (and we verify if we can do that as well). */ | |
726 | if (gimple_visited_p (stmt)) | |
727 | { | |
728 | if (stmt_info != last_store_info) | |
729 | continue; | |
730 | unsigned i; | |
731 | stmt_vec_info store_info; | |
732 | FOR_EACH_VEC_ELT (stores, i, store_info) | |
733 | { | |
734 | data_reference *store_dr | |
735 | = STMT_VINFO_DATA_REF (store_info); | |
736 | ddr_p ddr = initialize_data_dependence_relation | |
737 | (dr_a, store_dr, vNULL); | |
738 | dependent | |
739 | = vect_slp_analyze_data_ref_dependence (vinfo, ddr); | |
740 | free_dependence_relation (ddr); | |
741 | if (dependent) | |
742 | break; | |
743 | } | |
744 | } | |
745 | else | |
746 | { | |
747 | ddr_p ddr = initialize_data_dependence_relation (dr_a, | |
748 | dr_b, vNULL); | |
749 | dependent = vect_slp_analyze_data_ref_dependence (vinfo, ddr); | |
750 | free_dependence_relation (ddr); | |
751 | } | |
752 | if (dependent) | |
27312bf2 | 753 | return false; |
27312bf2 | 754 | } |
6924b5e6 RB |
755 | } |
756 | } | |
757 | else /* DR_IS_READ */ | |
758 | { | |
759 | stmt_vec_info first_access_info | |
760 | = vect_find_first_scalar_stmt_in_slp (node); | |
761 | for (unsigned k = 0; k < SLP_TREE_SCALAR_STMTS (node).length (); ++k) | |
762 | { | |
763 | stmt_vec_info access_info = SLP_TREE_SCALAR_STMTS (node)[k]; | |
764 | if (access_info == first_access_info) | |
765 | continue; | |
766 | data_reference *dr_a = STMT_VINFO_DATA_REF (access_info); | |
767 | ao_ref ref; | |
768 | bool ref_initialized_p = false; | |
769 | for (gimple_stmt_iterator gsi = gsi_for_stmt (access_info->stmt); | |
770 | gsi_stmt (gsi) != first_access_info->stmt; gsi_prev (&gsi)) | |
c2a12ca0 | 771 | { |
6924b5e6 RB |
772 | gimple *stmt = gsi_stmt (gsi); |
773 | if (! gimple_vdef (stmt)) | |
c2a12ca0 | 774 | continue; |
6924b5e6 RB |
775 | |
776 | /* If we couldn't record a (single) data reference for this | |
777 | stmt we have to resort to the alias oracle. */ | |
778 | stmt_vec_info stmt_info = vinfo->lookup_stmt (stmt); | |
779 | data_reference *dr_b = STMT_VINFO_DATA_REF (stmt_info); | |
780 | if (!dr_b) | |
781 | { | |
782 | /* We are hoisting a load - this means we can use | |
783 | TBAA for disambiguation. */ | |
784 | if (!ref_initialized_p) | |
785 | ao_ref_init (&ref, DR_REF (dr_a)); | |
786 | if (stmt_may_clobber_ref_p_1 (stmt, &ref, true)) | |
787 | return false; | |
788 | continue; | |
789 | } | |
790 | ||
791 | bool dependent = false; | |
792 | /* If we run into a store of this same instance (we've just | |
793 | marked those) then delay dependence checking until we run | |
794 | into the last store because this is where it will have | |
795 | been sunk to (and we verify if we can do that as well). */ | |
796 | if (gimple_visited_p (stmt)) | |
c2a12ca0 | 797 | { |
6924b5e6 RB |
798 | if (stmt_info != last_store_info) |
799 | continue; | |
800 | unsigned i; | |
801 | stmt_vec_info store_info; | |
802 | FOR_EACH_VEC_ELT (stores, i, store_info) | |
803 | { | |
804 | data_reference *store_dr | |
805 | = STMT_VINFO_DATA_REF (store_info); | |
806 | ddr_p ddr = initialize_data_dependence_relation | |
807 | (dr_a, store_dr, vNULL); | |
808 | dependent | |
809 | = vect_slp_analyze_data_ref_dependence (vinfo, ddr); | |
810 | free_dependence_relation (ddr); | |
811 | if (dependent) | |
812 | break; | |
813 | } | |
814 | } | |
815 | else | |
816 | { | |
817 | ddr_p ddr = initialize_data_dependence_relation (dr_a, | |
818 | dr_b, vNULL); | |
819 | dependent = vect_slp_analyze_data_ref_dependence (vinfo, ddr); | |
c2a12ca0 RB |
820 | free_dependence_relation (ddr); |
821 | } | |
6924b5e6 RB |
822 | if (dependent) |
823 | return false; | |
c2a12ca0 | 824 | } |
64900538 RB |
825 | } |
826 | } | |
827 | return true; | |
828 | } | |
829 | ||
830 | ||
5abe1e05 RB |
831 | /* Function vect_analyze_data_ref_dependences. |
832 | ||
833 | Examine all the data references in the basic-block, and make sure there | |
834 | do not exist any data dependences between them. Set *MAX_VF according to | |
835 | the maximum vectorization factor the data dependences allow. */ | |
836 | ||
837 | bool | |
308bc496 | 838 | vect_slp_analyze_instance_dependence (vec_info *vinfo, slp_instance instance) |
5abe1e05 | 839 | { |
adac3a68 | 840 | DUMP_VECT_SCOPE ("vect_slp_analyze_instance_dependence"); |
5abe1e05 | 841 | |
c2a12ca0 RB |
842 | /* The stores of this instance are at the root of the SLP tree. */ |
843 | slp_tree store = SLP_INSTANCE_TREE (instance); | |
b9787581 | 844 | if (! STMT_VINFO_DATA_REF (SLP_TREE_SCALAR_STMTS (store)[0])) |
c2a12ca0 RB |
845 | store = NULL; |
846 | ||
847 | /* Verify we can sink stores to the vectorized stmt insert location. */ | |
95c68311 | 848 | stmt_vec_info last_store_info = NULL; |
c2a12ca0 | 849 | if (store) |
64900538 | 850 | { |
4e849a74 | 851 | if (! vect_slp_analyze_node_dependences (vinfo, store, vNULL, NULL)) |
c2a12ca0 RB |
852 | return false; |
853 | ||
854 | /* Mark stores in this instance and remember the last one. */ | |
95c68311 | 855 | last_store_info = vect_find_last_scalar_stmt_in_slp (store); |
4e849a74 | 856 | for (unsigned k = 0; k < SLP_TREE_SCALAR_STMTS (store).length (); ++k) |
b9787581 | 857 | gimple_set_visited (SLP_TREE_SCALAR_STMTS (store)[k]->stmt, true); |
64900538 | 858 | } |
5abe1e05 | 859 | |
c2a12ca0 | 860 | bool res = true; |
ebfd146a | 861 | |
c2a12ca0 RB |
862 | /* Verify we can sink loads to the vectorized stmt insert location, |
863 | special-casing stores of this instance. */ | |
864 | slp_tree load; | |
865 | unsigned int i; | |
866 | FOR_EACH_VEC_ELT (SLP_INSTANCE_LOADS (instance), i, load) | |
4e849a74 | 867 | if (! vect_slp_analyze_node_dependences (vinfo, load, |
c2a12ca0 RB |
868 | store |
869 | ? SLP_TREE_SCALAR_STMTS (store) | |
95c68311 | 870 | : vNULL, last_store_info)) |
c2a12ca0 RB |
871 | { |
872 | res = false; | |
873 | break; | |
874 | } | |
875 | ||
876 | /* Unset the visited flag. */ | |
877 | if (store) | |
4e849a74 | 878 | for (unsigned k = 0; k < SLP_TREE_SCALAR_STMTS (store).length (); ++k) |
b9787581 | 879 | gimple_set_visited (SLP_TREE_SCALAR_STMTS (store)[k]->stmt, false); |
c2a12ca0 RB |
880 | |
881 | return res; | |
ebfd146a IR |
882 | } |
883 | ||
d7609678 RS |
884 | /* Record the base alignment guarantee given by DRB, which occurs |
885 | in STMT_INFO. */ | |
62c8a2cf RS |
886 | |
887 | static void | |
308bc496 | 888 | vect_record_base_alignment (vec_info *vinfo, stmt_vec_info stmt_info, |
62c8a2cf RS |
889 | innermost_loop_behavior *drb) |
890 | { | |
891 | bool existed; | |
892 | innermost_loop_behavior *&entry | |
893 | = vinfo->base_alignments.get_or_insert (drb->base_address, &existed); | |
894 | if (!existed || entry->base_alignment < drb->base_alignment) | |
895 | { | |
896 | entry = drb; | |
897 | if (dump_enabled_p ()) | |
3c2a8ed0 DM |
898 | dump_printf_loc (MSG_NOTE, vect_location, |
899 | "recording new base alignment for %T\n" | |
900 | " alignment: %d\n" | |
901 | " misalignment: %d\n" | |
902 | " based on: %G", | |
903 | drb->base_address, | |
904 | drb->base_alignment, | |
905 | drb->base_misalignment, | |
906 | stmt_info->stmt); | |
62c8a2cf RS |
907 | } |
908 | } | |
909 | ||
910 | /* If the region we're going to vectorize is reached, all unconditional | |
911 | data references occur at least once. We can therefore pool the base | |
912 | alignment guarantees from each unconditional reference. Do this by | |
913 | going through all the data references in VINFO and checking whether | |
914 | the containing statement makes the reference unconditionally. If so, | |
915 | record the alignment of the base address in VINFO so that it can be | |
916 | used for all other references with the same base. */ | |
917 | ||
918 | void | |
919 | vect_record_base_alignments (vec_info *vinfo) | |
920 | { | |
921 | loop_vec_info loop_vinfo = dyn_cast <loop_vec_info> (vinfo); | |
99b1c316 | 922 | class loop *loop = loop_vinfo ? LOOP_VINFO_LOOP (loop_vinfo) : NULL; |
62c8a2cf RS |
923 | data_reference *dr; |
924 | unsigned int i; | |
ca823c85 | 925 | FOR_EACH_VEC_ELT (vinfo->shared->datarefs, i, dr) |
3b1cffcc | 926 | { |
f5ae2856 | 927 | dr_vec_info *dr_info = vinfo->lookup_dr (dr); |
89fa689a | 928 | stmt_vec_info stmt_info = dr_info->stmt; |
57c454d2 | 929 | if (!DR_IS_CONDITIONAL_IN_STMT (dr) |
5fa23466 RB |
930 | && STMT_VINFO_VECTORIZABLE (stmt_info) |
931 | && !STMT_VINFO_GATHER_SCATTER_P (stmt_info)) | |
57c454d2 | 932 | { |
308bc496 | 933 | vect_record_base_alignment (vinfo, stmt_info, &DR_INNERMOST (dr)); |
62c8a2cf | 934 | |
57c454d2 RB |
935 | /* If DR is nested in the loop that is being vectorized, we can also |
936 | record the alignment of the base wrt the outer loop. */ | |
78e02b3b | 937 | if (loop && nested_in_vect_loop_p (loop, stmt_info)) |
5fa23466 | 938 | vect_record_base_alignment |
308bc496 | 939 | (vinfo, stmt_info, &STMT_VINFO_DR_WRT_VEC_LOOP (stmt_info)); |
57c454d2 | 940 | } |
3b1cffcc | 941 | } |
62c8a2cf RS |
942 | } |
943 | ||
89fa689a | 944 | /* Return the target alignment for the vectorized form of DR_INFO. */ |
f702e7d4 | 945 | |
ca31798e | 946 | static poly_uint64 |
89fa689a | 947 | vect_calculate_target_alignment (dr_vec_info *dr_info) |
f702e7d4 | 948 | { |
89fa689a | 949 | tree vectype = STMT_VINFO_VECTYPE (dr_info->stmt); |
f702e7d4 RS |
950 | return targetm.vectorize.preferred_vector_alignment (vectype); |
951 | } | |
952 | ||
ebfd146a IR |
953 | /* Function vect_compute_data_ref_alignment |
954 | ||
89fa689a | 955 | Compute the misalignment of the data reference DR_INFO. |
ebfd146a IR |
956 | |
957 | Output: | |
89fa689a | 958 | 1. DR_MISALIGNMENT (DR_INFO) is defined. |
ebfd146a IR |
959 | |
960 | FOR NOW: No analysis is actually performed. Misalignment is calculated | |
961 | only for trivial cases. TODO. */ | |
962 | ||
5fa23466 | 963 | static void |
308bc496 | 964 | vect_compute_data_ref_alignment (vec_info *vinfo, dr_vec_info *dr_info) |
ebfd146a | 965 | { |
89fa689a | 966 | stmt_vec_info stmt_info = dr_info->stmt; |
308bc496 RB |
967 | vec_base_alignments *base_alignments = &vinfo->base_alignments; |
968 | loop_vec_info loop_vinfo = dyn_cast <loop_vec_info> (vinfo); | |
99b1c316 | 969 | class loop *loop = NULL; |
89fa689a | 970 | tree ref = DR_REF (dr_info->dr); |
3f5e8a76 | 971 | tree vectype = STMT_VINFO_VECTYPE (stmt_info); |
b8698a0f | 972 | |
73fbfcad | 973 | if (dump_enabled_p ()) |
78c60e3d | 974 | dump_printf_loc (MSG_NOTE, vect_location, |
e645e942 | 975 | "vect_compute_data_ref_alignment:\n"); |
ebfd146a | 976 | |
a70d6342 IR |
977 | if (loop_vinfo) |
978 | loop = LOOP_VINFO_LOOP (loop_vinfo); | |
b8698a0f | 979 | |
ebfd146a | 980 | /* Initialize misalignment to unknown. */ |
89fa689a | 981 | SET_DR_MISALIGNMENT (dr_info, DR_MISALIGNMENT_UNKNOWN); |
ebfd146a | 982 | |
5fa23466 RB |
983 | if (STMT_VINFO_GATHER_SCATTER_P (stmt_info)) |
984 | return; | |
985 | ||
308bc496 | 986 | innermost_loop_behavior *drb = vect_dr_behavior (vinfo, dr_info); |
3f5e8a76 RS |
987 | bool step_preserves_misalignment_p; |
988 | ||
ca31798e AV |
989 | poly_uint64 vector_alignment |
990 | = exact_div (vect_calculate_target_alignment (dr_info), BITS_PER_UNIT); | |
89fa689a | 991 | DR_TARGET_ALIGNMENT (dr_info) = vector_alignment; |
f702e7d4 | 992 | |
4e9d58d1 AV |
993 | /* If the main loop has peeled for alignment we have no way of knowing |
994 | whether the data accesses in the epilogues are aligned. We can't at | |
995 | compile time answer the question whether we have entered the main loop or | |
996 | not. Fixes PR 92351. */ | |
997 | if (loop_vinfo) | |
998 | { | |
999 | loop_vec_info orig_loop_vinfo = LOOP_VINFO_ORIG_LOOP_INFO (loop_vinfo); | |
1000 | if (orig_loop_vinfo | |
1001 | && LOOP_VINFO_PEELING_FOR_ALIGNMENT (orig_loop_vinfo) != 0) | |
1002 | return; | |
1003 | } | |
1004 | ||
ca31798e AV |
1005 | unsigned HOST_WIDE_INT vect_align_c; |
1006 | if (!vector_alignment.is_constant (&vect_align_c)) | |
1007 | return; | |
1008 | ||
3f5e8a76 RS |
1009 | /* No step for BB vectorization. */ |
1010 | if (!loop) | |
1011 | { | |
1012 | gcc_assert (integer_zerop (drb->step)); | |
1013 | step_preserves_misalignment_p = true; | |
1014 | } | |
ebfd146a IR |
1015 | |
1016 | /* In case the dataref is in an inner-loop of the loop that is being | |
1017 | vectorized (LOOP), we use the base and misalignment information | |
ff802fa1 | 1018 | relative to the outer-loop (LOOP). This is ok only if the misalignment |
ebfd146a IR |
1019 | stays the same throughout the execution of the inner-loop, which is why |
1020 | we have to check that the stride of the dataref in the inner-loop evenly | |
f702e7d4 | 1021 | divides by the vector alignment. */ |
78e02b3b | 1022 | else if (nested_in_vect_loop_p (loop, stmt_info)) |
ebfd146a | 1023 | { |
3f5e8a76 | 1024 | step_preserves_misalignment_p |
ca31798e | 1025 | = (DR_STEP_ALIGNMENT (dr_info->dr) % vect_align_c) == 0; |
b8698a0f | 1026 | |
3f5e8a76 | 1027 | if (dump_enabled_p ()) |
ebfd146a | 1028 | { |
3f5e8a76 RS |
1029 | if (step_preserves_misalignment_p) |
1030 | dump_printf_loc (MSG_NOTE, vect_location, | |
f702e7d4 | 1031 | "inner step divides the vector alignment.\n"); |
3f5e8a76 | 1032 | else |
78c60e3d | 1033 | dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location, |
f702e7d4 RS |
1034 | "inner step doesn't divide the vector" |
1035 | " alignment.\n"); | |
ebfd146a IR |
1036 | } |
1037 | } | |
1038 | ||
91ff1504 RB |
1039 | /* Similarly we can only use base and misalignment information relative to |
1040 | an innermost loop if the misalignment stays the same throughout the | |
1041 | execution of the loop. As above, this is the case if the stride of | |
f702e7d4 | 1042 | the dataref evenly divides by the alignment. */ |
91ff1504 | 1043 | else |
3ebde0e9 | 1044 | { |
d9f21f6a | 1045 | poly_uint64 vf = LOOP_VINFO_VECT_FACTOR (loop_vinfo); |
3f5e8a76 | 1046 | step_preserves_misalignment_p |
ca31798e | 1047 | = multiple_p (DR_STEP_ALIGNMENT (dr_info->dr) * vf, vect_align_c); |
3ebde0e9 | 1048 | |
3f5e8a76 RS |
1049 | if (!step_preserves_misalignment_p && dump_enabled_p ()) |
1050 | dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location, | |
f702e7d4 | 1051 | "step doesn't divide the vector alignment.\n"); |
3ebde0e9 | 1052 | } |
52639a61 | 1053 | |
bb642979 RS |
1054 | unsigned int base_alignment = drb->base_alignment; |
1055 | unsigned int base_misalignment = drb->base_misalignment; | |
ebfd146a | 1056 | |
62c8a2cf RS |
1057 | /* Calculate the maximum of the pooled base address alignment and the |
1058 | alignment that we can compute for DR itself. */ | |
1059 | innermost_loop_behavior **entry = base_alignments->get (drb->base_address); | |
1060 | if (entry && base_alignment < (*entry)->base_alignment) | |
1061 | { | |
1062 | base_alignment = (*entry)->base_alignment; | |
1063 | base_misalignment = (*entry)->base_misalignment; | |
1064 | } | |
1065 | ||
ca31798e | 1066 | if (drb->offset_alignment < vect_align_c |
832b4117 RS |
1067 | || !step_preserves_misalignment_p |
1068 | /* We need to know whether the step wrt the vectorized loop is | |
1069 | negative when computing the starting misalignment below. */ | |
1070 | || TREE_CODE (drb->step) != INTEGER_CST) | |
ebfd146a | 1071 | { |
73fbfcad | 1072 | if (dump_enabled_p ()) |
3c2a8ed0 DM |
1073 | dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location, |
1074 | "Unknown alignment for access: %T\n", ref); | |
5fa23466 | 1075 | return; |
ebfd146a IR |
1076 | } |
1077 | ||
ca31798e | 1078 | if (base_alignment < vect_align_c) |
ebfd146a | 1079 | { |
a199d5e7 RS |
1080 | unsigned int max_alignment; |
1081 | tree base = get_base_for_alignment (drb->base_address, &max_alignment); | |
ca31798e | 1082 | if (max_alignment < vect_align_c |
a199d5e7 | 1083 | || !vect_can_force_dr_alignment_p (base, |
ca31798e | 1084 | vect_align_c * BITS_PER_UNIT)) |
ebfd146a | 1085 | { |
73fbfcad | 1086 | if (dump_enabled_p ()) |
3c2a8ed0 DM |
1087 | dump_printf_loc (MSG_NOTE, vect_location, |
1088 | "can't force alignment of ref: %T\n", ref); | |
5fa23466 | 1089 | return; |
ebfd146a | 1090 | } |
b8698a0f | 1091 | |
ebfd146a IR |
1092 | /* Force the alignment of the decl. |
1093 | NOTE: This is the only change to the code we make during | |
1094 | the analysis phase, before deciding to vectorize the loop. */ | |
73fbfcad | 1095 | if (dump_enabled_p ()) |
3c2a8ed0 DM |
1096 | dump_printf_loc (MSG_NOTE, vect_location, |
1097 | "force alignment of %T\n", ref); | |
720f5239 | 1098 | |
89fa689a RS |
1099 | dr_info->base_decl = base; |
1100 | dr_info->base_misaligned = true; | |
bb642979 | 1101 | base_misalignment = 0; |
ebfd146a | 1102 | } |
8944b5b3 RS |
1103 | poly_int64 misalignment |
1104 | = base_misalignment + wi::to_poly_offset (drb->init).force_shwi (); | |
ebfd146a | 1105 | |
46241ea9 RG |
1106 | /* If this is a backward running DR then first access in the larger |
1107 | vectype actually is N-1 elements before the address in the DR. | |
1108 | Adjust misalign accordingly. */ | |
3f5e8a76 | 1109 | if (tree_int_cst_sgn (drb->step) < 0) |
bb642979 RS |
1110 | /* PLUS because STEP is negative. */ |
1111 | misalignment += ((TYPE_VECTOR_SUBPARTS (vectype) - 1) | |
1112 | * TREE_INT_CST_LOW (drb->step)); | |
46241ea9 | 1113 | |
8944b5b3 | 1114 | unsigned int const_misalignment; |
ca31798e | 1115 | if (!known_misalignment (misalignment, vect_align_c, &const_misalignment)) |
8944b5b3 RS |
1116 | { |
1117 | if (dump_enabled_p ()) | |
3c2a8ed0 DM |
1118 | dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location, |
1119 | "Non-constant misalignment for access: %T\n", ref); | |
5fa23466 | 1120 | return; |
8944b5b3 RS |
1121 | } |
1122 | ||
89fa689a | 1123 | SET_DR_MISALIGNMENT (dr_info, const_misalignment); |
ebfd146a | 1124 | |
73fbfcad | 1125 | if (dump_enabled_p ()) |
3c2a8ed0 DM |
1126 | dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location, |
1127 | "misalign = %d bytes of ref %T\n", | |
1128 | DR_MISALIGNMENT (dr_info), ref); | |
ebfd146a | 1129 | |
5fa23466 | 1130 | return; |
ebfd146a IR |
1131 | } |
1132 | ||
71595748 | 1133 | /* Function vect_update_misalignment_for_peel. |
89fa689a RS |
1134 | Sets DR_INFO's misalignment |
1135 | - to 0 if it has the same alignment as DR_PEEL_INFO, | |
1136 | - to the misalignment computed using NPEEL if DR_INFO's salignment is known, | |
71595748 | 1137 | - to -1 (unknown) otherwise. |
ebfd146a | 1138 | |
89fa689a RS |
1139 | DR_INFO - the data reference whose misalignment is to be adjusted. |
1140 | DR_PEEL_INFO - the data reference whose misalignment is being made | |
1141 | zero in the vector loop by the peel. | |
ebfd146a | 1142 | NPEEL - the number of iterations in the peel loop if the misalignment |
89fa689a | 1143 | of DR_PEEL_INFO is known at compile time. */ |
ebfd146a IR |
1144 | |
1145 | static void | |
89fa689a RS |
1146 | vect_update_misalignment_for_peel (dr_vec_info *dr_info, |
1147 | dr_vec_info *dr_peel_info, int npeel) | |
ebfd146a IR |
1148 | { |
1149 | unsigned int i; | |
71595748 | 1150 | vec<dr_p> same_aligned_drs; |
ebfd146a | 1151 | struct data_reference *current_dr; |
89fa689a | 1152 | stmt_vec_info peel_stmt_info = dr_peel_info->stmt; |
ebfd146a | 1153 | |
7ea4b8ed RB |
1154 | /* It can be assumed that if dr_info has the same alignment as dr_peel, |
1155 | it is aligned in the vector loop. */ | |
89fa689a | 1156 | same_aligned_drs = STMT_VINFO_SAME_ALIGN_REFS (peel_stmt_info); |
71595748 | 1157 | FOR_EACH_VEC_ELT (same_aligned_drs, i, current_dr) |
ebfd146a | 1158 | { |
89fa689a | 1159 | if (current_dr != dr_info->dr) |
ebfd146a | 1160 | continue; |
89fa689a RS |
1161 | gcc_assert (!known_alignment_for_access_p (dr_info) |
1162 | || !known_alignment_for_access_p (dr_peel_info) | |
7ea4b8ed RB |
1163 | || (DR_MISALIGNMENT (dr_info) |
1164 | == DR_MISALIGNMENT (dr_peel_info))); | |
89fa689a | 1165 | SET_DR_MISALIGNMENT (dr_info, 0); |
ebfd146a IR |
1166 | return; |
1167 | } | |
1168 | ||
ca31798e AV |
1169 | unsigned HOST_WIDE_INT alignment; |
1170 | if (DR_TARGET_ALIGNMENT (dr_info).is_constant (&alignment) | |
1171 | && known_alignment_for_access_p (dr_info) | |
89fa689a | 1172 | && known_alignment_for_access_p (dr_peel_info)) |
ebfd146a | 1173 | { |
89fa689a | 1174 | int misal = DR_MISALIGNMENT (dr_info); |
7ea4b8ed | 1175 | misal += npeel * TREE_INT_CST_LOW (DR_STEP (dr_info->dr)); |
ca31798e | 1176 | misal &= alignment - 1; |
89fa689a | 1177 | SET_DR_MISALIGNMENT (dr_info, misal); |
ebfd146a IR |
1178 | return; |
1179 | } | |
1180 | ||
73fbfcad | 1181 | if (dump_enabled_p ()) |
8d21ff9f RD |
1182 | dump_printf_loc (MSG_NOTE, vect_location, "Setting misalignment " \ |
1183 | "to unknown (-1).\n"); | |
89fa689a | 1184 | SET_DR_MISALIGNMENT (dr_info, DR_MISALIGNMENT_UNKNOWN); |
ebfd146a IR |
1185 | } |
1186 | ||
d30846a0 FY |
1187 | /* Return true if alignment is relevant for DR_INFO. */ |
1188 | ||
1189 | static bool | |
1190 | vect_relevant_for_alignment_p (dr_vec_info *dr_info) | |
1191 | { | |
1192 | stmt_vec_info stmt_info = dr_info->stmt; | |
1193 | ||
1194 | if (!STMT_VINFO_RELEVANT_P (stmt_info)) | |
1195 | return false; | |
1196 | ||
1197 | /* For interleaving, only the alignment of the first access matters. */ | |
1198 | if (STMT_VINFO_GROUPED_ACCESS (stmt_info) | |
1199 | && DR_GROUP_FIRST_ELEMENT (stmt_info) != stmt_info) | |
1200 | return false; | |
1201 | ||
1202 | /* Scatter-gather and invariant accesses continue to address individual | |
1203 | scalars, so vector-level alignment is irrelevant. */ | |
1204 | if (STMT_VINFO_GATHER_SCATTER_P (stmt_info) | |
1205 | || integer_zerop (DR_STEP (dr_info->dr))) | |
1206 | return false; | |
1207 | ||
1208 | /* Strided accesses perform only component accesses, alignment is | |
1209 | irrelevant for them. */ | |
1210 | if (STMT_VINFO_STRIDED_P (stmt_info) | |
1211 | && !STMT_VINFO_GROUPED_ACCESS (stmt_info)) | |
1212 | return false; | |
1213 | ||
1214 | return true; | |
1215 | } | |
ebfd146a | 1216 | |
a5b50aa1 RB |
1217 | /* Function verify_data_ref_alignment |
1218 | ||
89fa689a | 1219 | Return TRUE if DR_INFO can be handled with respect to alignment. */ |
a5b50aa1 | 1220 | |
f4ebbd24 | 1221 | static opt_result |
308bc496 | 1222 | verify_data_ref_alignment (vec_info *vinfo, dr_vec_info *dr_info) |
a5b50aa1 | 1223 | { |
52eab378 | 1224 | enum dr_alignment_support supportable_dr_alignment |
308bc496 | 1225 | = vect_supportable_dr_alignment (vinfo, dr_info, false); |
a5b50aa1 | 1226 | if (!supportable_dr_alignment) |
f4ebbd24 DM |
1227 | return opt_result::failure_at |
1228 | (dr_info->stmt->stmt, | |
1229 | DR_IS_READ (dr_info->dr) | |
1230 | ? "not vectorized: unsupported unaligned load: %T\n" | |
1231 | : "not vectorized: unsupported unaligned store: %T\n", | |
1232 | DR_REF (dr_info->dr)); | |
a5b50aa1 RB |
1233 | |
1234 | if (supportable_dr_alignment != dr_aligned && dump_enabled_p ()) | |
1235 | dump_printf_loc (MSG_NOTE, vect_location, | |
1236 | "Vectorizing an unaligned access.\n"); | |
1237 | ||
f4ebbd24 | 1238 | return opt_result::success (); |
a5b50aa1 RB |
1239 | } |
1240 | ||
ebfd146a IR |
1241 | /* Function vect_verify_datarefs_alignment |
1242 | ||
1243 | Return TRUE if all data references in the loop can be | |
1244 | handled with respect to alignment. */ | |
1245 | ||
f4ebbd24 | 1246 | opt_result |
8df82de2 | 1247 | vect_verify_datarefs_alignment (loop_vec_info loop_vinfo) |
ebfd146a | 1248 | { |
8df82de2 | 1249 | vec<data_reference_p> datarefs = LOOP_VINFO_DATAREFS (loop_vinfo); |
ebfd146a | 1250 | struct data_reference *dr; |
ebfd146a IR |
1251 | unsigned int i; |
1252 | ||
9771b263 | 1253 | FOR_EACH_VEC_ELT (datarefs, i, dr) |
31271e91 | 1254 | { |
8df82de2 | 1255 | dr_vec_info *dr_info = loop_vinfo->lookup_dr (dr); |
d30846a0 | 1256 | if (!vect_relevant_for_alignment_p (dr_info)) |
513ecaea | 1257 | continue; |
52eab378 | 1258 | |
8df82de2 | 1259 | opt_result res = verify_data_ref_alignment (loop_vinfo, dr_info); |
f4ebbd24 DM |
1260 | if (!res) |
1261 | return res; | |
31271e91 | 1262 | } |
4b5caab7 | 1263 | |
f4ebbd24 | 1264 | return opt_result::success (); |
ebfd146a IR |
1265 | } |
1266 | ||
4c9bcf89 RG |
1267 | /* Given an memory reference EXP return whether its alignment is less |
1268 | than its size. */ | |
1269 | ||
1270 | static bool | |
1271 | not_size_aligned (tree exp) | |
1272 | { | |
cc269bb6 | 1273 | if (!tree_fits_uhwi_p (TYPE_SIZE (TREE_TYPE (exp)))) |
4c9bcf89 RG |
1274 | return true; |
1275 | ||
eb1ce453 | 1276 | return (tree_to_uhwi (TYPE_SIZE (TREE_TYPE (exp))) |
4c9bcf89 RG |
1277 | > get_object_alignment (exp)); |
1278 | } | |
ebfd146a IR |
1279 | |
1280 | /* Function vector_alignment_reachable_p | |
1281 | ||
89fa689a | 1282 | Return true if vector alignment for DR_INFO is reachable by peeling |
ebfd146a IR |
1283 | a few loop iterations. Return false otherwise. */ |
1284 | ||
1285 | static bool | |
89fa689a | 1286 | vector_alignment_reachable_p (dr_vec_info *dr_info) |
ebfd146a | 1287 | { |
89fa689a | 1288 | stmt_vec_info stmt_info = dr_info->stmt; |
ebfd146a IR |
1289 | tree vectype = STMT_VINFO_VECTYPE (stmt_info); |
1290 | ||
0d0293ac | 1291 | if (STMT_VINFO_GROUPED_ACCESS (stmt_info)) |
ebfd146a IR |
1292 | { |
1293 | /* For interleaved access we peel only if number of iterations in | |
1294 | the prolog loop ({VF - misalignment}), is a multiple of the | |
1295 | number of the interleaved accesses. */ | |
1296 | int elem_size, mis_in_elements; | |
ebfd146a IR |
1297 | |
1298 | /* FORNOW: handle only known alignment. */ | |
89fa689a | 1299 | if (!known_alignment_for_access_p (dr_info)) |
ebfd146a IR |
1300 | return false; |
1301 | ||
9031b367 RS |
1302 | poly_uint64 nelements = TYPE_VECTOR_SUBPARTS (vectype); |
1303 | poly_uint64 vector_size = GET_MODE_SIZE (TYPE_MODE (vectype)); | |
1304 | elem_size = vector_element_size (vector_size, nelements); | |
89fa689a | 1305 | mis_in_elements = DR_MISALIGNMENT (dr_info) / elem_size; |
ebfd146a | 1306 | |
2c53b149 | 1307 | if (!multiple_p (nelements - mis_in_elements, DR_GROUP_SIZE (stmt_info))) |
ebfd146a IR |
1308 | return false; |
1309 | } | |
1310 | ||
1311 | /* If misalignment is known at the compile time then allow peeling | |
1312 | only if natural alignment is reachable through peeling. */ | |
89fa689a | 1313 | if (known_alignment_for_access_p (dr_info) && !aligned_access_p (dr_info)) |
ebfd146a | 1314 | { |
b8698a0f | 1315 | HOST_WIDE_INT elmsize = |
ebfd146a | 1316 | int_cst_value (TYPE_SIZE_UNIT (TREE_TYPE (vectype))); |
73fbfcad | 1317 | if (dump_enabled_p ()) |
ebfd146a | 1318 | { |
e645e942 | 1319 | dump_printf_loc (MSG_NOTE, vect_location, |
6f795a92 DM |
1320 | "data size = %wd. misalignment = %d.\n", elmsize, |
1321 | DR_MISALIGNMENT (dr_info)); | |
ebfd146a | 1322 | } |
89fa689a | 1323 | if (DR_MISALIGNMENT (dr_info) % elmsize) |
ebfd146a | 1324 | { |
73fbfcad | 1325 | if (dump_enabled_p ()) |
e645e942 TJ |
1326 | dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location, |
1327 | "data size does not divide the misalignment.\n"); | |
ebfd146a IR |
1328 | return false; |
1329 | } | |
1330 | } | |
1331 | ||
89fa689a | 1332 | if (!known_alignment_for_access_p (dr_info)) |
ebfd146a | 1333 | { |
89fa689a RS |
1334 | tree type = TREE_TYPE (DR_REF (dr_info->dr)); |
1335 | bool is_packed = not_size_aligned (DR_REF (dr_info->dr)); | |
73fbfcad | 1336 | if (dump_enabled_p ()) |
e645e942 | 1337 | dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location, |
c2873892 RB |
1338 | "Unknown misalignment, %snaturally aligned\n", |
1339 | is_packed ? "not " : ""); | |
1340 | return targetm.vectorize.vector_alignment_reachable (type, is_packed); | |
ebfd146a IR |
1341 | } |
1342 | ||
1343 | return true; | |
1344 | } | |
1345 | ||
720f5239 | 1346 | |
89fa689a | 1347 | /* Calculate the cost of the memory access represented by DR_INFO. */ |
720f5239 | 1348 | |
92345349 | 1349 | static void |
308bc496 | 1350 | vect_get_data_access_cost (vec_info *vinfo, dr_vec_info *dr_info, |
720f5239 | 1351 | unsigned int *inside_cost, |
92345349 | 1352 | unsigned int *outside_cost, |
c76d9edb RB |
1353 | stmt_vector_for_cost *body_cost_vec, |
1354 | stmt_vector_for_cost *prologue_cost_vec) | |
720f5239 | 1355 | { |
89fa689a | 1356 | stmt_vec_info stmt_info = dr_info->stmt; |
308bc496 | 1357 | loop_vec_info loop_vinfo = dyn_cast <loop_vec_info> (vinfo); |
e8f142e2 RS |
1358 | int ncopies; |
1359 | ||
1360 | if (PURE_SLP_STMT (stmt_info)) | |
1361 | ncopies = 1; | |
1362 | else | |
1363 | ncopies = vect_get_num_copies (loop_vinfo, STMT_VINFO_VECTYPE (stmt_info)); | |
720f5239 | 1364 | |
89fa689a | 1365 | if (DR_IS_READ (dr_info->dr)) |
308bc496 RB |
1366 | vect_get_load_cost (vinfo, stmt_info, ncopies, true, inside_cost, |
1367 | outside_cost, prologue_cost_vec, body_cost_vec, false); | |
720f5239 | 1368 | else |
308bc496 | 1369 | vect_get_store_cost (vinfo,stmt_info, ncopies, inside_cost, body_cost_vec); |
720f5239 | 1370 | |
73fbfcad | 1371 | if (dump_enabled_p ()) |
78c60e3d SS |
1372 | dump_printf_loc (MSG_NOTE, vect_location, |
1373 | "vect_get_data_access_cost: inside_cost = %d, " | |
e645e942 | 1374 | "outside_cost = %d.\n", *inside_cost, *outside_cost); |
720f5239 IR |
1375 | } |
1376 | ||
1377 | ||
b939ea86 RB |
1378 | typedef struct _vect_peel_info |
1379 | { | |
89fa689a | 1380 | dr_vec_info *dr_info; |
34e82342 | 1381 | int npeel; |
b939ea86 RB |
1382 | unsigned int count; |
1383 | } *vect_peel_info; | |
1384 | ||
1385 | typedef struct _vect_peel_extended_info | |
1386 | { | |
308bc496 | 1387 | vec_info *vinfo; |
b939ea86 RB |
1388 | struct _vect_peel_info peel_info; |
1389 | unsigned int inside_cost; | |
1390 | unsigned int outside_cost; | |
b939ea86 RB |
1391 | } *vect_peel_extended_info; |
1392 | ||
1393 | ||
1394 | /* Peeling hashtable helpers. */ | |
1395 | ||
1396 | struct peel_info_hasher : free_ptr_hash <_vect_peel_info> | |
1397 | { | |
1398 | static inline hashval_t hash (const _vect_peel_info *); | |
1399 | static inline bool equal (const _vect_peel_info *, const _vect_peel_info *); | |
1400 | }; | |
1401 | ||
1402 | inline hashval_t | |
1403 | peel_info_hasher::hash (const _vect_peel_info *peel_info) | |
1404 | { | |
1405 | return (hashval_t) peel_info->npeel; | |
1406 | } | |
1407 | ||
1408 | inline bool | |
1409 | peel_info_hasher::equal (const _vect_peel_info *a, const _vect_peel_info *b) | |
1410 | { | |
1411 | return (a->npeel == b->npeel); | |
1412 | } | |
1413 | ||
1414 | ||
/* Insert DR_INFO into peeling hash table PEELING_HTAB with NPEEL as key.
   If an entry for NPEEL already exists, bump its reference count instead
   of creating a new one.  */

static void
vect_peeling_hash_insert (hash_table<peel_info_hasher> *peeling_htab,
			  loop_vec_info loop_vinfo, dr_vec_info *dr_info,
			  int npeel)
{
  struct _vect_peel_info elem, *slot;
  _vect_peel_info **new_slot;
  /* Query support before touching the table so we can penalize
     unsupported accesses below.  */
  bool supportable_dr_alignment
    = vect_supportable_dr_alignment (loop_vinfo, dr_info, true);

  elem.npeel = npeel;
  slot = peeling_htab->find (&elem);
  if (slot)
    slot->count++;
  else
    {
      /* First time we see this peel amount: allocate a new entry.
	 Ownership passes to the table (free_ptr_hash frees it).  */
      slot = XNEW (struct _vect_peel_info);
      slot->npeel = npeel;
      slot->dr_info = dr_info;
      slot->count = 1;
      new_slot = peeling_htab->find_slot (slot, INSERT);
      *new_slot = slot;
    }

  /* With the unlimited cost model, boost options that align an access
     the target cannot otherwise handle, so they win the
     "most frequent" traversal.  */
  if (!supportable_dr_alignment
      && unlimited_cost_model (LOOP_VINFO_LOOP (loop_vinfo)))
    slot->count += VECT_MAX_COST;
}
1445 | ||
1446 | ||
1447 | /* Traverse peeling hash table to find peeling option that aligns maximum | |
1448 | number of data accesses. */ | |
1449 | ||
bf190e8d LC |
1450 | int |
1451 | vect_peeling_hash_get_most_frequent (_vect_peel_info **slot, | |
1452 | _vect_peel_extended_info *max) | |
720f5239 | 1453 | { |
bf190e8d | 1454 | vect_peel_info elem = *slot; |
720f5239 | 1455 | |
44542f8e IR |
1456 | if (elem->count > max->peel_info.count |
1457 | || (elem->count == max->peel_info.count | |
1458 | && max->peel_info.npeel > elem->npeel)) | |
720f5239 IR |
1459 | { |
1460 | max->peel_info.npeel = elem->npeel; | |
1461 | max->peel_info.count = elem->count; | |
89fa689a | 1462 | max->peel_info.dr_info = elem->dr_info; |
720f5239 IR |
1463 | } |
1464 | ||
1465 | return 1; | |
1466 | } | |
1467 | ||
f5ae2856 RS |
/* Get the costs of peeling NPEEL iterations for LOOP_VINFO, checking
   data access costs for all data refs.  If UNKNOWN_MISALIGNMENT is true,
   we assume DR0_INFO's misalignment will be zero after peeling.
   Accumulates into *INSIDE_COST and *OUTSIDE_COST and records individual
   cost entries in BODY_COST_VEC / PROLOGUE_COST_VEC.  */

static void
vect_get_peeling_costs_all_drs (loop_vec_info loop_vinfo,
				dr_vec_info *dr0_info,
				unsigned int *inside_cost,
				unsigned int *outside_cost,
				stmt_vector_for_cost *body_cost_vec,
				stmt_vector_for_cost *prologue_cost_vec,
				unsigned int npeel,
				bool unknown_misalignment)
{
  vec<data_reference_p> datarefs = LOOP_VINFO_DATAREFS (loop_vinfo);
  unsigned i;
  data_reference *dr;

  FOR_EACH_VEC_ELT (datarefs, i, dr)
    {
      dr_vec_info *dr_info = loop_vinfo->lookup_dr (dr);
      if (!vect_relevant_for_alignment_p (dr_info))
	continue;

      /* Temporarily install the misalignment this DR would have after
	 peeling NPEEL iterations, cost the access, then restore.  */
      int save_misalignment;
      save_misalignment = DR_MISALIGNMENT (dr_info);
      if (npeel == 0)
	;  /* No peeling: cost the access as-is.  */
      else if (unknown_misalignment && dr_info == dr0_info)
	/* By assumption, peeling makes DR0 itself aligned.  */
	SET_DR_MISALIGNMENT (dr_info, 0);
      else
	vect_update_misalignment_for_peel (dr_info, dr0_info, npeel);
      vect_get_data_access_cost (loop_vinfo, dr_info, inside_cost, outside_cost,
				 body_cost_vec, prologue_cost_vec);
      SET_DR_MISALIGNMENT (dr_info, save_misalignment);
    }
}
1505 | ||
/* Traverse peeling hash table and calculate cost for each peeling option.
   Find the one with the lowest cost.  Hash-table traversal callback:
   *SLOT is the candidate, MIN tracks the best option seen so far.  */

int
vect_peeling_hash_get_lowest_cost (_vect_peel_info **slot,
				   _vect_peel_extended_info *min)
{
  vect_peel_info elem = *slot;
  int dummy;
  unsigned int inside_cost = 0, outside_cost = 0;
  loop_vec_info loop_vinfo = dyn_cast <loop_vec_info> (min->vinfo);
  stmt_vector_for_cost prologue_cost_vec, body_cost_vec,
		       epilogue_cost_vec;

  prologue_cost_vec.create (2);
  body_cost_vec.create (2);
  epilogue_cost_vec.create (2);

  /* Cost all data refs under the assumption we peel ELEM->npeel
     iterations (misalignments are known here, hence FALSE).  */
  vect_get_peeling_costs_all_drs (loop_vinfo, elem->dr_info, &inside_cost,
				  &outside_cost, &body_cost_vec,
				  &prologue_cost_vec, elem->npeel, false);

  body_cost_vec.release ();

  /* Add the cost of the peeled prologue/epilogue iterations.  */
  outside_cost += vect_get_known_peeling_cost
    (loop_vinfo, elem->npeel, &dummy,
     &LOOP_VINFO_SCALAR_ITERATION_COST (loop_vinfo),
     &prologue_cost_vec, &epilogue_cost_vec);

  /* Prologue and epilogue costs are added to the target model later.
     These costs depend only on the scalar iteration cost, the
     number of peeling iterations finally chosen, and the number of
     misaligned statements.  So discard the information found here.  */
  prologue_cost_vec.release ();
  epilogue_cost_vec.release ();

  /* Keep the cheaper option; ties on inside cost are broken by the
     outside cost.  */
  if (inside_cost < min->inside_cost
      || (inside_cost == min->inside_cost
	  && outside_cost < min->outside_cost))
    {
      min->inside_cost = inside_cost;
      min->outside_cost = outside_cost;
      min->peel_info.dr_info = elem->dr_info;
      min->peel_info.npeel = elem->npeel;
      min->peel_info.count = elem->count;
    }

  /* Non-zero return keeps the traversal going.  */
  return 1;
}
1555 | ||
1556 | ||
1557 | /* Choose best peeling option by traversing peeling hash table and either | |
1558 | choosing an option with the lowest cost (if cost model is enabled) or the | |
1559 | option that aligns as many accesses as possible. */ | |
1560 | ||
1e69cc8f | 1561 | static struct _vect_peel_extended_info |
b939ea86 | 1562 | vect_peeling_hash_choose_best_peeling (hash_table<peel_info_hasher> *peeling_htab, |
ec15a152 | 1563 | loop_vec_info loop_vinfo) |
720f5239 IR |
1564 | { |
1565 | struct _vect_peel_extended_info res; | |
1566 | ||
89fa689a | 1567 | res.peel_info.dr_info = NULL; |
308bc496 | 1568 | res.vinfo = loop_vinfo; |
720f5239 | 1569 | |
8b5e1202 | 1570 | if (!unlimited_cost_model (LOOP_VINFO_LOOP (loop_vinfo))) |
720f5239 IR |
1571 | { |
1572 | res.inside_cost = INT_MAX; | |
1573 | res.outside_cost = INT_MAX; | |
b939ea86 RB |
1574 | peeling_htab->traverse <_vect_peel_extended_info *, |
1575 | vect_peeling_hash_get_lowest_cost> (&res); | |
720f5239 IR |
1576 | } |
1577 | else | |
1578 | { | |
1579 | res.peel_info.count = 0; | |
b939ea86 RB |
1580 | peeling_htab->traverse <_vect_peel_extended_info *, |
1581 | vect_peeling_hash_get_most_frequent> (&res); | |
1e69cc8f RD |
1582 | res.inside_cost = 0; |
1583 | res.outside_cost = 0; | |
720f5239 IR |
1584 | } |
1585 | ||
1e69cc8f | 1586 | return res; |
720f5239 IR |
1587 | } |
1588 | ||
71595748 RD |
1589 | /* Return true if the new peeling NPEEL is supported. */ |
1590 | ||
1591 | static bool | |
89fa689a | 1592 | vect_peeling_supportable (loop_vec_info loop_vinfo, dr_vec_info *dr0_info, |
71595748 RD |
1593 | unsigned npeel) |
1594 | { | |
1595 | unsigned i; | |
1596 | struct data_reference *dr = NULL; | |
1597 | vec<data_reference_p> datarefs = LOOP_VINFO_DATAREFS (loop_vinfo); | |
71595748 RD |
1598 | enum dr_alignment_support supportable_dr_alignment; |
1599 | ||
1600 | /* Ensure that all data refs can be vectorized after the peel. */ | |
1601 | FOR_EACH_VEC_ELT (datarefs, i, dr) | |
1602 | { | |
1603 | int save_misalignment; | |
1604 | ||
89fa689a | 1605 | if (dr == dr0_info->dr) |
71595748 RD |
1606 | continue; |
1607 | ||
f5ae2856 | 1608 | dr_vec_info *dr_info = loop_vinfo->lookup_dr (dr); |
d30846a0 | 1609 | if (!vect_relevant_for_alignment_p (dr_info)) |
71595748 RD |
1610 | continue; |
1611 | ||
89fa689a RS |
1612 | save_misalignment = DR_MISALIGNMENT (dr_info); |
1613 | vect_update_misalignment_for_peel (dr_info, dr0_info, npeel); | |
1614 | supportable_dr_alignment | |
308bc496 | 1615 | = vect_supportable_dr_alignment (loop_vinfo, dr_info, false); |
89fa689a | 1616 | SET_DR_MISALIGNMENT (dr_info, save_misalignment); |
71595748 RD |
1617 | |
1618 | if (!supportable_dr_alignment) | |
1619 | return false; | |
1620 | } | |
1621 | ||
1622 | return true; | |
1623 | } | |
720f5239 | 1624 | |
ebfd146a IR |
1625 | /* Function vect_enhance_data_refs_alignment |
1626 | ||
1627 | This pass will use loop versioning and loop peeling in order to enhance | |
1628 | the alignment of data references in the loop. | |
1629 | ||
1630 | FOR NOW: we assume that whatever versioning/peeling takes place, only the | |
ff802fa1 | 1631 | original loop is to be vectorized. Any other loops that are created by |
ebfd146a | 1632 | the transformations performed in this pass - are not supposed to be |
ff802fa1 | 1633 | vectorized. This restriction will be relaxed. |
ebfd146a IR |
1634 | |
1635 | This pass will require a cost model to guide it whether to apply peeling | |
ff802fa1 | 1636 | or versioning or a combination of the two. For example, the scheme that |
ebfd146a IR |
1637 | intel uses when given a loop with several memory accesses, is as follows: |
1638 | choose one memory access ('p') which alignment you want to force by doing | |
ff802fa1 | 1639 | peeling. Then, either (1) generate a loop in which 'p' is aligned and all |
ebfd146a IR |
1640 | other accesses are not necessarily aligned, or (2) use loop versioning to |
1641 | generate one loop in which all accesses are aligned, and another loop in | |
1642 | which only 'p' is necessarily aligned. | |
1643 | ||
1644 | ("Automatic Intra-Register Vectorization for the Intel Architecture", | |
1645 | Aart J.C. Bik, Milind Girkar, Paul M. Grey and Ximmin Tian, International | |
1646 | Journal of Parallel Programming, Vol. 30, No. 2, April 2002.) | |
1647 | ||
ff802fa1 | 1648 | Devising a cost model is the most critical aspect of this work. It will |
ebfd146a | 1649 | guide us on which access to peel for, whether to use loop versioning, how |
ff802fa1 | 1650 | many versions to create, etc. The cost model will probably consist of |
ebfd146a IR |
1651 | generic considerations as well as target specific considerations (on |
1652 | powerpc for example, misaligned stores are more painful than misaligned | |
1653 | loads). | |
1654 | ||
1655 | Here are the general steps involved in alignment enhancements: | |
1656 | ||
1657 | -- original loop, before alignment analysis: | |
1658 | for (i=0; i<N; i++){ | |
1659 | x = q[i]; # DR_MISALIGNMENT(q) = unknown | |
1660 | p[i] = y; # DR_MISALIGNMENT(p) = unknown | |
1661 | } | |
1662 | ||
1663 | -- After vect_compute_data_refs_alignment: | |
1664 | for (i=0; i<N; i++){ | |
1665 | x = q[i]; # DR_MISALIGNMENT(q) = 3 | |
1666 | p[i] = y; # DR_MISALIGNMENT(p) = unknown | |
1667 | } | |
1668 | ||
1669 | -- Possibility 1: we do loop versioning: | |
1670 | if (p is aligned) { | |
1671 | for (i=0; i<N; i++){ # loop 1A | |
1672 | x = q[i]; # DR_MISALIGNMENT(q) = 3 | |
1673 | p[i] = y; # DR_MISALIGNMENT(p) = 0 | |
1674 | } | |
1675 | } | |
1676 | else { | |
1677 | for (i=0; i<N; i++){ # loop 1B | |
1678 | x = q[i]; # DR_MISALIGNMENT(q) = 3 | |
1679 | p[i] = y; # DR_MISALIGNMENT(p) = unaligned | |
1680 | } | |
1681 | } | |
1682 | ||
1683 | -- Possibility 2: we do loop peeling: | |
1684 | for (i = 0; i < 3; i++){ # (scalar loop, not to be vectorized). | |
1685 | x = q[i]; | |
1686 | p[i] = y; | |
1687 | } | |
1688 | for (i = 3; i < N; i++){ # loop 2A | |
1689 | x = q[i]; # DR_MISALIGNMENT(q) = 0 | |
1690 | p[i] = y; # DR_MISALIGNMENT(p) = unknown | |
1691 | } | |
1692 | ||
1693 | -- Possibility 3: combination of loop peeling and versioning: | |
1694 | for (i = 0; i < 3; i++){ # (scalar loop, not to be vectorized). | |
1695 | x = q[i]; | |
1696 | p[i] = y; | |
1697 | } | |
1698 | if (p is aligned) { | |
1699 | for (i = 3; i<N; i++){ # loop 3A | |
1700 | x = q[i]; # DR_MISALIGNMENT(q) = 0 | |
1701 | p[i] = y; # DR_MISALIGNMENT(p) = 0 | |
1702 | } | |
1703 | } | |
1704 | else { | |
1705 | for (i = 3; i<N; i++){ # loop 3B | |
1706 | x = q[i]; # DR_MISALIGNMENT(q) = 0 | |
1707 | p[i] = y; # DR_MISALIGNMENT(p) = unaligned | |
1708 | } | |
1709 | } | |
1710 | ||
ff802fa1 | 1711 | These loops are later passed to loop_transform to be vectorized. The |
ebfd146a IR |
1712 | vectorizer will use the alignment information to guide the transformation |
1713 | (whether to generate regular loads/stores, or with special handling for | |
1714 | misalignment). */ | |
1715 | ||
f4ebbd24 | 1716 | opt_result |
ebfd146a IR |
1717 | vect_enhance_data_refs_alignment (loop_vec_info loop_vinfo) |
1718 | { | |
9771b263 | 1719 | vec<data_reference_p> datarefs = LOOP_VINFO_DATAREFS (loop_vinfo); |
99b1c316 | 1720 | class loop *loop = LOOP_VINFO_LOOP (loop_vinfo); |
4d3d23fb | 1721 | enum dr_alignment_support supportable_dr_alignment; |
89fa689a RS |
1722 | dr_vec_info *first_store = NULL; |
1723 | dr_vec_info *dr0_info = NULL; | |
ebfd146a | 1724 | struct data_reference *dr; |
720f5239 | 1725 | unsigned int i, j; |
ebfd146a IR |
1726 | bool do_peeling = false; |
1727 | bool do_versioning = false; | |
720f5239 | 1728 | unsigned int npeel = 0; |
1e69cc8f RD |
1729 | bool one_misalignment_known = false; |
1730 | bool one_misalignment_unknown = false; | |
4d3d23fb | 1731 | bool one_dr_unsupportable = false; |
89fa689a | 1732 | dr_vec_info *unsupportable_dr_info = NULL; |
d9f21f6a | 1733 | poly_uint64 vf = LOOP_VINFO_VECT_FACTOR (loop_vinfo); |
720f5239 IR |
1734 | unsigned possible_npeel_number = 1; |
1735 | tree vectype; | |
d9f21f6a | 1736 | unsigned int mis, same_align_drs_max = 0; |
b939ea86 | 1737 | hash_table<peel_info_hasher> peeling_htab (1); |
ebfd146a | 1738 | |
adac3a68 | 1739 | DUMP_VECT_SCOPE ("vect_enhance_data_refs_alignment"); |
ebfd146a | 1740 | |
ddf56386 RB |
1741 | /* Reset data so we can safely be called multiple times. */ |
1742 | LOOP_VINFO_MAY_MISALIGN_STMTS (loop_vinfo).truncate (0); | |
1743 | LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo) = 0; | |
1744 | ||
ebfd146a IR |
1745 | /* While cost model enhancements are expected in the future, the high level |
1746 | view of the code at this time is as follows: | |
1747 | ||
673beced RE |
1748 | A) If there is a misaligned access then see if peeling to align |
1749 | this access can make all data references satisfy | |
8f439681 RE |
1750 | vect_supportable_dr_alignment. If so, update data structures |
1751 | as needed and return true. | |
ebfd146a IR |
1752 | |
1753 | B) If peeling wasn't possible and there is a data reference with an | |
1754 | unknown misalignment that does not satisfy vect_supportable_dr_alignment | |
1755 | then see if loop versioning checks can be used to make all data | |
1756 | references satisfy vect_supportable_dr_alignment. If so, update | |
1757 | data structures as needed and return true. | |
1758 | ||
1759 | C) If neither peeling nor versioning were successful then return false if | |
1760 | any data reference does not satisfy vect_supportable_dr_alignment. | |
1761 | ||
1762 | D) Return true (all data references satisfy vect_supportable_dr_alignment). | |
1763 | ||
1764 | Note, Possibility 3 above (which is peeling and versioning together) is not | |
1765 | being done at this time. */ | |
1766 | ||
1767 | /* (1) Peeling to force alignment. */ | |
1768 | ||
1769 | /* (1.1) Decide whether to perform peeling, and how many iterations to peel: | |
1770 | Considerations: | |
1771 | + How many accesses will become aligned due to the peeling | |
1772 | - How many accesses will become unaligned due to the peeling, | |
1773 | and the cost of misaligned accesses. | |
b8698a0f | 1774 | - The cost of peeling (the extra runtime checks, the increase |
720f5239 | 1775 | in code size). */ |
ebfd146a | 1776 | |
9771b263 | 1777 | FOR_EACH_VEC_ELT (datarefs, i, dr) |
ebfd146a | 1778 | { |
f5ae2856 | 1779 | dr_vec_info *dr_info = loop_vinfo->lookup_dr (dr); |
e54353a7 | 1780 | if (!vect_relevant_for_alignment_p (dr_info)) |
319e6439 RG |
1781 | continue; |
1782 | ||
e54353a7 | 1783 | stmt_vec_info stmt_info = dr_info->stmt; |
308bc496 RB |
1784 | supportable_dr_alignment |
1785 | = vect_supportable_dr_alignment (loop_vinfo, dr_info, true); | |
89fa689a | 1786 | do_peeling = vector_alignment_reachable_p (dr_info); |
720f5239 | 1787 | if (do_peeling) |
ebfd146a | 1788 | { |
89fa689a | 1789 | if (known_alignment_for_access_p (dr_info)) |
720f5239 | 1790 | { |
f702e7d4 | 1791 | unsigned int npeel_tmp = 0; |
d8ba5b19 RG |
1792 | bool negative = tree_int_cst_compare (DR_STEP (dr), |
1793 | size_zero_node) < 0; | |
720f5239 | 1794 | |
f702e7d4 | 1795 | vectype = STMT_VINFO_VECTYPE (stmt_info); |
ca31798e AV |
1796 | /* If known_alignment_for_access_p then we have set |
1797 | DR_MISALIGNMENT which is only done if we know it at compiler | |
1798 | time, so it is safe to assume target alignment is constant. | |
1799 | */ | |
1800 | unsigned int target_align = | |
1801 | DR_TARGET_ALIGNMENT (dr_info).to_constant (); | |
89fa689a RS |
1802 | unsigned int dr_size = vect_get_scalar_dr_size (dr_info); |
1803 | mis = (negative | |
1804 | ? DR_MISALIGNMENT (dr_info) | |
1805 | : -DR_MISALIGNMENT (dr_info)); | |
1806 | if (DR_MISALIGNMENT (dr_info) != 0) | |
f702e7d4 | 1807 | npeel_tmp = (mis & (target_align - 1)) / dr_size; |
720f5239 IR |
1808 | |
1809 | /* For multiple types, it is possible that the bigger type access | |
ff802fa1 | 1810 | will have more than one peeling option. E.g., a loop with two |
720f5239 | 1811 | types: one of size (vector size / 4), and the other one of |
ff802fa1 | 1812 | size (vector size / 8). Vectorization factor will 8. If both |
8d21ff9f | 1813 | accesses are misaligned by 3, the first one needs one scalar |
ff802fa1 | 1814 | iteration to be aligned, and the second one needs 5. But the |
6af801f5 | 1815 | first one will be aligned also by peeling 5 scalar |
720f5239 IR |
1816 | iterations, and in that case both accesses will be aligned. |
1817 | Hence, except for the immediate peeling amount, we also want | |
1818 | to try to add full vector size, while we don't exceed | |
1819 | vectorization factor. | |
8d21ff9f RD |
1820 | We do this automatically for cost model, since we calculate |
1821 | cost for every peeling option. */ | |
8b5e1202 | 1822 | if (unlimited_cost_model (LOOP_VINFO_LOOP (loop_vinfo))) |
91ff1504 | 1823 | { |
d9f21f6a | 1824 | poly_uint64 nscalars = (STMT_SLP_TYPE (stmt_info) |
2c53b149 | 1825 | ? vf * DR_GROUP_SIZE (stmt_info) : vf); |
d9f21f6a RS |
1826 | possible_npeel_number |
1827 | = vect_get_num_vectors (nscalars, vectype); | |
720f5239 | 1828 | |
4d3d23fb RD |
1829 | /* NPEEL_TMP is 0 when there is no misalignment, but also |
1830 | allow peeling NELEMENTS. */ | |
89fa689a | 1831 | if (DR_MISALIGNMENT (dr_info) == 0) |
8d21ff9f RD |
1832 | possible_npeel_number++; |
1833 | } | |
720f5239 | 1834 | |
8d21ff9f RD |
1835 | /* Save info about DR in the hash table. Also include peeling |
1836 | amounts according to the explanation above. */ | |
720f5239 IR |
1837 | for (j = 0; j < possible_npeel_number; j++) |
1838 | { | |
b939ea86 | 1839 | vect_peeling_hash_insert (&peeling_htab, loop_vinfo, |
89fa689a | 1840 | dr_info, npeel_tmp); |
f702e7d4 | 1841 | npeel_tmp += target_align / dr_size; |
720f5239 IR |
1842 | } |
1843 | ||
1e69cc8f | 1844 | one_misalignment_known = true; |
720f5239 IR |
1845 | } |
1846 | else | |
1847 | { | |
4ba5ea11 RB |
1848 | /* If we don't know any misalignment values, we prefer |
1849 | peeling for data-ref that has the maximum number of data-refs | |
720f5239 IR |
1850 | with the same alignment, unless the target prefers to align |
1851 | stores over load. */ | |
1e69cc8f RD |
1852 | unsigned same_align_drs |
1853 | = STMT_VINFO_SAME_ALIGN_REFS (stmt_info).length (); | |
89fa689a | 1854 | if (!dr0_info |
1e69cc8f RD |
1855 | || same_align_drs_max < same_align_drs) |
1856 | { | |
1857 | same_align_drs_max = same_align_drs; | |
89fa689a | 1858 | dr0_info = dr_info; |
1e69cc8f RD |
1859 | } |
1860 | /* For data-refs with the same number of related | |
1861 | accesses prefer the one where the misalign | |
1862 | computation will be invariant in the outermost loop. */ | |
1863 | else if (same_align_drs_max == same_align_drs) | |
1864 | { | |
99b1c316 | 1865 | class loop *ivloop0, *ivloop; |
1e69cc8f | 1866 | ivloop0 = outermost_invariant_loop_for_expr |
89fa689a | 1867 | (loop, DR_BASE_ADDRESS (dr0_info->dr)); |
1e69cc8f RD |
1868 | ivloop = outermost_invariant_loop_for_expr |
1869 | (loop, DR_BASE_ADDRESS (dr)); | |
1870 | if ((ivloop && !ivloop0) | |
1871 | || (ivloop && ivloop0 | |
1872 | && flow_loop_nested_p (ivloop, ivloop0))) | |
89fa689a | 1873 | dr0_info = dr_info; |
1e69cc8f | 1874 | } |
720f5239 | 1875 | |
4d3d23fb RD |
1876 | one_misalignment_unknown = true; |
1877 | ||
1878 | /* Check for data refs with unsupportable alignment that | |
1879 | can be peeled. */ | |
1880 | if (!supportable_dr_alignment) | |
1881 | { | |
1882 | one_dr_unsupportable = true; | |
89fa689a | 1883 | unsupportable_dr_info = dr_info; |
4d3d23fb RD |
1884 | } |
1885 | ||
1e69cc8f | 1886 | if (!first_store && DR_IS_WRITE (dr)) |
89fa689a | 1887 | first_store = dr_info; |
720f5239 IR |
1888 | } |
1889 | } | |
1890 | else | |
1891 | { | |
89fa689a | 1892 | if (!aligned_access_p (dr_info)) |
720f5239 | 1893 | { |
73fbfcad | 1894 | if (dump_enabled_p ()) |
e645e942 TJ |
1895 | dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location, |
1896 | "vector alignment may not be reachable\n"); | |
720f5239 IR |
1897 | break; |
1898 | } | |
1899 | } | |
ebfd146a IR |
1900 | } |
1901 | ||
afb119be RB |
1902 | /* Check if we can possibly peel the loop. */ |
1903 | if (!vect_can_advance_ivs_p (loop_vinfo) | |
a6c51a12 YR |
1904 | || !slpeel_can_duplicate_loop_p (loop, single_exit (loop)) |
1905 | || loop->inner) | |
ebfd146a IR |
1906 | do_peeling = false; |
1907 | ||
64812d33 RD |
1908 | struct _vect_peel_extended_info peel_for_known_alignment; |
1909 | struct _vect_peel_extended_info peel_for_unknown_alignment; | |
1910 | struct _vect_peel_extended_info best_peel; | |
1911 | ||
1912 | peel_for_unknown_alignment.inside_cost = INT_MAX; | |
1913 | peel_for_unknown_alignment.outside_cost = INT_MAX; | |
1914 | peel_for_unknown_alignment.peel_info.count = 0; | |
1e69cc8f | 1915 | |
b1aef01e | 1916 | if (do_peeling |
64812d33 | 1917 | && one_misalignment_unknown) |
720f5239 | 1918 | { |
720f5239 IR |
1919 | /* Check if the target requires to prefer stores over loads, i.e., if |
1920 | misaligned stores are more expensive than misaligned loads (taking | |
1921 | drs with same alignment into account). */ | |
64812d33 RD |
1922 | unsigned int load_inside_cost = 0; |
1923 | unsigned int load_outside_cost = 0; | |
1924 | unsigned int store_inside_cost = 0; | |
1925 | unsigned int store_outside_cost = 0; | |
d9f21f6a | 1926 | unsigned int estimated_npeels = vect_vf_for_cost (loop_vinfo) / 2; |
64812d33 RD |
1927 | |
1928 | stmt_vector_for_cost dummy; | |
1929 | dummy.create (2); | |
f5ae2856 | 1930 | vect_get_peeling_costs_all_drs (loop_vinfo, dr0_info, |
64812d33 RD |
1931 | &load_inside_cost, |
1932 | &load_outside_cost, | |
c76d9edb | 1933 | &dummy, &dummy, estimated_npeels, true); |
64812d33 RD |
1934 | dummy.release (); |
1935 | ||
1936 | if (first_store) | |
1937 | { | |
1e69cc8f | 1938 | dummy.create (2); |
f5ae2856 | 1939 | vect_get_peeling_costs_all_drs (loop_vinfo, first_store, |
1e69cc8f RD |
1940 | &store_inside_cost, |
1941 | &store_outside_cost, | |
c76d9edb RB |
1942 | &dummy, &dummy, |
1943 | estimated_npeels, true); | |
9771b263 | 1944 | dummy.release (); |
64812d33 RD |
1945 | } |
1946 | else | |
1947 | { | |
1948 | store_inside_cost = INT_MAX; | |
1949 | store_outside_cost = INT_MAX; | |
1950 | } | |
720f5239 | 1951 | |
64812d33 RD |
1952 | if (load_inside_cost > store_inside_cost |
1953 | || (load_inside_cost == store_inside_cost | |
1954 | && load_outside_cost > store_outside_cost)) | |
1955 | { | |
89fa689a | 1956 | dr0_info = first_store; |
64812d33 RD |
1957 | peel_for_unknown_alignment.inside_cost = store_inside_cost; |
1958 | peel_for_unknown_alignment.outside_cost = store_outside_cost; | |
1959 | } | |
1960 | else | |
1961 | { | |
1962 | peel_for_unknown_alignment.inside_cost = load_inside_cost; | |
1963 | peel_for_unknown_alignment.outside_cost = load_outside_cost; | |
1964 | } | |
1e69cc8f | 1965 | |
64812d33 RD |
1966 | stmt_vector_for_cost prologue_cost_vec, epilogue_cost_vec; |
1967 | prologue_cost_vec.create (2); | |
1968 | epilogue_cost_vec.create (2); | |
1e69cc8f | 1969 | |
64812d33 RD |
1970 | int dummy2; |
1971 | peel_for_unknown_alignment.outside_cost += vect_get_known_peeling_cost | |
d9f21f6a | 1972 | (loop_vinfo, estimated_npeels, &dummy2, |
64812d33 RD |
1973 | &LOOP_VINFO_SCALAR_ITERATION_COST (loop_vinfo), |
1974 | &prologue_cost_vec, &epilogue_cost_vec); | |
1e69cc8f | 1975 | |
64812d33 RD |
1976 | prologue_cost_vec.release (); |
1977 | epilogue_cost_vec.release (); | |
720f5239 | 1978 | |
64812d33 | 1979 | peel_for_unknown_alignment.peel_info.count = 1 |
89fa689a | 1980 | + STMT_VINFO_SAME_ALIGN_REFS (dr0_info->stmt).length (); |
720f5239 IR |
1981 | } |
1982 | ||
64812d33 | 1983 | peel_for_unknown_alignment.peel_info.npeel = 0; |
89fa689a | 1984 | peel_for_unknown_alignment.peel_info.dr_info = dr0_info; |
64812d33 RD |
1985 | |
1986 | best_peel = peel_for_unknown_alignment; | |
1987 | ||
1e69cc8f RD |
1988 | peel_for_known_alignment.inside_cost = INT_MAX; |
1989 | peel_for_known_alignment.outside_cost = INT_MAX; | |
1990 | peel_for_known_alignment.peel_info.count = 0; | |
89fa689a | 1991 | peel_for_known_alignment.peel_info.dr_info = NULL; |
1e69cc8f RD |
1992 | |
1993 | if (do_peeling && one_misalignment_known) | |
720f5239 IR |
1994 | { |
1995 | /* Peeling is possible, but there is no data access that is not supported | |
64812d33 RD |
1996 | unless aligned. So we try to choose the best possible peeling from |
1997 | the hash table. */ | |
1e69cc8f | 1998 | peel_for_known_alignment = vect_peeling_hash_choose_best_peeling |
ec15a152 | 1999 | (&peeling_htab, loop_vinfo); |
720f5239 IR |
2000 | } |
2001 | ||
1e69cc8f | 2002 | /* Compare costs of peeling for known and unknown alignment. */ |
89fa689a | 2003 | if (peel_for_known_alignment.peel_info.dr_info != NULL |
64812d33 RD |
2004 | && peel_for_unknown_alignment.inside_cost |
2005 | >= peel_for_known_alignment.inside_cost) | |
4d3d23fb RD |
2006 | { |
2007 | best_peel = peel_for_known_alignment; | |
64812d33 | 2008 | |
4d3d23fb RD |
2009 | /* If the best peeling for known alignment has NPEEL == 0, perform no |
2010 | peeling at all except if there is an unsupportable dr that we can | |
2011 | align. */ | |
2012 | if (best_peel.peel_info.npeel == 0 && !one_dr_unsupportable) | |
2013 | do_peeling = false; | |
2014 | } | |
64812d33 | 2015 | |
4d3d23fb RD |
2016 | /* If there is an unsupportable data ref, prefer this over all choices so far |
2017 | since we'd have to discard a chosen peeling except when it accidentally | |
2018 | aligned the unsupportable data ref. */ | |
2019 | if (one_dr_unsupportable) | |
89fa689a | 2020 | dr0_info = unsupportable_dr_info; |
4d3d23fb RD |
2021 | else if (do_peeling) |
2022 | { | |
d629ab44 | 2023 | /* Calculate the penalty for no peeling, i.e. leaving everything as-is. |
ec15a152 | 2024 | TODO: Use nopeel_outside_cost or get rid of it? */ |
4d3d23fb RD |
2025 | unsigned nopeel_inside_cost = 0; |
2026 | unsigned nopeel_outside_cost = 0; | |
64812d33 | 2027 | |
4d3d23fb RD |
2028 | stmt_vector_for_cost dummy; |
2029 | dummy.create (2); | |
f5ae2856 | 2030 | vect_get_peeling_costs_all_drs (loop_vinfo, NULL, &nopeel_inside_cost, |
c76d9edb RB |
2031 | &nopeel_outside_cost, &dummy, &dummy, |
2032 | 0, false); | |
4d3d23fb | 2033 | dummy.release (); |
64812d33 | 2034 | |
4d3d23fb RD |
2035 | /* Add epilogue costs. As we do not peel for alignment here, no prologue |
2036 | costs will be recorded. */ | |
2037 | stmt_vector_for_cost prologue_cost_vec, epilogue_cost_vec; | |
2038 | prologue_cost_vec.create (2); | |
2039 | epilogue_cost_vec.create (2); | |
64812d33 | 2040 | |
4d3d23fb RD |
2041 | int dummy2; |
2042 | nopeel_outside_cost += vect_get_known_peeling_cost | |
2043 | (loop_vinfo, 0, &dummy2, | |
2044 | &LOOP_VINFO_SCALAR_ITERATION_COST (loop_vinfo), | |
2045 | &prologue_cost_vec, &epilogue_cost_vec); | |
2046 | ||
2047 | prologue_cost_vec.release (); | |
2048 | epilogue_cost_vec.release (); | |
64812d33 | 2049 | |
4d3d23fb | 2050 | npeel = best_peel.peel_info.npeel; |
89fa689a | 2051 | dr0_info = best_peel.peel_info.dr_info; |
1e69cc8f | 2052 | |
4d3d23fb RD |
2053 | /* If no peeling is not more expensive than the best peeling we |
2054 | have so far, don't perform any peeling. */ | |
2055 | if (nopeel_inside_cost <= best_peel.inside_cost) | |
2056 | do_peeling = false; | |
2057 | } | |
1e69cc8f | 2058 | |
ebfd146a IR |
2059 | if (do_peeling) |
2060 | { | |
89fa689a | 2061 | stmt_vec_info stmt_info = dr0_info->stmt; |
720f5239 | 2062 | vectype = STMT_VINFO_VECTYPE (stmt_info); |
ebfd146a | 2063 | |
89fa689a | 2064 | if (known_alignment_for_access_p (dr0_info)) |
ebfd146a | 2065 | { |
89fa689a | 2066 | bool negative = tree_int_cst_compare (DR_STEP (dr0_info->dr), |
d8ba5b19 | 2067 | size_zero_node) < 0; |
720f5239 IR |
2068 | if (!npeel) |
2069 | { | |
2070 | /* Since it's known at compile time, compute the number of | |
2071 | iterations in the peeled loop (the peeling factor) for use in | |
2072 | updating DR_MISALIGNMENT values. The peeling factor is the | |
2073 | vectorization factor minus the misalignment as an element | |
2074 | count. */ | |
89fa689a RS |
2075 | mis = (negative |
2076 | ? DR_MISALIGNMENT (dr0_info) | |
2077 | : -DR_MISALIGNMENT (dr0_info)); | |
ca31798e AV |
2078 | /* If known_alignment_for_access_p then we have set |
2079 | DR_MISALIGNMENT which is only done if we know it at compiler | |
2080 | time, so it is safe to assume target alignment is constant. | |
2081 | */ | |
2082 | unsigned int target_align = | |
2083 | DR_TARGET_ALIGNMENT (dr0_info).to_constant (); | |
f702e7d4 | 2084 | npeel = ((mis & (target_align - 1)) |
89fa689a | 2085 | / vect_get_scalar_dr_size (dr0_info)); |
720f5239 | 2086 | } |
ebfd146a | 2087 | |
b8698a0f | 2088 | /* For interleaved data access every iteration accesses all the |
ebfd146a IR |
2089 | members of the group, therefore we divide the number of iterations |
2090 | by the group size. */ | |
0d0293ac | 2091 | if (STMT_VINFO_GROUPED_ACCESS (stmt_info)) |
2c53b149 | 2092 | npeel /= DR_GROUP_SIZE (stmt_info); |
ebfd146a | 2093 | |
73fbfcad | 2094 | if (dump_enabled_p ()) |
78c60e3d | 2095 | dump_printf_loc (MSG_NOTE, vect_location, |
e645e942 | 2096 | "Try peeling by %d\n", npeel); |
ebfd146a IR |
2097 | } |
2098 | ||
71595748 | 2099 | /* Ensure that all datarefs can be vectorized after the peel. */ |
89fa689a | 2100 | if (!vect_peeling_supportable (loop_vinfo, dr0_info, npeel)) |
71595748 | 2101 | do_peeling = false; |
ebfd146a | 2102 | |
71595748 | 2103 | /* Check if all datarefs are supportable and log. */ |
89fa689a | 2104 | if (do_peeling && known_alignment_for_access_p (dr0_info) && npeel == 0) |
720f5239 | 2105 | { |
f4ebbd24 | 2106 | opt_result stat = vect_verify_datarefs_alignment (loop_vinfo); |
720f5239 IR |
2107 | if (!stat) |
2108 | do_peeling = false; | |
2109 | else | |
ec15a152 | 2110 | return stat; |
720f5239 IR |
2111 | } |
2112 | ||
476c1280 | 2113 | /* Cost model #1 - honor --param vect-max-peeling-for-alignment. */ |
4f17aa0b XDL |
2114 | if (do_peeling) |
2115 | { | |
2116 | unsigned max_allowed_peel | |
028d4092 | 2117 | = param_vect_max_peeling_for_alignment; |
247afa98 RB |
2118 | if (flag_vect_cost_model == VECT_COST_MODEL_CHEAP) |
2119 | max_allowed_peel = 0; | |
4f17aa0b XDL |
2120 | if (max_allowed_peel != (unsigned)-1) |
2121 | { | |
2122 | unsigned max_peel = npeel; | |
2123 | if (max_peel == 0) | |
2124 | { | |
ca31798e AV |
2125 | poly_uint64 target_align = DR_TARGET_ALIGNMENT (dr0_info); |
2126 | unsigned HOST_WIDE_INT target_align_c; | |
2127 | if (target_align.is_constant (&target_align_c)) | |
2128 | max_peel = | |
2129 | target_align_c / vect_get_scalar_dr_size (dr0_info) - 1; | |
2130 | else | |
2131 | { | |
2132 | do_peeling = false; | |
2133 | if (dump_enabled_p ()) | |
2134 | dump_printf_loc (MSG_NOTE, vect_location, | |
2135 | "Disable peeling, max peels set and vector" | |
2136 | " alignment unknown\n"); | |
2137 | } | |
4f17aa0b XDL |
2138 | } |
2139 | if (max_peel > max_allowed_peel) | |
2140 | { | |
2141 | do_peeling = false; | |
2142 | if (dump_enabled_p ()) | |
2143 | dump_printf_loc (MSG_NOTE, vect_location, | |
2144 | "Disable peeling, max peels reached: %d\n", max_peel); | |
2145 | } | |
2146 | } | |
2147 | } | |
2148 | ||
476c1280 | 2149 | /* Cost model #2 - if peeling may result in a remaining loop not |
d9f21f6a RS |
2150 | iterating enough to be vectorized then do not peel. Since this |
2151 | is a cost heuristic rather than a correctness decision, use the | |
2152 | most likely runtime value for variable vectorization factors. */ | |
476c1280 RB |
2153 | if (do_peeling |
2154 | && LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo)) | |
2155 | { | |
d9f21f6a RS |
2156 | unsigned int assumed_vf = vect_vf_for_cost (loop_vinfo); |
2157 | unsigned int max_peel = npeel == 0 ? assumed_vf - 1 : npeel; | |
2158 | if ((unsigned HOST_WIDE_INT) LOOP_VINFO_INT_NITERS (loop_vinfo) | |
2159 | < assumed_vf + max_peel) | |
476c1280 RB |
2160 | do_peeling = false; |
2161 | } | |
2162 | ||
ebfd146a IR |
2163 | if (do_peeling) |
2164 | { | |
2165 | /* (1.2) Update the DR_MISALIGNMENT of each data reference DR_i. | |
2166 | If the misalignment of DR_i is identical to that of dr0 then set | |
2167 | DR_MISALIGNMENT (DR_i) to zero. If the misalignment of DR_i and | |
2168 | dr0 are known at compile time then increment DR_MISALIGNMENT (DR_i) | |
2169 | by the peeling factor times the element size of DR_i (MOD the | |
2170 | vectorization factor times the size). Otherwise, the | |
2171 | misalignment of DR_i must be set to unknown. */ | |
9771b263 | 2172 | FOR_EACH_VEC_ELT (datarefs, i, dr) |
89fa689a | 2173 | if (dr != dr0_info->dr) |
ccbd7103 | 2174 | { |
f5ae2856 | 2175 | dr_vec_info *dr_info = loop_vinfo->lookup_dr (dr); |
e54353a7 | 2176 | if (!vect_relevant_for_alignment_p (dr_info)) |
ccbd7103 RB |
2177 | continue; |
2178 | ||
89fa689a | 2179 | vect_update_misalignment_for_peel (dr_info, dr0_info, npeel); |
ccbd7103 | 2180 | } |
ebfd146a | 2181 | |
1e5e6ff5 | 2182 | LOOP_VINFO_UNALIGNED_DR (loop_vinfo) = dr0_info; |
720f5239 | 2183 | if (npeel) |
15e693cc | 2184 | LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo) = npeel; |
720f5239 | 2185 | else |
15e693cc | 2186 | LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo) |
89fa689a RS |
2187 | = DR_MISALIGNMENT (dr0_info); |
2188 | SET_DR_MISALIGNMENT (dr0_info, 0); | |
73fbfcad | 2189 | if (dump_enabled_p ()) |
78c60e3d SS |
2190 | { |
2191 | dump_printf_loc (MSG_NOTE, vect_location, | |
e645e942 | 2192 | "Alignment of access forced using peeling.\n"); |
78c60e3d | 2193 | dump_printf_loc (MSG_NOTE, vect_location, |
e645e942 | 2194 | "Peeling for alignment will be applied.\n"); |
78c60e3d | 2195 | } |
ec15a152 | 2196 | |
62c00445 RB |
2197 | /* The inside-loop cost will be accounted for in vectorizable_load |
2198 | and vectorizable_store correctly with adjusted alignments. | |
2199 | Drop the body_cst_vec on the floor here. */ | |
f4ebbd24 | 2200 | opt_result stat = vect_verify_datarefs_alignment (loop_vinfo); |
ebfd146a IR |
2201 | gcc_assert (stat); |
2202 | return stat; | |
2203 | } | |
2204 | } | |
2205 | ||
ebfd146a IR |
2206 | /* (2) Versioning to force alignment. */ |
2207 | ||
2208 | /* Try versioning if: | |
247afa98 | 2209 | 1) optimize loop for speed and the cost-model is not cheap |
d6d11272 | 2210 | 2) there is at least one unsupported misaligned data ref with an unknown |
ebfd146a | 2211 | misalignment, and |
d6d11272 XDL |
2212 | 3) all misaligned data refs with a known misalignment are supported, and |
2213 | 4) the number of runtime alignment checks is within reason. */ | |
ebfd146a | 2214 | |
247afa98 RB |
2215 | do_versioning |
2216 | = (optimize_loop_nest_for_speed_p (loop) | |
2217 | && !loop->inner /* FORNOW */ | |
9d99596e | 2218 | && flag_vect_cost_model != VECT_COST_MODEL_CHEAP); |
ebfd146a IR |
2219 | |
2220 | if (do_versioning) | |
2221 | { | |
9771b263 | 2222 | FOR_EACH_VEC_ELT (datarefs, i, dr) |
ebfd146a | 2223 | { |
f5ae2856 | 2224 | dr_vec_info *dr_info = loop_vinfo->lookup_dr (dr); |
89fa689a | 2225 | if (aligned_access_p (dr_info) |
d30846a0 | 2226 | || !vect_relevant_for_alignment_p (dr_info)) |
ebfd146a IR |
2227 | continue; |
2228 | ||
d30846a0 | 2229 | stmt_vec_info stmt_info = dr_info->stmt; |
f2e2a985 | 2230 | if (STMT_VINFO_STRIDED_P (stmt_info)) |
7b5fc413 | 2231 | { |
7b5fc413 RB |
2232 | do_versioning = false; |
2233 | break; | |
2234 | } | |
319e6439 | 2235 | |
89fa689a | 2236 | supportable_dr_alignment |
308bc496 | 2237 | = vect_supportable_dr_alignment (loop_vinfo, dr_info, false); |
ebfd146a IR |
2238 | |
2239 | if (!supportable_dr_alignment) | |
2240 | { | |
ebfd146a IR |
2241 | int mask; |
2242 | tree vectype; | |
2243 | ||
89fa689a | 2244 | if (known_alignment_for_access_p (dr_info) |
9771b263 | 2245 | || LOOP_VINFO_MAY_MISALIGN_STMTS (loop_vinfo).length () |
028d4092 | 2246 | >= (unsigned) param_vect_max_version_for_alignment_checks) |
ebfd146a IR |
2247 | { |
2248 | do_versioning = false; | |
2249 | break; | |
2250 | } | |
2251 | ||
78e02b3b RS |
2252 | vectype = STMT_VINFO_VECTYPE (stmt_info); |
2253 | gcc_assert (vectype); | |
b8698a0f | 2254 | |
cf098191 RS |
2255 | /* At present we don't support versioning for alignment |
2256 | with variable VF, since there's no guarantee that the | |
2257 | VF is a power of two. We could relax this if we added | |
2258 | a way of enforcing a power-of-two size. */ | |
2259 | unsigned HOST_WIDE_INT size; | |
2260 | if (!GET_MODE_SIZE (TYPE_MODE (vectype)).is_constant (&size)) | |
2261 | { | |
2262 | do_versioning = false; | |
2263 | break; | |
2264 | } | |
2265 | ||
c9aa9108 JR |
2266 | /* Forcing alignment in the first iteration is no good if |
2267 | we don't keep it across iterations. For now, just disable | |
2268 | versioning in this case. | |
6647c1e8 JJ |
2269 | ?? We could actually unroll the loop to achieve the required |
2270 | overall step alignment, and forcing the alignment could be | |
c9aa9108 JR |
2271 | done by doing some iterations of the non-vectorized loop. */ |
2272 | if (!multiple_p (LOOP_VINFO_VECT_FACTOR (loop_vinfo) | |
2273 | * DR_STEP_ALIGNMENT (dr), | |
6647c1e8 | 2274 | DR_TARGET_ALIGNMENT (dr_info))) |
c9aa9108 JR |
2275 | { |
2276 | do_versioning = false; | |
2277 | break; | |
2278 | } | |
2279 | ||
ebfd146a IR |
2280 | /* The rightmost bits of an aligned address must be zeros. |
2281 | Construct the mask needed for this test. For example, | |
2282 | GET_MODE_SIZE for the vector mode V4SI is 16 bytes so the | |
2283 | mask must be 15 = 0xf. */ | |
cf098191 | 2284 | mask = size - 1; |
ebfd146a | 2285 | |
557532d1 RS |
2286 | /* FORNOW: use the same mask to test all potentially unaligned |
2287 | references in the loop. */ | |
2288 | if (LOOP_VINFO_PTR_MASK (loop_vinfo) | |
2289 | && LOOP_VINFO_PTR_MASK (loop_vinfo) != mask) | |
2290 | { | |
2291 | do_versioning = false; | |
2292 | break; | |
2293 | } | |
2294 | ||
ebfd146a | 2295 | LOOP_VINFO_PTR_MASK (loop_vinfo) = mask; |
78e02b3b | 2296 | LOOP_VINFO_MAY_MISALIGN_STMTS (loop_vinfo).safe_push (stmt_info); |
ebfd146a IR |
2297 | } |
2298 | } | |
b8698a0f | 2299 | |
ebfd146a | 2300 | /* Versioning requires at least one misaligned data reference. */ |
e9dbe7bb | 2301 | if (!LOOP_REQUIRES_VERSIONING_FOR_ALIGNMENT (loop_vinfo)) |
ebfd146a IR |
2302 | do_versioning = false; |
2303 | else if (!do_versioning) | |
9771b263 | 2304 | LOOP_VINFO_MAY_MISALIGN_STMTS (loop_vinfo).truncate (0); |
ebfd146a IR |
2305 | } |
2306 | ||
2307 | if (do_versioning) | |
2308 | { | |
7bcbf2d8 | 2309 | vec<stmt_vec_info> may_misalign_stmts |
ebfd146a | 2310 | = LOOP_VINFO_MAY_MISALIGN_STMTS (loop_vinfo); |
7bcbf2d8 | 2311 | stmt_vec_info stmt_info; |
ebfd146a IR |
2312 | |
2313 | /* It can now be assumed that the data references in the statements | |
2314 | in LOOP_VINFO_MAY_MISALIGN_STMTS will be aligned in the version | |
2315 | of the loop being vectorized. */ | |
7bcbf2d8 | 2316 | FOR_EACH_VEC_ELT (may_misalign_stmts, i, stmt_info) |
ebfd146a | 2317 | { |
89fa689a RS |
2318 | dr_vec_info *dr_info = STMT_VINFO_DR_INFO (stmt_info); |
2319 | SET_DR_MISALIGNMENT (dr_info, 0); | |
73fbfcad | 2320 | if (dump_enabled_p ()) |
e645e942 TJ |
2321 | dump_printf_loc (MSG_NOTE, vect_location, |
2322 | "Alignment of access forced using versioning.\n"); | |
ebfd146a IR |
2323 | } |
2324 | ||
73fbfcad | 2325 | if (dump_enabled_p ()) |
e645e942 TJ |
2326 | dump_printf_loc (MSG_NOTE, vect_location, |
2327 | "Versioning for alignment will be applied.\n"); | |
ebfd146a IR |
2328 | |
2329 | /* Peeling and versioning can't be done together at this time. */ | |
2330 | gcc_assert (! (do_peeling && do_versioning)); | |
2331 | ||
f4ebbd24 | 2332 | opt_result stat = vect_verify_datarefs_alignment (loop_vinfo); |
ebfd146a IR |
2333 | gcc_assert (stat); |
2334 | return stat; | |
2335 | } | |
2336 | ||
2337 | /* This point is reached if neither peeling nor versioning is being done. */ | |
2338 | gcc_assert (! (do_peeling || do_versioning)); | |
2339 | ||
f4ebbd24 | 2340 | opt_result stat = vect_verify_datarefs_alignment (loop_vinfo); |
ebfd146a IR |
2341 | return stat; |
2342 | } | |
2343 | ||
2344 | ||
777e1f09 RG |
2345 | /* Function vect_find_same_alignment_drs. |
2346 | ||
f5ae2856 | 2347 | Update group and alignment relations in VINFO according to the chosen |
777e1f09 RG |
2348 | vectorization factor. */ |
2349 | ||
2350 | static void | |
f5ae2856 | 2351 | vect_find_same_alignment_drs (vec_info *vinfo, data_dependence_relation *ddr) |
777e1f09 | 2352 | { |
777e1f09 RG |
2353 | struct data_reference *dra = DDR_A (ddr); |
2354 | struct data_reference *drb = DDR_B (ddr); | |
f5ae2856 RS |
2355 | dr_vec_info *dr_info_a = vinfo->lookup_dr (dra); |
2356 | dr_vec_info *dr_info_b = vinfo->lookup_dr (drb); | |
89fa689a RS |
2357 | stmt_vec_info stmtinfo_a = dr_info_a->stmt; |
2358 | stmt_vec_info stmtinfo_b = dr_info_b->stmt; | |
777e1f09 RG |
2359 | |
2360 | if (DDR_ARE_DEPENDENT (ddr) == chrec_known) | |
2361 | return; | |
2362 | ||
720f5239 | 2363 | if (dra == drb) |
777e1f09 RG |
2364 | return; |
2365 | ||
5fa23466 RB |
2366 | if (STMT_VINFO_GATHER_SCATTER_P (stmtinfo_a) |
2367 | || STMT_VINFO_GATHER_SCATTER_P (stmtinfo_b)) | |
2368 | return; | |
2369 | ||
62c8a2cf | 2370 | if (!operand_equal_p (DR_BASE_ADDRESS (dra), DR_BASE_ADDRESS (drb), 0) |
748bbe72 RS |
2371 | || !operand_equal_p (DR_OFFSET (dra), DR_OFFSET (drb), 0) |
2372 | || !operand_equal_p (DR_STEP (dra), DR_STEP (drb), 0)) | |
777e1f09 RG |
2373 | return; |
2374 | ||
748bbe72 | 2375 | /* Two references with distance zero have the same alignment. */ |
c0a46545 RS |
2376 | poly_offset_int diff = (wi::to_poly_offset (DR_INIT (dra)) |
2377 | - wi::to_poly_offset (DR_INIT (drb))); | |
2378 | if (maybe_ne (diff, 0)) | |
777e1f09 | 2379 | { |
748bbe72 | 2380 | /* Get the wider of the two alignments. */ |
ca31798e AV |
2381 | poly_uint64 align_a = |
2382 | exact_div (vect_calculate_target_alignment (dr_info_a), | |
2383 | BITS_PER_UNIT); | |
2384 | poly_uint64 align_b = | |
2385 | exact_div (vect_calculate_target_alignment (dr_info_b), | |
2386 | BITS_PER_UNIT); | |
2387 | unsigned HOST_WIDE_INT align_a_c, align_b_c; | |
2388 | if (!align_a.is_constant (&align_a_c) | |
2389 | || !align_b.is_constant (&align_b_c)) | |
2390 | return; | |
2391 | ||
2392 | unsigned HOST_WIDE_INT max_align = MAX (align_a_c, align_b_c); | |
748bbe72 RS |
2393 | |
2394 | /* Require the gap to be a multiple of the larger vector alignment. */ | |
c0a46545 | 2395 | if (!multiple_p (diff, max_align)) |
748bbe72 RS |
2396 | return; |
2397 | } | |
777e1f09 | 2398 | |
748bbe72 RS |
2399 | STMT_VINFO_SAME_ALIGN_REFS (stmtinfo_a).safe_push (drb); |
2400 | STMT_VINFO_SAME_ALIGN_REFS (stmtinfo_b).safe_push (dra); | |
2401 | if (dump_enabled_p ()) | |
3c2a8ed0 DM |
2402 | dump_printf_loc (MSG_NOTE, vect_location, |
2403 | "accesses have the same alignment: %T and %T\n", | |
2404 | DR_REF (dra), DR_REF (drb)); | |
777e1f09 RG |
2405 | } |
2406 | ||
2407 | ||
ebfd146a IR |
2408 | /* Function vect_analyze_data_refs_alignment |
2409 | ||
2410 | Analyze the alignment of the data-references in the loop. | |
2411 | Return FALSE if a data reference is found that cannot be vectorized. */ | |
2412 | ||
f4ebbd24 | 2413 | opt_result |
8df82de2 | 2414 | vect_analyze_data_refs_alignment (loop_vec_info loop_vinfo) |
ebfd146a | 2415 | { |
adac3a68 | 2416 | DUMP_VECT_SCOPE ("vect_analyze_data_refs_alignment"); |
ebfd146a | 2417 | |
777e1f09 RG |
2418 | /* Mark groups of data references with same alignment using |
2419 | data dependence information. */ | |
8df82de2 | 2420 | vec<ddr_p> ddrs = LOOP_VINFO_DDRS (loop_vinfo); |
a5b50aa1 RB |
2421 | struct data_dependence_relation *ddr; |
2422 | unsigned int i; | |
2423 | ||
2424 | FOR_EACH_VEC_ELT (ddrs, i, ddr) | |
8df82de2 | 2425 | vect_find_same_alignment_drs (loop_vinfo, ddr); |
a5b50aa1 | 2426 | |
8df82de2 | 2427 | vec<data_reference_p> datarefs = LOOP_VINFO_DATAREFS (loop_vinfo); |
a5b50aa1 RB |
2428 | struct data_reference *dr; |
2429 | ||
8df82de2 | 2430 | vect_record_base_alignments (loop_vinfo); |
a5b50aa1 | 2431 | FOR_EACH_VEC_ELT (datarefs, i, dr) |
777e1f09 | 2432 | { |
8df82de2 | 2433 | dr_vec_info *dr_info = loop_vinfo->lookup_dr (dr); |
89fa689a | 2434 | if (STMT_VINFO_VECTORIZABLE (dr_info->stmt)) |
8df82de2 | 2435 | vect_compute_data_ref_alignment (loop_vinfo, dr_info); |
777e1f09 RG |
2436 | } |
2437 | ||
f4ebbd24 | 2438 | return opt_result::success (); |
a5b50aa1 RB |
2439 | } |
2440 | ||
2441 | ||
2442 | /* Analyze alignment of DRs of stmts in NODE. */ | |
2443 | ||
2444 | static bool | |
308bc496 | 2445 | vect_slp_analyze_and_verify_node_alignment (vec_info *vinfo, slp_tree node) |
a5b50aa1 | 2446 | { |
52eab378 RB |
2447 | /* We vectorize from the first scalar stmt in the node unless |
2448 | the node is permuted in which case we start from the first | |
2449 | element in the group. */ | |
b9787581 | 2450 | stmt_vec_info first_stmt_info = SLP_TREE_SCALAR_STMTS (node)[0]; |
89fa689a | 2451 | dr_vec_info *first_dr_info = STMT_VINFO_DR_INFO (first_stmt_info); |
52eab378 | 2452 | if (SLP_TREE_LOAD_PERMUTATION (node).exists ()) |
bffb8014 | 2453 | first_stmt_info = DR_GROUP_FIRST_ELEMENT (first_stmt_info); |
52eab378 | 2454 | |
89fa689a | 2455 | dr_vec_info *dr_info = STMT_VINFO_DR_INFO (first_stmt_info); |
308bc496 | 2456 | vect_compute_data_ref_alignment (vinfo, dr_info); |
6924b5e6 | 2457 | /* In several places we need alignment of the first element anyway. */ |
89fa689a | 2458 | if (dr_info != first_dr_info) |
308bc496 | 2459 | vect_compute_data_ref_alignment (vinfo, first_dr_info); |
6924b5e6 RB |
2460 | |
2461 | /* For creating the data-ref pointer we need alignment of the | |
2462 | first element as well. */ | |
2463 | first_stmt_info = vect_find_first_scalar_stmt_in_slp (node); | |
2464 | if (first_stmt_info != SLP_TREE_SCALAR_STMTS (node)[0]) | |
2465 | { | |
2466 | first_dr_info = STMT_VINFO_DR_INFO (first_stmt_info); | |
2467 | if (dr_info != first_dr_info) | |
2468 | vect_compute_data_ref_alignment (vinfo, first_dr_info); | |
2469 | } | |
2470 | ||
308bc496 | 2471 | if (! verify_data_ref_alignment (vinfo, dr_info)) |
ebfd146a | 2472 | { |
52eab378 RB |
2473 | if (dump_enabled_p ()) |
2474 | dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location, | |
2475 | "not vectorized: bad data alignment in basic " | |
2476 | "block.\n"); | |
2477 | return false; | |
ebfd146a IR |
2478 | } |
2479 | ||
2480 | return true; | |
2481 | } | |
2482 | ||
a5b50aa1 RB |
2483 | /* Function vect_slp_analyze_instance_alignment |
2484 | ||
2485 | Analyze the alignment of the data-references in the SLP instance. | |
2486 | Return FALSE if a data reference is found that cannot be vectorized. */ | |
2487 | ||
2488 | bool | |
308bc496 RB |
2489 | vect_slp_analyze_and_verify_instance_alignment (vec_info *vinfo, |
2490 | slp_instance instance) | |
a5b50aa1 | 2491 | { |
adac3a68 | 2492 | DUMP_VECT_SCOPE ("vect_slp_analyze_and_verify_instance_alignment"); |
a5b50aa1 RB |
2493 | |
2494 | slp_tree node; | |
2495 | unsigned i; | |
2496 | FOR_EACH_VEC_ELT (SLP_INSTANCE_LOADS (instance), i, node) | |
308bc496 | 2497 | if (! vect_slp_analyze_and_verify_node_alignment (vinfo, node)) |
a5b50aa1 RB |
2498 | return false; |
2499 | ||
2500 | node = SLP_INSTANCE_TREE (instance); | |
9758d196 | 2501 | if (STMT_VINFO_DATA_REF (SLP_TREE_REPRESENTATIVE (node)) |
a5b50aa1 | 2502 | && ! vect_slp_analyze_and_verify_node_alignment |
308bc496 | 2503 | (vinfo, SLP_INSTANCE_TREE (instance))) |
a5b50aa1 RB |
2504 | return false; |
2505 | ||
2506 | return true; | |
2507 | } | |
2508 | ||
ebfd146a | 2509 | |
89fa689a | 2510 | /* Analyze groups of accesses: check that DR_INFO belongs to a group of |
0d0293ac MM |
2511 | accesses of legal size, step, etc. Detect gaps, single element |
2512 | interleaving, and other special cases. Set grouped access info. | |
97af59b2 RB |
2513 | Collect groups of strided stores for further use in SLP analysis. |
2514 | Worker for vect_analyze_group_access. */ | |
ebfd146a IR |
2515 | |
2516 | static bool | |
308bc496 | 2517 | vect_analyze_group_access_1 (vec_info *vinfo, dr_vec_info *dr_info) |
ebfd146a | 2518 | { |
89fa689a | 2519 | data_reference *dr = dr_info->dr; |
ebfd146a IR |
2520 | tree step = DR_STEP (dr); |
2521 | tree scalar_type = TREE_TYPE (DR_REF (dr)); | |
2522 | HOST_WIDE_INT type_size = TREE_INT_CST_LOW (TYPE_SIZE_UNIT (scalar_type)); | |
89fa689a | 2523 | stmt_vec_info stmt_info = dr_info->stmt; |
308bc496 RB |
2524 | loop_vec_info loop_vinfo = dyn_cast <loop_vec_info> (vinfo); |
2525 | bb_vec_info bb_vinfo = dyn_cast <bb_vec_info> (vinfo); | |
7b5fc413 | 2526 | HOST_WIDE_INT dr_step = -1; |
0d0293ac | 2527 | HOST_WIDE_INT groupsize, last_accessed_element = 1; |
ebfd146a IR |
2528 | bool slp_impossible = false; |
2529 | ||
0d0293ac MM |
2530 | /* For interleaving, GROUPSIZE is STEP counted in elements, i.e., the |
2531 | size of the interleaving group (including gaps). */ | |
7b5fc413 RB |
2532 | if (tree_fits_shwi_p (step)) |
2533 | { | |
2534 | dr_step = tree_to_shwi (step); | |
993a6bd9 RB |
2535 | /* Check that STEP is a multiple of type size. Otherwise there is |
2536 | a non-element-sized gap at the end of the group which we | |
2c53b149 | 2537 | cannot represent in DR_GROUP_GAP or DR_GROUP_SIZE. |
993a6bd9 | 2538 | ??? As we can handle non-constant step fine here we should |
2c53b149 RB |
2539 | simply remove uses of DR_GROUP_GAP between the last and first |
2540 | element and instead rely on DR_STEP. DR_GROUP_SIZE then would | |
993a6bd9 RB |
2541 | simply not include that gap. */ |
2542 | if ((dr_step % type_size) != 0) | |
2543 | { | |
2544 | if (dump_enabled_p ()) | |
3c2a8ed0 DM |
2545 | dump_printf_loc (MSG_NOTE, vect_location, |
2546 | "Step %T is not a multiple of the element size" | |
2547 | " for %T\n", | |
2548 | step, DR_REF (dr)); | |
993a6bd9 RB |
2549 | return false; |
2550 | } | |
7b5fc413 RB |
2551 | groupsize = absu_hwi (dr_step) / type_size; |
2552 | } | |
2553 | else | |
2554 | groupsize = 0; | |
ebfd146a IR |
2555 | |
2556 | /* Not consecutive access is possible only if it is a part of interleaving. */ | |
78e02b3b | 2557 | if (!DR_GROUP_FIRST_ELEMENT (stmt_info)) |
ebfd146a IR |
2558 | { |
2559 | /* Check if it this DR is a part of interleaving, and is a single | |
2560 | element of the group that is accessed in the loop. */ | |
b8698a0f | 2561 | |
ebfd146a | 2562 | /* Gaps are supported only for loads. STEP must be a multiple of the type |
4aa157e8 | 2563 | size. */ |
ebfd146a IR |
2564 | if (DR_IS_READ (dr) |
2565 | && (dr_step % type_size) == 0 | |
4aa157e8 | 2566 | && groupsize > 0) |
ebfd146a | 2567 | { |
78e02b3b RS |
2568 | DR_GROUP_FIRST_ELEMENT (stmt_info) = stmt_info; |
2569 | DR_GROUP_SIZE (stmt_info) = groupsize; | |
2c53b149 | 2570 | DR_GROUP_GAP (stmt_info) = groupsize - 1; |
73fbfcad | 2571 | if (dump_enabled_p ()) |
3c2a8ed0 DM |
2572 | dump_printf_loc (MSG_NOTE, vect_location, |
2573 | "Detected single element interleaving %T" | |
2574 | " step %T\n", | |
2575 | DR_REF (dr), step); | |
48df3fa6 | 2576 | |
ebfd146a IR |
2577 | return true; |
2578 | } | |
4b5caab7 | 2579 | |
73fbfcad | 2580 | if (dump_enabled_p ()) |
3c2a8ed0 DM |
2581 | dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location, |
2582 | "not consecutive access %G", stmt_info->stmt); | |
4b5caab7 IR |
2583 | |
2584 | if (bb_vinfo) | |
78e02b3b RS |
2585 | { |
2586 | /* Mark the statement as unvectorizable. */ | |
89fa689a | 2587 | STMT_VINFO_VECTORIZABLE (stmt_info) = false; |
78e02b3b RS |
2588 | return true; |
2589 | } | |
78c60e3d | 2590 | |
bbeeac91 DM |
2591 | if (dump_enabled_p ()) |
2592 | dump_printf_loc (MSG_NOTE, vect_location, "using strided accesses\n"); | |
90a7a1b5 RB |
2593 | STMT_VINFO_STRIDED_P (stmt_info) = true; |
2594 | return true; | |
ebfd146a IR |
2595 | } |
2596 | ||
78e02b3b | 2597 | if (DR_GROUP_FIRST_ELEMENT (stmt_info) == stmt_info) |
ebfd146a IR |
2598 | { |
2599 | /* First stmt in the interleaving chain. Check the chain. */ | |
bffb8014 | 2600 | stmt_vec_info next = DR_GROUP_NEXT_ELEMENT (stmt_info); |
ebfd146a | 2601 | struct data_reference *data_ref = dr; |
df398a37 | 2602 | unsigned int count = 1; |
ebfd146a | 2603 | tree prev_init = DR_INIT (data_ref); |
08940f33 | 2604 | HOST_WIDE_INT diff, gaps = 0; |
ebfd146a | 2605 | |
c0a46545 | 2606 | /* By construction, all group members have INTEGER_CST DR_INITs. */ |
ebfd146a IR |
2607 | while (next) |
2608 | { | |
f95b7597 RB |
2609 | /* We never have the same DR multiple times. */ |
2610 | gcc_assert (tree_int_cst_compare (DR_INIT (data_ref), | |
2611 | DR_INIT (STMT_VINFO_DATA_REF (next))) != 0); | |
48df3fa6 | 2612 | |
bffb8014 | 2613 | data_ref = STMT_VINFO_DATA_REF (next); |
ebfd146a | 2614 | |
08940f33 RB |
2615 | /* All group members have the same STEP by construction. */ |
2616 | gcc_checking_assert (operand_equal_p (DR_STEP (data_ref), step, 0)); | |
ebfd146a | 2617 | |
ebfd146a IR |
2618 | /* Check that the distance between two accesses is equal to the type |
2619 | size. Otherwise, we have gaps. */ | |
2620 | diff = (TREE_INT_CST_LOW (DR_INIT (data_ref)) | |
2621 | - TREE_INT_CST_LOW (prev_init)) / type_size; | |
2622 | if (diff != 1) | |
2623 | { | |
2624 | /* FORNOW: SLP of accesses with gaps is not supported. */ | |
2625 | slp_impossible = true; | |
b0af49c4 | 2626 | if (DR_IS_WRITE (data_ref)) |
ebfd146a | 2627 | { |
73fbfcad | 2628 | if (dump_enabled_p ()) |
e645e942 TJ |
2629 | dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location, |
2630 | "interleaved store with gaps\n"); | |
ebfd146a IR |
2631 | return false; |
2632 | } | |
4da39468 IR |
2633 | |
2634 | gaps += diff - 1; | |
ebfd146a IR |
2635 | } |
2636 | ||
48df3fa6 IR |
2637 | last_accessed_element += diff; |
2638 | ||
ebfd146a | 2639 | /* Store the gap from the previous member of the group. If there is no |
2c53b149 | 2640 | gap in the access, DR_GROUP_GAP is always 1. */ |
bffb8014 | 2641 | DR_GROUP_GAP (next) = diff; |
ebfd146a | 2642 | |
bffb8014 RS |
2643 | prev_init = DR_INIT (data_ref); |
2644 | next = DR_GROUP_NEXT_ELEMENT (next); | |
2645 | /* Count the number of data-refs in the chain. */ | |
2646 | count++; | |
ebfd146a IR |
2647 | } |
2648 | ||
7b5fc413 RB |
2649 | if (groupsize == 0) |
2650 | groupsize = count + gaps; | |
ebfd146a | 2651 | |
30fec2f9 RB |
2652 | /* This could be UINT_MAX but as we are generating code in a very |
2653 | inefficient way we have to cap earlier. See PR78699 for example. */ | |
2654 | if (groupsize > 4096) | |
97af59b2 RB |
2655 | { |
2656 | if (dump_enabled_p ()) | |
2657 | dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location, | |
2658 | "group is too large\n"); | |
2659 | return false; | |
2660 | } | |
2661 | ||
7b5fc413 | 2662 | /* Check that the size of the interleaving is equal to count for stores, |
ebfd146a | 2663 | i.e., that there are no gaps. */ |
e004aa11 RB |
2664 | if (groupsize != count |
2665 | && !DR_IS_READ (dr)) | |
ebfd146a | 2666 | { |
203942b8 RS |
2667 | groupsize = count; |
2668 | STMT_VINFO_STRIDED_P (stmt_info) = true; | |
e004aa11 RB |
2669 | } |
2670 | ||
2671 | /* If there is a gap after the last load in the group it is the | |
2672 | difference between the groupsize and the last accessed | |
2673 | element. | |
2674 | When there is no gap, this difference should be 0. */ | |
78e02b3b | 2675 | DR_GROUP_GAP (stmt_info) = groupsize - last_accessed_element; |
ebfd146a | 2676 | |
78e02b3b | 2677 | DR_GROUP_SIZE (stmt_info) = groupsize; |
73fbfcad | 2678 | if (dump_enabled_p ()) |
e004aa11 RB |
2679 | { |
2680 | dump_printf_loc (MSG_NOTE, vect_location, | |
97af59b2 RB |
2681 | "Detected interleaving "); |
2682 | if (DR_IS_READ (dr)) | |
2683 | dump_printf (MSG_NOTE, "load "); | |
203942b8 RS |
2684 | else if (STMT_VINFO_STRIDED_P (stmt_info)) |
2685 | dump_printf (MSG_NOTE, "strided store "); | |
97af59b2 RB |
2686 | else |
2687 | dump_printf (MSG_NOTE, "store "); | |
7ea4b8ed RB |
2688 | dump_printf (MSG_NOTE, "of size %u\n", |
2689 | (unsigned)groupsize); | |
2690 | dump_printf_loc (MSG_NOTE, vect_location, "\t%G", stmt_info->stmt); | |
2691 | next = DR_GROUP_NEXT_ELEMENT (stmt_info); | |
2692 | while (next) | |
2693 | { | |
2694 | if (DR_GROUP_GAP (next) != 1) | |
2695 | dump_printf_loc (MSG_NOTE, vect_location, | |
2696 | "\t<gap of %d elements>\n", | |
2697 | DR_GROUP_GAP (next) - 1); | |
2698 | dump_printf_loc (MSG_NOTE, vect_location, "\t%G", next->stmt); | |
2699 | next = DR_GROUP_NEXT_ELEMENT (next); | |
2700 | } | |
78e02b3b | 2701 | if (DR_GROUP_GAP (stmt_info) != 0) |
e004aa11 | 2702 | dump_printf_loc (MSG_NOTE, vect_location, |
7ea4b8ed | 2703 | "\t<gap of %d elements>\n", |
78e02b3b | 2704 | DR_GROUP_GAP (stmt_info)); |
e004aa11 | 2705 | } |
ebfd146a | 2706 | |
b8698a0f | 2707 | /* SLP: create an SLP data structure for every interleaving group of |
ebfd146a | 2708 | stores for further analysis in vect_analyse_slp. */ |
b0af49c4 | 2709 | if (DR_IS_WRITE (dr) && !slp_impossible) |
78e02b3b RS |
2710 | { |
2711 | if (loop_vinfo) | |
2712 | LOOP_VINFO_GROUPED_STORES (loop_vinfo).safe_push (stmt_info); | |
2713 | if (bb_vinfo) | |
2714 | BB_VINFO_GROUPED_STORES (bb_vinfo).safe_push (stmt_info); | |
2715 | } | |
ebfd146a IR |
2716 | } |
2717 | ||
2718 | return true; | |
2719 | } | |
2720 | ||
89fa689a | 2721 | /* Analyze groups of accesses: check that DR_INFO belongs to a group of |
97af59b2 RB |
2722 | accesses of legal size, step, etc. Detect gaps, single element |
2723 | interleaving, and other special cases. Set grouped access info. | |
2724 | Collect groups of strided stores for further use in SLP analysis. */ | |
2725 | ||
2726 | static bool | |
308bc496 | 2727 | vect_analyze_group_access (vec_info *vinfo, dr_vec_info *dr_info) |
97af59b2 | 2728 | { |
308bc496 | 2729 | if (!vect_analyze_group_access_1 (vinfo, dr_info)) |
97af59b2 RB |
2730 | { |
2731 | /* Dissolve the group if present. */ | |
89fa689a | 2732 | stmt_vec_info stmt_info = DR_GROUP_FIRST_ELEMENT (dr_info->stmt); |
bffb8014 | 2733 | while (stmt_info) |
97af59b2 | 2734 | { |
bffb8014 RS |
2735 | stmt_vec_info next = DR_GROUP_NEXT_ELEMENT (stmt_info); |
2736 | DR_GROUP_FIRST_ELEMENT (stmt_info) = NULL; | |
2737 | DR_GROUP_NEXT_ELEMENT (stmt_info) = NULL; | |
2738 | stmt_info = next; | |
97af59b2 RB |
2739 | } |
2740 | return false; | |
2741 | } | |
2742 | return true; | |
2743 | } | |
ebfd146a | 2744 | |
89fa689a | 2745 | /* Analyze the access pattern of the data-reference DR_INFO. |
ebfd146a | 2746 | In case of non-consecutive accesses call vect_analyze_group_access() to |
0d0293ac | 2747 | analyze groups of accesses. */ |
ebfd146a IR |
2748 | |
2749 | static bool | |
308bc496 | 2750 | vect_analyze_data_ref_access (vec_info *vinfo, dr_vec_info *dr_info) |
ebfd146a | 2751 | { |
89fa689a | 2752 | data_reference *dr = dr_info->dr; |
ebfd146a IR |
2753 | tree step = DR_STEP (dr); |
2754 | tree scalar_type = TREE_TYPE (DR_REF (dr)); | |
89fa689a | 2755 | stmt_vec_info stmt_info = dr_info->stmt; |
308bc496 | 2756 | loop_vec_info loop_vinfo = dyn_cast <loop_vec_info> (vinfo); |
99b1c316 | 2757 | class loop *loop = NULL; |
ebfd146a | 2758 | |
f307441a RS |
2759 | if (STMT_VINFO_GATHER_SCATTER_P (stmt_info)) |
2760 | return true; | |
2761 | ||
a70d6342 IR |
2762 | if (loop_vinfo) |
2763 | loop = LOOP_VINFO_LOOP (loop_vinfo); | |
b8698a0f | 2764 | |
a70d6342 | 2765 | if (loop_vinfo && !step) |
ebfd146a | 2766 | { |
73fbfcad | 2767 | if (dump_enabled_p ()) |
e645e942 TJ |
2768 | dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location, |
2769 | "bad data-ref access in loop\n"); | |
ebfd146a IR |
2770 | return false; |
2771 | } | |
2772 | ||
c134cf2a | 2773 | /* Allow loads with zero step in inner-loop vectorization. */ |
319e6439 | 2774 | if (loop_vinfo && integer_zerop (step)) |
39becbac | 2775 | { |
78e02b3b RS |
2776 | DR_GROUP_FIRST_ELEMENT (stmt_info) = NULL; |
2777 | if (!nested_in_vect_loop_p (loop, stmt_info)) | |
c134cf2a YR |
2778 | return DR_IS_READ (dr); |
2779 | /* Allow references with zero step for outer loops marked | |
2780 | with pragma omp simd only - it guarantees absence of | |
2781 | loop-carried dependencies between inner loop iterations. */ | |
962e91fc | 2782 | if (loop->safelen < 2) |
6e8dad05 RB |
2783 | { |
2784 | if (dump_enabled_p ()) | |
2785 | dump_printf_loc (MSG_NOTE, vect_location, | |
e645e942 | 2786 | "zero step in inner loop of nest\n"); |
6e8dad05 RB |
2787 | return false; |
2788 | } | |
39becbac | 2789 | } |
ebfd146a | 2790 | |
78e02b3b | 2791 | if (loop && nested_in_vect_loop_p (loop, stmt_info)) |
ebfd146a IR |
2792 | { |
2793 | /* Interleaved accesses are not yet supported within outer-loop | |
2794 | vectorization for references in the inner-loop. */ | |
78e02b3b | 2795 | DR_GROUP_FIRST_ELEMENT (stmt_info) = NULL; |
ebfd146a IR |
2796 | |
2797 | /* For the rest of the analysis we use the outer-loop step. */ | |
2798 | step = STMT_VINFO_DR_STEP (stmt_info); | |
319e6439 | 2799 | if (integer_zerop (step)) |
ebfd146a | 2800 | { |
73fbfcad | 2801 | if (dump_enabled_p ()) |
78c60e3d | 2802 | dump_printf_loc (MSG_NOTE, vect_location, |
e645e942 | 2803 | "zero step in outer loop.\n"); |
3bab6342 | 2804 | return DR_IS_READ (dr); |
ebfd146a IR |
2805 | } |
2806 | } | |
2807 | ||
2808 | /* Consecutive? */ | |
319e6439 | 2809 | if (TREE_CODE (step) == INTEGER_CST) |
ebfd146a | 2810 | { |
319e6439 RG |
2811 | HOST_WIDE_INT dr_step = TREE_INT_CST_LOW (step); |
2812 | if (!tree_int_cst_compare (step, TYPE_SIZE_UNIT (scalar_type)) | |
2813 | || (dr_step < 0 | |
2814 | && !compare_tree_int (TYPE_SIZE_UNIT (scalar_type), -dr_step))) | |
2815 | { | |
2816 | /* Mark that it is not interleaving. */ | |
78e02b3b | 2817 | DR_GROUP_FIRST_ELEMENT (stmt_info) = NULL; |
319e6439 RG |
2818 | return true; |
2819 | } | |
ebfd146a IR |
2820 | } |
2821 | ||
78e02b3b | 2822 | if (loop && nested_in_vect_loop_p (loop, stmt_info)) |
ebfd146a | 2823 | { |
73fbfcad | 2824 | if (dump_enabled_p ()) |
78c60e3d | 2825 | dump_printf_loc (MSG_NOTE, vect_location, |
e645e942 | 2826 | "grouped access in outer loop.\n"); |
ebfd146a IR |
2827 | return false; |
2828 | } | |
2829 | ||
7b5fc413 | 2830 | |
319e6439 RG |
2831 | /* Assume this is a DR handled by non-constant strided load case. */ |
2832 | if (TREE_CODE (step) != INTEGER_CST) | |
f2e2a985 | 2833 | return (STMT_VINFO_STRIDED_P (stmt_info) |
7b5fc413 | 2834 | && (!STMT_VINFO_GROUPED_ACCESS (stmt_info) |
308bc496 | 2835 | || vect_analyze_group_access (vinfo, dr_info))); |
319e6439 | 2836 | |
ebfd146a | 2837 | /* Not consecutive access - check if it's a part of interleaving group. */ |
308bc496 | 2838 | return vect_analyze_group_access (vinfo, dr_info); |
ebfd146a IR |
2839 | } |
2840 | ||
5abe1e05 RB |
2841 | /* Compare two data-references DRA and DRB to group them into chunks |
2842 | suitable for grouping. */ | |
2843 | ||
2844 | static int | |
2845 | dr_group_sort_cmp (const void *dra_, const void *drb_) | |
2846 | { | |
2847 | data_reference_p dra = *(data_reference_p *)const_cast<void *>(dra_); | |
2848 | data_reference_p drb = *(data_reference_p *)const_cast<void *>(drb_); | |
5abe1e05 RB |
2849 | int cmp; |
2850 | ||
2851 | /* Stabilize sort. */ | |
2852 | if (dra == drb) | |
2853 | return 0; | |
2854 | ||
8349b024 RB |
2855 | /* DRs in different loops never belong to the same group. */ |
2856 | loop_p loopa = gimple_bb (DR_STMT (dra))->loop_father; | |
2857 | loop_p loopb = gimple_bb (DR_STMT (drb))->loop_father; | |
2858 | if (loopa != loopb) | |
2859 | return loopa->num < loopb->num ? -1 : 1; | |
2860 | ||
5abe1e05 | 2861 | /* Ordering of DRs according to base. */ |
d20eac1b RB |
2862 | cmp = data_ref_compare_tree (DR_BASE_ADDRESS (dra), |
2863 | DR_BASE_ADDRESS (drb)); | |
2864 | if (cmp != 0) | |
2865 | return cmp; | |
5abe1e05 RB |
2866 | |
2867 | /* And according to DR_OFFSET. */ | |
d20eac1b RB |
2868 | cmp = data_ref_compare_tree (DR_OFFSET (dra), DR_OFFSET (drb)); |
2869 | if (cmp != 0) | |
2870 | return cmp; | |
5abe1e05 RB |
2871 | |
2872 | /* Put reads before writes. */ | |
2873 | if (DR_IS_READ (dra) != DR_IS_READ (drb)) | |
2874 | return DR_IS_READ (dra) ? -1 : 1; | |
2875 | ||
2876 | /* Then sort after access size. */ | |
d20eac1b RB |
2877 | cmp = data_ref_compare_tree (TYPE_SIZE_UNIT (TREE_TYPE (DR_REF (dra))), |
2878 | TYPE_SIZE_UNIT (TREE_TYPE (DR_REF (drb)))); | |
2879 | if (cmp != 0) | |
2880 | return cmp; | |
5abe1e05 RB |
2881 | |
2882 | /* And after step. */ | |
d20eac1b RB |
2883 | cmp = data_ref_compare_tree (DR_STEP (dra), DR_STEP (drb)); |
2884 | if (cmp != 0) | |
2885 | return cmp; | |
5abe1e05 RB |
2886 | |
2887 | /* Then sort after DR_INIT. In case of identical DRs sort after stmt UID. */ | |
36fd6408 | 2888 | cmp = data_ref_compare_tree (DR_INIT (dra), DR_INIT (drb)); |
5abe1e05 RB |
2889 | if (cmp == 0) |
2890 | return gimple_uid (DR_STMT (dra)) < gimple_uid (DR_STMT (drb)) ? -1 : 1; | |
2891 | return cmp; | |
2892 | } | |
ebfd146a | 2893 | |
7e11fc7f RS |
2894 | /* If OP is the result of a conversion, return the unconverted value, |
2895 | otherwise return null. */ | |
2896 | ||
2897 | static tree | |
2898 | strip_conversion (tree op) | |
2899 | { | |
2900 | if (TREE_CODE (op) != SSA_NAME) | |
2901 | return NULL_TREE; | |
2902 | gimple *stmt = SSA_NAME_DEF_STMT (op); | |
2903 | if (!is_gimple_assign (stmt) | |
2904 | || !CONVERT_EXPR_CODE_P (gimple_assign_rhs_code (stmt))) | |
2905 | return NULL_TREE; | |
2906 | return gimple_assign_rhs1 (stmt); | |
2907 | } | |
2908 | ||
/* Return true if vectorizable_* routines can handle statements STMT1_INFO
   and STMT2_INFO being in a single group.  When ALLOW_SLP_P, masked loads can
   be grouped in SLP mode.  */

static bool
can_group_stmts_p (stmt_vec_info stmt1_info, stmt_vec_info stmt2_info,
		   bool allow_slp_p)
{
  /* Two plain (unmasked) loads or stores can always be grouped; a plain
     access never groups with a call.  */
  if (gimple_assign_single_p (stmt1_info->stmt))
    return gimple_assign_single_p (stmt2_info->stmt);

  gcall *call1 = dyn_cast <gcall *> (stmt1_info->stmt);
  if (call1 && gimple_call_internal_p (call1))
    {
      /* Check for two masked loads or two masked stores.  */
      gcall *call2 = dyn_cast <gcall *> (stmt2_info->stmt);
      if (!call2 || !gimple_call_internal_p (call2))
	return false;
      internal_fn ifn = gimple_call_internal_fn (call1);
      if (ifn != IFN_MASK_LOAD && ifn != IFN_MASK_STORE)
	return false;
      /* Both must be the same kind of masked access.  */
      if (ifn != gimple_call_internal_fn (call2))
	return false;

      /* Check that the masks are the same.  Cope with casts of masks,
	 like those created by build_mask_conversion.  Masked loads with
	 differing masks are still groupable when ALLOW_SLP_P; masked
	 stores always require equal masks.  */
      tree mask1 = gimple_call_arg (call1, 2);
      tree mask2 = gimple_call_arg (call2, 2);
      if (!operand_equal_p (mask1, mask2, 0)
	  && (ifn == IFN_MASK_STORE || !allow_slp_p))
	{
	  mask1 = strip_conversion (mask1);
	  if (!mask1)
	    return false;
	  mask2 = strip_conversion (mask2);
	  if (!mask2)
	    return false;
	  if (!operand_equal_p (mask1, mask2, 0))
	    return false;
	}
      return true;
    }

  /* Any other statement kind (e.g. a non-internal call) cannot be
     grouped.  */
  return false;
}
2954 | ||
ebfd146a IR |
/* Function vect_analyze_data_ref_accesses.

   Analyze the access pattern of all the data references in the loop.

   FORNOW: the only access pattern that is considered vectorizable is a
   simple step 1 (consecutive) access.

   FORNOW: handle only arrays and pointer accesses.

   Works on a sorted copy of the datarefs: builds interleaving chains
   (DR_GROUP_FIRST_ELEMENT / DR_GROUP_NEXT_ELEMENT), splits groups that
   accumulated duplicate accesses, and finally checks each DR's access
   pattern with vect_analyze_data_ref_access.  */

opt_result
vect_analyze_data_ref_accesses (vec_info *vinfo)
{
  unsigned int i;
  vec<data_reference_p> datarefs = vinfo->shared->datarefs;
  struct data_reference *dr;

  DUMP_VECT_SCOPE ("vect_analyze_data_ref_accesses");

  if (datarefs.is_empty ())
    return opt_result::success ();

  /* Sort the array of datarefs to make building the interleaving chains
     linear.  Don't modify the original vector's order, it is needed for
     determining what dependencies are reversed.  */
  vec<data_reference_p> datarefs_copy = datarefs.copy ();
  datarefs_copy.qsort (dr_group_sort_cmp);
  hash_set<stmt_vec_info> to_fixup;

  /* Build the interleaving chains.  The outer loop picks a group leader
     DRA; the inner loop extends the group with sort-adjacent DRs until
     one of the grouping conditions fails.  */
  for (i = 0; i < datarefs_copy.length () - 1;)
    {
      data_reference_p dra = datarefs_copy[i];
      dr_vec_info *dr_info_a = vinfo->lookup_dr (dra);
      stmt_vec_info stmtinfo_a = dr_info_a->stmt;
      stmt_vec_info lastinfo = NULL;
      if (!STMT_VINFO_VECTORIZABLE (stmtinfo_a)
	  || STMT_VINFO_GATHER_SCATTER_P (stmtinfo_a))
	{
	  ++i;
	  continue;
	}
      for (i = i + 1; i < datarefs_copy.length (); ++i)
	{
	  data_reference_p drb = datarefs_copy[i];
	  dr_vec_info *dr_info_b = vinfo->lookup_dr (drb);
	  stmt_vec_info stmtinfo_b = dr_info_b->stmt;
	  if (!STMT_VINFO_VECTORIZABLE (stmtinfo_b)
	      || STMT_VINFO_GATHER_SCATTER_P (stmtinfo_b))
	    break;

	  /* ??? Imperfect sorting (non-compatible types, non-modulo
	     accesses, same accesses) can lead to a group to be artificially
	     split here as we don't just skip over those.  If it really
	     matters we can push those to a worklist and re-iterate
	     over them.  The we can just skip ahead to the next DR here.  */

	  /* DRs in a different loop should not be put into the same
	     interleaving group.  */
	  if (gimple_bb (DR_STMT (dra))->loop_father
	      != gimple_bb (DR_STMT (drb))->loop_father)
	    break;

	  /* Check that the data-refs have same first location (except init)
	     and they are both either store or load (not load and store,
	     not masked loads or stores).  */
	  if (DR_IS_READ (dra) != DR_IS_READ (drb)
	      || data_ref_compare_tree (DR_BASE_ADDRESS (dra),
					DR_BASE_ADDRESS (drb)) != 0
	      || data_ref_compare_tree (DR_OFFSET (dra), DR_OFFSET (drb)) != 0
	      || !can_group_stmts_p (stmtinfo_a, stmtinfo_b, true))
	    break;

	  /* Check that the data-refs have the same constant size.  */
	  tree sza = TYPE_SIZE_UNIT (TREE_TYPE (DR_REF (dra)));
	  tree szb = TYPE_SIZE_UNIT (TREE_TYPE (DR_REF (drb)));
	  if (!tree_fits_uhwi_p (sza)
	      || !tree_fits_uhwi_p (szb)
	      || !tree_int_cst_equal (sza, szb))
	    break;

	  /* Check that the data-refs have the same step.  */
	  if (data_ref_compare_tree (DR_STEP (dra), DR_STEP (drb)) != 0)
	    break;

	  /* Check the types are compatible.
	     ??? We don't distinguish this during sorting.  */
	  if (!types_compatible_p (TREE_TYPE (DR_REF (dra)),
				   TREE_TYPE (DR_REF (drb))))
	    break;

	  /* Check that the DR_INITs are compile-time constants.  */
	  if (TREE_CODE (DR_INIT (dra)) != INTEGER_CST
	      || TREE_CODE (DR_INIT (drb)) != INTEGER_CST)
	    break;

	  /* Different .GOMP_SIMD_LANE calls still give the same lane,
	     just hold extra information.  */
	  if (STMT_VINFO_SIMD_LANE_ACCESS_P (stmtinfo_a)
	      && STMT_VINFO_SIMD_LANE_ACCESS_P (stmtinfo_b)
	      && data_ref_compare_tree (DR_INIT (dra), DR_INIT (drb)) == 0)
	    break;

	  /* Sorting has ensured that DR_INIT (dra) <= DR_INIT (drb).  */
	  HOST_WIDE_INT init_a = TREE_INT_CST_LOW (DR_INIT (dra));
	  HOST_WIDE_INT init_b = TREE_INT_CST_LOW (DR_INIT (drb));
	  HOST_WIDE_INT init_prev
	    = TREE_INT_CST_LOW (DR_INIT (datarefs_copy[i-1]));
	  gcc_assert (init_a <= init_b
		      && init_a <= init_prev
		      && init_prev <= init_b);

	  /* Do not place the same access in the interleaving chain twice.  */
	  if (init_b == init_prev)
	    {
	      gcc_assert (gimple_uid (DR_STMT (datarefs_copy[i-1]))
			  < gimple_uid (DR_STMT (drb)));
	      /* Simply link in duplicates and fix up the chain below.  */
	    }
	  else
	    {
	      /* If init_b == init_a + the size of the type * k, we have an
		 interleaving, and DRA is accessed before DRB.  */
	      HOST_WIDE_INT type_size_a = tree_to_uhwi (sza);
	      if (type_size_a == 0
		  || (init_b - init_a) % type_size_a != 0)
		break;

	      /* If we have a store, the accesses are adjacent.  This splits
		 groups into chunks we support (we don't support vectorization
		 of stores with gaps).  */
	      if (!DR_IS_READ (dra) && init_b - init_prev != type_size_a)
		break;

	      /* If the step (if not zero or non-constant) is greater than the
		 difference between data-refs' inits this splits groups into
		 suitable sizes.  */
	      if (tree_fits_shwi_p (DR_STEP (dra)))
		{
		  HOST_WIDE_INT step = tree_to_shwi (DR_STEP (dra));
		  if (step != 0 && step <= (init_b - init_a))
		    break;
		}
	    }

	  if (dump_enabled_p ())
	    dump_printf_loc (MSG_NOTE, vect_location,
			     DR_IS_READ (dra)
			     ? "Detected interleaving load %T and %T\n"
			     : "Detected interleaving store %T and %T\n",
			     DR_REF (dra), DR_REF (drb));

	  /* Link the found element into the group list.  */
	  if (!DR_GROUP_FIRST_ELEMENT (stmtinfo_a))
	    {
	      DR_GROUP_FIRST_ELEMENT (stmtinfo_a) = stmtinfo_a;
	      lastinfo = stmtinfo_a;
	    }
	  DR_GROUP_FIRST_ELEMENT (stmtinfo_b) = stmtinfo_a;
	  DR_GROUP_NEXT_ELEMENT (lastinfo) = stmtinfo_b;
	  lastinfo = stmtinfo_b;

	  /* The group is SLP-only when the masks differ (second call with
	     allow_slp_p == false fails while the first one succeeded).  */
	  STMT_VINFO_SLP_VECT_ONLY (stmtinfo_a)
	    = !can_group_stmts_p (stmtinfo_a, stmtinfo_b, false);

	  if (dump_enabled_p () && STMT_VINFO_SLP_VECT_ONLY (stmtinfo_a))
	    dump_printf_loc (MSG_NOTE, vect_location,
			     "Load suitable for SLP vectorization only.\n");

	  if (init_b == init_prev
	      && !to_fixup.add (DR_GROUP_FIRST_ELEMENT (stmtinfo_a))
	      && dump_enabled_p ())
	    dump_printf_loc (MSG_NOTE, vect_location,
			     "Queuing group with duplicate access for fixup\n");
	}
    }

  /* Fixup groups with duplicate entries by splitting it.  */
  while (1)
    {
      hash_set<stmt_vec_info>::iterator it = to_fixup.begin ();
      if (!(it != to_fixup.end ()))
	break;
      stmt_vec_info grp = *it;
      to_fixup.remove (grp);

      /* Find the earliest duplicate group member.  */
      unsigned first_duplicate = -1u;
      stmt_vec_info next, g = grp;
      while ((next = DR_GROUP_NEXT_ELEMENT (g)))
	{
	  if (tree_int_cst_equal (DR_INIT (STMT_VINFO_DR_INFO (next)->dr),
				  DR_INIT (STMT_VINFO_DR_INFO (g)->dr))
	      && gimple_uid (STMT_VINFO_STMT (next)) < first_duplicate)
	    first_duplicate = gimple_uid (STMT_VINFO_STMT (next));
	  g = next;
	}
      if (first_duplicate == -1U)
	continue;

      /* Then move all stmts after the first duplicate to a new group.
	 Note this is a heuristic but one with the property that *it
	 is fixed up completely.  */
      g = grp;
      stmt_vec_info newgroup = NULL, ng = grp;
      while ((next = DR_GROUP_NEXT_ELEMENT (g)))
	{
	  if (gimple_uid (STMT_VINFO_STMT (next)) >= first_duplicate)
	    {
	      /* Unlink NEXT from the old chain and append it to the new
		 group's chain.  */
	      DR_GROUP_NEXT_ELEMENT (g) = DR_GROUP_NEXT_ELEMENT (next);
	      if (!newgroup)
		newgroup = next;
	      else
		DR_GROUP_NEXT_ELEMENT (ng) = next;
	      ng = next;
	      DR_GROUP_FIRST_ELEMENT (ng) = newgroup;
	    }
	  else
	    g = DR_GROUP_NEXT_ELEMENT (g);
	}
      DR_GROUP_NEXT_ELEMENT (ng) = NULL;

      /* Fixup the new group which still may contain duplicates.  */
      to_fixup.add (newgroup);
    }

  /* Finally analyze each DR's access pattern; failure is fatal for loop
     vectorization but only disables the statement for BB (SLP)
     vectorization.  */
  FOR_EACH_VEC_ELT (datarefs_copy, i, dr)
    {
      dr_vec_info *dr_info = vinfo->lookup_dr (dr);
      if (STMT_VINFO_VECTORIZABLE (dr_info->stmt)
	  && !vect_analyze_data_ref_access (vinfo, dr_info))
	{
	  if (dump_enabled_p ())
	    dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
			     "not vectorized: complicated access pattern.\n");

	  if (is_a <bb_vec_info> (vinfo))
	    {
	      /* Mark the statement as not vectorizable.  */
	      STMT_VINFO_VECTORIZABLE (dr_info->stmt) = false;
	      continue;
	    }
	  else
	    {
	      datarefs_copy.release ();
	      return opt_result::failure_at (dr_info->stmt->stmt,
					     "not vectorized:"
					     " complicated access pattern.\n");
	    }
	}
    }

  datarefs_copy.release ();
  return opt_result::success ();
}
3209 | ||
a05a89fa CH |
3210 | /* Function vect_vfa_segment_size. |
3211 | ||
a05a89fa | 3212 | Input: |
89fa689a | 3213 | DR_INFO: The data reference. |
a05a89fa CH |
3214 | LENGTH_FACTOR: segment length to consider. |
3215 | ||
a57776a1 RS |
3216 | Return a value suitable for the dr_with_seg_len::seg_len field. |
3217 | This is the "distance travelled" by the pointer from the first | |
3218 | iteration in the segment to the last. Note that it does not include | |
3219 | the size of the access; in effect it only describes the first byte. */ | |
a05a89fa CH |
3220 | |
3221 | static tree | |
89fa689a | 3222 | vect_vfa_segment_size (dr_vec_info *dr_info, tree length_factor) |
a05a89fa | 3223 | { |
a57776a1 RS |
3224 | length_factor = size_binop (MINUS_EXPR, |
3225 | fold_convert (sizetype, length_factor), | |
3226 | size_one_node); | |
89fa689a | 3227 | return size_binop (MULT_EXPR, fold_convert (sizetype, DR_STEP (dr_info->dr)), |
a57776a1 RS |
3228 | length_factor); |
3229 | } | |
a05a89fa | 3230 | |
/* Return a value that, when added to abs (vect_vfa_segment_size (DR_INFO)),
   gives the worst-case number of bytes covered by the segment.  */

static unsigned HOST_WIDE_INT
vect_vfa_access_size (vec_info *vinfo, dr_vec_info *dr_info)
{
  stmt_vec_info stmt_vinfo = dr_info->stmt;
  tree ref_type = TREE_TYPE (DR_REF (dr_info->dr));
  unsigned HOST_WIDE_INT ref_size = tree_to_uhwi (TYPE_SIZE_UNIT (ref_type));
  unsigned HOST_WIDE_INT access_size = ref_size;
  if (DR_GROUP_FIRST_ELEMENT (stmt_vinfo))
    {
      /* Only group leaders are expected here; the whole group's accesses
	 (minus the trailing gap) count towards the segment.  */
      gcc_assert (DR_GROUP_FIRST_ELEMENT (stmt_vinfo) == stmt_vinfo);
      access_size *= DR_GROUP_SIZE (stmt_vinfo) - DR_GROUP_GAP (stmt_vinfo);
    }
  if (STMT_VINFO_VEC_STMTS (stmt_vinfo).exists ()
      && (vect_supportable_dr_alignment (vinfo, dr_info, false)
	  == dr_explicit_realign_optimized))
    {
      /* We might access a full vector's worth.  */
      tree vectype = STMT_VINFO_VECTYPE (stmt_vinfo);
      access_size += tree_to_uhwi (TYPE_SIZE_UNIT (vectype)) - ref_size;
    }
  return access_size;
}
3256 | ||
89fa689a RS |
3257 | /* Get the minimum alignment for all the scalar accesses that DR_INFO |
3258 | describes. */ | |
a57776a1 RS |
3259 | |
3260 | static unsigned int | |
89fa689a | 3261 | vect_vfa_align (dr_vec_info *dr_info) |
a57776a1 | 3262 | { |
89fa689a | 3263 | return TYPE_ALIGN_UNIT (TREE_TYPE (DR_REF (dr_info->dr))); |
a05a89fa CH |
3264 | } |
3265 | ||
6fa3d4b4 BC |
3266 | /* Function vect_no_alias_p. |
3267 | ||
b064d4f9 RS |
3268 | Given data references A and B with equal base and offset, see whether |
3269 | the alias relation can be decided at compilation time. Return 1 if | |
3270 | it can and the references alias, 0 if it can and the references do | |
a57776a1 RS |
3271 | not alias, and -1 if we cannot decide at compile time. SEGMENT_LENGTH_A, |
3272 | SEGMENT_LENGTH_B, ACCESS_SIZE_A and ACCESS_SIZE_B are the equivalent | |
3273 | of dr_with_seg_len::{seg_len,access_size} for A and B. */ | |
6fa3d4b4 | 3274 | |
b064d4f9 | 3275 | static int |
89fa689a | 3276 | vect_compile_time_alias (dr_vec_info *a, dr_vec_info *b, |
a57776a1 RS |
3277 | tree segment_length_a, tree segment_length_b, |
3278 | unsigned HOST_WIDE_INT access_size_a, | |
3279 | unsigned HOST_WIDE_INT access_size_b) | |
6fa3d4b4 | 3280 | { |
89fa689a RS |
3281 | poly_offset_int offset_a = wi::to_poly_offset (DR_INIT (a->dr)); |
3282 | poly_offset_int offset_b = wi::to_poly_offset (DR_INIT (b->dr)); | |
b064d4f9 RS |
3283 | poly_uint64 const_length_a; |
3284 | poly_uint64 const_length_b; | |
6fa3d4b4 | 3285 | |
6fa3d4b4 BC |
3286 | /* For negative step, we need to adjust address range by TYPE_SIZE_UNIT |
3287 | bytes, e.g., int a[3] -> a[1] range is [a+4, a+16) instead of | |
3288 | [a, a+12) */ | |
89fa689a | 3289 | if (tree_int_cst_compare (DR_STEP (a->dr), size_zero_node) < 0) |
6fa3d4b4 | 3290 | { |
b064d4f9 | 3291 | const_length_a = (-wi::to_poly_wide (segment_length_a)).force_uhwi (); |
f91aa3e6 | 3292 | offset_a -= const_length_a; |
6fa3d4b4 | 3293 | } |
b064d4f9 RS |
3294 | else |
3295 | const_length_a = tree_to_poly_uint64 (segment_length_a); | |
89fa689a | 3296 | if (tree_int_cst_compare (DR_STEP (b->dr), size_zero_node) < 0) |
6fa3d4b4 | 3297 | { |
b064d4f9 | 3298 | const_length_b = (-wi::to_poly_wide (segment_length_b)).force_uhwi (); |
f91aa3e6 | 3299 | offset_b -= const_length_b; |
6fa3d4b4 | 3300 | } |
b064d4f9 RS |
3301 | else |
3302 | const_length_b = tree_to_poly_uint64 (segment_length_b); | |
6fa3d4b4 | 3303 | |
a57776a1 RS |
3304 | const_length_a += access_size_a; |
3305 | const_length_b += access_size_b; | |
3306 | ||
b064d4f9 RS |
3307 | if (ranges_known_overlap_p (offset_a, const_length_a, |
3308 | offset_b, const_length_b)) | |
3309 | return 1; | |
6fa3d4b4 | 3310 | |
b064d4f9 RS |
3311 | if (!ranges_maybe_overlap_p (offset_a, const_length_a, |
3312 | offset_b, const_length_b)) | |
3313 | return 0; | |
3314 | ||
3315 | return -1; | |
6fa3d4b4 BC |
3316 | } |
3317 | ||
dfbddbeb RS |
/* Return true if the minimum nonzero dependence distance for loop LOOP_DEPTH
   in DDR is >= VF.  If so, the dependence cannot be violated by any
   vectorization factor up to VF and the runtime alias check for this
   pair can be dropped.  */

static bool
dependence_distance_ge_vf (data_dependence_relation *ddr,
			   unsigned int loop_depth, poly_uint64 vf)
{
  /* Only usable when the dependence is "maybe" and distance vectors
     were computed.  */
  if (DDR_ARE_DEPENDENT (ddr) != NULL_TREE
      || DDR_NUM_DIST_VECTS (ddr) == 0)
    return false;

  /* If the dependence is exact, we should have limited the VF instead.  */
  gcc_checking_assert (DDR_COULD_BE_INDEPENDENT_P (ddr));

  unsigned int i;
  lambda_vector dist_v;
  FOR_EACH_VEC_ELT (DDR_DIST_VECTS (ddr), i, dist_v)
    {
      HOST_WIDE_INT dist = dist_v[loop_depth];
      /* A zero distance is harmless; a positive distance on a reversed
	 DDR likewise; any other distance must be at least VF.  */
      if (dist != 0
	  && !(dist > 0 && DDR_REVERSED_P (ddr))
	  && maybe_lt ((unsigned HOST_WIDE_INT) abs_hwi (dist), vf))
	return false;
    }

  if (dump_enabled_p ())
    dump_printf_loc (MSG_NOTE, vect_location,
		     "dependence distance between %T and %T is >= VF\n",
		     DR_REF (DDR_A (ddr)), DR_REF (DDR_B (ddr)));

  return true;
}
3350 | ||
a57776a1 RS |
3351 | /* Dump LOWER_BOUND using flags DUMP_KIND. Dumps are known to be enabled. */ |
3352 | ||
3353 | static void | |
4af78ef8 | 3354 | dump_lower_bound (dump_flags_t dump_kind, const vec_lower_bound &lower_bound) |
a57776a1 | 3355 | { |
3c2a8ed0 DM |
3356 | dump_printf (dump_kind, "%s (%T) >= ", |
3357 | lower_bound.unsigned_p ? "unsigned" : "abs", | |
3358 | lower_bound.expr); | |
a57776a1 RS |
3359 | dump_dec (dump_kind, lower_bound.min_value); |
3360 | } | |
3361 | ||
/* Record that the vectorized loop requires the vec_lower_bound described
   by EXPR, UNSIGNED_P and MIN_VALUE.  If an entry for EXPR already
   exists, merge with it (weakening unsignedness, taking the larger
   minimum) instead of adding a duplicate.  */

static void
vect_check_lower_bound (loop_vec_info loop_vinfo, tree expr, bool unsigned_p,
			poly_uint64 min_value)
{
  /* NOTE: this is a shallow vec copy — element updates below appear to be
     intended to modify LOOP_VINFO's own storage; verify against vec<>
     semantics.  */
  vec<vec_lower_bound> lower_bounds = LOOP_VINFO_LOWER_BOUNDS (loop_vinfo);
  for (unsigned int i = 0; i < lower_bounds.length (); ++i)
    if (operand_equal_p (lower_bounds[i].expr, expr, 0))
      {
	/* The merged bound must hold for both requests: it is only
	   unsigned if both were, and uses the larger minimum.  */
	unsigned_p &= lower_bounds[i].unsigned_p;
	min_value = upper_bound (lower_bounds[i].min_value, min_value);
	if (lower_bounds[i].unsigned_p != unsigned_p
	    || maybe_lt (lower_bounds[i].min_value, min_value))
	  {
	    lower_bounds[i].unsigned_p = unsigned_p;
	    lower_bounds[i].min_value = min_value;
	    if (dump_enabled_p ())
	      {
		dump_printf_loc (MSG_NOTE, vect_location,
				 "updating run-time check to ");
		dump_lower_bound (MSG_NOTE, lower_bounds[i]);
		dump_printf (MSG_NOTE, "\n");
	      }
	  }
	return;
      }

  /* No existing entry for EXPR: record a fresh one.  */
  vec_lower_bound lower_bound (expr, unsigned_p, min_value);
  if (dump_enabled_p ())
    {
      dump_printf_loc (MSG_NOTE, vect_location, "need a run-time check that ");
      dump_lower_bound (MSG_NOTE, lower_bound);
      dump_printf (MSG_NOTE, "\n");
    }
  LOOP_VINFO_LOWER_BOUNDS (loop_vinfo).safe_push (lower_bound);
}
3400 | ||
89fa689a | 3401 | /* Return true if it's unlikely that the step of the vectorized form of DR_INFO |
a57776a1 RS |
3402 | will span fewer than GAP bytes. */ |
3403 | ||
3404 | static bool | |
89fa689a RS |
3405 | vect_small_gap_p (loop_vec_info loop_vinfo, dr_vec_info *dr_info, |
3406 | poly_int64 gap) | |
a57776a1 | 3407 | { |
89fa689a | 3408 | stmt_vec_info stmt_info = dr_info->stmt; |
a57776a1 RS |
3409 | HOST_WIDE_INT count |
3410 | = estimated_poly_value (LOOP_VINFO_VECT_FACTOR (loop_vinfo)); | |
2c53b149 | 3411 | if (DR_GROUP_FIRST_ELEMENT (stmt_info)) |
bffb8014 | 3412 | count *= DR_GROUP_SIZE (DR_GROUP_FIRST_ELEMENT (stmt_info)); |
89fa689a RS |
3413 | return (estimated_poly_value (gap) |
3414 | <= count * vect_get_scalar_dr_size (dr_info)); | |
a57776a1 RS |
3415 | } |
3416 | ||
89fa689a RS |
/* Return true if we know that there is no alias between DR_INFO_A and
   DR_INFO_B when abs (DR_STEP (DR_INFO_A->dr)) >= N for some N.
   When returning true, set *LOWER_BOUND_OUT to this N.  */

static bool
vectorizable_with_step_bound_p (dr_vec_info *dr_info_a, dr_vec_info *dr_info_b,
				poly_uint64 *lower_bound_out)
{
  /* Check that there is a constant gap of known sign between DR_A
     and DR_B: same base, offset and step, and comparable constant
     DR_INITs.  */
  data_reference *dr_a = dr_info_a->dr;
  data_reference *dr_b = dr_info_b->dr;
  poly_int64 init_a, init_b;
  if (!operand_equal_p (DR_BASE_ADDRESS (dr_a), DR_BASE_ADDRESS (dr_b), 0)
      || !operand_equal_p (DR_OFFSET (dr_a), DR_OFFSET (dr_b), 0)
      || !operand_equal_p (DR_STEP (dr_a), DR_STEP (dr_b), 0)
      || !poly_int_tree_p (DR_INIT (dr_a), &init_a)
      || !poly_int_tree_p (DR_INIT (dr_b), &init_b)
      || !ordered_p (init_a, init_b))
    return false;

  /* Sort DR_A and DR_B by the address they access.  (Locals only;
     the caller's dr_infos are unaffected.)  */
  if (maybe_lt (init_b, init_a))
    {
      std::swap (init_a, init_b);
      std::swap (dr_info_a, dr_info_b);
      std::swap (dr_a, dr_b);
    }

  /* If the two accesses could be dependent within a scalar iteration,
     make sure that we'd retain their order.  */
  if (maybe_gt (init_a + vect_get_scalar_dr_size (dr_info_a), init_b)
      && !vect_preserves_scalar_order_p (dr_info_a, dr_info_b))
    return false;

  /* There is no alias if abs (DR_STEP) is greater than or equal to
     the bytes spanned by the combination of the two accesses.  */
  *lower_bound_out = init_b + vect_get_scalar_dr_size (dr_info_b) - init_a;
  return true;
}
3457 | ||
ebfd146a IR |
3458 | /* Function vect_prune_runtime_alias_test_list. |
3459 | ||
3460 | Prune a list of ddrs to be tested at run-time by versioning for alias. | |
a05a89fa | 3461 | Merge several alias checks into one if possible. |
ebfd146a IR |
3462 | Return FALSE if resulting list of ddrs is longer then allowed by |
3463 | PARAM_VECT_MAX_VERSION_FOR_ALIAS_CHECKS, otherwise return TRUE. */ | |
3464 | ||
f4ebbd24 | 3465 | opt_result |
ebfd146a IR |
3466 | vect_prune_runtime_alias_test_list (loop_vec_info loop_vinfo) |
3467 | { | |
9adee305 RS |
3468 | typedef pair_hash <tree_operand_hash, tree_operand_hash> tree_pair_hash; |
3469 | hash_set <tree_pair_hash> compared_objects; | |
3470 | ||
3471 | vec<ddr_p> may_alias_ddrs = LOOP_VINFO_MAY_ALIAS_DDRS (loop_vinfo); | |
3472 | vec<dr_with_seg_len_pair_t> &comp_alias_ddrs | |
3473 | = LOOP_VINFO_COMP_ALIAS_DDRS (loop_vinfo); | |
3474 | vec<vec_object_pair> &check_unequal_addrs | |
3475 | = LOOP_VINFO_CHECK_UNEQUAL_ADDRS (loop_vinfo); | |
d9f21f6a | 3476 | poly_uint64 vect_factor = LOOP_VINFO_VECT_FACTOR (loop_vinfo); |
a05a89fa CH |
3477 | tree scalar_loop_iters = LOOP_VINFO_NITERS (loop_vinfo); |
3478 | ||
3479 | ddr_p ddr; | |
3480 | unsigned int i; | |
3481 | tree length_factor; | |
ebfd146a | 3482 | |
adac3a68 | 3483 | DUMP_VECT_SCOPE ("vect_prune_runtime_alias_test_list"); |
ebfd146a | 3484 | |
a57776a1 RS |
3485 | /* Step values are irrelevant for aliasing if the number of vector |
3486 | iterations is equal to the number of scalar iterations (which can | |
3487 | happen for fully-SLP loops). */ | |
3488 | bool ignore_step_p = known_eq (LOOP_VINFO_VECT_FACTOR (loop_vinfo), 1U); | |
3489 | ||
3490 | if (!ignore_step_p) | |
3491 | { | |
3492 | /* Convert the checks for nonzero steps into bound tests. */ | |
3493 | tree value; | |
3494 | FOR_EACH_VEC_ELT (LOOP_VINFO_CHECK_NONZERO (loop_vinfo), i, value) | |
3495 | vect_check_lower_bound (loop_vinfo, value, true, 1); | |
3496 | } | |
3497 | ||
a05a89fa | 3498 | if (may_alias_ddrs.is_empty ()) |
f4ebbd24 | 3499 | return opt_result::success (); |
a05a89fa | 3500 | |
a05a89fa CH |
3501 | comp_alias_ddrs.create (may_alias_ddrs.length ()); |
3502 | ||
dfbddbeb RS |
3503 | unsigned int loop_depth |
3504 | = index_in_loop_nest (LOOP_VINFO_LOOP (loop_vinfo)->num, | |
3505 | LOOP_VINFO_LOOP_NEST (loop_vinfo)); | |
3506 | ||
a05a89fa CH |
3507 | /* First, we collect all data ref pairs for aliasing checks. */ |
3508 | FOR_EACH_VEC_ELT (may_alias_ddrs, i, ddr) | |
ebfd146a | 3509 | { |
a57776a1 | 3510 | poly_uint64 lower_bound; |
a05a89fa | 3511 | tree segment_length_a, segment_length_b; |
a57776a1 RS |
3512 | unsigned HOST_WIDE_INT access_size_a, access_size_b; |
3513 | unsigned int align_a, align_b; | |
a05a89fa | 3514 | |
dfbddbeb RS |
3515 | /* Ignore the alias if the VF we chose ended up being no greater |
3516 | than the dependence distance. */ | |
3517 | if (dependence_distance_ge_vf (ddr, loop_depth, vect_factor)) | |
3518 | continue; | |
3519 | ||
9adee305 RS |
3520 | if (DDR_OBJECT_A (ddr)) |
3521 | { | |
3522 | vec_object_pair new_pair (DDR_OBJECT_A (ddr), DDR_OBJECT_B (ddr)); | |
3523 | if (!compared_objects.add (new_pair)) | |
3524 | { | |
3525 | if (dump_enabled_p ()) | |
3c2a8ed0 DM |
3526 | dump_printf_loc (MSG_NOTE, vect_location, |
3527 | "checking that %T and %T" | |
3528 | " have different addresses\n", | |
3529 | new_pair.first, new_pair.second); | |
9adee305 RS |
3530 | LOOP_VINFO_CHECK_UNEQUAL_ADDRS (loop_vinfo).safe_push (new_pair); |
3531 | } | |
3532 | continue; | |
3533 | } | |
3534 | ||
f5ae2856 | 3535 | dr_vec_info *dr_info_a = loop_vinfo->lookup_dr (DDR_A (ddr)); |
89fa689a | 3536 | stmt_vec_info stmt_info_a = dr_info_a->stmt; |
a57776a1 | 3537 | |
f5ae2856 | 3538 | dr_vec_info *dr_info_b = loop_vinfo->lookup_dr (DDR_B (ddr)); |
89fa689a | 3539 | stmt_vec_info stmt_info_b = dr_info_b->stmt; |
a57776a1 | 3540 | |
e9acf80c RS |
3541 | bool preserves_scalar_order_p |
3542 | = vect_preserves_scalar_order_p (dr_info_a, dr_info_b); | |
3543 | ||
a57776a1 RS |
3544 | /* Skip the pair if inter-iteration dependencies are irrelevant |
3545 | and intra-iteration dependencies are guaranteed to be honored. */ | |
3546 | if (ignore_step_p | |
e9acf80c | 3547 | && (preserves_scalar_order_p |
89fa689a RS |
3548 | || vectorizable_with_step_bound_p (dr_info_a, dr_info_b, |
3549 | &lower_bound))) | |
a57776a1 RS |
3550 | { |
3551 | if (dump_enabled_p ()) | |
3c2a8ed0 DM |
3552 | dump_printf_loc (MSG_NOTE, vect_location, |
3553 | "no need for alias check between " | |
3554 | "%T and %T when VF is 1\n", | |
3555 | DR_REF (dr_info_a->dr), DR_REF (dr_info_b->dr)); | |
a57776a1 RS |
3556 | continue; |
3557 | } | |
3558 | ||
3559 | /* See whether we can handle the alias using a bounds check on | |
3560 | the step, and whether that's likely to be the best approach. | |
3561 | (It might not be, for example, if the minimum step is much larger | |
3562 | than the number of bytes handled by one vector iteration.) */ | |
3563 | if (!ignore_step_p | |
89fa689a RS |
3564 | && TREE_CODE (DR_STEP (dr_info_a->dr)) != INTEGER_CST |
3565 | && vectorizable_with_step_bound_p (dr_info_a, dr_info_b, | |
3566 | &lower_bound) | |
3567 | && (vect_small_gap_p (loop_vinfo, dr_info_a, lower_bound) | |
3568 | || vect_small_gap_p (loop_vinfo, dr_info_b, lower_bound))) | |
a57776a1 | 3569 | { |
89fa689a | 3570 | bool unsigned_p = dr_known_forward_stride_p (dr_info_a->dr); |
a57776a1 RS |
3571 | if (dump_enabled_p ()) |
3572 | { | |
3c2a8ed0 DM |
3573 | dump_printf_loc (MSG_NOTE, vect_location, "no alias between " |
3574 | "%T and %T when the step %T is outside ", | |
3575 | DR_REF (dr_info_a->dr), | |
3576 | DR_REF (dr_info_b->dr), | |
3577 | DR_STEP (dr_info_a->dr)); | |
a57776a1 RS |
3578 | if (unsigned_p) |
3579 | dump_printf (MSG_NOTE, "[0"); | |
3580 | else | |
3581 | { | |
3582 | dump_printf (MSG_NOTE, "("); | |
3583 | dump_dec (MSG_NOTE, poly_int64 (-lower_bound)); | |
3584 | } | |
3585 | dump_printf (MSG_NOTE, ", "); | |
3586 | dump_dec (MSG_NOTE, lower_bound); | |
3587 | dump_printf (MSG_NOTE, ")\n"); | |
3588 | } | |
89fa689a RS |
3589 | vect_check_lower_bound (loop_vinfo, DR_STEP (dr_info_a->dr), |
3590 | unsigned_p, lower_bound); | |
a57776a1 RS |
3591 | continue; |
3592 | } | |
3593 | ||
bffb8014 | 3594 | stmt_vec_info dr_group_first_a = DR_GROUP_FIRST_ELEMENT (stmt_info_a); |
a05a89fa CH |
3595 | if (dr_group_first_a) |
3596 | { | |
bffb8014 | 3597 | stmt_info_a = dr_group_first_a; |
89fa689a | 3598 | dr_info_a = STMT_VINFO_DR_INFO (stmt_info_a); |
a05a89fa | 3599 | } |
ebfd146a | 3600 | |
bffb8014 | 3601 | stmt_vec_info dr_group_first_b = DR_GROUP_FIRST_ELEMENT (stmt_info_b); |
a05a89fa CH |
3602 | if (dr_group_first_b) |
3603 | { | |
bffb8014 | 3604 | stmt_info_b = dr_group_first_b; |
89fa689a | 3605 | dr_info_b = STMT_VINFO_DR_INFO (stmt_info_b); |
a05a89fa | 3606 | } |
ebfd146a | 3607 | |
a57776a1 RS |
3608 | if (ignore_step_p) |
3609 | { | |
3610 | segment_length_a = size_zero_node; | |
3611 | segment_length_b = size_zero_node; | |
3612 | } | |
a05a89fa | 3613 | else |
a57776a1 | 3614 | { |
89fa689a RS |
3615 | if (!operand_equal_p (DR_STEP (dr_info_a->dr), |
3616 | DR_STEP (dr_info_b->dr), 0)) | |
a57776a1 RS |
3617 | length_factor = scalar_loop_iters; |
3618 | else | |
3619 | length_factor = size_int (vect_factor); | |
89fa689a RS |
3620 | segment_length_a = vect_vfa_segment_size (dr_info_a, length_factor); |
3621 | segment_length_b = vect_vfa_segment_size (dr_info_b, length_factor); | |
a57776a1 | 3622 | } |
308bc496 RB |
3623 | access_size_a = vect_vfa_access_size (loop_vinfo, dr_info_a); |
3624 | access_size_b = vect_vfa_access_size (loop_vinfo, dr_info_b); | |
89fa689a RS |
3625 | align_a = vect_vfa_align (dr_info_a); |
3626 | align_b = vect_vfa_align (dr_info_b); | |
a05a89fa | 3627 | |
b064d4f9 | 3628 | /* See whether the alias is known at compilation time. */ |
1fb2b0f6 RS |
3629 | if (operand_equal_p (DR_BASE_ADDRESS (dr_info_a->dr), |
3630 | DR_BASE_ADDRESS (dr_info_b->dr), 0) | |
3631 | && operand_equal_p (DR_OFFSET (dr_info_a->dr), | |
3632 | DR_OFFSET (dr_info_b->dr), 0) | |
89fa689a RS |
3633 | && TREE_CODE (DR_STEP (dr_info_a->dr)) == INTEGER_CST |
3634 | && TREE_CODE (DR_STEP (dr_info_b->dr)) == INTEGER_CST | |
b064d4f9 RS |
3635 | && poly_int_tree_p (segment_length_a) |
3636 | && poly_int_tree_p (segment_length_b)) | |
6fa3d4b4 | 3637 | { |
89fa689a | 3638 | int res = vect_compile_time_alias (dr_info_a, dr_info_b, |
b064d4f9 | 3639 | segment_length_a, |
a57776a1 RS |
3640 | segment_length_b, |
3641 | access_size_a, | |
3642 | access_size_b); | |
3643 | if (res >= 0 && dump_enabled_p ()) | |
3644 | { | |
3645 | dump_printf_loc (MSG_NOTE, vect_location, | |
3c2a8ed0 DM |
3646 | "can tell at compile time that %T and %T", |
3647 | DR_REF (dr_info_a->dr), DR_REF (dr_info_b->dr)); | |
a57776a1 RS |
3648 | if (res == 0) |
3649 | dump_printf (MSG_NOTE, " do not alias\n"); | |
3650 | else | |
3651 | dump_printf (MSG_NOTE, " alias\n"); | |
3652 | } | |
3653 | ||
b064d4f9 | 3654 | if (res == 0) |
6fa3d4b4 BC |
3655 | continue; |
3656 | ||
b064d4f9 | 3657 | if (res == 1) |
f4ebbd24 DM |
3658 | return opt_result::failure_at (stmt_info_b->stmt, |
3659 | "not vectorized:" | |
3660 | " compilation time alias: %G%G", | |
3661 | stmt_info_a->stmt, | |
3662 | stmt_info_b->stmt); | |
6fa3d4b4 BC |
3663 | } |
3664 | ||
e9acf80c RS |
3665 | dr_with_seg_len dr_a (dr_info_a->dr, segment_length_a, |
3666 | access_size_a, align_a); | |
3667 | dr_with_seg_len dr_b (dr_info_b->dr, segment_length_b, | |
3668 | access_size_b, align_b); | |
3669 | /* Canonicalize the order to be the one that's needed for accurate | |
3670 | RAW, WAR and WAW flags, in cases where the data references are | |
3671 | well-ordered. The order doesn't really matter otherwise, | |
3672 | but we might as well be consistent. */ | |
3673 | if (get_later_stmt (stmt_info_a, stmt_info_b) == stmt_info_a) | |
3674 | std::swap (dr_a, dr_b); | |
3675 | ||
93bdc3ed | 3676 | dr_with_seg_len_pair_t dr_with_seg_len_pair |
e9acf80c RS |
3677 | (dr_a, dr_b, (preserves_scalar_order_p |
3678 | ? dr_with_seg_len_pair_t::WELL_ORDERED | |
3679 | : dr_with_seg_len_pair_t::REORDERED)); | |
93bdc3ed | 3680 | |
a05a89fa CH |
3681 | comp_alias_ddrs.safe_push (dr_with_seg_len_pair); |
3682 | } | |
3683 | ||
d9f21f6a | 3684 | prune_runtime_alias_test_list (&comp_alias_ddrs, vect_factor); |
9adee305 RS |
3685 | |
3686 | unsigned int count = (comp_alias_ddrs.length () | |
3687 | + check_unequal_addrs.length ()); | |
a57776a1 | 3688 | |
bbeeac91 DM |
3689 | if (dump_enabled_p ()) |
3690 | dump_printf_loc (MSG_NOTE, vect_location, | |
3691 | "improved number of alias checks from %d to %d\n", | |
3692 | may_alias_ddrs.length (), count); | |
028d4092 | 3693 | unsigned limit = param_vect_max_version_for_alias_checks; |
247afa98 | 3694 | if (flag_simd_cost_model == VECT_COST_MODEL_CHEAP) |
028d4092 | 3695 | limit = param_vect_max_version_for_alias_checks * 6 / 10; |
247afa98 | 3696 | if (count > limit) |
f4ebbd24 DM |
3697 | return opt_result::failure_at |
3698 | (vect_location, | |
247afa98 RB |
3699 | "number of versioning for alias run-time tests exceeds %d " |
3700 | "(--param vect-max-version-for-alias-checks)\n", limit); | |
f4ebbd24 DM |
3701 | |
3702 | return opt_result::success (); | |
ebfd146a IR |
3703 | } |
3704 | ||
bfaa08b7 RS |
3705 | /* Check whether we can use an internal function for a gather load |
3706 | or scatter store. READ_P is true for loads and false for stores. | |
3707 | MASKED_P is true if the load or store is conditional. MEMORY_TYPE is | |
09eb042a RS |
3708 | the type of the memory elements being loaded or stored. OFFSET_TYPE |
3709 | is the type of the offset that is being applied to the invariant | |
3710 | base address. SCALE is the amount by which the offset should | |
bfaa08b7 RS |
3711 | be multiplied *after* it has been converted to address width. |
3712 | ||
09eb042a RS |
3713 | Return true if the function is supported, storing the function id in |
3714 | *IFN_OUT and the vector type for the offset in *OFFSET_VECTYPE_OUT. */ | |
bfaa08b7 | 3715 | |
429ef523 | 3716 | bool |
09eb042a RS |
3717 | vect_gather_scatter_fn_p (vec_info *vinfo, bool read_p, bool masked_p, |
3718 | tree vectype, tree memory_type, tree offset_type, | |
3719 | int scale, internal_fn *ifn_out, | |
3720 | tree *offset_vectype_out) | |
bfaa08b7 RS |
3721 | { |
3722 | unsigned int memory_bits = tree_to_uhwi (TYPE_SIZE (memory_type)); | |
d17a896d | 3723 | unsigned int element_bits = vector_element_bits (vectype); |
bfaa08b7 RS |
3724 | if (element_bits != memory_bits) |
3725 | /* For now the vector elements must be the same width as the | |
3726 | memory elements. */ | |
3727 | return false; | |
3728 | ||
3729 | /* Work out which function we need. */ | |
3730 | internal_fn ifn; | |
3731 | if (read_p) | |
3732 | ifn = masked_p ? IFN_MASK_GATHER_LOAD : IFN_GATHER_LOAD; | |
3733 | else | |
f307441a | 3734 | ifn = masked_p ? IFN_MASK_SCATTER_STORE : IFN_SCATTER_STORE; |
bfaa08b7 | 3735 | |
09eb042a RS |
3736 | for (;;) |
3737 | { | |
3738 | tree offset_vectype = get_vectype_for_scalar_type (vinfo, offset_type); | |
3739 | if (!offset_vectype) | |
3740 | return false; | |
bfaa08b7 | 3741 | |
09eb042a RS |
3742 | /* Test whether the target supports this combination. */ |
3743 | if (internal_gather_scatter_fn_supported_p (ifn, vectype, memory_type, | |
3744 | offset_vectype, scale)) | |
3745 | { | |
3746 | *ifn_out = ifn; | |
3747 | *offset_vectype_out = offset_vectype; | |
3748 | return true; | |
3749 | } | |
3750 | ||
3751 | if (TYPE_PRECISION (offset_type) >= POINTER_SIZE | |
3752 | && TYPE_PRECISION (offset_type) >= element_bits) | |
3753 | return false; | |
3754 | ||
3755 | offset_type = build_nonstandard_integer_type | |
3756 | (TYPE_PRECISION (offset_type) * 2, TYPE_UNSIGNED (offset_type)); | |
3757 | } | |
bfaa08b7 RS |
3758 | } |
3759 | ||
82570274 | 3760 | /* STMT_INFO is a call to an internal gather load or scatter store function. |
bfaa08b7 RS |
3761 | Describe the operation in INFO. */ |
3762 | ||
3763 | static void | |
82570274 RS |
3764 | vect_describe_gather_scatter_call (stmt_vec_info stmt_info, |
3765 | gather_scatter_info *info) | |
bfaa08b7 | 3766 | { |
82570274 | 3767 | gcall *call = as_a <gcall *> (stmt_info->stmt); |
bfaa08b7 RS |
3768 | tree vectype = STMT_VINFO_VECTYPE (stmt_info); |
3769 | data_reference *dr = STMT_VINFO_DATA_REF (stmt_info); | |
3770 | ||
3771 | info->ifn = gimple_call_internal_fn (call); | |
3772 | info->decl = NULL_TREE; | |
3773 | info->base = gimple_call_arg (call, 0); | |
3774 | info->offset = gimple_call_arg (call, 1); | |
3775 | info->offset_dt = vect_unknown_def_type; | |
3776 | info->offset_vectype = NULL_TREE; | |
3777 | info->scale = TREE_INT_CST_LOW (gimple_call_arg (call, 2)); | |
3778 | info->element_type = TREE_TYPE (vectype); | |
3779 | info->memory_type = TREE_TYPE (DR_REF (dr)); | |
3780 | } | |
3781 | ||
32e8e429 | 3782 | /* Return true if a non-affine read or write in STMT_INFO is suitable for a |
134c85ca | 3783 | gather load or scatter store. Describe the operation in *INFO if so. */ |
aec7ae7d | 3784 | |
134c85ca | 3785 | bool |
32e8e429 | 3786 | vect_check_gather_scatter (stmt_vec_info stmt_info, loop_vec_info loop_vinfo, |
134c85ca | 3787 | gather_scatter_info *info) |
aec7ae7d | 3788 | { |
f37fac2b RS |
3789 | HOST_WIDE_INT scale = 1; |
3790 | poly_int64 pbitpos, pbitsize; | |
99b1c316 | 3791 | class loop *loop = LOOP_VINFO_LOOP (loop_vinfo); |
aec7ae7d JJ |
3792 | struct data_reference *dr = STMT_VINFO_DATA_REF (stmt_info); |
3793 | tree offtype = NULL_TREE; | |
bfaa08b7 RS |
3794 | tree decl = NULL_TREE, base, off; |
3795 | tree vectype = STMT_VINFO_VECTYPE (stmt_info); | |
3796 | tree memory_type = TREE_TYPE (DR_REF (dr)); | |
ef4bddc2 | 3797 | machine_mode pmode; |
ee45a32d | 3798 | int punsignedp, reversep, pvolatilep = 0; |
bfaa08b7 | 3799 | internal_fn ifn; |
09eb042a | 3800 | tree offset_vectype; |
bfaa08b7 RS |
3801 | bool masked_p = false; |
3802 | ||
3803 | /* See whether this is already a call to a gather/scatter internal function. | |
3804 | If not, see whether it's a masked load or store. */ | |
86a91c0a | 3805 | gcall *call = dyn_cast <gcall *> (stmt_info->stmt); |
bfaa08b7 RS |
3806 | if (call && gimple_call_internal_p (call)) |
3807 | { | |
beb456c3 | 3808 | ifn = gimple_call_internal_fn (call); |
bfaa08b7 RS |
3809 | if (internal_gather_scatter_fn_p (ifn)) |
3810 | { | |
82570274 | 3811 | vect_describe_gather_scatter_call (stmt_info, info); |
bfaa08b7 RS |
3812 | return true; |
3813 | } | |
3814 | masked_p = (ifn == IFN_MASK_LOAD || ifn == IFN_MASK_STORE); | |
3815 | } | |
3816 | ||
3817 | /* True if we should aim to use internal functions rather than | |
3818 | built-in functions. */ | |
3819 | bool use_ifn_p = (DR_IS_READ (dr) | |
f307441a RS |
3820 | ? supports_vec_gather_load_p () |
3821 | : supports_vec_scatter_store_p ()); | |
aec7ae7d | 3822 | |
5ce9450f JJ |
3823 | base = DR_REF (dr); |
3824 | /* For masked loads/stores, DR_REF (dr) is an artificial MEM_REF, | |
3825 | see if we can use the def stmt of the address. */ | |
bfaa08b7 | 3826 | if (masked_p |
5ce9450f JJ |
3827 | && TREE_CODE (base) == MEM_REF |
3828 | && TREE_CODE (TREE_OPERAND (base, 0)) == SSA_NAME | |
3829 | && integer_zerop (TREE_OPERAND (base, 1)) | |
3830 | && !expr_invariant_in_loop_p (loop, TREE_OPERAND (base, 0))) | |
3831 | { | |
355fe088 | 3832 | gimple *def_stmt = SSA_NAME_DEF_STMT (TREE_OPERAND (base, 0)); |
5ce9450f JJ |
3833 | if (is_gimple_assign (def_stmt) |
3834 | && gimple_assign_rhs_code (def_stmt) == ADDR_EXPR) | |
3835 | base = TREE_OPERAND (gimple_assign_rhs1 (def_stmt), 0); | |
3836 | } | |
3837 | ||
3bab6342 | 3838 | /* The gather and scatter builtins need address of the form |
aec7ae7d JJ |
3839 | loop_invariant + vector * {1, 2, 4, 8} |
3840 | or | |
3841 | loop_invariant + sign_extend (vector) * { 1, 2, 4, 8 }. | |
3842 | Unfortunately DR_BASE_ADDRESS/DR_OFFSET can be a mixture | |
3843 | of loop invariants/SSA_NAMEs defined in the loop, with casts, | |
3844 | multiplications and additions in it. To get a vector, we need | |
3845 | a single SSA_NAME that will be defined in the loop and will | |
3846 | contain everything that is not loop invariant and that can be | |
3847 | vectorized. The following code attempts to find such a preexistng | |
3848 | SSA_NAME OFF and put the loop invariants into a tree BASE | |
3849 | that can be gimplified before the loop. */ | |
ee45a32d | 3850 | base = get_inner_reference (base, &pbitsize, &pbitpos, &off, &pmode, |
25b75a48 | 3851 | &punsignedp, &reversep, &pvolatilep); |
8c963290 RB |
3852 | if (reversep) |
3853 | return false; | |
3854 | ||
f37fac2b | 3855 | poly_int64 pbytepos = exact_div (pbitpos, BITS_PER_UNIT); |
aec7ae7d JJ |
3856 | |
3857 | if (TREE_CODE (base) == MEM_REF) | |
3858 | { | |
3859 | if (!integer_zerop (TREE_OPERAND (base, 1))) | |
3860 | { | |
3861 | if (off == NULL_TREE) | |
aca52e6f | 3862 | off = wide_int_to_tree (sizetype, mem_ref_offset (base)); |
aec7ae7d JJ |
3863 | else |
3864 | off = size_binop (PLUS_EXPR, off, | |
3865 | fold_convert (sizetype, TREE_OPERAND (base, 1))); | |
3866 | } | |
3867 | base = TREE_OPERAND (base, 0); | |
3868 | } | |
3869 | else | |
3870 | base = build_fold_addr_expr (base); | |
3871 | ||
3872 | if (off == NULL_TREE) | |
3873 | off = size_zero_node; | |
3874 | ||
3875 | /* If base is not loop invariant, either off is 0, then we start with just | |
3876 | the constant offset in the loop invariant BASE and continue with base | |
3877 | as OFF, otherwise give up. | |
3878 | We could handle that case by gimplifying the addition of base + off | |
3879 | into some SSA_NAME and use that as off, but for now punt. */ | |
3880 | if (!expr_invariant_in_loop_p (loop, base)) | |
3881 | { | |
3882 | if (!integer_zerop (off)) | |
134c85ca | 3883 | return false; |
aec7ae7d | 3884 | off = base; |
f37fac2b | 3885 | base = size_int (pbytepos); |
aec7ae7d JJ |
3886 | } |
3887 | /* Otherwise put base + constant offset into the loop invariant BASE | |
3888 | and continue with OFF. */ | |
3889 | else | |
3890 | { | |
3891 | base = fold_convert (sizetype, base); | |
f37fac2b | 3892 | base = size_binop (PLUS_EXPR, base, size_int (pbytepos)); |
aec7ae7d JJ |
3893 | } |
3894 | ||
3895 | /* OFF at this point may be either a SSA_NAME or some tree expression | |
3896 | from get_inner_reference. Try to peel off loop invariants from it | |
3897 | into BASE as long as possible. */ | |
3898 | STRIP_NOPS (off); | |
3899 | while (offtype == NULL_TREE) | |
3900 | { | |
3901 | enum tree_code code; | |
3902 | tree op0, op1, add = NULL_TREE; | |
3903 | ||
3904 | if (TREE_CODE (off) == SSA_NAME) | |
3905 | { | |
355fe088 | 3906 | gimple *def_stmt = SSA_NAME_DEF_STMT (off); |
aec7ae7d JJ |
3907 | |
3908 | if (expr_invariant_in_loop_p (loop, off)) | |
134c85ca | 3909 | return false; |
aec7ae7d JJ |
3910 | |
3911 | if (gimple_code (def_stmt) != GIMPLE_ASSIGN) | |
3912 | break; | |
3913 | ||
3914 | op0 = gimple_assign_rhs1 (def_stmt); | |
3915 | code = gimple_assign_rhs_code (def_stmt); | |
3916 | op1 = gimple_assign_rhs2 (def_stmt); | |
3917 | } | |
3918 | else | |
3919 | { | |
3920 | if (get_gimple_rhs_class (TREE_CODE (off)) == GIMPLE_TERNARY_RHS) | |
134c85ca | 3921 | return false; |
aec7ae7d JJ |
3922 | code = TREE_CODE (off); |
3923 | extract_ops_from_tree (off, &code, &op0, &op1); | |
3924 | } | |
3925 | switch (code) | |
3926 | { | |
3927 | case POINTER_PLUS_EXPR: | |
3928 | case PLUS_EXPR: | |
3929 | if (expr_invariant_in_loop_p (loop, op0)) | |
3930 | { | |
3931 | add = op0; | |
3932 | off = op1; | |
3933 | do_add: | |
3934 | add = fold_convert (sizetype, add); | |
3935 | if (scale != 1) | |
3936 | add = size_binop (MULT_EXPR, add, size_int (scale)); | |
3937 | base = size_binop (PLUS_EXPR, base, add); | |
3938 | continue; | |
3939 | } | |
3940 | if (expr_invariant_in_loop_p (loop, op1)) | |
3941 | { | |
3942 | add = op1; | |
3943 | off = op0; | |
3944 | goto do_add; | |
3945 | } | |
3946 | break; | |
3947 | case MINUS_EXPR: | |
3948 | if (expr_invariant_in_loop_p (loop, op1)) | |
3949 | { | |
3950 | add = fold_convert (sizetype, op1); | |
3951 | add = size_binop (MINUS_EXPR, size_zero_node, add); | |
3952 | off = op0; | |
3953 | goto do_add; | |
3954 | } | |
3955 | break; | |
3956 | case MULT_EXPR: | |
9541ffee | 3957 | if (scale == 1 && tree_fits_shwi_p (op1)) |
aec7ae7d | 3958 | { |
bfaa08b7 RS |
3959 | int new_scale = tree_to_shwi (op1); |
3960 | /* Only treat this as a scaling operation if the target | |
09eb042a | 3961 | supports it for at least some offset type. */ |
bfaa08b7 | 3962 | if (use_ifn_p |
09eb042a RS |
3963 | && !vect_gather_scatter_fn_p (loop_vinfo, DR_IS_READ (dr), |
3964 | masked_p, vectype, memory_type, | |
3965 | signed_char_type_node, | |
3966 | new_scale, &ifn, | |
3967 | &offset_vectype) | |
3968 | && !vect_gather_scatter_fn_p (loop_vinfo, DR_IS_READ (dr), | |
3969 | masked_p, vectype, memory_type, | |
3970 | unsigned_char_type_node, | |
bfaa08b7 | 3971 | new_scale, &ifn, |
09eb042a | 3972 | &offset_vectype)) |
bfaa08b7 RS |
3973 | break; |
3974 | scale = new_scale; | |
aec7ae7d JJ |
3975 | off = op0; |
3976 | continue; | |
3977 | } | |
3978 | break; | |
3979 | case SSA_NAME: | |
3980 | off = op0; | |
3981 | continue; | |
3982 | CASE_CONVERT: | |
3983 | if (!POINTER_TYPE_P (TREE_TYPE (op0)) | |
3984 | && !INTEGRAL_TYPE_P (TREE_TYPE (op0))) | |
3985 | break; | |
09eb042a RS |
3986 | |
3987 | /* Don't include the conversion if the target is happy with | |
3988 | the current offset type. */ | |
3989 | if (use_ifn_p | |
3990 | && vect_gather_scatter_fn_p (loop_vinfo, DR_IS_READ (dr), | |
3991 | masked_p, vectype, memory_type, | |
3992 | TREE_TYPE (off), scale, &ifn, | |
3993 | &offset_vectype)) | |
3994 | break; | |
3995 | ||
aec7ae7d JJ |
3996 | if (TYPE_PRECISION (TREE_TYPE (op0)) |
3997 | == TYPE_PRECISION (TREE_TYPE (off))) | |
3998 | { | |
3999 | off = op0; | |
4000 | continue; | |
4001 | } | |
bfaa08b7 | 4002 | |
aec7ae7d JJ |
4003 | if (TYPE_PRECISION (TREE_TYPE (op0)) |
4004 | < TYPE_PRECISION (TREE_TYPE (off))) | |
4005 | { | |
4006 | off = op0; | |
4007 | offtype = TREE_TYPE (off); | |
4008 | STRIP_NOPS (off); | |
4009 | continue; | |
4010 | } | |
4011 | break; | |
4012 | default: | |
4013 | break; | |
4014 | } | |
4015 | break; | |
4016 | } | |
4017 | ||
4018 | /* If at the end OFF still isn't a SSA_NAME or isn't | |
4019 | defined in the loop, punt. */ | |
4020 | if (TREE_CODE (off) != SSA_NAME | |
4021 | || expr_invariant_in_loop_p (loop, off)) | |
134c85ca | 4022 | return false; |
aec7ae7d JJ |
4023 | |
4024 | if (offtype == NULL_TREE) | |
4025 | offtype = TREE_TYPE (off); | |
4026 | ||
bfaa08b7 RS |
4027 | if (use_ifn_p) |
4028 | { | |
09eb042a RS |
4029 | if (!vect_gather_scatter_fn_p (loop_vinfo, DR_IS_READ (dr), masked_p, |
4030 | vectype, memory_type, offtype, scale, | |
4031 | &ifn, &offset_vectype)) | |
bfaa08b7 RS |
4032 | return false; |
4033 | } | |
3bab6342 | 4034 | else |
bfaa08b7 RS |
4035 | { |
4036 | if (DR_IS_READ (dr)) | |
ab2fc782 RS |
4037 | { |
4038 | if (targetm.vectorize.builtin_gather) | |
4039 | decl = targetm.vectorize.builtin_gather (vectype, offtype, scale); | |
4040 | } | |
bfaa08b7 | 4041 | else |
ab2fc782 RS |
4042 | { |
4043 | if (targetm.vectorize.builtin_scatter) | |
4044 | decl = targetm.vectorize.builtin_scatter (vectype, offtype, scale); | |
4045 | } | |
3bab6342 | 4046 | |
bfaa08b7 RS |
4047 | if (!decl) |
4048 | return false; | |
4049 | ||
4050 | ifn = IFN_LAST; | |
09eb042a RS |
4051 | /* The offset vector type will be read from DECL when needed. */ |
4052 | offset_vectype = NULL_TREE; | |
bfaa08b7 | 4053 | } |
134c85ca | 4054 | |
bfaa08b7 | 4055 | info->ifn = ifn; |
134c85ca RS |
4056 | info->decl = decl; |
4057 | info->base = base; | |
4058 | info->offset = off; | |
4059 | info->offset_dt = vect_unknown_def_type; | |
09eb042a | 4060 | info->offset_vectype = offset_vectype; |
134c85ca | 4061 | info->scale = scale; |
09eb042a | 4062 | info->element_type = TREE_TYPE (vectype); |
bfaa08b7 | 4063 | info->memory_type = memory_type; |
134c85ca | 4064 | return true; |
aec7ae7d JJ |
4065 | } |
4066 | ||
8e846c66 RB |
4067 | /* Find the data references in STMT, analyze them with respect to LOOP and |
4068 | append them to DATAREFS. Return false if datarefs in this stmt cannot | |
4069 | be handled. */ | |
4070 | ||
f4ebbd24 | 4071 | opt_result |
8e846c66 RB |
4072 | vect_find_stmt_data_reference (loop_p loop, gimple *stmt, |
4073 | vec<data_reference_p> *datarefs) | |
4074 | { | |
4075 | /* We can ignore clobbers for dataref analysis - they are removed during | |
4076 | loop vectorization and BB vectorization checks dependences with a | |
4077 | stmt walk. */ | |
4078 | if (gimple_clobber_p (stmt)) | |
f4ebbd24 | 4079 | return opt_result::success (); |
8e846c66 RB |
4080 | |
4081 | if (gimple_has_volatile_ops (stmt)) | |
f4ebbd24 DM |
4082 | return opt_result::failure_at (stmt, "not vectorized: volatile type: %G", |
4083 | stmt); | |
8e846c66 | 4084 | |
36bbc05d | 4085 | if (stmt_can_throw_internal (cfun, stmt)) |
f4ebbd24 DM |
4086 | return opt_result::failure_at (stmt, |
4087 | "not vectorized:" | |
4088 | " statement can throw an exception: %G", | |
4089 | stmt); | |
8e846c66 RB |
4090 | |
4091 | auto_vec<data_reference_p, 2> refs; | |
f4ebbd24 DM |
4092 | opt_result res = find_data_references_in_stmt (loop, stmt, &refs); |
4093 | if (!res) | |
4094 | return res; | |
8e846c66 RB |
4095 | |
4096 | if (refs.is_empty ()) | |
f4ebbd24 | 4097 | return opt_result::success (); |
8e846c66 RB |
4098 | |
4099 | if (refs.length () > 1) | |
f4ebbd24 DM |
4100 | return opt_result::failure_at (stmt, |
4101 | "not vectorized:" | |
4102 | " more than one data ref in stmt: %G", stmt); | |
8e846c66 RB |
4103 | |
4104 | if (gcall *call = dyn_cast <gcall *> (stmt)) | |
4105 | if (!gimple_call_internal_p (call) | |
4106 | || (gimple_call_internal_fn (call) != IFN_MASK_LOAD | |
4107 | && gimple_call_internal_fn (call) != IFN_MASK_STORE)) | |
f4ebbd24 DM |
4108 | return opt_result::failure_at (stmt, |
4109 | "not vectorized: dr in a call %G", stmt); | |
8e846c66 RB |
4110 | |
4111 | data_reference_p dr = refs.pop (); | |
4112 | if (TREE_CODE (DR_REF (dr)) == COMPONENT_REF | |
4113 | && DECL_BIT_FIELD (TREE_OPERAND (DR_REF (dr), 1))) | |
f4ebbd24 DM |
4114 | return opt_result::failure_at (stmt, |
4115 | "not vectorized:" | |
4116 | " statement is bitfield access %G", stmt); | |
8e846c66 RB |
4117 | |
4118 | if (DR_BASE_ADDRESS (dr) | |
4119 | && TREE_CODE (DR_BASE_ADDRESS (dr)) == INTEGER_CST) | |
f4ebbd24 DM |
4120 | return opt_result::failure_at (stmt, |
4121 | "not vectorized:" | |
4122 | " base addr of dr is a constant\n"); | |
8e846c66 | 4123 | |
f2227a66 RB |
4124 | /* Check whether this may be a SIMD lane access and adjust the |
4125 | DR to make it easier for us to handle it. */ | |
4126 | if (loop | |
4127 | && loop->simduid | |
4128 | && (!DR_BASE_ADDRESS (dr) | |
4129 | || !DR_OFFSET (dr) | |
4130 | || !DR_INIT (dr) | |
4131 | || !DR_STEP (dr))) | |
4132 | { | |
4133 | struct data_reference *newdr | |
4134 | = create_data_ref (NULL, loop_containing_stmt (stmt), DR_REF (dr), stmt, | |
4135 | DR_IS_READ (dr), DR_IS_CONDITIONAL_IN_STMT (dr)); | |
4136 | if (DR_BASE_ADDRESS (newdr) | |
4137 | && DR_OFFSET (newdr) | |
4138 | && DR_INIT (newdr) | |
4139 | && DR_STEP (newdr) | |
c13c129f | 4140 | && TREE_CODE (DR_INIT (newdr)) == INTEGER_CST |
f2227a66 RB |
4141 | && integer_zerop (DR_STEP (newdr))) |
4142 | { | |
c13c129f | 4143 | tree base_address = DR_BASE_ADDRESS (newdr); |
f2227a66 | 4144 | tree off = DR_OFFSET (newdr); |
080c269b | 4145 | tree step = ssize_int (1); |
c13c129f JJ |
4146 | if (integer_zerop (off) |
4147 | && TREE_CODE (base_address) == POINTER_PLUS_EXPR) | |
4148 | { | |
4149 | off = TREE_OPERAND (base_address, 1); | |
4150 | base_address = TREE_OPERAND (base_address, 0); | |
4151 | } | |
f2227a66 | 4152 | STRIP_NOPS (off); |
080c269b | 4153 | if (TREE_CODE (off) == MULT_EXPR |
f2227a66 RB |
4154 | && tree_fits_uhwi_p (TREE_OPERAND (off, 1))) |
4155 | { | |
080c269b | 4156 | step = TREE_OPERAND (off, 1); |
f2227a66 RB |
4157 | off = TREE_OPERAND (off, 0); |
4158 | STRIP_NOPS (off); | |
080c269b | 4159 | } |
c13c129f JJ |
4160 | if (CONVERT_EXPR_P (off) |
4161 | && (TYPE_PRECISION (TREE_TYPE (TREE_OPERAND (off, 0))) | |
4162 | < TYPE_PRECISION (TREE_TYPE (off)))) | |
4163 | off = TREE_OPERAND (off, 0); | |
4164 | if (TREE_CODE (off) == SSA_NAME) | |
080c269b | 4165 | { |
c13c129f JJ |
4166 | gimple *def = SSA_NAME_DEF_STMT (off); |
4167 | /* Look through widening conversion. */ | |
4168 | if (is_gimple_assign (def) | |
4169 | && CONVERT_EXPR_CODE_P (gimple_assign_rhs_code (def))) | |
4170 | { | |
4171 | tree rhs1 = gimple_assign_rhs1 (def); | |
4172 | if (TREE_CODE (rhs1) == SSA_NAME | |
4173 | && INTEGRAL_TYPE_P (TREE_TYPE (rhs1)) | |
4174 | && (TYPE_PRECISION (TREE_TYPE (off)) | |
4175 | > TYPE_PRECISION (TREE_TYPE (rhs1)))) | |
4176 | def = SSA_NAME_DEF_STMT (rhs1); | |
4177 | } | |
4178 | if (is_gimple_call (def) | |
4179 | && gimple_call_internal_p (def) | |
4180 | && (gimple_call_internal_fn (def) == IFN_GOMP_SIMD_LANE)) | |
f2227a66 | 4181 | { |
c13c129f | 4182 | tree arg = gimple_call_arg (def, 0); |
f2227a66 | 4183 | tree reft = TREE_TYPE (DR_REF (newdr)); |
c13c129f JJ |
4184 | gcc_assert (TREE_CODE (arg) == SSA_NAME); |
4185 | arg = SSA_NAME_VAR (arg); | |
4186 | if (arg == loop->simduid | |
4187 | /* For now. */ | |
4188 | && tree_int_cst_equal (TYPE_SIZE_UNIT (reft), step)) | |
f2227a66 | 4189 | { |
c13c129f JJ |
4190 | DR_BASE_ADDRESS (newdr) = base_address; |
4191 | DR_OFFSET (newdr) = ssize_int (0); | |
4192 | DR_STEP (newdr) = step; | |
4193 | DR_OFFSET_ALIGNMENT (newdr) = BIGGEST_ALIGNMENT; | |
4194 | DR_STEP_ALIGNMENT (newdr) = highest_pow2_factor (step); | |
4195 | /* Mark as simd-lane access. */ | |
4196 | tree arg2 = gimple_call_arg (def, 1); | |
4197 | newdr->aux = (void *) (-1 - tree_to_uhwi (arg2)); | |
4198 | free_data_ref (dr); | |
4199 | datarefs->safe_push (newdr); | |
4200 | return opt_result::success (); | |
f2227a66 RB |
4201 | } |
4202 | } | |
4203 | } | |
4204 | } | |
4205 | free_data_ref (newdr); | |
4206 | } | |
4207 | ||
8e846c66 | 4208 | datarefs->safe_push (dr); |
f4ebbd24 | 4209 | return opt_result::success (); |
8e846c66 RB |
4210 | } |
4211 | ||
ebfd146a IR |
4212 | /* Function vect_analyze_data_refs. |
4213 | ||
a70d6342 | 4214 | Find all the data references in the loop or basic block. |
ebfd146a IR |
4215 | |
4216 | The general structure of the analysis of data refs in the vectorizer is as | |
4217 | follows: | |
b8698a0f | 4218 | 1- vect_analyze_data_refs(loop/bb): call |
a70d6342 IR |
4219 | compute_data_dependences_for_loop/bb to find and analyze all data-refs |
4220 | in the loop/bb and their dependences. | |
ebfd146a IR |
4221 | 2- vect_analyze_dependences(): apply dependence testing using ddrs. |
4222 | 3- vect_analyze_drs_alignment(): check that ref_stmt.alignment is ok. | |
4223 | 4- vect_analyze_drs_access(): check that ref_stmt.step is ok. | |
4224 | ||
4225 | */ | |
4226 | ||
f4ebbd24 | 4227 | opt_result |
a7b3509e | 4228 | vect_analyze_data_refs (vec_info *vinfo, poly_uint64 *min_vf, bool *fatal) |
ebfd146a | 4229 | { |
99b1c316 | 4230 | class loop *loop = NULL; |
ebfd146a | 4231 | unsigned int i; |
ebfd146a IR |
4232 | struct data_reference *dr; |
4233 | tree scalar_type; | |
4234 | ||
adac3a68 | 4235 | DUMP_VECT_SCOPE ("vect_analyze_data_refs"); |
b8698a0f | 4236 | |
310213d4 | 4237 | if (loop_vec_info loop_vinfo = dyn_cast <loop_vec_info> (vinfo)) |
428db0ba | 4238 | loop = LOOP_VINFO_LOOP (loop_vinfo); |
ebfd146a | 4239 | |
ff802fa1 IR |
4240 | /* Go through the data-refs, check that the analysis succeeded. Update |
4241 | pointer from stmt_vec_info struct to DR and vectype. */ | |
ebfd146a | 4242 | |
ca823c85 | 4243 | vec<data_reference_p> datarefs = vinfo->shared->datarefs; |
9771b263 | 4244 | FOR_EACH_VEC_ELT (datarefs, i, dr) |
ebfd146a | 4245 | { |
3bab6342 | 4246 | enum { SG_NONE, GATHER, SCATTER } gatherscatter = SG_NONE; |
d9f21f6a | 4247 | poly_uint64 vf; |
b8698a0f | 4248 | |
8e846c66 | 4249 | gcc_assert (DR_REF (dr)); |
f44fb7aa RS |
4250 | stmt_vec_info stmt_info = vinfo->lookup_stmt (DR_STMT (dr)); |
4251 | gcc_assert (!stmt_info->dr_aux.dr); | |
4252 | stmt_info->dr_aux.dr = dr; | |
4253 | stmt_info->dr_aux.stmt = stmt_info; | |
ebfd146a IR |
4254 | |
4255 | /* Check that analysis of the data-ref succeeded. */ | |
4256 | if (!DR_BASE_ADDRESS (dr) || !DR_OFFSET (dr) || !DR_INIT (dr) | |
aec7ae7d | 4257 | || !DR_STEP (dr)) |
ebfd146a | 4258 | { |
74bf76ed JJ |
4259 | bool maybe_gather |
4260 | = DR_IS_READ (dr) | |
aec7ae7d | 4261 | && !TREE_THIS_VOLATILE (DR_REF (dr)) |
bfaa08b7 RS |
4262 | && (targetm.vectorize.builtin_gather != NULL |
4263 | || supports_vec_gather_load_p ()); | |
3bab6342 AT |
4264 | bool maybe_scatter |
4265 | = DR_IS_WRITE (dr) | |
4266 | && !TREE_THIS_VOLATILE (DR_REF (dr)) | |
f307441a RS |
4267 | && (targetm.vectorize.builtin_scatter != NULL |
4268 | || supports_vec_scatter_store_p ()); | |
74bf76ed | 4269 | |
f2227a66 RB |
4270 | /* If target supports vector gather loads or scatter stores, |
4271 | see if they can't be used. */ | |
310213d4 | 4272 | if (is_a <loop_vec_info> (vinfo) |
78e02b3b | 4273 | && !nested_in_vect_loop_p (loop, stmt_info)) |
aec7ae7d | 4274 | { |
f2227a66 | 4275 | if (maybe_gather || maybe_scatter) |
5fa23466 RB |
4276 | { |
4277 | if (maybe_gather) | |
4278 | gatherscatter = GATHER; | |
4279 | else | |
4280 | gatherscatter = SCATTER; | |
aec7ae7d | 4281 | } |
aec7ae7d | 4282 | } |
4b5caab7 | 4283 | |
f2227a66 | 4284 | if (gatherscatter == SG_NONE) |
aec7ae7d | 4285 | { |
73fbfcad | 4286 | if (dump_enabled_p ()) |
3c2a8ed0 DM |
4287 | dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location, |
4288 | "not vectorized: data ref analysis " | |
4289 | "failed %G", stmt_info->stmt); | |
310213d4 | 4290 | if (is_a <bb_vec_info> (vinfo)) |
27312bf2 RB |
4291 | { |
4292 | /* In BB vectorization the ref can still participate | |
4293 | in dependence analysis, we just can't vectorize it. */ | |
4294 | STMT_VINFO_VECTORIZABLE (stmt_info) = false; | |
4295 | continue; | |
4296 | } | |
f4ebbd24 DM |
4297 | return opt_result::failure_at (stmt_info->stmt, |
4298 | "not vectorized:" | |
4299 | " data ref analysis failed: %G", | |
4300 | stmt_info->stmt); | |
aec7ae7d | 4301 | } |
ebfd146a IR |
4302 | } |
4303 | ||
f2227a66 | 4304 | /* See if this was detected as SIMD lane access. */ |
0356aab8 JJ |
4305 | if (dr->aux == (void *)-1 |
4306 | || dr->aux == (void *)-2 | |
1612b1fe JJ |
4307 | || dr->aux == (void *)-3 |
4308 | || dr->aux == (void *)-4) | |
f2227a66 | 4309 | { |
78e02b3b | 4310 | if (nested_in_vect_loop_p (loop, stmt_info)) |
f4ebbd24 DM |
4311 | return opt_result::failure_at (stmt_info->stmt, |
4312 | "not vectorized:" | |
4313 | " data ref analysis failed: %G", | |
4314 | stmt_info->stmt); | |
0356aab8 JJ |
4315 | STMT_VINFO_SIMD_LANE_ACCESS_P (stmt_info) |
4316 | = -(uintptr_t) dr->aux; | |
f2227a66 RB |
4317 | } |
4318 | ||
5fa23466 RB |
4319 | tree base = get_base_address (DR_REF (dr)); |
4320 | if (base && VAR_P (base) && DECL_NONALIASED (base)) | |
508ef0c6 | 4321 | { |
73fbfcad | 4322 | if (dump_enabled_p ()) |
3c2a8ed0 DM |
4323 | dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location, |
4324 | "not vectorized: base object not addressable " | |
4325 | "for stmt: %G", stmt_info->stmt); | |
310213d4 | 4326 | if (is_a <bb_vec_info> (vinfo)) |
8e846c66 RB |
4327 | { |
4328 | /* In BB vectorization the ref can still participate | |
4329 | in dependence analysis, we just can't vectorize it. */ | |
4330 | STMT_VINFO_VECTORIZABLE (stmt_info) = false; | |
4331 | continue; | |
4332 | } | |
f4ebbd24 DM |
4333 | return opt_result::failure_at (stmt_info->stmt, |
4334 | "not vectorized: base object not" | |
4335 | " addressable for stmt: %G", | |
4336 | stmt_info->stmt); | |
508ef0c6 RG |
4337 | } |
4338 | ||
8e846c66 | 4339 | if (is_a <loop_vec_info> (vinfo) |
5fa23466 | 4340 | && DR_STEP (dr) |
8e846c66 | 4341 | && TREE_CODE (DR_STEP (dr)) != INTEGER_CST) |
9c239085 | 4342 | { |
78e02b3b | 4343 | if (nested_in_vect_loop_p (loop, stmt_info)) |
f4ebbd24 | 4344 | return opt_result::failure_at (stmt_info->stmt, |
f8cb8bcd | 4345 | "not vectorized: " |
f4ebbd24 DM |
4346 | "not suitable for strided load %G", |
4347 | stmt_info->stmt); | |
8e846c66 | 4348 | STMT_VINFO_STRIDED_P (stmt_info) = true; |
9c239085 JJ |
4349 | } |
4350 | ||
ebfd146a | 4351 | /* Update DR field in stmt_vec_info struct. */ |
ebfd146a IR |
4352 | |
4353 | /* If the dataref is in an inner-loop of the loop that is considered for | |
4354 | for vectorization, we also want to analyze the access relative to | |
b8698a0f | 4355 | the outer-loop (DR contains information only relative to the |
ebfd146a IR |
4356 | inner-most enclosing loop). We do that by building a reference to the |
4357 | first location accessed by the inner-loop, and analyze it relative to | |
b8698a0f | 4358 | the outer-loop. */ |
78e02b3b | 4359 | if (loop && nested_in_vect_loop_p (loop, stmt_info)) |
ebfd146a | 4360 | { |
b8698a0f | 4361 | /* Build a reference to the first location accessed by the |
bb642979 RS |
4362 | inner loop: *(BASE + INIT + OFFSET). By construction, |
4363 | this address must be invariant in the inner loop, so we | |
4364 | can consider it as being used in the outer loop. */ | |
8e846c66 RB |
4365 | tree base = unshare_expr (DR_BASE_ADDRESS (dr)); |
4366 | tree offset = unshare_expr (DR_OFFSET (dr)); | |
4367 | tree init = unshare_expr (DR_INIT (dr)); | |
bb642979 RS |
4368 | tree init_offset = fold_build2 (PLUS_EXPR, TREE_TYPE (offset), |
4369 | init, offset); | |
4370 | tree init_addr = fold_build_pointer_plus (base, init_offset); | |
4371 | tree init_ref = build_fold_indirect_ref (init_addr); | |
ebfd146a | 4372 | |
73fbfcad | 4373 | if (dump_enabled_p ()) |
3c2a8ed0 DM |
4374 | dump_printf_loc (MSG_NOTE, vect_location, |
4375 | "analyze in outer loop: %T\n", init_ref); | |
ebfd146a | 4376 | |
f4ebbd24 DM |
4377 | opt_result res |
4378 | = dr_analyze_innermost (&STMT_VINFO_DR_WRT_VEC_LOOP (stmt_info), | |
4379 | init_ref, loop, stmt_info->stmt); | |
4380 | if (!res) | |
bb642979 | 4381 | /* dr_analyze_innermost already explained the failure. */ |
f4ebbd24 | 4382 | return res; |
ebfd146a | 4383 | |
73fbfcad | 4384 | if (dump_enabled_p ()) |
3c2a8ed0 DM |
4385 | dump_printf_loc (MSG_NOTE, vect_location, |
4386 | "\touter base_address: %T\n" | |
4387 | "\touter offset from base address: %T\n" | |
4388 | "\touter constant offset from base address: %T\n" | |
4389 | "\touter step: %T\n" | |
4390 | "\touter base alignment: %d\n\n" | |
4391 | "\touter base misalignment: %d\n" | |
4392 | "\touter offset alignment: %d\n" | |
4393 | "\touter step alignment: %d\n", | |
4394 | STMT_VINFO_DR_BASE_ADDRESS (stmt_info), | |
4395 | STMT_VINFO_DR_OFFSET (stmt_info), | |
4396 | STMT_VINFO_DR_INIT (stmt_info), | |
4397 | STMT_VINFO_DR_STEP (stmt_info), | |
4398 | STMT_VINFO_DR_BASE_ALIGNMENT (stmt_info), | |
4399 | STMT_VINFO_DR_BASE_MISALIGNMENT (stmt_info), | |
4400 | STMT_VINFO_DR_OFFSET_ALIGNMENT (stmt_info), | |
4401 | STMT_VINFO_DR_STEP_ALIGNMENT (stmt_info)); | |
ebfd146a IR |
4402 | } |
4403 | ||
ebfd146a IR |
4404 | /* Set vectype for STMT. */ |
4405 | scalar_type = TREE_TYPE (DR_REF (dr)); | |
9b75f56d RS |
4406 | tree vectype = get_vectype_for_scalar_type (vinfo, scalar_type); |
4407 | if (!vectype) | |
ebfd146a | 4408 | { |
73fbfcad | 4409 | if (dump_enabled_p ()) |
ebfd146a | 4410 | { |
e645e942 | 4411 | dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location, |
3c2a8ed0 DM |
4412 | "not vectorized: no vectype for stmt: %G", |
4413 | stmt_info->stmt); | |
78c60e3d SS |
4414 | dump_printf (MSG_MISSED_OPTIMIZATION, " scalar_type: "); |
4415 | dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_DETAILS, | |
4416 | scalar_type); | |
e645e942 | 4417 | dump_printf (MSG_MISSED_OPTIMIZATION, "\n"); |
ebfd146a | 4418 | } |
4b5caab7 | 4419 | |
310213d4 | 4420 | if (is_a <bb_vec_info> (vinfo)) |
64900538 RB |
4421 | { |
4422 | /* No vector type is fine, the ref can still participate | |
4423 | in dependence analysis, we just can't vectorize it. */ | |
4424 | STMT_VINFO_VECTORIZABLE (stmt_info) = false; | |
4425 | continue; | |
4426 | } | |
1f88cc26 RB |
4427 | if (fatal) |
4428 | *fatal = false; | |
f4ebbd24 DM |
4429 | return opt_result::failure_at (stmt_info->stmt, |
4430 | "not vectorized:" | |
4431 | " no vectype for stmt: %G" | |
4432 | " scalar_type: %T\n", | |
4433 | stmt_info->stmt, scalar_type); | |
ebfd146a | 4434 | } |
451dabda RB |
4435 | else |
4436 | { | |
4437 | if (dump_enabled_p ()) | |
3c2a8ed0 DM |
4438 | dump_printf_loc (MSG_NOTE, vect_location, |
4439 | "got vectype for stmt: %G%T\n", | |
9b75f56d | 4440 | stmt_info->stmt, vectype); |
451dabda | 4441 | } |
777e1f09 RG |
4442 | |
4443 | /* Adjust the minimal vectorization factor according to the | |
4444 | vector type. */ | |
9b75f56d | 4445 | vf = TYPE_VECTOR_SUBPARTS (vectype); |
d9f21f6a | 4446 | *min_vf = upper_bound (*min_vf, vf); |
aec7ae7d | 4447 | |
9b75f56d RS |
4448 | /* Leave the BB vectorizer to pick the vector type later, based on |
4449 | the final dataref group size and SLP node size. */ | |
4450 | if (is_a <loop_vec_info> (vinfo)) | |
4451 | STMT_VINFO_VECTYPE (stmt_info) = vectype; | |
4452 | ||
3bab6342 | 4453 | if (gatherscatter != SG_NONE) |
aec7ae7d | 4454 | { |
134c85ca | 4455 | gather_scatter_info gs_info; |
78e02b3b RS |
4456 | if (!vect_check_gather_scatter (stmt_info, |
4457 | as_a <loop_vec_info> (vinfo), | |
134c85ca | 4458 | &gs_info) |
7ed54790 RS |
4459 | || !get_vectype_for_scalar_type (vinfo, |
4460 | TREE_TYPE (gs_info.offset))) | |
a7b3509e JJ |
4461 | { |
4462 | if (fatal) | |
4463 | *fatal = false; | |
4464 | return opt_result::failure_at | |
4465 | (stmt_info->stmt, | |
4466 | (gatherscatter == GATHER) | |
4467 | ? "not vectorized: not suitable for gather load %G" | |
4468 | : "not vectorized: not suitable for scatter store %G", | |
4469 | stmt_info->stmt); | |
4470 | } | |
3bab6342 | 4471 | STMT_VINFO_GATHER_SCATTER_P (stmt_info) = gatherscatter; |
319e6439 | 4472 | } |
ebfd146a | 4473 | } |
b8698a0f | 4474 | |
27312bf2 RB |
4475 | /* We used to stop processing and prune the list here. Verify we no |
4476 | longer need to. */ | |
4477 | gcc_assert (i == datarefs.length ()); | |
fcac74a1 | 4478 | |
f4ebbd24 | 4479 | return opt_result::success (); |
ebfd146a IR |
4480 | } |
4481 | ||
4482 | ||
4483 | /* Function vect_get_new_vect_var. | |
4484 | ||
ff802fa1 | 4485 | Returns a name for a new variable. The current naming scheme appends the |
b8698a0f L |
4486 | prefix "vect_" or "vect_p" (depending on the value of VAR_KIND) to |
4487 | the name of vectorizer generated variables, and appends that to NAME if | |
ebfd146a IR |
4488 | provided. */ |
4489 | ||
4490 | tree | |
4491 | vect_get_new_vect_var (tree type, enum vect_var_kind var_kind, const char *name) | |
4492 | { | |
4493 | const char *prefix; | |
4494 | tree new_vect_var; | |
4495 | ||
4496 | switch (var_kind) | |
4497 | { | |
4498 | case vect_simple_var: | |
451dabda | 4499 | prefix = "vect"; |
ebfd146a IR |
4500 | break; |
4501 | case vect_scalar_var: | |
451dabda | 4502 | prefix = "stmp"; |
ebfd146a | 4503 | break; |
42fd8198 IE |
4504 | case vect_mask_var: |
4505 | prefix = "mask"; | |
4506 | break; | |
ebfd146a | 4507 | case vect_pointer_var: |
451dabda | 4508 | prefix = "vectp"; |
ebfd146a IR |
4509 | break; |
4510 | default: | |
4511 | gcc_unreachable (); | |
4512 | } | |
4513 | ||
4514 | if (name) | |
4515 | { | |
451dabda | 4516 | char* tmp = concat (prefix, "_", name, NULL); |
65876d24 | 4517 | new_vect_var = create_tmp_reg (type, tmp); |
ebfd146a IR |
4518 | free (tmp); |
4519 | } | |
4520 | else | |
65876d24 | 4521 | new_vect_var = create_tmp_reg (type, prefix); |
ebfd146a IR |
4522 | |
4523 | return new_vect_var; | |
4524 | } | |
4525 | ||
0e22bb5a RB |
4526 | /* Like vect_get_new_vect_var but return an SSA name. */ |
4527 | ||
4528 | tree | |
4529 | vect_get_new_ssa_name (tree type, enum vect_var_kind var_kind, const char *name) | |
4530 | { | |
4531 | const char *prefix; | |
4532 | tree new_vect_var; | |
4533 | ||
4534 | switch (var_kind) | |
4535 | { | |
4536 | case vect_simple_var: | |
4537 | prefix = "vect"; | |
4538 | break; | |
4539 | case vect_scalar_var: | |
4540 | prefix = "stmp"; | |
4541 | break; | |
4542 | case vect_pointer_var: | |
4543 | prefix = "vectp"; | |
4544 | break; | |
4545 | default: | |
4546 | gcc_unreachable (); | |
4547 | } | |
4548 | ||
4549 | if (name) | |
4550 | { | |
4551 | char* tmp = concat (prefix, "_", name, NULL); | |
4552 | new_vect_var = make_temp_ssa_name (type, NULL, tmp); | |
4553 | free (tmp); | |
4554 | } | |
4555 | else | |
4556 | new_vect_var = make_temp_ssa_name (type, NULL, prefix); | |
4557 | ||
4558 | return new_vect_var; | |
4559 | } | |
4560 | ||
89fa689a | 4561 | /* Duplicate ptr info and set alignment/misaligment on NAME from DR_INFO. */ |
faf4220c JJ |
4562 | |
4563 | static void | |
89fa689a | 4564 | vect_duplicate_ssa_name_ptr_info (tree name, dr_vec_info *dr_info) |
faf4220c | 4565 | { |
89fa689a RS |
4566 | duplicate_ssa_name_ptr_info (name, DR_PTR_INFO (dr_info->dr)); |
4567 | int misalign = DR_MISALIGNMENT (dr_info); | |
8d21ff9f | 4568 | if (misalign == DR_MISALIGNMENT_UNKNOWN) |
faf4220c JJ |
4569 | mark_ptr_info_alignment_unknown (SSA_NAME_PTR_INFO (name)); |
4570 | else | |
f702e7d4 | 4571 | set_ptr_info_alignment (SSA_NAME_PTR_INFO (name), |
ca31798e AV |
4572 | known_alignment (DR_TARGET_ALIGNMENT (dr_info)), |
4573 | misalign); | |
faf4220c | 4574 | } |
ebfd146a IR |
4575 | |
4576 | /* Function vect_create_addr_base_for_vector_ref. | |
4577 | ||
4578 | Create an expression that computes the address of the first memory location | |
4579 | that will be accessed for a data reference. | |
4580 | ||
4581 | Input: | |
32e8e429 | 4582 | STMT_INFO: The statement containing the data reference. |
ebfd146a IR |
4583 | NEW_STMT_LIST: Must be initialized to NULL_TREE or a statement list. |
4584 | OFFSET: Optional. If supplied, it is be added to the initial address. | |
4585 | LOOP: Specify relative to which loop-nest should the address be computed. | |
4586 | For example, when the dataref is in an inner-loop nested in an | |
4587 | outer-loop that is now being vectorized, LOOP can be either the | |
ff802fa1 | 4588 | outer-loop, or the inner-loop. The first memory location accessed |
ebfd146a IR |
4589 | by the following dataref ('in' points to short): |
4590 | ||
4591 | for (i=0; i<N; i++) | |
4592 | for (j=0; j<M; j++) | |
4593 | s += in[i+j] | |
4594 | ||
4595 | is as follows: | |
4596 | if LOOP=i_loop: &in (relative to i_loop) | |
4597 | if LOOP=j_loop: &in+i*2B (relative to j_loop) | |
356bbc4c JJ |
4598 | BYTE_OFFSET: Optional, defaulted to NULL. If supplied, it is added to the |
4599 | initial address. Unlike OFFSET, which is number of elements to | |
4600 | be added, BYTE_OFFSET is measured in bytes. | |
ebfd146a IR |
4601 | |
4602 | Output: | |
b8698a0f | 4603 | 1. Return an SSA_NAME whose value is the address of the memory location of |
ebfd146a IR |
4604 | the first vector of the data reference. |
4605 | 2. If new_stmt_list is not NULL_TREE after return then the caller must insert | |
4606 | these statement(s) which define the returned SSA_NAME. | |
4607 | ||
4608 | FORNOW: We are only handling array accesses with step 1. */ | |
4609 | ||
4610 | tree | |
308bc496 | 4611 | vect_create_addr_base_for_vector_ref (vec_info *vinfo, stmt_vec_info stmt_info, |
ebfd146a IR |
4612 | gimple_seq *new_stmt_list, |
4613 | tree offset, | |
356bbc4c | 4614 | tree byte_offset) |
ebfd146a | 4615 | { |
89fa689a RS |
4616 | dr_vec_info *dr_info = STMT_VINFO_DR_INFO (stmt_info); |
4617 | struct data_reference *dr = dr_info->dr; | |
595c2679 | 4618 | const char *base_name; |
4bdd44c4 | 4619 | tree addr_base; |
ebfd146a IR |
4620 | tree dest; |
4621 | gimple_seq seq = NULL; | |
8644a673 | 4622 | tree vect_ptr_type; |
ebfd146a | 4623 | tree step = TYPE_SIZE_UNIT (TREE_TYPE (DR_REF (dr))); |
308bc496 RB |
4624 | loop_vec_info loop_vinfo = dyn_cast <loop_vec_info> (vinfo); |
4625 | innermost_loop_behavior *drb = vect_dr_behavior (vinfo, dr_info); | |
ebfd146a | 4626 | |
3f5e8a76 | 4627 | tree data_ref_base = unshare_expr (drb->base_address); |
308bc496 | 4628 | tree base_offset = unshare_expr (get_dr_vinfo_offset (vinfo, dr_info, true)); |
3f5e8a76 | 4629 | tree init = unshare_expr (drb->init); |
ebfd146a | 4630 | |
a70d6342 | 4631 | if (loop_vinfo) |
595c2679 | 4632 | base_name = get_name (data_ref_base); |
a70d6342 IR |
4633 | else |
4634 | { | |
4635 | base_offset = ssize_int (0); | |
4636 | init = ssize_int (0); | |
595c2679 | 4637 | base_name = get_name (DR_REF (dr)); |
b8698a0f | 4638 | } |
a70d6342 | 4639 | |
ebfd146a IR |
4640 | /* Create base_offset */ |
4641 | base_offset = size_binop (PLUS_EXPR, | |
4642 | fold_convert (sizetype, base_offset), | |
4643 | fold_convert (sizetype, init)); | |
ebfd146a IR |
4644 | |
4645 | if (offset) | |
4646 | { | |
ebfd146a IR |
4647 | offset = fold_build2 (MULT_EXPR, sizetype, |
4648 | fold_convert (sizetype, offset), step); | |
4649 | base_offset = fold_build2 (PLUS_EXPR, sizetype, | |
4650 | base_offset, offset); | |
ebfd146a | 4651 | } |
356bbc4c JJ |
4652 | if (byte_offset) |
4653 | { | |
4654 | byte_offset = fold_convert (sizetype, byte_offset); | |
4655 | base_offset = fold_build2 (PLUS_EXPR, sizetype, | |
4656 | base_offset, byte_offset); | |
4657 | } | |
ebfd146a IR |
4658 | |
4659 | /* base + base_offset */ | |
a70d6342 | 4660 | if (loop_vinfo) |
5d49b6a7 | 4661 | addr_base = fold_build_pointer_plus (data_ref_base, base_offset); |
a70d6342 IR |
4662 | else |
4663 | { | |
70f34814 RG |
4664 | addr_base = build1 (ADDR_EXPR, |
4665 | build_pointer_type (TREE_TYPE (DR_REF (dr))), | |
4666 | unshare_expr (DR_REF (dr))); | |
a70d6342 | 4667 | } |
b8698a0f | 4668 | |
ebfd146a | 4669 | vect_ptr_type = build_pointer_type (STMT_VINFO_VECTYPE (stmt_info)); |
4bdd44c4 | 4670 | dest = vect_get_new_vect_var (vect_ptr_type, vect_pointer_var, base_name); |
aed93b23 | 4671 | addr_base = force_gimple_operand (addr_base, &seq, true, dest); |
ebfd146a IR |
4672 | gimple_seq_add_seq (new_stmt_list, seq); |
4673 | ||
17fc049f | 4674 | if (DR_PTR_INFO (dr) |
aed93b23 RB |
4675 | && TREE_CODE (addr_base) == SSA_NAME |
4676 | && !SSA_NAME_PTR_INFO (addr_base)) | |
128aaeed | 4677 | { |
89fa689a | 4678 | vect_duplicate_ssa_name_ptr_info (addr_base, dr_info); |
faf4220c | 4679 | if (offset || byte_offset) |
4bdd44c4 | 4680 | mark_ptr_info_alignment_unknown (SSA_NAME_PTR_INFO (addr_base)); |
128aaeed | 4681 | } |
17fc049f | 4682 | |
73fbfcad | 4683 | if (dump_enabled_p ()) |
3c2a8ed0 | 4684 | dump_printf_loc (MSG_NOTE, vect_location, "created %T\n", addr_base); |
8644a673 | 4685 | |
4bdd44c4 | 4686 | return addr_base; |
ebfd146a IR |
4687 | } |
4688 | ||
4689 | ||
4690 | /* Function vect_create_data_ref_ptr. | |
4691 | ||
920e8172 | 4692 | Create a new pointer-to-AGGR_TYPE variable (ap), that points to the first |
32e8e429 | 4693 | location accessed in the loop by STMT_INFO, along with the def-use update |
920e8172 RS |
4694 | chain to appropriately advance the pointer through the loop iterations. |
4695 | Also set aliasing information for the pointer. This pointer is used by | |
4696 | the callers to this function to create a memory reference expression for | |
4697 | vector load/store access. | |
ebfd146a IR |
4698 | |
4699 | Input: | |
32e8e429 | 4700 | 1. STMT_INFO: a stmt that references memory. Expected to be of the form |
ebfd146a IR |
4701 | GIMPLE_ASSIGN <name, data-ref> or |
4702 | GIMPLE_ASSIGN <data-ref, name>. | |
920e8172 RS |
4703 | 2. AGGR_TYPE: the type of the reference, which should be either a vector |
4704 | or an array. | |
4705 | 3. AT_LOOP: the loop where the vector memref is to be created. | |
4706 | 4. OFFSET (optional): an offset to be added to the initial address accessed | |
32e8e429 | 4707 | by the data-ref in STMT_INFO. |
920e8172 RS |
4708 | 5. BSI: location where the new stmts are to be placed if there is no loop |
4709 | 6. ONLY_INIT: indicate if ap is to be updated in the loop, or remain | |
ebfd146a | 4710 | pointing to the initial address. |
356bbc4c | 4711 | 7. BYTE_OFFSET (optional, defaults to NULL): a byte offset to be added |
32e8e429 | 4712 | to the initial address accessed by the data-ref in STMT_INFO. This is |
356bbc4c JJ |
4713 | similar to OFFSET, but OFFSET is counted in elements, while BYTE_OFFSET |
4714 | in bytes. | |
ab2fc782 RS |
4715 | 8. IV_STEP (optional, defaults to NULL): the amount that should be added |
4716 | to the IV during each iteration of the loop. NULL says to move | |
4717 | by one copy of AGGR_TYPE up or down, depending on the step of the | |
4718 | data reference. | |
ebfd146a IR |
4719 | |
4720 | Output: | |
4721 | 1. Declare a new ptr to vector_type, and have it point to the base of the | |
4722 | data reference (initial addressed accessed by the data reference). | |
4723 | For example, for vector of type V8HI, the following code is generated: | |
4724 | ||
920e8172 RS |
4725 | v8hi *ap; |
4726 | ap = (v8hi *)initial_address; | |
ebfd146a IR |
4727 | |
4728 | if OFFSET is not supplied: | |
4729 | initial_address = &a[init]; | |
4730 | if OFFSET is supplied: | |
4731 | initial_address = &a[init + OFFSET]; | |
356bbc4c JJ |
4732 | if BYTE_OFFSET is supplied: |
4733 | initial_address = &a[init] + BYTE_OFFSET; | |
ebfd146a IR |
4734 | |
4735 | Return the initial_address in INITIAL_ADDRESS. | |
4736 | ||
4737 | 2. If ONLY_INIT is true, just return the initial pointer. Otherwise, also | |
b8698a0f | 4738 | update the pointer in each iteration of the loop. |
ebfd146a IR |
4739 | |
4740 | Return the increment stmt that updates the pointer in PTR_INCR. | |
4741 | ||
2d4bca81 | 4742 | 3. Return the pointer. */ |
ebfd146a IR |
4743 | |
4744 | tree | |
308bc496 RB |
4745 | vect_create_data_ref_ptr (vec_info *vinfo, stmt_vec_info stmt_info, |
4746 | tree aggr_type, class loop *at_loop, tree offset, | |
32e8e429 | 4747 | tree *initial_address, gimple_stmt_iterator *gsi, |
2d4bca81 | 4748 | gimple **ptr_incr, bool only_init, |
32e8e429 | 4749 | tree byte_offset, tree iv_step) |
ebfd146a | 4750 | { |
595c2679 | 4751 | const char *base_name; |
308bc496 | 4752 | loop_vec_info loop_vinfo = dyn_cast <loop_vec_info> (vinfo); |
99b1c316 | 4753 | class loop *loop = NULL; |
a70d6342 | 4754 | bool nested_in_vect_loop = false; |
99b1c316 | 4755 | class loop *containing_loop = NULL; |
920e8172 RS |
4756 | tree aggr_ptr_type; |
4757 | tree aggr_ptr; | |
ebfd146a | 4758 | tree new_temp; |
ebfd146a | 4759 | gimple_seq new_stmt_list = NULL; |
a70d6342 | 4760 | edge pe = NULL; |
ebfd146a | 4761 | basic_block new_bb; |
920e8172 | 4762 | tree aggr_ptr_init; |
89fa689a RS |
4763 | dr_vec_info *dr_info = STMT_VINFO_DR_INFO (stmt_info); |
4764 | struct data_reference *dr = dr_info->dr; | |
920e8172 | 4765 | tree aptr; |
ebfd146a IR |
4766 | gimple_stmt_iterator incr_gsi; |
4767 | bool insert_after; | |
4768 | tree indx_before_incr, indx_after_incr; | |
355fe088 | 4769 | gimple *incr; |
308bc496 | 4770 | bb_vec_info bb_vinfo = dyn_cast <bb_vec_info> (vinfo); |
b8698a0f | 4771 | |
ab2fc782 RS |
4772 | gcc_assert (iv_step != NULL_TREE |
4773 | || TREE_CODE (aggr_type) == ARRAY_TYPE | |
920e8172 RS |
4774 | || TREE_CODE (aggr_type) == VECTOR_TYPE); |
4775 | ||
a70d6342 IR |
4776 | if (loop_vinfo) |
4777 | { | |
4778 | loop = LOOP_VINFO_LOOP (loop_vinfo); | |
86a91c0a RS |
4779 | nested_in_vect_loop = nested_in_vect_loop_p (loop, stmt_info); |
4780 | containing_loop = (gimple_bb (stmt_info->stmt))->loop_father; | |
a70d6342 IR |
4781 | pe = loop_preheader_edge (loop); |
4782 | } | |
4783 | else | |
4784 | { | |
4785 | gcc_assert (bb_vinfo); | |
4786 | only_init = true; | |
4787 | *ptr_incr = NULL; | |
4788 | } | |
b8698a0f | 4789 | |
ebfd146a | 4790 | /* Create an expression for the first address accessed by this load |
b8698a0f | 4791 | in LOOP. */ |
595c2679 | 4792 | base_name = get_name (DR_BASE_ADDRESS (dr)); |
ebfd146a | 4793 | |
73fbfcad | 4794 | if (dump_enabled_p ()) |
ebfd146a | 4795 | { |
595c2679 | 4796 | tree dr_base_type = TREE_TYPE (DR_BASE_OBJECT (dr)); |
78c60e3d | 4797 | dump_printf_loc (MSG_NOTE, vect_location, |
3c2a8ed0 DM |
4798 | "create %s-pointer variable to type: %T", |
4799 | get_tree_code_name (TREE_CODE (aggr_type)), | |
4800 | aggr_type); | |
595c2679 | 4801 | if (TREE_CODE (dr_base_type) == ARRAY_TYPE) |
78c60e3d | 4802 | dump_printf (MSG_NOTE, " vectorizing an array ref: "); |
38000232 MG |
4803 | else if (TREE_CODE (dr_base_type) == VECTOR_TYPE) |
4804 | dump_printf (MSG_NOTE, " vectorizing a vector ref: "); | |
595c2679 | 4805 | else if (TREE_CODE (dr_base_type) == RECORD_TYPE) |
78c60e3d | 4806 | dump_printf (MSG_NOTE, " vectorizing a record based array ref: "); |
595c2679 | 4807 | else |
78c60e3d | 4808 | dump_printf (MSG_NOTE, " vectorizing a pointer ref: "); |
3c2a8ed0 | 4809 | dump_printf (MSG_NOTE, "%T\n", DR_BASE_OBJECT (dr)); |
ebfd146a IR |
4810 | } |
4811 | ||
4bdd44c4 RB |
4812 | /* (1) Create the new aggregate-pointer variable. |
4813 | Vector and array types inherit the alias set of their component | |
920e8172 RS |
4814 | type by default so we need to use a ref-all pointer if the data |
4815 | reference does not conflict with the created aggregated data | |
4816 | reference because it is not addressable. */ | |
4bdd44c4 RB |
4817 | bool need_ref_all = false; |
4818 | if (!alias_sets_conflict_p (get_alias_set (aggr_type), | |
3f49ba3f | 4819 | get_alias_set (DR_REF (dr)))) |
4bdd44c4 | 4820 | need_ref_all = true; |
3f49ba3f | 4821 | /* Likewise for any of the data references in the stmt group. */ |
2c53b149 | 4822 | else if (DR_GROUP_SIZE (stmt_info) > 1) |
ebfd146a | 4823 | { |
bffb8014 | 4824 | stmt_vec_info sinfo = DR_GROUP_FIRST_ELEMENT (stmt_info); |
5006671f RG |
4825 | do |
4826 | { | |
4bdd44c4 RB |
4827 | struct data_reference *sdr = STMT_VINFO_DATA_REF (sinfo); |
4828 | if (!alias_sets_conflict_p (get_alias_set (aggr_type), | |
4829 | get_alias_set (DR_REF (sdr)))) | |
5006671f | 4830 | { |
4bdd44c4 | 4831 | need_ref_all = true; |
5006671f RG |
4832 | break; |
4833 | } | |
bffb8014 | 4834 | sinfo = DR_GROUP_NEXT_ELEMENT (sinfo); |
5006671f | 4835 | } |
bffb8014 | 4836 | while (sinfo); |
ebfd146a | 4837 | } |
4bdd44c4 RB |
4838 | aggr_ptr_type = build_pointer_type_for_mode (aggr_type, ptr_mode, |
4839 | need_ref_all); | |
4840 | aggr_ptr = vect_get_new_vect_var (aggr_ptr_type, vect_pointer_var, base_name); | |
4841 | ||
ebfd146a | 4842 | |
ff802fa1 IR |
4843 | /* Note: If the dataref is in an inner-loop nested in LOOP, and we are |
4844 | vectorizing LOOP (i.e., outer-loop vectorization), we need to create two | |
4845 | def-use update cycles for the pointer: one relative to the outer-loop | |
4846 | (LOOP), which is what steps (3) and (4) below do. The other is relative | |
4847 | to the inner-loop (which is the inner-most loop containing the dataref), | |
4848 | and this is done be step (5) below. | |
ebfd146a | 4849 | |
ff802fa1 IR |
4850 | When vectorizing inner-most loops, the vectorized loop (LOOP) is also the |
4851 | inner-most loop, and so steps (3),(4) work the same, and step (5) is | |
4852 | redundant. Steps (3),(4) create the following: | |
ebfd146a IR |
4853 | |
4854 | vp0 = &base_addr; | |
4855 | LOOP: vp1 = phi(vp0,vp2) | |
b8698a0f | 4856 | ... |
ebfd146a IR |
4857 | ... |
4858 | vp2 = vp1 + step | |
4859 | goto LOOP | |
b8698a0f | 4860 | |
ff802fa1 IR |
4861 | If there is an inner-loop nested in loop, then step (5) will also be |
4862 | applied, and an additional update in the inner-loop will be created: | |
ebfd146a IR |
4863 | |
4864 | vp0 = &base_addr; | |
4865 | LOOP: vp1 = phi(vp0,vp2) | |
4866 | ... | |
4867 | inner: vp3 = phi(vp1,vp4) | |
4868 | vp4 = vp3 + inner_step | |
4869 | if () goto inner | |
4870 | ... | |
4871 | vp2 = vp1 + step | |
4872 | if () goto LOOP */ | |
4873 | ||
920e8172 RS |
4874 | /* (2) Calculate the initial address of the aggregate-pointer, and set |
4875 | the aggregate-pointer to point to it before the loop. */ | |
ebfd146a | 4876 | |
356bbc4c | 4877 | /* Create: (&(base[init_val+offset]+byte_offset) in the loop preheader. */ |
ebfd146a | 4878 | |
308bc496 RB |
4879 | new_temp = vect_create_addr_base_for_vector_ref (vinfo, |
4880 | stmt_info, &new_stmt_list, | |
3f5e8a76 | 4881 | offset, byte_offset); |
ebfd146a IR |
4882 | if (new_stmt_list) |
4883 | { | |
a70d6342 IR |
4884 | if (pe) |
4885 | { | |
4886 | new_bb = gsi_insert_seq_on_edge_immediate (pe, new_stmt_list); | |
4887 | gcc_assert (!new_bb); | |
4888 | } | |
4889 | else | |
1b29f05e | 4890 | gsi_insert_seq_before (gsi, new_stmt_list, GSI_SAME_STMT); |
ebfd146a IR |
4891 | } |
4892 | ||
4893 | *initial_address = new_temp; | |
aed93b23 | 4894 | aggr_ptr_init = new_temp; |
ebfd146a | 4895 | |
920e8172 | 4896 | /* (3) Handle the updating of the aggregate-pointer inside the loop. |
ff802fa1 IR |
4897 | This is needed when ONLY_INIT is false, and also when AT_LOOP is the |
4898 | inner-loop nested in LOOP (during outer-loop vectorization). */ | |
ebfd146a | 4899 | |
a70d6342 | 4900 | /* No update in loop is required. */ |
b8698a0f | 4901 | if (only_init && (!loop_vinfo || at_loop == loop)) |
920e8172 | 4902 | aptr = aggr_ptr_init; |
ebfd146a IR |
4903 | else |
4904 | { | |
2d4bca81 RS |
4905 | /* Accesses to invariant addresses should be handled specially |
4906 | by the caller. */ | |
308bc496 | 4907 | tree step = vect_dr_behavior (vinfo, dr_info)->step; |
2d4bca81 RS |
4908 | gcc_assert (!integer_zerop (step)); |
4909 | ||
ab2fc782 RS |
4910 | if (iv_step == NULL_TREE) |
4911 | { | |
2d4bca81 RS |
4912 | /* The step of the aggregate pointer is the type size, |
4913 | negated for downward accesses. */ | |
ab2fc782 | 4914 | iv_step = TYPE_SIZE_UNIT (aggr_type); |
2d4bca81 | 4915 | if (tree_int_cst_sgn (step) == -1) |
ab2fc782 RS |
4916 | iv_step = fold_build1 (NEGATE_EXPR, TREE_TYPE (iv_step), iv_step); |
4917 | } | |
ebfd146a IR |
4918 | |
4919 | standard_iv_increment_position (loop, &incr_gsi, &insert_after); | |
4920 | ||
920e8172 | 4921 | create_iv (aggr_ptr_init, |
08940f33 | 4922 | fold_convert (aggr_ptr_type, iv_step), |
920e8172 | 4923 | aggr_ptr, loop, &incr_gsi, insert_after, |
ebfd146a IR |
4924 | &indx_before_incr, &indx_after_incr); |
4925 | incr = gsi_stmt (incr_gsi); | |
ebfd146a IR |
4926 | |
4927 | /* Copy the points-to information if it exists. */ | |
4928 | if (DR_PTR_INFO (dr)) | |
4929 | { | |
89fa689a RS |
4930 | vect_duplicate_ssa_name_ptr_info (indx_before_incr, dr_info); |
4931 | vect_duplicate_ssa_name_ptr_info (indx_after_incr, dr_info); | |
ebfd146a | 4932 | } |
ebfd146a IR |
4933 | if (ptr_incr) |
4934 | *ptr_incr = incr; | |
4935 | ||
920e8172 | 4936 | aptr = indx_before_incr; |
ebfd146a IR |
4937 | } |
4938 | ||
4939 | if (!nested_in_vect_loop || only_init) | |
920e8172 | 4940 | return aptr; |
ebfd146a IR |
4941 | |
4942 | ||
920e8172 | 4943 | /* (4) Handle the updating of the aggregate-pointer inside the inner-loop |
ff802fa1 | 4944 | nested in LOOP, if exists. */ |
ebfd146a IR |
4945 | |
4946 | gcc_assert (nested_in_vect_loop); | |
4947 | if (!only_init) | |
4948 | { | |
4949 | standard_iv_increment_position (containing_loop, &incr_gsi, | |
4950 | &insert_after); | |
920e8172 | 4951 | create_iv (aptr, fold_convert (aggr_ptr_type, DR_STEP (dr)), aggr_ptr, |
ebfd146a IR |
4952 | containing_loop, &incr_gsi, insert_after, &indx_before_incr, |
4953 | &indx_after_incr); | |
4954 | incr = gsi_stmt (incr_gsi); | |
ebfd146a IR |
4955 | |
4956 | /* Copy the points-to information if it exists. */ | |
4957 | if (DR_PTR_INFO (dr)) | |
4958 | { | |
89fa689a RS |
4959 | vect_duplicate_ssa_name_ptr_info (indx_before_incr, dr_info); |
4960 | vect_duplicate_ssa_name_ptr_info (indx_after_incr, dr_info); | |
ebfd146a | 4961 | } |
ebfd146a IR |
4962 | if (ptr_incr) |
4963 | *ptr_incr = incr; | |
4964 | ||
b8698a0f | 4965 | return indx_before_incr; |
ebfd146a IR |
4966 | } |
4967 | else | |
4968 | gcc_unreachable (); | |
4969 | } | |
4970 | ||
4971 | ||
4972 | /* Function bump_vector_ptr | |
4973 | ||
4974 | Increment a pointer (to a vector type) by vector-size. If requested, | |
b8698a0f | 4975 | i.e. if PTR-INCR is given, then also connect the new increment stmt |
ebfd146a IR |
4976 | to the existing def-use update-chain of the pointer, by modifying |
4977 | the PTR_INCR as illustrated below: | |
4978 | ||
4979 | The pointer def-use update-chain before this function: | |
4980 | DATAREF_PTR = phi (p_0, p_2) | |
4981 | .... | |
b8698a0f | 4982 | PTR_INCR: p_2 = DATAREF_PTR + step |
ebfd146a IR |
4983 | |
4984 | The pointer def-use update-chain after this function: | |
4985 | DATAREF_PTR = phi (p_0, p_2) | |
4986 | .... | |
4987 | NEW_DATAREF_PTR = DATAREF_PTR + BUMP | |
4988 | .... | |
4989 | PTR_INCR: p_2 = NEW_DATAREF_PTR + step | |
4990 | ||
4991 | Input: | |
b8698a0f | 4992 | DATAREF_PTR - ssa_name of a pointer (to vector type) that is being updated |
ebfd146a | 4993 | in the loop. |
b8698a0f | 4994 | PTR_INCR - optional. The stmt that updates the pointer in each iteration of |
ebfd146a | 4995 | the loop. The increment amount across iterations is expected |
b8698a0f | 4996 | to be vector_size. |
ebfd146a | 4997 | BSI - location where the new update stmt is to be placed. |
32e8e429 | 4998 | STMT_INFO - the original scalar memory-access stmt that is being vectorized. |
ebfd146a IR |
4999 | BUMP - optional. The offset by which to bump the pointer. If not given, |
5000 | the offset is assumed to be vector_size. | |
5001 | ||
5002 | Output: Return NEW_DATAREF_PTR as illustrated above. | |
b8698a0f | 5003 | |
ebfd146a IR |
5004 | */ |
5005 | ||
5006 | tree | |
308bc496 RB |
5007 | bump_vector_ptr (vec_info *vinfo, |
5008 | tree dataref_ptr, gimple *ptr_incr, gimple_stmt_iterator *gsi, | |
32e8e429 | 5009 | stmt_vec_info stmt_info, tree bump) |
ebfd146a | 5010 | { |
ebfd146a IR |
5011 | struct data_reference *dr = STMT_VINFO_DATA_REF (stmt_info); |
5012 | tree vectype = STMT_VINFO_VECTYPE (stmt_info); | |
ebfd146a | 5013 | tree update = TYPE_SIZE_UNIT (vectype); |
538dd0b7 | 5014 | gassign *incr_stmt; |
ebfd146a IR |
5015 | ssa_op_iter iter; |
5016 | use_operand_p use_p; | |
5017 | tree new_dataref_ptr; | |
5018 | ||
5019 | if (bump) | |
5020 | update = bump; | |
b8698a0f | 5021 | |
aed93b23 RB |
5022 | if (TREE_CODE (dataref_ptr) == SSA_NAME) |
5023 | new_dataref_ptr = copy_ssa_name (dataref_ptr); | |
5024 | else | |
5025 | new_dataref_ptr = make_ssa_name (TREE_TYPE (dataref_ptr)); | |
0d0e4a03 JJ |
5026 | incr_stmt = gimple_build_assign (new_dataref_ptr, POINTER_PLUS_EXPR, |
5027 | dataref_ptr, update); | |
308bc496 | 5028 | vect_finish_stmt_generation (vinfo, stmt_info, incr_stmt, gsi); |
ebfd146a IR |
5029 | |
5030 | /* Copy the points-to information if it exists. */ | |
5031 | if (DR_PTR_INFO (dr)) | |
128aaeed RB |
5032 | { |
5033 | duplicate_ssa_name_ptr_info (new_dataref_ptr, DR_PTR_INFO (dr)); | |
644ffefd | 5034 | mark_ptr_info_alignment_unknown (SSA_NAME_PTR_INFO (new_dataref_ptr)); |
128aaeed | 5035 | } |
ebfd146a IR |
5036 | |
5037 | if (!ptr_incr) | |
5038 | return new_dataref_ptr; | |
5039 | ||
5040 | /* Update the vector-pointer's cross-iteration increment. */ | |
5041 | FOR_EACH_SSA_USE_OPERAND (use_p, ptr_incr, iter, SSA_OP_USE) | |
5042 | { | |
5043 | tree use = USE_FROM_PTR (use_p); | |
5044 | ||
5045 | if (use == dataref_ptr) | |
5046 | SET_USE (use_p, new_dataref_ptr); | |
5047 | else | |
ab2fc782 | 5048 | gcc_assert (operand_equal_p (use, update, 0)); |
ebfd146a IR |
5049 | } |
5050 | ||
5051 | return new_dataref_ptr; | |
5052 | } | |
5053 | ||
5054 | ||
19986382 RB |
5055 | /* Copy memory reference info such as base/clique from the SRC reference |
5056 | to the DEST MEM_REF. */ | |
5057 | ||
5058 | void | |
5059 | vect_copy_ref_info (tree dest, tree src) | |
5060 | { | |
5061 | if (TREE_CODE (dest) != MEM_REF) | |
5062 | return; | |
5063 | ||
5064 | tree src_base = src; | |
5065 | while (handled_component_p (src_base)) | |
5066 | src_base = TREE_OPERAND (src_base, 0); | |
5067 | if (TREE_CODE (src_base) != MEM_REF | |
5068 | && TREE_CODE (src_base) != TARGET_MEM_REF) | |
5069 | return; | |
5070 | ||
5071 | MR_DEPENDENCE_CLIQUE (dest) = MR_DEPENDENCE_CLIQUE (src_base); | |
5072 | MR_DEPENDENCE_BASE (dest) = MR_DEPENDENCE_BASE (src_base); | |
5073 | } | |
5074 | ||
5075 | ||
ebfd146a IR |
5076 | /* Function vect_create_destination_var. |
5077 | ||
5078 | Create a new temporary of type VECTYPE. */ | |
5079 | ||
5080 | tree | |
5081 | vect_create_destination_var (tree scalar_dest, tree vectype) | |
5082 | { | |
5083 | tree vec_dest; | |
451dabda RB |
5084 | const char *name; |
5085 | char *new_name; | |
ebfd146a IR |
5086 | tree type; |
5087 | enum vect_var_kind kind; | |
5088 | ||
42fd8198 IE |
5089 | kind = vectype |
5090 | ? VECTOR_BOOLEAN_TYPE_P (vectype) | |
5091 | ? vect_mask_var | |
5092 | : vect_simple_var | |
5093 | : vect_scalar_var; | |
ebfd146a IR |
5094 | type = vectype ? vectype : TREE_TYPE (scalar_dest); |
5095 | ||
5096 | gcc_assert (TREE_CODE (scalar_dest) == SSA_NAME); | |
5097 | ||
451dabda RB |
5098 | name = get_name (scalar_dest); |
5099 | if (name) | |
378b2932 | 5100 | new_name = xasprintf ("%s_%u", name, SSA_NAME_VERSION (scalar_dest)); |
451dabda | 5101 | else |
378b2932 | 5102 | new_name = xasprintf ("_%u", SSA_NAME_VERSION (scalar_dest)); |
ebfd146a | 5103 | vec_dest = vect_get_new_vect_var (type, kind, new_name); |
451dabda | 5104 | free (new_name); |
ebfd146a IR |
5105 | |
5106 | return vec_dest; | |
5107 | } | |
5108 | ||
0d0293ac | 5109 | /* Function vect_grouped_store_supported. |
ebfd146a | 5110 | |
e2c83630 RH |
5111 | Returns TRUE if interleave high and interleave low permutations |
5112 | are supported, and FALSE otherwise. */ | |
ebfd146a IR |
5113 | |
5114 | bool | |
0d0293ac | 5115 | vect_grouped_store_supported (tree vectype, unsigned HOST_WIDE_INT count) |
ebfd146a | 5116 | { |
ef4bddc2 | 5117 | machine_mode mode = TYPE_MODE (vectype); |
b8698a0f | 5118 | |
e1377713 ES |
5119 | /* vect_permute_store_chain requires the group size to be equal to 3 or |
5120 | be a power of two. */ | |
5121 | if (count != 3 && exact_log2 (count) == -1) | |
b602d918 | 5122 | { |
73fbfcad | 5123 | if (dump_enabled_p ()) |
78c60e3d | 5124 | dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location, |
e1377713 ES |
5125 | "the size of the group of accesses" |
5126 | " is not a power of 2 or not eqaul to 3\n"); | |
b602d918 RS |
5127 | return false; |
5128 | } | |
5129 | ||
e2c83630 | 5130 | /* Check that the permutation is supported. */ |
3fcc1b55 JJ |
5131 | if (VECTOR_MODE_P (mode)) |
5132 | { | |
7b777afa | 5133 | unsigned int i; |
e1377713 | 5134 | if (count == 3) |
3fcc1b55 | 5135 | { |
e1377713 ES |
5136 | unsigned int j0 = 0, j1 = 0, j2 = 0; |
5137 | unsigned int i, j; | |
5138 | ||
7b777afa RS |
5139 | unsigned int nelt; |
5140 | if (!GET_MODE_NUNITS (mode).is_constant (&nelt)) | |
5141 | { | |
5142 | if (dump_enabled_p ()) | |
5143 | dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location, | |
5144 | "cannot handle groups of 3 stores for" | |
5145 | " variable-length vectors\n"); | |
5146 | return false; | |
5147 | } | |
5148 | ||
d980067b RS |
5149 | vec_perm_builder sel (nelt, nelt, 1); |
5150 | sel.quick_grow (nelt); | |
e3342de4 | 5151 | vec_perm_indices indices; |
e1377713 ES |
5152 | for (j = 0; j < 3; j++) |
5153 | { | |
5154 | int nelt0 = ((3 - j) * nelt) % 3; | |
5155 | int nelt1 = ((3 - j) * nelt + 1) % 3; | |
5156 | int nelt2 = ((3 - j) * nelt + 2) % 3; | |
5157 | for (i = 0; i < nelt; i++) | |
5158 | { | |
5159 | if (3 * i + nelt0 < nelt) | |
5160 | sel[3 * i + nelt0] = j0++; | |
5161 | if (3 * i + nelt1 < nelt) | |
5162 | sel[3 * i + nelt1] = nelt + j1++; | |
5163 | if (3 * i + nelt2 < nelt) | |
5164 | sel[3 * i + nelt2] = 0; | |
5165 | } | |
e3342de4 RS |
5166 | indices.new_vector (sel, 2, nelt); |
5167 | if (!can_vec_perm_const_p (mode, indices)) | |
e1377713 ES |
5168 | { |
5169 | if (dump_enabled_p ()) | |
5170 | dump_printf (MSG_MISSED_OPTIMIZATION, | |
7ac7e286 | 5171 | "permutation op not supported by target.\n"); |
e1377713 ES |
5172 | return false; |
5173 | } | |
5174 | ||
5175 | for (i = 0; i < nelt; i++) | |
5176 | { | |
5177 | if (3 * i + nelt0 < nelt) | |
5178 | sel[3 * i + nelt0] = 3 * i + nelt0; | |
5179 | if (3 * i + nelt1 < nelt) | |
5180 | sel[3 * i + nelt1] = 3 * i + nelt1; | |
5181 | if (3 * i + nelt2 < nelt) | |
5182 | sel[3 * i + nelt2] = nelt + j2++; | |
5183 | } | |
e3342de4 RS |
5184 | indices.new_vector (sel, 2, nelt); |
5185 | if (!can_vec_perm_const_p (mode, indices)) | |
e1377713 ES |
5186 | { |
5187 | if (dump_enabled_p ()) | |
5188 | dump_printf (MSG_MISSED_OPTIMIZATION, | |
7ac7e286 | 5189 | "permutation op not supported by target.\n"); |
e1377713 ES |
5190 | return false; |
5191 | } | |
5192 | } | |
5193 | return true; | |
3fcc1b55 | 5194 | } |
e1377713 | 5195 | else |
3fcc1b55 | 5196 | { |
e1377713 | 5197 | /* If length is not equal to 3 then only power of 2 is supported. */ |
146ec50f | 5198 | gcc_assert (pow2p_hwi (count)); |
7b777afa | 5199 | poly_uint64 nelt = GET_MODE_NUNITS (mode); |
e1377713 | 5200 | |
d980067b RS |
5201 | /* The encoding has 2 interleaved stepped patterns. */ |
5202 | vec_perm_builder sel (nelt, 2, 3); | |
5203 | sel.quick_grow (6); | |
5204 | for (i = 0; i < 3; i++) | |
e1377713 ES |
5205 | { |
5206 | sel[i * 2] = i; | |
5207 | sel[i * 2 + 1] = i + nelt; | |
5208 | } | |
e3342de4 RS |
5209 | vec_perm_indices indices (sel, 2, nelt); |
5210 | if (can_vec_perm_const_p (mode, indices)) | |
908a1a16 | 5211 | { |
d980067b | 5212 | for (i = 0; i < 6; i++) |
7b777afa | 5213 | sel[i] += exact_div (nelt, 2); |
e3342de4 RS |
5214 | indices.new_vector (sel, 2, nelt); |
5215 | if (can_vec_perm_const_p (mode, indices)) | |
908a1a16 RS |
5216 | return true; |
5217 | } | |
3fcc1b55 JJ |
5218 | } |
5219 | } | |
ebfd146a | 5220 | |
73fbfcad | 5221 | if (dump_enabled_p ()) |
78c60e3d | 5222 | dump_printf (MSG_MISSED_OPTIMIZATION, |
429ca5b4 | 5223 | "permutation op not supported by target.\n"); |
a6b3dfde | 5224 | return false; |
ebfd146a IR |
5225 | } |
5226 | ||
5227 | ||
7e11fc7f RS |
5228 | /* Return TRUE if vec_{mask_}store_lanes is available for COUNT vectors of |
5229 | type VECTYPE. MASKED_P says whether the masked form is needed. */ | |
272c6793 RS |
5230 | |
5231 | bool | |
7e11fc7f RS |
5232 | vect_store_lanes_supported (tree vectype, unsigned HOST_WIDE_INT count, |
5233 | bool masked_p) | |
272c6793 | 5234 | { |
7e11fc7f RS |
5235 | if (masked_p) |
5236 | return vect_lanes_optab_supported_p ("vec_mask_store_lanes", | |
5237 | vec_mask_store_lanes_optab, | |
5238 | vectype, count); | |
5239 | else | |
5240 | return vect_lanes_optab_supported_p ("vec_store_lanes", | |
5241 | vec_store_lanes_optab, | |
5242 | vectype, count); | |
272c6793 RS |
5243 | } |
5244 | ||
5245 | ||
ebfd146a IR |
5246 | /* Function vect_permute_store_chain. |
5247 | ||
5248 | Given a chain of interleaved stores in DR_CHAIN of LENGTH that must be | |
e1377713 ES |
5249 | a power of 2 or equal to 3, generate interleave_high/low stmts to reorder |
5250 | the data correctly for the stores. Return the final references for stores | |
5251 | in RESULT_CHAIN. | |
ebfd146a IR |
5252 | |
5253 | E.g., LENGTH is 4 and the scalar type is short, i.e., VF is 8. | |
ff802fa1 IR |
5254 | The input is 4 vectors each containing 8 elements. We assign a number to |
5255 | each element, the input sequence is: | |
ebfd146a IR |
5256 | |
5257 | 1st vec: 0 1 2 3 4 5 6 7 | |
5258 | 2nd vec: 8 9 10 11 12 13 14 15 | |
b8698a0f | 5259 | 3rd vec: 16 17 18 19 20 21 22 23 |
ebfd146a IR |
5260 | 4th vec: 24 25 26 27 28 29 30 31 |
5261 | ||
5262 | The output sequence should be: | |
5263 | ||
5264 | 1st vec: 0 8 16 24 1 9 17 25 | |
5265 | 2nd vec: 2 10 18 26 3 11 19 27 | |
 5266 | 3rd vec: 4 12 20 28 5 13 21 29 | |
5267 | 4th vec: 6 14 22 30 7 15 23 31 | |
5268 | ||
5269 | i.e., we interleave the contents of the four vectors in their order. | |
5270 | ||
ff802fa1 | 5271 | We use interleave_high/low instructions to create such output. The input of |
ebfd146a | 5272 | each interleave_high/low operation is two vectors: |
b8698a0f L |
5273 | 1st vec 2nd vec |
5274 | 0 1 2 3 4 5 6 7 | |
5275 | the even elements of the result vector are obtained left-to-right from the | |
ff802fa1 | 5276 | high/low elements of the first vector. The odd elements of the result are |
ebfd146a IR |
5277 | obtained left-to-right from the high/low elements of the second vector. |
5278 | The output of interleave_high will be: 0 4 1 5 | |
5279 | and of interleave_low: 2 6 3 7 | |
5280 | ||
b8698a0f | 5281 | |
ff802fa1 | 5282 | The permutation is done in log LENGTH stages. In each stage interleave_high |
b8698a0f L |
5283 | and interleave_low stmts are created for each pair of vectors in DR_CHAIN, |
5284 | where the first argument is taken from the first half of DR_CHAIN and the | |
5285 | second argument from it's second half. | |
5286 | In our example, | |
ebfd146a IR |
5287 | |
5288 | I1: interleave_high (1st vec, 3rd vec) | |
5289 | I2: interleave_low (1st vec, 3rd vec) | |
5290 | I3: interleave_high (2nd vec, 4th vec) | |
5291 | I4: interleave_low (2nd vec, 4th vec) | |
5292 | ||
5293 | The output for the first stage is: | |
5294 | ||
5295 | I1: 0 16 1 17 2 18 3 19 | |
5296 | I2: 4 20 5 21 6 22 7 23 | |
5297 | I3: 8 24 9 25 10 26 11 27 | |
5298 | I4: 12 28 13 29 14 30 15 31 | |
5299 | ||
5300 | The output of the second stage, i.e. the final result is: | |
5301 | ||
5302 | I1: 0 8 16 24 1 9 17 25 | |
5303 | I2: 2 10 18 26 3 11 19 27 | |
 5304 | I3: 4 12 20 28 5 13 21 29 | |
5305 | I4: 6 14 22 30 7 15 23 31. */ | |
b8698a0f | 5306 | |
void
vect_permute_store_chain (vec_info *vinfo, vec<tree> dr_chain,
			  unsigned int length,
			  stmt_vec_info stmt_info,
			  gimple_stmt_iterator *gsi,
			  vec<tree> *result_chain)
{
  tree vect1, vect2, high, low;
  gimple *perm_stmt;
  tree vectype = STMT_VINFO_VECTYPE (stmt_info);
  tree perm_mask_low, perm_mask_high;
  tree data_ref;
  tree perm3_mask_low, perm3_mask_high;
  /* LOG_LENGTH is only meaningful in the power-of-2 branch below.  */
  unsigned int i, j, n, log_length = exact_log2 (length);

  /* Seed RESULT_CHAIN with the inputs; entries are overwritten as the
     permute statements are generated.  */
  result_chain->quick_grow (length);
  memcpy (result_chain->address (), dr_chain.address (),
	  length * sizeof (tree));

  if (length == 3)
    {
      /* vect_grouped_store_supported ensures that this is constant.  */
      unsigned int nelt = TYPE_VECTOR_SUBPARTS (vectype).to_constant ();
      unsigned int j0 = 0, j1 = 0, j2 = 0;

      vec_perm_builder sel (nelt, nelt, 1);
      sel.quick_grow (nelt);
      vec_perm_indices indices;
      /* Build each of the three output vectors with two VEC_PERM_EXPRs:
	 first merge dr_chain[0]/dr_chain[1], then blend in dr_chain[2].  */
      for (j = 0; j < 3; j++)
	{
	  int nelt0 = ((3 - j) * nelt) % 3;
	  int nelt1 = ((3 - j) * nelt + 1) % 3;
	  int nelt2 = ((3 - j) * nelt + 2) % 3;

	  /* First mask: interleave the first two vectors; slots destined
	     for the third vector get placeholder index 0.  */
	  for (i = 0; i < nelt; i++)
	    {
	      if (3 * i + nelt0 < nelt)
		sel[3 * i + nelt0] = j0++;
	      if (3 * i + nelt1 < nelt)
		sel[3 * i + nelt1] = nelt + j1++;
	      if (3 * i + nelt2 < nelt)
		sel[3 * i + nelt2] = 0;
	    }
	  indices.new_vector (sel, 2, nelt);
	  perm3_mask_low = vect_gen_perm_mask_checked (vectype, indices);

	  /* Second mask: keep the merged elements in place and fill the
	     placeholder slots from the third vector.  */
	  for (i = 0; i < nelt; i++)
	    {
	      if (3 * i + nelt0 < nelt)
		sel[3 * i + nelt0] = 3 * i + nelt0;
	      if (3 * i + nelt1 < nelt)
		sel[3 * i + nelt1] = 3 * i + nelt1;
	      if (3 * i + nelt2 < nelt)
		sel[3 * i + nelt2] = nelt + j2++;
	    }
	  indices.new_vector (sel, 2, nelt);
	  perm3_mask_high = vect_gen_perm_mask_checked (vectype, indices);

	  vect1 = dr_chain[0];
	  vect2 = dr_chain[1];

	  /* Create interleaving stmt:
	     low = VEC_PERM_EXPR <vect1, vect2,
				  {j, nelt, *, j + 1, nelt + j + 1, *,
				   j + 2, nelt + j + 2, *, ...}>  */
	  data_ref = make_temp_ssa_name (vectype, NULL, "vect_shuffle3_low");
	  perm_stmt = gimple_build_assign (data_ref, VEC_PERM_EXPR, vect1,
					   vect2, perm3_mask_low);
	  vect_finish_stmt_generation (vinfo, stmt_info, perm_stmt, gsi);

	  vect1 = data_ref;
	  vect2 = dr_chain[2];
	  /* Create interleaving stmt:
	     low = VEC_PERM_EXPR <vect1, vect2,
				  {0, 1, nelt + j, 3, 4, nelt + j + 1,
				   6, 7, nelt + j + 2, ...}>  */
	  data_ref = make_temp_ssa_name (vectype, NULL, "vect_shuffle3_high");
	  perm_stmt = gimple_build_assign (data_ref, VEC_PERM_EXPR, vect1,
					   vect2, perm3_mask_high);
	  vect_finish_stmt_generation (vinfo, stmt_info, perm_stmt, gsi);
	  (*result_chain)[j] = data_ref;
	}
    }
  else
    {
      /* If length is not equal to 3 then only power of 2 is supported.  */
      gcc_assert (pow2p_hwi (length));

      /* The encoding has 2 interleaved stepped patterns.  */
      poly_uint64 nelt = TYPE_VECTOR_SUBPARTS (vectype);
      vec_perm_builder sel (nelt, 2, 3);
      sel.quick_grow (6);
      /* Interleave-high mask: { 0, nelt, 1, nelt+1, ... }.  */
      for (i = 0; i < 3; i++)
	{
	  sel[i * 2] = i;
	  sel[i * 2 + 1] = i + nelt;
	}
      vec_perm_indices indices (sel, 2, nelt);
      perm_mask_high = vect_gen_perm_mask_checked (vectype, indices);

      /* Interleave-low mask: same pattern offset by nelt/2.  */
      for (i = 0; i < 6; i++)
	sel[i] += exact_div (nelt, 2);
      indices.new_vector (sel, 2, nelt);
      perm_mask_low = vect_gen_perm_mask_checked (vectype, indices);

      /* log2(LENGTH) interleaving stages; each stage pairs the first
	 half of DR_CHAIN with the second half.  */
      for (i = 0, n = log_length; i < n; i++)
	{
	  for (j = 0; j < length/2; j++)
	    {
	      vect1 = dr_chain[j];
	      vect2 = dr_chain[j+length/2];

	      /* Create interleaving stmt:
		 high = VEC_PERM_EXPR <vect1, vect2, {0, nelt, 1, nelt+1,
						      ...}>  */
	      high = make_temp_ssa_name (vectype, NULL, "vect_inter_high");
	      perm_stmt = gimple_build_assign (high, VEC_PERM_EXPR, vect1,
					       vect2, perm_mask_high);
	      vect_finish_stmt_generation (vinfo, stmt_info, perm_stmt, gsi);
	      (*result_chain)[2*j] = high;

	      /* Create interleaving stmt:
		 low = VEC_PERM_EXPR <vect1, vect2,
				      {nelt/2, nelt*3/2, nelt/2+1, nelt*3/2+1,
				       ...}>  */
	      low = make_temp_ssa_name (vectype, NULL, "vect_inter_low");
	      perm_stmt = gimple_build_assign (low, VEC_PERM_EXPR, vect1,
					       vect2, perm_mask_low);
	      vect_finish_stmt_generation (vinfo, stmt_info, perm_stmt, gsi);
	      (*result_chain)[2*j+1] = low;
	    }
	  /* DR_CHAIN is reused as scratch input for the next stage.  */
	  memcpy (dr_chain.address (), result_chain->address (),
		  length * sizeof (tree));
	}
    }
}
5443 | ||
5444 | /* Function vect_setup_realignment | |
b8698a0f | 5445 | |
ebfd146a IR |
5446 | This function is called when vectorizing an unaligned load using |
5447 | the dr_explicit_realign[_optimized] scheme. | |
5448 | This function generates the following code at the loop prolog: | |
5449 | ||
5450 | p = initial_addr; | |
5451 | x msq_init = *(floor(p)); # prolog load | |
b8698a0f | 5452 | realignment_token = call target_builtin; |
ebfd146a IR |
5453 | loop: |
5454 | x msq = phi (msq_init, ---) | |
5455 | ||
b8698a0f | 5456 | The stmts marked with x are generated only for the case of |
ebfd146a IR |
5457 | dr_explicit_realign_optimized. |
5458 | ||
b8698a0f | 5459 | The code above sets up a new (vector) pointer, pointing to the first |
32e8e429 RS |
5460 | location accessed by STMT_INFO, and a "floor-aligned" load using that |
5461 | pointer. It also generates code to compute the "realignment-token" | |
5462 | (if the relevant target hook was defined), and creates a phi-node at the | |
5463 | loop-header bb whose arguments are the result of the prolog-load (created | |
5464 | by this function) and the result of a load that takes place in the loop | |
5465 | (to be created by the caller to this function). | |
ebfd146a IR |
5466 | |
5467 | For the case of dr_explicit_realign_optimized: | |
b8698a0f | 5468 | The caller to this function uses the phi-result (msq) to create the |
ebfd146a IR |
5469 | realignment code inside the loop, and sets up the missing phi argument, |
5470 | as follows: | |
b8698a0f | 5471 | loop: |
ebfd146a IR |
5472 | msq = phi (msq_init, lsq) |
5473 | lsq = *(floor(p')); # load in loop | |
5474 | result = realign_load (msq, lsq, realignment_token); | |
5475 | ||
5476 | For the case of dr_explicit_realign: | |
5477 | loop: | |
5478 | msq = *(floor(p)); # load in loop | |
5479 | p' = p + (VS-1); | |
5480 | lsq = *(floor(p')); # load in loop | |
5481 | result = realign_load (msq, lsq, realignment_token); | |
5482 | ||
5483 | Input: | |
32e8e429 RS |
5484 | STMT_INFO - (scalar) load stmt to be vectorized. This load accesses |
5485 | a memory location that may be unaligned. | |
ebfd146a IR |
5486 | BSI - place where new code is to be inserted. |
5487 | ALIGNMENT_SUPPORT_SCHEME - which of the two misalignment handling schemes | |
b8698a0f L |
5488 | is used. |
5489 | ||
ebfd146a IR |
5490 | Output: |
5491 | REALIGNMENT_TOKEN - the result of a call to the builtin_mask_for_load | |
5492 | target hook, if defined. | |
5493 | Return value - the result of the loop-header phi node. */ | |
5494 | ||
tree
vect_setup_realignment (vec_info *vinfo, stmt_vec_info stmt_info,
			gimple_stmt_iterator *gsi, tree *realignment_token,
			enum dr_alignment_support alignment_support_scheme,
			tree init_addr,
			class loop **at_loop)
{
  tree vectype = STMT_VINFO_VECTYPE (stmt_info);
  loop_vec_info loop_vinfo = dyn_cast <loop_vec_info> (vinfo);
  dr_vec_info *dr_info = STMT_VINFO_DR_INFO (stmt_info);
  struct data_reference *dr = dr_info->dr;
  class loop *loop = NULL;
  edge pe = NULL;
  tree scalar_dest = gimple_assign_lhs (stmt_info->stmt);
  tree vec_dest;
  gimple *inc;
  tree ptr;
  tree data_ref;
  basic_block new_bb;
  tree msq_init = NULL_TREE;
  tree new_temp;
  gphi *phi_stmt;
  tree msq = NULL_TREE;
  gimple_seq stmts = NULL;
  bool compute_in_loop = false;
  bool nested_in_vect_loop = false;
  class loop *containing_loop = (gimple_bb (stmt_info->stmt))->loop_father;
  class loop *loop_for_initial_load = NULL;

  if (loop_vinfo)
    {
      loop = LOOP_VINFO_LOOP (loop_vinfo);
      nested_in_vect_loop = nested_in_vect_loop_p (loop, stmt_info);
    }

  gcc_assert (alignment_support_scheme == dr_explicit_realign
	      || alignment_support_scheme == dr_explicit_realign_optimized);

  /* We need to generate three things:
     1. the misalignment computation
     2. the extra vector load (for the optimized realignment scheme).
     3. the phi node for the two vectors from which the realignment is
      done (for the optimized realignment scheme).  */

  /* 1. Determine where to generate the misalignment computation.

     If INIT_ADDR is NULL_TREE, this indicates that the misalignment
     calculation will be generated by this function, outside the loop (in the
     preheader).  Otherwise, INIT_ADDR had already been computed for us by the
     caller, inside the loop.

     Background: If the misalignment remains fixed throughout the iterations of
     the loop, then both realignment schemes are applicable, and also the
     misalignment computation can be done outside LOOP.  This is because we are
     vectorizing LOOP, and so the memory accesses in LOOP advance in steps that
     are a multiple of VS (the Vector Size), and therefore the misalignment in
     different vectorized LOOP iterations is always the same.
     The problem arises only if the memory access is in an inner-loop nested
     inside LOOP, which is now being vectorized using outer-loop vectorization.
     This is the only case when the misalignment of the memory access may not
     remain fixed throughout the iterations of the inner-loop (as explained in
     detail in vect_supportable_dr_alignment).  In this case, not only is the
     optimized realignment scheme not applicable, but also the misalignment
     computation (and generation of the realignment token that is passed to
     REALIGN_LOAD) have to be done inside the loop.

     In short, INIT_ADDR indicates whether we are in a COMPUTE_IN_LOOP mode
     or not, which in turn determines if the misalignment is computed inside
     the inner-loop, or outside LOOP.  */

  if (init_addr != NULL_TREE || !loop_vinfo)
    {
      compute_in_loop = true;
      gcc_assert (alignment_support_scheme == dr_explicit_realign);
    }

  /* 2. Determine where to generate the extra vector load.

     For the optimized realignment scheme, instead of generating two vector
     loads in each iteration, we generate a single extra vector load in the
     preheader of the loop, and in each iteration reuse the result of the
     vector load from the previous iteration.  In case the memory access is in
     an inner-loop nested inside LOOP, which is now being vectorized using
     outer-loop vectorization, we need to determine whether this initial vector
     load should be generated at the preheader of the inner-loop, or can be
     generated at the preheader of LOOP.  If the memory access has no evolution
     in LOOP, it can be generated in the preheader of LOOP.  Otherwise, it has
     to be generated inside LOOP (in the preheader of the inner-loop).  */

  if (nested_in_vect_loop)
    {
      tree outerloop_step = STMT_VINFO_DR_STEP (stmt_info);
      bool invariant_in_outerloop =
	(tree_int_cst_compare (outerloop_step, size_zero_node) == 0);
      loop_for_initial_load = (invariant_in_outerloop ? loop : loop->inner);
    }
  else
    loop_for_initial_load = loop;
  if (at_loop)
    *at_loop = loop_for_initial_load;

  if (loop_for_initial_load)
    pe = loop_preheader_edge (loop_for_initial_load);

  /* 3. For the case of the optimized realignment, create the first vector
      load at the loop preheader.  */

  if (alignment_support_scheme == dr_explicit_realign_optimized)
    {
      /* Create msq_init = *(floor(p1)) in the loop preheader  */
      gassign *new_stmt;

      gcc_assert (!compute_in_loop);
      vec_dest = vect_create_destination_var (scalar_dest, vectype);
      ptr = vect_create_data_ref_ptr (vinfo, stmt_info, vectype,
				      loop_for_initial_load, NULL_TREE,
				      &init_addr, NULL, &inc, true);
      if (TREE_CODE (ptr) == SSA_NAME)
	new_temp = copy_ssa_name (ptr);
      else
	new_temp = make_ssa_name (TREE_TYPE (ptr));
      poly_uint64 align = DR_TARGET_ALIGNMENT (dr_info);
      tree type = TREE_TYPE (ptr);
      /* Floor the pointer to the target alignment: ptr & -align.  */
      new_stmt = gimple_build_assign
		   (new_temp, BIT_AND_EXPR, ptr,
		    fold_build2 (MINUS_EXPR, type,
				 build_int_cst (type, 0),
				 build_int_cst (type, align)));
      new_bb = gsi_insert_on_edge_immediate (pe, new_stmt);
      gcc_assert (!new_bb);
      data_ref
	= build2 (MEM_REF, TREE_TYPE (vec_dest), new_temp,
		  build_int_cst (reference_alias_ptr_type (DR_REF (dr)), 0));
      vect_copy_ref_info (data_ref, DR_REF (dr));
      new_stmt = gimple_build_assign (vec_dest, data_ref);
      new_temp = make_ssa_name (vec_dest, new_stmt);
      gimple_assign_set_lhs (new_stmt, new_temp);
      /* Insert the prolog load on the preheader edge when one exists,
	 otherwise directly before the current statement.  */
      if (pe)
	{
	  new_bb = gsi_insert_on_edge_immediate (pe, new_stmt);
	  gcc_assert (!new_bb);
	}
      else
	gsi_insert_before (gsi, new_stmt, GSI_SAME_STMT);

      msq_init = gimple_assign_lhs (new_stmt);
    }

  /* 4. Create realignment token using a target builtin, if available.
      It is done either inside the containing loop, or before LOOP (as
      determined above).  */

  if (targetm.vectorize.builtin_mask_for_load)
    {
      gcall *new_stmt;
      tree builtin_decl;

      /* Compute INIT_ADDR - the initial addressed accessed by this memref.  */
      if (!init_addr)
	{
	  /* Generate the INIT_ADDR computation outside LOOP.  */
	  init_addr = vect_create_addr_base_for_vector_ref (vinfo,
							    stmt_info, &stmts,
							    NULL_TREE);
	  if (loop)
	    {
	      pe = loop_preheader_edge (loop);
	      new_bb = gsi_insert_seq_on_edge_immediate (pe, stmts);
	      gcc_assert (!new_bb);
	    }
	  else
	    gsi_insert_seq_before (gsi, stmts, GSI_SAME_STMT);
	}

      builtin_decl = targetm.vectorize.builtin_mask_for_load ();
      new_stmt = gimple_build_call (builtin_decl, 1, init_addr);
      vec_dest =
	vect_create_destination_var (scalar_dest,
				     gimple_call_return_type (new_stmt));
      new_temp = make_ssa_name (vec_dest, new_stmt);
      gimple_call_set_lhs (new_stmt, new_temp);

      if (compute_in_loop)
	gsi_insert_before (gsi, new_stmt, GSI_SAME_STMT);
      else
	{
	  /* Generate the misalignment computation outside LOOP.  */
	  pe = loop_preheader_edge (loop);
	  new_bb = gsi_insert_on_edge_immediate (pe, new_stmt);
	  gcc_assert (!new_bb);
	}

      *realignment_token = gimple_call_lhs (new_stmt);

      /* The result of the CALL_EXPR to this builtin is determined from
	 the value of the parameter and no global variables are touched
	 which makes the builtin a "const" function.  Requiring the
	 builtin to have the "const" attribute makes it unnecessary
	 to call mark_call_clobbered.  */
      gcc_assert (TREE_READONLY (builtin_decl));
    }

  if (alignment_support_scheme == dr_explicit_realign)
    return msq;

  gcc_assert (!compute_in_loop);
  gcc_assert (alignment_support_scheme == dr_explicit_realign_optimized);


  /* 5. Create msq = phi <msq_init, lsq> in loop  */

  pe = loop_preheader_edge (containing_loop);
  vec_dest = vect_create_destination_var (scalar_dest, vectype);
  msq = make_ssa_name (vec_dest);
  phi_stmt = create_phi_node (msq, containing_loop->header);
  /* The loop-carried argument (lsq) is filled in later by the caller.  */
  add_phi_arg (phi_stmt, msq_init, pe, UNKNOWN_LOCATION);

  return msq;
}
5715 | ||
5716 | ||
0d0293ac | 5717 | /* Function vect_grouped_load_supported. |
ebfd146a | 5718 | |
4fb8ba9d RS |
5719 | COUNT is the size of the load group (the number of statements plus the |
5720 | number of gaps). SINGLE_ELEMENT_P is true if there is actually | |
5721 | only one statement, with a gap of COUNT - 1. | |
5722 | ||
5723 | Returns true if a suitable permute exists. */ | |
ebfd146a IR |
5724 | |
5725 | bool | |
4fb8ba9d RS |
5726 | vect_grouped_load_supported (tree vectype, bool single_element_p, |
5727 | unsigned HOST_WIDE_INT count) | |
ebfd146a | 5728 | { |
ef4bddc2 | 5729 | machine_mode mode = TYPE_MODE (vectype); |
ebfd146a | 5730 | |
4fb8ba9d RS |
5731 | /* If this is single-element interleaving with an element distance |
5732 | that leaves unused vector loads around punt - we at least create | |
5733 | very sub-optimal code in that case (and blow up memory, | |
5734 | see PR65518). */ | |
928686b1 | 5735 | if (single_element_p && maybe_gt (count, TYPE_VECTOR_SUBPARTS (vectype))) |
4fb8ba9d RS |
5736 | { |
5737 | if (dump_enabled_p ()) | |
5738 | dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location, | |
5739 | "single-element interleaving not supported " | |
5740 | "for not adjacent vector loads\n"); | |
5741 | return false; | |
5742 | } | |
5743 | ||
2c23db6d ES |
5744 | /* vect_permute_load_chain requires the group size to be equal to 3 or |
5745 | be a power of two. */ | |
5746 | if (count != 3 && exact_log2 (count) == -1) | |
b602d918 | 5747 | { |
73fbfcad | 5748 | if (dump_enabled_p ()) |
78c60e3d | 5749 | dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location, |
2c23db6d ES |
5750 | "the size of the group of accesses" |
5751 | " is not a power of 2 or not equal to 3\n"); | |
b602d918 RS |
5752 | return false; |
5753 | } | |
5754 | ||
e2c83630 RH |
5755 | /* Check that the permutation is supported. */ |
5756 | if (VECTOR_MODE_P (mode)) | |
5757 | { | |
7b777afa | 5758 | unsigned int i, j; |
2c23db6d | 5759 | if (count == 3) |
e2c83630 | 5760 | { |
7b777afa RS |
5761 | unsigned int nelt; |
5762 | if (!GET_MODE_NUNITS (mode).is_constant (&nelt)) | |
5763 | { | |
5764 | if (dump_enabled_p ()) | |
5765 | dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location, | |
5766 | "cannot handle groups of 3 loads for" | |
5767 | " variable-length vectors\n"); | |
5768 | return false; | |
5769 | } | |
5770 | ||
d980067b RS |
5771 | vec_perm_builder sel (nelt, nelt, 1); |
5772 | sel.quick_grow (nelt); | |
e3342de4 | 5773 | vec_perm_indices indices; |
2c23db6d ES |
5774 | unsigned int k; |
5775 | for (k = 0; k < 3; k++) | |
5776 | { | |
5777 | for (i = 0; i < nelt; i++) | |
5778 | if (3 * i + k < 2 * nelt) | |
5779 | sel[i] = 3 * i + k; | |
5780 | else | |
5781 | sel[i] = 0; | |
e3342de4 RS |
5782 | indices.new_vector (sel, 2, nelt); |
5783 | if (!can_vec_perm_const_p (mode, indices)) | |
2c23db6d ES |
5784 | { |
5785 | if (dump_enabled_p ()) | |
5786 | dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location, | |
5787 | "shuffle of 3 loads is not supported by" | |
5788 | " target\n"); | |
21c0a521 | 5789 | return false; |
2c23db6d ES |
5790 | } |
5791 | for (i = 0, j = 0; i < nelt; i++) | |
5792 | if (3 * i + k < 2 * nelt) | |
5793 | sel[i] = i; | |
5794 | else | |
5795 | sel[i] = nelt + ((nelt + k) % 3) + 3 * (j++); | |
e3342de4 RS |
5796 | indices.new_vector (sel, 2, nelt); |
5797 | if (!can_vec_perm_const_p (mode, indices)) | |
2c23db6d ES |
5798 | { |
5799 | if (dump_enabled_p ()) | |
5800 | dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location, | |
5801 | "shuffle of 3 loads is not supported by" | |
5802 | " target\n"); | |
5803 | return false; | |
5804 | } | |
5805 | } | |
5806 | return true; | |
5807 | } | |
5808 | else | |
5809 | { | |
5810 | /* If length is not equal to 3 then only power of 2 is supported. */ | |
146ec50f | 5811 | gcc_assert (pow2p_hwi (count)); |
7b777afa | 5812 | poly_uint64 nelt = GET_MODE_NUNITS (mode); |
e3342de4 | 5813 | |
d980067b RS |
5814 | /* The encoding has a single stepped pattern. */ |
5815 | vec_perm_builder sel (nelt, 1, 3); | |
5816 | sel.quick_grow (3); | |
5817 | for (i = 0; i < 3; i++) | |
2c23db6d | 5818 | sel[i] = i * 2; |
e3342de4 RS |
5819 | vec_perm_indices indices (sel, 2, nelt); |
5820 | if (can_vec_perm_const_p (mode, indices)) | |
2c23db6d | 5821 | { |
d980067b | 5822 | for (i = 0; i < 3; i++) |
2c23db6d | 5823 | sel[i] = i * 2 + 1; |
e3342de4 RS |
5824 | indices.new_vector (sel, 2, nelt); |
5825 | if (can_vec_perm_const_p (mode, indices)) | |
2c23db6d ES |
5826 | return true; |
5827 | } | |
5828 | } | |
e2c83630 | 5829 | } |
ebfd146a | 5830 | |
73fbfcad | 5831 | if (dump_enabled_p ()) |
78c60e3d | 5832 | dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location, |
2c23db6d | 5833 | "extract even/odd not supported by target\n"); |
a6b3dfde | 5834 | return false; |
ebfd146a IR |
5835 | } |
5836 | ||
7e11fc7f RS |
5837 | /* Return TRUE if vec_{masked_}load_lanes is available for COUNT vectors of |
5838 | type VECTYPE. MASKED_P says whether the masked form is needed. */ | |
272c6793 RS |
5839 | |
5840 | bool | |
7e11fc7f RS |
5841 | vect_load_lanes_supported (tree vectype, unsigned HOST_WIDE_INT count, |
5842 | bool masked_p) | |
272c6793 | 5843 | { |
7e11fc7f RS |
5844 | if (masked_p) |
5845 | return vect_lanes_optab_supported_p ("vec_mask_load_lanes", | |
5846 | vec_mask_load_lanes_optab, | |
5847 | vectype, count); | |
5848 | else | |
5849 | return vect_lanes_optab_supported_p ("vec_load_lanes", | |
5850 | vec_load_lanes_optab, | |
5851 | vectype, count); | |
272c6793 | 5852 | } |
ebfd146a IR |
5853 | |
5854 | /* Function vect_permute_load_chain. | |
5855 | ||
5856 | Given a chain of interleaved loads in DR_CHAIN of LENGTH that must be | |
2c23db6d ES |
5857 | a power of 2 or equal to 3, generate extract_even/odd stmts to reorder |
5858 | the input data correctly. Return the final references for loads in | |
5859 | RESULT_CHAIN. | |
ebfd146a IR |
5860 | |
5861 | E.g., LENGTH is 4 and the scalar type is short, i.e., VF is 8. | |
5862 | The input is 4 vectors each containing 8 elements. We assign a number to each | |
5863 | element, the input sequence is: | |
5864 | ||
5865 | 1st vec: 0 1 2 3 4 5 6 7 | |
5866 | 2nd vec: 8 9 10 11 12 13 14 15 | |
b8698a0f | 5867 | 3rd vec: 16 17 18 19 20 21 22 23 |
ebfd146a IR |
5868 | 4th vec: 24 25 26 27 28 29 30 31 |
5869 | ||
5870 | The output sequence should be: | |
5871 | ||
5872 | 1st vec: 0 4 8 12 16 20 24 28 | |
5873 | 2nd vec: 1 5 9 13 17 21 25 29 | |
b8698a0f | 5874 | 3rd vec: 2 6 10 14 18 22 26 30 |
ebfd146a IR |
5875 | 4th vec: 3 7 11 15 19 23 27 31 |
5876 | ||
5877 | i.e., the first output vector should contain the first elements of each | |
5878 | interleaving group, etc. | |
5879 | ||
ff802fa1 IR |
5880 | We use extract_even/odd instructions to create such output. The input of |
5881 | each extract_even/odd operation is two vectors | |
b8698a0f L |
5882 | 1st vec 2nd vec |
5883 | 0 1 2 3 4 5 6 7 | |
ebfd146a | 5884 | |
ff802fa1 | 5885 | and the output is the vector of extracted even/odd elements. The output of |
ebfd146a IR |
5886 | extract_even will be: 0 2 4 6 |
5887 | and of extract_odd: 1 3 5 7 | |
5888 | ||
b8698a0f | 5889 | |
ff802fa1 IR |
5890 | The permutation is done in log LENGTH stages. In each stage extract_even |
5891 | and extract_odd stmts are created for each pair of vectors in DR_CHAIN in | |
5892 | their order. In our example, | |
ebfd146a IR |
5893 | |
5894 | E1: extract_even (1st vec, 2nd vec) | |
5895 | E2: extract_odd (1st vec, 2nd vec) | |
5896 | E3: extract_even (3rd vec, 4th vec) | |
5897 | E4: extract_odd (3rd vec, 4th vec) | |
5898 | ||
5899 | The output for the first stage will be: | |
5900 | ||
5901 | E1: 0 2 4 6 8 10 12 14 | |
5902 | E2: 1 3 5 7 9 11 13 15 | |
b8698a0f | 5903 | E3: 16 18 20 22 24 26 28 30 |
ebfd146a IR |
5904 | E4: 17 19 21 23 25 27 29 31 |
5905 | ||
5906 | In order to proceed and create the correct sequence for the next stage (or | |
b8698a0f L |
5907 | for the correct output, if the second stage is the last one, as in our |
5908 | example), we first put the output of extract_even operation and then the | |
ebfd146a IR |
5909 | output of extract_odd in RESULT_CHAIN (which is then copied to DR_CHAIN). |
5910 | The input for the second stage is: | |
5911 | ||
5912 | 1st vec (E1): 0 2 4 6 8 10 12 14 | |
b8698a0f L |
5913 | 2nd vec (E3): 16 18 20 22 24 26 28 30 |
5914 | 3rd vec (E2): 1 3 5 7 9 11 13 15 | |
ebfd146a IR |
5915 | 4th vec (E4): 17 19 21 23 25 27 29 31 |
5916 | ||
5917 | The output of the second stage: | |
5918 | ||
5919 | E1: 0 4 8 12 16 20 24 28 | |
5920 | E2: 2 6 10 14 18 22 26 30 | |
5921 | E3: 1 5 9 13 17 21 25 29 | |
5922 | E4: 3 7 11 15 19 23 27 31 | |
5923 | ||
5924 | And RESULT_CHAIN after reordering: | |
5925 | ||
5926 | 1st vec (E1): 0 4 8 12 16 20 24 28 | |
5927 | 2nd vec (E3): 1 5 9 13 17 21 25 29 | |
b8698a0f | 5928 | 3rd vec (E2): 2 6 10 14 18 22 26 30 |
ebfd146a IR |
5929 | 4th vec (E4): 3 7 11 15 19 23 27 31. */ |
5930 | ||
b602d918 | 5931 | static void |
308bc496 | 5932 | vect_permute_load_chain (vec_info *vinfo, vec<tree> dr_chain, |
b8698a0f | 5933 | unsigned int length, |
32e8e429 | 5934 | stmt_vec_info stmt_info, |
ebfd146a | 5935 | gimple_stmt_iterator *gsi, |
9771b263 | 5936 | vec<tree> *result_chain) |
ebfd146a | 5937 | { |
83d5977e | 5938 | tree data_ref, first_vect, second_vect; |
e2c83630 | 5939 | tree perm_mask_even, perm_mask_odd; |
2c23db6d | 5940 | tree perm3_mask_low, perm3_mask_high; |
355fe088 | 5941 | gimple *perm_stmt; |
91987857 | 5942 | tree vectype = STMT_VINFO_VECTYPE (stmt_info); |
e2c83630 | 5943 | unsigned int i, j, log_length = exact_log2 (length); |
908a1a16 | 5944 | |
3f292312 JJ |
5945 | result_chain->quick_grow (length); |
5946 | memcpy (result_chain->address (), dr_chain.address (), | |
5947 | length * sizeof (tree)); | |
e2c83630 | 5948 | |
2c23db6d | 5949 | if (length == 3) |
ebfd146a | 5950 | { |
edab8e10 | 5951 | /* vect_grouped_load_supported ensures that this is constant. */ |
928686b1 | 5952 | unsigned nelt = TYPE_VECTOR_SUBPARTS (vectype).to_constant (); |
2c23db6d | 5953 | unsigned int k; |
ebfd146a | 5954 | |
d980067b RS |
5955 | vec_perm_builder sel (nelt, nelt, 1); |
5956 | sel.quick_grow (nelt); | |
e3342de4 | 5957 | vec_perm_indices indices; |
2c23db6d ES |
5958 | for (k = 0; k < 3; k++) |
5959 | { | |
5960 | for (i = 0; i < nelt; i++) | |
5961 | if (3 * i + k < 2 * nelt) | |
5962 | sel[i] = 3 * i + k; | |
5963 | else | |
5964 | sel[i] = 0; | |
e3342de4 RS |
5965 | indices.new_vector (sel, 2, nelt); |
5966 | perm3_mask_low = vect_gen_perm_mask_checked (vectype, indices); | |
2c23db6d ES |
5967 | |
5968 | for (i = 0, j = 0; i < nelt; i++) | |
5969 | if (3 * i + k < 2 * nelt) | |
5970 | sel[i] = i; | |
5971 | else | |
5972 | sel[i] = nelt + ((nelt + k) % 3) + 3 * (j++); | |
e3342de4 RS |
5973 | indices.new_vector (sel, 2, nelt); |
5974 | perm3_mask_high = vect_gen_perm_mask_checked (vectype, indices); | |
2c23db6d ES |
5975 | |
5976 | first_vect = dr_chain[0]; | |
5977 | second_vect = dr_chain[1]; | |
5978 | ||
5979 | /* Create interleaving stmt (low part of): | |
5980 | low = VEC_PERM_EXPR <first_vect, second_vect2, {k, 3 + k, 6 + k, | |
5981 | ...}> */ | |
f598c55c | 5982 | data_ref = make_temp_ssa_name (vectype, NULL, "vect_shuffle3_low"); |
0d0e4a03 JJ |
5983 | perm_stmt = gimple_build_assign (data_ref, VEC_PERM_EXPR, first_vect, |
5984 | second_vect, perm3_mask_low); | |
308bc496 | 5985 | vect_finish_stmt_generation (vinfo, stmt_info, perm_stmt, gsi); |
b8698a0f | 5986 | |
2c23db6d ES |
5987 | /* Create interleaving stmt (high part of): |
5988 | high = VEC_PERM_EXPR <first_vect, second_vect2, {k, 3 + k, 6 + k, | |
5989 | ...}> */ | |
5990 | first_vect = data_ref; | |
5991 | second_vect = dr_chain[2]; | |
f598c55c | 5992 | data_ref = make_temp_ssa_name (vectype, NULL, "vect_shuffle3_high"); |
0d0e4a03 JJ |
5993 | perm_stmt = gimple_build_assign (data_ref, VEC_PERM_EXPR, first_vect, |
5994 | second_vect, perm3_mask_high); | |
308bc496 | 5995 | vect_finish_stmt_generation (vinfo, stmt_info, perm_stmt, gsi); |
2c23db6d | 5996 | (*result_chain)[k] = data_ref; |
ebfd146a | 5997 | } |
ebfd146a | 5998 | } |
2c23db6d ES |
5999 | else |
6000 | { | |
6001 | /* If length is not equal to 3 then only power of 2 is supported. */ | |
146ec50f | 6002 | gcc_assert (pow2p_hwi (length)); |
2c23db6d | 6003 | |
d980067b | 6004 | /* The encoding has a single stepped pattern. */ |
928686b1 | 6005 | poly_uint64 nelt = TYPE_VECTOR_SUBPARTS (vectype); |
d980067b RS |
6006 | vec_perm_builder sel (nelt, 1, 3); |
6007 | sel.quick_grow (3); | |
6008 | for (i = 0; i < 3; ++i) | |
2c23db6d | 6009 | sel[i] = i * 2; |
e3342de4 RS |
6010 | vec_perm_indices indices (sel, 2, nelt); |
6011 | perm_mask_even = vect_gen_perm_mask_checked (vectype, indices); | |
2c23db6d | 6012 | |
d980067b | 6013 | for (i = 0; i < 3; ++i) |
2c23db6d | 6014 | sel[i] = i * 2 + 1; |
e3342de4 RS |
6015 | indices.new_vector (sel, 2, nelt); |
6016 | perm_mask_odd = vect_gen_perm_mask_checked (vectype, indices); | |
ebfd146a | 6017 | |
2c23db6d ES |
6018 | for (i = 0; i < log_length; i++) |
6019 | { | |
6020 | for (j = 0; j < length; j += 2) | |
6021 | { | |
6022 | first_vect = dr_chain[j]; | |
6023 | second_vect = dr_chain[j+1]; | |
6024 | ||
6025 | /* data_ref = permute_even (first_data_ref, second_data_ref); */ | |
6026 | data_ref = make_temp_ssa_name (vectype, NULL, "vect_perm_even"); | |
0d0e4a03 JJ |
6027 | perm_stmt = gimple_build_assign (data_ref, VEC_PERM_EXPR, |
6028 | first_vect, second_vect, | |
6029 | perm_mask_even); | |
308bc496 | 6030 | vect_finish_stmt_generation (vinfo, stmt_info, perm_stmt, gsi); |
2c23db6d ES |
6031 | (*result_chain)[j/2] = data_ref; |
6032 | ||
6033 | /* data_ref = permute_odd (first_data_ref, second_data_ref); */ | |
6034 | data_ref = make_temp_ssa_name (vectype, NULL, "vect_perm_odd"); | |
0d0e4a03 JJ |
6035 | perm_stmt = gimple_build_assign (data_ref, VEC_PERM_EXPR, |
6036 | first_vect, second_vect, | |
6037 | perm_mask_odd); | |
308bc496 | 6038 | vect_finish_stmt_generation (vinfo, stmt_info, perm_stmt, gsi); |
2c23db6d ES |
6039 | (*result_chain)[j/2+length/2] = data_ref; |
6040 | } | |
6041 | memcpy (dr_chain.address (), result_chain->address (), | |
6042 | length * sizeof (tree)); | |
6043 | } | |
6044 | } | |
6045 | } | |
ebfd146a | 6046 | |
f7917029 ES |
6047 | /* Function vect_shift_permute_load_chain. |
6048 | ||
6049 | Given a chain of loads in DR_CHAIN of LENGTH 2 or 3, generate | |
6050 | sequence of stmts to reorder the input data accordingly. | |
6051 | Return the final references for loads in RESULT_CHAIN. | |
6052 | Return true if successed, false otherwise. | |
6053 | ||
6054 | E.g., LENGTH is 3 and the scalar type is short, i.e., VF is 8. | |
6055 | The input is 3 vectors each containing 8 elements. We assign a | |
6056 | number to each element, the input sequence is: | |
6057 | ||
6058 | 1st vec: 0 1 2 3 4 5 6 7 | |
6059 | 2nd vec: 8 9 10 11 12 13 14 15 | |
6060 | 3rd vec: 16 17 18 19 20 21 22 23 | |
6061 | ||
6062 | The output sequence should be: | |
6063 | ||
6064 | 1st vec: 0 3 6 9 12 15 18 21 | |
6065 | 2nd vec: 1 4 7 10 13 16 19 22 | |
6066 | 3rd vec: 2 5 8 11 14 17 20 23 | |
6067 | ||
6068 | We use 3 shuffle instructions and 3 * 3 - 1 shifts to create such output. | |
6069 | ||
6070 | First we shuffle all 3 vectors to get correct elements order: | |
6071 | ||
6072 | 1st vec: ( 0 3 6) ( 1 4 7) ( 2 5) | |
6073 | 2nd vec: ( 8 11 14) ( 9 12 15) (10 13) | |
6074 | 3rd vec: (16 19 22) (17 20 23) (18 21) | |
6075 | ||
6076 | Next we unite and shift vector 3 times: | |
6077 | ||
6078 | 1st step: | |
6079 | shift right by 6 the concatenation of: | |
6080 | "1st vec" and "2nd vec" | |
6081 | ( 0 3 6) ( 1 4 7) |( 2 5) _ ( 8 11 14) ( 9 12 15)| (10 13) | |
6082 | "2nd vec" and "3rd vec" | |
6083 | ( 8 11 14) ( 9 12 15) |(10 13) _ (16 19 22) (17 20 23)| (18 21) | |
6084 | "3rd vec" and "1st vec" | |
6085 | (16 19 22) (17 20 23) |(18 21) _ ( 0 3 6) ( 1 4 7)| ( 2 5) | |
6086 | | New vectors | | |
6087 | ||
6088 | So that now new vectors are: | |
6089 | ||
6090 | 1st vec: ( 2 5) ( 8 11 14) ( 9 12 15) | |
6091 | 2nd vec: (10 13) (16 19 22) (17 20 23) | |
6092 | 3rd vec: (18 21) ( 0 3 6) ( 1 4 7) | |
6093 | ||
6094 | 2nd step: | |
6095 | shift right by 5 the concatenation of: | |
6096 | "1st vec" and "3rd vec" | |
6097 | ( 2 5) ( 8 11 14) |( 9 12 15) _ (18 21) ( 0 3 6)| ( 1 4 7) | |
6098 | "2nd vec" and "1st vec" | |
6099 | (10 13) (16 19 22) |(17 20 23) _ ( 2 5) ( 8 11 14)| ( 9 12 15) | |
6100 | "3rd vec" and "2nd vec" | |
6101 | (18 21) ( 0 3 6) |( 1 4 7) _ (10 13) (16 19 22)| (17 20 23) | |
6102 | | New vectors | | |
6103 | ||
6104 | So that now new vectors are: | |
6105 | ||
6106 | 1st vec: ( 9 12 15) (18 21) ( 0 3 6) | |
6107 | 2nd vec: (17 20 23) ( 2 5) ( 8 11 14) | |
6108 | 3rd vec: ( 1 4 7) (10 13) (16 19 22) READY | |
6109 | ||
6110 | 3rd step: | |
6111 | shift right by 5 the concatenation of: | |
6112 | "1st vec" and "1st vec" | |
6113 | ( 9 12 15) (18 21) |( 0 3 6) _ ( 9 12 15) (18 21)| ( 0 3 6) | |
6114 | shift right by 3 the concatenation of: | |
6115 | "2nd vec" and "2nd vec" | |
6116 | (17 20 23) |( 2 5) ( 8 11 14) _ (17 20 23)| ( 2 5) ( 8 11 14) | |
6117 | | New vectors | | |
6118 | ||
6119 | So that now all vectors are READY: | |
6120 | 1st vec: ( 0 3 6) ( 9 12 15) (18 21) | |
6121 | 2nd vec: ( 2 5) ( 8 11 14) (17 20 23) | |
6122 | 3rd vec: ( 1 4 7) (10 13) (16 19 22) | |
6123 | ||
6124 | This algorithm is faster than one in vect_permute_load_chain if: | |
6125 | 1. "shift of a concatination" is faster than general permutation. | |
6126 | This is usually so. | |
6127 | 2. The TARGET machine can't execute vector instructions in parallel. | |
6128 | This is because each step of the algorithm depends on previous. | |
6129 | The algorithm in vect_permute_load_chain is much more parallel. | |
6130 | ||
6131 | The algorithm is applicable only for LOAD CHAIN LENGTH less than VF. | |
6132 | */ | |
6133 | ||
6134 | static bool | |
308bc496 | 6135 | vect_shift_permute_load_chain (vec_info *vinfo, vec<tree> dr_chain, |
f7917029 | 6136 | unsigned int length, |
32e8e429 | 6137 | stmt_vec_info stmt_info, |
f7917029 ES |
6138 | gimple_stmt_iterator *gsi, |
6139 | vec<tree> *result_chain) | |
6140 | { | |
6141 | tree vect[3], vect_shift[3], data_ref, first_vect, second_vect; | |
6142 | tree perm2_mask1, perm2_mask2, perm3_mask; | |
6143 | tree select_mask, shift1_mask, shift2_mask, shift3_mask, shift4_mask; | |
355fe088 | 6144 | gimple *perm_stmt; |
f7917029 | 6145 | |
91987857 | 6146 | tree vectype = STMT_VINFO_VECTYPE (stmt_info); |
f7917029 | 6147 | unsigned int i; |
308bc496 | 6148 | loop_vec_info loop_vinfo = dyn_cast <loop_vec_info> (vinfo); |
f7917029 | 6149 | |
928686b1 RS |
6150 | unsigned HOST_WIDE_INT nelt, vf; |
6151 | if (!TYPE_VECTOR_SUBPARTS (vectype).is_constant (&nelt) | |
6152 | || !LOOP_VINFO_VECT_FACTOR (loop_vinfo).is_constant (&vf)) | |
d9f21f6a RS |
6153 | /* Not supported for variable-length vectors. */ |
6154 | return false; | |
6155 | ||
e3342de4 | 6156 | vec_perm_builder sel (nelt, nelt, 1); |
908a1a16 RS |
6157 | sel.quick_grow (nelt); |
6158 | ||
f7917029 ES |
6159 | result_chain->quick_grow (length); |
6160 | memcpy (result_chain->address (), dr_chain.address (), | |
6161 | length * sizeof (tree)); | |
6162 | ||
d9f21f6a | 6163 | if (pow2p_hwi (length) && vf > 4) |
f7917029 | 6164 | { |
af4c011e | 6165 | unsigned int j, log_length = exact_log2 (length); |
f7917029 ES |
6166 | for (i = 0; i < nelt / 2; ++i) |
6167 | sel[i] = i * 2; | |
6168 | for (i = 0; i < nelt / 2; ++i) | |
6169 | sel[nelt / 2 + i] = i * 2 + 1; | |
e3342de4 RS |
6170 | vec_perm_indices indices (sel, 2, nelt); |
6171 | if (!can_vec_perm_const_p (TYPE_MODE (vectype), indices)) | |
f7917029 ES |
6172 | { |
6173 | if (dump_enabled_p ()) | |
6174 | dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location, | |
6175 | "shuffle of 2 fields structure is not \ | |
6176 | supported by target\n"); | |
6177 | return false; | |
6178 | } | |
e3342de4 | 6179 | perm2_mask1 = vect_gen_perm_mask_checked (vectype, indices); |
f7917029 ES |
6180 | |
6181 | for (i = 0; i < nelt / 2; ++i) | |
6182 | sel[i] = i * 2 + 1; | |
6183 | for (i = 0; i < nelt / 2; ++i) | |
6184 | sel[nelt / 2 + i] = i * 2; | |
e3342de4 RS |
6185 | indices.new_vector (sel, 2, nelt); |
6186 | if (!can_vec_perm_const_p (TYPE_MODE (vectype), indices)) | |
f7917029 ES |
6187 | { |
6188 | if (dump_enabled_p ()) | |
6189 | dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location, | |
6190 | "shuffle of 2 fields structure is not \ | |
6191 | supported by target\n"); | |
6192 | return false; | |
6193 | } | |
e3342de4 | 6194 | perm2_mask2 = vect_gen_perm_mask_checked (vectype, indices); |
f7917029 ES |
6195 | |
6196 | /* Generating permutation constant to shift all elements. | |
6197 | For vector length 8 it is {4 5 6 7 8 9 10 11}. */ | |
6198 | for (i = 0; i < nelt; i++) | |
6199 | sel[i] = nelt / 2 + i; | |
e3342de4 RS |
6200 | indices.new_vector (sel, 2, nelt); |
6201 | if (!can_vec_perm_const_p (TYPE_MODE (vectype), indices)) | |
f7917029 ES |
6202 | { |
6203 | if (dump_enabled_p ()) | |
6204 | dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location, | |
6205 | "shift permutation is not supported by target\n"); | |
6206 | return false; | |
6207 | } | |
e3342de4 | 6208 | shift1_mask = vect_gen_perm_mask_checked (vectype, indices); |
f7917029 ES |
6209 | |
6210 | /* Generating permutation constant to select vector from 2. | |
6211 | For vector length 8 it is {0 1 2 3 12 13 14 15}. */ | |
6212 | for (i = 0; i < nelt / 2; i++) | |
6213 | sel[i] = i; | |
6214 | for (i = nelt / 2; i < nelt; i++) | |
6215 | sel[i] = nelt + i; | |
e3342de4 RS |
6216 | indices.new_vector (sel, 2, nelt); |
6217 | if (!can_vec_perm_const_p (TYPE_MODE (vectype), indices)) | |
f7917029 ES |
6218 | { |
6219 | if (dump_enabled_p ()) | |
6220 | dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location, | |
6221 | "select is not supported by target\n"); | |
6222 | return false; | |
6223 | } | |
e3342de4 | 6224 | select_mask = vect_gen_perm_mask_checked (vectype, indices); |
f7917029 | 6225 | |
af4c011e ES |
6226 | for (i = 0; i < log_length; i++) |
6227 | { | |
6228 | for (j = 0; j < length; j += 2) | |
6229 | { | |
6230 | first_vect = dr_chain[j]; | |
6231 | second_vect = dr_chain[j + 1]; | |
6232 | ||
6233 | data_ref = make_temp_ssa_name (vectype, NULL, "vect_shuffle2"); | |
0d0e4a03 JJ |
6234 | perm_stmt = gimple_build_assign (data_ref, VEC_PERM_EXPR, |
6235 | first_vect, first_vect, | |
6236 | perm2_mask1); | |
308bc496 | 6237 | vect_finish_stmt_generation (vinfo, stmt_info, perm_stmt, gsi); |
af4c011e ES |
6238 | vect[0] = data_ref; |
6239 | ||
6240 | data_ref = make_temp_ssa_name (vectype, NULL, "vect_shuffle2"); | |
0d0e4a03 JJ |
6241 | perm_stmt = gimple_build_assign (data_ref, VEC_PERM_EXPR, |
6242 | second_vect, second_vect, | |
6243 | perm2_mask2); | |
308bc496 | 6244 | vect_finish_stmt_generation (vinfo, stmt_info, perm_stmt, gsi); |
af4c011e | 6245 | vect[1] = data_ref; |
f7917029 | 6246 | |
af4c011e | 6247 | data_ref = make_temp_ssa_name (vectype, NULL, "vect_shift"); |
0d0e4a03 JJ |
6248 | perm_stmt = gimple_build_assign (data_ref, VEC_PERM_EXPR, |
6249 | vect[0], vect[1], shift1_mask); | |
308bc496 | 6250 | vect_finish_stmt_generation (vinfo, stmt_info, perm_stmt, gsi); |
af4c011e ES |
6251 | (*result_chain)[j/2 + length/2] = data_ref; |
6252 | ||
6253 | data_ref = make_temp_ssa_name (vectype, NULL, "vect_select"); | |
0d0e4a03 JJ |
6254 | perm_stmt = gimple_build_assign (data_ref, VEC_PERM_EXPR, |
6255 | vect[0], vect[1], select_mask); | |
308bc496 | 6256 | vect_finish_stmt_generation (vinfo, stmt_info, perm_stmt, gsi); |
af4c011e ES |
6257 | (*result_chain)[j/2] = data_ref; |
6258 | } | |
6259 | memcpy (dr_chain.address (), result_chain->address (), | |
6260 | length * sizeof (tree)); | |
6261 | } | |
f7917029 ES |
6262 | return true; |
6263 | } | |
d9f21f6a | 6264 | if (length == 3 && vf > 2) |
f7917029 ES |
6265 | { |
6266 | unsigned int k = 0, l = 0; | |
6267 | ||
6268 | /* Generating permutation constant to get all elements in rigth order. | |
6269 | For vector length 8 it is {0 3 6 1 4 7 2 5}. */ | |
6270 | for (i = 0; i < nelt; i++) | |
6271 | { | |
6272 | if (3 * k + (l % 3) >= nelt) | |
6273 | { | |
6274 | k = 0; | |
6275 | l += (3 - (nelt % 3)); | |
6276 | } | |
6277 | sel[i] = 3 * k + (l % 3); | |
6278 | k++; | |
6279 | } | |
e3342de4 RS |
6280 | vec_perm_indices indices (sel, 2, nelt); |
6281 | if (!can_vec_perm_const_p (TYPE_MODE (vectype), indices)) | |
f7917029 ES |
6282 | { |
6283 | if (dump_enabled_p ()) | |
6284 | dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location, | |
6285 | "shuffle of 3 fields structure is not \ | |
6286 | supported by target\n"); | |
6287 | return false; | |
6288 | } | |
e3342de4 | 6289 | perm3_mask = vect_gen_perm_mask_checked (vectype, indices); |
f7917029 ES |
6290 | |
6291 | /* Generating permutation constant to shift all elements. | |
6292 | For vector length 8 it is {6 7 8 9 10 11 12 13}. */ | |
6293 | for (i = 0; i < nelt; i++) | |
6294 | sel[i] = 2 * (nelt / 3) + (nelt % 3) + i; | |
e3342de4 RS |
6295 | indices.new_vector (sel, 2, nelt); |
6296 | if (!can_vec_perm_const_p (TYPE_MODE (vectype), indices)) | |
f7917029 ES |
6297 | { |
6298 | if (dump_enabled_p ()) | |
6299 | dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location, | |
6300 | "shift permutation is not supported by target\n"); | |
6301 | return false; | |
6302 | } | |
e3342de4 | 6303 | shift1_mask = vect_gen_perm_mask_checked (vectype, indices); |
f7917029 ES |
6304 | |
6305 | /* Generating permutation constant to shift all elements. | |
6306 | For vector length 8 it is {5 6 7 8 9 10 11 12}. */ | |
6307 | for (i = 0; i < nelt; i++) | |
6308 | sel[i] = 2 * (nelt / 3) + 1 + i; | |
e3342de4 RS |
6309 | indices.new_vector (sel, 2, nelt); |
6310 | if (!can_vec_perm_const_p (TYPE_MODE (vectype), indices)) | |
f7917029 ES |
6311 | { |
6312 | if (dump_enabled_p ()) | |
6313 | dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location, | |
6314 | "shift permutation is not supported by target\n"); | |
6315 | return false; | |
6316 | } | |
e3342de4 | 6317 | shift2_mask = vect_gen_perm_mask_checked (vectype, indices); |
f7917029 ES |
6318 | |
6319 | /* Generating permutation constant to shift all elements. | |
6320 | For vector length 8 it is {3 4 5 6 7 8 9 10}. */ | |
6321 | for (i = 0; i < nelt; i++) | |
6322 | sel[i] = (nelt / 3) + (nelt % 3) / 2 + i; | |
e3342de4 RS |
6323 | indices.new_vector (sel, 2, nelt); |
6324 | if (!can_vec_perm_const_p (TYPE_MODE (vectype), indices)) | |
f7917029 ES |
6325 | { |
6326 | if (dump_enabled_p ()) | |
6327 | dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location, | |
6328 | "shift permutation is not supported by target\n"); | |
6329 | return false; | |
6330 | } | |
e3342de4 | 6331 | shift3_mask = vect_gen_perm_mask_checked (vectype, indices); |
f7917029 ES |
6332 | |
6333 | /* Generating permutation constant to shift all elements. | |
6334 | For vector length 8 it is {5 6 7 8 9 10 11 12}. */ | |
6335 | for (i = 0; i < nelt; i++) | |
6336 | sel[i] = 2 * (nelt / 3) + (nelt % 3) / 2 + i; | |
e3342de4 RS |
6337 | indices.new_vector (sel, 2, nelt); |
6338 | if (!can_vec_perm_const_p (TYPE_MODE (vectype), indices)) | |
f7917029 ES |
6339 | { |
6340 | if (dump_enabled_p ()) | |
6341 | dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location, | |
6342 | "shift permutation is not supported by target\n"); | |
6343 | return false; | |
6344 | } | |
e3342de4 | 6345 | shift4_mask = vect_gen_perm_mask_checked (vectype, indices); |
f7917029 ES |
6346 | |
6347 | for (k = 0; k < 3; k++) | |
6348 | { | |
f598c55c | 6349 | data_ref = make_temp_ssa_name (vectype, NULL, "vect_shuffle3"); |
0d0e4a03 JJ |
6350 | perm_stmt = gimple_build_assign (data_ref, VEC_PERM_EXPR, |
6351 | dr_chain[k], dr_chain[k], | |
6352 | perm3_mask); | |
308bc496 | 6353 | vect_finish_stmt_generation (vinfo, stmt_info, perm_stmt, gsi); |
f7917029 ES |
6354 | vect[k] = data_ref; |
6355 | } | |
6356 | ||
6357 | for (k = 0; k < 3; k++) | |
6358 | { | |
6359 | data_ref = make_temp_ssa_name (vectype, NULL, "vect_shift1"); | |
0d0e4a03 JJ |
6360 | perm_stmt = gimple_build_assign (data_ref, VEC_PERM_EXPR, |
6361 | vect[k % 3], vect[(k + 1) % 3], | |
6362 | shift1_mask); | |
308bc496 | 6363 | vect_finish_stmt_generation (vinfo, stmt_info, perm_stmt, gsi); |
f7917029 ES |
6364 | vect_shift[k] = data_ref; |
6365 | } | |
6366 | ||
6367 | for (k = 0; k < 3; k++) | |
6368 | { | |
6369 | data_ref = make_temp_ssa_name (vectype, NULL, "vect_shift2"); | |
0d0e4a03 JJ |
6370 | perm_stmt = gimple_build_assign (data_ref, VEC_PERM_EXPR, |
6371 | vect_shift[(4 - k) % 3], | |
6372 | vect_shift[(3 - k) % 3], | |
6373 | shift2_mask); | |
308bc496 | 6374 | vect_finish_stmt_generation (vinfo, stmt_info, perm_stmt, gsi); |
f7917029 ES |
6375 | vect[k] = data_ref; |
6376 | } | |
6377 | ||
6378 | (*result_chain)[3 - (nelt % 3)] = vect[2]; | |
6379 | ||
6380 | data_ref = make_temp_ssa_name (vectype, NULL, "vect_shift3"); | |
0d0e4a03 JJ |
6381 | perm_stmt = gimple_build_assign (data_ref, VEC_PERM_EXPR, vect[0], |
6382 | vect[0], shift3_mask); | |
308bc496 | 6383 | vect_finish_stmt_generation (vinfo, stmt_info, perm_stmt, gsi); |
f7917029 ES |
6384 | (*result_chain)[nelt % 3] = data_ref; |
6385 | ||
6386 | data_ref = make_temp_ssa_name (vectype, NULL, "vect_shift4"); | |
0d0e4a03 JJ |
6387 | perm_stmt = gimple_build_assign (data_ref, VEC_PERM_EXPR, vect[1], |
6388 | vect[1], shift4_mask); | |
308bc496 | 6389 | vect_finish_stmt_generation (vinfo, stmt_info, perm_stmt, gsi); |
f7917029 ES |
6390 | (*result_chain)[0] = data_ref; |
6391 | return true; | |
6392 | } | |
6393 | return false; | |
6394 | } | |
6395 | ||
0d0293ac | 6396 | /* Function vect_transform_grouped_load. |
ebfd146a IR |
6397 | |
6398 | Given a chain of input interleaved data-refs (in DR_CHAIN), build statements | |
6399 | to perform their permutation and ascribe the result vectorized statements to | |
6400 | the scalar statements. | |
6401 | */ | |
6402 | ||
b602d918 | 6403 | void |
308bc496 RB |
6404 | vect_transform_grouped_load (vec_info *vinfo, stmt_vec_info stmt_info, |
6405 | vec<tree> dr_chain, | |
32e8e429 | 6406 | int size, gimple_stmt_iterator *gsi) |
ebfd146a | 6407 | { |
ef4bddc2 | 6408 | machine_mode mode; |
6e1aa848 | 6409 | vec<tree> result_chain = vNULL; |
ebfd146a | 6410 | |
b8698a0f L |
6411 | /* DR_CHAIN contains input data-refs that are a part of the interleaving. |
6412 | RESULT_CHAIN is the output of vect_permute_load_chain, it contains permuted | |
ebfd146a | 6413 | vectors, that are ready for vector computation. */ |
9771b263 | 6414 | result_chain.create (size); |
f7917029 ES |
6415 | |
6416 | /* If reassociation width for vector type is 2 or greater target machine can | |
6417 | execute 2 or more vector instructions in parallel. Otherwise try to | |
6418 | get chain for loads group using vect_shift_permute_load_chain. */ | |
91987857 | 6419 | mode = TYPE_MODE (STMT_VINFO_VECTYPE (stmt_info)); |
f7917029 | 6420 | if (targetm.sched.reassociation_width (VEC_PERM_EXPR, mode) > 1 |
146ec50f | 6421 | || pow2p_hwi (size) |
308bc496 | 6422 | || !vect_shift_permute_load_chain (vinfo, dr_chain, size, stmt_info, |
f7917029 | 6423 | gsi, &result_chain)) |
308bc496 RB |
6424 | vect_permute_load_chain (vinfo, dr_chain, |
6425 | size, stmt_info, gsi, &result_chain); | |
6426 | vect_record_grouped_load_vectors (vinfo, stmt_info, result_chain); | |
9771b263 | 6427 | result_chain.release (); |
272c6793 RS |
6428 | } |
6429 | ||
0d0293ac | 6430 | /* RESULT_CHAIN contains the output of a group of grouped loads that were |
32e8e429 | 6431 | generated as part of the vectorization of STMT_INFO. Assign the statement |
272c6793 RS |
6432 | for each vector to the associated scalar statement. */ |
6433 | ||
6434 | void | |
f25161bd | 6435 | vect_record_grouped_load_vectors (vec_info *, stmt_vec_info stmt_info, |
32e8e429 | 6436 | vec<tree> result_chain) |
272c6793 | 6437 | { |
bffb8014 | 6438 | stmt_vec_info first_stmt_info = DR_GROUP_FIRST_ELEMENT (stmt_info); |
272c6793 RS |
6439 | unsigned int i, gap_count; |
6440 | tree tmp_data_ref; | |
ebfd146a | 6441 | |
b8698a0f L |
6442 | /* Put a permuted data-ref in the VECTORIZED_STMT field. |
6443 | Since we scan the chain starting from it's first node, their order | |
ebfd146a | 6444 | corresponds the order of data-refs in RESULT_CHAIN. */ |
bffb8014 | 6445 | stmt_vec_info next_stmt_info = first_stmt_info; |
ebfd146a | 6446 | gap_count = 1; |
9771b263 | 6447 | FOR_EACH_VEC_ELT (result_chain, i, tmp_data_ref) |
ebfd146a | 6448 | { |
bffb8014 | 6449 | if (!next_stmt_info) |
ebfd146a IR |
6450 | break; |
6451 | ||
ff802fa1 IR |
6452 | /* Skip the gaps. Loads created for the gaps will be removed by dead |
6453 | code elimination pass later. No need to check for the first stmt in | |
ebfd146a | 6454 | the group, since it always exists. |
2c53b149 RB |
6455 | DR_GROUP_GAP is the number of steps in elements from the previous |
6456 | access (if there is no gap DR_GROUP_GAP is 1). We skip loads that | |
ff802fa1 | 6457 | correspond to the gaps. */ |
bffb8014 RS |
6458 | if (next_stmt_info != first_stmt_info |
6459 | && gap_count < DR_GROUP_GAP (next_stmt_info)) | |
f95b7597 RB |
6460 | { |
6461 | gap_count++; | |
6462 | continue; | |
6463 | } | |
ebfd146a | 6464 | |
f95b7597 RB |
6465 | /* ??? The following needs cleanup after the removal of |
6466 | DR_GROUP_SAME_DR_STMT. */ | |
6467 | if (next_stmt_info) | |
ebfd146a | 6468 | { |
f25161bd | 6469 | gimple *new_stmt = SSA_NAME_DEF_STMT (tmp_data_ref); |
ebfd146a | 6470 | /* We assume that if VEC_STMT is not NULL, this is a case of multiple |
b05d5563 | 6471 | copies, and we put the new vector statement last. */ |
f25161bd | 6472 | STMT_VINFO_VEC_STMTS (next_stmt_info).safe_push (new_stmt); |
ebfd146a | 6473 | |
bffb8014 | 6474 | next_stmt_info = DR_GROUP_NEXT_ELEMENT (next_stmt_info); |
ebfd146a | 6475 | gap_count = 1; |
ebfd146a IR |
6476 | } |
6477 | } | |
ebfd146a IR |
6478 | } |
6479 | ||
6480 | /* Function vect_force_dr_alignment_p. | |
6481 | ||
6482 | Returns whether the alignment of a DECL can be forced to be aligned | |
6483 | on ALIGNMENT bit boundary. */ | |
6484 | ||
b8698a0f | 6485 | bool |
ca31798e | 6486 | vect_can_force_dr_alignment_p (const_tree decl, poly_uint64 alignment) |
ebfd146a | 6487 | { |
8813a647 | 6488 | if (!VAR_P (decl)) |
ebfd146a IR |
6489 | return false; |
6490 | ||
428f0c67 JH |
6491 | if (decl_in_symtab_p (decl) |
6492 | && !symtab_node::get (decl)->can_increase_alignment_p ()) | |
6192fa79 JH |
6493 | return false; |
6494 | ||
ebfd146a | 6495 | if (TREE_STATIC (decl)) |
b2581735 IS |
6496 | return (known_le (alignment, |
6497 | (unsigned HOST_WIDE_INT) MAX_OFILE_ALIGNMENT)); | |
ebfd146a | 6498 | else |
ca31798e | 6499 | return (known_le (alignment, (unsigned HOST_WIDE_INT) MAX_STACK_ALIGNMENT)); |
ebfd146a IR |
6500 | } |
6501 | ||
ebfd146a | 6502 | |
/* Return whether the data reference DR_INFO is supported with respect to its
   alignment.
   If CHECK_ALIGNED_ACCESSES is TRUE, check if the access is supported even
   it is aligned, i.e., check if it is possible to vectorize it with different
   alignment.

   Possible results, in decreasing order of desirability:
     dr_aligned                    - the access is known to be aligned;
     dr_explicit_realign_optimized - aligned loads + REALIGN_LOAD with the
				     realignment token hoisted to the preheader;
     dr_explicit_realign           - aligned loads + REALIGN_LOAD, token
				     recomputed per iteration;
     dr_unaligned_supported        - target handles the misaligned access
				     directly;
     dr_unaligned_unsupported      - not vectorizable this way.  */

enum dr_alignment_support
vect_supportable_dr_alignment (vec_info *vinfo, dr_vec_info *dr_info,
			       bool check_aligned_accesses)
{
  data_reference *dr = dr_info->dr;
  stmt_vec_info stmt_info = dr_info->stmt;
  tree vectype = STMT_VINFO_VECTYPE (stmt_info);
  machine_mode mode = TYPE_MODE (vectype);
  loop_vec_info loop_vinfo = dyn_cast <loop_vec_info> (vinfo);
  class loop *vect_loop = NULL;
  bool nested_in_vect_loop = false;

  if (aligned_access_p (dr_info) && !check_aligned_accesses)
    return dr_aligned;

  /* For now assume all conditional loads/stores support unaligned
     access without any special code.  */
  if (gcall *stmt = dyn_cast <gcall *> (stmt_info->stmt))
    if (gimple_call_internal_p (stmt)
	&& (gimple_call_internal_fn (stmt) == IFN_MASK_LOAD
	    || gimple_call_internal_fn (stmt) == IFN_MASK_STORE))
      return dr_unaligned_supported;

  if (loop_vinfo)
    {
      vect_loop = LOOP_VINFO_LOOP (loop_vinfo);
      nested_in_vect_loop = nested_in_vect_loop_p (vect_loop, stmt_info);
    }

  /* Possibly unaligned access.  */

  /* We can choose between using the implicit realignment scheme (generating
     a misaligned_move stmt) and the explicit realignment scheme (generating
     aligned loads with a REALIGN_LOAD).  There are two variants to the
     explicit realignment scheme: optimized, and unoptimized.
     We can optimize the realignment only if the step between consecutive
     vector loads is equal to the vector size.  Since the vector memory
     accesses advance in steps of VS (Vector Size) in the vectorized loop, it
     is guaranteed that the misalignment amount remains the same throughout the
     execution of the vectorized loop.  Therefore, we can create the
     "realignment token" (the permutation mask that is passed to REALIGN_LOAD)
     at the loop preheader.

     However, in the case of outer-loop vectorization, when vectorizing a
     memory access in the inner-loop nested within the LOOP that is now being
     vectorized, while it is guaranteed that the misalignment of the
     vectorized memory access will remain the same in different outer-loop
     iterations, it is *not* guaranteed that is will remain the same throughout
     the execution of the inner-loop.  This is because the inner-loop advances
     with the original scalar step (and not in steps of VS).  If the inner-loop
     step happens to be a multiple of VS, then the misalignment remains fixed
     and we can use the optimized realignment scheme.  For example:

      for (i=0; i<N; i++)
        for (j=0; j<M; j++)
          s += a[i+j];

     When vectorizing the i-loop in the above example, the step between
     consecutive vector loads is 1, and so the misalignment does not remain
     fixed across the execution of the inner-loop, and the realignment cannot
     be optimized (as illustrated in the following pseudo vectorized loop):

      for (i=0; i<N; i+=4)
        for (j=0; j<M; j++){
          vs += vp[i+j]; // misalignment of &vp[i+j] is {0,1,2,3,0,1,2,3,...}
                         // when j is {0,1,2,3,4,5,6,7,...} respectively.
                         // (assuming that we start from an aligned address).
          }

     We therefore have to use the unoptimized realignment scheme:

      for (i=0; i<N; i+=4)
          for (j=k; j<M; j+=4)
          vs += vp[i+j]; // misalignment of &vp[i+j] is always k (assuming
                           // that the misalignment of the initial address is
                           // 0).

     The loop can then be vectorized as follows:

      for (k=0; k<4; k++){
        rt = get_realignment_token (&vp[k]);
        for (i=0; i<N; i+=4){
          v1 = vp[i+k];
          for (j=k; j<M; j+=4){
            v2 = vp[i+j+VS-1];
            va = REALIGN_LOAD <v1,v2,rt>;
            vs += va;
            v1 = v2;
          }
        }
    } */

  if (DR_IS_READ (dr))
    {
      bool is_packed = false;
      tree type = (TREE_TYPE (DR_REF (dr)));

      /* Prefer the explicit realignment scheme when the target provides
	 REALIGN_LOAD and either has no mask-for-load builtin or can
	 actually supply one.  */
      if (optab_handler (vec_realign_load_optab, mode) != CODE_FOR_nothing
	  && (!targetm.vectorize.builtin_mask_for_load
	      || targetm.vectorize.builtin_mask_for_load ()))
	{
	  /* NOTE: shadows the outer VECTYPE with the same value; kept
	     as-is for the nested checks below.  */
	  tree vectype = STMT_VINFO_VECTYPE (stmt_info);

	  /* If we are doing SLP then the accesses need not have the
	     same alignment, instead it depends on the SLP group size.
	     When VF * group-size is not a multiple of the vector lane
	     count the realignment scheme does not apply (fall through
	     to the misalignment support check below).  */
	  if (loop_vinfo
	      && STMT_SLP_TYPE (stmt_info)
	      && !multiple_p (LOOP_VINFO_VECT_FACTOR (loop_vinfo)
			      * (DR_GROUP_SIZE
				 (DR_GROUP_FIRST_ELEMENT (stmt_info))),
			      TYPE_VECTOR_SUBPARTS (vectype)))
	    ;
	  /* Without a loop, or in a nested loop whose scalar step is not
	     the vector size, the misalignment can vary per iteration:
	     only the unoptimized scheme is safe (see the big comment
	     above).  */
	  else if (!loop_vinfo
		   || (nested_in_vect_loop
		       && maybe_ne (TREE_INT_CST_LOW (DR_STEP (dr)),
				    GET_MODE_SIZE (TYPE_MODE (vectype)))))
	    return dr_explicit_realign;
	  else
	    return dr_explicit_realign_optimized;
	}
      /* With unknown alignment, treat the access as packed unless the
	 reference is at least size-aligned.  */
      if (!known_alignment_for_access_p (dr_info))
	is_packed = not_size_aligned (DR_REF (dr));

      if (targetm.vectorize.support_vector_misalignment
	  (mode, type, DR_MISALIGNMENT (dr_info), is_packed))
	/* Can't software pipeline the loads, but can at least do them.  */
	return dr_unaligned_supported;
    }
  else
    {
      /* Store: no realignment scheme exists; ask the target whether it
	 supports the misaligned access directly.  */
      bool is_packed = false;
      tree type = (TREE_TYPE (DR_REF (dr)));

      if (!known_alignment_for_access_p (dr_info))
	is_packed = not_size_aligned (DR_REF (dr));

      if (targetm.vectorize.support_vector_misalignment
	  (mode, type, DR_MISALIGNMENT (dr_info), is_packed))
	return dr_unaligned_supported;
    }

  /* Unsupported.  */
  return dr_unaligned_unsupported;
}