1 /* Subroutines used for code generation on IA-32.
2 Copyright (C) 1988-2016 Free Software Foundation, Inc.
3
4 This file is part of GCC.
5
6 GCC is free software; you can redistribute it and/or modify
7 it under the terms of the GNU General Public License as published by
8 the Free Software Foundation; either version 3, or (at your option)
9 any later version.
10
11 GCC is distributed in the hope that it will be useful,
12 but WITHOUT ANY WARRANTY; without even the implied warranty of
13 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
14 GNU General Public License for more details.
15
16 You should have received a copy of the GNU General Public License
17 along with GCC; see the file COPYING3. If not see
18 <http://www.gnu.org/licenses/>. */
19
20 #include "config.h"
21 #include "system.h"
22 #include "coretypes.h"
23 #include "backend.h"
24 #include "rtl.h"
25 #include "tree.h"
26 #include "memmodel.h"
27 #include "gimple.h"
28 #include "cfghooks.h"
29 #include "cfgloop.h"
30 #include "df.h"
31 #include "tm_p.h"
32 #include "stringpool.h"
33 #include "expmed.h"
34 #include "optabs.h"
35 #include "regs.h"
36 #include "emit-rtl.h"
37 #include "recog.h"
38 #include "cgraph.h"
39 #include "diagnostic.h"
40 #include "cfgbuild.h"
41 #include "alias.h"
42 #include "fold-const.h"
43 #include "attribs.h"
44 #include "calls.h"
45 #include "stor-layout.h"
46 #include "varasm.h"
47 #include "output.h"
48 #include "insn-attr.h"
49 #include "flags.h"
50 #include "except.h"
51 #include "explow.h"
52 #include "expr.h"
53 #include "cfgrtl.h"
54 #include "common/common-target.h"
55 #include "langhooks.h"
56 #include "reload.h"
57 #include "gimplify.h"
58 #include "dwarf2.h"
59 #include "tm-constrs.h"
60 #include "params.h"
61 #include "cselib.h"
62 #include "sched-int.h"
63 #include "opts.h"
64 #include "tree-pass.h"
65 #include "context.h"
66 #include "pass_manager.h"
67 #include "target-globals.h"
68 #include "gimple-iterator.h"
69 #include "tree-vectorizer.h"
70 #include "shrink-wrap.h"
71 #include "builtins.h"
72 #include "rtl-iter.h"
73 #include "tree-iterator.h"
74 #include "tree-chkp.h"
75 #include "rtl-chkp.h"
76 #include "dbgcnt.h"
77 #include "case-cfn-macros.h"
78 #include "regrename.h"
79 #include "dojump.h"
80
81 /* This file should be included last. */
82 #include "target-def.h"
83
84 static rtx legitimize_dllimport_symbol (rtx, bool);
85 static rtx legitimize_pe_coff_extern_decl (rtx, bool);
86 static rtx legitimize_pe_coff_symbol (rtx, bool);
87 static void ix86_print_operand_address_as (FILE *, rtx, addr_space_t, bool);
88
89 #ifndef CHECK_STACK_LIMIT
90 #define CHECK_STACK_LIMIT (-1)
91 #endif
92
93 /* Return index of given mode in mult and division cost tables. */
94 #define MODE_INDEX(mode) \
95 ((mode) == QImode ? 0 \
96 : (mode) == HImode ? 1 \
97 : (mode) == SImode ? 2 \
98 : (mode) == DImode ? 3 \
99 : 4)
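/* Illustrative sketch (not part of the original source): MODE_INDEX selects
   the per-mode entry in the five-element multiply/divide cost arrays of
   struct processor_costs.  The field names used below (mult_init, mult_bit)
   are assumed from the i386 cost structure; example_mult_cost itself is a
   made-up helper for illustration, not one of the real callers.  */

static inline int
example_mult_cost (const struct processor_costs *cost, machine_mode mode,
		   int bits_set)
{
  /* Base cost of starting a multiply in MODE, plus a per-set-bit cost.  */
  return cost->mult_init[MODE_INDEX (mode)] + bits_set * cost->mult_bit;
}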
100
101 /* Processor costs (relative to an add) */
102 /* We assume COSTS_N_INSNS is defined as (N)*4 and an addition is 2 bytes. */
103 #define COSTS_N_BYTES(N) ((N) * 2)
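/* Worked example (added for illustration): with COSTS_N_INSNS (N) expanding
   to (N) * 4 as assumed above, COSTS_N_BYTES (2) == 4 == COSTS_N_INSNS (1),
   so a two-byte add carries the same weight in the size-tuning table below
   as a one-insn add carries in the speed-tuning tables.  */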
104
105 #define DUMMY_STRINGOP_ALGS {libcall, {{-1, libcall, false}}}
106
107 static stringop_algs ix86_size_memcpy[2] = {
108 {rep_prefix_1_byte, {{-1, rep_prefix_1_byte, false}}},
109 {rep_prefix_1_byte, {{-1, rep_prefix_1_byte, false}}}};
110 static stringop_algs ix86_size_memset[2] = {
111 {rep_prefix_1_byte, {{-1, rep_prefix_1_byte, false}}},
112 {rep_prefix_1_byte, {{-1, rep_prefix_1_byte, false}}}};
113
114 const
115 struct processor_costs ix86_size_cost = {/* costs for tuning for size */
116 COSTS_N_BYTES (2), /* cost of an add instruction */
117 COSTS_N_BYTES (3), /* cost of a lea instruction */
118 COSTS_N_BYTES (2), /* variable shift costs */
119 COSTS_N_BYTES (3), /* constant shift costs */
120 {COSTS_N_BYTES (3), /* cost of starting multiply for QI */
121 COSTS_N_BYTES (3), /* HI */
122 COSTS_N_BYTES (3), /* SI */
123 COSTS_N_BYTES (3), /* DI */
124 COSTS_N_BYTES (5)}, /* other */
125 0, /* cost of multiply per each bit set */
126 {COSTS_N_BYTES (3), /* cost of a divide/mod for QI */
127 COSTS_N_BYTES (3), /* HI */
128 COSTS_N_BYTES (3), /* SI */
129 COSTS_N_BYTES (3), /* DI */
130 COSTS_N_BYTES (5)}, /* other */
131 COSTS_N_BYTES (3), /* cost of movsx */
132 COSTS_N_BYTES (3), /* cost of movzx */
133 0, /* "large" insn */
134 2, /* MOVE_RATIO */
135 2, /* cost for loading QImode using movzbl */
136 {2, 2, 2}, /* cost of loading integer registers
137 in QImode, HImode and SImode.
138 Relative to reg-reg move (2). */
139 {2, 2, 2}, /* cost of storing integer registers */
140 2, /* cost of reg,reg fld/fst */
141 {2, 2, 2}, /* cost of loading fp registers
142 in SFmode, DFmode and XFmode */
143 {2, 2, 2}, /* cost of storing fp registers
144 in SFmode, DFmode and XFmode */
145 3, /* cost of moving MMX register */
146 {3, 3}, /* cost of loading MMX registers
147 in SImode and DImode */
148 {3, 3}, /* cost of storing MMX registers
149 in SImode and DImode */
150 3, /* cost of moving SSE register */
151 {3, 3, 3}, /* cost of loading SSE registers
152 in SImode, DImode and TImode */
153 {3, 3, 3}, /* cost of storing SSE registers
154 in SImode, DImode and TImode */
155 3, /* MMX or SSE register to integer */
156 0, /* size of l1 cache */
157 0, /* size of l2 cache */
158 0, /* size of prefetch block */
159 0, /* number of parallel prefetches */
160 2, /* Branch cost */
161 COSTS_N_BYTES (2), /* cost of FADD and FSUB insns. */
162 COSTS_N_BYTES (2), /* cost of FMUL instruction. */
163 COSTS_N_BYTES (2), /* cost of FDIV instruction. */
164 COSTS_N_BYTES (2), /* cost of FABS instruction. */
165 COSTS_N_BYTES (2), /* cost of FCHS instruction. */
166 COSTS_N_BYTES (2), /* cost of FSQRT instruction. */
167 ix86_size_memcpy,
168 ix86_size_memset,
169 1, /* scalar_stmt_cost. */
170 1, /* scalar load_cost. */
171 1, /* scalar_store_cost. */
172 1, /* vec_stmt_cost. */
173 1, /* vec_to_scalar_cost. */
174 1, /* scalar_to_vec_cost. */
175 1, /* vec_align_load_cost. */
176 1, /* vec_unalign_load_cost. */
177 1, /* vec_store_cost. */
178 1, /* cond_taken_branch_cost. */
179 1, /* cond_not_taken_branch_cost. */
180 };
181
182 /* Processor costs (relative to an add) */
183 static stringop_algs i386_memcpy[2] = {
184 {rep_prefix_1_byte, {{-1, rep_prefix_1_byte, false}}},
185 DUMMY_STRINGOP_ALGS};
186 static stringop_algs i386_memset[2] = {
187 {rep_prefix_1_byte, {{-1, rep_prefix_1_byte, false}}},
188 DUMMY_STRINGOP_ALGS};
189
190 static const
191 struct processor_costs i386_cost = { /* 386 specific costs */
192 COSTS_N_INSNS (1), /* cost of an add instruction */
193 COSTS_N_INSNS (1), /* cost of a lea instruction */
194 COSTS_N_INSNS (3), /* variable shift costs */
195 COSTS_N_INSNS (2), /* constant shift costs */
196 {COSTS_N_INSNS (6), /* cost of starting multiply for QI */
197 COSTS_N_INSNS (6), /* HI */
198 COSTS_N_INSNS (6), /* SI */
199 COSTS_N_INSNS (6), /* DI */
200 COSTS_N_INSNS (6)}, /* other */
201 COSTS_N_INSNS (1), /* cost of multiply per each bit set */
202 {COSTS_N_INSNS (23), /* cost of a divide/mod for QI */
203 COSTS_N_INSNS (23), /* HI */
204 COSTS_N_INSNS (23), /* SI */
205 COSTS_N_INSNS (23), /* DI */
206 COSTS_N_INSNS (23)}, /* other */
207 COSTS_N_INSNS (3), /* cost of movsx */
208 COSTS_N_INSNS (2), /* cost of movzx */
209 15, /* "large" insn */
210 3, /* MOVE_RATIO */
211 4, /* cost for loading QImode using movzbl */
212 {2, 4, 2}, /* cost of loading integer registers
213 in QImode, HImode and SImode.
214 Relative to reg-reg move (2). */
215 {2, 4, 2}, /* cost of storing integer registers */
216 2, /* cost of reg,reg fld/fst */
217 {8, 8, 8}, /* cost of loading fp registers
218 in SFmode, DFmode and XFmode */
219 {8, 8, 8}, /* cost of storing fp registers
220 in SFmode, DFmode and XFmode */
221 2, /* cost of moving MMX register */
222 {4, 8}, /* cost of loading MMX registers
223 in SImode and DImode */
224 {4, 8}, /* cost of storing MMX registers
225 in SImode and DImode */
226 2, /* cost of moving SSE register */
227 {4, 8, 16}, /* cost of loading SSE registers
228 in SImode, DImode and TImode */
229 {4, 8, 16}, /* cost of storing SSE registers
230 in SImode, DImode and TImode */
231 3, /* MMX or SSE register to integer */
232 0, /* size of l1 cache */
233 0, /* size of l2 cache */
234 0, /* size of prefetch block */
235 0, /* number of parallel prefetches */
236 1, /* Branch cost */
237 COSTS_N_INSNS (23), /* cost of FADD and FSUB insns. */
238 COSTS_N_INSNS (27), /* cost of FMUL instruction. */
239 COSTS_N_INSNS (88), /* cost of FDIV instruction. */
240 COSTS_N_INSNS (22), /* cost of FABS instruction. */
241 COSTS_N_INSNS (24), /* cost of FCHS instruction. */
242 COSTS_N_INSNS (122), /* cost of FSQRT instruction. */
243 i386_memcpy,
244 i386_memset,
245 1, /* scalar_stmt_cost. */
246 1, /* scalar load_cost. */
247 1, /* scalar_store_cost. */
248 1, /* vec_stmt_cost. */
249 1, /* vec_to_scalar_cost. */
250 1, /* scalar_to_vec_cost. */
251 1, /* vec_align_load_cost. */
252 2, /* vec_unalign_load_cost. */
253 1, /* vec_store_cost. */
254 3, /* cond_taken_branch_cost. */
255 1, /* cond_not_taken_branch_cost. */
256 };
257
258 static stringop_algs i486_memcpy[2] = {
259 {rep_prefix_4_byte, {{-1, rep_prefix_4_byte, false}}},
260 DUMMY_STRINGOP_ALGS};
261 static stringop_algs i486_memset[2] = {
262 {rep_prefix_4_byte, {{-1, rep_prefix_4_byte, false}}},
263 DUMMY_STRINGOP_ALGS};
264
265 static const
266 struct processor_costs i486_cost = { /* 486 specific costs */
267 COSTS_N_INSNS (1), /* cost of an add instruction */
268 COSTS_N_INSNS (1), /* cost of a lea instruction */
269 COSTS_N_INSNS (3), /* variable shift costs */
270 COSTS_N_INSNS (2), /* constant shift costs */
271 {COSTS_N_INSNS (12), /* cost of starting multiply for QI */
272 COSTS_N_INSNS (12), /* HI */
273 COSTS_N_INSNS (12), /* SI */
274 COSTS_N_INSNS (12), /* DI */
275 COSTS_N_INSNS (12)}, /* other */
276 1, /* cost of multiply per each bit set */
277 {COSTS_N_INSNS (40), /* cost of a divide/mod for QI */
278 COSTS_N_INSNS (40), /* HI */
279 COSTS_N_INSNS (40), /* SI */
280 COSTS_N_INSNS (40), /* DI */
281 COSTS_N_INSNS (40)}, /* other */
282 COSTS_N_INSNS (3), /* cost of movsx */
283 COSTS_N_INSNS (2), /* cost of movzx */
284 15, /* "large" insn */
285 3, /* MOVE_RATIO */
286 4, /* cost for loading QImode using movzbl */
287 {2, 4, 2}, /* cost of loading integer registers
288 in QImode, HImode and SImode.
289 Relative to reg-reg move (2). */
290 {2, 4, 2}, /* cost of storing integer registers */
291 2, /* cost of reg,reg fld/fst */
292 {8, 8, 8}, /* cost of loading fp registers
293 in SFmode, DFmode and XFmode */
294 {8, 8, 8}, /* cost of storing fp registers
295 in SFmode, DFmode and XFmode */
296 2, /* cost of moving MMX register */
297 {4, 8}, /* cost of loading MMX registers
298 in SImode and DImode */
299 {4, 8}, /* cost of storing MMX registers
300 in SImode and DImode */
301 2, /* cost of moving SSE register */
302 {4, 8, 16}, /* cost of loading SSE registers
303 in SImode, DImode and TImode */
304 {4, 8, 16}, /* cost of storing SSE registers
305 in SImode, DImode and TImode */
306 3, /* MMX or SSE register to integer */
307 4, /* size of l1 cache. 486 has 8kB cache
308 shared for code and data, so 4kB is
309 not really precise. */
310 4, /* size of l2 cache */
311 0, /* size of prefetch block */
312 0, /* number of parallel prefetches */
313 1, /* Branch cost */
314 COSTS_N_INSNS (8), /* cost of FADD and FSUB insns. */
315 COSTS_N_INSNS (16), /* cost of FMUL instruction. */
316 COSTS_N_INSNS (73), /* cost of FDIV instruction. */
317 COSTS_N_INSNS (3), /* cost of FABS instruction. */
318 COSTS_N_INSNS (3), /* cost of FCHS instruction. */
319 COSTS_N_INSNS (83), /* cost of FSQRT instruction. */
320 i486_memcpy,
321 i486_memset,
322 1, /* scalar_stmt_cost. */
323 1, /* scalar load_cost. */
324 1, /* scalar_store_cost. */
325 1, /* vec_stmt_cost. */
326 1, /* vec_to_scalar_cost. */
327 1, /* scalar_to_vec_cost. */
328 1, /* vec_align_load_cost. */
329 2, /* vec_unalign_load_cost. */
330 1, /* vec_store_cost. */
331 3, /* cond_taken_branch_cost. */
332 1, /* cond_not_taken_branch_cost. */
333 };
334
335 static stringop_algs pentium_memcpy[2] = {
336 {libcall, {{256, rep_prefix_4_byte, false}, {-1, libcall, false}}},
337 DUMMY_STRINGOP_ALGS};
338 static stringop_algs pentium_memset[2] = {
339 {libcall, {{-1, rep_prefix_4_byte, false}}},
340 DUMMY_STRINGOP_ALGS};
341
342 static const
343 struct processor_costs pentium_cost = {
344 COSTS_N_INSNS (1), /* cost of an add instruction */
345 COSTS_N_INSNS (1), /* cost of a lea instruction */
346 COSTS_N_INSNS (4), /* variable shift costs */
347 COSTS_N_INSNS (1), /* constant shift costs */
348 {COSTS_N_INSNS (11), /* cost of starting multiply for QI */
349 COSTS_N_INSNS (11), /* HI */
350 COSTS_N_INSNS (11), /* SI */
351 COSTS_N_INSNS (11), /* DI */
352 COSTS_N_INSNS (11)}, /* other */
353 0, /* cost of multiply per each bit set */
354 {COSTS_N_INSNS (25), /* cost of a divide/mod for QI */
355 COSTS_N_INSNS (25), /* HI */
356 COSTS_N_INSNS (25), /* SI */
357 COSTS_N_INSNS (25), /* DI */
358 COSTS_N_INSNS (25)}, /* other */
359 COSTS_N_INSNS (3), /* cost of movsx */
360 COSTS_N_INSNS (2), /* cost of movzx */
361 8, /* "large" insn */
362 6, /* MOVE_RATIO */
363 6, /* cost for loading QImode using movzbl */
364 {2, 4, 2}, /* cost of loading integer registers
365 in QImode, HImode and SImode.
366 Relative to reg-reg move (2). */
367 {2, 4, 2}, /* cost of storing integer registers */
368 2, /* cost of reg,reg fld/fst */
369 {2, 2, 6}, /* cost of loading fp registers
370 in SFmode, DFmode and XFmode */
371 {4, 4, 6}, /* cost of storing fp registers
372 in SFmode, DFmode and XFmode */
373 8, /* cost of moving MMX register */
374 {8, 8}, /* cost of loading MMX registers
375 in SImode and DImode */
376 {8, 8}, /* cost of storing MMX registers
377 in SImode and DImode */
378 2, /* cost of moving SSE register */
379 {4, 8, 16}, /* cost of loading SSE registers
380 in SImode, DImode and TImode */
381 {4, 8, 16}, /* cost of storing SSE registers
382 in SImode, DImode and TImode */
383 3, /* MMX or SSE register to integer */
384 8, /* size of l1 cache. */
385 8, /* size of l2 cache */
386 0, /* size of prefetch block */
387 0, /* number of parallel prefetches */
388 2, /* Branch cost */
389 COSTS_N_INSNS (3), /* cost of FADD and FSUB insns. */
390 COSTS_N_INSNS (3), /* cost of FMUL instruction. */
391 COSTS_N_INSNS (39), /* cost of FDIV instruction. */
392 COSTS_N_INSNS (1), /* cost of FABS instruction. */
393 COSTS_N_INSNS (1), /* cost of FCHS instruction. */
394 COSTS_N_INSNS (70), /* cost of FSQRT instruction. */
395 pentium_memcpy,
396 pentium_memset,
397 1, /* scalar_stmt_cost. */
398 1, /* scalar load_cost. */
399 1, /* scalar_store_cost. */
400 1, /* vec_stmt_cost. */
401 1, /* vec_to_scalar_cost. */
402 1, /* scalar_to_vec_cost. */
403 1, /* vec_align_load_cost. */
404 2, /* vec_unalign_load_cost. */
405 1, /* vec_store_cost. */
406 3, /* cond_taken_branch_cost. */
407 1, /* cond_not_taken_branch_cost. */
408 };
409
410 static const
411 struct processor_costs lakemont_cost = {
412 COSTS_N_INSNS (1), /* cost of an add instruction */
413 COSTS_N_INSNS (1) + 1, /* cost of a lea instruction */
414 COSTS_N_INSNS (1), /* variable shift costs */
415 COSTS_N_INSNS (1), /* constant shift costs */
416 {COSTS_N_INSNS (11), /* cost of starting multiply for QI */
417 COSTS_N_INSNS (11), /* HI */
418 COSTS_N_INSNS (11), /* SI */
419 COSTS_N_INSNS (11), /* DI */
420 COSTS_N_INSNS (11)}, /* other */
421 0, /* cost of multiply per each bit set */
422 {COSTS_N_INSNS (25), /* cost of a divide/mod for QI */
423 COSTS_N_INSNS (25), /* HI */
424 COSTS_N_INSNS (25), /* SI */
425 COSTS_N_INSNS (25), /* DI */
426 COSTS_N_INSNS (25)}, /* other */
427 COSTS_N_INSNS (3), /* cost of movsx */
428 COSTS_N_INSNS (2), /* cost of movzx */
429 8, /* "large" insn */
430 17, /* MOVE_RATIO */
431 6, /* cost for loading QImode using movzbl */
432 {2, 4, 2}, /* cost of loading integer registers
433 in QImode, HImode and SImode.
434 Relative to reg-reg move (2). */
435 {2, 4, 2}, /* cost of storing integer registers */
436 2, /* cost of reg,reg fld/fst */
437 {2, 2, 6}, /* cost of loading fp registers
438 in SFmode, DFmode and XFmode */
439 {4, 4, 6}, /* cost of storing fp registers
440 in SFmode, DFmode and XFmode */
441 8, /* cost of moving MMX register */
442 {8, 8}, /* cost of loading MMX registers
443 in SImode and DImode */
444 {8, 8}, /* cost of storing MMX registers
445 in SImode and DImode */
446 2, /* cost of moving SSE register */
447 {4, 8, 16}, /* cost of loading SSE registers
448 in SImode, DImode and TImode */
449 {4, 8, 16}, /* cost of storing SSE registers
450 in SImode, DImode and TImode */
451 3, /* MMX or SSE register to integer */
452 8, /* size of l1 cache. */
453 8, /* size of l2 cache */
454 0, /* size of prefetch block */
455 0, /* number of parallel prefetches */
456 2, /* Branch cost */
457 COSTS_N_INSNS (3), /* cost of FADD and FSUB insns. */
458 COSTS_N_INSNS (3), /* cost of FMUL instruction. */
459 COSTS_N_INSNS (39), /* cost of FDIV instruction. */
460 COSTS_N_INSNS (1), /* cost of FABS instruction. */
461 COSTS_N_INSNS (1), /* cost of FCHS instruction. */
462 COSTS_N_INSNS (70), /* cost of FSQRT instruction. */
463 pentium_memcpy,
464 pentium_memset,
465 1, /* scalar_stmt_cost. */
466 1, /* scalar load_cost. */
467 1, /* scalar_store_cost. */
468 1, /* vec_stmt_cost. */
469 1, /* vec_to_scalar_cost. */
470 1, /* scalar_to_vec_cost. */
471 1, /* vec_align_load_cost. */
472 2, /* vec_unalign_load_cost. */
473 1, /* vec_store_cost. */
474 3, /* cond_taken_branch_cost. */
475 1, /* cond_not_taken_branch_cost. */
476 };
477
478 /* PentiumPro has optimized rep instructions for blocks aligned by 8 bytes
479 (we ensure the alignment).  For small blocks an inline loop is still a
480 noticeable win; for bigger blocks either rep movsl or rep movsb is the
481 way to go.  Rep movsb apparently has a more expensive startup time in the
482 CPU, but after 4K the difference is down in the noise. */
483 static stringop_algs pentiumpro_memcpy[2] = {
484 {rep_prefix_4_byte, {{128, loop, false}, {1024, unrolled_loop, false},
485 {8192, rep_prefix_4_byte, false},
486 {-1, rep_prefix_1_byte, false}}},
487 DUMMY_STRINGOP_ALGS};
488 static stringop_algs pentiumpro_memset[2] = {
489 {rep_prefix_4_byte, {{1024, unrolled_loop, false},
490 {8192, rep_prefix_4_byte, false},
491 {-1, libcall, false}}},
492 DUMMY_STRINGOP_ALGS};
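/* Illustrative sketch (not part of the original source): how a stringop_algs
   table such as pentiumpro_memcpy above is read.  Each {max, alg, noalign}
   entry handles block sizes up to MAX bytes, and a max of -1 covers every
   larger size; of the two-element arrays, entry 0 is assumed to be used for
   32-bit code and entry 1 for 64-bit code.  example_pick_stringop_alg is a
   made-up helper; the real selection logic lives in decide_alg later in this
   file.  */

static enum stringop_alg
example_pick_stringop_alg (const stringop_algs *algs, HOST_WIDE_INT nbytes)
{
  for (unsigned int i = 0; i < MAX_STRINGOP_ALGS; i++)
    /* A max of -1 terminates the table and catches all larger sizes.  */
    if (algs->size[i].max == -1 || nbytes <= algs->size[i].max)
      return algs->size[i].alg;
  return libcall;
}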
493 static const
494 struct processor_costs pentiumpro_cost = {
495 COSTS_N_INSNS (1), /* cost of an add instruction */
496 COSTS_N_INSNS (1), /* cost of a lea instruction */
497 COSTS_N_INSNS (1), /* variable shift costs */
498 COSTS_N_INSNS (1), /* constant shift costs */
499 {COSTS_N_INSNS (4), /* cost of starting multiply for QI */
500 COSTS_N_INSNS (4), /* HI */
501 COSTS_N_INSNS (4), /* SI */
502 COSTS_N_INSNS (4), /* DI */
503 COSTS_N_INSNS (4)}, /* other */
504 0, /* cost of multiply per each bit set */
505 {COSTS_N_INSNS (17), /* cost of a divide/mod for QI */
506 COSTS_N_INSNS (17), /* HI */
507 COSTS_N_INSNS (17), /* SI */
508 COSTS_N_INSNS (17), /* DI */
509 COSTS_N_INSNS (17)}, /* other */
510 COSTS_N_INSNS (1), /* cost of movsx */
511 COSTS_N_INSNS (1), /* cost of movzx */
512 8, /* "large" insn */
513 6, /* MOVE_RATIO */
514 2, /* cost for loading QImode using movzbl */
515 {4, 4, 4}, /* cost of loading integer registers
516 in QImode, HImode and SImode.
517 Relative to reg-reg move (2). */
518 {2, 2, 2}, /* cost of storing integer registers */
519 2, /* cost of reg,reg fld/fst */
520 {2, 2, 6}, /* cost of loading fp registers
521 in SFmode, DFmode and XFmode */
522 {4, 4, 6}, /* cost of storing fp registers
523 in SFmode, DFmode and XFmode */
524 2, /* cost of moving MMX register */
525 {2, 2}, /* cost of loading MMX registers
526 in SImode and DImode */
527 {2, 2}, /* cost of storing MMX registers
528 in SImode and DImode */
529 2, /* cost of moving SSE register */
530 {2, 2, 8}, /* cost of loading SSE registers
531 in SImode, DImode and TImode */
532 {2, 2, 8}, /* cost of storing SSE registers
533 in SImode, DImode and TImode */
534 3, /* MMX or SSE register to integer */
535 8, /* size of l1 cache. */
536 256, /* size of l2 cache */
537 32, /* size of prefetch block */
538 6, /* number of parallel prefetches */
539 2, /* Branch cost */
540 COSTS_N_INSNS (3), /* cost of FADD and FSUB insns. */
541 COSTS_N_INSNS (5), /* cost of FMUL instruction. */
542 COSTS_N_INSNS (56), /* cost of FDIV instruction. */
543 COSTS_N_INSNS (2), /* cost of FABS instruction. */
544 COSTS_N_INSNS (2), /* cost of FCHS instruction. */
545 COSTS_N_INSNS (56), /* cost of FSQRT instruction. */
546 pentiumpro_memcpy,
547 pentiumpro_memset,
548 1, /* scalar_stmt_cost. */
549 1, /* scalar load_cost. */
550 1, /* scalar_store_cost. */
551 1, /* vec_stmt_cost. */
552 1, /* vec_to_scalar_cost. */
553 1, /* scalar_to_vec_cost. */
554 1, /* vec_align_load_cost. */
555 2, /* vec_unalign_load_cost. */
556 1, /* vec_store_cost. */
557 3, /* cond_taken_branch_cost. */
558 1, /* cond_not_taken_branch_cost. */
559 };
560
561 static stringop_algs geode_memcpy[2] = {
562 {libcall, {{256, rep_prefix_4_byte, false}, {-1, libcall, false}}},
563 DUMMY_STRINGOP_ALGS};
564 static stringop_algs geode_memset[2] = {
565 {libcall, {{256, rep_prefix_4_byte, false}, {-1, libcall, false}}},
566 DUMMY_STRINGOP_ALGS};
567 static const
568 struct processor_costs geode_cost = {
569 COSTS_N_INSNS (1), /* cost of an add instruction */
570 COSTS_N_INSNS (1), /* cost of a lea instruction */
571 COSTS_N_INSNS (2), /* variable shift costs */
572 COSTS_N_INSNS (1), /* constant shift costs */
573 {COSTS_N_INSNS (3), /* cost of starting multiply for QI */
574 COSTS_N_INSNS (4), /* HI */
575 COSTS_N_INSNS (7), /* SI */
576 COSTS_N_INSNS (7), /* DI */
577 COSTS_N_INSNS (7)}, /* other */
578 0, /* cost of multiply per each bit set */
579 {COSTS_N_INSNS (15), /* cost of a divide/mod for QI */
580 COSTS_N_INSNS (23), /* HI */
581 COSTS_N_INSNS (39), /* SI */
582 COSTS_N_INSNS (39), /* DI */
583 COSTS_N_INSNS (39)}, /* other */
584 COSTS_N_INSNS (1), /* cost of movsx */
585 COSTS_N_INSNS (1), /* cost of movzx */
586 8, /* "large" insn */
587 4, /* MOVE_RATIO */
588 1, /* cost for loading QImode using movzbl */
589 {1, 1, 1}, /* cost of loading integer registers
590 in QImode, HImode and SImode.
591 Relative to reg-reg move (2). */
592 {1, 1, 1}, /* cost of storing integer registers */
593 1, /* cost of reg,reg fld/fst */
594 {1, 1, 1}, /* cost of loading fp registers
595 in SFmode, DFmode and XFmode */
596 {4, 6, 6}, /* cost of storing fp registers
597 in SFmode, DFmode and XFmode */
598
599 2, /* cost of moving MMX register */
600 {2, 2}, /* cost of loading MMX registers
601 in SImode and DImode */
602 {2, 2}, /* cost of storing MMX registers
603 in SImode and DImode */
604 2, /* cost of moving SSE register */
605 {2, 2, 8}, /* cost of loading SSE registers
606 in SImode, DImode and TImode */
607 {2, 2, 8}, /* cost of storing SSE registers
608 in SImode, DImode and TImode */
609 3, /* MMX or SSE register to integer */
610 64, /* size of l1 cache. */
611 128, /* size of l2 cache. */
612 32, /* size of prefetch block */
613 1, /* number of parallel prefetches */
614 1, /* Branch cost */
615 COSTS_N_INSNS (6), /* cost of FADD and FSUB insns. */
616 COSTS_N_INSNS (11), /* cost of FMUL instruction. */
617 COSTS_N_INSNS (47), /* cost of FDIV instruction. */
618 COSTS_N_INSNS (1), /* cost of FABS instruction. */
619 COSTS_N_INSNS (1), /* cost of FCHS instruction. */
620 COSTS_N_INSNS (54), /* cost of FSQRT instruction. */
621 geode_memcpy,
622 geode_memset,
623 1, /* scalar_stmt_cost. */
624 1, /* scalar load_cost. */
625 1, /* scalar_store_cost. */
626 1, /* vec_stmt_cost. */
627 1, /* vec_to_scalar_cost. */
628 1, /* scalar_to_vec_cost. */
629 1, /* vec_align_load_cost. */
630 2, /* vec_unalign_load_cost. */
631 1, /* vec_store_cost. */
632 3, /* cond_taken_branch_cost. */
633 1, /* cond_not_taken_branch_cost. */
634 };
635
636 static stringop_algs k6_memcpy[2] = {
637 {libcall, {{256, rep_prefix_4_byte, false}, {-1, libcall, false}}},
638 DUMMY_STRINGOP_ALGS};
639 static stringop_algs k6_memset[2] = {
640 {libcall, {{256, rep_prefix_4_byte, false}, {-1, libcall, false}}},
641 DUMMY_STRINGOP_ALGS};
642 static const
643 struct processor_costs k6_cost = {
644 COSTS_N_INSNS (1), /* cost of an add instruction */
645 COSTS_N_INSNS (2), /* cost of a lea instruction */
646 COSTS_N_INSNS (1), /* variable shift costs */
647 COSTS_N_INSNS (1), /* constant shift costs */
648 {COSTS_N_INSNS (3), /* cost of starting multiply for QI */
649 COSTS_N_INSNS (3), /* HI */
650 COSTS_N_INSNS (3), /* SI */
651 COSTS_N_INSNS (3), /* DI */
652 COSTS_N_INSNS (3)}, /* other */
653 0, /* cost of multiply per each bit set */
654 {COSTS_N_INSNS (18), /* cost of a divide/mod for QI */
655 COSTS_N_INSNS (18), /* HI */
656 COSTS_N_INSNS (18), /* SI */
657 COSTS_N_INSNS (18), /* DI */
658 COSTS_N_INSNS (18)}, /* other */
659 COSTS_N_INSNS (2), /* cost of movsx */
660 COSTS_N_INSNS (2), /* cost of movzx */
661 8, /* "large" insn */
662 4, /* MOVE_RATIO */
663 3, /* cost for loading QImode using movzbl */
664 {4, 5, 4}, /* cost of loading integer registers
665 in QImode, HImode and SImode.
666 Relative to reg-reg move (2). */
667 {2, 3, 2}, /* cost of storing integer registers */
668 4, /* cost of reg,reg fld/fst */
669 {6, 6, 6}, /* cost of loading fp registers
670 in SFmode, DFmode and XFmode */
671 {4, 4, 4}, /* cost of storing fp registers
672 in SFmode, DFmode and XFmode */
673 2, /* cost of moving MMX register */
674 {2, 2}, /* cost of loading MMX registers
675 in SImode and DImode */
676 {2, 2}, /* cost of storing MMX registers
677 in SImode and DImode */
678 2, /* cost of moving SSE register */
679 {2, 2, 8}, /* cost of loading SSE registers
680 in SImode, DImode and TImode */
681 {2, 2, 8}, /* cost of storing SSE registers
682 in SImode, DImode and TImode */
683 6, /* MMX or SSE register to integer */
684 32, /* size of l1 cache. */
685 32, /* size of l2 cache. Some models
686 have integrated l2 cache, but
687 optimizing for k6 is not important
688 enough to worry about that. */
689 32, /* size of prefetch block */
690 1, /* number of parallel prefetches */
691 1, /* Branch cost */
692 COSTS_N_INSNS (2), /* cost of FADD and FSUB insns. */
693 COSTS_N_INSNS (2), /* cost of FMUL instruction. */
694 COSTS_N_INSNS (56), /* cost of FDIV instruction. */
695 COSTS_N_INSNS (2), /* cost of FABS instruction. */
696 COSTS_N_INSNS (2), /* cost of FCHS instruction. */
697 COSTS_N_INSNS (56), /* cost of FSQRT instruction. */
698 k6_memcpy,
699 k6_memset,
700 1, /* scalar_stmt_cost. */
701 1, /* scalar load_cost. */
702 1, /* scalar_store_cost. */
703 1, /* vec_stmt_cost. */
704 1, /* vec_to_scalar_cost. */
705 1, /* scalar_to_vec_cost. */
706 1, /* vec_align_load_cost. */
707 2, /* vec_unalign_load_cost. */
708 1, /* vec_store_cost. */
709 3, /* cond_taken_branch_cost. */
710 1, /* cond_not_taken_branch_cost. */
711 };
712
713 /* For some reason, Athlon deals better with the REP prefix (relative to
714 loops) than K8 does.  Alignment becomes important after 8 bytes for memcpy
715 and 128 bytes for memset. */
716 static stringop_algs athlon_memcpy[2] = {
717 {libcall, {{2048, rep_prefix_4_byte, false}, {-1, libcall, false}}},
718 DUMMY_STRINGOP_ALGS};
719 static stringop_algs athlon_memset[2] = {
720 {libcall, {{2048, rep_prefix_4_byte, false}, {-1, libcall, false}}},
721 DUMMY_STRINGOP_ALGS};
722 static const
723 struct processor_costs athlon_cost = {
724 COSTS_N_INSNS (1), /* cost of an add instruction */
725 COSTS_N_INSNS (2), /* cost of a lea instruction */
726 COSTS_N_INSNS (1), /* variable shift costs */
727 COSTS_N_INSNS (1), /* constant shift costs */
728 {COSTS_N_INSNS (5), /* cost of starting multiply for QI */
729 COSTS_N_INSNS (5), /* HI */
730 COSTS_N_INSNS (5), /* SI */
731 COSTS_N_INSNS (5), /* DI */
732 COSTS_N_INSNS (5)}, /* other */
733 0, /* cost of multiply per each bit set */
734 {COSTS_N_INSNS (18), /* cost of a divide/mod for QI */
735 COSTS_N_INSNS (26), /* HI */
736 COSTS_N_INSNS (42), /* SI */
737 COSTS_N_INSNS (74), /* DI */
738 COSTS_N_INSNS (74)}, /* other */
739 COSTS_N_INSNS (1), /* cost of movsx */
740 COSTS_N_INSNS (1), /* cost of movzx */
741 8, /* "large" insn */
742 9, /* MOVE_RATIO */
743 4, /* cost for loading QImode using movzbl */
744 {3, 4, 3}, /* cost of loading integer registers
745 in QImode, HImode and SImode.
746 Relative to reg-reg move (2). */
747 {3, 4, 3}, /* cost of storing integer registers */
748 4, /* cost of reg,reg fld/fst */
749 {4, 4, 12}, /* cost of loading fp registers
750 in SFmode, DFmode and XFmode */
751 {6, 6, 8}, /* cost of storing fp registers
752 in SFmode, DFmode and XFmode */
753 2, /* cost of moving MMX register */
754 {4, 4}, /* cost of loading MMX registers
755 in SImode and DImode */
756 {4, 4}, /* cost of storing MMX registers
757 in SImode and DImode */
758 2, /* cost of moving SSE register */
759 {4, 4, 6}, /* cost of loading SSE registers
760 in SImode, DImode and TImode */
761 {4, 4, 5}, /* cost of storing SSE registers
762 in SImode, DImode and TImode */
763 5, /* MMX or SSE register to integer */
764 64, /* size of l1 cache. */
765 256, /* size of l2 cache. */
766 64, /* size of prefetch block */
767 6, /* number of parallel prefetches */
768 5, /* Branch cost */
769 COSTS_N_INSNS (4), /* cost of FADD and FSUB insns. */
770 COSTS_N_INSNS (4), /* cost of FMUL instruction. */
771 COSTS_N_INSNS (24), /* cost of FDIV instruction. */
772 COSTS_N_INSNS (2), /* cost of FABS instruction. */
773 COSTS_N_INSNS (2), /* cost of FCHS instruction. */
774 COSTS_N_INSNS (35), /* cost of FSQRT instruction. */
775 athlon_memcpy,
776 athlon_memset,
777 1, /* scalar_stmt_cost. */
778 1, /* scalar load_cost. */
779 1, /* scalar_store_cost. */
780 1, /* vec_stmt_cost. */
781 1, /* vec_to_scalar_cost. */
782 1, /* scalar_to_vec_cost. */
783 1, /* vec_align_load_cost. */
784 2, /* vec_unalign_load_cost. */
785 1, /* vec_store_cost. */
786 3, /* cond_taken_branch_cost. */
787 1, /* cond_not_taken_branch_cost. */
788 };
789
790 /* K8 has an optimized REP instruction for medium-sized blocks, but for very
791 small blocks it is better to use a loop.  For large blocks, a libcall can
792 do nontemporal accesses and beat inline code considerably. */
793 static stringop_algs k8_memcpy[2] = {
794 {libcall, {{6, loop, false}, {14, unrolled_loop, false},
795 {-1, rep_prefix_4_byte, false}}},
796 {libcall, {{16, loop, false}, {8192, rep_prefix_8_byte, false},
797 {-1, libcall, false}}}};
798 static stringop_algs k8_memset[2] = {
799 {libcall, {{8, loop, false}, {24, unrolled_loop, false},
800 {2048, rep_prefix_4_byte, false}, {-1, libcall, false}}},
801 {libcall, {{48, unrolled_loop, false},
802 {8192, rep_prefix_8_byte, false}, {-1, libcall, false}}}};
803 static const
804 struct processor_costs k8_cost = {
805 COSTS_N_INSNS (1), /* cost of an add instruction */
806 COSTS_N_INSNS (2), /* cost of a lea instruction */
807 COSTS_N_INSNS (1), /* variable shift costs */
808 COSTS_N_INSNS (1), /* constant shift costs */
809 {COSTS_N_INSNS (3), /* cost of starting multiply for QI */
810 COSTS_N_INSNS (4), /* HI */
811 COSTS_N_INSNS (3), /* SI */
812 COSTS_N_INSNS (4), /* DI */
813 COSTS_N_INSNS (5)}, /* other */
814 0, /* cost of multiply per each bit set */
815 {COSTS_N_INSNS (18), /* cost of a divide/mod for QI */
816 COSTS_N_INSNS (26), /* HI */
817 COSTS_N_INSNS (42), /* SI */
818 COSTS_N_INSNS (74), /* DI */
819 COSTS_N_INSNS (74)}, /* other */
820 COSTS_N_INSNS (1), /* cost of movsx */
821 COSTS_N_INSNS (1), /* cost of movzx */
822 8, /* "large" insn */
823 9, /* MOVE_RATIO */
824 4, /* cost for loading QImode using movzbl */
825 {3, 4, 3}, /* cost of loading integer registers
826 in QImode, HImode and SImode.
827 Relative to reg-reg move (2). */
828 {3, 4, 3}, /* cost of storing integer registers */
829 4, /* cost of reg,reg fld/fst */
830 {4, 4, 12}, /* cost of loading fp registers
831 in SFmode, DFmode and XFmode */
832 {6, 6, 8}, /* cost of storing fp registers
833 in SFmode, DFmode and XFmode */
834 2, /* cost of moving MMX register */
835 {3, 3}, /* cost of loading MMX registers
836 in SImode and DImode */
837 {4, 4}, /* cost of storing MMX registers
838 in SImode and DImode */
839 2, /* cost of moving SSE register */
840 {4, 3, 6}, /* cost of loading SSE registers
841 in SImode, DImode and TImode */
842 {4, 4, 5}, /* cost of storing SSE registers
843 in SImode, DImode and TImode */
844 5, /* MMX or SSE register to integer */
845 64, /* size of l1 cache. */
846 512, /* size of l2 cache. */
847 64, /* size of prefetch block */
848 /* New AMD processors never drop prefetches; if they cannot be performed
849 immediately, they are queued. We set number of simultaneous prefetches
850 to a large constant to reflect this (it probably is not a good idea not
851 to limit number of prefetches at all, as their execution also takes some
852 time). */
853 100, /* number of parallel prefetches */
854 3, /* Branch cost */
855 COSTS_N_INSNS (4), /* cost of FADD and FSUB insns. */
856 COSTS_N_INSNS (4), /* cost of FMUL instruction. */
857 COSTS_N_INSNS (19), /* cost of FDIV instruction. */
858 COSTS_N_INSNS (2), /* cost of FABS instruction. */
859 COSTS_N_INSNS (2), /* cost of FCHS instruction. */
860 COSTS_N_INSNS (35), /* cost of FSQRT instruction. */
861
862 k8_memcpy,
863 k8_memset,
864 4, /* scalar_stmt_cost. */
865 2, /* scalar load_cost. */
866 2, /* scalar_store_cost. */
867 5, /* vec_stmt_cost. */
868 0, /* vec_to_scalar_cost. */
869 2, /* scalar_to_vec_cost. */
870 2, /* vec_align_load_cost. */
871 3, /* vec_unalign_load_cost. */
872 3, /* vec_store_cost. */
873 3, /* cond_taken_branch_cost. */
874 2, /* cond_not_taken_branch_cost. */
875 };
876
877 /* AMDFAM10 has an optimized REP instruction for medium-sized blocks, but for
878 very small blocks it is better to use a loop.  For large blocks, a libcall can
879 do nontemporal accesses and beat inline code considerably. */
880 static stringop_algs amdfam10_memcpy[2] = {
881 {libcall, {{6, loop, false}, {14, unrolled_loop, false},
882 {-1, rep_prefix_4_byte, false}}},
883 {libcall, {{16, loop, false}, {8192, rep_prefix_8_byte, false},
884 {-1, libcall, false}}}};
885 static stringop_algs amdfam10_memset[2] = {
886 {libcall, {{8, loop, false}, {24, unrolled_loop, false},
887 {2048, rep_prefix_4_byte, false}, {-1, libcall, false}}},
888 {libcall, {{48, unrolled_loop, false}, {8192, rep_prefix_8_byte, false},
889 {-1, libcall, false}}}};
890 struct processor_costs amdfam10_cost = {
891 COSTS_N_INSNS (1), /* cost of an add instruction */
892 COSTS_N_INSNS (2), /* cost of a lea instruction */
893 COSTS_N_INSNS (1), /* variable shift costs */
894 COSTS_N_INSNS (1), /* constant shift costs */
895 {COSTS_N_INSNS (3), /* cost of starting multiply for QI */
896 COSTS_N_INSNS (4), /* HI */
897 COSTS_N_INSNS (3), /* SI */
898 COSTS_N_INSNS (4), /* DI */
899 COSTS_N_INSNS (5)}, /* other */
900 0, /* cost of multiply per each bit set */
901 {COSTS_N_INSNS (19), /* cost of a divide/mod for QI */
902 COSTS_N_INSNS (35), /* HI */
903 COSTS_N_INSNS (51), /* SI */
904 COSTS_N_INSNS (83), /* DI */
905 COSTS_N_INSNS (83)}, /* other */
906 COSTS_N_INSNS (1), /* cost of movsx */
907 COSTS_N_INSNS (1), /* cost of movzx */
908 8, /* "large" insn */
909 9, /* MOVE_RATIO */
910 4, /* cost for loading QImode using movzbl */
911 {3, 4, 3}, /* cost of loading integer registers
912 in QImode, HImode and SImode.
913 Relative to reg-reg move (2). */
914 {3, 4, 3}, /* cost of storing integer registers */
915 4, /* cost of reg,reg fld/fst */
916 {4, 4, 12}, /* cost of loading fp registers
917 in SFmode, DFmode and XFmode */
918 {6, 6, 8}, /* cost of storing fp registers
919 in SFmode, DFmode and XFmode */
920 2, /* cost of moving MMX register */
921 {3, 3}, /* cost of loading MMX registers
922 in SImode and DImode */
923 {4, 4}, /* cost of storing MMX registers
924 in SImode and DImode */
925 2, /* cost of moving SSE register */
926 {4, 4, 3}, /* cost of loading SSE registers
927 in SImode, DImode and TImode */
928 {4, 4, 5}, /* cost of storing SSE registers
929 in SImode, DImode and TImode */
930 3, /* MMX or SSE register to integer */
931 /* On K8:
932 MOVD reg64, xmmreg Double FSTORE 4
933 MOVD reg32, xmmreg Double FSTORE 4
934 On AMDFAM10:
935 MOVD reg64, xmmreg Double FADD 3
936 1/1 1/1
937 MOVD reg32, xmmreg Double FADD 3
938 1/1 1/1 */
939 64, /* size of l1 cache. */
940 512, /* size of l2 cache. */
941 64, /* size of prefetch block */
942 /* New AMD processors never drop prefetches; if they cannot be performed
943 immediately, they are queued. We set number of simultaneous prefetches
944 to a large constant to reflect this (it probably is not a good idea not
945 to limit number of prefetches at all, as their execution also takes some
946 time). */
947 100, /* number of parallel prefetches */
948 2, /* Branch cost */
949 COSTS_N_INSNS (4), /* cost of FADD and FSUB insns. */
950 COSTS_N_INSNS (4), /* cost of FMUL instruction. */
951 COSTS_N_INSNS (19), /* cost of FDIV instruction. */
952 COSTS_N_INSNS (2), /* cost of FABS instruction. */
953 COSTS_N_INSNS (2), /* cost of FCHS instruction. */
954 COSTS_N_INSNS (35), /* cost of FSQRT instruction. */
955
956 amdfam10_memcpy,
957 amdfam10_memset,
958 4, /* scalar_stmt_cost. */
959 2, /* scalar load_cost. */
960 2, /* scalar_store_cost. */
961 6, /* vec_stmt_cost. */
962 0, /* vec_to_scalar_cost. */
963 2, /* scalar_to_vec_cost. */
964 2, /* vec_align_load_cost. */
965 2, /* vec_unalign_load_cost. */
966 2, /* vec_store_cost. */
967 2, /* cond_taken_branch_cost. */
968 1, /* cond_not_taken_branch_cost. */
969 };
970
971 /* BDVER1 has an optimized REP instruction for medium-sized blocks, but for
972 very small blocks it is better to use a loop.  For large blocks, a libcall
973 can do nontemporal accesses and beat inline code considerably. */
974 static stringop_algs bdver1_memcpy[2] = {
975 {libcall, {{6, loop, false}, {14, unrolled_loop, false},
976 {-1, rep_prefix_4_byte, false}}},
977 {libcall, {{16, loop, false}, {8192, rep_prefix_8_byte, false},
978 {-1, libcall, false}}}};
979 static stringop_algs bdver1_memset[2] = {
980 {libcall, {{8, loop, false}, {24, unrolled_loop, false},
981 {2048, rep_prefix_4_byte, false}, {-1, libcall, false}}},
982 {libcall, {{48, unrolled_loop, false}, {8192, rep_prefix_8_byte, false},
983 {-1, libcall, false}}}};
984
985 const struct processor_costs bdver1_cost = {
986 COSTS_N_INSNS (1), /* cost of an add instruction */
987 COSTS_N_INSNS (1), /* cost of a lea instruction */
988 COSTS_N_INSNS (1), /* variable shift costs */
989 COSTS_N_INSNS (1), /* constant shift costs */
990 {COSTS_N_INSNS (4), /* cost of starting multiply for QI */
991 COSTS_N_INSNS (4), /* HI */
992 COSTS_N_INSNS (4), /* SI */
993 COSTS_N_INSNS (6), /* DI */
994 COSTS_N_INSNS (6)}, /* other */
995 0, /* cost of multiply per each bit set */
996 {COSTS_N_INSNS (19), /* cost of a divide/mod for QI */
997 COSTS_N_INSNS (35), /* HI */
998 COSTS_N_INSNS (51), /* SI */
999 COSTS_N_INSNS (83), /* DI */
1000 COSTS_N_INSNS (83)}, /* other */
1001 COSTS_N_INSNS (1), /* cost of movsx */
1002 COSTS_N_INSNS (1), /* cost of movzx */
1003 8, /* "large" insn */
1004 9, /* MOVE_RATIO */
1005 4, /* cost for loading QImode using movzbl */
1006 {5, 5, 4}, /* cost of loading integer registers
1007 in QImode, HImode and SImode.
1008 Relative to reg-reg move (2). */
1009 {4, 4, 4}, /* cost of storing integer registers */
1010 2, /* cost of reg,reg fld/fst */
1011 {5, 5, 12}, /* cost of loading fp registers
1012 in SFmode, DFmode and XFmode */
1013 {4, 4, 8}, /* cost of storing fp registers
1014 in SFmode, DFmode and XFmode */
1015 2, /* cost of moving MMX register */
1016 {4, 4}, /* cost of loading MMX registers
1017 in SImode and DImode */
1018 {4, 4}, /* cost of storing MMX registers
1019 in SImode and DImode */
1020 2, /* cost of moving SSE register */
1021 {4, 4, 4}, /* cost of loading SSE registers
1022 in SImode, DImode and TImode */
1023 {4, 4, 4}, /* cost of storing SSE registers
1024 in SImode, DImode and TImode */
1025 2, /* MMX or SSE register to integer */
1026 /* On K8:
1027 MOVD reg64, xmmreg Double FSTORE 4
1028 MOVD reg32, xmmreg Double FSTORE 4
1029 On AMDFAM10:
1030 MOVD reg64, xmmreg Double FADD 3
1031 1/1 1/1
1032 MOVD reg32, xmmreg Double FADD 3
1033 1/1 1/1 */
1034 16, /* size of l1 cache. */
1035 2048, /* size of l2 cache. */
1036 64, /* size of prefetch block */
1037 /* New AMD processors never drop prefetches; if they cannot be performed
1038 immediately, they are queued. We set number of simultaneous prefetches
1039 to a large constant to reflect this (it probably is not a good idea not
1040 to limit number of prefetches at all, as their execution also takes some
1041 time). */
1042 100, /* number of parallel prefetches */
1043 2, /* Branch cost */
1044 COSTS_N_INSNS (6), /* cost of FADD and FSUB insns. */
1045 COSTS_N_INSNS (6), /* cost of FMUL instruction. */
1046 COSTS_N_INSNS (42), /* cost of FDIV instruction. */
1047 COSTS_N_INSNS (2), /* cost of FABS instruction. */
1048 COSTS_N_INSNS (2), /* cost of FCHS instruction. */
1049 COSTS_N_INSNS (52), /* cost of FSQRT instruction. */
1050
1051 bdver1_memcpy,
1052 bdver1_memset,
1053 6, /* scalar_stmt_cost. */
1054 4, /* scalar load_cost. */
1055 4, /* scalar_store_cost. */
1056 6, /* vec_stmt_cost. */
1057 0, /* vec_to_scalar_cost. */
1058 2, /* scalar_to_vec_cost. */
1059 4, /* vec_align_load_cost. */
1060 4, /* vec_unalign_load_cost. */
1061 4, /* vec_store_cost. */
1062 4, /* cond_taken_branch_cost. */
1063 2, /* cond_not_taken_branch_cost. */
1064 };
1065
1066 /* BDVER2 has an optimized REP instruction for medium-sized blocks, but for
1067 very small blocks it is better to use a loop.  For large blocks, a libcall
1068 can do nontemporal accesses and beat inline code considerably. */
1069
1070 static stringop_algs bdver2_memcpy[2] = {
1071 {libcall, {{6, loop, false}, {14, unrolled_loop, false},
1072 {-1, rep_prefix_4_byte, false}}},
1073 {libcall, {{16, loop, false}, {8192, rep_prefix_8_byte, false},
1074 {-1, libcall, false}}}};
1075 static stringop_algs bdver2_memset[2] = {
1076 {libcall, {{8, loop, false}, {24, unrolled_loop, false},
1077 {2048, rep_prefix_4_byte, false}, {-1, libcall, false}}},
1078 {libcall, {{48, unrolled_loop, false}, {8192, rep_prefix_8_byte, false},
1079 {-1, libcall, false}}}};
1080
1081 const struct processor_costs bdver2_cost = {
1082 COSTS_N_INSNS (1), /* cost of an add instruction */
1083 COSTS_N_INSNS (1), /* cost of a lea instruction */
1084 COSTS_N_INSNS (1), /* variable shift costs */
1085 COSTS_N_INSNS (1), /* constant shift costs */
1086 {COSTS_N_INSNS (4), /* cost of starting multiply for QI */
1087 COSTS_N_INSNS (4), /* HI */
1088 COSTS_N_INSNS (4), /* SI */
1089 COSTS_N_INSNS (6), /* DI */
1090 COSTS_N_INSNS (6)}, /* other */
1091 0, /* cost of multiply per each bit set */
1092 {COSTS_N_INSNS (19), /* cost of a divide/mod for QI */
1093 COSTS_N_INSNS (35), /* HI */
1094 COSTS_N_INSNS (51), /* SI */
1095 COSTS_N_INSNS (83), /* DI */
1096 COSTS_N_INSNS (83)}, /* other */
1097 COSTS_N_INSNS (1), /* cost of movsx */
1098 COSTS_N_INSNS (1), /* cost of movzx */
1099 8, /* "large" insn */
1100 9, /* MOVE_RATIO */
1101 4, /* cost for loading QImode using movzbl */
1102 {5, 5, 4}, /* cost of loading integer registers
1103 in QImode, HImode and SImode.
1104 Relative to reg-reg move (2). */
1105 {4, 4, 4}, /* cost of storing integer registers */
1106 2, /* cost of reg,reg fld/fst */
1107 {5, 5, 12}, /* cost of loading fp registers
1108 in SFmode, DFmode and XFmode */
1109 {4, 4, 8}, /* cost of storing fp registers
1110 in SFmode, DFmode and XFmode */
1111 2, /* cost of moving MMX register */
1112 {4, 4}, /* cost of loading MMX registers
1113 in SImode and DImode */
1114 {4, 4}, /* cost of storing MMX registers
1115 in SImode and DImode */
1116 2, /* cost of moving SSE register */
1117 {4, 4, 4}, /* cost of loading SSE registers
1118 in SImode, DImode and TImode */
1119 {4, 4, 4}, /* cost of storing SSE registers
1120 in SImode, DImode and TImode */
1121 2, /* MMX or SSE register to integer */
1122 /* On K8:
1123 MOVD reg64, xmmreg Double FSTORE 4
1124 MOVD reg32, xmmreg Double FSTORE 4
1125 On AMDFAM10:
1126 MOVD reg64, xmmreg Double FADD 3
1127 1/1 1/1
1128 MOVD reg32, xmmreg Double FADD 3
1129 1/1 1/1 */
1130 16, /* size of l1 cache. */
1131 2048, /* size of l2 cache. */
1132 64, /* size of prefetch block */
1133 /* New AMD processors never drop prefetches; if they cannot be performed
1134 immediately, they are queued. We set number of simultaneous prefetches
1135 to a large constant to reflect this (it probably is not a good idea not
1136 to limit number of prefetches at all, as their execution also takes some
1137 time). */
1138 100, /* number of parallel prefetches */
1139 2, /* Branch cost */
1140 COSTS_N_INSNS (6), /* cost of FADD and FSUB insns. */
1141 COSTS_N_INSNS (6), /* cost of FMUL instruction. */
1142 COSTS_N_INSNS (42), /* cost of FDIV instruction. */
1143 COSTS_N_INSNS (2), /* cost of FABS instruction. */
1144 COSTS_N_INSNS (2), /* cost of FCHS instruction. */
1145 COSTS_N_INSNS (52), /* cost of FSQRT instruction. */
1146
1147 bdver2_memcpy,
1148 bdver2_memset,
1149 6, /* scalar_stmt_cost. */
1150 4, /* scalar load_cost. */
1151 4, /* scalar_store_cost. */
1152 6, /* vec_stmt_cost. */
1153 0, /* vec_to_scalar_cost. */
1154 2, /* scalar_to_vec_cost. */
1155 4, /* vec_align_load_cost. */
1156 4, /* vec_unalign_load_cost. */
1157 4, /* vec_store_cost. */
1158 4, /* cond_taken_branch_cost. */
1159 2, /* cond_not_taken_branch_cost. */
1160 };
1161
1162
1163 /* BDVER3 has an optimized REP instruction for medium-sized blocks, but for
1164 very small blocks it is better to use a loop.  For large blocks, a libcall
1165 can do nontemporal accesses and beat inline code considerably. */
1166 static stringop_algs bdver3_memcpy[2] = {
1167 {libcall, {{6, loop, false}, {14, unrolled_loop, false},
1168 {-1, rep_prefix_4_byte, false}}},
1169 {libcall, {{16, loop, false}, {8192, rep_prefix_8_byte, false},
1170 {-1, libcall, false}}}};
1171 static stringop_algs bdver3_memset[2] = {
1172 {libcall, {{8, loop, false}, {24, unrolled_loop, false},
1173 {2048, rep_prefix_4_byte, false}, {-1, libcall, false}}},
1174 {libcall, {{48, unrolled_loop, false}, {8192, rep_prefix_8_byte, false},
1175 {-1, libcall, false}}}};
1176 struct processor_costs bdver3_cost = {
1177 COSTS_N_INSNS (1), /* cost of an add instruction */
1178 COSTS_N_INSNS (1), /* cost of a lea instruction */
1179 COSTS_N_INSNS (1), /* variable shift costs */
1180 COSTS_N_INSNS (1), /* constant shift costs */
1181 {COSTS_N_INSNS (4), /* cost of starting multiply for QI */
1182 COSTS_N_INSNS (4), /* HI */
1183 COSTS_N_INSNS (4), /* SI */
1184 COSTS_N_INSNS (6), /* DI */
1185 COSTS_N_INSNS (6)}, /* other */
1186 0, /* cost of multiply per each bit set */
1187 {COSTS_N_INSNS (19), /* cost of a divide/mod for QI */
1188 COSTS_N_INSNS (35), /* HI */
1189 COSTS_N_INSNS (51), /* SI */
1190 COSTS_N_INSNS (83), /* DI */
1191 COSTS_N_INSNS (83)}, /* other */
1192 COSTS_N_INSNS (1), /* cost of movsx */
1193 COSTS_N_INSNS (1), /* cost of movzx */
1194 8, /* "large" insn */
1195 9, /* MOVE_RATIO */
1196 4, /* cost for loading QImode using movzbl */
1197 {5, 5, 4}, /* cost of loading integer registers
1198 in QImode, HImode and SImode.
1199 Relative to reg-reg move (2). */
1200 {4, 4, 4}, /* cost of storing integer registers */
1201 2, /* cost of reg,reg fld/fst */
1202 {5, 5, 12}, /* cost of loading fp registers
1203 in SFmode, DFmode and XFmode */
1204 {4, 4, 8}, /* cost of storing fp registers
1205 in SFmode, DFmode and XFmode */
1206 2, /* cost of moving MMX register */
1207 {4, 4}, /* cost of loading MMX registers
1208 in SImode and DImode */
1209 {4, 4}, /* cost of storing MMX registers
1210 in SImode and DImode */
1211 2, /* cost of moving SSE register */
1212 {4, 4, 4}, /* cost of loading SSE registers
1213 in SImode, DImode and TImode */
1214 {4, 4, 4}, /* cost of storing SSE registers
1215 in SImode, DImode and TImode */
1216 2, /* MMX or SSE register to integer */
1217 16, /* size of l1 cache. */
1218 2048, /* size of l2 cache. */
1219 64, /* size of prefetch block */
1220 /* New AMD processors never drop prefetches; if they cannot be performed
1221 immediately, they are queued. We set number of simultaneous prefetches
1222 to a large constant to reflect this (it probably is not a good idea not
1223 to limit number of prefetches at all, as their execution also takes some
1224 time). */
1225 100, /* number of parallel prefetches */
1226 2, /* Branch cost */
1227 COSTS_N_INSNS (6), /* cost of FADD and FSUB insns. */
1228 COSTS_N_INSNS (6), /* cost of FMUL instruction. */
1229 COSTS_N_INSNS (42), /* cost of FDIV instruction. */
1230 COSTS_N_INSNS (2), /* cost of FABS instruction. */
1231 COSTS_N_INSNS (2), /* cost of FCHS instruction. */
1232 COSTS_N_INSNS (52), /* cost of FSQRT instruction. */
1233
1234 bdver3_memcpy,
1235 bdver3_memset,
1236 6, /* scalar_stmt_cost. */
1237 4, /* scalar load_cost. */
1238 4, /* scalar_store_cost. */
1239 6, /* vec_stmt_cost. */
1240 0, /* vec_to_scalar_cost. */
1241 2, /* scalar_to_vec_cost. */
1242 4, /* vec_align_load_cost. */
1243 4, /* vec_unalign_load_cost. */
1244 4, /* vec_store_cost. */
1245 4, /* cond_taken_branch_cost. */
1246 2, /* cond_not_taken_branch_cost. */
1247 };
1248
1249 /* BDVER4 has an optimized REP instruction for medium-sized blocks, but for
1250 very small blocks it is better to use a loop.  For large blocks, a libcall
1251 can do nontemporal accesses and beat inline code considerably. */
1252 static stringop_algs bdver4_memcpy[2] = {
1253 {libcall, {{6, loop, false}, {14, unrolled_loop, false},
1254 {-1, rep_prefix_4_byte, false}}},
1255 {libcall, {{16, loop, false}, {8192, rep_prefix_8_byte, false},
1256 {-1, libcall, false}}}};
1257 static stringop_algs bdver4_memset[2] = {
1258 {libcall, {{8, loop, false}, {24, unrolled_loop, false},
1259 {2048, rep_prefix_4_byte, false}, {-1, libcall, false}}},
1260 {libcall, {{48, unrolled_loop, false}, {8192, rep_prefix_8_byte, false},
1261 {-1, libcall, false}}}};
1262 struct processor_costs bdver4_cost = {
1263 COSTS_N_INSNS (1), /* cost of an add instruction */
1264 COSTS_N_INSNS (1), /* cost of a lea instruction */
1265 COSTS_N_INSNS (1), /* variable shift costs */
1266 COSTS_N_INSNS (1), /* constant shift costs */
1267 {COSTS_N_INSNS (4), /* cost of starting multiply for QI */
1268 COSTS_N_INSNS (4), /* HI */
1269 COSTS_N_INSNS (4), /* SI */
1270 COSTS_N_INSNS (6), /* DI */
1271 COSTS_N_INSNS (6)}, /* other */
1272 0, /* cost of multiply per each bit set */
1273 {COSTS_N_INSNS (19), /* cost of a divide/mod for QI */
1274 COSTS_N_INSNS (35), /* HI */
1275 COSTS_N_INSNS (51), /* SI */
1276 COSTS_N_INSNS (83), /* DI */
1277 COSTS_N_INSNS (83)}, /* other */
1278 COSTS_N_INSNS (1), /* cost of movsx */
1279 COSTS_N_INSNS (1), /* cost of movzx */
1280 8, /* "large" insn */
1281 9, /* MOVE_RATIO */
1282 4, /* cost for loading QImode using movzbl */
1283 {5, 5, 4}, /* cost of loading integer registers
1284 in QImode, HImode and SImode.
1285 Relative to reg-reg move (2). */
1286 {4, 4, 4}, /* cost of storing integer registers */
1287 2, /* cost of reg,reg fld/fst */
1288 {5, 5, 12}, /* cost of loading fp registers
1289 in SFmode, DFmode and XFmode */
1290 {4, 4, 8}, /* cost of storing fp registers
1291 in SFmode, DFmode and XFmode */
1292 2, /* cost of moving MMX register */
1293 {4, 4}, /* cost of loading MMX registers
1294 in SImode and DImode */
1295 {4, 4}, /* cost of storing MMX registers
1296 in SImode and DImode */
1297 2, /* cost of moving SSE register */
1298 {4, 4, 4}, /* cost of loading SSE registers
1299 in SImode, DImode and TImode */
1300 {4, 4, 4}, /* cost of storing SSE registers
1301 in SImode, DImode and TImode */
1302 2, /* MMX or SSE register to integer */
1303 16, /* size of l1 cache. */
1304 2048, /* size of l2 cache. */
1305 64, /* size of prefetch block */
1306 /* New AMD processors never drop prefetches; if they cannot be performed
1307 immediately, they are queued. We set number of simultaneous prefetches
1308 to a large constant to reflect this (it probably is not a good idea not
1309 to limit number of prefetches at all, as their execution also takes some
1310 time). */
1311 100, /* number of parallel prefetches */
1312 2, /* Branch cost */
1313 COSTS_N_INSNS (6), /* cost of FADD and FSUB insns. */
1314 COSTS_N_INSNS (6), /* cost of FMUL instruction. */
1315 COSTS_N_INSNS (42), /* cost of FDIV instruction. */
1316 COSTS_N_INSNS (2), /* cost of FABS instruction. */
1317 COSTS_N_INSNS (2), /* cost of FCHS instruction. */
1318 COSTS_N_INSNS (52), /* cost of FSQRT instruction. */
1319
1320 bdver4_memcpy,
1321 bdver4_memset,
1322 6, /* scalar_stmt_cost. */
1323 4, /* scalar load_cost. */
1324 4, /* scalar_store_cost. */
1325 6, /* vec_stmt_cost. */
1326 0, /* vec_to_scalar_cost. */
1327 2, /* scalar_to_vec_cost. */
1328 4, /* vec_align_load_cost. */
1329 4, /* vec_unalign_load_cost. */
1330 4, /* vec_store_cost. */
1331 4, /* cond_taken_branch_cost. */
1332 2, /* cond_not_taken_branch_cost. */
1333 };
1334
1335
1336 /* ZNVER1 has an optimized REP instruction for medium-sized blocks, but for
1337 very small blocks it is better to use a loop.  For large blocks, a libcall
1338 can do nontemporal accesses and beat inline code considerably. */
1339 static stringop_algs znver1_memcpy[2] = {
1340 {libcall, {{6, loop, false}, {14, unrolled_loop, false},
1341 {-1, rep_prefix_4_byte, false}}},
1342 {libcall, {{16, loop, false}, {8192, rep_prefix_8_byte, false},
1343 {-1, libcall, false}}}};
1344 static stringop_algs znver1_memset[2] = {
1345 {libcall, {{8, loop, false}, {24, unrolled_loop, false},
1346 {2048, rep_prefix_4_byte, false}, {-1, libcall, false}}},
1347 {libcall, {{48, unrolled_loop, false}, {8192, rep_prefix_8_byte, false},
1348 {-1, libcall, false}}}};
1349 struct processor_costs znver1_cost = {
1350 COSTS_N_INSNS (1), /* cost of an add instruction. */
1351 COSTS_N_INSNS (1), /* cost of a lea instruction. */
1352 COSTS_N_INSNS (1), /* variable shift costs. */
1353 COSTS_N_INSNS (1), /* constant shift costs. */
1354 {COSTS_N_INSNS (3), /* cost of starting multiply for QI. */
1355 COSTS_N_INSNS (3), /* HI. */
1356 COSTS_N_INSNS (3), /* SI. */
1357 COSTS_N_INSNS (4), /* DI. */
1358 COSTS_N_INSNS (4)}, /* other. */
1359 0, /* cost of multiply per each bit
1360 set. */
1361 {COSTS_N_INSNS (19), /* cost of a divide/mod for QI. */
1362 COSTS_N_INSNS (35), /* HI. */
1363 COSTS_N_INSNS (51), /* SI. */
1364 COSTS_N_INSNS (83), /* DI. */
1365 COSTS_N_INSNS (83)}, /* other. */
1366 COSTS_N_INSNS (1), /* cost of movsx. */
1367 COSTS_N_INSNS (1), /* cost of movzx. */
1368 8, /* "large" insn. */
1369 9, /* MOVE_RATIO. */
1370 4, /* cost for loading QImode using
1371 movzbl. */
1372 {5, 5, 4}, /* cost of loading integer registers
1373 in QImode, HImode and SImode.
1374 Relative to reg-reg move (2). */
1375 {4, 4, 4}, /* cost of storing integer
1376 registers. */
1377 2, /* cost of reg,reg fld/fst. */
1378 {5, 5, 12}, /* cost of loading fp registers
1379 in SFmode, DFmode and XFmode. */
1380 {4, 4, 8}, /* cost of storing fp registers
1381 in SFmode, DFmode and XFmode. */
1382 2, /* cost of moving MMX register. */
1383 {4, 4}, /* cost of loading MMX registers
1384 in SImode and DImode. */
1385 {4, 4}, /* cost of storing MMX registers
1386 in SImode and DImode. */
1387 2, /* cost of moving SSE register. */
1388 {4, 4, 4}, /* cost of loading SSE registers
1389 in SImode, DImode and TImode. */
1390 {4, 4, 4}, /* cost of storing SSE registers
1391 in SImode, DImode and TImode. */
1392 2, /* MMX or SSE register to integer. */
1393 32, /* size of l1 cache. */
1394 512, /* size of l2 cache. */
1395 64, /* size of prefetch block. */
1396 /* New AMD processors never drop prefetches; if they cannot be performed
1397 immediately, they are queued. We set number of simultaneous prefetches
1398 to a large constant to reflect this (it probably is not a good idea not
1399 to limit number of prefetches at all, as their execution also takes some
1400 time). */
1401 100, /* number of parallel prefetches. */
1402 2, /* Branch cost. */
1403 COSTS_N_INSNS (6), /* cost of FADD and FSUB insns. */
1404 COSTS_N_INSNS (6), /* cost of FMUL instruction. */
1405 COSTS_N_INSNS (42), /* cost of FDIV instruction. */
1406 COSTS_N_INSNS (2), /* cost of FABS instruction. */
1407 COSTS_N_INSNS (2), /* cost of FCHS instruction. */
1408 COSTS_N_INSNS (52), /* cost of FSQRT instruction. */
1409
1410 znver1_memcpy,
1411 znver1_memset,
1412 6, /* scalar_stmt_cost. */
1413 4, /* scalar load_cost. */
1414 4, /* scalar_store_cost. */
1415 6, /* vec_stmt_cost. */
1416 0, /* vec_to_scalar_cost. */
1417 2, /* scalar_to_vec_cost. */
1418 4, /* vec_align_load_cost. */
1419 4, /* vec_unalign_load_cost. */
1420 4, /* vec_store_cost. */
1421 4, /* cond_taken_branch_cost. */
1422 2, /* cond_not_taken_branch_cost. */
1423 };
1424
1425 /* BTVER1 has an optimized REP instruction for medium-sized blocks, but for
1426 very small blocks it is better to use a loop.  For large blocks, a libcall
1427 can do nontemporal accesses and beat inline code considerably. */
1428 static stringop_algs btver1_memcpy[2] = {
1429 {libcall, {{6, loop, false}, {14, unrolled_loop, false},
1430 {-1, rep_prefix_4_byte, false}}},
1431 {libcall, {{16, loop, false}, {8192, rep_prefix_8_byte, false},
1432 {-1, libcall, false}}}};
1433 static stringop_algs btver1_memset[2] = {
1434 {libcall, {{8, loop, false}, {24, unrolled_loop, false},
1435 {2048, rep_prefix_4_byte, false}, {-1, libcall, false}}},
1436 {libcall, {{48, unrolled_loop, false}, {8192, rep_prefix_8_byte, false},
1437 {-1, libcall, false}}}};
1438 const struct processor_costs btver1_cost = {
1439 COSTS_N_INSNS (1), /* cost of an add instruction */
1440 COSTS_N_INSNS (2), /* cost of a lea instruction */
1441 COSTS_N_INSNS (1), /* variable shift costs */
1442 COSTS_N_INSNS (1), /* constant shift costs */
1443 {COSTS_N_INSNS (3), /* cost of starting multiply for QI */
1444 COSTS_N_INSNS (4), /* HI */
1445 COSTS_N_INSNS (3), /* SI */
1446 COSTS_N_INSNS (4), /* DI */
1447 COSTS_N_INSNS (5)}, /* other */
1448 0, /* cost of multiply per each bit set */
1449 {COSTS_N_INSNS (19), /* cost of a divide/mod for QI */
1450 COSTS_N_INSNS (35), /* HI */
1451 COSTS_N_INSNS (51), /* SI */
1452 COSTS_N_INSNS (83), /* DI */
1453 COSTS_N_INSNS (83)}, /* other */
1454 COSTS_N_INSNS (1), /* cost of movsx */
1455 COSTS_N_INSNS (1), /* cost of movzx */
1456 8, /* "large" insn */
1457 9, /* MOVE_RATIO */
1458 4, /* cost for loading QImode using movzbl */
1459 {3, 4, 3}, /* cost of loading integer registers
1460 in QImode, HImode and SImode.
1461 Relative to reg-reg move (2). */
1462 {3, 4, 3}, /* cost of storing integer registers */
1463 4, /* cost of reg,reg fld/fst */
1464 {4, 4, 12}, /* cost of loading fp registers
1465 in SFmode, DFmode and XFmode */
1466 {6, 6, 8}, /* cost of storing fp registers
1467 in SFmode, DFmode and XFmode */
1468 2, /* cost of moving MMX register */
1469 {3, 3}, /* cost of loading MMX registers
1470 in SImode and DImode */
1471 {4, 4}, /* cost of storing MMX registers
1472 in SImode and DImode */
1473 2, /* cost of moving SSE register */
1474 {4, 4, 3}, /* cost of loading SSE registers
1475 in SImode, DImode and TImode */
1476 {4, 4, 5}, /* cost of storing SSE registers
1477 in SImode, DImode and TImode */
1478 3, /* MMX or SSE register to integer */
1479 /* On K8:
1480 MOVD reg64, xmmreg Double FSTORE 4
1481 MOVD reg32, xmmreg Double FSTORE 4
1482 On AMDFAM10:
1483 MOVD reg64, xmmreg Double FADD 3
1484 1/1 1/1
1485 MOVD reg32, xmmreg Double FADD 3
1486 1/1 1/1 */
1487 32, /* size of l1 cache. */
1488 512, /* size of l2 cache. */
1489 64, /* size of prefetch block */
1490 100, /* number of parallel prefetches */
1491 2, /* Branch cost */
1492 COSTS_N_INSNS (4), /* cost of FADD and FSUB insns. */
1493 COSTS_N_INSNS (4), /* cost of FMUL instruction. */
1494 COSTS_N_INSNS (19), /* cost of FDIV instruction. */
1495 COSTS_N_INSNS (2), /* cost of FABS instruction. */
1496 COSTS_N_INSNS (2), /* cost of FCHS instruction. */
1497 COSTS_N_INSNS (35), /* cost of FSQRT instruction. */
1498
1499 btver1_memcpy,
1500 btver1_memset,
1501 4, /* scalar_stmt_cost. */
1502 2, /* scalar load_cost. */
1503 2, /* scalar_store_cost. */
1504 6, /* vec_stmt_cost. */
1505 0, /* vec_to_scalar_cost. */
1506 2, /* scalar_to_vec_cost. */
1507 2, /* vec_align_load_cost. */
1508 2, /* vec_unalign_load_cost. */
1509 2, /* vec_store_cost. */
1510 2, /* cond_taken_branch_cost. */
1511 1, /* cond_not_taken_branch_cost. */
1512 };
1513
1514 static stringop_algs btver2_memcpy[2] = {
1515 {libcall, {{6, loop, false}, {14, unrolled_loop, false},
1516 {-1, rep_prefix_4_byte, false}}},
1517 {libcall, {{16, loop, false}, {8192, rep_prefix_8_byte, false},
1518 {-1, libcall, false}}}};
1519 static stringop_algs btver2_memset[2] = {
1520 {libcall, {{8, loop, false}, {24, unrolled_loop, false},
1521 {2048, rep_prefix_4_byte, false}, {-1, libcall, false}}},
1522 {libcall, {{48, unrolled_loop, false}, {8192, rep_prefix_8_byte, false},
1523 {-1, libcall, false}}}};
1524 const struct processor_costs btver2_cost = {
1525 COSTS_N_INSNS (1), /* cost of an add instruction */
1526 COSTS_N_INSNS (2), /* cost of a lea instruction */
1527 COSTS_N_INSNS (1), /* variable shift costs */
1528 COSTS_N_INSNS (1), /* constant shift costs */
1529 {COSTS_N_INSNS (3), /* cost of starting multiply for QI */
1530 COSTS_N_INSNS (4), /* HI */
1531 COSTS_N_INSNS (3), /* SI */
1532 COSTS_N_INSNS (4), /* DI */
1533 COSTS_N_INSNS (5)}, /* other */
1534 0, /* cost of multiply per each bit set */
1535 {COSTS_N_INSNS (19), /* cost of a divide/mod for QI */
1536 COSTS_N_INSNS (35), /* HI */
1537 COSTS_N_INSNS (51), /* SI */
1538 COSTS_N_INSNS (83), /* DI */
1539 COSTS_N_INSNS (83)}, /* other */
1540 COSTS_N_INSNS (1), /* cost of movsx */
1541 COSTS_N_INSNS (1), /* cost of movzx */
1542 8, /* "large" insn */
1543 9, /* MOVE_RATIO */
1544 4, /* cost for loading QImode using movzbl */
1545 {3, 4, 3}, /* cost of loading integer registers
1546 in QImode, HImode and SImode.
1547 Relative to reg-reg move (2). */
1548 {3, 4, 3}, /* cost of storing integer registers */
1549 4, /* cost of reg,reg fld/fst */
1550 {4, 4, 12}, /* cost of loading fp registers
1551 in SFmode, DFmode and XFmode */
1552 {6, 6, 8}, /* cost of storing fp registers
1553 in SFmode, DFmode and XFmode */
1554 2, /* cost of moving MMX register */
1555 {3, 3}, /* cost of loading MMX registers
1556 in SImode and DImode */
1557 {4, 4}, /* cost of storing MMX registers
1558 in SImode and DImode */
1559 2, /* cost of moving SSE register */
1560 {4, 4, 3}, /* cost of loading SSE registers
1561 in SImode, DImode and TImode */
1562 {4, 4, 5}, /* cost of storing SSE registers
1563 in SImode, DImode and TImode */
1564 3, /* MMX or SSE register to integer */
1565 /* On K8:
1566 MOVD reg64, xmmreg Double FSTORE 4
1567 MOVD reg32, xmmreg Double FSTORE 4
1568 On AMDFAM10:
1569 MOVD reg64, xmmreg Double FADD 3
1570 1/1 1/1
1571 MOVD reg32, xmmreg Double FADD 3
1572 1/1 1/1 */
1573 32, /* size of l1 cache. */
1574 2048, /* size of l2 cache. */
1575 64, /* size of prefetch block */
1576 100, /* number of parallel prefetches */
1577 2, /* Branch cost */
1578 COSTS_N_INSNS (4), /* cost of FADD and FSUB insns. */
1579 COSTS_N_INSNS (4), /* cost of FMUL instruction. */
1580 COSTS_N_INSNS (19), /* cost of FDIV instruction. */
1581 COSTS_N_INSNS (2), /* cost of FABS instruction. */
1582 COSTS_N_INSNS (2), /* cost of FCHS instruction. */
1583 COSTS_N_INSNS (35), /* cost of FSQRT instruction. */
1584 btver2_memcpy,
1585 btver2_memset,
1586 4, /* scalar_stmt_cost. */
1587 2, /* scalar load_cost. */
1588 2, /* scalar_store_cost. */
1589 6, /* vec_stmt_cost. */
1590 0, /* vec_to_scalar_cost. */
1591 2, /* scalar_to_vec_cost. */
1592 2, /* vec_align_load_cost. */
1593 2, /* vec_unalign_load_cost. */
1594 2, /* vec_store_cost. */
1595 2, /* cond_taken_branch_cost. */
1596 1, /* cond_not_taken_branch_cost. */
1597 };
1598
1599 static stringop_algs pentium4_memcpy[2] = {
1600 {libcall, {{12, loop_1_byte, false}, {-1, rep_prefix_4_byte, false}}},
1601 DUMMY_STRINGOP_ALGS};
1602 static stringop_algs pentium4_memset[2] = {
1603 {libcall, {{6, loop_1_byte, false}, {48, loop, false},
1604 {20480, rep_prefix_4_byte, false}, {-1, libcall, false}}},
1605 DUMMY_STRINGOP_ALGS};
1606
1607 static const
1608 struct processor_costs pentium4_cost = {
1609 COSTS_N_INSNS (1), /* cost of an add instruction */
1610 COSTS_N_INSNS (3), /* cost of a lea instruction */
1611 COSTS_N_INSNS (4), /* variable shift costs */
1612 COSTS_N_INSNS (4), /* constant shift costs */
1613 {COSTS_N_INSNS (15), /* cost of starting multiply for QI */
1614 COSTS_N_INSNS (15), /* HI */
1615 COSTS_N_INSNS (15), /* SI */
1616 COSTS_N_INSNS (15), /* DI */
1617 COSTS_N_INSNS (15)}, /* other */
1618 0, /* cost of multiply per each bit set */
1619 {COSTS_N_INSNS (56), /* cost of a divide/mod for QI */
1620 COSTS_N_INSNS (56), /* HI */
1621 COSTS_N_INSNS (56), /* SI */
1622 COSTS_N_INSNS (56), /* DI */
1623 COSTS_N_INSNS (56)}, /* other */
1624 COSTS_N_INSNS (1), /* cost of movsx */
1625 COSTS_N_INSNS (1), /* cost of movzx */
1626 16, /* "large" insn */
1627 6, /* MOVE_RATIO */
1628 2, /* cost for loading QImode using movzbl */
1629 {4, 5, 4}, /* cost of loading integer registers
1630 in QImode, HImode and SImode.
1631 Relative to reg-reg move (2). */
1632 {2, 3, 2}, /* cost of storing integer registers */
1633 2, /* cost of reg,reg fld/fst */
1634 {2, 2, 6}, /* cost of loading fp registers
1635 in SFmode, DFmode and XFmode */
1636 {4, 4, 6}, /* cost of storing fp registers
1637 in SFmode, DFmode and XFmode */
1638 2, /* cost of moving MMX register */
1639 {2, 2}, /* cost of loading MMX registers
1640 in SImode and DImode */
1641 {2, 2}, /* cost of storing MMX registers
1642 in SImode and DImode */
1643 12, /* cost of moving SSE register */
1644 {12, 12, 12}, /* cost of loading SSE registers
1645 in SImode, DImode and TImode */
1646 {2, 2, 8}, /* cost of storing SSE registers
1647 in SImode, DImode and TImode */
1648 10, /* MMX or SSE register to integer */
1649 8, /* size of l1 cache. */
1650 256, /* size of l2 cache. */
1651 64, /* size of prefetch block */
1652 6, /* number of parallel prefetches */
1653 2, /* Branch cost */
1654 COSTS_N_INSNS (5), /* cost of FADD and FSUB insns. */
1655 COSTS_N_INSNS (7), /* cost of FMUL instruction. */
1656 COSTS_N_INSNS (43), /* cost of FDIV instruction. */
1657 COSTS_N_INSNS (2), /* cost of FABS instruction. */
1658 COSTS_N_INSNS (2), /* cost of FCHS instruction. */
1659 COSTS_N_INSNS (43), /* cost of FSQRT instruction. */
1660 pentium4_memcpy,
1661 pentium4_memset,
1662 1, /* scalar_stmt_cost. */
1663 1, /* scalar load_cost. */
1664 1, /* scalar_store_cost. */
1665 1, /* vec_stmt_cost. */
1666 1, /* vec_to_scalar_cost. */
1667 1, /* scalar_to_vec_cost. */
1668 1, /* vec_align_load_cost. */
1669 2, /* vec_unalign_load_cost. */
1670 1, /* vec_store_cost. */
1671 3, /* cond_taken_branch_cost. */
1672 1, /* cond_not_taken_branch_cost. */
1673 };
1674
1675 static stringop_algs nocona_memcpy[2] = {
1676 {libcall, {{12, loop_1_byte, false}, {-1, rep_prefix_4_byte, false}}},
1677 {libcall, {{32, loop, false}, {20000, rep_prefix_8_byte, false},
1678 {100000, unrolled_loop, false}, {-1, libcall, false}}}};
1679
1680 static stringop_algs nocona_memset[2] = {
1681 {libcall, {{6, loop_1_byte, false}, {48, loop, false},
1682 {20480, rep_prefix_4_byte, false}, {-1, libcall, false}}},
1683 {libcall, {{24, loop, false}, {64, unrolled_loop, false},
1684 {8192, rep_prefix_8_byte, false}, {-1, libcall, false}}}};
1685
1686 static const
1687 struct processor_costs nocona_cost = {
1688 COSTS_N_INSNS (1), /* cost of an add instruction */
1689 COSTS_N_INSNS (1), /* cost of a lea instruction */
1690 COSTS_N_INSNS (1), /* variable shift costs */
1691 COSTS_N_INSNS (1), /* constant shift costs */
1692 {COSTS_N_INSNS (10), /* cost of starting multiply for QI */
1693 COSTS_N_INSNS (10), /* HI */
1694 COSTS_N_INSNS (10), /* SI */
1695 COSTS_N_INSNS (10), /* DI */
1696 COSTS_N_INSNS (10)}, /* other */
1697 0, /* cost of multiply per each bit set */
1698 {COSTS_N_INSNS (66), /* cost of a divide/mod for QI */
1699 COSTS_N_INSNS (66), /* HI */
1700 COSTS_N_INSNS (66), /* SI */
1701 COSTS_N_INSNS (66), /* DI */
1702 COSTS_N_INSNS (66)}, /* other */
1703 COSTS_N_INSNS (1), /* cost of movsx */
1704 COSTS_N_INSNS (1), /* cost of movzx */
1705 16, /* "large" insn */
1706 17, /* MOVE_RATIO */
1707 4, /* cost for loading QImode using movzbl */
1708 {4, 4, 4}, /* cost of loading integer registers
1709 in QImode, HImode and SImode.
1710 Relative to reg-reg move (2). */
1711 {4, 4, 4}, /* cost of storing integer registers */
1712 3, /* cost of reg,reg fld/fst */
1713 {12, 12, 12}, /* cost of loading fp registers
1714 in SFmode, DFmode and XFmode */
1715 {4, 4, 4}, /* cost of storing fp registers
1716 in SFmode, DFmode and XFmode */
1717 6, /* cost of moving MMX register */
1718 {12, 12}, /* cost of loading MMX registers
1719 in SImode and DImode */
1720 {12, 12}, /* cost of storing MMX registers
1721 in SImode and DImode */
1722 6, /* cost of moving SSE register */
1723 {12, 12, 12}, /* cost of loading SSE registers
1724 in SImode, DImode and TImode */
1725 {12, 12, 12}, /* cost of storing SSE registers
1726 in SImode, DImode and TImode */
1727 8, /* MMX or SSE register to integer */
1728 8, /* size of l1 cache. */
1729 1024, /* size of l2 cache. */
1730 64, /* size of prefetch block */
1731 8, /* number of parallel prefetches */
1732 1, /* Branch cost */
1733 COSTS_N_INSNS (6), /* cost of FADD and FSUB insns. */
1734 COSTS_N_INSNS (8), /* cost of FMUL instruction. */
1735 COSTS_N_INSNS (40), /* cost of FDIV instruction. */
1736 COSTS_N_INSNS (3), /* cost of FABS instruction. */
1737 COSTS_N_INSNS (3), /* cost of FCHS instruction. */
1738 COSTS_N_INSNS (44), /* cost of FSQRT instruction. */
1739 nocona_memcpy,
1740 nocona_memset,
1741 1, /* scalar_stmt_cost. */
1742 1, /* scalar load_cost. */
1743 1, /* scalar_store_cost. */
1744 1, /* vec_stmt_cost. */
1745 1, /* vec_to_scalar_cost. */
1746 1, /* scalar_to_vec_cost. */
1747 1, /* vec_align_load_cost. */
1748 2, /* vec_unalign_load_cost. */
1749 1, /* vec_store_cost. */
1750 3, /* cond_taken_branch_cost. */
1751 1, /* cond_not_taken_branch_cost. */
1752 };
1753
1754 static stringop_algs atom_memcpy[2] = {
1755 {libcall, {{11, loop, false}, {-1, rep_prefix_4_byte, false}}},
1756 {libcall, {{32, loop, false}, {64, rep_prefix_4_byte, false},
1757 {8192, rep_prefix_8_byte, false}, {-1, libcall, false}}}};
1758 static stringop_algs atom_memset[2] = {
1759 {libcall, {{8, loop, false}, {15, unrolled_loop, false},
1760 {2048, rep_prefix_4_byte, false}, {-1, libcall, false}}},
1761 {libcall, {{24, loop, false}, {32, unrolled_loop, false},
1762 {8192, rep_prefix_8_byte, false}, {-1, libcall, false}}}};
1763 static const
1764 struct processor_costs atom_cost = {
1765 COSTS_N_INSNS (1), /* cost of an add instruction */
1766 COSTS_N_INSNS (1) + 1, /* cost of a lea instruction */
1767 COSTS_N_INSNS (1), /* variable shift costs */
1768 COSTS_N_INSNS (1), /* constant shift costs */
1769 {COSTS_N_INSNS (3), /* cost of starting multiply for QI */
1770 COSTS_N_INSNS (4), /* HI */
1771 COSTS_N_INSNS (3), /* SI */
1772 COSTS_N_INSNS (4), /* DI */
1773 COSTS_N_INSNS (2)}, /* other */
1774 0, /* cost of multiply per each bit set */
1775 {COSTS_N_INSNS (18), /* cost of a divide/mod for QI */
1776 COSTS_N_INSNS (26), /* HI */
1777 COSTS_N_INSNS (42), /* SI */
1778 COSTS_N_INSNS (74), /* DI */
1779 COSTS_N_INSNS (74)}, /* other */
1780 COSTS_N_INSNS (1), /* cost of movsx */
1781 COSTS_N_INSNS (1), /* cost of movzx */
1782 8, /* "large" insn */
1783 17, /* MOVE_RATIO */
1784 4, /* cost for loading QImode using movzbl */
1785 {4, 4, 4}, /* cost of loading integer registers
1786 in QImode, HImode and SImode.
1787 Relative to reg-reg move (2). */
1788 {4, 4, 4}, /* cost of storing integer registers */
1789 4, /* cost of reg,reg fld/fst */
1790 {12, 12, 12}, /* cost of loading fp registers
1791 in SFmode, DFmode and XFmode */
1792 {6, 6, 8}, /* cost of storing fp registers
1793 in SFmode, DFmode and XFmode */
1794 2, /* cost of moving MMX register */
1795 {8, 8}, /* cost of loading MMX registers
1796 in SImode and DImode */
1797 {8, 8}, /* cost of storing MMX registers
1798 in SImode and DImode */
1799 2, /* cost of moving SSE register */
1800 {8, 8, 8}, /* cost of loading SSE registers
1801 in SImode, DImode and TImode */
1802 {8, 8, 8}, /* cost of storing SSE registers
1803 in SImode, DImode and TImode */
1804 5, /* MMX or SSE register to integer */
1805 32, /* size of l1 cache. */
1806 256, /* size of l2 cache. */
1807 64, /* size of prefetch block */
1808 6, /* number of parallel prefetches */
1809 3, /* Branch cost */
1810 COSTS_N_INSNS (8), /* cost of FADD and FSUB insns. */
1811 COSTS_N_INSNS (8), /* cost of FMUL instruction. */
1812 COSTS_N_INSNS (20), /* cost of FDIV instruction. */
1813 COSTS_N_INSNS (8), /* cost of FABS instruction. */
1814 COSTS_N_INSNS (8), /* cost of FCHS instruction. */
1815 COSTS_N_INSNS (40), /* cost of FSQRT instruction. */
1816 atom_memcpy,
1817 atom_memset,
1818 1, /* scalar_stmt_cost. */
1819 1, /* scalar load_cost. */
1820 1, /* scalar_store_cost. */
1821 1, /* vec_stmt_cost. */
1822 1, /* vec_to_scalar_cost. */
1823 1, /* scalar_to_vec_cost. */
1824 1, /* vec_align_load_cost. */
1825 2, /* vec_unalign_load_cost. */
1826 1, /* vec_store_cost. */
1827 3, /* cond_taken_branch_cost. */
1828 1, /* cond_not_taken_branch_cost. */
1829 };
1830
1831 static stringop_algs slm_memcpy[2] = {
1832 {libcall, {{11, loop, false}, {-1, rep_prefix_4_byte, false}}},
1833 {libcall, {{32, loop, false}, {64, rep_prefix_4_byte, false},
1834 {8192, rep_prefix_8_byte, false}, {-1, libcall, false}}}};
1835 static stringop_algs slm_memset[2] = {
1836 {libcall, {{8, loop, false}, {15, unrolled_loop, false},
1837 {2048, rep_prefix_4_byte, false}, {-1, libcall, false}}},
1838 {libcall, {{24, loop, false}, {32, unrolled_loop, false},
1839 {8192, rep_prefix_8_byte, false}, {-1, libcall, false}}}};
1840 static const
1841 struct processor_costs slm_cost = {
1842 COSTS_N_INSNS (1), /* cost of an add instruction */
1843 COSTS_N_INSNS (1) + 1, /* cost of a lea instruction */
1844 COSTS_N_INSNS (1), /* variable shift costs */
1845 COSTS_N_INSNS (1), /* constant shift costs */
1846 {COSTS_N_INSNS (3), /* cost of starting multiply for QI */
1847 COSTS_N_INSNS (3), /* HI */
1848 COSTS_N_INSNS (3), /* SI */
1849 COSTS_N_INSNS (4), /* DI */
1850 COSTS_N_INSNS (2)}, /* other */
1851 0, /* cost of multiply per each bit set */
1852 {COSTS_N_INSNS (18), /* cost of a divide/mod for QI */
1853 COSTS_N_INSNS (26), /* HI */
1854 COSTS_N_INSNS (42), /* SI */
1855 COSTS_N_INSNS (74), /* DI */
1856 COSTS_N_INSNS (74)}, /* other */
1857 COSTS_N_INSNS (1), /* cost of movsx */
1858 COSTS_N_INSNS (1), /* cost of movzx */
1859 8, /* "large" insn */
1860 17, /* MOVE_RATIO */
1861 4, /* cost for loading QImode using movzbl */
1862 {4, 4, 4}, /* cost of loading integer registers
1863 in QImode, HImode and SImode.
1864 Relative to reg-reg move (2). */
1865 {4, 4, 4}, /* cost of storing integer registers */
1866 4, /* cost of reg,reg fld/fst */
1867 {12, 12, 12}, /* cost of loading fp registers
1868 in SFmode, DFmode and XFmode */
1869 {6, 6, 8}, /* cost of storing fp registers
1870 in SFmode, DFmode and XFmode */
1871 2, /* cost of moving MMX register */
1872 {8, 8}, /* cost of loading MMX registers
1873 in SImode and DImode */
1874 {8, 8}, /* cost of storing MMX registers
1875 in SImode and DImode */
1876 2, /* cost of moving SSE register */
1877 {8, 8, 8}, /* cost of loading SSE registers
1878 in SImode, DImode and TImode */
1879 {8, 8, 8}, /* cost of storing SSE registers
1880 in SImode, DImode and TImode */
1881 5, /* MMX or SSE register to integer */
1882 32, /* size of l1 cache. */
1883 256, /* size of l2 cache. */
1884 64, /* size of prefetch block */
1885 6, /* number of parallel prefetches */
1886 3, /* Branch cost */
1887 COSTS_N_INSNS (8), /* cost of FADD and FSUB insns. */
1888 COSTS_N_INSNS (8), /* cost of FMUL instruction. */
1889 COSTS_N_INSNS (20), /* cost of FDIV instruction. */
1890 COSTS_N_INSNS (8), /* cost of FABS instruction. */
1891 COSTS_N_INSNS (8), /* cost of FCHS instruction. */
1892 COSTS_N_INSNS (40), /* cost of FSQRT instruction. */
1893 slm_memcpy,
1894 slm_memset,
1895 1, /* scalar_stmt_cost. */
1896 1, /* scalar load_cost. */
1897 1, /* scalar_store_cost. */
1898 1, /* vec_stmt_cost. */
1899 4, /* vec_to_scalar_cost. */
1900 1, /* scalar_to_vec_cost. */
1901 1, /* vec_align_load_cost. */
1902 2, /* vec_unalign_load_cost. */
1903 1, /* vec_store_cost. */
1904 3, /* cond_taken_branch_cost. */
1905 1, /* cond_not_taken_branch_cost. */
1906 };
1907
1908 static stringop_algs intel_memcpy[2] = {
1909 {libcall, {{11, loop, false}, {-1, rep_prefix_4_byte, false}}},
1910 {libcall, {{32, loop, false}, {64, rep_prefix_4_byte, false},
1911 {8192, rep_prefix_8_byte, false}, {-1, libcall, false}}}};
1912 static stringop_algs intel_memset[2] = {
1913 {libcall, {{8, loop, false}, {15, unrolled_loop, false},
1914 {2048, rep_prefix_4_byte, false}, {-1, libcall, false}}},
1915 {libcall, {{24, loop, false}, {32, unrolled_loop, false},
1916 {8192, rep_prefix_8_byte, false}, {-1, libcall, false}}}};
1917 static const
1918 struct processor_costs intel_cost = {
1919 COSTS_N_INSNS (1), /* cost of an add instruction */
1920 COSTS_N_INSNS (1) + 1, /* cost of a lea instruction */
1921 COSTS_N_INSNS (1), /* variable shift costs */
1922 COSTS_N_INSNS (1), /* constant shift costs */
1923 {COSTS_N_INSNS (3), /* cost of starting multiply for QI */
1924 COSTS_N_INSNS (3), /* HI */
1925 COSTS_N_INSNS (3), /* SI */
1926 COSTS_N_INSNS (4), /* DI */
1927 COSTS_N_INSNS (2)}, /* other */
1928 0, /* cost of multiply per each bit set */
1929 {COSTS_N_INSNS (18), /* cost of a divide/mod for QI */
1930 COSTS_N_INSNS (26), /* HI */
1931 COSTS_N_INSNS (42), /* SI */
1932 COSTS_N_INSNS (74), /* DI */
1933 COSTS_N_INSNS (74)}, /* other */
1934 COSTS_N_INSNS (1), /* cost of movsx */
1935 COSTS_N_INSNS (1), /* cost of movzx */
1936 8, /* "large" insn */
1937 17, /* MOVE_RATIO */
1938 4, /* cost for loading QImode using movzbl */
1939 {4, 4, 4}, /* cost of loading integer registers
1940 in QImode, HImode and SImode.
1941 Relative to reg-reg move (2). */
1942 {4, 4, 4}, /* cost of storing integer registers */
1943 4, /* cost of reg,reg fld/fst */
1944 {12, 12, 12}, /* cost of loading fp registers
1945 in SFmode, DFmode and XFmode */
1946 {6, 6, 8}, /* cost of storing fp registers
1947 in SFmode, DFmode and XFmode */
1948 2, /* cost of moving MMX register */
1949 {8, 8}, /* cost of loading MMX registers
1950 in SImode and DImode */
1951 {8, 8}, /* cost of storing MMX registers
1952 in SImode and DImode */
1953 2, /* cost of moving SSE register */
1954 {8, 8, 8}, /* cost of loading SSE registers
1955 in SImode, DImode and TImode */
1956 {8, 8, 8}, /* cost of storing SSE registers
1957 in SImode, DImode and TImode */
1958 5, /* MMX or SSE register to integer */
1959 32, /* size of l1 cache. */
1960 256, /* size of l2 cache. */
1961 64, /* size of prefetch block */
1962 6, /* number of parallel prefetches */
1963 3, /* Branch cost */
1964 COSTS_N_INSNS (8), /* cost of FADD and FSUB insns. */
1965 COSTS_N_INSNS (8), /* cost of FMUL instruction. */
1966 COSTS_N_INSNS (20), /* cost of FDIV instruction. */
1967 COSTS_N_INSNS (8), /* cost of FABS instruction. */
1968 COSTS_N_INSNS (8), /* cost of FCHS instruction. */
1969 COSTS_N_INSNS (40), /* cost of FSQRT instruction. */
1970 intel_memcpy,
1971 intel_memset,
1972 1, /* scalar_stmt_cost. */
1973 1, /* scalar load_cost. */
1974 1, /* scalar_store_cost. */
1975 1, /* vec_stmt_cost. */
1976 4, /* vec_to_scalar_cost. */
1977 1, /* scalar_to_vec_cost. */
1978 1, /* vec_align_load_cost. */
1979 2, /* vec_unalign_load_cost. */
1980 1, /* vec_store_cost. */
1981 3, /* cond_taken_branch_cost. */
1982 1, /* cond_not_taken_branch_cost. */
1983 };
1984
1985 /* Generic should produce code tuned for Core i7 (and newer) and
1986 btver1 (and newer) chips.  */
1987
1988 static stringop_algs generic_memcpy[2] = {
1989 {libcall, {{32, loop, false}, {8192, rep_prefix_4_byte, false},
1990 {-1, libcall, false}}},
1991 {libcall, {{32, loop, false}, {8192, rep_prefix_8_byte, false},
1992 {-1, libcall, false}}}};
1993 static stringop_algs generic_memset[2] = {
1994 {libcall, {{32, loop, false}, {8192, rep_prefix_4_byte, false},
1995 {-1, libcall, false}}},
1996 {libcall, {{32, loop, false}, {8192, rep_prefix_8_byte, false},
1997 {-1, libcall, false}}}};
1998 static const
1999 struct processor_costs generic_cost = {
2000 COSTS_N_INSNS (1), /* cost of an add instruction */
2001 /* On all chips taken into consideration, lea takes 2 cycles or more.  With
2002 that cost, however, our current implementation of synth_mult results in
2003 the use of unnecessary temporary registers, causing regressions on several
2004 SPECfp benchmarks. */
2005 COSTS_N_INSNS (1) + 1, /* cost of a lea instruction */
2006 COSTS_N_INSNS (1), /* variable shift costs */
2007 COSTS_N_INSNS (1), /* constant shift costs */
2008 {COSTS_N_INSNS (3), /* cost of starting multiply for QI */
2009 COSTS_N_INSNS (4), /* HI */
2010 COSTS_N_INSNS (3), /* SI */
2011 COSTS_N_INSNS (4), /* DI */
2012 COSTS_N_INSNS (2)}, /* other */
2013 0, /* cost of multiply per each bit set */
2014 {COSTS_N_INSNS (18), /* cost of a divide/mod for QI */
2015 COSTS_N_INSNS (26), /* HI */
2016 COSTS_N_INSNS (42), /* SI */
2017 COSTS_N_INSNS (74), /* DI */
2018 COSTS_N_INSNS (74)}, /* other */
2019 COSTS_N_INSNS (1), /* cost of movsx */
2020 COSTS_N_INSNS (1), /* cost of movzx */
2021 8, /* "large" insn */
2022 17, /* MOVE_RATIO */
2023 4, /* cost for loading QImode using movzbl */
2024 {4, 4, 4}, /* cost of loading integer registers
2025 in QImode, HImode and SImode.
2026 Relative to reg-reg move (2). */
2027 {4, 4, 4}, /* cost of storing integer registers */
2028 4, /* cost of reg,reg fld/fst */
2029 {12, 12, 12}, /* cost of loading fp registers
2030 in SFmode, DFmode and XFmode */
2031 {6, 6, 8}, /* cost of storing fp registers
2032 in SFmode, DFmode and XFmode */
2033 2, /* cost of moving MMX register */
2034 {8, 8}, /* cost of loading MMX registers
2035 in SImode and DImode */
2036 {8, 8}, /* cost of storing MMX registers
2037 in SImode and DImode */
2038 2, /* cost of moving SSE register */
2039 {8, 8, 8}, /* cost of loading SSE registers
2040 in SImode, DImode and TImode */
2041 {8, 8, 8}, /* cost of storing SSE registers
2042 in SImode, DImode and TImode */
2043 5, /* MMX or SSE register to integer */
2044 32, /* size of l1 cache. */
2045 512, /* size of l2 cache. */
2046 64, /* size of prefetch block */
2047 6, /* number of parallel prefetches */
2048 /* Benchmarks show large regressions on the K8 sixtrack benchmark when this
2049 value is increased to the perhaps more appropriate value of 5. */
2050 3, /* Branch cost */
2051 COSTS_N_INSNS (8), /* cost of FADD and FSUB insns. */
2052 COSTS_N_INSNS (8), /* cost of FMUL instruction. */
2053 COSTS_N_INSNS (20), /* cost of FDIV instruction. */
2054 COSTS_N_INSNS (8), /* cost of FABS instruction. */
2055 COSTS_N_INSNS (8), /* cost of FCHS instruction. */
2056 COSTS_N_INSNS (40), /* cost of FSQRT instruction. */
2057 generic_memcpy,
2058 generic_memset,
2059 1, /* scalar_stmt_cost. */
2060 1, /* scalar load_cost. */
2061 1, /* scalar_store_cost. */
2062 1, /* vec_stmt_cost. */
2063 1, /* vec_to_scalar_cost. */
2064 1, /* scalar_to_vec_cost. */
2065 1, /* vec_align_load_cost. */
2066 2, /* vec_unalign_load_cost. */
2067 1, /* vec_store_cost. */
2068 3, /* cond_taken_branch_cost. */
2069 1, /* cond_not_taken_branch_cost. */
2070 };
2071
2072 /* core_cost should produce code tuned for the Core family of CPUs. */
2073 static stringop_algs core_memcpy[2] = {
2074 {libcall, {{1024, rep_prefix_4_byte, true}, {-1, libcall, false}}},
2075 {libcall, {{24, loop, true}, {128, rep_prefix_8_byte, true},
2076 {-1, libcall, false}}}};
2077 static stringop_algs core_memset[2] = {
2078 {libcall, {{6, loop_1_byte, true},
2079 {24, loop, true},
2080 {8192, rep_prefix_4_byte, true},
2081 {-1, libcall, false}}},
2082 {libcall, {{24, loop, true}, {512, rep_prefix_8_byte, true},
2083 {-1, libcall, false}}}};
2084
2085 static const
2086 struct processor_costs core_cost = {
2087 COSTS_N_INSNS (1), /* cost of an add instruction */
2088 /* On all chips taken into consideration, lea takes 2 cycles or more.  With
2089 that cost, however, our current implementation of synth_mult results in
2090 the use of unnecessary temporary registers, causing regressions on several
2091 SPECfp benchmarks. */
2092 COSTS_N_INSNS (1) + 1, /* cost of a lea instruction */
2093 COSTS_N_INSNS (1), /* variable shift costs */
2094 COSTS_N_INSNS (1), /* constant shift costs */
2095 {COSTS_N_INSNS (3), /* cost of starting multiply for QI */
2096 COSTS_N_INSNS (4), /* HI */
2097 COSTS_N_INSNS (3), /* SI */
2098 COSTS_N_INSNS (4), /* DI */
2099 COSTS_N_INSNS (2)}, /* other */
2100 0, /* cost of multiply per each bit set */
2101 {COSTS_N_INSNS (18), /* cost of a divide/mod for QI */
2102 COSTS_N_INSNS (26), /* HI */
2103 COSTS_N_INSNS (42), /* SI */
2104 COSTS_N_INSNS (74), /* DI */
2105 COSTS_N_INSNS (74)}, /* other */
2106 COSTS_N_INSNS (1), /* cost of movsx */
2107 COSTS_N_INSNS (1), /* cost of movzx */
2108 8, /* "large" insn */
2109 17, /* MOVE_RATIO */
2110 4, /* cost for loading QImode using movzbl */
2111 {4, 4, 4}, /* cost of loading integer registers
2112 in QImode, HImode and SImode.
2113 Relative to reg-reg move (2). */
2114 {4, 4, 4}, /* cost of storing integer registers */
2115 4, /* cost of reg,reg fld/fst */
2116 {12, 12, 12}, /* cost of loading fp registers
2117 in SFmode, DFmode and XFmode */
2118 {6, 6, 8}, /* cost of storing fp registers
2119 in SFmode, DFmode and XFmode */
2120 2, /* cost of moving MMX register */
2121 {8, 8}, /* cost of loading MMX registers
2122 in SImode and DImode */
2123 {8, 8}, /* cost of storing MMX registers
2124 in SImode and DImode */
2125 2, /* cost of moving SSE register */
2126 {8, 8, 8}, /* cost of loading SSE registers
2127 in SImode, DImode and TImode */
2128 {8, 8, 8}, /* cost of storing SSE registers
2129 in SImode, DImode and TImode */
2130 5, /* MMX or SSE register to integer */
2131 64, /* size of l1 cache. */
2132 512, /* size of l2 cache. */
2133 64, /* size of prefetch block */
2134 6, /* number of parallel prefetches */
2135 /* FIXME: perhaps a more appropriate value is 5. */
2136 3, /* Branch cost */
2137 COSTS_N_INSNS (8), /* cost of FADD and FSUB insns. */
2138 COSTS_N_INSNS (8), /* cost of FMUL instruction. */
2139 COSTS_N_INSNS (20), /* cost of FDIV instruction. */
2140 COSTS_N_INSNS (8), /* cost of FABS instruction. */
2141 COSTS_N_INSNS (8), /* cost of FCHS instruction. */
2142 COSTS_N_INSNS (40), /* cost of FSQRT instruction. */
2143 core_memcpy,
2144 core_memset,
2145 1, /* scalar_stmt_cost. */
2146 1, /* scalar load_cost. */
2147 1, /* scalar_store_cost. */
2148 1, /* vec_stmt_cost. */
2149 1, /* vec_to_scalar_cost. */
2150 1, /* scalar_to_vec_cost. */
2151 1, /* vec_align_load_cost. */
2152 2, /* vec_unalign_load_cost. */
2153 1, /* vec_store_cost. */
2154 3, /* cond_taken_branch_cost. */
2155 1, /* cond_not_taken_branch_cost. */
2156 };
2157
2158
2159 /* Set by -mtune. */
2160 const struct processor_costs *ix86_tune_cost = &pentium_cost;
2161
2162 /* Set by -mtune or -Os. */
2163 const struct processor_costs *ix86_cost = &pentium_cost;
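/* An informal note on how these cost tables are consumed: option handling
   points ix86_tune_cost (and, unless optimizing for size, ix86_cost) at one
   of the tables above via processor_target_table, and places such as the
   RTX cost hook then read individual fields, roughly along the lines of

     case PLUS:
       *total = ix86_cost->add;
       break;
     case MULT:
       *total = ix86_cost->mult_init[MODE_INDEX (mode)]
                + nbits * ix86_cost->mult_bit;
       break;

   Treat this as a sketch of the idea, not a copy of ix86_rtx_costs; the
   real hook also handles lea, shifts by constants, memory operands and so
   on.  */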
2164
2165 /* Processor feature/optimization bitmasks. */
2166 #define m_386 (1U<<PROCESSOR_I386)
2167 #define m_486 (1U<<PROCESSOR_I486)
2168 #define m_PENT (1U<<PROCESSOR_PENTIUM)
2169 #define m_LAKEMONT (1U<<PROCESSOR_LAKEMONT)
2170 #define m_PPRO (1U<<PROCESSOR_PENTIUMPRO)
2171 #define m_PENT4 (1U<<PROCESSOR_PENTIUM4)
2172 #define m_NOCONA (1U<<PROCESSOR_NOCONA)
2173 #define m_P4_NOCONA (m_PENT4 | m_NOCONA)
2174 #define m_CORE2 (1U<<PROCESSOR_CORE2)
2175 #define m_NEHALEM (1U<<PROCESSOR_NEHALEM)
2176 #define m_SANDYBRIDGE (1U<<PROCESSOR_SANDYBRIDGE)
2177 #define m_HASWELL (1U<<PROCESSOR_HASWELL)
2178 #define m_CORE_ALL (m_CORE2 | m_NEHALEM | m_SANDYBRIDGE | m_HASWELL)
2179 #define m_BONNELL (1U<<PROCESSOR_BONNELL)
2180 #define m_SILVERMONT (1U<<PROCESSOR_SILVERMONT)
2181 #define m_KNL (1U<<PROCESSOR_KNL)
2182 #define m_SKYLAKE_AVX512 (1U<<PROCESSOR_SKYLAKE_AVX512)
2183 #define m_INTEL (1U<<PROCESSOR_INTEL)
2184
2185 #define m_GEODE (1U<<PROCESSOR_GEODE)
2186 #define m_K6 (1U<<PROCESSOR_K6)
2187 #define m_K6_GEODE (m_K6 | m_GEODE)
2188 #define m_K8 (1U<<PROCESSOR_K8)
2189 #define m_ATHLON (1U<<PROCESSOR_ATHLON)
2190 #define m_ATHLON_K8 (m_K8 | m_ATHLON)
2191 #define m_AMDFAM10 (1U<<PROCESSOR_AMDFAM10)
2192 #define m_BDVER1 (1U<<PROCESSOR_BDVER1)
2193 #define m_BDVER2 (1U<<PROCESSOR_BDVER2)
2194 #define m_BDVER3 (1U<<PROCESSOR_BDVER3)
2195 #define m_BDVER4 (1U<<PROCESSOR_BDVER4)
2196 #define m_ZNVER1 (1U<<PROCESSOR_ZNVER1)
2197 #define m_BTVER1 (1U<<PROCESSOR_BTVER1)
2198 #define m_BTVER2 (1U<<PROCESSOR_BTVER2)
2199 #define m_BDVER (m_BDVER1 | m_BDVER2 | m_BDVER3 | m_BDVER4)
2200 #define m_BTVER (m_BTVER1 | m_BTVER2)
2201 #define m_AMD_MULTIPLE (m_ATHLON_K8 | m_AMDFAM10 | m_BDVER | m_BTVER \
2202 | m_ZNVER1)
2203
2204 #define m_GENERIC (1U<<PROCESSOR_GENERIC)
2205
2206 const char* ix86_tune_feature_names[X86_TUNE_LAST] = {
2207 #undef DEF_TUNE
2208 #define DEF_TUNE(tune, name, selector) name,
2209 #include "x86-tune.def"
2210 #undef DEF_TUNE
2211 };
2212
2213 /* Feature tests against the various tunings. */
2214 unsigned char ix86_tune_features[X86_TUNE_LAST];
2215
2216 /* Feature tests against the various tunings used to create ix86_tune_features
2217 based on the processor mask. */
2218 static unsigned int initial_ix86_tune_features[X86_TUNE_LAST] = {
2219 #undef DEF_TUNE
2220 #define DEF_TUNE(tune, name, selector) selector,
2221 #include "x86-tune.def"
2222 #undef DEF_TUNE
2223 };
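/* An illustrative sketch of the X-macro pattern used above: a hypothetical
   x86-tune.def line such as

     DEF_TUNE (X86_TUNE_SCHEDULE, "schedule", m_PENT | m_CORE_ALL | m_GENERIC)

   expands once into the "schedule" string in ix86_tune_feature_names and
   once into the selector mask in initial_ix86_tune_features.  Option
   handling later fills ix86_tune_features roughly as

     ix86_tune_features[i]
       = !!(initial_ix86_tune_features[i] & (1U << ix86_tune));

   so each entry records whether the processor selected by -mtune is part of
   that feature's selector mask.  */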
2224
2225 /* Feature tests against the various architecture variations. */
2226 unsigned char ix86_arch_features[X86_ARCH_LAST];
2227
2228 /* Feature tests against the various architecture variations, used to create
2229 ix86_arch_features based on the processor mask. */
2230 static unsigned int initial_ix86_arch_features[X86_ARCH_LAST] = {
2231 /* X86_ARCH_CMOV: Conditional move was added for pentiumpro. */
2232 ~(m_386 | m_486 | m_PENT | m_LAKEMONT | m_K6),
2233
2234 /* X86_ARCH_CMPXCHG: Compare and exchange was added for 80486. */
2235 ~m_386,
2236
2237 /* X86_ARCH_CMPXCHG8B: Compare and exchange 8 bytes was added for pentium. */
2238 ~(m_386 | m_486),
2239
2240 /* X86_ARCH_XADD: Exchange and add was added for 80486. */
2241 ~m_386,
2242
2243 /* X86_ARCH_BSWAP: Byteswap was added for 80486. */
2244 ~m_386,
2245 };
2246
2247 /* If the average insn count for a single function invocation is
2248 lower than this constant, emit fast (but longer) prologue and
2249 epilogue code.  */
2250 #define FAST_PROLOGUE_INSN_COUNT 20
2251
2252 /* Names for the 8-bit (low), 8-bit (high), and 16-bit registers, respectively. */
2253 static const char *const qi_reg_name[] = QI_REGISTER_NAMES;
2254 static const char *const qi_high_reg_name[] = QI_HIGH_REGISTER_NAMES;
2255 static const char *const hi_reg_name[] = HI_REGISTER_NAMES;
2256
2257 /* Array of the smallest class containing reg number REGNO, indexed by
2258 REGNO. Used by REGNO_REG_CLASS in i386.h. */
2259
2260 enum reg_class const regclass_map[FIRST_PSEUDO_REGISTER] =
2261 {
2262 /* ax, dx, cx, bx */
2263 AREG, DREG, CREG, BREG,
2264 /* si, di, bp, sp */
2265 SIREG, DIREG, NON_Q_REGS, NON_Q_REGS,
2266 /* FP registers */
2267 FP_TOP_REG, FP_SECOND_REG, FLOAT_REGS, FLOAT_REGS,
2268 FLOAT_REGS, FLOAT_REGS, FLOAT_REGS, FLOAT_REGS,
2269 /* arg pointer */
2270 NON_Q_REGS,
2271 /* flags, fpsr, fpcr, frame */
2272 NO_REGS, NO_REGS, NO_REGS, NON_Q_REGS,
2273 /* SSE registers */
2274 SSE_FIRST_REG, SSE_REGS, SSE_REGS, SSE_REGS, SSE_REGS, SSE_REGS,
2275 SSE_REGS, SSE_REGS,
2276 /* MMX registers */
2277 MMX_REGS, MMX_REGS, MMX_REGS, MMX_REGS, MMX_REGS, MMX_REGS,
2278 MMX_REGS, MMX_REGS,
2279 /* REX registers */
2280 NON_Q_REGS, NON_Q_REGS, NON_Q_REGS, NON_Q_REGS,
2281 NON_Q_REGS, NON_Q_REGS, NON_Q_REGS, NON_Q_REGS,
2282 /* SSE REX registers */
2283 SSE_REGS, SSE_REGS, SSE_REGS, SSE_REGS, SSE_REGS, SSE_REGS,
2284 SSE_REGS, SSE_REGS,
2285 /* AVX-512 SSE registers */
2286 EVEX_SSE_REGS, EVEX_SSE_REGS, EVEX_SSE_REGS, EVEX_SSE_REGS,
2287 EVEX_SSE_REGS, EVEX_SSE_REGS, EVEX_SSE_REGS, EVEX_SSE_REGS,
2288 EVEX_SSE_REGS, EVEX_SSE_REGS, EVEX_SSE_REGS, EVEX_SSE_REGS,
2289 EVEX_SSE_REGS, EVEX_SSE_REGS, EVEX_SSE_REGS, EVEX_SSE_REGS,
2290 /* Mask registers. */
2291 MASK_REGS, MASK_EVEX_REGS, MASK_EVEX_REGS, MASK_EVEX_REGS,
2292 MASK_EVEX_REGS, MASK_EVEX_REGS, MASK_EVEX_REGS, MASK_EVEX_REGS,
2293 /* MPX bound registers */
2294 BND_REGS, BND_REGS, BND_REGS, BND_REGS,
2295 };
2296
2297 /* The "default" register map used in 32bit mode. */
2298
2299 int const dbx_register_map[FIRST_PSEUDO_REGISTER] =
2300 {
2301 0, 2, 1, 3, 6, 7, 4, 5, /* general regs */
2302 12, 13, 14, 15, 16, 17, 18, 19, /* fp regs */
2303 -1, -1, -1, -1, -1, /* arg, flags, fpsr, fpcr, frame */
2304 21, 22, 23, 24, 25, 26, 27, 28, /* SSE */
2305 29, 30, 31, 32, 33, 34, 35, 36, /* MMX */
2306 -1, -1, -1, -1, -1, -1, -1, -1, /* extended integer registers */
2307 -1, -1, -1, -1, -1, -1, -1, -1, /* extended SSE registers */
2308 -1, -1, -1, -1, -1, -1, -1, -1, /* AVX-512 registers 16-23*/
2309 -1, -1, -1, -1, -1, -1, -1, -1, /* AVX-512 registers 24-31*/
2310 93, 94, 95, 96, 97, 98, 99, 100, /* Mask registers */
2311 101, 102, 103, 104, /* bound registers */
2312 };
2313
2314 /* The "default" register map used in 64bit mode. */
2315
2316 int const dbx64_register_map[FIRST_PSEUDO_REGISTER] =
2317 {
2318 0, 1, 2, 3, 4, 5, 6, 7, /* general regs */
2319 33, 34, 35, 36, 37, 38, 39, 40, /* fp regs */
2320 -1, -1, -1, -1, -1, /* arg, flags, fpsr, fpcr, frame */
2321 17, 18, 19, 20, 21, 22, 23, 24, /* SSE */
2322 41, 42, 43, 44, 45, 46, 47, 48, /* MMX */
2323 8,9,10,11,12,13,14,15, /* extended integer registers */
2324 25, 26, 27, 28, 29, 30, 31, 32, /* extended SSE registers */
2325 67, 68, 69, 70, 71, 72, 73, 74, /* AVX-512 registers 16-23 */
2326 75, 76, 77, 78, 79, 80, 81, 82, /* AVX-512 registers 24-31 */
2327 118, 119, 120, 121, 122, 123, 124, 125, /* Mask registers */
2328 126, 127, 128, 129, /* bound registers */
2329 };
2330
2331 /* Define the register numbers to be used in Dwarf debugging information.
2332 The SVR4 reference port C compiler uses the following register numbers
2333 in its Dwarf output code:
2334 0 for %eax (gcc regno = 0)
2335 1 for %ecx (gcc regno = 2)
2336 2 for %edx (gcc regno = 1)
2337 3 for %ebx (gcc regno = 3)
2338 4 for %esp (gcc regno = 7)
2339 5 for %ebp (gcc regno = 6)
2340 6 for %esi (gcc regno = 4)
2341 7 for %edi (gcc regno = 5)
2342 The following three DWARF register numbers are never generated by
2343 the SVR4 C compiler or by the GNU compilers, but SDB on x86/svr4
2344 believes these numbers have these meanings.
2345 8 for %eip (no gcc equivalent)
2346 9 for %eflags (gcc regno = 17)
2347 10 for %trapno (no gcc equivalent)
2348 It is not at all clear how we should number the FP stack registers
2349 for the x86 architecture. If the version of SDB on x86/svr4 were
2350 a bit less brain dead with respect to floating-point then we would
2351 have a precedent to follow with respect to DWARF register numbers
2352 for x86 FP registers, but the SDB on x86/svr4 is so completely
2353 broken with respect to FP registers that it is hardly worth thinking
2354 of it as something to strive for compatibility with.
2355 The version of x86/svr4 SDB I have at the moment does (partially)
2356 seem to believe that DWARF register number 11 is associated with
2357 the x86 register %st(0), but that's about all. Higher DWARF
2358 register numbers don't seem to be associated with anything in
2359 particular, and even for DWARF regno 11, SDB only seems to under-
2360 stand that it should say that a variable lives in %st(0) (when
2361 asked via an `=' command) if we said it was in DWARF regno 11,
2362 but SDB still prints garbage when asked for the value of the
2363 variable in question (via a `/' command).
2364 (Also note that the labels SDB prints for various FP stack regs
2365 when doing an `x' command are all wrong.)
2366 Note that these problems generally don't affect the native SVR4
2367 C compiler because it doesn't allow the use of -O with -g and
2368 because when it is *not* optimizing, it allocates a memory
2369 location for each floating-point variable, and the memory
2370 location is what gets described in the DWARF AT_location
2371 attribute for the variable in question.
2372 Regardless of the severe mental illness of the x86/svr4 SDB, we
2373 do something sensible here and we use the following DWARF
2374 register numbers. Note that these are all stack-top-relative
2375 numbers.
2376 11 for %st(0) (gcc regno = 8)
2377 12 for %st(1) (gcc regno = 9)
2378 13 for %st(2) (gcc regno = 10)
2379 14 for %st(3) (gcc regno = 11)
2380 15 for %st(4) (gcc regno = 12)
2381 16 for %st(5) (gcc regno = 13)
2382 17 for %st(6) (gcc regno = 14)
2383 18 for %st(7) (gcc regno = 15)
2384 */
2385 int const svr4_dbx_register_map[FIRST_PSEUDO_REGISTER] =
2386 {
2387 0, 2, 1, 3, 6, 7, 5, 4, /* general regs */
2388 11, 12, 13, 14, 15, 16, 17, 18, /* fp regs */
2389 -1, 9, -1, -1, -1, /* arg, flags, fpsr, fpcr, frame */
2390 21, 22, 23, 24, 25, 26, 27, 28, /* SSE registers */
2391 29, 30, 31, 32, 33, 34, 35, 36, /* MMX registers */
2392 -1, -1, -1, -1, -1, -1, -1, -1, /* extended integer registers */
2393 -1, -1, -1, -1, -1, -1, -1, -1, /* extended SSE registers */
2394 -1, -1, -1, -1, -1, -1, -1, -1, /* AVX-512 registers 16-23*/
2395 -1, -1, -1, -1, -1, -1, -1, -1, /* AVX-512 registers 24-31*/
2396 93, 94, 95, 96, 97, 98, 99, 100, /* Mask registers */
2397 101, 102, 103, 104, /* bound registers */
2398 };
2399
2400 /* Define parameter passing and return registers. */
2401
2402 static int const x86_64_int_parameter_registers[6] =
2403 {
2404 DI_REG, SI_REG, DX_REG, CX_REG, R8_REG, R9_REG
2405 };
2406
2407 static int const x86_64_ms_abi_int_parameter_registers[4] =
2408 {
2409 CX_REG, DX_REG, R8_REG, R9_REG
2410 };
2411
2412 static int const x86_64_int_return_registers[4] =
2413 {
2414 AX_REG, DX_REG, DI_REG, SI_REG
2415 };
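/* A worked example of the conventions encoded by the arrays above
   (informal; the psABI and MS ABI documents are authoritative): for
   long f (long a, long b, long c), the SysV ABI passes a, b and c in
   %rdi, %rsi and %rdx (the first three entries of
   x86_64_int_parameter_registers) and returns the result in %rax, while
   the MS ABI passes them in %rcx, %rdx and %r8.  */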
2416
2417 /* Additional registers that are clobbered by SYSV calls. */
2418
2419 int const x86_64_ms_sysv_extra_clobbered_registers[12] =
2420 {
2421 SI_REG, DI_REG,
2422 XMM6_REG, XMM7_REG,
2423 XMM8_REG, XMM9_REG, XMM10_REG, XMM11_REG,
2424 XMM12_REG, XMM13_REG, XMM14_REG, XMM15_REG
2425 };
2426
2427 /* Define the structure for the machine field in struct function. */
2428
2429 struct GTY(()) stack_local_entry {
2430 unsigned short mode;
2431 unsigned short n;
2432 rtx rtl;
2433 struct stack_local_entry *next;
2434 };
2435
2436 /* Structure describing stack frame layout.
2437 Stack grows downward:
2438
2439 [arguments]
2440 <- ARG_POINTER
2441 saved pc
2442
2443 saved static chain if ix86_static_chain_on_stack
2444
2445 saved frame pointer if frame_pointer_needed
2446 <- HARD_FRAME_POINTER
2447 [saved regs]
2448 <- regs_save_offset
2449 [padding0]
2450
2451 [saved SSE regs]
2452 <- sse_regs_save_offset
2453 [padding1] |
2454 | <- FRAME_POINTER
2455 [va_arg registers] |
2456 |
2457 [frame] |
2458 |
2459 [padding2] | = to_allocate
2460 <- STACK_POINTER
2461 */
2462 struct ix86_frame
2463 {
2464 int nsseregs;
2465 int nregs;
2466 int va_arg_size;
2467 int red_zone_size;
2468 int outgoing_arguments_size;
2469
2470 /* The offsets relative to ARG_POINTER. */
2471 HOST_WIDE_INT frame_pointer_offset;
2472 HOST_WIDE_INT hard_frame_pointer_offset;
2473 HOST_WIDE_INT stack_pointer_offset;
2474 HOST_WIDE_INT hfp_save_offset;
2475 HOST_WIDE_INT reg_save_offset;
2476 HOST_WIDE_INT sse_reg_save_offset;
2477
2478 /* When save_regs_using_mov is set, emit prologue using
2479 move instead of push instructions. */
2480 bool save_regs_using_mov;
2481 };
2482
2483 /* Which cpu are we scheduling for. */
2484 enum attr_cpu ix86_schedule;
2485
2486 /* Which cpu are we optimizing for. */
2487 enum processor_type ix86_tune;
2488
2489 /* Which instruction set architecture to use. */
2490 enum processor_type ix86_arch;
2491
2492 /* True if processor has SSE prefetch instruction. */
2493 unsigned char x86_prefetch_sse;
2494
2495 /* -mstackrealign option */
2496 static const char ix86_force_align_arg_pointer_string[]
2497 = "force_align_arg_pointer";
2498
2499 static rtx (*ix86_gen_leave) (void);
2500 static rtx (*ix86_gen_add3) (rtx, rtx, rtx);
2501 static rtx (*ix86_gen_sub3) (rtx, rtx, rtx);
2502 static rtx (*ix86_gen_sub3_carry) (rtx, rtx, rtx, rtx, rtx);
2503 static rtx (*ix86_gen_one_cmpl2) (rtx, rtx);
2504 static rtx (*ix86_gen_monitor) (rtx, rtx, rtx);
2505 static rtx (*ix86_gen_monitorx) (rtx, rtx, rtx);
2506 static rtx (*ix86_gen_clzero) (rtx);
2507 static rtx (*ix86_gen_andsp) (rtx, rtx, rtx);
2508 static rtx (*ix86_gen_allocate_stack_worker) (rtx, rtx);
2509 static rtx (*ix86_gen_adjust_stack_and_probe) (rtx, rtx, rtx);
2510 static rtx (*ix86_gen_probe_stack_range) (rtx, rtx, rtx);
2511 static rtx (*ix86_gen_tls_global_dynamic_64) (rtx, rtx, rtx);
2512 static rtx (*ix86_gen_tls_local_dynamic_base_64) (rtx, rtx);
2513
2514 /* Preferred alignment for stack boundary in bits. */
2515 unsigned int ix86_preferred_stack_boundary;
2516
2517 /* Alignment for incoming stack boundary in bits specified at
2518 command line. */
2519 static unsigned int ix86_user_incoming_stack_boundary;
2520
2521 /* Default alignment for incoming stack boundary in bits. */
2522 static unsigned int ix86_default_incoming_stack_boundary;
2523
2524 /* Alignment for incoming stack boundary in bits. */
2525 unsigned int ix86_incoming_stack_boundary;
2526
2527 /* Calling abi specific va_list type nodes. */
2528 static GTY(()) tree sysv_va_list_type_node;
2529 static GTY(()) tree ms_va_list_type_node;
2530
2531 /* Prefix built by ASM_GENERATE_INTERNAL_LABEL. */
2532 char internal_label_prefix[16];
2533 int internal_label_prefix_len;
2534
2535 /* Fence to use after loop using movnt. */
2536 tree x86_mfence;
2537
2538 /* Register class used for passing a given 64-bit part of the argument.
2539 These represent classes as documented by the psABI, except that SSESF
2540 and SSEDF are basically the SSE class; GCC just uses an SFmode or DFmode
2541 move instead of DImode to avoid reformatting penalties.
2542 
2543 Similarly we play games with INTEGERSI_CLASS to use cheaper SImode moves
2544 whenever possible (the upper half does contain padding).  */
2545 enum x86_64_reg_class
2546 {
2547 X86_64_NO_CLASS,
2548 X86_64_INTEGER_CLASS,
2549 X86_64_INTEGERSI_CLASS,
2550 X86_64_SSE_CLASS,
2551 X86_64_SSESF_CLASS,
2552 X86_64_SSEDF_CLASS,
2553 X86_64_SSEUP_CLASS,
2554 X86_64_X87_CLASS,
2555 X86_64_X87UP_CLASS,
2556 X86_64_COMPLEX_X87_CLASS,
2557 X86_64_MEMORY_CLASS
2558 };
2559
2560 #define MAX_CLASSES 8
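/* A small illustrative example of the classification (informal;
   classify_argument later in this file is the real algorithm): on x86-64,
   struct { long i; double d; } occupies two eightbytes, the first
   classified as X86_64_INTEGER_CLASS and the second as X86_64_SSEDF_CLASS,
   so when passed by value the struct ends up in one integer register and
   one SSE register, e.g. %rdi and %xmm0.  */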
2561
2562 /* Table of constants used by fldpi, fldln2, etc.... */
2563 static REAL_VALUE_TYPE ext_80387_constants_table [5];
2564 static bool ext_80387_constants_init = 0;
2565
2566 \f
2567 static struct machine_function * ix86_init_machine_status (void);
2568 static rtx ix86_function_value (const_tree, const_tree, bool);
2569 static bool ix86_function_value_regno_p (const unsigned int);
2570 static unsigned int ix86_function_arg_boundary (machine_mode,
2571 const_tree);
2572 static rtx ix86_static_chain (const_tree, bool);
2573 static int ix86_function_regparm (const_tree, const_tree);
2574 static void ix86_compute_frame_layout (struct ix86_frame *);
2575 static bool ix86_expand_vector_init_one_nonzero (bool, machine_mode,
2576 rtx, rtx, int);
2577 static void ix86_add_new_builtins (HOST_WIDE_INT);
2578 static tree ix86_canonical_va_list_type (tree);
2579 static void predict_jump (int);
2580 static unsigned int split_stack_prologue_scratch_regno (void);
2581 static bool i386_asm_output_addr_const_extra (FILE *, rtx);
2582
2583 enum ix86_function_specific_strings
2584 {
2585 IX86_FUNCTION_SPECIFIC_ARCH,
2586 IX86_FUNCTION_SPECIFIC_TUNE,
2587 IX86_FUNCTION_SPECIFIC_MAX
2588 };
2589
2590 static char *ix86_target_string (HOST_WIDE_INT, int, int, const char *,
2591 const char *, enum fpmath_unit, bool);
2592 static void ix86_function_specific_save (struct cl_target_option *,
2593 struct gcc_options *opts);
2594 static void ix86_function_specific_restore (struct gcc_options *opts,
2595 struct cl_target_option *);
2596 static void ix86_function_specific_post_stream_in (struct cl_target_option *);
2597 static void ix86_function_specific_print (FILE *, int,
2598 struct cl_target_option *);
2599 static bool ix86_valid_target_attribute_p (tree, tree, tree, int);
2600 static bool ix86_valid_target_attribute_inner_p (tree, char *[],
2601 struct gcc_options *,
2602 struct gcc_options *,
2603 struct gcc_options *);
2604 static bool ix86_can_inline_p (tree, tree);
2605 static void ix86_set_current_function (tree);
2606 static unsigned int ix86_minimum_incoming_stack_boundary (bool);
2607
2608 static enum calling_abi ix86_function_abi (const_tree);
2609
2610 \f
2611 #ifndef SUBTARGET32_DEFAULT_CPU
2612 #define SUBTARGET32_DEFAULT_CPU "i386"
2613 #endif
2614
2615 /* Whether -mtune= or -march= were specified.  */
2616 static int ix86_tune_defaulted;
2617 static int ix86_arch_specified;
2618
2619 /* Vectorization library interface and handlers. */
2620 static tree (*ix86_veclib_handler) (combined_fn, tree, tree);
2621
2622 static tree ix86_veclibabi_svml (combined_fn, tree, tree);
2623 static tree ix86_veclibabi_acml (combined_fn, tree, tree);
2624
2625 /* Processor target table, indexed by processor number */
2626 struct ptt
2627 {
2628 const char *const name; /* processor name */
2629 const struct processor_costs *cost; /* Processor costs */
2630 const int align_loop; /* Default alignments. */
2631 const int align_loop_max_skip;
2632 const int align_jump;
2633 const int align_jump_max_skip;
2634 const int align_func;
2635 };
2636
2637 /* This table must be in sync with enum processor_type in i386.h. */
2638 static const struct ptt processor_target_table[PROCESSOR_max] =
2639 {
2640 {"generic", &generic_cost, 16, 10, 16, 10, 16},
2641 {"i386", &i386_cost, 4, 3, 4, 3, 4},
2642 {"i486", &i486_cost, 16, 15, 16, 15, 16},
2643 {"pentium", &pentium_cost, 16, 7, 16, 7, 16},
2644 {"lakemont", &lakemont_cost, 16, 7, 16, 7, 16},
2645 {"pentiumpro", &pentiumpro_cost, 16, 15, 16, 10, 16},
2646 {"pentium4", &pentium4_cost, 0, 0, 0, 0, 0},
2647 {"nocona", &nocona_cost, 0, 0, 0, 0, 0},
2648 {"core2", &core_cost, 16, 10, 16, 10, 16},
2649 {"nehalem", &core_cost, 16, 10, 16, 10, 16},
2650 {"sandybridge", &core_cost, 16, 10, 16, 10, 16},
2651 {"haswell", &core_cost, 16, 10, 16, 10, 16},
2652 {"bonnell", &atom_cost, 16, 15, 16, 7, 16},
2653 {"silvermont", &slm_cost, 16, 15, 16, 7, 16},
2654 {"knl", &slm_cost, 16, 15, 16, 7, 16},
2655 {"skylake-avx512", &core_cost, 16, 10, 16, 10, 16},
2656 {"intel", &intel_cost, 16, 15, 16, 7, 16},
2657 {"geode", &geode_cost, 0, 0, 0, 0, 0},
2658 {"k6", &k6_cost, 32, 7, 32, 7, 32},
2659 {"athlon", &athlon_cost, 16, 7, 16, 7, 16},
2660 {"k8", &k8_cost, 16, 7, 16, 7, 16},
2661 {"amdfam10", &amdfam10_cost, 32, 24, 32, 7, 32},
2662 {"bdver1", &bdver1_cost, 16, 10, 16, 7, 11},
2663 {"bdver2", &bdver2_cost, 16, 10, 16, 7, 11},
2664 {"bdver3", &bdver3_cost, 16, 10, 16, 7, 11},
2665 {"bdver4", &bdver4_cost, 16, 10, 16, 7, 11},
2666 {"btver1", &btver1_cost, 16, 10, 16, 7, 11},
2667 {"btver2", &btver2_cost, 16, 10, 16, 7, 11},
2668 {"znver1", &znver1_cost, 16, 10, 16, 7, 11}
2669 };
2670 \f
2671 static unsigned int
2672 rest_of_handle_insert_vzeroupper (void)
2673 {
2674 int i;
2675
2676 /* vzeroupper instructions are inserted immediately after reload to
2677 account for possible spills from 256-bit registers.  The pass
2678 reuses the mode switching infrastructure by re-running the mode
2679 insertion pass, so disable entities that have already been processed. */
2680 for (i = 0; i < MAX_386_ENTITIES; i++)
2681 ix86_optimize_mode_switching[i] = 0;
2682
2683 ix86_optimize_mode_switching[AVX_U128] = 1;
2684
2685 /* Call optimize_mode_switching. */
2686 g->get_passes ()->execute_pass_mode_switching ();
2687 return 0;
2688 }
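/* Background for the pass above (informal): mixing 256-bit AVX code with
   legacy SSE code can incur a large transition penalty on some processors
   unless the upper halves of the ymm registers are cleared first, so a
   vzeroupper is wanted before calls to, or returns into, potentially
   SSE-only code.  Modelling this as the AVX_U128 mode-switching entity
   lets the generic mode-switching pass compute a cheap placement of those
   vzeroupper instructions.  */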
2689
2690 /* Return true if INSN uses or defines a hard register.
2691 Hard register uses in a memory address are ignored.
2692 Clobbers and flags definitions are ignored.  */
2693
2694 static bool
2695 has_non_address_hard_reg (rtx_insn *insn)
2696 {
2697 df_ref ref;
2698 FOR_EACH_INSN_DEF (ref, insn)
2699 if (HARD_REGISTER_P (DF_REF_REAL_REG (ref))
2700 && !DF_REF_FLAGS_IS_SET (ref, DF_REF_MUST_CLOBBER)
2701 && DF_REF_REGNO (ref) != FLAGS_REG)
2702 return true;
2703
2704 FOR_EACH_INSN_USE (ref, insn)
2705 if (!DF_REF_REG_MEM_P (ref) && HARD_REGISTER_P (DF_REF_REAL_REG (ref)))
2706 return true;
2707
2708 return false;
2709 }
2710
2711 /* Check whether comparison INSN may be transformed into a vector
2712 comparison.  Currently we transform only zero checks which look
2713 like:
2714
2715 (set (reg:CCZ 17 flags)
2716 (compare:CCZ (ior:SI (subreg:SI (reg:DI x) 4)
2717 (subreg:SI (reg:DI x) 0))
2718 (const_int 0 [0]))) */
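/* For reference, a source-level shape that typically produces the pattern
   above on a 32-bit target (illustrative, not exhaustive):

     long long x;
     ...
     if (x == 0)
       ...

   The zero test is expanded as an IOR of the two SImode halves of X
   compared against zero; with SSE4.1 available, the STV pass can replace
   the pair of scalar instructions with a vector compare (e.g. using
   ptest).  */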
2719
2720 static bool
2721 convertible_comparison_p (rtx_insn *insn)
2722 {
2723 if (!TARGET_SSE4_1)
2724 return false;
2725
2726 rtx def_set = single_set (insn);
2727
2728 gcc_assert (def_set);
2729
2730 rtx src = SET_SRC (def_set);
2731 rtx dst = SET_DEST (def_set);
2732
2733 gcc_assert (GET_CODE (src) == COMPARE);
2734
2735 if (GET_CODE (dst) != REG
2736 || REGNO (dst) != FLAGS_REG
2737 || GET_MODE (dst) != CCZmode)
2738 return false;
2739
2740 rtx op1 = XEXP (src, 0);
2741 rtx op2 = XEXP (src, 1);
2742
2743 if (op2 != CONST0_RTX (GET_MODE (op2)))
2744 return false;
2745
2746 if (GET_CODE (op1) != IOR)
2747 return false;
2748
2749 op2 = XEXP (op1, 1);
2750 op1 = XEXP (op1, 0);
2751
2752 if (!SUBREG_P (op1)
2753 || !SUBREG_P (op2)
2754 || GET_MODE (op1) != SImode
2755 || GET_MODE (op2) != SImode
2756 || ((SUBREG_BYTE (op1) != 0
2757 || SUBREG_BYTE (op2) != GET_MODE_SIZE (SImode))
2758 && (SUBREG_BYTE (op2) != 0
2759 || SUBREG_BYTE (op1) != GET_MODE_SIZE (SImode))))
2760 return false;
2761
2762 op1 = SUBREG_REG (op1);
2763 op2 = SUBREG_REG (op2);
2764
2765 if (op1 != op2
2766 || !REG_P (op1)
2767 || GET_MODE (op1) != DImode)
2768 return false;
2769
2770 return true;
2771 }
2772
2773 /* The DImode version of scalar_to_vector_candidate_p. */
2774
2775 static bool
2776 dimode_scalar_to_vector_candidate_p (rtx_insn *insn)
2777 {
2778 rtx def_set = single_set (insn);
2779
2780 if (!def_set)
2781 return false;
2782
2783 if (has_non_address_hard_reg (insn))
2784 return false;
2785
2786 rtx src = SET_SRC (def_set);
2787 rtx dst = SET_DEST (def_set);
2788
2789 if (GET_CODE (src) == COMPARE)
2790 return convertible_comparison_p (insn);
2791
2792 /* We are interested in DImode promotion only. */
2793 if ((GET_MODE (src) != DImode
2794 && !CONST_INT_P (src))
2795 || GET_MODE (dst) != DImode)
2796 return false;
2797
2798 if (!REG_P (dst) && !MEM_P (dst))
2799 return false;
2800
2801 switch (GET_CODE (src))
2802 {
2803 case PLUS:
2804 case MINUS:
2805 case IOR:
2806 case XOR:
2807 case AND:
2808 break;
2809
2810 case REG:
2811 return true;
2812
2813 case MEM:
2814 case CONST_INT:
2815 return REG_P (dst);
2816
2817 default:
2818 return false;
2819 }
2820
2821 if (!REG_P (XEXP (src, 0))
2822 && !MEM_P (XEXP (src, 0))
2823 && !CONST_INT_P (XEXP (src, 0))
2824 /* Check for andnot case. */
2825 && (GET_CODE (src) != AND
2826 || GET_CODE (XEXP (src, 0)) != NOT
2827 || !REG_P (XEXP (XEXP (src, 0), 0))))
2828 return false;
2829
2830 if (!REG_P (XEXP (src, 1))
2831 && !MEM_P (XEXP (src, 1))
2832 && !CONST_INT_P (XEXP (src, 1)))
2833 return false;
2834
2835 if ((GET_MODE (XEXP (src, 0)) != DImode
2836 && !CONST_INT_P (XEXP (src, 0)))
2837 || (GET_MODE (XEXP (src, 1)) != DImode
2838 && !CONST_INT_P (XEXP (src, 1))))
2839 return false;
2840
2841 return true;
2842 }
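/* An informal example of a DImode candidate on a 32-bit target:

     unsigned long long a, b, c;
     ...
     c = a & b;

   needs two 32-bit AND instructions (and four register halves) as scalar
   code, but becomes a single 64-bit vector AND once the values are kept in
   SSE registers.  The checks above merely ensure that every operand of the
   candidate insn can be represented that way.  */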
2843
2844 /* The TImode version of scalar_to_vector_candidate_p. */
2845
2846 static bool
2847 timode_scalar_to_vector_candidate_p (rtx_insn *insn)
2848 {
2849 rtx def_set = single_set (insn);
2850
2851 if (!def_set)
2852 return false;
2853
2854 if (has_non_address_hard_reg (insn))
2855 return false;
2856
2857 rtx src = SET_SRC (def_set);
2858 rtx dst = SET_DEST (def_set);
2859
2860 /* Only TImode loads and stores are allowed.  */
2861 if (GET_MODE (dst) != TImode)
2862 return false;
2863
2864 if (MEM_P (dst))
2865 {
2866 /* Check for a store.  The memory must be aligned, or an unaligned store
2867 must be optimal.  Only stores from a register, a standard SSE constant
2868 or a CONST_WIDE_INT generated from a piecewise store are supported.
2869 
2870 ??? Verify performance impact before enabling CONST_INT for
2871 __int128 store. */
2872 if (misaligned_operand (dst, TImode)
2873 && !TARGET_SSE_UNALIGNED_STORE_OPTIMAL)
2874 return false;
2875
2876 switch (GET_CODE (src))
2877 {
2878 default:
2879 return false;
2880
2881 case REG:
2882 case CONST_WIDE_INT:
2883 return true;
2884
2885 case CONST_INT:
2886 return standard_sse_constant_p (src, TImode);
2887 }
2888 }
2889 else if (MEM_P (src))
2890 {
2891 /* Check for a load.  The memory must be aligned, or an unaligned
2892 load must be optimal. */
2893 return (REG_P (dst)
2894 && (!misaligned_operand (src, TImode)
2895 || TARGET_SSE_UNALIGNED_LOAD_OPTIMAL));
2896 }
2897
2898 return false;
2899 }
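/* An informal TImode example on a 64-bit target:

     __int128 *p, *q;
     ...
     *p = *q;

   is a TImode load followed by a TImode store; converting the pair allows
   one SSE load and one SSE store (movdqa in the aligned case) instead of
   two 64-bit register pairs, which is why only loads and stores are
   accepted above.  */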
2900
2901 /* Return true if INSN may be converted into a vector
2902 instruction.  */
2903
2904 static bool
2905 scalar_to_vector_candidate_p (rtx_insn *insn)
2906 {
2907 if (TARGET_64BIT)
2908 return timode_scalar_to_vector_candidate_p (insn);
2909 else
2910 return dimode_scalar_to_vector_candidate_p (insn);
2911 }
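/* Illustrative sketch (informal, not an exhaustive description): on
   !TARGET_64BIT a DImode candidate such as
     (set (reg:DI d) (plus:DI (reg:DI a) (reg:DI b)))
   is later rewritten by the chain conversion below into
     (set (subreg:V2DI d) (plus:V2DI (subreg:V2DI a) (subreg:V2DI b)))
   so that the 64-bit operation can be performed in an SSE register.  */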
2912
2913 /* The DImode version of remove_non_convertible_regs. */
2914
2915 static void
2916 dimode_remove_non_convertible_regs (bitmap candidates)
2917 {
2918 bitmap_iterator bi;
2919 unsigned id;
2920 bitmap regs = BITMAP_ALLOC (NULL);
2921
2922 EXECUTE_IF_SET_IN_BITMAP (candidates, 0, id, bi)
2923 {
2924 rtx def_set = single_set (DF_INSN_UID_GET (id)->insn);
2925 rtx reg = SET_DEST (def_set);
2926
2927 if (!REG_P (reg)
2928 || bitmap_bit_p (regs, REGNO (reg))
2929 || HARD_REGISTER_P (reg))
2930 continue;
2931
2932 for (df_ref def = DF_REG_DEF_CHAIN (REGNO (reg));
2933 def;
2934 def = DF_REF_NEXT_REG (def))
2935 {
2936 if (!bitmap_bit_p (candidates, DF_REF_INSN_UID (def)))
2937 {
2938 if (dump_file)
2939 fprintf (dump_file,
2940 "r%d has non convertible definition in insn %d\n",
2941 REGNO (reg), DF_REF_INSN_UID (def));
2942
2943 bitmap_set_bit (regs, REGNO (reg));
2944 break;
2945 }
2946 }
2947 }
2948
2949 EXECUTE_IF_SET_IN_BITMAP (regs, 0, id, bi)
2950 {
2951 for (df_ref def = DF_REG_DEF_CHAIN (id);
2952 def;
2953 def = DF_REF_NEXT_REG (def))
2954 if (bitmap_bit_p (candidates, DF_REF_INSN_UID (def)))
2955 {
2956 if (dump_file)
2957 fprintf (dump_file, "Removing insn %d from candidates list\n",
2958 DF_REF_INSN_UID (def));
2959
2960 bitmap_clear_bit (candidates, DF_REF_INSN_UID (def));
2961 }
2962 }
2963
2964 BITMAP_FREE (regs);
2965 }
2966
2967 /* For a register REGNO, scan instructions for its defs and uses.
2968 Put REGNO in REGS if a def or use isn't in CANDIDATES. */
2969
2970 static void
2971 timode_check_non_convertible_regs (bitmap candidates, bitmap regs,
2972 unsigned int regno)
2973 {
2974 for (df_ref def = DF_REG_DEF_CHAIN (regno);
2975 def;
2976 def = DF_REF_NEXT_REG (def))
2977 {
2978 if (!bitmap_bit_p (candidates, DF_REF_INSN_UID (def)))
2979 {
2980 if (dump_file)
2981 fprintf (dump_file,
2982 "r%d has non convertible def in insn %d\n",
2983 regno, DF_REF_INSN_UID (def));
2984
2985 bitmap_set_bit (regs, regno);
2986 break;
2987 }
2988 }
2989
2990 for (df_ref ref = DF_REG_USE_CHAIN (regno);
2991 ref;
2992 ref = DF_REF_NEXT_REG (ref))
2993 {
2994 /* Debug instructions are skipped. */
2995 if (NONDEBUG_INSN_P (DF_REF_INSN (ref))
2996 && !bitmap_bit_p (candidates, DF_REF_INSN_UID (ref)))
2997 {
2998 if (dump_file)
2999 fprintf (dump_file,
3000 "r%d has non convertible use in insn %d\n",
3001 regno, DF_REF_INSN_UID (ref));
3002
3003 bitmap_set_bit (regs, regno);
3004 break;
3005 }
3006 }
3007 }
3008
3009 /* The TImode version of remove_non_convertible_regs. */
3010
3011 static void
3012 timode_remove_non_convertible_regs (bitmap candidates)
3013 {
3014 bitmap_iterator bi;
3015 unsigned id;
3016 bitmap regs = BITMAP_ALLOC (NULL);
3017
3018 EXECUTE_IF_SET_IN_BITMAP (candidates, 0, id, bi)
3019 {
3020 rtx def_set = single_set (DF_INSN_UID_GET (id)->insn);
3021 rtx dest = SET_DEST (def_set);
3022 rtx src = SET_SRC (def_set);
3023
3024 if ((!REG_P (dest)
3025 || bitmap_bit_p (regs, REGNO (dest))
3026 || HARD_REGISTER_P (dest))
3027 && (!REG_P (src)
3028 || bitmap_bit_p (regs, REGNO (src))
3029 || HARD_REGISTER_P (src)))
3030 continue;
3031
3032 if (REG_P (dest))
3033 timode_check_non_convertible_regs (candidates, regs,
3034 REGNO (dest));
3035
3036 if (REG_P (src))
3037 timode_check_non_convertible_regs (candidates, regs,
3038 REGNO (src));
3039 }
3040
3041 EXECUTE_IF_SET_IN_BITMAP (regs, 0, id, bi)
3042 {
3043 for (df_ref def = DF_REG_DEF_CHAIN (id);
3044 def;
3045 def = DF_REF_NEXT_REG (def))
3046 if (bitmap_bit_p (candidates, DF_REF_INSN_UID (def)))
3047 {
3048 if (dump_file)
3049 fprintf (dump_file, "Removing insn %d from candidates list\n",
3050 DF_REF_INSN_UID (def));
3051
3052 bitmap_clear_bit (candidates, DF_REF_INSN_UID (def));
3053 }
3054
3055 for (df_ref ref = DF_REG_USE_CHAIN (id);
3056 ref;
3057 ref = DF_REF_NEXT_REG (ref))
3058 if (bitmap_bit_p (candidates, DF_REF_INSN_UID (ref)))
3059 {
3060 if (dump_file)
3061 fprintf (dump_file, "Removing insn %d from candidates list\n",
3062 DF_REF_INSN_UID (ref));
3063
3064 bitmap_clear_bit (candidates, DF_REF_INSN_UID (ref));
3065 }
3066 }
3067
3068 BITMAP_FREE (regs);
3069 }
3070
3071 /* For a given bitmap of insn UIDs, scan all instructions and
3072 remove an insn from CANDIDATES if it has both convertible
3073 and non-convertible definitions.
3074
3075 All insns in a bitmap are conversion candidates according to
3076 scalar_to_vector_candidate_p. Currently it implies all insns
3077 are single_set. */
3078
3079 static void
3080 remove_non_convertible_regs (bitmap candidates)
3081 {
3082 if (TARGET_64BIT)
3083 timode_remove_non_convertible_regs (candidates);
3084 else
3085 dimode_remove_non_convertible_regs (candidates);
3086 }
3087
3088 class scalar_chain
3089 {
3090 public:
3091 scalar_chain ();
3092 virtual ~scalar_chain ();
3093
3094 static unsigned max_id;
3095
3096 /* ID of a chain. */
3097 unsigned int chain_id;
3098 /* A queue of instructions to be included into a chain. */
3099 bitmap queue;
3100 /* Instructions included into a chain. */
3101 bitmap insns;
3102 /* All registers defined by a chain. */
3103 bitmap defs;
3104 /* Registers used in both vector and scalar modes. */
3105 bitmap defs_conv;
3106
3107 void build (bitmap candidates, unsigned insn_uid);
3108 virtual int compute_convert_gain () = 0;
3109 int convert ();
3110
3111 protected:
3112 void add_to_queue (unsigned insn_uid);
3113 void emit_conversion_insns (rtx insns, rtx_insn *pos);
3114
3115 private:
3116 void add_insn (bitmap candidates, unsigned insn_uid);
3117 void analyze_register_chain (bitmap candidates, df_ref ref);
3118 virtual void mark_dual_mode_def (df_ref def) = 0;
3119 virtual void convert_insn (rtx_insn *insn) = 0;
3120 virtual void convert_registers () = 0;
3121 };
3122
3123 class dimode_scalar_chain : public scalar_chain
3124 {
3125 public:
3126 int compute_convert_gain ();
3127 private:
3128 void mark_dual_mode_def (df_ref def);
3129 rtx replace_with_subreg (rtx x, rtx reg, rtx subreg);
3130 void replace_with_subreg_in_insn (rtx_insn *insn, rtx reg, rtx subreg);
3131 void convert_insn (rtx_insn *insn);
3132 void convert_op (rtx *op, rtx_insn *insn);
3133 void convert_reg (unsigned regno);
3134 void make_vector_copies (unsigned regno);
3135 void convert_registers ();
3136 int vector_const_cost (rtx exp);
3137 };
3138
3139 class timode_scalar_chain : public scalar_chain
3140 {
3141 public:
3142 /* Converting from TImode to V1TImode is always faster. */
3143 int compute_convert_gain () { return 1; }
3144
3145 private:
3146 void mark_dual_mode_def (df_ref def);
3147 void fix_debug_reg_uses (rtx reg);
3148 void convert_insn (rtx_insn *insn);
3149 /* We don't convert registers to a different size. */
3150 void convert_registers () {}
3151 };
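/* Two chain flavors are used: dimode_scalar_chain converts DImode
   computations to V2DImode on !TARGET_64BIT, while timode_scalar_chain
   converts TImode loads and stores to V1TImode on TARGET_64BIT (see
   scalar_to_vector_candidate_p and convert_scalars_to_vector).  */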
3152
3153 unsigned scalar_chain::max_id = 0;
3154
3155 /* Initialize new chain. */
3156
3157 scalar_chain::scalar_chain ()
3158 {
3159 chain_id = ++max_id;
3160
3161 if (dump_file)
3162 fprintf (dump_file, "Created a new instruction chain #%d\n", chain_id);
3163
3164 bitmap_obstack_initialize (NULL);
3165 insns = BITMAP_ALLOC (NULL);
3166 defs = BITMAP_ALLOC (NULL);
3167 defs_conv = BITMAP_ALLOC (NULL);
3168 queue = NULL;
3169 }
3170
3171 /* Free chain's data. */
3172
3173 scalar_chain::~scalar_chain ()
3174 {
3175 BITMAP_FREE (insns);
3176 BITMAP_FREE (defs);
3177 BITMAP_FREE (defs_conv);
3178 bitmap_obstack_release (NULL);
3179 }
3180
3181 /* Add an instruction into the chain's queue. */
3182
3183 void
3184 scalar_chain::add_to_queue (unsigned insn_uid)
3185 {
3186 if (bitmap_bit_p (insns, insn_uid)
3187 || bitmap_bit_p (queue, insn_uid))
3188 return;
3189
3190 if (dump_file)
3191 fprintf (dump_file, " Adding insn %d into chain's #%d queue\n",
3192 insn_uid, chain_id);
3193 bitmap_set_bit (queue, insn_uid);
3194 }
3195
3196 /* For DImode conversion, mark the register defined by DEF as requiring
3197 conversion. */
3198
3199 void
3200 dimode_scalar_chain::mark_dual_mode_def (df_ref def)
3201 {
3202 gcc_assert (DF_REF_REG_DEF_P (def));
3203
3204 if (bitmap_bit_p (defs_conv, DF_REF_REGNO (def)))
3205 return;
3206
3207 if (dump_file)
3208 fprintf (dump_file,
3209 " Mark r%d def in insn %d as requiring both modes in chain #%d\n",
3210 DF_REF_REGNO (def), DF_REF_INSN_UID (def), chain_id);
3211
3212 bitmap_set_bit (defs_conv, DF_REF_REGNO (def));
3213 }
3214
3215 /* For TImode conversion, this is unused. */
3216
3217 void
3218 timode_scalar_chain::mark_dual_mode_def (df_ref)
3219 {
3220 gcc_unreachable ();
3221 }
3222
3223 /* Check REF's chain to add new insns into a queue
3224 and find registers requiring conversion. */
3225
3226 void
3227 scalar_chain::analyze_register_chain (bitmap candidates, df_ref ref)
3228 {
3229 df_link *chain;
3230
3231 gcc_assert (bitmap_bit_p (insns, DF_REF_INSN_UID (ref))
3232 || bitmap_bit_p (candidates, DF_REF_INSN_UID (ref)));
3233 add_to_queue (DF_REF_INSN_UID (ref));
3234
3235 for (chain = DF_REF_CHAIN (ref); chain; chain = chain->next)
3236 {
3237 unsigned uid = DF_REF_INSN_UID (chain->ref);
3238
3239 if (!NONDEBUG_INSN_P (DF_REF_INSN (chain->ref)))
3240 continue;
3241
3242 if (!DF_REF_REG_MEM_P (chain->ref))
3243 {
3244 if (bitmap_bit_p (insns, uid))
3245 continue;
3246
3247 if (bitmap_bit_p (candidates, uid))
3248 {
3249 add_to_queue (uid);
3250 continue;
3251 }
3252 }
3253
3254 if (DF_REF_REG_DEF_P (chain->ref))
3255 {
3256 if (dump_file)
3257 fprintf (dump_file, " r%d def in insn %d isn't convertible\n",
3258 DF_REF_REGNO (chain->ref), uid);
3259 mark_dual_mode_def (chain->ref);
3260 }
3261 else
3262 {
3263 if (dump_file)
3264 fprintf (dump_file, " r%d use in insn %d isn't convertible\n",
3265 DF_REF_REGNO (chain->ref), uid);
3266 mark_dual_mode_def (ref);
3267 }
3268 }
3269 }
3270
3271 /* Add instruction into a chain. */
3272
3273 void
3274 scalar_chain::add_insn (bitmap candidates, unsigned int insn_uid)
3275 {
3276 if (bitmap_bit_p (insns, insn_uid))
3277 return;
3278
3279 if (dump_file)
3280 fprintf (dump_file, " Adding insn %d to chain #%d\n", insn_uid, chain_id);
3281
3282 bitmap_set_bit (insns, insn_uid);
3283
3284 rtx_insn *insn = DF_INSN_UID_GET (insn_uid)->insn;
3285 rtx def_set = single_set (insn);
3286 if (def_set && REG_P (SET_DEST (def_set))
3287 && !HARD_REGISTER_P (SET_DEST (def_set)))
3288 bitmap_set_bit (defs, REGNO (SET_DEST (def_set)));
3289
3290 df_ref ref;
3291 df_ref def;
3292 for (ref = DF_INSN_UID_DEFS (insn_uid); ref; ref = DF_REF_NEXT_LOC (ref))
3293 if (!HARD_REGISTER_P (DF_REF_REG (ref)))
3294 for (def = DF_REG_DEF_CHAIN (DF_REF_REGNO (ref));
3295 def;
3296 def = DF_REF_NEXT_REG (def))
3297 analyze_register_chain (candidates, def);
3298 for (ref = DF_INSN_UID_USES (insn_uid); ref; ref = DF_REF_NEXT_LOC (ref))
3299 if (!DF_REF_REG_MEM_P (ref))
3300 analyze_register_chain (candidates, ref);
3301 }
3302
3303 /* Build a new chain starting from insn INSN_UID, recursively
3304 adding all dependent uses and definitions. */
3305
3306 void
3307 scalar_chain::build (bitmap candidates, unsigned insn_uid)
3308 {
3309 queue = BITMAP_ALLOC (NULL);
3310 bitmap_set_bit (queue, insn_uid);
3311
3312 if (dump_file)
3313 fprintf (dump_file, "Building chain #%d...\n", chain_id);
3314
3315 while (!bitmap_empty_p (queue))
3316 {
3317 insn_uid = bitmap_first_set_bit (queue);
3318 bitmap_clear_bit (queue, insn_uid);
3319 bitmap_clear_bit (candidates, insn_uid);
3320 add_insn (candidates, insn_uid);
3321 }
3322
3323 if (dump_file)
3324 {
3325 fprintf (dump_file, "Collected chain #%d...\n", chain_id);
3326 fprintf (dump_file, " insns: ");
3327 dump_bitmap (dump_file, insns);
3328 if (!bitmap_empty_p (defs_conv))
3329 {
3330 bitmap_iterator bi;
3331 unsigned id;
3332 const char *comma = "";
3333 fprintf (dump_file, " defs to convert: ");
3334 EXECUTE_IF_SET_IN_BITMAP (defs_conv, 0, id, bi)
3335 {
3336 fprintf (dump_file, "%sr%d", comma, id);
3337 comma = ", ";
3338 }
3339 fprintf (dump_file, "\n");
3340 }
3341 }
3342
3343 BITMAP_FREE (queue);
3344 }
3345
3346 /* Return the cost of building a vector constant
3347 instead of using a scalar one. */
3348
3349 int
3350 dimode_scalar_chain::vector_const_cost (rtx exp)
3351 {
3352 gcc_assert (CONST_INT_P (exp));
3353
3354 if (standard_sse_constant_p (exp, V2DImode))
3355 return COSTS_N_INSNS (1);
3356 return ix86_cost->sse_load[1];
3357 }
3358
3359 /* Compute a gain for chain conversion. */
3360
3361 int
3362 dimode_scalar_chain::compute_convert_gain ()
3363 {
3364 bitmap_iterator bi;
3365 unsigned insn_uid;
3366 int gain = 0;
3367 int cost = 0;
3368
3369 if (dump_file)
3370 fprintf (dump_file, "Computing gain for chain #%d...\n", chain_id);
3371
3372 EXECUTE_IF_SET_IN_BITMAP (insns, 0, insn_uid, bi)
3373 {
3374 rtx_insn *insn = DF_INSN_UID_GET (insn_uid)->insn;
3375 rtx def_set = single_set (insn);
3376 rtx src = SET_SRC (def_set);
3377 rtx dst = SET_DEST (def_set);
3378
3379 if (REG_P (src) && REG_P (dst))
3380 gain += COSTS_N_INSNS (2) - ix86_cost->sse_move;
3381 else if (REG_P (src) && MEM_P (dst))
3382 gain += 2 * ix86_cost->int_store[2] - ix86_cost->sse_store[1];
3383 else if (MEM_P (src) && REG_P (dst))
3384 gain += 2 * ix86_cost->int_load[2] - ix86_cost->sse_load[1];
3385 else if (GET_CODE (src) == PLUS
3386 || GET_CODE (src) == MINUS
3387 || GET_CODE (src) == IOR
3388 || GET_CODE (src) == XOR
3389 || GET_CODE (src) == AND)
3390 {
3391 gain += ix86_cost->add;
3392 if (CONST_INT_P (XEXP (src, 0)))
3393 gain -= vector_const_cost (XEXP (src, 0));
3394 if (CONST_INT_P (XEXP (src, 1)))
3395 gain -= vector_const_cost (XEXP (src, 1));
3396 }
3397 else if (GET_CODE (src) == COMPARE)
3398 {
3399 /* Assume comparison cost is the same. */
3400 }
3401 else if (GET_CODE (src) == CONST_INT)
3402 {
3403 if (REG_P (dst))
3404 gain += COSTS_N_INSNS (2);
3405 else if (MEM_P (dst))
3406 gain += 2 * ix86_cost->int_store[2] - ix86_cost->sse_store[1];
3407 gain -= vector_const_cost (src);
3408 }
3409 else
3410 gcc_unreachable ();
3411 }
3412
3413 if (dump_file)
3414 fprintf (dump_file, " Instruction conversion gain: %d\n", gain);
3415
3416 EXECUTE_IF_SET_IN_BITMAP (defs_conv, 0, insn_uid, bi)
3417 cost += DF_REG_DEF_COUNT (insn_uid) * ix86_cost->mmxsse_to_integer;
3418
3419 if (dump_file)
3420 fprintf (dump_file, " Registers conversion cost: %d\n", cost);
3421
3422 gain -= cost;
3423
3424 if (dump_file)
3425 fprintf (dump_file, " Total gain: %d\n", gain);
3426
3427 return gain;
3428 }
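/* Informal summary of the computation above: every chain insn contributes
   its estimated scalar cost minus the corresponding vector cost, and each
   register in DEFS_CONV additionally costs
   DF_REG_DEF_COUNT (regno) * ix86_cost->mmxsse_to_integer for moving
   between the SSE and integer units.  A chain is converted only when the
   resulting total gain is positive (see convert_scalars_to_vector).  */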
3429
3430 /* Replace REG in X with a V2DI subreg of NEW_REG. */
3431
3432 rtx
3433 dimode_scalar_chain::replace_with_subreg (rtx x, rtx reg, rtx new_reg)
3434 {
3435 if (x == reg)
3436 return gen_rtx_SUBREG (V2DImode, new_reg, 0);
3437
3438 const char *fmt = GET_RTX_FORMAT (GET_CODE (x));
3439 int i, j;
3440 for (i = GET_RTX_LENGTH (GET_CODE (x)) - 1; i >= 0; i--)
3441 {
3442 if (fmt[i] == 'e')
3443 XEXP (x, i) = replace_with_subreg (XEXP (x, i), reg, new_reg);
3444 else if (fmt[i] == 'E')
3445 for (j = XVECLEN (x, i) - 1; j >= 0; j--)
3446 XVECEXP (x, i, j) = replace_with_subreg (XVECEXP (x, i, j),
3447 reg, new_reg);
3448 }
3449
3450 return x;
3451 }
3452
3453 /* Replace REG in INSN with a V2DI subreg of NEW_REG. */
3454
3455 void
3456 dimode_scalar_chain::replace_with_subreg_in_insn (rtx_insn *insn,
3457 rtx reg, rtx new_reg)
3458 {
3459 replace_with_subreg (single_set (insn), reg, new_reg);
3460 }
3461
3462 /* Insert the generated conversion instruction sequence INSNS
3463 after instruction AFTER. A new basic block may be required
3464 if the instruction has an EH region attached. */
3465
3466 void
3467 scalar_chain::emit_conversion_insns (rtx insns, rtx_insn *after)
3468 {
3469 if (!control_flow_insn_p (after))
3470 {
3471 emit_insn_after (insns, after);
3472 return;
3473 }
3474
3475 basic_block bb = BLOCK_FOR_INSN (after);
3476 edge e = find_fallthru_edge (bb->succs);
3477 gcc_assert (e);
3478
3479 basic_block new_bb = split_edge (e);
3480 emit_insn_after (insns, BB_HEAD (new_bb));
3481 }
3482
3483 /* Make vector copies for all definitions of register REGNO
3484 and replace its uses in the chain. */
3485
3486 void
3487 dimode_scalar_chain::make_vector_copies (unsigned regno)
3488 {
3489 rtx reg = regno_reg_rtx[regno];
3490 rtx vreg = gen_reg_rtx (DImode);
3491 df_ref ref;
3492
3493 for (ref = DF_REG_DEF_CHAIN (regno); ref; ref = DF_REF_NEXT_REG (ref))
3494 if (!bitmap_bit_p (insns, DF_REF_INSN_UID (ref)))
3495 {
3496 rtx_insn *insn = DF_REF_INSN (ref);
3497
3498 start_sequence ();
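/* Three strategies follow: with SSE4.1 the 64-bit value is built in the
   vector register (roughly MOVD + PINSRD); otherwise, if direct moves
   from integer to vector registers are preferred, both 32-bit halves are
   loaded with MOVD and merged with PUNPCKLDQ; as a last resort the value
   goes through a stack temporary.  */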
3499 if (TARGET_SSE4_1)
3500 {
3501 emit_insn (gen_sse2_loadld (gen_rtx_SUBREG (V4SImode, vreg, 0),
3502 CONST0_RTX (V4SImode),
3503 gen_rtx_SUBREG (SImode, reg, 0)));
3504 emit_insn (gen_sse4_1_pinsrd (gen_rtx_SUBREG (V4SImode, vreg, 0),
3505 gen_rtx_SUBREG (V4SImode, vreg, 0),
3506 gen_rtx_SUBREG (SImode, reg, 4),
3507 GEN_INT (2)));
3508 }
3509 else if (TARGET_INTER_UNIT_MOVES_TO_VEC)
3510 {
3511 rtx tmp = gen_reg_rtx (DImode);
3512 emit_insn (gen_sse2_loadld (gen_rtx_SUBREG (V4SImode, vreg, 0),
3513 CONST0_RTX (V4SImode),
3514 gen_rtx_SUBREG (SImode, reg, 0)));
3515 emit_insn (gen_sse2_loadld (gen_rtx_SUBREG (V4SImode, tmp, 0),
3516 CONST0_RTX (V4SImode),
3517 gen_rtx_SUBREG (SImode, reg, 4)));
3518 emit_insn (gen_vec_interleave_lowv4si
3519 (gen_rtx_SUBREG (V4SImode, vreg, 0),
3520 gen_rtx_SUBREG (V4SImode, vreg, 0),
3521 gen_rtx_SUBREG (V4SImode, tmp, 0)));
3522 }
3523 else
3524 {
3525 rtx tmp = assign_386_stack_local (DImode, SLOT_TEMP);
3526 emit_move_insn (adjust_address (tmp, SImode, 0),
3527 gen_rtx_SUBREG (SImode, reg, 0));
3528 emit_move_insn (adjust_address (tmp, SImode, 4),
3529 gen_rtx_SUBREG (SImode, reg, 4));
3530 emit_move_insn (vreg, tmp);
3531 }
3532 rtx_insn *seq = get_insns ();
3533 end_sequence ();
3534 emit_conversion_insns (seq, insn);
3535
3536 if (dump_file)
3537 fprintf (dump_file,
3538 " Copied r%d to a vector register r%d for insn %d\n",
3539 regno, REGNO (vreg), DF_REF_INSN_UID (ref));
3540 }
3541
3542 for (ref = DF_REG_USE_CHAIN (regno); ref; ref = DF_REF_NEXT_REG (ref))
3543 if (bitmap_bit_p (insns, DF_REF_INSN_UID (ref)))
3544 {
3545 replace_with_subreg_in_insn (DF_REF_INSN (ref), reg, vreg);
3546
3547 if (dump_file)
3548 fprintf (dump_file, " Replaced r%d with r%d in insn %d\n",
3549 regno, REGNO (vreg), DF_REF_INSN_UID (ref));
3550 }
3551 }
3552
3553 /* Convert all definitions of register REGNO
3554 and fix its uses. Scalar copies may be created
3555 if the register is used in a non-convertible insn. */
3556
3557 void
3558 dimode_scalar_chain::convert_reg (unsigned regno)
3559 {
3560 bool scalar_copy = bitmap_bit_p (defs_conv, regno);
3561 rtx reg = regno_reg_rtx[regno];
3562 rtx scopy = NULL_RTX;
3563 df_ref ref;
3564 bitmap conv;
3565
3566 conv = BITMAP_ALLOC (NULL);
3567 bitmap_copy (conv, insns);
3568
3569 if (scalar_copy)
3570 scopy = gen_reg_rtx (DImode);
3571
3572 for (ref = DF_REG_DEF_CHAIN (regno); ref; ref = DF_REF_NEXT_REG (ref))
3573 {
3574 rtx_insn *insn = DF_REF_INSN (ref);
3575 rtx def_set = single_set (insn);
3576 rtx src = SET_SRC (def_set);
3577 rtx reg = DF_REF_REG (ref);
3578
3579 if (!MEM_P (src))
3580 {
3581 replace_with_subreg_in_insn (insn, reg, reg);
3582 bitmap_clear_bit (conv, INSN_UID (insn));
3583 }
3584
3585 if (scalar_copy)
3586 {
3587 rtx vcopy = gen_reg_rtx (V2DImode);
3588
3589 start_sequence ();
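/* Mirror of make_vector_copies: extract the two 32-bit halves of the
   vector value back into the scalar copy, either with direct
   vector-to-integer moves plus a logical right shift by 32, or through a
   stack temporary when such moves are not preferred.  */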
3590 if (TARGET_INTER_UNIT_MOVES_FROM_VEC)
3591 {
3592 emit_move_insn (vcopy, gen_rtx_SUBREG (V2DImode, reg, 0));
3593 emit_move_insn (gen_rtx_SUBREG (SImode, scopy, 0),
3594 gen_rtx_SUBREG (SImode, vcopy, 0));
3595 emit_move_insn (vcopy,
3596 gen_rtx_LSHIFTRT (V2DImode, vcopy, GEN_INT (32)));
3597 emit_move_insn (gen_rtx_SUBREG (SImode, scopy, 4),
3598 gen_rtx_SUBREG (SImode, vcopy, 0));
3599 }
3600 else
3601 {
3602 rtx tmp = assign_386_stack_local (DImode, SLOT_TEMP);
3603 emit_move_insn (tmp, reg);
3604 emit_move_insn (gen_rtx_SUBREG (SImode, scopy, 0),
3605 adjust_address (tmp, SImode, 0));
3606 emit_move_insn (gen_rtx_SUBREG (SImode, scopy, 4),
3607 adjust_address (tmp, SImode, 4));
3608 }
3609 rtx_insn *seq = get_insns ();
3610 end_sequence ();
3611 emit_conversion_insns (seq, insn);
3612
3613 if (dump_file)
3614 fprintf (dump_file,
3615 " Copied r%d to a scalar register r%d for insn %d\n",
3616 regno, REGNO (scopy), INSN_UID (insn));
3617 }
3618 }
3619
3620 for (ref = DF_REG_USE_CHAIN (regno); ref; ref = DF_REF_NEXT_REG (ref))
3621 if (bitmap_bit_p (insns, DF_REF_INSN_UID (ref)))
3622 {
3623 if (bitmap_bit_p (conv, DF_REF_INSN_UID (ref)))
3624 {
3625 rtx def_set = single_set (DF_REF_INSN (ref));
3626 if (!MEM_P (SET_DEST (def_set))
3627 || !REG_P (SET_SRC (def_set)))
3628 replace_with_subreg_in_insn (DF_REF_INSN (ref), reg, reg);
3629 bitmap_clear_bit (conv, DF_REF_INSN_UID (ref));
3630 }
3631 }
3632 /* Skip debug insns and uninitialized uses. */
3633 else if (DF_REF_CHAIN (ref)
3634 && NONDEBUG_INSN_P (DF_REF_INSN (ref)))
3635 {
3636 gcc_assert (scopy);
3637 replace_rtx (DF_REF_INSN (ref), reg, scopy);
3638 df_insn_rescan (DF_REF_INSN (ref));
3639 }
3640
3641 BITMAP_FREE (conv);
3642 }
3643
3644 /* Convert operand OP in INSN. We should handle
3645 memory operands and uninitialized registers.
3646 All other register uses are converted during
3647 register conversion. */
3648
3649 void
3650 dimode_scalar_chain::convert_op (rtx *op, rtx_insn *insn)
3651 {
3652 *op = copy_rtx_if_shared (*op);
3653
3654 if (GET_CODE (*op) == NOT)
3655 {
3656 convert_op (&XEXP (*op, 0), insn);
3657 PUT_MODE (*op, V2DImode);
3658 }
3659 else if (MEM_P (*op))
3660 {
3661 rtx tmp = gen_reg_rtx (DImode);
3662
3663 emit_insn_before (gen_move_insn (tmp, *op), insn);
3664 *op = gen_rtx_SUBREG (V2DImode, tmp, 0);
3665
3666 if (dump_file)
3667 fprintf (dump_file, " Preloading operand for insn %d into r%d\n",
3668 INSN_UID (insn), REGNO (tmp));
3669 }
3670 else if (REG_P (*op))
3671 {
3672 /* We may not have converted the register use if
3673 this register has no definition. Otherwise it
3674 should have been converted in convert_reg. */
3675 df_ref ref;
3676 FOR_EACH_INSN_USE (ref, insn)
3677 if (DF_REF_REGNO (ref) == REGNO (*op))
3678 {
3679 gcc_assert (!DF_REF_CHAIN (ref));
3680 break;
3681 }
3682 *op = gen_rtx_SUBREG (V2DImode, *op, 0);
3683 }
3684 else if (CONST_INT_P (*op))
3685 {
3686 rtx vec_cst;
3687 rtx tmp = gen_rtx_SUBREG (V2DImode, gen_reg_rtx (DImode), 0);
3688
3689 /* Prefer an all-ones vector for -1. */
3690 if (constm1_operand (*op, GET_MODE (*op)))
3691 vec_cst = CONSTM1_RTX (V2DImode);
3692 else
3693 vec_cst = gen_rtx_CONST_VECTOR (V2DImode,
3694 gen_rtvec (2, *op, const0_rtx));
3695
3696 if (!standard_sse_constant_p (vec_cst, V2DImode))
3697 {
3698 start_sequence ();
3699 vec_cst = validize_mem (force_const_mem (V2DImode, vec_cst));
3700 rtx_insn *seq = get_insns ();
3701 end_sequence ();
3702 emit_insn_before (seq, insn);
3703 }
3704
3705 emit_insn_before (gen_move_insn (tmp, vec_cst), insn);
3706 *op = tmp;
3707 }
3708 else
3709 {
3710 gcc_assert (SUBREG_P (*op));
3711 gcc_assert (GET_MODE (*op) == V2DImode);
3712 }
3713 }
3714
3715 /* Convert INSN to vector mode. */
3716
3717 void
3718 dimode_scalar_chain::convert_insn (rtx_insn *insn)
3719 {
3720 rtx def_set = single_set (insn);
3721 rtx src = SET_SRC (def_set);
3722 rtx dst = SET_DEST (def_set);
3723 rtx subreg;
3724
3725 if (MEM_P (dst) && !REG_P (src))
3726 {
3727 /* The vector operation cannot store its result directly to
3728 memory here, so a temporary register is required. */
3729 rtx tmp = gen_reg_rtx (DImode);
3730 emit_conversion_insns (gen_move_insn (dst, tmp), insn);
3731 dst = gen_rtx_SUBREG (V2DImode, tmp, 0);
3732 }
3733
3734 switch (GET_CODE (src))
3735 {
3736 case PLUS:
3737 case MINUS:
3738 case IOR:
3739 case XOR:
3740 case AND:
3741 convert_op (&XEXP (src, 0), insn);
3742 convert_op (&XEXP (src, 1), insn);
3743 PUT_MODE (src, V2DImode);
3744 break;
3745
3746 case MEM:
3747 if (!REG_P (dst))
3748 convert_op (&src, insn);
3749 break;
3750
3751 case REG:
3752 if (!MEM_P (dst))
3753 convert_op (&src, insn);
3754 break;
3755
3756 case SUBREG:
3757 gcc_assert (GET_MODE (src) == V2DImode);
3758 break;
3759
3760 case COMPARE:
3761 src = SUBREG_REG (XEXP (XEXP (src, 0), 0));
3762
3763 gcc_assert ((REG_P (src) && GET_MODE (src) == DImode)
3764 || (SUBREG_P (src) && GET_MODE (src) == V2DImode));
3765
3766 if (REG_P (src))
3767 subreg = gen_rtx_SUBREG (V2DImode, src, 0);
3768 else
3769 subreg = copy_rtx_if_shared (src);
3770 emit_insn_before (gen_vec_interleave_lowv2di (copy_rtx_if_shared (subreg),
3771 copy_rtx_if_shared (subreg),
3772 copy_rtx_if_shared (subreg)),
3773 insn);
3774 dst = gen_rtx_REG (CCmode, FLAGS_REG);
3775 src = gen_rtx_UNSPEC (CCmode, gen_rtvec (2, copy_rtx_if_shared (src),
3776 copy_rtx_if_shared (src)),
3777 UNSPEC_PTEST);
3778 break;
3779
3780 case CONST_INT:
3781 convert_op (&src, insn);
3782 break;
3783
3784 default:
3785 gcc_unreachable ();
3786 }
3787
3788 SET_SRC (def_set) = src;
3789 SET_DEST (def_set) = dst;
3790
3791 /* Drop possible dead definitions. */
3792 PATTERN (insn) = def_set;
3793
3794 INSN_CODE (insn) = -1;
3795 recog_memoized (insn);
3796 df_insn_rescan (insn);
3797 }
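/* Note on the COMPARE case above (informal): the DImode value checked by
   convertible_comparison_p is duplicated into both vector elements with
   vec_interleave_lowv2di and the flags are then produced by an
   UNSPEC_PTEST pattern, i.e. roughly a PTEST instruction.  */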
3798
3799 /* Fix uses of converted REG in debug insns. */
3800
3801 void
3802 timode_scalar_chain::fix_debug_reg_uses (rtx reg)
3803 {
3804 if (!flag_var_tracking)
3805 return;
3806
3807 df_ref ref;
3808 for (ref = DF_REG_USE_CHAIN (REGNO (reg));
3809 ref;
3810 ref = DF_REF_NEXT_REG (ref))
3811 {
3812 rtx_insn *insn = DF_REF_INSN (ref);
3813 if (DEBUG_INSN_P (insn))
3814 {
3815 /* It may be a debug insn with a TImode variable in
3816 register. */
3817 rtx val = PATTERN (insn);
3818 if (GET_MODE (val) != TImode)
3819 continue;
3820 gcc_assert (GET_CODE (val) == VAR_LOCATION);
3821 rtx loc = PAT_VAR_LOCATION_LOC (val);
3822 /* It may have been converted to TImode already. */
3823 if (GET_MODE (loc) == TImode)
3824 continue;
3825 gcc_assert (REG_P (loc)
3826 && GET_MODE (loc) == V1TImode);
3827 /* Convert the V1TImode register, which has already been updated
3828 by a SET insn, to a TImode SUBREG. */
3829 PAT_VAR_LOCATION_LOC (val) = gen_rtx_SUBREG (TImode, loc, 0);
3830 df_insn_rescan (insn);
3831 }
3832 }
3833 }
3834
3835 /* Convert INSN from TImode to V1TImode. */
3836
3837 void
3838 timode_scalar_chain::convert_insn (rtx_insn *insn)
3839 {
3840 rtx def_set = single_set (insn);
3841 rtx src = SET_SRC (def_set);
3842 rtx dst = SET_DEST (def_set);
3843
3844 switch (GET_CODE (dst))
3845 {
3846 case REG:
3847 {
3848 rtx tmp = find_reg_equal_equiv_note (insn);
3849 if (tmp)
3850 PUT_MODE (XEXP (tmp, 0), V1TImode);
3851 PUT_MODE (dst, V1TImode);
3852 fix_debug_reg_uses (dst);
3853 }
3854 break;
3855 case MEM:
3856 PUT_MODE (dst, V1TImode);
3857 break;
3858
3859 default:
3860 gcc_unreachable ();
3861 }
3862
3863 switch (GET_CODE (src))
3864 {
3865 case REG:
3866 PUT_MODE (src, V1TImode);
3867 /* Call fix_debug_reg_uses only if SRC is never defined. */
3868 if (!DF_REG_DEF_CHAIN (REGNO (src)))
3869 fix_debug_reg_uses (src);
3870 break;
3871
3872 case MEM:
3873 PUT_MODE (src, V1TImode);
3874 break;
3875
3876 case CONST_WIDE_INT:
3877 if (NONDEBUG_INSN_P (insn))
3878 {
3879 /* Since there are no instructions to store a 128-bit constant,
3880 a temporary register is required. */
3881 rtx tmp = gen_reg_rtx (V1TImode);
3882 src = gen_rtx_CONST_VECTOR (V1TImode, gen_rtvec (1, src));
3883 src = validize_mem (force_const_mem (V1TImode, src));
3884 emit_conversion_insns (gen_rtx_SET (dst, tmp), insn);
3885 dst = tmp;
3886 }
3887 break;
3888
3889 case CONST_INT:
3890 switch (standard_sse_constant_p (src, TImode))
3891 {
3892 case 1:
3893 src = CONST0_RTX (GET_MODE (dst));
3894 break;
3895 case 2:
3896 src = CONSTM1_RTX (GET_MODE (dst));
3897 break;
3898 default:
3899 gcc_unreachable ();
3900 }
3901 if (NONDEBUG_INSN_P (insn))
3902 {
3903 rtx tmp = gen_reg_rtx (V1TImode);
3904 /* Since there are no instructions to store a standard SSE
3905 constant, a temporary register is required. */
3906 emit_conversion_insns (gen_rtx_SET (dst, tmp), insn);
3907 dst = tmp;
3908 }
3909 break;
3910
3911 default:
3912 gcc_unreachable ();
3913 }
3914
3915 SET_SRC (def_set) = src;
3916 SET_DEST (def_set) = dst;
3917
3918 /* Drop possible dead definitions. */
3919 PATTERN (insn) = def_set;
3920
3921 INSN_CODE (insn) = -1;
3922 recog_memoized (insn);
3923 df_insn_rescan (insn);
3924 }
3925
3926 void
3927 dimode_scalar_chain::convert_registers ()
3928 {
3929 bitmap_iterator bi;
3930 unsigned id;
3931
3932 EXECUTE_IF_SET_IN_BITMAP (defs, 0, id, bi)
3933 convert_reg (id);
3934
3935 EXECUTE_IF_AND_COMPL_IN_BITMAP (defs_conv, defs, 0, id, bi)
3936 make_vector_copies (id);
3937 }
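/* Informal note: registers defined inside the chain (DEFS) are converted
   in place by convert_reg, while registers that are needed in both modes
   but defined outside the chain (DEFS_CONV minus DEFS) receive vector
   copies via make_vector_copies.  */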
3938
3939 /* Convert the whole chain, creating the required register
3940 conversions and copies. */
3941
3942 int
3943 scalar_chain::convert ()
3944 {
3945 bitmap_iterator bi;
3946 unsigned id;
3947 int converted_insns = 0;
3948
3949 if (!dbg_cnt (stv_conversion))
3950 return 0;
3951
3952 if (dump_file)
3953 fprintf (dump_file, "Converting chain #%d...\n", chain_id);
3954
3955 convert_registers ();
3956
3957 EXECUTE_IF_SET_IN_BITMAP (insns, 0, id, bi)
3958 {
3959 convert_insn (DF_INSN_UID_GET (id)->insn);
3960 converted_insns++;
3961 }
3962
3963 return converted_insns;
3964 }
3965
3966 /* Main STV pass function. Find and convert scalar
3967 instructions into vector mode when profitable. */
3968
3969 static unsigned int
3970 convert_scalars_to_vector ()
3971 {
3972 basic_block bb;
3973 bitmap candidates;
3974 int converted_insns = 0;
3975
3976 bitmap_obstack_initialize (NULL);
3977 candidates = BITMAP_ALLOC (NULL);
3978
3979 calculate_dominance_info (CDI_DOMINATORS);
3980 df_set_flags (DF_DEFER_INSN_RESCAN);
3981 df_chain_add_problem (DF_DU_CHAIN | DF_UD_CHAIN);
3982 df_md_add_problem ();
3983 df_analyze ();
3984
3985 /* Find all instructions we want to convert into vector mode. */
3986 if (dump_file)
3987 fprintf (dump_file, "Searching for mode conversion candidates...\n");
3988
3989 FOR_EACH_BB_FN (bb, cfun)
3990 {
3991 rtx_insn *insn;
3992 FOR_BB_INSNS (bb, insn)
3993 if (scalar_to_vector_candidate_p (insn))
3994 {
3995 if (dump_file)
3996 fprintf (dump_file, " insn %d is marked as a candidate\n",
3997 INSN_UID (insn));
3998
3999 bitmap_set_bit (candidates, INSN_UID (insn));
4000 }
4001 }
4002
4003 remove_non_convertible_regs (candidates);
4004
4005 if (bitmap_empty_p (candidates))
4006 if (dump_file)
4007 fprintf (dump_file, "There are no candidates for optimization.\n");
4008
4009 while (!bitmap_empty_p (candidates))
4010 {
4011 unsigned uid = bitmap_first_set_bit (candidates);
4012 scalar_chain *chain;
4013
4014 if (TARGET_64BIT)
4015 chain = new timode_scalar_chain;
4016 else
4017 chain = new dimode_scalar_chain;
4018
4019 /* Find instructions chain we want to convert to vector mode.
4020 Check all uses and definitions to estimate all required
4021 conversions. */
4022 chain->build (candidates, uid);
4023
4024 if (chain->compute_convert_gain () > 0)
4025 converted_insns += chain->convert ();
4026 else
4027 if (dump_file)
4028 fprintf (dump_file, "Chain #%d conversion is not profitable\n",
4029 chain->chain_id);
4030
4031 delete chain;
4032 }
4033
4034 if (dump_file)
4035 fprintf (dump_file, "Total insns converted: %d\n", converted_insns);
4036
4037 BITMAP_FREE (candidates);
4038 bitmap_obstack_release (NULL);
4039 df_process_deferred_rescans ();
4040
4041 /* Conversion means we may have 128-bit register spills/fills,
4042 which require an aligned stack. */
4043 if (converted_insns)
4044 {
4045 if (crtl->stack_alignment_needed < 128)
4046 crtl->stack_alignment_needed = 128;
4047 if (crtl->stack_alignment_estimated < 128)
4048 crtl->stack_alignment_estimated = 128;
4049 }
4050
4051 return 0;
4052 }
4053
4054 namespace {
4055
4056 const pass_data pass_data_insert_vzeroupper =
4057 {
4058 RTL_PASS, /* type */
4059 "vzeroupper", /* name */
4060 OPTGROUP_NONE, /* optinfo_flags */
4061 TV_MACH_DEP, /* tv_id */
4062 0, /* properties_required */
4063 0, /* properties_provided */
4064 0, /* properties_destroyed */
4065 0, /* todo_flags_start */
4066 TODO_df_finish, /* todo_flags_finish */
4067 };
4068
4069 class pass_insert_vzeroupper : public rtl_opt_pass
4070 {
4071 public:
4072 pass_insert_vzeroupper(gcc::context *ctxt)
4073 : rtl_opt_pass(pass_data_insert_vzeroupper, ctxt)
4074 {}
4075
4076 /* opt_pass methods: */
4077 virtual bool gate (function *)
4078 {
4079 return TARGET_AVX && !TARGET_AVX512F
4080 && TARGET_VZEROUPPER && flag_expensive_optimizations
4081 && !optimize_size;
4082 }
4083
4084 virtual unsigned int execute (function *)
4085 {
4086 return rest_of_handle_insert_vzeroupper ();
4087 }
4088
4089 }; // class pass_insert_vzeroupper
4090
4091 const pass_data pass_data_stv =
4092 {
4093 RTL_PASS, /* type */
4094 "stv", /* name */
4095 OPTGROUP_NONE, /* optinfo_flags */
4096 TV_MACH_DEP, /* tv_id */
4097 0, /* properties_required */
4098 0, /* properties_provided */
4099 0, /* properties_destroyed */
4100 0, /* todo_flags_start */
4101 TODO_df_finish, /* todo_flags_finish */
4102 };
4103
4104 class pass_stv : public rtl_opt_pass
4105 {
4106 public:
4107 pass_stv (gcc::context *ctxt)
4108 : rtl_opt_pass (pass_data_stv, ctxt),
4109 timode_p (false)
4110 {}
4111
4112 /* opt_pass methods: */
4113 virtual bool gate (function *)
4114 {
4115 return (timode_p == !!TARGET_64BIT
4116 && TARGET_STV && TARGET_SSE2 && optimize > 1);
4117 }
4118
4119 virtual unsigned int execute (function *)
4120 {
4121 return convert_scalars_to_vector ();
4122 }
4123
4124 opt_pass *clone ()
4125 {
4126 return new pass_stv (m_ctxt);
4127 }
4128
4129 void set_pass_param (unsigned int n, bool param)
4130 {
4131 gcc_assert (n == 0);
4132 timode_p = param;
4133 }
4134
4135 private:
4136 bool timode_p;
4137 }; // class pass_stv
4138
4139 } // anon namespace
4140
4141 rtl_opt_pass *
4142 make_pass_insert_vzeroupper (gcc::context *ctxt)
4143 {
4144 return new pass_insert_vzeroupper (ctxt);
4145 }
4146
4147 rtl_opt_pass *
4148 make_pass_stv (gcc::context *ctxt)
4149 {
4150 return new pass_stv (ctxt);
4151 }
4152
4153 /* Return true if a red-zone is in use. */
4154
4155 bool
4156 ix86_using_red_zone (void)
4157 {
4158 return TARGET_RED_ZONE && !TARGET_64BIT_MS_ABI;
4159 }
4160 \f
4161 /* Return a string that documents the current -m options. The caller is
4162 responsible for freeing the string. */
4163
4164 static char *
4165 ix86_target_string (HOST_WIDE_INT isa, int flags, int ix86_flags,
4166 const char *arch, const char *tune,
4167 enum fpmath_unit fpmath, bool add_nl_p)
4168 {
4169 struct ix86_target_opts
4170 {
4171 const char *option; /* option string */
4172 HOST_WIDE_INT mask; /* isa mask options */
4173 };
4174
4175 /* This table is ordered so that options like -msse4.2, which imply
4176 preceding options, are matched first. */
4177 static struct ix86_target_opts isa_opts[] =
4178 {
4179 { "-mfma4", OPTION_MASK_ISA_FMA4 },
4180 { "-mfma", OPTION_MASK_ISA_FMA },
4181 { "-mxop", OPTION_MASK_ISA_XOP },
4182 { "-mlwp", OPTION_MASK_ISA_LWP },
4183 { "-mavx512f", OPTION_MASK_ISA_AVX512F },
4184 { "-mavx512er", OPTION_MASK_ISA_AVX512ER },
4185 { "-mavx512cd", OPTION_MASK_ISA_AVX512CD },
4186 { "-mavx512pf", OPTION_MASK_ISA_AVX512PF },
4187 { "-mavx512dq", OPTION_MASK_ISA_AVX512DQ },
4188 { "-mavx512bw", OPTION_MASK_ISA_AVX512BW },
4189 { "-mavx512vl", OPTION_MASK_ISA_AVX512VL },
4190 { "-mavx512ifma", OPTION_MASK_ISA_AVX512IFMA },
4191 { "-mavx512vbmi", OPTION_MASK_ISA_AVX512VBMI },
4192 { "-msse4a", OPTION_MASK_ISA_SSE4A },
4193 { "-msse4.2", OPTION_MASK_ISA_SSE4_2 },
4194 { "-msse4.1", OPTION_MASK_ISA_SSE4_1 },
4195 { "-mssse3", OPTION_MASK_ISA_SSSE3 },
4196 { "-msse3", OPTION_MASK_ISA_SSE3 },
4197 { "-msse2", OPTION_MASK_ISA_SSE2 },
4198 { "-msse", OPTION_MASK_ISA_SSE },
4199 { "-m3dnow", OPTION_MASK_ISA_3DNOW },
4200 { "-m3dnowa", OPTION_MASK_ISA_3DNOW_A },
4201 { "-mmmx", OPTION_MASK_ISA_MMX },
4202 { "-mabm", OPTION_MASK_ISA_ABM },
4203 { "-mbmi", OPTION_MASK_ISA_BMI },
4204 { "-mbmi2", OPTION_MASK_ISA_BMI2 },
4205 { "-mlzcnt", OPTION_MASK_ISA_LZCNT },
4206 { "-mhle", OPTION_MASK_ISA_HLE },
4207 { "-mfxsr", OPTION_MASK_ISA_FXSR },
4208 { "-mrdseed", OPTION_MASK_ISA_RDSEED },
4209 { "-mprfchw", OPTION_MASK_ISA_PRFCHW },
4210 { "-madx", OPTION_MASK_ISA_ADX },
4211 { "-mtbm", OPTION_MASK_ISA_TBM },
4212 { "-mpopcnt", OPTION_MASK_ISA_POPCNT },
4213 { "-mmovbe", OPTION_MASK_ISA_MOVBE },
4214 { "-mcrc32", OPTION_MASK_ISA_CRC32 },
4215 { "-maes", OPTION_MASK_ISA_AES },
4216 { "-msha", OPTION_MASK_ISA_SHA },
4217 { "-mpclmul", OPTION_MASK_ISA_PCLMUL },
4218 { "-mfsgsbase", OPTION_MASK_ISA_FSGSBASE },
4219 { "-mrdrnd", OPTION_MASK_ISA_RDRND },
4220 { "-mf16c", OPTION_MASK_ISA_F16C },
4221 { "-mrtm", OPTION_MASK_ISA_RTM },
4222 { "-mxsave", OPTION_MASK_ISA_XSAVE },
4223 { "-mxsaveopt", OPTION_MASK_ISA_XSAVEOPT },
4224 { "-mprefetchwt1", OPTION_MASK_ISA_PREFETCHWT1 },
4225 { "-mclflushopt", OPTION_MASK_ISA_CLFLUSHOPT },
4226 { "-mxsavec", OPTION_MASK_ISA_XSAVEC },
4227 { "-mxsaves", OPTION_MASK_ISA_XSAVES },
4228 { "-mmpx", OPTION_MASK_ISA_MPX },
4229 { "-mclwb", OPTION_MASK_ISA_CLWB },
4230 { "-mmwaitx", OPTION_MASK_ISA_MWAITX },
4231 { "-mclzero", OPTION_MASK_ISA_CLZERO },
4232 { "-mpku", OPTION_MASK_ISA_PKU },
4233 };
4234
4235 /* Flag options. */
4236 static struct ix86_target_opts flag_opts[] =
4237 {
4238 { "-m128bit-long-double", MASK_128BIT_LONG_DOUBLE },
4239 { "-mlong-double-128", MASK_LONG_DOUBLE_128 },
4240 { "-mlong-double-64", MASK_LONG_DOUBLE_64 },
4241 { "-m80387", MASK_80387 },
4242 { "-maccumulate-outgoing-args", MASK_ACCUMULATE_OUTGOING_ARGS },
4243 { "-malign-double", MASK_ALIGN_DOUBLE },
4244 { "-mcld", MASK_CLD },
4245 { "-mfp-ret-in-387", MASK_FLOAT_RETURNS },
4246 { "-mieee-fp", MASK_IEEE_FP },
4247 { "-minline-all-stringops", MASK_INLINE_ALL_STRINGOPS },
4248 { "-minline-stringops-dynamically", MASK_INLINE_STRINGOPS_DYNAMICALLY },
4249 { "-mms-bitfields", MASK_MS_BITFIELD_LAYOUT },
4250 { "-mno-align-stringops", MASK_NO_ALIGN_STRINGOPS },
4251 { "-mno-fancy-math-387", MASK_NO_FANCY_MATH_387 },
4252 { "-mno-push-args", MASK_NO_PUSH_ARGS },
4253 { "-mno-red-zone", MASK_NO_RED_ZONE },
4254 { "-momit-leaf-frame-pointer", MASK_OMIT_LEAF_FRAME_POINTER },
4255 { "-mrecip", MASK_RECIP },
4256 { "-mrtd", MASK_RTD },
4257 { "-msseregparm", MASK_SSEREGPARM },
4258 { "-mstack-arg-probe", MASK_STACK_PROBE },
4259 { "-mtls-direct-seg-refs", MASK_TLS_DIRECT_SEG_REFS },
4260 { "-mvect8-ret-in-mem", MASK_VECT8_RETURNS },
4261 { "-m8bit-idiv", MASK_USE_8BIT_IDIV },
4262 { "-mvzeroupper", MASK_VZEROUPPER },
4263 { "-mstv", MASK_STV},
4264 { "-mavx256-split-unaligned-load", MASK_AVX256_SPLIT_UNALIGNED_LOAD},
4265 { "-mavx256-split-unaligned-store", MASK_AVX256_SPLIT_UNALIGNED_STORE},
4266 { "-mprefer-avx128", MASK_PREFER_AVX128},
4267 };
4268
4269 /* Additional flag options. */
4270 static struct ix86_target_opts ix86_flag_opts[] =
4271 {
4272 { "-mgeneral-regs-only", OPTION_MASK_GENERAL_REGS_ONLY },
4273 };
4274
4275 const char *opts[ARRAY_SIZE (isa_opts) + ARRAY_SIZE (flag_opts)
4276 + ARRAY_SIZE (ix86_flag_opts) + 6][2];
4277
4278 char isa_other[40];
4279 char target_other[40];
4280 char ix86_target_other[40];
4281 unsigned num = 0;
4282 unsigned i, j;
4283 char *ret;
4284 char *ptr;
4285 size_t len;
4286 size_t line_len;
4287 size_t sep_len;
4288 const char *abi;
4289
4290 memset (opts, '\0', sizeof (opts));
4291
4292 /* Add -march= option. */
4293 if (arch)
4294 {
4295 opts[num][0] = "-march=";
4296 opts[num++][1] = arch;
4297 }
4298
4299 /* Add -mtune= option. */
4300 if (tune)
4301 {
4302 opts[num][0] = "-mtune=";
4303 opts[num++][1] = tune;
4304 }
4305
4306 /* Add -m32/-m64/-mx32. */
4307 if ((isa & OPTION_MASK_ISA_64BIT) != 0)
4308 {
4309 if ((isa & OPTION_MASK_ABI_64) != 0)
4310 abi = "-m64";
4311 else
4312 abi = "-mx32";
4313 isa &= ~ (OPTION_MASK_ISA_64BIT
4314 | OPTION_MASK_ABI_64
4315 | OPTION_MASK_ABI_X32);
4316 }
4317 else
4318 abi = "-m32";
4319 opts[num++][0] = abi;
4320
4321 /* Pick out the options in isa options. */
4322 for (i = 0; i < ARRAY_SIZE (isa_opts); i++)
4323 {
4324 if ((isa & isa_opts[i].mask) != 0)
4325 {
4326 opts[num++][0] = isa_opts[i].option;
4327 isa &= ~ isa_opts[i].mask;
4328 }
4329 }
4330
4331 if (isa && add_nl_p)
4332 {
4333 opts[num++][0] = isa_other;
4334 sprintf (isa_other, "(other isa: %#" HOST_WIDE_INT_PRINT "x)",
4335 isa);
4336 }
4337
4338 /* Add flag options. */
4339 for (i = 0; i < ARRAY_SIZE (flag_opts); i++)
4340 {
4341 if ((flags & flag_opts[i].mask) != 0)
4342 {
4343 opts[num++][0] = flag_opts[i].option;
4344 flags &= ~ flag_opts[i].mask;
4345 }
4346 }
4347
4348 if (flags && add_nl_p)
4349 {
4350 opts[num++][0] = target_other;
4351 sprintf (target_other, "(other flags: %#x)", flags);
4352 }
4353
4354 /* Add additional flag options. */
4355 for (i = 0; i < ARRAY_SIZE (ix86_flag_opts); i++)
4356 {
4357 if ((ix86_flags & ix86_flag_opts[i].mask) != 0)
4358 {
4359 opts[num++][0] = ix86_flag_opts[i].option;
4360 ix86_flags &= ~ ix86_flag_opts[i].mask;
4361 }
4362 }
4363
4364 if (ix86_flags && add_nl_p)
4365 {
4366 opts[num++][0] = ix86_target_other;
4367 sprintf (ix86_target_other, "(other flags: %#x)", ix86_flags);
4368 }
4369
4370 /* Add -fpmath= option. */
4371 if (fpmath)
4372 {
4373 opts[num][0] = "-mfpmath=";
4374 switch ((int) fpmath)
4375 {
4376 case FPMATH_387:
4377 opts[num++][1] = "387";
4378 break;
4379
4380 case FPMATH_SSE:
4381 opts[num++][1] = "sse";
4382 break;
4383
4384 case FPMATH_387 | FPMATH_SSE:
4385 opts[num++][1] = "sse+387";
4386 break;
4387
4388 default:
4389 gcc_unreachable ();
4390 }
4391 }
4392
4393 /* Any options? */
4394 if (num == 0)
4395 return NULL;
4396
4397 gcc_assert (num < ARRAY_SIZE (opts));
4398
4399 /* Size the string. */
4400 len = 0;
4401 sep_len = (add_nl_p) ? 3 : 1;
4402 for (i = 0; i < num; i++)
4403 {
4404 len += sep_len;
4405 for (j = 0; j < 2; j++)
4406 if (opts[i][j])
4407 len += strlen (opts[i][j]);
4408 }
4409
4410 /* Build the string. */
4411 ret = ptr = (char *) xmalloc (len);
4412 line_len = 0;
4413
4414 for (i = 0; i < num; i++)
4415 {
4416 size_t len2[2];
4417
4418 for (j = 0; j < 2; j++)
4419 len2[j] = (opts[i][j]) ? strlen (opts[i][j]) : 0;
4420
4421 if (i != 0)
4422 {
4423 *ptr++ = ' ';
4424 line_len++;
4425
4426 if (add_nl_p && line_len + len2[0] + len2[1] > 70)
4427 {
4428 *ptr++ = '\\';
4429 *ptr++ = '\n';
4430 line_len = 0;
4431 }
4432 }
4433
4434 for (j = 0; j < 2; j++)
4435 if (opts[i][j])
4436 {
4437 memcpy (ptr, opts[i][j], len2[j]);
4438 ptr += len2[j];
4439 line_len += len2[j];
4440 }
4441 }
4442
4443 *ptr = '\0';
4444 gcc_assert (ret + len >= ptr);
4445
4446 return ret;
4447 }
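/* Informal example of the output (the exact set of flags depends on the
   selected ISA bits and is not enumerated here): a call with arch/tune
   set might return something like
   "-march=foo -mtune=foo -m64 -msse2 -mfpmath=sse", with a
   backslash/newline break inserted when ADD_NL_P and a line would exceed
   70 columns.  "foo" is a placeholder, not a real CPU name.  */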
4448
4449 /* Return true if profiling code should be emitted before the
4450 prologue, otherwise false.
4451 Note: for x86 the "hotfix" case is sorried. */
4452 static bool
4453 ix86_profile_before_prologue (void)
4454 {
4455 return flag_fentry != 0;
4456 }
4457
4458 /* Function that is callable from the debugger to print the current
4459 options. */
4460 void ATTRIBUTE_UNUSED
4461 ix86_debug_options (void)
4462 {
4463 char *opts = ix86_target_string (ix86_isa_flags, target_flags,
4464 ix86_target_flags,
4465 ix86_arch_string, ix86_tune_string,
4466 ix86_fpmath, true);
4467
4468 if (opts)
4469 {
4470 fprintf (stderr, "%s\n\n", opts);
4471 free (opts);
4472 }
4473 else
4474 fputs ("<no options>\n\n", stderr);
4475
4476 return;
4477 }
4478
4479 /* Return true if T is one of the bytes we should avoid with
4480 -fmitigate-rop. */
4481
4482 static bool
4483 ix86_rop_should_change_byte_p (int t)
4484 {
4485 return t == 0xc2 || t == 0xc3 || t == 0xca || t == 0xcb;
4486 }
4487
4488 static const char *stringop_alg_names[] = {
4489 #define DEF_ENUM
4490 #define DEF_ALG(alg, name) #name,
4491 #include "stringop.def"
4492 #undef DEF_ENUM
4493 #undef DEF_ALG
4494 };
4495
4496 /* Parse the parameter string passed to -mmemcpy-strategy= or -mmemset-strategy=.
4497 The string is of the following form (or a comma-separated list of such entries):
4498
4499 strategy_alg:max_size:[align|noalign]
4500
4501 where the full size range for the strategy is either [0, max_size] or
4502 [min_size, max_size], in which min_size is the max_size + 1 of the
4503 preceding range. The last size range must have max_size == -1.
4504
4505 Examples:
4506
4507 1.
4508 -mmemcpy-strategy=libcall:-1:noalign
4509
4510 this is equivalent to (for known size memcpy) -mstringop-strategy=libcall
4511
4512
4513 2.
4514 -mmemset-strategy=rep_8byte:16:noalign,vector_loop:2048:align,libcall:-1:noalign
4515
4516 This is to tell the compiler to use the following strategy for memset
4517 1) when the expected size is between [1, 16], use rep_8byte strategy;
4518 2) when the size is between [17, 2048], use vector_loop;
4519 3) when the size is > 2048, use libcall. */
4520
4521 struct stringop_size_range
4522 {
4523 int max;
4524 stringop_alg alg;
4525 bool noalign;
4526 };
4527
4528 static void
4529 ix86_parse_stringop_strategy_string (char *strategy_str, bool is_memset)
4530 {
4531 const struct stringop_algs *default_algs;
4532 stringop_size_range input_ranges[MAX_STRINGOP_ALGS];
4533 char *curr_range_str, *next_range_str;
4534 const char *opt = is_memset ? "-mmemset_strategy=" : "-mmemcpy_strategy=";
4535 int i = 0, n = 0;
4536
4537 if (is_memset)
4538 default_algs = &ix86_cost->memset[TARGET_64BIT != 0];
4539 else
4540 default_algs = &ix86_cost->memcpy[TARGET_64BIT != 0];
4541
4542 curr_range_str = strategy_str;
4543
4544 do
4545 {
4546 int maxs;
4547 char alg_name[128];
4548 char align[16];
4549 next_range_str = strchr (curr_range_str, ',');
4550 if (next_range_str)
4551 *next_range_str++ = '\0';
4552
4553 if (3 != sscanf (curr_range_str, "%20[^:]:%d:%10s",
4554 alg_name, &maxs, align))
4555 {
4556 error ("wrong argument %qs to option %qs", curr_range_str, opt);
4557 return;
4558 }
4559
4560 if (n > 0 && (maxs < (input_ranges[n - 1].max + 1) && maxs != -1))
4561 {
4562 error ("size ranges of option %qs should be increasing", opt);
4563 return;
4564 }
4565
4566 for (i = 0; i < last_alg; i++)
4567 if (!strcmp (alg_name, stringop_alg_names[i]))
4568 break;
4569
4570 if (i == last_alg)
4571 {
4572 error ("wrong strategy name %qs specified for option %qs",
4573 alg_name, opt);
4574
4575 auto_vec <const char *> candidates;
4576 for (i = 0; i < last_alg; i++)
4577 if ((stringop_alg) i != rep_prefix_8_byte || TARGET_64BIT)
4578 candidates.safe_push (stringop_alg_names[i]);
4579
4580 char *s;
4581 const char *hint
4582 = candidates_list_and_hint (alg_name, s, candidates);
4583 if (hint)
4584 inform (input_location,
4585 "valid arguments to %qs are: %s; did you mean %qs?",
4586 opt, s, hint);
4587 else
4588 inform (input_location, "valid arguments to %qs are: %s",
4589 opt, s);
4590 XDELETEVEC (s);
4591 return;
4592 }
4593
4594 if ((stringop_alg) i == rep_prefix_8_byte
4595 && !TARGET_64BIT)
4596 {
4597 /* rep; movq isn't available in 32-bit code. */
4598 error ("strategy name %qs specified for option %qs "
4599 "not supported for 32-bit code", alg_name, opt);
4600 return;
4601 }
4602
4603 input_ranges[n].max = maxs;
4604 input_ranges[n].alg = (stringop_alg) i;
4605 if (!strcmp (align, "align"))
4606 input_ranges[n].noalign = false;
4607 else if (!strcmp (align, "noalign"))
4608 input_ranges[n].noalign = true;
4609 else
4610 {
4611 error ("unknown alignment %qs specified for option %qs", align, opt);
4612 return;
4613 }
4614 n++;
4615 curr_range_str = next_range_str;
4616 }
4617 while (curr_range_str);
4618
4619 if (input_ranges[n - 1].max != -1)
4620 {
4621 error ("the max value for the last size range should be -1"
4622 " for option %qs", opt);
4623 return;
4624 }
4625
4626 if (n > MAX_STRINGOP_ALGS)
4627 {
4628 error ("too many size ranges specified in option %qs", opt);
4629 return;
4630 }
4631
4632 /* Now override the default algs array. */
4633 for (i = 0; i < n; i++)
4634 {
4635 *const_cast<int *>(&default_algs->size[i].max) = input_ranges[i].max;
4636 *const_cast<stringop_alg *>(&default_algs->size[i].alg)
4637 = input_ranges[i].alg;
4638 *const_cast<int *>(&default_algs->size[i].noalign)
4639 = input_ranges[i].noalign;
4640 }
4641 }
4642
4643 \f
4644 /* Parse the -mtune-ctrl= option. When DUMP is true,
4645 print the features that are explicitly set. */
4646
4647 static void
4648 parse_mtune_ctrl_str (bool dump)
4649 {
4650 if (!ix86_tune_ctrl_string)
4651 return;
4652
4653 char *next_feature_string = NULL;
4654 char *curr_feature_string = xstrdup (ix86_tune_ctrl_string);
4655 char *orig = curr_feature_string;
4656 int i;
4657 do
4658 {
4659 bool clear = false;
4660
4661 next_feature_string = strchr (curr_feature_string, ',');
4662 if (next_feature_string)
4663 *next_feature_string++ = '\0';
4664 if (*curr_feature_string == '^')
4665 {
4666 curr_feature_string++;
4667 clear = true;
4668 }
4669 for (i = 0; i < X86_TUNE_LAST; i++)
4670 {
4671 if (!strcmp (curr_feature_string, ix86_tune_feature_names[i]))
4672 {
4673 ix86_tune_features[i] = !clear;
4674 if (dump)
4675 fprintf (stderr, "Explicitly %s feature %s\n",
4676 clear ? "clear" : "set", ix86_tune_feature_names[i]);
4677 break;
4678 }
4679 }
4680 if (i == X86_TUNE_LAST)
4681 error ("Unknown parameter to option -mtune-ctrl: %s",
4682 clear ? curr_feature_string - 1 : curr_feature_string);
4683 curr_feature_string = next_feature_string;
4684 }
4685 while (curr_feature_string);
4686 free (orig);
4687 }
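/* Informal usage sketch (FEATURE1/FEATURE2 are placeholders, not real
   feature names): -mtune-ctrl=FEATURE1,^FEATURE2 sets FEATURE1 and clears
   FEATURE2, where the valid names are those in ix86_tune_feature_names.  */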
4688
4689 /* Helper function to set ix86_tune_features. IX86_TUNE is the
4690 processor type. */
4691
4692 static void
4693 set_ix86_tune_features (enum processor_type ix86_tune, bool dump)
4694 {
4695 unsigned int ix86_tune_mask = 1u << ix86_tune;
4696 int i;
4697
4698 for (i = 0; i < X86_TUNE_LAST; ++i)
4699 {
4700 if (ix86_tune_no_default)
4701 ix86_tune_features[i] = 0;
4702 else
4703 ix86_tune_features[i] = !!(initial_ix86_tune_features[i] & ix86_tune_mask);
4704 }
4705
4706 if (dump)
4707 {
4708 fprintf (stderr, "List of x86 specific tuning parameter names:\n");
4709 for (i = 0; i < X86_TUNE_LAST; i++)
4710 fprintf (stderr, "%s : %s\n", ix86_tune_feature_names[i],
4711 ix86_tune_features[i] ? "on" : "off");
4712 }
4713
4714 parse_mtune_ctrl_str (dump);
4715 }
4716
4717
4718 /* Default align_* from the processor table. */
4719
4720 static void
4721 ix86_default_align (struct gcc_options *opts)
4722 {
4723 if (opts->x_align_loops == 0)
4724 {
4725 opts->x_align_loops = processor_target_table[ix86_tune].align_loop;
4726 align_loops_max_skip = processor_target_table[ix86_tune].align_loop_max_skip;
4727 }
4728 if (opts->x_align_jumps == 0)
4729 {
4730 opts->x_align_jumps = processor_target_table[ix86_tune].align_jump;
4731 align_jumps_max_skip = processor_target_table[ix86_tune].align_jump_max_skip;
4732 }
4733 if (opts->x_align_functions == 0)
4734 {
4735 opts->x_align_functions = processor_target_table[ix86_tune].align_func;
4736 }
4737 }
4738
4739 /* Implement TARGET_OVERRIDE_OPTIONS_AFTER_CHANGE hook. */
4740
4741 static void
4742 ix86_override_options_after_change (void)
4743 {
4744 ix86_default_align (&global_options);
4745 }
4746
4747 /* Override various settings based on options. If MAIN_ARGS_P, the
4748 options are from the command line, otherwise they are from
4749 attributes. Return true if there's an error related to march
4750 option. */
4751
4752 static bool
4753 ix86_option_override_internal (bool main_args_p,
4754 struct gcc_options *opts,
4755 struct gcc_options *opts_set)
4756 {
4757 int i;
4758 unsigned int ix86_arch_mask;
4759 const bool ix86_tune_specified = (opts->x_ix86_tune_string != NULL);
4760
4761 #define PTA_3DNOW (HOST_WIDE_INT_1 << 0)
4762 #define PTA_3DNOW_A (HOST_WIDE_INT_1 << 1)
4763 #define PTA_64BIT (HOST_WIDE_INT_1 << 2)
4764 #define PTA_ABM (HOST_WIDE_INT_1 << 3)
4765 #define PTA_AES (HOST_WIDE_INT_1 << 4)
4766 #define PTA_AVX (HOST_WIDE_INT_1 << 5)
4767 #define PTA_BMI (HOST_WIDE_INT_1 << 6)
4768 #define PTA_CX16 (HOST_WIDE_INT_1 << 7)
4769 #define PTA_F16C (HOST_WIDE_INT_1 << 8)
4770 #define PTA_FMA (HOST_WIDE_INT_1 << 9)
4771 #define PTA_FMA4 (HOST_WIDE_INT_1 << 10)
4772 #define PTA_FSGSBASE (HOST_WIDE_INT_1 << 11)
4773 #define PTA_LWP (HOST_WIDE_INT_1 << 12)
4774 #define PTA_LZCNT (HOST_WIDE_INT_1 << 13)
4775 #define PTA_MMX (HOST_WIDE_INT_1 << 14)
4776 #define PTA_MOVBE (HOST_WIDE_INT_1 << 15)
4777 #define PTA_NO_SAHF (HOST_WIDE_INT_1 << 16)
4778 #define PTA_PCLMUL (HOST_WIDE_INT_1 << 17)
4779 #define PTA_POPCNT (HOST_WIDE_INT_1 << 18)
4780 #define PTA_PREFETCH_SSE (HOST_WIDE_INT_1 << 19)
4781 #define PTA_RDRND (HOST_WIDE_INT_1 << 20)
4782 #define PTA_SSE (HOST_WIDE_INT_1 << 21)
4783 #define PTA_SSE2 (HOST_WIDE_INT_1 << 22)
4784 #define PTA_SSE3 (HOST_WIDE_INT_1 << 23)
4785 #define PTA_SSE4_1 (HOST_WIDE_INT_1 << 24)
4786 #define PTA_SSE4_2 (HOST_WIDE_INT_1 << 25)
4787 #define PTA_SSE4A (HOST_WIDE_INT_1 << 26)
4788 #define PTA_SSSE3 (HOST_WIDE_INT_1 << 27)
4789 #define PTA_TBM (HOST_WIDE_INT_1 << 28)
4790 #define PTA_XOP (HOST_WIDE_INT_1 << 29)
4791 #define PTA_AVX2 (HOST_WIDE_INT_1 << 30)
4792 #define PTA_BMI2 (HOST_WIDE_INT_1 << 31)
4793 #define PTA_RTM (HOST_WIDE_INT_1 << 32)
4794 #define PTA_HLE (HOST_WIDE_INT_1 << 33)
4795 #define PTA_PRFCHW (HOST_WIDE_INT_1 << 34)
4796 #define PTA_RDSEED (HOST_WIDE_INT_1 << 35)
4797 #define PTA_ADX (HOST_WIDE_INT_1 << 36)
4798 #define PTA_FXSR (HOST_WIDE_INT_1 << 37)
4799 #define PTA_XSAVE (HOST_WIDE_INT_1 << 38)
4800 #define PTA_XSAVEOPT (HOST_WIDE_INT_1 << 39)
4801 #define PTA_AVX512F (HOST_WIDE_INT_1 << 40)
4802 #define PTA_AVX512ER (HOST_WIDE_INT_1 << 41)
4803 #define PTA_AVX512PF (HOST_WIDE_INT_1 << 42)
4804 #define PTA_AVX512CD (HOST_WIDE_INT_1 << 43)
4805 #define PTA_MPX (HOST_WIDE_INT_1 << 44)
4806 #define PTA_SHA (HOST_WIDE_INT_1 << 45)
4807 #define PTA_PREFETCHWT1 (HOST_WIDE_INT_1 << 46)
4808 #define PTA_CLFLUSHOPT (HOST_WIDE_INT_1 << 47)
4809 #define PTA_XSAVEC (HOST_WIDE_INT_1 << 48)
4810 #define PTA_XSAVES (HOST_WIDE_INT_1 << 49)
4811 #define PTA_AVX512DQ (HOST_WIDE_INT_1 << 50)
4812 #define PTA_AVX512BW (HOST_WIDE_INT_1 << 51)
4813 #define PTA_AVX512VL (HOST_WIDE_INT_1 << 52)
4814 #define PTA_AVX512IFMA (HOST_WIDE_INT_1 << 53)
4815 #define PTA_AVX512VBMI (HOST_WIDE_INT_1 << 54)
4816 #define PTA_CLWB (HOST_WIDE_INT_1 << 55)
4817 #define PTA_MWAITX (HOST_WIDE_INT_1 << 56)
4818 #define PTA_CLZERO (HOST_WIDE_INT_1 << 57)
4819 #define PTA_NO_80387 (HOST_WIDE_INT_1 << 58)
4820 #define PTA_PKU (HOST_WIDE_INT_1 << 59)
4821
4822 #define PTA_CORE2 \
4823 (PTA_64BIT | PTA_MMX | PTA_SSE | PTA_SSE2 | PTA_SSE3 | PTA_SSSE3 \
4824 | PTA_CX16 | PTA_FXSR)
4825 #define PTA_NEHALEM \
4826 (PTA_CORE2 | PTA_SSE4_1 | PTA_SSE4_2 | PTA_POPCNT)
4827 #define PTA_WESTMERE \
4828 (PTA_NEHALEM | PTA_AES | PTA_PCLMUL)
4829 #define PTA_SANDYBRIDGE \
4830 (PTA_WESTMERE | PTA_AVX | PTA_XSAVE | PTA_XSAVEOPT)
4831 #define PTA_IVYBRIDGE \
4832 (PTA_SANDYBRIDGE | PTA_FSGSBASE | PTA_RDRND | PTA_F16C)
4833 #define PTA_HASWELL \
4834 (PTA_IVYBRIDGE | PTA_AVX2 | PTA_BMI | PTA_BMI2 | PTA_LZCNT \
4835 | PTA_FMA | PTA_MOVBE | PTA_HLE)
4836 #define PTA_BROADWELL \
4837 (PTA_HASWELL | PTA_ADX | PTA_PRFCHW | PTA_RDSEED)
4838 #define PTA_SKYLAKE \
4839 (PTA_BROADWELL | PTA_CLFLUSHOPT | PTA_XSAVEC | PTA_XSAVES)
4840 #define PTA_SKYLAKE_AVX512 \
4841 (PTA_SKYLAKE | PTA_AVX512F | PTA_AVX512CD | PTA_AVX512VL \
4842 | PTA_AVX512BW | PTA_AVX512DQ | PTA_PKU)
4843 #define PTA_KNL \
4844 (PTA_BROADWELL | PTA_AVX512PF | PTA_AVX512ER | PTA_AVX512F | PTA_AVX512CD)
4845 #define PTA_BONNELL \
4846 (PTA_CORE2 | PTA_MOVBE)
4847 #define PTA_SILVERMONT \
4848 (PTA_WESTMERE | PTA_MOVBE)
4849
4850 /* If this reaches 64, we need to widen the struct pta flags below. */
4851
4852 static struct pta
4853 {
4854 const char *const name; /* processor name or nickname. */
4855 const enum processor_type processor;
4856 const enum attr_cpu schedule;
4857 const unsigned HOST_WIDE_INT flags;
4858 }
4859 const processor_alias_table[] =
4860 {
4861 {"i386", PROCESSOR_I386, CPU_NONE, 0},
4862 {"i486", PROCESSOR_I486, CPU_NONE, 0},
4863 {"i586", PROCESSOR_PENTIUM, CPU_PENTIUM, 0},
4864 {"pentium", PROCESSOR_PENTIUM, CPU_PENTIUM, 0},
4865 {"lakemont", PROCESSOR_LAKEMONT, CPU_PENTIUM, PTA_NO_80387},
4866 {"pentium-mmx", PROCESSOR_PENTIUM, CPU_PENTIUM, PTA_MMX},
4867 {"winchip-c6", PROCESSOR_I486, CPU_NONE, PTA_MMX},
4868 {"winchip2", PROCESSOR_I486, CPU_NONE, PTA_MMX | PTA_3DNOW},
4869 {"c3", PROCESSOR_I486, CPU_NONE, PTA_MMX | PTA_3DNOW},
4870 {"samuel-2", PROCESSOR_I486, CPU_NONE, PTA_MMX | PTA_3DNOW},
4871 {"c3-2", PROCESSOR_PENTIUMPRO, CPU_PENTIUMPRO,
4872 PTA_MMX | PTA_SSE | PTA_FXSR},
4873 {"nehemiah", PROCESSOR_PENTIUMPRO, CPU_PENTIUMPRO,
4874 PTA_MMX | PTA_SSE | PTA_FXSR},
4875 {"c7", PROCESSOR_PENTIUMPRO, CPU_PENTIUMPRO,
4876 PTA_MMX | PTA_SSE | PTA_SSE2 | PTA_SSE3 | PTA_FXSR},
4877 {"esther", PROCESSOR_PENTIUMPRO, CPU_PENTIUMPRO,
4878 PTA_MMX | PTA_SSE | PTA_SSE2 | PTA_SSE3 | PTA_FXSR},
4879 {"i686", PROCESSOR_PENTIUMPRO, CPU_PENTIUMPRO, 0},
4880 {"pentiumpro", PROCESSOR_PENTIUMPRO, CPU_PENTIUMPRO, 0},
4881 {"pentium2", PROCESSOR_PENTIUMPRO, CPU_PENTIUMPRO, PTA_MMX | PTA_FXSR},
4882 {"pentium3", PROCESSOR_PENTIUMPRO, CPU_PENTIUMPRO,
4883 PTA_MMX | PTA_SSE | PTA_FXSR},
4884 {"pentium3m", PROCESSOR_PENTIUMPRO, CPU_PENTIUMPRO,
4885 PTA_MMX | PTA_SSE | PTA_FXSR},
4886 {"pentium-m", PROCESSOR_PENTIUMPRO, CPU_PENTIUMPRO,
4887 PTA_MMX | PTA_SSE | PTA_SSE2 | PTA_FXSR},
4888 {"pentium4", PROCESSOR_PENTIUM4, CPU_NONE,
4889 PTA_MMX |PTA_SSE | PTA_SSE2 | PTA_FXSR},
4890 {"pentium4m", PROCESSOR_PENTIUM4, CPU_NONE,
4891 PTA_MMX | PTA_SSE | PTA_SSE2 | PTA_FXSR},
4892 {"prescott", PROCESSOR_NOCONA, CPU_NONE,
4893 PTA_MMX | PTA_SSE | PTA_SSE2 | PTA_SSE3 | PTA_FXSR},
4894 {"nocona", PROCESSOR_NOCONA, CPU_NONE,
4895 PTA_64BIT | PTA_MMX | PTA_SSE | PTA_SSE2 | PTA_SSE3
4896 | PTA_CX16 | PTA_NO_SAHF | PTA_FXSR},
4897 {"core2", PROCESSOR_CORE2, CPU_CORE2, PTA_CORE2},
4898 {"nehalem", PROCESSOR_NEHALEM, CPU_NEHALEM, PTA_NEHALEM},
4899 {"corei7", PROCESSOR_NEHALEM, CPU_NEHALEM, PTA_NEHALEM},
4900 {"westmere", PROCESSOR_NEHALEM, CPU_NEHALEM, PTA_WESTMERE},
4901 {"sandybridge", PROCESSOR_SANDYBRIDGE, CPU_NEHALEM,
4902 PTA_SANDYBRIDGE},
4903 {"corei7-avx", PROCESSOR_SANDYBRIDGE, CPU_NEHALEM,
4904 PTA_SANDYBRIDGE},
4905 {"ivybridge", PROCESSOR_SANDYBRIDGE, CPU_NEHALEM,
4906 PTA_IVYBRIDGE},
4907 {"core-avx-i", PROCESSOR_SANDYBRIDGE, CPU_NEHALEM,
4908 PTA_IVYBRIDGE},
4909 {"haswell", PROCESSOR_HASWELL, CPU_HASWELL, PTA_HASWELL},
4910 {"core-avx2", PROCESSOR_HASWELL, CPU_HASWELL, PTA_HASWELL},
4911 {"broadwell", PROCESSOR_HASWELL, CPU_HASWELL, PTA_BROADWELL},
4912 {"skylake", PROCESSOR_HASWELL, CPU_HASWELL, PTA_SKYLAKE},
4913 {"skylake-avx512", PROCESSOR_HASWELL, CPU_HASWELL, PTA_SKYLAKE_AVX512},
4914 {"bonnell", PROCESSOR_BONNELL, CPU_ATOM, PTA_BONNELL},
4915 {"atom", PROCESSOR_BONNELL, CPU_ATOM, PTA_BONNELL},
4916 {"silvermont", PROCESSOR_SILVERMONT, CPU_SLM, PTA_SILVERMONT},
4917 {"slm", PROCESSOR_SILVERMONT, CPU_SLM, PTA_SILVERMONT},
4918 {"knl", PROCESSOR_KNL, CPU_SLM, PTA_KNL},
4919 {"intel", PROCESSOR_INTEL, CPU_SLM, PTA_NEHALEM},
4920 {"geode", PROCESSOR_GEODE, CPU_GEODE,
4921 PTA_MMX | PTA_3DNOW | PTA_3DNOW_A | PTA_PREFETCH_SSE},
4922 {"k6", PROCESSOR_K6, CPU_K6, PTA_MMX},
4923 {"k6-2", PROCESSOR_K6, CPU_K6, PTA_MMX | PTA_3DNOW},
4924 {"k6-3", PROCESSOR_K6, CPU_K6, PTA_MMX | PTA_3DNOW},
4925 {"athlon", PROCESSOR_ATHLON, CPU_ATHLON,
4926 PTA_MMX | PTA_3DNOW | PTA_3DNOW_A | PTA_PREFETCH_SSE},
4927 {"athlon-tbird", PROCESSOR_ATHLON, CPU_ATHLON,
4928 PTA_MMX | PTA_3DNOW | PTA_3DNOW_A | PTA_PREFETCH_SSE},
4929 {"athlon-4", PROCESSOR_ATHLON, CPU_ATHLON,
4930 PTA_MMX | PTA_3DNOW | PTA_3DNOW_A | PTA_SSE | PTA_FXSR},
4931 {"athlon-xp", PROCESSOR_ATHLON, CPU_ATHLON,
4932 PTA_MMX | PTA_3DNOW | PTA_3DNOW_A | PTA_SSE | PTA_FXSR},
4933 {"athlon-mp", PROCESSOR_ATHLON, CPU_ATHLON,
4934 PTA_MMX | PTA_3DNOW | PTA_3DNOW_A | PTA_SSE | PTA_FXSR},
4935 {"x86-64", PROCESSOR_K8, CPU_K8,
4936 PTA_64BIT | PTA_MMX | PTA_SSE | PTA_SSE2 | PTA_NO_SAHF | PTA_FXSR},
4937 {"eden-x2", PROCESSOR_K8, CPU_K8,
4938 PTA_64BIT | PTA_MMX | PTA_SSE | PTA_SSE2 | PTA_SSE3 | PTA_FXSR},
4939 {"nano", PROCESSOR_K8, CPU_K8,
4940 PTA_64BIT | PTA_MMX | PTA_SSE | PTA_SSE2 | PTA_SSE3
4941 | PTA_SSSE3 | PTA_FXSR},
4942 {"nano-1000", PROCESSOR_K8, CPU_K8,
4943 PTA_64BIT | PTA_MMX | PTA_SSE | PTA_SSE2 | PTA_SSE3
4944 | PTA_SSSE3 | PTA_FXSR},
4945 {"nano-2000", PROCESSOR_K8, CPU_K8,
4946 PTA_64BIT | PTA_MMX | PTA_SSE | PTA_SSE2 | PTA_SSE3
4947 | PTA_SSSE3 | PTA_FXSR},
4948 {"nano-3000", PROCESSOR_K8, CPU_K8,
4949 PTA_64BIT | PTA_MMX | PTA_SSE | PTA_SSE2 | PTA_SSE3
4950 | PTA_SSSE3 | PTA_SSE4_1 | PTA_FXSR},
4951 {"nano-x2", PROCESSOR_K8, CPU_K8,
4952 PTA_64BIT | PTA_MMX | PTA_SSE | PTA_SSE2 | PTA_SSE3
4953 | PTA_SSSE3 | PTA_SSE4_1 | PTA_FXSR},
4954 {"eden-x4", PROCESSOR_K8, CPU_K8,
4955 PTA_64BIT | PTA_MMX | PTA_SSE | PTA_SSE2 | PTA_SSE3
4956 | PTA_SSSE3 | PTA_SSE4_1 | PTA_FXSR},
4957 {"nano-x4", PROCESSOR_K8, CPU_K8,
4958 PTA_64BIT | PTA_MMX | PTA_SSE | PTA_SSE2 | PTA_SSE3
4959 | PTA_SSSE3 | PTA_SSE4_1 | PTA_FXSR},
4960 {"k8", PROCESSOR_K8, CPU_K8,
4961 PTA_64BIT | PTA_MMX | PTA_3DNOW | PTA_3DNOW_A | PTA_SSE
4962 | PTA_SSE2 | PTA_NO_SAHF | PTA_FXSR},
4963 {"k8-sse3", PROCESSOR_K8, CPU_K8,
4964 PTA_64BIT | PTA_MMX | PTA_3DNOW | PTA_3DNOW_A | PTA_SSE
4965 | PTA_SSE2 | PTA_SSE3 | PTA_NO_SAHF | PTA_FXSR},
4966 {"opteron", PROCESSOR_K8, CPU_K8,
4967 PTA_64BIT | PTA_MMX | PTA_3DNOW | PTA_3DNOW_A | PTA_SSE
4968 | PTA_SSE2 | PTA_NO_SAHF | PTA_FXSR},
4969 {"opteron-sse3", PROCESSOR_K8, CPU_K8,
4970 PTA_64BIT | PTA_MMX | PTA_3DNOW | PTA_3DNOW_A | PTA_SSE
4971 | PTA_SSE2 | PTA_SSE3 | PTA_NO_SAHF | PTA_FXSR},
4972 {"athlon64", PROCESSOR_K8, CPU_K8,
4973 PTA_64BIT | PTA_MMX | PTA_3DNOW | PTA_3DNOW_A | PTA_SSE
4974 | PTA_SSE2 | PTA_NO_SAHF | PTA_FXSR},
4975 {"athlon64-sse3", PROCESSOR_K8, CPU_K8,
4976 PTA_64BIT | PTA_MMX | PTA_3DNOW | PTA_3DNOW_A | PTA_SSE
4977 | PTA_SSE2 | PTA_SSE3 | PTA_NO_SAHF | PTA_FXSR},
4978 {"athlon-fx", PROCESSOR_K8, CPU_K8,
4979 PTA_64BIT | PTA_MMX | PTA_3DNOW | PTA_3DNOW_A | PTA_SSE
4980 | PTA_SSE2 | PTA_NO_SAHF | PTA_FXSR},
4981 {"amdfam10", PROCESSOR_AMDFAM10, CPU_AMDFAM10,
4982 PTA_64BIT | PTA_MMX | PTA_3DNOW | PTA_3DNOW_A | PTA_SSE | PTA_SSE2
4983 | PTA_SSE3 | PTA_SSE4A | PTA_CX16 | PTA_ABM | PTA_PRFCHW | PTA_FXSR},
4984 {"barcelona", PROCESSOR_AMDFAM10, CPU_AMDFAM10,
4985 PTA_64BIT | PTA_MMX | PTA_3DNOW | PTA_3DNOW_A | PTA_SSE | PTA_SSE2
4986 | PTA_SSE3 | PTA_SSE4A | PTA_CX16 | PTA_ABM | PTA_PRFCHW | PTA_FXSR},
4987 {"bdver1", PROCESSOR_BDVER1, CPU_BDVER1,
4988 PTA_64BIT | PTA_MMX | PTA_SSE | PTA_SSE2 | PTA_SSE3
4989 | PTA_SSE4A | PTA_CX16 | PTA_ABM | PTA_SSSE3 | PTA_SSE4_1
4990 | PTA_SSE4_2 | PTA_AES | PTA_PCLMUL | PTA_AVX | PTA_FMA4
4991 | PTA_XOP | PTA_LWP | PTA_PRFCHW | PTA_FXSR | PTA_XSAVE},
4992 {"bdver2", PROCESSOR_BDVER2, CPU_BDVER2,
4993 PTA_64BIT | PTA_MMX | PTA_SSE | PTA_SSE2 | PTA_SSE3
4994 | PTA_SSE4A | PTA_CX16 | PTA_ABM | PTA_SSSE3 | PTA_SSE4_1
4995 | PTA_SSE4_2 | PTA_AES | PTA_PCLMUL | PTA_AVX | PTA_FMA4
4996 | PTA_XOP | PTA_LWP | PTA_BMI | PTA_TBM | PTA_F16C
4997 | PTA_FMA | PTA_PRFCHW | PTA_FXSR | PTA_XSAVE},
4998 {"bdver3", PROCESSOR_BDVER3, CPU_BDVER3,
4999 PTA_64BIT | PTA_MMX | PTA_SSE | PTA_SSE2 | PTA_SSE3
5000 | PTA_SSE4A | PTA_CX16 | PTA_ABM | PTA_SSSE3 | PTA_SSE4_1
5001 | PTA_SSE4_2 | PTA_AES | PTA_PCLMUL | PTA_AVX | PTA_FMA4
5002 | PTA_XOP | PTA_LWP | PTA_BMI | PTA_TBM | PTA_F16C
5003 | PTA_FMA | PTA_PRFCHW | PTA_FXSR | PTA_XSAVE
5004 | PTA_XSAVEOPT | PTA_FSGSBASE},
5005 {"bdver4", PROCESSOR_BDVER4, CPU_BDVER4,
5006 PTA_64BIT | PTA_MMX | PTA_SSE | PTA_SSE2 | PTA_SSE3
5007 | PTA_SSE4A | PTA_CX16 | PTA_ABM | PTA_SSSE3 | PTA_SSE4_1
5008 | PTA_SSE4_2 | PTA_AES | PTA_PCLMUL | PTA_AVX | PTA_AVX2
5009 | PTA_FMA4 | PTA_XOP | PTA_LWP | PTA_BMI | PTA_BMI2
5010 | PTA_TBM | PTA_F16C | PTA_FMA | PTA_PRFCHW | PTA_FXSR
5011 | PTA_XSAVE | PTA_XSAVEOPT | PTA_FSGSBASE | PTA_RDRND
5012 | PTA_MOVBE | PTA_MWAITX},
5013 {"znver1", PROCESSOR_ZNVER1, CPU_ZNVER1,
5014 PTA_64BIT | PTA_MMX | PTA_SSE | PTA_SSE2 | PTA_SSE3
5015 | PTA_SSE4A | PTA_CX16 | PTA_ABM | PTA_SSSE3 | PTA_SSE4_1
5016 | PTA_SSE4_2 | PTA_AES | PTA_PCLMUL | PTA_AVX | PTA_AVX2
5017 | PTA_BMI | PTA_BMI2 | PTA_F16C | PTA_FMA | PTA_PRFCHW
5018 | PTA_FXSR | PTA_XSAVE | PTA_XSAVEOPT | PTA_FSGSBASE
5019 | PTA_RDRND | PTA_MOVBE | PTA_MWAITX | PTA_ADX | PTA_RDSEED
5020 | PTA_CLZERO | PTA_CLFLUSHOPT | PTA_XSAVEC | PTA_XSAVES
5021 | PTA_SHA | PTA_LZCNT | PTA_POPCNT},
5022 {"btver1", PROCESSOR_BTVER1, CPU_GENERIC,
5023 PTA_64BIT | PTA_MMX | PTA_SSE | PTA_SSE2 | PTA_SSE3
5024 | PTA_SSSE3 | PTA_SSE4A |PTA_ABM | PTA_CX16 | PTA_PRFCHW
5025 | PTA_FXSR | PTA_XSAVE},
5026 {"btver2", PROCESSOR_BTVER2, CPU_BTVER2,
5027 PTA_64BIT | PTA_MMX | PTA_SSE | PTA_SSE2 | PTA_SSE3
5028 | PTA_SSSE3 | PTA_SSE4A |PTA_ABM | PTA_CX16 | PTA_SSE4_1
5029 | PTA_SSE4_2 | PTA_AES | PTA_PCLMUL | PTA_AVX
5030 | PTA_BMI | PTA_F16C | PTA_MOVBE | PTA_PRFCHW
5031 | PTA_FXSR | PTA_XSAVE | PTA_XSAVEOPT},
5032
5033 {"generic", PROCESSOR_GENERIC, CPU_GENERIC,
5034 PTA_64BIT
5035 | PTA_HLE /* flags are only used for -march switch. */ },
5036 };
5037
5038 /* -mrecip options. */
5039 static struct
5040 {
5041 const char *string; /* option name */
5042 unsigned int mask; /* mask bits to set */
5043 }
5044 const recip_options[] =
5045 {
5046 { "all", RECIP_MASK_ALL },
5047 { "none", RECIP_MASK_NONE },
5048 { "div", RECIP_MASK_DIV },
5049 { "sqrt", RECIP_MASK_SQRT },
5050 { "vec-div", RECIP_MASK_VEC_DIV },
5051 { "vec-sqrt", RECIP_MASK_VEC_SQRT },
5052 };
5053
5054 int const pta_size = ARRAY_SIZE (processor_alias_table);
5055
5056 /* Turn off both OPTION_MASK_ABI_64 and OPTION_MASK_ABI_X32 if
5057 TARGET_64BIT_DEFAULT is true and TARGET_64BIT is false. */
5058 if (TARGET_64BIT_DEFAULT && !TARGET_64BIT_P (opts->x_ix86_isa_flags))
5059 opts->x_ix86_isa_flags &= ~(OPTION_MASK_ABI_64 | OPTION_MASK_ABI_X32);
5060 #ifdef TARGET_BI_ARCH
5061 else
5062 {
5063 #if TARGET_BI_ARCH == 1
5064 /* When TARGET_BI_ARCH == 1, by default, OPTION_MASK_ABI_64
5065 is on and OPTION_MASK_ABI_X32 is off. We turn off
5066 OPTION_MASK_ABI_64 if OPTION_MASK_ABI_X32 is turned on by
5067 -mx32. */
5068 if (TARGET_X32_P (opts->x_ix86_isa_flags))
5069 opts->x_ix86_isa_flags &= ~OPTION_MASK_ABI_64;
5070 #else
5071 /* When TARGET_BI_ARCH == 2, by default, OPTION_MASK_ABI_X32 is
5072 on and OPTION_MASK_ABI_64 is off. We turn off
5073 OPTION_MASK_ABI_X32 if OPTION_MASK_ABI_64 is turned on by
5074 -m64 or OPTION_MASK_CODE16 is turned on by -m16. */
5075 if (TARGET_LP64_P (opts->x_ix86_isa_flags)
5076 || TARGET_16BIT_P (opts->x_ix86_isa_flags))
5077 opts->x_ix86_isa_flags &= ~OPTION_MASK_ABI_X32;
5078 #endif
5079 if (TARGET_64BIT_P (opts->x_ix86_isa_flags)
5080 && TARGET_IAMCU_P (opts->x_target_flags))
5081 sorry ("Intel MCU psABI isn%'t supported in %s mode",
5082 TARGET_X32_P (opts->x_ix86_isa_flags) ? "x32" : "64-bit");
5083 }
5084 #endif
5085
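/* Make the remaining ABI selection flags consistent: x32 and LP64 both
   require the 64-bit ISA, while 16-bit code excludes it.  */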
5086 if (TARGET_X32_P (opts->x_ix86_isa_flags))
5087 {
5088 /* Always turn on OPTION_MASK_ISA_64BIT and turn off
5089 OPTION_MASK_ABI_64 for TARGET_X32. */
5090 opts->x_ix86_isa_flags |= OPTION_MASK_ISA_64BIT;
5091 opts->x_ix86_isa_flags &= ~OPTION_MASK_ABI_64;
5092 }
5093 else if (TARGET_16BIT_P (opts->x_ix86_isa_flags))
5094 opts->x_ix86_isa_flags &= ~(OPTION_MASK_ISA_64BIT
5095 | OPTION_MASK_ABI_X32
5096 | OPTION_MASK_ABI_64);
5097 else if (TARGET_LP64_P (opts->x_ix86_isa_flags))
5098 {
5099 /* Always turn on OPTION_MASK_ISA_64BIT and turn off
5100 OPTION_MASK_ABI_X32 for TARGET_LP64. */
5101 opts->x_ix86_isa_flags |= OPTION_MASK_ISA_64BIT;
5102 opts->x_ix86_isa_flags &= ~OPTION_MASK_ABI_X32;
5103 }
5104
5105 #ifdef SUBTARGET_OVERRIDE_OPTIONS
5106 SUBTARGET_OVERRIDE_OPTIONS;
5107 #endif
5108
5109 #ifdef SUBSUBTARGET_OVERRIDE_OPTIONS
5110 SUBSUBTARGET_OVERRIDE_OPTIONS;
5111 #endif
5112
5113 /* -fPIC is the default for x86_64. */
5114 if (TARGET_MACHO && TARGET_64BIT_P (opts->x_ix86_isa_flags))
5115 opts->x_flag_pic = 2;
5116
5117 /* Need to check -mtune=generic first. */
5118 if (opts->x_ix86_tune_string)
5119 {
5120 /* As special support for cross compilers we read -mtune=native
5121 as -mtune=generic. With native compilers we won't see
5122 -mtune=native, as the driver will already have replaced it. */
5123 if (!strcmp (opts->x_ix86_tune_string, "native"))
5124 {
5125 opts->x_ix86_tune_string = "generic";
5126 }
5127 else if (!strcmp (opts->x_ix86_tune_string, "x86-64"))
5128 warning (OPT_Wdeprecated,
5129 main_args_p
5130 ? "%<-mtune=x86-64%> is deprecated; use %<-mtune=k8%> "
5131 "or %<-mtune=generic%> instead as appropriate"
5132 : "%<target(\"tune=x86-64\")%> is deprecated; use "
5133 "%<target(\"tune=k8\")%> or %<target(\"tune=generic\")%> "
5134 "instead as appropriate");
5135 }
5136 else
5137 {
5138 if (opts->x_ix86_arch_string)
5139 opts->x_ix86_tune_string = opts->x_ix86_arch_string;
5140 if (!opts->x_ix86_tune_string)
5141 {
5142 opts->x_ix86_tune_string
5143 = processor_target_table[TARGET_CPU_DEFAULT].name;
5144 ix86_tune_defaulted = 1;
5145 }
5146
5147 /* opts->x_ix86_tune_string is set to opts->x_ix86_arch_string
5148 or defaulted. We need to use a sensible tune option. */
5149 if (!strcmp (opts->x_ix86_tune_string, "x86-64"))
5150 {
5151 opts->x_ix86_tune_string = "generic";
5152 }
5153 }
5154
5155 if (opts->x_ix86_stringop_alg == rep_prefix_8_byte
5156 && !TARGET_64BIT_P (opts->x_ix86_isa_flags))
5157 {
5158 /* rep; movq isn't available in 32-bit code. */
5159 error ("-mstringop-strategy=rep_8byte not supported for 32-bit code");
5160 opts->x_ix86_stringop_alg = no_stringop;
5161 }
5162
5163 if (!opts->x_ix86_arch_string)
5164 opts->x_ix86_arch_string
5165 = TARGET_64BIT_P (opts->x_ix86_isa_flags)
5166 ? "x86-64" : SUBTARGET32_DEFAULT_CPU;
5167 else
5168 ix86_arch_specified = 1;
5169
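/* Validate an explicitly requested pointer mode (Pmode) against the
   selected ABI; otherwise derive the default from it.  */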
5170 if (opts_set->x_ix86_pmode)
5171 {
5172 if ((TARGET_LP64_P (opts->x_ix86_isa_flags)
5173 && opts->x_ix86_pmode == PMODE_SI)
5174 || (!TARGET_64BIT_P (opts->x_ix86_isa_flags)
5175 && opts->x_ix86_pmode == PMODE_DI))
5176 error ("address mode %qs not supported in the %s bit mode",
5177 TARGET_64BIT_P (opts->x_ix86_isa_flags) ? "short" : "long",
5178 TARGET_64BIT_P (opts->x_ix86_isa_flags) ? "64" : "32");
5179 }
5180 else
5181 opts->x_ix86_pmode = TARGET_LP64_P (opts->x_ix86_isa_flags)
5182 ? PMODE_DI : PMODE_SI;
5183
5184 if (!opts_set->x_ix86_abi)
5185 opts->x_ix86_abi = DEFAULT_ABI;
5186
5187 /* For targets using the MS ABI, enable ms-extensions unless they were
5188 explicitly turned off. For non-MS ABIs we turn this option
5189 off. */
5190 if (!opts_set->x_flag_ms_extensions)
5191 opts->x_flag_ms_extensions = (MS_ABI == DEFAULT_ABI);
5192
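/* Validate an explicit -mcmodel= selection against the target mode,
   or choose a default code model.  */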
5193 if (opts_set->x_ix86_cmodel)
5194 {
5195 switch (opts->x_ix86_cmodel)
5196 {
5197 case CM_SMALL:
5198 case CM_SMALL_PIC:
5199 if (opts->x_flag_pic)
5200 opts->x_ix86_cmodel = CM_SMALL_PIC;
5201 if (!TARGET_64BIT_P (opts->x_ix86_isa_flags))
5202 error ("code model %qs not supported in the %s bit mode",
5203 "small", "32");
5204 break;
5205
5206 case CM_MEDIUM:
5207 case CM_MEDIUM_PIC:
5208 if (opts->x_flag_pic)
5209 opts->x_ix86_cmodel = CM_MEDIUM_PIC;
5210 if (!TARGET_64BIT_P (opts->x_ix86_isa_flags))
5211 error ("code model %qs not supported in the %s bit mode",
5212 "medium", "32");
5213 else if (TARGET_X32_P (opts->x_ix86_isa_flags))
5214 error ("code model %qs not supported in x32 mode",
5215 "medium");
5216 break;
5217
5218 case CM_LARGE:
5219 case CM_LARGE_PIC:
5220 if (opts->x_flag_pic)
5221 opts->x_ix86_cmodel = CM_LARGE_PIC;
5222 if (!TARGET_64BIT_P (opts->x_ix86_isa_flags))
5223 error ("code model %qs not supported in the %s bit mode",
5224 "large", "32");
5225 else if (TARGET_X32_P (opts->x_ix86_isa_flags))
5226 error ("code model %qs not supported in x32 mode",
5227 "large");
5228 break;
5229
5230 case CM_32:
5231 if (opts->x_flag_pic)
5232 error ("code model %s does not support PIC mode", "32");
5233 if (TARGET_64BIT_P (opts->x_ix86_isa_flags))
5234 error ("code model %qs not supported in the %s bit mode",
5235 "32", "64");
5236 break;
5237
5238 case CM_KERNEL:
5239 if (opts->x_flag_pic)
5240 {
5241 error ("code model %s does not support PIC mode", "kernel");
5242 opts->x_ix86_cmodel = CM_32;
5243 }
5244 if (!TARGET_64BIT_P (opts->x_ix86_isa_flags))
5245 error ("code model %qs not supported in the %s bit mode",
5246 "kernel", "32");
5247 break;
5248
5249 default:
5250 gcc_unreachable ();
5251 }
5252 }
5253 else
5254 {
5255 /* For TARGET_64BIT and MS_ABI, force pic on, in order to enable the
5256 use of rip-relative addressing. This eliminates fixups that
5257 would otherwise be needed if this object is to be placed in a
5258 DLL, and is essentially just as efficient as direct addressing. */
5259 if (TARGET_64BIT_P (opts->x_ix86_isa_flags)
5260 && (TARGET_RDOS || TARGET_PECOFF))
5261 opts->x_ix86_cmodel = CM_MEDIUM_PIC, opts->x_flag_pic = 1;
5262 else if (TARGET_64BIT_P (opts->x_ix86_isa_flags))
5263 opts->x_ix86_cmodel = opts->x_flag_pic ? CM_SMALL_PIC : CM_SMALL;
5264 else
5265 opts->x_ix86_cmodel = CM_32;
5266 }
5267 if (TARGET_MACHO && opts->x_ix86_asm_dialect == ASM_INTEL)
5268 {
5269 error ("-masm=intel not supported in this configuration");
5270 opts->x_ix86_asm_dialect = ASM_ATT;
5271 }
5272 if ((TARGET_64BIT_P (opts->x_ix86_isa_flags) != 0)
5273 != ((opts->x_ix86_isa_flags & OPTION_MASK_ISA_64BIT) != 0))
5274 sorry ("%i-bit mode not compiled in",
5275 (opts->x_ix86_isa_flags & OPTION_MASK_ISA_64BIT) ? 64 : 32);
5276
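/* Look up the -march= name in the alias table and enable every ISA feature
   it implies, unless that feature was explicitly set or cleared on the
   command line.  */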
5277 for (i = 0; i < pta_size; i++)
5278 if (! strcmp (opts->x_ix86_arch_string, processor_alias_table[i].name))
5279 {
5280 if (!strcmp (opts->x_ix86_arch_string, "generic"))
5281 {
5282 error (main_args_p
5283 ? "%<generic%> CPU can be used only for %<-mtune=%> switch"
5284 : "%<generic%> CPU can be used only for "
5285 "%<target(\"tune=\")%> attribute");
5286 return false;
5287 }
5288 else if (!strcmp (opts->x_ix86_arch_string, "intel"))
5289 {
5290 error (main_args_p
5291 ? "%<intel%> CPU can be used only for %<-mtune=%> switch"
5292 : "%<intel%> CPU can be used only for "
5293 "%<target(\"tune=\")%> attribute");
5294 return false;
5295 }
5296
5297 if (TARGET_64BIT_P (opts->x_ix86_isa_flags)
5298 && !(processor_alias_table[i].flags & PTA_64BIT))
5299 {
5300 error ("CPU you selected does not support x86-64 "
5301 "instruction set");
5302 return false;
5303 }
5304
5305 ix86_schedule = processor_alias_table[i].schedule;
5306 ix86_arch = processor_alias_table[i].processor;
5307 /* Default cpu tuning to the architecture. */
5308 ix86_tune = ix86_arch;
5309
5310 if (processor_alias_table[i].flags & PTA_MMX
5311 && !(opts->x_ix86_isa_flags_explicit & OPTION_MASK_ISA_MMX))
5312 opts->x_ix86_isa_flags |= OPTION_MASK_ISA_MMX;
5313 if (processor_alias_table[i].flags & PTA_3DNOW
5314 && !(opts->x_ix86_isa_flags_explicit & OPTION_MASK_ISA_3DNOW))
5315 opts->x_ix86_isa_flags |= OPTION_MASK_ISA_3DNOW;
5316 if (processor_alias_table[i].flags & PTA_3DNOW_A
5317 && !(opts->x_ix86_isa_flags_explicit & OPTION_MASK_ISA_3DNOW_A))
5318 opts->x_ix86_isa_flags |= OPTION_MASK_ISA_3DNOW_A;
5319 if (processor_alias_table[i].flags & PTA_SSE
5320 && !(opts->x_ix86_isa_flags_explicit & OPTION_MASK_ISA_SSE))
5321 opts->x_ix86_isa_flags |= OPTION_MASK_ISA_SSE;
5322 if (processor_alias_table[i].flags & PTA_SSE2
5323 && !(opts->x_ix86_isa_flags_explicit & OPTION_MASK_ISA_SSE2))
5324 opts->x_ix86_isa_flags |= OPTION_MASK_ISA_SSE2;
5325 if (processor_alias_table[i].flags & PTA_SSE3
5326 && !(opts->x_ix86_isa_flags_explicit & OPTION_MASK_ISA_SSE3))
5327 opts->x_ix86_isa_flags |= OPTION_MASK_ISA_SSE3;
5328 if (processor_alias_table[i].flags & PTA_SSSE3
5329 && !(opts->x_ix86_isa_flags_explicit & OPTION_MASK_ISA_SSSE3))
5330 opts->x_ix86_isa_flags |= OPTION_MASK_ISA_SSSE3;
5331 if (processor_alias_table[i].flags & PTA_SSE4_1
5332 && !(opts->x_ix86_isa_flags_explicit & OPTION_MASK_ISA_SSE4_1))
5333 opts->x_ix86_isa_flags |= OPTION_MASK_ISA_SSE4_1;
5334 if (processor_alias_table[i].flags & PTA_SSE4_2
5335 && !(opts->x_ix86_isa_flags_explicit & OPTION_MASK_ISA_SSE4_2))
5336 opts->x_ix86_isa_flags |= OPTION_MASK_ISA_SSE4_2;
5337 if (processor_alias_table[i].flags & PTA_AVX
5338 && !(opts->x_ix86_isa_flags_explicit & OPTION_MASK_ISA_AVX))
5339 opts->x_ix86_isa_flags |= OPTION_MASK_ISA_AVX;
5340 if (processor_alias_table[i].flags & PTA_AVX2
5341 && !(opts->x_ix86_isa_flags_explicit & OPTION_MASK_ISA_AVX2))
5342 opts->x_ix86_isa_flags |= OPTION_MASK_ISA_AVX2;
5343 if (processor_alias_table[i].flags & PTA_FMA
5344 && !(opts->x_ix86_isa_flags_explicit & OPTION_MASK_ISA_FMA))
5345 opts->x_ix86_isa_flags |= OPTION_MASK_ISA_FMA;
5346 if (processor_alias_table[i].flags & PTA_SSE4A
5347 && !(opts->x_ix86_isa_flags_explicit & OPTION_MASK_ISA_SSE4A))
5348 opts->x_ix86_isa_flags |= OPTION_MASK_ISA_SSE4A;
5349 if (processor_alias_table[i].flags & PTA_FMA4
5350 && !(opts->x_ix86_isa_flags_explicit & OPTION_MASK_ISA_FMA4))
5351 opts->x_ix86_isa_flags |= OPTION_MASK_ISA_FMA4;
5352 if (processor_alias_table[i].flags & PTA_XOP
5353 && !(opts->x_ix86_isa_flags_explicit & OPTION_MASK_ISA_XOP))
5354 opts->x_ix86_isa_flags |= OPTION_MASK_ISA_XOP;
5355 if (processor_alias_table[i].flags & PTA_LWP
5356 && !(opts->x_ix86_isa_flags_explicit & OPTION_MASK_ISA_LWP))
5357 opts->x_ix86_isa_flags |= OPTION_MASK_ISA_LWP;
5358 if (processor_alias_table[i].flags & PTA_ABM
5359 && !(opts->x_ix86_isa_flags_explicit & OPTION_MASK_ISA_ABM))
5360 opts->x_ix86_isa_flags |= OPTION_MASK_ISA_ABM;
5361 if (processor_alias_table[i].flags & PTA_BMI
5362 && !(opts->x_ix86_isa_flags_explicit & OPTION_MASK_ISA_BMI))
5363 opts->x_ix86_isa_flags |= OPTION_MASK_ISA_BMI;
5364 if (processor_alias_table[i].flags & (PTA_LZCNT | PTA_ABM)
5365 && !(opts->x_ix86_isa_flags_explicit & OPTION_MASK_ISA_LZCNT))
5366 opts->x_ix86_isa_flags |= OPTION_MASK_ISA_LZCNT;
5367 if (processor_alias_table[i].flags & PTA_TBM
5368 && !(opts->x_ix86_isa_flags_explicit & OPTION_MASK_ISA_TBM))
5369 opts->x_ix86_isa_flags |= OPTION_MASK_ISA_TBM;
5370 if (processor_alias_table[i].flags & PTA_BMI2
5371 && !(opts->x_ix86_isa_flags_explicit & OPTION_MASK_ISA_BMI2))
5372 opts->x_ix86_isa_flags |= OPTION_MASK_ISA_BMI2;
5373 if (processor_alias_table[i].flags & PTA_CX16
5374 && !(opts->x_ix86_isa_flags_explicit & OPTION_MASK_ISA_CX16))
5375 opts->x_ix86_isa_flags |= OPTION_MASK_ISA_CX16;
5376 if (processor_alias_table[i].flags & (PTA_POPCNT | PTA_ABM)
5377 && !(opts->x_ix86_isa_flags_explicit & OPTION_MASK_ISA_POPCNT))
5378 opts->x_ix86_isa_flags |= OPTION_MASK_ISA_POPCNT;
5379 if (!(TARGET_64BIT_P (opts->x_ix86_isa_flags)
5380 && (processor_alias_table[i].flags & PTA_NO_SAHF))
5381 && !(opts->x_ix86_isa_flags_explicit & OPTION_MASK_ISA_SAHF))
5382 opts->x_ix86_isa_flags |= OPTION_MASK_ISA_SAHF;
5383 if (processor_alias_table[i].flags & PTA_MOVBE
5384 && !(opts->x_ix86_isa_flags_explicit & OPTION_MASK_ISA_MOVBE))
5385 opts->x_ix86_isa_flags |= OPTION_MASK_ISA_MOVBE;
5386 if (processor_alias_table[i].flags & PTA_AES
5387 && !(opts->x_ix86_isa_flags_explicit & OPTION_MASK_ISA_AES))
5388 opts->x_ix86_isa_flags |= OPTION_MASK_ISA_AES;
5389 if (processor_alias_table[i].flags & PTA_SHA
5390 && !(opts->x_ix86_isa_flags_explicit & OPTION_MASK_ISA_SHA))
5391 opts->x_ix86_isa_flags |= OPTION_MASK_ISA_SHA;
5392 if (processor_alias_table[i].flags & PTA_PCLMUL
5393 && !(opts->x_ix86_isa_flags_explicit & OPTION_MASK_ISA_PCLMUL))
5394 opts->x_ix86_isa_flags |= OPTION_MASK_ISA_PCLMUL;
5395 if (processor_alias_table[i].flags & PTA_FSGSBASE
5396 && !(opts->x_ix86_isa_flags_explicit & OPTION_MASK_ISA_FSGSBASE))
5397 opts->x_ix86_isa_flags |= OPTION_MASK_ISA_FSGSBASE;
5398 if (processor_alias_table[i].flags & PTA_RDRND
5399 && !(opts->x_ix86_isa_flags_explicit & OPTION_MASK_ISA_RDRND))
5400 opts->x_ix86_isa_flags |= OPTION_MASK_ISA_RDRND;
5401 if (processor_alias_table[i].flags & PTA_F16C
5402 && !(opts->x_ix86_isa_flags_explicit & OPTION_MASK_ISA_F16C))
5403 opts->x_ix86_isa_flags |= OPTION_MASK_ISA_F16C;
5404 if (processor_alias_table[i].flags & PTA_RTM
5405 && !(opts->x_ix86_isa_flags_explicit & OPTION_MASK_ISA_RTM))
5406 opts->x_ix86_isa_flags |= OPTION_MASK_ISA_RTM;
5407 if (processor_alias_table[i].flags & PTA_HLE
5408 && !(opts->x_ix86_isa_flags_explicit & OPTION_MASK_ISA_HLE))
5409 opts->x_ix86_isa_flags |= OPTION_MASK_ISA_HLE;
5410 if (processor_alias_table[i].flags & PTA_PRFCHW
5411 && !(opts->x_ix86_isa_flags_explicit & OPTION_MASK_ISA_PRFCHW))
5412 opts->x_ix86_isa_flags |= OPTION_MASK_ISA_PRFCHW;
5413 if (processor_alias_table[i].flags & PTA_RDSEED
5414 && !(opts->x_ix86_isa_flags_explicit & OPTION_MASK_ISA_RDSEED))
5415 opts->x_ix86_isa_flags |= OPTION_MASK_ISA_RDSEED;
5416 if (processor_alias_table[i].flags & PTA_ADX
5417 && !(opts->x_ix86_isa_flags_explicit & OPTION_MASK_ISA_ADX))
5418 opts->x_ix86_isa_flags |= OPTION_MASK_ISA_ADX;
5419 if (processor_alias_table[i].flags & PTA_FXSR
5420 && !(opts->x_ix86_isa_flags_explicit & OPTION_MASK_ISA_FXSR))
5421 opts->x_ix86_isa_flags |= OPTION_MASK_ISA_FXSR;
5422 if (processor_alias_table[i].flags & PTA_XSAVE
5423 && !(opts->x_ix86_isa_flags_explicit & OPTION_MASK_ISA_XSAVE))
5424 opts->x_ix86_isa_flags |= OPTION_MASK_ISA_XSAVE;
5425 if (processor_alias_table[i].flags & PTA_XSAVEOPT
5426 && !(opts->x_ix86_isa_flags_explicit & OPTION_MASK_ISA_XSAVEOPT))
5427 opts->x_ix86_isa_flags |= OPTION_MASK_ISA_XSAVEOPT;
5428 if (processor_alias_table[i].flags & PTA_AVX512F
5429 && !(opts->x_ix86_isa_flags_explicit & OPTION_MASK_ISA_AVX512F))
5430 opts->x_ix86_isa_flags |= OPTION_MASK_ISA_AVX512F;
5431 if (processor_alias_table[i].flags & PTA_AVX512ER
5432 && !(opts->x_ix86_isa_flags_explicit & OPTION_MASK_ISA_AVX512ER))
5433 opts->x_ix86_isa_flags |= OPTION_MASK_ISA_AVX512ER;
5434 if (processor_alias_table[i].flags & PTA_AVX512PF
5435 && !(opts->x_ix86_isa_flags_explicit & OPTION_MASK_ISA_AVX512PF))
5436 opts->x_ix86_isa_flags |= OPTION_MASK_ISA_AVX512PF;
5437 if (processor_alias_table[i].flags & PTA_AVX512CD
5438 && !(opts->x_ix86_isa_flags_explicit & OPTION_MASK_ISA_AVX512CD))
5439 opts->x_ix86_isa_flags |= OPTION_MASK_ISA_AVX512CD;
5440 if (processor_alias_table[i].flags & PTA_PREFETCHWT1
5441 && !(opts->x_ix86_isa_flags_explicit & OPTION_MASK_ISA_PREFETCHWT1))
5442 opts->x_ix86_isa_flags |= OPTION_MASK_ISA_PREFETCHWT1;
5443 if (processor_alias_table[i].flags & PTA_CLWB
5444 && !(opts->x_ix86_isa_flags_explicit & OPTION_MASK_ISA_CLWB))
5445 opts->x_ix86_isa_flags |= OPTION_MASK_ISA_CLWB;
5446 if (processor_alias_table[i].flags & PTA_CLFLUSHOPT
5447 && !(opts->x_ix86_isa_flags_explicit & OPTION_MASK_ISA_CLFLUSHOPT))
5448 opts->x_ix86_isa_flags |= OPTION_MASK_ISA_CLFLUSHOPT;
5449 if (processor_alias_table[i].flags & PTA_CLZERO
5450 && !(opts->x_ix86_isa_flags_explicit & OPTION_MASK_ISA_CLZERO))
5451 opts->x_ix86_isa_flags |= OPTION_MASK_ISA_CLZERO;
5452 if (processor_alias_table[i].flags & PTA_XSAVEC
5453 && !(opts->x_ix86_isa_flags_explicit & OPTION_MASK_ISA_XSAVEC))
5454 opts->x_ix86_isa_flags |= OPTION_MASK_ISA_XSAVEC;
5455 if (processor_alias_table[i].flags & PTA_XSAVES
5456 && !(opts->x_ix86_isa_flags_explicit & OPTION_MASK_ISA_XSAVES))
5457 opts->x_ix86_isa_flags |= OPTION_MASK_ISA_XSAVES;
5458 if (processor_alias_table[i].flags & PTA_AVX512DQ
5459 && !(opts->x_ix86_isa_flags_explicit & OPTION_MASK_ISA_AVX512DQ))
5460 opts->x_ix86_isa_flags |= OPTION_MASK_ISA_AVX512DQ;
5461 if (processor_alias_table[i].flags & PTA_AVX512BW
5462 && !(opts->x_ix86_isa_flags_explicit & OPTION_MASK_ISA_AVX512BW))
5463 opts->x_ix86_isa_flags |= OPTION_MASK_ISA_AVX512BW;
5464 if (processor_alias_table[i].flags & PTA_AVX512VL
5465 && !(opts->x_ix86_isa_flags_explicit & OPTION_MASK_ISA_AVX512VL))
5466 opts->x_ix86_isa_flags |= OPTION_MASK_ISA_AVX512VL;
5467 if (processor_alias_table[i].flags & PTA_MPX
5468 && !(opts->x_ix86_isa_flags_explicit & OPTION_MASK_ISA_MPX))
5469 opts->x_ix86_isa_flags |= OPTION_MASK_ISA_MPX;
5470 if (processor_alias_table[i].flags & PTA_AVX512VBMI
5471 && !(opts->x_ix86_isa_flags_explicit & OPTION_MASK_ISA_AVX512VBMI))
5472 opts->x_ix86_isa_flags |= OPTION_MASK_ISA_AVX512VBMI;
5473 if (processor_alias_table[i].flags & PTA_AVX512IFMA
5474 && !(opts->x_ix86_isa_flags_explicit & OPTION_MASK_ISA_AVX512IFMA))
5475 opts->x_ix86_isa_flags |= OPTION_MASK_ISA_AVX512IFMA;
5476 if (processor_alias_table[i].flags & (PTA_PREFETCH_SSE | PTA_SSE))
5477 x86_prefetch_sse = true;
5478 if (processor_alias_table[i].flags & PTA_MWAITX
5479 && !(opts->x_ix86_isa_flags_explicit & OPTION_MASK_ISA_MWAITX))
5480 opts->x_ix86_isa_flags |= OPTION_MASK_ISA_MWAITX;
5481 if (processor_alias_table[i].flags & PTA_PKU
5482 && !(opts->x_ix86_isa_flags_explicit & OPTION_MASK_ISA_PKU))
5483 opts->x_ix86_isa_flags |= OPTION_MASK_ISA_PKU;
5484
5485 /* Don't enable x87 instructions if only
5486 general registers are allowed. */
5487 if (!(opts_set->x_ix86_target_flags & OPTION_MASK_GENERAL_REGS_ONLY)
5488 && !(opts_set->x_target_flags & MASK_80387))
5489 {
5490 if (processor_alias_table[i].flags & PTA_NO_80387)
5491 opts->x_target_flags &= ~MASK_80387;
5492 else
5493 opts->x_target_flags |= MASK_80387;
5494 }
5495 break;
5496 }
5497
5498 if (TARGET_X32 && (opts->x_ix86_isa_flags & OPTION_MASK_ISA_MPX))
5499 error ("Intel MPX does not support x32");
5503
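/* The -march= value did not match any alias table entry: report it and
   suggest the closest valid names.  */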
5504 if (i == pta_size)
5505 {
5506 error (main_args_p
5507 ? "bad value (%qs) for %<-march=%> switch"
5508 : "bad value (%qs) for %<target(\"arch=\")%> attribute",
5509 opts->x_ix86_arch_string);
5510
5511 auto_vec <const char *> candidates;
5512 for (i = 0; i < pta_size; i++)
5513 if (strcmp (processor_alias_table[i].name, "generic")
5514 && strcmp (processor_alias_table[i].name, "intel")
5515 && (!TARGET_64BIT_P (opts->x_ix86_isa_flags)
5516 || (processor_alias_table[i].flags & PTA_64BIT)))
5517 candidates.safe_push (processor_alias_table[i].name);
5518
5519 char *s;
5520 const char *hint
5521 = candidates_list_and_hint (opts->x_ix86_arch_string, s, candidates);
5522 if (hint)
5523 inform (input_location,
5524 main_args_p
5525 ? "valid arguments to %<-march=%> switch are: "
5526 "%s; did you mean %qs?"
5527 : "valid arguments to %<target(\"arch=\")%> attribute are: "
5528 "%s; did you mean %qs?", s, hint);
5529 else
5530 inform (input_location,
5531 main_args_p
5532 ? "valid arguments to %<-march=%> switch are: %s"
5533 : "valid arguments to %<target(\"arch=\")%> attribute are: %s",
5534 s);
5535 XDELETEVEC (s);
5536 }
5537
5538 ix86_arch_mask = 1u << ix86_arch;
5539 for (i = 0; i < X86_ARCH_LAST; ++i)
5540 ix86_arch_features[i] = !!(initial_ix86_arch_features[i] & ix86_arch_mask);
5541
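/* Look up the -mtune= name in the alias table to select the scheduling
   model and tuning target.  */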
5542 for (i = 0; i < pta_size; i++)
5543 if (! strcmp (opts->x_ix86_tune_string, processor_alias_table[i].name))
5544 {
5545 ix86_schedule = processor_alias_table[i].schedule;
5546 ix86_tune = processor_alias_table[i].processor;
5547 if (TARGET_64BIT_P (opts->x_ix86_isa_flags))
5548 {
5549 if (!(processor_alias_table[i].flags & PTA_64BIT))
5550 {
5551 if (ix86_tune_defaulted)
5552 {
5553 opts->x_ix86_tune_string = "x86-64";
5554 for (i = 0; i < pta_size; i++)
5555 if (! strcmp (opts->x_ix86_tune_string,
5556 processor_alias_table[i].name))
5557 break;
5558 ix86_schedule = processor_alias_table[i].schedule;
5559 ix86_tune = processor_alias_table[i].processor;
5560 }
5561 else
5562 error ("CPU you selected does not support x86-64 "
5563 "instruction set");
5564 }
5565 }
5566 /* Intel CPUs have always interpreted SSE prefetch instructions as
5567 NOPs; so, we can enable SSE prefetch instructions even when
5568 -mtune (rather than -march) points us to a processor that has them.
5569 However, the VIA C3 gives a SIGILL, so we only do that for i686 and
5570 higher processors. */
5571 if (TARGET_CMOV
5572 && (processor_alias_table[i].flags & (PTA_PREFETCH_SSE | PTA_SSE)))
5573 x86_prefetch_sse = true;
5574 break;
5575 }
5576
5577 if (ix86_tune_specified && i == pta_size)
5578 {
5579 error (main_args_p
5580 ? "bad value (%qs) for %<-mtune=%> switch"
5581 : "bad value (%qs) for %<target(\"tune=\")%> attribute",
5582 opts->x_ix86_tune_string);
5583
5584 auto_vec <const char *> candidates;
5585 for (i = 0; i < pta_size; i++)
5586 if (!TARGET_64BIT_P (opts->x_ix86_isa_flags)
5587 || (processor_alias_table[i].flags & PTA_64BIT))
5588 candidates.safe_push (processor_alias_table[i].name);
5589
5590 char *s;
5591 const char *hint
5592 = candidates_list_and_hint (opts->x_ix86_tune_string, s, candidates);
5593 if (hint)
5594 inform (input_location,
5595 main_args_p
5596 ? "valid arguments to %<-mtune=%> switch are: "
5597 "%s; did you mean %qs?"
5598 : "valid arguments to %<target(\"tune=\")%> attribute are: "
5599 "%s; did you mean %qs?", s, hint);
5600 else
5601 inform (input_location,
5602 main_args_p
5603 ? "valid arguments to %<-mtune=%> switch are: %s"
5604 : "valid arguments to %<target(\"tune=\")%> attribute are: %s",
5605 s);
5606 XDELETEVEC (s);
5607 }
5608
5609 set_ix86_tune_features (ix86_tune, opts->x_ix86_dump_tunes);
5610
5611 #ifndef USE_IX86_FRAME_POINTER
5612 #define USE_IX86_FRAME_POINTER 0
5613 #endif
5614
5615 #ifndef USE_X86_64_FRAME_POINTER
5616 #define USE_X86_64_FRAME_POINTER 0
5617 #endif
5618
5619 /* Set the default values for switches whose default depends on TARGET_64BIT
5620 in case they weren't overwritten by command line options. */
5621 if (TARGET_64BIT_P (opts->x_ix86_isa_flags))
5622 {
5623 if (opts->x_optimize >= 1 && !opts_set->x_flag_omit_frame_pointer)
5624 opts->x_flag_omit_frame_pointer = !USE_X86_64_FRAME_POINTER;
5625 if (opts->x_flag_asynchronous_unwind_tables
5626 && !opts_set->x_flag_unwind_tables
5627 && TARGET_64BIT_MS_ABI)
5628 opts->x_flag_unwind_tables = 1;
5629 if (opts->x_flag_asynchronous_unwind_tables == 2)
5630 opts->x_flag_unwind_tables
5631 = opts->x_flag_asynchronous_unwind_tables = 1;
5632 if (opts->x_flag_pcc_struct_return == 2)
5633 opts->x_flag_pcc_struct_return = 0;
5634 }
5635 else
5636 {
5637 if (opts->x_optimize >= 1 && !opts_set->x_flag_omit_frame_pointer)
5638 opts->x_flag_omit_frame_pointer
5639 = !(USE_IX86_FRAME_POINTER || opts->x_optimize_size);
5640 if (opts->x_flag_asynchronous_unwind_tables == 2)
5641 opts->x_flag_asynchronous_unwind_tables = !USE_IX86_FRAME_POINTER;
5642 if (opts->x_flag_pcc_struct_return == 2)
5643 {
5644 /* Intel MCU psABI specifies that -freg-struct-return should
5645 be on. Instead of setting DEFAULT_PCC_STRUCT_RETURN to 1,
5646 we check -miamcu so that -freg-struct-return is always
5647 turned on if -miamcu is used. */
5648 if (TARGET_IAMCU_P (opts->x_target_flags))
5649 opts->x_flag_pcc_struct_return = 0;
5650 else
5651 opts->x_flag_pcc_struct_return = DEFAULT_PCC_STRUCT_RETURN;
5652 }
5653 }
5654
5655 ix86_tune_cost = processor_target_table[ix86_tune].cost;
5656 /* TODO: ix86_cost should be chosen at instruction or function granularity
5657 so that for cold code we can use size_cost even in !optimize_size compilations. */
5658 if (opts->x_optimize_size)
5659 ix86_cost = &ix86_size_cost;
5660 else
5661 ix86_cost = ix86_tune_cost;
5662
5663 /* Arrange to set up i386_stack_locals for all functions. */
5664 init_machine_status = ix86_init_machine_status;
5665
5666 /* Validate -mregparm= value. */
5667 if (opts_set->x_ix86_regparm)
5668 {
5669 if (TARGET_64BIT_P (opts->x_ix86_isa_flags))
5670 warning (0, "-mregparm is ignored in 64-bit mode");
5671 else if (TARGET_IAMCU_P (opts->x_target_flags))
5672 warning (0, "-mregparm is ignored for Intel MCU psABI");
5673 if (opts->x_ix86_regparm > REGPARM_MAX)
5674 {
5675 error ("-mregparm=%d is not between 0 and %d",
5676 opts->x_ix86_regparm, REGPARM_MAX);
5677 opts->x_ix86_regparm = 0;
5678 }
5679 }
5680 if (TARGET_IAMCU_P (opts->x_target_flags)
5681 || TARGET_64BIT_P (opts->x_ix86_isa_flags))
5682 opts->x_ix86_regparm = REGPARM_MAX;
5683
5684 /* Default align_* from the processor table. */
5685 ix86_default_align (opts);
5686
5687 /* Provide default for -mbranch-cost= value. */
5688 if (!opts_set->x_ix86_branch_cost)
5689 opts->x_ix86_branch_cost = ix86_tune_cost->branch_cost;
5690
5691 if (TARGET_64BIT_P (opts->x_ix86_isa_flags))
5692 {
5693 opts->x_target_flags
5694 |= TARGET_SUBTARGET64_DEFAULT & ~opts_set->x_target_flags;
5695
5696 /* Enable by default the SSE and MMX builtins. Do allow the user to
5697 explicitly disable any of these. In particular, disabling SSE and
5698 MMX for kernel code is extremely useful. */
5699 if (!ix86_arch_specified)
5700 opts->x_ix86_isa_flags
5701 |= ((OPTION_MASK_ISA_SSE2 | OPTION_MASK_ISA_SSE | OPTION_MASK_ISA_MMX
5702 | TARGET_SUBTARGET64_ISA_DEFAULT)
5703 & ~opts->x_ix86_isa_flags_explicit);
5704
5705 if (TARGET_RTD_P (opts->x_target_flags))
5706 warning (0,
5707 main_args_p ? "%<-mrtd%> is ignored in 64-bit mode"
5708 : "%<target(\"rtd\")%> is ignored in 64-bit mode");
5709 }
5710 else
5711 {
5712 opts->x_target_flags
5713 |= TARGET_SUBTARGET32_DEFAULT & ~opts_set->x_target_flags;
5714
5715 if (!ix86_arch_specified)
5716 opts->x_ix86_isa_flags
5717 |= TARGET_SUBTARGET32_ISA_DEFAULT & ~opts->x_ix86_isa_flags_explicit;
5718
5719 /* The i386 ABI does not specify a red zone. It still makes sense to use
5720 one when the programmer takes care to keep the stack from being destroyed. */
5721 if (!(opts_set->x_target_flags & MASK_NO_RED_ZONE))
5722 opts->x_target_flags |= MASK_NO_RED_ZONE;
5723 }
5724
5725 /* Keep nonleaf frame pointers. */
5726 if (opts->x_flag_omit_frame_pointer)
5727 opts->x_target_flags &= ~MASK_OMIT_LEAF_FRAME_POINTER;
5728 else if (TARGET_OMIT_LEAF_FRAME_POINTER_P (opts->x_target_flags))
5729 opts->x_flag_omit_frame_pointer = 1;
5730
5731 /* If we're doing fast math, we don't care about comparison order
5732 wrt NaNs. This lets us use a shorter comparison sequence. */
5733 if (opts->x_flag_finite_math_only)
5734 opts->x_target_flags &= ~MASK_IEEE_FP;
5735
5736 /* If the architecture always has an FPU, turn off NO_FANCY_MATH_387,
5737 since the insns won't need emulation. */
5738 if (ix86_tune_features [X86_TUNE_ALWAYS_FANCY_MATH_387])
5739 opts->x_target_flags &= ~MASK_NO_FANCY_MATH_387;
5740
5741 /* Likewise, if the target doesn't have a 387, or we've specified
5742 software floating point, don't use 387 inline intrinsics. */
5743 if (!TARGET_80387_P (opts->x_target_flags))
5744 opts->x_target_flags |= MASK_NO_FANCY_MATH_387;
5745
5746 /* Turn on MMX builtins for -msse. */
5747 if (TARGET_SSE_P (opts->x_ix86_isa_flags))
5748 opts->x_ix86_isa_flags
5749 |= OPTION_MASK_ISA_MMX & ~opts->x_ix86_isa_flags_explicit;
5750
5751 /* Enable SSE prefetch. */
5752 if (TARGET_SSE_P (opts->x_ix86_isa_flags)
5753 || (TARGET_PRFCHW_P (opts->x_ix86_isa_flags)
5754 && !TARGET_3DNOW_P (opts->x_ix86_isa_flags))
5755 || TARGET_PREFETCHWT1_P (opts->x_ix86_isa_flags))
5756 x86_prefetch_sse = true;
5757
5758 /* Enable popcnt instruction for -msse4.2 or -mabm. */
5759 if (TARGET_SSE4_2_P (opts->x_ix86_isa_flags)
5760 || TARGET_ABM_P (opts->x_ix86_isa_flags))
5761 opts->x_ix86_isa_flags
5762 |= OPTION_MASK_ISA_POPCNT & ~opts->x_ix86_isa_flags_explicit;
5763
5764 /* Enable lzcnt instruction for -mabm. */
5765 if (TARGET_ABM_P(opts->x_ix86_isa_flags))
5766 opts->x_ix86_isa_flags
5767 |= OPTION_MASK_ISA_LZCNT & ~opts->x_ix86_isa_flags_explicit;
5768
5769 /* Validate -mpreferred-stack-boundary= value or default it to
5770 PREFERRED_STACK_BOUNDARY_DEFAULT. */
5771 ix86_preferred_stack_boundary = PREFERRED_STACK_BOUNDARY_DEFAULT;
5772 if (opts_set->x_ix86_preferred_stack_boundary_arg)
5773 {
5774 int min = (TARGET_64BIT_P (opts->x_ix86_isa_flags)
5775 ? (TARGET_SSE_P (opts->x_ix86_isa_flags) ? 4 : 3) : 2);
5776 int max = (TARGET_SEH ? 4 : 12);
5777
5778 if (opts->x_ix86_preferred_stack_boundary_arg < min
5779 || opts->x_ix86_preferred_stack_boundary_arg > max)
5780 {
5781 if (min == max)
5782 error ("-mpreferred-stack-boundary is not supported "
5783 "for this target");
5784 else
5785 error ("-mpreferred-stack-boundary=%d is not between %d and %d",
5786 opts->x_ix86_preferred_stack_boundary_arg, min, max);
5787 }
5788 else
5789 ix86_preferred_stack_boundary
5790 = (1 << opts->x_ix86_preferred_stack_boundary_arg) * BITS_PER_UNIT;
5791 }
5792
5793 /* Set the default value for -mstackrealign. */
5794 if (opts->x_ix86_force_align_arg_pointer == -1)
5795 opts->x_ix86_force_align_arg_pointer = STACK_REALIGN_DEFAULT;
5796
5797 ix86_default_incoming_stack_boundary = PREFERRED_STACK_BOUNDARY;
5798
5799 /* Validate -mincoming-stack-boundary= value or default it to
5800 MIN_STACK_BOUNDARY/PREFERRED_STACK_BOUNDARY. */
5801 ix86_incoming_stack_boundary = ix86_default_incoming_stack_boundary;
5802 if (opts_set->x_ix86_incoming_stack_boundary_arg)
5803 {
5804 int min = TARGET_64BIT_P (opts->x_ix86_isa_flags) ? 3 : 2;
5805
5806 if (opts->x_ix86_incoming_stack_boundary_arg < min
5807 || opts->x_ix86_incoming_stack_boundary_arg > 12)
5808 error ("-mincoming-stack-boundary=%d is not between %d and 12",
5809 opts->x_ix86_incoming_stack_boundary_arg, min);
5810 else
5811 {
5812 ix86_user_incoming_stack_boundary
5813 = (1 << opts->x_ix86_incoming_stack_boundary_arg) * BITS_PER_UNIT;
5814 ix86_incoming_stack_boundary
5815 = ix86_user_incoming_stack_boundary;
5816 }
5817 }
5818
5819 #ifndef NO_PROFILE_COUNTERS
5820 if (flag_nop_mcount)
5821 error ("-mnop-mcount is not compatible with this target");
5822 #endif
5823 if (flag_nop_mcount && flag_pic)
5824 error ("-mnop-mcount is not implemented for -fPIC");
5825
5826 /* Accept -msseregparm only if at least SSE support is enabled. */
5827 if (TARGET_SSEREGPARM_P (opts->x_target_flags)
5828 && ! TARGET_SSE_P (opts->x_ix86_isa_flags))
5829 error (main_args_p
5830 ? "%<-msseregparm%> used without SSE enabled"
5831 : "%<target(\"sseregparm\")%> used without SSE enabled");
5832
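/* Reconcile the requested -mfpmath= setting with the enabled ISAs, falling
   back to the unit that is actually available, or pick a default below.  */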
5833 if (opts_set->x_ix86_fpmath)
5834 {
5835 if (opts->x_ix86_fpmath & FPMATH_SSE)
5836 {
5837 if (!TARGET_SSE_P (opts->x_ix86_isa_flags))
5838 {
5839 if (TARGET_80387_P (opts->x_target_flags))
5840 {
5841 warning (0, "SSE instruction set disabled, using 387 arithmetics");
5842 opts->x_ix86_fpmath = FPMATH_387;
5843 }
5844 }
5845 else if ((opts->x_ix86_fpmath & FPMATH_387)
5846 && !TARGET_80387_P (opts->x_target_flags))
5847 {
5848 warning (0, "387 instruction set disabled, using SSE arithmetics");
5849 opts->x_ix86_fpmath = FPMATH_SSE;
5850 }
5851 }
5852 }
5853 /* For all chips supporting SSE2, -mfpmath=sse performs better than
5854 -mfpmath=387. The latter is nevertheless the default on many targets,
5855 since the extra 80-bit precision of temporaries is considered part of the ABI.
5856 Override the default at least for -ffast-math.
5857 TODO: -mfpmath=both seems to produce equally fast code with slightly
5858 smaller binaries. It is however not clear whether register allocation is
5859 ready for this setting.
5860 Also, -mfpmath=387 code is overall considerably more compact (about 4-5%)
5861 than SSE codegen, so we may want to switch to 387 with -ffast-math for
5862 size-optimized functions. */
5863 else if (fast_math_flags_set_p (&global_options)
5864 && TARGET_SSE2_P (opts->x_ix86_isa_flags))
5865 opts->x_ix86_fpmath = FPMATH_SSE;
5866 else
5867 opts->x_ix86_fpmath = TARGET_FPMATH_DEFAULT_P (opts->x_ix86_isa_flags);
5868
5869 /* Use external vectorized library in vectorizing intrinsics. */
5870 if (opts_set->x_ix86_veclibabi_type)
5871 switch (opts->x_ix86_veclibabi_type)
5872 {
5873 case ix86_veclibabi_type_svml:
5874 ix86_veclib_handler = ix86_veclibabi_svml;
5875 break;
5876
5877 case ix86_veclibabi_type_acml:
5878 ix86_veclib_handler = ix86_veclibabi_acml;
5879 break;
5880
5881 default:
5882 gcc_unreachable ();
5883 }
5884
5885 if (ix86_tune_features [X86_TUNE_ACCUMULATE_OUTGOING_ARGS]
5886 && !(opts_set->x_target_flags & MASK_ACCUMULATE_OUTGOING_ARGS))
5887 opts->x_target_flags |= MASK_ACCUMULATE_OUTGOING_ARGS;
5888
5889 /* If stack probes are required, the space used for large function
5890 arguments on the stack must also be probed, so enable
5891 -maccumulate-outgoing-args so this happens in the prologue. */
5892 if (TARGET_STACK_PROBE_P (opts->x_target_flags)
5893 && !(opts->x_target_flags & MASK_ACCUMULATE_OUTGOING_ARGS))
5894 {
5895 if (opts_set->x_target_flags & MASK_ACCUMULATE_OUTGOING_ARGS)
5896 warning (0,
5897 main_args_p
5898 ? "stack probing requires %<-maccumulate-outgoing-args%> "
5899 "for correctness"
5900 : "stack probing requires "
5901 "%<target(\"accumulate-outgoing-args\")%> for correctness");
5902 opts->x_target_flags |= MASK_ACCUMULATE_OUTGOING_ARGS;
5903 }
5904
5905 /* Stack realignment without -maccumulate-outgoing-args requires %ebp,
5906 so enable -maccumulate-outgoing-args when %ebp is fixed. */
5907 if (fixed_regs[BP_REG]
5908 && !(opts->x_target_flags & MASK_ACCUMULATE_OUTGOING_ARGS))
5909 {
5910 if (opts_set->x_target_flags & MASK_ACCUMULATE_OUTGOING_ARGS)
5911 warning (0,
5912 main_args_p
5913 ? "fixed ebp register requires %<-maccumulate-outgoing-args%>"
5914 : "fixed ebp register requires "
5915 "%<target(\"accumulate-outgoing-args\")%>");
5916 opts->x_target_flags |= MASK_ACCUMULATE_OUTGOING_ARGS;
5917 }
5918
5919 /* Figure out what ASM_GENERATE_INTERNAL_LABEL builds as a prefix. */
5920 {
5921 char *p;
5922 ASM_GENERATE_INTERNAL_LABEL (internal_label_prefix, "LX", 0);
5923 p = strchr (internal_label_prefix, 'X');
5924 internal_label_prefix_len = p - internal_label_prefix;
5925 *p = '\0';
5926 }
5927
5928 /* When no scheduling description is available, disable the scheduler pass
5929 so it won't slow down compilation and make x87 code slower. */
5930 if (!TARGET_SCHEDULE)
5931 opts->x_flag_schedule_insns_after_reload = opts->x_flag_schedule_insns = 0;
5932
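/* Seed the prefetch and cache-size --param defaults from the tuning cost
   table unless the user has already set them.  */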
5933 maybe_set_param_value (PARAM_SIMULTANEOUS_PREFETCHES,
5934 ix86_tune_cost->simultaneous_prefetches,
5935 opts->x_param_values,
5936 opts_set->x_param_values);
5937 maybe_set_param_value (PARAM_L1_CACHE_LINE_SIZE,
5938 ix86_tune_cost->prefetch_block,
5939 opts->x_param_values,
5940 opts_set->x_param_values);
5941 maybe_set_param_value (PARAM_L1_CACHE_SIZE,
5942 ix86_tune_cost->l1_cache_size,
5943 opts->x_param_values,
5944 opts_set->x_param_values);
5945 maybe_set_param_value (PARAM_L2_CACHE_SIZE,
5946 ix86_tune_cost->l2_cache_size,
5947 opts->x_param_values,
5948 opts_set->x_param_values);
5949
5950 /* Restrict number of if-converted SET insns to 1. */
5951 if (TARGET_ONE_IF_CONV_INSN)
5952 maybe_set_param_value (PARAM_MAX_RTL_IF_CONVERSION_INSNS,
5953 1,
5954 opts->x_param_values,
5955 opts_set->x_param_values);
5956
5957 /* Enable software prefetching at -O3 for CPUs where prefetching is helpful. */
5958 if (opts->x_flag_prefetch_loop_arrays < 0
5959 && HAVE_prefetch
5960 && (opts->x_optimize >= 3 || opts->x_flag_profile_use)
5961 && !opts->x_optimize_size
5962 && TARGET_SOFTWARE_PREFETCHING_BENEFICIAL)
5963 opts->x_flag_prefetch_loop_arrays = 1;
5964
5965 /* If using typedef char *va_list, signal that __builtin_va_start (&ap, 0)
5966 can be optimized to ap = __builtin_next_arg (0). */
5967 if (!TARGET_64BIT_P (opts->x_ix86_isa_flags) && !opts->x_flag_split_stack)
5968 targetm.expand_builtin_va_start = NULL;
5969
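/* Select the RTL generator functions that match the target word size and
   pointer mode.  */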
5970 if (TARGET_64BIT_P (opts->x_ix86_isa_flags))
5971 {
5972 ix86_gen_leave = gen_leave_rex64;
5973 if (Pmode == DImode)
5974 {
5975 ix86_gen_tls_global_dynamic_64 = gen_tls_global_dynamic_64_di;
5976 ix86_gen_tls_local_dynamic_base_64
5977 = gen_tls_local_dynamic_base_64_di;
5978 }
5979 else
5980 {
5981 ix86_gen_tls_global_dynamic_64 = gen_tls_global_dynamic_64_si;
5982 ix86_gen_tls_local_dynamic_base_64
5983 = gen_tls_local_dynamic_base_64_si;
5984 }
5985 }
5986 else
5987 ix86_gen_leave = gen_leave;
5988
5989 if (Pmode == DImode)
5990 {
5991 ix86_gen_add3 = gen_adddi3;
5992 ix86_gen_sub3 = gen_subdi3;
5993 ix86_gen_sub3_carry = gen_subdi3_carry;
5994 ix86_gen_one_cmpl2 = gen_one_cmpldi2;
5995 ix86_gen_andsp = gen_anddi3;
5996 ix86_gen_allocate_stack_worker = gen_allocate_stack_worker_probe_di;
5997 ix86_gen_adjust_stack_and_probe = gen_adjust_stack_and_probedi;
5998 ix86_gen_probe_stack_range = gen_probe_stack_rangedi;
5999 ix86_gen_monitor = gen_sse3_monitor_di;
6000 ix86_gen_monitorx = gen_monitorx_di;
6001 ix86_gen_clzero = gen_clzero_di;
6002 }
6003 else
6004 {
6005 ix86_gen_add3 = gen_addsi3;
6006 ix86_gen_sub3 = gen_subsi3;
6007 ix86_gen_sub3_carry = gen_subsi3_carry;
6008 ix86_gen_one_cmpl2 = gen_one_cmplsi2;
6009 ix86_gen_andsp = gen_andsi3;
6010 ix86_gen_allocate_stack_worker = gen_allocate_stack_worker_probe_si;
6011 ix86_gen_adjust_stack_and_probe = gen_adjust_stack_and_probesi;
6012 ix86_gen_probe_stack_range = gen_probe_stack_rangesi;
6013 ix86_gen_monitor = gen_sse3_monitor_si;
6014 ix86_gen_monitorx = gen_monitorx_si;
6015 ix86_gen_clzero = gen_clzero_si;
6016 }
6017
6018 #ifdef USE_IX86_CLD
6019 /* Use -mcld by default for 32-bit code if configured with --enable-cld. */
6020 if (!TARGET_64BIT_P (opts->x_ix86_isa_flags))
6021 opts->x_target_flags |= MASK_CLD & ~opts_set->x_target_flags;
6022 #endif
6023
6024 if (!TARGET_64BIT_P (opts->x_ix86_isa_flags) && opts->x_flag_pic)
6025 {
6026 if (opts->x_flag_fentry > 0)
6027 sorry ("-mfentry isn%'t supported for 32-bit in combination "
6028 "with -fpic");
6029 opts->x_flag_fentry = 0;
6030 }
6031 else if (TARGET_SEH)
6032 {
6033 if (opts->x_flag_fentry == 0)
6034 sorry ("-mno-fentry isn%'t compatible with SEH");
6035 opts->x_flag_fentry = 1;
6036 }
6037 else if (opts->x_flag_fentry < 0)
6038 {
6039 #if defined(PROFILE_BEFORE_PROLOGUE)
6040 opts->x_flag_fentry = 1;
6041 #else
6042 opts->x_flag_fentry = 0;
6043 #endif
6044 }
6045
6046 if (!(opts_set->x_target_flags & MASK_VZEROUPPER))
6047 opts->x_target_flags |= MASK_VZEROUPPER;
6048 if (!(opts_set->x_target_flags & MASK_STV))
6049 opts->x_target_flags |= MASK_STV;
6050 /* Disable STV if -mpreferred-stack-boundary={2,3} or
6051 -mincoming-stack-boundary={2,3} or -mstackrealign is used - the needed
6052 stack realignment is an extra cost the pass doesn't take into
6053 account, and the pass can't realign the stack. */
6054 if (ix86_preferred_stack_boundary < 128
6055 || ix86_incoming_stack_boundary < 128
6056 || opts->x_ix86_force_align_arg_pointer)
6057 opts->x_target_flags &= ~MASK_STV;
6058 if (!ix86_tune_features[X86_TUNE_AVX256_UNALIGNED_LOAD_OPTIMAL]
6059 && !(opts_set->x_target_flags & MASK_AVX256_SPLIT_UNALIGNED_LOAD))
6060 opts->x_target_flags |= MASK_AVX256_SPLIT_UNALIGNED_LOAD;
6061 if (!ix86_tune_features[X86_TUNE_AVX256_UNALIGNED_STORE_OPTIMAL]
6062 && !(opts_set->x_target_flags & MASK_AVX256_SPLIT_UNALIGNED_STORE))
6063 opts->x_target_flags |= MASK_AVX256_SPLIT_UNALIGNED_STORE;
6064 /* Enable 128-bit AVX instruction generation
6065 for the auto-vectorizer. */
6066 if (TARGET_AVX128_OPTIMAL
6067 && !(opts_set->x_target_flags & MASK_PREFER_AVX128))
6068 opts->x_target_flags |= MASK_PREFER_AVX128;
6069
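/* Parse the comma-separated -mrecip= list; a leading '!' on an entry
   clears the named mask instead of setting it.  */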
6070 if (opts->x_ix86_recip_name)
6071 {
6072 char *p = ASTRDUP (opts->x_ix86_recip_name);
6073 char *q;
6074 unsigned int mask, i;
6075 bool invert;
6076
6077 while ((q = strtok (p, ",")) != NULL)
6078 {
6079 p = NULL;
6080 if (*q == '!')
6081 {
6082 invert = true;
6083 q++;
6084 }
6085 else
6086 invert = false;
6087
6088 if (!strcmp (q, "default"))
6089 mask = RECIP_MASK_ALL;
6090 else
6091 {
6092 for (i = 0; i < ARRAY_SIZE (recip_options); i++)
6093 if (!strcmp (q, recip_options[i].string))
6094 {
6095 mask = recip_options[i].mask;
6096 break;
6097 }
6098
6099 if (i == ARRAY_SIZE (recip_options))
6100 {
6101 error ("unknown option for -mrecip=%s", q);
6102 invert = false;
6103 mask = RECIP_MASK_NONE;
6104 }
6105 }
6106
6107 opts->x_recip_mask_explicit |= mask;
6108 if (invert)
6109 opts->x_recip_mask &= ~mask;
6110 else
6111 opts->x_recip_mask |= mask;
6112 }
6113 }
6114
6115 if (TARGET_RECIP_P (opts->x_target_flags))
6116 opts->x_recip_mask |= RECIP_MASK_ALL & ~opts->x_recip_mask_explicit;
6117 else if (opts_set->x_target_flags & MASK_RECIP)
6118 opts->x_recip_mask &= ~(RECIP_MASK_ALL & ~opts->x_recip_mask_explicit);
6119
6120 /* Default long double to 64-bit for 32-bit Bionic and to __float128
6121 for 64-bit Bionic. Also default long double to 64-bit for Intel
6122 MCU psABI. */
6123 if ((TARGET_HAS_BIONIC || TARGET_IAMCU)
6124 && !(opts_set->x_target_flags
6125 & (MASK_LONG_DOUBLE_64 | MASK_LONG_DOUBLE_128)))
6126 opts->x_target_flags |= (TARGET_64BIT
6127 ? MASK_LONG_DOUBLE_128
6128 : MASK_LONG_DOUBLE_64);
6129
6130 /* Only one of them can be active. */
6131 gcc_assert ((opts->x_target_flags & MASK_LONG_DOUBLE_64) == 0
6132 || (opts->x_target_flags & MASK_LONG_DOUBLE_128) == 0);
6133
6134 /* Save the initial options in case the user does function specific
6135 options. */
6136 if (main_args_p)
6137 target_option_default_node = target_option_current_node
6138 = build_target_option_node (opts);
6139
6140 /* Handle stack protector */
6141 if (!opts_set->x_ix86_stack_protector_guard)
6142 opts->x_ix86_stack_protector_guard
6143 = TARGET_HAS_BIONIC ? SSP_GLOBAL : SSP_TLS;
6144
6145 /* Handle -mmemcpy-strategy= and -mmemset-strategy= */
6146 if (opts->x_ix86_tune_memcpy_strategy)
6147 {
6148 char *str = xstrdup (opts->x_ix86_tune_memcpy_strategy);
6149 ix86_parse_stringop_strategy_string (str, false);
6150 free (str);
6151 }
6152
6153 if (opts->x_ix86_tune_memset_strategy)
6154 {
6155 char *str = xstrdup (opts->x_ix86_tune_memset_strategy);
6156 ix86_parse_stringop_strategy_string (str, true);
6157 free (str);
6158 }
6159
6160 return true;
6161 }
6162
6163 /* Implement the TARGET_OPTION_OVERRIDE hook. */
6164
6165 static void
6166 ix86_option_override (void)
6167 {
6168 ix86_option_override_internal (true, &global_options, &global_options_set);
6169 }
6170
6171 /* Implement the TARGET_OFFLOAD_OPTIONS hook. */
6172 static char *
6173 ix86_offload_options (void)
6174 {
6175 if (TARGET_LP64)
6176 return xstrdup ("-foffload-abi=lp64");
6177 return xstrdup ("-foffload-abi=ilp32");
6178 }
6179
6180 /* Update register usage after having seen the compiler flags. */
6181
6182 static void
6183 ix86_conditional_register_usage (void)
6184 {
6185 int i, c_mask;
6186
6187 /* If there are no caller-saved registers, preserve all registers
6188 except fixed_regs and the registers used for the function return value,
6189 since aggregate_value_p checks call_used_regs[regno] on the return
6190 value. */
6191 if (cfun && cfun->machine->no_caller_saved_registers)
6192 for (i = 0; i < FIRST_PSEUDO_REGISTER; i++)
6193 if (!fixed_regs[i] && !ix86_function_value_regno_p (i))
6194 call_used_regs[i] = 0;
6195
6196 /* For 32-bit targets, squash the REX registers. */
6197 if (! TARGET_64BIT)
6198 {
6199 for (i = FIRST_REX_INT_REG; i <= LAST_REX_INT_REG; i++)
6200 fixed_regs[i] = call_used_regs[i] = 1, reg_names[i] = "";
6201 for (i = FIRST_REX_SSE_REG; i <= LAST_REX_SSE_REG; i++)
6202 fixed_regs[i] = call_used_regs[i] = 1, reg_names[i] = "";
6203 for (i = FIRST_EXT_REX_SSE_REG; i <= LAST_EXT_REX_SSE_REG; i++)
6204 fixed_regs[i] = call_used_regs[i] = 1, reg_names[i] = "";
6205 }
6206
6207 /* See the definition of CALL_USED_REGISTERS in i386.h. */
6208 c_mask = CALL_USED_REGISTERS_MASK (TARGET_64BIT_MS_ABI);
6209
6210 CLEAR_HARD_REG_SET (reg_class_contents[(int)CLOBBERED_REGS]);
6211
6212 for (i = 0; i < FIRST_PSEUDO_REGISTER; i++)
6213 {
6214 /* Set/reset conditionally defined registers from
6215 CALL_USED_REGISTERS initializer. */
6216 if (call_used_regs[i] > 1)
6217 call_used_regs[i] = !!(call_used_regs[i] & c_mask);
6218
6219 /* Calculate registers of CLOBBERED_REGS register set
6220 as call used registers from GENERAL_REGS register set. */
6221 if (TEST_HARD_REG_BIT (reg_class_contents[(int)GENERAL_REGS], i)
6222 && call_used_regs[i])
6223 SET_HARD_REG_BIT (reg_class_contents[(int)CLOBBERED_REGS], i);
6224 }
6225
6226 /* If MMX is disabled, squash the registers. */
6227 if (! TARGET_MMX)
6228 for (i = 0; i < FIRST_PSEUDO_REGISTER; i++)
6229 if (TEST_HARD_REG_BIT (reg_class_contents[(int)MMX_REGS], i))
6230 fixed_regs[i] = call_used_regs[i] = 1, reg_names[i] = "";
6231
6232 /* If SSE is disabled, squash the registers. */
6233 if (! TARGET_SSE)
6234 for (i = 0; i < FIRST_PSEUDO_REGISTER; i++)
6235 if (TEST_HARD_REG_BIT (reg_class_contents[(int)SSE_REGS], i))
6236 fixed_regs[i] = call_used_regs[i] = 1, reg_names[i] = "";
6237
6238 /* If the FPU is disabled, squash the registers. */
6239 if (! (TARGET_80387 || TARGET_FLOAT_RETURNS_IN_80387))
6240 for (i = 0; i < FIRST_PSEUDO_REGISTER; i++)
6241 if (TEST_HARD_REG_BIT (reg_class_contents[(int)FLOAT_REGS], i))
6242 fixed_regs[i] = call_used_regs[i] = 1, reg_names[i] = "";
6243
6244 /* If AVX512F is disabled, squash the registers. */
6245 if (! TARGET_AVX512F)
6246 {
6247 for (i = FIRST_EXT_REX_SSE_REG; i <= LAST_EXT_REX_SSE_REG; i++)
6248 fixed_regs[i] = call_used_regs[i] = 1, reg_names[i] = "";
6249
6250 for (i = FIRST_MASK_REG; i <= LAST_MASK_REG; i++)
6251 fixed_regs[i] = call_used_regs[i] = 1, reg_names[i] = "";
6252 }
6253
6254 /* If MPX is disabled, squash the registers. */
6255 if (! TARGET_MPX)
6256 for (i = FIRST_BND_REG; i <= LAST_BND_REG; i++)
6257 fixed_regs[i] = call_used_regs[i] = 1, reg_names[i] = "";
6258 }
6259
6260 \f
6261 /* Save the current options */
6262
6263 static void
6264 ix86_function_specific_save (struct cl_target_option *ptr,
6265 struct gcc_options *opts)
6266 {
6267 ptr->arch = ix86_arch;
6268 ptr->schedule = ix86_schedule;
6269 ptr->prefetch_sse = x86_prefetch_sse;
6270 ptr->tune = ix86_tune;
6271 ptr->branch_cost = ix86_branch_cost;
6272 ptr->tune_defaulted = ix86_tune_defaulted;
6273 ptr->arch_specified = ix86_arch_specified;
6274 ptr->x_ix86_isa_flags_explicit = opts->x_ix86_isa_flags_explicit;
6275 ptr->x_recip_mask_explicit = opts->x_recip_mask_explicit;
6276 ptr->x_ix86_arch_string = opts->x_ix86_arch_string;
6277 ptr->x_ix86_tune_string = opts->x_ix86_tune_string;
6278 ptr->x_ix86_cmodel = opts->x_ix86_cmodel;
6279 ptr->x_ix86_abi = opts->x_ix86_abi;
6280 ptr->x_ix86_asm_dialect = opts->x_ix86_asm_dialect;
6281 ptr->x_ix86_branch_cost = opts->x_ix86_branch_cost;
6282 ptr->x_ix86_dump_tunes = opts->x_ix86_dump_tunes;
6283 ptr->x_ix86_force_align_arg_pointer = opts->x_ix86_force_align_arg_pointer;
6284 ptr->x_ix86_force_drap = opts->x_ix86_force_drap;
6285 ptr->x_ix86_incoming_stack_boundary_arg = opts->x_ix86_incoming_stack_boundary_arg;
6286 ptr->x_ix86_pmode = opts->x_ix86_pmode;
6287 ptr->x_ix86_preferred_stack_boundary_arg = opts->x_ix86_preferred_stack_boundary_arg;
6288 ptr->x_ix86_recip_name = opts->x_ix86_recip_name;
6289 ptr->x_ix86_regparm = opts->x_ix86_regparm;
6290 ptr->x_ix86_section_threshold = opts->x_ix86_section_threshold;
6291 ptr->x_ix86_sse2avx = opts->x_ix86_sse2avx;
6292 ptr->x_ix86_stack_protector_guard = opts->x_ix86_stack_protector_guard;
6293 ptr->x_ix86_stringop_alg = opts->x_ix86_stringop_alg;
6294 ptr->x_ix86_tls_dialect = opts->x_ix86_tls_dialect;
6295 ptr->x_ix86_tune_ctrl_string = opts->x_ix86_tune_ctrl_string;
6296 ptr->x_ix86_tune_memcpy_strategy = opts->x_ix86_tune_memcpy_strategy;
6297 ptr->x_ix86_tune_memset_strategy = opts->x_ix86_tune_memset_strategy;
6298 ptr->x_ix86_tune_no_default = opts->x_ix86_tune_no_default;
6299 ptr->x_ix86_veclibabi_type = opts->x_ix86_veclibabi_type;
6300
6301 /* The fields are char but the variables are not; make sure the
6302 values fit in the fields. */
6303 gcc_assert (ptr->arch == ix86_arch);
6304 gcc_assert (ptr->schedule == ix86_schedule);
6305 gcc_assert (ptr->tune == ix86_tune);
6306 gcc_assert (ptr->branch_cost == ix86_branch_cost);
6307 }
6308
6309 /* Restore the current options */
6310
6311 static void
6312 ix86_function_specific_restore (struct gcc_options *opts,
6313 struct cl_target_option *ptr)
6314 {
6315 enum processor_type old_tune = ix86_tune;
6316 enum processor_type old_arch = ix86_arch;
6317 unsigned int ix86_arch_mask;
6318 int i;
6319
6320 /* We don't change -fPIC. */
6321 opts->x_flag_pic = flag_pic;
6322
6323 ix86_arch = (enum processor_type) ptr->arch;
6324 ix86_schedule = (enum attr_cpu) ptr->schedule;
6325 ix86_tune = (enum processor_type) ptr->tune;
6326 x86_prefetch_sse = ptr->prefetch_sse;
6327 opts->x_ix86_branch_cost = ptr->branch_cost;
6328 ix86_tune_defaulted = ptr->tune_defaulted;
6329 ix86_arch_specified = ptr->arch_specified;
6330 opts->x_ix86_isa_flags_explicit = ptr->x_ix86_isa_flags_explicit;
6331 opts->x_recip_mask_explicit = ptr->x_recip_mask_explicit;
6332 opts->x_ix86_arch_string = ptr->x_ix86_arch_string;
6333 opts->x_ix86_tune_string = ptr->x_ix86_tune_string;
6334 opts->x_ix86_cmodel = ptr->x_ix86_cmodel;
6335 opts->x_ix86_abi = ptr->x_ix86_abi;
6336 opts->x_ix86_asm_dialect = ptr->x_ix86_asm_dialect;
6337 opts->x_ix86_branch_cost = ptr->x_ix86_branch_cost;
6338 opts->x_ix86_dump_tunes = ptr->x_ix86_dump_tunes;
6339 opts->x_ix86_force_align_arg_pointer = ptr->x_ix86_force_align_arg_pointer;
6340 opts->x_ix86_force_drap = ptr->x_ix86_force_drap;
6341 opts->x_ix86_incoming_stack_boundary_arg = ptr->x_ix86_incoming_stack_boundary_arg;
6342 opts->x_ix86_pmode = ptr->x_ix86_pmode;
6343 opts->x_ix86_preferred_stack_boundary_arg = ptr->x_ix86_preferred_stack_boundary_arg;
6344 opts->x_ix86_recip_name = ptr->x_ix86_recip_name;
6345 opts->x_ix86_regparm = ptr->x_ix86_regparm;
6346 opts->x_ix86_section_threshold = ptr->x_ix86_section_threshold;
6347 opts->x_ix86_sse2avx = ptr->x_ix86_sse2avx;
6348 opts->x_ix86_stack_protector_guard = ptr->x_ix86_stack_protector_guard;
6349 opts->x_ix86_stringop_alg = ptr->x_ix86_stringop_alg;
6350 opts->x_ix86_tls_dialect = ptr->x_ix86_tls_dialect;
6351 opts->x_ix86_tune_ctrl_string = ptr->x_ix86_tune_ctrl_string;
6352 opts->x_ix86_tune_memcpy_strategy = ptr->x_ix86_tune_memcpy_strategy;
6353 opts->x_ix86_tune_memset_strategy = ptr->x_ix86_tune_memset_strategy;
6354 opts->x_ix86_tune_no_default = ptr->x_ix86_tune_no_default;
6355 opts->x_ix86_veclibabi_type = ptr->x_ix86_veclibabi_type;
6356 ix86_tune_cost = processor_target_table[ix86_tune].cost;
6357 /* TODO: ix86_cost should be chosen at instruction or function granularity
6358 so for cold code we use size_cost even in !optimize_size compilation. */
6359 if (opts->x_optimize_size)
6360 ix86_cost = &ix86_size_cost;
6361 else
6362 ix86_cost = ix86_tune_cost;
6363
6364 /* Recreate the arch feature tests if the arch changed */
6365 if (old_arch != ix86_arch)
6366 {
6367 ix86_arch_mask = 1u << ix86_arch;
6368 for (i = 0; i < X86_ARCH_LAST; ++i)
6369 ix86_arch_features[i]
6370 = !!(initial_ix86_arch_features[i] & ix86_arch_mask);
6371 }
6372
6373 /* Recreate the tune optimization tests */
6374 if (old_tune != ix86_tune)
6375 set_ix86_tune_features (ix86_tune, false);
6376 }
6377
6378 /* Adjust target options after streaming them in. This is mainly about
6379 reconciling them with global options. */
6380
6381 static void
6382 ix86_function_specific_post_stream_in (struct cl_target_option *ptr)
6383 {
6384 /* flag_pic is a global option, but ix86_cmodel is target saved option
6385 partly computed from flag_pic. If flag_pic is on, adjust x_ix86_cmodel
6386 for PIC, or error out. */
6387 if (flag_pic)
6388 switch (ptr->x_ix86_cmodel)
6389 {
6390 case CM_SMALL:
6391 ptr->x_ix86_cmodel = CM_SMALL_PIC;
6392 break;
6393
6394 case CM_MEDIUM:
6395 ptr->x_ix86_cmodel = CM_MEDIUM_PIC;
6396 break;
6397
6398 case CM_LARGE:
6399 ptr->x_ix86_cmodel = CM_LARGE_PIC;
6400 break;
6401
6402 case CM_KERNEL:
6403 error ("code model %s does not support PIC mode", "kernel");
6404 break;
6405
6406 default:
6407 break;
6408 }
6409 else
6410 switch (ptr->x_ix86_cmodel)
6411 {
6412 case CM_SMALL_PIC:
6413 ptr->x_ix86_cmodel = CM_SMALL;
6414 break;
6415
6416 case CM_MEDIUM_PIC:
6417 ptr->x_ix86_cmodel = CM_MEDIUM;
6418 break;
6419
6420 case CM_LARGE_PIC:
6421 ptr->x_ix86_cmodel = CM_LARGE;
6422 break;
6423
6424 default:
6425 break;
6426 }
6427 }
6428
6429 /* Print the current options */
6430
6431 static void
6432 ix86_function_specific_print (FILE *file, int indent,
6433 struct cl_target_option *ptr)
6434 {
6435 char *target_string
6436 = ix86_target_string (ptr->x_ix86_isa_flags, ptr->x_target_flags,
6437 ptr->x_ix86_target_flags, NULL, NULL,
6438 ptr->x_ix86_fpmath, false);
6439
6440 gcc_assert (ptr->arch < PROCESSOR_max);
6441 fprintf (file, "%*sarch = %d (%s)\n",
6442 indent, "",
6443 ptr->arch, processor_target_table[ptr->arch].name);
6444
6445 gcc_assert (ptr->tune < PROCESSOR_max);
6446 fprintf (file, "%*stune = %d (%s)\n",
6447 indent, "",
6448 ptr->tune, processor_target_table[ptr->tune].name);
6449
6450 fprintf (file, "%*sbranch_cost = %d\n", indent, "", ptr->branch_cost);
6451
6452 if (target_string)
6453 {
6454 fprintf (file, "%*s%s\n", indent, "", target_string);
6455 free (target_string);
6456 }
6457 }
6458
6459 \f
6460 /* Inner function to process the attribute((target(...))); takes an argument and
6461 sets the current options from the argument. If we have a list, recursively go
6462 over the list. */
6463
6464 static bool
6465 ix86_valid_target_attribute_inner_p (tree args, char *p_strings[],
6466 struct gcc_options *opts,
6467 struct gcc_options *opts_set,
6468 struct gcc_options *enum_opts_set)
6469 {
6470 char *next_optstr;
6471 bool ret = true;
6472
6473 #define IX86_ATTR_ISA(S,O) { S, sizeof (S)-1, ix86_opt_isa, O, 0 }
6474 #define IX86_ATTR_STR(S,O) { S, sizeof (S)-1, ix86_opt_str, O, 0 }
6475 #define IX86_ATTR_ENUM(S,O) { S, sizeof (S)-1, ix86_opt_enum, O, 0 }
6476 #define IX86_ATTR_YES(S,O,M) { S, sizeof (S)-1, ix86_opt_yes, O, M }
6477 #define IX86_ATTR_NO(S,O,M) { S, sizeof (S)-1, ix86_opt_no, O, M }
6478
6479 enum ix86_opt_type
6480 {
6481 ix86_opt_unknown,
6482 ix86_opt_yes,
6483 ix86_opt_no,
6484 ix86_opt_str,
6485 ix86_opt_enum,
6486 ix86_opt_isa
6487 };
6488
6489 static const struct
6490 {
6491 const char *string;
6492 size_t len;
6493 enum ix86_opt_type type;
6494 int opt;
6495 int mask;
6496 } attrs[] = {
6497 /* isa options */
6498 IX86_ATTR_ISA ("3dnow", OPT_m3dnow),
6499 IX86_ATTR_ISA ("abm", OPT_mabm),
6500 IX86_ATTR_ISA ("bmi", OPT_mbmi),
6501 IX86_ATTR_ISA ("bmi2", OPT_mbmi2),
6502 IX86_ATTR_ISA ("lzcnt", OPT_mlzcnt),
6503 IX86_ATTR_ISA ("tbm", OPT_mtbm),
6504 IX86_ATTR_ISA ("aes", OPT_maes),
6505 IX86_ATTR_ISA ("sha", OPT_msha),
6506 IX86_ATTR_ISA ("avx", OPT_mavx),
6507 IX86_ATTR_ISA ("avx2", OPT_mavx2),
6508 IX86_ATTR_ISA ("avx512f", OPT_mavx512f),
6509 IX86_ATTR_ISA ("avx512pf", OPT_mavx512pf),
6510 IX86_ATTR_ISA ("avx512er", OPT_mavx512er),
6511 IX86_ATTR_ISA ("avx512cd", OPT_mavx512cd),
6512 IX86_ATTR_ISA ("avx512dq", OPT_mavx512dq),
6513 IX86_ATTR_ISA ("avx512bw", OPT_mavx512bw),
6514 IX86_ATTR_ISA ("avx512vl", OPT_mavx512vl),
6515 IX86_ATTR_ISA ("mmx", OPT_mmmx),
6516 IX86_ATTR_ISA ("pclmul", OPT_mpclmul),
6517 IX86_ATTR_ISA ("popcnt", OPT_mpopcnt),
6518 IX86_ATTR_ISA ("movbe", OPT_mmovbe),
6519 IX86_ATTR_ISA ("crc32", OPT_mcrc32),
6520 IX86_ATTR_ISA ("sse", OPT_msse),
6521 IX86_ATTR_ISA ("sse2", OPT_msse2),
6522 IX86_ATTR_ISA ("sse3", OPT_msse3),
6523 IX86_ATTR_ISA ("sse4", OPT_msse4),
6524 IX86_ATTR_ISA ("sse4.1", OPT_msse4_1),
6525 IX86_ATTR_ISA ("sse4.2", OPT_msse4_2),
6526 IX86_ATTR_ISA ("sse4a", OPT_msse4a),
6527 IX86_ATTR_ISA ("ssse3", OPT_mssse3),
6528 IX86_ATTR_ISA ("fma4", OPT_mfma4),
6529 IX86_ATTR_ISA ("fma", OPT_mfma),
6530 IX86_ATTR_ISA ("xop", OPT_mxop),
6531 IX86_ATTR_ISA ("lwp", OPT_mlwp),
6532 IX86_ATTR_ISA ("fsgsbase", OPT_mfsgsbase),
6533 IX86_ATTR_ISA ("rdrnd", OPT_mrdrnd),
6534 IX86_ATTR_ISA ("f16c", OPT_mf16c),
6535 IX86_ATTR_ISA ("rtm", OPT_mrtm),
6536 IX86_ATTR_ISA ("hle", OPT_mhle),
6537 IX86_ATTR_ISA ("prfchw", OPT_mprfchw),
6538 IX86_ATTR_ISA ("rdseed", OPT_mrdseed),
6539 IX86_ATTR_ISA ("adx", OPT_madx),
6540 IX86_ATTR_ISA ("fxsr", OPT_mfxsr),
6541 IX86_ATTR_ISA ("xsave", OPT_mxsave),
6542 IX86_ATTR_ISA ("xsaveopt", OPT_mxsaveopt),
6543 IX86_ATTR_ISA ("prefetchwt1", OPT_mprefetchwt1),
6544 IX86_ATTR_ISA ("clflushopt", OPT_mclflushopt),
6545 IX86_ATTR_ISA ("xsavec", OPT_mxsavec),
6546 IX86_ATTR_ISA ("xsaves", OPT_mxsaves),
6547 IX86_ATTR_ISA ("avx512vbmi", OPT_mavx512vbmi),
6548 IX86_ATTR_ISA ("avx512ifma", OPT_mavx512ifma),
6549 IX86_ATTR_ISA ("clwb", OPT_mclwb),
6550 IX86_ATTR_ISA ("mwaitx", OPT_mmwaitx),
6551 IX86_ATTR_ISA ("clzero", OPT_mclzero),
6552 IX86_ATTR_ISA ("pku", OPT_mpku),
6553
6554 /* enum options */
6555 IX86_ATTR_ENUM ("fpmath=", OPT_mfpmath_),
6556
6557 /* string options */
6558 IX86_ATTR_STR ("arch=", IX86_FUNCTION_SPECIFIC_ARCH),
6559 IX86_ATTR_STR ("tune=", IX86_FUNCTION_SPECIFIC_TUNE),
6560
6561 /* flag options */
6562 IX86_ATTR_YES ("cld",
6563 OPT_mcld,
6564 MASK_CLD),
6565
6566 IX86_ATTR_NO ("fancy-math-387",
6567 OPT_mfancy_math_387,
6568 MASK_NO_FANCY_MATH_387),
6569
6570 IX86_ATTR_YES ("ieee-fp",
6571 OPT_mieee_fp,
6572 MASK_IEEE_FP),
6573
6574 IX86_ATTR_YES ("inline-all-stringops",
6575 OPT_minline_all_stringops,
6576 MASK_INLINE_ALL_STRINGOPS),
6577
6578 IX86_ATTR_YES ("inline-stringops-dynamically",
6579 OPT_minline_stringops_dynamically,
6580 MASK_INLINE_STRINGOPS_DYNAMICALLY),
6581
6582 IX86_ATTR_NO ("align-stringops",
6583 OPT_mno_align_stringops,
6584 MASK_NO_ALIGN_STRINGOPS),
6585
6586 IX86_ATTR_YES ("recip",
6587 OPT_mrecip,
6588 MASK_RECIP),
6589
6590 };
6591
6592 /* If this is a list, recurse to get the options. */
6593 if (TREE_CODE (args) == TREE_LIST)
6594 {
6595 bool ret = true;
6596
6597 for (; args; args = TREE_CHAIN (args))
6598 if (TREE_VALUE (args)
6599 && !ix86_valid_target_attribute_inner_p (TREE_VALUE (args),
6600 p_strings, opts, opts_set,
6601 enum_opts_set))
6602 ret = false;
6603
6604 return ret;
6605 }
6606
6607 else if (TREE_CODE (args) != STRING_CST)
6608 {
6609 error ("attribute %<target%> argument not a string");
6610 return false;
6611 }
6612
6613 /* Handle multiple arguments separated by commas. */
6614 next_optstr = ASTRDUP (TREE_STRING_POINTER (args));
6615
6616 while (next_optstr && *next_optstr != '\0')
6617 {
6618 char *p = next_optstr;
6619 char *orig_p = p;
6620 char *comma = strchr (next_optstr, ',');
6621 const char *opt_string;
6622 size_t len, opt_len;
6623 int opt;
6624 bool opt_set_p;
6625 char ch;
6626 unsigned i;
6627 enum ix86_opt_type type = ix86_opt_unknown;
6628 int mask = 0;
6629
6630 if (comma)
6631 {
6632 *comma = '\0';
6633 len = comma - next_optstr;
6634 next_optstr = comma + 1;
6635 }
6636 else
6637 {
6638 len = strlen (p);
6639 next_optstr = NULL;
6640 }
6641
6642 /* Recognize no-xxx. */
6643 if (len > 3 && p[0] == 'n' && p[1] == 'o' && p[2] == '-')
6644 {
6645 opt_set_p = false;
6646 p += 3;
6647 len -= 3;
6648 }
6649 else
6650 opt_set_p = true;
6651
6652 /* Find the option. */
6653 ch = *p;
6654 opt = N_OPTS;
6655 for (i = 0; i < ARRAY_SIZE (attrs); i++)
6656 {
6657 type = attrs[i].type;
6658 opt_len = attrs[i].len;
6659 if (ch == attrs[i].string[0]
6660 && ((type != ix86_opt_str && type != ix86_opt_enum)
6661 ? len == opt_len
6662 : len > opt_len)
6663 && memcmp (p, attrs[i].string, opt_len) == 0)
6664 {
6665 opt = attrs[i].opt;
6666 mask = attrs[i].mask;
6667 opt_string = attrs[i].string;
6668 break;
6669 }
6670 }
6671
6672 /* Process the option. */
6673 if (opt == N_OPTS)
6674 {
6675 error ("attribute(target(\"%s\")) is unknown", orig_p);
6676 ret = false;
6677 }
6678
6679 else if (type == ix86_opt_isa)
6680 {
6681 struct cl_decoded_option decoded;
6682
6683 generate_option (opt, NULL, opt_set_p, CL_TARGET, &decoded);
6684 ix86_handle_option (opts, opts_set,
6685 &decoded, input_location);
6686 }
6687
6688 else if (type == ix86_opt_yes || type == ix86_opt_no)
6689 {
6690 if (type == ix86_opt_no)
6691 opt_set_p = !opt_set_p;
6692
6693 if (opt_set_p)
6694 opts->x_target_flags |= mask;
6695 else
6696 opts->x_target_flags &= ~mask;
6697 }
6698
6699 else if (type == ix86_opt_str)
6700 {
6701 if (p_strings[opt])
6702 {
6703 error ("option(\"%s\") was already specified", opt_string);
6704 ret = false;
6705 }
6706 else
6707 p_strings[opt] = xstrdup (p + opt_len);
6708 }
6709
6710 else if (type == ix86_opt_enum)
6711 {
6712 bool arg_ok;
6713 int value;
6714
6715 arg_ok = opt_enum_arg_to_value (opt, p + opt_len, &value, CL_TARGET);
6716 if (arg_ok)
6717 set_option (opts, enum_opts_set, opt, value,
6718 p + opt_len, DK_UNSPECIFIED, input_location,
6719 global_dc);
6720 else
6721 {
6722 error ("attribute(target(\"%s\")) is unknown", orig_p);
6723 ret = false;
6724 }
6725 }
6726
6727 else
6728 gcc_unreachable ();
6729 }
6730
6731 return ret;
6732 }
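
/* Illustrative sketch of the strings handled above (the declaration below is
   hypothetical, not part of GCC): each comma-separated token of a target
   attribute is matched against the attrs[] table, a leading "no-" negates an
   option, "arch="/"tune=" are string options and "fpmath=" is an enum option:

     __attribute__((target("avx2,no-sse4a,arch=haswell,fpmath=sse")))
     static float dot (const float *a, const float *b, int n);
*/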
6733
6734 /* Release allocated strings. */
6735 static void
6736 release_options_strings (char **option_strings)
6737 {
6738 /* Free up memory allocated to hold the strings */
6739 for (unsigned i = 0; i < IX86_FUNCTION_SPECIFIC_MAX; i++)
6740 free (option_strings[i]);
6741 }
6742
6743 /* Return a TARGET_OPTION_NODE tree of the target options listed or NULL. */
6744
6745 tree
6746 ix86_valid_target_attribute_tree (tree args,
6747 struct gcc_options *opts,
6748 struct gcc_options *opts_set)
6749 {
6750 const char *orig_arch_string = opts->x_ix86_arch_string;
6751 const char *orig_tune_string = opts->x_ix86_tune_string;
6752 enum fpmath_unit orig_fpmath_set = opts_set->x_ix86_fpmath;
6753 int orig_tune_defaulted = ix86_tune_defaulted;
6754 int orig_arch_specified = ix86_arch_specified;
6755 char *option_strings[IX86_FUNCTION_SPECIFIC_MAX] = { NULL, NULL };
6756 tree t = NULL_TREE;
6757 struct cl_target_option *def
6758 = TREE_TARGET_OPTION (target_option_default_node);
6759 struct gcc_options enum_opts_set;
6760
6761 memset (&enum_opts_set, 0, sizeof (enum_opts_set));
6762
6763 /* Process each of the options on the chain. */
6764 if (! ix86_valid_target_attribute_inner_p (args, option_strings, opts,
6765 opts_set, &enum_opts_set))
6766 return error_mark_node;
6767
6768 /* If the changed options are different from the default, rerun
6769 ix86_option_override_internal, and then save the options away.
6770 The string options are attribute options, and will be undone
6771 when we copy the save structure. */
6772 if (opts->x_ix86_isa_flags != def->x_ix86_isa_flags
6773 || opts->x_target_flags != def->x_target_flags
6774 || option_strings[IX86_FUNCTION_SPECIFIC_ARCH]
6775 || option_strings[IX86_FUNCTION_SPECIFIC_TUNE]
6776 || enum_opts_set.x_ix86_fpmath)
6777 {
6778 /* If we are using the default tune= or arch=, undo the string assigned,
6779 and use the default. */
6780 if (option_strings[IX86_FUNCTION_SPECIFIC_ARCH])
6781 {
6782 opts->x_ix86_arch_string
6783 = ggc_strdup (option_strings[IX86_FUNCTION_SPECIFIC_ARCH]);
6784
6785 /* If arch= is set, clear all bits in x_ix86_isa_flags,
6786 except for ISA_64BIT, ABI_64, ABI_X32, and CODE16. */
6787 opts->x_ix86_isa_flags &= (OPTION_MASK_ISA_64BIT
6788 | OPTION_MASK_ABI_64
6789 | OPTION_MASK_ABI_X32
6790 | OPTION_MASK_CODE16);
6791
6792 }
6793 else if (!orig_arch_specified)
6794 opts->x_ix86_arch_string = NULL;
6795
6796 if (option_strings[IX86_FUNCTION_SPECIFIC_TUNE])
6797 opts->x_ix86_tune_string
6798 = ggc_strdup (option_strings[IX86_FUNCTION_SPECIFIC_TUNE]);
6799 else if (orig_tune_defaulted)
6800 opts->x_ix86_tune_string = NULL;
6801
6802 /* If fpmath= is not set, and we now have sse2 on 32-bit, use it. */
6803 if (enum_opts_set.x_ix86_fpmath)
6804 opts_set->x_ix86_fpmath = (enum fpmath_unit) 1;
6805 else if (!TARGET_64BIT_P (opts->x_ix86_isa_flags)
6806 && TARGET_SSE_P (opts->x_ix86_isa_flags))
6807 {
6808 if (TARGET_80387_P (opts->x_target_flags))
6809 opts->x_ix86_fpmath = (enum fpmath_unit) (FPMATH_SSE
6810 | FPMATH_387);
6811 else
6812 opts->x_ix86_fpmath = (enum fpmath_unit) FPMATH_SSE;
6813 opts_set->x_ix86_fpmath = (enum fpmath_unit) 1;
6814 }
6815
6816 /* Do any overrides, such as arch=xxx, or tune=xxx support. */
6817 bool r = ix86_option_override_internal (false, opts, opts_set);
6818 if (!r)
6819 {
6820 release_options_strings (option_strings);
6821 return error_mark_node;
6822 }
6823
6824 /* Add any builtin functions with the new isa if any. */
6825 ix86_add_new_builtins (opts->x_ix86_isa_flags);
6826
6827 /* Save the current options unless we are validating options for
6828 #pragma. */
6829 t = build_target_option_node (opts);
6830
6831 opts->x_ix86_arch_string = orig_arch_string;
6832 opts->x_ix86_tune_string = orig_tune_string;
6833 opts_set->x_ix86_fpmath = orig_fpmath_set;
6834
6835 release_options_strings (option_strings);
6836 }
6837
6838 return t;
6839 }
6840
6841 /* Hook to validate attribute((target("string"))). */
6842
6843 static bool
6844 ix86_valid_target_attribute_p (tree fndecl,
6845 tree ARG_UNUSED (name),
6846 tree args,
6847 int ARG_UNUSED (flags))
6848 {
6849 struct gcc_options func_options;
6850 tree new_target, new_optimize;
6851 bool ret = true;
6852
6853 /* attribute((target("default"))) does nothing, beyond
6854 affecting multi-versioning. */
6855 if (TREE_VALUE (args)
6856 && TREE_CODE (TREE_VALUE (args)) == STRING_CST
6857 && TREE_CHAIN (args) == NULL_TREE
6858 && strcmp (TREE_STRING_POINTER (TREE_VALUE (args)), "default") == 0)
6859 return true;
6860
6861 tree old_optimize = build_optimization_node (&global_options);
6862
6863 /* Get the optimization options of the current function. */
6864 tree func_optimize = DECL_FUNCTION_SPECIFIC_OPTIMIZATION (fndecl);
6865
6866 if (!func_optimize)
6867 func_optimize = old_optimize;
6868
6869 /* Init func_options. */
6870 memset (&func_options, 0, sizeof (func_options));
6871 init_options_struct (&func_options, NULL);
6872 lang_hooks.init_options_struct (&func_options);
6873
6874 cl_optimization_restore (&func_options,
6875 TREE_OPTIMIZATION (func_optimize));
6876
6877 /* Initialize func_options to the default before its target options can
6878 be set. */
6879 cl_target_option_restore (&func_options,
6880 TREE_TARGET_OPTION (target_option_default_node));
6881
6882 new_target = ix86_valid_target_attribute_tree (args, &func_options,
6883 &global_options_set);
6884
6885 new_optimize = build_optimization_node (&func_options);
6886
6887 if (new_target == error_mark_node)
6888 ret = false;
6889
6890 else if (fndecl && new_target)
6891 {
6892 DECL_FUNCTION_SPECIFIC_TARGET (fndecl) = new_target;
6893
6894 if (old_optimize != new_optimize)
6895 DECL_FUNCTION_SPECIFIC_OPTIMIZATION (fndecl) = new_optimize;
6896 }
6897
6898 finalize_options_struct (&func_options);
6899
6900 return ret;
6901 }
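
/* Minimal usage sketch (hypothetical C++ user code): a lone target("default")
   only matters for function multi-versioning and is accepted above without
   building a target option node, e.g.

     __attribute__((target("default"))) int foo (void) { return 0; }
     __attribute__((target("avx2")))    int foo (void) { return 1; }

   Only the "avx2" version reaches ix86_valid_target_attribute_tree.  */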
6902
6903 \f
6904 /* Hook to determine if one function can safely inline another. */
6905
6906 static bool
6907 ix86_can_inline_p (tree caller, tree callee)
6908 {
6909 bool ret = false;
6910 tree caller_tree = DECL_FUNCTION_SPECIFIC_TARGET (caller);
6911 tree callee_tree = DECL_FUNCTION_SPECIFIC_TARGET (callee);
6912
6913 /* If callee has no option attributes, then it is ok to inline. */
6914 if (!callee_tree)
6915 ret = true;
6916
6917 /* If caller has no option attributes, but callee does then it is not ok to
6918 inline. */
6919 else if (!caller_tree)
6920 ret = false;
6921
6922 else
6923 {
6924 struct cl_target_option *caller_opts = TREE_TARGET_OPTION (caller_tree);
6925 struct cl_target_option *callee_opts = TREE_TARGET_OPTION (callee_tree);
6926
6927 /* Callee's isa options should be a subset of the caller's, i.e. an SSE4 function
6928 can inline an SSE2 function but an SSE2 function can't inline an SSE4
6929 function. */
6930 if ((caller_opts->x_ix86_isa_flags & callee_opts->x_ix86_isa_flags)
6931 != callee_opts->x_ix86_isa_flags)
6932 ret = false;
6933
6934 /* See if we have the same non-isa options. */
6935 else if (caller_opts->x_target_flags != callee_opts->x_target_flags)
6936 ret = false;
6937
6938 /* See if arch, tune, etc. are the same. */
6939 else if (caller_opts->arch != callee_opts->arch)
6940 ret = false;
6941
6942 else if (caller_opts->tune != callee_opts->tune)
6943 ret = false;
6944
6945 else if (caller_opts->x_ix86_fpmath != callee_opts->x_ix86_fpmath)
6946 ret = false;
6947
6948 else if (caller_opts->branch_cost != callee_opts->branch_cost)
6949 ret = false;
6950
6951 else
6952 ret = true;
6953 }
6954
6955 return ret;
6956 }
6957
6958 \f
6959 /* Remember the last target of ix86_set_current_function. */
6960 static GTY(()) tree ix86_previous_fndecl;
6961
6962 /* Set targets globals to the default (or current #pragma GCC target
6963 if active). Invalidate ix86_previous_fndecl cache. */
6964
6965 void
6966 ix86_reset_previous_fndecl (void)
6967 {
6968 tree new_tree = target_option_current_node;
6969 cl_target_option_restore (&global_options, TREE_TARGET_OPTION (new_tree));
6970 if (TREE_TARGET_GLOBALS (new_tree))
6971 restore_target_globals (TREE_TARGET_GLOBALS (new_tree));
6972 else if (new_tree == target_option_default_node)
6973 restore_target_globals (&default_target_globals);
6974 else
6975 TREE_TARGET_GLOBALS (new_tree) = save_target_globals_default_opts ();
6976 ix86_previous_fndecl = NULL_TREE;
6977 }
6978
6979 /* Set the func_type field from the function FNDECL. */
6980
6981 static void
6982 ix86_set_func_type (tree fndecl)
6983 {
6984 if (cfun->machine->func_type == TYPE_UNKNOWN)
6985 {
6986 if (lookup_attribute ("interrupt",
6987 TYPE_ATTRIBUTES (TREE_TYPE (fndecl))))
6988 {
6989 int nargs = 0;
6990 for (tree arg = DECL_ARGUMENTS (fndecl);
6991 arg;
6992 arg = TREE_CHAIN (arg))
6993 nargs++;
6994 cfun->machine->no_caller_saved_registers = true;
6995 cfun->machine->func_type
6996 = nargs == 2 ? TYPE_EXCEPTION : TYPE_INTERRUPT;
6997
6998 ix86_optimize_mode_switching[X86_DIRFLAG] = 1;
6999
7000 /* Only dwarf2out.c can handle -WORD(AP) as a pointer argument. */
7001 if (write_symbols != NO_DEBUG && write_symbols != DWARF2_DEBUG)
7002 sorry ("Only DWARF debug format is supported for interrupt "
7003 "service routine.");
7004 }
7005 else
7006 {
7007 cfun->machine->func_type = TYPE_NORMAL;
7008 if (lookup_attribute ("no_caller_saved_registers",
7009 TYPE_ATTRIBUTES (TREE_TYPE (fndecl))))
7010 cfun->machine->no_caller_saved_registers = true;
7011 }
7012 }
7013 }
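
/* Usage sketch for the nargs test above (hypothetical handlers; the
   word-sized error-code type is illustrative): an interrupt handler takes a
   single frame pointer, while an exception handler additionally receives the
   pushed error code and is therefore classified as TYPE_EXCEPTION:

     struct interrupt_frame;

     __attribute__((interrupt))
     void isr_handler (struct interrupt_frame *frame) { }

     __attribute__((interrupt))
     void fault_handler (struct interrupt_frame *frame, unsigned long error) { }
*/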
7014
7015 /* Establish appropriate back-end context for processing the function
7016 FNDECL. The argument might be NULL to indicate processing at top
7017 level, outside of any function scope. */
7018 static void
7019 ix86_set_current_function (tree fndecl)
7020 {
7021 /* Only change the context if the function changes. This hook is called
7022 several times in the course of compiling a function, and we don't want to
7023 slow things down too much or call target_reinit when it isn't safe. */
7024 if (fndecl == ix86_previous_fndecl)
7025 {
7026 /* There may be 2 function bodies for the same function FNDECL,
7027 one is extern inline and one isn't. Call ix86_set_func_type
7028 to set the func_type field. */
7029 if (fndecl != NULL_TREE)
7030 ix86_set_func_type (fndecl);
7031 return;
7032 }
7033
7034 tree old_tree;
7035 if (ix86_previous_fndecl == NULL_TREE)
7036 old_tree = target_option_current_node;
7037 else if (DECL_FUNCTION_SPECIFIC_TARGET (ix86_previous_fndecl))
7038 old_tree = DECL_FUNCTION_SPECIFIC_TARGET (ix86_previous_fndecl);
7039 else
7040 old_tree = target_option_default_node;
7041
7042 if (fndecl == NULL_TREE)
7043 {
7044 if (old_tree != target_option_current_node)
7045 ix86_reset_previous_fndecl ();
7046 return;
7047 }
7048
7049 ix86_set_func_type (fndecl);
7050
7051 tree new_tree = DECL_FUNCTION_SPECIFIC_TARGET (fndecl);
7052 if (new_tree == NULL_TREE)
7053 new_tree = target_option_default_node;
7054
7055 if (old_tree != new_tree)
7056 {
7057 cl_target_option_restore (&global_options, TREE_TARGET_OPTION (new_tree));
7058 if (TREE_TARGET_GLOBALS (new_tree))
7059 restore_target_globals (TREE_TARGET_GLOBALS (new_tree));
7060 else if (new_tree == target_option_default_node)
7061 restore_target_globals (&default_target_globals);
7062 else
7063 TREE_TARGET_GLOBALS (new_tree) = save_target_globals_default_opts ();
7064 }
7065 ix86_previous_fndecl = fndecl;
7066
7067 static bool prev_no_caller_saved_registers;
7068
7069 /* The 64-bit MS and SYSV ABIs have different sets of call-used registers.
7070 Avoid expensive re-initialization of init_regs each time we switch
7071 function context. */
7072 if (TARGET_64BIT
7073 && (call_used_regs[SI_REG]
7074 == (cfun->machine->call_abi == MS_ABI)))
7075 reinit_regs ();
7076 /* Need to re-initialize init_regs if caller-saved registers are
7077 changed. */
7078 else if (prev_no_caller_saved_registers
7079 != cfun->machine->no_caller_saved_registers)
7080 reinit_regs ();
7081
7082 if (cfun->machine->func_type != TYPE_NORMAL
7083 || cfun->machine->no_caller_saved_registers)
7084 {
7085 /* Don't allow MPX, SSE, MMX nor x87 instructions since they
7086 may change processor state. */
7087 const char *isa;
7088 if (TARGET_MPX)
7089 isa = "MPX";
7090 else if (TARGET_SSE)
7091 isa = "SSE";
7092 else if (TARGET_MMX)
7093 isa = "MMX/3Dnow";
7094 else if (TARGET_80387)
7095 isa = "80387";
7096 else
7097 isa = NULL;
7098 if (isa != NULL)
7099 {
7100 if (cfun->machine->func_type != TYPE_NORMAL)
7101 sorry ("%s instructions aren't allowed in %s service routine",
7102 isa, (cfun->machine->func_type == TYPE_EXCEPTION
7103 ? "exception" : "interrupt"));
7104 else
7105 sorry ("%s instructions aren't allowed in function with "
7106 "no_caller_saved_registers attribute", isa);
7107 /* Don't issue the same error twice. */
7108 cfun->machine->func_type = TYPE_NORMAL;
7109 cfun->machine->no_caller_saved_registers = false;
7110 }
7111 }
7112
7113 prev_no_caller_saved_registers
7114 = cfun->machine->no_caller_saved_registers;
7115 }
7116
7117 \f
7118 /* Return true if this goes in large data/bss. */
7119
7120 static bool
7121 ix86_in_large_data_p (tree exp)
7122 {
7123 if (ix86_cmodel != CM_MEDIUM && ix86_cmodel != CM_MEDIUM_PIC)
7124 return false;
7125
7126 if (exp == NULL_TREE)
7127 return false;
7128
7129 /* Functions are never large data. */
7130 if (TREE_CODE (exp) == FUNCTION_DECL)
7131 return false;
7132
7133 /* Automatic variables are never large data. */
7134 if (TREE_CODE (exp) == VAR_DECL && !is_global_var (exp))
7135 return false;
7136
7137 if (TREE_CODE (exp) == VAR_DECL && DECL_SECTION_NAME (exp))
7138 {
7139 const char *section = DECL_SECTION_NAME (exp);
7140 if (strcmp (section, ".ldata") == 0
7141 || strcmp (section, ".lbss") == 0)
7142 return true;
7143 return false;
7144 }
7145 else
7146 {
7147 HOST_WIDE_INT size = int_size_in_bytes (TREE_TYPE (exp));
7148
7149 /* If this is an incomplete type with size 0, then we can't put it
7150 in data because it might be too big when completed. Also,
7151 int_size_in_bytes returns -1 if the size can vary or is larger than
7152 an integer, in which case it is also safer to assume that it goes in
7153 large data. */
7154 if (size <= 0 || size > ix86_section_threshold)
7155 return true;
7156 }
7157
7158 return false;
7159 }
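
/* Sketch of the size check above: with -mcmodel=medium and a large-data
   threshold of 65536 (the -mlarge-data-threshold default), a definition such
   as the hypothetical

     static char big_buffer[1 << 20];

   exceeds ix86_section_threshold and is treated as large data, while small
   scalars keep going to the regular .data/.bss sections.  */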
7160
7161 /* i386-specific section flag to mark large sections. */
7162 #define SECTION_LARGE SECTION_MACH_DEP
7163
7164 /* Switch to the appropriate section for output of DECL.
7165 DECL is either a `VAR_DECL' node or a constant of some sort.
7166 RELOC indicates whether forming the initial value of DECL requires
7167 link-time relocations. */
7168
7169 ATTRIBUTE_UNUSED static section *
7170 x86_64_elf_select_section (tree decl, int reloc,
7171 unsigned HOST_WIDE_INT align)
7172 {
7173 if (ix86_in_large_data_p (decl))
7174 {
7175 const char *sname = NULL;
7176 unsigned int flags = SECTION_WRITE | SECTION_LARGE;
7177 switch (categorize_decl_for_section (decl, reloc))
7178 {
7179 case SECCAT_DATA:
7180 sname = ".ldata";
7181 break;
7182 case SECCAT_DATA_REL:
7183 sname = ".ldata.rel";
7184 break;
7185 case SECCAT_DATA_REL_LOCAL:
7186 sname = ".ldata.rel.local";
7187 break;
7188 case SECCAT_DATA_REL_RO:
7189 sname = ".ldata.rel.ro";
7190 break;
7191 case SECCAT_DATA_REL_RO_LOCAL:
7192 sname = ".ldata.rel.ro.local";
7193 break;
7194 case SECCAT_BSS:
7195 sname = ".lbss";
7196 flags |= SECTION_BSS;
7197 break;
7198 case SECCAT_RODATA:
7199 case SECCAT_RODATA_MERGE_STR:
7200 case SECCAT_RODATA_MERGE_STR_INIT:
7201 case SECCAT_RODATA_MERGE_CONST:
7202 sname = ".lrodata";
7203 flags &= ~SECTION_WRITE;
7204 break;
7205 case SECCAT_SRODATA:
7206 case SECCAT_SDATA:
7207 case SECCAT_SBSS:
7208 gcc_unreachable ();
7209 case SECCAT_TEXT:
7210 case SECCAT_TDATA:
7211 case SECCAT_TBSS:
7212 /* We don't split these for medium model. Place them into
7213 default sections and hope for the best. */
7214 break;
7215 }
7216 if (sname)
7217 {
7218 /* We might get called with string constants, but get_named_section
7219 doesn't like them as they are not DECLs. Also, we need to set
7220 flags in that case. */
7221 if (!DECL_P (decl))
7222 return get_section (sname, flags, NULL);
7223 return get_named_section (decl, sname, reloc);
7224 }
7225 }
7226 return default_elf_select_section (decl, reloc, align);
7227 }
7228
7229 /* Select a set of attributes for section NAME based on the properties
7230 of DECL and whether or not RELOC indicates that DECL's initializer
7231 might contain runtime relocations. */
7232
7233 static unsigned int ATTRIBUTE_UNUSED
7234 x86_64_elf_section_type_flags (tree decl, const char *name, int reloc)
7235 {
7236 unsigned int flags = default_section_type_flags (decl, name, reloc);
7237
7238 if (ix86_in_large_data_p (decl))
7239 flags |= SECTION_LARGE;
7240
7241 if (decl == NULL_TREE
7242 && (strcmp (name, ".ldata.rel.ro") == 0
7243 || strcmp (name, ".ldata.rel.ro.local") == 0))
7244 flags |= SECTION_RELRO;
7245
7246 if (strcmp (name, ".lbss") == 0
7247 || strncmp (name, ".lbss.", 6) == 0
7248 || strncmp (name, ".gnu.linkonce.lb.", 17) == 0)
7249 flags |= SECTION_BSS;
7250
7251 return flags;
7252 }
7253
7254 /* Build up a unique section name, expressed as a
7255 STRING_CST node, and assign it to DECL_SECTION_NAME (decl).
7256 RELOC indicates whether the initial value of EXP requires
7257 link-time relocations. */
7258
7259 static void ATTRIBUTE_UNUSED
7260 x86_64_elf_unique_section (tree decl, int reloc)
7261 {
7262 if (ix86_in_large_data_p (decl))
7263 {
7264 const char *prefix = NULL;
7265 /* We only need to use .gnu.linkonce if we don't have COMDAT groups. */
7266 bool one_only = DECL_COMDAT_GROUP (decl) && !HAVE_COMDAT_GROUP;
7267
7268 switch (categorize_decl_for_section (decl, reloc))
7269 {
7270 case SECCAT_DATA:
7271 case SECCAT_DATA_REL:
7272 case SECCAT_DATA_REL_LOCAL:
7273 case SECCAT_DATA_REL_RO:
7274 case SECCAT_DATA_REL_RO_LOCAL:
7275 prefix = one_only ? ".ld" : ".ldata";
7276 break;
7277 case SECCAT_BSS:
7278 prefix = one_only ? ".lb" : ".lbss";
7279 break;
7280 case SECCAT_RODATA:
7281 case SECCAT_RODATA_MERGE_STR:
7282 case SECCAT_RODATA_MERGE_STR_INIT:
7283 case SECCAT_RODATA_MERGE_CONST:
7284 prefix = one_only ? ".lr" : ".lrodata";
7285 break;
7286 case SECCAT_SRODATA:
7287 case SECCAT_SDATA:
7288 case SECCAT_SBSS:
7289 gcc_unreachable ();
7290 case SECCAT_TEXT:
7291 case SECCAT_TDATA:
7292 case SECCAT_TBSS:
7293 /* We don't split these for medium model. Place them into
7294 default sections and hope for the best. */
7295 break;
7296 }
7297 if (prefix)
7298 {
7299 const char *name, *linkonce;
7300 char *string;
7301
7302 name = IDENTIFIER_POINTER (DECL_ASSEMBLER_NAME (decl));
7303 name = targetm.strip_name_encoding (name);
7304
7305 /* If we're using one_only, then there needs to be a .gnu.linkonce
7306 prefix to the section name. */
7307 linkonce = one_only ? ".gnu.linkonce" : "";
7308
7309 string = ACONCAT ((linkonce, prefix, ".", name, NULL));
7310
7311 set_decl_section_name (decl, string);
7312 return;
7313 }
7314 }
7315 default_unique_section (decl, reloc);
7316 }
7317
7318 #ifdef COMMON_ASM_OP
7319
7320 #ifndef LARGECOMM_SECTION_ASM_OP
7321 #define LARGECOMM_SECTION_ASM_OP "\t.largecomm\t"
7322 #endif
7323
7324 /* This says how to output assembler code to declare an
7325 uninitialized external linkage data object.
7326
7327 For medium model x86-64 we need to use the LARGECOMM_SECTION_ASM_OP directive for
7328 large objects. */
7329 void
7330 x86_elf_aligned_decl_common (FILE *file, tree decl,
7331 const char *name, unsigned HOST_WIDE_INT size,
7332 int align)
7333 {
7334 if ((ix86_cmodel == CM_MEDIUM || ix86_cmodel == CM_MEDIUM_PIC)
7335 && size > (unsigned int)ix86_section_threshold)
7336 {
7337 switch_to_section (get_named_section (decl, ".lbss", 0));
7338 fputs (LARGECOMM_SECTION_ASM_OP, file);
7339 }
7340 else
7341 fputs (COMMON_ASM_OP, file);
7342 assemble_name (file, name);
7343 fprintf (file, "," HOST_WIDE_INT_PRINT_UNSIGNED ",%u\n",
7344 size, align / BITS_PER_UNIT);
7345 }
7346 #endif
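
/* Output sketch for the path above (symbol name, size and alignment are
   illustrative): a large common object under the medium code model is
   announced roughly as

       .largecomm  big_buffer,1048576,32

   while objects below the threshold keep using the ordinary COMMON_ASM_OP
   (".comm") form.  */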
7347
7348 /* Utility function for targets to use in implementing
7349 ASM_OUTPUT_ALIGNED_BSS. */
7350
7351 void
7352 x86_output_aligned_bss (FILE *file, tree decl, const char *name,
7353 unsigned HOST_WIDE_INT size, int align)
7354 {
7355 if ((ix86_cmodel == CM_MEDIUM || ix86_cmodel == CM_MEDIUM_PIC)
7356 && size > (unsigned int)ix86_section_threshold)
7357 switch_to_section (get_named_section (decl, ".lbss", 0));
7358 else
7359 switch_to_section (bss_section);
7360 ASM_OUTPUT_ALIGN (file, floor_log2 (align / BITS_PER_UNIT));
7361 #ifdef ASM_DECLARE_OBJECT_NAME
7362 last_assemble_variable_decl = decl;
7363 ASM_DECLARE_OBJECT_NAME (file, name, decl);
7364 #else
7365 /* Standard thing is just output label for the object. */
7366 ASM_OUTPUT_LABEL (file, name);
7367 #endif /* ASM_DECLARE_OBJECT_NAME */
7368 ASM_OUTPUT_SKIP (file, size ? size : 1);
7369 }
7370 \f
7371 /* Decide whether we must probe the stack before any space allocation
7372 on this target. It's essentially TARGET_STACK_PROBE except when
7373 -fstack-check causes the stack to be already probed differently. */
7374
7375 bool
7376 ix86_target_stack_probe (void)
7377 {
7378 /* Do not probe the stack twice if static stack checking is enabled. */
7379 if (flag_stack_check == STATIC_BUILTIN_STACK_CHECK)
7380 return false;
7381
7382 return TARGET_STACK_PROBE;
7383 }
7384 \f
7385 /* Decide whether we can make a sibling call to a function. DECL is the
7386 declaration of the function being targeted by the call and EXP is the
7387 CALL_EXPR representing the call. */
7388
7389 static bool
7390 ix86_function_ok_for_sibcall (tree decl, tree exp)
7391 {
7392 tree type, decl_or_type;
7393 rtx a, b;
7394 bool bind_global = decl && !targetm.binds_local_p (decl);
7395
7396 /* Sibling call isn't OK if there are no caller-saved registers
7397 since all registers must be preserved before return. */
7398 if (cfun->machine->no_caller_saved_registers)
7399 return false;
7400
7401 /* If we are generating position-independent code, we cannot sibcall
7402 optimize direct calls to global functions, as the PLT requires
7403 %ebx be live. (Darwin does not have a PLT.) */
7404 if (!TARGET_MACHO
7405 && !TARGET_64BIT
7406 && flag_pic
7407 && flag_plt
7408 && bind_global)
7409 return false;
7410
7411 /* If we need to align the outgoing stack, then sibcalling would
7412 unalign the stack, which may break the called function. */
7413 if (ix86_minimum_incoming_stack_boundary (true)
7414 < PREFERRED_STACK_BOUNDARY)
7415 return false;
7416
7417 if (decl)
7418 {
7419 decl_or_type = decl;
7420 type = TREE_TYPE (decl);
7421 }
7422 else
7423 {
7424 /* We're looking at the CALL_EXPR, we need the type of the function. */
7425 type = CALL_EXPR_FN (exp); /* pointer expression */
7426 type = TREE_TYPE (type); /* pointer type */
7427 type = TREE_TYPE (type); /* function type */
7428 decl_or_type = type;
7429 }
7430
7431 /* Check that the return value locations are the same. Like
7432 if we are returning floats on the 80387 register stack, we cannot
7433 make a sibcall from a function that doesn't return a float to a
7434 function that does or, conversely, from a function that does return
7435 a float to a function that doesn't; the necessary stack adjustment
7436 would not be executed. This is also the place we notice
7437 differences in the return value ABI. Note that it is ok for one
7438 of the functions to have void return type as long as the return
7439 value of the other is passed in a register. */
7440 a = ix86_function_value (TREE_TYPE (exp), decl_or_type, false);
7441 b = ix86_function_value (TREE_TYPE (DECL_RESULT (cfun->decl)),
7442 cfun->decl, false);
7443 if (STACK_REG_P (a) || STACK_REG_P (b))
7444 {
7445 if (!rtx_equal_p (a, b))
7446 return false;
7447 }
7448 else if (VOID_TYPE_P (TREE_TYPE (DECL_RESULT (cfun->decl))))
7449 ;
7450 else if (!rtx_equal_p (a, b))
7451 return false;
7452
7453 if (TARGET_64BIT)
7454 {
7455 /* The SYSV ABI has more call-clobbered registers;
7456 disallow sibcalls from MS to SYSV. */
7457 if (cfun->machine->call_abi == MS_ABI
7458 && ix86_function_type_abi (type) == SYSV_ABI)
7459 return false;
7460 }
7461 else
7462 {
7463 /* If this call is indirect, we'll need to be able to use a
7464 call-clobbered register for the address of the target function.
7465 Make sure that all such registers are not used for passing
7466 parameters. Note that DLLIMPORT functions and call to global
7467 function via GOT slot are indirect. */
7468 if (!decl
7469 || (bind_global && flag_pic && !flag_plt)
7470 || (TARGET_DLLIMPORT_DECL_ATTRIBUTES && DECL_DLLIMPORT_P (decl)))
7471 {
7472 /* Check if regparm >= 3 since arg_reg_available is set to
7473 false if regparm == 0. If regparm is 1 or 2, there is
7474 always a call-clobbered register available.
7475
7476 ??? The symbol indirect call doesn't need a call-clobbered
7477 register. But we don't know if this is a symbol indirect
7478 call or not here. */
7479 if (ix86_function_regparm (type, NULL) >= 3
7480 && !cfun->machine->arg_reg_available)
7481 return false;
7482 }
7483 }
7484
7485 /* Otherwise okay. That also includes certain types of indirect calls. */
7486 return true;
7487 }
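
/* Sketch of the 32-bit PIC restriction above (hypothetical code): with
   -m32 -fpic -fplt, a tail call to a preemptible global function such as

     extern int callee (int);
     int caller (int x) { return callee (x); }

   is not turned into a sibcall, because calling through the PLT requires
   %ebx to hold the GOT pointer at the call site.  */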
7488
7489 /* Handle "cdecl", "stdcall", "fastcall", "regparm", "thiscall",
7490 and "sseregparm" calling convention attributes;
7491 arguments as in struct attribute_spec.handler. */
7492
7493 static tree
7494 ix86_handle_cconv_attribute (tree *node, tree name,
7495 tree args,
7496 int,
7497 bool *no_add_attrs)
7498 {
7499 if (TREE_CODE (*node) != FUNCTION_TYPE
7500 && TREE_CODE (*node) != METHOD_TYPE
7501 && TREE_CODE (*node) != FIELD_DECL
7502 && TREE_CODE (*node) != TYPE_DECL)
7503 {
7504 warning (OPT_Wattributes, "%qE attribute only applies to functions",
7505 name);
7506 *no_add_attrs = true;
7507 return NULL_TREE;
7508 }
7509
7510 /* Can combine regparm with all attributes but fastcall, and thiscall. */
7511 if (is_attribute_p ("regparm", name))
7512 {
7513 tree cst;
7514
7515 if (lookup_attribute ("fastcall", TYPE_ATTRIBUTES (*node)))
7516 {
7517 error ("fastcall and regparm attributes are not compatible");
7518 }
7519
7520 if (lookup_attribute ("thiscall", TYPE_ATTRIBUTES (*node)))
7521 {
7522 error ("regparam and thiscall attributes are not compatible");
7523 }
7524
7525 cst = TREE_VALUE (args);
7526 if (TREE_CODE (cst) != INTEGER_CST)
7527 {
7528 warning (OPT_Wattributes,
7529 "%qE attribute requires an integer constant argument",
7530 name);
7531 *no_add_attrs = true;
7532 }
7533 else if (compare_tree_int (cst, REGPARM_MAX) > 0)
7534 {
7535 warning (OPT_Wattributes, "argument to %qE attribute larger than %d",
7536 name, REGPARM_MAX);
7537 *no_add_attrs = true;
7538 }
7539
7540 return NULL_TREE;
7541 }
7542
7543 if (TARGET_64BIT)
7544 {
7545 /* Do not warn when emulating the MS ABI. */
7546 if ((TREE_CODE (*node) != FUNCTION_TYPE
7547 && TREE_CODE (*node) != METHOD_TYPE)
7548 || ix86_function_type_abi (*node) != MS_ABI)
7549 warning (OPT_Wattributes, "%qE attribute ignored",
7550 name);
7551 *no_add_attrs = true;
7552 return NULL_TREE;
7553 }
7554
7555 /* Can combine fastcall with stdcall (redundant) and sseregparm. */
7556 if (is_attribute_p ("fastcall", name))
7557 {
7558 if (lookup_attribute ("cdecl", TYPE_ATTRIBUTES (*node)))
7559 {
7560 error ("fastcall and cdecl attributes are not compatible");
7561 }
7562 if (lookup_attribute ("stdcall", TYPE_ATTRIBUTES (*node)))
7563 {
7564 error ("fastcall and stdcall attributes are not compatible");
7565 }
7566 if (lookup_attribute ("regparm", TYPE_ATTRIBUTES (*node)))
7567 {
7568 error ("fastcall and regparm attributes are not compatible");
7569 }
7570 if (lookup_attribute ("thiscall", TYPE_ATTRIBUTES (*node)))
7571 {
7572 error ("fastcall and thiscall attributes are not compatible");
7573 }
7574 }
7575
7576 /* Can combine stdcall with fastcall (redundant), regparm and
7577 sseregparm. */
7578 else if (is_attribute_p ("stdcall", name))
7579 {
7580 if (lookup_attribute ("cdecl", TYPE_ATTRIBUTES (*node)))
7581 {
7582 error ("stdcall and cdecl attributes are not compatible");
7583 }
7584 if (lookup_attribute ("fastcall", TYPE_ATTRIBUTES (*node)))
7585 {
7586 error ("stdcall and fastcall attributes are not compatible");
7587 }
7588 if (lookup_attribute ("thiscall", TYPE_ATTRIBUTES (*node)))
7589 {
7590 error ("stdcall and thiscall attributes are not compatible");
7591 }
7592 }
7593
7594 /* Can combine cdecl with regparm and sseregparm. */
7595 else if (is_attribute_p ("cdecl", name))
7596 {
7597 if (lookup_attribute ("stdcall", TYPE_ATTRIBUTES (*node)))
7598 {
7599 error ("stdcall and cdecl attributes are not compatible");
7600 }
7601 if (lookup_attribute ("fastcall", TYPE_ATTRIBUTES (*node)))
7602 {
7603 error ("fastcall and cdecl attributes are not compatible");
7604 }
7605 if (lookup_attribute ("thiscall", TYPE_ATTRIBUTES (*node)))
7606 {
7607 error ("cdecl and thiscall attributes are not compatible");
7608 }
7609 }
7610 else if (is_attribute_p ("thiscall", name))
7611 {
7612 if (TREE_CODE (*node) != METHOD_TYPE && pedantic)
7613 warning (OPT_Wattributes, "%qE attribute is used for non-class method",
7614 name);
7615 if (lookup_attribute ("stdcall", TYPE_ATTRIBUTES (*node)))
7616 {
7617 error ("stdcall and thiscall attributes are not compatible");
7618 }
7619 if (lookup_attribute ("fastcall", TYPE_ATTRIBUTES (*node)))
7620 {
7621 error ("fastcall and thiscall attributes are not compatible");
7622 }
7623 if (lookup_attribute ("cdecl", TYPE_ATTRIBUTES (*node)))
7624 {
7625 error ("cdecl and thiscall attributes are not compatible");
7626 }
7627 }
7628
7629 /* Can combine sseregparm with all attributes. */
7630
7631 return NULL_TREE;
7632 }
7633
7634 /* The transactional memory builtins are implicitly regparm or fastcall
7635 depending on the ABI. Override the generic do-nothing attribute that
7636 these builtins were declared with, and replace it with one of the two
7637 attributes that we expect elsewhere. */
7638
7639 static tree
7640 ix86_handle_tm_regparm_attribute (tree *node, tree, tree,
7641 int flags, bool *no_add_attrs)
7642 {
7643 tree alt;
7644
7645 /* In no case do we want to add the placeholder attribute. */
7646 *no_add_attrs = true;
7647
7648 /* The 64-bit ABI is unchanged for transactional memory. */
7649 if (TARGET_64BIT)
7650 return NULL_TREE;
7651
7652 /* ??? Is there a better way to validate 32-bit windows? We have
7653 cfun->machine->call_abi, but that seems to be set only for 64-bit. */
7654 if (CHECK_STACK_LIMIT > 0)
7655 alt = tree_cons (get_identifier ("fastcall"), NULL, NULL);
7656 else
7657 {
7658 alt = tree_cons (NULL, build_int_cst (NULL, 2), NULL);
7659 alt = tree_cons (get_identifier ("regparm"), alt, NULL);
7660 }
7661 decl_attributes (node, alt, flags);
7662
7663 return NULL_TREE;
7664 }
7665
7666 /* This function determines from TYPE the calling-convention. */
7667
7668 unsigned int
7669 ix86_get_callcvt (const_tree type)
7670 {
7671 unsigned int ret = 0;
7672 bool is_stdarg;
7673 tree attrs;
7674
7675 if (TARGET_64BIT)
7676 return IX86_CALLCVT_CDECL;
7677
7678 attrs = TYPE_ATTRIBUTES (type);
7679 if (attrs != NULL_TREE)
7680 {
7681 if (lookup_attribute ("cdecl", attrs))
7682 ret |= IX86_CALLCVT_CDECL;
7683 else if (lookup_attribute ("stdcall", attrs))
7684 ret |= IX86_CALLCVT_STDCALL;
7685 else if (lookup_attribute ("fastcall", attrs))
7686 ret |= IX86_CALLCVT_FASTCALL;
7687 else if (lookup_attribute ("thiscall", attrs))
7688 ret |= IX86_CALLCVT_THISCALL;
7689
7690 /* Regparm isn't allowed together with thiscall and fastcall. */
7691 if ((ret & (IX86_CALLCVT_THISCALL | IX86_CALLCVT_FASTCALL)) == 0)
7692 {
7693 if (lookup_attribute ("regparm", attrs))
7694 ret |= IX86_CALLCVT_REGPARM;
7695 if (lookup_attribute ("sseregparm", attrs))
7696 ret |= IX86_CALLCVT_SSEREGPARM;
7697 }
7698
7699 if (IX86_BASE_CALLCVT(ret) != 0)
7700 return ret;
7701 }
7702
7703 is_stdarg = stdarg_p (type);
7704 if (TARGET_RTD && !is_stdarg)
7705 return IX86_CALLCVT_STDCALL | ret;
7706
7707 if (ret != 0
7708 || is_stdarg
7709 || TREE_CODE (type) != METHOD_TYPE
7710 || ix86_function_type_abi (type) != MS_ABI)
7711 return IX86_CALLCVT_CDECL | ret;
7712
7713 return IX86_CALLCVT_THISCALL;
7714 }
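
/* Usage sketch for the conventions mapped above (hypothetical declarations):

     int __attribute__((stdcall))     f1 (int a, int b);
     int __attribute__((fastcall))    f2 (int a, int b);
     int __attribute__((regparm (3))) f3 (int a, int b, int c);

   stdcall makes the callee pop its arguments, fastcall passes the first two
   integer arguments in ECX and EDX, and regparm (3) passes up to three
   integer arguments in EAX, EDX and ECX.  */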
7715
7716 /* Return 0 if the attributes for two types are incompatible, 1 if they
7717 are compatible, and 2 if they are nearly compatible (which causes a
7718 warning to be generated). */
7719
7720 static int
7721 ix86_comp_type_attributes (const_tree type1, const_tree type2)
7722 {
7723 unsigned int ccvt1, ccvt2;
7724
7725 if (TREE_CODE (type1) != FUNCTION_TYPE
7726 && TREE_CODE (type1) != METHOD_TYPE)
7727 return 1;
7728
7729 ccvt1 = ix86_get_callcvt (type1);
7730 ccvt2 = ix86_get_callcvt (type2);
7731 if (ccvt1 != ccvt2)
7732 return 0;
7733 if (ix86_function_regparm (type1, NULL)
7734 != ix86_function_regparm (type2, NULL))
7735 return 0;
7736
7737 return 1;
7738 }
7739 \f
7740 /* Return the regparm value for a function with the indicated TYPE and DECL.
7741 DECL may be NULL when calling function indirectly
7742 or considering a libcall. */
7743
7744 static int
7745 ix86_function_regparm (const_tree type, const_tree decl)
7746 {
7747 tree attr;
7748 int regparm;
7749 unsigned int ccvt;
7750
7751 if (TARGET_64BIT)
7752 return (ix86_function_type_abi (type) == SYSV_ABI
7753 ? X86_64_REGPARM_MAX : X86_64_MS_REGPARM_MAX);
7754 ccvt = ix86_get_callcvt (type);
7755 regparm = ix86_regparm;
7756
7757 if ((ccvt & IX86_CALLCVT_REGPARM) != 0)
7758 {
7759 attr = lookup_attribute ("regparm", TYPE_ATTRIBUTES (type));
7760 if (attr)
7761 {
7762 regparm = TREE_INT_CST_LOW (TREE_VALUE (TREE_VALUE (attr)));
7763 return regparm;
7764 }
7765 }
7766 else if ((ccvt & IX86_CALLCVT_FASTCALL) != 0)
7767 return 2;
7768 else if ((ccvt & IX86_CALLCVT_THISCALL) != 0)
7769 return 1;
7770
7771 /* Use register calling convention for local functions when possible. */
7772 if (decl
7773 && TREE_CODE (decl) == FUNCTION_DECL)
7774 {
7775 cgraph_node *target = cgraph_node::get (decl);
7776 if (target)
7777 target = target->function_symbol ();
7778
7779 /* Caller and callee must agree on the calling convention, so
7780 checking just the current optimize setting here would mean that with
7781 __attribute__((optimize (...))) the caller could use the regparm convention
7782 and the callee not, or vice versa. Instead look at whether the callee
7783 itself is optimized or not. */
7784 if (target && opt_for_fn (target->decl, optimize)
7785 && !(profile_flag && !flag_fentry))
7786 {
7787 cgraph_local_info *i = &target->local;
7788 if (i && i->local && i->can_change_signature)
7789 {
7790 int local_regparm, globals = 0, regno;
7791
7792 /* Make sure no regparm register is taken by a
7793 fixed register variable. */
7794 for (local_regparm = 0; local_regparm < REGPARM_MAX;
7795 local_regparm++)
7796 if (fixed_regs[local_regparm])
7797 break;
7798
7799 /* We don't want to use regparm(3) for nested functions as
7800 these use a static chain pointer in the third argument. */
7801 if (local_regparm == 3 && DECL_STATIC_CHAIN (target->decl))
7802 local_regparm = 2;
7803
7804 /* Save a register for the split stack. */
7805 if (local_regparm == 3 && flag_split_stack)
7806 local_regparm = 2;
7807
7808 /* Each fixed register usage increases register pressure,
7809 so fewer registers should be used for argument passing.
7810 This functionality can be overridden by an explicit
7811 regparm value. */
7812 for (regno = AX_REG; regno <= DI_REG; regno++)
7813 if (fixed_regs[regno])
7814 globals++;
7815
7816 local_regparm
7817 = globals < local_regparm ? local_regparm - globals : 0;
7818
7819 if (local_regparm > regparm)
7820 regparm = local_regparm;
7821 }
7822 }
7823 }
7824
7825 return regparm;
7826 }
7827
7828 /* Return 1 or 2, if we can pass up to SSE_REGPARM_MAX SFmode (1) and
7829 DFmode (2) arguments in SSE registers for a function with the
7830 indicated TYPE and DECL. DECL may be NULL when calling function
7831 indirectly or considering a libcall. Return -1 if any FP parameter
7832 should be rejected by error. This is used in situations where we imply the SSE
7833 calling convention but the function is called from another function with
7834 SSE disabled. Otherwise return 0. */
7835
7836 static int
7837 ix86_function_sseregparm (const_tree type, const_tree decl, bool warn)
7838 {
7839 gcc_assert (!TARGET_64BIT);
7840
7841 /* Use SSE registers to pass SFmode and DFmode arguments if requested
7842 by the sseregparm attribute. */
7843 if (TARGET_SSEREGPARM
7844 || (type && lookup_attribute ("sseregparm", TYPE_ATTRIBUTES (type))))
7845 {
7846 if (!TARGET_SSE)
7847 {
7848 if (warn)
7849 {
7850 if (decl)
7851 error ("calling %qD with attribute sseregparm without "
7852 "SSE/SSE2 enabled", decl);
7853 else
7854 error ("calling %qT with attribute sseregparm without "
7855 "SSE/SSE2 enabled", type);
7856 }
7857 return 0;
7858 }
7859
7860 return 2;
7861 }
7862
7863 if (!decl)
7864 return 0;
7865
7866 cgraph_node *target = cgraph_node::get (decl);
7867 if (target)
7868 target = target->function_symbol ();
7869
7870 /* For local functions, pass up to SSE_REGPARM_MAX SFmode
7871 (and DFmode for SSE2) arguments in SSE registers. */
7872 if (target
7873 /* TARGET_SSE_MATH */
7874 && (target_opts_for_fn (target->decl)->x_ix86_fpmath & FPMATH_SSE)
7875 && opt_for_fn (target->decl, optimize)
7876 && !(profile_flag && !flag_fentry))
7877 {
7878 cgraph_local_info *i = &target->local;
7879 if (i && i->local && i->can_change_signature)
7880 {
7881 /* Refuse to produce wrong code when local function with SSE enabled
7882 is called from SSE disabled function.
7883 FIXME: We need a way to detect these cases cross-ltrans partition
7884 and avoid using SSE calling conventions on local functions called
7885 from function with SSE disabled. For now at least delay the
7886 warning until we know we are going to produce wrong code.
7887 See PR66047 */
7888 if (!TARGET_SSE && warn)
7889 return -1;
7890 return TARGET_SSE2_P (target_opts_for_fn (target->decl)
7891 ->x_ix86_isa_flags) ? 2 : 1;
7892 }
7893 }
7894
7895 return 0;
7896 }
7897
7898 /* Return true if EAX is live at the start of the function. Used by
7899 ix86_expand_prologue to determine if we need special help before
7900 calling allocate_stack_worker. */
7901
7902 static bool
7903 ix86_eax_live_at_start_p (void)
7904 {
7905 /* Cheat. Don't bother working forward from ix86_function_regparm
7906 to the function type to whether an actual argument is located in
7907 eax. Instead just look at cfg info, which is still close enough
7908 to correct at this point. This gives false positives for broken
7909 functions that might use uninitialized data that happens to be
7910 allocated in eax, but who cares? */
7911 return REGNO_REG_SET_P (df_get_live_out (ENTRY_BLOCK_PTR_FOR_FN (cfun)), 0);
7912 }
7913
7914 static bool
7915 ix86_keep_aggregate_return_pointer (tree fntype)
7916 {
7917 tree attr;
7918
7919 if (!TARGET_64BIT)
7920 {
7921 attr = lookup_attribute ("callee_pop_aggregate_return",
7922 TYPE_ATTRIBUTES (fntype));
7923 if (attr)
7924 return (TREE_INT_CST_LOW (TREE_VALUE (TREE_VALUE (attr))) == 0);
7925
7926 /* For 32-bit MS-ABI the default is to keep aggregate
7927 return pointer. */
7928 if (ix86_function_type_abi (fntype) == MS_ABI)
7929 return true;
7930 }
7931 return KEEP_AGGREGATE_RETURN_POINTER != 0;
7932 }
7933
7934 /* Value is the number of bytes of arguments automatically
7935 popped when returning from a subroutine call.
7936 FUNDECL is the declaration node of the function (as a tree),
7937 FUNTYPE is the data type of the function (as a tree),
7938 or for a library call it is an identifier node for the subroutine name.
7939 SIZE is the number of bytes of arguments passed on the stack.
7940
7941 On the 80386, the RTD insn may be used to pop them if the number
7942 of args is fixed, but if the number is variable then the caller
7943 must pop them all. RTD can't be used for library calls now
7944 because the library is compiled with the Unix compiler.
7945 Use of RTD is a selectable option, since it is incompatible with
7946 standard Unix calling sequences. If the option is not selected,
7947 the caller must always pop the args.
7948
7949 The attribute stdcall is equivalent to RTD on a per module basis. */
7950
7951 static int
7952 ix86_return_pops_args (tree fundecl, tree funtype, int size)
7953 {
7954 unsigned int ccvt;
7955
7956 /* None of the 64-bit ABIs pop arguments. */
7957 if (TARGET_64BIT)
7958 return 0;
7959
7960 ccvt = ix86_get_callcvt (funtype);
7961
7962 if ((ccvt & (IX86_CALLCVT_STDCALL | IX86_CALLCVT_FASTCALL
7963 | IX86_CALLCVT_THISCALL)) != 0
7964 && ! stdarg_p (funtype))
7965 return size;
7966
7967 /* Lose any fake structure return argument if it is passed on the stack. */
7968 if (aggregate_value_p (TREE_TYPE (funtype), fundecl)
7969 && !ix86_keep_aggregate_return_pointer (funtype))
7970 {
7971 int nregs = ix86_function_regparm (funtype, fundecl);
7972 if (nregs == 0)
7973 return GET_MODE_SIZE (Pmode);
7974 }
7975
7976 return 0;
7977 }
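
/* For example (sketch): a 32-bit stdcall function taking two int arguments
   returns with "ret $8", so this hook reports 8 bytes popped by the callee;
   a cdecl or variadic function returns with a plain "ret" and reports 0.  */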
7978
7979 /* Implement the TARGET_LEGITIMATE_COMBINED_INSN hook. */
7980
7981 static bool
7982 ix86_legitimate_combined_insn (rtx_insn *insn)
7983 {
7984 /* Check operand constraints in case hard registers were propagated
7985 into insn pattern. This check prevents combine pass from
7986 generating insn patterns with invalid hard register operands.
7987 These invalid insns can eventually confuse reload to error out
7988 with a spill failure. See also PRs 46829 and 46843. */
7989 if ((INSN_CODE (insn) = recog (PATTERN (insn), insn, 0)) >= 0)
7990 {
7991 int i;
7992
7993 extract_insn (insn);
7994 preprocess_constraints (insn);
7995
7996 int n_operands = recog_data.n_operands;
7997 int n_alternatives = recog_data.n_alternatives;
7998 for (i = 0; i < n_operands; i++)
7999 {
8000 rtx op = recog_data.operand[i];
8001 machine_mode mode = GET_MODE (op);
8002 const operand_alternative *op_alt;
8003 int offset = 0;
8004 bool win;
8005 int j;
8006
8007 /* A unary operator may be accepted by the predicate, but it
8008 is irrelevant for matching constraints. */
8009 if (UNARY_P (op))
8010 op = XEXP (op, 0);
8011
8012 if (SUBREG_P (op))
8013 {
8014 if (REG_P (SUBREG_REG (op))
8015 && REGNO (SUBREG_REG (op)) < FIRST_PSEUDO_REGISTER)
8016 offset = subreg_regno_offset (REGNO (SUBREG_REG (op)),
8017 GET_MODE (SUBREG_REG (op)),
8018 SUBREG_BYTE (op),
8019 GET_MODE (op));
8020 op = SUBREG_REG (op);
8021 }
8022
8023 if (!(REG_P (op) && HARD_REGISTER_P (op)))
8024 continue;
8025
8026 op_alt = recog_op_alt;
8027
8028 /* Operand has no constraints, anything is OK. */
8029 win = !n_alternatives;
8030
8031 alternative_mask preferred = get_preferred_alternatives (insn);
8032 for (j = 0; j < n_alternatives; j++, op_alt += n_operands)
8033 {
8034 if (!TEST_BIT (preferred, j))
8035 continue;
8036 if (op_alt[i].anything_ok
8037 || (op_alt[i].matches != -1
8038 && operands_match_p
8039 (recog_data.operand[i],
8040 recog_data.operand[op_alt[i].matches]))
8041 || reg_fits_class_p (op, op_alt[i].cl, offset, mode))
8042 {
8043 win = true;
8044 break;
8045 }
8046 }
8047
8048 if (!win)
8049 return false;
8050 }
8051 }
8052
8053 return true;
8054 }
8055 \f
8056 /* Implement the TARGET_ASAN_SHADOW_OFFSET hook. */
8057
8058 static unsigned HOST_WIDE_INT
8059 ix86_asan_shadow_offset (void)
8060 {
8061 return TARGET_LP64 ? (TARGET_MACHO ? (HOST_WIDE_INT_1 << 44)
8062 : HOST_WIDE_INT_C (0x7fff8000))
8063 : (HOST_WIDE_INT_1 << 29);
8064 }
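
/* Illustrative note (an addition, not from the original sources):
   AddressSanitizer computes the shadow address of a byte roughly as

       shadow = (addr >> 3) + ix86_asan_shadow_offset ();

   so on x86-64 Linux the constant 0x7fff8000 places shadow memory at a
   fixed low offset, while 32-bit targets use 1 << 29.  */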
8065 \f
8066 /* Argument support functions. */
8067
8068 /* Return true when register may be used to pass function parameters. */
8069 bool
8070 ix86_function_arg_regno_p (int regno)
8071 {
8072 int i;
8073 enum calling_abi call_abi;
8074 const int *parm_regs;
8075
8076 if (TARGET_MPX && BND_REGNO_P (regno))
8077 return true;
8078
8079 if (!TARGET_64BIT)
8080 {
8081 if (TARGET_MACHO)
8082 return (regno < REGPARM_MAX
8083 || (TARGET_SSE && SSE_REGNO_P (regno) && !fixed_regs[regno]));
8084 else
8085 return (regno < REGPARM_MAX
8086 || (TARGET_MMX && MMX_REGNO_P (regno)
8087 && (regno < FIRST_MMX_REG + MMX_REGPARM_MAX))
8088 || (TARGET_SSE && SSE_REGNO_P (regno)
8089 && (regno < FIRST_SSE_REG + SSE_REGPARM_MAX)));
8090 }
8091
8092 if (TARGET_SSE && SSE_REGNO_P (regno)
8093 && (regno < FIRST_SSE_REG + SSE_REGPARM_MAX))
8094 return true;
8095
8096 /* TODO: The function should depend on current function ABI but
8097 builtins.c would need updating then. Therefore we use the
8098 default ABI. */
8099 call_abi = ix86_cfun_abi ();
8100
8101 /* RAX is used as hidden argument to va_arg functions. */
8102 if (call_abi == SYSV_ABI && regno == AX_REG)
8103 return true;
8104
8105 if (call_abi == MS_ABI)
8106 parm_regs = x86_64_ms_abi_int_parameter_registers;
8107 else
8108 parm_regs = x86_64_int_parameter_registers;
8109
8110 for (i = 0; i < (call_abi == MS_ABI
8111 ? X86_64_MS_REGPARM_MAX : X86_64_REGPARM_MAX); i++)
8112 if (regno == parm_regs[i])
8113 return true;
8114 return false;
8115 }
8116
8117 /* Return true if we do not know how to pass TYPE solely in registers. */
8118
8119 static bool
8120 ix86_must_pass_in_stack (machine_mode mode, const_tree type)
8121 {
8122 if (must_pass_in_stack_var_size_or_pad (mode, type))
8123 return true;
8124
8125 /* For 32-bit, we want TImode aggregates to go on the stack. But watch out!
8126 The layout_type routine is crafty and tries to trick us into passing
8127 currently unsupported vector types on the stack by using TImode. */
8128 return (!TARGET_64BIT && mode == TImode
8129 && type && TREE_CODE (type) != VECTOR_TYPE);
8130 }
8131
8132 /* Return the size, in bytes, of the area reserved for arguments passed
8133 in registers for the function represented by FNDECL, depending on the
8134 ABI used. */
8135 int
8136 ix86_reg_parm_stack_space (const_tree fndecl)
8137 {
8138 enum calling_abi call_abi = SYSV_ABI;
8139 if (fndecl != NULL_TREE && TREE_CODE (fndecl) == FUNCTION_DECL)
8140 call_abi = ix86_function_abi (fndecl);
8141 else
8142 call_abi = ix86_function_type_abi (fndecl);
8143 if (TARGET_64BIT && call_abi == MS_ABI)
8144 return 32;
8145 return 0;
8146 }
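
/* Illustrative note (an addition): the 32 bytes returned for the 64-bit
   MS ABI are the "home" (shadow) area the caller must reserve on the
   stack for the four register parameters passed in RCX, RDX, R8 and R9;
   the SysV ABIs reserve no such area, hence 0.  */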
8147
8148 /* We add this as a workaround in order to use libc_has_function
8149 hook in i386.md. */
8150 bool
8151 ix86_libc_has_function (enum function_class fn_class)
8152 {
8153 return targetm.libc_has_function (fn_class);
8154 }
8155
8156 /* Return SYSV_ABI or MS_ABI, depending on FNTYPE, specifying the
8157 calling ABI used. */
8158 enum calling_abi
8159 ix86_function_type_abi (const_tree fntype)
8160 {
8161 enum calling_abi abi = ix86_abi;
8162
8163 if (fntype == NULL_TREE || TYPE_ATTRIBUTES (fntype) == NULL_TREE)
8164 return abi;
8165
8166 if (abi == SYSV_ABI
8167 && lookup_attribute ("ms_abi", TYPE_ATTRIBUTES (fntype)))
8168 {
8169 if (TARGET_X32)
8170 error ("X32 does not support ms_abi attribute");
8171
8172 abi = MS_ABI;
8173 }
8174 else if (abi == MS_ABI
8175 && lookup_attribute ("sysv_abi", TYPE_ATTRIBUTES (fntype)))
8176 abi = SYSV_ABI;
8177
8178 return abi;
8179 }
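
/* Illustrative example (an addition, hypothetical declaration): on a
   SysV-default target an attribute on the function type switches the
   calling convention, e.g.

       int __attribute__ ((ms_abi)) wrapper (int a, int b);

   makes this function use the MS_ABI register assignment; sysv_abi works
   the other way around on an MS-default target.  */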
8180
8181 static enum calling_abi
8182 ix86_function_abi (const_tree fndecl)
8183 {
8184 return fndecl ? ix86_function_type_abi (TREE_TYPE (fndecl)) : ix86_abi;
8185 }
8186
8187 /* Return SYSV_ABI or MS_ABI, depending on cfun, specifying the
8188 calling ABI used. */
8189 enum calling_abi
8190 ix86_cfun_abi (void)
8191 {
8192 return cfun ? cfun->machine->call_abi : ix86_abi;
8193 }
8194
8195 static bool
8196 ix86_function_ms_hook_prologue (const_tree fn)
8197 {
8198 if (fn && lookup_attribute ("ms_hook_prologue", DECL_ATTRIBUTES (fn)))
8199 {
8200 if (decl_function_context (fn) != NULL_TREE)
8201 error_at (DECL_SOURCE_LOCATION (fn),
8202 "ms_hook_prologue is not compatible with nested function");
8203 else
8204 return true;
8205 }
8206 return false;
8207 }
8208
8209 /* Write the extra assembler code needed to declare a function properly. */
8210
8211 void
8212 ix86_asm_output_function_label (FILE *asm_out_file, const char *fname,
8213 tree decl)
8214 {
8215 bool is_ms_hook = ix86_function_ms_hook_prologue (decl);
8216
8217 if (is_ms_hook)
8218 {
8219 int i, filler_count = (TARGET_64BIT ? 32 : 16);
8220 unsigned int filler_cc = 0xcccccccc;
8221
8222 for (i = 0; i < filler_count; i += 4)
8223 fprintf (asm_out_file, ASM_LONG " %#x\n", filler_cc);
8224 }
8225
8226 #ifdef SUBTARGET_ASM_UNWIND_INIT
8227 SUBTARGET_ASM_UNWIND_INIT (asm_out_file);
8228 #endif
8229
8230 ASM_OUTPUT_LABEL (asm_out_file, fname);
8231
8232 /* Output magic byte marker, if hot-patch attribute is set. */
8233 if (is_ms_hook)
8234 {
8235 if (TARGET_64BIT)
8236 {
8237 /* leaq [%rsp + 0], %rsp */
8238 asm_fprintf (asm_out_file, ASM_BYTE
8239 "0x48, 0x8d, 0xa4, 0x24, 0x00, 0x00, 0x00, 0x00\n");
8240 }
8241 else
8242 {
8243 /* movl.s %edi, %edi
8244 push %ebp
8245 movl.s %esp, %ebp */
8246 asm_fprintf (asm_out_file, ASM_BYTE
8247 "0x8b, 0xff, 0x55, 0x8b, 0xec\n");
8248 }
8249 }
8250 }
8251
8252 /* regclass.c */
8253 extern void init_regs (void);
8254
8255 /* Implementation of the call ABI switching target hook. The call
8256 register sets specific to FNDECL are set up. See also
8257 ix86_conditional_register_usage for more details. */
8258 void
8259 ix86_call_abi_override (const_tree fndecl)
8260 {
8261 cfun->machine->call_abi = ix86_function_abi (fndecl);
8262 }
8263
8264 /* Return true if a pseudo register should be created and used to hold
8265 the GOT address for PIC code. */
8266 bool
8267 ix86_use_pseudo_pic_reg (void)
8268 {
8269 if ((TARGET_64BIT
8270 && (ix86_cmodel == CM_SMALL_PIC
8271 || TARGET_PECOFF))
8272 || !flag_pic)
8273 return false;
8274 return true;
8275 }
8276
8277 /* Initialize large model PIC register. */
8278
8279 static void
8280 ix86_init_large_pic_reg (unsigned int tmp_regno)
8281 {
8282 rtx_code_label *label;
8283 rtx tmp_reg;
8284
8285 gcc_assert (Pmode == DImode);
8286 label = gen_label_rtx ();
8287 emit_label (label);
8288 LABEL_PRESERVE_P (label) = 1;
8289 tmp_reg = gen_rtx_REG (Pmode, tmp_regno);
8290 gcc_assert (REGNO (pic_offset_table_rtx) != tmp_regno);
8291 emit_insn (gen_set_rip_rex64 (pic_offset_table_rtx,
8292 label));
8293 emit_insn (gen_set_got_offset_rex64 (tmp_reg, label));
8294 emit_insn (ix86_gen_add3 (pic_offset_table_rtx,
8295 pic_offset_table_rtx, tmp_reg));
8296 }
8297
8298 /* Create and initialize PIC register if required. */
8299 static void
8300 ix86_init_pic_reg (void)
8301 {
8302 edge entry_edge;
8303 rtx_insn *seq;
8304
8305 if (!ix86_use_pseudo_pic_reg ())
8306 return;
8307
8308 start_sequence ();
8309
8310 if (TARGET_64BIT)
8311 {
8312 if (ix86_cmodel == CM_LARGE_PIC)
8313 ix86_init_large_pic_reg (R11_REG);
8314 else
8315 emit_insn (gen_set_got_rex64 (pic_offset_table_rtx));
8316 }
8317 else
8318 {
8319 /* If there is a future mcount call in the function, it is more profitable
8320 to emit SET_GOT into the ABI-defined REAL_PIC_OFFSET_TABLE_REGNUM. */
8321 rtx reg = crtl->profile
8322 ? gen_rtx_REG (Pmode, REAL_PIC_OFFSET_TABLE_REGNUM)
8323 : pic_offset_table_rtx;
8324 rtx_insn *insn = emit_insn (gen_set_got (reg));
8325 RTX_FRAME_RELATED_P (insn) = 1;
8326 if (crtl->profile)
8327 emit_move_insn (pic_offset_table_rtx, reg);
8328 add_reg_note (insn, REG_CFA_FLUSH_QUEUE, NULL_RTX);
8329 }
8330
8331 seq = get_insns ();
8332 end_sequence ();
8333
8334 entry_edge = single_succ_edge (ENTRY_BLOCK_PTR_FOR_FN (cfun));
8335 insert_insn_on_edge (seq, entry_edge);
8336 commit_one_edge_insertion (entry_edge);
8337 }
8338
8339 /* Initialize a variable CUM of type CUMULATIVE_ARGS
8340 for a call to a function whose data type is FNTYPE.
8341 For a library call, FNTYPE is 0. */
8342
8343 void
8344 init_cumulative_args (CUMULATIVE_ARGS *cum, /* Argument info to initialize */
8345 tree fntype, /* tree ptr for function decl */
8346 rtx libname, /* SYMBOL_REF of library name or 0 */
8347 tree fndecl,
8348 int caller)
8349 {
8350 struct cgraph_local_info *i = NULL;
8351 struct cgraph_node *target = NULL;
8352
8353 memset (cum, 0, sizeof (*cum));
8354
8355 if (fndecl)
8356 {
8357 target = cgraph_node::get (fndecl);
8358 if (target)
8359 {
8360 target = target->function_symbol ();
8361 i = cgraph_node::local_info (target->decl);
8362 cum->call_abi = ix86_function_abi (target->decl);
8363 }
8364 else
8365 cum->call_abi = ix86_function_abi (fndecl);
8366 }
8367 else
8368 cum->call_abi = ix86_function_type_abi (fntype);
8369
8370 cum->caller = caller;
8371
8372 /* Set up the number of registers to use for passing arguments. */
8373 cum->nregs = ix86_regparm;
8374 if (TARGET_64BIT)
8375 {
8376 cum->nregs = (cum->call_abi == SYSV_ABI
8377 ? X86_64_REGPARM_MAX
8378 : X86_64_MS_REGPARM_MAX);
8379 }
8380 if (TARGET_SSE)
8381 {
8382 cum->sse_nregs = SSE_REGPARM_MAX;
8383 if (TARGET_64BIT)
8384 {
8385 cum->sse_nregs = (cum->call_abi == SYSV_ABI
8386 ? X86_64_SSE_REGPARM_MAX
8387 : X86_64_MS_SSE_REGPARM_MAX);
8388 }
8389 }
8390 if (TARGET_MMX)
8391 cum->mmx_nregs = MMX_REGPARM_MAX;
8392 cum->warn_avx512f = true;
8393 cum->warn_avx = true;
8394 cum->warn_sse = true;
8395 cum->warn_mmx = true;
8396
8397 /* Because the type may not match between caller and callee, we need to
8398 use the actual type of the function for local calls.
8399 FIXME: cgraph_analyze can be told to actually record whether a function
8400 uses va_start, so for local functions maybe_vaarg could be made more
8401 aggressive, helping K&R code.
8402 FIXME: once the type system is fixed, we won't need this code anymore. */
8403 if (i && i->local && i->can_change_signature)
8404 fntype = TREE_TYPE (target->decl);
8405 cum->stdarg = stdarg_p (fntype);
8406 cum->maybe_vaarg = (fntype
8407 ? (!prototype_p (fntype) || stdarg_p (fntype))
8408 : !libname);
8409
8410 cum->bnd_regno = FIRST_BND_REG;
8411 cum->bnds_in_bt = 0;
8412 cum->force_bnd_pass = 0;
8413 cum->decl = fndecl;
8414
8415 if (!TARGET_64BIT)
8416 {
8417 /* If there are variable arguments, then we won't pass anything
8418 in registers in 32-bit mode. */
8419 if (stdarg_p (fntype))
8420 {
8421 cum->nregs = 0;
8422 /* Since in 32-bit mode variable arguments are always passed on
8423 the stack, there is a scratch register available for an indirect
8424 sibcall. */
8425 cfun->machine->arg_reg_available = true;
8426 cum->sse_nregs = 0;
8427 cum->mmx_nregs = 0;
8428 cum->warn_avx512f = false;
8429 cum->warn_avx = false;
8430 cum->warn_sse = false;
8431 cum->warn_mmx = false;
8432 return;
8433 }
8434
8435 /* Use ecx and edx registers if function has fastcall attribute,
8436 else look for regparm information. */
8437 if (fntype)
8438 {
8439 unsigned int ccvt = ix86_get_callcvt (fntype);
8440 if ((ccvt & IX86_CALLCVT_THISCALL) != 0)
8441 {
8442 cum->nregs = 1;
8443 cum->fastcall = 1; /* Same first register as in fastcall. */
8444 }
8445 else if ((ccvt & IX86_CALLCVT_FASTCALL) != 0)
8446 {
8447 cum->nregs = 2;
8448 cum->fastcall = 1;
8449 }
8450 else
8451 cum->nregs = ix86_function_regparm (fntype, fndecl);
8452 }
8453
8454 /* Set up the number of SSE registers used for passing SFmode
8455 and DFmode arguments. Warn for mismatching ABI. */
8456 cum->float_in_sse = ix86_function_sseregparm (fntype, fndecl, true);
8457 }
8458
8459 cfun->machine->arg_reg_available = (cum->nregs > 0);
8460 }
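
/* Illustrative examples (an addition, hypothetical declarations) of the
   32-bit register counts set up above:

       void __attribute__ ((fastcall)) f (int a, int b);      // a in %ecx, b in %edx
       void __attribute__ ((thiscall)) g (void *self);        // self in %ecx
       void __attribute__ ((regparm (3))) h (int, int, int);  // %eax, %edx, %ecx

   Variadic functions get nregs == 0 and pass everything on the stack.  */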
8461
8462 /* Return the "natural" mode for TYPE. In most cases, this is just TYPE_MODE.
8463 But in the case of vector types, it is some vector mode.
8464
8465 When we have only some of our vector isa extensions enabled, then there
8466 are some modes for which vector_mode_supported_p is false. For these
8467 modes, the generic vector support in gcc will choose some non-vector mode
8468 in order to implement the type. By computing the natural mode, we'll
8469 select the proper ABI location for the operand and not depend on whatever
8470 the middle-end decides to do with these vector types.
8471
8472 The middle-end can't deal with vector types > 16 bytes. In this
8473 case, we return the original mode and warn about the ABI change if
8474 CUM isn't NULL.
8475
8476 If IN_RETURN is true, warn about the ABI change if the vector mode
8477 isn't available for the function return value. */
8478
8479 static machine_mode
8480 type_natural_mode (const_tree type, const CUMULATIVE_ARGS *cum,
8481 bool in_return)
8482 {
8483 machine_mode mode = TYPE_MODE (type);
8484
8485 if (TREE_CODE (type) == VECTOR_TYPE && !VECTOR_MODE_P (mode))
8486 {
8487 HOST_WIDE_INT size = int_size_in_bytes (type);
8488 if ((size == 8 || size == 16 || size == 32 || size == 64)
8489 /* ??? Generic code allows us to create width 1 vectors. Ignore. */
8490 && TYPE_VECTOR_SUBPARTS (type) > 1)
8491 {
8492 machine_mode innermode = TYPE_MODE (TREE_TYPE (type));
8493
8494 /* There are no XFmode vector modes. */
8495 if (innermode == XFmode)
8496 return mode;
8497
8498 if (TREE_CODE (TREE_TYPE (type)) == REAL_TYPE)
8499 mode = MIN_MODE_VECTOR_FLOAT;
8500 else
8501 mode = MIN_MODE_VECTOR_INT;
8502
8503 /* Get the mode which has this inner mode and number of units. */
8504 for (; mode != VOIDmode; mode = GET_MODE_WIDER_MODE (mode))
8505 if (GET_MODE_NUNITS (mode) == TYPE_VECTOR_SUBPARTS (type)
8506 && GET_MODE_INNER (mode) == innermode)
8507 {
8508 if (size == 64 && !TARGET_AVX512F && !TARGET_IAMCU)
8509 {
8510 static bool warnedavx512f;
8511 static bool warnedavx512f_ret;
8512
8513 if (cum && cum->warn_avx512f && !warnedavx512f)
8514 {
8515 if (warning (OPT_Wpsabi, "AVX512F vector argument "
8516 "without AVX512F enabled changes the ABI"))
8517 warnedavx512f = true;
8518 }
8519 else if (in_return && !warnedavx512f_ret)
8520 {
8521 if (warning (OPT_Wpsabi, "AVX512F vector return "
8522 "without AVX512F enabled changes the ABI"))
8523 warnedavx512f_ret = true;
8524 }
8525
8526 return TYPE_MODE (type);
8527 }
8528 else if (size == 32 && !TARGET_AVX && !TARGET_IAMCU)
8529 {
8530 static bool warnedavx;
8531 static bool warnedavx_ret;
8532
8533 if (cum && cum->warn_avx && !warnedavx)
8534 {
8535 if (warning (OPT_Wpsabi, "AVX vector argument "
8536 "without AVX enabled changes the ABI"))
8537 warnedavx = true;
8538 }
8539 else if (in_return && !warnedavx_ret)
8540 {
8541 if (warning (OPT_Wpsabi, "AVX vector return "
8542 "without AVX enabled changes the ABI"))
8543 warnedavx_ret = true;
8544 }
8545
8546 return TYPE_MODE (type);
8547 }
8548 else if (((size == 8 && TARGET_64BIT) || size == 16)
8549 && !TARGET_SSE
8550 && !TARGET_IAMCU)
8551 {
8552 static bool warnedsse;
8553 static bool warnedsse_ret;
8554
8555 if (cum && cum->warn_sse && !warnedsse)
8556 {
8557 if (warning (OPT_Wpsabi, "SSE vector argument "
8558 "without SSE enabled changes the ABI"))
8559 warnedsse = true;
8560 }
8561 else if (!TARGET_64BIT && in_return && !warnedsse_ret)
8562 {
8563 if (warning (OPT_Wpsabi, "SSE vector return "
8564 "without SSE enabled changes the ABI"))
8565 warnedsse_ret = true;
8566 }
8567 }
8568 else if ((size == 8 && !TARGET_64BIT)
8569 && (!cfun
8570 || cfun->machine->func_type == TYPE_NORMAL)
8571 && !TARGET_MMX
8572 && !TARGET_IAMCU)
8573 {
8574 static bool warnedmmx;
8575 static bool warnedmmx_ret;
8576
8577 if (cum && cum->warn_mmx && !warnedmmx)
8578 {
8579 if (warning (OPT_Wpsabi, "MMX vector argument "
8580 "without MMX enabled changes the ABI"))
8581 warnedmmx = true;
8582 }
8583 else if (in_return && !warnedmmx_ret)
8584 {
8585 if (warning (OPT_Wpsabi, "MMX vector return "
8586 "without MMX enabled changes the ABI"))
8587 warnedmmx_ret = true;
8588 }
8589 }
8590 return mode;
8591 }
8592
8593 gcc_unreachable ();
8594 }
8595 }
8596
8597 return mode;
8598 }
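
/* Illustrative example (an addition, hypothetical typedef): given

       typedef float v8sf __attribute__ ((vector_size (32)));

   with AVX enabled the natural mode is V8SFmode, so the argument is given
   an SSE/AVX register slot; with -mno-avx the function instead returns
   TYPE_MODE (type) unchanged and the -Wpsabi warning above about the ABI
   change is emitted once.  */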
8599
8600 /* We want to pass a value in REGNO whose "natural" mode is MODE. However,
8601 this may not agree with the mode that the type system has chosen for the
8602 register, which is ORIG_MODE. If ORIG_MODE is not BLKmode, then we can
8603 go ahead and use it. Otherwise we have to build a PARALLEL instead. */
8604
8605 static rtx
8606 gen_reg_or_parallel (machine_mode mode, machine_mode orig_mode,
8607 unsigned int regno)
8608 {
8609 rtx tmp;
8610
8611 if (orig_mode != BLKmode)
8612 tmp = gen_rtx_REG (orig_mode, regno);
8613 else
8614 {
8615 tmp = gen_rtx_REG (mode, regno);
8616 tmp = gen_rtx_EXPR_LIST (VOIDmode, tmp, const0_rtx);
8617 tmp = gen_rtx_PARALLEL (orig_mode, gen_rtvec (1, tmp));
8618 }
8619
8620 return tmp;
8621 }
8622
8623 /* x86-64 register passing implementation. See x86-64 ABI for details. Goal
8624 of this code is to classify each 8bytes of incoming argument by the register
8625 class and assign registers accordingly. */
8626
8627 /* Return the union class of CLASS1 and CLASS2.
8628 See the x86-64 PS ABI for details. */
8629
8630 static enum x86_64_reg_class
8631 merge_classes (enum x86_64_reg_class class1, enum x86_64_reg_class class2)
8632 {
8633 /* Rule #1: If both classes are equal, this is the resulting class. */
8634 if (class1 == class2)
8635 return class1;
8636
8637 /* Rule #2: If one of the classes is NO_CLASS, the resulting class is
8638 the other class. */
8639 if (class1 == X86_64_NO_CLASS)
8640 return class2;
8641 if (class2 == X86_64_NO_CLASS)
8642 return class1;
8643
8644 /* Rule #3: If one of the classes is MEMORY, the result is MEMORY. */
8645 if (class1 == X86_64_MEMORY_CLASS || class2 == X86_64_MEMORY_CLASS)
8646 return X86_64_MEMORY_CLASS;
8647
8648 /* Rule #4: If one of the classes is INTEGER, the result is INTEGER. */
8649 if ((class1 == X86_64_INTEGERSI_CLASS && class2 == X86_64_SSESF_CLASS)
8650 || (class2 == X86_64_INTEGERSI_CLASS && class1 == X86_64_SSESF_CLASS))
8651 return X86_64_INTEGERSI_CLASS;
8652 if (class1 == X86_64_INTEGER_CLASS || class1 == X86_64_INTEGERSI_CLASS
8653 || class2 == X86_64_INTEGER_CLASS || class2 == X86_64_INTEGERSI_CLASS)
8654 return X86_64_INTEGER_CLASS;
8655
8656 /* Rule #5: If one of the classes is X87, X87UP, or COMPLEX_X87 class,
8657 MEMORY is used. */
8658 if (class1 == X86_64_X87_CLASS
8659 || class1 == X86_64_X87UP_CLASS
8660 || class1 == X86_64_COMPLEX_X87_CLASS
8661 || class2 == X86_64_X87_CLASS
8662 || class2 == X86_64_X87UP_CLASS
8663 || class2 == X86_64_COMPLEX_X87_CLASS)
8664 return X86_64_MEMORY_CLASS;
8665
8666 /* Rule #6: Otherwise class SSE is used. */
8667 return X86_64_SSE_CLASS;
8668 }
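
/* Illustrative example (an addition): for the hypothetical struct

       struct s { int i; float f; };

   the single eightbyte contains an INTEGERSI class (from the int) and an
   SSE class (from the float); rule #4 merges them to INTEGER, so the
   whole struct travels in one general-purpose register rather than an
   SSE register.  */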
8669
8670 /* Classify the argument of type TYPE and mode MODE.
8671 CLASSES will be filled by the register class used to pass each word
8672 of the operand. The number of words is returned. In case the parameter
8673 should be passed in memory, 0 is returned. As a special case for zero
8674 sized containers, classes[0] will be NO_CLASS and 1 is returned.
8675
8676 BIT_OFFSET is used internally for handling records and specifies offset
8677 of the offset in bits modulo 512 to avoid overflow cases.
8678
8679 See the x86-64 PS ABI for details.
8680 */
8681
8682 static int
8683 classify_argument (machine_mode mode, const_tree type,
8684 enum x86_64_reg_class classes[MAX_CLASSES], int bit_offset)
8685 {
8686 HOST_WIDE_INT bytes =
8687 (mode == BLKmode) ? int_size_in_bytes (type) : (int) GET_MODE_SIZE (mode);
8688 int words = CEIL (bytes + (bit_offset % 64) / 8, UNITS_PER_WORD);
8689
8690 /* Variable sized entities are always passed/returned in memory. */
8691 if (bytes < 0)
8692 return 0;
8693
8694 if (mode != VOIDmode
8695 && targetm.calls.must_pass_in_stack (mode, type))
8696 return 0;
8697
8698 if (type && AGGREGATE_TYPE_P (type))
8699 {
8700 int i;
8701 tree field;
8702 enum x86_64_reg_class subclasses[MAX_CLASSES];
8703
8704 /* On x86-64 we pass structures larger than 64 bytes on the stack. */
8705 if (bytes > 64)
8706 return 0;
8707
8708 for (i = 0; i < words; i++)
8709 classes[i] = X86_64_NO_CLASS;
8710
8711 /* Zero sized arrays or structures are NO_CLASS. We return 0 to
8712 signal the memory class, so handle this as a special case. */
8713 if (!words)
8714 {
8715 classes[0] = X86_64_NO_CLASS;
8716 return 1;
8717 }
8718
8719 /* Classify each field of record and merge classes. */
8720 switch (TREE_CODE (type))
8721 {
8722 case RECORD_TYPE:
8723 /* And now merge the fields of structure. */
8724 for (field = TYPE_FIELDS (type); field; field = DECL_CHAIN (field))
8725 {
8726 if (TREE_CODE (field) == FIELD_DECL)
8727 {
8728 int num;
8729
8730 if (TREE_TYPE (field) == error_mark_node)
8731 continue;
8732
8733 /* Bitfields are always classified as integer. Handle them
8734 early, since later code would consider them to be
8735 misaligned integers. */
8736 if (DECL_BIT_FIELD (field))
8737 {
8738 for (i = (int_bit_position (field)
8739 + (bit_offset % 64)) / 8 / 8;
8740 i < ((int_bit_position (field) + (bit_offset % 64))
8741 + tree_to_shwi (DECL_SIZE (field))
8742 + 63) / 8 / 8; i++)
8743 classes[i] =
8744 merge_classes (X86_64_INTEGER_CLASS,
8745 classes[i]);
8746 }
8747 else
8748 {
8749 int pos;
8750
8751 type = TREE_TYPE (field);
8752
8753 /* Flexible array member is ignored. */
8754 if (TYPE_MODE (type) == BLKmode
8755 && TREE_CODE (type) == ARRAY_TYPE
8756 && TYPE_SIZE (type) == NULL_TREE
8757 && TYPE_DOMAIN (type) != NULL_TREE
8758 && (TYPE_MAX_VALUE (TYPE_DOMAIN (type))
8759 == NULL_TREE))
8760 {
8761 static bool warned;
8762
8763 if (!warned && warn_psabi)
8764 {
8765 warned = true;
8766 inform (input_location,
8767 "the ABI of passing struct with"
8768 " a flexible array member has"
8769 " changed in GCC 4.4");
8770 }
8771 continue;
8772 }
8773 num = classify_argument (TYPE_MODE (type), type,
8774 subclasses,
8775 (int_bit_position (field)
8776 + bit_offset) % 512);
8777 if (!num)
8778 return 0;
8779 pos = (int_bit_position (field)
8780 + (bit_offset % 64)) / 8 / 8;
8781 for (i = 0; i < num && (i + pos) < words; i++)
8782 classes[i + pos] =
8783 merge_classes (subclasses[i], classes[i + pos]);
8784 }
8785 }
8786 }
8787 break;
8788
8789 case ARRAY_TYPE:
8790 /* Arrays are handled as small records. */
8791 {
8792 int num;
8793 num = classify_argument (TYPE_MODE (TREE_TYPE (type)),
8794 TREE_TYPE (type), subclasses, bit_offset);
8795 if (!num)
8796 return 0;
8797
8798 /* The partial classes are now full classes. */
8799 if (subclasses[0] == X86_64_SSESF_CLASS && bytes != 4)
8800 subclasses[0] = X86_64_SSE_CLASS;
8801 if (subclasses[0] == X86_64_INTEGERSI_CLASS
8802 && !((bit_offset % 64) == 0 && bytes == 4))
8803 subclasses[0] = X86_64_INTEGER_CLASS;
8804
8805 for (i = 0; i < words; i++)
8806 classes[i] = subclasses[i % num];
8807
8808 break;
8809 }
8810 case UNION_TYPE:
8811 case QUAL_UNION_TYPE:
8812 /* Unions are similar to RECORD_TYPE but offset is always 0.
8813 */
8814 for (field = TYPE_FIELDS (type); field; field = DECL_CHAIN (field))
8815 {
8816 if (TREE_CODE (field) == FIELD_DECL)
8817 {
8818 int num;
8819
8820 if (TREE_TYPE (field) == error_mark_node)
8821 continue;
8822
8823 num = classify_argument (TYPE_MODE (TREE_TYPE (field)),
8824 TREE_TYPE (field), subclasses,
8825 bit_offset);
8826 if (!num)
8827 return 0;
8828 for (i = 0; i < num && i < words; i++)
8829 classes[i] = merge_classes (subclasses[i], classes[i]);
8830 }
8831 }
8832 break;
8833
8834 default:
8835 gcc_unreachable ();
8836 }
8837
8838 if (words > 2)
8839 {
8840 /* When the size is > 16 bytes, everything is passed in
8841 memory unless the first word is X86_64_SSE_CLASS and
8842 all the remaining words are
8843 X86_64_SSEUP_CLASS. */
8844 if (classes[0] != X86_64_SSE_CLASS)
8845 return 0;
8846
8847 for (i = 1; i < words; i++)
8848 if (classes[i] != X86_64_SSEUP_CLASS)
8849 return 0;
8850 }
8851
8852 /* Final merger cleanup. */
8853 for (i = 0; i < words; i++)
8854 {
8855 /* If one class is MEMORY, everything should be passed in
8856 memory. */
8857 if (classes[i] == X86_64_MEMORY_CLASS)
8858 return 0;
8859
8860 /* The X86_64_SSEUP_CLASS should be always preceded by
8861 X86_64_SSE_CLASS or X86_64_SSEUP_CLASS. */
8862 if (classes[i] == X86_64_SSEUP_CLASS
8863 && classes[i - 1] != X86_64_SSE_CLASS
8864 && classes[i - 1] != X86_64_SSEUP_CLASS)
8865 {
8866 /* The first one should never be X86_64_SSEUP_CLASS. */
8867 gcc_assert (i != 0);
8868 classes[i] = X86_64_SSE_CLASS;
8869 }
8870
8871 /* If X86_64_X87UP_CLASS isn't preceded by X86_64_X87_CLASS,
8872 everything should be passed in memory. */
8873 if (classes[i] == X86_64_X87UP_CLASS
8874 && (classes[i - 1] != X86_64_X87_CLASS))
8875 {
8876 static bool warned;
8877
8878 /* The first one should never be X86_64_X87UP_CLASS. */
8879 gcc_assert (i != 0);
8880 if (!warned && warn_psabi)
8881 {
8882 warned = true;
8883 inform (input_location,
8884 "the ABI of passing union with long double"
8885 " has changed in GCC 4.4");
8886 }
8887 return 0;
8888 }
8889 }
8890 return words;
8891 }
8892
8893 /* Compute alignment needed. We align all types to natural boundaries with
8894 exception of XFmode that is aligned to 64bits. */
8895 if (mode != VOIDmode && mode != BLKmode)
8896 {
8897 int mode_alignment = GET_MODE_BITSIZE (mode);
8898
8899 if (mode == XFmode)
8900 mode_alignment = 128;
8901 else if (mode == XCmode)
8902 mode_alignment = 256;
8903 if (COMPLEX_MODE_P (mode))
8904 mode_alignment /= 2;
8905 /* Misaligned fields are always returned in memory. */
8906 if (bit_offset % mode_alignment)
8907 return 0;
8908 }
8909
8910 /* For V1xx modes, just use the base mode. */
8911 if (VECTOR_MODE_P (mode) && mode != V1DImode && mode != V1TImode
8912 && GET_MODE_UNIT_SIZE (mode) == bytes)
8913 mode = GET_MODE_INNER (mode);
8914
8915 /* Classification of atomic types. */
8916 switch (mode)
8917 {
8918 case SDmode:
8919 case DDmode:
8920 classes[0] = X86_64_SSE_CLASS;
8921 return 1;
8922 case TDmode:
8923 classes[0] = X86_64_SSE_CLASS;
8924 classes[1] = X86_64_SSEUP_CLASS;
8925 return 2;
8926 case DImode:
8927 case SImode:
8928 case HImode:
8929 case QImode:
8930 case CSImode:
8931 case CHImode:
8932 case CQImode:
8933 {
8934 int size = bit_offset + (int) GET_MODE_BITSIZE (mode);
8935
8936 /* Analyze last 128 bits only. */
8937 size = (size - 1) & 0x7f;
8938
8939 if (size < 32)
8940 {
8941 classes[0] = X86_64_INTEGERSI_CLASS;
8942 return 1;
8943 }
8944 else if (size < 64)
8945 {
8946 classes[0] = X86_64_INTEGER_CLASS;
8947 return 1;
8948 }
8949 else if (size < 64+32)
8950 {
8951 classes[0] = X86_64_INTEGER_CLASS;
8952 classes[1] = X86_64_INTEGERSI_CLASS;
8953 return 2;
8954 }
8955 else if (size < 64+64)
8956 {
8957 classes[0] = classes[1] = X86_64_INTEGER_CLASS;
8958 return 2;
8959 }
8960 else
8961 gcc_unreachable ();
8962 }
8963 case CDImode:
8964 case TImode:
8965 classes[0] = classes[1] = X86_64_INTEGER_CLASS;
8966 return 2;
8967 case COImode:
8968 case OImode:
8969 /* OImode shouldn't be used directly. */
8970 gcc_unreachable ();
8971 case CTImode:
8972 return 0;
8973 case SFmode:
8974 if (!(bit_offset % 64))
8975 classes[0] = X86_64_SSESF_CLASS;
8976 else
8977 classes[0] = X86_64_SSE_CLASS;
8978 return 1;
8979 case DFmode:
8980 classes[0] = X86_64_SSEDF_CLASS;
8981 return 1;
8982 case XFmode:
8983 classes[0] = X86_64_X87_CLASS;
8984 classes[1] = X86_64_X87UP_CLASS;
8985 return 2;
8986 case TFmode:
8987 classes[0] = X86_64_SSE_CLASS;
8988 classes[1] = X86_64_SSEUP_CLASS;
8989 return 2;
8990 case SCmode:
8991 classes[0] = X86_64_SSE_CLASS;
8992 if (!(bit_offset % 64))
8993 return 1;
8994 else
8995 {
8996 static bool warned;
8997
8998 if (!warned && warn_psabi)
8999 {
9000 warned = true;
9001 inform (input_location,
9002 "the ABI of passing structure with complex float"
9003 " member has changed in GCC 4.4");
9004 }
9005 classes[1] = X86_64_SSESF_CLASS;
9006 return 2;
9007 }
9008 case DCmode:
9009 classes[0] = X86_64_SSEDF_CLASS;
9010 classes[1] = X86_64_SSEDF_CLASS;
9011 return 2;
9012 case XCmode:
9013 classes[0] = X86_64_COMPLEX_X87_CLASS;
9014 return 1;
9015 case TCmode:
9016 /* This mode is larger than 16 bytes. */
9017 return 0;
9018 case V8SFmode:
9019 case V8SImode:
9020 case V32QImode:
9021 case V16HImode:
9022 case V4DFmode:
9023 case V4DImode:
9024 classes[0] = X86_64_SSE_CLASS;
9025 classes[1] = X86_64_SSEUP_CLASS;
9026 classes[2] = X86_64_SSEUP_CLASS;
9027 classes[3] = X86_64_SSEUP_CLASS;
9028 return 4;
9029 case V8DFmode:
9030 case V16SFmode:
9031 case V8DImode:
9032 case V16SImode:
9033 case V32HImode:
9034 case V64QImode:
9035 classes[0] = X86_64_SSE_CLASS;
9036 classes[1] = X86_64_SSEUP_CLASS;
9037 classes[2] = X86_64_SSEUP_CLASS;
9038 classes[3] = X86_64_SSEUP_CLASS;
9039 classes[4] = X86_64_SSEUP_CLASS;
9040 classes[5] = X86_64_SSEUP_CLASS;
9041 classes[6] = X86_64_SSEUP_CLASS;
9042 classes[7] = X86_64_SSEUP_CLASS;
9043 return 8;
9044 case V4SFmode:
9045 case V4SImode:
9046 case V16QImode:
9047 case V8HImode:
9048 case V2DFmode:
9049 case V2DImode:
9050 classes[0] = X86_64_SSE_CLASS;
9051 classes[1] = X86_64_SSEUP_CLASS;
9052 return 2;
9053 case V1TImode:
9054 case V1DImode:
9055 case V2SFmode:
9056 case V2SImode:
9057 case V4HImode:
9058 case V8QImode:
9059 classes[0] = X86_64_SSE_CLASS;
9060 return 1;
9061 case BLKmode:
9062 case VOIDmode:
9063 return 0;
9064 default:
9065 gcc_assert (VECTOR_MODE_P (mode));
9066
9067 if (bytes > 16)
9068 return 0;
9069
9070 gcc_assert (GET_MODE_CLASS (GET_MODE_INNER (mode)) == MODE_INT);
9071
9072 if (bit_offset + GET_MODE_BITSIZE (mode) <= 32)
9073 classes[0] = X86_64_INTEGERSI_CLASS;
9074 else
9075 classes[0] = X86_64_INTEGER_CLASS;
9076 classes[1] = X86_64_INTEGER_CLASS;
9077 return 1 + (bytes > 8);
9078 }
9079 }
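
/* Illustrative examples (an addition, hypothetical types) of the
   classification above:

       struct a { double d; long l; };   // classes = { SSEDF, INTEGER }
       struct b { long l[3]; };          // 24 bytes, not all SSEUP -> memory (0)
       __int128                          // classes = { INTEGER, INTEGER }

   A return value of 0 always means "pass or return in memory".  */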
9080
9081 /* Examine the argument and set the number of registers required in each
9082 class. Return true iff the parameter should be passed in memory. */
9083
9084 static bool
9085 examine_argument (machine_mode mode, const_tree type, int in_return,
9086 int *int_nregs, int *sse_nregs)
9087 {
9088 enum x86_64_reg_class regclass[MAX_CLASSES];
9089 int n = classify_argument (mode, type, regclass, 0);
9090
9091 *int_nregs = 0;
9092 *sse_nregs = 0;
9093
9094 if (!n)
9095 return true;
9096 for (n--; n >= 0; n--)
9097 switch (regclass[n])
9098 {
9099 case X86_64_INTEGER_CLASS:
9100 case X86_64_INTEGERSI_CLASS:
9101 (*int_nregs)++;
9102 break;
9103 case X86_64_SSE_CLASS:
9104 case X86_64_SSESF_CLASS:
9105 case X86_64_SSEDF_CLASS:
9106 (*sse_nregs)++;
9107 break;
9108 case X86_64_NO_CLASS:
9109 case X86_64_SSEUP_CLASS:
9110 break;
9111 case X86_64_X87_CLASS:
9112 case X86_64_X87UP_CLASS:
9113 case X86_64_COMPLEX_X87_CLASS:
9114 if (!in_return)
9115 return true;
9116 break;
9117 case X86_64_MEMORY_CLASS:
9118 gcc_unreachable ();
9119 }
9120
9121 return false;
9122 }
9123
9124 /* Construct container for the argument used by GCC interface. See
9125 FUNCTION_ARG for the detailed description. */
9126
9127 static rtx
9128 construct_container (machine_mode mode, machine_mode orig_mode,
9129 const_tree type, int in_return, int nintregs, int nsseregs,
9130 const int *intreg, int sse_regno)
9131 {
9132 /* The following variables hold the static issued_error state. */
9133 static bool issued_sse_arg_error;
9134 static bool issued_sse_ret_error;
9135 static bool issued_x87_ret_error;
9136
9137 machine_mode tmpmode;
9138 int bytes =
9139 (mode == BLKmode) ? int_size_in_bytes (type) : (int) GET_MODE_SIZE (mode);
9140 enum x86_64_reg_class regclass[MAX_CLASSES];
9141 int n;
9142 int i;
9143 int nexps = 0;
9144 int needed_sseregs, needed_intregs;
9145 rtx exp[MAX_CLASSES];
9146 rtx ret;
9147
9148 n = classify_argument (mode, type, regclass, 0);
9149 if (!n)
9150 return NULL;
9151 if (examine_argument (mode, type, in_return, &needed_intregs,
9152 &needed_sseregs))
9153 return NULL;
9154 if (needed_intregs > nintregs || needed_sseregs > nsseregs)
9155 return NULL;
9156
9157 /* We allowed the user to turn off SSE for kernel mode. Don't crash if
9158 some less clueful developer tries to use floating-point anyway. */
9159 if (needed_sseregs && !TARGET_SSE)
9160 {
9161 if (in_return)
9162 {
9163 if (!issued_sse_ret_error)
9164 {
9165 error ("SSE register return with SSE disabled");
9166 issued_sse_ret_error = true;
9167 }
9168 }
9169 else if (!issued_sse_arg_error)
9170 {
9171 error ("SSE register argument with SSE disabled");
9172 issued_sse_arg_error = true;
9173 }
9174 return NULL;
9175 }
9176
9177 /* Likewise, error if the ABI requires us to return values in the
9178 x87 registers and the user specified -mno-80387. */
9179 if (!TARGET_FLOAT_RETURNS_IN_80387 && in_return)
9180 for (i = 0; i < n; i++)
9181 if (regclass[i] == X86_64_X87_CLASS
9182 || regclass[i] == X86_64_X87UP_CLASS
9183 || regclass[i] == X86_64_COMPLEX_X87_CLASS)
9184 {
9185 if (!issued_x87_ret_error)
9186 {
9187 error ("x87 register return with x87 disabled");
9188 issued_x87_ret_error = true;
9189 }
9190 return NULL;
9191 }
9192
9193 /* First construct the simple cases. Avoid SCmode, since we want to use
9194 a single register to pass this type. */
9195 if (n == 1 && mode != SCmode)
9196 switch (regclass[0])
9197 {
9198 case X86_64_INTEGER_CLASS:
9199 case X86_64_INTEGERSI_CLASS:
9200 return gen_rtx_REG (mode, intreg[0]);
9201 case X86_64_SSE_CLASS:
9202 case X86_64_SSESF_CLASS:
9203 case X86_64_SSEDF_CLASS:
9204 if (mode != BLKmode)
9205 return gen_reg_or_parallel (mode, orig_mode,
9206 SSE_REGNO (sse_regno));
9207 break;
9208 case X86_64_X87_CLASS:
9209 case X86_64_COMPLEX_X87_CLASS:
9210 return gen_rtx_REG (mode, FIRST_STACK_REG);
9211 case X86_64_NO_CLASS:
9212 /* Zero sized array, struct or class. */
9213 return NULL;
9214 default:
9215 gcc_unreachable ();
9216 }
9217 if (n == 2
9218 && regclass[0] == X86_64_SSE_CLASS
9219 && regclass[1] == X86_64_SSEUP_CLASS
9220 && mode != BLKmode)
9221 return gen_reg_or_parallel (mode, orig_mode,
9222 SSE_REGNO (sse_regno));
9223 if (n == 4
9224 && regclass[0] == X86_64_SSE_CLASS
9225 && regclass[1] == X86_64_SSEUP_CLASS
9226 && regclass[2] == X86_64_SSEUP_CLASS
9227 && regclass[3] == X86_64_SSEUP_CLASS
9228 && mode != BLKmode)
9229 return gen_reg_or_parallel (mode, orig_mode,
9230 SSE_REGNO (sse_regno));
9231 if (n == 8
9232 && regclass[0] == X86_64_SSE_CLASS
9233 && regclass[1] == X86_64_SSEUP_CLASS
9234 && regclass[2] == X86_64_SSEUP_CLASS
9235 && regclass[3] == X86_64_SSEUP_CLASS
9236 && regclass[4] == X86_64_SSEUP_CLASS
9237 && regclass[5] == X86_64_SSEUP_CLASS
9238 && regclass[6] == X86_64_SSEUP_CLASS
9239 && regclass[7] == X86_64_SSEUP_CLASS
9240 && mode != BLKmode)
9241 return gen_reg_or_parallel (mode, orig_mode,
9242 SSE_REGNO (sse_regno));
9243 if (n == 2
9244 && regclass[0] == X86_64_X87_CLASS
9245 && regclass[1] == X86_64_X87UP_CLASS)
9246 return gen_rtx_REG (XFmode, FIRST_STACK_REG);
9247
9248 if (n == 2
9249 && regclass[0] == X86_64_INTEGER_CLASS
9250 && regclass[1] == X86_64_INTEGER_CLASS
9251 && (mode == CDImode || mode == TImode)
9252 && intreg[0] + 1 == intreg[1])
9253 return gen_rtx_REG (mode, intreg[0]);
9254
9255 /* Otherwise figure out the entries of the PARALLEL. */
9256 for (i = 0; i < n; i++)
9257 {
9258 int pos;
9259
9260 switch (regclass[i])
9261 {
9262 case X86_64_NO_CLASS:
9263 break;
9264 case X86_64_INTEGER_CLASS:
9265 case X86_64_INTEGERSI_CLASS:
9266 /* Merge TImodes on aligned occasions here too. */
9267 if (i * 8 + 8 > bytes)
9268 tmpmode
9269 = mode_for_size ((bytes - i * 8) * BITS_PER_UNIT, MODE_INT, 0);
9270 else if (regclass[i] == X86_64_INTEGERSI_CLASS)
9271 tmpmode = SImode;
9272 else
9273 tmpmode = DImode;
9274 /* We've requested a size for which there
9275 is no integer mode; use DImode. */
9276 if (tmpmode == BLKmode)
9277 tmpmode = DImode;
9278 exp [nexps++]
9279 = gen_rtx_EXPR_LIST (VOIDmode,
9280 gen_rtx_REG (tmpmode, *intreg),
9281 GEN_INT (i*8));
9282 intreg++;
9283 break;
9284 case X86_64_SSESF_CLASS:
9285 exp [nexps++]
9286 = gen_rtx_EXPR_LIST (VOIDmode,
9287 gen_rtx_REG (SFmode,
9288 SSE_REGNO (sse_regno)),
9289 GEN_INT (i*8));
9290 sse_regno++;
9291 break;
9292 case X86_64_SSEDF_CLASS:
9293 exp [nexps++]
9294 = gen_rtx_EXPR_LIST (VOIDmode,
9295 gen_rtx_REG (DFmode,
9296 SSE_REGNO (sse_regno)),
9297 GEN_INT (i*8));
9298 sse_regno++;
9299 break;
9300 case X86_64_SSE_CLASS:
9301 pos = i;
9302 switch (n)
9303 {
9304 case 1:
9305 tmpmode = DImode;
9306 break;
9307 case 2:
9308 if (i == 0 && regclass[1] == X86_64_SSEUP_CLASS)
9309 {
9310 tmpmode = TImode;
9311 i++;
9312 }
9313 else
9314 tmpmode = DImode;
9315 break;
9316 case 4:
9317 gcc_assert (i == 0
9318 && regclass[1] == X86_64_SSEUP_CLASS
9319 && regclass[2] == X86_64_SSEUP_CLASS
9320 && regclass[3] == X86_64_SSEUP_CLASS);
9321 tmpmode = OImode;
9322 i += 3;
9323 break;
9324 case 8:
9325 gcc_assert (i == 0
9326 && regclass[1] == X86_64_SSEUP_CLASS
9327 && regclass[2] == X86_64_SSEUP_CLASS
9328 && regclass[3] == X86_64_SSEUP_CLASS
9329 && regclass[4] == X86_64_SSEUP_CLASS
9330 && regclass[5] == X86_64_SSEUP_CLASS
9331 && regclass[6] == X86_64_SSEUP_CLASS
9332 && regclass[7] == X86_64_SSEUP_CLASS);
9333 tmpmode = XImode;
9334 i += 7;
9335 break;
9336 default:
9337 gcc_unreachable ();
9338 }
9339 exp [nexps++]
9340 = gen_rtx_EXPR_LIST (VOIDmode,
9341 gen_rtx_REG (tmpmode,
9342 SSE_REGNO (sse_regno)),
9343 GEN_INT (pos*8));
9344 sse_regno++;
9345 break;
9346 default:
9347 gcc_unreachable ();
9348 }
9349 }
9350
9351 /* Empty aligned struct, union or class. */
9352 if (nexps == 0)
9353 return NULL;
9354
9355 ret = gen_rtx_PARALLEL (mode, rtvec_alloc (nexps));
9356 for (i = 0; i < nexps; i++)
9357 XVECEXP (ret, 0, i) = exp [i];
9358 return ret;
9359 }
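
/* Illustrative example (an addition): for a hypothetical
   struct { long l; double d; } passed in the first available registers,
   the container built above is roughly

       (parallel [(expr_list (reg:DI rdi) (const_int 0))
                  (expr_list (reg:DF xmm0) (const_int 8))])

   i.e. the first eightbyte goes in a general-purpose register and the
   second in an SSE register; the actual register numbers depend on
   INTREG and SSE_REGNO.  */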
9360
9361 /* Update the data in CUM to advance over an argument of mode MODE
9362 and data type TYPE. (TYPE is null for libcalls where that information
9363 may not be available.)
9364
9365 Return the number of integer registers advanced over. */
9366
9367 static int
9368 function_arg_advance_32 (CUMULATIVE_ARGS *cum, machine_mode mode,
9369 const_tree type, HOST_WIDE_INT bytes,
9370 HOST_WIDE_INT words)
9371 {
9372 int res = 0;
9373 bool error_p = false;
9374
9375 if (TARGET_IAMCU)
9376 {
9377 /* Intel MCU psABI passes scalars and aggregates no larger than 8
9378 bytes in registers. */
9379 if (!VECTOR_MODE_P (mode) && bytes <= 8)
9380 goto pass_in_reg;
9381 return res;
9382 }
9383
9384 switch (mode)
9385 {
9386 default:
9387 break;
9388
9389 case BLKmode:
9390 if (bytes < 0)
9391 break;
9392 /* FALLTHRU */
9393
9394 case DImode:
9395 case SImode:
9396 case HImode:
9397 case QImode:
9398 pass_in_reg:
9399 cum->words += words;
9400 cum->nregs -= words;
9401 cum->regno += words;
9402 if (cum->nregs >= 0)
9403 res = words;
9404 if (cum->nregs <= 0)
9405 {
9406 cum->nregs = 0;
9407 cfun->machine->arg_reg_available = false;
9408 cum->regno = 0;
9409 }
9410 break;
9411
9412 case OImode:
9413 /* OImode shouldn't be used directly. */
9414 gcc_unreachable ();
9415
9416 case DFmode:
9417 if (cum->float_in_sse == -1)
9418 error_p = 1;
9419 if (cum->float_in_sse < 2)
9420 break;
9421 /* FALLTHRU */
9422 case SFmode:
9423 if (cum->float_in_sse == -1)
9424 error_p = 1;
9425 if (cum->float_in_sse < 1)
9426 break;
9427 /* FALLTHRU */
9428
9429 case V8SFmode:
9430 case V8SImode:
9431 case V64QImode:
9432 case V32HImode:
9433 case V16SImode:
9434 case V8DImode:
9435 case V16SFmode:
9436 case V8DFmode:
9437 case V32QImode:
9438 case V16HImode:
9439 case V4DFmode:
9440 case V4DImode:
9441 case TImode:
9442 case V16QImode:
9443 case V8HImode:
9444 case V4SImode:
9445 case V2DImode:
9446 case V4SFmode:
9447 case V2DFmode:
9448 if (!type || !AGGREGATE_TYPE_P (type))
9449 {
9450 cum->sse_words += words;
9451 cum->sse_nregs -= 1;
9452 cum->sse_regno += 1;
9453 if (cum->sse_nregs <= 0)
9454 {
9455 cum->sse_nregs = 0;
9456 cum->sse_regno = 0;
9457 }
9458 }
9459 break;
9460
9461 case V8QImode:
9462 case V4HImode:
9463 case V2SImode:
9464 case V2SFmode:
9465 case V1TImode:
9466 case V1DImode:
9467 if (!type || !AGGREGATE_TYPE_P (type))
9468 {
9469 cum->mmx_words += words;
9470 cum->mmx_nregs -= 1;
9471 cum->mmx_regno += 1;
9472 if (cum->mmx_nregs <= 0)
9473 {
9474 cum->mmx_nregs = 0;
9475 cum->mmx_regno = 0;
9476 }
9477 }
9478 break;
9479 }
9480 if (error_p)
9481 {
9482 cum->float_in_sse = 0;
9483 error ("calling %qD with SSE calling convention without "
9484 "SSE/SSE2 enabled", cum->decl);
9485 sorry ("this is a GCC bug that can be worked around by adding "
9486 "attribute used to function called");
9487 }
9488
9489 return res;
9490 }
9491
9492 static int
9493 function_arg_advance_64 (CUMULATIVE_ARGS *cum, machine_mode mode,
9494 const_tree type, HOST_WIDE_INT words, bool named)
9495 {
9496 int int_nregs, sse_nregs;
9497
9498 /* Unnamed 512 and 256bit vector mode parameters are passed on stack. */
9499 if (!named && (VALID_AVX512F_REG_MODE (mode)
9500 || VALID_AVX256_REG_MODE (mode)))
9501 return 0;
9502
9503 if (!examine_argument (mode, type, 0, &int_nregs, &sse_nregs)
9504 && sse_nregs <= cum->sse_nregs && int_nregs <= cum->nregs)
9505 {
9506 cum->nregs -= int_nregs;
9507 cum->sse_nregs -= sse_nregs;
9508 cum->regno += int_nregs;
9509 cum->sse_regno += sse_nregs;
9510 return int_nregs;
9511 }
9512 else
9513 {
9514 int align = ix86_function_arg_boundary (mode, type) / BITS_PER_WORD;
9515 cum->words = ROUND_UP (cum->words, align);
9516 cum->words += words;
9517 return 0;
9518 }
9519 }
9520
9521 static int
9522 function_arg_advance_ms_64 (CUMULATIVE_ARGS *cum, HOST_WIDE_INT bytes,
9523 HOST_WIDE_INT words)
9524 {
9525 /* Otherwise, this should be passed indirect. */
9526 gcc_assert (bytes == 1 || bytes == 2 || bytes == 4 || bytes == 8);
9527
9528 cum->words += words;
9529 if (cum->nregs > 0)
9530 {
9531 cum->nregs -= 1;
9532 cum->regno += 1;
9533 return 1;
9534 }
9535 return 0;
9536 }
9537
9538 /* Update the data in CUM to advance over an argument of mode MODE and
9539 data type TYPE. (TYPE is null for libcalls where that information
9540 may not be available.) */
9541
9542 static void
9543 ix86_function_arg_advance (cumulative_args_t cum_v, machine_mode mode,
9544 const_tree type, bool named)
9545 {
9546 CUMULATIVE_ARGS *cum = get_cumulative_args (cum_v);
9547 HOST_WIDE_INT bytes, words;
9548 int nregs;
9549
9550 /* The argument of interrupt handler is a special case and is
9551 handled in ix86_function_arg. */
9552 if (!cum->caller && cfun->machine->func_type != TYPE_NORMAL)
9553 return;
9554
9555 if (mode == BLKmode)
9556 bytes = int_size_in_bytes (type);
9557 else
9558 bytes = GET_MODE_SIZE (mode);
9559 words = CEIL (bytes, UNITS_PER_WORD);
9560
9561 if (type)
9562 mode = type_natural_mode (type, NULL, false);
9563
9564 if ((type && POINTER_BOUNDS_TYPE_P (type))
9565 || POINTER_BOUNDS_MODE_P (mode))
9566 {
9567 /* If we pass bounds in BT then just update remained bounds count. */
9568 if (cum->bnds_in_bt)
9569 {
9570 cum->bnds_in_bt--;
9571 return;
9572 }
9573
9574 /* Update remained number of bounds to force. */
9575 if (cum->force_bnd_pass)
9576 cum->force_bnd_pass--;
9577
9578 cum->bnd_regno++;
9579
9580 return;
9581 }
9582
9583 /* The first arg not going to Bounds Tables resets this counter. */
9584 cum->bnds_in_bt = 0;
9585 /* For unnamed args we always pass bounds to avoid bounds mess when
9586 passed and received types do not match. If bounds do not follow
9587 unnamed arg, still pretend required number of bounds were passed. */
9588 if (cum->force_bnd_pass)
9589 {
9590 cum->bnd_regno += cum->force_bnd_pass;
9591 cum->force_bnd_pass = 0;
9592 }
9593
9594 if (TARGET_64BIT)
9595 {
9596 enum calling_abi call_abi = cum ? cum->call_abi : ix86_abi;
9597
9598 if (call_abi == MS_ABI)
9599 nregs = function_arg_advance_ms_64 (cum, bytes, words);
9600 else
9601 nregs = function_arg_advance_64 (cum, mode, type, words, named);
9602 }
9603 else
9604 nregs = function_arg_advance_32 (cum, mode, type, bytes, words);
9605
9606 /* For stdarg we expect bounds to be passed for each value passed
9607 in register. */
9608 if (cum->stdarg)
9609 cum->force_bnd_pass = nregs;
9610 /* For pointers passed in memory we expect bounds passed in Bounds
9611 Table. */
9612 if (!nregs)
9613 cum->bnds_in_bt = chkp_type_bounds_count (type);
9614 }
9615
9616 /* Define where to put the arguments to a function.
9617 Value is zero to push the argument on the stack,
9618 or a hard register in which to store the argument.
9619
9620 MODE is the argument's machine mode.
9621 TYPE is the data type of the argument (as a tree).
9622 This is null for libcalls where that information may
9623 not be available.
9624 CUM is a variable of type CUMULATIVE_ARGS which gives info about
9625 the preceding args and about the function being called.
9626 NAMED is nonzero if this argument is a named parameter
9627 (otherwise it is an extra parameter matching an ellipsis). */
9628
9629 static rtx
9630 function_arg_32 (CUMULATIVE_ARGS *cum, machine_mode mode,
9631 machine_mode orig_mode, const_tree type,
9632 HOST_WIDE_INT bytes, HOST_WIDE_INT words)
9633 {
9634 bool error_p = false;
9635 /* Avoid the AL settings for the Unix64 ABI. */
9636 if (mode == VOIDmode)
9637 return constm1_rtx;
9638
9639 if (TARGET_IAMCU)
9640 {
9641 /* Intel MCU psABI passes scalars and aggregates no larger than 8
9642 bytes in registers. */
9643 if (!VECTOR_MODE_P (mode) && bytes <= 8)
9644 goto pass_in_reg;
9645 return NULL_RTX;
9646 }
9647
9648 switch (mode)
9649 {
9650 default:
9651 break;
9652
9653 case BLKmode:
9654 if (bytes < 0)
9655 break;
9656 /* FALLTHRU */
9657 case DImode:
9658 case SImode:
9659 case HImode:
9660 case QImode:
9661 pass_in_reg:
9662 if (words <= cum->nregs)
9663 {
9664 int regno = cum->regno;
9665
9666 /* Fastcall allocates the first two DWORD (SImode) or
9667 smaller arguments to ECX and EDX if it isn't an
9668 aggregate type. */
9669 if (cum->fastcall)
9670 {
9671 if (mode == BLKmode
9672 || mode == DImode
9673 || (type && AGGREGATE_TYPE_P (type)))
9674 break;
9675
9676 /* ECX, not EAX, is the first allocated register. */
9677 if (regno == AX_REG)
9678 regno = CX_REG;
9679 }
9680 return gen_rtx_REG (mode, regno);
9681 }
9682 break;
9683
9684 case DFmode:
9685 if (cum->float_in_sse == -1)
9686 error_p = 1;
9687 if (cum->float_in_sse < 2)
9688 break;
9689 /* FALLTHRU */
9690 case SFmode:
9691 if (cum->float_in_sse == -1)
9692 error_p = 1;
9693 if (cum->float_in_sse < 1)
9694 break;
9695 /* FALLTHRU */
9696 case TImode:
9697 /* In 32bit, we pass TImode in xmm registers. */
9698 case V16QImode:
9699 case V8HImode:
9700 case V4SImode:
9701 case V2DImode:
9702 case V4SFmode:
9703 case V2DFmode:
9704 if (!type || !AGGREGATE_TYPE_P (type))
9705 {
9706 if (cum->sse_nregs)
9707 return gen_reg_or_parallel (mode, orig_mode,
9708 cum->sse_regno + FIRST_SSE_REG);
9709 }
9710 break;
9711
9712 case OImode:
9713 case XImode:
9714 /* OImode and XImode shouldn't be used directly. */
9715 gcc_unreachable ();
9716
9717 case V64QImode:
9718 case V32HImode:
9719 case V16SImode:
9720 case V8DImode:
9721 case V16SFmode:
9722 case V8DFmode:
9723 case V8SFmode:
9724 case V8SImode:
9725 case V32QImode:
9726 case V16HImode:
9727 case V4DFmode:
9728 case V4DImode:
9729 if (!type || !AGGREGATE_TYPE_P (type))
9730 {
9731 if (cum->sse_nregs)
9732 return gen_reg_or_parallel (mode, orig_mode,
9733 cum->sse_regno + FIRST_SSE_REG);
9734 }
9735 break;
9736
9737 case V8QImode:
9738 case V4HImode:
9739 case V2SImode:
9740 case V2SFmode:
9741 case V1TImode:
9742 case V1DImode:
9743 if (!type || !AGGREGATE_TYPE_P (type))
9744 {
9745 if (cum->mmx_nregs)
9746 return gen_reg_or_parallel (mode, orig_mode,
9747 cum->mmx_regno + FIRST_MMX_REG);
9748 }
9749 break;
9750 }
9751 if (error_p)
9752 {
9753 cum->float_in_sse = 0;
9754 error ("calling %qD with SSE calling convention without "
9755 "SSE/SSE2 enabled", cum->decl);
9756 sorry ("this is a GCC bug that can be worked around by adding "
9757 "attribute used to function called");
9758 }
9759
9760 return NULL_RTX;
9761 }
9762
9763 static rtx
9764 function_arg_64 (const CUMULATIVE_ARGS *cum, machine_mode mode,
9765 machine_mode orig_mode, const_tree type, bool named)
9766 {
9767 /* Handle a hidden AL argument containing number of registers
9768 for varargs x86-64 functions. */
9769 if (mode == VOIDmode)
9770 return GEN_INT (cum->maybe_vaarg
9771 ? (cum->sse_nregs < 0
9772 ? X86_64_SSE_REGPARM_MAX
9773 : cum->sse_regno)
9774 : -1);
9775
9776 switch (mode)
9777 {
9778 default:
9779 break;
9780
9781 case V8SFmode:
9782 case V8SImode:
9783 case V32QImode:
9784 case V16HImode:
9785 case V4DFmode:
9786 case V4DImode:
9787 case V16SFmode:
9788 case V16SImode:
9789 case V64QImode:
9790 case V32HImode:
9791 case V8DFmode:
9792 case V8DImode:
9793 /* Unnamed 256 and 512bit vector mode parameters are passed on stack. */
9794 if (!named)
9795 return NULL;
9796 break;
9797 }
9798
9799 return construct_container (mode, orig_mode, type, 0, cum->nregs,
9800 cum->sse_nregs,
9801 &x86_64_int_parameter_registers [cum->regno],
9802 cum->sse_regno);
9803 }
9804
9805 static rtx
9806 function_arg_ms_64 (const CUMULATIVE_ARGS *cum, machine_mode mode,
9807 machine_mode orig_mode, bool named,
9808 HOST_WIDE_INT bytes)
9809 {
9810 unsigned int regno;
9811
9812 /* We need to add a clobber for MS_ABI->SYSV ABI calls in expand_call.
9813 We use a value of -2 to specify that the current call uses the MS ABI. */
9814 if (mode == VOIDmode)
9815 return GEN_INT (-2);
9816
9817 /* If we've run out of registers, it goes on the stack. */
9818 if (cum->nregs == 0)
9819 return NULL_RTX;
9820
9821 regno = x86_64_ms_abi_int_parameter_registers[cum->regno];
9822
9823 /* Only floating point modes are passed in anything but integer regs. */
9824 if (TARGET_SSE && (mode == SFmode || mode == DFmode))
9825 {
9826 if (named)
9827 regno = cum->regno + FIRST_SSE_REG;
9828 else
9829 {
9830 rtx t1, t2;
9831
9832 /* Unnamed floating parameters are passed in both the
9833 SSE and integer registers. */
9834 t1 = gen_rtx_REG (mode, cum->regno + FIRST_SSE_REG);
9835 t2 = gen_rtx_REG (mode, regno);
9836 t1 = gen_rtx_EXPR_LIST (VOIDmode, t1, const0_rtx);
9837 t2 = gen_rtx_EXPR_LIST (VOIDmode, t2, const0_rtx);
9838 return gen_rtx_PARALLEL (mode, gen_rtvec (2, t1, t2));
9839 }
9840 }
9841 /* Handle aggregate types passed in registers. */
9842 if (orig_mode == BLKmode)
9843 {
9844 if (bytes > 0 && bytes <= 8)
9845 mode = (bytes > 4 ? DImode : SImode);
9846 if (mode == BLKmode)
9847 mode = DImode;
9848 }
9849
9850 return gen_reg_or_parallel (mode, orig_mode, regno);
9851 }
9852
9853 /* Return where to put the arguments to a function.
9854 Return zero to push the argument on the stack, or a hard register in which to store the argument.
9855
9856 MODE is the argument's machine mode. TYPE is the data type of the
9857 argument. It is null for libcalls where that information may not be
9858 available. CUM gives information about the preceding args and about
9859 the function being called. NAMED is nonzero if this argument is a
9860 named parameter (otherwise it is an extra parameter matching an
9861 ellipsis). */
9862
9863 static rtx
9864 ix86_function_arg (cumulative_args_t cum_v, machine_mode omode,
9865 const_tree type, bool named)
9866 {
9867 CUMULATIVE_ARGS *cum = get_cumulative_args (cum_v);
9868 machine_mode mode = omode;
9869 HOST_WIDE_INT bytes, words;
9870 rtx arg;
9871
9872 if (!cum->caller && cfun->machine->func_type != TYPE_NORMAL)
9873 {
9874 gcc_assert (type != NULL_TREE);
9875 if (POINTER_TYPE_P (type))
9876 {
9877 /* This is the pointer argument. */
9878 gcc_assert (TYPE_MODE (type) == Pmode);
9879 if (cfun->machine->func_type == TYPE_INTERRUPT)
9880 /* -WORD(AP) in the current frame in interrupt handler. */
9881 arg = plus_constant (Pmode, arg_pointer_rtx,
9882 -UNITS_PER_WORD);
9883 else
9884 /* (AP) in the current frame in exception handler. */
9885 arg = arg_pointer_rtx;
9886 }
9887 else
9888 {
9889 gcc_assert (cfun->machine->func_type == TYPE_EXCEPTION
9890 && TREE_CODE (type) == INTEGER_TYPE
9891 && TYPE_MODE (type) == word_mode);
9892 /* The integer argument is the error code at -WORD(AP) in
9893 the current frame in exception handler. */
9894 arg = gen_rtx_MEM (word_mode,
9895 plus_constant (Pmode,
9896 arg_pointer_rtx,
9897 -UNITS_PER_WORD));
9898 }
9899 return arg;
9900 }
9901
9902 /* All pointer bounds arguments are handled separately here. */
9903 if ((type && POINTER_BOUNDS_TYPE_P (type))
9904 || POINTER_BOUNDS_MODE_P (mode))
9905 {
9906 /* Return NULL if bounds are forced to go in Bounds Table. */
9907 if (cum->bnds_in_bt)
9908 arg = NULL;
9909 /* Return the next available bound reg if any. */
9910 else if (cum->bnd_regno <= LAST_BND_REG)
9911 arg = gen_rtx_REG (BNDmode, cum->bnd_regno);
9912 /* Return the next special slot number otherwise. */
9913 else
9914 arg = GEN_INT (cum->bnd_regno - LAST_BND_REG - 1);
9915
9916 return arg;
9917 }
9918
9919 if (mode == BLKmode)
9920 bytes = int_size_in_bytes (type);
9921 else
9922 bytes = GET_MODE_SIZE (mode);
9923 words = CEIL (bytes, UNITS_PER_WORD);
9924
9925 /* To simplify the code below, represent vector types with a vector mode
9926 even if MMX/SSE are not active. */
9927 if (type && TREE_CODE (type) == VECTOR_TYPE)
9928 mode = type_natural_mode (type, cum, false);
9929
9930 if (TARGET_64BIT)
9931 {
9932 enum calling_abi call_abi = cum ? cum->call_abi : ix86_abi;
9933
9934 if (call_abi == MS_ABI)
9935 arg = function_arg_ms_64 (cum, mode, omode, named, bytes);
9936 else
9937 arg = function_arg_64 (cum, mode, omode, type, named);
9938 }
9939 else
9940 arg = function_arg_32 (cum, mode, omode, type, bytes, words);
9941
9942 return arg;
9943 }
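
/* Illustrative example (an addition, hypothetical handler and frame
   struct names):

       void __attribute__ ((interrupt)) isr (struct iframe *frame);
       void __attribute__ ((interrupt)) fault (struct iframe *frame,
                                               uword_t error);

   For ISR (no error code) the frame pointer argument lives at -WORD(AP);
   for FAULT the frame pointer is at (AP) and the error code is loaded
   from memory at -WORD(AP), exactly as the special case at the top of
   ix86_function_arg computes.  */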
9944
9945 /* A C expression that indicates when an argument must be passed by
9946 reference. If nonzero for an argument, a copy of that argument is
9947 made in memory and a pointer to the argument is passed instead of
9948 the argument itself. The pointer is passed in whatever way is
9949 appropriate for passing a pointer to that type. */
9950
9951 static bool
9952 ix86_pass_by_reference (cumulative_args_t cum_v, machine_mode mode,
9953 const_tree type, bool)
9954 {
9955 CUMULATIVE_ARGS *cum = get_cumulative_args (cum_v);
9956
9957 /* Bounds are never passed by reference. */
9958 if ((type && POINTER_BOUNDS_TYPE_P (type))
9959 || POINTER_BOUNDS_MODE_P (mode))
9960 return false;
9961
9962 if (TARGET_64BIT)
9963 {
9964 enum calling_abi call_abi = cum ? cum->call_abi : ix86_abi;
9965
9966 /* See Windows x64 Software Convention. */
9967 if (call_abi == MS_ABI)
9968 {
9969 HOST_WIDE_INT msize = GET_MODE_SIZE (mode);
9970
9971 if (type)
9972 {
9973 /* Arrays are passed by reference. */
9974 if (TREE_CODE (type) == ARRAY_TYPE)
9975 return true;
9976
9977 if (RECORD_OR_UNION_TYPE_P (type))
9978 {
9979 /* Structs/unions of sizes other than 8, 16, 32, or 64 bits
9980 are passed by reference. */
9981 msize = int_size_in_bytes (type);
9982 }
9983 }
9984
9985 /* __m128 is passed by reference. */
9986 return msize != 1 && msize != 2 && msize != 4 && msize != 8;
9987 }
9988 else if (type && int_size_in_bytes (type) == -1)
9989 return true;
9990 }
9991
9992 return false;
9993 }
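
/* Illustrative examples (an addition, hypothetical types) for the 64-bit
   MS ABI path above:

       struct s8  { long long x; };     // 8 bytes  -> passed by value
       struct s24 { long long x[3]; };  // 24 bytes -> passed by reference
       __m128                           // 16 bytes -> passed by reference

   Under the SysV ABI only variable-sized types take this path.  */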
9994
9995 /* Return true when TYPE should be 128bit aligned for 32bit argument
9996 passing ABI. XXX: This function is obsolete and is only used for
9997 checking psABI compatibility with previous versions of GCC. */
9998
9999 static bool
10000 ix86_compat_aligned_value_p (const_tree type)
10001 {
10002 machine_mode mode = TYPE_MODE (type);
10003 if (((TARGET_SSE && SSE_REG_MODE_P (mode))
10004 || mode == TDmode
10005 || mode == TFmode
10006 || mode == TCmode)
10007 && (!TYPE_USER_ALIGN (type) || TYPE_ALIGN (type) > 128))
10008 return true;
10009 if (TYPE_ALIGN (type) < 128)
10010 return false;
10011
10012 if (AGGREGATE_TYPE_P (type))
10013 {
10014 /* Walk the aggregates recursively. */
10015 switch (TREE_CODE (type))
10016 {
10017 case RECORD_TYPE:
10018 case UNION_TYPE:
10019 case QUAL_UNION_TYPE:
10020 {
10021 tree field;
10022
10023 /* Walk all the structure fields. */
10024 for (field = TYPE_FIELDS (type); field; field = DECL_CHAIN (field))
10025 {
10026 if (TREE_CODE (field) == FIELD_DECL
10027 && ix86_compat_aligned_value_p (TREE_TYPE (field)))
10028 return true;
10029 }
10030 break;
10031 }
10032
10033 case ARRAY_TYPE:
10034 /* Just in case some language passes arrays by value. */
10035 if (ix86_compat_aligned_value_p (TREE_TYPE (type)))
10036 return true;
10037 break;
10038
10039 default:
10040 gcc_unreachable ();
10041 }
10042 }
10043 return false;
10044 }
10045
10046 /* Return the alignment boundary for MODE and TYPE with alignment ALIGN.
10047 XXX: This function is obsolete and is only used for checking psABI
10048 compatibility with previous versions of GCC. */
10049
10050 static unsigned int
10051 ix86_compat_function_arg_boundary (machine_mode mode,
10052 const_tree type, unsigned int align)
10053 {
10054 /* In 32bit, only _Decimal128 and __float128 are aligned to their
10055 natural boundaries. */
10056 if (!TARGET_64BIT && mode != TDmode && mode != TFmode)
10057 {
10058 /* i386 ABI defines all arguments to be 4 byte aligned. We have to
10059 make an exception for SSE modes since these require 128bit
10060 alignment.
10061
10062 The handling here differs from field_alignment. ICC aligns MMX
10063 arguments to 4 byte boundaries, while structure fields are aligned
10064 to 8 byte boundaries. */
10065 if (!type)
10066 {
10067 if (!(TARGET_SSE && SSE_REG_MODE_P (mode)))
10068 align = PARM_BOUNDARY;
10069 }
10070 else
10071 {
10072 if (!ix86_compat_aligned_value_p (type))
10073 align = PARM_BOUNDARY;
10074 }
10075 }
10076 if (align > BIGGEST_ALIGNMENT)
10077 align = BIGGEST_ALIGNMENT;
10078 return align;
10079 }
10080
10081 /* Return true when TYPE should be 128bit aligned for 32bit argument
10082 passing ABI. */
10083
10084 static bool
10085 ix86_contains_aligned_value_p (const_tree type)
10086 {
10087 machine_mode mode = TYPE_MODE (type);
10088
10089 if (mode == XFmode || mode == XCmode)
10090 return false;
10091
10092 if (TYPE_ALIGN (type) < 128)
10093 return false;
10094
10095 if (AGGREGATE_TYPE_P (type))
10096 {
10097 /* Walk the aggregates recursively. */
10098 switch (TREE_CODE (type))
10099 {
10100 case RECORD_TYPE:
10101 case UNION_TYPE:
10102 case QUAL_UNION_TYPE:
10103 {
10104 tree field;
10105
10106 /* Walk all the structure fields. */
10107 for (field = TYPE_FIELDS (type);
10108 field;
10109 field = DECL_CHAIN (field))
10110 {
10111 if (TREE_CODE (field) == FIELD_DECL
10112 && ix86_contains_aligned_value_p (TREE_TYPE (field)))
10113 return true;
10114 }
10115 break;
10116 }
10117
10118 case ARRAY_TYPE:
10119 /* Just for use if some languages pass arrays by value. */
10120 if (ix86_contains_aligned_value_p (TREE_TYPE (type)))
10121 return true;
10122 break;
10123
10124 default:
10125 gcc_unreachable ();
10126 }
10127 }
10128 else
10129 return TYPE_ALIGN (type) >= 128;
10130
10131 return false;
10132 }
10133
10134 /* Gives the alignment boundary, in bits, of an argument with the
10135 specified mode and type. */
10136
10137 static unsigned int
10138 ix86_function_arg_boundary (machine_mode mode, const_tree type)
10139 {
10140 unsigned int align;
10141 if (type)
10142 {
10143 /* Since the main variant type is used for the call, convert the
10144 type to its main variant. */
10145 type = TYPE_MAIN_VARIANT (type);
10146 align = TYPE_ALIGN (type);
10147 }
10148 else
10149 align = GET_MODE_ALIGNMENT (mode);
10150 if (align < PARM_BOUNDARY)
10151 align = PARM_BOUNDARY;
10152 else
10153 {
10154 static bool warned;
10155 unsigned int saved_align = align;
10156
10157 if (!TARGET_64BIT)
10158 {
10159 /* i386 ABI defines XFmode arguments to be 4 byte aligned. */
10160 if (!type)
10161 {
10162 if (mode == XFmode || mode == XCmode)
10163 align = PARM_BOUNDARY;
10164 }
10165 else if (!ix86_contains_aligned_value_p (type))
10166 align = PARM_BOUNDARY;
10167
10168 if (align < 128)
10169 align = PARM_BOUNDARY;
10170 }
10171
10172 if (warn_psabi
10173 && !warned
10174 && align != ix86_compat_function_arg_boundary (mode, type,
10175 saved_align))
10176 {
10177 warned = true;
10178 inform (input_location,
10179 "The ABI for passing parameters with %d-byte"
10180 " alignment has changed in GCC 4.6",
10181 align / BITS_PER_UNIT);
10182 }
10183 }
10184
10185 return align;
10186 }
10187
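/* Illustrative sketch, not part of GCC proper: on a 32-bit target most
   arguments land on the 4-byte PARM_BOUNDARY, while SSE vector arguments
   keep their natural 16-byte alignment, e.g.

     void f (int i, __m128 v);

   yields a 32-bit boundary for I and a 128-bit boundary for V from
   ix86_function_arg_boundary (assuming -msse and that V is passed in
   memory at all).  */
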
10188 /* Return true if N is a possible register number of function value. */
10189
10190 static bool
10191 ix86_function_value_regno_p (const unsigned int regno)
10192 {
10193 switch (regno)
10194 {
10195 case AX_REG:
10196 return true;
10197 case DX_REG:
10198 return (!TARGET_64BIT || ix86_cfun_abi () != MS_ABI);
10199 case DI_REG:
10200 case SI_REG:
10201 return TARGET_64BIT && ix86_cfun_abi () != MS_ABI;
10202
10203 case BND0_REG:
10204 case BND1_REG:
10205 return chkp_function_instrumented_p (current_function_decl);
10206
10207 /* Complex values are returned in %st(0)/%st(1) pair. */
10208 case ST0_REG:
10209 case ST1_REG:
10210 /* TODO: The function should depend on current function ABI but
10211 builtins.c would need updating then. Therefore we use the
10212 default ABI. */
10213 if (TARGET_64BIT && ix86_cfun_abi () == MS_ABI)
10214 return false;
10215 return TARGET_FLOAT_RETURNS_IN_80387;
10216
10217 /* Complex values are returned in %xmm0/%xmm1 pair. */
10218 case XMM0_REG:
10219 case XMM1_REG:
10220 return TARGET_SSE;
10221
10222 case MM0_REG:
10223 if (TARGET_MACHO || TARGET_64BIT)
10224 return false;
10225 return TARGET_MMX;
10226 }
10227
10228 return false;
10229 }
10230
10231 /* Define how to find the value returned by a function.
10232 VALTYPE is the data type of the value (as a tree).
10233 If the precise function being called is known, FUNC is its FUNCTION_DECL;
10234 otherwise, FUNC is 0. */
10235
10236 static rtx
10237 function_value_32 (machine_mode orig_mode, machine_mode mode,
10238 const_tree fntype, const_tree fn)
10239 {
10240 unsigned int regno;
10241
10242 /* 8-byte vector modes in %mm0. See ix86_return_in_memory for where
10243 we normally prevent this case when mmx is not available. However
10244 some ABIs may require the result to be returned like DImode. */
10245 if (VECTOR_MODE_P (mode) && GET_MODE_SIZE (mode) == 8)
10246 regno = FIRST_MMX_REG;
10247
10248 /* 16-byte vector modes in %xmm0. See ix86_return_in_memory for where
10249 we prevent this case when sse is not available. However some ABIs
10250 may require the result to be returned like integer TImode. */
10251 else if (mode == TImode
10252 || (VECTOR_MODE_P (mode) && GET_MODE_SIZE (mode) == 16))
10253 regno = FIRST_SSE_REG;
10254
10255 /* 32-byte vector modes in %ymm0. */
10256 else if (VECTOR_MODE_P (mode) && GET_MODE_SIZE (mode) == 32)
10257 regno = FIRST_SSE_REG;
10258
10259 /* 64-byte vector modes in %zmm0. */
10260 else if (VECTOR_MODE_P (mode) && GET_MODE_SIZE (mode) == 64)
10261 regno = FIRST_SSE_REG;
10262
10263 /* Floating point return values in %st(0) (unless -mno-fp-ret-in-387). */
10264 else if (X87_FLOAT_MODE_P (mode) && TARGET_FLOAT_RETURNS_IN_80387)
10265 regno = FIRST_FLOAT_REG;
10266 else
10267 /* Most things go in %eax. */
10268 regno = AX_REG;
10269
10270 /* Override FP return register with %xmm0 for local functions when
10271 SSE math is enabled or for functions with sseregparm attribute. */
10272 if ((fn || fntype) && (mode == SFmode || mode == DFmode))
10273 {
10274 int sse_level = ix86_function_sseregparm (fntype, fn, false);
10275 if (sse_level == -1)
10276 {
10277 error ("calling %qD with SSE calling convention without "
10278 "SSE/SSE2 enabled", fn);
10279 sorry ("this is a GCC bug that can be worked around by adding "
10280 "attribute used to the called function");
10281 }
10282 else if ((sse_level >= 1 && mode == SFmode)
10283 || (sse_level == 2 && mode == DFmode))
10284 regno = FIRST_SSE_REG;
10285 }
10286
10287 /* OImode shouldn't be used directly. */
10288 gcc_assert (mode != OImode);
10289
10290 return gen_rtx_REG (orig_mode, regno);
10291 }
10292
10293 static rtx
10294 function_value_64 (machine_mode orig_mode, machine_mode mode,
10295 const_tree valtype)
10296 {
10297 rtx ret;
10298
10299 /* Handle libcalls, which don't provide a type node. */
10300 if (valtype == NULL)
10301 {
10302 unsigned int regno;
10303
10304 switch (mode)
10305 {
10306 case SFmode:
10307 case SCmode:
10308 case DFmode:
10309 case DCmode:
10310 case TFmode:
10311 case SDmode:
10312 case DDmode:
10313 case TDmode:
10314 regno = FIRST_SSE_REG;
10315 break;
10316 case XFmode:
10317 case XCmode:
10318 regno = FIRST_FLOAT_REG;
10319 break;
10320 case TCmode:
10321 return NULL;
10322 default:
10323 regno = AX_REG;
10324 }
10325
10326 return gen_rtx_REG (mode, regno);
10327 }
10328 else if (POINTER_TYPE_P (valtype))
10329 {
10330 /* Pointers are always returned in word_mode. */
10331 mode = word_mode;
10332 }
10333
10334 ret = construct_container (mode, orig_mode, valtype, 1,
10335 X86_64_REGPARM_MAX, X86_64_SSE_REGPARM_MAX,
10336 x86_64_int_return_registers, 0);
10337
10338 /* For zero-sized structures, construct_container returns NULL, but we
10339 need to keep the rest of the compiler happy by returning a meaningful value. */
10340 if (!ret)
10341 ret = gen_rtx_REG (orig_mode, AX_REG);
10342
10343 return ret;
10344 }
10345
10346 static rtx
10347 function_value_ms_64 (machine_mode orig_mode, machine_mode mode,
10348 const_tree valtype)
10349 {
10350 unsigned int regno = AX_REG;
10351
10352 if (TARGET_SSE)
10353 {
10354 switch (GET_MODE_SIZE (mode))
10355 {
10356 case 16:
10357 if (valtype != NULL_TREE
10358 && !VECTOR_INTEGER_TYPE_P (valtype)
10360 && !INTEGRAL_TYPE_P (valtype)
10361 && !VECTOR_FLOAT_TYPE_P (valtype))
10362 break;
10363 if ((SCALAR_INT_MODE_P (mode) || VECTOR_MODE_P (mode))
10364 && !COMPLEX_MODE_P (mode))
10365 regno = FIRST_SSE_REG;
10366 break;
10367 case 8:
10368 case 4:
10369 if (mode == SFmode || mode == DFmode)
10370 regno = FIRST_SSE_REG;
10371 break;
10372 default:
10373 break;
10374 }
10375 }
10376 return gen_rtx_REG (orig_mode, regno);
10377 }
10378
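/* Illustrative sketch, not part of GCC proper: with the MS_ABI rules above,
   a few hypothetical return types resolve as

     double          -> %xmm0 (when SSE is enabled)
     __m128          -> %xmm0
     _Complex double -> %rax here, but 16-byte complex values are forced
                        into memory by ix86_return_in_memory anyway.  */
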
10379 static rtx
10380 ix86_function_value_1 (const_tree valtype, const_tree fntype_or_decl,
10381 machine_mode orig_mode, machine_mode mode)
10382 {
10383 const_tree fn, fntype;
10384
10385 fn = NULL_TREE;
10386 if (fntype_or_decl && DECL_P (fntype_or_decl))
10387 fn = fntype_or_decl;
10388 fntype = fn ? TREE_TYPE (fn) : fntype_or_decl;
10389
10390 if ((valtype && POINTER_BOUNDS_TYPE_P (valtype))
10391 || POINTER_BOUNDS_MODE_P (mode))
10392 return gen_rtx_REG (BNDmode, FIRST_BND_REG);
10393 else if (TARGET_64BIT && ix86_function_type_abi (fntype) == MS_ABI)
10394 return function_value_ms_64 (orig_mode, mode, valtype);
10395 else if (TARGET_64BIT)
10396 return function_value_64 (orig_mode, mode, valtype);
10397 else
10398 return function_value_32 (orig_mode, mode, fntype, fn);
10399 }
10400
10401 static rtx
10402 ix86_function_value (const_tree valtype, const_tree fntype_or_decl, bool)
10403 {
10404 machine_mode mode, orig_mode;
10405
10406 orig_mode = TYPE_MODE (valtype);
10407 mode = type_natural_mode (valtype, NULL, true);
10408 return ix86_function_value_1 (valtype, fntype_or_decl, orig_mode, mode);
10409 }
10410
10411 /* Return an RTX representing a place where a function returns
10412 or receives pointer bounds, or NULL if no bounds are returned.
10413
10414 VALTYPE is a data type of a value returned by the function.
10415
10416 FN_DECL_OR_TYPE is a tree node representing FUNCTION_DECL
10417 or FUNCTION_TYPE of the function.
10418
10419 If OUTGOING is false, return a place in which the caller will
10420 see the return value. Otherwise, return a place where a
10421 function returns a value. */
10422
10423 static rtx
10424 ix86_function_value_bounds (const_tree valtype,
10425 const_tree fntype_or_decl ATTRIBUTE_UNUSED,
10426 bool outgoing ATTRIBUTE_UNUSED)
10427 {
10428 rtx res = NULL_RTX;
10429
10430 if (BOUNDED_TYPE_P (valtype))
10431 res = gen_rtx_REG (BNDmode, FIRST_BND_REG);
10432 else if (chkp_type_has_pointer (valtype))
10433 {
10434 bitmap slots;
10435 rtx bounds[2];
10436 bitmap_iterator bi;
10437 unsigned i, bnd_no = 0;
10438
10439 bitmap_obstack_initialize (NULL);
10440 slots = BITMAP_ALLOC (NULL);
10441 chkp_find_bound_slots (valtype, slots);
10442
10443 EXECUTE_IF_SET_IN_BITMAP (slots, 0, i, bi)
10444 {
10445 rtx reg = gen_rtx_REG (BNDmode, FIRST_BND_REG + bnd_no);
10446 rtx offs = GEN_INT (i * POINTER_SIZE / BITS_PER_UNIT);
10447 gcc_assert (bnd_no < 2);
10448 bounds[bnd_no++] = gen_rtx_EXPR_LIST (VOIDmode, reg, offs);
10449 }
10450
10451 res = gen_rtx_PARALLEL (VOIDmode, gen_rtvec_v (bnd_no, bounds));
10452
10453 BITMAP_FREE (slots);
10454 bitmap_obstack_release (NULL);
10455 }
10456 else
10457 res = NULL_RTX;
10458
10459 return res;
10460 }
10461
10462 /* Pointer function arguments and return values are promoted to
10463 word_mode for normal functions. */
10464
10465 static machine_mode
10466 ix86_promote_function_mode (const_tree type, machine_mode mode,
10467 int *punsignedp, const_tree fntype,
10468 int for_return)
10469 {
10470 if (cfun->machine->func_type == TYPE_NORMAL
10471 && type != NULL_TREE
10472 && POINTER_TYPE_P (type))
10473 {
10474 *punsignedp = POINTERS_EXTEND_UNSIGNED;
10475 return word_mode;
10476 }
10477 return default_promote_function_mode (type, mode, punsignedp, fntype,
10478 for_return);
10479 }
10480
10481 /* Return true if a structure, union or array with MODE containing FIELD
10482 should be accessed using BLKmode. */
10483
10484 static bool
10485 ix86_member_type_forces_blk (const_tree field, machine_mode mode)
10486 {
10487 /* Union with XFmode must be in BLKmode. */
10488 return (mode == XFmode
10489 && (TREE_CODE (DECL_FIELD_CONTEXT (field)) == UNION_TYPE
10490 || TREE_CODE (DECL_FIELD_CONTEXT (field)) == QUAL_UNION_TYPE));
10491 }
10492
10493 rtx
10494 ix86_libcall_value (machine_mode mode)
10495 {
10496 return ix86_function_value_1 (NULL, NULL, mode, mode);
10497 }
10498
10499 /* Return true iff type is returned in memory. */
10500
10501 static bool
10502 ix86_return_in_memory (const_tree type, const_tree fntype ATTRIBUTE_UNUSED)
10503 {
10504 #ifdef SUBTARGET_RETURN_IN_MEMORY
10505 return SUBTARGET_RETURN_IN_MEMORY (type, fntype);
10506 #else
10507 const machine_mode mode = type_natural_mode (type, NULL, true);
10508 HOST_WIDE_INT size;
10509
10510 if (POINTER_BOUNDS_TYPE_P (type))
10511 return false;
10512
10513 if (TARGET_64BIT)
10514 {
10515 if (ix86_function_type_abi (fntype) == MS_ABI)
10516 {
10517 size = int_size_in_bytes (type);
10518
10519 /* __m128 is returned in xmm0. */
10520 if ((!type || VECTOR_INTEGER_TYPE_P (type)
10521 || INTEGRAL_TYPE_P (type)
10522 || VECTOR_FLOAT_TYPE_P (type))
10523 && (SCALAR_INT_MODE_P (mode) || VECTOR_MODE_P (mode))
10524 && !COMPLEX_MODE_P (mode)
10525 && (GET_MODE_SIZE (mode) == 16 || size == 16))
10526 return false;
10527
10528 /* Otherwise, the size must be exactly in [1248]. */
10529 return size != 1 && size != 2 && size != 4 && size != 8;
10530 }
10531 else
10532 {
10533 int needed_intregs, needed_sseregs;
10534
10535 return examine_argument (mode, type, 1,
10536 &needed_intregs, &needed_sseregs);
10537 }
10538 }
10539 else
10540 {
10541 size = int_size_in_bytes (type);
10542
10543 /* Intel MCU psABI returns scalars and aggregates no larger than 8
10544 bytes in registers. */
10545 if (TARGET_IAMCU)
10546 return VECTOR_MODE_P (mode) || size < 0 || size > 8;
10547
10548 if (mode == BLKmode)
10549 return true;
10550
10551 if (MS_AGGREGATE_RETURN && AGGREGATE_TYPE_P (type) && size <= 8)
10552 return false;
10553
10554 if (VECTOR_MODE_P (mode) || mode == TImode)
10555 {
10556 /* User-created vectors small enough to fit in EAX. */
10557 if (size < 8)
10558 return false;
10559
10560 /* Unless the ABI prescribes otherwise,
10561 MMX/3dNow values are returned in MM0 if available. */
10562
10563 if (size == 8)
10564 return TARGET_VECT8_RETURNS || !TARGET_MMX;
10565
10566 /* SSE values are returned in XMM0 if available. */
10567 if (size == 16)
10568 return !TARGET_SSE;
10569
10570 /* AVX values are returned in YMM0 if available. */
10571 if (size == 32)
10572 return !TARGET_AVX;
10573
10574 /* AVX512F values are returned in ZMM0 if available. */
10575 if (size == 64)
10576 return !TARGET_AVX512F;
10577 }
10578
10579 if (mode == XFmode)
10580 return false;
10581
10582 if (size > 12)
10583 return true;
10584
10585 /* OImode shouldn't be used directly. */
10586 gcc_assert (mode != OImode);
10587
10588 return false;
10589 }
10590 #endif
10591 }
10592
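/* Illustrative sketch, not part of GCC proper: a few 32-bit SysV examples
   of what the hook above decides with default flags:

     long double      (XFmode, returned in %st(0))   -> not in memory
     _Complex double  (16 bytes, larger than 12)     -> in memory
     __m128           (16 bytes)                     -> %xmm0 with -msse,
                                                        memory without

   Small aggregates may still end up in memory because of the separate
   -fpcc-struct-return handling outside this hook.  */
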
10593 \f
10594 /* Create the va_list data type. */
10595
10596 static tree
10597 ix86_build_builtin_va_list_64 (void)
10598 {
10599 tree f_gpr, f_fpr, f_ovf, f_sav, record, type_decl;
10600
10601 record = lang_hooks.types.make_type (RECORD_TYPE);
10602 type_decl = build_decl (BUILTINS_LOCATION,
10603 TYPE_DECL, get_identifier ("__va_list_tag"), record);
10604
10605 f_gpr = build_decl (BUILTINS_LOCATION,
10606 FIELD_DECL, get_identifier ("gp_offset"),
10607 unsigned_type_node);
10608 f_fpr = build_decl (BUILTINS_LOCATION,
10609 FIELD_DECL, get_identifier ("fp_offset"),
10610 unsigned_type_node);
10611 f_ovf = build_decl (BUILTINS_LOCATION,
10612 FIELD_DECL, get_identifier ("overflow_arg_area"),
10613 ptr_type_node);
10614 f_sav = build_decl (BUILTINS_LOCATION,
10615 FIELD_DECL, get_identifier ("reg_save_area"),
10616 ptr_type_node);
10617
10618 va_list_gpr_counter_field = f_gpr;
10619 va_list_fpr_counter_field = f_fpr;
10620
10621 DECL_FIELD_CONTEXT (f_gpr) = record;
10622 DECL_FIELD_CONTEXT (f_fpr) = record;
10623 DECL_FIELD_CONTEXT (f_ovf) = record;
10624 DECL_FIELD_CONTEXT (f_sav) = record;
10625
10626 TYPE_STUB_DECL (record) = type_decl;
10627 TYPE_NAME (record) = type_decl;
10628 TYPE_FIELDS (record) = f_gpr;
10629 DECL_CHAIN (f_gpr) = f_fpr;
10630 DECL_CHAIN (f_fpr) = f_ovf;
10631 DECL_CHAIN (f_ovf) = f_sav;
10632
10633 layout_type (record);
10634
10635 TYPE_ATTRIBUTES (record) = tree_cons (get_identifier ("sysv_abi va_list"),
10636 NULL_TREE, TYPE_ATTRIBUTES (record));
10637
10638 /* The correct type is an array type of one element. */
10639 return build_array_type (record, build_index_type (size_zero_node));
10640 }
10641
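/* For reference, the record built above matches the x86-64 SysV psABI
   va_list as user code sees it (a sketch, field annotations added here):

     typedef struct __va_list_tag {
       unsigned int gp_offset;        next GPR slot in reg_save_area
       unsigned int fp_offset;        next SSE slot in reg_save_area
       void *overflow_arg_area;       next stack-passed argument
       void *reg_save_area;           start of the register save area
     } va_list[1];

   gp_offset runs from 0 to 48 (6 GPRs, 8 bytes each) and fp_offset from
   48 to 176 (8 SSE registers, 16 bytes each).  */
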
10642 /* Set up the builtin va_list data type, and for 64-bit the additional
10643 calling-convention-specific va_list data types. */
10644
10645 static tree
10646 ix86_build_builtin_va_list (void)
10647 {
10648 if (TARGET_64BIT)
10649 {
10650 /* Initialize ABI specific va_list builtin types.
10651
10652 In lto1, we can encounter two va_list types:
10653 - one as a result of the type-merge across TUs, and
10654 - the one constructed here.
10655 These two types will not have the same TYPE_MAIN_VARIANT, and therefore
10656 a type identity check in canonical_va_list_type based on
10657 TYPE_MAIN_VARIANT (which we used to have) will not work.
10658 Instead, we tag each va_list_type_node with its unique attribute, and
10659 look for the attribute in the type identity check in
10660 canonical_va_list_type.
10661
10662 Tagging sysv_va_list_type_node directly with the attribute is
10663 problematic since it's an array of one record, which will decay into a
10664 pointer to the record when used as a parameter (see build_va_arg comments for
10665 an example), dropping the attribute in the process. So we tag the
10666 record instead. */
10667
10668 /* For SYSV_ABI we use an array of one record. */
10669 sysv_va_list_type_node = ix86_build_builtin_va_list_64 ();
10670
10671 /* For MS_ABI we use plain pointer to argument area. */
10672 tree char_ptr_type = build_pointer_type (char_type_node);
10673 tree attr = tree_cons (get_identifier ("ms_abi va_list"), NULL_TREE,
10674 TYPE_ATTRIBUTES (char_ptr_type));
10675 ms_va_list_type_node = build_type_attribute_variant (char_ptr_type, attr);
10676
10677 return ((ix86_abi == MS_ABI)
10678 ? ms_va_list_type_node
10679 : sysv_va_list_type_node);
10680 }
10681 else
10682 {
10683 /* For i386 we use plain pointer to argument area. */
10684 return build_pointer_type (char_type_node);
10685 }
10686 }
10687
10688 /* Worker function for TARGET_SETUP_INCOMING_VARARGS. */
10689
10690 static void
10691 setup_incoming_varargs_64 (CUMULATIVE_ARGS *cum)
10692 {
10693 rtx save_area, mem;
10694 alias_set_type set;
10695 int i, max;
10696
10697 /* GPR size of varargs save area. */
10698 if (cfun->va_list_gpr_size)
10699 ix86_varargs_gpr_size = X86_64_REGPARM_MAX * UNITS_PER_WORD;
10700 else
10701 ix86_varargs_gpr_size = 0;
10702
10703 /* FPR size of varargs save area. We don't need it if we don't pass
10704 anything in SSE registers. */
10705 if (TARGET_SSE && cfun->va_list_fpr_size)
10706 ix86_varargs_fpr_size = X86_64_SSE_REGPARM_MAX * 16;
10707 else
10708 ix86_varargs_fpr_size = 0;
10709
10710 if (! ix86_varargs_gpr_size && ! ix86_varargs_fpr_size)
10711 return;
10712
10713 save_area = frame_pointer_rtx;
10714 set = get_varargs_alias_set ();
10715
10716 max = cum->regno + cfun->va_list_gpr_size / UNITS_PER_WORD;
10717 if (max > X86_64_REGPARM_MAX)
10718 max = X86_64_REGPARM_MAX;
10719
10720 for (i = cum->regno; i < max; i++)
10721 {
10722 mem = gen_rtx_MEM (word_mode,
10723 plus_constant (Pmode, save_area, i * UNITS_PER_WORD));
10724 MEM_NOTRAP_P (mem) = 1;
10725 set_mem_alias_set (mem, set);
10726 emit_move_insn (mem,
10727 gen_rtx_REG (word_mode,
10728 x86_64_int_parameter_registers[i]));
10729 }
10730
10731 if (ix86_varargs_fpr_size)
10732 {
10733 machine_mode smode;
10734 rtx_code_label *label;
10735 rtx test;
10736
10737 /* Now emit code to save SSE registers. The AX parameter contains the number
10738 of SSE parameter registers used to call this function, though all we
10739 actually check here is the zero/non-zero status. */
10740
10741 label = gen_label_rtx ();
10742 test = gen_rtx_EQ (VOIDmode, gen_rtx_REG (QImode, AX_REG), const0_rtx);
10743 emit_jump_insn (gen_cbranchqi4 (test, XEXP (test, 0), XEXP (test, 1),
10744 label));
10745
10746 /* ??? If !TARGET_SSE_TYPELESS_STORES, would we perform better if
10747 we used movdqa (i.e. TImode) instead? Perhaps even better would
10748 be if we could determine the real mode of the data, via a hook
10749 into pass_stdarg. Ignore all that for now. */
10750 smode = V4SFmode;
10751 if (crtl->stack_alignment_needed < GET_MODE_ALIGNMENT (smode))
10752 crtl->stack_alignment_needed = GET_MODE_ALIGNMENT (smode);
10753
10754 max = cum->sse_regno + cfun->va_list_fpr_size / 16;
10755 if (max > X86_64_SSE_REGPARM_MAX)
10756 max = X86_64_SSE_REGPARM_MAX;
10757
10758 for (i = cum->sse_regno; i < max; ++i)
10759 {
10760 mem = plus_constant (Pmode, save_area,
10761 i * 16 + ix86_varargs_gpr_size);
10762 mem = gen_rtx_MEM (smode, mem);
10763 MEM_NOTRAP_P (mem) = 1;
10764 set_mem_alias_set (mem, set);
10765 set_mem_align (mem, GET_MODE_ALIGNMENT (smode));
10766
10767 emit_move_insn (mem, gen_rtx_REG (smode, SSE_REGNO (i)));
10768 }
10769
10770 emit_label (label);
10771 }
10772 }
10773
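/* Illustrative sketch, not part of GCC proper: the stores emitted above lay
   the register save area out as

     offset   0 ..  47   %rdi, %rsi, %rdx, %rcx, %r8, %r9   (8 bytes each)
     offset  48 .. 175   %xmm0 .. %xmm7                     (16 bytes each)

   which is exactly what gp_offset and fp_offset in the va_list index
   into.  */
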
10774 static void
10775 setup_incoming_varargs_ms_64 (CUMULATIVE_ARGS *cum)
10776 {
10777 alias_set_type set = get_varargs_alias_set ();
10778 int i;
10779
10780 /* Reset to zero, as there might be a sysv va_arg used
10781 before. */
10782 ix86_varargs_gpr_size = 0;
10783 ix86_varargs_fpr_size = 0;
10784
10785 for (i = cum->regno; i < X86_64_MS_REGPARM_MAX; i++)
10786 {
10787 rtx reg, mem;
10788
10789 mem = gen_rtx_MEM (Pmode,
10790 plus_constant (Pmode, virtual_incoming_args_rtx,
10791 i * UNITS_PER_WORD));
10792 MEM_NOTRAP_P (mem) = 1;
10793 set_mem_alias_set (mem, set);
10794
10795 reg = gen_rtx_REG (Pmode, x86_64_ms_abi_int_parameter_registers[i]);
10796 emit_move_insn (mem, reg);
10797 }
10798 }
10799
10800 static void
10801 ix86_setup_incoming_varargs (cumulative_args_t cum_v, machine_mode mode,
10802 tree type, int *, int no_rtl)
10803 {
10804 CUMULATIVE_ARGS *cum = get_cumulative_args (cum_v);
10805 CUMULATIVE_ARGS next_cum;
10806 tree fntype;
10807
10808 /* This argument doesn't appear to be used anymore, which is good,
10809 because the old code here didn't suppress rtl generation. */
10810 gcc_assert (!no_rtl);
10811
10812 if (!TARGET_64BIT)
10813 return;
10814
10815 fntype = TREE_TYPE (current_function_decl);
10816
10817 /* For varargs, we do not want to skip the dummy va_dcl argument.
10818 For stdargs, we do want to skip the last named argument. */
10819 next_cum = *cum;
10820 if (stdarg_p (fntype))
10821 ix86_function_arg_advance (pack_cumulative_args (&next_cum), mode, type,
10822 true);
10823
10824 if (cum->call_abi == MS_ABI)
10825 setup_incoming_varargs_ms_64 (&next_cum);
10826 else
10827 setup_incoming_varargs_64 (&next_cum);
10828 }
10829
10830 static void
10831 ix86_setup_incoming_vararg_bounds (cumulative_args_t cum_v,
10832 enum machine_mode mode,
10833 tree type,
10834 int *pretend_size ATTRIBUTE_UNUSED,
10835 int no_rtl)
10836 {
10837 CUMULATIVE_ARGS *cum = get_cumulative_args (cum_v);
10838 CUMULATIVE_ARGS next_cum;
10839 tree fntype;
10840 rtx save_area;
10841 int bnd_reg, i, max;
10842
10843 gcc_assert (!no_rtl);
10844
10845 /* Do nothing if we use plain pointer to argument area. */
10846 if (!TARGET_64BIT || cum->call_abi == MS_ABI)
10847 return;
10848
10849 fntype = TREE_TYPE (current_function_decl);
10850
10851 /* For varargs, we do not want to skip the dummy va_dcl argument.
10852 For stdargs, we do want to skip the last named argument. */
10853 next_cum = *cum;
10854 if (stdarg_p (fntype))
10855 ix86_function_arg_advance (pack_cumulative_args (&next_cum), mode, type,
10856 true);
10857 save_area = frame_pointer_rtx;
10858
10859 max = cum->regno + cfun->va_list_gpr_size / UNITS_PER_WORD;
10860 if (max > X86_64_REGPARM_MAX)
10861 max = X86_64_REGPARM_MAX;
10862
10863 bnd_reg = cum->bnd_regno + cum->force_bnd_pass;
10864 if (chkp_function_instrumented_p (current_function_decl))
10865 for (i = cum->regno; i < max; i++)
10866 {
10867 rtx addr = plus_constant (Pmode, save_area, i * UNITS_PER_WORD);
10868 rtx ptr = gen_rtx_REG (Pmode,
10869 x86_64_int_parameter_registers[i]);
10870 rtx bounds;
10871
10872 if (bnd_reg <= LAST_BND_REG)
10873 bounds = gen_rtx_REG (BNDmode, bnd_reg);
10874 else
10875 {
10876 rtx ldx_addr =
10877 plus_constant (Pmode, arg_pointer_rtx,
10878 (LAST_BND_REG - bnd_reg) * GET_MODE_SIZE (Pmode));
10879 bounds = gen_reg_rtx (BNDmode);
10880 emit_insn (BNDmode == BND64mode
10881 ? gen_bnd64_ldx (bounds, ldx_addr, ptr)
10882 : gen_bnd32_ldx (bounds, ldx_addr, ptr));
10883 }
10884
10885 emit_insn (BNDmode == BND64mode
10886 ? gen_bnd64_stx (addr, ptr, bounds)
10887 : gen_bnd32_stx (addr, ptr, bounds));
10888
10889 bnd_reg++;
10890 }
10891 }
10892
10893
10894 /* Return true if TYPE is a va_list of the char * kind. */
10895
10896 static bool
10897 is_va_list_char_pointer (tree type)
10898 {
10899 tree canonic;
10900
10901 /* For 32-bit it is always true. */
10902 if (!TARGET_64BIT)
10903 return true;
10904 canonic = ix86_canonical_va_list_type (type);
10905 return (canonic == ms_va_list_type_node
10906 || (ix86_abi == MS_ABI && canonic == va_list_type_node));
10907 }
10908
10909 /* Implement va_start. */
10910
10911 static void
10912 ix86_va_start (tree valist, rtx nextarg)
10913 {
10914 HOST_WIDE_INT words, n_gpr, n_fpr;
10915 tree f_gpr, f_fpr, f_ovf, f_sav;
10916 tree gpr, fpr, ovf, sav, t;
10917 tree type;
10918 rtx ovf_rtx;
10919
10920 if (flag_split_stack
10921 && cfun->machine->split_stack_varargs_pointer == NULL_RTX)
10922 {
10923 unsigned int scratch_regno;
10924
10925 /* When we are splitting the stack, we can't refer to the stack
10926 arguments using internal_arg_pointer, because they may be on
10927 the old stack. The split stack prologue will arrange to
10928 leave a pointer to the old stack arguments in a scratch
10929 register, which we here copy to a pseudo-register. The split
10930 stack prologue can't set the pseudo-register directly because
10931 it (the prologue) runs before any registers have been saved. */
10932
10933 scratch_regno = split_stack_prologue_scratch_regno ();
10934 if (scratch_regno != INVALID_REGNUM)
10935 {
10936 rtx reg;
10937 rtx_insn *seq;
10938
10939 reg = gen_reg_rtx (Pmode);
10940 cfun->machine->split_stack_varargs_pointer = reg;
10941
10942 start_sequence ();
10943 emit_move_insn (reg, gen_rtx_REG (Pmode, scratch_regno));
10944 seq = get_insns ();
10945 end_sequence ();
10946
10947 push_topmost_sequence ();
10948 emit_insn_after (seq, entry_of_function ());
10949 pop_topmost_sequence ();
10950 }
10951 }
10952
10953 /* Only 64bit target needs something special. */
10954 if (is_va_list_char_pointer (TREE_TYPE (valist)))
10955 {
10956 if (cfun->machine->split_stack_varargs_pointer == NULL_RTX)
10957 std_expand_builtin_va_start (valist, nextarg);
10958 else
10959 {
10960 rtx va_r, next;
10961
10962 va_r = expand_expr (valist, NULL_RTX, VOIDmode, EXPAND_WRITE);
10963 next = expand_binop (ptr_mode, add_optab,
10964 cfun->machine->split_stack_varargs_pointer,
10965 crtl->args.arg_offset_rtx,
10966 NULL_RTX, 0, OPTAB_LIB_WIDEN);
10967 convert_move (va_r, next, 0);
10968
10969 /* Store zero bounds for va_list. */
10970 if (chkp_function_instrumented_p (current_function_decl))
10971 chkp_expand_bounds_reset_for_mem (valist,
10972 make_tree (TREE_TYPE (valist),
10973 next));
10974
10975 }
10976 return;
10977 }
10978
10979 f_gpr = TYPE_FIELDS (TREE_TYPE (sysv_va_list_type_node));
10980 f_fpr = DECL_CHAIN (f_gpr);
10981 f_ovf = DECL_CHAIN (f_fpr);
10982 f_sav = DECL_CHAIN (f_ovf);
10983
10984 valist = build_simple_mem_ref (valist);
10985 TREE_TYPE (valist) = TREE_TYPE (sysv_va_list_type_node);
10986 /* The following should be folded into the MEM_REF offset. */
10987 gpr = build3 (COMPONENT_REF, TREE_TYPE (f_gpr), unshare_expr (valist),
10988 f_gpr, NULL_TREE);
10989 fpr = build3 (COMPONENT_REF, TREE_TYPE (f_fpr), unshare_expr (valist),
10990 f_fpr, NULL_TREE);
10991 ovf = build3 (COMPONENT_REF, TREE_TYPE (f_ovf), unshare_expr (valist),
10992 f_ovf, NULL_TREE);
10993 sav = build3 (COMPONENT_REF, TREE_TYPE (f_sav), unshare_expr (valist),
10994 f_sav, NULL_TREE);
10995
10996 /* Count number of gp and fp argument registers used. */
10997 words = crtl->args.info.words;
10998 n_gpr = crtl->args.info.regno;
10999 n_fpr = crtl->args.info.sse_regno;
11000
11001 if (cfun->va_list_gpr_size)
11002 {
11003 type = TREE_TYPE (gpr);
11004 t = build2 (MODIFY_EXPR, type,
11005 gpr, build_int_cst (type, n_gpr * 8));
11006 TREE_SIDE_EFFECTS (t) = 1;
11007 expand_expr (t, const0_rtx, VOIDmode, EXPAND_NORMAL);
11008 }
11009
11010 if (TARGET_SSE && cfun->va_list_fpr_size)
11011 {
11012 type = TREE_TYPE (fpr);
11013 t = build2 (MODIFY_EXPR, type, fpr,
11014 build_int_cst (type, n_fpr * 16 + 8*X86_64_REGPARM_MAX));
11015 TREE_SIDE_EFFECTS (t) = 1;
11016 expand_expr (t, const0_rtx, VOIDmode, EXPAND_NORMAL);
11017 }
11018
11019 /* Find the overflow area. */
11020 type = TREE_TYPE (ovf);
11021 if (cfun->machine->split_stack_varargs_pointer == NULL_RTX)
11022 ovf_rtx = crtl->args.internal_arg_pointer;
11023 else
11024 ovf_rtx = cfun->machine->split_stack_varargs_pointer;
11025 t = make_tree (type, ovf_rtx);
11026 if (words != 0)
11027 t = fold_build_pointer_plus_hwi (t, words * UNITS_PER_WORD);
11028
11029 /* Store zero bounds for overflow area pointer. */
11030 if (chkp_function_instrumented_p (current_function_decl))
11031 chkp_expand_bounds_reset_for_mem (ovf, t);
11032
11033 t = build2 (MODIFY_EXPR, type, ovf, t);
11034 TREE_SIDE_EFFECTS (t) = 1;
11035 expand_expr (t, const0_rtx, VOIDmode, EXPAND_NORMAL);
11036
11037 if (ix86_varargs_gpr_size || ix86_varargs_fpr_size)
11038 {
11039 /* Find the register save area.
11040 The function prologue saves it right above the stack frame. */
11041 type = TREE_TYPE (sav);
11042 t = make_tree (type, frame_pointer_rtx);
11043 if (!ix86_varargs_gpr_size)
11044 t = fold_build_pointer_plus_hwi (t, -8 * X86_64_REGPARM_MAX);
11045
11046 /* Store zero bounds for save area pointer. */
11047 if (chkp_function_instrumented_p (current_function_decl))
11048 chkp_expand_bounds_reset_for_mem (sav, t);
11049
11050 t = build2 (MODIFY_EXPR, type, sav, t);
11051 TREE_SIDE_EFFECTS (t) = 1;
11052 expand_expr (t, const0_rtx, VOIDmode, EXPAND_NORMAL);
11053 }
11054 }
11055
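/* Illustrative sketch, not part of GCC proper: for a variadic function such
   as

     int sum (int n, ...);

   the expansion above initializes the va_list roughly as

     ap->gp_offset         = 8;    one named GPR argument (N) was consumed
     ap->fp_offset         = 48;   no named SSE arguments
     ap->overflow_arg_area = first stack-passed argument;
     ap->reg_save_area     = area saved by the prologue;

   matching the register save area layout described earlier.  */
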
11056 /* Implement va_arg. */
11057
11058 static tree
11059 ix86_gimplify_va_arg (tree valist, tree type, gimple_seq *pre_p,
11060 gimple_seq *post_p)
11061 {
11062 static const int intreg[6] = { 0, 1, 2, 3, 4, 5 };
11063 tree f_gpr, f_fpr, f_ovf, f_sav;
11064 tree gpr, fpr, ovf, sav, t;
11065 int size, rsize;
11066 tree lab_false, lab_over = NULL_TREE;
11067 tree addr, t2;
11068 rtx container;
11069 int indirect_p = 0;
11070 tree ptrtype;
11071 machine_mode nat_mode;
11072 unsigned int arg_boundary;
11073
11074 /* Only 64bit target needs something special. */
11075 if (is_va_list_char_pointer (TREE_TYPE (valist)))
11076 return std_gimplify_va_arg_expr (valist, type, pre_p, post_p);
11077
11078 f_gpr = TYPE_FIELDS (TREE_TYPE (sysv_va_list_type_node));
11079 f_fpr = DECL_CHAIN (f_gpr);
11080 f_ovf = DECL_CHAIN (f_fpr);
11081 f_sav = DECL_CHAIN (f_ovf);
11082
11083 gpr = build3 (COMPONENT_REF, TREE_TYPE (f_gpr),
11084 valist, f_gpr, NULL_TREE);
11085
11086 fpr = build3 (COMPONENT_REF, TREE_TYPE (f_fpr), valist, f_fpr, NULL_TREE);
11087 ovf = build3 (COMPONENT_REF, TREE_TYPE (f_ovf), valist, f_ovf, NULL_TREE);
11088 sav = build3 (COMPONENT_REF, TREE_TYPE (f_sav), valist, f_sav, NULL_TREE);
11089
11090 indirect_p = pass_by_reference (NULL, TYPE_MODE (type), type, false);
11091 if (indirect_p)
11092 type = build_pointer_type (type);
11093 size = int_size_in_bytes (type);
11094 rsize = CEIL (size, UNITS_PER_WORD);
11095
11096 nat_mode = type_natural_mode (type, NULL, false);
11097 switch (nat_mode)
11098 {
11099 case V8SFmode:
11100 case V8SImode:
11101 case V32QImode:
11102 case V16HImode:
11103 case V4DFmode:
11104 case V4DImode:
11105 case V16SFmode:
11106 case V16SImode:
11107 case V64QImode:
11108 case V32HImode:
11109 case V8DFmode:
11110 case V8DImode:
11111 /* Unnamed 256- and 512-bit vector mode parameters are passed on the stack. */
11112 if (!TARGET_64BIT_MS_ABI)
11113 {
11114 container = NULL;
11115 break;
11116 }
11117 /* FALLTHRU */
11118
11119 default:
11120 container = construct_container (nat_mode, TYPE_MODE (type),
11121 type, 0, X86_64_REGPARM_MAX,
11122 X86_64_SSE_REGPARM_MAX, intreg,
11123 0);
11124 break;
11125 }
11126
11127 /* Pull the value out of the saved registers. */
11128
11129 addr = create_tmp_var (ptr_type_node, "addr");
11130
11131 if (container)
11132 {
11133 int needed_intregs, needed_sseregs;
11134 bool need_temp;
11135 tree int_addr, sse_addr;
11136
11137 lab_false = create_artificial_label (UNKNOWN_LOCATION);
11138 lab_over = create_artificial_label (UNKNOWN_LOCATION);
11139
11140 examine_argument (nat_mode, type, 0, &needed_intregs, &needed_sseregs);
11141
11142 need_temp = (!REG_P (container)
11143 && ((needed_intregs && TYPE_ALIGN (type) > 64)
11144 || TYPE_ALIGN (type) > 128));
11145
11146 /* In case we are passing a structure, verify that it is a consecutive block
11147 in the register save area. If not, we need to do moves. */
11148 if (!need_temp && !REG_P (container))
11149 {
11150 /* Verify that all registers are strictly consecutive. */
11151 if (SSE_REGNO_P (REGNO (XEXP (XVECEXP (container, 0, 0), 0))))
11152 {
11153 int i;
11154
11155 for (i = 0; i < XVECLEN (container, 0) && !need_temp; i++)
11156 {
11157 rtx slot = XVECEXP (container, 0, i);
11158 if (REGNO (XEXP (slot, 0)) != FIRST_SSE_REG + (unsigned int) i
11159 || INTVAL (XEXP (slot, 1)) != i * 16)
11160 need_temp = true;
11161 }
11162 }
11163 else
11164 {
11165 int i;
11166
11167 for (i = 0; i < XVECLEN (container, 0) && !need_temp; i++)
11168 {
11169 rtx slot = XVECEXP (container, 0, i);
11170 if (REGNO (XEXP (slot, 0)) != (unsigned int) i
11171 || INTVAL (XEXP (slot, 1)) != i * 8)
11172 need_temp = true;
11173 }
11174 }
11175 }
11176 if (!need_temp)
11177 {
11178 int_addr = addr;
11179 sse_addr = addr;
11180 }
11181 else
11182 {
11183 int_addr = create_tmp_var (ptr_type_node, "int_addr");
11184 sse_addr = create_tmp_var (ptr_type_node, "sse_addr");
11185 }
11186
11187 /* First ensure that we fit completely in registers. */
11188 if (needed_intregs)
11189 {
11190 t = build_int_cst (TREE_TYPE (gpr),
11191 (X86_64_REGPARM_MAX - needed_intregs + 1) * 8);
11192 t = build2 (GE_EXPR, boolean_type_node, gpr, t);
11193 t2 = build1 (GOTO_EXPR, void_type_node, lab_false);
11194 t = build3 (COND_EXPR, void_type_node, t, t2, NULL_TREE);
11195 gimplify_and_add (t, pre_p);
11196 }
11197 if (needed_sseregs)
11198 {
11199 t = build_int_cst (TREE_TYPE (fpr),
11200 (X86_64_SSE_REGPARM_MAX - needed_sseregs + 1) * 16
11201 + X86_64_REGPARM_MAX * 8);
11202 t = build2 (GE_EXPR, boolean_type_node, fpr, t);
11203 t2 = build1 (GOTO_EXPR, void_type_node, lab_false);
11204 t = build3 (COND_EXPR, void_type_node, t, t2, NULL_TREE);
11205 gimplify_and_add (t, pre_p);
11206 }
11207
11208 /* Compute index to start of area used for integer regs. */
11209 if (needed_intregs)
11210 {
11211 /* int_addr = gpr + sav; */
11212 t = fold_build_pointer_plus (sav, gpr);
11213 gimplify_assign (int_addr, t, pre_p);
11214 }
11215 if (needed_sseregs)
11216 {
11217 /* sse_addr = fpr + sav; */
11218 t = fold_build_pointer_plus (sav, fpr);
11219 gimplify_assign (sse_addr, t, pre_p);
11220 }
11221 if (need_temp)
11222 {
11223 int i, prev_size = 0;
11224 tree temp = create_tmp_var (type, "va_arg_tmp");
11225
11226 /* addr = &temp; */
11227 t = build1 (ADDR_EXPR, build_pointer_type (type), temp);
11228 gimplify_assign (addr, t, pre_p);
11229
11230 for (i = 0; i < XVECLEN (container, 0); i++)
11231 {
11232 rtx slot = XVECEXP (container, 0, i);
11233 rtx reg = XEXP (slot, 0);
11234 machine_mode mode = GET_MODE (reg);
11235 tree piece_type;
11236 tree addr_type;
11237 tree daddr_type;
11238 tree src_addr, src;
11239 int src_offset;
11240 tree dest_addr, dest;
11241 int cur_size = GET_MODE_SIZE (mode);
11242
11243 gcc_assert (prev_size <= INTVAL (XEXP (slot, 1)));
11244 prev_size = INTVAL (XEXP (slot, 1));
11245 if (prev_size + cur_size > size)
11246 {
11247 cur_size = size - prev_size;
11248 mode = mode_for_size (cur_size * BITS_PER_UNIT, MODE_INT, 1);
11249 if (mode == BLKmode)
11250 mode = QImode;
11251 }
11252 piece_type = lang_hooks.types.type_for_mode (mode, 1);
11253 if (mode == GET_MODE (reg))
11254 addr_type = build_pointer_type (piece_type);
11255 else
11256 addr_type = build_pointer_type_for_mode (piece_type, ptr_mode,
11257 true);
11258 daddr_type = build_pointer_type_for_mode (piece_type, ptr_mode,
11259 true);
11260
11261 if (SSE_REGNO_P (REGNO (reg)))
11262 {
11263 src_addr = sse_addr;
11264 src_offset = (REGNO (reg) - FIRST_SSE_REG) * 16;
11265 }
11266 else
11267 {
11268 src_addr = int_addr;
11269 src_offset = REGNO (reg) * 8;
11270 }
11271 src_addr = fold_convert (addr_type, src_addr);
11272 src_addr = fold_build_pointer_plus_hwi (src_addr, src_offset);
11273
11274 dest_addr = fold_convert (daddr_type, addr);
11275 dest_addr = fold_build_pointer_plus_hwi (dest_addr, prev_size);
11276 if (cur_size == GET_MODE_SIZE (mode))
11277 {
11278 src = build_va_arg_indirect_ref (src_addr);
11279 dest = build_va_arg_indirect_ref (dest_addr);
11280
11281 gimplify_assign (dest, src, pre_p);
11282 }
11283 else
11284 {
11285 tree copy
11286 = build_call_expr (builtin_decl_implicit (BUILT_IN_MEMCPY),
11287 3, dest_addr, src_addr,
11288 size_int (cur_size));
11289 gimplify_and_add (copy, pre_p);
11290 }
11291 prev_size += cur_size;
11292 }
11293 }
11294
11295 if (needed_intregs)
11296 {
11297 t = build2 (PLUS_EXPR, TREE_TYPE (gpr), gpr,
11298 build_int_cst (TREE_TYPE (gpr), needed_intregs * 8));
11299 gimplify_assign (gpr, t, pre_p);
11300 }
11301
11302 if (needed_sseregs)
11303 {
11304 t = build2 (PLUS_EXPR, TREE_TYPE (fpr), fpr,
11305 build_int_cst (TREE_TYPE (fpr), needed_sseregs * 16));
11306 gimplify_assign (unshare_expr (fpr), t, pre_p);
11307 }
11308
11309 gimple_seq_add_stmt (pre_p, gimple_build_goto (lab_over));
11310
11311 gimple_seq_add_stmt (pre_p, gimple_build_label (lab_false));
11312 }
11313
11314 /* ... otherwise out of the overflow area. */
11315
11316 /* When we align a parameter on the stack for the caller, if its
11317 alignment is beyond MAX_SUPPORTED_STACK_ALIGNMENT, it will be
11318 aligned at MAX_SUPPORTED_STACK_ALIGNMENT. Match the callee here
11319 with the caller. */
11320 arg_boundary = ix86_function_arg_boundary (VOIDmode, type);
11321 if ((unsigned int) arg_boundary > MAX_SUPPORTED_STACK_ALIGNMENT)
11322 arg_boundary = MAX_SUPPORTED_STACK_ALIGNMENT;
11323
11324 /* Care for on-stack alignment if needed. */
11325 if (arg_boundary <= 64 || size == 0)
11326 t = ovf;
11327 else
11328 {
11329 HOST_WIDE_INT align = arg_boundary / 8;
11330 t = fold_build_pointer_plus_hwi (ovf, align - 1);
11331 t = build2 (BIT_AND_EXPR, TREE_TYPE (t), t,
11332 build_int_cst (TREE_TYPE (t), -align));
11333 }
11334
11335 gimplify_expr (&t, pre_p, NULL, is_gimple_val, fb_rvalue);
11336 gimplify_assign (addr, t, pre_p);
11337
11338 t = fold_build_pointer_plus_hwi (t, rsize * UNITS_PER_WORD);
11339 gimplify_assign (unshare_expr (ovf), t, pre_p);
11340
11341 if (container)
11342 gimple_seq_add_stmt (pre_p, gimple_build_label (lab_over));
11343
11344 ptrtype = build_pointer_type_for_mode (type, ptr_mode, true);
11345 addr = fold_convert (ptrtype, addr);
11346
11347 if (indirect_p)
11348 addr = build_va_arg_indirect_ref (addr);
11349 return build_va_arg_indirect_ref (addr);
11350 }
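/* Illustrative sketch, not part of GCC proper: for va_arg (ap, int) the
   GIMPLE built above behaves like the following C:

     if (ap->gp_offset >= 48)
       goto overflow;                          no GPR slot left
     addr = ap->reg_save_area + ap->gp_offset;
     ap->gp_offset += 8;
     goto done;
   overflow:
     addr = ap->overflow_arg_area;             aligned first if needed
     ap->overflow_arg_area += 8;
   done:
     result = *(int *) addr;

   SSE-class types use fp_offset and 16-byte slots instead.  */
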
11351 \f
11352 /* Return true if OPNUM's MEM should be matched
11353 in movabs* patterns. */
11354
11355 bool
11356 ix86_check_movabs (rtx insn, int opnum)
11357 {
11358 rtx set, mem;
11359
11360 set = PATTERN (insn);
11361 if (GET_CODE (set) == PARALLEL)
11362 set = XVECEXP (set, 0, 0);
11363 gcc_assert (GET_CODE (set) == SET);
11364 mem = XEXP (set, opnum);
11365 while (SUBREG_P (mem))
11366 mem = SUBREG_REG (mem);
11367 gcc_assert (MEM_P (mem));
11368 return volatile_ok || !MEM_VOLATILE_P (mem);
11369 }
11370
11371 /* Return false if INSN contains a MEM with a non-default address space. */
11372 bool
11373 ix86_check_no_addr_space (rtx insn)
11374 {
11375 subrtx_var_iterator::array_type array;
11376 FOR_EACH_SUBRTX_VAR (iter, array, PATTERN (insn), ALL)
11377 {
11378 rtx x = *iter;
11379 if (MEM_P (x) && !ADDR_SPACE_GENERIC_P (MEM_ADDR_SPACE (x)))
11380 return false;
11381 }
11382 return true;
11383 }
11384 \f
11385 /* Initialize the table of extra 80387 mathematical constants. */
11386
11387 static void
11388 init_ext_80387_constants (void)
11389 {
11390 static const char * cst[5] =
11391 {
11392 "0.3010299956639811952256464283594894482", /* 0: fldlg2 */
11393 "0.6931471805599453094286904741849753009", /* 1: fldln2 */
11394 "1.4426950408889634073876517827983434472", /* 2: fldl2e */
11395 "3.3219280948873623478083405569094566090", /* 3: fldl2t */
11396 "3.1415926535897932385128089594061862044", /* 4: fldpi */
11397 };
11398 int i;
11399
11400 for (i = 0; i < 5; i++)
11401 {
11402 real_from_string (&ext_80387_constants_table[i], cst[i]);
11403 /* Ensure each constant is rounded to XFmode precision. */
11404 real_convert (&ext_80387_constants_table[i],
11405 XFmode, &ext_80387_constants_table[i]);
11406 }
11407
11408 ext_80387_constants_init = 1;
11409 }
11410
11411 /* Return non-zero if the constant is something that
11412 can be loaded with a special instruction. */
11413
11414 int
11415 standard_80387_constant_p (rtx x)
11416 {
11417 machine_mode mode = GET_MODE (x);
11418
11419 const REAL_VALUE_TYPE *r;
11420
11421 if (!(CONST_DOUBLE_P (x) && X87_FLOAT_MODE_P (mode)))
11422 return -1;
11423
11424 if (x == CONST0_RTX (mode))
11425 return 1;
11426 if (x == CONST1_RTX (mode))
11427 return 2;
11428
11429 r = CONST_DOUBLE_REAL_VALUE (x);
11430
11431 /* For XFmode constants, try to find a special 80387 instruction when
11432 optimizing for size or on those CPUs that benefit from them. */
11433 if (mode == XFmode
11434 && (optimize_function_for_size_p (cfun) || TARGET_EXT_80387_CONSTANTS))
11435 {
11436 int i;
11437
11438 if (! ext_80387_constants_init)
11439 init_ext_80387_constants ();
11440
11441 for (i = 0; i < 5; i++)
11442 if (real_identical (r, &ext_80387_constants_table[i]))
11443 return i + 3;
11444 }
11445
11446 /* A load of the constant -0.0 or -1.0 will be split into an
11447 fldz;fchs or fld1;fchs sequence. */
11448 if (real_isnegzero (r))
11449 return 8;
11450 if (real_identical (r, &dconstm1))
11451 return 9;
11452
11453 return 0;
11454 }
11455
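/* Illustrative sketch, not part of GCC proper: constants recognized above,
   as they might appear in x87 code generation:

     long double z = 0.0L;     loaded with fldz           (return value 1)
     long double o = 1.0L;     loaded with fld1           (return value 2)
     long double m = -1.0L;    split into fld1; fchs      (return value 9)

   Values such as pi or log2(e) that the compiler materializes internally
   in XFmode may additionally map to fldpi, fldl2e and friends (return
   values 3..7) when the extended constant table is enabled.  */
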
11456 /* Return the opcode of the special instruction to be used to load
11457 the constant X. */
11458
11459 const char *
11460 standard_80387_constant_opcode (rtx x)
11461 {
11462 switch (standard_80387_constant_p (x))
11463 {
11464 case 1:
11465 return "fldz";
11466 case 2:
11467 return "fld1";
11468 case 3:
11469 return "fldlg2";
11470 case 4:
11471 return "fldln2";
11472 case 5:
11473 return "fldl2e";
11474 case 6:
11475 return "fldl2t";
11476 case 7:
11477 return "fldpi";
11478 case 8:
11479 case 9:
11480 return "#";
11481 default:
11482 gcc_unreachable ();
11483 }
11484 }
11485
11486 /* Return the CONST_DOUBLE representing the 80387 constant that is
11487 loaded by the specified special instruction. The argument IDX
11488 matches the return value from standard_80387_constant_p. */
11489
11490 rtx
11491 standard_80387_constant_rtx (int idx)
11492 {
11493 int i;
11494
11495 if (! ext_80387_constants_init)
11496 init_ext_80387_constants ();
11497
11498 switch (idx)
11499 {
11500 case 3:
11501 case 4:
11502 case 5:
11503 case 6:
11504 case 7:
11505 i = idx - 3;
11506 break;
11507
11508 default:
11509 gcc_unreachable ();
11510 }
11511
11512 return const_double_from_real_value (ext_80387_constants_table[i],
11513 XFmode);
11514 }
11515
11516 /* Return 1 if X is all bits 0, and 2 if X is all bits 1, in a
11517 supported SSE/AVX vector mode. */
11518
11519 int
11520 standard_sse_constant_p (rtx x, machine_mode pred_mode)
11521 {
11522 machine_mode mode;
11523
11524 if (!TARGET_SSE)
11525 return 0;
11526
11527 mode = GET_MODE (x);
11528
11529 if (x == const0_rtx || const0_operand (x, mode))
11530 return 1;
11531
11532 if (x == constm1_rtx || vector_all_ones_operand (x, mode))
11533 {
11534 /* VOIDmode integer constant, get mode from the predicate. */
11535 if (mode == VOIDmode)
11536 mode = pred_mode;
11537
11538 switch (GET_MODE_SIZE (mode))
11539 {
11540 case 64:
11541 if (TARGET_AVX512F)
11542 return 2;
11543 break;
11544 case 32:
11545 if (TARGET_AVX2)
11546 return 2;
11547 break;
11548 case 16:
11549 if (TARGET_SSE2)
11550 return 2;
11551 break;
11552 case 0:
11553 /* VOIDmode */
11554 gcc_unreachable ();
11555 default:
11556 break;
11557 }
11558 }
11559
11560 return 0;
11561 }
11562
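/* Illustrative sketch, not part of GCC proper: typical user-level sources
   of the two special operands recognized above (requires <emmintrin.h>):

     __m128i zero = _mm_setzero_si128 ();   all-zeros, e.g. pxor
     __m128i ones = _mm_set1_epi32 (-1);    all-ones,  e.g. pcmpeqd

   The exact instruction is chosen later by standard_sse_constant_opcode
   based on the insn's mode attribute.  */
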
11563 /* Return the opcode of the special instruction to be used to load
11564 the constant X. */
11565
11566 const char *
11567 standard_sse_constant_opcode (rtx_insn *insn, rtx x)
11568 {
11569 machine_mode mode;
11570
11571 gcc_assert (TARGET_SSE);
11572
11573 mode = GET_MODE (x);
11574
11575 if (x == const0_rtx || const0_operand (x, mode))
11576 {
11577 switch (get_attr_mode (insn))
11578 {
11579 case MODE_XI:
11580 return "vpxord\t%g0, %g0, %g0";
11581 case MODE_OI:
11582 return (TARGET_AVX512VL
11583 ? "vpxord\t%x0, %x0, %x0"
11584 : "vpxor\t%x0, %x0, %x0");
11585 case MODE_TI:
11586 return (TARGET_AVX512VL
11587 ? "vpxord\t%t0, %t0, %t0"
11588 : "%vpxor\t%0, %d0");
11589
11590 case MODE_V8DF:
11591 return (TARGET_AVX512DQ
11592 ? "vxorpd\t%g0, %g0, %g0"
11593 : "vpxorq\t%g0, %g0, %g0");
11594 case MODE_V4DF:
11595 return "vxorpd\t%x0, %x0, %x0";
11596 case MODE_V2DF:
11597 return "%vxorpd\t%0, %d0";
11598
11599 case MODE_V16SF:
11600 return (TARGET_AVX512DQ
11601 ? "vxorps\t%g0, %g0, %g0"
11602 : "vpxord\t%g0, %g0, %g0");
11603 case MODE_V8SF:
11604 return "vxorps\t%x0, %x0, %x0";
11605 case MODE_V4SF:
11606 return "%vxorps\t%0, %d0";
11607
11608 default:
11609 gcc_unreachable ();
11610 }
11611 }
11612 else if (x == constm1_rtx || vector_all_ones_operand (x, mode))
11613 {
11614 enum attr_mode insn_mode = get_attr_mode (insn);
11615
11616 switch (insn_mode)
11617 {
11618 case MODE_XI:
11619 case MODE_V8DF:
11620 case MODE_V16SF:
11621 gcc_assert (TARGET_AVX512F);
11622 return "vpternlogd\t{$0xFF, %g0, %g0, %g0|%g0, %g0, %g0, 0xFF}";
11623
11624 case MODE_OI:
11625 case MODE_V4DF:
11626 case MODE_V8SF:
11627 gcc_assert (TARGET_AVX2);
11628 /* FALLTHRU */
11629 case MODE_TI:
11630 case MODE_V2DF:
11631 case MODE_V4SF:
11632 gcc_assert (TARGET_SSE2);
11633 return (TARGET_AVX
11634 ? "vpcmpeqd\t%0, %0, %0"
11635 : "pcmpeqd\t%0, %0");
11636
11637 default:
11638 gcc_unreachable ();
11639 }
11640 }
11641
11642 gcc_unreachable ();
11643 }
11644
11645 /* Returns true if INSN can be transformed from a memory load
11646 to a supported FP constant load. */
11647
11648 bool
11649 ix86_standard_x87sse_constant_load_p (const rtx_insn *insn, rtx dst)
11650 {
11651 rtx src = find_constant_src (insn);
11652
11653 gcc_assert (REG_P (dst));
11654
11655 if (src == NULL
11656 || (SSE_REGNO_P (REGNO (dst))
11657 && standard_sse_constant_p (src, GET_MODE (dst)) != 1)
11658 || (STACK_REGNO_P (REGNO (dst))
11659 && standard_80387_constant_p (src) < 1))
11660 return false;
11661
11662 return true;
11663 }
11664
11665 /* Returns true if OP contains a symbol reference. */
11666
11667 bool
11668 symbolic_reference_mentioned_p (rtx op)
11669 {
11670 const char *fmt;
11671 int i;
11672
11673 if (GET_CODE (op) == SYMBOL_REF || GET_CODE (op) == LABEL_REF)
11674 return true;
11675
11676 fmt = GET_RTX_FORMAT (GET_CODE (op));
11677 for (i = GET_RTX_LENGTH (GET_CODE (op)) - 1; i >= 0; i--)
11678 {
11679 if (fmt[i] == 'E')
11680 {
11681 int j;
11682
11683 for (j = XVECLEN (op, i) - 1; j >= 0; j--)
11684 if (symbolic_reference_mentioned_p (XVECEXP (op, i, j)))
11685 return true;
11686 }
11687
11688 else if (fmt[i] == 'e' && symbolic_reference_mentioned_p (XEXP (op, i)))
11689 return true;
11690 }
11691
11692 return false;
11693 }
11694
11695 /* Return true if it is appropriate to emit `ret' instructions in the
11696 body of a function. Do this only if the epilogue is simple, needing a
11697 couple of insns. Prior to reloading, we can't tell how many registers
11698 must be saved, so return false then. Return false if there is no frame
11699 marker to de-allocate. */
11700
11701 bool
11702 ix86_can_use_return_insn_p (void)
11703 {
11704 struct ix86_frame frame;
11705
11706 /* Don't use `ret' instruction in interrupt handler. */
11707 if (! reload_completed
11708 || frame_pointer_needed
11709 || cfun->machine->func_type != TYPE_NORMAL)
11710 return 0;
11711
11712 /* Don't allow more than 32k pop, since that's all we can do
11713 with one instruction. */
11714 if (crtl->args.pops_args && crtl->args.size >= 32768)
11715 return 0;
11716
11717 ix86_compute_frame_layout (&frame);
11718 return (frame.stack_pointer_offset == UNITS_PER_WORD
11719 && (frame.nregs + frame.nsseregs) == 0);
11720 }
11721 \f
11722 /* Value should be nonzero if functions must have frame pointers.
11723 Zero means the frame pointer need not be set up (and parms may
11724 be accessed via the stack pointer) in functions that seem suitable. */
11725
11726 static bool
11727 ix86_frame_pointer_required (void)
11728 {
11729 /* If we accessed previous frames, then the generated code expects
11730 to be able to access the saved ebp value in our frame. */
11731 if (cfun->machine->accesses_prev_frame)
11732 return true;
11733
11734 /* Several x86 OSes need a frame pointer for other reasons,
11735 usually pertaining to setjmp. */
11736 if (SUBTARGET_FRAME_POINTER_REQUIRED)
11737 return true;
11738
11739 /* For older 32-bit runtimes setjmp requires valid frame-pointer. */
11740 if (TARGET_32BIT_MS_ABI && cfun->calls_setjmp)
11741 return true;
11742
11743 /* For Win64 SEH, very large frames need a frame pointer, as the maximum
11744 stack allocation is 4GB. */
11745 if (TARGET_64BIT_MS_ABI && get_frame_size () > SEH_MAX_FRAME_SIZE)
11746 return true;
11747
11748 /* SSE saves require a frame pointer when the stack is misaligned. */
11749 if (TARGET_64BIT_MS_ABI && ix86_incoming_stack_boundary < 128)
11750 return true;
11751
11752 /* In ix86_option_override_internal, TARGET_OMIT_LEAF_FRAME_POINTER
11753 turns off the frame pointer by default. Turn it back on now if
11754 we've not got a leaf function. */
11755 if (TARGET_OMIT_LEAF_FRAME_POINTER
11756 && (!crtl->is_leaf
11757 || ix86_current_function_calls_tls_descriptor))
11758 return true;
11759
11760 if (crtl->profile && !flag_fentry)
11761 return true;
11762
11763 return false;
11764 }
11765
11766 /* Record that the current function accesses previous call frames. */
11767
11768 void
11769 ix86_setup_frame_addresses (void)
11770 {
11771 cfun->machine->accesses_prev_frame = 1;
11772 }
11773 \f
11774 #ifndef USE_HIDDEN_LINKONCE
11775 # if defined(HAVE_GAS_HIDDEN) && (SUPPORTS_ONE_ONLY - 0)
11776 # define USE_HIDDEN_LINKONCE 1
11777 # else
11778 # define USE_HIDDEN_LINKONCE 0
11779 # endif
11780 #endif
11781
11782 static int pic_labels_used;
11783
11784 /* Fills in the label name that should be used for a pc thunk for
11785 the given register. */
11786
11787 static void
11788 get_pc_thunk_name (char name[32], unsigned int regno)
11789 {
11790 gcc_assert (!TARGET_64BIT);
11791
11792 if (USE_HIDDEN_LINKONCE)
11793 sprintf (name, "__x86.get_pc_thunk.%s", reg_names[regno]);
11794 else
11795 ASM_GENERATE_INTERNAL_LABEL (name, "LPR", regno);
11796 }
11797
11798
11799 /* This function generates code for -fpic that loads %ebx with
11800 the return address of the caller and then returns. */
11801
11802 static void
11803 ix86_code_end (void)
11804 {
11805 rtx xops[2];
11806 int regno;
11807
11808 for (regno = AX_REG; regno <= SP_REG; regno++)
11809 {
11810 char name[32];
11811 tree decl;
11812
11813 if (!(pic_labels_used & (1 << regno)))
11814 continue;
11815
11816 get_pc_thunk_name (name, regno);
11817
11818 decl = build_decl (BUILTINS_LOCATION, FUNCTION_DECL,
11819 get_identifier (name),
11820 build_function_type_list (void_type_node, NULL_TREE));
11821 DECL_RESULT (decl) = build_decl (BUILTINS_LOCATION, RESULT_DECL,
11822 NULL_TREE, void_type_node);
11823 TREE_PUBLIC (decl) = 1;
11824 TREE_STATIC (decl) = 1;
11825 DECL_IGNORED_P (decl) = 1;
11826
11827 #if TARGET_MACHO
11828 if (TARGET_MACHO)
11829 {
11830 switch_to_section (darwin_sections[text_coal_section]);
11831 fputs ("\t.weak_definition\t", asm_out_file);
11832 assemble_name (asm_out_file, name);
11833 fputs ("\n\t.private_extern\t", asm_out_file);
11834 assemble_name (asm_out_file, name);
11835 putc ('\n', asm_out_file);
11836 ASM_OUTPUT_LABEL (asm_out_file, name);
11837 DECL_WEAK (decl) = 1;
11838 }
11839 else
11840 #endif
11841 if (USE_HIDDEN_LINKONCE)
11842 {
11843 cgraph_node::create (decl)->set_comdat_group (DECL_ASSEMBLER_NAME (decl));
11844
11845 targetm.asm_out.unique_section (decl, 0);
11846 switch_to_section (get_named_section (decl, NULL, 0));
11847
11848 targetm.asm_out.globalize_label (asm_out_file, name);
11849 fputs ("\t.hidden\t", asm_out_file);
11850 assemble_name (asm_out_file, name);
11851 putc ('\n', asm_out_file);
11852 ASM_DECLARE_FUNCTION_NAME (asm_out_file, name, decl);
11853 }
11854 else
11855 {
11856 switch_to_section (text_section);
11857 ASM_OUTPUT_LABEL (asm_out_file, name);
11858 }
11859
11860 DECL_INITIAL (decl) = make_node (BLOCK);
11861 current_function_decl = decl;
11862 allocate_struct_function (decl, false);
11863 init_function_start (decl);
11864 first_function_block_is_cold = false;
11865 /* Make sure unwind info is emitted for the thunk if needed. */
11866 final_start_function (emit_barrier (), asm_out_file, 1);
11867
11868 /* Pad stack IP move with 4 instructions (two NOPs count
11869 as one instruction). */
11870 if (TARGET_PAD_SHORT_FUNCTION)
11871 {
11872 int i = 8;
11873
11874 while (i--)
11875 fputs ("\tnop\n", asm_out_file);
11876 }
11877
11878 xops[0] = gen_rtx_REG (Pmode, regno);
11879 xops[1] = gen_rtx_MEM (Pmode, stack_pointer_rtx);
11880 output_asm_insn ("mov%z0\t{%1, %0|%0, %1}", xops);
11881 output_asm_insn ("%!ret", NULL);
11882 final_end_function ();
11883 init_insn_lengths ();
11884 free_after_compilation (cfun);
11885 set_cfun (NULL);
11886 current_function_decl = NULL;
11887 }
11888
11889 if (flag_split_stack)
11890 file_end_indicate_split_stack ();
11891 }
11892
11893 /* Emit code for the SET_GOT patterns. */
11894
11895 const char *
11896 output_set_got (rtx dest, rtx label)
11897 {
11898 rtx xops[3];
11899
11900 xops[0] = dest;
11901
11902 if (TARGET_VXWORKS_RTP && flag_pic)
11903 {
11904 /* Load (*VXWORKS_GOTT_BASE) into the PIC register. */
11905 xops[2] = gen_rtx_MEM (Pmode,
11906 gen_rtx_SYMBOL_REF (Pmode, VXWORKS_GOTT_BASE));
11907 output_asm_insn ("mov{l}\t{%2, %0|%0, %2}", xops);
11908
11909 /* Load (*VXWORKS_GOTT_BASE)[VXWORKS_GOTT_INDEX] into the PIC register.
11910 Use %P and a local symbol in order to print VXWORKS_GOTT_INDEX as
11911 an unadorned address. */
11912 xops[2] = gen_rtx_SYMBOL_REF (Pmode, VXWORKS_GOTT_INDEX);
11913 SYMBOL_REF_FLAGS (xops[2]) |= SYMBOL_FLAG_LOCAL;
11914 output_asm_insn ("mov{l}\t{%P2(%0), %0|%0, DWORD PTR %P2[%0]}", xops);
11915 return "";
11916 }
11917
11918 xops[1] = gen_rtx_SYMBOL_REF (Pmode, GOT_SYMBOL_NAME);
11919
11920 if (flag_pic)
11921 {
11922 char name[32];
11923 get_pc_thunk_name (name, REGNO (dest));
11924 pic_labels_used |= 1 << REGNO (dest);
11925
11926 xops[2] = gen_rtx_SYMBOL_REF (Pmode, ggc_strdup (name));
11927 xops[2] = gen_rtx_MEM (QImode, xops[2]);
11928 output_asm_insn ("%!call\t%X2", xops);
11929
11930 #if TARGET_MACHO
11931 /* Output the Mach-O "canonical" pic base label name ("Lxx$pb") here.
11932 This is what will be referenced by the Mach-O PIC subsystem. */
11933 if (machopic_should_output_picbase_label () || !label)
11934 ASM_OUTPUT_LABEL (asm_out_file, MACHOPIC_FUNCTION_BASE_NAME);
11935
11936 /* When we are restoring the pic base at the site of a nonlocal label,
11937 and we decided to emit the pic base above, we will still output a
11938 local label used for calculating the correction offset (even though
11939 the offset will be 0 in that case). */
11940 if (label)
11941 targetm.asm_out.internal_label (asm_out_file, "L",
11942 CODE_LABEL_NUMBER (label));
11943 #endif
11944 }
11945 else
11946 {
11947 if (TARGET_MACHO)
11948 /* We don't need a pic base, we're not producing pic. */
11949 gcc_unreachable ();
11950
11951 xops[2] = gen_rtx_LABEL_REF (Pmode, label ? label : gen_label_rtx ());
11952 output_asm_insn ("mov%z0\t{%2, %0|%0, %2}", xops);
11953 targetm.asm_out.internal_label (asm_out_file, "L",
11954 CODE_LABEL_NUMBER (XEXP (xops[2], 0)));
11955 }
11956
11957 if (!TARGET_MACHO)
11958 output_asm_insn ("add%z0\t{%1, %0|%0, %1}", xops);
11959
11960 return "";
11961 }
11962
11963 /* Generate a "push" pattern for input ARG.  */
11964
11965 static rtx
11966 gen_push (rtx arg)
11967 {
11968 struct machine_function *m = cfun->machine;
11969
11970 if (m->fs.cfa_reg == stack_pointer_rtx)
11971 m->fs.cfa_offset += UNITS_PER_WORD;
11972 m->fs.sp_offset += UNITS_PER_WORD;
11973
11974 if (REG_P (arg) && GET_MODE (arg) != word_mode)
11975 arg = gen_rtx_REG (word_mode, REGNO (arg));
11976
11977 return gen_rtx_SET (gen_rtx_MEM (word_mode,
11978 gen_rtx_PRE_DEC (Pmode,
11979 stack_pointer_rtx)),
11980 arg);
11981 }
11982
11983 /* Generate a "pop" pattern for input ARG.  */
11984
11985 static rtx
11986 gen_pop (rtx arg)
11987 {
11988 if (REG_P (arg) && GET_MODE (arg) != word_mode)
11989 arg = gen_rtx_REG (word_mode, REGNO (arg));
11990
11991 return gen_rtx_SET (arg,
11992 gen_rtx_MEM (word_mode,
11993 gen_rtx_POST_INC (Pmode,
11994 stack_pointer_rtx)));
11995 }
11996
11997 /* Return the number of an unused call-clobbered register if one is
11998 available for the entire function, or INVALID_REGNUM otherwise.  */
11999
12000 static unsigned int
12001 ix86_select_alt_pic_regnum (void)
12002 {
12003 if (ix86_use_pseudo_pic_reg ())
12004 return INVALID_REGNUM;
12005
12006 if (crtl->is_leaf
12007 && !crtl->profile
12008 && !ix86_current_function_calls_tls_descriptor)
12009 {
12010 int i, drap;
12011 /* Can't use the same register for both PIC and DRAP. */
12012 if (crtl->drap_reg)
12013 drap = REGNO (crtl->drap_reg);
12014 else
12015 drap = -1;
12016 for (i = 2; i >= 0; --i)
12017 if (i != drap && !df_regs_ever_live_p (i))
12018 return i;
12019 }
12020
12021 return INVALID_REGNUM;
12022 }
12023
12024 /* Return true if REGNO is used by the epilogue. */
12025
12026 bool
12027 ix86_epilogue_uses (int regno)
12028 {
12029 /* If there are no caller-saved registers, we preserve all registers,
12030 except for MMX and x87 registers which aren't supported when saving
12031 and restoring registers. Don't explicitly save SP register since
12032 it is always preserved. */
12033 return (epilogue_completed
12034 && cfun->machine->no_caller_saved_registers
12035 && !fixed_regs[regno]
12036 && !STACK_REGNO_P (regno)
12037 && !MMX_REGNO_P (regno));
12038 }
12039
12040 /* Return nonzero if register REGNO can be used as a scratch register
12041 in peephole2. */
12042
12043 static bool
12044 ix86_hard_regno_scratch_ok (unsigned int regno)
12045 {
12046 /* If there are no caller-saved registers, we can't use any register
12047 as a scratch register after the epilogue; use REGNO as a scratch
12048 register only if it has been used before, to avoid saving and
12049 restoring it.  */
12050 return (!cfun->machine->no_caller_saved_registers
12051 || (!epilogue_completed
12052 && df_regs_ever_live_p (regno)));
12053 }
12054
12055 /* Return TRUE if we need to save REGNO. */
12056
12057 static bool
12058 ix86_save_reg (unsigned int regno, bool maybe_eh_return)
12059 {
12060 /* If there are no caller-saved registers, we preserve all registers,
12061 except for MMX and x87 registers which aren't supported when saving
12062 and restoring registers. Don't explicitly save SP register since
12063 it is always preserved. */
12064 if (cfun->machine->no_caller_saved_registers)
12065 {
12066 /* Don't preserve registers used for function return value. */
12067 rtx reg = crtl->return_rtx;
12068 if (reg)
12069 {
12070 unsigned int i = REGNO (reg);
12071 unsigned int nregs = hard_regno_nregs[i][GET_MODE (reg)];
12072 while (nregs-- > 0)
12073 if ((i + nregs) == regno)
12074 return false;
12075
12076 reg = crtl->return_bnd;
12077 if (reg)
12078 {
12079 i = REGNO (reg);
12080 nregs = hard_regno_nregs[i][GET_MODE (reg)];
12081 while (nregs-- > 0)
12082 if ((i + nregs) == regno)
12083 return false;
12084 }
12085 }
12086
12087 return (df_regs_ever_live_p (regno)
12088 && !fixed_regs[regno]
12089 && !STACK_REGNO_P (regno)
12090 && !MMX_REGNO_P (regno)
12091 && (regno != HARD_FRAME_POINTER_REGNUM
12092 || !frame_pointer_needed));
12093 }
12094
12095 if (regno == REAL_PIC_OFFSET_TABLE_REGNUM
12096 && pic_offset_table_rtx)
12097 {
12098 if (ix86_use_pseudo_pic_reg ())
12099 {
12100 /* REAL_PIC_OFFSET_TABLE_REGNUM is used by the call to
12101 _mcount in the prologue.  */
12102 if (!TARGET_64BIT && flag_pic && crtl->profile)
12103 return true;
12104 }
12105 else if (df_regs_ever_live_p (REAL_PIC_OFFSET_TABLE_REGNUM)
12106 || crtl->profile
12107 || crtl->calls_eh_return
12108 || crtl->uses_const_pool
12109 || cfun->has_nonlocal_label)
12110 return ix86_select_alt_pic_regnum () == INVALID_REGNUM;
12111 }
12112
12113 if (crtl->calls_eh_return && maybe_eh_return)
12114 {
12115 unsigned i;
12116 for (i = 0; ; i++)
12117 {
12118 unsigned test = EH_RETURN_DATA_REGNO (i);
12119 if (test == INVALID_REGNUM)
12120 break;
12121 if (test == regno)
12122 return true;
12123 }
12124 }
12125
12126 if (crtl->drap_reg
12127 && regno == REGNO (crtl->drap_reg)
12128 && !cfun->machine->no_drap_save_restore)
12129 return true;
12130
12131 return (df_regs_ever_live_p (regno)
12132 && !call_used_regs[regno]
12133 && !fixed_regs[regno]
12134 && (regno != HARD_FRAME_POINTER_REGNUM || !frame_pointer_needed));
12135 }
12136
12137 /* Return number of saved general purpose registers.  */
12138
12139 static int
12140 ix86_nsaved_regs (void)
12141 {
12142 int nregs = 0;
12143 int regno;
12144
12145 for (regno = 0; regno < FIRST_PSEUDO_REGISTER; regno++)
12146 if (GENERAL_REGNO_P (regno) && ix86_save_reg (regno, true))
12147 nregs ++;
12148 return nregs;
12149 }
12150
12151 /* Return number of saved SSE registers. */
12152
12153 static int
12154 ix86_nsaved_sseregs (void)
12155 {
12156 int nregs = 0;
12157 int regno;
12158
12159 if (!TARGET_64BIT_MS_ABI)
12160 return 0;
12161 for (regno = 0; regno < FIRST_PSEUDO_REGISTER; regno++)
12162 if (SSE_REGNO_P (regno) && ix86_save_reg (regno, true))
12163 nregs ++;
12164 return nregs;
12165 }
12166
12167 /* Given FROM and TO register numbers, say whether this elimination is
12168 allowed. If stack alignment is needed, we can only replace argument
12169 pointer with hard frame pointer, or replace frame pointer with stack
12170 pointer. Otherwise, frame pointer elimination is automatically
12171 handled and all other eliminations are valid. */
12172
12173 static bool
12174 ix86_can_eliminate (const int from, const int to)
12175 {
12176 if (stack_realign_fp)
12177 return ((from == ARG_POINTER_REGNUM
12178 && to == HARD_FRAME_POINTER_REGNUM)
12179 || (from == FRAME_POINTER_REGNUM
12180 && to == STACK_POINTER_REGNUM));
12181 else
12182 return to == STACK_POINTER_REGNUM ? !frame_pointer_needed : true;
12183 }
12184
12185 /* Return the offset between two registers, one to be eliminated, and the other
12186 its replacement, at the start of a routine. */
12187
12188 HOST_WIDE_INT
12189 ix86_initial_elimination_offset (int from, int to)
12190 {
12191 struct ix86_frame frame;
12192 ix86_compute_frame_layout (&frame);
12193
12194 if (from == ARG_POINTER_REGNUM && to == HARD_FRAME_POINTER_REGNUM)
12195 return frame.hard_frame_pointer_offset;
12196 else if (from == FRAME_POINTER_REGNUM
12197 && to == HARD_FRAME_POINTER_REGNUM)
12198 return frame.hard_frame_pointer_offset - frame.frame_pointer_offset;
12199 else
12200 {
12201 gcc_assert (to == STACK_POINTER_REGNUM);
12202
12203 if (from == ARG_POINTER_REGNUM)
12204 return frame.stack_pointer_offset;
12205
12206 gcc_assert (from == FRAME_POINTER_REGNUM);
12207 return frame.stack_pointer_offset - frame.frame_pointer_offset;
12208 }
12209 }
12210
12211 /* In a dynamically-aligned function, we can't know the offset from
12212 stack pointer to frame pointer, so we must ensure that setjmp
12213 eliminates fp against the hard fp (%ebp) rather than trying to
12214 index from %esp up to the top of the frame across a gap that is
12215 of unknown (at compile-time) size. */
12216 static rtx
12217 ix86_builtin_setjmp_frame_value (void)
12218 {
12219 return stack_realign_fp ? hard_frame_pointer_rtx : virtual_stack_vars_rtx;
12220 }
12221
12222 /* When using -fsplit-stack, the allocation routines set a field in
12223 the TCB to the bottom of the stack plus this much space, measured
12224 in bytes. */
12225
12226 #define SPLIT_STACK_AVAILABLE 256
12227
12228 /* Fill structure ix86_frame about frame of currently computed function. */
12229
12230 static void
12231 ix86_compute_frame_layout (struct ix86_frame *frame)
12232 {
12233 unsigned HOST_WIDE_INT stack_alignment_needed;
12234 HOST_WIDE_INT offset;
12235 unsigned HOST_WIDE_INT preferred_alignment;
12236 HOST_WIDE_INT size = get_frame_size ();
12237 HOST_WIDE_INT to_allocate;
12238
12239 frame->nregs = ix86_nsaved_regs ();
12240 frame->nsseregs = ix86_nsaved_sseregs ();
12241
12242 /* The 64-bit MS ABI seems to require stack alignment to always be 16,
12243 except for function prologues, leaf functions and when the default
12244 incoming stack boundary is overridden at the command line or via the
12245 force_align_arg_pointer attribute.  */
12246 if ((TARGET_64BIT_MS_ABI && crtl->preferred_stack_boundary < 128)
12247 && (!crtl->is_leaf || cfun->calls_alloca != 0
12248 || ix86_current_function_calls_tls_descriptor
12249 || ix86_incoming_stack_boundary < 128))
12250 {
12251 crtl->preferred_stack_boundary = 128;
12252 crtl->stack_alignment_needed = 128;
12253 }
12254
12255 stack_alignment_needed = crtl->stack_alignment_needed / BITS_PER_UNIT;
12256 preferred_alignment = crtl->preferred_stack_boundary / BITS_PER_UNIT;
12257
12258 gcc_assert (!size || stack_alignment_needed);
12259 gcc_assert (preferred_alignment >= STACK_BOUNDARY / BITS_PER_UNIT);
12260 gcc_assert (preferred_alignment <= stack_alignment_needed);
12261
12262 /* For SEH we have to limit the amount of code movement into the prologue.
12263 At present we do this via a BLOCKAGE, at which point there's very little
12264 scheduling that can be done, which means that there's very little point
12265 in doing anything except PUSHs. */
12266 if (TARGET_SEH)
12267 cfun->machine->use_fast_prologue_epilogue = false;
12268
12269 /* During reload iteration the number of registers saved can change.
12270 Recompute the value as needed.  Do not recompute when the number of registers
12271 didn't change, as reload makes multiple calls to the function and does not
12272 expect the decision to change within a single iteration.  */
12273 else if (!optimize_bb_for_size_p (ENTRY_BLOCK_PTR_FOR_FN (cfun))
12274 && cfun->machine->use_fast_prologue_epilogue_nregs != frame->nregs)
12275 {
12276 int count = frame->nregs;
12277 struct cgraph_node *node = cgraph_node::get (current_function_decl);
12278
12279 cfun->machine->use_fast_prologue_epilogue_nregs = count;
12280
12281 /* The fast prologue uses move instead of push to save registers. This
12282 is significantly longer, but also executes faster as modern hardware
12283 can execute the moves in parallel, but can't do that for push/pop.
12284
12285 Be careful about choosing which prologue to emit: when the function
12286 takes many instructions to execute we may as well use the slow version,
12287 and the same holds when the function is known to be outside a hot spot
12288 (this is known with feedback only).  Weight the size of the function by
12289 the number of registers to save, as it is cheap to use one or two push
12290 instructions but very slow to use many of them.  */
12291 if (count)
12292 count = (count - 1) * FAST_PROLOGUE_INSN_COUNT;
12293 if (node->frequency < NODE_FREQUENCY_NORMAL
12294 || (flag_branch_probabilities
12295 && node->frequency < NODE_FREQUENCY_HOT))
12296 cfun->machine->use_fast_prologue_epilogue = false;
12297 else
12298 cfun->machine->use_fast_prologue_epilogue
12299 = !expensive_function_p (count);
12300 }
12301
12302 frame->save_regs_using_mov
12303 = (TARGET_PROLOGUE_USING_MOVE && cfun->machine->use_fast_prologue_epilogue
12304 /* If static stack checking is enabled and done with probes,
12305 the registers need to be saved before allocating the frame. */
12306 && flag_stack_check != STATIC_BUILTIN_STACK_CHECK);
12307
12308 /* Skip return address. */
12309 offset = UNITS_PER_WORD;
12310
12311 /* Skip pushed static chain. */
12312 if (ix86_static_chain_on_stack)
12313 offset += UNITS_PER_WORD;
12314
12315 /* Skip saved base pointer. */
12316 if (frame_pointer_needed)
12317 offset += UNITS_PER_WORD;
12318 frame->hfp_save_offset = offset;
12319
12320 /* The traditional frame pointer location is at the top of the frame. */
12321 frame->hard_frame_pointer_offset = offset;
12322
12323 /* Register save area */
12324 offset += frame->nregs * UNITS_PER_WORD;
12325 frame->reg_save_offset = offset;
12326
12327 /* On SEH target, registers are pushed just before the frame pointer
12328 location. */
12329 if (TARGET_SEH)
12330 frame->hard_frame_pointer_offset = offset;
12331
12332 /* Align and set SSE register save area. */
12333 if (frame->nsseregs)
12334 {
12335 /* The only ABI that has saved SSE registers (Win64) also has a
12336 16-byte aligned default stack, and thus we don't need to be
12337 within the re-aligned local stack frame to save them.  If the
12338 incoming stack boundary is aligned to less than 16 bytes, an
12339 unaligned move of the SSE register will be emitted, so there is
12340 no point in rounding up the SSE register save area outside the
12341 re-aligned local stack frame to 16 bytes.  */
12342 if (ix86_incoming_stack_boundary >= 128)
12343 offset = ROUND_UP (offset, 16);
12344 offset += frame->nsseregs * 16;
12345 }
12346 frame->sse_reg_save_offset = offset;
12347
12348 /* The re-aligned stack starts here. Values before this point are not
12349 directly comparable with values below this point. In order to make
12350 sure that no value happens to be the same before and after, force
12351 the alignment computation below to add a non-zero value. */
12352 if (stack_realign_fp)
12353 offset = ROUND_UP (offset, stack_alignment_needed);
12354
12355 /* Va-arg area */
12356 frame->va_arg_size = ix86_varargs_gpr_size + ix86_varargs_fpr_size;
12357 offset += frame->va_arg_size;
12358
12359 /* Align start of frame for local function. */
12360 if (stack_realign_fp
12361 || offset != frame->sse_reg_save_offset
12362 || size != 0
12363 || !crtl->is_leaf
12364 || cfun->calls_alloca
12365 || ix86_current_function_calls_tls_descriptor)
12366 offset = ROUND_UP (offset, stack_alignment_needed);
12367
12368 /* Frame pointer points here. */
12369 frame->frame_pointer_offset = offset;
12370
12371 offset += size;
12372
12373 /* Add outgoing arguments area.  Can be skipped if we eliminated
12374 all the function calls as dead code.
12375 Skipping is however impossible when the function calls alloca.  The
12376 alloca expander assumes that the last crtl->outgoing_args_size bytes
12377 of the stack frame are unused.  */
12378 if (ACCUMULATE_OUTGOING_ARGS
12379 && (!crtl->is_leaf || cfun->calls_alloca
12380 || ix86_current_function_calls_tls_descriptor))
12381 {
12382 offset += crtl->outgoing_args_size;
12383 frame->outgoing_arguments_size = crtl->outgoing_args_size;
12384 }
12385 else
12386 frame->outgoing_arguments_size = 0;
12387
12388 /* Align stack boundary. Only needed if we're calling another function
12389 or using alloca. */
12390 if (!crtl->is_leaf || cfun->calls_alloca
12391 || ix86_current_function_calls_tls_descriptor)
12392 offset = ROUND_UP (offset, preferred_alignment);
12393
12394 /* We've reached end of stack frame. */
12395 frame->stack_pointer_offset = offset;
12396
12397 /* Size prologue needs to allocate. */
12398 to_allocate = offset - frame->sse_reg_save_offset;
12399
12400 if ((!to_allocate && frame->nregs <= 1)
12401 || (TARGET_64BIT && to_allocate >= HOST_WIDE_INT_C (0x80000000)))
12402 frame->save_regs_using_mov = false;
12403
12404 if (ix86_using_red_zone ()
12405 && crtl->sp_is_unchanging
12406 && crtl->is_leaf
12407 && !ix86_pc_thunk_call_expanded
12408 && !ix86_current_function_calls_tls_descriptor)
12409 {
12410 frame->red_zone_size = to_allocate;
12411 if (frame->save_regs_using_mov)
12412 frame->red_zone_size += frame->nregs * UNITS_PER_WORD;
12413 if (frame->red_zone_size > RED_ZONE_SIZE - RED_ZONE_RESERVE)
12414 frame->red_zone_size = RED_ZONE_SIZE - RED_ZONE_RESERVE;
12415 }
12416 else
12417 frame->red_zone_size = 0;
12418 frame->stack_pointer_offset -= frame->red_zone_size;
12419
12420 /* The SEH frame pointer location is near the bottom of the frame.
12421 This is enforced by the fact that the difference between the
12422 stack pointer and the frame pointer is limited to 240 bytes in
12423 the unwind data structure. */
12424 if (TARGET_SEH)
12425 {
12426 HOST_WIDE_INT diff;
12427
12428 /* If we can leave the frame pointer where it is, do so.  This also
12429 returns the establisher frame for __builtin_frame_address (0).  */
12430 diff = frame->stack_pointer_offset - frame->hard_frame_pointer_offset;
12431 if (diff <= SEH_MAX_FRAME_SIZE
12432 && (diff > 240 || (diff & 15) != 0)
12433 && !crtl->accesses_prior_frames)
12434 {
12435 /* Ideally we'd determine what portion of the local stack frame
12436 (within the constraint of the lowest 240) is most heavily used.
12437 But without that complication, simply bias the frame pointer
12438 by 128 bytes so as to maximize the amount of the local stack
12439 frame that is addressable with 8-bit offsets. */
12440 frame->hard_frame_pointer_offset = frame->stack_pointer_offset - 128;
12441 }
12442 }
12443 }
12444
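/* A hedged sketch of the layout computed above, from the CFA downwards
   (which areas actually exist depends on the target and the function):
   return address, optional pushed static chain, optional saved frame
   pointer (hard_frame_pointer_offset), GP register save area
   (reg_save_offset), optional SSE register save area (sse_reg_save_offset,
   64-bit MS ABI only), va_arg register save area, local variables
   (frame_pointer_offset), outgoing arguments, and finally the end of the
   frame (stack_pointer_offset, reduced by any red zone).  The prologue
   then needs to allocate roughly stack_pointer_offset - sse_reg_save_offset
   bytes beyond the register pushes, less whatever fits in the red zone.  */
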
12445 /* This is semi-inlined memory_address_length, but simplified
12446 since we know that we're always dealing with reg+offset, and
12447 to avoid having to create and discard all that rtl. */
12448
12449 static inline int
12450 choose_baseaddr_len (unsigned int regno, HOST_WIDE_INT offset)
12451 {
12452 int len = 4;
12453
12454 if (offset == 0)
12455 {
12456 /* EBP and R13 cannot be encoded without an offset. */
12457 len = (regno == BP_REG || regno == R13_REG);
12458 }
12459 else if (IN_RANGE (offset, -128, 127))
12460 len = 1;
12461
12462 /* ESP and R12 must be encoded with a SIB byte. */
12463 if (regno == SP_REG || regno == R12_REG)
12464 len++;
12465
12466 return len;
12467 }
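
/* A few illustrative data points for the lengths computed above, assuming
   the usual ModRM/SIB encoding rules: 0(%rax) needs no displacement, so
   the length is 0; 0(%rbp) and 0(%r13) need a disp8, so 1; 8(%rsp) needs
   a disp8 plus a SIB byte, so 2; 512(%rbx) needs a disp32, so 4.  */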
12468
12469 /* Return an RTX that points to CFA_OFFSET within the stack frame.
12470 The valid base registers are taken from CFUN->MACHINE->FS. */
12471
12472 static rtx
12473 choose_baseaddr (HOST_WIDE_INT cfa_offset)
12474 {
12475 const struct machine_function *m = cfun->machine;
12476 rtx base_reg = NULL;
12477 HOST_WIDE_INT base_offset = 0;
12478
12479 if (m->use_fast_prologue_epilogue)
12480 {
12481 /* Choose the base register most likely to allow the most scheduling
12482 opportunities. Generally FP is valid throughout the function,
12483 while DRAP must be reloaded within the epilogue. But choose either
12484 over the SP due to increased encoding size. */
12485
12486 if (m->fs.fp_valid)
12487 {
12488 base_reg = hard_frame_pointer_rtx;
12489 base_offset = m->fs.fp_offset - cfa_offset;
12490 }
12491 else if (m->fs.drap_valid)
12492 {
12493 base_reg = crtl->drap_reg;
12494 base_offset = 0 - cfa_offset;
12495 }
12496 else if (m->fs.sp_valid)
12497 {
12498 base_reg = stack_pointer_rtx;
12499 base_offset = m->fs.sp_offset - cfa_offset;
12500 }
12501 }
12502 else
12503 {
12504 HOST_WIDE_INT toffset;
12505 int len = 16, tlen;
12506
12507 /* Choose the base register with the smallest address encoding.
12508 With a tie, choose FP > DRAP > SP. */
12509 if (m->fs.sp_valid)
12510 {
12511 base_reg = stack_pointer_rtx;
12512 base_offset = m->fs.sp_offset - cfa_offset;
12513 len = choose_baseaddr_len (STACK_POINTER_REGNUM, base_offset);
12514 }
12515 if (m->fs.drap_valid)
12516 {
12517 toffset = 0 - cfa_offset;
12518 tlen = choose_baseaddr_len (REGNO (crtl->drap_reg), toffset);
12519 if (tlen <= len)
12520 {
12521 base_reg = crtl->drap_reg;
12522 base_offset = toffset;
12523 len = tlen;
12524 }
12525 }
12526 if (m->fs.fp_valid)
12527 {
12528 toffset = m->fs.fp_offset - cfa_offset;
12529 tlen = choose_baseaddr_len (HARD_FRAME_POINTER_REGNUM, toffset);
12530 if (tlen <= len)
12531 {
12532 base_reg = hard_frame_pointer_rtx;
12533 base_offset = toffset;
12534 len = tlen;
12535 }
12536 }
12537 }
12538 gcc_assert (base_reg != NULL);
12539
12540 return plus_constant (Pmode, base_reg, base_offset);
12541 }
12542
12543 /* Emit code to save registers in the prologue. */
12544
12545 static void
12546 ix86_emit_save_regs (void)
12547 {
12548 unsigned int regno;
12549 rtx_insn *insn;
12550
12551 for (regno = FIRST_PSEUDO_REGISTER - 1; regno-- > 0; )
12552 if (GENERAL_REGNO_P (regno) && ix86_save_reg (regno, true))
12553 {
12554 insn = emit_insn (gen_push (gen_rtx_REG (word_mode, regno)));
12555 RTX_FRAME_RELATED_P (insn) = 1;
12556 }
12557 }
12558
12559 /* Emit a single register save at CFA - CFA_OFFSET. */
12560
12561 static void
12562 ix86_emit_save_reg_using_mov (machine_mode mode, unsigned int regno,
12563 HOST_WIDE_INT cfa_offset)
12564 {
12565 struct machine_function *m = cfun->machine;
12566 rtx reg = gen_rtx_REG (mode, regno);
12567 rtx mem, addr, base, insn;
12568 unsigned int align;
12569
12570 addr = choose_baseaddr (cfa_offset);
12571 mem = gen_frame_mem (mode, addr);
12572
12573 /* The location is aligned up to INCOMING_STACK_BOUNDARY. */
12574 align = MIN (GET_MODE_ALIGNMENT (mode), INCOMING_STACK_BOUNDARY);
12575 set_mem_align (mem, align);
12576
12577 insn = emit_insn (gen_rtx_SET (mem, reg));
12578 RTX_FRAME_RELATED_P (insn) = 1;
12579
12580 base = addr;
12581 if (GET_CODE (base) == PLUS)
12582 base = XEXP (base, 0);
12583 gcc_checking_assert (REG_P (base));
12584
12585 /* When saving registers into a re-aligned local stack frame, avoid
12586 any tricky guessing by dwarf2out. */
12587 if (m->fs.realigned)
12588 {
12589 gcc_checking_assert (stack_realign_drap);
12590
12591 if (regno == REGNO (crtl->drap_reg))
12592 {
12593 /* A bit of a hack. We force the DRAP register to be saved in
12594 the re-aligned stack frame, which provides us with a copy
12595 of the CFA that will last past the prologue. Install it. */
12596 gcc_checking_assert (cfun->machine->fs.fp_valid);
12597 addr = plus_constant (Pmode, hard_frame_pointer_rtx,
12598 cfun->machine->fs.fp_offset - cfa_offset);
12599 mem = gen_rtx_MEM (mode, addr);
12600 add_reg_note (insn, REG_CFA_DEF_CFA, mem);
12601 }
12602 else
12603 {
12604 /* The frame pointer is a stable reference within the
12605 aligned frame. Use it. */
12606 gcc_checking_assert (cfun->machine->fs.fp_valid);
12607 addr = plus_constant (Pmode, hard_frame_pointer_rtx,
12608 cfun->machine->fs.fp_offset - cfa_offset);
12609 mem = gen_rtx_MEM (mode, addr);
12610 add_reg_note (insn, REG_CFA_EXPRESSION, gen_rtx_SET (mem, reg));
12611 }
12612 }
12613
12614 /* The memory may not be relative to the current CFA register,
12615 which means that we may need to generate a new pattern for
12616 use by the unwind info. */
12617 else if (base != m->fs.cfa_reg)
12618 {
12619 addr = plus_constant (Pmode, m->fs.cfa_reg,
12620 m->fs.cfa_offset - cfa_offset);
12621 mem = gen_rtx_MEM (mode, addr);
12622 add_reg_note (insn, REG_CFA_OFFSET, gen_rtx_SET (mem, reg));
12623 }
12624 }
12625
12626 /* Emit code to save registers using MOV insns.
12627 First register is stored at CFA - CFA_OFFSET. */
12628 static void
12629 ix86_emit_save_regs_using_mov (HOST_WIDE_INT cfa_offset)
12630 {
12631 unsigned int regno;
12632
12633 for (regno = 0; regno < FIRST_PSEUDO_REGISTER; regno++)
12634 if (GENERAL_REGNO_P (regno) && ix86_save_reg (regno, true))
12635 {
12636 ix86_emit_save_reg_using_mov (word_mode, regno, cfa_offset);
12637 cfa_offset -= UNITS_PER_WORD;
12638 }
12639 }
12640
12641 /* Emit code to save SSE registers using MOV insns.
12642 First register is stored at CFA - CFA_OFFSET. */
12643 static void
12644 ix86_emit_save_sse_regs_using_mov (HOST_WIDE_INT cfa_offset)
12645 {
12646 unsigned int regno;
12647
12648 for (regno = 0; regno < FIRST_PSEUDO_REGISTER; regno++)
12649 if (SSE_REGNO_P (regno) && ix86_save_reg (regno, true))
12650 {
12651 ix86_emit_save_reg_using_mov (V4SFmode, regno, cfa_offset);
12652 cfa_offset -= GET_MODE_SIZE (V4SFmode);
12653 }
12654 }
12655
12656 static GTY(()) rtx queued_cfa_restores;
12657
12658 /* Add a REG_CFA_RESTORE REG note to INSN, or queue it until the next stack
12659 manipulation insn.  The value is on the stack at CFA - CFA_OFFSET.
12660 Don't add the note if the previously saved value will be left untouched
12661 within the stack red zone till return, as unwinders can find the same value
12662 in the register and on the stack.  */
12663
12664 static void
12665 ix86_add_cfa_restore_note (rtx_insn *insn, rtx reg, HOST_WIDE_INT cfa_offset)
12666 {
12667 if (!crtl->shrink_wrapped
12668 && cfa_offset <= cfun->machine->fs.red_zone_offset)
12669 return;
12670
12671 if (insn)
12672 {
12673 add_reg_note (insn, REG_CFA_RESTORE, reg);
12674 RTX_FRAME_RELATED_P (insn) = 1;
12675 }
12676 else
12677 queued_cfa_restores
12678 = alloc_reg_note (REG_CFA_RESTORE, reg, queued_cfa_restores);
12679 }
12680
12681 /* Add queued REG_CFA_RESTORE notes if any to INSN. */
12682
12683 static void
12684 ix86_add_queued_cfa_restore_notes (rtx insn)
12685 {
12686 rtx last;
12687 if (!queued_cfa_restores)
12688 return;
12689 for (last = queued_cfa_restores; XEXP (last, 1); last = XEXP (last, 1))
12690 ;
12691 XEXP (last, 1) = REG_NOTES (insn);
12692 REG_NOTES (insn) = queued_cfa_restores;
12693 queued_cfa_restores = NULL_RTX;
12694 RTX_FRAME_RELATED_P (insn) = 1;
12695 }
12696
12697 /* Expand prologue or epilogue stack adjustment.
12698 The pattern exists to put a dependency on all ebp-based memory accesses.
12699 STYLE should be negative if instructions should be marked as frame related,
12700 zero if %r11 register is live and cannot be freely used and positive
12701 otherwise. */
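
/* As an illustration (not a literal quote of any caller), the prologue
   typically shrinks the stack with something along the lines of

     pro_epilogue_adjust_stack (stack_pointer_rtx, stack_pointer_rtx,
				GEN_INT (-allocate), -1,
				m->fs.cfa_reg == stack_pointer_rtx);

   i.e. a negative STYLE marks the adjustment as frame related, and SET_CFA
   is true while the stack pointer is still the CFA register.  */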
12702
12703 static void
12704 pro_epilogue_adjust_stack (rtx dest, rtx src, rtx offset,
12705 int style, bool set_cfa)
12706 {
12707 struct machine_function *m = cfun->machine;
12708 rtx insn;
12709 bool add_frame_related_expr = false;
12710
12711 if (Pmode == SImode)
12712 insn = gen_pro_epilogue_adjust_stack_si_add (dest, src, offset);
12713 else if (x86_64_immediate_operand (offset, DImode))
12714 insn = gen_pro_epilogue_adjust_stack_di_add (dest, src, offset);
12715 else
12716 {
12717 rtx tmp;
12718 /* r11 is used by indirect sibcall return as well; it is set before the
12719 epilogue and used after the epilogue.  */
12720 if (style)
12721 tmp = gen_rtx_REG (DImode, R11_REG);
12722 else
12723 {
12724 gcc_assert (src != hard_frame_pointer_rtx
12725 && dest != hard_frame_pointer_rtx);
12726 tmp = hard_frame_pointer_rtx;
12727 }
12728 insn = emit_insn (gen_rtx_SET (tmp, offset));
12729 if (style < 0)
12730 add_frame_related_expr = true;
12731
12732 insn = gen_pro_epilogue_adjust_stack_di_add (dest, src, tmp);
12733 }
12734
12735 insn = emit_insn (insn);
12736 if (style >= 0)
12737 ix86_add_queued_cfa_restore_notes (insn);
12738
12739 if (set_cfa)
12740 {
12741 rtx r;
12742
12743 gcc_assert (m->fs.cfa_reg == src);
12744 m->fs.cfa_offset += INTVAL (offset);
12745 m->fs.cfa_reg = dest;
12746
12747 r = gen_rtx_PLUS (Pmode, src, offset);
12748 r = gen_rtx_SET (dest, r);
12749 add_reg_note (insn, REG_CFA_ADJUST_CFA, r);
12750 RTX_FRAME_RELATED_P (insn) = 1;
12751 }
12752 else if (style < 0)
12753 {
12754 RTX_FRAME_RELATED_P (insn) = 1;
12755 if (add_frame_related_expr)
12756 {
12757 rtx r = gen_rtx_PLUS (Pmode, src, offset);
12758 r = gen_rtx_SET (dest, r);
12759 add_reg_note (insn, REG_FRAME_RELATED_EXPR, r);
12760 }
12761 }
12762
12763 if (dest == stack_pointer_rtx)
12764 {
12765 HOST_WIDE_INT ooffset = m->fs.sp_offset;
12766 bool valid = m->fs.sp_valid;
12767
12768 if (src == hard_frame_pointer_rtx)
12769 {
12770 valid = m->fs.fp_valid;
12771 ooffset = m->fs.fp_offset;
12772 }
12773 else if (src == crtl->drap_reg)
12774 {
12775 valid = m->fs.drap_valid;
12776 ooffset = 0;
12777 }
12778 else
12779 {
12780 /* Else there are two possibilities: SP itself, which we set
12781 up as the default above, or EH_RETURN_STACKADJ_RTX, which is
12782 taken care of by hand along the eh_return path.  */
12783 gcc_checking_assert (src == stack_pointer_rtx
12784 || offset == const0_rtx);
12785 }
12786
12787 m->fs.sp_offset = ooffset - INTVAL (offset);
12788 m->fs.sp_valid = valid;
12789 }
12790 }
12791
12792 /* Find an available register to be used as the dynamic realign argument
12793 pointer register.  Such a register will be written in the prologue and
12794 used at the beginning of the body, so it must not be
12795 1. a parameter passing register.
12796 2. the GOT pointer.
12797 We reuse the static-chain register if it is available.  Otherwise, we
12798 use DI for i386 and R13 for x86-64.  We chose R13 since it has a
12799 shorter encoding.
12800
12801 Return: the regno of the chosen register.  */
12802
12803 static unsigned int
12804 find_drap_reg (void)
12805 {
12806 tree decl = cfun->decl;
12807
12808 /* Always use callee-saved register if there are no caller-saved
12809 registers. */
12810 if (TARGET_64BIT)
12811 {
12812 /* Use R13 for a nested function or a function that needs a static
12813 chain.  Since a function with a tail call may use any caller-saved
12814 register in the epilogue, DRAP must not use a caller-saved
12815 register in such a case.  */
12816 if (DECL_STATIC_CHAIN (decl)
12817 || cfun->machine->no_caller_saved_registers
12818 || crtl->tail_call_emit)
12819 return R13_REG;
12820
12821 return R10_REG;
12822 }
12823 else
12824 {
12825 /* Use DI for a nested function or a function that needs a static
12826 chain.  Since a function with a tail call may use any caller-saved
12827 register in the epilogue, DRAP must not use a caller-saved
12828 register in such a case.  */
12829 if (DECL_STATIC_CHAIN (decl)
12830 || cfun->machine->no_caller_saved_registers
12831 || crtl->tail_call_emit)
12832 return DI_REG;
12833
12834 /* Reuse static chain register if it isn't used for parameter
12835 passing. */
12836 if (ix86_function_regparm (TREE_TYPE (decl), decl) <= 2)
12837 {
12838 unsigned int ccvt = ix86_get_callcvt (TREE_TYPE (decl));
12839 if ((ccvt & (IX86_CALLCVT_FASTCALL | IX86_CALLCVT_THISCALL)) == 0)
12840 return CX_REG;
12841 }
12842 return DI_REG;
12843 }
12844 }
12845
12846 /* Handle a "force_align_arg_pointer" attribute. */
12847
12848 static tree
12849 ix86_handle_force_align_arg_pointer_attribute (tree *node, tree name,
12850 tree, int, bool *no_add_attrs)
12851 {
12852 if (TREE_CODE (*node) != FUNCTION_TYPE
12853 && TREE_CODE (*node) != METHOD_TYPE
12854 && TREE_CODE (*node) != FIELD_DECL
12855 && TREE_CODE (*node) != TYPE_DECL)
12856 {
12857 warning (OPT_Wattributes, "%qE attribute only applies to functions",
12858 name);
12859 *no_add_attrs = true;
12860 }
12861
12862 return NULL_TREE;
12863 }
12864
12865 /* Return minimum incoming stack alignment. */
12866
12867 static unsigned int
12868 ix86_minimum_incoming_stack_boundary (bool sibcall)
12869 {
12870 unsigned int incoming_stack_boundary;
12871
12872 /* The stack of an interrupt handler is always aligned to
12873 MIN_STACK_BOUNDARY.  */
12874 if (cfun->machine->func_type != TYPE_NORMAL)
12875 incoming_stack_boundary = MIN_STACK_BOUNDARY;
12876 /* Prefer the one specified at command line. */
12877 else if (ix86_user_incoming_stack_boundary)
12878 incoming_stack_boundary = ix86_user_incoming_stack_boundary;
12879 /* In 32-bit mode, use MIN_STACK_BOUNDARY for the incoming stack boundary
12880 if -mstackrealign is used, we are not checking for a sibcall, and the
12881 estimated stack alignment is 128 bits.  */
12882 else if (!sibcall
12883 && ix86_force_align_arg_pointer
12884 && crtl->stack_alignment_estimated == 128)
12885 incoming_stack_boundary = MIN_STACK_BOUNDARY;
12886 else
12887 incoming_stack_boundary = ix86_default_incoming_stack_boundary;
12888
12889 /* Incoming stack alignment can be changed on individual functions
12890 via force_align_arg_pointer attribute. We use the smallest
12891 incoming stack boundary. */
12892 if (incoming_stack_boundary > MIN_STACK_BOUNDARY
12893 && lookup_attribute (ix86_force_align_arg_pointer_string,
12894 TYPE_ATTRIBUTES (TREE_TYPE (current_function_decl))))
12895 incoming_stack_boundary = MIN_STACK_BOUNDARY;
12896
12897 /* The incoming stack frame has to be aligned at least at
12898 parm_stack_boundary. */
12899 if (incoming_stack_boundary < crtl->parm_stack_boundary)
12900 incoming_stack_boundary = crtl->parm_stack_boundary;
12901
12902 /* Stack at entrance of main is aligned by runtime. We use the
12903 smallest incoming stack boundary. */
12904 if (incoming_stack_boundary > MAIN_STACK_BOUNDARY
12905 && DECL_NAME (current_function_decl)
12906 && MAIN_NAME_P (DECL_NAME (current_function_decl))
12907 && DECL_FILE_SCOPE_P (current_function_decl))
12908 incoming_stack_boundary = MAIN_STACK_BOUNDARY;
12909
12910 return incoming_stack_boundary;
12911 }
12912
12913 /* Update incoming stack boundary and estimated stack alignment. */
12914
12915 static void
12916 ix86_update_stack_boundary (void)
12917 {
12918 ix86_incoming_stack_boundary
12919 = ix86_minimum_incoming_stack_boundary (false);
12920
12921 /* x86_64 vararg needs 16byte stack alignment for register save
12922 area. */
12923 if (TARGET_64BIT
12924 && cfun->stdarg
12925 && crtl->stack_alignment_estimated < 128)
12926 crtl->stack_alignment_estimated = 128;
12927
12928 /* __tls_get_addr needs to be called with 16-byte aligned stack. */
12929 if (ix86_tls_descriptor_calls_expanded_in_cfun
12930 && crtl->preferred_stack_boundary < 128)
12931 crtl->preferred_stack_boundary = 128;
12932 }
12933
12934 /* Handle the TARGET_GET_DRAP_RTX hook. Return NULL if no DRAP is
12935 needed or an rtx for DRAP otherwise. */
12936
12937 static rtx
12938 ix86_get_drap_rtx (void)
12939 {
12940 if (ix86_force_drap || !ACCUMULATE_OUTGOING_ARGS)
12941 crtl->need_drap = true;
12942
12943 if (stack_realign_drap)
12944 {
12945 /* Assign DRAP to vDRAP and return vDRAP.  */
12946 unsigned int regno = find_drap_reg ();
12947 rtx drap_vreg;
12948 rtx arg_ptr;
12949 rtx_insn *seq, *insn;
12950
12951 arg_ptr = gen_rtx_REG (Pmode, regno);
12952 crtl->drap_reg = arg_ptr;
12953
12954 start_sequence ();
12955 drap_vreg = copy_to_reg (arg_ptr);
12956 seq = get_insns ();
12957 end_sequence ();
12958
12959 insn = emit_insn_before (seq, NEXT_INSN (entry_of_function ()));
12960 if (!optimize)
12961 {
12962 add_reg_note (insn, REG_CFA_SET_VDRAP, drap_vreg);
12963 RTX_FRAME_RELATED_P (insn) = 1;
12964 }
12965 return drap_vreg;
12966 }
12967 else
12968 return NULL;
12969 }
12970
12971 /* Handle the TARGET_INTERNAL_ARG_POINTER hook. */
12972
12973 static rtx
12974 ix86_internal_arg_pointer (void)
12975 {
12976 return virtual_incoming_args_rtx;
12977 }
12978
12979 struct scratch_reg {
12980 rtx reg;
12981 bool saved;
12982 };
12983
12984 /* Return a short-lived scratch register for use on function entry.
12985 In 32-bit mode, it is valid only after the registers are saved
12986 in the prologue. This register must be released by means of
12987 release_scratch_register_on_entry once it is dead. */
12988
12989 static void
12990 get_scratch_register_on_entry (struct scratch_reg *sr)
12991 {
12992 int regno;
12993
12994 sr->saved = false;
12995
12996 if (TARGET_64BIT)
12997 {
12998 /* We always use R11 in 64-bit mode. */
12999 regno = R11_REG;
13000 }
13001 else
13002 {
13003 tree decl = current_function_decl, fntype = TREE_TYPE (decl);
13004 bool fastcall_p
13005 = lookup_attribute ("fastcall", TYPE_ATTRIBUTES (fntype)) != NULL_TREE;
13006 bool thiscall_p
13007 = lookup_attribute ("thiscall", TYPE_ATTRIBUTES (fntype)) != NULL_TREE;
13008 bool static_chain_p = DECL_STATIC_CHAIN (decl);
13009 int regparm = ix86_function_regparm (fntype, decl);
13010 int drap_regno
13011 = crtl->drap_reg ? REGNO (crtl->drap_reg) : INVALID_REGNUM;
13012
13013 /* 'fastcall' sets regparm to 2, uses ecx/edx for arguments and eax
13014 for the static chain register. */
13015 if ((regparm < 1 || (fastcall_p && !static_chain_p))
13016 && drap_regno != AX_REG)
13017 regno = AX_REG;
13018 /* 'thiscall' sets regparm to 1, uses ecx for arguments and edx
13019 for the static chain register. */
13020 else if (thiscall_p && !static_chain_p && drap_regno != AX_REG)
13021 regno = AX_REG;
13022 else if (regparm < 2 && !thiscall_p && drap_regno != DX_REG)
13023 regno = DX_REG;
13024 /* ecx is the static chain register. */
13025 else if (regparm < 3 && !fastcall_p && !thiscall_p
13026 && !static_chain_p
13027 && drap_regno != CX_REG)
13028 regno = CX_REG;
13029 else if (ix86_save_reg (BX_REG, true))
13030 regno = BX_REG;
13031 /* esi is the static chain register. */
13032 else if (!(regparm == 3 && static_chain_p)
13033 && ix86_save_reg (SI_REG, true))
13034 regno = SI_REG;
13035 else if (ix86_save_reg (DI_REG, true))
13036 regno = DI_REG;
13037 else
13038 {
13039 regno = (drap_regno == AX_REG ? DX_REG : AX_REG);
13040 sr->saved = true;
13041 }
13042 }
13043
13044 sr->reg = gen_rtx_REG (Pmode, regno);
13045 if (sr->saved)
13046 {
13047 rtx_insn *insn = emit_insn (gen_push (sr->reg));
13048 RTX_FRAME_RELATED_P (insn) = 1;
13049 }
13050 }
13051
13052 /* Release a scratch register obtained from the preceding function. */
13053
13054 static void
13055 release_scratch_register_on_entry (struct scratch_reg *sr)
13056 {
13057 if (sr->saved)
13058 {
13059 struct machine_function *m = cfun->machine;
13060 rtx x, insn = emit_insn (gen_pop (sr->reg));
13061
13062 /* The RTX_FRAME_RELATED_P mechanism doesn't know about pop. */
13063 RTX_FRAME_RELATED_P (insn) = 1;
13064 x = gen_rtx_PLUS (Pmode, stack_pointer_rtx, GEN_INT (UNITS_PER_WORD));
13065 x = gen_rtx_SET (stack_pointer_rtx, x);
13066 add_reg_note (insn, REG_FRAME_RELATED_EXPR, x);
13067 m->fs.sp_offset -= UNITS_PER_WORD;
13068 }
13069 }
13070
13071 #define PROBE_INTERVAL (1 << STACK_CHECK_PROBE_INTERVAL_EXP)
13072
13073 /* Emit code to adjust the stack pointer by SIZE bytes while probing it. */
13074
13075 static void
13076 ix86_adjust_stack_and_probe (const HOST_WIDE_INT size)
13077 {
13078 /* We skip the probe for the first interval + a small dope of 4 words and
13079 probe that many bytes past the specified size to maintain a protection
13080 area at the bottom of the stack.  */
13081 const int dope = 4 * UNITS_PER_WORD;
13082 rtx size_rtx = GEN_INT (size), last;
13083
13084 /* See if we have a constant small number of probes to generate. If so,
13085 that's the easy case. The run-time loop is made up of 9 insns in the
13086 generic case while the compile-time loop is made up of 3+2*(n-1) insns
13087 for n # of intervals. */
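 /* For example (illustrative numbers, assuming PROBE_INTERVAL is 4096
    and the 64-bit dope of 4 words is 32 bytes): for a SIZE of two
    intervals we first drop SP by 2*4096 + 32 and probe, then drop it by
    the remaining 4096 and probe, and finally add back 4096 + 32, for a
    net adjustment of -SIZE.  */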
13088 if (size <= 4 * PROBE_INTERVAL)
13089 {
13090 HOST_WIDE_INT i, adjust;
13091 bool first_probe = true;
13092
13093 /* Adjust SP and probe at PROBE_INTERVAL + N * PROBE_INTERVAL for
13094 values of N from 1 until it exceeds SIZE. If only one probe is
13095 needed, this will not generate any code. Then adjust and probe
13096 to PROBE_INTERVAL + SIZE. */
13097 for (i = PROBE_INTERVAL; i < size; i += PROBE_INTERVAL)
13098 {
13099 if (first_probe)
13100 {
13101 adjust = 2 * PROBE_INTERVAL + dope;
13102 first_probe = false;
13103 }
13104 else
13105 adjust = PROBE_INTERVAL;
13106
13107 emit_insn (gen_rtx_SET (stack_pointer_rtx,
13108 plus_constant (Pmode, stack_pointer_rtx,
13109 -adjust)));
13110 emit_stack_probe (stack_pointer_rtx);
13111 }
13112
13113 if (first_probe)
13114 adjust = size + PROBE_INTERVAL + dope;
13115 else
13116 adjust = size + PROBE_INTERVAL - i;
13117
13118 emit_insn (gen_rtx_SET (stack_pointer_rtx,
13119 plus_constant (Pmode, stack_pointer_rtx,
13120 -adjust)));
13121 emit_stack_probe (stack_pointer_rtx);
13122
13123 /* Adjust back to account for the additional first interval. */
13124 last = emit_insn (gen_rtx_SET (stack_pointer_rtx,
13125 plus_constant (Pmode, stack_pointer_rtx,
13126 PROBE_INTERVAL + dope)));
13127 }
13128
13129 /* Otherwise, do the same as above, but in a loop. Note that we must be
13130 extra careful with variables wrapping around because we might be at
13131 the very top (or the very bottom) of the address space and we have
13132 to be able to handle this case properly; in particular, we use an
13133 equality test for the loop condition. */
13134 else
13135 {
13136 HOST_WIDE_INT rounded_size;
13137 struct scratch_reg sr;
13138
13139 get_scratch_register_on_entry (&sr);
13140
13141
13142 /* Step 1: round SIZE to the previous multiple of the interval. */
13143
13144 rounded_size = ROUND_DOWN (size, PROBE_INTERVAL);
13145
13146
13147 /* Step 2: compute initial and final value of the loop counter. */
13148
13149 /* SP = SP_0 + PROBE_INTERVAL. */
13150 emit_insn (gen_rtx_SET (stack_pointer_rtx,
13151 plus_constant (Pmode, stack_pointer_rtx,
13152 - (PROBE_INTERVAL + dope))));
13153
13154 /* LAST_ADDR = SP_0 + PROBE_INTERVAL + ROUNDED_SIZE. */
13155 if (rounded_size <= (HOST_WIDE_INT_1 << 31))
13156 emit_insn (gen_rtx_SET (sr.reg,
13157 plus_constant (Pmode, stack_pointer_rtx,
13158 -rounded_size)));
13159 else
13160 {
13161 emit_move_insn (sr.reg, GEN_INT (-rounded_size));
13162 emit_insn (gen_rtx_SET (sr.reg,
13163 gen_rtx_PLUS (Pmode, sr.reg,
13164 stack_pointer_rtx)));
13165 }
13166
13167
13168 /* Step 3: the loop
13169
13170 do
13171 {
13172 SP = SP + PROBE_INTERVAL
13173 probe at SP
13174 }
13175 while (SP != LAST_ADDR)
13176
13177 adjusts SP and probes to PROBE_INTERVAL + N * PROBE_INTERVAL for
13178 values of N from 1 until it is equal to ROUNDED_SIZE. */
13179
13180 emit_insn (ix86_gen_adjust_stack_and_probe (sr.reg, sr.reg, size_rtx));
13181
13182
13183 /* Step 4: adjust SP and probe at PROBE_INTERVAL + SIZE if we cannot
13184 assert at compile-time that SIZE is equal to ROUNDED_SIZE. */
13185
13186 if (size != rounded_size)
13187 {
13188 emit_insn (gen_rtx_SET (stack_pointer_rtx,
13189 plus_constant (Pmode, stack_pointer_rtx,
13190 rounded_size - size)));
13191 emit_stack_probe (stack_pointer_rtx);
13192 }
13193
13194 /* Adjust back to account for the additional first interval. */
13195 last = emit_insn (gen_rtx_SET (stack_pointer_rtx,
13196 plus_constant (Pmode, stack_pointer_rtx,
13197 PROBE_INTERVAL + dope)));
13198
13199 release_scratch_register_on_entry (&sr);
13200 }
13201
13202 /* Even if the stack pointer isn't the CFA register, we need to correctly
13203 describe the adjustments made to it, in particular differentiate the
13204 frame-related ones from the frame-unrelated ones. */
13205 if (size > 0)
13206 {
13207 rtx expr = gen_rtx_SEQUENCE (VOIDmode, rtvec_alloc (2));
13208 XVECEXP (expr, 0, 0)
13209 = gen_rtx_SET (stack_pointer_rtx,
13210 plus_constant (Pmode, stack_pointer_rtx, -size));
13211 XVECEXP (expr, 0, 1)
13212 = gen_rtx_SET (stack_pointer_rtx,
13213 plus_constant (Pmode, stack_pointer_rtx,
13214 PROBE_INTERVAL + dope + size));
13215 add_reg_note (last, REG_FRAME_RELATED_EXPR, expr);
13216 RTX_FRAME_RELATED_P (last) = 1;
13217
13218 cfun->machine->fs.sp_offset += size;
13219 }
13220
13221 /* Make sure nothing is scheduled before we are done. */
13222 emit_insn (gen_blockage ());
13223 }
13224
13225 /* Adjust the stack pointer up to REG while probing it. */
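
/* On a 64-bit ELF target the emitted loop looks roughly like this
   (illustrative AT&T output, assuming PROBE_INTERVAL is 4096 and that REG,
   which holds the final stack pointer value, is %r11):

	.LPSRL0:
		subq	$4096, %rsp
		orq	$0, (%rsp)
		cmpq	%r11, %rsp
		jne	.LPSRL0
*/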
13226
13227 const char *
13228 output_adjust_stack_and_probe (rtx reg)
13229 {
13230 static int labelno = 0;
13231 char loop_lab[32];
13232 rtx xops[2];
13233
13234 ASM_GENERATE_INTERNAL_LABEL (loop_lab, "LPSRL", labelno++);
13235
13236 /* Loop. */
13237 ASM_OUTPUT_INTERNAL_LABEL (asm_out_file, loop_lab);
13238
13239 /* SP = SP + PROBE_INTERVAL. */
13240 xops[0] = stack_pointer_rtx;
13241 xops[1] = GEN_INT (PROBE_INTERVAL);
13242 output_asm_insn ("sub%z0\t{%1, %0|%0, %1}", xops);
13243
13244 /* Probe at SP. */
13245 xops[1] = const0_rtx;
13246 output_asm_insn ("or%z0\t{%1, (%0)|DWORD PTR [%0], %1}", xops);
13247
13248 /* Test if SP == LAST_ADDR. */
13249 xops[0] = stack_pointer_rtx;
13250 xops[1] = reg;
13251 output_asm_insn ("cmp%z0\t{%1, %0|%0, %1}", xops);
13252
13253 /* Branch. */
13254 fputs ("\tjne\t", asm_out_file);
13255 assemble_name_raw (asm_out_file, loop_lab);
13256 fputc ('\n', asm_out_file);
13257
13258 return "";
13259 }
13260
13261 /* Emit code to probe a range of stack addresses from FIRST to FIRST+SIZE,
13262 inclusive. These are offsets from the current stack pointer. */
13263
13264 static void
13265 ix86_emit_probe_stack_range (HOST_WIDE_INT first, HOST_WIDE_INT size)
13266 {
13267 /* See if we have a constant small number of probes to generate. If so,
13268 that's the easy case. The run-time loop is made up of 6 insns in the
13269 generic case while the compile-time loop is made up of n insns for n #
13270 of intervals. */
13271 if (size <= 6 * PROBE_INTERVAL)
13272 {
13273 HOST_WIDE_INT i;
13274
13275 /* Probe at FIRST + N * PROBE_INTERVAL for values of N from 1 until
13276 it exceeds SIZE. If only one probe is needed, this will not
13277 generate any code. Then probe at FIRST + SIZE. */
13278 for (i = PROBE_INTERVAL; i < size; i += PROBE_INTERVAL)
13279 emit_stack_probe (plus_constant (Pmode, stack_pointer_rtx,
13280 -(first + i)));
13281
13282 emit_stack_probe (plus_constant (Pmode, stack_pointer_rtx,
13283 -(first + size)));
13284 }
13285
13286 /* Otherwise, do the same as above, but in a loop. Note that we must be
13287 extra careful with variables wrapping around because we might be at
13288 the very top (or the very bottom) of the address space and we have
13289 to be able to handle this case properly; in particular, we use an
13290 equality test for the loop condition. */
13291 else
13292 {
13293 HOST_WIDE_INT rounded_size, last;
13294 struct scratch_reg sr;
13295
13296 get_scratch_register_on_entry (&sr);
13297
13298
13299 /* Step 1: round SIZE to the previous multiple of the interval. */
13300
13301 rounded_size = ROUND_DOWN (size, PROBE_INTERVAL);
13302
13303
13304 /* Step 2: compute initial and final value of the loop counter. */
13305
13306 /* TEST_OFFSET = FIRST. */
13307 emit_move_insn (sr.reg, GEN_INT (-first));
13308
13309 /* LAST_OFFSET = FIRST + ROUNDED_SIZE. */
13310 last = first + rounded_size;
13311
13312
13313 /* Step 3: the loop
13314
13315 do
13316 {
13317 TEST_ADDR = TEST_ADDR + PROBE_INTERVAL
13318 probe at TEST_ADDR
13319 }
13320 while (TEST_ADDR != LAST_ADDR)
13321
13322 probes at FIRST + N * PROBE_INTERVAL for values of N from 1
13323 until it is equal to ROUNDED_SIZE. */
13324
13325 emit_insn (ix86_gen_probe_stack_range (sr.reg, sr.reg, GEN_INT (-last)));
13326
13327
13328 /* Step 4: probe at FIRST + SIZE if we cannot assert at compile-time
13329 that SIZE is equal to ROUNDED_SIZE. */
13330
13331 if (size != rounded_size)
13332 emit_stack_probe (plus_constant (Pmode,
13333 gen_rtx_PLUS (Pmode,
13334 stack_pointer_rtx,
13335 sr.reg),
13336 rounded_size - size));
13337
13338 release_scratch_register_on_entry (&sr);
13339 }
13340
13341 /* Make sure nothing is scheduled before we are done. */
13342 emit_insn (gen_blockage ());
13343 }
13344
13345 /* Probe a range of stack addresses from REG to END, inclusive. These are
13346 offsets from the current stack pointer. */
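
/* A hedged sketch of the emitted loop on a 64-bit ELF target (AT&T
   syntax, PROBE_INTERVAL assumed to be 4096, REG assumed to be %r11;
   END is printed as whatever operand the caller supplied):

	.LPSRL0:
		subq	$4096, %r11
		orq	$0, (%rsp,%r11)
		cmpq	END, %r11
		jne	.LPSRL0
*/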
13347
13348 const char *
13349 output_probe_stack_range (rtx reg, rtx end)
13350 {
13351 static int labelno = 0;
13352 char loop_lab[32];
13353 rtx xops[3];
13354
13355 ASM_GENERATE_INTERNAL_LABEL (loop_lab, "LPSRL", labelno++);
13356
13357 /* Loop. */
13358 ASM_OUTPUT_INTERNAL_LABEL (asm_out_file, loop_lab);
13359
13360 /* TEST_ADDR = TEST_ADDR + PROBE_INTERVAL. */
13361 xops[0] = reg;
13362 xops[1] = GEN_INT (PROBE_INTERVAL);
13363 output_asm_insn ("sub%z0\t{%1, %0|%0, %1}", xops);
13364
13365 /* Probe at TEST_ADDR. */
13366 xops[0] = stack_pointer_rtx;
13367 xops[1] = reg;
13368 xops[2] = const0_rtx;
13369 output_asm_insn ("or%z0\t{%2, (%0,%1)|DWORD PTR [%0+%1], %2}", xops);
13370
13371 /* Test if TEST_ADDR == LAST_ADDR. */
13372 xops[0] = reg;
13373 xops[1] = end;
13374 output_asm_insn ("cmp%z0\t{%1, %0|%0, %1}", xops);
13375
13376 /* Branch. */
13377 fputs ("\tjne\t", asm_out_file);
13378 assemble_name_raw (asm_out_file, loop_lab);
13379 fputc ('\n', asm_out_file);
13380
13381 return "";
13382 }
13383
13384 /* Finalize stack_realign_needed flag, which will guide prologue/epilogue
13385 to be generated in correct form. */
13386 static void
13387 ix86_finalize_stack_realign_flags (void)
13388 {
13389 /* Check if stack realignment is really needed after reload, and
13390 store the result in cfun.  */
13391 unsigned int incoming_stack_boundary
13392 = (crtl->parm_stack_boundary > ix86_incoming_stack_boundary
13393 ? crtl->parm_stack_boundary : ix86_incoming_stack_boundary);
13394 unsigned int stack_realign
13395 = (incoming_stack_boundary
13396 < (crtl->is_leaf && !ix86_current_function_calls_tls_descriptor
13397 ? crtl->max_used_stack_slot_alignment
13398 : crtl->stack_alignment_needed));
13399
13400 if (crtl->stack_realign_finalized)
13401 {
13402 /* After stack_realign_needed is finalized, we can no longer
13403 change it. */
13404 gcc_assert (crtl->stack_realign_needed == stack_realign);
13405 return;
13406 }
13407
13408 /* If the only reason for frame_pointer_needed is that we conservatively
13409 assumed stack realignment might be needed, but in the end nothing that
13410 needed the stack alignment had been spilled, clear frame_pointer_needed
13411 and say we don't need stack realignment. */
13412 if (stack_realign
13413 && frame_pointer_needed
13414 && crtl->is_leaf
13415 && flag_omit_frame_pointer
13416 && crtl->sp_is_unchanging
13417 && !ix86_current_function_calls_tls_descriptor
13418 && !crtl->accesses_prior_frames
13419 && !cfun->calls_alloca
13420 && !crtl->calls_eh_return
13421 /* See ira_setup_eliminable_regset for the rationale. */
13422 && !(STACK_CHECK_MOVING_SP
13423 && flag_stack_check
13424 && flag_exceptions
13425 && cfun->can_throw_non_call_exceptions)
13426 && !ix86_frame_pointer_required ()
13427 && get_frame_size () == 0
13428 && ix86_nsaved_sseregs () == 0
13429 && ix86_varargs_gpr_size + ix86_varargs_fpr_size == 0)
13430 {
13431 HARD_REG_SET set_up_by_prologue, prologue_used;
13432 basic_block bb;
13433
13434 CLEAR_HARD_REG_SET (prologue_used);
13435 CLEAR_HARD_REG_SET (set_up_by_prologue);
13436 add_to_hard_reg_set (&set_up_by_prologue, Pmode, STACK_POINTER_REGNUM);
13437 add_to_hard_reg_set (&set_up_by_prologue, Pmode, ARG_POINTER_REGNUM);
13438 add_to_hard_reg_set (&set_up_by_prologue, Pmode,
13439 HARD_FRAME_POINTER_REGNUM);
13440 FOR_EACH_BB_FN (bb, cfun)
13441 {
13442 rtx_insn *insn;
13443 FOR_BB_INSNS (bb, insn)
13444 if (NONDEBUG_INSN_P (insn)
13445 && requires_stack_frame_p (insn, prologue_used,
13446 set_up_by_prologue))
13447 {
13448 crtl->stack_realign_needed = stack_realign;
13449 crtl->stack_realign_finalized = true;
13450 return;
13451 }
13452 }
13453
13454 /* If drap has been set, but it actually isn't live at the start
13455 of the function, there is no reason to set it up. */
13456 if (crtl->drap_reg)
13457 {
13458 basic_block bb = ENTRY_BLOCK_PTR_FOR_FN (cfun)->next_bb;
13459 if (! REGNO_REG_SET_P (DF_LR_IN (bb), REGNO (crtl->drap_reg)))
13460 {
13461 crtl->drap_reg = NULL_RTX;
13462 crtl->need_drap = false;
13463 }
13464 }
13465 else
13466 cfun->machine->no_drap_save_restore = true;
13467
13468 frame_pointer_needed = false;
13469 stack_realign = false;
13470 crtl->max_used_stack_slot_alignment = incoming_stack_boundary;
13471 crtl->stack_alignment_needed = incoming_stack_boundary;
13472 crtl->stack_alignment_estimated = incoming_stack_boundary;
13473 if (crtl->preferred_stack_boundary > incoming_stack_boundary)
13474 crtl->preferred_stack_boundary = incoming_stack_boundary;
13475 df_finish_pass (true);
13476 df_scan_alloc (NULL);
13477 df_scan_blocks ();
13478 df_compute_regs_ever_live (true);
13479 df_analyze ();
13480 }
13481
13482 crtl->stack_realign_needed = stack_realign;
13483 crtl->stack_realign_finalized = true;
13484 }
13485
13486 /* Delete SET_GOT right after entry block if it is allocated to reg. */
13487
13488 static void
13489 ix86_elim_entry_set_got (rtx reg)
13490 {
13491 basic_block bb = ENTRY_BLOCK_PTR_FOR_FN (cfun)->next_bb;
13492 rtx_insn *c_insn = BB_HEAD (bb);
13493 if (!NONDEBUG_INSN_P (c_insn))
13494 c_insn = next_nonnote_nondebug_insn (c_insn);
13495 if (c_insn && NONJUMP_INSN_P (c_insn))
13496 {
13497 rtx pat = PATTERN (c_insn);
13498 if (GET_CODE (pat) == PARALLEL)
13499 {
13500 rtx vec = XVECEXP (pat, 0, 0);
13501 if (GET_CODE (vec) == SET
13502 && XINT (XEXP (vec, 1), 1) == UNSPEC_SET_GOT
13503 && REGNO (XEXP (vec, 0)) == REGNO (reg))
13504 delete_insn (c_insn);
13505 }
13506 }
13507 }
13508
13509 /* Expand the prologue into a bunch of separate insns. */
13510
13511 void
13512 ix86_expand_prologue (void)
13513 {
13514 struct machine_function *m = cfun->machine;
13515 rtx insn, t;
13516 struct ix86_frame frame;
13517 HOST_WIDE_INT allocate;
13518 bool int_registers_saved;
13519 bool sse_registers_saved;
13520 rtx static_chain = NULL_RTX;
13521
13522 ix86_finalize_stack_realign_flags ();
13523
13524 /* DRAP should not coexist with stack_realign_fp */
13525 gcc_assert (!(crtl->drap_reg && stack_realign_fp));
13526
13527 memset (&m->fs, 0, sizeof (m->fs));
13528
13529 /* Initialize CFA state for before the prologue. */
13530 m->fs.cfa_reg = stack_pointer_rtx;
13531 m->fs.cfa_offset = INCOMING_FRAME_SP_OFFSET;
13532
13533 /* Track SP offset to the CFA. We continue tracking this after we've
13534 swapped the CFA register away from SP. In the case of re-alignment
13535 this is fudged; we're interested in offsets within the local frame.  */
13536 m->fs.sp_offset = INCOMING_FRAME_SP_OFFSET;
13537 m->fs.sp_valid = true;
13538
13539 ix86_compute_frame_layout (&frame);
13540
13541 if (!TARGET_64BIT && ix86_function_ms_hook_prologue (current_function_decl))
13542 {
13543 /* We should have already generated an error for any use of
13544 ms_hook on a nested function. */
13545 gcc_checking_assert (!ix86_static_chain_on_stack);
13546
13547 /* Check if profiling is active and we shall use the profiling-before-
13548 prologue variant.  If so, issue a sorry.  */
13549 if (crtl->profile && flag_fentry != 0)
13550 sorry ("ms_hook_prologue attribute isn%'t compatible "
13551 "with -mfentry for 32-bit");
13552
13553 /* In ix86_asm_output_function_label we emitted:
13554 8b ff movl.s %edi,%edi
13555 55 push %ebp
13556 8b ec movl.s %esp,%ebp
13557
13558 This matches the hookable function prologue in Win32 API
13559 functions in Microsoft Windows XP Service Pack 2 and newer.
13560 Wine uses this to enable Windows apps to hook the Win32 API
13561 functions provided by Wine.
13562
13563 What that means is that we've already set up the frame pointer. */
13564
13565 if (frame_pointer_needed
13566 && !(crtl->drap_reg && crtl->stack_realign_needed))
13567 {
13568 rtx push, mov;
13569
13570 /* We've decided to use the frame pointer already set up.
13571 Describe this to the unwinder by pretending that both
13572 push and mov insns happen right here.
13573
13574 Putting the unwind info here at the end of the ms_hook
13575 is done so that we can make absolutely certain we get
13576 the required byte sequence at the start of the function,
13577 rather than relying on an assembler that can produce
13578 the exact encoding required.
13579
13580 However it does mean (in the unpatched case) that we have
13581 a 1 insn window where the asynchronous unwind info is
13582 incorrect. However, if we placed the unwind info at
13583 its correct location we would have incorrect unwind info
13584 in the patched case. Which is probably all moot since
13585 I don't expect Wine generates dwarf2 unwind info for the
13586 system libraries that use this feature. */
13587
13588 insn = emit_insn (gen_blockage ());
13589
13590 push = gen_push (hard_frame_pointer_rtx);
13591 mov = gen_rtx_SET (hard_frame_pointer_rtx,
13592 stack_pointer_rtx);
13593 RTX_FRAME_RELATED_P (push) = 1;
13594 RTX_FRAME_RELATED_P (mov) = 1;
13595
13596 RTX_FRAME_RELATED_P (insn) = 1;
13597 add_reg_note (insn, REG_FRAME_RELATED_EXPR,
13598 gen_rtx_PARALLEL (VOIDmode, gen_rtvec (2, push, mov)));
13599
13600 /* Note that gen_push incremented m->fs.cfa_offset, even
13601 though we didn't emit the push insn here. */
13602 m->fs.cfa_reg = hard_frame_pointer_rtx;
13603 m->fs.fp_offset = m->fs.cfa_offset;
13604 m->fs.fp_valid = true;
13605 }
13606 else
13607 {
13608 /* The frame pointer is not needed so pop %ebp again.
13609 This leaves us with a pristine state. */
13610 emit_insn (gen_pop (hard_frame_pointer_rtx));
13611 }
13612 }
13613
13614 /* The first insn of a function that accepts its static chain on the
13615 stack is to push the register that would be filled in by a direct
13616 call. This insn will be skipped by the trampoline. */
13617 else if (ix86_static_chain_on_stack)
13618 {
13619 static_chain = ix86_static_chain (cfun->decl, false);
13620 insn = emit_insn (gen_push (static_chain));
13621 emit_insn (gen_blockage ());
13622
13623 /* We don't want to interpret this push insn as a register save,
13624 only as a stack adjustment. The real copy of the register as
13625 a save will be done later, if needed. */
13626 t = plus_constant (Pmode, stack_pointer_rtx, -UNITS_PER_WORD);
13627 t = gen_rtx_SET (stack_pointer_rtx, t);
13628 add_reg_note (insn, REG_CFA_ADJUST_CFA, t);
13629 RTX_FRAME_RELATED_P (insn) = 1;
13630 }
13631
13632   /* Emit prologue code to adjust stack alignment and set up the DRAP, in
13633      case the DRAP is needed and stack realignment is really needed after reload.  */
13634 if (stack_realign_drap)
13635 {
13636 int align_bytes = crtl->stack_alignment_needed / BITS_PER_UNIT;
13637
13638 /* Can't use DRAP in interrupt function. */
13639 if (cfun->machine->func_type != TYPE_NORMAL)
13640 sorry ("Dynamic Realign Argument Pointer (DRAP) not supported "
13641 "in interrupt service routine. This may be worked "
13642 "around by avoiding functions with aggregate return.");
13643
13644       /* Only need to push the parameter pointer reg if it is callee saved.  */
13645 if (!call_used_regs[REGNO (crtl->drap_reg)])
13646 {
13647 /* Push arg pointer reg */
13648 insn = emit_insn (gen_push (crtl->drap_reg));
13649 RTX_FRAME_RELATED_P (insn) = 1;
13650 }
13651
13652 /* Grab the argument pointer. */
13653 t = plus_constant (Pmode, stack_pointer_rtx, m->fs.sp_offset);
13654 insn = emit_insn (gen_rtx_SET (crtl->drap_reg, t));
13655 RTX_FRAME_RELATED_P (insn) = 1;
13656 m->fs.cfa_reg = crtl->drap_reg;
13657 m->fs.cfa_offset = 0;
13658
13659 /* Align the stack. */
13660 insn = emit_insn (ix86_gen_andsp (stack_pointer_rtx,
13661 stack_pointer_rtx,
13662 GEN_INT (-align_bytes)));
13663 RTX_FRAME_RELATED_P (insn) = 1;
13664
13665 /* Replicate the return address on the stack so that return
13666 address can be reached via (argp - 1) slot. This is needed
13667 to implement macro RETURN_ADDR_RTX and intrinsic function
13668 expand_builtin_return_addr etc. */
13669 t = plus_constant (Pmode, crtl->drap_reg, -UNITS_PER_WORD);
13670 t = gen_frame_mem (word_mode, t);
13671 insn = emit_insn (gen_push (t));
13672 RTX_FRAME_RELATED_P (insn) = 1;
13673
13674 /* For the purposes of frame and register save area addressing,
13675 we've started over with a new frame. */
13676 m->fs.sp_offset = INCOMING_FRAME_SP_OFFSET;
13677 m->fs.realigned = true;
13678
13679 if (static_chain)
13680 {
13681 /* Replicate static chain on the stack so that static chain
13682 can be reached via (argp - 2) slot. This is needed for
13683 nested function with stack realignment. */
13684 insn = emit_insn (gen_push (static_chain));
13685 RTX_FRAME_RELATED_P (insn) = 1;
13686 }
13687 }
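  /* Editor's sketch (illustrative only, not emitted code): at this point
     crtl->drap_reg holds the address of the incoming stack arguments (the
     CFA), the stack pointer has been aligned down to
     crtl->stack_alignment_needed, and copies of the return address and (for
     nested functions) the static chain have been pushed onto the newly
     aligned stack so that they can be reached via the (argp - 1) and
     (argp - 2) slots respectively.  */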
13688
13689 int_registers_saved = (frame.nregs == 0);
13690 sse_registers_saved = (frame.nsseregs == 0);
13691
13692 if (frame_pointer_needed && !m->fs.fp_valid)
13693 {
13694 /* Note: AT&T enter does NOT have reversed args. Enter is probably
13695 slower on all targets. Also sdb doesn't like it. */
13696 insn = emit_insn (gen_push (hard_frame_pointer_rtx));
13697 RTX_FRAME_RELATED_P (insn) = 1;
13698
13699 /* Push registers now, before setting the frame pointer
13700 on SEH target. */
13701 if (!int_registers_saved
13702 && TARGET_SEH
13703 && !frame.save_regs_using_mov)
13704 {
13705 ix86_emit_save_regs ();
13706 int_registers_saved = true;
13707 gcc_assert (m->fs.sp_offset == frame.reg_save_offset);
13708 }
13709
13710 if (m->fs.sp_offset == frame.hard_frame_pointer_offset)
13711 {
13712 insn = emit_move_insn (hard_frame_pointer_rtx, stack_pointer_rtx);
13713 RTX_FRAME_RELATED_P (insn) = 1;
13714
13715 if (m->fs.cfa_reg == stack_pointer_rtx)
13716 m->fs.cfa_reg = hard_frame_pointer_rtx;
13717 m->fs.fp_offset = m->fs.sp_offset;
13718 m->fs.fp_valid = true;
13719 }
13720 }
13721
13722 if (!int_registers_saved)
13723 {
13724 /* If saving registers via PUSH, do so now. */
13725 if (!frame.save_regs_using_mov)
13726 {
13727 ix86_emit_save_regs ();
13728 int_registers_saved = true;
13729 gcc_assert (m->fs.sp_offset == frame.reg_save_offset);
13730 }
13731
13732       /* When using the red zone we may start register saving before allocating
13733 	 the stack frame, saving one cycle of the prologue.  However, avoid
13734 doing this if we have to probe the stack; at least on x86_64 the
13735 stack probe can turn into a call that clobbers a red zone location. */
13736 else if (ix86_using_red_zone ()
13737 && (! TARGET_STACK_PROBE
13738 || frame.stack_pointer_offset < CHECK_STACK_LIMIT))
13739 {
13740 ix86_emit_save_regs_using_mov (frame.reg_save_offset);
13741 int_registers_saved = true;
13742 }
13743 }
13744
13745 if (stack_realign_fp)
13746 {
13747 int align_bytes = crtl->stack_alignment_needed / BITS_PER_UNIT;
13748 gcc_assert (align_bytes > MIN_STACK_BOUNDARY / BITS_PER_UNIT);
13749
13750 /* The computation of the size of the re-aligned stack frame means
13751 that we must allocate the size of the register save area before
13752 performing the actual alignment. Otherwise we cannot guarantee
13753 that there's enough storage above the realignment point. */
13754 if (m->fs.sp_offset != frame.sse_reg_save_offset)
13755 pro_epilogue_adjust_stack (stack_pointer_rtx, stack_pointer_rtx,
13756 GEN_INT (m->fs.sp_offset
13757 - frame.sse_reg_save_offset),
13758 -1, false);
13759
13760 /* Align the stack. */
13761 insn = emit_insn (ix86_gen_andsp (stack_pointer_rtx,
13762 stack_pointer_rtx,
13763 GEN_INT (-align_bytes)));
13764
13765 /* For the purposes of register save area addressing, the stack
13766 pointer is no longer valid. As for the value of sp_offset,
13767 see ix86_compute_frame_layout, which we need to match in order
13768 to pass verification of stack_pointer_offset at the end. */
13769 m->fs.sp_offset = ROUND_UP (m->fs.sp_offset, align_bytes);
13770 m->fs.sp_valid = false;
13771 }
13772
13773 allocate = frame.stack_pointer_offset - m->fs.sp_offset;
13774
13775 if (flag_stack_usage_info)
13776 {
13777 /* We start to count from ARG_POINTER. */
13778 HOST_WIDE_INT stack_size = frame.stack_pointer_offset;
13779
13780 /* If it was realigned, take into account the fake frame. */
13781 if (stack_realign_drap)
13782 {
13783 if (ix86_static_chain_on_stack)
13784 stack_size += UNITS_PER_WORD;
13785
13786 if (!call_used_regs[REGNO (crtl->drap_reg)])
13787 stack_size += UNITS_PER_WORD;
13788
13789 /* This over-estimates by 1 minimal-stack-alignment-unit but
13790 mitigates that by counting in the new return address slot. */
13791 current_function_dynamic_stack_size
13792 += crtl->stack_alignment_needed / BITS_PER_UNIT;
13793 }
13794
13795 current_function_static_stack_size = stack_size;
13796 }
13797
13798 /* On SEH target with very large frame size, allocate an area to save
13799 SSE registers (as the very large allocation won't be described). */
13800 if (TARGET_SEH
13801 && frame.stack_pointer_offset > SEH_MAX_FRAME_SIZE
13802 && !sse_registers_saved)
13803 {
13804 HOST_WIDE_INT sse_size =
13805 frame.sse_reg_save_offset - frame.reg_save_offset;
13806
13807 gcc_assert (int_registers_saved);
13808
13809 /* No need to do stack checking as the area will be immediately
13810 written. */
13811 pro_epilogue_adjust_stack (stack_pointer_rtx, stack_pointer_rtx,
13812 GEN_INT (-sse_size), -1,
13813 m->fs.cfa_reg == stack_pointer_rtx);
13814 allocate -= sse_size;
13815 ix86_emit_save_sse_regs_using_mov (frame.sse_reg_save_offset);
13816 sse_registers_saved = true;
13817 }
13818
13819 /* The stack has already been decremented by the instruction calling us
13820 so probe if the size is non-negative to preserve the protection area. */
13821 if (allocate >= 0 && flag_stack_check == STATIC_BUILTIN_STACK_CHECK)
13822 {
13823 /* We expect the registers to be saved when probes are used. */
13824 gcc_assert (int_registers_saved);
13825
13826 if (STACK_CHECK_MOVING_SP)
13827 {
13828 if (!(crtl->is_leaf && !cfun->calls_alloca
13829 && allocate <= PROBE_INTERVAL))
13830 {
13831 ix86_adjust_stack_and_probe (allocate);
13832 allocate = 0;
13833 }
13834 }
13835 else
13836 {
13837 HOST_WIDE_INT size = allocate;
13838
13839 if (TARGET_64BIT && size >= HOST_WIDE_INT_C (0x80000000))
13840 size = 0x80000000 - STACK_CHECK_PROTECT - 1;
13841
13842 if (TARGET_STACK_PROBE)
13843 {
13844 if (crtl->is_leaf && !cfun->calls_alloca)
13845 {
13846 if (size > PROBE_INTERVAL)
13847 ix86_emit_probe_stack_range (0, size);
13848 }
13849 else
13850 ix86_emit_probe_stack_range (0, size + STACK_CHECK_PROTECT);
13851 }
13852 else
13853 {
13854 if (crtl->is_leaf && !cfun->calls_alloca)
13855 {
13856 if (size > PROBE_INTERVAL && size > STACK_CHECK_PROTECT)
13857 ix86_emit_probe_stack_range (STACK_CHECK_PROTECT,
13858 size - STACK_CHECK_PROTECT);
13859 }
13860 else
13861 ix86_emit_probe_stack_range (STACK_CHECK_PROTECT, size);
13862 }
13863 }
13864 }
13865
13866 if (allocate == 0)
13867 ;
13868 else if (!ix86_target_stack_probe ()
13869 || frame.stack_pointer_offset < CHECK_STACK_LIMIT)
13870 {
13871 pro_epilogue_adjust_stack (stack_pointer_rtx, stack_pointer_rtx,
13872 GEN_INT (-allocate), -1,
13873 m->fs.cfa_reg == stack_pointer_rtx);
13874 }
13875 else
13876 {
13877 rtx eax = gen_rtx_REG (Pmode, AX_REG);
13878 rtx r10 = NULL;
13879 rtx (*adjust_stack_insn)(rtx, rtx, rtx);
13880 const bool sp_is_cfa_reg = (m->fs.cfa_reg == stack_pointer_rtx);
13881 bool eax_live = ix86_eax_live_at_start_p ();
13882 bool r10_live = false;
13883
13884 if (TARGET_64BIT)
13885 r10_live = (DECL_STATIC_CHAIN (current_function_decl) != 0);
13886
13887 if (eax_live)
13888 {
13889 insn = emit_insn (gen_push (eax));
13890 allocate -= UNITS_PER_WORD;
13891 /* Note that SEH directives need to continue tracking the stack
13892 pointer even after the frame pointer has been set up. */
13893 if (sp_is_cfa_reg || TARGET_SEH)
13894 {
13895 if (sp_is_cfa_reg)
13896 m->fs.cfa_offset += UNITS_PER_WORD;
13897 RTX_FRAME_RELATED_P (insn) = 1;
13898 add_reg_note (insn, REG_FRAME_RELATED_EXPR,
13899 gen_rtx_SET (stack_pointer_rtx,
13900 plus_constant (Pmode, stack_pointer_rtx,
13901 -UNITS_PER_WORD)));
13902 }
13903 }
13904
13905 if (r10_live)
13906 {
13907 r10 = gen_rtx_REG (Pmode, R10_REG);
13908 insn = emit_insn (gen_push (r10));
13909 allocate -= UNITS_PER_WORD;
13910 if (sp_is_cfa_reg || TARGET_SEH)
13911 {
13912 if (sp_is_cfa_reg)
13913 m->fs.cfa_offset += UNITS_PER_WORD;
13914 RTX_FRAME_RELATED_P (insn) = 1;
13915 add_reg_note (insn, REG_FRAME_RELATED_EXPR,
13916 gen_rtx_SET (stack_pointer_rtx,
13917 plus_constant (Pmode, stack_pointer_rtx,
13918 -UNITS_PER_WORD)));
13919 }
13920 }
13921
13922 emit_move_insn (eax, GEN_INT (allocate));
13923 emit_insn (ix86_gen_allocate_stack_worker (eax, eax));
13924
13925 /* Use the fact that AX still contains ALLOCATE. */
13926 adjust_stack_insn = (Pmode == DImode
13927 ? gen_pro_epilogue_adjust_stack_di_sub
13928 : gen_pro_epilogue_adjust_stack_si_sub);
13929
13930 insn = emit_insn (adjust_stack_insn (stack_pointer_rtx,
13931 stack_pointer_rtx, eax));
13932
13933 if (sp_is_cfa_reg || TARGET_SEH)
13934 {
13935 if (sp_is_cfa_reg)
13936 m->fs.cfa_offset += allocate;
13937 RTX_FRAME_RELATED_P (insn) = 1;
13938 add_reg_note (insn, REG_FRAME_RELATED_EXPR,
13939 gen_rtx_SET (stack_pointer_rtx,
13940 plus_constant (Pmode, stack_pointer_rtx,
13941 -allocate)));
13942 }
13943 m->fs.sp_offset += allocate;
13944
13945 /* Use stack_pointer_rtx for relative addressing so that code
13946 works for realigned stack, too. */
13947 if (r10_live && eax_live)
13948 {
13949 t = gen_rtx_PLUS (Pmode, stack_pointer_rtx, eax);
13950 emit_move_insn (gen_rtx_REG (word_mode, R10_REG),
13951 gen_frame_mem (word_mode, t));
13952 t = plus_constant (Pmode, t, UNITS_PER_WORD);
13953 emit_move_insn (gen_rtx_REG (word_mode, AX_REG),
13954 gen_frame_mem (word_mode, t));
13955 }
13956 else if (eax_live || r10_live)
13957 {
13958 t = gen_rtx_PLUS (Pmode, stack_pointer_rtx, eax);
13959 emit_move_insn (gen_rtx_REG (word_mode,
13960 (eax_live ? AX_REG : R10_REG)),
13961 gen_frame_mem (word_mode, t));
13962 }
13963 }
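  /* Editor's sketch (illustrative; exact mnemonics and the helper's name
     depend on the target):  the large-allocation path above typically
     ends up as something like

	 movq  $ALLOCATE, %rax
	 call  <stack probing helper>   # e.g. ___chkstk_ms on mingw-w64
	 subq  %rax, %rsp

     with %rax (and %r10 for nested functions) pushed beforehand and
     reloaded from the new stack top afterwards when live at entry.  */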
13964 gcc_assert (m->fs.sp_offset == frame.stack_pointer_offset);
13965
13966   /* If we haven't already set up the frame pointer, do so now.  */
13967 if (frame_pointer_needed && !m->fs.fp_valid)
13968 {
13969 insn = ix86_gen_add3 (hard_frame_pointer_rtx, stack_pointer_rtx,
13970 GEN_INT (frame.stack_pointer_offset
13971 - frame.hard_frame_pointer_offset));
13972 insn = emit_insn (insn);
13973 RTX_FRAME_RELATED_P (insn) = 1;
13974 add_reg_note (insn, REG_CFA_ADJUST_CFA, NULL);
13975
13976 if (m->fs.cfa_reg == stack_pointer_rtx)
13977 m->fs.cfa_reg = hard_frame_pointer_rtx;
13978 m->fs.fp_offset = frame.hard_frame_pointer_offset;
13979 m->fs.fp_valid = true;
13980 }
13981
13982 if (!int_registers_saved)
13983 ix86_emit_save_regs_using_mov (frame.reg_save_offset);
13984 if (!sse_registers_saved)
13985 ix86_emit_save_sse_regs_using_mov (frame.sse_reg_save_offset);
13986
13987   /* For mcount profiling in 32-bit PIC mode we need to emit SET_GOT
13988      in the prologue.  */
13989 if (!TARGET_64BIT && pic_offset_table_rtx && crtl->profile && !flag_fentry)
13990 {
13991 rtx pic = gen_rtx_REG (Pmode, REAL_PIC_OFFSET_TABLE_REGNUM);
13992 insn = emit_insn (gen_set_got (pic));
13993 RTX_FRAME_RELATED_P (insn) = 1;
13994 add_reg_note (insn, REG_CFA_FLUSH_QUEUE, NULL_RTX);
13995 emit_insn (gen_prologue_use (pic));
13996       /* Delete an already emitted SET_GOT if it exists and is allocated to
13997 	 REAL_PIC_OFFSET_TABLE_REGNUM.  */
13998 ix86_elim_entry_set_got (pic);
13999 }
14000
14001 if (crtl->drap_reg && !crtl->stack_realign_needed)
14002 {
14003       /* vDRAP is set up, but after reload it turns out stack realignment
14004          isn't necessary; emit prologue code here to set up the DRAP
14005          without the stack realignment adjustment.  */
14006 t = choose_baseaddr (0);
14007 emit_insn (gen_rtx_SET (crtl->drap_reg, t));
14008 }
14009
14010 /* Prevent instructions from being scheduled into register save push
14011 sequence when access to the redzone area is done through frame pointer.
14012 The offset between the frame pointer and the stack pointer is calculated
14013 relative to the value of the stack pointer at the end of the function
14014 prologue, and moving instructions that access redzone area via frame
14015 pointer inside push sequence violates this assumption. */
14016 if (frame_pointer_needed && frame.red_zone_size)
14017 emit_insn (gen_memory_blockage ());
14018
14019 /* SEH requires that the prologue end within 256 bytes of the start of
14020 the function. Prevent instruction schedules that would extend that.
14021 Further, prevent alloca modifications to the stack pointer from being
14022 combined with prologue modifications. */
14023 if (TARGET_SEH)
14024 emit_insn (gen_prologue_use (stack_pointer_rtx));
14025 }
14026
14027 /* Emit code to restore REG using a POP insn. */
14028
14029 static void
14030 ix86_emit_restore_reg_using_pop (rtx reg)
14031 {
14032 struct machine_function *m = cfun->machine;
14033 rtx_insn *insn = emit_insn (gen_pop (reg));
14034
14035 ix86_add_cfa_restore_note (insn, reg, m->fs.sp_offset);
14036 m->fs.sp_offset -= UNITS_PER_WORD;
14037
14038 if (m->fs.cfa_reg == crtl->drap_reg
14039 && REGNO (reg) == REGNO (crtl->drap_reg))
14040 {
14041 /* Previously we'd represented the CFA as an expression
14042 like *(%ebp - 8). We've just popped that value from
14043 the stack, which means we need to reset the CFA to
14044 the drap register. This will remain until we restore
14045 the stack pointer. */
14046 add_reg_note (insn, REG_CFA_DEF_CFA, reg);
14047 RTX_FRAME_RELATED_P (insn) = 1;
14048
14049 /* This means that the DRAP register is valid for addressing too. */
14050 m->fs.drap_valid = true;
14051 return;
14052 }
14053
14054 if (m->fs.cfa_reg == stack_pointer_rtx)
14055 {
14056 rtx x = plus_constant (Pmode, stack_pointer_rtx, UNITS_PER_WORD);
14057 x = gen_rtx_SET (stack_pointer_rtx, x);
14058 add_reg_note (insn, REG_CFA_ADJUST_CFA, x);
14059 RTX_FRAME_RELATED_P (insn) = 1;
14060
14061 m->fs.cfa_offset -= UNITS_PER_WORD;
14062 }
14063
14064 /* When the frame pointer is the CFA, and we pop it, we are
14065 swapping back to the stack pointer as the CFA. This happens
14066 for stack frames that don't allocate other data, so we assume
14067 the stack pointer is now pointing at the return address, i.e.
14068 the function entry state, which makes the offset be 1 word. */
14069 if (reg == hard_frame_pointer_rtx)
14070 {
14071 m->fs.fp_valid = false;
14072 if (m->fs.cfa_reg == hard_frame_pointer_rtx)
14073 {
14074 m->fs.cfa_reg = stack_pointer_rtx;
14075 m->fs.cfa_offset -= UNITS_PER_WORD;
14076
14077 add_reg_note (insn, REG_CFA_DEF_CFA,
14078 gen_rtx_PLUS (Pmode, stack_pointer_rtx,
14079 GEN_INT (m->fs.cfa_offset)));
14080 RTX_FRAME_RELATED_P (insn) = 1;
14081 }
14082 }
14083 }
14084
14085 /* Emit code to restore saved registers using POP insns. */
14086
14087 static void
14088 ix86_emit_restore_regs_using_pop (void)
14089 {
14090 unsigned int regno;
14091
14092 for (regno = 0; regno < FIRST_PSEUDO_REGISTER; regno++)
14093 if (GENERAL_REGNO_P (regno) && ix86_save_reg (regno, false))
14094 ix86_emit_restore_reg_using_pop (gen_rtx_REG (word_mode, regno));
14095 }
14096
14097 /* Emit code and notes for the LEAVE instruction. */
14098
14099 static void
14100 ix86_emit_leave (void)
14101 {
14102 struct machine_function *m = cfun->machine;
14103 rtx_insn *insn = emit_insn (ix86_gen_leave ());
14104
14105 ix86_add_queued_cfa_restore_notes (insn);
14106
14107 gcc_assert (m->fs.fp_valid);
14108 m->fs.sp_valid = true;
14109 m->fs.sp_offset = m->fs.fp_offset - UNITS_PER_WORD;
14110 m->fs.fp_valid = false;
14111
14112 if (m->fs.cfa_reg == hard_frame_pointer_rtx)
14113 {
14114 m->fs.cfa_reg = stack_pointer_rtx;
14115 m->fs.cfa_offset = m->fs.sp_offset;
14116
14117 add_reg_note (insn, REG_CFA_DEF_CFA,
14118 plus_constant (Pmode, stack_pointer_rtx,
14119 m->fs.sp_offset));
14120 RTX_FRAME_RELATED_P (insn) = 1;
14121 }
14122 ix86_add_cfa_restore_note (insn, hard_frame_pointer_rtx,
14123 m->fs.fp_offset);
14124 }
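/* Editor's note: "leave" is architecturally equivalent to

       movl  %ebp, %esp
       popl  %ebp

   which is why the code above marks the stack pointer valid again at
   fp_offset - UNITS_PER_WORD and invalidates the frame pointer.  */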
14125
14126 /* Emit code to restore saved registers using MOV insns.
14127 First register is restored from CFA - CFA_OFFSET. */
14128 static void
14129 ix86_emit_restore_regs_using_mov (HOST_WIDE_INT cfa_offset,
14130 bool maybe_eh_return)
14131 {
14132 struct machine_function *m = cfun->machine;
14133 unsigned int regno;
14134
14135 for (regno = 0; regno < FIRST_PSEUDO_REGISTER; regno++)
14136 if (GENERAL_REGNO_P (regno) && ix86_save_reg (regno, maybe_eh_return))
14137 {
14138 rtx reg = gen_rtx_REG (word_mode, regno);
14139 rtx mem;
14140 rtx_insn *insn;
14141
14142 mem = choose_baseaddr (cfa_offset);
14143 mem = gen_frame_mem (word_mode, mem);
14144 insn = emit_move_insn (reg, mem);
14145
14146 if (m->fs.cfa_reg == crtl->drap_reg && regno == REGNO (crtl->drap_reg))
14147 {
14148 	  /* Previously we'd represented the CFA as an expression
14149 	     like *(%ebp - 8).  We've just reloaded that value from
14150 	     the stack, which means we need to reset the CFA to
14151 	     the drap register.  This will remain until we restore
14152 	     the stack pointer.  */
14153 add_reg_note (insn, REG_CFA_DEF_CFA, reg);
14154 RTX_FRAME_RELATED_P (insn) = 1;
14155
14156 /* This means that the DRAP register is valid for addressing. */
14157 m->fs.drap_valid = true;
14158 }
14159 else
14160 ix86_add_cfa_restore_note (NULL, reg, cfa_offset);
14161
14162 cfa_offset -= UNITS_PER_WORD;
14163 }
14164 }
14165
14166 /* Emit code to restore saved SSE registers using MOV insns.
14167    First register is restored from CFA - CFA_OFFSET.  */
14168 static void
14169 ix86_emit_restore_sse_regs_using_mov (HOST_WIDE_INT cfa_offset,
14170 bool maybe_eh_return)
14171 {
14172 unsigned int regno;
14173
14174 for (regno = 0; regno < FIRST_PSEUDO_REGISTER; regno++)
14175 if (SSE_REGNO_P (regno) && ix86_save_reg (regno, maybe_eh_return))
14176 {
14177 rtx reg = gen_rtx_REG (V4SFmode, regno);
14178 rtx mem;
14179 unsigned int align;
14180
14181 mem = choose_baseaddr (cfa_offset);
14182 mem = gen_rtx_MEM (V4SFmode, mem);
14183
14184 /* The location is aligned up to INCOMING_STACK_BOUNDARY. */
14185 align = MIN (GET_MODE_ALIGNMENT (V4SFmode), INCOMING_STACK_BOUNDARY);
14186 set_mem_align (mem, align);
14187 emit_insn (gen_rtx_SET (reg, mem));
14188
14189 ix86_add_cfa_restore_note (NULL, reg, cfa_offset);
14190
14191 cfa_offset -= GET_MODE_SIZE (V4SFmode);
14192 }
14193 }
14194
14195 /* Restore function stack, frame, and registers. */
14196
14197 void
14198 ix86_expand_epilogue (int style)
14199 {
14200 struct machine_function *m = cfun->machine;
14201 struct machine_frame_state frame_state_save = m->fs;
14202 struct ix86_frame frame;
14203 bool restore_regs_via_mov;
14204 bool using_drap;
14205
14206 ix86_finalize_stack_realign_flags ();
14207 ix86_compute_frame_layout (&frame);
14208
14209 m->fs.sp_valid = (!frame_pointer_needed
14210 || (crtl->sp_is_unchanging
14211 && !stack_realign_fp));
14212 gcc_assert (!m->fs.sp_valid
14213 || m->fs.sp_offset == frame.stack_pointer_offset);
14214
14215 /* The FP must be valid if the frame pointer is present. */
14216 gcc_assert (frame_pointer_needed == m->fs.fp_valid);
14217 gcc_assert (!m->fs.fp_valid
14218 || m->fs.fp_offset == frame.hard_frame_pointer_offset);
14219
14220 /* We must have *some* valid pointer to the stack frame. */
14221 gcc_assert (m->fs.sp_valid || m->fs.fp_valid);
14222
14223 /* The DRAP is never valid at this point. */
14224 gcc_assert (!m->fs.drap_valid);
14225
14226 /* See the comment about red zone and frame
14227 pointer usage in ix86_expand_prologue. */
14228 if (frame_pointer_needed && frame.red_zone_size)
14229 emit_insn (gen_memory_blockage ());
14230
14231 using_drap = crtl->drap_reg && crtl->stack_realign_needed;
14232 gcc_assert (!using_drap || m->fs.cfa_reg == crtl->drap_reg);
14233
14234 /* Determine the CFA offset of the end of the red-zone. */
14235 m->fs.red_zone_offset = 0;
14236 if (ix86_using_red_zone () && crtl->args.pops_args < 65536)
14237 {
14238 /* The red-zone begins below the return address. */
14239 m->fs.red_zone_offset = RED_ZONE_SIZE + UNITS_PER_WORD;
14240
14241 /* When the register save area is in the aligned portion of
14242 the stack, determine the maximum runtime displacement that
14243 matches up with the aligned frame. */
14244 if (stack_realign_drap)
14245 m->fs.red_zone_offset -= (crtl->stack_alignment_needed / BITS_PER_UNIT
14246 + UNITS_PER_WORD);
14247 }
14248
14249 /* Special care must be taken for the normal return case of a function
14250 using eh_return: the eax and edx registers are marked as saved, but
14251 not restored along this path. Adjust the save location to match. */
14252 if (crtl->calls_eh_return && style != 2)
14253 frame.reg_save_offset -= 2 * UNITS_PER_WORD;
14254
14255 /* EH_RETURN requires the use of moves to function properly. */
14256 if (crtl->calls_eh_return)
14257 restore_regs_via_mov = true;
14258 /* SEH requires the use of pops to identify the epilogue. */
14259 else if (TARGET_SEH)
14260 restore_regs_via_mov = false;
14261   /* If we're only restoring one register and sp is not valid, then
14262      use a move instruction to restore the register, since it's
14263      less work than reloading sp and popping the register.  */
14264 else if (!m->fs.sp_valid && frame.nregs <= 1)
14265 restore_regs_via_mov = true;
14266 else if (TARGET_EPILOGUE_USING_MOVE
14267 && cfun->machine->use_fast_prologue_epilogue
14268 && (frame.nregs > 1
14269 || m->fs.sp_offset != frame.reg_save_offset))
14270 restore_regs_via_mov = true;
14271 else if (frame_pointer_needed
14272 && !frame.nregs
14273 && m->fs.sp_offset != frame.reg_save_offset)
14274 restore_regs_via_mov = true;
14275 else if (frame_pointer_needed
14276 && TARGET_USE_LEAVE
14277 && cfun->machine->use_fast_prologue_epilogue
14278 && frame.nregs == 1)
14279 restore_regs_via_mov = true;
14280 else
14281 restore_regs_via_mov = false;
14282
14283 if (restore_regs_via_mov || frame.nsseregs)
14284 {
14285 /* Ensure that the entire register save area is addressable via
14286 the stack pointer, if we will restore via sp. */
14287 if (TARGET_64BIT
14288 && m->fs.sp_offset > 0x7fffffff
14289 && !(m->fs.fp_valid || m->fs.drap_valid)
14290 && (frame.nsseregs + frame.nregs) != 0)
14291 {
14292 pro_epilogue_adjust_stack (stack_pointer_rtx, stack_pointer_rtx,
14293 GEN_INT (m->fs.sp_offset
14294 - frame.sse_reg_save_offset),
14295 style,
14296 m->fs.cfa_reg == stack_pointer_rtx);
14297 }
14298 }
14299
14300 /* If there are any SSE registers to restore, then we have to do it
14301 via moves, since there's obviously no pop for SSE regs. */
14302 if (frame.nsseregs)
14303 ix86_emit_restore_sse_regs_using_mov (frame.sse_reg_save_offset,
14304 style == 2);
14305
14306 if (restore_regs_via_mov)
14307 {
14308 rtx t;
14309
14310 if (frame.nregs)
14311 ix86_emit_restore_regs_using_mov (frame.reg_save_offset, style == 2);
14312
14313 /* eh_return epilogues need %ecx added to the stack pointer. */
14314 if (style == 2)
14315 {
14316 rtx sa = EH_RETURN_STACKADJ_RTX;
14317 rtx_insn *insn;
14318
14319 /* %ecx can't be used for both DRAP register and eh_return. */
14320 if (crtl->drap_reg)
14321 gcc_assert (REGNO (crtl->drap_reg) != CX_REG);
14322
14323 /* regparm nested functions don't work with eh_return. */
14324 gcc_assert (!ix86_static_chain_on_stack);
14325
14326 if (frame_pointer_needed)
14327 {
14328 t = gen_rtx_PLUS (Pmode, hard_frame_pointer_rtx, sa);
14329 t = plus_constant (Pmode, t, m->fs.fp_offset - UNITS_PER_WORD);
14330 emit_insn (gen_rtx_SET (sa, t));
14331
14332 t = gen_frame_mem (Pmode, hard_frame_pointer_rtx);
14333 insn = emit_move_insn (hard_frame_pointer_rtx, t);
14334
14335 /* Note that we use SA as a temporary CFA, as the return
14336 address is at the proper place relative to it. We
14337 pretend this happens at the FP restore insn because
14338 prior to this insn the FP would be stored at the wrong
14339 offset relative to SA, and after this insn we have no
14340 other reasonable register to use for the CFA. We don't
14341 bother resetting the CFA to the SP for the duration of
14342 the return insn. */
14343 add_reg_note (insn, REG_CFA_DEF_CFA,
14344 plus_constant (Pmode, sa, UNITS_PER_WORD));
14345 ix86_add_queued_cfa_restore_notes (insn);
14346 add_reg_note (insn, REG_CFA_RESTORE, hard_frame_pointer_rtx);
14347 RTX_FRAME_RELATED_P (insn) = 1;
14348
14349 m->fs.cfa_reg = sa;
14350 m->fs.cfa_offset = UNITS_PER_WORD;
14351 m->fs.fp_valid = false;
14352
14353 pro_epilogue_adjust_stack (stack_pointer_rtx, sa,
14354 const0_rtx, style, false);
14355 }
14356 else
14357 {
14358 t = gen_rtx_PLUS (Pmode, stack_pointer_rtx, sa);
14359 t = plus_constant (Pmode, t, m->fs.sp_offset - UNITS_PER_WORD);
14360 insn = emit_insn (gen_rtx_SET (stack_pointer_rtx, t));
14361 ix86_add_queued_cfa_restore_notes (insn);
14362
14363 gcc_assert (m->fs.cfa_reg == stack_pointer_rtx);
14364 if (m->fs.cfa_offset != UNITS_PER_WORD)
14365 {
14366 m->fs.cfa_offset = UNITS_PER_WORD;
14367 add_reg_note (insn, REG_CFA_DEF_CFA,
14368 plus_constant (Pmode, stack_pointer_rtx,
14369 UNITS_PER_WORD));
14370 RTX_FRAME_RELATED_P (insn) = 1;
14371 }
14372 }
14373 m->fs.sp_offset = UNITS_PER_WORD;
14374 m->fs.sp_valid = true;
14375 }
14376 }
14377 else
14378 {
14379 /* SEH requires that the function end with (1) a stack adjustment
14380 if necessary, (2) a sequence of pops, and (3) a return or
14381 jump instruction. Prevent insns from the function body from
14382 being scheduled into this sequence. */
14383 if (TARGET_SEH)
14384 {
14385 	  /* Prevent a catch region from being adjacent to the standard
14386 	     epilogue sequence.  Unfortunately neither crtl->uses_eh_lsda
14387 	     nor several other flags that would be interesting to test
14388 	     are set up yet.  */
14389 if (flag_non_call_exceptions)
14390 emit_insn (gen_nops (const1_rtx));
14391 else
14392 emit_insn (gen_blockage ());
14393 }
14394
14395 /* First step is to deallocate the stack frame so that we can
14396 pop the registers. Also do it on SEH target for very large
14397 frame as the emitted instructions aren't allowed by the ABI in
14398 epilogues. */
14399 if (!m->fs.sp_valid
14400 || (TARGET_SEH
14401 && (m->fs.sp_offset - frame.reg_save_offset
14402 >= SEH_MAX_FRAME_SIZE)))
14403 {
14404 pro_epilogue_adjust_stack (stack_pointer_rtx, hard_frame_pointer_rtx,
14405 GEN_INT (m->fs.fp_offset
14406 - frame.reg_save_offset),
14407 style, false);
14408 }
14409 else if (m->fs.sp_offset != frame.reg_save_offset)
14410 {
14411 pro_epilogue_adjust_stack (stack_pointer_rtx, stack_pointer_rtx,
14412 GEN_INT (m->fs.sp_offset
14413 - frame.reg_save_offset),
14414 style,
14415 m->fs.cfa_reg == stack_pointer_rtx);
14416 }
14417
14418 ix86_emit_restore_regs_using_pop ();
14419 }
14420
14421   /* If we used a frame pointer and haven't already got rid of it,
14422      then do so now.  */
14423 if (m->fs.fp_valid)
14424 {
14425 /* If the stack pointer is valid and pointing at the frame
14426 pointer store address, then we only need a pop. */
14427 if (m->fs.sp_valid && m->fs.sp_offset == frame.hfp_save_offset)
14428 ix86_emit_restore_reg_using_pop (hard_frame_pointer_rtx);
14429       /* Using "leave" results in shorter dependency chains on CPUs that
14430 	 are able to grok it fast.  */
14431 else if (TARGET_USE_LEAVE
14432 || optimize_bb_for_size_p (EXIT_BLOCK_PTR_FOR_FN (cfun))
14433 || !cfun->machine->use_fast_prologue_epilogue)
14434 ix86_emit_leave ();
14435 else
14436 {
14437 pro_epilogue_adjust_stack (stack_pointer_rtx,
14438 hard_frame_pointer_rtx,
14439 const0_rtx, style, !using_drap);
14440 ix86_emit_restore_reg_using_pop (hard_frame_pointer_rtx);
14441 }
14442 }
14443
14444 if (using_drap)
14445 {
14446 int param_ptr_offset = UNITS_PER_WORD;
14447 rtx_insn *insn;
14448
14449 gcc_assert (stack_realign_drap);
14450
14451 if (ix86_static_chain_on_stack)
14452 param_ptr_offset += UNITS_PER_WORD;
14453 if (!call_used_regs[REGNO (crtl->drap_reg)])
14454 param_ptr_offset += UNITS_PER_WORD;
14455
14456 insn = emit_insn (gen_rtx_SET
14457 (stack_pointer_rtx,
14458 gen_rtx_PLUS (Pmode,
14459 crtl->drap_reg,
14460 GEN_INT (-param_ptr_offset))));
14461 m->fs.cfa_reg = stack_pointer_rtx;
14462 m->fs.cfa_offset = param_ptr_offset;
14463 m->fs.sp_offset = param_ptr_offset;
14464 m->fs.realigned = false;
14465
14466 add_reg_note (insn, REG_CFA_DEF_CFA,
14467 gen_rtx_PLUS (Pmode, stack_pointer_rtx,
14468 GEN_INT (param_ptr_offset)));
14469 RTX_FRAME_RELATED_P (insn) = 1;
14470
14471 if (!call_used_regs[REGNO (crtl->drap_reg)])
14472 ix86_emit_restore_reg_using_pop (crtl->drap_reg);
14473 }
14474
14475 /* At this point the stack pointer must be valid, and we must have
14476 restored all of the registers. We may not have deallocated the
14477 entire stack frame. We've delayed this until now because it may
14478 be possible to merge the local stack deallocation with the
14479 deallocation forced by ix86_static_chain_on_stack. */
14480 gcc_assert (m->fs.sp_valid);
14481 gcc_assert (!m->fs.fp_valid);
14482 gcc_assert (!m->fs.realigned);
14483 if (m->fs.sp_offset != UNITS_PER_WORD)
14484 {
14485 pro_epilogue_adjust_stack (stack_pointer_rtx, stack_pointer_rtx,
14486 GEN_INT (m->fs.sp_offset - UNITS_PER_WORD),
14487 style, true);
14488 }
14489 else
14490 ix86_add_queued_cfa_restore_notes (get_last_insn ());
14491
14492 /* Sibcall epilogues don't want a return instruction. */
14493 if (style == 0)
14494 {
14495 m->fs = frame_state_save;
14496 return;
14497 }
14498
14499 if (cfun->machine->func_type != TYPE_NORMAL)
14500 {
14501       /* Return from an interrupt handler with the "IRET" instruction.
14502 	 In an exception handler, pop the 'ERROR_CODE' off the stack
14503 	 before the 'IRET' instruction.  */
14504 if (cfun->machine->func_type == TYPE_EXCEPTION)
14505 {
14506 rtx r = plus_constant (Pmode, stack_pointer_rtx,
14507 UNITS_PER_WORD);
14508 emit_insn (gen_rtx_SET (stack_pointer_rtx, r));
14509 }
14510 emit_jump_insn (gen_interrupt_return ());
14511 }
14512 else if (crtl->args.pops_args && crtl->args.size)
14513 {
14514 rtx popc = GEN_INT (crtl->args.pops_args);
14515
14516       /* i386 can only pop 64K bytes.  If asked to pop more, pop the return
14517 	 address, do an explicit add, and jump indirectly to the caller.  */
14518
14519 if (crtl->args.pops_args >= 65536)
14520 {
14521 rtx ecx = gen_rtx_REG (SImode, CX_REG);
14522 rtx_insn *insn;
14523
14524 	  /* There is no "pascal" calling convention in any 64-bit ABI.  */
14525 gcc_assert (!TARGET_64BIT);
14526
14527 insn = emit_insn (gen_pop (ecx));
14528 m->fs.cfa_offset -= UNITS_PER_WORD;
14529 m->fs.sp_offset -= UNITS_PER_WORD;
14530
14531 rtx x = plus_constant (Pmode, stack_pointer_rtx, UNITS_PER_WORD);
14532 x = gen_rtx_SET (stack_pointer_rtx, x);
14533 add_reg_note (insn, REG_CFA_ADJUST_CFA, x);
14534 add_reg_note (insn, REG_CFA_REGISTER, gen_rtx_SET (ecx, pc_rtx));
14535 RTX_FRAME_RELATED_P (insn) = 1;
14536
14537 pro_epilogue_adjust_stack (stack_pointer_rtx, stack_pointer_rtx,
14538 popc, -1, true);
14539 emit_jump_insn (gen_simple_return_indirect_internal (ecx));
14540 }
14541 else
14542 emit_jump_insn (gen_simple_return_pop_internal (popc));
14543 }
14544 else
14545 emit_jump_insn (gen_simple_return_internal ());
14546
14547 /* Restore the state back to the state from the prologue,
14548 so that it's correct for the next epilogue. */
14549 m->fs = frame_state_save;
14550 }
14551
14552 /* Reset from the function's potential modifications. */
14553
14554 static void
14555 ix86_output_function_epilogue (FILE *file ATTRIBUTE_UNUSED, HOST_WIDE_INT)
14556 {
14557 if (pic_offset_table_rtx
14558 && !ix86_use_pseudo_pic_reg ())
14559 SET_REGNO (pic_offset_table_rtx, REAL_PIC_OFFSET_TABLE_REGNUM);
14560 #if TARGET_MACHO
14561 /* Mach-O doesn't support labels at the end of objects, so if
14562 it looks like we might want one, insert a NOP. */
14563 {
14564 rtx_insn *insn = get_last_insn ();
14565 rtx_insn *deleted_debug_label = NULL;
14566 while (insn
14567 && NOTE_P (insn)
14568 && NOTE_KIND (insn) != NOTE_INSN_DELETED_LABEL)
14569 {
14570 	/* For NOTE_INSN_DELETED_DEBUG_LABEL notes only, don't insert
14571 	   a nop; instead set their CODE_LABEL_NUMBER to -1, otherwise
14572 	   there would be code generation differences
14573 	   between -g and -g0.  */
14574 if (NOTE_P (insn) && NOTE_KIND (insn) == NOTE_INSN_DELETED_DEBUG_LABEL)
14575 deleted_debug_label = insn;
14576 insn = PREV_INSN (insn);
14577 }
14578 if (insn
14579 && (LABEL_P (insn)
14580 || (NOTE_P (insn)
14581 && NOTE_KIND (insn) == NOTE_INSN_DELETED_LABEL)))
14582 fputs ("\tnop\n", file);
14583 else if (deleted_debug_label)
14584 for (insn = deleted_debug_label; insn; insn = NEXT_INSN (insn))
14585 if (NOTE_KIND (insn) == NOTE_INSN_DELETED_DEBUG_LABEL)
14586 CODE_LABEL_NUMBER (insn) = -1;
14587 }
14588 #endif
14589
14590 }
14591
14592 /* Return a scratch register to use in the split stack prologue. The
14593 split stack prologue is used for -fsplit-stack. It is the first
14594 instructions in the function, even before the regular prologue.
14595 The scratch register can be any caller-saved register which is not
14596 used for parameters or for the static chain. */
14597
14598 static unsigned int
14599 split_stack_prologue_scratch_regno (void)
14600 {
14601 if (TARGET_64BIT)
14602 return R11_REG;
14603 else
14604 {
14605 bool is_fastcall, is_thiscall;
14606 int regparm;
14607
14608 is_fastcall = (lookup_attribute ("fastcall",
14609 TYPE_ATTRIBUTES (TREE_TYPE (cfun->decl)))
14610 != NULL);
14611 is_thiscall = (lookup_attribute ("thiscall",
14612 TYPE_ATTRIBUTES (TREE_TYPE (cfun->decl)))
14613 != NULL);
14614 regparm = ix86_function_regparm (TREE_TYPE (cfun->decl), cfun->decl);
14615
14616 if (is_fastcall)
14617 {
14618 if (DECL_STATIC_CHAIN (cfun->decl))
14619 {
14620 sorry ("-fsplit-stack does not support fastcall with "
14621 "nested function");
14622 return INVALID_REGNUM;
14623 }
14624 return AX_REG;
14625 }
14626 else if (is_thiscall)
14627 {
14628 if (!DECL_STATIC_CHAIN (cfun->decl))
14629 return DX_REG;
14630 return AX_REG;
14631 }
14632 else if (regparm < 3)
14633 {
14634 if (!DECL_STATIC_CHAIN (cfun->decl))
14635 return CX_REG;
14636 else
14637 {
14638 if (regparm >= 2)
14639 {
14640 sorry ("-fsplit-stack does not support 2 register "
14641 "parameters for a nested function");
14642 return INVALID_REGNUM;
14643 }
14644 return DX_REG;
14645 }
14646 }
14647 else
14648 {
14649 /* FIXME: We could make this work by pushing a register
14650 around the addition and comparison. */
14651 sorry ("-fsplit-stack does not support 3 register parameters");
14652 return INVALID_REGNUM;
14653 }
14654 }
14655 }
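/* Editor's summary of the selection above (illustrative):
     64-bit                -> %r11
     32-bit, fastcall      -> %eax (nested functions rejected with a sorry)
     32-bit, thiscall      -> %edx, or %eax for nested functions
     32-bit, regparm < 3   -> %ecx, or %edx for nested functions
                              (nested + regparm >= 2 rejected)
     32-bit, regparm == 3  -> rejected, no free caller-saved register.  */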
14656
14657 /* A SYMBOL_REF for the function which allocates new stack space for
14658    -fsplit-stack.  */
14659
14660 static GTY(()) rtx split_stack_fn;
14661
14662 /* A SYMBOL_REF for the more stack function when using the large
14663 model. */
14664
14665 static GTY(()) rtx split_stack_fn_large;
14666
14667 /* Handle -fsplit-stack. These are the first instructions in the
14668 function, even before the regular prologue. */
14669
14670 void
14671 ix86_expand_split_stack_prologue (void)
14672 {
14673 struct ix86_frame frame;
14674 HOST_WIDE_INT allocate;
14675 unsigned HOST_WIDE_INT args_size;
14676 rtx_code_label *label;
14677 rtx limit, current, jump_insn, allocate_rtx, call_insn, call_fusage;
14678 rtx scratch_reg = NULL_RTX;
14679 rtx_code_label *varargs_label = NULL;
14680 rtx fn;
14681
14682 gcc_assert (flag_split_stack && reload_completed);
14683
14684 ix86_finalize_stack_realign_flags ();
14685 ix86_compute_frame_layout (&frame);
14686 allocate = frame.stack_pointer_offset - INCOMING_FRAME_SP_OFFSET;
14687
14688 /* This is the label we will branch to if we have enough stack
14689 space. We expect the basic block reordering pass to reverse this
14690 branch if optimizing, so that we branch in the unlikely case. */
14691 label = gen_label_rtx ();
14692
14693 /* We need to compare the stack pointer minus the frame size with
14694 the stack boundary in the TCB. The stack boundary always gives
14695 us SPLIT_STACK_AVAILABLE bytes, so if we need less than that we
14696 can compare directly. Otherwise we need to do an addition. */
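  /* Editor's sketch (illustrative; the actual TCB slot is target specific):
     the comparison built below typically assembles to something like

	 cmpq  %fs:<split-stack boundary offset>, %rsp
	 jae   .Lenough_stack

     in 64-bit mode, with the %gs segment and a different offset used in
     32-bit mode.  */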
14697
14698 limit = gen_rtx_UNSPEC (Pmode, gen_rtvec (1, const0_rtx),
14699 UNSPEC_STACK_CHECK);
14700 limit = gen_rtx_CONST (Pmode, limit);
14701 limit = gen_rtx_MEM (Pmode, limit);
14702 if (allocate < SPLIT_STACK_AVAILABLE)
14703 current = stack_pointer_rtx;
14704 else
14705 {
14706 unsigned int scratch_regno;
14707 rtx offset;
14708
14709 /* We need a scratch register to hold the stack pointer minus
14710 the required frame size. Since this is the very start of the
14711 function, the scratch register can be any caller-saved
14712 register which is not used for parameters. */
14713 offset = GEN_INT (- allocate);
14714 scratch_regno = split_stack_prologue_scratch_regno ();
14715 if (scratch_regno == INVALID_REGNUM)
14716 return;
14717 scratch_reg = gen_rtx_REG (Pmode, scratch_regno);
14718 if (!TARGET_64BIT || x86_64_immediate_operand (offset, Pmode))
14719 {
14720 /* We don't use ix86_gen_add3 in this case because it will
14721 want to split to lea, but when not optimizing the insn
14722 will not be split after this point. */
14723 emit_insn (gen_rtx_SET (scratch_reg,
14724 gen_rtx_PLUS (Pmode, stack_pointer_rtx,
14725 offset)));
14726 }
14727 else
14728 {
14729 emit_move_insn (scratch_reg, offset);
14730 emit_insn (ix86_gen_add3 (scratch_reg, scratch_reg,
14731 stack_pointer_rtx));
14732 }
14733 current = scratch_reg;
14734 }
14735
14736 ix86_expand_branch (GEU, current, limit, label);
14737 jump_insn = get_last_insn ();
14738 JUMP_LABEL (jump_insn) = label;
14739
14740 /* Mark the jump as very likely to be taken. */
14741 add_int_reg_note (jump_insn, REG_BR_PROB,
14742 REG_BR_PROB_BASE - REG_BR_PROB_BASE / 100);
14743
14744 if (split_stack_fn == NULL_RTX)
14745 {
14746 split_stack_fn = gen_rtx_SYMBOL_REF (Pmode, "__morestack");
14747 SYMBOL_REF_FLAGS (split_stack_fn) |= SYMBOL_FLAG_LOCAL;
14748 }
14749 fn = split_stack_fn;
14750
14751 /* Get more stack space. We pass in the desired stack space and the
14752 size of the arguments to copy to the new stack. In 32-bit mode
14753 we push the parameters; __morestack will return on a new stack
14754 anyhow. In 64-bit mode we pass the parameters in r10 and
14755 r11. */
14756 allocate_rtx = GEN_INT (allocate);
14757 args_size = crtl->args.size >= 0 ? crtl->args.size : 0;
14758 call_fusage = NULL_RTX;
14759 if (TARGET_64BIT)
14760 {
14761 rtx reg10, reg11;
14762
14763 reg10 = gen_rtx_REG (Pmode, R10_REG);
14764 reg11 = gen_rtx_REG (Pmode, R11_REG);
14765
14766 /* If this function uses a static chain, it will be in %r10.
14767 Preserve it across the call to __morestack. */
14768 if (DECL_STATIC_CHAIN (cfun->decl))
14769 {
14770 rtx rax;
14771
14772 rax = gen_rtx_REG (word_mode, AX_REG);
14773 emit_move_insn (rax, gen_rtx_REG (word_mode, R10_REG));
14774 use_reg (&call_fusage, rax);
14775 }
14776
14777 if ((ix86_cmodel == CM_LARGE || ix86_cmodel == CM_LARGE_PIC)
14778 && !TARGET_PECOFF)
14779 {
14780 HOST_WIDE_INT argval;
14781
14782 gcc_assert (Pmode == DImode);
14783 /* When using the large model we need to load the address
14784 into a register, and we've run out of registers. So we
14785 switch to a different calling convention, and we call a
14786 	     different function: __morestack_large_model.  We pass the
14787 argument size in the upper 32 bits of r10 and pass the
14788 frame size in the lower 32 bits. */
14789 gcc_assert ((allocate & HOST_WIDE_INT_C (0xffffffff)) == allocate);
14790 gcc_assert ((args_size & 0xffffffff) == args_size);
14791
14792 if (split_stack_fn_large == NULL_RTX)
14793 {
14794 split_stack_fn_large =
14795 gen_rtx_SYMBOL_REF (Pmode, "__morestack_large_model");
14796 SYMBOL_REF_FLAGS (split_stack_fn_large) |= SYMBOL_FLAG_LOCAL;
14797 }
14798 if (ix86_cmodel == CM_LARGE_PIC)
14799 {
14800 rtx_code_label *label;
14801 rtx x;
14802
14803 label = gen_label_rtx ();
14804 emit_label (label);
14805 LABEL_PRESERVE_P (label) = 1;
14806 emit_insn (gen_set_rip_rex64 (reg10, label));
14807 emit_insn (gen_set_got_offset_rex64 (reg11, label));
14808 emit_insn (ix86_gen_add3 (reg10, reg10, reg11));
14809 x = gen_rtx_UNSPEC (Pmode, gen_rtvec (1, split_stack_fn_large),
14810 UNSPEC_GOT);
14811 x = gen_rtx_CONST (Pmode, x);
14812 emit_move_insn (reg11, x);
14813 x = gen_rtx_PLUS (Pmode, reg10, reg11);
14814 x = gen_const_mem (Pmode, x);
14815 emit_move_insn (reg11, x);
14816 }
14817 else
14818 emit_move_insn (reg11, split_stack_fn_large);
14819
14820 fn = reg11;
14821
14822 argval = ((args_size << 16) << 16) + allocate;
14823 emit_move_insn (reg10, GEN_INT (argval));
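	  /* Editor's example (illustrative): with args_size == 0x20 and
	     allocate == 0x1000 this loads r10 with 0x0000002000001000,
	     i.e. the argument size in the upper 32 bits and the frame
	     size in the lower 32 bits.  */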
14824 }
14825 else
14826 {
14827 emit_move_insn (reg10, allocate_rtx);
14828 emit_move_insn (reg11, GEN_INT (args_size));
14829 use_reg (&call_fusage, reg11);
14830 }
14831
14832 use_reg (&call_fusage, reg10);
14833 }
14834 else
14835 {
14836 emit_insn (gen_push (GEN_INT (args_size)));
14837 emit_insn (gen_push (allocate_rtx));
14838 }
14839 call_insn = ix86_expand_call (NULL_RTX, gen_rtx_MEM (QImode, fn),
14840 GEN_INT (UNITS_PER_WORD), constm1_rtx,
14841 NULL_RTX, false);
14842 add_function_usage_to (call_insn, call_fusage);
14843
14844 /* In order to make call/return prediction work right, we now need
14845 to execute a return instruction. See
14846 libgcc/config/i386/morestack.S for the details on how this works.
14847
14848 For flow purposes gcc must not see this as a return
14849 instruction--we need control flow to continue at the subsequent
14850 label. Therefore, we use an unspec. */
14851 gcc_assert (crtl->args.pops_args < 65536);
14852 emit_insn (gen_split_stack_return (GEN_INT (crtl->args.pops_args)));
14853
14854 /* If we are in 64-bit mode and this function uses a static chain,
14855      we saved %r10 in %rax before calling __morestack.  */
14856 if (TARGET_64BIT && DECL_STATIC_CHAIN (cfun->decl))
14857 emit_move_insn (gen_rtx_REG (word_mode, R10_REG),
14858 gen_rtx_REG (word_mode, AX_REG));
14859
14860 /* If this function calls va_start, we need to store a pointer to
14861 the arguments on the old stack, because they may not have been
14862 all copied to the new stack. At this point the old stack can be
14863 found at the frame pointer value used by __morestack, because
14864 __morestack has set that up before calling back to us. Here we
14865 store that pointer in a scratch register, and in
14866 ix86_expand_prologue we store the scratch register in a stack
14867 slot. */
14868 if (cfun->machine->split_stack_varargs_pointer != NULL_RTX)
14869 {
14870 unsigned int scratch_regno;
14871 rtx frame_reg;
14872 int words;
14873
14874 scratch_regno = split_stack_prologue_scratch_regno ();
14875 scratch_reg = gen_rtx_REG (Pmode, scratch_regno);
14876 frame_reg = gen_rtx_REG (Pmode, BP_REG);
14877
14878 /* 64-bit:
14879 fp -> old fp value
14880 return address within this function
14881 return address of caller of this function
14882 stack arguments
14883 So we add three words to get to the stack arguments.
14884
14885 32-bit:
14886 fp -> old fp value
14887 return address within this function
14888 first argument to __morestack
14889 second argument to __morestack
14890 return address of caller of this function
14891 stack arguments
14892 So we add five words to get to the stack arguments.
14893 */
14894 words = TARGET_64BIT ? 3 : 5;
14895 emit_insn (gen_rtx_SET (scratch_reg,
14896 gen_rtx_PLUS (Pmode, frame_reg,
14897 GEN_INT (words * UNITS_PER_WORD))));
14898
14899 varargs_label = gen_label_rtx ();
14900 emit_jump_insn (gen_jump (varargs_label));
14901 JUMP_LABEL (get_last_insn ()) = varargs_label;
14902
14903 emit_barrier ();
14904 }
14905
14906 emit_label (label);
14907 LABEL_NUSES (label) = 1;
14908
14909 /* If this function calls va_start, we now have to set the scratch
14910 register for the case where we do not call __morestack. In this
14911 case we need to set it based on the stack pointer. */
14912 if (cfun->machine->split_stack_varargs_pointer != NULL_RTX)
14913 {
14914 emit_insn (gen_rtx_SET (scratch_reg,
14915 gen_rtx_PLUS (Pmode, stack_pointer_rtx,
14916 GEN_INT (UNITS_PER_WORD))));
14917
14918 emit_label (varargs_label);
14919 LABEL_NUSES (varargs_label) = 1;
14920 }
14921 }
14922
14923 /* We may have to tell the dataflow pass that the split stack prologue
14924 is initializing a scratch register. */
14925
14926 static void
14927 ix86_live_on_entry (bitmap regs)
14928 {
14929 if (cfun->machine->split_stack_varargs_pointer != NULL_RTX)
14930 {
14931 gcc_assert (flag_split_stack);
14932 bitmap_set_bit (regs, split_stack_prologue_scratch_regno ());
14933 }
14934 }
14935 \f
14936 /* Extract the parts of an RTL expression that is a valid memory address
14937 for an instruction. Return 0 if the structure of the address is
14938 grossly off. Return -1 if the address contains ASHIFT, so it is not
14939 strictly valid, but still used for computing length of lea instruction. */
14940
14941 int
14942 ix86_decompose_address (rtx addr, struct ix86_address *out)
14943 {
14944 rtx base = NULL_RTX, index = NULL_RTX, disp = NULL_RTX;
14945 rtx base_reg, index_reg;
14946 HOST_WIDE_INT scale = 1;
14947 rtx scale_rtx = NULL_RTX;
14948 rtx tmp;
14949 int retval = 1;
14950 addr_space_t seg = ADDR_SPACE_GENERIC;
14951
14952 /* Allow zero-extended SImode addresses,
14953 they will be emitted with addr32 prefix. */
14954 if (TARGET_64BIT && GET_MODE (addr) == DImode)
14955 {
14956 if (GET_CODE (addr) == ZERO_EXTEND
14957 && GET_MODE (XEXP (addr, 0)) == SImode)
14958 {
14959 addr = XEXP (addr, 0);
14960 if (CONST_INT_P (addr))
14961 return 0;
14962 }
14963 else if (GET_CODE (addr) == AND
14964 && const_32bit_mask (XEXP (addr, 1), DImode))
14965 {
14966 addr = lowpart_subreg (SImode, XEXP (addr, 0), DImode);
14967 if (addr == NULL_RTX)
14968 return 0;
14969
14970 if (CONST_INT_P (addr))
14971 return 0;
14972 }
14973 }
14974
14975 /* Allow SImode subregs of DImode addresses,
14976 they will be emitted with addr32 prefix. */
14977 if (TARGET_64BIT && GET_MODE (addr) == SImode)
14978 {
14979 if (SUBREG_P (addr)
14980 && GET_MODE (SUBREG_REG (addr)) == DImode)
14981 {
14982 addr = SUBREG_REG (addr);
14983 if (CONST_INT_P (addr))
14984 return 0;
14985 }
14986 }
14987
14988 if (REG_P (addr))
14989 base = addr;
14990 else if (SUBREG_P (addr))
14991 {
14992 if (REG_P (SUBREG_REG (addr)))
14993 base = addr;
14994 else
14995 return 0;
14996 }
14997 else if (GET_CODE (addr) == PLUS)
14998 {
14999 rtx addends[4], op;
15000 int n = 0, i;
15001
15002 op = addr;
15003 do
15004 {
15005 if (n >= 4)
15006 return 0;
15007 addends[n++] = XEXP (op, 1);
15008 op = XEXP (op, 0);
15009 }
15010 while (GET_CODE (op) == PLUS);
15011 if (n >= 4)
15012 return 0;
15013 addends[n] = op;
15014
15015 for (i = n; i >= 0; --i)
15016 {
15017 op = addends[i];
15018 switch (GET_CODE (op))
15019 {
15020 case MULT:
15021 if (index)
15022 return 0;
15023 index = XEXP (op, 0);
15024 scale_rtx = XEXP (op, 1);
15025 break;
15026
15027 case ASHIFT:
15028 if (index)
15029 return 0;
15030 index = XEXP (op, 0);
15031 tmp = XEXP (op, 1);
15032 if (!CONST_INT_P (tmp))
15033 return 0;
15034 scale = INTVAL (tmp);
15035 if ((unsigned HOST_WIDE_INT) scale > 3)
15036 return 0;
15037 scale = 1 << scale;
15038 break;
15039
15040 case ZERO_EXTEND:
15041 op = XEXP (op, 0);
15042 if (GET_CODE (op) != UNSPEC)
15043 return 0;
15044 /* FALLTHRU */
15045
15046 case UNSPEC:
15047 if (XINT (op, 1) == UNSPEC_TP
15048 && TARGET_TLS_DIRECT_SEG_REFS
15049 && seg == ADDR_SPACE_GENERIC)
15050 seg = DEFAULT_TLS_SEG_REG;
15051 else
15052 return 0;
15053 break;
15054
15055 case SUBREG:
15056 if (!REG_P (SUBREG_REG (op)))
15057 return 0;
15058 /* FALLTHRU */
15059
15060 case REG:
15061 if (!base)
15062 base = op;
15063 else if (!index)
15064 index = op;
15065 else
15066 return 0;
15067 break;
15068
15069 case CONST:
15070 case CONST_INT:
15071 case SYMBOL_REF:
15072 case LABEL_REF:
15073 if (disp)
15074 return 0;
15075 disp = op;
15076 break;
15077
15078 default:
15079 return 0;
15080 }
15081 }
15082 }
15083 else if (GET_CODE (addr) == MULT)
15084 {
15085 index = XEXP (addr, 0); /* index*scale */
15086 scale_rtx = XEXP (addr, 1);
15087 }
15088 else if (GET_CODE (addr) == ASHIFT)
15089 {
15090 /* We're called for lea too, which implements ashift on occasion. */
15091 index = XEXP (addr, 0);
15092 tmp = XEXP (addr, 1);
15093 if (!CONST_INT_P (tmp))
15094 return 0;
15095 scale = INTVAL (tmp);
15096 if ((unsigned HOST_WIDE_INT) scale > 3)
15097 return 0;
15098 scale = 1 << scale;
15099 retval = -1;
15100 }
15101 else
15102 disp = addr; /* displacement */
15103
15104 if (index)
15105 {
15106 if (REG_P (index))
15107 ;
15108 else if (SUBREG_P (index)
15109 && REG_P (SUBREG_REG (index)))
15110 ;
15111 else
15112 return 0;
15113 }
15114
15115 /* Extract the integral value of scale. */
15116 if (scale_rtx)
15117 {
15118 if (!CONST_INT_P (scale_rtx))
15119 return 0;
15120 scale = INTVAL (scale_rtx);
15121 }
15122
15123 base_reg = base && SUBREG_P (base) ? SUBREG_REG (base) : base;
15124 index_reg = index && SUBREG_P (index) ? SUBREG_REG (index) : index;
15125
15126 /* Avoid useless 0 displacement. */
15127 if (disp == const0_rtx && (base || index))
15128 disp = NULL_RTX;
15129
15130   /* Allow the arg pointer and stack pointer as index if there is no scaling.  */
15131 if (base_reg && index_reg && scale == 1
15132 && (index_reg == arg_pointer_rtx
15133 || index_reg == frame_pointer_rtx
15134 || (REG_P (index_reg) && REGNO (index_reg) == STACK_POINTER_REGNUM)))
15135 {
15136 std::swap (base, index);
15137 std::swap (base_reg, index_reg);
15138 }
15139
15140 /* Special case: %ebp cannot be encoded as a base without a displacement.
15141 Similarly %r13. */
15142 if (!disp
15143 && base_reg
15144 && (base_reg == hard_frame_pointer_rtx
15145 || base_reg == frame_pointer_rtx
15146 || base_reg == arg_pointer_rtx
15147 || (REG_P (base_reg)
15148 && (REGNO (base_reg) == HARD_FRAME_POINTER_REGNUM
15149 || REGNO (base_reg) == R13_REG))))
15150 disp = const0_rtx;
15151
15152 /* Special case: on K6, [%esi] makes the instruction vector decoded.
15153 Avoid this by transforming to [%esi+0].
15154 Reload calls address legitimization without cfun defined, so we need
15155 to test cfun for being non-NULL. */
15156 if (TARGET_K6 && cfun && optimize_function_for_speed_p (cfun)
15157 && base_reg && !index_reg && !disp
15158 && REG_P (base_reg) && REGNO (base_reg) == SI_REG)
15159 disp = const0_rtx;
15160
15161 /* Special case: encode reg+reg instead of reg*2. */
15162 if (!base && index && scale == 2)
15163 base = index, base_reg = index_reg, scale = 1;
15164
15165 /* Special case: scaling cannot be encoded without base or displacement. */
15166 if (!base && !disp && index && scale != 1)
15167 disp = const0_rtx;
15168
15169 out->base = base;
15170 out->index = index;
15171 out->disp = disp;
15172 out->scale = scale;
15173 out->seg = seg;
15174
15175 return retval;
15176 }
15177 \f
15178 /* Return cost of the memory address x.
15179 For i386, it is better to use a complex address than let gcc copy
15180 the address into a reg and make a new pseudo. But not if the address
15181    requires two regs - that would mean more pseudos with longer
15182 lifetimes. */
15183 static int
15184 ix86_address_cost (rtx x, machine_mode, addr_space_t, bool)
15185 {
15186 struct ix86_address parts;
15187 int cost = 1;
15188 int ok = ix86_decompose_address (x, &parts);
15189
15190 gcc_assert (ok);
15191
15192 if (parts.base && SUBREG_P (parts.base))
15193 parts.base = SUBREG_REG (parts.base);
15194 if (parts.index && SUBREG_P (parts.index))
15195 parts.index = SUBREG_REG (parts.index);
15196
15197 /* Attempt to minimize number of registers in the address by increasing
15198 address cost for each used register. We don't increase address cost
15199      for "pic_offset_table_rtx".  When a memory operand using "pic_offset_table_rtx"
15200      is not invariant itself, it most likely means that the base or index is not
15201      invariant.  Therefore only "pic_offset_table_rtx" could be hoisted out,
15202 which is not profitable for x86. */
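  /* Editor's example (illustrative): an address formed from two pseudo
     registers, e.g. base + index*scale before register allocation, typically
     gets cost 3 here (1 + 1 per pseudo), while the same address built from
     hard registers keeps the base cost of 1.  */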
15203 if (parts.base
15204 && (!REG_P (parts.base) || REGNO (parts.base) >= FIRST_PSEUDO_REGISTER)
15205 && (current_pass->type == GIMPLE_PASS
15206 || !pic_offset_table_rtx
15207 || !REG_P (parts.base)
15208 || REGNO (pic_offset_table_rtx) != REGNO (parts.base)))
15209 cost++;
15210
15211 if (parts.index
15212 && (!REG_P (parts.index) || REGNO (parts.index) >= FIRST_PSEUDO_REGISTER)
15213 && (current_pass->type == GIMPLE_PASS
15214 || !pic_offset_table_rtx
15215 || !REG_P (parts.index)
15216 || REGNO (pic_offset_table_rtx) != REGNO (parts.index)))
15217 cost++;
15218
15219   /* The AMD-K6 doesn't like addresses with ModR/M set to 00_xxx_100b,
15220      since its predecode logic can't detect the length of such instructions
15221      and decoding degenerates to vector decoding.  Increase the cost of such
15222      addresses here.  The penalty is minimally 2 cycles.  It may be worthwhile
15223      to split such addresses or even to refuse them entirely.
15224
15225 Following addressing modes are affected:
15226 [base+scale*index]
15227 [scale*index+disp]
15228 [base+index]
15229
15230      The first and last case may be avoidable by explicitly coding the zero into
15231      the memory address, but I don't have an AMD-K6 machine handy to check this
15232      theory.  */
15233
15234 if (TARGET_K6
15235 && ((!parts.disp && parts.base && parts.index && parts.scale != 1)
15236 || (parts.disp && !parts.base && parts.index && parts.scale != 1)
15237 || (!parts.disp && parts.base && parts.index && parts.scale == 1)))
15238 cost += 10;
15239
15240 return cost;
15241 }
15242 \f
15243 /* Allow {LABEL | SYMBOL}_REF - SYMBOL_REF-FOR-PICBASE for Mach-O as
15244    this is used to form addresses to local data when -fPIC is in
15245 use. */
15246
15247 static bool
15248 darwin_local_data_pic (rtx disp)
15249 {
15250 return (GET_CODE (disp) == UNSPEC
15251 && XINT (disp, 1) == UNSPEC_MACHOPIC_OFFSET);
15252 }
15253
15254 /* True if operand X should be loaded from GOT. */
15255
15256 bool
15257 ix86_force_load_from_GOT_p (rtx x)
15258 {
15259 return ((TARGET_64BIT || HAVE_AS_IX86_GOT32X)
15260 && !TARGET_PECOFF && !TARGET_MACHO
15261 && !flag_plt && !flag_pic
15262 && ix86_cmodel != CM_LARGE
15263 && GET_CODE (x) == SYMBOL_REF
15264 && SYMBOL_REF_FUNCTION_P (x)
15265 && !SYMBOL_REF_LOCAL_P (x));
15266 }
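/* Editor's example (illustrative): with -fno-pic -fno-plt, a call to an
   external function foo can be emitted on x86-64 as

       call  *foo@GOTPCREL(%rip)

   loading the address from the GOT slot instead of going through the PLT.  */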
15267
15268 /* Determine if a given RTX is a valid constant. We already know this
15269 satisfies CONSTANT_P. */
15270
15271 static bool
15272 ix86_legitimate_constant_p (machine_mode mode, rtx x)
15273 {
15274 /* Pointer bounds constants are not valid. */
15275 if (POINTER_BOUNDS_MODE_P (GET_MODE (x)))
15276 return false;
15277
15278 switch (GET_CODE (x))
15279 {
15280 case CONST:
15281 x = XEXP (x, 0);
15282
15283 if (GET_CODE (x) == PLUS)
15284 {
15285 if (!CONST_INT_P (XEXP (x, 1)))
15286 return false;
15287 x = XEXP (x, 0);
15288 }
15289
15290 if (TARGET_MACHO && darwin_local_data_pic (x))
15291 return true;
15292
15293 /* Only some unspecs are valid as "constants". */
15294 if (GET_CODE (x) == UNSPEC)
15295 switch (XINT (x, 1))
15296 {
15297 case UNSPEC_GOT:
15298 case UNSPEC_GOTOFF:
15299 case UNSPEC_PLTOFF:
15300 return TARGET_64BIT;
15301 case UNSPEC_TPOFF:
15302 case UNSPEC_NTPOFF:
15303 x = XVECEXP (x, 0, 0);
15304 return (GET_CODE (x) == SYMBOL_REF
15305 && SYMBOL_REF_TLS_MODEL (x) == TLS_MODEL_LOCAL_EXEC);
15306 case UNSPEC_DTPOFF:
15307 x = XVECEXP (x, 0, 0);
15308 return (GET_CODE (x) == SYMBOL_REF
15309 && SYMBOL_REF_TLS_MODEL (x) == TLS_MODEL_LOCAL_DYNAMIC);
15310 default:
15311 return false;
15312 }
15313
15314 /* We must have drilled down to a symbol. */
15315 if (GET_CODE (x) == LABEL_REF)
15316 return true;
15317 if (GET_CODE (x) != SYMBOL_REF)
15318 return false;
15319 /* FALLTHRU */
15320
15321 case SYMBOL_REF:
15322 /* TLS symbols are never valid. */
15323 if (SYMBOL_REF_TLS_MODEL (x))
15324 return false;
15325
15326 /* DLLIMPORT symbols are never valid. */
15327 if (TARGET_DLLIMPORT_DECL_ATTRIBUTES
15328 && SYMBOL_REF_DLLIMPORT_P (x))
15329 return false;
15330
15331 #if TARGET_MACHO
15332 /* mdynamic-no-pic */
15333 if (MACHO_DYNAMIC_NO_PIC_P)
15334 return machopic_symbol_defined_p (x);
15335 #endif
15336
15337 /* External function address should be loaded
15338 via the GOT slot to avoid PLT. */
15339 if (ix86_force_load_from_GOT_p (x))
15340 return false;
15341
15342 break;
15343
15344 CASE_CONST_SCALAR_INT:
15345 switch (mode)
15346 {
15347 case TImode:
15348 if (TARGET_64BIT)
15349 return true;
15350 /* FALLTHRU */
15351 case OImode:
15352 case XImode:
15353 if (!standard_sse_constant_p (x, mode))
15354 return false;
15355 default:
15356 break;
15357 }
15358 break;
15359
15360 case CONST_VECTOR:
15361 if (!standard_sse_constant_p (x, mode))
15362 return false;
15363
15364 default:
15365 break;
15366 }
15367
15368 /* Otherwise we handle everything else in the move patterns. */
15369 return true;
15370 }
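/* Illustrative examples: the address of an ordinary global ("&foo") is
   accepted here, while the address of a TLS variable ("__thread int t;
   ... &t") is rejected above (TLS symbols are never valid) and must be
   computed by a TLS access sequence instead.  */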
15371
15372 /* Determine if it's legal to put X into the constant pool. This
15373 is not possible for the address of thread-local symbols, which
15374 is checked above. */
15375
15376 static bool
15377 ix86_cannot_force_const_mem (machine_mode mode, rtx x)
15378 {
15379 /* We can put any immediate constant in memory. */
15380 switch (GET_CODE (x))
15381 {
15382 CASE_CONST_ANY:
15383 return false;
15384
15385 default:
15386 break;
15387 }
15388
15389 return !ix86_legitimate_constant_p (mode, x);
15390 }
15391
15392 /* True if the symbol is marked as dllimport or as a stub variable,
15393 otherwise false. */
15394
15395 static bool
15396 is_imported_p (rtx x)
15397 {
15398 if (!TARGET_DLLIMPORT_DECL_ATTRIBUTES
15399 || GET_CODE (x) != SYMBOL_REF)
15400 return false;
15401
15402 return SYMBOL_REF_DLLIMPORT_P (x) || SYMBOL_REF_STUBVAR_P (x);
15403 }
15404
15405
15406 /* Nonzero if the constant value X is a legitimate general operand
15407 when generating PIC code. It is given that flag_pic is on and
15408 that X satisfies CONSTANT_P. */
15409
15410 bool
15411 legitimate_pic_operand_p (rtx x)
15412 {
15413 rtx inner;
15414
15415 switch (GET_CODE (x))
15416 {
15417 case CONST:
15418 inner = XEXP (x, 0);
15419 if (GET_CODE (inner) == PLUS
15420 && CONST_INT_P (XEXP (inner, 1)))
15421 inner = XEXP (inner, 0);
15422
15423 /* Only some unspecs are valid as "constants". */
15424 if (GET_CODE (inner) == UNSPEC)
15425 switch (XINT (inner, 1))
15426 {
15427 case UNSPEC_GOT:
15428 case UNSPEC_GOTOFF:
15429 case UNSPEC_PLTOFF:
15430 return TARGET_64BIT;
15431 case UNSPEC_TPOFF:
15432 x = XVECEXP (inner, 0, 0);
15433 return (GET_CODE (x) == SYMBOL_REF
15434 && SYMBOL_REF_TLS_MODEL (x) == TLS_MODEL_LOCAL_EXEC);
15435 case UNSPEC_MACHOPIC_OFFSET:
15436 return legitimate_pic_address_disp_p (x);
15437 default:
15438 return false;
15439 }
15440 /* FALLTHRU */
15441
15442 case SYMBOL_REF:
15443 case LABEL_REF:
15444 return legitimate_pic_address_disp_p (x);
15445
15446 default:
15447 return true;
15448 }
15449 }
15450
15451 /* Determine if a given CONST RTX is a valid memory displacement
15452 in PIC mode. */
15453
15454 bool
15455 legitimate_pic_address_disp_p (rtx disp)
15456 {
15457 bool saw_plus;
15458
15459 /* In 64bit mode we can allow direct addresses of symbols and labels
15460 when they are not dynamic symbols. */
15461 if (TARGET_64BIT)
15462 {
15463 rtx op0 = disp, op1;
15464
15465 switch (GET_CODE (disp))
15466 {
15467 case LABEL_REF:
15468 return true;
15469
15470 case CONST:
15471 if (GET_CODE (XEXP (disp, 0)) != PLUS)
15472 break;
15473 op0 = XEXP (XEXP (disp, 0), 0);
15474 op1 = XEXP (XEXP (disp, 0), 1);
15475 if (!CONST_INT_P (op1)
15476 || INTVAL (op1) >= 16*1024*1024
15477 || INTVAL (op1) < -16*1024*1024)
15478 break;
15479 if (GET_CODE (op0) == LABEL_REF)
15480 return true;
15481 if (GET_CODE (op0) == CONST
15482 && GET_CODE (XEXP (op0, 0)) == UNSPEC
15483 && XINT (XEXP (op0, 0), 1) == UNSPEC_PCREL)
15484 return true;
15485 if (GET_CODE (op0) == UNSPEC
15486 && XINT (op0, 1) == UNSPEC_PCREL)
15487 return true;
15488 if (GET_CODE (op0) != SYMBOL_REF)
15489 break;
15490 /* FALLTHRU */
15491
15492 case SYMBOL_REF:
15493 /* TLS references should always be enclosed in UNSPEC.
15494 A dllimported symbol always needs to be resolved. */
15495 if (SYMBOL_REF_TLS_MODEL (op0)
15496 || (TARGET_DLLIMPORT_DECL_ATTRIBUTES && SYMBOL_REF_DLLIMPORT_P (op0)))
15497 return false;
15498
15499 if (TARGET_PECOFF)
15500 {
15501 if (is_imported_p (op0))
15502 return true;
15503
15504 if (SYMBOL_REF_FAR_ADDR_P (op0)
15505 || !SYMBOL_REF_LOCAL_P (op0))
15506 break;
15507
15508 /* Function symbols need to be resolved only for
15509 the large model.
15510 For the small model we don't need to resolve anything
15511 here. */
15512 if ((ix86_cmodel != CM_LARGE_PIC
15513 && SYMBOL_REF_FUNCTION_P (op0))
15514 || ix86_cmodel == CM_SMALL_PIC)
15515 return true;
15516 /* Non-external symbols don't need to be resolved for
15517 the large and medium models. */
15518 if ((ix86_cmodel == CM_LARGE_PIC
15519 || ix86_cmodel == CM_MEDIUM_PIC)
15520 && !SYMBOL_REF_EXTERNAL_P (op0))
15521 return true;
15522 }
15523 else if (!SYMBOL_REF_FAR_ADDR_P (op0)
15524 && (SYMBOL_REF_LOCAL_P (op0)
15525 || (HAVE_LD_PIE_COPYRELOC
15526 && flag_pie
15527 && !SYMBOL_REF_WEAK (op0)
15528 && !SYMBOL_REF_FUNCTION_P (op0)))
15529 && ix86_cmodel != CM_LARGE_PIC)
15530 return true;
15531 break;
15532
15533 default:
15534 break;
15535 }
15536 }
15537 if (GET_CODE (disp) != CONST)
15538 return false;
15539 disp = XEXP (disp, 0);
15540
15541 if (TARGET_64BIT)
15542 {
15543 /* It is unsafe to allow PLUS expressions here; that would exceed the
15544 allowed distance of GOT references. We should not need these anyway. */
15545 if (GET_CODE (disp) != UNSPEC
15546 || (XINT (disp, 1) != UNSPEC_GOTPCREL
15547 && XINT (disp, 1) != UNSPEC_GOTOFF
15548 && XINT (disp, 1) != UNSPEC_PCREL
15549 && XINT (disp, 1) != UNSPEC_PLTOFF))
15550 return false;
15551
15552 if (GET_CODE (XVECEXP (disp, 0, 0)) != SYMBOL_REF
15553 && GET_CODE (XVECEXP (disp, 0, 0)) != LABEL_REF)
15554 return false;
15555 return true;
15556 }
15557
15558 saw_plus = false;
15559 if (GET_CODE (disp) == PLUS)
15560 {
15561 if (!CONST_INT_P (XEXP (disp, 1)))
15562 return false;
15563 disp = XEXP (disp, 0);
15564 saw_plus = true;
15565 }
15566
15567 if (TARGET_MACHO && darwin_local_data_pic (disp))
15568 return true;
15569
15570 if (GET_CODE (disp) != UNSPEC)
15571 return false;
15572
15573 switch (XINT (disp, 1))
15574 {
15575 case UNSPEC_GOT:
15576 if (saw_plus)
15577 return false;
15578 /* We need to check for both symbols and labels because VxWorks loads
15579 text labels with @GOT rather than @GOTOFF. See gotoff_operand for
15580 details. */
15581 return (GET_CODE (XVECEXP (disp, 0, 0)) == SYMBOL_REF
15582 || GET_CODE (XVECEXP (disp, 0, 0)) == LABEL_REF);
15583 case UNSPEC_GOTOFF:
15584 /* Refuse GOTOFF in 64-bit mode since it is always 64 bits wide when used.
15585 While the ABI also specifies a 32-bit relocation, we don't produce it in
15586 the small PIC model at all. */
15587 if ((GET_CODE (XVECEXP (disp, 0, 0)) == SYMBOL_REF
15588 || GET_CODE (XVECEXP (disp, 0, 0)) == LABEL_REF)
15589 && !TARGET_64BIT)
15590 return !TARGET_PECOFF && gotoff_operand (XVECEXP (disp, 0, 0), Pmode);
15591 return false;
15592 case UNSPEC_GOTTPOFF:
15593 case UNSPEC_GOTNTPOFF:
15594 case UNSPEC_INDNTPOFF:
15595 if (saw_plus)
15596 return false;
15597 disp = XVECEXP (disp, 0, 0);
15598 return (GET_CODE (disp) == SYMBOL_REF
15599 && SYMBOL_REF_TLS_MODEL (disp) == TLS_MODEL_INITIAL_EXEC);
15600 case UNSPEC_NTPOFF:
15601 disp = XVECEXP (disp, 0, 0);
15602 return (GET_CODE (disp) == SYMBOL_REF
15603 && SYMBOL_REF_TLS_MODEL (disp) == TLS_MODEL_LOCAL_EXEC);
15604 case UNSPEC_DTPOFF:
15605 disp = XVECEXP (disp, 0, 0);
15606 return (GET_CODE (disp) == SYMBOL_REF
15607 && SYMBOL_REF_TLS_MODEL (disp) == TLS_MODEL_LOCAL_DYNAMIC);
15608 }
15609
15610 return false;
15611 }
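/* Illustrative examples for the 32-bit PIC case: a @GOTOFF displacement
   addresses local data relative to the PIC register, roughly

       movl    foo@GOTOFF(%ebx), %eax

   while a @GOT displacement loads the symbol's address from its GOT slot,
   roughly

       movl    foo@GOT(%ebx), %eax

   The TLS displacements (@gottpoff, @ntpoff, @dtpoff, ...) are only
   accepted when they wrap a symbol with the matching TLS model.  */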
15612
15613 /* Determine if op is a suitable RTX for an address register.
15614 Return naked register if a register or a register subreg is
15615 found, otherwise return NULL_RTX. */
15616
15617 static rtx
15618 ix86_validate_address_register (rtx op)
15619 {
15620 machine_mode mode = GET_MODE (op);
15621
15622 /* Only SImode or DImode registers can form the address. */
15623 if (mode != SImode && mode != DImode)
15624 return NULL_RTX;
15625
15626 if (REG_P (op))
15627 return op;
15628 else if (SUBREG_P (op))
15629 {
15630 rtx reg = SUBREG_REG (op);
15631
15632 if (!REG_P (reg))
15633 return NULL_RTX;
15634
15635 mode = GET_MODE (reg);
15636
15637 /* Don't allow SUBREGs that span more than a word. They can
15638 lead to spill failures when the register is one word out
15639 of a two-word structure. */
15640 if (GET_MODE_SIZE (mode) > UNITS_PER_WORD)
15641 return NULL_RTX;
15642
15643 /* Allow only SUBREGs of non-eliminable hard registers. */
15644 if (register_no_elim_operand (reg, mode))
15645 return reg;
15646 }
15647
15648 /* Op is not a register. */
15649 return NULL_RTX;
15650 }
15651
15652 /* Recognizes RTL expressions that are valid memory addresses for an
15653 instruction. The MODE argument is the machine mode for the MEM
15654 expression that wants to use this address.
15655
15656 It only recognizes addresses in canonical form. LEGITIMIZE_ADDRESS should
15657 convert common non-canonical forms to canonical form so that they will
15658 be recognized. */
15659
15660 static bool
15661 ix86_legitimate_address_p (machine_mode, rtx addr, bool strict)
15662 {
15663 struct ix86_address parts;
15664 rtx base, index, disp;
15665 HOST_WIDE_INT scale;
15666 addr_space_t seg;
15667
15668 if (ix86_decompose_address (addr, &parts) <= 0)
15669 /* Decomposition failed. */
15670 return false;
15671
15672 base = parts.base;
15673 index = parts.index;
15674 disp = parts.disp;
15675 scale = parts.scale;
15676 seg = parts.seg;
15677
15678 /* Validate base register. */
15679 if (base)
15680 {
15681 rtx reg = ix86_validate_address_register (base);
15682
15683 if (reg == NULL_RTX)
15684 return false;
15685
15686 if ((strict && ! REG_OK_FOR_BASE_STRICT_P (reg))
15687 || (! strict && ! REG_OK_FOR_BASE_NONSTRICT_P (reg)))
15688 /* Base is not valid. */
15689 return false;
15690 }
15691
15692 /* Validate index register. */
15693 if (index)
15694 {
15695 rtx reg = ix86_validate_address_register (index);
15696
15697 if (reg == NULL_RTX)
15698 return false;
15699
15700 if ((strict && ! REG_OK_FOR_INDEX_STRICT_P (reg))
15701 || (! strict && ! REG_OK_FOR_INDEX_NONSTRICT_P (reg)))
15702 /* Index is not valid. */
15703 return false;
15704 }
15705
15706 /* Index and base should have the same mode. */
15707 if (base && index
15708 && GET_MODE (base) != GET_MODE (index))
15709 return false;
15710
15711 /* Address override works only on the (%reg) part of %fs:(%reg). */
15712 if (seg != ADDR_SPACE_GENERIC
15713 && ((base && GET_MODE (base) != word_mode)
15714 || (index && GET_MODE (index) != word_mode)))
15715 return false;
15716
15717 /* Validate scale factor. */
15718 if (scale != 1)
15719 {
15720 if (!index)
15721 /* Scale without index. */
15722 return false;
15723
15724 if (scale != 2 && scale != 4 && scale != 8)
15725 /* Scale is not a valid multiplier. */
15726 return false;
15727 }
15728
15729 /* Validate displacement. */
15730 if (disp)
15731 {
15732 if (GET_CODE (disp) == CONST
15733 && GET_CODE (XEXP (disp, 0)) == UNSPEC
15734 && XINT (XEXP (disp, 0), 1) != UNSPEC_MACHOPIC_OFFSET)
15735 switch (XINT (XEXP (disp, 0), 1))
15736 {
15737 /* Refuse GOTOFF and GOT in 64-bit mode since they are always 64 bits
15738 wide when used. While the ABI also specifies 32-bit relocations, we
15739 don't produce them at all and use IP-relative addressing instead.
15740 Allow GOT in 32-bit mode for both PIC and non-PIC if the symbol
15741 should be loaded via the GOT. */
15742 case UNSPEC_GOT:
15743 if (!TARGET_64BIT
15744 && ix86_force_load_from_GOT_p (XVECEXP (XEXP (disp, 0), 0, 0)))
15745 goto is_legitimate_pic;
15746 /* FALLTHRU */
15747 case UNSPEC_GOTOFF:
15748 gcc_assert (flag_pic);
15749 if (!TARGET_64BIT)
15750 goto is_legitimate_pic;
15751
15752 /* 64bit address unspec. */
15753 return false;
15754
15755 case UNSPEC_GOTPCREL:
15756 if (ix86_force_load_from_GOT_p (XVECEXP (XEXP (disp, 0), 0, 0)))
15757 goto is_legitimate_pic;
15758 /* FALLTHRU */
15759 case UNSPEC_PCREL:
15760 gcc_assert (flag_pic);
15761 goto is_legitimate_pic;
15762
15763 case UNSPEC_GOTTPOFF:
15764 case UNSPEC_GOTNTPOFF:
15765 case UNSPEC_INDNTPOFF:
15766 case UNSPEC_NTPOFF:
15767 case UNSPEC_DTPOFF:
15768 break;
15769
15770 case UNSPEC_STACK_CHECK:
15771 gcc_assert (flag_split_stack);
15772 break;
15773
15774 default:
15775 /* Invalid address unspec. */
15776 return false;
15777 }
15778
15779 else if (SYMBOLIC_CONST (disp)
15780 && (flag_pic
15781 || (TARGET_MACHO
15782 #if TARGET_MACHO
15783 && MACHOPIC_INDIRECT
15784 && !machopic_operand_p (disp)
15785 #endif
15786 )))
15787 {
15788
15789 is_legitimate_pic:
15790 if (TARGET_64BIT && (index || base))
15791 {
15792 /* foo@dtpoff(%rX) is ok. */
15793 if (GET_CODE (disp) != CONST
15794 || GET_CODE (XEXP (disp, 0)) != PLUS
15795 || GET_CODE (XEXP (XEXP (disp, 0), 0)) != UNSPEC
15796 || !CONST_INT_P (XEXP (XEXP (disp, 0), 1))
15797 || (XINT (XEXP (XEXP (disp, 0), 0), 1) != UNSPEC_DTPOFF
15798 && XINT (XEXP (XEXP (disp, 0), 0), 1) != UNSPEC_NTPOFF))
15799 /* Non-constant pic memory reference. */
15800 return false;
15801 }
15802 else if ((!TARGET_MACHO || flag_pic)
15803 && ! legitimate_pic_address_disp_p (disp))
15804 /* Displacement is an invalid pic construct. */
15805 return false;
15806 #if TARGET_MACHO
15807 else if (MACHO_DYNAMIC_NO_PIC_P
15808 && !ix86_legitimate_constant_p (Pmode, disp))
15809 /* Displacement must be referenced via a non-lazy pointer. */
15810 return false;
15811 #endif
15812
15813 /* This code used to verify that a symbolic pic displacement
15814 includes the pic_offset_table_rtx register.
15815
15816 While this is a good idea, unfortunately these constructs may
15817 be created by the "adds using lea" optimization for incorrect
15818 code like:
15819
15820 int a;
15821 int foo(int i)
15822 {
15823 return *(&a+i);
15824 }
15825
15826 This code is nonsensical, but results in addressing the
15827 GOT table with a pic_offset_table_rtx base. We can't
15828 easily refuse it, since it gets matched by the
15829 "addsi3" pattern, which later gets split to an lea when
15830 the output register differs from the input. While this
15831 could be handled by a separate addsi pattern for this case
15832 that never results in an lea, disabling this test seems to
15833 be the easier and correct fix for the crash. */
15834 }
15835 else if (GET_CODE (disp) != LABEL_REF
15836 && !CONST_INT_P (disp)
15837 && (GET_CODE (disp) != CONST
15838 || !ix86_legitimate_constant_p (Pmode, disp))
15839 && (GET_CODE (disp) != SYMBOL_REF
15840 || !ix86_legitimate_constant_p (Pmode, disp)))
15841 /* Displacement is not constant. */
15842 return false;
15843 else if (TARGET_64BIT
15844 && !x86_64_immediate_operand (disp, VOIDmode))
15845 /* Displacement is out of range. */
15846 return false;
15847 /* In x32 mode, constant addresses are sign extended to 64bit, so
15848 we have to prevent addresses from 0x80000000 to 0xffffffff. */
15849 else if (TARGET_X32 && !(index || base)
15850 && CONST_INT_P (disp)
15851 && val_signbit_known_set_p (SImode, INTVAL (disp)))
15852 return false;
15853 }
15854
15855 /* Everything looks valid. */
15856 return true;
15857 }
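/* Illustrative examples: the general form accepted here is
   base + index*scale + disp, e.g.

       movl    16(%esi,%ebx,4), %eax     (scale 4, valid)

   whereas a scale of 3, or a scale factor without an index register,
   is rejected above.  */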
15858
15859 /* Determine if a given RTX is a valid constant address. */
15860
15861 bool
15862 constant_address_p (rtx x)
15863 {
15864 return CONSTANT_P (x) && ix86_legitimate_address_p (Pmode, x, 1);
15865 }
15866 \f
15867 /* Return a unique alias set for the GOT. */
15868
15869 static alias_set_type
15870 ix86_GOT_alias_set (void)
15871 {
15872 static alias_set_type set = -1;
15873 if (set == -1)
15874 set = new_alias_set ();
15875 return set;
15876 }
15877
15878 /* Return a legitimate reference for ORIG (an address) using the
15879 register REG. If REG is 0, a new pseudo is generated.
15880
15881 There are two types of references that must be handled:
15882
15883 1. Global data references must load the address from the GOT, via
15884 the PIC reg. An insn is emitted to do this load, and the reg is
15885 returned.
15886
15887 2. Static data references, constant pool addresses, and code labels
15888 compute the address as an offset from the GOT, whose base is in
15889 the PIC reg. Static data objects have SYMBOL_FLAG_LOCAL set to
15890 differentiate them from global data objects. The returned
15891 address is the PIC reg + an unspec constant.
15892
15893 TARGET_LEGITIMATE_ADDRESS_P rejects symbolic references unless the PIC
15894 reg also appears in the address. */
15895
15896 static rtx
15897 legitimize_pic_address (rtx orig, rtx reg)
15898 {
15899 rtx addr = orig;
15900 rtx new_rtx = orig;
15901
15902 #if TARGET_MACHO
15903 if (TARGET_MACHO && !TARGET_64BIT)
15904 {
15905 if (reg == 0)
15906 reg = gen_reg_rtx (Pmode);
15907 /* Use the generic Mach-O PIC machinery. */
15908 return machopic_legitimize_pic_address (orig, GET_MODE (orig), reg);
15909 }
15910 #endif
15911
15912 if (TARGET_64BIT && TARGET_DLLIMPORT_DECL_ATTRIBUTES)
15913 {
15914 rtx tmp = legitimize_pe_coff_symbol (addr, true);
15915 if (tmp)
15916 return tmp;
15917 }
15918
15919 if (TARGET_64BIT && legitimate_pic_address_disp_p (addr))
15920 new_rtx = addr;
15921 else if ((!TARGET_64BIT
15922 || /* TARGET_64BIT && */ ix86_cmodel != CM_SMALL_PIC)
15923 && !TARGET_PECOFF
15924 && gotoff_operand (addr, Pmode))
15925 {
15926 /* This symbol may be referenced via a displacement
15927 from the PIC base address (@GOTOFF). */
15928 if (GET_CODE (addr) == CONST)
15929 addr = XEXP (addr, 0);
15930
15931 if (GET_CODE (addr) == PLUS)
15932 {
15933 new_rtx = gen_rtx_UNSPEC (Pmode, gen_rtvec (1, XEXP (addr, 0)),
15934 UNSPEC_GOTOFF);
15935 new_rtx = gen_rtx_PLUS (Pmode, new_rtx, XEXP (addr, 1));
15936 }
15937 else
15938 new_rtx = gen_rtx_UNSPEC (Pmode, gen_rtvec (1, addr), UNSPEC_GOTOFF);
15939
15940 new_rtx = gen_rtx_CONST (Pmode, new_rtx);
15941
15942 if (TARGET_64BIT)
15943 new_rtx = copy_to_suggested_reg (new_rtx, reg, Pmode);
15944
15945 if (reg != 0)
15946 {
15947 gcc_assert (REG_P (reg));
15948 new_rtx = expand_simple_binop (Pmode, PLUS, pic_offset_table_rtx,
15949 new_rtx, reg, 1, OPTAB_DIRECT);
15950 }
15951 else
15952 new_rtx = gen_rtx_PLUS (Pmode, pic_offset_table_rtx, new_rtx);
15953 }
15954 else if ((GET_CODE (addr) == SYMBOL_REF && SYMBOL_REF_TLS_MODEL (addr) == 0)
15955 /* We can't use @GOTOFF for text labels
15956 on VxWorks, see gotoff_operand. */
15957 || (TARGET_VXWORKS_RTP && GET_CODE (addr) == LABEL_REF))
15958 {
15959 rtx tmp = legitimize_pe_coff_symbol (addr, true);
15960 if (tmp)
15961 return tmp;
15962
15963 /* For x64 PE-COFF there is no GOT table,
15964 so we use the address directly. */
15965 if (TARGET_64BIT && TARGET_PECOFF)
15966 {
15967 new_rtx = gen_rtx_UNSPEC (Pmode, gen_rtvec (1, addr), UNSPEC_PCREL);
15968 new_rtx = gen_rtx_CONST (Pmode, new_rtx);
15969 }
15970 else if (TARGET_64BIT && ix86_cmodel != CM_LARGE_PIC)
15971 {
15972 new_rtx = gen_rtx_UNSPEC (Pmode, gen_rtvec (1, addr),
15973 UNSPEC_GOTPCREL);
15974 new_rtx = gen_rtx_CONST (Pmode, new_rtx);
15975 new_rtx = gen_const_mem (Pmode, new_rtx);
15976 set_mem_alias_set (new_rtx, ix86_GOT_alias_set ());
15977 }
15978 else
15979 {
15980 /* This symbol must be referenced via a load
15981 from the Global Offset Table (@GOT). */
15982 new_rtx = gen_rtx_UNSPEC (Pmode, gen_rtvec (1, addr), UNSPEC_GOT);
15983 new_rtx = gen_rtx_CONST (Pmode, new_rtx);
15984 if (TARGET_64BIT)
15985 new_rtx = force_reg (Pmode, new_rtx);
15986 new_rtx = gen_rtx_PLUS (Pmode, pic_offset_table_rtx, new_rtx);
15987 new_rtx = gen_const_mem (Pmode, new_rtx);
15988 set_mem_alias_set (new_rtx, ix86_GOT_alias_set ());
15989 }
15990
15991 new_rtx = copy_to_suggested_reg (new_rtx, reg, Pmode);
15992 }
15993 else
15994 {
15995 if (CONST_INT_P (addr)
15996 && !x86_64_immediate_operand (addr, VOIDmode))
15997 new_rtx = copy_to_suggested_reg (addr, reg, Pmode);
15998 else if (GET_CODE (addr) == CONST)
15999 {
16000 addr = XEXP (addr, 0);
16001
16002 /* We must match stuff we generated before. Assume the only
16003 unspecs that can get here are ours. Not that we could do
16004 anything with them anyway... */
16005 if (GET_CODE (addr) == UNSPEC
16006 || (GET_CODE (addr) == PLUS
16007 && GET_CODE (XEXP (addr, 0)) == UNSPEC))
16008 return orig;
16009 gcc_assert (GET_CODE (addr) == PLUS);
16010 }
16011
16012 if (GET_CODE (addr) == PLUS)
16013 {
16014 rtx op0 = XEXP (addr, 0), op1 = XEXP (addr, 1);
16015
16016 /* Check first to see if this is a constant
16017 offset from a @GOTOFF symbol reference. */
16018 if (!TARGET_PECOFF
16019 && gotoff_operand (op0, Pmode)
16020 && CONST_INT_P (op1))
16021 {
16022 if (!TARGET_64BIT)
16023 {
16024 new_rtx = gen_rtx_UNSPEC (Pmode, gen_rtvec (1, op0),
16025 UNSPEC_GOTOFF);
16026 new_rtx = gen_rtx_PLUS (Pmode, new_rtx, op1);
16027 new_rtx = gen_rtx_CONST (Pmode, new_rtx);
16028
16029 if (reg != 0)
16030 {
16031 gcc_assert (REG_P (reg));
16032 new_rtx = expand_simple_binop (Pmode, PLUS,
16033 pic_offset_table_rtx,
16034 new_rtx, reg, 1,
16035 OPTAB_DIRECT);
16036 }
16037 else
16038 new_rtx
16039 = gen_rtx_PLUS (Pmode, pic_offset_table_rtx, new_rtx);
16040 }
16041 else
16042 {
16043 if (INTVAL (op1) < -16*1024*1024
16044 || INTVAL (op1) >= 16*1024*1024)
16045 {
16046 if (!x86_64_immediate_operand (op1, Pmode))
16047 op1 = force_reg (Pmode, op1);
16048
16049 new_rtx
16050 = gen_rtx_PLUS (Pmode, force_reg (Pmode, op0), op1);
16051 }
16052 }
16053 }
16054 else
16055 {
16056 rtx base = legitimize_pic_address (op0, reg);
16057 machine_mode mode = GET_MODE (base);
16058 new_rtx
16059 = legitimize_pic_address (op1, base == reg ? NULL_RTX : reg);
16060
16061 if (CONST_INT_P (new_rtx))
16062 {
16063 if (INTVAL (new_rtx) < -16*1024*1024
16064 || INTVAL (new_rtx) >= 16*1024*1024)
16065 {
16066 if (!x86_64_immediate_operand (new_rtx, mode))
16067 new_rtx = force_reg (mode, new_rtx);
16068
16069 new_rtx
16070 = gen_rtx_PLUS (mode, force_reg (mode, base), new_rtx);
16071 }
16072 else
16073 new_rtx = plus_constant (mode, base, INTVAL (new_rtx));
16074 }
16075 else
16076 {
16077 /* For %rip addressing, we have to use
16078 just disp32, not base nor index. */
16079 if (TARGET_64BIT
16080 && (GET_CODE (base) == SYMBOL_REF
16081 || GET_CODE (base) == LABEL_REF))
16082 base = force_reg (mode, base);
16083 if (GET_CODE (new_rtx) == PLUS
16084 && CONSTANT_P (XEXP (new_rtx, 1)))
16085 {
16086 base = gen_rtx_PLUS (mode, base, XEXP (new_rtx, 0));
16087 new_rtx = XEXP (new_rtx, 1);
16088 }
16089 new_rtx = gen_rtx_PLUS (mode, base, new_rtx);
16090 }
16091 }
16092 }
16093 }
16094 return new_rtx;
16095 }
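/* Illustrative examples of what the cases above typically expand to:
   on 32-bit, local data becomes pic_reg + foo@GOTOFF, e.g.
   "leal foo@GOTOFF(%ebx), %eax", while global data is loaded from its
   GOT slot, e.g. "movl foo@GOT(%ebx), %eax".  On x86-64 small PIC the
   GOT load is IP-relative, roughly "movq foo@GOTPCREL(%rip), %rax".  */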
16096 \f
16097 /* Load the thread pointer. If TO_REG is true, force it into a register. */
16098
16099 static rtx
16100 get_thread_pointer (machine_mode tp_mode, bool to_reg)
16101 {
16102 rtx tp = gen_rtx_UNSPEC (ptr_mode, gen_rtvec (1, const0_rtx), UNSPEC_TP);
16103
16104 if (GET_MODE (tp) != tp_mode)
16105 {
16106 gcc_assert (GET_MODE (tp) == SImode);
16107 gcc_assert (tp_mode == DImode);
16108
16109 tp = gen_rtx_ZERO_EXTEND (tp_mode, tp);
16110 }
16111
16112 if (to_reg)
16113 tp = copy_to_mode_reg (tp_mode, tp);
16114
16115 return tp;
16116 }
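/* Illustrative note: UNSPEC_TP eventually prints as the thread-pointer
   segment base, typically %fs on x86-64 and %gs on 32-bit GNU/Linux, so a
   local-exec access ends up looking roughly like
   "movl %gs:x@ntpoff, %eax" or "movq %fs:x@tpoff, %rax".  */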
16117
16118 /* Construct the SYMBOL_REF for the tls_get_addr function. */
16119
16120 static GTY(()) rtx ix86_tls_symbol;
16121
16122 static rtx
16123 ix86_tls_get_addr (void)
16124 {
16125 if (!ix86_tls_symbol)
16126 {
16127 const char *sym
16128 = ((TARGET_ANY_GNU_TLS && !TARGET_64BIT)
16129 ? "___tls_get_addr" : "__tls_get_addr");
16130
16131 ix86_tls_symbol = gen_rtx_SYMBOL_REF (Pmode, sym);
16132 }
16133
16134 if (ix86_cmodel == CM_LARGE_PIC && !TARGET_PECOFF)
16135 {
16136 rtx unspec = gen_rtx_UNSPEC (Pmode, gen_rtvec (1, ix86_tls_symbol),
16137 UNSPEC_PLTOFF);
16138 return gen_rtx_PLUS (Pmode, pic_offset_table_rtx,
16139 gen_rtx_CONST (Pmode, unspec));
16140 }
16141
16142 return ix86_tls_symbol;
16143 }
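/* Illustrative note: for -mcmodel=large -fpic the call target cannot be
   assumed to be reachable with a 32-bit PC-relative call, so the address
   of the helper is formed above as pic_reg + __tls_get_addr@PLTOFF
   instead of using a direct "call __tls_get_addr@PLT".  */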
16144
16145 /* Construct the SYMBOL_REF for the _TLS_MODULE_BASE_ symbol. */
16146
16147 static GTY(()) rtx ix86_tls_module_base_symbol;
16148
16149 rtx
16150 ix86_tls_module_base (void)
16151 {
16152 if (!ix86_tls_module_base_symbol)
16153 {
16154 ix86_tls_module_base_symbol
16155 = gen_rtx_SYMBOL_REF (Pmode, "_TLS_MODULE_BASE_");
16156
16157 SYMBOL_REF_FLAGS (ix86_tls_module_base_symbol)
16158 |= TLS_MODEL_GLOBAL_DYNAMIC << SYMBOL_FLAG_TLS_SHIFT;
16159 }
16160
16161 return ix86_tls_module_base_symbol;
16162 }
16163
16164 /* A subroutine of ix86_legitimize_address and ix86_expand_move. FOR_MOV is
16165 false if we expect this to be used for a memory address and true if
16166 we expect to load the address into a register. */
16167
16168 static rtx
16169 legitimize_tls_address (rtx x, enum tls_model model, bool for_mov)
16170 {
16171 rtx dest, base, off;
16172 rtx pic = NULL_RTX, tp = NULL_RTX;
16173 machine_mode tp_mode = Pmode;
16174 int type;
16175
16176 /* Fall back to the global dynamic model if the toolchain cannot support
16177 local dynamic. */
16178 if (TARGET_SUN_TLS && !TARGET_64BIT
16179 && !HAVE_AS_IX86_TLSLDMPLT && !HAVE_AS_IX86_TLSLDM
16180 && model == TLS_MODEL_LOCAL_DYNAMIC)
16181 model = TLS_MODEL_GLOBAL_DYNAMIC;
16182
16183 switch (model)
16184 {
16185 case TLS_MODEL_GLOBAL_DYNAMIC:
16186 dest = gen_reg_rtx (Pmode);
16187
16188 if (!TARGET_64BIT)
16189 {
16190 if (flag_pic && !TARGET_PECOFF)
16191 pic = pic_offset_table_rtx;
16192 else
16193 {
16194 pic = gen_reg_rtx (Pmode);
16195 emit_insn (gen_set_got (pic));
16196 }
16197 }
16198
16199 if (TARGET_GNU2_TLS)
16200 {
16201 if (TARGET_64BIT)
16202 emit_insn (gen_tls_dynamic_gnu2_64 (dest, x));
16203 else
16204 emit_insn (gen_tls_dynamic_gnu2_32 (dest, x, pic));
16205
16206 tp = get_thread_pointer (Pmode, true);
16207 dest = force_reg (Pmode, gen_rtx_PLUS (Pmode, tp, dest));
16208
16209 if (GET_MODE (x) != Pmode)
16210 x = gen_rtx_ZERO_EXTEND (Pmode, x);
16211
16212 set_unique_reg_note (get_last_insn (), REG_EQUAL, x);
16213 }
16214 else
16215 {
16216 rtx caddr = ix86_tls_get_addr ();
16217
16218 if (TARGET_64BIT)
16219 {
16220 rtx rax = gen_rtx_REG (Pmode, AX_REG);
16221 rtx_insn *insns;
16222
16223 start_sequence ();
16224 emit_call_insn
16225 (ix86_gen_tls_global_dynamic_64 (rax, x, caddr));
16226 insns = get_insns ();
16227 end_sequence ();
16228
16229 if (GET_MODE (x) != Pmode)
16230 x = gen_rtx_ZERO_EXTEND (Pmode, x);
16231
16232 RTL_CONST_CALL_P (insns) = 1;
16233 emit_libcall_block (insns, dest, rax, x);
16234 }
16235 else
16236 emit_insn (gen_tls_global_dynamic_32 (dest, x, pic, caddr));
16237 }
16238 break;
16239
16240 case TLS_MODEL_LOCAL_DYNAMIC:
16241 base = gen_reg_rtx (Pmode);
16242
16243 if (!TARGET_64BIT)
16244 {
16245 if (flag_pic)
16246 pic = pic_offset_table_rtx;
16247 else
16248 {
16249 pic = gen_reg_rtx (Pmode);
16250 emit_insn (gen_set_got (pic));
16251 }
16252 }
16253
16254 if (TARGET_GNU2_TLS)
16255 {
16256 rtx tmp = ix86_tls_module_base ();
16257
16258 if (TARGET_64BIT)
16259 emit_insn (gen_tls_dynamic_gnu2_64 (base, tmp));
16260 else
16261 emit_insn (gen_tls_dynamic_gnu2_32 (base, tmp, pic));
16262
16263 tp = get_thread_pointer (Pmode, true);
16264 set_unique_reg_note (get_last_insn (), REG_EQUAL,
16265 gen_rtx_MINUS (Pmode, tmp, tp));
16266 }
16267 else
16268 {
16269 rtx caddr = ix86_tls_get_addr ();
16270
16271 if (TARGET_64BIT)
16272 {
16273 rtx rax = gen_rtx_REG (Pmode, AX_REG);
16274 rtx_insn *insns;
16275 rtx eqv;
16276
16277 start_sequence ();
16278 emit_call_insn
16279 (ix86_gen_tls_local_dynamic_base_64 (rax, caddr));
16280 insns = get_insns ();
16281 end_sequence ();
16282
16283 /* Attach a unique REG_EQUAL, to allow the RTL optimizers to
16284 share the LD_BASE result with other LD model accesses. */
16285 eqv = gen_rtx_UNSPEC (Pmode, gen_rtvec (1, const0_rtx),
16286 UNSPEC_TLS_LD_BASE);
16287
16288 RTL_CONST_CALL_P (insns) = 1;
16289 emit_libcall_block (insns, base, rax, eqv);
16290 }
16291 else
16292 emit_insn (gen_tls_local_dynamic_base_32 (base, pic, caddr));
16293 }
16294
16295 off = gen_rtx_UNSPEC (Pmode, gen_rtvec (1, x), UNSPEC_DTPOFF);
16296 off = gen_rtx_CONST (Pmode, off);
16297
16298 dest = force_reg (Pmode, gen_rtx_PLUS (Pmode, base, off));
16299
16300 if (TARGET_GNU2_TLS)
16301 {
16302 dest = force_reg (Pmode, gen_rtx_PLUS (Pmode, dest, tp));
16303
16304 if (GET_MODE (x) != Pmode)
16305 x = gen_rtx_ZERO_EXTEND (Pmode, x);
16306
16307 set_unique_reg_note (get_last_insn (), REG_EQUAL, x);
16308 }
16309 break;
16310
16311 case TLS_MODEL_INITIAL_EXEC:
16312 if (TARGET_64BIT)
16313 {
16314 if (TARGET_SUN_TLS && !TARGET_X32)
16315 {
16316 /* The Sun linker took the AMD64 TLS spec literally
16317 and can only handle %rax as the destination of the
16318 initial-exec code sequence. */
16319
16320 dest = gen_reg_rtx (DImode);
16321 emit_insn (gen_tls_initial_exec_64_sun (dest, x));
16322 return dest;
16323 }
16324
16325 /* Generate DImode references to avoid %fs:(%reg32)
16326 problems and linker IE->LE relaxation bug. */
16327 tp_mode = DImode;
16328 pic = NULL;
16329 type = UNSPEC_GOTNTPOFF;
16330 }
16331 else if (flag_pic)
16332 {
16333 pic = pic_offset_table_rtx;
16334 type = TARGET_ANY_GNU_TLS ? UNSPEC_GOTNTPOFF : UNSPEC_GOTTPOFF;
16335 }
16336 else if (!TARGET_ANY_GNU_TLS)
16337 {
16338 pic = gen_reg_rtx (Pmode);
16339 emit_insn (gen_set_got (pic));
16340 type = UNSPEC_GOTTPOFF;
16341 }
16342 else
16343 {
16344 pic = NULL;
16345 type = UNSPEC_INDNTPOFF;
16346 }
16347
16348 off = gen_rtx_UNSPEC (tp_mode, gen_rtvec (1, x), type);
16349 off = gen_rtx_CONST (tp_mode, off);
16350 if (pic)
16351 off = gen_rtx_PLUS (tp_mode, pic, off);
16352 off = gen_const_mem (tp_mode, off);
16353 set_mem_alias_set (off, ix86_GOT_alias_set ());
16354
16355 if (TARGET_64BIT || TARGET_ANY_GNU_TLS)
16356 {
16357 base = get_thread_pointer (tp_mode,
16358 for_mov || !TARGET_TLS_DIRECT_SEG_REFS);
16359 off = force_reg (tp_mode, off);
16360 return gen_rtx_PLUS (tp_mode, base, off);
16361 }
16362 else
16363 {
16364 base = get_thread_pointer (Pmode, true);
16365 dest = gen_reg_rtx (Pmode);
16366 emit_insn (ix86_gen_sub3 (dest, base, off));
16367 }
16368 break;
16369
16370 case TLS_MODEL_LOCAL_EXEC:
16371 off = gen_rtx_UNSPEC (Pmode, gen_rtvec (1, x),
16372 (TARGET_64BIT || TARGET_ANY_GNU_TLS)
16373 ? UNSPEC_NTPOFF : UNSPEC_TPOFF);
16374 off = gen_rtx_CONST (Pmode, off);
16375
16376 if (TARGET_64BIT || TARGET_ANY_GNU_TLS)
16377 {
16378 base = get_thread_pointer (Pmode,
16379 for_mov || !TARGET_TLS_DIRECT_SEG_REFS);
16380 return gen_rtx_PLUS (Pmode, base, off);
16381 }
16382 else
16383 {
16384 base = get_thread_pointer (Pmode, true);
16385 dest = gen_reg_rtx (Pmode);
16386 emit_insn (ix86_gen_sub3 (dest, base, off));
16387 }
16388 break;
16389
16390 default:
16391 gcc_unreachable ();
16392 }
16393
16394 return dest;
16395 }
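/* Illustrative x86-64 sequences for the models handled above (roughly,
   following the usual ELF TLS conventions):

     global-dynamic:  leaq x@tlsgd(%rip), %rdi;  call __tls_get_addr@PLT
     local-dynamic:   leaq x@tlsld(%rip), %rdi;  call __tls_get_addr@PLT;
                      leaq x@dtpoff(%rax), %rax
     initial-exec:    movq x@gottpoff(%rip), %rax;  then access %fs:(%rax)
     local-exec:      movq %fs:x@tpoff, %rax  */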
16396
16397 /* Create or return the unique __imp_DECL dllimport symbol corresponding
16398 to symbol DECL if BEIMPORT is true. Otherwise create or return the
16399 unique refptr-DECL symbol corresponding to symbol DECL. */
16400
16401 struct dllimport_hasher : ggc_cache_ptr_hash<tree_map>
16402 {
16403 static inline hashval_t hash (tree_map *m) { return m->hash; }
16404 static inline bool
16405 equal (tree_map *a, tree_map *b)
16406 {
16407 return a->base.from == b->base.from;
16408 }
16409
16410 static int
16411 keep_cache_entry (tree_map *&m)
16412 {
16413 return ggc_marked_p (m->base.from);
16414 }
16415 };
16416
16417 static GTY((cache)) hash_table<dllimport_hasher> *dllimport_map;
16418
16419 static tree
16420 get_dllimport_decl (tree decl, bool beimport)
16421 {
16422 struct tree_map *h, in;
16423 const char *name;
16424 const char *prefix;
16425 size_t namelen, prefixlen;
16426 char *imp_name;
16427 tree to;
16428 rtx rtl;
16429
16430 if (!dllimport_map)
16431 dllimport_map = hash_table<dllimport_hasher>::create_ggc (512);
16432
16433 in.hash = htab_hash_pointer (decl);
16434 in.base.from = decl;
16435 tree_map **loc = dllimport_map->find_slot_with_hash (&in, in.hash, INSERT);
16436 h = *loc;
16437 if (h)
16438 return h->to;
16439
16440 *loc = h = ggc_alloc<tree_map> ();
16441 h->hash = in.hash;
16442 h->base.from = decl;
16443 h->to = to = build_decl (DECL_SOURCE_LOCATION (decl),
16444 VAR_DECL, NULL, ptr_type_node);
16445 DECL_ARTIFICIAL (to) = 1;
16446 DECL_IGNORED_P (to) = 1;
16447 DECL_EXTERNAL (to) = 1;
16448 TREE_READONLY (to) = 1;
16449
16450 name = IDENTIFIER_POINTER (DECL_ASSEMBLER_NAME (decl));
16451 name = targetm.strip_name_encoding (name);
16452 if (beimport)
16453 prefix = name[0] == FASTCALL_PREFIX || user_label_prefix[0] == 0
16454 ? "*__imp_" : "*__imp__";
16455 else
16456 prefix = user_label_prefix[0] == 0 ? "*.refptr." : "*refptr.";
16457 namelen = strlen (name);
16458 prefixlen = strlen (prefix);
16459 imp_name = (char *) alloca (namelen + prefixlen + 1);
16460 memcpy (imp_name, prefix, prefixlen);
16461 memcpy (imp_name + prefixlen, name, namelen + 1);
16462
16463 name = ggc_alloc_string (imp_name, namelen + prefixlen);
16464 rtl = gen_rtx_SYMBOL_REF (Pmode, name);
16465 SET_SYMBOL_REF_DECL (rtl, to);
16466 SYMBOL_REF_FLAGS (rtl) = SYMBOL_FLAG_LOCAL | SYMBOL_FLAG_STUBVAR;
16467 if (!beimport)
16468 {
16469 SYMBOL_REF_FLAGS (rtl) |= SYMBOL_FLAG_EXTERNAL;
16470 #ifdef SUB_TARGET_RECORD_STUB
16471 SUB_TARGET_RECORD_STUB (name);
16472 #endif
16473 }
16474
16475 rtl = gen_const_mem (Pmode, rtl);
16476 set_mem_alias_set (rtl, ix86_GOT_alias_set ());
16477
16478 SET_DECL_RTL (to, rtl);
16479 SET_DECL_ASSEMBLER_NAME (to, get_identifier (name));
16480
16481 return to;
16482 }
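/* Illustrative example: for a dllimport symbol "foo" this creates a
   reference to the import-table pointer "__imp_foo" (or "__imp__foo" when
   a user label prefix is in use), so the real address is obtained by a
   load such as "movq __imp_foo(%rip), %rax" rather than by a direct
   reference to "foo".  */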
16483
16484 /* Expand SYMBOL into its corresponding far-address symbol.
16485 WANT_REG is true if we require the result to be a register. */
16486
16487 static rtx
16488 legitimize_pe_coff_extern_decl (rtx symbol, bool want_reg)
16489 {
16490 tree imp_decl;
16491 rtx x;
16492
16493 gcc_assert (SYMBOL_REF_DECL (symbol));
16494 imp_decl = get_dllimport_decl (SYMBOL_REF_DECL (symbol), false);
16495
16496 x = DECL_RTL (imp_decl);
16497 if (want_reg)
16498 x = force_reg (Pmode, x);
16499 return x;
16500 }
16501
16502 /* Expand SYMBOL into its corresponding dllimport symbol. WANT_REG is
16503 true if we require the result to be a register. */
16504
16505 static rtx
16506 legitimize_dllimport_symbol (rtx symbol, bool want_reg)
16507 {
16508 tree imp_decl;
16509 rtx x;
16510
16511 gcc_assert (SYMBOL_REF_DECL (symbol));
16512 imp_decl = get_dllimport_decl (SYMBOL_REF_DECL (symbol), true);
16513
16514 x = DECL_RTL (imp_decl);
16515 if (want_reg)
16516 x = force_reg (Pmode, x);
16517 return x;
16518 }
16519
16520 /* Expand ADDR into its corresponding dllimport or refptr symbol. INREG
16521 is true if we require the result to be a register. */
16522
16523 static rtx
16524 legitimize_pe_coff_symbol (rtx addr, bool inreg)
16525 {
16526 if (!TARGET_PECOFF)
16527 return NULL_RTX;
16528
16529 if (TARGET_DLLIMPORT_DECL_ATTRIBUTES)
16530 {
16531 if (GET_CODE (addr) == SYMBOL_REF && SYMBOL_REF_DLLIMPORT_P (addr))
16532 return legitimize_dllimport_symbol (addr, inreg);
16533 if (GET_CODE (addr) == CONST
16534 && GET_CODE (XEXP (addr, 0)) == PLUS
16535 && GET_CODE (XEXP (XEXP (addr, 0), 0)) == SYMBOL_REF
16536 && SYMBOL_REF_DLLIMPORT_P (XEXP (XEXP (addr, 0), 0)))
16537 {
16538 rtx t = legitimize_dllimport_symbol (XEXP (XEXP (addr, 0), 0), inreg);
16539 return gen_rtx_PLUS (Pmode, t, XEXP (XEXP (addr, 0), 1));
16540 }
16541 }
16542
16543 if (ix86_cmodel != CM_LARGE_PIC && ix86_cmodel != CM_MEDIUM_PIC)
16544 return NULL_RTX;
16545 if (GET_CODE (addr) == SYMBOL_REF
16546 && !is_imported_p (addr)
16547 && SYMBOL_REF_EXTERNAL_P (addr)
16548 && SYMBOL_REF_DECL (addr))
16549 return legitimize_pe_coff_extern_decl (addr, inreg);
16550
16551 if (GET_CODE (addr) == CONST
16552 && GET_CODE (XEXP (addr, 0)) == PLUS
16553 && GET_CODE (XEXP (XEXP (addr, 0), 0)) == SYMBOL_REF
16554 && !is_imported_p (XEXP (XEXP (addr, 0), 0))
16555 && SYMBOL_REF_EXTERNAL_P (XEXP (XEXP (addr, 0), 0))
16556 && SYMBOL_REF_DECL (XEXP (XEXP (addr, 0), 0)))
16557 {
16558 rtx t = legitimize_pe_coff_extern_decl (XEXP (XEXP (addr, 0), 0), inreg);
16559 return gen_rtx_PLUS (Pmode, t, XEXP (XEXP (addr, 0), 1));
16560 }
16561 return NULL_RTX;
16562 }
16563
16564 /* Try machine-dependent ways of modifying an illegitimate address
16565 to be legitimate. If we find one, return the new, valid address.
16566 This macro is used in only one place: `memory_address' in explow.c.
16567
16568 OLDX is the address as it was before break_out_memory_refs was called.
16569 In some cases it is useful to look at this to decide what needs to be done.
16570
16571 It is always safe for this macro to do nothing. It exists to recognize
16572 opportunities to optimize the output.
16573
16574 For the 80386, we handle X+REG by loading X into a register R and
16575 using R+REG. R will go in a general reg and indexing will be used.
16576 However, if REG is a broken-out memory address or multiplication,
16577 nothing needs to be done because REG can certainly go in a general reg.
16578
16579 When -fpic is used, special handling is needed for symbolic references.
16580 See comments by legitimize_pic_address in i386.c for details. */
16581
16582 static rtx
16583 ix86_legitimize_address (rtx x, rtx, machine_mode mode)
16584 {
16585 bool changed = false;
16586 unsigned log;
16587
16588 log = GET_CODE (x) == SYMBOL_REF ? SYMBOL_REF_TLS_MODEL (x) : 0;
16589 if (log)
16590 return legitimize_tls_address (x, (enum tls_model) log, false);
16591 if (GET_CODE (x) == CONST
16592 && GET_CODE (XEXP (x, 0)) == PLUS
16593 && GET_CODE (XEXP (XEXP (x, 0), 0)) == SYMBOL_REF
16594 && (log = SYMBOL_REF_TLS_MODEL (XEXP (XEXP (x, 0), 0))))
16595 {
16596 rtx t = legitimize_tls_address (XEXP (XEXP (x, 0), 0),
16597 (enum tls_model) log, false);
16598 return gen_rtx_PLUS (Pmode, t, XEXP (XEXP (x, 0), 1));
16599 }
16600
16601 if (TARGET_DLLIMPORT_DECL_ATTRIBUTES)
16602 {
16603 rtx tmp = legitimize_pe_coff_symbol (x, true);
16604 if (tmp)
16605 return tmp;
16606 }
16607
16608 if (flag_pic && SYMBOLIC_CONST (x))
16609 return legitimize_pic_address (x, 0);
16610
16611 #if TARGET_MACHO
16612 if (MACHO_DYNAMIC_NO_PIC_P && SYMBOLIC_CONST (x))
16613 return machopic_indirect_data_reference (x, 0);
16614 #endif
16615
16616 /* Canonicalize shifts by 0, 1, 2, 3 into multiply. */
16617 if (GET_CODE (x) == ASHIFT
16618 && CONST_INT_P (XEXP (x, 1))
16619 && (unsigned HOST_WIDE_INT) INTVAL (XEXP (x, 1)) < 4)
16620 {
16621 changed = true;
16622 log = INTVAL (XEXP (x, 1));
16623 x = gen_rtx_MULT (Pmode, force_reg (Pmode, XEXP (x, 0)),
16624 GEN_INT (1 << log));
16625 }
16626
16627 if (GET_CODE (x) == PLUS)
16628 {
16629 /* Canonicalize shifts by 0, 1, 2, 3 into multiply. */
16630
16631 if (GET_CODE (XEXP (x, 0)) == ASHIFT
16632 && CONST_INT_P (XEXP (XEXP (x, 0), 1))
16633 && (unsigned HOST_WIDE_INT) INTVAL (XEXP (XEXP (x, 0), 1)) < 4)
16634 {
16635 changed = true;
16636 log = INTVAL (XEXP (XEXP (x, 0), 1));
16637 XEXP (x, 0) = gen_rtx_MULT (Pmode,
16638 force_reg (Pmode, XEXP (XEXP (x, 0), 0)),
16639 GEN_INT (1 << log));
16640 }
16641
16642 if (GET_CODE (XEXP (x, 1)) == ASHIFT
16643 && CONST_INT_P (XEXP (XEXP (x, 1), 1))
16644 && (unsigned HOST_WIDE_INT) INTVAL (XEXP (XEXP (x, 1), 1)) < 4)
16645 {
16646 changed = true;
16647 log = INTVAL (XEXP (XEXP (x, 1), 1));
16648 XEXP (x, 1) = gen_rtx_MULT (Pmode,
16649 force_reg (Pmode, XEXP (XEXP (x, 1), 0)),
16650 GEN_INT (1 << log));
16651 }
16652
16653 /* Put multiply first if it isn't already. */
16654 if (GET_CODE (XEXP (x, 1)) == MULT)
16655 {
16656 std::swap (XEXP (x, 0), XEXP (x, 1));
16657 changed = true;
16658 }
16659
16660 /* Canonicalize (plus (mult (reg) (const)) (plus (reg) (const)))
16661 into (plus (plus (mult (reg) (const)) (reg)) (const)). This can be
16662 created by virtual register instantiation, register elimination, and
16663 similar optimizations. */
16664 if (GET_CODE (XEXP (x, 0)) == MULT && GET_CODE (XEXP (x, 1)) == PLUS)
16665 {
16666 changed = true;
16667 x = gen_rtx_PLUS (Pmode,
16668 gen_rtx_PLUS (Pmode, XEXP (x, 0),
16669 XEXP (XEXP (x, 1), 0)),
16670 XEXP (XEXP (x, 1), 1));
16671 }
16672
16673 /* Canonicalize
16674 (plus (plus (mult (reg) (const)) (plus (reg) (const))) const)
16675 into (plus (plus (mult (reg) (const)) (reg)) (const)). */
16676 else if (GET_CODE (x) == PLUS && GET_CODE (XEXP (x, 0)) == PLUS
16677 && GET_CODE (XEXP (XEXP (x, 0), 0)) == MULT
16678 && GET_CODE (XEXP (XEXP (x, 0), 1)) == PLUS
16679 && CONSTANT_P (XEXP (x, 1)))
16680 {
16681 rtx constant;
16682 rtx other = NULL_RTX;
16683
16684 if (CONST_INT_P (XEXP (x, 1)))
16685 {
16686 constant = XEXP (x, 1);
16687 other = XEXP (XEXP (XEXP (x, 0), 1), 1);
16688 }
16689 else if (CONST_INT_P (XEXP (XEXP (XEXP (x, 0), 1), 1)))
16690 {
16691 constant = XEXP (XEXP (XEXP (x, 0), 1), 1);
16692 other = XEXP (x, 1);
16693 }
16694 else
16695 constant = 0;
16696
16697 if (constant)
16698 {
16699 changed = true;
16700 x = gen_rtx_PLUS (Pmode,
16701 gen_rtx_PLUS (Pmode, XEXP (XEXP (x, 0), 0),
16702 XEXP (XEXP (XEXP (x, 0), 1), 0)),
16703 plus_constant (Pmode, other,
16704 INTVAL (constant)));
16705 }
16706 }
16707
16708 if (changed && ix86_legitimate_address_p (mode, x, false))
16709 return x;
16710
16711 if (GET_CODE (XEXP (x, 0)) == MULT)
16712 {
16713 changed = true;
16714 XEXP (x, 0) = copy_addr_to_reg (XEXP (x, 0));
16715 }
16716
16717 if (GET_CODE (XEXP (x, 1)) == MULT)
16718 {
16719 changed = true;
16720 XEXP (x, 1) = copy_addr_to_reg (XEXP (x, 1));
16721 }
16722
16723 if (changed
16724 && REG_P (XEXP (x, 1))
16725 && REG_P (XEXP (x, 0)))
16726 return x;
16727
16728 if (flag_pic && SYMBOLIC_CONST (XEXP (x, 1)))
16729 {
16730 changed = true;
16731 x = legitimize_pic_address (x, 0);
16732 }
16733
16734 if (changed && ix86_legitimate_address_p (mode, x, false))
16735 return x;
16736
16737 if (REG_P (XEXP (x, 0)))
16738 {
16739 rtx temp = gen_reg_rtx (Pmode);
16740 rtx val = force_operand (XEXP (x, 1), temp);
16741 if (val != temp)
16742 {
16743 val = convert_to_mode (Pmode, val, 1);
16744 emit_move_insn (temp, val);
16745 }
16746
16747 XEXP (x, 1) = temp;
16748 return x;
16749 }
16750
16751 else if (REG_P (XEXP (x, 1)))
16752 {
16753 rtx temp = gen_reg_rtx (Pmode);
16754 rtx val = force_operand (XEXP (x, 0), temp);
16755 if (val != temp)
16756 {
16757 val = convert_to_mode (Pmode, val, 1);
16758 emit_move_insn (temp, val);
16759 }
16760
16761 XEXP (x, 0) = temp;
16762 return x;
16763 }
16764 }
16765
16766 return x;
16767 }
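/* Illustrative example of the canonicalization above: an address such as
   (plus (ashift (reg) (const_int 2)) (reg)), i.e. "p + (i << 2)", is
   rewritten into (plus (mult (reg) (const_int 4)) (reg)) so that it can
   match the hardware scaled-index form "(%base,%index,4)".  */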
16768 \f
16769 /* Print an integer constant expression in assembler syntax. Addition
16770 and subtraction are the only arithmetic that may appear in these
16771 expressions. FILE is the stdio stream to write to, X is the rtx, and
16772 CODE is the operand print code from the output string. */
16773
16774 static void
16775 output_pic_addr_const (FILE *file, rtx x, int code)
16776 {
16777 char buf[256];
16778
16779 switch (GET_CODE (x))
16780 {
16781 case PC:
16782 gcc_assert (flag_pic);
16783 putc ('.', file);
16784 break;
16785
16786 case SYMBOL_REF:
16787 if (TARGET_64BIT || ! TARGET_MACHO_BRANCH_ISLANDS)
16788 output_addr_const (file, x);
16789 else
16790 {
16791 const char *name = XSTR (x, 0);
16792
16793 /* Mark the decl as referenced so that cgraph will
16794 output the function. */
16795 if (SYMBOL_REF_DECL (x))
16796 mark_decl_referenced (SYMBOL_REF_DECL (x));
16797
16798 #if TARGET_MACHO
16799 if (MACHOPIC_INDIRECT
16800 && machopic_classify_symbol (x) == MACHOPIC_UNDEFINED_FUNCTION)
16801 name = machopic_indirection_name (x, /*stub_p=*/true);
16802 #endif
16803 assemble_name (file, name);
16804 }
16805 if (!TARGET_MACHO && !(TARGET_64BIT && TARGET_PECOFF)
16806 && code == 'P' && ! SYMBOL_REF_LOCAL_P (x))
16807 fputs ("@PLT", file);
16808 break;
16809
16810 case LABEL_REF:
16811 x = XEXP (x, 0);
16812 /* FALLTHRU */
16813 case CODE_LABEL:
16814 ASM_GENERATE_INTERNAL_LABEL (buf, "L", CODE_LABEL_NUMBER (x));
16815 assemble_name (asm_out_file, buf);
16816 break;
16817
16818 case CONST_INT:
16819 fprintf (file, HOST_WIDE_INT_PRINT_DEC, INTVAL (x));
16820 break;
16821
16822 case CONST:
16823 /* This used to output parentheses around the expression,
16824 but that does not work on the 386 (either ATT or BSD assembler). */
16825 output_pic_addr_const (file, XEXP (x, 0), code);
16826 break;
16827
16828 case CONST_DOUBLE:
16829 /* We can't handle floating point constants;
16830 TARGET_PRINT_OPERAND must handle them. */
16831 output_operand_lossage ("floating constant misused");
16832 break;
16833
16834 case PLUS:
16835 /* Some assemblers need integer constants to appear first. */
16836 if (CONST_INT_P (XEXP (x, 0)))
16837 {
16838 output_pic_addr_const (file, XEXP (x, 0), code);
16839 putc ('+', file);
16840 output_pic_addr_const (file, XEXP (x, 1), code);
16841 }
16842 else
16843 {
16844 gcc_assert (CONST_INT_P (XEXP (x, 1)));
16845 output_pic_addr_const (file, XEXP (x, 1), code);
16846 putc ('+', file);
16847 output_pic_addr_const (file, XEXP (x, 0), code);
16848 }
16849 break;
16850
16851 case MINUS:
16852 if (!TARGET_MACHO)
16853 putc (ASSEMBLER_DIALECT == ASM_INTEL ? '(' : '[', file);
16854 output_pic_addr_const (file, XEXP (x, 0), code);
16855 putc ('-', file);
16856 output_pic_addr_const (file, XEXP (x, 1), code);
16857 if (!TARGET_MACHO)
16858 putc (ASSEMBLER_DIALECT == ASM_INTEL ? ')' : ']', file);
16859 break;
16860
16861 case UNSPEC:
16862 if (XINT (x, 1) == UNSPEC_STACK_CHECK)
16863 {
16864 bool f = i386_asm_output_addr_const_extra (file, x);
16865 gcc_assert (f);
16866 break;
16867 }
16868
16869 gcc_assert (XVECLEN (x, 0) == 1);
16870 output_pic_addr_const (file, XVECEXP (x, 0, 0), code);
16871 switch (XINT (x, 1))
16872 {
16873 case UNSPEC_GOT:
16874 fputs ("@GOT", file);
16875 break;
16876 case UNSPEC_GOTOFF:
16877 fputs ("@GOTOFF", file);
16878 break;
16879 case UNSPEC_PLTOFF:
16880 fputs ("@PLTOFF", file);
16881 break;
16882 case UNSPEC_PCREL:
16883 fputs (ASSEMBLER_DIALECT == ASM_ATT ?
16884 "(%rip)" : "[rip]", file);
16885 break;
16886 case UNSPEC_GOTPCREL:
16887 fputs (ASSEMBLER_DIALECT == ASM_ATT ?
16888 "@GOTPCREL(%rip)" : "@GOTPCREL[rip]", file);
16889 break;
16890 case UNSPEC_GOTTPOFF:
16891 /* FIXME: This might be @TPOFF in Sun ld too. */
16892 fputs ("@gottpoff", file);
16893 break;
16894 case UNSPEC_TPOFF:
16895 fputs ("@tpoff", file);
16896 break;
16897 case UNSPEC_NTPOFF:
16898 if (TARGET_64BIT)
16899 fputs ("@tpoff", file);
16900 else
16901 fputs ("@ntpoff", file);
16902 break;
16903 case UNSPEC_DTPOFF:
16904 fputs ("@dtpoff", file);
16905 break;
16906 case UNSPEC_GOTNTPOFF:
16907 if (TARGET_64BIT)
16908 fputs (ASSEMBLER_DIALECT == ASM_ATT ?
16909 "@gottpoff(%rip)": "@gottpoff[rip]", file);
16910 else
16911 fputs ("@gotntpoff", file);
16912 break;
16913 case UNSPEC_INDNTPOFF:
16914 fputs ("@indntpoff", file);
16915 break;
16916 #if TARGET_MACHO
16917 case UNSPEC_MACHOPIC_OFFSET:
16918 putc ('-', file);
16919 machopic_output_function_base_name (file);
16920 break;
16921 #endif
16922 default:
16923 output_operand_lossage ("invalid UNSPEC as operand");
16924 break;
16925 }
16926 break;
16927
16928 default:
16929 output_operand_lossage ("invalid expression as operand");
16930 }
16931 }
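/* Illustrative outputs: a GOTOFF unspec prints as "foo@GOTOFF", a 64-bit
   GOTPCREL unspec prints as "foo@GOTPCREL(%rip)" in AT&T syntax (or
   "foo@GOTPCREL[rip]" in Intel syntax), and the 'P' operand code appends
   "@PLT" to non-local symbols.  */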
16932
16933 /* This is called from dwarf2out.c via TARGET_ASM_OUTPUT_DWARF_DTPREL.
16934 We need to emit DTP-relative relocations. */
16935
16936 static void ATTRIBUTE_UNUSED
16937 i386_output_dwarf_dtprel (FILE *file, int size, rtx x)
16938 {
16939 fputs (ASM_LONG, file);
16940 output_addr_const (file, x);
16941 fputs ("@dtpoff", file);
16942 switch (size)
16943 {
16944 case 4:
16945 break;
16946 case 8:
16947 fputs (", 0", file);
16948 break;
16949 default:
16950 gcc_unreachable ();
16951 }
16952 }
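/* Illustrative output (assuming ASM_LONG is "\t.long\t"): for size 4 this
   emits roughly ".long x@dtpoff", and for size 8 it emits
   ".long x@dtpoff, 0" so that the value occupies eight bytes.  */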
16953
16954 /* Return true if X is a representation of the PIC register. This copes
16955 with calls from ix86_find_base_term, where the register might have
16956 been replaced by a cselib value. */
16957
16958 static bool
16959 ix86_pic_register_p (rtx x)
16960 {
16961 if (GET_CODE (x) == VALUE && CSELIB_VAL_PTR (x))
16962 return (pic_offset_table_rtx
16963 && rtx_equal_for_cselib_p (x, pic_offset_table_rtx));
16964 else if (!REG_P (x))
16965 return false;
16966 else if (pic_offset_table_rtx)
16967 {
16968 if (REGNO (x) == REGNO (pic_offset_table_rtx))
16969 return true;
16970 if (HARD_REGISTER_P (x)
16971 && !HARD_REGISTER_P (pic_offset_table_rtx)
16972 && ORIGINAL_REGNO (x) == REGNO (pic_offset_table_rtx))
16973 return true;
16974 return false;
16975 }
16976 else
16977 return REGNO (x) == PIC_OFFSET_TABLE_REGNUM;
16978 }
16979
16980 /* Helper function for ix86_delegitimize_address.
16981 Attempt to delegitimize TLS local-exec accesses. */
16982
16983 static rtx
16984 ix86_delegitimize_tls_address (rtx orig_x)
16985 {
16986 rtx x = orig_x, unspec;
16987 struct ix86_address addr;
16988
16989 if (!TARGET_TLS_DIRECT_SEG_REFS)
16990 return orig_x;
16991 if (MEM_P (x))
16992 x = XEXP (x, 0);
16993 if (GET_CODE (x) != PLUS || GET_MODE (x) != Pmode)
16994 return orig_x;
16995 if (ix86_decompose_address (x, &addr) == 0
16996 || addr.seg != DEFAULT_TLS_SEG_REG
16997 || addr.disp == NULL_RTX
16998 || GET_CODE (addr.disp) != CONST)
16999 return orig_x;
17000 unspec = XEXP (addr.disp, 0);
17001 if (GET_CODE (unspec) == PLUS && CONST_INT_P (XEXP (unspec, 1)))
17002 unspec = XEXP (unspec, 0);
17003 if (GET_CODE (unspec) != UNSPEC || XINT (unspec, 1) != UNSPEC_NTPOFF)
17004 return orig_x;
17005 x = XVECEXP (unspec, 0, 0);
17006 gcc_assert (GET_CODE (x) == SYMBOL_REF);
17007 if (unspec != XEXP (addr.disp, 0))
17008 x = gen_rtx_PLUS (Pmode, x, XEXP (XEXP (addr.disp, 0), 1));
17009 if (addr.index)
17010 {
17011 rtx idx = addr.index;
17012 if (addr.scale != 1)
17013 idx = gen_rtx_MULT (Pmode, idx, GEN_INT (addr.scale));
17014 x = gen_rtx_PLUS (Pmode, idx, x);
17015 }
17016 if (addr.base)
17017 x = gen_rtx_PLUS (Pmode, addr.base, x);
17018 if (MEM_P (orig_x))
17019 x = replace_equiv_address_nv (orig_x, x);
17020 return x;
17021 }
17022
17023 /* In the name of slightly smaller debug output, and to cater to
17024 general assembler lossage, recognize PIC+GOTOFF and turn it back
17025 into a direct symbol reference.
17026
17027 On Darwin, this is necessary to avoid a crash, because Darwin
17028 has a different PIC label for each routine but the DWARF debugging
17029 information is not associated with any particular routine, so it's
17030 necessary to remove references to the PIC label from RTL stored by
17031 the DWARF output code. */
17032
17033 static rtx
17034 ix86_delegitimize_address (rtx x)
17035 {
17036 rtx orig_x = delegitimize_mem_from_attrs (x);
17037 /* addend is NULL or some rtx if x is something+GOTOFF where
17038 something doesn't include the PIC register. */
17039 rtx addend = NULL_RTX;
17040 /* reg_addend is NULL or a multiple of some register. */
17041 rtx reg_addend = NULL_RTX;
17042 /* const_addend is NULL or a const_int. */
17043 rtx const_addend = NULL_RTX;
17044 /* This is the result, or NULL. */
17045 rtx result = NULL_RTX;
17046
17047 x = orig_x;
17048
17049 if (MEM_P (x))
17050 x = XEXP (x, 0);
17051
17052 if (TARGET_64BIT)
17053 {
17054 if (GET_CODE (x) == CONST
17055 && GET_CODE (XEXP (x, 0)) == PLUS
17056 && GET_MODE (XEXP (x, 0)) == Pmode
17057 && CONST_INT_P (XEXP (XEXP (x, 0), 1))
17058 && GET_CODE (XEXP (XEXP (x, 0), 0)) == UNSPEC
17059 && XINT (XEXP (XEXP (x, 0), 0), 1) == UNSPEC_PCREL)
17060 {
17061 rtx x2 = XVECEXP (XEXP (XEXP (x, 0), 0), 0, 0);
17062 x = gen_rtx_PLUS (Pmode, XEXP (XEXP (x, 0), 1), x2);
17063 if (MEM_P (orig_x))
17064 x = replace_equiv_address_nv (orig_x, x);
17065 return x;
17066 }
17067
17068 if (GET_CODE (x) == CONST
17069 && GET_CODE (XEXP (x, 0)) == UNSPEC
17070 && (XINT (XEXP (x, 0), 1) == UNSPEC_GOTPCREL
17071 || XINT (XEXP (x, 0), 1) == UNSPEC_PCREL)
17072 && (MEM_P (orig_x) || XINT (XEXP (x, 0), 1) == UNSPEC_PCREL))
17073 {
17074 x = XVECEXP (XEXP (x, 0), 0, 0);
17075 if (GET_MODE (orig_x) != GET_MODE (x) && MEM_P (orig_x))
17076 {
17077 x = lowpart_subreg (GET_MODE (orig_x), x, GET_MODE (x));
17078 if (x == NULL_RTX)
17079 return orig_x;
17080 }
17081 return x;
17082 }
17083
17084 if (ix86_cmodel != CM_MEDIUM_PIC && ix86_cmodel != CM_LARGE_PIC)
17085 return ix86_delegitimize_tls_address (orig_x);
17086
17087 /* Fall thru into the code shared with -m32 for -mcmodel=large -fpic
17088 and -mcmodel=medium -fpic. */
17089 }
17090
17091 if (GET_CODE (x) != PLUS
17092 || GET_CODE (XEXP (x, 1)) != CONST)
17093 return ix86_delegitimize_tls_address (orig_x);
17094
17095 if (ix86_pic_register_p (XEXP (x, 0)))
17096 /* %ebx + GOT/GOTOFF */
17097 ;
17098 else if (GET_CODE (XEXP (x, 0)) == PLUS)
17099 {
17100 /* %ebx + %reg * scale + GOT/GOTOFF */
17101 reg_addend = XEXP (x, 0);
17102 if (ix86_pic_register_p (XEXP (reg_addend, 0)))
17103 reg_addend = XEXP (reg_addend, 1);
17104 else if (ix86_pic_register_p (XEXP (reg_addend, 1)))
17105 reg_addend = XEXP (reg_addend, 0);
17106 else
17107 {
17108 reg_addend = NULL_RTX;
17109 addend = XEXP (x, 0);
17110 }
17111 }
17112 else
17113 addend = XEXP (x, 0);
17114
17115 x = XEXP (XEXP (x, 1), 0);
17116 if (GET_CODE (x) == PLUS
17117 && CONST_INT_P (XEXP (x, 1)))
17118 {
17119 const_addend = XEXP (x, 1);
17120 x = XEXP (x, 0);
17121 }
17122
17123 if (GET_CODE (x) == UNSPEC
17124 && ((XINT (x, 1) == UNSPEC_GOT && MEM_P (orig_x) && !addend)
17125 || (XINT (x, 1) == UNSPEC_GOTOFF && !MEM_P (orig_x))
17126 || (XINT (x, 1) == UNSPEC_PLTOFF && ix86_cmodel == CM_LARGE_PIC
17127 && !MEM_P (orig_x) && !addend)))
17128 result = XVECEXP (x, 0, 0);
17129
17130 if (!TARGET_64BIT && TARGET_MACHO && darwin_local_data_pic (x)
17131 && !MEM_P (orig_x))
17132 result = XVECEXP (x, 0, 0);
17133
17134 if (! result)
17135 return ix86_delegitimize_tls_address (orig_x);
17136
17137 if (const_addend)
17138 result = gen_rtx_CONST (Pmode, gen_rtx_PLUS (Pmode, result, const_addend));
17139 if (reg_addend)
17140 result = gen_rtx_PLUS (Pmode, reg_addend, result);
17141 if (addend)
17142 {
17143 /* If the rest of original X doesn't involve the PIC register, add
17144 addend and subtract pic_offset_table_rtx. This can happen e.g.
17145 for code like:
17146 leal (%ebx, %ecx, 4), %ecx
17147 ...
17148 movl foo@GOTOFF(%ecx), %edx
17149 in which case we return (%ecx - %ebx) + foo
17150 or (%ecx - _GLOBAL_OFFSET_TABLE_) + foo if pseudo_pic_reg
17151 and reload has completed. */
17152 if (pic_offset_table_rtx
17153 && (!reload_completed || !ix86_use_pseudo_pic_reg ()))
17154 result = gen_rtx_PLUS (Pmode, gen_rtx_MINUS (Pmode, copy_rtx (addend),
17155 pic_offset_table_rtx),
17156 result);
17157 else if (pic_offset_table_rtx && !TARGET_MACHO && !TARGET_VXWORKS_RTP)
17158 {
17159 rtx tmp = gen_rtx_SYMBOL_REF (Pmode, GOT_SYMBOL_NAME);
17160 tmp = gen_rtx_MINUS (Pmode, copy_rtx (addend), tmp);
17161 result = gen_rtx_PLUS (Pmode, tmp, result);
17162 }
17163 else
17164 return orig_x;
17165 }
17166 if (GET_MODE (orig_x) != Pmode && MEM_P (orig_x))
17167 {
17168 result = lowpart_subreg (GET_MODE (orig_x), result, Pmode);
17169 if (result == NULL_RTX)
17170 return orig_x;
17171 }
17172 return result;
17173 }
17174
17175 /* If X is a machine specific address (i.e. a symbol or label being
17176 referenced as a displacement from the GOT implemented using an
17177 UNSPEC), then return the base term. Otherwise return X. */
17178
17179 rtx
17180 ix86_find_base_term (rtx x)
17181 {
17182 rtx term;
17183
17184 if (TARGET_64BIT)
17185 {
17186 if (GET_CODE (x) != CONST)
17187 return x;
17188 term = XEXP (x, 0);
17189 if (GET_CODE (term) == PLUS
17190 && CONST_INT_P (XEXP (term, 1)))
17191 term = XEXP (term, 0);
17192 if (GET_CODE (term) != UNSPEC
17193 || (XINT (term, 1) != UNSPEC_GOTPCREL
17194 && XINT (term, 1) != UNSPEC_PCREL))
17195 return x;
17196
17197 return XVECEXP (term, 0, 0);
17198 }
17199
17200 return ix86_delegitimize_address (x);
17201 }
17202 \f
17203 static void
17204 put_condition_code (enum rtx_code code, machine_mode mode, bool reverse,
17205 bool fp, FILE *file)
17206 {
17207 const char *suffix;
17208
17209 if (mode == CCFPmode || mode == CCFPUmode)
17210 {
17211 code = ix86_fp_compare_code_to_integer (code);
17212 mode = CCmode;
17213 }
17214 if (reverse)
17215 code = reverse_condition (code);
17216
17217 switch (code)
17218 {
17219 case EQ:
17220 switch (mode)
17221 {
17222 case CCAmode:
17223 suffix = "a";
17224 break;
17225 case CCCmode:
17226 suffix = "c";
17227 break;
17228 case CCOmode:
17229 suffix = "o";
17230 break;
17231 case CCPmode:
17232 suffix = "p";
17233 break;
17234 case CCSmode:
17235 suffix = "s";
17236 break;
17237 default:
17238 suffix = "e";
17239 break;
17240 }
17241 break;
17242 case NE:
17243 switch (mode)
17244 {
17245 case CCAmode:
17246 suffix = "na";
17247 break;
17248 case CCCmode:
17249 suffix = "nc";
17250 break;
17251 case CCOmode:
17252 suffix = "no";
17253 break;
17254 case CCPmode:
17255 suffix = "np";
17256 break;
17257 case CCSmode:
17258 suffix = "ns";
17259 break;
17260 default:
17261 suffix = "ne";
17262 break;
17263 }
17264 break;
17265 case GT:
17266 gcc_assert (mode == CCmode || mode == CCNOmode || mode == CCGCmode);
17267 suffix = "g";
17268 break;
17269 case GTU:
17270 /* ??? Use "nbe" instead of "a" for fcmov lossage on some assemblers.
17271 Those same assemblers have the same but opposite lossage on cmov. */
17272 if (mode == CCmode)
17273 suffix = fp ? "nbe" : "a";
17274 else
17275 gcc_unreachable ();
17276 break;
17277 case LT:
17278 switch (mode)
17279 {
17280 case CCNOmode:
17281 case CCGOCmode:
17282 suffix = "s";
17283 break;
17284
17285 case CCmode:
17286 case CCGCmode:
17287 suffix = "l";
17288 break;
17289
17290 default:
17291 gcc_unreachable ();
17292 }
17293 break;
17294 case LTU:
17295 if (mode == CCmode)
17296 suffix = "b";
17297 else if (mode == CCCmode)
17298 suffix = fp ? "b" : "c";
17299 else
17300 gcc_unreachable ();
17301 break;
17302 case GE:
17303 switch (mode)
17304 {
17305 case CCNOmode:
17306 case CCGOCmode:
17307 suffix = "ns";
17308 break;
17309
17310 case CCmode:
17311 case CCGCmode:
17312 suffix = "ge";
17313 break;
17314
17315 default:
17316 gcc_unreachable ();
17317 }
17318 break;
17319 case GEU:
17320 if (mode == CCmode)
17321 suffix = "nb";
17322 else if (mode == CCCmode)
17323 suffix = fp ? "nb" : "nc";
17324 else
17325 gcc_unreachable ();
17326 break;
17327 case LE:
17328 gcc_assert (mode == CCmode || mode == CCGCmode || mode == CCNOmode);
17329 suffix = "le";
17330 break;
17331 case LEU:
17332 if (mode == CCmode)
17333 suffix = "be";
17334 else
17335 gcc_unreachable ();
17336 break;
17337 case UNORDERED:
17338 suffix = fp ? "u" : "p";
17339 break;
17340 case ORDERED:
17341 suffix = fp ? "nu" : "np";
17342 break;
17343 default:
17344 gcc_unreachable ();
17345 }
17346 fputs (suffix, file);
17347 }
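/* For example, an EQ comparison in the default CCmode prints "e" (as in
   sete/je), LTU in CCmode prints "b" and GEU prints "nb"; with REVERSE set,
   EQ is first turned into NE and prints "ne" instead.  */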
17348
17349 /* Print the name of register X to FILE based on its machine mode and number.
17350 If CODE is 'w', pretend the mode is HImode.
17351 If CODE is 'b', pretend the mode is QImode.
17352 If CODE is 'k', pretend the mode is SImode.
17353 If CODE is 'q', pretend the mode is DImode.
17354 If CODE is 'x', pretend the mode is V4SFmode.
17355 If CODE is 't', pretend the mode is V8SFmode.
17356 If CODE is 'g', pretend the mode is V16SFmode.
17357 If CODE is 'h', pretend the reg is the 'high' byte register.
17358 If CODE is 'y', print "st(0)" instead of "st", if the reg is stack op.
17359 If CODE is 'd', duplicate the operand for AVX instruction.
17360 */
17361
17362 void
17363 print_reg (rtx x, int code, FILE *file)
17364 {
17365 const char *reg;
17366 int msize;
17367 unsigned int regno;
17368 bool duplicated;
17369
17370 if (ASSEMBLER_DIALECT == ASM_ATT)
17371 putc ('%', file);
17372
17373 if (x == pc_rtx)
17374 {
17375 gcc_assert (TARGET_64BIT);
17376 fputs ("rip", file);
17377 return;
17378 }
17379
17380 if (code == 'y' && STACK_TOP_P (x))
17381 {
17382 fputs ("st(0)", file);
17383 return;
17384 }
17385
17386 if (code == 'w')
17387 msize = 2;
17388 else if (code == 'b')
17389 msize = 1;
17390 else if (code == 'k')
17391 msize = 4;
17392 else if (code == 'q')
17393 msize = 8;
17394 else if (code == 'h')
17395 msize = 0;
17396 else if (code == 'x')
17397 msize = 16;
17398 else if (code == 't')
17399 msize = 32;
17400 else if (code == 'g')
17401 msize = 64;
17402 else
17403 msize = GET_MODE_SIZE (GET_MODE (x));
17404
17405 regno = true_regnum (x);
17406
17407 gcc_assert (regno != ARG_POINTER_REGNUM
17408 && regno != FRAME_POINTER_REGNUM
17409 && regno != FPSR_REG
17410 && regno != FPCR_REG);
17411
17412 if (regno == FLAGS_REG)
17413 {
17414 output_operand_lossage ("invalid use of asm flag output");
17415 return;
17416 }
17417
17418 duplicated = code == 'd' && TARGET_AVX;
17419
17420 switch (msize)
17421 {
17422 case 8:
17423 case 4:
17424 if (LEGACY_INT_REGNO_P (regno))
17425 putc (msize == 8 && TARGET_64BIT ? 'r' : 'e', file);
17426 /* FALLTHRU */
17427 case 16:
17428 case 12:
17429 case 2:
17430 normal:
17431 reg = hi_reg_name[regno];
17432 break;
17433 case 1:
17434 if (regno >= ARRAY_SIZE (qi_reg_name))
17435 goto normal;
17436 reg = qi_reg_name[regno];
17437 break;
17438 case 0:
17439 if (regno >= ARRAY_SIZE (qi_high_reg_name))
17440 goto normal;
17441 reg = qi_high_reg_name[regno];
17442 break;
17443 case 32:
17444 case 64:
17445 if (SSE_REGNO_P (regno))
17446 {
17447 gcc_assert (!duplicated);
17448 putc (msize == 32 ? 'y' : 'z', file);
17449 reg = hi_reg_name[regno] + 1;
17450 break;
17451 }
17452 goto normal;
17453 default:
17454 gcc_unreachable ();
17455 }
17456
17457 fputs (reg, file);
17458
17459 /* Irritatingly, AMD extended registers use a
17460 different naming convention: "r%d[bwd]". */
17461 if (REX_INT_REGNO_P (regno))
17462 {
17463 gcc_assert (TARGET_64BIT);
17464 switch (msize)
17465 {
17466 case 0:
17467 error ("extended registers have no high halves");
17468 break;
17469 case 1:
17470 putc ('b', file);
17471 break;
17472 case 2:
17473 putc ('w', file);
17474 break;
17475 case 4:
17476 putc ('d', file);
17477 break;
17478 case 8:
17479 /* no suffix */
17480 break;
17481 default:
17482 error ("unsupported operand size for extended register");
17483 break;
17484 }
17485 return;
17486 }
17487
17488 if (duplicated)
17489 {
17490 if (ASSEMBLER_DIALECT == ASM_ATT)
17491 fprintf (file, ", %%%s", reg);
17492 else
17493 fprintf (file, ", %s", reg);
17494 }
17495 }
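/* For example, with REGNO naming the legacy %ax register, code 'b' prints
   "al", 'w' prints "ax", 'k' prints "eax" and, on a 64-bit target, 'q'
   prints "rax" (each with a leading '%' in AT&T syntax); code 'h' selects
   the high byte register "ah".  */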
17496
17497 /* Meaning of CODE:
17498 L,W,B,Q,S,T -- print the opcode suffix for specified size of operand.
17499 C -- print opcode suffix for set/cmov insn.
17500 c -- like C, but print reversed condition
17501 F,f -- likewise, but for floating-point.
17502 O -- if HAVE_AS_IX86_CMOV_SUN_SYNTAX, expand to "w.", "l." or "q.",
17503 otherwise nothing
17504 R -- print embedded rounding and sae.
17505 r -- print only sae.
17506 z -- print the opcode suffix for the size of the current operand.
17507 Z -- likewise, with special suffixes for x87 instructions.
17508 * -- print a star (in certain assembler syntax)
17509 A -- print an absolute memory reference.
17510 E -- print address with DImode register names if TARGET_64BIT.
17511 w -- print the operand as if it's a "word" (HImode) even if it isn't.
17512 s -- print a shift double count, followed by the assembler's argument
17513 delimiter.
17514 b -- print the QImode name of the register for the indicated operand.
17515 %b0 would print %al if operands[0] is reg 0.
17516 w -- likewise, print the HImode name of the register.
17517 k -- likewise, print the SImode name of the register.
17518 q -- likewise, print the DImode name of the register.
17519 x -- likewise, print the V4SFmode name of the register.
17520 t -- likewise, print the V8SFmode name of the register.
17521 g -- likewise, print the V16SFmode name of the register.
17522 h -- print the QImode name for a "high" register, either ah, bh, ch or dh.
17523 y -- print "st(0)" instead of "st" as a register.
17524 d -- print duplicated register operand for AVX instruction.
17525 D -- print condition for SSE cmp instruction.
17526 P -- if PIC, print an @PLT suffix.
17527 p -- print raw symbol name.
17528 X -- don't print any sort of PIC '@' suffix for a symbol.
17529 & -- print some in-use local-dynamic symbol name.
17530 H -- print a memory address offset by 8; used for sse high-parts
17531 Y -- print condition for XOP pcom* instruction.
17532 + -- print a branch hint as 'cs' or 'ds' prefix
17533 ; -- print a semicolon (after prefixes due to bug in older gas).
17534 ~ -- print "i" if TARGET_AVX2, "f" otherwise.
17535 @ -- print a segment register of thread base pointer load
17536 ^ -- print addr32 prefix if TARGET_64BIT and Pmode != word_mode
17537 ! -- print MPX prefix for jxx/call/ret instructions if required.
17538 */
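/* For example, in a template such as "mov%z0\t{%1, %0|%0, %1}" the 'z'
   code expands to the b/w/l/q suffix matching the size of operand 0 in
   AT&T syntax (an SImode destination yields "movl") and expands to
   nothing in the Intel dialect.  */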
17539
17540 void
17541 ix86_print_operand (FILE *file, rtx x, int code)
17542 {
17543 if (code)
17544 {
17545 switch (code)
17546 {
17547 case 'A':
17548 switch (ASSEMBLER_DIALECT)
17549 {
17550 case ASM_ATT:
17551 putc ('*', file);
17552 break;
17553
17554 case ASM_INTEL:
17555 /* Intel syntax. For absolute addresses, registers should not
17556 be surrounded by brackets. */
17557 if (!REG_P (x))
17558 {
17559 putc ('[', file);
17560 ix86_print_operand (file, x, 0);
17561 putc (']', file);
17562 return;
17563 }
17564 break;
17565
17566 default:
17567 gcc_unreachable ();
17568 }
17569
17570 ix86_print_operand (file, x, 0);
17571 return;
17572
17573 case 'E':
17574 /* Wrap address in an UNSPEC to declare special handling. */
17575 if (TARGET_64BIT)
17576 x = gen_rtx_UNSPEC (DImode, gen_rtvec (1, x), UNSPEC_LEA_ADDR);
17577
17578 output_address (VOIDmode, x);
17579 return;
17580
17581 case 'L':
17582 if (ASSEMBLER_DIALECT == ASM_ATT)
17583 putc ('l', file);
17584 return;
17585
17586 case 'W':
17587 if (ASSEMBLER_DIALECT == ASM_ATT)
17588 putc ('w', file);
17589 return;
17590
17591 case 'B':
17592 if (ASSEMBLER_DIALECT == ASM_ATT)
17593 putc ('b', file);
17594 return;
17595
17596 case 'Q':
17597 if (ASSEMBLER_DIALECT == ASM_ATT)
17598 putc ('l', file);
17599 return;
17600
17601 case 'S':
17602 if (ASSEMBLER_DIALECT == ASM_ATT)
17603 putc ('s', file);
17604 return;
17605
17606 case 'T':
17607 if (ASSEMBLER_DIALECT == ASM_ATT)
17608 putc ('t', file);
17609 return;
17610
17611 case 'O':
17612 #ifdef HAVE_AS_IX86_CMOV_SUN_SYNTAX
17613 if (ASSEMBLER_DIALECT != ASM_ATT)
17614 return;
17615
17616 switch (GET_MODE_SIZE (GET_MODE (x)))
17617 {
17618 case 2:
17619 putc ('w', file);
17620 break;
17621
17622 case 4:
17623 putc ('l', file);
17624 break;
17625
17626 case 8:
17627 putc ('q', file);
17628 break;
17629
17630 default:
17631 output_operand_lossage
17632 ("invalid operand size for operand code 'O'");
17633 return;
17634 }
17635
17636 putc ('.', file);
17637 #endif
17638 return;
17639
17640 case 'z':
17641 if (GET_MODE_CLASS (GET_MODE (x)) == MODE_INT)
17642 {
17643 /* Opcodes don't get size suffixes if using Intel opcodes. */
17644 if (ASSEMBLER_DIALECT == ASM_INTEL)
17645 return;
17646
17647 switch (GET_MODE_SIZE (GET_MODE (x)))
17648 {
17649 case 1:
17650 putc ('b', file);
17651 return;
17652
17653 case 2:
17654 putc ('w', file);
17655 return;
17656
17657 case 4:
17658 putc ('l', file);
17659 return;
17660
17661 case 8:
17662 putc ('q', file);
17663 return;
17664
17665 default:
17666 output_operand_lossage
17667 ("invalid operand size for operand code 'z'");
17668 return;
17669 }
17670 }
17671
17672 if (GET_MODE_CLASS (GET_MODE (x)) == MODE_FLOAT)
17673 warning
17674 (0, "non-integer operand used with operand code 'z'");
17675 /* FALLTHRU */
17676
17677 case 'Z':
17678 /* 387 opcodes don't get size suffixes if using Intel opcodes. */
17679 if (ASSEMBLER_DIALECT == ASM_INTEL)
17680 return;
17681
17682 if (GET_MODE_CLASS (GET_MODE (x)) == MODE_INT)
17683 {
17684 switch (GET_MODE_SIZE (GET_MODE (x)))
17685 {
17686 case 2:
17687 #ifdef HAVE_AS_IX86_FILDS
17688 putc ('s', file);
17689 #endif
17690 return;
17691
17692 case 4:
17693 putc ('l', file);
17694 return;
17695
17696 case 8:
17697 #ifdef HAVE_AS_IX86_FILDQ
17698 putc ('q', file);
17699 #else
17700 fputs ("ll", file);
17701 #endif
17702 return;
17703
17704 default:
17705 break;
17706 }
17707 }
17708 else if (GET_MODE_CLASS (GET_MODE (x)) == MODE_FLOAT)
17709 {
17710 /* 387 opcodes don't get size suffixes
17711 if the operands are registers. */
17712 if (STACK_REG_P (x))
17713 return;
17714
17715 switch (GET_MODE_SIZE (GET_MODE (x)))
17716 {
17717 case 4:
17718 putc ('s', file);
17719 return;
17720
17721 case 8:
17722 putc ('l', file);
17723 return;
17724
17725 case 12:
17726 case 16:
17727 putc ('t', file);
17728 return;
17729
17730 default:
17731 break;
17732 }
17733 }
17734 else
17735 {
17736 output_operand_lossage
17737 ("invalid operand type used with operand code 'Z'");
17738 return;
17739 }
17740
17741 output_operand_lossage
17742 ("invalid operand size for operand code 'Z'");
17743 return;
17744
17745 case 'd':
17746 case 'b':
17747 case 'w':
17748 case 'k':
17749 case 'q':
17750 case 'h':
17751 case 't':
17752 case 'g':
17753 case 'y':
17754 case 'x':
17755 case 'X':
17756 case 'P':
17757 case 'p':
17758 break;
17759
17760 case 's':
17761 if (CONST_INT_P (x) || ! SHIFT_DOUBLE_OMITS_COUNT)
17762 {
17763 ix86_print_operand (file, x, 0);
17764 fputs (", ", file);
17765 }
17766 return;
17767
17768 case 'Y':
17769 switch (GET_CODE (x))
17770 {
17771 case NE:
17772 fputs ("neq", file);
17773 break;
17774 case EQ:
17775 fputs ("eq", file);
17776 break;
17777 case GE:
17778 case GEU:
17779 fputs (INTEGRAL_MODE_P (GET_MODE (x)) ? "ge" : "unlt", file);
17780 break;
17781 case GT:
17782 case GTU:
17783 fputs (INTEGRAL_MODE_P (GET_MODE (x)) ? "gt" : "unle", file);
17784 break;
17785 case LE:
17786 case LEU:
17787 fputs ("le", file);
17788 break;
17789 case LT:
17790 case LTU:
17791 fputs ("lt", file);
17792 break;
17793 case UNORDERED:
17794 fputs ("unord", file);
17795 break;
17796 case ORDERED:
17797 fputs ("ord", file);
17798 break;
17799 case UNEQ:
17800 fputs ("ueq", file);
17801 break;
17802 case UNGE:
17803 fputs ("nlt", file);
17804 break;
17805 case UNGT:
17806 fputs ("nle", file);
17807 break;
17808 case UNLE:
17809 fputs ("ule", file);
17810 break;
17811 case UNLT:
17812 fputs ("ult", file);
17813 break;
17814 case LTGT:
17815 fputs ("une", file);
17816 break;
17817 default:
17818 output_operand_lossage ("operand is not a condition code, "
17819 "invalid operand code 'Y'");
17820 return;
17821 }
17822 return;
17823
17824 case 'D':
17825 /* A little bit of brain damage here. The SSE compare instructions
17826 use completely different names for the comparisons than the
17827 fp conditional moves do. */
17828 switch (GET_CODE (x))
17829 {
17830 case UNEQ:
17831 if (TARGET_AVX)
17832 {
17833 fputs ("eq_us", file);
17834 break;
17835 }
17836 /* FALLTHRU */
17837 case EQ:
17838 fputs ("eq", file);
17839 break;
17840 case UNLT:
17841 if (TARGET_AVX)
17842 {
17843 fputs ("nge", file);
17844 break;
17845 }
17846 /* FALLTHRU */
17847 case LT:
17848 fputs ("lt", file);
17849 break;
17850 case UNLE:
17851 if (TARGET_AVX)
17852 {
17853 fputs ("ngt", file);
17854 break;
17855 }
17856 /* FALLTHRU */
17857 case LE:
17858 fputs ("le", file);
17859 break;
17860 case UNORDERED:
17861 fputs ("unord", file);
17862 break;
17863 case LTGT:
17864 if (TARGET_AVX)
17865 {
17866 fputs ("neq_oq", file);
17867 break;
17868 }
17869 /* FALLTHRU */
17870 case NE:
17871 fputs ("neq", file);
17872 break;
17873 case GE:
17874 if (TARGET_AVX)
17875 {
17876 fputs ("ge", file);
17877 break;
17878 }
17879 /* FALLTHRU */
17880 case UNGE:
17881 fputs ("nlt", file);
17882 break;
17883 case GT:
17884 if (TARGET_AVX)
17885 {
17886 fputs ("gt", file);
17887 break;
17888 }
17889 /* FALLTHRU */
17890 case UNGT:
17891 fputs ("nle", file);
17892 break;
17893 case ORDERED:
17894 fputs ("ord", file);
17895 break;
17896 default:
17897 output_operand_lossage ("operand is not a condition code, "
17898 "invalid operand code 'D'");
17899 return;
17900 }
17901 return;
17902
17903 case 'F':
17904 case 'f':
17905 #ifdef HAVE_AS_IX86_CMOV_SUN_SYNTAX
17906 if (ASSEMBLER_DIALECT == ASM_ATT)
17907 putc ('.', file);
17908 gcc_fallthrough ();
17909 #endif
17910
17911 case 'C':
17912 case 'c':
17913 if (!COMPARISON_P (x))
17914 {
17915 output_operand_lossage ("operand is not a condition code, "
17916 "invalid operand code '%c'", code);
17917 return;
17918 }
17919 put_condition_code (GET_CODE (x), GET_MODE (XEXP (x, 0)),
17920 code == 'c' || code == 'f',
17921 code == 'F' || code == 'f',
17922 file);
17923 return;
17924
17925 case 'H':
17926 if (!offsettable_memref_p (x))
17927 {
17928 output_operand_lossage ("operand is not an offsettable memory "
17929 "reference, invalid operand code 'H'");
17930 return;
17931 }
17932 /* It doesn't actually matter what mode we use here, as we're
17933 only going to use this for printing. */
17934 x = adjust_address_nv (x, DImode, 8);
17935 /* Output 'qword ptr' for intel assembler dialect. */
17936 if (ASSEMBLER_DIALECT == ASM_INTEL)
17937 code = 'q';
17938 break;
17939
17940 case 'K':
17941 gcc_assert (CONST_INT_P (x));
17942
17943 if (INTVAL (x) & IX86_HLE_ACQUIRE)
17944 #ifdef HAVE_AS_IX86_HLE
17945 fputs ("xacquire ", file);
17946 #else
17947 fputs ("\n" ASM_BYTE "0xf2\n\t", file);
17948 #endif
17949 else if (INTVAL (x) & IX86_HLE_RELEASE)
17950 #ifdef HAVE_AS_IX86_HLE
17951 fputs ("xrelease ", file);
17952 #else
17953 fputs ("\n" ASM_BYTE "0xf3\n\t", file);
17954 #endif
17955 /* We do not want to print value of the operand. */
17956 return;
17957
17958 case 'N':
17959 if (x == const0_rtx || x == CONST0_RTX (GET_MODE (x)))
17960 fputs ("{z}", file);
17961 return;
17962
17963 case 'r':
17964 gcc_assert (CONST_INT_P (x));
17965 gcc_assert (INTVAL (x) == ROUND_SAE);
17966
17967 if (ASSEMBLER_DIALECT == ASM_INTEL)
17968 fputs (", ", file);
17969
17970 fputs ("{sae}", file);
17971
17972 if (ASSEMBLER_DIALECT == ASM_ATT)
17973 fputs (", ", file);
17974
17975 return;
17976
17977 case 'R':
17978 gcc_assert (CONST_INT_P (x));
17979
17980 if (ASSEMBLER_DIALECT == ASM_INTEL)
17981 fputs (", ", file);
17982
17983 switch (INTVAL (x))
17984 {
17985 case ROUND_NEAREST_INT | ROUND_SAE:
17986 fputs ("{rn-sae}", file);
17987 break;
17988 case ROUND_NEG_INF | ROUND_SAE:
17989 fputs ("{rd-sae}", file);
17990 break;
17991 case ROUND_POS_INF | ROUND_SAE:
17992 fputs ("{ru-sae}", file);
17993 break;
17994 case ROUND_ZERO | ROUND_SAE:
17995 fputs ("{rz-sae}", file);
17996 break;
17997 default:
17998 gcc_unreachable ();
17999 }
18000
18001 if (ASSEMBLER_DIALECT == ASM_ATT)
18002 fputs (", ", file);
18003
18004 return;
18005
18006 case '*':
18007 if (ASSEMBLER_DIALECT == ASM_ATT)
18008 putc ('*', file);
18009 return;
18010
18011 case '&':
18012 {
18013 const char *name = get_some_local_dynamic_name ();
18014 if (name == NULL)
18015 output_operand_lossage ("'%%&' used without any "
18016 "local dynamic TLS references");
18017 else
18018 assemble_name (file, name);
18019 return;
18020 }
18021
18022 case '+':
18023 {
18024 rtx x;
18025
18026 if (!optimize
18027 || optimize_function_for_size_p (cfun)
18028 || !TARGET_BRANCH_PREDICTION_HINTS)
18029 return;
18030
18031 x = find_reg_note (current_output_insn, REG_BR_PROB, 0);
18032 if (x)
18033 {
18034 int pred_val = XINT (x, 0);
18035
18036 if (pred_val < REG_BR_PROB_BASE * 45 / 100
18037 || pred_val > REG_BR_PROB_BASE * 55 / 100)
18038 {
18039 bool taken = pred_val > REG_BR_PROB_BASE / 2;
18040 bool cputaken
18041 = final_forward_branch_p (current_output_insn) == 0;
18042
18043 /* Emit hints only when the default branch prediction
18044 heuristics would fail. */
18045 if (taken != cputaken)
18046 {
18047 /* We use 3e (DS) prefix for taken branches and
18048 2e (CS) prefix for not taken branches. */
18049 if (taken)
18050 fputs ("ds ; ", file);
18051 else
18052 fputs ("cs ; ", file);
18053 }
18054 }
18055 }
18056 return;
18057 }
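/* The net effect for '+' is that a conditional branch statically
   predicted against the direction the CPU would guess gets a segment
   override prefix, e.g. "ds ; jne .L3" for a predicted-taken forward
   branch.  */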
18058
18059 case ';':
18060 #ifndef HAVE_AS_IX86_REP_LOCK_PREFIX
18061 putc (';', file);
18062 #endif
18063 return;
18064
18065 case '@':
18066 if (ASSEMBLER_DIALECT == ASM_ATT)
18067 putc ('%', file);
18068
18069 /* The kernel uses a different segment register for performance
18070 reasons; a system call would not have to trash the userspace
18071 segment register, which would be expensive. */
18072 if (TARGET_64BIT && ix86_cmodel != CM_KERNEL)
18073 fputs ("fs", file);
18074 else
18075 fputs ("gs", file);
18076 return;
18077
18078 case '~':
18079 putc (TARGET_AVX2 ? 'i' : 'f', file);
18080 return;
18081
18082 case '^':
18083 if (TARGET_64BIT && Pmode != word_mode)
18084 fputs ("addr32 ", file);
18085 return;
18086
18087 case '!':
18088 if (ix86_bnd_prefixed_insn_p (current_output_insn))
18089 fputs ("bnd ", file);
18090 return;
18091
18092 default:
18093 output_operand_lossage ("invalid operand code '%c'", code);
18094 }
18095 }
18096
18097 if (REG_P (x))
18098 print_reg (x, code, file);
18099
18100 else if (MEM_P (x))
18101 {
18102 rtx addr = XEXP (x, 0);
18103
18104 /* No `byte ptr' prefix for call instructions ... */
18105 if (ASSEMBLER_DIALECT == ASM_INTEL && code != 'X' && code != 'P')
18106 {
18107 machine_mode mode = GET_MODE (x);
18108 const char *size;
18109
18110 /* Check for explicit size override codes. */
18111 if (code == 'b')
18112 size = "BYTE";
18113 else if (code == 'w')
18114 size = "WORD";
18115 else if (code == 'k')
18116 size = "DWORD";
18117 else if (code == 'q')
18118 size = "QWORD";
18119 else if (code == 'x')
18120 size = "XMMWORD";
18121 else if (code == 't')
18122 size = "YMMWORD";
18123 else if (code == 'g')
18124 size = "ZMMWORD";
18125 else if (mode == BLKmode)
18126 /* ... or BLKmode operands, when not overridden. */
18127 size = NULL;
18128 else
18129 switch (GET_MODE_SIZE (mode))
18130 {
18131 case 1: size = "BYTE"; break;
18132 case 2: size = "WORD"; break;
18133 case 4: size = "DWORD"; break;
18134 case 8: size = "QWORD"; break;
18135 case 12: size = "TBYTE"; break;
18136 case 16:
18137 if (mode == XFmode)
18138 size = "TBYTE";
18139 else
18140 size = "XMMWORD";
18141 break;
18142 case 32: size = "YMMWORD"; break;
18143 case 64: size = "ZMMWORD"; break;
18144 default:
18145 gcc_unreachable ();
18146 }
18147 if (size)
18148 {
18149 fputs (size, file);
18150 fputs (" PTR ", file);
18151 }
18152 }
18153
18154 if (this_is_asm_operands && ! address_operand (addr, VOIDmode))
18155 output_operand_lossage ("invalid constraints for operand");
18156 else
18157 ix86_print_operand_address_as
18158 (file, addr, MEM_ADDR_SPACE (x), code == 'p' || code == 'P');
18159 }
18160
18161 else if (CONST_DOUBLE_P (x) && GET_MODE (x) == SFmode)
18162 {
18163 long l;
18164
18165 REAL_VALUE_TO_TARGET_SINGLE (*CONST_DOUBLE_REAL_VALUE (x), l);
18166
18167 if (ASSEMBLER_DIALECT == ASM_ATT)
18168 putc ('$', file);
18169 /* Sign extend 32bit SFmode immediate to 8 bytes. */
18170 if (code == 'q')
18171 fprintf (file, "0x%08" HOST_LONG_LONG_FORMAT "x",
18172 (unsigned long long) (int) l);
18173 else
18174 fprintf (file, "0x%08x", (unsigned int) l);
18175 }
18176
18177 else if (CONST_DOUBLE_P (x) && GET_MODE (x) == DFmode)
18178 {
18179 long l[2];
18180
18181 REAL_VALUE_TO_TARGET_DOUBLE (*CONST_DOUBLE_REAL_VALUE (x), l);
18182
18183 if (ASSEMBLER_DIALECT == ASM_ATT)
18184 putc ('$', file);
18185 fprintf (file, "0x%lx%08lx", l[1] & 0xffffffff, l[0] & 0xffffffff);
18186 }
18187
18188 /* These float cases don't actually occur as immediate operands. */
18189 else if (CONST_DOUBLE_P (x) && GET_MODE (x) == XFmode)
18190 {
18191 char dstr[30];
18192
18193 real_to_decimal (dstr, CONST_DOUBLE_REAL_VALUE (x), sizeof (dstr), 0, 1);
18194 fputs (dstr, file);
18195 }
18196
18197 else
18198 {
18199 /* We have patterns that allow zero sets of memory, for instance.
18200 In 64-bit mode, we should probably support all 8-byte vectors,
18201 since we can in fact encode that into an immediate. */
18202 if (GET_CODE (x) == CONST_VECTOR)
18203 {
18204 gcc_assert (x == CONST0_RTX (GET_MODE (x)));
18205 x = const0_rtx;
18206 }
18207
18208 if (code != 'P' && code != 'p')
18209 {
18210 if (CONST_INT_P (x))
18211 {
18212 if (ASSEMBLER_DIALECT == ASM_ATT)
18213 putc ('$', file);
18214 }
18215 else if (GET_CODE (x) == CONST || GET_CODE (x) == SYMBOL_REF
18216 || GET_CODE (x) == LABEL_REF)
18217 {
18218 if (ASSEMBLER_DIALECT == ASM_ATT)
18219 putc ('$', file);
18220 else
18221 fputs ("OFFSET FLAT:", file);
18222 }
18223 }
18224 if (CONST_INT_P (x))
18225 fprintf (file, HOST_WIDE_INT_PRINT_DEC, INTVAL (x));
18226 else if (flag_pic || MACHOPIC_INDIRECT)
18227 output_pic_addr_const (file, x, code);
18228 else
18229 output_addr_const (file, x);
18230 }
18231 }
18232
18233 static bool
18234 ix86_print_operand_punct_valid_p (unsigned char code)
18235 {
18236 return (code == '@' || code == '*' || code == '+' || code == '&'
18237 || code == ';' || code == '~' || code == '^' || code == '!');
18238 }
18239 \f
18240 /* Print a memory operand whose address is ADDR. */
18241
18242 static void
18243 ix86_print_operand_address_as (FILE *file, rtx addr,
18244 addr_space_t as, bool no_rip)
18245 {
18246 struct ix86_address parts;
18247 rtx base, index, disp;
18248 int scale;
18249 int ok;
18250 bool vsib = false;
18251 int code = 0;
18252
18253 if (GET_CODE (addr) == UNSPEC && XINT (addr, 1) == UNSPEC_VSIBADDR)
18254 {
18255 ok = ix86_decompose_address (XVECEXP (addr, 0, 0), &parts);
18256 gcc_assert (parts.index == NULL_RTX);
18257 parts.index = XVECEXP (addr, 0, 1);
18258 parts.scale = INTVAL (XVECEXP (addr, 0, 2));
18259 addr = XVECEXP (addr, 0, 0);
18260 vsib = true;
18261 }
18262 else if (GET_CODE (addr) == UNSPEC && XINT (addr, 1) == UNSPEC_LEA_ADDR)
18263 {
18264 gcc_assert (TARGET_64BIT);
18265 ok = ix86_decompose_address (XVECEXP (addr, 0, 0), &parts);
18266 code = 'q';
18267 }
18268 else if (GET_CODE (addr) == UNSPEC && XINT (addr, 1) == UNSPEC_BNDMK_ADDR)
18269 {
18270 ok = ix86_decompose_address (XVECEXP (addr, 0, 1), &parts);
18271 gcc_assert (parts.base == NULL_RTX || parts.index == NULL_RTX);
18272 if (parts.base != NULL_RTX)
18273 {
18274 parts.index = parts.base;
18275 parts.scale = 1;
18276 }
18277 parts.base = XVECEXP (addr, 0, 0);
18278 addr = XVECEXP (addr, 0, 0);
18279 }
18280 else if (GET_CODE (addr) == UNSPEC && XINT (addr, 1) == UNSPEC_BNDLDX_ADDR)
18281 {
18282 ok = ix86_decompose_address (XVECEXP (addr, 0, 0), &parts);
18283 gcc_assert (parts.index == NULL_RTX);
18284 parts.index = XVECEXP (addr, 0, 1);
18285 addr = XVECEXP (addr, 0, 0);
18286 }
18287 else
18288 ok = ix86_decompose_address (addr, &parts);
18289
18290 gcc_assert (ok);
18291
18292 base = parts.base;
18293 index = parts.index;
18294 disp = parts.disp;
18295 scale = parts.scale;
18296
18297 if (ADDR_SPACE_GENERIC_P (as))
18298 as = parts.seg;
18299 else
18300 gcc_assert (ADDR_SPACE_GENERIC_P (parts.seg));
18301
18302 if (!ADDR_SPACE_GENERIC_P (as))
18303 {
18304 const char *string;
18305
18306 if (as == ADDR_SPACE_SEG_FS)
18307 string = (ASSEMBLER_DIALECT == ASM_ATT ? "%fs:" : "fs:");
18308 else if (as == ADDR_SPACE_SEG_GS)
18309 string = (ASSEMBLER_DIALECT == ASM_ATT ? "%gs:" : "gs:");
18310 else
18311 gcc_unreachable ();
18312 fputs (string, file);
18313 }
18314
18315 /* Use one byte shorter RIP relative addressing for 64bit mode. */
18316 if (TARGET_64BIT && !base && !index && !no_rip)
18317 {
18318 rtx symbol = disp;
18319
18320 if (GET_CODE (disp) == CONST
18321 && GET_CODE (XEXP (disp, 0)) == PLUS
18322 && CONST_INT_P (XEXP (XEXP (disp, 0), 1)))
18323 symbol = XEXP (XEXP (disp, 0), 0);
18324
18325 if (GET_CODE (symbol) == LABEL_REF
18326 || (GET_CODE (symbol) == SYMBOL_REF
18327 && SYMBOL_REF_TLS_MODEL (symbol) == 0))
18328 base = pc_rtx;
18329 }
18330
18331 if (!base && !index)
18332 {
18333 /* Displacement only requires special attention. */
18334 if (CONST_INT_P (disp))
18335 {
18336 if (ASSEMBLER_DIALECT == ASM_INTEL && parts.seg == ADDR_SPACE_GENERIC)
18337 fputs ("ds:", file);
18338 fprintf (file, HOST_WIDE_INT_PRINT_DEC, INTVAL (disp));
18339 }
18340 /* Load the external function address via the GOT slot to avoid PLT. */
18341 else if (GET_CODE (disp) == CONST
18342 && GET_CODE (XEXP (disp, 0)) == UNSPEC
18343 && (XINT (XEXP (disp, 0), 1) == UNSPEC_GOTPCREL
18344 || XINT (XEXP (disp, 0), 1) == UNSPEC_GOT)
18345 && ix86_force_load_from_GOT_p (XVECEXP (XEXP (disp, 0), 0, 0)))
18346 output_pic_addr_const (file, disp, 0);
18347 else if (flag_pic)
18348 output_pic_addr_const (file, disp, 0);
18349 else
18350 output_addr_const (file, disp);
18351 }
18352 else
18353 {
18354 /* Print SImode register names to force addr32 prefix. */
18355 if (SImode_address_operand (addr, VOIDmode))
18356 {
18357 if (flag_checking)
18358 {
18359 gcc_assert (TARGET_64BIT);
18360 switch (GET_CODE (addr))
18361 {
18362 case SUBREG:
18363 gcc_assert (GET_MODE (addr) == SImode);
18364 gcc_assert (GET_MODE (SUBREG_REG (addr)) == DImode);
18365 break;
18366 case ZERO_EXTEND:
18367 case AND:
18368 gcc_assert (GET_MODE (addr) == DImode);
18369 break;
18370 default:
18371 gcc_unreachable ();
18372 }
18373 }
18374 gcc_assert (!code);
18375 code = 'k';
18376 }
18377 else if (code == 0
18378 && TARGET_X32
18379 && disp
18380 && CONST_INT_P (disp)
18381 && INTVAL (disp) < -16*1024*1024)
18382 {
18383 /* X32 runs in 64-bit mode, where the displacement, DISP, in the
18384 address DISP(%r64) is encoded as a 32-bit immediate sign-
18385 extended from 32 to 64 bits. For -0x40000300(%r64),
18386 address is %r64 + 0xffffffffbffffd00. When %r64 <
18387 0x40000300, like 0x37ffe064, address is 0xfffffffff7ffdd64,
18388 which is invalid for x32. The correct address is %r64
18389 - 0x40000300 == 0xf7ffdd64. To properly encode
18390 -0x40000300(%r64) for x32, we zero-extend negative
18391 displacement by forcing addr32 prefix which truncates
18392 0xfffffffff7ffdd64 to 0xf7ffdd64. In theory, we should
18393 zero-extend all negative displacements, including -1(%rsp).
18394 However, for small negative displacements, sign-extension
18395 won't cause overflow. We only zero-extend negative
18396 displacements if they are less than -16*1024*1024, which is also
18397 the bound used to check legitimate address displacements for PIC. */
18398 code = 'k';
18399 }
18400
18401 if (ASSEMBLER_DIALECT == ASM_ATT)
18402 {
18403 if (disp)
18404 {
18405 if (flag_pic)
18406 output_pic_addr_const (file, disp, 0);
18407 else if (GET_CODE (disp) == LABEL_REF)
18408 output_asm_label (disp);
18409 else
18410 output_addr_const (file, disp);
18411 }
18412
18413 putc ('(', file);
18414 if (base)
18415 print_reg (base, code, file);
18416 if (index)
18417 {
18418 putc (',', file);
18419 print_reg (index, vsib ? 0 : code, file);
18420 if (scale != 1 || vsib)
18421 fprintf (file, ",%d", scale);
18422 }
18423 putc (')', file);
18424 }
18425 else
18426 {
18427 rtx offset = NULL_RTX;
18428
18429 if (disp)
18430 {
18431 /* Pull out the offset of a symbol; print any symbol itself. */
18432 if (GET_CODE (disp) == CONST
18433 && GET_CODE (XEXP (disp, 0)) == PLUS
18434 && CONST_INT_P (XEXP (XEXP (disp, 0), 1)))
18435 {
18436 offset = XEXP (XEXP (disp, 0), 1);
18437 disp = gen_rtx_CONST (VOIDmode,
18438 XEXP (XEXP (disp, 0), 0));
18439 }
18440
18441 if (flag_pic)
18442 output_pic_addr_const (file, disp, 0);
18443 else if (GET_CODE (disp) == LABEL_REF)
18444 output_asm_label (disp);
18445 else if (CONST_INT_P (disp))
18446 offset = disp;
18447 else
18448 output_addr_const (file, disp);
18449 }
18450
18451 putc ('[', file);
18452 if (base)
18453 {
18454 print_reg (base, code, file);
18455 if (offset)
18456 {
18457 if (INTVAL (offset) >= 0)
18458 putc ('+', file);
18459 fprintf (file, HOST_WIDE_INT_PRINT_DEC, INTVAL (offset));
18460 }
18461 }
18462 else if (offset)
18463 fprintf (file, HOST_WIDE_INT_PRINT_DEC, INTVAL (offset));
18464 else
18465 putc ('0', file);
18466
18467 if (index)
18468 {
18469 putc ('+', file);
18470 print_reg (index, vsib ? 0 : code, file);
18471 if (scale != 1 || vsib)
18472 fprintf (file, "*%d", scale);
18473 }
18474 putc (']', file);
18475 }
18476 }
18477 }
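/* For example, an address with base %eax, index %ebx, scale 4 and
   displacement 16 is printed as "16(%eax,%ebx,4)" in AT&T syntax and as
   "[eax+16+ebx*4]" in Intel syntax by the code above.  */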
18478
18479 static void
18480 ix86_print_operand_address (FILE *file, machine_mode /*mode*/, rtx addr)
18481 {
18482 ix86_print_operand_address_as (file, addr, ADDR_SPACE_GENERIC, false);
18483 }
18484
18485 /* Implementation of TARGET_ASM_OUTPUT_ADDR_CONST_EXTRA. */
18486
18487 static bool
18488 i386_asm_output_addr_const_extra (FILE *file, rtx x)
18489 {
18490 rtx op;
18491
18492 if (GET_CODE (x) != UNSPEC)
18493 return false;
18494
18495 op = XVECEXP (x, 0, 0);
18496 switch (XINT (x, 1))
18497 {
18498 case UNSPEC_GOTTPOFF:
18499 output_addr_const (file, op);
18500 /* FIXME: This might be @TPOFF in Sun ld. */
18501 fputs ("@gottpoff", file);
18502 break;
18503 case UNSPEC_TPOFF:
18504 output_addr_const (file, op);
18505 fputs ("@tpoff", file);
18506 break;
18507 case UNSPEC_NTPOFF:
18508 output_addr_const (file, op);
18509 if (TARGET_64BIT)
18510 fputs ("@tpoff", file);
18511 else
18512 fputs ("@ntpoff", file);
18513 break;
18514 case UNSPEC_DTPOFF:
18515 output_addr_const (file, op);
18516 fputs ("@dtpoff", file);
18517 break;
18518 case UNSPEC_GOTNTPOFF:
18519 output_addr_const (file, op);
18520 if (TARGET_64BIT)
18521 fputs (ASSEMBLER_DIALECT == ASM_ATT ?
18522 "@gottpoff(%rip)" : "@gottpoff[rip]", file);
18523 else
18524 fputs ("@gotntpoff", file);
18525 break;
18526 case UNSPEC_INDNTPOFF:
18527 output_addr_const (file, op);
18528 fputs ("@indntpoff", file);
18529 break;
18530 #if TARGET_MACHO
18531 case UNSPEC_MACHOPIC_OFFSET:
18532 output_addr_const (file, op);
18533 putc ('-', file);
18534 machopic_output_function_base_name (file);
18535 break;
18536 #endif
18537
18538 case UNSPEC_STACK_CHECK:
18539 {
18540 int offset;
18541
18542 gcc_assert (flag_split_stack);
18543
18544 #ifdef TARGET_THREAD_SPLIT_STACK_OFFSET
18545 offset = TARGET_THREAD_SPLIT_STACK_OFFSET;
18546 #else
18547 gcc_unreachable ();
18548 #endif
18549
18550 fprintf (file, "%s:%d", TARGET_64BIT ? "%fs" : "%gs", offset);
18551 }
18552 break;
18553
18554 default:
18555 return false;
18556 }
18557
18558 return true;
18559 }
18560 \f
18561 /* Split one or more double-mode RTL references into pairs of half-mode
18562 references. The RTL can be REG, offsettable MEM, integer constant, or
18563 CONST_DOUBLE. "operands" is a pointer to an array of double-mode RTLs to
18564 split and "num" is its length. lo_half and hi_half are output arrays
18565 that parallel "operands". */
18566
18567 void
18568 split_double_mode (machine_mode mode, rtx operands[],
18569 int num, rtx lo_half[], rtx hi_half[])
18570 {
18571 machine_mode half_mode;
18572 unsigned int byte;
18573
18574 switch (mode)
18575 {
18576 case TImode:
18577 half_mode = DImode;
18578 break;
18579 case DImode:
18580 half_mode = SImode;
18581 break;
18582 default:
18583 gcc_unreachable ();
18584 }
18585
18586 byte = GET_MODE_SIZE (half_mode);
18587
18588 while (num--)
18589 {
18590 rtx op = operands[num];
18591
18592 /* simplify_subreg refuses to split volatile memory addresses,
18593 but we still have to handle them. */
18594 if (MEM_P (op))
18595 {
18596 lo_half[num] = adjust_address (op, half_mode, 0);
18597 hi_half[num] = adjust_address (op, half_mode, byte);
18598 }
18599 else
18600 {
18601 lo_half[num] = simplify_gen_subreg (half_mode, op,
18602 GET_MODE (op) == VOIDmode
18603 ? mode : GET_MODE (op), 0);
18604 hi_half[num] = simplify_gen_subreg (half_mode, op,
18605 GET_MODE (op) == VOIDmode
18606 ? mode : GET_MODE (op), byte);
18607 }
18608 }
18609 }
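/* For example, splitting a DImode memory operand (mem:DI addr) in 32-bit
   mode yields (mem:SI addr) as the low half and (mem:SI addr+4) as the
   high half, while non-MEM operands are split with simplify_gen_subreg.  */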
18610 \f
18611 /* Output code to perform a 387 binary operation in INSN, one of PLUS,
18612 MINUS, MULT or DIV. OPERANDS are the insn operands, where operands[3]
18613 is the expression of the binary operation. The output may either be
18614 emitted here, or returned to the caller, like all output_* functions.
18615
18616 There is no guarantee that the operands are the same mode, as they
18617 might be within FLOAT or FLOAT_EXTEND expressions. */
18618
18619 #ifndef SYSV386_COMPAT
18620 /* Set to 1 for compatibility with brain-damaged assemblers. No-one
18621 wants to fix the assemblers because that causes incompatibility
18622 with gcc. No-one wants to fix gcc because that causes
18623 incompatibility with assemblers... You can use the option of
18624 -DSYSV386_COMPAT=0 if you recompile both gcc and gas this way. */
18625 #define SYSV386_COMPAT 1
18626 #endif
18627
18628 const char *
18629 output_387_binary_op (rtx insn, rtx *operands)
18630 {
18631 static char buf[40];
18632 const char *p;
18633 const char *ssep;
18634 int is_sse = SSE_REG_P (operands[0]) || SSE_REG_P (operands[1]) || SSE_REG_P (operands[2]);
18635
18636 /* Even if we do not want to check the inputs, this documents input
18637 constraints, which helps in understanding the following code. */
18638 if (flag_checking)
18639 {
18640 if (STACK_REG_P (operands[0])
18641 && ((REG_P (operands[1])
18642 && REGNO (operands[0]) == REGNO (operands[1])
18643 && (STACK_REG_P (operands[2]) || MEM_P (operands[2])))
18644 || (REG_P (operands[2])
18645 && REGNO (operands[0]) == REGNO (operands[2])
18646 && (STACK_REG_P (operands[1]) || MEM_P (operands[1]))))
18647 && (STACK_TOP_P (operands[1]) || STACK_TOP_P (operands[2])))
18648 ; /* ok */
18649 else
18650 gcc_assert (is_sse);
18651 }
18652
18653 switch (GET_CODE (operands[3]))
18654 {
18655 case PLUS:
18656 if (GET_MODE_CLASS (GET_MODE (operands[1])) == MODE_INT
18657 || GET_MODE_CLASS (GET_MODE (operands[2])) == MODE_INT)
18658 p = "fiadd";
18659 else
18660 p = "fadd";
18661 ssep = "vadd";
18662 break;
18663
18664 case MINUS:
18665 if (GET_MODE_CLASS (GET_MODE (operands[1])) == MODE_INT
18666 || GET_MODE_CLASS (GET_MODE (operands[2])) == MODE_INT)
18667 p = "fisub";
18668 else
18669 p = "fsub";
18670 ssep = "vsub";
18671 break;
18672
18673 case MULT:
18674 if (GET_MODE_CLASS (GET_MODE (operands[1])) == MODE_INT
18675 || GET_MODE_CLASS (GET_MODE (operands[2])) == MODE_INT)
18676 p = "fimul";
18677 else
18678 p = "fmul";
18679 ssep = "vmul";
18680 break;
18681
18682 case DIV:
18683 if (GET_MODE_CLASS (GET_MODE (operands[1])) == MODE_INT
18684 || GET_MODE_CLASS (GET_MODE (operands[2])) == MODE_INT)
18685 p = "fidiv";
18686 else
18687 p = "fdiv";
18688 ssep = "vdiv";
18689 break;
18690
18691 default:
18692 gcc_unreachable ();
18693 }
18694
18695 if (is_sse)
18696 {
18697 if (TARGET_AVX)
18698 {
18699 strcpy (buf, ssep);
18700 if (GET_MODE (operands[0]) == SFmode)
18701 strcat (buf, "ss\t{%2, %1, %0|%0, %1, %2}");
18702 else
18703 strcat (buf, "sd\t{%2, %1, %0|%0, %1, %2}");
18704 }
18705 else
18706 {
18707 strcpy (buf, ssep + 1);
18708 if (GET_MODE (operands[0]) == SFmode)
18709 strcat (buf, "ss\t{%2, %0|%0, %2}");
18710 else
18711 strcat (buf, "sd\t{%2, %0|%0, %2}");
18712 }
18713 return buf;
18714 }
18715 strcpy (buf, p);
18716
18717 switch (GET_CODE (operands[3]))
18718 {
18719 case MULT:
18720 case PLUS:
18721 if (REG_P (operands[2]) && REGNO (operands[0]) == REGNO (operands[2]))
18722 std::swap (operands[1], operands[2]);
18723
18724 /* We know operands[0] == operands[1] here. */
18725
18726 if (MEM_P (operands[2]))
18727 {
18728 p = "%Z2\t%2";
18729 break;
18730 }
18731
18732 if (find_regno_note (insn, REG_DEAD, REGNO (operands[2])))
18733 {
18734 if (STACK_TOP_P (operands[0]))
18735 /* How is it that we are storing to a dead operand[2]?
18736 Well, presumably operands[1] is dead too. We can't
18737 store the result to st(0) as st(0) gets popped on this
18738 instruction. Instead store to operands[2] (which I
18739 think has to be st(1)). st(1) will be popped later.
18740 gcc <= 2.8.1 didn't have this check and generated
18741 assembly code that the Unixware assembler rejected. */
18742 p = "p\t{%0, %2|%2, %0}"; /* st(1) = st(0) op st(1); pop */
18743 else
18744 p = "p\t{%2, %0|%0, %2}"; /* st(r1) = st(r1) op st(0); pop */
18745 break;
18746 }
18747
18748 if (STACK_TOP_P (operands[0]))
18749 p = "\t{%y2, %0|%0, %y2}"; /* st(0) = st(0) op st(r2) */
18750 else
18751 p = "\t{%2, %0|%0, %2}"; /* st(r1) = st(r1) op st(0) */
18752 break;
18753
18754 case MINUS:
18755 case DIV:
18756 if (MEM_P (operands[1]))
18757 {
18758 p = "r%Z1\t%1";
18759 break;
18760 }
18761
18762 if (MEM_P (operands[2]))
18763 {
18764 p = "%Z2\t%2";
18765 break;
18766 }
18767
18768 if (find_regno_note (insn, REG_DEAD, REGNO (operands[2])))
18769 {
18770 #if SYSV386_COMPAT
18771 /* The SystemV/386 SVR3.2 assembler, and probably all AT&T
18772 derived assemblers, confusingly reverse the direction of
18773 the operation for fsub{r} and fdiv{r} when the
18774 destination register is not st(0). The Intel assembler
18775 doesn't have this brain damage. Read !SYSV386_COMPAT to
18776 figure out what the hardware really does. */
18777 if (STACK_TOP_P (operands[0]))
18778 p = "{p\t%0, %2|rp\t%2, %0}";
18779 else
18780 p = "{rp\t%2, %0|p\t%0, %2}";
18781 #else
18782 if (STACK_TOP_P (operands[0]))
18783 /* As above for fmul/fadd, we can't store to st(0). */
18784 p = "rp\t{%0, %2|%2, %0}"; /* st(1) = st(0) op st(1); pop */
18785 else
18786 p = "p\t{%2, %0|%0, %2}"; /* st(r1) = st(r1) op st(0); pop */
18787 #endif
18788 break;
18789 }
18790
18791 if (find_regno_note (insn, REG_DEAD, REGNO (operands[1])))
18792 {
18793 #if SYSV386_COMPAT
18794 if (STACK_TOP_P (operands[0]))
18795 p = "{rp\t%0, %1|p\t%1, %0}";
18796 else
18797 p = "{p\t%1, %0|rp\t%0, %1}";
18798 #else
18799 if (STACK_TOP_P (operands[0]))
18800 p = "p\t{%0, %1|%1, %0}"; /* st(1) = st(1) op st(0); pop */
18801 else
18802 p = "rp\t{%1, %0|%0, %1}"; /* st(r2) = st(0) op st(r2); pop */
18803 #endif
18804 break;
18805 }
18806
18807 if (STACK_TOP_P (operands[0]))
18808 {
18809 if (STACK_TOP_P (operands[1]))
18810 p = "\t{%y2, %0|%0, %y2}"; /* st(0) = st(0) op st(r2) */
18811 else
18812 p = "r\t{%y1, %0|%0, %y1}"; /* st(0) = st(r1) op st(0) */
18813 break;
18814 }
18815 else if (STACK_TOP_P (operands[1]))
18816 {
18817 #if SYSV386_COMPAT
18818 p = "{\t%1, %0|r\t%0, %1}";
18819 #else
18820 p = "r\t{%1, %0|%0, %1}"; /* st(r2) = st(0) op st(r2) */
18821 #endif
18822 }
18823 else
18824 {
18825 #if SYSV386_COMPAT
18826 p = "{r\t%2, %0|\t%0, %2}";
18827 #else
18828 p = "\t{%2, %0|%0, %2}"; /* st(r1) = st(r1) op st(0) */
18829 #endif
18830 }
18831 break;
18832
18833 default:
18834 gcc_unreachable ();
18835 }
18836
18837 strcat (buf, p);
18838 return buf;
18839 }
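/* As an example of the x87 path above: for a PLUS whose destination and
   first source are both st(0) and whose second source is another stack
   register that does not die, the returned template is
   "fadd\t{%y2, %0|%0, %y2}", i.e. st(0) = st(0) + st(r2).  */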
18840
18841 /* Return needed mode for entity in optimize_mode_switching pass. */
18842
18843 static int
18844 ix86_dirflag_mode_needed (rtx_insn *insn)
18845 {
18846 if (CALL_P (insn))
18847 {
18848 if (cfun->machine->func_type == TYPE_NORMAL)
18849 return X86_DIRFLAG_ANY;
18850 else
18851 /* No need to emit CLD in interrupt handler for TARGET_CLD. */
18852 return TARGET_CLD ? X86_DIRFLAG_ANY : X86_DIRFLAG_RESET;
18853 }
18854
18855 if (recog_memoized (insn) < 0)
18856 return X86_DIRFLAG_ANY;
18857
18858 if (get_attr_type (insn) == TYPE_STR)
18859 {
18860 /* Emit cld instruction if stringops are used in the function. */
18861 if (cfun->machine->func_type == TYPE_NORMAL)
18862 return TARGET_CLD ? X86_DIRFLAG_RESET : X86_DIRFLAG_ANY;
18863 else
18864 return X86_DIRFLAG_RESET;
18865 }
18866
18867 return X86_DIRFLAG_ANY;
18868 }
18869
18870 /* Check if a 256bit AVX register is referenced inside of EXP. */
18871
18872 static bool
18873 ix86_check_avx256_register (const_rtx exp)
18874 {
18875 if (SUBREG_P (exp))
18876 exp = SUBREG_REG (exp);
18877
18878 return (REG_P (exp)
18879 && VALID_AVX256_REG_OR_OI_MODE (GET_MODE (exp)));
18880 }
18881
18882 /* Return needed mode for entity in optimize_mode_switching pass. */
18883
18884 static int
18885 ix86_avx_u128_mode_needed (rtx_insn *insn)
18886 {
18887 if (CALL_P (insn))
18888 {
18889 rtx link;
18890
18891 /* Needed mode is set to AVX_U128_CLEAN if there are
18892 no 256bit modes used in function arguments. */
18893 for (link = CALL_INSN_FUNCTION_USAGE (insn);
18894 link;
18895 link = XEXP (link, 1))
18896 {
18897 if (GET_CODE (XEXP (link, 0)) == USE)
18898 {
18899 rtx arg = XEXP (XEXP (link, 0), 0);
18900
18901 if (ix86_check_avx256_register (arg))
18902 return AVX_U128_DIRTY;
18903 }
18904 }
18905
18906 return AVX_U128_CLEAN;
18907 }
18908
18909 /* Require DIRTY mode if a 256bit AVX register is referenced. Hardware
18910 changes state only when a 256bit register is written to, but we need
18911 to prevent the compiler from moving the optimal insertion point above
18912 an eventual read from a 256bit register. */
18913 subrtx_iterator::array_type array;
18914 FOR_EACH_SUBRTX (iter, array, PATTERN (insn), NONCONST)
18915 if (ix86_check_avx256_register (*iter))
18916 return AVX_U128_DIRTY;
18917
18918 return AVX_U128_ANY;
18919 }
18920
18921 /* Return mode that i387 must be switched into
18922 prior to the execution of insn. */
18923
18924 static int
18925 ix86_i387_mode_needed (int entity, rtx_insn *insn)
18926 {
18927 enum attr_i387_cw mode;
18928
18929 /* The mode UNINITIALIZED is used to store the control word after a
18930 function call or ASM pattern. The mode ANY specifies that the function
18931 has no requirements on the control word and makes no changes in the
18932 bits we are interested in. */
18933
18934 if (CALL_P (insn)
18935 || (NONJUMP_INSN_P (insn)
18936 && (asm_noperands (PATTERN (insn)) >= 0
18937 || GET_CODE (PATTERN (insn)) == ASM_INPUT)))
18938 return I387_CW_UNINITIALIZED;
18939
18940 if (recog_memoized (insn) < 0)
18941 return I387_CW_ANY;
18942
18943 mode = get_attr_i387_cw (insn);
18944
18945 switch (entity)
18946 {
18947 case I387_TRUNC:
18948 if (mode == I387_CW_TRUNC)
18949 return mode;
18950 break;
18951
18952 case I387_FLOOR:
18953 if (mode == I387_CW_FLOOR)
18954 return mode;
18955 break;
18956
18957 case I387_CEIL:
18958 if (mode == I387_CW_CEIL)
18959 return mode;
18960 break;
18961
18962 case I387_MASK_PM:
18963 if (mode == I387_CW_MASK_PM)
18964 return mode;
18965 break;
18966
18967 default:
18968 gcc_unreachable ();
18969 }
18970
18971 return I387_CW_ANY;
18972 }
18973
18974 /* Return mode that entity must be switched into
18975 prior to the execution of insn. */
18976
18977 static int
18978 ix86_mode_needed (int entity, rtx_insn *insn)
18979 {
18980 switch (entity)
18981 {
18982 case X86_DIRFLAG:
18983 return ix86_dirflag_mode_needed (insn);
18984 case AVX_U128:
18985 return ix86_avx_u128_mode_needed (insn);
18986 case I387_TRUNC:
18987 case I387_FLOOR:
18988 case I387_CEIL:
18989 case I387_MASK_PM:
18990 return ix86_i387_mode_needed (entity, insn);
18991 default:
18992 gcc_unreachable ();
18993 }
18994 return 0;
18995 }
18996
18997 /* Check if a 256bit AVX register is referenced in stores. */
18998
18999 static void
19000 ix86_check_avx256_stores (rtx dest, const_rtx, void *data)
19001 {
19002 if (ix86_check_avx256_register (dest))
19003 {
19004 bool *used = (bool *) data;
19005 *used = true;
19006 }
19007 }
19008
19009 /* Calculate mode of upper 128bit AVX registers after the insn. */
19010
19011 static int
19012 ix86_avx_u128_mode_after (int mode, rtx_insn *insn)
19013 {
19014 rtx pat = PATTERN (insn);
19015
19016 if (vzeroupper_operation (pat, VOIDmode)
19017 || vzeroall_operation (pat, VOIDmode))
19018 return AVX_U128_CLEAN;
19019
19020 /* We know that the state is clean after a CALL insn if no 256bit
19021 registers are used for the function return value. */
19022 if (CALL_P (insn))
19023 {
19024 bool avx_reg256_found = false;
19025 note_stores (pat, ix86_check_avx256_stores, &avx_reg256_found);
19026
19027 return avx_reg256_found ? AVX_U128_DIRTY : AVX_U128_CLEAN;
19028 }
19029
19030 /* Otherwise, return current mode. Remember that if insn
19031 references AVX 256bit registers, the mode was already changed
19032 to DIRTY from MODE_NEEDED. */
19033 return mode;
19034 }
19035
19036 /* Return the mode that an insn results in. */
19037
19038 static int
19039 ix86_mode_after (int entity, int mode, rtx_insn *insn)
19040 {
19041 switch (entity)
19042 {
19043 case X86_DIRFLAG:
19044 return mode;
19045 case AVX_U128:
19046 return ix86_avx_u128_mode_after (mode, insn);
19047 case I387_TRUNC:
19048 case I387_FLOOR:
19049 case I387_CEIL:
19050 case I387_MASK_PM:
19051 return mode;
19052 default:
19053 gcc_unreachable ();
19054 }
19055 }
19056
19057 static int
19058 ix86_dirflag_mode_entry (void)
19059 {
19060 /* For TARGET_CLD, or in an interrupt handler, we can't assume the
19061 direction flag state at function entry. */
19062 if (TARGET_CLD
19063 || cfun->machine->func_type != TYPE_NORMAL)
19064 return X86_DIRFLAG_ANY;
19065
19066 return X86_DIRFLAG_RESET;
19067 }
19068
19069 static int
19070 ix86_avx_u128_mode_entry (void)
19071 {
19072 tree arg;
19073
19074 /* Entry mode is set to AVX_U128_DIRTY if there are
19075 256bit modes used in function arguments. */
19076 for (arg = DECL_ARGUMENTS (current_function_decl); arg;
19077 arg = TREE_CHAIN (arg))
19078 {
19079 rtx incoming = DECL_INCOMING_RTL (arg);
19080
19081 if (incoming && ix86_check_avx256_register (incoming))
19082 return AVX_U128_DIRTY;
19083 }
19084
19085 return AVX_U128_CLEAN;
19086 }
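/* A minimal sketch of the entry-mode logic, assuming a hypothetical
   function that takes a __m256 argument: the argument's incoming RTL is
   a 256-bit SSE register, ix86_check_avx256_register matches it, and the
   function therefore starts in AVX_U128_DIRTY, so no clean upper state is
   assumed on entry.  */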
19087
19088 /* Return a mode that ENTITY is assumed to be
19089 switched to at function entry. */
19090
19091 static int
19092 ix86_mode_entry (int entity)
19093 {
19094 switch (entity)
19095 {
19096 case X86_DIRFLAG:
19097 return ix86_dirflag_mode_entry ();
19098 case AVX_U128:
19099 return ix86_avx_u128_mode_entry ();
19100 case I387_TRUNC:
19101 case I387_FLOOR:
19102 case I387_CEIL:
19103 case I387_MASK_PM:
19104 return I387_CW_ANY;
19105 default:
19106 gcc_unreachable ();
19107 }
19108 }
19109
19110 static int
19111 ix86_avx_u128_mode_exit (void)
19112 {
19113 rtx reg = crtl->return_rtx;
19114
19115 /* Exit mode is set to AVX_U128_DIRTY if there are
19116 256bit modes used in the function return register. */
19117 if (reg && ix86_check_avx256_register (reg))
19118 return AVX_U128_DIRTY;
19119
19120 return AVX_U128_CLEAN;
19121 }
19122
19123 /* Return a mode that ENTITY is assumed to be
19124 switched to at function exit. */
19125
19126 static int
19127 ix86_mode_exit (int entity)
19128 {
19129 switch (entity)
19130 {
19131 case X86_DIRFLAG:
19132 return X86_DIRFLAG_ANY;
19133 case AVX_U128:
19134 return ix86_avx_u128_mode_exit ();
19135 case I387_TRUNC:
19136 case I387_FLOOR:
19137 case I387_CEIL:
19138 case I387_MASK_PM:
19139 return I387_CW_ANY;
19140 default:
19141 gcc_unreachable ();
19142 }
19143 }
19144
19145 static int
19146 ix86_mode_priority (int, int n)
19147 {
19148 return n;
19149 }
19150
19151 /* Output code to initialize control word copies used by trunc?f?i and
19152 rounding patterns. CURRENT_MODE is set to current control word,
19153 while NEW_MODE is set to new control word. */
19154
19155 static void
19156 emit_i387_cw_initialization (int mode)
19157 {
19158 rtx stored_mode = assign_386_stack_local (HImode, SLOT_CW_STORED);
19159 rtx new_mode;
19160
19161 enum ix86_stack_slot slot;
19162
19163 rtx reg = gen_reg_rtx (HImode);
19164
19165 emit_insn (gen_x86_fnstcw_1 (stored_mode));
19166 emit_move_insn (reg, copy_rtx (stored_mode));
19167
19168 if (TARGET_64BIT || TARGET_PARTIAL_REG_STALL
19169 || optimize_insn_for_size_p ())
19170 {
19171 switch (mode)
19172 {
19173 case I387_CW_TRUNC:
19174 /* round toward zero (truncate) */
19175 emit_insn (gen_iorhi3 (reg, reg, GEN_INT (0x0c00)));
19176 slot = SLOT_CW_TRUNC;
19177 break;
19178
19179 case I387_CW_FLOOR:
19180 /* round down toward -oo */
19181 emit_insn (gen_andhi3 (reg, reg, GEN_INT (~0x0c00)));
19182 emit_insn (gen_iorhi3 (reg, reg, GEN_INT (0x0400)));
19183 slot = SLOT_CW_FLOOR;
19184 break;
19185
19186 case I387_CW_CEIL:
19187 /* round up toward +oo */
19188 emit_insn (gen_andhi3 (reg, reg, GEN_INT (~0x0c00)));
19189 emit_insn (gen_iorhi3 (reg, reg, GEN_INT (0x0800)));
19190 slot = SLOT_CW_CEIL;
19191 break;
19192
19193 case I387_CW_MASK_PM:
19194 /* mask precision exception for nearbyint() */
19195 emit_insn (gen_iorhi3 (reg, reg, GEN_INT (0x0020)));
19196 slot = SLOT_CW_MASK_PM;
19197 break;
19198
19199 default:
19200 gcc_unreachable ();
19201 }
19202 }
19203 else
19204 {
19205 switch (mode)
19206 {
19207 case I387_CW_TRUNC:
19208 /* round toward zero (truncate) */
19209 emit_insn (gen_insvsi_1 (reg, GEN_INT (0xc)));
19210 slot = SLOT_CW_TRUNC;
19211 break;
19212
19213 case I387_CW_FLOOR:
19214 /* round down toward -oo */
19215 emit_insn (gen_insvsi_1 (reg, GEN_INT (0x4)));
19216 slot = SLOT_CW_FLOOR;
19217 break;
19218
19219 case I387_CW_CEIL:
19220 /* round up toward +oo */
19221 emit_insn (gen_insvsi_1 (reg, GEN_INT (0x8)));
19222 slot = SLOT_CW_CEIL;
19223 break;
19224
19225 case I387_CW_MASK_PM:
19226 /* mask precision exception for nearbyint() */
19227 emit_insn (gen_iorhi3 (reg, reg, GEN_INT (0x0020)));
19228 slot = SLOT_CW_MASK_PM;
19229 break;
19230
19231 default:
19232 gcc_unreachable ();
19233 }
19234 }
19235
19236 gcc_assert (slot < MAX_386_STACK_LOCALS);
19237
19238 new_mode = assign_386_stack_local (HImode, slot);
19239 emit_move_insn (new_mode, reg);
19240 }
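/* The constants above target the x87 control word's rounding-control
   field (bits 10-11): 00 = round to nearest, 01 = round down, 10 = round
   up, 11 = truncate.  ORing in 0x0c00 therefore selects truncation, while
   the FLOOR and CEIL cases first clear the field and then set 0x0400 or
   0x0800 respectively; 0x0020 masks the precision exception.  */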
19241
19242 /* Emit vzeroupper. */
19243
19244 void
19245 ix86_avx_emit_vzeroupper (HARD_REG_SET regs_live)
19246 {
19247 int i;
19248
19249 /* Cancel automatic vzeroupper insertion if there are
19250 live call-saved SSE registers at the insertion point. */
19251
19252 for (i = FIRST_SSE_REG; i <= LAST_SSE_REG; i++)
19253 if (TEST_HARD_REG_BIT (regs_live, i) && !call_used_regs[i])
19254 return;
19255
19256 if (TARGET_64BIT)
19257 for (i = FIRST_REX_SSE_REG; i <= LAST_REX_SSE_REG; i++)
19258 if (TEST_HARD_REG_BIT (regs_live, i) && !call_used_regs[i])
19259 return;
19260
19261 emit_insn (gen_avx_vzeroupper ());
19262 }
19263
19266 /* Generate one or more insns to set ENTITY to MODE. HARD_REG_LIVE
19267 is the set of hard registers live at the point where the insn(s)
19268 are to be inserted. */
19269
19270 static void
19271 ix86_emit_mode_set (int entity, int mode, int prev_mode ATTRIBUTE_UNUSED,
19272 HARD_REG_SET regs_live)
19273 {
19274 switch (entity)
19275 {
19276 case X86_DIRFLAG:
19277 if (mode == X86_DIRFLAG_RESET)
19278 emit_insn (gen_cld ());
19279 break;
19280 case AVX_U128:
19281 if (mode == AVX_U128_CLEAN)
19282 ix86_avx_emit_vzeroupper (regs_live);
19283 break;
19284 case I387_TRUNC:
19285 case I387_FLOOR:
19286 case I387_CEIL:
19287 case I387_MASK_PM:
19288 if (mode != I387_CW_ANY
19289 && mode != I387_CW_UNINITIALIZED)
19290 emit_i387_cw_initialization (mode);
19291 break;
19292 default:
19293 gcc_unreachable ();
19294 }
19295 }
19296
19297 /* Output code for INSN to convert a float to a signed int. OPERANDS
19298 are the insn operands. The output may be [HSD]Imode and the input
19299 operand may be [SDX]Fmode. */
19300
19301 const char *
19302 output_fix_trunc (rtx_insn *insn, rtx *operands, bool fisttp)
19303 {
19304 int stack_top_dies = find_regno_note (insn, REG_DEAD, FIRST_STACK_REG) != 0;
19305 int dimode_p = GET_MODE (operands[0]) == DImode;
19306 int round_mode = get_attr_i387_cw (insn);
19307
19308 /* Jump through a hoop or two for DImode, since the hardware has no
19309 non-popping instruction. We used to do this a different way, but
19310 that was somewhat fragile and broke with post-reload splitters. */
19311 if ((dimode_p || fisttp) && !stack_top_dies)
19312 output_asm_insn ("fld\t%y1", operands);
19313
19314 gcc_assert (STACK_TOP_P (operands[1]));
19315 gcc_assert (MEM_P (operands[0]));
19316 gcc_assert (GET_MODE (operands[1]) != TFmode);
19317
19318 if (fisttp)
19319 output_asm_insn ("fisttp%Z0\t%0", operands);
19320 else
19321 {
19322 if (round_mode != I387_CW_ANY)
19323 output_asm_insn ("fldcw\t%3", operands);
19324 if (stack_top_dies || dimode_p)
19325 output_asm_insn ("fistp%Z0\t%0", operands);
19326 else
19327 output_asm_insn ("fist%Z0\t%0", operands);
19328 if (round_mode != I387_CW_ANY)
19329 output_asm_insn ("fldcw\t%2", operands);
19330 }
19331
19332 return "";
19333 }
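/* For example, converting an SFmode value on the x87 stack to SImode
   memory without fisttp and with a non-default rounding mode emits
   roughly "fldcw %3", then "fistpl %0" (or "fistl %0" if the stack top
   survives), then "fldcw %2" to restore the original control word.  */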
19334
19335 /* Output code for x87 ffreep insn. The OPNO argument, which may only
19336 have the values zero or one, indicates the ffreep insn's operand
19337 from the OPERANDS array. */
19338
19339 static const char *
19340 output_387_ffreep (rtx *operands ATTRIBUTE_UNUSED, int opno)
19341 {
19342 if (TARGET_USE_FFREEP)
19343 #ifdef HAVE_AS_IX86_FFREEP
19344 return opno ? "ffreep\t%y1" : "ffreep\t%y0";
19345 #else
19346 {
19347 static char retval[32];
19348 int regno = REGNO (operands[opno]);
19349
19350 gcc_assert (STACK_REGNO_P (regno));
19351
19352 regno -= FIRST_STACK_REG;
19353
19354 snprintf (retval, sizeof (retval), ASM_SHORT "0xc%ddf", regno);
19355 return retval;
19356 }
19357 #endif
19358
19359 return opno ? "fstp\t%y1" : "fstp\t%y0";
19360 }
19361
19362
19363 /* Output code for INSN to compare OPERANDS. EFLAGS_P is 1 when fcomi
19364 should be used. UNORDERED_P is true when fucom should be used. */
19365
19366 const char *
19367 output_fp_compare (rtx insn, rtx *operands, bool eflags_p, bool unordered_p)
19368 {
19369 int stack_top_dies;
19370 rtx cmp_op0, cmp_op1;
19371 int is_sse = SSE_REG_P (operands[0]) || SSE_REG_P (operands[1]);
19372
19373 if (eflags_p)
19374 {
19375 cmp_op0 = operands[0];
19376 cmp_op1 = operands[1];
19377 }
19378 else
19379 {
19380 cmp_op0 = operands[1];
19381 cmp_op1 = operands[2];
19382 }
19383
19384 if (is_sse)
19385 {
19386 if (GET_MODE (operands[0]) == SFmode)
19387 if (unordered_p)
19388 return "%vucomiss\t{%1, %0|%0, %1}";
19389 else
19390 return "%vcomiss\t{%1, %0|%0, %1}";
19391 else
19392 if (unordered_p)
19393 return "%vucomisd\t{%1, %0|%0, %1}";
19394 else
19395 return "%vcomisd\t{%1, %0|%0, %1}";
19396 }
19397
19398 gcc_assert (STACK_TOP_P (cmp_op0));
19399
19400 stack_top_dies = find_regno_note (insn, REG_DEAD, FIRST_STACK_REG) != 0;
19401
19402 if (cmp_op1 == CONST0_RTX (GET_MODE (cmp_op1)))
19403 {
19404 if (stack_top_dies)
19405 {
19406 output_asm_insn ("ftst\n\tfnstsw\t%0", operands);
19407 return output_387_ffreep (operands, 1);
19408 }
19409 else
19410 return "ftst\n\tfnstsw\t%0";
19411 }
19412
19413 if (STACK_REG_P (cmp_op1)
19414 && stack_top_dies
19415 && find_regno_note (insn, REG_DEAD, REGNO (cmp_op1))
19416 && REGNO (cmp_op1) != FIRST_STACK_REG)
19417 {
19418 /* If the top of the 387 stack dies, and the other operand
19419 is also a stack register that dies, then this must be an
19420 `fcompp' float compare.  */
19421
19422 if (eflags_p)
19423 {
19424 /* There is no double popping fcomi variant. Fortunately,
19425 eflags is immune from the fstp's cc clobbering. */
19426 if (unordered_p)
19427 output_asm_insn ("fucomip\t{%y1, %0|%0, %y1}", operands);
19428 else
19429 output_asm_insn ("fcomip\t{%y1, %0|%0, %y1}", operands);
19430 return output_387_ffreep (operands, 0);
19431 }
19432 else
19433 {
19434 if (unordered_p)
19435 return "fucompp\n\tfnstsw\t%0";
19436 else
19437 return "fcompp\n\tfnstsw\t%0";
19438 }
19439 }
19440 else
19441 {
19442 /* Encoded here as eflags_p | intmode | unordered_p | stack_top_dies. */
19443
19444 static const char * const alt[16] =
19445 {
19446 "fcom%Z2\t%y2\n\tfnstsw\t%0",
19447 "fcomp%Z2\t%y2\n\tfnstsw\t%0",
19448 "fucom%Z2\t%y2\n\tfnstsw\t%0",
19449 "fucomp%Z2\t%y2\n\tfnstsw\t%0",
19450
19451 "ficom%Z2\t%y2\n\tfnstsw\t%0",
19452 "ficomp%Z2\t%y2\n\tfnstsw\t%0",
19453 NULL,
19454 NULL,
19455
19456 "fcomi\t{%y1, %0|%0, %y1}",
19457 "fcomip\t{%y1, %0|%0, %y1}",
19458 "fucomi\t{%y1, %0|%0, %y1}",
19459 "fucomip\t{%y1, %0|%0, %y1}",
19460
19461 NULL,
19462 NULL,
19463 NULL,
19464 NULL
19465 };
19466
19467 int mask;
19468 const char *ret;
19469
19470 mask = eflags_p << 3;
19471 mask |= (GET_MODE_CLASS (GET_MODE (cmp_op1)) == MODE_INT) << 2;
19472 mask |= unordered_p << 1;
19473 mask |= stack_top_dies;
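      /* For example, an fcomi-style compare (eflags_p = 1) of two FP stack
	 registers where the stack top dies and unordered_p is false gives
	 mask = (1 << 3) | (0 << 2) | (0 << 1) | 1 = 9, selecting the popping
	 "fcomip" entry in the table above.  */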
19474
19475 gcc_assert (mask < 16);
19476 ret = alt[mask];
19477 gcc_assert (ret);
19478
19479 return ret;
19480 }
19481 }
19482
19483 void
19484 ix86_output_addr_vec_elt (FILE *file, int value)
19485 {
19486 const char *directive = ASM_LONG;
19487
19488 #ifdef ASM_QUAD
19489 if (TARGET_LP64)
19490 directive = ASM_QUAD;
19491 #else
19492 gcc_assert (!TARGET_64BIT);
19493 #endif
19494
19495 fprintf (file, "%s%s%d\n", directive, LPREFIX, value);
19496 }
19497
19498 void
19499 ix86_output_addr_diff_elt (FILE *file, int value, int rel)
19500 {
19501 const char *directive = ASM_LONG;
19502
19503 #ifdef ASM_QUAD
19504 if (TARGET_64BIT && CASE_VECTOR_MODE == DImode)
19505 directive = ASM_QUAD;
19506 #else
19507 gcc_assert (!TARGET_64BIT);
19508 #endif
19509 /* We can't use @GOTOFF for text labels on VxWorks; see gotoff_operand. */
19510 if (TARGET_64BIT || TARGET_VXWORKS_RTP)
19511 fprintf (file, "%s%s%d-%s%d\n",
19512 directive, LPREFIX, value, LPREFIX, rel);
19513 else if (HAVE_AS_GOTOFF_IN_DATA)
19514 fprintf (file, ASM_LONG "%s%d@GOTOFF\n", LPREFIX, value);
19515 #if TARGET_MACHO
19516 else if (TARGET_MACHO)
19517 {
19518 fprintf (file, ASM_LONG "%s%d-", LPREFIX, value);
19519 machopic_output_function_base_name (file);
19520 putc ('\n', file);
19521 }
19522 #endif
19523 else
19524 asm_fprintf (file, ASM_LONG "%U%s+[.-%s%d]\n",
19525 GOT_SYMBOL_NAME, LPREFIX, value);
19526 }
19527 \f
19528 /* Generate either "mov $0, reg" or "xor reg, reg", as appropriate
19529 for the target. */
19530
19531 void
19532 ix86_expand_clear (rtx dest)
19533 {
19534 rtx tmp;
19535
19536 /* We play register width games, which are only valid after reload. */
19537 gcc_assert (reload_completed);
19538
19539 /* Avoid HImode and its attendant prefix byte. */
19540 if (GET_MODE_SIZE (GET_MODE (dest)) < 4)
19541 dest = gen_rtx_REG (SImode, REGNO (dest));
19542 tmp = gen_rtx_SET (dest, const0_rtx);
19543
19544 if (!TARGET_USE_MOV0 || optimize_insn_for_size_p ())
19545 {
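      /* The xor form clobbers the flags, so the set is wrapped in a
	 PARALLEL with an explicit FLAGS_REG clobber; the mov $0 form used
	 otherwise leaves the flags untouched.  */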
19546 rtx clob = gen_rtx_CLOBBER (VOIDmode, gen_rtx_REG (CCmode, FLAGS_REG));
19547 tmp = gen_rtx_PARALLEL (VOIDmode, gen_rtvec (2, tmp, clob));
19548 }
19549
19550 emit_insn (tmp);
19551 }
19552
19553 /* X is an unchanging MEM. If it is a constant pool reference, return
19554 the constant pool rtx, else NULL. */
19555
19556 rtx
19557 maybe_get_pool_constant (rtx x)
19558 {
19559 x = ix86_delegitimize_address (XEXP (x, 0));
19560
19561 if (GET_CODE (x) == SYMBOL_REF && CONSTANT_POOL_ADDRESS_P (x))
19562 return get_pool_constant (x);
19563
19564 return NULL_RTX;
19565 }
19566
19567 void
19568 ix86_expand_move (machine_mode mode, rtx operands[])
19569 {
19570 rtx op0, op1;
19571 rtx tmp, addend = NULL_RTX;
19572 enum tls_model model;
19573
19574 op0 = operands[0];
19575 op1 = operands[1];
19576
19577 switch (GET_CODE (op1))
19578 {
19579 case CONST:
19580 tmp = XEXP (op1, 0);
19581
19582 if (GET_CODE (tmp) != PLUS
19583 || GET_CODE (XEXP (tmp, 0)) != SYMBOL_REF)
19584 break;
19585
19586 op1 = XEXP (tmp, 0);
19587 addend = XEXP (tmp, 1);
19588 /* FALLTHRU */
19589
19590 case SYMBOL_REF:
19591 model = SYMBOL_REF_TLS_MODEL (op1);
19592
19593 if (model)
19594 op1 = legitimize_tls_address (op1, model, true);
19595 else if (ix86_force_load_from_GOT_p (op1))
19596 {
19597 /* Load the external function address via GOT slot to avoid PLT. */
19598 op1 = gen_rtx_UNSPEC (Pmode, gen_rtvec (1, op1),
19599 (TARGET_64BIT
19600 ? UNSPEC_GOTPCREL
19601 : UNSPEC_GOT));
19602 op1 = gen_rtx_CONST (Pmode, op1);
19603 op1 = gen_const_mem (Pmode, op1);
19604 set_mem_alias_set (op1, ix86_GOT_alias_set ());
19605 }
19606 else
19607 {
19608 tmp = legitimize_pe_coff_symbol (op1, addend != NULL_RTX);
19609 if (tmp)
19610 {
19611 op1 = tmp;
19612 if (!addend)
19613 break;
19614 }
19615 else
19616 {
19617 op1 = operands[1];
19618 break;
19619 }
19620 }
19621
19622 if (addend)
19623 {
19624 op1 = force_operand (op1, NULL_RTX);
19625 op1 = expand_simple_binop (Pmode, PLUS, op1, addend,
19626 op0, 1, OPTAB_DIRECT);
19627 }
19628 else
19629 op1 = force_operand (op1, op0);
19630
19631 if (op1 == op0)
19632 return;
19633
19634 op1 = convert_to_mode (mode, op1, 1);
19635
19636 default:
19637 break;
19638 }
19639
19640 if ((flag_pic || MACHOPIC_INDIRECT)
19641 && symbolic_operand (op1, mode))
19642 {
19643 if (TARGET_MACHO && !TARGET_64BIT)
19644 {
19645 #if TARGET_MACHO
19646 /* dynamic-no-pic */
19647 if (MACHOPIC_INDIRECT)
19648 {
19649 rtx temp = (op0 && REG_P (op0) && mode == Pmode)
19650 ? op0 : gen_reg_rtx (Pmode);
19651 op1 = machopic_indirect_data_reference (op1, temp);
19652 if (MACHOPIC_PURE)
19653 op1 = machopic_legitimize_pic_address (op1, mode,
19654 temp == op1 ? 0 : temp);
19655 }
19656 if (op0 != op1 && GET_CODE (op0) != MEM)
19657 {
19658 rtx insn = gen_rtx_SET (op0, op1);
19659 emit_insn (insn);
19660 return;
19661 }
19662 if (GET_CODE (op0) == MEM)
19663 op1 = force_reg (Pmode, op1);
19664 else
19665 {
19666 rtx temp = op0;
19667 if (GET_CODE (temp) != REG)
19668 temp = gen_reg_rtx (Pmode);
19669 temp = legitimize_pic_address (op1, temp);
19670 if (temp == op0)
19671 return;
19672 op1 = temp;
19673 }
19674 /* dynamic-no-pic */
19675 #endif
19676 }
19677 else
19678 {
19679 if (MEM_P (op0))
19680 op1 = force_reg (mode, op1);
19681 else if (!(TARGET_64BIT && x86_64_movabs_operand (op1, DImode)))
19682 {
19683 rtx reg = can_create_pseudo_p () ? NULL_RTX : op0;
19684 op1 = legitimize_pic_address (op1, reg);
19685 if (op0 == op1)
19686 return;
19687 op1 = convert_to_mode (mode, op1, 1);
19688 }
19689 }
19690 }
19691 else
19692 {
19693 if (MEM_P (op0)
19694 && (PUSH_ROUNDING (GET_MODE_SIZE (mode)) != GET_MODE_SIZE (mode)
19695 || !push_operand (op0, mode))
19696 && MEM_P (op1))
19697 op1 = force_reg (mode, op1);
19698
19699 if (push_operand (op0, mode)
19700 && ! general_no_elim_operand (op1, mode))
19701 op1 = copy_to_mode_reg (mode, op1);
19702
19703 /* Force large constants in 64-bit compilation into a register
19704 so that they get CSEed.  */
19705 if (can_create_pseudo_p ()
19706 && (mode == DImode) && TARGET_64BIT
19707 && immediate_operand (op1, mode)
19708 && !x86_64_zext_immediate_operand (op1, VOIDmode)
19709 && !register_operand (op0, mode)
19710 && optimize)
19711 op1 = copy_to_mode_reg (mode, op1);
19712
19713 if (can_create_pseudo_p ()
19714 && CONST_DOUBLE_P (op1))
19715 {
19716 /* If we are loading a floating point constant into a register,
19717 force the value to memory now, since we'll get better code
19718 out of the back end.  */
19719
19720 op1 = validize_mem (force_const_mem (mode, op1));
19721 if (!register_operand (op0, mode))
19722 {
19723 rtx temp = gen_reg_rtx (mode);
19724 emit_insn (gen_rtx_SET (temp, op1));
19725 emit_move_insn (op0, temp);
19726 return;
19727 }
19728 }
19729 }
19730
19731 emit_insn (gen_rtx_SET (op0, op1));
19732 }
19733
19734 void
19735 ix86_expand_vector_move (machine_mode mode, rtx operands[])
19736 {
19737 rtx op0 = operands[0], op1 = operands[1];
19738 /* Use GET_MODE_BITSIZE instead of GET_MODE_ALIGNMENT for IA MCU
19739 psABI since the biggest alignment is 4 byte for IA MCU psABI. */
19740 unsigned int align = (TARGET_IAMCU
19741 ? GET_MODE_BITSIZE (mode)
19742 : GET_MODE_ALIGNMENT (mode));
19743
19744 if (push_operand (op0, VOIDmode))
19745 op0 = emit_move_resolve_push (mode, op0);
19746
19747 /* Force constants other than zero into memory.  We do not know how
19748 the instructions used to build constants modify the upper 64 bits
19749 of the register; once we have that information we may be able
19750 to handle some of them more efficiently.  */
19751 if (can_create_pseudo_p ()
19752 && (CONSTANT_P (op1)
19753 || (SUBREG_P (op1)
19754 && CONSTANT_P (SUBREG_REG (op1))))
19755 && ((register_operand (op0, mode)
19756 && !standard_sse_constant_p (op1, mode))
19757 /* ix86_expand_vector_move_misalign() does not like constants. */
19758 || (SSE_REG_MODE_P (mode)
19759 && MEM_P (op0)
19760 && MEM_ALIGN (op0) < align)))
19761 {
19762 if (SUBREG_P (op1))
19763 {
19764 machine_mode imode = GET_MODE (SUBREG_REG (op1));
19765 rtx r = force_const_mem (imode, SUBREG_REG (op1));
19766 if (r)
19767 r = validize_mem (r);
19768 else
19769 r = force_reg (imode, SUBREG_REG (op1));
19770 op1 = simplify_gen_subreg (mode, r, imode, SUBREG_BYTE (op1));
19771 }
19772 else
19773 op1 = validize_mem (force_const_mem (mode, op1));
19774 }
19775
19776 /* We need to check memory alignment for SSE mode since an attribute
19777 can make operands unaligned.  */
19778 if (can_create_pseudo_p ()
19779 && SSE_REG_MODE_P (mode)
19780 && ((MEM_P (op0) && (MEM_ALIGN (op0) < align))
19781 || (MEM_P (op1) && (MEM_ALIGN (op1) < align))))
19782 {
19783 rtx tmp[2];
19784
19785 /* ix86_expand_vector_move_misalign() does not like both
19786 arguments in memory. */
19787 if (!register_operand (op0, mode)
19788 && !register_operand (op1, mode))
19789 op1 = force_reg (mode, op1);
19790
19791 tmp[0] = op0; tmp[1] = op1;
19792 ix86_expand_vector_move_misalign (mode, tmp);
19793 return;
19794 }
19795
19796 /* Make operand1 a register if it isn't already. */
19797 if (can_create_pseudo_p ()
19798 && !register_operand (op0, mode)
19799 && !register_operand (op1, mode))
19800 {
19801 emit_move_insn (op0, force_reg (GET_MODE (op0), op1));
19802 return;
19803 }
19804
19805 emit_insn (gen_rtx_SET (op0, op1));
19806 }
19807
19808 /* Split 32-byte AVX unaligned load and store if needed. */
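/* When the split is enabled, a misaligned 256-bit load becomes two 128-bit
   loads whose results are combined with a VEC_CONCAT, and a misaligned
   256-bit store becomes two vextractf128 stores of the low and high halves
   (see the MEM_P (op1) and MEM_P (op0) branches below).  */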
19809
19810 static void
19811 ix86_avx256_split_vector_move_misalign (rtx op0, rtx op1)
19812 {
19813 rtx m;
19814 rtx (*extract) (rtx, rtx, rtx);
19815 machine_mode mode;
19816
19817 if ((MEM_P (op1) && !TARGET_AVX256_SPLIT_UNALIGNED_LOAD)
19818 || (MEM_P (op0) && !TARGET_AVX256_SPLIT_UNALIGNED_STORE))
19819 {
19820 emit_insn (gen_rtx_SET (op0, op1));
19821 return;
19822 }
19823
19824 rtx orig_op0 = NULL_RTX;
19825 mode = GET_MODE (op0);
19826 switch (GET_MODE_CLASS (mode))
19827 {
19828 case MODE_VECTOR_INT:
19829 case MODE_INT:
19830 if (mode != V32QImode)
19831 {
19832 if (!MEM_P (op0))
19833 {
19834 orig_op0 = op0;
19835 op0 = gen_reg_rtx (V32QImode);
19836 }
19837 else
19838 op0 = gen_lowpart (V32QImode, op0);
19839 op1 = gen_lowpart (V32QImode, op1);
19840 mode = V32QImode;
19841 }
19842 break;
19843 case MODE_VECTOR_FLOAT:
19844 break;
19845 default:
19846 gcc_unreachable ();
19847 }
19848
19849 switch (mode)
19850 {
19851 default:
19852 gcc_unreachable ();
19853 case V32QImode:
19854 extract = gen_avx_vextractf128v32qi;
19855 mode = V16QImode;
19856 break;
19857 case V8SFmode:
19858 extract = gen_avx_vextractf128v8sf;
19859 mode = V4SFmode;
19860 break;
19861 case V4DFmode:
19862 extract = gen_avx_vextractf128v4df;
19863 mode = V2DFmode;
19864 break;
19865 }
19866
19867 if (MEM_P (op1))
19868 {
19869 rtx r = gen_reg_rtx (mode);
19870 m = adjust_address (op1, mode, 0);
19871 emit_move_insn (r, m);
19872 m = adjust_address (op1, mode, 16);
19873 r = gen_rtx_VEC_CONCAT (GET_MODE (op0), r, m);
19874 emit_move_insn (op0, r);
19875 }
19876 else if (MEM_P (op0))
19877 {
19878 m = adjust_address (op0, mode, 0);
19879 emit_insn (extract (m, op1, const0_rtx));
19880 m = adjust_address (op0, mode, 16);
19881 emit_insn (extract (m, copy_rtx (op1), const1_rtx));
19882 }
19883 else
19884 gcc_unreachable ();
19885
19886 if (orig_op0)
19887 emit_move_insn (orig_op0, gen_lowpart (GET_MODE (orig_op0), op0));
19888 }
19889
19890 /* Implement the movmisalign patterns for SSE. Non-SSE modes go
19891 straight to ix86_expand_vector_move. */
19892 /* Code generation for scalar reg-reg moves of single and double precision data:
19893 if (x86_sse_partial_reg_dependency == true || x86_sse_split_regs == true)
19894 movaps reg, reg
19895 else
19896 movss reg, reg
19897 if (x86_sse_partial_reg_dependency == true)
19898 movapd reg, reg
19899 else
19900 movsd reg, reg
19901
19902 Code generation for scalar loads of double precision data:
19903 if (x86_sse_split_regs == true)
19904 movlpd mem, reg (gas syntax)
19905 else
19906 movsd mem, reg
19907
19908 Code generation for unaligned packed loads of single precision data
19909 (x86_sse_unaligned_move_optimal overrides x86_sse_partial_reg_dependency):
19910 if (x86_sse_unaligned_move_optimal)
19911 movups mem, reg
19912
19913 if (x86_sse_partial_reg_dependency == true)
19914 {
19915 xorps reg, reg
19916 movlps mem, reg
19917 movhps mem+8, reg
19918 }
19919 else
19920 {
19921 movlps mem, reg
19922 movhps mem+8, reg
19923 }
19924
19925 Code generation for unaligned packed loads of double precision data
19926 (x86_sse_unaligned_move_optimal overrides x86_sse_split_regs):
19927 if (x86_sse_unaligned_move_optimal)
19928 movupd mem, reg
19929
19930 if (x86_sse_split_regs == true)
19931 {
19932 movlpd mem, reg
19933 movhpd mem+8, reg
19934 }
19935 else
19936 {
19937 movsd mem, reg
19938 movhpd mem+8, reg
19939 }
19940 */
19941
19942 void
19943 ix86_expand_vector_move_misalign (machine_mode mode, rtx operands[])
19944 {
19945 rtx op0, op1, m;
19946
19947 op0 = operands[0];
19948 op1 = operands[1];
19949
19950 /* Use unaligned load/store for AVX512 or when optimizing for size. */
19951 if (GET_MODE_SIZE (mode) == 64 || optimize_insn_for_size_p ())
19952 {
19953 emit_insn (gen_rtx_SET (op0, op1));
19954 return;
19955 }
19956
19957 if (TARGET_AVX)
19958 {
19959 if (GET_MODE_SIZE (mode) == 32)
19960 ix86_avx256_split_vector_move_misalign (op0, op1);
19961 else
19962 /* Always use 128-bit mov<mode>_internal pattern for AVX. */
19963 emit_insn (gen_rtx_SET (op0, op1));
19964 return;
19965 }
19966
19967 if (TARGET_SSE_UNALIGNED_LOAD_OPTIMAL
19968 || TARGET_SSE_PACKED_SINGLE_INSN_OPTIMAL)
19969 {
19970 emit_insn (gen_rtx_SET (op0, op1));
19971 return;
19972 }
19973
19974 /* ??? If we have typed data, then it would appear that using
19975 movdqu is the only way to get unaligned data loaded with
19976 integer type. */
19977 if (TARGET_SSE2 && GET_MODE_CLASS (mode) == MODE_VECTOR_INT)
19978 {
19979 emit_insn (gen_rtx_SET (op0, op1));
19980 return;
19981 }
19982
19983 if (MEM_P (op1))
19984 {
19985 if (TARGET_SSE2 && mode == V2DFmode)
19986 {
19987 rtx zero;
19988
19989 /* When SSE registers are split into halves, we can avoid
19990 writing to the top half twice. */
19991 if (TARGET_SSE_SPLIT_REGS)
19992 {
19993 emit_clobber (op0);
19994 zero = op0;
19995 }
19996 else
19997 {
19998 /* ??? Not sure about the best option for the Intel chips.
19999 The following would seem to satisfy; the register is
20000 entirely cleared, breaking the dependency chain. We
20001 then store to the upper half, with a dependency depth
20002 of one. A rumor has it that Intel recommends two movsd
20003 followed by an unpacklpd, but this is unconfirmed. And
20004 given that the dependency depth of the unpacklpd would
20005 still be one, I'm not sure why this would be better. */
20006 zero = CONST0_RTX (V2DFmode);
20007 }
20008
20009 m = adjust_address (op1, DFmode, 0);
20010 emit_insn (gen_sse2_loadlpd (op0, zero, m));
20011 m = adjust_address (op1, DFmode, 8);
20012 emit_insn (gen_sse2_loadhpd (op0, op0, m));
20013 }
20014 else
20015 {
20016 rtx t;
20017
20018 if (mode != V4SFmode)
20019 t = gen_reg_rtx (V4SFmode);
20020 else
20021 t = op0;
20022
20023 if (TARGET_SSE_PARTIAL_REG_DEPENDENCY)
20024 emit_move_insn (t, CONST0_RTX (V4SFmode));
20025 else
20026 emit_clobber (t);
20027
20028 m = adjust_address (op1, V2SFmode, 0);
20029 emit_insn (gen_sse_loadlps (t, t, m));
20030 m = adjust_address (op1, V2SFmode, 8);
20031 emit_insn (gen_sse_loadhps (t, t, m));
20032 if (mode != V4SFmode)
20033 emit_move_insn (op0, gen_lowpart (mode, t));
20034 }
20035 }
20036 else if (MEM_P (op0))
20037 {
20038 if (TARGET_SSE2 && mode == V2DFmode)
20039 {
20040 m = adjust_address (op0, DFmode, 0);
20041 emit_insn (gen_sse2_storelpd (m, op1));
20042 m = adjust_address (op0, DFmode, 8);
20043 emit_insn (gen_sse2_storehpd (m, op1));
20044 }
20045 else
20046 {
20047 if (mode != V4SFmode)
20048 op1 = gen_lowpart (V4SFmode, op1);
20049
20050 m = adjust_address (op0, V2SFmode, 0);
20051 emit_insn (gen_sse_storelps (m, op1));
20052 m = adjust_address (op0, V2SFmode, 8);
20053 emit_insn (gen_sse_storehps (m, copy_rtx (op1)));
20054 }
20055 }
20056 else
20057 gcc_unreachable ();
20058 }
20059
20060 /* Helper function of ix86_fixup_binary_operands to canonicalize
20061 operand order. Returns true if the operands should be swapped. */
20062
20063 static bool
20064 ix86_swap_binary_operands_p (enum rtx_code code, machine_mode mode,
20065 rtx operands[])
20066 {
20067 rtx dst = operands[0];
20068 rtx src1 = operands[1];
20069 rtx src2 = operands[2];
20070
20071 /* If the operation is not commutative, we can't do anything. */
20072 if (GET_RTX_CLASS (code) != RTX_COMM_ARITH)
20073 return false;
20074
20075 /* Highest priority is that src1 should match dst. */
20076 if (rtx_equal_p (dst, src1))
20077 return false;
20078 if (rtx_equal_p (dst, src2))
20079 return true;
20080
20081 /* Next highest priority is that immediate constants come second. */
20082 if (immediate_operand (src2, mode))
20083 return false;
20084 if (immediate_operand (src1, mode))
20085 return true;
20086
20087 /* Lowest priority is that memory references should come second. */
20088 if (MEM_P (src2))
20089 return false;
20090 if (MEM_P (src1))
20091 return true;
20092
20093 return false;
20094 }
20095
20096
20097 /* Fix up OPERANDS to satisfy ix86_binary_operator_ok. Return the
20098 destination to use for the operation. If different from the true
20099 destination in operands[0], a copy operation will be required. */
20100
20101 rtx
20102 ix86_fixup_binary_operands (enum rtx_code code, machine_mode mode,
20103 rtx operands[])
20104 {
20105 rtx dst = operands[0];
20106 rtx src1 = operands[1];
20107 rtx src2 = operands[2];
20108
20109 /* Canonicalize operand order. */
20110 if (ix86_swap_binary_operands_p (code, mode, operands))
20111 {
20112 /* It is invalid to swap operands of different modes. */
20113 gcc_assert (GET_MODE (src1) == GET_MODE (src2));
20114
20115 std::swap (src1, src2);
20116 }
20117
20118 /* Both source operands cannot be in memory. */
20119 if (MEM_P (src1) && MEM_P (src2))
20120 {
20121 /* Optimization: Only read from memory once. */
20122 if (rtx_equal_p (src1, src2))
20123 {
20124 src2 = force_reg (mode, src2);
20125 src1 = src2;
20126 }
20127 else if (rtx_equal_p (dst, src1))
20128 src2 = force_reg (mode, src2);
20129 else
20130 src1 = force_reg (mode, src1);
20131 }
20132
20133 /* If the destination is memory, and we do not have matching source
20134 operands, do things in registers. */
20135 if (MEM_P (dst) && !rtx_equal_p (dst, src1))
20136 dst = gen_reg_rtx (mode);
20137
20138 /* Source 1 cannot be a constant. */
20139 if (CONSTANT_P (src1))
20140 src1 = force_reg (mode, src1);
20141
20142 /* Source 1 cannot be a non-matching memory. */
20143 if (MEM_P (src1) && !rtx_equal_p (dst, src1))
20144 src1 = force_reg (mode, src1);
20145
20146 /* Improve address combine. */
20147 if (code == PLUS
20148 && GET_MODE_CLASS (mode) == MODE_INT
20149 && MEM_P (src2))
20150 src2 = force_reg (mode, src2);
20151
20152 operands[1] = src1;
20153 operands[2] = src2;
20154 return dst;
20155 }
20156
20157 /* Similarly, but assume that the destination has already been
20158 set up properly. */
20159
20160 void
20161 ix86_fixup_binary_operands_no_copy (enum rtx_code code,
20162 machine_mode mode, rtx operands[])
20163 {
20164 rtx dst = ix86_fixup_binary_operands (code, mode, operands);
20165 gcc_assert (dst == operands[0]);
20166 }
20167
20168 /* Attempt to expand a binary operator.  Make the expansion closer to the
20169 actual machine than just general_operand, which will allow 3 separate
20170 memory references (one output, two input) in a single insn.  */
20171
20172 void
20173 ix86_expand_binary_operator (enum rtx_code code, machine_mode mode,
20174 rtx operands[])
20175 {
20176 rtx src1, src2, dst, op, clob;
20177
20178 dst = ix86_fixup_binary_operands (code, mode, operands);
20179 src1 = operands[1];
20180 src2 = operands[2];
20181
20182 /* Emit the instruction. */
20183
20184 op = gen_rtx_SET (dst, gen_rtx_fmt_ee (code, mode, src1, src2));
20185
20186 if (reload_completed
20187 && code == PLUS
20188 && !rtx_equal_p (dst, src1))
20189 {
20190 /* This is going to be an LEA; avoid splitting it later. */
20191 emit_insn (op);
20192 }
20193 else
20194 {
20195 clob = gen_rtx_CLOBBER (VOIDmode, gen_rtx_REG (CCmode, FLAGS_REG));
20196 emit_insn (gen_rtx_PARALLEL (VOIDmode, gen_rtvec (2, op, clob)));
20197 }
20198
20199 /* Fix up the destination if needed. */
20200 if (dst != operands[0])
20201 emit_move_insn (operands[0], dst);
20202 }
20203
20204 /* Expand vector logical operation CODE (AND, IOR, XOR) in MODE with
20205 the given OPERANDS. */
20206
20207 void
20208 ix86_expand_vector_logical_operator (enum rtx_code code, machine_mode mode,
20209 rtx operands[])
20210 {
20211 rtx op1 = NULL_RTX, op2 = NULL_RTX;
20212 if (SUBREG_P (operands[1]))
20213 {
20214 op1 = operands[1];
20215 op2 = operands[2];
20216 }
20217 else if (SUBREG_P (operands[2]))
20218 {
20219 op1 = operands[2];
20220 op2 = operands[1];
20221 }
20222 /* Optimize (__m128i) d | (__m128i) e and similar code
20223 when d and e are float vectors into float vector logical
20224 insn. In C/C++ without using intrinsics there is no other way
20225 to express vector logical operation on float vectors than
20226 to cast them temporarily to integer vectors. */
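  /* E.g. with two __m128 values a and b, (__m128i) a & (__m128i) b can then
     typically be emitted as an andps on the float vectors rather than a
     pand after the casts, provided the conditions below hold.  */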
20227 if (op1
20228 && !TARGET_SSE_PACKED_SINGLE_INSN_OPTIMAL
20229 && (SUBREG_P (op2) || GET_CODE (op2) == CONST_VECTOR)
20230 && GET_MODE_CLASS (GET_MODE (SUBREG_REG (op1))) == MODE_VECTOR_FLOAT
20231 && GET_MODE_SIZE (GET_MODE (SUBREG_REG (op1))) == GET_MODE_SIZE (mode)
20232 && SUBREG_BYTE (op1) == 0
20233 && (GET_CODE (op2) == CONST_VECTOR
20234 || (GET_MODE (SUBREG_REG (op1)) == GET_MODE (SUBREG_REG (op2))
20235 && SUBREG_BYTE (op2) == 0))
20236 && can_create_pseudo_p ())
20237 {
20238 rtx dst;
20239 switch (GET_MODE (SUBREG_REG (op1)))
20240 {
20241 case V4SFmode:
20242 case V8SFmode:
20243 case V16SFmode:
20244 case V2DFmode:
20245 case V4DFmode:
20246 case V8DFmode:
20247 dst = gen_reg_rtx (GET_MODE (SUBREG_REG (op1)));
20248 if (GET_CODE (op2) == CONST_VECTOR)
20249 {
20250 op2 = gen_lowpart (GET_MODE (dst), op2);
20251 op2 = force_reg (GET_MODE (dst), op2);
20252 }
20253 else
20254 {
20255 op1 = operands[1];
20256 op2 = SUBREG_REG (operands[2]);
20257 if (!vector_operand (op2, GET_MODE (dst)))
20258 op2 = force_reg (GET_MODE (dst), op2);
20259 }
20260 op1 = SUBREG_REG (op1);
20261 if (!vector_operand (op1, GET_MODE (dst)))
20262 op1 = force_reg (GET_MODE (dst), op1);
20263 emit_insn (gen_rtx_SET (dst,
20264 gen_rtx_fmt_ee (code, GET_MODE (dst),
20265 op1, op2)));
20266 emit_move_insn (operands[0], gen_lowpart (mode, dst));
20267 return;
20268 default:
20269 break;
20270 }
20271 }
20272 if (!vector_operand (operands[1], mode))
20273 operands[1] = force_reg (mode, operands[1]);
20274 if (!vector_operand (operands[2], mode))
20275 operands[2] = force_reg (mode, operands[2]);
20276 ix86_fixup_binary_operands_no_copy (code, mode, operands);
20277 emit_insn (gen_rtx_SET (operands[0],
20278 gen_rtx_fmt_ee (code, mode, operands[1],
20279 operands[2])));
20280 }
20281
20282 /* Return TRUE or FALSE depending on whether the binary operator meets the
20283 appropriate constraints. */
20284
20285 bool
20286 ix86_binary_operator_ok (enum rtx_code code, machine_mode mode,
20287 rtx operands[3])
20288 {
20289 rtx dst = operands[0];
20290 rtx src1 = operands[1];
20291 rtx src2 = operands[2];
20292
20293 /* Both source operands cannot be in memory. */
20294 if (MEM_P (src1) && MEM_P (src2))
20295 return false;
20296
20297 /* Canonicalize operand order for commutative operators. */
20298 if (ix86_swap_binary_operands_p (code, mode, operands))
20299 std::swap (src1, src2);
20300
20301 /* If the destination is memory, we must have a matching source operand. */
20302 if (MEM_P (dst) && !rtx_equal_p (dst, src1))
20303 return false;
20304
20305 /* Source 1 cannot be a constant. */
20306 if (CONSTANT_P (src1))
20307 return false;
20308
20309 /* Source 1 cannot be a non-matching memory. */
20310 if (MEM_P (src1) && !rtx_equal_p (dst, src1))
20311 /* Support "andhi/andsi/anddi" as a zero-extending move. */
20312 return (code == AND
20313 && (mode == HImode
20314 || mode == SImode
20315 || (TARGET_64BIT && mode == DImode))
20316 && satisfies_constraint_L (src2));
20317
20318 return true;
20319 }
20320
20321 /* Attempt to expand a unary operator.  Make the expansion closer to the
20322 actual machine than just general_operand, which will allow 2 separate
20323 memory references (one output, one input) in a single insn.  */
20324
20325 void
20326 ix86_expand_unary_operator (enum rtx_code code, machine_mode mode,
20327 rtx operands[])
20328 {
20329 bool matching_memory = false;
20330 rtx src, dst, op, clob;
20331
20332 dst = operands[0];
20333 src = operands[1];
20334
20335 /* If the destination is memory, and we do not have matching source
20336 operands, do things in registers. */
20337 if (MEM_P (dst))
20338 {
20339 if (rtx_equal_p (dst, src))
20340 matching_memory = true;
20341 else
20342 dst = gen_reg_rtx (mode);
20343 }
20344
20345 /* When source operand is memory, destination must match. */
20346 if (MEM_P (src) && !matching_memory)
20347 src = force_reg (mode, src);
20348
20349 /* Emit the instruction. */
20350
20351 op = gen_rtx_SET (dst, gen_rtx_fmt_e (code, mode, src));
20352
20353 if (code == NOT)
20354 emit_insn (op);
20355 else
20356 {
20357 clob = gen_rtx_CLOBBER (VOIDmode, gen_rtx_REG (CCmode, FLAGS_REG));
20358 emit_insn (gen_rtx_PARALLEL (VOIDmode, gen_rtvec (2, op, clob)));
20359 }
20360
20361 /* Fix up the destination if needed. */
20362 if (dst != operands[0])
20363 emit_move_insn (operands[0], dst);
20364 }
20365
20366 /* Split 32bit/64bit divmod with 8bit unsigned divmod if dividend and
20367 divisor are within the range [0-255]. */
20368
20369 void
20370 ix86_split_idivmod (machine_mode mode, rtx operands[],
20371 bool signed_p)
20372 {
20373 rtx_code_label *end_label, *qimode_label;
20374 rtx insn, div, mod;
20375 rtx scratch, tmp0, tmp1, tmp2;
20376 rtx (*gen_divmod4_1) (rtx, rtx, rtx, rtx);
20377 rtx (*gen_zero_extend) (rtx, rtx);
20378 rtx (*gen_test_ccno_1) (rtx, rtx);
20379
20380 switch (mode)
20381 {
20382 case SImode:
20383 gen_divmod4_1 = signed_p ? gen_divmodsi4_1 : gen_udivmodsi4_1;
20384 gen_test_ccno_1 = gen_testsi_ccno_1;
20385 gen_zero_extend = gen_zero_extendqisi2;
20386 break;
20387 case DImode:
20388 gen_divmod4_1 = signed_p ? gen_divmoddi4_1 : gen_udivmoddi4_1;
20389 gen_test_ccno_1 = gen_testdi_ccno_1;
20390 gen_zero_extend = gen_zero_extendqidi2;
20391 break;
20392 default:
20393 gcc_unreachable ();
20394 }
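  /* The emitted code looks roughly like:
	 scratch = op[2] | op[3];
	 if ((scratch & ~0xff) == 0)	// both values fit in 8 bits
	   goto qimode_label;
	 full SImode/DImode divmod;
	 goto end_label;
       qimode_label:
	 8-bit unsigned divide (quotient in AL, remainder in AH);
       end_label:  */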
20395
20396 end_label = gen_label_rtx ();
20397 qimode_label = gen_label_rtx ();
20398
20399 scratch = gen_reg_rtx (mode);
20400
20401 /* Use 8bit unsigned divmod if dividend and divisor are within
20402 the range [0-255].  */
20403 emit_move_insn (scratch, operands[2]);
20404 scratch = expand_simple_binop (mode, IOR, scratch, operands[3],
20405 scratch, 1, OPTAB_DIRECT);
20406 emit_insn (gen_test_ccno_1 (scratch, GEN_INT (-0x100)));
20407 tmp0 = gen_rtx_REG (CCNOmode, FLAGS_REG);
20408 tmp0 = gen_rtx_EQ (VOIDmode, tmp0, const0_rtx);
20409 tmp0 = gen_rtx_IF_THEN_ELSE (VOIDmode, tmp0,
20410 gen_rtx_LABEL_REF (VOIDmode, qimode_label),
20411 pc_rtx);
20412 insn = emit_jump_insn (gen_rtx_SET (pc_rtx, tmp0));
20413 predict_jump (REG_BR_PROB_BASE * 50 / 100);
20414 JUMP_LABEL (insn) = qimode_label;
20415
20416 /* Generate the original signed/unsigned divmod.  */
20417 div = gen_divmod4_1 (operands[0], operands[1],
20418 operands[2], operands[3]);
20419 emit_insn (div);
20420
20421 /* Branch to the end. */
20422 emit_jump_insn (gen_jump (end_label));
20423 emit_barrier ();
20424
20425 /* Generate 8bit unsigned divide. */
20426 emit_label (qimode_label);
20427 /* Don't use operands[0] for result of 8bit divide since not all
20428 registers support QImode ZERO_EXTRACT. */
20429 tmp0 = lowpart_subreg (HImode, scratch, mode);
20430 tmp1 = lowpart_subreg (HImode, operands[2], mode);
20431 tmp2 = lowpart_subreg (QImode, operands[3], mode);
20432 emit_insn (gen_udivmodhiqi3 (tmp0, tmp1, tmp2));
20433
20434 if (signed_p)
20435 {
20436 div = gen_rtx_DIV (SImode, operands[2], operands[3]);
20437 mod = gen_rtx_MOD (SImode, operands[2], operands[3]);
20438 }
20439 else
20440 {
20441 div = gen_rtx_UDIV (SImode, operands[2], operands[3]);
20442 mod = gen_rtx_UMOD (SImode, operands[2], operands[3]);
20443 }
20444
20445 /* Extract remainder from AH. */
20446 tmp1 = gen_rtx_ZERO_EXTRACT (mode, tmp0, GEN_INT (8), GEN_INT (8));
20447 if (REG_P (operands[1]))
20448 insn = emit_move_insn (operands[1], tmp1);
20449 else
20450 {
20451 /* Need a new scratch register since the old one has result
20452 of 8bit divide. */
20453 scratch = gen_reg_rtx (mode);
20454 emit_move_insn (scratch, tmp1);
20455 insn = emit_move_insn (operands[1], scratch);
20456 }
20457 set_unique_reg_note (insn, REG_EQUAL, mod);
20458
20459 /* Zero extend quotient from AL. */
20460 tmp1 = gen_lowpart (QImode, tmp0);
20461 insn = emit_insn (gen_zero_extend (operands[0], tmp1));
20462 set_unique_reg_note (insn, REG_EQUAL, div);
20463
20464 emit_label (end_label);
20465 }
20466
20467 #define LEA_MAX_STALL (3)
20468 #define LEA_SEARCH_THRESHOLD (LEA_MAX_STALL << 1)
20469
20470 /* Increase the given DISTANCE in half-cycles according to
20471 dependencies between the PREV and NEXT instructions.
20472 Add 1 half-cycle if there is no dependency and
20473 go to the next cycle if there is some dependency.  */
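/* Concretely, "go to the next cycle" is distance + (distance & 1) + 2:
   the distance is first rounded up to an even number of half-cycles
   (a cycle boundary) and then one full cycle is added.  */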
20474
20475 static unsigned int
20476 increase_distance (rtx_insn *prev, rtx_insn *next, unsigned int distance)
20477 {
20478 df_ref def, use;
20479
20480 if (!prev || !next)
20481 return distance + (distance & 1) + 2;
20482
20483 if (!DF_INSN_USES (next) || !DF_INSN_DEFS (prev))
20484 return distance + 1;
20485
20486 FOR_EACH_INSN_USE (use, next)
20487 FOR_EACH_INSN_DEF (def, prev)
20488 if (!DF_REF_IS_ARTIFICIAL (def)
20489 && DF_REF_REGNO (use) == DF_REF_REGNO (def))
20490 return distance + (distance & 1) + 2;
20491
20492 return distance + 1;
20493 }
20494
20495 /* Check whether instruction INSN defines register number
20496 REGNO1 or REGNO2.  */
20497
20498 static bool
20499 insn_defines_reg (unsigned int regno1, unsigned int regno2,
20500 rtx_insn *insn)
20501 {
20502 df_ref def;
20503
20504 FOR_EACH_INSN_DEF (def, insn)
20505 if (DF_REF_REG_DEF_P (def)
20506 && !DF_REF_IS_ARTIFICIAL (def)
20507 && (regno1 == DF_REF_REGNO (def)
20508 || regno2 == DF_REF_REGNO (def)))
20509 return true;
20510
20511 return false;
20512 }
20513
20514 /* Check whether instruction INSN uses register number
20515 REGNO as part of an address expression.  */
20516
20517 static bool
20518 insn_uses_reg_mem (unsigned int regno, rtx insn)
20519 {
20520 df_ref use;
20521
20522 FOR_EACH_INSN_USE (use, insn)
20523 if (DF_REF_REG_MEM_P (use) && regno == DF_REF_REGNO (use))
20524 return true;
20525
20526 return false;
20527 }
20528
20529 /* Search backward for non-agu definition of register number REGNO1
20530 or register number REGNO2 in basic block starting from instruction
20531 START up to head of basic block or instruction INSN.
20532
20533 Put true into *FOUND if a definition was found
20534 and false otherwise.
20535
20536 Distance in half-cycles between START and found instruction or head
20537 of BB is added to DISTANCE and returned. */
20538
20539 static int
20540 distance_non_agu_define_in_bb (unsigned int regno1, unsigned int regno2,
20541 rtx_insn *insn, int distance,
20542 rtx_insn *start, bool *found)
20543 {
20544 basic_block bb = start ? BLOCK_FOR_INSN (start) : NULL;
20545 rtx_insn *prev = start;
20546 rtx_insn *next = NULL;
20547
20548 *found = false;
20549
20550 while (prev
20551 && prev != insn
20552 && distance < LEA_SEARCH_THRESHOLD)
20553 {
20554 if (NONDEBUG_INSN_P (prev) && NONJUMP_INSN_P (prev))
20555 {
20556 distance = increase_distance (prev, next, distance);
20557 if (insn_defines_reg (regno1, regno2, prev))
20558 {
20559 if (recog_memoized (prev) < 0
20560 || get_attr_type (prev) != TYPE_LEA)
20561 {
20562 *found = true;
20563 return distance;
20564 }
20565 }
20566
20567 next = prev;
20568 }
20569 if (prev == BB_HEAD (bb))
20570 break;
20571
20572 prev = PREV_INSN (prev);
20573 }
20574
20575 return distance;
20576 }
20577
20578 /* Search backward for non-agu definition of register number REGNO1
20579 or register number REGNO2 in INSN's basic block until
20580 1. Pass LEA_SEARCH_THRESHOLD instructions, or
20581 2. Reach the boundary of a neighboring BB, or
20582 3. Reach an AGU definition.
20583 Returns the distance between the non-agu definition point and INSN.
20584 If no definition point, returns -1. */
20585
20586 static int
20587 distance_non_agu_define (unsigned int regno1, unsigned int regno2,
20588 rtx_insn *insn)
20589 {
20590 basic_block bb = BLOCK_FOR_INSN (insn);
20591 int distance = 0;
20592 bool found = false;
20593
20594 if (insn != BB_HEAD (bb))
20595 distance = distance_non_agu_define_in_bb (regno1, regno2, insn,
20596 distance, PREV_INSN (insn),
20597 &found);
20598
20599 if (!found && distance < LEA_SEARCH_THRESHOLD)
20600 {
20601 edge e;
20602 edge_iterator ei;
20603 bool simple_loop = false;
20604
20605 FOR_EACH_EDGE (e, ei, bb->preds)
20606 if (e->src == bb)
20607 {
20608 simple_loop = true;
20609 break;
20610 }
20611
20612 if (simple_loop)
20613 distance = distance_non_agu_define_in_bb (regno1, regno2,
20614 insn, distance,
20615 BB_END (bb), &found);
20616 else
20617 {
20618 int shortest_dist = -1;
20619 bool found_in_bb = false;
20620
20621 FOR_EACH_EDGE (e, ei, bb->preds)
20622 {
20623 int bb_dist
20624 = distance_non_agu_define_in_bb (regno1, regno2,
20625 insn, distance,
20626 BB_END (e->src),
20627 &found_in_bb);
20628 if (found_in_bb)
20629 {
20630 if (shortest_dist < 0)
20631 shortest_dist = bb_dist;
20632 else if (bb_dist > 0)
20633 shortest_dist = MIN (bb_dist, shortest_dist);
20634
20635 found = true;
20636 }
20637 }
20638
20639 distance = shortest_dist;
20640 }
20641 }
20642
20643 /* get_attr_type may modify recog data. We want to make sure
20644 that recog data is valid for instruction INSN, on which
20645 distance_non_agu_define is called. INSN is unchanged here. */
20646 extract_insn_cached (insn);
20647
20648 if (!found)
20649 return -1;
20650
20651 return distance >> 1;
20652 }
20653
20654 /* Return the distance in half-cycles between INSN and the next
20655 insn that uses register number REGNO in a memory address, added
20656 to DISTANCE.  Return -1 if REGNO is set.
20657
20658 Put true value into *FOUND if register usage was found and
20659 false otherwise.
20660 Put true value into *REDEFINED if register redefinition was
20661 found and false otherwise. */
20662
20663 static int
20664 distance_agu_use_in_bb (unsigned int regno,
20665 rtx_insn *insn, int distance, rtx_insn *start,
20666 bool *found, bool *redefined)
20667 {
20668 basic_block bb = NULL;
20669 rtx_insn *next = start;
20670 rtx_insn *prev = NULL;
20671
20672 *found = false;
20673 *redefined = false;
20674
20675 if (start != NULL_RTX)
20676 {
20677 bb = BLOCK_FOR_INSN (start);
20678 if (start != BB_HEAD (bb))
20679 /* If insn and start belong to the same bb, set prev to insn,
20680 so the call to increase_distance will increase the distance
20681 between insns by 1. */
20682 prev = insn;
20683 }
20684
20685 while (next
20686 && next != insn
20687 && distance < LEA_SEARCH_THRESHOLD)
20688 {
20689 if (NONDEBUG_INSN_P (next) && NONJUMP_INSN_P (next))
20690 {
20691 distance = increase_distance(prev, next, distance);
20692 if (insn_uses_reg_mem (regno, next))
20693 {
20694 /* Return DISTANCE if OP0 is used in memory
20695 address in NEXT. */
20696 *found = true;
20697 return distance;
20698 }
20699
20700 if (insn_defines_reg (regno, INVALID_REGNUM, next))
20701 {
20702 /* Return -1 if OP0 is set in NEXT. */
20703 *redefined = true;
20704 return -1;
20705 }
20706
20707 prev = next;
20708 }
20709
20710 if (next == BB_END (bb))
20711 break;
20712
20713 next = NEXT_INSN (next);
20714 }
20715
20716 return distance;
20717 }
20718
20719 /* Return the distance between INSN and the next insn that uses
20720 register number REGNO0 in a memory address.  Return -1 if no such
20721 use is found within LEA_SEARCH_THRESHOLD or REGNO0 is set.  */
20722
20723 static int
20724 distance_agu_use (unsigned int regno0, rtx_insn *insn)
20725 {
20726 basic_block bb = BLOCK_FOR_INSN (insn);
20727 int distance = 0;
20728 bool found = false;
20729 bool redefined = false;
20730
20731 if (insn != BB_END (bb))
20732 distance = distance_agu_use_in_bb (regno0, insn, distance,
20733 NEXT_INSN (insn),
20734 &found, &redefined);
20735
20736 if (!found && !redefined && distance < LEA_SEARCH_THRESHOLD)
20737 {
20738 edge e;
20739 edge_iterator ei;
20740 bool simple_loop = false;
20741
20742 FOR_EACH_EDGE (e, ei, bb->succs)
20743 if (e->dest == bb)
20744 {
20745 simple_loop = true;
20746 break;
20747 }
20748
20749 if (simple_loop)
20750 distance = distance_agu_use_in_bb (regno0, insn,
20751 distance, BB_HEAD (bb),
20752 &found, &redefined);
20753 else
20754 {
20755 int shortest_dist = -1;
20756 bool found_in_bb = false;
20757 bool redefined_in_bb = false;
20758
20759 FOR_EACH_EDGE (e, ei, bb->succs)
20760 {
20761 int bb_dist
20762 = distance_agu_use_in_bb (regno0, insn,
20763 distance, BB_HEAD (e->dest),
20764 &found_in_bb, &redefined_in_bb);
20765 if (found_in_bb)
20766 {
20767 if (shortest_dist < 0)
20768 shortest_dist = bb_dist;
20769 else if (bb_dist > 0)
20770 shortest_dist = MIN (bb_dist, shortest_dist);
20771
20772 found = true;
20773 }
20774 }
20775
20776 distance = shortest_dist;
20777 }
20778 }
20779
20780 if (!found || redefined)
20781 return -1;
20782
20783 return distance >> 1;
20784 }
20785
20786 /* Define this macro to tune LEA priority vs ADD; it takes effect when
20787 there is a dilemma of choosing LEA or ADD.
20788 Negative value: ADD is preferred over LEA
20789 Zero: Neutral
20790 Positive value: LEA is preferred over ADD */
20791 #define IX86_LEA_PRIORITY 0
20792
20793 /* Return true if use of the lea INSN has a performance advantage
20794 over a sequence of instructions.  The instruction sequence has
20795 SPLIT_COST cycles higher latency than the lea latency.  */
20796
20797 static bool
20798 ix86_lea_outperforms (rtx_insn *insn, unsigned int regno0, unsigned int regno1,
20799 unsigned int regno2, int split_cost, bool has_scale)
20800 {
20801 int dist_define, dist_use;
20802
20803 /* For Silvermont, if a 2-source or 3-source LEA is used for
20804 non-destructive destination purposes, or because the SCALE
20805 factor is needed, the use of LEA is justified.  */
20806 if (TARGET_SILVERMONT || TARGET_INTEL)
20807 {
20808 if (has_scale)
20809 return true;
20810 if (split_cost < 1)
20811 return false;
20812 if (regno0 == regno1 || regno0 == regno2)
20813 return false;
20814 return true;
20815 }
20816
20817 dist_define = distance_non_agu_define (regno1, regno2, insn);
20818 dist_use = distance_agu_use (regno0, insn);
20819
20820 if (dist_define < 0 || dist_define >= LEA_MAX_STALL)
20821 {
20822 /* If there is no non-AGU operand definition, no AGU
20823 operand usage and the split cost is 0, then both the lea
20824 and non-lea variants have the same priority.  Currently
20825 we prefer lea for 64-bit code and non-lea for 32-bit
20826 code.  */
20827 if (dist_use < 0 && split_cost == 0)
20828 return TARGET_64BIT || IX86_LEA_PRIORITY;
20829 else
20830 return true;
20831 }
20832
20833 /* With a longer definition distance, lea is preferable.
20834 Here we adjust the distance to take the splitting cost and
20835 lea priority into account.  */
20836 dist_define += split_cost + IX86_LEA_PRIORITY;
20837
20838 /* If there is no use in a memory address then we just check
20839 that the split cost exceeds the AGU stall.  */
20840 if (dist_use < 0)
20841 return dist_define > LEA_MAX_STALL;
20842
20843 /* If this insn has both a backward non-AGU dependence and a forward
20844 AGU dependence, the one with the shorter distance takes effect.  */
20845 return dist_define >= dist_use;
20846 }
20847
20848 /* Return true if it is legal to clobber flags by INSN and
20849 false otherwise. */
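/* The check scans forward from INSN to the end of its basic block: if some
   insn uses the flags before they are set again, clobbering is not allowed;
   if the flags are redefined first, it is; if neither happens, fall back to
   whether FLAGS_REG is live on exit from the block.  */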
20850
20851 static bool
20852 ix86_ok_to_clobber_flags (rtx_insn *insn)
20853 {
20854 basic_block bb = BLOCK_FOR_INSN (insn);
20855 df_ref use;
20856 bitmap live;
20857
20858 while (insn)
20859 {
20860 if (NONDEBUG_INSN_P (insn))
20861 {
20862 FOR_EACH_INSN_USE (use, insn)
20863 if (DF_REF_REG_USE_P (use) && DF_REF_REGNO (use) == FLAGS_REG)
20864 return false;
20865
20866 if (insn_defines_reg (FLAGS_REG, INVALID_REGNUM, insn))
20867 return true;
20868 }
20869
20870 if (insn == BB_END (bb))
20871 break;
20872
20873 insn = NEXT_INSN (insn);
20874 }
20875
20876 live = df_get_live_out(bb);
20877 return !REGNO_REG_SET_P (live, FLAGS_REG);
20878 }
20879
20880 /* Return true if we need to split op0 = op1 + op2 into a sequence of
20881 move and add to avoid AGU stalls. */
20882
20883 bool
20884 ix86_avoid_lea_for_add (rtx_insn *insn, rtx operands[])
20885 {
20886 unsigned int regno0, regno1, regno2;
20887
20888 /* Check if we need to optimize. */
20889 if (!TARGET_OPT_AGU || optimize_function_for_size_p (cfun))
20890 return false;
20891
20892 /* Check it is correct to split here. */
20893 if (!ix86_ok_to_clobber_flags(insn))
20894 return false;
20895
20896 regno0 = true_regnum (operands[0]);
20897 regno1 = true_regnum (operands[1]);
20898 regno2 = true_regnum (operands[2]);
20899
20900 /* We need to split only adds with a non-destructive
20901 destination operand.  */
20902 if (regno0 == regno1 || regno0 == regno2)
20903 return false;
20904 else
20905 return !ix86_lea_outperforms (insn, regno0, regno1, regno2, 1, false);
20906 }
20907
20908 /* Return true if we should emit an lea instruction instead of a mov
20909 instruction.  */
20910
20911 bool
20912 ix86_use_lea_for_mov (rtx_insn *insn, rtx operands[])
20913 {
20914 unsigned int regno0, regno1;
20915
20916 /* Check if we need to optimize. */
20917 if (!TARGET_OPT_AGU || optimize_function_for_size_p (cfun))
20918 return false;
20919
20920 /* Use lea for reg to reg moves only. */
20921 if (!REG_P (operands[0]) || !REG_P (operands[1]))
20922 return false;
20923
20924 regno0 = true_regnum (operands[0]);
20925 regno1 = true_regnum (operands[1]);
20926
20927 return ix86_lea_outperforms (insn, regno0, regno1, INVALID_REGNUM, 0, false);
20928 }
20929
20930 /* Return true if we need to split lea into a sequence of
20931 instructions to avoid AGU stalls. */
20932
20933 bool
20934 ix86_avoid_lea_for_addr (rtx_insn *insn, rtx operands[])
20935 {
20936 unsigned int regno0, regno1, regno2;
20937 int split_cost;
20938 struct ix86_address parts;
20939 int ok;
20940
20941 /* Check if we need to optimize.  */
20942 if (!TARGET_AVOID_LEA_FOR_ADDR || optimize_function_for_size_p (cfun))
20943 return false;
20944
20945 /* The "at least two components" test below might not catch simple
20946 move or zero extension insns if parts.base is non-NULL and parts.disp
20947 is const0_rtx as the only components in the address, e.g. if the
20948 register is %rbp or %r13. As this test is much cheaper and moves or
20949 zero extensions are the common case, do this check first. */
20950 if (REG_P (operands[1])
20951 || (SImode_address_operand (operands[1], VOIDmode)
20952 && REG_P (XEXP (operands[1], 0))))
20953 return false;
20954
20955 /* Check if it is OK to split here. */
20956 if (!ix86_ok_to_clobber_flags (insn))
20957 return false;
20958
20959 ok = ix86_decompose_address (operands[1], &parts);
20960 gcc_assert (ok);
20961
20962 /* There should be at least two components in the address. */
20963 if ((parts.base != NULL_RTX) + (parts.index != NULL_RTX)
20964 + (parts.disp != NULL_RTX) + (parts.scale > 1) < 2)
20965 return false;
20966
20967 /* We should not split into add if a non-legitimate PIC
20968 operand is used as the displacement.  */
20969 if (parts.disp && flag_pic && !LEGITIMATE_PIC_OPERAND_P (parts.disp))
20970 return false;
20971
20972 regno0 = true_regnum (operands[0]);
20973 regno1 = INVALID_REGNUM;
20974 regno2 = INVALID_REGNUM;
20975
20976 if (parts.base)
20977 regno1 = true_regnum (parts.base);
20978 if (parts.index)
20979 regno2 = true_regnum (parts.index);
20980
20981 split_cost = 0;
20982
20983 /* Compute how many cycles we will add to the execution time
20984 if we split the lea into a sequence of instructions.  */
20985 if (parts.base || parts.index)
20986 {
20987 /* Have to use a mov instruction if the non-destructive
20988 destination form is used.  */
20989 if (regno1 != regno0 && regno2 != regno0)
20990 split_cost += 1;
20991
20992 /* Have to add index to base if both exist. */
20993 if (parts.base && parts.index)
20994 split_cost += 1;
20995
20996 /* Have to use shift and adds if scale is 2 or greater. */
20997 if (parts.scale > 1)
20998 {
20999 if (regno0 != regno1)
21000 split_cost += 1;
21001 else if (regno2 == regno0)
21002 split_cost += 4;
21003 else
21004 split_cost += parts.scale;
21005 }
21006
21007 /* Have to use an add instruction with an immediate if
21008 disp is nonzero.  */
21009 if (parts.disp && parts.disp != const0_rtx)
21010 split_cost += 1;
21011
21012 /* Subtract the price of lea. */
21013 split_cost -= 1;
21014 }
21015
21016 return !ix86_lea_outperforms (insn, regno0, regno1, regno2, split_cost,
21017 parts.scale > 1);
21018 }
21019
21020 /* Emit x86 binary operator CODE in mode MODE, where the first operand
21021 matches the destination.  The emitted RTX includes a clobber of FLAGS_REG.  */
21022
21023 static void
21024 ix86_emit_binop (enum rtx_code code, machine_mode mode,
21025 rtx dst, rtx src)
21026 {
21027 rtx op, clob;
21028
21029 op = gen_rtx_SET (dst, gen_rtx_fmt_ee (code, mode, dst, src));
21030 clob = gen_rtx_CLOBBER (VOIDmode, gen_rtx_REG (CCmode, FLAGS_REG));
21031
21032 emit_insn (gen_rtx_PARALLEL (VOIDmode, gen_rtvec (2, op, clob)));
21033 }
21034
21035 /* Return true if the REGNO1 definition is nearest to INSN.  */
21036
21037 static bool
21038 find_nearest_reg_def (rtx_insn *insn, int regno1, int regno2)
21039 {
21040 rtx_insn *prev = insn;
21041 rtx_insn *start = BB_HEAD (BLOCK_FOR_INSN (insn));
21042
21043 if (insn == start)
21044 return false;
21045 while (prev && prev != start)
21046 {
21047 if (!INSN_P (prev) || !NONDEBUG_INSN_P (prev))
21048 {
21049 prev = PREV_INSN (prev);
21050 continue;
21051 }
21052 if (insn_defines_reg (regno1, INVALID_REGNUM, prev))
21053 return true;
21054 else if (insn_defines_reg (regno2, INVALID_REGNUM, prev))
21055 return false;
21056 prev = PREV_INSN (prev);
21057 }
21058
21059 /* None of the regs is defined in the bb. */
21060 return false;
21061 }
21062
21063 /* Split lea instructions into a sequence of instructions
21064 which are executed on the ALU to avoid AGU stalls.
21065 It is assumed that it is allowed to clobber the flags register
21066 at the lea position.  */
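/* For illustration (register names here are arbitrary), an insn such as
     lea 16(%rbx,%rcx,4), %rax
   with the destination distinct from base and index is split along the
   lines of
     mov %rcx, %rax
     sal $2, %rax
     add %rbx, %rax
     add $16, %rax
   which matches the scale/base/disp handling below.  */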
21067
21068 void
21069 ix86_split_lea_for_addr (rtx_insn *insn, rtx operands[], machine_mode mode)
21070 {
21071 unsigned int regno0, regno1, regno2;
21072 struct ix86_address parts;
21073 rtx target, tmp;
21074 int ok, adds;
21075
21076 ok = ix86_decompose_address (operands[1], &parts);
21077 gcc_assert (ok);
21078
21079 target = gen_lowpart (mode, operands[0]);
21080
21081 regno0 = true_regnum (target);
21082 regno1 = INVALID_REGNUM;
21083 regno2 = INVALID_REGNUM;
21084
21085 if (parts.base)
21086 {
21087 parts.base = gen_lowpart (mode, parts.base);
21088 regno1 = true_regnum (parts.base);
21089 }
21090
21091 if (parts.index)
21092 {
21093 parts.index = gen_lowpart (mode, parts.index);
21094 regno2 = true_regnum (parts.index);
21095 }
21096
21097 if (parts.disp)
21098 parts.disp = gen_lowpart (mode, parts.disp);
21099
21100 if (parts.scale > 1)
21101 {
21102 /* Case r1 = r1 + ... */
21103 if (regno1 == regno0)
21104 {
21105 /* If we have a case r1 = r1 + C * r2 then we
21106 would have to use multiplication, which is very
21107 expensive.  Assume the cost model is wrong if we
21108 reach such a case here.  */
21109 gcc_assert (regno2 != regno0);
21110
21111 for (adds = parts.scale; adds > 0; adds--)
21112 ix86_emit_binop (PLUS, mode, target, parts.index);
21113 }
21114 else
21115 {
21116 /* r1 = r2 + r3 * C case. Need to move r3 into r1. */
21117 if (regno0 != regno2)
21118 emit_insn (gen_rtx_SET (target, parts.index));
21119
21120 /* Use shift for scaling. */
21121 ix86_emit_binop (ASHIFT, mode, target,
21122 GEN_INT (exact_log2 (parts.scale)));
21123
21124 if (parts.base)
21125 ix86_emit_binop (PLUS, mode, target, parts.base);
21126
21127 if (parts.disp && parts.disp != const0_rtx)
21128 ix86_emit_binop (PLUS, mode, target, parts.disp);
21129 }
21130 }
21131 else if (!parts.base && !parts.index)
21132 {
21133 gcc_assert(parts.disp);
21134 emit_insn (gen_rtx_SET (target, parts.disp));
21135 }
21136 else
21137 {
21138 if (!parts.base)
21139 {
21140 if (regno0 != regno2)
21141 emit_insn (gen_rtx_SET (target, parts.index));
21142 }
21143 else if (!parts.index)
21144 {
21145 if (regno0 != regno1)
21146 emit_insn (gen_rtx_SET (target, parts.base));
21147 }
21148 else
21149 {
21150 if (regno0 == regno1)
21151 tmp = parts.index;
21152 else if (regno0 == regno2)
21153 tmp = parts.base;
21154 else
21155 {
21156 rtx tmp1;
21157
21158 /* Find the better operand for the SET instruction, depending
21159 on which definition is farther from the insn.  */
21160 if (find_nearest_reg_def (insn, regno1, regno2))
21161 tmp = parts.index, tmp1 = parts.base;
21162 else
21163 tmp = parts.base, tmp1 = parts.index;
21164
21165 emit_insn (gen_rtx_SET (target, tmp));
21166
21167 if (parts.disp && parts.disp != const0_rtx)
21168 ix86_emit_binop (PLUS, mode, target, parts.disp);
21169
21170 ix86_emit_binop (PLUS, mode, target, tmp1);
21171 return;
21172 }
21173
21174 ix86_emit_binop (PLUS, mode, target, tmp);
21175 }
21176
21177 if (parts.disp && parts.disp != const0_rtx)
21178 ix86_emit_binop (PLUS, mode, target, parts.disp);
21179 }
21180 }
21181
21182 /* Return true if it is ok to optimize an ADD operation to an LEA
21183 operation to avoid flag register consumption.  For most processors,
21184 ADD is faster than LEA.  For processors like BONNELL, if the
21185 destination register of the LEA holds an actual address which will be
21186 used soon, LEA is better; otherwise ADD is better.  */
21187
21188 bool
21189 ix86_lea_for_add_ok (rtx_insn *insn, rtx operands[])
21190 {
21191 unsigned int regno0 = true_regnum (operands[0]);
21192 unsigned int regno1 = true_regnum (operands[1]);
21193 unsigned int regno2 = true_regnum (operands[2]);
21194
21195 /* If a = b + c, (a!=b && a!=c), must use lea form. */
21196 if (regno0 != regno1 && regno0 != regno2)
21197 return true;
21198
21199 if (!TARGET_OPT_AGU || optimize_function_for_size_p (cfun))
21200 return false;
21201
21202 return ix86_lea_outperforms (insn, regno0, regno1, regno2, 0, false);
21203 }
21204
21205 /* Return true if destination reg of SET_BODY is shift count of
21206 USE_BODY. */
21207
21208 static bool
21209 ix86_dep_by_shift_count_body (const_rtx set_body, const_rtx use_body)
21210 {
21211 rtx set_dest;
21212 rtx shift_rtx;
21213 int i;
21214
21215 /* Retrieve destination of SET_BODY. */
21216 switch (GET_CODE (set_body))
21217 {
21218 case SET:
21219 set_dest = SET_DEST (set_body);
21220 if (!set_dest || !REG_P (set_dest))
21221 return false;
21222 break;
21223 case PARALLEL:
21224 for (i = XVECLEN (set_body, 0) - 1; i >= 0; i--)
21225 if (ix86_dep_by_shift_count_body (XVECEXP (set_body, 0, i),
21226 use_body))
21227 return true;
21228 /* FALLTHROUGH */
21229 default:
21230 return false;
21231 }
21232
21233 /* Retrieve shift count of USE_BODY. */
21234 switch (GET_CODE (use_body))
21235 {
21236 case SET:
21237 shift_rtx = XEXP (use_body, 1);
21238 break;
21239 case PARALLEL:
21240 for (i = XVECLEN (use_body, 0) - 1; i >= 0; i--)
21241 if (ix86_dep_by_shift_count_body (set_body,
21242 XVECEXP (use_body, 0, i)))
21243 return true;
21244 /* FALLTHROUGH */
21245 default:
21246 return false;
21247 }
21248
21249 if (shift_rtx
21250 && (GET_CODE (shift_rtx) == ASHIFT
21251 || GET_CODE (shift_rtx) == LSHIFTRT
21252 || GET_CODE (shift_rtx) == ASHIFTRT
21253 || GET_CODE (shift_rtx) == ROTATE
21254 || GET_CODE (shift_rtx) == ROTATERT))
21255 {
21256 rtx shift_count = XEXP (shift_rtx, 1);
21257
21258 /* Return true if shift count is dest of SET_BODY. */
21259 if (REG_P (shift_count))
21260 {
21261 /* Add a check since this can be invoked before register
21262 allocation in the pre-reload scheduler.  */
21263 if (reload_completed
21264 && true_regnum (set_dest) == true_regnum (shift_count))
21265 return true;
21266 else if (REGNO(set_dest) == REGNO(shift_count))
21267 return true;
21268 }
21269 }
21270
21271 return false;
21272 }
21273
21274 /* Return true if destination reg of SET_INSN is shift count of
21275 USE_INSN. */
21276
21277 bool
21278 ix86_dep_by_shift_count (const_rtx set_insn, const_rtx use_insn)
21279 {
21280 return ix86_dep_by_shift_count_body (PATTERN (set_insn),
21281 PATTERN (use_insn));
21282 }
21283
21284 /* Return TRUE or FALSE depending on whether the unary operator meets the
21285 appropriate constraints. */
21286
21287 bool
21288 ix86_unary_operator_ok (enum rtx_code,
21289 machine_mode,
21290 rtx operands[2])
21291 {
21292 /* If one of the operands is memory, the source and destination must match.  */
21293 if ((MEM_P (operands[0])
21294 || MEM_P (operands[1]))
21295 && ! rtx_equal_p (operands[0], operands[1]))
21296 return false;
21297 return true;
21298 }
21299
21300 /* Return TRUE if the operands to a vec_interleave_{high,low}v2df
21301 are ok, keeping in mind the possible movddup alternative. */
21302
21303 bool
21304 ix86_vec_interleave_v2df_operator_ok (rtx operands[3], bool high)
21305 {
21306 if (MEM_P (operands[0]))
21307 return rtx_equal_p (operands[0], operands[1 + high]);
21308 if (MEM_P (operands[1]) && MEM_P (operands[2]))
21309 return TARGET_SSE3 && rtx_equal_p (operands[1], operands[2]);
21310 return true;
21311 }
21312
21313 /* Post-reload splitter for converting an SF or DFmode value in an
21314 SSE register into an unsigned SImode. */
21315
21316 void
21317 ix86_split_convert_uns_si_sse (rtx operands[])
21318 {
21319 machine_mode vecmode;
21320 rtx value, large, zero_or_two31, input, two31, x;
21321
21322 large = operands[1];
21323 zero_or_two31 = operands[2];
21324 input = operands[3];
21325 two31 = operands[4];
21326 vecmode = GET_MODE (large);
21327 value = gen_rtx_REG (vecmode, REGNO (operands[0]));
21328
21329 /* Load up the value into the low element. We must ensure that the other
21330 elements are valid floats -- zero is the easiest such value. */
21331 if (MEM_P (input))
21332 {
21333 if (vecmode == V4SFmode)
21334 emit_insn (gen_vec_setv4sf_0 (value, CONST0_RTX (V4SFmode), input));
21335 else
21336 emit_insn (gen_sse2_loadlpd (value, CONST0_RTX (V2DFmode), input));
21337 }
21338 else
21339 {
21340 input = gen_rtx_REG (vecmode, REGNO (input));
21341 emit_move_insn (value, CONST0_RTX (vecmode));
21342 if (vecmode == V4SFmode)
21343 emit_insn (gen_sse_movss (value, value, input));
21344 else
21345 emit_insn (gen_sse2_movsd (value, value, input));
21346 }
21347
21348 emit_move_insn (large, two31);
21349 emit_move_insn (zero_or_two31, MEM_P (two31) ? large : two31);
21350
21351 x = gen_rtx_fmt_ee (LE, vecmode, large, value);
21352 emit_insn (gen_rtx_SET (large, x));
21353
21354 x = gen_rtx_AND (vecmode, zero_or_two31, large);
21355 emit_insn (gen_rtx_SET (zero_or_two31, x));
21356
21357 x = gen_rtx_MINUS (vecmode, value, zero_or_two31);
21358 emit_insn (gen_rtx_SET (value, x));
21359
21360 large = gen_rtx_REG (V4SImode, REGNO (large));
21361 emit_insn (gen_ashlv4si3 (large, large, GEN_INT (31)));
21362
21363 x = gen_rtx_REG (V4SImode, REGNO (value));
21364 if (vecmode == V4SFmode)
21365 emit_insn (gen_fix_truncv4sfv4si2 (x, value));
21366 else
21367 emit_insn (gen_sse2_cvttpd2dq (x, value));
21368 value = x;
21369
21370 emit_insn (gen_xorv4si3 (value, value, large));
21371 }
21372
21373 /* Convert an unsigned DImode value into a DFmode, using only SSE.
21374 Expects the 64-bit DImode to be supplied in a pair of integral
21375 registers. Requires SSE2; will use SSE3 if available. For x86_32,
21376 -mfpmath=sse, !optimize_size only. */
21377
21378 void
21379 ix86_expand_convert_uns_didf_sse (rtx target, rtx input)
21380 {
21381 REAL_VALUE_TYPE bias_lo_rvt, bias_hi_rvt;
21382 rtx int_xmm, fp_xmm;
21383 rtx biases, exponents;
21384 rtx x;
21385
21386 int_xmm = gen_reg_rtx (V4SImode);
21387 if (TARGET_INTER_UNIT_MOVES_TO_VEC)
21388 emit_insn (gen_movdi_to_sse (int_xmm, input));
21389 else if (TARGET_SSE_SPLIT_REGS)
21390 {
21391 emit_clobber (int_xmm);
21392 emit_move_insn (gen_lowpart (DImode, int_xmm), input);
21393 }
21394 else
21395 {
21396 x = gen_reg_rtx (V2DImode);
21397 ix86_expand_vector_init_one_nonzero (false, V2DImode, x, input, 0);
21398 emit_move_insn (int_xmm, gen_lowpart (V4SImode, x));
21399 }
21400
21401 x = gen_rtx_CONST_VECTOR (V4SImode,
21402 gen_rtvec (4, GEN_INT (0x43300000UL),
21403 GEN_INT (0x45300000UL),
21404 const0_rtx, const0_rtx));
21405 exponents = validize_mem (force_const_mem (V4SImode, x));
21406
21407 /* int_xmm = {0x45300000UL, fp_xmm/hi, 0x43300000, fp_xmm/lo } */
21408 emit_insn (gen_vec_interleave_lowv4si (int_xmm, int_xmm, exponents));
21409
21410 /* Concatenating (juxtaposing) (0x43300000UL ## fp_value_lo_xmm)
21411 yields a valid DF value equal to (0x1.0p52 + double(fp_value_lo_xmm)).
21412 Similarly (0x45300000UL ## fp_value_hi_xmm) yields
21413 (0x1.0p84 + double(fp_value_hi_xmm)).
21414 Note these exponents differ by 32. */
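/* For reference (sketch, not part of the expansion): 0x43300000 and
   0x45300000 are the high words of the doubles 0x1.0p52 and 0x1.0p84,
   so gluing a 32-bit half onto them builds, bit for bit,

     0x1.0p52 + (double) lo32          and
     0x1.0p84 + (double) hi32 * 0x1.0p32

   exactly; after the bias subtraction below the two lanes hold the exact
   low and high contributions, and only the final addition rounds.  */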
21415
21416 fp_xmm = copy_to_mode_reg (V2DFmode, gen_lowpart (V2DFmode, int_xmm));
21417
21418 /* Subtract off those 0x1.0p52 and 0x1.0p84 biases, to produce values
21419 in [0,2**32-1] and [0]+[2**32,2**64-1] respectively. */
21420 real_ldexp (&bias_lo_rvt, &dconst1, 52);
21421 real_ldexp (&bias_hi_rvt, &dconst1, 84);
21422 biases = const_double_from_real_value (bias_lo_rvt, DFmode);
21423 x = const_double_from_real_value (bias_hi_rvt, DFmode);
21424 biases = gen_rtx_CONST_VECTOR (V2DFmode, gen_rtvec (2, biases, x));
21425 biases = validize_mem (force_const_mem (V2DFmode, biases));
21426 emit_insn (gen_subv2df3 (fp_xmm, fp_xmm, biases));
21427
21428 /* Add the upper and lower DFmode values together. */
21429 if (TARGET_SSE3)
21430 emit_insn (gen_sse3_haddv2df3 (fp_xmm, fp_xmm, fp_xmm));
21431 else
21432 {
21433 x = copy_to_mode_reg (V2DFmode, fp_xmm);
21434 emit_insn (gen_vec_interleave_highv2df (fp_xmm, fp_xmm, fp_xmm));
21435 emit_insn (gen_addv2df3 (fp_xmm, fp_xmm, x));
21436 }
21437
21438 ix86_expand_vector_extract (false, target, fp_xmm, 0);
21439 }
21440
21441 /* Not used, but eases macroization of patterns. */
21442 void
21443 ix86_expand_convert_uns_sixf_sse (rtx, rtx)
21444 {
21445 gcc_unreachable ();
21446 }
21447
21448 /* Convert an unsigned SImode value into a DFmode. Only currently used
21449 for SSE, but applicable anywhere. */
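/* A minimal scalar sketch of the trick (illustration only, u being a
   hypothetical unsigned 32-bit input): subtracting 2^31 modulo 2^32 maps
   [0, 2^32) onto [-2^31, 2^31), which the signed cvtsi2sd handles, and
   adding 2^31.0 back is exact in DFmode:

     double f = (double) (int) (u - 0x80000000u);
     double r = f + 0x1.0p31;
*/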
21450
21451 void
21452 ix86_expand_convert_uns_sidf_sse (rtx target, rtx input)
21453 {
21454 REAL_VALUE_TYPE TWO31r;
21455 rtx x, fp;
21456
21457 x = expand_simple_binop (SImode, PLUS, input, GEN_INT (-2147483647 - 1),
21458 NULL, 1, OPTAB_DIRECT);
21459
21460 fp = gen_reg_rtx (DFmode);
21461 emit_insn (gen_floatsidf2 (fp, x));
21462
21463 real_ldexp (&TWO31r, &dconst1, 31);
21464 x = const_double_from_real_value (TWO31r, DFmode);
21465
21466 x = expand_simple_binop (DFmode, PLUS, fp, x, target, 0, OPTAB_DIRECT);
21467 if (x != target)
21468 emit_move_insn (target, x);
21469 }
21470
21471 /* Convert a signed DImode value into a DFmode. Only used for SSE in
21472 32-bit mode; otherwise we have a direct convert instruction. */
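/* Scalar sketch of the computation below (illustration only), for a
   hypothetical signed 64-bit input x:

     double hi = (double) (int) (x >> 32) * 0x1.0p32;   (exact scaling)
     double lo = (double) (unsigned) x;                 (via the unsigned SImode helper)
     double r  = hi + lo;
*/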
21473
21474 void
21475 ix86_expand_convert_sign_didf_sse (rtx target, rtx input)
21476 {
21477 REAL_VALUE_TYPE TWO32r;
21478 rtx fp_lo, fp_hi, x;
21479
21480 fp_lo = gen_reg_rtx (DFmode);
21481 fp_hi = gen_reg_rtx (DFmode);
21482
21483 emit_insn (gen_floatsidf2 (fp_hi, gen_highpart (SImode, input)));
21484
21485 real_ldexp (&TWO32r, &dconst1, 32);
21486 x = const_double_from_real_value (TWO32r, DFmode);
21487 fp_hi = expand_simple_binop (DFmode, MULT, fp_hi, x, fp_hi, 0, OPTAB_DIRECT);
21488
21489 ix86_expand_convert_uns_sidf_sse (fp_lo, gen_lowpart (SImode, input));
21490
21491 x = expand_simple_binop (DFmode, PLUS, fp_hi, fp_lo, target,
21492 0, OPTAB_DIRECT);
21493 if (x != target)
21494 emit_move_insn (target, x);
21495 }
21496
21497 /* Convert an unsigned SImode value into a SFmode, using only SSE.
21498 For x86_32, -mfpmath=sse, !optimize_size only. */
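/* Illustrative scalar equivalent (sketch only): split the 32-bit value into
   16-bit halves, both of which convert exactly through signed SImode:

     float lo = (float) (int) (u & 0xffff);
     float hi = (float) (int) (u >> 16);
     float r  = hi * 0x1.0p16f + lo;

   The multiply by 2^16 is exact; only the final addition rounds.  */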
21499 void
21500 ix86_expand_convert_uns_sisf_sse (rtx target, rtx input)
21501 {
21502 REAL_VALUE_TYPE ONE16r;
21503 rtx fp_hi, fp_lo, int_hi, int_lo, x;
21504
21505 real_ldexp (&ONE16r, &dconst1, 16);
21506 x = const_double_from_real_value (ONE16r, SFmode);
21507 int_lo = expand_simple_binop (SImode, AND, input, GEN_INT (0xffff),
21508 NULL, 0, OPTAB_DIRECT);
21509 int_hi = expand_simple_binop (SImode, LSHIFTRT, input, GEN_INT (16),
21510 NULL, 0, OPTAB_DIRECT);
21511 fp_hi = gen_reg_rtx (SFmode);
21512 fp_lo = gen_reg_rtx (SFmode);
21513 emit_insn (gen_floatsisf2 (fp_hi, int_hi));
21514 emit_insn (gen_floatsisf2 (fp_lo, int_lo));
21515 fp_hi = expand_simple_binop (SFmode, MULT, fp_hi, x, fp_hi,
21516 0, OPTAB_DIRECT);
21517 fp_hi = expand_simple_binop (SFmode, PLUS, fp_hi, fp_lo, target,
21518 0, OPTAB_DIRECT);
21519 if (!rtx_equal_p (target, fp_hi))
21520 emit_move_insn (target, fp_hi);
21521 }
21522
21523 /* floatunsv{4,8}siv{4,8}sf2 expander. Expand code to convert
21524 a vector of unsigned ints VAL to vector of floats TARGET. */
21525
21526 void
21527 ix86_expand_vector_convert_uns_vsivsf (rtx target, rtx val)
21528 {
21529 rtx tmp[8];
21530 REAL_VALUE_TYPE TWO16r;
21531 machine_mode intmode = GET_MODE (val);
21532 machine_mode fltmode = GET_MODE (target);
21533 rtx (*cvt) (rtx, rtx);
21534
21535 if (intmode == V4SImode)
21536 cvt = gen_floatv4siv4sf2;
21537 else
21538 cvt = gen_floatv8siv8sf2;
21539 tmp[0] = ix86_build_const_vector (intmode, 1, GEN_INT (0xffff));
21540 tmp[0] = force_reg (intmode, tmp[0]);
21541 tmp[1] = expand_simple_binop (intmode, AND, val, tmp[0], NULL_RTX, 1,
21542 OPTAB_DIRECT);
21543 tmp[2] = expand_simple_binop (intmode, LSHIFTRT, val, GEN_INT (16),
21544 NULL_RTX, 1, OPTAB_DIRECT);
21545 tmp[3] = gen_reg_rtx (fltmode);
21546 emit_insn (cvt (tmp[3], tmp[1]));
21547 tmp[4] = gen_reg_rtx (fltmode);
21548 emit_insn (cvt (tmp[4], tmp[2]));
21549 real_ldexp (&TWO16r, &dconst1, 16);
21550 tmp[5] = const_double_from_real_value (TWO16r, SFmode);
21551 tmp[5] = force_reg (fltmode, ix86_build_const_vector (fltmode, 1, tmp[5]));
21552 tmp[6] = expand_simple_binop (fltmode, MULT, tmp[4], tmp[5], NULL_RTX, 1,
21553 OPTAB_DIRECT);
21554 tmp[7] = expand_simple_binop (fltmode, PLUS, tmp[3], tmp[6], target, 1,
21555 OPTAB_DIRECT);
21556 if (tmp[7] != target)
21557 emit_move_insn (target, tmp[7]);
21558 }
21559
21560 /* Adjust a V*SFmode/V*DFmode value VAL so that *sfix_trunc* resp. fix_trunc*
21561 pattern can be used on it instead of *ufix_trunc* resp. fixuns_trunc*.
21562 This is done by doing just signed conversion if < 0x1p31, and otherwise by
21563 subtracting 0x1p31 first and xoring in 0x80000000 from *XORP afterwards. */
21564
21565 rtx
21566 ix86_expand_adjust_ufix_to_sfix_si (rtx val, rtx *xorp)
21567 {
21568 REAL_VALUE_TYPE TWO31r;
21569 rtx two31r, tmp[4];
21570 machine_mode mode = GET_MODE (val);
21571 machine_mode scalarmode = GET_MODE_INNER (mode);
21572 machine_mode intmode = GET_MODE_SIZE (mode) == 32 ? V8SImode : V4SImode;
21573 rtx (*cmp) (rtx, rtx, rtx, rtx);
21574 int i;
21575
21576 for (i = 0; i < 3; i++)
21577 tmp[i] = gen_reg_rtx (mode);
21578 real_ldexp (&TWO31r, &dconst1, 31);
21579 two31r = const_double_from_real_value (TWO31r, scalarmode);
21580 two31r = ix86_build_const_vector (mode, 1, two31r);
21581 two31r = force_reg (mode, two31r);
21582 switch (mode)
21583 {
21584 case V8SFmode: cmp = gen_avx_maskcmpv8sf3; break;
21585 case V4SFmode: cmp = gen_sse_maskcmpv4sf3; break;
21586 case V4DFmode: cmp = gen_avx_maskcmpv4df3; break;
21587 case V2DFmode: cmp = gen_sse2_maskcmpv2df3; break;
21588 default: gcc_unreachable ();
21589 }
21590 tmp[3] = gen_rtx_LE (mode, two31r, val);
21591 emit_insn (cmp (tmp[0], two31r, val, tmp[3]));
21592 tmp[1] = expand_simple_binop (mode, AND, tmp[0], two31r, tmp[1],
21593 0, OPTAB_DIRECT);
21594 if (intmode == V4SImode || TARGET_AVX2)
21595 *xorp = expand_simple_binop (intmode, ASHIFT,
21596 gen_lowpart (intmode, tmp[0]),
21597 GEN_INT (31), NULL_RTX, 0,
21598 OPTAB_DIRECT);
21599 else
21600 {
21601 rtx two31 = GEN_INT (HOST_WIDE_INT_1U << 31);
21602 two31 = ix86_build_const_vector (intmode, 1, two31);
21603 *xorp = expand_simple_binop (intmode, AND,
21604 gen_lowpart (intmode, tmp[0]),
21605 two31, NULL_RTX, 0,
21606 OPTAB_DIRECT);
21607 }
21608 return expand_simple_binop (mode, MINUS, val, tmp[1], tmp[2],
21609 0, OPTAB_DIRECT);
21610 }
21611
21612 /* A subroutine of ix86_build_signbit_mask. If VECT is true,
21613 then replicate the value for all elements of the vector
21614 register. */
21615
21616 rtx
21617 ix86_build_const_vector (machine_mode mode, bool vect, rtx value)
21618 {
21619 int i, n_elt;
21620 rtvec v;
21621 machine_mode scalar_mode;
21622
21623 switch (mode)
21624 {
21625 case V64QImode:
21626 case V32QImode:
21627 case V16QImode:
21628 case V32HImode:
21629 case V16HImode:
21630 case V8HImode:
21631 case V16SImode:
21632 case V8SImode:
21633 case V4SImode:
21634 case V8DImode:
21635 case V4DImode:
21636 case V2DImode:
21637 gcc_assert (vect);
21638 /* FALLTHRU */
21639 case V16SFmode:
21640 case V8SFmode:
21641 case V4SFmode:
21642 case V8DFmode:
21643 case V4DFmode:
21644 case V2DFmode:
21645 n_elt = GET_MODE_NUNITS (mode);
21646 v = rtvec_alloc (n_elt);
21647 scalar_mode = GET_MODE_INNER (mode);
21648
21649 RTVEC_ELT (v, 0) = value;
21650
21651 for (i = 1; i < n_elt; ++i)
21652 RTVEC_ELT (v, i) = vect ? value : CONST0_RTX (scalar_mode);
21653
21654 return gen_rtx_CONST_VECTOR (mode, v);
21655
21656 default:
21657 gcc_unreachable ();
21658 }
21659 }
21660
21661 /* A subroutine of ix86_expand_fp_absneg_operator, copysign expanders
21662 and ix86_expand_int_vcond. Create a mask for the sign bit in MODE
21663 for an SSE register. If VECT is true, then replicate the mask for
21664 all elements of the vector register. If INVERT is true, then create
21665 a mask excluding the sign bit. */
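/* For example (sketch): ix86_build_signbit_mask (V4SFmode, true, false)
   yields the vector constant { 0x80000000, 0x80000000, 0x80000000,
   0x80000000 } viewed as V4SFmode, while passing INVERT gives
   { 0x7fffffff, ... }; the former flips signs with XORPS, the latter
   clears them with ANDPS.  */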
21666
21667 rtx
21668 ix86_build_signbit_mask (machine_mode mode, bool vect, bool invert)
21669 {
21670 machine_mode vec_mode, imode;
21671 wide_int w;
21672 rtx mask, v;
21673
21674 switch (mode)
21675 {
21676 case V16SImode:
21677 case V16SFmode:
21678 case V8SImode:
21679 case V4SImode:
21680 case V8SFmode:
21681 case V4SFmode:
21682 vec_mode = mode;
21683 imode = SImode;
21684 break;
21685
21686 case V8DImode:
21687 case V4DImode:
21688 case V2DImode:
21689 case V8DFmode:
21690 case V4DFmode:
21691 case V2DFmode:
21692 vec_mode = mode;
21693 imode = DImode;
21694 break;
21695
21696 case TImode:
21697 case TFmode:
21698 vec_mode = VOIDmode;
21699 imode = TImode;
21700 break;
21701
21702 default:
21703 gcc_unreachable ();
21704 }
21705
21706 machine_mode inner_mode = GET_MODE_INNER (mode);
21707 w = wi::set_bit_in_zero (GET_MODE_BITSIZE (inner_mode) - 1,
21708 GET_MODE_BITSIZE (inner_mode));
21709 if (invert)
21710 w = wi::bit_not (w);
21711
21712 /* Force this value into the low part of a fp vector constant. */
21713 mask = immed_wide_int_const (w, imode);
21714 mask = gen_lowpart (inner_mode, mask);
21715
21716 if (vec_mode == VOIDmode)
21717 return force_reg (inner_mode, mask);
21718
21719 v = ix86_build_const_vector (vec_mode, vect, mask);
21720 return force_reg (vec_mode, v);
21721 }
21722
21723 /* Generate code for floating point ABS or NEG. */
21724
21725 void
21726 ix86_expand_fp_absneg_operator (enum rtx_code code, machine_mode mode,
21727 rtx operands[])
21728 {
21729 rtx mask, set, dst, src;
21730 bool use_sse = false;
21731 bool vector_mode = VECTOR_MODE_P (mode);
21732 machine_mode vmode = mode;
21733
21734 if (vector_mode)
21735 use_sse = true;
21736 else if (mode == TFmode)
21737 use_sse = true;
21738 else if (TARGET_SSE_MATH)
21739 {
21740 use_sse = SSE_FLOAT_MODE_P (mode);
21741 if (mode == SFmode)
21742 vmode = V4SFmode;
21743 else if (mode == DFmode)
21744 vmode = V2DFmode;
21745 }
21746
21747 /* NEG and ABS performed with SSE use bitwise mask operations.
21748 Create the appropriate mask now. */
21749 if (use_sse)
21750 mask = ix86_build_signbit_mask (vmode, vector_mode, code == ABS);
21751 else
21752 mask = NULL_RTX;
21753
21754 dst = operands[0];
21755 src = operands[1];
21756
21757 set = gen_rtx_fmt_e (code, mode, src);
21758 set = gen_rtx_SET (dst, set);
21759
21760 if (mask)
21761 {
21762 rtx use, clob;
21763 rtvec par;
21764
21765 use = gen_rtx_USE (VOIDmode, mask);
21766 if (vector_mode)
21767 par = gen_rtvec (2, set, use);
21768 else
21769 {
21770 clob = gen_rtx_CLOBBER (VOIDmode, gen_rtx_REG (CCmode, FLAGS_REG));
21771 par = gen_rtvec (3, set, use, clob);
21772 }
21773 emit_insn (gen_rtx_PARALLEL (VOIDmode, par));
21774 }
21775 else
21776 emit_insn (set);
21777 }
21778
21779 /* Expand a copysign operation. Special case operand 0 being a constant. */
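/* Illustrative identity (sketch): with MASK holding only the sign bit of
   each element, copysign is expanded as

     result = (|op0| & ~MASK) | (op1 & MASK)

   and when op0 is a nonnegative constant the first AND disappears; see
   ix86_split_copysign_const and ix86_split_copysign_var below.  */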
21780
21781 void
21782 ix86_expand_copysign (rtx operands[])
21783 {
21784 machine_mode mode, vmode;
21785 rtx dest, op0, op1, mask, nmask;
21786
21787 dest = operands[0];
21788 op0 = operands[1];
21789 op1 = operands[2];
21790
21791 mode = GET_MODE (dest);
21792
21793 if (mode == SFmode)
21794 vmode = V4SFmode;
21795 else if (mode == DFmode)
21796 vmode = V2DFmode;
21797 else
21798 vmode = mode;
21799
21800 if (CONST_DOUBLE_P (op0))
21801 {
21802 rtx (*copysign_insn)(rtx, rtx, rtx, rtx);
21803
21804 if (real_isneg (CONST_DOUBLE_REAL_VALUE (op0)))
21805 op0 = simplify_unary_operation (ABS, mode, op0, mode);
21806
21807 if (mode == SFmode || mode == DFmode)
21808 {
21809 if (op0 == CONST0_RTX (mode))
21810 op0 = CONST0_RTX (vmode);
21811 else
21812 {
21813 rtx v = ix86_build_const_vector (vmode, false, op0);
21814
21815 op0 = force_reg (vmode, v);
21816 }
21817 }
21818 else if (op0 != CONST0_RTX (mode))
21819 op0 = force_reg (mode, op0);
21820
21821 mask = ix86_build_signbit_mask (vmode, 0, 0);
21822
21823 if (mode == SFmode)
21824 copysign_insn = gen_copysignsf3_const;
21825 else if (mode == DFmode)
21826 copysign_insn = gen_copysigndf3_const;
21827 else
21828 copysign_insn = gen_copysigntf3_const;
21829
21830 emit_insn (copysign_insn (dest, op0, op1, mask));
21831 }
21832 else
21833 {
21834 rtx (*copysign_insn)(rtx, rtx, rtx, rtx, rtx, rtx);
21835
21836 nmask = ix86_build_signbit_mask (vmode, 0, 1);
21837 mask = ix86_build_signbit_mask (vmode, 0, 0);
21838
21839 if (mode == SFmode)
21840 copysign_insn = gen_copysignsf3_var;
21841 else if (mode == DFmode)
21842 copysign_insn = gen_copysigndf3_var;
21843 else
21844 copysign_insn = gen_copysigntf3_var;
21845
21846 emit_insn (copysign_insn (dest, NULL_RTX, op0, op1, nmask, mask));
21847 }
21848 }
21849
21850 /* Deconstruct a copysign operation into bit masks. Operand 0 is known to
21851 be a constant, and so has already been expanded into a vector constant. */
21852
21853 void
21854 ix86_split_copysign_const (rtx operands[])
21855 {
21856 machine_mode mode, vmode;
21857 rtx dest, op0, mask, x;
21858
21859 dest = operands[0];
21860 op0 = operands[1];
21861 mask = operands[3];
21862
21863 mode = GET_MODE (dest);
21864 vmode = GET_MODE (mask);
21865
21866 dest = lowpart_subreg (vmode, dest, mode);
21867 x = gen_rtx_AND (vmode, dest, mask);
21868 emit_insn (gen_rtx_SET (dest, x));
21869
21870 if (op0 != CONST0_RTX (vmode))
21871 {
21872 x = gen_rtx_IOR (vmode, dest, op0);
21873 emit_insn (gen_rtx_SET (dest, x));
21874 }
21875 }
21876
21877 /* Deconstruct a copysign operation into bit masks. Operand 0 is variable,
21878 so we have to do two masks. */
21879
21880 void
21881 ix86_split_copysign_var (rtx operands[])
21882 {
21883 machine_mode mode, vmode;
21884 rtx dest, scratch, op0, op1, mask, nmask, x;
21885
21886 dest = operands[0];
21887 scratch = operands[1];
21888 op0 = operands[2];
21889 op1 = operands[3];
21890 nmask = operands[4];
21891 mask = operands[5];
21892
21893 mode = GET_MODE (dest);
21894 vmode = GET_MODE (mask);
21895
21896 if (rtx_equal_p (op0, op1))
21897 {
21898 /* Shouldn't happen often (it's useless, obviously), but when it does
21899 we'd generate incorrect code if we continue below. */
21900 emit_move_insn (dest, op0);
21901 return;
21902 }
21903
21904 if (REG_P (mask) && REGNO (dest) == REGNO (mask)) /* alternative 0 */
21905 {
21906 gcc_assert (REGNO (op1) == REGNO (scratch));
21907
21908 x = gen_rtx_AND (vmode, scratch, mask);
21909 emit_insn (gen_rtx_SET (scratch, x));
21910
21911 dest = mask;
21912 op0 = lowpart_subreg (vmode, op0, mode);
21913 x = gen_rtx_NOT (vmode, dest);
21914 x = gen_rtx_AND (vmode, x, op0);
21915 emit_insn (gen_rtx_SET (dest, x));
21916 }
21917 else
21918 {
21919 if (REGNO (op1) == REGNO (scratch)) /* alternative 1,3 */
21920 {
21921 x = gen_rtx_AND (vmode, scratch, mask);
21922 }
21923 else /* alternative 2,4 */
21924 {
21925 gcc_assert (REGNO (mask) == REGNO (scratch));
21926 op1 = lowpart_subreg (vmode, op1, mode);
21927 x = gen_rtx_AND (vmode, scratch, op1);
21928 }
21929 emit_insn (gen_rtx_SET (scratch, x));
21930
21931 if (REGNO (op0) == REGNO (dest)) /* alternative 1,2 */
21932 {
21933 dest = lowpart_subreg (vmode, op0, mode);
21934 x = gen_rtx_AND (vmode, dest, nmask);
21935 }
21936 else /* alternative 3,4 */
21937 {
21938 gcc_assert (REGNO (nmask) == REGNO (dest));
21939 dest = nmask;
21940 op0 = lowpart_subreg (vmode, op0, mode);
21941 x = gen_rtx_AND (vmode, dest, op0);
21942 }
21943 emit_insn (gen_rtx_SET (dest, x));
21944 }
21945
21946 x = gen_rtx_IOR (vmode, dest, scratch);
21947 emit_insn (gen_rtx_SET (dest, x));
21948 }
21949
21950 /* Return TRUE or FALSE depending on whether the first SET in INSN
21951 has source and destination with matching CC modes, and that the
21952 CC mode is at least as constrained as REQ_MODE. */
21953
21954 bool
21955 ix86_match_ccmode (rtx insn, machine_mode req_mode)
21956 {
21957 rtx set;
21958 machine_mode set_mode;
21959
21960 set = PATTERN (insn);
21961 if (GET_CODE (set) == PARALLEL)
21962 set = XVECEXP (set, 0, 0);
21963 gcc_assert (GET_CODE (set) == SET);
21964 gcc_assert (GET_CODE (SET_SRC (set)) == COMPARE);
21965
21966 set_mode = GET_MODE (SET_DEST (set));
21967 switch (set_mode)
21968 {
21969 case CCNOmode:
21970 if (req_mode != CCNOmode
21971 && (req_mode != CCmode
21972 || XEXP (SET_SRC (set), 1) != const0_rtx))
21973 return false;
21974 break;
21975 case CCmode:
21976 if (req_mode == CCGCmode)
21977 return false;
21978 /* FALLTHRU */
21979 case CCGCmode:
21980 if (req_mode == CCGOCmode || req_mode == CCNOmode)
21981 return false;
21982 /* FALLTHRU */
21983 case CCGOCmode:
21984 if (req_mode == CCZmode)
21985 return false;
21986 /* FALLTHRU */
21987 case CCZmode:
21988 break;
21989
21990 case CCAmode:
21991 case CCCmode:
21992 case CCOmode:
21993 case CCPmode:
21994 case CCSmode:
21995 if (set_mode != req_mode)
21996 return false;
21997 break;
21998
21999 default:
22000 gcc_unreachable ();
22001 }
22002
22003 return GET_MODE (SET_SRC (set)) == set_mode;
22004 }
22005
22006 /* Generate insn patterns to do an integer compare of OPERANDS. */
22007
22008 static rtx
22009 ix86_expand_int_compare (enum rtx_code code, rtx op0, rtx op1)
22010 {
22011 machine_mode cmpmode;
22012 rtx tmp, flags;
22013
22014 cmpmode = SELECT_CC_MODE (code, op0, op1);
22015 flags = gen_rtx_REG (cmpmode, FLAGS_REG);
22016
22017 /* This is very simple, but making the interface the same as in the
22018 FP case makes the rest of the code easier. */
22019 tmp = gen_rtx_COMPARE (cmpmode, op0, op1);
22020 emit_insn (gen_rtx_SET (flags, tmp));
22021
22022 /* Return the test that should be put into the flags user, i.e.
22023 the bcc, scc, or cmov instruction. */
22024 return gen_rtx_fmt_ee (code, VOIDmode, flags, const0_rtx);
22025 }
22026
22027 /* Figure out whether to use ordered or unordered fp comparisons.
22028 Return the appropriate mode to use. */
22029
22030 machine_mode
22031 ix86_fp_compare_mode (enum rtx_code)
22032 {
22033 /* ??? In order to make all comparisons reversible, we do all comparisons
22034 non-trapping when compiling for IEEE. Once gcc is able to distinguish
22035 between all forms of trapping and nontrapping comparisons, we can make inequality
22036 comparisons trapping again, since it results in better code when using
22037 FCOM based compares. */
22038 return TARGET_IEEE_FP ? CCFPUmode : CCFPmode;
22039 }
22040
22041 machine_mode
22042 ix86_cc_mode (enum rtx_code code, rtx op0, rtx op1)
22043 {
22044 machine_mode mode = GET_MODE (op0);
22045
22046 if (SCALAR_FLOAT_MODE_P (mode))
22047 {
22048 gcc_assert (!DECIMAL_FLOAT_MODE_P (mode));
22049 return ix86_fp_compare_mode (code);
22050 }
22051
22052 switch (code)
22053 {
22054 /* Only zero flag is needed. */
22055 case EQ: /* ZF=0 */
22056 case NE: /* ZF!=0 */
22057 return CCZmode;
22058 /* Codes needing carry flag. */
22059 case GEU: /* CF=0 */
22060 case LTU: /* CF=1 */
22061 /* Detect overflow checks. They need just the carry flag. */
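      /* E.g. the common idiom  if (a + b < a) ...  compares a PLUS against
         one of its own operands; only CF is needed, so CCCmode suffices
         (a sketch of the case matched below).  */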
22062 if (GET_CODE (op0) == PLUS
22063 && (rtx_equal_p (op1, XEXP (op0, 0))
22064 || rtx_equal_p (op1, XEXP (op0, 1))))
22065 return CCCmode;
22066 else
22067 return CCmode;
22068 case GTU: /* CF=0 & ZF=0 */
22069 case LEU: /* CF=1 | ZF=1 */
22070 return CCmode;
22071 /* Codes possibly doable only with sign flag when
22072 comparing against zero. */
22073 case GE: /* SF=OF or SF=0 */
22074 case LT: /* SF<>OF or SF=1 */
22075 if (op1 == const0_rtx)
22076 return CCGOCmode;
22077 else
22078 /* For other cases Carry flag is not required. */
22079 return CCGCmode;
22080 /* Codes doable only with the sign flag when comparing
22081 against zero, but for which we lack a jump instruction,
22082 so we need to use relational tests against overflow,
22083 which thus needs to be zero. */
22084 case GT: /* ZF=0 & SF=OF */
22085 case LE: /* ZF=1 | SF<>OF */
22086 if (op1 == const0_rtx)
22087 return CCNOmode;
22088 else
22089 return CCGCmode;
22090 /* The strcmp pattern does a (use flags), and combine may ask us for the
22091 proper mode. */
22092 case USE:
22093 return CCmode;
22094 default:
22095 gcc_unreachable ();
22096 }
22097 }
22098
22099 /* Return the fixed registers used for condition codes. */
22100
22101 static bool
22102 ix86_fixed_condition_code_regs (unsigned int *p1, unsigned int *p2)
22103 {
22104 *p1 = FLAGS_REG;
22105 *p2 = FPSR_REG;
22106 return true;
22107 }
22108
22109 /* If two condition code modes are compatible, return a condition code
22110 mode which is compatible with both. Otherwise, return
22111 VOIDmode. */
22112
22113 static machine_mode
22114 ix86_cc_modes_compatible (machine_mode m1, machine_mode m2)
22115 {
22116 if (m1 == m2)
22117 return m1;
22118
22119 if (GET_MODE_CLASS (m1) != MODE_CC || GET_MODE_CLASS (m2) != MODE_CC)
22120 return VOIDmode;
22121
22122 if ((m1 == CCGCmode && m2 == CCGOCmode)
22123 || (m1 == CCGOCmode && m2 == CCGCmode))
22124 return CCGCmode;
22125
22126 if (m1 == CCZmode && (m2 == CCGCmode || m2 == CCGOCmode))
22127 return m2;
22128 else if (m2 == CCZmode && (m1 == CCGCmode || m1 == CCGOCmode))
22129 return m1;
22130
22131 switch (m1)
22132 {
22133 default:
22134 gcc_unreachable ();
22135
22136 case CCmode:
22137 case CCGCmode:
22138 case CCGOCmode:
22139 case CCNOmode:
22140 case CCAmode:
22141 case CCCmode:
22142 case CCOmode:
22143 case CCPmode:
22144 case CCSmode:
22145 case CCZmode:
22146 switch (m2)
22147 {
22148 default:
22149 return VOIDmode;
22150
22151 case CCmode:
22152 case CCGCmode:
22153 case CCGOCmode:
22154 case CCNOmode:
22155 case CCAmode:
22156 case CCCmode:
22157 case CCOmode:
22158 case CCPmode:
22159 case CCSmode:
22160 case CCZmode:
22161 return CCmode;
22162 }
22163
22164 case CCFPmode:
22165 case CCFPUmode:
22166 /* These are only compatible with themselves, which we already
22167 checked above. */
22168 return VOIDmode;
22169 }
22170 }
22171
22172
22173 /* Return a comparison we can do and that it is equivalent to
22174 swap_condition (code) apart possibly from orderedness.
22175 But, never change orderedness if TARGET_IEEE_FP, returning
22176 UNKNOWN in that case if necessary. */
22177
22178 static enum rtx_code
22179 ix86_fp_swap_condition (enum rtx_code code)
22180 {
22181 switch (code)
22182 {
22183 case GT: /* GTU - CF=0 & ZF=0 */
22184 return TARGET_IEEE_FP ? UNKNOWN : UNLT;
22185 case GE: /* GEU - CF=0 */
22186 return TARGET_IEEE_FP ? UNKNOWN : UNLE;
22187 case UNLT: /* LTU - CF=1 */
22188 return TARGET_IEEE_FP ? UNKNOWN : GT;
22189 case UNLE: /* LEU - CF=1 | ZF=1 */
22190 return TARGET_IEEE_FP ? UNKNOWN : GE;
22191 default:
22192 return swap_condition (code);
22193 }
22194 }
22195
22196 /* Return the cost of comparison CODE using the best strategy for performance.
22197 All the following functions use the number of instructions as a cost metric.
22198 In the future this should be tweaked to compute bytes for optimize_size and
22199 to take into account the performance of various instructions on various CPUs. */
22200
22201 static int
22202 ix86_fp_comparison_cost (enum rtx_code code)
22203 {
22204 int arith_cost;
22205
22206 /* The cost of code using bit-twiddling on %ah. */
22207 switch (code)
22208 {
22209 case UNLE:
22210 case UNLT:
22211 case LTGT:
22212 case GT:
22213 case GE:
22214 case UNORDERED:
22215 case ORDERED:
22216 case UNEQ:
22217 arith_cost = 4;
22218 break;
22219 case LT:
22220 case NE:
22221 case EQ:
22222 case UNGE:
22223 arith_cost = TARGET_IEEE_FP ? 5 : 4;
22224 break;
22225 case LE:
22226 case UNGT:
22227 arith_cost = TARGET_IEEE_FP ? 6 : 4;
22228 break;
22229 default:
22230 gcc_unreachable ();
22231 }
22232
22233 switch (ix86_fp_comparison_strategy (code))
22234 {
22235 case IX86_FPCMP_COMI:
22236 return arith_cost > 4 ? 3 : 2;
22237 case IX86_FPCMP_SAHF:
22238 return arith_cost > 4 ? 4 : 3;
22239 default:
22240 return arith_cost;
22241 }
22242 }
22243
22244 /* Return the strategy to use for floating-point comparisons. We assume that
22245 fcomi is always preferable where available, since that is also true when looking
22246 at size (2 bytes, vs. 3 for fnstsw+sahf and at least 5 for fnstsw+test). */
22247
22248 enum ix86_fpcmp_strategy
22249 ix86_fp_comparison_strategy (enum rtx_code)
22250 {
22251 /* Do fcomi/sahf based test when profitable. */
22252
22253 if (TARGET_CMOVE)
22254 return IX86_FPCMP_COMI;
22255
22256 if (TARGET_SAHF && (TARGET_USE_SAHF || optimize_insn_for_size_p ()))
22257 return IX86_FPCMP_SAHF;
22258
22259 return IX86_FPCMP_ARITH;
22260 }
22261
22262 /* Swap, force into registers, or otherwise massage the two operands
22263 to a fp comparison. The operands are updated in place; the new
22264 comparison code is returned. */
22265
22266 static enum rtx_code
22267 ix86_prepare_fp_compare_args (enum rtx_code code, rtx *pop0, rtx *pop1)
22268 {
22269 machine_mode fpcmp_mode = ix86_fp_compare_mode (code);
22270 rtx op0 = *pop0, op1 = *pop1;
22271 machine_mode op_mode = GET_MODE (op0);
22272 int is_sse = TARGET_SSE_MATH && SSE_FLOAT_MODE_P (op_mode);
22273
22274 /* All of the unordered compare instructions only work on registers.
22275 The same is true of the fcomi compare instructions. The XFmode
22276 compare instructions require registers except when comparing
22277 against zero or when converting operand 1 from fixed point to
22278 floating point. */
22279
22280 if (!is_sse
22281 && (fpcmp_mode == CCFPUmode
22282 || (op_mode == XFmode
22283 && ! (standard_80387_constant_p (op0) == 1
22284 || standard_80387_constant_p (op1) == 1)
22285 && GET_CODE (op1) != FLOAT)
22286 || ix86_fp_comparison_strategy (code) == IX86_FPCMP_COMI))
22287 {
22288 op0 = force_reg (op_mode, op0);
22289 op1 = force_reg (op_mode, op1);
22290 }
22291 else
22292 {
22293 /* %%% We only allow op1 in memory; op0 must be st(0). So swap
22294 things around if they appear profitable, otherwise force op0
22295 into a register. */
22296
22297 if (standard_80387_constant_p (op0) == 0
22298 || (MEM_P (op0)
22299 && ! (standard_80387_constant_p (op1) == 0
22300 || MEM_P (op1))))
22301 {
22302 enum rtx_code new_code = ix86_fp_swap_condition (code);
22303 if (new_code != UNKNOWN)
22304 {
22305 std::swap (op0, op1);
22306 code = new_code;
22307 }
22308 }
22309
22310 if (!REG_P (op0))
22311 op0 = force_reg (op_mode, op0);
22312
22313 if (CONSTANT_P (op1))
22314 {
22315 int tmp = standard_80387_constant_p (op1);
22316 if (tmp == 0)
22317 op1 = validize_mem (force_const_mem (op_mode, op1));
22318 else if (tmp == 1)
22319 {
22320 if (TARGET_CMOVE)
22321 op1 = force_reg (op_mode, op1);
22322 }
22323 else
22324 op1 = force_reg (op_mode, op1);
22325 }
22326 }
22327
22328 /* Try to rearrange the comparison to make it cheaper. */
22329 if (ix86_fp_comparison_cost (code)
22330 > ix86_fp_comparison_cost (swap_condition (code))
22331 && (REG_P (op1) || can_create_pseudo_p ()))
22332 {
22333 std::swap (op0, op1);
22334 code = swap_condition (code);
22335 if (!REG_P (op0))
22336 op0 = force_reg (op_mode, op0);
22337 }
22338
22339 *pop0 = op0;
22340 *pop1 = op1;
22341 return code;
22342 }
22343
22344 /* Convert comparison codes we use to represent FP comparison to integer
22345 code that will result in proper branch. Return UNKNOWN if no such code
22346 is available. */
22347
22348 enum rtx_code
22349 ix86_fp_compare_code_to_integer (enum rtx_code code)
22350 {
22351 switch (code)
22352 {
22353 case GT:
22354 return GTU;
22355 case GE:
22356 return GEU;
22357 case ORDERED:
22358 case UNORDERED:
22359 return code;
22360 case UNEQ:
22361 return EQ;
22362 case UNLT:
22363 return LTU;
22364 case UNLE:
22365 return LEU;
22366 case LTGT:
22367 return NE;
22368 default:
22369 return UNKNOWN;
22370 }
22371 }
22372
22373 /* Generate insn patterns to do a floating point compare of OPERANDS. */
22374
22375 static rtx
22376 ix86_expand_fp_compare (enum rtx_code code, rtx op0, rtx op1, rtx scratch)
22377 {
22378 machine_mode fpcmp_mode, intcmp_mode;
22379 rtx tmp, tmp2;
22380
22381 fpcmp_mode = ix86_fp_compare_mode (code);
22382 code = ix86_prepare_fp_compare_args (code, &op0, &op1);
22383
22384 /* Do fcomi/sahf based test when profitable. */
22385 switch (ix86_fp_comparison_strategy (code))
22386 {
22387 case IX86_FPCMP_COMI:
22388 intcmp_mode = fpcmp_mode;
22389 tmp = gen_rtx_COMPARE (fpcmp_mode, op0, op1);
22390 tmp = gen_rtx_SET (gen_rtx_REG (fpcmp_mode, FLAGS_REG), tmp);
22391 emit_insn (tmp);
22392 break;
22393
22394 case IX86_FPCMP_SAHF:
22395 intcmp_mode = fpcmp_mode;
22396 tmp = gen_rtx_COMPARE (fpcmp_mode, op0, op1);
22397 tmp = gen_rtx_SET (gen_rtx_REG (fpcmp_mode, FLAGS_REG), tmp);
22398
22399 if (!scratch)
22400 scratch = gen_reg_rtx (HImode);
22401 tmp2 = gen_rtx_CLOBBER (VOIDmode, scratch);
22402 emit_insn (gen_rtx_PARALLEL (VOIDmode, gen_rtvec (2, tmp, tmp2)));
22403 break;
22404
22405 case IX86_FPCMP_ARITH:
22406 /* Sadness wrt reg-stack pops killing fpsr -- gotta get fnstsw first. */
22407 tmp = gen_rtx_COMPARE (fpcmp_mode, op0, op1);
22408 tmp2 = gen_rtx_UNSPEC (HImode, gen_rtvec (1, tmp), UNSPEC_FNSTSW);
22409 if (!scratch)
22410 scratch = gen_reg_rtx (HImode);
22411 emit_insn (gen_rtx_SET (scratch, tmp2));
22412
22413 /* In the unordered case, we have to check C2 for NaN's, which
22414 doesn't happen to work out to anything nice combination-wise.
22415 So do some bit twiddling on the value we've got in AH to come
22416 up with an appropriate set of condition codes. */
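      /* For reference (sketch): after fnstsw the FPU condition bits land in
         AH as C0 = 0x01, C2 = 0x04 and C3 = 0x40, so 0x45 masks all three.
         fcom sets them as

           op0 > op1  : C3=0 C2=0 C0=0   (AH & 0x45) == 0x00
           op0 < op1  : C3=0 C2=0 C0=1   (AH & 0x45) == 0x01
           op0 == op1 : C3=1 C2=0 C0=0   (AH & 0x45) == 0x40
           unordered  : C3=1 C2=1 C0=1   (AH & 0x45) == 0x45

         which is what the test/and/cmp sequences below are checking.  */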
22417
22418 intcmp_mode = CCNOmode;
22419 switch (code)
22420 {
22421 case GT:
22422 case UNGT:
22423 if (code == GT || !TARGET_IEEE_FP)
22424 {
22425 emit_insn (gen_testqi_ext_ccno_0 (scratch, GEN_INT (0x45)));
22426 code = EQ;
22427 }
22428 else
22429 {
22430 emit_insn (gen_andqi_ext_0 (scratch, scratch, GEN_INT (0x45)));
22431 emit_insn (gen_addqi_ext_1 (scratch, scratch, constm1_rtx));
22432 emit_insn (gen_cmpqi_ext_3 (scratch, GEN_INT (0x44)));
22433 intcmp_mode = CCmode;
22434 code = GEU;
22435 }
22436 break;
22437 case LT:
22438 case UNLT:
22439 if (code == LT && TARGET_IEEE_FP)
22440 {
22441 emit_insn (gen_andqi_ext_0 (scratch, scratch, GEN_INT (0x45)));
22442 emit_insn (gen_cmpqi_ext_3 (scratch, const1_rtx));
22443 intcmp_mode = CCmode;
22444 code = EQ;
22445 }
22446 else
22447 {
22448 emit_insn (gen_testqi_ext_ccno_0 (scratch, const1_rtx));
22449 code = NE;
22450 }
22451 break;
22452 case GE:
22453 case UNGE:
22454 if (code == GE || !TARGET_IEEE_FP)
22455 {
22456 emit_insn (gen_testqi_ext_ccno_0 (scratch, GEN_INT (0x05)));
22457 code = EQ;
22458 }
22459 else
22460 {
22461 emit_insn (gen_andqi_ext_0 (scratch, scratch, GEN_INT (0x45)));
22462 emit_insn (gen_xorqi_cc_ext_1 (scratch, scratch, const1_rtx));
22463 code = NE;
22464 }
22465 break;
22466 case LE:
22467 case UNLE:
22468 if (code == LE && TARGET_IEEE_FP)
22469 {
22470 emit_insn (gen_andqi_ext_0 (scratch, scratch, GEN_INT (0x45)));
22471 emit_insn (gen_addqi_ext_1 (scratch, scratch, constm1_rtx));
22472 emit_insn (gen_cmpqi_ext_3 (scratch, GEN_INT (0x40)));
22473 intcmp_mode = CCmode;
22474 code = LTU;
22475 }
22476 else
22477 {
22478 emit_insn (gen_testqi_ext_ccno_0 (scratch, GEN_INT (0x45)));
22479 code = NE;
22480 }
22481 break;
22482 case EQ:
22483 case UNEQ:
22484 if (code == EQ && TARGET_IEEE_FP)
22485 {
22486 emit_insn (gen_andqi_ext_0 (scratch, scratch, GEN_INT (0x45)));
22487 emit_insn (gen_cmpqi_ext_3 (scratch, GEN_INT (0x40)));
22488 intcmp_mode = CCmode;
22489 code = EQ;
22490 }
22491 else
22492 {
22493 emit_insn (gen_testqi_ext_ccno_0 (scratch, GEN_INT (0x40)));
22494 code = NE;
22495 }
22496 break;
22497 case NE:
22498 case LTGT:
22499 if (code == NE && TARGET_IEEE_FP)
22500 {
22501 emit_insn (gen_andqi_ext_0 (scratch, scratch, GEN_INT (0x45)));
22502 emit_insn (gen_xorqi_cc_ext_1 (scratch, scratch,
22503 GEN_INT (0x40)));
22504 code = NE;
22505 }
22506 else
22507 {
22508 emit_insn (gen_testqi_ext_ccno_0 (scratch, GEN_INT (0x40)));
22509 code = EQ;
22510 }
22511 break;
22512
22513 case UNORDERED:
22514 emit_insn (gen_testqi_ext_ccno_0 (scratch, GEN_INT (0x04)));
22515 code = NE;
22516 break;
22517 case ORDERED:
22518 emit_insn (gen_testqi_ext_ccno_0 (scratch, GEN_INT (0x04)));
22519 code = EQ;
22520 break;
22521
22522 default:
22523 gcc_unreachable ();
22524 }
22525 break;
22526
22527 default:
22528 gcc_unreachable();
22529 }
22530
22531 /* Return the test that should be put into the flags user, i.e.
22532 the bcc, scc, or cmov instruction. */
22533 return gen_rtx_fmt_ee (code, VOIDmode,
22534 gen_rtx_REG (intcmp_mode, FLAGS_REG),
22535 const0_rtx);
22536 }
22537
22538 static rtx
22539 ix86_expand_compare (enum rtx_code code, rtx op0, rtx op1)
22540 {
22541 rtx ret;
22542
22543 if (GET_MODE_CLASS (GET_MODE (op0)) == MODE_CC)
22544 ret = gen_rtx_fmt_ee (code, VOIDmode, op0, op1);
22545
22546 else if (SCALAR_FLOAT_MODE_P (GET_MODE (op0)))
22547 {
22548 gcc_assert (!DECIMAL_FLOAT_MODE_P (GET_MODE (op0)));
22549 ret = ix86_expand_fp_compare (code, op0, op1, NULL_RTX);
22550 }
22551 else
22552 ret = ix86_expand_int_compare (code, op0, op1);
22553
22554 return ret;
22555 }
22556
22557 void
22558 ix86_expand_branch (enum rtx_code code, rtx op0, rtx op1, rtx label)
22559 {
22560 machine_mode mode = GET_MODE (op0);
22561 rtx tmp;
22562
22563 /* Handle the special case of a vector comparison with a boolean result;
22564 transform it using the ptest instruction. */
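  /* Sketch of the expansion: for a whole-vector (in)equality we emit
       tmp = op0 ^ op1;  ptest tmp, tmp;  j{e,ne} label
     so the branch tests whether every lane of op0 equals the corresponding
     lane of op1.  */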
22565 if (GET_MODE_CLASS (mode) == MODE_VECTOR_INT)
22566 {
22567 rtx flag = gen_rtx_REG (CCZmode, FLAGS_REG);
22568 machine_mode p_mode = GET_MODE_SIZE (mode) == 32 ? V4DImode : V2DImode;
22569
22570 gcc_assert (code == EQ || code == NE);
22571 /* Generate XOR since we can't check that one operand is zero vector. */
22572 tmp = gen_reg_rtx (mode);
22573 emit_insn (gen_rtx_SET (tmp, gen_rtx_XOR (mode, op0, op1)));
22574 tmp = gen_lowpart (p_mode, tmp);
22575 emit_insn (gen_rtx_SET (gen_rtx_REG (CCmode, FLAGS_REG),
22576 gen_rtx_UNSPEC (CCmode,
22577 gen_rtvec (2, tmp, tmp),
22578 UNSPEC_PTEST)));
22579 tmp = gen_rtx_fmt_ee (code, VOIDmode, flag, const0_rtx);
22580 tmp = gen_rtx_IF_THEN_ELSE (VOIDmode, tmp,
22581 gen_rtx_LABEL_REF (VOIDmode, label),
22582 pc_rtx);
22583 emit_jump_insn (gen_rtx_SET (pc_rtx, tmp));
22584 return;
22585 }
22586
22587 switch (mode)
22588 {
22589 case SFmode:
22590 case DFmode:
22591 case XFmode:
22592 case QImode:
22593 case HImode:
22594 case SImode:
22595 simple:
22596 tmp = ix86_expand_compare (code, op0, op1);
22597 tmp = gen_rtx_IF_THEN_ELSE (VOIDmode, tmp,
22598 gen_rtx_LABEL_REF (VOIDmode, label),
22599 pc_rtx);
22600 emit_jump_insn (gen_rtx_SET (pc_rtx, tmp));
22601 return;
22602
22603 case DImode:
22604 if (TARGET_64BIT)
22605 goto simple;
22606 /* For a 32-bit target, a DImode comparison may be performed in
22607 SSE registers. To allow this we should avoid splitting
22608 to SImode, which is achieved by doing the xor in DImode
22609 and then comparing with zero (which is recognized by the
22610 STV pass). We don't compare using xor when optimizing
22611 for size. */
22612 if (!optimize_insn_for_size_p ()
22613 && TARGET_STV
22614 && (code == EQ || code == NE))
22615 {
22616 op0 = force_reg (mode, gen_rtx_XOR (mode, op0, op1));
22617 op1 = const0_rtx;
22618 }
22619 /* FALLTHRU */
22620 case TImode:
22621 /* Expand DImode branch into multiple compare+branch. */
22622 {
22623 rtx lo[2], hi[2];
22624 rtx_code_label *label2;
22625 enum rtx_code code1, code2, code3;
22626 machine_mode submode;
22627
22628 if (CONSTANT_P (op0) && !CONSTANT_P (op1))
22629 {
22630 std::swap (op0, op1);
22631 code = swap_condition (code);
22632 }
22633
22634 split_double_mode (mode, &op0, 1, lo+0, hi+0);
22635 split_double_mode (mode, &op1, 1, lo+1, hi+1);
22636
22637 submode = mode == DImode ? SImode : DImode;
22638
22639 /* When comparing for equality, we can use (hi0^hi1)|(lo0^lo1) to
22640 avoid two branches. This costs one extra insn, so disable when
22641 optimizing for size. */
22642
22643 if ((code == EQ || code == NE)
22644 && (!optimize_insn_for_size_p ()
22645 || hi[1] == const0_rtx || lo[1] == const0_rtx))
22646 {
22647 rtx xor0, xor1;
22648
22649 xor1 = hi[0];
22650 if (hi[1] != const0_rtx)
22651 xor1 = expand_binop (submode, xor_optab, xor1, hi[1],
22652 NULL_RTX, 0, OPTAB_WIDEN);
22653
22654 xor0 = lo[0];
22655 if (lo[1] != const0_rtx)
22656 xor0 = expand_binop (submode, xor_optab, xor0, lo[1],
22657 NULL_RTX, 0, OPTAB_WIDEN);
22658
22659 tmp = expand_binop (submode, ior_optab, xor1, xor0,
22660 NULL_RTX, 0, OPTAB_WIDEN);
22661
22662 ix86_expand_branch (code, tmp, const0_rtx, label);
22663 return;
22664 }
22665
22666 /* Otherwise, if we are doing less-than or greater-or-equal-than,
22667 op1 is a constant and the low word is zero, then we can just
22668 examine the high word. Similarly for low word -1 and
22669 less-or-equal-than or greater-than. */
22670
22671 if (CONST_INT_P (hi[1]))
22672 switch (code)
22673 {
22674 case LT: case LTU: case GE: case GEU:
22675 if (lo[1] == const0_rtx)
22676 {
22677 ix86_expand_branch (code, hi[0], hi[1], label);
22678 return;
22679 }
22680 break;
22681 case LE: case LEU: case GT: case GTU:
22682 if (lo[1] == constm1_rtx)
22683 {
22684 ix86_expand_branch (code, hi[0], hi[1], label);
22685 return;
22686 }
22687 break;
22688 default:
22689 break;
22690 }
22691
22692 /* Otherwise, we need two or three jumps. */
22693
22694 label2 = gen_label_rtx ();
22695
22696 code1 = code;
22697 code2 = swap_condition (code);
22698 code3 = unsigned_condition (code);
22699
22700 switch (code)
22701 {
22702 case LT: case GT: case LTU: case GTU:
22703 break;
22704
22705 case LE: code1 = LT; code2 = GT; break;
22706 case GE: code1 = GT; code2 = LT; break;
22707 case LEU: code1 = LTU; code2 = GTU; break;
22708 case GEU: code1 = GTU; code2 = LTU; break;
22709
22710 case EQ: code1 = UNKNOWN; code2 = NE; break;
22711 case NE: code2 = UNKNOWN; break;
22712
22713 default:
22714 gcc_unreachable ();
22715 }
22716
22717 /*
22718 * a < b =>
22719 * if (hi(a) < hi(b)) goto true;
22720 * if (hi(a) > hi(b)) goto false;
22721 * if (lo(a) < lo(b)) goto true;
22722 * false:
22723 */
22724
22725 if (code1 != UNKNOWN)
22726 ix86_expand_branch (code1, hi[0], hi[1], label);
22727 if (code2 != UNKNOWN)
22728 ix86_expand_branch (code2, hi[0], hi[1], label2);
22729
22730 ix86_expand_branch (code3, lo[0], lo[1], label);
22731
22732 if (code2 != UNKNOWN)
22733 emit_label (label2);
22734 return;
22735 }
22736
22737 default:
22738 gcc_assert (GET_MODE_CLASS (GET_MODE (op0)) == MODE_CC);
22739 goto simple;
22740 }
22741 }
22742
22743 /* Split branch based on floating point condition. */
22744 void
22745 ix86_split_fp_branch (enum rtx_code code, rtx op1, rtx op2,
22746 rtx target1, rtx target2, rtx tmp)
22747 {
22748 rtx condition;
22749 rtx i;
22750
22751 if (target2 != pc_rtx)
22752 {
22753 std::swap (target1, target2);
22754 code = reverse_condition_maybe_unordered (code);
22755 }
22756
22757 condition = ix86_expand_fp_compare (code, op1, op2,
22758 tmp);
22759
22760 i = emit_jump_insn (gen_rtx_SET
22761 (pc_rtx,
22762 gen_rtx_IF_THEN_ELSE (VOIDmode,
22763 condition, target1, target2)));
22764 if (split_branch_probability >= 0)
22765 add_int_reg_note (i, REG_BR_PROB, split_branch_probability);
22766 }
22767
22768 void
22769 ix86_expand_setcc (rtx dest, enum rtx_code code, rtx op0, rtx op1)
22770 {
22771 rtx ret;
22772
22773 gcc_assert (GET_MODE (dest) == QImode);
22774
22775 ret = ix86_expand_compare (code, op0, op1);
22776 PUT_MODE (ret, QImode);
22777 emit_insn (gen_rtx_SET (dest, ret));
22778 }
22779
22780 /* Expand comparison setting or clearing carry flag. Return true when
22781 successful and set pop for the operation. */
22782 static bool
22783 ix86_expand_carry_flag_compare (enum rtx_code code, rtx op0, rtx op1, rtx *pop)
22784 {
22785 machine_mode mode =
22786 GET_MODE (op0) != VOIDmode ? GET_MODE (op0) : GET_MODE (op1);
22787
22788 /* Do not handle double-mode compares that go through special path. */
22789 if (mode == (TARGET_64BIT ? TImode : DImode))
22790 return false;
22791
22792 if (SCALAR_FLOAT_MODE_P (mode))
22793 {
22794 rtx compare_op;
22795 rtx_insn *compare_seq;
22796
22797 gcc_assert (!DECIMAL_FLOAT_MODE_P (mode));
22798
22799 /* Shortcut: the following common codes never translate
22800 into carry-flag compares. */
22801 if (code == EQ || code == NE || code == UNEQ || code == LTGT
22802 || code == ORDERED || code == UNORDERED)
22803 return false;
22804
22805 /* These comparisons require zero flag; swap operands so they won't. */
22806 if ((code == GT || code == UNLE || code == LE || code == UNGT)
22807 && !TARGET_IEEE_FP)
22808 {
22809 std::swap (op0, op1);
22810 code = swap_condition (code);
22811 }
22812
22813 /* Try to expand the comparison and verify that we end up with
22814 a carry-flag-based comparison. This fails to be true only when
22815 we decide to expand the comparison using arithmetic, which is
22816 not a common scenario. */
22817 start_sequence ();
22818 compare_op = ix86_expand_fp_compare (code, op0, op1, NULL_RTX);
22819 compare_seq = get_insns ();
22820 end_sequence ();
22821
22822 if (GET_MODE (XEXP (compare_op, 0)) == CCFPmode
22823 || GET_MODE (XEXP (compare_op, 0)) == CCFPUmode)
22824 code = ix86_fp_compare_code_to_integer (GET_CODE (compare_op));
22825 else
22826 code = GET_CODE (compare_op);
22827
22828 if (code != LTU && code != GEU)
22829 return false;
22830
22831 emit_insn (compare_seq);
22832 *pop = compare_op;
22833 return true;
22834 }
22835
22836 if (!INTEGRAL_MODE_P (mode))
22837 return false;
22838
22839 switch (code)
22840 {
22841 case LTU:
22842 case GEU:
22843 break;
22844
22845 /* Convert a==0 into (unsigned)a<1. */
22846 case EQ:
22847 case NE:
22848 if (op1 != const0_rtx)
22849 return false;
22850 op1 = const1_rtx;
22851 code = (code == EQ ? LTU : GEU);
22852 break;
22853
22854 /* Convert a>b into b<a or a>=b-1. */
22855 case GTU:
22856 case LEU:
22857 if (CONST_INT_P (op1))
22858 {
22859 op1 = gen_int_mode (INTVAL (op1) + 1, GET_MODE (op0));
22860 /* Bail out on overflow. We can still swap the operands, but that
22861 would force loading of the constant into a register. */
22862 if (op1 == const0_rtx
22863 || !x86_64_immediate_operand (op1, GET_MODE (op1)))
22864 return false;
22865 code = (code == GTU ? GEU : LTU);
22866 }
22867 else
22868 {
22869 std::swap (op0, op1);
22870 code = (code == GTU ? LTU : GEU);
22871 }
22872 break;
22873
22874 /* Convert a>=0 into (unsigned)a<0x80000000. */
22875 case LT:
22876 case GE:
22877 if (mode == DImode || op1 != const0_rtx)
22878 return false;
22879 op1 = gen_int_mode (1 << (GET_MODE_BITSIZE (mode) - 1), mode);
22880 code = (code == LT ? GEU : LTU);
22881 break;
22882 case LE:
22883 case GT:
22884 if (mode == DImode || op1 != constm1_rtx)
22885 return false;
22886 op1 = gen_int_mode (1 << (GET_MODE_BITSIZE (mode) - 1), mode);
22887 code = (code == LE ? GEU : LTU);
22888 break;
22889
22890 default:
22891 return false;
22892 }
22893 /* Swapping operands may cause a constant to appear as the first operand. */
22894 if (!nonimmediate_operand (op0, VOIDmode))
22895 {
22896 if (!can_create_pseudo_p ())
22897 return false;
22898 op0 = force_reg (mode, op0);
22899 }
22900 *pop = ix86_expand_compare (code, op0, op1);
22901 gcc_assert (GET_CODE (*pop) == LTU || GET_CODE (*pop) == GEU);
22902 return true;
22903 }
22904
22905 bool
22906 ix86_expand_int_movcc (rtx operands[])
22907 {
22908 enum rtx_code code = GET_CODE (operands[1]), compare_code;
22909 rtx_insn *compare_seq;
22910 rtx compare_op;
22911 machine_mode mode = GET_MODE (operands[0]);
22912 bool sign_bit_compare_p = false;
22913 rtx op0 = XEXP (operands[1], 0);
22914 rtx op1 = XEXP (operands[1], 1);
22915
22916 if (GET_MODE (op0) == TImode
22917 || (GET_MODE (op0) == DImode
22918 && !TARGET_64BIT))
22919 return false;
22920
22921 start_sequence ();
22922 compare_op = ix86_expand_compare (code, op0, op1);
22923 compare_seq = get_insns ();
22924 end_sequence ();
22925
22926 compare_code = GET_CODE (compare_op);
22927
22928 if ((op1 == const0_rtx && (code == GE || code == LT))
22929 || (op1 == constm1_rtx && (code == GT || code == LE)))
22930 sign_bit_compare_p = true;
22931
22932 /* Don't attempt mode expansion here -- if we had to expand 5 or 6
22933 HImode insns, we'd be swallowed in word prefix ops. */
22934
22935 if ((mode != HImode || TARGET_FAST_PREFIX)
22936 && (mode != (TARGET_64BIT ? TImode : DImode))
22937 && CONST_INT_P (operands[2])
22938 && CONST_INT_P (operands[3]))
22939 {
22940 rtx out = operands[0];
22941 HOST_WIDE_INT ct = INTVAL (operands[2]);
22942 HOST_WIDE_INT cf = INTVAL (operands[3]);
22943 HOST_WIDE_INT diff;
22944
22945 diff = ct - cf;
22946 /* Sign-bit compares are better done using shifts than by using
22947 sbb. */
22948 if (sign_bit_compare_p
22949 || ix86_expand_carry_flag_compare (code, op0, op1, &compare_op))
22950 {
22951 /* Detect overlap between destination and compare sources. */
22952 rtx tmp = out;
22953
22954 if (!sign_bit_compare_p)
22955 {
22956 rtx flags;
22957 bool fpcmp = false;
22958
22959 compare_code = GET_CODE (compare_op);
22960
22961 flags = XEXP (compare_op, 0);
22962
22963 if (GET_MODE (flags) == CCFPmode
22964 || GET_MODE (flags) == CCFPUmode)
22965 {
22966 fpcmp = true;
22967 compare_code
22968 = ix86_fp_compare_code_to_integer (compare_code);
22969 }
22970
22971 /* To simplify rest of code, restrict to the GEU case. */
22972 if (compare_code == LTU)
22973 {
22974 std::swap (ct, cf);
22975 compare_code = reverse_condition (compare_code);
22976 code = reverse_condition (code);
22977 }
22978 else
22979 {
22980 if (fpcmp)
22981 PUT_CODE (compare_op,
22982 reverse_condition_maybe_unordered
22983 (GET_CODE (compare_op)));
22984 else
22985 PUT_CODE (compare_op,
22986 reverse_condition (GET_CODE (compare_op)));
22987 }
22988 diff = ct - cf;
22989
22990 if (reg_overlap_mentioned_p (out, op0)
22991 || reg_overlap_mentioned_p (out, op1))
22992 tmp = gen_reg_rtx (mode);
22993
22994 if (mode == DImode)
22995 emit_insn (gen_x86_movdicc_0_m1 (tmp, flags, compare_op));
22996 else
22997 emit_insn (gen_x86_movsicc_0_m1 (gen_lowpart (SImode, tmp),
22998 flags, compare_op));
22999 }
23000 else
23001 {
23002 if (code == GT || code == GE)
23003 code = reverse_condition (code);
23004 else
23005 {
23006 std::swap (ct, cf);
23007 diff = ct - cf;
23008 }
23009 tmp = emit_store_flag (tmp, code, op0, op1, VOIDmode, 0, -1);
23010 }
23011
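          /* At this point TMP holds a 0 / -1 mask (sketch): the sbb or
             arithmetic-shift sequence above produces -1 when the (possibly
             adjusted) condition holds and 0 otherwise, and the arithmetic
             below maps { -1, 0 } onto the two constants, e.g. in the
             general case  dest = (mask & (cf - ct)) + ct,  which gives cf
             for mask = -1 and ct for mask = 0.  */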
23012 if (diff == 1)
23013 {
23014 /*
23015 * cmpl op0,op1
23016 * sbbl dest,dest
23017 * [addl dest, ct]
23018 *
23019 * Size 5 - 8.
23020 */
23021 if (ct)
23022 tmp = expand_simple_binop (mode, PLUS,
23023 tmp, GEN_INT (ct),
23024 copy_rtx (tmp), 1, OPTAB_DIRECT);
23025 }
23026 else if (cf == -1)
23027 {
23028 /*
23029 * cmpl op0,op1
23030 * sbbl dest,dest
23031 * orl $ct, dest
23032 *
23033 * Size 8.
23034 */
23035 tmp = expand_simple_binop (mode, IOR,
23036 tmp, GEN_INT (ct),
23037 copy_rtx (tmp), 1, OPTAB_DIRECT);
23038 }
23039 else if (diff == -1 && ct)
23040 {
23041 /*
23042 * cmpl op0,op1
23043 * sbbl dest,dest
23044 * notl dest
23045 * [addl dest, cf]
23046 *
23047 * Size 8 - 11.
23048 */
23049 tmp = expand_simple_unop (mode, NOT, tmp, copy_rtx (tmp), 1);
23050 if (cf)
23051 tmp = expand_simple_binop (mode, PLUS,
23052 copy_rtx (tmp), GEN_INT (cf),
23053 copy_rtx (tmp), 1, OPTAB_DIRECT);
23054 }
23055 else
23056 {
23057 /*
23058 * cmpl op0,op1
23059 * sbbl dest,dest
23060 * [notl dest]
23061 * andl cf - ct, dest
23062 * [addl dest, ct]
23063 *
23064 * Size 8 - 11.
23065 */
23066
23067 if (cf == 0)
23068 {
23069 cf = ct;
23070 ct = 0;
23071 tmp = expand_simple_unop (mode, NOT, tmp, copy_rtx (tmp), 1);
23072 }
23073
23074 tmp = expand_simple_binop (mode, AND,
23075 copy_rtx (tmp),
23076 gen_int_mode (cf - ct, mode),
23077 copy_rtx (tmp), 1, OPTAB_DIRECT);
23078 if (ct)
23079 tmp = expand_simple_binop (mode, PLUS,
23080 copy_rtx (tmp), GEN_INT (ct),
23081 copy_rtx (tmp), 1, OPTAB_DIRECT);
23082 }
23083
23084 if (!rtx_equal_p (tmp, out))
23085 emit_move_insn (copy_rtx (out), copy_rtx (tmp));
23086
23087 return true;
23088 }
23089
23090 if (diff < 0)
23091 {
23092 machine_mode cmp_mode = GET_MODE (op0);
23093 enum rtx_code new_code;
23094
23095 if (SCALAR_FLOAT_MODE_P (cmp_mode))
23096 {
23097 gcc_assert (!DECIMAL_FLOAT_MODE_P (cmp_mode));
23098
23099 /* We may be reversing an unordered compare to a normal compare, which
23100 is not valid in general (we may convert a non-trapping condition
23101 to a trapping one); however, on i386 we currently emit all
23102 comparisons unordered. */
23103 new_code = reverse_condition_maybe_unordered (code);
23104 }
23105 else
23106 new_code = ix86_reverse_condition (code, cmp_mode);
23107 if (new_code != UNKNOWN)
23108 {
23109 std::swap (ct, cf);
23110 diff = -diff;
23111 code = new_code;
23112 }
23113 }
23114
23115 compare_code = UNKNOWN;
23116 if (GET_MODE_CLASS (GET_MODE (op0)) == MODE_INT
23117 && CONST_INT_P (op1))
23118 {
23119 if (op1 == const0_rtx
23120 && (code == LT || code == GE))
23121 compare_code = code;
23122 else if (op1 == constm1_rtx)
23123 {
23124 if (code == LE)
23125 compare_code = LT;
23126 else if (code == GT)
23127 compare_code = GE;
23128 }
23129 }
23130
23131 /* Optimize dest = (op0 < 0) ? -1 : cf. */
23132 if (compare_code != UNKNOWN
23133 && GET_MODE (op0) == GET_MODE (out)
23134 && (cf == -1 || ct == -1))
23135 {
23136 /* If lea code below could be used, only optimize
23137 if it results in a 2 insn sequence. */
23138
23139 if (! (diff == 1 || diff == 2 || diff == 4 || diff == 8
23140 || diff == 3 || diff == 5 || diff == 9)
23141 || (compare_code == LT && ct == -1)
23142 || (compare_code == GE && cf == -1))
23143 {
23144 /*
23145 * notl op1 (if necessary)
23146 * sarl $31, op1
23147 * orl cf, op1
23148 */
23149 if (ct != -1)
23150 {
23151 cf = ct;
23152 ct = -1;
23153 code = reverse_condition (code);
23154 }
23155
23156 out = emit_store_flag (out, code, op0, op1, VOIDmode, 0, -1);
23157
23158 out = expand_simple_binop (mode, IOR,
23159 out, GEN_INT (cf),
23160 out, 1, OPTAB_DIRECT);
23161 if (out != operands[0])
23162 emit_move_insn (operands[0], out);
23163
23164 return true;
23165 }
23166 }
23167
23168
23169 if ((diff == 1 || diff == 2 || diff == 4 || diff == 8
23170 || diff == 3 || diff == 5 || diff == 9)
23171 && ((mode != QImode && mode != HImode) || !TARGET_PARTIAL_REG_STALL)
23172 && (mode != DImode
23173 || x86_64_immediate_operand (GEN_INT (cf), VOIDmode)))
23174 {
23175 /*
23176 * xorl dest,dest
23177 * cmpl op1,op2
23178 * setcc dest
23179 * lea cf(dest*(ct-cf)),dest
23180 *
23181 * Size 14.
23182 *
23183 * This also catches the degenerate setcc-only case.
23184 */
23185
23186 rtx tmp;
23187 int nops;
23188
23189 out = emit_store_flag (out, code, op0, op1, VOIDmode, 0, 1);
23190
23191 nops = 0;
23192 /* On x86_64 the lea instruction operates on Pmode, so we need
23193 to get the arithmetic done in the proper mode to match. */
23194 if (diff == 1)
23195 tmp = copy_rtx (out);
23196 else
23197 {
23198 rtx out1;
23199 out1 = copy_rtx (out);
23200 tmp = gen_rtx_MULT (mode, out1, GEN_INT (diff & ~1));
23201 nops++;
23202 if (diff & 1)
23203 {
23204 tmp = gen_rtx_PLUS (mode, tmp, out1);
23205 nops++;
23206 }
23207 }
23208 if (cf != 0)
23209 {
23210 tmp = gen_rtx_PLUS (mode, tmp, GEN_INT (cf));
23211 nops++;
23212 }
23213 if (!rtx_equal_p (tmp, out))
23214 {
23215 if (nops == 1)
23216 out = force_operand (tmp, copy_rtx (out));
23217 else
23218 emit_insn (gen_rtx_SET (copy_rtx (out), copy_rtx (tmp)));
23219 }
23220 if (!rtx_equal_p (out, operands[0]))
23221 emit_move_insn (operands[0], copy_rtx (out));
23222
23223 return true;
23224 }
23225
23226 /*
23227 * General case: Jumpful:
23228 * xorl dest,dest cmpl op1, op2
23229 * cmpl op1, op2 movl ct, dest
23230 * setcc dest jcc 1f
23231 * decl dest movl cf, dest
23232 * andl (cf-ct),dest 1:
23233 * addl ct,dest
23234 *
23235 * Size 20. Size 14.
23236 *
23237 * This is reasonably steep, but branch mispredict costs are
23238 * high on modern CPUs, so consider failing only if optimizing
23239 * for space.
23240 */
23241
23242 if ((!TARGET_CMOVE || (mode == QImode && TARGET_PARTIAL_REG_STALL))
23243 && BRANCH_COST (optimize_insn_for_speed_p (),
23244 false) >= 2)
23245 {
23246 if (cf == 0)
23247 {
23248 machine_mode cmp_mode = GET_MODE (op0);
23249 enum rtx_code new_code;
23250
23251 if (SCALAR_FLOAT_MODE_P (cmp_mode))
23252 {
23253 gcc_assert (!DECIMAL_FLOAT_MODE_P (cmp_mode));
23254
23255 /* We may be reversing an unordered compare to a normal compare,
23256 which is not valid in general (we may convert a non-trapping
23257 condition to a trapping one); however, on i386 we currently
23258 emit all comparisons unordered. */
23259 new_code = reverse_condition_maybe_unordered (code);
23260 }
23261 else
23262 {
23263 new_code = ix86_reverse_condition (code, cmp_mode);
23264 if (compare_code != UNKNOWN && new_code != UNKNOWN)
23265 compare_code = reverse_condition (compare_code);
23266 }
23267
23268 if (new_code != UNKNOWN)
23269 {
23270 cf = ct;
23271 ct = 0;
23272 code = new_code;
23273 }
23274 }
23275
23276 if (compare_code != UNKNOWN)
23277 {
23278 /* notl op1 (if needed)
23279 sarl $31, op1
23280 andl (cf-ct), op1
23281 addl ct, op1
23282
23283 For x < 0 (resp. x <= -1) there will be no notl,
23284 so if possible swap the constants to get rid of the
23285 complement.
23286 True/false will be -1/0 while code below (store flag
23287 followed by decrement) is 0/-1, so the constants need
23288 to be exchanged once more. */
23289
23290 if (compare_code == GE || !cf)
23291 {
23292 code = reverse_condition (code);
23293 compare_code = LT;
23294 }
23295 else
23296 std::swap (ct, cf);
23297
23298 out = emit_store_flag (out, code, op0, op1, VOIDmode, 0, -1);
23299 }
23300 else
23301 {
23302 out = emit_store_flag (out, code, op0, op1, VOIDmode, 0, 1);
23303
23304 out = expand_simple_binop (mode, PLUS, copy_rtx (out),
23305 constm1_rtx,
23306 copy_rtx (out), 1, OPTAB_DIRECT);
23307 }
23308
23309 out = expand_simple_binop (mode, AND, copy_rtx (out),
23310 gen_int_mode (cf - ct, mode),
23311 copy_rtx (out), 1, OPTAB_DIRECT);
23312 if (ct)
23313 out = expand_simple_binop (mode, PLUS, copy_rtx (out), GEN_INT (ct),
23314 copy_rtx (out), 1, OPTAB_DIRECT);
23315 if (!rtx_equal_p (out, operands[0]))
23316 emit_move_insn (operands[0], copy_rtx (out));
23317
23318 return true;
23319 }
23320 }
23321
23322 if (!TARGET_CMOVE || (mode == QImode && TARGET_PARTIAL_REG_STALL))
23323 {
23324 /* Try a few things more with specific constants and a variable. */
23325
23326 optab op;
23327 rtx var, orig_out, out, tmp;
23328
23329 if (BRANCH_COST (optimize_insn_for_speed_p (), false) <= 2)
23330 return false;
23331
23332 /* If one of the two operands is an interesting constant, use the code above
23333 to load a 0/-1 mask into a register and mask in the other operand with a logical operation. */
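/* Illustrative sketch (not part of GCC): e.g. "cond ? x : 0" becomes
   "(cond ? -1 : 0) & x" and "cond ? x : -1" becomes "(cond ? 0 : -1) | x",
   so the recursive call below only has to materialize an all-zeros /
   all-ones value, which the code above handles cheaply.  */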
23334
23335 if (CONST_INT_P (operands[2]))
23336 {
23337 var = operands[3];
23338 if (INTVAL (operands[2]) == 0 && operands[3] != constm1_rtx)
23339 operands[3] = constm1_rtx, op = and_optab;
23340 else if (INTVAL (operands[2]) == -1 && operands[3] != const0_rtx)
23341 operands[3] = const0_rtx, op = ior_optab;
23342 else
23343 return false;
23344 }
23345 else if (CONST_INT_P (operands[3]))
23346 {
23347 var = operands[2];
23348 if (INTVAL (operands[3]) == 0 && operands[2] != constm1_rtx)
23349 operands[2] = constm1_rtx, op = and_optab;
23350 else if (INTVAL (operands[3]) == -1 && operands[3] != const0_rtx)
23351 operands[2] = const0_rtx, op = ior_optab;
23352 else
23353 return false;
23354 }
23355 else
23356 return false;
23357
23358 orig_out = operands[0];
23359 tmp = gen_reg_rtx (mode);
23360 operands[0] = tmp;
23361
23362 /* Recurse to get the constant loaded. */
23363 if (!ix86_expand_int_movcc (operands))
23364 return false;
23365
23366 /* Mask in the interesting variable. */
23367 out = expand_binop (mode, op, var, tmp, orig_out, 0,
23368 OPTAB_WIDEN);
23369 if (!rtx_equal_p (out, orig_out))
23370 emit_move_insn (copy_rtx (orig_out), copy_rtx (out));
23371
23372 return true;
23373 }
23374
23375 /*
23376 * For comparison with above,
23377 *
23378 * movl cf,dest
23379 * movl ct,tmp
23380 * cmpl op1,op2
23381 * cmovcc tmp,dest
23382 *
23383 * Size 15.
23384 */
23385
23386 if (! nonimmediate_operand (operands[2], mode))
23387 operands[2] = force_reg (mode, operands[2]);
23388 if (! nonimmediate_operand (operands[3], mode))
23389 operands[3] = force_reg (mode, operands[3]);
23390
23391 if (! register_operand (operands[2], VOIDmode)
23392 && (mode == QImode
23393 || ! register_operand (operands[3], VOIDmode)))
23394 operands[2] = force_reg (mode, operands[2]);
23395
23396 if (mode == QImode
23397 && ! register_operand (operands[3], VOIDmode))
23398 operands[3] = force_reg (mode, operands[3]);
23399
23400 emit_insn (compare_seq);
23401 emit_insn (gen_rtx_SET (operands[0],
23402 gen_rtx_IF_THEN_ELSE (mode,
23403 compare_op, operands[2],
23404 operands[3])));
23405 return true;
23406 }
23407
23408 /* Swap, force into registers, or otherwise massage the two operands
23409 to an sse comparison with a mask result. Thus we differ a bit from
23410 ix86_prepare_fp_compare_args which expects to produce a flags result.
23411
23412 The DEST operand exists to help determine whether to commute commutative
23413 operators. The POP0/POP1 operands are updated in place. The new
23414 comparison code is returned, or UNKNOWN if not implementable. */
23415
23416 static enum rtx_code
23417 ix86_prepare_sse_fp_compare_args (rtx dest, enum rtx_code code,
23418 rtx *pop0, rtx *pop1)
23419 {
23420 switch (code)
23421 {
23422 case LTGT:
23423 case UNEQ:
23424 /* AVX supports all the needed comparisons. */
23425 if (TARGET_AVX)
23426 break;
23427 /* We have no LTGT as an operator. We could implement it with
23428 NE & ORDERED, but this requires an extra temporary. It's
23429 not clear that it's worth it. */
23430 return UNKNOWN;
23431
23432 case LT:
23433 case LE:
23434 case UNGT:
23435 case UNGE:
23436 /* These are supported directly. */
23437 break;
23438
23439 case EQ:
23440 case NE:
23441 case UNORDERED:
23442 case ORDERED:
23443 /* AVX has 3 operand comparisons, no need to swap anything. */
23444 if (TARGET_AVX)
23445 break;
23446 /* For commutative operators, try to canonicalize the destination
23447 operand to be first in the comparison - this helps reload to
23448 avoid extra moves. */
23449 if (!dest || !rtx_equal_p (dest, *pop1))
23450 break;
23451 /* FALLTHRU */
23452
23453 case GE:
23454 case GT:
23455 case UNLE:
23456 case UNLT:
23457 /* These are not supported directly before AVX, and furthermore
23458 ix86_expand_sse_fp_minmax only optimizes LT/UNGE. Swap the
23459 comparison operands to transform into something that is
23460 supported. */
23461 std::swap (*pop0, *pop1);
23462 code = swap_condition (code);
23463 break;
23464
23465 default:
23466 gcc_unreachable ();
23467 }
23468
23469 return code;
23470 }
23471
23472 /* Detect conditional moves that exactly match min/max operational
23473 semantics. Note that this is IEEE safe, as long as we don't
23474 interchange the operands.
23475
23476 Returns FALSE if this conditional move doesn't match a MIN/MAX,
23477 and TRUE if the operation is successful and instructions are emitted. */
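/* Illustrative note (not part of GCC): with code LT and matching
   operands, the pattern

     dest = (a < b) ? a : b

   is exactly what MINSS/MINPS compute: when either input is a NaN or
   both inputs are zero, the second operand is returned, which is why
   the operands must not be interchanged.  */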
23478
23479 static bool
23480 ix86_expand_sse_fp_minmax (rtx dest, enum rtx_code code, rtx cmp_op0,
23481 rtx cmp_op1, rtx if_true, rtx if_false)
23482 {
23483 machine_mode mode;
23484 bool is_min;
23485 rtx tmp;
23486
23487 if (code == LT)
23488 ;
23489 else if (code == UNGE)
23490 std::swap (if_true, if_false);
23491 else
23492 return false;
23493
23494 if (rtx_equal_p (cmp_op0, if_true) && rtx_equal_p (cmp_op1, if_false))
23495 is_min = true;
23496 else if (rtx_equal_p (cmp_op1, if_true) && rtx_equal_p (cmp_op0, if_false))
23497 is_min = false;
23498 else
23499 return false;
23500
23501 mode = GET_MODE (dest);
23502
23503 /* We want to check HONOR_NANS and HONOR_SIGNED_ZEROS here,
23504 but MODE may be a vector mode and thus not appropriate. */
23505 if (!flag_finite_math_only || flag_signed_zeros)
23506 {
23507 int u = is_min ? UNSPEC_IEEE_MIN : UNSPEC_IEEE_MAX;
23508 rtvec v;
23509
23510 if_true = force_reg (mode, if_true);
23511 v = gen_rtvec (2, if_true, if_false);
23512 tmp = gen_rtx_UNSPEC (mode, v, u);
23513 }
23514 else
23515 {
23516 code = is_min ? SMIN : SMAX;
23517 tmp = gen_rtx_fmt_ee (code, mode, if_true, if_false);
23518 }
23519
23520 emit_insn (gen_rtx_SET (dest, tmp));
23521 return true;
23522 }
23523
23524 /* Expand an sse vector comparison. Return the register with the result. */
23525
23526 static rtx
23527 ix86_expand_sse_cmp (rtx dest, enum rtx_code code, rtx cmp_op0, rtx cmp_op1,
23528 rtx op_true, rtx op_false)
23529 {
23530 machine_mode mode = GET_MODE (dest);
23531 machine_mode cmp_ops_mode = GET_MODE (cmp_op0);
23532
23533 /* In the general case the result of the comparison can have a different mode from the operands'. */
23534 machine_mode cmp_mode;
23535
23536 /* In AVX512F the result of comparison is an integer mask. */
23537 bool maskcmp = false;
23538 rtx x;
23539
23540 if (GET_MODE_SIZE (cmp_ops_mode) == 64)
23541 {
23542 cmp_mode = mode_for_size (GET_MODE_NUNITS (cmp_ops_mode), MODE_INT, 0);
23543 gcc_assert (cmp_mode != BLKmode);
23544
23545 maskcmp = true;
23546 }
23547 else
23548 cmp_mode = cmp_ops_mode;
23549
23550
23551 cmp_op0 = force_reg (cmp_ops_mode, cmp_op0);
23552 if (!nonimmediate_operand (cmp_op1, cmp_ops_mode))
23553 cmp_op1 = force_reg (cmp_ops_mode, cmp_op1);
23554
23555 if (optimize
23556 || (op_true && reg_overlap_mentioned_p (dest, op_true))
23557 || (op_false && reg_overlap_mentioned_p (dest, op_false)))
23558 dest = gen_reg_rtx (maskcmp ? cmp_mode : mode);
23559
23560 /* Compare patterns for int modes are unspec in AVX512F only. */
23561 if (maskcmp && (code == GT || code == EQ))
23562 {
23563 rtx (*gen)(rtx, rtx, rtx);
23564
23565 switch (cmp_ops_mode)
23566 {
23567 case V64QImode:
23568 gcc_assert (TARGET_AVX512BW);
23569 gen = code == GT ? gen_avx512bw_gtv64qi3 : gen_avx512bw_eqv64qi3_1;
23570 break;
23571 case V32HImode:
23572 gcc_assert (TARGET_AVX512BW);
23573 gen = code == GT ? gen_avx512bw_gtv32hi3 : gen_avx512bw_eqv32hi3_1;
23574 break;
23575 case V16SImode:
23576 gen = code == GT ? gen_avx512f_gtv16si3 : gen_avx512f_eqv16si3_1;
23577 break;
23578 case V8DImode:
23579 gen = code == GT ? gen_avx512f_gtv8di3 : gen_avx512f_eqv8di3_1;
23580 break;
23581 default:
23582 gen = NULL;
23583 }
23584
23585 if (gen)
23586 {
23587 emit_insn (gen (dest, cmp_op0, cmp_op1));
23588 return dest;
23589 }
23590 }
23591 x = gen_rtx_fmt_ee (code, cmp_mode, cmp_op0, cmp_op1);
23592
23593 if (cmp_mode != mode && !maskcmp)
23594 {
23595 x = force_reg (cmp_ops_mode, x);
23596 convert_move (dest, x, false);
23597 }
23598 else
23599 emit_insn (gen_rtx_SET (dest, x));
23600
23601 return dest;
23602 }
23603
23604 /* Expand DEST = CMP ? OP_TRUE : OP_FALSE into a sequence of logical
23605 operations. This is used for both scalar and vector conditional moves. */
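/* Illustrative sketch (not part of GCC): in the general case the
   expansion is the classic mask blend

     dest = (cmp & op_true) | (~cmp & op_false)

   where CMP holds all-ones or all-zeros per element.  A user-level
   equivalent with SSE2 intrinsics (hypothetical helper) would be:

     #include <emmintrin.h>

     static __m128i
     blend (__m128i cmp, __m128i t, __m128i f)
     {
       return _mm_or_si128 (_mm_and_si128 (cmp, t),
                            _mm_andnot_si128 (cmp, f));
     }
*/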
23606
23607 void
23608 ix86_expand_sse_movcc (rtx dest, rtx cmp, rtx op_true, rtx op_false)
23609 {
23610 machine_mode mode = GET_MODE (dest);
23611 machine_mode cmpmode = GET_MODE (cmp);
23612
23613 /* In AVX512F the result of comparison is an integer mask. */
23614 bool maskcmp = (mode != cmpmode && TARGET_AVX512F);
23615
23616 rtx t2, t3, x;
23617
23618 /* If we have an integer mask and an FP value then we need
23619 to cast the mask to the FP mode. */
23620 if (mode != cmpmode && VECTOR_MODE_P (cmpmode))
23621 {
23622 cmp = force_reg (cmpmode, cmp);
23623 cmp = gen_rtx_SUBREG (mode, cmp, 0);
23624 }
23625
23626 if (vector_all_ones_operand (op_true, mode)
23627 && rtx_equal_p (op_false, CONST0_RTX (mode))
23628 && !maskcmp)
23629 {
23630 emit_insn (gen_rtx_SET (dest, cmp));
23631 }
23632 else if (op_false == CONST0_RTX (mode)
23633 && !maskcmp)
23634 {
23635 op_true = force_reg (mode, op_true);
23636 x = gen_rtx_AND (mode, cmp, op_true);
23637 emit_insn (gen_rtx_SET (dest, x));
23638 }
23639 else if (op_true == CONST0_RTX (mode)
23640 && !maskcmp)
23641 {
23642 op_false = force_reg (mode, op_false);
23643 x = gen_rtx_NOT (mode, cmp);
23644 x = gen_rtx_AND (mode, x, op_false);
23645 emit_insn (gen_rtx_SET (dest, x));
23646 }
23647 else if (INTEGRAL_MODE_P (mode) && op_true == CONSTM1_RTX (mode)
23648 && !maskcmp)
23649 {
23650 op_false = force_reg (mode, op_false);
23651 x = gen_rtx_IOR (mode, cmp, op_false);
23652 emit_insn (gen_rtx_SET (dest, x));
23653 }
23654 else if (TARGET_XOP
23655 && !maskcmp)
23656 {
23657 op_true = force_reg (mode, op_true);
23658
23659 if (!nonimmediate_operand (op_false, mode))
23660 op_false = force_reg (mode, op_false);
23661
23662 emit_insn (gen_rtx_SET (dest, gen_rtx_IF_THEN_ELSE (mode, cmp,
23663 op_true,
23664 op_false)));
23665 }
23666 else
23667 {
23668 rtx (*gen) (rtx, rtx, rtx, rtx) = NULL;
23669 rtx d = dest;
23670
23671 if (!nonimmediate_operand (op_true, mode))
23672 op_true = force_reg (mode, op_true);
23673
23674 op_false = force_reg (mode, op_false);
23675
23676 switch (mode)
23677 {
23678 case V4SFmode:
23679 if (TARGET_SSE4_1)
23680 gen = gen_sse4_1_blendvps;
23681 break;
23682 case V2DFmode:
23683 if (TARGET_SSE4_1)
23684 gen = gen_sse4_1_blendvpd;
23685 break;
23686 case V16QImode:
23687 case V8HImode:
23688 case V4SImode:
23689 case V2DImode:
23690 if (TARGET_SSE4_1)
23691 {
23692 gen = gen_sse4_1_pblendvb;
23693 if (mode != V16QImode)
23694 d = gen_reg_rtx (V16QImode);
23695 op_false = gen_lowpart (V16QImode, op_false);
23696 op_true = gen_lowpart (V16QImode, op_true);
23697 cmp = gen_lowpart (V16QImode, cmp);
23698 }
23699 break;
23700 case V8SFmode:
23701 if (TARGET_AVX)
23702 gen = gen_avx_blendvps256;
23703 break;
23704 case V4DFmode:
23705 if (TARGET_AVX)
23706 gen = gen_avx_blendvpd256;
23707 break;
23708 case V32QImode:
23709 case V16HImode:
23710 case V8SImode:
23711 case V4DImode:
23712 if (TARGET_AVX2)
23713 {
23714 gen = gen_avx2_pblendvb;
23715 if (mode != V32QImode)
23716 d = gen_reg_rtx (V32QImode);
23717 op_false = gen_lowpart (V32QImode, op_false);
23718 op_true = gen_lowpart (V32QImode, op_true);
23719 cmp = gen_lowpart (V32QImode, cmp);
23720 }
23721 break;
23722
23723 case V64QImode:
23724 gen = gen_avx512bw_blendmv64qi;
23725 break;
23726 case V32HImode:
23727 gen = gen_avx512bw_blendmv32hi;
23728 break;
23729 case V16SImode:
23730 gen = gen_avx512f_blendmv16si;
23731 break;
23732 case V8DImode:
23733 gen = gen_avx512f_blendmv8di;
23734 break;
23735 case V8DFmode:
23736 gen = gen_avx512f_blendmv8df;
23737 break;
23738 case V16SFmode:
23739 gen = gen_avx512f_blendmv16sf;
23740 break;
23741
23742 default:
23743 break;
23744 }
23745
23746 if (gen != NULL)
23747 {
23748 emit_insn (gen (d, op_false, op_true, cmp));
23749 if (d != dest)
23750 emit_move_insn (dest, gen_lowpart (GET_MODE (dest), d));
23751 }
23752 else
23753 {
23754 op_true = force_reg (mode, op_true);
23755
23756 t2 = gen_reg_rtx (mode);
23757 if (optimize)
23758 t3 = gen_reg_rtx (mode);
23759 else
23760 t3 = dest;
23761
23762 x = gen_rtx_AND (mode, op_true, cmp);
23763 emit_insn (gen_rtx_SET (t2, x));
23764
23765 x = gen_rtx_NOT (mode, cmp);
23766 x = gen_rtx_AND (mode, x, op_false);
23767 emit_insn (gen_rtx_SET (t3, x));
23768
23769 x = gen_rtx_IOR (mode, t3, t2);
23770 emit_insn (gen_rtx_SET (dest, x));
23771 }
23772 }
23773 }
23774
23775 /* Expand a floating-point conditional move. Return true if successful. */
23776
23777 bool
23778 ix86_expand_fp_movcc (rtx operands[])
23779 {
23780 machine_mode mode = GET_MODE (operands[0]);
23781 enum rtx_code code = GET_CODE (operands[1]);
23782 rtx tmp, compare_op;
23783 rtx op0 = XEXP (operands[1], 0);
23784 rtx op1 = XEXP (operands[1], 1);
23785
23786 if (TARGET_SSE_MATH && SSE_FLOAT_MODE_P (mode))
23787 {
23788 machine_mode cmode;
23789
23790 /* Since we have no cmove for SSE registers, don't force bad register
23791 allocation just to gain access to it. Deny movcc when the
23792 comparison mode doesn't match the move mode. */
23793 cmode = GET_MODE (op0);
23794 if (cmode == VOIDmode)
23795 cmode = GET_MODE (op1);
23796 if (cmode != mode)
23797 return false;
23798
23799 code = ix86_prepare_sse_fp_compare_args (operands[0], code, &op0, &op1);
23800 if (code == UNKNOWN)
23801 return false;
23802
23803 if (ix86_expand_sse_fp_minmax (operands[0], code, op0, op1,
23804 operands[2], operands[3]))
23805 return true;
23806
23807 tmp = ix86_expand_sse_cmp (operands[0], code, op0, op1,
23808 operands[2], operands[3]);
23809 ix86_expand_sse_movcc (operands[0], tmp, operands[2], operands[3]);
23810 return true;
23811 }
23812
23813 if (GET_MODE (op0) == TImode
23814 || (GET_MODE (op0) == DImode
23815 && !TARGET_64BIT))
23816 return false;
23817
23818 /* The floating point conditional move instructions don't directly
23819 support conditions resulting from a signed integer comparison. */
23820
23821 compare_op = ix86_expand_compare (code, op0, op1);
23822 if (!fcmov_comparison_operator (compare_op, VOIDmode))
23823 {
23824 tmp = gen_reg_rtx (QImode);
23825 ix86_expand_setcc (tmp, code, op0, op1);
23826
23827 compare_op = ix86_expand_compare (NE, tmp, const0_rtx);
23828 }
23829
23830 emit_insn (gen_rtx_SET (operands[0],
23831 gen_rtx_IF_THEN_ELSE (mode, compare_op,
23832 operands[2], operands[3])));
23833
23834 return true;
23835 }
23836
23837 /* Helper for ix86_cmp_code_to_pcmp_immediate for int modes. */
23838
23839 static int
23840 ix86_int_cmp_code_to_pcmp_immediate (enum rtx_code code)
23841 {
23842 switch (code)
23843 {
23844 case EQ:
23845 return 0;
23846 case LT:
23847 case LTU:
23848 return 1;
23849 case LE:
23850 case LEU:
23851 return 2;
23852 case NE:
23853 return 4;
23854 case GE:
23855 case GEU:
23856 return 5;
23857 case GT:
23858 case GTU:
23859 return 6;
23860 default:
23861 gcc_unreachable ();
23862 }
23863 }
23864
23865 /* Helper for ix86_cmp_code_to_pcmp_immediate for fp modes. */
23866
23867 static int
23868 ix86_fp_cmp_code_to_pcmp_immediate (enum rtx_code code)
23869 {
23870 switch (code)
23871 {
23872 case EQ:
23873 return 0x00;
23874 case NE:
23875 return 0x04;
23876 case GT:
23877 return 0x0e;
23878 case LE:
23879 return 0x02;
23880 case GE:
23881 return 0x0d;
23882 case LT:
23883 return 0x01;
23884 case UNLE:
23885 return 0x0a;
23886 case UNLT:
23887 return 0x09;
23888 case UNGE:
23889 return 0x05;
23890 case UNGT:
23891 return 0x06;
23892 case UNEQ:
23893 return 0x18;
23894 case LTGT:
23895 return 0x0c;
23896 case ORDERED:
23897 return 0x07;
23898 case UNORDERED:
23899 return 0x03;
23900 default:
23901 gcc_unreachable ();
23902 }
23903 }
23904
23905 /* Return immediate value to be used in UNSPEC_PCMP
23906 for comparison CODE in MODE. */
23907
23908 static int
23909 ix86_cmp_code_to_pcmp_immediate (enum rtx_code code, machine_mode mode)
23910 {
23911 if (FLOAT_MODE_P (mode))
23912 return ix86_fp_cmp_code_to_pcmp_immediate (code);
23913 return ix86_int_cmp_code_to_pcmp_immediate (code);
23914 }
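/* Illustrative note (not part of GCC): the FP values above follow the
   VCMPPS/VCMPPD predicate encoding, e.g. LT -> 0x01 is the predicate
   known as _CMP_LT_OS, so a user-level equivalent of an LT mask
   compare would be (AVX-512F, <immintrin.h>):

     __mmask16 m = _mm512_cmp_ps_mask (a, b, 0x01);
*/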
23915
23916 /* Expand AVX-512 vector comparison. */
23917
23918 bool
23919 ix86_expand_mask_vec_cmp (rtx operands[])
23920 {
23921 machine_mode mask_mode = GET_MODE (operands[0]);
23922 machine_mode cmp_mode = GET_MODE (operands[2]);
23923 enum rtx_code code = GET_CODE (operands[1]);
23924 rtx imm = GEN_INT (ix86_cmp_code_to_pcmp_immediate (code, cmp_mode));
23925 int unspec_code;
23926 rtx unspec;
23927
23928 switch (code)
23929 {
23930 case LEU:
23931 case GTU:
23932 case GEU:
23933 case LTU:
23934 unspec_code = UNSPEC_UNSIGNED_PCMP;
23935 break;
23936
23937 default:
23938 unspec_code = UNSPEC_PCMP;
23939 }
23940
23941 unspec = gen_rtx_UNSPEC (mask_mode, gen_rtvec (3, operands[2],
23942 operands[3], imm),
23943 unspec_code);
23944 emit_insn (gen_rtx_SET (operands[0], unspec));
23945
23946 return true;
23947 }
23948
23949 /* Expand fp vector comparison. */
23950
23951 bool
23952 ix86_expand_fp_vec_cmp (rtx operands[])
23953 {
23954 enum rtx_code code = GET_CODE (operands[1]);
23955 rtx cmp;
23956
23957 code = ix86_prepare_sse_fp_compare_args (operands[0], code,
23958 &operands[2], &operands[3]);
23959 if (code == UNKNOWN)
23960 {
23961 rtx temp;
23962 switch (GET_CODE (operands[1]))
23963 {
23964 case LTGT:
23965 temp = ix86_expand_sse_cmp (operands[0], ORDERED, operands[2],
23966 operands[3], NULL, NULL);
23967 cmp = ix86_expand_sse_cmp (operands[0], NE, operands[2],
23968 operands[3], NULL, NULL);
23969 code = AND;
23970 break;
23971 case UNEQ:
23972 temp = ix86_expand_sse_cmp (operands[0], UNORDERED, operands[2],
23973 operands[3], NULL, NULL);
23974 cmp = ix86_expand_sse_cmp (operands[0], EQ, operands[2],
23975 operands[3], NULL, NULL);
23976 code = IOR;
23977 break;
23978 default:
23979 gcc_unreachable ();
23980 }
23981 cmp = expand_simple_binop (GET_MODE (cmp), code, temp, cmp, cmp, 1,
23982 OPTAB_DIRECT);
23983 }
23984 else
23985 cmp = ix86_expand_sse_cmp (operands[0], code, operands[2], operands[3],
23986 operands[1], operands[2]);
23987
23988 if (operands[0] != cmp)
23989 emit_move_insn (operands[0], cmp);
23990
23991 return true;
23992 }
23993
23994 static rtx
23995 ix86_expand_int_sse_cmp (rtx dest, enum rtx_code code, rtx cop0, rtx cop1,
23996 rtx op_true, rtx op_false, bool *negate)
23997 {
23998 machine_mode data_mode = GET_MODE (dest);
23999 machine_mode mode = GET_MODE (cop0);
24000 rtx x;
24001
24002 *negate = false;
24003
24004 /* XOP supports all of the comparisons on all 128-bit vector int types. */
24005 if (TARGET_XOP
24006 && (mode == V16QImode || mode == V8HImode
24007 || mode == V4SImode || mode == V2DImode))
24008 ;
24009 else
24010 {
24011 /* Canonicalize the comparison to EQ, GT, GTU. */
24012 switch (code)
24013 {
24014 case EQ:
24015 case GT:
24016 case GTU:
24017 break;
24018
24019 case NE:
24020 case LE:
24021 case LEU:
24022 code = reverse_condition (code);
24023 *negate = true;
24024 break;
24025
24026 case GE:
24027 case GEU:
24028 code = reverse_condition (code);
24029 *negate = true;
24030 /* FALLTHRU */
24031
24032 case LT:
24033 case LTU:
24034 std::swap (cop0, cop1);
24035 code = swap_condition (code);
24036 break;
24037
24038 default:
24039 gcc_unreachable ();
24040 }
24041
24042 /* Only SSE4.1/SSE4.2 supports V2DImode. */
24043 if (mode == V2DImode)
24044 {
24045 switch (code)
24046 {
24047 case EQ:
24048 /* SSE4.1 supports EQ. */
24049 if (!TARGET_SSE4_1)
24050 return NULL;
24051 break;
24052
24053 case GT:
24054 case GTU:
24055 /* SSE4.2 supports GT/GTU. */
24056 if (!TARGET_SSE4_2)
24057 return NULL;
24058 break;
24059
24060 default:
24061 gcc_unreachable ();
24062 }
24063 }
24064
24065 /* Unsigned parallel compare is not supported by the hardware.
24066 Play some tricks to turn this into a signed comparison
24067 against 0. */
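/* Illustrative sketch (not part of GCC): for dword/qword elements the
   code below biases both operands by the minimum signed value
   (subtraction modulo 2^N), relying on

     (unsigned) a > (unsigned) b
       <==>  (signed) (a - INT_MIN) > (signed) (b - INT_MIN)

   while for byte/word elements it uses unsigned saturating
   subtraction, since

     (unsigned) a > (unsigned) b  <==>  (a -sat b) != 0.  */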
24068 if (code == GTU)
24069 {
24070 cop0 = force_reg (mode, cop0);
24071
24072 switch (mode)
24073 {
24074 case V16SImode:
24075 case V8DImode:
24076 case V8SImode:
24077 case V4DImode:
24078 case V4SImode:
24079 case V2DImode:
24080 {
24081 rtx t1, t2, mask;
24082 rtx (*gen_sub3) (rtx, rtx, rtx);
24083
24084 switch (mode)
24085 {
24086 case V16SImode: gen_sub3 = gen_subv16si3; break;
24087 case V8DImode: gen_sub3 = gen_subv8di3; break;
24088 case V8SImode: gen_sub3 = gen_subv8si3; break;
24089 case V4DImode: gen_sub3 = gen_subv4di3; break;
24090 case V4SImode: gen_sub3 = gen_subv4si3; break;
24091 case V2DImode: gen_sub3 = gen_subv2di3; break;
24092 default:
24093 gcc_unreachable ();
24094 }
24095 /* Subtract (-(INT MAX) - 1) from both operands to make
24096 them signed. */
24097 mask = ix86_build_signbit_mask (mode, true, false);
24098 t1 = gen_reg_rtx (mode);
24099 emit_insn (gen_sub3 (t1, cop0, mask));
24100
24101 t2 = gen_reg_rtx (mode);
24102 emit_insn (gen_sub3 (t2, cop1, mask));
24103
24104 cop0 = t1;
24105 cop1 = t2;
24106 code = GT;
24107 }
24108 break;
24109
24110 case V64QImode:
24111 case V32HImode:
24112 case V32QImode:
24113 case V16HImode:
24114 case V16QImode:
24115 case V8HImode:
24116 /* Perform a parallel unsigned saturating subtraction. */
24117 x = gen_reg_rtx (mode);
24118 emit_insn (gen_rtx_SET (x, gen_rtx_US_MINUS (mode, cop0,
24119 cop1)));
24120
24121 cop0 = x;
24122 cop1 = CONST0_RTX (mode);
24123 code = EQ;
24124 *negate = !*negate;
24125 break;
24126
24127 default:
24128 gcc_unreachable ();
24129 }
24130 }
24131 }
24132
24133 if (*negate)
24134 std::swap (op_true, op_false);
24135
24136 /* Allow the comparison to be done in one mode, but the movcc to
24137 happen in another mode. */
24138 if (data_mode == mode)
24139 {
24140 x = ix86_expand_sse_cmp (dest, code, cop0, cop1,
24141 op_true, op_false);
24142 }
24143 else
24144 {
24145 gcc_assert (GET_MODE_SIZE (data_mode) == GET_MODE_SIZE (mode));
24146 x = ix86_expand_sse_cmp (gen_reg_rtx (mode), code, cop0, cop1,
24147 op_true, op_false);
24148 if (GET_MODE (x) == mode)
24149 x = gen_lowpart (data_mode, x);
24150 }
24151
24152 return x;
24153 }
24154
24155 /* Expand integer vector comparison. */
24156
24157 bool
24158 ix86_expand_int_vec_cmp (rtx operands[])
24159 {
24160 rtx_code code = GET_CODE (operands[1]);
24161 bool negate = false;
24162 rtx cmp = ix86_expand_int_sse_cmp (operands[0], code, operands[2],
24163 operands[3], NULL, NULL, &negate);
24164
24165 if (!cmp)
24166 return false;
24167
24168 if (negate)
24169 cmp = ix86_expand_int_sse_cmp (operands[0], EQ, cmp,
24170 CONST0_RTX (GET_MODE (cmp)),
24171 NULL, NULL, &negate);
24172
24173 gcc_assert (!negate);
24174
24175 if (operands[0] != cmp)
24176 emit_move_insn (operands[0], cmp);
24177
24178 return true;
24179 }
24180
24181 /* Expand a floating-point vector conditional move; a vcond operation
24182 rather than a movcc operation. */
24183
24184 bool
24185 ix86_expand_fp_vcond (rtx operands[])
24186 {
24187 enum rtx_code code = GET_CODE (operands[3]);
24188 rtx cmp;
24189
24190 code = ix86_prepare_sse_fp_compare_args (operands[0], code,
24191 &operands[4], &operands[5]);
24192 if (code == UNKNOWN)
24193 {
24194 rtx temp;
24195 switch (GET_CODE (operands[3]))
24196 {
24197 case LTGT:
24198 temp = ix86_expand_sse_cmp (operands[0], ORDERED, operands[4],
24199 operands[5], operands[0], operands[0]);
24200 cmp = ix86_expand_sse_cmp (operands[0], NE, operands[4],
24201 operands[5], operands[1], operands[2]);
24202 code = AND;
24203 break;
24204 case UNEQ:
24205 temp = ix86_expand_sse_cmp (operands[0], UNORDERED, operands[4],
24206 operands[5], operands[0], operands[0]);
24207 cmp = ix86_expand_sse_cmp (operands[0], EQ, operands[4],
24208 operands[5], operands[1], operands[2]);
24209 code = IOR;
24210 break;
24211 default:
24212 gcc_unreachable ();
24213 }
24214 cmp = expand_simple_binop (GET_MODE (cmp), code, temp, cmp, cmp, 1,
24215 OPTAB_DIRECT);
24216 ix86_expand_sse_movcc (operands[0], cmp, operands[1], operands[2]);
24217 return true;
24218 }
24219
24220 if (ix86_expand_sse_fp_minmax (operands[0], code, operands[4],
24221 operands[5], operands[1], operands[2]))
24222 return true;
24223
24224 cmp = ix86_expand_sse_cmp (operands[0], code, operands[4], operands[5],
24225 operands[1], operands[2]);
24226 ix86_expand_sse_movcc (operands[0], cmp, operands[1], operands[2]);
24227 return true;
24228 }
24229
24230 /* Expand a signed/unsigned integral vector conditional move. */
24231
24232 bool
24233 ix86_expand_int_vcond (rtx operands[])
24234 {
24235 machine_mode data_mode = GET_MODE (operands[0]);
24236 machine_mode mode = GET_MODE (operands[4]);
24237 enum rtx_code code = GET_CODE (operands[3]);
24238 bool negate = false;
24239 rtx x, cop0, cop1;
24240
24241 cop0 = operands[4];
24242 cop1 = operands[5];
24243
24244 /* Try to optimize x < 0 ? -1 : 0 into (signed) x >> 31
24245 and x < 0 ? 1 : 0 into (unsigned) x >> 31. */
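/* Illustrative note (not part of GCC): for 32-bit elements

     x < 0 ? -1 : 0   ==   (int) x >> 31        (arithmetic shift)
     x < 0 ?  1 : 0   ==   (unsigned) x >> 31   (logical shift)

   and likewise with shift count GET_MODE_UNIT_BITSIZE - 1 for the
   other element widths handled here.  */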
24246 if ((code == LT || code == GE)
24247 && data_mode == mode
24248 && cop1 == CONST0_RTX (mode)
24249 && operands[1 + (code == LT)] == CONST0_RTX (data_mode)
24250 && GET_MODE_UNIT_SIZE (data_mode) > 1
24251 && GET_MODE_UNIT_SIZE (data_mode) <= 8
24252 && (GET_MODE_SIZE (data_mode) == 16
24253 || (TARGET_AVX2 && GET_MODE_SIZE (data_mode) == 32)))
24254 {
24255 rtx negop = operands[2 - (code == LT)];
24256 int shift = GET_MODE_UNIT_BITSIZE (data_mode) - 1;
24257 if (negop == CONST1_RTX (data_mode))
24258 {
24259 rtx res = expand_simple_binop (mode, LSHIFTRT, cop0, GEN_INT (shift),
24260 operands[0], 1, OPTAB_DIRECT);
24261 if (res != operands[0])
24262 emit_move_insn (operands[0], res);
24263 return true;
24264 }
24265 else if (GET_MODE_INNER (data_mode) != DImode
24266 && vector_all_ones_operand (negop, data_mode))
24267 {
24268 rtx res = expand_simple_binop (mode, ASHIFTRT, cop0, GEN_INT (shift),
24269 operands[0], 0, OPTAB_DIRECT);
24270 if (res != operands[0])
24271 emit_move_insn (operands[0], res);
24272 return true;
24273 }
24274 }
24275
24276 if (!nonimmediate_operand (cop1, mode))
24277 cop1 = force_reg (mode, cop1);
24278 if (!general_operand (operands[1], data_mode))
24279 operands[1] = force_reg (data_mode, operands[1]);
24280 if (!general_operand (operands[2], data_mode))
24281 operands[2] = force_reg (data_mode, operands[2]);
24282
24283 x = ix86_expand_int_sse_cmp (operands[0], code, cop0, cop1,
24284 operands[1], operands[2], &negate);
24285
24286 if (!x)
24287 return false;
24288
24289 ix86_expand_sse_movcc (operands[0], x, operands[1+negate],
24290 operands[2-negate]);
24291 return true;
24292 }
24293
24294 /* AVX512F does support 64-byte integer vector operations,
24295 thus the longest vector we are faced with is V64QImode. */
24296 #define MAX_VECT_LEN 64
24297
24298 struct expand_vec_perm_d
24299 {
24300 rtx target, op0, op1;
24301 unsigned char perm[MAX_VECT_LEN];
24302 machine_mode vmode;
24303 unsigned char nelt;
24304 bool one_operand_p;
24305 bool testing_p;
24306 };
24307
24308 static bool
24309 ix86_expand_vec_perm_vpermi2 (rtx target, rtx op0, rtx mask, rtx op1,
24310 struct expand_vec_perm_d *d)
24311 {
24312 /* ix86_expand_vec_perm_vpermi2 is called from both the const and non-const
24313 expanders, so the arguments are either in D, or in OP0, OP1 etc. */
24314 machine_mode mode = GET_MODE (d ? d->op0 : op0);
24315 machine_mode maskmode = mode;
24316 rtx (*gen) (rtx, rtx, rtx, rtx) = NULL;
24317
24318 switch (mode)
24319 {
24320 case V8HImode:
24321 if (TARGET_AVX512VL && TARGET_AVX512BW)
24322 gen = gen_avx512vl_vpermi2varv8hi3;
24323 break;
24324 case V16HImode:
24325 if (TARGET_AVX512VL && TARGET_AVX512BW)
24326 gen = gen_avx512vl_vpermi2varv16hi3;
24327 break;
24328 case V64QImode:
24329 if (TARGET_AVX512VBMI)
24330 gen = gen_avx512bw_vpermi2varv64qi3;
24331 break;
24332 case V32HImode:
24333 if (TARGET_AVX512BW)
24334 gen = gen_avx512bw_vpermi2varv32hi3;
24335 break;
24336 case V4SImode:
24337 if (TARGET_AVX512VL)
24338 gen = gen_avx512vl_vpermi2varv4si3;
24339 break;
24340 case V8SImode:
24341 if (TARGET_AVX512VL)
24342 gen = gen_avx512vl_vpermi2varv8si3;
24343 break;
24344 case V16SImode:
24345 if (TARGET_AVX512F)
24346 gen = gen_avx512f_vpermi2varv16si3;
24347 break;
24348 case V4SFmode:
24349 if (TARGET_AVX512VL)
24350 {
24351 gen = gen_avx512vl_vpermi2varv4sf3;
24352 maskmode = V4SImode;
24353 }
24354 break;
24355 case V8SFmode:
24356 if (TARGET_AVX512VL)
24357 {
24358 gen = gen_avx512vl_vpermi2varv8sf3;
24359 maskmode = V8SImode;
24360 }
24361 break;
24362 case V16SFmode:
24363 if (TARGET_AVX512F)
24364 {
24365 gen = gen_avx512f_vpermi2varv16sf3;
24366 maskmode = V16SImode;
24367 }
24368 break;
24369 case V2DImode:
24370 if (TARGET_AVX512VL)
24371 gen = gen_avx512vl_vpermi2varv2di3;
24372 break;
24373 case V4DImode:
24374 if (TARGET_AVX512VL)
24375 gen = gen_avx512vl_vpermi2varv4di3;
24376 break;
24377 case V8DImode:
24378 if (TARGET_AVX512F)
24379 gen = gen_avx512f_vpermi2varv8di3;
24380 break;
24381 case V2DFmode:
24382 if (TARGET_AVX512VL)
24383 {
24384 gen = gen_avx512vl_vpermi2varv2df3;
24385 maskmode = V2DImode;
24386 }
24387 break;
24388 case V4DFmode:
24389 if (TARGET_AVX512VL)
24390 {
24391 gen = gen_avx512vl_vpermi2varv4df3;
24392 maskmode = V4DImode;
24393 }
24394 break;
24395 case V8DFmode:
24396 if (TARGET_AVX512F)
24397 {
24398 gen = gen_avx512f_vpermi2varv8df3;
24399 maskmode = V8DImode;
24400 }
24401 break;
24402 default:
24403 break;
24404 }
24405
24406 if (gen == NULL)
24407 return false;
24408
24409 /* ix86_expand_vec_perm_vpermi2 is called from both the const and non-const
24410 expanders, so the arguments are either in D, or in OP0, OP1 etc. */
24411 if (d)
24412 {
24413 rtx vec[64];
24414 target = d->target;
24415 op0 = d->op0;
24416 op1 = d->op1;
24417 for (int i = 0; i < d->nelt; ++i)
24418 vec[i] = GEN_INT (d->perm[i]);
24419 mask = gen_rtx_CONST_VECTOR (maskmode, gen_rtvec_v (d->nelt, vec));
24420 }
24421
24422 emit_insn (gen (target, op0, force_reg (maskmode, mask), op1));
24423 return true;
24424 }
24425
24426 /* Expand a variable vector permutation. */
24427
24428 void
24429 ix86_expand_vec_perm (rtx operands[])
24430 {
24431 rtx target = operands[0];
24432 rtx op0 = operands[1];
24433 rtx op1 = operands[2];
24434 rtx mask = operands[3];
24435 rtx t1, t2, t3, t4, t5, t6, t7, t8, vt, vt2, vec[32];
24436 machine_mode mode = GET_MODE (op0);
24437 machine_mode maskmode = GET_MODE (mask);
24438 int w, e, i;
24439 bool one_operand_shuffle = rtx_equal_p (op0, op1);
24440
24441 /* Number of elements in the vector. */
24442 w = GET_MODE_NUNITS (mode);
24443 e = GET_MODE_UNIT_SIZE (mode);
24444 gcc_assert (w <= 64);
24445
24446 if (TARGET_AVX512F && one_operand_shuffle)
24447 {
24448 rtx (*gen) (rtx, rtx, rtx) = NULL;
24449 switch (mode)
24450 {
24451 case V16SImode:
24452 gen = gen_avx512f_permvarv16si;
24453 break;
24454 case V16SFmode:
24455 gen = gen_avx512f_permvarv16sf;
24456 break;
24457 case V8DImode:
24458 gen = gen_avx512f_permvarv8di;
24459 break;
24460 case V8DFmode:
24461 gen = gen_avx512f_permvarv8df;
24462 break;
24463 default:
24464 break;
24465 }
24466 if (gen != NULL)
24467 {
24468 emit_insn (gen (target, op0, mask));
24469 return;
24470 }
24471 }
24472
24473 if (ix86_expand_vec_perm_vpermi2 (target, op0, mask, op1, NULL))
24474 return;
24475
24476 if (TARGET_AVX2)
24477 {
24478 if (mode == V4DImode || mode == V4DFmode || mode == V16HImode)
24479 {
24480 /* Unfortunately, the VPERMQ and VPERMPD instructions only support
24481 a constant shuffle operand. With a tiny bit of effort we can
24482 use VPERMD instead. A re-interpretation stall for V4DFmode is
24483 unfortunate but there's no avoiding it.
24484 Similarly for V16HImode we don't have instructions for variable
24485 shuffling, while for V32QImode we can, after preparing suitable
24486 masks, use vpshufb; vpshufb; vpermq; vpor. */
24487
24488 if (mode == V16HImode)
24489 {
24490 maskmode = mode = V32QImode;
24491 w = 32;
24492 e = 1;
24493 }
24494 else
24495 {
24496 maskmode = mode = V8SImode;
24497 w = 8;
24498 e = 4;
24499 }
24500 t1 = gen_reg_rtx (maskmode);
24501
24502 /* Replicate the low bits of the V4DImode mask into V8SImode:
24503 mask = { A B C D }
24504 t1 = { A A B B C C D D }. */
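/* Illustrative note (not part of GCC): a 64-bit element index k has to
   become the pair of 32-bit indices { 2*k, 2*k + 1 }; the doubling and
   the add of { 0, 1, 0, 1, ... } below compute exactly that.  */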
24505 for (i = 0; i < w / 2; ++i)
24506 vec[i*2 + 1] = vec[i*2] = GEN_INT (i * 2);
24507 vt = gen_rtx_CONST_VECTOR (maskmode, gen_rtvec_v (w, vec));
24508 vt = force_reg (maskmode, vt);
24509 mask = gen_lowpart (maskmode, mask);
24510 if (maskmode == V8SImode)
24511 emit_insn (gen_avx2_permvarv8si (t1, mask, vt));
24512 else
24513 emit_insn (gen_avx2_pshufbv32qi3 (t1, mask, vt));
24514
24515 /* Multiply the shuffle indices by two. */
24516 t1 = expand_simple_binop (maskmode, PLUS, t1, t1, t1, 1,
24517 OPTAB_DIRECT);
24518
24519 /* Add one to the odd shuffle indices:
24520 t1 = { A*2, A*2+1, B*2, B*2+1, ... }. */
24521 for (i = 0; i < w / 2; ++i)
24522 {
24523 vec[i * 2] = const0_rtx;
24524 vec[i * 2 + 1] = const1_rtx;
24525 }
24526 vt = gen_rtx_CONST_VECTOR (maskmode, gen_rtvec_v (w, vec));
24527 vt = validize_mem (force_const_mem (maskmode, vt));
24528 t1 = expand_simple_binop (maskmode, PLUS, t1, vt, t1, 1,
24529 OPTAB_DIRECT);
24530
24531 /* Continue as if V8SImode (resp. V32QImode) was used initially. */
24532 operands[3] = mask = t1;
24533 target = gen_reg_rtx (mode);
24534 op0 = gen_lowpart (mode, op0);
24535 op1 = gen_lowpart (mode, op1);
24536 }
24537
24538 switch (mode)
24539 {
24540 case V8SImode:
24541 /* The VPERMD and VPERMPS instructions already properly ignore
24542 the high bits of the shuffle elements. No need for us to
24543 perform an AND ourselves. */
24544 if (one_operand_shuffle)
24545 {
24546 emit_insn (gen_avx2_permvarv8si (target, op0, mask));
24547 if (target != operands[0])
24548 emit_move_insn (operands[0],
24549 gen_lowpart (GET_MODE (operands[0]), target));
24550 }
24551 else
24552 {
24553 t1 = gen_reg_rtx (V8SImode);
24554 t2 = gen_reg_rtx (V8SImode);
24555 emit_insn (gen_avx2_permvarv8si (t1, op0, mask));
24556 emit_insn (gen_avx2_permvarv8si (t2, op1, mask));
24557 goto merge_two;
24558 }
24559 return;
24560
24561 case V8SFmode:
24562 mask = gen_lowpart (V8SImode, mask);
24563 if (one_operand_shuffle)
24564 emit_insn (gen_avx2_permvarv8sf (target, op0, mask));
24565 else
24566 {
24567 t1 = gen_reg_rtx (V8SFmode);
24568 t2 = gen_reg_rtx (V8SFmode);
24569 emit_insn (gen_avx2_permvarv8sf (t1, op0, mask));
24570 emit_insn (gen_avx2_permvarv8sf (t2, op1, mask));
24571 goto merge_two;
24572 }
24573 return;
24574
24575 case V4SImode:
24576 /* By combining the two 128-bit input vectors into one 256-bit
24577 input vector, we can use VPERMD and VPERMPS for the full
24578 two-operand shuffle. */
24579 t1 = gen_reg_rtx (V8SImode);
24580 t2 = gen_reg_rtx (V8SImode);
24581 emit_insn (gen_avx_vec_concatv8si (t1, op0, op1));
24582 emit_insn (gen_avx_vec_concatv8si (t2, mask, mask));
24583 emit_insn (gen_avx2_permvarv8si (t1, t1, t2));
24584 emit_insn (gen_avx_vextractf128v8si (target, t1, const0_rtx));
24585 return;
24586
24587 case V4SFmode:
24588 t1 = gen_reg_rtx (V8SFmode);
24589 t2 = gen_reg_rtx (V8SImode);
24590 mask = gen_lowpart (V4SImode, mask);
24591 emit_insn (gen_avx_vec_concatv8sf (t1, op0, op1));
24592 emit_insn (gen_avx_vec_concatv8si (t2, mask, mask));
24593 emit_insn (gen_avx2_permvarv8sf (t1, t1, t2));
24594 emit_insn (gen_avx_vextractf128v8sf (target, t1, const0_rtx));
24595 return;
24596
24597 case V32QImode:
24598 t1 = gen_reg_rtx (V32QImode);
24599 t2 = gen_reg_rtx (V32QImode);
24600 t3 = gen_reg_rtx (V32QImode);
24601 vt2 = GEN_INT (-128);
24602 for (i = 0; i < 32; i++)
24603 vec[i] = vt2;
24604 vt = gen_rtx_CONST_VECTOR (V32QImode, gen_rtvec_v (32, vec));
24605 vt = force_reg (V32QImode, vt);
24606 for (i = 0; i < 32; i++)
24607 vec[i] = i < 16 ? vt2 : const0_rtx;
24608 vt2 = gen_rtx_CONST_VECTOR (V32QImode, gen_rtvec_v (32, vec));
24609 vt2 = force_reg (V32QImode, vt2);
24610 /* From mask create two adjusted masks, which contain the same
24611 bits as mask in the low 7 bits of each vector element.
24612 The first mask will have the most significant bit clear
24613 if it requests element from the same 128-bit lane
24614 and MSB set if it requests element from the other 128-bit lane.
24615 The second mask will have the opposite values of the MSB,
24616 and additionally will have its 128-bit lanes swapped.
24617 E.g. { 07 12 1e 09 ... | 17 19 05 1f ... } mask vector will have
24618 t1 { 07 92 9e 09 ... | 17 19 85 1f ... } and
24619 t3 { 97 99 05 9f ... | 87 12 1e 89 ... } where each ...
24620 stands for the other 12 bytes. */
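/* Illustrative note (not part of GCC): this relies on VPSHUFB writing
   a zero byte whenever bit 7 of the corresponding control byte is set,
   so each shuffle below only picks up bytes from the lane it can
   actually reach, and the final OR merges the two halves.  */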
24621 /* The bit that says whether an element is from the same lane or the
24622 other lane is bit 4, so shift it up by 3 to the MSB position. */
24623 t5 = gen_reg_rtx (V4DImode);
24624 emit_insn (gen_ashlv4di3 (t5, gen_lowpart (V4DImode, mask),
24625 GEN_INT (3)));
24626 /* Clear MSB bits from the mask just in case it had them set. */
24627 emit_insn (gen_avx2_andnotv32qi3 (t2, vt, mask));
24628 /* After this t1 will have MSB set for elements from other lane. */
24629 emit_insn (gen_xorv32qi3 (t1, gen_lowpart (V32QImode, t5), vt2));
24630 /* Clear bits other than MSB. */
24631 emit_insn (gen_andv32qi3 (t1, t1, vt));
24632 /* Or in the lower bits from mask into t3. */
24633 emit_insn (gen_iorv32qi3 (t3, t1, t2));
24634 /* And invert MSB bits in t1, so MSB is set for elements from the same
24635 lane. */
24636 emit_insn (gen_xorv32qi3 (t1, t1, vt));
24637 /* Swap 128-bit lanes in t3. */
24638 t6 = gen_reg_rtx (V4DImode);
24639 emit_insn (gen_avx2_permv4di_1 (t6, gen_lowpart (V4DImode, t3),
24640 const2_rtx, GEN_INT (3),
24641 const0_rtx, const1_rtx));
24642 /* And or in the lower bits from mask into t1. */
24643 emit_insn (gen_iorv32qi3 (t1, t1, t2));
24644 if (one_operand_shuffle)
24645 {
24646 /* Each of these shuffles will put 0s in places where an
24647 element from the other 128-bit lane is needed; otherwise it
24648 will shuffle in the requested value. */
24649 emit_insn (gen_avx2_pshufbv32qi3 (t3, op0,
24650 gen_lowpart (V32QImode, t6)));
24651 emit_insn (gen_avx2_pshufbv32qi3 (t1, op0, t1));
24652 /* For t3 the 128-bit lanes are swapped again. */
24653 t7 = gen_reg_rtx (V4DImode);
24654 emit_insn (gen_avx2_permv4di_1 (t7, gen_lowpart (V4DImode, t3),
24655 const2_rtx, GEN_INT (3),
24656 const0_rtx, const1_rtx));
24657 /* And oring both together leads to the result. */
24658 emit_insn (gen_iorv32qi3 (target, t1,
24659 gen_lowpart (V32QImode, t7)));
24660 if (target != operands[0])
24661 emit_move_insn (operands[0],
24662 gen_lowpart (GET_MODE (operands[0]), target));
24663 return;
24664 }
24665
24666 t4 = gen_reg_rtx (V32QImode);
24667 /* Similar to the above one_operand_shuffle code,
24668 just repeated twice for each operand. The merge_two:
24669 code below will merge the two results together. */
24670 emit_insn (gen_avx2_pshufbv32qi3 (t4, op0,
24671 gen_lowpart (V32QImode, t6)));
24672 emit_insn (gen_avx2_pshufbv32qi3 (t3, op1,
24673 gen_lowpart (V32QImode, t6)));
24674 emit_insn (gen_avx2_pshufbv32qi3 (t2, op0, t1));
24675 emit_insn (gen_avx2_pshufbv32qi3 (t1, op1, t1));
24676 t7 = gen_reg_rtx (V4DImode);
24677 emit_insn (gen_avx2_permv4di_1 (t7, gen_lowpart (V4DImode, t4),
24678 const2_rtx, GEN_INT (3),
24679 const0_rtx, const1_rtx));
24680 t8 = gen_reg_rtx (V4DImode);
24681 emit_insn (gen_avx2_permv4di_1 (t8, gen_lowpart (V4DImode, t3),
24682 const2_rtx, GEN_INT (3),
24683 const0_rtx, const1_rtx));
24684 emit_insn (gen_iorv32qi3 (t4, t2, gen_lowpart (V32QImode, t7)));
24685 emit_insn (gen_iorv32qi3 (t3, t1, gen_lowpart (V32QImode, t8)));
24686 t1 = t4;
24687 t2 = t3;
24688 goto merge_two;
24689
24690 default:
24691 gcc_assert (GET_MODE_SIZE (mode) <= 16);
24692 break;
24693 }
24694 }
24695
24696 if (TARGET_XOP)
24697 {
24698 /* The XOP VPPERM insn supports three inputs. By ignoring the
24699 one_operand_shuffle special case, we avoid creating another
24700 set of constant vectors in memory. */
24701 one_operand_shuffle = false;
24702
24703 /* mask = mask & {2*w-1, ...} */
24704 vt = GEN_INT (2*w - 1);
24705 }
24706 else
24707 {
24708 /* mask = mask & {w-1, ...} */
24709 vt = GEN_INT (w - 1);
24710 }
24711
24712 for (i = 0; i < w; i++)
24713 vec[i] = vt;
24714 vt = gen_rtx_CONST_VECTOR (maskmode, gen_rtvec_v (w, vec));
24715 mask = expand_simple_binop (maskmode, AND, mask, vt,
24716 NULL_RTX, 0, OPTAB_DIRECT);
24717
24718 /* For non-QImode operations, convert the word permutation control
24719 into a byte permutation control. */
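/* Illustrative note (not part of GCC): e.g. for V4SImode (e == 4) a
   word index k must expand to the byte indices 4k, 4k+1, 4k+2, 4k+3;
   the shift below produces 4k, the pshufb replicates it across the
   element, and the final add supplies the 0..3 offsets.  */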
24720 if (mode != V16QImode)
24721 {
24722 mask = expand_simple_binop (maskmode, ASHIFT, mask,
24723 GEN_INT (exact_log2 (e)),
24724 NULL_RTX, 0, OPTAB_DIRECT);
24725
24726 /* Convert mask to vector of chars. */
24727 mask = force_reg (V16QImode, gen_lowpart (V16QImode, mask));
24728
24729 /* Replicate each of the input bytes into byte positions:
24730 (v2di) --> {0,0,0,0,0,0,0,0, 8,8,8,8,8,8,8,8}
24731 (v4si) --> {0,0,0,0, 4,4,4,4, 8,8,8,8, 12,12,12,12}
24732 (v8hi) --> {0,0, 2,2, 4,4, 6,6, ...}. */
24733 for (i = 0; i < 16; ++i)
24734 vec[i] = GEN_INT (i/e * e);
24735 vt = gen_rtx_CONST_VECTOR (V16QImode, gen_rtvec_v (16, vec));
24736 vt = validize_mem (force_const_mem (V16QImode, vt));
24737 if (TARGET_XOP)
24738 emit_insn (gen_xop_pperm (mask, mask, mask, vt));
24739 else
24740 emit_insn (gen_ssse3_pshufbv16qi3 (mask, mask, vt));
24741
24742 /* Convert it into the byte positions by doing
24743 mask = mask + {0,1,..,16/w, 0,1,..,16/w, ...} */
24744 for (i = 0; i < 16; ++i)
24745 vec[i] = GEN_INT (i % e);
24746 vt = gen_rtx_CONST_VECTOR (V16QImode, gen_rtvec_v (16, vec));
24747 vt = validize_mem (force_const_mem (V16QImode, vt));
24748 emit_insn (gen_addv16qi3 (mask, mask, vt));
24749 }
24750
24751 /* The actual shuffle operations all operate on V16QImode. */
24752 op0 = gen_lowpart (V16QImode, op0);
24753 op1 = gen_lowpart (V16QImode, op1);
24754
24755 if (TARGET_XOP)
24756 {
24757 if (GET_MODE (target) != V16QImode)
24758 target = gen_reg_rtx (V16QImode);
24759 emit_insn (gen_xop_pperm (target, op0, op1, mask));
24760 if (target != operands[0])
24761 emit_move_insn (operands[0],
24762 gen_lowpart (GET_MODE (operands[0]), target));
24763 }
24764 else if (one_operand_shuffle)
24765 {
24766 if (GET_MODE (target) != V16QImode)
24767 target = gen_reg_rtx (V16QImode);
24768 emit_insn (gen_ssse3_pshufbv16qi3 (target, op0, mask));
24769 if (target != operands[0])
24770 emit_move_insn (operands[0],
24771 gen_lowpart (GET_MODE (operands[0]), target));
24772 }
24773 else
24774 {
24775 rtx xops[6];
24776 bool ok;
24777
24778 /* Shuffle the two input vectors independently. */
24779 t1 = gen_reg_rtx (V16QImode);
24780 t2 = gen_reg_rtx (V16QImode);
24781 emit_insn (gen_ssse3_pshufbv16qi3 (t1, op0, mask));
24782 emit_insn (gen_ssse3_pshufbv16qi3 (t2, op1, mask));
24783
24784 merge_two:
24785 /* Then merge them together. The key is whether any given control
24786 element contained a bit set that indicates the second word. */
24787 mask = operands[3];
24788 vt = GEN_INT (w);
24789 if (maskmode == V2DImode && !TARGET_SSE4_1)
24790 {
24791 /* Without SSE4.1, we don't have V2DImode EQ. Perform one
24792 more shuffle to convert the V2DI input mask into a V4SI
24793 input mask, at which point the masking that ix86_expand_int_vcond
24794 performs will work as desired. */
24795 rtx t3 = gen_reg_rtx (V4SImode);
24796 emit_insn (gen_sse2_pshufd_1 (t3, gen_lowpart (V4SImode, mask),
24797 const0_rtx, const0_rtx,
24798 const2_rtx, const2_rtx));
24799 mask = t3;
24800 maskmode = V4SImode;
24801 e = w = 4;
24802 }
24803
24804 for (i = 0; i < w; i++)
24805 vec[i] = vt;
24806 vt = gen_rtx_CONST_VECTOR (maskmode, gen_rtvec_v (w, vec));
24807 vt = force_reg (maskmode, vt);
24808 mask = expand_simple_binop (maskmode, AND, mask, vt,
24809 NULL_RTX, 0, OPTAB_DIRECT);
24810
24811 if (GET_MODE (target) != mode)
24812 target = gen_reg_rtx (mode);
24813 xops[0] = target;
24814 xops[1] = gen_lowpart (mode, t2);
24815 xops[2] = gen_lowpart (mode, t1);
24816 xops[3] = gen_rtx_EQ (maskmode, mask, vt);
24817 xops[4] = mask;
24818 xops[5] = vt;
24819 ok = ix86_expand_int_vcond (xops);
24820 gcc_assert (ok);
24821 if (target != operands[0])
24822 emit_move_insn (operands[0],
24823 gen_lowpart (GET_MODE (operands[0]), target));
24824 }
24825 }
24826
24827 /* Unpack SRC into DEST, the next wider integer vector type. UNSIGNED_P is
24828 true if we should do zero extension, else sign extension. HIGH_P is
24829 true if we want the N/2 high elements, else the low elements. */
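/* Illustrative sketch (not part of GCC): without SSE4.1 the
   zero-extending case is the classic interleave-with-zero idiom, e.g.
   for the low half of a V16QI vector with SSE2 intrinsics
   (hypothetical helper):

     #include <emmintrin.h>

     static __m128i
     zext_lo_v16qi (__m128i src)
     {
       return _mm_unpacklo_epi8 (src, _mm_setzero_si128 ());
     }

   For sign extension the zero vector is replaced by a per-element
   sign mask computed with a greater-than comparison against SRC.  */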
24830
24831 void
24832 ix86_expand_sse_unpack (rtx dest, rtx src, bool unsigned_p, bool high_p)
24833 {
24834 machine_mode imode = GET_MODE (src);
24835 rtx tmp;
24836
24837 if (TARGET_SSE4_1)
24838 {
24839 rtx (*unpack)(rtx, rtx);
24840 rtx (*extract)(rtx, rtx) = NULL;
24841 machine_mode halfmode = BLKmode;
24842
24843 switch (imode)
24844 {
24845 case V64QImode:
24846 if (unsigned_p)
24847 unpack = gen_avx512bw_zero_extendv32qiv32hi2;
24848 else
24849 unpack = gen_avx512bw_sign_extendv32qiv32hi2;
24850 halfmode = V32QImode;
24851 extract
24852 = high_p ? gen_vec_extract_hi_v64qi : gen_vec_extract_lo_v64qi;
24853 break;
24854 case V32QImode:
24855 if (unsigned_p)
24856 unpack = gen_avx2_zero_extendv16qiv16hi2;
24857 else
24858 unpack = gen_avx2_sign_extendv16qiv16hi2;
24859 halfmode = V16QImode;
24860 extract
24861 = high_p ? gen_vec_extract_hi_v32qi : gen_vec_extract_lo_v32qi;
24862 break;
24863 case V32HImode:
24864 if (unsigned_p)
24865 unpack = gen_avx512f_zero_extendv16hiv16si2;
24866 else
24867 unpack = gen_avx512f_sign_extendv16hiv16si2;
24868 halfmode = V16HImode;
24869 extract
24870 = high_p ? gen_vec_extract_hi_v32hi : gen_vec_extract_lo_v32hi;
24871 break;
24872 case V16HImode:
24873 if (unsigned_p)
24874 unpack = gen_avx2_zero_extendv8hiv8si2;
24875 else
24876 unpack = gen_avx2_sign_extendv8hiv8si2;
24877 halfmode = V8HImode;
24878 extract
24879 = high_p ? gen_vec_extract_hi_v16hi : gen_vec_extract_lo_v16hi;
24880 break;
24881 case V16SImode:
24882 if (unsigned_p)
24883 unpack = gen_avx512f_zero_extendv8siv8di2;
24884 else
24885 unpack = gen_avx512f_sign_extendv8siv8di2;
24886 halfmode = V8SImode;
24887 extract
24888 = high_p ? gen_vec_extract_hi_v16si : gen_vec_extract_lo_v16si;
24889 break;
24890 case V8SImode:
24891 if (unsigned_p)
24892 unpack = gen_avx2_zero_extendv4siv4di2;
24893 else
24894 unpack = gen_avx2_sign_extendv4siv4di2;
24895 halfmode = V4SImode;
24896 extract
24897 = high_p ? gen_vec_extract_hi_v8si : gen_vec_extract_lo_v8si;
24898 break;
24899 case V16QImode:
24900 if (unsigned_p)
24901 unpack = gen_sse4_1_zero_extendv8qiv8hi2;
24902 else
24903 unpack = gen_sse4_1_sign_extendv8qiv8hi2;
24904 break;
24905 case V8HImode:
24906 if (unsigned_p)
24907 unpack = gen_sse4_1_zero_extendv4hiv4si2;
24908 else
24909 unpack = gen_sse4_1_sign_extendv4hiv4si2;
24910 break;
24911 case V4SImode:
24912 if (unsigned_p)
24913 unpack = gen_sse4_1_zero_extendv2siv2di2;
24914 else
24915 unpack = gen_sse4_1_sign_extendv2siv2di2;
24916 break;
24917 default:
24918 gcc_unreachable ();
24919 }
24920
24921 if (GET_MODE_SIZE (imode) >= 32)
24922 {
24923 tmp = gen_reg_rtx (halfmode);
24924 emit_insn (extract (tmp, src));
24925 }
24926 else if (high_p)
24927 {
24928 /* Shift higher 8 bytes to lower 8 bytes. */
24929 tmp = gen_reg_rtx (V1TImode);
24930 emit_insn (gen_sse2_lshrv1ti3 (tmp, gen_lowpart (V1TImode, src),
24931 GEN_INT (64)));
24932 tmp = gen_lowpart (imode, tmp);
24933 }
24934 else
24935 tmp = src;
24936
24937 emit_insn (unpack (dest, tmp));
24938 }
24939 else
24940 {
24941 rtx (*unpack)(rtx, rtx, rtx);
24942
24943 switch (imode)
24944 {
24945 case V16QImode:
24946 if (high_p)
24947 unpack = gen_vec_interleave_highv16qi;
24948 else
24949 unpack = gen_vec_interleave_lowv16qi;
24950 break;
24951 case V8HImode:
24952 if (high_p)
24953 unpack = gen_vec_interleave_highv8hi;
24954 else
24955 unpack = gen_vec_interleave_lowv8hi;
24956 break;
24957 case V4SImode:
24958 if (high_p)
24959 unpack = gen_vec_interleave_highv4si;
24960 else
24961 unpack = gen_vec_interleave_lowv4si;
24962 break;
24963 default:
24964 gcc_unreachable ();
24965 }
24966
24967 if (unsigned_p)
24968 tmp = force_reg (imode, CONST0_RTX (imode));
24969 else
24970 tmp = ix86_expand_sse_cmp (gen_reg_rtx (imode), GT, CONST0_RTX (imode),
24971 src, pc_rtx, pc_rtx);
24972
24973 rtx tmp2 = gen_reg_rtx (imode);
24974 emit_insn (unpack (tmp2, src, tmp));
24975 emit_move_insn (dest, gen_lowpart (GET_MODE (dest), tmp2));
24976 }
24977 }
24978
24979 /* Expand conditional increment or decrement using adc/sbb instructions.
24980 The default case using setcc followed by a conditional move can be
24981 done by generic code. */
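/* Illustrative sketch (not part of GCC): for source code such as

     if (a < b)
       x += 1;

   with unsigned A and B, the compare sets the carry flag exactly when
   a < b, so the increment becomes a single "adc $0, x" (or "sbb" for
   the decrement case), avoiding a separate setcc or branch.  */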
24982 bool
24983 ix86_expand_int_addcc (rtx operands[])
24984 {
24985 enum rtx_code code = GET_CODE (operands[1]);
24986 rtx flags;
24987 rtx (*insn)(rtx, rtx, rtx, rtx, rtx);
24988 rtx compare_op;
24989 rtx val = const0_rtx;
24990 bool fpcmp = false;
24991 machine_mode mode;
24992 rtx op0 = XEXP (operands[1], 0);
24993 rtx op1 = XEXP (operands[1], 1);
24994
24995 if (operands[3] != const1_rtx
24996 && operands[3] != constm1_rtx)
24997 return false;
24998 if (!ix86_expand_carry_flag_compare (code, op0, op1, &compare_op))
24999 return false;
25000 code = GET_CODE (compare_op);
25001
25002 flags = XEXP (compare_op, 0);
25003
25004 if (GET_MODE (flags) == CCFPmode
25005 || GET_MODE (flags) == CCFPUmode)
25006 {
25007 fpcmp = true;
25008 code = ix86_fp_compare_code_to_integer (code);
25009 }
25010
25011 if (code != LTU)
25012 {
25013 val = constm1_rtx;
25014 if (fpcmp)
25015 PUT_CODE (compare_op,
25016 reverse_condition_maybe_unordered
25017 (GET_CODE (compare_op)));
25018 else
25019 PUT_CODE (compare_op, reverse_condition (GET_CODE (compare_op)));
25020 }
25021
25022 mode = GET_MODE (operands[0]);
25023
25024 /* Construct either adc or sbb insn. */
25025 if ((code == LTU) == (operands[3] == constm1_rtx))
25026 {
25027 switch (mode)
25028 {
25029 case QImode:
25030 insn = gen_subqi3_carry;
25031 break;
25032 case HImode:
25033 insn = gen_subhi3_carry;
25034 break;
25035 case SImode:
25036 insn = gen_subsi3_carry;
25037 break;
25038 case DImode:
25039 insn = gen_subdi3_carry;
25040 break;
25041 default:
25042 gcc_unreachable ();
25043 }
25044 }
25045 else
25046 {
25047 switch (mode)
25048 {
25049 case QImode:
25050 insn = gen_addqi3_carry;
25051 break;
25052 case HImode:
25053 insn = gen_addhi3_carry;
25054 break;
25055 case SImode:
25056 insn = gen_addsi3_carry;
25057 break;
25058 case DImode:
25059 insn = gen_adddi3_carry;
25060 break;
25061 default:
25062 gcc_unreachable ();
25063 }
25064 }
25065 emit_insn (insn (operands[0], operands[2], val, flags, compare_op));
25066
25067 return true;
25068 }
25069
25070
25071 /* Split operands 0 and 1 into half-mode parts. Similar to split_double_mode,
25072 but works for floating-point parameters and non-offsettable memories.
25073 For pushes, it returns just stack offsets; the values will be saved
25074 in the right order. At most four parts are generated. */
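/* Illustrative note (not part of GCC): on a 32-bit target a DFmode
   value splits into two SImode parts, XFmode into three and TFmode
   into four; e.g. the DFmode constant 1.0 (0x3ff0000000000000) splits
   into the parts 0x00000000 and 0x3ff00000 (low word first).  */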
25075
25076 static int
25077 ix86_split_to_parts (rtx operand, rtx *parts, machine_mode mode)
25078 {
25079 int size;
25080
25081 if (!TARGET_64BIT)
25082 size = mode==XFmode ? 3 : GET_MODE_SIZE (mode) / 4;
25083 else
25084 size = (GET_MODE_SIZE (mode) + 4) / 8;
25085
25086 gcc_assert (!REG_P (operand) || !MMX_REGNO_P (REGNO (operand)));
25087 gcc_assert (size >= 2 && size <= 4);
25088
25089 /* Optimize constant pool references to immediates. This is used by fp
25090 moves, which force all constants to memory to allow combining. */
25091 if (MEM_P (operand) && MEM_READONLY_P (operand))
25092 {
25093 rtx tmp = maybe_get_pool_constant (operand);
25094 if (tmp)
25095 operand = tmp;
25096 }
25097
25098 if (MEM_P (operand) && !offsettable_memref_p (operand))
25099 {
25100 /* The only non-offsettable memories we handle are pushes. */
25101 int ok = push_operand (operand, VOIDmode);
25102
25103 gcc_assert (ok);
25104
25105 operand = copy_rtx (operand);
25106 PUT_MODE (operand, word_mode);
25107 parts[0] = parts[1] = parts[2] = parts[3] = operand;
25108 return size;
25109 }
25110
25111 if (GET_CODE (operand) == CONST_VECTOR)
25112 {
25113 machine_mode imode = int_mode_for_mode (mode);
25114 /* Caution: if we looked through a constant pool memory above,
25115 the operand may actually have a different mode now. That's
25116 ok, since we want to pun this all the way back to an integer. */
25117 operand = simplify_subreg (imode, operand, GET_MODE (operand), 0);
25118 gcc_assert (operand != NULL);
25119 mode = imode;
25120 }
25121
25122 if (!TARGET_64BIT)
25123 {
25124 if (mode == DImode)
25125 split_double_mode (mode, &operand, 1, &parts[0], &parts[1]);
25126 else
25127 {
25128 int i;
25129
25130 if (REG_P (operand))
25131 {
25132 gcc_assert (reload_completed);
25133 for (i = 0; i < size; i++)
25134 parts[i] = gen_rtx_REG (SImode, REGNO (operand) + i);
25135 }
25136 else if (offsettable_memref_p (operand))
25137 {
25138 operand = adjust_address (operand, SImode, 0);
25139 parts[0] = operand;
25140 for (i = 1; i < size; i++)
25141 parts[i] = adjust_address (operand, SImode, 4 * i);
25142 }
25143 else if (CONST_DOUBLE_P (operand))
25144 {
25145 const REAL_VALUE_TYPE *r;
25146 long l[4];
25147
25148 r = CONST_DOUBLE_REAL_VALUE (operand);
25149 switch (mode)
25150 {
25151 case TFmode:
25152 real_to_target (l, r, mode);
25153 parts[3] = gen_int_mode (l[3], SImode);
25154 parts[2] = gen_int_mode (l[2], SImode);
25155 break;
25156 case XFmode:
25157 /* We can't use REAL_VALUE_TO_TARGET_LONG_DOUBLE since
25158 long double may not be 80-bit. */
25159 real_to_target (l, r, mode);
25160 parts[2] = gen_int_mode (l[2], SImode);
25161 break;
25162 case DFmode:
25163 REAL_VALUE_TO_TARGET_DOUBLE (*r, l);
25164 break;
25165 default:
25166 gcc_unreachable ();
25167 }
25168 parts[1] = gen_int_mode (l[1], SImode);
25169 parts[0] = gen_int_mode (l[0], SImode);
25170 }
25171 else
25172 gcc_unreachable ();
25173 }
25174 }
25175 else
25176 {
25177 if (mode == TImode)
25178 split_double_mode (mode, &operand, 1, &parts[0], &parts[1]);
25179 if (mode == XFmode || mode == TFmode)
25180 {
25181 machine_mode upper_mode = mode==XFmode ? SImode : DImode;
25182 if (REG_P (operand))
25183 {
25184 gcc_assert (reload_completed);
25185 parts[0] = gen_rtx_REG (DImode, REGNO (operand) + 0);
25186 parts[1] = gen_rtx_REG (upper_mode, REGNO (operand) + 1);
25187 }
25188 else if (offsettable_memref_p (operand))
25189 {
25190 operand = adjust_address (operand, DImode, 0);
25191 parts[0] = operand;
25192 parts[1] = adjust_address (operand, upper_mode, 8);
25193 }
25194 else if (CONST_DOUBLE_P (operand))
25195 {
25196 long l[4];
25197
25198 real_to_target (l, CONST_DOUBLE_REAL_VALUE (operand), mode);
25199
25200 /* real_to_target puts 32-bit pieces in each long. */
25201 parts[0] = gen_int_mode ((l[0] & HOST_WIDE_INT_C (0xffffffff))
25202 | ((l[1] & HOST_WIDE_INT_C (0xffffffff))
25203 << 32), DImode);
25204
25205 if (upper_mode == SImode)
25206 parts[1] = gen_int_mode (l[2], SImode);
25207 else
25208 parts[1]
25209 = gen_int_mode ((l[2] & HOST_WIDE_INT_C (0xffffffff))
25210 | ((l[3] & HOST_WIDE_INT_C (0xffffffff))
25211 << 32), DImode);
25212 }
25213 else
25214 gcc_unreachable ();
25215 }
25216 }
25217
25218 return size;
25219 }
25220
25221 /* Emit insns to perform a move or push of DI, DF, XF, and TF values;
25222 all required insns are emitted directly. Operands 2-4 contain the
25223 input values in the correct order; operands 5-7 contain the output
25224 values. */
25225
25226 void
25227 ix86_split_long_move (rtx operands[])
25228 {
25229 rtx part[2][4];
25230 int nparts, i, j;
25231 int push = 0;
25232 int collisions = 0;
25233 machine_mode mode = GET_MODE (operands[0]);
25234 bool collisionparts[4];
25235
25236 /* The DFmode expanders may ask us to move a double.
25237 For a 64-bit target this is a single move. By hiding that fact
25238 here we simplify the i386.md splitters. */
25239 if (TARGET_64BIT && GET_MODE_SIZE (GET_MODE (operands[0])) == 8)
25240 {
25241 /* Optimize constant pool references to immediates. This is used by
25242 fp moves, which force all constants to memory to allow combining. */
25243
25244 if (MEM_P (operands[1])
25245 && GET_CODE (XEXP (operands[1], 0)) == SYMBOL_REF
25246 && CONSTANT_POOL_ADDRESS_P (XEXP (operands[1], 0)))
25247 operands[1] = get_pool_constant (XEXP (operands[1], 0));
25248 if (push_operand (operands[0], VOIDmode))
25249 {
25250 operands[0] = copy_rtx (operands[0]);
25251 PUT_MODE (operands[0], word_mode);
25252 }
25253 else
25254 operands[0] = gen_lowpart (DImode, operands[0]);
25255 operands[1] = gen_lowpart (DImode, operands[1]);
25256 emit_move_insn (operands[0], operands[1]);
25257 return;
25258 }
25259
25260 /* The only non-offsettable memory we handle is push. */
25261 if (push_operand (operands[0], VOIDmode))
25262 push = 1;
25263 else
25264 gcc_assert (!MEM_P (operands[0])
25265 || offsettable_memref_p (operands[0]));
25266
25267 nparts = ix86_split_to_parts (operands[1], part[1], GET_MODE (operands[0]));
25268 ix86_split_to_parts (operands[0], part[0], GET_MODE (operands[0]));
25269
25270 /* When emitting a push, take care of source operands that live on the stack. */
25271 if (push && MEM_P (operands[1])
25272 && reg_overlap_mentioned_p (stack_pointer_rtx, operands[1]))
25273 {
25274 rtx src_base = XEXP (part[1][nparts - 1], 0);
25275
25276 /* Compensate for the stack decrement by 4. */
25277 if (!TARGET_64BIT && nparts == 3
25278 && mode == XFmode && TARGET_128BIT_LONG_DOUBLE)
25279 src_base = plus_constant (Pmode, src_base, 4);
25280
25281 /* src_base refers to the stack pointer and is
25282 automatically decreased by each emitted push. */
25283 for (i = 0; i < nparts; i++)
25284 part[1][i] = change_address (part[1][i],
25285 GET_MODE (part[1][i]), src_base);
25286 }
25287
25288 /* We need to do the copy in the right order in case an address register
25289 of the source overlaps the destination. */
25290 if (REG_P (part[0][0]) && MEM_P (part[1][0]))
25291 {
25292 rtx tmp;
25293
25294 for (i = 0; i < nparts; i++)
25295 {
25296 collisionparts[i]
25297 = reg_overlap_mentioned_p (part[0][i], XEXP (part[1][0], 0));
25298 if (collisionparts[i])
25299 collisions++;
25300 }
25301
25302 /* Collision in the middle part can be handled by reordering. */
25303 if (collisions == 1 && nparts == 3 && collisionparts [1])
25304 {
25305 std::swap (part[0][1], part[0][2]);
25306 std::swap (part[1][1], part[1][2]);
25307 }
25308 else if (collisions == 1
25309 && nparts == 4
25310 && (collisionparts [1] || collisionparts [2]))
25311 {
25312 if (collisionparts [1])
25313 {
25314 std::swap (part[0][1], part[0][2]);
25315 std::swap (part[1][1], part[1][2]);
25316 }
25317 else
25318 {
25319 std::swap (part[0][2], part[0][3]);
25320 std::swap (part[1][2], part[1][3]);
25321 }
25322 }
25323
25324 /* If there are more collisions, we can't handle it by reordering.
25325 Do an lea to the last part and use only one colliding move. */
25326 else if (collisions > 1)
25327 {
25328 rtx base, addr, tls_base = NULL_RTX;
25329
25330 collisions = 1;
25331
25332 base = part[0][nparts - 1];
25333
25334 /* Handle the case when the last part isn't valid for lea.
25335 Happens in 64-bit mode storing the 12-byte XFmode. */
25336 if (GET_MODE (base) != Pmode)
25337 base = gen_rtx_REG (Pmode, REGNO (base));
25338
25339 addr = XEXP (part[1][0], 0);
25340 if (TARGET_TLS_DIRECT_SEG_REFS)
25341 {
25342 struct ix86_address parts;
25343 int ok = ix86_decompose_address (addr, &parts);
25344 gcc_assert (ok);
25345 if (parts.seg == DEFAULT_TLS_SEG_REG)
25346 {
25347 /* It is not valid to use %gs: or %fs: in
25348 lea though, so we need to remove it from the
25349 address used for lea and add it back to each
25350 individual memory load instead. */
25351 addr = copy_rtx (addr);
25352 rtx *x = &addr;
25353 while (GET_CODE (*x) == PLUS)
25354 {
25355 for (i = 0; i < 2; i++)
25356 {
25357 rtx u = XEXP (*x, i);
25358 if (GET_CODE (u) == ZERO_EXTEND)
25359 u = XEXP (u, 0);
25360 if (GET_CODE (u) == UNSPEC
25361 && XINT (u, 1) == UNSPEC_TP)
25362 {
25363 tls_base = XEXP (*x, i);
25364 *x = XEXP (*x, 1 - i);
25365 break;
25366 }
25367 }
25368 if (tls_base)
25369 break;
25370 x = &XEXP (*x, 0);
25371 }
25372 gcc_assert (tls_base);
25373 }
25374 }
25375 emit_insn (gen_rtx_SET (base, addr));
25376 if (tls_base)
25377 base = gen_rtx_PLUS (GET_MODE (base), base, tls_base);
25378 part[1][0] = replace_equiv_address (part[1][0], base);
25379 for (i = 1; i < nparts; i++)
25380 {
25381 if (tls_base)
25382 base = copy_rtx (base);
25383 tmp = plus_constant (Pmode, base, UNITS_PER_WORD * i);
25384 part[1][i] = replace_equiv_address (part[1][i], tmp);
25385 }
25386 }
25387 }
25388
25389 if (push)
25390 {
25391 if (!TARGET_64BIT)
25392 {
25393 if (nparts == 3)
25394 {
25395 if (TARGET_128BIT_LONG_DOUBLE && mode == XFmode)
25396 emit_insn (ix86_gen_add3 (stack_pointer_rtx,
25397 stack_pointer_rtx, GEN_INT (-4)));
25398 emit_move_insn (part[0][2], part[1][2]);
25399 }
25400 else if (nparts == 4)
25401 {
25402 emit_move_insn (part[0][3], part[1][3]);
25403 emit_move_insn (part[0][2], part[1][2]);
25404 }
25405 }
25406 else
25407 {
25408 /* In 64-bit mode we don't have a 32-bit push available.  If the operand
25409 is a register, that is OK - we just use the larger counterpart.  We also
25410 retype memory - such operands come from an attempt to avoid a REX prefix
25411 when moving the second half of a TFmode value. */
25412 if (GET_MODE (part[1][1]) == SImode)
25413 {
25414 switch (GET_CODE (part[1][1]))
25415 {
25416 case MEM:
25417 part[1][1] = adjust_address (part[1][1], DImode, 0);
25418 break;
25419
25420 case REG:
25421 part[1][1] = gen_rtx_REG (DImode, REGNO (part[1][1]));
25422 break;
25423
25424 default:
25425 gcc_unreachable ();
25426 }
25427
25428 if (GET_MODE (part[1][0]) == SImode)
25429 part[1][0] = part[1][1];
25430 }
25431 }
25432 emit_move_insn (part[0][1], part[1][1]);
25433 emit_move_insn (part[0][0], part[1][0]);
25434 return;
25435 }
25436
25437 /* Choose the correct order so we do not overwrite the source before it is copied. */
25438 if ((REG_P (part[0][0])
25439 && REG_P (part[1][1])
25440 && (REGNO (part[0][0]) == REGNO (part[1][1])
25441 || (nparts == 3
25442 && REGNO (part[0][0]) == REGNO (part[1][2]))
25443 || (nparts == 4
25444 && REGNO (part[0][0]) == REGNO (part[1][3]))))
25445 || (collisions > 0
25446 && reg_overlap_mentioned_p (part[0][0], XEXP (part[1][0], 0))))
25447 {
25448 for (i = 0, j = nparts - 1; i < nparts; i++, j--)
25449 {
25450 operands[2 + i] = part[0][j];
25451 operands[6 + i] = part[1][j];
25452 }
25453 }
25454 else
25455 {
25456 for (i = 0; i < nparts; i++)
25457 {
25458 operands[2 + i] = part[0][i];
25459 operands[6 + i] = part[1][i];
25460 }
25461 }
25462
25463 /* If optimizing for size, attempt to locally unCSE nonzero constants. */
25464 if (optimize_insn_for_size_p ())
25465 {
25466 for (j = 0; j < nparts - 1; j++)
25467 if (CONST_INT_P (operands[6 + j])
25468 && operands[6 + j] != const0_rtx
25469 && REG_P (operands[2 + j]))
25470 for (i = j; i < nparts - 1; i++)
25471 if (CONST_INT_P (operands[7 + i])
25472 && INTVAL (operands[7 + i]) == INTVAL (operands[6 + j]))
25473 operands[7 + i] = operands[2 + j];
25474 }
25475
25476 for (i = 0; i < nparts; i++)
25477 emit_move_insn (operands[2 + i], operands[6 + i]);
25478
25479 return;
25480 }
25481
25482 /* Helper function of ix86_split_ashl used to generate an SImode/DImode
25483 left shift by a constant, either using a single shift or
25484 a sequence of add instructions. */
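/* As an illustration only (register names assumed, not fixed by this
   code): shifting an SImode half held in %eax left by 2, on a target
   where two adds are cheaper than a constant shift, expands to roughly

	addl	%eax, %eax
	addl	%eax, %eax

   and otherwise to a single

	sall	$2, %eax  */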
25485
25486 static void
25487 ix86_expand_ashl_const (rtx operand, int count, machine_mode mode)
25488 {
25489 rtx (*insn)(rtx, rtx, rtx);
25490
25491 if (count == 1
25492 || (count * ix86_cost->add <= ix86_cost->shift_const
25493 && !optimize_insn_for_size_p ()))
25494 {
25495 insn = mode == DImode ? gen_addsi3 : gen_adddi3;
25496 while (count-- > 0)
25497 emit_insn (insn (operand, operand, operand));
25498 }
25499 else
25500 {
25501 insn = mode == DImode ? gen_ashlsi3 : gen_ashldi3;
25502 emit_insn (insn (operand, operand, GEN_INT (count)));
25503 }
25504 }
25505
25506 void
25507 ix86_split_ashl (rtx *operands, rtx scratch, machine_mode mode)
25508 {
25509 rtx (*gen_ashl3)(rtx, rtx, rtx);
25510 rtx (*gen_shld)(rtx, rtx, rtx);
25511 int half_width = GET_MODE_BITSIZE (mode) >> 1;
25512
25513 rtx low[2], high[2];
25514 int count;
25515
25516 if (CONST_INT_P (operands[2]))
25517 {
25518 split_double_mode (mode, operands, 2, low, high);
25519 count = INTVAL (operands[2]) & (GET_MODE_BITSIZE (mode) - 1);
25520
25521 if (count >= half_width)
25522 {
25523 emit_move_insn (high[0], low[1]);
25524 emit_move_insn (low[0], const0_rtx);
25525
25526 if (count > half_width)
25527 ix86_expand_ashl_const (high[0], count - half_width, mode);
25528 }
25529 else
25530 {
25531 gen_shld = mode == DImode ? gen_x86_shld : gen_x86_64_shld;
25532
25533 if (!rtx_equal_p (operands[0], operands[1]))
25534 emit_move_insn (operands[0], operands[1]);
25535
25536 emit_insn (gen_shld (high[0], low[0], GEN_INT (count)));
25537 ix86_expand_ashl_const (low[0], count, mode);
25538 }
25539 return;
25540 }
25541
25542 split_double_mode (mode, operands, 1, low, high);
25543
25544 gen_ashl3 = mode == DImode ? gen_ashlsi3 : gen_ashldi3;
25545
25546 if (operands[1] == const1_rtx)
25547 {
25548 /* Assuming we've chosen QImode-capable registers, 1 << N
25549 can be done with two 32/64-bit shifts, no branches, no cmoves. */
25550 if (ANY_QI_REG_P (low[0]) && ANY_QI_REG_P (high[0]))
25551 {
25552 rtx s, d, flags = gen_rtx_REG (CCZmode, FLAGS_REG);
25553
25554 ix86_expand_clear (low[0]);
25555 ix86_expand_clear (high[0]);
25556 emit_insn (gen_testqi_ccz_1 (operands[2], GEN_INT (half_width)));
25557
25558 d = gen_lowpart (QImode, low[0]);
25559 d = gen_rtx_STRICT_LOW_PART (VOIDmode, d);
25560 s = gen_rtx_EQ (QImode, flags, const0_rtx);
25561 emit_insn (gen_rtx_SET (d, s));
25562
25563 d = gen_lowpart (QImode, high[0]);
25564 d = gen_rtx_STRICT_LOW_PART (VOIDmode, d);
25565 s = gen_rtx_NE (QImode, flags, const0_rtx);
25566 emit_insn (gen_rtx_SET (d, s));
25567 }
25568
25569 /* Otherwise, we can get the same results by manually performing
25570 a bit extract operation on bit 5/6, and then performing the two
25571 shifts. The two methods of getting 0/1 into low/high are exactly
25572 the same size. Avoiding the shift in the bit extract case helps
25573 pentium4 a bit; no one else seems to care much either way. */
25574 else
25575 {
25576 machine_mode half_mode;
25577 rtx (*gen_lshr3)(rtx, rtx, rtx);
25578 rtx (*gen_and3)(rtx, rtx, rtx);
25579 rtx (*gen_xor3)(rtx, rtx, rtx);
25580 HOST_WIDE_INT bits;
25581 rtx x;
25582
25583 if (mode == DImode)
25584 {
25585 half_mode = SImode;
25586 gen_lshr3 = gen_lshrsi3;
25587 gen_and3 = gen_andsi3;
25588 gen_xor3 = gen_xorsi3;
25589 bits = 5;
25590 }
25591 else
25592 {
25593 half_mode = DImode;
25594 gen_lshr3 = gen_lshrdi3;
25595 gen_and3 = gen_anddi3;
25596 gen_xor3 = gen_xordi3;
25597 bits = 6;
25598 }
25599
25600 if (TARGET_PARTIAL_REG_STALL && !optimize_insn_for_size_p ())
25601 x = gen_rtx_ZERO_EXTEND (half_mode, operands[2]);
25602 else
25603 x = gen_lowpart (half_mode, operands[2]);
25604 emit_insn (gen_rtx_SET (high[0], x));
25605
25606 emit_insn (gen_lshr3 (high[0], high[0], GEN_INT (bits)));
25607 emit_insn (gen_and3 (high[0], high[0], const1_rtx));
25608 emit_move_insn (low[0], high[0]);
25609 emit_insn (gen_xor3 (low[0], low[0], const1_rtx));
25610 }
25611
25612 emit_insn (gen_ashl3 (low[0], low[0], operands[2]));
25613 emit_insn (gen_ashl3 (high[0], high[0], operands[2]));
25614 return;
25615 }
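/* Sketch of the flag-based 1 << N expansion above on a 32-bit target
   (DImode result assumed in %eax:%edx, count in %ecx; the actual
   registers depend on allocation):

	xorl	%eax, %eax
	xorl	%edx, %edx
	testb	$32, %cl
	sete	%al
	setne	%dl
	sall	%cl, %eax
	sall	%cl, %edx

   The final shifts use only the low five bits of %cl, so the 0/1
   selected by sete/setne lands in the correct half.  */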
25616
25617 if (operands[1] == constm1_rtx)
25618 {
25619 /* For -1 << N, we can avoid the shld instruction, because we
25620 know that we're shifting 0...31/63 ones into a -1. */
25621 emit_move_insn (low[0], constm1_rtx);
25622 if (optimize_insn_for_size_p ())
25623 emit_move_insn (high[0], low[0]);
25624 else
25625 emit_move_insn (high[0], constm1_rtx);
25626 }
25627 else
25628 {
25629 gen_shld = mode == DImode ? gen_x86_shld : gen_x86_64_shld;
25630
25631 if (!rtx_equal_p (operands[0], operands[1]))
25632 emit_move_insn (operands[0], operands[1]);
25633
25634 split_double_mode (mode, operands, 1, low, high);
25635 emit_insn (gen_shld (high[0], low[0], operands[2]));
25636 }
25637
25638 emit_insn (gen_ashl3 (low[0], low[0], operands[2]));
25639
25640 if (TARGET_CMOVE && scratch)
25641 {
25642 rtx (*gen_x86_shift_adj_1)(rtx, rtx, rtx, rtx)
25643 = mode == DImode ? gen_x86_shiftsi_adj_1 : gen_x86_shiftdi_adj_1;
25644
25645 ix86_expand_clear (scratch);
25646 emit_insn (gen_x86_shift_adj_1 (high[0], low[0], operands[2], scratch));
25647 }
25648 else
25649 {
25650 rtx (*gen_x86_shift_adj_2)(rtx, rtx, rtx)
25651 = mode == DImode ? gen_x86_shiftsi_adj_2 : gen_x86_shiftdi_adj_2;
25652
25653 emit_insn (gen_x86_shift_adj_2 (high[0], low[0], operands[2]));
25654 }
25655 }
25656
25657 void
25658 ix86_split_ashr (rtx *operands, rtx scratch, machine_mode mode)
25659 {
25660 rtx (*gen_ashr3)(rtx, rtx, rtx)
25661 = mode == DImode ? gen_ashrsi3 : gen_ashrdi3;
25662 rtx (*gen_shrd)(rtx, rtx, rtx);
25663 int half_width = GET_MODE_BITSIZE (mode) >> 1;
25664
25665 rtx low[2], high[2];
25666 int count;
25667
25668 if (CONST_INT_P (operands[2]))
25669 {
25670 split_double_mode (mode, operands, 2, low, high);
25671 count = INTVAL (operands[2]) & (GET_MODE_BITSIZE (mode) - 1);
25672
25673 if (count == GET_MODE_BITSIZE (mode) - 1)
25674 {
25675 emit_move_insn (high[0], high[1]);
25676 emit_insn (gen_ashr3 (high[0], high[0],
25677 GEN_INT (half_width - 1)));
25678 emit_move_insn (low[0], high[0]);
25679
25680 }
25681 else if (count >= half_width)
25682 {
25683 emit_move_insn (low[0], high[1]);
25684 emit_move_insn (high[0], low[0]);
25685 emit_insn (gen_ashr3 (high[0], high[0],
25686 GEN_INT (half_width - 1)));
25687
25688 if (count > half_width)
25689 emit_insn (gen_ashr3 (low[0], low[0],
25690 GEN_INT (count - half_width)));
25691 }
25692 else
25693 {
25694 gen_shrd = mode == DImode ? gen_x86_shrd : gen_x86_64_shrd;
25695
25696 if (!rtx_equal_p (operands[0], operands[1]))
25697 emit_move_insn (operands[0], operands[1]);
25698
25699 emit_insn (gen_shrd (low[0], high[0], GEN_INT (count)));
25700 emit_insn (gen_ashr3 (high[0], high[0], GEN_INT (count)));
25701 }
25702 }
25703 else
25704 {
25705 gen_shrd = mode == DImode ? gen_x86_shrd : gen_x86_64_shrd;
25706
25707 if (!rtx_equal_p (operands[0], operands[1]))
25708 emit_move_insn (operands[0], operands[1]);
25709
25710 split_double_mode (mode, operands, 1, low, high);
25711
25712 emit_insn (gen_shrd (low[0], high[0], operands[2]));
25713 emit_insn (gen_ashr3 (high[0], high[0], operands[2]));
25714
25715 if (TARGET_CMOVE && scratch)
25716 {
25717 rtx (*gen_x86_shift_adj_1)(rtx, rtx, rtx, rtx)
25718 = mode == DImode ? gen_x86_shiftsi_adj_1 : gen_x86_shiftdi_adj_1;
25719
25720 emit_move_insn (scratch, high[0]);
25721 emit_insn (gen_ashr3 (scratch, scratch,
25722 GEN_INT (half_width - 1)));
25723 emit_insn (gen_x86_shift_adj_1 (low[0], high[0], operands[2],
25724 scratch));
25725 }
25726 else
25727 {
25728 rtx (*gen_x86_shift_adj_3)(rtx, rtx, rtx)
25729 = mode == DImode ? gen_x86_shiftsi_adj_3 : gen_x86_shiftdi_adj_3;
25730
25731 emit_insn (gen_x86_shift_adj_3 (low[0], high[0], operands[2]));
25732 }
25733 }
25734 }
25735
25736 void
25737 ix86_split_lshr (rtx *operands, rtx scratch, machine_mode mode)
25738 {
25739 rtx (*gen_lshr3)(rtx, rtx, rtx)
25740 = mode == DImode ? gen_lshrsi3 : gen_lshrdi3;
25741 rtx (*gen_shrd)(rtx, rtx, rtx);
25742 int half_width = GET_MODE_BITSIZE (mode) >> 1;
25743
25744 rtx low[2], high[2];
25745 int count;
25746
25747 if (CONST_INT_P (operands[2]))
25748 {
25749 split_double_mode (mode, operands, 2, low, high);
25750 count = INTVAL (operands[2]) & (GET_MODE_BITSIZE (mode) - 1);
25751
25752 if (count >= half_width)
25753 {
25754 emit_move_insn (low[0], high[1]);
25755 ix86_expand_clear (high[0]);
25756
25757 if (count > half_width)
25758 emit_insn (gen_lshr3 (low[0], low[0],
25759 GEN_INT (count - half_width)));
25760 }
25761 else
25762 {
25763 gen_shrd = mode == DImode ? gen_x86_shrd : gen_x86_64_shrd;
25764
25765 if (!rtx_equal_p (operands[0], operands[1]))
25766 emit_move_insn (operands[0], operands[1]);
25767
25768 emit_insn (gen_shrd (low[0], high[0], GEN_INT (count)));
25769 emit_insn (gen_lshr3 (high[0], high[0], GEN_INT (count)));
25770 }
25771 }
25772 else
25773 {
25774 gen_shrd = mode == DImode ? gen_x86_shrd : gen_x86_64_shrd;
25775
25776 if (!rtx_equal_p (operands[0], operands[1]))
25777 emit_move_insn (operands[0], operands[1]);
25778
25779 split_double_mode (mode, operands, 1, low, high);
25780
25781 emit_insn (gen_shrd (low[0], high[0], operands[2]));
25782 emit_insn (gen_lshr3 (high[0], high[0], operands[2]));
25783
25784 if (TARGET_CMOVE && scratch)
25785 {
25786 rtx (*gen_x86_shift_adj_1)(rtx, rtx, rtx, rtx)
25787 = mode == DImode ? gen_x86_shiftsi_adj_1 : gen_x86_shiftdi_adj_1;
25788
25789 ix86_expand_clear (scratch);
25790 emit_insn (gen_x86_shift_adj_1 (low[0], high[0], operands[2],
25791 scratch));
25792 }
25793 else
25794 {
25795 rtx (*gen_x86_shift_adj_2)(rtx, rtx, rtx)
25796 = mode == DImode ? gen_x86_shiftsi_adj_2 : gen_x86_shiftdi_adj_2;
25797
25798 emit_insn (gen_x86_shift_adj_2 (low[0], high[0], operands[2]));
25799 }
25800 }
25801 }
25802
25803 /* Predict the just-emitted jump instruction to be taken with probability PROB. */
25804 static void
25805 predict_jump (int prob)
25806 {
25807 rtx insn = get_last_insn ();
25808 gcc_assert (JUMP_P (insn));
25809 add_int_reg_note (insn, REG_BR_PROB, prob);
25810 }
25811
25812 /* Helper function for the string operations below.  Test whether VARIABLE
25813 is aligned to VALUE bytes.  If so, jump to the returned label. */
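/* For example (illustrative registers only), ix86_expand_aligntest
   (count, 4, true) with COUNT in %ecx emits roughly

	movl	%ecx, %eax
	andl	$4, %eax
	je	.Lskip

   and returns .Lskip, so the caller can place the 4-byte copy between
   the jump and the label.  */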
25814 static rtx_code_label *
25815 ix86_expand_aligntest (rtx variable, int value, bool epilogue)
25816 {
25817 rtx_code_label *label = gen_label_rtx ();
25818 rtx tmpcount = gen_reg_rtx (GET_MODE (variable));
25819 if (GET_MODE (variable) == DImode)
25820 emit_insn (gen_anddi3 (tmpcount, variable, GEN_INT (value)));
25821 else
25822 emit_insn (gen_andsi3 (tmpcount, variable, GEN_INT (value)));
25823 emit_cmp_and_jump_insns (tmpcount, const0_rtx, EQ, 0, GET_MODE (variable),
25824 1, label);
25825 if (epilogue)
25826 predict_jump (REG_BR_PROB_BASE * 50 / 100);
25827 else
25828 predict_jump (REG_BR_PROB_BASE * 90 / 100);
25829 return label;
25830 }
25831
25832 /* Adjust COUNTREG by subtracting VALUE from it. */
25833 static void
25834 ix86_adjust_counter (rtx countreg, HOST_WIDE_INT value)
25835 {
25836 rtx (*gen_add)(rtx, rtx, rtx)
25837 = GET_MODE (countreg) == DImode ? gen_adddi3 : gen_addsi3;
25838
25839 emit_insn (gen_add (countreg, countreg, GEN_INT (-value)));
25840 }
25841
25842 /* Zero extend the possibly SImode EXP to a Pmode register. */
25843 rtx
25844 ix86_zero_extend_to_Pmode (rtx exp)
25845 {
25846 return force_reg (Pmode, convert_to_mode (Pmode, exp, 1));
25847 }
25848
25849 /* Divide COUNTREG by SCALE. */
25850 static rtx
25851 scale_counter (rtx countreg, int scale)
25852 {
25853 rtx sc;
25854
25855 if (scale == 1)
25856 return countreg;
25857 if (CONST_INT_P (countreg))
25858 return GEN_INT (INTVAL (countreg) / scale);
25859 gcc_assert (REG_P (countreg));
25860
25861 sc = expand_simple_binop (GET_MODE (countreg), LSHIFTRT, countreg,
25862 GEN_INT (exact_log2 (scale)),
25863 NULL, 1, OPTAB_DIRECT);
25864 return sc;
25865 }
25866
25867 /* Return mode for the memcpy/memset loop counter. Prefer SImode over
25868 DImode for constant loop counts. */
25869
25870 static machine_mode
25871 counter_mode (rtx count_exp)
25872 {
25873 if (GET_MODE (count_exp) != VOIDmode)
25874 return GET_MODE (count_exp);
25875 if (!CONST_INT_P (count_exp))
25876 return Pmode;
25877 if (TARGET_64BIT && (INTVAL (count_exp) & ~0xffffffff))
25878 return DImode;
25879 return SImode;
25880 }
25881
25882 /* Copy the address to a Pmode register. This is used for x32 to
25883 truncate DImode TLS address to a SImode register. */
25884
25885 static rtx
25886 ix86_copy_addr_to_reg (rtx addr)
25887 {
25888 rtx reg;
25889 if (GET_MODE (addr) == Pmode || GET_MODE (addr) == VOIDmode)
25890 {
25891 reg = copy_addr_to_reg (addr);
25892 REG_POINTER (reg) = 1;
25893 return reg;
25894 }
25895 else
25896 {
25897 gcc_assert (GET_MODE (addr) == DImode && Pmode == SImode);
25898 reg = copy_to_mode_reg (DImode, addr);
25899 REG_POINTER (reg) = 1;
25900 return gen_rtx_SUBREG (SImode, reg, 0);
25901 }
25902 }
25903
25904 /* When ISSETMEM is FALSE, output a simple loop to move memory pointed to by
25905 SRCPTR to DESTPTR in chunks of MODE, unrolled UNROLL times; the overall size
25906 is COUNT, specified in bytes. When ISSETMEM is TRUE, output the equivalent
25907 loop to set memory to VALUE (supposed to be in MODE).
25908
25909 The size is rounded down to a whole number of chunks moved at once.
25910 SRCMEM and DESTMEM provide MEM rtxes to feed proper aliasing info. */
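/* In pseudocode (assuming ISSETMEM false and UNROLL == 1; names are
   descriptive only), the generated code has roughly this shape:

	size = count & ~(piece_size - 1);
	iter = 0;
     top:
	*(destptr + iter) = *(srcptr + iter);   (one chunk of MODE)
	iter += piece_size;
	if (iter < size) goto top;
	destptr += iter;
	srcptr += iter;
     out:  */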
25911
25912
25913 static void
25914 expand_set_or_movmem_via_loop (rtx destmem, rtx srcmem,
25915 rtx destptr, rtx srcptr, rtx value,
25916 rtx count, machine_mode mode, int unroll,
25917 int expected_size, bool issetmem)
25918 {
25919 rtx_code_label *out_label, *top_label;
25920 rtx iter, tmp;
25921 machine_mode iter_mode = counter_mode (count);
25922 int piece_size_n = GET_MODE_SIZE (mode) * unroll;
25923 rtx piece_size = GEN_INT (piece_size_n);
25924 rtx piece_size_mask = GEN_INT (~((GET_MODE_SIZE (mode) * unroll) - 1));
25925 rtx size;
25926 int i;
25927
25928 top_label = gen_label_rtx ();
25929 out_label = gen_label_rtx ();
25930 iter = gen_reg_rtx (iter_mode);
25931
25932 size = expand_simple_binop (iter_mode, AND, count, piece_size_mask,
25933 NULL, 1, OPTAB_DIRECT);
25934 /* Those two should combine. */
25935 if (piece_size == const1_rtx)
25936 {
25937 emit_cmp_and_jump_insns (size, const0_rtx, EQ, NULL_RTX, iter_mode,
25938 true, out_label);
25939 predict_jump (REG_BR_PROB_BASE * 10 / 100);
25940 }
25941 emit_move_insn (iter, const0_rtx);
25942
25943 emit_label (top_label);
25944
25945 tmp = convert_modes (Pmode, iter_mode, iter, true);
25946
25947 /* This assert could be relaxed - in that case we would need to compute
25948 the smallest power of two containing PIECE_SIZE_N and pass it to
25949 offset_address. */
25950 gcc_assert ((piece_size_n & (piece_size_n - 1)) == 0);
25951 destmem = offset_address (destmem, tmp, piece_size_n);
25952 destmem = adjust_address (destmem, mode, 0);
25953
25954 if (!issetmem)
25955 {
25956 srcmem = offset_address (srcmem, copy_rtx (tmp), piece_size_n);
25957 srcmem = adjust_address (srcmem, mode, 0);
25958
25959 /* When unrolling for chips that reorder memory reads and writes,
25960 we can save registers by using a single temporary.
25961 Also, using 4 temporaries is overkill in 32-bit mode. */
25962 if (!TARGET_64BIT && 0)
25963 {
25964 for (i = 0; i < unroll; i++)
25965 {
25966 if (i)
25967 {
25968 destmem =
25969 adjust_address (copy_rtx (destmem), mode, GET_MODE_SIZE (mode));
25970 srcmem =
25971 adjust_address (copy_rtx (srcmem), mode, GET_MODE_SIZE (mode));
25972 }
25973 emit_move_insn (destmem, srcmem);
25974 }
25975 }
25976 else
25977 {
25978 rtx tmpreg[4];
25979 gcc_assert (unroll <= 4);
25980 for (i = 0; i < unroll; i++)
25981 {
25982 tmpreg[i] = gen_reg_rtx (mode);
25983 if (i)
25984 {
25985 srcmem =
25986 adjust_address (copy_rtx (srcmem), mode, GET_MODE_SIZE (mode));
25987 }
25988 emit_move_insn (tmpreg[i], srcmem);
25989 }
25990 for (i = 0; i < unroll; i++)
25991 {
25992 if (i)
25993 {
25994 destmem =
25995 adjust_address (copy_rtx (destmem), mode, GET_MODE_SIZE (mode));
25996 }
25997 emit_move_insn (destmem, tmpreg[i]);
25998 }
25999 }
26000 }
26001 else
26002 for (i = 0; i < unroll; i++)
26003 {
26004 if (i)
26005 destmem =
26006 adjust_address (copy_rtx (destmem), mode, GET_MODE_SIZE (mode));
26007 emit_move_insn (destmem, value);
26008 }
26009
26010 tmp = expand_simple_binop (iter_mode, PLUS, iter, piece_size, iter,
26011 true, OPTAB_LIB_WIDEN);
26012 if (tmp != iter)
26013 emit_move_insn (iter, tmp);
26014
26015 emit_cmp_and_jump_insns (iter, size, LT, NULL_RTX, iter_mode,
26016 true, top_label);
26017 if (expected_size != -1)
26018 {
26019 expected_size /= GET_MODE_SIZE (mode) * unroll;
26020 if (expected_size == 0)
26021 predict_jump (0);
26022 else if (expected_size > REG_BR_PROB_BASE)
26023 predict_jump (REG_BR_PROB_BASE - 1);
26024 else
26025 predict_jump (REG_BR_PROB_BASE - (REG_BR_PROB_BASE + expected_size / 2) / expected_size);
26026 }
26027 else
26028 predict_jump (REG_BR_PROB_BASE * 80 / 100);
26029 iter = ix86_zero_extend_to_Pmode (iter);
26030 tmp = expand_simple_binop (Pmode, PLUS, destptr, iter, destptr,
26031 true, OPTAB_LIB_WIDEN);
26032 if (tmp != destptr)
26033 emit_move_insn (destptr, tmp);
26034 if (!issetmem)
26035 {
26036 tmp = expand_simple_binop (Pmode, PLUS, srcptr, iter, srcptr,
26037 true, OPTAB_LIB_WIDEN);
26038 if (tmp != srcptr)
26039 emit_move_insn (srcptr, tmp);
26040 }
26041 emit_label (out_label);
26042 }
26043
26044 /* Output "rep; mov" or "rep; stos" instruction depending on ISSETMEM argument.
26045 When ISSETMEM is true, arguments SRCMEM and SRCPTR are ignored.
26046 When ISSETMEM is false, arguments VALUE and ORIG_VALUE are ignored.
26047 For the setmem case, VALUE is ORIG_VALUE promoted to a wider size.
26048 ORIG_VALUE is the original value passed to memset to fill the memory with.
26049 Other arguments have the same meaning as for the previous function. */
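/* Illustration (not emitted literally): for a 64-byte memcpy expanded
   with SImode chunks, COUNT is first scaled down by the chunk size and
   the result is roughly

	movl	$16, %ecx
	rep movsl

   with the source and destination pointers already in the registers
   required by the string instruction.  */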
26050
26051 static void
26052 expand_set_or_movmem_via_rep (rtx destmem, rtx srcmem,
26053 rtx destptr, rtx srcptr, rtx value, rtx orig_value,
26054 rtx count,
26055 machine_mode mode, bool issetmem)
26056 {
26057 rtx destexp;
26058 rtx srcexp;
26059 rtx countreg;
26060 HOST_WIDE_INT rounded_count;
26061
26062 /* If possible, it is shorter to use rep movs.
26063 TODO: Maybe it is better to move this logic to decide_alg. */
26064 if (mode == QImode && CONST_INT_P (count) && !(INTVAL (count) & 3)
26065 && (!issetmem || orig_value == const0_rtx))
26066 mode = SImode;
26067
26068 if (destptr != XEXP (destmem, 0) || GET_MODE (destmem) != BLKmode)
26069 destmem = adjust_automodify_address_nv (destmem, BLKmode, destptr, 0);
26070
26071 countreg = ix86_zero_extend_to_Pmode (scale_counter (count,
26072 GET_MODE_SIZE (mode)));
26073 if (mode != QImode)
26074 {
26075 destexp = gen_rtx_ASHIFT (Pmode, countreg,
26076 GEN_INT (exact_log2 (GET_MODE_SIZE (mode))));
26077 destexp = gen_rtx_PLUS (Pmode, destexp, destptr);
26078 }
26079 else
26080 destexp = gen_rtx_PLUS (Pmode, destptr, countreg);
26081 if ((!issetmem || orig_value == const0_rtx) && CONST_INT_P (count))
26082 {
26083 rounded_count
26084 = ROUND_DOWN (INTVAL (count), (HOST_WIDE_INT) GET_MODE_SIZE (mode));
26085 destmem = shallow_copy_rtx (destmem);
26086 set_mem_size (destmem, rounded_count);
26087 }
26088 else if (MEM_SIZE_KNOWN_P (destmem))
26089 clear_mem_size (destmem);
26090
26091 if (issetmem)
26092 {
26093 value = force_reg (mode, gen_lowpart (mode, value));
26094 emit_insn (gen_rep_stos (destptr, countreg, destmem, value, destexp));
26095 }
26096 else
26097 {
26098 if (srcptr != XEXP (srcmem, 0) || GET_MODE (srcmem) != BLKmode)
26099 srcmem = adjust_automodify_address_nv (srcmem, BLKmode, srcptr, 0);
26100 if (mode != QImode)
26101 {
26102 srcexp = gen_rtx_ASHIFT (Pmode, countreg,
26103 GEN_INT (exact_log2 (GET_MODE_SIZE (mode))));
26104 srcexp = gen_rtx_PLUS (Pmode, srcexp, srcptr);
26105 }
26106 else
26107 srcexp = gen_rtx_PLUS (Pmode, srcptr, countreg);
26108 if (CONST_INT_P (count))
26109 {
26110 rounded_count
26111 = ROUND_DOWN (INTVAL (count), (HOST_WIDE_INT) GET_MODE_SIZE (mode));
26112 srcmem = shallow_copy_rtx (srcmem);
26113 set_mem_size (srcmem, rounded_count);
26114 }
26115 else
26116 {
26117 if (MEM_SIZE_KNOWN_P (srcmem))
26118 clear_mem_size (srcmem);
26119 }
26120 emit_insn (gen_rep_mov (destptr, destmem, srcptr, srcmem, countreg,
26121 destexp, srcexp));
26122 }
26123 }
26124
26125 /* This function emits moves to copy SIZE_TO_MOVE bytes from SRCMEM to
26126 DESTMEM.
26127 SRCMEM is passed by pointer so that it can be updated on return.
26128 The return value is the updated DESTMEM. */
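/* For example (sizes assumed, not prescribed): a call with
   SIZE_TO_MOVE == 8 on a 64-bit target emits one DImode load into a
   fresh pseudo, one DImode store, and then advances both DESTPTR and
   SRCPTR by 8.  */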
26129 static rtx
26130 emit_memmov (rtx destmem, rtx *srcmem, rtx destptr, rtx srcptr,
26131 HOST_WIDE_INT size_to_move)
26132 {
26133 rtx dst = destmem, src = *srcmem, adjust, tempreg;
26134 enum insn_code code;
26135 machine_mode move_mode;
26136 int piece_size, i;
26137
26138 /* Find the widest mode in which we could perform moves.
26139 Start with the biggest power of 2 not larger than SIZE_TO_MOVE and halve
26140 it until a move of that size is supported. */
26141 piece_size = 1 << floor_log2 (size_to_move);
26142 move_mode = mode_for_size (piece_size * BITS_PER_UNIT, MODE_INT, 0);
26143 code = optab_handler (mov_optab, move_mode);
26144 while (code == CODE_FOR_nothing && piece_size > 1)
26145 {
26146 piece_size >>= 1;
26147 move_mode = mode_for_size (piece_size * BITS_PER_UNIT, MODE_INT, 0);
26148 code = optab_handler (mov_optab, move_mode);
26149 }
26150
26151 /* Find the corresponding vector mode with the same size as MOVE_MODE.
26152 MOVE_MODE is an integer mode at the moment (SI, DI, TI, etc.). */
26153 if (GET_MODE_SIZE (move_mode) > GET_MODE_SIZE (word_mode))
26154 {
26155 int nunits = GET_MODE_SIZE (move_mode) / GET_MODE_SIZE (word_mode);
26156 move_mode = mode_for_vector (word_mode, nunits);
26157 code = optab_handler (mov_optab, move_mode);
26158 if (code == CODE_FOR_nothing)
26159 {
26160 move_mode = word_mode;
26161 piece_size = GET_MODE_SIZE (move_mode);
26162 code = optab_handler (mov_optab, move_mode);
26163 }
26164 }
26165 gcc_assert (code != CODE_FOR_nothing);
26166
26167 dst = adjust_automodify_address_nv (dst, move_mode, destptr, 0);
26168 src = adjust_automodify_address_nv (src, move_mode, srcptr, 0);
26169
26170 /* Emit moves. We'll need SIZE_TO_MOVE/PIECE_SIZES moves. */
26171 gcc_assert (size_to_move % piece_size == 0);
26172 adjust = GEN_INT (piece_size);
26173 for (i = 0; i < size_to_move; i += piece_size)
26174 {
26175 /* We move from memory to memory, so we'll need to do it via
26176 a temporary register. */
26177 tempreg = gen_reg_rtx (move_mode);
26178 emit_insn (GEN_FCN (code) (tempreg, src));
26179 emit_insn (GEN_FCN (code) (dst, tempreg));
26180
26181 emit_move_insn (destptr,
26182 gen_rtx_PLUS (Pmode, copy_rtx (destptr), adjust));
26183 emit_move_insn (srcptr,
26184 gen_rtx_PLUS (Pmode, copy_rtx (srcptr), adjust));
26185
26186 dst = adjust_automodify_address_nv (dst, move_mode, destptr,
26187 piece_size);
26188 src = adjust_automodify_address_nv (src, move_mode, srcptr,
26189 piece_size);
26190 }
26191
26192 /* Update DST and SRC rtx. */
26193 *srcmem = src;
26194 return dst;
26195 }
26196
26197 /* Output code to copy at most count & (max_size - 1) bytes from SRC to DEST. */
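/* For instance, with MAX_SIZE == 16 and a constant COUNT whose remainder
   modulo MAX_SIZE is 11, the constant path below emits moves of 8, 2 and
   1 bytes - one move per set bit of COUNT % MAX_SIZE.  */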
26198 static void
26199 expand_movmem_epilogue (rtx destmem, rtx srcmem,
26200 rtx destptr, rtx srcptr, rtx count, int max_size)
26201 {
26202 rtx src, dest;
26203 if (CONST_INT_P (count))
26204 {
26205 HOST_WIDE_INT countval = INTVAL (count);
26206 HOST_WIDE_INT epilogue_size = countval % max_size;
26207 int i;
26208
26209 /* For now MAX_SIZE should be a power of 2. This assert could be
26210 relaxed, but it'll require a bit more complicated epilogue
26211 expanding. */
26212 gcc_assert ((max_size & (max_size - 1)) == 0);
26213 for (i = max_size; i >= 1; i >>= 1)
26214 {
26215 if (epilogue_size & i)
26216 destmem = emit_memmov (destmem, &srcmem, destptr, srcptr, i);
26217 }
26218 return;
26219 }
26220 if (max_size > 8)
26221 {
26222 count = expand_simple_binop (GET_MODE (count), AND, count, GEN_INT (max_size - 1),
26223 count, 1, OPTAB_DIRECT);
26224 expand_set_or_movmem_via_loop (destmem, srcmem, destptr, srcptr, NULL,
26225 count, QImode, 1, 4, false);
26226 return;
26227 }
26228
26229 /* When single stringop instructions are available, we can cheaply advance
26230 the dest and src pointers.  Otherwise we save code size by maintaining an
26231 offset (zero is readily available from the preceding rep operation) and
26232 using x86 addressing modes. */
26233 if (TARGET_SINGLE_STRINGOP)
26234 {
26235 if (max_size > 4)
26236 {
26237 rtx_code_label *label = ix86_expand_aligntest (count, 4, true);
26238 src = change_address (srcmem, SImode, srcptr);
26239 dest = change_address (destmem, SImode, destptr);
26240 emit_insn (gen_strmov (destptr, dest, srcptr, src));
26241 emit_label (label);
26242 LABEL_NUSES (label) = 1;
26243 }
26244 if (max_size > 2)
26245 {
26246 rtx_code_label *label = ix86_expand_aligntest (count, 2, true);
26247 src = change_address (srcmem, HImode, srcptr);
26248 dest = change_address (destmem, HImode, destptr);
26249 emit_insn (gen_strmov (destptr, dest, srcptr, src));
26250 emit_label (label);
26251 LABEL_NUSES (label) = 1;
26252 }
26253 if (max_size > 1)
26254 {
26255 rtx_code_label *label = ix86_expand_aligntest (count, 1, true);
26256 src = change_address (srcmem, QImode, srcptr);
26257 dest = change_address (destmem, QImode, destptr);
26258 emit_insn (gen_strmov (destptr, dest, srcptr, src));
26259 emit_label (label);
26260 LABEL_NUSES (label) = 1;
26261 }
26262 }
26263 else
26264 {
26265 rtx offset = force_reg (Pmode, const0_rtx);
26266 rtx tmp;
26267
26268 if (max_size > 4)
26269 {
26270 rtx_code_label *label = ix86_expand_aligntest (count, 4, true);
26271 src = change_address (srcmem, SImode, srcptr);
26272 dest = change_address (destmem, SImode, destptr);
26273 emit_move_insn (dest, src);
26274 tmp = expand_simple_binop (Pmode, PLUS, offset, GEN_INT (4), NULL,
26275 true, OPTAB_LIB_WIDEN);
26276 if (tmp != offset)
26277 emit_move_insn (offset, tmp);
26278 emit_label (label);
26279 LABEL_NUSES (label) = 1;
26280 }
26281 if (max_size > 2)
26282 {
26283 rtx_code_label *label = ix86_expand_aligntest (count, 2, true);
26284 tmp = gen_rtx_PLUS (Pmode, srcptr, offset);
26285 src = change_address (srcmem, HImode, tmp);
26286 tmp = gen_rtx_PLUS (Pmode, destptr, offset);
26287 dest = change_address (destmem, HImode, tmp);
26288 emit_move_insn (dest, src);
26289 tmp = expand_simple_binop (Pmode, PLUS, offset, GEN_INT (2), tmp,
26290 true, OPTAB_LIB_WIDEN);
26291 if (tmp != offset)
26292 emit_move_insn (offset, tmp);
26293 emit_label (label);
26294 LABEL_NUSES (label) = 1;
26295 }
26296 if (max_size > 1)
26297 {
26298 rtx_code_label *label = ix86_expand_aligntest (count, 1, true);
26299 tmp = gen_rtx_PLUS (Pmode, srcptr, offset);
26300 src = change_address (srcmem, QImode, tmp);
26301 tmp = gen_rtx_PLUS (Pmode, destptr, offset);
26302 dest = change_address (destmem, QImode, tmp);
26303 emit_move_insn (dest, src);
26304 emit_label (label);
26305 LABEL_NUSES (label) = 1;
26306 }
26307 }
26308 }
26309
26310 /* This function emits moves to fill SIZE_TO_MOVE bytes starting from DESTMEM
26311 with the value PROMOTED_VAL.
26312 The return value is the updated DESTMEM. */
26314 static rtx
26315 emit_memset (rtx destmem, rtx destptr, rtx promoted_val,
26316 HOST_WIDE_INT size_to_move)
26317 {
26318 rtx dst = destmem, adjust;
26319 enum insn_code code;
26320 machine_mode move_mode;
26321 int piece_size, i;
26322
26323 /* Choose the mode in which to perform the moves.  Start from the mode
26324 of PROMOTED_VAL and narrow it when SIZE_TO_MOVE is smaller than
26325 that mode. */
26326 move_mode = GET_MODE (promoted_val);
26327 if (move_mode == VOIDmode)
26328 move_mode = QImode;
26329 if (size_to_move < GET_MODE_SIZE (move_mode))
26330 {
26331 move_mode = mode_for_size (size_to_move * BITS_PER_UNIT, MODE_INT, 0);
26332 promoted_val = gen_lowpart (move_mode, promoted_val);
26333 }
26334 piece_size = GET_MODE_SIZE (move_mode);
26335 code = optab_handler (mov_optab, move_mode);
26336 gcc_assert (code != CODE_FOR_nothing && promoted_val != NULL_RTX);
26337
26338 dst = adjust_automodify_address_nv (dst, move_mode, destptr, 0);
26339
26340 /* Emit moves. We'll need SIZE_TO_MOVE/PIECE_SIZES moves. */
26341 gcc_assert (size_to_move % piece_size == 0);
26342 adjust = GEN_INT (piece_size);
26343 for (i = 0; i < size_to_move; i += piece_size)
26344 {
26345 if (piece_size <= GET_MODE_SIZE (word_mode))
26346 {
26347 emit_insn (gen_strset (destptr, dst, promoted_val));
26348 dst = adjust_automodify_address_nv (dst, move_mode, destptr,
26349 piece_size);
26350 continue;
26351 }
26352
26353 emit_insn (GEN_FCN (code) (dst, promoted_val));
26354
26355 emit_move_insn (destptr,
26356 gen_rtx_PLUS (Pmode, copy_rtx (destptr), adjust));
26357
26358 dst = adjust_automodify_address_nv (dst, move_mode, destptr,
26359 piece_size);
26360 }
26361
26362 /* Update DST rtx. */
26363 return dst;
26364 }
26365 /* Output code to set at most count & (max_size - 1) bytes starting at DEST. */
26366 static void
26367 expand_setmem_epilogue_via_loop (rtx destmem, rtx destptr, rtx value,
26368 rtx count, int max_size)
26369 {
26370 count =
26371 expand_simple_binop (counter_mode (count), AND, count,
26372 GEN_INT (max_size - 1), count, 1, OPTAB_DIRECT);
26373 expand_set_or_movmem_via_loop (destmem, NULL, destptr, NULL,
26374 gen_lowpart (QImode, value), count, QImode,
26375 1, max_size / 2, true);
26376 }
26377
26378 /* Output code to set at most count & (max_size - 1) bytes starting at DEST. */
26379 static void
26380 expand_setmem_epilogue (rtx destmem, rtx destptr, rtx value, rtx vec_value,
26381 rtx count, int max_size)
26382 {
26383 rtx dest;
26384
26385 if (CONST_INT_P (count))
26386 {
26387 HOST_WIDE_INT countval = INTVAL (count);
26388 HOST_WIDE_INT epilogue_size = countval % max_size;
26389 int i;
26390
26391 /* For now MAX_SIZE should be a power of 2. This assert could be
26392 relaxed, but it'll require a bit more complicated epilogue
26393 expanding. */
26394 gcc_assert ((max_size & (max_size - 1)) == 0);
26395 for (i = max_size; i >= 1; i >>= 1)
26396 {
26397 if (epilogue_size & i)
26398 {
26399 if (vec_value && i > GET_MODE_SIZE (GET_MODE (value)))
26400 destmem = emit_memset (destmem, destptr, vec_value, i);
26401 else
26402 destmem = emit_memset (destmem, destptr, value, i);
26403 }
26404 }
26405 return;
26406 }
26407 if (max_size > 32)
26408 {
26409 expand_setmem_epilogue_via_loop (destmem, destptr, value, count, max_size);
26410 return;
26411 }
26412 if (max_size > 16)
26413 {
26414 rtx_code_label *label = ix86_expand_aligntest (count, 16, true);
26415 if (TARGET_64BIT)
26416 {
26417 dest = change_address (destmem, DImode, destptr);
26418 emit_insn (gen_strset (destptr, dest, value));
26419 dest = adjust_automodify_address_nv (dest, DImode, destptr, 8);
26420 emit_insn (gen_strset (destptr, dest, value));
26421 }
26422 else
26423 {
26424 dest = change_address (destmem, SImode, destptr);
26425 emit_insn (gen_strset (destptr, dest, value));
26426 dest = adjust_automodify_address_nv (dest, SImode, destptr, 4);
26427 emit_insn (gen_strset (destptr, dest, value));
26428 dest = adjust_automodify_address_nv (dest, SImode, destptr, 8);
26429 emit_insn (gen_strset (destptr, dest, value));
26430 dest = adjust_automodify_address_nv (dest, SImode, destptr, 12);
26431 emit_insn (gen_strset (destptr, dest, value));
26432 }
26433 emit_label (label);
26434 LABEL_NUSES (label) = 1;
26435 }
26436 if (max_size > 8)
26437 {
26438 rtx_code_label *label = ix86_expand_aligntest (count, 8, true);
26439 if (TARGET_64BIT)
26440 {
26441 dest = change_address (destmem, DImode, destptr);
26442 emit_insn (gen_strset (destptr, dest, value));
26443 }
26444 else
26445 {
26446 dest = change_address (destmem, SImode, destptr);
26447 emit_insn (gen_strset (destptr, dest, value));
26448 dest = adjust_automodify_address_nv (dest, SImode, destptr, 4);
26449 emit_insn (gen_strset (destptr, dest, value));
26450 }
26451 emit_label (label);
26452 LABEL_NUSES (label) = 1;
26453 }
26454 if (max_size > 4)
26455 {
26456 rtx_code_label *label = ix86_expand_aligntest (count, 4, true);
26457 dest = change_address (destmem, SImode, destptr);
26458 emit_insn (gen_strset (destptr, dest, gen_lowpart (SImode, value)));
26459 emit_label (label);
26460 LABEL_NUSES (label) = 1;
26461 }
26462 if (max_size > 2)
26463 {
26464 rtx_code_label *label = ix86_expand_aligntest (count, 2, true);
26465 dest = change_address (destmem, HImode, destptr);
26466 emit_insn (gen_strset (destptr, dest, gen_lowpart (HImode, value)));
26467 emit_label (label);
26468 LABEL_NUSES (label) = 1;
26469 }
26470 if (max_size > 1)
26471 {
26472 rtx_code_label *label = ix86_expand_aligntest (count, 1, true);
26473 dest = change_address (destmem, QImode, destptr);
26474 emit_insn (gen_strset (destptr, dest, gen_lowpart (QImode, value)));
26475 emit_label (label);
26476 LABEL_NUSES (label) = 1;
26477 }
26478 }
26479
26480 /* Depending on ISSETMEM, copy enough from SRCMEM to DESTMEM or set enough to
26481 DESTMEM to align it to DESIRED_ALIGNMENT. Original alignment is ALIGN.
26482 Depending on ISSETMEM, either arguments SRCMEM/SRCPTR or VALUE/VEC_VALUE are
26483 ignored.
26484 Return value is updated DESTMEM. */
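/* For example, with ALIGN == 1 and DESIRED_ALIGNMENT == 8 this emits
   three conditional blocks that copy (or set) 1, 2 and 4 bytes
   respectively, each guarded by an alignment test of DESTPTR, and
   records the increased known alignment of DESTMEM after each block.  */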
26485 static rtx
26486 expand_set_or_movmem_prologue (rtx destmem, rtx srcmem,
26487 rtx destptr, rtx srcptr, rtx value,
26488 rtx vec_value, rtx count, int align,
26489 int desired_alignment, bool issetmem)
26490 {
26491 int i;
26492 for (i = 1; i < desired_alignment; i <<= 1)
26493 {
26494 if (align <= i)
26495 {
26496 rtx_code_label *label = ix86_expand_aligntest (destptr, i, false);
26497 if (issetmem)
26498 {
26499 if (vec_value && i > GET_MODE_SIZE (GET_MODE (value)))
26500 destmem = emit_memset (destmem, destptr, vec_value, i);
26501 else
26502 destmem = emit_memset (destmem, destptr, value, i);
26503 }
26504 else
26505 destmem = emit_memmov (destmem, &srcmem, destptr, srcptr, i);
26506 ix86_adjust_counter (count, i);
26507 emit_label (label);
26508 LABEL_NUSES (label) = 1;
26509 set_mem_align (destmem, i * 2 * BITS_PER_UNIT);
26510 }
26511 }
26512 return destmem;
26513 }
26514
26515 /* Test if COUNT&SIZE is nonzero and if so, expand a movmem
26516 or setmem sequence that is valid for SIZE..2*SIZE-1 bytes
26517 and jump to DONE_LABEL. */
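/* For example, with SIZE == 4 in the copy case: if bit 2 of COUNT is
   set, this copies 4 bytes from the start of the block and 4 bytes
   ending at SRCPTR + COUNT (the two ranges may overlap), then jumps to
   DONE_LABEL; otherwise the whole sequence is skipped via the label
   emitted at its end.  */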
26518 static void
26519 expand_small_movmem_or_setmem (rtx destmem, rtx srcmem,
26520 rtx destptr, rtx srcptr,
26521 rtx value, rtx vec_value,
26522 rtx count, int size,
26523 rtx done_label, bool issetmem)
26524 {
26525 rtx_code_label *label = ix86_expand_aligntest (count, size, false);
26526 machine_mode mode = mode_for_size (size * BITS_PER_UNIT, MODE_INT, 1);
26527 rtx modesize;
26528 int n;
26529
26530 /* If we do not have a vector value to copy, we must reduce the size. */
26531 if (issetmem)
26532 {
26533 if (!vec_value)
26534 {
26535 if (GET_MODE (value) == VOIDmode && size > 8)
26536 mode = Pmode;
26537 else if (GET_MODE_SIZE (mode) > GET_MODE_SIZE (GET_MODE (value)))
26538 mode = GET_MODE (value);
26539 }
26540 else
26541 mode = GET_MODE (vec_value), value = vec_value;
26542 }
26543 else
26544 {
26545 /* Choose appropriate vector mode. */
26546 if (size >= 32)
26547 mode = TARGET_AVX ? V32QImode : TARGET_SSE ? V16QImode : DImode;
26548 else if (size >= 16)
26549 mode = TARGET_SSE ? V16QImode : DImode;
26550 srcmem = change_address (srcmem, mode, srcptr);
26551 }
26552 destmem = change_address (destmem, mode, destptr);
26553 modesize = GEN_INT (GET_MODE_SIZE (mode));
26554 gcc_assert (GET_MODE_SIZE (mode) <= size);
26555 for (n = 0; n * GET_MODE_SIZE (mode) < size; n++)
26556 {
26557 if (issetmem)
26558 emit_move_insn (destmem, gen_lowpart (mode, value));
26559 else
26560 {
26561 emit_move_insn (destmem, srcmem);
26562 srcmem = offset_address (srcmem, modesize, GET_MODE_SIZE (mode));
26563 }
26564 destmem = offset_address (destmem, modesize, GET_MODE_SIZE (mode));
26565 }
26566
26567 destmem = offset_address (destmem, count, 1);
26568 destmem = offset_address (destmem, GEN_INT (-2 * size),
26569 GET_MODE_SIZE (mode));
26570 if (!issetmem)
26571 {
26572 srcmem = offset_address (srcmem, count, 1);
26573 srcmem = offset_address (srcmem, GEN_INT (-2 * size),
26574 GET_MODE_SIZE (mode));
26575 }
26576 for (n = 0; n * GET_MODE_SIZE (mode) < size; n++)
26577 {
26578 if (issetmem)
26579 emit_move_insn (destmem, gen_lowpart (mode, value));
26580 else
26581 {
26582 emit_move_insn (destmem, srcmem);
26583 srcmem = offset_address (srcmem, modesize, GET_MODE_SIZE (mode));
26584 }
26585 destmem = offset_address (destmem, modesize, GET_MODE_SIZE (mode));
26586 }
26587 emit_jump_insn (gen_jump (done_label));
26588 emit_barrier ();
26589
26590 emit_label (label);
26591 LABEL_NUSES (label) = 1;
26592 }
26593
26594 /* Handle a small memcpy (up to SIZE, which is supposed to be a small power of 2)
26595 and get ready for the main memcpy loop by copying the initial DESIRED_ALIGN-ALIGN
26596 bytes and the last SIZE bytes, adjusting DESTPTR/SRCPTR/COUNT so that we can
26597 proceed with a loop copying SIZE bytes at once. Do moves in MODE.
26598 DONE_LABEL is a label after the whole copying sequence. The label is created
26599 on demand if *DONE_LABEL is NULL.
26600 MIN_SIZE is the minimal size of the copied block. This value gets adjusted
26601 for the new bounds after the initial copies.
26602
26603 DESTMEM/SRCMEM are memory expressions pointing to the copied block,
26604 DESTPTR/SRCPTR are pointers to the block. DYNAMIC_CHECK indicates whether
26605 we will dispatch to a library call for large blocks.
26606
26607 In pseudocode we do:
26608
26609 if (COUNT < SIZE)
26610 {
26611 Assume that SIZE is 4. Bigger sizes are handled analogously
26612 if (COUNT & 4)
26613 {
26614 copy 4 bytes from SRCPTR to DESTPTR
26615 copy 4 bytes from SRCPTR + COUNT - 4 to DESTPTR + COUNT - 4
26616 goto done_label
26617 }
26618 if (!COUNT)
26619 goto done_label;
26620 copy 1 byte from SRCPTR to DESTPTR
26621 if (COUNT & 2)
26622 {
26623 copy 2 bytes from SRCPTR to DESTPTR
26624 copy 2 bytes from SRCPTR + COUNT - 2 to DESTPTR + COUNT - 2
26625 }
26626 }
26627 else
26628 {
26629 copy at least DESIRED_ALIGN-ALIGN bytes from SRCPTR to DESTPTR
26630 copy SIZE bytes from SRCPTR + COUNT - SIZE to DESTPTR + COUNT -SIZE
26631
26632 OLD_DESTPTR = DESTPTR;
26633 Align DESTPTR up to DESIRED_ALIGN
26634 SRCPTR += DESTPTR - OLD_DESTPTR
26635 COUNT -= DESTPTR - OLD_DESTPTR
26636 if (DYNAMIC_CHECK)
26637 Round COUNT down to multiple of SIZE
26638 << optional caller supplied zero size guard is here >>
26639 << optional caller supplied dynamic check is here >>
26640 << caller supplied main copy loop is here >>
26641 }
26642 done_label:
26643 */
26644 static void
26645 expand_set_or_movmem_prologue_epilogue_by_misaligned_moves (rtx destmem, rtx srcmem,
26646 rtx *destptr, rtx *srcptr,
26647 machine_mode mode,
26648 rtx value, rtx vec_value,
26649 rtx *count,
26650 rtx_code_label **done_label,
26651 int size,
26652 int desired_align,
26653 int align,
26654 unsigned HOST_WIDE_INT *min_size,
26655 bool dynamic_check,
26656 bool issetmem)
26657 {
26658 rtx_code_label *loop_label = NULL, *label;
26659 int n;
26660 rtx modesize;
26661 int prolog_size = 0;
26662 rtx mode_value;
26663
26664 /* Choose the proper value to copy. */
26665 if (issetmem && VECTOR_MODE_P (mode))
26666 mode_value = vec_value;
26667 else
26668 mode_value = value;
26669 gcc_assert (GET_MODE_SIZE (mode) <= size);
26670
26671 /* See if block is big or small, handle small blocks. */
26672 if (!CONST_INT_P (*count) && *min_size < (unsigned HOST_WIDE_INT)size)
26673 {
26674 int size2 = size;
26675 loop_label = gen_label_rtx ();
26676
26677 if (!*done_label)
26678 *done_label = gen_label_rtx ();
26679
26680 emit_cmp_and_jump_insns (*count, GEN_INT (size2), GE, 0, GET_MODE (*count),
26681 1, loop_label);
26682 size2 >>= 1;
26683
26684 /* Handle sizes > 3. */
26685 for (;size2 > 2; size2 >>= 1)
26686 expand_small_movmem_or_setmem (destmem, srcmem,
26687 *destptr, *srcptr,
26688 value, vec_value,
26689 *count,
26690 size2, *done_label, issetmem);
26691 /* Nothing to copy? Jump to DONE_LABEL if so */
26692 emit_cmp_and_jump_insns (*count, const0_rtx, EQ, 0, GET_MODE (*count),
26693 1, *done_label);
26694
26695 /* Do a byte copy. */
26696 destmem = change_address (destmem, QImode, *destptr);
26697 if (issetmem)
26698 emit_move_insn (destmem, gen_lowpart (QImode, value));
26699 else
26700 {
26701 srcmem = change_address (srcmem, QImode, *srcptr);
26702 emit_move_insn (destmem, srcmem);
26703 }
26704
26705 /* Handle sizes 2 and 3. */
26706 label = ix86_expand_aligntest (*count, 2, false);
26707 destmem = change_address (destmem, HImode, *destptr);
26708 destmem = offset_address (destmem, *count, 1);
26709 destmem = offset_address (destmem, GEN_INT (-2), 2);
26710 if (issetmem)
26711 emit_move_insn (destmem, gen_lowpart (HImode, value));
26712 else
26713 {
26714 srcmem = change_address (srcmem, HImode, *srcptr);
26715 srcmem = offset_address (srcmem, *count, 1);
26716 srcmem = offset_address (srcmem, GEN_INT (-2), 2);
26717 emit_move_insn (destmem, srcmem);
26718 }
26719
26720 emit_label (label);
26721 LABEL_NUSES (label) = 1;
26722 emit_jump_insn (gen_jump (*done_label));
26723 emit_barrier ();
26724 }
26725 else
26726 gcc_assert (*min_size >= (unsigned HOST_WIDE_INT)size
26727 || UINTVAL (*count) >= (unsigned HOST_WIDE_INT)size);
26728
26729 /* Start memcpy for COUNT >= SIZE. */
26730 if (loop_label)
26731 {
26732 emit_label (loop_label);
26733 LABEL_NUSES (loop_label) = 1;
26734 }
26735
26736 /* Copy first desired_align bytes. */
26737 if (!issetmem)
26738 srcmem = change_address (srcmem, mode, *srcptr);
26739 destmem = change_address (destmem, mode, *destptr);
26740 modesize = GEN_INT (GET_MODE_SIZE (mode));
26741 for (n = 0; prolog_size < desired_align - align; n++)
26742 {
26743 if (issetmem)
26744 emit_move_insn (destmem, mode_value);
26745 else
26746 {
26747 emit_move_insn (destmem, srcmem);
26748 srcmem = offset_address (srcmem, modesize, GET_MODE_SIZE (mode));
26749 }
26750 destmem = offset_address (destmem, modesize, GET_MODE_SIZE (mode));
26751 prolog_size += GET_MODE_SIZE (mode);
26752 }
26753
26754
26755 /* Copy last SIZE bytes. */
26756 destmem = offset_address (destmem, *count, 1);
26757 destmem = offset_address (destmem,
26758 GEN_INT (-size - prolog_size),
26759 1);
26760 if (issetmem)
26761 emit_move_insn (destmem, mode_value);
26762 else
26763 {
26764 srcmem = offset_address (srcmem, *count, 1);
26765 srcmem = offset_address (srcmem,
26766 GEN_INT (-size - prolog_size),
26767 1);
26768 emit_move_insn (destmem, srcmem);
26769 }
26770 for (n = 1; n * GET_MODE_SIZE (mode) < size; n++)
26771 {
26772 destmem = offset_address (destmem, modesize, 1);
26773 if (issetmem)
26774 emit_move_insn (destmem, mode_value);
26775 else
26776 {
26777 srcmem = offset_address (srcmem, modesize, 1);
26778 emit_move_insn (destmem, srcmem);
26779 }
26780 }
26781
26782 /* Align destination. */
26783 if (desired_align > 1 && desired_align > align)
26784 {
26785 rtx saveddest = *destptr;
26786
26787 gcc_assert (desired_align <= size);
26788 /* Align destptr up and place it in a new register. */
26789 *destptr = expand_simple_binop (GET_MODE (*destptr), PLUS, *destptr,
26790 GEN_INT (prolog_size),
26791 NULL_RTX, 1, OPTAB_DIRECT);
26792 if (REG_P (*destptr) && REG_P (saveddest) && REG_POINTER (saveddest))
26793 REG_POINTER (*destptr) = 1;
26794 *destptr = expand_simple_binop (GET_MODE (*destptr), AND, *destptr,
26795 GEN_INT (-desired_align),
26796 *destptr, 1, OPTAB_DIRECT);
26797 /* See how many bytes we skipped. */
26798 saveddest = expand_simple_binop (GET_MODE (*destptr), MINUS, saveddest,
26799 *destptr,
26800 saveddest, 1, OPTAB_DIRECT);
26801 /* Adjust srcptr and count. */
26802 if (!issetmem)
26803 *srcptr = expand_simple_binop (GET_MODE (*srcptr), MINUS, *srcptr,
26804 saveddest, *srcptr, 1, OPTAB_DIRECT);
26805 *count = expand_simple_binop (GET_MODE (*count), PLUS, *count,
26806 saveddest, *count, 1, OPTAB_DIRECT);
26807 /* We copied at most size + prolog_size. */
26808 if (*min_size > (unsigned HOST_WIDE_INT)(size + prolog_size))
26809 *min_size
26810 = ROUND_DOWN (*min_size - size, (unsigned HOST_WIDE_INT)size);
26811 else
26812 *min_size = 0;
26813
26814 /* Our loops always round down the block size, but for dispatch to
26815 library we need precise value. */
26816 if (dynamic_check)
26817 *count = expand_simple_binop (GET_MODE (*count), AND, *count,
26818 GEN_INT (-size), *count, 1, OPTAB_DIRECT);
26819 }
26820 else
26821 {
26822 gcc_assert (prolog_size == 0);
26823 /* Decrease count, so we won't end up copying last word twice. */
26824 if (!CONST_INT_P (*count))
26825 *count = expand_simple_binop (GET_MODE (*count), PLUS, *count,
26826 constm1_rtx, *count, 1, OPTAB_DIRECT);
26827 else
26828 *count = GEN_INT (ROUND_DOWN (UINTVAL (*count) - 1,
26829 (unsigned HOST_WIDE_INT)size));
26830 if (*min_size)
26831 *min_size = ROUND_DOWN (*min_size - 1, (unsigned HOST_WIDE_INT)size);
26832 }
26833 }
26834
26835
26836 /* This function is like the previous one, except here we know how many bytes
26837 need to be copied. That allows us to update alignment not only of DST, which
26838 is returned, but also of SRC, which is passed as a pointer for that
26839 reason. */
26840 static rtx
26841 expand_set_or_movmem_constant_prologue (rtx dst, rtx *srcp, rtx destreg,
26842 rtx srcreg, rtx value, rtx vec_value,
26843 int desired_align, int align_bytes,
26844 bool issetmem)
26845 {
26846 rtx src = NULL;
26847 rtx orig_dst = dst;
26848 rtx orig_src = NULL;
26849 int piece_size = 1;
26850 int copied_bytes = 0;
26851
26852 if (!issetmem)
26853 {
26854 gcc_assert (srcp != NULL);
26855 src = *srcp;
26856 orig_src = src;
26857 }
26858
26859 for (piece_size = 1;
26860 piece_size <= desired_align && copied_bytes < align_bytes;
26861 piece_size <<= 1)
26862 {
26863 if (align_bytes & piece_size)
26864 {
26865 if (issetmem)
26866 {
26867 if (vec_value && piece_size > GET_MODE_SIZE (GET_MODE (value)))
26868 dst = emit_memset (dst, destreg, vec_value, piece_size);
26869 else
26870 dst = emit_memset (dst, destreg, value, piece_size);
26871 }
26872 else
26873 dst = emit_memmov (dst, &src, destreg, srcreg, piece_size);
26874 copied_bytes += piece_size;
26875 }
26876 }
26877 if (MEM_ALIGN (dst) < (unsigned int) desired_align * BITS_PER_UNIT)
26878 set_mem_align (dst, desired_align * BITS_PER_UNIT);
26879 if (MEM_SIZE_KNOWN_P (orig_dst))
26880 set_mem_size (dst, MEM_SIZE (orig_dst) - align_bytes);
26881
26882 if (!issetmem)
26883 {
26884 int src_align_bytes = get_mem_align_offset (src, desired_align
26885 * BITS_PER_UNIT);
26886 if (src_align_bytes >= 0)
26887 src_align_bytes = desired_align - src_align_bytes;
26888 if (src_align_bytes >= 0)
26889 {
26890 unsigned int src_align;
26891 for (src_align = desired_align; src_align >= 2; src_align >>= 1)
26892 {
26893 if ((src_align_bytes & (src_align - 1))
26894 == (align_bytes & (src_align - 1)))
26895 break;
26896 }
26897 if (src_align > (unsigned int) desired_align)
26898 src_align = desired_align;
26899 if (MEM_ALIGN (src) < src_align * BITS_PER_UNIT)
26900 set_mem_align (src, src_align * BITS_PER_UNIT);
26901 }
26902 if (MEM_SIZE_KNOWN_P (orig_src))
26903 set_mem_size (src, MEM_SIZE (orig_src) - align_bytes);
26904 *srcp = src;
26905 }
26906
26907 return dst;
26908 }
26909
26910 /* Return true if ALG can be used in current context.
26911 Assume we expand memset if MEMSET is true. */
26912 static bool
26913 alg_usable_p (enum stringop_alg alg, bool memset, bool have_as)
26914 {
26915 if (alg == no_stringop)
26916 return false;
26917 if (alg == vector_loop)
26918 return TARGET_SSE || TARGET_AVX;
26919 /* Algorithms using the rep prefix want at least edi and ecx;
26920 additionally, memset wants eax and memcpy wants esi. Don't
26921 consider such algorithms if the user has appropriated those
26922 registers for their own purposes, or if we have a non-default
26923 address space, since some string insns cannot override the segment. */
26924 if (alg == rep_prefix_1_byte
26925 || alg == rep_prefix_4_byte
26926 || alg == rep_prefix_8_byte)
26927 {
26928 if (have_as)
26929 return false;
26930 if (fixed_regs[CX_REG]
26931 || fixed_regs[DI_REG]
26932 || (memset ? fixed_regs[AX_REG] : fixed_regs[SI_REG]))
26933 return false;
26934 }
26935 return true;
26936 }
26937
26938 /* Given COUNT and EXPECTED_SIZE, decide on codegen of string operation. */
26939 static enum stringop_alg
26940 decide_alg (HOST_WIDE_INT count, HOST_WIDE_INT expected_size,
26941 unsigned HOST_WIDE_INT min_size, unsigned HOST_WIDE_INT max_size,
26942 bool memset, bool zero_memset, bool have_as,
26943 int *dynamic_check, bool *noalign, bool recur)
26944 {
26945 const struct stringop_algs *algs;
26946 bool optimize_for_speed;
26947 int max = 0;
26948 const struct processor_costs *cost;
26949 int i;
26950 bool any_alg_usable_p = false;
26951
26952 *noalign = false;
26953 *dynamic_check = -1;
26954
26955 /* Even if the string operation call is cold, we still might spend a lot
26956 of time processing large blocks. */
26957 if (optimize_function_for_size_p (cfun)
26958 || (optimize_insn_for_size_p ()
26959 && (max_size < 256
26960 || (expected_size != -1 && expected_size < 256))))
26961 optimize_for_speed = false;
26962 else
26963 optimize_for_speed = true;
26964
26965 cost = optimize_for_speed ? ix86_cost : &ix86_size_cost;
26966 if (memset)
26967 algs = &cost->memset[TARGET_64BIT != 0];
26968 else
26969 algs = &cost->memcpy[TARGET_64BIT != 0];
26970
26971 /* See maximal size for user defined algorithm. */
26972 for (i = 0; i < MAX_STRINGOP_ALGS; i++)
26973 {
26974 enum stringop_alg candidate = algs->size[i].alg;
26975 bool usable = alg_usable_p (candidate, memset, have_as);
26976 any_alg_usable_p |= usable;
26977
26978 if (candidate != libcall && candidate && usable)
26979 max = algs->size[i].max;
26980 }
26981
26982 /* If the expected size is not known but the max size is small enough
26983 that the inline version is a win, set the expected size into
26984 the range. */
26985 if (((max > 1 && (unsigned HOST_WIDE_INT) max >= max_size) || max == -1)
26986 && expected_size == -1)
26987 expected_size = min_size / 2 + max_size / 2;
26988
26989 /* If user specified the algorithm, honor it if possible. */
26990 if (ix86_stringop_alg != no_stringop
26991 && alg_usable_p (ix86_stringop_alg, memset, have_as))
26992 return ix86_stringop_alg;
26993 /* rep; movq or rep; movl is the smallest variant. */
26994 else if (!optimize_for_speed)
26995 {
26996 *noalign = true;
26997 if (!count || (count & 3) || (memset && !zero_memset))
26998 return alg_usable_p (rep_prefix_1_byte, memset, have_as)
26999 ? rep_prefix_1_byte : loop_1_byte;
27000 else
27001 return alg_usable_p (rep_prefix_4_byte, memset, have_as)
27002 ? rep_prefix_4_byte : loop;
27003 }
27004 /* Very tiny blocks are best handled via the loop; REP is expensive to
27005 set up. */
27006 else if (expected_size != -1 && expected_size < 4)
27007 return loop_1_byte;
27008 else if (expected_size != -1)
27009 {
27010 enum stringop_alg alg = libcall;
27011 bool alg_noalign = false;
27012 for (i = 0; i < MAX_STRINGOP_ALGS; i++)
27013 {
27014 /* We get here if the algorithms that were not libcall-based
27015 were rep-prefix based and we are unable to use rep prefixes
27016 based on global register usage. Break out of the loop and
27017 use the heuristic below. */
27018 if (algs->size[i].max == 0)
27019 break;
27020 if (algs->size[i].max >= expected_size || algs->size[i].max == -1)
27021 {
27022 enum stringop_alg candidate = algs->size[i].alg;
27023
27024 if (candidate != libcall
27025 && alg_usable_p (candidate, memset, have_as))
27026 {
27027 alg = candidate;
27028 alg_noalign = algs->size[i].noalign;
27029 }
27030 /* Honor TARGET_INLINE_ALL_STRINGOPS by picking
27031 last non-libcall inline algorithm. */
27032 if (TARGET_INLINE_ALL_STRINGOPS)
27033 {
27034 /* When the current size is best copied by a libcall,
27035 but we are still forced to inline, run the heuristic below
27036 that will pick code for medium-sized blocks. */
27037 if (alg != libcall)
27038 {
27039 *noalign = alg_noalign;
27040 return alg;
27041 }
27042 else if (!any_alg_usable_p)
27043 break;
27044 }
27045 else if (alg_usable_p (candidate, memset, have_as))
27046 {
27047 *noalign = algs->size[i].noalign;
27048 return candidate;
27049 }
27050 }
27051 }
27052 }
27053 /* When asked to inline the call anyway, try to pick a meaningful choice.
27054 We look for the maximal size of a block that is faster to copy by hand
27055 and take blocks of at most that size, guessing that the average size
27056 will be roughly half of the block.
27057
27058 If this turns out to be bad, we might simply specify the preferred
27059 choice in ix86_costs. */
27060 if ((TARGET_INLINE_ALL_STRINGOPS || TARGET_INLINE_STRINGOPS_DYNAMICALLY)
27061 && (algs->unknown_size == libcall
27062 || !alg_usable_p (algs->unknown_size, memset, have_as)))
27063 {
27064 enum stringop_alg alg;
27065 HOST_WIDE_INT new_expected_size = (max > 0 ? max : 4096) / 2;
27066
27067 /* If there aren't any usable algorithms or if we are recursing already,
27068 then recursing on smaller sizes or the same size isn't going to
27069 find anything. Just return the simple byte-at-a-time copy loop. */
27070 if (!any_alg_usable_p || recur)
27071 {
27072 /* Pick something reasonable. */
27073 if (TARGET_INLINE_STRINGOPS_DYNAMICALLY && !recur)
27074 *dynamic_check = 128;
27075 return loop_1_byte;
27076 }
27077 alg = decide_alg (count, new_expected_size, min_size, max_size, memset,
27078 zero_memset, have_as, dynamic_check, noalign, true);
27079 gcc_assert (*dynamic_check == -1);
27080 if (TARGET_INLINE_STRINGOPS_DYNAMICALLY)
27081 *dynamic_check = max;
27082 else
27083 gcc_assert (alg != libcall);
27084 return alg;
27085 }
27086 return (alg_usable_p (algs->unknown_size, memset, have_as)
27087 ? algs->unknown_size : libcall);
27088 }
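/* Illustrative example: for a memcpy with COUNT known to be 32 and a tuning
   whose cost table lists, say, {{32, loop, false}, {8192, rep_prefix_4_byte,
   false}, {-1, libcall, false}} (hypothetical values), the first entry with
   max >= 32 is where the decision happens, so the expander would pick the
   simple loop.  The exact outcome of course depends on the active cost table
   and on the fixed-register/address-space checks in alg_usable_p above.  */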
27089
27090 /* Decide on alignment. We know that the operand is already aligned to ALIGN
27091 (ALIGN can be based on profile feedback and thus it is not 100% guaranteed). */
27092 static int
27093 decide_alignment (int align,
27094 enum stringop_alg alg,
27095 int expected_size,
27096 machine_mode move_mode)
27097 {
27098 int desired_align = 0;
27099
27100 gcc_assert (alg != no_stringop);
27101
27102 if (alg == libcall)
27103 return 0;
27104 if (move_mode == VOIDmode)
27105 return 0;
27106
27107 desired_align = GET_MODE_SIZE (move_mode);
27108 /* PentiumPro has special logic triggering for 8-byte aligned blocks,
27109 copying a whole cache line at once. */
27110 if (TARGET_PENTIUMPRO
27111 && (alg == rep_prefix_4_byte || alg == rep_prefix_1_byte))
27112 desired_align = 8;
27113
27114 if (optimize_size)
27115 desired_align = 1;
27116 if (desired_align < align)
27117 desired_align = align;
27118 if (expected_size != -1 && expected_size < 4)
27119 desired_align = align;
27120
27121 return desired_align;
27122 }
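/* For example, with alg == vector_loop and a 16-byte move mode the desired
   alignment starts out as 16; with -Os it is dropped to 1 (and then raised
   back to ALIGN if that is larger), and a tiny expected size (< 4) keeps
   whatever alignment the operand already has.  */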
27123
27124
27125 /* Helper function for memset. For a QImode value 0xXY produce
27126 0xXYXYXYXY of the width specified by MODE. This is essentially
27127 a multiplication by 0x01010101, but we can do slightly better than
27128 synth_mult by unwinding the sequence by hand on CPUs with
27129 slow multiply. */
27130 static rtx
27131 promote_duplicated_reg (machine_mode mode, rtx val)
27132 {
27133 machine_mode valmode = GET_MODE (val);
27134 rtx tmp;
27135 int nops = mode == DImode ? 3 : 2;
27136
27137 gcc_assert (mode == SImode || mode == DImode || val == const0_rtx);
27138 if (val == const0_rtx)
27139 return copy_to_mode_reg (mode, CONST0_RTX (mode));
27140 if (CONST_INT_P (val))
27141 {
27142 HOST_WIDE_INT v = INTVAL (val) & 255;
27143
27144 v |= v << 8;
27145 v |= v << 16;
27146 if (mode == DImode)
27147 v |= (v << 16) << 16;
27148 return copy_to_mode_reg (mode, gen_int_mode (v, mode));
27149 }
27150
27151 if (valmode == VOIDmode)
27152 valmode = QImode;
27153 if (valmode != QImode)
27154 val = gen_lowpart (QImode, val);
27155 if (mode == QImode)
27156 return val;
27157 if (!TARGET_PARTIAL_REG_STALL)
27158 nops--;
27159 if (ix86_cost->mult_init[mode == DImode ? 3 : 2]
27160 + ix86_cost->mult_bit * (mode == DImode ? 8 : 4)
27161 <= (ix86_cost->shift_const + ix86_cost->add) * nops
27162 + (COSTS_N_INSNS (TARGET_PARTIAL_REG_STALL == 0)))
27163 {
27164 rtx reg = convert_modes (mode, QImode, val, true);
27165 tmp = promote_duplicated_reg (mode, const1_rtx);
27166 return expand_simple_binop (mode, MULT, reg, tmp, NULL, 1,
27167 OPTAB_DIRECT);
27168 }
27169 else
27170 {
27171 rtx reg = convert_modes (mode, QImode, val, true);
27172
27173 if (!TARGET_PARTIAL_REG_STALL)
27174 if (mode == SImode)
27175 emit_insn (gen_insvsi_1 (reg, reg));
27176 else
27177 emit_insn (gen_insvdi_1 (reg, reg));
27178 else
27179 {
27180 tmp = expand_simple_binop (mode, ASHIFT, reg, GEN_INT (8),
27181 NULL, 1, OPTAB_DIRECT);
27182 reg =
27183 expand_simple_binop (mode, IOR, reg, tmp, reg, 1, OPTAB_DIRECT);
27184 }
27185 tmp = expand_simple_binop (mode, ASHIFT, reg, GEN_INT (16),
27186 NULL, 1, OPTAB_DIRECT);
27187 reg = expand_simple_binop (mode, IOR, reg, tmp, reg, 1, OPTAB_DIRECT);
27188 if (mode == SImode)
27189 return reg;
27190 tmp = expand_simple_binop (mode, ASHIFT, reg, GEN_INT (32),
27191 NULL, 1, OPTAB_DIRECT);
27192 reg = expand_simple_binop (mode, IOR, reg, tmp, reg, 1, OPTAB_DIRECT);
27193 return reg;
27194 }
27195 }
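/* A worked example of the duplication done above for a constant value,
   val = 0xAB, mode = SImode:
     v = 0xAB;  v |= v << 8;   ->  0xABAB
                v |= v << 16;  ->  0xABABABAB
   and for DImode the extra (v << 16) << 16 step yields 0xABABABABABABABAB.
   The non-constant path builds the same value either with shift/IOR steps
   or via a multiply by the 0x01...01 constant, whichever the cost tables
   say is cheaper.  */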
27196
27197 /* Duplicate value VAL using promote_duplicated_reg into the maximal size that
27198 will be needed by the main loop copying SIZE_NEEDED chunks and by the
27199 prologue getting alignment from ALIGN to DESIRED_ALIGN. */
27200 static rtx
27201 promote_duplicated_reg_to_size (rtx val, int size_needed, int desired_align,
27202 int align)
27203 {
27204 rtx promoted_val;
27205
27206 if (TARGET_64BIT
27207 && (size_needed > 4 || (desired_align > align && desired_align > 4)))
27208 promoted_val = promote_duplicated_reg (DImode, val);
27209 else if (size_needed > 2 || (desired_align > align && desired_align > 2))
27210 promoted_val = promote_duplicated_reg (SImode, val);
27211 else if (size_needed > 1 || (desired_align > align && desired_align > 1))
27212 promoted_val = promote_duplicated_reg (HImode, val);
27213 else
27214 promoted_val = val;
27215
27216 return promoted_val;
27217 }
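/* For instance, a 64-bit memset expanded with size_needed == 8 promotes VAL
   to DImode here, while a 2-byte-at-a-time variant only needs the HImode
   promotion.  */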
27218
27219 /* Expand string move (memcpy) or store (memset) operation. Use i386 string
27220 operations when profitable. The code depends upon architecture, block size
27221 and alignment, but always has one of the following overall structures:
27222
27223 Aligned move sequence:
27224
27225 1) Prologue guard: Conditional that jumps up to epilogues for small
27226 blocks that can be handled by the epilogue alone. This is faster
27227 but also needed for correctness, since the prologue assumes the block
27228 is larger than the desired alignment.
27229
27230 Optional dynamic check for size and libcall for large
27231 blocks is emitted here too, with -minline-stringops-dynamically.
27232
27233 2) Prologue: copy first few bytes in order to get destination
27234 aligned to DESIRED_ALIGN. It is emitted only when ALIGN is less
27235 than DESIRED_ALIGN and up to DESIRED_ALIGN - ALIGN bytes can be
27236 copied. We emit either a jump tree on power of two sized
27237 blocks, or a byte loop.
27238
27239 3) Main body: the copying loop itself, copying in SIZE_NEEDED chunks
27240 with specified algorithm.
27241
27242 4) Epilogue: code copying tail of the block that is too small to be
27243 handled by main body (or up to size guarded by prologue guard).
27244
27245 Misaligned move sequence
27246
27247 1) Misaligned move prologue/epilogue containing:
27248 a) Prologue handling small memory blocks and jumping to done_label
27249 (skipped if blocks are known to be large enough)
27250 b) Single move copying the first DESIRED_ALIGN-ALIGN bytes, if alignment
27251 is needed, by a single possibly misaligned move
27252 (skipped if alignment is not needed)
27253 c) Copy of last SIZE_NEEDED bytes by possibly misaligned moves
27254
27255 2) Zero size guard dispatching to done_label, if needed
27256
27257 3) Dispatch to a library call, if needed.
27258
27259 4) Main body: the copying loop itself, copying in SIZE_NEEDED chunks
27260 with specified algorithm. */
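/* As a rough illustration of the structures described above (actual output
   varies with tuning and options), an aligned rep-prefix memcpy of a
   runtime-sized block may end up shaped like:

       cmp   $N, %rcx          ; 1) prologue guard, small blocks -> epilogue
       jb    .Lepilogue
       ...byte moves...        ; 2) prologue aligning the destination
       rep movsq               ; 3) main body
     .Lepilogue:
       ...tail moves...        ; 4) epilogue for the remaining bytes  */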
27261 bool
27262 ix86_expand_set_or_movmem (rtx dst, rtx src, rtx count_exp, rtx val_exp,
27263 rtx align_exp, rtx expected_align_exp,
27264 rtx expected_size_exp, rtx min_size_exp,
27265 rtx max_size_exp, rtx probable_max_size_exp,
27266 bool issetmem)
27267 {
27268 rtx destreg;
27269 rtx srcreg = NULL;
27270 rtx_code_label *label = NULL;
27271 rtx tmp;
27272 rtx_code_label *jump_around_label = NULL;
27273 HOST_WIDE_INT align = 1;
27274 unsigned HOST_WIDE_INT count = 0;
27275 HOST_WIDE_INT expected_size = -1;
27276 int size_needed = 0, epilogue_size_needed;
27277 int desired_align = 0, align_bytes = 0;
27278 enum stringop_alg alg;
27279 rtx promoted_val = NULL;
27280 rtx vec_promoted_val = NULL;
27281 bool force_loopy_epilogue = false;
27282 int dynamic_check;
27283 bool need_zero_guard = false;
27284 bool noalign;
27285 machine_mode move_mode = VOIDmode;
27286 int unroll_factor = 1;
27287 /* TODO: Once value ranges are available, fill in proper data. */
27288 unsigned HOST_WIDE_INT min_size = 0;
27289 unsigned HOST_WIDE_INT max_size = -1;
27290 unsigned HOST_WIDE_INT probable_max_size = -1;
27291 bool misaligned_prologue_used = false;
27292 bool have_as;
27293
27294 if (CONST_INT_P (align_exp))
27295 align = INTVAL (align_exp);
27296 /* i386 can do misaligned access at a reasonably increased cost. */
27297 if (CONST_INT_P (expected_align_exp)
27298 && INTVAL (expected_align_exp) > align)
27299 align = INTVAL (expected_align_exp);
27300 /* ALIGN is the minimum of destination and source alignment, but we care here
27301 just about destination alignment. */
27302 else if (!issetmem
27303 && MEM_ALIGN (dst) > (unsigned HOST_WIDE_INT) align * BITS_PER_UNIT)
27304 align = MEM_ALIGN (dst) / BITS_PER_UNIT;
27305
27306 if (CONST_INT_P (count_exp))
27307 {
27308 min_size = max_size = probable_max_size = count = expected_size
27309 = INTVAL (count_exp);
27310 /* When COUNT is 0, there is nothing to do. */
27311 if (!count)
27312 return true;
27313 }
27314 else
27315 {
27316 if (min_size_exp)
27317 min_size = INTVAL (min_size_exp);
27318 if (max_size_exp)
27319 max_size = INTVAL (max_size_exp);
27320 if (probable_max_size_exp)
27321 probable_max_size = INTVAL (probable_max_size_exp);
27322 if (CONST_INT_P (expected_size_exp))
27323 expected_size = INTVAL (expected_size_exp);
27324 }
27325
27326 /* Make sure we don't need to care about overflow later on. */
27327 if (count > (HOST_WIDE_INT_1U << 30))
27328 return false;
27329
27330 have_as = !ADDR_SPACE_GENERIC_P (MEM_ADDR_SPACE (dst));
27331 if (!issetmem)
27332 have_as |= !ADDR_SPACE_GENERIC_P (MEM_ADDR_SPACE (src));
27333
27334 /* Step 0: Decide on preferred algorithm, desired alignment and
27335 size of chunks to be copied by main loop. */
27336 alg = decide_alg (count, expected_size, min_size, probable_max_size,
27337 issetmem,
27338 issetmem && val_exp == const0_rtx, have_as,
27339 &dynamic_check, &noalign, false);
27340 if (alg == libcall)
27341 return false;
27342 gcc_assert (alg != no_stringop);
27343
27344 /* For now the vector version of memset is generated only for memory zeroing,
27345 as creating the promoted vector value is very cheap in this case. */
27346 if (issetmem && alg == vector_loop && val_exp != const0_rtx)
27347 alg = unrolled_loop;
27348
27349 if (!count)
27350 count_exp = copy_to_mode_reg (GET_MODE (count_exp), count_exp);
27351 destreg = ix86_copy_addr_to_reg (XEXP (dst, 0));
27352 if (!issetmem)
27353 srcreg = ix86_copy_addr_to_reg (XEXP (src, 0));
27354
27355 unroll_factor = 1;
27356 move_mode = word_mode;
27357 switch (alg)
27358 {
27359 case libcall:
27360 case no_stringop:
27361 case last_alg:
27362 gcc_unreachable ();
27363 case loop_1_byte:
27364 need_zero_guard = true;
27365 move_mode = QImode;
27366 break;
27367 case loop:
27368 need_zero_guard = true;
27369 break;
27370 case unrolled_loop:
27371 need_zero_guard = true;
27372 unroll_factor = (TARGET_64BIT ? 4 : 2);
27373 break;
27374 case vector_loop:
27375 need_zero_guard = true;
27376 unroll_factor = 4;
27377 /* Find the widest supported mode. */
27378 move_mode = word_mode;
27379 while (optab_handler (mov_optab, GET_MODE_WIDER_MODE (move_mode))
27380 != CODE_FOR_nothing)
27381 move_mode = GET_MODE_WIDER_MODE (move_mode);
27382
27383 /* Find the corresponding vector mode with the same size as MOVE_MODE.
27384 MOVE_MODE is an integer mode at the moment (SI, DI, TI, etc.). */
27385 if (GET_MODE_SIZE (move_mode) > GET_MODE_SIZE (word_mode))
27386 {
27387 int nunits = GET_MODE_SIZE (move_mode) / GET_MODE_SIZE (word_mode);
27388 move_mode = mode_for_vector (word_mode, nunits);
27389 if (optab_handler (mov_optab, move_mode) == CODE_FOR_nothing)
27390 move_mode = word_mode;
27391 }
27392 gcc_assert (optab_handler (mov_optab, move_mode) != CODE_FOR_nothing);
27393 break;
27394 case rep_prefix_8_byte:
27395 move_mode = DImode;
27396 break;
27397 case rep_prefix_4_byte:
27398 move_mode = SImode;
27399 break;
27400 case rep_prefix_1_byte:
27401 move_mode = QImode;
27402 break;
27403 }
27404 size_needed = GET_MODE_SIZE (move_mode) * unroll_factor;
27405 epilogue_size_needed = size_needed;
27406
27407 /* If we are going to emit any library calls conditionally, make sure any
27408 pending stack adjustment happens before the first conditional branch;
27409 otherwise it will be emitted on the library call path only and won't
27410 happen on the other branches. */
27411 if (dynamic_check != -1)
27412 do_pending_stack_adjust ();
27413
27414 desired_align = decide_alignment (align, alg, expected_size, move_mode);
27415 if (!TARGET_ALIGN_STRINGOPS || noalign)
27416 align = desired_align;
27417
27418 /* Step 1: Prologue guard. */
27419
27420 /* Alignment code needs count to be in register. */
27421 if (CONST_INT_P (count_exp) && desired_align > align)
27422 {
27423 if (INTVAL (count_exp) > desired_align
27424 && INTVAL (count_exp) > size_needed)
27425 {
27426 align_bytes
27427 = get_mem_align_offset (dst, desired_align * BITS_PER_UNIT);
27428 if (align_bytes <= 0)
27429 align_bytes = 0;
27430 else
27431 align_bytes = desired_align - align_bytes;
27432 }
27433 if (align_bytes == 0)
27434 count_exp = force_reg (counter_mode (count_exp), count_exp);
27435 }
27436 gcc_assert (desired_align >= 1 && align >= 1);
27437
27438 /* Misaligned move sequences handle both prologue and epilogue at once.
27439 Default code generation results in smaller code for large alignments
27440 and also avoids redundant work when sizes are known precisely. */
27441 misaligned_prologue_used
27442 = (TARGET_MISALIGNED_MOVE_STRING_PRO_EPILOGUES
27443 && MAX (desired_align, epilogue_size_needed) <= 32
27444 && desired_align <= epilogue_size_needed
27445 && ((desired_align > align && !align_bytes)
27446 || (!count && epilogue_size_needed > 1)));
27447
27448 /* Do the cheap promotion to allow better CSE across the
27449 main loop and epilogue (i.e. one load of the big constant in
27450 front of all the code).
27451 For now the misaligned move sequences do not have a fast path
27452 without broadcasting. */
27453 if (issetmem && ((CONST_INT_P (val_exp) || misaligned_prologue_used)))
27454 {
27455 if (alg == vector_loop)
27456 {
27457 gcc_assert (val_exp == const0_rtx);
27458 vec_promoted_val = promote_duplicated_reg (move_mode, val_exp);
27459 promoted_val = promote_duplicated_reg_to_size (val_exp,
27460 GET_MODE_SIZE (word_mode),
27461 desired_align, align);
27462 }
27463 else
27464 {
27465 promoted_val = promote_duplicated_reg_to_size (val_exp, size_needed,
27466 desired_align, align);
27467 }
27468 }
27469 /* Misaligned move sequences handle both prologues and epilogues at once.
27470 Default code generation results in smaller code for large alignments and
27471 also avoids redundant work when sizes are known precisely. */
27472 if (misaligned_prologue_used)
27473 {
27474 /* The misaligned move prologue handles small blocks by itself. */
27475 expand_set_or_movmem_prologue_epilogue_by_misaligned_moves
27476 (dst, src, &destreg, &srcreg,
27477 move_mode, promoted_val, vec_promoted_val,
27478 &count_exp,
27479 &jump_around_label,
27480 desired_align < align
27481 ? MAX (desired_align, epilogue_size_needed) : epilogue_size_needed,
27482 desired_align, align, &min_size, dynamic_check, issetmem);
27483 if (!issetmem)
27484 src = change_address (src, BLKmode, srcreg);
27485 dst = change_address (dst, BLKmode, destreg);
27486 set_mem_align (dst, desired_align * BITS_PER_UNIT);
27487 epilogue_size_needed = 0;
27488 if (need_zero_guard
27489 && min_size < (unsigned HOST_WIDE_INT) size_needed)
27490 {
27491 /* It is possible that we copied enough so the main loop will not
27492 execute. */
27493 gcc_assert (size_needed > 1);
27494 if (jump_around_label == NULL_RTX)
27495 jump_around_label = gen_label_rtx ();
27496 emit_cmp_and_jump_insns (count_exp,
27497 GEN_INT (size_needed),
27498 LTU, 0, counter_mode (count_exp), 1, jump_around_label);
27499 if (expected_size == -1
27500 || expected_size < (desired_align - align) / 2 + size_needed)
27501 predict_jump (REG_BR_PROB_BASE * 20 / 100);
27502 else
27503 predict_jump (REG_BR_PROB_BASE * 60 / 100);
27504 }
27505 }
27506 /* Ensure that alignment prologue won't copy past end of block. */
27507 else if (size_needed > 1 || (desired_align > 1 && desired_align > align))
27508 {
27509 epilogue_size_needed = MAX (size_needed - 1, desired_align - align);
27510 /* Epilogue always copies COUNT_EXP & EPILOGUE_SIZE_NEEDED bytes.
27511 Make sure it is power of 2. */
27512 epilogue_size_needed = 1 << (floor_log2 (epilogue_size_needed) + 1);
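/* E.g. size_needed == 16 with desired_align == 16 and align == 1 gives
   MAX (15, 15) == 15, which the statement above rounds up to the next
   power of two, so the epilogue is prepared to handle up to 16 trailing
   bytes.  */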
27513
27514 /* To improve performance of small blocks, we jump around the VAL
27515 promoting code. This means that if the promoted VAL is not constant,
27516 we might not use it in the epilogue and have to use the byte
27517 loop variant. */
27518 if (issetmem && epilogue_size_needed > 2 && !promoted_val)
27519 force_loopy_epilogue = true;
27520 if ((count && count < (unsigned HOST_WIDE_INT) epilogue_size_needed)
27521 || max_size < (unsigned HOST_WIDE_INT) epilogue_size_needed)
27522 {
27523 /* If main algorithm works on QImode, no epilogue is needed.
27524 For small sizes just don't align anything. */
27525 if (size_needed == 1)
27526 desired_align = align;
27527 else
27528 goto epilogue;
27529 }
27530 else if (!count
27531 && min_size < (unsigned HOST_WIDE_INT) epilogue_size_needed)
27532 {
27533 label = gen_label_rtx ();
27534 emit_cmp_and_jump_insns (count_exp,
27535 GEN_INT (epilogue_size_needed),
27536 LTU, 0, counter_mode (count_exp), 1, label);
27537 if (expected_size == -1 || expected_size < epilogue_size_needed)
27538 predict_jump (REG_BR_PROB_BASE * 60 / 100);
27539 else
27540 predict_jump (REG_BR_PROB_BASE * 20 / 100);
27541 }
27542 }
27543
27544 /* Emit code to decide at runtime whether a library call or inline code
27545 should be used. */
27546 if (dynamic_check != -1)
27547 {
27548 if (!issetmem && CONST_INT_P (count_exp))
27549 {
27550 if (UINTVAL (count_exp) >= (unsigned HOST_WIDE_INT)dynamic_check)
27551 {
27552 emit_block_copy_via_libcall (dst, src, count_exp);
27553 count_exp = const0_rtx;
27554 goto epilogue;
27555 }
27556 }
27557 else
27558 {
27559 rtx_code_label *hot_label = gen_label_rtx ();
27560 if (jump_around_label == NULL_RTX)
27561 jump_around_label = gen_label_rtx ();
27562 emit_cmp_and_jump_insns (count_exp, GEN_INT (dynamic_check - 1),
27563 LEU, 0, counter_mode (count_exp),
27564 1, hot_label);
27565 predict_jump (REG_BR_PROB_BASE * 90 / 100);
27566 if (issetmem)
27567 set_storage_via_libcall (dst, count_exp, val_exp);
27568 else
27569 emit_block_copy_via_libcall (dst, src, count_exp);
27570 emit_jump (jump_around_label);
27571 emit_label (hot_label);
27572 }
27573 }
27574
27575 /* Step 2: Alignment prologue. */
27576 /* Do the expensive promotion once we branched off the small blocks. */
27577 if (issetmem && !promoted_val)
27578 promoted_val = promote_duplicated_reg_to_size (val_exp, size_needed,
27579 desired_align, align);
27580
27581 if (desired_align > align && !misaligned_prologue_used)
27582 {
27583 if (align_bytes == 0)
27584 {
27585 /* Except for the first move in the prologue, we no longer know
27586 the constant offset in aliasing info. It doesn't seem worth
27587 the pain to maintain it for the first move, so throw away
27588 the info early. */
27589 dst = change_address (dst, BLKmode, destreg);
27590 if (!issetmem)
27591 src = change_address (src, BLKmode, srcreg);
27592 dst = expand_set_or_movmem_prologue (dst, src, destreg, srcreg,
27593 promoted_val, vec_promoted_val,
27594 count_exp, align, desired_align,
27595 issetmem);
27596 /* At most desired_align - align bytes are copied. */
27597 if (min_size < (unsigned)(desired_align - align))
27598 min_size = 0;
27599 else
27600 min_size -= desired_align - align;
27601 }
27602 else
27603 {
27604 /* If we know how many bytes need to be stored before dst is
27605 sufficiently aligned, maintain aliasing info accurately. */
27606 dst = expand_set_or_movmem_constant_prologue (dst, &src, destreg,
27607 srcreg,
27608 promoted_val,
27609 vec_promoted_val,
27610 desired_align,
27611 align_bytes,
27612 issetmem);
27613
27614 count_exp = plus_constant (counter_mode (count_exp),
27615 count_exp, -align_bytes);
27616 count -= align_bytes;
27617 min_size -= align_bytes;
27618 max_size -= align_bytes;
27619 }
27620 if (need_zero_guard
27621 && min_size < (unsigned HOST_WIDE_INT) size_needed
27622 && (count < (unsigned HOST_WIDE_INT) size_needed
27623 || (align_bytes == 0
27624 && count < ((unsigned HOST_WIDE_INT) size_needed
27625 + desired_align - align))))
27626 {
27627 /* It is possible that we copied enough so the main loop will not
27628 execute. */
27629 gcc_assert (size_needed > 1);
27630 if (label == NULL_RTX)
27631 label = gen_label_rtx ();
27632 emit_cmp_and_jump_insns (count_exp,
27633 GEN_INT (size_needed),
27634 LTU, 0, counter_mode (count_exp), 1, label);
27635 if (expected_size == -1
27636 || expected_size < (desired_align - align) / 2 + size_needed)
27637 predict_jump (REG_BR_PROB_BASE * 20 / 100);
27638 else
27639 predict_jump (REG_BR_PROB_BASE * 60 / 100);
27640 }
27641 }
27642 if (label && size_needed == 1)
27643 {
27644 emit_label (label);
27645 LABEL_NUSES (label) = 1;
27646 label = NULL;
27647 epilogue_size_needed = 1;
27648 if (issetmem)
27649 promoted_val = val_exp;
27650 }
27651 else if (label == NULL_RTX && !misaligned_prologue_used)
27652 epilogue_size_needed = size_needed;
27653
27654 /* Step 3: Main loop. */
27655
27656 switch (alg)
27657 {
27658 case libcall:
27659 case no_stringop:
27660 case last_alg:
27661 gcc_unreachable ();
27662 case loop_1_byte:
27663 case loop:
27664 case unrolled_loop:
27665 expand_set_or_movmem_via_loop (dst, src, destreg, srcreg, promoted_val,
27666 count_exp, move_mode, unroll_factor,
27667 expected_size, issetmem);
27668 break;
27669 case vector_loop:
27670 expand_set_or_movmem_via_loop (dst, src, destreg, srcreg,
27671 vec_promoted_val, count_exp, move_mode,
27672 unroll_factor, expected_size, issetmem);
27673 break;
27674 case rep_prefix_8_byte:
27675 case rep_prefix_4_byte:
27676 case rep_prefix_1_byte:
27677 expand_set_or_movmem_via_rep (dst, src, destreg, srcreg, promoted_val,
27678 val_exp, count_exp, move_mode, issetmem);
27679 break;
27680 }
27681 /* Properly adjust the offsets of the src and dest memory for aliasing. */
27682 if (CONST_INT_P (count_exp))
27683 {
27684 if (!issetmem)
27685 src = adjust_automodify_address_nv (src, BLKmode, srcreg,
27686 (count / size_needed) * size_needed);
27687 dst = adjust_automodify_address_nv (dst, BLKmode, destreg,
27688 (count / size_needed) * size_needed);
27689 }
27690 else
27691 {
27692 if (!issetmem)
27693 src = change_address (src, BLKmode, srcreg);
27694 dst = change_address (dst, BLKmode, destreg);
27695 }
27696
27697 /* Step 4: Epilogue to copy the remaining bytes. */
27698 epilogue:
27699 if (label)
27700 {
27701 /* When the main loop is done, COUNT_EXP might hold original count,
27702 while we want to copy only COUNT_EXP & SIZE_NEEDED bytes.
27703 Epilogue code will actually copy COUNT_EXP & EPILOGUE_SIZE_NEEDED
27704 bytes. Compensate if needed. */
27705
27706 if (size_needed < epilogue_size_needed)
27707 {
27708 tmp =
27709 expand_simple_binop (counter_mode (count_exp), AND, count_exp,
27710 GEN_INT (size_needed - 1), count_exp, 1,
27711 OPTAB_DIRECT);
27712 if (tmp != count_exp)
27713 emit_move_insn (count_exp, tmp);
27714 }
27715 emit_label (label);
27716 LABEL_NUSES (label) = 1;
27717 }
27718
27719 if (count_exp != const0_rtx && epilogue_size_needed > 1)
27720 {
27721 if (force_loopy_epilogue)
27722 expand_setmem_epilogue_via_loop (dst, destreg, val_exp, count_exp,
27723 epilogue_size_needed);
27724 else
27725 {
27726 if (issetmem)
27727 expand_setmem_epilogue (dst, destreg, promoted_val,
27728 vec_promoted_val, count_exp,
27729 epilogue_size_needed);
27730 else
27731 expand_movmem_epilogue (dst, src, destreg, srcreg, count_exp,
27732 epilogue_size_needed);
27733 }
27734 }
27735 if (jump_around_label)
27736 emit_label (jump_around_label);
27737 return true;
27738 }
27739
27740
27741 /* Expand the appropriate insns for doing strlen if not just doing
27742 repnz; scasb
27743
27744 out = result, initialized with the start address
27745 align_rtx = alignment of the address.
27746 scratch = scratch register, initialized with the start address when
27747 not aligned, otherwise undefined
27748
27749 This is just the body. It needs the initializations mentioned above and
27750 some address computing at the end. These things are done in i386.md. */
27751
27752 static void
27753 ix86_expand_strlensi_unroll_1 (rtx out, rtx src, rtx align_rtx)
27754 {
27755 int align;
27756 rtx tmp;
27757 rtx_code_label *align_2_label = NULL;
27758 rtx_code_label *align_3_label = NULL;
27759 rtx_code_label *align_4_label = gen_label_rtx ();
27760 rtx_code_label *end_0_label = gen_label_rtx ();
27761 rtx mem;
27762 rtx tmpreg = gen_reg_rtx (SImode);
27763 rtx scratch = gen_reg_rtx (SImode);
27764 rtx cmp;
27765
27766 align = 0;
27767 if (CONST_INT_P (align_rtx))
27768 align = INTVAL (align_rtx);
27769
27770 /* Loop to check 1..3 bytes for null to get an aligned pointer. */
27771
27772 /* Is there a known alignment and is it less than 4? */
27773 if (align < 4)
27774 {
27775 rtx scratch1 = gen_reg_rtx (Pmode);
27776 emit_move_insn (scratch1, out);
27777 /* Is there a known alignment and is it not 2? */
27778 if (align != 2)
27779 {
27780 align_3_label = gen_label_rtx (); /* Label when aligned to 3-byte */
27781 align_2_label = gen_label_rtx (); /* Label when aligned to 2-byte */
27782
27783 /* Leave just the 2 lower bits. */
27784 align_rtx = expand_binop (Pmode, and_optab, scratch1, GEN_INT (3),
27785 NULL_RTX, 0, OPTAB_WIDEN);
27786
27787 emit_cmp_and_jump_insns (align_rtx, const0_rtx, EQ, NULL,
27788 Pmode, 1, align_4_label);
27789 emit_cmp_and_jump_insns (align_rtx, const2_rtx, EQ, NULL,
27790 Pmode, 1, align_2_label);
27791 emit_cmp_and_jump_insns (align_rtx, const2_rtx, GTU, NULL,
27792 Pmode, 1, align_3_label);
27793 }
27794 else
27795 {
27796 /* Since the alignment is 2, we have to check 2 or 0 bytes;
27797 check whether the address is aligned to a 4-byte boundary. */
27798
27799 align_rtx = expand_binop (Pmode, and_optab, scratch1, const2_rtx,
27800 NULL_RTX, 0, OPTAB_WIDEN);
27801
27802 emit_cmp_and_jump_insns (align_rtx, const0_rtx, EQ, NULL,
27803 Pmode, 1, align_4_label);
27804 }
27805
27806 mem = change_address (src, QImode, out);
27807
27808 /* Now compare the bytes. */
27809
27810 /* Compare the first n unaligned bytes on a byte-by-byte basis. */
27811 emit_cmp_and_jump_insns (mem, const0_rtx, EQ, NULL,
27812 QImode, 1, end_0_label);
27813
27814 /* Increment the address. */
27815 emit_insn (ix86_gen_add3 (out, out, const1_rtx));
27816
27817 /* Not needed with an alignment of 2 */
27818 if (align != 2)
27819 {
27820 emit_label (align_2_label);
27821
27822 emit_cmp_and_jump_insns (mem, const0_rtx, EQ, NULL, QImode, 1,
27823 end_0_label);
27824
27825 emit_insn (ix86_gen_add3 (out, out, const1_rtx));
27826
27827 emit_label (align_3_label);
27828 }
27829
27830 emit_cmp_and_jump_insns (mem, const0_rtx, EQ, NULL, QImode, 1,
27831 end_0_label);
27832
27833 emit_insn (ix86_gen_add3 (out, out, const1_rtx));
27834 }
27835
27836 /* Generate a loop to check 4 bytes at a time. It is not a good idea to
27837 align this loop; doing so only makes the program larger and does not
27838 speed it up. */
27839 emit_label (align_4_label);
27840
27841 mem = change_address (src, SImode, out);
27842 emit_move_insn (scratch, mem);
27843 emit_insn (ix86_gen_add3 (out, out, GEN_INT (4)));
27844
27845 /* This formula yields a nonzero result iff one of the bytes is zero.
27846 This saves three branches inside the loop and many cycles. */
27847
27848 emit_insn (gen_addsi3 (tmpreg, scratch, GEN_INT (-0x01010101)));
27849 emit_insn (gen_one_cmplsi2 (scratch, scratch));
27850 emit_insn (gen_andsi3 (tmpreg, tmpreg, scratch));
27851 emit_insn (gen_andsi3 (tmpreg, tmpreg,
27852 gen_int_mode (0x80808080, SImode)));
27853 emit_cmp_and_jump_insns (tmpreg, const0_rtx, EQ, 0, SImode, 1,
27854 align_4_label);
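/* Worked example of the zero-byte test above, assuming scratch == 0x11220044:
     tmpreg  = scratch + (-0x01010101) = 0x1020FF43
     scratch = ~0x11220044             = 0xEEDDFFBB
     tmpreg &= scratch                 = 0x0000FF03
     tmpreg &= 0x80808080              = 0x00008000   (nonzero -> a zero byte)
   whereas a value with no zero byte leaves all the 0x80 positions clear and
   the loop continues.  */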
27855
27856 if (TARGET_CMOVE)
27857 {
27858 rtx reg = gen_reg_rtx (SImode);
27859 rtx reg2 = gen_reg_rtx (Pmode);
27860 emit_move_insn (reg, tmpreg);
27861 emit_insn (gen_lshrsi3 (reg, reg, GEN_INT (16)));
27862
27863 /* If zero is not in the first two bytes, move two bytes forward. */
27864 emit_insn (gen_testsi_ccno_1 (tmpreg, GEN_INT (0x8080)));
27865 tmp = gen_rtx_REG (CCNOmode, FLAGS_REG);
27866 tmp = gen_rtx_EQ (VOIDmode, tmp, const0_rtx);
27867 emit_insn (gen_rtx_SET (tmpreg,
27868 gen_rtx_IF_THEN_ELSE (SImode, tmp,
27869 reg,
27870 tmpreg)));
27871 /* Emit lea manually to avoid clobbering of flags. */
27872 emit_insn (gen_rtx_SET (reg2, gen_rtx_PLUS (Pmode, out, const2_rtx)));
27873
27874 tmp = gen_rtx_REG (CCNOmode, FLAGS_REG);
27875 tmp = gen_rtx_EQ (VOIDmode, tmp, const0_rtx);
27876 emit_insn (gen_rtx_SET (out,
27877 gen_rtx_IF_THEN_ELSE (Pmode, tmp,
27878 reg2,
27879 out)));
27880 }
27881 else
27882 {
27883 rtx_code_label *end_2_label = gen_label_rtx ();
27884 /* Is zero in the first two bytes? */
27885
27886 emit_insn (gen_testsi_ccno_1 (tmpreg, GEN_INT (0x8080)));
27887 tmp = gen_rtx_REG (CCNOmode, FLAGS_REG);
27888 tmp = gen_rtx_NE (VOIDmode, tmp, const0_rtx);
27889 tmp = gen_rtx_IF_THEN_ELSE (VOIDmode, tmp,
27890 gen_rtx_LABEL_REF (VOIDmode, end_2_label),
27891 pc_rtx);
27892 tmp = emit_jump_insn (gen_rtx_SET (pc_rtx, tmp));
27893 JUMP_LABEL (tmp) = end_2_label;
27894
27895 /* Not in the first two. Move two bytes forward. */
27896 emit_insn (gen_lshrsi3 (tmpreg, tmpreg, GEN_INT (16)));
27897 emit_insn (ix86_gen_add3 (out, out, const2_rtx));
27898
27899 emit_label (end_2_label);
27900
27901 }
27902
27903 /* Avoid branch in fixing the byte. */
27904 tmpreg = gen_lowpart (QImode, tmpreg);
27905 emit_insn (gen_addqi3_cconly_overflow (tmpreg, tmpreg));
27906 tmp = gen_rtx_REG (CCmode, FLAGS_REG);
27907 cmp = gen_rtx_LTU (VOIDmode, tmp, const0_rtx);
27908 emit_insn (ix86_gen_sub3_carry (out, out, GEN_INT (3), tmp, cmp));
27909
27910 emit_label (end_0_label);
27911 }
27912
27913 /* Expand strlen. */
27914
27915 bool
27916 ix86_expand_strlen (rtx out, rtx src, rtx eoschar, rtx align)
27917 {
27918 rtx addr, scratch1, scratch2, scratch3, scratch4;
27919
27920 /* The generic case of the strlen expander is long. Avoid expanding
27921 it unless TARGET_INLINE_ALL_STRINGOPS. */
27922
27923 if (TARGET_UNROLL_STRLEN && eoschar == const0_rtx && optimize > 1
27924 && !TARGET_INLINE_ALL_STRINGOPS
27925 && !optimize_insn_for_size_p ()
27926 && (!CONST_INT_P (align) || INTVAL (align) < 4))
27927 return false;
27928
27929 addr = force_reg (Pmode, XEXP (src, 0));
27930 scratch1 = gen_reg_rtx (Pmode);
27931
27932 if (TARGET_UNROLL_STRLEN && eoschar == const0_rtx && optimize > 1
27933 && !optimize_insn_for_size_p ())
27934 {
27935 /* Well, it seems that some optimizers do not combine a call like
27936 foo (strlen (bar), strlen (bar));
27937 when the move and the subtraction are done here. The length is
27938 calculated just once when these instructions are done inside
27939 output_strlen_unroll(). But since &bar[strlen(bar)] is often used
27940 and this uses one fewer register for the lifetime of
27941 output_strlen_unroll(), this is better. */
27942
27943 emit_move_insn (out, addr);
27944
27945 ix86_expand_strlensi_unroll_1 (out, src, align);
27946
27947 /* strlensi_unroll_1 returns the address of the zero at the end of
27948 the string, like memchr(), so compute the length by subtracting
27949 the start address. */
27950 emit_insn (ix86_gen_sub3 (out, out, addr));
27951 }
27952 else
27953 {
27954 rtx unspec;
27955
27956 /* Can't use this if the user has appropriated eax, ecx, or edi. */
27957 if (fixed_regs[AX_REG] || fixed_regs[CX_REG] || fixed_regs[DI_REG])
27958 return false;
27959 /* Can't use this for non-default address spaces. */
27960 if (!ADDR_SPACE_GENERIC_P (MEM_ADDR_SPACE (src)))
27961 return false;
27962
27963 scratch2 = gen_reg_rtx (Pmode);
27964 scratch3 = gen_reg_rtx (Pmode);
27965 scratch4 = force_reg (Pmode, constm1_rtx);
27966
27967 emit_move_insn (scratch3, addr);
27968 eoschar = force_reg (QImode, eoschar);
27969
27970 src = replace_equiv_address_nv (src, scratch3);
27971
27972 /* If .md starts supporting :P, this can be done in .md. */
27973 unspec = gen_rtx_UNSPEC (Pmode, gen_rtvec (4, src, eoschar, align,
27974 scratch4), UNSPEC_SCAS);
27975 emit_insn (gen_strlenqi_1 (scratch1, scratch3, unspec));
27976 emit_insn (ix86_gen_one_cmpl2 (scratch2, scratch1));
27977 emit_insn (ix86_gen_add3 (out, scratch2, constm1_rtx));
27978 }
27979 return true;
27980 }
27981
27982 /* For a given symbol (function), construct code to compute the address of its
27983 PLT entry in the large x86-64 PIC model. */
27984 static rtx
27985 construct_plt_address (rtx symbol)
27986 {
27987 rtx tmp, unspec;
27988
27989 gcc_assert (GET_CODE (symbol) == SYMBOL_REF);
27990 gcc_assert (ix86_cmodel == CM_LARGE_PIC && !TARGET_PECOFF);
27991 gcc_assert (Pmode == DImode);
27992
27993 tmp = gen_reg_rtx (Pmode);
27994 unspec = gen_rtx_UNSPEC (Pmode, gen_rtvec (1, symbol), UNSPEC_PLTOFF);
27995
27996 emit_move_insn (tmp, gen_rtx_CONST (Pmode, unspec));
27997 emit_insn (ix86_gen_add3 (tmp, tmp, pic_offset_table_rtx));
27998 return tmp;
27999 }
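/* The emitted sequence is roughly (illustrative only):
     movabs $symbol@PLTOFF, %tmp
     add    %pic_base, %tmp
   i.e. a 64-bit PLT offset materialized into a register and added to the
   PIC base register, since the large code model cannot assume the PLT is
   within 32-bit reach.  */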
28000
28001 rtx
28002 ix86_expand_call (rtx retval, rtx fnaddr, rtx callarg1,
28003 rtx callarg2,
28004 rtx pop, bool sibcall)
28005 {
28006 rtx vec[3];
28007 rtx use = NULL, call;
28008 unsigned int vec_len = 0;
28009 tree fndecl;
28010
28011 if (GET_CODE (XEXP (fnaddr, 0)) == SYMBOL_REF)
28012 {
28013 fndecl = SYMBOL_REF_DECL (XEXP (fnaddr, 0));
28014 if (fndecl
28015 && (lookup_attribute ("interrupt",
28016 TYPE_ATTRIBUTES (TREE_TYPE (fndecl)))))
28017 error ("interrupt service routine can't be called directly");
28018 }
28019 else
28020 fndecl = NULL_TREE;
28021
28022 if (pop == const0_rtx)
28023 pop = NULL;
28024 gcc_assert (!TARGET_64BIT || !pop);
28025
28026 if (TARGET_MACHO && !TARGET_64BIT)
28027 {
28028 #if TARGET_MACHO
28029 if (flag_pic && GET_CODE (XEXP (fnaddr, 0)) == SYMBOL_REF)
28030 fnaddr = machopic_indirect_call_target (fnaddr);
28031 #endif
28032 }
28033 else
28034 {
28035 /* Static functions and indirect calls don't need the PIC register. Also,
28036 check whether the PLT was explicitly avoided via -fno-plt or the "noplt"
28037 attribute, making it an indirect call. */
28038 rtx addr = XEXP (fnaddr, 0);
28039 if (flag_pic
28040 && GET_CODE (addr) == SYMBOL_REF
28041 && !SYMBOL_REF_LOCAL_P (addr))
28042 {
28043 if (flag_plt
28044 && (SYMBOL_REF_DECL (addr) == NULL_TREE
28045 || !lookup_attribute ("noplt",
28046 DECL_ATTRIBUTES (SYMBOL_REF_DECL (addr)))))
28047 {
28048 if (!TARGET_64BIT
28049 || (ix86_cmodel == CM_LARGE_PIC
28050 && DEFAULT_ABI != MS_ABI))
28051 {
28052 use_reg (&use, gen_rtx_REG (Pmode,
28053 REAL_PIC_OFFSET_TABLE_REGNUM));
28054 if (ix86_use_pseudo_pic_reg ())
28055 emit_move_insn (gen_rtx_REG (Pmode,
28056 REAL_PIC_OFFSET_TABLE_REGNUM),
28057 pic_offset_table_rtx);
28058 }
28059 }
28060 else if (!TARGET_PECOFF && !TARGET_MACHO)
28061 {
28062 if (TARGET_64BIT)
28063 {
28064 fnaddr = gen_rtx_UNSPEC (Pmode,
28065 gen_rtvec (1, addr),
28066 UNSPEC_GOTPCREL);
28067 fnaddr = gen_rtx_CONST (Pmode, fnaddr);
28068 }
28069 else
28070 {
28071 fnaddr = gen_rtx_UNSPEC (Pmode, gen_rtvec (1, addr),
28072 UNSPEC_GOT);
28073 fnaddr = gen_rtx_CONST (Pmode, fnaddr);
28074 fnaddr = gen_rtx_PLUS (Pmode, pic_offset_table_rtx,
28075 fnaddr);
28076 }
28077 fnaddr = gen_const_mem (Pmode, fnaddr);
28078 /* Pmode may not be the same as word_mode for x32, which
28079 doesn't support indirect branch via 32-bit memory slot.
28080 Since x32 GOT slot is 64 bit with zero upper 32 bits,
28081 indirect branch via x32 GOT slot is OK. */
28082 if (GET_MODE (fnaddr) != word_mode)
28083 fnaddr = gen_rtx_ZERO_EXTEND (word_mode, fnaddr);
28084 fnaddr = gen_rtx_MEM (QImode, fnaddr);
28085 }
28086 }
28087 }
28088
28089 /* Skip setting up RAX register for -mskip-rax-setup when there are no
28090 parameters passed in vector registers. */
28091 if (TARGET_64BIT
28092 && (INTVAL (callarg2) > 0
28093 || (INTVAL (callarg2) == 0
28094 && (TARGET_SSE || !flag_skip_rax_setup))))
28095 {
28096 rtx al = gen_rtx_REG (QImode, AX_REG);
28097 emit_move_insn (al, callarg2);
28098 use_reg (&use, al);
28099 }
28100
28101 if (ix86_cmodel == CM_LARGE_PIC
28102 && !TARGET_PECOFF
28103 && MEM_P (fnaddr)
28104 && GET_CODE (XEXP (fnaddr, 0)) == SYMBOL_REF
28105 && !local_symbolic_operand (XEXP (fnaddr, 0), VOIDmode))
28106 fnaddr = gen_rtx_MEM (QImode, construct_plt_address (XEXP (fnaddr, 0)));
28107 /* Since x32 GOT slot is 64 bit with zero upper 32 bits, indirect
28108 branch via x32 GOT slot is OK. */
28109 else if (!(TARGET_X32
28110 && MEM_P (fnaddr)
28111 && GET_CODE (XEXP (fnaddr, 0)) == ZERO_EXTEND
28112 && GOT_memory_operand (XEXP (XEXP (fnaddr, 0), 0), Pmode))
28113 && (sibcall
28114 ? !sibcall_insn_operand (XEXP (fnaddr, 0), word_mode)
28115 : !call_insn_operand (XEXP (fnaddr, 0), word_mode)))
28116 {
28117 fnaddr = convert_to_mode (word_mode, XEXP (fnaddr, 0), 1);
28118 fnaddr = gen_rtx_MEM (QImode, copy_to_mode_reg (word_mode, fnaddr));
28119 }
28120
28121 call = gen_rtx_CALL (VOIDmode, fnaddr, callarg1);
28122
28123 if (retval)
28124 {
28125 /* We should add bounds as a destination register in case
28126 a pointer with bounds may be returned. */
28127 if (TARGET_MPX && SCALAR_INT_MODE_P (GET_MODE (retval)))
28128 {
28129 rtx b0 = gen_rtx_REG (BND64mode, FIRST_BND_REG);
28130 rtx b1 = gen_rtx_REG (BND64mode, FIRST_BND_REG + 1);
28131 if (GET_CODE (retval) == PARALLEL)
28132 {
28133 b0 = gen_rtx_EXPR_LIST (VOIDmode, b0, const0_rtx);
28134 b1 = gen_rtx_EXPR_LIST (VOIDmode, b1, const0_rtx);
28135 rtx par = gen_rtx_PARALLEL (VOIDmode, gen_rtvec (2, b0, b1));
28136 retval = chkp_join_splitted_slot (retval, par);
28137 }
28138 else
28139 {
28140 retval = gen_rtx_PARALLEL (VOIDmode,
28141 gen_rtvec (3, retval, b0, b1));
28142 chkp_put_regs_to_expr_list (retval);
28143 }
28144 }
28145
28146 call = gen_rtx_SET (retval, call);
28147 }
28148 vec[vec_len++] = call;
28149
28150 if (pop)
28151 {
28152 pop = gen_rtx_PLUS (Pmode, stack_pointer_rtx, pop);
28153 pop = gen_rtx_SET (stack_pointer_rtx, pop);
28154 vec[vec_len++] = pop;
28155 }
28156
28157 if (cfun->machine->no_caller_saved_registers
28158 && (!fndecl
28159 || (!TREE_THIS_VOLATILE (fndecl)
28160 && !lookup_attribute ("no_caller_saved_registers",
28161 TYPE_ATTRIBUTES (TREE_TYPE (fndecl))))))
28162 {
28163 static const char ix86_call_used_regs[] = CALL_USED_REGISTERS;
28164 bool is_64bit_ms_abi = (TARGET_64BIT
28165 && ix86_function_abi (fndecl) == MS_ABI);
28166 char c_mask = CALL_USED_REGISTERS_MASK (is_64bit_ms_abi);
28167
28168 /* If there are no caller-saved registers, add all registers
28169 that are clobbered by the call which returns. */
28170 for (int i = 0; i < FIRST_PSEUDO_REGISTER; i++)
28171 if (!fixed_regs[i]
28172 && (ix86_call_used_regs[i] == 1
28173 || (ix86_call_used_regs[i] & c_mask))
28174 && !STACK_REGNO_P (i)
28175 && !MMX_REGNO_P (i))
28176 clobber_reg (&use,
28177 gen_rtx_REG (GET_MODE (regno_reg_rtx[i]), i));
28178 }
28179 else if (TARGET_64BIT_MS_ABI
28180 && (!callarg2 || INTVAL (callarg2) != -2))
28181 {
28182 int const cregs_size
28183 = ARRAY_SIZE (x86_64_ms_sysv_extra_clobbered_registers);
28184 int i;
28185
28186 for (i = 0; i < cregs_size; i++)
28187 {
28188 int regno = x86_64_ms_sysv_extra_clobbered_registers[i];
28189 machine_mode mode = SSE_REGNO_P (regno) ? TImode : DImode;
28190
28191 clobber_reg (&use, gen_rtx_REG (mode, regno));
28192 }
28193 }
28194
28195 if (vec_len > 1)
28196 call = gen_rtx_PARALLEL (VOIDmode, gen_rtvec_v (vec_len, vec));
28197 call = emit_call_insn (call);
28198 if (use)
28199 CALL_INSN_FUNCTION_USAGE (call) = use;
28200
28201 return call;
28202 }
28203
28204 /* Return true if the function being called was marked with the "noplt"
28205 attribute or compiled with -fno-plt and we are compiling for non-PIC. We need
28206 to handle the non-PIC case in the backend because there is no easy
28207 interface for the front-end to force non-PLT calls to use the GOT.
28208 This is currently used only with 64-bit or 32-bit GOT32X ELF targets
28209 to call the function marked "noplt" indirectly. */
28210
28211 static bool
28212 ix86_nopic_noplt_attribute_p (rtx call_op)
28213 {
28214 if (flag_pic || ix86_cmodel == CM_LARGE
28215 || !(TARGET_64BIT || HAVE_AS_IX86_GOT32X)
28216 || TARGET_MACHO || TARGET_SEH || TARGET_PECOFF
28217 || SYMBOL_REF_LOCAL_P (call_op))
28218 return false;
28219
28220 tree symbol_decl = SYMBOL_REF_DECL (call_op);
28221
28222 if (!flag_plt
28223 || (symbol_decl != NULL_TREE
28224 && lookup_attribute ("noplt", DECL_ATTRIBUTES (symbol_decl))))
28225 return true;
28226
28227 return false;
28228 }
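/* For example, compiling a call to an external function with -fno-plt but
   without -fPIC on x86-64 makes this predicate true, and the call is then
   emitted through the GOT (see the @GOTPCREL forms in ix86_output_call_insn
   below) instead of going through the PLT.  */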
28229
28230 /* Output the assembly for a call instruction. */
28231
28232 const char *
28233 ix86_output_call_insn (rtx_insn *insn, rtx call_op)
28234 {
28235 bool direct_p = constant_call_address_operand (call_op, VOIDmode);
28236 bool seh_nop_p = false;
28237 const char *xasm;
28238
28239 if (SIBLING_CALL_P (insn))
28240 {
28241 if (direct_p)
28242 {
28243 if (ix86_nopic_noplt_attribute_p (call_op))
28244 {
28245 if (TARGET_64BIT)
28246 xasm = "%!jmp\t{*%p0@GOTPCREL(%%rip)|[QWORD PTR %p0@GOTPCREL[rip]]}";
28247 else
28248 xasm = "%!jmp\t{*%p0@GOT|[DWORD PTR %p0@GOT]}";
28249 }
28250 else
28251 xasm = "%!jmp\t%P0";
28252 }
28253 /* SEH epilogue detection requires the indirect branch case
28254 to include REX.W. */
28255 else if (TARGET_SEH)
28256 xasm = "%!rex.W jmp\t%A0";
28257 else
28258 xasm = "%!jmp\t%A0";
28259
28260 output_asm_insn (xasm, &call_op);
28261 return "";
28262 }
28263
28264 /* SEH unwinding can require an extra nop to be emitted in several
28265 circumstances. Determine if we have one of those. */
28266 if (TARGET_SEH)
28267 {
28268 rtx_insn *i;
28269
28270 for (i = NEXT_INSN (insn); i ; i = NEXT_INSN (i))
28271 {
28272 /* If we get to another real insn, we don't need the nop. */
28273 if (INSN_P (i))
28274 break;
28275
28276 /* If we get to the epilogue note, prevent a catch region from
28277 being adjacent to the standard epilogue sequence. With non-call
28278 exceptions, we'll have done this during epilogue emission. */
28279 if (NOTE_P (i) && NOTE_KIND (i) == NOTE_INSN_EPILOGUE_BEG
28280 && !flag_non_call_exceptions
28281 && !can_throw_internal (insn))
28282 {
28283 seh_nop_p = true;
28284 break;
28285 }
28286 }
28287
28288 /* If we didn't find a real insn following the call, prevent the
28289 unwinder from looking into the next function. */
28290 if (i == NULL)
28291 seh_nop_p = true;
28292 }
28293
28294 if (direct_p)
28295 {
28296 if (ix86_nopic_noplt_attribute_p (call_op))
28297 {
28298 if (TARGET_64BIT)
28299 xasm = "%!call\t{*%p0@GOTPCREL(%%rip)|[QWORD PTR %p0@GOTPCREL[rip]]}";
28300 else
28301 xasm = "%!call\t{*%p0@GOT|[DWORD PTR %p0@GOT]}";
28302 }
28303 else
28304 xasm = "%!call\t%P0";
28305 }
28306 else
28307 xasm = "%!call\t%A0";
28308
28309 output_asm_insn (xasm, &call_op);
28310
28311 if (seh_nop_p)
28312 return "nop";
28313
28314 return "";
28315 }
28316 \f
28317 /* Clear stack slot assignments remembered from previous functions.
28318 This is called from INIT_EXPANDERS once before RTL is emitted for each
28319 function. */
28320
28321 static struct machine_function *
28322 ix86_init_machine_status (void)
28323 {
28324 struct machine_function *f;
28325
28326 f = ggc_cleared_alloc<machine_function> ();
28327 f->use_fast_prologue_epilogue_nregs = -1;
28328 f->call_abi = ix86_abi;
28329
28330 return f;
28331 }
28332
28333 /* Return a MEM corresponding to a stack slot with mode MODE.
28334 Allocate a new slot if necessary.
28335
28336 The RTL for a function can have several slots available: N is
28337 which slot to use. */
28338
28339 rtx
28340 assign_386_stack_local (machine_mode mode, enum ix86_stack_slot n)
28341 {
28342 struct stack_local_entry *s;
28343
28344 gcc_assert (n < MAX_386_STACK_LOCALS);
28345
28346 for (s = ix86_stack_locals; s; s = s->next)
28347 if (s->mode == mode && s->n == n)
28348 return validize_mem (copy_rtx (s->rtl));
28349
28350 s = ggc_alloc<stack_local_entry> ();
28351 s->n = n;
28352 s->mode = mode;
28353 s->rtl = assign_stack_local (mode, GET_MODE_SIZE (mode), 0);
28354
28355 s->next = ix86_stack_locals;
28356 ix86_stack_locals = s;
28357 return validize_mem (copy_rtx (s->rtl));
28358 }
28359
28360 static void
28361 ix86_instantiate_decls (void)
28362 {
28363 struct stack_local_entry *s;
28364
28365 for (s = ix86_stack_locals; s; s = s->next)
28366 if (s->rtl != NULL_RTX)
28367 instantiate_decl_rtl (s->rtl);
28368 }
28369 \f
28370 /* Return the number used for encoding REG, in the range 0..7. */
28371
28372 static int
28373 reg_encoded_number (rtx reg)
28374 {
28375 unsigned regno = REGNO (reg);
28376 switch (regno)
28377 {
28378 case AX_REG:
28379 return 0;
28380 case CX_REG:
28381 return 1;
28382 case DX_REG:
28383 return 2;
28384 case BX_REG:
28385 return 3;
28386 case SP_REG:
28387 return 4;
28388 case BP_REG:
28389 return 5;
28390 case SI_REG:
28391 return 6;
28392 case DI_REG:
28393 return 7;
28394 default:
28395 break;
28396 }
28397 if (IN_RANGE (regno, FIRST_STACK_REG, LAST_STACK_REG))
28398 return regno - FIRST_STACK_REG;
28399 if (IN_RANGE (regno, FIRST_SSE_REG, LAST_SSE_REG))
28400 return regno - FIRST_SSE_REG;
28401 if (IN_RANGE (regno, FIRST_MMX_REG, LAST_MMX_REG))
28402 return regno - FIRST_MMX_REG;
28403 if (IN_RANGE (regno, FIRST_REX_SSE_REG, LAST_REX_SSE_REG))
28404 return regno - FIRST_REX_SSE_REG;
28405 if (IN_RANGE (regno, FIRST_REX_INT_REG, LAST_REX_INT_REG))
28406 return regno - FIRST_REX_INT_REG;
28407 if (IN_RANGE (regno, FIRST_MASK_REG, LAST_MASK_REG))
28408 return regno - FIRST_MASK_REG;
28409 if (IN_RANGE (regno, FIRST_BND_REG, LAST_BND_REG))
28410 return regno - FIRST_BND_REG;
28411 return -1;
28412 }
28413
28414 /* Given an insn INSN with NOPERANDS OPERANDS, return the modr/m byte used
28415 in its encoding if it could be relevant for ROP mitigation, otherwise
28416 return -1. If POPNO0 and POPNO1 are nonnull, store the operand numbers
28417 used for calculating it into them. */
28418
28419 static int
28420 ix86_get_modrm_for_rop (rtx_insn *insn, rtx *operands, int noperands,
28421 int *popno0 = 0, int *popno1 = 0)
28422 {
28423 if (asm_noperands (PATTERN (insn)) >= 0)
28424 return -1;
28425 int has_modrm = get_attr_modrm (insn);
28426 if (!has_modrm)
28427 return -1;
28428 enum attr_modrm_class cls = get_attr_modrm_class (insn);
28429 rtx op0, op1;
28430 switch (cls)
28431 {
28432 case MODRM_CLASS_OP02:
28433 gcc_assert (noperands >= 3);
28434 if (popno0)
28435 {
28436 *popno0 = 0;
28437 *popno1 = 2;
28438 }
28439 op0 = operands[0];
28440 op1 = operands[2];
28441 break;
28442 case MODRM_CLASS_OP01:
28443 gcc_assert (noperands >= 2);
28444 if (popno0)
28445 {
28446 *popno0 = 0;
28447 *popno1 = 1;
28448 }
28449 op0 = operands[0];
28450 op1 = operands[1];
28451 break;
28452 default:
28453 return -1;
28454 }
28455 if (REG_P (op0) && REG_P (op1))
28456 {
28457 int enc0 = reg_encoded_number (op0);
28458 int enc1 = reg_encoded_number (op1);
28459 return 0xc0 + (enc1 << 3) + enc0;
28460 }
28461 return -1;
28462 }
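/* E.g. a register-register insn with operands[0] == %ecx and
   operands[1] == %edx in class OP01 yields 0xc0 + (2 << 3) + 1 == 0xd1,
   the modrm byte with mod == 11, reg == edx, r/m == ecx.  */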
28463
28464 /* Check whether x86 address PARTS is a pc-relative address. */
28465
28466 static bool
28467 rip_relative_addr_p (struct ix86_address *parts)
28468 {
28469 rtx base, index, disp;
28470
28471 base = parts->base;
28472 index = parts->index;
28473 disp = parts->disp;
28474
28475 if (disp && !base && !index)
28476 {
28477 if (TARGET_64BIT)
28478 {
28479 rtx symbol = disp;
28480
28481 if (GET_CODE (disp) == CONST)
28482 symbol = XEXP (disp, 0);
28483 if (GET_CODE (symbol) == PLUS
28484 && CONST_INT_P (XEXP (symbol, 1)))
28485 symbol = XEXP (symbol, 0);
28486
28487 if (GET_CODE (symbol) == LABEL_REF
28488 || (GET_CODE (symbol) == SYMBOL_REF
28489 && SYMBOL_REF_TLS_MODEL (symbol) == 0)
28490 || (GET_CODE (symbol) == UNSPEC
28491 && (XINT (symbol, 1) == UNSPEC_GOTPCREL
28492 || XINT (symbol, 1) == UNSPEC_PCREL
28493 || XINT (symbol, 1) == UNSPEC_GOTNTPOFF)))
28494 return true;
28495 }
28496 }
28497 return false;
28498 }
28499
28500 /* Calculate the length of the memory address in the instruction encoding.
28501 This includes the addr32 prefix but does not include the one-byte modrm,
28502 opcode, or other prefixes. We never generate an addr32 prefix for an LEA insn. */
28503
28504 int
28505 memory_address_length (rtx addr, bool lea)
28506 {
28507 struct ix86_address parts;
28508 rtx base, index, disp;
28509 int len;
28510 int ok;
28511
28512 if (GET_CODE (addr) == PRE_DEC
28513 || GET_CODE (addr) == POST_INC
28514 || GET_CODE (addr) == PRE_MODIFY
28515 || GET_CODE (addr) == POST_MODIFY)
28516 return 0;
28517
28518 ok = ix86_decompose_address (addr, &parts);
28519 gcc_assert (ok);
28520
28521 len = (parts.seg == ADDR_SPACE_GENERIC) ? 0 : 1;
28522
28523 /* If this is not LEA instruction, add the length of addr32 prefix. */
28524 if (TARGET_64BIT && !lea
28525 && (SImode_address_operand (addr, VOIDmode)
28526 || (parts.base && GET_MODE (parts.base) == SImode)
28527 || (parts.index && GET_MODE (parts.index) == SImode)))
28528 len++;
28529
28530 base = parts.base;
28531 index = parts.index;
28532 disp = parts.disp;
28533
28534 if (base && SUBREG_P (base))
28535 base = SUBREG_REG (base);
28536 if (index && SUBREG_P (index))
28537 index = SUBREG_REG (index);
28538
28539 gcc_assert (base == NULL_RTX || REG_P (base));
28540 gcc_assert (index == NULL_RTX || REG_P (index));
28541
28542 /* Rule of thumb:
28543 - esp as the base always wants an index,
28544 - ebp as the base always wants a displacement,
28545 - r12 as the base always wants an index,
28546 - r13 as the base always wants a displacement. */
28547
28548 /* Register Indirect. */
28549 if (base && !index && !disp)
28550 {
28551 /* esp (for its index) and ebp (for its displacement) need
28552 the two-byte modrm form. Similarly for r12 and r13 in 64-bit
28553 code. */
28554 if (base == arg_pointer_rtx
28555 || base == frame_pointer_rtx
28556 || REGNO (base) == SP_REG
28557 || REGNO (base) == BP_REG
28558 || REGNO (base) == R12_REG
28559 || REGNO (base) == R13_REG)
28560 len++;
28561 }
28562
28563 /* Direct Addressing. In 64-bit mode mod 00 r/m 5
28564 is not disp32, but disp32(%rip), so for disp32
28565 SIB byte is needed, unless print_operand_address
28566 optimizes it into disp32(%rip) or (%rip) is implied
28567 by UNSPEC. */
28568 else if (disp && !base && !index)
28569 {
28570 len += 4;
28571 if (TARGET_64BIT && !rip_relative_addr_p (&parts))
28572 len++;
28573 }
28574 else
28575 {
28576 /* Find the length of the displacement constant. */
28577 if (disp)
28578 {
28579 if (base && satisfies_constraint_K (disp))
28580 len += 1;
28581 else
28582 len += 4;
28583 }
28584 /* ebp always wants a displacement. Similarly r13. */
28585 else if (base && (REGNO (base) == BP_REG || REGNO (base) == R13_REG))
28586 len++;
28587
28588 /* An index requires the two-byte modrm form.... */
28589 if (index
28590 /* ...like esp (or r12), which always wants an index. */
28591 || base == arg_pointer_rtx
28592 || base == frame_pointer_rtx
28593 || (base && (REGNO (base) == SP_REG || REGNO (base) == R12_REG)))
28594 len++;
28595 }
28596
28597 return len;
28598 }
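/* A couple of examples of the accounting above (the modrm and opcode bytes
   are not counted here): a plain (%rax) address has length 0; 16(%rsp)
   needs a SIB byte for %rsp plus a disp8, so 2; a symbol(%rip) reference
   is just the 4-byte displacement, while a 64-bit absolute disp32 needs an
   extra SIB byte, so 5.  */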
28599
28600 /* Compute the default value for the "length_immediate" attribute. When
28601 SHORTFORM is set, expect that the insn has an 8-bit immediate alternative. */
28602 int
28603 ix86_attr_length_immediate_default (rtx_insn *insn, bool shortform)
28604 {
28605 int len = 0;
28606 int i;
28607 extract_insn_cached (insn);
28608 for (i = recog_data.n_operands - 1; i >= 0; --i)
28609 if (CONSTANT_P (recog_data.operand[i]))
28610 {
28611 enum attr_mode mode = get_attr_mode (insn);
28612
28613 gcc_assert (!len);
28614 if (shortform && CONST_INT_P (recog_data.operand[i]))
28615 {
28616 HOST_WIDE_INT ival = INTVAL (recog_data.operand[i]);
28617 switch (mode)
28618 {
28619 case MODE_QI:
28620 len = 1;
28621 continue;
28622 case MODE_HI:
28623 ival = trunc_int_for_mode (ival, HImode);
28624 break;
28625 case MODE_SI:
28626 ival = trunc_int_for_mode (ival, SImode);
28627 break;
28628 default:
28629 break;
28630 }
28631 if (IN_RANGE (ival, -128, 127))
28632 {
28633 len = 1;
28634 continue;
28635 }
28636 }
28637 switch (mode)
28638 {
28639 case MODE_QI:
28640 len = 1;
28641 break;
28642 case MODE_HI:
28643 len = 2;
28644 break;
28645 case MODE_SI:
28646 len = 4;
28647 break;
28648 /* Immediates for DImode instructions are encoded
28649 as 32-bit sign-extended values. */
28650 case MODE_DI:
28651 len = 4;
28652 break;
28653 default:
28654 fatal_insn ("unknown insn mode", insn);
28655 }
28656 }
28657 return len;
28658 }
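/* For instance, "add $5, %eax" has an imm8 alternative, so with SHORTFORM
   the computed immediate length is 1; "add $1000, %eax" does not fit in
   -128..127 and gets the full 4-byte immediate.  */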
28659
28660 /* Compute default value for "length_address" attribute. */
28661 int
28662 ix86_attr_length_address_default (rtx_insn *insn)
28663 {
28664 int i;
28665
28666 if (get_attr_type (insn) == TYPE_LEA)
28667 {
28668 rtx set = PATTERN (insn), addr;
28669
28670 if (GET_CODE (set) == PARALLEL)
28671 set = XVECEXP (set, 0, 0);
28672
28673 gcc_assert (GET_CODE (set) == SET);
28674
28675 addr = SET_SRC (set);
28676
28677 return memory_address_length (addr, true);
28678 }
28679
28680 extract_insn_cached (insn);
28681 for (i = recog_data.n_operands - 1; i >= 0; --i)
28682 {
28683 rtx op = recog_data.operand[i];
28684 if (MEM_P (op))
28685 {
28686 constrain_operands_cached (insn, reload_completed);
28687 if (which_alternative != -1)
28688 {
28689 const char *constraints = recog_data.constraints[i];
28690 int alt = which_alternative;
28691
28692 while (*constraints == '=' || *constraints == '+')
28693 constraints++;
28694 while (alt-- > 0)
28695 while (*constraints++ != ',')
28696 ;
28697 /* Skip ignored operands. */
28698 if (*constraints == 'X')
28699 continue;
28700 }
28701
28702 int len = memory_address_length (XEXP (op, 0), false);
28703
28704 /* Account for segment prefix for non-default addr spaces. */
28705 if (!ADDR_SPACE_GENERIC_P (MEM_ADDR_SPACE (op)))
28706 len++;
28707
28708 return len;
28709 }
28710 }
28711 return 0;
28712 }
28713
28714 /* Compute the default value for the "length_vex" attribute.  It includes
28715 the 2- or 3-byte VEX prefix and 1 opcode byte. */
28716
28717 int
28718 ix86_attr_length_vex_default (rtx_insn *insn, bool has_0f_opcode,
28719 bool has_vex_w)
28720 {
28721 int i;
28722
28723 /* Only the 0f opcode map can use the 2-byte VEX prefix; the VEX.W bit
28724 requires the 3-byte VEX prefix. */
28725 if (!has_0f_opcode || has_vex_w)
28726 return 3 + 1;
28727
28728 /* We can always use the 2-byte VEX prefix in 32-bit mode. */
28729 if (!TARGET_64BIT)
28730 return 2 + 1;
28731
28732 extract_insn_cached (insn);
28733
28734 for (i = recog_data.n_operands - 1; i >= 0; --i)
28735 if (REG_P (recog_data.operand[i]))
28736 {
28737 /* REX.W bit uses 3 byte VEX prefix. */
28738 if (GET_MODE (recog_data.operand[i]) == DImode
28739 && GENERAL_REG_P (recog_data.operand[i]))
28740 return 3 + 1;
28741 }
28742 else
28743 {
28744 /* REX.X or REX.B bits use 3 byte VEX prefix. */
28745 if (MEM_P (recog_data.operand[i])
28746 && x86_extended_reg_mentioned_p (recog_data.operand[i]))
28747 return 3 + 1;
28748 }
28749
28750 return 2 + 1;
28751 }
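
/* Rough examples of the resulting prefix + opcode lengths (approximate;
   the exact form depends on the operands):

       vaddps %xmm1, %xmm2, %xmm3     -> 2-byte VEX + 1 opcode byte = 3
       vaddps %xmm9, %xmm2, %xmm3     -> still 3 (REX.R fits in the 2-byte VEX)
       vaddps (%r8), %xmm2, %xmm3     -> 3-byte VEX (REX.B needed)   = 4
       vcvtsi2sd %rax, %xmm1, %xmm2   -> 3-byte VEX (VEX.W / 64-bit GPR) = 4.  */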
28752 \f
28753 /* Return the maximum number of instructions a cpu can issue. */
28754
28755 static int
28756 ix86_issue_rate (void)
28757 {
28758 switch (ix86_tune)
28759 {
28760 case PROCESSOR_PENTIUM:
28761 case PROCESSOR_LAKEMONT:
28762 case PROCESSOR_BONNELL:
28763 case PROCESSOR_SILVERMONT:
28764 case PROCESSOR_KNL:
28765 case PROCESSOR_INTEL:
28766 case PROCESSOR_K6:
28767 case PROCESSOR_BTVER2:
28768 case PROCESSOR_PENTIUM4:
28769 case PROCESSOR_NOCONA:
28770 return 2;
28771
28772 case PROCESSOR_PENTIUMPRO:
28773 case PROCESSOR_ATHLON:
28774 case PROCESSOR_K8:
28775 case PROCESSOR_AMDFAM10:
28776 case PROCESSOR_GENERIC:
28777 case PROCESSOR_BTVER1:
28778 return 3;
28779
28780 case PROCESSOR_BDVER1:
28781 case PROCESSOR_BDVER2:
28782 case PROCESSOR_BDVER3:
28783 case PROCESSOR_BDVER4:
28784 case PROCESSOR_ZNVER1:
28785 case PROCESSOR_CORE2:
28786 case PROCESSOR_NEHALEM:
28787 case PROCESSOR_SANDYBRIDGE:
28788 case PROCESSOR_HASWELL:
28789 return 4;
28790
28791 default:
28792 return 1;
28793 }
28794 }
28795
28796 /* A subroutine of ix86_adjust_cost -- return TRUE iff INSN reads flags set
28797 by DEP_INSN and nothing else set by DEP_INSN. */
28798
28799 static bool
28800 ix86_flags_dependent (rtx_insn *insn, rtx_insn *dep_insn, enum attr_type insn_type)
28801 {
28802 rtx set, set2;
28803
28804 /* Simplify the test for uninteresting insns. */
28805 if (insn_type != TYPE_SETCC
28806 && insn_type != TYPE_ICMOV
28807 && insn_type != TYPE_FCMOV
28808 && insn_type != TYPE_IBR)
28809 return false;
28810
28811 if ((set = single_set (dep_insn)) != 0)
28812 {
28813 set = SET_DEST (set);
28814 set2 = NULL_RTX;
28815 }
28816 else if (GET_CODE (PATTERN (dep_insn)) == PARALLEL
28817 && XVECLEN (PATTERN (dep_insn), 0) == 2
28818 && GET_CODE (XVECEXP (PATTERN (dep_insn), 0, 0)) == SET
28819 && GET_CODE (XVECEXP (PATTERN (dep_insn), 0, 1)) == SET)
28820 {
28821 set = SET_DEST (XVECEXP (PATTERN (dep_insn), 0, 0));
28822 set2 = SET_DEST (XVECEXP (PATTERN (dep_insn), 0, 1));
28823 }
28824 else
28825 return false;
28826
28827 if (!REG_P (set) || REGNO (set) != FLAGS_REG)
28828 return false;
28829
28830 /* This test is true if the dependent insn reads the flags but
28831 not any other potentially set register. */
28832 if (!reg_overlap_mentioned_p (set, PATTERN (insn)))
28833 return false;
28834
28835 if (set2 && reg_overlap_mentioned_p (set2, PATTERN (insn)))
28836 return false;
28837
28838 return true;
28839 }
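
/* A sketch of what the predicate above accepts:

       (1) cmpl %esi, %edi   ; DEP_INSN sets only the flags
       (2) sete %al          ; INSN reads the flags and nothing else from (1)

   is a flags-only dependence (on Pentium the cost is then treated as 0,
   since compares pair with setcc/jcc).  If DEP_INSN also sets a register
   that INSN reads - e.g. a double-set subtract feeding a cmov that uses
   both the flags and the subtract's result - the predicate returns
   false.  */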
28840
28841 /* Return true iff USE_INSN has a memory address with operands set by
28842 SET_INSN. */
28843
28844 bool
28845 ix86_agi_dependent (rtx_insn *set_insn, rtx_insn *use_insn)
28846 {
28847 int i;
28848 extract_insn_cached (use_insn);
28849 for (i = recog_data.n_operands - 1; i >= 0; --i)
28850 if (MEM_P (recog_data.operand[i]))
28851 {
28852 rtx addr = XEXP (recog_data.operand[i], 0);
28853 return modified_in_p (addr, set_insn) != 0;
28854 }
28855 return false;
28856 }
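
/* Example of the address-generation dependence tested above (a sketch):

       (1) addl $4, %ebx           ; SET_INSN writes %ebx
       (2) movl (%ebx,%ecx), %eax  ; USE_INSN's memory address uses %ebx

   The load address of (2) is modified by (1), so the function returns true;
   on in-order cores such as the original Pentium this is the classic AGI
   stall.  If (2) were "movl (%edx), %eax" instead, it would return false.  */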
28857
28858 /* Helper function for exact_store_load_dependency.
28859 Return true if addr is found in insn. */
28860 static bool
28861 exact_dependency_1 (rtx addr, rtx insn)
28862 {
28863 enum rtx_code code;
28864 const char *format_ptr;
28865 int i, j;
28866
28867 code = GET_CODE (insn);
28868 switch (code)
28869 {
28870 case MEM:
28871 if (rtx_equal_p (addr, insn))
28872 return true;
28873 break;
28874 case REG:
28875 CASE_CONST_ANY:
28876 case SYMBOL_REF:
28877 case CODE_LABEL:
28878 case PC:
28879 case CC0:
28880 case EXPR_LIST:
28881 return false;
28882 default:
28883 break;
28884 }
28885
28886 format_ptr = GET_RTX_FORMAT (code);
28887 for (i = 0; i < GET_RTX_LENGTH (code); i++)
28888 {
28889 switch (*format_ptr++)
28890 {
28891 case 'e':
28892 if (exact_dependency_1 (addr, XEXP (insn, i)))
28893 return true;
28894 break;
28895 case 'E':
28896 for (j = 0; j < XVECLEN (insn, i); j++)
28897 if (exact_dependency_1 (addr, XVECEXP (insn, i, j)))
28898 return true;
28899 break;
28900 }
28901 }
28902 return false;
28903 }
28904
28905 /* Return true if there exists an exact dependency between the store and the
28906 load, i.e. the same memory address is used in both. */
28907 static bool
28908 exact_store_load_dependency (rtx_insn *store, rtx_insn *load)
28909 {
28910 rtx set1, set2;
28911
28912 set1 = single_set (store);
28913 if (!set1)
28914 return false;
28915 if (!MEM_P (SET_DEST (set1)))
28916 return false;
28917 set2 = single_set (load);
28918 if (!set2)
28919 return false;
28920 if (exact_dependency_1 (SET_DEST (set1), SET_SRC (set2)))
28921 return true;
28922 return false;
28923 }
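
/* Illustration (approximate): the pair

       (1) movw %ax, 6(%esp)    ; narrow store
       (2) movl 6(%esp), %ebx   ; load whose address rtx equals the store's

   has an exact store/load dependency, which ix86_adjust_cost uses on
   Silvermont-class cores to raise the cost and so avoid a store-forwarding
   stall.  A load from a different (or differently expressed) address does
   not match, since the test is rtx_equal_p on the address.  */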
28924
28925 static int
28926 ix86_adjust_cost (rtx_insn *insn, int dep_type, rtx_insn *dep_insn, int cost,
28927 unsigned int)
28928 {
28929 enum attr_type insn_type, dep_insn_type;
28930 enum attr_memory memory;
28931 rtx set, set2;
28932 int dep_insn_code_number;
28933
28934 /* Anti and output dependencies have zero cost on all CPUs. */
28935 if (dep_type != 0)
28936 return 0;
28937
28938 dep_insn_code_number = recog_memoized (dep_insn);
28939
28940 /* If we can't recognize the insns, we can't really do anything. */
28941 if (dep_insn_code_number < 0 || recog_memoized (insn) < 0)
28942 return cost;
28943
28944 insn_type = get_attr_type (insn);
28945 dep_insn_type = get_attr_type (dep_insn);
28946
28947 switch (ix86_tune)
28948 {
28949 case PROCESSOR_PENTIUM:
28950 case PROCESSOR_LAKEMONT:
28951 /* Address Generation Interlock adds a cycle of latency. */
28952 if (insn_type == TYPE_LEA)
28953 {
28954 rtx addr = PATTERN (insn);
28955
28956 if (GET_CODE (addr) == PARALLEL)
28957 addr = XVECEXP (addr, 0, 0);
28958
28959 gcc_assert (GET_CODE (addr) == SET);
28960
28961 addr = SET_SRC (addr);
28962 if (modified_in_p (addr, dep_insn))
28963 cost += 1;
28964 }
28965 else if (ix86_agi_dependent (dep_insn, insn))
28966 cost += 1;
28967
28968 /* ??? Compares pair with jump/setcc. */
28969 if (ix86_flags_dependent (insn, dep_insn, insn_type))
28970 cost = 0;
28971
28972 /* Floating point stores require the value to be ready one cycle earlier. */
28973 if (insn_type == TYPE_FMOV
28974 && get_attr_memory (insn) == MEMORY_STORE
28975 && !ix86_agi_dependent (dep_insn, insn))
28976 cost += 1;
28977 break;
28978
28979 case PROCESSOR_PENTIUMPRO:
28980 /* INT->FP conversion is expensive. */
28981 if (get_attr_fp_int_src (dep_insn))
28982 cost += 5;
28983
28984 /* There is one cycle extra latency between an FP op and a store. */
28985 if (insn_type == TYPE_FMOV
28986 && (set = single_set (dep_insn)) != NULL_RTX
28987 && (set2 = single_set (insn)) != NULL_RTX
28988 && rtx_equal_p (SET_DEST (set), SET_SRC (set2))
28989 && MEM_P (SET_DEST (set2)))
28990 cost += 1;
28991
28992 memory = get_attr_memory (insn);
28993
28994 /* Model the ability of the reorder buffer to hide the latency of a load by
28995 executing it in parallel with the previous instruction, provided the
28996 previous instruction is not needed to compute the address. */
28997 if ((memory == MEMORY_LOAD || memory == MEMORY_BOTH)
28998 && !ix86_agi_dependent (dep_insn, insn))
28999 {
29000 /* Claim that moves take one cycle, as the core can issue one load
29001 at a time and the next load can start a cycle later. */
29002 if (dep_insn_type == TYPE_IMOV
29003 || dep_insn_type == TYPE_FMOV)
29004 cost = 1;
29005 else if (cost > 1)
29006 cost--;
29007 }
29008 break;
29009
29010 case PROCESSOR_K6:
29011 /* The esp dependency is resolved before
29012 the instruction is really finished. */
29013 if ((insn_type == TYPE_PUSH || insn_type == TYPE_POP)
29014 && (dep_insn_type == TYPE_PUSH || dep_insn_type == TYPE_POP))
29015 return 1;
29016
29017 /* INT->FP conversion is expensive. */
29018 if (get_attr_fp_int_src (dep_insn))
29019 cost += 5;
29020
29021 memory = get_attr_memory (insn);
29022
29023 /* Model the ability of the reorder buffer to hide the latency of a load by
29024 executing it in parallel with the previous instruction, provided the
29025 previous instruction is not needed to compute the address. */
29026 if ((memory == MEMORY_LOAD || memory == MEMORY_BOTH)
29027 && !ix86_agi_dependent (dep_insn, insn))
29028 {
29029 /* Claim that moves take one cycle, as the core can issue one load
29030 at a time and the next load can start a cycle later. */
29031 if (dep_insn_type == TYPE_IMOV
29032 || dep_insn_type == TYPE_FMOV)
29033 cost = 1;
29034 else if (cost > 2)
29035 cost -= 2;
29036 else
29037 cost = 1;
29038 }
29039 break;
29040
29041 case PROCESSOR_AMDFAM10:
29042 case PROCESSOR_BDVER1:
29043 case PROCESSOR_BDVER2:
29044 case PROCESSOR_BDVER3:
29045 case PROCESSOR_BDVER4:
29046 case PROCESSOR_ZNVER1:
29047 case PROCESSOR_BTVER1:
29048 case PROCESSOR_BTVER2:
29049 case PROCESSOR_GENERIC:
29050 /* The stack engine allows push and pop instructions to execute in parallel. */
29051 if ((insn_type == TYPE_PUSH || insn_type == TYPE_POP)
29052 && (dep_insn_type == TYPE_PUSH || dep_insn_type == TYPE_POP))
29053 return 0;
29054 /* FALLTHRU */
29055
29056 case PROCESSOR_ATHLON:
29057 case PROCESSOR_K8:
29058 memory = get_attr_memory (insn);
29059
29060 /* Model the ability of the reorder buffer to hide the latency of a load by
29061 executing it in parallel with the previous instruction, provided the
29062 previous instruction is not needed to compute the address. */
29063 if ((memory == MEMORY_LOAD || memory == MEMORY_BOTH)
29064 && !ix86_agi_dependent (dep_insn, insn))
29065 {
29066 enum attr_unit unit = get_attr_unit (insn);
29067 int loadcost = 3;
29068
29069 /* Because of the difference between the length of integer and
29070 floating unit pipeline preparation stages, the memory operands
29071 for floating point are cheaper.
29072
29073 ??? For Athlon the difference is most probably 2. */
29074 if (unit == UNIT_INTEGER || unit == UNIT_UNKNOWN)
29075 loadcost = 3;
29076 else
29077 loadcost = TARGET_ATHLON ? 2 : 0;
29078
29079 if (cost >= loadcost)
29080 cost -= loadcost;
29081 else
29082 cost = 0;
29083 }
29084 break;
29085
29086 case PROCESSOR_CORE2:
29087 case PROCESSOR_NEHALEM:
29088 case PROCESSOR_SANDYBRIDGE:
29089 case PROCESSOR_HASWELL:
29090 /* The stack engine allows push and pop instructions to execute in parallel. */
29091 if ((insn_type == TYPE_PUSH || insn_type == TYPE_POP)
29092 && (dep_insn_type == TYPE_PUSH || dep_insn_type == TYPE_POP))
29093 return 0;
29094
29095 memory = get_attr_memory (insn);
29096
29097 /* Model the ability of the reorder buffer to hide the latency of a load by
29098 executing it in parallel with the previous instruction, provided the
29099 previous instruction is not needed to compute the address. */
29100 if ((memory == MEMORY_LOAD || memory == MEMORY_BOTH)
29101 && !ix86_agi_dependent (dep_insn, insn))
29102 {
29103 if (cost >= 4)
29104 cost -= 4;
29105 else
29106 cost = 0;
29107 }
29108 break;
29109
29110 case PROCESSOR_SILVERMONT:
29111 case PROCESSOR_KNL:
29112 case PROCESSOR_INTEL:
29113 if (!reload_completed)
29114 return cost;
29115
29116 /* Increase cost of integer loads. */
29117 memory = get_attr_memory (dep_insn);
29118 if (memory == MEMORY_LOAD || memory == MEMORY_BOTH)
29119 {
29120 enum attr_unit unit = get_attr_unit (dep_insn);
29121 if (unit == UNIT_INTEGER && cost == 1)
29122 {
29123 if (memory == MEMORY_LOAD)
29124 cost = 3;
29125 else
29126 {
29127 /* Increase cost of ld/st for short int types only
29128 because of store forwarding issue. */
29129 rtx set = single_set (dep_insn);
29130 if (set && (GET_MODE (SET_DEST (set)) == QImode
29131 || GET_MODE (SET_DEST (set)) == HImode))
29132 {
29133 /* Increase the cost of the store/load insn if an exact
29134 dependence exists and it is a load insn. */
29135 enum attr_memory insn_memory = get_attr_memory (insn);
29136 if (insn_memory == MEMORY_LOAD
29137 && exact_store_load_dependency (dep_insn, insn))
29138 cost = 3;
29139 }
29140 }
29141 }
29142 }
29143
29144 default:
29145 break;
29146 }
29147
29148 return cost;
29149 }
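
/* A rough illustration of the adjustment above (numbers are approximate
   and the final latencies come from the DFA descriptions).  On a
   Core-class CPU, given

       (1) imull %esi, %edi           ; producer of %edi
       (2) addl  (%rax,%rbx,4), %edi  ; consumer loads memory, adds into %edi

   the dependence (1)->(2) is through %edi, but the load address of (2)
   does not use %edi, so ix86_agi_dependent is false and the cost is
   reduced by up to 4 (floored at 0): the load part of (2) can start while
   the multiply is still in flight.  A push depending on a pop (or vice
   versa) gets cost 0 thanks to the stack engine.  */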
29150
29151 /* How many alternative schedules to try. This should be as wide as the
29152 scheduling freedom in the DFA, but no wider. Making this value too
29153 large results in extra work for the scheduler. */
29154
29155 static int
29156 ia32_multipass_dfa_lookahead (void)
29157 {
29158 switch (ix86_tune)
29159 {
29160 case PROCESSOR_PENTIUM:
29161 case PROCESSOR_LAKEMONT:
29162 return 2;
29163
29164 case PROCESSOR_PENTIUMPRO:
29165 case PROCESSOR_K6:
29166 return 1;
29167
29168 case PROCESSOR_BDVER1:
29169 case PROCESSOR_BDVER2:
29170 case PROCESSOR_BDVER3:
29171 case PROCESSOR_BDVER4:
29172 /* We use lookahead value 4 for BD for both the pre- and post-reload
29173 schedulers.  The plan is to include value 8 for -O3. */
29174 return 4;
29175
29176 case PROCESSOR_CORE2:
29177 case PROCESSOR_NEHALEM:
29178 case PROCESSOR_SANDYBRIDGE:
29179 case PROCESSOR_HASWELL:
29180 case PROCESSOR_BONNELL:
29181 case PROCESSOR_SILVERMONT:
29182 case PROCESSOR_KNL:
29183 case PROCESSOR_INTEL:
29184 /* Generally, we want haifa-sched:max_issue() to look ahead as far
29185 as the number of instructions that can be executed in one cycle, i.e.,
29186 issue_rate.  I wonder why tuning for many CPUs does not do this. */
29187 if (reload_completed)
29188 return ix86_issue_rate ();
29189 /* Don't use lookahead for pre-reload schedule to save compile time. */
29190 return 0;
29191
29192 default:
29193 return 0;
29194 }
29195 }
29196
29197 /* Return true if target platform supports macro-fusion. */
29198
29199 static bool
29200 ix86_macro_fusion_p ()
29201 {
29202 return TARGET_FUSE_CMP_AND_BRANCH;
29203 }
29204
29205 /* Check whether the current microarchitecture supports macro fusion
29206 for the insn pair "CONDGEN + CONDJMP".  Refer to the
29207 "Intel Architectures Optimization Reference Manual". */
29208
29209 static bool
29210 ix86_macro_fusion_pair_p (rtx_insn *condgen, rtx_insn *condjmp)
29211 {
29212 rtx src, dest;
29213 enum rtx_code ccode;
29214 rtx compare_set = NULL_RTX, test_if, cond;
29215 rtx alu_set = NULL_RTX, addr = NULL_RTX;
29216
29217 if (!any_condjump_p (condjmp))
29218 return false;
29219
29220 if (get_attr_type (condgen) != TYPE_TEST
29221 && get_attr_type (condgen) != TYPE_ICMP
29222 && get_attr_type (condgen) != TYPE_INCDEC
29223 && get_attr_type (condgen) != TYPE_ALU)
29224 return false;
29225
29226 compare_set = single_set (condgen);
29227 if (compare_set == NULL_RTX
29228 && !TARGET_FUSE_ALU_AND_BRANCH)
29229 return false;
29230
29231 if (compare_set == NULL_RTX)
29232 {
29233 int i;
29234 rtx pat = PATTERN (condgen);
29235 for (i = 0; i < XVECLEN (pat, 0); i++)
29236 if (GET_CODE (XVECEXP (pat, 0, i)) == SET)
29237 {
29238 rtx set_src = SET_SRC (XVECEXP (pat, 0, i));
29239 if (GET_CODE (set_src) == COMPARE)
29240 compare_set = XVECEXP (pat, 0, i);
29241 else
29242 alu_set = XVECEXP (pat, 0, i);
29243 }
29244 }
29245 if (compare_set == NULL_RTX)
29246 return false;
29247 src = SET_SRC (compare_set);
29248 if (GET_CODE (src) != COMPARE)
29249 return false;
29250
29251 /* Macro-fusion for cmp/test MEM-IMM + conditional jmp is not
29252 supported. */
29253 if ((MEM_P (XEXP (src, 0))
29254 && CONST_INT_P (XEXP (src, 1)))
29255 || (MEM_P (XEXP (src, 1))
29256 && CONST_INT_P (XEXP (src, 0))))
29257 return false;
29258
29259 /* No fusion for RIP-relative address. */
29260 if (MEM_P (XEXP (src, 0)))
29261 addr = XEXP (XEXP (src, 0), 0);
29262 else if (MEM_P (XEXP (src, 1)))
29263 addr = XEXP (XEXP (src, 1), 0);
29264
29265 if (addr) {
29266 ix86_address parts;
29267 int ok = ix86_decompose_address (addr, &parts);
29268 gcc_assert (ok);
29269
29270 if (rip_relative_addr_p (&parts))
29271 return false;
29272 }
29273
29274 test_if = SET_SRC (pc_set (condjmp));
29275 cond = XEXP (test_if, 0);
29276 ccode = GET_CODE (cond);
29277 /* Check whether the conditional jump uses the Sign or Overflow flags. */
29278 if (!TARGET_FUSE_CMP_AND_BRANCH_SOFLAGS
29279 && (ccode == GE
29280 || ccode == GT
29281 || ccode == LE
29282 || ccode == LT))
29283 return false;
29284
29285 /* Return true for TYPE_TEST and TYPE_ICMP. */
29286 if (get_attr_type (condgen) == TYPE_TEST
29287 || get_attr_type (condgen) == TYPE_ICMP)
29288 return true;
29289
29290 /* What follows handles the macro-fusion case for alu + jmp. */
29291 if (!TARGET_FUSE_ALU_AND_BRANCH || !alu_set)
29292 return false;
29293
29294 /* No fusion for alu op with memory destination operand. */
29295 dest = SET_DEST (alu_set);
29296 if (MEM_P (dest))
29297 return false;
29298
29299 /* Macro-fusion for inc/dec + unsigned conditional jump is not
29300 supported. */
29301 if (get_attr_type (condgen) == TYPE_INCDEC
29302 && (ccode == GEU
29303 || ccode == GTU
29304 || ccode == LEU
29305 || ccode == LTU))
29306 return false;
29307
29308 return true;
29309 }
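
/* Rough examples of the test above (details vary by CPU generation; see
   the optimization manual):

       cmpl  %esi, %edi     followed by jne .L2  -> fusible
       testl %eax, %eax     followed by je  .L3  -> fusible
       cmpl  $1, (%rdx)     followed by jne .L4  -> not fusible (MEM-IMM)
       cmpl  $0, foo(%rip)  followed by je  .L5  -> not fusible (RIP-relative)
       decl  %ecx           followed by jae .L6  -> not fusible (inc/dec with
                                                    an unsigned condition)
       addl  %esi, (%rdi)   followed by jne .L7  -> not fusible (alu op with a
                                                    memory destination).  */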
29310
29311 /* Try to reorder the ready list to take advantage of Atom pipelined IMUL
29312 execution.  It is applied if
29313 (1) an IMUL instruction is at the top of the list;
29314 (2) the ready list contains the sole producer of an independent IMUL
29315 instruction.
29316 Return the index of the IMUL producer if it was found and -1 otherwise. */
29317 static int
29318 do_reorder_for_imul (rtx_insn **ready, int n_ready)
29319 {
29320 rtx_insn *insn;
29321 rtx set, insn1, insn2;
29322 sd_iterator_def sd_it;
29323 dep_t dep;
29324 int index = -1;
29325 int i;
29326
29327 if (!TARGET_BONNELL)
29328 return index;
29329
29330 /* Check that IMUL instruction is on the top of ready list. */
29331 insn = ready[n_ready - 1];
29332 set = single_set (insn);
29333 if (!set)
29334 return index;
29335 if (!(GET_CODE (SET_SRC (set)) == MULT
29336 && GET_MODE (SET_SRC (set)) == SImode))
29337 return index;
29338
29339 /* Search for producer of independent IMUL instruction. */
29340 for (i = n_ready - 2; i >= 0; i--)
29341 {
29342 insn = ready[i];
29343 if (!NONDEBUG_INSN_P (insn))
29344 continue;
29345 /* Skip IMUL instruction. */
29346 insn2 = PATTERN (insn);
29347 if (GET_CODE (insn2) == PARALLEL)
29348 insn2 = XVECEXP (insn2, 0, 0);
29349 if (GET_CODE (insn2) == SET
29350 && GET_CODE (SET_SRC (insn2)) == MULT
29351 && GET_MODE (SET_SRC (insn2)) == SImode)
29352 continue;
29353
29354 FOR_EACH_DEP (insn, SD_LIST_FORW, sd_it, dep)
29355 {
29356 rtx con;
29357 con = DEP_CON (dep);
29358 if (!NONDEBUG_INSN_P (con))
29359 continue;
29360 insn1 = PATTERN (con);
29361 if (GET_CODE (insn1) == PARALLEL)
29362 insn1 = XVECEXP (insn1, 0, 0);
29363
29364 if (GET_CODE (insn1) == SET
29365 && GET_CODE (SET_SRC (insn1)) == MULT
29366 && GET_MODE (SET_SRC (insn1)) == SImode)
29367 {
29368 sd_iterator_def sd_it1;
29369 dep_t dep1;
29370 /* Check if there is no other dependee for IMUL. */
29371 index = i;
29372 FOR_EACH_DEP (con, SD_LIST_BACK, sd_it1, dep1)
29373 {
29374 rtx pro;
29375 pro = DEP_PRO (dep1);
29376 if (!NONDEBUG_INSN_P (pro))
29377 continue;
29378 if (pro != insn)
29379 index = -1;
29380 }
29381 if (index >= 0)
29382 break;
29383 }
29384 }
29385 if (index >= 0)
29386 break;
29387 }
29388 return index;
29389 }
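
/* Sketch of the reordering above.  Suppose the ready list (top element
   last) is

       ... , A: movl 4(%rsp), %ecx , ... , I1: imull %ebx, %eax

   and A is the only producer of another SImode multiply
   I2: imull %ecx, %edx that is not yet ready.  Moving A to the top lets I2
   become ready right after I1 issues, so the two multiplies can overlap in
   Atom's pipelined IMUL unit.  This function only returns A's index; the
   caller (ix86_sched_reorder) performs the actual move.  */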
29390
29391 /* Try to find the best candidate on the top of ready list if two insns
29392 have the same priority - candidate is best if its dependees were
29393 scheduled earlier. Applied for Silvermont only.
29394 Return true if top 2 insns must be interchanged. */
29395 static bool
29396 swap_top_of_ready_list (rtx_insn **ready, int n_ready)
29397 {
29398 rtx_insn *top = ready[n_ready - 1];
29399 rtx_insn *next = ready[n_ready - 2];
29400 rtx set;
29401 sd_iterator_def sd_it;
29402 dep_t dep;
29403 int clock1 = -1;
29404 int clock2 = -1;
29405 #define INSN_TICK(INSN) (HID (INSN)->tick)
29406
29407 if (!TARGET_SILVERMONT && !TARGET_INTEL)
29408 return false;
29409
29410 if (!NONDEBUG_INSN_P (top))
29411 return false;
29412 if (!NONJUMP_INSN_P (top))
29413 return false;
29414 if (!NONDEBUG_INSN_P (next))
29415 return false;
29416 if (!NONJUMP_INSN_P (next))
29417 return false;
29418 set = single_set (top);
29419 if (!set)
29420 return false;
29421 set = single_set (next);
29422 if (!set)
29423 return false;
29424
29425 if (INSN_PRIORITY_KNOWN (top) && INSN_PRIORITY_KNOWN (next))
29426 {
29427 if (INSN_PRIORITY (top) != INSN_PRIORITY (next))
29428 return false;
29429 /* Determine the winner more precisely. */
29430 FOR_EACH_DEP (top, SD_LIST_RES_BACK, sd_it, dep)
29431 {
29432 rtx pro;
29433 pro = DEP_PRO (dep);
29434 if (!NONDEBUG_INSN_P (pro))
29435 continue;
29436 if (INSN_TICK (pro) > clock1)
29437 clock1 = INSN_TICK (pro);
29438 }
29439 FOR_EACH_DEP (next, SD_LIST_RES_BACK, sd_it, dep)
29440 {
29441 rtx pro;
29442 pro = DEP_PRO (dep);
29443 if (!NONDEBUG_INSN_P (pro))
29444 continue;
29445 if (INSN_TICK (pro) > clock2)
29446 clock2 = INSN_TICK (pro);
29447 }
29448
29449 if (clock1 == clock2)
29450 {
29451 /* Determine the winner - the load must win. */
29452 enum attr_memory memory1, memory2;
29453 memory1 = get_attr_memory (top);
29454 memory2 = get_attr_memory (next);
29455 if (memory2 == MEMORY_LOAD && memory1 != MEMORY_LOAD)
29456 return true;
29457 }
29458 return (bool) (clock2 < clock1);
29459 }
29460 return false;
29461 #undef INSN_TICK
29462 }
29463
29464 /* Perform possible reordering of the ready list for Atom/Silvermont only.
29465 Return the issue rate. */
29466 static int
29467 ix86_sched_reorder (FILE *dump, int sched_verbose, rtx_insn **ready,
29468 int *pn_ready, int clock_var)
29469 {
29470 int issue_rate = -1;
29471 int n_ready = *pn_ready;
29472 int i;
29473 rtx_insn *insn;
29474 int index = -1;
29475
29476 /* Set up issue rate. */
29477 issue_rate = ix86_issue_rate ();
29478
29479 /* Do reordering for BONNELL/SILVERMONT only. */
29480 if (!TARGET_BONNELL && !TARGET_SILVERMONT && !TARGET_INTEL)
29481 return issue_rate;
29482
29483 /* Nothing to do if ready list contains only 1 instruction. */
29484 if (n_ready <= 1)
29485 return issue_rate;
29486
29487 /* Do reordering for the post-reload scheduler only. */
29488 if (!reload_completed)
29489 return issue_rate;
29490
29491 if ((index = do_reorder_for_imul (ready, n_ready)) >= 0)
29492 {
29493 if (sched_verbose > 1)
29494 fprintf (dump, ";;\tatom sched_reorder: put %d insn on top\n",
29495 INSN_UID (ready[index]));
29496
29497 /* Put IMUL producer (ready[index]) at the top of ready list. */
29498 insn = ready[index];
29499 for (i = index; i < n_ready - 1; i++)
29500 ready[i] = ready[i + 1];
29501 ready[n_ready - 1] = insn;
29502 return issue_rate;
29503 }
29504
29505 /* Skip selective scheduling since HID is not populated in it. */
29506 if (clock_var != 0
29507 && !sel_sched_p ()
29508 && swap_top_of_ready_list (ready, n_ready))
29509 {
29510 if (sched_verbose > 1)
29511 fprintf (dump, ";;\tslm sched_reorder: swap %d and %d insns\n",
29512 INSN_UID (ready[n_ready - 1]), INSN_UID (ready[n_ready - 2]));
29513 /* Swap 2 top elements of ready list. */
29514 insn = ready[n_ready - 1];
29515 ready[n_ready - 1] = ready[n_ready - 2];
29516 ready[n_ready - 2] = insn;
29517 }
29518 return issue_rate;
29519 }
29520
29521 static bool
29522 ix86_class_likely_spilled_p (reg_class_t);
29523
29524 /* Return true if the lhs of INSN is a HW function argument register and set
29525 IS_SPILLED to true if it is a likely spilled HW register. */
29526 static bool
29527 insn_is_function_arg (rtx insn, bool* is_spilled)
29528 {
29529 rtx dst;
29530
29531 if (!NONDEBUG_INSN_P (insn))
29532 return false;
29533 /* Call instructions are not movable, ignore them. */
29534 if (CALL_P (insn))
29535 return false;
29536 insn = PATTERN (insn);
29537 if (GET_CODE (insn) == PARALLEL)
29538 insn = XVECEXP (insn, 0, 0);
29539 if (GET_CODE (insn) != SET)
29540 return false;
29541 dst = SET_DEST (insn);
29542 if (REG_P (dst) && HARD_REGISTER_P (dst)
29543 && ix86_function_arg_regno_p (REGNO (dst)))
29544 {
29545 /* Is it a likely spilled HW register? */
29546 if (!TEST_HARD_REG_BIT (fixed_reg_set, REGNO (dst))
29547 && ix86_class_likely_spilled_p (REGNO_REG_CLASS (REGNO (dst))))
29548 *is_spilled = true;
29549 return true;
29550 }
29551 return false;
29552 }
29553
29554 /* Add output dependencies for a chain of adjacent function arguments, but
29555 only if there is a move to a likely spilled HW register.  Return the first
29556 argument if at least one dependence was added or NULL otherwise. */
29557 static rtx_insn *
29558 add_parameter_dependencies (rtx_insn *call, rtx_insn *head)
29559 {
29560 rtx_insn *insn;
29561 rtx_insn *last = call;
29562 rtx_insn *first_arg = NULL;
29563 bool is_spilled = false;
29564
29565 head = PREV_INSN (head);
29566
29567 /* Find the argument-passing instruction nearest to the call. */
29568 while (true)
29569 {
29570 last = PREV_INSN (last);
29571 if (last == head)
29572 return NULL;
29573 if (!NONDEBUG_INSN_P (last))
29574 continue;
29575 if (insn_is_function_arg (last, &is_spilled))
29576 break;
29577 return NULL;
29578 }
29579
29580 first_arg = last;
29581 while (true)
29582 {
29583 insn = PREV_INSN (last);
29584 if (!INSN_P (insn))
29585 break;
29586 if (insn == head)
29587 break;
29588 if (!NONDEBUG_INSN_P (insn))
29589 {
29590 last = insn;
29591 continue;
29592 }
29593 if (insn_is_function_arg (insn, &is_spilled))
29594 {
29595 /* Add an output dependence between two function arguments if the chain
29596 of output arguments contains likely spilled HW registers. */
29597 if (is_spilled)
29598 add_dependence (first_arg, insn, REG_DEP_OUTPUT);
29599 first_arg = last = insn;
29600 }
29601 else
29602 break;
29603 }
29604 if (!is_spilled)
29605 return NULL;
29606 return first_arg;
29607 }
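
/* Sketch of the argument chain handled above (64-bit example; the exact
   registers are illustrative):

       (1) movl %ebx, %edi     ; argument 1 in a likely spilled hard register
       (2) movl 8(%rsp), %esi  ; argument 2
       (3) call foo

   Walking back from the call, (2) and (1) form the chain; output
   dependencies are added between the argument moves so the pre-reload
   scheduler keeps them together in front of the call instead of letting
   other code drift in between.  */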
29608
29609 /* Add output or anti dependency from insn to first_arg to restrict its code
29610 motion. */
29611 static void
29612 avoid_func_arg_motion (rtx_insn *first_arg, rtx_insn *insn)
29613 {
29614 rtx set;
29615 rtx tmp;
29616
29617 /* Add anti dependencies for bounds stores. */
29618 if (INSN_P (insn)
29619 && GET_CODE (PATTERN (insn)) == PARALLEL
29620 && GET_CODE (XVECEXP (PATTERN (insn), 0, 0)) == UNSPEC
29621 && XINT (XVECEXP (PATTERN (insn), 0, 0), 1) == UNSPEC_BNDSTX)
29622 {
29623 add_dependence (first_arg, insn, REG_DEP_ANTI);
29624 return;
29625 }
29626
29627 set = single_set (insn);
29628 if (!set)
29629 return;
29630 tmp = SET_DEST (set);
29631 if (REG_P (tmp))
29632 {
29633 /* Add output dependency to the first function argument. */
29634 add_dependence (first_arg, insn, REG_DEP_OUTPUT);
29635 return;
29636 }
29637 /* Add anti dependency. */
29638 add_dependence (first_arg, insn, REG_DEP_ANTI);
29639 }
29640
29641 /* Avoid cross-block motion of a function argument by adding a dependency
29642 from the first non-jump instruction in bb. */
29643 static void
29644 add_dependee_for_func_arg (rtx_insn *arg, basic_block bb)
29645 {
29646 rtx_insn *insn = BB_END (bb);
29647
29648 while (insn)
29649 {
29650 if (NONDEBUG_INSN_P (insn) && NONJUMP_INSN_P (insn))
29651 {
29652 rtx set = single_set (insn);
29653 if (set)
29654 {
29655 avoid_func_arg_motion (arg, insn);
29656 return;
29657 }
29658 }
29659 if (insn == BB_HEAD (bb))
29660 return;
29661 insn = PREV_INSN (insn);
29662 }
29663 }
29664
29665 /* Hook for pre-reload schedule - avoid motion of function arguments
29666 passed in likely spilled HW registers. */
29667 static void
29668 ix86_dependencies_evaluation_hook (rtx_insn *head, rtx_insn *tail)
29669 {
29670 rtx_insn *insn;
29671 rtx_insn *first_arg = NULL;
29672 if (reload_completed)
29673 return;
29674 while (head != tail && DEBUG_INSN_P (head))
29675 head = NEXT_INSN (head);
29676 for (insn = tail; insn != head; insn = PREV_INSN (insn))
29677 if (INSN_P (insn) && CALL_P (insn))
29678 {
29679 first_arg = add_parameter_dependencies (insn, head);
29680 if (first_arg)
29681 {
29682 /* Add a dependee for the first argument to predecessors, but only if the
29683 region contains more than one block. */
29684 basic_block bb = BLOCK_FOR_INSN (insn);
29685 int rgn = CONTAINING_RGN (bb->index);
29686 int nr_blks = RGN_NR_BLOCKS (rgn);
29687 /* Skip trivial regions and region head blocks that can have
29688 predecessors outside of region. */
29689 if (nr_blks > 1 && BLOCK_TO_BB (bb->index) != 0)
29690 {
29691 edge e;
29692 edge_iterator ei;
29693
29694 /* Regions are SCCs with the exception of selective
29695 scheduling with pipelining of outer blocks enabled.
29696 So also check that immediate predecessors of a non-head
29697 block are in the same region. */
29698 FOR_EACH_EDGE (e, ei, bb->preds)
29699 {
29700 /* Avoid creating loop-carried dependencies by using
29701 the topological ordering in the region. */
29702 if (rgn == CONTAINING_RGN (e->src->index)
29703 && BLOCK_TO_BB (bb->index) > BLOCK_TO_BB (e->src->index))
29704 add_dependee_for_func_arg (first_arg, e->src);
29705 }
29706 }
29707 insn = first_arg;
29708 if (insn == head)
29709 break;
29710 }
29711 }
29712 else if (first_arg)
29713 avoid_func_arg_motion (first_arg, insn);
29714 }
29715
29716 /* Hook for pre-reload schedule - set priority of moves from likely spilled
29717 HW registers to the maximum, to schedule them as soon as possible.  These are
29718 moves from function argument registers at the top of the function entry
29719 and moves from function return value registers after call. */
29720 static int
29721 ix86_adjust_priority (rtx_insn *insn, int priority)
29722 {
29723 rtx set;
29724
29725 if (reload_completed)
29726 return priority;
29727
29728 if (!NONDEBUG_INSN_P (insn))
29729 return priority;
29730
29731 set = single_set (insn);
29732 if (set)
29733 {
29734 rtx tmp = SET_SRC (set);
29735 if (REG_P (tmp)
29736 && HARD_REGISTER_P (tmp)
29737 && !TEST_HARD_REG_BIT (fixed_reg_set, REGNO (tmp))
29738 && ix86_class_likely_spilled_p (REGNO_REG_CLASS (REGNO (tmp))))
29739 return current_sched_info->sched_max_insns_priority;
29740 }
29741
29742 return priority;
29743 }
29744
29745 /* Model the decoder of Core 2/i7.
29746 The hooks below for multipass scheduling (see haifa-sched.c:max_issue)
29747 track the instruction fetch block boundaries and make sure that long
29748 (9+ byte) instructions are assigned to D0. */
29749
29750 /* Maximum length of an insn that can be handled by
29751 a secondary decoder unit. '8' for Core 2/i7. */
29752 static int core2i7_secondary_decoder_max_insn_size;
29753
29754 /* Ifetch block size, i.e., number of bytes decoder reads per cycle.
29755 '16' for Core 2/i7. */
29756 static int core2i7_ifetch_block_size;
29757
29758 /* Maximum number of instructions decoder can handle per cycle.
29759 '6' for Core 2/i7. */
29760 static int core2i7_ifetch_block_max_insns;
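
/* For illustration of the model (the parameters are set in
   ix86_sched_init_global): the front end fetches a 16-byte block per cycle
   and decodes at most six instructions from it, so e.g. four instructions
   of 5+5+4+2 bytes fill one block, while a 7-byte and a 10-byte instruction
   cannot both be decoded in the same block.  An instruction longer than
   8 bytes exceeds core2i7_secondary_decoder_max_insn_size and is therefore
   only accepted as the first instruction of a group (decoder D0).  */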
29761
29762 typedef struct ix86_first_cycle_multipass_data_ *
29763 ix86_first_cycle_multipass_data_t;
29764 typedef const struct ix86_first_cycle_multipass_data_ *
29765 const_ix86_first_cycle_multipass_data_t;
29766
29767 /* A variable to store target state across calls to max_issue within
29768 one cycle. */
29769 static struct ix86_first_cycle_multipass_data_ _ix86_first_cycle_multipass_data,
29770 *ix86_first_cycle_multipass_data = &_ix86_first_cycle_multipass_data;
29771
29772 /* Initialize DATA. */
29773 static void
29774 core2i7_first_cycle_multipass_init (void *_data)
29775 {
29776 ix86_first_cycle_multipass_data_t data
29777 = (ix86_first_cycle_multipass_data_t) _data;
29778
29779 data->ifetch_block_len = 0;
29780 data->ifetch_block_n_insns = 0;
29781 data->ready_try_change = NULL;
29782 data->ready_try_change_size = 0;
29783 }
29784
29785 /* Advancing the cycle; reset ifetch block counts. */
29786 static void
29787 core2i7_dfa_post_advance_cycle (void)
29788 {
29789 ix86_first_cycle_multipass_data_t data = ix86_first_cycle_multipass_data;
29790
29791 gcc_assert (data->ifetch_block_n_insns <= core2i7_ifetch_block_max_insns);
29792
29793 data->ifetch_block_len = 0;
29794 data->ifetch_block_n_insns = 0;
29795 }
29796
29797 static int min_insn_size (rtx_insn *);
29798
29799 /* Filter out insns from ready_try that the core will not be able to issue
29800 on current cycle due to decoder. */
29801 static void
29802 core2i7_first_cycle_multipass_filter_ready_try
29803 (const_ix86_first_cycle_multipass_data_t data,
29804 signed char *ready_try, int n_ready, bool first_cycle_insn_p)
29805 {
29806 while (n_ready--)
29807 {
29808 rtx_insn *insn;
29809 int insn_size;
29810
29811 if (ready_try[n_ready])
29812 continue;
29813
29814 insn = get_ready_element (n_ready);
29815 insn_size = min_insn_size (insn);
29816
29817 if (/* If this insn is too long for a secondary decoder ... */
29818 (!first_cycle_insn_p
29819 && insn_size > core2i7_secondary_decoder_max_insn_size)
29820 /* ... or it would not fit into the ifetch block ... */
29821 || data->ifetch_block_len + insn_size > core2i7_ifetch_block_size
29822 /* ... or the decoder is full already ... */
29823 || data->ifetch_block_n_insns + 1 > core2i7_ifetch_block_max_insns)
29824 /* ... mask the insn out. */
29825 {
29826 ready_try[n_ready] = 1;
29827
29828 if (data->ready_try_change)
29829 bitmap_set_bit (data->ready_try_change, n_ready);
29830 }
29831 }
29832 }
29833
29834 /* Prepare for a new round of multipass lookahead scheduling. */
29835 static void
29836 core2i7_first_cycle_multipass_begin (void *_data,
29837 signed char *ready_try, int n_ready,
29838 bool first_cycle_insn_p)
29839 {
29840 ix86_first_cycle_multipass_data_t data
29841 = (ix86_first_cycle_multipass_data_t) _data;
29842 const_ix86_first_cycle_multipass_data_t prev_data
29843 = ix86_first_cycle_multipass_data;
29844
29845 /* Restore the state from the end of the previous round. */
29846 data->ifetch_block_len = prev_data->ifetch_block_len;
29847 data->ifetch_block_n_insns = prev_data->ifetch_block_n_insns;
29848
29849 /* Filter instructions that cannot be issued on current cycle due to
29850 decoder restrictions. */
29851 core2i7_first_cycle_multipass_filter_ready_try (data, ready_try, n_ready,
29852 first_cycle_insn_p);
29853 }
29854
29855 /* INSN is being issued in current solution. Account for its impact on
29856 the decoder model. */
29857 static void
29858 core2i7_first_cycle_multipass_issue (void *_data,
29859 signed char *ready_try, int n_ready,
29860 rtx_insn *insn, const void *_prev_data)
29861 {
29862 ix86_first_cycle_multipass_data_t data
29863 = (ix86_first_cycle_multipass_data_t) _data;
29864 const_ix86_first_cycle_multipass_data_t prev_data
29865 = (const_ix86_first_cycle_multipass_data_t) _prev_data;
29866
29867 int insn_size = min_insn_size (insn);
29868
29869 data->ifetch_block_len = prev_data->ifetch_block_len + insn_size;
29870 data->ifetch_block_n_insns = prev_data->ifetch_block_n_insns + 1;
29871 gcc_assert (data->ifetch_block_len <= core2i7_ifetch_block_size
29872 && data->ifetch_block_n_insns <= core2i7_ifetch_block_max_insns);
29873
29874 /* Allocate or resize the bitmap for storing INSN's effect on ready_try. */
29875 if (!data->ready_try_change)
29876 {
29877 data->ready_try_change = sbitmap_alloc (n_ready);
29878 data->ready_try_change_size = n_ready;
29879 }
29880 else if (data->ready_try_change_size < n_ready)
29881 {
29882 data->ready_try_change = sbitmap_resize (data->ready_try_change,
29883 n_ready, 0);
29884 data->ready_try_change_size = n_ready;
29885 }
29886 bitmap_clear (data->ready_try_change);
29887
29888 /* Filter out insns from ready_try that the core will not be able to issue
29889 on current cycle due to decoder. */
29890 core2i7_first_cycle_multipass_filter_ready_try (data, ready_try, n_ready,
29891 false);
29892 }
29893
29894 /* Revert the effect on ready_try. */
29895 static void
29896 core2i7_first_cycle_multipass_backtrack (const void *_data,
29897 signed char *ready_try,
29898 int n_ready ATTRIBUTE_UNUSED)
29899 {
29900 const_ix86_first_cycle_multipass_data_t data
29901 = (const_ix86_first_cycle_multipass_data_t) _data;
29902 unsigned int i = 0;
29903 sbitmap_iterator sbi;
29904
29905 gcc_assert (bitmap_last_set_bit (data->ready_try_change) < n_ready);
29906 EXECUTE_IF_SET_IN_BITMAP (data->ready_try_change, 0, i, sbi)
29907 {
29908 ready_try[i] = 0;
29909 }
29910 }
29911
29912 /* Save the result of multipass lookahead scheduling for the next round. */
29913 static void
29914 core2i7_first_cycle_multipass_end (const void *_data)
29915 {
29916 const_ix86_first_cycle_multipass_data_t data
29917 = (const_ix86_first_cycle_multipass_data_t) _data;
29918 ix86_first_cycle_multipass_data_t next_data
29919 = ix86_first_cycle_multipass_data;
29920
29921 if (data != NULL)
29922 {
29923 next_data->ifetch_block_len = data->ifetch_block_len;
29924 next_data->ifetch_block_n_insns = data->ifetch_block_n_insns;
29925 }
29926 }
29927
29928 /* Deallocate target data. */
29929 static void
29930 core2i7_first_cycle_multipass_fini (void *_data)
29931 {
29932 ix86_first_cycle_multipass_data_t data
29933 = (ix86_first_cycle_multipass_data_t) _data;
29934
29935 if (data->ready_try_change)
29936 {
29937 sbitmap_free (data->ready_try_change);
29938 data->ready_try_change = NULL;
29939 data->ready_try_change_size = 0;
29940 }
29941 }
29942
29943 /* Prepare for scheduling pass. */
29944 static void
29945 ix86_sched_init_global (FILE *, int, int)
29946 {
29947 /* Install scheduling hooks for current CPU. Some of these hooks are used
29948 in time-critical parts of the scheduler, so we only set them up when
29949 they are actually used. */
29950 switch (ix86_tune)
29951 {
29952 case PROCESSOR_CORE2:
29953 case PROCESSOR_NEHALEM:
29954 case PROCESSOR_SANDYBRIDGE:
29955 case PROCESSOR_HASWELL:
29956 /* Do not perform multipass scheduling for pre-reload schedule
29957 to save compile time. */
29958 if (reload_completed)
29959 {
29960 targetm.sched.dfa_post_advance_cycle
29961 = core2i7_dfa_post_advance_cycle;
29962 targetm.sched.first_cycle_multipass_init
29963 = core2i7_first_cycle_multipass_init;
29964 targetm.sched.first_cycle_multipass_begin
29965 = core2i7_first_cycle_multipass_begin;
29966 targetm.sched.first_cycle_multipass_issue
29967 = core2i7_first_cycle_multipass_issue;
29968 targetm.sched.first_cycle_multipass_backtrack
29969 = core2i7_first_cycle_multipass_backtrack;
29970 targetm.sched.first_cycle_multipass_end
29971 = core2i7_first_cycle_multipass_end;
29972 targetm.sched.first_cycle_multipass_fini
29973 = core2i7_first_cycle_multipass_fini;
29974
29975 /* Set decoder parameters. */
29976 core2i7_secondary_decoder_max_insn_size = 8;
29977 core2i7_ifetch_block_size = 16;
29978 core2i7_ifetch_block_max_insns = 6;
29979 break;
29980 }
29981 /* Fall through. */
29982 default:
29983 targetm.sched.dfa_post_advance_cycle = NULL;
29984 targetm.sched.first_cycle_multipass_init = NULL;
29985 targetm.sched.first_cycle_multipass_begin = NULL;
29986 targetm.sched.first_cycle_multipass_issue = NULL;
29987 targetm.sched.first_cycle_multipass_backtrack = NULL;
29988 targetm.sched.first_cycle_multipass_end = NULL;
29989 targetm.sched.first_cycle_multipass_fini = NULL;
29990 break;
29991 }
29992 }
29993
29994 \f
29995 /* Compute the alignment given to a constant that is being placed in memory.
29996 EXP is the constant and ALIGN is the alignment that the object would
29997 ordinarily have.
29998 The value of this function is used instead of that alignment to align
29999 the object. */
30000
30001 int
30002 ix86_constant_alignment (tree exp, int align)
30003 {
30004 if (TREE_CODE (exp) == REAL_CST || TREE_CODE (exp) == VECTOR_CST
30005 || TREE_CODE (exp) == INTEGER_CST)
30006 {
30007 if (TYPE_MODE (TREE_TYPE (exp)) == DFmode && align < 64)
30008 return 64;
30009 else if (ALIGN_MODE_128 (TYPE_MODE (TREE_TYPE (exp))) && align < 128)
30010 return 128;
30011 }
30012 else if (!optimize_size && TREE_CODE (exp) == STRING_CST
30013 && TREE_STRING_LENGTH (exp) >= 31 && align < BITS_PER_WORD)
30014 return BITS_PER_WORD;
30015
30016 return align;
30017 }
30018
30019 /* Compute the alignment for a variable for Intel MCU psABI. TYPE is
30020 the data type, and ALIGN is the alignment that the object would
30021 ordinarily have. */
30022
30023 static int
30024 iamcu_alignment (tree type, int align)
30025 {
30026 enum machine_mode mode;
30027
30028 if (align < 32 || TYPE_USER_ALIGN (type))
30029 return align;
30030
30031 /* Intel MCU psABI specifies scalar types > 4 bytes aligned to 4
30032 bytes. */
30033 mode = TYPE_MODE (strip_array_types (type));
30034 switch (GET_MODE_CLASS (mode))
30035 {
30036 case MODE_INT:
30037 case MODE_COMPLEX_INT:
30038 case MODE_COMPLEX_FLOAT:
30039 case MODE_FLOAT:
30040 case MODE_DECIMAL_FLOAT:
30041 return 32;
30042 default:
30043 return align;
30044 }
30045 }
30046
30047 /* Compute the alignment for a static variable.
30048 TYPE is the data type, and ALIGN is the alignment that
30049 the object would ordinarily have. The value of this function is used
30050 instead of that alignment to align the object. */
30051
30052 int
30053 ix86_data_alignment (tree type, int align, bool opt)
30054 {
30055 /* GCC 4.8 and earlier used to incorrectly assume this alignment even
30056 for symbols from other compilation units or symbols that don't need
30057 to bind locally. In order to preserve some ABI compatibility with
30058 those compilers, ensure we don't decrease alignment from what we
30059 used to assume. */
30060
30061 int max_align_compat = MIN (256, MAX_OFILE_ALIGNMENT);
30062
30063 /* A data structure equal to or greater than the size of a cache line
30064 (64 bytes in the Pentium 4 and other recent Intel processors, including
30065 processors based on the Intel Core microarchitecture) should be aligned
30066 so that its base address is a multiple of the cache line size. */
30067
30068 int max_align
30069 = MIN ((unsigned) ix86_tune_cost->prefetch_block * 8, MAX_OFILE_ALIGNMENT);
30070
30071 if (max_align < BITS_PER_WORD)
30072 max_align = BITS_PER_WORD;
30073
30074 switch (ix86_align_data_type)
30075 {
30076 case ix86_align_data_type_abi: opt = false; break;
30077 case ix86_align_data_type_compat: max_align = BITS_PER_WORD; break;
30078 case ix86_align_data_type_cacheline: break;
30079 }
30080
30081 if (TARGET_IAMCU)
30082 align = iamcu_alignment (type, align);
30083
30084 if (opt
30085 && AGGREGATE_TYPE_P (type)
30086 && TYPE_SIZE (type)
30087 && TREE_CODE (TYPE_SIZE (type)) == INTEGER_CST)
30088 {
30089 if (wi::geu_p (TYPE_SIZE (type), max_align_compat)
30090 && align < max_align_compat)
30091 align = max_align_compat;
30092 if (wi::geu_p (TYPE_SIZE (type), max_align)
30093 && align < max_align)
30094 align = max_align;
30095 }
30096
30097 /* The x86-64 ABI requires arrays greater than 16 bytes to be aligned
30098 to a 16-byte boundary. */
30099 if (TARGET_64BIT)
30100 {
30101 if ((opt ? AGGREGATE_TYPE_P (type) : TREE_CODE (type) == ARRAY_TYPE)
30102 && TYPE_SIZE (type)
30103 && TREE_CODE (TYPE_SIZE (type)) == INTEGER_CST
30104 && wi::geu_p (TYPE_SIZE (type), 128)
30105 && align < 128)
30106 return 128;
30107 }
30108
30109 if (!opt)
30110 return align;
30111
30112 if (TREE_CODE (type) == ARRAY_TYPE)
30113 {
30114 if (TYPE_MODE (TREE_TYPE (type)) == DFmode && align < 64)
30115 return 64;
30116 if (ALIGN_MODE_128 (TYPE_MODE (TREE_TYPE (type))) && align < 128)
30117 return 128;
30118 }
30119 else if (TREE_CODE (type) == COMPLEX_TYPE)
30120 {
30121
30122 if (TYPE_MODE (type) == DCmode && align < 64)
30123 return 64;
30124 if ((TYPE_MODE (type) == XCmode
30125 || TYPE_MODE (type) == TCmode) && align < 128)
30126 return 128;
30127 }
30128 else if ((TREE_CODE (type) == RECORD_TYPE
30129 || TREE_CODE (type) == UNION_TYPE
30130 || TREE_CODE (type) == QUAL_UNION_TYPE)
30131 && TYPE_FIELDS (type))
30132 {
30133 if (DECL_MODE (TYPE_FIELDS (type)) == DFmode && align < 64)
30134 return 64;
30135 if (ALIGN_MODE_128 (DECL_MODE (TYPE_FIELDS (type))) && align < 128)
30136 return 128;
30137 }
30138 else if (TREE_CODE (type) == REAL_TYPE || TREE_CODE (type) == VECTOR_TYPE
30139 || TREE_CODE (type) == INTEGER_TYPE)
30140 {
30141 if (TYPE_MODE (type) == DFmode && align < 64)
30142 return 64;
30143 if (ALIGN_MODE_128 (TYPE_MODE (type)) && align < 128)
30144 return 128;
30145 }
30146
30147 return align;
30148 }
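
/* A rough example of the effect above (the exact numbers depend on
   -malign-data and the tuning's prefetch block size): on x86-64 with the
   default -malign-data=compat, a static aggregate of 32 bytes or more may
   have its alignment raised to 256 bits, matching what GCC 4.8 and earlier
   assumed, and any array of 16 bytes or more gets at least 128-bit
   alignment as required by the psABI.  With -malign-data=cacheline, large
   aggregates may instead be aligned to the cache line (typically
   512 bits).  */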
30149
30150 /* Compute the alignment for a local variable or a stack slot. EXP is
30151 the data type or decl itself, MODE is the widest mode available and
30152 ALIGN is the alignment that the object would ordinarily have. The
30153 value of this macro is used instead of that alignment to align the
30154 object. */
30155
30156 unsigned int
30157 ix86_local_alignment (tree exp, machine_mode mode,
30158 unsigned int align)
30159 {
30160 tree type, decl;
30161
30162 if (exp && DECL_P (exp))
30163 {
30164 type = TREE_TYPE (exp);
30165 decl = exp;
30166 }
30167 else
30168 {
30169 type = exp;
30170 decl = NULL;
30171 }
30172
30173 /* Don't do dynamic stack realignment for long long objects with
30174 -mpreferred-stack-boundary=2. */
30175 if (!TARGET_64BIT
30176 && align == 64
30177 && ix86_preferred_stack_boundary < 64
30178 && (mode == DImode || (type && TYPE_MODE (type) == DImode))
30179 && (!type || !TYPE_USER_ALIGN (type))
30180 && (!decl || !DECL_USER_ALIGN (decl)))
30181 align = 32;
30182
30183 /* If TYPE is NULL, we are allocating a stack slot for caller-save
30184 register in MODE. We will return the largest alignment of XF
30185 and DF. */
30186 if (!type)
30187 {
30188 if (mode == XFmode && align < GET_MODE_ALIGNMENT (DFmode))
30189 align = GET_MODE_ALIGNMENT (DFmode);
30190 return align;
30191 }
30192
30193 /* Don't increase alignment for Intel MCU psABI. */
30194 if (TARGET_IAMCU)
30195 return align;
30196
30197 /* The x86-64 ABI requires arrays greater than 16 bytes to be aligned
30198 to a 16-byte boundary.  The exact wording is:
30199
30200 An array uses the same alignment as its elements, except that a local or
30201 global array variable of length at least 16 bytes or
30202 a C99 variable-length array variable always has alignment of at least 16 bytes.
30203
30204 This was added to allow the use of aligned SSE instructions on arrays.  This
30205 rule is meant for static storage (where the compiler cannot do the analysis
30206 by itself).  We follow it for automatic variables only when convenient.
30207 We fully control everything in the function being compiled, and functions
30208 from other units cannot rely on the alignment.
30209
30210 Exclude va_list type. It is the common case of local array where
30211 we can not benefit from the alignment.
30212
30213 TODO: Probably one should optimize for size only when var is not escaping. */
30214 if (TARGET_64BIT && optimize_function_for_speed_p (cfun)
30215 && TARGET_SSE)
30216 {
30217 if (AGGREGATE_TYPE_P (type)
30218 && (va_list_type_node == NULL_TREE
30219 || (TYPE_MAIN_VARIANT (type)
30220 != TYPE_MAIN_VARIANT (va_list_type_node)))
30221 && TYPE_SIZE (type)
30222 && TREE_CODE (TYPE_SIZE (type)) == INTEGER_CST
30223 && wi::geu_p (TYPE_SIZE (type), 16)
30224 && align < 128)
30225 return 128;
30226 }
30227 if (TREE_CODE (type) == ARRAY_TYPE)
30228 {
30229 if (TYPE_MODE (TREE_TYPE (type)) == DFmode && align < 64)
30230 return 64;
30231 if (ALIGN_MODE_128 (TYPE_MODE (TREE_TYPE (type))) && align < 128)
30232 return 128;
30233 }
30234 else if (TREE_CODE (type) == COMPLEX_TYPE)
30235 {
30236 if (TYPE_MODE (type) == DCmode && align < 64)
30237 return 64;
30238 if ((TYPE_MODE (type) == XCmode
30239 || TYPE_MODE (type) == TCmode) && align < 128)
30240 return 128;
30241 }
30242 else if ((TREE_CODE (type) == RECORD_TYPE
30243 || TREE_CODE (type) == UNION_TYPE
30244 || TREE_CODE (type) == QUAL_UNION_TYPE)
30245 && TYPE_FIELDS (type))
30246 {
30247 if (DECL_MODE (TYPE_FIELDS (type)) == DFmode && align < 64)
30248 return 64;
30249 if (ALIGN_MODE_128 (DECL_MODE (TYPE_FIELDS (type))) && align < 128)
30250 return 128;
30251 }
30252 else if (TREE_CODE (type) == REAL_TYPE || TREE_CODE (type) == VECTOR_TYPE
30253 || TREE_CODE (type) == INTEGER_TYPE)
30254 {
30255
30256 if (TYPE_MODE (type) == DFmode && align < 64)
30257 return 64;
30258 if (ALIGN_MODE_128 (TYPE_MODE (type)) && align < 128)
30259 return 128;
30260 }
30261 return align;
30262 }
30263
30264 /* Compute the minimum required alignment for dynamic stack realignment
30265 purposes for a local variable, parameter or a stack slot. EXP is
30266 the data type or decl itself, MODE is its mode and ALIGN is the
30267 alignment that the object would ordinarily have. */
30268
30269 unsigned int
30270 ix86_minimum_alignment (tree exp, machine_mode mode,
30271 unsigned int align)
30272 {
30273 tree type, decl;
30274
30275 if (exp && DECL_P (exp))
30276 {
30277 type = TREE_TYPE (exp);
30278 decl = exp;
30279 }
30280 else
30281 {
30282 type = exp;
30283 decl = NULL;
30284 }
30285
30286 if (TARGET_64BIT || align != 64 || ix86_preferred_stack_boundary >= 64)
30287 return align;
30288
30289 /* Don't do dynamic stack realignment for long long objects with
30290 -mpreferred-stack-boundary=2. */
30291 if ((mode == DImode || (type && TYPE_MODE (type) == DImode))
30292 && (!type || !TYPE_USER_ALIGN (type))
30293 && (!decl || !DECL_USER_ALIGN (decl)))
30294 {
30295 gcc_checking_assert (!TARGET_STV);
30296 return 32;
30297 }
30298
30299 return align;
30300 }
30301 \f
30302 /* Find a location for the static chain incoming to a nested function.
30303 This is a register, unless all free registers are used by arguments. */
30304
30305 static rtx
30306 ix86_static_chain (const_tree fndecl_or_type, bool incoming_p)
30307 {
30308 unsigned regno;
30309
30310 /* While this function won't be called by the middle-end when a static
30311 chain isn't needed, it's also used throughout the backend so it's
30312 easiest to keep this check centralized. */
30313 if (DECL_P (fndecl_or_type) && !DECL_STATIC_CHAIN (fndecl_or_type))
30314 return NULL;
30315
30316 if (TARGET_64BIT)
30317 {
30318 /* We always use R10 in 64-bit mode. */
30319 regno = R10_REG;
30320 }
30321 else
30322 {
30323 const_tree fntype, fndecl;
30324 unsigned int ccvt;
30325
30326 /* By default in 32-bit mode we use ECX to pass the static chain. */
30327 regno = CX_REG;
30328
30329 if (TREE_CODE (fndecl_or_type) == FUNCTION_DECL)
30330 {
30331 fntype = TREE_TYPE (fndecl_or_type);
30332 fndecl = fndecl_or_type;
30333 }
30334 else
30335 {
30336 fntype = fndecl_or_type;
30337 fndecl = NULL;
30338 }
30339
30340 ccvt = ix86_get_callcvt (fntype);
30341 if ((ccvt & IX86_CALLCVT_FASTCALL) != 0)
30342 {
30343 /* Fastcall functions use ecx/edx for arguments, which leaves
30344 us with EAX for the static chain.
30345 Thiscall functions use ecx for arguments, which also
30346 leaves us with EAX for the static chain. */
30347 regno = AX_REG;
30348 }
30349 else if ((ccvt & IX86_CALLCVT_THISCALL) != 0)
30350 {
30351 /* Thiscall functions use ecx for arguments, which leaves
30352 us with EAX and EDX for the static chain.
30353 We use EAX for ABI compatibility. */
30354 regno = AX_REG;
30355 }
30356 else if (ix86_function_regparm (fntype, fndecl) == 3)
30357 {
30358 /* For regparm 3, we have no free call-clobbered registers in
30359 which to store the static chain. In order to implement this,
30360 we have the trampoline push the static chain to the stack.
30361 However, we can't push a value below the return address when
30362 we call the nested function directly, so we have to use an
30363 alternate entry point. For this we use ESI, and have the
30364 alternate entry point push ESI, so that things appear the
30365 same once we're executing the nested function. */
30366 if (incoming_p)
30367 {
30368 if (fndecl == current_function_decl)
30369 ix86_static_chain_on_stack = true;
30370 return gen_frame_mem (SImode,
30371 plus_constant (Pmode,
30372 arg_pointer_rtx, -8));
30373 }
30374 regno = SI_REG;
30375 }
30376 }
30377
30378 return gen_rtx_REG (Pmode, regno);
30379 }
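
/* Illustrative summary of the choices above (not exhaustive):

       64-bit                        -> %r10
       32-bit, default conventions   -> %ecx
       32-bit, fastcall or thiscall  -> %eax
       32-bit, regparm(3)            -> no free register; the trampoline
                                        pushes the chain and the alternate
                                        entry point uses %esi.  */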
30380
30381 /* Emit RTL insns to initialize the variable parts of a trampoline.
30382 FNDECL is the decl of the target address; M_TRAMP is a MEM for
30383 the trampoline, and CHAIN_VALUE is an RTX for the static chain
30384 to be passed to the target function. */
30385
30386 static void
30387 ix86_trampoline_init (rtx m_tramp, tree fndecl, rtx chain_value)
30388 {
30389 rtx mem, fnaddr;
30390 int opcode;
30391 int offset = 0;
30392
30393 fnaddr = XEXP (DECL_RTL (fndecl), 0);
30394
30395 if (TARGET_64BIT)
30396 {
30397 int size;
30398
30399 /* Load the function address to r11. Try to load address using
30400 the shorter movl instead of movabs. We may want to support
30401 movq for kernel mode, but kernel does not use trampolines at
30402 the moment. FNADDR is a 32bit address and may not be in
30403 DImode when ptr_mode == SImode. Always use movl in this
30404 case. */
30405 if (ptr_mode == SImode
30406 || x86_64_zext_immediate_operand (fnaddr, VOIDmode))
30407 {
30408 fnaddr = copy_addr_to_reg (fnaddr);
30409
30410 mem = adjust_address (m_tramp, HImode, offset);
30411 emit_move_insn (mem, gen_int_mode (0xbb41, HImode));
30412
30413 mem = adjust_address (m_tramp, SImode, offset + 2);
30414 emit_move_insn (mem, gen_lowpart (SImode, fnaddr));
30415 offset += 6;
30416 }
30417 else
30418 {
30419 mem = adjust_address (m_tramp, HImode, offset);
30420 emit_move_insn (mem, gen_int_mode (0xbb49, HImode));
30421
30422 mem = adjust_address (m_tramp, DImode, offset + 2);
30423 emit_move_insn (mem, fnaddr);
30424 offset += 10;
30425 }
30426
30427 /* Load static chain using movabs to r10. Use the shorter movl
30428 instead of movabs when ptr_mode == SImode. */
30429 if (ptr_mode == SImode)
30430 {
30431 opcode = 0xba41;
30432 size = 6;
30433 }
30434 else
30435 {
30436 opcode = 0xba49;
30437 size = 10;
30438 }
30439
30440 mem = adjust_address (m_tramp, HImode, offset);
30441 emit_move_insn (mem, gen_int_mode (opcode, HImode));
30442
30443 mem = adjust_address (m_tramp, ptr_mode, offset + 2);
30444 emit_move_insn (mem, chain_value);
30445 offset += size;
30446
30447 /* Jump to r11; the last (unused) byte is a nop, only there to
30448 pad the write out to a single 32-bit store. */
30449 mem = adjust_address (m_tramp, SImode, offset);
30450 emit_move_insn (mem, gen_int_mode (0x90e3ff49, SImode));
30451 offset += 4;
30452 }
30453 else
30454 {
30455 rtx disp, chain;
30456
30457 /* Depending on the static chain location, either load a register
30458 with a constant, or push the constant to the stack. All of the
30459 instructions are the same size. */
30460 chain = ix86_static_chain (fndecl, true);
30461 if (REG_P (chain))
30462 {
30463 switch (REGNO (chain))
30464 {
30465 case AX_REG:
30466 opcode = 0xb8; break;
30467 case CX_REG:
30468 opcode = 0xb9; break;
30469 default:
30470 gcc_unreachable ();
30471 }
30472 }
30473 else
30474 opcode = 0x68;
30475
30476 mem = adjust_address (m_tramp, QImode, offset);
30477 emit_move_insn (mem, gen_int_mode (opcode, QImode));
30478
30479 mem = adjust_address (m_tramp, SImode, offset + 1);
30480 emit_move_insn (mem, chain_value);
30481 offset += 5;
30482
30483 mem = adjust_address (m_tramp, QImode, offset);
30484 emit_move_insn (mem, gen_int_mode (0xe9, QImode));
30485
30486 mem = adjust_address (m_tramp, SImode, offset + 1);
30487
30488 /* Compute offset from the end of the jmp to the target function.
30489 In the case in which the trampoline stores the static chain on
30490 the stack, we need to skip the first insn which pushes the
30491 (call-saved) register static chain; this push is 1 byte. */
30492 offset += 5;
30493 disp = expand_binop (SImode, sub_optab, fnaddr,
30494 plus_constant (Pmode, XEXP (m_tramp, 0),
30495 offset - (MEM_P (chain) ? 1 : 0)),
30496 NULL_RTX, 1, OPTAB_DIRECT);
30497 emit_move_insn (mem, disp);
30498 }
30499
30500 gcc_assert (offset <= TRAMPOLINE_SIZE);
30501
30502 #ifdef HAVE_ENABLE_EXECUTE_STACK
30503 #ifdef CHECK_EXECUTE_STACK_ENABLED
30504 if (CHECK_EXECUTE_STACK_ENABLED)
30505 #endif
30506 emit_library_call (gen_rtx_SYMBOL_REF (Pmode, "__enable_execute_stack"),
30507 LCT_NORMAL, VOIDmode, 1, XEXP (m_tramp, 0), Pmode);
30508 #endif
30509 }
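
/* For reference, a rough picture of the bytes emitted above.  64-bit with
   a full 64-bit target address (24 bytes total):

       49 BB <fnaddr, 8 bytes>   movabsq $fnaddr, %r11
       49 BA <chain,  8 bytes>   movabsq $chain,  %r10
       49 FF E3                  jmpq    *%r11
       90                        nop (pads the final 32-bit store)

   32-bit with the static chain in %ecx (10 bytes total):

       B9 <chain, 4 bytes>       movl $chain, %ecx
       E9 <rel32, 4 bytes>       jmp  fnaddr (relative to the trampoline end)

   When the chain lives on the stack, the first byte is 68 (pushl $chain)
   and the jump displacement is adjusted by one byte to skip the push insn
   at the target's entry.  */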
30510 \f
30511 /* The following file contains several enumerations and data structures
30512 built from the definitions in i386-builtin-types.def. */
30513
30514 #include "i386-builtin-types.inc"
30515
30516 /* Table for the ix86 builtin non-function types. */
30517 static GTY(()) tree ix86_builtin_type_tab[(int) IX86_BT_LAST_CPTR + 1];
30518
30519 /* Retrieve an element from the above table, building some of
30520 the types lazily. */
30521
30522 static tree
30523 ix86_get_builtin_type (enum ix86_builtin_type tcode)
30524 {
30525 unsigned int index;
30526 tree type, itype;
30527
30528 gcc_assert ((unsigned)tcode < ARRAY_SIZE(ix86_builtin_type_tab));
30529
30530 type = ix86_builtin_type_tab[(int) tcode];
30531 if (type != NULL)
30532 return type;
30533
30534 gcc_assert (tcode > IX86_BT_LAST_PRIM);
30535 if (tcode <= IX86_BT_LAST_VECT)
30536 {
30537 machine_mode mode;
30538
30539 index = tcode - IX86_BT_LAST_PRIM - 1;
30540 itype = ix86_get_builtin_type (ix86_builtin_type_vect_base[index]);
30541 mode = ix86_builtin_type_vect_mode[index];
30542
30543 type = build_vector_type_for_mode (itype, mode);
30544 }
30545 else
30546 {
30547 int quals;
30548
30549 index = tcode - IX86_BT_LAST_VECT - 1;
30550 if (tcode <= IX86_BT_LAST_PTR)
30551 quals = TYPE_UNQUALIFIED;
30552 else
30553 quals = TYPE_QUAL_CONST;
30554
30555 itype = ix86_get_builtin_type (ix86_builtin_type_ptr_base[index]);
30556 if (quals != TYPE_UNQUALIFIED)
30557 itype = build_qualified_type (itype, quals);
30558
30559 type = build_pointer_type (itype);
30560 }
30561
30562 ix86_builtin_type_tab[(int) tcode] = type;
30563 return type;
30564 }
30565
30566 /* Table for the ix86 builtin function types. */
30567 static GTY(()) tree ix86_builtin_func_type_tab[(int) IX86_BT_LAST_ALIAS + 1];
30568
30569 /* Retrieve an element from the above table, building some of
30570 the types lazily. */
30571
30572 static tree
30573 ix86_get_builtin_func_type (enum ix86_builtin_func_type tcode)
30574 {
30575 tree type;
30576
30577 gcc_assert ((unsigned)tcode < ARRAY_SIZE (ix86_builtin_func_type_tab));
30578
30579 type = ix86_builtin_func_type_tab[(int) tcode];
30580 if (type != NULL)
30581 return type;
30582
30583 if (tcode <= IX86_BT_LAST_FUNC)
30584 {
30585 unsigned start = ix86_builtin_func_start[(int) tcode];
30586 unsigned after = ix86_builtin_func_start[(int) tcode + 1];
30587 tree rtype, atype, args = void_list_node;
30588 unsigned i;
30589
30590 rtype = ix86_get_builtin_type (ix86_builtin_func_args[start]);
30591 for (i = after - 1; i > start; --i)
30592 {
30593 atype = ix86_get_builtin_type (ix86_builtin_func_args[i]);
30594 args = tree_cons (NULL, atype, args);
30595 }
30596
30597 type = build_function_type (rtype, args);
30598 }
30599 else
30600 {
30601 unsigned index = tcode - IX86_BT_LAST_FUNC - 1;
30602 enum ix86_builtin_func_type icode;
30603
30604 icode = ix86_builtin_func_alias_base[index];
30605 type = ix86_get_builtin_func_type (icode);
30606 }
30607
30608 ix86_builtin_func_type_tab[(int) tcode] = type;
30609 return type;
30610 }
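/* Illustrative sketch with hypothetical table contents (the real rows live
   in i386-builtin-types.inc): if ix86_builtin_func_args[start .. after-1]
   were

	{ IX86_BT_INT, IX86_BT_V4SF, IX86_BT_V4SF }

   then the first entry is taken as the return type and the loop above walks
   the remaining entries backwards, consing onto void_list_node, so
   build_function_type ends up with the arguments in their original order,
   i.e. a type equivalent to  int (V4SF, V4SF).  */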
30611
30612
30613 /* Codes for all the SSE/MMX builtins. Builtins not mentioned in any
30614 bdesc_* arrays below should come first, then builtins for each bdesc_*
30615 array in ascending order, so that we can use direct array accesses. */
30616 enum ix86_builtins
30617 {
30618 IX86_BUILTIN_MASKMOVQ,
30619 IX86_BUILTIN_LDMXCSR,
30620 IX86_BUILTIN_STMXCSR,
30621 IX86_BUILTIN_MASKMOVDQU,
30622 IX86_BUILTIN_PSLLDQ128,
30623 IX86_BUILTIN_CLFLUSH,
30624 IX86_BUILTIN_MONITOR,
30625 IX86_BUILTIN_MWAIT,
30626 IX86_BUILTIN_CLZERO,
30627 IX86_BUILTIN_VEC_INIT_V2SI,
30628 IX86_BUILTIN_VEC_INIT_V4HI,
30629 IX86_BUILTIN_VEC_INIT_V8QI,
30630 IX86_BUILTIN_VEC_EXT_V2DF,
30631 IX86_BUILTIN_VEC_EXT_V2DI,
30632 IX86_BUILTIN_VEC_EXT_V4SF,
30633 IX86_BUILTIN_VEC_EXT_V4SI,
30634 IX86_BUILTIN_VEC_EXT_V8HI,
30635 IX86_BUILTIN_VEC_EXT_V2SI,
30636 IX86_BUILTIN_VEC_EXT_V4HI,
30637 IX86_BUILTIN_VEC_EXT_V16QI,
30638 IX86_BUILTIN_VEC_SET_V2DI,
30639 IX86_BUILTIN_VEC_SET_V4SF,
30640 IX86_BUILTIN_VEC_SET_V4SI,
30641 IX86_BUILTIN_VEC_SET_V8HI,
30642 IX86_BUILTIN_VEC_SET_V4HI,
30643 IX86_BUILTIN_VEC_SET_V16QI,
30644 IX86_BUILTIN_GATHERSIV2DF,
30645 IX86_BUILTIN_GATHERSIV4DF,
30646 IX86_BUILTIN_GATHERDIV2DF,
30647 IX86_BUILTIN_GATHERDIV4DF,
30648 IX86_BUILTIN_GATHERSIV4SF,
30649 IX86_BUILTIN_GATHERSIV8SF,
30650 IX86_BUILTIN_GATHERDIV4SF,
30651 IX86_BUILTIN_GATHERDIV8SF,
30652 IX86_BUILTIN_GATHERSIV2DI,
30653 IX86_BUILTIN_GATHERSIV4DI,
30654 IX86_BUILTIN_GATHERDIV2DI,
30655 IX86_BUILTIN_GATHERDIV4DI,
30656 IX86_BUILTIN_GATHERSIV4SI,
30657 IX86_BUILTIN_GATHERSIV8SI,
30658 IX86_BUILTIN_GATHERDIV4SI,
30659 IX86_BUILTIN_GATHERDIV8SI,
30660 IX86_BUILTIN_VFMSUBSD3_MASK3,
30661 IX86_BUILTIN_VFMSUBSS3_MASK3,
30662 IX86_BUILTIN_GATHER3SIV8SF,
30663 IX86_BUILTIN_GATHER3SIV4SF,
30664 IX86_BUILTIN_GATHER3SIV4DF,
30665 IX86_BUILTIN_GATHER3SIV2DF,
30666 IX86_BUILTIN_GATHER3DIV8SF,
30667 IX86_BUILTIN_GATHER3DIV4SF,
30668 IX86_BUILTIN_GATHER3DIV4DF,
30669 IX86_BUILTIN_GATHER3DIV2DF,
30670 IX86_BUILTIN_GATHER3SIV8SI,
30671 IX86_BUILTIN_GATHER3SIV4SI,
30672 IX86_BUILTIN_GATHER3SIV4DI,
30673 IX86_BUILTIN_GATHER3SIV2DI,
30674 IX86_BUILTIN_GATHER3DIV8SI,
30675 IX86_BUILTIN_GATHER3DIV4SI,
30676 IX86_BUILTIN_GATHER3DIV4DI,
30677 IX86_BUILTIN_GATHER3DIV2DI,
30678 IX86_BUILTIN_SCATTERSIV8SF,
30679 IX86_BUILTIN_SCATTERSIV4SF,
30680 IX86_BUILTIN_SCATTERSIV4DF,
30681 IX86_BUILTIN_SCATTERSIV2DF,
30682 IX86_BUILTIN_SCATTERDIV8SF,
30683 IX86_BUILTIN_SCATTERDIV4SF,
30684 IX86_BUILTIN_SCATTERDIV4DF,
30685 IX86_BUILTIN_SCATTERDIV2DF,
30686 IX86_BUILTIN_SCATTERSIV8SI,
30687 IX86_BUILTIN_SCATTERSIV4SI,
30688 IX86_BUILTIN_SCATTERSIV4DI,
30689 IX86_BUILTIN_SCATTERSIV2DI,
30690 IX86_BUILTIN_SCATTERDIV8SI,
30691 IX86_BUILTIN_SCATTERDIV4SI,
30692 IX86_BUILTIN_SCATTERDIV4DI,
30693 IX86_BUILTIN_SCATTERDIV2DI,
30694 /* Alternate 4- and 8-element gather/scatter for the vectorizer
30695 where all operands are 32-byte or 64-byte wide respectively. */
30696 IX86_BUILTIN_GATHERALTSIV4DF,
30697 IX86_BUILTIN_GATHERALTDIV8SF,
30698 IX86_BUILTIN_GATHERALTSIV4DI,
30699 IX86_BUILTIN_GATHERALTDIV8SI,
30700 IX86_BUILTIN_GATHER3ALTDIV16SF,
30701 IX86_BUILTIN_GATHER3ALTDIV16SI,
30702 IX86_BUILTIN_GATHER3ALTSIV4DF,
30703 IX86_BUILTIN_GATHER3ALTDIV8SF,
30704 IX86_BUILTIN_GATHER3ALTSIV4DI,
30705 IX86_BUILTIN_GATHER3ALTDIV8SI,
30706 IX86_BUILTIN_GATHER3ALTSIV8DF,
30707 IX86_BUILTIN_GATHER3ALTSIV8DI,
30708 IX86_BUILTIN_GATHER3DIV16SF,
30709 IX86_BUILTIN_GATHER3DIV16SI,
30710 IX86_BUILTIN_GATHER3DIV8DF,
30711 IX86_BUILTIN_GATHER3DIV8DI,
30712 IX86_BUILTIN_GATHER3SIV16SF,
30713 IX86_BUILTIN_GATHER3SIV16SI,
30714 IX86_BUILTIN_GATHER3SIV8DF,
30715 IX86_BUILTIN_GATHER3SIV8DI,
30716 IX86_BUILTIN_SCATTERALTSIV8DF,
30717 IX86_BUILTIN_SCATTERALTDIV16SF,
30718 IX86_BUILTIN_SCATTERALTSIV8DI,
30719 IX86_BUILTIN_SCATTERALTDIV16SI,
30720 IX86_BUILTIN_SCATTERDIV16SF,
30721 IX86_BUILTIN_SCATTERDIV16SI,
30722 IX86_BUILTIN_SCATTERDIV8DF,
30723 IX86_BUILTIN_SCATTERDIV8DI,
30724 IX86_BUILTIN_SCATTERSIV16SF,
30725 IX86_BUILTIN_SCATTERSIV16SI,
30726 IX86_BUILTIN_SCATTERSIV8DF,
30727 IX86_BUILTIN_SCATTERSIV8DI,
30728 IX86_BUILTIN_GATHERPFQPD,
30729 IX86_BUILTIN_GATHERPFDPS,
30730 IX86_BUILTIN_GATHERPFDPD,
30731 IX86_BUILTIN_GATHERPFQPS,
30732 IX86_BUILTIN_SCATTERPFDPD,
30733 IX86_BUILTIN_SCATTERPFDPS,
30734 IX86_BUILTIN_SCATTERPFQPD,
30735 IX86_BUILTIN_SCATTERPFQPS,
30736 IX86_BUILTIN_CLWB,
30737 IX86_BUILTIN_CLFLUSHOPT,
30738 IX86_BUILTIN_INFQ,
30739 IX86_BUILTIN_HUGE_VALQ,
30740 IX86_BUILTIN_NANQ,
30741 IX86_BUILTIN_NANSQ,
30742 IX86_BUILTIN_XABORT,
30743 IX86_BUILTIN_ADDCARRYX32,
30744 IX86_BUILTIN_ADDCARRYX64,
30745 IX86_BUILTIN_SBB32,
30746 IX86_BUILTIN_SBB64,
30747 IX86_BUILTIN_RDRAND16_STEP,
30748 IX86_BUILTIN_RDRAND32_STEP,
30749 IX86_BUILTIN_RDRAND64_STEP,
30750 IX86_BUILTIN_RDSEED16_STEP,
30751 IX86_BUILTIN_RDSEED32_STEP,
30752 IX86_BUILTIN_RDSEED64_STEP,
30753 IX86_BUILTIN_MONITORX,
30754 IX86_BUILTIN_MWAITX,
30755 IX86_BUILTIN_CFSTRING,
30756 IX86_BUILTIN_CPU_INIT,
30757 IX86_BUILTIN_CPU_IS,
30758 IX86_BUILTIN_CPU_SUPPORTS,
30759 IX86_BUILTIN_READ_FLAGS,
30760 IX86_BUILTIN_WRITE_FLAGS,
30761
30762 /* All the remaining builtins are tracked in bdesc_* arrays in
30763 i386-builtin.def. Don't add any IX86_BUILTIN_* enumerators after
30764 this point. */
30765 #define BDESC(mask, icode, name, code, comparison, flag) \
30766 code,
30767 #define BDESC_FIRST(kind, kindu, mask, icode, name, code, comparison, flag) \
30768 code, \
30769 IX86_BUILTIN__BDESC_##kindu##_FIRST = code,
30770 #define BDESC_END(kind, next_kind)
30771
30772 #include "i386-builtin.def"
30773
30774 #undef BDESC
30775 #undef BDESC_FIRST
30776 #undef BDESC_END
30777
30778 IX86_BUILTIN_MAX,
30779
30780 IX86_BUILTIN__BDESC_MAX_FIRST = IX86_BUILTIN_MAX,
30781
30782 /* Now just the aliases for bdesc_* start/end. */
30783 #define BDESC(mask, icode, name, code, comparison, flag)
30784 #define BDESC_FIRST(kind, kindu, mask, icode, name, code, comparison, flag)
30785 #define BDESC_END(kind, next_kind) \
30786 IX86_BUILTIN__BDESC_##kind##_LAST \
30787 = IX86_BUILTIN__BDESC_##next_kind##_FIRST - 1,
30788
30789 #include "i386-builtin.def"
30790
30791 #undef BDESC
30792 #undef BDESC_FIRST
30793 #undef BDESC_END
30794
30795 /* Just to make sure there is no comma after the last enumerator. */
30796 IX86_BUILTIN__BDESC_MAX_LAST = IX86_BUILTIN__BDESC_MAX_FIRST
30797 };
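/* Illustrative sketch of the two expansions above, using a hypothetical
   i386-builtin.def entry (real entries live in that file):

	BDESC_FIRST (args, ARGS, OPTION_MASK_ISA_SSE2, CODE_FOR_nothing,
		     "__builtin_ia32_foo", IX86_BUILTIN_FOO, UNKNOWN, 0)

   contributes on the first #include pass

	IX86_BUILTIN_FOO,
	IX86_BUILTIN__BDESC_ARGS_FIRST = IX86_BUILTIN_FOO,

   while the second pass defines only the matching alias

	IX86_BUILTIN__BDESC_ARGS_LAST
	  = IX86_BUILTIN__BDESC_<next kind>_FIRST - 1,

   so each bdesc_* range is bracketed by *_FIRST/*_LAST enumerators.  */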
30798
30799 /* Table for the ix86 builtin decls. */
30800 static GTY(()) tree ix86_builtins[(int) IX86_BUILTIN_MAX];
30801
30802 /* Table of all of the builtin functions that are possible with different ISAs
30803 but are waiting to be built until a function is declared to use that
30804 ISA. */
30805 struct builtin_isa {
30806 const char *name; /* function name */
30807 enum ix86_builtin_func_type tcode; /* type to use in the declaration */
30808 HOST_WIDE_INT isa; /* isa_flags this builtin is defined for */
30809 bool const_p; /* true if the declaration is constant */
30810 bool leaf_p; /* true if the declaration has leaf attribute */
30811 bool nothrow_p; /* true if the declaration has nothrow attribute */
30812 bool set_and_not_built_p;
30813 };
30814
30815 static struct builtin_isa ix86_builtins_isa[(int) IX86_BUILTIN_MAX];
30816
30817 /* Bits that can still enable the inclusion of some builtin. */
30818 static HOST_WIDE_INT deferred_isa_values = 0;
30819
30820 /* Add an ix86 target builtin function with CODE, NAME and TYPE. Save the MASK
30821 of which isa_flags to use in the ix86_builtins_isa array. Stores the
30822 function decl in the ix86_builtins array. Returns the function decl or
30823 NULL_TREE, if the builtin was not added.
30824
30825 If the front end has a special hook for builtin functions, delay adding
30826 builtin functions that aren't in the current ISA until the ISA is changed
30827 with function specific optimization. Doing so can save about 300K for the
30828 default compiler. When the builtin is expanded, check at that time whether
30829 it is valid.
30830
30831 If the front end doesn't have a special hook, record all builtins, even
30832 those that aren't in the current ISA, in case the user uses function
30833 specific options for a different ISA; that way we don't get scope errors
30834 if a builtin is added in the middle of a function scope. */
30835
30836 static inline tree
30837 def_builtin (HOST_WIDE_INT mask, const char *name,
30838 enum ix86_builtin_func_type tcode,
30839 enum ix86_builtins code)
30840 {
30841 tree decl = NULL_TREE;
30842
30843 if (!(mask & OPTION_MASK_ISA_64BIT) || TARGET_64BIT)
30844 {
30845 ix86_builtins_isa[(int) code].isa = mask;
30846
30847 /* OPTION_MASK_ISA_AVX512VL has a special meaning. Unlike the generic case,
30848 where any set bit means the built-in is enabled, this bit must be *and-ed*
30849 with another one. E.g.: OPTION_MASK_ISA_AVX512DQ | OPTION_MASK_ISA_AVX512VL
30850 means that *both* cpuid bits must be set for the built-in to be available.
30851 Handle this here. */
30852 if (mask & ix86_isa_flags & OPTION_MASK_ISA_AVX512VL)
30853 mask &= ~OPTION_MASK_ISA_AVX512VL;
30854
30855 mask &= ~OPTION_MASK_ISA_64BIT;
30856 if (mask == 0
30857 || (mask & ix86_isa_flags) != 0
30858 || (lang_hooks.builtin_function
30859 == lang_hooks.builtin_function_ext_scope))
30860
30861 {
30862 tree type = ix86_get_builtin_func_type (tcode);
30863 decl = add_builtin_function (name, type, code, BUILT_IN_MD,
30864 NULL, NULL_TREE);
30865 ix86_builtins[(int) code] = decl;
30866 ix86_builtins_isa[(int) code].set_and_not_built_p = false;
30867 }
30868 else
30869 {
30870 /* Only a MASK whose set_and_not_built_p == true can still potentially
30871 enable the builtin later. */
30872 deferred_isa_values |= mask;
30873 ix86_builtins[(int) code] = NULL_TREE;
30874 ix86_builtins_isa[(int) code].tcode = tcode;
30875 ix86_builtins_isa[(int) code].name = name;
30876 ix86_builtins_isa[(int) code].leaf_p = false;
30877 ix86_builtins_isa[(int) code].nothrow_p = false;
30878 ix86_builtins_isa[(int) code].const_p = false;
30879 ix86_builtins_isa[(int) code].set_and_not_built_p = true;
30880 }
30881 }
30882
30883 return decl;
30884 }
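/* Illustrative sketch of the deferral described above (user-level view, not
   part of this file): when def_builtin only records a builtin because its
   ISA is not enabled on the command line, ix86_add_new_builtins (below)
   materialises it as soon as a function switches the ISA on, e.g.

	__attribute__ ((target ("avx2")))
	__m256i use_gather (__m256i idx, int const *base)
	{
	  return _mm256_i32gather_epi32 (base, idx, 4);
	}

   at which point the deferred AVX2 builtins behind the intrinsic become
   available to that function.  */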
30885
30886 /* Like def_builtin, but also marks the function decl "const". */
30887
30888 static inline tree
30889 def_builtin_const (HOST_WIDE_INT mask, const char *name,
30890 enum ix86_builtin_func_type tcode, enum ix86_builtins code)
30891 {
30892 tree decl = def_builtin (mask, name, tcode, code);
30893 if (decl)
30894 TREE_READONLY (decl) = 1;
30895 else
30896 ix86_builtins_isa[(int) code].const_p = true;
30897
30898 return decl;
30899 }
30900
30901 /* Add any new builtin functions for a given ISA that may not have been
30902 declared. This saves a bit of space compared to adding all of the
30903 declarations to the tree, even if we didn't use them. */
30904
30905 static void
30906 ix86_add_new_builtins (HOST_WIDE_INT isa)
30907 {
30908 if ((isa & deferred_isa_values) == 0)
30909 return;
30910
30911 /* Bits in ISA can now be removed from the set of deferred isa values. */
30912 deferred_isa_values &= ~isa;
30913
30914 int i;
30915 tree saved_current_target_pragma = current_target_pragma;
30916 current_target_pragma = NULL_TREE;
30917
30918 for (i = 0; i < (int)IX86_BUILTIN_MAX; i++)
30919 {
30920 if ((ix86_builtins_isa[i].isa & isa) != 0
30921 && ix86_builtins_isa[i].set_and_not_built_p)
30922 {
30923 tree decl, type;
30924
30925 /* Don't define the builtin again. */
30926 ix86_builtins_isa[i].set_and_not_built_p = false;
30927
30928 type = ix86_get_builtin_func_type (ix86_builtins_isa[i].tcode);
30929 decl = add_builtin_function_ext_scope (ix86_builtins_isa[i].name,
30930 type, i, BUILT_IN_MD, NULL,
30931 NULL_TREE);
30932
30933 ix86_builtins[i] = decl;
30934 if (ix86_builtins_isa[i].const_p)
30935 TREE_READONLY (decl) = 1;
30936 if (ix86_builtins_isa[i].leaf_p)
30937 DECL_ATTRIBUTES (decl) = build_tree_list (get_identifier ("leaf"),
30938 NULL_TREE);
30939 if (ix86_builtins_isa[i].nothrow_p)
30940 TREE_NOTHROW (decl) = 1;
30941 }
30942 }
30943
30944 current_target_pragma = saved_current_target_pragma;
30945 }
30946
30947 /* Bits for builtin_description.flag. */
30948
30949 /* Set when we don't support the comparison natively, and should
30950 swap_comparison in order to support it. */
30951 #define BUILTIN_DESC_SWAP_OPERANDS 1
30952
30953 struct builtin_description
30954 {
30955 const HOST_WIDE_INT mask;
30956 const enum insn_code icode;
30957 const char *const name;
30958 const enum ix86_builtins code;
30959 const enum rtx_code comparison;
30960 const int flag;
30961 };
30962
30963 #define MULTI_ARG_4_DF2_DI_I V2DF_FTYPE_V2DF_V2DF_V2DI_INT
30964 #define MULTI_ARG_4_DF2_DI_I1 V4DF_FTYPE_V4DF_V4DF_V4DI_INT
30965 #define MULTI_ARG_4_SF2_SI_I V4SF_FTYPE_V4SF_V4SF_V4SI_INT
30966 #define MULTI_ARG_4_SF2_SI_I1 V8SF_FTYPE_V8SF_V8SF_V8SI_INT
30967 #define MULTI_ARG_3_SF V4SF_FTYPE_V4SF_V4SF_V4SF
30968 #define MULTI_ARG_3_DF V2DF_FTYPE_V2DF_V2DF_V2DF
30969 #define MULTI_ARG_3_SF2 V8SF_FTYPE_V8SF_V8SF_V8SF
30970 #define MULTI_ARG_3_DF2 V4DF_FTYPE_V4DF_V4DF_V4DF
30971 #define MULTI_ARG_3_DI V2DI_FTYPE_V2DI_V2DI_V2DI
30972 #define MULTI_ARG_3_SI V4SI_FTYPE_V4SI_V4SI_V4SI
30973 #define MULTI_ARG_3_SI_DI V4SI_FTYPE_V4SI_V4SI_V2DI
30974 #define MULTI_ARG_3_HI V8HI_FTYPE_V8HI_V8HI_V8HI
30975 #define MULTI_ARG_3_HI_SI V8HI_FTYPE_V8HI_V8HI_V4SI
30976 #define MULTI_ARG_3_QI V16QI_FTYPE_V16QI_V16QI_V16QI
30977 #define MULTI_ARG_3_DI2 V4DI_FTYPE_V4DI_V4DI_V4DI
30978 #define MULTI_ARG_3_SI2 V8SI_FTYPE_V8SI_V8SI_V8SI
30979 #define MULTI_ARG_3_HI2 V16HI_FTYPE_V16HI_V16HI_V16HI
30980 #define MULTI_ARG_3_QI2 V32QI_FTYPE_V32QI_V32QI_V32QI
30981 #define MULTI_ARG_2_SF V4SF_FTYPE_V4SF_V4SF
30982 #define MULTI_ARG_2_DF V2DF_FTYPE_V2DF_V2DF
30983 #define MULTI_ARG_2_DI V2DI_FTYPE_V2DI_V2DI
30984 #define MULTI_ARG_2_SI V4SI_FTYPE_V4SI_V4SI
30985 #define MULTI_ARG_2_HI V8HI_FTYPE_V8HI_V8HI
30986 #define MULTI_ARG_2_QI V16QI_FTYPE_V16QI_V16QI
30987 #define MULTI_ARG_2_DI_IMM V2DI_FTYPE_V2DI_SI
30988 #define MULTI_ARG_2_SI_IMM V4SI_FTYPE_V4SI_SI
30989 #define MULTI_ARG_2_HI_IMM V8HI_FTYPE_V8HI_SI
30990 #define MULTI_ARG_2_QI_IMM V16QI_FTYPE_V16QI_SI
30991 #define MULTI_ARG_2_DI_CMP V2DI_FTYPE_V2DI_V2DI_CMP
30992 #define MULTI_ARG_2_SI_CMP V4SI_FTYPE_V4SI_V4SI_CMP
30993 #define MULTI_ARG_2_HI_CMP V8HI_FTYPE_V8HI_V8HI_CMP
30994 #define MULTI_ARG_2_QI_CMP V16QI_FTYPE_V16QI_V16QI_CMP
30995 #define MULTI_ARG_2_SF_TF V4SF_FTYPE_V4SF_V4SF_TF
30996 #define MULTI_ARG_2_DF_TF V2DF_FTYPE_V2DF_V2DF_TF
30997 #define MULTI_ARG_2_DI_TF V2DI_FTYPE_V2DI_V2DI_TF
30998 #define MULTI_ARG_2_SI_TF V4SI_FTYPE_V4SI_V4SI_TF
30999 #define MULTI_ARG_2_HI_TF V8HI_FTYPE_V8HI_V8HI_TF
31000 #define MULTI_ARG_2_QI_TF V16QI_FTYPE_V16QI_V16QI_TF
31001 #define MULTI_ARG_1_SF V4SF_FTYPE_V4SF
31002 #define MULTI_ARG_1_DF V2DF_FTYPE_V2DF
31003 #define MULTI_ARG_1_SF2 V8SF_FTYPE_V8SF
31004 #define MULTI_ARG_1_DF2 V4DF_FTYPE_V4DF
31005 #define MULTI_ARG_1_DI V2DI_FTYPE_V2DI
31006 #define MULTI_ARG_1_SI V4SI_FTYPE_V4SI
31007 #define MULTI_ARG_1_HI V8HI_FTYPE_V8HI
31008 #define MULTI_ARG_1_QI V16QI_FTYPE_V16QI
31009 #define MULTI_ARG_1_SI_DI V2DI_FTYPE_V4SI
31010 #define MULTI_ARG_1_HI_DI V2DI_FTYPE_V8HI
31011 #define MULTI_ARG_1_HI_SI V4SI_FTYPE_V8HI
31012 #define MULTI_ARG_1_QI_DI V2DI_FTYPE_V16QI
31013 #define MULTI_ARG_1_QI_SI V4SI_FTYPE_V16QI
31014 #define MULTI_ARG_1_QI_HI V8HI_FTYPE_V16QI
31015
31016 #define BDESC(mask, icode, name, code, comparison, flag) \
31017 { mask, icode, name, code, comparison, flag },
31018 #define BDESC_FIRST(kind, kindu, mask, icode, name, code, comparison, flag) \
31019 static const struct builtin_description bdesc_##kind[] = \
31020 { \
31021 BDESC (mask, icode, name, code, comparison, flag)
31022 #define BDESC_END(kind, next_kind) \
31023 };
31024
31025 #include "i386-builtin.def"
31026
31027 #undef BDESC
31028 #undef BDESC_FIRST
31029 #undef BDESC_END
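/* Illustrative sketch of the expansion above, with the same hypothetical
   i386-builtin.def entry as before: on this pass BDESC_FIRST opens an array
   definition, each BDESC adds an initializer, and BDESC_END emits the
   closing brace, so the args kind becomes roughly

	static const struct builtin_description bdesc_args[] =
	{
	  { OPTION_MASK_ISA_SSE2, CODE_FOR_nothing, "__builtin_ia32_foo",
	    IX86_BUILTIN_FOO, UNKNOWN, 0 },
	  ...
	};

   one bdesc_<kind> array per kind, indexed in the same order as the enum so
   the BDESC_VERIFY checks below can use direct array accesses.  */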
31030 \f
31031 /* TM vector builtins. */
31032
31033 /* Reuse the existing x86-specific `struct builtin_description' because
31034 we're lazy. Add casts to make them fit. */
31035 static const struct builtin_description bdesc_tm[] =
31036 {
31037 { OPTION_MASK_ISA_MMX, CODE_FOR_nothing, "__builtin__ITM_WM64", (enum ix86_builtins) BUILT_IN_TM_STORE_M64, UNKNOWN, VOID_FTYPE_PV2SI_V2SI },
31038 { OPTION_MASK_ISA_MMX, CODE_FOR_nothing, "__builtin__ITM_WaRM64", (enum ix86_builtins) BUILT_IN_TM_STORE_WAR_M64, UNKNOWN, VOID_FTYPE_PV2SI_V2SI },
31039 { OPTION_MASK_ISA_MMX, CODE_FOR_nothing, "__builtin__ITM_WaWM64", (enum ix86_builtins) BUILT_IN_TM_STORE_WAW_M64, UNKNOWN, VOID_FTYPE_PV2SI_V2SI },
31040 { OPTION_MASK_ISA_MMX, CODE_FOR_nothing, "__builtin__ITM_RM64", (enum ix86_builtins) BUILT_IN_TM_LOAD_M64, UNKNOWN, V2SI_FTYPE_PCV2SI },
31041 { OPTION_MASK_ISA_MMX, CODE_FOR_nothing, "__builtin__ITM_RaRM64", (enum ix86_builtins) BUILT_IN_TM_LOAD_RAR_M64, UNKNOWN, V2SI_FTYPE_PCV2SI },
31042 { OPTION_MASK_ISA_MMX, CODE_FOR_nothing, "__builtin__ITM_RaWM64", (enum ix86_builtins) BUILT_IN_TM_LOAD_RAW_M64, UNKNOWN, V2SI_FTYPE_PCV2SI },
31043 { OPTION_MASK_ISA_MMX, CODE_FOR_nothing, "__builtin__ITM_RfWM64", (enum ix86_builtins) BUILT_IN_TM_LOAD_RFW_M64, UNKNOWN, V2SI_FTYPE_PCV2SI },
31044
31045 { OPTION_MASK_ISA_SSE, CODE_FOR_nothing, "__builtin__ITM_WM128", (enum ix86_builtins) BUILT_IN_TM_STORE_M128, UNKNOWN, VOID_FTYPE_PV4SF_V4SF },
31046 { OPTION_MASK_ISA_SSE, CODE_FOR_nothing, "__builtin__ITM_WaRM128", (enum ix86_builtins) BUILT_IN_TM_STORE_WAR_M128, UNKNOWN, VOID_FTYPE_PV4SF_V4SF },
31047 { OPTION_MASK_ISA_SSE, CODE_FOR_nothing, "__builtin__ITM_WaWM128", (enum ix86_builtins) BUILT_IN_TM_STORE_WAW_M128, UNKNOWN, VOID_FTYPE_PV4SF_V4SF },
31048 { OPTION_MASK_ISA_SSE, CODE_FOR_nothing, "__builtin__ITM_RM128", (enum ix86_builtins) BUILT_IN_TM_LOAD_M128, UNKNOWN, V4SF_FTYPE_PCV4SF },
31049 { OPTION_MASK_ISA_SSE, CODE_FOR_nothing, "__builtin__ITM_RaRM128", (enum ix86_builtins) BUILT_IN_TM_LOAD_RAR_M128, UNKNOWN, V4SF_FTYPE_PCV4SF },
31050 { OPTION_MASK_ISA_SSE, CODE_FOR_nothing, "__builtin__ITM_RaWM128", (enum ix86_builtins) BUILT_IN_TM_LOAD_RAW_M128, UNKNOWN, V4SF_FTYPE_PCV4SF },
31051 { OPTION_MASK_ISA_SSE, CODE_FOR_nothing, "__builtin__ITM_RfWM128", (enum ix86_builtins) BUILT_IN_TM_LOAD_RFW_M128, UNKNOWN, V4SF_FTYPE_PCV4SF },
31052
31053 { OPTION_MASK_ISA_AVX, CODE_FOR_nothing, "__builtin__ITM_WM256", (enum ix86_builtins) BUILT_IN_TM_STORE_M256, UNKNOWN, VOID_FTYPE_PV8SF_V8SF },
31054 { OPTION_MASK_ISA_AVX, CODE_FOR_nothing, "__builtin__ITM_WaRM256", (enum ix86_builtins) BUILT_IN_TM_STORE_WAR_M256, UNKNOWN, VOID_FTYPE_PV8SF_V8SF },
31055 { OPTION_MASK_ISA_AVX, CODE_FOR_nothing, "__builtin__ITM_WaWM256", (enum ix86_builtins) BUILT_IN_TM_STORE_WAW_M256, UNKNOWN, VOID_FTYPE_PV8SF_V8SF },
31056 { OPTION_MASK_ISA_AVX, CODE_FOR_nothing, "__builtin__ITM_RM256", (enum ix86_builtins) BUILT_IN_TM_LOAD_M256, UNKNOWN, V8SF_FTYPE_PCV8SF },
31057 { OPTION_MASK_ISA_AVX, CODE_FOR_nothing, "__builtin__ITM_RaRM256", (enum ix86_builtins) BUILT_IN_TM_LOAD_RAR_M256, UNKNOWN, V8SF_FTYPE_PCV8SF },
31058 { OPTION_MASK_ISA_AVX, CODE_FOR_nothing, "__builtin__ITM_RaWM256", (enum ix86_builtins) BUILT_IN_TM_LOAD_RAW_M256, UNKNOWN, V8SF_FTYPE_PCV8SF },
31059 { OPTION_MASK_ISA_AVX, CODE_FOR_nothing, "__builtin__ITM_RfWM256", (enum ix86_builtins) BUILT_IN_TM_LOAD_RFW_M256, UNKNOWN, V8SF_FTYPE_PCV8SF },
31060
31061 { OPTION_MASK_ISA_MMX, CODE_FOR_nothing, "__builtin__ITM_LM64", (enum ix86_builtins) BUILT_IN_TM_LOG_M64, UNKNOWN, VOID_FTYPE_PCVOID },
31062 { OPTION_MASK_ISA_SSE, CODE_FOR_nothing, "__builtin__ITM_LM128", (enum ix86_builtins) BUILT_IN_TM_LOG_M128, UNKNOWN, VOID_FTYPE_PCVOID },
31063 { OPTION_MASK_ISA_AVX, CODE_FOR_nothing, "__builtin__ITM_LM256", (enum ix86_builtins) BUILT_IN_TM_LOG_M256, UNKNOWN, VOID_FTYPE_PCVOID },
31064 };
31065
31066 /* Initialize the transactional memory vector load/store builtins. */
31067
31068 static void
31069 ix86_init_tm_builtins (void)
31070 {
31071 enum ix86_builtin_func_type ftype;
31072 const struct builtin_description *d;
31073 size_t i;
31074 tree decl;
31075 tree attrs_load, attrs_type_load, attrs_store, attrs_type_store;
31076 tree attrs_log, attrs_type_log;
31077
31078 if (!flag_tm)
31079 return;
31080
31081 /* If there are no builtins defined, we must be compiling in a
31082 language without trans-mem support. */
31083 if (!builtin_decl_explicit_p (BUILT_IN_TM_LOAD_1))
31084 return;
31085
31086 /* Use whatever attributes a normal TM load has. */
31087 decl = builtin_decl_explicit (BUILT_IN_TM_LOAD_1);
31088 attrs_load = DECL_ATTRIBUTES (decl);
31089 attrs_type_load = TYPE_ATTRIBUTES (TREE_TYPE (decl));
31090 /* Use whatever attributes a normal TM store has. */
31091 decl = builtin_decl_explicit (BUILT_IN_TM_STORE_1);
31092 attrs_store = DECL_ATTRIBUTES (decl);
31093 attrs_type_store = TYPE_ATTRIBUTES (TREE_TYPE (decl));
31094 /* Use whatever attributes a normal TM log has. */
31095 decl = builtin_decl_explicit (BUILT_IN_TM_LOG);
31096 attrs_log = DECL_ATTRIBUTES (decl);
31097 attrs_type_log = TYPE_ATTRIBUTES (TREE_TYPE (decl));
31098
31099 for (i = 0, d = bdesc_tm;
31100 i < ARRAY_SIZE (bdesc_tm);
31101 i++, d++)
31102 {
31103 if ((d->mask & ix86_isa_flags) != 0
31104 || (lang_hooks.builtin_function
31105 == lang_hooks.builtin_function_ext_scope))
31106 {
31107 tree type, attrs, attrs_type;
31108 enum built_in_function code = (enum built_in_function) d->code;
31109
31110 ftype = (enum ix86_builtin_func_type) d->flag;
31111 type = ix86_get_builtin_func_type (ftype);
31112
31113 if (BUILTIN_TM_LOAD_P (code))
31114 {
31115 attrs = attrs_load;
31116 attrs_type = attrs_type_load;
31117 }
31118 else if (BUILTIN_TM_STORE_P (code))
31119 {
31120 attrs = attrs_store;
31121 attrs_type = attrs_type_store;
31122 }
31123 else
31124 {
31125 attrs = attrs_log;
31126 attrs_type = attrs_type_log;
31127 }
31128 decl = add_builtin_function (d->name, type, code, BUILT_IN_NORMAL,
31129 /* The builtin without the prefix for
31130 calling it directly. */
31131 d->name + strlen ("__builtin_"),
31132 attrs);
31133 /* add_builtin_function() will set the DECL_ATTRIBUTES, now
31134 set the TYPE_ATTRIBUTES. */
31135 decl_attributes (&TREE_TYPE (decl), attrs_type, ATTR_FLAG_BUILT_IN);
31136
31137 set_builtin_decl (code, decl, false);
31138 }
31139 }
31140 }
31141
31142 /* Macros for verification of enum ix86_builtins order. */
31143 #define BDESC_VERIFY(x, y, z) \
31144 gcc_checking_assert ((x) == (enum ix86_builtins) ((y) + (z)))
31145 #define BDESC_VERIFYS(x, y, z) \
31146 STATIC_ASSERT ((x) == (enum ix86_builtins) ((y) + (z)))
31147
31148 BDESC_VERIFYS (IX86_BUILTIN__BDESC_PCMPESTR_FIRST,
31149 IX86_BUILTIN__BDESC_COMI_LAST, 1);
31150 BDESC_VERIFYS (IX86_BUILTIN__BDESC_PCMPISTR_FIRST,
31151 IX86_BUILTIN__BDESC_PCMPESTR_LAST, 1);
31152 BDESC_VERIFYS (IX86_BUILTIN__BDESC_SPECIAL_ARGS_FIRST,
31153 IX86_BUILTIN__BDESC_PCMPISTR_LAST, 1);
31154 BDESC_VERIFYS (IX86_BUILTIN__BDESC_ARGS_FIRST,
31155 IX86_BUILTIN__BDESC_SPECIAL_ARGS_LAST, 1);
31156 BDESC_VERIFYS (IX86_BUILTIN__BDESC_ROUND_ARGS_FIRST,
31157 IX86_BUILTIN__BDESC_ARGS_LAST, 1);
31158 BDESC_VERIFYS (IX86_BUILTIN__BDESC_MPX_FIRST,
31159 IX86_BUILTIN__BDESC_ROUND_ARGS_LAST, 1);
31160 BDESC_VERIFYS (IX86_BUILTIN__BDESC_MPX_CONST_FIRST,
31161 IX86_BUILTIN__BDESC_MPX_LAST, 1);
31162 BDESC_VERIFYS (IX86_BUILTIN__BDESC_MULTI_ARG_FIRST,
31163 IX86_BUILTIN__BDESC_MPX_CONST_LAST, 1);
31164 BDESC_VERIFYS (IX86_BUILTIN_MAX,
31165 IX86_BUILTIN__BDESC_MULTI_ARG_LAST, 1);
31166
31167 /* Set up all the MMX/SSE builtins, even builtins for instructions that are not
31168 in the current target ISA, so that the user can compile particular modules
31169 with target specific options that differ from the command line
31170 options. */
31171 static void
31172 ix86_init_mmx_sse_builtins (void)
31173 {
31174 const struct builtin_description * d;
31175 enum ix86_builtin_func_type ftype;
31176 size_t i;
31177
31178 /* Add all special builtins with variable number of operands. */
31179 for (i = 0, d = bdesc_special_args;
31180 i < ARRAY_SIZE (bdesc_special_args);
31181 i++, d++)
31182 {
31183 BDESC_VERIFY (d->code, IX86_BUILTIN__BDESC_SPECIAL_ARGS_FIRST, i);
31184 if (d->name == 0)
31185 continue;
31186
31187 ftype = (enum ix86_builtin_func_type) d->flag;
31188 def_builtin (d->mask, d->name, ftype, d->code);
31189 }
31190 BDESC_VERIFYS (IX86_BUILTIN__BDESC_SPECIAL_ARGS_LAST,
31191 IX86_BUILTIN__BDESC_SPECIAL_ARGS_FIRST,
31192 ARRAY_SIZE (bdesc_special_args) - 1);
31193
31194 /* Add all builtins with variable number of operands. */
31195 for (i = 0, d = bdesc_args;
31196 i < ARRAY_SIZE (bdesc_args);
31197 i++, d++)
31198 {
31199 BDESC_VERIFY (d->code, IX86_BUILTIN__BDESC_ARGS_FIRST, i);
31200 if (d->name == 0)
31201 continue;
31202
31203 ftype = (enum ix86_builtin_func_type) d->flag;
31204 def_builtin_const (d->mask, d->name, ftype, d->code);
31205 }
31206 BDESC_VERIFYS (IX86_BUILTIN__BDESC_ARGS_LAST,
31207 IX86_BUILTIN__BDESC_ARGS_FIRST,
31208 ARRAY_SIZE (bdesc_args) - 1);
31209
31210 /* Add all builtins with rounding. */
31211 for (i = 0, d = bdesc_round_args;
31212 i < ARRAY_SIZE (bdesc_round_args);
31213 i++, d++)
31214 {
31215 BDESC_VERIFY (d->code, IX86_BUILTIN__BDESC_ROUND_ARGS_FIRST, i);
31216 if (d->name == 0)
31217 continue;
31218
31219 ftype = (enum ix86_builtin_func_type) d->flag;
31220 def_builtin_const (d->mask, d->name, ftype, d->code);
31221 }
31222 BDESC_VERIFYS (IX86_BUILTIN__BDESC_ROUND_ARGS_LAST,
31223 IX86_BUILTIN__BDESC_ROUND_ARGS_FIRST,
31224 ARRAY_SIZE (bdesc_round_args) - 1);
31225
31226 /* pcmpestr[im] insns. */
31227 for (i = 0, d = bdesc_pcmpestr;
31228 i < ARRAY_SIZE (bdesc_pcmpestr);
31229 i++, d++)
31230 {
31231 BDESC_VERIFY (d->code, IX86_BUILTIN__BDESC_PCMPESTR_FIRST, i);
31232 if (d->code == IX86_BUILTIN_PCMPESTRM128)
31233 ftype = V16QI_FTYPE_V16QI_INT_V16QI_INT_INT;
31234 else
31235 ftype = INT_FTYPE_V16QI_INT_V16QI_INT_INT;
31236 def_builtin_const (d->mask, d->name, ftype, d->code);
31237 }
31238 BDESC_VERIFYS (IX86_BUILTIN__BDESC_PCMPESTR_LAST,
31239 IX86_BUILTIN__BDESC_PCMPESTR_FIRST,
31240 ARRAY_SIZE (bdesc_pcmpestr) - 1);
31241
31242 /* pcmpistr[im] insns. */
31243 for (i = 0, d = bdesc_pcmpistr;
31244 i < ARRAY_SIZE (bdesc_pcmpistr);
31245 i++, d++)
31246 {
31247 BDESC_VERIFY (d->code, IX86_BUILTIN__BDESC_PCMPISTR_FIRST, i);
31248 if (d->code == IX86_BUILTIN_PCMPISTRM128)
31249 ftype = V16QI_FTYPE_V16QI_V16QI_INT;
31250 else
31251 ftype = INT_FTYPE_V16QI_V16QI_INT;
31252 def_builtin_const (d->mask, d->name, ftype, d->code);
31253 }
31254 BDESC_VERIFYS (IX86_BUILTIN__BDESC_PCMPISTR_LAST,
31255 IX86_BUILTIN__BDESC_PCMPISTR_FIRST,
31256 ARRAY_SIZE (bdesc_pcmpistr) - 1);
31257
31258 /* comi/ucomi insns. */
31259 for (i = 0, d = bdesc_comi; i < ARRAY_SIZE (bdesc_comi); i++, d++)
31260 {
31261 BDESC_VERIFY (d->code, IX86_BUILTIN__BDESC_COMI_FIRST, i);
31262 if (d->mask == OPTION_MASK_ISA_SSE2)
31263 ftype = INT_FTYPE_V2DF_V2DF;
31264 else
31265 ftype = INT_FTYPE_V4SF_V4SF;
31266 def_builtin_const (d->mask, d->name, ftype, d->code);
31267 }
31268 BDESC_VERIFYS (IX86_BUILTIN__BDESC_COMI_LAST,
31269 IX86_BUILTIN__BDESC_COMI_FIRST,
31270 ARRAY_SIZE (bdesc_comi) - 1);
31271
31272 /* SSE */
31273 def_builtin (OPTION_MASK_ISA_SSE, "__builtin_ia32_ldmxcsr",
31274 VOID_FTYPE_UNSIGNED, IX86_BUILTIN_LDMXCSR);
31275 def_builtin (OPTION_MASK_ISA_SSE, "__builtin_ia32_stmxcsr",
31276 UNSIGNED_FTYPE_VOID, IX86_BUILTIN_STMXCSR);
31277
31278 /* SSE or 3DNow!A */
31279 def_builtin (OPTION_MASK_ISA_SSE | OPTION_MASK_ISA_3DNOW_A,
31280 "__builtin_ia32_maskmovq", VOID_FTYPE_V8QI_V8QI_PCHAR,
31281 IX86_BUILTIN_MASKMOVQ);
31282
31283 /* SSE2 */
31284 def_builtin (OPTION_MASK_ISA_SSE2, "__builtin_ia32_maskmovdqu",
31285 VOID_FTYPE_V16QI_V16QI_PCHAR, IX86_BUILTIN_MASKMOVDQU);
31286
31287 def_builtin (OPTION_MASK_ISA_SSE2, "__builtin_ia32_clflush",
31288 VOID_FTYPE_PCVOID, IX86_BUILTIN_CLFLUSH);
31289 x86_mfence = def_builtin (OPTION_MASK_ISA_SSE2, "__builtin_ia32_mfence",
31290 VOID_FTYPE_VOID, IX86_BUILTIN_MFENCE);
31291
31292 /* SSE3. */
31293 def_builtin (OPTION_MASK_ISA_SSE3, "__builtin_ia32_monitor",
31294 VOID_FTYPE_PCVOID_UNSIGNED_UNSIGNED, IX86_BUILTIN_MONITOR);
31295 def_builtin (OPTION_MASK_ISA_SSE3, "__builtin_ia32_mwait",
31296 VOID_FTYPE_UNSIGNED_UNSIGNED, IX86_BUILTIN_MWAIT);
31297
31298 /* AES */
31299 def_builtin_const (OPTION_MASK_ISA_AES, "__builtin_ia32_aesenc128",
31300 V2DI_FTYPE_V2DI_V2DI, IX86_BUILTIN_AESENC128);
31301 def_builtin_const (OPTION_MASK_ISA_AES, "__builtin_ia32_aesenclast128",
31302 V2DI_FTYPE_V2DI_V2DI, IX86_BUILTIN_AESENCLAST128);
31303 def_builtin_const (OPTION_MASK_ISA_AES, "__builtin_ia32_aesdec128",
31304 V2DI_FTYPE_V2DI_V2DI, IX86_BUILTIN_AESDEC128);
31305 def_builtin_const (OPTION_MASK_ISA_AES, "__builtin_ia32_aesdeclast128",
31306 V2DI_FTYPE_V2DI_V2DI, IX86_BUILTIN_AESDECLAST128);
31307 def_builtin_const (OPTION_MASK_ISA_AES, "__builtin_ia32_aesimc128",
31308 V2DI_FTYPE_V2DI, IX86_BUILTIN_AESIMC128);
31309 def_builtin_const (OPTION_MASK_ISA_AES, "__builtin_ia32_aeskeygenassist128",
31310 V2DI_FTYPE_V2DI_INT, IX86_BUILTIN_AESKEYGENASSIST128);
31311
31312 /* PCLMUL */
31313 def_builtin_const (OPTION_MASK_ISA_PCLMUL, "__builtin_ia32_pclmulqdq128",
31314 V2DI_FTYPE_V2DI_V2DI_INT, IX86_BUILTIN_PCLMULQDQ128);
31315
31316 /* RDRND */
31317 def_builtin (OPTION_MASK_ISA_RDRND, "__builtin_ia32_rdrand16_step",
31318 INT_FTYPE_PUSHORT, IX86_BUILTIN_RDRAND16_STEP);
31319 def_builtin (OPTION_MASK_ISA_RDRND, "__builtin_ia32_rdrand32_step",
31320 INT_FTYPE_PUNSIGNED, IX86_BUILTIN_RDRAND32_STEP);
31321 def_builtin (OPTION_MASK_ISA_RDRND | OPTION_MASK_ISA_64BIT,
31322 "__builtin_ia32_rdrand64_step", INT_FTYPE_PULONGLONG,
31323 IX86_BUILTIN_RDRAND64_STEP);
31324
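/* Illustrative sketch (user code, reached through the immintrin.h wrappers):
   the *_step builtins above store the random value through the pointer
   argument and return nonzero only if the hardware reported success, e.g.

	unsigned int r;
	while (!_rdrand32_step (&r))
	  ;	(retry until RDRAND signals valid entropy)

   The 64-bit variant is registered only together with OPTION_MASK_ISA_64BIT.  */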
31325 /* AVX2 */
31326 def_builtin (OPTION_MASK_ISA_AVX2, "__builtin_ia32_gathersiv2df",
31327 V2DF_FTYPE_V2DF_PCDOUBLE_V4SI_V2DF_INT,
31328 IX86_BUILTIN_GATHERSIV2DF);
31329
31330 def_builtin (OPTION_MASK_ISA_AVX2, "__builtin_ia32_gathersiv4df",
31331 V4DF_FTYPE_V4DF_PCDOUBLE_V4SI_V4DF_INT,
31332 IX86_BUILTIN_GATHERSIV4DF);
31333
31334 def_builtin (OPTION_MASK_ISA_AVX2, "__builtin_ia32_gatherdiv2df",
31335 V2DF_FTYPE_V2DF_PCDOUBLE_V2DI_V2DF_INT,
31336 IX86_BUILTIN_GATHERDIV2DF);
31337
31338 def_builtin (OPTION_MASK_ISA_AVX2, "__builtin_ia32_gatherdiv4df",
31339 V4DF_FTYPE_V4DF_PCDOUBLE_V4DI_V4DF_INT,
31340 IX86_BUILTIN_GATHERDIV4DF);
31341
31342 def_builtin (OPTION_MASK_ISA_AVX2, "__builtin_ia32_gathersiv4sf",
31343 V4SF_FTYPE_V4SF_PCFLOAT_V4SI_V4SF_INT,
31344 IX86_BUILTIN_GATHERSIV4SF);
31345
31346 def_builtin (OPTION_MASK_ISA_AVX2, "__builtin_ia32_gathersiv8sf",
31347 V8SF_FTYPE_V8SF_PCFLOAT_V8SI_V8SF_INT,
31348 IX86_BUILTIN_GATHERSIV8SF);
31349
31350 def_builtin (OPTION_MASK_ISA_AVX2, "__builtin_ia32_gatherdiv4sf",
31351 V4SF_FTYPE_V4SF_PCFLOAT_V2DI_V4SF_INT,
31352 IX86_BUILTIN_GATHERDIV4SF);
31353
31354 def_builtin (OPTION_MASK_ISA_AVX2, "__builtin_ia32_gatherdiv4sf256",
31355 V4SF_FTYPE_V4SF_PCFLOAT_V4DI_V4SF_INT,
31356 IX86_BUILTIN_GATHERDIV8SF);
31357
31358 def_builtin (OPTION_MASK_ISA_AVX2, "__builtin_ia32_gathersiv2di",
31359 V2DI_FTYPE_V2DI_PCINT64_V4SI_V2DI_INT,
31360 IX86_BUILTIN_GATHERSIV2DI);
31361
31362 def_builtin (OPTION_MASK_ISA_AVX2, "__builtin_ia32_gathersiv4di",
31363 V4DI_FTYPE_V4DI_PCINT64_V4SI_V4DI_INT,
31364 IX86_BUILTIN_GATHERSIV4DI);
31365
31366 def_builtin (OPTION_MASK_ISA_AVX2, "__builtin_ia32_gatherdiv2di",
31367 V2DI_FTYPE_V2DI_PCINT64_V2DI_V2DI_INT,
31368 IX86_BUILTIN_GATHERDIV2DI);
31369
31370 def_builtin (OPTION_MASK_ISA_AVX2, "__builtin_ia32_gatherdiv4di",
31371 V4DI_FTYPE_V4DI_PCINT64_V4DI_V4DI_INT,
31372 IX86_BUILTIN_GATHERDIV4DI);
31373
31374 def_builtin (OPTION_MASK_ISA_AVX2, "__builtin_ia32_gathersiv4si",
31375 V4SI_FTYPE_V4SI_PCINT_V4SI_V4SI_INT,
31376 IX86_BUILTIN_GATHERSIV4SI);
31377
31378 def_builtin (OPTION_MASK_ISA_AVX2, "__builtin_ia32_gathersiv8si",
31379 V8SI_FTYPE_V8SI_PCINT_V8SI_V8SI_INT,
31380 IX86_BUILTIN_GATHERSIV8SI);
31381
31382 def_builtin (OPTION_MASK_ISA_AVX2, "__builtin_ia32_gatherdiv4si",
31383 V4SI_FTYPE_V4SI_PCINT_V2DI_V4SI_INT,
31384 IX86_BUILTIN_GATHERDIV4SI);
31385
31386 def_builtin (OPTION_MASK_ISA_AVX2, "__builtin_ia32_gatherdiv4si256",
31387 V4SI_FTYPE_V4SI_PCINT_V4DI_V4SI_INT,
31388 IX86_BUILTIN_GATHERDIV8SI);
31389
31390 def_builtin (OPTION_MASK_ISA_AVX2, "__builtin_ia32_gatheraltsiv4df ",
31391 V4DF_FTYPE_V4DF_PCDOUBLE_V8SI_V4DF_INT,
31392 IX86_BUILTIN_GATHERALTSIV4DF);
31393
31394 def_builtin (OPTION_MASK_ISA_AVX2, "__builtin_ia32_gatheraltdiv4sf256 ",
31395 V8SF_FTYPE_V8SF_PCFLOAT_V4DI_V8SF_INT,
31396 IX86_BUILTIN_GATHERALTDIV8SF);
31397
31398 def_builtin (OPTION_MASK_ISA_AVX2, "__builtin_ia32_gatheraltsiv4di ",
31399 V4DI_FTYPE_V4DI_PCINT64_V8SI_V4DI_INT,
31400 IX86_BUILTIN_GATHERALTSIV4DI);
31401
31402 def_builtin (OPTION_MASK_ISA_AVX2, "__builtin_ia32_gatheraltdiv4si256 ",
31403 V8SI_FTYPE_V8SI_PCINT_V4DI_V8SI_INT,
31404 IX86_BUILTIN_GATHERALTDIV8SI);
31405
31406 /* AVX512F */
31407 def_builtin (OPTION_MASK_ISA_AVX512F, "__builtin_ia32_gathersiv16sf",
31408 V16SF_FTYPE_V16SF_PCFLOAT_V16SI_HI_INT,
31409 IX86_BUILTIN_GATHER3SIV16SF);
31410
31411 def_builtin (OPTION_MASK_ISA_AVX512F, "__builtin_ia32_gathersiv8df",
31412 V8DF_FTYPE_V8DF_PCDOUBLE_V8SI_QI_INT,
31413 IX86_BUILTIN_GATHER3SIV8DF);
31414
31415 def_builtin (OPTION_MASK_ISA_AVX512F, "__builtin_ia32_gatherdiv16sf",
31416 V8SF_FTYPE_V8SF_PCFLOAT_V8DI_QI_INT,
31417 IX86_BUILTIN_GATHER3DIV16SF);
31418
31419 def_builtin (OPTION_MASK_ISA_AVX512F, "__builtin_ia32_gatherdiv8df",
31420 V8DF_FTYPE_V8DF_PCDOUBLE_V8DI_QI_INT,
31421 IX86_BUILTIN_GATHER3DIV8DF);
31422
31423 def_builtin (OPTION_MASK_ISA_AVX512F, "__builtin_ia32_gathersiv16si",
31424 V16SI_FTYPE_V16SI_PCINT_V16SI_HI_INT,
31425 IX86_BUILTIN_GATHER3SIV16SI);
31426
31427 def_builtin (OPTION_MASK_ISA_AVX512F, "__builtin_ia32_gathersiv8di",
31428 V8DI_FTYPE_V8DI_PCINT64_V8SI_QI_INT,
31429 IX86_BUILTIN_GATHER3SIV8DI);
31430
31431 def_builtin (OPTION_MASK_ISA_AVX512F, "__builtin_ia32_gatherdiv16si",
31432 V8SI_FTYPE_V8SI_PCINT_V8DI_QI_INT,
31433 IX86_BUILTIN_GATHER3DIV16SI);
31434
31435 def_builtin (OPTION_MASK_ISA_AVX512F, "__builtin_ia32_gatherdiv8di",
31436 V8DI_FTYPE_V8DI_PCINT64_V8DI_QI_INT,
31437 IX86_BUILTIN_GATHER3DIV8DI);
31438
31439 def_builtin (OPTION_MASK_ISA_AVX512F, "__builtin_ia32_gatheraltsiv8df ",
31440 V8DF_FTYPE_V8DF_PCDOUBLE_V16SI_QI_INT,
31441 IX86_BUILTIN_GATHER3ALTSIV8DF);
31442
31443 def_builtin (OPTION_MASK_ISA_AVX512F, "__builtin_ia32_gatheraltdiv8sf ",
31444 V16SF_FTYPE_V16SF_PCFLOAT_V8DI_HI_INT,
31445 IX86_BUILTIN_GATHER3ALTDIV16SF);
31446
31447 def_builtin (OPTION_MASK_ISA_AVX512F, "__builtin_ia32_gatheraltsiv8di ",
31448 V8DI_FTYPE_V8DI_PCINT64_V16SI_QI_INT,
31449 IX86_BUILTIN_GATHER3ALTSIV8DI);
31450
31451 def_builtin (OPTION_MASK_ISA_AVX512F, "__builtin_ia32_gatheraltdiv8si ",
31452 V16SI_FTYPE_V16SI_PCINT_V8DI_HI_INT,
31453 IX86_BUILTIN_GATHER3ALTDIV16SI);
31454
31455 def_builtin (OPTION_MASK_ISA_AVX512F, "__builtin_ia32_scattersiv16sf",
31456 VOID_FTYPE_PFLOAT_HI_V16SI_V16SF_INT,
31457 IX86_BUILTIN_SCATTERSIV16SF);
31458
31459 def_builtin (OPTION_MASK_ISA_AVX512F, "__builtin_ia32_scattersiv8df",
31460 VOID_FTYPE_PDOUBLE_QI_V8SI_V8DF_INT,
31461 IX86_BUILTIN_SCATTERSIV8DF);
31462
31463 def_builtin (OPTION_MASK_ISA_AVX512F, "__builtin_ia32_scatterdiv16sf",
31464 VOID_FTYPE_PFLOAT_QI_V8DI_V8SF_INT,
31465 IX86_BUILTIN_SCATTERDIV16SF);
31466
31467 def_builtin (OPTION_MASK_ISA_AVX512F, "__builtin_ia32_scatterdiv8df",
31468 VOID_FTYPE_PDOUBLE_QI_V8DI_V8DF_INT,
31469 IX86_BUILTIN_SCATTERDIV8DF);
31470
31471 def_builtin (OPTION_MASK_ISA_AVX512F, "__builtin_ia32_scattersiv16si",
31472 VOID_FTYPE_PINT_HI_V16SI_V16SI_INT,
31473 IX86_BUILTIN_SCATTERSIV16SI);
31474
31475 def_builtin (OPTION_MASK_ISA_AVX512F, "__builtin_ia32_scattersiv8di",
31476 VOID_FTYPE_PLONGLONG_QI_V8SI_V8DI_INT,
31477 IX86_BUILTIN_SCATTERSIV8DI);
31478
31479 def_builtin (OPTION_MASK_ISA_AVX512F, "__builtin_ia32_scatterdiv16si",
31480 VOID_FTYPE_PINT_QI_V8DI_V8SI_INT,
31481 IX86_BUILTIN_SCATTERDIV16SI);
31482
31483 def_builtin (OPTION_MASK_ISA_AVX512F, "__builtin_ia32_scatterdiv8di",
31484 VOID_FTYPE_PLONGLONG_QI_V8DI_V8DI_INT,
31485 IX86_BUILTIN_SCATTERDIV8DI);
31486
31487 /* AVX512VL */
31488 def_builtin (OPTION_MASK_ISA_AVX512VL, "__builtin_ia32_gather3siv2df",
31489 V2DF_FTYPE_V2DF_PCDOUBLE_V4SI_QI_INT,
31490 IX86_BUILTIN_GATHER3SIV2DF);
31491
31492 def_builtin (OPTION_MASK_ISA_AVX512VL, "__builtin_ia32_gather3siv4df",
31493 V4DF_FTYPE_V4DF_PCDOUBLE_V4SI_QI_INT,
31494 IX86_BUILTIN_GATHER3SIV4DF);
31495
31496 def_builtin (OPTION_MASK_ISA_AVX512VL, "__builtin_ia32_gather3div2df",
31497 V2DF_FTYPE_V2DF_PCDOUBLE_V2DI_QI_INT,
31498 IX86_BUILTIN_GATHER3DIV2DF);
31499
31500 def_builtin (OPTION_MASK_ISA_AVX512VL, "__builtin_ia32_gather3div4df",
31501 V4DF_FTYPE_V4DF_PCDOUBLE_V4DI_QI_INT,
31502 IX86_BUILTIN_GATHER3DIV4DF);
31503
31504 def_builtin (OPTION_MASK_ISA_AVX512VL, "__builtin_ia32_gather3siv4sf",
31505 V4SF_FTYPE_V4SF_PCFLOAT_V4SI_QI_INT,
31506 IX86_BUILTIN_GATHER3SIV4SF);
31507
31508 def_builtin (OPTION_MASK_ISA_AVX512VL, "__builtin_ia32_gather3siv8sf",
31509 V8SF_FTYPE_V8SF_PCFLOAT_V8SI_QI_INT,
31510 IX86_BUILTIN_GATHER3SIV8SF);
31511
31512 def_builtin (OPTION_MASK_ISA_AVX512VL, "__builtin_ia32_gather3div4sf",
31513 V4SF_FTYPE_V4SF_PCFLOAT_V2DI_QI_INT,
31514 IX86_BUILTIN_GATHER3DIV4SF);
31515
31516 def_builtin (OPTION_MASK_ISA_AVX512VL, "__builtin_ia32_gather3div8sf",
31517 V4SF_FTYPE_V4SF_PCFLOAT_V4DI_QI_INT,
31518 IX86_BUILTIN_GATHER3DIV8SF);
31519
31520 def_builtin (OPTION_MASK_ISA_AVX512VL, "__builtin_ia32_gather3siv2di",
31521 V2DI_FTYPE_V2DI_PCINT64_V4SI_QI_INT,
31522 IX86_BUILTIN_GATHER3SIV2DI);
31523
31524 def_builtin (OPTION_MASK_ISA_AVX512VL, "__builtin_ia32_gather3siv4di",
31525 V4DI_FTYPE_V4DI_PCINT64_V4SI_QI_INT,
31526 IX86_BUILTIN_GATHER3SIV4DI);
31527
31528 def_builtin (OPTION_MASK_ISA_AVX512VL, "__builtin_ia32_gather3div2di",
31529 V2DI_FTYPE_V2DI_PCINT64_V2DI_QI_INT,
31530 IX86_BUILTIN_GATHER3DIV2DI);
31531
31532 def_builtin (OPTION_MASK_ISA_AVX512VL, "__builtin_ia32_gather3div4di",
31533 V4DI_FTYPE_V4DI_PCINT64_V4DI_QI_INT,
31534 IX86_BUILTIN_GATHER3DIV4DI);
31535
31536 def_builtin (OPTION_MASK_ISA_AVX512VL, "__builtin_ia32_gather3siv4si",
31537 V4SI_FTYPE_V4SI_PCINT_V4SI_QI_INT,
31538 IX86_BUILTIN_GATHER3SIV4SI);
31539
31540 def_builtin (OPTION_MASK_ISA_AVX512VL, "__builtin_ia32_gather3siv8si",
31541 V8SI_FTYPE_V8SI_PCINT_V8SI_QI_INT,
31542 IX86_BUILTIN_GATHER3SIV8SI);
31543
31544 def_builtin (OPTION_MASK_ISA_AVX512VL, "__builtin_ia32_gather3div4si",
31545 V4SI_FTYPE_V4SI_PCINT_V2DI_QI_INT,
31546 IX86_BUILTIN_GATHER3DIV4SI);
31547
31548 def_builtin (OPTION_MASK_ISA_AVX512VL, "__builtin_ia32_gather3div8si",
31549 V4SI_FTYPE_V4SI_PCINT_V4DI_QI_INT,
31550 IX86_BUILTIN_GATHER3DIV8SI);
31551
31552 def_builtin (OPTION_MASK_ISA_AVX512VL, "__builtin_ia32_gather3altsiv4df ",
31553 V4DF_FTYPE_V4DF_PCDOUBLE_V8SI_QI_INT,
31554 IX86_BUILTIN_GATHER3ALTSIV4DF);
31555
31556 def_builtin (OPTION_MASK_ISA_AVX512VL, "__builtin_ia32_gather3altdiv8sf ",
31557 V8SF_FTYPE_V8SF_PCFLOAT_V4DI_QI_INT,
31558 IX86_BUILTIN_GATHER3ALTDIV8SF);
31559
31560 def_builtin (OPTION_MASK_ISA_AVX512VL, "__builtin_ia32_gather3altsiv4di ",
31561 V4DI_FTYPE_V4DI_PCINT64_V8SI_QI_INT,
31562 IX86_BUILTIN_GATHER3ALTSIV4DI);
31563
31564 def_builtin (OPTION_MASK_ISA_AVX512VL, "__builtin_ia32_gather3altdiv8si ",
31565 V8SI_FTYPE_V8SI_PCINT_V4DI_QI_INT,
31566 IX86_BUILTIN_GATHER3ALTDIV8SI);
31567
31568 def_builtin (OPTION_MASK_ISA_AVX512VL, "__builtin_ia32_scattersiv8sf",
31569 VOID_FTYPE_PFLOAT_QI_V8SI_V8SF_INT,
31570 IX86_BUILTIN_SCATTERSIV8SF);
31571
31572 def_builtin (OPTION_MASK_ISA_AVX512VL, "__builtin_ia32_scattersiv4sf",
31573 VOID_FTYPE_PFLOAT_QI_V4SI_V4SF_INT,
31574 IX86_BUILTIN_SCATTERSIV4SF);
31575
31576 def_builtin (OPTION_MASK_ISA_AVX512VL, "__builtin_ia32_scattersiv4df",
31577 VOID_FTYPE_PDOUBLE_QI_V4SI_V4DF_INT,
31578 IX86_BUILTIN_SCATTERSIV4DF);
31579
31580 def_builtin (OPTION_MASK_ISA_AVX512VL, "__builtin_ia32_scattersiv2df",
31581 VOID_FTYPE_PDOUBLE_QI_V4SI_V2DF_INT,
31582 IX86_BUILTIN_SCATTERSIV2DF);
31583
31584 def_builtin (OPTION_MASK_ISA_AVX512VL, "__builtin_ia32_scatterdiv8sf",
31585 VOID_FTYPE_PFLOAT_QI_V4DI_V4SF_INT,
31586 IX86_BUILTIN_SCATTERDIV8SF);
31587
31588 def_builtin (OPTION_MASK_ISA_AVX512VL, "__builtin_ia32_scatterdiv4sf",
31589 VOID_FTYPE_PFLOAT_QI_V2DI_V4SF_INT,
31590 IX86_BUILTIN_SCATTERDIV4SF);
31591
31592 def_builtin (OPTION_MASK_ISA_AVX512VL, "__builtin_ia32_scatterdiv4df",
31593 VOID_FTYPE_PDOUBLE_QI_V4DI_V4DF_INT,
31594 IX86_BUILTIN_SCATTERDIV4DF);
31595
31596 def_builtin (OPTION_MASK_ISA_AVX512VL, "__builtin_ia32_scatterdiv2df",
31597 VOID_FTYPE_PDOUBLE_QI_V2DI_V2DF_INT,
31598 IX86_BUILTIN_SCATTERDIV2DF);
31599
31600 def_builtin (OPTION_MASK_ISA_AVX512VL, "__builtin_ia32_scattersiv8si",
31601 VOID_FTYPE_PINT_QI_V8SI_V8SI_INT,
31602 IX86_BUILTIN_SCATTERSIV8SI);
31603
31604 def_builtin (OPTION_MASK_ISA_AVX512VL, "__builtin_ia32_scattersiv4si",
31605 VOID_FTYPE_PINT_QI_V4SI_V4SI_INT,
31606 IX86_BUILTIN_SCATTERSIV4SI);
31607
31608 def_builtin (OPTION_MASK_ISA_AVX512VL, "__builtin_ia32_scattersiv4di",
31609 VOID_FTYPE_PLONGLONG_QI_V4SI_V4DI_INT,
31610 IX86_BUILTIN_SCATTERSIV4DI);
31611
31612 def_builtin (OPTION_MASK_ISA_AVX512VL, "__builtin_ia32_scattersiv2di",
31613 VOID_FTYPE_PLONGLONG_QI_V4SI_V2DI_INT,
31614 IX86_BUILTIN_SCATTERSIV2DI);
31615
31616 def_builtin (OPTION_MASK_ISA_AVX512VL, "__builtin_ia32_scatterdiv8si",
31617 VOID_FTYPE_PINT_QI_V4DI_V4SI_INT,
31618 IX86_BUILTIN_SCATTERDIV8SI);
31619
31620 def_builtin (OPTION_MASK_ISA_AVX512VL, "__builtin_ia32_scatterdiv4si",
31621 VOID_FTYPE_PINT_QI_V2DI_V4SI_INT,
31622 IX86_BUILTIN_SCATTERDIV4SI);
31623
31624 def_builtin (OPTION_MASK_ISA_AVX512VL, "__builtin_ia32_scatterdiv4di",
31625 VOID_FTYPE_PLONGLONG_QI_V4DI_V4DI_INT,
31626 IX86_BUILTIN_SCATTERDIV4DI);
31627
31628 def_builtin (OPTION_MASK_ISA_AVX512VL, "__builtin_ia32_scatterdiv2di",
31629 VOID_FTYPE_PLONGLONG_QI_V2DI_V2DI_INT,
31630 IX86_BUILTIN_SCATTERDIV2DI);
31631 def_builtin (OPTION_MASK_ISA_AVX512F, "__builtin_ia32_scatteraltsiv8df ",
31632 VOID_FTYPE_PDOUBLE_QI_V16SI_V8DF_INT,
31633 IX86_BUILTIN_SCATTERALTSIV8DF);
31634
31635 def_builtin (OPTION_MASK_ISA_AVX512F, "__builtin_ia32_scatteraltdiv8sf ",
31636 VOID_FTYPE_PFLOAT_HI_V8DI_V16SF_INT,
31637 IX86_BUILTIN_SCATTERALTDIV16SF);
31638
31639 def_builtin (OPTION_MASK_ISA_AVX512F, "__builtin_ia32_scatteraltsiv8di ",
31640 VOID_FTYPE_PLONGLONG_QI_V16SI_V8DI_INT,
31641 IX86_BUILTIN_SCATTERALTSIV8DI);
31642
31643 def_builtin (OPTION_MASK_ISA_AVX512F, "__builtin_ia32_scatteraltdiv8si ",
31644 VOID_FTYPE_PINT_HI_V8DI_V16SI_INT,
31645 IX86_BUILTIN_SCATTERALTDIV16SI);
31646
31647 /* AVX512PF */
31648 def_builtin (OPTION_MASK_ISA_AVX512PF, "__builtin_ia32_gatherpfdpd",
31649 VOID_FTYPE_QI_V8SI_PCINT64_INT_INT,
31650 IX86_BUILTIN_GATHERPFDPD);
31651 def_builtin (OPTION_MASK_ISA_AVX512PF, "__builtin_ia32_gatherpfdps",
31652 VOID_FTYPE_HI_V16SI_PCINT_INT_INT,
31653 IX86_BUILTIN_GATHERPFDPS);
31654 def_builtin (OPTION_MASK_ISA_AVX512PF, "__builtin_ia32_gatherpfqpd",
31655 VOID_FTYPE_QI_V8DI_PCINT64_INT_INT,
31656 IX86_BUILTIN_GATHERPFQPD);
31657 def_builtin (OPTION_MASK_ISA_AVX512PF, "__builtin_ia32_gatherpfqps",
31658 VOID_FTYPE_QI_V8DI_PCINT_INT_INT,
31659 IX86_BUILTIN_GATHERPFQPS);
31660 def_builtin (OPTION_MASK_ISA_AVX512PF, "__builtin_ia32_scatterpfdpd",
31661 VOID_FTYPE_QI_V8SI_PCINT64_INT_INT,
31662 IX86_BUILTIN_SCATTERPFDPD);
31663 def_builtin (OPTION_MASK_ISA_AVX512PF, "__builtin_ia32_scatterpfdps",
31664 VOID_FTYPE_HI_V16SI_PCINT_INT_INT,
31665 IX86_BUILTIN_SCATTERPFDPS);
31666 def_builtin (OPTION_MASK_ISA_AVX512PF, "__builtin_ia32_scatterpfqpd",
31667 VOID_FTYPE_QI_V8DI_PCINT64_INT_INT,
31668 IX86_BUILTIN_SCATTERPFQPD);
31669 def_builtin (OPTION_MASK_ISA_AVX512PF, "__builtin_ia32_scatterpfqps",
31670 VOID_FTYPE_QI_V8DI_PCINT_INT_INT,
31671 IX86_BUILTIN_SCATTERPFQPS);
31672
31673 /* SHA */
31674 def_builtin_const (OPTION_MASK_ISA_SHA, "__builtin_ia32_sha1msg1",
31675 V4SI_FTYPE_V4SI_V4SI, IX86_BUILTIN_SHA1MSG1);
31676 def_builtin_const (OPTION_MASK_ISA_SHA, "__builtin_ia32_sha1msg2",
31677 V4SI_FTYPE_V4SI_V4SI, IX86_BUILTIN_SHA1MSG2);
31678 def_builtin_const (OPTION_MASK_ISA_SHA, "__builtin_ia32_sha1nexte",
31679 V4SI_FTYPE_V4SI_V4SI, IX86_BUILTIN_SHA1NEXTE);
31680 def_builtin_const (OPTION_MASK_ISA_SHA, "__builtin_ia32_sha1rnds4",
31681 V4SI_FTYPE_V4SI_V4SI_INT, IX86_BUILTIN_SHA1RNDS4);
31682 def_builtin_const (OPTION_MASK_ISA_SHA, "__builtin_ia32_sha256msg1",
31683 V4SI_FTYPE_V4SI_V4SI, IX86_BUILTIN_SHA256MSG1);
31684 def_builtin_const (OPTION_MASK_ISA_SHA, "__builtin_ia32_sha256msg2",
31685 V4SI_FTYPE_V4SI_V4SI, IX86_BUILTIN_SHA256MSG2);
31686 def_builtin_const (OPTION_MASK_ISA_SHA, "__builtin_ia32_sha256rnds2",
31687 V4SI_FTYPE_V4SI_V4SI_V4SI, IX86_BUILTIN_SHA256RNDS2);
31688
31689 /* RTM. */
31690 def_builtin (OPTION_MASK_ISA_RTM, "__builtin_ia32_xabort",
31691 VOID_FTYPE_UNSIGNED, IX86_BUILTIN_XABORT);
31692
31693 /* MMX access to the vec_init patterns. */
31694 def_builtin_const (OPTION_MASK_ISA_MMX, "__builtin_ia32_vec_init_v2si",
31695 V2SI_FTYPE_INT_INT, IX86_BUILTIN_VEC_INIT_V2SI);
31696
31697 def_builtin_const (OPTION_MASK_ISA_MMX, "__builtin_ia32_vec_init_v4hi",
31698 V4HI_FTYPE_HI_HI_HI_HI,
31699 IX86_BUILTIN_VEC_INIT_V4HI);
31700
31701 def_builtin_const (OPTION_MASK_ISA_MMX, "__builtin_ia32_vec_init_v8qi",
31702 V8QI_FTYPE_QI_QI_QI_QI_QI_QI_QI_QI,
31703 IX86_BUILTIN_VEC_INIT_V8QI);
31704
31705 /* Access to the vec_extract patterns. */
31706 def_builtin_const (OPTION_MASK_ISA_SSE2, "__builtin_ia32_vec_ext_v2df",
31707 DOUBLE_FTYPE_V2DF_INT, IX86_BUILTIN_VEC_EXT_V2DF);
31708 def_builtin_const (OPTION_MASK_ISA_SSE2, "__builtin_ia32_vec_ext_v2di",
31709 DI_FTYPE_V2DI_INT, IX86_BUILTIN_VEC_EXT_V2DI);
31710 def_builtin_const (OPTION_MASK_ISA_SSE, "__builtin_ia32_vec_ext_v4sf",
31711 FLOAT_FTYPE_V4SF_INT, IX86_BUILTIN_VEC_EXT_V4SF);
31712 def_builtin_const (OPTION_MASK_ISA_SSE2, "__builtin_ia32_vec_ext_v4si",
31713 SI_FTYPE_V4SI_INT, IX86_BUILTIN_VEC_EXT_V4SI);
31714 def_builtin_const (OPTION_MASK_ISA_SSE2, "__builtin_ia32_vec_ext_v8hi",
31715 HI_FTYPE_V8HI_INT, IX86_BUILTIN_VEC_EXT_V8HI);
31716
31717 def_builtin_const (OPTION_MASK_ISA_SSE | OPTION_MASK_ISA_3DNOW_A,
31718 "__builtin_ia32_vec_ext_v4hi",
31719 HI_FTYPE_V4HI_INT, IX86_BUILTIN_VEC_EXT_V4HI);
31720
31721 def_builtin_const (OPTION_MASK_ISA_MMX, "__builtin_ia32_vec_ext_v2si",
31722 SI_FTYPE_V2SI_INT, IX86_BUILTIN_VEC_EXT_V2SI);
31723
31724 def_builtin_const (OPTION_MASK_ISA_SSE2, "__builtin_ia32_vec_ext_v16qi",
31725 QI_FTYPE_V16QI_INT, IX86_BUILTIN_VEC_EXT_V16QI);
31726
31727 /* Access to the vec_set patterns. */
31728 def_builtin_const (OPTION_MASK_ISA_SSE4_1 | OPTION_MASK_ISA_64BIT,
31729 "__builtin_ia32_vec_set_v2di",
31730 V2DI_FTYPE_V2DI_DI_INT, IX86_BUILTIN_VEC_SET_V2DI);
31731
31732 def_builtin_const (OPTION_MASK_ISA_SSE4_1, "__builtin_ia32_vec_set_v4sf",
31733 V4SF_FTYPE_V4SF_FLOAT_INT, IX86_BUILTIN_VEC_SET_V4SF);
31734
31735 def_builtin_const (OPTION_MASK_ISA_SSE4_1, "__builtin_ia32_vec_set_v4si",
31736 V4SI_FTYPE_V4SI_SI_INT, IX86_BUILTIN_VEC_SET_V4SI);
31737
31738 def_builtin_const (OPTION_MASK_ISA_SSE2, "__builtin_ia32_vec_set_v8hi",
31739 V8HI_FTYPE_V8HI_HI_INT, IX86_BUILTIN_VEC_SET_V8HI);
31740
31741 def_builtin_const (OPTION_MASK_ISA_SSE | OPTION_MASK_ISA_3DNOW_A,
31742 "__builtin_ia32_vec_set_v4hi",
31743 V4HI_FTYPE_V4HI_HI_INT, IX86_BUILTIN_VEC_SET_V4HI);
31744
31745 def_builtin_const (OPTION_MASK_ISA_SSE4_1, "__builtin_ia32_vec_set_v16qi",
31746 V16QI_FTYPE_V16QI_QI_INT, IX86_BUILTIN_VEC_SET_V16QI);
31747
31748 /* RDSEED */
31749 def_builtin (OPTION_MASK_ISA_RDSEED, "__builtin_ia32_rdseed_hi_step",
31750 INT_FTYPE_PUSHORT, IX86_BUILTIN_RDSEED16_STEP);
31751 def_builtin (OPTION_MASK_ISA_RDSEED, "__builtin_ia32_rdseed_si_step",
31752 INT_FTYPE_PUNSIGNED, IX86_BUILTIN_RDSEED32_STEP);
31753 def_builtin (OPTION_MASK_ISA_RDSEED | OPTION_MASK_ISA_64BIT,
31754 "__builtin_ia32_rdseed_di_step",
31755 INT_FTYPE_PULONGLONG, IX86_BUILTIN_RDSEED64_STEP);
31756
31757 /* ADCX */
31758 def_builtin (0, "__builtin_ia32_addcarryx_u32",
31759 UCHAR_FTYPE_UCHAR_UINT_UINT_PUNSIGNED, IX86_BUILTIN_ADDCARRYX32);
31760 def_builtin (OPTION_MASK_ISA_64BIT,
31761 "__builtin_ia32_addcarryx_u64",
31762 UCHAR_FTYPE_UCHAR_ULONGLONG_ULONGLONG_PULONGLONG,
31763 IX86_BUILTIN_ADDCARRYX64);
31764
31765 /* SBB */
31766 def_builtin (0, "__builtin_ia32_sbb_u32",
31767 UCHAR_FTYPE_UCHAR_UINT_UINT_PUNSIGNED, IX86_BUILTIN_SBB32);
31768 def_builtin (OPTION_MASK_ISA_64BIT,
31769 "__builtin_ia32_sbb_u64",
31770 UCHAR_FTYPE_UCHAR_ULONGLONG_ULONGLONG_PULONGLONG,
31771 IX86_BUILTIN_SBB64);
31772
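/* Illustrative sketch (user code, via the adxintrin.h wrappers): the
   carry-chain builtins above return the carry out and store the sum through
   the pointer, so a wide add can be chained limb by limb, e.g.

	unsigned int lo, hi;
	unsigned char c = _addcarryx_u32 (0, a_lo, b_lo, &lo);
	_addcarryx_u32 (c, a_hi, b_hi, &hi);

   a_lo/a_hi/b_lo/b_hi are made-up operand names; the sbb builtins work the
   same way for the borrow chain.  */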
31773 /* Read/write FLAGS. */
31774 def_builtin (~OPTION_MASK_ISA_64BIT, "__builtin_ia32_readeflags_u32",
31775 UNSIGNED_FTYPE_VOID, IX86_BUILTIN_READ_FLAGS);
31776 def_builtin (OPTION_MASK_ISA_64BIT, "__builtin_ia32_readeflags_u64",
31777 UINT64_FTYPE_VOID, IX86_BUILTIN_READ_FLAGS);
31778 def_builtin (~OPTION_MASK_ISA_64BIT, "__builtin_ia32_writeeflags_u32",
31779 VOID_FTYPE_UNSIGNED, IX86_BUILTIN_WRITE_FLAGS);
31780 def_builtin (OPTION_MASK_ISA_64BIT, "__builtin_ia32_writeeflags_u64",
31781 VOID_FTYPE_UINT64, IX86_BUILTIN_WRITE_FLAGS);
31782
31783 /* CLFLUSHOPT. */
31784 def_builtin (OPTION_MASK_ISA_CLFLUSHOPT, "__builtin_ia32_clflushopt",
31785 VOID_FTYPE_PCVOID, IX86_BUILTIN_CLFLUSHOPT);
31786
31787 /* CLWB. */
31788 def_builtin (OPTION_MASK_ISA_CLWB, "__builtin_ia32_clwb",
31789 VOID_FTYPE_PCVOID, IX86_BUILTIN_CLWB);
31790
31791 /* MONITORX and MWAITX. */
31792 def_builtin (OPTION_MASK_ISA_MWAITX, "__builtin_ia32_monitorx",
31793 VOID_FTYPE_PCVOID_UNSIGNED_UNSIGNED, IX86_BUILTIN_MONITORX);
31794 def_builtin (OPTION_MASK_ISA_MWAITX, "__builtin_ia32_mwaitx",
31795 VOID_FTYPE_UNSIGNED_UNSIGNED_UNSIGNED, IX86_BUILTIN_MWAITX);
31796
31797 /* CLZERO. */
31798 def_builtin (OPTION_MASK_ISA_CLZERO, "__builtin_ia32_clzero",
31799 VOID_FTYPE_PCVOID, IX86_BUILTIN_CLZERO);
31800
31801 /* Add FMA4 multi-arg argument instructions */
31802 for (i = 0, d = bdesc_multi_arg; i < ARRAY_SIZE (bdesc_multi_arg); i++, d++)
31803 {
31804 BDESC_VERIFY (d->code, IX86_BUILTIN__BDESC_MULTI_ARG_FIRST, i);
31805 if (d->name == 0)
31806 continue;
31807
31808 ftype = (enum ix86_builtin_func_type) d->flag;
31809 def_builtin_const (d->mask, d->name, ftype, d->code);
31810 }
31811 BDESC_VERIFYS (IX86_BUILTIN__BDESC_MULTI_ARG_LAST,
31812 IX86_BUILTIN__BDESC_MULTI_ARG_FIRST,
31813 ARRAY_SIZE (bdesc_multi_arg) - 1);
31814 }
31815
31816 static void
31817 ix86_init_mpx_builtins ()
31818 {
31819 const struct builtin_description * d;
31820 enum ix86_builtin_func_type ftype;
31821 tree decl;
31822 size_t i;
31823
31824 for (i = 0, d = bdesc_mpx;
31825 i < ARRAY_SIZE (bdesc_mpx);
31826 i++, d++)
31827 {
31828 BDESC_VERIFY (d->code, IX86_BUILTIN__BDESC_MPX_FIRST, i);
31829 if (d->name == 0)
31830 continue;
31831
31832 ftype = (enum ix86_builtin_func_type) d->flag;
31833 decl = def_builtin (d->mask, d->name, ftype, d->code);
31834
31835 /* Without leaf and nothrow flags for MPX builtins,
31836 abnormal edges may follow their calls when setjmp
31837 is present in the function. Since we may have a lot
31838 of MPX builtin calls, this causes lots of useless
31839 edges and enormous PHI nodes. To avoid this we mark
31840 MPX builtins as leaf and nothrow. */
31841 if (decl)
31842 {
31843 DECL_ATTRIBUTES (decl) = build_tree_list (get_identifier ("leaf"),
31844 NULL_TREE);
31845 TREE_NOTHROW (decl) = 1;
31846 }
31847 else
31848 {
31849 ix86_builtins_isa[(int)d->code].leaf_p = true;
31850 ix86_builtins_isa[(int)d->code].nothrow_p = true;
31851 }
31852 }
31853 BDESC_VERIFYS (IX86_BUILTIN__BDESC_MPX_LAST,
31854 IX86_BUILTIN__BDESC_MPX_FIRST,
31855 ARRAY_SIZE (bdesc_mpx) - 1);
31856
31857 for (i = 0, d = bdesc_mpx_const;
31858 i < ARRAY_SIZE (bdesc_mpx_const);
31859 i++, d++)
31860 {
31861 BDESC_VERIFY (d->code, IX86_BUILTIN__BDESC_MPX_CONST_FIRST, i);
31862 if (d->name == 0)
31863 continue;
31864
31865 ftype = (enum ix86_builtin_func_type) d->flag;
31866 decl = def_builtin_const (d->mask, d->name, ftype, d->code);
31867
31868 if (decl)
31869 {
31870 DECL_ATTRIBUTES (decl) = build_tree_list (get_identifier ("leaf"),
31871 NULL_TREE);
31872 TREE_NOTHROW (decl) = 1;
31873 }
31874 else
31875 {
31876 ix86_builtins_isa[(int)d->code].leaf_p = true;
31877 ix86_builtins_isa[(int)d->code].nothrow_p = true;
31878 }
31879 }
31880 BDESC_VERIFYS (IX86_BUILTIN__BDESC_MPX_CONST_LAST,
31881 IX86_BUILTIN__BDESC_MPX_CONST_FIRST,
31882 ARRAY_SIZE (bdesc_mpx_const) - 1);
31883 }
31884 #undef BDESC_VERIFY
31885 #undef BDESC_VERIFYS
31886
31887 /* This adds a condition to the basic_block NEW_BB in function FUNCTION_DECL
31888 to return a pointer to VERSION_DECL if the outcome of the expression
31889 formed by PREDICATE_CHAIN is true. This function will be called during
31890 version dispatch to decide which function version to execute. It returns
31891 the basic block at the end, to which more conditions can be added. */
31892
31893 static basic_block
31894 add_condition_to_bb (tree function_decl, tree version_decl,
31895 tree predicate_chain, basic_block new_bb)
31896 {
31897 gimple *return_stmt;
31898 tree convert_expr, result_var;
31899 gimple *convert_stmt;
31900 gimple *call_cond_stmt;
31901 gimple *if_else_stmt;
31902
31903 basic_block bb1, bb2, bb3;
31904 edge e12, e23;
31905
31906 tree cond_var, and_expr_var = NULL_TREE;
31907 gimple_seq gseq;
31908
31909 tree predicate_decl, predicate_arg;
31910
31911 push_cfun (DECL_STRUCT_FUNCTION (function_decl));
31912
31913 gcc_assert (new_bb != NULL);
31914 gseq = bb_seq (new_bb);
31915
31916
31917 convert_expr = build1 (CONVERT_EXPR, ptr_type_node,
31918 build_fold_addr_expr (version_decl));
31919 result_var = create_tmp_var (ptr_type_node);
31920 convert_stmt = gimple_build_assign (result_var, convert_expr);
31921 return_stmt = gimple_build_return (result_var);
31922
31923 if (predicate_chain == NULL_TREE)
31924 {
31925 gimple_seq_add_stmt (&gseq, convert_stmt);
31926 gimple_seq_add_stmt (&gseq, return_stmt);
31927 set_bb_seq (new_bb, gseq);
31928 gimple_set_bb (convert_stmt, new_bb);
31929 gimple_set_bb (return_stmt, new_bb);
31930 pop_cfun ();
31931 return new_bb;
31932 }
31933
31934 while (predicate_chain != NULL)
31935 {
31936 cond_var = create_tmp_var (integer_type_node);
31937 predicate_decl = TREE_PURPOSE (predicate_chain);
31938 predicate_arg = TREE_VALUE (predicate_chain);
31939 call_cond_stmt = gimple_build_call (predicate_decl, 1, predicate_arg);
31940 gimple_call_set_lhs (call_cond_stmt, cond_var);
31941
31942 gimple_set_block (call_cond_stmt, DECL_INITIAL (function_decl));
31943 gimple_set_bb (call_cond_stmt, new_bb);
31944 gimple_seq_add_stmt (&gseq, call_cond_stmt);
31945
31946 predicate_chain = TREE_CHAIN (predicate_chain);
31947
31948 if (and_expr_var == NULL)
31949 and_expr_var = cond_var;
31950 else
31951 {
31952 gimple *assign_stmt;
31953 /* Use MIN_EXPR to check whether any of the integers is zero:
31954 and_expr_var = min_expr <cond_var, and_expr_var> */
31955 assign_stmt = gimple_build_assign (and_expr_var,
31956 build2 (MIN_EXPR, integer_type_node,
31957 cond_var, and_expr_var));
31958
31959 gimple_set_block (assign_stmt, DECL_INITIAL (function_decl));
31960 gimple_set_bb (assign_stmt, new_bb);
31961 gimple_seq_add_stmt (&gseq, assign_stmt);
31962 }
31963 }
31964
31965 if_else_stmt = gimple_build_cond (GT_EXPR, and_expr_var,
31966 integer_zero_node,
31967 NULL_TREE, NULL_TREE);
31968 gimple_set_block (if_else_stmt, DECL_INITIAL (function_decl));
31969 gimple_set_bb (if_else_stmt, new_bb);
31970 gimple_seq_add_stmt (&gseq, if_else_stmt);
31971
31972 gimple_seq_add_stmt (&gseq, convert_stmt);
31973 gimple_seq_add_stmt (&gseq, return_stmt);
31974 set_bb_seq (new_bb, gseq);
31975
31976 bb1 = new_bb;
31977 e12 = split_block (bb1, if_else_stmt);
31978 bb2 = e12->dest;
31979 e12->flags &= ~EDGE_FALLTHRU;
31980 e12->flags |= EDGE_TRUE_VALUE;
31981
31982 e23 = split_block (bb2, return_stmt);
31983
31984 gimple_set_bb (convert_stmt, bb2);
31985 gimple_set_bb (return_stmt, bb2);
31986
31987 bb3 = e23->dest;
31988 make_edge (bb1, bb3, EDGE_FALSE_VALUE);
31989
31990 remove_edge (e23);
31991 make_edge (bb2, EXIT_BLOCK_PTR_FOR_FN (cfun), 0);
31992
31993 pop_cfun ();
31994
31995 return bb3;
31996 }
31997
31998 /* This parses the attribute arguments to target in DECL and determines
31999 the right builtin to use to match the platform specification.
32000 It returns the priority value for this version decl. If PREDICATE_LIST
32001 is not NULL, it stores the list of cpu features that need to be checked
32002 before dispatching this function. */
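/* Illustrative example: for __attribute__ ((target ("arch=core2,avx"))) the
   predicates stored in PREDICATE_LIST are roughly __builtin_cpu_is ("core2")
   and __builtin_cpu_supports ("avx"), and the returned priority is the
   highest implied by the arguments (here P_AVX).  */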
32003
32004 static unsigned int
32005 get_builtin_code_for_version (tree decl, tree *predicate_list)
32006 {
32007 tree attrs;
32008 struct cl_target_option cur_target;
32009 tree target_node;
32010 struct cl_target_option *new_target;
32011 const char *arg_str = NULL;
32012 const char *attrs_str = NULL;
32013 char *tok_str = NULL;
32014 char *token;
32015
32016 /* Priority of i386 features, greater value is higher priority. This is
32017 used to decide the order in which function dispatch must happen. For
32018 instance, a version specialized for SSE4.2 should be checked for dispatch
32019 before a version for SSE3, as SSE4.2 implies SSE3. */
32020 enum feature_priority
32021 {
32022 P_ZERO = 0,
32023 P_MMX,
32024 P_SSE,
32025 P_SSE2,
32026 P_SSE3,
32027 P_SSSE3,
32028 P_PROC_SSSE3,
32029 P_SSE4_A,
32030 P_PROC_SSE4_A,
32031 P_SSE4_1,
32032 P_SSE4_2,
32033 P_PROC_SSE4_2,
32034 P_POPCNT,
32035 P_AES,
32036 P_PCLMUL,
32037 P_AVX,
32038 P_PROC_AVX,
32039 P_BMI,
32040 P_PROC_BMI,
32041 P_FMA4,
32042 P_XOP,
32043 P_PROC_XOP,
32044 P_FMA,
32045 P_PROC_FMA,
32046 P_BMI2,
32047 P_AVX2,
32048 P_PROC_AVX2,
32049 P_AVX512F,
32050 P_PROC_AVX512F
32051 };
32052
32053 enum feature_priority priority = P_ZERO;
32054
32055 /* These are the target attribute strings for which a dispatcher is
32056 available, from fold_builtin_cpu. */
32057
32058 static struct _feature_list
32059 {
32060 const char *const name;
32061 const enum feature_priority priority;
32062 }
32063 const feature_list[] =
32064 {
32065 {"mmx", P_MMX},
32066 {"sse", P_SSE},
32067 {"sse2", P_SSE2},
32068 {"sse3", P_SSE3},
32069 {"sse4a", P_SSE4_A},
32070 {"ssse3", P_SSSE3},
32071 {"sse4.1", P_SSE4_1},
32072 {"sse4.2", P_SSE4_2},
32073 {"popcnt", P_POPCNT},
32074 {"aes", P_AES},
32075 {"pclmul", P_PCLMUL},
32076 {"avx", P_AVX},
32077 {"bmi", P_BMI},
32078 {"fma4", P_FMA4},
32079 {"xop", P_XOP},
32080 {"fma", P_FMA},
32081 {"bmi2", P_BMI2},
32082 {"avx2", P_AVX2},
32083 {"avx512f", P_AVX512F}
32084 };
32085
32086
32087 static unsigned int NUM_FEATURES
32088 = sizeof (feature_list) / sizeof (struct _feature_list);
32089
32090 unsigned int i;
32091
32092 tree predicate_chain = NULL_TREE;
32093 tree predicate_decl, predicate_arg;
32094
32095 attrs = lookup_attribute ("target", DECL_ATTRIBUTES (decl));
32096 gcc_assert (attrs != NULL);
32097
32098 attrs = TREE_VALUE (TREE_VALUE (attrs));
32099
32100 gcc_assert (TREE_CODE (attrs) == STRING_CST);
32101 attrs_str = TREE_STRING_POINTER (attrs);
32102
32103 /* Return priority zero for default function. */
32104 if (strcmp (attrs_str, "default") == 0)
32105 return 0;
32106
32107 /* Handle arch= if specified. For priority, set it to be 1 more than
32108 the best instruction set the processor can handle. For instance, if
32109 there is a version for atom and a version for ssse3 (the highest ISA
32110 priority for atom), the atom version must be checked for dispatch
32111 before the ssse3 version. */
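/* For instance (illustrative), a version declared with
   __attribute__ ((target ("arch=core2"))) is assigned P_PROC_SSSE3 below,
   which is higher than the plain P_SSSE3 of a target ("ssse3") version,
   so the core2 version is checked first.  */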
32112 if (strstr (attrs_str, "arch=") != NULL)
32113 {
32114 cl_target_option_save (&cur_target, &global_options);
32115 target_node = ix86_valid_target_attribute_tree (attrs, &global_options,
32116 &global_options_set);
32117
32118 gcc_assert (target_node);
32119 new_target = TREE_TARGET_OPTION (target_node);
32120 gcc_assert (new_target);
32121
32122 if (new_target->arch_specified && new_target->arch > 0)
32123 {
32124 switch (new_target->arch)
32125 {
32126 case PROCESSOR_CORE2:
32127 arg_str = "core2";
32128 priority = P_PROC_SSSE3;
32129 break;
32130 case PROCESSOR_NEHALEM:
32131 if (new_target->x_ix86_isa_flags & OPTION_MASK_ISA_AES)
32132 arg_str = "westmere";
32133 else
32134 /* We translate "arch=corei7" and "arch=nehalem" to
32135 "corei7" so that it will be mapped to M_INTEL_COREI7
32136 as cpu type to cover all M_INTEL_COREI7_XXXs. */
32137 arg_str = "corei7";
32138 priority = P_PROC_SSE4_2;
32139 break;
32140 case PROCESSOR_SANDYBRIDGE:
32141 if (new_target->x_ix86_isa_flags & OPTION_MASK_ISA_F16C)
32142 arg_str = "ivybridge";
32143 else
32144 arg_str = "sandybridge";
32145 priority = P_PROC_AVX;
32146 break;
32147 case PROCESSOR_HASWELL:
32148 if (new_target->x_ix86_isa_flags & OPTION_MASK_ISA_AVX512VL)
32149 arg_str = "skylake-avx512";
32150 else if (new_target->x_ix86_isa_flags & OPTION_MASK_ISA_XSAVES)
32151 arg_str = "skylake";
32152 else if (new_target->x_ix86_isa_flags & OPTION_MASK_ISA_ADX)
32153 arg_str = "broadwell";
32154 else
32155 arg_str = "haswell";
32156 priority = P_PROC_AVX2;
32157 break;
32158 case PROCESSOR_BONNELL:
32159 arg_str = "bonnell";
32160 priority = P_PROC_SSSE3;
32161 break;
32162 case PROCESSOR_KNL:
32163 arg_str = "knl";
32164 priority = P_PROC_AVX512F;
32165 break;
32166 case PROCESSOR_SILVERMONT:
32167 arg_str = "silvermont";
32168 priority = P_PROC_SSE4_2;
32169 break;
32170 case PROCESSOR_AMDFAM10:
32171 arg_str = "amdfam10h";
32172 priority = P_PROC_SSE4_A;
32173 break;
32174 case PROCESSOR_BTVER1:
32175 arg_str = "btver1";
32176 priority = P_PROC_SSE4_A;
32177 break;
32178 case PROCESSOR_BTVER2:
32179 arg_str = "btver2";
32180 priority = P_PROC_BMI;
32181 break;
32182 case PROCESSOR_BDVER1:
32183 arg_str = "bdver1";
32184 priority = P_PROC_XOP;
32185 break;
32186 case PROCESSOR_BDVER2:
32187 arg_str = "bdver2";
32188 priority = P_PROC_FMA;
32189 break;
32190 case PROCESSOR_BDVER3:
32191 arg_str = "bdver3";
32192 priority = P_PROC_FMA;
32193 break;
32194 case PROCESSOR_BDVER4:
32195 arg_str = "bdver4";
32196 priority = P_PROC_AVX2;
32197 break;
32198 case PROCESSOR_ZNVER1:
32199 arg_str = "znver1";
32200 priority = P_PROC_AVX2;
32201 break;
32202 }
32203 }
32204
32205 cl_target_option_restore (&global_options, &cur_target);
32206
32207 if (predicate_list && arg_str == NULL)
32208 {
32209 error_at (DECL_SOURCE_LOCATION (decl),
32210 "No dispatcher found for the versioning attributes");
32211 return 0;
32212 }
32213
32214 if (predicate_list)
32215 {
32216 predicate_decl = ix86_builtins [(int) IX86_BUILTIN_CPU_IS];
32217 /* For a C string literal the length includes the trailing NULL. */
32218 predicate_arg = build_string_literal (strlen (arg_str) + 1, arg_str);
32219 predicate_chain = tree_cons (predicate_decl, predicate_arg,
32220 predicate_chain);
32221 }
32222 }
32223
32224 /* Process feature name. */
32225 tok_str = (char *) xmalloc (strlen (attrs_str) + 1);
32226 strcpy (tok_str, attrs_str);
32227 token = strtok (tok_str, ",");
32228 predicate_decl = ix86_builtins [(int) IX86_BUILTIN_CPU_SUPPORTS];
32229
32230 while (token != NULL)
32231 {
32232 /* Do not process "arch=" */
32233 if (strncmp (token, "arch=", 5) == 0)
32234 {
32235 token = strtok (NULL, ",");
32236 continue;
32237 }
32238 for (i = 0; i < NUM_FEATURES; ++i)
32239 {
32240 if (strcmp (token, feature_list[i].name) == 0)
32241 {
32242 if (predicate_list)
32243 {
32244 predicate_arg = build_string_literal (
32245 strlen (feature_list[i].name) + 1,
32246 feature_list[i].name);
32247 predicate_chain = tree_cons (predicate_decl, predicate_arg,
32248 predicate_chain);
32249 }
32250 /* Find the maximum priority feature. */
32251 if (feature_list[i].priority > priority)
32252 priority = feature_list[i].priority;
32253
32254 break;
32255 }
32256 }
32257 if (predicate_list && i == NUM_FEATURES)
32258 {
32259 error_at (DECL_SOURCE_LOCATION (decl),
32260 "No dispatcher found for %s", token);
32261 return 0;
32262 }
32263 token = strtok (NULL, ",");
32264 }
32265 free (tok_str);
32266
32267 if (predicate_list && predicate_chain == NULL_TREE)
32268 {
32269 error_at (DECL_SOURCE_LOCATION (decl),
32270 "No dispatcher found for the versioning attributes : %s",
32271 attrs_str);
32272 return 0;
32273 }
32274 else if (predicate_list)
32275 {
32276 predicate_chain = nreverse (predicate_chain);
32277 *predicate_list = predicate_chain;
32278 }
32279
32280 return priority;
32281 }
32282
32283 /* This compares the priority of target features in function DECL1
32284 and DECL2. It returns a positive value if DECL1 has higher priority,
32285 a negative value if DECL2 has higher priority, and 0 if they are the
32286 same. */
32287
32288 static int
32289 ix86_compare_version_priority (tree decl1, tree decl2)
32290 {
32291 unsigned int priority1 = get_builtin_code_for_version (decl1, NULL);
32292 unsigned int priority2 = get_builtin_code_for_version (decl2, NULL);
32293
32294 return (int)priority1 - (int)priority2;
32295 }
32296
32297 /* V1 and V2 point to function versions with different priorities
32298 based on the target ISA. This function compares their priorities. */
32299
32300 static int
32301 feature_compare (const void *v1, const void *v2)
32302 {
32303 typedef struct _function_version_info
32304 {
32305 tree version_decl;
32306 tree predicate_chain;
32307 unsigned int dispatch_priority;
32308 } function_version_info;
32309
32310 const function_version_info c1 = *(const function_version_info *)v1;
32311 const function_version_info c2 = *(const function_version_info *)v2;
32312 return (c2.dispatch_priority - c1.dispatch_priority);
32313 }
32314
32315 /* This function generates the dispatch function for
32316 multi-versioned functions. DISPATCH_DECL is the function which will
32317 contain the dispatch logic. FNDECLS_P points to the vector of
32318 function choices for dispatch. EMPTY_BB is the basic block pointer
32319 in DISPATCH_DECL in which the dispatch code is generated. */
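/* A sketch of the resolver body this builds (illustrative, in rough C):

     __builtin_cpu_init ();
     if (<predicates of the highest-priority version hold>)
       return &that_version;
     ... one such test per remaining version, in priority order ...
     return &default_version;  */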
32320
32321 static int
32322 dispatch_function_versions (tree dispatch_decl,
32323 void *fndecls_p,
32324 basic_block *empty_bb)
32325 {
32326 tree default_decl;
32327 gimple *ifunc_cpu_init_stmt;
32328 gimple_seq gseq;
32329 int ix;
32330 tree ele;
32331 vec<tree> *fndecls;
32332 unsigned int num_versions = 0;
32333 unsigned int actual_versions = 0;
32334 unsigned int i;
32335
32336 struct _function_version_info
32337 {
32338 tree version_decl;
32339 tree predicate_chain;
32340 unsigned int dispatch_priority;
32341 }*function_version_info;
32342
32343 gcc_assert (dispatch_decl != NULL
32344 && fndecls_p != NULL
32345 && empty_bb != NULL);
32346
32347 /* fndecls_p is actually a vector. */
32348 fndecls = static_cast<vec<tree> *> (fndecls_p);
32349
32350 /* At least one more version other than the default. */
32351 num_versions = fndecls->length ();
32352 gcc_assert (num_versions >= 2);
32353
32354 function_version_info = (struct _function_version_info *)
32355 XNEWVEC (struct _function_version_info, (num_versions - 1));
32356
32357 /* The first version in the vector is the default decl. */
32358 default_decl = (*fndecls)[0];
32359
32360 push_cfun (DECL_STRUCT_FUNCTION (dispatch_decl));
32361
32362 gseq = bb_seq (*empty_bb);
32363 /* Function version dispatch is via IFUNC. IFUNC resolvers fire before
32364 constructors, so explicitly call __builtin_cpu_init here. */
32365 ifunc_cpu_init_stmt = gimple_build_call_vec (
32366 ix86_builtins [(int) IX86_BUILTIN_CPU_INIT], vNULL);
32367 gimple_seq_add_stmt (&gseq, ifunc_cpu_init_stmt);
32368 gimple_set_bb (ifunc_cpu_init_stmt, *empty_bb);
32369 set_bb_seq (*empty_bb, gseq);
32370
32371 pop_cfun ();
32372
32373
32374 for (ix = 1; fndecls->iterate (ix, &ele); ++ix)
32375 {
32376 tree version_decl = ele;
32377 tree predicate_chain = NULL_TREE;
32378 unsigned int priority;
32379 /* Get attribute string, parse it and find the right predicate decl.
32380 The predicate function could be a lengthy combination of many
32381 features, like arch-type and various isa-variants. */
32382 priority = get_builtin_code_for_version (version_decl,
32383 &predicate_chain);
32384
32385 if (predicate_chain == NULL_TREE)
32386 continue;
32387
32388 function_version_info [actual_versions].version_decl = version_decl;
32389 function_version_info [actual_versions].predicate_chain
32390 = predicate_chain;
32391 function_version_info [actual_versions].dispatch_priority = priority;
32392 actual_versions++;
32393 }
32394
32395 /* Sort the versions according to descending order of dispatch priority. The
32396 priority is based on the ISA. This is not a perfect solution. There
32397 could still be ambiguity. If more than one function version is suitable
32398 to execute, which one should be dispatched? In the future, allow the user
32399 to specify a dispatch priority next to the version. */
32400 qsort (function_version_info, actual_versions,
32401 sizeof (struct _function_version_info), feature_compare);
32402
32403 for (i = 0; i < actual_versions; ++i)
32404 *empty_bb = add_condition_to_bb (dispatch_decl,
32405 function_version_info[i].version_decl,
32406 function_version_info[i].predicate_chain,
32407 *empty_bb);
32408
32409 /* Dispatch the default version at the end. */
32410 *empty_bb = add_condition_to_bb (dispatch_decl, default_decl,
32411 NULL, *empty_bb);
32412
32413 free (function_version_info);
32414 return 0;
32415 }
32416
32417 /* Comparator function to be used in qsort routine to sort attribute
32418 specification strings to "target". */
32419
32420 static int
32421 attr_strcmp (const void *v1, const void *v2)
32422 {
32423 const char *c1 = *(char *const*)v1;
32424 const char *c2 = *(char *const*)v2;
32425 return strcmp (c1, c2);
32426 }
32427
32428 /* ARGLIST is the argument to target attribute. This function tokenizes
32429 the comma separated arguments, sorts them and returns a string which
32430 is a unique identifier for the comma separated arguments. It also
32431 replaces non-identifier characters "=,-" with "_". */
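/* For instance (illustrative), the attribute arguments "avx,arch=core2"
   become the tokens "avx" and "arch_core2", which sort and rejoin into
   the string "arch_core2_avx".  */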
32432
32433 static char *
32434 sorted_attr_string (tree arglist)
32435 {
32436 tree arg;
32437 size_t str_len_sum = 0;
32438 char **args = NULL;
32439 char *attr_str, *ret_str;
32440 char *attr = NULL;
32441 unsigned int argnum = 1;
32442 unsigned int i;
32443
32444 for (arg = arglist; arg; arg = TREE_CHAIN (arg))
32445 {
32446 const char *str = TREE_STRING_POINTER (TREE_VALUE (arg));
32447 size_t len = strlen (str);
32448 str_len_sum += len + 1;
32449 if (arg != arglist)
32450 argnum++;
32451 for (i = 0; i < strlen (str); i++)
32452 if (str[i] == ',')
32453 argnum++;
32454 }
32455
32456 attr_str = XNEWVEC (char, str_len_sum);
32457 str_len_sum = 0;
32458 for (arg = arglist; arg; arg = TREE_CHAIN (arg))
32459 {
32460 const char *str = TREE_STRING_POINTER (TREE_VALUE (arg));
32461 size_t len = strlen (str);
32462 memcpy (attr_str + str_len_sum, str, len);
32463 attr_str[str_len_sum + len] = TREE_CHAIN (arg) ? ',' : '\0';
32464 str_len_sum += len + 1;
32465 }
32466
32467 /* Replace "=,-" with "_". */
32468 for (i = 0; i < strlen (attr_str); i++)
32469 if (attr_str[i] == '=' || attr_str[i]== '-')
32470 attr_str[i] = '_';
32471
32472 if (argnum == 1)
32473 return attr_str;
32474
32475 args = XNEWVEC (char *, argnum);
32476
32477 i = 0;
32478 attr = strtok (attr_str, ",");
32479 while (attr != NULL)
32480 {
32481 args[i] = attr;
32482 i++;
32483 attr = strtok (NULL, ",");
32484 }
32485
32486 qsort (args, argnum, sizeof (char *), attr_strcmp);
32487
32488 ret_str = XNEWVEC (char, str_len_sum);
32489 str_len_sum = 0;
32490 for (i = 0; i < argnum; i++)
32491 {
32492 size_t len = strlen (args[i]);
32493 memcpy (ret_str + str_len_sum, args[i], len);
32494 ret_str[str_len_sum + len] = i < argnum - 1 ? '_' : '\0';
32495 str_len_sum += len + 1;
32496 }
32497
32498 XDELETEVEC (args);
32499 XDELETEVEC (attr_str);
32500 return ret_str;
32501 }
32502
32503 /* This function changes the assembler name for functions that are
32504 versions. If DECL is a function version and has a "target"
32505 attribute, it appends the attribute string to its assembler name. */
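/* For example (illustrative), a C function foo declared with
   __attribute__ ((target ("avx"))) gets the assembler name "foo.avx",
   while the "default" version keeps its original name.  */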
32506
32507 static tree
32508 ix86_mangle_function_version_assembler_name (tree decl, tree id)
32509 {
32510 tree version_attr;
32511 const char *orig_name, *version_string;
32512 char *attr_str, *assembler_name;
32513
32514 if (DECL_DECLARED_INLINE_P (decl)
32515 && lookup_attribute ("gnu_inline",
32516 DECL_ATTRIBUTES (decl)))
32517 error_at (DECL_SOURCE_LOCATION (decl),
32518 "Function versions cannot be marked as gnu_inline,"
32519 " bodies have to be generated");
32520
32521 if (DECL_VIRTUAL_P (decl)
32522 || DECL_VINDEX (decl))
32523 sorry ("Virtual function multiversioning not supported");
32524
32525 version_attr = lookup_attribute ("target", DECL_ATTRIBUTES (decl));
32526
32527 /* target attribute string cannot be NULL. */
32528 gcc_assert (version_attr != NULL_TREE);
32529
32530 orig_name = IDENTIFIER_POINTER (id);
32531 version_string
32532 = TREE_STRING_POINTER (TREE_VALUE (TREE_VALUE (version_attr)));
32533
32534 if (strcmp (version_string, "default") == 0)
32535 return id;
32536
32537 attr_str = sorted_attr_string (TREE_VALUE (version_attr));
32538 assembler_name = XNEWVEC (char, strlen (orig_name) + strlen (attr_str) + 2);
32539
32540 sprintf (assembler_name, "%s.%s", orig_name, attr_str);
32541
32542 /* Allow assembler name to be modified if already set. */
32543 if (DECL_ASSEMBLER_NAME_SET_P (decl))
32544 SET_DECL_RTL (decl, NULL);
32545
32546 tree ret = get_identifier (assembler_name);
32547 XDELETEVEC (attr_str);
32548 XDELETEVEC (assembler_name);
32549 return ret;
32550 }
32551
32552 /* This function returns true if FN1 and FN2 are versions of the same function,
32553 that is, the target strings of the function decls are different. This assumes
32554 that FN1 and FN2 have the same signature. */
32555
32556 static bool
32557 ix86_function_versions (tree fn1, tree fn2)
32558 {
32559 tree attr1, attr2;
32560 char *target1, *target2;
32561 bool result;
32562
32563 if (TREE_CODE (fn1) != FUNCTION_DECL
32564 || TREE_CODE (fn2) != FUNCTION_DECL)
32565 return false;
32566
32567 attr1 = lookup_attribute ("target", DECL_ATTRIBUTES (fn1));
32568 attr2 = lookup_attribute ("target", DECL_ATTRIBUTES (fn2));
32569
32570 /* At least one function decl should have the target attribute specified. */
32571 if (attr1 == NULL_TREE && attr2 == NULL_TREE)
32572 return false;
32573
32574 /* Diagnose missing target attribute if one of the decls is already
32575 multi-versioned. */
32576 if (attr1 == NULL_TREE || attr2 == NULL_TREE)
32577 {
32578 if (DECL_FUNCTION_VERSIONED (fn1) || DECL_FUNCTION_VERSIONED (fn2))
32579 {
32580 if (attr2 != NULL_TREE)
32581 {
32582 std::swap (fn1, fn2);
32583 attr1 = attr2;
32584 }
32585 error_at (DECL_SOURCE_LOCATION (fn2),
32586 "missing %<target%> attribute for multi-versioned %D",
32587 fn2);
32588 inform (DECL_SOURCE_LOCATION (fn1),
32589 "previous declaration of %D", fn1);
32590 /* Prevent diagnosing of the same error multiple times. */
32591 DECL_ATTRIBUTES (fn2)
32592 = tree_cons (get_identifier ("target"),
32593 copy_node (TREE_VALUE (attr1)),
32594 DECL_ATTRIBUTES (fn2));
32595 }
32596 return false;
32597 }
32598
32599 target1 = sorted_attr_string (TREE_VALUE (attr1));
32600 target2 = sorted_attr_string (TREE_VALUE (attr2));
32601
32602 /* The sorted target strings must be different for fn1 and fn2
32603 to be versions. */
32604 if (strcmp (target1, target2) == 0)
32605 result = false;
32606 else
32607 result = true;
32608
32609 XDELETEVEC (target1);
32610 XDELETEVEC (target2);
32611
32612 return result;
32613 }
32614
32615 static tree
32616 ix86_mangle_decl_assembler_name (tree decl, tree id)
32617 {
32618 /* For function version, add the target suffix to the assembler name. */
32619 if (TREE_CODE (decl) == FUNCTION_DECL
32620 && DECL_FUNCTION_VERSIONED (decl))
32621 id = ix86_mangle_function_version_assembler_name (decl, id);
32622 #ifdef SUBTARGET_MANGLE_DECL_ASSEMBLER_NAME
32623 id = SUBTARGET_MANGLE_DECL_ASSEMBLER_NAME (decl, id);
32624 #endif
32625
32626 return id;
32627 }
32628
32629 /* Return a new name by appending SUFFIX to the DECL name. If make_unique
32630 is true, append the full path name of the source file. */
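/* Illustrative example: for a function foo, make_name (decl, "resolver",
   false) yields "foo.resolver"; when MAKE_UNIQUE is true a file-scope
   unique component is inserted, giving roughly "foo.<unique>.resolver".  */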
32631
32632 static char *
32633 make_name (tree decl, const char *suffix, bool make_unique)
32634 {
32635 char *global_var_name;
32636 int name_len;
32637 const char *name;
32638 const char *unique_name = NULL;
32639
32640 name = IDENTIFIER_POINTER (DECL_ASSEMBLER_NAME (decl));
32641
32642 /* Get a unique name that can be used globally without any chances
32643 of collision at link time. */
32644 if (make_unique)
32645 unique_name = IDENTIFIER_POINTER (get_file_function_name ("\0"));
32646
32647 name_len = strlen (name) + strlen (suffix) + 2;
32648
32649 if (make_unique)
32650 name_len += strlen (unique_name) + 1;
32651 global_var_name = XNEWVEC (char, name_len);
32652
32653 /* Use '.' to concatenate names as it is demangler friendly. */
32654 if (make_unique)
32655 snprintf (global_var_name, name_len, "%s.%s.%s", name, unique_name,
32656 suffix);
32657 else
32658 snprintf (global_var_name, name_len, "%s.%s", name, suffix);
32659
32660 return global_var_name;
32661 }
32662
32663 #if defined (ASM_OUTPUT_TYPE_DIRECTIVE)
32664
32665 /* Make a dispatcher declaration for the multi-versioned function DECL.
32666 Calls to DECL will be replaced with calls to the dispatcher
32667 by the front-end. Return the decl created. */
32668
32669 static tree
32670 make_dispatcher_decl (const tree decl)
32671 {
32672 tree func_decl;
32673 char *func_name;
32674 tree fn_type, func_type;
32675 bool is_uniq = false;
32676
32677 if (TREE_PUBLIC (decl) == 0)
32678 is_uniq = true;
32679
32680 func_name = make_name (decl, "ifunc", is_uniq);
32681
32682 fn_type = TREE_TYPE (decl);
32683 func_type = build_function_type (TREE_TYPE (fn_type),
32684 TYPE_ARG_TYPES (fn_type));
32685
32686 func_decl = build_fn_decl (func_name, func_type);
32687 XDELETEVEC (func_name);
32688 TREE_USED (func_decl) = 1;
32689 DECL_CONTEXT (func_decl) = NULL_TREE;
32690 DECL_INITIAL (func_decl) = error_mark_node;
32691 DECL_ARTIFICIAL (func_decl) = 1;
32692 /* Mark this func as external, the resolver will flip it again if
32693 it gets generated. */
32694 DECL_EXTERNAL (func_decl) = 1;
32695 /* IFUNCs have to be externally visible. */
32696 TREE_PUBLIC (func_decl) = 1;
32697
32698 return func_decl;
32699 }
32700
32701 #endif
32702
32703 /* Returns true if DECL is multi-versioned and is the default function,
32704 that is, it is not tagged with a target-specific optimization. */
32705
32706 static bool
32707 is_function_default_version (const tree decl)
32708 {
32709 if (TREE_CODE (decl) != FUNCTION_DECL
32710 || !DECL_FUNCTION_VERSIONED (decl))
32711 return false;
32712 tree attr = lookup_attribute ("target", DECL_ATTRIBUTES (decl));
32713 gcc_assert (attr);
32714 attr = TREE_VALUE (TREE_VALUE (attr));
32715 return (TREE_CODE (attr) == STRING_CST
32716 && strcmp (TREE_STRING_POINTER (attr), "default") == 0);
32717 }
32718
32719 /* Make a dispatcher declaration for the multi-versioned function DECL.
32720 Calls to DECL will be replaced with calls to the dispatcher
32721 by the front-end. Returns the decl of the dispatcher function. */
32722
32723 static tree
32724 ix86_get_function_versions_dispatcher (void *decl)
32725 {
32726 tree fn = (tree) decl;
32727 struct cgraph_node *node = NULL;
32728 struct cgraph_node *default_node = NULL;
32729 struct cgraph_function_version_info *node_v = NULL;
32730 struct cgraph_function_version_info *first_v = NULL;
32731
32732 tree dispatch_decl = NULL;
32733
32734 struct cgraph_function_version_info *default_version_info = NULL;
32735
32736 gcc_assert (fn != NULL && DECL_FUNCTION_VERSIONED (fn));
32737
32738 node = cgraph_node::get (fn);
32739 gcc_assert (node != NULL);
32740
32741 node_v = node->function_version ();
32742 gcc_assert (node_v != NULL);
32743
32744 if (node_v->dispatcher_resolver != NULL)
32745 return node_v->dispatcher_resolver;
32746
32747 /* Find the default version and make it the first node. */
32748 first_v = node_v;
32749 /* Go to the beginning of the chain. */
32750 while (first_v->prev != NULL)
32751 first_v = first_v->prev;
32752 default_version_info = first_v;
32753 while (default_version_info != NULL)
32754 {
32755 if (is_function_default_version
32756 (default_version_info->this_node->decl))
32757 break;
32758 default_version_info = default_version_info->next;
32759 }
32760
32761 /* If there is no default node, just return NULL. */
32762 if (default_version_info == NULL)
32763 return NULL;
32764
32765 /* Make default info the first node. */
32766 if (first_v != default_version_info)
32767 {
32768 default_version_info->prev->next = default_version_info->next;
32769 if (default_version_info->next)
32770 default_version_info->next->prev = default_version_info->prev;
32771 first_v->prev = default_version_info;
32772 default_version_info->next = first_v;
32773 default_version_info->prev = NULL;
32774 }
32775
32776 default_node = default_version_info->this_node;
32777
32778 #if defined (ASM_OUTPUT_TYPE_DIRECTIVE)
32779 if (targetm.has_ifunc_p ())
32780 {
32781 struct cgraph_function_version_info *it_v = NULL;
32782 struct cgraph_node *dispatcher_node = NULL;
32783 struct cgraph_function_version_info *dispatcher_version_info = NULL;
32784
32785 /* Right now, the dispatching is done via ifunc. */
32786 dispatch_decl = make_dispatcher_decl (default_node->decl);
32787
32788 dispatcher_node = cgraph_node::get_create (dispatch_decl);
32789 gcc_assert (dispatcher_node != NULL);
32790 dispatcher_node->dispatcher_function = 1;
32791 dispatcher_version_info
32792 = dispatcher_node->insert_new_function_version ();
32793 dispatcher_version_info->next = default_version_info;
32794 dispatcher_node->definition = 1;
32795
32796 /* Set the dispatcher for all the versions. */
32797 it_v = default_version_info;
32798 while (it_v != NULL)
32799 {
32800 it_v->dispatcher_resolver = dispatch_decl;
32801 it_v = it_v->next;
32802 }
32803 }
32804 else
32805 #endif
32806 {
32807 error_at (DECL_SOURCE_LOCATION (default_node->decl),
32808 "multiversioning needs ifunc which is not supported "
32809 "on this target");
32810 }
32811
32812 return dispatch_decl;
32813 }
32814
32815 /* Make the resolver function decl to dispatch the versions of
32816 a multi-versioned function, DEFAULT_DECL. Create an
32817 empty basic block in the resolver and store the pointer in
32818 EMPTY_BB. Return the decl of the resolver function. */
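/* Illustrative example: for a public versioned function foo, the resolver
   created here is named "foo.resolver" and the dispatcher (built earlier,
   roughly "foo.ifunc") is tagged __attribute__ ((ifunc ("foo.resolver")))
   so that the dynamic loader runs the resolver to pick a version.  */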
32819
32820 static tree
32821 make_resolver_func (const tree default_decl,
32822 const tree dispatch_decl,
32823 basic_block *empty_bb)
32824 {
32825 char *resolver_name;
32826 tree decl, type, decl_name, t;
32827 bool is_uniq = false;
32828
32829 /* IFUNCs have to be globally visible. So, if the default_decl is
32830 not, then the name of the IFUNC should be made unique. */
32831 if (TREE_PUBLIC (default_decl) == 0)
32832 is_uniq = true;
32833
32834 /* Append the filename to the resolver function if the versions are
32835 not externally visible. This is because the resolver function has
32836 to be externally visible for the loader to find it. So, appending
32837 the filename will prevent conflicts with a resolver function from
32838 another module which is based on the same version name. */
32839 resolver_name = make_name (default_decl, "resolver", is_uniq);
32840
32841 /* The resolver function should return a (void *). */
32842 type = build_function_type_list (ptr_type_node, NULL_TREE);
32843
32844 decl = build_fn_decl (resolver_name, type);
32845 decl_name = get_identifier (resolver_name);
32846 SET_DECL_ASSEMBLER_NAME (decl, decl_name);
32847
32848 DECL_NAME (decl) = decl_name;
32849 TREE_USED (decl) = 1;
32850 DECL_ARTIFICIAL (decl) = 1;
32851 DECL_IGNORED_P (decl) = 0;
32852 /* IFUNC resolvers have to be externally visible. */
32853 TREE_PUBLIC (decl) = 1;
32854 DECL_UNINLINABLE (decl) = 1;
32855
32856 /* Resolver is not external, body is generated. */
32857 DECL_EXTERNAL (decl) = 0;
32858 DECL_EXTERNAL (dispatch_decl) = 0;
32859
32860 DECL_CONTEXT (decl) = NULL_TREE;
32861 DECL_INITIAL (decl) = make_node (BLOCK);
32862 DECL_STATIC_CONSTRUCTOR (decl) = 0;
32863
32864 if (DECL_COMDAT_GROUP (default_decl)
32865 || TREE_PUBLIC (default_decl))
32866 {
32867 /* In this case, each translation unit with a call to this
32868 versioned function will put out a resolver. Ensure it
32869 is comdat to keep just one copy. */
32870 DECL_COMDAT (decl) = 1;
32871 make_decl_one_only (decl, DECL_ASSEMBLER_NAME (decl));
32872 }
32873 /* Build result decl and add to function_decl. */
32874 t = build_decl (UNKNOWN_LOCATION, RESULT_DECL, NULL_TREE, ptr_type_node);
32875 DECL_ARTIFICIAL (t) = 1;
32876 DECL_IGNORED_P (t) = 1;
32877 DECL_RESULT (decl) = t;
32878
32879 gimplify_function_tree (decl);
32880 push_cfun (DECL_STRUCT_FUNCTION (decl));
32881 *empty_bb = init_lowered_empty_function (decl, false, 0);
32882
32883 cgraph_node::add_new_function (decl, true);
32884 symtab->call_cgraph_insertion_hooks (cgraph_node::get_create (decl));
32885
32886 pop_cfun ();
32887
32888 gcc_assert (dispatch_decl != NULL);
32889 /* Mark dispatch_decl as "ifunc" with resolver as resolver_name. */
32890 DECL_ATTRIBUTES (dispatch_decl)
32891 = make_attribute ("ifunc", resolver_name, DECL_ATTRIBUTES (dispatch_decl));
32892
32893 /* Create the alias for dispatch to resolver here. */
32894 /*cgraph_create_function_alias (dispatch_decl, decl);*/
32895 cgraph_node::create_same_body_alias (dispatch_decl, decl);
32896 XDELETEVEC (resolver_name);
32897 return decl;
32898 }
32899
32900 /* Generate the dispatching code body to dispatch multi-versioned function
32901 DECL. The target hook is called to process the "target" attributes and
32902 provide the code to dispatch the right function at run-time. NODE points
32903 to the dispatcher decl whose body will be created. */
32904
32905 static tree
32906 ix86_generate_version_dispatcher_body (void *node_p)
32907 {
32908 tree resolver_decl;
32909 basic_block empty_bb;
32910 tree default_ver_decl;
32911 struct cgraph_node *versn;
32912 struct cgraph_node *node;
32913
32914 struct cgraph_function_version_info *node_version_info = NULL;
32915 struct cgraph_function_version_info *versn_info = NULL;
32916
32917 node = (cgraph_node *)node_p;
32918
32919 node_version_info = node->function_version ();
32920 gcc_assert (node->dispatcher_function
32921 && node_version_info != NULL);
32922
32923 if (node_version_info->dispatcher_resolver)
32924 return node_version_info->dispatcher_resolver;
32925
32926 /* The first version in the chain corresponds to the default version. */
32927 default_ver_decl = node_version_info->next->this_node->decl;
32928
32929 /* node is going to be an alias, so remove the finalized bit. */
32930 node->definition = false;
32931
32932 resolver_decl = make_resolver_func (default_ver_decl,
32933 node->decl, &empty_bb);
32934
32935 node_version_info->dispatcher_resolver = resolver_decl;
32936
32937 push_cfun (DECL_STRUCT_FUNCTION (resolver_decl));
32938
32939 auto_vec<tree, 2> fn_ver_vec;
32940
32941 for (versn_info = node_version_info->next; versn_info;
32942 versn_info = versn_info->next)
32943 {
32944 versn = versn_info->this_node;
32945 /* Check for virtual functions here again, as by this time it should
32946 have been determined if this function needs a vtable index or
32947 not. This happens for methods in derived classes that override
32948 virtual methods in base classes but are not explicitly marked as
32949 virtual. */
32950 if (DECL_VINDEX (versn->decl))
32951 sorry ("Virtual function multiversioning not supported");
32952
32953 fn_ver_vec.safe_push (versn->decl);
32954 }
32955
32956 dispatch_function_versions (resolver_decl, &fn_ver_vec, &empty_bb);
32957 cgraph_edge::rebuild_edges ();
32958 pop_cfun ();
32959 return resolver_decl;
32960 }
32961 /* This builds the processor_model struct type defined in
32962 libgcc/config/i386/cpuinfo.c */
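/* That struct is, roughly (see libgcc/config/i386/cpuinfo.c):

     struct __processor_model
     {
       unsigned int __cpu_vendor;
       unsigned int __cpu_type;
       unsigned int __cpu_subtype;
       unsigned int __cpu_features[1];
     };  */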
32963
32964 static tree
32965 build_processor_model_struct (void)
32966 {
32967 const char *field_name[] = {"__cpu_vendor", "__cpu_type", "__cpu_subtype",
32968 "__cpu_features"};
32969 tree field = NULL_TREE, field_chain = NULL_TREE;
32970 int i;
32971 tree type = make_node (RECORD_TYPE);
32972
32973 /* The first 3 fields are unsigned int. */
32974 for (i = 0; i < 3; ++i)
32975 {
32976 field = build_decl (UNKNOWN_LOCATION, FIELD_DECL,
32977 get_identifier (field_name[i]), unsigned_type_node);
32978 if (field_chain != NULL_TREE)
32979 DECL_CHAIN (field) = field_chain;
32980 field_chain = field;
32981 }
32982
32983 /* The last field is an array of unsigned integers of size one. */
32984 field = build_decl (UNKNOWN_LOCATION, FIELD_DECL,
32985 get_identifier (field_name[3]),
32986 build_array_type (unsigned_type_node,
32987 build_index_type (size_one_node)));
32988 if (field_chain != NULL_TREE)
32989 DECL_CHAIN (field) = field_chain;
32990 field_chain = field;
32991
32992 finish_builtin_struct (type, "__processor_model", field_chain, NULL_TREE);
32993 return type;
32994 }
32995
32996 /* Returns an extern, comdat VAR_DECL of type TYPE and name NAME. */
32997
32998 static tree
32999 make_var_decl (tree type, const char *name)
33000 {
33001 tree new_decl;
33002
33003 new_decl = build_decl (UNKNOWN_LOCATION,
33004 VAR_DECL,
33005 get_identifier(name),
33006 type);
33007
33008 DECL_EXTERNAL (new_decl) = 1;
33009 TREE_STATIC (new_decl) = 1;
33010 TREE_PUBLIC (new_decl) = 1;
33011 DECL_INITIAL (new_decl) = 0;
33012 DECL_ARTIFICIAL (new_decl) = 0;
33013 DECL_PRESERVE_P (new_decl) = 1;
33014
33015 make_decl_one_only (new_decl, DECL_ASSEMBLER_NAME (new_decl));
33016 assemble_variable (new_decl, 0, 0, 0);
33017
33018 return new_decl;
33019 }
33020
33021 /* FNDECL is a __builtin_cpu_is or a __builtin_cpu_supports call that is folded
33022 into an integer defined in libgcc/config/i386/cpuinfo.c */
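/* Roughly speaking (a sketch, not the exact trees built below),
   __builtin_cpu_is ("amd") folds to

       (int) (__cpu_model.__cpu_vendor == M_AMD)

   and __builtin_cpu_supports ("avx") folds to

       (int) (__cpu_model.__cpu_features[0] & (1 << F_AVX)).  */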
33023
33024 static tree
33025 fold_builtin_cpu (tree fndecl, tree *args)
33026 {
33027 unsigned int i;
33028 enum ix86_builtins fn_code = (enum ix86_builtins)
33029 DECL_FUNCTION_CODE (fndecl);
33030 tree param_string_cst = NULL;
33031
33032 /* This is the order of bit-fields in __processor_features in cpuinfo.c */
33033 enum processor_features
33034 {
33035 F_CMOV = 0,
33036 F_MMX,
33037 F_POPCNT,
33038 F_SSE,
33039 F_SSE2,
33040 F_SSE3,
33041 F_SSSE3,
33042 F_SSE4_1,
33043 F_SSE4_2,
33044 F_AVX,
33045 F_AVX2,
33046 F_SSE4_A,
33047 F_FMA4,
33048 F_XOP,
33049 F_FMA,
33050 F_AVX512F,
33051 F_BMI,
33052 F_BMI2,
33053 F_AES,
33054 F_PCLMUL,
33055 F_AVX512VL,
33056 F_AVX512BW,
33057 F_AVX512DQ,
33058 F_AVX512CD,
33059 F_AVX512ER,
33060 F_AVX512PF,
33061 F_AVX512VBMI,
33062 F_AVX512IFMA,
33063 F_MAX
33064 };
33065
33066 /* These are the values for vendor types and cpu types and subtypes
33067 in cpuinfo.c. Cpu type and subtype values have the corresponding
33068 start value subtracted from them. */
33069 enum processor_model
33070 {
33071 M_INTEL = 1,
33072 M_AMD,
33073 M_CPU_TYPE_START,
33074 M_INTEL_BONNELL,
33075 M_INTEL_CORE2,
33076 M_INTEL_COREI7,
33077 M_AMDFAM10H,
33078 M_AMDFAM15H,
33079 M_INTEL_SILVERMONT,
33080 M_INTEL_KNL,
33081 M_AMD_BTVER1,
33082 M_AMD_BTVER2,
33083 M_CPU_SUBTYPE_START,
33084 M_INTEL_COREI7_NEHALEM,
33085 M_INTEL_COREI7_WESTMERE,
33086 M_INTEL_COREI7_SANDYBRIDGE,
33087 M_AMDFAM10H_BARCELONA,
33088 M_AMDFAM10H_SHANGHAI,
33089 M_AMDFAM10H_ISTANBUL,
33090 M_AMDFAM15H_BDVER1,
33091 M_AMDFAM15H_BDVER2,
33092 M_AMDFAM15H_BDVER3,
33093 M_AMDFAM15H_BDVER4,
33094 M_AMDFAM17H_ZNVER1,
33095 M_INTEL_COREI7_IVYBRIDGE,
33096 M_INTEL_COREI7_HASWELL,
33097 M_INTEL_COREI7_BROADWELL,
33098 M_INTEL_COREI7_SKYLAKE,
33099 M_INTEL_COREI7_SKYLAKE_AVX512
33100 };
33101
33102 static struct _arch_names_table
33103 {
33104 const char *const name;
33105 const enum processor_model model;
33106 }
33107 const arch_names_table[] =
33108 {
33109 {"amd", M_AMD},
33110 {"intel", M_INTEL},
33111 {"atom", M_INTEL_BONNELL},
33112 {"slm", M_INTEL_SILVERMONT},
33113 {"core2", M_INTEL_CORE2},
33114 {"corei7", M_INTEL_COREI7},
33115 {"nehalem", M_INTEL_COREI7_NEHALEM},
33116 {"westmere", M_INTEL_COREI7_WESTMERE},
33117 {"sandybridge", M_INTEL_COREI7_SANDYBRIDGE},
33118 {"ivybridge", M_INTEL_COREI7_IVYBRIDGE},
33119 {"haswell", M_INTEL_COREI7_HASWELL},
33120 {"broadwell", M_INTEL_COREI7_BROADWELL},
33121 {"skylake", M_INTEL_COREI7_SKYLAKE},
33122 {"skylake-avx512", M_INTEL_COREI7_SKYLAKE_AVX512},
33123 {"bonnell", M_INTEL_BONNELL},
33124 {"silvermont", M_INTEL_SILVERMONT},
33125 {"knl", M_INTEL_KNL},
33126 {"amdfam10h", M_AMDFAM10H},
33127 {"barcelona", M_AMDFAM10H_BARCELONA},
33128 {"shanghai", M_AMDFAM10H_SHANGHAI},
33129 {"istanbul", M_AMDFAM10H_ISTANBUL},
33130 {"btver1", M_AMD_BTVER1},
33131 {"amdfam15h", M_AMDFAM15H},
33132 {"bdver1", M_AMDFAM15H_BDVER1},
33133 {"bdver2", M_AMDFAM15H_BDVER2},
33134 {"bdver3", M_AMDFAM15H_BDVER3},
33135 {"bdver4", M_AMDFAM15H_BDVER4},
33136 {"btver2", M_AMD_BTVER2},
33137 {"znver1", M_AMDFAM17H_ZNVER1},
33138 };
33139
33140 static struct _isa_names_table
33141 {
33142 const char *const name;
33143 const enum processor_features feature;
33144 }
33145 const isa_names_table[] =
33146 {
33147 {"cmov", F_CMOV},
33148 {"mmx", F_MMX},
33149 {"popcnt", F_POPCNT},
33150 {"sse", F_SSE},
33151 {"sse2", F_SSE2},
33152 {"sse3", F_SSE3},
33153 {"ssse3", F_SSSE3},
33154 {"sse4a", F_SSE4_A},
33155 {"sse4.1", F_SSE4_1},
33156 {"sse4.2", F_SSE4_2},
33157 {"avx", F_AVX},
33158 {"fma4", F_FMA4},
33159 {"xop", F_XOP},
33160 {"fma", F_FMA},
33161 {"avx2", F_AVX2},
33162 {"avx512f", F_AVX512F},
33163 {"bmi", F_BMI},
33164 {"bmi2", F_BMI2},
33165 {"aes", F_AES},
33166 {"pclmul", F_PCLMUL},
33167 {"avx512vl",F_AVX512VL},
33168 {"avx512bw",F_AVX512BW},
33169 {"avx512dq",F_AVX512DQ},
33170 {"avx512cd",F_AVX512CD},
33171 {"avx512er",F_AVX512ER},
33172 {"avx512pf",F_AVX512PF},
33173 {"avx512vbmi",F_AVX512VBMI},
33174 {"avx512ifma",F_AVX512IFMA},
33175 };
33176
33177 tree __processor_model_type = build_processor_model_struct ();
33178 tree __cpu_model_var = make_var_decl (__processor_model_type,
33179 "__cpu_model");
33180
33181
33182 varpool_node::add (__cpu_model_var);
33183
33184 gcc_assert ((args != NULL) && (*args != NULL));
33185
33186 param_string_cst = *args;
33187 while (param_string_cst
33188 && TREE_CODE (param_string_cst) != STRING_CST)
33189 {
33190 /* *args must be an expr that can contain other EXPRs leading to a
33191 STRING_CST. */
33192 if (!EXPR_P (param_string_cst))
33193 {
33194 error ("Parameter to builtin must be a string constant or literal");
33195 return integer_zero_node;
33196 }
33197 param_string_cst = TREE_OPERAND (EXPR_CHECK (param_string_cst), 0);
33198 }
33199
33200 gcc_assert (param_string_cst);
33201
33202 if (fn_code == IX86_BUILTIN_CPU_IS)
33203 {
33204 tree ref;
33205 tree field;
33206 tree final;
33207
33208 unsigned int field_val = 0;
33209 unsigned int NUM_ARCH_NAMES
33210 = sizeof (arch_names_table) / sizeof (struct _arch_names_table);
33211
33212 for (i = 0; i < NUM_ARCH_NAMES; i++)
33213 if (strcmp (arch_names_table[i].name,
33214 TREE_STRING_POINTER (param_string_cst)) == 0)
33215 break;
33216
33217 if (i == NUM_ARCH_NAMES)
33218 {
33219 error ("Parameter to builtin not valid: %s",
33220 TREE_STRING_POINTER (param_string_cst));
33221 return integer_zero_node;
33222 }
33223
33224 field = TYPE_FIELDS (__processor_model_type);
33225 field_val = arch_names_table[i].model;
33226
33227 /* CPU types are stored in the next field. */
33228 if (field_val > M_CPU_TYPE_START
33229 && field_val < M_CPU_SUBTYPE_START)
33230 {
33231 field = DECL_CHAIN (field);
33232 field_val -= M_CPU_TYPE_START;
33233 }
33234
33235 /* CPU subtypes are stored in the next field. */
33236 if (field_val > M_CPU_SUBTYPE_START)
33237 {
33238 field = DECL_CHAIN (DECL_CHAIN (field));
33239 field_val -= M_CPU_SUBTYPE_START;
33240 }
33241
33242 /* Get the appropriate field in __cpu_model. */
33243 ref = build3 (COMPONENT_REF, TREE_TYPE (field), __cpu_model_var,
33244 field, NULL_TREE);
33245
33246 /* Check the value. */
33247 final = build2 (EQ_EXPR, unsigned_type_node, ref,
33248 build_int_cstu (unsigned_type_node, field_val));
33249 return build1 (CONVERT_EXPR, integer_type_node, final);
33250 }
33251 else if (fn_code == IX86_BUILTIN_CPU_SUPPORTS)
33252 {
33253 tree ref;
33254 tree array_elt;
33255 tree field;
33256 tree final;
33257
33258 unsigned int field_val = 0;
33259 unsigned int NUM_ISA_NAMES
33260 = sizeof (isa_names_table) / sizeof (struct _isa_names_table);
33261
33262 for (i = 0; i < NUM_ISA_NAMES; i++)
33263 if (strcmp (isa_names_table[i].name,
33264 TREE_STRING_POINTER (param_string_cst)) == 0)
33265 break;
33266
33267 if (i == NUM_ISA_NAMES)
33268 {
33269 error ("Parameter to builtin not valid: %s",
33270 TREE_STRING_POINTER (param_string_cst));
33271 return integer_zero_node;
33272 }
33273
33274 field = TYPE_FIELDS (__processor_model_type);
33275 /* Get the last field, which is __cpu_features. */
33276 while (DECL_CHAIN (field))
33277 field = DECL_CHAIN (field);
33278
33279 /* Get the appropriate field: __cpu_model.__cpu_features */
33280 ref = build3 (COMPONENT_REF, TREE_TYPE (field), __cpu_model_var,
33281 field, NULL_TREE);
33282
33283 /* Access the 0th element of __cpu_features array. */
33284 array_elt = build4 (ARRAY_REF, unsigned_type_node, ref,
33285 integer_zero_node, NULL_TREE, NULL_TREE);
33286
33287 field_val = (1 << isa_names_table[i].feature);
33288 /* Return __cpu_model.__cpu_features[0] & field_val */
33289 final = build2 (BIT_AND_EXPR, unsigned_type_node, array_elt,
33290 build_int_cstu (unsigned_type_node, field_val));
33291 return build1 (CONVERT_EXPR, integer_type_node, final);
33292 }
33293 gcc_unreachable ();
33294 }
33295
33296 static tree
33297 ix86_fold_builtin (tree fndecl, int n_args,
33298 tree *args, bool ignore ATTRIBUTE_UNUSED)
33299 {
33300 if (DECL_BUILT_IN_CLASS (fndecl) == BUILT_IN_MD)
33301 {
33302 enum ix86_builtins fn_code = (enum ix86_builtins)
33303 DECL_FUNCTION_CODE (fndecl);
33304 switch (fn_code)
33305 {
33306 case IX86_BUILTIN_CPU_IS:
33307 case IX86_BUILTIN_CPU_SUPPORTS:
33308 gcc_assert (n_args == 1);
33309 return fold_builtin_cpu (fndecl, args);
33310
33311 case IX86_BUILTIN_NANQ:
33312 case IX86_BUILTIN_NANSQ:
33313 {
33314 tree type = TREE_TYPE (TREE_TYPE (fndecl));
33315 const char *str = c_getstr (*args);
33316 int quiet = fn_code == IX86_BUILTIN_NANQ;
33317 REAL_VALUE_TYPE real;
33318
33319 if (str && real_nan (&real, str, quiet, TYPE_MODE (type)))
33320 return build_real (type, real);
33321 return NULL_TREE;
33322 }
33323
33324 default:
33325 break;
33326 }
33327 }
33328
33329 #ifdef SUBTARGET_FOLD_BUILTIN
33330 return SUBTARGET_FOLD_BUILTIN (fndecl, n_args, args, ignore);
33331 #endif
33332
33333 return NULL_TREE;
33334 }
33335
33336 /* Make builtins to detect cpu type and features supported. NAME is
33337 the builtin name, CODE is the builtin code, FTYPE is the function
33338 type of the builtin, and IS_CONST says whether the builtin is const. */
33339
33340 static void
33341 make_cpu_type_builtin (const char* name, int code,
33342 enum ix86_builtin_func_type ftype, bool is_const)
33343 {
33344 tree decl;
33345 tree type;
33346
33347 type = ix86_get_builtin_func_type (ftype);
33348 decl = add_builtin_function (name, type, code, BUILT_IN_MD,
33349 NULL, NULL_TREE);
33350 gcc_assert (decl != NULL_TREE);
33351 ix86_builtins[(int) code] = decl;
33352 TREE_READONLY (decl) = is_const;
33353 }
33354
33355 /* Make builtins to get CPU type and features supported. The created
33356 builtins are :
33357
33358 __builtin_cpu_init (), to detect cpu type and features,
33359 __builtin_cpu_is ("<CPUNAME>"), to check if cpu is of type <CPUNAME>,
33360 __builtin_cpu_supports ("<FEATURE>"), to check if cpu supports <FEATURE>
33361 */
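/* A typical (illustrative) use in user code:

     __builtin_cpu_init ();
     if (__builtin_cpu_is ("corei7") && __builtin_cpu_supports ("avx2"))
       ... take the AVX2 code path ...  */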
33362
33363 static void
33364 ix86_init_platform_type_builtins (void)
33365 {
33366 make_cpu_type_builtin ("__builtin_cpu_init", IX86_BUILTIN_CPU_INIT,
33367 INT_FTYPE_VOID, false);
33368 make_cpu_type_builtin ("__builtin_cpu_is", IX86_BUILTIN_CPU_IS,
33369 INT_FTYPE_PCCHAR, true);
33370 make_cpu_type_builtin ("__builtin_cpu_supports", IX86_BUILTIN_CPU_SUPPORTS,
33371 INT_FTYPE_PCCHAR, true);
33372 }
33373
33374 /* Internal method for ix86_init_builtins. */
33375
33376 static void
33377 ix86_init_builtins_va_builtins_abi (void)
33378 {
33379 tree ms_va_ref, sysv_va_ref;
33380 tree fnvoid_va_end_ms, fnvoid_va_end_sysv;
33381 tree fnvoid_va_start_ms, fnvoid_va_start_sysv;
33382 tree fnvoid_va_copy_ms, fnvoid_va_copy_sysv;
33383 tree fnattr_ms = NULL_TREE, fnattr_sysv = NULL_TREE;
33384
33385 if (!TARGET_64BIT)
33386 return;
33387 fnattr_ms = build_tree_list (get_identifier ("ms_abi"), NULL_TREE);
33388 fnattr_sysv = build_tree_list (get_identifier ("sysv_abi"), NULL_TREE);
33389 ms_va_ref = build_reference_type (ms_va_list_type_node);
33390 sysv_va_ref =
33391 build_pointer_type (TREE_TYPE (sysv_va_list_type_node));
33392
33393 fnvoid_va_end_ms =
33394 build_function_type_list (void_type_node, ms_va_ref, NULL_TREE);
33395 fnvoid_va_start_ms =
33396 build_varargs_function_type_list (void_type_node, ms_va_ref, NULL_TREE);
33397 fnvoid_va_end_sysv =
33398 build_function_type_list (void_type_node, sysv_va_ref, NULL_TREE);
33399 fnvoid_va_start_sysv =
33400 build_varargs_function_type_list (void_type_node, sysv_va_ref,
33401 NULL_TREE);
33402 fnvoid_va_copy_ms =
33403 build_function_type_list (void_type_node, ms_va_ref, ms_va_list_type_node,
33404 NULL_TREE);
33405 fnvoid_va_copy_sysv =
33406 build_function_type_list (void_type_node, sysv_va_ref,
33407 sysv_va_ref, NULL_TREE);
33408
33409 add_builtin_function ("__builtin_ms_va_start", fnvoid_va_start_ms,
33410 BUILT_IN_VA_START, BUILT_IN_NORMAL, NULL, fnattr_ms);
33411 add_builtin_function ("__builtin_ms_va_end", fnvoid_va_end_ms,
33412 BUILT_IN_VA_END, BUILT_IN_NORMAL, NULL, fnattr_ms);
33413 add_builtin_function ("__builtin_ms_va_copy", fnvoid_va_copy_ms,
33414 BUILT_IN_VA_COPY, BUILT_IN_NORMAL, NULL, fnattr_ms);
33415 add_builtin_function ("__builtin_sysv_va_start", fnvoid_va_start_sysv,
33416 BUILT_IN_VA_START, BUILT_IN_NORMAL, NULL, fnattr_sysv);
33417 add_builtin_function ("__builtin_sysv_va_end", fnvoid_va_end_sysv,
33418 BUILT_IN_VA_END, BUILT_IN_NORMAL, NULL, fnattr_sysv);
33419 add_builtin_function ("__builtin_sysv_va_copy", fnvoid_va_copy_sysv,
33420 BUILT_IN_VA_COPY, BUILT_IN_NORMAL, NULL, fnattr_sysv);
33421 }
33422
33423 static void
33424 ix86_init_builtin_types (void)
33425 {
33426 tree float80_type_node, const_string_type_node;
33427
33428 /* The __float80 type. */
33429 float80_type_node = long_double_type_node;
33430 if (TYPE_MODE (float80_type_node) != XFmode)
33431 {
33432 if (float64x_type_node != NULL_TREE
33433 && TYPE_MODE (float64x_type_node) == XFmode)
33434 float80_type_node = float64x_type_node;
33435 else
33436 {
33437 /* The __float80 type. */
33438 float80_type_node = make_node (REAL_TYPE);
33439
33440 TYPE_PRECISION (float80_type_node) = 80;
33441 layout_type (float80_type_node);
33442 }
33443 }
33444 lang_hooks.types.register_builtin_type (float80_type_node, "__float80");
33445
33446 /* The __float128 type. The node has already been created as
33447 _Float128, so we only need to register the __float128 name for
33448 it. */
33449 lang_hooks.types.register_builtin_type (float128_type_node, "__float128");
33450
33451 const_string_type_node
33452 = build_pointer_type (build_qualified_type
33453 (char_type_node, TYPE_QUAL_CONST));
33454
33455 /* This macro is built by i386-builtin-types.awk. */
33456 DEFINE_BUILTIN_PRIMITIVE_TYPES;
33457 }
33458
33459 static void
33460 ix86_init_builtins (void)
33461 {
33462 tree ftype, decl;
33463
33464 ix86_init_builtin_types ();
33465
33466 /* Builtins to get CPU type and features. */
33467 ix86_init_platform_type_builtins ();
33468
33469 /* TFmode support builtins. */
33470 def_builtin_const (0, "__builtin_infq",
33471 FLOAT128_FTYPE_VOID, IX86_BUILTIN_INFQ);
33472 def_builtin_const (0, "__builtin_huge_valq",
33473 FLOAT128_FTYPE_VOID, IX86_BUILTIN_HUGE_VALQ);
33474
33475 ftype = ix86_get_builtin_func_type (FLOAT128_FTYPE_CONST_STRING);
33476 decl = add_builtin_function ("__builtin_nanq", ftype, IX86_BUILTIN_NANQ,
33477 BUILT_IN_MD, "nanq", NULL_TREE);
33478 TREE_READONLY (decl) = 1;
33479 ix86_builtins[(int) IX86_BUILTIN_NANQ] = decl;
33480
33481 decl = add_builtin_function ("__builtin_nansq", ftype, IX86_BUILTIN_NANSQ,
33482 BUILT_IN_MD, "nansq", NULL_TREE);
33483 TREE_READONLY (decl) = 1;
33484 ix86_builtins[(int) IX86_BUILTIN_NANSQ] = decl;
33485
33486 /* We will expand them to a normal call if SSE isn't available since
33487 they are used by libgcc. */
33488 ftype = ix86_get_builtin_func_type (FLOAT128_FTYPE_FLOAT128);
33489 decl = add_builtin_function ("__builtin_fabsq", ftype, IX86_BUILTIN_FABSQ,
33490 BUILT_IN_MD, "__fabstf2", NULL_TREE);
33491 TREE_READONLY (decl) = 1;
33492 ix86_builtins[(int) IX86_BUILTIN_FABSQ] = decl;
33493
33494 ftype = ix86_get_builtin_func_type (FLOAT128_FTYPE_FLOAT128_FLOAT128);
33495 decl = add_builtin_function ("__builtin_copysignq", ftype,
33496 IX86_BUILTIN_COPYSIGNQ, BUILT_IN_MD,
33497 "__copysigntf3", NULL_TREE);
33498 TREE_READONLY (decl) = 1;
33499 ix86_builtins[(int) IX86_BUILTIN_COPYSIGNQ] = decl;
33500
33501 ix86_init_tm_builtins ();
33502 ix86_init_mmx_sse_builtins ();
33503 ix86_init_mpx_builtins ();
33504
33505 if (TARGET_LP64)
33506 ix86_init_builtins_va_builtins_abi ();
33507
33508 #ifdef SUBTARGET_INIT_BUILTINS
33509 SUBTARGET_INIT_BUILTINS;
33510 #endif
33511 }
33512
33513 /* Return the ix86 builtin for CODE. */
33514
33515 static tree
33516 ix86_builtin_decl (unsigned code, bool)
33517 {
33518 if (code >= IX86_BUILTIN_MAX)
33519 return error_mark_node;
33520
33521 return ix86_builtins[code];
33522 }
33523
33524 /* Errors in the source file can cause expand_expr to return const0_rtx
33525 where we expect a vector. To avoid crashing, use one of the vector
33526 clear instructions. */
33527 static rtx
33528 safe_vector_operand (rtx x, machine_mode mode)
33529 {
33530 if (x == const0_rtx)
33531 x = CONST0_RTX (mode);
33532 return x;
33533 }
33534
33535 /* Fixup modeless constants to fit required mode. */
33536 static rtx
33537 fixup_modeless_constant (rtx x, machine_mode mode)
33538 {
33539 if (GET_MODE (x) == VOIDmode)
33540 x = convert_to_mode (mode, x, 1);
33541 return x;
33542 }
33543
33544 /* Subroutine of ix86_expand_builtin to take care of binop insns. */
33545
33546 static rtx
33547 ix86_expand_binop_builtin (enum insn_code icode, tree exp, rtx target)
33548 {
33549 rtx pat;
33550 tree arg0 = CALL_EXPR_ARG (exp, 0);
33551 tree arg1 = CALL_EXPR_ARG (exp, 1);
33552 rtx op0 = expand_normal (arg0);
33553 rtx op1 = expand_normal (arg1);
33554 machine_mode tmode = insn_data[icode].operand[0].mode;
33555 machine_mode mode0 = insn_data[icode].operand[1].mode;
33556 machine_mode mode1 = insn_data[icode].operand[2].mode;
33557
33558 if (VECTOR_MODE_P (mode0))
33559 op0 = safe_vector_operand (op0, mode0);
33560 if (VECTOR_MODE_P (mode1))
33561 op1 = safe_vector_operand (op1, mode1);
33562
33563 if (optimize || !target
33564 || GET_MODE (target) != tmode
33565 || !insn_data[icode].operand[0].predicate (target, tmode))
33566 target = gen_reg_rtx (tmode);
33567
33568 if (GET_MODE (op1) == SImode && mode1 == TImode)
33569 {
33570 rtx x = gen_reg_rtx (V4SImode);
33571 emit_insn (gen_sse2_loadd (x, op1));
33572 op1 = gen_lowpart (TImode, x);
33573 }
33574
33575 if (!insn_data[icode].operand[1].predicate (op0, mode0))
33576 op0 = copy_to_mode_reg (mode0, op0);
33577 if (!insn_data[icode].operand[2].predicate (op1, mode1))
33578 op1 = copy_to_mode_reg (mode1, op1);
33579
33580 pat = GEN_FCN (icode) (target, op0, op1);
33581 if (! pat)
33582 return 0;
33583
33584 emit_insn (pat);
33585
33586 return target;
33587 }
33588
33589 /* Subroutine of ix86_expand_builtin to take care of 2-4 argument insns. */
33590
33591 static rtx
33592 ix86_expand_multi_arg_builtin (enum insn_code icode, tree exp, rtx target,
33593 enum ix86_builtin_func_type m_type,
33594 enum rtx_code sub_code)
33595 {
33596 rtx pat;
33597 int i;
33598 int nargs;
33599 bool comparison_p = false;
33600 bool tf_p = false;
33601 bool last_arg_constant = false;
33602 int num_memory = 0;
33603 struct {
33604 rtx op;
33605 machine_mode mode;
33606 } args[4];
33607
33608 machine_mode tmode = insn_data[icode].operand[0].mode;
33609
33610 switch (m_type)
33611 {
33612 case MULTI_ARG_4_DF2_DI_I:
33613 case MULTI_ARG_4_DF2_DI_I1:
33614 case MULTI_ARG_4_SF2_SI_I:
33615 case MULTI_ARG_4_SF2_SI_I1:
33616 nargs = 4;
33617 last_arg_constant = true;
33618 break;
33619
33620 case MULTI_ARG_3_SF:
33621 case MULTI_ARG_3_DF:
33622 case MULTI_ARG_3_SF2:
33623 case MULTI_ARG_3_DF2:
33624 case MULTI_ARG_3_DI:
33625 case MULTI_ARG_3_SI:
33626 case MULTI_ARG_3_SI_DI:
33627 case MULTI_ARG_3_HI:
33628 case MULTI_ARG_3_HI_SI:
33629 case MULTI_ARG_3_QI:
33630 case MULTI_ARG_3_DI2:
33631 case MULTI_ARG_3_SI2:
33632 case MULTI_ARG_3_HI2:
33633 case MULTI_ARG_3_QI2:
33634 nargs = 3;
33635 break;
33636
33637 case MULTI_ARG_2_SF:
33638 case MULTI_ARG_2_DF:
33639 case MULTI_ARG_2_DI:
33640 case MULTI_ARG_2_SI:
33641 case MULTI_ARG_2_HI:
33642 case MULTI_ARG_2_QI:
33643 nargs = 2;
33644 break;
33645
33646 case MULTI_ARG_2_DI_IMM:
33647 case MULTI_ARG_2_SI_IMM:
33648 case MULTI_ARG_2_HI_IMM:
33649 case MULTI_ARG_2_QI_IMM:
33650 nargs = 2;
33651 last_arg_constant = true;
33652 break;
33653
33654 case MULTI_ARG_1_SF:
33655 case MULTI_ARG_1_DF:
33656 case MULTI_ARG_1_SF2:
33657 case MULTI_ARG_1_DF2:
33658 case MULTI_ARG_1_DI:
33659 case MULTI_ARG_1_SI:
33660 case MULTI_ARG_1_HI:
33661 case MULTI_ARG_1_QI:
33662 case MULTI_ARG_1_SI_DI:
33663 case MULTI_ARG_1_HI_DI:
33664 case MULTI_ARG_1_HI_SI:
33665 case MULTI_ARG_1_QI_DI:
33666 case MULTI_ARG_1_QI_SI:
33667 case MULTI_ARG_1_QI_HI:
33668 nargs = 1;
33669 break;
33670
33671 case MULTI_ARG_2_DI_CMP:
33672 case MULTI_ARG_2_SI_CMP:
33673 case MULTI_ARG_2_HI_CMP:
33674 case MULTI_ARG_2_QI_CMP:
33675 nargs = 2;
33676 comparison_p = true;
33677 break;
33678
33679 case MULTI_ARG_2_SF_TF:
33680 case MULTI_ARG_2_DF_TF:
33681 case MULTI_ARG_2_DI_TF:
33682 case MULTI_ARG_2_SI_TF:
33683 case MULTI_ARG_2_HI_TF:
33684 case MULTI_ARG_2_QI_TF:
33685 nargs = 2;
33686 tf_p = true;
33687 break;
33688
33689 default:
33690 gcc_unreachable ();
33691 }
33692
33693 if (optimize || !target
33694 || GET_MODE (target) != tmode
33695 || !insn_data[icode].operand[0].predicate (target, tmode))
33696 target = gen_reg_rtx (tmode);
33697
33698 gcc_assert (nargs <= 4);
33699
33700 for (i = 0; i < nargs; i++)
33701 {
33702 tree arg = CALL_EXPR_ARG (exp, i);
33703 rtx op = expand_normal (arg);
33704 int adjust = (comparison_p) ? 1 : 0;
33705 machine_mode mode = insn_data[icode].operand[i+adjust+1].mode;
33706
33707 if (last_arg_constant && i == nargs - 1)
33708 {
33709 if (!insn_data[icode].operand[i + 1].predicate (op, mode))
33710 {
33711 enum insn_code new_icode = icode;
33712 switch (icode)
33713 {
33714 case CODE_FOR_xop_vpermil2v2df3:
33715 case CODE_FOR_xop_vpermil2v4sf3:
33716 case CODE_FOR_xop_vpermil2v4df3:
33717 case CODE_FOR_xop_vpermil2v8sf3:
33718 error ("the last argument must be a 2-bit immediate");
33719 return gen_reg_rtx (tmode);
33720 case CODE_FOR_xop_rotlv2di3:
33721 new_icode = CODE_FOR_rotlv2di3;
33722 goto xop_rotl;
33723 case CODE_FOR_xop_rotlv4si3:
33724 new_icode = CODE_FOR_rotlv4si3;
33725 goto xop_rotl;
33726 case CODE_FOR_xop_rotlv8hi3:
33727 new_icode = CODE_FOR_rotlv8hi3;
33728 goto xop_rotl;
33729 case CODE_FOR_xop_rotlv16qi3:
33730 new_icode = CODE_FOR_rotlv16qi3;
33731 xop_rotl:
33732 if (CONST_INT_P (op))
33733 {
33734 int mask = GET_MODE_UNIT_BITSIZE (tmode) - 1;
33735 op = GEN_INT (INTVAL (op) & mask);
33736 gcc_checking_assert
33737 (insn_data[icode].operand[i + 1].predicate (op, mode));
33738 }
33739 else
33740 {
33741 gcc_checking_assert
33742 (nargs == 2
33743 && insn_data[new_icode].operand[0].mode == tmode
33744 && insn_data[new_icode].operand[1].mode == tmode
33745 && insn_data[new_icode].operand[2].mode == mode
33746 && insn_data[new_icode].operand[0].predicate
33747 == insn_data[icode].operand[0].predicate
33748 && insn_data[new_icode].operand[1].predicate
33749 == insn_data[icode].operand[1].predicate);
33750 icode = new_icode;
33751 goto non_constant;
33752 }
33753 break;
33754 default:
33755 gcc_unreachable ();
33756 }
33757 }
33758 }
33759 else
33760 {
33761 non_constant:
33762 if (VECTOR_MODE_P (mode))
33763 op = safe_vector_operand (op, mode);
33764
33765 /* If we aren't optimizing, only allow one memory operand to be
33766 generated. */
33767 if (memory_operand (op, mode))
33768 num_memory++;
33769
33770 gcc_assert (GET_MODE (op) == mode || GET_MODE (op) == VOIDmode);
33771
33772 if (optimize
33773 || !insn_data[icode].operand[i+adjust+1].predicate (op, mode)
33774 || num_memory > 1)
33775 op = force_reg (mode, op);
33776 }
33777
33778 args[i].op = op;
33779 args[i].mode = mode;
33780 }
33781
33782 switch (nargs)
33783 {
33784 case 1:
33785 pat = GEN_FCN (icode) (target, args[0].op);
33786 break;
33787
33788 case 2:
33789 if (tf_p)
33790 pat = GEN_FCN (icode) (target, args[0].op, args[1].op,
33791 GEN_INT ((int)sub_code));
33792 else if (! comparison_p)
33793 pat = GEN_FCN (icode) (target, args[0].op, args[1].op);
33794 else
33795 {
33796 rtx cmp_op = gen_rtx_fmt_ee (sub_code, GET_MODE (target),
33797 args[0].op,
33798 args[1].op);
33799
33800 pat = GEN_FCN (icode) (target, cmp_op, args[0].op, args[1].op);
33801 }
33802 break;
33803
33804 case 3:
33805 pat = GEN_FCN (icode) (target, args[0].op, args[1].op, args[2].op);
33806 break;
33807
33808 case 4:
33809 pat = GEN_FCN (icode) (target, args[0].op, args[1].op, args[2].op, args[3].op);
33810 break;
33811
33812 default:
33813 gcc_unreachable ();
33814 }
33815
33816 if (! pat)
33817 return 0;
33818
33819 emit_insn (pat);
33820 return target;
33821 }
33822
33823 /* Subroutine of ix86_expand_args_builtin to take care of scalar unop
33824 insns with vec_merge. */
33825
33826 static rtx
33827 ix86_expand_unop_vec_merge_builtin (enum insn_code icode, tree exp,
33828 rtx target)
33829 {
33830 rtx pat;
33831 tree arg0 = CALL_EXPR_ARG (exp, 0);
33832 rtx op1, op0 = expand_normal (arg0);
33833 machine_mode tmode = insn_data[icode].operand[0].mode;
33834 machine_mode mode0 = insn_data[icode].operand[1].mode;
33835
33836 if (optimize || !target
33837 || GET_MODE (target) != tmode
33838 || !insn_data[icode].operand[0].predicate (target, tmode))
33839 target = gen_reg_rtx (tmode);
33840
33841 if (VECTOR_MODE_P (mode0))
33842 op0 = safe_vector_operand (op0, mode0);
33843
33844 if ((optimize && !register_operand (op0, mode0))
33845 || !insn_data[icode].operand[1].predicate (op0, mode0))
33846 op0 = copy_to_mode_reg (mode0, op0);
33847
33848 op1 = op0;
33849 if (!insn_data[icode].operand[2].predicate (op1, mode0))
33850 op1 = copy_to_mode_reg (mode0, op1);
33851
33852 pat = GEN_FCN (icode) (target, op0, op1);
33853 if (! pat)
33854 return 0;
33855 emit_insn (pat);
33856 return target;
33857 }
33858
33859 /* Subroutine of ix86_expand_builtin to take care of comparison insns. */
33860
33861 static rtx
33862 ix86_expand_sse_compare (const struct builtin_description *d,
33863 tree exp, rtx target, bool swap)
33864 {
33865 rtx pat;
33866 tree arg0 = CALL_EXPR_ARG (exp, 0);
33867 tree arg1 = CALL_EXPR_ARG (exp, 1);
33868 rtx op0 = expand_normal (arg0);
33869 rtx op1 = expand_normal (arg1);
33870 rtx op2;
33871 machine_mode tmode = insn_data[d->icode].operand[0].mode;
33872 machine_mode mode0 = insn_data[d->icode].operand[1].mode;
33873 machine_mode mode1 = insn_data[d->icode].operand[2].mode;
33874 enum rtx_code comparison = d->comparison;
33875
33876 if (VECTOR_MODE_P (mode0))
33877 op0 = safe_vector_operand (op0, mode0);
33878 if (VECTOR_MODE_P (mode1))
33879 op1 = safe_vector_operand (op1, mode1);
33880
33881 /* Swap operands if we have a comparison that isn't available in
33882 hardware. */
33883 if (swap)
33884 std::swap (op0, op1);
33885
33886 if (optimize || !target
33887 || GET_MODE (target) != tmode
33888 || !insn_data[d->icode].operand[0].predicate (target, tmode))
33889 target = gen_reg_rtx (tmode);
33890
33891 if ((optimize && !register_operand (op0, mode0))
33892 || !insn_data[d->icode].operand[1].predicate (op0, mode0))
33893 op0 = copy_to_mode_reg (mode0, op0);
33894 if ((optimize && !register_operand (op1, mode1))
33895 || !insn_data[d->icode].operand[2].predicate (op1, mode1))
33896 op1 = copy_to_mode_reg (mode1, op1);
33897
33898 op2 = gen_rtx_fmt_ee (comparison, mode0, op0, op1);
33899 pat = GEN_FCN (d->icode) (target, op0, op1, op2);
33900 if (! pat)
33901 return 0;
33902 emit_insn (pat);
33903 return target;
33904 }
33905
33906 /* Subroutine of ix86_expand_builtin to take care of comi insns. */
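/* A minimal user-level sketch of a call that reaches this expander
   (assuming the usual <emmintrin.h> intrinsic mapping; illustrative
   only, not part of this file):

     #include <emmintrin.h>
     int lo_eq (__m128d a, __m128d b)
     {
       return _mm_comieq_sd (a, b);
     }

   The expansion below emits the comi pattern, which sets the flags
   register, and then materializes the boolean result into the low byte
   of a zero-initialized SImode pseudo, roughly

     (set (strict_low_part (subreg:QI (reg:SI tmp) 0))
          (<d->comparison>:QI <flags> (const_int 0)))

   returning the SImode pseudo as the builtin's value.  */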
33907
33908 static rtx
33909 ix86_expand_sse_comi (const struct builtin_description *d, tree exp,
33910 rtx target)
33911 {
33912 rtx pat;
33913 tree arg0 = CALL_EXPR_ARG (exp, 0);
33914 tree arg1 = CALL_EXPR_ARG (exp, 1);
33915 rtx op0 = expand_normal (arg0);
33916 rtx op1 = expand_normal (arg1);
33917 machine_mode mode0 = insn_data[d->icode].operand[0].mode;
33918 machine_mode mode1 = insn_data[d->icode].operand[1].mode;
33919 enum rtx_code comparison = d->comparison;
33920
33921 if (VECTOR_MODE_P (mode0))
33922 op0 = safe_vector_operand (op0, mode0);
33923 if (VECTOR_MODE_P (mode1))
33924 op1 = safe_vector_operand (op1, mode1);
33925
33926 /* Swap operands if we have a comparison that isn't available in
33927 hardware. */
33928 if (d->flag & BUILTIN_DESC_SWAP_OPERANDS)
33929 std::swap (op0, op1);
33930
33931 target = gen_reg_rtx (SImode);
33932 emit_move_insn (target, const0_rtx);
33933 target = gen_rtx_SUBREG (QImode, target, 0);
33934
33935 if ((optimize && !register_operand (op0, mode0))
33936 || !insn_data[d->icode].operand[0].predicate (op0, mode0))
33937 op0 = copy_to_mode_reg (mode0, op0);
33938 if ((optimize && !register_operand (op1, mode1))
33939 || !insn_data[d->icode].operand[1].predicate (op1, mode1))
33940 op1 = copy_to_mode_reg (mode1, op1);
33941
33942 pat = GEN_FCN (d->icode) (op0, op1);
33943 if (! pat)
33944 return 0;
33945 emit_insn (pat);
33946 emit_insn (gen_rtx_SET (gen_rtx_STRICT_LOW_PART (VOIDmode, target),
33947 gen_rtx_fmt_ee (comparison, QImode,
33948 SET_DEST (pat),
33949 const0_rtx)));
33950
33951 return SUBREG_REG (target);
33952 }
33953
33954 /* Subroutines of ix86_expand_args_builtin to take care of round insns. */
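/* Note on the two expanders below: the ROUND function types dispatched
   here take no explicit immediate argument from the user; the
   rounding-control constant is instead carried in d->comparison and is
   emitted as the final GEN_INT operand of the pattern.  */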
33955
33956 static rtx
33957 ix86_expand_sse_round (const struct builtin_description *d, tree exp,
33958 rtx target)
33959 {
33960 rtx pat;
33961 tree arg0 = CALL_EXPR_ARG (exp, 0);
33962 rtx op1, op0 = expand_normal (arg0);
33963 machine_mode tmode = insn_data[d->icode].operand[0].mode;
33964 machine_mode mode0 = insn_data[d->icode].operand[1].mode;
33965
33966 if (optimize || target == 0
33967 || GET_MODE (target) != tmode
33968 || !insn_data[d->icode].operand[0].predicate (target, tmode))
33969 target = gen_reg_rtx (tmode);
33970
33971 if (VECTOR_MODE_P (mode0))
33972 op0 = safe_vector_operand (op0, mode0);
33973
33974 if ((optimize && !register_operand (op0, mode0))
33975 || !insn_data[d->icode].operand[0].predicate (op0, mode0))
33976 op0 = copy_to_mode_reg (mode0, op0);
33977
33978 op1 = GEN_INT (d->comparison);
33979
33980 pat = GEN_FCN (d->icode) (target, op0, op1);
33981 if (! pat)
33982 return 0;
33983 emit_insn (pat);
33984 return target;
33985 }
33986
33987 static rtx
33988 ix86_expand_sse_round_vec_pack_sfix (const struct builtin_description *d,
33989 tree exp, rtx target)
33990 {
33991 rtx pat;
33992 tree arg0 = CALL_EXPR_ARG (exp, 0);
33993 tree arg1 = CALL_EXPR_ARG (exp, 1);
33994 rtx op0 = expand_normal (arg0);
33995 rtx op1 = expand_normal (arg1);
33996 rtx op2;
33997 machine_mode tmode = insn_data[d->icode].operand[0].mode;
33998 machine_mode mode0 = insn_data[d->icode].operand[1].mode;
33999 machine_mode mode1 = insn_data[d->icode].operand[2].mode;
34000
34001 if (optimize || target == 0
34002 || GET_MODE (target) != tmode
34003 || !insn_data[d->icode].operand[0].predicate (target, tmode))
34004 target = gen_reg_rtx (tmode);
34005
34006 op0 = safe_vector_operand (op0, mode0);
34007 op1 = safe_vector_operand (op1, mode1);
34008
34009 if ((optimize && !register_operand (op0, mode0))
34010 || !insn_data[d->icode].operand[0].predicate (op0, mode0))
34011 op0 = copy_to_mode_reg (mode0, op0);
34012 if ((optimize && !register_operand (op1, mode1))
34013 || !insn_data[d->icode].operand[1].predicate (op1, mode1))
34014 op1 = copy_to_mode_reg (mode1, op1);
34015
34016 op2 = GEN_INT (d->comparison);
34017
34018 pat = GEN_FCN (d->icode) (target, op0, op1, op2);
34019 if (! pat)
34020 return 0;
34021 emit_insn (pat);
34022 return target;
34023 }
34024
34025 /* Subroutine of ix86_expand_builtin to take care of ptest insns. */
34026
34027 static rtx
34028 ix86_expand_sse_ptest (const struct builtin_description *d, tree exp,
34029 rtx target)
34030 {
34031 rtx pat;
34032 tree arg0 = CALL_EXPR_ARG (exp, 0);
34033 tree arg1 = CALL_EXPR_ARG (exp, 1);
34034 rtx op0 = expand_normal (arg0);
34035 rtx op1 = expand_normal (arg1);
34036 machine_mode mode0 = insn_data[d->icode].operand[0].mode;
34037 machine_mode mode1 = insn_data[d->icode].operand[1].mode;
34038 enum rtx_code comparison = d->comparison;
34039
34040 if (VECTOR_MODE_P (mode0))
34041 op0 = safe_vector_operand (op0, mode0);
34042 if (VECTOR_MODE_P (mode1))
34043 op1 = safe_vector_operand (op1, mode1);
34044
34045 target = gen_reg_rtx (SImode);
34046 emit_move_insn (target, const0_rtx);
34047 target = gen_rtx_SUBREG (QImode, target, 0);
34048
34049 if ((optimize && !register_operand (op0, mode0))
34050 || !insn_data[d->icode].operand[0].predicate (op0, mode0))
34051 op0 = copy_to_mode_reg (mode0, op0);
34052 if ((optimize && !register_operand (op1, mode1))
34053 || !insn_data[d->icode].operand[1].predicate (op1, mode1))
34054 op1 = copy_to_mode_reg (mode1, op1);
34055
34056 pat = GEN_FCN (d->icode) (op0, op1);
34057 if (! pat)
34058 return 0;
34059 emit_insn (pat);
34060 emit_insn (gen_rtx_SET (gen_rtx_STRICT_LOW_PART (VOIDmode, target),
34061 gen_rtx_fmt_ee (comparison, QImode,
34062 SET_DEST (pat),
34063 const0_rtx)));
34064
34065 return SUBREG_REG (target);
34066 }
34067
34068 /* Subroutine of ix86_expand_builtin to take care of pcmpestr[im] insns. */
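/* Sketch of the three cases handled below: IX86_BUILTIN_PCMPESTRI128
   returns the index result (tmode0) and discards the mask in a scratch
   register, IX86_BUILTIN_PCMPESTRM128 returns the mask result (tmode1)
   and discards the index, and the remaining flag-testing variants use
   d->flag as the mode of FLAGS_REG and return an EQ test of the flags
   against zero, materialized through a QImode subreg as in the comi
   expander above.  */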
34069
34070 static rtx
34071 ix86_expand_sse_pcmpestr (const struct builtin_description *d,
34072 tree exp, rtx target)
34073 {
34074 rtx pat;
34075 tree arg0 = CALL_EXPR_ARG (exp, 0);
34076 tree arg1 = CALL_EXPR_ARG (exp, 1);
34077 tree arg2 = CALL_EXPR_ARG (exp, 2);
34078 tree arg3 = CALL_EXPR_ARG (exp, 3);
34079 tree arg4 = CALL_EXPR_ARG (exp, 4);
34080 rtx scratch0, scratch1;
34081 rtx op0 = expand_normal (arg0);
34082 rtx op1 = expand_normal (arg1);
34083 rtx op2 = expand_normal (arg2);
34084 rtx op3 = expand_normal (arg3);
34085 rtx op4 = expand_normal (arg4);
34086 machine_mode tmode0, tmode1, modev2, modei3, modev4, modei5, modeimm;
34087
34088 tmode0 = insn_data[d->icode].operand[0].mode;
34089 tmode1 = insn_data[d->icode].operand[1].mode;
34090 modev2 = insn_data[d->icode].operand[2].mode;
34091 modei3 = insn_data[d->icode].operand[3].mode;
34092 modev4 = insn_data[d->icode].operand[4].mode;
34093 modei5 = insn_data[d->icode].operand[5].mode;
34094 modeimm = insn_data[d->icode].operand[6].mode;
34095
34096 if (VECTOR_MODE_P (modev2))
34097 op0 = safe_vector_operand (op0, modev2);
34098 if (VECTOR_MODE_P (modev4))
34099 op2 = safe_vector_operand (op2, modev4);
34100
34101 if (!insn_data[d->icode].operand[2].predicate (op0, modev2))
34102 op0 = copy_to_mode_reg (modev2, op0);
34103 if (!insn_data[d->icode].operand[3].predicate (op1, modei3))
34104 op1 = copy_to_mode_reg (modei3, op1);
34105 if ((optimize && !register_operand (op2, modev4))
34106 || !insn_data[d->icode].operand[4].predicate (op2, modev4))
34107 op2 = copy_to_mode_reg (modev4, op2);
34108 if (!insn_data[d->icode].operand[5].predicate (op3, modei5))
34109 op3 = copy_to_mode_reg (modei5, op3);
34110
34111 if (!insn_data[d->icode].operand[6].predicate (op4, modeimm))
34112 {
34113 error ("the fifth argument must be an 8-bit immediate");
34114 return const0_rtx;
34115 }
34116
34117 if (d->code == IX86_BUILTIN_PCMPESTRI128)
34118 {
34119 if (optimize || !target
34120 || GET_MODE (target) != tmode0
34121 || !insn_data[d->icode].operand[0].predicate (target, tmode0))
34122 target = gen_reg_rtx (tmode0);
34123
34124 scratch1 = gen_reg_rtx (tmode1);
34125
34126 pat = GEN_FCN (d->icode) (target, scratch1, op0, op1, op2, op3, op4);
34127 }
34128 else if (d->code == IX86_BUILTIN_PCMPESTRM128)
34129 {
34130 if (optimize || !target
34131 || GET_MODE (target) != tmode1
34132 || !insn_data[d->icode].operand[1].predicate (target, tmode1))
34133 target = gen_reg_rtx (tmode1);
34134
34135 scratch0 = gen_reg_rtx (tmode0);
34136
34137 pat = GEN_FCN (d->icode) (scratch0, target, op0, op1, op2, op3, op4);
34138 }
34139 else
34140 {
34141 gcc_assert (d->flag);
34142
34143 scratch0 = gen_reg_rtx (tmode0);
34144 scratch1 = gen_reg_rtx (tmode1);
34145
34146 pat = GEN_FCN (d->icode) (scratch0, scratch1, op0, op1, op2, op3, op4);
34147 }
34148
34149 if (! pat)
34150 return 0;
34151
34152 emit_insn (pat);
34153
34154 if (d->flag)
34155 {
34156 target = gen_reg_rtx (SImode);
34157 emit_move_insn (target, const0_rtx);
34158 target = gen_rtx_SUBREG (QImode, target, 0);
34159
34160 emit_insn
34161 (gen_rtx_SET (gen_rtx_STRICT_LOW_PART (VOIDmode, target),
34162 gen_rtx_fmt_ee (EQ, QImode,
34163 gen_rtx_REG ((machine_mode) d->flag,
34164 FLAGS_REG),
34165 const0_rtx)));
34166 return SUBREG_REG (target);
34167 }
34168 else
34169 return target;
34170 }
34171
34172
34173 /* Subroutine of ix86_expand_builtin to take care of pcmpistr[im] insns. */
34174
34175 static rtx
34176 ix86_expand_sse_pcmpistr (const struct builtin_description *d,
34177 tree exp, rtx target)
34178 {
34179 rtx pat;
34180 tree arg0 = CALL_EXPR_ARG (exp, 0);
34181 tree arg1 = CALL_EXPR_ARG (exp, 1);
34182 tree arg2 = CALL_EXPR_ARG (exp, 2);
34183 rtx scratch0, scratch1;
34184 rtx op0 = expand_normal (arg0);
34185 rtx op1 = expand_normal (arg1);
34186 rtx op2 = expand_normal (arg2);
34187 machine_mode tmode0, tmode1, modev2, modev3, modeimm;
34188
34189 tmode0 = insn_data[d->icode].operand[0].mode;
34190 tmode1 = insn_data[d->icode].operand[1].mode;
34191 modev2 = insn_data[d->icode].operand[2].mode;
34192 modev3 = insn_data[d->icode].operand[3].mode;
34193 modeimm = insn_data[d->icode].operand[4].mode;
34194
34195 if (VECTOR_MODE_P (modev2))
34196 op0 = safe_vector_operand (op0, modev2);
34197 if (VECTOR_MODE_P (modev3))
34198 op1 = safe_vector_operand (op1, modev3);
34199
34200 if (!insn_data[d->icode].operand[2].predicate (op0, modev2))
34201 op0 = copy_to_mode_reg (modev2, op0);
34202 if ((optimize && !register_operand (op1, modev3))
34203 || !insn_data[d->icode].operand[3].predicate (op1, modev3))
34204 op1 = copy_to_mode_reg (modev3, op1);
34205
34206 if (!insn_data[d->icode].operand[4].predicate (op2, modeimm))
34207 {
34208 error ("the third argument must be an 8-bit immediate");
34209 return const0_rtx;
34210 }
34211
34212 if (d->code == IX86_BUILTIN_PCMPISTRI128)
34213 {
34214 if (optimize || !target
34215 || GET_MODE (target) != tmode0
34216 || !insn_data[d->icode].operand[0].predicate (target, tmode0))
34217 target = gen_reg_rtx (tmode0);
34218
34219 scratch1 = gen_reg_rtx (tmode1);
34220
34221 pat = GEN_FCN (d->icode) (target, scratch1, op0, op1, op2);
34222 }
34223 else if (d->code == IX86_BUILTIN_PCMPISTRM128)
34224 {
34225 if (optimize || !target
34226 || GET_MODE (target) != tmode1
34227 || !insn_data[d->icode].operand[1].predicate (target, tmode1))
34228 target = gen_reg_rtx (tmode1);
34229
34230 scratch0 = gen_reg_rtx (tmode0);
34231
34232 pat = GEN_FCN (d->icode) (scratch0, target, op0, op1, op2);
34233 }
34234 else
34235 {
34236 gcc_assert (d->flag);
34237
34238 scratch0 = gen_reg_rtx (tmode0);
34239 scratch1 = gen_reg_rtx (tmode1);
34240
34241 pat = GEN_FCN (d->icode) (scratch0, scratch1, op0, op1, op2);
34242 }
34243
34244 if (! pat)
34245 return 0;
34246
34247 emit_insn (pat);
34248
34249 if (d->flag)
34250 {
34251 target = gen_reg_rtx (SImode);
34252 emit_move_insn (target, const0_rtx);
34253 target = gen_rtx_SUBREG (QImode, target, 0);
34254
34255 emit_insn
34256 (gen_rtx_SET (gen_rtx_STRICT_LOW_PART (VOIDmode, target),
34257 gen_rtx_fmt_ee (EQ, QImode,
34258 gen_rtx_REG ((machine_mode) d->flag,
34259 FLAGS_REG),
34260 const0_rtx)));
34261 return SUBREG_REG (target);
34262 }
34263 else
34264 return target;
34265 }
34266
34267 /* Subroutine of ix86_expand_builtin to take care of insns with
34268 a variable number of operands. */
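/* Illustrative mapping from the function-type enum to the expansion
   parameters chosen below (two examples, not an exhaustive list):
   V4SF_FTYPE_V4SF_V4SF_INT selects nargs = 3 and nargs_constant = 1, so
   the two vector arguments are expanded normally and the trailing INT
   must satisfy the insn's immediate predicate, while
   V8SF_FTYPE_V8SF_INT_V8SF_UQI selects nargs = 4, mask_pos = 2 and
   nargs_constant = 1, placing the immediate ahead of the merge source
   and mask operands.  */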
34269
34270 static rtx
34271 ix86_expand_args_builtin (const struct builtin_description *d,
34272 tree exp, rtx target)
34273 {
34274 rtx pat, real_target;
34275 unsigned int i, nargs;
34276 unsigned int nargs_constant = 0;
34277 unsigned int mask_pos = 0;
34278 int num_memory = 0;
34279 struct
34280 {
34281 rtx op;
34282 machine_mode mode;
34283 } args[6];
34284 bool last_arg_count = false;
34285 enum insn_code icode = d->icode;
34286 const struct insn_data_d *insn_p = &insn_data[icode];
34287 machine_mode tmode = insn_p->operand[0].mode;
34288 machine_mode rmode = VOIDmode;
34289 bool swap = false;
34290 enum rtx_code comparison = d->comparison;
34291
34292 switch ((enum ix86_builtin_func_type) d->flag)
34293 {
34294 case V2DF_FTYPE_V2DF_ROUND:
34295 case V4DF_FTYPE_V4DF_ROUND:
34296 case V8DF_FTYPE_V8DF_ROUND:
34297 case V4SF_FTYPE_V4SF_ROUND:
34298 case V8SF_FTYPE_V8SF_ROUND:
34299 case V16SF_FTYPE_V16SF_ROUND:
34300 case V4SI_FTYPE_V4SF_ROUND:
34301 case V8SI_FTYPE_V8SF_ROUND:
34302 case V16SI_FTYPE_V16SF_ROUND:
34303 return ix86_expand_sse_round (d, exp, target);
34304 case V4SI_FTYPE_V2DF_V2DF_ROUND:
34305 case V8SI_FTYPE_V4DF_V4DF_ROUND:
34306 case V16SI_FTYPE_V8DF_V8DF_ROUND:
34307 return ix86_expand_sse_round_vec_pack_sfix (d, exp, target);
34308 case INT_FTYPE_V8SF_V8SF_PTEST:
34309 case INT_FTYPE_V4DI_V4DI_PTEST:
34310 case INT_FTYPE_V4DF_V4DF_PTEST:
34311 case INT_FTYPE_V4SF_V4SF_PTEST:
34312 case INT_FTYPE_V2DI_V2DI_PTEST:
34313 case INT_FTYPE_V2DF_V2DF_PTEST:
34314 return ix86_expand_sse_ptest (d, exp, target);
34315 case FLOAT128_FTYPE_FLOAT128:
34316 case FLOAT_FTYPE_FLOAT:
34317 case INT_FTYPE_INT:
34318 case UINT64_FTYPE_INT:
34319 case UINT16_FTYPE_UINT16:
34320 case INT64_FTYPE_INT64:
34321 case INT64_FTYPE_V4SF:
34322 case INT64_FTYPE_V2DF:
34323 case INT_FTYPE_V16QI:
34324 case INT_FTYPE_V8QI:
34325 case INT_FTYPE_V8SF:
34326 case INT_FTYPE_V4DF:
34327 case INT_FTYPE_V4SF:
34328 case INT_FTYPE_V2DF:
34329 case INT_FTYPE_V32QI:
34330 case V16QI_FTYPE_V16QI:
34331 case V8SI_FTYPE_V8SF:
34332 case V8SI_FTYPE_V4SI:
34333 case V8HI_FTYPE_V8HI:
34334 case V8HI_FTYPE_V16QI:
34335 case V8QI_FTYPE_V8QI:
34336 case V8SF_FTYPE_V8SF:
34337 case V8SF_FTYPE_V8SI:
34338 case V8SF_FTYPE_V4SF:
34339 case V8SF_FTYPE_V8HI:
34340 case V4SI_FTYPE_V4SI:
34341 case V4SI_FTYPE_V16QI:
34342 case V4SI_FTYPE_V4SF:
34343 case V4SI_FTYPE_V8SI:
34344 case V4SI_FTYPE_V8HI:
34345 case V4SI_FTYPE_V4DF:
34346 case V4SI_FTYPE_V2DF:
34347 case V4HI_FTYPE_V4HI:
34348 case V4DF_FTYPE_V4DF:
34349 case V4DF_FTYPE_V4SI:
34350 case V4DF_FTYPE_V4SF:
34351 case V4DF_FTYPE_V2DF:
34352 case V4SF_FTYPE_V4SF:
34353 case V4SF_FTYPE_V4SI:
34354 case V4SF_FTYPE_V8SF:
34355 case V4SF_FTYPE_V4DF:
34356 case V4SF_FTYPE_V8HI:
34357 case V4SF_FTYPE_V2DF:
34358 case V2DI_FTYPE_V2DI:
34359 case V2DI_FTYPE_V16QI:
34360 case V2DI_FTYPE_V8HI:
34361 case V2DI_FTYPE_V4SI:
34362 case V2DF_FTYPE_V2DF:
34363 case V2DF_FTYPE_V4SI:
34364 case V2DF_FTYPE_V4DF:
34365 case V2DF_FTYPE_V4SF:
34366 case V2DF_FTYPE_V2SI:
34367 case V2SI_FTYPE_V2SI:
34368 case V2SI_FTYPE_V4SF:
34369 case V2SI_FTYPE_V2SF:
34370 case V2SI_FTYPE_V2DF:
34371 case V2SF_FTYPE_V2SF:
34372 case V2SF_FTYPE_V2SI:
34373 case V32QI_FTYPE_V32QI:
34374 case V32QI_FTYPE_V16QI:
34375 case V16HI_FTYPE_V16HI:
34376 case V16HI_FTYPE_V8HI:
34377 case V8SI_FTYPE_V8SI:
34378 case V16HI_FTYPE_V16QI:
34379 case V8SI_FTYPE_V16QI:
34380 case V4DI_FTYPE_V16QI:
34381 case V8SI_FTYPE_V8HI:
34382 case V4DI_FTYPE_V8HI:
34383 case V4DI_FTYPE_V4SI:
34384 case V4DI_FTYPE_V2DI:
34385 case UHI_FTYPE_UHI:
34386 case UHI_FTYPE_V16QI:
34387 case USI_FTYPE_V32QI:
34388 case UDI_FTYPE_V64QI:
34389 case V16QI_FTYPE_UHI:
34390 case V32QI_FTYPE_USI:
34391 case V64QI_FTYPE_UDI:
34392 case V8HI_FTYPE_UQI:
34393 case V16HI_FTYPE_UHI:
34394 case V32HI_FTYPE_USI:
34395 case V4SI_FTYPE_UQI:
34396 case V8SI_FTYPE_UQI:
34397 case V4SI_FTYPE_UHI:
34398 case V8SI_FTYPE_UHI:
34399 case UQI_FTYPE_V8HI:
34400 case UHI_FTYPE_V16HI:
34401 case USI_FTYPE_V32HI:
34402 case UQI_FTYPE_V4SI:
34403 case UQI_FTYPE_V8SI:
34404 case UHI_FTYPE_V16SI:
34405 case UQI_FTYPE_V2DI:
34406 case UQI_FTYPE_V4DI:
34407 case UQI_FTYPE_V8DI:
34408 case V16SI_FTYPE_UHI:
34409 case V2DI_FTYPE_UQI:
34410 case V4DI_FTYPE_UQI:
34411 case V16SI_FTYPE_INT:
34412 case V16SF_FTYPE_V8SF:
34413 case V16SI_FTYPE_V8SI:
34414 case V16SF_FTYPE_V4SF:
34415 case V16SI_FTYPE_V4SI:
34416 case V16SI_FTYPE_V16SF:
34417 case V16SF_FTYPE_V16SF:
34418 case V8DI_FTYPE_UQI:
34419 case V8DF_FTYPE_V4DF:
34420 case V8DF_FTYPE_V2DF:
34421 case V8DF_FTYPE_V8DF:
34422 nargs = 1;
34423 break;
34424 case V4SF_FTYPE_V4SF_VEC_MERGE:
34425 case V2DF_FTYPE_V2DF_VEC_MERGE:
34426 return ix86_expand_unop_vec_merge_builtin (icode, exp, target);
34427 case FLOAT128_FTYPE_FLOAT128_FLOAT128:
34428 case V16QI_FTYPE_V16QI_V16QI:
34429 case V16QI_FTYPE_V8HI_V8HI:
34430 case V16SF_FTYPE_V16SF_V16SF:
34431 case V8QI_FTYPE_V8QI_V8QI:
34432 case V8QI_FTYPE_V4HI_V4HI:
34433 case V8HI_FTYPE_V8HI_V8HI:
34434 case V8HI_FTYPE_V16QI_V16QI:
34435 case V8HI_FTYPE_V4SI_V4SI:
34436 case V8SF_FTYPE_V8SF_V8SF:
34437 case V8SF_FTYPE_V8SF_V8SI:
34438 case V8DF_FTYPE_V8DF_V8DF:
34439 case V4SI_FTYPE_V4SI_V4SI:
34440 case V4SI_FTYPE_V8HI_V8HI:
34441 case V4SI_FTYPE_V2DF_V2DF:
34442 case V4HI_FTYPE_V4HI_V4HI:
34443 case V4HI_FTYPE_V8QI_V8QI:
34444 case V4HI_FTYPE_V2SI_V2SI:
34445 case V4DF_FTYPE_V4DF_V4DF:
34446 case V4DF_FTYPE_V4DF_V4DI:
34447 case V4SF_FTYPE_V4SF_V4SF:
34448 case V4SF_FTYPE_V4SF_V4SI:
34449 case V4SF_FTYPE_V4SF_V2SI:
34450 case V4SF_FTYPE_V4SF_V2DF:
34451 case V4SF_FTYPE_V4SF_UINT:
34452 case V4SF_FTYPE_V4SF_DI:
34453 case V4SF_FTYPE_V4SF_SI:
34454 case V2DI_FTYPE_V2DI_V2DI:
34455 case V2DI_FTYPE_V16QI_V16QI:
34456 case V2DI_FTYPE_V4SI_V4SI:
34457 case V2DI_FTYPE_V2DI_V16QI:
34458 case V2SI_FTYPE_V2SI_V2SI:
34459 case V2SI_FTYPE_V4HI_V4HI:
34460 case V2SI_FTYPE_V2SF_V2SF:
34461 case V2DF_FTYPE_V2DF_V2DF:
34462 case V2DF_FTYPE_V2DF_V4SF:
34463 case V2DF_FTYPE_V2DF_V2DI:
34464 case V2DF_FTYPE_V2DF_DI:
34465 case V2DF_FTYPE_V2DF_SI:
34466 case V2DF_FTYPE_V2DF_UINT:
34467 case V2SF_FTYPE_V2SF_V2SF:
34468 case V1DI_FTYPE_V1DI_V1DI:
34469 case V1DI_FTYPE_V8QI_V8QI:
34470 case V1DI_FTYPE_V2SI_V2SI:
34471 case V32QI_FTYPE_V16HI_V16HI:
34472 case V16HI_FTYPE_V8SI_V8SI:
34473 case V32QI_FTYPE_V32QI_V32QI:
34474 case V16HI_FTYPE_V32QI_V32QI:
34475 case V16HI_FTYPE_V16HI_V16HI:
34476 case V8SI_FTYPE_V4DF_V4DF:
34477 case V8SI_FTYPE_V8SI_V8SI:
34478 case V8SI_FTYPE_V16HI_V16HI:
34479 case V4DI_FTYPE_V4DI_V4DI:
34480 case V4DI_FTYPE_V8SI_V8SI:
34481 case V8DI_FTYPE_V64QI_V64QI:
34482 if (comparison == UNKNOWN)
34483 return ix86_expand_binop_builtin (icode, exp, target);
34484 nargs = 2;
34485 break;
34486 case V4SF_FTYPE_V4SF_V4SF_SWAP:
34487 case V2DF_FTYPE_V2DF_V2DF_SWAP:
34488 gcc_assert (comparison != UNKNOWN);
34489 nargs = 2;
34490 swap = true;
34491 break;
34492 case V16HI_FTYPE_V16HI_V8HI_COUNT:
34493 case V16HI_FTYPE_V16HI_SI_COUNT:
34494 case V8SI_FTYPE_V8SI_V4SI_COUNT:
34495 case V8SI_FTYPE_V8SI_SI_COUNT:
34496 case V4DI_FTYPE_V4DI_V2DI_COUNT:
34497 case V4DI_FTYPE_V4DI_INT_COUNT:
34498 case V8HI_FTYPE_V8HI_V8HI_COUNT:
34499 case V8HI_FTYPE_V8HI_SI_COUNT:
34500 case V4SI_FTYPE_V4SI_V4SI_COUNT:
34501 case V4SI_FTYPE_V4SI_SI_COUNT:
34502 case V4HI_FTYPE_V4HI_V4HI_COUNT:
34503 case V4HI_FTYPE_V4HI_SI_COUNT:
34504 case V2DI_FTYPE_V2DI_V2DI_COUNT:
34505 case V2DI_FTYPE_V2DI_SI_COUNT:
34506 case V2SI_FTYPE_V2SI_V2SI_COUNT:
34507 case V2SI_FTYPE_V2SI_SI_COUNT:
34508 case V1DI_FTYPE_V1DI_V1DI_COUNT:
34509 case V1DI_FTYPE_V1DI_SI_COUNT:
34510 nargs = 2;
34511 last_arg_count = true;
34512 break;
34513 case UINT64_FTYPE_UINT64_UINT64:
34514 case UINT_FTYPE_UINT_UINT:
34515 case UINT_FTYPE_UINT_USHORT:
34516 case UINT_FTYPE_UINT_UCHAR:
34517 case UINT16_FTYPE_UINT16_INT:
34518 case UINT8_FTYPE_UINT8_INT:
34519 case UHI_FTYPE_UHI_UHI:
34520 case USI_FTYPE_USI_USI:
34521 case UDI_FTYPE_UDI_UDI:
34522 case V16SI_FTYPE_V8DF_V8DF:
34523 nargs = 2;
34524 break;
34525 case V2DI_FTYPE_V2DI_INT_CONVERT:
34526 nargs = 2;
34527 rmode = V1TImode;
34528 nargs_constant = 1;
34529 break;
34530 case V4DI_FTYPE_V4DI_INT_CONVERT:
34531 nargs = 2;
34532 rmode = V2TImode;
34533 nargs_constant = 1;
34534 break;
34535 case V8DI_FTYPE_V8DI_INT_CONVERT:
34536 nargs = 2;
34537 rmode = V4TImode;
34538 nargs_constant = 1;
34539 break;
34540 case V8HI_FTYPE_V8HI_INT:
34541 case V8HI_FTYPE_V8SF_INT:
34542 case V16HI_FTYPE_V16SF_INT:
34543 case V8HI_FTYPE_V4SF_INT:
34544 case V8SF_FTYPE_V8SF_INT:
34545 case V4SF_FTYPE_V16SF_INT:
34546 case V16SF_FTYPE_V16SF_INT:
34547 case V4SI_FTYPE_V4SI_INT:
34548 case V4SI_FTYPE_V8SI_INT:
34549 case V4HI_FTYPE_V4HI_INT:
34550 case V4DF_FTYPE_V4DF_INT:
34551 case V4DF_FTYPE_V8DF_INT:
34552 case V4SF_FTYPE_V4SF_INT:
34553 case V4SF_FTYPE_V8SF_INT:
34554 case V2DI_FTYPE_V2DI_INT:
34555 case V2DF_FTYPE_V2DF_INT:
34556 case V2DF_FTYPE_V4DF_INT:
34557 case V16HI_FTYPE_V16HI_INT:
34558 case V8SI_FTYPE_V8SI_INT:
34559 case V16SI_FTYPE_V16SI_INT:
34560 case V4SI_FTYPE_V16SI_INT:
34561 case V4DI_FTYPE_V4DI_INT:
34562 case V2DI_FTYPE_V4DI_INT:
34563 case V4DI_FTYPE_V8DI_INT:
34564 case QI_FTYPE_V4SF_INT:
34565 case QI_FTYPE_V2DF_INT:
34566 nargs = 2;
34567 nargs_constant = 1;
34568 break;
34569 case V16QI_FTYPE_V16QI_V16QI_V16QI:
34570 case V8SF_FTYPE_V8SF_V8SF_V8SF:
34571 case V4DF_FTYPE_V4DF_V4DF_V4DF:
34572 case V4SF_FTYPE_V4SF_V4SF_V4SF:
34573 case V2DF_FTYPE_V2DF_V2DF_V2DF:
34574 case V32QI_FTYPE_V32QI_V32QI_V32QI:
34575 case UHI_FTYPE_V16SI_V16SI_UHI:
34576 case UQI_FTYPE_V8DI_V8DI_UQI:
34577 case V16HI_FTYPE_V16SI_V16HI_UHI:
34578 case V16QI_FTYPE_V16SI_V16QI_UHI:
34579 case V16QI_FTYPE_V8DI_V16QI_UQI:
34580 case V16SF_FTYPE_V16SF_V16SF_UHI:
34581 case V16SF_FTYPE_V4SF_V16SF_UHI:
34582 case V16SI_FTYPE_SI_V16SI_UHI:
34583 case V16SI_FTYPE_V16HI_V16SI_UHI:
34584 case V16SI_FTYPE_V16QI_V16SI_UHI:
34585 case V8SF_FTYPE_V4SF_V8SF_UQI:
34586 case V4DF_FTYPE_V2DF_V4DF_UQI:
34587 case V8SI_FTYPE_V4SI_V8SI_UQI:
34588 case V8SI_FTYPE_SI_V8SI_UQI:
34589 case V4SI_FTYPE_V4SI_V4SI_UQI:
34590 case V4SI_FTYPE_SI_V4SI_UQI:
34591 case V4DI_FTYPE_V2DI_V4DI_UQI:
34592 case V4DI_FTYPE_DI_V4DI_UQI:
34593 case V2DI_FTYPE_V2DI_V2DI_UQI:
34594 case V2DI_FTYPE_DI_V2DI_UQI:
34595 case V64QI_FTYPE_V64QI_V64QI_UDI:
34596 case V64QI_FTYPE_V16QI_V64QI_UDI:
34597 case V64QI_FTYPE_QI_V64QI_UDI:
34598 case V32QI_FTYPE_V32QI_V32QI_USI:
34599 case V32QI_FTYPE_V16QI_V32QI_USI:
34600 case V32QI_FTYPE_QI_V32QI_USI:
34601 case V16QI_FTYPE_V16QI_V16QI_UHI:
34602 case V16QI_FTYPE_QI_V16QI_UHI:
34603 case V32HI_FTYPE_V8HI_V32HI_USI:
34604 case V32HI_FTYPE_HI_V32HI_USI:
34605 case V16HI_FTYPE_V8HI_V16HI_UHI:
34606 case V16HI_FTYPE_HI_V16HI_UHI:
34607 case V8HI_FTYPE_V8HI_V8HI_UQI:
34608 case V8HI_FTYPE_HI_V8HI_UQI:
34609 case V8SF_FTYPE_V8HI_V8SF_UQI:
34610 case V4SF_FTYPE_V8HI_V4SF_UQI:
34611 case V8SI_FTYPE_V8SF_V8SI_UQI:
34612 case V4SI_FTYPE_V4SF_V4SI_UQI:
34613 case V4DI_FTYPE_V4SF_V4DI_UQI:
34614 case V2DI_FTYPE_V4SF_V2DI_UQI:
34615 case V4SF_FTYPE_V4DI_V4SF_UQI:
34616 case V4SF_FTYPE_V2DI_V4SF_UQI:
34617 case V4DF_FTYPE_V4DI_V4DF_UQI:
34618 case V2DF_FTYPE_V2DI_V2DF_UQI:
34619 case V16QI_FTYPE_V8HI_V16QI_UQI:
34620 case V16QI_FTYPE_V16HI_V16QI_UHI:
34621 case V16QI_FTYPE_V4SI_V16QI_UQI:
34622 case V16QI_FTYPE_V8SI_V16QI_UQI:
34623 case V8HI_FTYPE_V4SI_V8HI_UQI:
34624 case V8HI_FTYPE_V8SI_V8HI_UQI:
34625 case V16QI_FTYPE_V2DI_V16QI_UQI:
34626 case V16QI_FTYPE_V4DI_V16QI_UQI:
34627 case V8HI_FTYPE_V2DI_V8HI_UQI:
34628 case V8HI_FTYPE_V4DI_V8HI_UQI:
34629 case V4SI_FTYPE_V2DI_V4SI_UQI:
34630 case V4SI_FTYPE_V4DI_V4SI_UQI:
34631 case V32QI_FTYPE_V32HI_V32QI_USI:
34632 case UHI_FTYPE_V16QI_V16QI_UHI:
34633 case USI_FTYPE_V32QI_V32QI_USI:
34634 case UDI_FTYPE_V64QI_V64QI_UDI:
34635 case UQI_FTYPE_V8HI_V8HI_UQI:
34636 case UHI_FTYPE_V16HI_V16HI_UHI:
34637 case USI_FTYPE_V32HI_V32HI_USI:
34638 case UQI_FTYPE_V4SI_V4SI_UQI:
34639 case UQI_FTYPE_V8SI_V8SI_UQI:
34640 case UQI_FTYPE_V2DI_V2DI_UQI:
34641 case UQI_FTYPE_V4DI_V4DI_UQI:
34642 case V4SF_FTYPE_V2DF_V4SF_UQI:
34643 case V4SF_FTYPE_V4DF_V4SF_UQI:
34644 case V16SI_FTYPE_V16SI_V16SI_UHI:
34645 case V16SI_FTYPE_V4SI_V16SI_UHI:
34646 case V2DI_FTYPE_V4SI_V2DI_UQI:
34647 case V2DI_FTYPE_V8HI_V2DI_UQI:
34648 case V2DI_FTYPE_V16QI_V2DI_UQI:
34649 case V4DI_FTYPE_V4DI_V4DI_UQI:
34650 case V4DI_FTYPE_V4SI_V4DI_UQI:
34651 case V4DI_FTYPE_V8HI_V4DI_UQI:
34652 case V4DI_FTYPE_V16QI_V4DI_UQI:
34653 case V4DI_FTYPE_V4DF_V4DI_UQI:
34654 case V2DI_FTYPE_V2DF_V2DI_UQI:
34655 case V4SI_FTYPE_V4DF_V4SI_UQI:
34656 case V4SI_FTYPE_V2DF_V4SI_UQI:
34657 case V4SI_FTYPE_V8HI_V4SI_UQI:
34658 case V4SI_FTYPE_V16QI_V4SI_UQI:
34659 case V4DI_FTYPE_V4DI_V4DI_V4DI:
34660 case V8DF_FTYPE_V2DF_V8DF_UQI:
34661 case V8DF_FTYPE_V4DF_V8DF_UQI:
34662 case V8DF_FTYPE_V8DF_V8DF_UQI:
34663 case V8SF_FTYPE_V8SF_V8SF_UQI:
34664 case V8SF_FTYPE_V8SI_V8SF_UQI:
34665 case V4DF_FTYPE_V4DF_V4DF_UQI:
34666 case V4SF_FTYPE_V4SF_V4SF_UQI:
34667 case V2DF_FTYPE_V2DF_V2DF_UQI:
34668 case V2DF_FTYPE_V4SF_V2DF_UQI:
34669 case V2DF_FTYPE_V4SI_V2DF_UQI:
34670 case V4SF_FTYPE_V4SI_V4SF_UQI:
34671 case V4DF_FTYPE_V4SF_V4DF_UQI:
34672 case V4DF_FTYPE_V4SI_V4DF_UQI:
34673 case V8SI_FTYPE_V8SI_V8SI_UQI:
34674 case V8SI_FTYPE_V8HI_V8SI_UQI:
34675 case V8SI_FTYPE_V16QI_V8SI_UQI:
34676 case V8DF_FTYPE_V8SI_V8DF_UQI:
34677 case V8DI_FTYPE_DI_V8DI_UQI:
34678 case V16SF_FTYPE_V8SF_V16SF_UHI:
34679 case V16SI_FTYPE_V8SI_V16SI_UHI:
34680 case V16HI_FTYPE_V16HI_V16HI_UHI:
34681 case V8HI_FTYPE_V16QI_V8HI_UQI:
34682 case V16HI_FTYPE_V16QI_V16HI_UHI:
34683 case V32HI_FTYPE_V32HI_V32HI_USI:
34684 case V32HI_FTYPE_V32QI_V32HI_USI:
34685 case V8DI_FTYPE_V16QI_V8DI_UQI:
34686 case V8DI_FTYPE_V2DI_V8DI_UQI:
34687 case V8DI_FTYPE_V4DI_V8DI_UQI:
34688 case V8DI_FTYPE_V8DI_V8DI_UQI:
34689 case V8DI_FTYPE_V8HI_V8DI_UQI:
34690 case V8DI_FTYPE_V8SI_V8DI_UQI:
34691 case V8HI_FTYPE_V8DI_V8HI_UQI:
34692 case V8SI_FTYPE_V8DI_V8SI_UQI:
34693 case V4SI_FTYPE_V4SI_V4SI_V4SI:
34694 nargs = 3;
34695 break;
34696 case V32QI_FTYPE_V32QI_V32QI_INT:
34697 case V16HI_FTYPE_V16HI_V16HI_INT:
34698 case V16QI_FTYPE_V16QI_V16QI_INT:
34699 case V4DI_FTYPE_V4DI_V4DI_INT:
34700 case V8HI_FTYPE_V8HI_V8HI_INT:
34701 case V8SI_FTYPE_V8SI_V8SI_INT:
34702 case V8SI_FTYPE_V8SI_V4SI_INT:
34703 case V8SF_FTYPE_V8SF_V8SF_INT:
34704 case V8SF_FTYPE_V8SF_V4SF_INT:
34705 case V4SI_FTYPE_V4SI_V4SI_INT:
34706 case V4DF_FTYPE_V4DF_V4DF_INT:
34707 case V16SF_FTYPE_V16SF_V16SF_INT:
34708 case V16SF_FTYPE_V16SF_V4SF_INT:
34709 case V16SI_FTYPE_V16SI_V4SI_INT:
34710 case V4DF_FTYPE_V4DF_V2DF_INT:
34711 case V4SF_FTYPE_V4SF_V4SF_INT:
34712 case V2DI_FTYPE_V2DI_V2DI_INT:
34713 case V4DI_FTYPE_V4DI_V2DI_INT:
34714 case V2DF_FTYPE_V2DF_V2DF_INT:
34715 case UQI_FTYPE_V8DI_V8UDI_INT:
34716 case UQI_FTYPE_V8DF_V8DF_INT:
34717 case UQI_FTYPE_V2DF_V2DF_INT:
34718 case UQI_FTYPE_V4SF_V4SF_INT:
34719 case UHI_FTYPE_V16SI_V16SI_INT:
34720 case UHI_FTYPE_V16SF_V16SF_INT:
34721 nargs = 3;
34722 nargs_constant = 1;
34723 break;
34724 case V4DI_FTYPE_V4DI_V4DI_INT_CONVERT:
34725 nargs = 3;
34726 rmode = V4DImode;
34727 nargs_constant = 1;
34728 break;
34729 case V2DI_FTYPE_V2DI_V2DI_INT_CONVERT:
34730 nargs = 3;
34731 rmode = V2DImode;
34732 nargs_constant = 1;
34733 break;
34734 case V1DI_FTYPE_V1DI_V1DI_INT_CONVERT:
34735 nargs = 3;
34736 rmode = DImode;
34737 nargs_constant = 1;
34738 break;
34739 case V2DI_FTYPE_V2DI_UINT_UINT:
34740 nargs = 3;
34741 nargs_constant = 2;
34742 break;
34743 case V8DI_FTYPE_V8DI_V8DI_INT_CONVERT:
34744 nargs = 3;
34745 rmode = V8DImode;
34746 nargs_constant = 1;
34747 break;
34748 case V8DI_FTYPE_V8DI_V8DI_INT_V8DI_UDI_CONVERT:
34749 nargs = 5;
34750 rmode = V8DImode;
34751 mask_pos = 2;
34752 nargs_constant = 1;
34753 break;
34754 case QI_FTYPE_V8DF_INT_UQI:
34755 case QI_FTYPE_V4DF_INT_UQI:
34756 case QI_FTYPE_V2DF_INT_UQI:
34757 case HI_FTYPE_V16SF_INT_UHI:
34758 case QI_FTYPE_V8SF_INT_UQI:
34759 case QI_FTYPE_V4SF_INT_UQI:
34760 nargs = 3;
34761 mask_pos = 1;
34762 nargs_constant = 1;
34763 break;
34764 case V4DI_FTYPE_V4DI_V4DI_INT_V4DI_USI_CONVERT:
34765 nargs = 5;
34766 rmode = V4DImode;
34767 mask_pos = 2;
34768 nargs_constant = 1;
34769 break;
34770 case V2DI_FTYPE_V2DI_V2DI_INT_V2DI_UHI_CONVERT:
34771 nargs = 5;
34772 rmode = V2DImode;
34773 mask_pos = 2;
34774 nargs_constant = 1;
34775 break;
34776 case V32QI_FTYPE_V32QI_V32QI_V32QI_USI:
34777 case V32HI_FTYPE_V32HI_V32HI_V32HI_USI:
34778 case V32HI_FTYPE_V64QI_V64QI_V32HI_USI:
34779 case V16SI_FTYPE_V32HI_V32HI_V16SI_UHI:
34780 case V64QI_FTYPE_V64QI_V64QI_V64QI_UDI:
34781 case V32HI_FTYPE_V32HI_V8HI_V32HI_USI:
34782 case V16HI_FTYPE_V16HI_V8HI_V16HI_UHI:
34783 case V8SI_FTYPE_V8SI_V4SI_V8SI_UQI:
34784 case V4DI_FTYPE_V4DI_V2DI_V4DI_UQI:
34785 case V64QI_FTYPE_V32HI_V32HI_V64QI_UDI:
34786 case V32QI_FTYPE_V16HI_V16HI_V32QI_USI:
34787 case V16QI_FTYPE_V8HI_V8HI_V16QI_UHI:
34788 case V32HI_FTYPE_V16SI_V16SI_V32HI_USI:
34789 case V16HI_FTYPE_V8SI_V8SI_V16HI_UHI:
34790 case V8HI_FTYPE_V4SI_V4SI_V8HI_UQI:
34791 case V4DF_FTYPE_V4DF_V4DI_V4DF_UQI:
34792 case V8SF_FTYPE_V8SF_V8SI_V8SF_UQI:
34793 case V4SF_FTYPE_V4SF_V4SI_V4SF_UQI:
34794 case V2DF_FTYPE_V2DF_V2DI_V2DF_UQI:
34795 case V2DI_FTYPE_V4SI_V4SI_V2DI_UQI:
34796 case V4DI_FTYPE_V8SI_V8SI_V4DI_UQI:
34797 case V4DF_FTYPE_V4DI_V4DF_V4DF_UQI:
34798 case V8SF_FTYPE_V8SI_V8SF_V8SF_UQI:
34799 case V2DF_FTYPE_V2DI_V2DF_V2DF_UQI:
34800 case V4SF_FTYPE_V4SI_V4SF_V4SF_UQI:
34801 case V16SF_FTYPE_V16SF_V16SF_V16SF_UHI:
34802 case V16SF_FTYPE_V16SF_V16SI_V16SF_UHI:
34803 case V16SF_FTYPE_V16SI_V16SF_V16SF_UHI:
34804 case V16SI_FTYPE_V16SI_V16SI_V16SI_UHI:
34805 case V16SI_FTYPE_V16SI_V4SI_V16SI_UHI:
34806 case V8HI_FTYPE_V8HI_V8HI_V8HI_UQI:
34807 case V8SI_FTYPE_V8SI_V8SI_V8SI_UQI:
34808 case V4SI_FTYPE_V4SI_V4SI_V4SI_UQI:
34809 case V8SF_FTYPE_V8SF_V8SF_V8SF_UQI:
34810 case V16QI_FTYPE_V16QI_V16QI_V16QI_UHI:
34811 case V16HI_FTYPE_V16HI_V16HI_V16HI_UHI:
34812 case V2DI_FTYPE_V2DI_V2DI_V2DI_UQI:
34813 case V2DF_FTYPE_V2DF_V2DF_V2DF_UQI:
34814 case V4DI_FTYPE_V4DI_V4DI_V4DI_UQI:
34815 case V4DF_FTYPE_V4DF_V4DF_V4DF_UQI:
34816 case V4SF_FTYPE_V4SF_V4SF_V4SF_UQI:
34817 case V8DF_FTYPE_V8DF_V8DF_V8DF_UQI:
34818 case V8DF_FTYPE_V8DF_V8DI_V8DF_UQI:
34819 case V8DF_FTYPE_V8DI_V8DF_V8DF_UQI:
34820 case V8DI_FTYPE_V16SI_V16SI_V8DI_UQI:
34821 case V8DI_FTYPE_V8DI_V2DI_V8DI_UQI:
34822 case V8DI_FTYPE_V8DI_V8DI_V8DI_UQI:
34823 case V8HI_FTYPE_V16QI_V16QI_V8HI_UQI:
34824 case V16HI_FTYPE_V32QI_V32QI_V16HI_UHI:
34825 case V8SI_FTYPE_V16HI_V16HI_V8SI_UQI:
34826 case V4SI_FTYPE_V8HI_V8HI_V4SI_UQI:
34827 nargs = 4;
34828 break;
34829 case V2DF_FTYPE_V2DF_V2DF_V2DI_INT:
34830 case V4DF_FTYPE_V4DF_V4DF_V4DI_INT:
34831 case V4SF_FTYPE_V4SF_V4SF_V4SI_INT:
34832 case V8SF_FTYPE_V8SF_V8SF_V8SI_INT:
34833 case V16SF_FTYPE_V16SF_V16SF_V16SI_INT:
34834 nargs = 4;
34835 nargs_constant = 1;
34836 break;
34837 case UQI_FTYPE_V4DI_V4DI_INT_UQI:
34838 case UQI_FTYPE_V8SI_V8SI_INT_UQI:
34839 case QI_FTYPE_V4DF_V4DF_INT_UQI:
34840 case QI_FTYPE_V8SF_V8SF_INT_UQI:
34841 case UQI_FTYPE_V2DI_V2DI_INT_UQI:
34842 case UQI_FTYPE_V4SI_V4SI_INT_UQI:
34843 case UQI_FTYPE_V2DF_V2DF_INT_UQI:
34844 case UQI_FTYPE_V4SF_V4SF_INT_UQI:
34845 case UDI_FTYPE_V64QI_V64QI_INT_UDI:
34846 case USI_FTYPE_V32QI_V32QI_INT_USI:
34847 case UHI_FTYPE_V16QI_V16QI_INT_UHI:
34848 case USI_FTYPE_V32HI_V32HI_INT_USI:
34849 case UHI_FTYPE_V16HI_V16HI_INT_UHI:
34850 case UQI_FTYPE_V8HI_V8HI_INT_UQI:
34851 nargs = 4;
34852 mask_pos = 1;
34853 nargs_constant = 1;
34854 break;
34855 case V2DI_FTYPE_V2DI_V2DI_UINT_UINT:
34856 nargs = 4;
34857 nargs_constant = 2;
34858 break;
34859 case UCHAR_FTYPE_UCHAR_UINT_UINT_PUNSIGNED:
34860 case UCHAR_FTYPE_UCHAR_ULONGLONG_ULONGLONG_PULONGLONG:
34861 nargs = 4;
34862 break;
34863 case UQI_FTYPE_V8DI_V8DI_INT_UQI:
34864 case UHI_FTYPE_V16SI_V16SI_INT_UHI:
34865 mask_pos = 1;
34866 nargs = 4;
34867 nargs_constant = 1;
34868 break;
34869 case V8SF_FTYPE_V8SF_INT_V8SF_UQI:
34870 case V4SF_FTYPE_V4SF_INT_V4SF_UQI:
34871 case V2DF_FTYPE_V4DF_INT_V2DF_UQI:
34872 case V2DI_FTYPE_V4DI_INT_V2DI_UQI:
34873 case V8SF_FTYPE_V16SF_INT_V8SF_UQI:
34874 case V8SI_FTYPE_V16SI_INT_V8SI_UQI:
34875 case V2DF_FTYPE_V8DF_INT_V2DF_UQI:
34876 case V2DI_FTYPE_V8DI_INT_V2DI_UQI:
34877 case V4SF_FTYPE_V8SF_INT_V4SF_UQI:
34878 case V4SI_FTYPE_V8SI_INT_V4SI_UQI:
34879 case V8HI_FTYPE_V8SF_INT_V8HI_UQI:
34880 case V8HI_FTYPE_V4SF_INT_V8HI_UQI:
34881 case V32HI_FTYPE_V32HI_INT_V32HI_USI:
34882 case V16HI_FTYPE_V16HI_INT_V16HI_UHI:
34883 case V8HI_FTYPE_V8HI_INT_V8HI_UQI:
34884 case V4DI_FTYPE_V4DI_INT_V4DI_UQI:
34885 case V2DI_FTYPE_V2DI_INT_V2DI_UQI:
34886 case V8SI_FTYPE_V8SI_INT_V8SI_UQI:
34887 case V4SI_FTYPE_V4SI_INT_V4SI_UQI:
34888 case V4DF_FTYPE_V4DF_INT_V4DF_UQI:
34889 case V2DF_FTYPE_V2DF_INT_V2DF_UQI:
34890 case V8DF_FTYPE_V8DF_INT_V8DF_UQI:
34891 case V16SF_FTYPE_V16SF_INT_V16SF_UHI:
34892 case V16HI_FTYPE_V16SF_INT_V16HI_UHI:
34893 case V16SI_FTYPE_V16SI_INT_V16SI_UHI:
34894 case V4SI_FTYPE_V16SI_INT_V4SI_UQI:
34895 case V4DI_FTYPE_V8DI_INT_V4DI_UQI:
34896 case V4DF_FTYPE_V8DF_INT_V4DF_UQI:
34897 case V4SF_FTYPE_V16SF_INT_V4SF_UQI:
34898 case V8DI_FTYPE_V8DI_INT_V8DI_UQI:
34899 nargs = 4;
34900 mask_pos = 2;
34901 nargs_constant = 1;
34902 break;
34903 case V16SF_FTYPE_V16SF_V4SF_INT_V16SF_UHI:
34904 case V16SI_FTYPE_V16SI_V4SI_INT_V16SI_UHI:
34905 case V8DF_FTYPE_V8DF_V8DF_INT_V8DF_UQI:
34906 case V8DI_FTYPE_V8DI_V8DI_INT_V8DI_UQI:
34907 case V16SF_FTYPE_V16SF_V16SF_INT_V16SF_UHI:
34908 case V16SI_FTYPE_V16SI_V16SI_INT_V16SI_UHI:
34909 case V4SF_FTYPE_V4SF_V4SF_INT_V4SF_UQI:
34910 case V2DF_FTYPE_V2DF_V2DF_INT_V2DF_UQI:
34911 case V8DF_FTYPE_V8DF_V4DF_INT_V8DF_UQI:
34912 case V8DI_FTYPE_V8DI_V4DI_INT_V8DI_UQI:
34913 case V4DF_FTYPE_V4DF_V4DF_INT_V4DF_UQI:
34914 case V8SF_FTYPE_V8SF_V8SF_INT_V8SF_UQI:
34915 case V8DF_FTYPE_V8DF_V2DF_INT_V8DF_UQI:
34916 case V8DI_FTYPE_V8DI_V2DI_INT_V8DI_UQI:
34917 case V8SI_FTYPE_V8SI_V8SI_INT_V8SI_UQI:
34918 case V4DI_FTYPE_V4DI_V4DI_INT_V4DI_UQI:
34919 case V4SI_FTYPE_V4SI_V4SI_INT_V4SI_UQI:
34920 case V2DI_FTYPE_V2DI_V2DI_INT_V2DI_UQI:
34921 case V32HI_FTYPE_V64QI_V64QI_INT_V32HI_USI:
34922 case V16HI_FTYPE_V32QI_V32QI_INT_V16HI_UHI:
34923 case V8HI_FTYPE_V16QI_V16QI_INT_V8HI_UQI:
34924 case V16SF_FTYPE_V16SF_V8SF_INT_V16SF_UHI:
34925 case V16SI_FTYPE_V16SI_V8SI_INT_V16SI_UHI:
34926 case V8SF_FTYPE_V8SF_V4SF_INT_V8SF_UQI:
34927 case V8SI_FTYPE_V8SI_V4SI_INT_V8SI_UQI:
34928 case V4DI_FTYPE_V4DI_V2DI_INT_V4DI_UQI:
34929 case V4DF_FTYPE_V4DF_V2DF_INT_V4DF_UQI:
34930 nargs = 5;
34931 mask_pos = 2;
34932 nargs_constant = 1;
34933 break;
34934 case V8DI_FTYPE_V8DI_V8DI_V8DI_INT_UQI:
34935 case V16SI_FTYPE_V16SI_V16SI_V16SI_INT_UHI:
34936 case V2DF_FTYPE_V2DF_V2DF_V2DI_INT_UQI:
34937 case V4SF_FTYPE_V4SF_V4SF_V4SI_INT_UQI:
34938 case V8SF_FTYPE_V8SF_V8SF_V8SI_INT_UQI:
34939 case V8SI_FTYPE_V8SI_V8SI_V8SI_INT_UQI:
34940 case V4DF_FTYPE_V4DF_V4DF_V4DI_INT_UQI:
34941 case V4DI_FTYPE_V4DI_V4DI_V4DI_INT_UQI:
34942 case V4SI_FTYPE_V4SI_V4SI_V4SI_INT_UQI:
34943 case V2DI_FTYPE_V2DI_V2DI_V2DI_INT_UQI:
34944 nargs = 5;
34945 mask_pos = 1;
34946 nargs_constant = 1;
34947 break;
34948
34949 default:
34950 gcc_unreachable ();
34951 }
34952
34953 gcc_assert (nargs <= ARRAY_SIZE (args));
34954
34955 if (comparison != UNKNOWN)
34956 {
34957 gcc_assert (nargs == 2);
34958 return ix86_expand_sse_compare (d, exp, target, swap);
34959 }
34960
34961 if (rmode == VOIDmode || rmode == tmode)
34962 {
34963 if (optimize
34964 || target == 0
34965 || GET_MODE (target) != tmode
34966 || !insn_p->operand[0].predicate (target, tmode))
34967 target = gen_reg_rtx (tmode);
34968 real_target = target;
34969 }
34970 else
34971 {
34972 real_target = gen_reg_rtx (tmode);
34973 target = lowpart_subreg (rmode, real_target, tmode);
34974 }
34975
34976 for (i = 0; i < nargs; i++)
34977 {
34978 tree arg = CALL_EXPR_ARG (exp, i);
34979 rtx op = expand_normal (arg);
34980 machine_mode mode = insn_p->operand[i + 1].mode;
34981 bool match = insn_p->operand[i + 1].predicate (op, mode);
34982
34983 if (last_arg_count && (i + 1) == nargs)
34984 {
34985 /* SIMD shift insns take either an 8-bit immediate or a
34986 register as the count, but the builtin functions take an int.
34987 If the count operand doesn't match, we put it in a register. */
34988 if (!match)
34989 {
34990 op = lowpart_subreg (SImode, op, GET_MODE (op));
34991 if (!insn_p->operand[i + 1].predicate (op, mode))
34992 op = copy_to_reg (op);
34993 }
34994 }
34995 else if ((mask_pos && (nargs - i - mask_pos) == nargs_constant) ||
34996 (!mask_pos && (nargs - i) <= nargs_constant))
34997 {
34998 if (!match)
34999 switch (icode)
35000 {
35001 case CODE_FOR_avx_vinsertf128v4di:
35002 case CODE_FOR_avx_vextractf128v4di:
35003 error ("the last argument must be a 1-bit immediate");
35004 return const0_rtx;
35005
35006 case CODE_FOR_avx512f_cmpv8di3_mask:
35007 case CODE_FOR_avx512f_cmpv16si3_mask:
35008 case CODE_FOR_avx512f_ucmpv8di3_mask:
35009 case CODE_FOR_avx512f_ucmpv16si3_mask:
35010 case CODE_FOR_avx512vl_cmpv4di3_mask:
35011 case CODE_FOR_avx512vl_cmpv8si3_mask:
35012 case CODE_FOR_avx512vl_ucmpv4di3_mask:
35013 case CODE_FOR_avx512vl_ucmpv8si3_mask:
35014 case CODE_FOR_avx512vl_cmpv2di3_mask:
35015 case CODE_FOR_avx512vl_cmpv4si3_mask:
35016 case CODE_FOR_avx512vl_ucmpv2di3_mask:
35017 case CODE_FOR_avx512vl_ucmpv4si3_mask:
35018 error ("the last argument must be a 3-bit immediate");
35019 return const0_rtx;
35020
35021 case CODE_FOR_sse4_1_roundsd:
35022 case CODE_FOR_sse4_1_roundss:
35023
35024 case CODE_FOR_sse4_1_roundpd:
35025 case CODE_FOR_sse4_1_roundps:
35026 case CODE_FOR_avx_roundpd256:
35027 case CODE_FOR_avx_roundps256:
35028
35029 case CODE_FOR_sse4_1_roundpd_vec_pack_sfix:
35030 case CODE_FOR_sse4_1_roundps_sfix:
35031 case CODE_FOR_avx_roundpd_vec_pack_sfix256:
35032 case CODE_FOR_avx_roundps_sfix256:
35033
35034 case CODE_FOR_sse4_1_blendps:
35035 case CODE_FOR_avx_blendpd256:
35036 case CODE_FOR_avx_vpermilv4df:
35037 case CODE_FOR_avx_vpermilv4df_mask:
35038 case CODE_FOR_avx512f_getmantv8df_mask:
35039 case CODE_FOR_avx512f_getmantv16sf_mask:
35040 case CODE_FOR_avx512vl_getmantv8sf_mask:
35041 case CODE_FOR_avx512vl_getmantv4df_mask:
35042 case CODE_FOR_avx512vl_getmantv4sf_mask:
35043 case CODE_FOR_avx512vl_getmantv2df_mask:
35044 case CODE_FOR_avx512dq_rangepv8df_mask_round:
35045 case CODE_FOR_avx512dq_rangepv16sf_mask_round:
35046 case CODE_FOR_avx512dq_rangepv4df_mask:
35047 case CODE_FOR_avx512dq_rangepv8sf_mask:
35048 case CODE_FOR_avx512dq_rangepv2df_mask:
35049 case CODE_FOR_avx512dq_rangepv4sf_mask:
35050 case CODE_FOR_avx_shufpd256_mask:
35051 error ("the last argument must be a 4-bit immediate");
35052 return const0_rtx;
35053
35054 case CODE_FOR_sha1rnds4:
35055 case CODE_FOR_sse4_1_blendpd:
35056 case CODE_FOR_avx_vpermilv2df:
35057 case CODE_FOR_avx_vpermilv2df_mask:
35058 case CODE_FOR_xop_vpermil2v2df3:
35059 case CODE_FOR_xop_vpermil2v4sf3:
35060 case CODE_FOR_xop_vpermil2v4df3:
35061 case CODE_FOR_xop_vpermil2v8sf3:
35062 case CODE_FOR_avx512f_vinsertf32x4_mask:
35063 case CODE_FOR_avx512f_vinserti32x4_mask:
35064 case CODE_FOR_avx512f_vextractf32x4_mask:
35065 case CODE_FOR_avx512f_vextracti32x4_mask:
35066 case CODE_FOR_sse2_shufpd:
35067 case CODE_FOR_sse2_shufpd_mask:
35068 case CODE_FOR_avx512dq_shuf_f64x2_mask:
35069 case CODE_FOR_avx512dq_shuf_i64x2_mask:
35070 case CODE_FOR_avx512vl_shuf_i32x4_mask:
35071 case CODE_FOR_avx512vl_shuf_f32x4_mask:
35072 error ("the last argument must be a 2-bit immediate");
35073 return const0_rtx;
35074
35075 case CODE_FOR_avx_vextractf128v4df:
35076 case CODE_FOR_avx_vextractf128v8sf:
35077 case CODE_FOR_avx_vextractf128v8si:
35078 case CODE_FOR_avx_vinsertf128v4df:
35079 case CODE_FOR_avx_vinsertf128v8sf:
35080 case CODE_FOR_avx_vinsertf128v8si:
35081 case CODE_FOR_avx512f_vinsertf64x4_mask:
35082 case CODE_FOR_avx512f_vinserti64x4_mask:
35083 case CODE_FOR_avx512f_vextractf64x4_mask:
35084 case CODE_FOR_avx512f_vextracti64x4_mask:
35085 case CODE_FOR_avx512dq_vinsertf32x8_mask:
35086 case CODE_FOR_avx512dq_vinserti32x8_mask:
35087 case CODE_FOR_avx512vl_vinsertv4df:
35088 case CODE_FOR_avx512vl_vinsertv4di:
35089 case CODE_FOR_avx512vl_vinsertv8sf:
35090 case CODE_FOR_avx512vl_vinsertv8si:
35091 error ("the last argument must be a 1-bit immediate");
35092 return const0_rtx;
35093
35094 case CODE_FOR_avx_vmcmpv2df3:
35095 case CODE_FOR_avx_vmcmpv4sf3:
35096 case CODE_FOR_avx_cmpv2df3:
35097 case CODE_FOR_avx_cmpv4sf3:
35098 case CODE_FOR_avx_cmpv4df3:
35099 case CODE_FOR_avx_cmpv8sf3:
35100 case CODE_FOR_avx512f_cmpv8df3_mask:
35101 case CODE_FOR_avx512f_cmpv16sf3_mask:
35102 case CODE_FOR_avx512f_vmcmpv2df3_mask:
35103 case CODE_FOR_avx512f_vmcmpv4sf3_mask:
35104 error ("the last argument must be a 5-bit immediate");
35105 return const0_rtx;
35106
35107 default:
35108 switch (nargs_constant)
35109 {
35110 case 2:
35111 if ((mask_pos && (nargs - i - mask_pos) == nargs_constant) ||
35112 (!mask_pos && (nargs - i) == nargs_constant))
35113 {
35114 error ("the next to last argument must be an 8-bit immediate");
35115 break;
35116 }
35117 /* FALLTHRU */
35118 case 1:
35119 error ("the last argument must be an 8-bit immediate");
35120 break;
35121 default:
35122 gcc_unreachable ();
35123 }
35124 return const0_rtx;
35125 }
35126 }
35127 else
35128 {
35129 if (VECTOR_MODE_P (mode))
35130 op = safe_vector_operand (op, mode);
35131
35132 /* If we aren't optimizing, only allow one memory operand to
35133 be generated. */
35134 if (memory_operand (op, mode))
35135 num_memory++;
35136
35137 op = fixup_modeless_constant (op, mode);
35138
35139 if (GET_MODE (op) == mode || GET_MODE (op) == VOIDmode)
35140 {
35141 if (optimize || !match || num_memory > 1)
35142 op = copy_to_mode_reg (mode, op);
35143 }
35144 else
35145 {
35146 op = copy_to_reg (op);
35147 op = lowpart_subreg (mode, op, GET_MODE (op));
35148 }
35149 }
35150
35151 args[i].op = op;
35152 args[i].mode = mode;
35153 }
35154
35155 switch (nargs)
35156 {
35157 case 1:
35158 pat = GEN_FCN (icode) (real_target, args[0].op);
35159 break;
35160 case 2:
35161 pat = GEN_FCN (icode) (real_target, args[0].op, args[1].op);
35162 break;
35163 case 3:
35164 pat = GEN_FCN (icode) (real_target, args[0].op, args[1].op,
35165 args[2].op);
35166 break;
35167 case 4:
35168 pat = GEN_FCN (icode) (real_target, args[0].op, args[1].op,
35169 args[2].op, args[3].op);
35170 break;
35171 case 5:
35172 pat = GEN_FCN (icode) (real_target, args[0].op, args[1].op,
35173 args[2].op, args[3].op, args[4].op);
35174 break;
35175 case 6:
35176 pat = GEN_FCN (icode) (real_target, args[0].op, args[1].op,
35177 args[2].op, args[3].op, args[4].op,
35178 args[5].op);
35179 break;
35180 default:
35181 gcc_unreachable ();
35182 }
35183
35184 if (! pat)
35185 return 0;
35186
35187 emit_insn (pat);
35188 return target;
35189 }
35190
35191 /* Transform a pattern of the following layout:
35192 (parallel [
35193 (set (A B))
35194 (unspec [C] UNSPEC_EMBEDDED_ROUNDING)
35195 ])
35196 into:
35197 (set (A B))
35198
35199 Or:
35200 (parallel [ A B
35201 ...
35202 (unspec [C] UNSPEC_EMBEDDED_ROUNDING)
35203 ...
35204 ])
35205 into:
35206 (parallel [ A B ... ]) */
35207
35208 static rtx
35209 ix86_erase_embedded_rounding (rtx pat)
35210 {
35211 if (GET_CODE (pat) == INSN)
35212 pat = PATTERN (pat);
35213
35214 gcc_assert (GET_CODE (pat) == PARALLEL);
35215
35216 if (XVECLEN (pat, 0) == 2)
35217 {
35218 rtx p0 = XVECEXP (pat, 0, 0);
35219 rtx p1 = XVECEXP (pat, 0, 1);
35220
35221 gcc_assert (GET_CODE (p0) == SET
35222 && GET_CODE (p1) == UNSPEC
35223 && XINT (p1, 1) == UNSPEC_EMBEDDED_ROUNDING);
35224
35225 return p0;
35226 }
35227 else
35228 {
35229 rtx *res = XALLOCAVEC (rtx, XVECLEN (pat, 0));
35230 int i = 0;
35231 int j = 0;
35232
35233 for (; i < XVECLEN (pat, 0); ++i)
35234 {
35235 rtx elem = XVECEXP (pat, 0, i);
35236 if (GET_CODE (elem) != UNSPEC
35237 || XINT (elem, 1) != UNSPEC_EMBEDDED_ROUNDING)
35238 res [j++] = elem;
35239 }
35240
35241 /* No more than one occurrence was removed. */
35242 gcc_assert (j >= XVECLEN (pat, 0) - 1);
35243
35244 return gen_rtx_PARALLEL (GET_MODE (pat), gen_rtvec_v (j, res));
35245 }
35246 }
35247
35248 /* Subroutine of ix86_expand_round_builtin to take care of comi insns
35249 with rounding. */
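/* Illustrative sketch of the comparison decoding below: the third
   argument indexes the _CMP_* encoding from avxintrin.h, e.g.
   _CMP_EQ_OQ (value 0) yields comi_comparisons[0] == UNEQ with
   need_ucomi_values[0] == true (the quiet ucomi form), whereas
   _CMP_GT_OS (value 14) yields GT with need_ucomi_values[14] == false
   (the signalling comi form).  */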
35250 static rtx
35251 ix86_expand_sse_comi_round (const struct builtin_description *d,
35252 tree exp, rtx target)
35253 {
35254 rtx pat, set_dst;
35255 tree arg0 = CALL_EXPR_ARG (exp, 0);
35256 tree arg1 = CALL_EXPR_ARG (exp, 1);
35257 tree arg2 = CALL_EXPR_ARG (exp, 2);
35258 tree arg3 = CALL_EXPR_ARG (exp, 3);
35259 rtx op0 = expand_normal (arg0);
35260 rtx op1 = expand_normal (arg1);
35261 rtx op2 = expand_normal (arg2);
35262 rtx op3 = expand_normal (arg3);
35263 enum insn_code icode = d->icode;
35264 const struct insn_data_d *insn_p = &insn_data[icode];
35265 machine_mode mode0 = insn_p->operand[0].mode;
35266 machine_mode mode1 = insn_p->operand[1].mode;
35267 enum rtx_code comparison = UNEQ;
35268 bool need_ucomi = false;
35269
35270 /* See avxintrin.h for values. */
35271 enum rtx_code comi_comparisons[32] =
35272 {
35273 UNEQ, GT, GE, UNORDERED, LTGT, UNLE, UNLT, ORDERED, UNEQ, UNLT,
35274 UNLE, LT, LTGT, GE, GT, LT, UNEQ, GT, GE, UNORDERED, LTGT, UNLE,
35275 UNLT, ORDERED, UNEQ, UNLT, UNLE, LT, LTGT, GE, GT, LT
35276 };
35277 bool need_ucomi_values[32] =
35278 {
35279 true, false, false, true, true, false, false, true,
35280 true, false, false, true, true, false, false, true,
35281 false, true, true, false, false, true, true, false,
35282 false, true, true, false, false, true, true, false
35283 };
35284
35285 if (!CONST_INT_P (op2))
35286 {
35287 error ("the third argument must be a comparison constant");
35288 return const0_rtx;
35289 }
35290 if (INTVAL (op2) < 0 || INTVAL (op2) >= 32)
35291 {
35292 error ("incorrect comparison mode");
35293 return const0_rtx;
35294 }
35295
35296 if (!insn_p->operand[2].predicate (op3, SImode))
35297 {
35298 error ("incorrect rounding operand");
35299 return const0_rtx;
35300 }
35301
35302 comparison = comi_comparisons[INTVAL (op2)];
35303 need_ucomi = need_ucomi_values[INTVAL (op2)];
35304
35305 if (VECTOR_MODE_P (mode0))
35306 op0 = safe_vector_operand (op0, mode0);
35307 if (VECTOR_MODE_P (mode1))
35308 op1 = safe_vector_operand (op1, mode1);
35309
35310 target = gen_reg_rtx (SImode);
35311 emit_move_insn (target, const0_rtx);
35312 target = gen_rtx_SUBREG (QImode, target, 0);
35313
35314 if ((optimize && !register_operand (op0, mode0))
35315 || !insn_p->operand[0].predicate (op0, mode0))
35316 op0 = copy_to_mode_reg (mode0, op0);
35317 if ((optimize && !register_operand (op1, mode1))
35318 || !insn_p->operand[1].predicate (op1, mode1))
35319 op1 = copy_to_mode_reg (mode1, op1);
35320
35321 if (need_ucomi)
35322 icode = icode == CODE_FOR_sse_comi_round
35323 ? CODE_FOR_sse_ucomi_round
35324 : CODE_FOR_sse2_ucomi_round;
35325
35326 pat = GEN_FCN (icode) (op0, op1, op3);
35327 if (! pat)
35328 return 0;
35329
35330 /* Rounding operand can be either NO_ROUND or ROUND_SAE at this point. */
35331 if (INTVAL (op3) == NO_ROUND)
35332 {
35333 pat = ix86_erase_embedded_rounding (pat);
35334 if (! pat)
35335 return 0;
35336
35337 set_dst = SET_DEST (pat);
35338 }
35339 else
35340 {
35341 gcc_assert (GET_CODE (XVECEXP (pat, 0, 0)) == SET);
35342 set_dst = SET_DEST (XVECEXP (pat, 0, 0));
35343 }
35344
35345 emit_insn (pat);
35346 emit_insn (gen_rtx_SET (gen_rtx_STRICT_LOW_PART (VOIDmode, target),
35347 gen_rtx_fmt_ee (comparison, QImode,
35348 set_dst,
35349 const0_rtx)));
35350
35351 return SUBREG_REG (target);
35352 }
35353
35354 static rtx
35355 ix86_expand_round_builtin (const struct builtin_description *d,
35356 tree exp, rtx target)
35357 {
35358 rtx pat;
35359 unsigned int i, nargs;
35360 struct
35361 {
35362 rtx op;
35363 machine_mode mode;
35364 } args[6];
35365 enum insn_code icode = d->icode;
35366 const struct insn_data_d *insn_p = &insn_data[icode];
35367 machine_mode tmode = insn_p->operand[0].mode;
35368 unsigned int nargs_constant = 0;
35369 unsigned int redundant_embed_rnd = 0;
35370
35371 switch ((enum ix86_builtin_func_type) d->flag)
35372 {
35373 case UINT64_FTYPE_V2DF_INT:
35374 case UINT64_FTYPE_V4SF_INT:
35375 case UINT_FTYPE_V2DF_INT:
35376 case UINT_FTYPE_V4SF_INT:
35377 case INT64_FTYPE_V2DF_INT:
35378 case INT64_FTYPE_V4SF_INT:
35379 case INT_FTYPE_V2DF_INT:
35380 case INT_FTYPE_V4SF_INT:
35381 nargs = 2;
35382 break;
35383 case V4SF_FTYPE_V4SF_UINT_INT:
35384 case V4SF_FTYPE_V4SF_UINT64_INT:
35385 case V2DF_FTYPE_V2DF_UINT64_INT:
35386 case V4SF_FTYPE_V4SF_INT_INT:
35387 case V4SF_FTYPE_V4SF_INT64_INT:
35388 case V2DF_FTYPE_V2DF_INT64_INT:
35389 case V4SF_FTYPE_V4SF_V4SF_INT:
35390 case V2DF_FTYPE_V2DF_V2DF_INT:
35391 case V4SF_FTYPE_V4SF_V2DF_INT:
35392 case V2DF_FTYPE_V2DF_V4SF_INT:
35393 nargs = 3;
35394 break;
35395 case V8SF_FTYPE_V8DF_V8SF_QI_INT:
35396 case V8DF_FTYPE_V8DF_V8DF_QI_INT:
35397 case V8SI_FTYPE_V8DF_V8SI_QI_INT:
35398 case V8DI_FTYPE_V8DF_V8DI_QI_INT:
35399 case V8SF_FTYPE_V8DI_V8SF_QI_INT:
35400 case V8DF_FTYPE_V8DI_V8DF_QI_INT:
35401 case V16SF_FTYPE_V16SF_V16SF_HI_INT:
35402 case V8DI_FTYPE_V8SF_V8DI_QI_INT:
35403 case V16SF_FTYPE_V16SI_V16SF_HI_INT:
35404 case V16SI_FTYPE_V16SF_V16SI_HI_INT:
35405 case V8DF_FTYPE_V8SF_V8DF_QI_INT:
35406 case V16SF_FTYPE_V16HI_V16SF_HI_INT:
35407 case V2DF_FTYPE_V2DF_V2DF_V2DF_INT:
35408 case V4SF_FTYPE_V4SF_V4SF_V4SF_INT:
35409 nargs = 4;
35410 break;
35411 case V4SF_FTYPE_V4SF_V4SF_INT_INT:
35412 case V2DF_FTYPE_V2DF_V2DF_INT_INT:
35413 nargs_constant = 2;
35414 nargs = 4;
35415 break;
35416 case INT_FTYPE_V4SF_V4SF_INT_INT:
35417 case INT_FTYPE_V2DF_V2DF_INT_INT:
35418 return ix86_expand_sse_comi_round (d, exp, target);
35419 case V8DF_FTYPE_V8DF_V8DF_V8DF_UQI_INT:
35420 case V16SF_FTYPE_V16SF_V16SF_V16SF_HI_INT:
35421 case V2DF_FTYPE_V2DF_V2DF_V2DF_QI_INT:
35422 case V2DF_FTYPE_V2DF_V4SF_V2DF_QI_INT:
35423 case V4SF_FTYPE_V4SF_V4SF_V4SF_QI_INT:
35424 case V4SF_FTYPE_V4SF_V2DF_V4SF_QI_INT:
35425 nargs = 5;
35426 break;
35427 case V16SF_FTYPE_V16SF_INT_V16SF_HI_INT:
35428 case V8DF_FTYPE_V8DF_INT_V8DF_QI_INT:
35429 nargs_constant = 4;
35430 nargs = 5;
35431 break;
35432 case UQI_FTYPE_V8DF_V8DF_INT_UQI_INT:
35433 case UQI_FTYPE_V2DF_V2DF_INT_UQI_INT:
35434 case UHI_FTYPE_V16SF_V16SF_INT_UHI_INT:
35435 case UQI_FTYPE_V4SF_V4SF_INT_UQI_INT:
35436 nargs_constant = 3;
35437 nargs = 5;
35438 break;
35439 case V16SF_FTYPE_V16SF_V16SF_INT_V16SF_HI_INT:
35440 case V8DF_FTYPE_V8DF_V8DF_INT_V8DF_QI_INT:
35441 case V4SF_FTYPE_V4SF_V4SF_INT_V4SF_QI_INT:
35442 case V2DF_FTYPE_V2DF_V2DF_INT_V2DF_QI_INT:
35443 nargs = 6;
35444 nargs_constant = 4;
35445 break;
35446 case V8DF_FTYPE_V8DF_V8DF_V8DI_INT_QI_INT:
35447 case V16SF_FTYPE_V16SF_V16SF_V16SI_INT_HI_INT:
35448 case V2DF_FTYPE_V2DF_V2DF_V2DI_INT_QI_INT:
35449 case V4SF_FTYPE_V4SF_V4SF_V4SI_INT_QI_INT:
35450 nargs = 6;
35451 nargs_constant = 3;
35452 break;
35453 default:
35454 gcc_unreachable ();
35455 }
35456 gcc_assert (nargs <= ARRAY_SIZE (args));
35457
35458 if (optimize
35459 || target == 0
35460 || GET_MODE (target) != tmode
35461 || !insn_p->operand[0].predicate (target, tmode))
35462 target = gen_reg_rtx (tmode);
35463
35464 for (i = 0; i < nargs; i++)
35465 {
35466 tree arg = CALL_EXPR_ARG (exp, i);
35467 rtx op = expand_normal (arg);
35468 machine_mode mode = insn_p->operand[i + 1].mode;
35469 bool match = insn_p->operand[i + 1].predicate (op, mode);
35470
35471 if (i == nargs - nargs_constant)
35472 {
35473 if (!match)
35474 {
35475 switch (icode)
35476 {
35477 case CODE_FOR_avx512f_getmantv8df_mask_round:
35478 case CODE_FOR_avx512f_getmantv16sf_mask_round:
35479 case CODE_FOR_avx512f_vgetmantv2df_round:
35480 case CODE_FOR_avx512f_vgetmantv4sf_round:
35481 error ("the immediate argument must be a 4-bit immediate");
35482 return const0_rtx;
35483 case CODE_FOR_avx512f_cmpv8df3_mask_round:
35484 case CODE_FOR_avx512f_cmpv16sf3_mask_round:
35485 case CODE_FOR_avx512f_vmcmpv2df3_mask_round:
35486 case CODE_FOR_avx512f_vmcmpv4sf3_mask_round:
35487 error ("the immediate argument must be a 5-bit immediate");
35488 return const0_rtx;
35489 default:
35490 error ("the immediate argument must be an 8-bit immediate");
35491 return const0_rtx;
35492 }
35493 }
35494 }
35495 else if (i == nargs - 1)
35496 {
35497 if (!insn_p->operand[nargs].predicate (op, SImode))
35498 {
35499 error ("incorrect rounding operand");
35500 return const0_rtx;
35501 }
35502
35503 /* If there is no rounding, use the normal version of the pattern. */
35504 if (INTVAL (op) == NO_ROUND)
35505 redundant_embed_rnd = 1;
35506 }
35507 else
35508 {
35509 if (VECTOR_MODE_P (mode))
35510 op = safe_vector_operand (op, mode);
35511
35512 op = fixup_modeless_constant (op, mode);
35513
35514 if (GET_MODE (op) == mode || GET_MODE (op) == VOIDmode)
35515 {
35516 if (optimize || !match)
35517 op = copy_to_mode_reg (mode, op);
35518 }
35519 else
35520 {
35521 op = copy_to_reg (op);
35522 op = lowpart_subreg (mode, op, GET_MODE (op));
35523 }
35524 }
35525
35526 args[i].op = op;
35527 args[i].mode = mode;
35528 }
35529
35530 switch (nargs)
35531 {
35532 case 1:
35533 pat = GEN_FCN (icode) (target, args[0].op);
35534 break;
35535 case 2:
35536 pat = GEN_FCN (icode) (target, args[0].op, args[1].op);
35537 break;
35538 case 3:
35539 pat = GEN_FCN (icode) (target, args[0].op, args[1].op,
35540 args[2].op);
35541 break;
35542 case 4:
35543 pat = GEN_FCN (icode) (target, args[0].op, args[1].op,
35544 args[2].op, args[3].op);
35545 break;
35546 case 5:
35547 pat = GEN_FCN (icode) (target, args[0].op, args[1].op,
35548 args[2].op, args[3].op, args[4].op);
35549 break;
35550 case 6:
35551 pat = GEN_FCN (icode) (target, args[0].op, args[1].op,
35552 args[2].op, args[3].op, args[4].op,
35553 args[5].op);
35554 break;
35555 default:
35556 gcc_unreachable ();
35557 }
35558
35559 if (!pat)
35560 return 0;
35561
35562 if (redundant_embed_rnd)
35563 pat = ix86_erase_embedded_rounding (pat);
35564
35565 emit_insn (pat);
35566 return target;
35567 }
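/* Illustrative note (not part of the original source): a sketch of how the
   round expander above is reached.  Assuming the usual immintrin.h spelling,
   a call such as

     __m128d r = _mm_add_round_sd (a, b,
                                   _MM_FROUND_TO_NEAREST_INT
                                   | _MM_FROUND_NO_EXC);

   arrives here with the rounding control as the last, constant operand.
   Passing _MM_FROUND_CUR_DIRECTION instead makes INTVAL (op) == NO_ROUND,
   so ix86_erase_embedded_rounding strips the rounding operand and the
   ordinary non-rounding pattern is emitted.  */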
35568
35569 /* Subroutine of ix86_expand_builtin to take care of special insns
35570 with variable number of operands. */
35571
35572 static rtx
35573 ix86_expand_special_args_builtin (const struct builtin_description *d,
35574 tree exp, rtx target)
35575 {
35576 tree arg;
35577 rtx pat, op;
35578 unsigned int i, nargs, arg_adjust, memory;
35579 bool aligned_mem = false;
35580 struct
35581 {
35582 rtx op;
35583 machine_mode mode;
35584 } args[3];
35585 enum insn_code icode = d->icode;
35586 bool last_arg_constant = false;
35587 const struct insn_data_d *insn_p = &insn_data[icode];
35588 machine_mode tmode = insn_p->operand[0].mode;
35589 enum { load, store } klass;
35590
35591 switch ((enum ix86_builtin_func_type) d->flag)
35592 {
35593 case VOID_FTYPE_VOID:
35594 emit_insn (GEN_FCN (icode) (target));
35595 return 0;
35596 case VOID_FTYPE_UINT64:
35597 case VOID_FTYPE_UNSIGNED:
35598 nargs = 0;
35599 klass = store;
35600 memory = 0;
35601 break;
35602
35603 case INT_FTYPE_VOID:
35604 case USHORT_FTYPE_VOID:
35605 case UINT64_FTYPE_VOID:
35606 case UNSIGNED_FTYPE_VOID:
35607 nargs = 0;
35608 klass = load;
35609 memory = 0;
35610 break;
35611 case UINT64_FTYPE_PUNSIGNED:
35612 case V2DI_FTYPE_PV2DI:
35613 case V4DI_FTYPE_PV4DI:
35614 case V32QI_FTYPE_PCCHAR:
35615 case V16QI_FTYPE_PCCHAR:
35616 case V8SF_FTYPE_PCV4SF:
35617 case V8SF_FTYPE_PCFLOAT:
35618 case V4SF_FTYPE_PCFLOAT:
35619 case V4DF_FTYPE_PCV2DF:
35620 case V4DF_FTYPE_PCDOUBLE:
35621 case V2DF_FTYPE_PCDOUBLE:
35622 case VOID_FTYPE_PVOID:
35623 case V8DI_FTYPE_PV8DI:
35624 nargs = 1;
35625 klass = load;
35626 memory = 0;
35627 switch (icode)
35628 {
35629 case CODE_FOR_sse4_1_movntdqa:
35630 case CODE_FOR_avx2_movntdqa:
35631 case CODE_FOR_avx512f_movntdqa:
35632 aligned_mem = true;
35633 break;
35634 default:
35635 break;
35636 }
35637 break;
35638 case VOID_FTYPE_PV2SF_V4SF:
35639 case VOID_FTYPE_PV8DI_V8DI:
35640 case VOID_FTYPE_PV4DI_V4DI:
35641 case VOID_FTYPE_PV2DI_V2DI:
35642 case VOID_FTYPE_PCHAR_V32QI:
35643 case VOID_FTYPE_PCHAR_V16QI:
35644 case VOID_FTYPE_PFLOAT_V16SF:
35645 case VOID_FTYPE_PFLOAT_V8SF:
35646 case VOID_FTYPE_PFLOAT_V4SF:
35647 case VOID_FTYPE_PDOUBLE_V8DF:
35648 case VOID_FTYPE_PDOUBLE_V4DF:
35649 case VOID_FTYPE_PDOUBLE_V2DF:
35650 case VOID_FTYPE_PLONGLONG_LONGLONG:
35651 case VOID_FTYPE_PULONGLONG_ULONGLONG:
35652 case VOID_FTYPE_PINT_INT:
35653 nargs = 1;
35654 klass = store;
35655 /* Reserve memory operand for target. */
35656 memory = ARRAY_SIZE (args);
35657 switch (icode)
35658 {
35659 /* These builtins and instructions require the memory
35660 to be properly aligned. */
35661 case CODE_FOR_avx_movntv4di:
35662 case CODE_FOR_sse2_movntv2di:
35663 case CODE_FOR_avx_movntv8sf:
35664 case CODE_FOR_sse_movntv4sf:
35665 case CODE_FOR_sse4a_vmmovntv4sf:
35666 case CODE_FOR_avx_movntv4df:
35667 case CODE_FOR_sse2_movntv2df:
35668 case CODE_FOR_sse4a_vmmovntv2df:
35669 case CODE_FOR_sse2_movntidi:
35670 case CODE_FOR_sse_movntq:
35671 case CODE_FOR_sse2_movntisi:
35672 case CODE_FOR_avx512f_movntv16sf:
35673 case CODE_FOR_avx512f_movntv8df:
35674 case CODE_FOR_avx512f_movntv8di:
35675 aligned_mem = true;
35676 break;
35677 default:
35678 break;
35679 }
35680 break;
35681 case V4SF_FTYPE_V4SF_PCV2SF:
35682 case V2DF_FTYPE_V2DF_PCDOUBLE:
35683 nargs = 2;
35684 klass = load;
35685 memory = 1;
35686 break;
35687 case V8SF_FTYPE_PCV8SF_V8SI:
35688 case V4DF_FTYPE_PCV4DF_V4DI:
35689 case V4SF_FTYPE_PCV4SF_V4SI:
35690 case V2DF_FTYPE_PCV2DF_V2DI:
35691 case V8SI_FTYPE_PCV8SI_V8SI:
35692 case V4DI_FTYPE_PCV4DI_V4DI:
35693 case V4SI_FTYPE_PCV4SI_V4SI:
35694 case V2DI_FTYPE_PCV2DI_V2DI:
35695 nargs = 2;
35696 klass = load;
35697 memory = 0;
35698 break;
35699 case VOID_FTYPE_PV8DF_V8DF_UQI:
35700 case VOID_FTYPE_PV4DF_V4DF_UQI:
35701 case VOID_FTYPE_PV2DF_V2DF_UQI:
35702 case VOID_FTYPE_PV16SF_V16SF_UHI:
35703 case VOID_FTYPE_PV8SF_V8SF_UQI:
35704 case VOID_FTYPE_PV4SF_V4SF_UQI:
35705 case VOID_FTYPE_PV8DI_V8DI_UQI:
35706 case VOID_FTYPE_PV4DI_V4DI_UQI:
35707 case VOID_FTYPE_PV2DI_V2DI_UQI:
35708 case VOID_FTYPE_PV16SI_V16SI_UHI:
35709 case VOID_FTYPE_PV8SI_V8SI_UQI:
35710 case VOID_FTYPE_PV4SI_V4SI_UQI:
35711 switch (icode)
35712 {
35713 /* These builtins and instructions require the memory
35714 to be properly aligned. */
35715 case CODE_FOR_avx512f_storev16sf_mask:
35716 case CODE_FOR_avx512f_storev16si_mask:
35717 case CODE_FOR_avx512f_storev8df_mask:
35718 case CODE_FOR_avx512f_storev8di_mask:
35719 case CODE_FOR_avx512vl_storev8sf_mask:
35720 case CODE_FOR_avx512vl_storev8si_mask:
35721 case CODE_FOR_avx512vl_storev4df_mask:
35722 case CODE_FOR_avx512vl_storev4di_mask:
35723 case CODE_FOR_avx512vl_storev4sf_mask:
35724 case CODE_FOR_avx512vl_storev4si_mask:
35725 case CODE_FOR_avx512vl_storev2df_mask:
35726 case CODE_FOR_avx512vl_storev2di_mask:
35727 aligned_mem = true;
35728 break;
35729 default:
35730 break;
35731 }
35732 /* FALLTHRU */
35733 case VOID_FTYPE_PV8SF_V8SI_V8SF:
35734 case VOID_FTYPE_PV4DF_V4DI_V4DF:
35735 case VOID_FTYPE_PV4SF_V4SI_V4SF:
35736 case VOID_FTYPE_PV2DF_V2DI_V2DF:
35737 case VOID_FTYPE_PV8SI_V8SI_V8SI:
35738 case VOID_FTYPE_PV4DI_V4DI_V4DI:
35739 case VOID_FTYPE_PV4SI_V4SI_V4SI:
35740 case VOID_FTYPE_PV2DI_V2DI_V2DI:
35741 case VOID_FTYPE_PV8SI_V8DI_UQI:
35742 case VOID_FTYPE_PV8HI_V8DI_UQI:
35743 case VOID_FTYPE_PV16HI_V16SI_UHI:
35744 case VOID_FTYPE_PV16QI_V8DI_UQI:
35745 case VOID_FTYPE_PV16QI_V16SI_UHI:
35746 case VOID_FTYPE_PV4SI_V4DI_UQI:
35747 case VOID_FTYPE_PV4SI_V2DI_UQI:
35748 case VOID_FTYPE_PV8HI_V4DI_UQI:
35749 case VOID_FTYPE_PV8HI_V2DI_UQI:
35750 case VOID_FTYPE_PV8HI_V8SI_UQI:
35751 case VOID_FTYPE_PV8HI_V4SI_UQI:
35752 case VOID_FTYPE_PV16QI_V4DI_UQI:
35753 case VOID_FTYPE_PV16QI_V2DI_UQI:
35754 case VOID_FTYPE_PV16QI_V8SI_UQI:
35755 case VOID_FTYPE_PV16QI_V4SI_UQI:
35756 case VOID_FTYPE_PCHAR_V64QI_UDI:
35757 case VOID_FTYPE_PCHAR_V32QI_USI:
35758 case VOID_FTYPE_PCHAR_V16QI_UHI:
35759 case VOID_FTYPE_PSHORT_V32HI_USI:
35760 case VOID_FTYPE_PSHORT_V16HI_UHI:
35761 case VOID_FTYPE_PSHORT_V8HI_UQI:
35762 case VOID_FTYPE_PINT_V16SI_UHI:
35763 case VOID_FTYPE_PINT_V8SI_UQI:
35764 case VOID_FTYPE_PINT_V4SI_UQI:
35765 case VOID_FTYPE_PINT64_V8DI_UQI:
35766 case VOID_FTYPE_PINT64_V4DI_UQI:
35767 case VOID_FTYPE_PINT64_V2DI_UQI:
35768 case VOID_FTYPE_PDOUBLE_V8DF_UQI:
35769 case VOID_FTYPE_PDOUBLE_V4DF_UQI:
35770 case VOID_FTYPE_PDOUBLE_V2DF_UQI:
35771 case VOID_FTYPE_PFLOAT_V16SF_UHI:
35772 case VOID_FTYPE_PFLOAT_V8SF_UQI:
35773 case VOID_FTYPE_PFLOAT_V4SF_UQI:
35774 nargs = 2;
35775 klass = store;
35776 /* Reserve memory operand for target. */
35777 memory = ARRAY_SIZE (args);
35778 break;
35779 case V4SF_FTYPE_PCV4SF_V4SF_UQI:
35780 case V8SF_FTYPE_PCV8SF_V8SF_UQI:
35781 case V16SF_FTYPE_PCV16SF_V16SF_UHI:
35782 case V4SI_FTYPE_PCV4SI_V4SI_UQI:
35783 case V8SI_FTYPE_PCV8SI_V8SI_UQI:
35784 case V16SI_FTYPE_PCV16SI_V16SI_UHI:
35785 case V2DF_FTYPE_PCV2DF_V2DF_UQI:
35786 case V4DF_FTYPE_PCV4DF_V4DF_UQI:
35787 case V8DF_FTYPE_PCV8DF_V8DF_UQI:
35788 case V2DI_FTYPE_PCV2DI_V2DI_UQI:
35789 case V4DI_FTYPE_PCV4DI_V4DI_UQI:
35790 case V8DI_FTYPE_PCV8DI_V8DI_UQI:
35791 switch (icode)
35792 {
35793 /* These builtins and instructions require the memory
35794 to be properly aligned. */
35795 case CODE_FOR_avx512f_loadv16sf_mask:
35796 case CODE_FOR_avx512f_loadv16si_mask:
35797 case CODE_FOR_avx512f_loadv8df_mask:
35798 case CODE_FOR_avx512f_loadv8di_mask:
35799 case CODE_FOR_avx512vl_loadv8sf_mask:
35800 case CODE_FOR_avx512vl_loadv8si_mask:
35801 case CODE_FOR_avx512vl_loadv4df_mask:
35802 case CODE_FOR_avx512vl_loadv4di_mask:
35803 case CODE_FOR_avx512vl_loadv4sf_mask:
35804 case CODE_FOR_avx512vl_loadv4si_mask:
35805 case CODE_FOR_avx512vl_loadv2df_mask:
35806 case CODE_FOR_avx512vl_loadv2di_mask:
35807 case CODE_FOR_avx512bw_loadv64qi_mask:
35808 case CODE_FOR_avx512vl_loadv32qi_mask:
35809 case CODE_FOR_avx512vl_loadv16qi_mask:
35810 case CODE_FOR_avx512bw_loadv32hi_mask:
35811 case CODE_FOR_avx512vl_loadv16hi_mask:
35812 case CODE_FOR_avx512vl_loadv8hi_mask:
35813 aligned_mem = true;
35814 break;
35815 default:
35816 break;
35817 }
/* FALLTHRU */
35818 case V64QI_FTYPE_PCCHAR_V64QI_UDI:
35819 case V32QI_FTYPE_PCCHAR_V32QI_USI:
35820 case V16QI_FTYPE_PCCHAR_V16QI_UHI:
35821 case V32HI_FTYPE_PCSHORT_V32HI_USI:
35822 case V16HI_FTYPE_PCSHORT_V16HI_UHI:
35823 case V8HI_FTYPE_PCSHORT_V8HI_UQI:
35824 case V16SI_FTYPE_PCINT_V16SI_UHI:
35825 case V8SI_FTYPE_PCINT_V8SI_UQI:
35826 case V4SI_FTYPE_PCINT_V4SI_UQI:
35827 case V8DI_FTYPE_PCINT64_V8DI_UQI:
35828 case V4DI_FTYPE_PCINT64_V4DI_UQI:
35829 case V2DI_FTYPE_PCINT64_V2DI_UQI:
35830 case V8DF_FTYPE_PCDOUBLE_V8DF_UQI:
35831 case V4DF_FTYPE_PCDOUBLE_V4DF_UQI:
35832 case V2DF_FTYPE_PCDOUBLE_V2DF_UQI:
35833 case V16SF_FTYPE_PCFLOAT_V16SF_UHI:
35834 case V8SF_FTYPE_PCFLOAT_V8SF_UQI:
35835 case V4SF_FTYPE_PCFLOAT_V4SF_UQI:
35836 nargs = 3;
35837 klass = load;
35838 memory = 0;
35839 break;
35840 case VOID_FTYPE_UINT_UINT_UINT:
35841 case VOID_FTYPE_UINT64_UINT_UINT:
35842 case UCHAR_FTYPE_UINT_UINT_UINT:
35843 case UCHAR_FTYPE_UINT64_UINT_UINT:
35844 nargs = 3;
35845 klass = load;
35846 memory = ARRAY_SIZE (args);
35847 last_arg_constant = true;
35848 break;
35849 default:
35850 gcc_unreachable ();
35851 }
35852
35853 gcc_assert (nargs <= ARRAY_SIZE (args));
35854
35855 if (klass == store)
35856 {
35857 arg = CALL_EXPR_ARG (exp, 0);
35858 op = expand_normal (arg);
35859 gcc_assert (target == 0);
35860 if (memory)
35861 {
35862 op = ix86_zero_extend_to_Pmode (op);
35863 target = gen_rtx_MEM (tmode, op);
35864 /* target at this point has just BITS_PER_UNIT MEM_ALIGN
35865 on it. Try to improve it using get_pointer_alignment,
35866 and if the special builtin is one that requires strict
35867 mode alignment, also from its GET_MODE_ALIGNMENT.
35868 Failure to do so could lead to ix86_legitimate_combined_insn
35869 rejecting all changes to such insns. */
35870 unsigned int align = get_pointer_alignment (arg);
35871 if (aligned_mem && align < GET_MODE_ALIGNMENT (tmode))
35872 align = GET_MODE_ALIGNMENT (tmode);
35873 if (MEM_ALIGN (target) < align)
35874 set_mem_align (target, align);
35875 }
35876 else
35877 target = force_reg (tmode, op);
35878 arg_adjust = 1;
35879 }
35880 else
35881 {
35882 arg_adjust = 0;
35883 if (optimize
35884 || target == 0
35885 || !register_operand (target, tmode)
35886 || GET_MODE (target) != tmode)
35887 target = gen_reg_rtx (tmode);
35888 }
35889
35890 for (i = 0; i < nargs; i++)
35891 {
35892 machine_mode mode = insn_p->operand[i + 1].mode;
35893 bool match;
35894
35895 arg = CALL_EXPR_ARG (exp, i + arg_adjust);
35896 op = expand_normal (arg);
35897 match = insn_p->operand[i + 1].predicate (op, mode);
35898
35899 if (last_arg_constant && (i + 1) == nargs)
35900 {
35901 if (!match)
35902 {
35903 if (icode == CODE_FOR_lwp_lwpvalsi3
35904 || icode == CODE_FOR_lwp_lwpinssi3
35905 || icode == CODE_FOR_lwp_lwpvaldi3
35906 || icode == CODE_FOR_lwp_lwpinsdi3)
35907 error ("the last argument must be a 32-bit immediate");
35908 else
35909 error ("the last argument must be an 8-bit immediate");
35910 return const0_rtx;
35911 }
35912 }
35913 else
35914 {
35915 if (i == memory)
35916 {
35917 /* This must be the memory operand. */
35918 op = ix86_zero_extend_to_Pmode (op);
35919 op = gen_rtx_MEM (mode, op);
35920 /* op at this point has just BITS_PER_UNIT MEM_ALIGN
35921 on it. Try to improve it using get_pointer_alignment,
35922 and if the special builtin is one that requires strict
35923 mode alignment, also from its GET_MODE_ALIGNMENT.
35924 Failure to do so could lead to ix86_legitimate_combined_insn
35925 rejecting all changes to such insns. */
35926 unsigned int align = get_pointer_alignment (arg);
35927 if (aligned_mem && align < GET_MODE_ALIGNMENT (mode))
35928 align = GET_MODE_ALIGNMENT (mode);
35929 if (MEM_ALIGN (op) < align)
35930 set_mem_align (op, align);
35931 }
35932 else
35933 {
35934 /* This must be a register. */
35935 if (VECTOR_MODE_P (mode))
35936 op = safe_vector_operand (op, mode);
35937
35938 op = fixup_modeless_constant (op, mode);
35939
35940 if (GET_MODE (op) == mode || GET_MODE (op) == VOIDmode)
35941 op = copy_to_mode_reg (mode, op);
35942 else
35943 {
35944 op = copy_to_reg (op);
35945 op = lowpart_subreg (mode, op, GET_MODE (op));
35946 }
35947 }
35948 }
35949
35950 args[i].op = op;
35951 args[i].mode = mode;
35952 }
35953
35954 switch (nargs)
35955 {
35956 case 0:
35957 pat = GEN_FCN (icode) (target);
35958 break;
35959 case 1:
35960 pat = GEN_FCN (icode) (target, args[0].op);
35961 break;
35962 case 2:
35963 pat = GEN_FCN (icode) (target, args[0].op, args[1].op);
35964 break;
35965 case 3:
35966 pat = GEN_FCN (icode) (target, args[0].op, args[1].op, args[2].op);
35967 break;
35968 default:
35969 gcc_unreachable ();
35970 }
35971
35972 if (! pat)
35973 return 0;
35974 emit_insn (pat);
35975 return klass == store ? 0 : target;
35976 }
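/* Illustrative note (not part of the original source): a sketch of the
   klass == store path in ix86_expand_special_args_builtin.  Assuming the
   usual immintrin.h spelling, a non-temporal store such as

     _mm256_stream_ps (p, v);	/* VOID_FTYPE_PFLOAT_V8SF */

   makes the pointer argument the MEM target, and because its icode
   (CODE_FOR_avx_movntv8sf) is in the aligned_mem list the MEM's alignment
   is raised to GET_MODE_ALIGNMENT of the vector mode, so later passes do
   not reject the movnt insn.  */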
35977
35978 /* Return the integer constant in ARG. Constrain it to be in the range
35979 of the subparts of VEC_TYPE; issue an error if not. */
35980
35981 static int
35982 get_element_number (tree vec_type, tree arg)
35983 {
35984 unsigned HOST_WIDE_INT elt, max = TYPE_VECTOR_SUBPARTS (vec_type) - 1;
35985
35986 if (!tree_fits_uhwi_p (arg)
35987 || (elt = tree_to_uhwi (arg), elt > max))
35988 {
35989 error ("selector must be an integer constant in the range 0..%wi", max);
35990 return 0;
35991 }
35992
35993 return elt;
35994 }
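/* Illustrative note (not part of the original source): as a sketch, for a
   V8HI vector get_element_number accepts selectors 0..7, so something like
   _mm_extract_epi16 (v, 3) is fine, while a selector that does not fold to
   an integer constant in that range is rejected with the error above.  */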
35995
35996 /* A subroutine of ix86_expand_builtin. These builtins are a wrapper around
35997 ix86_expand_vector_init. We DO have language-level syntax for this, in
35998 the form of (type){ init-list }. Except that since we can't place emms
35999 instructions from inside the compiler, we can't allow the use of MMX
36000 registers unless the user explicitly asks for it. So we do *not* define
36001 vec_set/vec_extract/vec_init patterns for MMX modes in mmx.md. Instead
36002 we have builtins invoked by mmintrin.h that give us license to emit
36003 these sorts of instructions. */
36004
36005 static rtx
36006 ix86_expand_vec_init_builtin (tree type, tree exp, rtx target)
36007 {
36008 machine_mode tmode = TYPE_MODE (type);
36009 machine_mode inner_mode = GET_MODE_INNER (tmode);
36010 int i, n_elt = GET_MODE_NUNITS (tmode);
36011 rtvec v = rtvec_alloc (n_elt);
36012
36013 gcc_assert (VECTOR_MODE_P (tmode));
36014 gcc_assert (call_expr_nargs (exp) == n_elt);
36015
36016 for (i = 0; i < n_elt; ++i)
36017 {
36018 rtx x = expand_normal (CALL_EXPR_ARG (exp, i));
36019 RTVEC_ELT (v, i) = gen_lowpart (inner_mode, x);
36020 }
36021
36022 if (!target || !register_operand (target, tmode))
36023 target = gen_reg_rtx (tmode);
36024
36025 ix86_expand_vector_init (true, target, gen_rtx_PARALLEL (tmode, v));
36026 return target;
36027 }
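/* Illustrative note (not part of the original source): a sketch of the MMX
   init path.  mmintrin.h implements _mm_setr_pi32 roughly as

     (__m64) __builtin_ia32_vec_init_v2si (__i0, __i1);

   and IX86_BUILTIN_VEC_INIT_V2SI routes that builtin to the function above,
   so the element list is lowered through ix86_expand_vector_init instead of
   a vec_init pattern in mmx.md.  */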
36028
36029 /* A subroutine of ix86_expand_builtin. These builtins are a wrapper around
36030 ix86_expand_vector_extract. They would be redundant (for non-MMX) if we
36031 had a language-level syntax for referencing vector elements. */
36032
36033 static rtx
36034 ix86_expand_vec_ext_builtin (tree exp, rtx target)
36035 {
36036 machine_mode tmode, mode0;
36037 tree arg0, arg1;
36038 int elt;
36039 rtx op0;
36040
36041 arg0 = CALL_EXPR_ARG (exp, 0);
36042 arg1 = CALL_EXPR_ARG (exp, 1);
36043
36044 op0 = expand_normal (arg0);
36045 elt = get_element_number (TREE_TYPE (arg0), arg1);
36046
36047 tmode = TYPE_MODE (TREE_TYPE (TREE_TYPE (arg0)));
36048 mode0 = TYPE_MODE (TREE_TYPE (arg0));
36049 gcc_assert (VECTOR_MODE_P (mode0));
36050
36051 op0 = force_reg (mode0, op0);
36052
36053 if (optimize || !target || !register_operand (target, tmode))
36054 target = gen_reg_rtx (tmode);
36055
36056 ix86_expand_vector_extract (true, target, op0, elt);
36057
36058 return target;
36059 }
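/* Illustrative note (not part of the original source): as a sketch,
   emmintrin.h expands _mm_extract_epi16 (v, 3) to
   __builtin_ia32_vec_ext_v8hi (v, 3); IX86_BUILTIN_VEC_EXT_V8HI routes that
   here, get_element_number validates the selector and
   ix86_expand_vector_extract emits the actual extraction.  */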
36060
36061 /* A subroutine of ix86_expand_builtin. These builtins are a wrapper around
36062 ix86_expand_vector_set. They would be redundant (for non-MMX) if we had
36063 a language-level syntax for referencing vector elements. */
36064
36065 static rtx
36066 ix86_expand_vec_set_builtin (tree exp)
36067 {
36068 machine_mode tmode, mode1;
36069 tree arg0, arg1, arg2;
36070 int elt;
36071 rtx op0, op1, target;
36072
36073 arg0 = CALL_EXPR_ARG (exp, 0);
36074 arg1 = CALL_EXPR_ARG (exp, 1);
36075 arg2 = CALL_EXPR_ARG (exp, 2);
36076
36077 tmode = TYPE_MODE (TREE_TYPE (arg0));
36078 mode1 = TYPE_MODE (TREE_TYPE (TREE_TYPE (arg0)));
36079 gcc_assert (VECTOR_MODE_P (tmode));
36080
36081 op0 = expand_expr (arg0, NULL_RTX, tmode, EXPAND_NORMAL);
36082 op1 = expand_expr (arg1, NULL_RTX, mode1, EXPAND_NORMAL);
36083 elt = get_element_number (TREE_TYPE (arg0), arg2);
36084
36085 if (GET_MODE (op1) != mode1 && GET_MODE (op1) != VOIDmode)
36086 op1 = convert_modes (mode1, GET_MODE (op1), op1, true);
36087
36088 op0 = force_reg (tmode, op0);
36089 op1 = force_reg (mode1, op1);
36090
36091 /* OP0 is the source of these builtin functions and shouldn't be
36092 modified. Create a copy, use it and return it as target. */
36093 target = gen_reg_rtx (tmode);
36094 emit_move_insn (target, op0);
36095 ix86_expand_vector_set (true, target, op1, elt);
36096
36097 return target;
36098 }
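/* Illustrative note (not part of the original source): as a sketch,
   emmintrin.h expands _mm_insert_epi16 (v, x, 3) to
   __builtin_ia32_vec_set_v8hi (v, x, 3); the expander above copies the
   source vector to a fresh register first, so the builtin's input operand
   is never modified in place.  */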
36099
36100 /* Emit conditional move of SRC to DST with condition
36101 OP1 CODE OP2. */
36102 static void
36103 ix86_emit_cmove (rtx dst, rtx src, enum rtx_code code, rtx op1, rtx op2)
36104 {
36105 rtx t;
36106
36107 if (TARGET_CMOVE)
36108 {
36109 t = ix86_expand_compare (code, op1, op2);
36110 emit_insn (gen_rtx_SET (dst, gen_rtx_IF_THEN_ELSE (GET_MODE (dst), t,
36111 src, dst)));
36112 }
36113 else
36114 {
36115 rtx_code_label *nomove = gen_label_rtx ();
36116 emit_cmp_and_jump_insns (op1, op2, reverse_condition (code),
36117 const0_rtx, GET_MODE (op1), 1, nomove);
36118 emit_move_insn (dst, src);
36119 emit_label (nomove);
36120 }
36121 }
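/* Illustrative note (not part of the original source): a sketch of the two
   shapes ix86_emit_cmove produces for "if (op1 CODE op2) dst = src":

     TARGET_CMOVE:                      !TARGET_CMOVE:
       flags = compare (op1, op2);        flags = compare (op1, op2);
       dst = cond ? src : dst;            if (!cond) goto nomove;
                                          dst = src;
                                        nomove:

   The fallback branches around the move using the reversed condition,
   which is why reverse_condition is applied above.  */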
36122
36123 /* Choose the max of DST and SRC and put it in DST. */
36124 static void
36125 ix86_emit_move_max (rtx dst, rtx src)
36126 {
36127 ix86_emit_cmove (dst, src, LTU, dst, src);
36128 }
36129
36130 /* Expand an expression EXP that calls a built-in function,
36131 with result going to TARGET if that's convenient
36132 (and in mode MODE if that's convenient).
36133 SUBTARGET may be used as the target for computing one of EXP's operands.
36134 IGNORE is nonzero if the value is to be ignored. */
36135
36136 static rtx
36137 ix86_expand_builtin (tree exp, rtx target, rtx subtarget,
36138 machine_mode mode, int ignore)
36139 {
36140 size_t i;
36141 enum insn_code icode;
36142 tree fndecl = TREE_OPERAND (CALL_EXPR_FN (exp), 0);
36143 tree arg0, arg1, arg2, arg3, arg4;
36144 rtx op0, op1, op2, op3, op4, pat, insn;
36145 machine_mode mode0, mode1, mode2, mode3, mode4;
36146 unsigned int fcode = DECL_FUNCTION_CODE (fndecl);
36147
36148 /* For CPU builtins that can be folded, fold first and expand the fold. */
36149 switch (fcode)
36150 {
36151 case IX86_BUILTIN_CPU_INIT:
36152 {
36153 /* Make it call __cpu_indicator_init in libgcc. */
36154 tree call_expr, fndecl, type;
36155 type = build_function_type_list (integer_type_node, NULL_TREE);
36156 fndecl = build_fn_decl ("__cpu_indicator_init", type);
36157 call_expr = build_call_expr (fndecl, 0);
36158 return expand_expr (call_expr, target, mode, EXPAND_NORMAL);
36159 }
36160 case IX86_BUILTIN_CPU_IS:
36161 case IX86_BUILTIN_CPU_SUPPORTS:
36162 {
36163 tree arg0 = CALL_EXPR_ARG (exp, 0);
36164 tree fold_expr = fold_builtin_cpu (fndecl, &arg0);
36165 gcc_assert (fold_expr != NULL_TREE);
36166 return expand_expr (fold_expr, target, mode, EXPAND_NORMAL);
36167 }
36168 }
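/* Illustrative note (not part of the original source): the CPU builtins
   folded above are the ones used for function multi-versioning, e.g.
   (a sketch):

     __builtin_cpu_init ();
     if (__builtin_cpu_supports ("avx2"))
       ...use the AVX2 code path...

   IX86_BUILTIN_CPU_INIT becomes a call to __cpu_indicator_init in libgcc,
   while CPU_IS / CPU_SUPPORTS are folded by fold_builtin_cpu into tests of
   the CPU model data that call fills in.  */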
36169
36170 /* Determine whether the builtin function is available under the current ISA.
36171 Originally the builtin was not created if it wasn't applicable to the
36172 current ISA based on the command-line switches. With function-specific
36173 options, we need to check in the context of the function making the call
36174 whether it is supported. */
36175 if (ix86_builtins_isa[fcode].isa
36176 && !(ix86_builtins_isa[fcode].isa & ix86_isa_flags))
36177 {
36178 char *opts = ix86_target_string (ix86_builtins_isa[fcode].isa, 0, 0,
36179 NULL, NULL, (enum fpmath_unit) 0,
36180 false);
36181 if (!opts)
36182 error ("%qE needs unknown isa option", fndecl);
36183 else
36184 {
36185 gcc_assert (opts != NULL);
36186 error ("%qE needs isa option %s", fndecl, opts);
36187 free (opts);
36188 }
36189 return expand_call (exp, target, ignore);
36190 }
36191
36192 switch (fcode)
36193 {
36194 case IX86_BUILTIN_BNDMK:
36195 if (!target
36196 || GET_MODE (target) != BNDmode
36197 || !register_operand (target, BNDmode))
36198 target = gen_reg_rtx (BNDmode);
36199
36200 arg0 = CALL_EXPR_ARG (exp, 0);
36201 arg1 = CALL_EXPR_ARG (exp, 1);
36202
36203 op0 = expand_normal (arg0);
36204 op1 = expand_normal (arg1);
36205
36206 if (!register_operand (op0, Pmode))
36207 op0 = ix86_zero_extend_to_Pmode (op0);
36208 if (!register_operand (op1, Pmode))
36209 op1 = ix86_zero_extend_to_Pmode (op1);
36210
36211 /* Builtin arg1 is the size of the block, but instruction op1 should
36212 be (size - 1). */
36213 op1 = expand_simple_binop (Pmode, PLUS, op1, constm1_rtx,
36214 NULL_RTX, 1, OPTAB_DIRECT);
36215
36216 emit_insn (BNDmode == BND64mode
36217 ? gen_bnd64_mk (target, op0, op1)
36218 : gen_bnd32_mk (target, op0, op1));
36219 return target;
36220
36221 case IX86_BUILTIN_BNDSTX:
36222 arg0 = CALL_EXPR_ARG (exp, 0);
36223 arg1 = CALL_EXPR_ARG (exp, 1);
36224 arg2 = CALL_EXPR_ARG (exp, 2);
36225
36226 op0 = expand_normal (arg0);
36227 op1 = expand_normal (arg1);
36228 op2 = expand_normal (arg2);
36229
36230 if (!register_operand (op0, Pmode))
36231 op0 = ix86_zero_extend_to_Pmode (op0);
36232 if (!register_operand (op1, BNDmode))
36233 op1 = copy_to_mode_reg (BNDmode, op1);
36234 if (!register_operand (op2, Pmode))
36235 op2 = ix86_zero_extend_to_Pmode (op2);
36236
36237 emit_insn (BNDmode == BND64mode
36238 ? gen_bnd64_stx (op2, op0, op1)
36239 : gen_bnd32_stx (op2, op0, op1));
36240 return 0;
36241
36242 case IX86_BUILTIN_BNDLDX:
36243 if (!target
36244 || GET_MODE (target) != BNDmode
36245 || !register_operand (target, BNDmode))
36246 target = gen_reg_rtx (BNDmode);
36247
36248 arg0 = CALL_EXPR_ARG (exp, 0);
36249 arg1 = CALL_EXPR_ARG (exp, 1);
36250
36251 op0 = expand_normal (arg0);
36252 op1 = expand_normal (arg1);
36253
36254 if (!register_operand (op0, Pmode))
36255 op0 = ix86_zero_extend_to_Pmode (op0);
36256 if (!register_operand (op1, Pmode))
36257 op1 = ix86_zero_extend_to_Pmode (op1);
36258
36259 emit_insn (BNDmode == BND64mode
36260 ? gen_bnd64_ldx (target, op0, op1)
36261 : gen_bnd32_ldx (target, op0, op1));
36262 return target;
36263
36264 case IX86_BUILTIN_BNDCL:
36265 arg0 = CALL_EXPR_ARG (exp, 0);
36266 arg1 = CALL_EXPR_ARG (exp, 1);
36267
36268 op0 = expand_normal (arg0);
36269 op1 = expand_normal (arg1);
36270
36271 if (!register_operand (op0, Pmode))
36272 op0 = ix86_zero_extend_to_Pmode (op0);
36273 if (!register_operand (op1, BNDmode))
36274 op1 = copy_to_mode_reg (BNDmode, op1);
36275
36276 emit_insn (BNDmode == BND64mode
36277 ? gen_bnd64_cl (op1, op0)
36278 : gen_bnd32_cl (op1, op0));
36279 return 0;
36280
36281 case IX86_BUILTIN_BNDCU:
36282 arg0 = CALL_EXPR_ARG (exp, 0);
36283 arg1 = CALL_EXPR_ARG (exp, 1);
36284
36285 op0 = expand_normal (arg0);
36286 op1 = expand_normal (arg1);
36287
36288 if (!register_operand (op0, Pmode))
36289 op0 = ix86_zero_extend_to_Pmode (op0);
36290 if (!register_operand (op1, BNDmode))
36291 op1 = copy_to_mode_reg (BNDmode, op1);
36292
36293 emit_insn (BNDmode == BND64mode
36294 ? gen_bnd64_cu (op1, op0)
36295 : gen_bnd32_cu (op1, op0));
36296 return 0;
36297
36298 case IX86_BUILTIN_BNDRET:
36299 arg0 = CALL_EXPR_ARG (exp, 0);
36300 gcc_assert (TREE_CODE (arg0) == SSA_NAME);
36301 target = chkp_get_rtl_bounds (arg0);
36302
36303 /* If no bounds were specified for the returned value,
36304 then use INIT bounds. This usually happens when
36305 some built-in function is expanded. */
36306 if (!target)
36307 {
36308 rtx t1 = gen_reg_rtx (Pmode);
36309 rtx t2 = gen_reg_rtx (Pmode);
36310 target = gen_reg_rtx (BNDmode);
36311 emit_move_insn (t1, const0_rtx);
36312 emit_move_insn (t2, constm1_rtx);
36313 emit_insn (BNDmode == BND64mode
36314 ? gen_bnd64_mk (target, t1, t2)
36315 : gen_bnd32_mk (target, t1, t2));
36316 }
36317
36318 gcc_assert (target && REG_P (target));
36319 return target;
36320
36321 case IX86_BUILTIN_BNDNARROW:
36322 {
36323 rtx m1, m1h1, m1h2, lb, ub, t1;
36324
36325 /* Return value and lb. */
36326 arg0 = CALL_EXPR_ARG (exp, 0);
36327 /* Bounds. */
36328 arg1 = CALL_EXPR_ARG (exp, 1);
36329 /* Size. */
36330 arg2 = CALL_EXPR_ARG (exp, 2);
36331
36332 lb = expand_normal (arg0);
36333 op1 = expand_normal (arg1);
36334 op2 = expand_normal (arg2);
36335
36336 /* Size was passed, but we need to use (size - 1), as for bndmk. */
36337 op2 = expand_simple_binop (Pmode, PLUS, op2, constm1_rtx,
36338 NULL_RTX, 1, OPTAB_DIRECT);
36339
36340 /* Add LB to size and invert to get UB. */
36341 op2 = expand_simple_binop (Pmode, PLUS, op2, lb,
36342 op2, 1, OPTAB_DIRECT);
36343 ub = expand_simple_unop (Pmode, NOT, op2, op2, 1);
36344
36345 if (!register_operand (lb, Pmode))
36346 lb = ix86_zero_extend_to_Pmode (lb);
36347 if (!register_operand (ub, Pmode))
36348 ub = ix86_zero_extend_to_Pmode (ub);
36349
36350 /* We need to move bounds to memory before any computations. */
36351 if (MEM_P (op1))
36352 m1 = op1;
36353 else
36354 {
36355 m1 = assign_386_stack_local (BNDmode, SLOT_TEMP);
36356 emit_move_insn (m1, op1);
36357 }
36358
36359 /* Generate mem expression to be used for access to LB and UB. */
36360 m1h1 = adjust_address (m1, Pmode, 0);
36361 m1h2 = adjust_address (m1, Pmode, GET_MODE_SIZE (Pmode));
36362
36363 t1 = gen_reg_rtx (Pmode);
36364
36365 /* Compute LB. */
36366 emit_move_insn (t1, m1h1);
36367 ix86_emit_move_max (t1, lb);
36368 emit_move_insn (m1h1, t1);
36369
36370 /* Compute UB. UB is stored in 1's complement form. Therefore
36371 we also use max here. */
36372 emit_move_insn (t1, m1h2);
36373 ix86_emit_move_max (t1, ub);
36374 emit_move_insn (m1h2, t1);
36375
36376 op2 = gen_reg_rtx (BNDmode);
36377 emit_move_insn (op2, m1);
36378
36379 return chkp_join_splitted_slot (lb, op2);
36380 }
36381
36382 case IX86_BUILTIN_BNDINT:
36383 {
36384 rtx res, rh1, rh2, lb1, lb2, ub1, ub2;
36385
36386 if (!target
36387 || GET_MODE (target) != BNDmode
36388 || !register_operand (target, BNDmode))
36389 target = gen_reg_rtx (BNDmode);
36390
36391 arg0 = CALL_EXPR_ARG (exp, 0);
36392 arg1 = CALL_EXPR_ARG (exp, 1);
36393
36394 op0 = expand_normal (arg0);
36395 op1 = expand_normal (arg1);
36396
36397 res = assign_386_stack_local (BNDmode, SLOT_TEMP);
36398 rh1 = adjust_address (res, Pmode, 0);
36399 rh2 = adjust_address (res, Pmode, GET_MODE_SIZE (Pmode));
36400
36401 /* Put the first bounds into temporaries. */
36402 lb1 = gen_reg_rtx (Pmode);
36403 ub1 = gen_reg_rtx (Pmode);
36404 if (MEM_P (op0))
36405 {
36406 emit_move_insn (lb1, adjust_address (op0, Pmode, 0));
36407 emit_move_insn (ub1, adjust_address (op0, Pmode,
36408 GET_MODE_SIZE (Pmode)));
36409 }
36410 else
36411 {
36412 emit_move_insn (res, op0);
36413 emit_move_insn (lb1, rh1);
36414 emit_move_insn (ub1, rh2);
36415 }
36416
36417 /* Put the second bounds into temporaries. */
36418 lb2 = gen_reg_rtx (Pmode);
36419 ub2 = gen_reg_rtx (Pmode);
36420 if (MEM_P (op1))
36421 {
36422 emit_move_insn (lb2, adjust_address (op1, Pmode, 0));
36423 emit_move_insn (ub2, adjust_address (op1, Pmode,
36424 GET_MODE_SIZE (Pmode)));
36425 }
36426 else
36427 {
36428 emit_move_insn (res, op1);
36429 emit_move_insn (lb2, rh1);
36430 emit_move_insn (ub2, rh2);
36431 }
36432
36433 /* Compute LB. */
36434 ix86_emit_move_max (lb1, lb2);
36435 emit_move_insn (rh1, lb1);
36436
36437 /* Compute UB. UB is stored in 1's complement form. Therefore
36438 we also use max here. */
36439 ix86_emit_move_max (ub1, ub2);
36440 emit_move_insn (rh2, ub1);
36441
36442 emit_move_insn (target, res);
36443
36444 return target;
36445 }
36446
36447 case IX86_BUILTIN_SIZEOF:
36448 {
36449 tree name;
36450 rtx symbol;
36451
36452 if (!target
36453 || GET_MODE (target) != Pmode
36454 || !register_operand (target, Pmode))
36455 target = gen_reg_rtx (Pmode);
36456
36457 arg0 = CALL_EXPR_ARG (exp, 0);
36458 gcc_assert (TREE_CODE (arg0) == VAR_DECL);
36459
36460 name = DECL_ASSEMBLER_NAME (arg0);
36461 symbol = gen_rtx_SYMBOL_REF (Pmode, IDENTIFIER_POINTER (name));
36462
36463 emit_insn (Pmode == SImode
36464 ? gen_move_size_reloc_si (target, symbol)
36465 : gen_move_size_reloc_di (target, symbol));
36466
36467 return target;
36468 }
36469
36470 case IX86_BUILTIN_BNDLOWER:
36471 {
36472 rtx mem, hmem;
36473
36474 if (!target
36475 || GET_MODE (target) != Pmode
36476 || !register_operand (target, Pmode))
36477 target = gen_reg_rtx (Pmode);
36478
36479 arg0 = CALL_EXPR_ARG (exp, 0);
36480 op0 = expand_normal (arg0);
36481
36482 /* We need to move bounds to memory first. */
36483 if (MEM_P (op0))
36484 mem = op0;
36485 else
36486 {
36487 mem = assign_386_stack_local (BNDmode, SLOT_TEMP);
36488 emit_move_insn (mem, op0);
36489 }
36490
36491 /* Generate mem expression to access LB and load it. */
36492 hmem = adjust_address (mem, Pmode, 0);
36493 emit_move_insn (target, hmem);
36494
36495 return target;
36496 }
36497
36498 case IX86_BUILTIN_BNDUPPER:
36499 {
36500 rtx mem, hmem, res;
36501
36502 if (!target
36503 || GET_MODE (target) != Pmode
36504 || !register_operand (target, Pmode))
36505 target = gen_reg_rtx (Pmode);
36506
36507 arg0 = CALL_EXPR_ARG (exp, 0);
36508 op0 = expand_normal (arg0);
36509
36510 /* We need to move bounds to memory first. */
36511 if (MEM_P (op0))
36512 mem = op0;
36513 else
36514 {
36515 mem = assign_386_stack_local (BNDmode, SLOT_TEMP);
36516 emit_move_insn (mem, op0);
36517 }
36518
36519 /* Generate mem expression to access UB. */
36520 hmem = adjust_address (mem, Pmode, GET_MODE_SIZE (Pmode));
36521
36522 /* We need to invert all bits of UB. */
36523 res = expand_simple_unop (Pmode, NOT, hmem, target, 1);
36524
36525 if (res != target)
36526 emit_move_insn (target, res);
36527
36528 return target;
36529 }
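/* Illustrative note (not part of the original source): the IX86_BUILTIN_BND*
   cases above implement the MPX / Pointer Bounds Checker builtins, reached
   from user code such as (a sketch, assuming the __builtin___bnd_* spellings
   from chkp-builtins.def):

     q = __builtin___bnd_set_ptr_bounds (p, size);	/* bndmk */
     lb = __builtin___bnd_get_ptr_lbound (q);		/* BNDLOWER */
     ub = __builtin___bnd_get_ptr_ubound (q);		/* BNDUPPER */

   The upper bound lives in one's-complement form in the bound register,
   which is why BNDUPPER inverts the loaded value and why the max-based
   helpers above work for both bounds.  */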
36530
36531 case IX86_BUILTIN_MASKMOVQ:
36532 case IX86_BUILTIN_MASKMOVDQU:
36533 icode = (fcode == IX86_BUILTIN_MASKMOVQ
36534 ? CODE_FOR_mmx_maskmovq
36535 : CODE_FOR_sse2_maskmovdqu);
36536 /* Note the arg order is different from the operand order. */
36537 arg1 = CALL_EXPR_ARG (exp, 0);
36538 arg2 = CALL_EXPR_ARG (exp, 1);
36539 arg0 = CALL_EXPR_ARG (exp, 2);
36540 op0 = expand_normal (arg0);
36541 op1 = expand_normal (arg1);
36542 op2 = expand_normal (arg2);
36543 mode0 = insn_data[icode].operand[0].mode;
36544 mode1 = insn_data[icode].operand[1].mode;
36545 mode2 = insn_data[icode].operand[2].mode;
36546
36547 op0 = ix86_zero_extend_to_Pmode (op0);
36548 op0 = gen_rtx_MEM (mode1, op0);
36549
36550 if (!insn_data[icode].operand[0].predicate (op0, mode0))
36551 op0 = copy_to_mode_reg (mode0, op0);
36552 if (!insn_data[icode].operand[1].predicate (op1, mode1))
36553 op1 = copy_to_mode_reg (mode1, op1);
36554 if (!insn_data[icode].operand[2].predicate (op2, mode2))
36555 op2 = copy_to_mode_reg (mode2, op2);
36556 pat = GEN_FCN (icode) (op0, op1, op2);
36557 if (! pat)
36558 return 0;
36559 emit_insn (pat);
36560 return 0;
36561
36562 case IX86_BUILTIN_LDMXCSR:
36563 op0 = expand_normal (CALL_EXPR_ARG (exp, 0));
36564 target = assign_386_stack_local (SImode, SLOT_TEMP);
36565 emit_move_insn (target, op0);
36566 emit_insn (gen_sse_ldmxcsr (target));
36567 return 0;
36568
36569 case IX86_BUILTIN_STMXCSR:
36570 target = assign_386_stack_local (SImode, SLOT_TEMP);
36571 emit_insn (gen_sse_stmxcsr (target));
36572 return copy_to_mode_reg (SImode, target);
36573
36574 case IX86_BUILTIN_CLFLUSH:
36575 arg0 = CALL_EXPR_ARG (exp, 0);
36576 op0 = expand_normal (arg0);
36577 icode = CODE_FOR_sse2_clflush;
36578 if (!insn_data[icode].operand[0].predicate (op0, Pmode))
36579 op0 = ix86_zero_extend_to_Pmode (op0);
36580
36581 emit_insn (gen_sse2_clflush (op0));
36582 return 0;
36583
36584 case IX86_BUILTIN_CLWB:
36585 arg0 = CALL_EXPR_ARG (exp, 0);
36586 op0 = expand_normal (arg0);
36587 icode = CODE_FOR_clwb;
36588 if (!insn_data[icode].operand[0].predicate (op0, Pmode))
36589 op0 = ix86_zero_extend_to_Pmode (op0);
36590
36591 emit_insn (gen_clwb (op0));
36592 return 0;
36593
36594 case IX86_BUILTIN_CLFLUSHOPT:
36595 arg0 = CALL_EXPR_ARG (exp, 0);
36596 op0 = expand_normal (arg0);
36597 icode = CODE_FOR_clflushopt;
36598 if (!insn_data[icode].operand[0].predicate (op0, Pmode))
36599 op0 = ix86_zero_extend_to_Pmode (op0);
36600
36601 emit_insn (gen_clflushopt (op0));
36602 return 0;
36603
36604 case IX86_BUILTIN_MONITOR:
36605 case IX86_BUILTIN_MONITORX:
36606 arg0 = CALL_EXPR_ARG (exp, 0);
36607 arg1 = CALL_EXPR_ARG (exp, 1);
36608 arg2 = CALL_EXPR_ARG (exp, 2);
36609 op0 = expand_normal (arg0);
36610 op1 = expand_normal (arg1);
36611 op2 = expand_normal (arg2);
36612 if (!REG_P (op0))
36613 op0 = ix86_zero_extend_to_Pmode (op0);
36614 if (!REG_P (op1))
36615 op1 = copy_to_mode_reg (SImode, op1);
36616 if (!REG_P (op2))
36617 op2 = copy_to_mode_reg (SImode, op2);
36618
36619 emit_insn (fcode == IX86_BUILTIN_MONITOR
36620 ? ix86_gen_monitor (op0, op1, op2)
36621 : ix86_gen_monitorx (op0, op1, op2));
36622 return 0;
36623
36624 case IX86_BUILTIN_MWAIT:
36625 arg0 = CALL_EXPR_ARG (exp, 0);
36626 arg1 = CALL_EXPR_ARG (exp, 1);
36627 op0 = expand_normal (arg0);
36628 op1 = expand_normal (arg1);
36629 if (!REG_P (op0))
36630 op0 = copy_to_mode_reg (SImode, op0);
36631 if (!REG_P (op1))
36632 op1 = copy_to_mode_reg (SImode, op1);
36633 emit_insn (gen_sse3_mwait (op0, op1));
36634 return 0;
36635
36636 case IX86_BUILTIN_MWAITX:
36637 arg0 = CALL_EXPR_ARG (exp, 0);
36638 arg1 = CALL_EXPR_ARG (exp, 1);
36639 arg2 = CALL_EXPR_ARG (exp, 2);
36640 op0 = expand_normal (arg0);
36641 op1 = expand_normal (arg1);
36642 op2 = expand_normal (arg2);
36643 if (!REG_P (op0))
36644 op0 = copy_to_mode_reg (SImode, op0);
36645 if (!REG_P (op1))
36646 op1 = copy_to_mode_reg (SImode, op1);
36647 if (!REG_P (op2))
36648 op2 = copy_to_mode_reg (SImode, op2);
36649 emit_insn (gen_mwaitx (op0, op1, op2));
36650 return 0;
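/* Illustrative note (not part of the original source): the MONITOR/MWAIT
   cases above are reached from the pmmintrin.h intrinsics, e.g. (a sketch):

     _mm_monitor (addr, 0, 0);	/* IX86_BUILTIN_MONITOR */
     _mm_mwait (0, 0);		/* IX86_BUILTIN_MWAIT */

   (MONITORX/MWAITX are the AMD variants.)  The address operand is
   zero-extended to Pmode while the extension and hint arguments are kept
   in SImode registers, matching the operands of the patterns.  */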
36651
36652 case IX86_BUILTIN_CLZERO:
36653 arg0 = CALL_EXPR_ARG (exp, 0);
36654 op0 = expand_normal (arg0);
36655 if (!REG_P (op0))
36656 op0 = ix86_zero_extend_to_Pmode (op0);
36657 emit_insn (ix86_gen_clzero (op0));
36658 return 0;
36659
36660 case IX86_BUILTIN_VEC_INIT_V2SI:
36661 case IX86_BUILTIN_VEC_INIT_V4HI:
36662 case IX86_BUILTIN_VEC_INIT_V8QI:
36663 return ix86_expand_vec_init_builtin (TREE_TYPE (exp), exp, target);
36664
36665 case IX86_BUILTIN_VEC_EXT_V2DF:
36666 case IX86_BUILTIN_VEC_EXT_V2DI:
36667 case IX86_BUILTIN_VEC_EXT_V4SF:
36668 case IX86_BUILTIN_VEC_EXT_V4SI:
36669 case IX86_BUILTIN_VEC_EXT_V8HI:
36670 case IX86_BUILTIN_VEC_EXT_V2SI:
36671 case IX86_BUILTIN_VEC_EXT_V4HI:
36672 case IX86_BUILTIN_VEC_EXT_V16QI:
36673 return ix86_expand_vec_ext_builtin (exp, target);
36674
36675 case IX86_BUILTIN_VEC_SET_V2DI:
36676 case IX86_BUILTIN_VEC_SET_V4SF:
36677 case IX86_BUILTIN_VEC_SET_V4SI:
36678 case IX86_BUILTIN_VEC_SET_V8HI:
36679 case IX86_BUILTIN_VEC_SET_V4HI:
36680 case IX86_BUILTIN_VEC_SET_V16QI:
36681 return ix86_expand_vec_set_builtin (exp);
36682
36683 case IX86_BUILTIN_INFQ:
36684 case IX86_BUILTIN_HUGE_VALQ:
36685 {
36686 REAL_VALUE_TYPE inf;
36687 rtx tmp;
36688
36689 real_inf (&inf);
36690 tmp = const_double_from_real_value (inf, mode);
36691
36692 tmp = validize_mem (force_const_mem (mode, tmp));
36693
36694 if (target == 0)
36695 target = gen_reg_rtx (mode);
36696
36697 emit_move_insn (target, tmp);
36698 return target;
36699 }
36700
36701 case IX86_BUILTIN_NANQ:
36702 case IX86_BUILTIN_NANSQ:
36703 return expand_call (exp, target, ignore);
36704
36705 case IX86_BUILTIN_RDPMC:
36706 case IX86_BUILTIN_RDTSC:
36707 case IX86_BUILTIN_RDTSCP:
36708
36709 op0 = gen_reg_rtx (DImode);
36710 op1 = gen_reg_rtx (DImode);
36711
36712 if (fcode == IX86_BUILTIN_RDPMC)
36713 {
36714 arg0 = CALL_EXPR_ARG (exp, 0);
36715 op2 = expand_normal (arg0);
36716 if (!register_operand (op2, SImode))
36717 op2 = copy_to_mode_reg (SImode, op2);
36718
36719 insn = (TARGET_64BIT
36720 ? gen_rdpmc_rex64 (op0, op1, op2)
36721 : gen_rdpmc (op0, op2));
36722 emit_insn (insn);
36723 }
36724 else if (fcode == IX86_BUILTIN_RDTSC)
36725 {
36726 insn = (TARGET_64BIT
36727 ? gen_rdtsc_rex64 (op0, op1)
36728 : gen_rdtsc (op0));
36729 emit_insn (insn);
36730 }
36731 else
36732 {
36733 op2 = gen_reg_rtx (SImode);
36734
36735 insn = (TARGET_64BIT
36736 ? gen_rdtscp_rex64 (op0, op1, op2)
36737 : gen_rdtscp (op0, op2));
36738 emit_insn (insn);
36739
36740 arg0 = CALL_EXPR_ARG (exp, 0);
36741 op4 = expand_normal (arg0);
36742 if (!address_operand (op4, VOIDmode))
36743 {
36744 op4 = convert_memory_address (Pmode, op4);
36745 op4 = copy_addr_to_reg (op4);
36746 }
36747 emit_move_insn (gen_rtx_MEM (SImode, op4), op2);
36748 }
36749
36750 if (target == 0)
36751 {
36752 /* mode is VOIDmode if __builtin_rd* has been called
36753 without a lhs. */
36754 if (mode == VOIDmode)
36755 return target;
36756 target = gen_reg_rtx (mode);
36757 }
36758
36759 if (TARGET_64BIT)
36760 {
36761 op1 = expand_simple_binop (DImode, ASHIFT, op1, GEN_INT (32),
36762 op1, 1, OPTAB_DIRECT);
36763 op0 = expand_simple_binop (DImode, IOR, op0, op1,
36764 op0, 1, OPTAB_DIRECT);
36765 }
36766
36767 emit_move_insn (target, op0);
36768 return target;
36769
36770 case IX86_BUILTIN_FXSAVE:
36771 case IX86_BUILTIN_FXRSTOR:
36772 case IX86_BUILTIN_FXSAVE64:
36773 case IX86_BUILTIN_FXRSTOR64:
36774 case IX86_BUILTIN_FNSTENV:
36775 case IX86_BUILTIN_FLDENV:
36776 mode0 = BLKmode;
36777 switch (fcode)
36778 {
36779 case IX86_BUILTIN_FXSAVE:
36780 icode = CODE_FOR_fxsave;
36781 break;
36782 case IX86_BUILTIN_FXRSTOR:
36783 icode = CODE_FOR_fxrstor;
36784 break;
36785 case IX86_BUILTIN_FXSAVE64:
36786 icode = CODE_FOR_fxsave64;
36787 break;
36788 case IX86_BUILTIN_FXRSTOR64:
36789 icode = CODE_FOR_fxrstor64;
36790 break;
36791 case IX86_BUILTIN_FNSTENV:
36792 icode = CODE_FOR_fnstenv;
36793 break;
36794 case IX86_BUILTIN_FLDENV:
36795 icode = CODE_FOR_fldenv;
36796 break;
36797 default:
36798 gcc_unreachable ();
36799 }
36800
36801 arg0 = CALL_EXPR_ARG (exp, 0);
36802 op0 = expand_normal (arg0);
36803
36804 if (!address_operand (op0, VOIDmode))
36805 {
36806 op0 = convert_memory_address (Pmode, op0);
36807 op0 = copy_addr_to_reg (op0);
36808 }
36809 op0 = gen_rtx_MEM (mode0, op0);
36810
36811 pat = GEN_FCN (icode) (op0);
36812 if (pat)
36813 emit_insn (pat);
36814 return 0;
36815
36816 case IX86_BUILTIN_XSAVE:
36817 case IX86_BUILTIN_XRSTOR:
36818 case IX86_BUILTIN_XSAVE64:
36819 case IX86_BUILTIN_XRSTOR64:
36820 case IX86_BUILTIN_XSAVEOPT:
36821 case IX86_BUILTIN_XSAVEOPT64:
36822 case IX86_BUILTIN_XSAVES:
36823 case IX86_BUILTIN_XRSTORS:
36824 case IX86_BUILTIN_XSAVES64:
36825 case IX86_BUILTIN_XRSTORS64:
36826 case IX86_BUILTIN_XSAVEC:
36827 case IX86_BUILTIN_XSAVEC64:
36828 arg0 = CALL_EXPR_ARG (exp, 0);
36829 arg1 = CALL_EXPR_ARG (exp, 1);
36830 op0 = expand_normal (arg0);
36831 op1 = expand_normal (arg1);
36832
36833 if (!address_operand (op0, VOIDmode))
36834 {
36835 op0 = convert_memory_address (Pmode, op0);
36836 op0 = copy_addr_to_reg (op0);
36837 }
36838 op0 = gen_rtx_MEM (BLKmode, op0);
36839
36840 op1 = force_reg (DImode, op1);
36841
36842 if (TARGET_64BIT)
36843 {
36844 op2 = expand_simple_binop (DImode, LSHIFTRT, op1, GEN_INT (32),
36845 NULL, 1, OPTAB_DIRECT);
36846 switch (fcode)
36847 {
36848 case IX86_BUILTIN_XSAVE:
36849 icode = CODE_FOR_xsave_rex64;
36850 break;
36851 case IX86_BUILTIN_XRSTOR:
36852 icode = CODE_FOR_xrstor_rex64;
36853 break;
36854 case IX86_BUILTIN_XSAVE64:
36855 icode = CODE_FOR_xsave64;
36856 break;
36857 case IX86_BUILTIN_XRSTOR64:
36858 icode = CODE_FOR_xrstor64;
36859 break;
36860 case IX86_BUILTIN_XSAVEOPT:
36861 icode = CODE_FOR_xsaveopt_rex64;
36862 break;
36863 case IX86_BUILTIN_XSAVEOPT64:
36864 icode = CODE_FOR_xsaveopt64;
36865 break;
36866 case IX86_BUILTIN_XSAVES:
36867 icode = CODE_FOR_xsaves_rex64;
36868 break;
36869 case IX86_BUILTIN_XRSTORS:
36870 icode = CODE_FOR_xrstors_rex64;
36871 break;
36872 case IX86_BUILTIN_XSAVES64:
36873 icode = CODE_FOR_xsaves64;
36874 break;
36875 case IX86_BUILTIN_XRSTORS64:
36876 icode = CODE_FOR_xrstors64;
36877 break;
36878 case IX86_BUILTIN_XSAVEC:
36879 icode = CODE_FOR_xsavec_rex64;
36880 break;
36881 case IX86_BUILTIN_XSAVEC64:
36882 icode = CODE_FOR_xsavec64;
36883 break;
36884 default:
36885 gcc_unreachable ();
36886 }
36887
36888 op2 = gen_lowpart (SImode, op2);
36889 op1 = gen_lowpart (SImode, op1);
36890 pat = GEN_FCN (icode) (op0, op1, op2);
36891 }
36892 else
36893 {
36894 switch (fcode)
36895 {
36896 case IX86_BUILTIN_XSAVE:
36897 icode = CODE_FOR_xsave;
36898 break;
36899 case IX86_BUILTIN_XRSTOR:
36900 icode = CODE_FOR_xrstor;
36901 break;
36902 case IX86_BUILTIN_XSAVEOPT:
36903 icode = CODE_FOR_xsaveopt;
36904 break;
36905 case IX86_BUILTIN_XSAVES:
36906 icode = CODE_FOR_xsaves;
36907 break;
36908 case IX86_BUILTIN_XRSTORS:
36909 icode = CODE_FOR_xrstors;
36910 break;
36911 case IX86_BUILTIN_XSAVEC:
36912 icode = CODE_FOR_xsavec;
36913 break;
36914 default:
36915 gcc_unreachable ();
36916 }
36917 pat = GEN_FCN (icode) (op0, op1);
36918 }
36919
36920 if (pat)
36921 emit_insn (pat);
36922 return 0;
36923
36924 case IX86_BUILTIN_LLWPCB:
36925 arg0 = CALL_EXPR_ARG (exp, 0);
36926 op0 = expand_normal (arg0);
36927 icode = CODE_FOR_lwp_llwpcb;
36928 if (!insn_data[icode].operand[0].predicate (op0, Pmode))
36929 op0 = ix86_zero_extend_to_Pmode (op0);
36930 emit_insn (gen_lwp_llwpcb (op0));
36931 return 0;
36932
36933 case IX86_BUILTIN_SLWPCB:
36934 icode = CODE_FOR_lwp_slwpcb;
36935 if (!target
36936 || !insn_data[icode].operand[0].predicate (target, Pmode))
36937 target = gen_reg_rtx (Pmode);
36938 emit_insn (gen_lwp_slwpcb (target));
36939 return target;
36940
36941 case IX86_BUILTIN_BEXTRI32:
36942 case IX86_BUILTIN_BEXTRI64:
36943 arg0 = CALL_EXPR_ARG (exp, 0);
36944 arg1 = CALL_EXPR_ARG (exp, 1);
36945 op0 = expand_normal (arg0);
36946 op1 = expand_normal (arg1);
36947 icode = (fcode == IX86_BUILTIN_BEXTRI32
36948 ? CODE_FOR_tbm_bextri_si
36949 : CODE_FOR_tbm_bextri_di);
36950 if (!CONST_INT_P (op1))
36951 {
36952 error ("last argument must be an immediate");
36953 return const0_rtx;
36954 }
36955 else
36956 {
36957 unsigned char length = (INTVAL (op1) >> 8) & 0xFF;
36958 unsigned char lsb_index = INTVAL (op1) & 0xFF;
36959 op1 = GEN_INT (length);
36960 op2 = GEN_INT (lsb_index);
36961 pat = GEN_FCN (icode) (target, op0, op1, op2);
36962 if (pat)
36963 emit_insn (pat);
36964 return target;
36965 }
36966
36967 case IX86_BUILTIN_RDRAND16_STEP:
36968 icode = CODE_FOR_rdrandhi_1;
36969 mode0 = HImode;
36970 goto rdrand_step;
36971
36972 case IX86_BUILTIN_RDRAND32_STEP:
36973 icode = CODE_FOR_rdrandsi_1;
36974 mode0 = SImode;
36975 goto rdrand_step;
36976
36977 case IX86_BUILTIN_RDRAND64_STEP:
36978 icode = CODE_FOR_rdranddi_1;
36979 mode0 = DImode;
36980
36981 rdrand_step:
36982 op0 = gen_reg_rtx (mode0);
36983 emit_insn (GEN_FCN (icode) (op0));
36984
36985 arg0 = CALL_EXPR_ARG (exp, 0);
36986 op1 = expand_normal (arg0);
36987 if (!address_operand (op1, VOIDmode))
36988 {
36989 op1 = convert_memory_address (Pmode, op1);
36990 op1 = copy_addr_to_reg (op1);
36991 }
36992 emit_move_insn (gen_rtx_MEM (mode0, op1), op0);
36993
36994 op1 = gen_reg_rtx (SImode);
36995 emit_move_insn (op1, CONST1_RTX (SImode));
36996
36997 /* Emit SImode conditional move. */
36998 if (mode0 == HImode)
36999 {
37000 op2 = gen_reg_rtx (SImode);
37001 emit_insn (gen_zero_extendhisi2 (op2, op0));
37002 }
37003 else if (mode0 == SImode)
37004 op2 = op0;
37005 else
37006 op2 = gen_rtx_SUBREG (SImode, op0, 0);
37007
37008 if (target == 0
37009 || !register_operand (target, SImode))
37010 target = gen_reg_rtx (SImode);
37011
37012 pat = gen_rtx_GEU (VOIDmode, gen_rtx_REG (CCCmode, FLAGS_REG),
37013 const0_rtx);
37014 emit_insn (gen_rtx_SET (target,
37015 gen_rtx_IF_THEN_ELSE (SImode, pat, op2, op1)));
37016 return target;
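/* Illustrative note (not part of the original source): a sketch of the
   user-level view of the rdrand_step expansion above, assuming the
   immintrin.h spelling:

     unsigned int v;
     if (_rdrand32_step (&v))
       ... v holds a fresh random value ...

   The expansion stores the rdrand result through the pointer and then
   materializes the carry flag as the 0/1 return value via the conditional
   move built from the CCCmode GEU test above.  */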
37017
37018 case IX86_BUILTIN_RDSEED16_STEP:
37019 icode = CODE_FOR_rdseedhi_1;
37020 mode0 = HImode;
37021 goto rdseed_step;
37022
37023 case IX86_BUILTIN_RDSEED32_STEP:
37024 icode = CODE_FOR_rdseedsi_1;
37025 mode0 = SImode;
37026 goto rdseed_step;
37027
37028 case IX86_BUILTIN_RDSEED64_STEP:
37029 icode = CODE_FOR_rdseeddi_1;
37030 mode0 = DImode;
37031
37032 rdseed_step:
37033 op0 = gen_reg_rtx (mode0);
37034 emit_insn (GEN_FCN (icode) (op0));
37035
37036 arg0 = CALL_EXPR_ARG (exp, 0);
37037 op1 = expand_normal (arg0);
37038 if (!address_operand (op1, VOIDmode))
37039 {
37040 op1 = convert_memory_address (Pmode, op1);
37041 op1 = copy_addr_to_reg (op1);
37042 }
37043 emit_move_insn (gen_rtx_MEM (mode0, op1), op0);
37044
37045 op2 = gen_reg_rtx (QImode);
37046
37047 pat = gen_rtx_LTU (QImode, gen_rtx_REG (CCCmode, FLAGS_REG),
37048 const0_rtx);
37049 emit_insn (gen_rtx_SET (op2, pat));
37050
37051 if (target == 0
37052 || !register_operand (target, SImode))
37053 target = gen_reg_rtx (SImode);
37054
37055 emit_insn (gen_zero_extendqisi2 (target, op2));
37056 return target;
37057
37058 case IX86_BUILTIN_SBB32:
37059 icode = CODE_FOR_subborrowsi;
37060 mode0 = SImode;
37061 goto handlecarry;
37062
37063 case IX86_BUILTIN_SBB64:
37064 icode = CODE_FOR_subborrowdi;
37065 mode0 = DImode;
37066 goto handlecarry;
37067
37068 case IX86_BUILTIN_ADDCARRYX32:
37069 icode = CODE_FOR_addcarrysi;
37070 mode0 = SImode;
37071 goto handlecarry;
37072
37073 case IX86_BUILTIN_ADDCARRYX64:
37074 icode = CODE_FOR_addcarrydi;
37075 mode0 = DImode;
37076
37077 handlecarry:
37078 arg0 = CALL_EXPR_ARG (exp, 0); /* unsigned char c_in. */
37079 arg1 = CALL_EXPR_ARG (exp, 1); /* unsigned int src1. */
37080 arg2 = CALL_EXPR_ARG (exp, 2); /* unsigned int src2. */
37081 arg3 = CALL_EXPR_ARG (exp, 3); /* unsigned int *sum_out. */
37082
37083 op1 = expand_normal (arg0);
37084 op1 = copy_to_mode_reg (QImode, convert_to_mode (QImode, op1, 1));
37085
37086 op2 = expand_normal (arg1);
37087 if (!register_operand (op2, mode0))
37088 op2 = copy_to_mode_reg (mode0, op2);
37089
37090 op3 = expand_normal (arg2);
37091 if (!register_operand (op3, mode0))
37092 op3 = copy_to_mode_reg (mode0, op3);
37093
37094 op4 = expand_normal (arg3);
37095 if (!address_operand (op4, VOIDmode))
37096 {
37097 op4 = convert_memory_address (Pmode, op4);
37098 op4 = copy_addr_to_reg (op4);
37099 }
37100
37101 /* Generate CF from input operand. */
37102 emit_insn (gen_addqi3_cconly_overflow (op1, constm1_rtx));
37103
37104 /* Generate instruction that consumes CF. */
37105 op0 = gen_reg_rtx (mode0);
37106
37107 op1 = gen_rtx_REG (CCCmode, FLAGS_REG);
37108 pat = gen_rtx_LTU (mode0, op1, const0_rtx);
37109 emit_insn (GEN_FCN (icode) (op0, op2, op3, op1, pat));
37110
37111 /* Return current CF value. */
37112 if (target == 0)
37113 target = gen_reg_rtx (QImode);
37114
37115 PUT_MODE (pat, QImode);
37116 emit_insn (gen_rtx_SET (target, pat));
37117
37118 /* Store the result. */
37119 emit_move_insn (gen_rtx_MEM (mode0, op4), op0);
37120
37121 return target;
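/* Illustrative note (not part of the original source): the handlecarry
   expansion above serves the ADX-style intrinsics, e.g. (a sketch):

     unsigned int sum;
     unsigned char c_out = _addcarry_u32 (c_in, a, b, &sum);

   The incoming carry byte is converted into CF by adding it to -1
   (gen_addqi3_cconly_overflow), the addcarry/subborrow pattern consumes
   and regenerates CF, the wide result is stored through the pointer, and
   the new CF is returned as the QImode value in TARGET.  */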
37122
37123 case IX86_BUILTIN_READ_FLAGS:
37124 emit_insn (gen_push (gen_rtx_REG (word_mode, FLAGS_REG)));
37125
37126 if (optimize
37127 || target == NULL_RTX
37128 || !nonimmediate_operand (target, word_mode)
37129 || GET_MODE (target) != word_mode)
37130 target = gen_reg_rtx (word_mode);
37131
37132 emit_insn (gen_pop (target));
37133 return target;
37134
37135 case IX86_BUILTIN_WRITE_FLAGS:
37136
37137 arg0 = CALL_EXPR_ARG (exp, 0);
37138 op0 = expand_normal (arg0);
37139 if (!general_no_elim_operand (op0, word_mode))
37140 op0 = copy_to_mode_reg (word_mode, op0);
37141
37142 emit_insn (gen_push (op0));
37143 emit_insn (gen_pop (gen_rtx_REG (word_mode, FLAGS_REG)));
37144 return 0;
37145
37146 case IX86_BUILTIN_KORTESTC16:
37147 icode = CODE_FOR_kortestchi;
37148 mode0 = HImode;
37149 mode1 = CCCmode;
37150 goto kortest;
37151
37152 case IX86_BUILTIN_KORTESTZ16:
37153 icode = CODE_FOR_kortestzhi;
37154 mode0 = HImode;
37155 mode1 = CCZmode;
37156
37157 kortest:
37158 arg0 = CALL_EXPR_ARG (exp, 0); /* Mask reg src1. */
37159 arg1 = CALL_EXPR_ARG (exp, 1); /* Mask reg src2. */
37160 op0 = expand_normal (arg0);
37161 op1 = expand_normal (arg1);
37162
37163 op0 = copy_to_reg (op0);
37164 op0 = lowpart_subreg (mode0, op0, GET_MODE (op0));
37165 op1 = copy_to_reg (op1);
37166 op1 = lowpart_subreg (mode0, op1, GET_MODE (op1));
37167
37168 target = gen_reg_rtx (QImode);
37169 emit_insn (gen_rtx_SET (target, const0_rtx));
37170
37171 /* Emit kortest. */
37172 emit_insn (GEN_FCN (icode) (op0, op1));
37173 /* And use setcc to return result from flags. */
37174 ix86_expand_setcc (target, EQ,
37175 gen_rtx_REG (mode1, FLAGS_REG), const0_rtx);
37176 return target;
37177
37178 case IX86_BUILTIN_GATHERSIV2DF:
37179 icode = CODE_FOR_avx2_gathersiv2df;
37180 goto gather_gen;
37181 case IX86_BUILTIN_GATHERSIV4DF:
37182 icode = CODE_FOR_avx2_gathersiv4df;
37183 goto gather_gen;
37184 case IX86_BUILTIN_GATHERDIV2DF:
37185 icode = CODE_FOR_avx2_gatherdiv2df;
37186 goto gather_gen;
37187 case IX86_BUILTIN_GATHERDIV4DF:
37188 icode = CODE_FOR_avx2_gatherdiv4df;
37189 goto gather_gen;
37190 case IX86_BUILTIN_GATHERSIV4SF:
37191 icode = CODE_FOR_avx2_gathersiv4sf;
37192 goto gather_gen;
37193 case IX86_BUILTIN_GATHERSIV8SF:
37194 icode = CODE_FOR_avx2_gathersiv8sf;
37195 goto gather_gen;
37196 case IX86_BUILTIN_GATHERDIV4SF:
37197 icode = CODE_FOR_avx2_gatherdiv4sf;
37198 goto gather_gen;
37199 case IX86_BUILTIN_GATHERDIV8SF:
37200 icode = CODE_FOR_avx2_gatherdiv8sf;
37201 goto gather_gen;
37202 case IX86_BUILTIN_GATHERSIV2DI:
37203 icode = CODE_FOR_avx2_gathersiv2di;
37204 goto gather_gen;
37205 case IX86_BUILTIN_GATHERSIV4DI:
37206 icode = CODE_FOR_avx2_gathersiv4di;
37207 goto gather_gen;
37208 case IX86_BUILTIN_GATHERDIV2DI:
37209 icode = CODE_FOR_avx2_gatherdiv2di;
37210 goto gather_gen;
37211 case IX86_BUILTIN_GATHERDIV4DI:
37212 icode = CODE_FOR_avx2_gatherdiv4di;
37213 goto gather_gen;
37214 case IX86_BUILTIN_GATHERSIV4SI:
37215 icode = CODE_FOR_avx2_gathersiv4si;
37216 goto gather_gen;
37217 case IX86_BUILTIN_GATHERSIV8SI:
37218 icode = CODE_FOR_avx2_gathersiv8si;
37219 goto gather_gen;
37220 case IX86_BUILTIN_GATHERDIV4SI:
37221 icode = CODE_FOR_avx2_gatherdiv4si;
37222 goto gather_gen;
37223 case IX86_BUILTIN_GATHERDIV8SI:
37224 icode = CODE_FOR_avx2_gatherdiv8si;
37225 goto gather_gen;
37226 case IX86_BUILTIN_GATHERALTSIV4DF:
37227 icode = CODE_FOR_avx2_gathersiv4df;
37228 goto gather_gen;
37229 case IX86_BUILTIN_GATHERALTDIV8SF:
37230 icode = CODE_FOR_avx2_gatherdiv8sf;
37231 goto gather_gen;
37232 case IX86_BUILTIN_GATHERALTSIV4DI:
37233 icode = CODE_FOR_avx2_gathersiv4di;
37234 goto gather_gen;
37235 case IX86_BUILTIN_GATHERALTDIV8SI:
37236 icode = CODE_FOR_avx2_gatherdiv8si;
37237 goto gather_gen;
37238 case IX86_BUILTIN_GATHER3SIV16SF:
37239 icode = CODE_FOR_avx512f_gathersiv16sf;
37240 goto gather_gen;
37241 case IX86_BUILTIN_GATHER3SIV8DF:
37242 icode = CODE_FOR_avx512f_gathersiv8df;
37243 goto gather_gen;
37244 case IX86_BUILTIN_GATHER3DIV16SF:
37245 icode = CODE_FOR_avx512f_gatherdiv16sf;
37246 goto gather_gen;
37247 case IX86_BUILTIN_GATHER3DIV8DF:
37248 icode = CODE_FOR_avx512f_gatherdiv8df;
37249 goto gather_gen;
37250 case IX86_BUILTIN_GATHER3SIV16SI:
37251 icode = CODE_FOR_avx512f_gathersiv16si;
37252 goto gather_gen;
37253 case IX86_BUILTIN_GATHER3SIV8DI:
37254 icode = CODE_FOR_avx512f_gathersiv8di;
37255 goto gather_gen;
37256 case IX86_BUILTIN_GATHER3DIV16SI:
37257 icode = CODE_FOR_avx512f_gatherdiv16si;
37258 goto gather_gen;
37259 case IX86_BUILTIN_GATHER3DIV8DI:
37260 icode = CODE_FOR_avx512f_gatherdiv8di;
37261 goto gather_gen;
37262 case IX86_BUILTIN_GATHER3ALTSIV8DF:
37263 icode = CODE_FOR_avx512f_gathersiv8df;
37264 goto gather_gen;
37265 case IX86_BUILTIN_GATHER3ALTDIV16SF:
37266 icode = CODE_FOR_avx512f_gatherdiv16sf;
37267 goto gather_gen;
37268 case IX86_BUILTIN_GATHER3ALTSIV8DI:
37269 icode = CODE_FOR_avx512f_gathersiv8di;
37270 goto gather_gen;
37271 case IX86_BUILTIN_GATHER3ALTDIV16SI:
37272 icode = CODE_FOR_avx512f_gatherdiv16si;
37273 goto gather_gen;
37274 case IX86_BUILTIN_GATHER3SIV2DF:
37275 icode = CODE_FOR_avx512vl_gathersiv2df;
37276 goto gather_gen;
37277 case IX86_BUILTIN_GATHER3SIV4DF:
37278 icode = CODE_FOR_avx512vl_gathersiv4df;
37279 goto gather_gen;
37280 case IX86_BUILTIN_GATHER3DIV2DF:
37281 icode = CODE_FOR_avx512vl_gatherdiv2df;
37282 goto gather_gen;
37283 case IX86_BUILTIN_GATHER3DIV4DF:
37284 icode = CODE_FOR_avx512vl_gatherdiv4df;
37285 goto gather_gen;
37286 case IX86_BUILTIN_GATHER3SIV4SF:
37287 icode = CODE_FOR_avx512vl_gathersiv4sf;
37288 goto gather_gen;
37289 case IX86_BUILTIN_GATHER3SIV8SF:
37290 icode = CODE_FOR_avx512vl_gathersiv8sf;
37291 goto gather_gen;
37292 case IX86_BUILTIN_GATHER3DIV4SF:
37293 icode = CODE_FOR_avx512vl_gatherdiv4sf;
37294 goto gather_gen;
37295 case IX86_BUILTIN_GATHER3DIV8SF:
37296 icode = CODE_FOR_avx512vl_gatherdiv8sf;
37297 goto gather_gen;
37298 case IX86_BUILTIN_GATHER3SIV2DI:
37299 icode = CODE_FOR_avx512vl_gathersiv2di;
37300 goto gather_gen;
37301 case IX86_BUILTIN_GATHER3SIV4DI:
37302 icode = CODE_FOR_avx512vl_gathersiv4di;
37303 goto gather_gen;
37304 case IX86_BUILTIN_GATHER3DIV2DI:
37305 icode = CODE_FOR_avx512vl_gatherdiv2di;
37306 goto gather_gen;
37307 case IX86_BUILTIN_GATHER3DIV4DI:
37308 icode = CODE_FOR_avx512vl_gatherdiv4di;
37309 goto gather_gen;
37310 case IX86_BUILTIN_GATHER3SIV4SI:
37311 icode = CODE_FOR_avx512vl_gathersiv4si;
37312 goto gather_gen;
37313 case IX86_BUILTIN_GATHER3SIV8SI:
37314 icode = CODE_FOR_avx512vl_gathersiv8si;
37315 goto gather_gen;
37316 case IX86_BUILTIN_GATHER3DIV4SI:
37317 icode = CODE_FOR_avx512vl_gatherdiv4si;
37318 goto gather_gen;
37319 case IX86_BUILTIN_GATHER3DIV8SI:
37320 icode = CODE_FOR_avx512vl_gatherdiv8si;
37321 goto gather_gen;
37322 case IX86_BUILTIN_GATHER3ALTSIV4DF:
37323 icode = CODE_FOR_avx512vl_gathersiv4df;
37324 goto gather_gen;
37325 case IX86_BUILTIN_GATHER3ALTDIV8SF:
37326 icode = CODE_FOR_avx512vl_gatherdiv8sf;
37327 goto gather_gen;
37328 case IX86_BUILTIN_GATHER3ALTSIV4DI:
37329 icode = CODE_FOR_avx512vl_gathersiv4di;
37330 goto gather_gen;
37331 case IX86_BUILTIN_GATHER3ALTDIV8SI:
37332 icode = CODE_FOR_avx512vl_gatherdiv8si;
37333 goto gather_gen;
37334 case IX86_BUILTIN_SCATTERSIV16SF:
37335 icode = CODE_FOR_avx512f_scattersiv16sf;
37336 goto scatter_gen;
37337 case IX86_BUILTIN_SCATTERSIV8DF:
37338 icode = CODE_FOR_avx512f_scattersiv8df;
37339 goto scatter_gen;
37340 case IX86_BUILTIN_SCATTERDIV16SF:
37341 icode = CODE_FOR_avx512f_scatterdiv16sf;
37342 goto scatter_gen;
37343 case IX86_BUILTIN_SCATTERDIV8DF:
37344 icode = CODE_FOR_avx512f_scatterdiv8df;
37345 goto scatter_gen;
37346 case IX86_BUILTIN_SCATTERSIV16SI:
37347 icode = CODE_FOR_avx512f_scattersiv16si;
37348 goto scatter_gen;
37349 case IX86_BUILTIN_SCATTERSIV8DI:
37350 icode = CODE_FOR_avx512f_scattersiv8di;
37351 goto scatter_gen;
37352 case IX86_BUILTIN_SCATTERDIV16SI:
37353 icode = CODE_FOR_avx512f_scatterdiv16si;
37354 goto scatter_gen;
37355 case IX86_BUILTIN_SCATTERDIV8DI:
37356 icode = CODE_FOR_avx512f_scatterdiv8di;
37357 goto scatter_gen;
37358 case IX86_BUILTIN_SCATTERSIV8SF:
37359 icode = CODE_FOR_avx512vl_scattersiv8sf;
37360 goto scatter_gen;
37361 case IX86_BUILTIN_SCATTERSIV4SF:
37362 icode = CODE_FOR_avx512vl_scattersiv4sf;
37363 goto scatter_gen;
37364 case IX86_BUILTIN_SCATTERSIV4DF:
37365 icode = CODE_FOR_avx512vl_scattersiv4df;
37366 goto scatter_gen;
37367 case IX86_BUILTIN_SCATTERSIV2DF:
37368 icode = CODE_FOR_avx512vl_scattersiv2df;
37369 goto scatter_gen;
37370 case IX86_BUILTIN_SCATTERDIV8SF:
37371 icode = CODE_FOR_avx512vl_scatterdiv8sf;
37372 goto scatter_gen;
37373 case IX86_BUILTIN_SCATTERDIV4SF:
37374 icode = CODE_FOR_avx512vl_scatterdiv4sf;
37375 goto scatter_gen;
37376 case IX86_BUILTIN_SCATTERDIV4DF:
37377 icode = CODE_FOR_avx512vl_scatterdiv4df;
37378 goto scatter_gen;
37379 case IX86_BUILTIN_SCATTERDIV2DF:
37380 icode = CODE_FOR_avx512vl_scatterdiv2df;
37381 goto scatter_gen;
37382 case IX86_BUILTIN_SCATTERSIV8SI:
37383 icode = CODE_FOR_avx512vl_scattersiv8si;
37384 goto scatter_gen;
37385 case IX86_BUILTIN_SCATTERSIV4SI:
37386 icode = CODE_FOR_avx512vl_scattersiv4si;
37387 goto scatter_gen;
37388 case IX86_BUILTIN_SCATTERSIV4DI:
37389 icode = CODE_FOR_avx512vl_scattersiv4di;
37390 goto scatter_gen;
37391 case IX86_BUILTIN_SCATTERSIV2DI:
37392 icode = CODE_FOR_avx512vl_scattersiv2di;
37393 goto scatter_gen;
37394 case IX86_BUILTIN_SCATTERDIV8SI:
37395 icode = CODE_FOR_avx512vl_scatterdiv8si;
37396 goto scatter_gen;
37397 case IX86_BUILTIN_SCATTERDIV4SI:
37398 icode = CODE_FOR_avx512vl_scatterdiv4si;
37399 goto scatter_gen;
37400 case IX86_BUILTIN_SCATTERDIV4DI:
37401 icode = CODE_FOR_avx512vl_scatterdiv4di;
37402 goto scatter_gen;
37403 case IX86_BUILTIN_SCATTERDIV2DI:
37404 icode = CODE_FOR_avx512vl_scatterdiv2di;
37405 goto scatter_gen;
37406 case IX86_BUILTIN_GATHERPFDPD:
37407 icode = CODE_FOR_avx512pf_gatherpfv8sidf;
37408 goto vec_prefetch_gen;
37409 case IX86_BUILTIN_SCATTERALTSIV8DF:
37410 icode = CODE_FOR_avx512f_scattersiv8df;
37411 goto scatter_gen;
37412 case IX86_BUILTIN_SCATTERALTDIV16SF:
37413 icode = CODE_FOR_avx512f_scatterdiv16sf;
37414 goto scatter_gen;
37415 case IX86_BUILTIN_SCATTERALTSIV8DI:
37416 icode = CODE_FOR_avx512f_scattersiv8di;
37417 goto scatter_gen;
37418 case IX86_BUILTIN_SCATTERALTDIV16SI:
37419 icode = CODE_FOR_avx512f_scatterdiv16si;
37420 goto scatter_gen;
37421 case IX86_BUILTIN_GATHERPFDPS:
37422 icode = CODE_FOR_avx512pf_gatherpfv16sisf;
37423 goto vec_prefetch_gen;
37424 case IX86_BUILTIN_GATHERPFQPD:
37425 icode = CODE_FOR_avx512pf_gatherpfv8didf;
37426 goto vec_prefetch_gen;
37427 case IX86_BUILTIN_GATHERPFQPS:
37428 icode = CODE_FOR_avx512pf_gatherpfv8disf;
37429 goto vec_prefetch_gen;
37430 case IX86_BUILTIN_SCATTERPFDPD:
37431 icode = CODE_FOR_avx512pf_scatterpfv8sidf;
37432 goto vec_prefetch_gen;
37433 case IX86_BUILTIN_SCATTERPFDPS:
37434 icode = CODE_FOR_avx512pf_scatterpfv16sisf;
37435 goto vec_prefetch_gen;
37436 case IX86_BUILTIN_SCATTERPFQPD:
37437 icode = CODE_FOR_avx512pf_scatterpfv8didf;
37438 goto vec_prefetch_gen;
37439 case IX86_BUILTIN_SCATTERPFQPS:
37440 icode = CODE_FOR_avx512pf_scatterpfv8disf;
37441 goto vec_prefetch_gen;
37442
37443 gather_gen:
37444 rtx half;
37445 rtx (*gen) (rtx, rtx);
37446
37447 arg0 = CALL_EXPR_ARG (exp, 0);
37448 arg1 = CALL_EXPR_ARG (exp, 1);
37449 arg2 = CALL_EXPR_ARG (exp, 2);
37450 arg3 = CALL_EXPR_ARG (exp, 3);
37451 arg4 = CALL_EXPR_ARG (exp, 4);
37452 op0 = expand_normal (arg0);
37453 op1 = expand_normal (arg1);
37454 op2 = expand_normal (arg2);
37455 op3 = expand_normal (arg3);
37456 op4 = expand_normal (arg4);
37457 /* Note the arg order is different from the operand order. */
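/* Concretely, in this expansion arg0/op0 is the source (merge) vector,
   arg1/op1 the base pointer, arg2/op2 the index vector, arg3/op3 the
   writemask and arg4/op4 the scale, while operand 0 of the insn
   pattern is the destination. */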
37458 mode0 = insn_data[icode].operand[1].mode;
37459 mode2 = insn_data[icode].operand[3].mode;
37460 mode3 = insn_data[icode].operand[4].mode;
37461 mode4 = insn_data[icode].operand[5].mode;
37462
37463 if (target == NULL_RTX
37464 || GET_MODE (target) != insn_data[icode].operand[0].mode
37465 || !insn_data[icode].operand[0].predicate (target,
37466 GET_MODE (target)))
37467 subtarget = gen_reg_rtx (insn_data[icode].operand[0].mode);
37468 else
37469 subtarget = target;
37470
37471 switch (fcode)
37472 {
37473 case IX86_BUILTIN_GATHER3ALTSIV8DF:
37474 case IX86_BUILTIN_GATHER3ALTSIV8DI:
37475 half = gen_reg_rtx (V8SImode);
37476 if (!nonimmediate_operand (op2, V16SImode))
37477 op2 = copy_to_mode_reg (V16SImode, op2);
37478 emit_insn (gen_vec_extract_lo_v16si (half, op2));
37479 op2 = half;
37480 break;
37481 case IX86_BUILTIN_GATHER3ALTSIV4DF:
37482 case IX86_BUILTIN_GATHER3ALTSIV4DI:
37483 case IX86_BUILTIN_GATHERALTSIV4DF:
37484 case IX86_BUILTIN_GATHERALTSIV4DI:
37485 half = gen_reg_rtx (V4SImode);
37486 if (!nonimmediate_operand (op2, V8SImode))
37487 op2 = copy_to_mode_reg (V8SImode, op2);
37488 emit_insn (gen_vec_extract_lo_v8si (half, op2));
37489 op2 = half;
37490 break;
37491 case IX86_BUILTIN_GATHER3ALTDIV16SF:
37492 case IX86_BUILTIN_GATHER3ALTDIV16SI:
37493 half = gen_reg_rtx (mode0);
37494 if (mode0 == V8SFmode)
37495 gen = gen_vec_extract_lo_v16sf;
37496 else
37497 gen = gen_vec_extract_lo_v16si;
37498 if (!nonimmediate_operand (op0, GET_MODE (op0)))
37499 op0 = copy_to_mode_reg (GET_MODE (op0), op0);
37500 emit_insn (gen (half, op0));
37501 op0 = half;
37502 if (GET_MODE (op3) != VOIDmode)
37503 {
37504 if (!nonimmediate_operand (op3, GET_MODE (op3)))
37505 op3 = copy_to_mode_reg (GET_MODE (op3), op3);
37506 emit_insn (gen (half, op3));
37507 op3 = half;
37508 }
37509 break;
37510 case IX86_BUILTIN_GATHER3ALTDIV8SF:
37511 case IX86_BUILTIN_GATHER3ALTDIV8SI:
37512 case IX86_BUILTIN_GATHERALTDIV8SF:
37513 case IX86_BUILTIN_GATHERALTDIV8SI:
37514 half = gen_reg_rtx (mode0);
37515 if (mode0 == V4SFmode)
37516 gen = gen_vec_extract_lo_v8sf;
37517 else
37518 gen = gen_vec_extract_lo_v8si;
37519 if (!nonimmediate_operand (op0, GET_MODE (op0)))
37520 op0 = copy_to_mode_reg (GET_MODE (op0), op0);
37521 emit_insn (gen (half, op0));
37522 op0 = half;
37523 if (GET_MODE (op3) != VOIDmode)
37524 {
37525 if (!nonimmediate_operand (op3, GET_MODE (op3)))
37526 op3 = copy_to_mode_reg (GET_MODE (op3), op3);
37527 emit_insn (gen (half, op3));
37528 op3 = half;
37529 }
37530 break;
37531 default:
37532 break;
37533 }
37534
37535 /* Force the memory operand to use only a base register here. We
37536 don't want to do this for the memory operands of other builtin
37537 functions. */
37538 op1 = ix86_zero_extend_to_Pmode (op1);
37539
37540 if (!insn_data[icode].operand[1].predicate (op0, mode0))
37541 op0 = copy_to_mode_reg (mode0, op0);
37542 if (!insn_data[icode].operand[2].predicate (op1, Pmode))
37543 op1 = copy_to_mode_reg (Pmode, op1);
37544 if (!insn_data[icode].operand[3].predicate (op2, mode2))
37545 op2 = copy_to_mode_reg (mode2, op2);
37546
37547 op3 = fixup_modeless_constant (op3, mode3);
37548
37549 if (GET_MODE (op3) == mode3 || GET_MODE (op3) == VOIDmode)
37550 {
37551 if (!insn_data[icode].operand[4].predicate (op3, mode3))
37552 op3 = copy_to_mode_reg (mode3, op3);
37553 }
37554 else
37555 {
37556 op3 = copy_to_reg (op3);
37557 op3 = lowpart_subreg (mode3, op3, GET_MODE (op3));
37558 }
37559 if (!insn_data[icode].operand[5].predicate (op4, mode4))
37560 {
37561 error ("the last argument must be scale 1, 2, 4, 8");
37562 return const0_rtx;
37563 }
37564
37565 /* Optimize. If mask is known to have all high bits set,
37566 replace op0 with pc_rtx to signal that the instruction
37567 overwrites the whole destination and doesn't use its
37568 previous contents. */
37569 if (optimize)
37570 {
37571 if (TREE_CODE (arg3) == INTEGER_CST)
37572 {
37573 if (integer_all_onesp (arg3))
37574 op0 = pc_rtx;
37575 }
37576 else if (TREE_CODE (arg3) == VECTOR_CST)
37577 {
37578 unsigned int negative = 0;
37579 for (i = 0; i < VECTOR_CST_NELTS (arg3); ++i)
37580 {
37581 tree cst = VECTOR_CST_ELT (arg3, i);
37582 if (TREE_CODE (cst) == INTEGER_CST
37583 && tree_int_cst_sign_bit (cst))
37584 negative++;
37585 else if (TREE_CODE (cst) == REAL_CST
37586 && REAL_VALUE_NEGATIVE (TREE_REAL_CST (cst)))
37587 negative++;
37588 }
37589 if (negative == TYPE_VECTOR_SUBPARTS (TREE_TYPE (arg3)))
37590 op0 = pc_rtx;
37591 }
37592 else if (TREE_CODE (arg3) == SSA_NAME
37593 && TREE_CODE (TREE_TYPE (arg3)) == VECTOR_TYPE)
37594 {
37595 /* Recognize also when mask is like:
37596 __v2df src = _mm_setzero_pd ();
37597 __v2df mask = _mm_cmpeq_pd (src, src);
37598 or
37599 __v8sf src = _mm256_setzero_ps ();
37600 __v8sf mask = _mm256_cmp_ps (src, src, _CMP_EQ_OQ);
37601 as that is a cheaper way to load all ones into
37602 a register than having to load a constant from
37603 memory. */
37604 gimple *def_stmt = SSA_NAME_DEF_STMT (arg3);
37605 if (is_gimple_call (def_stmt))
37606 {
37607 tree fndecl = gimple_call_fndecl (def_stmt);
37608 if (fndecl
37609 && DECL_BUILT_IN_CLASS (fndecl) == BUILT_IN_MD)
37610 switch ((unsigned int) DECL_FUNCTION_CODE (fndecl))
37611 {
37612 case IX86_BUILTIN_CMPPD:
37613 case IX86_BUILTIN_CMPPS:
37614 case IX86_BUILTIN_CMPPD256:
37615 case IX86_BUILTIN_CMPPS256:
37616 if (!integer_zerop (gimple_call_arg (def_stmt, 2)))
37617 break;
37618 /* FALLTHRU */
37619 case IX86_BUILTIN_CMPEQPD:
37620 case IX86_BUILTIN_CMPEQPS:
37621 if (initializer_zerop (gimple_call_arg (def_stmt, 0))
37622 && initializer_zerop (gimple_call_arg (def_stmt,
37623 1)))
37624 op0 = pc_rtx;
37625 break;
37626 default:
37627 break;
37628 }
37629 }
37630 }
37631 }
37632
37633 pat = GEN_FCN (icode) (subtarget, op0, op1, op2, op3, op4);
37634 if (! pat)
37635 return const0_rtx;
37636 emit_insn (pat);
37637
37638 switch (fcode)
37639 {
37640 case IX86_BUILTIN_GATHER3DIV16SF:
37641 if (target == NULL_RTX)
37642 target = gen_reg_rtx (V8SFmode);
37643 emit_insn (gen_vec_extract_lo_v16sf (target, subtarget));
37644 break;
37645 case IX86_BUILTIN_GATHER3DIV16SI:
37646 if (target == NULL_RTX)
37647 target = gen_reg_rtx (V8SImode);
37648 emit_insn (gen_vec_extract_lo_v16si (target, subtarget));
37649 break;
37650 case IX86_BUILTIN_GATHER3DIV8SF:
37651 case IX86_BUILTIN_GATHERDIV8SF:
37652 if (target == NULL_RTX)
37653 target = gen_reg_rtx (V4SFmode);
37654 emit_insn (gen_vec_extract_lo_v8sf (target, subtarget));
37655 break;
37656 case IX86_BUILTIN_GATHER3DIV8SI:
37657 case IX86_BUILTIN_GATHERDIV8SI:
37658 if (target == NULL_RTX)
37659 target = gen_reg_rtx (V4SImode);
37660 emit_insn (gen_vec_extract_lo_v8si (target, subtarget));
37661 break;
37662 default:
37663 target = subtarget;
37664 break;
37665 }
37666 return target;
37667
37668 scatter_gen:
37669 arg0 = CALL_EXPR_ARG (exp, 0);
37670 arg1 = CALL_EXPR_ARG (exp, 1);
37671 arg2 = CALL_EXPR_ARG (exp, 2);
37672 arg3 = CALL_EXPR_ARG (exp, 3);
37673 arg4 = CALL_EXPR_ARG (exp, 4);
37674 op0 = expand_normal (arg0);
37675 op1 = expand_normal (arg1);
37676 op2 = expand_normal (arg2);
37677 op3 = expand_normal (arg3);
37678 op4 = expand_normal (arg4);
37679 mode1 = insn_data[icode].operand[1].mode;
37680 mode2 = insn_data[icode].operand[2].mode;
37681 mode3 = insn_data[icode].operand[3].mode;
37682 mode4 = insn_data[icode].operand[4].mode;
37683
37684 /* Scatter instruction stores operand op3 to memory with
37685 indices from op2 and scale from op4 under writemask op1.
37686 If index operand op2 has more elements than source operand
37687 op3, only its low half needs to be used, and vice versa. */
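/* E.g. IX86_BUILTIN_SCATTERALTSIV8DF stores eight DFmode elements but
   is given a V16SImode index, so only the low V8SImode half of the
   index is extracted below. */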
37688 switch (fcode)
37689 {
37690 case IX86_BUILTIN_SCATTERALTSIV8DF:
37691 case IX86_BUILTIN_SCATTERALTSIV8DI:
37692 half = gen_reg_rtx (V8SImode);
37693 if (!nonimmediate_operand (op2, V16SImode))
37694 op2 = copy_to_mode_reg (V16SImode, op2);
37695 emit_insn (gen_vec_extract_lo_v16si (half, op2));
37696 op2 = half;
37697 break;
37698 case IX86_BUILTIN_SCATTERALTDIV16SF:
37699 case IX86_BUILTIN_SCATTERALTDIV16SI:
37700 half = gen_reg_rtx (mode3);
37701 if (mode3 == V8SFmode)
37702 gen = gen_vec_extract_lo_v16sf;
37703 else
37704 gen = gen_vec_extract_lo_v16si;
37705 if (!nonimmediate_operand (op3, GET_MODE (op3)))
37706 op3 = copy_to_mode_reg (GET_MODE (op3), op3);
37707 emit_insn (gen (half, op3));
37708 op3 = half;
37709 break;
37710 default:
37711 break;
37712 }
37713
37714 /* Force the memory operand to use only a base register here. We
37715 don't want to do this for the memory operands of other builtin
37716 functions. */
37717 op0 = force_reg (Pmode, convert_to_mode (Pmode, op0, 1));
37718
37719 if (!insn_data[icode].operand[0].predicate (op0, Pmode))
37720 op0 = copy_to_mode_reg (Pmode, op0);
37721
37722 op1 = fixup_modeless_constant (op1, mode1);
37723
37724 if (GET_MODE (op1) == mode1 || GET_MODE (op1) == VOIDmode)
37725 {
37726 if (!insn_data[icode].operand[1].predicate (op1, mode1))
37727 op1 = copy_to_mode_reg (mode1, op1);
37728 }
37729 else
37730 {
37731 op1 = copy_to_reg (op1);
37732 op1 = lowpart_subreg (mode1, op1, GET_MODE (op1));
37733 }
37734
37735 if (!insn_data[icode].operand[2].predicate (op2, mode2))
37736 op2 = copy_to_mode_reg (mode2, op2);
37737
37738 if (!insn_data[icode].operand[3].predicate (op3, mode3))
37739 op3 = copy_to_mode_reg (mode3, op3);
37740
37741 if (!insn_data[icode].operand[4].predicate (op4, mode4))
37742 {
37743 error ("the last argument must be scale 1, 2, 4, 8");
37744 return const0_rtx;
37745 }
37746
37747 pat = GEN_FCN (icode) (op0, op1, op2, op3, op4);
37748 if (! pat)
37749 return const0_rtx;
37750
37751 emit_insn (pat);
37752 return 0;
37753
37754 vec_prefetch_gen:
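/* The gather/scatter prefetch builtins expanded here take
   (mask, index vector, base pointer, scale, hint). */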
37755 arg0 = CALL_EXPR_ARG (exp, 0);
37756 arg1 = CALL_EXPR_ARG (exp, 1);
37757 arg2 = CALL_EXPR_ARG (exp, 2);
37758 arg3 = CALL_EXPR_ARG (exp, 3);
37759 arg4 = CALL_EXPR_ARG (exp, 4);
37760 op0 = expand_normal (arg0);
37761 op1 = expand_normal (arg1);
37762 op2 = expand_normal (arg2);
37763 op3 = expand_normal (arg3);
37764 op4 = expand_normal (arg4);
37765 mode0 = insn_data[icode].operand[0].mode;
37766 mode1 = insn_data[icode].operand[1].mode;
37767 mode3 = insn_data[icode].operand[3].mode;
37768 mode4 = insn_data[icode].operand[4].mode;
37769
37770 op0 = fixup_modeless_constant (op0, mode0);
37771
37772 if (GET_MODE (op0) == mode0 || GET_MODE (op0) == VOIDmode)
37773 {
37774 if (!insn_data[icode].operand[0].predicate (op0, mode0))
37775 op0 = copy_to_mode_reg (mode0, op0);
37776 }
37777 else
37778 {
37779 op0 = copy_to_reg (op0);
37780 op0 = lowpart_subreg (mode0, op0, GET_MODE (op0));
37781 }
37782
37783 if (!insn_data[icode].operand[1].predicate (op1, mode1))
37784 op1 = copy_to_mode_reg (mode1, op1);
37785
37786 /* Force the memory operand to use only a base register here. We
37787 don't want to do this for the memory operands of other builtin
37788 functions. */
37789 op2 = force_reg (Pmode, convert_to_mode (Pmode, op2, 1));
37790
37791 if (!insn_data[icode].operand[2].predicate (op2, Pmode))
37792 op2 = copy_to_mode_reg (Pmode, op2);
37793
37794 if (!insn_data[icode].operand[3].predicate (op3, mode3))
37795 {
37796 error ("the forth argument must be scale 1, 2, 4, 8");
37797 return const0_rtx;
37798 }
37799
37800 if (!insn_data[icode].operand[4].predicate (op4, mode4))
37801 {
37802 error ("incorrect hint operand");
37803 return const0_rtx;
37804 }
37805
37806 pat = GEN_FCN (icode) (op0, op1, op2, op3, op4);
37807 if (! pat)
37808 return const0_rtx;
37809
37810 emit_insn (pat);
37811
37812 return 0;
37813
37814 case IX86_BUILTIN_XABORT:
37815 icode = CODE_FOR_xabort;
37816 arg0 = CALL_EXPR_ARG (exp, 0);
37817 op0 = expand_normal (arg0);
37818 mode0 = insn_data[icode].operand[0].mode;
37819 if (!insn_data[icode].operand[0].predicate (op0, mode0))
37820 {
37821 error ("the xabort's argument must be an 8-bit immediate");
37822 return const0_rtx;
37823 }
37824 emit_insn (gen_xabort (op0));
37825 return 0;
37826
37827 default:
37828 break;
37829 }
37830
37831 if (fcode >= IX86_BUILTIN__BDESC_SPECIAL_ARGS_FIRST
37832 && fcode <= IX86_BUILTIN__BDESC_SPECIAL_ARGS_LAST)
37833 {
37834 i = fcode - IX86_BUILTIN__BDESC_SPECIAL_ARGS_FIRST;
37835 return ix86_expand_special_args_builtin (bdesc_special_args + i, exp,
37836 target);
37837 }
37838
37839 if (fcode >= IX86_BUILTIN__BDESC_ARGS_FIRST
37840 && fcode <= IX86_BUILTIN__BDESC_ARGS_LAST)
37841 {
37842 i = fcode - IX86_BUILTIN__BDESC_ARGS_FIRST;
37843 switch (fcode)
37844 {
37845 case IX86_BUILTIN_FABSQ:
37846 case IX86_BUILTIN_COPYSIGNQ:
37847 if (!TARGET_SSE)
37848 /* Emit a normal call if SSE isn't available. */
37849 return expand_call (exp, target, ignore);
37850 /* FALLTHRU */
37851 default:
37852 return ix86_expand_args_builtin (bdesc_args + i, exp, target);
37853 }
37854 }
37855
37856 if (fcode >= IX86_BUILTIN__BDESC_COMI_FIRST
37857 && fcode <= IX86_BUILTIN__BDESC_COMI_LAST)
37858 {
37859 i = fcode - IX86_BUILTIN__BDESC_COMI_FIRST;
37860 return ix86_expand_sse_comi (bdesc_comi + i, exp, target);
37861 }
37862
37863 if (fcode >= IX86_BUILTIN__BDESC_ROUND_ARGS_FIRST
37864 && fcode <= IX86_BUILTIN__BDESC_ROUND_ARGS_LAST)
37865 {
37866 i = fcode - IX86_BUILTIN__BDESC_ROUND_ARGS_FIRST;
37867 return ix86_expand_round_builtin (bdesc_round_args + i, exp, target);
37868 }
37869
37870 if (fcode >= IX86_BUILTIN__BDESC_PCMPESTR_FIRST
37871 && fcode <= IX86_BUILTIN__BDESC_PCMPESTR_LAST)
37872 {
37873 i = fcode - IX86_BUILTIN__BDESC_PCMPESTR_FIRST;
37874 return ix86_expand_sse_pcmpestr (bdesc_pcmpestr + i, exp, target);
37875 }
37876
37877 if (fcode >= IX86_BUILTIN__BDESC_PCMPISTR_FIRST
37878 && fcode <= IX86_BUILTIN__BDESC_PCMPISTR_LAST)
37879 {
37880 i = fcode - IX86_BUILTIN__BDESC_PCMPISTR_FIRST;
37881 return ix86_expand_sse_pcmpistr (bdesc_pcmpistr + i, exp, target);
37882 }
37883
37884 if (fcode >= IX86_BUILTIN__BDESC_MULTI_ARG_FIRST
37885 && fcode <= IX86_BUILTIN__BDESC_MULTI_ARG_LAST)
37886 {
37887 i = fcode - IX86_BUILTIN__BDESC_MULTI_ARG_FIRST;
37888 const struct builtin_description *d = bdesc_multi_arg + i;
37889 return ix86_expand_multi_arg_builtin (d->icode, exp, target,
37890 (enum ix86_builtin_func_type)
37891 d->flag, d->comparison);
37892 }
37893
37894 gcc_unreachable ();
37895 }
37896
37897 /* This returns the target-specific builtin with code CODE if
37898 current_function_decl has visibility on this builtin, which is checked
37899 using isa flags. Returns NULL_TREE otherwise. */
37900
37901 static tree ix86_get_builtin (enum ix86_builtins code)
37902 {
37903 struct cl_target_option *opts;
37904 tree target_tree = NULL_TREE;
37905
37906 /* Determine the isa flags of current_function_decl. */
37907
37908 if (current_function_decl)
37909 target_tree = DECL_FUNCTION_SPECIFIC_TARGET (current_function_decl);
37910
37911 if (target_tree == NULL)
37912 target_tree = target_option_default_node;
37913
37914 opts = TREE_TARGET_OPTION (target_tree);
37915
37916 if (ix86_builtins_isa[(int) code].isa & opts->x_ix86_isa_flags)
37917 return ix86_builtin_decl (code, true);
37918 else
37919 return NULL_TREE;
37920 }
37921
37922 /* Return the function decl for the target-specific builtin
37923 corresponding to the MPX builtin passed in FCODE. */
37924 static tree
37925 ix86_builtin_mpx_function (unsigned fcode)
37926 {
37927 switch (fcode)
37928 {
37929 case BUILT_IN_CHKP_BNDMK:
37930 return ix86_builtins[IX86_BUILTIN_BNDMK];
37931
37932 case BUILT_IN_CHKP_BNDSTX:
37933 return ix86_builtins[IX86_BUILTIN_BNDSTX];
37934
37935 case BUILT_IN_CHKP_BNDLDX:
37936 return ix86_builtins[IX86_BUILTIN_BNDLDX];
37937
37938 case BUILT_IN_CHKP_BNDCL:
37939 return ix86_builtins[IX86_BUILTIN_BNDCL];
37940
37941 case BUILT_IN_CHKP_BNDCU:
37942 return ix86_builtins[IX86_BUILTIN_BNDCU];
37943
37944 case BUILT_IN_CHKP_BNDRET:
37945 return ix86_builtins[IX86_BUILTIN_BNDRET];
37946
37947 case BUILT_IN_CHKP_INTERSECT:
37948 return ix86_builtins[IX86_BUILTIN_BNDINT];
37949
37950 case BUILT_IN_CHKP_NARROW:
37951 return ix86_builtins[IX86_BUILTIN_BNDNARROW];
37952
37953 case BUILT_IN_CHKP_SIZEOF:
37954 return ix86_builtins[IX86_BUILTIN_SIZEOF];
37955
37956 case BUILT_IN_CHKP_EXTRACT_LOWER:
37957 return ix86_builtins[IX86_BUILTIN_BNDLOWER];
37958
37959 case BUILT_IN_CHKP_EXTRACT_UPPER:
37960 return ix86_builtins[IX86_BUILTIN_BNDUPPER];
37961
37962 default:
37963 return NULL_TREE;
37964 }
37965
37966 gcc_unreachable ();
37967 }
37968
37969 /* Helper function for ix86_load_bounds and ix86_store_bounds.
37970
37971 Return an address to be used to load/store bounds for pointer
37972 passed in SLOT.
37973
37974 SLOT_NO is an integer constant holding number of a target
37975 dependent special slot to be used in case SLOT is not a memory.
37976
37977 SPECIAL_BASE is a pointer to be used as a base of fake address
37978 to access special slots in Bounds Table. SPECIAL_BASE[-1],
37979 SPECIAL_BASE[-2] etc. will be used as fake pointer locations. */
37980
37981 static rtx
37982 ix86_get_arg_address_for_bt (rtx slot, rtx slot_no, rtx special_base)
37983 {
37984 rtx addr = NULL;
37985
37986 /* NULL slot means we pass bounds for pointer not passed to the
37987 function at all. Register slot means we pass pointer in a
37988 register. In both these cases bounds are passed via Bounds
37989 Table. Since we do not have actual pointer stored in memory,
37990 we have to use fake addresses to access Bounds Table. We
37991 start with (special_base - sizeof (void*)) and decrease this
37992 address by pointer size to get addresses for other slots. */
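/* E.g. with 64-bit pointers, slot number 0 maps to special_base - 8
   and slot number 1 to special_base - 16. */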
37993 if (!slot || REG_P (slot))
37994 {
37995 gcc_assert (CONST_INT_P (slot_no));
37996 addr = plus_constant (Pmode, special_base,
37997 -(INTVAL (slot_no) + 1) * GET_MODE_SIZE (Pmode));
37998 }
37999 /* If pointer is passed in a memory then its address is used to
38000 access Bounds Table. */
38001 else if (MEM_P (slot))
38002 {
38003 addr = XEXP (slot, 0);
38004 if (!register_operand (addr, Pmode))
38005 addr = copy_addr_to_reg (addr);
38006 }
38007 else
38008 gcc_unreachable ();
38009
38010 return addr;
38011 }
38012
38013 /* Expand pass uses this hook to load bounds for function parameter
38014 PTR passed in SLOT in case its bounds are not passed in a register.
38015
38016 If SLOT is a memory, then bounds are loaded as for regular pointer
38017 loaded from memory. PTR may be NULL in case SLOT is a memory.
38018 In such case value of PTR (if required) may be loaded from SLOT.
38019
38020 If SLOT is NULL or a register then SLOT_NO is an integer constant
38021 holding number of the target dependent special slot which should be
38022 used to obtain bounds.
38023
38024 Return loaded bounds. */
38025
38026 static rtx
38027 ix86_load_bounds (rtx slot, rtx ptr, rtx slot_no)
38028 {
38029 rtx reg = gen_reg_rtx (BNDmode);
38030 rtx addr;
38031
38032 /* Get address to be used to access Bounds Table. Special slots start
38033 at the location of return address of the current function. */
38034 addr = ix86_get_arg_address_for_bt (slot, slot_no, arg_pointer_rtx);
38035
38036 /* Load pointer value from a memory if we don't have it. */
38037 if (!ptr)
38038 {
38039 gcc_assert (MEM_P (slot));
38040 ptr = copy_addr_to_reg (slot);
38041 }
38042
38043 if (!register_operand (ptr, Pmode))
38044 ptr = ix86_zero_extend_to_Pmode (ptr);
38045
38046 emit_insn (BNDmode == BND64mode
38047 ? gen_bnd64_ldx (reg, addr, ptr)
38048 : gen_bnd32_ldx (reg, addr, ptr));
38049
38050 return reg;
38051 }
38052
38053 /* Expand pass uses this hook to store BOUNDS for call argument PTR
38054 passed in SLOT in case BOUNDS are not passed in a register.
38055
38056 If SLOT is a memory, then BOUNDS are stored as for regular pointer
38057 stored in memory. PTR may be NULL in case SLOT is a memory.
38058 In such case value of PTR (if required) may be loaded from SLOT.
38059
38060 If SLOT is NULL or a register then SLOT_NO is an integer constant
38061 holding number of the target dependent special slot which should be
38062 used to store BOUNDS. */
38063
38064 static void
38065 ix86_store_bounds (rtx ptr, rtx slot, rtx bounds, rtx slot_no)
38066 {
38067 rtx addr;
38068
38069 /* Get address to be used to access Bounds Table. Special slots start
38070 at the location of return address of a called function. */
38071 addr = ix86_get_arg_address_for_bt (slot, slot_no, stack_pointer_rtx);
38072
38073 /* Load pointer value from a memory if we don't have it. */
38074 if (!ptr)
38075 {
38076 gcc_assert (MEM_P (slot));
38077 ptr = copy_addr_to_reg (slot);
38078 }
38079
38080 if (!register_operand (ptr, Pmode))
38081 ptr = ix86_zero_extend_to_Pmode (ptr);
38082
38083 gcc_assert (POINTER_BOUNDS_MODE_P (GET_MODE (bounds)));
38084 if (!register_operand (bounds, BNDmode))
38085 bounds = copy_to_mode_reg (BNDmode, bounds);
38086
38087 emit_insn (BNDmode == BND64mode
38088 ? gen_bnd64_stx (addr, ptr, bounds)
38089 : gen_bnd32_stx (addr, ptr, bounds));
38090 }
38091
38092 /* Load and return bounds returned by function in SLOT. */
38093
38094 static rtx
38095 ix86_load_returned_bounds (rtx slot)
38096 {
38097 rtx res;
38098
38099 gcc_assert (REG_P (slot));
38100 res = gen_reg_rtx (BNDmode);
38101 emit_move_insn (res, slot);
38102
38103 return res;
38104 }
38105
38106 /* Store BOUNDS returned by function into SLOT. */
38107
38108 static void
38109 ix86_store_returned_bounds (rtx slot, rtx bounds)
38110 {
38111 gcc_assert (REG_P (slot));
38112 emit_move_insn (slot, bounds);
38113 }
38114
38115 /* Returns a function decl for a vectorized version of the combined function
38116 with combined_fn code FN and result vector type TYPE_OUT, or NULL_TREE
38117 if it is not available. */
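/* For example, vectorizing floor over V4DFmode maps to
   IX86_BUILTIN_FLOORPD256 below, provided TARGET_ROUND is set and
   -fno-trapping-math is in effect. */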
38118
38119 static tree
38120 ix86_builtin_vectorized_function (unsigned int fn, tree type_out,
38121 tree type_in)
38122 {
38123 machine_mode in_mode, out_mode;
38124 int in_n, out_n;
38125
38126 if (TREE_CODE (type_out) != VECTOR_TYPE
38127 || TREE_CODE (type_in) != VECTOR_TYPE)
38128 return NULL_TREE;
38129
38130 out_mode = TYPE_MODE (TREE_TYPE (type_out));
38131 out_n = TYPE_VECTOR_SUBPARTS (type_out);
38132 in_mode = TYPE_MODE (TREE_TYPE (type_in));
38133 in_n = TYPE_VECTOR_SUBPARTS (type_in);
38134
38135 switch (fn)
38136 {
38137 CASE_CFN_EXP2:
38138 if (out_mode == SFmode && in_mode == SFmode)
38139 {
38140 if (out_n == 16 && in_n == 16)
38141 return ix86_get_builtin (IX86_BUILTIN_EXP2PS);
38142 }
38143 break;
38144
38145 CASE_CFN_IFLOOR:
38146 CASE_CFN_LFLOOR:
38147 CASE_CFN_LLFLOOR:
38148 /* The round insn does not trap on denormals. */
38149 if (flag_trapping_math || !TARGET_ROUND)
38150 break;
38151
38152 if (out_mode == SImode && in_mode == DFmode)
38153 {
38154 if (out_n == 4 && in_n == 2)
38155 return ix86_get_builtin (IX86_BUILTIN_FLOORPD_VEC_PACK_SFIX);
38156 else if (out_n == 8 && in_n == 4)
38157 return ix86_get_builtin (IX86_BUILTIN_FLOORPD_VEC_PACK_SFIX256);
38158 else if (out_n == 16 && in_n == 8)
38159 return ix86_get_builtin (IX86_BUILTIN_FLOORPD_VEC_PACK_SFIX512);
38160 }
38161 if (out_mode == SImode && in_mode == SFmode)
38162 {
38163 if (out_n == 4 && in_n == 4)
38164 return ix86_get_builtin (IX86_BUILTIN_FLOORPS_SFIX);
38165 else if (out_n == 8 && in_n == 8)
38166 return ix86_get_builtin (IX86_BUILTIN_FLOORPS_SFIX256);
38167 else if (out_n == 16 && in_n == 16)
38168 return ix86_get_builtin (IX86_BUILTIN_FLOORPS_SFIX512);
38169 }
38170 break;
38171
38172 CASE_CFN_ICEIL:
38173 CASE_CFN_LCEIL:
38174 CASE_CFN_LLCEIL:
38175 /* The round insn does not trap on denormals. */
38176 if (flag_trapping_math || !TARGET_ROUND)
38177 break;
38178
38179 if (out_mode == SImode && in_mode == DFmode)
38180 {
38181 if (out_n == 4 && in_n == 2)
38182 return ix86_get_builtin (IX86_BUILTIN_CEILPD_VEC_PACK_SFIX);
38183 else if (out_n == 8 && in_n == 4)
38184 return ix86_get_builtin (IX86_BUILTIN_CEILPD_VEC_PACK_SFIX256);
38185 else if (out_n == 16 && in_n == 8)
38186 return ix86_get_builtin (IX86_BUILTIN_CEILPD_VEC_PACK_SFIX512);
38187 }
38188 if (out_mode == SImode && in_mode == SFmode)
38189 {
38190 if (out_n == 4 && in_n == 4)
38191 return ix86_get_builtin (IX86_BUILTIN_CEILPS_SFIX);
38192 else if (out_n == 8 && in_n == 8)
38193 return ix86_get_builtin (IX86_BUILTIN_CEILPS_SFIX256);
38194 else if (out_n == 16 && in_n == 16)
38195 return ix86_get_builtin (IX86_BUILTIN_CEILPS_SFIX512);
38196 }
38197 break;
38198
38199 CASE_CFN_IRINT:
38200 CASE_CFN_LRINT:
38201 CASE_CFN_LLRINT:
38202 if (out_mode == SImode && in_mode == DFmode)
38203 {
38204 if (out_n == 4 && in_n == 2)
38205 return ix86_get_builtin (IX86_BUILTIN_VEC_PACK_SFIX);
38206 else if (out_n == 8 && in_n == 4)
38207 return ix86_get_builtin (IX86_BUILTIN_VEC_PACK_SFIX256);
38208 else if (out_n == 16 && in_n == 8)
38209 return ix86_get_builtin (IX86_BUILTIN_VEC_PACK_SFIX512);
38210 }
38211 if (out_mode == SImode && in_mode == SFmode)
38212 {
38213 if (out_n == 4 && in_n == 4)
38214 return ix86_get_builtin (IX86_BUILTIN_CVTPS2DQ);
38215 else if (out_n == 8 && in_n == 8)
38216 return ix86_get_builtin (IX86_BUILTIN_CVTPS2DQ256);
38217 else if (out_n == 16 && in_n == 16)
38218 return ix86_get_builtin (IX86_BUILTIN_CVTPS2DQ512);
38219 }
38220 break;
38221
38222 CASE_CFN_IROUND:
38223 CASE_CFN_LROUND:
38224 CASE_CFN_LLROUND:
38225 /* The round insn does not trap on denormals. */
38226 if (flag_trapping_math || !TARGET_ROUND)
38227 break;
38228
38229 if (out_mode == SImode && in_mode == DFmode)
38230 {
38231 if (out_n == 4 && in_n == 2)
38232 return ix86_get_builtin (IX86_BUILTIN_ROUNDPD_AZ_VEC_PACK_SFIX);
38233 else if (out_n == 8 && in_n == 4)
38234 return ix86_get_builtin (IX86_BUILTIN_ROUNDPD_AZ_VEC_PACK_SFIX256);
38235 else if (out_n == 16 && in_n == 8)
38236 return ix86_get_builtin (IX86_BUILTIN_ROUNDPD_AZ_VEC_PACK_SFIX512);
38237 }
38238 if (out_mode == SImode && in_mode == SFmode)
38239 {
38240 if (out_n == 4 && in_n == 4)
38241 return ix86_get_builtin (IX86_BUILTIN_ROUNDPS_AZ_SFIX);
38242 else if (out_n == 8 && in_n == 8)
38243 return ix86_get_builtin (IX86_BUILTIN_ROUNDPS_AZ_SFIX256);
38244 else if (out_n == 16 && in_n == 16)
38245 return ix86_get_builtin (IX86_BUILTIN_ROUNDPS_AZ_SFIX512);
38246 }
38247 break;
38248
38249 CASE_CFN_FLOOR:
38250 /* The round insn does not trap on denormals. */
38251 if (flag_trapping_math || !TARGET_ROUND)
38252 break;
38253
38254 if (out_mode == DFmode && in_mode == DFmode)
38255 {
38256 if (out_n == 2 && in_n == 2)
38257 return ix86_get_builtin (IX86_BUILTIN_FLOORPD);
38258 else if (out_n == 4 && in_n == 4)
38259 return ix86_get_builtin (IX86_BUILTIN_FLOORPD256);
38260 else if (out_n == 8 && in_n == 8)
38261 return ix86_get_builtin (IX86_BUILTIN_FLOORPD512);
38262 }
38263 if (out_mode == SFmode && in_mode == SFmode)
38264 {
38265 if (out_n == 4 && in_n == 4)
38266 return ix86_get_builtin (IX86_BUILTIN_FLOORPS);
38267 else if (out_n == 8 && in_n == 8)
38268 return ix86_get_builtin (IX86_BUILTIN_FLOORPS256);
38269 else if (out_n == 16 && in_n == 16)
38270 return ix86_get_builtin (IX86_BUILTIN_FLOORPS512);
38271 }
38272 break;
38273
38274 CASE_CFN_CEIL:
38275 /* The round insn does not trap on denormals. */
38276 if (flag_trapping_math || !TARGET_ROUND)
38277 break;
38278
38279 if (out_mode == DFmode && in_mode == DFmode)
38280 {
38281 if (out_n == 2 && in_n == 2)
38282 return ix86_get_builtin (IX86_BUILTIN_CEILPD);
38283 else if (out_n == 4 && in_n == 4)
38284 return ix86_get_builtin (IX86_BUILTIN_CEILPD256);
38285 else if (out_n == 8 && in_n == 8)
38286 return ix86_get_builtin (IX86_BUILTIN_CEILPD512);
38287 }
38288 if (out_mode == SFmode && in_mode == SFmode)
38289 {
38290 if (out_n == 4 && in_n == 4)
38291 return ix86_get_builtin (IX86_BUILTIN_CEILPS);
38292 else if (out_n == 8 && in_n == 8)
38293 return ix86_get_builtin (IX86_BUILTIN_CEILPS256);
38294 else if (out_n == 16 && in_n == 16)
38295 return ix86_get_builtin (IX86_BUILTIN_CEILPS512);
38296 }
38297 break;
38298
38299 CASE_CFN_TRUNC:
38300 /* The round insn does not trap on denormals. */
38301 if (flag_trapping_math || !TARGET_ROUND)
38302 break;
38303
38304 if (out_mode == DFmode && in_mode == DFmode)
38305 {
38306 if (out_n == 2 && in_n == 2)
38307 return ix86_get_builtin (IX86_BUILTIN_TRUNCPD);
38308 else if (out_n == 4 && in_n == 4)
38309 return ix86_get_builtin (IX86_BUILTIN_TRUNCPD256);
38310 else if (out_n == 8 && in_n == 8)
38311 return ix86_get_builtin (IX86_BUILTIN_TRUNCPD512);
38312 }
38313 if (out_mode == SFmode && in_mode == SFmode)
38314 {
38315 if (out_n == 4 && in_n == 4)
38316 return ix86_get_builtin (IX86_BUILTIN_TRUNCPS);
38317 else if (out_n == 8 && in_n == 8)
38318 return ix86_get_builtin (IX86_BUILTIN_TRUNCPS256);
38319 else if (out_n == 16 && in_n == 16)
38320 return ix86_get_builtin (IX86_BUILTIN_TRUNCPS512);
38321 }
38322 break;
38323
38324 CASE_CFN_RINT:
38325 /* The round insn does not trap on denormals. */
38326 if (flag_trapping_math || !TARGET_ROUND)
38327 break;
38328
38329 if (out_mode == DFmode && in_mode == DFmode)
38330 {
38331 if (out_n == 2 && in_n == 2)
38332 return ix86_get_builtin (IX86_BUILTIN_RINTPD);
38333 else if (out_n == 4 && in_n == 4)
38334 return ix86_get_builtin (IX86_BUILTIN_RINTPD256);
38335 }
38336 if (out_mode == SFmode && in_mode == SFmode)
38337 {
38338 if (out_n == 4 && in_n == 4)
38339 return ix86_get_builtin (IX86_BUILTIN_RINTPS);
38340 else if (out_n == 8 && in_n == 8)
38341 return ix86_get_builtin (IX86_BUILTIN_RINTPS256);
38342 }
38343 break;
38344
38345 CASE_CFN_FMA:
38346 if (out_mode == DFmode && in_mode == DFmode)
38347 {
38348 if (out_n == 2 && in_n == 2)
38349 return ix86_get_builtin (IX86_BUILTIN_VFMADDPD);
38350 if (out_n == 4 && in_n == 4)
38351 return ix86_get_builtin (IX86_BUILTIN_VFMADDPD256);
38352 }
38353 if (out_mode == SFmode && in_mode == SFmode)
38354 {
38355 if (out_n == 4 && in_n == 4)
38356 return ix86_get_builtin (IX86_BUILTIN_VFMADDPS);
38357 if (out_n == 8 && in_n == 8)
38358 return ix86_get_builtin (IX86_BUILTIN_VFMADDPS256);
38359 }
38360 break;
38361
38362 default:
38363 break;
38364 }
38365
38366 /* Dispatch to a handler for a vectorization library. */
38367 if (ix86_veclib_handler)
38368 return ix86_veclib_handler (combined_fn (fn), type_out, type_in);
38369
38370 return NULL_TREE;
38371 }
38372
38373 /* Handler for an SVML-style interface to
38374 a library with vectorized intrinsics. */
38375
38376 static tree
38377 ix86_veclibabi_svml (combined_fn fn, tree type_out, tree type_in)
38378 {
38379 char name[20];
38380 tree fntype, new_fndecl, args;
38381 unsigned arity;
38382 const char *bname;
38383 machine_mode el_mode, in_mode;
38384 int n, in_n;
38385
38386 /* The SVML is suitable for unsafe math only. */
38387 if (!flag_unsafe_math_optimizations)
38388 return NULL_TREE;
38389
38390 el_mode = TYPE_MODE (TREE_TYPE (type_out));
38391 n = TYPE_VECTOR_SUBPARTS (type_out);
38392 in_mode = TYPE_MODE (TREE_TYPE (type_in));
38393 in_n = TYPE_VECTOR_SUBPARTS (type_in);
38394 if (el_mode != in_mode
38395 || n != in_n)
38396 return NULL_TREE;
38397
38398 switch (fn)
38399 {
38400 CASE_CFN_EXP:
38401 CASE_CFN_LOG:
38402 CASE_CFN_LOG10:
38403 CASE_CFN_POW:
38404 CASE_CFN_TANH:
38405 CASE_CFN_TAN:
38406 CASE_CFN_ATAN:
38407 CASE_CFN_ATAN2:
38408 CASE_CFN_ATANH:
38409 CASE_CFN_CBRT:
38410 CASE_CFN_SINH:
38411 CASE_CFN_SIN:
38412 CASE_CFN_ASINH:
38413 CASE_CFN_ASIN:
38414 CASE_CFN_COSH:
38415 CASE_CFN_COS:
38416 CASE_CFN_ACOSH:
38417 CASE_CFN_ACOS:
38418 if ((el_mode != DFmode || n != 2)
38419 && (el_mode != SFmode || n != 4))
38420 return NULL_TREE;
38421 break;
38422
38423 default:
38424 return NULL_TREE;
38425 }
38426
38427 tree fndecl = mathfn_built_in (TREE_TYPE (type_in), fn);
38428 bname = IDENTIFIER_POINTER (DECL_NAME (fndecl));
38429
38430 if (DECL_FUNCTION_CODE (fndecl) == BUILT_IN_LOGF)
38431 strcpy (name, "vmlsLn4");
38432 else if (DECL_FUNCTION_CODE (fndecl) == BUILT_IN_LOG)
38433 strcpy (name, "vmldLn2");
38434 else if (n == 4)
38435 {
38436 sprintf (name, "vmls%s", bname+10);
38437 name[strlen (name)-1] = '4';
38438 }
38439 else
38440 sprintf (name, "vmld%s2", bname+10);
38441
38442 /* Convert the first letter of the function name to uppercase. */
38443 name[4] &= ~0x20;
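/* E.g. sinf with 4-element vectors becomes "vmlsSin4" and sin with
   2-element vectors becomes "vmldSin2". */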
38444
38445 arity = 0;
38446 for (args = DECL_ARGUMENTS (fndecl); args; args = TREE_CHAIN (args))
38447 arity++;
38448
38449 if (arity == 1)
38450 fntype = build_function_type_list (type_out, type_in, NULL);
38451 else
38452 fntype = build_function_type_list (type_out, type_in, type_in, NULL);
38453
38454 /* Build a function declaration for the vectorized function. */
38455 new_fndecl = build_decl (BUILTINS_LOCATION,
38456 FUNCTION_DECL, get_identifier (name), fntype);
38457 TREE_PUBLIC (new_fndecl) = 1;
38458 DECL_EXTERNAL (new_fndecl) = 1;
38459 DECL_IS_NOVOPS (new_fndecl) = 1;
38460 TREE_READONLY (new_fndecl) = 1;
38461
38462 return new_fndecl;
38463 }
38464
38465 /* Handler for an ACML-style interface to
38466 a library with vectorized intrinsics. */
38467
38468 static tree
38469 ix86_veclibabi_acml (combined_fn fn, tree type_out, tree type_in)
38470 {
38471 char name[20] = "__vr.._";
38472 tree fntype, new_fndecl, args;
38473 unsigned arity;
38474 const char *bname;
38475 machine_mode el_mode, in_mode;
38476 int n, in_n;
38477
38478 /* The ACML is 64-bit only and suitable for unsafe math only, as
38479 it does not correctly support parts of IEEE arithmetic with the
38480 required precision, such as denormals. */
38481 if (!TARGET_64BIT
38482 || !flag_unsafe_math_optimizations)
38483 return NULL_TREE;
38484
38485 el_mode = TYPE_MODE (TREE_TYPE (type_out));
38486 n = TYPE_VECTOR_SUBPARTS (type_out);
38487 in_mode = TYPE_MODE (TREE_TYPE (type_in));
38488 in_n = TYPE_VECTOR_SUBPARTS (type_in);
38489 if (el_mode != in_mode
38490 || n != in_n)
38491 return NULL_TREE;
38492
38493 switch (fn)
38494 {
38495 CASE_CFN_SIN:
38496 CASE_CFN_COS:
38497 CASE_CFN_EXP:
38498 CASE_CFN_LOG:
38499 CASE_CFN_LOG2:
38500 CASE_CFN_LOG10:
38501 if (el_mode == DFmode && n == 2)
38502 {
38503 name[4] = 'd';
38504 name[5] = '2';
38505 }
38506 else if (el_mode == SFmode && n == 4)
38507 {
38508 name[4] = 's';
38509 name[5] = '4';
38510 }
38511 else
38512 return NULL_TREE;
38513 break;
38514
38515 default:
38516 return NULL_TREE;
38517 }
38518
38519 tree fndecl = mathfn_built_in (TREE_TYPE (type_in), fn);
38520 bname = IDENTIFIER_POINTER (DECL_NAME (fndecl));
38521 sprintf (name + 7, "%s", bname+10);
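/* E.g. this yields "__vrd2_sin" for sin and "__vrs4_sinf" for sinf. */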
38522
38523 arity = 0;
38524 for (args = DECL_ARGUMENTS (fndecl); args; args = TREE_CHAIN (args))
38525 arity++;
38526
38527 if (arity == 1)
38528 fntype = build_function_type_list (type_out, type_in, NULL);
38529 else
38530 fntype = build_function_type_list (type_out, type_in, type_in, NULL);
38531
38532 /* Build a function declaration for the vectorized function. */
38533 new_fndecl = build_decl (BUILTINS_LOCATION,
38534 FUNCTION_DECL, get_identifier (name), fntype);
38535 TREE_PUBLIC (new_fndecl) = 1;
38536 DECL_EXTERNAL (new_fndecl) = 1;
38537 DECL_IS_NOVOPS (new_fndecl) = 1;
38538 TREE_READONLY (new_fndecl) = 1;
38539
38540 return new_fndecl;
38541 }
38542
38543 /* Returns a decl of a function that implements gather load with
38544 memory type MEM_VECTYPE, index type INDEX_TYPE and scale SCALE.
38545 Return NULL_TREE if it is not available. */
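/* For example, a V4DFmode gather with SImode indices resolves to
   IX86_BUILTIN_GATHER3ALTSIV4DF with AVX-512VL and to
   IX86_BUILTIN_GATHERALTSIV4DF otherwise. */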
38546
38547 static tree
38548 ix86_vectorize_builtin_gather (const_tree mem_vectype,
38549 const_tree index_type, int scale)
38550 {
38551 bool si;
38552 enum ix86_builtins code;
38553
38554 if (! TARGET_AVX2)
38555 return NULL_TREE;
38556
38557 if ((TREE_CODE (index_type) != INTEGER_TYPE
38558 && !POINTER_TYPE_P (index_type))
38559 || (TYPE_MODE (index_type) != SImode
38560 && TYPE_MODE (index_type) != DImode))
38561 return NULL_TREE;
38562
38563 if (TYPE_PRECISION (index_type) > POINTER_SIZE)
38564 return NULL_TREE;
38565
38566 /* v*gather* insn sign extends index to pointer mode. */
38567 if (TYPE_PRECISION (index_type) < POINTER_SIZE
38568 && TYPE_UNSIGNED (index_type))
38569 return NULL_TREE;
38570
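/* Scale can be 1, 2, 4 or 8. */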
38571 if (scale <= 0
38572 || scale > 8
38573 || (scale & (scale - 1)) != 0)
38574 return NULL_TREE;
38575
38576 si = TYPE_MODE (index_type) == SImode;
38577 switch (TYPE_MODE (mem_vectype))
38578 {
38579 case V2DFmode:
38580 if (TARGET_AVX512VL)
38581 code = si ? IX86_BUILTIN_GATHER3SIV2DF : IX86_BUILTIN_GATHER3DIV2DF;
38582 else
38583 code = si ? IX86_BUILTIN_GATHERSIV2DF : IX86_BUILTIN_GATHERDIV2DF;
38584 break;
38585 case V4DFmode:
38586 if (TARGET_AVX512VL)
38587 code = si ? IX86_BUILTIN_GATHER3ALTSIV4DF : IX86_BUILTIN_GATHER3DIV4DF;
38588 else
38589 code = si ? IX86_BUILTIN_GATHERALTSIV4DF : IX86_BUILTIN_GATHERDIV4DF;
38590 break;
38591 case V2DImode:
38592 if (TARGET_AVX512VL)
38593 code = si ? IX86_BUILTIN_GATHER3SIV2DI : IX86_BUILTIN_GATHER3DIV2DI;
38594 else
38595 code = si ? IX86_BUILTIN_GATHERSIV2DI : IX86_BUILTIN_GATHERDIV2DI;
38596 break;
38597 case V4DImode:
38598 if (TARGET_AVX512VL)
38599 code = si ? IX86_BUILTIN_GATHER3ALTSIV4DI : IX86_BUILTIN_GATHER3DIV4DI;
38600 else
38601 code = si ? IX86_BUILTIN_GATHERALTSIV4DI : IX86_BUILTIN_GATHERDIV4DI;
38602 break;
38603 case V4SFmode:
38604 if (TARGET_AVX512VL)
38605 code = si ? IX86_BUILTIN_GATHER3SIV4SF : IX86_BUILTIN_GATHER3DIV4SF;
38606 else
38607 code = si ? IX86_BUILTIN_GATHERSIV4SF : IX86_BUILTIN_GATHERDIV4SF;
38608 break;
38609 case V8SFmode:
38610 if (TARGET_AVX512VL)
38611 code = si ? IX86_BUILTIN_GATHER3SIV8SF : IX86_BUILTIN_GATHER3ALTDIV8SF;
38612 else
38613 code = si ? IX86_BUILTIN_GATHERSIV8SF : IX86_BUILTIN_GATHERALTDIV8SF;
38614 break;
38615 case V4SImode:
38616 if (TARGET_AVX512VL)
38617 code = si ? IX86_BUILTIN_GATHER3SIV4SI : IX86_BUILTIN_GATHER3DIV4SI;
38618 else
38619 code = si ? IX86_BUILTIN_GATHERSIV4SI : IX86_BUILTIN_GATHERDIV4SI;
38620 break;
38621 case V8SImode:
38622 if (TARGET_AVX512VL)
38623 code = si ? IX86_BUILTIN_GATHER3SIV8SI : IX86_BUILTIN_GATHER3ALTDIV8SI;
38624 else
38625 code = si ? IX86_BUILTIN_GATHERSIV8SI : IX86_BUILTIN_GATHERALTDIV8SI;
38626 break;
38627 case V8DFmode:
38628 if (TARGET_AVX512F)
38629 code = si ? IX86_BUILTIN_GATHER3ALTSIV8DF : IX86_BUILTIN_GATHER3DIV8DF;
38630 else
38631 return NULL_TREE;
38632 break;
38633 case V8DImode:
38634 if (TARGET_AVX512F)
38635 code = si ? IX86_BUILTIN_GATHER3ALTSIV8DI : IX86_BUILTIN_GATHER3DIV8DI;
38636 else
38637 return NULL_TREE;
38638 break;
38639 case V16SFmode:
38640 if (TARGET_AVX512F)
38641 code = si ? IX86_BUILTIN_GATHER3SIV16SF : IX86_BUILTIN_GATHER3ALTDIV16SF;
38642 else
38643 return NULL_TREE;
38644 break;
38645 case V16SImode:
38646 if (TARGET_AVX512F)
38647 code = si ? IX86_BUILTIN_GATHER3SIV16SI : IX86_BUILTIN_GATHER3ALTDIV16SI;
38648 else
38649 return NULL_TREE;
38650 break;
38651 default:
38652 return NULL_TREE;
38653 }
38654
38655 return ix86_get_builtin (code);
38656 }
38657
38658 /* Returns a decl of a function that implements scatter store with
38659 register type VECTYPE and index type INDEX_TYPE and SCALE.
38660 Return NULL_TREE if it is not available. */
38661
38662 static tree
38663 ix86_vectorize_builtin_scatter (const_tree vectype,
38664 const_tree index_type, int scale)
38665 {
38666 bool si;
38667 enum ix86_builtins code;
38668
38669 if (!TARGET_AVX512F)
38670 return NULL_TREE;
38671
38672 if ((TREE_CODE (index_type) != INTEGER_TYPE
38673 && !POINTER_TYPE_P (index_type))
38674 || (TYPE_MODE (index_type) != SImode
38675 && TYPE_MODE (index_type) != DImode))
38676 return NULL_TREE;
38677
38678 if (TYPE_PRECISION (index_type) > POINTER_SIZE)
38679 return NULL_TREE;
38680
38681 /* v*scatter* insn sign extends index to pointer mode. */
38682 if (TYPE_PRECISION (index_type) < POINTER_SIZE
38683 && TYPE_UNSIGNED (index_type))
38684 return NULL_TREE;
38685
38686 /* Scale can be 1, 2, 4 or 8. */
38687 if (scale <= 0
38688 || scale > 8
38689 || (scale & (scale - 1)) != 0)
38690 return NULL_TREE;
38691
38692 si = TYPE_MODE (index_type) == SImode;
38693 switch (TYPE_MODE (vectype))
38694 {
38695 case V8DFmode:
38696 code = si ? IX86_BUILTIN_SCATTERALTSIV8DF : IX86_BUILTIN_SCATTERDIV8DF;
38697 break;
38698 case V8DImode:
38699 code = si ? IX86_BUILTIN_SCATTERALTSIV8DI : IX86_BUILTIN_SCATTERDIV8DI;
38700 break;
38701 case V16SFmode:
38702 code = si ? IX86_BUILTIN_SCATTERSIV16SF : IX86_BUILTIN_SCATTERALTDIV16SF;
38703 break;
38704 case V16SImode:
38705 code = si ? IX86_BUILTIN_SCATTERSIV16SI : IX86_BUILTIN_SCATTERALTDIV16SI;
38706 break;
38707 default:
38708 return NULL_TREE;
38709 }
38710
38711 return ix86_builtins[code];
38712 }
38713
38714 /* Return true if it is safe to use the rsqrt optabs to optimize
38715 1.0/sqrt. */
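/* Roughly, this amounts to SSE math (e.g. -mfpmath=sse) combined with
   the -ffast-math style flags checked below. */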
38716
38717 static bool
38718 use_rsqrt_p ()
38719 {
38720 return (TARGET_SSE_MATH
38721 && flag_finite_math_only
38722 && !flag_trapping_math
38723 && flag_unsafe_math_optimizations);
38724 }
38725
38726 /* Returns a decl of a target-specific builtin that implements the
38727 reciprocal of the function FNDECL, or NULL_TREE if not available. */
38728
38729 static tree
38730 ix86_builtin_reciprocal (tree fndecl)
38731 {
38732 switch (DECL_FUNCTION_CODE (fndecl))
38733 {
38734 /* Vectorized version of sqrt to rsqrt conversion. */
38735 case IX86_BUILTIN_SQRTPS_NR:
38736 return ix86_get_builtin (IX86_BUILTIN_RSQRTPS_NR);
38737
38738 case IX86_BUILTIN_SQRTPS_NR256:
38739 return ix86_get_builtin (IX86_BUILTIN_RSQRTPS_NR256);
38740
38741 default:
38742 return NULL_TREE;
38743 }
38744 }
38745 \f
38746 /* Helper for avx_vpermilps256_operand et al. This is also used by
38747 the expansion functions to turn the parallel back into a mask.
38748 The return value is 0 for no match and the imm8+1 for a match. */
38749
38750 int
38751 avx_vpermilp_parallel (rtx par, machine_mode mode)
38752 {
38753 unsigned i, nelt = GET_MODE_NUNITS (mode);
38754 unsigned mask = 0;
38755 unsigned char ipar[16] = {}; /* Silence -Wuninitialized warning. */
38756
38757 if (XVECLEN (par, 0) != (int) nelt)
38758 return 0;
38759
38760 /* Validate that all of the elements are constants, and not totally
38761 out of range. Copy the data into an integral array to make the
38762 subsequent checks easier. */
38763 for (i = 0; i < nelt; ++i)
38764 {
38765 rtx er = XVECEXP (par, 0, i);
38766 unsigned HOST_WIDE_INT ei;
38767
38768 if (!CONST_INT_P (er))
38769 return 0;
38770 ei = INTVAL (er);
38771 if (ei >= nelt)
38772 return 0;
38773 ipar[i] = ei;
38774 }
38775
38776 switch (mode)
38777 {
38778 case V8DFmode:
38779 /* In the 512-bit DFmode case, we can only move elements within
38780 a 128-bit lane. First fill the second part of the mask,
38781 then fallthru. */
38782 for (i = 4; i < 6; ++i)
38783 {
38784 if (ipar[i] < 4 || ipar[i] >= 6)
38785 return 0;
38786 mask |= (ipar[i] - 4) << i;
38787 }
38788 for (i = 6; i < 8; ++i)
38789 {
38790 if (ipar[i] < 6)
38791 return 0;
38792 mask |= (ipar[i] - 6) << i;
38793 }
38794 /* FALLTHRU */
38795
38796 case V4DFmode:
38797 /* In the 256-bit DFmode case, we can only move elements within
38798 a 128-bit lane. */
38799 for (i = 0; i < 2; ++i)
38800 {
38801 if (ipar[i] >= 2)
38802 return 0;
38803 mask |= ipar[i] << i;
38804 }
38805 for (i = 2; i < 4; ++i)
38806 {
38807 if (ipar[i] < 2)
38808 return 0;
38809 mask |= (ipar[i] - 2) << i;
38810 }
38811 break;
38812
38813 case V16SFmode:
38814 /* In the 512-bit SFmode case, the permutation in the upper 256 bits
38815 must mirror the permutation in the lower 256 bits. */
38816 for (i = 0; i < 8; ++i)
38817 if (ipar[i] + 8 != ipar[i + 8])
38818 return 0;
38819 /* FALLTHRU */
38820
38821 case V8SFmode:
38822 /* In the 256-bit SFmode case, we have full freedom of
38823 movement within the low 128-bit lane, but the high 128-bit
38824 lane must mirror the exact same pattern. */
38825 for (i = 0; i < 4; ++i)
38826 if (ipar[i] + 4 != ipar[i + 4])
38827 return 0;
38828 nelt = 4;
38829 /* FALLTHRU */
38830
38831 case V2DFmode:
38832 case V4SFmode:
38833 /* In the 128-bit case, we have full freedom in the placement of
38834 the elements from the source operand. */
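/* E.g. for V4SFmode the parallel (1 0 3 2) gives imm8 0xb1, so the
   return value is 0xb2. */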
38835 for (i = 0; i < nelt; ++i)
38836 mask |= ipar[i] << (i * (nelt / 2));
38837 break;
38838
38839 default:
38840 gcc_unreachable ();
38841 }
38842
38843 /* Make sure success has a non-zero value by adding one. */
38844 return mask + 1;
38845 }
38846
38847 /* Helper for avx_vperm2f128_v4df_operand et al. This is also used by
38848 the expansion functions to turn the parallel back into a mask.
38849 The return value is 0 for no match and the imm8+1 for a match. */
38850
38851 int
38852 avx_vperm2f128_parallel (rtx par, machine_mode mode)
38853 {
38854 unsigned i, nelt = GET_MODE_NUNITS (mode), nelt2 = nelt / 2;
38855 unsigned mask = 0;
38856 unsigned char ipar[8] = {}; /* Silence -Wuninitialized warning. */
38857
38858 if (XVECLEN (par, 0) != (int) nelt)
38859 return 0;
38860
38861 /* Validate that all of the elements are constants, and not totally
38862 out of range. Copy the data into an integral array to make the
38863 subsequent checks easier. */
38864 for (i = 0; i < nelt; ++i)
38865 {
38866 rtx er = XVECEXP (par, 0, i);
38867 unsigned HOST_WIDE_INT ei;
38868
38869 if (!CONST_INT_P (er))
38870 return 0;
38871 ei = INTVAL (er);
38872 if (ei >= 2 * nelt)
38873 return 0;
38874 ipar[i] = ei;
38875 }
38876
38877 /* Validate that the halves of the permute are halves. */
38878 for (i = 0; i < nelt2 - 1; ++i)
38879 if (ipar[i] + 1 != ipar[i + 1])
38880 return 0;
38881 for (i = nelt2; i < nelt - 1; ++i)
38882 if (ipar[i] + 1 != ipar[i + 1])
38883 return 0;
38884
38885 /* Reconstruct the mask. */
38886 for (i = 0; i < 2; ++i)
38887 {
38888 unsigned e = ipar[i * nelt2];
38889 if (e % nelt2)
38890 return 0;
38891 e /= nelt2;
38892 mask |= e << (i * 4);
38893 }
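/* E.g. for V4DFmode the parallel (2 3 4 5) reconstructs imm8 0x21
   (low lane from the high half of the first operand, high lane from
   the low half of the second), so the return value is 0x22. */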
38894
38895 /* Make sure success has a non-zero value by adding one. */
38896 return mask + 1;
38897 }
38898 \f
38899 /* Return a register priority for hard reg HARD_REGNO. */
38900 static int
38901 ix86_register_priority (int hard_regno)
38902 {
38903 /* ebp and r13 as the base always want a displacement, and r12 as the
38904 base always wants an index. So discourage their use in an
38905 address. */
38906 if (hard_regno == R12_REG || hard_regno == R13_REG)
38907 return 0;
38908 if (hard_regno == BP_REG)
38909 return 1;
38910 /* New x86-64 int registers result in bigger code size. Discourage
38911 them. */
38912 if (FIRST_REX_INT_REG <= hard_regno && hard_regno <= LAST_REX_INT_REG)
38913 return 2;
38914 /* New x86-64 SSE registers result in bigger code size. Discourage
38915 them. */
38916 if (FIRST_REX_SSE_REG <= hard_regno && hard_regno <= LAST_REX_SSE_REG)
38917 return 2;
38918 /* Usage of AX register results in smaller code. Prefer it. */
38919 if (hard_regno == AX_REG)
38920 return 4;
38921 return 3;
38922 }
38923
38924 /* Implement TARGET_PREFERRED_RELOAD_CLASS.
38925
38926 Put float CONST_DOUBLE in the constant pool instead of fp regs.
38927 QImode must go into class Q_REGS.
38928 Narrow ALL_REGS to GENERAL_REGS. This supports allowing movsf and
38929 movdf to do mem-to-mem moves through integer regs. */
38930
38931 static reg_class_t
38932 ix86_preferred_reload_class (rtx x, reg_class_t regclass)
38933 {
38934 machine_mode mode = GET_MODE (x);
38935
38936 /* We're only allowed to return a subclass of CLASS. Many of the
38937 following checks fail for NO_REGS, so eliminate that early. */
38938 if (regclass == NO_REGS)
38939 return NO_REGS;
38940
38941 /* All classes can load zeros. */
38942 if (x == CONST0_RTX (mode))
38943 return regclass;
38944
38945 /* Force constants into memory if we are loading a (nonzero) constant into
38946 an MMX, SSE or MASK register. This is because there are no MMX/SSE/MASK
38947 instructions to load from a constant. */
38948 if (CONSTANT_P (x)
38949 && (MAYBE_MMX_CLASS_P (regclass)
38950 || MAYBE_SSE_CLASS_P (regclass)
38951 || MAYBE_MASK_CLASS_P (regclass)))
38952 return NO_REGS;
38953
38954 /* Floating-point constants need more complex checks. */
38955 if (CONST_DOUBLE_P (x))
38956 {
38957 /* General regs can load everything. */
38958 if (INTEGER_CLASS_P (regclass))
38959 return regclass;
38960
38961 /* Floats can load 0 and 1 plus some others. Note that we eliminated
38962 zero above. We only want to wind up preferring 80387 registers if
38963 we plan on doing computation with them. */
38964 if (IS_STACK_MODE (mode)
38965 && standard_80387_constant_p (x) > 0)
38966 {
38967 /* Limit class to FP regs. */
38968 if (FLOAT_CLASS_P (regclass))
38969 return FLOAT_REGS;
38970 else if (regclass == FP_TOP_SSE_REGS)
38971 return FP_TOP_REG;
38972 else if (regclass == FP_SECOND_SSE_REGS)
38973 return FP_SECOND_REG;
38974 }
38975
38976 return NO_REGS;
38977 }
38978
38979 /* Prefer SSE regs only, if we can use them for math. */
38980 if (SSE_FLOAT_MODE_P (mode) && TARGET_SSE_MATH)
38981 return SSE_CLASS_P (regclass) ? regclass : NO_REGS;
38982
38983 /* Generally when we see PLUS here, it's the function invariant
38984 (plus soft-fp const_int). Which can only be computed into general
38985 regs. */
38986 if (GET_CODE (x) == PLUS)
38987 return INTEGER_CLASS_P (regclass) ? regclass : NO_REGS;
38988
38989 /* QImode constants are easy to load, but non-constant QImode data
38990 must go into Q_REGS. */
38991 if (GET_MODE (x) == QImode && !CONSTANT_P (x))
38992 {
38993 if (Q_CLASS_P (regclass))
38994 return regclass;
38995 else if (reg_class_subset_p (Q_REGS, regclass))
38996 return Q_REGS;
38997 else
38998 return NO_REGS;
38999 }
39000
39001 return regclass;
39002 }
39003
39004 /* Discourage putting floating-point values in SSE registers unless
39005 SSE math is being used, and likewise for the 387 registers. */
39006 static reg_class_t
39007 ix86_preferred_output_reload_class (rtx x, reg_class_t regclass)
39008 {
39009 machine_mode mode = GET_MODE (x);
39010
39011 /* Restrict the output reload class to the register bank that we are doing
39012 math on. If we would like not to return a subset of CLASS, reject this
39013 alternative: if reload cannot do this, it will still use its choice. */
39014 mode = GET_MODE (x);
39015 if (SSE_FLOAT_MODE_P (mode) && TARGET_SSE_MATH)
39016 return MAYBE_SSE_CLASS_P (regclass) ? ALL_SSE_REGS : NO_REGS;
39017
39018 if (IS_STACK_MODE (mode))
39019 {
39020 if (regclass == FP_TOP_SSE_REGS)
39021 return FP_TOP_REG;
39022 else if (regclass == FP_SECOND_SSE_REGS)
39023 return FP_SECOND_REG;
39024 else
39025 return FLOAT_CLASS_P (regclass) ? regclass : NO_REGS;
39026 }
39027
39028 return regclass;
39029 }
39030
39031 static reg_class_t
39032 ix86_secondary_reload (bool in_p, rtx x, reg_class_t rclass,
39033 machine_mode mode, secondary_reload_info *sri)
39034 {
39035 /* Double-word spills from general registers to non-offsettable memory
39036 references (zero-extended addresses) require special handling. */
39037 if (TARGET_64BIT
39038 && MEM_P (x)
39039 && GET_MODE_SIZE (mode) > UNITS_PER_WORD
39040 && INTEGER_CLASS_P (rclass)
39041 && !offsettable_memref_p (x))
39042 {
39043 sri->icode = (in_p
39044 ? CODE_FOR_reload_noff_load
39045 : CODE_FOR_reload_noff_store);
39046 /* Add the cost of moving address to a temporary. */
39047 sri->extra_cost = 1;
39048
39049 return NO_REGS;
39050 }
39051
39052 /* QImode spills from non-QI registers require an
39053 intermediate register on 32-bit targets. */
39054 if (mode == QImode
39055 && (MAYBE_MASK_CLASS_P (rclass)
39056 || (!TARGET_64BIT && !in_p
39057 && INTEGER_CLASS_P (rclass)
39058 && MAYBE_NON_Q_CLASS_P (rclass))))
39059 {
39060 int regno;
39061
39062 if (REG_P (x))
39063 regno = REGNO (x);
39064 else
39065 regno = -1;
39066
39067 if (regno >= FIRST_PSEUDO_REGISTER || SUBREG_P (x))
39068 regno = true_regnum (x);
39069
39070 /* Return Q_REGS if the operand is in memory. */
39071 if (regno == -1)
39072 return Q_REGS;
39073 }
39074
39075 /* This condition handles corner case where an expression involving
39076 pointers gets vectorized. We're trying to use the address of a
39077 stack slot as a vector initializer.
39078
39079 (set (reg:V2DI 74 [ vect_cst_.2 ])
39080 (vec_duplicate:V2DI (reg/f:DI 20 frame)))
39081
39082 Eventually frame gets turned into sp+offset like this:
39083
39084 (set (reg:V2DI 21 xmm0 [orig:74 vect_cst_.2 ] [74])
39085 (vec_duplicate:V2DI (plus:DI (reg/f:DI 7 sp)
39086 (const_int 392 [0x188]))))
39087
39088 That later gets turned into:
39089
39090 (set (reg:V2DI 21 xmm0 [orig:74 vect_cst_.2 ] [74])
39091 (vec_duplicate:V2DI (plus:DI (reg/f:DI 7 sp)
39092 (mem/u/c/i:DI (symbol_ref/u:DI ("*.LC0") [flags 0x2]) [0 S8 A64]))))
39093
39094 We'll have the following reload recorded:
39095
39096 Reload 0: reload_in (DI) =
39097 (plus:DI (reg/f:DI 7 sp)
39098 (mem/u/c/i:DI (symbol_ref/u:DI ("*.LC0") [flags 0x2]) [0 S8 A64]))
39099 reload_out (V2DI) = (reg:V2DI 21 xmm0 [orig:74 vect_cst_.2 ] [74])
39100 SSE_REGS, RELOAD_OTHER (opnum = 0), can't combine
39101 reload_in_reg: (plus:DI (reg/f:DI 7 sp) (const_int 392 [0x188]))
39102 reload_out_reg: (reg:V2DI 21 xmm0 [orig:74 vect_cst_.2 ] [74])
39103 reload_reg_rtx: (reg:V2DI 22 xmm1)
39104
39105 Which isn't going to work since SSE instructions can't handle scalar
39106 additions. Returning GENERAL_REGS forces the addition into an integer
39107 register, and reload can handle subsequent reloads without problems. */
39108
39109 if (in_p && GET_CODE (x) == PLUS
39110 && SSE_CLASS_P (rclass)
39111 && SCALAR_INT_MODE_P (mode))
39112 return GENERAL_REGS;
39113
39114 return NO_REGS;
39115 }
39116
39117 /* Implement TARGET_CLASS_LIKELY_SPILLED_P. */
39118
39119 static bool
39120 ix86_class_likely_spilled_p (reg_class_t rclass)
39121 {
39122 switch (rclass)
39123 {
39124 case AREG:
39125 case DREG:
39126 case CREG:
39127 case BREG:
39128 case AD_REGS:
39129 case SIREG:
39130 case DIREG:
39131 case SSE_FIRST_REG:
39132 case FP_TOP_REG:
39133 case FP_SECOND_REG:
39134 case BND_REGS:
39135 return true;
39136
39137 default:
39138 break;
39139 }
39140
39141 return false;
39142 }
39143
39144 /* If we are copying between general and FP registers, we need a memory
39145 location. The same is true for SSE and MMX registers.
39146
39147 To optimize register_move_cost performance, allow inline variant.
39148
39149 The macro can't work reliably when one of the CLASSES is a class containing
39150 registers from multiple units (SSE, MMX, integer). We avoid this by never
39151 combining those units in single alternative in the machine description.
39152 Ensure that this constraint holds to avoid unexpected surprises.
39153
39154 When STRICT is false, we are being called from REGISTER_MOVE_COST, so do not
39155 enforce these sanity checks. */
39156
39157 static inline bool
39158 inline_secondary_memory_needed (enum reg_class class1, enum reg_class class2,
39159 machine_mode mode, int strict)
39160 {
39161 if (lra_in_progress && (class1 == NO_REGS || class2 == NO_REGS))
39162 return false;
39163 if (MAYBE_FLOAT_CLASS_P (class1) != FLOAT_CLASS_P (class1)
39164 || MAYBE_FLOAT_CLASS_P (class2) != FLOAT_CLASS_P (class2)
39165 || MAYBE_SSE_CLASS_P (class1) != SSE_CLASS_P (class1)
39166 || MAYBE_SSE_CLASS_P (class2) != SSE_CLASS_P (class2)
39167 || MAYBE_MMX_CLASS_P (class1) != MMX_CLASS_P (class1)
39168 || MAYBE_MMX_CLASS_P (class2) != MMX_CLASS_P (class2))
39169 {
39170 gcc_assert (!strict || lra_in_progress);
39171 return true;
39172 }
39173
39174 if (FLOAT_CLASS_P (class1) != FLOAT_CLASS_P (class2))
39175 return true;
39176
39177 /* Between mask and general, we have moves no larger than word size. */
39178 if ((MAYBE_MASK_CLASS_P (class1) != MAYBE_MASK_CLASS_P (class2))
39179 && (GET_MODE_SIZE (mode) > UNITS_PER_WORD))
39180 return true;
39181
39182 /* ??? This is a lie. We do have moves between mmx/general, and between
39183 mmx/sse2. But by saying we need secondary memory we discourage the
39184 register allocator from using the mmx registers unless needed. */
39185 if (MMX_CLASS_P (class1) != MMX_CLASS_P (class2))
39186 return true;
39187
39188 if (SSE_CLASS_P (class1) != SSE_CLASS_P (class2))
39189 {
39190 /* SSE1 doesn't have any direct moves from other classes. */
39191 if (!TARGET_SSE2)
39192 return true;
39193
39194 /* If the target says that inter-unit moves are more expensive
39195 than moving through memory, then don't generate them. */
39196 if ((SSE_CLASS_P (class1) && !TARGET_INTER_UNIT_MOVES_FROM_VEC)
39197 || (SSE_CLASS_P (class2) && !TARGET_INTER_UNIT_MOVES_TO_VEC))
39198 return true;
39199
39200 /* Between SSE and general, we have moves no larger than word size. */
39201 if (GET_MODE_SIZE (mode) > UNITS_PER_WORD)
39202 return true;
39203 }
39204
39205 return false;
39206 }
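/* For example, on a 32-bit target with SSE2 and inter-unit moves enabled,
   copying a DImode value between GENERAL_REGS and SSE_REGS still requires
   secondary memory, because GET_MODE_SIZE (DImode) == 8 exceeds
   UNITS_PER_WORD == 4 and is caught by the word-size check above.  */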
39207
39208 bool
39209 ix86_secondary_memory_needed (enum reg_class class1, enum reg_class class2,
39210 machine_mode mode, int strict)
39211 {
39212 return inline_secondary_memory_needed (class1, class2, mode, strict);
39213 }
39214
39215 /* Implement the TARGET_CLASS_MAX_NREGS hook.
39216
39217 On the 80386, this is the size of MODE in words,
39218 except in the FP regs, where a single reg is always enough. */
39219
39220 static unsigned char
39221 ix86_class_max_nregs (reg_class_t rclass, machine_mode mode)
39222 {
39223 if (MAYBE_INTEGER_CLASS_P (rclass))
39224 {
39225 if (mode == XFmode)
39226 return (TARGET_64BIT ? 2 : 3);
39227 else if (mode == XCmode)
39228 return (TARGET_64BIT ? 4 : 6);
39229 else
39230 return CEIL (GET_MODE_SIZE (mode), UNITS_PER_WORD);
39231 }
39232 else
39233 {
39234 if (COMPLEX_MODE_P (mode))
39235 return 2;
39236 else
39237 return 1;
39238 }
39239 }
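/* For example, in an integer class on a 32-bit target XFmode needs 3
   registers and DImode needs CEIL (8, 4) == 2, while in the x87 or SSE
   classes XCmode needs 2 registers and any non-complex mode needs just 1.  */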
39240
39241 /* Return true if the registers in CLASS cannot represent the change from
39242 modes FROM to TO. */
39243
39244 bool
39245 ix86_cannot_change_mode_class (machine_mode from, machine_mode to,
39246 enum reg_class regclass)
39247 {
39248 if (from == to)
39249 return false;
39250
39251 /* x87 registers can't do subreg at all, as all values are reformatted
39252 to extended precision. */
39253 if (MAYBE_FLOAT_CLASS_P (regclass))
39254 return true;
39255
39256 if (MAYBE_SSE_CLASS_P (regclass) || MAYBE_MMX_CLASS_P (regclass))
39257 {
39258 /* Vector registers do not support QI or HImode loads. If we don't
39259 disallow a change to these modes, reload will assume it's ok to
39260 drop the subreg from (subreg:SI (reg:HI 100) 0). This affects
39261 the vec_dupv4hi pattern. */
39262 if (GET_MODE_SIZE (from) < 4)
39263 return true;
39264 }
39265
39266 return false;
39267 }
39268
39269 /* Return the cost of moving data of mode M between a
39270 register and memory. A value of 2 is the default; this cost is
39271 relative to those in `REGISTER_MOVE_COST'.
39272
39273 This function is used extensively by register_move_cost, which is used to
39274 build tables at startup, so keep it inline.
39275 When IN is 2, return the maximum of the in and out move costs.
39276
39277 If moving between registers and memory is more expensive than
39278 between two registers, you should define this macro to express the
39279 relative cost.
39280
39281 Also model the increased cost of moving QImode values in non-Q_REGS
39282 classes.
39283 */
39284 static inline int
39285 inline_memory_move_cost (machine_mode mode, enum reg_class regclass,
39286 int in)
39287 {
39288 int cost;
39289 if (FLOAT_CLASS_P (regclass))
39290 {
39291 int index;
39292 switch (mode)
39293 {
39294 case SFmode:
39295 index = 0;
39296 break;
39297 case DFmode:
39298 index = 1;
39299 break;
39300 case XFmode:
39301 index = 2;
39302 break;
39303 default:
39304 return 100;
39305 }
39306 if (in == 2)
39307 return MAX (ix86_cost->fp_load [index], ix86_cost->fp_store [index]);
39308 return in ? ix86_cost->fp_load [index] : ix86_cost->fp_store [index];
39309 }
39310 if (SSE_CLASS_P (regclass))
39311 {
39312 int index;
39313 switch (GET_MODE_SIZE (mode))
39314 {
39315 case 4:
39316 index = 0;
39317 break;
39318 case 8:
39319 index = 1;
39320 break;
39321 case 16:
39322 index = 2;
39323 break;
39324 default:
39325 return 100;
39326 }
39327 if (in == 2)
39328 return MAX (ix86_cost->sse_load [index], ix86_cost->sse_store [index]);
39329 return in ? ix86_cost->sse_load [index] : ix86_cost->sse_store [index];
39330 }
39331 if (MMX_CLASS_P (regclass))
39332 {
39333 int index;
39334 switch (GET_MODE_SIZE (mode))
39335 {
39336 case 4:
39337 index = 0;
39338 break;
39339 case 8:
39340 index = 1;
39341 break;
39342 default:
39343 return 100;
39344 }
39345 if (in == 2)
39346 return MAX (ix86_cost->mmx_load [index], ix86_cost->mmx_store [index]);
39347 return in ? ix86_cost->mmx_load [index] : ix86_cost->mmx_store [index];
39348 }
39349 switch (GET_MODE_SIZE (mode))
39350 {
39351 case 1:
39352 if (Q_CLASS_P (regclass) || TARGET_64BIT)
39353 {
39354 if (!in)
39355 return ix86_cost->int_store[0];
39356 if (TARGET_PARTIAL_REG_DEPENDENCY
39357 && optimize_function_for_speed_p (cfun))
39358 cost = ix86_cost->movzbl_load;
39359 else
39360 cost = ix86_cost->int_load[0];
39361 if (in == 2)
39362 return MAX (cost, ix86_cost->int_store[0]);
39363 return cost;
39364 }
39365 else
39366 {
39367 if (in == 2)
39368 return MAX (ix86_cost->movzbl_load, ix86_cost->int_store[0] + 4);
39369 if (in)
39370 return ix86_cost->movzbl_load;
39371 else
39372 return ix86_cost->int_store[0] + 4;
39373 }
39374 break;
39375 case 2:
39376 if (in == 2)
39377 return MAX (ix86_cost->int_load[1], ix86_cost->int_store[1]);
39378 return in ? ix86_cost->int_load[1] : ix86_cost->int_store[1];
39379 default:
39380 /* Compute number of 32bit moves needed. TFmode is moved as XFmode. */
39381 if (mode == TFmode)
39382 mode = XFmode;
39383 if (in == 2)
39384 cost = MAX (ix86_cost->int_load[2] , ix86_cost->int_store[2]);
39385 else if (in)
39386 cost = ix86_cost->int_load[2];
39387 else
39388 cost = ix86_cost->int_store[2];
39389 return cost * CEIL ((int) GET_MODE_SIZE (mode), UNITS_PER_WORD);
39390 }
39391 }
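/* Worked example for the default (integer, multi-word) case above: storing
   a DImode value on a 32-bit target costs int_store[2] * CEIL (8, 4), i.e.
   two word-sized stores; TFmode is first mapped to XFmode, so with the
   default 12-byte XFmode layout it is costed as three word-sized moves.  */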
39392
39393 static int
39394 ix86_memory_move_cost (machine_mode mode, reg_class_t regclass,
39395 bool in)
39396 {
39397 return inline_memory_move_cost (mode, (enum reg_class) regclass, in ? 1 : 0);
39398 }
39399
39400
39401 /* Return the cost of moving data from a register in class CLASS1 to
39402 one in class CLASS2.
39403
39404 It is not required that the cost always equal 2 when FROM is the same as TO;
39405 on some machines it is expensive to move between registers if they are not
39406 general registers. */
39407
39408 static int
39409 ix86_register_move_cost (machine_mode mode, reg_class_t class1_i,
39410 reg_class_t class2_i)
39411 {
39412 enum reg_class class1 = (enum reg_class) class1_i;
39413 enum reg_class class2 = (enum reg_class) class2_i;
39414
39415 /* In case we require secondary memory, compute the cost of the store followed
39416 by the load. In order to avoid bad register allocation choices, we need
39417 this to be *at least* as high as the symmetric MEMORY_MOVE_COST. */
39418
39419 if (inline_secondary_memory_needed (class1, class2, mode, 0))
39420 {
39421 int cost = 1;
39422
39423 cost += inline_memory_move_cost (mode, class1, 2);
39424 cost += inline_memory_move_cost (mode, class2, 2);
39425
39426 /* When copying from a general purpose register we may emit multiple
39427 stores followed by a single load, causing a memory size mismatch stall.
39428 Count this as an arbitrarily high cost of 20. */
39429 if (targetm.class_max_nregs (class1, mode)
39430 > targetm.class_max_nregs (class2, mode))
39431 cost += 20;
39432
39433 /* In the case of FP/MMX moves, the registers actually overlap, and we
39434 have to switch modes in order to treat them differently. */
39435 if ((MMX_CLASS_P (class1) && MAYBE_FLOAT_CLASS_P (class2))
39436 || (MMX_CLASS_P (class2) && MAYBE_FLOAT_CLASS_P (class1)))
39437 cost += 20;
39438
39439 return cost;
39440 }
39441
39442 /* Moves between SSE/MMX and integer unit are expensive. */
39443 if (MMX_CLASS_P (class1) != MMX_CLASS_P (class2)
39444 || SSE_CLASS_P (class1) != SSE_CLASS_P (class2))
39445
39446 /* ??? By keeping returned value relatively high, we limit the number
39447 of moves between integer and MMX/SSE registers for all targets.
39448 Additionally, the high value prevents a problem with x86_modes_tieable_p(),
39449 where integer modes in MMX/SSE registers are not tieable
39450 because of missing QImode and HImode moves to, from or between
39451 MMX/SSE registers. */
39452 return MAX (8, ix86_cost->mmxsse_to_integer);
39453
39454 if (MAYBE_FLOAT_CLASS_P (class1))
39455 return ix86_cost->fp_move;
39456 if (MAYBE_SSE_CLASS_P (class1))
39457 return ix86_cost->sse_move;
39458 if (MAYBE_MMX_CLASS_P (class1))
39459 return ix86_cost->mmx_move;
39460 return 2;
39461 }
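/* For example, a DFmode move between FLOAT_REGS and SSE_REGS needs secondary
   memory, so its cost is 1 + MAX (fp_load[1], fp_store[1])
   + MAX (sse_load[1], sse_store[1]); neither of the two extra penalties of 20
   applies, since both classes need a single DFmode register and no MMX class
   is involved.  */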
39462
39463 /* Return TRUE if hard register REGNO can hold a value of machine-mode
39464 MODE. */
39465
39466 bool
39467 ix86_hard_regno_mode_ok (int regno, machine_mode mode)
39468 {
39469 /* Flags and only flags can only hold CCmode values. */
39470 if (CC_REGNO_P (regno))
39471 return GET_MODE_CLASS (mode) == MODE_CC;
39472 if (GET_MODE_CLASS (mode) == MODE_CC
39473 || GET_MODE_CLASS (mode) == MODE_RANDOM
39474 || GET_MODE_CLASS (mode) == MODE_PARTIAL_INT)
39475 return false;
39476 if (STACK_REGNO_P (regno))
39477 return VALID_FP_MODE_P (mode);
39478 if (MASK_REGNO_P (regno))
39479 return (VALID_MASK_REG_MODE (mode)
39480 || (TARGET_AVX512BW
39481 && VALID_MASK_AVX512BW_MODE (mode)));
39482 if (BND_REGNO_P (regno))
39483 return VALID_BND_REG_MODE (mode);
39484 if (SSE_REGNO_P (regno))
39485 {
39486 /* We implement the move patterns for all vector modes into and
39487 out of SSE registers, even when no operation instructions
39488 are available. */
39489
39490 /* For AVX-512 we allow, regardless of regno:
39491 - XI mode
39492 - any 512-bit wide vector mode
39493 - any scalar mode. */
39494 if (TARGET_AVX512F
39495 && (mode == XImode
39496 || VALID_AVX512F_REG_MODE (mode)
39497 || VALID_AVX512F_SCALAR_MODE (mode)))
39498 return true;
39499
39500 /* TODO check for QI/HI scalars. */
39501 /* AVX512VL allows SSE regs 16+ for 128/256-bit modes. */
39502 if (TARGET_AVX512VL
39503 && (mode == OImode
39504 || mode == TImode
39505 || VALID_AVX256_REG_MODE (mode)
39506 || VALID_AVX512VL_128_REG_MODE (mode)))
39507 return true;
39508
39509 /* xmm16-xmm31 are only available for AVX-512. */
39510 if (EXT_REX_SSE_REGNO_P (regno))
39511 return false;
39512
39513 /* OImode and AVX modes are available only when AVX is enabled. */
39514 return ((TARGET_AVX
39515 && VALID_AVX256_REG_OR_OI_MODE (mode))
39516 || VALID_SSE_REG_MODE (mode)
39517 || VALID_SSE2_REG_MODE (mode)
39518 || VALID_MMX_REG_MODE (mode)
39519 || VALID_MMX_REG_MODE_3DNOW (mode));
39520 }
39521 if (MMX_REGNO_P (regno))
39522 {
39523 /* We implement the move patterns for 3DNOW modes even in MMX mode,
39524 so if the register is available at all, then we can move data of
39525 the given mode into or out of it. */
39526 return (VALID_MMX_REG_MODE (mode)
39527 || VALID_MMX_REG_MODE_3DNOW (mode));
39528 }
39529
39530 if (mode == QImode)
39531 {
39532 /* Take care for QImode values - they can be in non-QI regs,
39533 but then they do cause partial register stalls. */
39534 if (ANY_QI_REGNO_P (regno))
39535 return true;
39536 if (!TARGET_PARTIAL_REG_STALL)
39537 return true;
39538 /* LRA checks if the hard register is OK for the given mode.
39539 QImode values can live in non-QI regs, so we allow all
39540 registers here. */
39541 if (lra_in_progress)
39542 return true;
39543 return !can_create_pseudo_p ();
39544 }
39545 /* We handle both integers and floats in the general purpose registers. */
39546 else if (VALID_INT_MODE_P (mode))
39547 return true;
39548 else if (VALID_FP_MODE_P (mode))
39549 return true;
39550 else if (VALID_DFP_MODE_P (mode))
39551 return true;
39552 /* Lots of MMX code casts 8 byte vector modes to DImode. If we then go
39553 on to use that value in smaller contexts, this can easily force a
39554 pseudo to be allocated to GENERAL_REGS. Since this is no worse than
39555 supporting DImode, allow it. */
39556 else if (VALID_MMX_REG_MODE_3DNOW (mode) || VALID_MMX_REG_MODE (mode))
39557 return true;
39558
39559 return false;
39560 }
39561
39562 /* A subroutine of ix86_modes_tieable_p. Return true if MODE is a
39563 tieable integer mode. */
39564
39565 static bool
39566 ix86_tieable_integer_mode_p (machine_mode mode)
39567 {
39568 switch (mode)
39569 {
39570 case HImode:
39571 case SImode:
39572 return true;
39573
39574 case QImode:
39575 return TARGET_64BIT || !TARGET_PARTIAL_REG_STALL;
39576
39577 case DImode:
39578 return TARGET_64BIT;
39579
39580 default:
39581 return false;
39582 }
39583 }
39584
39585 /* Return true if MODE1 is accessible in a register that can hold MODE2
39586 without copying. That is, all register classes that can hold MODE2
39587 can also hold MODE1. */
39588
39589 bool
39590 ix86_modes_tieable_p (machine_mode mode1, machine_mode mode2)
39591 {
39592 if (mode1 == mode2)
39593 return true;
39594
39595 if (ix86_tieable_integer_mode_p (mode1)
39596 && ix86_tieable_integer_mode_p (mode2))
39597 return true;
39598
39599 /* MODE2 being XFmode implies fp stack or general regs, which means we
39600 can tie any smaller floating point modes to it. Note that we do not
39601 tie this with TFmode. */
39602 if (mode2 == XFmode)
39603 return mode1 == SFmode || mode1 == DFmode;
39604
39605 /* MODE2 being DFmode implies fp stack, general or sse regs, which means
39606 that we can tie it with SFmode. */
39607 if (mode2 == DFmode)
39608 return mode1 == SFmode;
39609
39610 /* If MODE2 is only appropriate for an SSE register, then tie with
39611 any other mode acceptable to SSE registers. */
39612 if (GET_MODE_SIZE (mode2) == 32
39613 && ix86_hard_regno_mode_ok (FIRST_SSE_REG, mode2))
39614 return (GET_MODE_SIZE (mode1) == 32
39615 && ix86_hard_regno_mode_ok (FIRST_SSE_REG, mode1));
39616 if (GET_MODE_SIZE (mode2) == 16
39617 && ix86_hard_regno_mode_ok (FIRST_SSE_REG, mode2))
39618 return (GET_MODE_SIZE (mode1) == 16
39619 && ix86_hard_regno_mode_ok (FIRST_SSE_REG, mode1));
39620
39621 /* If MODE2 is appropriate for an MMX register, then tie
39622 with any other mode acceptable to MMX registers. */
39623 if (GET_MODE_SIZE (mode2) == 8
39624 && ix86_hard_regno_mode_ok (FIRST_MMX_REG, mode2))
39625 return (GET_MODE_SIZE (mode1) == 8
39626 && ix86_hard_regno_mode_ok (FIRST_MMX_REG, mode1));
39627
39628 return false;
39629 }
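/* For example, assuming SSE2 is enabled, V4SFmode and V2DImode tie because
   both are 16 bytes and both are valid in xmm0, whereas SFmode does not tie
   with V4SFmode since the 16-byte check above requires equal sizes.  */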
39630
39631 /* Return the cost of moving between two registers of mode MODE. */
39632
39633 static int
39634 ix86_set_reg_reg_cost (machine_mode mode)
39635 {
39636 unsigned int units = UNITS_PER_WORD;
39637
39638 switch (GET_MODE_CLASS (mode))
39639 {
39640 default:
39641 break;
39642
39643 case MODE_CC:
39644 units = GET_MODE_SIZE (CCmode);
39645 break;
39646
39647 case MODE_FLOAT:
39648 if ((TARGET_SSE && mode == TFmode)
39649 || (TARGET_80387 && mode == XFmode)
39650 || ((TARGET_80387 || TARGET_SSE2) && mode == DFmode)
39651 || ((TARGET_80387 || TARGET_SSE) && mode == SFmode))
39652 units = GET_MODE_SIZE (mode);
39653 break;
39654
39655 case MODE_COMPLEX_FLOAT:
39656 if ((TARGET_SSE && mode == TCmode)
39657 || (TARGET_80387 && mode == XCmode)
39658 || ((TARGET_80387 || TARGET_SSE2) && mode == DCmode)
39659 || ((TARGET_80387 || TARGET_SSE) && mode == SCmode))
39660 units = GET_MODE_SIZE (mode);
39661 break;
39662
39663 case MODE_VECTOR_INT:
39664 case MODE_VECTOR_FLOAT:
39665 if ((TARGET_AVX512F && VALID_AVX512F_REG_MODE (mode))
39666 || (TARGET_AVX && VALID_AVX256_REG_MODE (mode))
39667 || (TARGET_SSE2 && VALID_SSE2_REG_MODE (mode))
39668 || (TARGET_SSE && VALID_SSE_REG_MODE (mode))
39669 || (TARGET_MMX && VALID_MMX_REG_MODE (mode)))
39670 units = GET_MODE_SIZE (mode);
39671 }
39672
39673 /* Return the cost of moving between two registers of mode MODE,
39674 assuming that the move will be in pieces of at most UNITS bytes. */
39675 return COSTS_N_INSNS (CEIL (GET_MODE_SIZE (mode), units));
39676 }
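/* For example, with AVX enabled a V8SFmode register copy is costed as
   COSTS_N_INSNS (CEIL (32, 32)) == COSTS_N_INSNS (1); without AVX the units
   stay at UNITS_PER_WORD, so on x86-64 the same mode is costed as four
   word-sized moves.  */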
39677
39678 /* Compute a (partial) cost for rtx X. Return true if the complete
39679 cost has been computed, and false if subexpressions should be
39680 scanned. In either case, *TOTAL contains the cost result. */
39681
39682 static bool
39683 ix86_rtx_costs (rtx x, machine_mode mode, int outer_code_i, int opno,
39684 int *total, bool speed)
39685 {
39686 rtx mask;
39687 enum rtx_code code = GET_CODE (x);
39688 enum rtx_code outer_code = (enum rtx_code) outer_code_i;
39689 const struct processor_costs *cost = speed ? ix86_cost : &ix86_size_cost;
39690
39691 switch (code)
39692 {
39693 case SET:
39694 if (register_operand (SET_DEST (x), VOIDmode)
39695 && reg_or_0_operand (SET_SRC (x), VOIDmode))
39696 {
39697 *total = ix86_set_reg_reg_cost (GET_MODE (SET_DEST (x)));
39698 return true;
39699 }
39700 return false;
39701
39702 case CONST_INT:
39703 case CONST:
39704 case LABEL_REF:
39705 case SYMBOL_REF:
39706 if (TARGET_64BIT && !x86_64_immediate_operand (x, VOIDmode))
39707 *total = 3;
39708 else if (TARGET_64BIT && !x86_64_zext_immediate_operand (x, VOIDmode))
39709 *total = 2;
39710 else if (flag_pic && SYMBOLIC_CONST (x)
39711 && !(TARGET_64BIT
39712 && (GET_CODE (x) == LABEL_REF
39713 || (GET_CODE (x) == SYMBOL_REF
39714 && SYMBOL_REF_LOCAL_P (x))))
39715 /* Use 0 cost for CONST to improve its propagation. */
39716 && (TARGET_64BIT || GET_CODE (x) != CONST))
39717 *total = 1;
39718 else
39719 *total = 0;
39720 return true;
39721
39722 case CONST_DOUBLE:
39723 if (IS_STACK_MODE (mode))
39724 switch (standard_80387_constant_p (x))
39725 {
39726 case -1:
39727 case 0:
39728 break;
39729 case 1: /* 0.0 */
39730 *total = 1;
39731 return true;
39732 default: /* Other constants */
39733 *total = 2;
39734 return true;
39735 }
39736 /* FALLTHRU */
39737
39738 case CONST_VECTOR:
39739 switch (standard_sse_constant_p (x, mode))
39740 {
39741 case 0:
39742 break;
39743 case 1: /* 0: xor eliminates false dependency */
39744 *total = 0;
39745 return true;
39746 default: /* -1: cmp contains false dependency */
39747 *total = 1;
39748 return true;
39749 }
39750 /* FALLTHRU */
39751
39752 case CONST_WIDE_INT:
39753 /* Fall back to (MEM (SYMBOL_REF)), since that's where
39754 it'll probably end up. Add a penalty for size. */
39755 *total = (COSTS_N_INSNS (1)
39756 + (!TARGET_64BIT && flag_pic)
39757 + (GET_MODE_SIZE (mode) <= 4
39758 ? 0 : GET_MODE_SIZE (mode) <= 8 ? 1 : 2));
39759 return true;
39760
39761 case ZERO_EXTEND:
39762 /* The zero extension is often completely free on x86_64, so make
39763 it as cheap as possible. */
39764 if (TARGET_64BIT && mode == DImode
39765 && GET_MODE (XEXP (x, 0)) == SImode)
39766 *total = 1;
39767 else if (TARGET_ZERO_EXTEND_WITH_AND)
39768 *total = cost->add;
39769 else
39770 *total = cost->movzx;
39771 return false;
39772
39773 case SIGN_EXTEND:
39774 *total = cost->movsx;
39775 return false;
39776
39777 case ASHIFT:
39778 if (SCALAR_INT_MODE_P (mode)
39779 && GET_MODE_SIZE (mode) < UNITS_PER_WORD
39780 && CONST_INT_P (XEXP (x, 1)))
39781 {
39782 HOST_WIDE_INT value = INTVAL (XEXP (x, 1));
39783 if (value == 1)
39784 {
39785 *total = cost->add;
39786 return false;
39787 }
39788 if ((value == 2 || value == 3)
39789 && cost->lea <= cost->shift_const)
39790 {
39791 *total = cost->lea;
39792 return false;
39793 }
39794 }
39795 /* FALLTHRU */
39796
39797 case ROTATE:
39798 case ASHIFTRT:
39799 case LSHIFTRT:
39800 case ROTATERT:
39801 if (GET_MODE_CLASS (mode) == MODE_VECTOR_INT)
39802 {
39803 /* ??? Should be SSE vector operation cost. */
39804 /* At least for published AMD latencies, this really is the same
39805 as the latency for a simple fpu operation like fabs. */
39806 /* V*QImode is emulated with 1-11 insns. */
39807 if (mode == V16QImode || mode == V32QImode)
39808 {
39809 int count = 11;
39810 if (TARGET_XOP && mode == V16QImode)
39811 {
39812 /* For XOP we use vpshab, which requires a broadcast of the
39813 value to the variable shift insn. For constants this
39814 means a V16Q const in mem; even when we could perform the
39815 shift with one insn, set the cost so as to prefer paddb. */
39816 if (CONSTANT_P (XEXP (x, 1)))
39817 {
39818 *total = (cost->fabs
39819 + rtx_cost (XEXP (x, 0), mode, code, 0, speed)
39820 + (speed ? 2 : COSTS_N_BYTES (16)));
39821 return true;
39822 }
39823 count = 3;
39824 }
39825 else if (TARGET_SSSE3)
39826 count = 7;
39827 *total = cost->fabs * count;
39828 }
39829 else
39830 *total = cost->fabs;
39831 }
39832 else if (GET_MODE_SIZE (mode) > UNITS_PER_WORD)
39833 {
39834 if (CONST_INT_P (XEXP (x, 1)))
39835 {
39836 if (INTVAL (XEXP (x, 1)) > 32)
39837 *total = cost->shift_const + COSTS_N_INSNS (2);
39838 else
39839 *total = cost->shift_const * 2;
39840 }
39841 else
39842 {
39843 if (GET_CODE (XEXP (x, 1)) == AND)
39844 *total = cost->shift_var * 2;
39845 else
39846 *total = cost->shift_var * 6 + COSTS_N_INSNS (2);
39847 }
39848 }
39849 else
39850 {
39851 if (CONST_INT_P (XEXP (x, 1)))
39852 *total = cost->shift_const;
39853 else if (SUBREG_P (XEXP (x, 1))
39854 && GET_CODE (XEXP (XEXP (x, 1), 0)) == AND)
39855 {
39856 /* Return the cost after shift-and truncation. */
39857 *total = cost->shift_var;
39858 return true;
39859 }
39860 else
39861 *total = cost->shift_var;
39862 }
39863 return false;
39864
39865 case FMA:
39866 {
39867 rtx sub;
39868
39869 gcc_assert (FLOAT_MODE_P (mode));
39870 gcc_assert (TARGET_FMA || TARGET_FMA4 || TARGET_AVX512F);
39871
39872 /* ??? SSE scalar/vector cost should be used here. */
39873 /* ??? Bald assumption that fma has the same cost as fmul. */
39874 *total = cost->fmul;
39875 *total += rtx_cost (XEXP (x, 1), mode, FMA, 1, speed);
39876
39877 /* Negate in op0 or op2 is free: FMS, FNMA, FNMS. */
39878 sub = XEXP (x, 0);
39879 if (GET_CODE (sub) == NEG)
39880 sub = XEXP (sub, 0);
39881 *total += rtx_cost (sub, mode, FMA, 0, speed);
39882
39883 sub = XEXP (x, 2);
39884 if (GET_CODE (sub) == NEG)
39885 sub = XEXP (sub, 0);
39886 *total += rtx_cost (sub, mode, FMA, 2, speed);
39887 return true;
39888 }
39889
39890 case MULT:
39891 if (SSE_FLOAT_MODE_P (mode) && TARGET_SSE_MATH)
39892 {
39893 /* ??? SSE scalar cost should be used here. */
39894 *total = cost->fmul;
39895 return false;
39896 }
39897 else if (X87_FLOAT_MODE_P (mode))
39898 {
39899 *total = cost->fmul;
39900 return false;
39901 }
39902 else if (FLOAT_MODE_P (mode))
39903 {
39904 /* ??? SSE vector cost should be used here. */
39905 *total = cost->fmul;
39906 return false;
39907 }
39908 else if (GET_MODE_CLASS (mode) == MODE_VECTOR_INT)
39909 {
39910 /* V*QImode is emulated with 7-13 insns. */
39911 if (mode == V16QImode || mode == V32QImode)
39912 {
39913 int extra = 11;
39914 if (TARGET_XOP && mode == V16QImode)
39915 extra = 5;
39916 else if (TARGET_SSSE3)
39917 extra = 6;
39918 *total = cost->fmul * 2 + cost->fabs * extra;
39919 }
39920 /* V*DImode is emulated with 5-8 insns. */
39921 else if (mode == V2DImode || mode == V4DImode)
39922 {
39923 if (TARGET_XOP && mode == V2DImode)
39924 *total = cost->fmul * 2 + cost->fabs * 3;
39925 else
39926 *total = cost->fmul * 3 + cost->fabs * 5;
39927 }
39928 /* Without sse4.1, we don't have PMULLD; it's emulated with 7
39929 insns, including two PMULUDQ. */
39930 else if (mode == V4SImode && !(TARGET_SSE4_1 || TARGET_AVX))
39931 *total = cost->fmul * 2 + cost->fabs * 5;
39932 else
39933 *total = cost->fmul;
39934 return false;
39935 }
39936 else
39937 {
39938 rtx op0 = XEXP (x, 0);
39939 rtx op1 = XEXP (x, 1);
39940 int nbits;
39941 if (CONST_INT_P (XEXP (x, 1)))
39942 {
39943 unsigned HOST_WIDE_INT value = INTVAL (XEXP (x, 1));
39944 for (nbits = 0; value != 0; value &= value - 1)
39945 nbits++;
39946 }
39947 else
39948 /* This is arbitrary. */
39949 nbits = 7;
39950
39951 /* Compute costs correctly for widening multiplication. */
39952 if ((GET_CODE (op0) == SIGN_EXTEND || GET_CODE (op0) == ZERO_EXTEND)
39953 && GET_MODE_SIZE (GET_MODE (XEXP (op0, 0))) * 2
39954 == GET_MODE_SIZE (mode))
39955 {
39956 int is_mulwiden = 0;
39957 machine_mode inner_mode = GET_MODE (op0);
39958
39959 if (GET_CODE (op0) == GET_CODE (op1))
39960 is_mulwiden = 1, op1 = XEXP (op1, 0);
39961 else if (CONST_INT_P (op1))
39962 {
39963 if (GET_CODE (op0) == SIGN_EXTEND)
39964 is_mulwiden = trunc_int_for_mode (INTVAL (op1), inner_mode)
39965 == INTVAL (op1);
39966 else
39967 is_mulwiden = !(INTVAL (op1) & ~GET_MODE_MASK (inner_mode));
39968 }
39969
39970 if (is_mulwiden)
39971 op0 = XEXP (op0, 0), mode = GET_MODE (op0);
39972 }
39973
39974 *total = (cost->mult_init[MODE_INDEX (mode)]
39975 + nbits * cost->mult_bit
39976 + rtx_cost (op0, mode, outer_code, opno, speed)
39977 + rtx_cost (op1, mode, outer_code, opno, speed));
39978
39979 return true;
39980 }
39981
39982 case DIV:
39983 case UDIV:
39984 case MOD:
39985 case UMOD:
39986 if (SSE_FLOAT_MODE_P (mode) && TARGET_SSE_MATH)
39987 /* ??? SSE cost should be used here. */
39988 *total = cost->fdiv;
39989 else if (X87_FLOAT_MODE_P (mode))
39990 *total = cost->fdiv;
39991 else if (FLOAT_MODE_P (mode))
39992 /* ??? SSE vector cost should be used here. */
39993 *total = cost->fdiv;
39994 else
39995 *total = cost->divide[MODE_INDEX (mode)];
39996 return false;
39997
39998 case PLUS:
39999 if (GET_MODE_CLASS (mode) == MODE_INT
40000 && GET_MODE_SIZE (mode) <= UNITS_PER_WORD)
40001 {
40002 if (GET_CODE (XEXP (x, 0)) == PLUS
40003 && GET_CODE (XEXP (XEXP (x, 0), 0)) == MULT
40004 && CONST_INT_P (XEXP (XEXP (XEXP (x, 0), 0), 1))
40005 && CONSTANT_P (XEXP (x, 1)))
40006 {
40007 HOST_WIDE_INT val = INTVAL (XEXP (XEXP (XEXP (x, 0), 0), 1));
40008 if (val == 2 || val == 4 || val == 8)
40009 {
40010 *total = cost->lea;
40011 *total += rtx_cost (XEXP (XEXP (x, 0), 1), mode,
40012 outer_code, opno, speed);
40013 *total += rtx_cost (XEXP (XEXP (XEXP (x, 0), 0), 0), mode,
40014 outer_code, opno, speed);
40015 *total += rtx_cost (XEXP (x, 1), mode,
40016 outer_code, opno, speed);
40017 return true;
40018 }
40019 }
40020 else if (GET_CODE (XEXP (x, 0)) == MULT
40021 && CONST_INT_P (XEXP (XEXP (x, 0), 1)))
40022 {
40023 HOST_WIDE_INT val = INTVAL (XEXP (XEXP (x, 0), 1));
40024 if (val == 2 || val == 4 || val == 8)
40025 {
40026 *total = cost->lea;
40027 *total += rtx_cost (XEXP (XEXP (x, 0), 0), mode,
40028 outer_code, opno, speed);
40029 *total += rtx_cost (XEXP (x, 1), mode,
40030 outer_code, opno, speed);
40031 return true;
40032 }
40033 }
40034 else if (GET_CODE (XEXP (x, 0)) == PLUS)
40035 {
40036 *total = cost->lea;
40037 *total += rtx_cost (XEXP (XEXP (x, 0), 0), mode,
40038 outer_code, opno, speed);
40039 *total += rtx_cost (XEXP (XEXP (x, 0), 1), mode,
40040 outer_code, opno, speed);
40041 *total += rtx_cost (XEXP (x, 1), mode,
40042 outer_code, opno, speed);
40043 return true;
40044 }
40045 }
40046 /* FALLTHRU */
40047
40048 case MINUS:
40049 if (SSE_FLOAT_MODE_P (mode) && TARGET_SSE_MATH)
40050 {
40051 /* ??? SSE cost should be used here. */
40052 *total = cost->fadd;
40053 return false;
40054 }
40055 else if (X87_FLOAT_MODE_P (mode))
40056 {
40057 *total = cost->fadd;
40058 return false;
40059 }
40060 else if (FLOAT_MODE_P (mode))
40061 {
40062 /* ??? SSE vector cost should be used here. */
40063 *total = cost->fadd;
40064 return false;
40065 }
40066 /* FALLTHRU */
40067
40068 case AND:
40069 case IOR:
40070 case XOR:
40071 if (GET_MODE_CLASS (mode) == MODE_INT
40072 && GET_MODE_SIZE (mode) > UNITS_PER_WORD)
40073 {
40074 *total = (cost->add * 2
40075 + (rtx_cost (XEXP (x, 0), mode, outer_code, opno, speed)
40076 << (GET_MODE (XEXP (x, 0)) != DImode))
40077 + (rtx_cost (XEXP (x, 1), mode, outer_code, opno, speed)
40078 << (GET_MODE (XEXP (x, 1)) != DImode)));
40079 return true;
40080 }
40081 /* FALLTHRU */
40082
40083 case NEG:
40084 if (SSE_FLOAT_MODE_P (mode) && TARGET_SSE_MATH)
40085 {
40086 /* ??? SSE cost should be used here. */
40087 *total = cost->fchs;
40088 return false;
40089 }
40090 else if (X87_FLOAT_MODE_P (mode))
40091 {
40092 *total = cost->fchs;
40093 return false;
40094 }
40095 else if (FLOAT_MODE_P (mode))
40096 {
40097 /* ??? SSE vector cost should be used here. */
40098 *total = cost->fchs;
40099 return false;
40100 }
40101 /* FALLTHRU */
40102
40103 case NOT:
40104 if (GET_MODE_CLASS (mode) == MODE_VECTOR_INT)
40105 {
40106 /* ??? Should be SSE vector operation cost. */
40107 /* At least for published AMD latencies, this really is the same
40108 as the latency for a simple fpu operation like fabs. */
40109 *total = cost->fabs;
40110 }
40111 else if (GET_MODE_SIZE (mode) > UNITS_PER_WORD)
40112 *total = cost->add * 2;
40113 else
40114 *total = cost->add;
40115 return false;
40116
40117 case COMPARE:
40118 if (GET_CODE (XEXP (x, 0)) == ZERO_EXTRACT
40119 && XEXP (XEXP (x, 0), 1) == const1_rtx
40120 && CONST_INT_P (XEXP (XEXP (x, 0), 2))
40121 && XEXP (x, 1) == const0_rtx)
40122 {
40123 /* This kind of construct is implemented using test[bwl].
40124 Treat it as if we had an AND. */
40125 mode = GET_MODE (XEXP (XEXP (x, 0), 0));
40126 *total = (cost->add
40127 + rtx_cost (XEXP (XEXP (x, 0), 0), mode, outer_code,
40128 opno, speed)
40129 + rtx_cost (const1_rtx, mode, outer_code, opno, speed));
40130 return true;
40131 }
40132
40133 /* The embedded comparison operand is completely free. */
40134 if (!general_operand (XEXP (x, 0), GET_MODE (XEXP (x, 0)))
40135 && XEXP (x, 1) == const0_rtx)
40136 *total = 0;
40137
40138 return false;
40139
40140 case FLOAT_EXTEND:
40141 if (!(SSE_FLOAT_MODE_P (mode) && TARGET_SSE_MATH))
40142 *total = 0;
40143 return false;
40144
40145 case ABS:
40146 if (SSE_FLOAT_MODE_P (mode) && TARGET_SSE_MATH)
40147 /* ??? SSE cost should be used here. */
40148 *total = cost->fabs;
40149 else if (X87_FLOAT_MODE_P (mode))
40150 *total = cost->fabs;
40151 else if (FLOAT_MODE_P (mode))
40152 /* ??? SSE vector cost should be used here. */
40153 *total = cost->fabs;
40154 return false;
40155
40156 case SQRT:
40157 if (SSE_FLOAT_MODE_P (mode) && TARGET_SSE_MATH)
40158 /* ??? SSE cost should be used here. */
40159 *total = cost->fsqrt;
40160 else if (X87_FLOAT_MODE_P (mode))
40161 *total = cost->fsqrt;
40162 else if (FLOAT_MODE_P (mode))
40163 /* ??? SSE vector cost should be used here. */
40164 *total = cost->fsqrt;
40165 return false;
40166
40167 case UNSPEC:
40168 if (XINT (x, 1) == UNSPEC_TP)
40169 *total = 0;
40170 return false;
40171
40172 case VEC_SELECT:
40173 case VEC_CONCAT:
40174 case VEC_DUPLICATE:
40175 /* ??? Assume all of these vector manipulation patterns are
40176 recognizable. In which case they all pretty much have the
40177 same cost. */
40178 *total = cost->fabs;
40179 return true;
40180 case VEC_MERGE:
40181 mask = XEXP (x, 2);
40182 /* This is a masked instruction; assume the same cost
40183 as the nonmasked variant. */
40184 if (TARGET_AVX512F && register_operand (mask, GET_MODE (mask)))
40185 *total = rtx_cost (XEXP (x, 0), mode, outer_code, opno, speed);
40186 else
40187 *total = cost->fabs;
40188 return true;
40189
40190 default:
40191 return false;
40192 }
40193 }
40194
40195 #if TARGET_MACHO
40196
40197 static int current_machopic_label_num;
40198
40199 /* Given a symbol name and its associated stub, write out the
40200 definition of the stub. */
40201
40202 void
40203 machopic_output_stub (FILE *file, const char *symb, const char *stub)
40204 {
40205 unsigned int length;
40206 char *binder_name, *symbol_name, lazy_ptr_name[32];
40207 int label = ++current_machopic_label_num;
40208
40209 /* For 64-bit we shouldn't get here. */
40210 gcc_assert (!TARGET_64BIT);
40211
40212 /* Lose our funky encoding stuff so it doesn't contaminate the stub. */
40213 symb = targetm.strip_name_encoding (symb);
40214
40215 length = strlen (stub);
40216 binder_name = XALLOCAVEC (char, length + 32);
40217 GEN_BINDER_NAME_FOR_STUB (binder_name, stub, length);
40218
40219 length = strlen (symb);
40220 symbol_name = XALLOCAVEC (char, length + 32);
40221 GEN_SYMBOL_NAME_FOR_SYMBOL (symbol_name, symb, length);
40222
40223 sprintf (lazy_ptr_name, "L%d$lz", label);
40224
40225 if (MACHOPIC_ATT_STUB)
40226 switch_to_section (darwin_sections[machopic_picsymbol_stub3_section]);
40227 else if (MACHOPIC_PURE)
40228 switch_to_section (darwin_sections[machopic_picsymbol_stub2_section]);
40229 else
40230 switch_to_section (darwin_sections[machopic_symbol_stub_section]);
40231
40232 fprintf (file, "%s:\n", stub);
40233 fprintf (file, "\t.indirect_symbol %s\n", symbol_name);
40234
40235 if (MACHOPIC_ATT_STUB)
40236 {
40237 fprintf (file, "\thlt ; hlt ; hlt ; hlt ; hlt\n");
40238 }
40239 else if (MACHOPIC_PURE)
40240 {
40241 /* PIC stub. */
40242 /* 25-byte PIC stub using "CALL get_pc_thunk". */
40243 rtx tmp = gen_rtx_REG (SImode, 2 /* ECX */);
40244 output_set_got (tmp, NULL_RTX); /* "CALL ___<cpu>.get_pc_thunk.cx". */
40245 fprintf (file, "LPC$%d:\tmovl\t%s-LPC$%d(%%ecx),%%ecx\n",
40246 label, lazy_ptr_name, label);
40247 fprintf (file, "\tjmp\t*%%ecx\n");
40248 }
40249 else
40250 fprintf (file, "\tjmp\t*%s\n", lazy_ptr_name);
40251
40252 /* The AT&T-style ("self-modifying") stub is not lazily bound, thus
40253 it needs no stub-binding-helper. */
40254 if (MACHOPIC_ATT_STUB)
40255 return;
40256
40257 fprintf (file, "%s:\n", binder_name);
40258
40259 if (MACHOPIC_PURE)
40260 {
40261 fprintf (file, "\tlea\t%s-%s(%%ecx),%%ecx\n", lazy_ptr_name, binder_name);
40262 fprintf (file, "\tpushl\t%%ecx\n");
40263 }
40264 else
40265 fprintf (file, "\tpushl\t$%s\n", lazy_ptr_name);
40266
40267 fputs ("\tjmp\tdyld_stub_binding_helper\n", file);
40268
40269 /* N.B. Keep the correspondence of these
40270 'symbol_ptr/symbol_ptr2/symbol_ptr3' sections consistent with the
40271 old-pic/new-pic/non-pic stubs; altering this will break
40272 compatibility with existing dylibs. */
40273 if (MACHOPIC_PURE)
40274 {
40275 /* 25-byte PIC stub using "CALL get_pc_thunk". */
40276 switch_to_section (darwin_sections[machopic_lazy_symbol_ptr2_section]);
40277 }
40278 else
40279 /* 16-byte -mdynamic-no-pic stub. */
40280 switch_to_section(darwin_sections[machopic_lazy_symbol_ptr3_section]);
40281
40282 fprintf (file, "%s:\n", lazy_ptr_name);
40283 fprintf (file, "\t.indirect_symbol %s\n", symbol_name);
40284 fprintf (file, ASM_LONG "%s\n", binder_name);
40285 }
40286 #endif /* TARGET_MACHO */
40287
40288 /* Order the registers for register allocator. */
40289
40290 void
40291 x86_order_regs_for_local_alloc (void)
40292 {
40293 int pos = 0;
40294 int i;
40295
40296 /* First allocate the local general purpose registers. */
40297 for (i = 0; i < FIRST_PSEUDO_REGISTER; i++)
40298 if (GENERAL_REGNO_P (i) && call_used_regs[i])
40299 reg_alloc_order [pos++] = i;
40300
40301 /* Global general purpose registers. */
40302 for (i = 0; i < FIRST_PSEUDO_REGISTER; i++)
40303 if (GENERAL_REGNO_P (i) && !call_used_regs[i])
40304 reg_alloc_order [pos++] = i;
40305
40306 /* x87 registers come first in case we are doing FP math
40307 using them. */
40308 if (!TARGET_SSE_MATH)
40309 for (i = FIRST_STACK_REG; i <= LAST_STACK_REG; i++)
40310 reg_alloc_order [pos++] = i;
40311
40312 /* SSE registers. */
40313 for (i = FIRST_SSE_REG; i <= LAST_SSE_REG; i++)
40314 reg_alloc_order [pos++] = i;
40315 for (i = FIRST_REX_SSE_REG; i <= LAST_REX_SSE_REG; i++)
40316 reg_alloc_order [pos++] = i;
40317
40318 /* Extended REX SSE registers. */
40319 for (i = FIRST_EXT_REX_SSE_REG; i <= LAST_EXT_REX_SSE_REG; i++)
40320 reg_alloc_order [pos++] = i;
40321
40322 /* Mask register. */
40323 for (i = FIRST_MASK_REG; i <= LAST_MASK_REG; i++)
40324 reg_alloc_order [pos++] = i;
40325
40326 /* MPX bound registers. */
40327 for (i = FIRST_BND_REG; i <= LAST_BND_REG; i++)
40328 reg_alloc_order [pos++] = i;
40329
40330 /* x87 registers. */
40331 if (TARGET_SSE_MATH)
40332 for (i = FIRST_STACK_REG; i <= LAST_STACK_REG; i++)
40333 reg_alloc_order [pos++] = i;
40334
40335 for (i = FIRST_MMX_REG; i <= LAST_MMX_REG; i++)
40336 reg_alloc_order [pos++] = i;
40337
40338 /* Initialize the rest of the array, as we do not allocate some registers
40339 at all. */
40340 while (pos < FIRST_PSEUDO_REGISTER)
40341 reg_alloc_order [pos++] = 0;
40342 }
40343
40344 /* Handle a "callee_pop_aggregate_return" attribute; arguments as
40345 in struct attribute_spec handler. */
40346 static tree
40347 ix86_handle_callee_pop_aggregate_return (tree *node, tree name,
40348 tree args,
40349 int,
40350 bool *no_add_attrs)
40351 {
40352 if (TREE_CODE (*node) != FUNCTION_TYPE
40353 && TREE_CODE (*node) != METHOD_TYPE
40354 && TREE_CODE (*node) != FIELD_DECL
40355 && TREE_CODE (*node) != TYPE_DECL)
40356 {
40357 warning (OPT_Wattributes, "%qE attribute only applies to functions",
40358 name);
40359 *no_add_attrs = true;
40360 return NULL_TREE;
40361 }
40362 if (TARGET_64BIT)
40363 {
40364 warning (OPT_Wattributes, "%qE attribute only available for 32-bit",
40365 name);
40366 *no_add_attrs = true;
40367 return NULL_TREE;
40368 }
40369 if (is_attribute_p ("callee_pop_aggregate_return", name))
40370 {
40371 tree cst;
40372
40373 cst = TREE_VALUE (args);
40374 if (TREE_CODE (cst) != INTEGER_CST)
40375 {
40376 warning (OPT_Wattributes,
40377 "%qE attribute requires an integer constant argument",
40378 name);
40379 *no_add_attrs = true;
40380 }
40381 else if (compare_tree_int (cst, 0) != 0
40382 && compare_tree_int (cst, 1) != 0)
40383 {
40384 warning (OPT_Wattributes,
40385 "argument to %qE attribute is neither zero, nor one",
40386 name);
40387 *no_add_attrs = true;
40388 }
40389
40390 return NULL_TREE;
40391 }
40392
40393 return NULL_TREE;
40394 }
40395
40396 /* Handle a "ms_abi" or "sysv" attribute; arguments as in
40397 struct attribute_spec.handler. */
40398 static tree
40399 ix86_handle_abi_attribute (tree *node, tree name, tree, int,
40400 bool *no_add_attrs)
40401 {
40402 if (TREE_CODE (*node) != FUNCTION_TYPE
40403 && TREE_CODE (*node) != METHOD_TYPE
40404 && TREE_CODE (*node) != FIELD_DECL
40405 && TREE_CODE (*node) != TYPE_DECL)
40406 {
40407 warning (OPT_Wattributes, "%qE attribute only applies to functions",
40408 name);
40409 *no_add_attrs = true;
40410 return NULL_TREE;
40411 }
40412
40413 /* Can combine regparm with all attributes but fastcall. */
40414 if (is_attribute_p ("ms_abi", name))
40415 {
40416 if (lookup_attribute ("sysv_abi", TYPE_ATTRIBUTES (*node)))
40417 {
40418 error ("ms_abi and sysv_abi attributes are not compatible");
40419 }
40420
40421 return NULL_TREE;
40422 }
40423 else if (is_attribute_p ("sysv_abi", name))
40424 {
40425 if (lookup_attribute ("ms_abi", TYPE_ATTRIBUTES (*node)))
40426 {
40427 error ("ms_abi and sysv_abi attributes are not compatible");
40428 }
40429
40430 return NULL_TREE;
40431 }
40432
40433 return NULL_TREE;
40434 }
40435
40436 /* Handle a "ms_struct" or "gcc_struct" attribute; arguments as in
40437 struct attribute_spec.handler. */
40438 static tree
40439 ix86_handle_struct_attribute (tree *node, tree name, tree, int,
40440 bool *no_add_attrs)
40441 {
40442 tree *type = NULL;
40443 if (DECL_P (*node))
40444 {
40445 if (TREE_CODE (*node) == TYPE_DECL)
40446 type = &TREE_TYPE (*node);
40447 }
40448 else
40449 type = node;
40450
40451 if (!(type && RECORD_OR_UNION_TYPE_P (*type)))
40452 {
40453 warning (OPT_Wattributes, "%qE attribute ignored",
40454 name);
40455 *no_add_attrs = true;
40456 }
40457
40458 else if ((is_attribute_p ("ms_struct", name)
40459 && lookup_attribute ("gcc_struct", TYPE_ATTRIBUTES (*type)))
40460 || ((is_attribute_p ("gcc_struct", name)
40461 && lookup_attribute ("ms_struct", TYPE_ATTRIBUTES (*type)))))
40462 {
40463 warning (OPT_Wattributes, "%qE incompatible attribute ignored",
40464 name);
40465 *no_add_attrs = true;
40466 }
40467
40468 return NULL_TREE;
40469 }
40470
40471 static tree
40472 ix86_handle_fndecl_attribute (tree *node, tree name, tree, int,
40473 bool *no_add_attrs)
40474 {
40475 if (TREE_CODE (*node) != FUNCTION_DECL)
40476 {
40477 warning (OPT_Wattributes, "%qE attribute only applies to functions",
40478 name);
40479 *no_add_attrs = true;
40480 }
40481 return NULL_TREE;
40482 }
40483
40484 static tree
40485 ix86_handle_no_caller_saved_registers_attribute (tree *, tree, tree,
40486 int, bool *)
40487 {
40488 return NULL_TREE;
40489 }
40490
40491 static tree
40492 ix86_handle_interrupt_attribute (tree *node, tree, tree, int, bool *)
40493 {
40494 /* DECL_RESULT and DECL_ARGUMENTS do not exist there yet,
40495 but the function type contains args and return type data. */
40496 tree func_type = *node;
40497 tree return_type = TREE_TYPE (func_type);
40498
40499 int nargs = 0;
40500 tree current_arg_type = TYPE_ARG_TYPES (func_type);
40501 while (current_arg_type
40502 && ! VOID_TYPE_P (TREE_VALUE (current_arg_type)))
40503 {
40504 if (nargs == 0)
40505 {
40506 if (! POINTER_TYPE_P (TREE_VALUE (current_arg_type)))
40507 error ("interrupt service routine should have a pointer "
40508 "as the first argument");
40509 }
40510 else if (nargs == 1)
40511 {
40512 if (TREE_CODE (TREE_VALUE (current_arg_type)) != INTEGER_TYPE
40513 || TYPE_MODE (TREE_VALUE (current_arg_type)) != word_mode)
40514 error ("interrupt service routine should have unsigned %s"
40515 "int as the second argument",
40516 TARGET_64BIT
40517 ? (TARGET_X32 ? "long long " : "long ")
40518 : "");
40519 }
40520 nargs++;
40521 current_arg_type = TREE_CHAIN (current_arg_type);
40522 }
40523 if (!nargs || nargs > 2)
40524 error ("interrupt service routine can only have a pointer argument "
40525 "and an optional integer argument");
40526 if (! VOID_TYPE_P (return_type))
40527 error ("interrupt service routine can't have non-void return value");
40528
40529 return NULL_TREE;
40530 }
40531
40532 static bool
40533 ix86_ms_bitfield_layout_p (const_tree record_type)
40534 {
40535 return ((TARGET_MS_BITFIELD_LAYOUT
40536 && !lookup_attribute ("gcc_struct", TYPE_ATTRIBUTES (record_type)))
40537 || lookup_attribute ("ms_struct", TYPE_ATTRIBUTES (record_type)));
40538 }
40539
40540 /* Returns an expression indicating where the this parameter is
40541 located on entry to the FUNCTION. */
40542
40543 static rtx
40544 x86_this_parameter (tree function)
40545 {
40546 tree type = TREE_TYPE (function);
40547 bool aggr = aggregate_value_p (TREE_TYPE (type), type) != 0;
40548 int nregs;
40549
40550 if (TARGET_64BIT)
40551 {
40552 const int *parm_regs;
40553
40554 if (ix86_function_type_abi (type) == MS_ABI)
40555 parm_regs = x86_64_ms_abi_int_parameter_registers;
40556 else
40557 parm_regs = x86_64_int_parameter_registers;
40558 return gen_rtx_REG (Pmode, parm_regs[aggr]);
40559 }
40560
40561 nregs = ix86_function_regparm (type, function);
40562
40563 if (nregs > 0 && !stdarg_p (type))
40564 {
40565 int regno;
40566 unsigned int ccvt = ix86_get_callcvt (type);
40567
40568 if ((ccvt & IX86_CALLCVT_FASTCALL) != 0)
40569 regno = aggr ? DX_REG : CX_REG;
40570 else if ((ccvt & IX86_CALLCVT_THISCALL) != 0)
40571 {
40572 regno = CX_REG;
40573 if (aggr)
40574 return gen_rtx_MEM (SImode,
40575 plus_constant (Pmode, stack_pointer_rtx, 4));
40576 }
40577 else
40578 {
40579 regno = AX_REG;
40580 if (aggr)
40581 {
40582 regno = DX_REG;
40583 if (nregs == 1)
40584 return gen_rtx_MEM (SImode,
40585 plus_constant (Pmode,
40586 stack_pointer_rtx, 4));
40587 }
40588 }
40589 return gen_rtx_REG (SImode, regno);
40590 }
40591
40592 return gen_rtx_MEM (SImode, plus_constant (Pmode, stack_pointer_rtx,
40593 aggr ? 8 : 4));
40594 }
40595
40596 /* Determine whether x86_output_mi_thunk can succeed. */
40597
40598 static bool
40599 x86_can_output_mi_thunk (const_tree, HOST_WIDE_INT, HOST_WIDE_INT vcall_offset,
40600 const_tree function)
40601 {
40602 /* 64-bit can handle anything. */
40603 if (TARGET_64BIT)
40604 return true;
40605
40606 /* For 32-bit, everything's fine if we have one free register. */
40607 if (ix86_function_regparm (TREE_TYPE (function), function) < 3)
40608 return true;
40609
40610 /* Need a free register for vcall_offset. */
40611 if (vcall_offset)
40612 return false;
40613
40614 /* Need a free register for GOT references. */
40615 if (flag_pic && !targetm.binds_local_p (function))
40616 return false;
40617
40618 /* Otherwise ok. */
40619 return true;
40620 }
40621
40622 /* Output the assembler code for a thunk function. THUNK_DECL is the
40623 declaration for the thunk function itself, FUNCTION is the decl for
40624 the target function. DELTA is an immediate constant offset to be
40625 added to THIS. If VCALL_OFFSET is nonzero, the word at
40626 *(*this + vcall_offset) should be added to THIS. */
40627
40628 static void
40629 x86_output_mi_thunk (FILE *file, tree, HOST_WIDE_INT delta,
40630 HOST_WIDE_INT vcall_offset, tree function)
40631 {
40632 rtx this_param = x86_this_parameter (function);
40633 rtx this_reg, tmp, fnaddr;
40634 unsigned int tmp_regno;
40635 rtx_insn *insn;
40636
40637 if (TARGET_64BIT)
40638 tmp_regno = R10_REG;
40639 else
40640 {
40641 unsigned int ccvt = ix86_get_callcvt (TREE_TYPE (function));
40642 if ((ccvt & IX86_CALLCVT_FASTCALL) != 0)
40643 tmp_regno = AX_REG;
40644 else if ((ccvt & IX86_CALLCVT_THISCALL) != 0)
40645 tmp_regno = DX_REG;
40646 else
40647 tmp_regno = CX_REG;
40648 }
40649
40650 emit_note (NOTE_INSN_PROLOGUE_END);
40651
40652 /* If VCALL_OFFSET, we'll need THIS in a register. Might as well
40653 pull it in now and let DELTA benefit. */
40654 if (REG_P (this_param))
40655 this_reg = this_param;
40656 else if (vcall_offset)
40657 {
40658 /* Put the this parameter into %eax. */
40659 this_reg = gen_rtx_REG (Pmode, AX_REG);
40660 emit_move_insn (this_reg, this_param);
40661 }
40662 else
40663 this_reg = NULL_RTX;
40664
40665 /* Adjust the this parameter by a fixed constant. */
40666 if (delta)
40667 {
40668 rtx delta_rtx = GEN_INT (delta);
40669 rtx delta_dst = this_reg ? this_reg : this_param;
40670
40671 if (TARGET_64BIT)
40672 {
40673 if (!x86_64_general_operand (delta_rtx, Pmode))
40674 {
40675 tmp = gen_rtx_REG (Pmode, tmp_regno);
40676 emit_move_insn (tmp, delta_rtx);
40677 delta_rtx = tmp;
40678 }
40679 }
40680
40681 ix86_emit_binop (PLUS, Pmode, delta_dst, delta_rtx);
40682 }
40683
40684 /* Adjust the this parameter by a value stored in the vtable. */
40685 if (vcall_offset)
40686 {
40687 rtx vcall_addr, vcall_mem, this_mem;
40688
40689 tmp = gen_rtx_REG (Pmode, tmp_regno);
40690
40691 this_mem = gen_rtx_MEM (ptr_mode, this_reg);
40692 if (Pmode != ptr_mode)
40693 this_mem = gen_rtx_ZERO_EXTEND (Pmode, this_mem);
40694 emit_move_insn (tmp, this_mem);
40695
40696 /* Adjust the this parameter. */
40697 vcall_addr = plus_constant (Pmode, tmp, vcall_offset);
40698 if (TARGET_64BIT
40699 && !ix86_legitimate_address_p (ptr_mode, vcall_addr, true))
40700 {
40701 rtx tmp2 = gen_rtx_REG (Pmode, R11_REG);
40702 emit_move_insn (tmp2, GEN_INT (vcall_offset));
40703 vcall_addr = gen_rtx_PLUS (Pmode, tmp, tmp2);
40704 }
40705
40706 vcall_mem = gen_rtx_MEM (ptr_mode, vcall_addr);
40707 if (Pmode != ptr_mode)
40708 emit_insn (gen_addsi_1_zext (this_reg,
40709 gen_rtx_REG (ptr_mode,
40710 REGNO (this_reg)),
40711 vcall_mem));
40712 else
40713 ix86_emit_binop (PLUS, Pmode, this_reg, vcall_mem);
40714 }
40715
40716 /* If necessary, drop THIS back to its stack slot. */
40717 if (this_reg && this_reg != this_param)
40718 emit_move_insn (this_param, this_reg);
40719
40720 fnaddr = XEXP (DECL_RTL (function), 0);
40721 if (TARGET_64BIT)
40722 {
40723 if (!flag_pic || targetm.binds_local_p (function)
40724 || TARGET_PECOFF)
40725 ;
40726 else
40727 {
40728 tmp = gen_rtx_UNSPEC (Pmode, gen_rtvec (1, fnaddr), UNSPEC_GOTPCREL);
40729 tmp = gen_rtx_CONST (Pmode, tmp);
40730 fnaddr = gen_const_mem (Pmode, tmp);
40731 }
40732 }
40733 else
40734 {
40735 if (!flag_pic || targetm.binds_local_p (function))
40736 ;
40737 #if TARGET_MACHO
40738 else if (TARGET_MACHO)
40739 {
40740 fnaddr = machopic_indirect_call_target (DECL_RTL (function));
40741 fnaddr = XEXP (fnaddr, 0);
40742 }
40743 #endif /* TARGET_MACHO */
40744 else
40745 {
40746 tmp = gen_rtx_REG (Pmode, CX_REG);
40747 output_set_got (tmp, NULL_RTX);
40748
40749 fnaddr = gen_rtx_UNSPEC (Pmode, gen_rtvec (1, fnaddr), UNSPEC_GOT);
40750 fnaddr = gen_rtx_CONST (Pmode, fnaddr);
40751 fnaddr = gen_rtx_PLUS (Pmode, tmp, fnaddr);
40752 fnaddr = gen_const_mem (Pmode, fnaddr);
40753 }
40754 }
40755
40756 /* Our sibling call patterns do not allow memories, because we have no
40757 predicate that can distinguish between frame and non-frame memory.
40758 For our purposes here, we can get away with (ab)using a jump pattern,
40759 because we're going to do no optimization. */
40760 if (MEM_P (fnaddr))
40761 {
40762 if (sibcall_insn_operand (fnaddr, word_mode))
40763 {
40764 fnaddr = XEXP (DECL_RTL (function), 0);
40765 tmp = gen_rtx_MEM (QImode, fnaddr);
40766 tmp = gen_rtx_CALL (VOIDmode, tmp, const0_rtx);
40767 tmp = emit_call_insn (tmp);
40768 SIBLING_CALL_P (tmp) = 1;
40769 }
40770 else
40771 emit_jump_insn (gen_indirect_jump (fnaddr));
40772 }
40773 else
40774 {
40775 if (ix86_cmodel == CM_LARGE_PIC && SYMBOLIC_CONST (fnaddr))
40776 {
40777 // CM_LARGE_PIC always uses pseudo PIC register which is
40778 // uninitialized. Since FUNCTION is local and calling it
40779 // doesn't go through PLT, we use scratch register %r11 as
40780 // PIC register and initialize it here.
40781 pic_offset_table_rtx = gen_rtx_REG (Pmode, R11_REG);
40782 ix86_init_large_pic_reg (tmp_regno);
40783 fnaddr = legitimize_pic_address (fnaddr,
40784 gen_rtx_REG (Pmode, tmp_regno));
40785 }
40786
40787 if (!sibcall_insn_operand (fnaddr, word_mode))
40788 {
40789 tmp = gen_rtx_REG (word_mode, tmp_regno);
40790 if (GET_MODE (fnaddr) != word_mode)
40791 fnaddr = gen_rtx_ZERO_EXTEND (word_mode, fnaddr);
40792 emit_move_insn (tmp, fnaddr);
40793 fnaddr = tmp;
40794 }
40795
40796 tmp = gen_rtx_MEM (QImode, fnaddr);
40797 tmp = gen_rtx_CALL (VOIDmode, tmp, const0_rtx);
40798 tmp = emit_call_insn (tmp);
40799 SIBLING_CALL_P (tmp) = 1;
40800 }
40801 emit_barrier ();
40802
40803 /* Emit just enough of rest_of_compilation to get the insns emitted.
40804 Note that use_thunk calls assemble_start_function et al. */
40805 insn = get_insns ();
40806 shorten_branches (insn);
40807 final_start_function (insn, file, 1);
40808 final (insn, file, 1);
40809 final_end_function ();
40810 }
40811
40812 static void
40813 x86_file_start (void)
40814 {
40815 default_file_start ();
40816 if (TARGET_16BIT)
40817 fputs ("\t.code16gcc\n", asm_out_file);
40818 #if TARGET_MACHO
40819 darwin_file_start ();
40820 #endif
40821 if (X86_FILE_START_VERSION_DIRECTIVE)
40822 fputs ("\t.version\t\"01.01\"\n", asm_out_file);
40823 if (X86_FILE_START_FLTUSED)
40824 fputs ("\t.global\t__fltused\n", asm_out_file);
40825 if (ix86_asm_dialect == ASM_INTEL)
40826 fputs ("\t.intel_syntax noprefix\n", asm_out_file);
40827 }
40828
40829 int
40830 x86_field_alignment (tree field, int computed)
40831 {
40832 machine_mode mode;
40833 tree type = TREE_TYPE (field);
40834
40835 if (TARGET_64BIT || TARGET_ALIGN_DOUBLE)
40836 return computed;
40837 if (TARGET_IAMCU)
40838 return iamcu_alignment (type, computed);
40839 mode = TYPE_MODE (strip_array_types (type));
40840 if (mode == DFmode || mode == DCmode
40841 || GET_MODE_CLASS (mode) == MODE_INT
40842 || GET_MODE_CLASS (mode) == MODE_COMPLEX_INT)
40843 return MIN (32, computed);
40844 return computed;
40845 }
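/* For example, on a 32-bit non-IAMCU target without -malign-double a double
   or long long structure field is capped at 32-bit alignment by the MIN
   above, while on 64-bit targets the computed alignment is kept unchanged.  */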
40846
40847 /* Print call to TARGET to FILE. */
40848
40849 static void
40850 x86_print_call_or_nop (FILE *file, const char *target)
40851 {
40852 if (flag_nop_mcount)
40853 fprintf (file, "1:\tnopl 0x00(%%eax,%%eax,1)\n"); /* 5 byte nop. */
40854 else
40855 fprintf (file, "1:\tcall\t%s\n", target);
40856 }
40857
40858 /* Output assembler code to FILE to increment profiler label # LABELNO
40859 for profiling a function entry. */
40860 void
40861 x86_function_profiler (FILE *file, int labelno ATTRIBUTE_UNUSED)
40862 {
40863 const char *mcount_name = (flag_fentry ? MCOUNT_NAME_BEFORE_PROLOGUE
40864 : MCOUNT_NAME);
40865 if (TARGET_64BIT)
40866 {
40867 #ifndef NO_PROFILE_COUNTERS
40868 fprintf (file, "\tleaq\t%sP%d(%%rip),%%r11\n", LPREFIX, labelno);
40869 #endif
40870
40871 if (!TARGET_PECOFF && flag_pic)
40872 fprintf (file, "1:\tcall\t*%s@GOTPCREL(%%rip)\n", mcount_name);
40873 else
40874 x86_print_call_or_nop (file, mcount_name);
40875 }
40876 else if (flag_pic)
40877 {
40878 #ifndef NO_PROFILE_COUNTERS
40879 fprintf (file, "\tleal\t%sP%d@GOTOFF(%%ebx),%%" PROFILE_COUNT_REGISTER "\n",
40880 LPREFIX, labelno);
40881 #endif
40882 fprintf (file, "1:\tcall\t*%s@GOT(%%ebx)\n", mcount_name);
40883 }
40884 else
40885 {
40886 #ifndef NO_PROFILE_COUNTERS
40887 fprintf (file, "\tmovl\t$%sP%d,%%" PROFILE_COUNT_REGISTER "\n",
40888 LPREFIX, labelno);
40889 #endif
40890 x86_print_call_or_nop (file, mcount_name);
40891 }
40892
40893 if (flag_record_mcount)
40894 {
40895 fprintf (file, "\t.section __mcount_loc, \"a\",@progbits\n");
40896 fprintf (file, "\t.%s 1b\n", TARGET_64BIT ? "quad" : "long");
40897 fprintf (file, "\t.previous\n");
40898 }
40899 }
40900
40901 /* We don't have exact information about the insn sizes, but we may assume
40902 quite safely that we are informed about all 1 byte insns and memory
40903 address sizes. This is enough to eliminate unnecessary padding in
40904 99% of cases. */
40905
40906 static int
40907 min_insn_size (rtx_insn *insn)
40908 {
40909 int l = 0, len;
40910
40911 if (!INSN_P (insn) || !active_insn_p (insn))
40912 return 0;
40913
40914 /* Discard alignments we've emitted and jump instructions. */
40915 if (GET_CODE (PATTERN (insn)) == UNSPEC_VOLATILE
40916 && XINT (PATTERN (insn), 1) == UNSPECV_ALIGN)
40917 return 0;
40918
40919 /* Important case - calls are always 5 bytes.
40920 It is common to have many calls in a row. */
40921 if (CALL_P (insn)
40922 && symbolic_reference_mentioned_p (PATTERN (insn))
40923 && !SIBLING_CALL_P (insn))
40924 return 5;
40925 len = get_attr_length (insn);
40926 if (len <= 1)
40927 return 1;
40928
40929 /* For normal instructions we rely on get_attr_length being exact,
40930 with a few exceptions. */
40931 if (!JUMP_P (insn))
40932 {
40933 enum attr_type type = get_attr_type (insn);
40934
40935 switch (type)
40936 {
40937 case TYPE_MULTI:
40938 if (GET_CODE (PATTERN (insn)) == ASM_INPUT
40939 || asm_noperands (PATTERN (insn)) >= 0)
40940 return 0;
40941 break;
40942 case TYPE_OTHER:
40943 case TYPE_FCMP:
40944 break;
40945 default:
40946 /* Otherwise trust get_attr_length. */
40947 return len;
40948 }
40949
40950 l = get_attr_length_address (insn);
40951 if (l < 4 && symbolic_reference_mentioned_p (PATTERN (insn)))
40952 l = 4;
40953 }
40954 if (l)
40955 return 1+l;
40956 else
40957 return 2;
40958 }
40959
40960 #ifdef ASM_OUTPUT_MAX_SKIP_PAD
40961
40962 /* The AMD K8 core mispredicts jumps when there are more than 3 jumps in a
40963 16 byte window. */
40964
40965 static void
40966 ix86_avoid_jump_mispredicts (void)
40967 {
40968 rtx_insn *insn, *start = get_insns ();
40969 int nbytes = 0, njumps = 0;
40970 bool isjump = false;
40971
40972 /* Look for all minimal intervals of instructions containing 4 jumps.
40973 The intervals are bounded by START and INSN. NBYTES is the total
40974 size of the instructions in the interval, including INSN but not
40975 including START. When NBYTES is smaller than 16, it is possible
40976 that the end of START and INSN end up in the same 16-byte window.
40977
40978 The smallest offset at which INSN can start in that window is the case
40979 where START ends at offset 0; the offset of INSN is then
40980 NBYTES - sizeof (INSN). We emit a p2align to a 16-byte window with
40981 maxskip 15 - NBYTES + sizeof (INSN).
40982
40983 Don't consider an asm goto as a jump: while it can contain a jump, it
40984 doesn't have to, control may be transferred to its label(s) by other
40985 means, and we also estimate the minimum length of all asm stmts as 0. */
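/* Illustrative example with hypothetical sizes: if NBYTES is 12 and INSN
   is 2 bytes long, INSN can start as early as offset 12 - 2 == 10 in the
   window, and the pad emitted below is given 15 - 12 + 2 == 5 as its
   maximum skip.  */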
40985 for (insn = start; insn; insn = NEXT_INSN (insn))
40986 {
40987 int min_size;
40988
40989 if (LABEL_P (insn))
40990 {
40991 int align = label_to_alignment (insn);
40992 int max_skip = label_to_max_skip (insn);
40993
40994 if (max_skip > 15)
40995 max_skip = 15;
40996 /* If align > 3, only up to 16 - max_skip - 1 bytes can be
40997 already in the current 16 byte page, because otherwise
40998 ASM_OUTPUT_MAX_SKIP_ALIGN could skip max_skip or fewer
40999 bytes to reach a 16 byte boundary. */
41000 if (align <= 0
41001 || (align <= 3 && max_skip != (1 << align) - 1))
41002 max_skip = 0;
41003 if (dump_file)
41004 fprintf (dump_file, "Label %i with max_skip %i\n",
41005 INSN_UID (insn), max_skip);
41006 if (max_skip)
41007 {
41008 while (nbytes + max_skip >= 16)
41009 {
41010 start = NEXT_INSN (start);
41011 if ((JUMP_P (start) && asm_noperands (PATTERN (start)) < 0)
41012 || CALL_P (start))
41013 njumps--, isjump = true;
41014 else
41015 isjump = false;
41016 nbytes -= min_insn_size (start);
41017 }
41018 }
41019 continue;
41020 }
41021
41022 min_size = min_insn_size (insn);
41023 nbytes += min_size;
41024 if (dump_file)
41025 fprintf (dump_file, "Insn %i estimated to %i bytes\n",
41026 INSN_UID (insn), min_size);
41027 if ((JUMP_P (insn) && asm_noperands (PATTERN (insn)) < 0)
41028 || CALL_P (insn))
41029 njumps++;
41030 else
41031 continue;
41032
41033 while (njumps > 3)
41034 {
41035 start = NEXT_INSN (start);
41036 if ((JUMP_P (start) && asm_noperands (PATTERN (start)) < 0)
41037 || CALL_P (start))
41038 njumps--, isjump = true;
41039 else
41040 isjump = false;
41041 nbytes -= min_insn_size (start);
41042 }
41043 gcc_assert (njumps >= 0);
41044 if (dump_file)
41045 fprintf (dump_file, "Interval %i to %i has %i bytes\n",
41046 INSN_UID (start), INSN_UID (insn), nbytes);
41047
41048 if (njumps == 3 && isjump && nbytes < 16)
41049 {
41050 int padsize = 15 - nbytes + min_insn_size (insn);
41051
41052 if (dump_file)
41053 fprintf (dump_file, "Padding insn %i by %i bytes!\n",
41054 INSN_UID (insn), padsize);
41055 emit_insn_before (gen_pad (GEN_INT (padsize)), insn);
41056 }
41057 }
41058 }
41059 #endif
41060
41061 /* AMD Athlon works faster
41062 when RET is not the destination of a conditional jump or directly preceded
41063 by another jump instruction. We avoid the penalty by inserting a NOP just
41064 before the RET instructions in such cases. */
41065 static void
41066 ix86_pad_returns (void)
41067 {
41068 edge e;
41069 edge_iterator ei;
41070
41071 FOR_EACH_EDGE (e, ei, EXIT_BLOCK_PTR_FOR_FN (cfun)->preds)
41072 {
41073 basic_block bb = e->src;
41074 rtx_insn *ret = BB_END (bb);
41075 rtx_insn *prev;
41076 bool replace = false;
41077
41078 if (!JUMP_P (ret) || !ANY_RETURN_P (PATTERN (ret))
41079 || optimize_bb_for_size_p (bb))
41080 continue;
41081 for (prev = PREV_INSN (ret); prev; prev = PREV_INSN (prev))
41082 if (active_insn_p (prev) || LABEL_P (prev))
41083 break;
41084 if (prev && LABEL_P (prev))
41085 {
41086 edge e;
41087 edge_iterator ei;
41088
41089 FOR_EACH_EDGE (e, ei, bb->preds)
41090 if (EDGE_FREQUENCY (e) && e->src->index >= 0
41091 && !(e->flags & EDGE_FALLTHRU))
41092 {
41093 replace = true;
41094 break;
41095 }
41096 }
41097 if (!replace)
41098 {
41099 prev = prev_active_insn (ret);
41100 if (prev
41101 && ((JUMP_P (prev) && any_condjump_p (prev))
41102 || CALL_P (prev)))
41103 replace = true;
41104 /* Empty functions suffer a branch misprediction even when
41105 the jump destination is not visible to us. */
41106 if (!prev && !optimize_function_for_size_p (cfun))
41107 replace = true;
41108 }
41109 if (replace)
41110 {
41111 emit_jump_insn_before (gen_simple_return_internal_long (), ret);
41112 delete_insn (ret);
41113 }
41114 }
41115 }
41116
41117 /* Count the minimum number of instructions in BB. Return 4 if the
41118 number of instructions >= 4. */
41119
41120 static int
41121 ix86_count_insn_bb (basic_block bb)
41122 {
41123 rtx_insn *insn;
41124 int insn_count = 0;
41125
41126 /* Count number of instructions in this block. Return 4 if the number
41127 of instructions >= 4. */
41128 FOR_BB_INSNS (bb, insn)
41129 {
41130 /* This only happens in exit blocks. */
41131 if (JUMP_P (insn)
41132 && ANY_RETURN_P (PATTERN (insn)))
41133 break;
41134
41135 if (NONDEBUG_INSN_P (insn)
41136 && GET_CODE (PATTERN (insn)) != USE
41137 && GET_CODE (PATTERN (insn)) != CLOBBER)
41138 {
41139 insn_count++;
41140 if (insn_count >= 4)
41141 return insn_count;
41142 }
41143 }
41144
41145 return insn_count;
41146 }
41147
41148
41149 /* Count the minimum number of instructions in code path in BB.
41150 Return 4 if the number of instructions >= 4. */
41151
41152 static int
41153 ix86_count_insn (basic_block bb)
41154 {
41155 edge e;
41156 edge_iterator ei;
41157 int min_prev_count;
41158
41159 /* Only bother counting instructions along paths with no
41160 more than 2 basic blocks between entry and exit. Given
41161 that BB has an edge to exit, determine if a predecessor
41162 of BB has an edge from entry. If so, compute the number
41163 of instructions in the predecessor block. If there
41164 happen to be multiple such blocks, compute the minimum. */
41165 min_prev_count = 4;
41166 FOR_EACH_EDGE (e, ei, bb->preds)
41167 {
41168 edge prev_e;
41169 edge_iterator prev_ei;
41170
41171 if (e->src == ENTRY_BLOCK_PTR_FOR_FN (cfun))
41172 {
41173 min_prev_count = 0;
41174 break;
41175 }
41176 FOR_EACH_EDGE (prev_e, prev_ei, e->src->preds)
41177 {
41178 if (prev_e->src == ENTRY_BLOCK_PTR_FOR_FN (cfun))
41179 {
41180 int count = ix86_count_insn_bb (e->src);
41181 if (count < min_prev_count)
41182 min_prev_count = count;
41183 break;
41184 }
41185 }
41186 }
41187
41188 if (min_prev_count < 4)
41189 min_prev_count += ix86_count_insn_bb (bb);
41190
41191 return min_prev_count;
41192 }
41193
41194 /* Pad short functions to 4 instructions. */
41195
41196 static void
41197 ix86_pad_short_function (void)
41198 {
41199 edge e;
41200 edge_iterator ei;
41201
41202 FOR_EACH_EDGE (e, ei, EXIT_BLOCK_PTR_FOR_FN (cfun)->preds)
41203 {
41204 rtx_insn *ret = BB_END (e->src);
41205 if (JUMP_P (ret) && ANY_RETURN_P (PATTERN (ret)))
41206 {
41207 int insn_count = ix86_count_insn (e->src);
41208
41209 /* Pad short function. */
41210 if (insn_count < 4)
41211 {
41212 rtx_insn *insn = ret;
41213
41214 /* Find epilogue. */
41215 while (insn
41216 && (!NOTE_P (insn)
41217 || NOTE_KIND (insn) != NOTE_INSN_EPILOGUE_BEG))
41218 insn = PREV_INSN (insn);
41219
41220 if (!insn)
41221 insn = ret;
41222
41223 /* Two NOPs count as one instruction. */
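/* For instance (illustrative), a path with only 2 counted insns gets
   2 * (4 - 2) == 4 NOPs emitted below, i.e. the equivalent of the two
   missing instructions.  */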
41224 insn_count = 2 * (4 - insn_count);
41225 emit_insn_before (gen_nops (GEN_INT (insn_count)), insn);
41226 }
41227 }
41228 }
41229 }
41230
41231 /* Fix up a Windows system unwinder issue. If an EH region falls through into
41232 the epilogue, the Windows system unwinder will apply epilogue logic and
41233 produce incorrect offsets. This can be avoided by adding a nop between
41234 the last insn that can throw and the first insn of the epilogue. */
41235
41236 static void
41237 ix86_seh_fixup_eh_fallthru (void)
41238 {
41239 edge e;
41240 edge_iterator ei;
41241
41242 FOR_EACH_EDGE (e, ei, EXIT_BLOCK_PTR_FOR_FN (cfun)->preds)
41243 {
41244 rtx_insn *insn, *next;
41245
41246 /* Find the beginning of the epilogue. */
41247 for (insn = BB_END (e->src); insn != NULL; insn = PREV_INSN (insn))
41248 if (NOTE_P (insn) && NOTE_KIND (insn) == NOTE_INSN_EPILOGUE_BEG)
41249 break;
41250 if (insn == NULL)
41251 continue;
41252
41253 /* We only care about preceding insns that can throw. */
41254 insn = prev_active_insn (insn);
41255 if (insn == NULL || !can_throw_internal (insn))
41256 continue;
41257
41258 /* Do not separate calls from their debug information. */
41259 for (next = NEXT_INSN (insn); next != NULL; next = NEXT_INSN (next))
41260 if (NOTE_P (next)
41261 && (NOTE_KIND (next) == NOTE_INSN_VAR_LOCATION
41262 || NOTE_KIND (next) == NOTE_INSN_CALL_ARG_LOCATION))
41263 insn = next;
41264 else
41265 break;
41266
41267 emit_insn_after (gen_nops (const1_rtx), insn);
41268 }
41269 }
41270
41271 /* Given a register number BASE, the lowest of a group of registers, update
41272 regsets IN and OUT with the registers that should be avoided in input
41273 and output operands respectively when trying to avoid generating a modr/m
41274 byte for -mmitigate-rop. */
41275
41276 static void
41277 set_rop_modrm_reg_bits (int base, HARD_REG_SET &in, HARD_REG_SET &out)
41278 {
41279 SET_HARD_REG_BIT (out, base);
41280 SET_HARD_REG_BIT (out, base + 1);
41281 SET_HARD_REG_BIT (in, base + 2);
41282 SET_HARD_REG_BIT (in, base + 3);
41283 }
41284
41285 /* Called if -mmitigate-rop is in effect. Try to rewrite instructions so
41286 that certain encodings of modr/m bytes do not occur. */
41287 static void
41288 ix86_mitigate_rop (void)
41289 {
41290 HARD_REG_SET input_risky;
41291 HARD_REG_SET output_risky;
41292 HARD_REG_SET inout_risky;
41293
41294 CLEAR_HARD_REG_SET (output_risky);
41295 CLEAR_HARD_REG_SET (input_risky);
41296 SET_HARD_REG_BIT (output_risky, AX_REG);
41297 SET_HARD_REG_BIT (output_risky, CX_REG);
41298 SET_HARD_REG_BIT (input_risky, BX_REG);
41299 SET_HARD_REG_BIT (input_risky, DX_REG);
41300 set_rop_modrm_reg_bits (FIRST_SSE_REG, input_risky, output_risky);
41301 set_rop_modrm_reg_bits (FIRST_REX_INT_REG, input_risky, output_risky);
41302 set_rop_modrm_reg_bits (FIRST_REX_SSE_REG, input_risky, output_risky);
41303 set_rop_modrm_reg_bits (FIRST_EXT_REX_SSE_REG, input_risky, output_risky);
41304 set_rop_modrm_reg_bits (FIRST_MASK_REG, input_risky, output_risky);
41305 set_rop_modrm_reg_bits (FIRST_BND_REG, input_risky, output_risky);
41306 COPY_HARD_REG_SET (inout_risky, input_risky);
41307 IOR_HARD_REG_SET (inout_risky, output_risky);
41308
41309 df_note_add_problem ();
41310 /* Fix up what stack-regs did. */
41311 df_insn_rescan_all ();
41312 df_analyze ();
41313
41314 regrename_init (true);
41315 regrename_analyze (NULL);
41316
41317 auto_vec<du_head_p> cands;
41318
41319 for (rtx_insn *insn = get_insns (); insn; insn = NEXT_INSN (insn))
41320 {
41321 if (!NONDEBUG_INSN_P (insn))
41322 continue;
41323
41324 if (GET_CODE (PATTERN (insn)) == USE
41325 || GET_CODE (PATTERN (insn)) == CLOBBER)
41326 continue;
41327
41328 extract_insn (insn);
41329
41330 int opno0, opno1;
41331 int modrm = ix86_get_modrm_for_rop (insn, recog_data.operand,
41332 recog_data.n_operands, &opno0,
41333 &opno1);
41334
41335 if (!ix86_rop_should_change_byte_p (modrm))
41336 continue;
41337
41338 insn_rr_info *info = &insn_rr[INSN_UID (insn)];
41339
41340 /* This happens when regrename has to fail a block. */
41341 if (!info->op_info)
41342 continue;
41343
41344 if (info->op_info[opno0].n_chains != 0)
41345 {
41346 gcc_assert (info->op_info[opno0].n_chains == 1);
41347 du_head_p op0c;
41348 op0c = regrename_chain_from_id (info->op_info[opno0].heads[0]->id);
41349 if (op0c->target_data_1 + op0c->target_data_2 == 0
41350 && !op0c->cannot_rename)
41351 cands.safe_push (op0c);
41352
41353 op0c->target_data_1++;
41354 }
41355 if (info->op_info[opno1].n_chains != 0)
41356 {
41357 gcc_assert (info->op_info[opno1].n_chains == 1);
41358 du_head_p op1c;
41359 op1c = regrename_chain_from_id (info->op_info[opno1].heads[0]->id);
41360 if (op1c->target_data_1 + op1c->target_data_2 == 0
41361 && !op1c->cannot_rename)
41362 cands.safe_push (op1c);
41363
41364 op1c->target_data_2++;
41365 }
41366 }
41367
41368 int i;
41369 du_head_p head;
41370 FOR_EACH_VEC_ELT (cands, i, head)
41371 {
41372 int old_reg, best_reg;
41373 HARD_REG_SET unavailable;
41374
41375 CLEAR_HARD_REG_SET (unavailable);
41376 if (head->target_data_1)
41377 IOR_HARD_REG_SET (unavailable, output_risky);
41378 if (head->target_data_2)
41379 IOR_HARD_REG_SET (unavailable, input_risky);
41380
41381 int n_uses;
41382 reg_class superclass = regrename_find_superclass (head, &n_uses,
41383 &unavailable);
41384 old_reg = head->regno;
41385 best_reg = find_rename_reg (head, superclass, &unavailable,
41386 old_reg, false);
41387 bool ok = regrename_do_replace (head, best_reg);
41388 gcc_assert (ok);
41389 if (dump_file)
41390 fprintf (dump_file, "Chain %d renamed as %s in %s\n", head->id,
41391 reg_names[best_reg], reg_class_names[superclass]);
41392
41393 }
41394
41395 regrename_finish ();
41396
41397 df_analyze ();
41398
41399 basic_block bb;
41400 regset_head live;
41401
41402 INIT_REG_SET (&live);
41403
41404 FOR_EACH_BB_FN (bb, cfun)
41405 {
41406 rtx_insn *insn;
41407
41408 COPY_REG_SET (&live, DF_LR_OUT (bb));
41409 df_simulate_initialize_backwards (bb, &live);
41410
41411 FOR_BB_INSNS_REVERSE (bb, insn)
41412 {
41413 if (!NONDEBUG_INSN_P (insn))
41414 continue;
41415
41416 df_simulate_one_insn_backwards (bb, insn, &live);
41417
41418 if (GET_CODE (PATTERN (insn)) == USE
41419 || GET_CODE (PATTERN (insn)) == CLOBBER)
41420 continue;
41421
41422 extract_insn (insn);
41423 constrain_operands_cached (insn, reload_completed);
41424 int opno0, opno1;
41425 int modrm = ix86_get_modrm_for_rop (insn, recog_data.operand,
41426 recog_data.n_operands, &opno0,
41427 &opno1);
41428 if (modrm < 0
41429 || !ix86_rop_should_change_byte_p (modrm)
41430 || opno0 == opno1)
41431 continue;
41432
41433 rtx oldreg = recog_data.operand[opno1];
41434 preprocess_constraints (insn);
41435 const operand_alternative *alt = which_op_alt ();
41436
41437 int i;
41438 for (i = 0; i < recog_data.n_operands; i++)
41439 if (i != opno1
41440 && alt[i].earlyclobber
41441 && reg_overlap_mentioned_p (recog_data.operand[i],
41442 oldreg))
41443 break;
41444
41445 if (i < recog_data.n_operands)
41446 continue;
41447
41448 if (dump_file)
41449 fprintf (dump_file,
41450 "attempting to fix modrm byte in insn %d:"
41451 " reg %d class %s", INSN_UID (insn), REGNO (oldreg),
41452 reg_class_names[alt[opno1].cl]);
41453
41454 HARD_REG_SET unavailable;
41455 REG_SET_TO_HARD_REG_SET (unavailable, &live);
41456 SET_HARD_REG_BIT (unavailable, REGNO (oldreg));
41457 IOR_COMPL_HARD_REG_SET (unavailable, call_used_reg_set);
41458 IOR_HARD_REG_SET (unavailable, fixed_reg_set);
41459 IOR_HARD_REG_SET (unavailable, output_risky);
41460 IOR_COMPL_HARD_REG_SET (unavailable,
41461 reg_class_contents[alt[opno1].cl]);
41462
41463 for (i = 0; i < FIRST_PSEUDO_REGISTER; i++)
41464 if (!TEST_HARD_REG_BIT (unavailable, i))
41465 break;
41466 if (i == FIRST_PSEUDO_REGISTER)
41467 {
41468 if (dump_file)
41469 fprintf (dump_file, ", none available\n");
41470 continue;
41471 }
41472 if (dump_file)
41473 fprintf (dump_file, " -> %d\n", i);
41474 rtx newreg = gen_rtx_REG (recog_data.operand_mode[opno1], i);
41475 validate_change (insn, recog_data.operand_loc[opno1], newreg, false);
41476 insn = emit_insn_before (gen_move_insn (newreg, oldreg), insn);
41477 }
41478 }
41479 }
41480
41481 /* Implement machine specific optimizations. We implement padding of returns
41482 for K8 CPUs and a pass to avoid 4 jumps in a single 16 byte window. */
41483 static void
41484 ix86_reorg (void)
41485 {
41486 /* We are freeing block_for_insn in the toplev to keep compatibility
41487 with old MDEP_REORGS that are not CFG based. Recompute it now. */
41488 compute_bb_for_insn ();
41489
41490 if (flag_mitigate_rop)
41491 ix86_mitigate_rop ();
41492
41493 if (TARGET_SEH && current_function_has_exception_handlers ())
41494 ix86_seh_fixup_eh_fallthru ();
41495
41496 if (optimize && optimize_function_for_speed_p (cfun))
41497 {
41498 if (TARGET_PAD_SHORT_FUNCTION)
41499 ix86_pad_short_function ();
41500 else if (TARGET_PAD_RETURNS)
41501 ix86_pad_returns ();
41502 #ifdef ASM_OUTPUT_MAX_SKIP_PAD
41503 if (TARGET_FOUR_JUMP_LIMIT)
41504 ix86_avoid_jump_mispredicts ();
41505 #endif
41506 }
41507 }
41508
41509 /* Return nonzero when a QImode register that must be represented via a REX
41510 prefix is used. */
41511 bool
41512 x86_extended_QIreg_mentioned_p (rtx_insn *insn)
41513 {
41514 int i;
41515 extract_insn_cached (insn);
41516 for (i = 0; i < recog_data.n_operands; i++)
41517 if (GENERAL_REG_P (recog_data.operand[i])
41518 && !QI_REGNO_P (REGNO (recog_data.operand[i])))
41519 return true;
41520 return false;
41521 }
41522
41523 /* Return true when INSN mentions a register that must be encoded using a
41524 REX prefix. */
41525 bool
41526 x86_extended_reg_mentioned_p (rtx insn)
41527 {
41528 subrtx_iterator::array_type array;
41529 FOR_EACH_SUBRTX (iter, array, INSN_P (insn) ? PATTERN (insn) : insn, NONCONST)
41530 {
41531 const_rtx x = *iter;
41532 if (REG_P (x)
41533 && (REX_INT_REGNO_P (REGNO (x)) || REX_SSE_REGNO_P (REGNO (x))))
41534 return true;
41535 }
41536 return false;
41537 }
41538
41539 /* If profitable, negate (without causing overflow) integer constant
41540 of mode MODE at location LOC. Return true in this case. */
41541 bool
41542 x86_maybe_negate_const_int (rtx *loc, machine_mode mode)
41543 {
41544 HOST_WIDE_INT val;
41545
41546 if (!CONST_INT_P (*loc))
41547 return false;
41548
41549 switch (mode)
41550 {
41551 case DImode:
41552 /* DImode x86_64 constants must fit in 32 bits. */
41553 gcc_assert (x86_64_immediate_operand (*loc, mode));
41554
41555 mode = SImode;
41556 break;
41557
41558 case SImode:
41559 case HImode:
41560 case QImode:
41561 break;
41562
41563 default:
41564 gcc_unreachable ();
41565 }
41566
41567 /* Avoid overflows. */
41568 if (mode_signbit_p (mode, *loc))
41569 return false;
41570
41571 val = INTVAL (*loc);
41572
41573 /* Make things pretty: use `subl $4,%eax' rather than `addl $-4,%eax'.
41574 Exception: -128 encodes smaller than 128, so swap the sign and the op. */
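/* Illustrative example: `addl $128,%eax' needs a 32-bit immediate, while
   the equivalent `subl $-128,%eax' fits the sign-extended 8-bit immediate
   form; conversely `addl $-128,%eax' is left alone, since negating -128
   to 128 would lose that short encoding.  */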
41575 if ((val < 0 && val != -128)
41576 || val == 128)
41577 {
41578 *loc = GEN_INT (-val);
41579 return true;
41580 }
41581
41582 return false;
41583 }
41584
41585 /* Generate an unsigned DImode/SImode to FP conversion. This is the same code
41586 optabs would emit if we didn't have TFmode patterns. */
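/* A sketch of what this expands to, in illustrative pseudo-C; the real
   output below is RTL, and OUT/IN stand for operands[0]/operands[1]:

     if ((signed) in >= 0)
       out = (FLOAT) in;
     else
       {
         half = (in >> 1) | (in & 1);    logical shift, keep the low bit
         out = (FLOAT) half;
         out = out + out;                undo the halving
       }
*/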
41587
41588 void
41589 x86_emit_floatuns (rtx operands[2])
41590 {
41591 rtx_code_label *neglab, *donelab;
41592 rtx i0, i1, f0, in, out;
41593 machine_mode mode, inmode;
41594
41595 inmode = GET_MODE (operands[1]);
41596 gcc_assert (inmode == SImode || inmode == DImode);
41597
41598 out = operands[0];
41599 in = force_reg (inmode, operands[1]);
41600 mode = GET_MODE (out);
41601 neglab = gen_label_rtx ();
41602 donelab = gen_label_rtx ();
41603 f0 = gen_reg_rtx (mode);
41604
41605 emit_cmp_and_jump_insns (in, const0_rtx, LT, const0_rtx, inmode, 0, neglab);
41606
41607 expand_float (out, in, 0);
41608
41609 emit_jump_insn (gen_jump (donelab));
41610 emit_barrier ();
41611
41612 emit_label (neglab);
41613
41614 i0 = expand_simple_binop (inmode, LSHIFTRT, in, const1_rtx, NULL,
41615 1, OPTAB_DIRECT);
41616 i1 = expand_simple_binop (inmode, AND, in, const1_rtx, NULL,
41617 1, OPTAB_DIRECT);
41618 i0 = expand_simple_binop (inmode, IOR, i0, i1, i0, 1, OPTAB_DIRECT);
41619
41620 expand_float (f0, i0, 0);
41621
41622 emit_insn (gen_rtx_SET (out, gen_rtx_PLUS (mode, f0, f0)));
41623
41624 emit_label (donelab);
41625 }
41626 \f
41627 static bool canonicalize_perm (struct expand_vec_perm_d *d);
41628 static bool expand_vec_perm_1 (struct expand_vec_perm_d *d);
41629 static bool expand_vec_perm_broadcast_1 (struct expand_vec_perm_d *d);
41630 static bool expand_vec_perm_palignr (struct expand_vec_perm_d *d, bool);
41631
41632 /* Get a vector mode of the same size as the original but with elements
41633 twice as wide. This is only guaranteed to apply to integral vectors. */
41634
41635 static inline machine_mode
41636 get_mode_wider_vector (machine_mode o)
41637 {
41638 /* ??? Rely on the ordering that genmodes.c gives to vectors. */
41639 machine_mode n = GET_MODE_WIDER_MODE (o);
41640 gcc_assert (GET_MODE_NUNITS (o) == GET_MODE_NUNITS (n) * 2);
41641 gcc_assert (GET_MODE_SIZE (o) == GET_MODE_SIZE (n));
41642 return n;
41643 }
41644
41645 /* A subroutine of ix86_expand_vector_init_duplicate. Tries to
41646 fill target with val via vec_duplicate. */
41647
41648 static bool
41649 ix86_vector_duplicate_value (machine_mode mode, rtx target, rtx val)
41650 {
41651 bool ok;
41652 rtx_insn *insn;
41653 rtx dup;
41654
41655 /* First attempt to recognize VAL as-is. */
41656 dup = gen_rtx_VEC_DUPLICATE (mode, val);
41657 insn = emit_insn (gen_rtx_SET (target, dup));
41658 if (recog_memoized (insn) < 0)
41659 {
41660 rtx_insn *seq;
41661 /* If that fails, force VAL into a register. */
41662
41663 start_sequence ();
41664 XEXP (dup, 0) = force_reg (GET_MODE_INNER (mode), val);
41665 seq = get_insns ();
41666 end_sequence ();
41667 if (seq)
41668 emit_insn_before (seq, insn);
41669
41670 ok = recog_memoized (insn) >= 0;
41671 gcc_assert (ok);
41672 }
41673 return true;
41674 }
41675
41676 /* A subroutine of ix86_expand_vector_init. Store into TARGET a vector
41677 with all elements equal to VAR. Return true if successful. */
41678
41679 static bool
41680 ix86_expand_vector_init_duplicate (bool mmx_ok, machine_mode mode,
41681 rtx target, rtx val)
41682 {
41683 bool ok;
41684
41685 switch (mode)
41686 {
41687 case V2SImode:
41688 case V2SFmode:
41689 if (!mmx_ok)
41690 return false;
41691 /* FALLTHRU */
41692
41693 case V4DFmode:
41694 case V4DImode:
41695 case V8SFmode:
41696 case V8SImode:
41697 case V2DFmode:
41698 case V2DImode:
41699 case V4SFmode:
41700 case V4SImode:
41701 case V16SImode:
41702 case V8DImode:
41703 case V16SFmode:
41704 case V8DFmode:
41705 return ix86_vector_duplicate_value (mode, target, val);
41706
41707 case V4HImode:
41708 if (!mmx_ok)
41709 return false;
41710 if (TARGET_SSE || TARGET_3DNOW_A)
41711 {
41712 rtx x;
41713
41714 val = gen_lowpart (SImode, val);
41715 x = gen_rtx_TRUNCATE (HImode, val);
41716 x = gen_rtx_VEC_DUPLICATE (mode, x);
41717 emit_insn (gen_rtx_SET (target, x));
41718 return true;
41719 }
41720 goto widen;
41721
41722 case V8QImode:
41723 if (!mmx_ok)
41724 return false;
41725 goto widen;
41726
41727 case V8HImode:
41728 if (TARGET_AVX2)
41729 return ix86_vector_duplicate_value (mode, target, val);
41730
41731 if (TARGET_SSE2)
41732 {
41733 struct expand_vec_perm_d dperm;
41734 rtx tmp1, tmp2;
41735
41736 permute:
41737 memset (&dperm, 0, sizeof (dperm));
41738 dperm.target = target;
41739 dperm.vmode = mode;
41740 dperm.nelt = GET_MODE_NUNITS (mode);
41741 dperm.op0 = dperm.op1 = gen_reg_rtx (mode);
41742 dperm.one_operand_p = true;
41743
41744 /* Extend to SImode using a paradoxical SUBREG. */
41745 tmp1 = gen_reg_rtx (SImode);
41746 emit_move_insn (tmp1, gen_lowpart (SImode, val));
41747
41748 /* Insert the SImode value as low element of a V4SImode vector. */
41749 tmp2 = gen_reg_rtx (V4SImode);
41750 emit_insn (gen_vec_setv4si_0 (tmp2, CONST0_RTX (V4SImode), tmp1));
41751 emit_move_insn (dperm.op0, gen_lowpart (mode, tmp2));
41752
41753 ok = (expand_vec_perm_1 (&dperm)
41754 || expand_vec_perm_broadcast_1 (&dperm));
41755 gcc_assert (ok);
41756 return ok;
41757 }
41758 goto widen;
41759
41760 case V16QImode:
41761 if (TARGET_AVX2)
41762 return ix86_vector_duplicate_value (mode, target, val);
41763
41764 if (TARGET_SSE2)
41765 goto permute;
41766 goto widen;
41767
41768 widen:
41769 /* Replicate the value once into the next wider mode and recurse. */
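/* E.g. (illustrative): broadcasting the QImode value 0xab into V8QImode
   becomes broadcasting the HImode value 0xabab into V4HImode.  */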
41770 {
41771 machine_mode smode, wsmode, wvmode;
41772 rtx x;
41773
41774 smode = GET_MODE_INNER (mode);
41775 wvmode = get_mode_wider_vector (mode);
41776 wsmode = GET_MODE_INNER (wvmode);
41777
41778 val = convert_modes (wsmode, smode, val, true);
41779 x = expand_simple_binop (wsmode, ASHIFT, val,
41780 GEN_INT (GET_MODE_BITSIZE (smode)),
41781 NULL_RTX, 1, OPTAB_LIB_WIDEN);
41782 val = expand_simple_binop (wsmode, IOR, val, x, x, 1, OPTAB_LIB_WIDEN);
41783
41784 x = gen_reg_rtx (wvmode);
41785 ok = ix86_expand_vector_init_duplicate (mmx_ok, wvmode, x, val);
41786 gcc_assert (ok);
41787 emit_move_insn (target, gen_lowpart (GET_MODE (target), x));
41788 return ok;
41789 }
41790
41791 case V16HImode:
41792 case V32QImode:
41793 if (TARGET_AVX2)
41794 return ix86_vector_duplicate_value (mode, target, val);
41795 else
41796 {
41797 machine_mode hvmode = (mode == V16HImode ? V8HImode : V16QImode);
41798 rtx x = gen_reg_rtx (hvmode);
41799
41800 ok = ix86_expand_vector_init_duplicate (false, hvmode, x, val);
41801 gcc_assert (ok);
41802
41803 x = gen_rtx_VEC_CONCAT (mode, x, x);
41804 emit_insn (gen_rtx_SET (target, x));
41805 }
41806 return true;
41807
41808 case V64QImode:
41809 case V32HImode:
41810 if (TARGET_AVX512BW)
41811 return ix86_vector_duplicate_value (mode, target, val);
41812 else
41813 {
41814 machine_mode hvmode = (mode == V32HImode ? V16HImode : V32QImode);
41815 rtx x = gen_reg_rtx (hvmode);
41816
41817 ok = ix86_expand_vector_init_duplicate (false, hvmode, x, val);
41818 gcc_assert (ok);
41819
41820 x = gen_rtx_VEC_CONCAT (mode, x, x);
41821 emit_insn (gen_rtx_SET (target, x));
41822 }
41823 return true;
41824
41825 default:
41826 return false;
41827 }
41828 }
41829
41830 /* A subroutine of ix86_expand_vector_init. Store into TARGET a vector
41831 whose ONE_VAR element is VAR, and other elements are zero. Return true
41832 if successful. */
41833
41834 static bool
41835 ix86_expand_vector_init_one_nonzero (bool mmx_ok, machine_mode mode,
41836 rtx target, rtx var, int one_var)
41837 {
41838 machine_mode vsimode;
41839 rtx new_target;
41840 rtx x, tmp;
41841 bool use_vector_set = false;
41842
41843 switch (mode)
41844 {
41845 case V2DImode:
41846 /* For SSE4.1, we normally use vector set. But if the second
41847 element is zero and inter-unit moves are OK, we use movq
41848 instead. */
41849 use_vector_set = (TARGET_64BIT && TARGET_SSE4_1
41850 && !(TARGET_INTER_UNIT_MOVES_TO_VEC
41851 && one_var == 0));
41852 break;
41853 case V16QImode:
41854 case V4SImode:
41855 case V4SFmode:
41856 use_vector_set = TARGET_SSE4_1;
41857 break;
41858 case V8HImode:
41859 use_vector_set = TARGET_SSE2;
41860 break;
41861 case V4HImode:
41862 use_vector_set = TARGET_SSE || TARGET_3DNOW_A;
41863 break;
41864 case V32QImode:
41865 case V16HImode:
41866 case V8SImode:
41867 case V8SFmode:
41868 case V4DFmode:
41869 use_vector_set = TARGET_AVX;
41870 break;
41871 case V4DImode:
41872 /* Use ix86_expand_vector_set in 64bit mode only. */
41873 use_vector_set = TARGET_AVX && TARGET_64BIT;
41874 break;
41875 default:
41876 break;
41877 }
41878
41879 if (use_vector_set)
41880 {
41881 emit_insn (gen_rtx_SET (target, CONST0_RTX (mode)));
41882 var = force_reg (GET_MODE_INNER (mode), var);
41883 ix86_expand_vector_set (mmx_ok, target, var, one_var);
41884 return true;
41885 }
41886
41887 switch (mode)
41888 {
41889 case V2SFmode:
41890 case V2SImode:
41891 if (!mmx_ok)
41892 return false;
41893 /* FALLTHRU */
41894
41895 case V2DFmode:
41896 case V2DImode:
41897 if (one_var != 0)
41898 return false;
41899 var = force_reg (GET_MODE_INNER (mode), var);
41900 x = gen_rtx_VEC_CONCAT (mode, var, CONST0_RTX (GET_MODE_INNER (mode)));
41901 emit_insn (gen_rtx_SET (target, x));
41902 return true;
41903
41904 case V4SFmode:
41905 case V4SImode:
41906 if (!REG_P (target) || REGNO (target) < FIRST_PSEUDO_REGISTER)
41907 new_target = gen_reg_rtx (mode);
41908 else
41909 new_target = target;
41910 var = force_reg (GET_MODE_INNER (mode), var);
41911 x = gen_rtx_VEC_DUPLICATE (mode, var);
41912 x = gen_rtx_VEC_MERGE (mode, x, CONST0_RTX (mode), const1_rtx);
41913 emit_insn (gen_rtx_SET (new_target, x));
41914 if (one_var != 0)
41915 {
41916 /* We need to shuffle the value to the correct position, so
41917 create a new pseudo to store the intermediate result. */
41918
41919 /* With SSE2, we can use the integer shuffle insns. */
41920 if (mode != V4SFmode && TARGET_SSE2)
41921 {
41922 emit_insn (gen_sse2_pshufd_1 (new_target, new_target,
41923 const1_rtx,
41924 GEN_INT (one_var == 1 ? 0 : 1),
41925 GEN_INT (one_var == 2 ? 0 : 1),
41926 GEN_INT (one_var == 3 ? 0 : 1)));
41927 if (target != new_target)
41928 emit_move_insn (target, new_target);
41929 return true;
41930 }
41931
41932 /* Otherwise convert the intermediate result to V4SFmode and
41933 use the SSE1 shuffle instructions. */
41934 if (mode != V4SFmode)
41935 {
41936 tmp = gen_reg_rtx (V4SFmode);
41937 emit_move_insn (tmp, gen_lowpart (V4SFmode, new_target));
41938 }
41939 else
41940 tmp = new_target;
41941
41942 emit_insn (gen_sse_shufps_v4sf (tmp, tmp, tmp,
41943 const1_rtx,
41944 GEN_INT (one_var == 1 ? 0 : 1),
41945 GEN_INT (one_var == 2 ? 0+4 : 1+4),
41946 GEN_INT (one_var == 3 ? 0+4 : 1+4)));
41947
41948 if (mode != V4SFmode)
41949 emit_move_insn (target, gen_lowpart (V4SImode, tmp));
41950 else if (tmp != target)
41951 emit_move_insn (target, tmp);
41952 }
41953 else if (target != new_target)
41954 emit_move_insn (target, new_target);
41955 return true;
41956
41957 case V8HImode:
41958 case V16QImode:
41959 vsimode = V4SImode;
41960 goto widen;
41961 case V4HImode:
41962 case V8QImode:
41963 if (!mmx_ok)
41964 return false;
41965 vsimode = V2SImode;
41966 goto widen;
41967 widen:
41968 if (one_var != 0)
41969 return false;
41970
41971 /* Zero extend the variable element to SImode and recurse. */
41972 var = convert_modes (SImode, GET_MODE_INNER (mode), var, true);
41973
41974 x = gen_reg_rtx (vsimode);
41975 if (!ix86_expand_vector_init_one_nonzero (mmx_ok, vsimode, x,
41976 var, one_var))
41977 gcc_unreachable ();
41978
41979 emit_move_insn (target, gen_lowpart (mode, x));
41980 return true;
41981
41982 default:
41983 return false;
41984 }
41985 }
41986
41987 /* A subroutine of ix86_expand_vector_init. Store into TARGET a vector
41988 consisting of the values in VALS. It is known that all elements
41989 except ONE_VAR are constants. Return true if successful. */
41990
41991 static bool
41992 ix86_expand_vector_init_one_var (bool mmx_ok, machine_mode mode,
41993 rtx target, rtx vals, int one_var)
41994 {
41995 rtx var = XVECEXP (vals, 0, one_var);
41996 machine_mode wmode;
41997 rtx const_vec, x;
41998
41999 const_vec = copy_rtx (vals);
42000 XVECEXP (const_vec, 0, one_var) = CONST0_RTX (GET_MODE_INNER (mode));
42001 const_vec = gen_rtx_CONST_VECTOR (mode, XVEC (const_vec, 0));
42002
42003 switch (mode)
42004 {
42005 case V2DFmode:
42006 case V2DImode:
42007 case V2SFmode:
42008 case V2SImode:
42009 /* For the two element vectors, it's just as easy to use
42010 the general case. */
42011 return false;
42012
42013 case V4DImode:
42014 /* Use ix86_expand_vector_set in 64bit mode only. */
42015 if (!TARGET_64BIT)
42016 return false;
42017 /* FALLTHRU */
42018 case V4DFmode:
42019 case V8SFmode:
42020 case V8SImode:
42021 case V16HImode:
42022 case V32QImode:
42023 case V4SFmode:
42024 case V4SImode:
42025 case V8HImode:
42026 case V4HImode:
42027 break;
42028
42029 case V16QImode:
42030 if (TARGET_SSE4_1)
42031 break;
42032 wmode = V8HImode;
42033 goto widen;
42034 case V8QImode:
42035 wmode = V4HImode;
42036 goto widen;
42037 widen:
42038 /* There's no way to set one QImode entry easily. Combine
42039 the variable value with its adjacent constant value, and
42040 promote to an HImode set. */
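/* For instance (illustrative), if the variable byte is at an odd index,
   it is zero-extended to HImode, shifted left by 8 and IORed with the
   constant byte below it; the combined HImode value is then inserted at
   index one_var >> 1.  */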
42041 x = XVECEXP (vals, 0, one_var ^ 1);
42042 if (one_var & 1)
42043 {
42044 var = convert_modes (HImode, QImode, var, true);
42045 var = expand_simple_binop (HImode, ASHIFT, var, GEN_INT (8),
42046 NULL_RTX, 1, OPTAB_LIB_WIDEN);
42047 x = GEN_INT (INTVAL (x) & 0xff);
42048 }
42049 else
42050 {
42051 var = convert_modes (HImode, QImode, var, true);
42052 x = gen_int_mode (INTVAL (x) << 8, HImode);
42053 }
42054 if (x != const0_rtx)
42055 var = expand_simple_binop (HImode, IOR, var, x, var,
42056 1, OPTAB_LIB_WIDEN);
42057
42058 x = gen_reg_rtx (wmode);
42059 emit_move_insn (x, gen_lowpart (wmode, const_vec));
42060 ix86_expand_vector_set (mmx_ok, x, var, one_var >> 1);
42061
42062 emit_move_insn (target, gen_lowpart (mode, x));
42063 return true;
42064
42065 default:
42066 return false;
42067 }
42068
42069 emit_move_insn (target, const_vec);
42070 ix86_expand_vector_set (mmx_ok, target, var, one_var);
42071 return true;
42072 }
42073
42074 /* A subroutine of ix86_expand_vector_init_general. Use vector
42075 concatenate to handle the most general case: all values variable,
42076 and none identical. */
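/* For example (illustrative), a V4SFmode init from {a, b, c, d} first
   builds two V2SFmode halves {a, b} and {c, d} recursively and then emits
   a single VEC_CONCAT of the halves.  */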
42077
42078 static void
42079 ix86_expand_vector_init_concat (machine_mode mode,
42080 rtx target, rtx *ops, int n)
42081 {
42082 machine_mode cmode, hmode = VOIDmode, gmode = VOIDmode;
42083 rtx first[16], second[8], third[4];
42084 rtvec v;
42085 int i, j;
42086
42087 switch (n)
42088 {
42089 case 2:
42090 switch (mode)
42091 {
42092 case V16SImode:
42093 cmode = V8SImode;
42094 break;
42095 case V16SFmode:
42096 cmode = V8SFmode;
42097 break;
42098 case V8DImode:
42099 cmode = V4DImode;
42100 break;
42101 case V8DFmode:
42102 cmode = V4DFmode;
42103 break;
42104 case V8SImode:
42105 cmode = V4SImode;
42106 break;
42107 case V8SFmode:
42108 cmode = V4SFmode;
42109 break;
42110 case V4DImode:
42111 cmode = V2DImode;
42112 break;
42113 case V4DFmode:
42114 cmode = V2DFmode;
42115 break;
42116 case V4SImode:
42117 cmode = V2SImode;
42118 break;
42119 case V4SFmode:
42120 cmode = V2SFmode;
42121 break;
42122 case V2DImode:
42123 cmode = DImode;
42124 break;
42125 case V2SImode:
42126 cmode = SImode;
42127 break;
42128 case V2DFmode:
42129 cmode = DFmode;
42130 break;
42131 case V2SFmode:
42132 cmode = SFmode;
42133 break;
42134 default:
42135 gcc_unreachable ();
42136 }
42137
42138 if (!register_operand (ops[1], cmode))
42139 ops[1] = force_reg (cmode, ops[1]);
42140 if (!register_operand (ops[0], cmode))
42141 ops[0] = force_reg (cmode, ops[0]);
42142 emit_insn (gen_rtx_SET (target, gen_rtx_VEC_CONCAT (mode, ops[0],
42143 ops[1])));
42144 break;
42145
42146 case 4:
42147 switch (mode)
42148 {
42149 case V4DImode:
42150 cmode = V2DImode;
42151 break;
42152 case V4DFmode:
42153 cmode = V2DFmode;
42154 break;
42155 case V4SImode:
42156 cmode = V2SImode;
42157 break;
42158 case V4SFmode:
42159 cmode = V2SFmode;
42160 break;
42161 default:
42162 gcc_unreachable ();
42163 }
42164 goto half;
42165
42166 case 8:
42167 switch (mode)
42168 {
42169 case V8DImode:
42170 cmode = V2DImode;
42171 hmode = V4DImode;
42172 break;
42173 case V8DFmode:
42174 cmode = V2DFmode;
42175 hmode = V4DFmode;
42176 break;
42177 case V8SImode:
42178 cmode = V2SImode;
42179 hmode = V4SImode;
42180 break;
42181 case V8SFmode:
42182 cmode = V2SFmode;
42183 hmode = V4SFmode;
42184 break;
42185 default:
42186 gcc_unreachable ();
42187 }
42188 goto half;
42189
42190 case 16:
42191 switch (mode)
42192 {
42193 case V16SImode:
42194 cmode = V2SImode;
42195 hmode = V4SImode;
42196 gmode = V8SImode;
42197 break;
42198 case V16SFmode:
42199 cmode = V2SFmode;
42200 hmode = V4SFmode;
42201 gmode = V8SFmode;
42202 break;
42203 default:
42204 gcc_unreachable ();
42205 }
42206 goto half;
42207
42208 half:
42209 /* FIXME: We process inputs backward to help RA. PR 36222. */
42210 i = n - 1;
42211 j = (n >> 1) - 1;
42212 for (; i > 0; i -= 2, j--)
42213 {
42214 first[j] = gen_reg_rtx (cmode);
42215 v = gen_rtvec (2, ops[i - 1], ops[i]);
42216 ix86_expand_vector_init (false, first[j],
42217 gen_rtx_PARALLEL (cmode, v));
42218 }
42219
42220 n >>= 1;
42221 if (n > 4)
42222 {
42223 gcc_assert (hmode != VOIDmode);
42224 gcc_assert (gmode != VOIDmode);
42225 for (i = j = 0; i < n; i += 2, j++)
42226 {
42227 second[j] = gen_reg_rtx (hmode);
42228 ix86_expand_vector_init_concat (hmode, second [j],
42229 &first [i], 2);
42230 }
42231 n >>= 1;
42232 for (i = j = 0; i < n; i += 2, j++)
42233 {
42234 third[j] = gen_reg_rtx (gmode);
42235 ix86_expand_vector_init_concat (gmode, third[j],
42236 &second[i], 2);
42237 }
42238 n >>= 1;
42239 ix86_expand_vector_init_concat (mode, target, third, n);
42240 }
42241 else if (n > 2)
42242 {
42243 gcc_assert (hmode != VOIDmode);
42244 for (i = j = 0; i < n; i += 2, j++)
42245 {
42246 second[j] = gen_reg_rtx (hmode);
42247 ix86_expand_vector_init_concat (hmode, second [j],
42248 &first [i], 2);
42249 }
42250 n >>= 1;
42251 ix86_expand_vector_init_concat (mode, target, second, n);
42252 }
42253 else
42254 ix86_expand_vector_init_concat (mode, target, first, n);
42255 break;
42256
42257 default:
42258 gcc_unreachable ();
42259 }
42260 }
42261
42262 /* A subroutine of ix86_expand_vector_init_general. Use vector
42263 interleave to handle the most general case: all values variable,
42264 and none identical. */
42265
42266 static void
42267 ix86_expand_vector_init_interleave (machine_mode mode,
42268 rtx target, rtx *ops, int n)
42269 {
42270 machine_mode first_imode, second_imode, third_imode, inner_mode;
42271 int i, j;
42272 rtx op0, op1;
42273 rtx (*gen_load_even) (rtx, rtx, rtx);
42274 rtx (*gen_interleave_first_low) (rtx, rtx, rtx);
42275 rtx (*gen_interleave_second_low) (rtx, rtx, rtx);
42276
42277 switch (mode)
42278 {
42279 case V8HImode:
42280 gen_load_even = gen_vec_setv8hi;
42281 gen_interleave_first_low = gen_vec_interleave_lowv4si;
42282 gen_interleave_second_low = gen_vec_interleave_lowv2di;
42283 inner_mode = HImode;
42284 first_imode = V4SImode;
42285 second_imode = V2DImode;
42286 third_imode = VOIDmode;
42287 break;
42288 case V16QImode:
42289 gen_load_even = gen_vec_setv16qi;
42290 gen_interleave_first_low = gen_vec_interleave_lowv8hi;
42291 gen_interleave_second_low = gen_vec_interleave_lowv4si;
42292 inner_mode = QImode;
42293 first_imode = V8HImode;
42294 second_imode = V4SImode;
42295 third_imode = V2DImode;
42296 break;
42297 default:
42298 gcc_unreachable ();
42299 }
42300
42301 for (i = 0; i < n; i++)
42302 {
42303 /* Extend the odd element to SImode using a paradoxical SUBREG. */
42304 op0 = gen_reg_rtx (SImode);
42305 emit_move_insn (op0, gen_lowpart (SImode, ops [i + i]));
42306
42307 /* Insert the SImode value as low element of V4SImode vector. */
42308 op1 = gen_reg_rtx (V4SImode);
42309 op0 = gen_rtx_VEC_MERGE (V4SImode,
42310 gen_rtx_VEC_DUPLICATE (V4SImode,
42311 op0),
42312 CONST0_RTX (V4SImode),
42313 const1_rtx);
42314 emit_insn (gen_rtx_SET (op1, op0));
42315
42316 /* Cast the V4SImode vector back to a vector in the original mode. */
42317 op0 = gen_reg_rtx (mode);
42318 emit_move_insn (op0, gen_lowpart (mode, op1));
42319
42320 /* Load even elements into the second position. */
42321 emit_insn (gen_load_even (op0,
42322 force_reg (inner_mode,
42323 ops [i + i + 1]),
42324 const1_rtx));
42325
42326 /* Cast vector to FIRST_IMODE vector. */
42327 ops[i] = gen_reg_rtx (first_imode);
42328 emit_move_insn (ops[i], gen_lowpart (first_imode, op0));
42329 }
42330
42331 /* Interleave low FIRST_IMODE vectors. */
42332 for (i = j = 0; i < n; i += 2, j++)
42333 {
42334 op0 = gen_reg_rtx (first_imode);
42335 emit_insn (gen_interleave_first_low (op0, ops[i], ops[i + 1]));
42336
42337 /* Cast FIRST_IMODE vector to SECOND_IMODE vector. */
42338 ops[j] = gen_reg_rtx (second_imode);
42339 emit_move_insn (ops[j], gen_lowpart (second_imode, op0));
42340 }
42341
42342 /* Interleave low SECOND_IMODE vectors. */
42343 switch (second_imode)
42344 {
42345 case V4SImode:
42346 for (i = j = 0; i < n / 2; i += 2, j++)
42347 {
42348 op0 = gen_reg_rtx (second_imode);
42349 emit_insn (gen_interleave_second_low (op0, ops[i],
42350 ops[i + 1]));
42351
42352 /* Cast the SECOND_IMODE vector to the THIRD_IMODE
42353 vector. */
42354 ops[j] = gen_reg_rtx (third_imode);
42355 emit_move_insn (ops[j], gen_lowpart (third_imode, op0));
42356 }
42357 second_imode = V2DImode;
42358 gen_interleave_second_low = gen_vec_interleave_lowv2di;
42359 /* FALLTHRU */
42360
42361 case V2DImode:
42362 op0 = gen_reg_rtx (second_imode);
42363 emit_insn (gen_interleave_second_low (op0, ops[0],
42364 ops[1]));
42365
42366 /* Cast the SECOND_IMODE vector back to a vector in the original
42367 mode. */
42368 emit_insn (gen_rtx_SET (target, gen_lowpart (mode, op0)));
42369 break;
42370
42371 default:
42372 gcc_unreachable ();
42373 }
42374 }
42375
42376 /* A subroutine of ix86_expand_vector_init. Handle the most general case:
42377 all values variable, and none identical. */
42378
42379 static void
42380 ix86_expand_vector_init_general (bool mmx_ok, machine_mode mode,
42381 rtx target, rtx vals)
42382 {
42383 rtx ops[64], op0, op1, op2, op3, op4, op5;
42384 machine_mode half_mode = VOIDmode;
42385 machine_mode quarter_mode = VOIDmode;
42386 int n, i;
42387
42388 switch (mode)
42389 {
42390 case V2SFmode:
42391 case V2SImode:
42392 if (!mmx_ok && !TARGET_SSE)
42393 break;
42394 /* FALLTHRU */
42395
42396 case V16SImode:
42397 case V16SFmode:
42398 case V8DFmode:
42399 case V8DImode:
42400 case V8SFmode:
42401 case V8SImode:
42402 case V4DFmode:
42403 case V4DImode:
42404 case V4SFmode:
42405 case V4SImode:
42406 case V2DFmode:
42407 case V2DImode:
42408 n = GET_MODE_NUNITS (mode);
42409 for (i = 0; i < n; i++)
42410 ops[i] = XVECEXP (vals, 0, i);
42411 ix86_expand_vector_init_concat (mode, target, ops, n);
42412 return;
42413
42414 case V32QImode:
42415 half_mode = V16QImode;
42416 goto half;
42417
42418 case V16HImode:
42419 half_mode = V8HImode;
42420 goto half;
42421
42422 half:
42423 n = GET_MODE_NUNITS (mode);
42424 for (i = 0; i < n; i++)
42425 ops[i] = XVECEXP (vals, 0, i);
42426 op0 = gen_reg_rtx (half_mode);
42427 op1 = gen_reg_rtx (half_mode);
42428 ix86_expand_vector_init_interleave (half_mode, op0, ops,
42429 n >> 2);
42430 ix86_expand_vector_init_interleave (half_mode, op1,
42431 &ops [n >> 1], n >> 2);
42432 emit_insn (gen_rtx_SET (target, gen_rtx_VEC_CONCAT (mode, op0, op1)));
42433 return;
42434
42435 case V64QImode:
42436 quarter_mode = V16QImode;
42437 half_mode = V32QImode;
42438 goto quarter;
42439
42440 case V32HImode:
42441 quarter_mode = V8HImode;
42442 half_mode = V16HImode;
42443 goto quarter;
42444
42445 quarter:
42446 n = GET_MODE_NUNITS (mode);
42447 for (i = 0; i < n; i++)
42448 ops[i] = XVECEXP (vals, 0, i);
42449 op0 = gen_reg_rtx (quarter_mode);
42450 op1 = gen_reg_rtx (quarter_mode);
42451 op2 = gen_reg_rtx (quarter_mode);
42452 op3 = gen_reg_rtx (quarter_mode);
42453 op4 = gen_reg_rtx (half_mode);
42454 op5 = gen_reg_rtx (half_mode);
42455 ix86_expand_vector_init_interleave (quarter_mode, op0, ops,
42456 n >> 3);
42457 ix86_expand_vector_init_interleave (quarter_mode, op1,
42458 &ops [n >> 2], n >> 3);
42459 ix86_expand_vector_init_interleave (quarter_mode, op2,
42460 &ops [n >> 1], n >> 3);
42461 ix86_expand_vector_init_interleave (quarter_mode, op3,
42462 &ops [(n >> 1) | (n >> 2)], n >> 3);
42463 emit_insn (gen_rtx_SET (op4, gen_rtx_VEC_CONCAT (half_mode, op0, op1)));
42464 emit_insn (gen_rtx_SET (op5, gen_rtx_VEC_CONCAT (half_mode, op2, op3)));
42465 emit_insn (gen_rtx_SET (target, gen_rtx_VEC_CONCAT (mode, op4, op5)));
42466 return;
42467
42468 case V16QImode:
42469 if (!TARGET_SSE4_1)
42470 break;
42471 /* FALLTHRU */
42472
42473 case V8HImode:
42474 if (!TARGET_SSE2)
42475 break;
42476
42477 /* Don't use ix86_expand_vector_init_interleave if we can't
42478 move from GPR to SSE register directly. */
42479 if (!TARGET_INTER_UNIT_MOVES_TO_VEC)
42480 break;
42481
42482 n = GET_MODE_NUNITS (mode);
42483 for (i = 0; i < n; i++)
42484 ops[i] = XVECEXP (vals, 0, i);
42485 ix86_expand_vector_init_interleave (mode, target, ops, n >> 1);
42486 return;
42487
42488 case V4HImode:
42489 case V8QImode:
42490 break;
42491
42492 default:
42493 gcc_unreachable ();
42494 }
42495
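/* Fallback: build each word-sized chunk of the vector in an integer
   register by shifting and IORing the elements (illustratively, for
   V4HImode on a 32-bit target the first word becomes (elt1 << 16) | elt0),
   then assemble the words into the vector.  */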
42496 {
42497 int i, j, n_elts, n_words, n_elt_per_word;
42498 machine_mode inner_mode;
42499 rtx words[4], shift;
42500
42501 inner_mode = GET_MODE_INNER (mode);
42502 n_elts = GET_MODE_NUNITS (mode);
42503 n_words = GET_MODE_SIZE (mode) / UNITS_PER_WORD;
42504 n_elt_per_word = n_elts / n_words;
42505 shift = GEN_INT (GET_MODE_BITSIZE (inner_mode));
42506
42507 for (i = 0; i < n_words; ++i)
42508 {
42509 rtx word = NULL_RTX;
42510
42511 for (j = 0; j < n_elt_per_word; ++j)
42512 {
42513 rtx elt = XVECEXP (vals, 0, (i+1)*n_elt_per_word - j - 1);
42514 elt = convert_modes (word_mode, inner_mode, elt, true);
42515
42516 if (j == 0)
42517 word = elt;
42518 else
42519 {
42520 word = expand_simple_binop (word_mode, ASHIFT, word, shift,
42521 word, 1, OPTAB_LIB_WIDEN);
42522 word = expand_simple_binop (word_mode, IOR, word, elt,
42523 word, 1, OPTAB_LIB_WIDEN);
42524 }
42525 }
42526
42527 words[i] = word;
42528 }
42529
42530 if (n_words == 1)
42531 emit_move_insn (target, gen_lowpart (mode, words[0]));
42532 else if (n_words == 2)
42533 {
42534 rtx tmp = gen_reg_rtx (mode);
42535 emit_clobber (tmp);
42536 emit_move_insn (gen_lowpart (word_mode, tmp), words[0]);
42537 emit_move_insn (gen_highpart (word_mode, tmp), words[1]);
42538 emit_move_insn (target, tmp);
42539 }
42540 else if (n_words == 4)
42541 {
42542 rtx tmp = gen_reg_rtx (V4SImode);
42543 gcc_assert (word_mode == SImode);
42544 vals = gen_rtx_PARALLEL (V4SImode, gen_rtvec_v (4, words));
42545 ix86_expand_vector_init_general (false, V4SImode, tmp, vals);
42546 emit_move_insn (target, gen_lowpart (mode, tmp));
42547 }
42548 else
42549 gcc_unreachable ();
42550 }
42551 }
42552
42553 /* Initialize vector TARGET via VALS. Suppress the use of MMX
42554 instructions unless MMX_OK is true. */
42555
42556 void
42557 ix86_expand_vector_init (bool mmx_ok, rtx target, rtx vals)
42558 {
42559 machine_mode mode = GET_MODE (target);
42560 machine_mode inner_mode = GET_MODE_INNER (mode);
42561 int n_elts = GET_MODE_NUNITS (mode);
42562 int n_var = 0, one_var = -1;
42563 bool all_same = true, all_const_zero = true;
42564 int i;
42565 rtx x;
42566
42567 for (i = 0; i < n_elts; ++i)
42568 {
42569 x = XVECEXP (vals, 0, i);
42570 if (!(CONST_SCALAR_INT_P (x)
42571 || CONST_DOUBLE_P (x)
42572 || CONST_FIXED_P (x)))
42573 n_var++, one_var = i;
42574 else if (x != CONST0_RTX (inner_mode))
42575 all_const_zero = false;
42576 if (i > 0 && !rtx_equal_p (x, XVECEXP (vals, 0, 0)))
42577 all_same = false;
42578 }
42579
42580 /* Constants are best loaded from the constant pool. */
42581 if (n_var == 0)
42582 {
42583 emit_move_insn (target, gen_rtx_CONST_VECTOR (mode, XVEC (vals, 0)));
42584 return;
42585 }
42586
42587 /* If all values are identical, broadcast the value. */
42588 if (all_same
42589 && ix86_expand_vector_init_duplicate (mmx_ok, mode, target,
42590 XVECEXP (vals, 0, 0)))
42591 return;
42592
42593 /* Values where only one field is non-constant are best loaded from
42594 the pool and overwritten via a move later. */
42595 if (n_var == 1)
42596 {
42597 if (all_const_zero
42598 && ix86_expand_vector_init_one_nonzero (mmx_ok, mode, target,
42599 XVECEXP (vals, 0, one_var),
42600 one_var))
42601 return;
42602
42603 if (ix86_expand_vector_init_one_var (mmx_ok, mode, target, vals, one_var))
42604 return;
42605 }
42606
42607 ix86_expand_vector_init_general (mmx_ok, mode, target, vals);
42608 }
42609
42610 void
42611 ix86_expand_vector_set (bool mmx_ok, rtx target, rtx val, int elt)
42612 {
42613 machine_mode mode = GET_MODE (target);
42614 machine_mode inner_mode = GET_MODE_INNER (mode);
42615 machine_mode half_mode;
42616 bool use_vec_merge = false;
42617 rtx tmp;
42618 static rtx (*gen_extract[6][2]) (rtx, rtx)
42619 = {
42620 { gen_vec_extract_lo_v32qi, gen_vec_extract_hi_v32qi },
42621 { gen_vec_extract_lo_v16hi, gen_vec_extract_hi_v16hi },
42622 { gen_vec_extract_lo_v8si, gen_vec_extract_hi_v8si },
42623 { gen_vec_extract_lo_v4di, gen_vec_extract_hi_v4di },
42624 { gen_vec_extract_lo_v8sf, gen_vec_extract_hi_v8sf },
42625 { gen_vec_extract_lo_v4df, gen_vec_extract_hi_v4df }
42626 };
42627 static rtx (*gen_insert[6][2]) (rtx, rtx, rtx)
42628 = {
42629 { gen_vec_set_lo_v32qi, gen_vec_set_hi_v32qi },
42630 { gen_vec_set_lo_v16hi, gen_vec_set_hi_v16hi },
42631 { gen_vec_set_lo_v8si, gen_vec_set_hi_v8si },
42632 { gen_vec_set_lo_v4di, gen_vec_set_hi_v4di },
42633 { gen_vec_set_lo_v8sf, gen_vec_set_hi_v8sf },
42634 { gen_vec_set_lo_v4df, gen_vec_set_hi_v4df }
42635 };
42636 int i, j, n;
42637 machine_mode mmode = VOIDmode;
42638 rtx (*gen_blendm) (rtx, rtx, rtx, rtx);
42639
42640 switch (mode)
42641 {
42642 case V2SFmode:
42643 case V2SImode:
42644 if (mmx_ok)
42645 {
42646 tmp = gen_reg_rtx (GET_MODE_INNER (mode));
42647 ix86_expand_vector_extract (true, tmp, target, 1 - elt);
42648 if (elt == 0)
42649 tmp = gen_rtx_VEC_CONCAT (mode, val, tmp);
42650 else
42651 tmp = gen_rtx_VEC_CONCAT (mode, tmp, val);
42652 emit_insn (gen_rtx_SET (target, tmp));
42653 return;
42654 }
42655 break;
42656
42657 case V2DImode:
42658 use_vec_merge = TARGET_SSE4_1 && TARGET_64BIT;
42659 if (use_vec_merge)
42660 break;
42661
42662 tmp = gen_reg_rtx (GET_MODE_INNER (mode));
42663 ix86_expand_vector_extract (false, tmp, target, 1 - elt);
42664 if (elt == 0)
42665 tmp = gen_rtx_VEC_CONCAT (mode, val, tmp);
42666 else
42667 tmp = gen_rtx_VEC_CONCAT (mode, tmp, val);
42668 emit_insn (gen_rtx_SET (target, tmp));
42669 return;
42670
42671 case V2DFmode:
42672 {
42673 rtx op0, op1;
42674
42675 /* For the two element vectors, we implement a VEC_CONCAT with
42676 the extraction of the other element. */
42677
42678 tmp = gen_rtx_PARALLEL (VOIDmode, gen_rtvec (1, GEN_INT (1 - elt)));
42679 tmp = gen_rtx_VEC_SELECT (inner_mode, target, tmp);
42680
42681 if (elt == 0)
42682 op0 = val, op1 = tmp;
42683 else
42684 op0 = tmp, op1 = val;
42685
42686 tmp = gen_rtx_VEC_CONCAT (mode, op0, op1);
42687 emit_insn (gen_rtx_SET (target, tmp));
42688 }
42689 return;
42690
42691 case V4SFmode:
42692 use_vec_merge = TARGET_SSE4_1;
42693 if (use_vec_merge)
42694 break;
42695
42696 switch (elt)
42697 {
42698 case 0:
42699 use_vec_merge = true;
42700 break;
42701
42702 case 1:
42703 /* tmp = target = A B C D */
42704 tmp = copy_to_reg (target);
42705 /* target = A A B B */
42706 emit_insn (gen_vec_interleave_lowv4sf (target, target, target));
42707 /* target = X A B B */
42708 ix86_expand_vector_set (false, target, val, 0);
42709 /* target = A X C D */
42710 emit_insn (gen_sse_shufps_v4sf (target, target, tmp,
42711 const1_rtx, const0_rtx,
42712 GEN_INT (2+4), GEN_INT (3+4)));
42713 return;
42714
42715 case 2:
42716 /* tmp = target = A B C D */
42717 tmp = copy_to_reg (target);
42718 /* tmp = X B C D */
42719 ix86_expand_vector_set (false, tmp, val, 0);
42720 /* target = A B X D */
42721 emit_insn (gen_sse_shufps_v4sf (target, target, tmp,
42722 const0_rtx, const1_rtx,
42723 GEN_INT (0+4), GEN_INT (3+4)));
42724 return;
42725
42726 case 3:
42727 /* tmp = target = A B C D */
42728 tmp = copy_to_reg (target);
42729 /* tmp = X B C D */
42730 ix86_expand_vector_set (false, tmp, val, 0);
42731 /* target = A B C X */
42732 emit_insn (gen_sse_shufps_v4sf (target, target, tmp,
42733 const0_rtx, const1_rtx,
42734 GEN_INT (2+4), GEN_INT (0+4)));
42735 return;
42736
42737 default:
42738 gcc_unreachable ();
42739 }
42740 break;
42741
42742 case V4SImode:
42743 use_vec_merge = TARGET_SSE4_1;
42744 if (use_vec_merge)
42745 break;
42746
42747 /* Element 0 handled by vec_merge below. */
42748 if (elt == 0)
42749 {
42750 use_vec_merge = true;
42751 break;
42752 }
42753
42754 if (TARGET_SSE2)
42755 {
42756 /* With SSE2, use integer shuffles to swap element 0 and ELT,
42757 store into element 0, then shuffle them back. */
42758
42759 rtx order[4];
42760
42761 order[0] = GEN_INT (elt);
42762 order[1] = const1_rtx;
42763 order[2] = const2_rtx;
42764 order[3] = GEN_INT (3);
42765 order[elt] = const0_rtx;
42766
42767 emit_insn (gen_sse2_pshufd_1 (target, target, order[0],
42768 order[1], order[2], order[3]));
42769
42770 ix86_expand_vector_set (false, target, val, 0);
42771
42772 emit_insn (gen_sse2_pshufd_1 (target, target, order[0],
42773 order[1], order[2], order[3]));
42774 }
42775 else
42776 {
42777 /* For SSE1, we have to reuse the V4SF code. */
42778 rtx t = gen_reg_rtx (V4SFmode);
42779 emit_move_insn (t, gen_lowpart (V4SFmode, target));
42780 ix86_expand_vector_set (false, t, gen_lowpart (SFmode, val), elt);
42781 emit_move_insn (target, gen_lowpart (mode, t));
42782 }
42783 return;
42784
42785 case V8HImode:
42786 use_vec_merge = TARGET_SSE2;
42787 break;
42788 case V4HImode:
42789 use_vec_merge = mmx_ok && (TARGET_SSE || TARGET_3DNOW_A);
42790 break;
42791
42792 case V16QImode:
42793 use_vec_merge = TARGET_SSE4_1;
42794 break;
42795
42796 case V8QImode:
42797 break;
42798
42799 case V32QImode:
42800 half_mode = V16QImode;
42801 j = 0;
42802 n = 16;
42803 goto half;
42804
42805 case V16HImode:
42806 half_mode = V8HImode;
42807 j = 1;
42808 n = 8;
42809 goto half;
42810
42811 case V8SImode:
42812 half_mode = V4SImode;
42813 j = 2;
42814 n = 4;
42815 goto half;
42816
42817 case V4DImode:
42818 half_mode = V2DImode;
42819 j = 3;
42820 n = 2;
42821 goto half;
42822
42823 case V8SFmode:
42824 half_mode = V4SFmode;
42825 j = 4;
42826 n = 4;
42827 goto half;
42828
42829 case V4DFmode:
42830 half_mode = V2DFmode;
42831 j = 5;
42832 n = 2;
42833 goto half;
42834
42835 half:
42836 /* Compute offset. */
42837 i = elt / n;
42838 elt %= n;
42839
42840 gcc_assert (i <= 1);
42841
42842 /* Extract the half. */
42843 tmp = gen_reg_rtx (half_mode);
42844 emit_insn (gen_extract[j][i] (tmp, target));
42845
42846 /* Put val in tmp at elt. */
42847 ix86_expand_vector_set (false, tmp, val, elt);
42848
42849 /* Put it back. */
42850 emit_insn (gen_insert[j][i] (target, target, tmp));
42851 return;
42852
42853 case V8DFmode:
42854 if (TARGET_AVX512F)
42855 {
42856 mmode = QImode;
42857 gen_blendm = gen_avx512f_blendmv8df;
42858 }
42859 break;
42860
42861 case V8DImode:
42862 if (TARGET_AVX512F)
42863 {
42864 mmode = QImode;
42865 gen_blendm = gen_avx512f_blendmv8di;
42866 }
42867 break;
42868
42869 case V16SFmode:
42870 if (TARGET_AVX512F)
42871 {
42872 mmode = HImode;
42873 gen_blendm = gen_avx512f_blendmv16sf;
42874 }
42875 break;
42876
42877 case V16SImode:
42878 if (TARGET_AVX512F)
42879 {
42880 mmode = HImode;
42881 gen_blendm = gen_avx512f_blendmv16si;
42882 }
42883 break;
42884
42885 case V32HImode:
42886 if (TARGET_AVX512F && TARGET_AVX512BW)
42887 {
42888 mmode = SImode;
42889 gen_blendm = gen_avx512bw_blendmv32hi;
42890 }
42891 break;
42892
42893 case V64QImode:
42894 if (TARGET_AVX512F && TARGET_AVX512BW)
42895 {
42896 mmode = DImode;
42897 gen_blendm = gen_avx512bw_blendmv64qi;
42898 }
42899 break;
42900
42901 default:
42902 break;
42903 }
42904
42905 if (mmode != VOIDmode)
42906 {
42907 tmp = gen_reg_rtx (mode);
42908 emit_insn (gen_rtx_SET (tmp, gen_rtx_VEC_DUPLICATE (mode, val)));
42909 /* The avx512*_blendm<mode> expanders have a different operand order
42910 from VEC_MERGE. In VEC_MERGE, the first input operand is used for
42911 elements where the mask is set and the second input operand otherwise;
42912 in {sse,avx}*_*blend* the first input operand is used for elements
42913 where the mask is clear and the second input operand otherwise. */
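/* For instance (illustrative), inserting into element 2 of a V8DFmode
   vector uses the QImode mask 1 << 2 == 0x04, so only element 2 is taken
   from the broadcast of VAL while the remaining elements are kept from
   TARGET.  */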
42914 emit_insn (gen_blendm (target, target, tmp,
42915 force_reg (mmode,
42916 gen_int_mode (1 << elt, mmode))));
42917 }
42918 else if (use_vec_merge)
42919 {
42920 tmp = gen_rtx_VEC_DUPLICATE (mode, val);
42921 tmp = gen_rtx_VEC_MERGE (mode, tmp, target, GEN_INT (1 << elt));
42922 emit_insn (gen_rtx_SET (target, tmp));
42923 }
42924 else
42925 {
42926 rtx mem = assign_stack_temp (mode, GET_MODE_SIZE (mode));
42927
42928 emit_move_insn (mem, target);
42929
42930 tmp = adjust_address (mem, inner_mode, elt*GET_MODE_SIZE (inner_mode));
42931 emit_move_insn (tmp, val);
42932
42933 emit_move_insn (target, mem);
42934 }
42935 }
42936
42937 void
42938 ix86_expand_vector_extract (bool mmx_ok, rtx target, rtx vec, int elt)
42939 {
42940 machine_mode mode = GET_MODE (vec);
42941 machine_mode inner_mode = GET_MODE_INNER (mode);
42942 bool use_vec_extr = false;
42943 rtx tmp;
42944
42945 switch (mode)
42946 {
42947 case V2SImode:
42948 case V2SFmode:
42949 if (!mmx_ok)
42950 break;
42951 /* FALLTHRU */
42952
42953 case V2DFmode:
42954 case V2DImode:
42955 use_vec_extr = true;
42956 break;
42957
42958 case V4SFmode:
42959 use_vec_extr = TARGET_SSE4_1;
42960 if (use_vec_extr)
42961 break;
42962
42963 switch (elt)
42964 {
42965 case 0:
42966 tmp = vec;
42967 break;
42968
42969 case 1:
42970 case 3:
42971 tmp = gen_reg_rtx (mode);
42972 emit_insn (gen_sse_shufps_v4sf (tmp, vec, vec,
42973 GEN_INT (elt), GEN_INT (elt),
42974 GEN_INT (elt+4), GEN_INT (elt+4)));
42975 break;
42976
42977 case 2:
42978 tmp = gen_reg_rtx (mode);
42979 emit_insn (gen_vec_interleave_highv4sf (tmp, vec, vec));
42980 break;
42981
42982 default:
42983 gcc_unreachable ();
42984 }
42985 vec = tmp;
42986 use_vec_extr = true;
42987 elt = 0;
42988 break;
42989
42990 case V4SImode:
42991 use_vec_extr = TARGET_SSE4_1;
42992 if (use_vec_extr)
42993 break;
42994
42995 if (TARGET_SSE2)
42996 {
42997 switch (elt)
42998 {
42999 case 0:
43000 tmp = vec;
43001 break;
43002
43003 case 1:
43004 case 3:
43005 tmp = gen_reg_rtx (mode);
43006 emit_insn (gen_sse2_pshufd_1 (tmp, vec,
43007 GEN_INT (elt), GEN_INT (elt),
43008 GEN_INT (elt), GEN_INT (elt)));
43009 break;
43010
43011 case 2:
43012 tmp = gen_reg_rtx (mode);
43013 emit_insn (gen_vec_interleave_highv4si (tmp, vec, vec));
43014 break;
43015
43016 default:
43017 gcc_unreachable ();
43018 }
43019 vec = tmp;
43020 use_vec_extr = true;
43021 elt = 0;
43022 }
43023 else
43024 {
43025 /* For SSE1, we have to reuse the V4SF code. */
43026 ix86_expand_vector_extract (false, gen_lowpart (SFmode, target),
43027 gen_lowpart (V4SFmode, vec), elt);
43028 return;
43029 }
43030 break;
43031
43032 case V8HImode:
43033 use_vec_extr = TARGET_SSE2;
43034 break;
43035 case V4HImode:
43036 use_vec_extr = mmx_ok && (TARGET_SSE || TARGET_3DNOW_A);
43037 break;
43038
43039 case V16QImode:
43040 use_vec_extr = TARGET_SSE4_1;
43041 break;
43042
43043 case V8SFmode:
43044 if (TARGET_AVX)
43045 {
43046 tmp = gen_reg_rtx (V4SFmode);
43047 if (elt < 4)
43048 emit_insn (gen_vec_extract_lo_v8sf (tmp, vec));
43049 else
43050 emit_insn (gen_vec_extract_hi_v8sf (tmp, vec));
43051 ix86_expand_vector_extract (false, target, tmp, elt & 3);
43052 return;
43053 }
43054 break;
43055
43056 case V4DFmode:
43057 if (TARGET_AVX)
43058 {
43059 tmp = gen_reg_rtx (V2DFmode);
43060 if (elt < 2)
43061 emit_insn (gen_vec_extract_lo_v4df (tmp, vec));
43062 else
43063 emit_insn (gen_vec_extract_hi_v4df (tmp, vec));
43064 ix86_expand_vector_extract (false, target, tmp, elt & 1);
43065 return;
43066 }
43067 break;
43068
43069 case V32QImode:
43070 if (TARGET_AVX)
43071 {
43072 tmp = gen_reg_rtx (V16QImode);
43073 if (elt < 16)
43074 emit_insn (gen_vec_extract_lo_v32qi (tmp, vec));
43075 else
43076 emit_insn (gen_vec_extract_hi_v32qi (tmp, vec));
43077 ix86_expand_vector_extract (false, target, tmp, elt & 15);
43078 return;
43079 }
43080 break;
43081
43082 case V16HImode:
43083 if (TARGET_AVX)
43084 {
43085 tmp = gen_reg_rtx (V8HImode);
43086 if (elt < 8)
43087 emit_insn (gen_vec_extract_lo_v16hi (tmp, vec));
43088 else
43089 emit_insn (gen_vec_extract_hi_v16hi (tmp, vec));
43090 ix86_expand_vector_extract (false, target, tmp, elt & 7);
43091 return;
43092 }
43093 break;
43094
43095 case V8SImode:
43096 if (TARGET_AVX)
43097 {
43098 tmp = gen_reg_rtx (V4SImode);
43099 if (elt < 4)
43100 emit_insn (gen_vec_extract_lo_v8si (tmp, vec));
43101 else
43102 emit_insn (gen_vec_extract_hi_v8si (tmp, vec));
43103 ix86_expand_vector_extract (false, target, tmp, elt & 3);
43104 return;
43105 }
43106 break;
43107
43108 case V4DImode:
43109 if (TARGET_AVX)
43110 {
43111 tmp = gen_reg_rtx (V2DImode);
43112 if (elt < 2)
43113 emit_insn (gen_vec_extract_lo_v4di (tmp, vec));
43114 else
43115 emit_insn (gen_vec_extract_hi_v4di (tmp, vec));
43116 ix86_expand_vector_extract (false, target, tmp, elt & 1);
43117 return;
43118 }
43119 break;
43120
43121 case V32HImode:
43122 if (TARGET_AVX512BW)
43123 {
43124 tmp = gen_reg_rtx (V16HImode);
43125 if (elt < 16)
43126 emit_insn (gen_vec_extract_lo_v32hi (tmp, vec));
43127 else
43128 emit_insn (gen_vec_extract_hi_v32hi (tmp, vec));
43129 ix86_expand_vector_extract (false, target, tmp, elt & 15);
43130 return;
43131 }
43132 break;
43133
43134 case V64QImode:
43135 if (TARGET_AVX512BW)
43136 {
43137 tmp = gen_reg_rtx (V32QImode);
43138 if (elt < 32)
43139 emit_insn (gen_vec_extract_lo_v64qi (tmp, vec));
43140 else
43141 emit_insn (gen_vec_extract_hi_v64qi (tmp, vec));
43142 ix86_expand_vector_extract (false, target, tmp, elt & 31);
43143 return;
43144 }
43145 break;
43146
43147 case V16SFmode:
43148 tmp = gen_reg_rtx (V8SFmode);
43149 if (elt < 8)
43150 emit_insn (gen_vec_extract_lo_v16sf (tmp, vec));
43151 else
43152 emit_insn (gen_vec_extract_hi_v16sf (tmp, vec));
43153 ix86_expand_vector_extract (false, target, tmp, elt & 7);
43154 return;
43155
43156 case V8DFmode:
43157 tmp = gen_reg_rtx (V4DFmode);
43158 if (elt < 4)
43159 emit_insn (gen_vec_extract_lo_v8df (tmp, vec));
43160 else
43161 emit_insn (gen_vec_extract_hi_v8df (tmp, vec));
43162 ix86_expand_vector_extract (false, target, tmp, elt & 3);
43163 return;
43164
43165 case V16SImode:
43166 tmp = gen_reg_rtx (V8SImode);
43167 if (elt < 8)
43168 emit_insn (gen_vec_extract_lo_v16si (tmp, vec));
43169 else
43170 emit_insn (gen_vec_extract_hi_v16si (tmp, vec));
43171 ix86_expand_vector_extract (false, target, tmp, elt & 7);
43172 return;
43173
43174 case V8DImode:
43175 tmp = gen_reg_rtx (V4DImode);
43176 if (elt < 4)
43177 emit_insn (gen_vec_extract_lo_v8di (tmp, vec));
43178 else
43179 emit_insn (gen_vec_extract_hi_v8di (tmp, vec));
43180 ix86_expand_vector_extract (false, target, tmp, elt & 3);
43181 return;
43182
43183 case V8QImode:
43184 /* ??? Could extract the appropriate HImode element and shift. */
43185 default:
43186 break;
43187 }
43188
43189 if (use_vec_extr)
43190 {
43191 tmp = gen_rtx_PARALLEL (VOIDmode, gen_rtvec (1, GEN_INT (elt)));
43192 tmp = gen_rtx_VEC_SELECT (inner_mode, vec, tmp);
43193
43194 /* Let the rtl optimizers know about the zero extension performed. */
43195 if (inner_mode == QImode || inner_mode == HImode)
43196 {
43197 tmp = gen_rtx_ZERO_EXTEND (SImode, tmp);
43198 target = gen_lowpart (SImode, target);
43199 }
43200
43201 emit_insn (gen_rtx_SET (target, tmp));
43202 }
43203 else
43204 {
43205 rtx mem = assign_stack_temp (mode, GET_MODE_SIZE (mode));
43206
43207 emit_move_insn (mem, vec);
43208
43209 tmp = adjust_address (mem, inner_mode, elt*GET_MODE_SIZE (inner_mode));
43210 emit_move_insn (target, tmp);
43211 }
43212 }
43213
43214 /* Generate code to copy vector bits i / 2 ... i - 1 from vector SRC
43215 to bits 0 ... i / 2 - 1 of vector DEST, which has the same mode.
43216 The upper bits of DEST are undefined, though they shouldn't cause
43217 exceptions (some bits from src or all zeros are ok). */
43218
43219 static void
43220 emit_reduc_half (rtx dest, rtx src, int i)
43221 {
43222 rtx tem, d = dest;
43223 switch (GET_MODE (src))
43224 {
43225 case V4SFmode:
43226 if (i == 128)
43227 tem = gen_sse_movhlps (dest, src, src);
43228 else
43229 tem = gen_sse_shufps_v4sf (dest, src, src, const1_rtx, const1_rtx,
43230 GEN_INT (1 + 4), GEN_INT (1 + 4));
43231 break;
43232 case V2DFmode:
43233 tem = gen_vec_interleave_highv2df (dest, src, src);
43234 break;
43235 case V16QImode:
43236 case V8HImode:
43237 case V4SImode:
43238 case V2DImode:
43239 d = gen_reg_rtx (V1TImode);
43240 tem = gen_sse2_lshrv1ti3 (d, gen_lowpart (V1TImode, src),
43241 GEN_INT (i / 2));
43242 break;
43243 case V8SFmode:
43244 if (i == 256)
43245 tem = gen_avx_vperm2f128v8sf3 (dest, src, src, const1_rtx);
43246 else
43247 tem = gen_avx_shufps256 (dest, src, src,
43248 GEN_INT (i == 128 ? 2 + (3 << 2) : 1));
43249 break;
43250 case V4DFmode:
43251 if (i == 256)
43252 tem = gen_avx_vperm2f128v4df3 (dest, src, src, const1_rtx);
43253 else
43254 tem = gen_avx_shufpd256 (dest, src, src, const1_rtx);
43255 break;
43256 case V32QImode:
43257 case V16HImode:
43258 case V8SImode:
43259 case V4DImode:
43260 if (i == 256)
43261 {
43262 if (GET_MODE (dest) != V4DImode)
43263 d = gen_reg_rtx (V4DImode);
43264 tem = gen_avx2_permv2ti (d, gen_lowpart (V4DImode, src),
43265 gen_lowpart (V4DImode, src),
43266 const1_rtx);
43267 }
43268 else
43269 {
43270 d = gen_reg_rtx (V2TImode);
43271 tem = gen_avx2_lshrv2ti3 (d, gen_lowpart (V2TImode, src),
43272 GEN_INT (i / 2));
43273 }
43274 break;
43275 case V64QImode:
43276 case V32HImode:
43277 case V16SImode:
43278 case V16SFmode:
43279 case V8DImode:
43280 case V8DFmode:
43281 if (i > 128)
43282 tem = gen_avx512f_shuf_i32x4_1 (gen_lowpart (V16SImode, dest),
43283 gen_lowpart (V16SImode, src),
43284 gen_lowpart (V16SImode, src),
43285 GEN_INT (0x4 + (i == 512 ? 4 : 0)),
43286 GEN_INT (0x5 + (i == 512 ? 4 : 0)),
43287 GEN_INT (0x6 + (i == 512 ? 4 : 0)),
43288 GEN_INT (0x7 + (i == 512 ? 4 : 0)),
43289 GEN_INT (0xC), GEN_INT (0xD),
43290 GEN_INT (0xE), GEN_INT (0xF),
43291 GEN_INT (0x10), GEN_INT (0x11),
43292 GEN_INT (0x12), GEN_INT (0x13),
43293 GEN_INT (0x14), GEN_INT (0x15),
43294 GEN_INT (0x16), GEN_INT (0x17));
43295 else
43296 tem = gen_avx512f_pshufd_1 (gen_lowpart (V16SImode, dest),
43297 gen_lowpart (V16SImode, src),
43298 GEN_INT (i == 128 ? 0x2 : 0x1),
43299 GEN_INT (0x3),
43300 GEN_INT (0x3),
43301 GEN_INT (0x3),
43302 GEN_INT (i == 128 ? 0x6 : 0x5),
43303 GEN_INT (0x7),
43304 GEN_INT (0x7),
43305 GEN_INT (0x7),
43306 GEN_INT (i == 128 ? 0xA : 0x9),
43307 GEN_INT (0xB),
43308 GEN_INT (0xB),
43309 GEN_INT (0xB),
43310 GEN_INT (i == 128 ? 0xE : 0xD),
43311 GEN_INT (0xF),
43312 GEN_INT (0xF),
43313 GEN_INT (0xF));
43314 break;
43315 default:
43316 gcc_unreachable ();
43317 }
43318 emit_insn (tem);
43319 if (d != dest)
43320 emit_move_insn (dest, gen_lowpart (GET_MODE (dest), d));
43321 }
43322
43323 /* Expand a vector reduction. FN is the binary pattern to reduce;
43324 DEST is the destination; IN is the input vector. */
43325
43326 void
43327 ix86_expand_reduc (rtx (*fn) (rtx, rtx, rtx), rtx dest, rtx in)
43328 {
43329 rtx half, dst, vec = in;
43330 machine_mode mode = GET_MODE (in);
43331 int i;
43332
43333 /* SSE4.1 has a special instruction (phminposuw) for V8HImode UMIN reduction. */
43334 if (TARGET_SSE4_1
43335 && mode == V8HImode
43336 && fn == gen_uminv8hi3)
43337 {
43338 emit_insn (gen_sse4_1_phminposuw (dest, in));
43339 return;
43340 }
43341
43342 for (i = GET_MODE_BITSIZE (mode);
43343 i > GET_MODE_UNIT_BITSIZE (mode);
43344 i >>= 1)
43345 {
43346 half = gen_reg_rtx (mode);
43347 emit_reduc_half (half, vec, i);
43348 if (i == GET_MODE_UNIT_BITSIZE (mode) * 2)
43349 dst = dest;
43350 else
43351 dst = gen_reg_rtx (mode);
43352 emit_insn (fn (dst, half, vec));
43353 vec = dst;
43354 }
43355 }
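
/* A hedged scalar model of the loop above (illustration only, N a power
   of two): each pass folds the upper half of the vector into the lower
   half with FN, so log2 (N) passes leave the result in element 0:

       for (width = N; width > 1; width >>= 1)
         for (j = 0; j < width / 2; j++)
           v[j] = fn (v[j], v[j + width / 2]);
       result = v[0];  */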
43356 \f
43357 /* Target hook for scalar_mode_supported_p. */
43358 static bool
43359 ix86_scalar_mode_supported_p (machine_mode mode)
43360 {
43361 if (DECIMAL_FLOAT_MODE_P (mode))
43362 return default_decimal_float_supported_p ();
43363 else if (mode == TFmode)
43364 return true;
43365 else
43366 return default_scalar_mode_supported_p (mode);
43367 }
43368
43369 /* Implements target hook vector_mode_supported_p. */
43370 static bool
43371 ix86_vector_mode_supported_p (machine_mode mode)
43372 {
43373 if (TARGET_SSE && VALID_SSE_REG_MODE (mode))
43374 return true;
43375 if (TARGET_SSE2 && VALID_SSE2_REG_MODE (mode))
43376 return true;
43377 if (TARGET_AVX && VALID_AVX256_REG_MODE (mode))
43378 return true;
43379 if (TARGET_AVX512F && VALID_AVX512F_REG_MODE (mode))
43380 return true;
43381 if (TARGET_MMX && VALID_MMX_REG_MODE (mode))
43382 return true;
43383 if (TARGET_3DNOW && VALID_MMX_REG_MODE_3DNOW (mode))
43384 return true;
43385 return false;
43386 }
43387
43388 /* Target hook for c_mode_for_suffix. */
43389 static machine_mode
43390 ix86_c_mode_for_suffix (char suffix)
43391 {
43392 if (suffix == 'q')
43393 return TFmode;
43394 if (suffix == 'w')
43395 return XFmode;
43396
43397 return VOIDmode;
43398 }
43399
43400 /* Worker function for TARGET_MD_ASM_ADJUST.
43401
43402 We implement asm flag outputs, and maintain source compatibility
43403 with the old cc0-based compiler. */
43404
43405 static rtx_insn *
43406 ix86_md_asm_adjust (vec<rtx> &outputs, vec<rtx> &/*inputs*/,
43407 vec<const char *> &constraints,
43408 vec<rtx> &clobbers, HARD_REG_SET &clobbered_regs)
43409 {
43410 clobbers.safe_push (gen_rtx_REG (CCFPmode, FPSR_REG));
43411 SET_HARD_REG_BIT (clobbered_regs, FPSR_REG);
43412
43413 bool saw_asm_flag = false;
43414
43415 start_sequence ();
43416 for (unsigned i = 0, n = outputs.length (); i < n; ++i)
43417 {
43418 const char *con = constraints[i];
43419 if (strncmp (con, "=@cc", 4) != 0)
43420 continue;
43421 con += 4;
43422 if (strchr (con, ',') != NULL)
43423 {
43424 error ("alternatives not allowed in asm flag output");
43425 continue;
43426 }
43427
43428 bool invert = false;
43429 if (con[0] == 'n')
43430 invert = true, con++;
43431
43432 machine_mode mode = CCmode;
43433 rtx_code code = UNKNOWN;
43434
43435 switch (con[0])
43436 {
43437 case 'a':
43438 if (con[1] == 0)
43439 mode = CCAmode, code = EQ;
43440 else if (con[1] == 'e' && con[2] == 0)
43441 mode = CCCmode, code = NE;
43442 break;
43443 case 'b':
43444 if (con[1] == 0)
43445 mode = CCCmode, code = EQ;
43446 else if (con[1] == 'e' && con[2] == 0)
43447 mode = CCAmode, code = NE;
43448 break;
43449 case 'c':
43450 if (con[1] == 0)
43451 mode = CCCmode, code = EQ;
43452 break;
43453 case 'e':
43454 if (con[1] == 0)
43455 mode = CCZmode, code = EQ;
43456 break;
43457 case 'g':
43458 if (con[1] == 0)
43459 mode = CCGCmode, code = GT;
43460 else if (con[1] == 'e' && con[2] == 0)
43461 mode = CCGCmode, code = GE;
43462 break;
43463 case 'l':
43464 if (con[1] == 0)
43465 mode = CCGCmode, code = LT;
43466 else if (con[1] == 'e' && con[2] == 0)
43467 mode = CCGCmode, code = LE;
43468 break;
43469 case 'o':
43470 if (con[1] == 0)
43471 mode = CCOmode, code = EQ;
43472 break;
43473 case 'p':
43474 if (con[1] == 0)
43475 mode = CCPmode, code = EQ;
43476 break;
43477 case 's':
43478 if (con[1] == 0)
43479 mode = CCSmode, code = EQ;
43480 break;
43481 case 'z':
43482 if (con[1] == 0)
43483 mode = CCZmode, code = EQ;
43484 break;
43485 }
43486 if (code == UNKNOWN)
43487 {
43488 error ("unknown asm flag output %qs", constraints[i]);
43489 continue;
43490 }
43491 if (invert)
43492 code = reverse_condition (code);
43493
43494 rtx dest = outputs[i];
43495 if (!saw_asm_flag)
43496 {
43497 /* This is the first asm flag output. Here we put the flags
43498 register in as the real output and adjust the condition to
43499 allow it. */
43500 constraints[i] = "=Bf";
43501 outputs[i] = gen_rtx_REG (CCmode, FLAGS_REG);
43502 saw_asm_flag = true;
43503 }
43504 else
43505 {
43506 /* We don't need the flags register as output twice. */
43507 constraints[i] = "=X";
43508 outputs[i] = gen_rtx_SCRATCH (SImode);
43509 }
43510
43511 rtx x = gen_rtx_REG (mode, FLAGS_REG);
43512 x = gen_rtx_fmt_ee (code, QImode, x, const0_rtx);
43513
43514 machine_mode dest_mode = GET_MODE (dest);
43515 if (!SCALAR_INT_MODE_P (dest_mode))
43516 {
43517 error ("invalid type for asm flag output");
43518 continue;
43519 }
43520
43521 if (dest_mode == DImode && !TARGET_64BIT)
43522 dest_mode = SImode;
43523
43524 if (dest_mode != QImode)
43525 {
43526 rtx destqi = gen_reg_rtx (QImode);
43527 emit_insn (gen_rtx_SET (destqi, x));
43528
43529 if (TARGET_ZERO_EXTEND_WITH_AND
43530 && optimize_function_for_speed_p (cfun))
43531 {
43532 x = force_reg (dest_mode, const0_rtx);
43533
43534 emit_insn (gen_movstrictqi
43535 (gen_lowpart (QImode, x), destqi));
43536 }
43537 else
43538 x = gen_rtx_ZERO_EXTEND (dest_mode, destqi);
43539 }
43540
43541 if (dest_mode != GET_MODE (dest))
43542 {
43543 rtx tmp = gen_reg_rtx (SImode);
43544
43545 emit_insn (gen_rtx_SET (tmp, x));
43546 emit_insn (gen_zero_extendsidi2 (dest, tmp));
43547 }
43548 else
43549 emit_insn (gen_rtx_SET (dest, x));
43550 }
43551 rtx_insn *seq = get_insns ();
43552 end_sequence ();
43553
43554 if (saw_asm_flag)
43555 return seq;
43556 else
43557 {
43558 /* If we had no asm flag outputs, clobber the flags. */
43559 clobbers.safe_push (gen_rtx_REG (CCmode, FLAGS_REG));
43560 SET_HARD_REG_BIT (clobbered_regs, FLAGS_REG);
43561 return NULL;
43562 }
43563 }
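
/* A hedged usage sketch (illustration only): the flag output constraints
   handled above let inline asm hand a condition code straight back to the
   compiler, e.g.

       int zf;
       __asm__ ("testl %1, %1" : "=@ccz" (zf) : "r" (x));

   stores 1 in zf when the test sets ZF; "=@ccnz" would request the
   inverted condition via the 'n' prefix parsed above.  */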
43564
43565 /* Implements the target hook targetm.encode_section_info. */
43566
43567 static void ATTRIBUTE_UNUSED
43568 ix86_encode_section_info (tree decl, rtx rtl, int first)
43569 {
43570 default_encode_section_info (decl, rtl, first);
43571
43572 if (ix86_in_large_data_p (decl))
43573 SYMBOL_REF_FLAGS (XEXP (rtl, 0)) |= SYMBOL_FLAG_FAR_ADDR;
43574 }
43575
43576 /* Worker function for REVERSE_CONDITION. */
43577
43578 enum rtx_code
43579 ix86_reverse_condition (enum rtx_code code, machine_mode mode)
43580 {
43581 return (mode != CCFPmode && mode != CCFPUmode
43582 ? reverse_condition (code)
43583 : reverse_condition_maybe_unordered (code));
43584 }
43585
43586 /* Output code to perform an x87 FP register move, from OPERANDS[1]
43587 to OPERANDS[0]. */
43588
43589 const char *
43590 output_387_reg_move (rtx insn, rtx *operands)
43591 {
43592 if (REG_P (operands[0]))
43593 {
43594 if (REG_P (operands[1])
43595 && find_regno_note (insn, REG_DEAD, REGNO (operands[1])))
43596 {
43597 if (REGNO (operands[0]) == FIRST_STACK_REG)
43598 return output_387_ffreep (operands, 0);
43599 return "fstp\t%y0";
43600 }
43601 if (STACK_TOP_P (operands[0]))
43602 return "fld%Z1\t%y1";
43603 return "fst\t%y0";
43604 }
43605 else if (MEM_P (operands[0]))
43606 {
43607 gcc_assert (REG_P (operands[1]));
43608 if (find_regno_note (insn, REG_DEAD, REGNO (operands[1])))
43609 return "fstp%Z0\t%y0";
43610 else
43611 {
43612 /* There is no non-popping store to memory for XFmode.
43613 So if we need one, follow the store with a load. */
43614 if (GET_MODE (operands[0]) == XFmode)
43615 return "fstp%Z0\t%y0\n\tfld%Z0\t%y0";
43616 else
43617 return "fst%Z0\t%y0";
43618 }
43619 }
43620 else
43621 gcc_unreachable();
43622 }
43623
43624 /* Output code to perform a conditional jump to LABEL, if C2 flag in
43625 FP status register is set. */
43626
43627 void
43628 ix86_emit_fp_unordered_jump (rtx label)
43629 {
43630 rtx reg = gen_reg_rtx (HImode);
43631 rtx temp;
43632
43633 emit_insn (gen_x86_fnstsw_1 (reg));
43634
43635 if (TARGET_SAHF && (TARGET_USE_SAHF || optimize_insn_for_size_p ()))
43636 {
43637 emit_insn (gen_x86_sahf_1 (reg));
43638
43639 temp = gen_rtx_REG (CCmode, FLAGS_REG);
43640 temp = gen_rtx_UNORDERED (VOIDmode, temp, const0_rtx);
43641 }
43642 else
43643 {
43644 emit_insn (gen_testqi_ext_ccno_0 (reg, GEN_INT (0x04)));
43645
43646 temp = gen_rtx_REG (CCNOmode, FLAGS_REG);
43647 temp = gen_rtx_NE (VOIDmode, temp, const0_rtx);
43648 }
43649
43650 temp = gen_rtx_IF_THEN_ELSE (VOIDmode, temp,
43651 gen_rtx_LABEL_REF (VOIDmode, label),
43652 pc_rtx);
43653 temp = gen_rtx_SET (pc_rtx, temp);
43654
43655 emit_jump_insn (temp);
43656 predict_jump (REG_BR_PROB_BASE * 10 / 100);
43657 }
43658
43659 /* Output code to perform a log1p XFmode calculation. */
43660
43661 void ix86_emit_i387_log1p (rtx op0, rtx op1)
43662 {
43663 rtx_code_label *label1 = gen_label_rtx ();
43664 rtx_code_label *label2 = gen_label_rtx ();
43665
43666 rtx tmp = gen_reg_rtx (XFmode);
43667 rtx tmp2 = gen_reg_rtx (XFmode);
43668 rtx test;
43669
43670 emit_insn (gen_absxf2 (tmp, op1));
43671 test = gen_rtx_GE (VOIDmode, tmp,
43672 const_double_from_real_value (
43673 REAL_VALUE_ATOF ("0.29289321881345247561810596348408353", XFmode),
43674 XFmode));
43675 emit_jump_insn (gen_cbranchxf4 (test, XEXP (test, 0), XEXP (test, 1), label1));
43676
43677 emit_move_insn (tmp2, standard_80387_constant_rtx (4)); /* fldln2 */
43678 emit_insn (gen_fyl2xp1xf3_i387 (op0, op1, tmp2));
43679 emit_jump (label2);
43680
43681 emit_label (label1);
43682 emit_move_insn (tmp, CONST1_RTX (XFmode));
43683 emit_insn (gen_addxf3 (tmp, op1, tmp));
43684 emit_move_insn (tmp2, standard_80387_constant_rtx (4)); /* fldln2 */
43685 emit_insn (gen_fyl2xxf3_i387 (op0, tmp, tmp2));
43686
43687 emit_label (label2);
43688 }
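
/* A hedged scalar model of the branch above (illustration only): fyl2xp1
   computes y * log2 (x + 1) without forming 1 + x but is only valid for
   small |x|, so the code tests |x| against 1 - sqrt(2)/2 ~= 0.2928...:

       if (fabs (x) >= 0.29289321881345247561)
         op0 = ln2 * log2 (1.0 + x);      fyl2x on the explicit sum 1 + x
       else
         op0 = ln2 * log2_1p (x);         fyl2xp1, no cancellation near 0

   where ln2 is the fldln2 constant loaded into tmp2 and log2_1p is just
   notation for what fyl2xp1 computes, log2 (x + 1).  */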
43689
43690 /* Emit code for round calculation. */
43691 void ix86_emit_i387_round (rtx op0, rtx op1)
43692 {
43693 machine_mode inmode = GET_MODE (op1);
43694 machine_mode outmode = GET_MODE (op0);
43695 rtx e1, e2, res, tmp, tmp1, half;
43696 rtx scratch = gen_reg_rtx (HImode);
43697 rtx flags = gen_rtx_REG (CCNOmode, FLAGS_REG);
43698 rtx_code_label *jump_label = gen_label_rtx ();
43699 rtx insn;
43700 rtx (*gen_abs) (rtx, rtx);
43701 rtx (*gen_neg) (rtx, rtx);
43702
43703 switch (inmode)
43704 {
43705 case SFmode:
43706 gen_abs = gen_abssf2;
43707 break;
43708 case DFmode:
43709 gen_abs = gen_absdf2;
43710 break;
43711 case XFmode:
43712 gen_abs = gen_absxf2;
43713 break;
43714 default:
43715 gcc_unreachable ();
43716 }
43717
43718 switch (outmode)
43719 {
43720 case SFmode:
43721 gen_neg = gen_negsf2;
43722 break;
43723 case DFmode:
43724 gen_neg = gen_negdf2;
43725 break;
43726 case XFmode:
43727 gen_neg = gen_negxf2;
43728 break;
43729 case HImode:
43730 gen_neg = gen_neghi2;
43731 break;
43732 case SImode:
43733 gen_neg = gen_negsi2;
43734 break;
43735 case DImode:
43736 gen_neg = gen_negdi2;
43737 break;
43738 default:
43739 gcc_unreachable ();
43740 }
43741
43742 e1 = gen_reg_rtx (inmode);
43743 e2 = gen_reg_rtx (inmode);
43744 res = gen_reg_rtx (outmode);
43745
43746 half = const_double_from_real_value (dconsthalf, inmode);
43747
43748 /* round(a) = sgn(a) * floor(fabs(a) + 0.5) */
43749
43750 /* scratch = fxam(op1) */
43751 emit_insn (gen_rtx_SET (scratch,
43752 gen_rtx_UNSPEC (HImode, gen_rtvec (1, op1),
43753 UNSPEC_FXAM)));
43754 /* e1 = fabs(op1) */
43755 emit_insn (gen_abs (e1, op1));
43756
43757 /* e2 = e1 + 0.5 */
43758 half = force_reg (inmode, half);
43759 emit_insn (gen_rtx_SET (e2, gen_rtx_PLUS (inmode, e1, half)));
43760
43761 /* res = floor(e2) */
43762 if (inmode != XFmode)
43763 {
43764 tmp1 = gen_reg_rtx (XFmode);
43765
43766 emit_insn (gen_rtx_SET (tmp1, gen_rtx_FLOAT_EXTEND (XFmode, e2)));
43767 }
43768 else
43769 tmp1 = e2;
43770
43771 switch (outmode)
43772 {
43773 case SFmode:
43774 case DFmode:
43775 {
43776 rtx tmp0 = gen_reg_rtx (XFmode);
43777
43778 emit_insn (gen_frndintxf2_floor (tmp0, tmp1));
43779
43780 emit_insn (gen_rtx_SET (res,
43781 gen_rtx_UNSPEC (outmode, gen_rtvec (1, tmp0),
43782 UNSPEC_TRUNC_NOOP)));
43783 }
43784 break;
43785 case XFmode:
43786 emit_insn (gen_frndintxf2_floor (res, tmp1));
43787 break;
43788 case HImode:
43789 emit_insn (gen_lfloorxfhi2 (res, tmp1));
43790 break;
43791 case SImode:
43792 emit_insn (gen_lfloorxfsi2 (res, tmp1));
43793 break;
43794 case DImode:
43795 emit_insn (gen_lfloorxfdi2 (res, tmp1));
43796 break;
43797 default:
43798 gcc_unreachable ();
43799 }
43800
43801 /* flags = signbit(a) */
43802 emit_insn (gen_testqi_ext_ccno_0 (scratch, GEN_INT (0x02)));
43803
43804 /* if (flags) then res = -res */
43805 tmp = gen_rtx_IF_THEN_ELSE (VOIDmode,
43806 gen_rtx_EQ (VOIDmode, flags, const0_rtx),
43807 gen_rtx_LABEL_REF (VOIDmode, jump_label),
43808 pc_rtx);
43809 insn = emit_jump_insn (gen_rtx_SET (pc_rtx, tmp));
43810 predict_jump (REG_BR_PROB_BASE * 50 / 100);
43811 JUMP_LABEL (insn) = jump_label;
43812
43813 emit_insn (gen_neg (res, res));
43814
43815 emit_label (jump_label);
43816 LABEL_NUSES (jump_label) = 1;
43817
43818 emit_move_insn (op0, res);
43819 }
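
/* A hedged scalar model of the sequence above (illustration only):

       r = floor (fabs (a) + 0.5);     e1 = |a|, e2 = e1 + 0.5, res = floor (e2)
       return signbit (a) ? -r : r;    sign taken from the fxam C1 bit

   i.e. round half away from zero, matching the formula in the comment.  */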
43820
43821 /* Output code to perform a Newton-Raphson approximation of a single precision
43822 floating point divide [http://en.wikipedia.org/wiki/N-th_root_algorithm]. */
43823
43824 void ix86_emit_swdivsf (rtx res, rtx a, rtx b, machine_mode mode)
43825 {
43826 rtx x0, x1, e0, e1;
43827
43828 x0 = gen_reg_rtx (mode);
43829 e0 = gen_reg_rtx (mode);
43830 e1 = gen_reg_rtx (mode);
43831 x1 = gen_reg_rtx (mode);
43832
43833 /* a / b = a * ((rcp(b) + rcp(b)) - (b * rcp(b) * rcp (b))) */
43834
43835 b = force_reg (mode, b);
43836
43837 /* x0 = rcp(b) estimate */
43838 if (mode == V16SFmode || mode == V8DFmode)
43839 {
43840 if (TARGET_AVX512ER)
43841 {
43842 emit_insn (gen_rtx_SET (x0, gen_rtx_UNSPEC (mode, gen_rtvec (1, b),
43843 UNSPEC_RCP28)));
43844 /* res = a * x0 */
43845 emit_insn (gen_rtx_SET (res, gen_rtx_MULT (mode, a, x0)));
43846 return;
43847 }
43848 else
43849 emit_insn (gen_rtx_SET (x0, gen_rtx_UNSPEC (mode, gen_rtvec (1, b),
43850 UNSPEC_RCP14)));
43851 }
43852 else
43853 emit_insn (gen_rtx_SET (x0, gen_rtx_UNSPEC (mode, gen_rtvec (1, b),
43854 UNSPEC_RCP)));
43855
43856 /* e0 = x0 * b */
43857 emit_insn (gen_rtx_SET (e0, gen_rtx_MULT (mode, x0, b)));
43858
43859 /* e0 = x0 * e0 */
43860 emit_insn (gen_rtx_SET (e0, gen_rtx_MULT (mode, x0, e0)));
43861
43862 /* e1 = x0 + x0 */
43863 emit_insn (gen_rtx_SET (e1, gen_rtx_PLUS (mode, x0, x0)));
43864
43865 /* x1 = e1 - e0 */
43866 emit_insn (gen_rtx_SET (x1, gen_rtx_MINUS (mode, e1, e0)));
43867
43868 /* res = a * x1 */
43869 emit_insn (gen_rtx_SET (res, gen_rtx_MULT (mode, a, x1)));
43870 }
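
/* A hedged scalar model of the sequence above (illustration only), writing
   rcp (b) for the hardware reciprocal estimate (rcpss / rcp14 / rcp28):

       x0 = rcp (b);
       x1 = (x0 + x0) - b * x0 * x0;    one Newton-Raphson step on 1/b
       return a * x1;

   Each step roughly doubles the number of correct bits in the estimate;
   the rcp28 path above skips the step because 28 bits already cover the
   SFmode mantissa.  */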
43871
43872 /* Output code to perform a Newton-Raphson approximation of a
43873 single precision floating point [reciprocal] square root. */
43874
43875 void ix86_emit_swsqrtsf (rtx res, rtx a, machine_mode mode, bool recip)
43876 {
43877 rtx x0, e0, e1, e2, e3, mthree, mhalf;
43878 REAL_VALUE_TYPE r;
43879 int unspec;
43880
43881 x0 = gen_reg_rtx (mode);
43882 e0 = gen_reg_rtx (mode);
43883 e1 = gen_reg_rtx (mode);
43884 e2 = gen_reg_rtx (mode);
43885 e3 = gen_reg_rtx (mode);
43886
43887 if (TARGET_AVX512ER && mode == V16SFmode)
43888 {
43889 if (recip)
43890 /* res = rsqrt28(a) estimate */
43891 emit_insn (gen_rtx_SET (res, gen_rtx_UNSPEC (mode, gen_rtvec (1, a),
43892 UNSPEC_RSQRT28)));
43893 else
43894 {
43895 /* x0 = rsqrt28(a) estimate */
43896 emit_insn (gen_rtx_SET (x0, gen_rtx_UNSPEC (mode, gen_rtvec (1, a),
43897 UNSPEC_RSQRT28)));
43898 /* res = rcp28(x0) estimate */
43899 emit_insn (gen_rtx_SET (res, gen_rtx_UNSPEC (mode, gen_rtvec (1, x0),
43900 UNSPEC_RCP28)));
43901 }
43902 return;
43903 }
43904
43905 real_from_integer (&r, VOIDmode, -3, SIGNED);
43906 mthree = const_double_from_real_value (r, SFmode);
43907
43908 real_arithmetic (&r, NEGATE_EXPR, &dconsthalf, NULL);
43909 mhalf = const_double_from_real_value (r, SFmode);
43910 unspec = UNSPEC_RSQRT;
43911
43912 if (VECTOR_MODE_P (mode))
43913 {
43914 mthree = ix86_build_const_vector (mode, true, mthree);
43915 mhalf = ix86_build_const_vector (mode, true, mhalf);
43916 /* There is no 512-bit rsqrt. There is however rsqrt14. */
43917 if (GET_MODE_SIZE (mode) == 64)
43918 unspec = UNSPEC_RSQRT14;
43919 }
43920
43921 /* sqrt(a) = -0.5 * a * rsqrtss(a) * (a * rsqrtss(a) * rsqrtss(a) - 3.0)
43922 rsqrt(a) = -0.5 * rsqrtss(a) * (a * rsqrtss(a) * rsqrtss(a) - 3.0) */
43923
43924 a = force_reg (mode, a);
43925
43926 /* x0 = rsqrt(a) estimate */
43927 emit_insn (gen_rtx_SET (x0, gen_rtx_UNSPEC (mode, gen_rtvec (1, a),
43928 unspec)));
43929
43930 /* If a == 0.0, mask out the infinite rsqrt estimate so that sqrt (0.0) does not produce a NaN. */
43931 if (!recip)
43932 {
43933 rtx zero = force_reg (mode, CONST0_RTX(mode));
43934 rtx mask;
43935
43936 /* Handle masked compare. */
43937 if (VECTOR_MODE_P (mode) && GET_MODE_SIZE (mode) == 64)
43938 {
43939 mask = gen_reg_rtx (HImode);
43940 /* Imm value 0x4 corresponds to not-equal comparison. */
43941 emit_insn (gen_avx512f_cmpv16sf3 (mask, zero, a, GEN_INT (0x4)));
43942 emit_insn (gen_avx512f_blendmv16sf (x0, zero, x0, mask));
43943 }
43944 else
43945 {
43946 mask = gen_reg_rtx (mode);
43947 emit_insn (gen_rtx_SET (mask, gen_rtx_NE (mode, zero, a)));
43948 emit_insn (gen_rtx_SET (x0, gen_rtx_AND (mode, x0, mask)));
43949 }
43950 }
43951
43952 /* e0 = x0 * a */
43953 emit_insn (gen_rtx_SET (e0, gen_rtx_MULT (mode, x0, a)));
43954 /* e1 = e0 * x0 */
43955 emit_insn (gen_rtx_SET (e1, gen_rtx_MULT (mode, e0, x0)));
43956
43957 /* e2 = e1 - 3. */
43958 mthree = force_reg (mode, mthree);
43959 emit_insn (gen_rtx_SET (e2, gen_rtx_PLUS (mode, e1, mthree)));
43960
43961 mhalf = force_reg (mode, mhalf);
43962 if (recip)
43963 /* e3 = -.5 * x0 */
43964 emit_insn (gen_rtx_SET (e3, gen_rtx_MULT (mode, x0, mhalf)));
43965 else
43966 /* e3 = -.5 * e0 */
43967 emit_insn (gen_rtx_SET (e3, gen_rtx_MULT (mode, e0, mhalf)));
43968 /* ret = e2 * e3 */
43969 emit_insn (gen_rtx_SET (res, gen_rtx_MULT (mode, e2, e3)));
43970 }
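
/* A hedged scalar model of the sequence above (illustration only), writing
   r for the rsqrtss / rsqrt14 estimate of 1/sqrt(a):

       e0 = a * r;
       e2 = e0 * r - 3.0;
       sqrt (a)   ~= (-0.5 * e0) * e2    when !recip
       1/sqrt (a) ~= (-0.5 * r)  * e2    when recip

   which is one Newton-Raphson refinement of the reciprocal square root;
   the a == 0.0 masking above keeps sqrt (0.0) from turning into NaN.  */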
43971
43972 #ifdef TARGET_SOLARIS
43973 /* Solaris implementation of TARGET_ASM_NAMED_SECTION. */
43974
43975 static void
43976 i386_solaris_elf_named_section (const char *name, unsigned int flags,
43977 tree decl)
43978 {
43979 /* With Binutils 2.15, the "@unwind" marker must be specified on
43980 every occurrence of the ".eh_frame" section, not just the first
43981 one. */
43982 if (TARGET_64BIT
43983 && strcmp (name, ".eh_frame") == 0)
43984 {
43985 fprintf (asm_out_file, "\t.section\t%s,\"%s\",@unwind\n", name,
43986 flags & SECTION_WRITE ? "aw" : "a");
43987 return;
43988 }
43989
43990 #ifndef USE_GAS
43991 if (HAVE_COMDAT_GROUP && flags & SECTION_LINKONCE)
43992 {
43993 solaris_elf_asm_comdat_section (name, flags, decl);
43994 return;
43995 }
43996 #endif
43997
43998 default_elf_asm_named_section (name, flags, decl);
43999 }
44000 #endif /* TARGET_SOLARIS */
44001
44002 /* Return the mangling of TYPE if it is an extended fundamental type. */
44003
44004 static const char *
44005 ix86_mangle_type (const_tree type)
44006 {
44007 type = TYPE_MAIN_VARIANT (type);
44008
44009 if (TREE_CODE (type) != VOID_TYPE && TREE_CODE (type) != BOOLEAN_TYPE
44010 && TREE_CODE (type) != INTEGER_TYPE && TREE_CODE (type) != REAL_TYPE)
44011 return NULL;
44012
44013 switch (TYPE_MODE (type))
44014 {
44015 case TFmode:
44016 /* __float128 is "g". */
44017 return "g";
44018 case XFmode:
44019 /* "long double" or __float80 is "e". */
44020 return "e";
44021 default:
44022 return NULL;
44023 }
44024 }
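
/* For illustration (hypothetical declarations, Itanium C++ ABI):
   "void f (long double)" mangles to _Z1fe and "void f (__float128)" to
   _Z1fg, using the "e" and "g" codes returned above.  */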
44025
44026 #ifdef TARGET_THREAD_SSP_OFFSET
44027 /* If using TLS guards, don't waste time creating and expanding
44028 __stack_chk_guard decl and MEM as we are going to ignore it. */
44029 static tree
44030 ix86_stack_protect_guard (void)
44031 {
44032 if (TARGET_SSP_TLS_GUARD)
44033 return NULL_TREE;
44034 return default_stack_protect_guard ();
44035 }
44036 #endif
44037
44038 /* For 32-bit code we can save PIC register setup by using
44039 __stack_chk_fail_local hidden function instead of calling
44040 __stack_chk_fail directly. 64-bit code doesn't need to setup any PIC
44041 register, so it is better to call __stack_chk_fail directly. */
44042
44043 static tree ATTRIBUTE_UNUSED
44044 ix86_stack_protect_fail (void)
44045 {
44046 return TARGET_64BIT
44047 ? default_external_stack_protect_fail ()
44048 : default_hidden_stack_protect_fail ();
44049 }
44050
44051 /* Select a format to encode pointers in exception handling data. CODE
44052 is 0 for data, 1 for code labels, 2 for function pointers. GLOBAL is
44053 true if the symbol may be affected by dynamic relocations.
44054
44055 ??? All x86 object file formats are capable of representing this.
44056 After all, the relocation needed is the same as for the call insn.
44057 Whether or not a particular assembler allows us to enter such, I
44058 guess we'll have to see. */
44059 int
44060 asm_preferred_eh_data_format (int code, int global)
44061 {
44062 if (flag_pic)
44063 {
44064 int type = DW_EH_PE_sdata8;
44065 if (!TARGET_64BIT
44066 || ix86_cmodel == CM_SMALL_PIC
44067 || (ix86_cmodel == CM_MEDIUM_PIC && (global || code)))
44068 type = DW_EH_PE_sdata4;
44069 return (global ? DW_EH_PE_indirect : 0) | DW_EH_PE_pcrel | type;
44070 }
44071 if (ix86_cmodel == CM_SMALL
44072 || (ix86_cmodel == CM_MEDIUM && code))
44073 return DW_EH_PE_udata4;
44074 return DW_EH_PE_absptr;
44075 }
44076 \f
44077 /* Expand copysign: copy the sign of SIGN onto the positive value ABS_VALUE,
44078 storing the result in RESULT. If MASK is non-null, it is a mask that
44079 clears the sign bit. */
44080 static void
44081 ix86_sse_copysign_to_positive (rtx result, rtx abs_value, rtx sign, rtx mask)
44082 {
44083 machine_mode mode = GET_MODE (sign);
44084 rtx sgn = gen_reg_rtx (mode);
44085 if (mask == NULL_RTX)
44086 {
44087 machine_mode vmode;
44088
44089 if (mode == SFmode)
44090 vmode = V4SFmode;
44091 else if (mode == DFmode)
44092 vmode = V2DFmode;
44093 else
44094 vmode = mode;
44095
44096 mask = ix86_build_signbit_mask (vmode, VECTOR_MODE_P (mode), false);
44097 if (!VECTOR_MODE_P (mode))
44098 {
44099 /* We need to generate a scalar mode mask in this case. */
44100 rtx tmp = gen_rtx_PARALLEL (VOIDmode, gen_rtvec (1, const0_rtx));
44101 tmp = gen_rtx_VEC_SELECT (mode, mask, tmp);
44102 mask = gen_reg_rtx (mode);
44103 emit_insn (gen_rtx_SET (mask, tmp));
44104 }
44105 }
44106 else
44107 mask = gen_rtx_NOT (mode, mask);
44108 emit_insn (gen_rtx_SET (sgn, gen_rtx_AND (mode, mask, sign)));
44109 emit_insn (gen_rtx_SET (result, gen_rtx_IOR (mode, abs_value, sgn)));
44110 }
44111
44112 /* Expand fabs (OP0) and return a new rtx that holds the result. The
44113 mask for masking out the sign-bit is stored in *SMASK, if that is
44114 non-null. */
44115 static rtx
44116 ix86_expand_sse_fabs (rtx op0, rtx *smask)
44117 {
44118 machine_mode vmode, mode = GET_MODE (op0);
44119 rtx xa, mask;
44120
44121 xa = gen_reg_rtx (mode);
44122 if (mode == SFmode)
44123 vmode = V4SFmode;
44124 else if (mode == DFmode)
44125 vmode = V2DFmode;
44126 else
44127 vmode = mode;
44128 mask = ix86_build_signbit_mask (vmode, VECTOR_MODE_P (mode), true);
44129 if (!VECTOR_MODE_P (mode))
44130 {
44131 /* We need to generate a scalar mode mask in this case. */
44132 rtx tmp = gen_rtx_PARALLEL (VOIDmode, gen_rtvec (1, const0_rtx));
44133 tmp = gen_rtx_VEC_SELECT (mode, mask, tmp);
44134 mask = gen_reg_rtx (mode);
44135 emit_insn (gen_rtx_SET (mask, tmp));
44136 }
44137 emit_insn (gen_rtx_SET (xa, gen_rtx_AND (mode, op0, mask)));
44138
44139 if (smask)
44140 *smask = mask;
44141
44142 return xa;
44143 }
44144
44145 /* Expands a comparison of OP0 with OP1 using comparison code CODE,
44146 swapping the operands if SWAP_OPERANDS is true. The expanded
44147 code is a forward jump to a newly created label in case the
44148 comparison is true. The generated label rtx is returned. */
44149 static rtx_code_label *
44150 ix86_expand_sse_compare_and_jump (enum rtx_code code, rtx op0, rtx op1,
44151 bool swap_operands)
44152 {
44153 machine_mode fpcmp_mode = ix86_fp_compare_mode (code);
44154 rtx_code_label *label;
44155 rtx tmp;
44156
44157 if (swap_operands)
44158 std::swap (op0, op1);
44159
44160 label = gen_label_rtx ();
44161 tmp = gen_rtx_REG (fpcmp_mode, FLAGS_REG);
44162 emit_insn (gen_rtx_SET (tmp, gen_rtx_COMPARE (fpcmp_mode, op0, op1)));
44163 tmp = gen_rtx_fmt_ee (code, VOIDmode, tmp, const0_rtx);
44164 tmp = gen_rtx_IF_THEN_ELSE (VOIDmode, tmp,
44165 gen_rtx_LABEL_REF (VOIDmode, label), pc_rtx);
44166 tmp = emit_jump_insn (gen_rtx_SET (pc_rtx, tmp));
44167 JUMP_LABEL (tmp) = label;
44168
44169 return label;
44170 }
44171
44172 /* Expand a mask generating SSE comparison instruction comparing OP0 with OP1
44173 using comparison code CODE. Operands are swapped for the comparison if
44174 SWAP_OPERANDS is true. Returns a rtx for the generated mask. */
44175 static rtx
44176 ix86_expand_sse_compare_mask (enum rtx_code code, rtx op0, rtx op1,
44177 bool swap_operands)
44178 {
44179 rtx (*insn)(rtx, rtx, rtx, rtx);
44180 machine_mode mode = GET_MODE (op0);
44181 rtx mask = gen_reg_rtx (mode);
44182
44183 if (swap_operands)
44184 std::swap (op0, op1);
44185
44186 insn = mode == DFmode ? gen_setcc_df_sse : gen_setcc_sf_sse;
44187
44188 emit_insn (insn (mask, op0, op1,
44189 gen_rtx_fmt_ee (code, mode, op0, op1)));
44190 return mask;
44191 }
44192
44193 /* Generate and return a rtx of mode MODE for 2**n where n is the number
44194 of bits of the mantissa of MODE, which must be one of DFmode or SFmode. */
44195 static rtx
44196 ix86_gen_TWO52 (machine_mode mode)
44197 {
44198 REAL_VALUE_TYPE TWO52r;
44199 rtx TWO52;
44200
44201 real_ldexp (&TWO52r, &dconst1, mode == DFmode ? 52 : 23);
44202 TWO52 = const_double_from_real_value (TWO52r, mode);
44203 TWO52 = force_reg (mode, TWO52);
44204
44205 return TWO52;
44206 }
44207
44208 /* Expand SSE sequence for computing lround from OP1 storing
44209 into OP0. */
44210 void
44211 ix86_expand_lround (rtx op0, rtx op1)
44212 {
44213 /* C code for the stuff we're doing below:
44214 tmp = op1 + copysign (nextafter (0.5, 0.0), op1)
44215 return (long)tmp;
44216 */
44217 machine_mode mode = GET_MODE (op1);
44218 const struct real_format *fmt;
44219 REAL_VALUE_TYPE pred_half, half_minus_pred_half;
44220 rtx adj;
44221
44222 /* load nextafter (0.5, 0.0) */
44223 fmt = REAL_MODE_FORMAT (mode);
44224 real_2expN (&half_minus_pred_half, -(fmt->p) - 1, mode);
44225 real_arithmetic (&pred_half, MINUS_EXPR, &dconsthalf, &half_minus_pred_half);
44226
44227 /* adj = copysign (0.5, op1) */
44228 adj = force_reg (mode, const_double_from_real_value (pred_half, mode));
44229 ix86_sse_copysign_to_positive (adj, adj, force_reg (mode, op1), NULL_RTX);
44230
44231 /* adj = op1 + adj */
44232 adj = expand_simple_binop (mode, PLUS, adj, op1, NULL_RTX, 0, OPTAB_DIRECT);
44233
44234 /* op0 = (imode)adj */
44235 expand_fix (op0, adj, 0);
44236 }
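
/* A hedged scalar model of the expansion above (illustration only):

       adj = copysign (nextafter (0.5, 0.0), op1);
       return (long) (op1 + adj);

   Using the value just below 0.5 keeps inputs such as the largest double
   below 0.5 from being pushed up to the next integer by the addition.  */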
44237
44238 /* Expand SSE2 sequence for computing lfloor or lceil from OPERAND1 storing
44239 into OPERAND0. */
44240 void
44241 ix86_expand_lfloorceil (rtx op0, rtx op1, bool do_floor)
44242 {
44243 /* C code for the stuff we're doing below (for do_floor):
44244 xi = (long)op1;
44245 xi -= (double)xi > op1 ? 1 : 0;
44246 return xi;
44247 */
44248 machine_mode fmode = GET_MODE (op1);
44249 machine_mode imode = GET_MODE (op0);
44250 rtx ireg, freg, tmp;
44251 rtx_code_label *label;
44252
44253 /* reg = (long)op1 */
44254 ireg = gen_reg_rtx (imode);
44255 expand_fix (ireg, op1, 0);
44256
44257 /* freg = (double)reg */
44258 freg = gen_reg_rtx (fmode);
44259 expand_float (freg, ireg, 0);
44260
44261 /* ireg = (freg > op1) ? ireg - 1 : ireg */
44262 label = ix86_expand_sse_compare_and_jump (UNLE,
44263 freg, op1, !do_floor);
44264 tmp = expand_simple_binop (imode, do_floor ? MINUS : PLUS,
44265 ireg, const1_rtx, NULL_RTX, 0, OPTAB_DIRECT);
44266 emit_move_insn (ireg, tmp);
44267
44268 emit_label (label);
44269 LABEL_NUSES (label) = 1;
44270
44271 emit_move_insn (op0, ireg);
44272 }
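
/* A hedged scalar model of the floor variant above (illustration only):

       xi = (long) op1;            truncate toward zero
       if ((double) xi > op1)      truncation moved a negative value up
         xi -= 1;
       return xi;

   The ceil variant swaps the comparison operands and adds 1 instead.  */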
44273
44274 /* Expand rint (IEEE round to nearest) rounding OPERAND1 and storing the
44275 result in OPERAND0. */
44276 void
44277 ix86_expand_rint (rtx operand0, rtx operand1)
44278 {
44279 /* C code for the stuff we're doing below:
44280 xa = fabs (operand1);
44281 if (!isless (xa, 2**52))
44282 return operand1;
44283 xa = xa + 2**52 - 2**52;
44284 return copysign (xa, operand1);
44285 */
44286 machine_mode mode = GET_MODE (operand0);
44287 rtx res, xa, TWO52, mask;
44288 rtx_code_label *label;
44289
44290 res = gen_reg_rtx (mode);
44291 emit_move_insn (res, operand1);
44292
44293 /* xa = abs (operand1) */
44294 xa = ix86_expand_sse_fabs (res, &mask);
44295
44296 /* if (!isless (xa, TWO52)) goto label; */
44297 TWO52 = ix86_gen_TWO52 (mode);
44298 label = ix86_expand_sse_compare_and_jump (UNLE, TWO52, xa, false);
44299
44300 xa = expand_simple_binop (mode, PLUS, xa, TWO52, NULL_RTX, 0, OPTAB_DIRECT);
44301 xa = expand_simple_binop (mode, MINUS, xa, TWO52, xa, 0, OPTAB_DIRECT);
44302
44303 ix86_sse_copysign_to_positive (res, xa, res, mask);
44304
44305 emit_label (label);
44306 LABEL_NUSES (label) = 1;
44307
44308 emit_move_insn (operand0, res);
44309 }
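
/* A hedged scalar model of the expansion above (illustration only), with
   TWO52 = 2**52, the magnitude at which a double's ulp becomes 1.0:

       xa = fabs (x);
       if (!(xa < TWO52))           already integral, or NaN
         return x;
       xa = (xa + TWO52) - TWO52;   rounds xa to an integer in the FPU mode
       return copysign (xa, x);

   For SFmode the same trick uses 2**23.  */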
44310
44311 /* Expand SSE2 sequence for computing floor or ceil from OPERAND1 storing
44312 into OPERAND0. */
44313 void
44314 ix86_expand_floorceildf_32 (rtx operand0, rtx operand1, bool do_floor)
44315 {
44316 /* C code for the stuff we expand below.
44317 double xa = fabs (x), x2;
44318 if (!isless (xa, TWO52))
44319 return x;
44320 xa = xa + TWO52 - TWO52;
44321 x2 = copysign (xa, x);
44322 Compensate. Floor:
44323 if (x2 > x)
44324 x2 -= 1;
44325 Compensate. Ceil:
44326 if (x2 < x)
44327 x2 -= -1;
44328 return x2;
44329 */
44330 machine_mode mode = GET_MODE (operand0);
44331 rtx xa, TWO52, tmp, one, res, mask;
44332 rtx_code_label *label;
44333
44334 TWO52 = ix86_gen_TWO52 (mode);
44335
44336 /* Temporary for holding the result, initialized to the input
44337 operand to ease control flow. */
44338 res = gen_reg_rtx (mode);
44339 emit_move_insn (res, operand1);
44340
44341 /* xa = abs (operand1) */
44342 xa = ix86_expand_sse_fabs (res, &mask);
44343
44344 /* if (!isless (xa, TWO52)) goto label; */
44345 label = ix86_expand_sse_compare_and_jump (UNLE, TWO52, xa, false);
44346
44347 /* xa = xa + TWO52 - TWO52; */
44348 xa = expand_simple_binop (mode, PLUS, xa, TWO52, NULL_RTX, 0, OPTAB_DIRECT);
44349 xa = expand_simple_binop (mode, MINUS, xa, TWO52, xa, 0, OPTAB_DIRECT);
44350
44351 /* xa = copysign (xa, operand1) */
44352 ix86_sse_copysign_to_positive (xa, xa, res, mask);
44353
44354 /* generate 1.0 or -1.0 */
44355 one = force_reg (mode,
44356 const_double_from_real_value (do_floor
44357 ? dconst1 : dconstm1, mode));
44358
44359 /* Compensate: xa = xa - (xa > operand1 ? 1 : 0) */
44360 tmp = ix86_expand_sse_compare_mask (UNGT, xa, res, !do_floor);
44361 emit_insn (gen_rtx_SET (tmp, gen_rtx_AND (mode, one, tmp)));
44362 /* We always need to subtract here to preserve signed zero. */
44363 tmp = expand_simple_binop (mode, MINUS,
44364 xa, tmp, NULL_RTX, 0, OPTAB_DIRECT);
44365 emit_move_insn (res, tmp);
44366
44367 emit_label (label);
44368 LABEL_NUSES (label) = 1;
44369
44370 emit_move_insn (operand0, res);
44371 }
44372
44373 /* Expand SSE2 sequence for computing floor or ceil from OPERAND1 storing
44374 into OPERAND0. */
44375 void
44376 ix86_expand_floorceil (rtx operand0, rtx operand1, bool do_floor)
44377 {
44378 /* C code for the stuff we expand below.
44379 double xa = fabs (x), x2;
44380 if (!isless (xa, TWO52))
44381 return x;
44382 x2 = (double)(long)x;
44383 Compensate. Floor:
44384 if (x2 > x)
44385 x2 -= 1;
44386 Compensate. Ceil:
44387 if (x2 < x)
44388 x2 += 1;
44389 if (HONOR_SIGNED_ZEROS (mode))
44390 return copysign (x2, x);
44391 return x2;
44392 */
44393 machine_mode mode = GET_MODE (operand0);
44394 rtx xa, xi, TWO52, tmp, one, res, mask;
44395 rtx_code_label *label;
44396
44397 TWO52 = ix86_gen_TWO52 (mode);
44398
44399 /* Temporary for holding the result, initialized to the input
44400 operand to ease control flow. */
44401 res = gen_reg_rtx (mode);
44402 emit_move_insn (res, operand1);
44403
44404 /* xa = abs (operand1) */
44405 xa = ix86_expand_sse_fabs (res, &mask);
44406
44407 /* if (!isless (xa, TWO52)) goto label; */
44408 label = ix86_expand_sse_compare_and_jump (UNLE, TWO52, xa, false);
44409
44410 /* xa = (double)(long)x */
44411 xi = gen_reg_rtx (mode == DFmode ? DImode : SImode);
44412 expand_fix (xi, res, 0);
44413 expand_float (xa, xi, 0);
44414
44415 /* generate 1.0 */
44416 one = force_reg (mode, const_double_from_real_value (dconst1, mode));
44417
44418 /* Compensate: xa = xa - (xa > operand1 ? 1 : 0) */
44419 tmp = ix86_expand_sse_compare_mask (UNGT, xa, res, !do_floor);
44420 emit_insn (gen_rtx_SET (tmp, gen_rtx_AND (mode, one, tmp)));
44421 tmp = expand_simple_binop (mode, do_floor ? MINUS : PLUS,
44422 xa, tmp, NULL_RTX, 0, OPTAB_DIRECT);
44423 emit_move_insn (res, tmp);
44424
44425 if (HONOR_SIGNED_ZEROS (mode))
44426 ix86_sse_copysign_to_positive (res, res, force_reg (mode, operand1), mask);
44427
44428 emit_label (label);
44429 LABEL_NUSES (label) = 1;
44430
44431 emit_move_insn (operand0, res);
44432 }
44433
44434 /* Expand SSE sequence for computing round from OPERAND1 storing
44435 into OPERAND0. Sequence that works without relying on DImode truncation
44436 via cvttsd2siq that is only available on 64bit targets. */
44437 void
44438 ix86_expand_rounddf_32 (rtx operand0, rtx operand1)
44439 {
44440 /* C code for the stuff we expand below.
44441 double xa = fabs (x), xa2, x2;
44442 if (!isless (xa, TWO52))
44443 return x;
44444 Using the absolute value and copying back sign makes
44445 -0.0 -> -0.0 correct.
44446 xa2 = xa + TWO52 - TWO52;
44447 Compensate.
44448 dxa = xa2 - xa;
44449 if (dxa <= -0.5)
44450 xa2 += 1;
44451 else if (dxa > 0.5)
44452 xa2 -= 1;
44453 x2 = copysign (xa2, x);
44454 return x2;
44455 */
44456 machine_mode mode = GET_MODE (operand0);
44457 rtx xa, xa2, dxa, TWO52, tmp, half, mhalf, one, res, mask;
44458 rtx_code_label *label;
44459
44460 TWO52 = ix86_gen_TWO52 (mode);
44461
44462 /* Temporary for holding the result, initialized to the input
44463 operand to ease control flow. */
44464 res = gen_reg_rtx (mode);
44465 emit_move_insn (res, operand1);
44466
44467 /* xa = abs (operand1) */
44468 xa = ix86_expand_sse_fabs (res, &mask);
44469
44470 /* if (!isless (xa, TWO52)) goto label; */
44471 label = ix86_expand_sse_compare_and_jump (UNLE, TWO52, xa, false);
44472
44473 /* xa2 = xa + TWO52 - TWO52; */
44474 xa2 = expand_simple_binop (mode, PLUS, xa, TWO52, NULL_RTX, 0, OPTAB_DIRECT);
44475 xa2 = expand_simple_binop (mode, MINUS, xa2, TWO52, xa2, 0, OPTAB_DIRECT);
44476
44477 /* dxa = xa2 - xa; */
44478 dxa = expand_simple_binop (mode, MINUS, xa2, xa, NULL_RTX, 0, OPTAB_DIRECT);
44479
44480 /* generate 0.5, 1.0 and -0.5 */
44481 half = force_reg (mode, const_double_from_real_value (dconsthalf, mode));
44482 one = expand_simple_binop (mode, PLUS, half, half, NULL_RTX, 0, OPTAB_DIRECT);
44483 mhalf = expand_simple_binop (mode, MINUS, half, one, NULL_RTX,
44484 0, OPTAB_DIRECT);
44485
44486 /* Compensate. */
44487 tmp = gen_reg_rtx (mode);
44488 /* xa2 = xa2 - (dxa > 0.5 ? 1 : 0) */
44489 tmp = ix86_expand_sse_compare_mask (UNGT, dxa, half, false);
44490 emit_insn (gen_rtx_SET (tmp, gen_rtx_AND (mode, one, tmp)));
44491 xa2 = expand_simple_binop (mode, MINUS, xa2, tmp, NULL_RTX, 0, OPTAB_DIRECT);
44492 /* xa2 = xa2 + (dxa <= -0.5 ? 1 : 0) */
44493 tmp = ix86_expand_sse_compare_mask (UNGE, mhalf, dxa, false);
44494 emit_insn (gen_rtx_SET (tmp, gen_rtx_AND (mode, one, tmp)));
44495 xa2 = expand_simple_binop (mode, PLUS, xa2, tmp, NULL_RTX, 0, OPTAB_DIRECT);
44496
44497 /* res = copysign (xa2, operand1) */
44498 ix86_sse_copysign_to_positive (res, xa2, force_reg (mode, operand1), mask);
44499
44500 emit_label (label);
44501 LABEL_NUSES (label) = 1;
44502
44503 emit_move_insn (operand0, res);
44504 }
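
/* A hedged scalar model of the compensation above (illustration only):

       xa2 = (xa + TWO52) - TWO52;    round-to-nearest-even
       dxa = xa2 - xa;
       if (dxa <= -0.5)
         xa2 += 1.0;
       else if (dxa > 0.5)
         xa2 -= 1.0;
       return copysign (xa2, x);

   so halfway cases end up away from zero instead of at the even neighbor
   the FPU picked.  */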
44505
44506 /* Expand SSE sequence for computing trunc from OPERAND1 storing
44507 into OPERAND0. */
44508 void
44509 ix86_expand_trunc (rtx operand0, rtx operand1)
44510 {
44511 /* C code for SSE variant we expand below.
44512 double xa = fabs (x), x2;
44513 if (!isless (xa, TWO52))
44514 return x;
44515 x2 = (double)(long)x;
44516 if (HONOR_SIGNED_ZEROS (mode))
44517 return copysign (x2, x);
44518 return x2;
44519 */
44520 machine_mode mode = GET_MODE (operand0);
44521 rtx xa, xi, TWO52, res, mask;
44522 rtx_code_label *label;
44523
44524 TWO52 = ix86_gen_TWO52 (mode);
44525
44526 /* Temporary for holding the result, initialized to the input
44527 operand to ease control flow. */
44528 res = gen_reg_rtx (mode);
44529 emit_move_insn (res, operand1);
44530
44531 /* xa = abs (operand1) */
44532 xa = ix86_expand_sse_fabs (res, &mask);
44533
44534 /* if (!isless (xa, TWO52)) goto label; */
44535 label = ix86_expand_sse_compare_and_jump (UNLE, TWO52, xa, false);
44536
44537 /* x = (double)(long)x */
44538 xi = gen_reg_rtx (mode == DFmode ? DImode : SImode);
44539 expand_fix (xi, res, 0);
44540 expand_float (res, xi, 0);
44541
44542 if (HONOR_SIGNED_ZEROS (mode))
44543 ix86_sse_copysign_to_positive (res, res, force_reg (mode, operand1), mask);
44544
44545 emit_label (label);
44546 LABEL_NUSES (label) = 1;
44547
44548 emit_move_insn (operand0, res);
44549 }
44550
44551 /* Expand SSE sequence for computing trunc from OPERAND1 storing into
44552 OPERAND0 without relying on 64-bit cvttsd2siq truncation. */
44553 void
44554 ix86_expand_truncdf_32 (rtx operand0, rtx operand1)
44555 {
44556 machine_mode mode = GET_MODE (operand0);
44557 rtx xa, mask, TWO52, one, res, smask, tmp;
44558 rtx_code_label *label;
44559
44560 /* C code for SSE variant we expand below.
44561 double xa = fabs (x), x2;
44562 if (!isless (xa, TWO52))
44563 return x;
44564 xa2 = xa + TWO52 - TWO52;
44565 Compensate:
44566 if (xa2 > xa)
44567 xa2 -= 1.0;
44568 x2 = copysign (xa2, x);
44569 return x2;
44570 */
44571
44572 TWO52 = ix86_gen_TWO52 (mode);
44573
44574 /* Temporary for holding the result, initialized to the input
44575 operand to ease control flow. */
44576 res = gen_reg_rtx (mode);
44577 emit_move_insn (res, operand1);
44578
44579 /* xa = abs (operand1) */
44580 xa = ix86_expand_sse_fabs (res, &smask);
44581
44582 /* if (!isless (xa, TWO52)) goto label; */
44583 label = ix86_expand_sse_compare_and_jump (UNLE, TWO52, xa, false);
44584
44585 /* res = xa + TWO52 - TWO52; */
44586 tmp = expand_simple_binop (mode, PLUS, xa, TWO52, NULL_RTX, 0, OPTAB_DIRECT);
44587 tmp = expand_simple_binop (mode, MINUS, tmp, TWO52, tmp, 0, OPTAB_DIRECT);
44588 emit_move_insn (res, tmp);
44589
44590 /* generate 1.0 */
44591 one = force_reg (mode, const_double_from_real_value (dconst1, mode));
44592
44593 /* Compensate: res = xa2 - (res > xa ? 1 : 0) */
44594 mask = ix86_expand_sse_compare_mask (UNGT, res, xa, false);
44595 emit_insn (gen_rtx_SET (mask, gen_rtx_AND (mode, mask, one)));
44596 tmp = expand_simple_binop (mode, MINUS,
44597 res, mask, NULL_RTX, 0, OPTAB_DIRECT);
44598 emit_move_insn (res, tmp);
44599
44600 /* res = copysign (res, operand1) */
44601 ix86_sse_copysign_to_positive (res, res, force_reg (mode, operand1), smask);
44602
44603 emit_label (label);
44604 LABEL_NUSES (label) = 1;
44605
44606 emit_move_insn (operand0, res);
44607 }
44608
44609 /* Expand SSE sequence for computing round from OPERAND1 storing
44610 into OPERAND0. */
44611 void
44612 ix86_expand_round (rtx operand0, rtx operand1)
44613 {
44614 /* C code for the stuff we're doing below:
44615 double xa = fabs (x);
44616 if (!isless (xa, TWO52))
44617 return x;
44618 xa = (double)(long)(xa + nextafter (0.5, 0.0));
44619 return copysign (xa, x);
44620 */
44621 machine_mode mode = GET_MODE (operand0);
44622 rtx res, TWO52, xa, xi, half, mask;
44623 rtx_code_label *label;
44624 const struct real_format *fmt;
44625 REAL_VALUE_TYPE pred_half, half_minus_pred_half;
44626
44627 /* Temporary for holding the result, initialized to the input
44628 operand to ease control flow. */
44629 res = gen_reg_rtx (mode);
44630 emit_move_insn (res, operand1);
44631
44632 TWO52 = ix86_gen_TWO52 (mode);
44633 xa = ix86_expand_sse_fabs (res, &mask);
44634 label = ix86_expand_sse_compare_and_jump (UNLE, TWO52, xa, false);
44635
44636 /* load nextafter (0.5, 0.0) */
44637 fmt = REAL_MODE_FORMAT (mode);
44638 real_2expN (&half_minus_pred_half, -(fmt->p) - 1, mode);
44639 real_arithmetic (&pred_half, MINUS_EXPR, &dconsthalf, &half_minus_pred_half);
44640
44641 /* xa = xa + 0.5 */
44642 half = force_reg (mode, const_double_from_real_value (pred_half, mode));
44643 xa = expand_simple_binop (mode, PLUS, xa, half, NULL_RTX, 0, OPTAB_DIRECT);
44644
44645 /* xa = (double)(int64_t)xa */
44646 xi = gen_reg_rtx (mode == DFmode ? DImode : SImode);
44647 expand_fix (xi, xa, 0);
44648 expand_float (xa, xi, 0);
44649
44650 /* res = copysign (xa, operand1) */
44651 ix86_sse_copysign_to_positive (res, xa, force_reg (mode, operand1), mask);
44652
44653 emit_label (label);
44654 LABEL_NUSES (label) = 1;
44655
44656 emit_move_insn (operand0, res);
44657 }
44658
44659 /* Expand SSE sequence for computing round
44660 from OP1 storing into OP0 using sse4 round insn. */
44661 void
44662 ix86_expand_round_sse4 (rtx op0, rtx op1)
44663 {
44664 machine_mode mode = GET_MODE (op0);
44665 rtx e1, e2, res, half;
44666 const struct real_format *fmt;
44667 REAL_VALUE_TYPE pred_half, half_minus_pred_half;
44668 rtx (*gen_copysign) (rtx, rtx, rtx);
44669 rtx (*gen_round) (rtx, rtx, rtx);
44670
44671 switch (mode)
44672 {
44673 case SFmode:
44674 gen_copysign = gen_copysignsf3;
44675 gen_round = gen_sse4_1_roundsf2;
44676 break;
44677 case DFmode:
44678 gen_copysign = gen_copysigndf3;
44679 gen_round = gen_sse4_1_rounddf2;
44680 break;
44681 default:
44682 gcc_unreachable ();
44683 }
44684
44685 /* round (a) = trunc (a + copysign (0.5, a)) */
44686
44687 /* load nextafter (0.5, 0.0) */
44688 fmt = REAL_MODE_FORMAT (mode);
44689 real_2expN (&half_minus_pred_half, -(fmt->p) - 1, mode);
44690 real_arithmetic (&pred_half, MINUS_EXPR, &dconsthalf, &half_minus_pred_half);
44691 half = const_double_from_real_value (pred_half, mode);
44692
44693 /* e1 = copysign (0.5, op1) */
44694 e1 = gen_reg_rtx (mode);
44695 emit_insn (gen_copysign (e1, half, op1));
44696
44697 /* e2 = op1 + e1 */
44698 e2 = expand_simple_binop (mode, PLUS, op1, e1, NULL_RTX, 0, OPTAB_DIRECT);
44699
44700 /* res = trunc (e2) */
44701 res = gen_reg_rtx (mode);
44702 emit_insn (gen_round (res, e2, GEN_INT (ROUND_TRUNC)));
44703
44704 emit_move_insn (op0, res);
44705 }
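
/* A hedged scalar model of the sequence above (illustration only):

       half = nextafter (0.5, 0.0);
       return trunc (a + copysign (half, a));

   where the trunc maps to the SSE4.1 round instruction with ROUND_TRUNC.  */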
44706 \f
44707
44708 /* Table of valid machine attributes. */
44709 static const struct attribute_spec ix86_attribute_table[] =
44710 {
44711 /* { name, min_len, max_len, decl_req, type_req, fn_type_req, handler,
44712 affects_type_identity } */
44713 /* Stdcall attribute says callee is responsible for popping arguments
44714 if they are not variable. */
44715 { "stdcall", 0, 0, false, true, true, ix86_handle_cconv_attribute,
44716 true },
44717 /* Fastcall attribute says callee is responsible for popping arguments
44718 if they are not variable. */
44719 { "fastcall", 0, 0, false, true, true, ix86_handle_cconv_attribute,
44720 true },
44721 /* Thiscall attribute says callee is responsible for popping arguments
44722 if they are not variable. */
44723 { "thiscall", 0, 0, false, true, true, ix86_handle_cconv_attribute,
44724 true },
44725 /* Cdecl attribute says the callee is a normal C declaration */
44726 { "cdecl", 0, 0, false, true, true, ix86_handle_cconv_attribute,
44727 true },
44728 /* Regparm attribute specifies how many integer arguments are to be
44729 passed in registers. */
44730 { "regparm", 1, 1, false, true, true, ix86_handle_cconv_attribute,
44731 true },
44732 /* Sseregparm attribute says we are using x86_64 calling conventions
44733 for FP arguments. */
44734 { "sseregparm", 0, 0, false, true, true, ix86_handle_cconv_attribute,
44735 true },
44736 /* The transactional memory builtins are implicitly regparm or fastcall
44737 depending on the ABI. Override the generic do-nothing attribute that
44738 these builtins were declared with. */
44739 { "*tm regparm", 0, 0, false, true, true, ix86_handle_tm_regparm_attribute,
44740 true },
44741 /* force_align_arg_pointer says this function realigns the stack at entry. */
44742 { (const char *)&ix86_force_align_arg_pointer_string, 0, 0,
44743 false, true, true, ix86_handle_force_align_arg_pointer_attribute, false },
44744 #if TARGET_DLLIMPORT_DECL_ATTRIBUTES
44745 { "dllimport", 0, 0, false, false, false, handle_dll_attribute, false },
44746 { "dllexport", 0, 0, false, false, false, handle_dll_attribute, false },
44747 { "shared", 0, 0, true, false, false, ix86_handle_shared_attribute,
44748 false },
44749 #endif
44750 { "ms_struct", 0, 0, false, false, false, ix86_handle_struct_attribute,
44751 false },
44752 { "gcc_struct", 0, 0, false, false, false, ix86_handle_struct_attribute,
44753 false },
44754 #ifdef SUBTARGET_ATTRIBUTE_TABLE
44755 SUBTARGET_ATTRIBUTE_TABLE,
44756 #endif
44757 /* ms_abi and sysv_abi calling convention function attributes. */
44758 { "ms_abi", 0, 0, false, true, true, ix86_handle_abi_attribute, true },
44759 { "sysv_abi", 0, 0, false, true, true, ix86_handle_abi_attribute, true },
44760 { "ms_abi va_list", 0, 0, false, false, false, NULL, false },
44761 { "sysv_abi va_list", 0, 0, false, false, false, NULL, false },
44762 { "ms_hook_prologue", 0, 0, true, false, false, ix86_handle_fndecl_attribute,
44763 false },
44764 { "callee_pop_aggregate_return", 1, 1, false, true, true,
44765 ix86_handle_callee_pop_aggregate_return, true },
44766 { "interrupt", 0, 0, false, true, true,
44767 ix86_handle_interrupt_attribute, false },
44768 { "no_caller_saved_registers", 0, 0, false, true, true,
44769 ix86_handle_no_caller_saved_registers_attribute, false },
44770
44771 /* End element. */
44772 { NULL, 0, 0, false, false, false, NULL, false }
44773 };
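
/* For illustration (hypothetical declarations), two of the calling
   convention attributes above in use:

       int __attribute__ ((regparm (3))) f3 (int a, int b, int c);
       int __attribute__ ((fastcall)) f2 (int a, int b);

   regparm (3) passes the first three integer arguments in EAX, EDX and ECX;
   fastcall passes the first two in ECX and EDX and makes the callee pop the
   stack arguments.  */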
44774
44775 /* Implement targetm.vectorize.builtin_vectorization_cost. */
44776 static int
44777 ix86_builtin_vectorization_cost (enum vect_cost_for_stmt type_of_cost,
44778 tree vectype, int)
44779 {
44780 switch (type_of_cost)
44781 {
44782 case scalar_stmt:
44783 return ix86_cost->scalar_stmt_cost;
44784
44785 case scalar_load:
44786 return ix86_cost->scalar_load_cost;
44787
44788 case scalar_store:
44789 return ix86_cost->scalar_store_cost;
44790
44791 case vector_stmt:
44792 return ix86_cost->vec_stmt_cost;
44793
44794 case vector_load:
44795 return ix86_cost->vec_align_load_cost;
44796
44797 case vector_store:
44798 return ix86_cost->vec_store_cost;
44799
44800 case vec_to_scalar:
44801 return ix86_cost->vec_to_scalar_cost;
44802
44803 case scalar_to_vec:
44804 return ix86_cost->scalar_to_vec_cost;
44805
44806 case unaligned_load:
44807 case unaligned_store:
44808 return ix86_cost->vec_unalign_load_cost;
44809
44810 case cond_branch_taken:
44811 return ix86_cost->cond_taken_branch_cost;
44812
44813 case cond_branch_not_taken:
44814 return ix86_cost->cond_not_taken_branch_cost;
44815
44816 case vec_perm:
44817 case vec_promote_demote:
44818 return ix86_cost->vec_stmt_cost;
44819
44820 case vec_construct:
44821 return ix86_cost->vec_stmt_cost * (TYPE_VECTOR_SUBPARTS (vectype) - 1);
44822
44823 default:
44824 gcc_unreachable ();
44825 }
44826 }
44827
44828 /* A cached (set (nil) (vselect (vconcat (nil) (nil)) (parallel [])))
44829 insn, so that expand_vselect{,_vconcat} doesn't have to create a fresh
44830 insn every time. */
44831
44832 static GTY(()) rtx_insn *vselect_insn;
44833
44834 /* Initialize vselect_insn. */
44835
44836 static void
44837 init_vselect_insn (void)
44838 {
44839 unsigned i;
44840 rtx x;
44841
44842 x = gen_rtx_PARALLEL (VOIDmode, rtvec_alloc (MAX_VECT_LEN));
44843 for (i = 0; i < MAX_VECT_LEN; ++i)
44844 XVECEXP (x, 0, i) = const0_rtx;
44845 x = gen_rtx_VEC_SELECT (V2DFmode, gen_rtx_VEC_CONCAT (V4DFmode, const0_rtx,
44846 const0_rtx), x);
44847 x = gen_rtx_SET (const0_rtx, x);
44848 start_sequence ();
44849 vselect_insn = emit_insn (x);
44850 end_sequence ();
44851 }
44852
44853 /* Construct (set target (vec_select op0 (parallel perm))) and
44854 return true if that's a valid instruction in the active ISA. */
44855
44856 static bool
44857 expand_vselect (rtx target, rtx op0, const unsigned char *perm,
44858 unsigned nelt, bool testing_p)
44859 {
44860 unsigned int i;
44861 rtx x, save_vconcat;
44862 int icode;
44863
44864 if (vselect_insn == NULL_RTX)
44865 init_vselect_insn ();
44866
44867 x = XEXP (SET_SRC (PATTERN (vselect_insn)), 1);
44868 PUT_NUM_ELEM (XVEC (x, 0), nelt);
44869 for (i = 0; i < nelt; ++i)
44870 XVECEXP (x, 0, i) = GEN_INT (perm[i]);
44871 save_vconcat = XEXP (SET_SRC (PATTERN (vselect_insn)), 0);
44872 XEXP (SET_SRC (PATTERN (vselect_insn)), 0) = op0;
44873 PUT_MODE (SET_SRC (PATTERN (vselect_insn)), GET_MODE (target));
44874 SET_DEST (PATTERN (vselect_insn)) = target;
44875 icode = recog_memoized (vselect_insn);
44876
44877 if (icode >= 0 && !testing_p)
44878 emit_insn (copy_rtx (PATTERN (vselect_insn)));
44879
44880 SET_DEST (PATTERN (vselect_insn)) = const0_rtx;
44881 XEXP (SET_SRC (PATTERN (vselect_insn)), 0) = save_vconcat;
44882 INSN_CODE (vselect_insn) = -1;
44883
44884 return icode >= 0;
44885 }
44886
44887 /* Similar, but generate a vec_concat from op0 and op1 as well. */
44888
44889 static bool
44890 expand_vselect_vconcat (rtx target, rtx op0, rtx op1,
44891 const unsigned char *perm, unsigned nelt,
44892 bool testing_p)
44893 {
44894 machine_mode v2mode;
44895 rtx x;
44896 bool ok;
44897
44898 if (vselect_insn == NULL_RTX)
44899 init_vselect_insn ();
44900
44901 v2mode = GET_MODE_2XWIDER_MODE (GET_MODE (op0));
44902 x = XEXP (SET_SRC (PATTERN (vselect_insn)), 0);
44903 PUT_MODE (x, v2mode);
44904 XEXP (x, 0) = op0;
44905 XEXP (x, 1) = op1;
44906 ok = expand_vselect (target, x, perm, nelt, testing_p);
44907 XEXP (x, 0) = const0_rtx;
44908 XEXP (x, 1) = const0_rtx;
44909 return ok;
44910 }
44911
44912 /* A subroutine of ix86_expand_vec_perm_builtin_1. Try to implement D
44913 in terms of blendp[sd] / pblendw / pblendvb / vpblendd. */
44914
44915 static bool
44916 expand_vec_perm_blend (struct expand_vec_perm_d *d)
44917 {
44918 machine_mode mmode, vmode = d->vmode;
44919 unsigned i, mask, nelt = d->nelt;
44920 rtx target, op0, op1, maskop, x;
44921 rtx rperm[32], vperm;
44922
44923 if (d->one_operand_p)
44924 return false;
44925 if (TARGET_AVX512F && GET_MODE_SIZE (vmode) == 64
44926 && (TARGET_AVX512BW
44927 || GET_MODE_UNIT_SIZE (vmode) >= 4))
44928 ;
44929 else if (TARGET_AVX2 && GET_MODE_SIZE (vmode) == 32)
44930 ;
44931 else if (TARGET_AVX && (vmode == V4DFmode || vmode == V8SFmode))
44932 ;
44933 else if (TARGET_SSE4_1 && GET_MODE_SIZE (vmode) == 16)
44934 ;
44935 else
44936 return false;
44937
44938 /* This is a blend, not a permute. Elements must stay in their
44939 respective lanes. */
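/* Illustrative example: for V4SImode, {0, 5, 2, 7} is a blend, since each
element i is taken from position i of either op0 or op1; {1, 5, 2, 7}
is not, because element 0 would have to move.  */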
44940 for (i = 0; i < nelt; ++i)
44941 {
44942 unsigned e = d->perm[i];
44943 if (!(e == i || e == i + nelt))
44944 return false;
44945 }
44946
44947 if (d->testing_p)
44948 return true;
44949
44950 /* ??? Without SSE4.1, we could implement this with and/andn/or. This
44951 decision should be extracted elsewhere, so that we only try that
44952 sequence once all budget==3 options have been tried. */
44953 target = d->target;
44954 op0 = d->op0;
44955 op1 = d->op1;
44956 mask = 0;
44957
44958 switch (vmode)
44959 {
44960 case V8DFmode:
44961 case V16SFmode:
44962 case V4DFmode:
44963 case V8SFmode:
44964 case V2DFmode:
44965 case V4SFmode:
44966 case V8HImode:
44967 case V8SImode:
44968 case V32HImode:
44969 case V64QImode:
44970 case V16SImode:
44971 case V8DImode:
44972 for (i = 0; i < nelt; ++i)
44973 mask |= (d->perm[i] >= nelt) << i;
44974 break;
44975
44976 case V2DImode:
44977 for (i = 0; i < 2; ++i)
44978 mask |= (d->perm[i] >= 2 ? 15 : 0) << (i * 4);
44979 vmode = V8HImode;
44980 goto do_subreg;
44981
44982 case V4SImode:
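/* Illustrative example: for the V4SI blend {0, 5, 2, 7}, elements 1 and 3
come from the second operand; widening each dword choice to two word bits
gives a pblendw immediate of 0xcc.  */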
44983 for (i = 0; i < 4; ++i)
44984 mask |= (d->perm[i] >= 4 ? 3 : 0) << (i * 2);
44985 vmode = V8HImode;
44986 goto do_subreg;
44987
44988 case V16QImode:
44989 /* See if bytes move in pairs so we can use pblendw with
44990 an immediate argument, rather than pblendvb with a vector
44991 argument. */
44992 for (i = 0; i < 16; i += 2)
44993 if (d->perm[i] + 1 != d->perm[i + 1])
44994 {
44995 use_pblendvb:
44996 for (i = 0; i < nelt; ++i)
44997 rperm[i] = (d->perm[i] < nelt ? const0_rtx : constm1_rtx);
44998
44999 finish_pblendvb:
45000 vperm = gen_rtx_CONST_VECTOR (vmode, gen_rtvec_v (nelt, rperm));
45001 vperm = force_reg (vmode, vperm);
45002
45003 if (GET_MODE_SIZE (vmode) == 16)
45004 emit_insn (gen_sse4_1_pblendvb (target, op0, op1, vperm));
45005 else
45006 emit_insn (gen_avx2_pblendvb (target, op0, op1, vperm));
45007 if (target != d->target)
45008 emit_move_insn (d->target, gen_lowpart (d->vmode, target));
45009 return true;
45010 }
45011
45012 for (i = 0; i < 8; ++i)
45013 mask |= (d->perm[i * 2] >= 16) << i;
45014 vmode = V8HImode;
45015 /* FALLTHRU */
45016
45017 do_subreg:
45018 target = gen_reg_rtx (vmode);
45019 op0 = gen_lowpart (vmode, op0);
45020 op1 = gen_lowpart (vmode, op1);
45021 break;
45022
45023 case V32QImode:
45024 /* See if bytes move in pairs. If not, vpblendvb must be used. */
45025 for (i = 0; i < 32; i += 2)
45026 if (d->perm[i] + 1 != d->perm[i + 1])
45027 goto use_pblendvb;
45028 /* See if bytes move in quadruplets. If yes, vpblendd
45029 with immediate can be used. */
45030 for (i = 0; i < 32; i += 4)
45031 if (d->perm[i] + 2 != d->perm[i + 2])
45032 break;
45033 if (i < 32)
45034 {
45035 /* See if bytes move the same in both lanes. If yes,
45036 vpblendw with immediate can be used. */
45037 for (i = 0; i < 16; i += 2)
45038 if (d->perm[i] + 16 != d->perm[i + 16])
45039 goto use_pblendvb;
45040
45041 /* Use vpblendw. */
45042 for (i = 0; i < 16; ++i)
45043 mask |= (d->perm[i * 2] >= 32) << i;
45044 vmode = V16HImode;
45045 goto do_subreg;
45046 }
45047
45048 /* Use vpblendd. */
45049 for (i = 0; i < 8; ++i)
45050 mask |= (d->perm[i * 4] >= 32) << i;
45051 vmode = V8SImode;
45052 goto do_subreg;
45053
45054 case V16HImode:
45055 /* See if words move in pairs. If yes, vpblendd can be used. */
45056 for (i = 0; i < 16; i += 2)
45057 if (d->perm[i] + 1 != d->perm[i + 1])
45058 break;
45059 if (i < 16)
45060 {
45061 /* See if words move the same in both lanes. If not,
45062 vpblendvb must be used. */
45063 for (i = 0; i < 8; i++)
45064 if (d->perm[i] + 8 != d->perm[i + 8])
45065 {
45066 /* Use vpblendvb. */
45067 for (i = 0; i < 32; ++i)
45068 rperm[i] = (d->perm[i / 2] < 16 ? const0_rtx : constm1_rtx);
45069
45070 vmode = V32QImode;
45071 nelt = 32;
45072 target = gen_reg_rtx (vmode);
45073 op0 = gen_lowpart (vmode, op0);
45074 op1 = gen_lowpart (vmode, op1);
45075 goto finish_pblendvb;
45076 }
45077
45078 /* Use vpblendw. */
45079 for (i = 0; i < 16; ++i)
45080 mask |= (d->perm[i] >= 16) << i;
45081 break;
45082 }
45083
45084 /* Use vpblendd. */
45085 for (i = 0; i < 8; ++i)
45086 mask |= (d->perm[i * 2] >= 16) << i;
45087 vmode = V8SImode;
45088 goto do_subreg;
45089
45090 case V4DImode:
45091 /* Use vpblendd. */
45092 for (i = 0; i < 4; ++i)
45093 mask |= (d->perm[i] >= 4 ? 3 : 0) << (i * 2);
45094 vmode = V8SImode;
45095 goto do_subreg;
45096
45097 default:
45098 gcc_unreachable ();
45099 }
45100
45101 switch (vmode)
45102 {
45103 case V8DFmode:
45104 case V8DImode:
45105 mmode = QImode;
45106 break;
45107 case V16SFmode:
45108 case V16SImode:
45109 mmode = HImode;
45110 break;
45111 case V32HImode:
45112 mmode = SImode;
45113 break;
45114 case V64QImode:
45115 mmode = DImode;
45116 break;
45117 default:
45118 mmode = VOIDmode;
45119 }
45120
45121 if (mmode != VOIDmode)
45122 maskop = force_reg (mmode, gen_int_mode (mask, mmode));
45123 else
45124 maskop = GEN_INT (mask);
45125
45126 /* This matches five different patterns with the different modes. */
45127 x = gen_rtx_VEC_MERGE (vmode, op1, op0, maskop);
45128 x = gen_rtx_SET (target, x);
45129 emit_insn (x);
45130 if (target != d->target)
45131 emit_move_insn (d->target, gen_lowpart (d->vmode, target));
45132
45133 return true;
45134 }
45135
45136 /* A subroutine of ix86_expand_vec_perm_builtin_1. Try to implement D
45137 in terms of the variable form of vpermilps.
45138
45139 Note that we will have already failed the immediate input vpermilps,
45140 which requires that the high and low part shuffle be identical; the
45141 variable form doesn't require that. */
45142
45143 static bool
45144 expand_vec_perm_vpermil (struct expand_vec_perm_d *d)
45145 {
45146 rtx rperm[8], vperm;
45147 unsigned i;
45148
45149 if (!TARGET_AVX || d->vmode != V8SFmode || !d->one_operand_p)
45150 return false;
45151
45152 /* We can only permute within the 128-bit lane. */
45153 for (i = 0; i < 8; ++i)
45154 {
45155 unsigned e = d->perm[i];
45156 if (i < 4 ? e >= 4 : e < 4)
45157 return false;
45158 }
45159
45160 if (d->testing_p)
45161 return true;
45162
45163 for (i = 0; i < 8; ++i)
45164 {
45165 unsigned e = d->perm[i];
45166
45167 /* Within each 128-bit lane, the elements of op0 are numbered
45168 from 0 and the elements of op1 are numbered from 4. */
45169 if (e >= 8 + 4)
45170 e -= 8;
45171 else if (e >= 4)
45172 e -= 4;
45173
45174 rperm[i] = GEN_INT (e);
45175 }
45176
45177 vperm = gen_rtx_CONST_VECTOR (V8SImode, gen_rtvec_v (8, rperm));
45178 vperm = force_reg (V8SImode, vperm);
45179 emit_insn (gen_avx_vpermilvarv8sf3 (d->target, d->op0, vperm));
45180
45181 return true;
45182 }
45183
45184 /* Return true if permutation D can be performed as VMODE permutation
45185 instead. */
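/* Illustrative example: a V16QI permutation can be performed as a V4SI
permutation iff the bytes move in aligned groups of four, e.g.
{4 5 6 7 0 1 2 3 12 13 14 15 8 9 10 11} is the V4SI permutation
{1 0 3 2}.  */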
45186
45187 static bool
45188 valid_perm_using_mode_p (machine_mode vmode, struct expand_vec_perm_d *d)
45189 {
45190 unsigned int i, j, chunk;
45191
45192 if (GET_MODE_CLASS (vmode) != MODE_VECTOR_INT
45193 || GET_MODE_CLASS (d->vmode) != MODE_VECTOR_INT
45194 || GET_MODE_SIZE (vmode) != GET_MODE_SIZE (d->vmode))
45195 return false;
45196
45197 if (GET_MODE_NUNITS (vmode) >= d->nelt)
45198 return true;
45199
45200 chunk = d->nelt / GET_MODE_NUNITS (vmode);
45201 for (i = 0; i < d->nelt; i += chunk)
45202 if (d->perm[i] & (chunk - 1))
45203 return false;
45204 else
45205 for (j = 1; j < chunk; ++j)
45206 if (d->perm[i] + j != d->perm[i + j])
45207 return false;
45208
45209 return true;
45210 }
45211
45212 /* A subroutine of ix86_expand_vec_perm_builtin_1. Try to implement D
45213 in terms of pshufb, vpperm, vpermq, vpermd, vpermps or vperm2i128. */
45214
45215 static bool
45216 expand_vec_perm_pshufb (struct expand_vec_perm_d *d)
45217 {
45218 unsigned i, nelt, eltsz, mask;
45219 unsigned char perm[64];
45220 machine_mode vmode = V16QImode;
45221 rtx rperm[64], vperm, target, op0, op1;
45222
45223 nelt = d->nelt;
45224
45225 if (!d->one_operand_p)
45226 {
45227 if (!TARGET_XOP || GET_MODE_SIZE (d->vmode) != 16)
45228 {
45229 if (TARGET_AVX2
45230 && valid_perm_using_mode_p (V2TImode, d))
45231 {
45232 if (d->testing_p)
45233 return true;
45234
45235 /* Use vperm2i128 insn. The pattern uses
45236 V4DImode instead of V2TImode. */
45237 target = d->target;
45238 if (d->vmode != V4DImode)
45239 target = gen_reg_rtx (V4DImode);
45240 op0 = gen_lowpart (V4DImode, d->op0);
45241 op1 = gen_lowpart (V4DImode, d->op1);
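/* Illustrative example: for a two-operand V4DI permutation {2, 3, 4, 5},
the result takes the high lane of op0 and the low lane of op1, so the
computed immediate is 1 | (2 * 16) == 0x21.  */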
45242 rperm[0]
45243 = GEN_INT ((d->perm[0] / (nelt / 2))
45244 | ((d->perm[nelt / 2] / (nelt / 2)) * 16));
45245 emit_insn (gen_avx2_permv2ti (target, op0, op1, rperm[0]));
45246 if (target != d->target)
45247 emit_move_insn (d->target, gen_lowpart (d->vmode, target));
45248 return true;
45249 }
45250 return false;
45251 }
45252 }
45253 else
45254 {
45255 if (GET_MODE_SIZE (d->vmode) == 16)
45256 {
45257 if (!TARGET_SSSE3)
45258 return false;
45259 }
45260 else if (GET_MODE_SIZE (d->vmode) == 32)
45261 {
45262 if (!TARGET_AVX2)
45263 return false;
45264
45265 /* V4DImode should already have been handled through
45266 expand_vselect by the vpermq instruction. */
45267 gcc_assert (d->vmode != V4DImode);
45268
45269 vmode = V32QImode;
45270 if (d->vmode == V8SImode
45271 || d->vmode == V16HImode
45272 || d->vmode == V32QImode)
45273 {
45274 /* First see if vpermq can be used for
45275 V8SImode/V16HImode/V32QImode. */
45276 if (valid_perm_using_mode_p (V4DImode, d))
45277 {
45278 for (i = 0; i < 4; i++)
45279 perm[i] = (d->perm[i * nelt / 4] * 4 / nelt) & 3;
45280 if (d->testing_p)
45281 return true;
45282 target = gen_reg_rtx (V4DImode);
45283 if (expand_vselect (target, gen_lowpart (V4DImode, d->op0),
45284 perm, 4, false))
45285 {
45286 emit_move_insn (d->target,
45287 gen_lowpart (d->vmode, target));
45288 return true;
45289 }
45290 return false;
45291 }
45292
45293 /* Next see if vpermd can be used. */
45294 if (valid_perm_using_mode_p (V8SImode, d))
45295 vmode = V8SImode;
45296 }
45297 /* Or if vpermps can be used. */
45298 else if (d->vmode == V8SFmode)
45299 vmode = V8SImode;
45300
45301 if (vmode == V32QImode)
45302 {
45303 /* vpshufb only works within 128-bit lanes; it is not
45304 possible to shuffle bytes between the lanes. */
45305 for (i = 0; i < nelt; ++i)
45306 if ((d->perm[i] ^ i) & (nelt / 2))
45307 return false;
45308 }
45309 }
45310 else if (GET_MODE_SIZE (d->vmode) == 64)
45311 {
45312 if (!TARGET_AVX512BW)
45313 return false;
45314
45315 /* If vpermq didn't work, vpshufb won't work either. */
45316 if (d->vmode == V8DFmode || d->vmode == V8DImode)
45317 return false;
45318
45319 vmode = V64QImode;
45320 if (d->vmode == V16SImode
45321 || d->vmode == V32HImode
45322 || d->vmode == V64QImode)
45323 {
45324 /* First see if vpermq can be used for
45325 V16SImode/V32HImode/V64QImode. */
45326 if (valid_perm_using_mode_p (V8DImode, d))
45327 {
45328 for (i = 0; i < 8; i++)
45329 perm[i] = (d->perm[i * nelt / 8] * 8 / nelt) & 7;
45330 if (d->testing_p)
45331 return true;
45332 target = gen_reg_rtx (V8DImode);
45333 if (expand_vselect (target, gen_lowpart (V8DImode, d->op0),
45334 perm, 8, false))
45335 {
45336 emit_move_insn (d->target,
45337 gen_lowpart (d->vmode, target));
45338 return true;
45339 }
45340 return false;
45341 }
45342
45343 /* Next see if vpermd can be used. */
45344 if (valid_perm_using_mode_p (V16SImode, d))
45345 vmode = V16SImode;
45346 }
45347 /* Or if vpermps can be used. */
45348 else if (d->vmode == V16SFmode)
45349 vmode = V16SImode;
45350 if (vmode == V64QImode)
45351 {
45352 /* vpshufb only works within 128-bit lanes; it is not
45353 possible to shuffle bytes between the lanes. */
45354 for (i = 0; i < nelt; ++i)
45355 if ((d->perm[i] ^ i) & (nelt / 4))
45356 return false;
45357 }
45358 }
45359 else
45360 return false;
45361 }
45362
45363 if (d->testing_p)
45364 return true;
45365
45366 if (vmode == V8SImode)
45367 for (i = 0; i < 8; ++i)
45368 rperm[i] = GEN_INT ((d->perm[i * nelt / 8] * 8 / nelt) & 7);
45369 else if (vmode == V16SImode)
45370 for (i = 0; i < 16; ++i)
45371 rperm[i] = GEN_INT ((d->perm[i * nelt / 16] * 16 / nelt) & 15);
45372 else
45373 {
45374 eltsz = GET_MODE_UNIT_SIZE (d->vmode);
45375 if (!d->one_operand_p)
45376 mask = 2 * nelt - 1;
45377 else if (vmode == V16QImode)
45378 mask = nelt - 1;
45379 else if (vmode == V64QImode)
45380 mask = nelt / 4 - 1;
45381 else
45382 mask = nelt / 2 - 1;
45383
45384 for (i = 0; i < nelt; ++i)
45385 {
45386 unsigned j, e = d->perm[i] & mask;
45387 for (j = 0; j < eltsz; ++j)
45388 rperm[i * eltsz + j] = GEN_INT (e * eltsz + j);
45389 }
45390 }
45391
45392 vperm = gen_rtx_CONST_VECTOR (vmode,
45393 gen_rtvec_v (GET_MODE_NUNITS (vmode), rperm));
45394 vperm = force_reg (vmode, vperm);
45395
45396 target = d->target;
45397 if (d->vmode != vmode)
45398 target = gen_reg_rtx (vmode);
45399 op0 = gen_lowpart (vmode, d->op0);
45400 if (d->one_operand_p)
45401 {
45402 if (vmode == V16QImode)
45403 emit_insn (gen_ssse3_pshufbv16qi3 (target, op0, vperm));
45404 else if (vmode == V32QImode)
45405 emit_insn (gen_avx2_pshufbv32qi3 (target, op0, vperm));
45406 else if (vmode == V64QImode)
45407 emit_insn (gen_avx512bw_pshufbv64qi3 (target, op0, vperm));
45408 else if (vmode == V8SFmode)
45409 emit_insn (gen_avx2_permvarv8sf (target, op0, vperm));
45410 else if (vmode == V8SImode)
45411 emit_insn (gen_avx2_permvarv8si (target, op0, vperm));
45412 else if (vmode == V16SFmode)
45413 emit_insn (gen_avx512f_permvarv16sf (target, op0, vperm));
45414 else if (vmode == V16SImode)
45415 emit_insn (gen_avx512f_permvarv16si (target, op0, vperm));
45416 else
45417 gcc_unreachable ();
45418 }
45419 else
45420 {
45421 op1 = gen_lowpart (vmode, d->op1);
45422 emit_insn (gen_xop_pperm (target, op0, op1, vperm));
45423 }
45424 if (target != d->target)
45425 emit_move_insn (d->target, gen_lowpart (d->vmode, target));
45426
45427 return true;
45428 }
45429
45430 /* For V*[QHS]Imode permutations, check whether the same permutation
45431 can be performed in a 2x, 4x or 8x wider inner mode instead. */
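/* Illustrative example: the V16QI permutation {2 3 0 1 6 7 4 5 10 11 8 9
14 15 12 13} moves bytes in aligned pairs, so it canonicalizes to the
V8HI permutation {1 0 3 2 5 4 7 6}.  */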
45432
45433 static bool
45434 canonicalize_vector_int_perm (const struct expand_vec_perm_d *d,
45435 struct expand_vec_perm_d *nd)
45436 {
45437 int i;
45438 enum machine_mode mode = VOIDmode;
45439
45440 switch (d->vmode)
45441 {
45442 case V16QImode: mode = V8HImode; break;
45443 case V32QImode: mode = V16HImode; break;
45444 case V64QImode: mode = V32HImode; break;
45445 case V8HImode: mode = V4SImode; break;
45446 case V16HImode: mode = V8SImode; break;
45447 case V32HImode: mode = V16SImode; break;
45448 case V4SImode: mode = V2DImode; break;
45449 case V8SImode: mode = V4DImode; break;
45450 case V16SImode: mode = V8DImode; break;
45451 default: return false;
45452 }
45453 for (i = 0; i < d->nelt; i += 2)
45454 if ((d->perm[i] & 1) || d->perm[i + 1] != d->perm[i] + 1)
45455 return false;
45456 nd->vmode = mode;
45457 nd->nelt = d->nelt / 2;
45458 for (i = 0; i < nd->nelt; i++)
45459 nd->perm[i] = d->perm[2 * i] / 2;
45460 if (GET_MODE_INNER (mode) != DImode)
45461 canonicalize_vector_int_perm (nd, nd);
45462 if (nd != d)
45463 {
45464 nd->one_operand_p = d->one_operand_p;
45465 nd->testing_p = d->testing_p;
45466 if (d->op0 == d->op1)
45467 nd->op0 = nd->op1 = gen_lowpart (nd->vmode, d->op0);
45468 else
45469 {
45470 nd->op0 = gen_lowpart (nd->vmode, d->op0);
45471 nd->op1 = gen_lowpart (nd->vmode, d->op1);
45472 }
45473 if (d->testing_p)
45474 nd->target = gen_raw_REG (nd->vmode, LAST_VIRTUAL_REGISTER + 1);
45475 else
45476 nd->target = gen_reg_rtx (nd->vmode);
45477 }
45478 return true;
45479 }
45480
45481 /* Try to expand a one-operand permutation with a constant mask. */
45482
45483 static bool
45484 ix86_expand_vec_one_operand_perm_avx512 (struct expand_vec_perm_d *d)
45485 {
45486 machine_mode mode = GET_MODE (d->op0);
45487 machine_mode maskmode = mode;
45488 rtx (*gen) (rtx, rtx, rtx) = NULL;
45489 rtx target, op0, mask;
45490 rtx vec[64];
45491
45492 if (!rtx_equal_p (d->op0, d->op1))
45493 return false;
45494
45495 if (!TARGET_AVX512F)
45496 return false;
45497
45498 switch (mode)
45499 {
45500 case V16SImode:
45501 gen = gen_avx512f_permvarv16si;
45502 break;
45503 case V16SFmode:
45504 gen = gen_avx512f_permvarv16sf;
45505 maskmode = V16SImode;
45506 break;
45507 case V8DImode:
45508 gen = gen_avx512f_permvarv8di;
45509 break;
45510 case V8DFmode:
45511 gen = gen_avx512f_permvarv8df;
45512 maskmode = V8DImode;
45513 break;
45514 default:
45515 return false;
45516 }
45517
45518 target = d->target;
45519 op0 = d->op0;
45520 for (int i = 0; i < d->nelt; ++i)
45521 vec[i] = GEN_INT (d->perm[i]);
45522 mask = gen_rtx_CONST_VECTOR (maskmode, gen_rtvec_v (d->nelt, vec));
45523 emit_insn (gen (target, op0, force_reg (maskmode, mask)));
45524 return true;
45525 }
45526
45527 /* A subroutine of ix86_expand_vec_perm_builtin_1. Try to instantiate D
45528 in a single instruction. */
45529
45530 static bool
45531 expand_vec_perm_1 (struct expand_vec_perm_d *d)
45532 {
45533 unsigned i, nelt = d->nelt;
45534 struct expand_vec_perm_d nd;
45535
45536 /* Check plain VEC_SELECT first, because AVX has instructions that could
45537 match both SEL and SEL+CONCAT, but the plain SEL will allow a memory
45538 input where SEL+CONCAT may not. */
45539 if (d->one_operand_p)
45540 {
45541 int mask = nelt - 1;
45542 bool identity_perm = true;
45543 bool broadcast_perm = true;
45544
45545 for (i = 0; i < nelt; i++)
45546 {
45547 nd.perm[i] = d->perm[i] & mask;
45548 if (nd.perm[i] != i)
45549 identity_perm = false;
45550 if (nd.perm[i])
45551 broadcast_perm = false;
45552 }
45553
45554 if (identity_perm)
45555 {
45556 if (!d->testing_p)
45557 emit_move_insn (d->target, d->op0);
45558 return true;
45559 }
45560 else if (broadcast_perm && TARGET_AVX2)
45561 {
45562 /* Use vpbroadcast{b,w,d}. */
45563 rtx (*gen) (rtx, rtx) = NULL;
45564 switch (d->vmode)
45565 {
45566 case V64QImode:
45567 if (TARGET_AVX512BW)
45568 gen = gen_avx512bw_vec_dupv64qi_1;
45569 break;
45570 case V32QImode:
45571 gen = gen_avx2_pbroadcastv32qi_1;
45572 break;
45573 case V32HImode:
45574 if (TARGET_AVX512BW)
45575 gen = gen_avx512bw_vec_dupv32hi_1;
45576 break;
45577 case V16HImode:
45578 gen = gen_avx2_pbroadcastv16hi_1;
45579 break;
45580 case V16SImode:
45581 if (TARGET_AVX512F)
45582 gen = gen_avx512f_vec_dupv16si_1;
45583 break;
45584 case V8SImode:
45585 gen = gen_avx2_pbroadcastv8si_1;
45586 break;
45587 case V16QImode:
45588 gen = gen_avx2_pbroadcastv16qi;
45589 break;
45590 case V8HImode:
45591 gen = gen_avx2_pbroadcastv8hi;
45592 break;
45593 case V16SFmode:
45594 if (TARGET_AVX512F)
45595 gen = gen_avx512f_vec_dupv16sf_1;
45596 break;
45597 case V8SFmode:
45598 gen = gen_avx2_vec_dupv8sf_1;
45599 break;
45600 case V8DFmode:
45601 if (TARGET_AVX512F)
45602 gen = gen_avx512f_vec_dupv8df_1;
45603 break;
45604 case V8DImode:
45605 if (TARGET_AVX512F)
45606 gen = gen_avx512f_vec_dupv8di_1;
45607 break;
45608 /* For other modes prefer other shuffles this function creates. */
45609 default: break;
45610 }
45611 if (gen != NULL)
45612 {
45613 if (!d->testing_p)
45614 emit_insn (gen (d->target, d->op0));
45615 return true;
45616 }
45617 }
45618
45619 if (expand_vselect (d->target, d->op0, nd.perm, nelt, d->testing_p))
45620 return true;
45621
45622 /* There are plenty of patterns in sse.md that are written for
45623 SEL+CONCAT and are not replicated for a single op. Perhaps
45624 that should be changed, to avoid the nastiness here. */
45625
45626 /* Recognize interleave style patterns, which means incrementing
45627 every other permutation operand. */
45628 for (i = 0; i < nelt; i += 2)
45629 {
45630 nd.perm[i] = d->perm[i] & mask;
45631 nd.perm[i + 1] = (d->perm[i + 1] & mask) + nelt;
45632 }
45633 if (expand_vselect_vconcat (d->target, d->op0, d->op0, nd.perm, nelt,
45634 d->testing_p))
45635 return true;
45636
45637 /* Recognize shufps, which means adding {0, 0, nelt, nelt}. */
45638 if (nelt >= 4)
45639 {
45640 for (i = 0; i < nelt; i += 4)
45641 {
45642 nd.perm[i + 0] = d->perm[i + 0] & mask;
45643 nd.perm[i + 1] = d->perm[i + 1] & mask;
45644 nd.perm[i + 2] = (d->perm[i + 2] & mask) + nelt;
45645 nd.perm[i + 3] = (d->perm[i + 3] & mask) + nelt;
45646 }
45647
45648 if (expand_vselect_vconcat (d->target, d->op0, d->op0, nd.perm, nelt,
45649 d->testing_p))
45650 return true;
45651 }
45652 }
45653
45654 /* Finally, try the fully general two operand permute. */
45655 if (expand_vselect_vconcat (d->target, d->op0, d->op1, d->perm, nelt,
45656 d->testing_p))
45657 return true;
45658
45659 /* Recognize interleave style patterns with reversed operands. */
45660 if (!d->one_operand_p)
45661 {
45662 for (i = 0; i < nelt; ++i)
45663 {
45664 unsigned e = d->perm[i];
45665 if (e >= nelt)
45666 e -= nelt;
45667 else
45668 e += nelt;
45669 nd.perm[i] = e;
45670 }
45671
45672 if (expand_vselect_vconcat (d->target, d->op1, d->op0, nd.perm, nelt,
45673 d->testing_p))
45674 return true;
45675 }
45676
45677 /* Try the SSE4.1 blend variable merge instructions. */
45678 if (expand_vec_perm_blend (d))
45679 return true;
45680
45681 /* Try one of the AVX vpermil variable permutations. */
45682 if (expand_vec_perm_vpermil (d))
45683 return true;
45684
45685 /* Try the SSSE3 pshufb or XOP vpperm or AVX2 vperm2i128,
45686 vpshufb, vpermd, vpermps or vpermq variable permutation. */
45687 if (expand_vec_perm_pshufb (d))
45688 return true;
45689
45690 /* Try the AVX2 vpalignr instruction. */
45691 if (expand_vec_perm_palignr (d, true))
45692 return true;
45693
45694 /* Try the AVX512F vperm{s,d} instructions. */
45695 if (ix86_expand_vec_one_operand_perm_avx512 (d))
45696 return true;
45697
45698 /* Try the AVX512F vpermi2 instructions. */
45699 if (ix86_expand_vec_perm_vpermi2 (NULL_RTX, NULL_RTX, NULL_RTX, NULL_RTX, d))
45700 return true;
45701
45702 /* See if we can get the same permutation in a different vector integer
45703 mode. */
45704 if (canonicalize_vector_int_perm (d, &nd) && expand_vec_perm_1 (&nd))
45705 {
45706 if (!d->testing_p)
45707 emit_move_insn (d->target, gen_lowpart (d->vmode, nd.target));
45708 return true;
45709 }
45710 return false;
45711 }
45712
45713 /* A subroutine of ix86_expand_vec_perm_builtin_1. Try to implement D
45714 in terms of a pair of pshuflw + pshufhw instructions. */
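/* Illustrative example: the V8HI permutation {3 1 2 0 7 5 6 4} is split
into a pshuflw {3 1 2 0} of the low quadword (high half kept as identity)
followed by a pshufhw {7 5 6 4} of the high quadword.  */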
45715
45716 static bool
45717 expand_vec_perm_pshuflw_pshufhw (struct expand_vec_perm_d *d)
45718 {
45719 unsigned char perm2[MAX_VECT_LEN];
45720 unsigned i;
45721 bool ok;
45722
45723 if (d->vmode != V8HImode || !d->one_operand_p)
45724 return false;
45725
45726 /* The two permutations only operate in 64-bit lanes. */
45727 for (i = 0; i < 4; ++i)
45728 if (d->perm[i] >= 4)
45729 return false;
45730 for (i = 4; i < 8; ++i)
45731 if (d->perm[i] < 4)
45732 return false;
45733
45734 if (d->testing_p)
45735 return true;
45736
45737 /* Emit the pshuflw. */
45738 memcpy (perm2, d->perm, 4);
45739 for (i = 4; i < 8; ++i)
45740 perm2[i] = i;
45741 ok = expand_vselect (d->target, d->op0, perm2, 8, d->testing_p);
45742 gcc_assert (ok);
45743
45744 /* Emit the pshufhw. */
45745 memcpy (perm2 + 4, d->perm + 4, 4);
45746 for (i = 0; i < 4; ++i)
45747 perm2[i] = i;
45748 ok = expand_vselect (d->target, d->target, perm2, 8, d->testing_p);
45749 gcc_assert (ok);
45750
45751 return true;
45752 }
45753
45754 /* A subroutine of ix86_expand_vec_perm_builtin_1. Try to simplify
45755 the permutation using the SSSE3 palignr instruction. This succeeds
45756 when all of the elements in PERM fit within one vector and we merely
45757 need to shift them down so that a single vector permutation has a
45758 chance to succeed. If SINGLE_INSN_ONLY_P, succeed if only
45759 the vpalignr instruction itself can perform the requested permutation. */
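/* Illustrative example: a two-operand V8HI permutation whose indices all
lie in {3, ..., 10} of the concatenated operands can first shift the pair
right by three elements with palignr and then finish with a single
one-operand shuffle of the result.  */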
45760
45761 static bool
45762 expand_vec_perm_palignr (struct expand_vec_perm_d *d, bool single_insn_only_p)
45763 {
45764 unsigned i, nelt = d->nelt;
45765 unsigned min, max, minswap, maxswap;
45766 bool in_order, ok, swap = false;
45767 rtx shift, target;
45768 struct expand_vec_perm_d dcopy;
45769
45770 /* Even with AVX, palignr only operates on 128-bit vectors;
45771 with AVX2, palignr operates on both 128-bit lanes. */
45772 if ((!TARGET_SSSE3 || GET_MODE_SIZE (d->vmode) != 16)
45773 && (!TARGET_AVX2 || GET_MODE_SIZE (d->vmode) != 32))
45774 return false;
45775
45776 min = 2 * nelt;
45777 max = 0;
45778 minswap = 2 * nelt;
45779 maxswap = 0;
45780 for (i = 0; i < nelt; ++i)
45781 {
45782 unsigned e = d->perm[i];
45783 unsigned eswap = d->perm[i] ^ nelt;
45784 if (GET_MODE_SIZE (d->vmode) == 32)
45785 {
45786 e = (e & ((nelt / 2) - 1)) | ((e & nelt) >> 1);
45787 eswap = e ^ (nelt / 2);
45788 }
45789 if (e < min)
45790 min = e;
45791 if (e > max)
45792 max = e;
45793 if (eswap < minswap)
45794 minswap = eswap;
45795 if (eswap > maxswap)
45796 maxswap = eswap;
45797 }
45798 if (min == 0
45799 || max - min >= (GET_MODE_SIZE (d->vmode) == 32 ? nelt / 2 : nelt))
45800 {
45801 if (d->one_operand_p
45802 || minswap == 0
45803 || maxswap - minswap >= (GET_MODE_SIZE (d->vmode) == 32
45804 ? nelt / 2 : nelt))
45805 return false;
45806 swap = true;
45807 min = minswap;
45808 max = maxswap;
45809 }
45810
45811 /* Given that we have SSSE3, we know we'll be able to implement the
45812 single operand permutation after the palignr with pshufb for
45813 128-bit vectors. If SINGLE_INSN_ONLY_P, in_order has to be computed
45814 first. */
45815 if (d->testing_p && GET_MODE_SIZE (d->vmode) == 16 && !single_insn_only_p)
45816 return true;
45817
45818 dcopy = *d;
45819 if (swap)
45820 {
45821 dcopy.op0 = d->op1;
45822 dcopy.op1 = d->op0;
45823 for (i = 0; i < nelt; ++i)
45824 dcopy.perm[i] ^= nelt;
45825 }
45826
45827 in_order = true;
45828 for (i = 0; i < nelt; ++i)
45829 {
45830 unsigned e = dcopy.perm[i];
45831 if (GET_MODE_SIZE (d->vmode) == 32
45832 && e >= nelt
45833 && (e & (nelt / 2 - 1)) < min)
45834 e = e - min - (nelt / 2);
45835 else
45836 e = e - min;
45837 if (e != i)
45838 in_order = false;
45839 dcopy.perm[i] = e;
45840 }
45841 dcopy.one_operand_p = true;
45842
45843 if (single_insn_only_p && !in_order)
45844 return false;
45845
45846 /* For AVX2, test whether we can permute the result in one instruction. */
45847 if (d->testing_p)
45848 {
45849 if (in_order)
45850 return true;
45851 dcopy.op1 = dcopy.op0;
45852 return expand_vec_perm_1 (&dcopy);
45853 }
45854
45855 shift = GEN_INT (min * GET_MODE_UNIT_BITSIZE (d->vmode));
45856 if (GET_MODE_SIZE (d->vmode) == 16)
45857 {
45858 target = gen_reg_rtx (TImode);
45859 emit_insn (gen_ssse3_palignrti (target, gen_lowpart (TImode, dcopy.op1),
45860 gen_lowpart (TImode, dcopy.op0), shift));
45861 }
45862 else
45863 {
45864 target = gen_reg_rtx (V2TImode);
45865 emit_insn (gen_avx2_palignrv2ti (target,
45866 gen_lowpart (V2TImode, dcopy.op1),
45867 gen_lowpart (V2TImode, dcopy.op0),
45868 shift));
45869 }
45870
45871 dcopy.op0 = dcopy.op1 = gen_lowpart (d->vmode, target);
45872
45873 /* Test for the degenerate case where the alignment by itself
45874 produces the desired permutation. */
45875 if (in_order)
45876 {
45877 emit_move_insn (d->target, dcopy.op0);
45878 return true;
45879 }
45880
45881 ok = expand_vec_perm_1 (&dcopy);
45882 gcc_assert (ok || GET_MODE_SIZE (d->vmode) == 32);
45883
45884 return ok;
45885 }
45886
45887 /* A subroutine of ix86_expand_vec_perm_const_1. Try to simplify
45888 the permutation using the SSE4_1 pblendv instruction. Potentially
45889 reduces the permutation from 2 pshufb insns and an ior to 1 pshufb and a pblendv. */
45890
45891 static bool
45892 expand_vec_perm_pblendv (struct expand_vec_perm_d *d)
45893 {
45894 unsigned i, which, nelt = d->nelt;
45895 struct expand_vec_perm_d dcopy, dcopy1;
45896 machine_mode vmode = d->vmode;
45897 bool ok;
45898
45899 /* Use the same checks as in expand_vec_perm_blend. */
45900 if (d->one_operand_p)
45901 return false;
45902 if (TARGET_AVX2 && GET_MODE_SIZE (vmode) == 32)
45903 ;
45904 else if (TARGET_AVX && (vmode == V4DFmode || vmode == V8SFmode))
45905 ;
45906 else if (TARGET_SSE4_1 && GET_MODE_SIZE (vmode) == 16)
45907 ;
45908 else
45909 return false;
45910
45911 /* Figure out which permutation elements do not stay in their
45912 original positions, and which operand they come from. */
45913 for (i = 0, which = 0; i < nelt; ++i)
45914 {
45915 unsigned e = d->perm[i];
45916 if (e != i)
45917 which |= (e < nelt ? 1 : 2);
45918 }
45919 /* We can pblend the part where elements do not stay in their
45920 original positions only when all of those elements come from one
45921 operand of the permutation.
45922 {0 1 8 3 4 5 9 7} is OK: the displaced elements 8 and 9 are both
45923 >= 8, i.e. both come from the second operand.
45924 {0 1 8 3 4 5 2 7} is not OK: the displaced elements are 2 and 8,
45925 and 8 >= 8 but 2 is not. */
45926 if (which != 1 && which != 2)
45927 return false;
45928 if (d->testing_p && GET_MODE_SIZE (vmode) == 16)
45929 return true;
45930
45931 /* First we apply a one-operand permutation to the elements that
45932 do not stay in their original positions. */
45933 dcopy = *d;
45934 if (which == 2)
45935 dcopy.op0 = dcopy.op1 = d->op1;
45936 else
45937 dcopy.op0 = dcopy.op1 = d->op0;
45938 if (!d->testing_p)
45939 dcopy.target = gen_reg_rtx (vmode);
45940 dcopy.one_operand_p = true;
45941
45942 for (i = 0; i < nelt; ++i)
45943 dcopy.perm[i] = d->perm[i] & (nelt - 1);
45944
45945 ok = expand_vec_perm_1 (&dcopy);
45946 if (GET_MODE_SIZE (vmode) != 16 && !ok)
45947 return false;
45948 else
45949 gcc_assert (ok);
45950 if (d->testing_p)
45951 return true;
45952
45953 /* Next we put the permuted elements into their final positions. */
45954 dcopy1 = *d;
45955 if (which == 2)
45956 dcopy1.op1 = dcopy.target;
45957 else
45958 dcopy1.op0 = dcopy.target;
45959
45960 for (i = 0; i < nelt; ++i)
45961 dcopy1.perm[i] = ((d->perm[i] >= nelt) ? (nelt + i) : i);
45962
45963 ok = expand_vec_perm_blend (&dcopy1);
45964 gcc_assert (ok);
45965
45966 return true;
45967 }
45968
45969 static bool expand_vec_perm_interleave3 (struct expand_vec_perm_d *d);
45970
45971 /* A subroutine of ix86_expand_vec_perm_builtin_1. Try to simplify
45972 a two vector permutation into a single vector permutation by using
45973 an interleave operation to merge the vectors. */
45974
45975 static bool
45976 expand_vec_perm_interleave2 (struct expand_vec_perm_d *d)
45977 {
45978 struct expand_vec_perm_d dremap, dfinal;
45979 unsigned i, nelt = d->nelt, nelt2 = nelt / 2;
45980 unsigned HOST_WIDE_INT contents;
45981 unsigned char remap[2 * MAX_VECT_LEN];
45982 rtx_insn *seq;
45983 bool ok, same_halves = false;
45984
45985 if (GET_MODE_SIZE (d->vmode) == 16)
45986 {
45987 if (d->one_operand_p)
45988 return false;
45989 }
45990 else if (GET_MODE_SIZE (d->vmode) == 32)
45991 {
45992 if (!TARGET_AVX)
45993 return false;
45994 /* For 32-byte modes allow even d->one_operand_p.
45995 The lack of cross-lane shuffling in some instructions
45996 might prevent a single insn shuffle. */
45997 dfinal = *d;
45998 dfinal.testing_p = true;
45999 /* If expand_vec_perm_interleave3 can expand this into
46000 a 3 insn sequence, give up and let it be expanded that
46001 way. While that is one insn longer, it doesn't need a
46002 memory operand, and in the common case where both the
46003 interleave low and interleave high permutations with the
46004 same operands are adjacent, it needs only 4 insns for
46005 both after CSE. */
46006 if (expand_vec_perm_interleave3 (&dfinal))
46007 return false;
46008 }
46009 else
46010 return false;
46011
46012 /* Examine from whence the elements come. */
46013 contents = 0;
46014 for (i = 0; i < nelt; ++i)
46015 contents |= HOST_WIDE_INT_1U << d->perm[i];
46016
46017 memset (remap, 0xff, sizeof (remap));
46018 dremap = *d;
46019
46020 if (GET_MODE_SIZE (d->vmode) == 16)
46021 {
46022 unsigned HOST_WIDE_INT h1, h2, h3, h4;
46023
46024 /* Split the two input vectors into 4 halves. */
46025 h1 = (HOST_WIDE_INT_1U << nelt2) - 1;
46026 h2 = h1 << nelt2;
46027 h3 = h2 << nelt2;
46028 h4 = h3 << nelt2;
46029
46030 /* If the elements all come from the low halves, use interleave low;
46031 similarly for interleave high. If the elements come from mismatched
46032 halves, we can use shufps for V4SF/V4SI or do a DImode shuffle. */
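/* Illustrative example: for V8HI, if every selected index is below 4 or in
[8, 12), all elements come from the low halves, so a punpcklwd of the two
operands gathers everything that is needed and the final shuffle only has
to reorder within one register.  */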
46033 if ((contents & (h1 | h3)) == contents)
46034 {
46035 /* punpckl* */
46036 for (i = 0; i < nelt2; ++i)
46037 {
46038 remap[i] = i * 2;
46039 remap[i + nelt] = i * 2 + 1;
46040 dremap.perm[i * 2] = i;
46041 dremap.perm[i * 2 + 1] = i + nelt;
46042 }
46043 if (!TARGET_SSE2 && d->vmode == V4SImode)
46044 dremap.vmode = V4SFmode;
46045 }
46046 else if ((contents & (h2 | h4)) == contents)
46047 {
46048 /* punpckh* */
46049 for (i = 0; i < nelt2; ++i)
46050 {
46051 remap[i + nelt2] = i * 2;
46052 remap[i + nelt + nelt2] = i * 2 + 1;
46053 dremap.perm[i * 2] = i + nelt2;
46054 dremap.perm[i * 2 + 1] = i + nelt + nelt2;
46055 }
46056 if (!TARGET_SSE2 && d->vmode == V4SImode)
46057 dremap.vmode = V4SFmode;
46058 }
46059 else if ((contents & (h1 | h4)) == contents)
46060 {
46061 /* shufps */
46062 for (i = 0; i < nelt2; ++i)
46063 {
46064 remap[i] = i;
46065 remap[i + nelt + nelt2] = i + nelt2;
46066 dremap.perm[i] = i;
46067 dremap.perm[i + nelt2] = i + nelt + nelt2;
46068 }
46069 if (nelt != 4)
46070 {
46071 /* shufpd */
46072 dremap.vmode = V2DImode;
46073 dremap.nelt = 2;
46074 dremap.perm[0] = 0;
46075 dremap.perm[1] = 3;
46076 }
46077 }
46078 else if ((contents & (h2 | h3)) == contents)
46079 {
46080 /* shufps */
46081 for (i = 0; i < nelt2; ++i)
46082 {
46083 remap[i + nelt2] = i;
46084 remap[i + nelt] = i + nelt2;
46085 dremap.perm[i] = i + nelt2;
46086 dremap.perm[i + nelt2] = i + nelt;
46087 }
46088 if (nelt != 4)
46089 {
46090 /* shufpd */
46091 dremap.vmode = V2DImode;
46092 dremap.nelt = 2;
46093 dremap.perm[0] = 1;
46094 dremap.perm[1] = 2;
46095 }
46096 }
46097 else
46098 return false;
46099 }
46100 else
46101 {
46102 unsigned int nelt4 = nelt / 4, nzcnt = 0;
46103 unsigned HOST_WIDE_INT q[8];
46104 unsigned int nonzero_halves[4];
46105
46106 /* Split the two input vectors into 8 quarters. */
46107 q[0] = (HOST_WIDE_INT_1U << nelt4) - 1;
46108 for (i = 1; i < 8; ++i)
46109 q[i] = q[0] << (nelt4 * i);
46110 for (i = 0; i < 4; ++i)
46111 if (((q[2 * i] | q[2 * i + 1]) & contents) != 0)
46112 {
46113 nonzero_halves[nzcnt] = i;
46114 ++nzcnt;
46115 }
46116
46117 if (nzcnt == 1)
46118 {
46119 gcc_assert (d->one_operand_p);
46120 nonzero_halves[1] = nonzero_halves[0];
46121 same_halves = true;
46122 }
46123 else if (d->one_operand_p)
46124 {
46125 gcc_assert (nonzero_halves[0] == 0);
46126 gcc_assert (nonzero_halves[1] == 1);
46127 }
46128
46129 if (nzcnt <= 2)
46130 {
46131 if (d->perm[0] / nelt2 == nonzero_halves[1])
46132 {
46133 /* Attempt to increase the likelihood that dfinal
46134 shuffle will be intra-lane. */
46135 std::swap (nonzero_halves[0], nonzero_halves[1]);
46136 }
46137
46138 /* vperm2f128 or vperm2i128. */
46139 for (i = 0; i < nelt2; ++i)
46140 {
46141 remap[i + nonzero_halves[1] * nelt2] = i + nelt2;
46142 remap[i + nonzero_halves[0] * nelt2] = i;
46143 dremap.perm[i + nelt2] = i + nonzero_halves[1] * nelt2;
46144 dremap.perm[i] = i + nonzero_halves[0] * nelt2;
46145 }
46146
46147 if (d->vmode != V8SFmode
46148 && d->vmode != V4DFmode
46149 && d->vmode != V8SImode)
46150 {
46151 dremap.vmode = V8SImode;
46152 dremap.nelt = 8;
46153 for (i = 0; i < 4; ++i)
46154 {
46155 dremap.perm[i] = i + nonzero_halves[0] * 4;
46156 dremap.perm[i + 4] = i + nonzero_halves[1] * 4;
46157 }
46158 }
46159 }
46160 else if (d->one_operand_p)
46161 return false;
46162 else if (TARGET_AVX2
46163 && (contents & (q[0] | q[2] | q[4] | q[6])) == contents)
46164 {
46165 /* vpunpckl* */
46166 for (i = 0; i < nelt4; ++i)
46167 {
46168 remap[i] = i * 2;
46169 remap[i + nelt] = i * 2 + 1;
46170 remap[i + nelt2] = i * 2 + nelt2;
46171 remap[i + nelt + nelt2] = i * 2 + nelt2 + 1;
46172 dremap.perm[i * 2] = i;
46173 dremap.perm[i * 2 + 1] = i + nelt;
46174 dremap.perm[i * 2 + nelt2] = i + nelt2;
46175 dremap.perm[i * 2 + nelt2 + 1] = i + nelt + nelt2;
46176 }
46177 }
46178 else if (TARGET_AVX2
46179 && (contents & (q[1] | q[3] | q[5] | q[7])) == contents)
46180 {
46181 /* vpunpckh* */
46182 for (i = 0; i < nelt4; ++i)
46183 {
46184 remap[i + nelt4] = i * 2;
46185 remap[i + nelt + nelt4] = i * 2 + 1;
46186 remap[i + nelt2 + nelt4] = i * 2 + nelt2;
46187 remap[i + nelt + nelt2 + nelt4] = i * 2 + nelt2 + 1;
46188 dremap.perm[i * 2] = i + nelt4;
46189 dremap.perm[i * 2 + 1] = i + nelt + nelt4;
46190 dremap.perm[i * 2 + nelt2] = i + nelt2 + nelt4;
46191 dremap.perm[i * 2 + nelt2 + 1] = i + nelt + nelt2 + nelt4;
46192 }
46193 }
46194 else
46195 return false;
46196 }
46197
46198 /* Use the remapping array set up above to move the elements from their
46199 swizzled locations into their final destinations. */
46200 dfinal = *d;
46201 for (i = 0; i < nelt; ++i)
46202 {
46203 unsigned e = remap[d->perm[i]];
46204 gcc_assert (e < nelt);
46205 /* If same_halves is true, both halves of the remapped vector are the
46206 same. Avoid cross-lane accesses if possible. */
46207 if (same_halves && i >= nelt2)
46208 {
46209 gcc_assert (e < nelt2);
46210 dfinal.perm[i] = e + nelt2;
46211 }
46212 else
46213 dfinal.perm[i] = e;
46214 }
46215 if (!d->testing_p)
46216 {
46217 dremap.target = gen_reg_rtx (dremap.vmode);
46218 dfinal.op0 = gen_lowpart (dfinal.vmode, dremap.target);
46219 }
46220 dfinal.op1 = dfinal.op0;
46221 dfinal.one_operand_p = true;
46222
46223 /* Test if the final remap can be done with a single insn. For V4SFmode or
46224 V4SImode this *will* succeed. For V8HImode or V16QImode it may not. */
46225 start_sequence ();
46226 ok = expand_vec_perm_1 (&dfinal);
46227 seq = get_insns ();
46228 end_sequence ();
46229
46230 if (!ok)
46231 return false;
46232
46233 if (d->testing_p)
46234 return true;
46235
46236 if (dremap.vmode != dfinal.vmode)
46237 {
46238 dremap.op0 = gen_lowpart (dremap.vmode, dremap.op0);
46239 dremap.op1 = gen_lowpart (dremap.vmode, dremap.op1);
46240 }
46241
46242 ok = expand_vec_perm_1 (&dremap);
46243 gcc_assert (ok);
46244
46245 emit_insn (seq);
46246 return true;
46247 }
46248
46249 /* A subroutine of ix86_expand_vec_perm_builtin_1. Try to simplify
46250 a single vector cross-lane permutation into vpermq followed
46251 by any of the single insn permutations. */
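/* Illustrative example: a one-operand V32QI permutation whose low half
only reads bytes from 64-bit quarters 1 and 3 of the source first gathers
those quarters into one lane with vpermq, after which an in-lane vpshufb
can finish the job.  */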
46252
46253 static bool
46254 expand_vec_perm_vpermq_perm_1 (struct expand_vec_perm_d *d)
46255 {
46256 struct expand_vec_perm_d dremap, dfinal;
46257 unsigned i, j, nelt = d->nelt, nelt2 = nelt / 2, nelt4 = nelt / 4;
46258 unsigned contents[2];
46259 bool ok;
46260
46261 if (!(TARGET_AVX2
46262 && (d->vmode == V32QImode || d->vmode == V16HImode)
46263 && d->one_operand_p))
46264 return false;
46265
46266 contents[0] = 0;
46267 contents[1] = 0;
46268 for (i = 0; i < nelt2; ++i)
46269 {
46270 contents[0] |= 1u << (d->perm[i] / nelt4);
46271 contents[1] |= 1u << (d->perm[i + nelt2] / nelt4);
46272 }
46273
46274 for (i = 0; i < 2; ++i)
46275 {
46276 unsigned int cnt = 0;
46277 for (j = 0; j < 4; ++j)
46278 if ((contents[i] & (1u << j)) != 0 && ++cnt > 2)
46279 return false;
46280 }
46281
46282 if (d->testing_p)
46283 return true;
46284
46285 dremap = *d;
46286 dremap.vmode = V4DImode;
46287 dremap.nelt = 4;
46288 dremap.target = gen_reg_rtx (V4DImode);
46289 dremap.op0 = gen_lowpart (V4DImode, d->op0);
46290 dremap.op1 = dremap.op0;
46291 dremap.one_operand_p = true;
46292 for (i = 0; i < 2; ++i)
46293 {
46294 unsigned int cnt = 0;
46295 for (j = 0; j < 4; ++j)
46296 if ((contents[i] & (1u << j)) != 0)
46297 dremap.perm[2 * i + cnt++] = j;
46298 for (; cnt < 2; ++cnt)
46299 dremap.perm[2 * i + cnt] = 0;
46300 }
46301
46302 dfinal = *d;
46303 dfinal.op0 = gen_lowpart (dfinal.vmode, dremap.target);
46304 dfinal.op1 = dfinal.op0;
46305 dfinal.one_operand_p = true;
46306 for (i = 0, j = 0; i < nelt; ++i)
46307 {
46308 if (i == nelt2)
46309 j = 2;
46310 dfinal.perm[i] = (d->perm[i] & (nelt4 - 1)) | (j ? nelt2 : 0);
46311 if ((d->perm[i] / nelt4) == dremap.perm[j])
46312 ;
46313 else if ((d->perm[i] / nelt4) == dremap.perm[j + 1])
46314 dfinal.perm[i] |= nelt4;
46315 else
46316 gcc_unreachable ();
46317 }
46318
46319 ok = expand_vec_perm_1 (&dremap);
46320 gcc_assert (ok);
46321
46322 ok = expand_vec_perm_1 (&dfinal);
46323 gcc_assert (ok);
46324
46325 return true;
46326 }
46327
46328 /* A subroutine of ix86_expand_vec_perm_builtin_1. Try to expand
46329 a vector permutation using two instructions, vperm2f128 resp.
46330 vperm2i128 followed by any single in-lane permutation. */
46331
46332 static bool
46333 expand_vec_perm_vperm2f128 (struct expand_vec_perm_d *d)
46334 {
46335 struct expand_vec_perm_d dfirst, dsecond;
46336 unsigned i, j, nelt = d->nelt, nelt2 = nelt / 2, perm;
46337 bool ok;
46338
46339 if (!TARGET_AVX
46340 || GET_MODE_SIZE (d->vmode) != 32
46341 || (d->vmode != V8SFmode && d->vmode != V4DFmode && !TARGET_AVX2))
46342 return false;
46343
46344 dsecond = *d;
46345 dsecond.one_operand_p = false;
46346 dsecond.testing_p = true;
46347
46348 /* ((perm << 2)|perm) & 0x33 is the vperm2[fi]128
46349 immediate. For perm < 16 the second permutation uses
46350 d->op0 as first operand, for perm >= 16 it uses d->op1
46351 as first operand. The second operand is the result of
46352 vperm2[fi]128. */
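/* Illustrative example: perm == 6 (binary 0110) tries a vperm2[fi]128 with
immediate 0x12, i.e. the low lane of op1 goes to the result's low half and
the high lane of op0 goes to the result's high half.  */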
46353 for (perm = 0; perm < 32; perm++)
46354 {
46355 /* Ignore permutations which do not move anything cross-lane. */
46356 if (perm < 16)
46357 {
46358 /* The second shuffle for e.g. V4DFmode has
46359 0123 and ABCD operands.
46360 Ignore AB23, as 23 is already in the second lane
46361 of the first operand. */
46362 if ((perm & 0xc) == (1 << 2)) continue;
46363 /* And 01CD, as 01 is in the first lane of the first
46364 operand. */
46365 if ((perm & 3) == 0) continue;
46366 /* And 4567, as then the vperm2[fi]128 doesn't change
46367 anything on the original 4567 second operand. */
46368 if ((perm & 0xf) == ((3 << 2) | 2)) continue;
46369 }
46370 else
46371 {
46372 /* The second shuffle for e.g. V4DFmode has
46373 4567 and ABCD operands.
46374 Ignore AB67, as 67 is already in the second lane
46375 of the first operand. */
46376 if ((perm & 0xc) == (3 << 2)) continue;
46377 /* And 45CD, as 45 is in the first lane of the first
46378 operand. */
46379 if ((perm & 3) == 2) continue;
46380 /* And 0123, as then the vperm2[fi]128 doesn't change
46381 anything on the original 0123 first operand. */
46382 if ((perm & 0xf) == (1 << 2)) continue;
46383 }
46384
46385 for (i = 0; i < nelt; i++)
46386 {
46387 j = d->perm[i] / nelt2;
46388 if (j == ((perm >> (2 * (i >= nelt2))) & 3))
46389 dsecond.perm[i] = nelt + (i & nelt2) + (d->perm[i] & (nelt2 - 1));
46390 else if (j == (unsigned) (i >= nelt2) + 2 * (perm >= 16))
46391 dsecond.perm[i] = d->perm[i] & (nelt - 1);
46392 else
46393 break;
46394 }
46395
46396 if (i == nelt)
46397 {
46398 start_sequence ();
46399 ok = expand_vec_perm_1 (&dsecond);
46400 end_sequence ();
46401 }
46402 else
46403 ok = false;
46404
46405 if (ok)
46406 {
46407 if (d->testing_p)
46408 return true;
46409
46410 /* Found a usable second shuffle. dfirst will be
46411 vperm2f128 on d->op0 and d->op1. */
46412 dsecond.testing_p = false;
46413 dfirst = *d;
46414 dfirst.target = gen_reg_rtx (d->vmode);
46415 for (i = 0; i < nelt; i++)
46416 dfirst.perm[i] = (i & (nelt2 - 1))
46417 + ((perm >> (2 * (i >= nelt2))) & 3) * nelt2;
46418
46419 canonicalize_perm (&dfirst);
46420 ok = expand_vec_perm_1 (&dfirst);
46421 gcc_assert (ok);
46422
46423 /* And dsecond is some single insn shuffle, taking
46424 d->op0 and result of vperm2f128 (if perm < 16) or
46425 d->op1 and result of vperm2f128 (otherwise). */
46426 if (perm >= 16)
46427 dsecond.op0 = dsecond.op1;
46428 dsecond.op1 = dfirst.target;
46429
46430 ok = expand_vec_perm_1 (&dsecond);
46431 gcc_assert (ok);
46432
46433 return true;
46434 }
46435
46436 /* For one operand, the only useful vperm2f128 permutation is 0x01,
46437 i.e. a swap of the two lanes. */
46438 if (d->one_operand_p)
46439 return false;
46440 }
46441
46442 return false;
46443 }
46444
46445 /* A subroutine of ix86_expand_vec_perm_builtin_1. Try to simplify
46446 a two vector permutation using 2 intra-lane interleave insns
46447 and cross-lane shuffle for 32-byte vectors. */
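/* Illustrative example: the V8SI permutation {0 8 1 9 2 10 3 11} matches
the interleave-low pattern checked below and is emitted through the
vec_interleave_lowv8si expander.  */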
46448
46449 static bool
46450 expand_vec_perm_interleave3 (struct expand_vec_perm_d *d)
46451 {
46452 unsigned i, nelt;
46453 rtx (*gen) (rtx, rtx, rtx);
46454
46455 if (d->one_operand_p)
46456 return false;
46457 if (TARGET_AVX2 && GET_MODE_SIZE (d->vmode) == 32)
46458 ;
46459 else if (TARGET_AVX && (d->vmode == V8SFmode || d->vmode == V4DFmode))
46460 ;
46461 else
46462 return false;
46463
46464 nelt = d->nelt;
46465 if (d->perm[0] != 0 && d->perm[0] != nelt / 2)
46466 return false;
46467 for (i = 0; i < nelt; i += 2)
46468 if (d->perm[i] != d->perm[0] + i / 2
46469 || d->perm[i + 1] != d->perm[0] + i / 2 + nelt)
46470 return false;
46471
46472 if (d->testing_p)
46473 return true;
46474
46475 switch (d->vmode)
46476 {
46477 case V32QImode:
46478 if (d->perm[0])
46479 gen = gen_vec_interleave_highv32qi;
46480 else
46481 gen = gen_vec_interleave_lowv32qi;
46482 break;
46483 case V16HImode:
46484 if (d->perm[0])
46485 gen = gen_vec_interleave_highv16hi;
46486 else
46487 gen = gen_vec_interleave_lowv16hi;
46488 break;
46489 case V8SImode:
46490 if (d->perm[0])
46491 gen = gen_vec_interleave_highv8si;
46492 else
46493 gen = gen_vec_interleave_lowv8si;
46494 break;
46495 case V4DImode:
46496 if (d->perm[0])
46497 gen = gen_vec_interleave_highv4di;
46498 else
46499 gen = gen_vec_interleave_lowv4di;
46500 break;
46501 case V8SFmode:
46502 if (d->perm[0])
46503 gen = gen_vec_interleave_highv8sf;
46504 else
46505 gen = gen_vec_interleave_lowv8sf;
46506 break;
46507 case V4DFmode:
46508 if (d->perm[0])
46509 gen = gen_vec_interleave_highv4df;
46510 else
46511 gen = gen_vec_interleave_lowv4df;
46512 break;
46513 default:
46514 gcc_unreachable ();
46515 }
46516
46517 emit_insn (gen (d->target, d->op0, d->op1));
46518 return true;
46519 }
46520
46521 /* A subroutine of ix86_expand_vec_perm_builtin_1. Try to implement
46522 a single vector permutation using a single intra-lane vector
46523 permutation, vperm2f128 swapping the lanes and vblend* insn blending
46524 the non-swapped and swapped vectors together. */
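/* Illustrative example: the one-operand V4DF permutation {2 1 0 3} needs no
in-lane shuffle; a vperm2f128 lane swap followed by vblendpd with immediate
0x5 picks the swapped elements for positions 0 and 2.  */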
46525
46526 static bool
46527 expand_vec_perm_vperm2f128_vblend (struct expand_vec_perm_d *d)
46528 {
46529 struct expand_vec_perm_d dfirst, dsecond;
46530 unsigned i, j, msk, nelt = d->nelt, nelt2 = nelt / 2;
46531 rtx_insn *seq;
46532 bool ok;
46533 rtx (*blend) (rtx, rtx, rtx, rtx) = NULL;
46534
46535 if (!TARGET_AVX
46536 || TARGET_AVX2
46537 || (d->vmode != V8SFmode && d->vmode != V4DFmode)
46538 || !d->one_operand_p)
46539 return false;
46540
46541 dfirst = *d;
46542 for (i = 0; i < nelt; i++)
46543 dfirst.perm[i] = 0xff;
46544 for (i = 0, msk = 0; i < nelt; i++)
46545 {
46546 j = (d->perm[i] & nelt2) ? i | nelt2 : i & ~nelt2;
46547 if (dfirst.perm[j] != 0xff && dfirst.perm[j] != d->perm[i])
46548 return false;
46549 dfirst.perm[j] = d->perm[i];
46550 if (j != i)
46551 msk |= (1 << i);
46552 }
46553 for (i = 0; i < nelt; i++)
46554 if (dfirst.perm[i] == 0xff)
46555 dfirst.perm[i] = i;
46556
46557 if (!d->testing_p)
46558 dfirst.target = gen_reg_rtx (dfirst.vmode);
46559
46560 start_sequence ();
46561 ok = expand_vec_perm_1 (&dfirst);
46562 seq = get_insns ();
46563 end_sequence ();
46564
46565 if (!ok)
46566 return false;
46567
46568 if (d->testing_p)
46569 return true;
46570
46571 emit_insn (seq);
46572
46573 dsecond = *d;
46574 dsecond.op0 = dfirst.target;
46575 dsecond.op1 = dfirst.target;
46576 dsecond.one_operand_p = true;
46577 dsecond.target = gen_reg_rtx (dsecond.vmode);
46578 for (i = 0; i < nelt; i++)
46579 dsecond.perm[i] = i ^ nelt2;
46580
46581 ok = expand_vec_perm_1 (&dsecond);
46582 gcc_assert (ok);
46583
46584 blend = d->vmode == V8SFmode ? gen_avx_blendps256 : gen_avx_blendpd256;
46585 emit_insn (blend (d->target, dfirst.target, dsecond.target, GEN_INT (msk)));
46586 return true;
46587 }
46588
46589 /* A subroutine of ix86_expand_vec_perm_builtin_1. Implement a V4DF
46590 permutation using two vperm2f128, followed by a vshufpd insn blending
46591 the two vectors together. */
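/* Illustrative example: for the V4DF permutation {1 6 3 4}, the two
lane-granular shuffles produce {0 1 2 3} and {6 7 4 5} of the concatenated
operands, and the final vshufpd selecting {1 4 3 6} from those two
intermediates yields the requested result.  */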
46592
46593 static bool
46594 expand_vec_perm_2vperm2f128_vshuf (struct expand_vec_perm_d *d)
46595 {
46596 struct expand_vec_perm_d dfirst, dsecond, dthird;
46597 bool ok;
46598
46599 if (!TARGET_AVX || (d->vmode != V4DFmode))
46600 return false;
46601
46602 if (d->testing_p)
46603 return true;
46604
46605 dfirst = *d;
46606 dsecond = *d;
46607 dthird = *d;
46608
46609 dfirst.perm[0] = (d->perm[0] & ~1);
46610 dfirst.perm[1] = (d->perm[0] & ~1) + 1;
46611 dfirst.perm[2] = (d->perm[2] & ~1);
46612 dfirst.perm[3] = (d->perm[2] & ~1) + 1;
46613 dsecond.perm[0] = (d->perm[1] & ~1);
46614 dsecond.perm[1] = (d->perm[1] & ~1) + 1;
46615 dsecond.perm[2] = (d->perm[3] & ~1);
46616 dsecond.perm[3] = (d->perm[3] & ~1) + 1;
46617 dthird.perm[0] = (d->perm[0] % 2);
46618 dthird.perm[1] = (d->perm[1] % 2) + 4;
46619 dthird.perm[2] = (d->perm[2] % 2) + 2;
46620 dthird.perm[3] = (d->perm[3] % 2) + 6;
46621
46622 dfirst.target = gen_reg_rtx (dfirst.vmode);
46623 dsecond.target = gen_reg_rtx (dsecond.vmode);
46624 dthird.op0 = dfirst.target;
46625 dthird.op1 = dsecond.target;
46626 dthird.one_operand_p = false;
46627
46628 canonicalize_perm (&dfirst);
46629 canonicalize_perm (&dsecond);
46630
46631 ok = expand_vec_perm_1 (&dfirst)
46632 && expand_vec_perm_1 (&dsecond)
46633 && expand_vec_perm_1 (&dthird);
46634
46635 gcc_assert (ok);
46636
46637 return true;
46638 }
46639
46640 /* A subroutine of expand_vec_perm_even_odd_1. Implement the double-word
46641 permutation with two pshufb insns and an ior. We should have already
46642 failed all two instruction sequences. */
46643
46644 static bool
46645 expand_vec_perm_pshufb2 (struct expand_vec_perm_d *d)
46646 {
46647 rtx rperm[2][16], vperm, l, h, op, m128;
46648 unsigned int i, nelt, eltsz;
46649
46650 if (!TARGET_SSSE3 || GET_MODE_SIZE (d->vmode) != 16)
46651 return false;
46652 gcc_assert (!d->one_operand_p);
46653
46654 if (d->testing_p)
46655 return true;
46656
46657 nelt = d->nelt;
46658 eltsz = GET_MODE_UNIT_SIZE (d->vmode);
46659
46660 /* Generate two permutation masks. If the required element is within
46661 the given vector it is shuffled into the proper lane. If the required
46662 element is in the other vector, force a zero into the lane by setting
46663 bit 7 in the permutation mask. */
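/* Illustrative example: if element 0 of a V16QI permutation is index 18,
the mask applied to op0 gets -128 (force zero) at position 0 while the
mask applied to op1 gets byte index 2 there; the final ior merges the two
partial results.  */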
46664 m128 = GEN_INT (-128);
46665 for (i = 0; i < nelt; ++i)
46666 {
46667 unsigned j, e = d->perm[i];
46668 unsigned which = (e >= nelt);
46669 if (e >= nelt)
46670 e -= nelt;
46671
46672 for (j = 0; j < eltsz; ++j)
46673 {
46674 rperm[which][i*eltsz + j] = GEN_INT (e*eltsz + j);
46675 rperm[1-which][i*eltsz + j] = m128;
46676 }
46677 }
46678
46679 vperm = gen_rtx_CONST_VECTOR (V16QImode, gen_rtvec_v (16, rperm[0]));
46680 vperm = force_reg (V16QImode, vperm);
46681
46682 l = gen_reg_rtx (V16QImode);
46683 op = gen_lowpart (V16QImode, d->op0);
46684 emit_insn (gen_ssse3_pshufbv16qi3 (l, op, vperm));
46685
46686 vperm = gen_rtx_CONST_VECTOR (V16QImode, gen_rtvec_v (16, rperm[1]));
46687 vperm = force_reg (V16QImode, vperm);
46688
46689 h = gen_reg_rtx (V16QImode);
46690 op = gen_lowpart (V16QImode, d->op1);
46691 emit_insn (gen_ssse3_pshufbv16qi3 (h, op, vperm));
46692
46693 op = d->target;
46694 if (d->vmode != V16QImode)
46695 op = gen_reg_rtx (V16QImode);
46696 emit_insn (gen_iorv16qi3 (op, l, h));
46697 if (op != d->target)
46698 emit_move_insn (d->target, gen_lowpart (d->vmode, op));
46699
46700 return true;
46701 }
46702
46703 /* Implement an arbitrary permutation of one V32QImode or V16HImode operand
46704 with two vpshufb insns, vpermq and vpor. We should have already failed
46705 all two or three instruction sequences. */
46706
46707 static bool
46708 expand_vec_perm_vpshufb2_vpermq (struct expand_vec_perm_d *d)
46709 {
46710 rtx rperm[2][32], vperm, l, h, hp, op, m128;
46711 unsigned int i, nelt, eltsz;
46712
46713 if (!TARGET_AVX2
46714 || !d->one_operand_p
46715 || (d->vmode != V32QImode && d->vmode != V16HImode))
46716 return false;
46717
46718 if (d->testing_p)
46719 return true;
46720
46721 nelt = d->nelt;
46722 eltsz = GET_MODE_UNIT_SIZE (d->vmode);
46723
46724 /* Generate two permutation masks. If the required element is within
46725 the same lane, it is shuffled in. If the required element is from
46726 the other lane, force a zero by setting bit 7 in the permutation mask.
46727 The other mask has non-negative elements where an element is requested
46728 from the other lane; such elements are also moved to the other lane,
46729 so that the result of vpshufb can have its two V2TImode halves
46730 swapped. */
46731 m128 = GEN_INT (-128);
46732 for (i = 0; i < nelt; ++i)
46733 {
46734 unsigned j, e = d->perm[i] & (nelt / 2 - 1);
46735 unsigned which = ((d->perm[i] ^ i) & (nelt / 2)) * eltsz;
46736
46737 for (j = 0; j < eltsz; ++j)
46738 {
46739 rperm[!!which][(i * eltsz + j) ^ which] = GEN_INT (e * eltsz + j);
46740 rperm[!which][(i * eltsz + j) ^ (which ^ 16)] = m128;
46741 }
46742 }
46743
46744 vperm = gen_rtx_CONST_VECTOR (V32QImode, gen_rtvec_v (32, rperm[1]));
46745 vperm = force_reg (V32QImode, vperm);
46746
46747 h = gen_reg_rtx (V32QImode);
46748 op = gen_lowpart (V32QImode, d->op0);
46749 emit_insn (gen_avx2_pshufbv32qi3 (h, op, vperm));
46750
46751 /* Swap the 128-bit lanes of h into hp. */
46752 hp = gen_reg_rtx (V4DImode);
46753 op = gen_lowpart (V4DImode, h);
46754 emit_insn (gen_avx2_permv4di_1 (hp, op, const2_rtx, GEN_INT (3), const0_rtx,
46755 const1_rtx));
46756
46757 vperm = gen_rtx_CONST_VECTOR (V32QImode, gen_rtvec_v (32, rperm[0]));
46758 vperm = force_reg (V32QImode, vperm);
46759
46760 l = gen_reg_rtx (V32QImode);
46761 op = gen_lowpart (V32QImode, d->op0);
46762 emit_insn (gen_avx2_pshufbv32qi3 (l, op, vperm));
46763
46764 op = d->target;
46765 if (d->vmode != V32QImode)
46766 op = gen_reg_rtx (V32QImode);
46767 emit_insn (gen_iorv32qi3 (op, l, gen_lowpart (V32QImode, hp)));
46768 if (op != d->target)
46769 emit_move_insn (d->target, gen_lowpart (d->vmode, op));
46770
46771 return true;
46772 }
46773
46774 /* A subroutine of expand_vec_perm_even_odd_1. Implement extract-even
46775 and extract-odd permutations of two V32QImode or V16HImode operands
46776 with two vpshufb insns, vpor and vpermq. We should have already
46777 failed all two- or three-instruction sequences. */
46778
46779 static bool
46780 expand_vec_perm_vpshufb2_vpermq_even_odd (struct expand_vec_perm_d *d)
46781 {
46782 rtx rperm[2][32], vperm, l, h, ior, op, m128;
46783 unsigned int i, nelt, eltsz;
46784
46785 if (!TARGET_AVX2
46786 || d->one_operand_p
46787 || (d->vmode != V32QImode && d->vmode != V16HImode))
46788 return false;
46789
46790 for (i = 0; i < d->nelt; ++i)
46791 if ((d->perm[i] ^ (i * 2)) & (3 * d->nelt / 2))
46792 return false;
46793
46794 if (d->testing_p)
46795 return true;
46796
46797 nelt = d->nelt;
46798 eltsz = GET_MODE_UNIT_SIZE (d->vmode);
46799
46800 /* Generate two permutation masks. In the first permutation mask
46801 the first quarter will contain indexes for the first half
46802 of the op0, the second quarter will contain bit 7 set, third quarter
46803 will contain indexes for the second half of the op0 and the
46804 last quarter bit 7 set. In the second permutation mask
46805 the first quarter will contain bit 7 set, the second quarter
46806 indexes for the first half of the op1, the third quarter bit 7 set
46807 and last quarter indexes for the second half of the op1.
46808 I.e. the first mask e.g. for V32QImode extract even will be:
46809 0, 2, ..., 0xe, -128, ..., -128, 0, 2, ..., 0xe, -128, ..., -128
46810 (all values masked with 0xf except for -128) and second mask
46811 for extract even will be
46812 -128, ..., -128, 0, 2, ..., 0xe, -128, ..., -128, 0, 2, ..., 0xe. */
46813 m128 = GEN_INT (-128);
46814 for (i = 0; i < nelt; ++i)
46815 {
46816 unsigned j, e = d->perm[i] & (nelt / 2 - 1);
46817 unsigned which = d->perm[i] >= nelt;
46818 unsigned xorv = (i >= nelt / 4 && i < 3 * nelt / 4) ? 24 : 0;
46819
46820 for (j = 0; j < eltsz; ++j)
46821 {
46822 rperm[which][(i * eltsz + j) ^ xorv] = GEN_INT (e * eltsz + j);
46823 rperm[1 - which][(i * eltsz + j) ^ xorv] = m128;
46824 }
46825 }
46826
46827 vperm = gen_rtx_CONST_VECTOR (V32QImode, gen_rtvec_v (32, rperm[0]));
46828 vperm = force_reg (V32QImode, vperm);
46829
46830 l = gen_reg_rtx (V32QImode);
46831 op = gen_lowpart (V32QImode, d->op0);
46832 emit_insn (gen_avx2_pshufbv32qi3 (l, op, vperm));
46833
46834 vperm = gen_rtx_CONST_VECTOR (V32QImode, gen_rtvec_v (32, rperm[1]));
46835 vperm = force_reg (V32QImode, vperm);
46836
46837 h = gen_reg_rtx (V32QImode);
46838 op = gen_lowpart (V32QImode, d->op1);
46839 emit_insn (gen_avx2_pshufbv32qi3 (h, op, vperm));
46840
46841 ior = gen_reg_rtx (V32QImode);
46842 emit_insn (gen_iorv32qi3 (ior, l, h));
46843
46844 /* Permute the V4DImode quarters using { 0, 2, 1, 3 } permutation. */
46845 op = gen_reg_rtx (V4DImode);
46846 ior = gen_lowpart (V4DImode, ior);
46847 emit_insn (gen_avx2_permv4di_1 (op, ior, const0_rtx, const2_rtx,
46848 const1_rtx, GEN_INT (3)));
46849 emit_move_insn (d->target, gen_lowpart (d->vmode, op));
46850
46851 return true;
46852 }
46853
46854 /* A subroutine of expand_vec_perm_even_odd_1. Implement extract-even
46855 and extract-odd permutations of two V16QI, V8HI, V16HI or V32QI operands
46856 with two "and" and "pack" or two "shift" and "pack" insns. We should
46857 have already failed all two-instruction sequences. */
46858
46859 static bool
46860 expand_vec_perm_even_odd_pack (struct expand_vec_perm_d *d)
46861 {
46862 rtx op, dop0, dop1, t, rperm[16];
46863 unsigned i, odd, c, s, nelt = d->nelt;
46864 bool end_perm = false;
46865 machine_mode half_mode;
46866 rtx (*gen_and) (rtx, rtx, rtx);
46867 rtx (*gen_pack) (rtx, rtx, rtx);
46868 rtx (*gen_shift) (rtx, rtx, rtx);
46869
46870 if (d->one_operand_p)
46871 return false;
46872
46873 switch (d->vmode)
46874 {
46875 case V8HImode:
46876 /* Required for "pack". */
46877 if (!TARGET_SSE4_1)
46878 return false;
46879 c = 0xffff;
46880 s = 16;
46881 half_mode = V4SImode;
46882 gen_and = gen_andv4si3;
46883 gen_pack = gen_sse4_1_packusdw;
46884 gen_shift = gen_lshrv4si3;
46885 break;
46886 case V16QImode:
46887 /* No check as all instructions are SSE2. */
46888 c = 0xff;
46889 s = 8;
46890 half_mode = V8HImode;
46891 gen_and = gen_andv8hi3;
46892 gen_pack = gen_sse2_packuswb;
46893 gen_shift = gen_lshrv8hi3;
46894 break;
46895 case V16HImode:
46896 if (!TARGET_AVX2)
46897 return false;
46898 c = 0xffff;
46899 s = 16;
46900 half_mode = V8SImode;
46901 gen_and = gen_andv8si3;
46902 gen_pack = gen_avx2_packusdw;
46903 gen_shift = gen_lshrv8si3;
46904 end_perm = true;
46905 break;
46906 case V32QImode:
46907 if (!TARGET_AVX2)
46908 return false;
46909 c = 0xff;
46910 s = 8;
46911 half_mode = V16HImode;
46912 gen_and = gen_andv16hi3;
46913 gen_pack = gen_avx2_packuswb;
46914 gen_shift = gen_lshrv16hi3;
46915 end_perm = true;
46916 break;
46917 default:
46918 /* Only V8HI, V16QI, V16HI and V32QI modes are more profitable than
46919 general shuffles. */
46920 return false;
46921 }
46922
46923 /* Check that permutation is even or odd. */
46924 odd = d->perm[0];
46925 if (odd > 1)
46926 return false;
46927
46928 for (i = 1; i < nelt; ++i)
46929 if (d->perm[i] != 2 * i + odd)
46930 return false;
46931
46932 if (d->testing_p)
46933 return true;
46934
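/* For example, V16QImode extract-even views each operand as V8HImode,
   ANDs every word with 0x00ff to keep its even byte and then packuswb's
   the two results; extract-odd instead shifts every word right by 8
   before the pack.  For the 256-bit modes the packs operate per 128-bit
   lane, so a final vpermq with order { 0, 2, 1, 3 } puts the quarters
   back in order.  */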
46935 dop0 = gen_reg_rtx (half_mode);
46936 dop1 = gen_reg_rtx (half_mode);
46937 if (odd == 0)
46938 {
46939 for (i = 0; i < nelt / 2; i++)
46940 rperm[i] = GEN_INT (c);
46941 t = gen_rtx_CONST_VECTOR (half_mode, gen_rtvec_v (nelt / 2, rperm));
46942 t = force_reg (half_mode, t);
46943 emit_insn (gen_and (dop0, t, gen_lowpart (half_mode, d->op0)));
46944 emit_insn (gen_and (dop1, t, gen_lowpart (half_mode, d->op1)));
46945 }
46946 else
46947 {
46948 emit_insn (gen_shift (dop0,
46949 gen_lowpart (half_mode, d->op0),
46950 GEN_INT (s)));
46951 emit_insn (gen_shift (dop1,
46952 gen_lowpart (half_mode, d->op1),
46953 GEN_INT (s)));
46954 }
46955 /* For the AVX2 256-bit case we need to permute the pack result. */
46956 if (TARGET_AVX2 && end_perm)
46957 {
46958 op = gen_reg_rtx (d->vmode);
46959 t = gen_reg_rtx (V4DImode);
46960 emit_insn (gen_pack (op, dop0, dop1));
46961 emit_insn (gen_avx2_permv4di_1 (t,
46962 gen_lowpart (V4DImode, op),
46963 const0_rtx,
46964 const2_rtx,
46965 const1_rtx,
46966 GEN_INT (3)));
46967 emit_move_insn (d->target, gen_lowpart (d->vmode, t));
46968 }
46969 else
46970 emit_insn (gen_pack (d->target, dop0, dop1));
46971
46972 return true;
46973 }
46974
46975 /* A subroutine of expand_vec_perm_even_odd_1. Implement extract-even
46976 and extract-odd permutations of two V64QI operands
46977 with two "shifts", two "truncs" and one "concat" insn for "odd"
46978 and two "truncs" and one "concat" insn for "even".
46979 We should have already failed all two-instruction sequences. */
46980
46981 static bool
46982 expand_vec_perm_even_odd_trunc (struct expand_vec_perm_d *d)
46983 {
46984 rtx t1, t2, t3, t4;
46985 unsigned i, odd, nelt = d->nelt;
46986
46987 if (!TARGET_AVX512BW
46988 || d->one_operand_p
46989 || d->vmode != V64QImode)
46990 return false;
46991
46992 /* Check that permutation is even or odd. */
46993 odd = d->perm[0];
46994 if (odd > 1)
46995 return false;
46996
46997 for (i = 1; i < nelt; ++i)
46998 if (d->perm[i] != 2 * i + odd)
46999 return false;
47000
47001 if (d->testing_p)
47002 return true;
47003
47004
47005 if (odd)
47006 {
47007 t1 = gen_reg_rtx (V32HImode);
47008 t2 = gen_reg_rtx (V32HImode);
47009 emit_insn (gen_lshrv32hi3 (t1,
47010 gen_lowpart (V32HImode, d->op0),
47011 GEN_INT (8)));
47012 emit_insn (gen_lshrv32hi3 (t2,
47013 gen_lowpart (V32HImode, d->op1),
47014 GEN_INT (8)));
47015 }
47016 else
47017 {
47018 t1 = gen_lowpart (V32HImode, d->op0);
47019 t2 = gen_lowpart (V32HImode, d->op1);
47020 }
47021
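/* For extract-odd the shifts above move each odd source byte into the
   low byte of its V32HImode word; vpmovwb (the truncations below) keeps
   the low byte of every word, and the two V32QImode halves are then
   concatenated into the V64QImode result.  For extract-even the bytes
   are already in the low positions, so the shifts are skipped.  */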
47022 t3 = gen_reg_rtx (V32QImode);
47023 t4 = gen_reg_rtx (V32QImode);
47024 emit_insn (gen_avx512bw_truncatev32hiv32qi2 (t3, t1));
47025 emit_insn (gen_avx512bw_truncatev32hiv32qi2 (t4, t2));
47026 emit_insn (gen_avx_vec_concatv64qi (d->target, t3, t4));
47027
47028 return true;
47029 }
47030
47031 /* A subroutine of ix86_expand_vec_perm_builtin_1. Implement extract-even
47032 and extract-odd permutations. */
47033
47034 static bool
47035 expand_vec_perm_even_odd_1 (struct expand_vec_perm_d *d, unsigned odd)
47036 {
47037 rtx t1, t2, t3, t4, t5;
47038
47039 switch (d->vmode)
47040 {
47041 case V4DFmode:
47042 if (d->testing_p)
47043 break;
47044 t1 = gen_reg_rtx (V4DFmode);
47045 t2 = gen_reg_rtx (V4DFmode);
47046
47047 /* Shuffle the lanes around into { 0 1 4 5 } and { 2 3 6 7 }. */
47048 emit_insn (gen_avx_vperm2f128v4df3 (t1, d->op0, d->op1, GEN_INT (0x20)));
47049 emit_insn (gen_avx_vperm2f128v4df3 (t2, d->op0, d->op1, GEN_INT (0x31)));
47050
47051 /* Now an unpck[lh]pd will produce the result required. */
47052 if (odd)
47053 t3 = gen_avx_unpckhpd256 (d->target, t1, t2);
47054 else
47055 t3 = gen_avx_unpcklpd256 (d->target, t1, t2);
47056 emit_insn (t3);
47057 break;
47058
47059 case V8SFmode:
47060 {
47061 int mask = odd ? 0xdd : 0x88;
47062
47063 if (d->testing_p)
47064 break;
47065 t1 = gen_reg_rtx (V8SFmode);
47066 t2 = gen_reg_rtx (V8SFmode);
47067 t3 = gen_reg_rtx (V8SFmode);
47068
47069 /* Shuffle within the 128-bit lanes to produce:
47070 { 0 2 8 a 4 6 c e } | { 1 3 9 b 5 7 d f }. */
47071 emit_insn (gen_avx_shufps256 (t1, d->op0, d->op1,
47072 GEN_INT (mask)));
47073
47074 /* Shuffle the lanes around to produce:
47075 { 4 6 c e 0 2 8 a } and { 5 7 d f 1 3 9 b }. */
47076 emit_insn (gen_avx_vperm2f128v8sf3 (t2, t1, t1,
47077 GEN_INT (0x3)));
47078
47079 /* Shuffle within the 128-bit lanes to produce:
47080 { 0 2 4 6 4 6 0 2 } | { 1 3 5 7 5 7 1 3 }. */
47081 emit_insn (gen_avx_shufps256 (t3, t1, t2, GEN_INT (0x44)));
47082
47083 /* Shuffle within the 128-bit lanes to produce:
47084 { 8 a c e c e 8 a } | { 9 b d f d f 9 b }. */
47085 emit_insn (gen_avx_shufps256 (t2, t1, t2, GEN_INT (0xee)));
47086
47087 /* Shuffle the lanes around to produce:
47088 { 0 2 4 6 8 a c e } | { 1 3 5 7 9 b d f }. */
47089 emit_insn (gen_avx_vperm2f128v8sf3 (d->target, t3, t2,
47090 GEN_INT (0x20)));
47091 }
47092 break;
47093
47094 case V2DFmode:
47095 case V4SFmode:
47096 case V2DImode:
47097 case V4SImode:
47098 /* These are always directly implementable by expand_vec_perm_1. */
47099 gcc_unreachable ();
47100
47101 case V8HImode:
47102 if (TARGET_SSE4_1)
47103 return expand_vec_perm_even_odd_pack (d);
47104 else if (TARGET_SSSE3 && !TARGET_SLOW_PSHUFB)
47105 return expand_vec_perm_pshufb2 (d);
47106 else
47107 {
47108 if (d->testing_p)
47109 break;
47110 /* We need 2*log2(N)-1 operations to achieve odd/even
47111 with interleave. */
47112 t1 = gen_reg_rtx (V8HImode);
47113 t2 = gen_reg_rtx (V8HImode);
47114 emit_insn (gen_vec_interleave_highv8hi (t1, d->op0, d->op1));
47115 emit_insn (gen_vec_interleave_lowv8hi (d->target, d->op0, d->op1));
47116 emit_insn (gen_vec_interleave_highv8hi (t2, d->target, t1));
47117 emit_insn (gen_vec_interleave_lowv8hi (d->target, d->target, t1));
47118 if (odd)
47119 t3 = gen_vec_interleave_highv8hi (d->target, d->target, t2);
47120 else
47121 t3 = gen_vec_interleave_lowv8hi (d->target, d->target, t2);
47122 emit_insn (t3);
47123 }
47124 break;
47125
47126 case V16QImode:
47127 return expand_vec_perm_even_odd_pack (d);
47128
47129 case V16HImode:
47130 case V32QImode:
47131 return expand_vec_perm_even_odd_pack (d);
47132
47133 case V64QImode:
47134 return expand_vec_perm_even_odd_trunc (d);
47135
47136 case V4DImode:
47137 if (!TARGET_AVX2)
47138 {
47139 struct expand_vec_perm_d d_copy = *d;
47140 d_copy.vmode = V4DFmode;
47141 if (d->testing_p)
47142 d_copy.target = gen_raw_REG (V4DFmode, LAST_VIRTUAL_REGISTER + 1);
47143 else
47144 d_copy.target = gen_reg_rtx (V4DFmode);
47145 d_copy.op0 = gen_lowpart (V4DFmode, d->op0);
47146 d_copy.op1 = gen_lowpart (V4DFmode, d->op1);
47147 if (expand_vec_perm_even_odd_1 (&d_copy, odd))
47148 {
47149 if (!d->testing_p)
47150 emit_move_insn (d->target,
47151 gen_lowpart (V4DImode, d_copy.target));
47152 return true;
47153 }
47154 return false;
47155 }
47156
47157 if (d->testing_p)
47158 break;
47159
47160 t1 = gen_reg_rtx (V4DImode);
47161 t2 = gen_reg_rtx (V4DImode);
47162
47163 /* Shuffle the lanes around into { 0 1 4 5 } and { 2 3 6 7 }. */
47164 emit_insn (gen_avx2_permv2ti (t1, d->op0, d->op1, GEN_INT (0x20)));
47165 emit_insn (gen_avx2_permv2ti (t2, d->op0, d->op1, GEN_INT (0x31)));
47166
47167 /* Now a vpunpck[lh]qdq will produce the result required. */
47168 if (odd)
47169 t3 = gen_avx2_interleave_highv4di (d->target, t1, t2);
47170 else
47171 t3 = gen_avx2_interleave_lowv4di (d->target, t1, t2);
47172 emit_insn (t3);
47173 break;
47174
47175 case V8SImode:
47176 if (!TARGET_AVX2)
47177 {
47178 struct expand_vec_perm_d d_copy = *d;
47179 d_copy.vmode = V8SFmode;
47180 if (d->testing_p)
47181 d_copy.target = gen_raw_REG (V8SFmode, LAST_VIRTUAL_REGISTER + 1);
47182 else
47183 d_copy.target = gen_reg_rtx (V8SFmode);
47184 d_copy.op0 = gen_lowpart (V8SFmode, d->op0);
47185 d_copy.op1 = gen_lowpart (V8SFmode, d->op1);
47186 if (expand_vec_perm_even_odd_1 (&d_copy, odd))
47187 {
47188 if (!d->testing_p)
47189 emit_move_insn (d->target,
47190 gen_lowpart (V8SImode, d_copy.target));
47191 return true;
47192 }
47193 return false;
47194 }
47195
47196 if (d->testing_p)
47197 break;
47198
47199 t1 = gen_reg_rtx (V8SImode);
47200 t2 = gen_reg_rtx (V8SImode);
47201 t3 = gen_reg_rtx (V4DImode);
47202 t4 = gen_reg_rtx (V4DImode);
47203 t5 = gen_reg_rtx (V4DImode);
47204
47205 /* Shuffle the lanes around into
47206 { 0 1 2 3 8 9 a b } and { 4 5 6 7 c d e f }. */
47207 emit_insn (gen_avx2_permv2ti (t3, gen_lowpart (V4DImode, d->op0),
47208 gen_lowpart (V4DImode, d->op1),
47209 GEN_INT (0x20)));
47210 emit_insn (gen_avx2_permv2ti (t4, gen_lowpart (V4DImode, d->op0),
47211 gen_lowpart (V4DImode, d->op1),
47212 GEN_INT (0x31)));
47213
47214 /* Swap the 2nd and 3rd position in each lane into
47215 { 0 2 1 3 8 a 9 b } and { 4 6 5 7 c e d f }. */
47216 emit_insn (gen_avx2_pshufdv3 (t1, gen_lowpart (V8SImode, t3),
47217 GEN_INT (2 * 4 + 1 * 16 + 3 * 64)));
47218 emit_insn (gen_avx2_pshufdv3 (t2, gen_lowpart (V8SImode, t4),
47219 GEN_INT (2 * 4 + 1 * 16 + 3 * 64)));
47220
47221 /* Now a vpunpck[lh]qdq will produce
47222 { 0 2 4 6 8 a c e } resp. { 1 3 5 7 9 b d f }. */
47223 if (odd)
47224 t3 = gen_avx2_interleave_highv4di (t5, gen_lowpart (V4DImode, t1),
47225 gen_lowpart (V4DImode, t2));
47226 else
47227 t3 = gen_avx2_interleave_lowv4di (t5, gen_lowpart (V4DImode, t1),
47228 gen_lowpart (V4DImode, t2));
47229 emit_insn (t3);
47230 emit_move_insn (d->target, gen_lowpart (V8SImode, t5));
47231 break;
47232
47233 default:
47234 gcc_unreachable ();
47235 }
47236
47237 return true;
47238 }
47239
47240 /* A subroutine of ix86_expand_vec_perm_builtin_1. Pattern match
47241 extract-even and extract-odd permutations. */
47242
47243 static bool
47244 expand_vec_perm_even_odd (struct expand_vec_perm_d *d)
47245 {
47246 unsigned i, odd, nelt = d->nelt;
47247
47248 odd = d->perm[0];
47249 if (odd != 0 && odd != 1)
47250 return false;
47251
47252 for (i = 1; i < nelt; ++i)
47253 if (d->perm[i] != 2 * i + odd)
47254 return false;
47255
47256 return expand_vec_perm_even_odd_1 (d, odd);
47257 }
47258
47259 /* A subroutine of ix86_expand_vec_perm_builtin_1. Implement broadcast
47260 permutations. We assume that expand_vec_perm_1 has already failed. */
47261
47262 static bool
47263 expand_vec_perm_broadcast_1 (struct expand_vec_perm_d *d)
47264 {
47265 unsigned elt = d->perm[0], nelt2 = d->nelt / 2;
47266 machine_mode vmode = d->vmode;
47267 unsigned char perm2[4];
47268 rtx op0 = d->op0, dest;
47269 bool ok;
47270
47271 switch (vmode)
47272 {
47273 case V4DFmode:
47274 case V8SFmode:
47275 /* These are special-cased in sse.md so that we can optionally
47276 use the vbroadcast instruction. They expand to two insns
47277 if the input happens to be in a register. */
47278 gcc_unreachable ();
47279
47280 case V2DFmode:
47281 case V2DImode:
47282 case V4SFmode:
47283 case V4SImode:
47284 /* These are always implementable using standard shuffle patterns. */
47285 gcc_unreachable ();
47286
47287 case V8HImode:
47288 case V16QImode:
47289 /* These can be implemented via interleave. We save one insn by
47290 stopping once we have promoted to V4SImode and then use pshufd. */
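/* E.g. broadcasting element 5 of a V8HImode vector: one punpckhwd of
   the vector with itself gives { 4 4 5 5 6 6 7 7 }; viewed as V4SImode,
   a pshufd replicating dword 1 then leaves element 5 in every
   position.  */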
47291 if (d->testing_p)
47292 return true;
47293 do
47294 {
47295 rtx dest;
47296 rtx (*gen) (rtx, rtx, rtx)
47297 = vmode == V16QImode ? gen_vec_interleave_lowv16qi
47298 : gen_vec_interleave_lowv8hi;
47299
47300 if (elt >= nelt2)
47301 {
47302 gen = vmode == V16QImode ? gen_vec_interleave_highv16qi
47303 : gen_vec_interleave_highv8hi;
47304 elt -= nelt2;
47305 }
47306 nelt2 /= 2;
47307
47308 dest = gen_reg_rtx (vmode);
47309 emit_insn (gen (dest, op0, op0));
47310 vmode = get_mode_wider_vector (vmode);
47311 op0 = gen_lowpart (vmode, dest);
47312 }
47313 while (vmode != V4SImode);
47314
47315 memset (perm2, elt, 4);
47316 dest = gen_reg_rtx (V4SImode);
47317 ok = expand_vselect (dest, op0, perm2, 4, d->testing_p);
47318 gcc_assert (ok);
47319 if (!d->testing_p)
47320 emit_move_insn (d->target, gen_lowpart (d->vmode, dest));
47321 return true;
47322
47323 case V64QImode:
47324 case V32QImode:
47325 case V16HImode:
47326 case V8SImode:
47327 case V4DImode:
47328 /* For AVX2 broadcasts of the first element vpbroadcast* or
47329 vpermq should be used by expand_vec_perm_1. */
47330 gcc_assert (!TARGET_AVX2 || d->perm[0]);
47331 return false;
47332
47333 default:
47334 gcc_unreachable ();
47335 }
47336 }
47337
47338 /* A subroutine of ix86_expand_vec_perm_builtin_1. Pattern match
47339 broadcast permutations. */
47340
47341 static bool
47342 expand_vec_perm_broadcast (struct expand_vec_perm_d *d)
47343 {
47344 unsigned i, elt, nelt = d->nelt;
47345
47346 if (!d->one_operand_p)
47347 return false;
47348
47349 elt = d->perm[0];
47350 for (i = 1; i < nelt; ++i)
47351 if (d->perm[i] != elt)
47352 return false;
47353
47354 return expand_vec_perm_broadcast_1 (d);
47355 }
47356
47357 /* Implement arbitrary permutations of two V64QImode operands
47358 with 2 vpermi2w, 2 vpshufb and one vpor instruction. */
47359 static bool
47360 expand_vec_perm_vpermi2_vpshub2 (struct expand_vec_perm_d *d)
47361 {
47362 if (!TARGET_AVX512BW || !(d->vmode == V64QImode))
47363 return false;
47364
47365 if (d->testing_p)
47366 return true;
47367
47368 struct expand_vec_perm_d ds[2];
47369 rtx rperm[128], vperm, target0, target1;
47370 unsigned int i, nelt;
47371 machine_mode vmode;
47372
47373 nelt = d->nelt;
47374 vmode = V64QImode;
47375
47376 for (i = 0; i < 2; i++)
47377 {
47378 ds[i] = *d;
47379 ds[i].vmode = V32HImode;
47380 ds[i].nelt = 32;
47381 ds[i].target = gen_reg_rtx (V32HImode);
47382 ds[i].op0 = gen_lowpart (V32HImode, d->op0);
47383 ds[i].op1 = gen_lowpart (V32HImode, d->op1);
47384 }
47385
47386 /* Prepare permutations such that the first one takes care of
47387 putting the even bytes into the right positions or one position
47388 higher (ds[0]) and the second one takes care of
47389 putting the odd bytes into the right positions or one position
47390 lower (ds[1]). */
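/* Concretely, ds[0] is a V32HImode permutation that, for each even
   destination byte I, brings the word containing source byte d->perm[I]
   into word I/2; the first vpshufb mask below then keeps only the wanted
   byte of that word (offset (I & 14) + (d->perm[I] & 1) within its
   16-byte lane) and zeroes the odd destination bytes by using pshufb
   indexes with bit 7 set.  ds[1] and the second mask do the same for the
   odd destination bytes, and the final vpor merges the two results.  */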
47391
47392 for (i = 0; i < nelt; i++)
47393 {
47394 ds[i & 1].perm[i / 2] = d->perm[i] / 2;
47395 if (i & 1)
47396 {
47397 rperm[i] = constm1_rtx;
47398 rperm[i + 64] = GEN_INT ((i & 14) + (d->perm[i] & 1));
47399 }
47400 else
47401 {
47402 rperm[i] = GEN_INT ((i & 14) + (d->perm[i] & 1));
47403 rperm[i + 64] = constm1_rtx;
47404 }
47405 }
47406
47407 bool ok = expand_vec_perm_1 (&ds[0]);
47408 gcc_assert (ok);
47409 ds[0].target = gen_lowpart (V64QImode, ds[0].target);
47410
47411 ok = expand_vec_perm_1 (&ds[1]);
47412 gcc_assert (ok);
47413 ds[1].target = gen_lowpart (V64QImode, ds[1].target);
47414
47415 vperm = gen_rtx_CONST_VECTOR (V64QImode, gen_rtvec_v (64, rperm));
47416 vperm = force_reg (vmode, vperm);
47417 target0 = gen_reg_rtx (V64QImode);
47418 emit_insn (gen_avx512bw_pshufbv64qi3 (target0, ds[0].target, vperm));
47419
47420 vperm = gen_rtx_CONST_VECTOR (V64QImode, gen_rtvec_v (64, rperm + 64));
47421 vperm = force_reg (vmode, vperm);
47422 target1 = gen_reg_rtx (V64QImode);
47423 emit_insn (gen_avx512bw_pshufbv64qi3 (target1, ds[1].target, vperm));
47424
47425 emit_insn (gen_iorv64qi3 (d->target, target0, target1));
47426 return true;
47427 }
47428
47429 /* Implement arbitrary permutation of two V32QImode or V16HImode operands
47430 with 4 vpshufb insns, 2 vpermq and 3 vpor. We should have already failed
47431 all the shorter instruction sequences. */
47432
47433 static bool
47434 expand_vec_perm_vpshufb4_vpermq2 (struct expand_vec_perm_d *d)
47435 {
47436 rtx rperm[4][32], vperm, l[2], h[2], op, m128;
47437 unsigned int i, nelt, eltsz;
47438 bool used[4];
47439
47440 if (!TARGET_AVX2
47441 || d->one_operand_p
47442 || (d->vmode != V32QImode && d->vmode != V16HImode))
47443 return false;
47444
47445 if (d->testing_p)
47446 return true;
47447
47448 nelt = d->nelt;
47449 eltsz = GET_MODE_UNIT_SIZE (d->vmode);
47450
47451 /* Generate 4 permutation masks. In the same-lane masks, if the
47452 required element is within the same lane, it is shuffled in; if the
47453 required element is from the other lane, a zero is forced by setting
47454 bit 7 in the permutation mask. The cross-lane masks have non-negative
47455 elements where the element is requested from the other lane, but place
47456 them in the mirrored lane, so that after vpshufb the two V2TImode
47457 halves of the result can be swapped into place. */
47458 m128 = GEN_INT (-128);
47459 for (i = 0; i < 32; ++i)
47460 {
47461 rperm[0][i] = m128;
47462 rperm[1][i] = m128;
47463 rperm[2][i] = m128;
47464 rperm[3][i] = m128;
47465 }
47466 used[0] = false;
47467 used[1] = false;
47468 used[2] = false;
47469 used[3] = false;
47470 for (i = 0; i < nelt; ++i)
47471 {
47472 unsigned j, e = d->perm[i] & (nelt / 2 - 1);
47473 unsigned xlane = ((d->perm[i] ^ i) & (nelt / 2)) * eltsz;
47474 unsigned int which = ((d->perm[i] & nelt) ? 2 : 0) + (xlane ? 1 : 0);
47475
47476 for (j = 0; j < eltsz; ++j)
47477 rperm[which][(i * eltsz + j) ^ xlane] = GEN_INT (e * eltsz + j);
47478 used[which] = true;
47479 }
47480
47481 for (i = 0; i < 2; ++i)
47482 {
47483 if (!used[2 * i + 1])
47484 {
47485 h[i] = NULL_RTX;
47486 continue;
47487 }
47488 vperm = gen_rtx_CONST_VECTOR (V32QImode,
47489 gen_rtvec_v (32, rperm[2 * i + 1]));
47490 vperm = force_reg (V32QImode, vperm);
47491 h[i] = gen_reg_rtx (V32QImode);
47492 op = gen_lowpart (V32QImode, i ? d->op1 : d->op0);
47493 emit_insn (gen_avx2_pshufbv32qi3 (h[i], op, vperm));
47494 }
47495
47496 /* Swap the 128-bit lanes of h[X]. */
47497 for (i = 0; i < 2; ++i)
47498 {
47499 if (h[i] == NULL_RTX)
47500 continue;
47501 op = gen_reg_rtx (V4DImode);
47502 emit_insn (gen_avx2_permv4di_1 (op, gen_lowpart (V4DImode, h[i]),
47503 const2_rtx, GEN_INT (3), const0_rtx,
47504 const1_rtx));
47505 h[i] = gen_lowpart (V32QImode, op);
47506 }
47507
47508 for (i = 0; i < 2; ++i)
47509 {
47510 if (!used[2 * i])
47511 {
47512 l[i] = NULL_RTX;
47513 continue;
47514 }
47515 vperm = gen_rtx_CONST_VECTOR (V32QImode, gen_rtvec_v (32, rperm[2 * i]));
47516 vperm = force_reg (V32QImode, vperm);
47517 l[i] = gen_reg_rtx (V32QImode);
47518 op = gen_lowpart (V32QImode, i ? d->op1 : d->op0);
47519 emit_insn (gen_avx2_pshufbv32qi3 (l[i], op, vperm));
47520 }
47521
47522 for (i = 0; i < 2; ++i)
47523 {
47524 if (h[i] && l[i])
47525 {
47526 op = gen_reg_rtx (V32QImode);
47527 emit_insn (gen_iorv32qi3 (op, l[i], h[i]));
47528 l[i] = op;
47529 }
47530 else if (h[i])
47531 l[i] = h[i];
47532 }
47533
47534 gcc_assert (l[0] && l[1]);
47535 op = d->target;
47536 if (d->vmode != V32QImode)
47537 op = gen_reg_rtx (V32QImode);
47538 emit_insn (gen_iorv32qi3 (op, l[0], l[1]));
47539 if (op != d->target)
47540 emit_move_insn (d->target, gen_lowpart (d->vmode, op));
47541 return true;
47542 }
47543
47544 /* The guts of ix86_expand_vec_perm_const, also used by the ok hook.
47545 With all of the interface bits taken care of, perform the expansion
47546 in D and return true on success. */
47547
47548 static bool
47549 ix86_expand_vec_perm_const_1 (struct expand_vec_perm_d *d)
47550 {
47551 /* Try a single instruction expansion. */
47552 if (expand_vec_perm_1 (d))
47553 return true;
47554
47555 /* Try sequences of two instructions. */
47556
47557 if (expand_vec_perm_pshuflw_pshufhw (d))
47558 return true;
47559
47560 if (expand_vec_perm_palignr (d, false))
47561 return true;
47562
47563 if (expand_vec_perm_interleave2 (d))
47564 return true;
47565
47566 if (expand_vec_perm_broadcast (d))
47567 return true;
47568
47569 if (expand_vec_perm_vpermq_perm_1 (d))
47570 return true;
47571
47572 if (expand_vec_perm_vperm2f128 (d))
47573 return true;
47574
47575 if (expand_vec_perm_pblendv (d))
47576 return true;
47577
47578 /* Try sequences of three instructions. */
47579
47580 if (expand_vec_perm_even_odd_pack (d))
47581 return true;
47582
47583 if (expand_vec_perm_2vperm2f128_vshuf (d))
47584 return true;
47585
47586 if (expand_vec_perm_pshufb2 (d))
47587 return true;
47588
47589 if (expand_vec_perm_interleave3 (d))
47590 return true;
47591
47592 if (expand_vec_perm_vperm2f128_vblend (d))
47593 return true;
47594
47595 /* Try sequences of four instructions. */
47596
47597 if (expand_vec_perm_even_odd_trunc (d))
47598 return true;
47599 if (expand_vec_perm_vpshufb2_vpermq (d))
47600 return true;
47601
47602 if (expand_vec_perm_vpshufb2_vpermq_even_odd (d))
47603 return true;
47604
47605 if (expand_vec_perm_vpermi2_vpshub2 (d))
47606 return true;
47607
47608 /* ??? Look for narrow permutations whose element orderings would
47609 allow the promotion to a wider mode. */
47610
47611 /* ??? Look for sequences of interleave or a wider permute that place
47612 the data into the correct lanes for a half-vector shuffle like
47613 pshuf[lh]w or vpermilps. */
47614
47615 /* ??? Look for sequences of interleave that produce the desired results.
47616 The combinatorics of punpck[lh] get pretty ugly... */
47617
47618 if (expand_vec_perm_even_odd (d))
47619 return true;
47620
47621 /* Even longer sequences. */
47622 if (expand_vec_perm_vpshufb4_vpermq2 (d))
47623 return true;
47624
47625 /* See if we can get the same permutation in different vector integer
47626 mode. */
47627 struct expand_vec_perm_d nd;
47628 if (canonicalize_vector_int_perm (d, &nd) && expand_vec_perm_1 (&nd))
47629 {
47630 if (!d->testing_p)
47631 emit_move_insn (d->target, gen_lowpart (d->vmode, nd.target));
47632 return true;
47633 }
47634
47635 return false;
47636 }
47637
47638 /* If a permutation only uses one operand, make it clear. Returns true
47639 if the permutation references both operands. */
47640
47641 static bool
47642 canonicalize_perm (struct expand_vec_perm_d *d)
47643 {
47644 int i, which, nelt = d->nelt;
47645
47646 for (i = which = 0; i < nelt; ++i)
47647 which |= (d->perm[i] < nelt ? 1 : 2);
47648
47649 d->one_operand_p = true;
47650 switch (which)
47651 {
47652 default:
47653 gcc_unreachable();
47654
47655 case 3:
47656 if (!rtx_equal_p (d->op0, d->op1))
47657 {
47658 d->one_operand_p = false;
47659 break;
47660 }
47661 /* The elements of PERM do not suggest that only the first operand
47662 is used, but both operands are identical. Allow easier matching
47663 of the permutation by folding the permutation into the single
47664 input vector. */
47665 /* FALLTHRU */
47666
47667 case 2:
47668 for (i = 0; i < nelt; ++i)
47669 d->perm[i] &= nelt - 1;
47670 d->op0 = d->op1;
47671 break;
47672
47673 case 1:
47674 d->op1 = d->op0;
47675 break;
47676 }
47677
47678 return (which == 3);
47679 }
47680
47681 bool
47682 ix86_expand_vec_perm_const (rtx operands[4])
47683 {
47684 struct expand_vec_perm_d d;
47685 unsigned char perm[MAX_VECT_LEN];
47686 int i, nelt;
47687 bool two_args;
47688 rtx sel;
47689
47690 d.target = operands[0];
47691 d.op0 = operands[1];
47692 d.op1 = operands[2];
47693 sel = operands[3];
47694
47695 d.vmode = GET_MODE (d.target);
47696 gcc_assert (VECTOR_MODE_P (d.vmode));
47697 d.nelt = nelt = GET_MODE_NUNITS (d.vmode);
47698 d.testing_p = false;
47699
47700 gcc_assert (GET_CODE (sel) == CONST_VECTOR);
47701 gcc_assert (XVECLEN (sel, 0) == nelt);
47702 gcc_checking_assert (sizeof (d.perm) == sizeof (perm));
47703
47704 for (i = 0; i < nelt; ++i)
47705 {
47706 rtx e = XVECEXP (sel, 0, i);
47707 int ei = INTVAL (e) & (2 * nelt - 1);
47708 d.perm[i] = ei;
47709 perm[i] = ei;
47710 }
47711
47712 two_args = canonicalize_perm (&d);
47713
47714 if (ix86_expand_vec_perm_const_1 (&d))
47715 return true;
47716
47717 /* If the selector says both arguments are needed, but the operands are the
47718 same, the above tried to expand with one_operand_p set and a flattened
47719 selector. If that didn't work, retry with one_operand_p cleared; we
47720 succeeded with that during testing. */
47721 if (two_args && d.one_operand_p)
47722 {
47723 d.one_operand_p = false;
47724 memcpy (d.perm, perm, sizeof (perm));
47725 return ix86_expand_vec_perm_const_1 (&d);
47726 }
47727
47728 return false;
47729 }
47730
47731 /* Implement targetm.vectorize.vec_perm_const_ok. */
47732
47733 static bool
47734 ix86_vectorize_vec_perm_const_ok (machine_mode vmode,
47735 const unsigned char *sel)
47736 {
47737 struct expand_vec_perm_d d;
47738 unsigned int i, nelt, which;
47739 bool ret;
47740
47741 d.vmode = vmode;
47742 d.nelt = nelt = GET_MODE_NUNITS (d.vmode);
47743 d.testing_p = true;
47744
47745 /* Given sufficient ISA support we can just return true here
47746 for selected vector modes. */
47747 switch (d.vmode)
47748 {
47749 case V16SFmode:
47750 case V16SImode:
47751 case V8DImode:
47752 case V8DFmode:
47753 if (TARGET_AVX512F)
47754 /* All implementable with a single vpermi2 insn. */
47755 return true;
47756 break;
47757 case V32HImode:
47758 if (TARGET_AVX512BW)
47759 /* All implementable with a single vpermi2 insn. */
47760 return true;
47761 break;
47762 case V64QImode:
47763 if (TARGET_AVX512BW)
47764 /* Implementable with 2 vpermi2, 2 vpshufb and 1 or insn. */
47765 return true;
47766 break;
47767 case V8SImode:
47768 case V8SFmode:
47769 case V4DFmode:
47770 case V4DImode:
47771 if (TARGET_AVX512VL)
47772 /* All implementable with a single vpermi2 insn. */
47773 return true;
47774 break;
47775 case V16HImode:
47776 if (TARGET_AVX2)
47777 /* Implementable with 4 vpshufb insns, 2 vpermq and 3 vpor insns. */
47778 return true;
47779 break;
47780 case V32QImode:
47781 if (TARGET_AVX2)
47782 /* Implementable with 4 vpshufb insns, 2 vpermq and 3 vpor insns. */
47783 return true;
47784 break;
47785 case V4SImode:
47786 case V4SFmode:
47787 case V8HImode:
47788 case V16QImode:
47789 /* All implementable with a single vpperm insn. */
47790 if (TARGET_XOP)
47791 return true;
47792 /* All implementable with 2 pshufb + 1 ior. */
47793 if (TARGET_SSSE3)
47794 return true;
47795 break;
47796 case V2DImode:
47797 case V2DFmode:
47798 /* All implementable with shufpd or unpck[lh]pd. */
47799 return true;
47800 default:
47801 return false;
47802 }
47803
47804 /* Copy the values from SEL into the permutation
47805 array in D. */
47806 memcpy (d.perm, sel, nelt);
47807 for (i = which = 0; i < nelt; ++i)
47808 {
47809 unsigned char e = d.perm[i];
47810 gcc_assert (e < 2 * nelt);
47811 which |= (e < nelt ? 1 : 2);
47812 }
47813
47814 /* If all elements are from the second vector, fold them to the first. */
47815 if (which == 2)
47816 for (i = 0; i < nelt; ++i)
47817 d.perm[i] -= nelt;
47818
47819 /* Check whether the mask can be applied to the vector type. */
47820 d.one_operand_p = (which != 3);
47821
47822 /* Implementable with shufps or pshufd. */
47823 if (d.one_operand_p && (d.vmode == V4SFmode || d.vmode == V4SImode))
47824 return true;
47825
47826 /* Otherwise we have to go through the motions and see if we can
47827 figure out how to generate the requested permutation. */
47828 d.target = gen_raw_REG (d.vmode, LAST_VIRTUAL_REGISTER + 1);
47829 d.op1 = d.op0 = gen_raw_REG (d.vmode, LAST_VIRTUAL_REGISTER + 2);
47830 if (!d.one_operand_p)
47831 d.op1 = gen_raw_REG (d.vmode, LAST_VIRTUAL_REGISTER + 3);
47832
47833 start_sequence ();
47834 ret = ix86_expand_vec_perm_const_1 (&d);
47835 end_sequence ();
47836
47837 return ret;
47838 }
47839
47840 void
47841 ix86_expand_vec_extract_even_odd (rtx targ, rtx op0, rtx op1, unsigned odd)
47842 {
47843 struct expand_vec_perm_d d;
47844 unsigned i, nelt;
47845
47846 d.target = targ;
47847 d.op0 = op0;
47848 d.op1 = op1;
47849 d.vmode = GET_MODE (targ);
47850 d.nelt = nelt = GET_MODE_NUNITS (d.vmode);
47851 d.one_operand_p = false;
47852 d.testing_p = false;
47853
47854 for (i = 0; i < nelt; ++i)
47855 d.perm[i] = i * 2 + odd;
47856
47857 /* We'll either be able to implement the permutation directly... */
47858 if (expand_vec_perm_1 (&d))
47859 return;
47860
47861 /* ... or we use the special-case patterns. */
47862 expand_vec_perm_even_odd_1 (&d, odd);
47863 }
47864
47865 static void
47866 ix86_expand_vec_interleave (rtx targ, rtx op0, rtx op1, bool high_p)
47867 {
47868 struct expand_vec_perm_d d;
47869 unsigned i, nelt, base;
47870 bool ok;
47871
47872 d.target = targ;
47873 d.op0 = op0;
47874 d.op1 = op1;
47875 d.vmode = GET_MODE (targ);
47876 d.nelt = nelt = GET_MODE_NUNITS (d.vmode);
47877 d.one_operand_p = false;
47878 d.testing_p = false;
47879
47880 base = high_p ? nelt / 2 : 0;
47881 for (i = 0; i < nelt / 2; ++i)
47882 {
47883 d.perm[i * 2] = i + base;
47884 d.perm[i * 2 + 1] = i + base + nelt;
47885 }
47886
47887 /* Note that for AVX this isn't one instruction. */
47888 ok = ix86_expand_vec_perm_const_1 (&d);
47889 gcc_assert (ok);
47890 }
47891
47892
47893 /* Expand a vector operation CODE for a V*QImode in terms of the
47894 same operation on V*HImode. */
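/* E.g. a V16QImode MULT interleaves each operand with itself so that
   every source byte sits in the low byte of a V8HImode word, performs
   two V8HImode multiplies, and then gathers the low (even) bytes of the
   two products back into a single V16QImode vector with a constant
   permutation.  */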
47895
47896 void
47897 ix86_expand_vecop_qihi (enum rtx_code code, rtx dest, rtx op1, rtx op2)
47898 {
47899 machine_mode qimode = GET_MODE (dest);
47900 machine_mode himode;
47901 rtx (*gen_il) (rtx, rtx, rtx);
47902 rtx (*gen_ih) (rtx, rtx, rtx);
47903 rtx op1_l, op1_h, op2_l, op2_h, res_l, res_h;
47904 struct expand_vec_perm_d d;
47905 bool ok, full_interleave;
47906 bool uns_p = false;
47907 int i;
47908
47909 switch (qimode)
47910 {
47911 case V16QImode:
47912 himode = V8HImode;
47913 gen_il = gen_vec_interleave_lowv16qi;
47914 gen_ih = gen_vec_interleave_highv16qi;
47915 break;
47916 case V32QImode:
47917 himode = V16HImode;
47918 gen_il = gen_avx2_interleave_lowv32qi;
47919 gen_ih = gen_avx2_interleave_highv32qi;
47920 break;
47921 case V64QImode:
47922 himode = V32HImode;
47923 gen_il = gen_avx512bw_interleave_lowv64qi;
47924 gen_ih = gen_avx512bw_interleave_highv64qi;
47925 break;
47926 default:
47927 gcc_unreachable ();
47928 }
47929
47930 op2_l = op2_h = op2;
47931 switch (code)
47932 {
47933 case MULT:
47934 /* Unpack data such that we've got a source byte in each low byte of
47935 each word. We don't care what goes into the high byte of each word.
47936 Rather than trying to get zero in there, most convenient is to let
47937 it be a copy of the low byte. */
47938 op2_l = gen_reg_rtx (qimode);
47939 op2_h = gen_reg_rtx (qimode);
47940 emit_insn (gen_il (op2_l, op2, op2));
47941 emit_insn (gen_ih (op2_h, op2, op2));
47942 /* FALLTHRU */
47943
47944 op1_l = gen_reg_rtx (qimode);
47945 op1_h = gen_reg_rtx (qimode);
47946 emit_insn (gen_il (op1_l, op1, op1));
47947 emit_insn (gen_ih (op1_h, op1, op1));
47948 full_interleave = qimode == V16QImode;
47949 break;
47950
47951 case ASHIFT:
47952 case LSHIFTRT:
47953 uns_p = true;
47954 /* FALLTHRU */
47955 case ASHIFTRT:
47956 op1_l = gen_reg_rtx (himode);
47957 op1_h = gen_reg_rtx (himode);
47958 ix86_expand_sse_unpack (op1_l, op1, uns_p, false);
47959 ix86_expand_sse_unpack (op1_h, op1, uns_p, true);
47960 full_interleave = true;
47961 break;
47962 default:
47963 gcc_unreachable ();
47964 }
47965
47966 /* Perform the operation. */
47967 res_l = expand_simple_binop (himode, code, op1_l, op2_l, NULL_RTX,
47968 1, OPTAB_DIRECT);
47969 res_h = expand_simple_binop (himode, code, op1_h, op2_h, NULL_RTX,
47970 1, OPTAB_DIRECT);
47971 gcc_assert (res_l && res_h);
47972
47973 /* Merge the data back into the right place. */
47974 d.target = dest;
47975 d.op0 = gen_lowpart (qimode, res_l);
47976 d.op1 = gen_lowpart (qimode, res_h);
47977 d.vmode = qimode;
47978 d.nelt = GET_MODE_NUNITS (qimode);
47979 d.one_operand_p = false;
47980 d.testing_p = false;
47981
47982 if (full_interleave)
47983 {
47984 /* For SSE2, we used a full interleave, so the desired
47985 results are in the even elements. */
47986 for (i = 0; i < d.nelt; ++i)
47987 d.perm[i] = i * 2;
47988 }
47989 else
47990 {
47991 /* For AVX, the interleave used above was not cross-lane. So the
47992 extraction is evens but with the second and third quarters swapped.
47993 Happily, that is even one insn shorter than even extraction.
47994 For AVX512BW we have 4 lanes. We extract evens from within a lane,
47995 always first from the first and then from the second source operand,
47996 while the index bits above the low 4 bits remain the same.
47997 Thus, for d.nelt == 32 we want permutation
47998 0,2,4,..14, 32,34,36,..46, 16,18,20,..30, 48,50,52,..62
47999 and for d.nelt == 64 we want permutation
48000 0,2,4,..14, 64,66,68,..78, 16,18,20,..30, 80,82,84,..94,
48001 32,34,36,..46, 96,98,100,..110, 48,50,52,..62, 112,114,116,..126. */
48002 for (i = 0; i < d.nelt; ++i)
48003 d.perm[i] = ((i * 2) & 14) + ((i & 8) ? d.nelt : 0) + (i & ~15);
48004 }
48005
48006 ok = ix86_expand_vec_perm_const_1 (&d);
48007 gcc_assert (ok);
48008
48009 set_unique_reg_note (get_last_insn (), REG_EQUAL,
48010 gen_rtx_fmt_ee (code, qimode, op1, op2));
48011 }
48012
48013 /* Helper function of ix86_expand_mul_widen_evenodd. Return true
48014 if op is CONST_VECTOR with all odd elements equal to their
48015 preceding element. */
48016
48017 static bool
48018 const_vector_equal_evenodd_p (rtx op)
48019 {
48020 machine_mode mode = GET_MODE (op);
48021 int i, nunits = GET_MODE_NUNITS (mode);
48022 if (GET_CODE (op) != CONST_VECTOR
48023 || nunits != CONST_VECTOR_NUNITS (op))
48024 return false;
48025 for (i = 0; i < nunits; i += 2)
48026 if (CONST_VECTOR_ELT (op, i) != CONST_VECTOR_ELT (op, i + 1))
48027 return false;
48028 return true;
48029 }
48030
48031 void
48032 ix86_expand_mul_widen_evenodd (rtx dest, rtx op1, rtx op2,
48033 bool uns_p, bool odd_p)
48034 {
48035 machine_mode mode = GET_MODE (op1);
48036 machine_mode wmode = GET_MODE (dest);
48037 rtx x;
48038 rtx orig_op1 = op1, orig_op2 = op2;
48039
48040 if (!nonimmediate_operand (op1, mode))
48041 op1 = force_reg (mode, op1);
48042 if (!nonimmediate_operand (op2, mode))
48043 op2 = force_reg (mode, op2);
48044
48045 /* We only play even/odd games with vectors of SImode. */
48046 gcc_assert (mode == V4SImode || mode == V8SImode || mode == V16SImode);
48047
48048 /* If we're looking for the odd results, shift those members down to
48049 the even slots. For some cpus this is faster than a PSHUFD. */
48050 if (odd_p)
48051 {
48052 /* For XOP use vpmacsdqh, but only for smult, as it is only
48053 signed. */
48054 if (TARGET_XOP && mode == V4SImode && !uns_p)
48055 {
48056 x = force_reg (wmode, CONST0_RTX (wmode));
48057 emit_insn (gen_xop_pmacsdqh (dest, op1, op2, x));
48058 return;
48059 }
48060
48061 x = GEN_INT (GET_MODE_UNIT_BITSIZE (mode));
48062 if (!const_vector_equal_evenodd_p (orig_op1))
48063 op1 = expand_binop (wmode, lshr_optab, gen_lowpart (wmode, op1),
48064 x, NULL, 1, OPTAB_DIRECT);
48065 if (!const_vector_equal_evenodd_p (orig_op2))
48066 op2 = expand_binop (wmode, lshr_optab, gen_lowpart (wmode, op2),
48067 x, NULL, 1, OPTAB_DIRECT);
48068 op1 = gen_lowpart (mode, op1);
48069 op2 = gen_lowpart (mode, op2);
48070 }
48071
48072 if (mode == V16SImode)
48073 {
48074 if (uns_p)
48075 x = gen_vec_widen_umult_even_v16si (dest, op1, op2);
48076 else
48077 x = gen_vec_widen_smult_even_v16si (dest, op1, op2);
48078 }
48079 else if (mode == V8SImode)
48080 {
48081 if (uns_p)
48082 x = gen_vec_widen_umult_even_v8si (dest, op1, op2);
48083 else
48084 x = gen_vec_widen_smult_even_v8si (dest, op1, op2);
48085 }
48086 else if (uns_p)
48087 x = gen_vec_widen_umult_even_v4si (dest, op1, op2);
48088 else if (TARGET_SSE4_1)
48089 x = gen_sse4_1_mulv2siv2di3 (dest, op1, op2);
48090 else
48091 {
48092 rtx s1, s2, t0, t1, t2;
48093
48094 /* The easiest way to implement this without PMULDQ is to go through
48095 the motions as if we are performing a full 64-bit multiply, except
48096 that we need to do less shuffling of the elements. */
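/* The identity used below: with HI(x) being 0 or 0xffffffff according
   to the sign of x,
     sext(a) * sext(b) == zext(a) * zext(b)
                          + ((HI(a) * b + HI(b) * a) << 32)   (mod 2^64),
   so the two compares materialize HI(a) and HI(b) and the cross
   products are added into the high half.  */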
48097
48098 /* Compute the sign-extension, aka highparts, of the two operands. */
48099 s1 = ix86_expand_sse_cmp (gen_reg_rtx (mode), GT, CONST0_RTX (mode),
48100 op1, pc_rtx, pc_rtx);
48101 s2 = ix86_expand_sse_cmp (gen_reg_rtx (mode), GT, CONST0_RTX (mode),
48102 op2, pc_rtx, pc_rtx);
48103
48104 /* Multiply LO(A) * HI(B), and vice-versa. */
48105 t1 = gen_reg_rtx (wmode);
48106 t2 = gen_reg_rtx (wmode);
48107 emit_insn (gen_vec_widen_umult_even_v4si (t1, s1, op2));
48108 emit_insn (gen_vec_widen_umult_even_v4si (t2, s2, op1));
48109
48110 /* Multiply LO(A) * LO(B). */
48111 t0 = gen_reg_rtx (wmode);
48112 emit_insn (gen_vec_widen_umult_even_v4si (t0, op1, op2));
48113
48114 /* Combine and shift the highparts into place. */
48115 t1 = expand_binop (wmode, add_optab, t1, t2, t1, 1, OPTAB_DIRECT);
48116 t1 = expand_binop (wmode, ashl_optab, t1, GEN_INT (32), t1,
48117 1, OPTAB_DIRECT);
48118
48119 /* Combine high and low parts. */
48120 force_expand_binop (wmode, add_optab, t0, t1, dest, 1, OPTAB_DIRECT);
48121 return;
48122 }
48123 emit_insn (x);
48124 }
48125
48126 void
48127 ix86_expand_mul_widen_hilo (rtx dest, rtx op1, rtx op2,
48128 bool uns_p, bool high_p)
48129 {
48130 machine_mode wmode = GET_MODE (dest);
48131 machine_mode mode = GET_MODE (op1);
48132 rtx t1, t2, t3, t4, mask;
48133
48134 switch (mode)
48135 {
48136 case V4SImode:
48137 t1 = gen_reg_rtx (mode);
48138 t2 = gen_reg_rtx (mode);
48139 if (TARGET_XOP && !uns_p)
48140 {
48141 /* With XOP, we have pmacsdqh, aka mul_widen_odd. In this case,
48142 shuffle the elements once so that all elements are in the right
48143 place for immediate use: { A C B D }. */
48144 emit_insn (gen_sse2_pshufd_1 (t1, op1, const0_rtx, const2_rtx,
48145 const1_rtx, GEN_INT (3)));
48146 emit_insn (gen_sse2_pshufd_1 (t2, op2, const0_rtx, const2_rtx,
48147 const1_rtx, GEN_INT (3)));
48148 }
48149 else
48150 {
48151 /* Put the elements into place for the multiply. */
48152 ix86_expand_vec_interleave (t1, op1, op1, high_p);
48153 ix86_expand_vec_interleave (t2, op2, op2, high_p);
48154 high_p = false;
48155 }
48156 ix86_expand_mul_widen_evenodd (dest, t1, t2, uns_p, high_p);
48157 break;
48158
48159 case V8SImode:
48160 /* Shuffle the elements between the lanes. After this we
48161 have { A B E F | C D G H } for each operand. */
48162 t1 = gen_reg_rtx (V4DImode);
48163 t2 = gen_reg_rtx (V4DImode);
48164 emit_insn (gen_avx2_permv4di_1 (t1, gen_lowpart (V4DImode, op1),
48165 const0_rtx, const2_rtx,
48166 const1_rtx, GEN_INT (3)));
48167 emit_insn (gen_avx2_permv4di_1 (t2, gen_lowpart (V4DImode, op2),
48168 const0_rtx, const2_rtx,
48169 const1_rtx, GEN_INT (3)));
48170
48171 /* Shuffle the elements within the lanes. After this we
48172 have { A A B B | C C D D } or { E E F F | G G H H }. */
48173 t3 = gen_reg_rtx (V8SImode);
48174 t4 = gen_reg_rtx (V8SImode);
48175 mask = GEN_INT (high_p
48176 ? 2 + (2 << 2) + (3 << 4) + (3 << 6)
48177 : 0 + (0 << 2) + (1 << 4) + (1 << 6));
48178 emit_insn (gen_avx2_pshufdv3 (t3, gen_lowpart (V8SImode, t1), mask));
48179 emit_insn (gen_avx2_pshufdv3 (t4, gen_lowpart (V8SImode, t2), mask));
48180
48181 ix86_expand_mul_widen_evenodd (dest, t3, t4, uns_p, false);
48182 break;
48183
48184 case V8HImode:
48185 case V16HImode:
48186 t1 = expand_binop (mode, smul_optab, op1, op2, NULL_RTX,
48187 uns_p, OPTAB_DIRECT);
48188 t2 = expand_binop (mode,
48189 uns_p ? umul_highpart_optab : smul_highpart_optab,
48190 op1, op2, NULL_RTX, uns_p, OPTAB_DIRECT);
48191 gcc_assert (t1 && t2);
48192
48193 t3 = gen_reg_rtx (mode);
48194 ix86_expand_vec_interleave (t3, t1, t2, high_p);
48195 emit_move_insn (dest, gen_lowpart (wmode, t3));
48196 break;
48197
48198 case V16QImode:
48199 case V32QImode:
48200 case V32HImode:
48201 case V16SImode:
48202 case V64QImode:
48203 t1 = gen_reg_rtx (wmode);
48204 t2 = gen_reg_rtx (wmode);
48205 ix86_expand_sse_unpack (t1, op1, uns_p, high_p);
48206 ix86_expand_sse_unpack (t2, op2, uns_p, high_p);
48207
48208 emit_insn (gen_rtx_SET (dest, gen_rtx_MULT (wmode, t1, t2)));
48209 break;
48210
48211 default:
48212 gcc_unreachable ();
48213 }
48214 }
48215
48216 void
48217 ix86_expand_sse2_mulv4si3 (rtx op0, rtx op1, rtx op2)
48218 {
48219 rtx res_1, res_2, res_3, res_4;
48220
48221 res_1 = gen_reg_rtx (V4SImode);
48222 res_2 = gen_reg_rtx (V4SImode);
48223 res_3 = gen_reg_rtx (V2DImode);
48224 res_4 = gen_reg_rtx (V2DImode);
48225 ix86_expand_mul_widen_evenodd (res_3, op1, op2, true, false);
48226 ix86_expand_mul_widen_evenodd (res_4, op1, op2, true, true);
48227
48228 /* Move the results in element 2 down to element 1; we don't care
48229 what goes in elements 2 and 3. Then we can merge the parts
48230 back together with an interleave.
48231
48232 Note that two other sequences were tried:
48233 (1) Use interleaves at the start instead of psrldq, which allows
48234 us to use a single shufps to merge things back at the end.
48235 (2) Use shufps here to combine the two vectors, then pshufd to
48236 put the elements in the correct order.
48237 In both cases the cost of the reformatting stall was too high
48238 and the overall sequence slower. */
48239
48240 emit_insn (gen_sse2_pshufd_1 (res_1, gen_lowpart (V4SImode, res_3),
48241 const0_rtx, const2_rtx,
48242 const0_rtx, const0_rtx));
48243 emit_insn (gen_sse2_pshufd_1 (res_2, gen_lowpart (V4SImode, res_4),
48244 const0_rtx, const2_rtx,
48245 const0_rtx, const0_rtx));
48246 res_1 = emit_insn (gen_vec_interleave_lowv4si (op0, res_1, res_2));
48247
48248 set_unique_reg_note (res_1, REG_EQUAL, gen_rtx_MULT (V4SImode, op1, op2));
48249 }
48250
48251 void
48252 ix86_expand_sse2_mulvxdi3 (rtx op0, rtx op1, rtx op2)
48253 {
48254 machine_mode mode = GET_MODE (op0);
48255 rtx t1, t2, t3, t4, t5, t6;
48256
48257 if (TARGET_AVX512DQ && mode == V8DImode)
48258 emit_insn (gen_avx512dq_mulv8di3 (op0, op1, op2));
48259 else if (TARGET_AVX512DQ && TARGET_AVX512VL && mode == V4DImode)
48260 emit_insn (gen_avx512dq_mulv4di3 (op0, op1, op2));
48261 else if (TARGET_AVX512DQ && TARGET_AVX512VL && mode == V2DImode)
48262 emit_insn (gen_avx512dq_mulv2di3 (op0, op1, op2));
48263 else if (TARGET_XOP && mode == V2DImode)
48264 {
48265 /* op1: A,B,C,D, op2: E,F,G,H */
48266 op1 = gen_lowpart (V4SImode, op1);
48267 op2 = gen_lowpart (V4SImode, op2);
48268
48269 t1 = gen_reg_rtx (V4SImode);
48270 t2 = gen_reg_rtx (V4SImode);
48271 t3 = gen_reg_rtx (V2DImode);
48272 t4 = gen_reg_rtx (V2DImode);
48273
48274 /* t1: B,A,D,C */
48275 emit_insn (gen_sse2_pshufd_1 (t1, op1,
48276 GEN_INT (1),
48277 GEN_INT (0),
48278 GEN_INT (3),
48279 GEN_INT (2)));
48280
48281 /* t2: (B*E),(A*F),(D*G),(C*H) */
48282 emit_insn (gen_mulv4si3 (t2, t1, op2));
48283
48284 /* t3: (B*E)+(A*F), (D*G)+(C*H) */
48285 emit_insn (gen_xop_phadddq (t3, t2));
48286
48287 /* t4: ((B*E)+(A*F))<<32, ((D*G)+(C*H))<<32 */
48288 emit_insn (gen_ashlv2di3 (t4, t3, GEN_INT (32)));
48289
48290 /* Multiply lower parts and add all. */
48291 t5 = gen_reg_rtx (V2DImode);
48292 emit_insn (gen_vec_widen_umult_even_v4si (t5,
48293 gen_lowpart (V4SImode, op1),
48294 gen_lowpart (V4SImode, op2)));
48295 op0 = expand_binop (mode, add_optab, t5, t4, op0, 1, OPTAB_DIRECT);
48296
48297 }
48298 else
48299 {
48300 machine_mode nmode;
48301 rtx (*umul) (rtx, rtx, rtx);
48302
48303 if (mode == V2DImode)
48304 {
48305 umul = gen_vec_widen_umult_even_v4si;
48306 nmode = V4SImode;
48307 }
48308 else if (mode == V4DImode)
48309 {
48310 umul = gen_vec_widen_umult_even_v8si;
48311 nmode = V8SImode;
48312 }
48313 else if (mode == V8DImode)
48314 {
48315 umul = gen_vec_widen_umult_even_v16si;
48316 nmode = V16SImode;
48317 }
48318 else
48319 gcc_unreachable ();
48320
48321
48322 /* Multiply low parts. */
48323 t1 = gen_reg_rtx (mode);
48324 emit_insn (umul (t1, gen_lowpart (nmode, op1), gen_lowpart (nmode, op2)));
48325
48326 /* Shift input vectors right 32 bits so we can multiply high parts. */
48327 t6 = GEN_INT (32);
48328 t2 = expand_binop (mode, lshr_optab, op1, t6, NULL, 1, OPTAB_DIRECT);
48329 t3 = expand_binop (mode, lshr_optab, op2, t6, NULL, 1, OPTAB_DIRECT);
48330
48331 /* Multiply high parts by low parts. */
48332 t4 = gen_reg_rtx (mode);
48333 t5 = gen_reg_rtx (mode);
48334 emit_insn (umul (t4, gen_lowpart (nmode, t2), gen_lowpart (nmode, op2)));
48335 emit_insn (umul (t5, gen_lowpart (nmode, t3), gen_lowpart (nmode, op1)));
48336
48337 /* Combine and shift the highparts back. */
48338 t4 = expand_binop (mode, add_optab, t4, t5, t4, 1, OPTAB_DIRECT);
48339 t4 = expand_binop (mode, ashl_optab, t4, t6, t4, 1, OPTAB_DIRECT);
48340
48341 /* Combine high and low parts. */
48342 force_expand_binop (mode, add_optab, t1, t4, op0, 1, OPTAB_DIRECT);
48343 }
48344
48345 set_unique_reg_note (get_last_insn (), REG_EQUAL,
48346 gen_rtx_MULT (mode, op1, op2));
48347 }
48348
48349 /* Return 1 if control transfer instruction INSN
48350 should be encoded with the bnd prefix.
48351 If INSN is NULL then return 1 when control
48352 transfer instructions should be prefixed with
48353 bnd by default for the current function. */
48354
48355 bool
48356 ix86_bnd_prefixed_insn_p (rtx insn)
48357 {
48358 /* For call insns check special flag. */
48359 if (insn && CALL_P (insn))
48360 {
48361 rtx call = get_call_rtx_from (insn);
48362 if (call)
48363 return CALL_EXPR_WITH_BOUNDS_P (call);
48364 }
48365
48366 /* All other insns are prefixed only if function is instrumented. */
48367 return chkp_function_instrumented_p (current_function_decl);
48368 }
48369
48370 /* Calculate integer abs() using only SSE2 instructions. */
48371
48372 void
48373 ix86_expand_sse2_abs (rtx target, rtx input)
48374 {
48375 machine_mode mode = GET_MODE (target);
48376 rtx tmp0, tmp1, x;
48377
48378 switch (mode)
48379 {
48380 /* For 32-bit signed integer X, the best way to calculate the absolute
48381 value of X is (((signed) X >> (W-1)) ^ X) - ((signed) X >> (W-1)). */
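/* E.g. for X = -5: X >> 31 == -1, (-1 ^ -5) == 4 and 4 - (-1) == 5,
   while for non-negative X the shift yields 0 and X is unchanged.  */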
48382 case V4SImode:
48383 tmp0 = expand_simple_binop (mode, ASHIFTRT, input,
48384 GEN_INT (GET_MODE_UNIT_BITSIZE (mode) - 1),
48385 NULL, 0, OPTAB_DIRECT);
48386 tmp1 = expand_simple_binop (mode, XOR, tmp0, input,
48387 NULL, 0, OPTAB_DIRECT);
48388 x = expand_simple_binop (mode, MINUS, tmp1, tmp0,
48389 target, 0, OPTAB_DIRECT);
48390 break;
48391
48392 /* For 16-bit signed integer X, the best way to calculate the absolute
48393 value of X is max (X, -X), as SSE2 provides the PMAXSW insn. */
48394 case V8HImode:
48395 tmp0 = expand_unop (mode, neg_optab, input, NULL_RTX, 0);
48396
48397 x = expand_simple_binop (mode, SMAX, tmp0, input,
48398 target, 0, OPTAB_DIRECT);
48399 break;
48400
48401 /* For 8-bit signed integer X, the best way to calculate the absolute
48402 value of X is min ((unsigned char) X, (unsigned char) (-X)),
48403 as SSE2 provides the PMINUB insn. */
48404 case V16QImode:
48405 tmp0 = expand_unop (mode, neg_optab, input, NULL_RTX, 0);
48406
48407 x = expand_simple_binop (V16QImode, UMIN, tmp0, input,
48408 target, 0, OPTAB_DIRECT);
48409 break;
48410
48411 default:
48412 gcc_unreachable ();
48413 }
48414
48415 if (x != target)
48416 emit_move_insn (target, x);
48417 }
48418
48419 /* Expand an extract from a vector register through pextr insn.
48420 Return true if successful. */
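/* E.g. extracting the 32-bit field at bit position 64 of a TImode value
   held in an SSE register becomes a vec_select of element 2 (pos / size)
   of the value viewed as V4SImode, i.e. a single SSE4.1 pextrd.  */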
48421
48422 bool
48423 ix86_expand_pextr (rtx *operands)
48424 {
48425 rtx dst = operands[0];
48426 rtx src = operands[1];
48427
48428 unsigned int size = INTVAL (operands[2]);
48429 unsigned int pos = INTVAL (operands[3]);
48430
48431 if (SUBREG_P (dst))
48432 {
48433 /* Reject non-lowpart subregs. */
48434 if (SUBREG_BYTE (dst) > 0)
48435 return false;
48436 dst = SUBREG_REG (dst);
48437 }
48438
48439 if (SUBREG_P (src))
48440 {
48441 pos += SUBREG_BYTE (src) * BITS_PER_UNIT;
48442 src = SUBREG_REG (src);
48443 }
48444
48445 switch (GET_MODE (src))
48446 {
48447 case V16QImode:
48448 case V8HImode:
48449 case V4SImode:
48450 case V2DImode:
48451 case V1TImode:
48452 case TImode:
48453 {
48454 machine_mode srcmode, dstmode;
48455 rtx d, pat;
48456
48457 dstmode = mode_for_size (size, MODE_INT, 0);
48458
48459 switch (dstmode)
48460 {
48461 case QImode:
48462 if (!TARGET_SSE4_1)
48463 return false;
48464 srcmode = V16QImode;
48465 break;
48466
48467 case HImode:
48468 if (!TARGET_SSE2)
48469 return false;
48470 srcmode = V8HImode;
48471 break;
48472
48473 case SImode:
48474 if (!TARGET_SSE4_1)
48475 return false;
48476 srcmode = V4SImode;
48477 break;
48478
48479 case DImode:
48480 gcc_assert (TARGET_64BIT);
48481 if (!TARGET_SSE4_1)
48482 return false;
48483 srcmode = V2DImode;
48484 break;
48485
48486 default:
48487 return false;
48488 }
48489
48490 /* Reject extractions from misaligned positions. */
48491 if (pos & (size-1))
48492 return false;
48493
48494 if (GET_MODE (dst) == dstmode)
48495 d = dst;
48496 else
48497 d = gen_reg_rtx (dstmode);
48498
48499 /* Construct insn pattern. */
48500 pat = gen_rtx_PARALLEL (VOIDmode, gen_rtvec (1, GEN_INT (pos / size)));
48501 pat = gen_rtx_VEC_SELECT (dstmode, gen_lowpart (srcmode, src), pat);
48502
48503 /* Let the rtl optimizers know about the zero extension performed. */
48504 if (dstmode == QImode || dstmode == HImode)
48505 {
48506 pat = gen_rtx_ZERO_EXTEND (SImode, pat);
48507 d = gen_lowpart (SImode, d);
48508 }
48509
48510 emit_insn (gen_rtx_SET (d, pat));
48511
48512 if (d != dst)
48513 emit_move_insn (dst, gen_lowpart (GET_MODE (dst), d));
48514 return true;
48515 }
48516
48517 default:
48518 return false;
48519 }
48520 }
48521
48522 /* Expand an insert into a vector register through pinsr insn.
48523 Return true if successful. */
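/* E.g. inserting a 16-bit value at bit position 48 of a V8HImode
   destination uses sse2_pinsrw with selector 1 << (48 / 16), i.e. word
   slot 3.  */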
48524
48525 bool
48526 ix86_expand_pinsr (rtx *operands)
48527 {
48528 rtx dst = operands[0];
48529 rtx src = operands[3];
48530
48531 unsigned int size = INTVAL (operands[1]);
48532 unsigned int pos = INTVAL (operands[2]);
48533
48534 if (SUBREG_P (dst))
48535 {
48536 pos += SUBREG_BYTE (dst) * BITS_PER_UNIT;
48537 dst = SUBREG_REG (dst);
48538 }
48539
48540 switch (GET_MODE (dst))
48541 {
48542 case V16QImode:
48543 case V8HImode:
48544 case V4SImode:
48545 case V2DImode:
48546 case V1TImode:
48547 case TImode:
48548 {
48549 machine_mode srcmode, dstmode;
48550 rtx (*pinsr)(rtx, rtx, rtx, rtx);
48551 rtx d;
48552
48553 srcmode = mode_for_size (size, MODE_INT, 0);
48554
48555 switch (srcmode)
48556 {
48557 case QImode:
48558 if (!TARGET_SSE4_1)
48559 return false;
48560 dstmode = V16QImode;
48561 pinsr = gen_sse4_1_pinsrb;
48562 break;
48563
48564 case HImode:
48565 if (!TARGET_SSE2)
48566 return false;
48567 dstmode = V8HImode;
48568 pinsr = gen_sse2_pinsrw;
48569 break;
48570
48571 case SImode:
48572 if (!TARGET_SSE4_1)
48573 return false;
48574 dstmode = V4SImode;
48575 pinsr = gen_sse4_1_pinsrd;
48576 break;
48577
48578 case DImode:
48579 gcc_assert (TARGET_64BIT);
48580 if (!TARGET_SSE4_1)
48581 return false;
48582 dstmode = V2DImode;
48583 pinsr = gen_sse4_1_pinsrq;
48584 break;
48585
48586 default:
48587 return false;
48588 }
48589
48590 /* Reject insertions to misaligned positions. */
48591 if (pos & (size-1))
48592 return false;
48593
48594 if (SUBREG_P (src))
48595 {
48596 unsigned int srcpos = SUBREG_BYTE (src);
48597
48598 if (srcpos > 0)
48599 {
48600 rtx extr_ops[4];
48601
48602 extr_ops[0] = gen_reg_rtx (srcmode);
48603 extr_ops[1] = gen_lowpart (srcmode, SUBREG_REG (src));
48604 extr_ops[2] = GEN_INT (size);
48605 extr_ops[3] = GEN_INT (srcpos * BITS_PER_UNIT);
48606
48607 if (!ix86_expand_pextr (extr_ops))
48608 return false;
48609
48610 src = extr_ops[0];
48611 }
48612 else
48613 src = gen_lowpart (srcmode, SUBREG_REG (src));
48614 }
48615
48616 if (GET_MODE (dst) == dstmode)
48617 d = dst;
48618 else
48619 d = gen_reg_rtx (dstmode);
48620
48621 emit_insn (pinsr (d, gen_lowpart (dstmode, dst),
48622 gen_lowpart (srcmode, src),
48623 GEN_INT (1 << (pos / size))));
48624 if (d != dst)
48625 emit_move_insn (dst, gen_lowpart (GET_MODE (dst), d));
48626 return true;
48627 }
48628
48629 default:
48630 return false;
48631 }
48632 }
48633 \f
48634 /* This function returns the calling-ABI-specific va_list type node.
48635 It returns the FNDECL-specific va_list type. */
48636
48637 static tree
48638 ix86_fn_abi_va_list (tree fndecl)
48639 {
48640 if (!TARGET_64BIT)
48641 return va_list_type_node;
48642 gcc_assert (fndecl != NULL_TREE);
48643
48644 if (ix86_function_abi ((const_tree) fndecl) == MS_ABI)
48645 return ms_va_list_type_node;
48646 else
48647 return sysv_va_list_type_node;
48648 }
48649
48650 /* Returns the canonical va_list type specified by TYPE. If there
48651 is no valid TYPE provided, it returns NULL_TREE. */
48652
48653 static tree
48654 ix86_canonical_va_list_type (tree type)
48655 {
48656 if (TARGET_64BIT)
48657 {
48658 if (lookup_attribute ("ms_abi va_list", TYPE_ATTRIBUTES (type)))
48659 return ms_va_list_type_node;
48660
48661 if ((TREE_CODE (type) == ARRAY_TYPE
48662 && integer_zerop (array_type_nelts (type)))
48663 || POINTER_TYPE_P (type))
48664 {
48665 tree elem_type = TREE_TYPE (type);
48666 if (TREE_CODE (elem_type) == RECORD_TYPE
48667 && lookup_attribute ("sysv_abi va_list",
48668 TYPE_ATTRIBUTES (elem_type)))
48669 return sysv_va_list_type_node;
48670 }
48671
48672 return NULL_TREE;
48673 }
48674
48675 return std_canonical_va_list_type (type);
48676 }
48677
48678 /* Iterate through the target-specific builtin types for va_list.
48679 IDX denotes the iterator, *PTREE is set to the result type of
48680 the va_list builtin, and *PNAME to its internal type.
48681 Returns zero if there is no element for this index, otherwise
48682 IDX should be increased upon the next call.
48683 Note, do not iterate a base builtin's name like __builtin_va_list.
48684 Used from c_common_nodes_and_builtins. */
48685
48686 static int
48687 ix86_enum_va_list (int idx, const char **pname, tree *ptree)
48688 {
48689 if (TARGET_64BIT)
48690 {
48691 switch (idx)
48692 {
48693 default:
48694 break;
48695
48696 case 0:
48697 *ptree = ms_va_list_type_node;
48698 *pname = "__builtin_ms_va_list";
48699 return 1;
48700
48701 case 1:
48702 *ptree = sysv_va_list_type_node;
48703 *pname = "__builtin_sysv_va_list";
48704 return 1;
48705 }
48706 }
48707
48708 return 0;
48709 }
48710
48711 #undef TARGET_SCHED_DISPATCH
48712 #define TARGET_SCHED_DISPATCH has_dispatch
48713 #undef TARGET_SCHED_DISPATCH_DO
48714 #define TARGET_SCHED_DISPATCH_DO do_dispatch
48715 #undef TARGET_SCHED_REASSOCIATION_WIDTH
48716 #define TARGET_SCHED_REASSOCIATION_WIDTH ix86_reassociation_width
48717 #undef TARGET_SCHED_REORDER
48718 #define TARGET_SCHED_REORDER ix86_sched_reorder
48719 #undef TARGET_SCHED_ADJUST_PRIORITY
48720 #define TARGET_SCHED_ADJUST_PRIORITY ix86_adjust_priority
48721 #undef TARGET_SCHED_DEPENDENCIES_EVALUATION_HOOK
48722 #define TARGET_SCHED_DEPENDENCIES_EVALUATION_HOOK \
48723 ix86_dependencies_evaluation_hook
48724
48725 /* The size of the dispatch window is the total number of bytes of
48726 object code allowed in a window. */
48727 #define DISPATCH_WINDOW_SIZE 16
48728
48729 /* Number of dispatch windows considered for scheduling. */
48730 #define MAX_DISPATCH_WINDOWS 3
48731
48732 /* Maximum number of instructions in a window. */
48733 #define MAX_INSN 4
48734
48735 /* Maximum number of immediate operands in a window. */
48736 #define MAX_IMM 4
48737
48738 /* Maximum number of immediate bits allowed in a window. */
48739 #define MAX_IMM_SIZE 128
48740
48741 /* Maximum number of 32 bit immediates allowed in a window. */
48742 #define MAX_IMM_32 4
48743
48744 /* Maximum number of 64 bit immediates allowed in a window. */
48745 #define MAX_IMM_64 2
48746
48747 /* Maximum total of loads or prefetches allowed in a window. */
48748 #define MAX_LOAD 2
48749
48750 /* Maximum total of stores allowed in a window. */
48751 #define MAX_STORE 1
48752
48753 #undef BIG
48754 #define BIG 100
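/* Worked example of these limits (illustrative, not from the original
   source): two instructions such as

     movabsq $0x1122334455667788, %rax
     movabsq $0x99aabbccddeeff00, %rbx

   already occupy both 64-bit immediate slots of a window (MAX_IMM_64),
   so a third instruction carrying a 64-bit immediate no longer fits the
   same window; all three would also count against the overall MAX_IMM
   limit of 4 immediate operands.  */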
48755
48756
48757 /* Dispatch groups. Instructions that affect the mix in a dispatch window. */
48758 enum dispatch_group {
48759 disp_no_group = 0,
48760 disp_load,
48761 disp_store,
48762 disp_load_store,
48763 disp_prefetch,
48764 disp_imm,
48765 disp_imm_32,
48766 disp_imm_64,
48767 disp_branch,
48768 disp_cmp,
48769 disp_jcc,
48770 disp_last
48771 };
48772
48773 /* Number of allowable groups in a dispatch window. It is an array
48774 indexed by dispatch_group enum. 100 is used as a big number,
48775 because the number of these kinds of operations does not have any
48776 effect in the dispatch window, but we need them for other reasons in
48777 the table. */
48778 static unsigned int num_allowable_groups[disp_last] = {
48779 0, 2, 1, 1, 2, 4, 4, 2, 1, BIG, BIG
48780 };
48781
48782 char group_name[disp_last + 1][16] = {
48783 "disp_no_group", "disp_load", "disp_store", "disp_load_store",
48784 "disp_prefetch", "disp_imm", "disp_imm_32", "disp_imm_64",
48785 "disp_branch", "disp_cmp", "disp_jcc", "disp_last"
48786 };
48787
48788 /* Instruction path. */
48789 enum insn_path {
48790 no_path = 0,
48791 path_single, /* Single micro op. */
48792 path_double, /* Double micro op. */
48793 path_multi, /* Instructions with more than 2 micro ops. */
48794 last_path
48795 };
48796
48797 /* sched_insn_info defines a window to the instructions scheduled in
48798 the basic block. It contains a pointer to the insn_info table and
48799 the instruction scheduled.
48800
48801 Windows are allocated for each basic block and are linked
48802 together. */
48803 typedef struct sched_insn_info_s {
48804 rtx insn;
48805 enum dispatch_group group;
48806 enum insn_path path;
48807 int byte_len;
48808 int imm_bytes;
48809 } sched_insn_info;
48810
48811 /* Linked list of dispatch windows. This is a two-way list of
48812 dispatch windows of a basic block. It contains information about
48813 the number of uops in the window and the total number of
48814 instructions and of bytes in the object code for this dispatch
48815 window. */
48816 typedef struct dispatch_windows_s {
48817 int num_insn; /* Number of insn in the window. */
48818 int num_uops; /* Number of uops in the window. */
48819 int window_size; /* Number of bytes in the window. */
48820 int window_num; /* Window number, either 0 or 1. */
48821 int num_imm; /* Number of immediates in an insn. */
48822 int num_imm_32; /* Number of 32 bit immediates in an insn. */
48823 int num_imm_64; /* Number of 64 bit immediates in an insn. */
48824 int imm_size; /* Total immediates in the window. */
48825 int num_loads; /* Total memory loads in the window. */
48826 int num_stores; /* Total memory stores in the window. */
48827 int violation; /* Violation exists in window. */
48828 sched_insn_info *window; /* Pointer to the window. */
48829 struct dispatch_windows_s *next;
48830 struct dispatch_windows_s *prev;
48831 } dispatch_windows;
48832
48833 /* Immediate values used in an insn. */
48834 typedef struct imm_info_s
48835 {
48836 int imm;
48837 int imm32;
48838 int imm64;
48839 } imm_info;
48840
48841 static dispatch_windows *dispatch_window_list;
48842 static dispatch_windows *dispatch_window_list1;
48843
48844 /* Get dispatch group of insn. */
48845
48846 static enum dispatch_group
48847 get_mem_group (rtx_insn *insn)
48848 {
48849 enum attr_memory memory;
48850
48851 if (INSN_CODE (insn) < 0)
48852 return disp_no_group;
48853 memory = get_attr_memory (insn);
48854 if (memory == MEMORY_STORE)
48855 return disp_store;
48856
48857 if (memory == MEMORY_LOAD)
48858 return disp_load;
48859
48860 if (memory == MEMORY_BOTH)
48861 return disp_load_store;
48862
48863 return disp_no_group;
48864 }
48865
48866 /* Return true if insn is a compare instruction. */
48867
48868 static bool
48869 is_cmp (rtx_insn *insn)
48870 {
48871 enum attr_type type;
48872
48873 type = get_attr_type (insn);
48874 return (type == TYPE_TEST
48875 || type == TYPE_ICMP
48876 || type == TYPE_FCMP
48877 || GET_CODE (PATTERN (insn)) == COMPARE);
48878 }
48879
48880 /* Return true if a dispatch violation was encountered. */
48881
48882 static bool
48883 dispatch_violation (void)
48884 {
48885 if (dispatch_window_list->next)
48886 return dispatch_window_list->next->violation;
48887 return dispatch_window_list->violation;
48888 }
48889
48890 /* Return true if insn is a branch instruction. */
48891
48892 static bool
48893 is_branch (rtx_insn *insn)
48894 {
48895 return (CALL_P (insn) || JUMP_P (insn));
48896 }
48897
48898 /* Return true if insn is a prefetch instruction. */
48899
48900 static bool
48901 is_prefetch (rtx_insn *insn)
48902 {
48903 return NONJUMP_INSN_P (insn) && GET_CODE (PATTERN (insn)) == PREFETCH;
48904 }
48905
48906 /* This function initializes a dispatch window and the list container holding a
48907 pointer to the window. */
48908
48909 static void
48910 init_window (int window_num)
48911 {
48912 int i;
48913 dispatch_windows *new_list;
48914
48915 if (window_num == 0)
48916 new_list = dispatch_window_list;
48917 else
48918 new_list = dispatch_window_list1;
48919
48920 new_list->num_insn = 0;
48921 new_list->num_uops = 0;
48922 new_list->window_size = 0;
48923 new_list->next = NULL;
48924 new_list->prev = NULL;
48925 new_list->window_num = window_num;
48926 new_list->num_imm = 0;
48927 new_list->num_imm_32 = 0;
48928 new_list->num_imm_64 = 0;
48929 new_list->imm_size = 0;
48930 new_list->num_loads = 0;
48931 new_list->num_stores = 0;
48932 new_list->violation = false;
48933
48934 for (i = 0; i < MAX_INSN; i++)
48935 {
48936 new_list->window[i].insn = NULL;
48937 new_list->window[i].group = disp_no_group;
48938 new_list->window[i].path = no_path;
48939 new_list->window[i].byte_len = 0;
48940 new_list->window[i].imm_bytes = 0;
48941 }
48942 return;
48943 }
48944
48945 /* This function allocates and initializes a dispatch window and the
48946 list container holding a pointer to the window. */
48947
48948 static dispatch_windows *
48949 allocate_window (void)
48950 {
48951 dispatch_windows *new_list = XNEW (struct dispatch_windows_s);
48952 new_list->window = XNEWVEC (struct sched_insn_info_s, MAX_INSN + 1);
48953
48954 return new_list;
48955 }
48956
48957 /* This routine initializes the dispatch scheduling information. It
48958 initiates building dispatch scheduler tables and constructs the
48959 first dispatch window. */
48960
48961 static void
48962 init_dispatch_sched (void)
48963 {
48964 /* Allocate a dispatch list and a window. */
48965 dispatch_window_list = allocate_window ();
48966 dispatch_window_list1 = allocate_window ();
48967 init_window (0);
48968 init_window (1);
48969 }
48970
48971 /* This function returns true if a branch is detected. End of a basic block
48972 does not have to be a branch, but here we assume only branches end a
48973 window. */
48974
48975 static bool
48976 is_end_basic_block (enum dispatch_group group)
48977 {
48978 return group == disp_branch;
48979 }
48980
48981 /* This function is called when the end of a window processing is reached. */
48982
48983 static void
48984 process_end_window (void)
48985 {
48986 gcc_assert (dispatch_window_list->num_insn <= MAX_INSN);
48987 if (dispatch_window_list->next)
48988 {
48989 gcc_assert (dispatch_window_list1->num_insn <= MAX_INSN);
48990 gcc_assert (dispatch_window_list->window_size
48991 + dispatch_window_list1->window_size <= 48);
48992 init_window (1);
48993 }
48994 init_window (0);
48995 }
48996
48997 /* Allocates a new dispatch window and adds it to WINDOW_LIST.
48998 WINDOW_NUM is either 0 or 1. A maximum of two windows are generated
48999 for 48 bytes of instructions. Note that these windows are not the
49000 hardware dispatch windows, whose size is DISPATCH_WINDOW_SIZE. */
49001
49002 static dispatch_windows *
49003 allocate_next_window (int window_num)
49004 {
49005 if (window_num == 0)
49006 {
49007 if (dispatch_window_list->next)
49008 init_window (1);
49009 init_window (0);
49010 return dispatch_window_list;
49011 }
49012
49013 dispatch_window_list->next = dispatch_window_list1;
49014 dispatch_window_list1->prev = dispatch_window_list;
49015
49016 return dispatch_window_list1;
49017 }
49018
49019 /* Compute number of immediate operands of an instruction. */
49020
49021 static void
49022 find_constant (rtx in_rtx, imm_info *imm_values)
49023 {
49024 if (INSN_P (in_rtx))
49025 in_rtx = PATTERN (in_rtx);
49026 subrtx_iterator::array_type array;
49027 FOR_EACH_SUBRTX (iter, array, in_rtx, ALL)
49028 if (const_rtx x = *iter)
49029 switch (GET_CODE (x))
49030 {
49031 case CONST:
49032 case SYMBOL_REF:
49033 case CONST_INT:
49034 (imm_values->imm)++;
49035 if (x86_64_immediate_operand (CONST_CAST_RTX (x), SImode))
49036 (imm_values->imm32)++;
49037 else
49038 (imm_values->imm64)++;
49039 break;
49040
49041 case CONST_DOUBLE:
49042 case CONST_WIDE_INT:
49043 (imm_values->imm)++;
49044 (imm_values->imm64)++;
49045 break;
49046
49047 case CODE_LABEL:
49048 if (LABEL_KIND (x) == LABEL_NORMAL)
49049 {
49050 (imm_values->imm)++;
49051 (imm_values->imm32)++;
49052 }
49053 break;
49054
49055 default:
49056 break;
49057 }
49058 }
49059
49060 /* Return total size of immediate operands of an instruction along with number
49061 of corresponding immediate operands. It initializes its parameters to zero
49062 before calling FIND_CONSTANT.
49063 INSN is the input instruction. IMM is the total of immediates.
49064 IMM32 is the number of 32 bit immediates. IMM64 is the number of 64
49065 bit immediates. */
49066
49067 static int
49068 get_num_immediates (rtx_insn *insn, int *imm, int *imm32, int *imm64)
49069 {
49070 imm_info imm_values = {0, 0, 0};
49071
49072 find_constant (insn, &imm_values);
49073 *imm = imm_values.imm;
49074 *imm32 = imm_values.imm32;
49075 *imm64 = imm_values.imm64;
49076 return imm_values.imm32 * 4 + imm_values.imm64 * 8;
49077 }
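/* Worked example (illustrative): for an insn whose pattern contains one
   constant representable as a sign-extended 32-bit value and one that is
   not, FIND_CONSTANT yields imm = 2, imm32 = 1 and imm64 = 1, and the
   function above returns 1 * 4 + 1 * 8 = 12.  */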
49078
49079 /* This function returns true if an operand of the instruction is an
49080 immediate. */
49081
49082 static bool
49083 has_immediate (rtx_insn *insn)
49084 {
49085 int num_imm_operand;
49086 int num_imm32_operand;
49087 int num_imm64_operand;
49088
49089 if (insn)
49090 return get_num_immediates (insn, &num_imm_operand, &num_imm32_operand,
49091 &num_imm64_operand);
49092 return false;
49093 }
49094
49095 /* Return single or double path for instructions. */
49096
49097 static enum insn_path
49098 get_insn_path (rtx_insn *insn)
49099 {
49100 enum attr_amdfam10_decode path = get_attr_amdfam10_decode (insn);
49101
49102 if ((int)path == 0)
49103 return path_single;
49104
49105 if ((int)path == 1)
49106 return path_double;
49107
49108 return path_multi;
49109 }
49110
49111 /* Return insn dispatch group. */
49112
49113 static enum dispatch_group
49114 get_insn_group (rtx_insn *insn)
49115 {
49116 enum dispatch_group group = get_mem_group (insn);
49117 if (group)
49118 return group;
49119
49120 if (is_branch (insn))
49121 return disp_branch;
49122
49123 if (is_cmp (insn))
49124 return disp_cmp;
49125
49126 if (has_immediate (insn))
49127 return disp_imm;
49128
49129 if (is_prefetch (insn))
49130 return disp_prefetch;
49131
49132 return disp_no_group;
49133 }
49134
49135 /* Count number of GROUP restricted instructions in a dispatch
49136 window WINDOW_LIST. */
49137
49138 static int
49139 count_num_restricted (rtx_insn *insn, dispatch_windows *window_list)
49140 {
49141 enum dispatch_group group = get_insn_group (insn);
49142 int imm_size;
49143 int num_imm_operand;
49144 int num_imm32_operand;
49145 int num_imm64_operand;
49146
49147 if (group == disp_no_group)
49148 return 0;
49149
49150 if (group == disp_imm)
49151 {
49152 imm_size = get_num_immediates (insn, &num_imm_operand, &num_imm32_operand,
49153 &num_imm64_operand);
49154 if (window_list->imm_size + imm_size > MAX_IMM_SIZE
49155 || num_imm_operand + window_list->num_imm > MAX_IMM
49156 || (num_imm32_operand > 0
49157 && (window_list->num_imm_32 + num_imm32_operand > MAX_IMM_32
49158 || window_list->num_imm_64 * 2 + num_imm32_operand > MAX_IMM_32))
49159 || (num_imm64_operand > 0
49160 && (window_list->num_imm_64 + num_imm64_operand > MAX_IMM_64
49161 || window_list->num_imm_32 + num_imm64_operand * 2 > MAX_IMM_32))
49162 || (window_list->imm_size + imm_size == MAX_IMM_SIZE
49163 && num_imm64_operand > 0
49164 && ((window_list->num_imm_64 > 0
49165 && window_list->num_insn >= 2)
49166 || window_list->num_insn >= 3)))
49167 return BIG;
49168
49169 return 1;
49170 }
49171
49172 if ((group == disp_load_store
49173 && (window_list->num_loads >= MAX_LOAD
49174 || window_list->num_stores >= MAX_STORE))
49175 || ((group == disp_load
49176 || group == disp_prefetch)
49177 && window_list->num_loads >= MAX_LOAD)
49178 || (group == disp_store
49179 && window_list->num_stores >= MAX_STORE))
49180 return BIG;
49181
49182 return 1;
49183 }
49184
49185 /* This function returns true if insn satisfies dispatch rules on the
49186 last window scheduled. */
49187
49188 static bool
49189 fits_dispatch_window (rtx_insn *insn)
49190 {
49191 dispatch_windows *window_list = dispatch_window_list;
49192 dispatch_windows *window_list_next = dispatch_window_list->next;
49193 unsigned int num_restrict;
49194 enum dispatch_group group = get_insn_group (insn);
49195 enum insn_path path = get_insn_path (insn);
49196 int sum;
49197
49198 /* Make disp_cmp and disp_jcc get scheduled at the latest. These
49199 instructions should be given the lowest priority in the
49200 scheduling process in the Haifa scheduler to make sure they will be
49201 scheduled in the same dispatch window as the reference to them. */
49202 if (group == disp_jcc || group == disp_cmp)
49203 return false;
49204
49205 /* Check nonrestricted. */
49206 if (group == disp_no_group || group == disp_branch)
49207 return true;
49208
49209 /* Get last dispatch window. */
49210 if (window_list_next)
49211 window_list = window_list_next;
49212
49213 if (window_list->window_num == 1)
49214 {
49215 sum = window_list->prev->window_size + window_list->window_size;
49216
49217 if (sum == 32
49218 || (min_insn_size (insn) + sum) >= 48)
49219 /* Window 1 is full. Go for next window. */
49220 return true;
49221 }
49222
49223 num_restrict = count_num_restricted (insn, window_list);
49224
49225 if (num_restrict > num_allowable_groups[group])
49226 return false;
49227
49228 /* See if it fits in the first window. */
49229 if (window_list->window_num == 0)
49230 {
49231 /* The first window should have only single and double path
49232 uops. */
49233 if (path == path_double
49234 && (window_list->num_uops + 2) > MAX_INSN)
49235 return false;
49236 else if (path != path_single)
49237 return false;
49238 }
49239 return true;
49240 }
49241
49242 /* Add an instruction INSN with NUM_UOPS micro-operations to the
49243 dispatch window WINDOW_LIST. */
49244
49245 static void
49246 add_insn_window (rtx_insn *insn, dispatch_windows *window_list, int num_uops)
49247 {
49248 int byte_len = min_insn_size (insn);
49249 int num_insn = window_list->num_insn;
49250 int imm_size;
49251 sched_insn_info *window = window_list->window;
49252 enum dispatch_group group = get_insn_group (insn);
49253 enum insn_path path = get_insn_path (insn);
49254 int num_imm_operand;
49255 int num_imm32_operand;
49256 int num_imm64_operand;
49257
49258 if (!window_list->violation && group != disp_cmp
49259 && !fits_dispatch_window (insn))
49260 window_list->violation = true;
49261
49262 imm_size = get_num_immediates (insn, &num_imm_operand, &num_imm32_operand,
49263 &num_imm64_operand);
49264
49265 /* Initialize window with new instruction. */
49266 window[num_insn].insn = insn;
49267 window[num_insn].byte_len = byte_len;
49268 window[num_insn].group = group;
49269 window[num_insn].path = path;
49270 window[num_insn].imm_bytes = imm_size;
49271
49272 window_list->window_size += byte_len;
49273 window_list->num_insn = num_insn + 1;
49274 window_list->num_uops = window_list->num_uops + num_uops;
49275 window_list->imm_size += imm_size;
49276 window_list->num_imm += num_imm_operand;
49277 window_list->num_imm_32 += num_imm32_operand;
49278 window_list->num_imm_64 += num_imm64_operand;
49279
49280 if (group == disp_store)
49281 window_list->num_stores += 1;
49282 else if (group == disp_load
49283 || group == disp_prefetch)
49284 window_list->num_loads += 1;
49285 else if (group == disp_load_store)
49286 {
49287 window_list->num_stores += 1;
49288 window_list->num_loads += 1;
49289 }
49290 }
49291
49292 /* Adds a scheduled instruction, INSN, to the current dispatch window.
49293 If the total bytes of instructions or the number of instructions in
49294 the window exceed the allowed limits, it allocates a new window.
49295
49296 static void
49297 add_to_dispatch_window (rtx_insn *insn)
49298 {
49299 int byte_len;
49300 dispatch_windows *window_list;
49301 dispatch_windows *next_list;
49302 dispatch_windows *window0_list;
49303 enum insn_path path;
49304 enum dispatch_group insn_group;
49305 bool insn_fits;
49306 int num_insn;
49307 int num_uops;
49308 int window_num;
49309 int insn_num_uops;
49310 int sum;
49311
49312 if (INSN_CODE (insn) < 0)
49313 return;
49314
49315 byte_len = min_insn_size (insn);
49316 window_list = dispatch_window_list;
49317 next_list = window_list->next;
49318 path = get_insn_path (insn);
49319 insn_group = get_insn_group (insn);
49320
49321 /* Get the last dispatch window. */
49322 if (next_list)
49323 window_list = dispatch_window_list->next;
49324
49325 if (path == path_single)
49326 insn_num_uops = 1;
49327 else if (path == path_double)
49328 insn_num_uops = 2;
49329 else
49330 insn_num_uops = (int) path;
49331
49332 /* If current window is full, get a new window.
49333 Window number zero is full if MAX_INSN uops are scheduled in it.
49334 Window number one is full if window zero's bytes plus window
49335 one's bytes equal 32, or if adding the bytes of the new instruction
49336 would make the total 48 or more, or if it already has MAX_INSN
49337 instructions in it.
49338 num_insn = window_list->num_insn;
49339 num_uops = window_list->num_uops;
49340 window_num = window_list->window_num;
49341 insn_fits = fits_dispatch_window (insn);
49342
49343 if (num_insn >= MAX_INSN
49344 || num_uops + insn_num_uops > MAX_INSN
49345 || !(insn_fits))
49346 {
49347 window_num = ~window_num & 1;
49348 window_list = allocate_next_window (window_num);
49349 }
49350
49351 if (window_num == 0)
49352 {
49353 add_insn_window (insn, window_list, insn_num_uops);
49354 if (window_list->num_insn >= MAX_INSN
49355 && insn_group == disp_branch)
49356 {
49357 process_end_window ();
49358 return;
49359 }
49360 }
49361 else if (window_num == 1)
49362 {
49363 window0_list = window_list->prev;
49364 sum = window0_list->window_size + window_list->window_size;
49365 if (sum == 32
49366 || (byte_len + sum) >= 48)
49367 {
49368 process_end_window ();
49369 window_list = dispatch_window_list;
49370 }
49371
49372 add_insn_window (insn, window_list, insn_num_uops);
49373 }
49374 else
49375 gcc_unreachable ();
49376
49377 if (is_end_basic_block (insn_group))
49378 {
49379 /* End of basic block is reached; do end-of-basic-block processing. */
49380 process_end_window ();
49381 return;
49382 }
49383 }
49384
49385 /* Print the dispatch window, WINDOW_NUM, to FILE. */
49386
49387 DEBUG_FUNCTION static void
49388 debug_dispatch_window_file (FILE *file, int window_num)
49389 {
49390 dispatch_windows *list;
49391 int i;
49392
49393 if (window_num == 0)
49394 list = dispatch_window_list;
49395 else
49396 list = dispatch_window_list1;
49397
49398 fprintf (file, "Window #%d:\n", list->window_num);
49399 fprintf (file, " num_insn = %d, num_uops = %d, window_size = %d\n",
49400 list->num_insn, list->num_uops, list->window_size);
49401 fprintf (file, " num_imm = %d, num_imm_32 = %d, num_imm_64 = %d, imm_size = %d\n",
49402 list->num_imm, list->num_imm_32, list->num_imm_64, list->imm_size);
49403
49404 fprintf (file, " num_loads = %d, num_stores = %d\n", list->num_loads,
49405 list->num_stores);
49406 fprintf (file, " insn info:\n");
49407
49408 for (i = 0; i < MAX_INSN; i++)
49409 {
49410 if (!list->window[i].insn)
49411 break;
49412 fprintf (file, " group[%d] = %s, insn[%d] = %p, path[%d] = %d byte_len[%d] = %d, imm_bytes[%d] = %d\n",
49413 i, group_name[list->window[i].group],
49414 i, (void *)list->window[i].insn,
49415 i, list->window[i].path,
49416 i, list->window[i].byte_len,
49417 i, list->window[i].imm_bytes);
49418 }
49419 }
49420
49421 /* Print to stdout a dispatch window. */
49422
49423 DEBUG_FUNCTION void
49424 debug_dispatch_window (int window_num)
49425 {
49426 debug_dispatch_window_file (stdout, window_num);
49427 }
49428
49429 /* Print INSN dispatch information to FILE. */
49430
49431 DEBUG_FUNCTION static void
49432 debug_insn_dispatch_info_file (FILE *file, rtx_insn *insn)
49433 {
49434 int byte_len;
49435 enum insn_path path;
49436 enum dispatch_group group;
49437 int imm_size;
49438 int num_imm_operand;
49439 int num_imm32_operand;
49440 int num_imm64_operand;
49441
49442 if (INSN_CODE (insn) < 0)
49443 return;
49444
49445 byte_len = min_insn_size (insn);
49446 path = get_insn_path (insn);
49447 group = get_insn_group (insn);
49448 imm_size = get_num_immediates (insn, &num_imm_operand, &num_imm32_operand,
49449 &num_imm64_operand);
49450
49451 fprintf (file, " insn info:\n");
49452 fprintf (file, " group = %s, path = %d, byte_len = %d\n",
49453 group_name[group], path, byte_len);
49454 fprintf (file, " num_imm = %d, num_imm_32 = %d, num_imm_64 = %d, imm_size = %d\n",
49455 num_imm_operand, num_imm32_operand, num_imm64_operand, imm_size);
49456 }
49457
49458 /* Print to STDERR the status of the ready list with respect to
49459 dispatch windows. */
49460
49461 DEBUG_FUNCTION void
49462 debug_ready_dispatch (void)
49463 {
49464 int i;
49465 int no_ready = number_in_ready ();
49466
49467 fprintf (stdout, "Number of ready: %d\n", no_ready);
49468
49469 for (i = 0; i < no_ready; i++)
49470 debug_insn_dispatch_info_file (stdout, get_ready_element (i));
49471 }
49472
49473 /* This routine is the driver of the dispatch scheduler. */
49474
49475 static void
49476 do_dispatch (rtx_insn *insn, int mode)
49477 {
49478 if (mode == DISPATCH_INIT)
49479 init_dispatch_sched ();
49480 else if (mode == ADD_TO_DISPATCH_WINDOW)
49481 add_to_dispatch_window (insn);
49482 }
49483
49484 /* Return TRUE if Dispatch Scheduling is supported. */
49485
49486 static bool
49487 has_dispatch (rtx_insn *insn, int action)
49488 {
49489 if ((TARGET_BDVER1 || TARGET_BDVER2 || TARGET_BDVER3
49490 || TARGET_BDVER4 || TARGET_ZNVER1) && flag_dispatch_scheduler)
49491 switch (action)
49492 {
49493 default:
49494 return false;
49495
49496 case IS_DISPATCH_ON:
49497 return true;
49498
49499 case IS_CMP:
49500 return is_cmp (insn);
49501
49502 case DISPATCH_VIOLATION:
49503 return dispatch_violation ();
49504
49505 case FITS_DISPATCH_WINDOW:
49506 return fits_dispatch_window (insn);
49507 }
49508
49509 return false;
49510 }
49511
49512 /* Implementation of reassociation_width target hook used by
49513 reassoc phase to identify parallelism level in reassociated
49514 tree. Statements tree_code is passed in OPC. Arguments type
49515 is passed in MODE.
49516
49517 Currently parallel reassociation is enabled for Atom
49518 processors only and we set reassociation width to be 2
49519 because Atom may issue up to 2 instructions per cycle.
49520
49521 Return value should be fixed if parallel reassociation is
49522 enabled for other processors. */
49523
49524 static int
49525 ix86_reassociation_width (unsigned int, machine_mode mode)
49526 {
49527 /* Vector part. */
49528 if (VECTOR_MODE_P (mode))
49529 {
49530 if (TARGET_VECTOR_PARALLEL_EXECUTION)
49531 return 2;
49532 else
49533 return 1;
49534 }
49535
49536 /* Scalar part. */
49537 if (INTEGRAL_MODE_P (mode) && TARGET_REASSOC_INT_TO_PARALLEL)
49538 return 2;
49539 else if (FLOAT_MODE_P (mode) && TARGET_REASSOC_FP_TO_PARALLEL)
49540 return ((TARGET_64BIT && ix86_tune == PROCESSOR_HASWELL) ? 4 : 2);
49541 else
49542 return 1;
49543 }
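/* Illustrative example (not from the original source): with a
   reassociation width of 2, a chain such as a + b + c + d can be
   rewritten by the reassoc pass as (a + b) + (c + d), exposing two
   independent additions per step; with the width of 4 returned above for
   64-bit Haswell FP code, up to four parallel chains are formed.  */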
49544
49545 /* ??? No autovectorization into MMX or 3DNOW until we can reliably
49546 place emms and femms instructions. */
49547
49548 static machine_mode
49549 ix86_preferred_simd_mode (machine_mode mode)
49550 {
49551 if (!TARGET_SSE)
49552 return word_mode;
49553
49554 switch (mode)
49555 {
49556 case QImode:
49557 return TARGET_AVX512BW ? V64QImode :
49558 (TARGET_AVX && !TARGET_PREFER_AVX128) ? V32QImode : V16QImode;
49559 case HImode:
49560 return TARGET_AVX512BW ? V32HImode :
49561 (TARGET_AVX && !TARGET_PREFER_AVX128) ? V16HImode : V8HImode;
49562 case SImode:
49563 return TARGET_AVX512F ? V16SImode :
49564 (TARGET_AVX && !TARGET_PREFER_AVX128) ? V8SImode : V4SImode;
49565 case DImode:
49566 return TARGET_AVX512F ? V8DImode :
49567 (TARGET_AVX && !TARGET_PREFER_AVX128) ? V4DImode : V2DImode;
49568
49569 case SFmode:
49570 if (TARGET_AVX512F)
49571 return V16SFmode;
49572 else if (TARGET_AVX && !TARGET_PREFER_AVX128)
49573 return V8SFmode;
49574 else
49575 return V4SFmode;
49576
49577 case DFmode:
49578 if (TARGET_AVX512F)
49579 return V8DFmode;
49580 else if (TARGET_AVX && !TARGET_PREFER_AVX128)
49581 return V4DFmode;
49582 else if (TARGET_SSE2)
49583 return V2DFmode;
49584 /* FALLTHRU */
49585
49586 default:
49587 return word_mode;
49588 }
49589 }
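/* Illustrative examples of the mapping above (assuming default tuning,
   i.e. TARGET_PREFER_AVX128 not set): with -mavx2, SImode vectorizes as
   V8SImode and DFmode as V4DFmode; with plain -msse2 the same modes map
   to V4SImode and V2DFmode; with -mavx512f they map to V16SImode and
   V8DFmode.  */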
49590
49591 /* If AVX is enabled then try vectorizing with both 256bit and 128bit
49592 vectors. If AVX512F is enabled then try vectorizing with 512bit,
49593 256bit and 128bit vectors. */
49594
49595 static unsigned int
49596 ix86_autovectorize_vector_sizes (void)
49597 {
49598 return TARGET_AVX512F ? 64 | 32 | 16 :
49599 (TARGET_AVX && !TARGET_PREFER_AVX128) ? 32 | 16 : 0;
49600 }
49601
49602 /* Implementation of targetm.vectorize.get_mask_mode. */
49603
49604 static machine_mode
49605 ix86_get_mask_mode (unsigned nunits, unsigned vector_size)
49606 {
49607 unsigned elem_size = vector_size / nunits;
49608
49609 /* Scalar mask case. */
49610 if ((TARGET_AVX512F && vector_size == 64)
49611 || (TARGET_AVX512VL && (vector_size == 32 || vector_size == 16)))
49612 {
49613 if (elem_size == 4 || elem_size == 8 || TARGET_AVX512BW)
49614 return smallest_mode_for_size (nunits, MODE_INT);
49615 }
49616
49617 machine_mode elem_mode
49618 = smallest_mode_for_size (elem_size * BITS_PER_UNIT, MODE_INT);
49619
49620 gcc_assert (elem_size * nunits == vector_size);
49621
49622 return mode_for_vector (elem_mode, nunits);
49623 }
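/* Worked example (illustrative): for a 64-byte vector of 16 elements,
   elem_size is 64 / 16 = 4, so with AVX-512F the scalar-mask branch
   above returns the smallest integer mode of at least 16 bits (HImode).
   For a 32-byte vector of 8 SImode elements without AVX-512VL the
   function falls through and returns V8SImode as a vector mask mode.  */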
49624
49625 \f
49626
49627 /* Return class of registers which could be used for pseudo of MODE
49628 and of class RCLASS for spilling instead of memory. Return NO_REGS
49629 if it is not possible or non-profitable. */
49630
49631 /* Disabled due to PRs 70902, 71453, 71555, 71596 and 71657. */
49632
49633 static reg_class_t
49634 ix86_spill_class (reg_class_t rclass, machine_mode mode)
49635 {
49636 if (0 && TARGET_GENERAL_REGS_SSE_SPILL
49637 && TARGET_SSE2
49638 && TARGET_INTER_UNIT_MOVES_TO_VEC
49639 && TARGET_INTER_UNIT_MOVES_FROM_VEC
49640 && (mode == SImode || (TARGET_64BIT && mode == DImode))
49641 && INTEGER_CLASS_P (rclass))
49642 return ALL_SSE_REGS;
49643 return NO_REGS;
49644 }
49645
49646 /* Implement targetm.vectorize.init_cost. */
49647
49648 static void *
49649 ix86_init_cost (struct loop *)
49650 {
49651 unsigned *cost = XNEWVEC (unsigned, 3);
49652 cost[vect_prologue] = cost[vect_body] = cost[vect_epilogue] = 0;
49653 return cost;
49654 }
49655
49656 /* Implement targetm.vectorize.add_stmt_cost. */
49657
49658 static unsigned
49659 ix86_add_stmt_cost (void *data, int count, enum vect_cost_for_stmt kind,
49660 struct _stmt_vec_info *stmt_info, int misalign,
49661 enum vect_cost_model_location where)
49662 {
49663 unsigned *cost = (unsigned *) data;
49664 unsigned retval = 0;
49665
49666 tree vectype = stmt_info ? stmt_vectype (stmt_info) : NULL_TREE;
49667 int stmt_cost = ix86_builtin_vectorization_cost (kind, vectype, misalign);
49668
49669 /* Penalize DFmode vector operations for Bonnell. */
49670 if (TARGET_BONNELL && kind == vector_stmt
49671 && vectype && GET_MODE_INNER (TYPE_MODE (vectype)) == DFmode)
49672 stmt_cost *= 5; /* FIXME: The value here is arbitrary. */
49673
49674 /* Statements in an inner loop relative to the loop being
49675 vectorized are weighted more heavily. The value here is
49676 arbitrary and could potentially be improved with analysis. */
49677 if (where == vect_body && stmt_info && stmt_in_inner_loop_p (stmt_info))
49678 count *= 50; /* FIXME. */
49679
49680 retval = (unsigned) (count * stmt_cost);
49681
49682 /* We need to multiply all vector stmt cost by 1.7 (estimated cost)
49683 for Silvermont as it has an out-of-order integer pipeline and can execute
49684 2 scalar instructions per tick, but has an in-order SIMD pipeline. */
49685 if ((TARGET_SILVERMONT || TARGET_INTEL)
49686 && stmt_info && stmt_info->stmt)
49687 {
49688 tree lhs_op = gimple_get_lhs (stmt_info->stmt);
49689 if (lhs_op && TREE_CODE (TREE_TYPE (lhs_op)) == INTEGER_TYPE)
49690 retval = (retval * 17) / 10;
49691 }
49692
49693 cost[where] += retval;
49694
49695 return retval;
49696 }
49697
49698 /* Implement targetm.vectorize.finish_cost. */
49699
49700 static void
49701 ix86_finish_cost (void *data, unsigned *prologue_cost,
49702 unsigned *body_cost, unsigned *epilogue_cost)
49703 {
49704 unsigned *cost = (unsigned *) data;
49705 *prologue_cost = cost[vect_prologue];
49706 *body_cost = cost[vect_body];
49707 *epilogue_cost = cost[vect_epilogue];
49708 }
49709
49710 /* Implement targetm.vectorize.destroy_cost_data. */
49711
49712 static void
49713 ix86_destroy_cost_data (void *data)
49714 {
49715 free (data);
49716 }
49717
49718 /* Validate target specific memory model bits in VAL. */
49719
49720 static unsigned HOST_WIDE_INT
49721 ix86_memmodel_check (unsigned HOST_WIDE_INT val)
49722 {
49723 enum memmodel model = memmodel_from_int (val);
49724 bool strong;
49725
49726 if (val & ~(unsigned HOST_WIDE_INT)(IX86_HLE_ACQUIRE|IX86_HLE_RELEASE
49727 |MEMMODEL_MASK)
49728 || ((val & IX86_HLE_ACQUIRE) && (val & IX86_HLE_RELEASE)))
49729 {
49730 warning (OPT_Winvalid_memory_model,
49731 "Unknown architecture specific memory model");
49732 return MEMMODEL_SEQ_CST;
49733 }
49734 strong = (is_mm_acq_rel (model) || is_mm_seq_cst (model));
49735 if (val & IX86_HLE_ACQUIRE && !(is_mm_acquire (model) || strong))
49736 {
49737 warning (OPT_Winvalid_memory_model,
49738 "HLE_ACQUIRE not used with ACQUIRE or stronger memory model");
49739 return MEMMODEL_SEQ_CST | IX86_HLE_ACQUIRE;
49740 }
49741 if (val & IX86_HLE_RELEASE && !(is_mm_release (model) || strong))
49742 {
49743 warning (OPT_Winvalid_memory_model,
49744 "HLE_RELEASE not used with RELEASE or stronger memory model");
49745 return MEMMODEL_SEQ_CST | IX86_HLE_RELEASE;
49746 }
49747 return val;
49748 }
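/* Illustrative use of the HLE bits validated above (assumes the
   __ATOMIC_HLE_ACQUIRE / __ATOMIC_HLE_RELEASE macros predefined for
   x86 targets):

     while (__atomic_exchange_n (&lock, 1,
                                 __ATOMIC_ACQUIRE | __ATOMIC_HLE_ACQUIRE))
       ;                         // elided lock acquire
     ...
     __atomic_store_n (&lock, 0,
                       __ATOMIC_RELEASE | __ATOMIC_HLE_RELEASE);

   Combining IX86_HLE_ACQUIRE with a model weaker than ACQUIRE (or
   IX86_HLE_RELEASE with one weaker than RELEASE) triggers the warnings
   above and a SEQ_CST model is substituted.  */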
49749
49750 /* Set CLONEI->vecsize_mangle, CLONEI->mask_mode, CLONEI->vecsize_int,
49751 CLONEI->vecsize_float and if CLONEI->simdlen is 0, also
49752 CLONEI->simdlen. Return 0 if SIMD clones shouldn't be emitted,
49753 or number of vecsize_mangle variants that should be emitted. */
49754
49755 static int
49756 ix86_simd_clone_compute_vecsize_and_simdlen (struct cgraph_node *node,
49757 struct cgraph_simd_clone *clonei,
49758 tree base_type, int num)
49759 {
49760 int ret = 1;
49761
49762 if (clonei->simdlen
49763 && (clonei->simdlen < 2
49764 || clonei->simdlen > 1024
49765 || (clonei->simdlen & (clonei->simdlen - 1)) != 0))
49766 {
49767 warning_at (DECL_SOURCE_LOCATION (node->decl), 0,
49768 "unsupported simdlen %d", clonei->simdlen);
49769 return 0;
49770 }
49771
49772 tree ret_type = TREE_TYPE (TREE_TYPE (node->decl));
49773 if (TREE_CODE (ret_type) != VOID_TYPE)
49774 switch (TYPE_MODE (ret_type))
49775 {
49776 case QImode:
49777 case HImode:
49778 case SImode:
49779 case DImode:
49780 case SFmode:
49781 case DFmode:
49782 /* case SCmode: */
49783 /* case DCmode: */
49784 break;
49785 default:
49786 warning_at (DECL_SOURCE_LOCATION (node->decl), 0,
49787 "unsupported return type %qT for simd\n", ret_type);
49788 return 0;
49789 }
49790
49791 tree t;
49792 int i;
49793
49794 for (t = DECL_ARGUMENTS (node->decl), i = 0; t; t = DECL_CHAIN (t), i++)
49795 /* FIXME: Shouldn't we allow such arguments if they are uniform? */
49796 switch (TYPE_MODE (TREE_TYPE (t)))
49797 {
49798 case QImode:
49799 case HImode:
49800 case SImode:
49801 case DImode:
49802 case SFmode:
49803 case DFmode:
49804 /* case SCmode: */
49805 /* case DCmode: */
49806 break;
49807 default:
49808 warning_at (DECL_SOURCE_LOCATION (node->decl), 0,
49809 "unsupported argument type %qT for simd\n", TREE_TYPE (t));
49810 return 0;
49811 }
49812
49813 if (clonei->cilk_elemental)
49814 {
49815 /* Parse here processor clause. If not present, default to 'b'. */
49816 clonei->vecsize_mangle = 'b';
49817 }
49818 else if (!TREE_PUBLIC (node->decl))
49819 {
49820 /* If the function isn't exported, we can pick up just one ISA
49821 for the clones. */
49822 if (TARGET_AVX512F)
49823 clonei->vecsize_mangle = 'e';
49824 else if (TARGET_AVX2)
49825 clonei->vecsize_mangle = 'd';
49826 else if (TARGET_AVX)
49827 clonei->vecsize_mangle = 'c';
49828 else
49829 clonei->vecsize_mangle = 'b';
49830 ret = 1;
49831 }
49832 else
49833 {
49834 clonei->vecsize_mangle = "bcde"[num];
49835 ret = 4;
49836 }
49837 clonei->mask_mode = VOIDmode;
49838 switch (clonei->vecsize_mangle)
49839 {
49840 case 'b':
49841 clonei->vecsize_int = 128;
49842 clonei->vecsize_float = 128;
49843 break;
49844 case 'c':
49845 clonei->vecsize_int = 128;
49846 clonei->vecsize_float = 256;
49847 break;
49848 case 'd':
49849 clonei->vecsize_int = 256;
49850 clonei->vecsize_float = 256;
49851 break;
49852 case 'e':
49853 clonei->vecsize_int = 512;
49854 clonei->vecsize_float = 512;
49855 if (TYPE_MODE (base_type) == QImode)
49856 clonei->mask_mode = DImode;
49857 else
49858 clonei->mask_mode = SImode;
49859 break;
49860 }
49861 if (clonei->simdlen == 0)
49862 {
49863 if (SCALAR_INT_MODE_P (TYPE_MODE (base_type)))
49864 clonei->simdlen = clonei->vecsize_int;
49865 else
49866 clonei->simdlen = clonei->vecsize_float;
49867 clonei->simdlen /= GET_MODE_BITSIZE (TYPE_MODE (base_type));
49868 }
49869 else if (clonei->simdlen > 16)
49870 {
49871 /* For compatibility with ICC, use the same upper bounds
49872 for simdlen. In particular, for CTYPE below, use the return type,
49873 unless the function returns void, in that case use the characteristic
49874 type. If it is possible for given SIMDLEN to pass CTYPE value
49875 in registers (8 [XYZ]MM* regs for 32-bit code, 16 [XYZ]MM* regs
49876 for 64-bit code), accept that SIMDLEN, otherwise warn and don't
49877 emit corresponding clone. */
49878 tree ctype = ret_type;
49879 if (TREE_CODE (ret_type) == VOID_TYPE)
49880 ctype = base_type;
49881 int cnt = GET_MODE_BITSIZE (TYPE_MODE (ctype)) * clonei->simdlen;
49882 if (SCALAR_INT_MODE_P (TYPE_MODE (ctype)))
49883 cnt /= clonei->vecsize_int;
49884 else
49885 cnt /= clonei->vecsize_float;
49886 if (cnt > (TARGET_64BIT ? 16 : 8))
49887 {
49888 warning_at (DECL_SOURCE_LOCATION (node->decl), 0,
49889 "unsupported simdlen %d", clonei->simdlen);
49890 return 0;
49891 }
49892 }
49893 return ret;
49894 }
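/* Illustrative example (not from the original source): for an exported
   function declared as

     #pragma omp declare simd
     int f (int x);

   the hook above requests ret = 4 clones, one per vecsize_mangle letter
   'b', 'c', 'd' and 'e' (SSE2, AVX, AVX2 and AVX-512F variants), with
   simdlen derived as vecsize_int / 32 for the int characteristic type.  */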
49895
49896 /* Add target attribute to SIMD clone NODE if needed. */
49897
49898 static void
49899 ix86_simd_clone_adjust (struct cgraph_node *node)
49900 {
49901 const char *str = NULL;
49902 gcc_assert (node->decl == cfun->decl);
49903 switch (node->simdclone->vecsize_mangle)
49904 {
49905 case 'b':
49906 if (!TARGET_SSE2)
49907 str = "sse2";
49908 break;
49909 case 'c':
49910 if (!TARGET_AVX)
49911 str = "avx";
49912 break;
49913 case 'd':
49914 if (!TARGET_AVX2)
49915 str = "avx2";
49916 break;
49917 case 'e':
49918 if (!TARGET_AVX512F)
49919 str = "avx512f";
49920 break;
49921 default:
49922 gcc_unreachable ();
49923 }
49924 if (str == NULL)
49925 return;
49926 push_cfun (NULL);
49927 tree args = build_tree_list (NULL_TREE, build_string (strlen (str), str));
49928 bool ok = ix86_valid_target_attribute_p (node->decl, NULL, args, 0);
49929 gcc_assert (ok);
49930 pop_cfun ();
49931 ix86_reset_previous_fndecl ();
49932 ix86_set_current_function (node->decl);
49933 }
49934
49935 /* If SIMD clone NODE can't be used in a vectorized loop
49936 in the current function, return -1, otherwise return the badness of using it
49937 (0 if it is most desirable from vecsize_mangle point of view, 1
49938 slightly less desirable, etc.). */
49939
49940 static int
49941 ix86_simd_clone_usable (struct cgraph_node *node)
49942 {
49943 switch (node->simdclone->vecsize_mangle)
49944 {
49945 case 'b':
49946 if (!TARGET_SSE2)
49947 return -1;
49948 if (!TARGET_AVX)
49949 return 0;
49950 return TARGET_AVX2 ? 2 : 1;
49951 case 'c':
49952 if (!TARGET_AVX)
49953 return -1;
49954 return TARGET_AVX2 ? 1 : 0;
49955 case 'd':
49956 if (!TARGET_AVX2)
49957 return -1;
49958 return 0;
49959 case 'e':
49960 if (!TARGET_AVX512F)
49961 return -1;
49962 return 0;
49963 default:
49964 gcc_unreachable ();
49965 }
49966 }
49967
49968 /* This function adjusts the unroll factor based on
49969 the hardware capabilities. For example, bdver3 has
49970 a loop buffer which makes unrolling of smaller
49971 loops less important. This function decides the
49972 unroll factor using the number of memory references
49973 (the value 32 is used) as a heuristic. */
49974
49975 static unsigned
49976 ix86_loop_unroll_adjust (unsigned nunroll, struct loop *loop)
49977 {
49978 basic_block *bbs;
49979 rtx_insn *insn;
49980 unsigned i;
49981 unsigned mem_count = 0;
49982
49983 if (!TARGET_ADJUST_UNROLL)
49984 return nunroll;
49985
49986 /* Count the number of memory references within the loop body.
49987 This value determines the unrolling factor for bdver3 and bdver4
49988 architectures. */
49989 subrtx_iterator::array_type array;
49990 bbs = get_loop_body (loop);
49991 for (i = 0; i < loop->num_nodes; i++)
49992 FOR_BB_INSNS (bbs[i], insn)
49993 if (NONDEBUG_INSN_P (insn))
49994 FOR_EACH_SUBRTX (iter, array, PATTERN (insn), NONCONST)
49995 if (const_rtx x = *iter)
49996 if (MEM_P (x))
49997 {
49998 machine_mode mode = GET_MODE (x);
49999 unsigned int n_words = GET_MODE_SIZE (mode) / UNITS_PER_WORD;
50000 if (n_words > 4)
50001 mem_count += 2;
50002 else
50003 mem_count += 1;
50004 }
50005 free (bbs);
50006
50007 if (mem_count && mem_count <= 32)
50008 return 32 / mem_count;
50009
50010 return nunroll;
50011 }
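/* Worked example (illustrative): a loop body containing 8 word-sized
   memory references gives mem_count = 8, so on bdver3/bdver4
   (TARGET_ADJUST_UNROLL) the function above returns 32 / 8 = 4 as the
   unroll factor; with more than 32 references the caller's NUNROLL is
   returned unchanged.  */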
50012
50013
50014 /* Implement TARGET_FLOAT_EXCEPTIONS_ROUNDING_SUPPORTED_P. */
50015
50016 static bool
50017 ix86_float_exceptions_rounding_supported_p (void)
50018 {
50019 /* For x87 floating point with standard excess precision handling,
50020 there is no adddf3 pattern (since x87 floating point only has
50021 XFmode operations) so the default hook implementation gets this
50022 wrong. */
50023 return TARGET_80387 || TARGET_SSE_MATH;
50024 }
50025
50026 /* Implement TARGET_ATOMIC_ASSIGN_EXPAND_FENV. */
50027
50028 static void
50029 ix86_atomic_assign_expand_fenv (tree *hold, tree *clear, tree *update)
50030 {
50031 if (!TARGET_80387 && !TARGET_SSE_MATH)
50032 return;
50033 tree exceptions_var = create_tmp_var_raw (integer_type_node);
50034 if (TARGET_80387)
50035 {
50036 tree fenv_index_type = build_index_type (size_int (6));
50037 tree fenv_type = build_array_type (unsigned_type_node, fenv_index_type);
50038 tree fenv_var = create_tmp_var_raw (fenv_type);
50039 TREE_ADDRESSABLE (fenv_var) = 1;
50040 tree fenv_ptr = build_pointer_type (fenv_type);
50041 tree fenv_addr = build1 (ADDR_EXPR, fenv_ptr, fenv_var);
50042 fenv_addr = fold_convert (ptr_type_node, fenv_addr);
50043 tree fnstenv = ix86_builtins[IX86_BUILTIN_FNSTENV];
50044 tree fldenv = ix86_builtins[IX86_BUILTIN_FLDENV];
50045 tree fnstsw = ix86_builtins[IX86_BUILTIN_FNSTSW];
50046 tree fnclex = ix86_builtins[IX86_BUILTIN_FNCLEX];
50047 tree hold_fnstenv = build_call_expr (fnstenv, 1, fenv_addr);
50048 tree hold_fnclex = build_call_expr (fnclex, 0);
50049 fenv_var = build4 (TARGET_EXPR, fenv_type, fenv_var, hold_fnstenv,
50050 NULL_TREE, NULL_TREE);
50051 *hold = build2 (COMPOUND_EXPR, void_type_node, fenv_var,
50052 hold_fnclex);
50053 *clear = build_call_expr (fnclex, 0);
50054 tree sw_var = create_tmp_var_raw (short_unsigned_type_node);
50055 tree fnstsw_call = build_call_expr (fnstsw, 0);
50056 tree sw_mod = build2 (MODIFY_EXPR, short_unsigned_type_node,
50057 sw_var, fnstsw_call);
50058 tree exceptions_x87 = fold_convert (integer_type_node, sw_var);
50059 tree update_mod = build2 (MODIFY_EXPR, integer_type_node,
50060 exceptions_var, exceptions_x87);
50061 *update = build2 (COMPOUND_EXPR, integer_type_node,
50062 sw_mod, update_mod);
50063 tree update_fldenv = build_call_expr (fldenv, 1, fenv_addr);
50064 *update = build2 (COMPOUND_EXPR, void_type_node, *update, update_fldenv);
50065 }
50066 if (TARGET_SSE_MATH)
50067 {
50068 tree mxcsr_orig_var = create_tmp_var_raw (unsigned_type_node);
50069 tree mxcsr_mod_var = create_tmp_var_raw (unsigned_type_node);
50070 tree stmxcsr = ix86_builtins[IX86_BUILTIN_STMXCSR];
50071 tree ldmxcsr = ix86_builtins[IX86_BUILTIN_LDMXCSR];
50072 tree stmxcsr_hold_call = build_call_expr (stmxcsr, 0);
50073 tree hold_assign_orig = build2 (MODIFY_EXPR, unsigned_type_node,
50074 mxcsr_orig_var, stmxcsr_hold_call);
50075 tree hold_mod_val = build2 (BIT_IOR_EXPR, unsigned_type_node,
50076 mxcsr_orig_var,
50077 build_int_cst (unsigned_type_node, 0x1f80));
50078 hold_mod_val = build2 (BIT_AND_EXPR, unsigned_type_node, hold_mod_val,
50079 build_int_cst (unsigned_type_node, 0xffffffc0));
50080 tree hold_assign_mod = build2 (MODIFY_EXPR, unsigned_type_node,
50081 mxcsr_mod_var, hold_mod_val);
50082 tree ldmxcsr_hold_call = build_call_expr (ldmxcsr, 1, mxcsr_mod_var);
50083 tree hold_all = build2 (COMPOUND_EXPR, unsigned_type_node,
50084 hold_assign_orig, hold_assign_mod);
50085 hold_all = build2 (COMPOUND_EXPR, void_type_node, hold_all,
50086 ldmxcsr_hold_call);
50087 if (*hold)
50088 *hold = build2 (COMPOUND_EXPR, void_type_node, *hold, hold_all);
50089 else
50090 *hold = hold_all;
50091 tree ldmxcsr_clear_call = build_call_expr (ldmxcsr, 1, mxcsr_mod_var);
50092 if (*clear)
50093 *clear = build2 (COMPOUND_EXPR, void_type_node, *clear,
50094 ldmxcsr_clear_call);
50095 else
50096 *clear = ldmxcsr_clear_call;
50097 tree stmxcsr_update_call = build_call_expr (stmxcsr, 0);
50098 tree exceptions_sse = fold_convert (integer_type_node,
50099 stmxcsr_update_call);
50100 if (*update)
50101 {
50102 tree exceptions_mod = build2 (BIT_IOR_EXPR, integer_type_node,
50103 exceptions_var, exceptions_sse);
50104 tree exceptions_assign = build2 (MODIFY_EXPR, integer_type_node,
50105 exceptions_var, exceptions_mod);
50106 *update = build2 (COMPOUND_EXPR, integer_type_node, *update,
50107 exceptions_assign);
50108 }
50109 else
50110 *update = build2 (MODIFY_EXPR, integer_type_node,
50111 exceptions_var, exceptions_sse);
50112 tree ldmxcsr_update_call = build_call_expr (ldmxcsr, 1, mxcsr_orig_var);
50113 *update = build2 (COMPOUND_EXPR, void_type_node, *update,
50114 ldmxcsr_update_call);
50115 }
50116 tree atomic_feraiseexcept
50117 = builtin_decl_implicit (BUILT_IN_ATOMIC_FERAISEEXCEPT);
50118 tree atomic_feraiseexcept_call = build_call_expr (atomic_feraiseexcept,
50119 1, exceptions_var);
50120 *update = build2 (COMPOUND_EXPR, void_type_node, *update,
50121 atomic_feraiseexcept_call);
50122 }
50123
50124 /* Return mode to be used for bounds or VOIDmode
50125 if bounds are not supported. */
50126
50127 static enum machine_mode
50128 ix86_mpx_bound_mode ()
50129 {
50130 /* Do not support pointer checker if MPX
50131 is not enabled. */
50132 if (!TARGET_MPX)
50133 {
50134 if (flag_check_pointer_bounds)
50135 warning (0, "Pointer Checker requires MPX support on this target."
50136 " Use -mmpx options to enable MPX.");
50137 return VOIDmode;
50138 }
50139
50140 return BNDmode;
50141 }
50142
50143 /* Return constant used to statically initialize constant bounds.
50144
50145 This function is used to create special bound values. For now
50146 only INIT bounds and NONE bounds are expected. More special
50147 values may be added later. */
50148
50149 static tree
50150 ix86_make_bounds_constant (HOST_WIDE_INT lb, HOST_WIDE_INT ub)
50151 {
50152 tree low = lb ? build_minus_one_cst (pointer_sized_int_node)
50153 : build_zero_cst (pointer_sized_int_node);
50154 tree high = ub ? build_zero_cst (pointer_sized_int_node)
50155 : build_minus_one_cst (pointer_sized_int_node);
50156
50157 /* This function is supposed to be used to create INIT and
50158 NONE bounds only. */
50159 gcc_assert ((lb == 0 && ub == -1)
50160 || (lb == -1 && ub == 0));
50161
50162 return build_complex (NULL, low, high);
50163 }
50164
50165 /* Generate a list of statements STMTS to initialize pointer bounds
50166 variable VAR with bounds LB and UB. Return the number of generated
50167 statements. */
50168
50169 static int
50170 ix86_initialize_bounds (tree var, tree lb, tree ub, tree *stmts)
50171 {
50172 tree bnd_ptr = build_pointer_type (pointer_sized_int_node);
50173 tree lhs, modify, var_p;
50174
50175 ub = build1 (BIT_NOT_EXPR, pointer_sized_int_node, ub);
50176 var_p = fold_convert (bnd_ptr, build_fold_addr_expr (var));
50177
50178 lhs = build1 (INDIRECT_REF, pointer_sized_int_node, var_p);
50179 modify = build2 (MODIFY_EXPR, TREE_TYPE (lhs), lhs, lb);
50180 append_to_statement_list (modify, stmts);
50181
50182 lhs = build1 (INDIRECT_REF, pointer_sized_int_node,
50183 build2 (POINTER_PLUS_EXPR, bnd_ptr, var_p,
50184 TYPE_SIZE_UNIT (pointer_sized_int_node)));
50185 modify = build2 (MODIFY_EXPR, TREE_TYPE (lhs), lhs, ub);
50186 append_to_statement_list (modify, stmts);
50187
50188 return 2;
50189 }
50190
50191 #if !TARGET_MACHO && !TARGET_DLLIMPORT_DECL_ATTRIBUTES
50192 /* For i386, common symbol is local only for non-PIE binaries. For
50193 x86-64, common symbol is local only for non-PIE binaries or when the
50194 linker supports copy relocs in PIE binaries. */
50195
50196 static bool
50197 ix86_binds_local_p (const_tree exp)
50198 {
50199 return default_binds_local_p_3 (exp, flag_shlib != 0, true, true,
50200 (!flag_pic
50201 || (TARGET_64BIT
50202 && HAVE_LD_PIE_COPYRELOC != 0)));
50203 }
50204 #endif
50205
50206 /* If MEM is in the form of [base+offset], extract the two parts
50207 of the address into BASE and OFFSET; otherwise return false. */
50208
50209 static bool
50210 extract_base_offset_in_addr (rtx mem, rtx *base, rtx *offset)
50211 {
50212 rtx addr;
50213
50214 gcc_assert (MEM_P (mem));
50215
50216 addr = XEXP (mem, 0);
50217
50218 if (GET_CODE (addr) == CONST)
50219 addr = XEXP (addr, 0);
50220
50221 if (REG_P (addr) || GET_CODE (addr) == SYMBOL_REF)
50222 {
50223 *base = addr;
50224 *offset = const0_rtx;
50225 return true;
50226 }
50227
50228 if (GET_CODE (addr) == PLUS
50229 && (REG_P (XEXP (addr, 0))
50230 || GET_CODE (XEXP (addr, 0)) == SYMBOL_REF)
50231 && CONST_INT_P (XEXP (addr, 1)))
50232 {
50233 *base = XEXP (addr, 0);
50234 *offset = XEXP (addr, 1);
50235 return true;
50236 }
50237
50238 return false;
50239 }
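/* Illustrative example (not from the original source): for a memory
   operand whose address is (plus (reg:DI bx) (const_int 8)), the
   function above sets *BASE to the register and *OFFSET to
   (const_int 8); a bare (reg) or (symbol_ref) address gets
   *OFFSET = const0_rtx.  */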
50240
50241 /* Given OPERANDS of consecutive load/store, check if we can merge
50242 them into move multiple. LOAD is true if they are load instructions.
50243 MODE is the mode of memory operands. */
50244
50245 bool
50246 ix86_operands_ok_for_move_multiple (rtx *operands, bool load,
50247 enum machine_mode mode)
50248 {
50249 HOST_WIDE_INT offval_1, offval_2, msize;
50250 rtx mem_1, mem_2, reg_1, reg_2, base_1, base_2, offset_1, offset_2;
50251
50252 if (load)
50253 {
50254 mem_1 = operands[1];
50255 mem_2 = operands[3];
50256 reg_1 = operands[0];
50257 reg_2 = operands[2];
50258 }
50259 else
50260 {
50261 mem_1 = operands[0];
50262 mem_2 = operands[2];
50263 reg_1 = operands[1];
50264 reg_2 = operands[3];
50265 }
50266
50267 gcc_assert (REG_P (reg_1) && REG_P (reg_2));
50268
50269 if (REGNO (reg_1) != REGNO (reg_2))
50270 return false;
50271
50272 /* Check if the addresses are in the form of [base+offset]. */
50273 if (!extract_base_offset_in_addr (mem_1, &base_1, &offset_1))
50274 return false;
50275 if (!extract_base_offset_in_addr (mem_2, &base_2, &offset_2))
50276 return false;
50277
50278 /* Check if the bases are the same. */
50279 if (!rtx_equal_p (base_1, base_2))
50280 return false;
50281
50282 offval_1 = INTVAL (offset_1);
50283 offval_2 = INTVAL (offset_2);
50284 msize = GET_MODE_SIZE (mode);
50285 /* Check if mem_1 is adjacent to mem_2 and mem_1 has lower address. */
50286 if (offval_1 + msize != offval_2)
50287 return false;
50288
50289 return true;
50290 }
50291
50292 /* Implement the TARGET_OPTAB_SUPPORTED_P hook. */
50293
50294 static bool
50295 ix86_optab_supported_p (int op, machine_mode mode1, machine_mode,
50296 optimization_type opt_type)
50297 {
50298 switch (op)
50299 {
50300 case asin_optab:
50301 case acos_optab:
50302 case log1p_optab:
50303 case exp_optab:
50304 case exp10_optab:
50305 case exp2_optab:
50306 case expm1_optab:
50307 case ldexp_optab:
50308 case scalb_optab:
50309 case round_optab:
50310 return opt_type == OPTIMIZE_FOR_SPEED;
50311
50312 case rint_optab:
50313 if (SSE_FLOAT_MODE_P (mode1)
50314 && TARGET_SSE_MATH
50315 && !flag_trapping_math
50316 && !TARGET_ROUND)
50317 return opt_type == OPTIMIZE_FOR_SPEED;
50318 return true;
50319
50320 case floor_optab:
50321 case ceil_optab:
50322 case btrunc_optab:
50323 if (SSE_FLOAT_MODE_P (mode1)
50324 && TARGET_SSE_MATH
50325 && !flag_trapping_math
50326 && TARGET_ROUND)
50327 return true;
50328 return opt_type == OPTIMIZE_FOR_SPEED;
50329
50330 case rsqrt_optab:
50331 return opt_type == OPTIMIZE_FOR_SPEED && use_rsqrt_p ();
50332
50333 default:
50334 return true;
50335 }
50336 }
50337
50338 /* Address space support.
50339
50340 This is not "far pointers" in the 16-bit sense, but an easy way
50341 to use %fs and %gs segment prefixes. Therefore:
50342
50343 (a) All address spaces have the same modes,
50344 (b) All address spaces have the same address forms,
50345 (c) While %fs and %gs are technically subsets of the generic
50346 address space, they are probably not subsets of each other.
50347 (d) Since we have no access to the segment base register values
50348 without resorting to a system call, we cannot convert a
50349 non-default address space to a default address space.
50350 Therefore we do not claim %fs or %gs are subsets of generic.
50351
50352 Therefore we can (mostly) use the default hooks. */
50353
50354 /* All use of segmentation is assumed to make address 0 valid. */
50355
50356 static bool
50357 ix86_addr_space_zero_address_valid (addr_space_t as)
50358 {
50359 return as != ADDR_SPACE_GENERIC;
50360 }
50361 #undef TARGET_ADDR_SPACE_ZERO_ADDRESS_VALID
50362 #define TARGET_ADDR_SPACE_ZERO_ADDRESS_VALID ix86_addr_space_zero_address_valid
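/* Illustrative use of the %fs/%gs address spaces described above (assumes
   the __seg_fs/__seg_gs keywords that GCC exposes for these spaces):

     typedef struct { int counter; } percpu_t;

     int
     read_counter (void)
     {
       const percpu_t __seg_gs *p = (const percpu_t __seg_gs *) 16;
       return p->counter;        // load from %gs:16
     }

   Offset 0 relative to a segment base is a legitimate object address,
   which is why the hook above reports address 0 as valid for the
   non-generic address spaces.  */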
50363
50364 /* Initialize the GCC target structure. */
50365 #undef TARGET_RETURN_IN_MEMORY
50366 #define TARGET_RETURN_IN_MEMORY ix86_return_in_memory
50367
50368 #undef TARGET_LEGITIMIZE_ADDRESS
50369 #define TARGET_LEGITIMIZE_ADDRESS ix86_legitimize_address
50370
50371 #undef TARGET_ATTRIBUTE_TABLE
50372 #define TARGET_ATTRIBUTE_TABLE ix86_attribute_table
50373 #undef TARGET_FUNCTION_ATTRIBUTE_INLINABLE_P
50374 #define TARGET_FUNCTION_ATTRIBUTE_INLINABLE_P hook_bool_const_tree_true
50375 #if TARGET_DLLIMPORT_DECL_ATTRIBUTES
50376 # undef TARGET_MERGE_DECL_ATTRIBUTES
50377 # define TARGET_MERGE_DECL_ATTRIBUTES merge_dllimport_decl_attributes
50378 #endif
50379
50380 #undef TARGET_COMP_TYPE_ATTRIBUTES
50381 #define TARGET_COMP_TYPE_ATTRIBUTES ix86_comp_type_attributes
50382
50383 #undef TARGET_INIT_BUILTINS
50384 #define TARGET_INIT_BUILTINS ix86_init_builtins
50385 #undef TARGET_BUILTIN_DECL
50386 #define TARGET_BUILTIN_DECL ix86_builtin_decl
50387 #undef TARGET_EXPAND_BUILTIN
50388 #define TARGET_EXPAND_BUILTIN ix86_expand_builtin
50389
50390 #undef TARGET_VECTORIZE_BUILTIN_VECTORIZED_FUNCTION
50391 #define TARGET_VECTORIZE_BUILTIN_VECTORIZED_FUNCTION \
50392 ix86_builtin_vectorized_function
50393
50394 #undef TARGET_VECTORIZE_BUILTIN_GATHER
50395 #define TARGET_VECTORIZE_BUILTIN_GATHER ix86_vectorize_builtin_gather
50396
50397 #undef TARGET_VECTORIZE_BUILTIN_SCATTER
50398 #define TARGET_VECTORIZE_BUILTIN_SCATTER ix86_vectorize_builtin_scatter
50399
50400 #undef TARGET_BUILTIN_RECIPROCAL
50401 #define TARGET_BUILTIN_RECIPROCAL ix86_builtin_reciprocal
50402
50403 #undef TARGET_ASM_FUNCTION_EPILOGUE
50404 #define TARGET_ASM_FUNCTION_EPILOGUE ix86_output_function_epilogue
50405
50406 #undef TARGET_ENCODE_SECTION_INFO
50407 #ifndef SUBTARGET_ENCODE_SECTION_INFO
50408 #define TARGET_ENCODE_SECTION_INFO ix86_encode_section_info
50409 #else
50410 #define TARGET_ENCODE_SECTION_INFO SUBTARGET_ENCODE_SECTION_INFO
50411 #endif
50412
50413 #undef TARGET_ASM_OPEN_PAREN
50414 #define TARGET_ASM_OPEN_PAREN ""
50415 #undef TARGET_ASM_CLOSE_PAREN
50416 #define TARGET_ASM_CLOSE_PAREN ""
50417
50418 #undef TARGET_ASM_BYTE_OP
50419 #define TARGET_ASM_BYTE_OP ASM_BYTE
50420
50421 #undef TARGET_ASM_ALIGNED_HI_OP
50422 #define TARGET_ASM_ALIGNED_HI_OP ASM_SHORT
50423 #undef TARGET_ASM_ALIGNED_SI_OP
50424 #define TARGET_ASM_ALIGNED_SI_OP ASM_LONG
50425 #ifdef ASM_QUAD
50426 #undef TARGET_ASM_ALIGNED_DI_OP
50427 #define TARGET_ASM_ALIGNED_DI_OP ASM_QUAD
50428 #endif
50429
50430 #undef TARGET_PROFILE_BEFORE_PROLOGUE
50431 #define TARGET_PROFILE_BEFORE_PROLOGUE ix86_profile_before_prologue
50432
50433 #undef TARGET_MANGLE_DECL_ASSEMBLER_NAME
50434 #define TARGET_MANGLE_DECL_ASSEMBLER_NAME ix86_mangle_decl_assembler_name
50435
50436 #undef TARGET_ASM_UNALIGNED_HI_OP
50437 #define TARGET_ASM_UNALIGNED_HI_OP TARGET_ASM_ALIGNED_HI_OP
50438 #undef TARGET_ASM_UNALIGNED_SI_OP
50439 #define TARGET_ASM_UNALIGNED_SI_OP TARGET_ASM_ALIGNED_SI_OP
50440 #undef TARGET_ASM_UNALIGNED_DI_OP
50441 #define TARGET_ASM_UNALIGNED_DI_OP TARGET_ASM_ALIGNED_DI_OP
50442
50443 #undef TARGET_PRINT_OPERAND
50444 #define TARGET_PRINT_OPERAND ix86_print_operand
50445 #undef TARGET_PRINT_OPERAND_ADDRESS
50446 #define TARGET_PRINT_OPERAND_ADDRESS ix86_print_operand_address
50447 #undef TARGET_PRINT_OPERAND_PUNCT_VALID_P
50448 #define TARGET_PRINT_OPERAND_PUNCT_VALID_P ix86_print_operand_punct_valid_p
50449 #undef TARGET_ASM_OUTPUT_ADDR_CONST_EXTRA
50450 #define TARGET_ASM_OUTPUT_ADDR_CONST_EXTRA i386_asm_output_addr_const_extra
50451
50452 #undef TARGET_SCHED_INIT_GLOBAL
50453 #define TARGET_SCHED_INIT_GLOBAL ix86_sched_init_global
50454 #undef TARGET_SCHED_ADJUST_COST
50455 #define TARGET_SCHED_ADJUST_COST ix86_adjust_cost
50456 #undef TARGET_SCHED_ISSUE_RATE
50457 #define TARGET_SCHED_ISSUE_RATE ix86_issue_rate
50458 #undef TARGET_SCHED_FIRST_CYCLE_MULTIPASS_DFA_LOOKAHEAD
50459 #define TARGET_SCHED_FIRST_CYCLE_MULTIPASS_DFA_LOOKAHEAD \
50460 ia32_multipass_dfa_lookahead
50461 #undef TARGET_SCHED_MACRO_FUSION_P
50462 #define TARGET_SCHED_MACRO_FUSION_P ix86_macro_fusion_p
50463 #undef TARGET_SCHED_MACRO_FUSION_PAIR_P
50464 #define TARGET_SCHED_MACRO_FUSION_PAIR_P ix86_macro_fusion_pair_p
50465
50466 #undef TARGET_FUNCTION_OK_FOR_SIBCALL
50467 #define TARGET_FUNCTION_OK_FOR_SIBCALL ix86_function_ok_for_sibcall
50468
50469 #undef TARGET_MEMMODEL_CHECK
50470 #define TARGET_MEMMODEL_CHECK ix86_memmodel_check
50471
50472 #undef TARGET_ATOMIC_ASSIGN_EXPAND_FENV
50473 #define TARGET_ATOMIC_ASSIGN_EXPAND_FENV ix86_atomic_assign_expand_fenv
50474
50475 #ifdef HAVE_AS_TLS
50476 #undef TARGET_HAVE_TLS
50477 #define TARGET_HAVE_TLS true
50478 #endif
50479 #undef TARGET_CANNOT_FORCE_CONST_MEM
50480 #define TARGET_CANNOT_FORCE_CONST_MEM ix86_cannot_force_const_mem
50481 #undef TARGET_USE_BLOCKS_FOR_CONSTANT_P
50482 #define TARGET_USE_BLOCKS_FOR_CONSTANT_P hook_bool_mode_const_rtx_true
50483
50484 #undef TARGET_DELEGITIMIZE_ADDRESS
50485 #define TARGET_DELEGITIMIZE_ADDRESS ix86_delegitimize_address
50486
50487 #undef TARGET_MS_BITFIELD_LAYOUT_P
50488 #define TARGET_MS_BITFIELD_LAYOUT_P ix86_ms_bitfield_layout_p
50489
50490 #if TARGET_MACHO
50491 #undef TARGET_BINDS_LOCAL_P
50492 #define TARGET_BINDS_LOCAL_P darwin_binds_local_p
50493 #else
50494 #undef TARGET_BINDS_LOCAL_P
50495 #define TARGET_BINDS_LOCAL_P ix86_binds_local_p
50496 #endif
50497 #if TARGET_DLLIMPORT_DECL_ATTRIBUTES
50498 #undef TARGET_BINDS_LOCAL_P
50499 #define TARGET_BINDS_LOCAL_P i386_pe_binds_local_p
50500 #endif
50501
50502 #undef TARGET_ASM_OUTPUT_MI_THUNK
50503 #define TARGET_ASM_OUTPUT_MI_THUNK x86_output_mi_thunk
50504 #undef TARGET_ASM_CAN_OUTPUT_MI_THUNK
50505 #define TARGET_ASM_CAN_OUTPUT_MI_THUNK x86_can_output_mi_thunk
50506
50507 #undef TARGET_ASM_FILE_START
50508 #define TARGET_ASM_FILE_START x86_file_start
50509
50510 #undef TARGET_OPTION_OVERRIDE
50511 #define TARGET_OPTION_OVERRIDE ix86_option_override
50512
50513 #undef TARGET_REGISTER_MOVE_COST
50514 #define TARGET_REGISTER_MOVE_COST ix86_register_move_cost
50515 #undef TARGET_MEMORY_MOVE_COST
50516 #define TARGET_MEMORY_MOVE_COST ix86_memory_move_cost
50517 #undef TARGET_RTX_COSTS
50518 #define TARGET_RTX_COSTS ix86_rtx_costs
50519 #undef TARGET_ADDRESS_COST
50520 #define TARGET_ADDRESS_COST ix86_address_cost
50521
50522 #undef TARGET_FIXED_CONDITION_CODE_REGS
50523 #define TARGET_FIXED_CONDITION_CODE_REGS ix86_fixed_condition_code_regs
50524 #undef TARGET_CC_MODES_COMPATIBLE
50525 #define TARGET_CC_MODES_COMPATIBLE ix86_cc_modes_compatible
50526
50527 #undef TARGET_MACHINE_DEPENDENT_REORG
50528 #define TARGET_MACHINE_DEPENDENT_REORG ix86_reorg
50529
50530 #undef TARGET_BUILTIN_SETJMP_FRAME_VALUE
50531 #define TARGET_BUILTIN_SETJMP_FRAME_VALUE ix86_builtin_setjmp_frame_value
50532
50533 #undef TARGET_BUILD_BUILTIN_VA_LIST
50534 #define TARGET_BUILD_BUILTIN_VA_LIST ix86_build_builtin_va_list
50535
50536 #undef TARGET_FOLD_BUILTIN
50537 #define TARGET_FOLD_BUILTIN ix86_fold_builtin
50538
50539 #undef TARGET_COMPARE_VERSION_PRIORITY
50540 #define TARGET_COMPARE_VERSION_PRIORITY ix86_compare_version_priority
50541
50542 #undef TARGET_GENERATE_VERSION_DISPATCHER_BODY
50543 #define TARGET_GENERATE_VERSION_DISPATCHER_BODY \
50544 ix86_generate_version_dispatcher_body
50545
50546 #undef TARGET_GET_FUNCTION_VERSIONS_DISPATCHER
50547 #define TARGET_GET_FUNCTION_VERSIONS_DISPATCHER \
50548 ix86_get_function_versions_dispatcher
50549
50550 #undef TARGET_ENUM_VA_LIST_P
50551 #define TARGET_ENUM_VA_LIST_P ix86_enum_va_list
50552
50553 #undef TARGET_FN_ABI_VA_LIST
50554 #define TARGET_FN_ABI_VA_LIST ix86_fn_abi_va_list
50555
50556 #undef TARGET_CANONICAL_VA_LIST_TYPE
50557 #define TARGET_CANONICAL_VA_LIST_TYPE ix86_canonical_va_list_type
50558
50559 #undef TARGET_EXPAND_BUILTIN_VA_START
50560 #define TARGET_EXPAND_BUILTIN_VA_START ix86_va_start
50561
50562 #undef TARGET_MD_ASM_ADJUST
50563 #define TARGET_MD_ASM_ADJUST ix86_md_asm_adjust
50564
50565 #undef TARGET_PROMOTE_PROTOTYPES
50566 #define TARGET_PROMOTE_PROTOTYPES hook_bool_const_tree_true
50567 #undef TARGET_SETUP_INCOMING_VARARGS
50568 #define TARGET_SETUP_INCOMING_VARARGS ix86_setup_incoming_varargs
50569 #undef TARGET_MUST_PASS_IN_STACK
50570 #define TARGET_MUST_PASS_IN_STACK ix86_must_pass_in_stack
50571 #undef TARGET_FUNCTION_ARG_ADVANCE
50572 #define TARGET_FUNCTION_ARG_ADVANCE ix86_function_arg_advance
50573 #undef TARGET_FUNCTION_ARG
50574 #define TARGET_FUNCTION_ARG ix86_function_arg
50575 #undef TARGET_INIT_PIC_REG
50576 #define TARGET_INIT_PIC_REG ix86_init_pic_reg
50577 #undef TARGET_USE_PSEUDO_PIC_REG
50578 #define TARGET_USE_PSEUDO_PIC_REG ix86_use_pseudo_pic_reg
50579 #undef TARGET_FUNCTION_ARG_BOUNDARY
50580 #define TARGET_FUNCTION_ARG_BOUNDARY ix86_function_arg_boundary
50581 #undef TARGET_PASS_BY_REFERENCE
50582 #define TARGET_PASS_BY_REFERENCE ix86_pass_by_reference
50583 #undef TARGET_INTERNAL_ARG_POINTER
50584 #define TARGET_INTERNAL_ARG_POINTER ix86_internal_arg_pointer
50585 #undef TARGET_UPDATE_STACK_BOUNDARY
50586 #define TARGET_UPDATE_STACK_BOUNDARY ix86_update_stack_boundary
50587 #undef TARGET_GET_DRAP_RTX
50588 #define TARGET_GET_DRAP_RTX ix86_get_drap_rtx
50589 #undef TARGET_STRICT_ARGUMENT_NAMING
50590 #define TARGET_STRICT_ARGUMENT_NAMING hook_bool_CUMULATIVE_ARGS_true
50591 #undef TARGET_STATIC_CHAIN
50592 #define TARGET_STATIC_CHAIN ix86_static_chain
50593 #undef TARGET_TRAMPOLINE_INIT
50594 #define TARGET_TRAMPOLINE_INIT ix86_trampoline_init
50595 #undef TARGET_RETURN_POPS_ARGS
50596 #define TARGET_RETURN_POPS_ARGS ix86_return_pops_args
50597
50598 #undef TARGET_LEGITIMATE_COMBINED_INSN
50599 #define TARGET_LEGITIMATE_COMBINED_INSN ix86_legitimate_combined_insn
50600
50601 #undef TARGET_ASAN_SHADOW_OFFSET
50602 #define TARGET_ASAN_SHADOW_OFFSET ix86_asan_shadow_offset
50603
50604 #undef TARGET_GIMPLIFY_VA_ARG_EXPR
50605 #define TARGET_GIMPLIFY_VA_ARG_EXPR ix86_gimplify_va_arg
50606
50607 #undef TARGET_SCALAR_MODE_SUPPORTED_P
50608 #define TARGET_SCALAR_MODE_SUPPORTED_P ix86_scalar_mode_supported_p
50609
50610 #undef TARGET_VECTOR_MODE_SUPPORTED_P
50611 #define TARGET_VECTOR_MODE_SUPPORTED_P ix86_vector_mode_supported_p
50612
50613 #undef TARGET_C_MODE_FOR_SUFFIX
50614 #define TARGET_C_MODE_FOR_SUFFIX ix86_c_mode_for_suffix
50615
50616 #ifdef HAVE_AS_TLS
50617 #undef TARGET_ASM_OUTPUT_DWARF_DTPREL
50618 #define TARGET_ASM_OUTPUT_DWARF_DTPREL i386_output_dwarf_dtprel
50619 #endif
50620
50621 #ifdef SUBTARGET_INSERT_ATTRIBUTES
50622 #undef TARGET_INSERT_ATTRIBUTES
50623 #define TARGET_INSERT_ATTRIBUTES SUBTARGET_INSERT_ATTRIBUTES
50624 #endif
50625
50626 #undef TARGET_MANGLE_TYPE
50627 #define TARGET_MANGLE_TYPE ix86_mangle_type
50628
50629 #ifdef TARGET_THREAD_SSP_OFFSET
50630 #undef TARGET_STACK_PROTECT_GUARD
50631 #define TARGET_STACK_PROTECT_GUARD ix86_stack_protect_guard
50632 #endif
50633
50634 #if !TARGET_MACHO
50635 #undef TARGET_STACK_PROTECT_FAIL
50636 #define TARGET_STACK_PROTECT_FAIL ix86_stack_protect_fail
50637 #endif
50638
50639 #undef TARGET_FUNCTION_VALUE
50640 #define TARGET_FUNCTION_VALUE ix86_function_value
50641
50642 #undef TARGET_FUNCTION_VALUE_REGNO_P
50643 #define TARGET_FUNCTION_VALUE_REGNO_P ix86_function_value_regno_p
50644
50645 #undef TARGET_PROMOTE_FUNCTION_MODE
50646 #define TARGET_PROMOTE_FUNCTION_MODE ix86_promote_function_mode
50647
50648 #undef TARGET_OVERRIDE_OPTIONS_AFTER_CHANGE
50649 #define TARGET_OVERRIDE_OPTIONS_AFTER_CHANGE ix86_override_options_after_change
50650
50651 #undef TARGET_MEMBER_TYPE_FORCES_BLK
50652 #define TARGET_MEMBER_TYPE_FORCES_BLK ix86_member_type_forces_blk
50653
50654 #undef TARGET_INSTANTIATE_DECLS
50655 #define TARGET_INSTANTIATE_DECLS ix86_instantiate_decls
50656
50657 #undef TARGET_SECONDARY_RELOAD
50658 #define TARGET_SECONDARY_RELOAD ix86_secondary_reload
50659
50660 #undef TARGET_CLASS_MAX_NREGS
50661 #define TARGET_CLASS_MAX_NREGS ix86_class_max_nregs
50662
50663 #undef TARGET_PREFERRED_RELOAD_CLASS
50664 #define TARGET_PREFERRED_RELOAD_CLASS ix86_preferred_reload_class
50665 #undef TARGET_PREFERRED_OUTPUT_RELOAD_CLASS
50666 #define TARGET_PREFERRED_OUTPUT_RELOAD_CLASS ix86_preferred_output_reload_class
50667 #undef TARGET_CLASS_LIKELY_SPILLED_P
50668 #define TARGET_CLASS_LIKELY_SPILLED_P ix86_class_likely_spilled_p
50669
50670 #undef TARGET_VECTORIZE_BUILTIN_VECTORIZATION_COST
50671 #define TARGET_VECTORIZE_BUILTIN_VECTORIZATION_COST \
50672 ix86_builtin_vectorization_cost
50673 #undef TARGET_VECTORIZE_VEC_PERM_CONST_OK
50674 #define TARGET_VECTORIZE_VEC_PERM_CONST_OK \
50675 ix86_vectorize_vec_perm_const_ok
50676 #undef TARGET_VECTORIZE_PREFERRED_SIMD_MODE
50677 #define TARGET_VECTORIZE_PREFERRED_SIMD_MODE \
50678 ix86_preferred_simd_mode
50679 #undef TARGET_VECTORIZE_AUTOVECTORIZE_VECTOR_SIZES
50680 #define TARGET_VECTORIZE_AUTOVECTORIZE_VECTOR_SIZES \
50681 ix86_autovectorize_vector_sizes
50682 #undef TARGET_VECTORIZE_GET_MASK_MODE
50683 #define TARGET_VECTORIZE_GET_MASK_MODE ix86_get_mask_mode
50684 #undef TARGET_VECTORIZE_INIT_COST
50685 #define TARGET_VECTORIZE_INIT_COST ix86_init_cost
50686 #undef TARGET_VECTORIZE_ADD_STMT_COST
50687 #define TARGET_VECTORIZE_ADD_STMT_COST ix86_add_stmt_cost
50688 #undef TARGET_VECTORIZE_FINISH_COST
50689 #define TARGET_VECTORIZE_FINISH_COST ix86_finish_cost
50690 #undef TARGET_VECTORIZE_DESTROY_COST_DATA
50691 #define TARGET_VECTORIZE_DESTROY_COST_DATA ix86_destroy_cost_data
50692
50693 #undef TARGET_SET_CURRENT_FUNCTION
50694 #define TARGET_SET_CURRENT_FUNCTION ix86_set_current_function
50695
50696 #undef TARGET_OPTION_VALID_ATTRIBUTE_P
50697 #define TARGET_OPTION_VALID_ATTRIBUTE_P ix86_valid_target_attribute_p
50698
50699 #undef TARGET_OPTION_SAVE
50700 #define TARGET_OPTION_SAVE ix86_function_specific_save
50701
50702 #undef TARGET_OPTION_RESTORE
50703 #define TARGET_OPTION_RESTORE ix86_function_specific_restore
50704
50705 #undef TARGET_OPTION_POST_STREAM_IN
50706 #define TARGET_OPTION_POST_STREAM_IN ix86_function_specific_post_stream_in
50707
50708 #undef TARGET_OPTION_PRINT
50709 #define TARGET_OPTION_PRINT ix86_function_specific_print
50710
50711 #undef TARGET_OPTION_FUNCTION_VERSIONS
50712 #define TARGET_OPTION_FUNCTION_VERSIONS ix86_function_versions
50713
50714 #undef TARGET_CAN_INLINE_P
50715 #define TARGET_CAN_INLINE_P ix86_can_inline_p
50716
50717 #undef TARGET_LEGITIMATE_ADDRESS_P
50718 #define TARGET_LEGITIMATE_ADDRESS_P ix86_legitimate_address_p
50719
50720 #undef TARGET_REGISTER_PRIORITY
50721 #define TARGET_REGISTER_PRIORITY ix86_register_priority
50722
50723 #undef TARGET_REGISTER_USAGE_LEVELING_P
50724 #define TARGET_REGISTER_USAGE_LEVELING_P hook_bool_void_true
50725
50726 #undef TARGET_LEGITIMATE_CONSTANT_P
50727 #define TARGET_LEGITIMATE_CONSTANT_P ix86_legitimate_constant_p
50728
50729 #undef TARGET_FRAME_POINTER_REQUIRED
50730 #define TARGET_FRAME_POINTER_REQUIRED ix86_frame_pointer_required
50731
50732 #undef TARGET_CAN_ELIMINATE
50733 #define TARGET_CAN_ELIMINATE ix86_can_eliminate
50734
50735 #undef TARGET_EXTRA_LIVE_ON_ENTRY
50736 #define TARGET_EXTRA_LIVE_ON_ENTRY ix86_live_on_entry
50737
50738 #undef TARGET_ASM_CODE_END
50739 #define TARGET_ASM_CODE_END ix86_code_end
50740
50741 #undef TARGET_CONDITIONAL_REGISTER_USAGE
50742 #define TARGET_CONDITIONAL_REGISTER_USAGE ix86_conditional_register_usage
50743
50744 #if TARGET_MACHO
50745 #undef TARGET_INIT_LIBFUNCS
50746 #define TARGET_INIT_LIBFUNCS darwin_rename_builtins
50747 #endif
50748
50749 #undef TARGET_LOOP_UNROLL_ADJUST
50750 #define TARGET_LOOP_UNROLL_ADJUST ix86_loop_unroll_adjust
50751
50752 /* Disabled due to PRs 70902, 71453, 71555, 71596 and 71657. */
50753 #undef TARGET_SPILL_CLASS
50754 #define TARGET_SPILL_CLASS ix86_spill_class
50755
50756 #undef TARGET_SIMD_CLONE_COMPUTE_VECSIZE_AND_SIMDLEN
50757 #define TARGET_SIMD_CLONE_COMPUTE_VECSIZE_AND_SIMDLEN \
50758 ix86_simd_clone_compute_vecsize_and_simdlen
50759
50760 #undef TARGET_SIMD_CLONE_ADJUST
50761 #define TARGET_SIMD_CLONE_ADJUST \
50762 ix86_simd_clone_adjust
50763
50764 #undef TARGET_SIMD_CLONE_USABLE
50765 #define TARGET_SIMD_CLONE_USABLE \
50766 ix86_simd_clone_usable
50767
50768 #undef TARGET_FLOAT_EXCEPTIONS_ROUNDING_SUPPORTED_P
50769 #define TARGET_FLOAT_EXCEPTIONS_ROUNDING_SUPPORTED_P \
50770 ix86_float_exceptions_rounding_supported_p
50771
50772 #undef TARGET_MODE_EMIT
50773 #define TARGET_MODE_EMIT ix86_emit_mode_set
50774
50775 #undef TARGET_MODE_NEEDED
50776 #define TARGET_MODE_NEEDED ix86_mode_needed
50777
50778 #undef TARGET_MODE_AFTER
50779 #define TARGET_MODE_AFTER ix86_mode_after
50780
50781 #undef TARGET_MODE_ENTRY
50782 #define TARGET_MODE_ENTRY ix86_mode_entry
50783
50784 #undef TARGET_MODE_EXIT
50785 #define TARGET_MODE_EXIT ix86_mode_exit
50786
50787 #undef TARGET_MODE_PRIORITY
50788 #define TARGET_MODE_PRIORITY ix86_mode_priority
50789
50790 #undef TARGET_CALL_FUSAGE_CONTAINS_NON_CALLEE_CLOBBERS
50791 #define TARGET_CALL_FUSAGE_CONTAINS_NON_CALLEE_CLOBBERS true
50792
50793 #undef TARGET_LOAD_BOUNDS_FOR_ARG
50794 #define TARGET_LOAD_BOUNDS_FOR_ARG ix86_load_bounds
50795
50796 #undef TARGET_STORE_BOUNDS_FOR_ARG
50797 #define TARGET_STORE_BOUNDS_FOR_ARG ix86_store_bounds
50798
50799 #undef TARGET_LOAD_RETURNED_BOUNDS
50800 #define TARGET_LOAD_RETURNED_BOUNDS ix86_load_returned_bounds
50801
50802 #undef TARGET_STORE_RETURNED_BOUNDS
50803 #define TARGET_STORE_RETURNED_BOUNDS ix86_store_returned_bounds
50804
50805 #undef TARGET_CHKP_BOUND_MODE
50806 #define TARGET_CHKP_BOUND_MODE ix86_mpx_bound_mode
50807
50808 #undef TARGET_BUILTIN_CHKP_FUNCTION
50809 #define TARGET_BUILTIN_CHKP_FUNCTION ix86_builtin_mpx_function
50810
50811 #undef TARGET_CHKP_FUNCTION_VALUE_BOUNDS
50812 #define TARGET_CHKP_FUNCTION_VALUE_BOUNDS ix86_function_value_bounds
50813
50814 #undef TARGET_CHKP_MAKE_BOUNDS_CONSTANT
50815 #define TARGET_CHKP_MAKE_BOUNDS_CONSTANT ix86_make_bounds_constant
50816
50817 #undef TARGET_CHKP_INITIALIZE_BOUNDS
50818 #define TARGET_CHKP_INITIALIZE_BOUNDS ix86_initialize_bounds
50819
50820 #undef TARGET_SETUP_INCOMING_VARARG_BOUNDS
50821 #define TARGET_SETUP_INCOMING_VARARG_BOUNDS ix86_setup_incoming_vararg_bounds
50822
50823 #undef TARGET_OFFLOAD_OPTIONS
50824 #define TARGET_OFFLOAD_OPTIONS \
50825 ix86_offload_options
50826
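/* 512 bits, i.e. presumably the 64-byte width of an AVX-512 ZMM vector.  */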
50827 #undef TARGET_ABSOLUTE_BIGGEST_ALIGNMENT
50828 #define TARGET_ABSOLUTE_BIGGEST_ALIGNMENT 512
50829
50830 #undef TARGET_OPTAB_SUPPORTED_P
50831 #define TARGET_OPTAB_SUPPORTED_P ix86_optab_supported_p
50832
50833 #undef TARGET_HARD_REGNO_SCRATCH_OK
50834 #define TARGET_HARD_REGNO_SCRATCH_OK ix86_hard_regno_scratch_ok
50835
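/* A sketch of the mechanism, for orientation only: every TARGET_* macro
   redefined above overrides the default provided by target-def.h, and
   TARGET_INITIALIZER expands to an aggregate initializer collecting all
   of them.  Target-independent code then reaches the i386 implementations
   through the targetm vtable, e.g. (assuming the usual hook grouping from
   target.def):

     if (targetm.addr_space.zero_address_valid (as))
       // ... dispatches to ix86_addr_space_zero_address_valid here
*/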
50836 struct gcc_target targetm = TARGET_INITIALIZER;
50837 \f
50838 #include "gt-i386.h"