1 /* Subroutines used for code generation on IA-32.
2 Copyright (C) 1988-2017 Free Software Foundation, Inc.
3
4 This file is part of GCC.
5
6 GCC is free software; you can redistribute it and/or modify
7 it under the terms of the GNU General Public License as published by
8 the Free Software Foundation; either version 3, or (at your option)
9 any later version.
10
11 GCC is distributed in the hope that it will be useful,
12 but WITHOUT ANY WARRANTY; without even the implied warranty of
13 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
14 GNU General Public License for more details.
15
16 You should have received a copy of the GNU General Public License
17 along with GCC; see the file COPYING3. If not see
18 <http://www.gnu.org/licenses/>. */
19
20 #include "config.h"
21 #include "system.h"
22 #include "coretypes.h"
23 #include "backend.h"
24 #include "rtl.h"
25 #include "tree.h"
26 #include "memmodel.h"
27 #include "gimple.h"
28 #include "cfghooks.h"
29 #include "cfgloop.h"
30 #include "df.h"
31 #include "tm_p.h"
32 #include "stringpool.h"
33 #include "expmed.h"
34 #include "optabs.h"
35 #include "regs.h"
36 #include "emit-rtl.h"
37 #include "recog.h"
38 #include "cgraph.h"
39 #include "diagnostic.h"
40 #include "cfgbuild.h"
41 #include "alias.h"
42 #include "fold-const.h"
43 #include "attribs.h"
44 #include "calls.h"
45 #include "stor-layout.h"
46 #include "varasm.h"
47 #include "output.h"
48 #include "insn-attr.h"
49 #include "flags.h"
50 #include "except.h"
51 #include "explow.h"
52 #include "expr.h"
53 #include "cfgrtl.h"
54 #include "common/common-target.h"
55 #include "langhooks.h"
56 #include "reload.h"
57 #include "gimplify.h"
58 #include "dwarf2.h"
59 #include "tm-constrs.h"
60 #include "params.h"
61 #include "cselib.h"
62 #include "sched-int.h"
63 #include "opts.h"
64 #include "tree-pass.h"
65 #include "context.h"
66 #include "pass_manager.h"
67 #include "target-globals.h"
68 #include "gimple-iterator.h"
69 #include "tree-vectorizer.h"
70 #include "shrink-wrap.h"
71 #include "builtins.h"
72 #include "rtl-iter.h"
73 #include "tree-iterator.h"
74 #include "tree-chkp.h"
75 #include "rtl-chkp.h"
76 #include "dbgcnt.h"
77 #include "case-cfn-macros.h"
78 #include "regrename.h"
79 #include "dojump.h"
80 #include "fold-const-call.h"
81 #include "tree-vrp.h"
82 #include "tree-ssanames.h"
83 #include "selftest.h"
84 #include "selftest-rtl.h"
85 #include "print-rtl.h"
86 #include "intl.h"
87 #include "ifcvt.h"
88
89 /* This file should be included last. */
90 #include "target-def.h"
91
92 static rtx legitimize_dllimport_symbol (rtx, bool);
93 static rtx legitimize_pe_coff_extern_decl (rtx, bool);
94 static rtx legitimize_pe_coff_symbol (rtx, bool);
95 static void ix86_print_operand_address_as (FILE *, rtx, addr_space_t, bool);
96 static bool ix86_save_reg (unsigned int, bool, bool);
97
98 #ifndef CHECK_STACK_LIMIT
99 #define CHECK_STACK_LIMIT (-1)
100 #endif
101
102 /* Return index of given mode in mult and division cost tables. */
103 #define MODE_INDEX(mode) \
104 ((mode) == QImode ? 0 \
105 : (mode) == HImode ? 1 \
106 : (mode) == SImode ? 2 \
107 : (mode) == DImode ? 3 \
108 : 4)
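/* Editorial note: for example, MODE_INDEX (SImode) evaluates to 2 and thus
   selects the SImode ("SI") slot of the multiply and divide cost arrays in
   the processor_costs tables below; any mode not listed above (e.g. TImode)
   falls through to index 4, the "other" slot.  */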
109
110 /* Processor costs (relative to an add) */
111 /* We assume COSTS_N_INSNS is defined as (N)*4 and an addition is 2 bytes. */
112 #define COSTS_N_BYTES(N) ((N) * 2)
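/* Editorial note (worked example): an add is assumed to be 2 bytes, so on the
   size scale it costs COSTS_N_BYTES (2) == 4, which matches
   COSTS_N_INSNS (1) == 4 on the speed scale under the assumption stated
   above; the add therefore remains the common unit of both cost spaces.  */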
113
114 #define DUMMY_STRINGOP_ALGS {libcall, {{-1, libcall, false}}}
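/* Editorial note on reading the stringop_algs initializers below (a sketch,
   not authoritative): the leading algorithm is used when the block size is
   not known at compile time, and each following {max, alg, noalign} triple
   applies to blocks of at most MAX bytes, with max == -1 terminating the
   list ("everything larger").  Element [0] of each pair of tables is
   believed to be the 32-bit variant and element [1] the 64-bit variant.  */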
115
116 static stringop_algs ix86_size_memcpy[2] = {
117 {rep_prefix_1_byte, {{-1, rep_prefix_1_byte, false}}},
118 {rep_prefix_1_byte, {{-1, rep_prefix_1_byte, false}}}};
119 static stringop_algs ix86_size_memset[2] = {
120 {rep_prefix_1_byte, {{-1, rep_prefix_1_byte, false}}},
121 {rep_prefix_1_byte, {{-1, rep_prefix_1_byte, false}}}};
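/* Editorial note: when optimizing for size, both tables above use
   rep_prefix_1_byte ("rep movsb" / "rep stosb") for every block size,
   presumably because it is the shortest encoding available.  */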
122
123 const
124 struct processor_costs ix86_size_cost = {/* costs for tuning for size */
125 COSTS_N_BYTES (2), /* cost of an add instruction */
126 COSTS_N_BYTES (3), /* cost of a lea instruction */
127 COSTS_N_BYTES (2), /* variable shift costs */
128 COSTS_N_BYTES (3), /* constant shift costs */
129 {COSTS_N_BYTES (3), /* cost of starting multiply for QI */
130 COSTS_N_BYTES (3), /* HI */
131 COSTS_N_BYTES (3), /* SI */
132 COSTS_N_BYTES (3), /* DI */
133 COSTS_N_BYTES (5)}, /* other */
134 0, /* cost of multiply per each bit set */
135 {COSTS_N_BYTES (3), /* cost of a divide/mod for QI */
136 COSTS_N_BYTES (3), /* HI */
137 COSTS_N_BYTES (3), /* SI */
138 COSTS_N_BYTES (3), /* DI */
139 COSTS_N_BYTES (5)}, /* other */
140 COSTS_N_BYTES (3), /* cost of movsx */
141 COSTS_N_BYTES (3), /* cost of movzx */
142 0, /* "large" insn */
143 2, /* MOVE_RATIO */
144 2, /* cost for loading QImode using movzbl */
145 {2, 2, 2}, /* cost of loading integer registers
146 in QImode, HImode and SImode.
147 Relative to reg-reg move (2). */
148 {2, 2, 2}, /* cost of storing integer registers */
149 2, /* cost of reg,reg fld/fst */
150 {2, 2, 2}, /* cost of loading fp registers
151 in SFmode, DFmode and XFmode */
152 {2, 2, 2}, /* cost of storing fp registers
153 in SFmode, DFmode and XFmode */
154 3, /* cost of moving MMX register */
155 {3, 3}, /* cost of loading MMX registers
156 in SImode and DImode */
157 {3, 3}, /* cost of storing MMX registers
158 in SImode and DImode */
159 3, /* cost of moving SSE register */
160 {3, 3, 3}, /* cost of loading SSE registers
161 in SImode, DImode and TImode */
162 {3, 3, 3}, /* cost of storing SSE registers
163 in SImode, DImode and TImode */
164 3, /* MMX or SSE register to integer */
165 0, /* size of l1 cache */
166 0, /* size of l2 cache */
167 0, /* size of prefetch block */
168 0, /* number of parallel prefetches */
169 2, /* Branch cost */
170 COSTS_N_BYTES (2), /* cost of FADD and FSUB insns. */
171 COSTS_N_BYTES (2), /* cost of FMUL instruction. */
172 COSTS_N_BYTES (2), /* cost of FDIV instruction. */
173 COSTS_N_BYTES (2), /* cost of FABS instruction. */
174 COSTS_N_BYTES (2), /* cost of FCHS instruction. */
175 COSTS_N_BYTES (2), /* cost of FSQRT instruction. */
176 ix86_size_memcpy,
177 ix86_size_memset,
178 1, /* scalar_stmt_cost. */
179 1, /* scalar load_cost. */
180 1, /* scalar_store_cost. */
181 1, /* vec_stmt_cost. */
182 1, /* vec_to_scalar_cost. */
183 1, /* scalar_to_vec_cost. */
184 1, /* vec_align_load_cost. */
185 1, /* vec_unalign_load_cost. */
186 1, /* vec_store_cost. */
187 1, /* cond_taken_branch_cost. */
188 1, /* cond_not_taken_branch_cost. */
189 };
190
191 /* Processor costs (relative to an add) */
192 static stringop_algs i386_memcpy[2] = {
193 {rep_prefix_1_byte, {{-1, rep_prefix_1_byte, false}}},
194 DUMMY_STRINGOP_ALGS};
195 static stringop_algs i386_memset[2] = {
196 {rep_prefix_1_byte, {{-1, rep_prefix_1_byte, false}}},
197 DUMMY_STRINGOP_ALGS};
198
199 static const
200 struct processor_costs i386_cost = { /* 386 specific costs */
201 COSTS_N_INSNS (1), /* cost of an add instruction */
202 COSTS_N_INSNS (1), /* cost of a lea instruction */
203 COSTS_N_INSNS (3), /* variable shift costs */
204 COSTS_N_INSNS (2), /* constant shift costs */
205 {COSTS_N_INSNS (6), /* cost of starting multiply for QI */
206 COSTS_N_INSNS (6), /* HI */
207 COSTS_N_INSNS (6), /* SI */
208 COSTS_N_INSNS (6), /* DI */
209 COSTS_N_INSNS (6)}, /* other */
210 COSTS_N_INSNS (1), /* cost of multiply per each bit set */
211 {COSTS_N_INSNS (23), /* cost of a divide/mod for QI */
212 COSTS_N_INSNS (23), /* HI */
213 COSTS_N_INSNS (23), /* SI */
214 COSTS_N_INSNS (23), /* DI */
215 COSTS_N_INSNS (23)}, /* other */
216 COSTS_N_INSNS (3), /* cost of movsx */
217 COSTS_N_INSNS (2), /* cost of movzx */
218 15, /* "large" insn */
219 3, /* MOVE_RATIO */
220 4, /* cost for loading QImode using movzbl */
221 {2, 4, 2}, /* cost of loading integer registers
222 in QImode, HImode and SImode.
223 Relative to reg-reg move (2). */
224 {2, 4, 2}, /* cost of storing integer registers */
225 2, /* cost of reg,reg fld/fst */
226 {8, 8, 8}, /* cost of loading fp registers
227 in SFmode, DFmode and XFmode */
228 {8, 8, 8}, /* cost of storing fp registers
229 in SFmode, DFmode and XFmode */
230 2, /* cost of moving MMX register */
231 {4, 8}, /* cost of loading MMX registers
232 in SImode and DImode */
233 {4, 8}, /* cost of storing MMX registers
234 in SImode and DImode */
235 2, /* cost of moving SSE register */
236 {4, 8, 16}, /* cost of loading SSE registers
237 in SImode, DImode and TImode */
238 {4, 8, 16}, /* cost of storing SSE registers
239 in SImode, DImode and TImode */
240 3, /* MMX or SSE register to integer */
241 0, /* size of l1 cache */
242 0, /* size of l2 cache */
243 0, /* size of prefetch block */
244 0, /* number of parallel prefetches */
245 1, /* Branch cost */
246 COSTS_N_INSNS (23), /* cost of FADD and FSUB insns. */
247 COSTS_N_INSNS (27), /* cost of FMUL instruction. */
248 COSTS_N_INSNS (88), /* cost of FDIV instruction. */
249 COSTS_N_INSNS (22), /* cost of FABS instruction. */
250 COSTS_N_INSNS (24), /* cost of FCHS instruction. */
251 COSTS_N_INSNS (122), /* cost of FSQRT instruction. */
252 i386_memcpy,
253 i386_memset,
254 1, /* scalar_stmt_cost. */
255 1, /* scalar load_cost. */
256 1, /* scalar_store_cost. */
257 1, /* vec_stmt_cost. */
258 1, /* vec_to_scalar_cost. */
259 1, /* scalar_to_vec_cost. */
260 1, /* vec_align_load_cost. */
261 2, /* vec_unalign_load_cost. */
262 1, /* vec_store_cost. */
263 3, /* cond_taken_branch_cost. */
264 1, /* cond_not_taken_branch_cost. */
265 };
266
267 static stringop_algs i486_memcpy[2] = {
268 {rep_prefix_4_byte, {{-1, rep_prefix_4_byte, false}}},
269 DUMMY_STRINGOP_ALGS};
270 static stringop_algs i486_memset[2] = {
271 {rep_prefix_4_byte, {{-1, rep_prefix_4_byte, false}}},
272 DUMMY_STRINGOP_ALGS};
273
274 static const
275 struct processor_costs i486_cost = { /* 486 specific costs */
276 COSTS_N_INSNS (1), /* cost of an add instruction */
277 COSTS_N_INSNS (1), /* cost of a lea instruction */
278 COSTS_N_INSNS (3), /* variable shift costs */
279 COSTS_N_INSNS (2), /* constant shift costs */
280 {COSTS_N_INSNS (12), /* cost of starting multiply for QI */
281 COSTS_N_INSNS (12), /* HI */
282 COSTS_N_INSNS (12), /* SI */
283 COSTS_N_INSNS (12), /* DI */
284 COSTS_N_INSNS (12)}, /* other */
285 1, /* cost of multiply per each bit set */
286 {COSTS_N_INSNS (40), /* cost of a divide/mod for QI */
287 COSTS_N_INSNS (40), /* HI */
288 COSTS_N_INSNS (40), /* SI */
289 COSTS_N_INSNS (40), /* DI */
290 COSTS_N_INSNS (40)}, /* other */
291 COSTS_N_INSNS (3), /* cost of movsx */
292 COSTS_N_INSNS (2), /* cost of movzx */
293 15, /* "large" insn */
294 3, /* MOVE_RATIO */
295 4, /* cost for loading QImode using movzbl */
296 {2, 4, 2}, /* cost of loading integer registers
297 in QImode, HImode and SImode.
298 Relative to reg-reg move (2). */
299 {2, 4, 2}, /* cost of storing integer registers */
300 2, /* cost of reg,reg fld/fst */
301 {8, 8, 8}, /* cost of loading fp registers
302 in SFmode, DFmode and XFmode */
303 {8, 8, 8}, /* cost of storing fp registers
304 in SFmode, DFmode and XFmode */
305 2, /* cost of moving MMX register */
306 {4, 8}, /* cost of loading MMX registers
307 in SImode and DImode */
308 {4, 8}, /* cost of storing MMX registers
309 in SImode and DImode */
310 2, /* cost of moving SSE register */
311 {4, 8, 16}, /* cost of loading SSE registers
312 in SImode, DImode and TImode */
313 {4, 8, 16}, /* cost of storing SSE registers
314 in SImode, DImode and TImode */
315 3, /* MMX or SSE register to integer */
316 4, /* size of l1 cache. 486 has 8kB cache
317 shared for code and data, so 4kB is
318 not really precise. */
319 4, /* size of l2 cache */
320 0, /* size of prefetch block */
321 0, /* number of parallel prefetches */
322 1, /* Branch cost */
323 COSTS_N_INSNS (8), /* cost of FADD and FSUB insns. */
324 COSTS_N_INSNS (16), /* cost of FMUL instruction. */
325 COSTS_N_INSNS (73), /* cost of FDIV instruction. */
326 COSTS_N_INSNS (3), /* cost of FABS instruction. */
327 COSTS_N_INSNS (3), /* cost of FCHS instruction. */
328 COSTS_N_INSNS (83), /* cost of FSQRT instruction. */
329 i486_memcpy,
330 i486_memset,
331 1, /* scalar_stmt_cost. */
332 1, /* scalar load_cost. */
333 1, /* scalar_store_cost. */
334 1, /* vec_stmt_cost. */
335 1, /* vec_to_scalar_cost. */
336 1, /* scalar_to_vec_cost. */
337 1, /* vec_align_load_cost. */
338 2, /* vec_unalign_load_cost. */
339 1, /* vec_store_cost. */
340 3, /* cond_taken_branch_cost. */
341 1, /* cond_not_taken_branch_cost. */
342 };
343
344 static stringop_algs pentium_memcpy[2] = {
345 {libcall, {{256, rep_prefix_4_byte, false}, {-1, libcall, false}}},
346 DUMMY_STRINGOP_ALGS};
347 static stringop_algs pentium_memset[2] = {
348 {libcall, {{-1, rep_prefix_4_byte, false}}},
349 DUMMY_STRINGOP_ALGS};
350
351 static const
352 struct processor_costs pentium_cost = {
353 COSTS_N_INSNS (1), /* cost of an add instruction */
354 COSTS_N_INSNS (1), /* cost of a lea instruction */
355 COSTS_N_INSNS (4), /* variable shift costs */
356 COSTS_N_INSNS (1), /* constant shift costs */
357 {COSTS_N_INSNS (11), /* cost of starting multiply for QI */
358 COSTS_N_INSNS (11), /* HI */
359 COSTS_N_INSNS (11), /* SI */
360 COSTS_N_INSNS (11), /* DI */
361 COSTS_N_INSNS (11)}, /* other */
362 0, /* cost of multiply per each bit set */
363 {COSTS_N_INSNS (25), /* cost of a divide/mod for QI */
364 COSTS_N_INSNS (25), /* HI */
365 COSTS_N_INSNS (25), /* SI */
366 COSTS_N_INSNS (25), /* DI */
367 COSTS_N_INSNS (25)}, /* other */
368 COSTS_N_INSNS (3), /* cost of movsx */
369 COSTS_N_INSNS (2), /* cost of movzx */
370 8, /* "large" insn */
371 6, /* MOVE_RATIO */
372 6, /* cost for loading QImode using movzbl */
373 {2, 4, 2}, /* cost of loading integer registers
374 in QImode, HImode and SImode.
375 Relative to reg-reg move (2). */
376 {2, 4, 2}, /* cost of storing integer registers */
377 2, /* cost of reg,reg fld/fst */
378 {2, 2, 6}, /* cost of loading fp registers
379 in SFmode, DFmode and XFmode */
380 {4, 4, 6}, /* cost of storing fp registers
381 in SFmode, DFmode and XFmode */
382 8, /* cost of moving MMX register */
383 {8, 8}, /* cost of loading MMX registers
384 in SImode and DImode */
385 {8, 8}, /* cost of storing MMX registers
386 in SImode and DImode */
387 2, /* cost of moving SSE register */
388 {4, 8, 16}, /* cost of loading SSE registers
389 in SImode, DImode and TImode */
390 {4, 8, 16}, /* cost of storing SSE registers
391 in SImode, DImode and TImode */
392 3, /* MMX or SSE register to integer */
393 8, /* size of l1 cache. */
394 8, /* size of l2 cache */
395 0, /* size of prefetch block */
396 0, /* number of parallel prefetches */
397 2, /* Branch cost */
398 COSTS_N_INSNS (3), /* cost of FADD and FSUB insns. */
399 COSTS_N_INSNS (3), /* cost of FMUL instruction. */
400 COSTS_N_INSNS (39), /* cost of FDIV instruction. */
401 COSTS_N_INSNS (1), /* cost of FABS instruction. */
402 COSTS_N_INSNS (1), /* cost of FCHS instruction. */
403 COSTS_N_INSNS (70), /* cost of FSQRT instruction. */
404 pentium_memcpy,
405 pentium_memset,
406 1, /* scalar_stmt_cost. */
407 1, /* scalar load_cost. */
408 1, /* scalar_store_cost. */
409 1, /* vec_stmt_cost. */
410 1, /* vec_to_scalar_cost. */
411 1, /* scalar_to_vec_cost. */
412 1, /* vec_align_load_cost. */
413 2, /* vec_unalign_load_cost. */
414 1, /* vec_store_cost. */
415 3, /* cond_taken_branch_cost. */
416 1, /* cond_not_taken_branch_cost. */
417 };
418
419 static const
420 struct processor_costs lakemont_cost = {
421 COSTS_N_INSNS (1), /* cost of an add instruction */
422 COSTS_N_INSNS (1) + 1, /* cost of a lea instruction */
423 COSTS_N_INSNS (1), /* variable shift costs */
424 COSTS_N_INSNS (1), /* constant shift costs */
425 {COSTS_N_INSNS (11), /* cost of starting multiply for QI */
426 COSTS_N_INSNS (11), /* HI */
427 COSTS_N_INSNS (11), /* SI */
428 COSTS_N_INSNS (11), /* DI */
429 COSTS_N_INSNS (11)}, /* other */
430 0, /* cost of multiply per each bit set */
431 {COSTS_N_INSNS (25), /* cost of a divide/mod for QI */
432 COSTS_N_INSNS (25), /* HI */
433 COSTS_N_INSNS (25), /* SI */
434 COSTS_N_INSNS (25), /* DI */
435 COSTS_N_INSNS (25)}, /* other */
436 COSTS_N_INSNS (3), /* cost of movsx */
437 COSTS_N_INSNS (2), /* cost of movzx */
438 8, /* "large" insn */
439 17, /* MOVE_RATIO */
440 6, /* cost for loading QImode using movzbl */
441 {2, 4, 2}, /* cost of loading integer registers
442 in QImode, HImode and SImode.
443 Relative to reg-reg move (2). */
444 {2, 4, 2}, /* cost of storing integer registers */
445 2, /* cost of reg,reg fld/fst */
446 {2, 2, 6}, /* cost of loading fp registers
447 in SFmode, DFmode and XFmode */
448 {4, 4, 6}, /* cost of storing fp registers
449 in SFmode, DFmode and XFmode */
450 8, /* cost of moving MMX register */
451 {8, 8}, /* cost of loading MMX registers
452 in SImode and DImode */
453 {8, 8}, /* cost of storing MMX registers
454 in SImode and DImode */
455 2, /* cost of moving SSE register */
456 {4, 8, 16}, /* cost of loading SSE registers
457 in SImode, DImode and TImode */
458 {4, 8, 16}, /* cost of storing SSE registers
459 in SImode, DImode and TImode */
460 3, /* MMX or SSE register to integer */
461 8, /* size of l1 cache. */
462 8, /* size of l2 cache */
463 0, /* size of prefetch block */
464 0, /* number of parallel prefetches */
465 2, /* Branch cost */
466 COSTS_N_INSNS (3), /* cost of FADD and FSUB insns. */
467 COSTS_N_INSNS (3), /* cost of FMUL instruction. */
468 COSTS_N_INSNS (39), /* cost of FDIV instruction. */
469 COSTS_N_INSNS (1), /* cost of FABS instruction. */
470 COSTS_N_INSNS (1), /* cost of FCHS instruction. */
471 COSTS_N_INSNS (70), /* cost of FSQRT instruction. */
472 pentium_memcpy,
473 pentium_memset,
474 1, /* scalar_stmt_cost. */
475 1, /* scalar load_cost. */
476 1, /* scalar_store_cost. */
477 1, /* vec_stmt_cost. */
478 1, /* vec_to_scalar_cost. */
479 1, /* scalar_to_vec_cost. */
480 1, /* vec_align_load_cost. */
481 2, /* vec_unalign_load_cost. */
482 1, /* vec_store_cost. */
483 3, /* cond_taken_branch_cost. */
484 1, /* cond_not_taken_branch_cost. */
485 };
486
487 /* PentiumPro has optimized rep instructions for blocks aligned by 8 bytes
488 (we ensure the alignment). For small blocks an inline loop is still a
489 noticeable win; for bigger blocks either rep movsl or rep movsb is the
490 way to go. Rep movsb apparently has a more expensive startup time in the CPU,
491 but after 4K the difference is down in the noise. */
492 static stringop_algs pentiumpro_memcpy[2] = {
493 {rep_prefix_4_byte, {{128, loop, false}, {1024, unrolled_loop, false},
494 {8192, rep_prefix_4_byte, false},
495 {-1, rep_prefix_1_byte, false}}},
496 DUMMY_STRINGOP_ALGS};
497 static stringop_algs pentiumpro_memset[2] = {
498 {rep_prefix_4_byte, {{1024, unrolled_loop, false},
499 {8192, rep_prefix_4_byte, false},
500 {-1, libcall, false}}},
501 DUMMY_STRINGOP_ALGS};
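/* Editorial note: read pentiumpro_memcpy above as: blocks up to 128 bytes use
   an inline loop, up to 1024 bytes an unrolled loop, up to 8192 bytes
   rep movsl, and anything larger rep movsb; rep movsl is also used when the
   block size is not known at compile time.  */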
502 static const
503 struct processor_costs pentiumpro_cost = {
504 COSTS_N_INSNS (1), /* cost of an add instruction */
505 COSTS_N_INSNS (1), /* cost of a lea instruction */
506 COSTS_N_INSNS (1), /* variable shift costs */
507 COSTS_N_INSNS (1), /* constant shift costs */
508 {COSTS_N_INSNS (4), /* cost of starting multiply for QI */
509 COSTS_N_INSNS (4), /* HI */
510 COSTS_N_INSNS (4), /* SI */
511 COSTS_N_INSNS (4), /* DI */
512 COSTS_N_INSNS (4)}, /* other */
513 0, /* cost of multiply per each bit set */
514 {COSTS_N_INSNS (17), /* cost of a divide/mod for QI */
515 COSTS_N_INSNS (17), /* HI */
516 COSTS_N_INSNS (17), /* SI */
517 COSTS_N_INSNS (17), /* DI */
518 COSTS_N_INSNS (17)}, /* other */
519 COSTS_N_INSNS (1), /* cost of movsx */
520 COSTS_N_INSNS (1), /* cost of movzx */
521 8, /* "large" insn */
522 6, /* MOVE_RATIO */
523 2, /* cost for loading QImode using movzbl */
524 {4, 4, 4}, /* cost of loading integer registers
525 in QImode, HImode and SImode.
526 Relative to reg-reg move (2). */
527 {2, 2, 2}, /* cost of storing integer registers */
528 2, /* cost of reg,reg fld/fst */
529 {2, 2, 6}, /* cost of loading fp registers
530 in SFmode, DFmode and XFmode */
531 {4, 4, 6}, /* cost of storing fp registers
532 in SFmode, DFmode and XFmode */
533 2, /* cost of moving MMX register */
534 {2, 2}, /* cost of loading MMX registers
535 in SImode and DImode */
536 {2, 2}, /* cost of storing MMX registers
537 in SImode and DImode */
538 2, /* cost of moving SSE register */
539 {2, 2, 8}, /* cost of loading SSE registers
540 in SImode, DImode and TImode */
541 {2, 2, 8}, /* cost of storing SSE registers
542 in SImode, DImode and TImode */
543 3, /* MMX or SSE register to integer */
544 8, /* size of l1 cache. */
545 256, /* size of l2 cache */
546 32, /* size of prefetch block */
547 6, /* number of parallel prefetches */
548 2, /* Branch cost */
549 COSTS_N_INSNS (3), /* cost of FADD and FSUB insns. */
550 COSTS_N_INSNS (5), /* cost of FMUL instruction. */
551 COSTS_N_INSNS (56), /* cost of FDIV instruction. */
552 COSTS_N_INSNS (2), /* cost of FABS instruction. */
553 COSTS_N_INSNS (2), /* cost of FCHS instruction. */
554 COSTS_N_INSNS (56), /* cost of FSQRT instruction. */
555 pentiumpro_memcpy,
556 pentiumpro_memset,
557 1, /* scalar_stmt_cost. */
558 1, /* scalar load_cost. */
559 1, /* scalar_store_cost. */
560 1, /* vec_stmt_cost. */
561 1, /* vec_to_scalar_cost. */
562 1, /* scalar_to_vec_cost. */
563 1, /* vec_align_load_cost. */
564 2, /* vec_unalign_load_cost. */
565 1, /* vec_store_cost. */
566 3, /* cond_taken_branch_cost. */
567 1, /* cond_not_taken_branch_cost. */
568 };
569
570 static stringop_algs geode_memcpy[2] = {
571 {libcall, {{256, rep_prefix_4_byte, false}, {-1, libcall, false}}},
572 DUMMY_STRINGOP_ALGS};
573 static stringop_algs geode_memset[2] = {
574 {libcall, {{256, rep_prefix_4_byte, false}, {-1, libcall, false}}},
575 DUMMY_STRINGOP_ALGS};
576 static const
577 struct processor_costs geode_cost = {
578 COSTS_N_INSNS (1), /* cost of an add instruction */
579 COSTS_N_INSNS (1), /* cost of a lea instruction */
580 COSTS_N_INSNS (2), /* variable shift costs */
581 COSTS_N_INSNS (1), /* constant shift costs */
582 {COSTS_N_INSNS (3), /* cost of starting multiply for QI */
583 COSTS_N_INSNS (4), /* HI */
584 COSTS_N_INSNS (7), /* SI */
585 COSTS_N_INSNS (7), /* DI */
586 COSTS_N_INSNS (7)}, /* other */
587 0, /* cost of multiply per each bit set */
588 {COSTS_N_INSNS (15), /* cost of a divide/mod for QI */
589 COSTS_N_INSNS (23), /* HI */
590 COSTS_N_INSNS (39), /* SI */
591 COSTS_N_INSNS (39), /* DI */
592 COSTS_N_INSNS (39)}, /* other */
593 COSTS_N_INSNS (1), /* cost of movsx */
594 COSTS_N_INSNS (1), /* cost of movzx */
595 8, /* "large" insn */
596 4, /* MOVE_RATIO */
597 1, /* cost for loading QImode using movzbl */
598 {1, 1, 1}, /* cost of loading integer registers
599 in QImode, HImode and SImode.
600 Relative to reg-reg move (2). */
601 {1, 1, 1}, /* cost of storing integer registers */
602 1, /* cost of reg,reg fld/fst */
603 {1, 1, 1}, /* cost of loading fp registers
604 in SFmode, DFmode and XFmode */
605 {4, 6, 6}, /* cost of storing fp registers
606 in SFmode, DFmode and XFmode */
607
608 2, /* cost of moving MMX register */
609 {2, 2}, /* cost of loading MMX registers
610 in SImode and DImode */
611 {2, 2}, /* cost of storing MMX registers
612 in SImode and DImode */
613 2, /* cost of moving SSE register */
614 {2, 2, 8}, /* cost of loading SSE registers
615 in SImode, DImode and TImode */
616 {2, 2, 8}, /* cost of storing SSE registers
617 in SImode, DImode and TImode */
618 3, /* MMX or SSE register to integer */
619 64, /* size of l1 cache. */
620 128, /* size of l2 cache. */
621 32, /* size of prefetch block */
622 1, /* number of parallel prefetches */
623 1, /* Branch cost */
624 COSTS_N_INSNS (6), /* cost of FADD and FSUB insns. */
625 COSTS_N_INSNS (11), /* cost of FMUL instruction. */
626 COSTS_N_INSNS (47), /* cost of FDIV instruction. */
627 COSTS_N_INSNS (1), /* cost of FABS instruction. */
628 COSTS_N_INSNS (1), /* cost of FCHS instruction. */
629 COSTS_N_INSNS (54), /* cost of FSQRT instruction. */
630 geode_memcpy,
631 geode_memset,
632 1, /* scalar_stmt_cost. */
633 1, /* scalar load_cost. */
634 1, /* scalar_store_cost. */
635 1, /* vec_stmt_cost. */
636 1, /* vec_to_scalar_cost. */
637 1, /* scalar_to_vec_cost. */
638 1, /* vec_align_load_cost. */
639 2, /* vec_unalign_load_cost. */
640 1, /* vec_store_cost. */
641 3, /* cond_taken_branch_cost. */
642 1, /* cond_not_taken_branch_cost. */
643 };
644
645 static stringop_algs k6_memcpy[2] = {
646 {libcall, {{256, rep_prefix_4_byte, false}, {-1, libcall, false}}},
647 DUMMY_STRINGOP_ALGS};
648 static stringop_algs k6_memset[2] = {
649 {libcall, {{256, rep_prefix_4_byte, false}, {-1, libcall, false}}},
650 DUMMY_STRINGOP_ALGS};
651 static const
652 struct processor_costs k6_cost = {
653 COSTS_N_INSNS (1), /* cost of an add instruction */
654 COSTS_N_INSNS (2), /* cost of a lea instruction */
655 COSTS_N_INSNS (1), /* variable shift costs */
656 COSTS_N_INSNS (1), /* constant shift costs */
657 {COSTS_N_INSNS (3), /* cost of starting multiply for QI */
658 COSTS_N_INSNS (3), /* HI */
659 COSTS_N_INSNS (3), /* SI */
660 COSTS_N_INSNS (3), /* DI */
661 COSTS_N_INSNS (3)}, /* other */
662 0, /* cost of multiply per each bit set */
663 {COSTS_N_INSNS (18), /* cost of a divide/mod for QI */
664 COSTS_N_INSNS (18), /* HI */
665 COSTS_N_INSNS (18), /* SI */
666 COSTS_N_INSNS (18), /* DI */
667 COSTS_N_INSNS (18)}, /* other */
668 COSTS_N_INSNS (2), /* cost of movsx */
669 COSTS_N_INSNS (2), /* cost of movzx */
670 8, /* "large" insn */
671 4, /* MOVE_RATIO */
672 3, /* cost for loading QImode using movzbl */
673 {4, 5, 4}, /* cost of loading integer registers
674 in QImode, HImode and SImode.
675 Relative to reg-reg move (2). */
676 {2, 3, 2}, /* cost of storing integer registers */
677 4, /* cost of reg,reg fld/fst */
678 {6, 6, 6}, /* cost of loading fp registers
679 in SFmode, DFmode and XFmode */
680 {4, 4, 4}, /* cost of storing fp registers
681 in SFmode, DFmode and XFmode */
682 2, /* cost of moving MMX register */
683 {2, 2}, /* cost of loading MMX registers
684 in SImode and DImode */
685 {2, 2}, /* cost of storing MMX registers
686 in SImode and DImode */
687 2, /* cost of moving SSE register */
688 {2, 2, 8}, /* cost of loading SSE registers
689 in SImode, DImode and TImode */
690 {2, 2, 8}, /* cost of storing SSE registers
691 in SImode, DImode and TImode */
692 6, /* MMX or SSE register to integer */
693 32, /* size of l1 cache. */
694 32, /* size of l2 cache. Some models
695 have integrated l2 cache, but
696 optimizing for k6 is not important
697 enough to worry about that. */
698 32, /* size of prefetch block */
699 1, /* number of parallel prefetches */
700 1, /* Branch cost */
701 COSTS_N_INSNS (2), /* cost of FADD and FSUB insns. */
702 COSTS_N_INSNS (2), /* cost of FMUL instruction. */
703 COSTS_N_INSNS (56), /* cost of FDIV instruction. */
704 COSTS_N_INSNS (2), /* cost of FABS instruction. */
705 COSTS_N_INSNS (2), /* cost of FCHS instruction. */
706 COSTS_N_INSNS (56), /* cost of FSQRT instruction. */
707 k6_memcpy,
708 k6_memset,
709 1, /* scalar_stmt_cost. */
710 1, /* scalar load_cost. */
711 1, /* scalar_store_cost. */
712 1, /* vec_stmt_cost. */
713 1, /* vec_to_scalar_cost. */
714 1, /* scalar_to_vec_cost. */
715 1, /* vec_align_load_cost. */
716 2, /* vec_unalign_load_cost. */
717 1, /* vec_store_cost. */
718 3, /* cond_taken_branch_cost. */
719 1, /* cond_not_taken_branch_cost. */
720 };
721
722 /* For some reason, Athlon deals better with the REP prefix (relative to loops)
723 than K8 does. Alignment becomes important after 8 bytes for memcpy and
724 128 bytes for memset. */
725 static stringop_algs athlon_memcpy[2] = {
726 {libcall, {{2048, rep_prefix_4_byte, false}, {-1, libcall, false}}},
727 DUMMY_STRINGOP_ALGS};
728 static stringop_algs athlon_memset[2] = {
729 {libcall, {{2048, rep_prefix_4_byte, false}, {-1, libcall, false}}},
730 DUMMY_STRINGOP_ALGS};
731 static const
732 struct processor_costs athlon_cost = {
733 COSTS_N_INSNS (1), /* cost of an add instruction */
734 COSTS_N_INSNS (2), /* cost of a lea instruction */
735 COSTS_N_INSNS (1), /* variable shift costs */
736 COSTS_N_INSNS (1), /* constant shift costs */
737 {COSTS_N_INSNS (5), /* cost of starting multiply for QI */
738 COSTS_N_INSNS (5), /* HI */
739 COSTS_N_INSNS (5), /* SI */
740 COSTS_N_INSNS (5), /* DI */
741 COSTS_N_INSNS (5)}, /* other */
742 0, /* cost of multiply per each bit set */
743 {COSTS_N_INSNS (18), /* cost of a divide/mod for QI */
744 COSTS_N_INSNS (26), /* HI */
745 COSTS_N_INSNS (42), /* SI */
746 COSTS_N_INSNS (74), /* DI */
747 COSTS_N_INSNS (74)}, /* other */
748 COSTS_N_INSNS (1), /* cost of movsx */
749 COSTS_N_INSNS (1), /* cost of movzx */
750 8, /* "large" insn */
751 9, /* MOVE_RATIO */
752 4, /* cost for loading QImode using movzbl */
753 {3, 4, 3}, /* cost of loading integer registers
754 in QImode, HImode and SImode.
755 Relative to reg-reg move (2). */
756 {3, 4, 3}, /* cost of storing integer registers */
757 4, /* cost of reg,reg fld/fst */
758 {4, 4, 12}, /* cost of loading fp registers
759 in SFmode, DFmode and XFmode */
760 {6, 6, 8}, /* cost of storing fp registers
761 in SFmode, DFmode and XFmode */
762 2, /* cost of moving MMX register */
763 {4, 4}, /* cost of loading MMX registers
764 in SImode and DImode */
765 {4, 4}, /* cost of storing MMX registers
766 in SImode and DImode */
767 2, /* cost of moving SSE register */
768 {4, 4, 6}, /* cost of loading SSE registers
769 in SImode, DImode and TImode */
770 {4, 4, 5}, /* cost of storing SSE registers
771 in SImode, DImode and TImode */
772 5, /* MMX or SSE register to integer */
773 64, /* size of l1 cache. */
774 256, /* size of l2 cache. */
775 64, /* size of prefetch block */
776 6, /* number of parallel prefetches */
777 5, /* Branch cost */
778 COSTS_N_INSNS (4), /* cost of FADD and FSUB insns. */
779 COSTS_N_INSNS (4), /* cost of FMUL instruction. */
780 COSTS_N_INSNS (24), /* cost of FDIV instruction. */
781 COSTS_N_INSNS (2), /* cost of FABS instruction. */
782 COSTS_N_INSNS (2), /* cost of FCHS instruction. */
783 COSTS_N_INSNS (35), /* cost of FSQRT instruction. */
784 athlon_memcpy,
785 athlon_memset,
786 1, /* scalar_stmt_cost. */
787 1, /* scalar load_cost. */
788 1, /* scalar_store_cost. */
789 1, /* vec_stmt_cost. */
790 1, /* vec_to_scalar_cost. */
791 1, /* scalar_to_vec_cost. */
792 1, /* vec_align_load_cost. */
793 2, /* vec_unalign_load_cost. */
794 1, /* vec_store_cost. */
795 3, /* cond_taken_branch_cost. */
796 1, /* cond_not_taken_branch_cost. */
797 };
798
799 /* K8 has an optimized REP instruction for medium-sized blocks, but for very
800 small blocks it is better to use a loop. For large blocks, a libcall can
801 do non-temporal accesses and beat inline code considerably. */
802 static stringop_algs k8_memcpy[2] = {
803 {libcall, {{6, loop, false}, {14, unrolled_loop, false},
804 {-1, rep_prefix_4_byte, false}}},
805 {libcall, {{16, loop, false}, {8192, rep_prefix_8_byte, false},
806 {-1, libcall, false}}}};
807 static stringop_algs k8_memset[2] = {
808 {libcall, {{8, loop, false}, {24, unrolled_loop, false},
809 {2048, rep_prefix_4_byte, false}, {-1, libcall, false}}},
810 {libcall, {{48, unrolled_loop, false},
811 {8192, rep_prefix_8_byte, false}, {-1, libcall, false}}}};
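/* Editorial note: in k8_memcpy above, the second initializer (taken to be
   the 64-bit variant) uses an inline loop only up to 16 bytes, rep movsq up
   to 8192 bytes, and a library call beyond that, matching the comment about
   libcalls winning for large blocks.  */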
812 static const
813 struct processor_costs k8_cost = {
814 COSTS_N_INSNS (1), /* cost of an add instruction */
815 COSTS_N_INSNS (2), /* cost of a lea instruction */
816 COSTS_N_INSNS (1), /* variable shift costs */
817 COSTS_N_INSNS (1), /* constant shift costs */
818 {COSTS_N_INSNS (3), /* cost of starting multiply for QI */
819 COSTS_N_INSNS (4), /* HI */
820 COSTS_N_INSNS (3), /* SI */
821 COSTS_N_INSNS (4), /* DI */
822 COSTS_N_INSNS (5)}, /* other */
823 0, /* cost of multiply per each bit set */
824 {COSTS_N_INSNS (18), /* cost of a divide/mod for QI */
825 COSTS_N_INSNS (26), /* HI */
826 COSTS_N_INSNS (42), /* SI */
827 COSTS_N_INSNS (74), /* DI */
828 COSTS_N_INSNS (74)}, /* other */
829 COSTS_N_INSNS (1), /* cost of movsx */
830 COSTS_N_INSNS (1), /* cost of movzx */
831 8, /* "large" insn */
832 9, /* MOVE_RATIO */
833 4, /* cost for loading QImode using movzbl */
834 {3, 4, 3}, /* cost of loading integer registers
835 in QImode, HImode and SImode.
836 Relative to reg-reg move (2). */
837 {3, 4, 3}, /* cost of storing integer registers */
838 4, /* cost of reg,reg fld/fst */
839 {4, 4, 12}, /* cost of loading fp registers
840 in SFmode, DFmode and XFmode */
841 {6, 6, 8}, /* cost of storing fp registers
842 in SFmode, DFmode and XFmode */
843 2, /* cost of moving MMX register */
844 {3, 3}, /* cost of loading MMX registers
845 in SImode and DImode */
846 {4, 4}, /* cost of storing MMX registers
847 in SImode and DImode */
848 2, /* cost of moving SSE register */
849 {4, 3, 6}, /* cost of loading SSE registers
850 in SImode, DImode and TImode */
851 {4, 4, 5}, /* cost of storing SSE registers
852 in SImode, DImode and TImode */
853 5, /* MMX or SSE register to integer */
854 64, /* size of l1 cache. */
855 512, /* size of l2 cache. */
856 64, /* size of prefetch block */
857 /* New AMD processors never drop prefetches; if they cannot be performed
858 immediately, they are queued. We set number of simultaneous prefetches
859 to a large constant to reflect this (it probably is not a good idea not
860 to limit number of prefetches at all, as their execution also takes some
861 time). */
862 100, /* number of parallel prefetches */
863 3, /* Branch cost */
864 COSTS_N_INSNS (4), /* cost of FADD and FSUB insns. */
865 COSTS_N_INSNS (4), /* cost of FMUL instruction. */
866 COSTS_N_INSNS (19), /* cost of FDIV instruction. */
867 COSTS_N_INSNS (2), /* cost of FABS instruction. */
868 COSTS_N_INSNS (2), /* cost of FCHS instruction. */
869 COSTS_N_INSNS (35), /* cost of FSQRT instruction. */
870
871 k8_memcpy,
872 k8_memset,
873 4, /* scalar_stmt_cost. */
874 2, /* scalar load_cost. */
875 2, /* scalar_store_cost. */
876 5, /* vec_stmt_cost. */
877 0, /* vec_to_scalar_cost. */
878 2, /* scalar_to_vec_cost. */
879 2, /* vec_align_load_cost. */
880 3, /* vec_unalign_load_cost. */
881 3, /* vec_store_cost. */
882 3, /* cond_taken_branch_cost. */
883 2, /* cond_not_taken_branch_cost. */
884 };
885
886 /* AMDFAM10 has an optimized REP instruction for medium-sized blocks, but for
887 very small blocks it is better to use a loop. For large blocks, a libcall can
888 do non-temporal accesses and beat inline code considerably. */
889 static stringop_algs amdfam10_memcpy[2] = {
890 {libcall, {{6, loop, false}, {14, unrolled_loop, false},
891 {-1, rep_prefix_4_byte, false}}},
892 {libcall, {{16, loop, false}, {8192, rep_prefix_8_byte, false},
893 {-1, libcall, false}}}};
894 static stringop_algs amdfam10_memset[2] = {
895 {libcall, {{8, loop, false}, {24, unrolled_loop, false},
896 {2048, rep_prefix_4_byte, false}, {-1, libcall, false}}},
897 {libcall, {{48, unrolled_loop, false}, {8192, rep_prefix_8_byte, false},
898 {-1, libcall, false}}}};
899 struct processor_costs amdfam10_cost = {
900 COSTS_N_INSNS (1), /* cost of an add instruction */
901 COSTS_N_INSNS (2), /* cost of a lea instruction */
902 COSTS_N_INSNS (1), /* variable shift costs */
903 COSTS_N_INSNS (1), /* constant shift costs */
904 {COSTS_N_INSNS (3), /* cost of starting multiply for QI */
905 COSTS_N_INSNS (4), /* HI */
906 COSTS_N_INSNS (3), /* SI */
907 COSTS_N_INSNS (4), /* DI */
908 COSTS_N_INSNS (5)}, /* other */
909 0, /* cost of multiply per each bit set */
910 {COSTS_N_INSNS (19), /* cost of a divide/mod for QI */
911 COSTS_N_INSNS (35), /* HI */
912 COSTS_N_INSNS (51), /* SI */
913 COSTS_N_INSNS (83), /* DI */
914 COSTS_N_INSNS (83)}, /* other */
915 COSTS_N_INSNS (1), /* cost of movsx */
916 COSTS_N_INSNS (1), /* cost of movzx */
917 8, /* "large" insn */
918 9, /* MOVE_RATIO */
919 4, /* cost for loading QImode using movzbl */
920 {3, 4, 3}, /* cost of loading integer registers
921 in QImode, HImode and SImode.
922 Relative to reg-reg move (2). */
923 {3, 4, 3}, /* cost of storing integer registers */
924 4, /* cost of reg,reg fld/fst */
925 {4, 4, 12}, /* cost of loading fp registers
926 in SFmode, DFmode and XFmode */
927 {6, 6, 8}, /* cost of storing fp registers
928 in SFmode, DFmode and XFmode */
929 2, /* cost of moving MMX register */
930 {3, 3}, /* cost of loading MMX registers
931 in SImode and DImode */
932 {4, 4}, /* cost of storing MMX registers
933 in SImode and DImode */
934 2, /* cost of moving SSE register */
935 {4, 4, 3}, /* cost of loading SSE registers
936 in SImode, DImode and TImode */
937 {4, 4, 5}, /* cost of storing SSE registers
938 in SImode, DImode and TImode */
939 3, /* MMX or SSE register to integer */
940 /* On K8:
941 MOVD reg64, xmmreg Double FSTORE 4
942 MOVD reg32, xmmreg Double FSTORE 4
943 On AMDFAM10:
944 MOVD reg64, xmmreg Double FADD 3
945 1/1 1/1
946 MOVD reg32, xmmreg Double FADD 3
947 1/1 1/1 */
948 64, /* size of l1 cache. */
949 512, /* size of l2 cache. */
950 64, /* size of prefetch block */
951 /* New AMD processors never drop prefetches; if they cannot be performed
952 immediately, they are queued. We set number of simultaneous prefetches
953 to a large constant to reflect this (it probably is not a good idea not
954 to limit number of prefetches at all, as their execution also takes some
955 time). */
956 100, /* number of parallel prefetches */
957 2, /* Branch cost */
958 COSTS_N_INSNS (4), /* cost of FADD and FSUB insns. */
959 COSTS_N_INSNS (4), /* cost of FMUL instruction. */
960 COSTS_N_INSNS (19), /* cost of FDIV instruction. */
961 COSTS_N_INSNS (2), /* cost of FABS instruction. */
962 COSTS_N_INSNS (2), /* cost of FCHS instruction. */
963 COSTS_N_INSNS (35), /* cost of FSQRT instruction. */
964
965 amdfam10_memcpy,
966 amdfam10_memset,
967 4, /* scalar_stmt_cost. */
968 2, /* scalar load_cost. */
969 2, /* scalar_store_cost. */
970 6, /* vec_stmt_cost. */
971 0, /* vec_to_scalar_cost. */
972 2, /* scalar_to_vec_cost. */
973 2, /* vec_align_load_cost. */
974 2, /* vec_unalign_load_cost. */
975 2, /* vec_store_cost. */
976 2, /* cond_taken_branch_cost. */
977 1, /* cond_not_taken_branch_cost. */
978 };
979
980 /* BDVER1 has an optimized REP instruction for medium-sized blocks, but for
981 very small blocks it is better to use a loop. For large blocks, a libcall
982 can do non-temporal accesses and beat inline code considerably. */
983 static stringop_algs bdver1_memcpy[2] = {
984 {libcall, {{6, loop, false}, {14, unrolled_loop, false},
985 {-1, rep_prefix_4_byte, false}}},
986 {libcall, {{16, loop, false}, {8192, rep_prefix_8_byte, false},
987 {-1, libcall, false}}}};
988 static stringop_algs bdver1_memset[2] = {
989 {libcall, {{8, loop, false}, {24, unrolled_loop, false},
990 {2048, rep_prefix_4_byte, false}, {-1, libcall, false}}},
991 {libcall, {{48, unrolled_loop, false}, {8192, rep_prefix_8_byte, false},
992 {-1, libcall, false}}}};
993
994 const struct processor_costs bdver1_cost = {
995 COSTS_N_INSNS (1), /* cost of an add instruction */
996 COSTS_N_INSNS (1), /* cost of a lea instruction */
997 COSTS_N_INSNS (1), /* variable shift costs */
998 COSTS_N_INSNS (1), /* constant shift costs */
999 {COSTS_N_INSNS (4), /* cost of starting multiply for QI */
1000 COSTS_N_INSNS (4), /* HI */
1001 COSTS_N_INSNS (4), /* SI */
1002 COSTS_N_INSNS (6), /* DI */
1003 COSTS_N_INSNS (6)}, /* other */
1004 0, /* cost of multiply per each bit set */
1005 {COSTS_N_INSNS (19), /* cost of a divide/mod for QI */
1006 COSTS_N_INSNS (35), /* HI */
1007 COSTS_N_INSNS (51), /* SI */
1008 COSTS_N_INSNS (83), /* DI */
1009 COSTS_N_INSNS (83)}, /* other */
1010 COSTS_N_INSNS (1), /* cost of movsx */
1011 COSTS_N_INSNS (1), /* cost of movzx */
1012 8, /* "large" insn */
1013 9, /* MOVE_RATIO */
1014 4, /* cost for loading QImode using movzbl */
1015 {5, 5, 4}, /* cost of loading integer registers
1016 in QImode, HImode and SImode.
1017 Relative to reg-reg move (2). */
1018 {4, 4, 4}, /* cost of storing integer registers */
1019 2, /* cost of reg,reg fld/fst */
1020 {5, 5, 12}, /* cost of loading fp registers
1021 in SFmode, DFmode and XFmode */
1022 {4, 4, 8}, /* cost of storing fp registers
1023 in SFmode, DFmode and XFmode */
1024 2, /* cost of moving MMX register */
1025 {4, 4}, /* cost of loading MMX registers
1026 in SImode and DImode */
1027 {4, 4}, /* cost of storing MMX registers
1028 in SImode and DImode */
1029 2, /* cost of moving SSE register */
1030 {4, 4, 4}, /* cost of loading SSE registers
1031 in SImode, DImode and TImode */
1032 {4, 4, 4}, /* cost of storing SSE registers
1033 in SImode, DImode and TImode */
1034 2, /* MMX or SSE register to integer */
1035 /* On K8:
1036 MOVD reg64, xmmreg Double FSTORE 4
1037 MOVD reg32, xmmreg Double FSTORE 4
1038 On AMDFAM10:
1039 MOVD reg64, xmmreg Double FADD 3
1040 1/1 1/1
1041 MOVD reg32, xmmreg Double FADD 3
1042 1/1 1/1 */
1043 16, /* size of l1 cache. */
1044 2048, /* size of l2 cache. */
1045 64, /* size of prefetch block */
1046 /* New AMD processors never drop prefetches; if they cannot be performed
1047 immediately, they are queued. We set number of simultaneous prefetches
1048 to a large constant to reflect this (it probably is not a good idea not
1049 to limit number of prefetches at all, as their execution also takes some
1050 time). */
1051 100, /* number of parallel prefetches */
1052 2, /* Branch cost */
1053 COSTS_N_INSNS (6), /* cost of FADD and FSUB insns. */
1054 COSTS_N_INSNS (6), /* cost of FMUL instruction. */
1055 COSTS_N_INSNS (42), /* cost of FDIV instruction. */
1056 COSTS_N_INSNS (2), /* cost of FABS instruction. */
1057 COSTS_N_INSNS (2), /* cost of FCHS instruction. */
1058 COSTS_N_INSNS (52), /* cost of FSQRT instruction. */
1059
1060 bdver1_memcpy,
1061 bdver1_memset,
1062 6, /* scalar_stmt_cost. */
1063 4, /* scalar load_cost. */
1064 4, /* scalar_store_cost. */
1065 6, /* vec_stmt_cost. */
1066 0, /* vec_to_scalar_cost. */
1067 2, /* scalar_to_vec_cost. */
1068 4, /* vec_align_load_cost. */
1069 4, /* vec_unalign_load_cost. */
1070 4, /* vec_store_cost. */
1071 4, /* cond_taken_branch_cost. */
1072 2, /* cond_not_taken_branch_cost. */
1073 };
1074
1075 /* BDVER2 has an optimized REP instruction for medium-sized blocks, but for
1076 very small blocks it is better to use a loop. For large blocks, a libcall
1077 can do non-temporal accesses and beat inline code considerably. */
1078
1079 static stringop_algs bdver2_memcpy[2] = {
1080 {libcall, {{6, loop, false}, {14, unrolled_loop, false},
1081 {-1, rep_prefix_4_byte, false}}},
1082 {libcall, {{16, loop, false}, {8192, rep_prefix_8_byte, false},
1083 {-1, libcall, false}}}};
1084 static stringop_algs bdver2_memset[2] = {
1085 {libcall, {{8, loop, false}, {24, unrolled_loop, false},
1086 {2048, rep_prefix_4_byte, false}, {-1, libcall, false}}},
1087 {libcall, {{48, unrolled_loop, false}, {8192, rep_prefix_8_byte, false},
1088 {-1, libcall, false}}}};
1089
1090 const struct processor_costs bdver2_cost = {
1091 COSTS_N_INSNS (1), /* cost of an add instruction */
1092 COSTS_N_INSNS (1), /* cost of a lea instruction */
1093 COSTS_N_INSNS (1), /* variable shift costs */
1094 COSTS_N_INSNS (1), /* constant shift costs */
1095 {COSTS_N_INSNS (4), /* cost of starting multiply for QI */
1096 COSTS_N_INSNS (4), /* HI */
1097 COSTS_N_INSNS (4), /* SI */
1098 COSTS_N_INSNS (6), /* DI */
1099 COSTS_N_INSNS (6)}, /* other */
1100 0, /* cost of multiply per each bit set */
1101 {COSTS_N_INSNS (19), /* cost of a divide/mod for QI */
1102 COSTS_N_INSNS (35), /* HI */
1103 COSTS_N_INSNS (51), /* SI */
1104 COSTS_N_INSNS (83), /* DI */
1105 COSTS_N_INSNS (83)}, /* other */
1106 COSTS_N_INSNS (1), /* cost of movsx */
1107 COSTS_N_INSNS (1), /* cost of movzx */
1108 8, /* "large" insn */
1109 9, /* MOVE_RATIO */
1110 4, /* cost for loading QImode using movzbl */
1111 {5, 5, 4}, /* cost of loading integer registers
1112 in QImode, HImode and SImode.
1113 Relative to reg-reg move (2). */
1114 {4, 4, 4}, /* cost of storing integer registers */
1115 2, /* cost of reg,reg fld/fst */
1116 {5, 5, 12}, /* cost of loading fp registers
1117 in SFmode, DFmode and XFmode */
1118 {4, 4, 8}, /* cost of storing fp registers
1119 in SFmode, DFmode and XFmode */
1120 2, /* cost of moving MMX register */
1121 {4, 4}, /* cost of loading MMX registers
1122 in SImode and DImode */
1123 {4, 4}, /* cost of storing MMX registers
1124 in SImode and DImode */
1125 2, /* cost of moving SSE register */
1126 {4, 4, 4}, /* cost of loading SSE registers
1127 in SImode, DImode and TImode */
1128 {4, 4, 4}, /* cost of storing SSE registers
1129 in SImode, DImode and TImode */
1130 2, /* MMX or SSE register to integer */
1131 /* On K8:
1132 MOVD reg64, xmmreg Double FSTORE 4
1133 MOVD reg32, xmmreg Double FSTORE 4
1134 On AMDFAM10:
1135 MOVD reg64, xmmreg Double FADD 3
1136 1/1 1/1
1137 MOVD reg32, xmmreg Double FADD 3
1138 1/1 1/1 */
1139 16, /* size of l1 cache. */
1140 2048, /* size of l2 cache. */
1141 64, /* size of prefetch block */
1142 /* New AMD processors never drop prefetches; if they cannot be performed
1143 immediately, they are queued. We set number of simultaneous prefetches
1144 to a large constant to reflect this (it probably is not a good idea not
1145 to limit number of prefetches at all, as their execution also takes some
1146 time). */
1147 100, /* number of parallel prefetches */
1148 2, /* Branch cost */
1149 COSTS_N_INSNS (6), /* cost of FADD and FSUB insns. */
1150 COSTS_N_INSNS (6), /* cost of FMUL instruction. */
1151 COSTS_N_INSNS (42), /* cost of FDIV instruction. */
1152 COSTS_N_INSNS (2), /* cost of FABS instruction. */
1153 COSTS_N_INSNS (2), /* cost of FCHS instruction. */
1154 COSTS_N_INSNS (52), /* cost of FSQRT instruction. */
1155
1156 bdver2_memcpy,
1157 bdver2_memset,
1158 6, /* scalar_stmt_cost. */
1159 4, /* scalar load_cost. */
1160 4, /* scalar_store_cost. */
1161 6, /* vec_stmt_cost. */
1162 0, /* vec_to_scalar_cost. */
1163 2, /* scalar_to_vec_cost. */
1164 4, /* vec_align_load_cost. */
1165 4, /* vec_unalign_load_cost. */
1166 4, /* vec_store_cost. */
1167 4, /* cond_taken_branch_cost. */
1168 2, /* cond_not_taken_branch_cost. */
1169 };
1170
1171
1172 /* BDVER3 has an optimized REP instruction for medium-sized blocks, but for
1173 very small blocks it is better to use a loop. For large blocks, a libcall
1174 can do non-temporal accesses and beat inline code considerably. */
1175 static stringop_algs bdver3_memcpy[2] = {
1176 {libcall, {{6, loop, false}, {14, unrolled_loop, false},
1177 {-1, rep_prefix_4_byte, false}}},
1178 {libcall, {{16, loop, false}, {8192, rep_prefix_8_byte, false},
1179 {-1, libcall, false}}}};
1180 static stringop_algs bdver3_memset[2] = {
1181 {libcall, {{8, loop, false}, {24, unrolled_loop, false},
1182 {2048, rep_prefix_4_byte, false}, {-1, libcall, false}}},
1183 {libcall, {{48, unrolled_loop, false}, {8192, rep_prefix_8_byte, false},
1184 {-1, libcall, false}}}};
1185 struct processor_costs bdver3_cost = {
1186 COSTS_N_INSNS (1), /* cost of an add instruction */
1187 COSTS_N_INSNS (1), /* cost of a lea instruction */
1188 COSTS_N_INSNS (1), /* variable shift costs */
1189 COSTS_N_INSNS (1), /* constant shift costs */
1190 {COSTS_N_INSNS (4), /* cost of starting multiply for QI */
1191 COSTS_N_INSNS (4), /* HI */
1192 COSTS_N_INSNS (4), /* SI */
1193 COSTS_N_INSNS (6), /* DI */
1194 COSTS_N_INSNS (6)}, /* other */
1195 0, /* cost of multiply per each bit set */
1196 {COSTS_N_INSNS (19), /* cost of a divide/mod for QI */
1197 COSTS_N_INSNS (35), /* HI */
1198 COSTS_N_INSNS (51), /* SI */
1199 COSTS_N_INSNS (83), /* DI */
1200 COSTS_N_INSNS (83)}, /* other */
1201 COSTS_N_INSNS (1), /* cost of movsx */
1202 COSTS_N_INSNS (1), /* cost of movzx */
1203 8, /* "large" insn */
1204 9, /* MOVE_RATIO */
1205 4, /* cost for loading QImode using movzbl */
1206 {5, 5, 4}, /* cost of loading integer registers
1207 in QImode, HImode and SImode.
1208 Relative to reg-reg move (2). */
1209 {4, 4, 4}, /* cost of storing integer registers */
1210 2, /* cost of reg,reg fld/fst */
1211 {5, 5, 12}, /* cost of loading fp registers
1212 in SFmode, DFmode and XFmode */
1213 {4, 4, 8}, /* cost of storing fp registers
1214 in SFmode, DFmode and XFmode */
1215 2, /* cost of moving MMX register */
1216 {4, 4}, /* cost of loading MMX registers
1217 in SImode and DImode */
1218 {4, 4}, /* cost of storing MMX registers
1219 in SImode and DImode */
1220 2, /* cost of moving SSE register */
1221 {4, 4, 4}, /* cost of loading SSE registers
1222 in SImode, DImode and TImode */
1223 {4, 4, 4}, /* cost of storing SSE registers
1224 in SImode, DImode and TImode */
1225 2, /* MMX or SSE register to integer */
1226 16, /* size of l1 cache. */
1227 2048, /* size of l2 cache. */
1228 64, /* size of prefetch block */
1229 /* New AMD processors never drop prefetches; if they cannot be performed
1230 immediately, they are queued. We set number of simultaneous prefetches
1231 to a large constant to reflect this (it probably is not a good idea not
1232 to limit number of prefetches at all, as their execution also takes some
1233 time). */
1234 100, /* number of parallel prefetches */
1235 2, /* Branch cost */
1236 COSTS_N_INSNS (6), /* cost of FADD and FSUB insns. */
1237 COSTS_N_INSNS (6), /* cost of FMUL instruction. */
1238 COSTS_N_INSNS (42), /* cost of FDIV instruction. */
1239 COSTS_N_INSNS (2), /* cost of FABS instruction. */
1240 COSTS_N_INSNS (2), /* cost of FCHS instruction. */
1241 COSTS_N_INSNS (52), /* cost of FSQRT instruction. */
1242
1243 bdver3_memcpy,
1244 bdver3_memset,
1245 6, /* scalar_stmt_cost. */
1246 4, /* scalar load_cost. */
1247 4, /* scalar_store_cost. */
1248 6, /* vec_stmt_cost. */
1249 0, /* vec_to_scalar_cost. */
1250 2, /* scalar_to_vec_cost. */
1251 4, /* vec_align_load_cost. */
1252 4, /* vec_unalign_load_cost. */
1253 4, /* vec_store_cost. */
1254 4, /* cond_taken_branch_cost. */
1255 2, /* cond_not_taken_branch_cost. */
1256 };
1257
1258 /* BDVER4 has an optimized REP instruction for medium-sized blocks, but for
1259 very small blocks it is better to use a loop. For large blocks, a libcall
1260 can do non-temporal accesses and beat inline code considerably. */
1261 static stringop_algs bdver4_memcpy[2] = {
1262 {libcall, {{6, loop, false}, {14, unrolled_loop, false},
1263 {-1, rep_prefix_4_byte, false}}},
1264 {libcall, {{16, loop, false}, {8192, rep_prefix_8_byte, false},
1265 {-1, libcall, false}}}};
1266 static stringop_algs bdver4_memset[2] = {
1267 {libcall, {{8, loop, false}, {24, unrolled_loop, false},
1268 {2048, rep_prefix_4_byte, false}, {-1, libcall, false}}},
1269 {libcall, {{48, unrolled_loop, false}, {8192, rep_prefix_8_byte, false},
1270 {-1, libcall, false}}}};
1271 struct processor_costs bdver4_cost = {
1272 COSTS_N_INSNS (1), /* cost of an add instruction */
1273 COSTS_N_INSNS (1), /* cost of a lea instruction */
1274 COSTS_N_INSNS (1), /* variable shift costs */
1275 COSTS_N_INSNS (1), /* constant shift costs */
1276 {COSTS_N_INSNS (4), /* cost of starting multiply for QI */
1277 COSTS_N_INSNS (4), /* HI */
1278 COSTS_N_INSNS (4), /* SI */
1279 COSTS_N_INSNS (6), /* DI */
1280 COSTS_N_INSNS (6)}, /* other */
1281 0, /* cost of multiply per each bit set */
1282 {COSTS_N_INSNS (19), /* cost of a divide/mod for QI */
1283 COSTS_N_INSNS (35), /* HI */
1284 COSTS_N_INSNS (51), /* SI */
1285 COSTS_N_INSNS (83), /* DI */
1286 COSTS_N_INSNS (83)}, /* other */
1287 COSTS_N_INSNS (1), /* cost of movsx */
1288 COSTS_N_INSNS (1), /* cost of movzx */
1289 8, /* "large" insn */
1290 9, /* MOVE_RATIO */
1291 4, /* cost for loading QImode using movzbl */
1292 {5, 5, 4}, /* cost of loading integer registers
1293 in QImode, HImode and SImode.
1294 Relative to reg-reg move (2). */
1295 {4, 4, 4}, /* cost of storing integer registers */
1296 2, /* cost of reg,reg fld/fst */
1297 {5, 5, 12}, /* cost of loading fp registers
1298 in SFmode, DFmode and XFmode */
1299 {4, 4, 8}, /* cost of storing fp registers
1300 in SFmode, DFmode and XFmode */
1301 2, /* cost of moving MMX register */
1302 {4, 4}, /* cost of loading MMX registers
1303 in SImode and DImode */
1304 {4, 4}, /* cost of storing MMX registers
1305 in SImode and DImode */
1306 2, /* cost of moving SSE register */
1307 {4, 4, 4}, /* cost of loading SSE registers
1308 in SImode, DImode and TImode */
1309 {4, 4, 4}, /* cost of storing SSE registers
1310 in SImode, DImode and TImode */
1311 2, /* MMX or SSE register to integer */
1312 16, /* size of l1 cache. */
1313 2048, /* size of l2 cache. */
1314 64, /* size of prefetch block */
1315 /* New AMD processors never drop prefetches; if they cannot be performed
1316 immediately, they are queued. We set number of simultaneous prefetches
1317 to a large constant to reflect this (it probably is not a good idea not
1318 to limit number of prefetches at all, as their execution also takes some
1319 time). */
1320 100, /* number of parallel prefetches */
1321 2, /* Branch cost */
1322 COSTS_N_INSNS (6), /* cost of FADD and FSUB insns. */
1323 COSTS_N_INSNS (6), /* cost of FMUL instruction. */
1324 COSTS_N_INSNS (42), /* cost of FDIV instruction. */
1325 COSTS_N_INSNS (2), /* cost of FABS instruction. */
1326 COSTS_N_INSNS (2), /* cost of FCHS instruction. */
1327 COSTS_N_INSNS (52), /* cost of FSQRT instruction. */
1328
1329 bdver4_memcpy,
1330 bdver4_memset,
1331 6, /* scalar_stmt_cost. */
1332 4, /* scalar load_cost. */
1333 4, /* scalar_store_cost. */
1334 6, /* vec_stmt_cost. */
1335 0, /* vec_to_scalar_cost. */
1336 2, /* scalar_to_vec_cost. */
1337 4, /* vec_align_load_cost. */
1338 4, /* vec_unalign_load_cost. */
1339 4, /* vec_store_cost. */
1340 4, /* cond_taken_branch_cost. */
1341 2, /* cond_not_taken_branch_cost. */
1342 };
1343
1344
1345 /* ZNVER1 has an optimized REP instruction for medium-sized blocks, but for
1346 very small blocks it is better to use a loop. For large blocks, a libcall
1347 can do non-temporal accesses and beat inline code considerably. */
1348 static stringop_algs znver1_memcpy[2] = {
1349 {libcall, {{6, loop, false}, {14, unrolled_loop, false},
1350 {-1, rep_prefix_4_byte, false}}},
1351 {libcall, {{16, loop, false}, {8192, rep_prefix_8_byte, false},
1352 {-1, libcall, false}}}};
1353 static stringop_algs znver1_memset[2] = {
1354 {libcall, {{8, loop, false}, {24, unrolled_loop, false},
1355 {2048, rep_prefix_4_byte, false}, {-1, libcall, false}}},
1356 {libcall, {{48, unrolled_loop, false}, {8192, rep_prefix_8_byte, false},
1357 {-1, libcall, false}}}};
1358 struct processor_costs znver1_cost = {
1359 COSTS_N_INSNS (1), /* cost of an add instruction. */
1360 COSTS_N_INSNS (1), /* cost of a lea instruction. */
1361 COSTS_N_INSNS (1), /* variable shift costs. */
1362 COSTS_N_INSNS (1), /* constant shift costs. */
1363 {COSTS_N_INSNS (3), /* cost of starting multiply for QI. */
1364 COSTS_N_INSNS (3), /* HI. */
1365 COSTS_N_INSNS (3), /* SI. */
1366 COSTS_N_INSNS (4), /* DI. */
1367 COSTS_N_INSNS (4)}, /* other. */
1368 0, /* cost of multiply per each bit
1369 set. */
1370 {COSTS_N_INSNS (19), /* cost of a divide/mod for QI. */
1371 COSTS_N_INSNS (35), /* HI. */
1372 COSTS_N_INSNS (51), /* SI. */
1373 COSTS_N_INSNS (83), /* DI. */
1374 COSTS_N_INSNS (83)}, /* other. */
1375 COSTS_N_INSNS (1), /* cost of movsx. */
1376 COSTS_N_INSNS (1), /* cost of movzx. */
1377 8, /* "large" insn. */
1378 9, /* MOVE_RATIO. */
1379 4, /* cost for loading QImode using
1380 movzbl. */
1381 {5, 5, 4}, /* cost of loading integer registers
1382 in QImode, HImode and SImode.
1383 Relative to reg-reg move (2). */
1384 {4, 4, 4}, /* cost of storing integer
1385 registers. */
1386 2, /* cost of reg,reg fld/fst. */
1387 {5, 5, 12}, /* cost of loading fp registers
1388 in SFmode, DFmode and XFmode. */
1389 {4, 4, 8}, /* cost of storing fp registers
1390 in SFmode, DFmode and XFmode. */
1391 2, /* cost of moving MMX register. */
1392 {4, 4}, /* cost of loading MMX registers
1393 in SImode and DImode. */
1394 {4, 4}, /* cost of storing MMX registers
1395 in SImode and DImode. */
1396 2, /* cost of moving SSE register. */
1397 {4, 4, 4}, /* cost of loading SSE registers
1398 in SImode, DImode and TImode. */
1399 {4, 4, 4}, /* cost of storing SSE registers
1400 in SImode, DImode and TImode. */
1401 2, /* MMX or SSE register to integer. */
1402 32, /* size of l1 cache. */
1403 512, /* size of l2 cache. */
1404 64, /* size of prefetch block. */
1405 /* New AMD processors never drop prefetches; if they cannot be performed
1406 immediately, they are queued. We set number of simultaneous prefetches
1407 to a large constant to reflect this (it probably is not a good idea not
1408 to limit number of prefetches at all, as their execution also takes some
1409 time). */
1410 100, /* number of parallel prefetches. */
1411 2, /* Branch cost. */
1412 COSTS_N_INSNS (6), /* cost of FADD and FSUB insns. */
1413 COSTS_N_INSNS (6), /* cost of FMUL instruction. */
1414 COSTS_N_INSNS (42), /* cost of FDIV instruction. */
1415 COSTS_N_INSNS (2), /* cost of FABS instruction. */
1416 COSTS_N_INSNS (2), /* cost of FCHS instruction. */
1417 COSTS_N_INSNS (52), /* cost of FSQRT instruction. */
1418
1419 znver1_memcpy,
1420 znver1_memset,
1421 6, /* scalar_stmt_cost. */
1422 4, /* scalar load_cost. */
1423 4, /* scalar_store_cost. */
1424 6, /* vec_stmt_cost. */
1425 0, /* vec_to_scalar_cost. */
1426 2, /* scalar_to_vec_cost. */
1427 4, /* vec_align_load_cost. */
1428 4, /* vec_unalign_load_cost. */
1429 4, /* vec_store_cost. */
1430 4, /* cond_taken_branch_cost. */
1431 2, /* cond_not_taken_branch_cost. */
1432 };
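/* Illustration (not part of the original sources): the trailing vectorizer
   entries of these cost tables (scalar_stmt_cost, vec_stmt_cost,
   vec_align_load_cost, ...) feed the target's vectorization cost hook,
   which, roughly sketched, maps statement kinds onto them:

     case scalar_stmt:    return ix86_cost->scalar_stmt_cost;
     case vector_stmt:    return ix86_cost->vec_stmt_cost;
     case vector_load:    return ix86_cost->vec_align_load_cost;
     case unaligned_load: return ix86_cost->vec_unalign_load_cost;

   This is a hedged sketch only; the actual hook appears later in this
   file.  */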
1433
1434 /* BTVER1 has an optimized REP instruction for medium-sized blocks, but for
1435    very small blocks it is better to use a loop.  For large blocks, a libcall
1436    can do nontemporal accesses and beat inlined code considerably.  */
1437 static stringop_algs btver1_memcpy[2] = {
1438 {libcall, {{6, loop, false}, {14, unrolled_loop, false},
1439 {-1, rep_prefix_4_byte, false}}},
1440 {libcall, {{16, loop, false}, {8192, rep_prefix_8_byte, false},
1441 {-1, libcall, false}}}};
1442 static stringop_algs btver1_memset[2] = {
1443 {libcall, {{8, loop, false}, {24, unrolled_loop, false},
1444 {2048, rep_prefix_4_byte, false}, {-1, libcall, false}}},
1445 {libcall, {{48, unrolled_loop, false}, {8192, rep_prefix_8_byte, false},
1446 {-1, libcall, false}}}};
1447 const struct processor_costs btver1_cost = {
1448 COSTS_N_INSNS (1), /* cost of an add instruction */
1449 COSTS_N_INSNS (2), /* cost of a lea instruction */
1450 COSTS_N_INSNS (1), /* variable shift costs */
1451 COSTS_N_INSNS (1), /* constant shift costs */
1452 {COSTS_N_INSNS (3), /* cost of starting multiply for QI */
1453 COSTS_N_INSNS (4), /* HI */
1454 COSTS_N_INSNS (3), /* SI */
1455 COSTS_N_INSNS (4), /* DI */
1456 COSTS_N_INSNS (5)}, /* other */
1457 0, /* cost of multiply per each bit set */
1458 {COSTS_N_INSNS (19), /* cost of a divide/mod for QI */
1459 COSTS_N_INSNS (35), /* HI */
1460 COSTS_N_INSNS (51), /* SI */
1461 COSTS_N_INSNS (83), /* DI */
1462 COSTS_N_INSNS (83)}, /* other */
1463 COSTS_N_INSNS (1), /* cost of movsx */
1464 COSTS_N_INSNS (1), /* cost of movzx */
1465 8, /* "large" insn */
1466 9, /* MOVE_RATIO */
1467 4, /* cost for loading QImode using movzbl */
1468 {3, 4, 3}, /* cost of loading integer registers
1469 in QImode, HImode and SImode.
1470 Relative to reg-reg move (2). */
1471 {3, 4, 3}, /* cost of storing integer registers */
1472 4, /* cost of reg,reg fld/fst */
1473 {4, 4, 12}, /* cost of loading fp registers
1474 in SFmode, DFmode and XFmode */
1475 {6, 6, 8}, /* cost of storing fp registers
1476 in SFmode, DFmode and XFmode */
1477 2, /* cost of moving MMX register */
1478 {3, 3}, /* cost of loading MMX registers
1479 in SImode and DImode */
1480 {4, 4}, /* cost of storing MMX registers
1481 in SImode and DImode */
1482 2, /* cost of moving SSE register */
1483 {4, 4, 3}, /* cost of loading SSE registers
1484 in SImode, DImode and TImode */
1485 {4, 4, 5}, /* cost of storing SSE registers
1486 in SImode, DImode and TImode */
1487 3, /* MMX or SSE register to integer */
1488 /* On K8:
1489 MOVD reg64, xmmreg Double FSTORE 4
1490 MOVD reg32, xmmreg Double FSTORE 4
1491 On AMDFAM10:
1492 MOVD reg64, xmmreg Double FADD 3
1493 1/1 1/1
1494 MOVD reg32, xmmreg Double FADD 3
1495 1/1 1/1 */
1496 32, /* size of l1 cache. */
1497 512, /* size of l2 cache. */
1498 64, /* size of prefetch block */
1499 100, /* number of parallel prefetches */
1500 2, /* Branch cost */
1501 COSTS_N_INSNS (4), /* cost of FADD and FSUB insns. */
1502 COSTS_N_INSNS (4), /* cost of FMUL instruction. */
1503 COSTS_N_INSNS (19), /* cost of FDIV instruction. */
1504 COSTS_N_INSNS (2), /* cost of FABS instruction. */
1505 COSTS_N_INSNS (2), /* cost of FCHS instruction. */
1506 COSTS_N_INSNS (35), /* cost of FSQRT instruction. */
1507
1508 btver1_memcpy,
1509 btver1_memset,
1510 4, /* scalar_stmt_cost. */
1511 2, /* scalar load_cost. */
1512 2, /* scalar_store_cost. */
1513 6, /* vec_stmt_cost. */
1514 0, /* vec_to_scalar_cost. */
1515 2, /* scalar_to_vec_cost. */
1516 2, /* vec_align_load_cost. */
1517 2, /* vec_unalign_load_cost. */
1518 2, /* vec_store_cost. */
1519 2, /* cond_taken_branch_cost. */
1520 1, /* cond_not_taken_branch_cost. */
1521 };
1522
1523 static stringop_algs btver2_memcpy[2] = {
1524 {libcall, {{6, loop, false}, {14, unrolled_loop, false},
1525 {-1, rep_prefix_4_byte, false}}},
1526 {libcall, {{16, loop, false}, {8192, rep_prefix_8_byte, false},
1527 {-1, libcall, false}}}};
1528 static stringop_algs btver2_memset[2] = {
1529 {libcall, {{8, loop, false}, {24, unrolled_loop, false},
1530 {2048, rep_prefix_4_byte, false}, {-1, libcall, false}}},
1531 {libcall, {{48, unrolled_loop, false}, {8192, rep_prefix_8_byte, false},
1532 {-1, libcall, false}}}};
1533 const struct processor_costs btver2_cost = {
1534 COSTS_N_INSNS (1), /* cost of an add instruction */
1535 COSTS_N_INSNS (2), /* cost of a lea instruction */
1536 COSTS_N_INSNS (1), /* variable shift costs */
1537 COSTS_N_INSNS (1), /* constant shift costs */
1538 {COSTS_N_INSNS (3), /* cost of starting multiply for QI */
1539 COSTS_N_INSNS (4), /* HI */
1540 COSTS_N_INSNS (3), /* SI */
1541 COSTS_N_INSNS (4), /* DI */
1542 COSTS_N_INSNS (5)}, /* other */
1543 0, /* cost of multiply per each bit set */
1544 {COSTS_N_INSNS (19), /* cost of a divide/mod for QI */
1545 COSTS_N_INSNS (35), /* HI */
1546 COSTS_N_INSNS (51), /* SI */
1547 COSTS_N_INSNS (83), /* DI */
1548 COSTS_N_INSNS (83)}, /* other */
1549 COSTS_N_INSNS (1), /* cost of movsx */
1550 COSTS_N_INSNS (1), /* cost of movzx */
1551 8, /* "large" insn */
1552 9, /* MOVE_RATIO */
1553 4, /* cost for loading QImode using movzbl */
1554 {3, 4, 3}, /* cost of loading integer registers
1555 in QImode, HImode and SImode.
1556 Relative to reg-reg move (2). */
1557 {3, 4, 3}, /* cost of storing integer registers */
1558 4, /* cost of reg,reg fld/fst */
1559 {4, 4, 12}, /* cost of loading fp registers
1560 in SFmode, DFmode and XFmode */
1561 {6, 6, 8}, /* cost of storing fp registers
1562 in SFmode, DFmode and XFmode */
1563 2, /* cost of moving MMX register */
1564 {3, 3}, /* cost of loading MMX registers
1565 in SImode and DImode */
1566 {4, 4}, /* cost of storing MMX registers
1567 in SImode and DImode */
1568 2, /* cost of moving SSE register */
1569 {4, 4, 3}, /* cost of loading SSE registers
1570 in SImode, DImode and TImode */
1571 {4, 4, 5}, /* cost of storing SSE registers
1572 in SImode, DImode and TImode */
1573 3, /* MMX or SSE register to integer */
1574 /* On K8:
1575 MOVD reg64, xmmreg Double FSTORE 4
1576 MOVD reg32, xmmreg Double FSTORE 4
1577 On AMDFAM10:
1578 MOVD reg64, xmmreg Double FADD 3
1579 1/1 1/1
1580 MOVD reg32, xmmreg Double FADD 3
1581 1/1 1/1 */
1582 32, /* size of l1 cache. */
1583 2048, /* size of l2 cache. */
1584 64, /* size of prefetch block */
1585 100, /* number of parallel prefetches */
1586 2, /* Branch cost */
1587 COSTS_N_INSNS (4), /* cost of FADD and FSUB insns. */
1588 COSTS_N_INSNS (4), /* cost of FMUL instruction. */
1589 COSTS_N_INSNS (19), /* cost of FDIV instruction. */
1590 COSTS_N_INSNS (2), /* cost of FABS instruction. */
1591 COSTS_N_INSNS (2), /* cost of FCHS instruction. */
1592 COSTS_N_INSNS (35), /* cost of FSQRT instruction. */
1593 btver2_memcpy,
1594 btver2_memset,
1595 4, /* scalar_stmt_cost. */
1596 2, /* scalar load_cost. */
1597 2, /* scalar_store_cost. */
1598 6, /* vec_stmt_cost. */
1599 0, /* vec_to_scalar_cost. */
1600 2, /* scalar_to_vec_cost. */
1601 2, /* vec_align_load_cost. */
1602 2, /* vec_unalign_load_cost. */
1603 2, /* vec_store_cost. */
1604 2, /* cond_taken_branch_cost. */
1605 1, /* cond_not_taken_branch_cost. */
1606 };
1607
1608 static stringop_algs pentium4_memcpy[2] = {
1609 {libcall, {{12, loop_1_byte, false}, {-1, rep_prefix_4_byte, false}}},
1610 DUMMY_STRINGOP_ALGS};
1611 static stringop_algs pentium4_memset[2] = {
1612 {libcall, {{6, loop_1_byte, false}, {48, loop, false},
1613 {20480, rep_prefix_4_byte, false}, {-1, libcall, false}}},
1614 DUMMY_STRINGOP_ALGS};
1615
1616 static const
1617 struct processor_costs pentium4_cost = {
1618 COSTS_N_INSNS (1), /* cost of an add instruction */
1619 COSTS_N_INSNS (3), /* cost of a lea instruction */
1620 COSTS_N_INSNS (4), /* variable shift costs */
1621 COSTS_N_INSNS (4), /* constant shift costs */
1622 {COSTS_N_INSNS (15), /* cost of starting multiply for QI */
1623 COSTS_N_INSNS (15), /* HI */
1624 COSTS_N_INSNS (15), /* SI */
1625 COSTS_N_INSNS (15), /* DI */
1626 COSTS_N_INSNS (15)}, /* other */
1627 0, /* cost of multiply per each bit set */
1628 {COSTS_N_INSNS (56), /* cost of a divide/mod for QI */
1629 COSTS_N_INSNS (56), /* HI */
1630 COSTS_N_INSNS (56), /* SI */
1631 COSTS_N_INSNS (56), /* DI */
1632 COSTS_N_INSNS (56)}, /* other */
1633 COSTS_N_INSNS (1), /* cost of movsx */
1634 COSTS_N_INSNS (1), /* cost of movzx */
1635 16, /* "large" insn */
1636 6, /* MOVE_RATIO */
1637 2, /* cost for loading QImode using movzbl */
1638 {4, 5, 4}, /* cost of loading integer registers
1639 in QImode, HImode and SImode.
1640 Relative to reg-reg move (2). */
1641 {2, 3, 2}, /* cost of storing integer registers */
1642 2, /* cost of reg,reg fld/fst */
1643 {2, 2, 6}, /* cost of loading fp registers
1644 in SFmode, DFmode and XFmode */
1645 {4, 4, 6}, /* cost of storing fp registers
1646 in SFmode, DFmode and XFmode */
1647 2, /* cost of moving MMX register */
1648 {2, 2}, /* cost of loading MMX registers
1649 in SImode and DImode */
1650 {2, 2}, /* cost of storing MMX registers
1651 in SImode and DImode */
1652 12, /* cost of moving SSE register */
1653 {12, 12, 12}, /* cost of loading SSE registers
1654 in SImode, DImode and TImode */
1655 {2, 2, 8}, /* cost of storing SSE registers
1656 in SImode, DImode and TImode */
1657 10, /* MMX or SSE register to integer */
1658 8, /* size of l1 cache. */
1659 256, /* size of l2 cache. */
1660 64, /* size of prefetch block */
1661 6, /* number of parallel prefetches */
1662 2, /* Branch cost */
1663 COSTS_N_INSNS (5), /* cost of FADD and FSUB insns. */
1664 COSTS_N_INSNS (7), /* cost of FMUL instruction. */
1665 COSTS_N_INSNS (43), /* cost of FDIV instruction. */
1666 COSTS_N_INSNS (2), /* cost of FABS instruction. */
1667 COSTS_N_INSNS (2), /* cost of FCHS instruction. */
1668 COSTS_N_INSNS (43), /* cost of FSQRT instruction. */
1669 pentium4_memcpy,
1670 pentium4_memset,
1671 1, /* scalar_stmt_cost. */
1672 1, /* scalar load_cost. */
1673 1, /* scalar_store_cost. */
1674 1, /* vec_stmt_cost. */
1675 1, /* vec_to_scalar_cost. */
1676 1, /* scalar_to_vec_cost. */
1677 1, /* vec_align_load_cost. */
1678 2, /* vec_unalign_load_cost. */
1679 1, /* vec_store_cost. */
1680 3, /* cond_taken_branch_cost. */
1681 1, /* cond_not_taken_branch_cost. */
1682 };
1683
1684 static stringop_algs nocona_memcpy[2] = {
1685 {libcall, {{12, loop_1_byte, false}, {-1, rep_prefix_4_byte, false}}},
1686 {libcall, {{32, loop, false}, {20000, rep_prefix_8_byte, false},
1687 {100000, unrolled_loop, false}, {-1, libcall, false}}}};
1688
1689 static stringop_algs nocona_memset[2] = {
1690 {libcall, {{6, loop_1_byte, false}, {48, loop, false},
1691 {20480, rep_prefix_4_byte, false}, {-1, libcall, false}}},
1692 {libcall, {{24, loop, false}, {64, unrolled_loop, false},
1693 {8192, rep_prefix_8_byte, false}, {-1, libcall, false}}}};
1694
1695 static const
1696 struct processor_costs nocona_cost = {
1697 COSTS_N_INSNS (1), /* cost of an add instruction */
1698 COSTS_N_INSNS (1), /* cost of a lea instruction */
1699 COSTS_N_INSNS (1), /* variable shift costs */
1700 COSTS_N_INSNS (1), /* constant shift costs */
1701 {COSTS_N_INSNS (10), /* cost of starting multiply for QI */
1702 COSTS_N_INSNS (10), /* HI */
1703 COSTS_N_INSNS (10), /* SI */
1704 COSTS_N_INSNS (10), /* DI */
1705 COSTS_N_INSNS (10)}, /* other */
1706 0, /* cost of multiply per each bit set */
1707 {COSTS_N_INSNS (66), /* cost of a divide/mod for QI */
1708 COSTS_N_INSNS (66), /* HI */
1709 COSTS_N_INSNS (66), /* SI */
1710 COSTS_N_INSNS (66), /* DI */
1711 COSTS_N_INSNS (66)}, /* other */
1712 COSTS_N_INSNS (1), /* cost of movsx */
1713 COSTS_N_INSNS (1), /* cost of movzx */
1714 16, /* "large" insn */
1715 17, /* MOVE_RATIO */
1716 4, /* cost for loading QImode using movzbl */
1717 {4, 4, 4}, /* cost of loading integer registers
1718 in QImode, HImode and SImode.
1719 Relative to reg-reg move (2). */
1720 {4, 4, 4}, /* cost of storing integer registers */
1721 3, /* cost of reg,reg fld/fst */
1722 {12, 12, 12}, /* cost of loading fp registers
1723 in SFmode, DFmode and XFmode */
1724 {4, 4, 4}, /* cost of storing fp registers
1725 in SFmode, DFmode and XFmode */
1726 6, /* cost of moving MMX register */
1727 {12, 12}, /* cost of loading MMX registers
1728 in SImode and DImode */
1729 {12, 12}, /* cost of storing MMX registers
1730 in SImode and DImode */
1731 6, /* cost of moving SSE register */
1732 {12, 12, 12}, /* cost of loading SSE registers
1733 in SImode, DImode and TImode */
1734 {12, 12, 12}, /* cost of storing SSE registers
1735 in SImode, DImode and TImode */
1736 8, /* MMX or SSE register to integer */
1737 8, /* size of l1 cache. */
1738 1024, /* size of l2 cache. */
1739 64, /* size of prefetch block */
1740 8, /* number of parallel prefetches */
1741 1, /* Branch cost */
1742 COSTS_N_INSNS (6), /* cost of FADD and FSUB insns. */
1743 COSTS_N_INSNS (8), /* cost of FMUL instruction. */
1744 COSTS_N_INSNS (40), /* cost of FDIV instruction. */
1745 COSTS_N_INSNS (3), /* cost of FABS instruction. */
1746 COSTS_N_INSNS (3), /* cost of FCHS instruction. */
1747 COSTS_N_INSNS (44), /* cost of FSQRT instruction. */
1748 nocona_memcpy,
1749 nocona_memset,
1750 1, /* scalar_stmt_cost. */
1751 1, /* scalar load_cost. */
1752 1, /* scalar_store_cost. */
1753 1, /* vec_stmt_cost. */
1754 1, /* vec_to_scalar_cost. */
1755 1, /* scalar_to_vec_cost. */
1756 1, /* vec_align_load_cost. */
1757 2, /* vec_unalign_load_cost. */
1758 1, /* vec_store_cost. */
1759 3, /* cond_taken_branch_cost. */
1760 1, /* cond_not_taken_branch_cost. */
1761 };
1762
1763 static stringop_algs atom_memcpy[2] = {
1764 {libcall, {{11, loop, false}, {-1, rep_prefix_4_byte, false}}},
1765 {libcall, {{32, loop, false}, {64, rep_prefix_4_byte, false},
1766 {8192, rep_prefix_8_byte, false}, {-1, libcall, false}}}};
1767 static stringop_algs atom_memset[2] = {
1768 {libcall, {{8, loop, false}, {15, unrolled_loop, false},
1769 {2048, rep_prefix_4_byte, false}, {-1, libcall, false}}},
1770 {libcall, {{24, loop, false}, {32, unrolled_loop, false},
1771 {8192, rep_prefix_8_byte, false}, {-1, libcall, false}}}};
1772 static const
1773 struct processor_costs atom_cost = {
1774 COSTS_N_INSNS (1), /* cost of an add instruction */
1775 COSTS_N_INSNS (1) + 1, /* cost of a lea instruction */
1776 COSTS_N_INSNS (1), /* variable shift costs */
1777 COSTS_N_INSNS (1), /* constant shift costs */
1778 {COSTS_N_INSNS (3), /* cost of starting multiply for QI */
1779 COSTS_N_INSNS (4), /* HI */
1780 COSTS_N_INSNS (3), /* SI */
1781 COSTS_N_INSNS (4), /* DI */
1782 COSTS_N_INSNS (2)}, /* other */
1783 0, /* cost of multiply per each bit set */
1784 {COSTS_N_INSNS (18), /* cost of a divide/mod for QI */
1785 COSTS_N_INSNS (26), /* HI */
1786 COSTS_N_INSNS (42), /* SI */
1787 COSTS_N_INSNS (74), /* DI */
1788 COSTS_N_INSNS (74)}, /* other */
1789 COSTS_N_INSNS (1), /* cost of movsx */
1790 COSTS_N_INSNS (1), /* cost of movzx */
1791 8, /* "large" insn */
1792 17, /* MOVE_RATIO */
1793 4, /* cost for loading QImode using movzbl */
1794 {4, 4, 4}, /* cost of loading integer registers
1795 in QImode, HImode and SImode.
1796 Relative to reg-reg move (2). */
1797 {4, 4, 4}, /* cost of storing integer registers */
1798 4, /* cost of reg,reg fld/fst */
1799 {12, 12, 12}, /* cost of loading fp registers
1800 in SFmode, DFmode and XFmode */
1801 {6, 6, 8}, /* cost of storing fp registers
1802 in SFmode, DFmode and XFmode */
1803 2, /* cost of moving MMX register */
1804 {8, 8}, /* cost of loading MMX registers
1805 in SImode and DImode */
1806 {8, 8}, /* cost of storing MMX registers
1807 in SImode and DImode */
1808 2, /* cost of moving SSE register */
1809 {8, 8, 8}, /* cost of loading SSE registers
1810 in SImode, DImode and TImode */
1811 {8, 8, 8}, /* cost of storing SSE registers
1812 in SImode, DImode and TImode */
1813 5, /* MMX or SSE register to integer */
1814 32, /* size of l1 cache. */
1815 256, /* size of l2 cache. */
1816 64, /* size of prefetch block */
1817 6, /* number of parallel prefetches */
1818 3, /* Branch cost */
1819 COSTS_N_INSNS (8), /* cost of FADD and FSUB insns. */
1820 COSTS_N_INSNS (8), /* cost of FMUL instruction. */
1821 COSTS_N_INSNS (20), /* cost of FDIV instruction. */
1822 COSTS_N_INSNS (8), /* cost of FABS instruction. */
1823 COSTS_N_INSNS (8), /* cost of FCHS instruction. */
1824 COSTS_N_INSNS (40), /* cost of FSQRT instruction. */
1825 atom_memcpy,
1826 atom_memset,
1827 1, /* scalar_stmt_cost. */
1828 1, /* scalar load_cost. */
1829 1, /* scalar_store_cost. */
1830 1, /* vec_stmt_cost. */
1831 1, /* vec_to_scalar_cost. */
1832 1, /* scalar_to_vec_cost. */
1833 1, /* vec_align_load_cost. */
1834 2, /* vec_unalign_load_cost. */
1835 1, /* vec_store_cost. */
1836 3, /* cond_taken_branch_cost. */
1837 1, /* cond_not_taken_branch_cost. */
1838 };
1839
1840 static stringop_algs slm_memcpy[2] = {
1841 {libcall, {{11, loop, false}, {-1, rep_prefix_4_byte, false}}},
1842 {libcall, {{32, loop, false}, {64, rep_prefix_4_byte, false},
1843 {8192, rep_prefix_8_byte, false}, {-1, libcall, false}}}};
1844 static stringop_algs slm_memset[2] = {
1845 {libcall, {{8, loop, false}, {15, unrolled_loop, false},
1846 {2048, rep_prefix_4_byte, false}, {-1, libcall, false}}},
1847 {libcall, {{24, loop, false}, {32, unrolled_loop, false},
1848 {8192, rep_prefix_8_byte, false}, {-1, libcall, false}}}};
1849 static const
1850 struct processor_costs slm_cost = {
1851 COSTS_N_INSNS (1), /* cost of an add instruction */
1852 COSTS_N_INSNS (1) + 1, /* cost of a lea instruction */
1853 COSTS_N_INSNS (1), /* variable shift costs */
1854 COSTS_N_INSNS (1), /* constant shift costs */
1855 {COSTS_N_INSNS (3), /* cost of starting multiply for QI */
1856 COSTS_N_INSNS (3), /* HI */
1857 COSTS_N_INSNS (3), /* SI */
1858 COSTS_N_INSNS (4), /* DI */
1859 COSTS_N_INSNS (2)}, /* other */
1860 0, /* cost of multiply per each bit set */
1861 {COSTS_N_INSNS (18), /* cost of a divide/mod for QI */
1862 COSTS_N_INSNS (26), /* HI */
1863 COSTS_N_INSNS (42), /* SI */
1864 COSTS_N_INSNS (74), /* DI */
1865 COSTS_N_INSNS (74)}, /* other */
1866 COSTS_N_INSNS (1), /* cost of movsx */
1867 COSTS_N_INSNS (1), /* cost of movzx */
1868 8, /* "large" insn */
1869 17, /* MOVE_RATIO */
1870 4, /* cost for loading QImode using movzbl */
1871 {4, 4, 4}, /* cost of loading integer registers
1872 in QImode, HImode and SImode.
1873 Relative to reg-reg move (2). */
1874 {4, 4, 4}, /* cost of storing integer registers */
1875 4, /* cost of reg,reg fld/fst */
1876 {12, 12, 12}, /* cost of loading fp registers
1877 in SFmode, DFmode and XFmode */
1878 {6, 6, 8}, /* cost of storing fp registers
1879 in SFmode, DFmode and XFmode */
1880 2, /* cost of moving MMX register */
1881 {8, 8}, /* cost of loading MMX registers
1882 in SImode and DImode */
1883 {8, 8}, /* cost of storing MMX registers
1884 in SImode and DImode */
1885 2, /* cost of moving SSE register */
1886 {8, 8, 8}, /* cost of loading SSE registers
1887 in SImode, DImode and TImode */
1888 {8, 8, 8}, /* cost of storing SSE registers
1889 in SImode, DImode and TImode */
1890 5, /* MMX or SSE register to integer */
1891 32, /* size of l1 cache. */
1892 256, /* size of l2 cache. */
1893 64, /* size of prefetch block */
1894 6, /* number of parallel prefetches */
1895 3, /* Branch cost */
1896 COSTS_N_INSNS (8), /* cost of FADD and FSUB insns. */
1897 COSTS_N_INSNS (8), /* cost of FMUL instruction. */
1898 COSTS_N_INSNS (20), /* cost of FDIV instruction. */
1899 COSTS_N_INSNS (8), /* cost of FABS instruction. */
1900 COSTS_N_INSNS (8), /* cost of FCHS instruction. */
1901 COSTS_N_INSNS (40), /* cost of FSQRT instruction. */
1902 slm_memcpy,
1903 slm_memset,
1904 1, /* scalar_stmt_cost. */
1905 1, /* scalar load_cost. */
1906 1, /* scalar_store_cost. */
1907 1, /* vec_stmt_cost. */
1908 4, /* vec_to_scalar_cost. */
1909 1, /* scalar_to_vec_cost. */
1910 1, /* vec_align_load_cost. */
1911 2, /* vec_unalign_load_cost. */
1912 1, /* vec_store_cost. */
1913 3, /* cond_taken_branch_cost. */
1914 1, /* cond_not_taken_branch_cost. */
1915 };
1916
1917 static stringop_algs intel_memcpy[2] = {
1918 {libcall, {{11, loop, false}, {-1, rep_prefix_4_byte, false}}},
1919 {libcall, {{32, loop, false}, {64, rep_prefix_4_byte, false},
1920 {8192, rep_prefix_8_byte, false}, {-1, libcall, false}}}};
1921 static stringop_algs intel_memset[2] = {
1922 {libcall, {{8, loop, false}, {15, unrolled_loop, false},
1923 {2048, rep_prefix_4_byte, false}, {-1, libcall, false}}},
1924 {libcall, {{24, loop, false}, {32, unrolled_loop, false},
1925 {8192, rep_prefix_8_byte, false}, {-1, libcall, false}}}};
1926 static const
1927 struct processor_costs intel_cost = {
1928 COSTS_N_INSNS (1), /* cost of an add instruction */
1929 COSTS_N_INSNS (1) + 1, /* cost of a lea instruction */
1930 COSTS_N_INSNS (1), /* variable shift costs */
1931 COSTS_N_INSNS (1), /* constant shift costs */
1932 {COSTS_N_INSNS (3), /* cost of starting multiply for QI */
1933 COSTS_N_INSNS (3), /* HI */
1934 COSTS_N_INSNS (3), /* SI */
1935 COSTS_N_INSNS (4), /* DI */
1936 COSTS_N_INSNS (2)}, /* other */
1937 0, /* cost of multiply per each bit set */
1938 {COSTS_N_INSNS (18), /* cost of a divide/mod for QI */
1939 COSTS_N_INSNS (26), /* HI */
1940 COSTS_N_INSNS (42), /* SI */
1941 COSTS_N_INSNS (74), /* DI */
1942 COSTS_N_INSNS (74)}, /* other */
1943 COSTS_N_INSNS (1), /* cost of movsx */
1944 COSTS_N_INSNS (1), /* cost of movzx */
1945 8, /* "large" insn */
1946 17, /* MOVE_RATIO */
1947 4, /* cost for loading QImode using movzbl */
1948 {4, 4, 4}, /* cost of loading integer registers
1949 in QImode, HImode and SImode.
1950 Relative to reg-reg move (2). */
1951 {4, 4, 4}, /* cost of storing integer registers */
1952 4, /* cost of reg,reg fld/fst */
1953 {12, 12, 12}, /* cost of loading fp registers
1954 in SFmode, DFmode and XFmode */
1955 {6, 6, 8}, /* cost of storing fp registers
1956 in SFmode, DFmode and XFmode */
1957 2, /* cost of moving MMX register */
1958 {8, 8}, /* cost of loading MMX registers
1959 in SImode and DImode */
1960 {8, 8}, /* cost of storing MMX registers
1961 in SImode and DImode */
1962 2, /* cost of moving SSE register */
1963 {8, 8, 8}, /* cost of loading SSE registers
1964 in SImode, DImode and TImode */
1965 {8, 8, 8}, /* cost of storing SSE registers
1966 in SImode, DImode and TImode */
1967 5, /* MMX or SSE register to integer */
1968 32, /* size of l1 cache. */
1969 256, /* size of l2 cache. */
1970 64, /* size of prefetch block */
1971 6, /* number of parallel prefetches */
1972 3, /* Branch cost */
1973 COSTS_N_INSNS (8), /* cost of FADD and FSUB insns. */
1974 COSTS_N_INSNS (8), /* cost of FMUL instruction. */
1975 COSTS_N_INSNS (20), /* cost of FDIV instruction. */
1976 COSTS_N_INSNS (8), /* cost of FABS instruction. */
1977 COSTS_N_INSNS (8), /* cost of FCHS instruction. */
1978 COSTS_N_INSNS (40), /* cost of FSQRT instruction. */
1979 intel_memcpy,
1980 intel_memset,
1981 1, /* scalar_stmt_cost. */
1982 1, /* scalar load_cost. */
1983 1, /* scalar_store_cost. */
1984 1, /* vec_stmt_cost. */
1985 4, /* vec_to_scalar_cost. */
1986 1, /* scalar_to_vec_cost. */
1987 1, /* vec_align_load_cost. */
1988 2, /* vec_unalign_load_cost. */
1989 1, /* vec_store_cost. */
1990 3, /* cond_taken_branch_cost. */
1991 1, /* cond_not_taken_branch_cost. */
1992 };
1993
1994 /* Generic should produce code tuned for Core i7 (and newer) chips
1995    and btver1 (and newer) chips.  */
1996
1997 static stringop_algs generic_memcpy[2] = {
1998 {libcall, {{32, loop, false}, {8192, rep_prefix_4_byte, false},
1999 {-1, libcall, false}}},
2000 {libcall, {{32, loop, false}, {8192, rep_prefix_8_byte, false},
2001 {-1, libcall, false}}}};
2002 static stringop_algs generic_memset[2] = {
2003 {libcall, {{32, loop, false}, {8192, rep_prefix_4_byte, false},
2004 {-1, libcall, false}}},
2005 {libcall, {{32, loop, false}, {8192, rep_prefix_8_byte, false},
2006 {-1, libcall, false}}}};
2007 static const
2008 struct processor_costs generic_cost = {
2009 COSTS_N_INSNS (1), /* cost of an add instruction */
2010   /* On all chips taken into consideration, lea takes 2 cycles or more.  With
2011      that cost, however, our current implementation of synth_mult results in
2012      the use of unnecessary temporary registers, causing regressions on several
2013      SPECfp benchmarks.  */
2014 COSTS_N_INSNS (1) + 1, /* cost of a lea instruction */
2015 COSTS_N_INSNS (1), /* variable shift costs */
2016 COSTS_N_INSNS (1), /* constant shift costs */
2017 {COSTS_N_INSNS (3), /* cost of starting multiply for QI */
2018 COSTS_N_INSNS (4), /* HI */
2019 COSTS_N_INSNS (3), /* SI */
2020 COSTS_N_INSNS (4), /* DI */
2021 COSTS_N_INSNS (2)}, /* other */
2022 0, /* cost of multiply per each bit set */
2023 {COSTS_N_INSNS (18), /* cost of a divide/mod for QI */
2024 COSTS_N_INSNS (26), /* HI */
2025 COSTS_N_INSNS (42), /* SI */
2026 COSTS_N_INSNS (74), /* DI */
2027 COSTS_N_INSNS (74)}, /* other */
2028 COSTS_N_INSNS (1), /* cost of movsx */
2029 COSTS_N_INSNS (1), /* cost of movzx */
2030 8, /* "large" insn */
2031 17, /* MOVE_RATIO */
2032 4, /* cost for loading QImode using movzbl */
2033 {4, 4, 4}, /* cost of loading integer registers
2034 in QImode, HImode and SImode.
2035 Relative to reg-reg move (2). */
2036 {4, 4, 4}, /* cost of storing integer registers */
2037 4, /* cost of reg,reg fld/fst */
2038 {12, 12, 12}, /* cost of loading fp registers
2039 in SFmode, DFmode and XFmode */
2040 {6, 6, 8}, /* cost of storing fp registers
2041 in SFmode, DFmode and XFmode */
2042 2, /* cost of moving MMX register */
2043 {8, 8}, /* cost of loading MMX registers
2044 in SImode and DImode */
2045 {8, 8}, /* cost of storing MMX registers
2046 in SImode and DImode */
2047 2, /* cost of moving SSE register */
2048 {8, 8, 8}, /* cost of loading SSE registers
2049 in SImode, DImode and TImode */
2050 {8, 8, 8}, /* cost of storing SSE registers
2051 in SImode, DImode and TImode */
2052 5, /* MMX or SSE register to integer */
2053 32, /* size of l1 cache. */
2054 512, /* size of l2 cache. */
2055 64, /* size of prefetch block */
2056 6, /* number of parallel prefetches */
2057   /* Benchmarks show large regressions on the K8 sixtrack benchmark when this
2058      value is increased to the perhaps more appropriate value of 5.  */
2059 3, /* Branch cost */
2060 COSTS_N_INSNS (8), /* cost of FADD and FSUB insns. */
2061 COSTS_N_INSNS (8), /* cost of FMUL instruction. */
2062 COSTS_N_INSNS (20), /* cost of FDIV instruction. */
2063 COSTS_N_INSNS (8), /* cost of FABS instruction. */
2064 COSTS_N_INSNS (8), /* cost of FCHS instruction. */
2065 COSTS_N_INSNS (40), /* cost of FSQRT instruction. */
2066 generic_memcpy,
2067 generic_memset,
2068 1, /* scalar_stmt_cost. */
2069 1, /* scalar load_cost. */
2070 1, /* scalar_store_cost. */
2071 1, /* vec_stmt_cost. */
2072 1, /* vec_to_scalar_cost. */
2073 1, /* scalar_to_vec_cost. */
2074 1, /* vec_align_load_cost. */
2075 2, /* vec_unalign_load_cost. */
2076 1, /* vec_store_cost. */
2077 3, /* cond_taken_branch_cost. */
2078 1, /* cond_not_taken_branch_cost. */
2079 };
2080
2081 /* core_cost should produce code tuned for the Core family of CPUs.  */
2082 static stringop_algs core_memcpy[2] = {
2083 {libcall, {{1024, rep_prefix_4_byte, true}, {-1, libcall, false}}},
2084 {libcall, {{24, loop, true}, {128, rep_prefix_8_byte, true},
2085 {-1, libcall, false}}}};
2086 static stringop_algs core_memset[2] = {
2087 {libcall, {{6, loop_1_byte, true},
2088 {24, loop, true},
2089 {8192, rep_prefix_4_byte, true},
2090 {-1, libcall, false}}},
2091 {libcall, {{24, loop, true}, {512, rep_prefix_8_byte, true},
2092 {-1, libcall, false}}}};
2093
2094 static const
2095 struct processor_costs core_cost = {
2096 COSTS_N_INSNS (1), /* cost of an add instruction */
2097   /* On all chips taken into consideration, lea takes 2 cycles or more.  With
2098      that cost, however, our current implementation of synth_mult results in
2099      the use of unnecessary temporary registers, causing regressions on several
2100      SPECfp benchmarks.  */
2101 COSTS_N_INSNS (1) + 1, /* cost of a lea instruction */
2102 COSTS_N_INSNS (1), /* variable shift costs */
2103 COSTS_N_INSNS (1), /* constant shift costs */
2104 {COSTS_N_INSNS (3), /* cost of starting multiply for QI */
2105 COSTS_N_INSNS (4), /* HI */
2106 COSTS_N_INSNS (3), /* SI */
2107 COSTS_N_INSNS (4), /* DI */
2108 COSTS_N_INSNS (2)}, /* other */
2109 0, /* cost of multiply per each bit set */
2110 {COSTS_N_INSNS (18), /* cost of a divide/mod for QI */
2111 COSTS_N_INSNS (26), /* HI */
2112 COSTS_N_INSNS (42), /* SI */
2113 COSTS_N_INSNS (74), /* DI */
2114 COSTS_N_INSNS (74)}, /* other */
2115 COSTS_N_INSNS (1), /* cost of movsx */
2116 COSTS_N_INSNS (1), /* cost of movzx */
2117 8, /* "large" insn */
2118 17, /* MOVE_RATIO */
2119 4, /* cost for loading QImode using movzbl */
2120 {4, 4, 4}, /* cost of loading integer registers
2121 in QImode, HImode and SImode.
2122 Relative to reg-reg move (2). */
2123 {4, 4, 4}, /* cost of storing integer registers */
2124 4, /* cost of reg,reg fld/fst */
2125 {12, 12, 12}, /* cost of loading fp registers
2126 in SFmode, DFmode and XFmode */
2127 {6, 6, 8}, /* cost of storing fp registers
2128 in SFmode, DFmode and XFmode */
2129 2, /* cost of moving MMX register */
2130 {8, 8}, /* cost of loading MMX registers
2131 in SImode and DImode */
2132 {8, 8}, /* cost of storing MMX registers
2133 in SImode and DImode */
2134 2, /* cost of moving SSE register */
2135 {8, 8, 8}, /* cost of loading SSE registers
2136 in SImode, DImode and TImode */
2137 {8, 8, 8}, /* cost of storing SSE registers
2138 in SImode, DImode and TImode */
2139 5, /* MMX or SSE register to integer */
2140 64, /* size of l1 cache. */
2141 512, /* size of l2 cache. */
2142 64, /* size of prefetch block */
2143 6, /* number of parallel prefetches */
2144   /* FIXME: perhaps a more appropriate value is 5.  */
2145 3, /* Branch cost */
2146 COSTS_N_INSNS (8), /* cost of FADD and FSUB insns. */
2147 COSTS_N_INSNS (8), /* cost of FMUL instruction. */
2148 COSTS_N_INSNS (20), /* cost of FDIV instruction. */
2149 COSTS_N_INSNS (8), /* cost of FABS instruction. */
2150 COSTS_N_INSNS (8), /* cost of FCHS instruction. */
2151 COSTS_N_INSNS (40), /* cost of FSQRT instruction. */
2152 core_memcpy,
2153 core_memset,
2154 1, /* scalar_stmt_cost. */
2155 1, /* scalar load_cost. */
2156 1, /* scalar_store_cost. */
2157 1, /* vec_stmt_cost. */
2158 1, /* vec_to_scalar_cost. */
2159 1, /* scalar_to_vec_cost. */
2160 1, /* vec_align_load_cost. */
2161 2, /* vec_unalign_load_cost. */
2162 1, /* vec_store_cost. */
2163 3, /* cond_taken_branch_cost. */
2164 1, /* cond_not_taken_branch_cost. */
2165 };
2166
2167
2168 /* Set by -mtune. */
2169 const struct processor_costs *ix86_tune_cost = &pentium_cost;
2170
2171 /* Set by -mtune or -Os. */
2172 const struct processor_costs *ix86_cost = &pentium_cost;
2173
2174 /* Processor feature/optimization bitmasks. */
2175 #define m_386 (1U<<PROCESSOR_I386)
2176 #define m_486 (1U<<PROCESSOR_I486)
2177 #define m_PENT (1U<<PROCESSOR_PENTIUM)
2178 #define m_LAKEMONT (1U<<PROCESSOR_LAKEMONT)
2179 #define m_PPRO (1U<<PROCESSOR_PENTIUMPRO)
2180 #define m_PENT4 (1U<<PROCESSOR_PENTIUM4)
2181 #define m_NOCONA (1U<<PROCESSOR_NOCONA)
2182 #define m_P4_NOCONA (m_PENT4 | m_NOCONA)
2183 #define m_CORE2 (1U<<PROCESSOR_CORE2)
2184 #define m_NEHALEM (1U<<PROCESSOR_NEHALEM)
2185 #define m_SANDYBRIDGE (1U<<PROCESSOR_SANDYBRIDGE)
2186 #define m_HASWELL (1U<<PROCESSOR_HASWELL)
2187 #define m_CORE_ALL (m_CORE2 | m_NEHALEM | m_SANDYBRIDGE | m_HASWELL)
2188 #define m_BONNELL (1U<<PROCESSOR_BONNELL)
2189 #define m_SILVERMONT (1U<<PROCESSOR_SILVERMONT)
2190 #define m_KNL (1U<<PROCESSOR_KNL)
2191 #define m_SKYLAKE_AVX512 (1U<<PROCESSOR_SKYLAKE_AVX512)
2192 #define m_INTEL (1U<<PROCESSOR_INTEL)
2193
2194 #define m_GEODE (1U<<PROCESSOR_GEODE)
2195 #define m_K6 (1U<<PROCESSOR_K6)
2196 #define m_K6_GEODE (m_K6 | m_GEODE)
2197 #define m_K8 (1U<<PROCESSOR_K8)
2198 #define m_ATHLON (1U<<PROCESSOR_ATHLON)
2199 #define m_ATHLON_K8 (m_K8 | m_ATHLON)
2200 #define m_AMDFAM10 (1U<<PROCESSOR_AMDFAM10)
2201 #define m_BDVER1 (1U<<PROCESSOR_BDVER1)
2202 #define m_BDVER2 (1U<<PROCESSOR_BDVER2)
2203 #define m_BDVER3 (1U<<PROCESSOR_BDVER3)
2204 #define m_BDVER4 (1U<<PROCESSOR_BDVER4)
2205 #define m_ZNVER1 (1U<<PROCESSOR_ZNVER1)
2206 #define m_BTVER1 (1U<<PROCESSOR_BTVER1)
2207 #define m_BTVER2 (1U<<PROCESSOR_BTVER2)
2208 #define m_BDVER (m_BDVER1 | m_BDVER2 | m_BDVER3 | m_BDVER4)
2209 #define m_BTVER (m_BTVER1 | m_BTVER2)
2210 #define m_AMD_MULTIPLE (m_ATHLON_K8 | m_AMDFAM10 | m_BDVER | m_BTVER \
2211 | m_ZNVER1)
2212
2213 #define m_GENERIC (1U<<PROCESSOR_GENERIC)
2214
2215 const char* ix86_tune_feature_names[X86_TUNE_LAST] = {
2216 #undef DEF_TUNE
2217 #define DEF_TUNE(tune, name, selector) name,
2218 #include "x86-tune.def"
2219 #undef DEF_TUNE
2220 };
2221
2222 /* Feature tests against the various tunings. */
2223 unsigned char ix86_tune_features[X86_TUNE_LAST];
2224
2225 /* Feature tests against the various tunings, used to create
2226    ix86_tune_features based on the processor mask.  */
2227 static unsigned int initial_ix86_tune_features[X86_TUNE_LAST] = {
2228 #undef DEF_TUNE
2229 #define DEF_TUNE(tune, name, selector) selector,
2230 #include "x86-tune.def"
2231 #undef DEF_TUNE
2232 };
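/* Illustration (not part of the original sources): each selector in
   x86-tune.def is a bitmask built from the m_* macros above, e.g. a
   hypothetical entry

     DEF_TUNE (X86_TUNE_EXAMPLE, "example", m_CORE_ALL | m_GENERIC)

   and ix86_tune_features[] is later derived from the table above by
   testing the bit of the processor selected with -mtune, roughly:

     unsigned ix86_tune_mask = 1U << ix86_tune;
     for (unsigned i = 0; i < X86_TUNE_LAST; ++i)
       ix86_tune_features[i]
         = !!(initial_ix86_tune_features[i] & ix86_tune_mask);

   This is a hedged sketch; the real initialization is done elsewhere in
   this file.  */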
2233
2234 /* Feature tests against the various architecture variations. */
2235 unsigned char ix86_arch_features[X86_ARCH_LAST];
2236
2237 /* Feature tests against the various architecture variations, used to create
2238 ix86_arch_features based on the processor mask. */
2239 static unsigned int initial_ix86_arch_features[X86_ARCH_LAST] = {
2240 /* X86_ARCH_CMOV: Conditional move was added for pentiumpro. */
2241 ~(m_386 | m_486 | m_PENT | m_LAKEMONT | m_K6),
2242
2243 /* X86_ARCH_CMPXCHG: Compare and exchange was added for 80486. */
2244 ~m_386,
2245
2246 /* X86_ARCH_CMPXCHG8B: Compare and exchange 8 bytes was added for pentium. */
2247 ~(m_386 | m_486),
2248
2249 /* X86_ARCH_XADD: Exchange and add was added for 80486. */
2250 ~m_386,
2251
2252 /* X86_ARCH_BSWAP: Byteswap was added for 80486. */
2253 ~m_386,
2254 };
2255
2256 /* If the average insn count for a single function invocation is
2257    lower than this constant, emit fast (but longer) prologue and
2258    epilogue code.  */
2259 #define FAST_PROLOGUE_INSN_COUNT 20
2260
2261 /* Names for the 8-bit (low), 8-bit (high), and 16-bit registers, respectively.  */
2262 static const char *const qi_reg_name[] = QI_REGISTER_NAMES;
2263 static const char *const qi_high_reg_name[] = QI_HIGH_REGISTER_NAMES;
2264 static const char *const hi_reg_name[] = HI_REGISTER_NAMES;
2265
2266 /* Array of the smallest class containing reg number REGNO, indexed by
2267 REGNO. Used by REGNO_REG_CLASS in i386.h. */
2268
2269 enum reg_class const regclass_map[FIRST_PSEUDO_REGISTER] =
2270 {
2271 /* ax, dx, cx, bx */
2272 AREG, DREG, CREG, BREG,
2273 /* si, di, bp, sp */
2274 SIREG, DIREG, NON_Q_REGS, NON_Q_REGS,
2275 /* FP registers */
2276 FP_TOP_REG, FP_SECOND_REG, FLOAT_REGS, FLOAT_REGS,
2277 FLOAT_REGS, FLOAT_REGS, FLOAT_REGS, FLOAT_REGS,
2278 /* arg pointer */
2279 NON_Q_REGS,
2280 /* flags, fpsr, fpcr, frame */
2281 NO_REGS, NO_REGS, NO_REGS, NON_Q_REGS,
2282 /* SSE registers */
2283 SSE_FIRST_REG, SSE_REGS, SSE_REGS, SSE_REGS, SSE_REGS, SSE_REGS,
2284 SSE_REGS, SSE_REGS,
2285 /* MMX registers */
2286 MMX_REGS, MMX_REGS, MMX_REGS, MMX_REGS, MMX_REGS, MMX_REGS,
2287 MMX_REGS, MMX_REGS,
2288 /* REX registers */
2289 NON_Q_REGS, NON_Q_REGS, NON_Q_REGS, NON_Q_REGS,
2290 NON_Q_REGS, NON_Q_REGS, NON_Q_REGS, NON_Q_REGS,
2291 /* SSE REX registers */
2292 SSE_REGS, SSE_REGS, SSE_REGS, SSE_REGS, SSE_REGS, SSE_REGS,
2293 SSE_REGS, SSE_REGS,
2294 /* AVX-512 SSE registers */
2295 EVEX_SSE_REGS, EVEX_SSE_REGS, EVEX_SSE_REGS, EVEX_SSE_REGS,
2296 EVEX_SSE_REGS, EVEX_SSE_REGS, EVEX_SSE_REGS, EVEX_SSE_REGS,
2297 EVEX_SSE_REGS, EVEX_SSE_REGS, EVEX_SSE_REGS, EVEX_SSE_REGS,
2298 EVEX_SSE_REGS, EVEX_SSE_REGS, EVEX_SSE_REGS, EVEX_SSE_REGS,
2299 /* Mask registers. */
2300 MASK_REGS, MASK_EVEX_REGS, MASK_EVEX_REGS, MASK_EVEX_REGS,
2301 MASK_EVEX_REGS, MASK_EVEX_REGS, MASK_EVEX_REGS, MASK_EVEX_REGS,
2302 /* MPX bound registers */
2303 BND_REGS, BND_REGS, BND_REGS, BND_REGS,
2304 };
2305
2306 /* The "default" register map used in 32-bit mode.  */
2307
2308 int const dbx_register_map[FIRST_PSEUDO_REGISTER] =
2309 {
2310 0, 2, 1, 3, 6, 7, 4, 5, /* general regs */
2311 12, 13, 14, 15, 16, 17, 18, 19, /* fp regs */
2312 -1, -1, -1, -1, -1, /* arg, flags, fpsr, fpcr, frame */
2313 21, 22, 23, 24, 25, 26, 27, 28, /* SSE */
2314 29, 30, 31, 32, 33, 34, 35, 36, /* MMX */
2315 -1, -1, -1, -1, -1, -1, -1, -1, /* extended integer registers */
2316 -1, -1, -1, -1, -1, -1, -1, -1, /* extended SSE registers */
2317   -1, -1, -1, -1, -1, -1, -1, -1,	/* AVX-512 registers 16-23 */
2318   -1, -1, -1, -1, -1, -1, -1, -1,	/* AVX-512 registers 24-31 */
2319 93, 94, 95, 96, 97, 98, 99, 100, /* Mask registers */
2320 101, 102, 103, 104, /* bound registers */
2321 };
2322
2323 /* The "default" register map used in 64-bit mode.  */
2324
2325 int const dbx64_register_map[FIRST_PSEUDO_REGISTER] =
2326 {
2327 0, 1, 2, 3, 4, 5, 6, 7, /* general regs */
2328 33, 34, 35, 36, 37, 38, 39, 40, /* fp regs */
2329 -1, -1, -1, -1, -1, /* arg, flags, fpsr, fpcr, frame */
2330 17, 18, 19, 20, 21, 22, 23, 24, /* SSE */
2331 41, 42, 43, 44, 45, 46, 47, 48, /* MMX */
2332 8,9,10,11,12,13,14,15, /* extended integer registers */
2333 25, 26, 27, 28, 29, 30, 31, 32, /* extended SSE registers */
2334 67, 68, 69, 70, 71, 72, 73, 74, /* AVX-512 registers 16-23 */
2335 75, 76, 77, 78, 79, 80, 81, 82, /* AVX-512 registers 24-31 */
2336 118, 119, 120, 121, 122, 123, 124, 125, /* Mask registers */
2337 126, 127, 128, 129, /* bound registers */
2338 };
2339
2340 /* Define the register numbers to be used in Dwarf debugging information.
2341 The SVR4 reference port C compiler uses the following register numbers
2342 in its Dwarf output code:
2343 0 for %eax (gcc regno = 0)
2344 1 for %ecx (gcc regno = 2)
2345 2 for %edx (gcc regno = 1)
2346 3 for %ebx (gcc regno = 3)
2347 4 for %esp (gcc regno = 7)
2348 5 for %ebp (gcc regno = 6)
2349 6 for %esi (gcc regno = 4)
2350 7 for %edi (gcc regno = 5)
2351 The following three DWARF register numbers are never generated by
2352 the SVR4 C compiler or by the GNU compilers, but SDB on x86/svr4
2353 believes these numbers have these meanings.
2354 8 for %eip (no gcc equivalent)
2355 9 for %eflags (gcc regno = 17)
2356 10 for %trapno (no gcc equivalent)
2357 It is not at all clear how we should number the FP stack registers
2358 for the x86 architecture. If the version of SDB on x86/svr4 were
2359 a bit less brain dead with respect to floating-point then we would
2360 have a precedent to follow with respect to DWARF register numbers
2361 for x86 FP registers, but the SDB on x86/svr4 is so completely
2362 broken with respect to FP registers that it is hardly worth thinking
2363 of it as something to strive for compatibility with.
2364 The version of x86/svr4 SDB I have at the moment does (partially)
2365 seem to believe that DWARF register number 11 is associated with
2366 the x86 register %st(0), but that's about all. Higher DWARF
2367 register numbers don't seem to be associated with anything in
2368    particular, and even for DWARF regno 11, SDB only seems to
2369    understand that it should say that a variable lives in %st(0) (when
2370 asked via an `=' command) if we said it was in DWARF regno 11,
2371 but SDB still prints garbage when asked for the value of the
2372 variable in question (via a `/' command).
2373 (Also note that the labels SDB prints for various FP stack regs
2374 when doing an `x' command are all wrong.)
2375 Note that these problems generally don't affect the native SVR4
2376 C compiler because it doesn't allow the use of -O with -g and
2377 because when it is *not* optimizing, it allocates a memory
2378 location for each floating-point variable, and the memory
2379 location is what gets described in the DWARF AT_location
2380 attribute for the variable in question.
2381 Regardless of the severe mental illness of the x86/svr4 SDB, we
2382 do something sensible here and we use the following DWARF
2383 register numbers. Note that these are all stack-top-relative
2384 numbers.
2385 11 for %st(0) (gcc regno = 8)
2386 12 for %st(1) (gcc regno = 9)
2387 13 for %st(2) (gcc regno = 10)
2388 14 for %st(3) (gcc regno = 11)
2389 15 for %st(4) (gcc regno = 12)
2390 16 for %st(5) (gcc regno = 13)
2391 17 for %st(6) (gcc regno = 14)
2392 18 for %st(7) (gcc regno = 15)
2393 */
2394 int const svr4_dbx_register_map[FIRST_PSEUDO_REGISTER] =
2395 {
2396 0, 2, 1, 3, 6, 7, 5, 4, /* general regs */
2397 11, 12, 13, 14, 15, 16, 17, 18, /* fp regs */
2398 -1, 9, -1, -1, -1, /* arg, flags, fpsr, fpcr, frame */
2399 21, 22, 23, 24, 25, 26, 27, 28, /* SSE registers */
2400 29, 30, 31, 32, 33, 34, 35, 36, /* MMX registers */
2401 -1, -1, -1, -1, -1, -1, -1, -1, /* extended integer registers */
2402 -1, -1, -1, -1, -1, -1, -1, -1, /* extended SSE registers */
2403   -1, -1, -1, -1, -1, -1, -1, -1,	/* AVX-512 registers 16-23 */
2404   -1, -1, -1, -1, -1, -1, -1, -1,	/* AVX-512 registers 24-31 */
2405 93, 94, 95, 96, 97, 98, 99, 100, /* Mask registers */
2406 101, 102, 103, 104, /* bound registers */
2407 };
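/* Illustration (not part of the original sources): the array above is
   indexed by the gcc register number and yields the DWARF number, so per
   the numbering described in the comment above,
   svr4_dbx_register_map[1] == 2 (gcc regno 1 is %edx, DWARF regno 2)
   and svr4_dbx_register_map[2] == 1 (gcc regno 2 is %ecx, DWARF regno 1).  */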
2408
2409 /* Define parameter passing and return registers. */
2410
2411 static int const x86_64_int_parameter_registers[6] =
2412 {
2413 DI_REG, SI_REG, DX_REG, CX_REG, R8_REG, R9_REG
2414 };
2415
2416 static int const x86_64_ms_abi_int_parameter_registers[4] =
2417 {
2418 CX_REG, DX_REG, R8_REG, R9_REG
2419 };
2420
2421 static int const x86_64_int_return_registers[4] =
2422 {
2423 AX_REG, DX_REG, DI_REG, SI_REG
2424 };
2425
2426 /* Additional registers that are clobbered by SYSV calls. */
2427
2428 #define NUM_X86_64_MS_CLOBBERED_REGS 12
2429 static int const x86_64_ms_sysv_extra_clobbered_registers
2430 [NUM_X86_64_MS_CLOBBERED_REGS] =
2431 {
2432 SI_REG, DI_REG,
2433 XMM6_REG, XMM7_REG,
2434 XMM8_REG, XMM9_REG, XMM10_REG, XMM11_REG,
2435 XMM12_REG, XMM13_REG, XMM14_REG, XMM15_REG
2436 };
2437
2438 enum xlogue_stub {
2439 XLOGUE_STUB_SAVE,
2440 XLOGUE_STUB_RESTORE,
2441 XLOGUE_STUB_RESTORE_TAIL,
2442 XLOGUE_STUB_SAVE_HFP,
2443 XLOGUE_STUB_RESTORE_HFP,
2444 XLOGUE_STUB_RESTORE_HFP_TAIL,
2445
2446 XLOGUE_STUB_COUNT
2447 };
2448
2449 enum xlogue_stub_sets {
2450 XLOGUE_SET_ALIGNED,
2451 XLOGUE_SET_ALIGNED_PLUS_8,
2452 XLOGUE_SET_HFP_ALIGNED_OR_REALIGN,
2453 XLOGUE_SET_HFP_ALIGNED_PLUS_8,
2454
2455 XLOGUE_SET_COUNT
2456 };
2457
2458 /* Register save/restore layout used by out-of-line stubs. */
2459 class xlogue_layout {
2460 public:
2461 struct reginfo
2462 {
2463 unsigned regno;
2464 HOST_WIDE_INT offset; /* Offset used by stub base pointer (rax or
2465 rsi) to where each register is stored. */
2466 };
2467
2468 unsigned get_nregs () const {return m_nregs;}
2469 HOST_WIDE_INT get_stack_align_off_in () const {return m_stack_align_off_in;}
2470
2471 const reginfo &get_reginfo (unsigned reg) const
2472 {
2473 gcc_assert (reg < m_nregs);
2474 return m_regs[reg];
2475 }
2476
2477 static const char *get_stub_name (enum xlogue_stub stub,
2478 unsigned n_extra_args);
2479
2480   /* Returns an rtx for the stub's symbol based upon
2481      1.) the specified stub (save, restore or restore_ret),
2482      2.) the value of cfun->machine->call_ms2sysv_extra_regs and
2483      3.) whether or not stack alignment is being performed.  */
2484 static rtx get_stub_rtx (enum xlogue_stub stub);
2485
2486 /* Returns the amount of stack space (including padding) that the stub
2487 needs to store registers based upon data in the machine_function. */
2488 HOST_WIDE_INT get_stack_space_used () const
2489 {
2490 const struct machine_function *m = cfun->machine;
2491 unsigned last_reg = m->call_ms2sysv_extra_regs + MIN_REGS - 1;
2492
2493 gcc_assert (m->call_ms2sysv_extra_regs <= MAX_EXTRA_REGS);
2494 return m_regs[last_reg].offset
2495 + (m->call_ms2sysv_pad_out ? 8 : 0)
2496 + STUB_INDEX_OFFSET;
2497 }
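  /* Illustration (not part of the original sources): with
     call_ms2sysv_extra_regs == 0 and no tail padding, the last
     stub-managed register is DI_REG, stored at 0xb0 from the (aligned)
     incoming stack pointer, i.e. m_regs[11].offset == 0xb0 - 0x70 == 0x40,
     so get_stack_space_used () == 0x40 + STUB_INDEX_OFFSET == 0xb0
     (176) bytes.  */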
2498
2499 /* Returns the offset for the base pointer used by the stub. */
2500 HOST_WIDE_INT get_stub_ptr_offset () const
2501 {
2502 return STUB_INDEX_OFFSET + m_stack_align_off_in;
2503 }
2504
2505 static const struct xlogue_layout &get_instance ();
2506 static unsigned count_stub_managed_regs ();
2507 static bool is_stub_managed_reg (unsigned regno, unsigned count);
2508
2509 static const HOST_WIDE_INT STUB_INDEX_OFFSET = 0x70;
2510 static const unsigned MIN_REGS = NUM_X86_64_MS_CLOBBERED_REGS;
2511 static const unsigned MAX_REGS = 18;
2512 static const unsigned MAX_EXTRA_REGS = MAX_REGS - MIN_REGS;
2513 static const unsigned VARIANT_COUNT = MAX_EXTRA_REGS + 1;
2514 static const unsigned STUB_NAME_MAX_LEN = 16;
2515 static const char * const STUB_BASE_NAMES[XLOGUE_STUB_COUNT];
2516 static const unsigned REG_ORDER[MAX_REGS];
2517 static const unsigned REG_ORDER_REALIGN[MAX_REGS];
2518
2519 private:
2520 xlogue_layout ();
2521 xlogue_layout (HOST_WIDE_INT stack_align_off_in, bool hfp);
2522 xlogue_layout (const xlogue_layout &);
2523
2524 /* True if hard frame pointer is used. */
2525 bool m_hfp;
2526
2527   /* Max number of registers this layout manages.  */
2528 unsigned m_nregs;
2529
2530 /* Incoming offset from 16-byte alignment. */
2531 HOST_WIDE_INT m_stack_align_off_in;
2532
2533 /* Register order and offsets. */
2534 struct reginfo m_regs[MAX_REGS];
2535
2536   /* Lazily initialized cache of symbol names for stubs.  */
2537 static char s_stub_names[XLOGUE_STUB_COUNT][VARIANT_COUNT]
2538 [STUB_NAME_MAX_LEN];
2539
2540 static const xlogue_layout s_instances[XLOGUE_SET_COUNT];
2541 };
2542
2543 const char * const xlogue_layout::STUB_BASE_NAMES[XLOGUE_STUB_COUNT] = {
2544 "savms64",
2545 "resms64",
2546 "resms64x",
2547 "savms64f",
2548 "resms64f",
2549 "resms64fx"
2550 };
2551
2552 const unsigned xlogue_layout::REG_ORDER[xlogue_layout::MAX_REGS] = {
2553 /* The offset values below are where each register is stored, for each layout,
2554    relative to the incoming stack pointer.  The value of each m_regs[].offset
2555    will be relative to the incoming base pointer (rax or rsi) used by the stub.
2556
2557 s_instances: 0 1 2 3
2558 Offset: realigned or aligned + 8
2559 Register aligned aligned + 8 aligned w/HFP w/HFP */
2560 XMM15_REG, /* 0x10 0x18 0x10 0x18 */
2561 XMM14_REG, /* 0x20 0x28 0x20 0x28 */
2562 XMM13_REG, /* 0x30 0x38 0x30 0x38 */
2563 XMM12_REG, /* 0x40 0x48 0x40 0x48 */
2564 XMM11_REG, /* 0x50 0x58 0x50 0x58 */
2565 XMM10_REG, /* 0x60 0x68 0x60 0x68 */
2566 XMM9_REG, /* 0x70 0x78 0x70 0x78 */
2567 XMM8_REG, /* 0x80 0x88 0x80 0x88 */
2568 XMM7_REG, /* 0x90 0x98 0x90 0x98 */
2569 XMM6_REG, /* 0xa0 0xa8 0xa0 0xa8 */
2570 SI_REG, /* 0xa8 0xb0 0xa8 0xb0 */
2571 DI_REG, /* 0xb0 0xb8 0xb0 0xb8 */
2572 BX_REG, /* 0xb8 0xc0 0xb8 0xc0 */
2573 BP_REG, /* 0xc0 0xc8 N/A N/A */
2574 R12_REG, /* 0xc8 0xd0 0xc0 0xc8 */
2575 R13_REG, /* 0xd0 0xd8 0xc8 0xd0 */
2576 R14_REG, /* 0xd8 0xe0 0xd0 0xd8 */
2577 R15_REG, /* 0xe0 0xe8 0xd8 0xe0 */
2578 };
2579
2580 /* Instantiate static const values. */
2581 const HOST_WIDE_INT xlogue_layout::STUB_INDEX_OFFSET;
2582 const unsigned xlogue_layout::MIN_REGS;
2583 const unsigned xlogue_layout::MAX_REGS;
2584 const unsigned xlogue_layout::MAX_EXTRA_REGS;
2585 const unsigned xlogue_layout::VARIANT_COUNT;
2586 const unsigned xlogue_layout::STUB_NAME_MAX_LEN;
2587
2588 /* Initialize xlogue_layout::s_stub_names to zero. */
2589 char xlogue_layout::s_stub_names[XLOGUE_STUB_COUNT][VARIANT_COUNT]
2590 [STUB_NAME_MAX_LEN];
2591
2592 /* Instantiates all xlogue_layout instances. */
2593 const xlogue_layout xlogue_layout::s_instances[XLOGUE_SET_COUNT] = {
2594 xlogue_layout (0, false),
2595 xlogue_layout (8, false),
2596 xlogue_layout (0, true),
2597 xlogue_layout (8, true)
2598 };
2599
2600 /* Return an appropriate const instance of xlogue_layout based upon values
2601 in cfun->machine and crtl. */
2602 const struct xlogue_layout &
2603 xlogue_layout::get_instance ()
2604 {
2605 enum xlogue_stub_sets stub_set;
2606 bool aligned_plus_8 = cfun->machine->call_ms2sysv_pad_in;
2607
2608 if (stack_realign_fp)
2609 stub_set = XLOGUE_SET_HFP_ALIGNED_OR_REALIGN;
2610 else if (frame_pointer_needed)
2611 stub_set = aligned_plus_8
2612 ? XLOGUE_SET_HFP_ALIGNED_PLUS_8
2613 : XLOGUE_SET_HFP_ALIGNED_OR_REALIGN;
2614 else
2615 stub_set = aligned_plus_8 ? XLOGUE_SET_ALIGNED_PLUS_8 : XLOGUE_SET_ALIGNED;
2616
2617 return s_instances[stub_set];
2618 }
2619
2620 /* Determine how many clobbered registers can be saved by the stub.
2621 Returns the count of registers the stub will save and restore. */
2622 unsigned
2623 xlogue_layout::count_stub_managed_regs ()
2624 {
2625 bool hfp = frame_pointer_needed || stack_realign_fp;
2626 unsigned i, count;
2627 unsigned regno;
2628
2629 for (count = i = MIN_REGS; i < MAX_REGS; ++i)
2630 {
2631 regno = REG_ORDER[i];
2632 if (regno == BP_REG && hfp)
2633 continue;
2634 if (!ix86_save_reg (regno, false, false))
2635 break;
2636 ++count;
2637 }
2638 return count;
2639 }
2640
2641 /* Determine if register REGNO is a stub managed register given the
2642 total COUNT of stub managed registers. */
2643 bool
2644 xlogue_layout::is_stub_managed_reg (unsigned regno, unsigned count)
2645 {
2646 bool hfp = frame_pointer_needed || stack_realign_fp;
2647 unsigned i;
2648
2649 for (i = 0; i < count; ++i)
2650 {
2651 gcc_assert (i < MAX_REGS);
2652 if (REG_ORDER[i] == BP_REG && hfp)
2653 ++count;
2654 else if (REG_ORDER[i] == regno)
2655 return true;
2656 }
2657 return false;
2658 }
2659
2660 /* Constructor for xlogue_layout. */
2661 xlogue_layout::xlogue_layout (HOST_WIDE_INT stack_align_off_in, bool hfp)
2662 : m_hfp (hfp) , m_nregs (hfp ? 17 : 18),
2663 m_stack_align_off_in (stack_align_off_in)
2664 {
2665 HOST_WIDE_INT offset = stack_align_off_in;
2666 unsigned i, j;
2667
2668 for (i = j = 0; i < MAX_REGS; ++i)
2669 {
2670 unsigned regno = REG_ORDER[i];
2671
2672 if (regno == BP_REG && hfp)
2673 continue;
2674 if (SSE_REGNO_P (regno))
2675 {
2676 offset += 16;
2677 /* Verify that SSE regs are always aligned. */
2678 gcc_assert (!((stack_align_off_in + offset) & 15));
2679 }
2680 else
2681 offset += 8;
2682
2683 m_regs[j].regno = regno;
2684 m_regs[j++].offset = offset - STUB_INDEX_OFFSET;
2685 }
2686 gcc_assert (j == m_nregs);
2687 }
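/* Illustration (not part of the original sources): for the plain aligned
   instance (stack_align_off_in == 0, no hard frame pointer) the loop above
   stores XMM15 first, so offset becomes 0x10 and m_regs[0].offset
   == 0x10 - STUB_INDEX_OFFSET == -0x60; the ten SSE registers then sit at
   0x10..0xa0 and the GPRs follow in 8-byte steps (SI at 0xa8, DI at 0xb0,
   ...), matching the first column of the REG_ORDER table above.  */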
2688
2689 const char *
2690 xlogue_layout::get_stub_name (enum xlogue_stub stub,
2691 unsigned n_extra_regs)
2692 {
2693 char *name = s_stub_names[stub][n_extra_regs];
2694
2695 /* Lazy init */
2696 if (!*name)
2697 {
2698 int res = snprintf (name, STUB_NAME_MAX_LEN, "__%s_%u",
2699 STUB_BASE_NAMES[stub], MIN_REGS + n_extra_regs);
2700 gcc_checking_assert (res < (int)STUB_NAME_MAX_LEN);
2701 }
2702
2703 return name;
2704 }
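/* Illustration (not part of the original sources): with MIN_REGS == 12 the
   names produced above look like
     get_stub_name (XLOGUE_STUB_SAVE, 0)             => "__savms64_12"
     get_stub_name (XLOGUE_STUB_RESTORE_HFP_TAIL, 6) => "__resms64fx_18"
   i.e. the base name from STUB_BASE_NAMES plus the total register count.  */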
2705
2706 /* Return rtx of a symbol ref for the entry point (based upon
2707 cfun->machine->call_ms2sysv_extra_regs) of the specified stub. */
2708 rtx
2709 xlogue_layout::get_stub_rtx (enum xlogue_stub stub)
2710 {
2711 const unsigned n_extra_regs = cfun->machine->call_ms2sysv_extra_regs;
2712 gcc_checking_assert (n_extra_regs <= MAX_EXTRA_REGS);
2713 gcc_assert (stub < XLOGUE_STUB_COUNT);
2714 gcc_assert (crtl->stack_realign_finalized);
2715
2716 return gen_rtx_SYMBOL_REF (Pmode, get_stub_name (stub, n_extra_regs));
2717 }
2718
2719 /* Define the structure for the machine field in struct function. */
2720
2721 struct GTY(()) stack_local_entry {
2722 unsigned short mode;
2723 unsigned short n;
2724 rtx rtl;
2725 struct stack_local_entry *next;
2726 };
2727
2728 /* Which cpu we are scheduling for.  */
2729 enum attr_cpu ix86_schedule;
2730
2731 /* Which cpu we are optimizing for.  */
2732 enum processor_type ix86_tune;
2733
2734 /* Which instruction set architecture to use. */
2735 enum processor_type ix86_arch;
2736
2737 /* True if processor has SSE prefetch instruction. */
2738 unsigned char x86_prefetch_sse;
2739
2740 /* -mstackrealign option */
2741 static const char ix86_force_align_arg_pointer_string[]
2742 = "force_align_arg_pointer";
2743
2744 static rtx (*ix86_gen_leave) (void);
2745 static rtx (*ix86_gen_add3) (rtx, rtx, rtx);
2746 static rtx (*ix86_gen_sub3) (rtx, rtx, rtx);
2747 static rtx (*ix86_gen_sub3_carry) (rtx, rtx, rtx, rtx, rtx);
2748 static rtx (*ix86_gen_one_cmpl2) (rtx, rtx);
2749 static rtx (*ix86_gen_monitor) (rtx, rtx, rtx);
2750 static rtx (*ix86_gen_monitorx) (rtx, rtx, rtx);
2751 static rtx (*ix86_gen_clzero) (rtx);
2752 static rtx (*ix86_gen_andsp) (rtx, rtx, rtx);
2753 static rtx (*ix86_gen_allocate_stack_worker) (rtx, rtx);
2754 static rtx (*ix86_gen_adjust_stack_and_probe) (rtx, rtx, rtx);
2755 static rtx (*ix86_gen_probe_stack_range) (rtx, rtx, rtx);
2756 static rtx (*ix86_gen_tls_global_dynamic_64) (rtx, rtx, rtx);
2757 static rtx (*ix86_gen_tls_local_dynamic_base_64) (rtx, rtx);
2758
2759 /* Preferred alignment for stack boundary in bits. */
2760 unsigned int ix86_preferred_stack_boundary;
2761
2762 /* Alignment for incoming stack boundary in bits specified at
2763 command line. */
2764 static unsigned int ix86_user_incoming_stack_boundary;
2765
2766 /* Default alignment for incoming stack boundary in bits. */
2767 static unsigned int ix86_default_incoming_stack_boundary;
2768
2769 /* Alignment for incoming stack boundary in bits. */
2770 unsigned int ix86_incoming_stack_boundary;
2771
2772 /* Calling abi specific va_list type nodes. */
2773 static GTY(()) tree sysv_va_list_type_node;
2774 static GTY(()) tree ms_va_list_type_node;
2775
2776 /* Prefix built by ASM_GENERATE_INTERNAL_LABEL. */
2777 char internal_label_prefix[16];
2778 int internal_label_prefix_len;
2779
2780 /* Fence to use after loop using movnt. */
2781 tree x86_mfence;
2782
2783 /* Register class used for passing a given 64-bit part of the argument.
2784    These represent classes as documented by the psABI, with the exception of
2785    the SSESF and SSEDF classes, which are basically the SSE class, except that
2786    gcc uses SFmode or DFmode moves instead of DImode to avoid reformatting penalties.
2787
2788    Similarly we play games with INTEGERSI_CLASS to use cheaper SImode moves
2789    whenever possible (the upper half contains padding).  */
2790 enum x86_64_reg_class
2791 {
2792 X86_64_NO_CLASS,
2793 X86_64_INTEGER_CLASS,
2794 X86_64_INTEGERSI_CLASS,
2795 X86_64_SSE_CLASS,
2796 X86_64_SSESF_CLASS,
2797 X86_64_SSEDF_CLASS,
2798 X86_64_SSEUP_CLASS,
2799 X86_64_X87_CLASS,
2800 X86_64_X87UP_CLASS,
2801 X86_64_COMPLEX_X87_CLASS,
2802 X86_64_MEMORY_CLASS
2803 };
2804
2805 #define MAX_CLASSES 8
2806
2807 /* Table of constants used by fldpi, fldln2, etc.... */
2808 static REAL_VALUE_TYPE ext_80387_constants_table [5];
2809 static bool ext_80387_constants_init;
2810
2811 \f
2812 static struct machine_function * ix86_init_machine_status (void);
2813 static rtx ix86_function_value (const_tree, const_tree, bool);
2814 static bool ix86_function_value_regno_p (const unsigned int);
2815 static unsigned int ix86_function_arg_boundary (machine_mode,
2816 const_tree);
2817 static rtx ix86_static_chain (const_tree, bool);
2818 static int ix86_function_regparm (const_tree, const_tree);
2819 static void ix86_compute_frame_layout (void);
2820 static bool ix86_expand_vector_init_one_nonzero (bool, machine_mode,
2821 rtx, rtx, int);
2822 static void ix86_add_new_builtins (HOST_WIDE_INT, HOST_WIDE_INT);
2823 static tree ix86_canonical_va_list_type (tree);
2824 static void predict_jump (int);
2825 static unsigned int split_stack_prologue_scratch_regno (void);
2826 static bool i386_asm_output_addr_const_extra (FILE *, rtx);
2827
2828 enum ix86_function_specific_strings
2829 {
2830 IX86_FUNCTION_SPECIFIC_ARCH,
2831 IX86_FUNCTION_SPECIFIC_TUNE,
2832 IX86_FUNCTION_SPECIFIC_MAX
2833 };
2834
2835 static char *ix86_target_string (HOST_WIDE_INT, HOST_WIDE_INT, int, int,
2836 const char *, const char *, enum fpmath_unit,
2837 bool);
2838 static void ix86_function_specific_save (struct cl_target_option *,
2839 struct gcc_options *opts);
2840 static void ix86_function_specific_restore (struct gcc_options *opts,
2841 struct cl_target_option *);
2842 static void ix86_function_specific_post_stream_in (struct cl_target_option *);
2843 static void ix86_function_specific_print (FILE *, int,
2844 struct cl_target_option *);
2845 static bool ix86_valid_target_attribute_p (tree, tree, tree, int);
2846 static bool ix86_valid_target_attribute_inner_p (tree, char *[],
2847 struct gcc_options *,
2848 struct gcc_options *,
2849 struct gcc_options *);
2850 static bool ix86_can_inline_p (tree, tree);
2851 static void ix86_set_current_function (tree);
2852 static unsigned int ix86_minimum_incoming_stack_boundary (bool);
2853
2854 static enum calling_abi ix86_function_abi (const_tree);
2855
2856 \f
2857 #ifndef SUBTARGET32_DEFAULT_CPU
2858 #define SUBTARGET32_DEFAULT_CPU "i386"
2859 #endif
2860
2861 /* Whether -mtune= or -march= were specified */
2862 static int ix86_tune_defaulted;
2863 static int ix86_arch_specified;
2864
2865 /* Vectorization library interface and handlers. */
2866 static tree (*ix86_veclib_handler) (combined_fn, tree, tree);
2867
2868 static tree ix86_veclibabi_svml (combined_fn, tree, tree);
2869 static tree ix86_veclibabi_acml (combined_fn, tree, tree);
2870
2871 /* Processor target table, indexed by processor number */
2872 struct ptt
2873 {
2874 const char *const name; /* processor name */
2875 const struct processor_costs *cost; /* Processor costs */
2876 const int align_loop; /* Default alignments. */
2877 const int align_loop_max_skip;
2878 const int align_jump;
2879 const int align_jump_max_skip;
2880 const int align_func;
2881 };
2882
2883 /* This table must be in sync with enum processor_type in i386.h. */
2884 static const struct ptt processor_target_table[PROCESSOR_max] =
2885 {
2886 {"generic", &generic_cost, 16, 10, 16, 10, 16},
2887 {"i386", &i386_cost, 4, 3, 4, 3, 4},
2888 {"i486", &i486_cost, 16, 15, 16, 15, 16},
2889 {"pentium", &pentium_cost, 16, 7, 16, 7, 16},
2890 {"lakemont", &lakemont_cost, 16, 7, 16, 7, 16},
2891 {"pentiumpro", &pentiumpro_cost, 16, 15, 16, 10, 16},
2892 {"pentium4", &pentium4_cost, 0, 0, 0, 0, 0},
2893 {"nocona", &nocona_cost, 0, 0, 0, 0, 0},
2894 {"core2", &core_cost, 16, 10, 16, 10, 16},
2895 {"nehalem", &core_cost, 16, 10, 16, 10, 16},
2896 {"sandybridge", &core_cost, 16, 10, 16, 10, 16},
2897 {"haswell", &core_cost, 16, 10, 16, 10, 16},
2898 {"bonnell", &atom_cost, 16, 15, 16, 7, 16},
2899 {"silvermont", &slm_cost, 16, 15, 16, 7, 16},
2900 {"knl", &slm_cost, 16, 15, 16, 7, 16},
2901 {"skylake-avx512", &core_cost, 16, 10, 16, 10, 16},
2902 {"intel", &intel_cost, 16, 15, 16, 7, 16},
2903 {"geode", &geode_cost, 0, 0, 0, 0, 0},
2904 {"k6", &k6_cost, 32, 7, 32, 7, 32},
2905 {"athlon", &athlon_cost, 16, 7, 16, 7, 16},
2906 {"k8", &k8_cost, 16, 7, 16, 7, 16},
2907 {"amdfam10", &amdfam10_cost, 32, 24, 32, 7, 32},
2908 {"bdver1", &bdver1_cost, 16, 10, 16, 7, 11},
2909 {"bdver2", &bdver2_cost, 16, 10, 16, 7, 11},
2910 {"bdver3", &bdver3_cost, 16, 10, 16, 7, 11},
2911 {"bdver4", &bdver4_cost, 16, 10, 16, 7, 11},
2912 {"btver1", &btver1_cost, 16, 10, 16, 7, 11},
2913 {"btver2", &btver2_cost, 16, 10, 16, 7, 11},
2914 {"znver1", &znver1_cost, 16, 15, 16, 15, 16}
2915 };
2916 \f
2917 static unsigned int
2918 rest_of_handle_insert_vzeroupper (void)
2919 {
2920 int i;
2921
2922 /* vzeroupper instructions are inserted immediately after reload to
2923 account for possible spills from 256-bit registers.  The pass reuses
2924 the mode switching infrastructure by re-running the mode insertion
2925 pass, so disable entities that have already been processed. */
2926 for (i = 0; i < MAX_386_ENTITIES; i++)
2927 ix86_optimize_mode_switching[i] = 0;
2928
2929 ix86_optimize_mode_switching[AVX_U128] = 1;
2930
2931 /* Call optimize_mode_switching. */
2932 g->get_passes ()->execute_pass_mode_switching ();
2933 return 0;
2934 }
2935
2936 /* Return true if INSN uses or defines a hard register.
2937 Hard register uses in a memory address are ignored.
2938 Clobbers and flags definitions are ignored. */
2939
2940 static bool
2941 has_non_address_hard_reg (rtx_insn *insn)
2942 {
2943 df_ref ref;
2944 FOR_EACH_INSN_DEF (ref, insn)
2945 if (HARD_REGISTER_P (DF_REF_REAL_REG (ref))
2946 && !DF_REF_FLAGS_IS_SET (ref, DF_REF_MUST_CLOBBER)
2947 && DF_REF_REGNO (ref) != FLAGS_REG)
2948 return true;
2949
2950 FOR_EACH_INSN_USE (ref, insn)
2951 if (!DF_REF_REG_MEM_P (ref) && HARD_REGISTER_P (DF_REF_REAL_REG (ref)))
2952 return true;
2953
2954 return false;
2955 }
2956
2957 /* Check whether comparison INSN may be transformed
2958 into a vector comparison.  Currently we only transform
2959 zero checks, which look like:
2960
2961 (set (reg:CCZ 17 flags)
2962 (compare:CCZ (ior:SI (subreg:SI (reg:DI x) 4)
2963 (subreg:SI (reg:DI x) 0))
2964 (const_int 0 [0]))) */
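/* Illustrative source example (editorial addition, not from the
   original sources): on a 32-bit target a zero test of a 64-bit value
   is lowered to an IOR of the two SImode halves compared against zero,
   producing the kind of pattern shown above:

     long long x;
     ...
     if (x == 0)
       ...
 */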
2965
2966 static bool
2967 convertible_comparison_p (rtx_insn *insn)
2968 {
2969 if (!TARGET_SSE4_1)
2970 return false;
2971
2972 rtx def_set = single_set (insn);
2973
2974 gcc_assert (def_set);
2975
2976 rtx src = SET_SRC (def_set);
2977 rtx dst = SET_DEST (def_set);
2978
2979 gcc_assert (GET_CODE (src) == COMPARE);
2980
2981 if (GET_CODE (dst) != REG
2982 || REGNO (dst) != FLAGS_REG
2983 || GET_MODE (dst) != CCZmode)
2984 return false;
2985
2986 rtx op1 = XEXP (src, 0);
2987 rtx op2 = XEXP (src, 1);
2988
2989 if (op2 != CONST0_RTX (GET_MODE (op2)))
2990 return false;
2991
2992 if (GET_CODE (op1) != IOR)
2993 return false;
2994
2995 op2 = XEXP (op1, 1);
2996 op1 = XEXP (op1, 0);
2997
2998 if (!SUBREG_P (op1)
2999 || !SUBREG_P (op2)
3000 || GET_MODE (op1) != SImode
3001 || GET_MODE (op2) != SImode
3002 || ((SUBREG_BYTE (op1) != 0
3003 || SUBREG_BYTE (op2) != GET_MODE_SIZE (SImode))
3004 && (SUBREG_BYTE (op2) != 0
3005 || SUBREG_BYTE (op1) != GET_MODE_SIZE (SImode))))
3006 return false;
3007
3008 op1 = SUBREG_REG (op1);
3009 op2 = SUBREG_REG (op2);
3010
3011 if (op1 != op2
3012 || !REG_P (op1)
3013 || GET_MODE (op1) != DImode)
3014 return false;
3015
3016 return true;
3017 }
3018
3019 /* The DImode version of scalar_to_vector_candidate_p. */
3020
3021 static bool
3022 dimode_scalar_to_vector_candidate_p (rtx_insn *insn)
3023 {
3024 rtx def_set = single_set (insn);
3025
3026 if (!def_set)
3027 return false;
3028
3029 if (has_non_address_hard_reg (insn))
3030 return false;
3031
3032 rtx src = SET_SRC (def_set);
3033 rtx dst = SET_DEST (def_set);
3034
3035 if (GET_CODE (src) == COMPARE)
3036 return convertible_comparison_p (insn);
3037
3038 /* We are interested in DImode promotion only. */
3039 if ((GET_MODE (src) != DImode
3040 && !CONST_INT_P (src))
3041 || GET_MODE (dst) != DImode)
3042 return false;
3043
3044 if (!REG_P (dst) && !MEM_P (dst))
3045 return false;
3046
3047 switch (GET_CODE (src))
3048 {
3049 case ASHIFTRT:
3050 if (!TARGET_AVX512VL)
3051 return false;
3052 /* FALLTHRU */
3053
3054 case ASHIFT:
3055 case LSHIFTRT:
3056 if (!REG_P (XEXP (src, 1))
3057 && (!SUBREG_P (XEXP (src, 1))
3058 || SUBREG_BYTE (XEXP (src, 1)) != 0
3059 || !REG_P (SUBREG_REG (XEXP (src, 1))))
3060 && (!CONST_INT_P (XEXP (src, 1))
3061 || !IN_RANGE (INTVAL (XEXP (src, 1)), 0, 63)))
3062 return false;
3063
3064 if (GET_MODE (XEXP (src, 1)) != QImode
3065 && !CONST_INT_P (XEXP (src, 1)))
3066 return false;
3067 break;
3068
3069 case PLUS:
3070 case MINUS:
3071 case IOR:
3072 case XOR:
3073 case AND:
3074 if (!REG_P (XEXP (src, 1))
3075 && !MEM_P (XEXP (src, 1))
3076 && !CONST_INT_P (XEXP (src, 1)))
3077 return false;
3078
3079 if (GET_MODE (XEXP (src, 1)) != DImode
3080 && !CONST_INT_P (XEXP (src, 1)))
3081 return false;
3082 break;
3083
3084 case NEG:
3085 case NOT:
3086 break;
3087
3088 case REG:
3089 return true;
3090
3091 case MEM:
3092 case CONST_INT:
3093 return REG_P (dst);
3094
3095 default:
3096 return false;
3097 }
3098
3099 if (!REG_P (XEXP (src, 0))
3100 && !MEM_P (XEXP (src, 0))
3101 && !CONST_INT_P (XEXP (src, 0))
3102 /* Check for andnot case. */
3103 && (GET_CODE (src) != AND
3104 || GET_CODE (XEXP (src, 0)) != NOT
3105 || !REG_P (XEXP (XEXP (src, 0), 0))))
3106 return false;
3107
3108 if (GET_MODE (XEXP (src, 0)) != DImode
3109 && !CONST_INT_P (XEXP (src, 0)))
3110 return false;
3111
3112 return true;
3113 }
3114
3115 /* The TImode version of scalar_to_vector_candidate_p. */
3116
3117 static bool
3118 timode_scalar_to_vector_candidate_p (rtx_insn *insn)
3119 {
3120 rtx def_set = single_set (insn);
3121
3122 if (!def_set)
3123 return false;
3124
3125 if (has_non_address_hard_reg (insn))
3126 return false;
3127
3128 rtx src = SET_SRC (def_set);
3129 rtx dst = SET_DEST (def_set);
3130
3131 /* Only TImode loads and stores are allowed. */
3132 if (GET_MODE (dst) != TImode)
3133 return false;
3134
3135 if (MEM_P (dst))
3136 {
3137 /* Check for a store.  The memory must be aligned, or the unaligned
3138 store must be optimal.  Only stores from a register, a standard SSE
3139 constant, or a CONST_WIDE_INT generated from a piecewise store are supported.
3140 
3141 ??? Verify the performance impact before enabling CONST_INT for
3142 __int128 stores. */
3143 if (misaligned_operand (dst, TImode)
3144 && !TARGET_SSE_UNALIGNED_STORE_OPTIMAL)
3145 return false;
3146
3147 switch (GET_CODE (src))
3148 {
3149 default:
3150 return false;
3151
3152 case REG:
3153 case CONST_WIDE_INT:
3154 return true;
3155
3156 case CONST_INT:
3157 return standard_sse_constant_p (src, TImode);
3158 }
3159 }
3160 else if (MEM_P (src))
3161 {
3162 /* Check for a load.  The memory must be aligned, or the unaligned
3163 load must be optimal. */
3164 return (REG_P (dst)
3165 && (!misaligned_operand (src, TImode)
3166 || TARGET_SSE_UNALIGNED_LOAD_OPTIMAL));
3167 }
3168
3169 return false;
3170 }
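/* Illustrative example (editorial addition): with -mstv on a 64-bit
   SSE2 target, a plain __int128 copy such as

     __int128 *p, *q;
     ...
     *q = *p;

   is a TImode load/store pair that the TImode chains below can carry
   out with single 128-bit SSE moves instead of pairs of 64-bit
   integer moves.  */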
3171
3172 /* Return true if INSN may be converted into a vector
3173 instruction. */
3174
3175 static bool
3176 scalar_to_vector_candidate_p (rtx_insn *insn)
3177 {
3178 if (TARGET_64BIT)
3179 return timode_scalar_to_vector_candidate_p (insn);
3180 else
3181 return dimode_scalar_to_vector_candidate_p (insn);
3182 }
3183
3184 /* The DImode version of remove_non_convertible_regs. */
3185
3186 static void
3187 dimode_remove_non_convertible_regs (bitmap candidates)
3188 {
3189 bitmap_iterator bi;
3190 unsigned id;
3191 bitmap regs = BITMAP_ALLOC (NULL);
3192
3193 EXECUTE_IF_SET_IN_BITMAP (candidates, 0, id, bi)
3194 {
3195 rtx def_set = single_set (DF_INSN_UID_GET (id)->insn);
3196 rtx reg = SET_DEST (def_set);
3197
3198 if (!REG_P (reg)
3199 || bitmap_bit_p (regs, REGNO (reg))
3200 || HARD_REGISTER_P (reg))
3201 continue;
3202
3203 for (df_ref def = DF_REG_DEF_CHAIN (REGNO (reg));
3204 def;
3205 def = DF_REF_NEXT_REG (def))
3206 {
3207 if (!bitmap_bit_p (candidates, DF_REF_INSN_UID (def)))
3208 {
3209 if (dump_file)
3210 fprintf (dump_file,
3211 "r%d has non convertible definition in insn %d\n",
3212 REGNO (reg), DF_REF_INSN_UID (def));
3213
3214 bitmap_set_bit (regs, REGNO (reg));
3215 break;
3216 }
3217 }
3218 }
3219
3220 EXECUTE_IF_SET_IN_BITMAP (regs, 0, id, bi)
3221 {
3222 for (df_ref def = DF_REG_DEF_CHAIN (id);
3223 def;
3224 def = DF_REF_NEXT_REG (def))
3225 if (bitmap_bit_p (candidates, DF_REF_INSN_UID (def)))
3226 {
3227 if (dump_file)
3228 fprintf (dump_file, "Removing insn %d from candidates list\n",
3229 DF_REF_INSN_UID (def));
3230
3231 bitmap_clear_bit (candidates, DF_REF_INSN_UID (def));
3232 }
3233 }
3234
3235 BITMAP_FREE (regs);
3236 }
3237
3238 /* For a register REGNO, scan instructions for its defs and uses.
3239 Put REGNO in REGS if a def or use isn't in CANDIDATES. */
3240
3241 static void
3242 timode_check_non_convertible_regs (bitmap candidates, bitmap regs,
3243 unsigned int regno)
3244 {
3245 for (df_ref def = DF_REG_DEF_CHAIN (regno);
3246 def;
3247 def = DF_REF_NEXT_REG (def))
3248 {
3249 if (!bitmap_bit_p (candidates, DF_REF_INSN_UID (def)))
3250 {
3251 if (dump_file)
3252 fprintf (dump_file,
3253 "r%d has non convertible def in insn %d\n",
3254 regno, DF_REF_INSN_UID (def));
3255
3256 bitmap_set_bit (regs, regno);
3257 break;
3258 }
3259 }
3260
3261 for (df_ref ref = DF_REG_USE_CHAIN (regno);
3262 ref;
3263 ref = DF_REF_NEXT_REG (ref))
3264 {
3265 /* Debug instructions are skipped. */
3266 if (NONDEBUG_INSN_P (DF_REF_INSN (ref))
3267 && !bitmap_bit_p (candidates, DF_REF_INSN_UID (ref)))
3268 {
3269 if (dump_file)
3270 fprintf (dump_file,
3271 "r%d has non convertible use in insn %d\n",
3272 regno, DF_REF_INSN_UID (ref));
3273
3274 bitmap_set_bit (regs, regno);
3275 break;
3276 }
3277 }
3278 }
3279
3280 /* The TImode version of remove_non_convertible_regs. */
3281
3282 static void
3283 timode_remove_non_convertible_regs (bitmap candidates)
3284 {
3285 bitmap_iterator bi;
3286 unsigned id;
3287 bitmap regs = BITMAP_ALLOC (NULL);
3288
3289 EXECUTE_IF_SET_IN_BITMAP (candidates, 0, id, bi)
3290 {
3291 rtx def_set = single_set (DF_INSN_UID_GET (id)->insn);
3292 rtx dest = SET_DEST (def_set);
3293 rtx src = SET_SRC (def_set);
3294
3295 if ((!REG_P (dest)
3296 || bitmap_bit_p (regs, REGNO (dest))
3297 || HARD_REGISTER_P (dest))
3298 && (!REG_P (src)
3299 || bitmap_bit_p (regs, REGNO (src))
3300 || HARD_REGISTER_P (src)))
3301 continue;
3302
3303 if (REG_P (dest))
3304 timode_check_non_convertible_regs (candidates, regs,
3305 REGNO (dest));
3306
3307 if (REG_P (src))
3308 timode_check_non_convertible_regs (candidates, regs,
3309 REGNO (src));
3310 }
3311
3312 EXECUTE_IF_SET_IN_BITMAP (regs, 0, id, bi)
3313 {
3314 for (df_ref def = DF_REG_DEF_CHAIN (id);
3315 def;
3316 def = DF_REF_NEXT_REG (def))
3317 if (bitmap_bit_p (candidates, DF_REF_INSN_UID (def)))
3318 {
3319 if (dump_file)
3320 fprintf (dump_file, "Removing insn %d from candidates list\n",
3321 DF_REF_INSN_UID (def));
3322
3323 bitmap_clear_bit (candidates, DF_REF_INSN_UID (def));
3324 }
3325
3326 for (df_ref ref = DF_REG_USE_CHAIN (id);
3327 ref;
3328 ref = DF_REF_NEXT_REG (ref))
3329 if (bitmap_bit_p (candidates, DF_REF_INSN_UID (ref)))
3330 {
3331 if (dump_file)
3332 fprintf (dump_file, "Removing insn %d from candidates list\n",
3333 DF_REF_INSN_UID (ref));
3334
3335 bitmap_clear_bit (candidates, DF_REF_INSN_UID (ref));
3336 }
3337 }
3338
3339 BITMAP_FREE (regs);
3340 }
3341
3342 /* For a given bitmap of insn UIDs, scan all instructions and
3343 remove an insn from CANDIDATES if it has both convertible
3344 and non-convertible definitions.
3345 
3346 All insns in the bitmap are conversion candidates according to
3347 scalar_to_vector_candidate_p.  Currently this implies that all insns
3348 are single_set. */
3349
3350 static void
3351 remove_non_convertible_regs (bitmap candidates)
3352 {
3353 if (TARGET_64BIT)
3354 timode_remove_non_convertible_regs (candidates);
3355 else
3356 dimode_remove_non_convertible_regs (candidates);
3357 }
3358
3359 class scalar_chain
3360 {
3361 public:
3362 scalar_chain ();
3363 virtual ~scalar_chain ();
3364
3365 static unsigned max_id;
3366
3367 /* ID of a chain. */
3368 unsigned int chain_id;
3369 /* A queue of instructions to be included into a chain. */
3370 bitmap queue;
3371 /* Instructions included into a chain. */
3372 bitmap insns;
3373 /* All registers defined by a chain. */
3374 bitmap defs;
3375 /* Registers used in both vector and scalar modes. */
3376 bitmap defs_conv;
3377
3378 void build (bitmap candidates, unsigned insn_uid);
3379 virtual int compute_convert_gain () = 0;
3380 int convert ();
3381
3382 protected:
3383 void add_to_queue (unsigned insn_uid);
3384 void emit_conversion_insns (rtx insns, rtx_insn *pos);
3385
3386 private:
3387 void add_insn (bitmap candidates, unsigned insn_uid);
3388 void analyze_register_chain (bitmap candidates, df_ref ref);
3389 virtual void mark_dual_mode_def (df_ref def) = 0;
3390 virtual void convert_insn (rtx_insn *insn) = 0;
3391 virtual void convert_registers () = 0;
3392 };
3393
3394 class dimode_scalar_chain : public scalar_chain
3395 {
3396 public:
3397 int compute_convert_gain ();
3398 private:
3399 void mark_dual_mode_def (df_ref def);
3400 rtx replace_with_subreg (rtx x, rtx reg, rtx subreg);
3401 void replace_with_subreg_in_insn (rtx_insn *insn, rtx reg, rtx subreg);
3402 void convert_insn (rtx_insn *insn);
3403 void convert_op (rtx *op, rtx_insn *insn);
3404 void convert_reg (unsigned regno);
3405 void make_vector_copies (unsigned regno);
3406 void convert_registers ();
3407 int vector_const_cost (rtx exp);
3408 };
3409
3410 class timode_scalar_chain : public scalar_chain
3411 {
3412 public:
3413 /* Converting from TImode to V1TImode is always faster. */
3414 int compute_convert_gain () { return 1; }
3415
3416 private:
3417 void mark_dual_mode_def (df_ref def);
3418 void fix_debug_reg_uses (rtx reg);
3419 void convert_insn (rtx_insn *insn);
3420 /* We don't convert registers to a different size. */
3421 void convert_registers () {}
3422 };
3423
3424 unsigned scalar_chain::max_id = 0;
3425
3426 /* Initialize new chain. */
3427
3428 scalar_chain::scalar_chain ()
3429 {
3430 chain_id = ++max_id;
3431
3432 if (dump_file)
3433 fprintf (dump_file, "Created a new instruction chain #%d\n", chain_id);
3434
3435 bitmap_obstack_initialize (NULL);
3436 insns = BITMAP_ALLOC (NULL);
3437 defs = BITMAP_ALLOC (NULL);
3438 defs_conv = BITMAP_ALLOC (NULL);
3439 queue = NULL;
3440 }
3441
3442 /* Free chain's data. */
3443
3444 scalar_chain::~scalar_chain ()
3445 {
3446 BITMAP_FREE (insns);
3447 BITMAP_FREE (defs);
3448 BITMAP_FREE (defs_conv);
3449 bitmap_obstack_release (NULL);
3450 }
3451
3452 /* Add an instruction into the chain's queue. */
3453
3454 void
3455 scalar_chain::add_to_queue (unsigned insn_uid)
3456 {
3457 if (bitmap_bit_p (insns, insn_uid)
3458 || bitmap_bit_p (queue, insn_uid))
3459 return;
3460
3461 if (dump_file)
3462 fprintf (dump_file, " Adding insn %d into chain's #%d queue\n",
3463 insn_uid, chain_id);
3464 bitmap_set_bit (queue, insn_uid);
3465 }
3466
3467 /* For DImode conversion, mark register defined by DEF as requiring
3468 conversion. */
3469
3470 void
3471 dimode_scalar_chain::mark_dual_mode_def (df_ref def)
3472 {
3473 gcc_assert (DF_REF_REG_DEF_P (def));
3474
3475 if (bitmap_bit_p (defs_conv, DF_REF_REGNO (def)))
3476 return;
3477
3478 if (dump_file)
3479 fprintf (dump_file,
3480 " Mark r%d def in insn %d as requiring both modes in chain #%d\n",
3481 DF_REF_REGNO (def), DF_REF_INSN_UID (def), chain_id);
3482
3483 bitmap_set_bit (defs_conv, DF_REF_REGNO (def));
3484 }
3485
3486 /* For TImode conversion, it is unused. */
3487
3488 void
3489 timode_scalar_chain::mark_dual_mode_def (df_ref)
3490 {
3491 gcc_unreachable ();
3492 }
3493
3494 /* Check REF's chain to add new insns into a queue
3495 and find registers requiring conversion. */
3496
3497 void
3498 scalar_chain::analyze_register_chain (bitmap candidates, df_ref ref)
3499 {
3500 df_link *chain;
3501
3502 gcc_assert (bitmap_bit_p (insns, DF_REF_INSN_UID (ref))
3503 || bitmap_bit_p (candidates, DF_REF_INSN_UID (ref)));
3504 add_to_queue (DF_REF_INSN_UID (ref));
3505
3506 for (chain = DF_REF_CHAIN (ref); chain; chain = chain->next)
3507 {
3508 unsigned uid = DF_REF_INSN_UID (chain->ref);
3509
3510 if (!NONDEBUG_INSN_P (DF_REF_INSN (chain->ref)))
3511 continue;
3512
3513 if (!DF_REF_REG_MEM_P (chain->ref))
3514 {
3515 if (bitmap_bit_p (insns, uid))
3516 continue;
3517
3518 if (bitmap_bit_p (candidates, uid))
3519 {
3520 add_to_queue (uid);
3521 continue;
3522 }
3523 }
3524
3525 if (DF_REF_REG_DEF_P (chain->ref))
3526 {
3527 if (dump_file)
3528 fprintf (dump_file, " r%d def in insn %d isn't convertible\n",
3529 DF_REF_REGNO (chain->ref), uid);
3530 mark_dual_mode_def (chain->ref);
3531 }
3532 else
3533 {
3534 if (dump_file)
3535 fprintf (dump_file, " r%d use in insn %d isn't convertible\n",
3536 DF_REF_REGNO (chain->ref), uid);
3537 mark_dual_mode_def (ref);
3538 }
3539 }
3540 }
3541
3542 /* Add instruction into a chain. */
3543
3544 void
3545 scalar_chain::add_insn (bitmap candidates, unsigned int insn_uid)
3546 {
3547 if (bitmap_bit_p (insns, insn_uid))
3548 return;
3549
3550 if (dump_file)
3551 fprintf (dump_file, " Adding insn %d to chain #%d\n", insn_uid, chain_id);
3552
3553 bitmap_set_bit (insns, insn_uid);
3554
3555 rtx_insn *insn = DF_INSN_UID_GET (insn_uid)->insn;
3556 rtx def_set = single_set (insn);
3557 if (def_set && REG_P (SET_DEST (def_set))
3558 && !HARD_REGISTER_P (SET_DEST (def_set)))
3559 bitmap_set_bit (defs, REGNO (SET_DEST (def_set)));
3560
3561 df_ref ref;
3562 df_ref def;
3563 for (ref = DF_INSN_UID_DEFS (insn_uid); ref; ref = DF_REF_NEXT_LOC (ref))
3564 if (!HARD_REGISTER_P (DF_REF_REG (ref)))
3565 for (def = DF_REG_DEF_CHAIN (DF_REF_REGNO (ref));
3566 def;
3567 def = DF_REF_NEXT_REG (def))
3568 analyze_register_chain (candidates, def);
3569 for (ref = DF_INSN_UID_USES (insn_uid); ref; ref = DF_REF_NEXT_LOC (ref))
3570 if (!DF_REF_REG_MEM_P (ref))
3571 analyze_register_chain (candidates, ref);
3572 }
3573
3574 /* Build new chain starting from insn INSN_UID recursively
3575 adding all dependent uses and definitions. */
3576
3577 void
3578 scalar_chain::build (bitmap candidates, unsigned insn_uid)
3579 {
3580 queue = BITMAP_ALLOC (NULL);
3581 bitmap_set_bit (queue, insn_uid);
3582
3583 if (dump_file)
3584 fprintf (dump_file, "Building chain #%d...\n", chain_id);
3585
3586 while (!bitmap_empty_p (queue))
3587 {
3588 insn_uid = bitmap_first_set_bit (queue);
3589 bitmap_clear_bit (queue, insn_uid);
3590 bitmap_clear_bit (candidates, insn_uid);
3591 add_insn (candidates, insn_uid);
3592 }
3593
3594 if (dump_file)
3595 {
3596 fprintf (dump_file, "Collected chain #%d...\n", chain_id);
3597 fprintf (dump_file, " insns: ");
3598 dump_bitmap (dump_file, insns);
3599 if (!bitmap_empty_p (defs_conv))
3600 {
3601 bitmap_iterator bi;
3602 unsigned id;
3603 const char *comma = "";
3604 fprintf (dump_file, " defs to convert: ");
3605 EXECUTE_IF_SET_IN_BITMAP (defs_conv, 0, id, bi)
3606 {
3607 fprintf (dump_file, "%sr%d", comma, id);
3608 comma = ", ";
3609 }
3610 fprintf (dump_file, "\n");
3611 }
3612 }
3613
3614 BITMAP_FREE (queue);
3615 }
3616
3617 /* Return the cost of building a vector constant
3618 instead of using a scalar one. */
3619
3620 int
3621 dimode_scalar_chain::vector_const_cost (rtx exp)
3622 {
3623 gcc_assert (CONST_INT_P (exp));
3624
3625 if (standard_sse_constant_p (exp, V2DImode))
3626 return COSTS_N_INSNS (1);
3627 return ix86_cost->sse_load[1];
3628 }
3629
3630 /* Compute a gain for chain conversion. */
3631
3632 int
3633 dimode_scalar_chain::compute_convert_gain ()
3634 {
3635 bitmap_iterator bi;
3636 unsigned insn_uid;
3637 int gain = 0;
3638 int cost = 0;
3639
3640 if (dump_file)
3641 fprintf (dump_file, "Computing gain for chain #%d...\n", chain_id);
3642
3643 EXECUTE_IF_SET_IN_BITMAP (insns, 0, insn_uid, bi)
3644 {
3645 rtx_insn *insn = DF_INSN_UID_GET (insn_uid)->insn;
3646 rtx def_set = single_set (insn);
3647 rtx src = SET_SRC (def_set);
3648 rtx dst = SET_DEST (def_set);
3649
3650 if (REG_P (src) && REG_P (dst))
3651 gain += COSTS_N_INSNS (2) - ix86_cost->sse_move;
3652 else if (REG_P (src) && MEM_P (dst))
3653 gain += 2 * ix86_cost->int_store[2] - ix86_cost->sse_store[1];
3654 else if (MEM_P (src) && REG_P (dst))
3655 gain += 2 * ix86_cost->int_load[2] - ix86_cost->sse_load[1];
3656 else if (GET_CODE (src) == ASHIFT
3657 || GET_CODE (src) == ASHIFTRT
3658 || GET_CODE (src) == LSHIFTRT)
3659 {
3660 if (CONST_INT_P (XEXP (src, 0)))
3661 gain -= vector_const_cost (XEXP (src, 0));
3662 if (CONST_INT_P (XEXP (src, 1)))
3663 {
3664 gain += ix86_cost->shift_const;
3665 if (INTVAL (XEXP (src, 1)) >= 32)
3666 gain -= COSTS_N_INSNS (1);
3667 }
3668 else
3669 /* Additional gain for omitting two CMOVs. */
3670 gain += ix86_cost->shift_var + COSTS_N_INSNS (2);
3671 }
3672 else if (GET_CODE (src) == PLUS
3673 || GET_CODE (src) == MINUS
3674 || GET_CODE (src) == IOR
3675 || GET_CODE (src) == XOR
3676 || GET_CODE (src) == AND)
3677 {
3678 gain += ix86_cost->add;
3679 /* Additional gain for andnot for targets without BMI. */
3680 if (GET_CODE (XEXP (src, 0)) == NOT
3681 && !TARGET_BMI)
3682 gain += 2 * ix86_cost->add;
3683
3684 if (CONST_INT_P (XEXP (src, 0)))
3685 gain -= vector_const_cost (XEXP (src, 0));
3686 if (CONST_INT_P (XEXP (src, 1)))
3687 gain -= vector_const_cost (XEXP (src, 1));
3688 }
3689 else if (GET_CODE (src) == NEG
3690 || GET_CODE (src) == NOT)
3691 gain += ix86_cost->add - COSTS_N_INSNS (1);
3692 else if (GET_CODE (src) == COMPARE)
3693 {
3694 /* Assume comparison cost is the same. */
3695 }
3696 else if (CONST_INT_P (src))
3697 {
3698 if (REG_P (dst))
3699 gain += COSTS_N_INSNS (2);
3700 else if (MEM_P (dst))
3701 gain += 2 * ix86_cost->int_store[2] - ix86_cost->sse_store[1];
3702 gain -= vector_const_cost (src);
3703 }
3704 else
3705 gcc_unreachable ();
3706 }
3707
3708 if (dump_file)
3709 fprintf (dump_file, " Instruction conversion gain: %d\n", gain);
3710
3711 EXECUTE_IF_SET_IN_BITMAP (defs_conv, 0, insn_uid, bi)
3712 cost += DF_REG_DEF_COUNT (insn_uid) * ix86_cost->mmxsse_to_integer;
3713
3714 if (dump_file)
3715 fprintf (dump_file, " Registers conversion cost: %d\n", cost);
3716
3717 gain -= cost;
3718
3719 if (dump_file)
3720 fprintf (dump_file, " Total gain: %d\n", gain);
3721
3722 return gain;
3723 }
3724
3725 /* Replace REG in X with a V2DI subreg of NEW_REG. */
3726
3727 rtx
3728 dimode_scalar_chain::replace_with_subreg (rtx x, rtx reg, rtx new_reg)
3729 {
3730 if (x == reg)
3731 return gen_rtx_SUBREG (V2DImode, new_reg, 0);
3732
3733 const char *fmt = GET_RTX_FORMAT (GET_CODE (x));
3734 int i, j;
3735 for (i = GET_RTX_LENGTH (GET_CODE (x)) - 1; i >= 0; i--)
3736 {
3737 if (fmt[i] == 'e')
3738 XEXP (x, i) = replace_with_subreg (XEXP (x, i), reg, new_reg);
3739 else if (fmt[i] == 'E')
3740 for (j = XVECLEN (x, i) - 1; j >= 0; j--)
3741 XVECEXP (x, i, j) = replace_with_subreg (XVECEXP (x, i, j),
3742 reg, new_reg);
3743 }
3744
3745 return x;
3746 }
3747
3748 /* Replace REG in INSN with a V2DI subreg of NEW_REG. */
3749
3750 void
3751 dimode_scalar_chain::replace_with_subreg_in_insn (rtx_insn *insn,
3752 rtx reg, rtx new_reg)
3753 {
3754 replace_with_subreg (single_set (insn), reg, new_reg);
3755 }
3756
3757 /* Insert the generated conversion instruction sequence INSNS
3758 after instruction AFTER.  A new basic block may be required if
3759 the instruction has an EH region attached. */
3760
3761 void
3762 scalar_chain::emit_conversion_insns (rtx insns, rtx_insn *after)
3763 {
3764 if (!control_flow_insn_p (after))
3765 {
3766 emit_insn_after (insns, after);
3767 return;
3768 }
3769
3770 basic_block bb = BLOCK_FOR_INSN (after);
3771 edge e = find_fallthru_edge (bb->succs);
3772 gcc_assert (e);
3773
3774 basic_block new_bb = split_edge (e);
3775 emit_insn_after (insns, BB_HEAD (new_bb));
3776 }
3777
3778 /* Make vector copies for all definitions of register REGNO
3779 and replace its uses in a chain. */
3780
3781 void
3782 dimode_scalar_chain::make_vector_copies (unsigned regno)
3783 {
3784 rtx reg = regno_reg_rtx[regno];
3785 rtx vreg = gen_reg_rtx (DImode);
3786 bool count_reg = false;
3787 df_ref ref;
3788
3789 for (ref = DF_REG_DEF_CHAIN (regno); ref; ref = DF_REF_NEXT_REG (ref))
3790 if (!bitmap_bit_p (insns, DF_REF_INSN_UID (ref)))
3791 {
3792 df_ref use;
3793
3794 /* Detect the count register of a shift instruction. */
3795 for (use = DF_REG_USE_CHAIN (regno); use; use = DF_REF_NEXT_REG (use))
3796 if (bitmap_bit_p (insns, DF_REF_INSN_UID (use)))
3797 {
3798 rtx_insn *insn = DF_REF_INSN (use);
3799 rtx def_set = single_set (insn);
3800
3801 gcc_assert (def_set);
3802
3803 rtx src = SET_SRC (def_set);
3804
3805 if ((GET_CODE (src) == ASHIFT
3806 || GET_CODE (src) == ASHIFTRT
3807 || GET_CODE (src) == LSHIFTRT)
3808 && !CONST_INT_P (XEXP (src, 1))
3809 && reg_or_subregno (XEXP (src, 1)) == regno)
3810 count_reg = true;
3811 }
3812
3813 start_sequence ();
3814 if (count_reg)
3815 {
3816 rtx qreg = gen_lowpart (QImode, reg);
3817 rtx tmp = gen_reg_rtx (SImode);
3818
3819 if (TARGET_ZERO_EXTEND_WITH_AND
3820 && optimize_function_for_speed_p (cfun))
3821 {
3822 emit_move_insn (tmp, const0_rtx);
3823 emit_insn (gen_movstrictqi
3824 (gen_lowpart (QImode, tmp), qreg));
3825 }
3826 else
3827 emit_insn (gen_rtx_SET
3828 (tmp, gen_rtx_ZERO_EXTEND (SImode, qreg)));
3829
3830 if (!TARGET_INTER_UNIT_MOVES_TO_VEC)
3831 {
3832 rtx slot = assign_386_stack_local (SImode, SLOT_STV_TEMP);
3833 emit_move_insn (slot, tmp);
3834 tmp = copy_rtx (slot);
3835 }
3836
3837 emit_insn (gen_zero_extendsidi2 (vreg, tmp));
3838 }
3839 else if (!TARGET_INTER_UNIT_MOVES_TO_VEC)
3840 {
3841 rtx tmp = assign_386_stack_local (DImode, SLOT_STV_TEMP);
3842 emit_move_insn (adjust_address (tmp, SImode, 0),
3843 gen_rtx_SUBREG (SImode, reg, 0));
3844 emit_move_insn (adjust_address (tmp, SImode, 4),
3845 gen_rtx_SUBREG (SImode, reg, 4));
3846 emit_move_insn (vreg, tmp);
3847 }
3848 else if (TARGET_SSE4_1)
3849 {
3850 emit_insn (gen_sse2_loadld (gen_rtx_SUBREG (V4SImode, vreg, 0),
3851 CONST0_RTX (V4SImode),
3852 gen_rtx_SUBREG (SImode, reg, 0)));
3853 emit_insn (gen_sse4_1_pinsrd (gen_rtx_SUBREG (V4SImode, vreg, 0),
3854 gen_rtx_SUBREG (V4SImode, vreg, 0),
3855 gen_rtx_SUBREG (SImode, reg, 4),
3856 GEN_INT (2)));
3857 }
3858 else
3859 {
3860 rtx tmp = gen_reg_rtx (DImode);
3861 emit_insn (gen_sse2_loadld (gen_rtx_SUBREG (V4SImode, vreg, 0),
3862 CONST0_RTX (V4SImode),
3863 gen_rtx_SUBREG (SImode, reg, 0)));
3864 emit_insn (gen_sse2_loadld (gen_rtx_SUBREG (V4SImode, tmp, 0),
3865 CONST0_RTX (V4SImode),
3866 gen_rtx_SUBREG (SImode, reg, 4)));
3867 emit_insn (gen_vec_interleave_lowv4si
3868 (gen_rtx_SUBREG (V4SImode, vreg, 0),
3869 gen_rtx_SUBREG (V4SImode, vreg, 0),
3870 gen_rtx_SUBREG (V4SImode, tmp, 0)));
3871 }
3872 rtx_insn *seq = get_insns ();
3873 end_sequence ();
3874 rtx_insn *insn = DF_REF_INSN (ref);
3875 emit_conversion_insns (seq, insn);
3876
3877 if (dump_file)
3878 fprintf (dump_file,
3879 " Copied r%d to a vector register r%d for insn %d\n",
3880 regno, REGNO (vreg), INSN_UID (insn));
3881 }
3882
3883 for (ref = DF_REG_USE_CHAIN (regno); ref; ref = DF_REF_NEXT_REG (ref))
3884 if (bitmap_bit_p (insns, DF_REF_INSN_UID (ref)))
3885 {
3886 rtx_insn *insn = DF_REF_INSN (ref);
3887 if (count_reg)
3888 {
3889 rtx def_set = single_set (insn);
3890 gcc_assert (def_set);
3891
3892 rtx src = SET_SRC (def_set);
3893
3894 if ((GET_CODE (src) == ASHIFT
3895 || GET_CODE (src) == ASHIFTRT
3896 || GET_CODE (src) == LSHIFTRT)
3897 && !CONST_INT_P (XEXP (src, 1))
3898 && reg_or_subregno (XEXP (src, 1)) == regno)
3899 XEXP (src, 1) = vreg;
3900 }
3901 else
3902 replace_with_subreg_in_insn (insn, reg, vreg);
3903
3904 if (dump_file)
3905 fprintf (dump_file, " Replaced r%d with r%d in insn %d\n",
3906 regno, REGNO (vreg), INSN_UID (insn));
3907 }
3908 }
3909
3910 /* Convert all definitions of register REGNO
3911 and fix its uses.  Scalar copies may be created
3912 if the register is used in a non-convertible insn. */
3913
3914 void
3915 dimode_scalar_chain::convert_reg (unsigned regno)
3916 {
3917 bool scalar_copy = bitmap_bit_p (defs_conv, regno);
3918 rtx reg = regno_reg_rtx[regno];
3919 rtx scopy = NULL_RTX;
3920 df_ref ref;
3921 bitmap conv;
3922
3923 conv = BITMAP_ALLOC (NULL);
3924 bitmap_copy (conv, insns);
3925
3926 if (scalar_copy)
3927 scopy = gen_reg_rtx (DImode);
3928
3929 for (ref = DF_REG_DEF_CHAIN (regno); ref; ref = DF_REF_NEXT_REG (ref))
3930 {
3931 rtx_insn *insn = DF_REF_INSN (ref);
3932 rtx def_set = single_set (insn);
3933 rtx src = SET_SRC (def_set);
3934 rtx reg = DF_REF_REG (ref);
3935
3936 if (!MEM_P (src))
3937 {
3938 replace_with_subreg_in_insn (insn, reg, reg);
3939 bitmap_clear_bit (conv, INSN_UID (insn));
3940 }
3941
3942 if (scalar_copy)
3943 {
3944 start_sequence ();
3945 if (!TARGET_INTER_UNIT_MOVES_FROM_VEC)
3946 {
3947 rtx tmp = assign_386_stack_local (DImode, SLOT_STV_TEMP);
3948 emit_move_insn (tmp, reg);
3949 emit_move_insn (gen_rtx_SUBREG (SImode, scopy, 0),
3950 adjust_address (tmp, SImode, 0));
3951 emit_move_insn (gen_rtx_SUBREG (SImode, scopy, 4),
3952 adjust_address (tmp, SImode, 4));
3953 }
3954 else if (TARGET_SSE4_1)
3955 {
3956 rtx tmp = gen_rtx_PARALLEL (VOIDmode, gen_rtvec (1, const0_rtx));
3957 emit_insn
3958 (gen_rtx_SET
3959 (gen_rtx_SUBREG (SImode, scopy, 0),
3960 gen_rtx_VEC_SELECT (SImode,
3961 gen_rtx_SUBREG (V4SImode, reg, 0), tmp)));
3962
3963 tmp = gen_rtx_PARALLEL (VOIDmode, gen_rtvec (1, const1_rtx));
3964 emit_insn
3965 (gen_rtx_SET
3966 (gen_rtx_SUBREG (SImode, scopy, 4),
3967 gen_rtx_VEC_SELECT (SImode,
3968 gen_rtx_SUBREG (V4SImode, reg, 0), tmp)));
3969 }
3970 else
3971 {
3972 rtx vcopy = gen_reg_rtx (V2DImode);
3973 emit_move_insn (vcopy, gen_rtx_SUBREG (V2DImode, reg, 0));
3974 emit_move_insn (gen_rtx_SUBREG (SImode, scopy, 0),
3975 gen_rtx_SUBREG (SImode, vcopy, 0));
3976 emit_move_insn (vcopy,
3977 gen_rtx_LSHIFTRT (V2DImode, vcopy, GEN_INT (32)));
3978 emit_move_insn (gen_rtx_SUBREG (SImode, scopy, 4),
3979 gen_rtx_SUBREG (SImode, vcopy, 0));
3980 }
3981 rtx_insn *seq = get_insns ();
3982 end_sequence ();
3983 emit_conversion_insns (seq, insn);
3984
3985 if (dump_file)
3986 fprintf (dump_file,
3987 " Copied r%d to a scalar register r%d for insn %d\n",
3988 regno, REGNO (scopy), INSN_UID (insn));
3989 }
3990 }
3991
3992 for (ref = DF_REG_USE_CHAIN (regno); ref; ref = DF_REF_NEXT_REG (ref))
3993 if (bitmap_bit_p (insns, DF_REF_INSN_UID (ref)))
3994 {
3995 if (bitmap_bit_p (conv, DF_REF_INSN_UID (ref)))
3996 {
3997 rtx_insn *insn = DF_REF_INSN (ref);
3998
3999 rtx def_set = single_set (insn);
4000 gcc_assert (def_set);
4001
4002 rtx src = SET_SRC (def_set);
4003 rtx dst = SET_DEST (def_set);
4004
4005 if ((GET_CODE (src) == ASHIFT
4006 || GET_CODE (src) == ASHIFTRT
4007 || GET_CODE (src) == LSHIFTRT)
4008 && !CONST_INT_P (XEXP (src, 1))
4009 && reg_or_subregno (XEXP (src, 1)) == regno)
4010 {
4011 rtx tmp2 = gen_reg_rtx (V2DImode);
4012
4013 start_sequence ();
4014
4015 if (TARGET_SSE4_1)
4016 emit_insn (gen_sse4_1_zero_extendv2qiv2di2
4017 (tmp2, gen_rtx_SUBREG (V16QImode, reg, 0)));
4018 else
4019 {
4020 rtx vec_cst
4021 = gen_rtx_CONST_VECTOR (V2DImode,
4022 gen_rtvec (2, GEN_INT (0xff),
4023 const0_rtx));
4024 vec_cst
4025 = validize_mem (force_const_mem (V2DImode, vec_cst));
4026
4027 emit_insn (gen_rtx_SET
4028 (tmp2,
4029 gen_rtx_AND (V2DImode,
4030 gen_rtx_SUBREG (V2DImode, reg, 0),
4031 vec_cst)));
4032 }
4033 rtx_insn *seq = get_insns ();
4034 end_sequence ();
4035
4036 emit_insn_before (seq, insn);
4037
4038 XEXP (src, 1) = gen_rtx_SUBREG (DImode, tmp2, 0);
4039 }
4040 else if (!MEM_P (dst) || !REG_P (src))
4041 replace_with_subreg_in_insn (insn, reg, reg);
4042
4043 bitmap_clear_bit (conv, INSN_UID (insn));
4044 }
4045 }
4046 /* Skip debug insns and uninitialized uses. */
4047 else if (DF_REF_CHAIN (ref)
4048 && NONDEBUG_INSN_P (DF_REF_INSN (ref)))
4049 {
4050 gcc_assert (scopy);
4051 replace_rtx (DF_REF_INSN (ref), reg, scopy);
4052 df_insn_rescan (DF_REF_INSN (ref));
4053 }
4054
4055 BITMAP_FREE (conv);
4056 }
4057
4058 /* Convert operand OP in INSN. We should handle
4059 memory operands and uninitialized registers.
4060 All other register uses are converted during
4061 register conversion. */
4062
4063 void
4064 dimode_scalar_chain::convert_op (rtx *op, rtx_insn *insn)
4065 {
4066 *op = copy_rtx_if_shared (*op);
4067
4068 if (GET_CODE (*op) == NOT)
4069 {
4070 convert_op (&XEXP (*op, 0), insn);
4071 PUT_MODE (*op, V2DImode);
4072 }
4073 else if (MEM_P (*op))
4074 {
4075 rtx tmp = gen_reg_rtx (DImode);
4076
4077 emit_insn_before (gen_move_insn (tmp, *op), insn);
4078 *op = gen_rtx_SUBREG (V2DImode, tmp, 0);
4079
4080 if (dump_file)
4081 fprintf (dump_file, " Preloading operand for insn %d into r%d\n",
4082 INSN_UID (insn), REGNO (tmp));
4083 }
4084 else if (REG_P (*op))
4085 {
4086 /* We may not have converted the register use if
4087 this register has no definition.  Otherwise it
4088 should have been converted in convert_reg. */
4089 df_ref ref;
4090 FOR_EACH_INSN_USE (ref, insn)
4091 if (DF_REF_REGNO (ref) == REGNO (*op))
4092 {
4093 gcc_assert (!DF_REF_CHAIN (ref));
4094 break;
4095 }
4096 *op = gen_rtx_SUBREG (V2DImode, *op, 0);
4097 }
4098 else if (CONST_INT_P (*op))
4099 {
4100 rtx vec_cst;
4101 rtx tmp = gen_rtx_SUBREG (V2DImode, gen_reg_rtx (DImode), 0);
4102
4103 /* Prefer all ones vector in case of -1. */
4104 if (constm1_operand (*op, GET_MODE (*op)))
4105 vec_cst = CONSTM1_RTX (V2DImode);
4106 else
4107 vec_cst = gen_rtx_CONST_VECTOR (V2DImode,
4108 gen_rtvec (2, *op, const0_rtx));
4109
4110 if (!standard_sse_constant_p (vec_cst, V2DImode))
4111 {
4112 start_sequence ();
4113 vec_cst = validize_mem (force_const_mem (V2DImode, vec_cst));
4114 rtx_insn *seq = get_insns ();
4115 end_sequence ();
4116 emit_insn_before (seq, insn);
4117 }
4118
4119 emit_insn_before (gen_move_insn (copy_rtx (tmp), vec_cst), insn);
4120 *op = tmp;
4121 }
4122 else
4123 {
4124 gcc_assert (SUBREG_P (*op));
4125 gcc_assert (GET_MODE (*op) == V2DImode);
4126 }
4127 }
4128
4129 /* Convert INSN to vector mode. */
4130
4131 void
4132 dimode_scalar_chain::convert_insn (rtx_insn *insn)
4133 {
4134 rtx def_set = single_set (insn);
4135 rtx src = SET_SRC (def_set);
4136 rtx dst = SET_DEST (def_set);
4137 rtx subreg;
4138
4139 if (MEM_P (dst) && !REG_P (src))
4140 {
4141 /* There are no scalar integer instructions and therefore
4142 temporary register usage is required. */
4143 rtx tmp = gen_reg_rtx (DImode);
4144 emit_conversion_insns (gen_move_insn (dst, tmp), insn);
4145 dst = gen_rtx_SUBREG (V2DImode, tmp, 0);
4146 }
4147
4148 switch (GET_CODE (src))
4149 {
4150 case ASHIFT:
4151 case ASHIFTRT:
4152 case LSHIFTRT:
4153 convert_op (&XEXP (src, 0), insn);
4154 PUT_MODE (src, V2DImode);
4155 break;
4156
4157 case PLUS:
4158 case MINUS:
4159 case IOR:
4160 case XOR:
4161 case AND:
4162 convert_op (&XEXP (src, 0), insn);
4163 convert_op (&XEXP (src, 1), insn);
4164 PUT_MODE (src, V2DImode);
4165 break;
4166
4167 case NEG:
4168 src = XEXP (src, 0);
4169 convert_op (&src, insn);
4170 subreg = gen_reg_rtx (V2DImode);
4171 emit_insn_before (gen_move_insn (subreg, CONST0_RTX (V2DImode)), insn);
4172 src = gen_rtx_MINUS (V2DImode, subreg, src);
4173 break;
4174
4175 case NOT:
4176 src = XEXP (src, 0);
4177 convert_op (&src, insn);
4178 subreg = gen_reg_rtx (V2DImode);
4179 emit_insn_before (gen_move_insn (subreg, CONSTM1_RTX (V2DImode)), insn);
4180 src = gen_rtx_XOR (V2DImode, src, subreg);
4181 break;
4182
4183 case MEM:
4184 if (!REG_P (dst))
4185 convert_op (&src, insn);
4186 break;
4187
4188 case REG:
4189 if (!MEM_P (dst))
4190 convert_op (&src, insn);
4191 break;
4192
4193 case SUBREG:
4194 gcc_assert (GET_MODE (src) == V2DImode);
4195 break;
4196
4197 case COMPARE:
4198 src = SUBREG_REG (XEXP (XEXP (src, 0), 0));
4199
4200 gcc_assert ((REG_P (src) && GET_MODE (src) == DImode)
4201 || (SUBREG_P (src) && GET_MODE (src) == V2DImode));
4202
4203 if (REG_P (src))
4204 subreg = gen_rtx_SUBREG (V2DImode, src, 0);
4205 else
4206 subreg = copy_rtx_if_shared (src);
4207 emit_insn_before (gen_vec_interleave_lowv2di (copy_rtx_if_shared (subreg),
4208 copy_rtx_if_shared (subreg),
4209 copy_rtx_if_shared (subreg)),
4210 insn);
4211 dst = gen_rtx_REG (CCmode, FLAGS_REG);
4212 src = gen_rtx_UNSPEC (CCmode, gen_rtvec (2, copy_rtx_if_shared (src),
4213 copy_rtx_if_shared (src)),
4214 UNSPEC_PTEST);
4215 break;
4216
4217 case CONST_INT:
4218 convert_op (&src, insn);
4219 break;
4220
4221 default:
4222 gcc_unreachable ();
4223 }
4224
4225 SET_SRC (def_set) = src;
4226 SET_DEST (def_set) = dst;
4227
4228 /* Drop possible dead definitions. */
4229 PATTERN (insn) = def_set;
4230
4231 INSN_CODE (insn) = -1;
4232 recog_memoized (insn);
4233 df_insn_rescan (insn);
4234 }
4235
4236 /* Fix uses of converted REG in debug insns. */
4237
4238 void
4239 timode_scalar_chain::fix_debug_reg_uses (rtx reg)
4240 {
4241 if (!flag_var_tracking)
4242 return;
4243
4244 df_ref ref, next;
4245 for (ref = DF_REG_USE_CHAIN (REGNO (reg)); ref; ref = next)
4246 {
4247 rtx_insn *insn = DF_REF_INSN (ref);
4248 /* Make sure the next ref is for a different instruction,
4249 so that we're not affected by the rescan. */
4250 next = DF_REF_NEXT_REG (ref);
4251 while (next && DF_REF_INSN (next) == insn)
4252 next = DF_REF_NEXT_REG (next);
4253
4254 if (DEBUG_INSN_P (insn))
4255 {
4256 /* It may be a debug insn with a TImode variable in
4257 register. */
4258 bool changed = false;
4259 for (; ref != next; ref = DF_REF_NEXT_REG (ref))
4260 {
4261 rtx *loc = DF_REF_LOC (ref);
4262 if (REG_P (*loc) && GET_MODE (*loc) == V1TImode)
4263 {
4264 *loc = gen_rtx_SUBREG (TImode, *loc, 0);
4265 changed = true;
4266 }
4267 }
4268 if (changed)
4269 df_insn_rescan (insn);
4270 }
4271 }
4272 }
4273
4274 /* Convert INSN from TImode to V1TImode. */
4275
4276 void
4277 timode_scalar_chain::convert_insn (rtx_insn *insn)
4278 {
4279 rtx def_set = single_set (insn);
4280 rtx src = SET_SRC (def_set);
4281 rtx dst = SET_DEST (def_set);
4282
4283 switch (GET_CODE (dst))
4284 {
4285 case REG:
4286 {
4287 rtx tmp = find_reg_equal_equiv_note (insn);
4288 if (tmp)
4289 PUT_MODE (XEXP (tmp, 0), V1TImode);
4290 PUT_MODE (dst, V1TImode);
4291 fix_debug_reg_uses (dst);
4292 }
4293 break;
4294 case MEM:
4295 PUT_MODE (dst, V1TImode);
4296 break;
4297
4298 default:
4299 gcc_unreachable ();
4300 }
4301
4302 switch (GET_CODE (src))
4303 {
4304 case REG:
4305 PUT_MODE (src, V1TImode);
4306 /* Call fix_debug_reg_uses only if SRC is never defined. */
4307 if (!DF_REG_DEF_CHAIN (REGNO (src)))
4308 fix_debug_reg_uses (src);
4309 break;
4310
4311 case MEM:
4312 PUT_MODE (src, V1TImode);
4313 break;
4314
4315 case CONST_WIDE_INT:
4316 if (NONDEBUG_INSN_P (insn))
4317 {
4318 /* Since there are no instructions to store 128-bit constant,
4319 temporary register usage is required. */
4320 rtx tmp = gen_reg_rtx (V1TImode);
4321 start_sequence ();
4322 src = gen_rtx_CONST_VECTOR (V1TImode, gen_rtvec (1, src));
4323 src = validize_mem (force_const_mem (V1TImode, src));
4324 rtx_insn *seq = get_insns ();
4325 end_sequence ();
4326 if (seq)
4327 emit_insn_before (seq, insn);
4328 emit_conversion_insns (gen_rtx_SET (dst, tmp), insn);
4329 dst = tmp;
4330 }
4331 break;
4332
4333 case CONST_INT:
4334 switch (standard_sse_constant_p (src, TImode))
4335 {
4336 case 1:
4337 src = CONST0_RTX (GET_MODE (dst));
4338 break;
4339 case 2:
4340 src = CONSTM1_RTX (GET_MODE (dst));
4341 break;
4342 default:
4343 gcc_unreachable ();
4344 }
4345 if (NONDEBUG_INSN_P (insn))
4346 {
4347 rtx tmp = gen_reg_rtx (V1TImode);
4348 /* Since there are no instructions to store standard SSE
4349 constant, temporary register usage is required. */
4350 emit_conversion_insns (gen_rtx_SET (dst, tmp), insn);
4351 dst = tmp;
4352 }
4353 break;
4354
4355 default:
4356 gcc_unreachable ();
4357 }
4358
4359 SET_SRC (def_set) = src;
4360 SET_DEST (def_set) = dst;
4361
4362 /* Drop possible dead definitions. */
4363 PATTERN (insn) = def_set;
4364
4365 INSN_CODE (insn) = -1;
4366 recog_memoized (insn);
4367 df_insn_rescan (insn);
4368 }
4369
4370 void
4371 dimode_scalar_chain::convert_registers ()
4372 {
4373 bitmap_iterator bi;
4374 unsigned id;
4375
4376 EXECUTE_IF_SET_IN_BITMAP (defs, 0, id, bi)
4377 convert_reg (id);
4378
4379 EXECUTE_IF_AND_COMPL_IN_BITMAP (defs_conv, defs, 0, id, bi)
4380 make_vector_copies (id);
4381 }
4382
4383 /* Convert whole chain creating required register
4384 conversions and copies. */
4385
4386 int
4387 scalar_chain::convert ()
4388 {
4389 bitmap_iterator bi;
4390 unsigned id;
4391 int converted_insns = 0;
4392
4393 if (!dbg_cnt (stv_conversion))
4394 return 0;
4395
4396 if (dump_file)
4397 fprintf (dump_file, "Converting chain #%d...\n", chain_id);
4398
4399 convert_registers ();
4400
4401 EXECUTE_IF_SET_IN_BITMAP (insns, 0, id, bi)
4402 {
4403 convert_insn (DF_INSN_UID_GET (id)->insn);
4404 converted_insns++;
4405 }
4406
4407 return converted_insns;
4408 }
4409
4410 /* Main STV pass function. Find and convert scalar
4411 instructions into vector mode when profitable. */
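/* Illustrative example (editorial addition): on a 32-bit target with
   -msse2 -mstv, a 64-bit bitwise operation such as

     long long a, b;
     ...
     a &= b;

   would normally need two 32-bit AND instructions; when the estimated
   gain is positive, the chains built below let it be performed as a
   single 128-bit PAND on SSE registers instead.  */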
4412
4413 static unsigned int
4414 convert_scalars_to_vector ()
4415 {
4416 basic_block bb;
4417 bitmap candidates;
4418 int converted_insns = 0;
4419
4420 bitmap_obstack_initialize (NULL);
4421 candidates = BITMAP_ALLOC (NULL);
4422
4423 calculate_dominance_info (CDI_DOMINATORS);
4424 df_set_flags (DF_DEFER_INSN_RESCAN);
4425 df_chain_add_problem (DF_DU_CHAIN | DF_UD_CHAIN);
4426 df_md_add_problem ();
4427 df_analyze ();
4428
4429 /* Find all instructions we want to convert into vector mode. */
4430 if (dump_file)
4431 fprintf (dump_file, "Searching for mode conversion candidates...\n");
4432
4433 FOR_EACH_BB_FN (bb, cfun)
4434 {
4435 rtx_insn *insn;
4436 FOR_BB_INSNS (bb, insn)
4437 if (scalar_to_vector_candidate_p (insn))
4438 {
4439 if (dump_file)
4440 fprintf (dump_file, " insn %d is marked as a candidate\n",
4441 INSN_UID (insn));
4442
4443 bitmap_set_bit (candidates, INSN_UID (insn));
4444 }
4445 }
4446
4447 remove_non_convertible_regs (candidates);
4448
4449 if (bitmap_empty_p (candidates))
4450 if (dump_file)
4451 fprintf (dump_file, "There are no candidates for optimization.\n");
4452
4453 while (!bitmap_empty_p (candidates))
4454 {
4455 unsigned uid = bitmap_first_set_bit (candidates);
4456 scalar_chain *chain;
4457
4458 if (TARGET_64BIT)
4459 chain = new timode_scalar_chain;
4460 else
4461 chain = new dimode_scalar_chain;
4462
4463 /* Find instructions chain we want to convert to vector mode.
4464 Check all uses and definitions to estimate all required
4465 conversions. */
4466 chain->build (candidates, uid);
4467
4468 if (chain->compute_convert_gain () > 0)
4469 converted_insns += chain->convert ();
4470 else
4471 if (dump_file)
4472 fprintf (dump_file, "Chain #%d conversion is not profitable\n",
4473 chain->chain_id);
4474
4475 delete chain;
4476 }
4477
4478 if (dump_file)
4479 fprintf (dump_file, "Total insns converted: %d\n", converted_insns);
4480
4481 BITMAP_FREE (candidates);
4482 bitmap_obstack_release (NULL);
4483 df_process_deferred_rescans ();
4484
4485 /* Conversion means we may have 128-bit register spills/fills
4486 which require an aligned stack. */
4487 if (converted_insns)
4488 {
4489 if (crtl->stack_alignment_needed < 128)
4490 crtl->stack_alignment_needed = 128;
4491 if (crtl->stack_alignment_estimated < 128)
4492 crtl->stack_alignment_estimated = 128;
4493 /* Fix up DECL_RTL/DECL_INCOMING_RTL of arguments. */
4494 if (TARGET_64BIT)
4495 for (tree parm = DECL_ARGUMENTS (current_function_decl);
4496 parm; parm = DECL_CHAIN (parm))
4497 {
4498 if (TYPE_MODE (TREE_TYPE (parm)) != TImode)
4499 continue;
4500 if (DECL_RTL_SET_P (parm)
4501 && GET_MODE (DECL_RTL (parm)) == V1TImode)
4502 {
4503 rtx r = DECL_RTL (parm);
4504 if (REG_P (r))
4505 SET_DECL_RTL (parm, gen_rtx_SUBREG (TImode, r, 0));
4506 }
4507 if (DECL_INCOMING_RTL (parm)
4508 && GET_MODE (DECL_INCOMING_RTL (parm)) == V1TImode)
4509 {
4510 rtx r = DECL_INCOMING_RTL (parm);
4511 if (REG_P (r))
4512 DECL_INCOMING_RTL (parm) = gen_rtx_SUBREG (TImode, r, 0);
4513 }
4514 }
4515 }
4516
4517 return 0;
4518 }
4519
4520 namespace {
4521
4522 const pass_data pass_data_insert_vzeroupper =
4523 {
4524 RTL_PASS, /* type */
4525 "vzeroupper", /* name */
4526 OPTGROUP_NONE, /* optinfo_flags */
4527 TV_MACH_DEP, /* tv_id */
4528 0, /* properties_required */
4529 0, /* properties_provided */
4530 0, /* properties_destroyed */
4531 0, /* todo_flags_start */
4532 TODO_df_finish, /* todo_flags_finish */
4533 };
4534
4535 class pass_insert_vzeroupper : public rtl_opt_pass
4536 {
4537 public:
4538 pass_insert_vzeroupper(gcc::context *ctxt)
4539 : rtl_opt_pass(pass_data_insert_vzeroupper, ctxt)
4540 {}
4541
4542 /* opt_pass methods: */
4543 virtual bool gate (function *)
4544 {
4545 return TARGET_AVX && !TARGET_AVX512F
4546 && TARGET_VZEROUPPER && flag_expensive_optimizations
4547 && !optimize_size;
4548 }
4549
4550 virtual unsigned int execute (function *)
4551 {
4552 return rest_of_handle_insert_vzeroupper ();
4553 }
4554
4555 }; // class pass_insert_vzeroupper
4556
4557 const pass_data pass_data_stv =
4558 {
4559 RTL_PASS, /* type */
4560 "stv", /* name */
4561 OPTGROUP_NONE, /* optinfo_flags */
4562 TV_MACH_DEP, /* tv_id */
4563 0, /* properties_required */
4564 0, /* properties_provided */
4565 0, /* properties_destroyed */
4566 0, /* todo_flags_start */
4567 TODO_df_finish, /* todo_flags_finish */
4568 };
4569
4570 class pass_stv : public rtl_opt_pass
4571 {
4572 public:
4573 pass_stv (gcc::context *ctxt)
4574 : rtl_opt_pass (pass_data_stv, ctxt),
4575 timode_p (false)
4576 {}
4577
4578 /* opt_pass methods: */
4579 virtual bool gate (function *)
4580 {
4581 return (timode_p == !!TARGET_64BIT
4582 && TARGET_STV && TARGET_SSE2 && optimize > 1);
4583 }
4584
4585 virtual unsigned int execute (function *)
4586 {
4587 return convert_scalars_to_vector ();
4588 }
4589
4590 opt_pass *clone ()
4591 {
4592 return new pass_stv (m_ctxt);
4593 }
4594
4595 void set_pass_param (unsigned int n, bool param)
4596 {
4597 gcc_assert (n == 0);
4598 timode_p = param;
4599 }
4600
4601 private:
4602 bool timode_p;
4603 }; // class pass_stv
4604
4605 } // anon namespace
4606
4607 rtl_opt_pass *
4608 make_pass_insert_vzeroupper (gcc::context *ctxt)
4609 {
4610 return new pass_insert_vzeroupper (ctxt);
4611 }
4612
4613 rtl_opt_pass *
4614 make_pass_stv (gcc::context *ctxt)
4615 {
4616 return new pass_stv (ctxt);
4617 }
4618
4619 /* Return true if a red-zone is in use. */
4620
4621 bool
4622 ix86_using_red_zone (void)
4623 {
4624 return TARGET_RED_ZONE && !TARGET_64BIT_MS_ABI;
4625 }
4626 \f
4627 /* Return a string that documents the current -m options. The caller is
4628 responsible for freeing the string. */
4629
4630 static char *
4631 ix86_target_string (HOST_WIDE_INT isa, HOST_WIDE_INT isa2,
4632 int flags, int flags2,
4633 const char *arch, const char *tune,
4634 enum fpmath_unit fpmath, bool add_nl_p)
4635 {
4636 struct ix86_target_opts
4637 {
4638 const char *option; /* option string */
4639 HOST_WIDE_INT mask; /* isa mask options */
4640 };
4641
4642 /* This table is ordered so that options like -msse4.2 that imply other
4643 ISAs come first. Target string will be displayed in the same order. */
4644 static struct ix86_target_opts isa2_opts[] =
4645 {
4646 { "-mrdpid", OPTION_MASK_ISA_RDPID },
4647 { "-msgx", OPTION_MASK_ISA_SGX },
4648 { "-mavx5124vnniw", OPTION_MASK_ISA_AVX5124VNNIW },
4649 { "-mavx5124fmaps", OPTION_MASK_ISA_AVX5124FMAPS },
4650 { "-mavx512vpopcntdq", OPTION_MASK_ISA_AVX512VPOPCNTDQ }
4651 };
4652 static struct ix86_target_opts isa_opts[] =
4653 {
4654 { "-mavx512vbmi", OPTION_MASK_ISA_AVX512VBMI },
4655 { "-mavx512ifma", OPTION_MASK_ISA_AVX512IFMA },
4656 { "-mavx512vl", OPTION_MASK_ISA_AVX512VL },
4657 { "-mavx512bw", OPTION_MASK_ISA_AVX512BW },
4658 { "-mavx512dq", OPTION_MASK_ISA_AVX512DQ },
4659 { "-mavx512er", OPTION_MASK_ISA_AVX512ER },
4660 { "-mavx512pf", OPTION_MASK_ISA_AVX512PF },
4661 { "-mavx512cd", OPTION_MASK_ISA_AVX512CD },
4662 { "-mavx512f", OPTION_MASK_ISA_AVX512F },
4663 { "-mavx2", OPTION_MASK_ISA_AVX2 },
4664 { "-mfma", OPTION_MASK_ISA_FMA },
4665 { "-mxop", OPTION_MASK_ISA_XOP },
4666 { "-mfma4", OPTION_MASK_ISA_FMA4 },
4667 { "-mf16c", OPTION_MASK_ISA_F16C },
4668 { "-mavx", OPTION_MASK_ISA_AVX },
4669 /* { "-msse4" OPTION_MASK_ISA_SSE4 }, */
4670 { "-msse4.2", OPTION_MASK_ISA_SSE4_2 },
4671 { "-msse4.1", OPTION_MASK_ISA_SSE4_1 },
4672 { "-msse4a", OPTION_MASK_ISA_SSE4A },
4673 { "-mssse3", OPTION_MASK_ISA_SSSE3 },
4674 { "-msse3", OPTION_MASK_ISA_SSE3 },
4675 { "-maes", OPTION_MASK_ISA_AES },
4676 { "-msha", OPTION_MASK_ISA_SHA },
4677 { "-mpclmul", OPTION_MASK_ISA_PCLMUL },
4678 { "-msse2", OPTION_MASK_ISA_SSE2 },
4679 { "-msse", OPTION_MASK_ISA_SSE },
4680 { "-m3dnowa", OPTION_MASK_ISA_3DNOW_A },
4681 { "-m3dnow", OPTION_MASK_ISA_3DNOW },
4682 { "-mmmx", OPTION_MASK_ISA_MMX },
4683 { "-mrtm", OPTION_MASK_ISA_RTM },
4684 { "-mprfchw", OPTION_MASK_ISA_PRFCHW },
4685 { "-mrdseed", OPTION_MASK_ISA_RDSEED },
4686 { "-madx", OPTION_MASK_ISA_ADX },
4687 { "-mprefetchwt1", OPTION_MASK_ISA_PREFETCHWT1 },
4688 { "-mclflushopt", OPTION_MASK_ISA_CLFLUSHOPT },
4689 { "-mxsaves", OPTION_MASK_ISA_XSAVES },
4690 { "-mxsavec", OPTION_MASK_ISA_XSAVEC },
4691 { "-mxsaveopt", OPTION_MASK_ISA_XSAVEOPT },
4692 { "-mxsave", OPTION_MASK_ISA_XSAVE },
4693 { "-mabm", OPTION_MASK_ISA_ABM },
4694 { "-mbmi", OPTION_MASK_ISA_BMI },
4695 { "-mbmi2", OPTION_MASK_ISA_BMI2 },
4696 { "-mlzcnt", OPTION_MASK_ISA_LZCNT },
4697 { "-mtbm", OPTION_MASK_ISA_TBM },
4698 { "-mpopcnt", OPTION_MASK_ISA_POPCNT },
4699 { "-mcx16", OPTION_MASK_ISA_CX16 },
4700 { "-msahf", OPTION_MASK_ISA_SAHF },
4701 { "-mmovbe", OPTION_MASK_ISA_MOVBE },
4702 { "-mcrc32", OPTION_MASK_ISA_CRC32 },
4703 { "-mfsgsbase", OPTION_MASK_ISA_FSGSBASE },
4704 { "-mrdrnd", OPTION_MASK_ISA_RDRND },
4705 { "-mmwaitx", OPTION_MASK_ISA_MWAITX },
4706 { "-mclzero", OPTION_MASK_ISA_CLZERO },
4707 { "-mpku", OPTION_MASK_ISA_PKU },
4708 { "-mlwp", OPTION_MASK_ISA_LWP },
4709 { "-mhle", OPTION_MASK_ISA_HLE },
4710 { "-mfxsr", OPTION_MASK_ISA_FXSR },
4711 { "-mmpx", OPTION_MASK_ISA_MPX },
4712 { "-mclwb", OPTION_MASK_ISA_CLWB }
4713 };
4714
4715 /* Flag options. */
4716 static struct ix86_target_opts flag_opts[] =
4717 {
4718 { "-m128bit-long-double", MASK_128BIT_LONG_DOUBLE },
4719 { "-mlong-double-128", MASK_LONG_DOUBLE_128 },
4720 { "-mlong-double-64", MASK_LONG_DOUBLE_64 },
4721 { "-m80387", MASK_80387 },
4722 { "-maccumulate-outgoing-args", MASK_ACCUMULATE_OUTGOING_ARGS },
4723 { "-malign-double", MASK_ALIGN_DOUBLE },
4724 { "-mcld", MASK_CLD },
4725 { "-mfp-ret-in-387", MASK_FLOAT_RETURNS },
4726 { "-mieee-fp", MASK_IEEE_FP },
4727 { "-minline-all-stringops", MASK_INLINE_ALL_STRINGOPS },
4728 { "-minline-stringops-dynamically", MASK_INLINE_STRINGOPS_DYNAMICALLY },
4729 { "-mms-bitfields", MASK_MS_BITFIELD_LAYOUT },
4730 { "-mno-align-stringops", MASK_NO_ALIGN_STRINGOPS },
4731 { "-mno-fancy-math-387", MASK_NO_FANCY_MATH_387 },
4732 { "-mno-push-args", MASK_NO_PUSH_ARGS },
4733 { "-mno-red-zone", MASK_NO_RED_ZONE },
4734 { "-momit-leaf-frame-pointer", MASK_OMIT_LEAF_FRAME_POINTER },
4735 { "-mrecip", MASK_RECIP },
4736 { "-mrtd", MASK_RTD },
4737 { "-msseregparm", MASK_SSEREGPARM },
4738 { "-mstack-arg-probe", MASK_STACK_PROBE },
4739 { "-mtls-direct-seg-refs", MASK_TLS_DIRECT_SEG_REFS },
4740 { "-mvect8-ret-in-mem", MASK_VECT8_RETURNS },
4741 { "-m8bit-idiv", MASK_USE_8BIT_IDIV },
4742 { "-mvzeroupper", MASK_VZEROUPPER },
4743 { "-mstv", MASK_STV },
4744 { "-mavx256-split-unaligned-load", MASK_AVX256_SPLIT_UNALIGNED_LOAD },
4745 { "-mavx256-split-unaligned-store", MASK_AVX256_SPLIT_UNALIGNED_STORE },
4746 { "-mprefer-avx128", MASK_PREFER_AVX128 },
4747 { "-mcall-ms2sysv-xlogues", MASK_CALL_MS2SYSV_XLOGUES }
4748 };
4749
4750 /* Additional flag options. */
4751 static struct ix86_target_opts flag2_opts[] =
4752 {
4753 { "-mgeneral-regs-only", OPTION_MASK_GENERAL_REGS_ONLY },
4754 };
4755
4756 const char *opts[ARRAY_SIZE (isa_opts) + ARRAY_SIZE (isa2_opts)
4757 + ARRAY_SIZE (flag_opts) + ARRAY_SIZE (flag2_opts) + 6][2];
4758
4759 char isa_other[40];
4760 char isa2_other[40];
4761 char flags_other[40];
4762 char flags2_other[40];
4763 unsigned num = 0;
4764 unsigned i, j;
4765 char *ret;
4766 char *ptr;
4767 size_t len;
4768 size_t line_len;
4769 size_t sep_len;
4770 const char *abi;
4771
4772 memset (opts, '\0', sizeof (opts));
4773
4774 /* Add -march= option. */
4775 if (arch)
4776 {
4777 opts[num][0] = "-march=";
4778 opts[num++][1] = arch;
4779 }
4780
4781 /* Add -mtune= option. */
4782 if (tune)
4783 {
4784 opts[num][0] = "-mtune=";
4785 opts[num++][1] = tune;
4786 }
4787
4788 /* Add -m32/-m64/-mx32. */
4789 if ((isa & OPTION_MASK_ISA_64BIT) != 0)
4790 {
4791 if ((isa & OPTION_MASK_ABI_64) != 0)
4792 abi = "-m64";
4793 else
4794 abi = "-mx32";
4795 isa &= ~ (OPTION_MASK_ISA_64BIT
4796 | OPTION_MASK_ABI_64
4797 | OPTION_MASK_ABI_X32);
4798 }
4799 else
4800 abi = "-m32";
4801 opts[num++][0] = abi;
4802
4803 /* Pick out the options in isa2 options. */
4804 for (i = 0; i < ARRAY_SIZE (isa2_opts); i++)
4805 {
4806 if ((isa2 & isa2_opts[i].mask) != 0)
4807 {
4808 opts[num++][0] = isa2_opts[i].option;
4809 isa2 &= ~ isa2_opts[i].mask;
4810 }
4811 }
4812
4813 if (isa2 && add_nl_p)
4814 {
4815 opts[num++][0] = isa2_other;
4816 sprintf (isa2_other, "(other isa2: %#" HOST_WIDE_INT_PRINT "x)", isa2);
4817 }
4818
4819 /* Pick out the options in isa options. */
4820 for (i = 0; i < ARRAY_SIZE (isa_opts); i++)
4821 {
4822 if ((isa & isa_opts[i].mask) != 0)
4823 {
4824 opts[num++][0] = isa_opts[i].option;
4825 isa &= ~ isa_opts[i].mask;
4826 }
4827 }
4828
4829 if (isa && add_nl_p)
4830 {
4831 opts[num++][0] = isa_other;
4832 sprintf (isa_other, "(other isa: %#" HOST_WIDE_INT_PRINT "x)", isa);
4833 }
4834
4835 /* Add flag options. */
4836 for (i = 0; i < ARRAY_SIZE (flag_opts); i++)
4837 {
4838 if ((flags & flag_opts[i].mask) != 0)
4839 {
4840 opts[num++][0] = flag_opts[i].option;
4841 flags &= ~ flag_opts[i].mask;
4842 }
4843 }
4844
4845 if (flags && add_nl_p)
4846 {
4847 opts[num++][0] = flags_other;
4848 sprintf (flags_other, "(other flags: %#x)", flags);
4849 }
4850
4851 /* Add additional flag options. */
4852 for (i = 0; i < ARRAY_SIZE (flag2_opts); i++)
4853 {
4854 if ((flags2 & flag2_opts[i].mask) != 0)
4855 {
4856 opts[num++][0] = flag2_opts[i].option;
4857 flags2 &= ~ flag2_opts[i].mask;
4858 }
4859 }
4860
4861 if (flags2 && add_nl_p)
4862 {
4863 opts[num++][0] = flags2_other;
4864 sprintf (flags2_other, "(other flags2: %#x)", flags2);
4865 }
4866
4867 /* Add -fpmath= option. */
4868 if (fpmath)
4869 {
4870 opts[num][0] = "-mfpmath=";
4871 switch ((int) fpmath)
4872 {
4873 case FPMATH_387:
4874 opts[num++][1] = "387";
4875 break;
4876
4877 case FPMATH_SSE:
4878 opts[num++][1] = "sse";
4879 break;
4880
4881 case FPMATH_387 | FPMATH_SSE:
4882 opts[num++][1] = "sse+387";
4883 break;
4884
4885 default:
4886 gcc_unreachable ();
4887 }
4888 }
4889
4890 /* Any options? */
4891 if (num == 0)
4892 return NULL;
4893
4894 gcc_assert (num < ARRAY_SIZE (opts));
4895
4896 /* Size the string. */
4897 len = 0;
4898 sep_len = (add_nl_p) ? 3 : 1;
4899 for (i = 0; i < num; i++)
4900 {
4901 len += sep_len;
4902 for (j = 0; j < 2; j++)
4903 if (opts[i][j])
4904 len += strlen (opts[i][j]);
4905 }
4906
4907 /* Build the string. */
4908 ret = ptr = (char *) xmalloc (len);
4909 line_len = 0;
4910
4911 for (i = 0; i < num; i++)
4912 {
4913 size_t len2[2];
4914
4915 for (j = 0; j < 2; j++)
4916 len2[j] = (opts[i][j]) ? strlen (opts[i][j]) : 0;
4917
4918 if (i != 0)
4919 {
4920 *ptr++ = ' ';
4921 line_len++;
4922
4923 if (add_nl_p && line_len + len2[0] + len2[1] > 70)
4924 {
4925 *ptr++ = '\\';
4926 *ptr++ = '\n';
4927 line_len = 0;
4928 }
4929 }
4930
4931 for (j = 0; j < 2; j++)
4932 if (opts[i][j])
4933 {
4934 memcpy (ptr, opts[i][j], len2[j]);
4935 ptr += len2[j];
4936 line_len += len2[j];
4937 }
4938 }
4939
4940 *ptr = '\0';
4941 gcc_assert (ret + len >= ptr);
4942
4943 return ret;
4944 }
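
/* Illustrative, hypothetical example: for a 64-bit compile with
   -march=skylake, the string built above might look roughly like
       "-m64 -march=skylake -mtune=skylake -mfpmath=sse"
   with a backslash-newline inserted between options whenever ADD_NL_P is
   set and a line would exceed ~70 columns.  The exact contents depend on
   the isa/flag option tables, so treat this purely as a sketch.  */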
4945
4946 /* Return true if profiling code should be emitted before the
4947 prologue, and false otherwise.
4948 Note: for x86, the "hotfix" case is reported as unsupported via sorry (). */
4949 static bool
4950 ix86_profile_before_prologue (void)
4951 {
4952 return flag_fentry != 0;
4953 }
4954
4955 /* Function that is callable from the debugger to print the current
4956 options. */
4957 void ATTRIBUTE_UNUSED
4958 ix86_debug_options (void)
4959 {
4960 char *opts = ix86_target_string (ix86_isa_flags, ix86_isa_flags2,
4961 target_flags, ix86_target_flags,
4962 ix86_arch_string, ix86_tune_string,
4963 ix86_fpmath, true);
4964
4965 if (opts)
4966 {
4967 fprintf (stderr, "%s\n\n", opts);
4968 free (opts);
4969 }
4970 else
4971 fputs ("<no options>\n\n", stderr);
4972
4973 return;
4974 }
4975
4976 /* Return true if T is one of the bytes we should avoid with
4977 -fmitigate-rop. */
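/* These byte values are the near and far function-return opcodes
   (0xc3/0xc2 = ret / ret imm16, 0xcb/0xca = retf / retf imm16), which
   ROP gadgets typically end with.  */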
4978
4979 static bool
4980 ix86_rop_should_change_byte_p (int t)
4981 {
4982 return t == 0xc2 || t == 0xc3 || t == 0xca || t == 0xcb;
4983 }
4984
4985 static const char *stringop_alg_names[] = {
4986 #define DEF_ENUM
4987 #define DEF_ALG(alg, name) #name,
4988 #include "stringop.def"
4989 #undef DEF_ENUM
4990 #undef DEF_ALG
4991 };
4992
4993 /* Parse parameter string passed to -mmemcpy-strategy= or -mmemset-strategy=.
4994 The string has the following form (or is a comma-separated list of such entries):
4995
4996 strategy_alg:max_size:[align|noalign]
4997
4998 where the full size range for the strategy is either [0, max_size] or
4999 [min_size, max_size], in which min_size is the max_size + 1 of the
5000 preceding range. The last size range must have max_size == -1.
5001
5002 Examples:
5003
5004 1.
5005 -mmemcpy-strategy=libcall:-1:noalign
5006
5007 this is equivalent to (for known size memcpy) -mstringop-strategy=libcall
5008
5009
5010 2.
5011 -mmemset-strategy=rep_8byte:16:noalign,vector_loop:2048:align,libcall:-1:noalign
5012
5013 This is to tell the compiler to use the following strategy for memset
5014 1) when the expected size is between [1, 16], use rep_8byte strategy;
5015 2) when the size is between [17, 2048], use vector_loop;
5016 3) when the size is > 2048, use libcall. */
5017
5018 struct stringop_size_range
5019 {
5020 int max;
5021 stringop_alg alg;
5022 bool noalign;
5023 };
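
/* Illustrative sketch (hypothetical values): the example string
     "rep_8byte:16:noalign,vector_loop:2048:align,libcall:-1:noalign"
   would be parsed by the function below into roughly
     { 16,   rep_prefix_8_byte, true  },
     { 2048, vector_loop,       false },
     { -1,   libcall,           true  },
   which then overrides the default cost-table algs entry by entry.  */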
5024
5025 static void
5026 ix86_parse_stringop_strategy_string (char *strategy_str, bool is_memset)
5027 {
5028 const struct stringop_algs *default_algs;
5029 stringop_size_range input_ranges[MAX_STRINGOP_ALGS];
5030 char *curr_range_str, *next_range_str;
5031 const char *opt = is_memset ? "-mmemset-strategy=" : "-mmemcpy-strategy=";
5032 int i = 0, n = 0;
5033
5034 if (is_memset)
5035 default_algs = &ix86_cost->memset[TARGET_64BIT != 0];
5036 else
5037 default_algs = &ix86_cost->memcpy[TARGET_64BIT != 0];
5038
5039 curr_range_str = strategy_str;
5040
5041 do
5042 {
5043 int maxs;
5044 char alg_name[128];
5045 char align[16];
5046 next_range_str = strchr (curr_range_str, ',');
5047 if (next_range_str)
5048 *next_range_str++ = '\0';
5049
5050 if (3 != sscanf (curr_range_str, "%20[^:]:%d:%10s",
5051 alg_name, &maxs, align))
5052 {
5053 error ("wrong argument %qs to option %qs", curr_range_str, opt);
5054 return;
5055 }
5056
5057 if (n > 0 && (maxs < (input_ranges[n - 1].max + 1) && maxs != -1))
5058 {
5059 error ("size ranges of option %qs should be increasing", opt);
5060 return;
5061 }
5062
5063 for (i = 0; i < last_alg; i++)
5064 if (!strcmp (alg_name, stringop_alg_names[i]))
5065 break;
5066
5067 if (i == last_alg)
5068 {
5069 error ("wrong strategy name %qs specified for option %qs",
5070 alg_name, opt);
5071
5072 auto_vec <const char *> candidates;
5073 for (i = 0; i < last_alg; i++)
5074 if ((stringop_alg) i != rep_prefix_8_byte || TARGET_64BIT)
5075 candidates.safe_push (stringop_alg_names[i]);
5076
5077 char *s;
5078 const char *hint
5079 = candidates_list_and_hint (alg_name, s, candidates);
5080 if (hint)
5081 inform (input_location,
5082 "valid arguments to %qs are: %s; did you mean %qs?",
5083 opt, s, hint);
5084 else
5085 inform (input_location, "valid arguments to %qs are: %s",
5086 opt, s);
5087 XDELETEVEC (s);
5088 return;
5089 }
5090
5091 if ((stringop_alg) i == rep_prefix_8_byte
5092 && !TARGET_64BIT)
5093 {
5094 /* rep; movq isn't available in 32-bit code. */
5095 error ("strategy name %qs specified for option %qs "
5096 "not supported for 32-bit code", alg_name, opt);
5097 return;
5098 }
5099
5100 input_ranges[n].max = maxs;
5101 input_ranges[n].alg = (stringop_alg) i;
5102 if (!strcmp (align, "align"))
5103 input_ranges[n].noalign = false;
5104 else if (!strcmp (align, "noalign"))
5105 input_ranges[n].noalign = true;
5106 else
5107 {
5108 error ("unknown alignment %qs specified for option %qs", align, opt);
5109 return;
5110 }
5111 n++;
5112 curr_range_str = next_range_str;
5113 }
5114 while (curr_range_str);
5115
5116 if (input_ranges[n - 1].max != -1)
5117 {
5118 error ("the max value for the last size range should be -1"
5119 " for option %qs", opt);
5120 return;
5121 }
5122
5123 if (n > MAX_STRINGOP_ALGS)
5124 {
5125 error ("too many size ranges specified in option %qs", opt);
5126 return;
5127 }
5128
5129 /* Now override the default algs array. */
5130 for (i = 0; i < n; i++)
5131 {
5132 *const_cast<int *>(&default_algs->size[i].max) = input_ranges[i].max;
5133 *const_cast<stringop_alg *>(&default_algs->size[i].alg)
5134 = input_ranges[i].alg;
5135 *const_cast<int *>(&default_algs->size[i].noalign)
5136 = input_ranges[i].noalign;
5137 }
5138 }
5139
5140 \f
5141 /* Parse the -mtune-ctrl= option. When DUMP is true,
5142 print the features that are explicitly set. */
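/* For example (feature names here are purely illustrative), a string such
   as "-mtune-ctrl=use_leave,^partial_reg_stall" would enable the first
   feature and clear the second; names not found in
   ix86_tune_feature_names are diagnosed below.  */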
5143
5144 static void
5145 parse_mtune_ctrl_str (bool dump)
5146 {
5147 if (!ix86_tune_ctrl_string)
5148 return;
5149
5150 char *next_feature_string = NULL;
5151 char *curr_feature_string = xstrdup (ix86_tune_ctrl_string);
5152 char *orig = curr_feature_string;
5153 int i;
5154 do
5155 {
5156 bool clear = false;
5157
5158 next_feature_string = strchr (curr_feature_string, ',');
5159 if (next_feature_string)
5160 *next_feature_string++ = '\0';
5161 if (*curr_feature_string == '^')
5162 {
5163 curr_feature_string++;
5164 clear = true;
5165 }
5166 for (i = 0; i < X86_TUNE_LAST; i++)
5167 {
5168 if (!strcmp (curr_feature_string, ix86_tune_feature_names[i]))
5169 {
5170 ix86_tune_features[i] = !clear;
5171 if (dump)
5172 fprintf (stderr, "Explicitly %s feature %s\n",
5173 clear ? "clear" : "set", ix86_tune_feature_names[i]);
5174 break;
5175 }
5176 }
5177 if (i == X86_TUNE_LAST)
5178 error ("unknown parameter to option -mtune-ctrl: %s",
5179 clear ? curr_feature_string - 1 : curr_feature_string);
5180 curr_feature_string = next_feature_string;
5181 }
5182 while (curr_feature_string);
5183 free (orig);
5184 }
5185
5186 /* Helper function to set ix86_tune_features. IX86_TUNE is the
5187 processor type. */
5188
5189 static void
5190 set_ix86_tune_features (enum processor_type ix86_tune, bool dump)
5191 {
5192 unsigned int ix86_tune_mask = 1u << ix86_tune;
5193 int i;
5194
5195 for (i = 0; i < X86_TUNE_LAST; ++i)
5196 {
5197 if (ix86_tune_no_default)
5198 ix86_tune_features[i] = 0;
5199 else
5200 ix86_tune_features[i] = !!(initial_ix86_tune_features[i] & ix86_tune_mask);
5201 }
5202
5203 if (dump)
5204 {
5205 fprintf (stderr, "List of x86 specific tuning parameter names:\n");
5206 for (i = 0; i < X86_TUNE_LAST; i++)
5207 fprintf (stderr, "%s : %s\n", ix86_tune_feature_names[i],
5208 ix86_tune_features[i] ? "on" : "off");
5209 }
5210
5211 parse_mtune_ctrl_str (dump);
5212 }
5213
5214
5215 /* Default align_* from the processor table. */
5216
5217 static void
5218 ix86_default_align (struct gcc_options *opts)
5219 {
5220 if (opts->x_align_loops == 0)
5221 {
5222 opts->x_align_loops = processor_target_table[ix86_tune].align_loop;
5223 align_loops_max_skip = processor_target_table[ix86_tune].align_loop_max_skip;
5224 }
5225 if (opts->x_align_jumps == 0)
5226 {
5227 opts->x_align_jumps = processor_target_table[ix86_tune].align_jump;
5228 align_jumps_max_skip = processor_target_table[ix86_tune].align_jump_max_skip;
5229 }
5230 if (opts->x_align_functions == 0)
5231 {
5232 opts->x_align_functions = processor_target_table[ix86_tune].align_func;
5233 }
5234 }
5235
5236 /* Implement TARGET_OVERRIDE_OPTIONS_AFTER_CHANGE hook. */
5237
5238 static void
5239 ix86_override_options_after_change (void)
5240 {
5241 ix86_default_align (&global_options);
5242 }
5243
5244 /* Override various settings based on options. If MAIN_ARGS_P, the
5245 options are from the command line, otherwise they are from
5246 attributes. Return false if there is an error related to the march
5247 option; otherwise return true. */
5248
5249 static bool
5250 ix86_option_override_internal (bool main_args_p,
5251 struct gcc_options *opts,
5252 struct gcc_options *opts_set)
5253 {
5254 int i;
5255 unsigned int ix86_arch_mask;
5256 const bool ix86_tune_specified = (opts->x_ix86_tune_string != NULL);
5257
5258 #define PTA_3DNOW (HOST_WIDE_INT_1 << 0)
5259 #define PTA_3DNOW_A (HOST_WIDE_INT_1 << 1)
5260 #define PTA_64BIT (HOST_WIDE_INT_1 << 2)
5261 #define PTA_ABM (HOST_WIDE_INT_1 << 3)
5262 #define PTA_AES (HOST_WIDE_INT_1 << 4)
5263 #define PTA_AVX (HOST_WIDE_INT_1 << 5)
5264 #define PTA_BMI (HOST_WIDE_INT_1 << 6)
5265 #define PTA_CX16 (HOST_WIDE_INT_1 << 7)
5266 #define PTA_F16C (HOST_WIDE_INT_1 << 8)
5267 #define PTA_FMA (HOST_WIDE_INT_1 << 9)
5268 #define PTA_FMA4 (HOST_WIDE_INT_1 << 10)
5269 #define PTA_FSGSBASE (HOST_WIDE_INT_1 << 11)
5270 #define PTA_LWP (HOST_WIDE_INT_1 << 12)
5271 #define PTA_LZCNT (HOST_WIDE_INT_1 << 13)
5272 #define PTA_MMX (HOST_WIDE_INT_1 << 14)
5273 #define PTA_MOVBE (HOST_WIDE_INT_1 << 15)
5274 #define PTA_NO_SAHF (HOST_WIDE_INT_1 << 16)
5275 #define PTA_PCLMUL (HOST_WIDE_INT_1 << 17)
5276 #define PTA_POPCNT (HOST_WIDE_INT_1 << 18)
5277 #define PTA_PREFETCH_SSE (HOST_WIDE_INT_1 << 19)
5278 #define PTA_RDRND (HOST_WIDE_INT_1 << 20)
5279 #define PTA_SSE (HOST_WIDE_INT_1 << 21)
5280 #define PTA_SSE2 (HOST_WIDE_INT_1 << 22)
5281 #define PTA_SSE3 (HOST_WIDE_INT_1 << 23)
5282 #define PTA_SSE4_1 (HOST_WIDE_INT_1 << 24)
5283 #define PTA_SSE4_2 (HOST_WIDE_INT_1 << 25)
5284 #define PTA_SSE4A (HOST_WIDE_INT_1 << 26)
5285 #define PTA_SSSE3 (HOST_WIDE_INT_1 << 27)
5286 #define PTA_TBM (HOST_WIDE_INT_1 << 28)
5287 #define PTA_XOP (HOST_WIDE_INT_1 << 29)
5288 #define PTA_AVX2 (HOST_WIDE_INT_1 << 30)
5289 #define PTA_BMI2 (HOST_WIDE_INT_1 << 31)
5290 #define PTA_RTM (HOST_WIDE_INT_1 << 32)
5291 #define PTA_HLE (HOST_WIDE_INT_1 << 33)
5292 #define PTA_PRFCHW (HOST_WIDE_INT_1 << 34)
5293 #define PTA_RDSEED (HOST_WIDE_INT_1 << 35)
5294 #define PTA_ADX (HOST_WIDE_INT_1 << 36)
5295 #define PTA_FXSR (HOST_WIDE_INT_1 << 37)
5296 #define PTA_XSAVE (HOST_WIDE_INT_1 << 38)
5297 #define PTA_XSAVEOPT (HOST_WIDE_INT_1 << 39)
5298 #define PTA_AVX512F (HOST_WIDE_INT_1 << 40)
5299 #define PTA_AVX512ER (HOST_WIDE_INT_1 << 41)
5300 #define PTA_AVX512PF (HOST_WIDE_INT_1 << 42)
5301 #define PTA_AVX512CD (HOST_WIDE_INT_1 << 43)
5302 #define PTA_MPX (HOST_WIDE_INT_1 << 44)
5303 #define PTA_SHA (HOST_WIDE_INT_1 << 45)
5304 #define PTA_PREFETCHWT1 (HOST_WIDE_INT_1 << 46)
5305 #define PTA_CLFLUSHOPT (HOST_WIDE_INT_1 << 47)
5306 #define PTA_XSAVEC (HOST_WIDE_INT_1 << 48)
5307 #define PTA_XSAVES (HOST_WIDE_INT_1 << 49)
5308 #define PTA_AVX512DQ (HOST_WIDE_INT_1 << 50)
5309 #define PTA_AVX512BW (HOST_WIDE_INT_1 << 51)
5310 #define PTA_AVX512VL (HOST_WIDE_INT_1 << 52)
5311 #define PTA_AVX512IFMA (HOST_WIDE_INT_1 << 53)
5312 #define PTA_AVX512VBMI (HOST_WIDE_INT_1 << 54)
5313 #define PTA_CLWB (HOST_WIDE_INT_1 << 55)
5314 #define PTA_MWAITX (HOST_WIDE_INT_1 << 56)
5315 #define PTA_CLZERO (HOST_WIDE_INT_1 << 57)
5316 #define PTA_NO_80387 (HOST_WIDE_INT_1 << 58)
5317 #define PTA_PKU (HOST_WIDE_INT_1 << 59)
5318 #define PTA_AVX5124VNNIW (HOST_WIDE_INT_1 << 60)
5319 #define PTA_AVX5124FMAPS (HOST_WIDE_INT_1 << 61)
5320 #define PTA_AVX512VPOPCNTDQ (HOST_WIDE_INT_1 << 62)
5321 #define PTA_SGX (HOST_WIDE_INT_1 << 63)
5322
5323 #define PTA_CORE2 \
5324 (PTA_64BIT | PTA_MMX | PTA_SSE | PTA_SSE2 | PTA_SSE3 | PTA_SSSE3 \
5325 | PTA_CX16 | PTA_FXSR)
5326 #define PTA_NEHALEM \
5327 (PTA_CORE2 | PTA_SSE4_1 | PTA_SSE4_2 | PTA_POPCNT)
5328 #define PTA_WESTMERE \
5329 (PTA_NEHALEM | PTA_AES | PTA_PCLMUL)
5330 #define PTA_SANDYBRIDGE \
5331 (PTA_WESTMERE | PTA_AVX | PTA_XSAVE | PTA_XSAVEOPT)
5332 #define PTA_IVYBRIDGE \
5333 (PTA_SANDYBRIDGE | PTA_FSGSBASE | PTA_RDRND | PTA_F16C)
5334 #define PTA_HASWELL \
5335 (PTA_IVYBRIDGE | PTA_AVX2 | PTA_BMI | PTA_BMI2 | PTA_LZCNT \
5336 | PTA_FMA | PTA_MOVBE | PTA_HLE)
5337 #define PTA_BROADWELL \
5338 (PTA_HASWELL | PTA_ADX | PTA_PRFCHW | PTA_RDSEED)
5339 #define PTA_SKYLAKE \
5340 (PTA_BROADWELL | PTA_CLFLUSHOPT | PTA_XSAVEC | PTA_XSAVES)
5341 #define PTA_SKYLAKE_AVX512 \
5342 (PTA_SKYLAKE | PTA_AVX512F | PTA_AVX512CD | PTA_AVX512VL \
5343 | PTA_AVX512BW | PTA_AVX512DQ | PTA_PKU)
5344 #define PTA_KNL \
5345 (PTA_BROADWELL | PTA_AVX512PF | PTA_AVX512ER | PTA_AVX512F | PTA_AVX512CD)
5346 #define PTA_BONNELL \
5347 (PTA_CORE2 | PTA_MOVBE)
5348 #define PTA_SILVERMONT \
5349 (PTA_WESTMERE | PTA_MOVBE)
5350
5351 /* If this reaches 64, we need to widen the struct pta flags field below. */
5352
5353 static struct pta
5354 {
5355 const char *const name; /* processor name or nickname. */
5356 const enum processor_type processor;
5357 const enum attr_cpu schedule;
5358 const unsigned HOST_WIDE_INT flags;
5359 }
5360 const processor_alias_table[] =
5361 {
5362 {"i386", PROCESSOR_I386, CPU_NONE, 0},
5363 {"i486", PROCESSOR_I486, CPU_NONE, 0},
5364 {"i586", PROCESSOR_PENTIUM, CPU_PENTIUM, 0},
5365 {"pentium", PROCESSOR_PENTIUM, CPU_PENTIUM, 0},
5366 {"lakemont", PROCESSOR_LAKEMONT, CPU_PENTIUM, PTA_NO_80387},
5367 {"pentium-mmx", PROCESSOR_PENTIUM, CPU_PENTIUM, PTA_MMX},
5368 {"winchip-c6", PROCESSOR_I486, CPU_NONE, PTA_MMX},
5369 {"winchip2", PROCESSOR_I486, CPU_NONE, PTA_MMX | PTA_3DNOW},
5370 {"c3", PROCESSOR_I486, CPU_NONE, PTA_MMX | PTA_3DNOW},
5371 {"samuel-2", PROCESSOR_I486, CPU_NONE, PTA_MMX | PTA_3DNOW},
5372 {"c3-2", PROCESSOR_PENTIUMPRO, CPU_PENTIUMPRO,
5373 PTA_MMX | PTA_SSE | PTA_FXSR},
5374 {"nehemiah", PROCESSOR_PENTIUMPRO, CPU_PENTIUMPRO,
5375 PTA_MMX | PTA_SSE | PTA_FXSR},
5376 {"c7", PROCESSOR_PENTIUMPRO, CPU_PENTIUMPRO,
5377 PTA_MMX | PTA_SSE | PTA_SSE2 | PTA_SSE3 | PTA_FXSR},
5378 {"esther", PROCESSOR_PENTIUMPRO, CPU_PENTIUMPRO,
5379 PTA_MMX | PTA_SSE | PTA_SSE2 | PTA_SSE3 | PTA_FXSR},
5380 {"i686", PROCESSOR_PENTIUMPRO, CPU_PENTIUMPRO, 0},
5381 {"pentiumpro", PROCESSOR_PENTIUMPRO, CPU_PENTIUMPRO, 0},
5382 {"pentium2", PROCESSOR_PENTIUMPRO, CPU_PENTIUMPRO, PTA_MMX | PTA_FXSR},
5383 {"pentium3", PROCESSOR_PENTIUMPRO, CPU_PENTIUMPRO,
5384 PTA_MMX | PTA_SSE | PTA_FXSR},
5385 {"pentium3m", PROCESSOR_PENTIUMPRO, CPU_PENTIUMPRO,
5386 PTA_MMX | PTA_SSE | PTA_FXSR},
5387 {"pentium-m", PROCESSOR_PENTIUMPRO, CPU_PENTIUMPRO,
5388 PTA_MMX | PTA_SSE | PTA_SSE2 | PTA_FXSR},
5389 {"pentium4", PROCESSOR_PENTIUM4, CPU_NONE,
5390 PTA_MMX | PTA_SSE | PTA_SSE2 | PTA_FXSR},
5391 {"pentium4m", PROCESSOR_PENTIUM4, CPU_NONE,
5392 PTA_MMX | PTA_SSE | PTA_SSE2 | PTA_FXSR},
5393 {"prescott", PROCESSOR_NOCONA, CPU_NONE,
5394 PTA_MMX | PTA_SSE | PTA_SSE2 | PTA_SSE3 | PTA_FXSR},
5395 {"nocona", PROCESSOR_NOCONA, CPU_NONE,
5396 PTA_64BIT | PTA_MMX | PTA_SSE | PTA_SSE2 | PTA_SSE3
5397 | PTA_CX16 | PTA_NO_SAHF | PTA_FXSR},
5398 {"core2", PROCESSOR_CORE2, CPU_CORE2, PTA_CORE2},
5399 {"nehalem", PROCESSOR_NEHALEM, CPU_NEHALEM, PTA_NEHALEM},
5400 {"corei7", PROCESSOR_NEHALEM, CPU_NEHALEM, PTA_NEHALEM},
5401 {"westmere", PROCESSOR_NEHALEM, CPU_NEHALEM, PTA_WESTMERE},
5402 {"sandybridge", PROCESSOR_SANDYBRIDGE, CPU_NEHALEM,
5403 PTA_SANDYBRIDGE},
5404 {"corei7-avx", PROCESSOR_SANDYBRIDGE, CPU_NEHALEM,
5405 PTA_SANDYBRIDGE},
5406 {"ivybridge", PROCESSOR_SANDYBRIDGE, CPU_NEHALEM,
5407 PTA_IVYBRIDGE},
5408 {"core-avx-i", PROCESSOR_SANDYBRIDGE, CPU_NEHALEM,
5409 PTA_IVYBRIDGE},
5410 {"haswell", PROCESSOR_HASWELL, CPU_HASWELL, PTA_HASWELL},
5411 {"core-avx2", PROCESSOR_HASWELL, CPU_HASWELL, PTA_HASWELL},
5412 {"broadwell", PROCESSOR_HASWELL, CPU_HASWELL, PTA_BROADWELL},
5413 {"skylake", PROCESSOR_HASWELL, CPU_HASWELL, PTA_SKYLAKE},
5414 {"skylake-avx512", PROCESSOR_HASWELL, CPU_HASWELL, PTA_SKYLAKE_AVX512},
5415 {"bonnell", PROCESSOR_BONNELL, CPU_ATOM, PTA_BONNELL},
5416 {"atom", PROCESSOR_BONNELL, CPU_ATOM, PTA_BONNELL},
5417 {"silvermont", PROCESSOR_SILVERMONT, CPU_SLM, PTA_SILVERMONT},
5418 {"slm", PROCESSOR_SILVERMONT, CPU_SLM, PTA_SILVERMONT},
5419 {"knl", PROCESSOR_KNL, CPU_SLM, PTA_KNL},
5420 {"intel", PROCESSOR_INTEL, CPU_SLM, PTA_NEHALEM},
5421 {"geode", PROCESSOR_GEODE, CPU_GEODE,
5422 PTA_MMX | PTA_3DNOW | PTA_3DNOW_A | PTA_PREFETCH_SSE},
5423 {"k6", PROCESSOR_K6, CPU_K6, PTA_MMX},
5424 {"k6-2", PROCESSOR_K6, CPU_K6, PTA_MMX | PTA_3DNOW},
5425 {"k6-3", PROCESSOR_K6, CPU_K6, PTA_MMX | PTA_3DNOW},
5426 {"athlon", PROCESSOR_ATHLON, CPU_ATHLON,
5427 PTA_MMX | PTA_3DNOW | PTA_3DNOW_A | PTA_PREFETCH_SSE},
5428 {"athlon-tbird", PROCESSOR_ATHLON, CPU_ATHLON,
5429 PTA_MMX | PTA_3DNOW | PTA_3DNOW_A | PTA_PREFETCH_SSE},
5430 {"athlon-4", PROCESSOR_ATHLON, CPU_ATHLON,
5431 PTA_MMX | PTA_3DNOW | PTA_3DNOW_A | PTA_SSE | PTA_FXSR},
5432 {"athlon-xp", PROCESSOR_ATHLON, CPU_ATHLON,
5433 PTA_MMX | PTA_3DNOW | PTA_3DNOW_A | PTA_SSE | PTA_FXSR},
5434 {"athlon-mp", PROCESSOR_ATHLON, CPU_ATHLON,
5435 PTA_MMX | PTA_3DNOW | PTA_3DNOW_A | PTA_SSE | PTA_FXSR},
5436 {"x86-64", PROCESSOR_K8, CPU_K8,
5437 PTA_64BIT | PTA_MMX | PTA_SSE | PTA_SSE2 | PTA_NO_SAHF | PTA_FXSR},
5438 {"eden-x2", PROCESSOR_K8, CPU_K8,
5439 PTA_64BIT | PTA_MMX | PTA_SSE | PTA_SSE2 | PTA_SSE3 | PTA_FXSR},
5440 {"nano", PROCESSOR_K8, CPU_K8,
5441 PTA_64BIT | PTA_MMX | PTA_SSE | PTA_SSE2 | PTA_SSE3
5442 | PTA_SSSE3 | PTA_FXSR},
5443 {"nano-1000", PROCESSOR_K8, CPU_K8,
5444 PTA_64BIT | PTA_MMX | PTA_SSE | PTA_SSE2 | PTA_SSE3
5445 | PTA_SSSE3 | PTA_FXSR},
5446 {"nano-2000", PROCESSOR_K8, CPU_K8,
5447 PTA_64BIT | PTA_MMX | PTA_SSE | PTA_SSE2 | PTA_SSE3
5448 | PTA_SSSE3 | PTA_FXSR},
5449 {"nano-3000", PROCESSOR_K8, CPU_K8,
5450 PTA_64BIT | PTA_MMX | PTA_SSE | PTA_SSE2 | PTA_SSE3
5451 | PTA_SSSE3 | PTA_SSE4_1 | PTA_FXSR},
5452 {"nano-x2", PROCESSOR_K8, CPU_K8,
5453 PTA_64BIT | PTA_MMX | PTA_SSE | PTA_SSE2 | PTA_SSE3
5454 | PTA_SSSE3 | PTA_SSE4_1 | PTA_FXSR},
5455 {"eden-x4", PROCESSOR_K8, CPU_K8,
5456 PTA_64BIT | PTA_MMX | PTA_SSE | PTA_SSE2 | PTA_SSE3
5457 | PTA_SSSE3 | PTA_SSE4_1 | PTA_FXSR},
5458 {"nano-x4", PROCESSOR_K8, CPU_K8,
5459 PTA_64BIT | PTA_MMX | PTA_SSE | PTA_SSE2 | PTA_SSE3
5460 | PTA_SSSE3 | PTA_SSE4_1 | PTA_FXSR},
5461 {"k8", PROCESSOR_K8, CPU_K8,
5462 PTA_64BIT | PTA_MMX | PTA_3DNOW | PTA_3DNOW_A | PTA_SSE
5463 | PTA_SSE2 | PTA_NO_SAHF | PTA_FXSR},
5464 {"k8-sse3", PROCESSOR_K8, CPU_K8,
5465 PTA_64BIT | PTA_MMX | PTA_3DNOW | PTA_3DNOW_A | PTA_SSE
5466 | PTA_SSE2 | PTA_SSE3 | PTA_NO_SAHF | PTA_FXSR},
5467 {"opteron", PROCESSOR_K8, CPU_K8,
5468 PTA_64BIT | PTA_MMX | PTA_3DNOW | PTA_3DNOW_A | PTA_SSE
5469 | PTA_SSE2 | PTA_NO_SAHF | PTA_FXSR},
5470 {"opteron-sse3", PROCESSOR_K8, CPU_K8,
5471 PTA_64BIT | PTA_MMX | PTA_3DNOW | PTA_3DNOW_A | PTA_SSE
5472 | PTA_SSE2 | PTA_SSE3 | PTA_NO_SAHF | PTA_FXSR},
5473 {"athlon64", PROCESSOR_K8, CPU_K8,
5474 PTA_64BIT | PTA_MMX | PTA_3DNOW | PTA_3DNOW_A | PTA_SSE
5475 | PTA_SSE2 | PTA_NO_SAHF | PTA_FXSR},
5476 {"athlon64-sse3", PROCESSOR_K8, CPU_K8,
5477 PTA_64BIT | PTA_MMX | PTA_3DNOW | PTA_3DNOW_A | PTA_SSE
5478 | PTA_SSE2 | PTA_SSE3 | PTA_NO_SAHF | PTA_FXSR},
5479 {"athlon-fx", PROCESSOR_K8, CPU_K8,
5480 PTA_64BIT | PTA_MMX | PTA_3DNOW | PTA_3DNOW_A | PTA_SSE
5481 | PTA_SSE2 | PTA_NO_SAHF | PTA_FXSR},
5482 {"amdfam10", PROCESSOR_AMDFAM10, CPU_AMDFAM10,
5483 PTA_64BIT | PTA_MMX | PTA_3DNOW | PTA_3DNOW_A | PTA_SSE | PTA_SSE2
5484 | PTA_SSE3 | PTA_SSE4A | PTA_CX16 | PTA_ABM | PTA_PRFCHW | PTA_FXSR},
5485 {"barcelona", PROCESSOR_AMDFAM10, CPU_AMDFAM10,
5486 PTA_64BIT | PTA_MMX | PTA_3DNOW | PTA_3DNOW_A | PTA_SSE | PTA_SSE2
5487 | PTA_SSE3 | PTA_SSE4A | PTA_CX16 | PTA_ABM | PTA_PRFCHW | PTA_FXSR},
5488 {"bdver1", PROCESSOR_BDVER1, CPU_BDVER1,
5489 PTA_64BIT | PTA_MMX | PTA_SSE | PTA_SSE2 | PTA_SSE3
5490 | PTA_SSE4A | PTA_CX16 | PTA_ABM | PTA_SSSE3 | PTA_SSE4_1
5491 | PTA_SSE4_2 | PTA_AES | PTA_PCLMUL | PTA_AVX | PTA_FMA4
5492 | PTA_XOP | PTA_LWP | PTA_PRFCHW | PTA_FXSR | PTA_XSAVE},
5493 {"bdver2", PROCESSOR_BDVER2, CPU_BDVER2,
5494 PTA_64BIT | PTA_MMX | PTA_SSE | PTA_SSE2 | PTA_SSE3
5495 | PTA_SSE4A | PTA_CX16 | PTA_ABM | PTA_SSSE3 | PTA_SSE4_1
5496 | PTA_SSE4_2 | PTA_AES | PTA_PCLMUL | PTA_AVX | PTA_FMA4
5497 | PTA_XOP | PTA_LWP | PTA_BMI | PTA_TBM | PTA_F16C
5498 | PTA_FMA | PTA_PRFCHW | PTA_FXSR | PTA_XSAVE},
5499 {"bdver3", PROCESSOR_BDVER3, CPU_BDVER3,
5500 PTA_64BIT | PTA_MMX | PTA_SSE | PTA_SSE2 | PTA_SSE3
5501 | PTA_SSE4A | PTA_CX16 | PTA_ABM | PTA_SSSE3 | PTA_SSE4_1
5502 | PTA_SSE4_2 | PTA_AES | PTA_PCLMUL | PTA_AVX | PTA_FMA4
5503 | PTA_XOP | PTA_LWP | PTA_BMI | PTA_TBM | PTA_F16C
5504 | PTA_FMA | PTA_PRFCHW | PTA_FXSR | PTA_XSAVE
5505 | PTA_XSAVEOPT | PTA_FSGSBASE},
5506 {"bdver4", PROCESSOR_BDVER4, CPU_BDVER4,
5507 PTA_64BIT | PTA_MMX | PTA_SSE | PTA_SSE2 | PTA_SSE3
5508 | PTA_SSE4A | PTA_CX16 | PTA_ABM | PTA_SSSE3 | PTA_SSE4_1
5509 | PTA_SSE4_2 | PTA_AES | PTA_PCLMUL | PTA_AVX | PTA_AVX2
5510 | PTA_FMA4 | PTA_XOP | PTA_LWP | PTA_BMI | PTA_BMI2
5511 | PTA_TBM | PTA_F16C | PTA_FMA | PTA_PRFCHW | PTA_FXSR
5512 | PTA_XSAVE | PTA_XSAVEOPT | PTA_FSGSBASE | PTA_RDRND
5513 | PTA_MOVBE | PTA_MWAITX},
5514 {"znver1", PROCESSOR_ZNVER1, CPU_ZNVER1,
5515 PTA_64BIT | PTA_MMX | PTA_SSE | PTA_SSE2 | PTA_SSE3
5516 | PTA_SSE4A | PTA_CX16 | PTA_ABM | PTA_SSSE3 | PTA_SSE4_1
5517 | PTA_SSE4_2 | PTA_AES | PTA_PCLMUL | PTA_AVX | PTA_AVX2
5518 | PTA_BMI | PTA_BMI2 | PTA_F16C | PTA_FMA | PTA_PRFCHW
5519 | PTA_FXSR | PTA_XSAVE | PTA_XSAVEOPT | PTA_FSGSBASE
5520 | PTA_RDRND | PTA_MOVBE | PTA_MWAITX | PTA_ADX | PTA_RDSEED
5521 | PTA_CLZERO | PTA_CLFLUSHOPT | PTA_XSAVEC | PTA_XSAVES
5522 | PTA_SHA | PTA_LZCNT | PTA_POPCNT},
5523 {"btver1", PROCESSOR_BTVER1, CPU_GENERIC,
5524 PTA_64BIT | PTA_MMX | PTA_SSE | PTA_SSE2 | PTA_SSE3
5525 | PTA_SSSE3 | PTA_SSE4A | PTA_ABM | PTA_CX16 | PTA_PRFCHW
5526 | PTA_FXSR | PTA_XSAVE},
5527 {"btver2", PROCESSOR_BTVER2, CPU_BTVER2,
5528 PTA_64BIT | PTA_MMX | PTA_SSE | PTA_SSE2 | PTA_SSE3
5529 | PTA_SSSE3 | PTA_SSE4A | PTA_ABM | PTA_CX16 | PTA_SSE4_1
5530 | PTA_SSE4_2 | PTA_AES | PTA_PCLMUL | PTA_AVX
5531 | PTA_BMI | PTA_F16C | PTA_MOVBE | PTA_PRFCHW
5532 | PTA_FXSR | PTA_XSAVE | PTA_XSAVEOPT},
5533
5534 {"generic", PROCESSOR_GENERIC, CPU_GENERIC,
5535 PTA_64BIT
5536 | PTA_HLE /* flags are only used for -march switch. */ },
5537 };
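
/* Illustration: with an entry such as {"haswell", PROCESSOR_HASWELL,
   CPU_HASWELL, PTA_HASWELL}, -march=haswell selects the Haswell processor
   and scheduling model, and the table scan further below expands each
   PTA_* bit into the corresponding OPTION_MASK_ISA_* flag (AVX2, BMI,
   FMA, ...) unless the user set that ISA flag explicitly.  */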
5538
5539 /* -mrecip options. */
5540 static struct
5541 {
5542 const char *string; /* option name */
5543 unsigned int mask; /* mask bits to set */
5544 }
5545 const recip_options[] =
5546 {
5547 { "all", RECIP_MASK_ALL },
5548 { "none", RECIP_MASK_NONE },
5549 { "div", RECIP_MASK_DIV },
5550 { "sqrt", RECIP_MASK_SQRT },
5551 { "vec-div", RECIP_MASK_VEC_DIV },
5552 { "vec-sqrt", RECIP_MASK_VEC_SQRT },
5553 };
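
/* Illustration: these masks are consumed when -mrecip= is parsed later in
   this function; e.g. "vec-sqrt" selects RECIP_MASK_VEC_SQRT, while "all"
   turns on every reciprocal-approximation mask at once.  */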
5554
5555 int const pta_size = ARRAY_SIZE (processor_alias_table);
5556
5557 /* Turn off both OPTION_MASK_ABI_64 and OPTION_MASK_ABI_X32 if
5558 TARGET_64BIT_DEFAULT is true and TARGET_64BIT is false. */
5559 if (TARGET_64BIT_DEFAULT && !TARGET_64BIT_P (opts->x_ix86_isa_flags))
5560 opts->x_ix86_isa_flags &= ~(OPTION_MASK_ABI_64 | OPTION_MASK_ABI_X32);
5561 #ifdef TARGET_BI_ARCH
5562 else
5563 {
5564 #if TARGET_BI_ARCH == 1
5565 /* When TARGET_BI_ARCH == 1, by default, OPTION_MASK_ABI_64
5566 is on and OPTION_MASK_ABI_X32 is off. We turn off
5567 OPTION_MASK_ABI_64 if OPTION_MASK_ABI_X32 is turned on by
5568 -mx32. */
5569 if (TARGET_X32_P (opts->x_ix86_isa_flags))
5570 opts->x_ix86_isa_flags &= ~OPTION_MASK_ABI_64;
5571 #else
5572 /* When TARGET_BI_ARCH == 2, by default, OPTION_MASK_ABI_X32 is
5573 on and OPTION_MASK_ABI_64 is off. We turn off
5574 OPTION_MASK_ABI_X32 if OPTION_MASK_ABI_64 is turned on by
5575 -m64 or OPTION_MASK_CODE16 is turned on by -m16. */
5576 if (TARGET_LP64_P (opts->x_ix86_isa_flags)
5577 || TARGET_16BIT_P (opts->x_ix86_isa_flags))
5578 opts->x_ix86_isa_flags &= ~OPTION_MASK_ABI_X32;
5579 #endif
5580 if (TARGET_64BIT_P (opts->x_ix86_isa_flags)
5581 && TARGET_IAMCU_P (opts->x_target_flags))
5582 sorry ("Intel MCU psABI isn%'t supported in %s mode",
5583 TARGET_X32_P (opts->x_ix86_isa_flags) ? "x32" : "64-bit");
5584 }
5585 #endif
5586
5587 if (TARGET_X32_P (opts->x_ix86_isa_flags))
5588 {
5589 /* Always turn on OPTION_MASK_ISA_64BIT and turn off
5590 OPTION_MASK_ABI_64 for TARGET_X32. */
5591 opts->x_ix86_isa_flags |= OPTION_MASK_ISA_64BIT;
5592 opts->x_ix86_isa_flags &= ~OPTION_MASK_ABI_64;
5593 }
5594 else if (TARGET_16BIT_P (opts->x_ix86_isa_flags))
5595 opts->x_ix86_isa_flags &= ~(OPTION_MASK_ISA_64BIT
5596 | OPTION_MASK_ABI_X32
5597 | OPTION_MASK_ABI_64);
5598 else if (TARGET_LP64_P (opts->x_ix86_isa_flags))
5599 {
5600 /* Always turn on OPTION_MASK_ISA_64BIT and turn off
5601 OPTION_MASK_ABI_X32 for TARGET_LP64. */
5602 opts->x_ix86_isa_flags |= OPTION_MASK_ISA_64BIT;
5603 opts->x_ix86_isa_flags &= ~OPTION_MASK_ABI_X32;
5604 }
5605
5606 #ifdef SUBTARGET_OVERRIDE_OPTIONS
5607 SUBTARGET_OVERRIDE_OPTIONS;
5608 #endif
5609
5610 #ifdef SUBSUBTARGET_OVERRIDE_OPTIONS
5611 SUBSUBTARGET_OVERRIDE_OPTIONS;
5612 #endif
5613
5614 /* On Darwin (Mach-O), -fPIC is the default for x86_64. */
5615 if (TARGET_MACHO && TARGET_64BIT_P (opts->x_ix86_isa_flags))
5616 opts->x_flag_pic = 2;
5617
5618 /* Need to check -mtune=generic first. */
5619 if (opts->x_ix86_tune_string)
5620 {
5621 /* As special support for cross compilers we read -mtune=native
5622 as -mtune=generic. With native compilers we won't see
5623 -mtune=native, as the driver will already have changed it. */
5624 if (!strcmp (opts->x_ix86_tune_string, "native"))
5625 {
5626 opts->x_ix86_tune_string = "generic";
5627 }
5628 else if (!strcmp (opts->x_ix86_tune_string, "x86-64"))
5629 warning (OPT_Wdeprecated,
5630 main_args_p
5631 ? G_("%<-mtune=x86-64%> is deprecated; use %<-mtune=k8%> "
5632 "or %<-mtune=generic%> instead as appropriate")
5633 : G_("%<target(\"tune=x86-64\")%> is deprecated; use "
5634 "%<target(\"tune=k8\")%> or %<target(\"tune=generic\")%>"
5635 " instead as appropriate"));
5636 }
5637 else
5638 {
5639 if (opts->x_ix86_arch_string)
5640 opts->x_ix86_tune_string = opts->x_ix86_arch_string;
5641 if (!opts->x_ix86_tune_string)
5642 {
5643 opts->x_ix86_tune_string
5644 = processor_target_table[TARGET_CPU_DEFAULT].name;
5645 ix86_tune_defaulted = 1;
5646 }
5647
5648 /* opts->x_ix86_tune_string is set to opts->x_ix86_arch_string
5649 or defaulted. We need to use a sensible tune option. */
5650 if (!strcmp (opts->x_ix86_tune_string, "x86-64"))
5651 {
5652 opts->x_ix86_tune_string = "generic";
5653 }
5654 }
5655
5656 if (opts->x_ix86_stringop_alg == rep_prefix_8_byte
5657 && !TARGET_64BIT_P (opts->x_ix86_isa_flags))
5658 {
5659 /* rep; movq isn't available in 32-bit code. */
5660 error ("-mstringop-strategy=rep_8byte not supported for 32-bit code");
5661 opts->x_ix86_stringop_alg = no_stringop;
5662 }
5663
5664 if (!opts->x_ix86_arch_string)
5665 opts->x_ix86_arch_string
5666 = TARGET_64BIT_P (opts->x_ix86_isa_flags)
5667 ? "x86-64" : SUBTARGET32_DEFAULT_CPU;
5668 else
5669 ix86_arch_specified = 1;
5670
5671 if (opts_set->x_ix86_pmode)
5672 {
5673 if ((TARGET_LP64_P (opts->x_ix86_isa_flags)
5674 && opts->x_ix86_pmode == PMODE_SI)
5675 || (!TARGET_64BIT_P (opts->x_ix86_isa_flags)
5676 && opts->x_ix86_pmode == PMODE_DI))
5677 error ("address mode %qs not supported in the %s bit mode",
5678 TARGET_64BIT_P (opts->x_ix86_isa_flags) ? "short" : "long",
5679 TARGET_64BIT_P (opts->x_ix86_isa_flags) ? "64" : "32");
5680 }
5681 else
5682 opts->x_ix86_pmode = TARGET_LP64_P (opts->x_ix86_isa_flags)
5683 ? PMODE_DI : PMODE_SI;
5684
5685 if (!opts_set->x_ix86_abi)
5686 opts->x_ix86_abi = DEFAULT_ABI;
5687
5688 /* For targets using the MS ABI, enable ms-extensions unless they were
5689 explicitly turned off. For non-MS ABIs we turn this option
5690 off. */
5691 if (!opts_set->x_flag_ms_extensions)
5692 opts->x_flag_ms_extensions = (MS_ABI == DEFAULT_ABI);
5693
5694 if (opts_set->x_ix86_cmodel)
5695 {
5696 switch (opts->x_ix86_cmodel)
5697 {
5698 case CM_SMALL:
5699 case CM_SMALL_PIC:
5700 if (opts->x_flag_pic)
5701 opts->x_ix86_cmodel = CM_SMALL_PIC;
5702 if (!TARGET_64BIT_P (opts->x_ix86_isa_flags))
5703 error ("code model %qs not supported in the %s bit mode",
5704 "small", "32");
5705 break;
5706
5707 case CM_MEDIUM:
5708 case CM_MEDIUM_PIC:
5709 if (opts->x_flag_pic)
5710 opts->x_ix86_cmodel = CM_MEDIUM_PIC;
5711 if (!TARGET_64BIT_P (opts->x_ix86_isa_flags))
5712 error ("code model %qs not supported in the %s bit mode",
5713 "medium", "32");
5714 else if (TARGET_X32_P (opts->x_ix86_isa_flags))
5715 error ("code model %qs not supported in x32 mode",
5716 "medium");
5717 break;
5718
5719 case CM_LARGE:
5720 case CM_LARGE_PIC:
5721 if (opts->x_flag_pic)
5722 opts->x_ix86_cmodel = CM_LARGE_PIC;
5723 if (!TARGET_64BIT_P (opts->x_ix86_isa_flags))
5724 error ("code model %qs not supported in the %s bit mode",
5725 "large", "32");
5726 else if (TARGET_X32_P (opts->x_ix86_isa_flags))
5727 error ("code model %qs not supported in x32 mode",
5728 "large");
5729 break;
5730
5731 case CM_32:
5732 if (opts->x_flag_pic)
5733 error ("code model %s does not support PIC mode", "32");
5734 if (TARGET_64BIT_P (opts->x_ix86_isa_flags))
5735 error ("code model %qs not supported in the %s bit mode",
5736 "32", "64");
5737 break;
5738
5739 case CM_KERNEL:
5740 if (opts->x_flag_pic)
5741 {
5742 error ("code model %s does not support PIC mode", "kernel");
5743 opts->x_ix86_cmodel = CM_32;
5744 }
5745 if (!TARGET_64BIT_P (opts->x_ix86_isa_flags))
5746 error ("code model %qs not supported in the %s bit mode",
5747 "kernel", "32");
5748 break;
5749
5750 default:
5751 gcc_unreachable ();
5752 }
5753 }
5754 else
5755 {
5756 /* For TARGET_64BIT and MS_ABI, force pic on, in order to enable the
5757 use of rip-relative addressing. This eliminates fixups that
5758 would otherwise be needed if this object is to be placed in a
5759 DLL, and is essentially just as efficient as direct addressing. */
5760 if (TARGET_64BIT_P (opts->x_ix86_isa_flags)
5761 && (TARGET_RDOS || TARGET_PECOFF))
5762 opts->x_ix86_cmodel = CM_MEDIUM_PIC, opts->x_flag_pic = 1;
5763 else if (TARGET_64BIT_P (opts->x_ix86_isa_flags))
5764 opts->x_ix86_cmodel = opts->x_flag_pic ? CM_SMALL_PIC : CM_SMALL;
5765 else
5766 opts->x_ix86_cmodel = CM_32;
5767 }
5768 if (TARGET_MACHO && opts->x_ix86_asm_dialect == ASM_INTEL)
5769 {
5770 error ("-masm=intel not supported in this configuration");
5771 opts->x_ix86_asm_dialect = ASM_ATT;
5772 }
5773 if ((TARGET_64BIT_P (opts->x_ix86_isa_flags) != 0)
5774 != ((opts->x_ix86_isa_flags & OPTION_MASK_ISA_64BIT) != 0))
5775 sorry ("%i-bit mode not compiled in",
5776 (opts->x_ix86_isa_flags & OPTION_MASK_ISA_64BIT) ? 64 : 32);
5777
5778 for (i = 0; i < pta_size; i++)
5779 if (! strcmp (opts->x_ix86_arch_string, processor_alias_table[i].name))
5780 {
5781 if (!strcmp (opts->x_ix86_arch_string, "generic"))
5782 {
5783 error (main_args_p
5784 ? G_("%<generic%> CPU can be used only for %<-mtune=%> "
5785 "switch")
5786 : G_("%<generic%> CPU can be used only for "
5787 "%<target(\"tune=\")%> attribute"));
5788 return false;
5789 }
5790 else if (!strcmp (opts->x_ix86_arch_string, "intel"))
5791 {
5792 error (main_args_p
5793 ? G_("%<intel%> CPU can be used only for %<-mtune=%> "
5794 "switch")
5795 : G_("%<intel%> CPU can be used only for "
5796 "%<target(\"tune=\")%> attribute"));
5797 return false;
5798 }
5799
5800 if (TARGET_64BIT_P (opts->x_ix86_isa_flags)
5801 && !(processor_alias_table[i].flags & PTA_64BIT))
5802 {
5803 error ("CPU you selected does not support x86-64 "
5804 "instruction set");
5805 return false;
5806 }
5807
5808 ix86_schedule = processor_alias_table[i].schedule;
5809 ix86_arch = processor_alias_table[i].processor;
5810 /* Default cpu tuning to the architecture. */
5811 ix86_tune = ix86_arch;
5812
5813 if (processor_alias_table[i].flags & PTA_MMX
5814 && !(opts->x_ix86_isa_flags_explicit & OPTION_MASK_ISA_MMX))
5815 opts->x_ix86_isa_flags |= OPTION_MASK_ISA_MMX;
5816 if (processor_alias_table[i].flags & PTA_3DNOW
5817 && !(opts->x_ix86_isa_flags_explicit & OPTION_MASK_ISA_3DNOW))
5818 opts->x_ix86_isa_flags |= OPTION_MASK_ISA_3DNOW;
5819 if (processor_alias_table[i].flags & PTA_3DNOW_A
5820 && !(opts->x_ix86_isa_flags_explicit & OPTION_MASK_ISA_3DNOW_A))
5821 opts->x_ix86_isa_flags |= OPTION_MASK_ISA_3DNOW_A;
5822 if (processor_alias_table[i].flags & PTA_SSE
5823 && !(opts->x_ix86_isa_flags_explicit & OPTION_MASK_ISA_SSE))
5824 opts->x_ix86_isa_flags |= OPTION_MASK_ISA_SSE;
5825 if (processor_alias_table[i].flags & PTA_SSE2
5826 && !(opts->x_ix86_isa_flags_explicit & OPTION_MASK_ISA_SSE2))
5827 opts->x_ix86_isa_flags |= OPTION_MASK_ISA_SSE2;
5828 if (processor_alias_table[i].flags & PTA_SSE3
5829 && !(opts->x_ix86_isa_flags_explicit & OPTION_MASK_ISA_SSE3))
5830 opts->x_ix86_isa_flags |= OPTION_MASK_ISA_SSE3;
5831 if (processor_alias_table[i].flags & PTA_SSSE3
5832 && !(opts->x_ix86_isa_flags_explicit & OPTION_MASK_ISA_SSSE3))
5833 opts->x_ix86_isa_flags |= OPTION_MASK_ISA_SSSE3;
5834 if (processor_alias_table[i].flags & PTA_SSE4_1
5835 && !(opts->x_ix86_isa_flags_explicit & OPTION_MASK_ISA_SSE4_1))
5836 opts->x_ix86_isa_flags |= OPTION_MASK_ISA_SSE4_1;
5837 if (processor_alias_table[i].flags & PTA_SSE4_2
5838 && !(opts->x_ix86_isa_flags_explicit & OPTION_MASK_ISA_SSE4_2))
5839 opts->x_ix86_isa_flags |= OPTION_MASK_ISA_SSE4_2;
5840 if (processor_alias_table[i].flags & PTA_AVX
5841 && !(opts->x_ix86_isa_flags_explicit & OPTION_MASK_ISA_AVX))
5842 opts->x_ix86_isa_flags |= OPTION_MASK_ISA_AVX;
5843 if (processor_alias_table[i].flags & PTA_AVX2
5844 && !(opts->x_ix86_isa_flags_explicit & OPTION_MASK_ISA_AVX2))
5845 opts->x_ix86_isa_flags |= OPTION_MASK_ISA_AVX2;
5846 if (processor_alias_table[i].flags & PTA_FMA
5847 && !(opts->x_ix86_isa_flags_explicit & OPTION_MASK_ISA_FMA))
5848 opts->x_ix86_isa_flags |= OPTION_MASK_ISA_FMA;
5849 if (processor_alias_table[i].flags & PTA_SSE4A
5850 && !(opts->x_ix86_isa_flags_explicit & OPTION_MASK_ISA_SSE4A))
5851 opts->x_ix86_isa_flags |= OPTION_MASK_ISA_SSE4A;
5852 if (processor_alias_table[i].flags & PTA_FMA4
5853 && !(opts->x_ix86_isa_flags_explicit & OPTION_MASK_ISA_FMA4))
5854 opts->x_ix86_isa_flags |= OPTION_MASK_ISA_FMA4;
5855 if (processor_alias_table[i].flags & PTA_XOP
5856 && !(opts->x_ix86_isa_flags_explicit & OPTION_MASK_ISA_XOP))
5857 opts->x_ix86_isa_flags |= OPTION_MASK_ISA_XOP;
5858 if (processor_alias_table[i].flags & PTA_LWP
5859 && !(opts->x_ix86_isa_flags_explicit & OPTION_MASK_ISA_LWP))
5860 opts->x_ix86_isa_flags |= OPTION_MASK_ISA_LWP;
5861 if (processor_alias_table[i].flags & PTA_ABM
5862 && !(opts->x_ix86_isa_flags_explicit & OPTION_MASK_ISA_ABM))
5863 opts->x_ix86_isa_flags |= OPTION_MASK_ISA_ABM;
5864 if (processor_alias_table[i].flags & PTA_BMI
5865 && !(opts->x_ix86_isa_flags_explicit & OPTION_MASK_ISA_BMI))
5866 opts->x_ix86_isa_flags |= OPTION_MASK_ISA_BMI;
5867 if (processor_alias_table[i].flags & (PTA_LZCNT | PTA_ABM)
5868 && !(opts->x_ix86_isa_flags_explicit & OPTION_MASK_ISA_LZCNT))
5869 opts->x_ix86_isa_flags |= OPTION_MASK_ISA_LZCNT;
5870 if (processor_alias_table[i].flags & PTA_TBM
5871 && !(opts->x_ix86_isa_flags_explicit & OPTION_MASK_ISA_TBM))
5872 opts->x_ix86_isa_flags |= OPTION_MASK_ISA_TBM;
5873 if (processor_alias_table[i].flags & PTA_BMI2
5874 && !(opts->x_ix86_isa_flags_explicit & OPTION_MASK_ISA_BMI2))
5875 opts->x_ix86_isa_flags |= OPTION_MASK_ISA_BMI2;
5876 if (processor_alias_table[i].flags & PTA_CX16
5877 && !(opts->x_ix86_isa_flags_explicit & OPTION_MASK_ISA_CX16))
5878 opts->x_ix86_isa_flags |= OPTION_MASK_ISA_CX16;
5879 if (processor_alias_table[i].flags & (PTA_POPCNT | PTA_ABM)
5880 && !(opts->x_ix86_isa_flags_explicit & OPTION_MASK_ISA_POPCNT))
5881 opts->x_ix86_isa_flags |= OPTION_MASK_ISA_POPCNT;
5882 if (!(TARGET_64BIT_P (opts->x_ix86_isa_flags)
5883 && (processor_alias_table[i].flags & PTA_NO_SAHF))
5884 && !(opts->x_ix86_isa_flags_explicit & OPTION_MASK_ISA_SAHF))
5885 opts->x_ix86_isa_flags |= OPTION_MASK_ISA_SAHF;
5886 if (processor_alias_table[i].flags & PTA_MOVBE
5887 && !(opts->x_ix86_isa_flags_explicit & OPTION_MASK_ISA_MOVBE))
5888 opts->x_ix86_isa_flags |= OPTION_MASK_ISA_MOVBE;
5889 if (processor_alias_table[i].flags & PTA_AES
5890 && !(ix86_isa_flags_explicit & OPTION_MASK_ISA_AES))
5891 ix86_isa_flags |= OPTION_MASK_ISA_AES;
5892 if (processor_alias_table[i].flags & PTA_SHA
5893 && !(ix86_isa_flags_explicit & OPTION_MASK_ISA_SHA))
5894 ix86_isa_flags |= OPTION_MASK_ISA_SHA;
5895 if (processor_alias_table[i].flags & PTA_PCLMUL
5896 && !(opts->x_ix86_isa_flags_explicit & OPTION_MASK_ISA_PCLMUL))
5897 opts->x_ix86_isa_flags |= OPTION_MASK_ISA_PCLMUL;
5898 if (processor_alias_table[i].flags & PTA_FSGSBASE
5899 && !(opts->x_ix86_isa_flags_explicit & OPTION_MASK_ISA_FSGSBASE))
5900 opts->x_ix86_isa_flags |= OPTION_MASK_ISA_FSGSBASE;
5901 if (processor_alias_table[i].flags & PTA_RDRND
5902 && !(opts->x_ix86_isa_flags_explicit & OPTION_MASK_ISA_RDRND))
5903 opts->x_ix86_isa_flags |= OPTION_MASK_ISA_RDRND;
5904 if (processor_alias_table[i].flags & PTA_F16C
5905 && !(opts->x_ix86_isa_flags_explicit & OPTION_MASK_ISA_F16C))
5906 opts->x_ix86_isa_flags |= OPTION_MASK_ISA_F16C;
5907 if (processor_alias_table[i].flags & PTA_RTM
5908 && !(opts->x_ix86_isa_flags_explicit & OPTION_MASK_ISA_RTM))
5909 opts->x_ix86_isa_flags |= OPTION_MASK_ISA_RTM;
5910 if (processor_alias_table[i].flags & PTA_HLE
5911 && !(opts->x_ix86_isa_flags_explicit & OPTION_MASK_ISA_HLE))
5912 opts->x_ix86_isa_flags |= OPTION_MASK_ISA_HLE;
5913 if (processor_alias_table[i].flags & PTA_PRFCHW
5914 && !(opts->x_ix86_isa_flags_explicit & OPTION_MASK_ISA_PRFCHW))
5915 opts->x_ix86_isa_flags |= OPTION_MASK_ISA_PRFCHW;
5916 if (processor_alias_table[i].flags & PTA_RDSEED
5917 && !(opts->x_ix86_isa_flags_explicit & OPTION_MASK_ISA_RDSEED))
5918 opts->x_ix86_isa_flags |= OPTION_MASK_ISA_RDSEED;
5919 if (processor_alias_table[i].flags & PTA_ADX
5920 && !(opts->x_ix86_isa_flags_explicit & OPTION_MASK_ISA_ADX))
5921 opts->x_ix86_isa_flags |= OPTION_MASK_ISA_ADX;
5922 if (processor_alias_table[i].flags & PTA_FXSR
5923 && !(opts->x_ix86_isa_flags_explicit & OPTION_MASK_ISA_FXSR))
5924 opts->x_ix86_isa_flags |= OPTION_MASK_ISA_FXSR;
5925 if (processor_alias_table[i].flags & PTA_XSAVE
5926 && !(opts->x_ix86_isa_flags_explicit & OPTION_MASK_ISA_XSAVE))
5927 opts->x_ix86_isa_flags |= OPTION_MASK_ISA_XSAVE;
5928 if (processor_alias_table[i].flags & PTA_XSAVEOPT
5929 && !(opts->x_ix86_isa_flags_explicit & OPTION_MASK_ISA_XSAVEOPT))
5930 opts->x_ix86_isa_flags |= OPTION_MASK_ISA_XSAVEOPT;
5931 if (processor_alias_table[i].flags & PTA_AVX512F
5932 && !(opts->x_ix86_isa_flags_explicit & OPTION_MASK_ISA_AVX512F))
5933 opts->x_ix86_isa_flags |= OPTION_MASK_ISA_AVX512F;
5934 if (processor_alias_table[i].flags & PTA_AVX512ER
5935 && !(opts->x_ix86_isa_flags_explicit & OPTION_MASK_ISA_AVX512ER))
5936 opts->x_ix86_isa_flags |= OPTION_MASK_ISA_AVX512ER;
5937 if (processor_alias_table[i].flags & PTA_AVX512PF
5938 && !(opts->x_ix86_isa_flags_explicit & OPTION_MASK_ISA_AVX512PF))
5939 opts->x_ix86_isa_flags |= OPTION_MASK_ISA_AVX512PF;
5940 if (processor_alias_table[i].flags & PTA_AVX512CD
5941 && !(opts->x_ix86_isa_flags_explicit & OPTION_MASK_ISA_AVX512CD))
5942 opts->x_ix86_isa_flags |= OPTION_MASK_ISA_AVX512CD;
5943 if (processor_alias_table[i].flags & PTA_PREFETCHWT1
5944 && !(opts->x_ix86_isa_flags_explicit & OPTION_MASK_ISA_PREFETCHWT1))
5945 opts->x_ix86_isa_flags |= OPTION_MASK_ISA_PREFETCHWT1;
5946 if (processor_alias_table[i].flags & PTA_CLWB
5947 && !(opts->x_ix86_isa_flags_explicit & OPTION_MASK_ISA_CLWB))
5948 opts->x_ix86_isa_flags |= OPTION_MASK_ISA_CLWB;
5949 if (processor_alias_table[i].flags & PTA_CLFLUSHOPT
5950 && !(opts->x_ix86_isa_flags_explicit & OPTION_MASK_ISA_CLFLUSHOPT))
5951 opts->x_ix86_isa_flags |= OPTION_MASK_ISA_CLFLUSHOPT;
5952 if (processor_alias_table[i].flags & PTA_CLZERO
5953 && !(opts->x_ix86_isa_flags_explicit & OPTION_MASK_ISA_CLZERO))
5954 opts->x_ix86_isa_flags |= OPTION_MASK_ISA_CLZERO;
5955 if (processor_alias_table[i].flags & PTA_XSAVEC
5956 && !(opts->x_ix86_isa_flags_explicit & OPTION_MASK_ISA_XSAVEC))
5957 opts->x_ix86_isa_flags |= OPTION_MASK_ISA_XSAVEC;
5958 if (processor_alias_table[i].flags & PTA_XSAVES
5959 && !(opts->x_ix86_isa_flags_explicit & OPTION_MASK_ISA_XSAVES))
5960 opts->x_ix86_isa_flags |= OPTION_MASK_ISA_XSAVES;
5961 if (processor_alias_table[i].flags & PTA_AVX512DQ
5962 && !(opts->x_ix86_isa_flags_explicit & OPTION_MASK_ISA_AVX512DQ))
5963 opts->x_ix86_isa_flags |= OPTION_MASK_ISA_AVX512DQ;
5964 if (processor_alias_table[i].flags & PTA_AVX512BW
5965 && !(opts->x_ix86_isa_flags_explicit & OPTION_MASK_ISA_AVX512BW))
5966 opts->x_ix86_isa_flags |= OPTION_MASK_ISA_AVX512BW;
5967 if (processor_alias_table[i].flags & PTA_AVX512VL
5968 && !(opts->x_ix86_isa_flags_explicit & OPTION_MASK_ISA_AVX512VL))
5969 opts->x_ix86_isa_flags |= OPTION_MASK_ISA_AVX512VL;
5970 if (processor_alias_table[i].flags & PTA_MPX
5971 && !(opts->x_ix86_isa_flags_explicit & OPTION_MASK_ISA_MPX))
5972 opts->x_ix86_isa_flags |= OPTION_MASK_ISA_MPX;
5973 if (processor_alias_table[i].flags & PTA_AVX512VBMI
5974 && !(opts->x_ix86_isa_flags_explicit & OPTION_MASK_ISA_AVX512VBMI))
5975 opts->x_ix86_isa_flags |= OPTION_MASK_ISA_AVX512VBMI;
5976 if (processor_alias_table[i].flags & PTA_AVX512IFMA
5977 && !(opts->x_ix86_isa_flags_explicit & OPTION_MASK_ISA_AVX512IFMA))
5978 opts->x_ix86_isa_flags |= OPTION_MASK_ISA_AVX512IFMA;
5979
5980 if (processor_alias_table[i].flags & PTA_AVX5124VNNIW
5981 && !(opts->x_ix86_isa_flags2_explicit & OPTION_MASK_ISA_AVX5124VNNIW))
5982 opts->x_ix86_isa_flags2 |= OPTION_MASK_ISA_AVX5124VNNIW;
5983 if (processor_alias_table[i].flags & PTA_AVX5124FMAPS
5984 && !(opts->x_ix86_isa_flags2_explicit & OPTION_MASK_ISA_AVX5124FMAPS))
5985 opts->x_ix86_isa_flags2 |= OPTION_MASK_ISA_AVX5124FMAPS;
5986 if (processor_alias_table[i].flags & PTA_AVX512VPOPCNTDQ
5987 && !(opts->x_ix86_isa_flags2_explicit & OPTION_MASK_ISA_AVX512VPOPCNTDQ))
5988 opts->x_ix86_isa_flags2 |= OPTION_MASK_ISA_AVX512VPOPCNTDQ;
5989 if (processor_alias_table[i].flags & PTA_SGX
5990 && !(opts->x_ix86_isa_flags2_explicit & OPTION_MASK_ISA_SGX))
5991 opts->x_ix86_isa_flags2 |= OPTION_MASK_ISA_SGX;
5992
5993 if (processor_alias_table[i].flags & (PTA_PREFETCH_SSE | PTA_SSE))
5994 x86_prefetch_sse = true;
5995 if (processor_alias_table[i].flags & PTA_MWAITX
5996 && !(opts->x_ix86_isa_flags_explicit & OPTION_MASK_ISA_MWAITX))
5997 opts->x_ix86_isa_flags |= OPTION_MASK_ISA_MWAITX;
5998 if (processor_alias_table[i].flags & PTA_PKU
5999 && !(opts->x_ix86_isa_flags_explicit & OPTION_MASK_ISA_PKU))
6000 opts->x_ix86_isa_flags |= OPTION_MASK_ISA_PKU;
6001
6002 /* Don't enable x87 instructions if only
6003 general registers are allowed. */
6004 if (!(opts_set->x_ix86_target_flags & OPTION_MASK_GENERAL_REGS_ONLY)
6005 && !(opts_set->x_target_flags & MASK_80387))
6006 {
6007 if (processor_alias_table[i].flags & PTA_NO_80387)
6008 opts->x_target_flags &= ~MASK_80387;
6009 else
6010 opts->x_target_flags |= MASK_80387;
6011 }
6012 break;
6013 }
6014
6015 if (TARGET_X32 && (opts->x_ix86_isa_flags & OPTION_MASK_ISA_MPX))
6016 error ("Intel MPX does not support x32");
6020
6021 if (i == pta_size)
6022 {
6023 error (main_args_p
6024 ? G_("bad value (%qs) for %<-march=%> switch")
6025 : G_("bad value (%qs) for %<target(\"arch=\")%> attribute"),
6026 opts->x_ix86_arch_string);
6027
6028 auto_vec <const char *> candidates;
6029 for (i = 0; i < pta_size; i++)
6030 if (strcmp (processor_alias_table[i].name, "generic")
6031 && strcmp (processor_alias_table[i].name, "intel")
6032 && (!TARGET_64BIT_P (opts->x_ix86_isa_flags)
6033 || (processor_alias_table[i].flags & PTA_64BIT)))
6034 candidates.safe_push (processor_alias_table[i].name);
6035
6036 char *s;
6037 const char *hint
6038 = candidates_list_and_hint (opts->x_ix86_arch_string, s, candidates);
6039 if (hint)
6040 inform (input_location,
6041 main_args_p
6042 ? G_("valid arguments to %<-march=%> switch are: "
6043 "%s; did you mean %qs?")
6044 : G_("valid arguments to %<target(\"arch=\")%> attribute are: "
6045 "%s; did you mean %qs?"), s, hint);
6046 else
6047 inform (input_location,
6048 main_args_p
6049 ? G_("valid arguments to %<-march=%> switch are: %s")
6050 : G_("valid arguments to %<target(\"arch=\")%> attribute "
6051 "are: %s"), s);
6052 XDELETEVEC (s);
6053 }
6054
6055 ix86_arch_mask = 1u << ix86_arch;
6056 for (i = 0; i < X86_ARCH_LAST; ++i)
6057 ix86_arch_features[i] = !!(initial_ix86_arch_features[i] & ix86_arch_mask);
6058
6059 for (i = 0; i < pta_size; i++)
6060 if (! strcmp (opts->x_ix86_tune_string, processor_alias_table[i].name))
6061 {
6062 ix86_schedule = processor_alias_table[i].schedule;
6063 ix86_tune = processor_alias_table[i].processor;
6064 if (TARGET_64BIT_P (opts->x_ix86_isa_flags))
6065 {
6066 if (!(processor_alias_table[i].flags & PTA_64BIT))
6067 {
6068 if (ix86_tune_defaulted)
6069 {
6070 opts->x_ix86_tune_string = "x86-64";
6071 for (i = 0; i < pta_size; i++)
6072 if (! strcmp (opts->x_ix86_tune_string,
6073 processor_alias_table[i].name))
6074 break;
6075 ix86_schedule = processor_alias_table[i].schedule;
6076 ix86_tune = processor_alias_table[i].processor;
6077 }
6078 else
6079 error ("CPU you selected does not support x86-64 "
6080 "instruction set");
6081 }
6082 }
6083 /* Intel CPUs have always interpreted SSE prefetch instructions as
6084 NOPs; so, we can enable SSE prefetch instructions even when
6085 -mtune (rather than -march) points us to a processor that has them.
6086 However, the VIA C3 gives a SIGILL, so we only do that for i686 and
6087 higher processors. */
6088 if (TARGET_CMOV
6089 && (processor_alias_table[i].flags & (PTA_PREFETCH_SSE | PTA_SSE)))
6090 x86_prefetch_sse = true;
6091 break;
6092 }
6093
6094 if (ix86_tune_specified && i == pta_size)
6095 {
6096 error (main_args_p
6097 ? G_("bad value (%qs) for %<-mtune=%> switch")
6098 : G_("bad value (%qs) for %<target(\"tune=\")%> attribute"),
6099 opts->x_ix86_tune_string);
6100
6101 auto_vec <const char *> candidates;
6102 for (i = 0; i < pta_size; i++)
6103 if (!TARGET_64BIT_P (opts->x_ix86_isa_flags)
6104 || (processor_alias_table[i].flags & PTA_64BIT))
6105 candidates.safe_push (processor_alias_table[i].name);
6106
6107 char *s;
6108 const char *hint
6109 = candidates_list_and_hint (opts->x_ix86_tune_string, s, candidates);
6110 if (hint)
6111 inform (input_location,
6112 main_args_p
6113 ? G_("valid arguments to %<-mtune=%> switch are: "
6114 "%s; did you mean %qs?")
6115 : G_("valid arguments to %<target(\"tune=\")%> attribute are: "
6116 "%s; did you mean %qs?"), s, hint);
6117 else
6118 inform (input_location,
6119 main_args_p
6120 ? G_("valid arguments to %<-mtune=%> switch are: %s")
6121 : G_("valid arguments to %<target(\"tune=\")%> attribute "
6122 "are: %s"), s);
6123 XDELETEVEC (s);
6124 }
6125
6126 set_ix86_tune_features (ix86_tune, opts->x_ix86_dump_tunes);
6127
6128 #ifndef USE_IX86_FRAME_POINTER
6129 #define USE_IX86_FRAME_POINTER 0
6130 #endif
6131
6132 #ifndef USE_X86_64_FRAME_POINTER
6133 #define USE_X86_64_FRAME_POINTER 0
6134 #endif
6135
6136 /* Set the default values for switches whose default depends on TARGET_64BIT
6137 in case they weren't overwritten by command line options. */
6138 if (TARGET_64BIT_P (opts->x_ix86_isa_flags))
6139 {
6140 if (opts->x_optimize >= 1 && !opts_set->x_flag_omit_frame_pointer)
6141 opts->x_flag_omit_frame_pointer = !USE_X86_64_FRAME_POINTER;
6142 if (opts->x_flag_asynchronous_unwind_tables
6143 && !opts_set->x_flag_unwind_tables
6144 && TARGET_64BIT_MS_ABI)
6145 opts->x_flag_unwind_tables = 1;
6146 if (opts->x_flag_asynchronous_unwind_tables == 2)
6147 opts->x_flag_unwind_tables
6148 = opts->x_flag_asynchronous_unwind_tables = 1;
6149 if (opts->x_flag_pcc_struct_return == 2)
6150 opts->x_flag_pcc_struct_return = 0;
6151 }
6152 else
6153 {
6154 if (opts->x_optimize >= 1 && !opts_set->x_flag_omit_frame_pointer)
6155 opts->x_flag_omit_frame_pointer
6156 = !(USE_IX86_FRAME_POINTER || opts->x_optimize_size);
6157 if (opts->x_flag_asynchronous_unwind_tables == 2)
6158 opts->x_flag_asynchronous_unwind_tables = !USE_IX86_FRAME_POINTER;
6159 if (opts->x_flag_pcc_struct_return == 2)
6160 {
6161 /* Intel MCU psABI specifies that -freg-struct-return should
6162 be on. Instead of setting DEFAULT_PCC_STRUCT_RETURN to 1,
6163 we check -miamcu so that -freg-struct-return is always
6164 turned on if -miamcu is used. */
6165 if (TARGET_IAMCU_P (opts->x_target_flags))
6166 opts->x_flag_pcc_struct_return = 0;
6167 else
6168 opts->x_flag_pcc_struct_return = DEFAULT_PCC_STRUCT_RETURN;
6169 }
6170 }
6171
6172 ix86_tune_cost = processor_target_table[ix86_tune].cost;
6173 /* TODO: ix86_cost should be chosen at instruction or function granularity,
6174 so that for cold code we use size_cost even in !optimize_size compilations. */
6175 if (opts->x_optimize_size)
6176 ix86_cost = &ix86_size_cost;
6177 else
6178 ix86_cost = ix86_tune_cost;
6179
6180 /* Arrange to set up i386_stack_locals for all functions. */
6181 init_machine_status = ix86_init_machine_status;
6182
6183 /* Validate -mregparm= value. */
6184 if (opts_set->x_ix86_regparm)
6185 {
6186 if (TARGET_64BIT_P (opts->x_ix86_isa_flags))
6187 warning (0, "-mregparm is ignored in 64-bit mode");
6188 else if (TARGET_IAMCU_P (opts->x_target_flags))
6189 warning (0, "-mregparm is ignored for Intel MCU psABI");
6190 if (opts->x_ix86_regparm > REGPARM_MAX)
6191 {
6192 error ("-mregparm=%d is not between 0 and %d",
6193 opts->x_ix86_regparm, REGPARM_MAX);
6194 opts->x_ix86_regparm = 0;
6195 }
6196 }
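/* Illustration: on 32-bit targets -mregparm=3 passes up to the first three
   integer arguments in EAX, EDX and ECX instead of on the stack.  */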
6197 if (TARGET_IAMCU_P (opts->x_target_flags)
6198 || TARGET_64BIT_P (opts->x_ix86_isa_flags))
6199 opts->x_ix86_regparm = REGPARM_MAX;
6200
6201 /* Default align_* from the processor table. */
6202 ix86_default_align (opts);
6203
6204 /* Provide default for -mbranch-cost= value. */
6205 if (!opts_set->x_ix86_branch_cost)
6206 opts->x_ix86_branch_cost = ix86_tune_cost->branch_cost;
6207
6208 if (TARGET_64BIT_P (opts->x_ix86_isa_flags))
6209 {
6210 opts->x_target_flags
6211 |= TARGET_SUBTARGET64_DEFAULT & ~opts_set->x_target_flags;
6212
6213 /* Enable by default the SSE and MMX builtins. Do allow the user to
6214 explicitly disable any of these. In particular, disabling SSE and
6215 MMX for kernel code is extremely useful. */
6216 if (!ix86_arch_specified)
6217 opts->x_ix86_isa_flags
6218 |= ((OPTION_MASK_ISA_SSE2 | OPTION_MASK_ISA_SSE | OPTION_MASK_ISA_MMX
6219 | TARGET_SUBTARGET64_ISA_DEFAULT)
6220 & ~opts->x_ix86_isa_flags_explicit);
6221
6222 if (TARGET_RTD_P (opts->x_target_flags))
6223 warning (0,
6224 main_args_p
6225 ? G_("%<-mrtd%> is ignored in 64bit mode")
6226 : G_("%<target(\"rtd\")%> is ignored in 64bit mode"));
6227 }
6228 else
6229 {
6230 opts->x_target_flags
6231 |= TARGET_SUBTARGET32_DEFAULT & ~opts_set->x_target_flags;
6232
6233 if (!ix86_arch_specified)
6234 opts->x_ix86_isa_flags
6235 |= TARGET_SUBTARGET32_ISA_DEFAULT & ~opts->x_ix86_isa_flags_explicit;
6236
6237 /* The i386 ABI does not specify a red zone. It still makes sense to use one
6238 when the programmer takes care to keep the stack from being clobbered. */
6239 if (!(opts_set->x_target_flags & MASK_NO_RED_ZONE))
6240 opts->x_target_flags |= MASK_NO_RED_ZONE;
6241 }
6242
6243 /* Keep nonleaf frame pointers. */
6244 if (opts->x_flag_omit_frame_pointer)
6245 opts->x_target_flags &= ~MASK_OMIT_LEAF_FRAME_POINTER;
6246 else if (TARGET_OMIT_LEAF_FRAME_POINTER_P (opts->x_target_flags))
6247 opts->x_flag_omit_frame_pointer = 1;
6248
6249 /* If we're doing fast math, we don't care about comparison order
6250 wrt NaNs. This lets us use a shorter comparison sequence. */
6251 if (opts->x_flag_finite_math_only)
6252 opts->x_target_flags &= ~MASK_IEEE_FP;
6253
6254 /* If the architecture always has an FPU, turn off NO_FANCY_MATH_387,
6255 since the insns won't need emulation. */
6256 if (ix86_tune_features [X86_TUNE_ALWAYS_FANCY_MATH_387])
6257 opts->x_target_flags &= ~MASK_NO_FANCY_MATH_387;
6258
6259 /* Likewise, if the target doesn't have a 387, or we've specified
6260 software floating point, don't use 387 inline intrinsics. */
6261 if (!TARGET_80387_P (opts->x_target_flags))
6262 opts->x_target_flags |= MASK_NO_FANCY_MATH_387;
6263
6264 /* Turn on MMX builtins for -msse. */
6265 if (TARGET_SSE_P (opts->x_ix86_isa_flags))
6266 opts->x_ix86_isa_flags
6267 |= OPTION_MASK_ISA_MMX & ~opts->x_ix86_isa_flags_explicit;
6268
6269 /* Enable SSE prefetch. */
6270 if (TARGET_SSE_P (opts->x_ix86_isa_flags)
6271 || (TARGET_PRFCHW_P (opts->x_ix86_isa_flags)
6272 && !TARGET_3DNOW_P (opts->x_ix86_isa_flags))
6273 || TARGET_PREFETCHWT1_P (opts->x_ix86_isa_flags))
6274 x86_prefetch_sse = true;
6275
6276 /* Enable popcnt instruction for -msse4.2 or -mabm. */
6277 if (TARGET_SSE4_2_P (opts->x_ix86_isa_flags)
6278 || TARGET_ABM_P (opts->x_ix86_isa_flags))
6279 opts->x_ix86_isa_flags
6280 |= OPTION_MASK_ISA_POPCNT & ~opts->x_ix86_isa_flags_explicit;
6281
6282 /* Enable lzcnt instruction for -mabm. */
6283 if (TARGET_ABM_P (opts->x_ix86_isa_flags))
6284 opts->x_ix86_isa_flags
6285 |= OPTION_MASK_ISA_LZCNT & ~opts->x_ix86_isa_flags_explicit;
6286
6287 /* Validate -mpreferred-stack-boundary= value or default it to
6288 PREFERRED_STACK_BOUNDARY_DEFAULT. */
6289 ix86_preferred_stack_boundary = PREFERRED_STACK_BOUNDARY_DEFAULT;
6290 if (opts_set->x_ix86_preferred_stack_boundary_arg)
6291 {
6292 int min = TARGET_64BIT_P (opts->x_ix86_isa_flags) ? 3 : 2;
6293 int max = TARGET_SEH ? 4 : 12;
6294
6295 if (opts->x_ix86_preferred_stack_boundary_arg < min
6296 || opts->x_ix86_preferred_stack_boundary_arg > max)
6297 {
6298 if (min == max)
6299 error ("-mpreferred-stack-boundary is not supported "
6300 "for this target");
6301 else
6302 error ("-mpreferred-stack-boundary=%d is not between %d and %d",
6303 opts->x_ix86_preferred_stack_boundary_arg, min, max);
6304 }
6305 else
6306 ix86_preferred_stack_boundary
6307 = (1 << opts->x_ix86_preferred_stack_boundary_arg) * BITS_PER_UNIT;
6308 }
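/* Worked example: -mpreferred-stack-boundary=4 yields
   (1 << 4) * BITS_PER_UNIT = 128 bits, i.e. a 16-byte stack boundary.  */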
6309
6310 /* Set the default value for -mstackrealign. */
6311 if (opts->x_ix86_force_align_arg_pointer == -1)
6312 opts->x_ix86_force_align_arg_pointer = STACK_REALIGN_DEFAULT;
6313
6314 ix86_default_incoming_stack_boundary = PREFERRED_STACK_BOUNDARY;
6315
6316 /* Validate -mincoming-stack-boundary= value or default it to
6317 MIN_STACK_BOUNDARY/PREFERRED_STACK_BOUNDARY. */
6318 ix86_incoming_stack_boundary = ix86_default_incoming_stack_boundary;
6319 if (opts_set->x_ix86_incoming_stack_boundary_arg)
6320 {
6321 int min = TARGET_64BIT_P (opts->x_ix86_isa_flags) ? 3 : 2;
6322
6323 if (opts->x_ix86_incoming_stack_boundary_arg < min
6324 || opts->x_ix86_incoming_stack_boundary_arg > 12)
6325 error ("-mincoming-stack-boundary=%d is not between %d and 12",
6326 opts->x_ix86_incoming_stack_boundary_arg, min);
6327 else
6328 {
6329 ix86_user_incoming_stack_boundary
6330 = (1 << opts->x_ix86_incoming_stack_boundary_arg) * BITS_PER_UNIT;
6331 ix86_incoming_stack_boundary
6332 = ix86_user_incoming_stack_boundary;
6333 }
6334 }
6335
6336 #ifndef NO_PROFILE_COUNTERS
6337 if (flag_nop_mcount)
6338 error ("-mnop-mcount is not compatible with this target");
6339 #endif
6340 if (flag_nop_mcount && flag_pic)
6341 error ("-mnop-mcount is not implemented for -fPIC");
6342
6343 /* Accept -msseregparm only if at least SSE support is enabled. */
6344 if (TARGET_SSEREGPARM_P (opts->x_target_flags)
6345 && ! TARGET_SSE_P (opts->x_ix86_isa_flags))
6346 error (main_args_p
6347 ? G_("%<-msseregparm%> used without SSE enabled")
6348 : G_("%<target(\"sseregparm\")%> used without SSE enabled"));
6349
6350 if (opts_set->x_ix86_fpmath)
6351 {
6352 if (opts->x_ix86_fpmath & FPMATH_SSE)
6353 {
6354 if (!TARGET_SSE_P (opts->x_ix86_isa_flags))
6355 {
6356 if (TARGET_80387_P (opts->x_target_flags))
6357 {
6358 warning (0, "SSE instruction set disabled, using 387 arithmetics");
6359 opts->x_ix86_fpmath = FPMATH_387;
6360 }
6361 }
6362 else if ((opts->x_ix86_fpmath & FPMATH_387)
6363 && !TARGET_80387_P (opts->x_target_flags))
6364 {
6365 warning (0, "387 instruction set disabled, using SSE arithmetics");
6366 opts->x_ix86_fpmath = FPMATH_SSE;
6367 }
6368 }
6369 }
6370 /* For all chips supporting SSE2, -mfpmath=sse performs better than
6371 -mfpmath=387. The latter is nevertheless the default on many targets,
6372 since the extra 80-bit precision of temporaries is considered part of
6373 the ABI. Overwrite the default at least for -ffast-math.
6374 TODO: -mfpmath=both seems to produce similarly performing code with
6375 slightly smaller binaries. It is however not clear whether register
6376 allocation is ready for this setting.
6377 Also -mfpmath=387 is overall a lot more compact (about 4-5%) than SSE
6378 codegen. We may switch to 387 with -ffast-math for size-optimized
6379 functions. */
6380 else if (fast_math_flags_set_p (&global_options)
6381 && TARGET_SSE2_P (opts->x_ix86_isa_flags))
6382 opts->x_ix86_fpmath = FPMATH_SSE;
6383 else
6384 opts->x_ix86_fpmath = TARGET_FPMATH_DEFAULT_P (opts->x_ix86_isa_flags);
6385
6386 /* Use external vectorized library in vectorizing intrinsics. */
6387 if (opts_set->x_ix86_veclibabi_type)
6388 switch (opts->x_ix86_veclibabi_type)
6389 {
6390 case ix86_veclibabi_type_svml:
6391 ix86_veclib_handler = ix86_veclibabi_svml;
6392 break;
6393
6394 case ix86_veclibabi_type_acml:
6395 ix86_veclib_handler = ix86_veclibabi_acml;
6396 break;
6397
6398 default:
6399 gcc_unreachable ();
6400 }
6401
6402 if (ix86_tune_features [X86_TUNE_ACCUMULATE_OUTGOING_ARGS]
6403 && !(opts_set->x_target_flags & MASK_ACCUMULATE_OUTGOING_ARGS))
6404 opts->x_target_flags |= MASK_ACCUMULATE_OUTGOING_ARGS;
6405
6406 /* If stack probes are required, the space used for large function
6407 arguments on the stack must also be probed, so enable
6408 -maccumulate-outgoing-args so this happens in the prologue. */
6409 if (TARGET_STACK_PROBE_P (opts->x_target_flags)
6410 && !(opts->x_target_flags & MASK_ACCUMULATE_OUTGOING_ARGS))
6411 {
6412 if (opts_set->x_target_flags & MASK_ACCUMULATE_OUTGOING_ARGS)
6413 warning (0,
6414 main_args_p
6415 ? G_("stack probing requires %<-maccumulate-outgoing-args%> "
6416 "for correctness")
6417 : G_("stack probing requires "
6418 "%<target(\"accumulate-outgoing-args\")%> for "
6419 "correctness"));
6420 opts->x_target_flags |= MASK_ACCUMULATE_OUTGOING_ARGS;
6421 }
6422
6423 /* Stack realignment without -maccumulate-outgoing-args requires %ebp,
6424 so enable -maccumulate-outgoing-args when %ebp is fixed. */
6425 if (fixed_regs[BP_REG]
6426 && !(opts->x_target_flags & MASK_ACCUMULATE_OUTGOING_ARGS))
6427 {
6428 if (opts_set->x_target_flags & MASK_ACCUMULATE_OUTGOING_ARGS)
6429 warning (0,
6430 main_args_p
6431 ? G_("fixed ebp register requires "
6432 "%<-maccumulate-outgoing-args%>")
6433 : G_("fixed ebp register requires "
6434 "%<target(\"accumulate-outgoing-args\")%>"));
6435 opts->x_target_flags |= MASK_ACCUMULATE_OUTGOING_ARGS;
6436 }
6437
6438 /* Figure out what ASM_GENERATE_INTERNAL_LABEL builds as a prefix. */
6439 {
6440 char *p;
6441 ASM_GENERATE_INTERNAL_LABEL (internal_label_prefix, "LX", 0);
6442 p = strchr (internal_label_prefix, 'X');
6443 internal_label_prefix_len = p - internal_label_prefix;
6444 *p = '\0';
6445 }
6446
6447 /* When the scheduling description is not available, disable the scheduler
6448 pass so it won't slow down compilation and make x87 code slower. */
6449 if (!TARGET_SCHEDULE)
6450 opts->x_flag_schedule_insns_after_reload = opts->x_flag_schedule_insns = 0;
6451
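  /* Propagate the prefetch and cache parameters from the tuning tables
     unless the user has set them explicitly.  */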
6452 maybe_set_param_value (PARAM_SIMULTANEOUS_PREFETCHES,
6453 ix86_tune_cost->simultaneous_prefetches,
6454 opts->x_param_values,
6455 opts_set->x_param_values);
6456 maybe_set_param_value (PARAM_L1_CACHE_LINE_SIZE,
6457 ix86_tune_cost->prefetch_block,
6458 opts->x_param_values,
6459 opts_set->x_param_values);
6460 maybe_set_param_value (PARAM_L1_CACHE_SIZE,
6461 ix86_tune_cost->l1_cache_size,
6462 opts->x_param_values,
6463 opts_set->x_param_values);
6464 maybe_set_param_value (PARAM_L2_CACHE_SIZE,
6465 ix86_tune_cost->l2_cache_size,
6466 opts->x_param_values,
6467 opts_set->x_param_values);
6468
6469 /* Enable software prefetching at -O3 for CPUs where prefetching is helpful. */
6470 if (opts->x_flag_prefetch_loop_arrays < 0
6471 && HAVE_prefetch
6472 && (opts->x_optimize >= 3 || opts->x_flag_profile_use)
6473 && !opts->x_optimize_size
6474 && TARGET_SOFTWARE_PREFETCHING_BENEFICIAL)
6475 opts->x_flag_prefetch_loop_arrays = 1;
6476
6477 /* If using typedef char *va_list, signal that __builtin_va_start (&ap, 0)
6478 can be optimized to ap = __builtin_next_arg (0). */
6479 if (!TARGET_64BIT_P (opts->x_ix86_isa_flags) && !opts->x_flag_split_stack)
6480 targetm.expand_builtin_va_start = NULL;
6481
6482 if (TARGET_64BIT_P (opts->x_ix86_isa_flags))
6483 {
6484 ix86_gen_leave = gen_leave_rex64;
6485 if (Pmode == DImode)
6486 {
6487 ix86_gen_tls_global_dynamic_64 = gen_tls_global_dynamic_64_di;
6488 ix86_gen_tls_local_dynamic_base_64
6489 = gen_tls_local_dynamic_base_64_di;
6490 }
6491 else
6492 {
6493 ix86_gen_tls_global_dynamic_64 = gen_tls_global_dynamic_64_si;
6494 ix86_gen_tls_local_dynamic_base_64
6495 = gen_tls_local_dynamic_base_64_si;
6496 }
6497 }
6498 else
6499 ix86_gen_leave = gen_leave;
6500
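  /* Select the SImode or DImode variants of the helper generator
     functions so they match the pointer mode.  */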
6501 if (Pmode == DImode)
6502 {
6503 ix86_gen_add3 = gen_adddi3;
6504 ix86_gen_sub3 = gen_subdi3;
6505 ix86_gen_sub3_carry = gen_subdi3_carry;
6506 ix86_gen_one_cmpl2 = gen_one_cmpldi2;
6507 ix86_gen_andsp = gen_anddi3;
6508 ix86_gen_allocate_stack_worker = gen_allocate_stack_worker_probe_di;
6509 ix86_gen_adjust_stack_and_probe = gen_adjust_stack_and_probedi;
6510 ix86_gen_probe_stack_range = gen_probe_stack_rangedi;
6511 ix86_gen_monitor = gen_sse3_monitor_di;
6512 ix86_gen_monitorx = gen_monitorx_di;
6513 ix86_gen_clzero = gen_clzero_di;
6514 }
6515 else
6516 {
6517 ix86_gen_add3 = gen_addsi3;
6518 ix86_gen_sub3 = gen_subsi3;
6519 ix86_gen_sub3_carry = gen_subsi3_carry;
6520 ix86_gen_one_cmpl2 = gen_one_cmplsi2;
6521 ix86_gen_andsp = gen_andsi3;
6522 ix86_gen_allocate_stack_worker = gen_allocate_stack_worker_probe_si;
6523 ix86_gen_adjust_stack_and_probe = gen_adjust_stack_and_probesi;
6524 ix86_gen_probe_stack_range = gen_probe_stack_rangesi;
6525 ix86_gen_monitor = gen_sse3_monitor_si;
6526 ix86_gen_monitorx = gen_monitorx_si;
6527 ix86_gen_clzero = gen_clzero_si;
6528 }
6529
6530 #ifdef USE_IX86_CLD
6531 /* Use -mcld by default for 32-bit code if configured with --enable-cld. */
6532 if (!TARGET_64BIT_P (opts->x_ix86_isa_flags))
6533 opts->x_target_flags |= MASK_CLD & ~opts_set->x_target_flags;
6534 #endif
6535
6536 if (!TARGET_64BIT_P (opts->x_ix86_isa_flags) && opts->x_flag_pic)
6537 {
6538 if (opts->x_flag_fentry > 0)
6539 sorry ("-mfentry isn%'t supported for 32-bit in combination "
6540 "with -fpic");
6541 opts->x_flag_fentry = 0;
6542 }
6543 else if (TARGET_SEH)
6544 {
6545 if (opts->x_flag_fentry == 0)
6546 sorry ("-mno-fentry isn%'t compatible with SEH");
6547 opts->x_flag_fentry = 1;
6548 }
6549 else if (opts->x_flag_fentry < 0)
6550 {
6551 #if defined(PROFILE_BEFORE_PROLOGUE)
6552 opts->x_flag_fentry = 1;
6553 #else
6554 opts->x_flag_fentry = 0;
6555 #endif
6556 }
6557
6558 if (TARGET_SEH && TARGET_CALL_MS2SYSV_XLOGUES)
6559 sorry ("-mcall-ms2sysv-xlogues isn%'t currently supported with SEH");
6560
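  /* Enable vzeroupper insertion and the scalar-to-vector (STV) pass by
     default; both can still be disabled explicitly on the command line.  */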
6561 if (!(opts_set->x_target_flags & MASK_VZEROUPPER))
6562 opts->x_target_flags |= MASK_VZEROUPPER;
6563 if (!(opts_set->x_target_flags & MASK_STV))
6564 opts->x_target_flags |= MASK_STV;
6565 /* Disable STV if -mpreferred-stack-boundary={2,3} or
6566 -mincoming-stack-boundary={2,3} or -mstackrealign is used - the
6567 needed stack realignment would be an extra cost the pass doesn't
6568 take into account, and the pass can't realign the stack. */
6569 if (ix86_preferred_stack_boundary < 128
6570 || ix86_incoming_stack_boundary < 128
6571 || opts->x_ix86_force_align_arg_pointer)
6572 opts->x_target_flags &= ~MASK_STV;
6573 if (!ix86_tune_features[X86_TUNE_AVX256_UNALIGNED_LOAD_OPTIMAL]
6574 && !(opts_set->x_target_flags & MASK_AVX256_SPLIT_UNALIGNED_LOAD))
6575 opts->x_target_flags |= MASK_AVX256_SPLIT_UNALIGNED_LOAD;
6576 if (!ix86_tune_features[X86_TUNE_AVX256_UNALIGNED_STORE_OPTIMAL]
6577 && !(opts_set->x_target_flags & MASK_AVX256_SPLIT_UNALIGNED_STORE))
6578 opts->x_target_flags |= MASK_AVX256_SPLIT_UNALIGNED_STORE;
6579 /* Enable 128-bit AVX instruction generation
6580 for the auto-vectorizer. */
6581 if (TARGET_AVX128_OPTIMAL
6582 && !(opts_set->x_target_flags & MASK_PREFER_AVX128))
6583 opts->x_target_flags |= MASK_PREFER_AVX128;
6584
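  /* Parse the comma-separated -mrecip= list; an entry prefixed with '!'
     disables the named operation and "default" selects all of them.  */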
6585 if (opts->x_ix86_recip_name)
6586 {
6587 char *p = ASTRDUP (opts->x_ix86_recip_name);
6588 char *q;
6589 unsigned int mask, i;
6590 bool invert;
6591
6592 while ((q = strtok (p, ",")) != NULL)
6593 {
6594 p = NULL;
6595 if (*q == '!')
6596 {
6597 invert = true;
6598 q++;
6599 }
6600 else
6601 invert = false;
6602
6603 if (!strcmp (q, "default"))
6604 mask = RECIP_MASK_ALL;
6605 else
6606 {
6607 for (i = 0; i < ARRAY_SIZE (recip_options); i++)
6608 if (!strcmp (q, recip_options[i].string))
6609 {
6610 mask = recip_options[i].mask;
6611 break;
6612 }
6613
6614 if (i == ARRAY_SIZE (recip_options))
6615 {
6616 error ("unknown option for -mrecip=%s", q);
6617 invert = false;
6618 mask = RECIP_MASK_NONE;
6619 }
6620 }
6621
6622 opts->x_recip_mask_explicit |= mask;
6623 if (invert)
6624 opts->x_recip_mask &= ~mask;
6625 else
6626 opts->x_recip_mask |= mask;
6627 }
6628 }
6629
6630 if (TARGET_RECIP_P (opts->x_target_flags))
6631 opts->x_recip_mask |= RECIP_MASK_ALL & ~opts->x_recip_mask_explicit;
6632 else if (opts_set->x_target_flags & MASK_RECIP)
6633 opts->x_recip_mask &= ~(RECIP_MASK_ALL & ~opts->x_recip_mask_explicit);
6634
6635 /* Default long double to 64-bit for 32-bit Bionic and to __float128
6636 for 64-bit Bionic. Also default long double to 64-bit for Intel
6637 MCU psABI. */
6638 if ((TARGET_HAS_BIONIC || TARGET_IAMCU)
6639 && !(opts_set->x_target_flags
6640 & (MASK_LONG_DOUBLE_64 | MASK_LONG_DOUBLE_128)))
6641 opts->x_target_flags |= (TARGET_64BIT
6642 ? MASK_LONG_DOUBLE_128
6643 : MASK_LONG_DOUBLE_64);
6644
6645 /* Only one of them can be active. */
6646 gcc_assert ((opts->x_target_flags & MASK_LONG_DOUBLE_64) == 0
6647 || (opts->x_target_flags & MASK_LONG_DOUBLE_128) == 0);
6648
6649 /* Save the initial options in case the user uses function-specific
6650 options. */
6651 if (main_args_p)
6652 target_option_default_node = target_option_current_node
6653 = build_target_option_node (opts);
6654
6655 /* Handle stack protector */
6656 if (!opts_set->x_ix86_stack_protector_guard)
6657 opts->x_ix86_stack_protector_guard
6658 = TARGET_HAS_BIONIC ? SSP_GLOBAL : SSP_TLS;
6659
6660 /* Handle -mmemcpy-strategy= and -mmemset-strategy= */
6661 if (opts->x_ix86_tune_memcpy_strategy)
6662 {
6663 char *str = xstrdup (opts->x_ix86_tune_memcpy_strategy);
6664 ix86_parse_stringop_strategy_string (str, false);
6665 free (str);
6666 }
6667
6668 if (opts->x_ix86_tune_memset_strategy)
6669 {
6670 char *str = xstrdup (opts->x_ix86_tune_memset_strategy);
6671 ix86_parse_stringop_strategy_string (str, true);
6672 free (str);
6673 }
6674
6675 return true;
6676 }
6677
6678 /* Implement the TARGET_OPTION_OVERRIDE hook. */
6679
6680 static void
6681 ix86_option_override (void)
6682 {
6683 ix86_option_override_internal (true, &global_options, &global_options_set);
6684 }
6685
6686 /* Implement the TARGET_OFFLOAD_OPTIONS hook. */
6687 static char *
6688 ix86_offload_options (void)
6689 {
6690 if (TARGET_LP64)
6691 return xstrdup ("-foffload-abi=lp64");
6692 return xstrdup ("-foffload-abi=ilp32");
6693 }
6694
6695 /* Update register usage after having seen the compiler flags. */
6696
6697 static void
6698 ix86_conditional_register_usage (void)
6699 {
6700 int i, c_mask;
6701
6702 /* If there are no caller-saved registers, preserve all registers
6703 except fixed_regs and the registers used for the function return
6704 value, since aggregate_value_p checks call_used_regs[regno] on the
6705 return value. */
6706 if (cfun && cfun->machine->no_caller_saved_registers)
6707 for (i = 0; i < FIRST_PSEUDO_REGISTER; i++)
6708 if (!fixed_regs[i] && !ix86_function_value_regno_p (i))
6709 call_used_regs[i] = 0;
6710
6711 /* For 32-bit targets, squash the REX registers. */
6712 if (! TARGET_64BIT)
6713 {
6714 for (i = FIRST_REX_INT_REG; i <= LAST_REX_INT_REG; i++)
6715 fixed_regs[i] = call_used_regs[i] = 1, reg_names[i] = "";
6716 for (i = FIRST_REX_SSE_REG; i <= LAST_REX_SSE_REG; i++)
6717 fixed_regs[i] = call_used_regs[i] = 1, reg_names[i] = "";
6718 for (i = FIRST_EXT_REX_SSE_REG; i <= LAST_EXT_REX_SSE_REG; i++)
6719 fixed_regs[i] = call_used_regs[i] = 1, reg_names[i] = "";
6720 }
6721
6722 /* See the definition of CALL_USED_REGISTERS in i386.h. */
6723 c_mask = CALL_USED_REGISTERS_MASK (TARGET_64BIT_MS_ABI);
6724
6725 CLEAR_HARD_REG_SET (reg_class_contents[(int)CLOBBERED_REGS]);
6726
6727 for (i = 0; i < FIRST_PSEUDO_REGISTER; i++)
6728 {
6729 /* Set/reset conditionally defined registers from
6730 CALL_USED_REGISTERS initializer. */
6731 if (call_used_regs[i] > 1)
6732 call_used_regs[i] = !!(call_used_regs[i] & c_mask);
6733
6734 /* Calculate registers of CLOBBERED_REGS register set
6735 as call used registers from GENERAL_REGS register set. */
6736 if (TEST_HARD_REG_BIT (reg_class_contents[(int)GENERAL_REGS], i)
6737 && call_used_regs[i])
6738 SET_HARD_REG_BIT (reg_class_contents[(int)CLOBBERED_REGS], i);
6739 }
6740
6741 /* If MMX is disabled, squash the registers. */
6742 if (! TARGET_MMX)
6743 for (i = 0; i < FIRST_PSEUDO_REGISTER; i++)
6744 if (TEST_HARD_REG_BIT (reg_class_contents[(int)MMX_REGS], i))
6745 fixed_regs[i] = call_used_regs[i] = 1, reg_names[i] = "";
6746
6747 /* If SSE is disabled, squash the registers. */
6748 if (! TARGET_SSE)
6749 for (i = 0; i < FIRST_PSEUDO_REGISTER; i++)
6750 if (TEST_HARD_REG_BIT (reg_class_contents[(int)SSE_REGS], i))
6751 fixed_regs[i] = call_used_regs[i] = 1, reg_names[i] = "";
6752
6753 /* If the FPU is disabled, squash the registers. */
6754 if (! (TARGET_80387 || TARGET_FLOAT_RETURNS_IN_80387))
6755 for (i = 0; i < FIRST_PSEUDO_REGISTER; i++)
6756 if (TEST_HARD_REG_BIT (reg_class_contents[(int)FLOAT_REGS], i))
6757 fixed_regs[i] = call_used_regs[i] = 1, reg_names[i] = "";
6758
6759 /* If AVX512F is disabled, squash the registers. */
6760 if (! TARGET_AVX512F)
6761 {
6762 for (i = FIRST_EXT_REX_SSE_REG; i <= LAST_EXT_REX_SSE_REG; i++)
6763 fixed_regs[i] = call_used_regs[i] = 1, reg_names[i] = "";
6764
6765 for (i = FIRST_MASK_REG; i <= LAST_MASK_REG; i++)
6766 fixed_regs[i] = call_used_regs[i] = 1, reg_names[i] = "";
6767 }
6768
6769 /* If MPX is disabled, squash the registers. */
6770 if (! TARGET_MPX)
6771 for (i = FIRST_BND_REG; i <= LAST_BND_REG; i++)
6772 fixed_regs[i] = call_used_regs[i] = 1, reg_names[i] = "";
6773 }
6774
6775 \f
6776 /* Save the current options */
6777
6778 static void
6779 ix86_function_specific_save (struct cl_target_option *ptr,
6780 struct gcc_options *opts)
6781 {
6782 ptr->arch = ix86_arch;
6783 ptr->schedule = ix86_schedule;
6784 ptr->prefetch_sse = x86_prefetch_sse;
6785 ptr->tune = ix86_tune;
6786 ptr->branch_cost = ix86_branch_cost;
6787 ptr->tune_defaulted = ix86_tune_defaulted;
6788 ptr->arch_specified = ix86_arch_specified;
6789 ptr->x_ix86_isa_flags_explicit = opts->x_ix86_isa_flags_explicit;
6790 ptr->x_ix86_isa_flags2_explicit = opts->x_ix86_isa_flags2_explicit;
6791 ptr->x_recip_mask_explicit = opts->x_recip_mask_explicit;
6792 ptr->x_ix86_arch_string = opts->x_ix86_arch_string;
6793 ptr->x_ix86_tune_string = opts->x_ix86_tune_string;
6794 ptr->x_ix86_cmodel = opts->x_ix86_cmodel;
6795 ptr->x_ix86_abi = opts->x_ix86_abi;
6796 ptr->x_ix86_asm_dialect = opts->x_ix86_asm_dialect;
6797 ptr->x_ix86_branch_cost = opts->x_ix86_branch_cost;
6798 ptr->x_ix86_dump_tunes = opts->x_ix86_dump_tunes;
6799 ptr->x_ix86_force_align_arg_pointer = opts->x_ix86_force_align_arg_pointer;
6800 ptr->x_ix86_force_drap = opts->x_ix86_force_drap;
6801 ptr->x_ix86_incoming_stack_boundary_arg = opts->x_ix86_incoming_stack_boundary_arg;
6802 ptr->x_ix86_pmode = opts->x_ix86_pmode;
6803 ptr->x_ix86_preferred_stack_boundary_arg = opts->x_ix86_preferred_stack_boundary_arg;
6804 ptr->x_ix86_recip_name = opts->x_ix86_recip_name;
6805 ptr->x_ix86_regparm = opts->x_ix86_regparm;
6806 ptr->x_ix86_section_threshold = opts->x_ix86_section_threshold;
6807 ptr->x_ix86_sse2avx = opts->x_ix86_sse2avx;
6808 ptr->x_ix86_stack_protector_guard = opts->x_ix86_stack_protector_guard;
6809 ptr->x_ix86_stringop_alg = opts->x_ix86_stringop_alg;
6810 ptr->x_ix86_tls_dialect = opts->x_ix86_tls_dialect;
6811 ptr->x_ix86_tune_ctrl_string = opts->x_ix86_tune_ctrl_string;
6812 ptr->x_ix86_tune_memcpy_strategy = opts->x_ix86_tune_memcpy_strategy;
6813 ptr->x_ix86_tune_memset_strategy = opts->x_ix86_tune_memset_strategy;
6814 ptr->x_ix86_tune_no_default = opts->x_ix86_tune_no_default;
6815 ptr->x_ix86_veclibabi_type = opts->x_ix86_veclibabi_type;
6816
6817 /* The fields are char but the variables are not; make sure the
6818 values fit in the fields. */
6819 gcc_assert (ptr->arch == ix86_arch);
6820 gcc_assert (ptr->schedule == ix86_schedule);
6821 gcc_assert (ptr->tune == ix86_tune);
6822 gcc_assert (ptr->branch_cost == ix86_branch_cost);
6823 }
6824
6825 /* Restore the current options */
6826
6827 static void
6828 ix86_function_specific_restore (struct gcc_options *opts,
6829 struct cl_target_option *ptr)
6830 {
6831 enum processor_type old_tune = ix86_tune;
6832 enum processor_type old_arch = ix86_arch;
6833 unsigned int ix86_arch_mask;
6834 int i;
6835
6836 /* We don't change -fPIC. */
6837 opts->x_flag_pic = flag_pic;
6838
6839 ix86_arch = (enum processor_type) ptr->arch;
6840 ix86_schedule = (enum attr_cpu) ptr->schedule;
6841 ix86_tune = (enum processor_type) ptr->tune;
6842 x86_prefetch_sse = ptr->prefetch_sse;
6843 opts->x_ix86_branch_cost = ptr->branch_cost;
6844 ix86_tune_defaulted = ptr->tune_defaulted;
6845 ix86_arch_specified = ptr->arch_specified;
6846 opts->x_ix86_isa_flags_explicit = ptr->x_ix86_isa_flags_explicit;
6847 opts->x_ix86_isa_flags2_explicit = ptr->x_ix86_isa_flags2_explicit;
6848 opts->x_recip_mask_explicit = ptr->x_recip_mask_explicit;
6849 opts->x_ix86_arch_string = ptr->x_ix86_arch_string;
6850 opts->x_ix86_tune_string = ptr->x_ix86_tune_string;
6851 opts->x_ix86_cmodel = ptr->x_ix86_cmodel;
6852 opts->x_ix86_abi = ptr->x_ix86_abi;
6853 opts->x_ix86_asm_dialect = ptr->x_ix86_asm_dialect;
6854 opts->x_ix86_branch_cost = ptr->x_ix86_branch_cost;
6855 opts->x_ix86_dump_tunes = ptr->x_ix86_dump_tunes;
6856 opts->x_ix86_force_align_arg_pointer = ptr->x_ix86_force_align_arg_pointer;
6857 opts->x_ix86_force_drap = ptr->x_ix86_force_drap;
6858 opts->x_ix86_incoming_stack_boundary_arg = ptr->x_ix86_incoming_stack_boundary_arg;
6859 opts->x_ix86_pmode = ptr->x_ix86_pmode;
6860 opts->x_ix86_preferred_stack_boundary_arg = ptr->x_ix86_preferred_stack_boundary_arg;
6861 opts->x_ix86_recip_name = ptr->x_ix86_recip_name;
6862 opts->x_ix86_regparm = ptr->x_ix86_regparm;
6863 opts->x_ix86_section_threshold = ptr->x_ix86_section_threshold;
6864 opts->x_ix86_sse2avx = ptr->x_ix86_sse2avx;
6865 opts->x_ix86_stack_protector_guard = ptr->x_ix86_stack_protector_guard;
6866 opts->x_ix86_stringop_alg = ptr->x_ix86_stringop_alg;
6867 opts->x_ix86_tls_dialect = ptr->x_ix86_tls_dialect;
6868 opts->x_ix86_tune_ctrl_string = ptr->x_ix86_tune_ctrl_string;
6869 opts->x_ix86_tune_memcpy_strategy = ptr->x_ix86_tune_memcpy_strategy;
6870 opts->x_ix86_tune_memset_strategy = ptr->x_ix86_tune_memset_strategy;
6871 opts->x_ix86_tune_no_default = ptr->x_ix86_tune_no_default;
6872 opts->x_ix86_veclibabi_type = ptr->x_ix86_veclibabi_type;
6873 ix86_tune_cost = processor_target_table[ix86_tune].cost;
6874 /* TODO: ix86_cost should be chosen at instruction or function granularity
6875 so that for cold code we use size_cost even in !optimize_size compilation. */
6876 if (opts->x_optimize_size)
6877 ix86_cost = &ix86_size_cost;
6878 else
6879 ix86_cost = ix86_tune_cost;
6880
6881 /* Recreate the arch feature tests if the arch changed */
6882 if (old_arch != ix86_arch)
6883 {
6884 ix86_arch_mask = 1u << ix86_arch;
6885 for (i = 0; i < X86_ARCH_LAST; ++i)
6886 ix86_arch_features[i]
6887 = !!(initial_ix86_arch_features[i] & ix86_arch_mask);
6888 }
6889
6890 /* Recreate the tune optimization tests */
6891 if (old_tune != ix86_tune)
6892 set_ix86_tune_features (ix86_tune, false);
6893 }
6894
6895 /* Adjust target options after streaming them in. This is mainly about
6896 reconciling them with global options. */
6897
6898 static void
6899 ix86_function_specific_post_stream_in (struct cl_target_option *ptr)
6900 {
6901 /* flag_pic is a global option, but ix86_cmodel is a target-saved option
6902 partly computed from flag_pic. If flag_pic is on, adjust x_ix86_cmodel
6903 for PIC, or error out. */
6904 if (flag_pic)
6905 switch (ptr->x_ix86_cmodel)
6906 {
6907 case CM_SMALL:
6908 ptr->x_ix86_cmodel = CM_SMALL_PIC;
6909 break;
6910
6911 case CM_MEDIUM:
6912 ptr->x_ix86_cmodel = CM_MEDIUM_PIC;
6913 break;
6914
6915 case CM_LARGE:
6916 ptr->x_ix86_cmodel = CM_LARGE_PIC;
6917 break;
6918
6919 case CM_KERNEL:
6920 error ("code model %s does not support PIC mode", "kernel");
6921 break;
6922
6923 default:
6924 break;
6925 }
6926 else
6927 switch (ptr->x_ix86_cmodel)
6928 {
6929 case CM_SMALL_PIC:
6930 ptr->x_ix86_cmodel = CM_SMALL;
6931 break;
6932
6933 case CM_MEDIUM_PIC:
6934 ptr->x_ix86_cmodel = CM_MEDIUM;
6935 break;
6936
6937 case CM_LARGE_PIC:
6938 ptr->x_ix86_cmodel = CM_LARGE;
6939 break;
6940
6941 default:
6942 break;
6943 }
6944 }
6945
6946 /* Print the current options */
6947
6948 static void
6949 ix86_function_specific_print (FILE *file, int indent,
6950 struct cl_target_option *ptr)
6951 {
6952 char *target_string
6953 = ix86_target_string (ptr->x_ix86_isa_flags, ptr->x_ix86_isa_flags2,
6954 ptr->x_target_flags, ptr->x_ix86_target_flags,
6955 NULL, NULL, ptr->x_ix86_fpmath, false);
6956
6957 gcc_assert (ptr->arch < PROCESSOR_max);
6958 fprintf (file, "%*sarch = %d (%s)\n",
6959 indent, "",
6960 ptr->arch, processor_target_table[ptr->arch].name);
6961
6962 gcc_assert (ptr->tune < PROCESSOR_max);
6963 fprintf (file, "%*stune = %d (%s)\n",
6964 indent, "",
6965 ptr->tune, processor_target_table[ptr->tune].name);
6966
6967 fprintf (file, "%*sbranch_cost = %d\n", indent, "", ptr->branch_cost);
6968
6969 if (target_string)
6970 {
6971 fprintf (file, "%*s%s\n", indent, "", target_string);
6972 free (target_string);
6973 }
6974 }
6975
6976 \f
6977 /* Inner function to process the attribute((target(...))); take an argument
6978 and set the current options from the argument. If we have a list,
6979 recursively go over the list. */
6980
6981 static bool
6982 ix86_valid_target_attribute_inner_p (tree args, char *p_strings[],
6983 struct gcc_options *opts,
6984 struct gcc_options *opts_set,
6985 struct gcc_options *enum_opts_set)
6986 {
6987 char *next_optstr;
6988 bool ret = true;
6989
6990 #define IX86_ATTR_ISA(S,O) { S, sizeof (S)-1, ix86_opt_isa, O, 0 }
6991 #define IX86_ATTR_STR(S,O) { S, sizeof (S)-1, ix86_opt_str, O, 0 }
6992 #define IX86_ATTR_ENUM(S,O) { S, sizeof (S)-1, ix86_opt_enum, O, 0 }
6993 #define IX86_ATTR_YES(S,O,M) { S, sizeof (S)-1, ix86_opt_yes, O, M }
6994 #define IX86_ATTR_NO(S,O,M) { S, sizeof (S)-1, ix86_opt_no, O, M }
6995
6996 enum ix86_opt_type
6997 {
6998 ix86_opt_unknown,
6999 ix86_opt_yes,
7000 ix86_opt_no,
7001 ix86_opt_str,
7002 ix86_opt_enum,
7003 ix86_opt_isa
7004 };
7005
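  /* Table of the target attribute strings we accept, together with the
     kind of option, the option index and, for flag options, the mask to
     set or clear.  */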
7006 static const struct
7007 {
7008 const char *string;
7009 size_t len;
7010 enum ix86_opt_type type;
7011 int opt;
7012 int mask;
7013 } attrs[] = {
7014 /* isa options */
7015 IX86_ATTR_ISA ("sgx", OPT_msgx),
7016 IX86_ATTR_ISA ("avx5124fmaps", OPT_mavx5124fmaps),
7017 IX86_ATTR_ISA ("avx5124vnniw", OPT_mavx5124vnniw),
7018 IX86_ATTR_ISA ("avx512vpopcntdq", OPT_mavx512vpopcntdq),
7019
7020 IX86_ATTR_ISA ("avx512vbmi", OPT_mavx512vbmi),
7021 IX86_ATTR_ISA ("avx512ifma", OPT_mavx512ifma),
7022 IX86_ATTR_ISA ("avx512vl", OPT_mavx512vl),
7023 IX86_ATTR_ISA ("avx512bw", OPT_mavx512bw),
7024 IX86_ATTR_ISA ("avx512dq", OPT_mavx512dq),
7025 IX86_ATTR_ISA ("avx512er", OPT_mavx512er),
7026 IX86_ATTR_ISA ("avx512pf", OPT_mavx512pf),
7027 IX86_ATTR_ISA ("avx512cd", OPT_mavx512cd),
7028 IX86_ATTR_ISA ("avx512f", OPT_mavx512f),
7029 IX86_ATTR_ISA ("avx2", OPT_mavx2),
7030 IX86_ATTR_ISA ("fma", OPT_mfma),
7031 IX86_ATTR_ISA ("xop", OPT_mxop),
7032 IX86_ATTR_ISA ("fma4", OPT_mfma4),
7033 IX86_ATTR_ISA ("f16c", OPT_mf16c),
7034 IX86_ATTR_ISA ("avx", OPT_mavx),
7035 IX86_ATTR_ISA ("sse4", OPT_msse4),
7036 IX86_ATTR_ISA ("sse4.2", OPT_msse4_2),
7037 IX86_ATTR_ISA ("sse4.1", OPT_msse4_1),
7038 IX86_ATTR_ISA ("sse4a", OPT_msse4a),
7039 IX86_ATTR_ISA ("ssse3", OPT_mssse3),
7040 IX86_ATTR_ISA ("sse3", OPT_msse3),
7041 IX86_ATTR_ISA ("aes", OPT_maes),
7042 IX86_ATTR_ISA ("sha", OPT_msha),
7043 IX86_ATTR_ISA ("pclmul", OPT_mpclmul),
7044 IX86_ATTR_ISA ("sse2", OPT_msse2),
7045 IX86_ATTR_ISA ("sse", OPT_msse),
7046 IX86_ATTR_ISA ("3dnowa", OPT_m3dnowa),
7047 IX86_ATTR_ISA ("3dnow", OPT_m3dnow),
7048 IX86_ATTR_ISA ("mmx", OPT_mmmx),
7049 IX86_ATTR_ISA ("rtm", OPT_mrtm),
7050 IX86_ATTR_ISA ("prfchw", OPT_mprfchw),
7051 IX86_ATTR_ISA ("rdseed", OPT_mrdseed),
7052 IX86_ATTR_ISA ("adx", OPT_madx),
7053 IX86_ATTR_ISA ("prefetchwt1", OPT_mprefetchwt1),
7054 IX86_ATTR_ISA ("clflushopt", OPT_mclflushopt),
7055 IX86_ATTR_ISA ("xsaves", OPT_mxsaves),
7056 IX86_ATTR_ISA ("xsavec", OPT_mxsavec),
7057 IX86_ATTR_ISA ("xsaveopt", OPT_mxsaveopt),
7058 IX86_ATTR_ISA ("xsave", OPT_mxsave),
7059 IX86_ATTR_ISA ("abm", OPT_mabm),
7060 IX86_ATTR_ISA ("bmi", OPT_mbmi),
7061 IX86_ATTR_ISA ("bmi2", OPT_mbmi2),
7062 IX86_ATTR_ISA ("lzcnt", OPT_mlzcnt),
7063 IX86_ATTR_ISA ("tbm", OPT_mtbm),
7064 IX86_ATTR_ISA ("popcnt", OPT_mpopcnt),
7065 IX86_ATTR_ISA ("cx16", OPT_mcx16),
7066 IX86_ATTR_ISA ("sahf", OPT_msahf),
7067 IX86_ATTR_ISA ("movbe", OPT_mmovbe),
7068 IX86_ATTR_ISA ("crc32", OPT_mcrc32),
7069 IX86_ATTR_ISA ("fsgsbase", OPT_mfsgsbase),
7070 IX86_ATTR_ISA ("rdrnd", OPT_mrdrnd),
7071 IX86_ATTR_ISA ("mwaitx", OPT_mmwaitx),
7072 IX86_ATTR_ISA ("clzero", OPT_mclzero),
7073 IX86_ATTR_ISA ("pku", OPT_mpku),
7074 IX86_ATTR_ISA ("lwp", OPT_mlwp),
7075 IX86_ATTR_ISA ("hle", OPT_mhle),
7076 IX86_ATTR_ISA ("fxsr", OPT_mfxsr),
7077 IX86_ATTR_ISA ("mpx", OPT_mmpx),
7078 IX86_ATTR_ISA ("clwb", OPT_mclwb),
7079 IX86_ATTR_ISA ("rdpid", OPT_mrdpid),
7080
7081 /* enum options */
7082 IX86_ATTR_ENUM ("fpmath=", OPT_mfpmath_),
7083
7084 /* string options */
7085 IX86_ATTR_STR ("arch=", IX86_FUNCTION_SPECIFIC_ARCH),
7086 IX86_ATTR_STR ("tune=", IX86_FUNCTION_SPECIFIC_TUNE),
7087
7088 /* flag options */
7089 IX86_ATTR_YES ("cld",
7090 OPT_mcld,
7091 MASK_CLD),
7092
7093 IX86_ATTR_NO ("fancy-math-387",
7094 OPT_mfancy_math_387,
7095 MASK_NO_FANCY_MATH_387),
7096
7097 IX86_ATTR_YES ("ieee-fp",
7098 OPT_mieee_fp,
7099 MASK_IEEE_FP),
7100
7101 IX86_ATTR_YES ("inline-all-stringops",
7102 OPT_minline_all_stringops,
7103 MASK_INLINE_ALL_STRINGOPS),
7104
7105 IX86_ATTR_YES ("inline-stringops-dynamically",
7106 OPT_minline_stringops_dynamically,
7107 MASK_INLINE_STRINGOPS_DYNAMICALLY),
7108
7109 IX86_ATTR_NO ("align-stringops",
7110 OPT_mno_align_stringops,
7111 MASK_NO_ALIGN_STRINGOPS),
7112
7113 IX86_ATTR_YES ("recip",
7114 OPT_mrecip,
7115 MASK_RECIP),
7116
7117 };
7118
7119 /* If this is a list, recurse to get the options. */
7120 if (TREE_CODE (args) == TREE_LIST)
7121 {
7122 bool ret = true;
7123
7124 for (; args; args = TREE_CHAIN (args))
7125 if (TREE_VALUE (args)
7126 && !ix86_valid_target_attribute_inner_p (TREE_VALUE (args),
7127 p_strings, opts, opts_set,
7128 enum_opts_set))
7129 ret = false;
7130
7131 return ret;
7132 }
7133
7134 else if (TREE_CODE (args) != STRING_CST)
7135 {
7136 error ("attribute %<target%> argument not a string");
7137 return false;
7138 }
7139
7140 /* Handle multiple arguments separated by commas. */
7141 next_optstr = ASTRDUP (TREE_STRING_POINTER (args));
7142
7143 while (next_optstr && *next_optstr != '\0')
7144 {
7145 char *p = next_optstr;
7146 char *orig_p = p;
7147 char *comma = strchr (next_optstr, ',');
7148 const char *opt_string;
7149 size_t len, opt_len;
7150 int opt;
7151 bool opt_set_p;
7152 char ch;
7153 unsigned i;
7154 enum ix86_opt_type type = ix86_opt_unknown;
7155 int mask = 0;
7156
7157 if (comma)
7158 {
7159 *comma = '\0';
7160 len = comma - next_optstr;
7161 next_optstr = comma + 1;
7162 }
7163 else
7164 {
7165 len = strlen (p);
7166 next_optstr = NULL;
7167 }
7168
7169 /* Recognize no-xxx. */
7170 if (len > 3 && p[0] == 'n' && p[1] == 'o' && p[2] == '-')
7171 {
7172 opt_set_p = false;
7173 p += 3;
7174 len -= 3;
7175 }
7176 else
7177 opt_set_p = true;
7178
7179 /* Find the option. */
7180 ch = *p;
7181 opt = N_OPTS;
7182 for (i = 0; i < ARRAY_SIZE (attrs); i++)
7183 {
7184 type = attrs[i].type;
7185 opt_len = attrs[i].len;
7186 if (ch == attrs[i].string[0]
7187 && ((type != ix86_opt_str && type != ix86_opt_enum)
7188 ? len == opt_len
7189 : len > opt_len)
7190 && memcmp (p, attrs[i].string, opt_len) == 0)
7191 {
7192 opt = attrs[i].opt;
7193 mask = attrs[i].mask;
7194 opt_string = attrs[i].string;
7195 break;
7196 }
7197 }
7198
7199 /* Process the option. */
7200 if (opt == N_OPTS)
7201 {
7202 error ("attribute(target(\"%s\")) is unknown", orig_p);
7203 ret = false;
7204 }
7205
7206 else if (type == ix86_opt_isa)
7207 {
7208 struct cl_decoded_option decoded;
7209
7210 generate_option (opt, NULL, opt_set_p, CL_TARGET, &decoded);
7211 ix86_handle_option (opts, opts_set,
7212 &decoded, input_location);
7213 }
7214
7215 else if (type == ix86_opt_yes || type == ix86_opt_no)
7216 {
7217 if (type == ix86_opt_no)
7218 opt_set_p = !opt_set_p;
7219
7220 if (opt_set_p)
7221 opts->x_target_flags |= mask;
7222 else
7223 opts->x_target_flags &= ~mask;
7224 }
7225
7226 else if (type == ix86_opt_str)
7227 {
7228 if (p_strings[opt])
7229 {
7230 error ("option(\"%s\") was already specified", opt_string);
7231 ret = false;
7232 }
7233 else
7234 p_strings[opt] = xstrdup (p + opt_len);
7235 }
7236
7237 else if (type == ix86_opt_enum)
7238 {
7239 bool arg_ok;
7240 int value;
7241
7242 arg_ok = opt_enum_arg_to_value (opt, p + opt_len, &value, CL_TARGET);
7243 if (arg_ok)
7244 set_option (opts, enum_opts_set, opt, value,
7245 p + opt_len, DK_UNSPECIFIED, input_location,
7246 global_dc);
7247 else
7248 {
7249 error ("attribute(target(\"%s\")) is unknown", orig_p);
7250 ret = false;
7251 }
7252 }
7253
7254 else
7255 gcc_unreachable ();
7256 }
7257
7258 return ret;
7259 }
7260
7261 /* Release allocated strings. */
7262 static void
7263 release_options_strings (char **option_strings)
7264 {
7265 /* Free up memory allocated to hold the strings */
7266 for (unsigned i = 0; i < IX86_FUNCTION_SPECIFIC_MAX; i++)
7267 free (option_strings[i]);
7268 }
7269
7270 /* Return a TARGET_OPTION_NODE tree of the target options listed or NULL. */
7271
7272 tree
7273 ix86_valid_target_attribute_tree (tree args,
7274 struct gcc_options *opts,
7275 struct gcc_options *opts_set)
7276 {
7277 const char *orig_arch_string = opts->x_ix86_arch_string;
7278 const char *orig_tune_string = opts->x_ix86_tune_string;
7279 enum fpmath_unit orig_fpmath_set = opts_set->x_ix86_fpmath;
7280 int orig_tune_defaulted = ix86_tune_defaulted;
7281 int orig_arch_specified = ix86_arch_specified;
7282 char *option_strings[IX86_FUNCTION_SPECIFIC_MAX] = { NULL, NULL };
7283 tree t = NULL_TREE;
7284 struct cl_target_option *def
7285 = TREE_TARGET_OPTION (target_option_default_node);
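  /* enum_opts_set records which enumerated options (currently only
     fpmath=) were given in the attribute string.  */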
7286 struct gcc_options enum_opts_set;
7287
7288 memset (&enum_opts_set, 0, sizeof (enum_opts_set));
7289
7290 /* Process each of the options on the chain. */
7291 if (! ix86_valid_target_attribute_inner_p (args, option_strings, opts,
7292 opts_set, &enum_opts_set))
7293 return error_mark_node;
7294
7295 /* If the changed options are different from the default, rerun
7296 ix86_option_override_internal, and then save the options away.
7297 The string options are attribute options, and will be undone
7298 when we copy the save structure. */
7299 if (opts->x_ix86_isa_flags != def->x_ix86_isa_flags
7300 || opts->x_ix86_isa_flags2 != def->x_ix86_isa_flags2
7301 || opts->x_target_flags != def->x_target_flags
7302 || option_strings[IX86_FUNCTION_SPECIFIC_ARCH]
7303 || option_strings[IX86_FUNCTION_SPECIFIC_TUNE]
7304 || enum_opts_set.x_ix86_fpmath)
7305 {
7306 /* If we are using the default tune= or arch=, undo the string assigned,
7307 and use the default. */
7308 if (option_strings[IX86_FUNCTION_SPECIFIC_ARCH])
7309 {
7310 opts->x_ix86_arch_string
7311 = ggc_strdup (option_strings[IX86_FUNCTION_SPECIFIC_ARCH]);
7312
7313 /* If arch= is set, clear all bits in x_ix86_isa_flags,
7314 except for ISA_64BIT, ABI_64, ABI_X32, and CODE16. */
7315 opts->x_ix86_isa_flags &= (OPTION_MASK_ISA_64BIT
7316 | OPTION_MASK_ABI_64
7317 | OPTION_MASK_ABI_X32
7318 | OPTION_MASK_CODE16);
7319 opts->x_ix86_isa_flags2 = 0;
7320 }
7321 else if (!orig_arch_specified)
7322 opts->x_ix86_arch_string = NULL;
7323
7324 if (option_strings[IX86_FUNCTION_SPECIFIC_TUNE])
7325 opts->x_ix86_tune_string
7326 = ggc_strdup (option_strings[IX86_FUNCTION_SPECIFIC_TUNE]);
7327 else if (orig_tune_defaulted)
7328 opts->x_ix86_tune_string = NULL;
7329
7330 /* If fpmath= is not set, and we now have sse2 on 32-bit, use it. */
7331 if (enum_opts_set.x_ix86_fpmath)
7332 opts_set->x_ix86_fpmath = (enum fpmath_unit) 1;
7333 else if (!TARGET_64BIT_P (opts->x_ix86_isa_flags)
7334 && TARGET_SSE_P (opts->x_ix86_isa_flags))
7335 {
7336 if (TARGET_80387_P (opts->x_target_flags))
7337 opts->x_ix86_fpmath = (enum fpmath_unit) (FPMATH_SSE
7338 | FPMATH_387);
7339 else
7340 opts->x_ix86_fpmath = (enum fpmath_unit) FPMATH_SSE;
7341 opts_set->x_ix86_fpmath = (enum fpmath_unit) 1;
7342 }
7343
7344 /* Do any overrides, such as arch=xxx, or tune=xxx support. */
7345 bool r = ix86_option_override_internal (false, opts, opts_set);
7346 if (!r)
7347 {
7348 release_options_strings (option_strings);
7349 return error_mark_node;
7350 }
7351
7352 /* Add any builtin functions with the new isa if any. */
7353 ix86_add_new_builtins (opts->x_ix86_isa_flags, opts->x_ix86_isa_flags2);
7354
7355 /* Save the current options unless we are validating options for
7356 #pragma. */
7357 t = build_target_option_node (opts);
7358
7359 opts->x_ix86_arch_string = orig_arch_string;
7360 opts->x_ix86_tune_string = orig_tune_string;
7361 opts_set->x_ix86_fpmath = orig_fpmath_set;
7362
7363 release_options_strings (option_strings);
7364 }
7365
7366 return t;
7367 }
7368
7369 /* Hook to validate attribute((target("string"))). */
7370
7371 static bool
7372 ix86_valid_target_attribute_p (tree fndecl,
7373 tree ARG_UNUSED (name),
7374 tree args,
7375 int ARG_UNUSED (flags))
7376 {
7377 struct gcc_options func_options;
7378 tree new_target, new_optimize;
7379 bool ret = true;
7380
7381 /* attribute((target("default"))) does nothing, beyond
7382 affecting multi-versioning. */
7383 if (TREE_VALUE (args)
7384 && TREE_CODE (TREE_VALUE (args)) == STRING_CST
7385 && TREE_CHAIN (args) == NULL_TREE
7386 && strcmp (TREE_STRING_POINTER (TREE_VALUE (args)), "default") == 0)
7387 return true;
7388
7389 tree old_optimize = build_optimization_node (&global_options);
7390
7391 /* Get the optimization options of the current function. */
7392 tree func_optimize = DECL_FUNCTION_SPECIFIC_OPTIMIZATION (fndecl);
7393
7394 if (!func_optimize)
7395 func_optimize = old_optimize;
7396
7397 /* Init func_options. */
7398 memset (&func_options, 0, sizeof (func_options));
7399 init_options_struct (&func_options, NULL);
7400 lang_hooks.init_options_struct (&func_options);
7401
7402 cl_optimization_restore (&func_options,
7403 TREE_OPTIMIZATION (func_optimize));
7404
7405 /* Initialize func_options to the default before its target options can
7406 be set. */
7407 cl_target_option_restore (&func_options,
7408 TREE_TARGET_OPTION (target_option_default_node));
7409
7410 new_target = ix86_valid_target_attribute_tree (args, &func_options,
7411 &global_options_set);
7412
7413 new_optimize = build_optimization_node (&func_options);
7414
7415 if (new_target == error_mark_node)
7416 ret = false;
7417
7418 else if (fndecl && new_target)
7419 {
7420 DECL_FUNCTION_SPECIFIC_TARGET (fndecl) = new_target;
7421
7422 if (old_optimize != new_optimize)
7423 DECL_FUNCTION_SPECIFIC_OPTIMIZATION (fndecl) = new_optimize;
7424 }
7425
7426 finalize_options_struct (&func_options);
7427
7428 return ret;
7429 }
7430
7431 \f
7432 /* Hook to determine if one function can safely inline another. */
7433
7434 static bool
7435 ix86_can_inline_p (tree caller, tree callee)
7436 {
7437 bool ret = false;
7438 tree caller_tree = DECL_FUNCTION_SPECIFIC_TARGET (caller);
7439 tree callee_tree = DECL_FUNCTION_SPECIFIC_TARGET (callee);
7440
7441 /* If callee has no option attributes, then it is ok to inline. */
7442 if (!callee_tree)
7443 ret = true;
7444
7445 /* If the caller has no option attributes but the callee does, then it is
7446 not ok to inline. */
7447 else if (!caller_tree)
7448 ret = false;
7449
7450 else
7451 {
7452 struct cl_target_option *caller_opts = TREE_TARGET_OPTION (caller_tree);
7453 struct cl_target_option *callee_opts = TREE_TARGET_OPTION (callee_tree);
7454
7455 /* Callee's ISA options should be a subset of the caller's, i.e. an SSE4
7456 function can inline an SSE2 function, but an SSE2 function can't inline
7457 an SSE4 function. */
7458 if (((caller_opts->x_ix86_isa_flags & callee_opts->x_ix86_isa_flags)
7459 != callee_opts->x_ix86_isa_flags)
7460 || ((caller_opts->x_ix86_isa_flags2 & callee_opts->x_ix86_isa_flags2)
7461 != callee_opts->x_ix86_isa_flags2))
7462 ret = false;
7463
7464 /* See if we have the same non-isa options. */
7465 else if (caller_opts->x_target_flags != callee_opts->x_target_flags)
7466 ret = false;
7467
7468 /* See if arch, tune, etc. are the same. */
7469 else if (caller_opts->arch != callee_opts->arch)
7470 ret = false;
7471
7472 else if (caller_opts->tune != callee_opts->tune)
7473 ret = false;
7474
7475 else if (caller_opts->x_ix86_fpmath != callee_opts->x_ix86_fpmath)
7476 ret = false;
7477
7478 else if (caller_opts->branch_cost != callee_opts->branch_cost)
7479 ret = false;
7480
7481 else
7482 ret = true;
7483 }
7484
7485 return ret;
7486 }
7487
7488 \f
7489 /* Remember the last target of ix86_set_current_function. */
7490 static GTY(()) tree ix86_previous_fndecl;
7491
7492 /* Set targets globals to the default (or current #pragma GCC target
7493 if active). Invalidate ix86_previous_fndecl cache. */
7494
7495 void
7496 ix86_reset_previous_fndecl (void)
7497 {
7498 tree new_tree = target_option_current_node;
7499 cl_target_option_restore (&global_options, TREE_TARGET_OPTION (new_tree));
7500 if (TREE_TARGET_GLOBALS (new_tree))
7501 restore_target_globals (TREE_TARGET_GLOBALS (new_tree));
7502 else if (new_tree == target_option_default_node)
7503 restore_target_globals (&default_target_globals);
7504 else
7505 TREE_TARGET_GLOBALS (new_tree) = save_target_globals_default_opts ();
7506 ix86_previous_fndecl = NULL_TREE;
7507 }
7508
7509 /* Set the func_type field from the function FNDECL. */
7510
7511 static void
7512 ix86_set_func_type (tree fndecl)
7513 {
7514 if (cfun->machine->func_type == TYPE_UNKNOWN)
7515 {
7516 if (lookup_attribute ("interrupt",
7517 TYPE_ATTRIBUTES (TREE_TYPE (fndecl))))
7518 {
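          /* A handler declared with two arguments is an exception handler
             (the extra argument carries the error code); a one-argument
             handler is a plain interrupt handler.  */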
7519 int nargs = 0;
7520 for (tree arg = DECL_ARGUMENTS (fndecl);
7521 arg;
7522 arg = TREE_CHAIN (arg))
7523 nargs++;
7524 cfun->machine->no_caller_saved_registers = true;
7525 cfun->machine->func_type
7526 = nargs == 2 ? TYPE_EXCEPTION : TYPE_INTERRUPT;
7527
7528 ix86_optimize_mode_switching[X86_DIRFLAG] = 1;
7529
7530 /* Only dwarf2out.c can handle -WORD(AP) as a pointer argument. */
7531 if (write_symbols != NO_DEBUG && write_symbols != DWARF2_DEBUG)
7532 sorry ("Only DWARF debug format is supported for interrupt "
7533 "service routine.");
7534 }
7535 else
7536 {
7537 cfun->machine->func_type = TYPE_NORMAL;
7538 if (lookup_attribute ("no_caller_saved_registers",
7539 TYPE_ATTRIBUTES (TREE_TYPE (fndecl))))
7540 cfun->machine->no_caller_saved_registers = true;
7541 }
7542 }
7543 }
7544
7545 /* Establish appropriate back-end context for processing the function
7546 FNDECL. The argument might be NULL to indicate processing at top
7547 level, outside of any function scope. */
7548 static void
7549 ix86_set_current_function (tree fndecl)
7550 {
7551 /* Only change the context if the function changes. This hook is called
7552 several times in the course of compiling a function, and we don't want to
7553 slow things down too much or call target_reinit when it isn't safe. */
7554 if (fndecl == ix86_previous_fndecl)
7555 {
7556 /* There may be 2 function bodies for the same function FNDECL,
7557 one is extern inline and one isn't. Call ix86_set_func_type
7558 to set the func_type field. */
7559 if (fndecl != NULL_TREE)
7560 ix86_set_func_type (fndecl);
7561 return;
7562 }
7563
7564 tree old_tree;
7565 if (ix86_previous_fndecl == NULL_TREE)
7566 old_tree = target_option_current_node;
7567 else if (DECL_FUNCTION_SPECIFIC_TARGET (ix86_previous_fndecl))
7568 old_tree = DECL_FUNCTION_SPECIFIC_TARGET (ix86_previous_fndecl);
7569 else
7570 old_tree = target_option_default_node;
7571
7572 if (fndecl == NULL_TREE)
7573 {
7574 if (old_tree != target_option_current_node)
7575 ix86_reset_previous_fndecl ();
7576 return;
7577 }
7578
7579 ix86_set_func_type (fndecl);
7580
7581 tree new_tree = DECL_FUNCTION_SPECIFIC_TARGET (fndecl);
7582 if (new_tree == NULL_TREE)
7583 new_tree = target_option_default_node;
7584
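  /* Switch the global target state over to the new function's options
     when they differ from the previous context.  */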
7585 if (old_tree != new_tree)
7586 {
7587 cl_target_option_restore (&global_options, TREE_TARGET_OPTION (new_tree));
7588 if (TREE_TARGET_GLOBALS (new_tree))
7589 restore_target_globals (TREE_TARGET_GLOBALS (new_tree));
7590 else if (new_tree == target_option_default_node)
7591 restore_target_globals (&default_target_globals);
7592 else
7593 TREE_TARGET_GLOBALS (new_tree) = save_target_globals_default_opts ();
7594 }
7595 ix86_previous_fndecl = fndecl;
7596
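  /* Remember whether the previous function used no_caller_saved_registers
     so that we re-initialize the register sets only when that changes.  */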
7597 static bool prev_no_caller_saved_registers;
7598
7599 /* The 64-bit MS and SYSV ABIs have different sets of call-used registers.
7600 Avoid expensive re-initialization of init_regs each time we switch
7601 function context. */
7602 if (TARGET_64BIT
7603 && (call_used_regs[SI_REG]
7604 == (cfun->machine->call_abi == MS_ABI)))
7605 reinit_regs ();
7606 /* Need to re-initialize init_regs if caller-saved registers are
7607 changed. */
7608 else if (prev_no_caller_saved_registers
7609 != cfun->machine->no_caller_saved_registers)
7610 reinit_regs ();
7611
7612 if (cfun->machine->func_type != TYPE_NORMAL
7613 || cfun->machine->no_caller_saved_registers)
7614 {
7615 /* Don't allow MPX, SSE, MMX or x87 instructions since they
7616 may change the processor state. */
7617 const char *isa;
7618 if (TARGET_MPX)
7619 isa = "MPX";
7620 else if (TARGET_SSE)
7621 isa = "SSE";
7622 else if (TARGET_MMX)
7623 isa = "MMX/3Dnow";
7624 else if (TARGET_80387)
7625 isa = "80387";
7626 else
7627 isa = NULL;
7628 if (isa != NULL)
7629 {
7630 if (cfun->machine->func_type != TYPE_NORMAL)
7631 sorry ("%s instructions aren't allowed in %s service routine",
7632 isa, (cfun->machine->func_type == TYPE_EXCEPTION
7633 ? "exception" : "interrupt"));
7634 else
7635 sorry ("%s instructions aren't allowed in function with "
7636 "no_caller_saved_registers attribute", isa);
7637 /* Don't issue the same error twice. */
7638 cfun->machine->func_type = TYPE_NORMAL;
7639 cfun->machine->no_caller_saved_registers = false;
7640 }
7641 }
7642
7643 prev_no_caller_saved_registers
7644 = cfun->machine->no_caller_saved_registers;
7645 }
7646
7647 \f
7648 /* Return true if this goes in large data/bss. */
7649
7650 static bool
7651 ix86_in_large_data_p (tree exp)
7652 {
7653 if (ix86_cmodel != CM_MEDIUM && ix86_cmodel != CM_MEDIUM_PIC)
7654 return false;
7655
7656 if (exp == NULL_TREE)
7657 return false;
7658
7659 /* Functions are never large data. */
7660 if (TREE_CODE (exp) == FUNCTION_DECL)
7661 return false;
7662
7663 /* Automatic variables are never large data. */
7664 if (VAR_P (exp) && !is_global_var (exp))
7665 return false;
7666
7667 if (VAR_P (exp) && DECL_SECTION_NAME (exp))
7668 {
7669 const char *section = DECL_SECTION_NAME (exp);
7670 if (strcmp (section, ".ldata") == 0
7671 || strcmp (section, ".lbss") == 0)
7672 return true;
7673 return false;
7674 }
7675 else
7676 {
7677 HOST_WIDE_INT size = int_size_in_bytes (TREE_TYPE (exp));
7678
7679 /* If this is an incomplete type with size 0, then we can't put it
7680 in data because it might be too big when completed. Also,
7681 int_size_in_bytes returns -1 if the size can vary or is larger than
7682 an integer, in which case it is also safer to assume that it goes in
7683 large data. */
7684 if (size <= 0 || size > ix86_section_threshold)
7685 return true;
7686 }
7687
7688 return false;
7689 }
7690
7691 /* i386-specific section flag to mark large sections. */
7692 #define SECTION_LARGE SECTION_MACH_DEP
7693
7694 /* Switch to the appropriate section for output of DECL.
7695 DECL is either a `VAR_DECL' node or a constant of some sort.
7696 RELOC indicates whether forming the initial value of DECL requires
7697 link-time relocations. */
7698
7699 ATTRIBUTE_UNUSED static section *
7700 x86_64_elf_select_section (tree decl, int reloc,
7701 unsigned HOST_WIDE_INT align)
7702 {
7703 if (ix86_in_large_data_p (decl))
7704 {
7705 const char *sname = NULL;
7706 unsigned int flags = SECTION_WRITE | SECTION_LARGE;
7707 switch (categorize_decl_for_section (decl, reloc))
7708 {
7709 case SECCAT_DATA:
7710 sname = ".ldata";
7711 break;
7712 case SECCAT_DATA_REL:
7713 sname = ".ldata.rel";
7714 break;
7715 case SECCAT_DATA_REL_LOCAL:
7716 sname = ".ldata.rel.local";
7717 break;
7718 case SECCAT_DATA_REL_RO:
7719 sname = ".ldata.rel.ro";
7720 break;
7721 case SECCAT_DATA_REL_RO_LOCAL:
7722 sname = ".ldata.rel.ro.local";
7723 break;
7724 case SECCAT_BSS:
7725 sname = ".lbss";
7726 flags |= SECTION_BSS;
7727 break;
7728 case SECCAT_RODATA:
7729 case SECCAT_RODATA_MERGE_STR:
7730 case SECCAT_RODATA_MERGE_STR_INIT:
7731 case SECCAT_RODATA_MERGE_CONST:
7732 sname = ".lrodata";
7733 flags &= ~SECTION_WRITE;
7734 break;
7735 case SECCAT_SRODATA:
7736 case SECCAT_SDATA:
7737 case SECCAT_SBSS:
7738 gcc_unreachable ();
7739 case SECCAT_TEXT:
7740 case SECCAT_TDATA:
7741 case SECCAT_TBSS:
7742 /* We don't split these for the medium model. Place them into
7743 default sections and hope for the best. */
7744 break;
7745 }
7746 if (sname)
7747 {
7748 /* We might get called with string constants, but get_named_section
7749 doesn't like them as they are not DECLs. Also, we need to set
7750 flags in that case. */
7751 if (!DECL_P (decl))
7752 return get_section (sname, flags, NULL);
7753 return get_named_section (decl, sname, reloc);
7754 }
7755 }
7756 return default_elf_select_section (decl, reloc, align);
7757 }
7758
7759 /* Select a set of attributes for section NAME based on the properties
7760 of DECL and whether or not RELOC indicates that DECL's initializer
7761 might contain runtime relocations. */
7762
7763 static unsigned int ATTRIBUTE_UNUSED
7764 x86_64_elf_section_type_flags (tree decl, const char *name, int reloc)
7765 {
7766 unsigned int flags = default_section_type_flags (decl, name, reloc);
7767
7768 if (ix86_in_large_data_p (decl))
7769 flags |= SECTION_LARGE;
7770
7771 if (decl == NULL_TREE
7772 && (strcmp (name, ".ldata.rel.ro") == 0
7773 || strcmp (name, ".ldata.rel.ro.local") == 0))
7774 flags |= SECTION_RELRO;
7775
7776 if (strcmp (name, ".lbss") == 0
7777 || strncmp (name, ".lbss.", 6) == 0
7778 || strncmp (name, ".gnu.linkonce.lb.", 17) == 0)
7779 flags |= SECTION_BSS;
7780
7781 return flags;
7782 }
7783
7784 /* Build up a unique section name, expressed as a
7785 STRING_CST node, and assign it to DECL_SECTION_NAME (decl).
7786 RELOC indicates whether the initial value of DECL requires
7787 link-time relocations. */
7788
7789 static void ATTRIBUTE_UNUSED
7790 x86_64_elf_unique_section (tree decl, int reloc)
7791 {
7792 if (ix86_in_large_data_p (decl))
7793 {
7794 const char *prefix = NULL;
7795 /* We only need to use .gnu.linkonce if we don't have COMDAT groups. */
7796 bool one_only = DECL_COMDAT_GROUP (decl) && !HAVE_COMDAT_GROUP;
7797
7798 switch (categorize_decl_for_section (decl, reloc))
7799 {
7800 case SECCAT_DATA:
7801 case SECCAT_DATA_REL:
7802 case SECCAT_DATA_REL_LOCAL:
7803 case SECCAT_DATA_REL_RO:
7804 case SECCAT_DATA_REL_RO_LOCAL:
7805 prefix = one_only ? ".ld" : ".ldata";
7806 break;
7807 case SECCAT_BSS:
7808 prefix = one_only ? ".lb" : ".lbss";
7809 break;
7810 case SECCAT_RODATA:
7811 case SECCAT_RODATA_MERGE_STR:
7812 case SECCAT_RODATA_MERGE_STR_INIT:
7813 case SECCAT_RODATA_MERGE_CONST:
7814 prefix = one_only ? ".lr" : ".lrodata";
7815 break;
7816 case SECCAT_SRODATA:
7817 case SECCAT_SDATA:
7818 case SECCAT_SBSS:
7819 gcc_unreachable ();
7820 case SECCAT_TEXT:
7821 case SECCAT_TDATA:
7822 case SECCAT_TBSS:
7823 /* We don't split these for the medium model. Place them into
7824 default sections and hope for the best. */
7825 break;
7826 }
7827 if (prefix)
7828 {
7829 const char *name, *linkonce;
7830 char *string;
7831
7832 name = IDENTIFIER_POINTER (DECL_ASSEMBLER_NAME (decl));
7833 name = targetm.strip_name_encoding (name);
7834
7835 /* If we're using one_only, then there needs to be a .gnu.linkonce
7836 prefix to the section name. */
7837 linkonce = one_only ? ".gnu.linkonce" : "";
7838
7839 string = ACONCAT ((linkonce, prefix, ".", name, NULL));
7840
7841 set_decl_section_name (decl, string);
7842 return;
7843 }
7844 }
7845 default_unique_section (decl, reloc);
7846 }
7847
7848 #ifdef COMMON_ASM_OP
7849
7850 #ifndef LARGECOMM_SECTION_ASM_OP
7851 #define LARGECOMM_SECTION_ASM_OP "\t.largecomm\t"
7852 #endif
7853
7854 /* This says how to output assembler code to declare an
7855 uninitialized external linkage data object.
7856
7857 For medium-model x86-64 we need to use the LARGECOMM_SECTION_ASM_OP
7858 directive for large objects. */
7859 void
7860 x86_elf_aligned_decl_common (FILE *file, tree decl,
7861 const char *name, unsigned HOST_WIDE_INT size,
7862 int align)
7863 {
7864 if ((ix86_cmodel == CM_MEDIUM || ix86_cmodel == CM_MEDIUM_PIC)
7865 && size > (unsigned int)ix86_section_threshold)
7866 {
7867 switch_to_section (get_named_section (decl, ".lbss", 0));
7868 fputs (LARGECOMM_SECTION_ASM_OP, file);
7869 }
7870 else
7871 fputs (COMMON_ASM_OP, file);
7872 assemble_name (file, name);
7873 fprintf (file, "," HOST_WIDE_INT_PRINT_UNSIGNED ",%u\n",
7874 size, align / BITS_PER_UNIT);
7875 }
7876 #endif
7877
7878 /* Utility function for targets to use in implementing
7879 ASM_OUTPUT_ALIGNED_BSS. */
7880
7881 void
7882 x86_output_aligned_bss (FILE *file, tree decl, const char *name,
7883 unsigned HOST_WIDE_INT size, int align)
7884 {
7885 if ((ix86_cmodel == CM_MEDIUM || ix86_cmodel == CM_MEDIUM_PIC)
7886 && size > (unsigned int)ix86_section_threshold)
7887 switch_to_section (get_named_section (decl, ".lbss", 0));
7888 else
7889 switch_to_section (bss_section);
7890 ASM_OUTPUT_ALIGN (file, floor_log2 (align / BITS_PER_UNIT));
7891 #ifdef ASM_DECLARE_OBJECT_NAME
7892 last_assemble_variable_decl = decl;
7893 ASM_DECLARE_OBJECT_NAME (file, name, decl);
7894 #else
7895 /* Standard thing is just output label for the object. */
7896 ASM_OUTPUT_LABEL (file, name);
7897 #endif /* ASM_DECLARE_OBJECT_NAME */
7898 ASM_OUTPUT_SKIP (file, size ? size : 1);
7899 }
7900 \f
7901 /* Decide whether we must probe the stack before any space allocation
7902 on this target. It's essentially TARGET_STACK_PROBE except when
7903 -fstack-check causes the stack to be already probed differently. */
7904
7905 bool
7906 ix86_target_stack_probe (void)
7907 {
7908 /* Do not probe the stack twice if static stack checking is enabled. */
7909 if (flag_stack_check == STATIC_BUILTIN_STACK_CHECK)
7910 return false;
7911
7912 return TARGET_STACK_PROBE;
7913 }
7914 \f
7915 /* Decide whether we can make a sibling call to a function. DECL is the
7916 declaration of the function being targeted by the call and EXP is the
7917 CALL_EXPR representing the call. */
7918
7919 static bool
7920 ix86_function_ok_for_sibcall (tree decl, tree exp)
7921 {
7922 tree type, decl_or_type;
7923 rtx a, b;
7924 bool bind_global = decl && !targetm.binds_local_p (decl);
7925
7926 /* Sibling call isn't OK if there are no caller-saved registers
7927 since all registers must be preserved before return. */
7928 if (cfun->machine->no_caller_saved_registers)
7929 return false;
7930
7931 /* If we are generating position-independent code, we cannot sibcall
7932 optimize direct calls to global functions, as the PLT requires
7933 %ebx be live. (Darwin does not have a PLT.) */
7934 if (!TARGET_MACHO
7935 && !TARGET_64BIT
7936 && flag_pic
7937 && flag_plt
7938 && bind_global)
7939 return false;
7940
7941 /* If we need to align the outgoing stack, then sibcalling would
7942 unalign the stack, which may break the called function. */
7943 if (ix86_minimum_incoming_stack_boundary (true)
7944 < PREFERRED_STACK_BOUNDARY)
7945 return false;
7946
7947 if (decl)
7948 {
7949 decl_or_type = decl;
7950 type = TREE_TYPE (decl);
7951 }
7952 else
7953 {
7954 /* We're looking at the CALL_EXPR, we need the type of the function. */
7955 type = CALL_EXPR_FN (exp); /* pointer expression */
7956 type = TREE_TYPE (type); /* pointer type */
7957 type = TREE_TYPE (type); /* function type */
7958 decl_or_type = type;
7959 }
7960
7961 /* Check that the return value locations are the same. For example,
7962 if we are returning floats on the 80387 register stack, we cannot
7963 make a sibcall from a function that doesn't return a float to a
7964 function that does or, conversely, from a function that does return
7965 a float to a function that doesn't; the necessary stack adjustment
7966 would not be executed. This is also the place we notice
7967 differences in the return value ABI. Note that it is ok for one
7968 of the functions to have void return type as long as the return
7969 value of the other is passed in a register. */
7970 a = ix86_function_value (TREE_TYPE (exp), decl_or_type, false);
7971 b = ix86_function_value (TREE_TYPE (DECL_RESULT (cfun->decl)),
7972 cfun->decl, false);
7973 if (STACK_REG_P (a) || STACK_REG_P (b))
7974 {
7975 if (!rtx_equal_p (a, b))
7976 return false;
7977 }
7978 else if (VOID_TYPE_P (TREE_TYPE (DECL_RESULT (cfun->decl))))
7979 ;
7980 else if (!rtx_equal_p (a, b))
7981 return false;
7982
7983 if (TARGET_64BIT)
7984 {
7985 /* The SYSV ABI has more call-clobbered registers;
7986 disallow sibcalls from MS to SYSV. */
7987 if (cfun->machine->call_abi == MS_ABI
7988 && ix86_function_type_abi (type) == SYSV_ABI)
7989 return false;
7990 }
7991 else
7992 {
7993 /* If this call is indirect, we'll need to be able to use a
7994 call-clobbered register for the address of the target function.
7995 Make sure that all such registers are not used for passing
7996 parameters. Note that DLLIMPORT functions and calls to global
7997 functions via the GOT slot are indirect. */
7998 if (!decl
7999 || (bind_global && flag_pic && !flag_plt)
8000 || (TARGET_DLLIMPORT_DECL_ATTRIBUTES && DECL_DLLIMPORT_P (decl)))
8001 {
8002 /* Check if regparm >= 3 since arg_reg_available is set to
8003 false if regparm == 0. If regparm is 1 or 2, there is
8004 always a call-clobbered register available.
8005
8006 ??? The symbol indirect call doesn't need a call-clobbered
8007 register. But we don't know if this is a symbol indirect
8008 call or not here. */
8009 if (ix86_function_regparm (type, NULL) >= 3
8010 && !cfun->machine->arg_reg_available)
8011 return false;
8012 }
8013 }
8014
8015 /* Otherwise okay. That also includes certain types of indirect calls. */
8016 return true;
8017 }
8018
8019 /* Handle "cdecl", "stdcall", "fastcall", "regparm", "thiscall",
8020 and "sseregparm" calling convention attributes;
8021 arguments as in struct attribute_spec.handler. */
8022
8023 static tree
8024 ix86_handle_cconv_attribute (tree *node, tree name,
8025 tree args,
8026 int,
8027 bool *no_add_attrs)
8028 {
8029 if (TREE_CODE (*node) != FUNCTION_TYPE
8030 && TREE_CODE (*node) != METHOD_TYPE
8031 && TREE_CODE (*node) != FIELD_DECL
8032 && TREE_CODE (*node) != TYPE_DECL)
8033 {
8034 warning (OPT_Wattributes, "%qE attribute only applies to functions",
8035 name);
8036 *no_add_attrs = true;
8037 return NULL_TREE;
8038 }
8039
8040 /* Can combine regparm with all attributes but fastcall and thiscall. */
8041 if (is_attribute_p ("regparm", name))
8042 {
8043 tree cst;
8044
8045 if (lookup_attribute ("fastcall", TYPE_ATTRIBUTES (*node)))
8046 {
8047 error ("fastcall and regparm attributes are not compatible");
8048 }
8049
8050 if (lookup_attribute ("thiscall", TYPE_ATTRIBUTES (*node)))
8051 {
8052 error ("regparm and thiscall attributes are not compatible");
8053 }
8054
8055 cst = TREE_VALUE (args);
8056 if (TREE_CODE (cst) != INTEGER_CST)
8057 {
8058 warning (OPT_Wattributes,
8059 "%qE attribute requires an integer constant argument",
8060 name);
8061 *no_add_attrs = true;
8062 }
8063 else if (compare_tree_int (cst, REGPARM_MAX) > 0)
8064 {
8065 warning (OPT_Wattributes, "argument to %qE attribute larger than %d",
8066 name, REGPARM_MAX);
8067 *no_add_attrs = true;
8068 }
8069
8070 return NULL_TREE;
8071 }
8072
8073 if (TARGET_64BIT)
8074 {
8075 /* Do not warn when emulating the MS ABI. */
8076 if ((TREE_CODE (*node) != FUNCTION_TYPE
8077 && TREE_CODE (*node) != METHOD_TYPE)
8078 || ix86_function_type_abi (*node) != MS_ABI)
8079 warning (OPT_Wattributes, "%qE attribute ignored",
8080 name);
8081 *no_add_attrs = true;
8082 return NULL_TREE;
8083 }
8084
8085 /* Can combine fastcall with stdcall (redundant) and sseregparm. */
8086 if (is_attribute_p ("fastcall", name))
8087 {
8088 if (lookup_attribute ("cdecl", TYPE_ATTRIBUTES (*node)))
8089 {
8090 error ("fastcall and cdecl attributes are not compatible");
8091 }
8092 if (lookup_attribute ("stdcall", TYPE_ATTRIBUTES (*node)))
8093 {
8094 error ("fastcall and stdcall attributes are not compatible");
8095 }
8096 if (lookup_attribute ("regparm", TYPE_ATTRIBUTES (*node)))
8097 {
8098 error ("fastcall and regparm attributes are not compatible");
8099 }
8100 if (lookup_attribute ("thiscall", TYPE_ATTRIBUTES (*node)))
8101 {
8102 error ("fastcall and thiscall attributes are not compatible");
8103 }
8104 }
8105
8106 /* Can combine stdcall with fastcall (redundant), regparm and
8107 sseregparm. */
8108 else if (is_attribute_p ("stdcall", name))
8109 {
8110 if (lookup_attribute ("cdecl", TYPE_ATTRIBUTES (*node)))
8111 {
8112 error ("stdcall and cdecl attributes are not compatible");
8113 }
8114 if (lookup_attribute ("fastcall", TYPE_ATTRIBUTES (*node)))
8115 {
8116 error ("stdcall and fastcall attributes are not compatible");
8117 }
8118 if (lookup_attribute ("thiscall", TYPE_ATTRIBUTES (*node)))
8119 {
8120 error ("stdcall and thiscall attributes are not compatible");
8121 }
8122 }
8123
8124 /* Can combine cdecl with regparm and sseregparm. */
8125 else if (is_attribute_p ("cdecl", name))
8126 {
8127 if (lookup_attribute ("stdcall", TYPE_ATTRIBUTES (*node)))
8128 {
8129 error ("stdcall and cdecl attributes are not compatible");
8130 }
8131 if (lookup_attribute ("fastcall", TYPE_ATTRIBUTES (*node)))
8132 {
8133 error ("fastcall and cdecl attributes are not compatible");
8134 }
8135 if (lookup_attribute ("thiscall", TYPE_ATTRIBUTES (*node)))
8136 {
8137 error ("cdecl and thiscall attributes are not compatible");
8138 }
8139 }
8140 else if (is_attribute_p ("thiscall", name))
8141 {
8142 if (TREE_CODE (*node) != METHOD_TYPE && pedantic)
8143 warning (OPT_Wattributes, "%qE attribute is used for non-class method",
8144 name);
8145 if (lookup_attribute ("stdcall", TYPE_ATTRIBUTES (*node)))
8146 {
8147 error ("stdcall and thiscall attributes are not compatible");
8148 }
8149 if (lookup_attribute ("fastcall", TYPE_ATTRIBUTES (*node)))
8150 {
8151 error ("fastcall and thiscall attributes are not compatible");
8152 }
8153 if (lookup_attribute ("cdecl", TYPE_ATTRIBUTES (*node)))
8154 {
8155 error ("cdecl and thiscall attributes are not compatible");
8156 }
8157 }
8158
8159 /* Can combine sseregparm with all attributes. */
8160
8161 return NULL_TREE;
8162 }
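
/* Illustrative (added) examples of declarations this handler accepts or
   rejects on 32-bit targets; the function names are hypothetical:

     int f (int, int) __attribute__ ((regparm (3)));           // accepted
     int g (int, int) __attribute__ ((stdcall, sseregparm));   // accepted
     int h (int, int) __attribute__ ((fastcall, regparm (2))); // error: not compatible
*/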
8163
8164 /* The transactional memory builtins are implicitly regparm or fastcall
8165 depending on the ABI. Override the generic do-nothing attribute that
8166 these builtins were declared with, and replace it with one of the two
8167 attributes that we expect elsewhere. */
8168
8169 static tree
8170 ix86_handle_tm_regparm_attribute (tree *node, tree, tree,
8171 int flags, bool *no_add_attrs)
8172 {
8173 tree alt;
8174
8175 /* In no case do we want to add the placeholder attribute. */
8176 *no_add_attrs = true;
8177
8178 /* The 64-bit ABI is unchanged for transactional memory. */
8179 if (TARGET_64BIT)
8180 return NULL_TREE;
8181
8182 /* ??? Is there a better way to validate 32-bit Windows? We have
8183 cfun->machine->call_abi, but that seems to be set only for 64-bit. */
8184 if (CHECK_STACK_LIMIT > 0)
8185 alt = tree_cons (get_identifier ("fastcall"), NULL, NULL);
8186 else
8187 {
8188 alt = tree_cons (NULL, build_int_cst (NULL, 2), NULL);
8189 alt = tree_cons (get_identifier ("regparm"), alt, NULL);
8190 }
8191 decl_attributes (node, alt, flags);
8192
8193 return NULL_TREE;
8194 }
8195
8196 /* This function determines from TYPE the calling-convention. */
8197
8198 unsigned int
8199 ix86_get_callcvt (const_tree type)
8200 {
8201 unsigned int ret = 0;
8202 bool is_stdarg;
8203 tree attrs;
8204
8205 if (TARGET_64BIT)
8206 return IX86_CALLCVT_CDECL;
8207
8208 attrs = TYPE_ATTRIBUTES (type);
8209 if (attrs != NULL_TREE)
8210 {
8211 if (lookup_attribute ("cdecl", attrs))
8212 ret |= IX86_CALLCVT_CDECL;
8213 else if (lookup_attribute ("stdcall", attrs))
8214 ret |= IX86_CALLCVT_STDCALL;
8215 else if (lookup_attribute ("fastcall", attrs))
8216 ret |= IX86_CALLCVT_FASTCALL;
8217 else if (lookup_attribute ("thiscall", attrs))
8218 ret |= IX86_CALLCVT_THISCALL;
8219
8220 /* Regparm isn't allowed for thiscall and fastcall. */
8221 if ((ret & (IX86_CALLCVT_THISCALL | IX86_CALLCVT_FASTCALL)) == 0)
8222 {
8223 if (lookup_attribute ("regparm", attrs))
8224 ret |= IX86_CALLCVT_REGPARM;
8225 if (lookup_attribute ("sseregparm", attrs))
8226 ret |= IX86_CALLCVT_SSEREGPARM;
8227 }
8228
8229 if (IX86_BASE_CALLCVT(ret) != 0)
8230 return ret;
8231 }
8232
8233 is_stdarg = stdarg_p (type);
8234 if (TARGET_RTD && !is_stdarg)
8235 return IX86_CALLCVT_STDCALL | ret;
8236
8237 if (ret != 0
8238 || is_stdarg
8239 || TREE_CODE (type) != METHOD_TYPE
8240 || ix86_function_type_abi (type) != MS_ABI)
8241 return IX86_CALLCVT_CDECL | ret;
8242
8243 return IX86_CALLCVT_THISCALL;
8244 }
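
/* For illustration (added commentary): on a 32-bit target,
     int f (int) __attribute__ ((stdcall));
   yields IX86_CALLCVT_STDCALL, while with -mrtd a non-stdarg function with
   no explicit attribute is also treated as IX86_CALLCVT_STDCALL; a stdarg
   function with no attribute always falls back to IX86_CALLCVT_CDECL.  */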
8245
8246 /* Return 0 if the attributes for two types are incompatible, 1 if they
8247 are compatible, and 2 if they are nearly compatible (which causes a
8248 warning to be generated). */
8249
8250 static int
8251 ix86_comp_type_attributes (const_tree type1, const_tree type2)
8252 {
8253 unsigned int ccvt1, ccvt2;
8254
8255 if (TREE_CODE (type1) != FUNCTION_TYPE
8256 && TREE_CODE (type1) != METHOD_TYPE)
8257 return 1;
8258
8259 ccvt1 = ix86_get_callcvt (type1);
8260 ccvt2 = ix86_get_callcvt (type2);
8261 if (ccvt1 != ccvt2)
8262 return 0;
8263 if (ix86_function_regparm (type1, NULL)
8264 != ix86_function_regparm (type2, NULL))
8265 return 0;
8266
8267 return 1;
8268 }
8269 \f
8270 /* Return the regparm value for a function with the indicated TYPE and DECL.
8271 DECL may be NULL when calling function indirectly
8272 or considering a libcall. */
8273
8274 static int
8275 ix86_function_regparm (const_tree type, const_tree decl)
8276 {
8277 tree attr;
8278 int regparm;
8279 unsigned int ccvt;
8280
8281 if (TARGET_64BIT)
8282 return (ix86_function_type_abi (type) == SYSV_ABI
8283 ? X86_64_REGPARM_MAX : X86_64_MS_REGPARM_MAX);
8284 ccvt = ix86_get_callcvt (type);
8285 regparm = ix86_regparm;
8286
8287 if ((ccvt & IX86_CALLCVT_REGPARM) != 0)
8288 {
8289 attr = lookup_attribute ("regparm", TYPE_ATTRIBUTES (type));
8290 if (attr)
8291 {
8292 regparm = TREE_INT_CST_LOW (TREE_VALUE (TREE_VALUE (attr)));
8293 return regparm;
8294 }
8295 }
8296 else if ((ccvt & IX86_CALLCVT_FASTCALL) != 0)
8297 return 2;
8298 else if ((ccvt & IX86_CALLCVT_THISCALL) != 0)
8299 return 1;
8300
8301 /* Use register calling convention for local functions when possible. */
8302 if (decl
8303 && TREE_CODE (decl) == FUNCTION_DECL)
8304 {
8305 cgraph_node *target = cgraph_node::get (decl);
8306 if (target)
8307 target = target->function_symbol ();
8308
8309 /* Caller and callee must agree on the calling convention, so
8310 checking just the optimize flag here would mean that with
8311 __attribute__((optimize (...))) the caller could use the regparm
8312 convention and the callee not, or vice versa. Instead look at
8313 whether the callee is optimized or not. */
8314 if (target && opt_for_fn (target->decl, optimize)
8315 && !(profile_flag && !flag_fentry))
8316 {
8317 cgraph_local_info *i = &target->local;
8318 if (i && i->local && i->can_change_signature)
8319 {
8320 int local_regparm, globals = 0, regno;
8321
8322 /* Make sure no regparm register is taken by a
8323 fixed register variable. */
8324 for (local_regparm = 0; local_regparm < REGPARM_MAX;
8325 local_regparm++)
8326 if (fixed_regs[local_regparm])
8327 break;
8328
8329 /* We don't want to use regparm(3) for nested functions as
8330 these use a static chain pointer in the third argument. */
8331 if (local_regparm == 3 && DECL_STATIC_CHAIN (target->decl))
8332 local_regparm = 2;
8333
8334 /* Save a register for the split stack. */
8335 if (flag_split_stack)
8336 {
8337 if (local_regparm == 3)
8338 local_regparm = 2;
8339 else if (local_regparm == 2
8340 && DECL_STATIC_CHAIN (target->decl))
8341 local_regparm = 1;
8342 }
8343
8344 /* Each fixed register usage increases register pressure,
8345 so fewer registers should be used for argument passing.
8346 This functionality can be overridden by an explicit
8347 regparm value. */
8348 for (regno = AX_REG; regno <= DI_REG; regno++)
8349 if (fixed_regs[regno])
8350 globals++;
8351
8352 local_regparm
8353 = globals < local_regparm ? local_regparm - globals : 0;
8354
8355 if (local_regparm > regparm)
8356 regparm = local_regparm;
8357 }
8358 }
8359 }
8360
8361 return regparm;
8362 }
8363
8364 /* Return 1 or 2, if we can pass up to SSE_REGPARM_MAX SFmode (1) and
8365 DFmode (2) arguments in SSE registers for a function with the
8366 indicated TYPE and DECL. DECL may be NULL when calling a function
8367 indirectly or considering a libcall. Return -1 if any FP parameter
8368 should be rejected with an error. This is used in situations where we
8369 imply the SSE calling convention but the function is called from
8370 another function with SSE disabled. Otherwise return 0. */
8371
8372 static int
8373 ix86_function_sseregparm (const_tree type, const_tree decl, bool warn)
8374 {
8375 gcc_assert (!TARGET_64BIT);
8376
8377 /* Use SSE registers to pass SFmode and DFmode arguments if requested
8378 by the sseregparm attribute. */
8379 if (TARGET_SSEREGPARM
8380 || (type && lookup_attribute ("sseregparm", TYPE_ATTRIBUTES (type))))
8381 {
8382 if (!TARGET_SSE)
8383 {
8384 if (warn)
8385 {
8386 if (decl)
8387 error ("calling %qD with attribute sseregparm without "
8388 "SSE/SSE2 enabled", decl);
8389 else
8390 error ("calling %qT with attribute sseregparm without "
8391 "SSE/SSE2 enabled", type);
8392 }
8393 return 0;
8394 }
8395
8396 return 2;
8397 }
8398
8399 if (!decl)
8400 return 0;
8401
8402 cgraph_node *target = cgraph_node::get (decl);
8403 if (target)
8404 target = target->function_symbol ();
8405
8406 /* For local functions, pass up to SSE_REGPARM_MAX SFmode
8407 (and DFmode for SSE2) arguments in SSE registers. */
8408 if (target
8409 /* TARGET_SSE_MATH */
8410 && (target_opts_for_fn (target->decl)->x_ix86_fpmath & FPMATH_SSE)
8411 && opt_for_fn (target->decl, optimize)
8412 && !(profile_flag && !flag_fentry))
8413 {
8414 cgraph_local_info *i = &target->local;
8415 if (i && i->local && i->can_change_signature)
8416 {
8417 /* Refuse to produce wrong code when a local function with SSE enabled
8418 is called from an SSE-disabled function.
8419 FIXME: We need a way to detect these cases across ltrans partitions
8420 and avoid using SSE calling conventions on local functions called
8421 from functions with SSE disabled. For now at least delay the
8422 warning until we know we are going to produce wrong code.
8423 See PR66047. */
8424 if (!TARGET_SSE && warn)
8425 return -1;
8426 return TARGET_SSE2_P (target_opts_for_fn (target->decl)
8427 ->x_ix86_isa_flags) ? 2 : 1;
8428 }
8429 }
8430
8431 return 0;
8432 }
8433
8434 /* Return true if EAX is live at the start of the function. Used by
8435 ix86_expand_prologue to determine if we need special help before
8436 calling allocate_stack_worker. */
8437
8438 static bool
8439 ix86_eax_live_at_start_p (void)
8440 {
8441 /* Cheat. Don't bother working forward from ix86_function_regparm
8442 to the function type to whether an actual argument is located in
8443 eax. Instead just look at cfg info, which is still close enough
8444 to correct at this point. This gives false positives for broken
8445 functions that might use uninitialized data that happens to be
8446 allocated in eax, but who cares? */
8447 return REGNO_REG_SET_P (df_get_live_out (ENTRY_BLOCK_PTR_FOR_FN (cfun)), 0);
8448 }
8449
8450 static bool
8451 ix86_keep_aggregate_return_pointer (tree fntype)
8452 {
8453 tree attr;
8454
8455 if (!TARGET_64BIT)
8456 {
8457 attr = lookup_attribute ("callee_pop_aggregate_return",
8458 TYPE_ATTRIBUTES (fntype));
8459 if (attr)
8460 return (TREE_INT_CST_LOW (TREE_VALUE (TREE_VALUE (attr))) == 0);
8461
8462 /* For 32-bit MS-ABI the default is to keep aggregate
8463 return pointer. */
8464 if (ix86_function_type_abi (fntype) == MS_ABI)
8465 return true;
8466 }
8467 return KEEP_AGGREGATE_RETURN_POINTER != 0;
8468 }
8469
8470 /* Value is the number of bytes of arguments automatically
8471 popped when returning from a subroutine call.
8472 FUNDECL is the declaration node of the function (as a tree),
8473 FUNTYPE is the data type of the function (as a tree),
8474 or for a library call it is an identifier node for the subroutine name.
8475 SIZE is the number of bytes of arguments passed on the stack.
8476
8477 On the 80386, the RTD insn may be used to pop them if the number
8478 of args is fixed, but if the number is variable then the caller
8479 must pop them all. RTD can't be used for library calls now
8480 because the library is compiled with the Unix compiler.
8481 Use of RTD is a selectable option, since it is incompatible with
8482 standard Unix calling sequences. If the option is not selected,
8483 the caller must always pop the args.
8484
8485 The attribute stdcall is equivalent to RTD on a per module basis. */
8486
8487 static int
8488 ix86_return_pops_args (tree fundecl, tree funtype, int size)
8489 {
8490 unsigned int ccvt;
8491
8492 /* None of the 64-bit ABIs pop arguments. */
8493 if (TARGET_64BIT)
8494 return 0;
8495
8496 ccvt = ix86_get_callcvt (funtype);
8497
8498 if ((ccvt & (IX86_CALLCVT_STDCALL | IX86_CALLCVT_FASTCALL
8499 | IX86_CALLCVT_THISCALL)) != 0
8500 && ! stdarg_p (funtype))
8501 return size;
8502
8503 /* Lose any fake structure return argument if it is passed on the stack. */
8504 if (aggregate_value_p (TREE_TYPE (funtype), fundecl)
8505 && !ix86_keep_aggregate_return_pointer (funtype))
8506 {
8507 int nregs = ix86_function_regparm (funtype, fundecl);
8508 if (nregs == 0)
8509 return GET_MODE_SIZE (Pmode);
8510 }
8511
8512 return 0;
8513 }
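
/* Worked example (added commentary, hypothetical declaration): for
     int f (int, int) __attribute__ ((stdcall));
   a 32-bit call site passes 8 bytes on the stack and this hook returns 8,
   so the callee pops its own arguments (e.g. via "ret $8"); a plain cdecl
   or stdarg function yields 0 and the caller pops them.  */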
8514
8515 /* Implement the TARGET_LEGITIMATE_COMBINED_INSN hook. */
8516
8517 static bool
8518 ix86_legitimate_combined_insn (rtx_insn *insn)
8519 {
8520 int i;
8521
8522 /* Check operand constraints in case hard registers were propagated
8523 into insn pattern. This check prevents combine pass from
8524 generating insn patterns with invalid hard register operands.
8525 These invalid insns can eventually confuse reload to error out
8526 with a spill failure. See also PRs 46829 and 46843. */
8527
8528 gcc_assert (INSN_CODE (insn) >= 0);
8529
8530 extract_insn (insn);
8531 preprocess_constraints (insn);
8532
8533 int n_operands = recog_data.n_operands;
8534 int n_alternatives = recog_data.n_alternatives;
8535 for (i = 0; i < n_operands; i++)
8536 {
8537 rtx op = recog_data.operand[i];
8538 machine_mode mode = GET_MODE (op);
8539 const operand_alternative *op_alt;
8540 int offset = 0;
8541 bool win;
8542 int j;
8543
8544 /* A unary operator may be accepted by the predicate, but it
8545 is irrelevant for matching constraints. */
8546 if (UNARY_P (op))
8547 op = XEXP (op, 0);
8548
8549 if (SUBREG_P (op))
8550 {
8551 if (REG_P (SUBREG_REG (op))
8552 && REGNO (SUBREG_REG (op)) < FIRST_PSEUDO_REGISTER)
8553 offset = subreg_regno_offset (REGNO (SUBREG_REG (op)),
8554 GET_MODE (SUBREG_REG (op)),
8555 SUBREG_BYTE (op),
8556 GET_MODE (op));
8557 op = SUBREG_REG (op);
8558 }
8559
8560 if (!(REG_P (op) && HARD_REGISTER_P (op)))
8561 continue;
8562
8563 op_alt = recog_op_alt;
8564
8565 /* Operand has no constraints, anything is OK. */
8566 win = !n_alternatives;
8567
8568 alternative_mask preferred = get_preferred_alternatives (insn);
8569 for (j = 0; j < n_alternatives; j++, op_alt += n_operands)
8570 {
8571 if (!TEST_BIT (preferred, j))
8572 continue;
8573 if (op_alt[i].anything_ok
8574 || (op_alt[i].matches != -1
8575 && operands_match_p
8576 (recog_data.operand[i],
8577 recog_data.operand[op_alt[i].matches]))
8578 || reg_fits_class_p (op, op_alt[i].cl, offset, mode))
8579 {
8580 win = true;
8581 break;
8582 }
8583 }
8584
8585 if (!win)
8586 return false;
8587 }
8588
8589 return true;
8590 }
8591 \f
8592 /* Implement the TARGET_ASAN_SHADOW_OFFSET hook. */
8593
8594 static unsigned HOST_WIDE_INT
8595 ix86_asan_shadow_offset (void)
8596 {
8597 return TARGET_LP64 ? (TARGET_MACHO ? (HOST_WIDE_INT_1 << 44)
8598 : HOST_WIDE_INT_C (0x7fff8000))
8599 : (HOST_WIDE_INT_1 << 29);
8600 }
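
/* Added note: AddressSanitizer maps an application address A to shadow
   memory roughly as (A >> 3) + offset, so with the LP64 Linux value
   returned above the shadow byte for A lives at (A >> 3) + 0x7fff8000.
   (Illustrative only; the shift comes from the generic asan machinery.)  */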
8601 \f
8602 /* Argument support functions. */
8603
8604 /* Return true when register may be used to pass function parameters. */
8605 bool
8606 ix86_function_arg_regno_p (int regno)
8607 {
8608 int i;
8609 enum calling_abi call_abi;
8610 const int *parm_regs;
8611
8612 if (TARGET_MPX && BND_REGNO_P (regno))
8613 return true;
8614
8615 if (!TARGET_64BIT)
8616 {
8617 if (TARGET_MACHO)
8618 return (regno < REGPARM_MAX
8619 || (TARGET_SSE && SSE_REGNO_P (regno) && !fixed_regs[regno]));
8620 else
8621 return (regno < REGPARM_MAX
8622 || (TARGET_MMX && MMX_REGNO_P (regno)
8623 && (regno < FIRST_MMX_REG + MMX_REGPARM_MAX))
8624 || (TARGET_SSE && SSE_REGNO_P (regno)
8625 && (regno < FIRST_SSE_REG + SSE_REGPARM_MAX)));
8626 }
8627
8628 if (TARGET_SSE && SSE_REGNO_P (regno)
8629 && (regno < FIRST_SSE_REG + SSE_REGPARM_MAX))
8630 return true;
8631
8632 /* TODO: The function should depend on current function ABI but
8633 builtins.c would need updating then. Therefore we use the
8634 default ABI. */
8635 call_abi = ix86_cfun_abi ();
8636
8637 /* RAX is used as a hidden argument to va_arg functions. */
8638 if (call_abi == SYSV_ABI && regno == AX_REG)
8639 return true;
8640
8641 if (call_abi == MS_ABI)
8642 parm_regs = x86_64_ms_abi_int_parameter_registers;
8643 else
8644 parm_regs = x86_64_int_parameter_registers;
8645
8646 for (i = 0; i < (call_abi == MS_ABI
8647 ? X86_64_MS_REGPARM_MAX : X86_64_REGPARM_MAX); i++)
8648 if (regno == parm_regs[i])
8649 return true;
8650 return false;
8651 }
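
/* For illustration (added commentary): on 64-bit SYSV targets this returns
   true for the integer argument registers RDI, RSI, RDX, RCX, R8 and R9,
   for the SSE argument registers, and for RAX (the hidden vector-register
   count for varargs calls); under the MS ABI the integer set is RCX, RDX,
   R8 and R9.  */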
8652
8653 /* Return true if we do not know how to pass TYPE solely in registers. */
8654
8655 static bool
8656 ix86_must_pass_in_stack (machine_mode mode, const_tree type)
8657 {
8658 if (must_pass_in_stack_var_size_or_pad (mode, type))
8659 return true;
8660
8661 /* For 32-bit, we want TImode aggregates to go on the stack. But watch out!
8662 The layout_type routine is crafty and tries to trick us into passing
8663 currently unsupported vector types on the stack by using TImode. */
8664 return (!TARGET_64BIT && mode == TImode
8665 && type && TREE_CODE (type) != VECTOR_TYPE);
8666 }
8667
8668 /* Return the size, in bytes, of the area reserved for arguments passed
8669 in registers for the function represented by FNDECL, depending on the
8670 ABI used. */
8671 int
8672 ix86_reg_parm_stack_space (const_tree fndecl)
8673 {
8674 enum calling_abi call_abi = SYSV_ABI;
8675 if (fndecl != NULL_TREE && TREE_CODE (fndecl) == FUNCTION_DECL)
8676 call_abi = ix86_function_abi (fndecl);
8677 else
8678 call_abi = ix86_function_type_abi (fndecl);
8679 if (TARGET_64BIT && call_abi == MS_ABI)
8680 return 32;
8681 return 0;
8682 }
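
/* Added note: the 32 bytes returned for the 64-bit MS ABI correspond to the
   "home" / shadow area that a caller must reserve on the stack for the four
   register arguments; SYSV callers reserve no such area.  */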
8683
8684 /* We add this as a workaround in order to use the libc_has_function
8685 hook in i386.md. */
8686 bool
8687 ix86_libc_has_function (enum function_class fn_class)
8688 {
8689 return targetm.libc_has_function (fn_class);
8690 }
8691
8692 /* Return SYSV_ABI or MS_ABI, depending on FNTYPE, specifying the
8693 call ABI used. */
8694 enum calling_abi
8695 ix86_function_type_abi (const_tree fntype)
8696 {
8697 enum calling_abi abi = ix86_abi;
8698
8699 if (fntype == NULL_TREE || TYPE_ATTRIBUTES (fntype) == NULL_TREE)
8700 return abi;
8701
8702 if (abi == SYSV_ABI
8703 && lookup_attribute ("ms_abi", TYPE_ATTRIBUTES (fntype)))
8704 {
8705 if (TARGET_X32)
8706 error ("X32 does not support ms_abi attribute");
8707
8708 abi = MS_ABI;
8709 }
8710 else if (abi == MS_ABI
8711 && lookup_attribute ("sysv_abi", TYPE_ATTRIBUTES (fntype)))
8712 abi = SYSV_ABI;
8713
8714 return abi;
8715 }
8716
8717 static enum calling_abi
8718 ix86_function_abi (const_tree fndecl)
8719 {
8720 return fndecl ? ix86_function_type_abi (TREE_TYPE (fndecl)) : ix86_abi;
8721 }
8722
8723 /* Return SYSV_ABI or MS_ABI, depending on cfun, specifying the
8724 call ABI used. */
8725 enum calling_abi
8726 ix86_cfun_abi (void)
8727 {
8728 return cfun ? cfun->machine->call_abi : ix86_abi;
8729 }
8730
8731 static bool
8732 ix86_function_ms_hook_prologue (const_tree fn)
8733 {
8734 if (fn && lookup_attribute ("ms_hook_prologue", DECL_ATTRIBUTES (fn)))
8735 {
8736 if (decl_function_context (fn) != NULL_TREE)
8737 error_at (DECL_SOURCE_LOCATION (fn),
8738 "ms_hook_prologue is not compatible with nested function");
8739 else
8740 return true;
8741 }
8742 return false;
8743 }
8744
8745 /* Write the extra assembler code needed to declare a function properly. */
8746
8747 void
8748 ix86_asm_output_function_label (FILE *asm_out_file, const char *fname,
8749 tree decl)
8750 {
8751 bool is_ms_hook = ix86_function_ms_hook_prologue (decl);
8752
8753 if (is_ms_hook)
8754 {
8755 int i, filler_count = (TARGET_64BIT ? 32 : 16);
8756 unsigned int filler_cc = 0xcccccccc;
8757
8758 for (i = 0; i < filler_count; i += 4)
8759 fprintf (asm_out_file, ASM_LONG " %#x\n", filler_cc);
8760 }
8761
8762 #ifdef SUBTARGET_ASM_UNWIND_INIT
8763 SUBTARGET_ASM_UNWIND_INIT (asm_out_file);
8764 #endif
8765
8766 ASM_OUTPUT_LABEL (asm_out_file, fname);
8767
8768 /* Output magic byte marker, if hot-patch attribute is set. */
8769 if (is_ms_hook)
8770 {
8771 if (TARGET_64BIT)
8772 {
8773 /* leaq [%rsp + 0], %rsp */
8774 asm_fprintf (asm_out_file, ASM_BYTE
8775 "0x48, 0x8d, 0xa4, 0x24, 0x00, 0x00, 0x00, 0x00\n");
8776 }
8777 else
8778 {
8779 /* movl.s %edi, %edi
8780 push %ebp
8781 movl.s %esp, %ebp */
8782 asm_fprintf (asm_out_file, ASM_BYTE
8783 "0x8b, 0xff, 0x55, 0x8b, 0xec\n");
8784 }
8785 }
8786 }
8787
8788 /* Implementation of the call ABI switching target hook. The call
8789 register sets specific to FNDECL are selected. See also
8790 ix86_conditional_register_usage for more details. */
8791 void
8792 ix86_call_abi_override (const_tree fndecl)
8793 {
8794 cfun->machine->call_abi = ix86_function_abi (fndecl);
8795 }
8796
8797 /* Return true if a pseudo register should be created and used to hold
8798 the GOT address for PIC code. */
8799 bool
8800 ix86_use_pseudo_pic_reg (void)
8801 {
8802 if ((TARGET_64BIT
8803 && (ix86_cmodel == CM_SMALL_PIC
8804 || TARGET_PECOFF))
8805 || !flag_pic)
8806 return false;
8807 return true;
8808 }
8809
8810 /* Initialize large model PIC register. */
8811
8812 static void
8813 ix86_init_large_pic_reg (unsigned int tmp_regno)
8814 {
8815 rtx_code_label *label;
8816 rtx tmp_reg;
8817
8818 gcc_assert (Pmode == DImode);
8819 label = gen_label_rtx ();
8820 emit_label (label);
8821 LABEL_PRESERVE_P (label) = 1;
8822 tmp_reg = gen_rtx_REG (Pmode, tmp_regno);
8823 gcc_assert (REGNO (pic_offset_table_rtx) != tmp_regno);
8824 emit_insn (gen_set_rip_rex64 (pic_offset_table_rtx,
8825 label));
8826 emit_insn (gen_set_got_offset_rex64 (tmp_reg, label));
8827 emit_insn (ix86_gen_add3 (pic_offset_table_rtx,
8828 pic_offset_table_rtx, tmp_reg));
8829 }
8830
8831 /* Create and initialize PIC register if required. */
8832 static void
8833 ix86_init_pic_reg (void)
8834 {
8835 edge entry_edge;
8836 rtx_insn *seq;
8837
8838 if (!ix86_use_pseudo_pic_reg ())
8839 return;
8840
8841 start_sequence ();
8842
8843 if (TARGET_64BIT)
8844 {
8845 if (ix86_cmodel == CM_LARGE_PIC)
8846 ix86_init_large_pic_reg (R11_REG);
8847 else
8848 emit_insn (gen_set_got_rex64 (pic_offset_table_rtx));
8849 }
8850 else
8851 {
8852 /* If there is a future mcount call in the function, it is more profitable
8853 to emit SET_GOT into the ABI-defined REAL_PIC_OFFSET_TABLE_REGNUM. */
8854 rtx reg = crtl->profile
8855 ? gen_rtx_REG (Pmode, REAL_PIC_OFFSET_TABLE_REGNUM)
8856 : pic_offset_table_rtx;
8857 rtx_insn *insn = emit_insn (gen_set_got (reg));
8858 RTX_FRAME_RELATED_P (insn) = 1;
8859 if (crtl->profile)
8860 emit_move_insn (pic_offset_table_rtx, reg);
8861 add_reg_note (insn, REG_CFA_FLUSH_QUEUE, NULL_RTX);
8862 }
8863
8864 seq = get_insns ();
8865 end_sequence ();
8866
8867 entry_edge = single_succ_edge (ENTRY_BLOCK_PTR_FOR_FN (cfun));
8868 insert_insn_on_edge (seq, entry_edge);
8869 commit_one_edge_insertion (entry_edge);
8870 }
8871
8872 /* Initialize a variable CUM of type CUMULATIVE_ARGS
8873 for a call to a function whose data type is FNTYPE.
8874 For a library call, FNTYPE is 0. */
8875
8876 void
8877 init_cumulative_args (CUMULATIVE_ARGS *cum, /* Argument info to initialize */
8878 tree fntype, /* tree ptr for function decl */
8879 rtx libname, /* SYMBOL_REF of library name or 0 */
8880 tree fndecl,
8881 int caller)
8882 {
8883 struct cgraph_local_info *i = NULL;
8884 struct cgraph_node *target = NULL;
8885
8886 memset (cum, 0, sizeof (*cum));
8887
8888 if (fndecl)
8889 {
8890 target = cgraph_node::get (fndecl);
8891 if (target)
8892 {
8893 target = target->function_symbol ();
8894 i = cgraph_node::local_info (target->decl);
8895 cum->call_abi = ix86_function_abi (target->decl);
8896 }
8897 else
8898 cum->call_abi = ix86_function_abi (fndecl);
8899 }
8900 else
8901 cum->call_abi = ix86_function_type_abi (fntype);
8902
8903 cum->caller = caller;
8904
8905 /* Set up the number of registers to use for passing arguments. */
8906 cum->nregs = ix86_regparm;
8907 if (TARGET_64BIT)
8908 {
8909 cum->nregs = (cum->call_abi == SYSV_ABI
8910 ? X86_64_REGPARM_MAX
8911 : X86_64_MS_REGPARM_MAX);
8912 }
8913 if (TARGET_SSE)
8914 {
8915 cum->sse_nregs = SSE_REGPARM_MAX;
8916 if (TARGET_64BIT)
8917 {
8918 cum->sse_nregs = (cum->call_abi == SYSV_ABI
8919 ? X86_64_SSE_REGPARM_MAX
8920 : X86_64_MS_SSE_REGPARM_MAX);
8921 }
8922 }
8923 if (TARGET_MMX)
8924 cum->mmx_nregs = MMX_REGPARM_MAX;
8925 cum->warn_avx512f = true;
8926 cum->warn_avx = true;
8927 cum->warn_sse = true;
8928 cum->warn_mmx = true;
8929
8930 /* Because types might mismatch between caller and callee, we need to
8931 use the actual type of the function for local calls.
8932 FIXME: cgraph_analyze can be told to actually record if a function uses
8933 va_start so that for local functions maybe_vaarg can be made aggressive,
8934 helping K&R code.
8935 FIXME: once the type system is fixed, we won't need this code anymore. */
8936 if (i && i->local && i->can_change_signature)
8937 fntype = TREE_TYPE (target->decl);
8938 cum->stdarg = stdarg_p (fntype);
8939 cum->maybe_vaarg = (fntype
8940 ? (!prototype_p (fntype) || stdarg_p (fntype))
8941 : !libname);
8942
8943 cum->bnd_regno = FIRST_BND_REG;
8944 cum->bnds_in_bt = 0;
8945 cum->force_bnd_pass = 0;
8946 cum->decl = fndecl;
8947
8948 if (!TARGET_64BIT)
8949 {
8950 /* If there are variable arguments, then we won't pass anything
8951 in registers in 32-bit mode. */
8952 if (stdarg_p (fntype))
8953 {
8954 cum->nregs = 0;
8955 /* Since in 32-bit mode variable arguments are always passed on
8956 the stack, there is a scratch register available for an indirect
8957 sibcall. */
8958 cfun->machine->arg_reg_available = true;
8959 cum->sse_nregs = 0;
8960 cum->mmx_nregs = 0;
8961 cum->warn_avx512f = false;
8962 cum->warn_avx = false;
8963 cum->warn_sse = false;
8964 cum->warn_mmx = false;
8965 return;
8966 }
8967
8968 /* Use ecx and edx registers if function has fastcall attribute,
8969 else look for regparm information. */
8970 if (fntype)
8971 {
8972 unsigned int ccvt = ix86_get_callcvt (fntype);
8973 if ((ccvt & IX86_CALLCVT_THISCALL) != 0)
8974 {
8975 cum->nregs = 1;
8976 cum->fastcall = 1; /* Same first register as in fastcall. */
8977 }
8978 else if ((ccvt & IX86_CALLCVT_FASTCALL) != 0)
8979 {
8980 cum->nregs = 2;
8981 cum->fastcall = 1;
8982 }
8983 else
8984 cum->nregs = ix86_function_regparm (fntype, fndecl);
8985 }
8986
8987 /* Set up the number of SSE registers used for passing SFmode
8988 and DFmode arguments. Warn for mismatching ABI. */
8989 cum->float_in_sse = ix86_function_sseregparm (fntype, fndecl, true);
8990 }
8991
8992 cfun->machine->arg_reg_available = (cum->nregs > 0);
8993 }
8994
8995 /* Return the "natural" mode for TYPE. In most cases, this is just TYPE_MODE.
8996 But in the case of vector types, it is some vector mode.
8997
8998 When we have only some of our vector isa extensions enabled, then there
8999 are some modes for which vector_mode_supported_p is false. For these
9000 modes, the generic vector support in gcc will choose some non-vector mode
9001 in order to implement the type. By computing the natural mode, we'll
9002 select the proper ABI location for the operand and not depend on whatever
9003 the middle-end decides to do with these vector types.
9004
9005 The middle-end can't deal with vector types > 16 bytes. In this
9006 case, we return the original mode and warn about the ABI change if
9007 CUM isn't NULL.
9008
9009 If IN_RETURN is true, warn about the ABI change if the vector mode
9010 isn't available for the function return value. */
9011
9012 static machine_mode
9013 type_natural_mode (const_tree type, const CUMULATIVE_ARGS *cum,
9014 bool in_return)
9015 {
9016 machine_mode mode = TYPE_MODE (type);
9017
9018 if (TREE_CODE (type) == VECTOR_TYPE && !VECTOR_MODE_P (mode))
9019 {
9020 HOST_WIDE_INT size = int_size_in_bytes (type);
9021 if ((size == 8 || size == 16 || size == 32 || size == 64)
9022 /* ??? Generic code allows us to create width 1 vectors. Ignore. */
9023 && TYPE_VECTOR_SUBPARTS (type) > 1)
9024 {
9025 machine_mode innermode = TYPE_MODE (TREE_TYPE (type));
9026
9027 /* There are no XFmode vector modes. */
9028 if (innermode == XFmode)
9029 return mode;
9030
9031 if (TREE_CODE (TREE_TYPE (type)) == REAL_TYPE)
9032 mode = MIN_MODE_VECTOR_FLOAT;
9033 else
9034 mode = MIN_MODE_VECTOR_INT;
9035
9036 /* Get the mode which has this inner mode and number of units. */
9037 for (; mode != VOIDmode; mode = GET_MODE_WIDER_MODE (mode))
9038 if (GET_MODE_NUNITS (mode) == TYPE_VECTOR_SUBPARTS (type)
9039 && GET_MODE_INNER (mode) == innermode)
9040 {
9041 if (size == 64 && !TARGET_AVX512F && !TARGET_IAMCU)
9042 {
9043 static bool warnedavx512f;
9044 static bool warnedavx512f_ret;
9045
9046 if (cum && cum->warn_avx512f && !warnedavx512f)
9047 {
9048 if (warning (OPT_Wpsabi, "AVX512F vector argument "
9049 "without AVX512F enabled changes the ABI"))
9050 warnedavx512f = true;
9051 }
9052 else if (in_return && !warnedavx512f_ret)
9053 {
9054 if (warning (OPT_Wpsabi, "AVX512F vector return "
9055 "without AVX512F enabled changes the ABI"))
9056 warnedavx512f_ret = true;
9057 }
9058
9059 return TYPE_MODE (type);
9060 }
9061 else if (size == 32 && !TARGET_AVX && !TARGET_IAMCU)
9062 {
9063 static bool warnedavx;
9064 static bool warnedavx_ret;
9065
9066 if (cum && cum->warn_avx && !warnedavx)
9067 {
9068 if (warning (OPT_Wpsabi, "AVX vector argument "
9069 "without AVX enabled changes the ABI"))
9070 warnedavx = true;
9071 }
9072 else if (in_return && !warnedavx_ret)
9073 {
9074 if (warning (OPT_Wpsabi, "AVX vector return "
9075 "without AVX enabled changes the ABI"))
9076 warnedavx_ret = true;
9077 }
9078
9079 return TYPE_MODE (type);
9080 }
9081 else if (((size == 8 && TARGET_64BIT) || size == 16)
9082 && !TARGET_SSE
9083 && !TARGET_IAMCU)
9084 {
9085 static bool warnedsse;
9086 static bool warnedsse_ret;
9087
9088 if (cum && cum->warn_sse && !warnedsse)
9089 {
9090 if (warning (OPT_Wpsabi, "SSE vector argument "
9091 "without SSE enabled changes the ABI"))
9092 warnedsse = true;
9093 }
9094 else if (!TARGET_64BIT && in_return && !warnedsse_ret)
9095 {
9096 if (warning (OPT_Wpsabi, "SSE vector return "
9097 "without SSE enabled changes the ABI"))
9098 warnedsse_ret = true;
9099 }
9100 }
9101 else if ((size == 8 && !TARGET_64BIT)
9102 && (!cfun
9103 || cfun->machine->func_type == TYPE_NORMAL)
9104 && !TARGET_MMX
9105 && !TARGET_IAMCU)
9106 {
9107 static bool warnedmmx;
9108 static bool warnedmmx_ret;
9109
9110 if (cum && cum->warn_mmx && !warnedmmx)
9111 {
9112 if (warning (OPT_Wpsabi, "MMX vector argument "
9113 "without MMX enabled changes the ABI"))
9114 warnedmmx = true;
9115 }
9116 else if (in_return && !warnedmmx_ret)
9117 {
9118 if (warning (OPT_Wpsabi, "MMX vector return "
9119 "without MMX enabled changes the ABI"))
9120 warnedmmx_ret = true;
9121 }
9122 }
9123 return mode;
9124 }
9125
9126 gcc_unreachable ();
9127 }
9128 }
9129
9130 return mode;
9131 }
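
/* Illustrative example (added commentary, hypothetical typedef): given
     typedef int v8si __attribute__ ((vector_size (32)));
   with AVX disabled, no 32-byte vector mode is available, so the function
   above warns once ("AVX vector argument without AVX enabled changes the
   ABI") and falls back to returning the original TYPE_MODE.  */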
9132
9133 /* We want to pass a value in REGNO whose "natural" mode is MODE. However,
9134 this may not agree with the mode that the type system has chosen for the
9135 register, which is ORIG_MODE. If ORIG_MODE is not BLKmode, then we can
9136 go ahead and use it. Otherwise we have to build a PARALLEL instead. */
9137
9138 static rtx
9139 gen_reg_or_parallel (machine_mode mode, machine_mode orig_mode,
9140 unsigned int regno)
9141 {
9142 rtx tmp;
9143
9144 if (orig_mode != BLKmode)
9145 tmp = gen_rtx_REG (orig_mode, regno);
9146 else
9147 {
9148 tmp = gen_rtx_REG (mode, regno);
9149 tmp = gen_rtx_EXPR_LIST (VOIDmode, tmp, const0_rtx);
9150 tmp = gen_rtx_PARALLEL (orig_mode, gen_rtvec (1, tmp));
9151 }
9152
9153 return tmp;
9154 }
9155
9156 /* x86-64 register passing implementation. See the x86-64 ABI for details.
9157 The goal of this code is to classify each eightbyte of an incoming argument
9158 by register class and assign registers accordingly. */
9159
9160 /* Return the union class of CLASS1 and CLASS2.
9161 See the x86-64 PS ABI for details. */
9162
9163 static enum x86_64_reg_class
9164 merge_classes (enum x86_64_reg_class class1, enum x86_64_reg_class class2)
9165 {
9166 /* Rule #1: If both classes are equal, this is the resulting class. */
9167 if (class1 == class2)
9168 return class1;
9169
9170 /* Rule #2: If one of the classes is NO_CLASS, the resulting class is
9171 the other class. */
9172 if (class1 == X86_64_NO_CLASS)
9173 return class2;
9174 if (class2 == X86_64_NO_CLASS)
9175 return class1;
9176
9177 /* Rule #3: If one of the classes is MEMORY, the result is MEMORY. */
9178 if (class1 == X86_64_MEMORY_CLASS || class2 == X86_64_MEMORY_CLASS)
9179 return X86_64_MEMORY_CLASS;
9180
9181 /* Rule #4: If one of the classes is INTEGER, the result is INTEGER. */
9182 if ((class1 == X86_64_INTEGERSI_CLASS && class2 == X86_64_SSESF_CLASS)
9183 || (class2 == X86_64_INTEGERSI_CLASS && class1 == X86_64_SSESF_CLASS))
9184 return X86_64_INTEGERSI_CLASS;
9185 if (class1 == X86_64_INTEGER_CLASS || class1 == X86_64_INTEGERSI_CLASS
9186 || class2 == X86_64_INTEGER_CLASS || class2 == X86_64_INTEGERSI_CLASS)
9187 return X86_64_INTEGER_CLASS;
9188
9189 /* Rule #5: If one of the classes is X87, X87UP, or COMPLEX_X87 class,
9190 MEMORY is used. */
9191 if (class1 == X86_64_X87_CLASS
9192 || class1 == X86_64_X87UP_CLASS
9193 || class1 == X86_64_COMPLEX_X87_CLASS
9194 || class2 == X86_64_X87_CLASS
9195 || class2 == X86_64_X87UP_CLASS
9196 || class2 == X86_64_COMPLEX_X87_CLASS)
9197 return X86_64_MEMORY_CLASS;
9198
9199 /* Rule #6: Otherwise class SSE is used. */
9200 return X86_64_SSE_CLASS;
9201 }
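
/* A few illustrative merges under the rules above (added commentary):
     merge (X86_64_NO_CLASS,      X86_64_SSESF_CLASS) -> X86_64_SSESF_CLASS   (rule #2)
     merge (X86_64_INTEGER_CLASS, X86_64_SSE_CLASS)   -> X86_64_INTEGER_CLASS (rule #4)
     merge (X86_64_X87_CLASS,     X86_64_SSE_CLASS)   -> X86_64_MEMORY_CLASS  (rule #5)  */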
9202
9203 /* Classify the argument of type TYPE and mode MODE.
9204 CLASSES will be filled by the register class used to pass each word
9205 of the operand. The number of words is returned. In case the parameter
9206 should be passed in memory, 0 is returned. As a special case for zero
9207 sized containers, classes[0] will be NO_CLASS and 1 is returned.
9208
9209 BIT_OFFSET is used internally for handling records and specifies the
9210 offset in bits modulo 512 to avoid overflow cases.
9211
9212 See the x86-64 PS ABI for details.
9213 */
9214
9215 static int
9216 classify_argument (machine_mode mode, const_tree type,
9217 enum x86_64_reg_class classes[MAX_CLASSES], int bit_offset)
9218 {
9219 HOST_WIDE_INT bytes =
9220 (mode == BLKmode) ? int_size_in_bytes (type) : (int) GET_MODE_SIZE (mode);
9221 int words = CEIL (bytes + (bit_offset % 64) / 8, UNITS_PER_WORD);
9222
9223 /* Variable sized entities are always passed/returned in memory. */
9224 if (bytes < 0)
9225 return 0;
9226
9227 if (mode != VOIDmode
9228 && targetm.calls.must_pass_in_stack (mode, type))
9229 return 0;
9230
9231 if (type && AGGREGATE_TYPE_P (type))
9232 {
9233 int i;
9234 tree field;
9235 enum x86_64_reg_class subclasses[MAX_CLASSES];
9236
9237 /* On x86-64 we pass structures larger than 64 bytes on the stack. */
9238 if (bytes > 64)
9239 return 0;
9240
9241 for (i = 0; i < words; i++)
9242 classes[i] = X86_64_NO_CLASS;
9243
9244 /* Zero sized arrays or structures are NO_CLASS. We return 0 to
9245 signal the memory class, so handle it as a special case. */
9246 if (!words)
9247 {
9248 classes[0] = X86_64_NO_CLASS;
9249 return 1;
9250 }
9251
9252 /* Classify each field of record and merge classes. */
9253 switch (TREE_CODE (type))
9254 {
9255 case RECORD_TYPE:
9256 /* And now merge the fields of structure. */
9257 for (field = TYPE_FIELDS (type); field; field = DECL_CHAIN (field))
9258 {
9259 if (TREE_CODE (field) == FIELD_DECL)
9260 {
9261 int num;
9262
9263 if (TREE_TYPE (field) == error_mark_node)
9264 continue;
9265
9266 /* Bitfields are always classified as integer. Handle them
9267 early, since later code would consider them to be
9268 misaligned integers. */
9269 if (DECL_BIT_FIELD (field))
9270 {
9271 for (i = (int_bit_position (field)
9272 + (bit_offset % 64)) / 8 / 8;
9273 i < ((int_bit_position (field) + (bit_offset % 64))
9274 + tree_to_shwi (DECL_SIZE (field))
9275 + 63) / 8 / 8; i++)
9276 classes[i] =
9277 merge_classes (X86_64_INTEGER_CLASS,
9278 classes[i]);
9279 }
9280 else
9281 {
9282 int pos;
9283
9284 type = TREE_TYPE (field);
9285
9286 /* Flexible array member is ignored. */
9287 if (TYPE_MODE (type) == BLKmode
9288 && TREE_CODE (type) == ARRAY_TYPE
9289 && TYPE_SIZE (type) == NULL_TREE
9290 && TYPE_DOMAIN (type) != NULL_TREE
9291 && (TYPE_MAX_VALUE (TYPE_DOMAIN (type))
9292 == NULL_TREE))
9293 {
9294 static bool warned;
9295
9296 if (!warned && warn_psabi)
9297 {
9298 warned = true;
9299 inform (input_location,
9300 "the ABI of passing struct with"
9301 " a flexible array member has"
9302 " changed in GCC 4.4");
9303 }
9304 continue;
9305 }
9306 num = classify_argument (TYPE_MODE (type), type,
9307 subclasses,
9308 (int_bit_position (field)
9309 + bit_offset) % 512);
9310 if (!num)
9311 return 0;
9312 pos = (int_bit_position (field)
9313 + (bit_offset % 64)) / 8 / 8;
9314 for (i = 0; i < num && (i + pos) < words; i++)
9315 classes[i + pos] =
9316 merge_classes (subclasses[i], classes[i + pos]);
9317 }
9318 }
9319 }
9320 break;
9321
9322 case ARRAY_TYPE:
9323 /* Arrays are handled as small records. */
9324 {
9325 int num;
9326 num = classify_argument (TYPE_MODE (TREE_TYPE (type)),
9327 TREE_TYPE (type), subclasses, bit_offset);
9328 if (!num)
9329 return 0;
9330
9331 /* The partial classes are now full classes. */
9332 if (subclasses[0] == X86_64_SSESF_CLASS && bytes != 4)
9333 subclasses[0] = X86_64_SSE_CLASS;
9334 if (subclasses[0] == X86_64_INTEGERSI_CLASS
9335 && !((bit_offset % 64) == 0 && bytes == 4))
9336 subclasses[0] = X86_64_INTEGER_CLASS;
9337
9338 for (i = 0; i < words; i++)
9339 classes[i] = subclasses[i % num];
9340
9341 break;
9342 }
9343 case UNION_TYPE:
9344 case QUAL_UNION_TYPE:
9345 /* Unions are similar to RECORD_TYPE but offset is always 0.
9346 */
9347 for (field = TYPE_FIELDS (type); field; field = DECL_CHAIN (field))
9348 {
9349 if (TREE_CODE (field) == FIELD_DECL)
9350 {
9351 int num;
9352
9353 if (TREE_TYPE (field) == error_mark_node)
9354 continue;
9355
9356 num = classify_argument (TYPE_MODE (TREE_TYPE (field)),
9357 TREE_TYPE (field), subclasses,
9358 bit_offset);
9359 if (!num)
9360 return 0;
9361 for (i = 0; i < num && i < words; i++)
9362 classes[i] = merge_classes (subclasses[i], classes[i]);
9363 }
9364 }
9365 break;
9366
9367 default:
9368 gcc_unreachable ();
9369 }
9370
9371 if (words > 2)
9372 {
9373 /* When size > 16 bytes, if the first class isn't
9374 X86_64_SSE_CLASS or any of the remaining ones isn't
9375 X86_64_SSEUP_CLASS, everything should be passed in
9376 memory. */
9377 if (classes[0] != X86_64_SSE_CLASS)
9378 return 0;
9379
9380 for (i = 1; i < words; i++)
9381 if (classes[i] != X86_64_SSEUP_CLASS)
9382 return 0;
9383 }
9384
9385 /* Final merger cleanup. */
9386 for (i = 0; i < words; i++)
9387 {
9388 /* If one class is MEMORY, everything should be passed in
9389 memory. */
9390 if (classes[i] == X86_64_MEMORY_CLASS)
9391 return 0;
9392
9393 /* The X86_64_SSEUP_CLASS should be always preceded by
9394 X86_64_SSE_CLASS or X86_64_SSEUP_CLASS. */
9395 if (classes[i] == X86_64_SSEUP_CLASS
9396 && classes[i - 1] != X86_64_SSE_CLASS
9397 && classes[i - 1] != X86_64_SSEUP_CLASS)
9398 {
9399 /* The first one should never be X86_64_SSEUP_CLASS. */
9400 gcc_assert (i != 0);
9401 classes[i] = X86_64_SSE_CLASS;
9402 }
9403
9404 /* If X86_64_X87UP_CLASS isn't preceded by X86_64_X87_CLASS,
9405 everything should be passed in memory. */
9406 if (classes[i] == X86_64_X87UP_CLASS
9407 && (classes[i - 1] != X86_64_X87_CLASS))
9408 {
9409 static bool warned;
9410
9411 /* The first one should never be X86_64_X87UP_CLASS. */
9412 gcc_assert (i != 0);
9413 if (!warned && warn_psabi)
9414 {
9415 warned = true;
9416 inform (input_location,
9417 "the ABI of passing union with long double"
9418 " has changed in GCC 4.4");
9419 }
9420 return 0;
9421 }
9422 }
9423 return words;
9424 }
9425
9426 /* Compute the alignment needed. We align all types to natural boundaries
9427 with the exception of XFmode, which is aligned to 64 bits. */
9428 if (mode != VOIDmode && mode != BLKmode)
9429 {
9430 int mode_alignment = GET_MODE_BITSIZE (mode);
9431
9432 if (mode == XFmode)
9433 mode_alignment = 128;
9434 else if (mode == XCmode)
9435 mode_alignment = 256;
9436 if (COMPLEX_MODE_P (mode))
9437 mode_alignment /= 2;
9438 /* Misaligned fields are always returned in memory. */
9439 if (bit_offset % mode_alignment)
9440 return 0;
9441 }
9442
9443 /* For V1xx modes, just use the base mode. */
9444 if (VECTOR_MODE_P (mode) && mode != V1DImode && mode != V1TImode
9445 && GET_MODE_UNIT_SIZE (mode) == bytes)
9446 mode = GET_MODE_INNER (mode);
9447
9448 /* Classification of atomic types. */
9449 switch (mode)
9450 {
9451 case SDmode:
9452 case DDmode:
9453 classes[0] = X86_64_SSE_CLASS;
9454 return 1;
9455 case TDmode:
9456 classes[0] = X86_64_SSE_CLASS;
9457 classes[1] = X86_64_SSEUP_CLASS;
9458 return 2;
9459 case DImode:
9460 case SImode:
9461 case HImode:
9462 case QImode:
9463 case CSImode:
9464 case CHImode:
9465 case CQImode:
9466 {
9467 int size = bit_offset + (int) GET_MODE_BITSIZE (mode);
9468
9469 /* Analyze last 128 bits only. */
9470 size = (size - 1) & 0x7f;
9471
9472 if (size < 32)
9473 {
9474 classes[0] = X86_64_INTEGERSI_CLASS;
9475 return 1;
9476 }
9477 else if (size < 64)
9478 {
9479 classes[0] = X86_64_INTEGER_CLASS;
9480 return 1;
9481 }
9482 else if (size < 64+32)
9483 {
9484 classes[0] = X86_64_INTEGER_CLASS;
9485 classes[1] = X86_64_INTEGERSI_CLASS;
9486 return 2;
9487 }
9488 else if (size < 64+64)
9489 {
9490 classes[0] = classes[1] = X86_64_INTEGER_CLASS;
9491 return 2;
9492 }
9493 else
9494 gcc_unreachable ();
9495 }
9496 case CDImode:
9497 case TImode:
9498 classes[0] = classes[1] = X86_64_INTEGER_CLASS;
9499 return 2;
9500 case COImode:
9501 case OImode:
9502 /* OImode shouldn't be used directly. */
9503 gcc_unreachable ();
9504 case CTImode:
9505 return 0;
9506 case SFmode:
9507 if (!(bit_offset % 64))
9508 classes[0] = X86_64_SSESF_CLASS;
9509 else
9510 classes[0] = X86_64_SSE_CLASS;
9511 return 1;
9512 case DFmode:
9513 classes[0] = X86_64_SSEDF_CLASS;
9514 return 1;
9515 case XFmode:
9516 classes[0] = X86_64_X87_CLASS;
9517 classes[1] = X86_64_X87UP_CLASS;
9518 return 2;
9519 case TFmode:
9520 classes[0] = X86_64_SSE_CLASS;
9521 classes[1] = X86_64_SSEUP_CLASS;
9522 return 2;
9523 case SCmode:
9524 classes[0] = X86_64_SSE_CLASS;
9525 if (!(bit_offset % 64))
9526 return 1;
9527 else
9528 {
9529 static bool warned;
9530
9531 if (!warned && warn_psabi)
9532 {
9533 warned = true;
9534 inform (input_location,
9535 "the ABI of passing structure with complex float"
9536 " member has changed in GCC 4.4");
9537 }
9538 classes[1] = X86_64_SSESF_CLASS;
9539 return 2;
9540 }
9541 case DCmode:
9542 classes[0] = X86_64_SSEDF_CLASS;
9543 classes[1] = X86_64_SSEDF_CLASS;
9544 return 2;
9545 case XCmode:
9546 classes[0] = X86_64_COMPLEX_X87_CLASS;
9547 return 1;
9548 case TCmode:
9549 /* This mode is larger than 16 bytes. */
9550 return 0;
9551 case V8SFmode:
9552 case V8SImode:
9553 case V32QImode:
9554 case V16HImode:
9555 case V4DFmode:
9556 case V4DImode:
9557 classes[0] = X86_64_SSE_CLASS;
9558 classes[1] = X86_64_SSEUP_CLASS;
9559 classes[2] = X86_64_SSEUP_CLASS;
9560 classes[3] = X86_64_SSEUP_CLASS;
9561 return 4;
9562 case V8DFmode:
9563 case V16SFmode:
9564 case V8DImode:
9565 case V16SImode:
9566 case V32HImode:
9567 case V64QImode:
9568 classes[0] = X86_64_SSE_CLASS;
9569 classes[1] = X86_64_SSEUP_CLASS;
9570 classes[2] = X86_64_SSEUP_CLASS;
9571 classes[3] = X86_64_SSEUP_CLASS;
9572 classes[4] = X86_64_SSEUP_CLASS;
9573 classes[5] = X86_64_SSEUP_CLASS;
9574 classes[6] = X86_64_SSEUP_CLASS;
9575 classes[7] = X86_64_SSEUP_CLASS;
9576 return 8;
9577 case V4SFmode:
9578 case V4SImode:
9579 case V16QImode:
9580 case V8HImode:
9581 case V2DFmode:
9582 case V2DImode:
9583 classes[0] = X86_64_SSE_CLASS;
9584 classes[1] = X86_64_SSEUP_CLASS;
9585 return 2;
9586 case V1TImode:
9587 case V1DImode:
9588 case V2SFmode:
9589 case V2SImode:
9590 case V4HImode:
9591 case V8QImode:
9592 classes[0] = X86_64_SSE_CLASS;
9593 return 1;
9594 case BLKmode:
9595 case VOIDmode:
9596 return 0;
9597 default:
9598 gcc_assert (VECTOR_MODE_P (mode));
9599
9600 if (bytes > 16)
9601 return 0;
9602
9603 gcc_assert (GET_MODE_CLASS (GET_MODE_INNER (mode)) == MODE_INT);
9604
9605 if (bit_offset + GET_MODE_BITSIZE (mode) <= 32)
9606 classes[0] = X86_64_INTEGERSI_CLASS;
9607 else
9608 classes[0] = X86_64_INTEGER_CLASS;
9609 classes[1] = X86_64_INTEGER_CLASS;
9610 return 1 + (bytes > 8);
9611 }
9612 }
9613
9614 /* Examine the argument and set the number of registers required in each
9615 class. Return true iff the parameter should be passed in memory. */
9616
9617 static bool
9618 examine_argument (machine_mode mode, const_tree type, int in_return,
9619 int *int_nregs, int *sse_nregs)
9620 {
9621 enum x86_64_reg_class regclass[MAX_CLASSES];
9622 int n = classify_argument (mode, type, regclass, 0);
9623
9624 *int_nregs = 0;
9625 *sse_nregs = 0;
9626
9627 if (!n)
9628 return true;
9629 for (n--; n >= 0; n--)
9630 switch (regclass[n])
9631 {
9632 case X86_64_INTEGER_CLASS:
9633 case X86_64_INTEGERSI_CLASS:
9634 (*int_nregs)++;
9635 break;
9636 case X86_64_SSE_CLASS:
9637 case X86_64_SSESF_CLASS:
9638 case X86_64_SSEDF_CLASS:
9639 (*sse_nregs)++;
9640 break;
9641 case X86_64_NO_CLASS:
9642 case X86_64_SSEUP_CLASS:
9643 break;
9644 case X86_64_X87_CLASS:
9645 case X86_64_X87UP_CLASS:
9646 case X86_64_COMPLEX_X87_CLASS:
9647 if (!in_return)
9648 return true;
9649 break;
9650 case X86_64_MEMORY_CLASS:
9651 gcc_unreachable ();
9652 }
9653
9654 return false;
9655 }
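
/* Worked example (added commentary, hypothetical type): for
     struct s { double d; int i; };
   classify_argument classifies the first eightbyte as SSEDF and the second
   as INTEGER, so examine_argument reports one SSE and one integer register
   and the struct is passed in an XMM register plus a general-purpose
   register rather than in memory.  */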
9656
9657 /* Construct the container for the argument used by the GCC interface. See
9658 FUNCTION_ARG for the detailed description. */
9659
9660 static rtx
9661 construct_container (machine_mode mode, machine_mode orig_mode,
9662 const_tree type, int in_return, int nintregs, int nsseregs,
9663 const int *intreg, int sse_regno)
9664 {
9665 /* The following variables hold the static issued_error state. */
9666 static bool issued_sse_arg_error;
9667 static bool issued_sse_ret_error;
9668 static bool issued_x87_ret_error;
9669
9670 machine_mode tmpmode;
9671 int bytes =
9672 (mode == BLKmode) ? int_size_in_bytes (type) : (int) GET_MODE_SIZE (mode);
9673 enum x86_64_reg_class regclass[MAX_CLASSES];
9674 int n;
9675 int i;
9676 int nexps = 0;
9677 int needed_sseregs, needed_intregs;
9678 rtx exp[MAX_CLASSES];
9679 rtx ret;
9680
9681 n = classify_argument (mode, type, regclass, 0);
9682 if (!n)
9683 return NULL;
9684 if (examine_argument (mode, type, in_return, &needed_intregs,
9685 &needed_sseregs))
9686 return NULL;
9687 if (needed_intregs > nintregs || needed_sseregs > nsseregs)
9688 return NULL;
9689
9690 /* We allowed the user to turn off SSE for kernel mode. Don't crash if
9691 some less clueful developer tries to use floating-point anyway. */
9692 if (needed_sseregs && !TARGET_SSE)
9693 {
9694 if (in_return)
9695 {
9696 if (!issued_sse_ret_error)
9697 {
9698 error ("SSE register return with SSE disabled");
9699 issued_sse_ret_error = true;
9700 }
9701 }
9702 else if (!issued_sse_arg_error)
9703 {
9704 error ("SSE register argument with SSE disabled");
9705 issued_sse_arg_error = true;
9706 }
9707 return NULL;
9708 }
9709
9710 /* Likewise, error if the ABI requires us to return values in the
9711 x87 registers and the user specified -mno-80387. */
9712 if (!TARGET_FLOAT_RETURNS_IN_80387 && in_return)
9713 for (i = 0; i < n; i++)
9714 if (regclass[i] == X86_64_X87_CLASS
9715 || regclass[i] == X86_64_X87UP_CLASS
9716 || regclass[i] == X86_64_COMPLEX_X87_CLASS)
9717 {
9718 if (!issued_x87_ret_error)
9719 {
9720 error ("x87 register return with x87 disabled");
9721 issued_x87_ret_error = true;
9722 }
9723 return NULL;
9724 }
9725
9726 /* First construct simple cases. Avoid SCmode, since we want to use
9727 a single register to pass this type. */
9728 if (n == 1 && mode != SCmode)
9729 switch (regclass[0])
9730 {
9731 case X86_64_INTEGER_CLASS:
9732 case X86_64_INTEGERSI_CLASS:
9733 return gen_rtx_REG (mode, intreg[0]);
9734 case X86_64_SSE_CLASS:
9735 case X86_64_SSESF_CLASS:
9736 case X86_64_SSEDF_CLASS:
9737 if (mode != BLKmode)
9738 return gen_reg_or_parallel (mode, orig_mode,
9739 SSE_REGNO (sse_regno));
9740 break;
9741 case X86_64_X87_CLASS:
9742 case X86_64_COMPLEX_X87_CLASS:
9743 return gen_rtx_REG (mode, FIRST_STACK_REG);
9744 case X86_64_NO_CLASS:
9745 /* Zero sized array, struct or class. */
9746 return NULL;
9747 default:
9748 gcc_unreachable ();
9749 }
9750 if (n == 2
9751 && regclass[0] == X86_64_SSE_CLASS
9752 && regclass[1] == X86_64_SSEUP_CLASS
9753 && mode != BLKmode)
9754 return gen_reg_or_parallel (mode, orig_mode,
9755 SSE_REGNO (sse_regno));
9756 if (n == 4
9757 && regclass[0] == X86_64_SSE_CLASS
9758 && regclass[1] == X86_64_SSEUP_CLASS
9759 && regclass[2] == X86_64_SSEUP_CLASS
9760 && regclass[3] == X86_64_SSEUP_CLASS
9761 && mode != BLKmode)
9762 return gen_reg_or_parallel (mode, orig_mode,
9763 SSE_REGNO (sse_regno));
9764 if (n == 8
9765 && regclass[0] == X86_64_SSE_CLASS
9766 && regclass[1] == X86_64_SSEUP_CLASS
9767 && regclass[2] == X86_64_SSEUP_CLASS
9768 && regclass[3] == X86_64_SSEUP_CLASS
9769 && regclass[4] == X86_64_SSEUP_CLASS
9770 && regclass[5] == X86_64_SSEUP_CLASS
9771 && regclass[6] == X86_64_SSEUP_CLASS
9772 && regclass[7] == X86_64_SSEUP_CLASS
9773 && mode != BLKmode)
9774 return gen_reg_or_parallel (mode, orig_mode,
9775 SSE_REGNO (sse_regno));
9776 if (n == 2
9777 && regclass[0] == X86_64_X87_CLASS
9778 && regclass[1] == X86_64_X87UP_CLASS)
9779 return gen_rtx_REG (XFmode, FIRST_STACK_REG);
9780
9781 if (n == 2
9782 && regclass[0] == X86_64_INTEGER_CLASS
9783 && regclass[1] == X86_64_INTEGER_CLASS
9784 && (mode == CDImode || mode == TImode)
9785 && intreg[0] + 1 == intreg[1])
9786 return gen_rtx_REG (mode, intreg[0]);
9787
9788 /* Otherwise figure out the entries of the PARALLEL. */
9789 for (i = 0; i < n; i++)
9790 {
9791 int pos;
9792
9793 switch (regclass[i])
9794 {
9795 case X86_64_NO_CLASS:
9796 break;
9797 case X86_64_INTEGER_CLASS:
9798 case X86_64_INTEGERSI_CLASS:
9799 /* Merge TImodes on aligned occasions here too. */
9800 if (i * 8 + 8 > bytes)
9801 tmpmode
9802 = mode_for_size ((bytes - i * 8) * BITS_PER_UNIT, MODE_INT, 0);
9803 else if (regclass[i] == X86_64_INTEGERSI_CLASS)
9804 tmpmode = SImode;
9805 else
9806 tmpmode = DImode;
9807 /* We've requested 24 bytes we
9808 don't have a mode for. Use DImode. */
9809 if (tmpmode == BLKmode)
9810 tmpmode = DImode;
9811 exp [nexps++]
9812 = gen_rtx_EXPR_LIST (VOIDmode,
9813 gen_rtx_REG (tmpmode, *intreg),
9814 GEN_INT (i*8));
9815 intreg++;
9816 break;
9817 case X86_64_SSESF_CLASS:
9818 exp [nexps++]
9819 = gen_rtx_EXPR_LIST (VOIDmode,
9820 gen_rtx_REG (SFmode,
9821 SSE_REGNO (sse_regno)),
9822 GEN_INT (i*8));
9823 sse_regno++;
9824 break;
9825 case X86_64_SSEDF_CLASS:
9826 exp [nexps++]
9827 = gen_rtx_EXPR_LIST (VOIDmode,
9828 gen_rtx_REG (DFmode,
9829 SSE_REGNO (sse_regno)),
9830 GEN_INT (i*8));
9831 sse_regno++;
9832 break;
9833 case X86_64_SSE_CLASS:
9834 pos = i;
9835 switch (n)
9836 {
9837 case 1:
9838 tmpmode = DImode;
9839 break;
9840 case 2:
9841 if (i == 0 && regclass[1] == X86_64_SSEUP_CLASS)
9842 {
9843 tmpmode = TImode;
9844 i++;
9845 }
9846 else
9847 tmpmode = DImode;
9848 break;
9849 case 4:
9850 gcc_assert (i == 0
9851 && regclass[1] == X86_64_SSEUP_CLASS
9852 && regclass[2] == X86_64_SSEUP_CLASS
9853 && regclass[3] == X86_64_SSEUP_CLASS);
9854 tmpmode = OImode;
9855 i += 3;
9856 break;
9857 case 8:
9858 gcc_assert (i == 0
9859 && regclass[1] == X86_64_SSEUP_CLASS
9860 && regclass[2] == X86_64_SSEUP_CLASS
9861 && regclass[3] == X86_64_SSEUP_CLASS
9862 && regclass[4] == X86_64_SSEUP_CLASS
9863 && regclass[5] == X86_64_SSEUP_CLASS
9864 && regclass[6] == X86_64_SSEUP_CLASS
9865 && regclass[7] == X86_64_SSEUP_CLASS);
9866 tmpmode = XImode;
9867 i += 7;
9868 break;
9869 default:
9870 gcc_unreachable ();
9871 }
9872 exp [nexps++]
9873 = gen_rtx_EXPR_LIST (VOIDmode,
9874 gen_rtx_REG (tmpmode,
9875 SSE_REGNO (sse_regno)),
9876 GEN_INT (pos*8));
9877 sse_regno++;
9878 break;
9879 default:
9880 gcc_unreachable ();
9881 }
9882 }
9883
9884 /* Empty aligned struct, union or class. */
9885 if (nexps == 0)
9886 return NULL;
9887
9888 ret = gen_rtx_PARALLEL (mode, rtvec_alloc (nexps));
9889 for (i = 0; i < nexps; i++)
9890 XVECEXP (ret, 0, i) = exp [i];
9891 return ret;
9892 }
9893
9894 /* Update the data in CUM to advance over an argument of mode MODE
9895 and data type TYPE. (TYPE is null for libcalls where that information
9896 may not be available.)
9897
9898 Return the number of integer registers advanced over. */
9899
9900 static int
9901 function_arg_advance_32 (CUMULATIVE_ARGS *cum, machine_mode mode,
9902 const_tree type, HOST_WIDE_INT bytes,
9903 HOST_WIDE_INT words)
9904 {
9905 int res = 0;
9906 bool error_p = false;
9907
9908 if (TARGET_IAMCU)
9909 {
9910 /* Intel MCU psABI passes scalars and aggregates no larger than 8
9911 bytes in registers. */
9912 if (!VECTOR_MODE_P (mode) && bytes <= 8)
9913 goto pass_in_reg;
9914 return res;
9915 }
9916
9917 switch (mode)
9918 {
9919 default:
9920 break;
9921
9922 case BLKmode:
9923 if (bytes < 0)
9924 break;
9925 /* FALLTHRU */
9926
9927 case DImode:
9928 case SImode:
9929 case HImode:
9930 case QImode:
9931 pass_in_reg:
9932 cum->words += words;
9933 cum->nregs -= words;
9934 cum->regno += words;
9935 if (cum->nregs >= 0)
9936 res = words;
9937 if (cum->nregs <= 0)
9938 {
9939 cum->nregs = 0;
9940 cfun->machine->arg_reg_available = false;
9941 cum->regno = 0;
9942 }
9943 break;
9944
9945 case OImode:
9946 /* OImode shouldn't be used directly. */
9947 gcc_unreachable ();
9948
9949 case DFmode:
9950 if (cum->float_in_sse == -1)
9951 error_p = true;
9952 if (cum->float_in_sse < 2)
9953 break;
9954 /* FALLTHRU */
9955 case SFmode:
9956 if (cum->float_in_sse == -1)
9957 error_p = true;
9958 if (cum->float_in_sse < 1)
9959 break;
9960 /* FALLTHRU */
9961
9962 case V8SFmode:
9963 case V8SImode:
9964 case V64QImode:
9965 case V32HImode:
9966 case V16SImode:
9967 case V8DImode:
9968 case V16SFmode:
9969 case V8DFmode:
9970 case V32QImode:
9971 case V16HImode:
9972 case V4DFmode:
9973 case V4DImode:
9974 case TImode:
9975 case V16QImode:
9976 case V8HImode:
9977 case V4SImode:
9978 case V2DImode:
9979 case V4SFmode:
9980 case V2DFmode:
9981 if (!type || !AGGREGATE_TYPE_P (type))
9982 {
9983 cum->sse_words += words;
9984 cum->sse_nregs -= 1;
9985 cum->sse_regno += 1;
9986 if (cum->sse_nregs <= 0)
9987 {
9988 cum->sse_nregs = 0;
9989 cum->sse_regno = 0;
9990 }
9991 }
9992 break;
9993
9994 case V8QImode:
9995 case V4HImode:
9996 case V2SImode:
9997 case V2SFmode:
9998 case V1TImode:
9999 case V1DImode:
10000 if (!type || !AGGREGATE_TYPE_P (type))
10001 {
10002 cum->mmx_words += words;
10003 cum->mmx_nregs -= 1;
10004 cum->mmx_regno += 1;
10005 if (cum->mmx_nregs <= 0)
10006 {
10007 cum->mmx_nregs = 0;
10008 cum->mmx_regno = 0;
10009 }
10010 }
10011 break;
10012 }
10013 if (error_p)
10014 {
10015 cum->float_in_sse = 0;
10016 error ("calling %qD with SSE calling convention without "
10017 "SSE/SSE2 enabled", cum->decl);
10018 sorry ("this is a GCC bug that can be worked around by adding "
10019 "attribute %<used%> to the called function");
10020 }
10021
10022 return res;
10023 }
10024
10025 static int
10026 function_arg_advance_64 (CUMULATIVE_ARGS *cum, machine_mode mode,
10027 const_tree type, HOST_WIDE_INT words, bool named)
10028 {
10029 int int_nregs, sse_nregs;
10030
10031 /* Unnamed 512 and 256bit vector mode parameters are passed on stack. */
10032 if (!named && (VALID_AVX512F_REG_MODE (mode)
10033 || VALID_AVX256_REG_MODE (mode)))
10034 return 0;
10035
10036 if (!examine_argument (mode, type, 0, &int_nregs, &sse_nregs)
10037 && sse_nregs <= cum->sse_nregs && int_nregs <= cum->nregs)
10038 {
10039 cum->nregs -= int_nregs;
10040 cum->sse_nregs -= sse_nregs;
10041 cum->regno += int_nregs;
10042 cum->sse_regno += sse_nregs;
10043 return int_nregs;
10044 }
10045 else
10046 {
10047 int align = ix86_function_arg_boundary (mode, type) / BITS_PER_WORD;
10048 cum->words = ROUND_UP (cum->words, align);
10049 cum->words += words;
10050 return 0;
10051 }
10052 }
10053
10054 static int
10055 function_arg_advance_ms_64 (CUMULATIVE_ARGS *cum, HOST_WIDE_INT bytes,
10056 HOST_WIDE_INT words)
10057 {
10058 /* Otherwise, this should be passed indirectly. */
10059 gcc_assert (bytes == 1 || bytes == 2 || bytes == 4 || bytes == 8);
10060
10061 cum->words += words;
10062 if (cum->nregs > 0)
10063 {
10064 cum->nregs -= 1;
10065 cum->regno += 1;
10066 return 1;
10067 }
10068 return 0;
10069 }
10070
10071 /* Update the data in CUM to advance over an argument of mode MODE and
10072 data type TYPE. (TYPE is null for libcalls where that information
10073 may not be available.) */
10074
10075 static void
10076 ix86_function_arg_advance (cumulative_args_t cum_v, machine_mode mode,
10077 const_tree type, bool named)
10078 {
10079 CUMULATIVE_ARGS *cum = get_cumulative_args (cum_v);
10080 HOST_WIDE_INT bytes, words;
10081 int nregs;
10082
10083 /* The argument of an interrupt handler is a special case and is
10084 handled in ix86_function_arg. */
10085 if (!cum->caller && cfun->machine->func_type != TYPE_NORMAL)
10086 return;
10087
10088 if (mode == BLKmode)
10089 bytes = int_size_in_bytes (type);
10090 else
10091 bytes = GET_MODE_SIZE (mode);
10092 words = CEIL (bytes, UNITS_PER_WORD);
10093
10094 if (type)
10095 mode = type_natural_mode (type, NULL, false);
10096
10097 if ((type && POINTER_BOUNDS_TYPE_P (type))
10098 || POINTER_BOUNDS_MODE_P (mode))
10099 {
10100 /* If we pass bounds in BT then just update the remaining bounds count. */
10101 if (cum->bnds_in_bt)
10102 {
10103 cum->bnds_in_bt--;
10104 return;
10105 }
10106
10107 /* Update the remaining number of bounds to force. */
10108 if (cum->force_bnd_pass)
10109 cum->force_bnd_pass--;
10110
10111 cum->bnd_regno++;
10112
10113 return;
10114 }
10115
10116 /* The first arg not going to Bounds Tables resets this counter. */
10117 cum->bnds_in_bt = 0;
10118 /* For unnamed args we always pass bounds to avoid a bounds mismatch when
10119 the passed and received types do not match. If bounds do not follow an
10120 unnamed arg, still pretend the required number of bounds were passed. */
10121 if (cum->force_bnd_pass)
10122 {
10123 cum->bnd_regno += cum->force_bnd_pass;
10124 cum->force_bnd_pass = 0;
10125 }
10126
10127 if (TARGET_64BIT)
10128 {
10129 enum calling_abi call_abi = cum ? cum->call_abi : ix86_abi;
10130
10131 if (call_abi == MS_ABI)
10132 nregs = function_arg_advance_ms_64 (cum, bytes, words);
10133 else
10134 nregs = function_arg_advance_64 (cum, mode, type, words, named);
10135 }
10136 else
10137 nregs = function_arg_advance_32 (cum, mode, type, bytes, words);
10138
10139 /* For stdarg we expect bounds to be passed for each value passed
10140 in a register. */
10141 if (cum->stdarg)
10142 cum->force_bnd_pass = nregs;
10143 /* For pointers passed in memory we expect bounds passed in Bounds
10144 Table. */
10145 if (!nregs)
10146 cum->bnds_in_bt = chkp_type_bounds_count (type);
10147 }
10148
10149 /* Define where to put the arguments to a function.
10150 Value is zero to push the argument on the stack,
10151 or a hard register in which to store the argument.
10152
10153 MODE is the argument's machine mode.
10154 TYPE is the data type of the argument (as a tree).
10155 This is null for libcalls where that information may
10156 not be available.
10157 CUM is a variable of type CUMULATIVE_ARGS which gives info about
10158 the preceding args and about the function being called.
10159 NAMED is nonzero if this argument is a named parameter
10160 (otherwise it is an extra parameter matching an ellipsis). */
10161
10162 static rtx
10163 function_arg_32 (CUMULATIVE_ARGS *cum, machine_mode mode,
10164 machine_mode orig_mode, const_tree type,
10165 HOST_WIDE_INT bytes, HOST_WIDE_INT words)
10166 {
10167 bool error_p = false;
10168
10169 /* Avoid the AL settings for the Unix64 ABI. */
10170 if (mode == VOIDmode)
10171 return constm1_rtx;
10172
10173 if (TARGET_IAMCU)
10174 {
10175 /* Intel MCU psABI passes scalars and aggregates no larger than 8
10176 bytes in registers. */
10177 if (!VECTOR_MODE_P (mode) && bytes <= 8)
10178 goto pass_in_reg;
10179 return NULL_RTX;
10180 }
10181
10182 switch (mode)
10183 {
10184 default:
10185 break;
10186
10187 case BLKmode:
10188 if (bytes < 0)
10189 break;
10190 /* FALLTHRU */
10191 case DImode:
10192 case SImode:
10193 case HImode:
10194 case QImode:
10195 pass_in_reg:
10196 if (words <= cum->nregs)
10197 {
10198 int regno = cum->regno;
10199
10200 /* Fastcall allocates the first two DWORD (SImode) or
10201 smaller arguments to ECX and EDX if it isn't an
10202 aggregate type. */
10203 if (cum->fastcall)
10204 {
10205 if (mode == BLKmode
10206 || mode == DImode
10207 || (type && AGGREGATE_TYPE_P (type)))
10208 break;
10209
10210 /* ECX, not EAX, is the first allocated register. */
10211 if (regno == AX_REG)
10212 regno = CX_REG;
10213 }
10214 return gen_rtx_REG (mode, regno);
10215 }
10216 break;
10217
10218 case DFmode:
10219 if (cum->float_in_sse == -1)
10220 error_p = true;
10221 if (cum->float_in_sse < 2)
10222 break;
10223 /* FALLTHRU */
10224 case SFmode:
10225 if (cum->float_in_sse == -1)
10226 error_p = true;
10227 if (cum->float_in_sse < 1)
10228 break;
10229 /* FALLTHRU */
10230 case TImode:
10231 /* In 32bit, we pass TImode in xmm registers. */
10232 case V16QImode:
10233 case V8HImode:
10234 case V4SImode:
10235 case V2DImode:
10236 case V4SFmode:
10237 case V2DFmode:
10238 if (!type || !AGGREGATE_TYPE_P (type))
10239 {
10240 if (cum->sse_nregs)
10241 return gen_reg_or_parallel (mode, orig_mode,
10242 cum->sse_regno + FIRST_SSE_REG);
10243 }
10244 break;
10245
10246 case OImode:
10247 case XImode:
10248 /* OImode and XImode shouldn't be used directly. */
10249 gcc_unreachable ();
10250
10251 case V64QImode:
10252 case V32HImode:
10253 case V16SImode:
10254 case V8DImode:
10255 case V16SFmode:
10256 case V8DFmode:
10257 case V8SFmode:
10258 case V8SImode:
10259 case V32QImode:
10260 case V16HImode:
10261 case V4DFmode:
10262 case V4DImode:
10263 if (!type || !AGGREGATE_TYPE_P (type))
10264 {
10265 if (cum->sse_nregs)
10266 return gen_reg_or_parallel (mode, orig_mode,
10267 cum->sse_regno + FIRST_SSE_REG);
10268 }
10269 break;
10270
10271 case V8QImode:
10272 case V4HImode:
10273 case V2SImode:
10274 case V2SFmode:
10275 case V1TImode:
10276 case V1DImode:
10277 if (!type || !AGGREGATE_TYPE_P (type))
10278 {
10279 if (cum->mmx_nregs)
10280 return gen_reg_or_parallel (mode, orig_mode,
10281 cum->mmx_regno + FIRST_MMX_REG);
10282 }
10283 break;
10284 }
10285 if (error_p)
10286 {
10287 cum->float_in_sse = 0;
10288 error ("calling %qD with SSE calling convention without "
10289 "SSE/SSE2 enabled", cum->decl);
10290 sorry ("this is a GCC bug that can be worked around by adding "
10291 "attribute %<used%> to the called function");
10292 }
10293
10294 return NULL_RTX;
10295 }
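
/* Illustrative sketch (added commentary, not part of the original source;
   the function f and its parameters are made up for the example): with
   regparm(3) the first three integer words go to EAX, EDX and ECX in that
   order, so for

     __attribute__((regparm(3))) int f (int a, int b, int c, int d);

   the code above hands back EAX, EDX and ECX for A, B and C, and returns
   NULL_RTX for D, which is then pushed on the stack.  With __fastcall only
   the first two such arguments go in registers and ECX replaces EAX as the
   first one, as handled in the fastcall branch above.  */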
10296
10297 static rtx
10298 function_arg_64 (const CUMULATIVE_ARGS *cum, machine_mode mode,
10299 machine_mode orig_mode, const_tree type, bool named)
10300 {
10301 /* Handle a hidden AL argument containing number of registers
10302 for varargs x86-64 functions. */
10303 if (mode == VOIDmode)
10304 return GEN_INT (cum->maybe_vaarg
10305 ? (cum->sse_nregs < 0
10306 ? X86_64_SSE_REGPARM_MAX
10307 : cum->sse_regno)
10308 : -1);
10309
10310 switch (mode)
10311 {
10312 default:
10313 break;
10314
10315 case V8SFmode:
10316 case V8SImode:
10317 case V32QImode:
10318 case V16HImode:
10319 case V4DFmode:
10320 case V4DImode:
10321 case V16SFmode:
10322 case V16SImode:
10323 case V64QImode:
10324 case V32HImode:
10325 case V8DFmode:
10326 case V8DImode:
10327 /* Unnamed 256 and 512bit vector mode parameters are passed on stack. */
10328 if (!named)
10329 return NULL;
10330 break;
10331 }
10332
10333 return construct_container (mode, orig_mode, type, 0, cum->nregs,
10334 cum->sse_nregs,
10335 &x86_64_int_parameter_registers [cum->regno],
10336 cum->sse_regno);
10337 }
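
/* Added note (a sketch based on the SysV x86-64 psABI, not original
   commentary): the VOIDmode case above supplies the hidden %al value for
   calls to variadic functions -- an upper bound on the number of vector
   registers actually used -- so a call such as

     printf ("%f\n", 1.0);

   is preceded by something like "movl $1, %eax" before the call insn.  */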
10338
10339 static rtx
10340 function_arg_ms_64 (const CUMULATIVE_ARGS *cum, machine_mode mode,
10341 machine_mode orig_mode, bool named,
10342 HOST_WIDE_INT bytes)
10343 {
10344 unsigned int regno;
10345
10346 /* We need to add a clobber for MS_ABI->SYSV ABI calls in expand_call.
10347 We use a value of -2 to specify that the current function call is MS ABI. */
10348 if (mode == VOIDmode)
10349 return GEN_INT (-2);
10350
10351 /* If we've run out of registers, it goes on the stack. */
10352 if (cum->nregs == 0)
10353 return NULL_RTX;
10354
10355 regno = x86_64_ms_abi_int_parameter_registers[cum->regno];
10356
10357 /* Only floating point modes are passed in anything but integer regs. */
10358 if (TARGET_SSE && (mode == SFmode || mode == DFmode))
10359 {
10360 if (named)
10361 regno = cum->regno + FIRST_SSE_REG;
10362 else
10363 {
10364 rtx t1, t2;
10365
10366 /* Unnamed floating parameters are passed in both the
10367 SSE and integer registers. */
10368 t1 = gen_rtx_REG (mode, cum->regno + FIRST_SSE_REG);
10369 t2 = gen_rtx_REG (mode, regno);
10370 t1 = gen_rtx_EXPR_LIST (VOIDmode, t1, const0_rtx);
10371 t2 = gen_rtx_EXPR_LIST (VOIDmode, t2, const0_rtx);
10372 return gen_rtx_PARALLEL (mode, gen_rtvec (2, t1, t2));
10373 }
10374 }
10375 /* Handle aggregate types passed in a register. */
10376 if (orig_mode == BLKmode)
10377 {
10378 if (bytes > 0 && bytes <= 8)
10379 mode = (bytes > 4 ? DImode : SImode);
10380 if (mode == BLKmode)
10381 mode = DImode;
10382 }
10383
10384 return gen_reg_or_parallel (mode, orig_mode, regno);
10385 }
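
/* Added note (a sketch of the Microsoft x64 convention, not original
   commentary): the first four arguments occupy fixed slots -- integers in
   RCX, RDX, R8 and R9, floating point values in XMM0..XMM3 -- and, as the
   unnamed-argument branch above shows, a vararg double is passed in both
   the XMM register and the integer register of its slot so that callers
   without a prototype in scope still interoperate.  */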
10386
10387 /* Return where to put the arguments to a function.
10388 Return zero to push the argument on the stack, or a hard register in which to store the argument.
10389
10390 MODE is the argument's machine mode. TYPE is the data type of the
10391 argument. It is null for libcalls where that information may not be
10392 available. CUM gives information about the preceding args and about
10393 the function being called. NAMED is nonzero if this argument is a
10394 named parameter (otherwise it is an extra parameter matching an
10395 ellipsis). */
10396
10397 static rtx
10398 ix86_function_arg (cumulative_args_t cum_v, machine_mode omode,
10399 const_tree type, bool named)
10400 {
10401 CUMULATIVE_ARGS *cum = get_cumulative_args (cum_v);
10402 machine_mode mode = omode;
10403 HOST_WIDE_INT bytes, words;
10404 rtx arg;
10405
10406 if (!cum->caller && cfun->machine->func_type != TYPE_NORMAL)
10407 {
10408 gcc_assert (type != NULL_TREE);
10409 if (POINTER_TYPE_P (type))
10410 {
10411 /* This is the pointer argument. */
10412 gcc_assert (TYPE_MODE (type) == Pmode);
10413 if (cfun->machine->func_type == TYPE_INTERRUPT)
10414 /* -WORD(AP) in the current frame in interrupt handler. */
10415 arg = plus_constant (Pmode, arg_pointer_rtx,
10416 -UNITS_PER_WORD);
10417 else
10418 /* (AP) in the current frame in exception handler. */
10419 arg = arg_pointer_rtx;
10420 }
10421 else
10422 {
10423 gcc_assert (cfun->machine->func_type == TYPE_EXCEPTION
10424 && TREE_CODE (type) == INTEGER_TYPE
10425 && TYPE_MODE (type) == word_mode);
10426 /* The integer argument is the error code at -WORD(AP) in
10427 the current frame in exception handler. */
10428 arg = gen_rtx_MEM (word_mode,
10429 plus_constant (Pmode,
10430 arg_pointer_rtx,
10431 -UNITS_PER_WORD));
10432 }
10433 return arg;
10434 }
10435
10436 /* All pointer bounds arguments are handled separately here. */
10437 if ((type && POINTER_BOUNDS_TYPE_P (type))
10438 || POINTER_BOUNDS_MODE_P (mode))
10439 {
10440 /* Return NULL if bounds are forced to go in Bounds Table. */
10441 if (cum->bnds_in_bt)
10442 arg = NULL;
10443 /* Return the next available bound reg if any. */
10444 else if (cum->bnd_regno <= LAST_BND_REG)
10445 arg = gen_rtx_REG (BNDmode, cum->bnd_regno);
10446 /* Return the next special slot number otherwise. */
10447 else
10448 arg = GEN_INT (cum->bnd_regno - LAST_BND_REG - 1);
10449
10450 return arg;
10451 }
10452
10453 if (mode == BLKmode)
10454 bytes = int_size_in_bytes (type);
10455 else
10456 bytes = GET_MODE_SIZE (mode);
10457 words = CEIL (bytes, UNITS_PER_WORD);
10458
10459 /* To simplify the code below, represent vector types with a vector mode
10460 even if MMX/SSE are not active. */
10461 if (type && TREE_CODE (type) == VECTOR_TYPE)
10462 mode = type_natural_mode (type, cum, false);
10463
10464 if (TARGET_64BIT)
10465 {
10466 enum calling_abi call_abi = cum ? cum->call_abi : ix86_abi;
10467
10468 if (call_abi == MS_ABI)
10469 arg = function_arg_ms_64 (cum, mode, omode, named, bytes);
10470 else
10471 arg = function_arg_64 (cum, mode, omode, type, named);
10472 }
10473 else
10474 arg = function_arg_32 (cum, mode, omode, type, bytes, words);
10475
10476 return arg;
10477 }
10478
10479 /* A C expression that indicates when an argument must be passed by
10480 reference. If nonzero for an argument, a copy of that argument is
10481 made in memory and a pointer to the argument is passed instead of
10482 the argument itself. The pointer is passed in whatever way is
10483 appropriate for passing a pointer to that type. */
10484
10485 static bool
10486 ix86_pass_by_reference (cumulative_args_t cum_v, machine_mode mode,
10487 const_tree type, bool)
10488 {
10489 CUMULATIVE_ARGS *cum = get_cumulative_args (cum_v);
10490
10491 /* Bounds are never passed by reference. */
10492 if ((type && POINTER_BOUNDS_TYPE_P (type))
10493 || POINTER_BOUNDS_MODE_P (mode))
10494 return false;
10495
10496 if (TARGET_64BIT)
10497 {
10498 enum calling_abi call_abi = cum ? cum->call_abi : ix86_abi;
10499
10500 /* See Windows x64 Software Convention. */
10501 if (call_abi == MS_ABI)
10502 {
10503 HOST_WIDE_INT msize = GET_MODE_SIZE (mode);
10504
10505 if (type)
10506 {
10507 /* Arrays are passed by reference. */
10508 if (TREE_CODE (type) == ARRAY_TYPE)
10509 return true;
10510
10511 if (RECORD_OR_UNION_TYPE_P (type))
10512 {
10513 /* Structs/unions of sizes other than 8, 16, 32, or 64 bits
10514 are passed by reference. */
10515 msize = int_size_in_bytes (type);
10516 }
10517 }
10518
10519 /* __m128 is passed by reference. */
10520 return msize != 1 && msize != 2 && msize != 4 && msize != 8;
10521 }
10522 else if (type && int_size_in_bytes (type) == -1)
10523 return true;
10524 }
10525
10526 return false;
10527 }
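
/* Worked example (added commentary, not in the original source): under the
   Microsoft x64 convention a 3-byte struct such as a hypothetical
   struct s3 { char c[3]; }, or a 16-byte __m128, is passed by reference
   because neither size is 1, 2, 4 or 8, while an 8-byte struct with two
   ints is passed by value in a register.  The SysV 64-bit path above only
   forces indirection for types of variable or unknown size
   (int_size_in_bytes == -1).  */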
10528
10529 /* Return true when TYPE should be 128bit aligned for 32bit argument
10530 passing ABI. XXX: This function is obsolete and is only used for
10531 checking psABI compatibility with previous versions of GCC. */
10532
10533 static bool
10534 ix86_compat_aligned_value_p (const_tree type)
10535 {
10536 machine_mode mode = TYPE_MODE (type);
10537 if (((TARGET_SSE && SSE_REG_MODE_P (mode))
10538 || mode == TDmode
10539 || mode == TFmode
10540 || mode == TCmode)
10541 && (!TYPE_USER_ALIGN (type) || TYPE_ALIGN (type) > 128))
10542 return true;
10543 if (TYPE_ALIGN (type) < 128)
10544 return false;
10545
10546 if (AGGREGATE_TYPE_P (type))
10547 {
10548 /* Walk the aggregates recursively. */
10549 switch (TREE_CODE (type))
10550 {
10551 case RECORD_TYPE:
10552 case UNION_TYPE:
10553 case QUAL_UNION_TYPE:
10554 {
10555 tree field;
10556
10557 /* Walk all the structure fields. */
10558 for (field = TYPE_FIELDS (type); field; field = DECL_CHAIN (field))
10559 {
10560 if (TREE_CODE (field) == FIELD_DECL
10561 && ix86_compat_aligned_value_p (TREE_TYPE (field)))
10562 return true;
10563 }
10564 break;
10565 }
10566
10567 case ARRAY_TYPE:
10568 /* Just for use if some languages pass arrays by value. */
10569 if (ix86_compat_aligned_value_p (TREE_TYPE (type)))
10570 return true;
10571 break;
10572
10573 default:
10574 gcc_unreachable ();
10575 }
10576 }
10577 return false;
10578 }
10579
10580 /* Return the alignment boundary for MODE and TYPE with alignment ALIGN.
10581 XXX: This function is obsolete and is only used for checking psABI
10582 compatibility with previous versions of GCC. */
10583
10584 static unsigned int
10585 ix86_compat_function_arg_boundary (machine_mode mode,
10586 const_tree type, unsigned int align)
10587 {
10588 /* In 32bit, only _Decimal128 and __float128 are aligned to their
10589 natural boundaries. */
10590 if (!TARGET_64BIT && mode != TDmode && mode != TFmode)
10591 {
10592 /* i386 ABI defines all arguments to be 4 byte aligned. We have to
10593 make an exception for SSE modes since these require 128bit
10594 alignment.
10595
10596 The handling here differs from field_alignment. ICC aligns MMX
10597 arguments to 4 byte boundaries, while structure fields are aligned
10598 to 8 byte boundaries. */
10599 if (!type)
10600 {
10601 if (!(TARGET_SSE && SSE_REG_MODE_P (mode)))
10602 align = PARM_BOUNDARY;
10603 }
10604 else
10605 {
10606 if (!ix86_compat_aligned_value_p (type))
10607 align = PARM_BOUNDARY;
10608 }
10609 }
10610 if (align > BIGGEST_ALIGNMENT)
10611 align = BIGGEST_ALIGNMENT;
10612 return align;
10613 }
10614
10615 /* Return true when TYPE should be 128bit aligned for 32bit argument
10616 passing ABI. */
10617
10618 static bool
10619 ix86_contains_aligned_value_p (const_tree type)
10620 {
10621 machine_mode mode = TYPE_MODE (type);
10622
10623 if (mode == XFmode || mode == XCmode)
10624 return false;
10625
10626 if (TYPE_ALIGN (type) < 128)
10627 return false;
10628
10629 if (AGGREGATE_TYPE_P (type))
10630 {
10631 /* Walk the aggregates recursively. */
10632 switch (TREE_CODE (type))
10633 {
10634 case RECORD_TYPE:
10635 case UNION_TYPE:
10636 case QUAL_UNION_TYPE:
10637 {
10638 tree field;
10639
10640 /* Walk all the structure fields. */
10641 for (field = TYPE_FIELDS (type);
10642 field;
10643 field = DECL_CHAIN (field))
10644 {
10645 if (TREE_CODE (field) == FIELD_DECL
10646 && ix86_contains_aligned_value_p (TREE_TYPE (field)))
10647 return true;
10648 }
10649 break;
10650 }
10651
10652 case ARRAY_TYPE:
10653 /* Just for use if some languages pass arrays by value. */
10654 if (ix86_contains_aligned_value_p (TREE_TYPE (type)))
10655 return true;
10656 break;
10657
10658 default:
10659 gcc_unreachable ();
10660 }
10661 }
10662 else
10663 return TYPE_ALIGN (type) >= 128;
10664
10665 return false;
10666 }
10667
10668 /* Gives the alignment boundary, in bits, of an argument with the
10669 specified mode and type. */
10670
10671 static unsigned int
10672 ix86_function_arg_boundary (machine_mode mode, const_tree type)
10673 {
10674 unsigned int align;
10675 if (type)
10676 {
10677 /* Since the main variant type is used for the call, convert TYPE to
10678 its main variant. */
10679 type = TYPE_MAIN_VARIANT (type);
10680 align = TYPE_ALIGN (type);
10681 }
10682 else
10683 align = GET_MODE_ALIGNMENT (mode);
10684 if (align < PARM_BOUNDARY)
10685 align = PARM_BOUNDARY;
10686 else
10687 {
10688 static bool warned;
10689 unsigned int saved_align = align;
10690
10691 if (!TARGET_64BIT)
10692 {
10693 /* i386 ABI defines XFmode arguments to be 4 byte aligned. */
10694 if (!type)
10695 {
10696 if (mode == XFmode || mode == XCmode)
10697 align = PARM_BOUNDARY;
10698 }
10699 else if (!ix86_contains_aligned_value_p (type))
10700 align = PARM_BOUNDARY;
10701
10702 if (align < 128)
10703 align = PARM_BOUNDARY;
10704 }
10705
10706 if (warn_psabi
10707 && !warned
10708 && align != ix86_compat_function_arg_boundary (mode, type,
10709 saved_align))
10710 {
10711 warned = true;
10712 inform (input_location,
10713 "The ABI for passing parameters with %d-byte"
10714 " alignment has changed in GCC 4.6",
10715 align / BITS_PER_UNIT);
10716 }
10717 }
10718
10719 return align;
10720 }
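
/* Examples (added commentary, a sketch rather than an exhaustive list):
   on x86-64 an __m256 argument reports a 256 bit boundary here, while on
   ia32 a plain double reports only the 32 bit PARM_BOUNDARY and a struct
   containing an __m128 member keeps its 128 bit boundary, matching the
   ix86_contains_aligned_value_p walk above.  */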
10721
10722 /* Return true if N is a possible register number of function value. */
10723
10724 static bool
10725 ix86_function_value_regno_p (const unsigned int regno)
10726 {
10727 switch (regno)
10728 {
10729 case AX_REG:
10730 return true;
10731 case DX_REG:
10732 return (!TARGET_64BIT || ix86_cfun_abi () != MS_ABI);
10733 case DI_REG:
10734 case SI_REG:
10735 return TARGET_64BIT && ix86_cfun_abi () != MS_ABI;
10736
10737 case BND0_REG:
10738 case BND1_REG:
10739 return chkp_function_instrumented_p (current_function_decl);
10740
10741 /* Complex values are returned in %st(0)/%st(1) pair. */
10742 case ST0_REG:
10743 case ST1_REG:
10744 /* TODO: The function should depend on current function ABI but
10745 builtins.c would need updating then. Therefore we use the
10746 default ABI. */
10747 if (TARGET_64BIT && ix86_cfun_abi () == MS_ABI)
10748 return false;
10749 return TARGET_FLOAT_RETURNS_IN_80387;
10750
10751 /* Complex values are returned in %xmm0/%xmm1 pair. */
10752 case XMM0_REG:
10753 case XMM1_REG:
10754 return TARGET_SSE;
10755
10756 case MM0_REG:
10757 if (TARGET_MACHO || TARGET_64BIT)
10758 return false;
10759 return TARGET_MMX;
10760 }
10761
10762 return false;
10763 }
10764
10765 /* Define how to find the value returned by a function.
10766 VALTYPE is the data type of the value (as a tree).
10767 If the precise function being called is known, FUNC is its FUNCTION_DECL;
10768 otherwise, FUNC is 0. */
10769
10770 static rtx
10771 function_value_32 (machine_mode orig_mode, machine_mode mode,
10772 const_tree fntype, const_tree fn)
10773 {
10774 unsigned int regno;
10775
10776 /* 8-byte vector modes in %mm0. See ix86_return_in_memory for where
10777 we normally prevent this case when mmx is not available. However
10778 some ABIs may require the result to be returned like DImode. */
10779 if (VECTOR_MODE_P (mode) && GET_MODE_SIZE (mode) == 8)
10780 regno = FIRST_MMX_REG;
10781
10782 /* 16-byte vector modes in %xmm0. See ix86_return_in_memory for where
10783 we prevent this case when sse is not available. However some ABIs
10784 may require the result to be returned like integer TImode. */
10785 else if (mode == TImode
10786 || (VECTOR_MODE_P (mode) && GET_MODE_SIZE (mode) == 16))
10787 regno = FIRST_SSE_REG;
10788
10789 /* 32-byte vector modes in %ymm0. */
10790 else if (VECTOR_MODE_P (mode) && GET_MODE_SIZE (mode) == 32)
10791 regno = FIRST_SSE_REG;
10792
10793 /* 64-byte vector modes in %zmm0. */
10794 else if (VECTOR_MODE_P (mode) && GET_MODE_SIZE (mode) == 64)
10795 regno = FIRST_SSE_REG;
10796
10797 /* Floating point return values in %st(0) (unless -mno-fp-ret-in-387). */
10798 else if (X87_FLOAT_MODE_P (mode) && TARGET_FLOAT_RETURNS_IN_80387)
10799 regno = FIRST_FLOAT_REG;
10800 else
10801 /* Most things go in %eax. */
10802 regno = AX_REG;
10803
10804 /* Override FP return register with %xmm0 for local functions when
10805 SSE math is enabled or for functions with sseregparm attribute. */
10806 if ((fn || fntype) && (mode == SFmode || mode == DFmode))
10807 {
10808 int sse_level = ix86_function_sseregparm (fntype, fn, false);
10809 if (sse_level == -1)
10810 {
10811 error ("calling %qD with SSE calling convention without "
10812 "SSE/SSE2 enabled", fn);
10813 sorry ("this is a GCC bug that can be worked around by adding "
10814 "attribute %<used%> to the called function");
10815 }
10816 else if ((sse_level >= 1 && mode == SFmode)
10817 || (sse_level == 2 && mode == DFmode))
10818 regno = FIRST_SSE_REG;
10819 }
10820
10821 /* OImode shouldn't be used directly. */
10822 gcc_assert (mode != OImode);
10823
10824 return gen_rtx_REG (orig_mode, regno);
10825 }
10826
10827 static rtx
10828 function_value_64 (machine_mode orig_mode, machine_mode mode,
10829 const_tree valtype)
10830 {
10831 rtx ret;
10832
10833 /* Handle libcalls, which don't provide a type node. */
10834 if (valtype == NULL)
10835 {
10836 unsigned int regno;
10837
10838 switch (mode)
10839 {
10840 case SFmode:
10841 case SCmode:
10842 case DFmode:
10843 case DCmode:
10844 case TFmode:
10845 case SDmode:
10846 case DDmode:
10847 case TDmode:
10848 regno = FIRST_SSE_REG;
10849 break;
10850 case XFmode:
10851 case XCmode:
10852 regno = FIRST_FLOAT_REG;
10853 break;
10854 case TCmode:
10855 return NULL;
10856 default:
10857 regno = AX_REG;
10858 }
10859
10860 return gen_rtx_REG (mode, regno);
10861 }
10862 else if (POINTER_TYPE_P (valtype))
10863 {
10864 /* Pointers are always returned in word_mode. */
10865 mode = word_mode;
10866 }
10867
10868 ret = construct_container (mode, orig_mode, valtype, 1,
10869 X86_64_REGPARM_MAX, X86_64_SSE_REGPARM_MAX,
10870 x86_64_int_return_registers, 0);
10871
10872 /* For zero-sized structures, construct_container returns NULL, but we
10873 need to keep the rest of the compiler happy by returning a meaningful value. */
10874 if (!ret)
10875 ret = gen_rtx_REG (orig_mode, AX_REG);
10876
10877 return ret;
10878 }
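
/* Worked example (added commentary, not in the original source; the type S
   is hypothetical): for a return type

     struct S { double d; long l; };

   the psABI classifies the first eightbyte as SSE and the second as
   INTEGER, so the container built above returns D in %xmm0 and L in %rax.  */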
10879
10880 static rtx
10881 function_value_ms_64 (machine_mode orig_mode, machine_mode mode,
10882 const_tree valtype)
10883 {
10884 unsigned int regno = AX_REG;
10885
10886 if (TARGET_SSE)
10887 {
10888 switch (GET_MODE_SIZE (mode))
10889 {
10890 case 16:
10891 if (valtype != NULL_TREE
10892 && !VECTOR_INTEGER_TYPE_P (valtype)
10894 && !INTEGRAL_TYPE_P (valtype)
10895 && !VECTOR_FLOAT_TYPE_P (valtype))
10896 break;
10897 if ((SCALAR_INT_MODE_P (mode) || VECTOR_MODE_P (mode))
10898 && !COMPLEX_MODE_P (mode))
10899 regno = FIRST_SSE_REG;
10900 break;
10901 case 8:
10902 case 4:
10903 if (mode == SFmode || mode == DFmode)
10904 regno = FIRST_SSE_REG;
10905 break;
10906 default:
10907 break;
10908 }
10909 }
10910 return gen_rtx_REG (orig_mode, regno);
10911 }
10912
10913 static rtx
10914 ix86_function_value_1 (const_tree valtype, const_tree fntype_or_decl,
10915 machine_mode orig_mode, machine_mode mode)
10916 {
10917 const_tree fn, fntype;
10918
10919 fn = NULL_TREE;
10920 if (fntype_or_decl && DECL_P (fntype_or_decl))
10921 fn = fntype_or_decl;
10922 fntype = fn ? TREE_TYPE (fn) : fntype_or_decl;
10923
10924 if ((valtype && POINTER_BOUNDS_TYPE_P (valtype))
10925 || POINTER_BOUNDS_MODE_P (mode))
10926 return gen_rtx_REG (BNDmode, FIRST_BND_REG);
10927 else if (TARGET_64BIT && ix86_function_type_abi (fntype) == MS_ABI)
10928 return function_value_ms_64 (orig_mode, mode, valtype);
10929 else if (TARGET_64BIT)
10930 return function_value_64 (orig_mode, mode, valtype);
10931 else
10932 return function_value_32 (orig_mode, mode, fntype, fn);
10933 }
10934
10935 static rtx
10936 ix86_function_value (const_tree valtype, const_tree fntype_or_decl, bool)
10937 {
10938 machine_mode mode, orig_mode;
10939
10940 orig_mode = TYPE_MODE (valtype);
10941 mode = type_natural_mode (valtype, NULL, true);
10942 return ix86_function_value_1 (valtype, fntype_or_decl, orig_mode, mode);
10943 }
10944
10945 /* Return an RTX representing a place where a function returns
10946 or receives pointer bounds, or NULL if no bounds are returned.
10947
10948 VALTYPE is a data type of a value returned by the function.
10949
10950 FN_DECL_OR_TYPE is a tree node representing FUNCTION_DECL
10951 or FUNCTION_TYPE of the function.
10952
10953 If OUTGOING is false, return a place in which the caller will
10954 see the return value. Otherwise, return a place where a
10955 function returns a value. */
10956
10957 static rtx
10958 ix86_function_value_bounds (const_tree valtype,
10959 const_tree fntype_or_decl ATTRIBUTE_UNUSED,
10960 bool outgoing ATTRIBUTE_UNUSED)
10961 {
10962 rtx res = NULL_RTX;
10963
10964 if (BOUNDED_TYPE_P (valtype))
10965 res = gen_rtx_REG (BNDmode, FIRST_BND_REG);
10966 else if (chkp_type_has_pointer (valtype))
10967 {
10968 bitmap slots;
10969 rtx bounds[2];
10970 bitmap_iterator bi;
10971 unsigned i, bnd_no = 0;
10972
10973 bitmap_obstack_initialize (NULL);
10974 slots = BITMAP_ALLOC (NULL);
10975 chkp_find_bound_slots (valtype, slots);
10976
10977 EXECUTE_IF_SET_IN_BITMAP (slots, 0, i, bi)
10978 {
10979 rtx reg = gen_rtx_REG (BNDmode, FIRST_BND_REG + bnd_no);
10980 rtx offs = GEN_INT (i * POINTER_SIZE / BITS_PER_UNIT);
10981 gcc_assert (bnd_no < 2);
10982 bounds[bnd_no++] = gen_rtx_EXPR_LIST (VOIDmode, reg, offs);
10983 }
10984
10985 res = gen_rtx_PARALLEL (VOIDmode, gen_rtvec_v (bnd_no, bounds));
10986
10987 BITMAP_FREE (slots);
10988 bitmap_obstack_release (NULL);
10989 }
10990 else
10991 res = NULL_RTX;
10992
10993 return res;
10994 }
10995
10996 /* Pointer function arguments and return values are promoted to
10997 word_mode for normal functions. */
10998
10999 static machine_mode
11000 ix86_promote_function_mode (const_tree type, machine_mode mode,
11001 int *punsignedp, const_tree fntype,
11002 int for_return)
11003 {
11004 if (cfun->machine->func_type == TYPE_NORMAL
11005 && type != NULL_TREE
11006 && POINTER_TYPE_P (type))
11007 {
11008 *punsignedp = POINTERS_EXTEND_UNSIGNED;
11009 return word_mode;
11010 }
11011 return default_promote_function_mode (type, mode, punsignedp, fntype,
11012 for_return);
11013 }
11014
11015 /* Return true if a structure, union or array with MODE containing FIELD
11016 should be accessed using BLKmode. */
11017
11018 static bool
11019 ix86_member_type_forces_blk (const_tree field, machine_mode mode)
11020 {
11021 /* Union with XFmode must be in BLKmode. */
11022 return (mode == XFmode
11023 && (TREE_CODE (DECL_FIELD_CONTEXT (field)) == UNION_TYPE
11024 || TREE_CODE (DECL_FIELD_CONTEXT (field)) == QUAL_UNION_TYPE));
11025 }
11026
11027 rtx
11028 ix86_libcall_value (machine_mode mode)
11029 {
11030 return ix86_function_value_1 (NULL, NULL, mode, mode);
11031 }
11032
11033 /* Return true iff type is returned in memory. */
11034
11035 static bool
11036 ix86_return_in_memory (const_tree type, const_tree fntype ATTRIBUTE_UNUSED)
11037 {
11038 #ifdef SUBTARGET_RETURN_IN_MEMORY
11039 return SUBTARGET_RETURN_IN_MEMORY (type, fntype);
11040 #else
11041 const machine_mode mode = type_natural_mode (type, NULL, true);
11042 HOST_WIDE_INT size;
11043
11044 if (POINTER_BOUNDS_TYPE_P (type))
11045 return false;
11046
11047 if (TARGET_64BIT)
11048 {
11049 if (ix86_function_type_abi (fntype) == MS_ABI)
11050 {
11051 size = int_size_in_bytes (type);
11052
11053 /* __m128 is returned in xmm0. */
11054 if ((!type || VECTOR_INTEGER_TYPE_P (type)
11055 || INTEGRAL_TYPE_P (type)
11056 || VECTOR_FLOAT_TYPE_P (type))
11057 && (SCALAR_INT_MODE_P (mode) || VECTOR_MODE_P (mode))
11058 && !COMPLEX_MODE_P (mode)
11059 && (GET_MODE_SIZE (mode) == 16 || size == 16))
11060 return false;
11061
11062 /* Otherwise, the size must be exactly 1, 2, 4 or 8 bytes. */
11063 return size != 1 && size != 2 && size != 4 && size != 8;
11064 }
11065 else
11066 {
11067 int needed_intregs, needed_sseregs;
11068
11069 return examine_argument (mode, type, 1,
11070 &needed_intregs, &needed_sseregs);
11071 }
11072 }
11073 else
11074 {
11075 size = int_size_in_bytes (type);
11076
11077 /* Intel MCU psABI returns scalars and aggregates no larger than 8
11078 bytes in registers. */
11079 if (TARGET_IAMCU)
11080 return VECTOR_MODE_P (mode) || size < 0 || size > 8;
11081
11082 if (mode == BLKmode)
11083 return true;
11084
11085 if (MS_AGGREGATE_RETURN && AGGREGATE_TYPE_P (type) && size <= 8)
11086 return false;
11087
11088 if (VECTOR_MODE_P (mode) || mode == TImode)
11089 {
11090 /* User-created vectors small enough to fit in EAX. */
11091 if (size < 8)
11092 return false;
11093
11094 /* Unless the ABI prescribes otherwise,
11095 MMX/3dNow values are returned in MM0 if available. */
11096
11097 if (size == 8)
11098 return TARGET_VECT8_RETURNS || !TARGET_MMX;
11099
11100 /* SSE values are returned in XMM0 if available. */
11101 if (size == 16)
11102 return !TARGET_SSE;
11103
11104 /* AVX values are returned in YMM0 if available. */
11105 if (size == 32)
11106 return !TARGET_AVX;
11107
11108 /* AVX512F values are returned in ZMM0 if available. */
11109 if (size == 64)
11110 return !TARGET_AVX512F;
11111 }
11112
11113 if (mode == XFmode)
11114 return false;
11115
11116 if (size > 12)
11117 return true;
11118
11119 /* OImode shouldn't be used directly. */
11120 gcc_assert (mode != OImode);
11121
11122 return false;
11123 }
11124 #endif
11125 }
11126
11127 \f
11128 /* Create the va_list data type. */
11129
11130 static tree
11131 ix86_build_builtin_va_list_64 (void)
11132 {
11133 tree f_gpr, f_fpr, f_ovf, f_sav, record, type_decl;
11134
11135 record = lang_hooks.types.make_type (RECORD_TYPE);
11136 type_decl = build_decl (BUILTINS_LOCATION,
11137 TYPE_DECL, get_identifier ("__va_list_tag"), record);
11138
11139 f_gpr = build_decl (BUILTINS_LOCATION,
11140 FIELD_DECL, get_identifier ("gp_offset"),
11141 unsigned_type_node);
11142 f_fpr = build_decl (BUILTINS_LOCATION,
11143 FIELD_DECL, get_identifier ("fp_offset"),
11144 unsigned_type_node);
11145 f_ovf = build_decl (BUILTINS_LOCATION,
11146 FIELD_DECL, get_identifier ("overflow_arg_area"),
11147 ptr_type_node);
11148 f_sav = build_decl (BUILTINS_LOCATION,
11149 FIELD_DECL, get_identifier ("reg_save_area"),
11150 ptr_type_node);
11151
11152 va_list_gpr_counter_field = f_gpr;
11153 va_list_fpr_counter_field = f_fpr;
11154
11155 DECL_FIELD_CONTEXT (f_gpr) = record;
11156 DECL_FIELD_CONTEXT (f_fpr) = record;
11157 DECL_FIELD_CONTEXT (f_ovf) = record;
11158 DECL_FIELD_CONTEXT (f_sav) = record;
11159
11160 TYPE_STUB_DECL (record) = type_decl;
11161 TYPE_NAME (record) = type_decl;
11162 TYPE_FIELDS (record) = f_gpr;
11163 DECL_CHAIN (f_gpr) = f_fpr;
11164 DECL_CHAIN (f_fpr) = f_ovf;
11165 DECL_CHAIN (f_ovf) = f_sav;
11166
11167 layout_type (record);
11168
11169 TYPE_ATTRIBUTES (record) = tree_cons (get_identifier ("sysv_abi va_list"),
11170 NULL_TREE, TYPE_ATTRIBUTES (record));
11171
11172 /* The correct type is an array type of one element. */
11173 return build_array_type (record, build_index_type (size_zero_node));
11174 }
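
/* For reference (added commentary): the record built above corresponds to
   the va_list declaration given in the SysV x86-64 psABI:

     typedef struct {
       unsigned int gp_offset;
       unsigned int fp_offset;
       void *overflow_arg_area;
       void *reg_save_area;
     } va_list[1];
*/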
11175
11176 /* Setup the builtin va_list data type and for 64-bit the additional
11177 calling convention specific va_list data types. */
11178
11179 static tree
11180 ix86_build_builtin_va_list (void)
11181 {
11182 if (TARGET_64BIT)
11183 {
11184 /* Initialize ABI specific va_list builtin types.
11185
11186 In lto1, we can encounter two va_list types:
11187 - one as a result of the type-merge across TUs, and
11188 - the one constructed here.
11189 These two types will not have the same TYPE_MAIN_VARIANT, and therefore
11190 a type identity check in canonical_va_list_type based on
11191 TYPE_MAIN_VARIANT (which we used to have) will not work.
11192 Instead, we tag each va_list_type_node with its unique attribute, and
11193 look for the attribute in the type identity check in
11194 canonical_va_list_type.
11195
11196 Tagging sysv_va_list_type_node directly with the attribute is
11197 problematic since it's an array of one record, which will degrade into a
11198 pointer to the record when used as a parameter (see build_va_arg comments for
11199 an example), dropping the attribute in the process. So we tag the
11200 record instead. */
11201
11202 /* For SYSV_ABI we use an array of one record. */
11203 sysv_va_list_type_node = ix86_build_builtin_va_list_64 ();
11204
11205 /* For MS_ABI we use plain pointer to argument area. */
11206 tree char_ptr_type = build_pointer_type (char_type_node);
11207 tree attr = tree_cons (get_identifier ("ms_abi va_list"), NULL_TREE,
11208 TYPE_ATTRIBUTES (char_ptr_type));
11209 ms_va_list_type_node = build_type_attribute_variant (char_ptr_type, attr);
11210
11211 return ((ix86_abi == MS_ABI)
11212 ? ms_va_list_type_node
11213 : sysv_va_list_type_node);
11214 }
11215 else
11216 {
11217 /* For i386 we use plain pointer to argument area. */
11218 return build_pointer_type (char_type_node);
11219 }
11220 }
11221
11222 /* Worker function for TARGET_SETUP_INCOMING_VARARGS. */
11223
11224 static void
11225 setup_incoming_varargs_64 (CUMULATIVE_ARGS *cum)
11226 {
11227 rtx save_area, mem;
11228 alias_set_type set;
11229 int i, max;
11230
11231 /* GPR size of varargs save area. */
11232 if (cfun->va_list_gpr_size)
11233 ix86_varargs_gpr_size = X86_64_REGPARM_MAX * UNITS_PER_WORD;
11234 else
11235 ix86_varargs_gpr_size = 0;
11236
11237 /* FPR size of varargs save area. We don't need it if we don't pass
11238 anything in SSE registers. */
11239 if (TARGET_SSE && cfun->va_list_fpr_size)
11240 ix86_varargs_fpr_size = X86_64_SSE_REGPARM_MAX * 16;
11241 else
11242 ix86_varargs_fpr_size = 0;
11243
11244 if (! ix86_varargs_gpr_size && ! ix86_varargs_fpr_size)
11245 return;
11246
11247 save_area = frame_pointer_rtx;
11248 set = get_varargs_alias_set ();
11249
11250 max = cum->regno + cfun->va_list_gpr_size / UNITS_PER_WORD;
11251 if (max > X86_64_REGPARM_MAX)
11252 max = X86_64_REGPARM_MAX;
11253
11254 for (i = cum->regno; i < max; i++)
11255 {
11256 mem = gen_rtx_MEM (word_mode,
11257 plus_constant (Pmode, save_area, i * UNITS_PER_WORD));
11258 MEM_NOTRAP_P (mem) = 1;
11259 set_mem_alias_set (mem, set);
11260 emit_move_insn (mem,
11261 gen_rtx_REG (word_mode,
11262 x86_64_int_parameter_registers[i]));
11263 }
11264
11265 if (ix86_varargs_fpr_size)
11266 {
11267 machine_mode smode;
11268 rtx_code_label *label;
11269 rtx test;
11270
11271 /* Now emit code to save SSE registers. The AX parameter contains the
11272 number of SSE parameter registers used to call this function, though all we
11273 actually check here is the zero/non-zero status. */
11274
11275 label = gen_label_rtx ();
11276 test = gen_rtx_EQ (VOIDmode, gen_rtx_REG (QImode, AX_REG), const0_rtx);
11277 emit_jump_insn (gen_cbranchqi4 (test, XEXP (test, 0), XEXP (test, 1),
11278 label));
11279
11280 /* ??? If !TARGET_SSE_TYPELESS_STORES, would we perform better if
11281 we used movdqa (i.e. TImode) instead? Perhaps even better would
11282 be if we could determine the real mode of the data, via a hook
11283 into pass_stdarg. Ignore all that for now. */
11284 smode = V4SFmode;
11285 if (crtl->stack_alignment_needed < GET_MODE_ALIGNMENT (smode))
11286 crtl->stack_alignment_needed = GET_MODE_ALIGNMENT (smode);
11287
11288 max = cum->sse_regno + cfun->va_list_fpr_size / 16;
11289 if (max > X86_64_SSE_REGPARM_MAX)
11290 max = X86_64_SSE_REGPARM_MAX;
11291
11292 for (i = cum->sse_regno; i < max; ++i)
11293 {
11294 mem = plus_constant (Pmode, save_area,
11295 i * 16 + ix86_varargs_gpr_size);
11296 mem = gen_rtx_MEM (smode, mem);
11297 MEM_NOTRAP_P (mem) = 1;
11298 set_mem_alias_set (mem, set);
11299 set_mem_align (mem, GET_MODE_ALIGNMENT (smode));
11300
11301 emit_move_insn (mem, gen_rtx_REG (smode, SSE_REGNO (i)));
11302 }
11303
11304 emit_label (label);
11305 }
11306 }
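
/* Layout sketch of the register save area emitted above (added commentary,
   assuming both the GPR and the SSE parts are fully saved):

     bytes   0 ..  47   rdi, rsi, rdx, rcx, r8, r9   (8 bytes each)
     bytes  48 .. 175   xmm0 .. xmm7                 (16 bytes each)

   gp_offset and fp_offset in the va_list index into this block.  */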
11307
11308 static void
11309 setup_incoming_varargs_ms_64 (CUMULATIVE_ARGS *cum)
11310 {
11311 alias_set_type set = get_varargs_alias_set ();
11312 int i;
11313
11314 /* Reset to zero, as there might be a sysv vaarg used
11315 before. */
11316 ix86_varargs_gpr_size = 0;
11317 ix86_varargs_fpr_size = 0;
11318
11319 for (i = cum->regno; i < X86_64_MS_REGPARM_MAX; i++)
11320 {
11321 rtx reg, mem;
11322
11323 mem = gen_rtx_MEM (Pmode,
11324 plus_constant (Pmode, virtual_incoming_args_rtx,
11325 i * UNITS_PER_WORD));
11326 MEM_NOTRAP_P (mem) = 1;
11327 set_mem_alias_set (mem, set);
11328
11329 reg = gen_rtx_REG (Pmode, x86_64_ms_abi_int_parameter_registers[i]);
11330 emit_move_insn (mem, reg);
11331 }
11332 }
11333
11334 static void
11335 ix86_setup_incoming_varargs (cumulative_args_t cum_v, machine_mode mode,
11336 tree type, int *, int no_rtl)
11337 {
11338 CUMULATIVE_ARGS *cum = get_cumulative_args (cum_v);
11339 CUMULATIVE_ARGS next_cum;
11340 tree fntype;
11341
11342 /* This argument doesn't appear to be used anymore, which is good
11343 because the old code here didn't suppress rtl generation. */
11344 gcc_assert (!no_rtl);
11345
11346 if (!TARGET_64BIT)
11347 return;
11348
11349 fntype = TREE_TYPE (current_function_decl);
11350
11351 /* For varargs, we do not want to skip the dummy va_dcl argument.
11352 For stdargs, we do want to skip the last named argument. */
11353 next_cum = *cum;
11354 if (stdarg_p (fntype))
11355 ix86_function_arg_advance (pack_cumulative_args (&next_cum), mode, type,
11356 true);
11357
11358 if (cum->call_abi == MS_ABI)
11359 setup_incoming_varargs_ms_64 (&next_cum);
11360 else
11361 setup_incoming_varargs_64 (&next_cum);
11362 }
11363
11364 static void
11365 ix86_setup_incoming_vararg_bounds (cumulative_args_t cum_v,
11366 enum machine_mode mode,
11367 tree type,
11368 int *pretend_size ATTRIBUTE_UNUSED,
11369 int no_rtl)
11370 {
11371 CUMULATIVE_ARGS *cum = get_cumulative_args (cum_v);
11372 CUMULATIVE_ARGS next_cum;
11373 tree fntype;
11374 rtx save_area;
11375 int bnd_reg, i, max;
11376
11377 gcc_assert (!no_rtl);
11378
11379 /* Do nothing if we use plain pointer to argument area. */
11380 if (!TARGET_64BIT || cum->call_abi == MS_ABI)
11381 return;
11382
11383 fntype = TREE_TYPE (current_function_decl);
11384
11385 /* For varargs, we do not want to skip the dummy va_dcl argument.
11386 For stdargs, we do want to skip the last named argument. */
11387 next_cum = *cum;
11388 if (stdarg_p (fntype))
11389 ix86_function_arg_advance (pack_cumulative_args (&next_cum), mode, type,
11390 true);
11391 save_area = frame_pointer_rtx;
11392
11393 max = cum->regno + cfun->va_list_gpr_size / UNITS_PER_WORD;
11394 if (max > X86_64_REGPARM_MAX)
11395 max = X86_64_REGPARM_MAX;
11396
11397 bnd_reg = cum->bnd_regno + cum->force_bnd_pass;
11398 if (chkp_function_instrumented_p (current_function_decl))
11399 for (i = cum->regno; i < max; i++)
11400 {
11401 rtx addr = plus_constant (Pmode, save_area, i * UNITS_PER_WORD);
11402 rtx ptr = gen_rtx_REG (Pmode,
11403 x86_64_int_parameter_registers[i]);
11404 rtx bounds;
11405
11406 if (bnd_reg <= LAST_BND_REG)
11407 bounds = gen_rtx_REG (BNDmode, bnd_reg);
11408 else
11409 {
11410 rtx ldx_addr =
11411 plus_constant (Pmode, arg_pointer_rtx,
11412 (LAST_BND_REG - bnd_reg) * GET_MODE_SIZE (Pmode));
11413 bounds = gen_reg_rtx (BNDmode);
11414 emit_insn (BNDmode == BND64mode
11415 ? gen_bnd64_ldx (bounds, ldx_addr, ptr)
11416 : gen_bnd32_ldx (bounds, ldx_addr, ptr));
11417 }
11418
11419 emit_insn (BNDmode == BND64mode
11420 ? gen_bnd64_stx (addr, ptr, bounds)
11421 : gen_bnd32_stx (addr, ptr, bounds));
11422
11423 bnd_reg++;
11424 }
11425 }
11426
11427
11428 /* Check whether TYPE is a va_list of the plain char * kind. */
11429
11430 static bool
11431 is_va_list_char_pointer (tree type)
11432 {
11433 tree canonic;
11434
11435 /* For 32-bit it is always true. */
11436 if (!TARGET_64BIT)
11437 return true;
11438 canonic = ix86_canonical_va_list_type (type);
11439 return (canonic == ms_va_list_type_node
11440 || (ix86_abi == MS_ABI && canonic == va_list_type_node));
11441 }
11442
11443 /* Implement va_start. */
11444
11445 static void
11446 ix86_va_start (tree valist, rtx nextarg)
11447 {
11448 HOST_WIDE_INT words, n_gpr, n_fpr;
11449 tree f_gpr, f_fpr, f_ovf, f_sav;
11450 tree gpr, fpr, ovf, sav, t;
11451 tree type;
11452 rtx ovf_rtx;
11453
11454 if (flag_split_stack
11455 && cfun->machine->split_stack_varargs_pointer == NULL_RTX)
11456 {
11457 unsigned int scratch_regno;
11458
11459 /* When we are splitting the stack, we can't refer to the stack
11460 arguments using internal_arg_pointer, because they may be on
11461 the old stack. The split stack prologue will arrange to
11462 leave a pointer to the old stack arguments in a scratch
11463 register, which we here copy to a pseudo-register. The split
11464 stack prologue can't set the pseudo-register directly because
11465 it (the prologue) runs before any registers have been saved. */
11466
11467 scratch_regno = split_stack_prologue_scratch_regno ();
11468 if (scratch_regno != INVALID_REGNUM)
11469 {
11470 rtx reg;
11471 rtx_insn *seq;
11472
11473 reg = gen_reg_rtx (Pmode);
11474 cfun->machine->split_stack_varargs_pointer = reg;
11475
11476 start_sequence ();
11477 emit_move_insn (reg, gen_rtx_REG (Pmode, scratch_regno));
11478 seq = get_insns ();
11479 end_sequence ();
11480
11481 push_topmost_sequence ();
11482 emit_insn_after (seq, entry_of_function ());
11483 pop_topmost_sequence ();
11484 }
11485 }
11486
11487 /* Only 64bit target needs something special. */
11488 if (is_va_list_char_pointer (TREE_TYPE (valist)))
11489 {
11490 if (cfun->machine->split_stack_varargs_pointer == NULL_RTX)
11491 std_expand_builtin_va_start (valist, nextarg);
11492 else
11493 {
11494 rtx va_r, next;
11495
11496 va_r = expand_expr (valist, NULL_RTX, VOIDmode, EXPAND_WRITE);
11497 next = expand_binop (ptr_mode, add_optab,
11498 cfun->machine->split_stack_varargs_pointer,
11499 crtl->args.arg_offset_rtx,
11500 NULL_RTX, 0, OPTAB_LIB_WIDEN);
11501 convert_move (va_r, next, 0);
11502
11503 /* Store zero bounds for va_list. */
11504 if (chkp_function_instrumented_p (current_function_decl))
11505 chkp_expand_bounds_reset_for_mem (valist,
11506 make_tree (TREE_TYPE (valist),
11507 next));
11508
11509 }
11510 return;
11511 }
11512
11513 f_gpr = TYPE_FIELDS (TREE_TYPE (sysv_va_list_type_node));
11514 f_fpr = DECL_CHAIN (f_gpr);
11515 f_ovf = DECL_CHAIN (f_fpr);
11516 f_sav = DECL_CHAIN (f_ovf);
11517
11518 valist = build_simple_mem_ref (valist);
11519 TREE_TYPE (valist) = TREE_TYPE (sysv_va_list_type_node);
11520 /* The following should be folded into the MEM_REF offset. */
11521 gpr = build3 (COMPONENT_REF, TREE_TYPE (f_gpr), unshare_expr (valist),
11522 f_gpr, NULL_TREE);
11523 fpr = build3 (COMPONENT_REF, TREE_TYPE (f_fpr), unshare_expr (valist),
11524 f_fpr, NULL_TREE);
11525 ovf = build3 (COMPONENT_REF, TREE_TYPE (f_ovf), unshare_expr (valist),
11526 f_ovf, NULL_TREE);
11527 sav = build3 (COMPONENT_REF, TREE_TYPE (f_sav), unshare_expr (valist),
11528 f_sav, NULL_TREE);
11529
11530 /* Count number of gp and fp argument registers used. */
11531 words = crtl->args.info.words;
11532 n_gpr = crtl->args.info.regno;
11533 n_fpr = crtl->args.info.sse_regno;
11534
11535 if (cfun->va_list_gpr_size)
11536 {
11537 type = TREE_TYPE (gpr);
11538 t = build2 (MODIFY_EXPR, type,
11539 gpr, build_int_cst (type, n_gpr * 8));
11540 TREE_SIDE_EFFECTS (t) = 1;
11541 expand_expr (t, const0_rtx, VOIDmode, EXPAND_NORMAL);
11542 }
11543
11544 if (TARGET_SSE && cfun->va_list_fpr_size)
11545 {
11546 type = TREE_TYPE (fpr);
11547 t = build2 (MODIFY_EXPR, type, fpr,
11548 build_int_cst (type, n_fpr * 16 + 8*X86_64_REGPARM_MAX));
11549 TREE_SIDE_EFFECTS (t) = 1;
11550 expand_expr (t, const0_rtx, VOIDmode, EXPAND_NORMAL);
11551 }
11552
11553 /* Find the overflow area. */
11554 type = TREE_TYPE (ovf);
11555 if (cfun->machine->split_stack_varargs_pointer == NULL_RTX)
11556 ovf_rtx = crtl->args.internal_arg_pointer;
11557 else
11558 ovf_rtx = cfun->machine->split_stack_varargs_pointer;
11559 t = make_tree (type, ovf_rtx);
11560 if (words != 0)
11561 t = fold_build_pointer_plus_hwi (t, words * UNITS_PER_WORD);
11562
11563 /* Store zero bounds for overflow area pointer. */
11564 if (chkp_function_instrumented_p (current_function_decl))
11565 chkp_expand_bounds_reset_for_mem (ovf, t);
11566
11567 t = build2 (MODIFY_EXPR, type, ovf, t);
11568 TREE_SIDE_EFFECTS (t) = 1;
11569 expand_expr (t, const0_rtx, VOIDmode, EXPAND_NORMAL);
11570
11571 if (ix86_varargs_gpr_size || ix86_varargs_fpr_size)
11572 {
11573 /* Find the register save area.
11574 The function prologue saves it right above the stack frame. */
11575 type = TREE_TYPE (sav);
11576 t = make_tree (type, frame_pointer_rtx);
11577 if (!ix86_varargs_gpr_size)
11578 t = fold_build_pointer_plus_hwi (t, -8 * X86_64_REGPARM_MAX);
11579
11580 /* Store zero bounds for save area pointer. */
11581 if (chkp_function_instrumented_p (current_function_decl))
11582 chkp_expand_bounds_reset_for_mem (sav, t);
11583
11584 t = build2 (MODIFY_EXPR, type, sav, t);
11585 TREE_SIDE_EFFECTS (t) = 1;
11586 expand_expr (t, const0_rtx, VOIDmode, EXPAND_NORMAL);
11587 }
11588 }
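
/* Worked example (added commentary, not in the original source; the
   function f is hypothetical): for a variadic function

     void f (int a, ...);

   one named integer argument has been consumed, so the code above stores
   gp_offset = 1 * 8 = 8, fp_offset = 8 * X86_64_REGPARM_MAX + 0 * 16 = 48,
   points overflow_arg_area at the first stack-passed slot and
   reg_save_area at the block written by setup_incoming_varargs_64.  */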
11589
11590 /* Implement va_arg. */
11591
11592 static tree
11593 ix86_gimplify_va_arg (tree valist, tree type, gimple_seq *pre_p,
11594 gimple_seq *post_p)
11595 {
11596 static const int intreg[6] = { 0, 1, 2, 3, 4, 5 };
11597 tree f_gpr, f_fpr, f_ovf, f_sav;
11598 tree gpr, fpr, ovf, sav, t;
11599 int size, rsize;
11600 tree lab_false, lab_over = NULL_TREE;
11601 tree addr, t2;
11602 rtx container;
11603 int indirect_p = 0;
11604 tree ptrtype;
11605 machine_mode nat_mode;
11606 unsigned int arg_boundary;
11607
11608 /* Only 64bit target needs something special. */
11609 if (is_va_list_char_pointer (TREE_TYPE (valist)))
11610 return std_gimplify_va_arg_expr (valist, type, pre_p, post_p);
11611
11612 f_gpr = TYPE_FIELDS (TREE_TYPE (sysv_va_list_type_node));
11613 f_fpr = DECL_CHAIN (f_gpr);
11614 f_ovf = DECL_CHAIN (f_fpr);
11615 f_sav = DECL_CHAIN (f_ovf);
11616
11617 gpr = build3 (COMPONENT_REF, TREE_TYPE (f_gpr),
11618 valist, f_gpr, NULL_TREE);
11619
11620 fpr = build3 (COMPONENT_REF, TREE_TYPE (f_fpr), valist, f_fpr, NULL_TREE);
11621 ovf = build3 (COMPONENT_REF, TREE_TYPE (f_ovf), valist, f_ovf, NULL_TREE);
11622 sav = build3 (COMPONENT_REF, TREE_TYPE (f_sav), valist, f_sav, NULL_TREE);
11623
11624 indirect_p = pass_by_reference (NULL, TYPE_MODE (type), type, false);
11625 if (indirect_p)
11626 type = build_pointer_type (type);
11627 size = int_size_in_bytes (type);
11628 rsize = CEIL (size, UNITS_PER_WORD);
11629
11630 nat_mode = type_natural_mode (type, NULL, false);
11631 switch (nat_mode)
11632 {
11633 case V8SFmode:
11634 case V8SImode:
11635 case V32QImode:
11636 case V16HImode:
11637 case V4DFmode:
11638 case V4DImode:
11639 case V16SFmode:
11640 case V16SImode:
11641 case V64QImode:
11642 case V32HImode:
11643 case V8DFmode:
11644 case V8DImode:
11645 /* Unnamed 256 and 512bit vector mode parameters are passed on stack. */
11646 if (!TARGET_64BIT_MS_ABI)
11647 {
11648 container = NULL;
11649 break;
11650 }
11651 /* FALLTHRU */
11652
11653 default:
11654 container = construct_container (nat_mode, TYPE_MODE (type),
11655 type, 0, X86_64_REGPARM_MAX,
11656 X86_64_SSE_REGPARM_MAX, intreg,
11657 0);
11658 break;
11659 }
11660
11661 /* Pull the value out of the saved registers. */
11662
11663 addr = create_tmp_var (ptr_type_node, "addr");
11664
11665 if (container)
11666 {
11667 int needed_intregs, needed_sseregs;
11668 bool need_temp;
11669 tree int_addr, sse_addr;
11670
11671 lab_false = create_artificial_label (UNKNOWN_LOCATION);
11672 lab_over = create_artificial_label (UNKNOWN_LOCATION);
11673
11674 examine_argument (nat_mode, type, 0, &needed_intregs, &needed_sseregs);
11675
11676 need_temp = (!REG_P (container)
11677 && ((needed_intregs && TYPE_ALIGN (type) > 64)
11678 || TYPE_ALIGN (type) > 128));
11679
11680 /* In case we are passing a structure, verify that it is a consecutive block
11681 in the register save area. If not, we need to do moves. */
11682 if (!need_temp && !REG_P (container))
11683 {
11684 /* Verify that all registers are strictly consecutive. */
11685 if (SSE_REGNO_P (REGNO (XEXP (XVECEXP (container, 0, 0), 0))))
11686 {
11687 int i;
11688
11689 for (i = 0; i < XVECLEN (container, 0) && !need_temp; i++)
11690 {
11691 rtx slot = XVECEXP (container, 0, i);
11692 if (REGNO (XEXP (slot, 0)) != FIRST_SSE_REG + (unsigned int) i
11693 || INTVAL (XEXP (slot, 1)) != i * 16)
11694 need_temp = true;
11695 }
11696 }
11697 else
11698 {
11699 int i;
11700
11701 for (i = 0; i < XVECLEN (container, 0) && !need_temp; i++)
11702 {
11703 rtx slot = XVECEXP (container, 0, i);
11704 if (REGNO (XEXP (slot, 0)) != (unsigned int) i
11705 || INTVAL (XEXP (slot, 1)) != i * 8)
11706 need_temp = true;
11707 }
11708 }
11709 }
11710 if (!need_temp)
11711 {
11712 int_addr = addr;
11713 sse_addr = addr;
11714 }
11715 else
11716 {
11717 int_addr = create_tmp_var (ptr_type_node, "int_addr");
11718 sse_addr = create_tmp_var (ptr_type_node, "sse_addr");
11719 }
11720
11721 /* First ensure that we fit completely in registers. */
11722 if (needed_intregs)
11723 {
11724 t = build_int_cst (TREE_TYPE (gpr),
11725 (X86_64_REGPARM_MAX - needed_intregs + 1) * 8);
11726 t = build2 (GE_EXPR, boolean_type_node, gpr, t);
11727 t2 = build1 (GOTO_EXPR, void_type_node, lab_false);
11728 t = build3 (COND_EXPR, void_type_node, t, t2, NULL_TREE);
11729 gimplify_and_add (t, pre_p);
11730 }
11731 if (needed_sseregs)
11732 {
11733 t = build_int_cst (TREE_TYPE (fpr),
11734 (X86_64_SSE_REGPARM_MAX - needed_sseregs + 1) * 16
11735 + X86_64_REGPARM_MAX * 8);
11736 t = build2 (GE_EXPR, boolean_type_node, fpr, t);
11737 t2 = build1 (GOTO_EXPR, void_type_node, lab_false);
11738 t = build3 (COND_EXPR, void_type_node, t, t2, NULL_TREE);
11739 gimplify_and_add (t, pre_p);
11740 }
11741
11742 /* Compute index to start of area used for integer regs. */
11743 if (needed_intregs)
11744 {
11745 /* int_addr = gpr + sav; */
11746 t = fold_build_pointer_plus (sav, gpr);
11747 gimplify_assign (int_addr, t, pre_p);
11748 }
11749 if (needed_sseregs)
11750 {
11751 /* sse_addr = fpr + sav; */
11752 t = fold_build_pointer_plus (sav, fpr);
11753 gimplify_assign (sse_addr, t, pre_p);
11754 }
11755 if (need_temp)
11756 {
11757 int i, prev_size = 0;
11758 tree temp = create_tmp_var (type, "va_arg_tmp");
11759
11760 /* addr = &temp; */
11761 t = build1 (ADDR_EXPR, build_pointer_type (type), temp);
11762 gimplify_assign (addr, t, pre_p);
11763
11764 for (i = 0; i < XVECLEN (container, 0); i++)
11765 {
11766 rtx slot = XVECEXP (container, 0, i);
11767 rtx reg = XEXP (slot, 0);
11768 machine_mode mode = GET_MODE (reg);
11769 tree piece_type;
11770 tree addr_type;
11771 tree daddr_type;
11772 tree src_addr, src;
11773 int src_offset;
11774 tree dest_addr, dest;
11775 int cur_size = GET_MODE_SIZE (mode);
11776
11777 gcc_assert (prev_size <= INTVAL (XEXP (slot, 1)));
11778 prev_size = INTVAL (XEXP (slot, 1));
11779 if (prev_size + cur_size > size)
11780 {
11781 cur_size = size - prev_size;
11782 mode = mode_for_size (cur_size * BITS_PER_UNIT, MODE_INT, 1);
11783 if (mode == BLKmode)
11784 mode = QImode;
11785 }
11786 piece_type = lang_hooks.types.type_for_mode (mode, 1);
11787 if (mode == GET_MODE (reg))
11788 addr_type = build_pointer_type (piece_type);
11789 else
11790 addr_type = build_pointer_type_for_mode (piece_type, ptr_mode,
11791 true);
11792 daddr_type = build_pointer_type_for_mode (piece_type, ptr_mode,
11793 true);
11794
11795 if (SSE_REGNO_P (REGNO (reg)))
11796 {
11797 src_addr = sse_addr;
11798 src_offset = (REGNO (reg) - FIRST_SSE_REG) * 16;
11799 }
11800 else
11801 {
11802 src_addr = int_addr;
11803 src_offset = REGNO (reg) * 8;
11804 }
11805 src_addr = fold_convert (addr_type, src_addr);
11806 src_addr = fold_build_pointer_plus_hwi (src_addr, src_offset);
11807
11808 dest_addr = fold_convert (daddr_type, addr);
11809 dest_addr = fold_build_pointer_plus_hwi (dest_addr, prev_size);
11810 if (cur_size == GET_MODE_SIZE (mode))
11811 {
11812 src = build_va_arg_indirect_ref (src_addr);
11813 dest = build_va_arg_indirect_ref (dest_addr);
11814
11815 gimplify_assign (dest, src, pre_p);
11816 }
11817 else
11818 {
11819 tree copy
11820 = build_call_expr (builtin_decl_implicit (BUILT_IN_MEMCPY),
11821 3, dest_addr, src_addr,
11822 size_int (cur_size));
11823 gimplify_and_add (copy, pre_p);
11824 }
11825 prev_size += cur_size;
11826 }
11827 }
11828
11829 if (needed_intregs)
11830 {
11831 t = build2 (PLUS_EXPR, TREE_TYPE (gpr), gpr,
11832 build_int_cst (TREE_TYPE (gpr), needed_intregs * 8));
11833 gimplify_assign (gpr, t, pre_p);
11834 }
11835
11836 if (needed_sseregs)
11837 {
11838 t = build2 (PLUS_EXPR, TREE_TYPE (fpr), fpr,
11839 build_int_cst (TREE_TYPE (fpr), needed_sseregs * 16));
11840 gimplify_assign (unshare_expr (fpr), t, pre_p);
11841 }
11842
11843 gimple_seq_add_stmt (pre_p, gimple_build_goto (lab_over));
11844
11845 gimple_seq_add_stmt (pre_p, gimple_build_label (lab_false));
11846 }
11847
11848 /* ... otherwise out of the overflow area. */
11849
11850 /* When we align a parameter on the stack for the caller, if the parameter's
11851 alignment is beyond MAX_SUPPORTED_STACK_ALIGNMENT, it will be
11852 aligned at MAX_SUPPORTED_STACK_ALIGNMENT. Match the callee here
11853 with the caller. */
11854 arg_boundary = ix86_function_arg_boundary (VOIDmode, type);
11855 if ((unsigned int) arg_boundary > MAX_SUPPORTED_STACK_ALIGNMENT)
11856 arg_boundary = MAX_SUPPORTED_STACK_ALIGNMENT;
11857
11858 /* Care for on-stack alignment if needed. */
11859 if (arg_boundary <= 64 || size == 0)
11860 t = ovf;
11861 else
11862 {
11863 HOST_WIDE_INT align = arg_boundary / 8;
11864 t = fold_build_pointer_plus_hwi (ovf, align - 1);
11865 t = build2 (BIT_AND_EXPR, TREE_TYPE (t), t,
11866 build_int_cst (TREE_TYPE (t), -align));
11867 }
11868
11869 gimplify_expr (&t, pre_p, NULL, is_gimple_val, fb_rvalue);
11870 gimplify_assign (addr, t, pre_p);
11871
11872 t = fold_build_pointer_plus_hwi (t, rsize * UNITS_PER_WORD);
11873 gimplify_assign (unshare_expr (ovf), t, pre_p);
11874
11875 if (container)
11876 gimple_seq_add_stmt (pre_p, gimple_build_label (lab_over));
11877
11878 ptrtype = build_pointer_type_for_mode (type, ptr_mode, true);
11879 addr = fold_convert (ptrtype, addr);
11880
11881 if (indirect_p)
11882 addr = build_va_arg_indirect_ref (addr);
11883 return build_va_arg_indirect_ref (addr);
11884 }
11885 \f
11886 /* Return true if OPNUM's MEM should be matched
11887 in movabs* patterns. */
11888
11889 bool
11890 ix86_check_movabs (rtx insn, int opnum)
11891 {
11892 rtx set, mem;
11893
11894 set = PATTERN (insn);
11895 if (GET_CODE (set) == PARALLEL)
11896 set = XVECEXP (set, 0, 0);
11897 gcc_assert (GET_CODE (set) == SET);
11898 mem = XEXP (set, opnum);
11899 while (SUBREG_P (mem))
11900 mem = SUBREG_REG (mem);
11901 gcc_assert (MEM_P (mem));
11902 return volatile_ok || !MEM_VOLATILE_P (mem);
11903 }
11904
11905 /* Return false if INSN contains a MEM with a non-default address space. */
11906 bool
11907 ix86_check_no_addr_space (rtx insn)
11908 {
11909 subrtx_var_iterator::array_type array;
11910 FOR_EACH_SUBRTX_VAR (iter, array, PATTERN (insn), ALL)
11911 {
11912 rtx x = *iter;
11913 if (MEM_P (x) && !ADDR_SPACE_GENERIC_P (MEM_ADDR_SPACE (x)))
11914 return false;
11915 }
11916 return true;
11917 }
11918 \f
11919 /* Initialize the table of extra 80387 mathematical constants. */
11920
11921 static void
11922 init_ext_80387_constants (void)
11923 {
11924 static const char * cst[5] =
11925 {
11926 "0.3010299956639811952256464283594894482", /* 0: fldlg2 */
11927 "0.6931471805599453094286904741849753009", /* 1: fldln2 */
11928 "1.4426950408889634073876517827983434472", /* 2: fldl2e */
11929 "3.3219280948873623478083405569094566090", /* 3: fldl2t */
11930 "3.1415926535897932385128089594061862044", /* 4: fldpi */
11931 };
11932 int i;
11933
11934 for (i = 0; i < 5; i++)
11935 {
11936 real_from_string (&ext_80387_constants_table[i], cst[i]);
11937 /* Ensure each constant is rounded to XFmode precision. */
11938 real_convert (&ext_80387_constants_table[i],
11939 XFmode, &ext_80387_constants_table[i]);
11940 }
11941
11942 ext_80387_constants_init = 1;
11943 }
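/* For reference, the table above encodes these mathematical identities,
   each rounded to XFmode precision:
     fldlg2  loads log10(2)           ~ 0.30103
     fldln2  loads ln(2)              ~ 0.69315
     fldl2e  loads log2(e) = 1/ln(2)  ~ 1.44270
     fldl2t  loads log2(10)           ~ 3.32193
     fldpi   loads pi                 ~ 3.14159
   so, for instance, entry 2 times entry 1 is exactly 1, as is entry 3
   times entry 0.  */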
11944
11945 /* Return non-zero if the constant is something that
11946 can be loaded with a special instruction. */
11947
11948 int
11949 standard_80387_constant_p (rtx x)
11950 {
11951 machine_mode mode = GET_MODE (x);
11952
11953 const REAL_VALUE_TYPE *r;
11954
11955 if (!(CONST_DOUBLE_P (x) && X87_FLOAT_MODE_P (mode)))
11956 return -1;
11957
11958 if (x == CONST0_RTX (mode))
11959 return 1;
11960 if (x == CONST1_RTX (mode))
11961 return 2;
11962
11963 r = CONST_DOUBLE_REAL_VALUE (x);
11964
11965 /* For XFmode constants, try to find a special 80387 instruction when
11966 optimizing for size or on those CPUs that benefit from them. */
11967 if (mode == XFmode
11968 && (optimize_function_for_size_p (cfun) || TARGET_EXT_80387_CONSTANTS))
11969 {
11970 int i;
11971
11972 if (! ext_80387_constants_init)
11973 init_ext_80387_constants ();
11974
11975 for (i = 0; i < 5; i++)
11976 if (real_identical (r, &ext_80387_constants_table[i]))
11977 return i + 3;
11978 }
11979
11980 /* Load of the constant -0.0 or -1.0 will be split as
11981 fldz;fchs or fld1;fchs sequence. */
11982 if (real_isnegzero (r))
11983 return 8;
11984 if (real_identical (r, &dconstm1))
11985 return 9;
11986
11987 return 0;
11988 }
11989
11990 /* Return the opcode of the special instruction to be used to load
11991 the constant X. */
11992
11993 const char *
11994 standard_80387_constant_opcode (rtx x)
11995 {
11996 switch (standard_80387_constant_p (x))
11997 {
11998 case 1:
11999 return "fldz";
12000 case 2:
12001 return "fld1";
12002 case 3:
12003 return "fldlg2";
12004 case 4:
12005 return "fldln2";
12006 case 5:
12007 return "fldl2e";
12008 case 6:
12009 return "fldl2t";
12010 case 7:
12011 return "fldpi";
12012 case 8:
12013 case 9:
12014 return "#";
12015 default:
12016 gcc_unreachable ();
12017 }
12018 }
12019
12020 /* Return the CONST_DOUBLE representing the 80387 constant that is
12021 loaded by the specified special instruction. The argument IDX
12022 matches the return value from standard_80387_constant_p. */
12023
12024 rtx
12025 standard_80387_constant_rtx (int idx)
12026 {
12027 int i;
12028
12029 if (! ext_80387_constants_init)
12030 init_ext_80387_constants ();
12031
12032 switch (idx)
12033 {
12034 case 3:
12035 case 4:
12036 case 5:
12037 case 6:
12038 case 7:
12039 i = idx - 3;
12040 break;
12041
12042 default:
12043 gcc_unreachable ();
12044 }
12045
12046 return const_double_from_real_value (ext_80387_constants_table[i],
12047 XFmode);
12048 }
12049
12050 /* Return 1 if X is all bits 0 and 2 if X is all bits 1
12051 in supported SSE/AVX vector mode. */
12052
12053 int
12054 standard_sse_constant_p (rtx x, machine_mode pred_mode)
12055 {
12056 machine_mode mode;
12057
12058 if (!TARGET_SSE)
12059 return 0;
12060
12061 mode = GET_MODE (x);
12062
12063 if (x == const0_rtx || const0_operand (x, mode))
12064 return 1;
12065
12066 if (x == constm1_rtx || vector_all_ones_operand (x, mode))
12067 {
12068 /* VOIDmode integer constant, get mode from the predicate. */
12069 if (mode == VOIDmode)
12070 mode = pred_mode;
12071
12072 switch (GET_MODE_SIZE (mode))
12073 {
12074 case 64:
12075 if (TARGET_AVX512F)
12076 return 2;
12077 break;
12078 case 32:
12079 if (TARGET_AVX2)
12080 return 2;
12081 break;
12082 case 16:
12083 if (TARGET_SSE2)
12084 return 2;
12085 break;
12086 case 0:
12087 /* VOIDmode */
12088 gcc_unreachable ();
12089 default:
12090 break;
12091 }
12092 }
12093
12094 return 0;
12095 }
12096
12097 /* Return the opcode of the special instruction to be used to load
12098 the constant X. */
12099
12100 const char *
12101 standard_sse_constant_opcode (rtx_insn *insn, rtx x)
12102 {
12103 machine_mode mode;
12104
12105 gcc_assert (TARGET_SSE);
12106
12107 mode = GET_MODE (x);
12108
12109 if (x == const0_rtx || const0_operand (x, mode))
12110 {
12111 switch (get_attr_mode (insn))
12112 {
12113 case MODE_XI:
12114 return "vpxord\t%g0, %g0, %g0";
12115 case MODE_OI:
12116 return (TARGET_AVX512VL
12117 ? "vpxord\t%x0, %x0, %x0"
12118 : "vpxor\t%x0, %x0, %x0");
12119 case MODE_TI:
12120 return (TARGET_AVX512VL
12121 ? "vpxord\t%t0, %t0, %t0"
12122 : "%vpxor\t%0, %d0");
12123
12124 case MODE_V8DF:
12125 return (TARGET_AVX512DQ
12126 ? "vxorpd\t%g0, %g0, %g0"
12127 : "vpxorq\t%g0, %g0, %g0");
12128 case MODE_V4DF:
12129 return "vxorpd\t%x0, %x0, %x0";
12130 case MODE_V2DF:
12131 return "%vxorpd\t%0, %d0";
12132
12133 case MODE_V16SF:
12134 return (TARGET_AVX512DQ
12135 ? "vxorps\t%g0, %g0, %g0"
12136 : "vpxord\t%g0, %g0, %g0");
12137 case MODE_V8SF:
12138 return "vxorps\t%x0, %x0, %x0";
12139 case MODE_V4SF:
12140 return "%vxorps\t%0, %d0";
12141
12142 default:
12143 gcc_unreachable ();
12144 }
12145 }
12146 else if (x == constm1_rtx || vector_all_ones_operand (x, mode))
12147 {
12148 enum attr_mode insn_mode = get_attr_mode (insn);
12149
12150 switch (insn_mode)
12151 {
12152 case MODE_XI:
12153 case MODE_V8DF:
12154 case MODE_V16SF:
12155 gcc_assert (TARGET_AVX512F);
12156 return "vpternlogd\t{$0xFF, %g0, %g0, %g0|%g0, %g0, %g0, 0xFF}";
12157
12158 case MODE_OI:
12159 case MODE_V4DF:
12160 case MODE_V8SF:
12161 gcc_assert (TARGET_AVX2);
12162 /* FALLTHRU */
12163 case MODE_TI:
12164 case MODE_V2DF:
12165 case MODE_V4SF:
12166 gcc_assert (TARGET_SSE2);
12167 return (TARGET_AVX
12168 ? "vpcmpeqd\t%0, %0, %0"
12169 : "pcmpeqd\t%0, %0");
12170
12171 default:
12172 gcc_unreachable ();
12173 }
12174 }
12175
12176 gcc_unreachable ();
12177 }
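/* For concreteness, the two helpers above map the special SSE/AVX
   constants to instruction sequences along these lines (AT&T syntax,
   register choice purely illustrative):
     all-zeros, SSE2:      pxor    %xmm0, %xmm0
     all-zeros, AVX V4SF:  vxorps  %xmm0, %xmm0, %xmm0
     all-ones,  SSE2:      pcmpeqd %xmm0, %xmm0
     all-ones,  AVX-512:   vpternlogd $0xFF, %zmm0, %zmm0, %zmm0
   vpternlogd with immediate 0xFF computes a ternary logic function that
   is 1 for every input combination, i.e. all bits set.  */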
12178
12179 /* Returns true if INSN can be transformed from a memory load
12180 to a supported FP constant load. */
12181
12182 bool
12183 ix86_standard_x87sse_constant_load_p (const rtx_insn *insn, rtx dst)
12184 {
12185 rtx src = find_constant_src (insn);
12186
12187 gcc_assert (REG_P (dst));
12188
12189 if (src == NULL
12190 || (SSE_REGNO_P (REGNO (dst))
12191 && standard_sse_constant_p (src, GET_MODE (dst)) != 1)
12192 || (STACK_REGNO_P (REGNO (dst))
12193 && standard_80387_constant_p (src) < 1))
12194 return false;
12195
12196 return true;
12197 }
12198
12199 /* Returns true if OP contains a symbol reference. */
12200
12201 bool
12202 symbolic_reference_mentioned_p (rtx op)
12203 {
12204 const char *fmt;
12205 int i;
12206
12207 if (GET_CODE (op) == SYMBOL_REF || GET_CODE (op) == LABEL_REF)
12208 return true;
12209
12210 fmt = GET_RTX_FORMAT (GET_CODE (op));
12211 for (i = GET_RTX_LENGTH (GET_CODE (op)) - 1; i >= 0; i--)
12212 {
12213 if (fmt[i] == 'E')
12214 {
12215 int j;
12216
12217 for (j = XVECLEN (op, i) - 1; j >= 0; j--)
12218 if (symbolic_reference_mentioned_p (XVECEXP (op, i, j)))
12219 return true;
12220 }
12221
12222 else if (fmt[i] == 'e' && symbolic_reference_mentioned_p (XEXP (op, i)))
12223 return true;
12224 }
12225
12226 return false;
12227 }
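/* A small worked example of the walk above: for the address
     (mem:SI (plus:SI (reg:SI 0) (symbol_ref:SI ("foo"))))
   neither the MEM nor the PLUS code is SYMBOL_REF or LABEL_REF, but
   their format strings contain 'e' operands, so the recursion descends
   through them and returns true on reaching the SYMBOL_REF.  A PARALLEL
   or other vector operand would be handled by the 'E' branch instead.  */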
12228
12229 /* Return true if it is appropriate to emit `ret' instructions in the
12230 body of a function. Do this only if the epilogue is simple, needing a
12231 couple of insns. Prior to reloading, we can't tell how many registers
12232 must be saved, so return false then. Return false if there is no frame
12233 marker to de-allocate. */
12234
12235 bool
12236 ix86_can_use_return_insn_p (void)
12237 {
12238 struct ix86_frame frame;
12239
12240 /* Don't use `ret' instruction in interrupt handler. */
12241 if (! reload_completed
12242 || frame_pointer_needed
12243 || cfun->machine->func_type != TYPE_NORMAL)
12244 return 0;
12245
12246 /* Don't allow more than 32k pop, since that's all we can do
12247 with one instruction. */
12248 if (crtl->args.pops_args && crtl->args.size >= 32768)
12249 return 0;
12250
12251 frame = cfun->machine->frame;
12252 return (frame.stack_pointer_offset == UNITS_PER_WORD
12253 && (frame.nregs + frame.nsseregs) == 0);
12254 }
12255 \f
12256 /* Value should be nonzero if functions must have frame pointers.
12257 Zero means the frame pointer need not be set up (and parms may
12258 be accessed via the stack pointer) in functions that seem suitable. */
12259
12260 static bool
12261 ix86_frame_pointer_required (void)
12262 {
12263 /* If we accessed previous frames, then the generated code expects
12264 to be able to access the saved ebp value in our frame. */
12265 if (cfun->machine->accesses_prev_frame)
12266 return true;
12267
12268 /* Several x86 OSes need a frame pointer for other reasons,
12269 usually pertaining to setjmp. */
12270 if (SUBTARGET_FRAME_POINTER_REQUIRED)
12271 return true;
12272
12273 /* For older 32-bit runtimes setjmp requires valid frame-pointer. */
12274 if (TARGET_32BIT_MS_ABI && cfun->calls_setjmp)
12275 return true;
12276
12277 /* For Win64 SEH, very large frames need a frame pointer, as the
12278 maximum stack allocation is 4GB. */
12279 if (TARGET_64BIT_MS_ABI && get_frame_size () > SEH_MAX_FRAME_SIZE)
12280 return true;
12281
12282 /* SSE saves require frame-pointer when stack is misaligned. */
12283 if (TARGET_64BIT_MS_ABI && ix86_incoming_stack_boundary < 128)
12284 return true;
12285
12286 /* In ix86_option_override_internal, TARGET_OMIT_LEAF_FRAME_POINTER
12287 turns off the frame pointer by default. Turn it back on now if
12288 we've not got a leaf function. */
12289 if (TARGET_OMIT_LEAF_FRAME_POINTER
12290 && (!crtl->is_leaf
12291 || ix86_current_function_calls_tls_descriptor))
12292 return true;
12293
12294 if (crtl->profile && !flag_fentry)
12295 return true;
12296
12297 return false;
12298 }
12299
12300 /* Record that the current function accesses previous call frames. */
12301
12302 void
12303 ix86_setup_frame_addresses (void)
12304 {
12305 cfun->machine->accesses_prev_frame = 1;
12306 }
12307 \f
12308 #ifndef USE_HIDDEN_LINKONCE
12309 # if defined(HAVE_GAS_HIDDEN) && (SUPPORTS_ONE_ONLY - 0)
12310 # define USE_HIDDEN_LINKONCE 1
12311 # else
12312 # define USE_HIDDEN_LINKONCE 0
12313 # endif
12314 #endif
12315
12316 static int pic_labels_used;
12317
12318 /* Fills in the label name that should be used for a pc thunk for
12319 the given register. */
12320
12321 static void
12322 get_pc_thunk_name (char name[32], unsigned int regno)
12323 {
12324 gcc_assert (!TARGET_64BIT);
12325
12326 if (USE_HIDDEN_LINKONCE)
12327 sprintf (name, "__x86.get_pc_thunk.%s", reg_names[regno]);
12328 else
12329 ASM_GENERATE_INTERNAL_LABEL (name, "LPR", regno);
12330 }
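/* For example, with USE_HIDDEN_LINKONCE the thunk that materializes the
   PC in %ebx is named "__x86.get_pc_thunk.bx"; otherwise an internal
   label is generated from the "LPR" prefix and the register number (its
   exact spelling depends on ASM_GENERATE_INTERNAL_LABEL for the
   target).  */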
12331
12332
12333 /* This function generates code for -fpic that loads %ebx with
12334 the return address of the caller and then returns. */
12335
12336 static void
12337 ix86_code_end (void)
12338 {
12339 rtx xops[2];
12340 int regno;
12341
12342 for (regno = FIRST_INT_REG; regno <= LAST_INT_REG; regno++)
12343 {
12344 char name[32];
12345 tree decl;
12346
12347 if (!(pic_labels_used & (1 << regno)))
12348 continue;
12349
12350 get_pc_thunk_name (name, regno);
12351
12352 decl = build_decl (BUILTINS_LOCATION, FUNCTION_DECL,
12353 get_identifier (name),
12354 build_function_type_list (void_type_node, NULL_TREE));
12355 DECL_RESULT (decl) = build_decl (BUILTINS_LOCATION, RESULT_DECL,
12356 NULL_TREE, void_type_node);
12357 TREE_PUBLIC (decl) = 1;
12358 TREE_STATIC (decl) = 1;
12359 DECL_IGNORED_P (decl) = 1;
12360
12361 #if TARGET_MACHO
12362 if (TARGET_MACHO)
12363 {
12364 switch_to_section (darwin_sections[picbase_thunk_section]);
12365 fputs ("\t.weak_definition\t", asm_out_file);
12366 assemble_name (asm_out_file, name);
12367 fputs ("\n\t.private_extern\t", asm_out_file);
12368 assemble_name (asm_out_file, name);
12369 putc ('\n', asm_out_file);
12370 ASM_OUTPUT_LABEL (asm_out_file, name);
12371 DECL_WEAK (decl) = 1;
12372 }
12373 else
12374 #endif
12375 if (USE_HIDDEN_LINKONCE)
12376 {
12377 cgraph_node::create (decl)->set_comdat_group (DECL_ASSEMBLER_NAME (decl));
12378
12379 targetm.asm_out.unique_section (decl, 0);
12380 switch_to_section (get_named_section (decl, NULL, 0));
12381
12382 targetm.asm_out.globalize_label (asm_out_file, name);
12383 fputs ("\t.hidden\t", asm_out_file);
12384 assemble_name (asm_out_file, name);
12385 putc ('\n', asm_out_file);
12386 ASM_DECLARE_FUNCTION_NAME (asm_out_file, name, decl);
12387 }
12388 else
12389 {
12390 switch_to_section (text_section);
12391 ASM_OUTPUT_LABEL (asm_out_file, name);
12392 }
12393
12394 DECL_INITIAL (decl) = make_node (BLOCK);
12395 current_function_decl = decl;
12396 allocate_struct_function (decl, false);
12397 init_function_start (decl);
12398 /* We're about to hide the function body from callees of final_* by
12399 emitting it directly; tell them we're a thunk, if they care. */
12400 cfun->is_thunk = true;
12401 first_function_block_is_cold = false;
12402 /* Make sure unwind info is emitted for the thunk if needed. */
12403 final_start_function (emit_barrier (), asm_out_file, 1);
12404
12405 /* Pad stack IP move with 4 instructions (two NOPs count
12406 as one instruction). */
12407 if (TARGET_PAD_SHORT_FUNCTION)
12408 {
12409 int i = 8;
12410
12411 while (i--)
12412 fputs ("\tnop\n", asm_out_file);
12413 }
12414
12415 xops[0] = gen_rtx_REG (Pmode, regno);
12416 xops[1] = gen_rtx_MEM (Pmode, stack_pointer_rtx);
12417 output_asm_insn ("mov%z0\t{%1, %0|%0, %1}", xops);
12418 output_asm_insn ("%!ret", NULL);
12419 final_end_function ();
12420 init_insn_lengths ();
12421 free_after_compilation (cfun);
12422 set_cfun (NULL);
12423 current_function_decl = NULL;
12424 }
12425
12426 if (flag_split_stack)
12427 file_end_indicate_split_stack ();
12428 }
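/* The body emitted above for each used PIC register is just a load of
   the return address followed by a return; for %ebx it comes out as
   (AT&T syntax):
       __x86.get_pc_thunk.bx:
               movl    (%esp), %ebx
               ret
   possibly preceded by nop padding when the TARGET_PAD_SHORT_FUNCTION
   tuning is in effect.  */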
12429
12430 /* Emit code for the SET_GOT patterns. */
12431
12432 const char *
12433 output_set_got (rtx dest, rtx label)
12434 {
12435 rtx xops[3];
12436
12437 xops[0] = dest;
12438
12439 if (TARGET_VXWORKS_RTP && flag_pic)
12440 {
12441 /* Load (*VXWORKS_GOTT_BASE) into the PIC register. */
12442 xops[2] = gen_rtx_MEM (Pmode,
12443 gen_rtx_SYMBOL_REF (Pmode, VXWORKS_GOTT_BASE));
12444 output_asm_insn ("mov{l}\t{%2, %0|%0, %2}", xops);
12445
12446 /* Load (*VXWORKS_GOTT_BASE)[VXWORKS_GOTT_INDEX] into the PIC register.
12447 Use %P and a local symbol in order to print VXWORKS_GOTT_INDEX as
12448 an unadorned address. */
12449 xops[2] = gen_rtx_SYMBOL_REF (Pmode, VXWORKS_GOTT_INDEX);
12450 SYMBOL_REF_FLAGS (xops[2]) |= SYMBOL_FLAG_LOCAL;
12451 output_asm_insn ("mov{l}\t{%P2(%0), %0|%0, DWORD PTR %P2[%0]}", xops);
12452 return "";
12453 }
12454
12455 xops[1] = gen_rtx_SYMBOL_REF (Pmode, GOT_SYMBOL_NAME);
12456
12457 if (flag_pic)
12458 {
12459 char name[32];
12460 get_pc_thunk_name (name, REGNO (dest));
12461 pic_labels_used |= 1 << REGNO (dest);
12462
12463 xops[2] = gen_rtx_SYMBOL_REF (Pmode, ggc_strdup (name));
12464 xops[2] = gen_rtx_MEM (QImode, xops[2]);
12465 output_asm_insn ("%!call\t%X2", xops);
12466
12467 #if TARGET_MACHO
12468 /* Output the Mach-O "canonical" pic base label name ("Lxx$pb") here.
12469 This is what will be referenced by the Mach-O PIC subsystem. */
12470 if (machopic_should_output_picbase_label () || !label)
12471 ASM_OUTPUT_LABEL (asm_out_file, MACHOPIC_FUNCTION_BASE_NAME);
12472
12473 /* When we are restoring the pic base at the site of a nonlocal label,
12474 and we decided to emit the pic base above, we will still output a
12475 local label used for calculating the correction offset (even though
12476 the offset will be 0 in that case). */
12477 if (label)
12478 targetm.asm_out.internal_label (asm_out_file, "L",
12479 CODE_LABEL_NUMBER (label));
12480 #endif
12481 }
12482 else
12483 {
12484 if (TARGET_MACHO)
12485 /* We don't need a pic base, we're not producing pic. */
12486 gcc_unreachable ();
12487
12488 xops[2] = gen_rtx_LABEL_REF (Pmode, label ? label : gen_label_rtx ());
12489 output_asm_insn ("mov%z0\t{%2, %0|%0, %2}", xops);
12490 targetm.asm_out.internal_label (asm_out_file, "L",
12491 CODE_LABEL_NUMBER (XEXP (xops[2], 0)));
12492 }
12493
12494 if (!TARGET_MACHO)
12495 output_asm_insn ("add%z0\t{%1, %0|%0, %1}", xops);
12496
12497 return "";
12498 }
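/* On ELF targets with -fpic this typically expands to the familiar
   sequence (AT&T syntax, %ebx purely illustrative):
       call    __x86.get_pc_thunk.bx
       addl    $_GLOBAL_OFFSET_TABLE_, %ebx
   where the thunk (see ix86_code_end above) leaves the address of the
   following instruction in %ebx and the add rebases it to the GOT.  The
   non-PIC and VxWorks RTP paths above instead use moves from a local
   label or from the GOTT_BASE/GOTT_INDEX pair.  */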
12499
12500 /* Generate a "push" pattern for input ARG. */
12501
12502 static rtx
12503 gen_push (rtx arg)
12504 {
12505 struct machine_function *m = cfun->machine;
12506
12507 if (m->fs.cfa_reg == stack_pointer_rtx)
12508 m->fs.cfa_offset += UNITS_PER_WORD;
12509 m->fs.sp_offset += UNITS_PER_WORD;
12510
12511 if (REG_P (arg) && GET_MODE (arg) != word_mode)
12512 arg = gen_rtx_REG (word_mode, REGNO (arg));
12513
12514 return gen_rtx_SET (gen_rtx_MEM (word_mode,
12515 gen_rtx_PRE_DEC (Pmode,
12516 stack_pointer_rtx)),
12517 arg);
12518 }
12519
12520 /* Generate a "pop" pattern for input ARG. */
12521
12522 static rtx
12523 gen_pop (rtx arg)
12524 {
12525 if (REG_P (arg) && GET_MODE (arg) != word_mode)
12526 arg = gen_rtx_REG (word_mode, REGNO (arg));
12527
12528 return gen_rtx_SET (arg,
12529 gen_rtx_MEM (word_mode,
12530 gen_rtx_POST_INC (Pmode,
12531 stack_pointer_rtx)));
12532 }
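/* The RTL produced by the two generators above has the canonical
   push/pop shapes; on x86-64, for example (register purely
   illustrative):
     gen_push: (set (mem:DI (pre_dec:DI (reg:DI sp))) (reg:DI bx))
     gen_pop:  (set (reg:DI bx) (mem:DI (post_inc:DI (reg:DI sp))))
   Only gen_push updates the tracked CFA/SP offsets in cfun->machine;
   epilogue pops adjust them elsewhere.  */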
12533
12534 /* Return >= 0 if there is an unused call-clobbered register available
12535 for the entire function. */
12536
12537 static unsigned int
12538 ix86_select_alt_pic_regnum (void)
12539 {
12540 if (ix86_use_pseudo_pic_reg ())
12541 return INVALID_REGNUM;
12542
12543 if (crtl->is_leaf
12544 && !crtl->profile
12545 && !ix86_current_function_calls_tls_descriptor)
12546 {
12547 int i, drap;
12548 /* Can't use the same register for both PIC and DRAP. */
12549 if (crtl->drap_reg)
12550 drap = REGNO (crtl->drap_reg);
12551 else
12552 drap = -1;
12553 for (i = 2; i >= 0; --i)
12554 if (i != drap && !df_regs_ever_live_p (i))
12555 return i;
12556 }
12557
12558 return INVALID_REGNUM;
12559 }
12560
12561 /* Return true if REGNO is used by the epilogue. */
12562
12563 bool
12564 ix86_epilogue_uses (int regno)
12565 {
12566 /* If there are no caller-saved registers, we preserve all registers,
12567 except for MMX and x87 registers which aren't supported when saving
12568 and restoring registers. Don't explicitly save SP register since
12569 it is always preserved. */
12570 return (epilogue_completed
12571 && cfun->machine->no_caller_saved_registers
12572 && !fixed_regs[regno]
12573 && !STACK_REGNO_P (regno)
12574 && !MMX_REGNO_P (regno));
12575 }
12576
12577 /* Return nonzero if register REGNO can be used as a scratch register
12578 in peephole2. */
12579
12580 static bool
12581 ix86_hard_regno_scratch_ok (unsigned int regno)
12582 {
12583 /* If there are no caller-saved registers, we can't use any register
12584 as a scratch register after the epilogue, and we use REGNO as a
12585 scratch register only if it has already been used, to avoid saving
12586 and restoring it. */
12587 return (!cfun->machine->no_caller_saved_registers
12588 || (!epilogue_completed
12589 && df_regs_ever_live_p (regno)));
12590 }
12591
12592 /* Return true if register class CL should be an additional allocno
12593 class. */
12594
12595 static bool
12596 ix86_additional_allocno_class_p (reg_class_t cl)
12597 {
12598 return cl == MOD4_SSE_REGS;
12599 }
12600
12601 /* Return TRUE if we need to save REGNO. */
12602
12603 static bool
12604 ix86_save_reg (unsigned int regno, bool maybe_eh_return, bool ignore_outlined)
12605 {
12606 /* If there are no caller-saved registers, we preserve all registers,
12607 except for MMX and x87 registers which aren't supported when saving
12608 and restoring registers. Don't explicitly save SP register since
12609 it is always preserved. */
12610 if (cfun->machine->no_caller_saved_registers)
12611 {
12612 /* Don't preserve registers used for function return value. */
12613 rtx reg = crtl->return_rtx;
12614 if (reg)
12615 {
12616 unsigned int i = REGNO (reg);
12617 unsigned int nregs = hard_regno_nregs[i][GET_MODE (reg)];
12618 while (nregs-- > 0)
12619 if ((i + nregs) == regno)
12620 return false;
12621
12622 reg = crtl->return_bnd;
12623 if (reg)
12624 {
12625 i = REGNO (reg);
12626 nregs = hard_regno_nregs[i][GET_MODE (reg)];
12627 while (nregs-- > 0)
12628 if ((i + nregs) == regno)
12629 return false;
12630 }
12631 }
12632
12633 return (df_regs_ever_live_p (regno)
12634 && !fixed_regs[regno]
12635 && !STACK_REGNO_P (regno)
12636 && !MMX_REGNO_P (regno)
12637 && (regno != HARD_FRAME_POINTER_REGNUM
12638 || !frame_pointer_needed));
12639 }
12640
12641 if (regno == REAL_PIC_OFFSET_TABLE_REGNUM
12642 && pic_offset_table_rtx)
12643 {
12644 if (ix86_use_pseudo_pic_reg ())
12645 {
12646 /* REAL_PIC_OFFSET_TABLE_REGNUM used by call to
12647 _mcount in prologue. */
12648 if (!TARGET_64BIT && flag_pic && crtl->profile)
12649 return true;
12650 }
12651 else if (df_regs_ever_live_p (REAL_PIC_OFFSET_TABLE_REGNUM)
12652 || crtl->profile
12653 || crtl->calls_eh_return
12654 || crtl->uses_const_pool
12655 || cfun->has_nonlocal_label)
12656 return ix86_select_alt_pic_regnum () == INVALID_REGNUM;
12657 }
12658
12659 if (crtl->calls_eh_return && maybe_eh_return)
12660 {
12661 unsigned i;
12662 for (i = 0; ; i++)
12663 {
12664 unsigned test = EH_RETURN_DATA_REGNO (i);
12665 if (test == INVALID_REGNUM)
12666 break;
12667 if (test == regno)
12668 return true;
12669 }
12670 }
12671
12672 if (ignore_outlined && cfun->machine->call_ms2sysv)
12673 {
12674 unsigned count = cfun->machine->call_ms2sysv_extra_regs
12675 + xlogue_layout::MIN_REGS;
12676 if (xlogue_layout::is_stub_managed_reg (regno, count))
12677 return false;
12678 }
12679
12680 if (crtl->drap_reg
12681 && regno == REGNO (crtl->drap_reg)
12682 && !cfun->machine->no_drap_save_restore)
12683 return true;
12684
12685 return (df_regs_ever_live_p (regno)
12686 && !call_used_regs[regno]
12687 && !fixed_regs[regno]
12688 && (regno != HARD_FRAME_POINTER_REGNUM || !frame_pointer_needed));
12689 }
12690
12691 /* Return the number of saved general purpose registers. */
12692
12693 static int
12694 ix86_nsaved_regs (void)
12695 {
12696 int nregs = 0;
12697 int regno;
12698
12699 for (regno = 0; regno < FIRST_PSEUDO_REGISTER; regno++)
12700 if (GENERAL_REGNO_P (regno) && ix86_save_reg (regno, true, true))
12701 nregs ++;
12702 return nregs;
12703 }
12704
12705 /* Return number of saved SSE registers. */
12706
12707 static int
12708 ix86_nsaved_sseregs (void)
12709 {
12710 int nregs = 0;
12711 int regno;
12712
12713 if (!TARGET_64BIT_MS_ABI)
12714 return 0;
12715 for (regno = 0; regno < FIRST_PSEUDO_REGISTER; regno++)
12716 if (SSE_REGNO_P (regno) && ix86_save_reg (regno, true, true))
12717 nregs ++;
12718 return nregs;
12719 }
12720
12721 /* Given FROM and TO register numbers, say whether this elimination is
12722 allowed. If stack alignment is needed, we can only replace argument
12723 pointer with hard frame pointer, or replace frame pointer with stack
12724 pointer. Otherwise, frame pointer elimination is automatically
12725 handled and all other eliminations are valid. */
12726
12727 static bool
12728 ix86_can_eliminate (const int from, const int to)
12729 {
12730 if (stack_realign_fp)
12731 return ((from == ARG_POINTER_REGNUM
12732 && to == HARD_FRAME_POINTER_REGNUM)
12733 || (from == FRAME_POINTER_REGNUM
12734 && to == STACK_POINTER_REGNUM));
12735 else
12736 return to == STACK_POINTER_REGNUM ? !frame_pointer_needed : true;
12737 }
12738
12739 /* Return the offset between two registers, one to be eliminated, and the other
12740 its replacement, at the start of a routine. */
12741
12742 HOST_WIDE_INT
12743 ix86_initial_elimination_offset (int from, int to)
12744 {
12745 struct ix86_frame frame = cfun->machine->frame;
12746
12747 if (from == ARG_POINTER_REGNUM && to == HARD_FRAME_POINTER_REGNUM)
12748 return frame.hard_frame_pointer_offset;
12749 else if (from == FRAME_POINTER_REGNUM
12750 && to == HARD_FRAME_POINTER_REGNUM)
12751 return frame.hard_frame_pointer_offset - frame.frame_pointer_offset;
12752 else
12753 {
12754 gcc_assert (to == STACK_POINTER_REGNUM);
12755
12756 if (from == ARG_POINTER_REGNUM)
12757 return frame.stack_pointer_offset;
12758
12759 gcc_assert (from == FRAME_POINTER_REGNUM);
12760 return frame.stack_pointer_offset - frame.frame_pointer_offset;
12761 }
12762 }
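/* A worked example with purely hypothetical offsets: if the computed
   frame has hard_frame_pointer_offset == 16, frame_pointer_offset == 32
   and stack_pointer_offset == 96, then
     arg pointer   -> hard frame pointer : 16
     frame pointer -> hard frame pointer : 16 - 32 == -16
     arg pointer   -> stack pointer      : 96
     frame pointer -> stack pointer      : 96 - 32 == 64
   matching the four cases handled above.  */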
12763
12764 /* In a dynamically-aligned function, we can't know the offset from
12765 stack pointer to frame pointer, so we must ensure that setjmp
12766 eliminates fp against the hard fp (%ebp) rather than trying to
12767 index from %esp up to the top of the frame across a gap that is
12768 of unknown (at compile-time) size. */
12769 static rtx
12770 ix86_builtin_setjmp_frame_value (void)
12771 {
12772 return stack_realign_fp ? hard_frame_pointer_rtx : virtual_stack_vars_rtx;
12773 }
12774
12775 /* Emits a warning for unsupported msabi to sysv pro/epilogues. */
12776 static void warn_once_call_ms2sysv_xlogues (const char *feature)
12777 {
12778 static bool warned_once = false;
12779 if (!warned_once)
12780 {
12781 warning (0, "-mcall-ms2sysv-xlogues is not compatible with %s",
12782 feature);
12783 warned_once = true;
12784 }
12785 }
12786
12787 /* When using -fsplit-stack, the allocation routines set a field in
12788 the TCB to the bottom of the stack plus this much space, measured
12789 in bytes. */
12790
12791 #define SPLIT_STACK_AVAILABLE 256
12792
12793 /* Fill the ix86_frame structure describing the frame of the currently compiled function. */
12794
12795 static void
12796 ix86_compute_frame_layout (void)
12797 {
12798 struct ix86_frame *frame = &cfun->machine->frame;
12799 struct machine_function *m = cfun->machine;
12800 unsigned HOST_WIDE_INT stack_alignment_needed;
12801 HOST_WIDE_INT offset;
12802 unsigned HOST_WIDE_INT preferred_alignment;
12803 HOST_WIDE_INT size = get_frame_size ();
12804 HOST_WIDE_INT to_allocate;
12805
12806 /* m->call_ms2sysv is initially enabled in ix86_expand_call for all 64-bit
12807 * ms_abi functions that call a sysv function. We now need to prune away
12808 * cases where it should be disabled. */
12809 if (TARGET_64BIT && m->call_ms2sysv)
12810 {
12811 gcc_assert (TARGET_64BIT_MS_ABI);
12812 gcc_assert (TARGET_CALL_MS2SYSV_XLOGUES);
12813 gcc_assert (!TARGET_SEH);
12814 gcc_assert (TARGET_SSE);
12815 gcc_assert (!ix86_using_red_zone ());
12816
12817 if (crtl->calls_eh_return)
12818 {
12819 gcc_assert (!reload_completed);
12820 m->call_ms2sysv = false;
12821 warn_once_call_ms2sysv_xlogues ("__builtin_eh_return");
12822 }
12823
12824 else if (ix86_static_chain_on_stack)
12825 {
12826 gcc_assert (!reload_completed);
12827 m->call_ms2sysv = false;
12828 warn_once_call_ms2sysv_xlogues ("static call chains");
12829 }
12830
12831 /* Finally, compute which registers the stub will manage. */
12832 else
12833 {
12834 unsigned count = xlogue_layout::count_stub_managed_regs ();
12835 m->call_ms2sysv_extra_regs = count - xlogue_layout::MIN_REGS;
12836 }
12837 }
12838
12839 frame->nregs = ix86_nsaved_regs ();
12840 frame->nsseregs = ix86_nsaved_sseregs ();
12841 m->call_ms2sysv_pad_in = 0;
12842 m->call_ms2sysv_pad_out = 0;
12843
12844 /* The 64-bit MS ABI seems to require stack alignment to always be 16,
12845 except for function prologues, leaf functions, and when the default
12846 incoming stack boundary is overridden on the command line or via the
12847 force_align_arg_pointer attribute. */
12848 if ((TARGET_64BIT_MS_ABI && crtl->preferred_stack_boundary < 128)
12849 && (!crtl->is_leaf || cfun->calls_alloca != 0
12850 || ix86_current_function_calls_tls_descriptor
12851 || ix86_incoming_stack_boundary < 128))
12852 {
12853 crtl->preferred_stack_boundary = 128;
12854 crtl->stack_alignment_needed = 128;
12855 }
12856
12857 stack_alignment_needed = crtl->stack_alignment_needed / BITS_PER_UNIT;
12858 preferred_alignment = crtl->preferred_stack_boundary / BITS_PER_UNIT;
12859
12860 gcc_assert (!size || stack_alignment_needed);
12861 gcc_assert (preferred_alignment >= STACK_BOUNDARY / BITS_PER_UNIT);
12862 gcc_assert (preferred_alignment <= stack_alignment_needed);
12863
12864 /* For SEH we have to limit the amount of code movement into the prologue.
12865 At present we do this via a BLOCKAGE, at which point there's very little
12866 scheduling that can be done, which means that there's very little point
12867 in doing anything except PUSHs. */
12868 if (TARGET_SEH)
12869 m->use_fast_prologue_epilogue = false;
12870 else if (!optimize_bb_for_size_p (ENTRY_BLOCK_PTR_FOR_FN (cfun)))
12871 {
12872 int count = frame->nregs;
12873 struct cgraph_node *node = cgraph_node::get (current_function_decl);
12874
12875 /* The fast prologue uses move instead of push to save registers. This
12876 is significantly longer, but also executes faster as modern hardware
12877 can execute the moves in parallel, but can't do that for push/pop.
12878
12879 Be careful about choosing which prologue to emit: when the function
12880 takes many instructions to execute, we may use the slow version, as
12881 well as when the function is known to be outside a hot spot (known
12882 only with profile feedback). Weight the size of the function by the
12883 number of registers to save, as it is cheap to use one or two push
12884 instructions but very slow to use many of them. */
12885 if (count)
12886 count = (count - 1) * FAST_PROLOGUE_INSN_COUNT;
12887 if (node->frequency < NODE_FREQUENCY_NORMAL
12888 || (flag_branch_probabilities
12889 && node->frequency < NODE_FREQUENCY_HOT))
12890 m->use_fast_prologue_epilogue = false;
12891 else
12892 m->use_fast_prologue_epilogue
12893 = !expensive_function_p (count);
12894 }
12895
12896 frame->save_regs_using_mov
12897 = (TARGET_PROLOGUE_USING_MOVE && m->use_fast_prologue_epilogue
12898 /* If static stack checking is enabled and done with probes,
12899 the registers need to be saved before allocating the frame. */
12900 && flag_stack_check != STATIC_BUILTIN_STACK_CHECK);
12901
12902 /* Skip return address. */
12903 offset = UNITS_PER_WORD;
12904
12905 /* Skip pushed static chain. */
12906 if (ix86_static_chain_on_stack)
12907 offset += UNITS_PER_WORD;
12908
12909 /* Skip saved base pointer. */
12910 if (frame_pointer_needed)
12911 offset += UNITS_PER_WORD;
12912 frame->hfp_save_offset = offset;
12913
12914 /* The traditional frame pointer location is at the top of the frame. */
12915 frame->hard_frame_pointer_offset = offset;
12916
12917 /* Register save area */
12918 offset += frame->nregs * UNITS_PER_WORD;
12919 frame->reg_save_offset = offset;
12920
12921 /* On SEH target, registers are pushed just before the frame pointer
12922 location. */
12923 if (TARGET_SEH)
12924 frame->hard_frame_pointer_offset = offset;
12925
12926 /* When re-aligning the stack frame, but not saving SSE registers, this
12927 is the offset we want to adjust the stack pointer to. */
12928 frame->stack_realign_allocate_offset = offset;
12929
12930 /* The re-aligned stack starts here. Values before this point are not
12931 directly comparable with values below this point. Use sp_valid_at
12932 to determine if the stack pointer is valid for a given offset and
12933 fp_valid_at for the frame pointer. */
12934 if (stack_realign_fp)
12935 offset = ROUND_UP (offset, stack_alignment_needed);
12936 frame->stack_realign_offset = offset;
12937
12938 if (TARGET_64BIT && m->call_ms2sysv)
12939 {
12940 gcc_assert (stack_alignment_needed >= 16);
12941 gcc_assert (!frame->nsseregs);
12942
12943 m->call_ms2sysv_pad_in = !!(offset & UNITS_PER_WORD);
12944
12945 /* Select an appropriate layout for incoming stack offset. */
12946 const struct xlogue_layout &xlogue = xlogue_layout::get_instance ();
12947
12948 if ((offset + xlogue.get_stack_space_used ()) & UNITS_PER_WORD)
12949 m->call_ms2sysv_pad_out = 1;
12950
12951 offset += xlogue.get_stack_space_used ();
12952 gcc_assert (!(offset & 0xf));
12953 frame->outlined_save_offset = offset;
12954 }
12955
12956 /* Align and set SSE register save area. */
12957 else if (frame->nsseregs)
12958 {
12959 /* The only ABI that has saved SSE registers (Win64) also has a
12960 16-byte aligned default stack. However, many programs violate
12961 the ABI, and Wine64 forces stack realignment to compensate.
12962
12963 If the incoming stack boundary is at least 16 bytes, or DRAP is
12964 required and the DRAP re-alignment boundary is at least 16 bytes,
12965 then we want the SSE register save area properly aligned. */
12966 if (ix86_incoming_stack_boundary >= 128
12967 || (stack_realign_drap && stack_alignment_needed >= 16))
12968 offset = ROUND_UP (offset, 16);
12969 offset += frame->nsseregs * 16;
12970 frame->stack_realign_allocate_offset = offset;
12971 }
12972
12973 frame->sse_reg_save_offset = offset;
12974
12975 /* Va-arg area */
12976 frame->va_arg_size = ix86_varargs_gpr_size + ix86_varargs_fpr_size;
12977 offset += frame->va_arg_size;
12978
12979 /* Align start of frame for local function. */
12980 if (stack_realign_fp
12981 || offset != frame->sse_reg_save_offset
12982 || size != 0
12983 || !crtl->is_leaf
12984 || cfun->calls_alloca
12985 || ix86_current_function_calls_tls_descriptor)
12986 offset = ROUND_UP (offset, stack_alignment_needed);
12987
12988 /* Frame pointer points here. */
12989 frame->frame_pointer_offset = offset;
12990
12991 offset += size;
12992
12993 /* Add the outgoing arguments area. This can be skipped if we eliminated
12994 all the function calls as dead code.
12995 Skipping is however impossible when the function calls alloca: the
12996 alloca expander assumes that the last crtl->outgoing_args_size bytes
12997 of the stack frame are unused. */
12998 if (ACCUMULATE_OUTGOING_ARGS
12999 && (!crtl->is_leaf || cfun->calls_alloca
13000 || ix86_current_function_calls_tls_descriptor))
13001 {
13002 offset += crtl->outgoing_args_size;
13003 frame->outgoing_arguments_size = crtl->outgoing_args_size;
13004 }
13005 else
13006 frame->outgoing_arguments_size = 0;
13007
13008 /* Align stack boundary. Only needed if we're calling another function
13009 or using alloca. */
13010 if (!crtl->is_leaf || cfun->calls_alloca
13011 || ix86_current_function_calls_tls_descriptor)
13012 offset = ROUND_UP (offset, preferred_alignment);
13013
13014 /* We've reached end of stack frame. */
13015 frame->stack_pointer_offset = offset;
13016
13017 /* Size prologue needs to allocate. */
13018 to_allocate = offset - frame->sse_reg_save_offset;
13019
13020 if ((!to_allocate && frame->nregs <= 1)
13021 || (TARGET_64BIT && to_allocate >= HOST_WIDE_INT_C (0x80000000)))
13022 frame->save_regs_using_mov = false;
13023
13024 if (ix86_using_red_zone ()
13025 && crtl->sp_is_unchanging
13026 && crtl->is_leaf
13027 && !ix86_pc_thunk_call_expanded
13028 && !ix86_current_function_calls_tls_descriptor)
13029 {
13030 frame->red_zone_size = to_allocate;
13031 if (frame->save_regs_using_mov)
13032 frame->red_zone_size += frame->nregs * UNITS_PER_WORD;
13033 if (frame->red_zone_size > RED_ZONE_SIZE - RED_ZONE_RESERVE)
13034 frame->red_zone_size = RED_ZONE_SIZE - RED_ZONE_RESERVE;
13035 }
13036 else
13037 frame->red_zone_size = 0;
13038 frame->stack_pointer_offset -= frame->red_zone_size;
13039
13040 /* The SEH frame pointer location is near the bottom of the frame.
13041 This is enforced by the fact that the difference between the
13042 stack pointer and the frame pointer is limited to 240 bytes in
13043 the unwind data structure. */
13044 if (TARGET_SEH)
13045 {
13046 HOST_WIDE_INT diff;
13047
13048 /* If we can leave the frame pointer where it is, do so. This also
13049 gives the establisher frame for __builtin_frame_address (0). */
13050 diff = frame->stack_pointer_offset - frame->hard_frame_pointer_offset;
13051 if (diff <= SEH_MAX_FRAME_SIZE
13052 && (diff > 240 || (diff & 15) != 0)
13053 && !crtl->accesses_prior_frames)
13054 {
13055 /* Ideally we'd determine what portion of the local stack frame
13056 (within the constraint of the lowest 240) is most heavily used.
13057 But without that complication, simply bias the frame pointer
13058 by 128 bytes so as to maximize the amount of the local stack
13059 frame that is addressable with 8-bit offsets. */
13060 frame->hard_frame_pointer_offset = frame->stack_pointer_offset - 128;
13061 }
13062 }
13063 }
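/* To summarize the layout computed above, offsets grow downward from
   the return address (a sketch; which pieces exist depends on the
   function):
     return address
     [pushed static chain]
     [saved frame pointer]                 <- hard_frame_pointer_offset (non-SEH)
     GP register save area                 <- reg_save_offset
     [ms2sysv stub area or aligned SSE register save area]
                                           <- sse_reg_save_offset
     [va_arg register save area]
     local variables                       <- frame_pointer_offset
     [outgoing arguments]
     end of frame                          <- stack_pointer_offset
   For leaf functions that can use the red zone, red_zone_size is then
   subtracted from stack_pointer_offset.  */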
13064
13065 /* This is semi-inlined memory_address_length, but simplified
13066 since we know that we're always dealing with reg+offset, and
13067 to avoid having to create and discard all that rtl. */
13068
13069 static inline int
13070 choose_baseaddr_len (unsigned int regno, HOST_WIDE_INT offset)
13071 {
13072 int len = 4;
13073
13074 if (offset == 0)
13075 {
13076 /* EBP and R13 cannot be encoded without an offset. */
13077 len = (regno == BP_REG || regno == R13_REG);
13078 }
13079 else if (IN_RANGE (offset, -128, 127))
13080 len = 1;
13081
13082 /* ESP and R12 must be encoded with a SIB byte. */
13083 if (regno == SP_REG || regno == R12_REG)
13084 len++;
13085
13086 return len;
13087 }
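/* Worked examples of the encoding-length estimate above (bytes of
   displacement/SIB needed beyond the ModRM byte):
     (%rax), offset 0    -> 0
     (%rbp), offset 0    -> 1   (EBP/R13 need an explicit disp8 of 0)
     (%rsp), offset 0    -> 1   (SIB byte)
     (%rax), offset 100  -> 1   (disp8)
     (%rsp), offset 100  -> 2   (SIB + disp8)
     (%rax), offset 1000 -> 4   (disp32)
     (%r12), offset 1000 -> 5   (SIB + disp32)  */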
13088
13089 /* Determine if the stack pointer is valid for accessing the cfa_offset. */
13090
13091 static inline bool
13092 sp_valid_at (HOST_WIDE_INT cfa_offset)
13093 {
13094 const struct machine_frame_state &fs = cfun->machine->fs;
13095 return fs.sp_valid && !(fs.sp_realigned
13096 && cfa_offset < fs.sp_realigned_offset);
13097 }
13098
13099 /* Determine if the frame pointer is valid for accessing the cfa_offset. */
13100
13101 static inline bool
13102 fp_valid_at (HOST_WIDE_INT cfa_offset)
13103 {
13104 const struct machine_frame_state &fs = cfun->machine->fs;
13105 return fs.fp_valid && !(fs.sp_valid && fs.sp_realigned
13106 && cfa_offset >= fs.sp_realigned_offset);
13107 }
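/* A short illustration of the two predicates above, assuming a frame
   state (hypothetical numbers) with sp_valid, fp_valid and sp_realigned
   all set and sp_realigned_offset == 32:
     sp_valid_at (16) -> false    fp_valid_at (16) -> true
     sp_valid_at (48) -> true     fp_valid_at (48) -> false
   i.e. CFA offsets smaller than sp_realigned_offset must be reached
   through the frame pointer, those at or above it through the
   realigned stack pointer.  */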
13108
13109 /* Choose a base register based upon alignment requested, speed and/or
13110 size. */
13111
13112 static void
13113 choose_basereg (HOST_WIDE_INT cfa_offset, rtx &base_reg,
13114 HOST_WIDE_INT &base_offset,
13115 unsigned int align_reqested, unsigned int *align)
13116 {
13117 const struct machine_function *m = cfun->machine;
13118 unsigned int hfp_align;
13119 unsigned int drap_align;
13120 unsigned int sp_align;
13121 bool hfp_ok = fp_valid_at (cfa_offset);
13122 bool drap_ok = m->fs.drap_valid;
13123 bool sp_ok = sp_valid_at (cfa_offset);
13124
13125 hfp_align = drap_align = sp_align = INCOMING_STACK_BOUNDARY;
13126
13127 /* Filter out any registers that don't meet the requested alignment
13128 criteria. */
13129 if (align_reqested)
13130 {
13131 if (m->fs.realigned)
13132 hfp_align = drap_align = sp_align = crtl->stack_alignment_needed;
13133 /* SEH unwind code does not currently support REG_CFA_EXPRESSION
13134 notes (which we would need to use a realigned stack pointer),
13135 so disable on SEH targets. */
13136 else if (m->fs.sp_realigned)
13137 sp_align = crtl->stack_alignment_needed;
13138
13139 hfp_ok = hfp_ok && hfp_align >= align_reqested;
13140 drap_ok = drap_ok && drap_align >= align_reqested;
13141 sp_ok = sp_ok && sp_align >= align_reqested;
13142 }
13143
13144 if (m->use_fast_prologue_epilogue)
13145 {
13146 /* Choose the base register most likely to allow the most scheduling
13147 opportunities. Generally FP is valid throughout the function,
13148 while DRAP must be reloaded within the epilogue. But choose either
13149 over the SP due to its increased encoding size. */
13150
13151 if (hfp_ok)
13152 {
13153 base_reg = hard_frame_pointer_rtx;
13154 base_offset = m->fs.fp_offset - cfa_offset;
13155 }
13156 else if (drap_ok)
13157 {
13158 base_reg = crtl->drap_reg;
13159 base_offset = 0 - cfa_offset;
13160 }
13161 else if (sp_ok)
13162 {
13163 base_reg = stack_pointer_rtx;
13164 base_offset = m->fs.sp_offset - cfa_offset;
13165 }
13166 }
13167 else
13168 {
13169 HOST_WIDE_INT toffset;
13170 int len = 16, tlen;
13171
13172 /* Choose the base register with the smallest address encoding.
13173 With a tie, choose FP > DRAP > SP. */
13174 if (sp_ok)
13175 {
13176 base_reg = stack_pointer_rtx;
13177 base_offset = m->fs.sp_offset - cfa_offset;
13178 len = choose_baseaddr_len (STACK_POINTER_REGNUM, base_offset);
13179 }
13180 if (drap_ok)
13181 {
13182 toffset = 0 - cfa_offset;
13183 tlen = choose_baseaddr_len (REGNO (crtl->drap_reg), toffset);
13184 if (tlen <= len)
13185 {
13186 base_reg = crtl->drap_reg;
13187 base_offset = toffset;
13188 len = tlen;
13189 }
13190 }
13191 if (hfp_ok)
13192 {
13193 toffset = m->fs.fp_offset - cfa_offset;
13194 tlen = choose_baseaddr_len (HARD_FRAME_POINTER_REGNUM, toffset);
13195 if (tlen <= len)
13196 {
13197 base_reg = hard_frame_pointer_rtx;
13198 base_offset = toffset;
13199 len = tlen;
13200 }
13201 }
13202 }
13203
13204 /* Set the align return value. */
13205 if (align)
13206 {
13207 if (base_reg == stack_pointer_rtx)
13208 *align = sp_align;
13209 else if (base_reg == crtl->drap_reg)
13210 *align = drap_align;
13211 else if (base_reg == hard_frame_pointer_rtx)
13212 *align = hfp_align;
13213 }
13214 }
13215
13216 /* Return an RTX that points to CFA_OFFSET within the stack frame and
13217 the alignment of the address. If ALIGN is non-null, it should point
13218 to an alignment value (in bits) that is preferred or zero and will
13219 receive the alignment of the base register that was selected. The
13220 valid base registers are taken from CFUN->MACHINE->FS. */
13221
13222 static rtx
13223 choose_baseaddr (HOST_WIDE_INT cfa_offset, unsigned int *align)
13224 {
13225 rtx base_reg = NULL;
13226 HOST_WIDE_INT base_offset = 0;
13227
13228 /* If a specific alignment is requested, try to get a base register
13229 with that alignment first. */
13230 if (align && *align)
13231 choose_basereg (cfa_offset, base_reg, base_offset, *align, align);
13232
13233 if (!base_reg)
13234 choose_basereg (cfa_offset, base_reg, base_offset, 0, align);
13235
13236 gcc_assert (base_reg != NULL);
13237 return plus_constant (Pmode, base_reg, base_offset);
13238 }
13239
13240 /* Emit code to save registers in the prologue. */
13241
13242 static void
13243 ix86_emit_save_regs (void)
13244 {
13245 unsigned int regno;
13246 rtx_insn *insn;
13247
13248 for (regno = FIRST_PSEUDO_REGISTER - 1; regno-- > 0; )
13249 if (GENERAL_REGNO_P (regno) && ix86_save_reg (regno, true, true))
13250 {
13251 insn = emit_insn (gen_push (gen_rtx_REG (word_mode, regno)));
13252 RTX_FRAME_RELATED_P (insn) = 1;
13253 }
13254 }
13255
13256 /* Emit a single register save at CFA - CFA_OFFSET. */
13257
13258 static void
13259 ix86_emit_save_reg_using_mov (machine_mode mode, unsigned int regno,
13260 HOST_WIDE_INT cfa_offset)
13261 {
13262 struct machine_function *m = cfun->machine;
13263 rtx reg = gen_rtx_REG (mode, regno);
13264 rtx mem, addr, base, insn;
13265 unsigned int align = GET_MODE_ALIGNMENT (mode);
13266
13267 addr = choose_baseaddr (cfa_offset, &align);
13268 mem = gen_frame_mem (mode, addr);
13269
13270 /* The location alignment depends upon the base register. */
13271 align = MIN (GET_MODE_ALIGNMENT (mode), align);
13272 gcc_assert (! (cfa_offset & (align / BITS_PER_UNIT - 1)));
13273 set_mem_align (mem, align);
13274
13275 insn = emit_insn (gen_rtx_SET (mem, reg));
13276 RTX_FRAME_RELATED_P (insn) = 1;
13277
13278 base = addr;
13279 if (GET_CODE (base) == PLUS)
13280 base = XEXP (base, 0);
13281 gcc_checking_assert (REG_P (base));
13282
13283 /* When saving registers into a re-aligned local stack frame, avoid
13284 any tricky guessing by dwarf2out. */
13285 if (m->fs.realigned)
13286 {
13287 gcc_checking_assert (stack_realign_drap);
13288
13289 if (regno == REGNO (crtl->drap_reg))
13290 {
13291 /* A bit of a hack. We force the DRAP register to be saved in
13292 the re-aligned stack frame, which provides us with a copy
13293 of the CFA that will last past the prologue. Install it. */
13294 gcc_checking_assert (cfun->machine->fs.fp_valid);
13295 addr = plus_constant (Pmode, hard_frame_pointer_rtx,
13296 cfun->machine->fs.fp_offset - cfa_offset);
13297 mem = gen_rtx_MEM (mode, addr);
13298 add_reg_note (insn, REG_CFA_DEF_CFA, mem);
13299 }
13300 else
13301 {
13302 /* The frame pointer is a stable reference within the
13303 aligned frame. Use it. */
13304 gcc_checking_assert (cfun->machine->fs.fp_valid);
13305 addr = plus_constant (Pmode, hard_frame_pointer_rtx,
13306 cfun->machine->fs.fp_offset - cfa_offset);
13307 mem = gen_rtx_MEM (mode, addr);
13308 add_reg_note (insn, REG_CFA_EXPRESSION, gen_rtx_SET (mem, reg));
13309 }
13310 }
13311
13312 else if (base == stack_pointer_rtx && m->fs.sp_realigned
13313 && cfa_offset >= m->fs.sp_realigned_offset)
13314 {
13315 gcc_checking_assert (stack_realign_fp);
13316 add_reg_note (insn, REG_CFA_EXPRESSION, gen_rtx_SET (mem, reg));
13317 }
13318
13319 /* The memory may not be relative to the current CFA register,
13320 which means that we may need to generate a new pattern for
13321 use by the unwind info. */
13322 else if (base != m->fs.cfa_reg)
13323 {
13324 addr = plus_constant (Pmode, m->fs.cfa_reg,
13325 m->fs.cfa_offset - cfa_offset);
13326 mem = gen_rtx_MEM (mode, addr);
13327 add_reg_note (insn, REG_CFA_OFFSET, gen_rtx_SET (mem, reg));
13328 }
13329 }
13330
13331 /* Emit code to save registers using MOV insns.
13332 First register is stored at CFA - CFA_OFFSET. */
13333 static void
13334 ix86_emit_save_regs_using_mov (HOST_WIDE_INT cfa_offset)
13335 {
13336 unsigned int regno;
13337
13338 for (regno = 0; regno < FIRST_PSEUDO_REGISTER; regno++)
13339 if (GENERAL_REGNO_P (regno) && ix86_save_reg (regno, true, true))
13340 {
13341 ix86_emit_save_reg_using_mov (word_mode, regno, cfa_offset);
13342 cfa_offset -= UNITS_PER_WORD;
13343 }
13344 }
13345
13346 /* Emit code to save SSE registers using MOV insns.
13347 First register is stored at CFA - CFA_OFFSET. */
13348 static void
13349 ix86_emit_save_sse_regs_using_mov (HOST_WIDE_INT cfa_offset)
13350 {
13351 unsigned int regno;
13352
13353 for (regno = 0; regno < FIRST_PSEUDO_REGISTER; regno++)
13354 if (SSE_REGNO_P (regno) && ix86_save_reg (regno, true, true))
13355 {
13356 ix86_emit_save_reg_using_mov (V4SFmode, regno, cfa_offset);
13357 cfa_offset -= GET_MODE_SIZE (V4SFmode);
13358 }
13359 }
13360
13361 static GTY(()) rtx queued_cfa_restores;
13362
13363 /* Add a REG_CFA_RESTORE REG note to INSN, or queue it until the next
13364 stack manipulation insn. The value is on the stack at CFA - CFA_OFFSET.
13365 Don't add the note if the previously saved value will be left untouched
13366 within the stack red-zone until return, as unwinders can find the same
13367 value in the register and on the stack. */
13368
13369 static void
13370 ix86_add_cfa_restore_note (rtx_insn *insn, rtx reg, HOST_WIDE_INT cfa_offset)
13371 {
13372 if (!crtl->shrink_wrapped
13373 && cfa_offset <= cfun->machine->fs.red_zone_offset)
13374 return;
13375
13376 if (insn)
13377 {
13378 add_reg_note (insn, REG_CFA_RESTORE, reg);
13379 RTX_FRAME_RELATED_P (insn) = 1;
13380 }
13381 else
13382 queued_cfa_restores
13383 = alloc_reg_note (REG_CFA_RESTORE, reg, queued_cfa_restores);
13384 }
13385
13386 /* Add queued REG_CFA_RESTORE notes if any to INSN. */
13387
13388 static void
13389 ix86_add_queued_cfa_restore_notes (rtx insn)
13390 {
13391 rtx last;
13392 if (!queued_cfa_restores)
13393 return;
13394 for (last = queued_cfa_restores; XEXP (last, 1); last = XEXP (last, 1))
13395 ;
13396 XEXP (last, 1) = REG_NOTES (insn);
13397 REG_NOTES (insn) = queued_cfa_restores;
13398 queued_cfa_restores = NULL_RTX;
13399 RTX_FRAME_RELATED_P (insn) = 1;
13400 }
13401
13402 /* Expand prologue or epilogue stack adjustment.
13403 The pattern exists to put a dependency on all ebp-based memory accesses.
13404 STYLE should be negative if instructions should be marked as frame related,
13405 zero if the %r11 register is live and cannot be freely used, and positive
13406 otherwise. */
13407
13408 static void
13409 pro_epilogue_adjust_stack (rtx dest, rtx src, rtx offset,
13410 int style, bool set_cfa)
13411 {
13412 struct machine_function *m = cfun->machine;
13413 rtx insn;
13414 bool add_frame_related_expr = false;
13415
13416 if (Pmode == SImode)
13417 insn = gen_pro_epilogue_adjust_stack_si_add (dest, src, offset);
13418 else if (x86_64_immediate_operand (offset, DImode))
13419 insn = gen_pro_epilogue_adjust_stack_di_add (dest, src, offset);
13420 else
13421 {
13422 rtx tmp;
13423 /* r11 is used by indirect sibcall return as well, set before the
13424 epilogue and used after the epilogue. */
13425 if (style)
13426 tmp = gen_rtx_REG (DImode, R11_REG);
13427 else
13428 {
13429 gcc_assert (src != hard_frame_pointer_rtx
13430 && dest != hard_frame_pointer_rtx);
13431 tmp = hard_frame_pointer_rtx;
13432 }
13433 insn = emit_insn (gen_rtx_SET (tmp, offset));
13434 if (style < 0)
13435 add_frame_related_expr = true;
13436
13437 insn = gen_pro_epilogue_adjust_stack_di_add (dest, src, tmp);
13438 }
13439
13440 insn = emit_insn (insn);
13441 if (style >= 0)
13442 ix86_add_queued_cfa_restore_notes (insn);
13443
13444 if (set_cfa)
13445 {
13446 rtx r;
13447
13448 gcc_assert (m->fs.cfa_reg == src);
13449 m->fs.cfa_offset += INTVAL (offset);
13450 m->fs.cfa_reg = dest;
13451
13452 r = gen_rtx_PLUS (Pmode, src, offset);
13453 r = gen_rtx_SET (dest, r);
13454 add_reg_note (insn, REG_CFA_ADJUST_CFA, r);
13455 RTX_FRAME_RELATED_P (insn) = 1;
13456 }
13457 else if (style < 0)
13458 {
13459 RTX_FRAME_RELATED_P (insn) = 1;
13460 if (add_frame_related_expr)
13461 {
13462 rtx r = gen_rtx_PLUS (Pmode, src, offset);
13463 r = gen_rtx_SET (dest, r);
13464 add_reg_note (insn, REG_FRAME_RELATED_EXPR, r);
13465 }
13466 }
13467
13468 if (dest == stack_pointer_rtx)
13469 {
13470 HOST_WIDE_INT ooffset = m->fs.sp_offset;
13471 bool valid = m->fs.sp_valid;
13472 bool realigned = m->fs.sp_realigned;
13473
13474 if (src == hard_frame_pointer_rtx)
13475 {
13476 valid = m->fs.fp_valid;
13477 realigned = false;
13478 ooffset = m->fs.fp_offset;
13479 }
13480 else if (src == crtl->drap_reg)
13481 {
13482 valid = m->fs.drap_valid;
13483 realigned = false;
13484 ooffset = 0;
13485 }
13486 else
13487 {
13488 /* Else there are two possibilities: SP itself, which we set
13489 up as the default above. Or EH_RETURN_STACKADJ_RTX, which is
13490 taken care of by hand along the eh_return path. */
13491 gcc_checking_assert (src == stack_pointer_rtx
13492 || offset == const0_rtx);
13493 }
13494
13495 m->fs.sp_offset = ooffset - INTVAL (offset);
13496 m->fs.sp_valid = valid;
13497 m->fs.sp_realigned = realigned;
13498 }
13499 }
13500
13501 /* Find an available register to be used as the dynamic realign argument
13502 pointer register. Such a register will be written in the prologue and
13503 used at the beginning of the body, so it must not be
13504 1. a parameter passing register.
13505 2. the GOT pointer.
13506 We reuse the static-chain register if it is available. Otherwise, we
13507 use DI for i386 and R13 for x86-64. We chose R13 since it has a
13508 shorter encoding.
13509
13510 Return: the regno of the chosen register. */
13511
13512 static unsigned int
13513 find_drap_reg (void)
13514 {
13515 tree decl = cfun->decl;
13516
13517 /* Always use callee-saved register if there are no caller-saved
13518 registers. */
13519 if (TARGET_64BIT)
13520 {
13521 /* Use R13 for a nested function or a function that needs a static
13522 chain. Since a function with a tail call may use any caller-saved
13523 register in the epilogue, DRAP must not use a caller-saved
13524 register in that case. */
13525 if (DECL_STATIC_CHAIN (decl)
13526 || cfun->machine->no_caller_saved_registers
13527 || crtl->tail_call_emit)
13528 return R13_REG;
13529
13530 return R10_REG;
13531 }
13532 else
13533 {
13534 /* Use DI for a nested function or a function that needs a static
13535 chain. Since a function with a tail call may use any caller-saved
13536 register in the epilogue, DRAP must not use a caller-saved
13537 register in that case. */
13538 if (DECL_STATIC_CHAIN (decl)
13539 || cfun->machine->no_caller_saved_registers
13540 || crtl->tail_call_emit)
13541 return DI_REG;
13542
13543 /* Reuse static chain register if it isn't used for parameter
13544 passing. */
13545 if (ix86_function_regparm (TREE_TYPE (decl), decl) <= 2)
13546 {
13547 unsigned int ccvt = ix86_get_callcvt (TREE_TYPE (decl));
13548 if ((ccvt & (IX86_CALLCVT_FASTCALL | IX86_CALLCVT_THISCALL)) == 0)
13549 return CX_REG;
13550 }
13551 return DI_REG;
13552 }
13553 }
13554
13555 /* Handle a "force_align_arg_pointer" attribute. */
13556
13557 static tree
13558 ix86_handle_force_align_arg_pointer_attribute (tree *node, tree name,
13559 tree, int, bool *no_add_attrs)
13560 {
13561 if (TREE_CODE (*node) != FUNCTION_TYPE
13562 && TREE_CODE (*node) != METHOD_TYPE
13563 && TREE_CODE (*node) != FIELD_DECL
13564 && TREE_CODE (*node) != TYPE_DECL)
13565 {
13566 warning (OPT_Wattributes, "%qE attribute only applies to functions",
13567 name);
13568 *no_add_attrs = true;
13569 }
13570
13571 return NULL_TREE;
13572 }
13573
13574 /* Return minimum incoming stack alignment. */
13575
13576 static unsigned int
13577 ix86_minimum_incoming_stack_boundary (bool sibcall)
13578 {
13579 unsigned int incoming_stack_boundary;
13580
13581 /* Stack of interrupt handler is aligned to 128 bits in 64bit
13582 mode. */
13583 if (cfun->machine->func_type != TYPE_NORMAL)
13584 incoming_stack_boundary = TARGET_64BIT ? 128 : MIN_STACK_BOUNDARY;
13585 /* Prefer the one specified at command line. */
13586 else if (ix86_user_incoming_stack_boundary)
13587 incoming_stack_boundary = ix86_user_incoming_stack_boundary;
13588 /* In 32-bit mode, use MIN_STACK_BOUNDARY for the incoming stack
13589 boundary if -mstackrealign is used, this isn't a sibcall check,
13590 and the estimated stack alignment is 128 bits. */
13591 else if (!sibcall
13592 && ix86_force_align_arg_pointer
13593 && crtl->stack_alignment_estimated == 128)
13594 incoming_stack_boundary = MIN_STACK_BOUNDARY;
13595 else
13596 incoming_stack_boundary = ix86_default_incoming_stack_boundary;
13597
13598 /* Incoming stack alignment can be changed on individual functions
13599 via force_align_arg_pointer attribute. We use the smallest
13600 incoming stack boundary. */
13601 if (incoming_stack_boundary > MIN_STACK_BOUNDARY
13602 && lookup_attribute (ix86_force_align_arg_pointer_string,
13603 TYPE_ATTRIBUTES (TREE_TYPE (current_function_decl))))
13604 incoming_stack_boundary = MIN_STACK_BOUNDARY;
13605
13606 /* The incoming stack frame has to be aligned at least at
13607 parm_stack_boundary. */
13608 if (incoming_stack_boundary < crtl->parm_stack_boundary)
13609 incoming_stack_boundary = crtl->parm_stack_boundary;
13610
13611 /* The stack at the entry of main is aligned by the runtime. We use
13612 the smallest incoming stack boundary. */
13613 if (incoming_stack_boundary > MAIN_STACK_BOUNDARY
13614 && DECL_NAME (current_function_decl)
13615 && MAIN_NAME_P (DECL_NAME (current_function_decl))
13616 && DECL_FILE_SCOPE_P (current_function_decl))
13617 incoming_stack_boundary = MAIN_STACK_BOUNDARY;
13618
13619 return incoming_stack_boundary;
13620 }
13621
13622 /* Update incoming stack boundary and estimated stack alignment. */
13623
13624 static void
13625 ix86_update_stack_boundary (void)
13626 {
13627 ix86_incoming_stack_boundary
13628 = ix86_minimum_incoming_stack_boundary (false);
13629
13630 /* x86_64 varargs need 16-byte stack alignment for the register save
13631 area. */
13632 if (TARGET_64BIT
13633 && cfun->stdarg
13634 && crtl->stack_alignment_estimated < 128)
13635 crtl->stack_alignment_estimated = 128;
13636
13637 /* __tls_get_addr needs to be called with 16-byte aligned stack. */
13638 if (ix86_tls_descriptor_calls_expanded_in_cfun
13639 && crtl->preferred_stack_boundary < 128)
13640 crtl->preferred_stack_boundary = 128;
13641 }
13642
13643 /* Handle the TARGET_GET_DRAP_RTX hook. Return NULL if no DRAP is
13644 needed or an rtx for DRAP otherwise. */
13645
13646 static rtx
13647 ix86_get_drap_rtx (void)
13648 {
13649 if (ix86_force_drap || !ACCUMULATE_OUTGOING_ARGS)
13650 crtl->need_drap = true;
13651
13652 if (stack_realign_drap)
13653 {
13654 /* Assign DRAP to vDRAP and return vDRAP. */
13655 unsigned int regno = find_drap_reg ();
13656 rtx drap_vreg;
13657 rtx arg_ptr;
13658 rtx_insn *seq, *insn;
13659
13660 arg_ptr = gen_rtx_REG (Pmode, regno);
13661 crtl->drap_reg = arg_ptr;
13662
13663 start_sequence ();
13664 drap_vreg = copy_to_reg (arg_ptr);
13665 seq = get_insns ();
13666 end_sequence ();
13667
13668 insn = emit_insn_before (seq, NEXT_INSN (entry_of_function ()));
13669 if (!optimize)
13670 {
13671 add_reg_note (insn, REG_CFA_SET_VDRAP, drap_vreg);
13672 RTX_FRAME_RELATED_P (insn) = 1;
13673 }
13674 return drap_vreg;
13675 }
13676 else
13677 return NULL;
13678 }
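
/* For illustration (register and pseudo numbers are assumptions, not
   from any particular compilation): when dynamic realignment via DRAP
   is needed, the sequence above amounts to one copy of the chosen hard
   register into a fresh pseudo right after the function entry, e.g.

     (insn (set (reg:DI 87 [ vDRAP ]) (reg:DI r10)))

   with a REG_CFA_SET_VDRAP note attached when not optimizing so that
   dwarf2out can track the virtual DRAP register.  */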
13679
13680 /* Handle the TARGET_INTERNAL_ARG_POINTER hook. */
13681
13682 static rtx
13683 ix86_internal_arg_pointer (void)
13684 {
13685 return virtual_incoming_args_rtx;
13686 }
13687
13688 struct scratch_reg {
13689 rtx reg;
13690 bool saved;
13691 };
13692
13693 /* Return a short-lived scratch register for use on function entry.
13694 In 32-bit mode, it is valid only after the registers are saved
13695 in the prologue. This register must be released by means of
13696 release_scratch_register_on_entry once it is dead. */
13697
13698 static void
13699 get_scratch_register_on_entry (struct scratch_reg *sr)
13700 {
13701 int regno;
13702
13703 sr->saved = false;
13704
13705 if (TARGET_64BIT)
13706 {
13707 /* We always use R11 in 64-bit mode. */
13708 regno = R11_REG;
13709 }
13710 else
13711 {
13712 tree decl = current_function_decl, fntype = TREE_TYPE (decl);
13713 bool fastcall_p
13714 = lookup_attribute ("fastcall", TYPE_ATTRIBUTES (fntype)) != NULL_TREE;
13715 bool thiscall_p
13716 = lookup_attribute ("thiscall", TYPE_ATTRIBUTES (fntype)) != NULL_TREE;
13717 bool static_chain_p = DECL_STATIC_CHAIN (decl);
13718 int regparm = ix86_function_regparm (fntype, decl);
13719 int drap_regno
13720 = crtl->drap_reg ? REGNO (crtl->drap_reg) : INVALID_REGNUM;
13721
13722 /* 'fastcall' sets regparm to 2, uses ecx/edx for arguments and eax
13723 for the static chain register. */
13724 if ((regparm < 1 || (fastcall_p && !static_chain_p))
13725 && drap_regno != AX_REG)
13726 regno = AX_REG;
13727 /* 'thiscall' sets regparm to 1, uses ecx for arguments and edx
13728 for the static chain register. */
13729 else if (thiscall_p && !static_chain_p && drap_regno != AX_REG)
13730 regno = AX_REG;
13731 else if (regparm < 2 && !thiscall_p && drap_regno != DX_REG)
13732 regno = DX_REG;
13733 /* ecx is the static chain register. */
13734 else if (regparm < 3 && !fastcall_p && !thiscall_p
13735 && !static_chain_p
13736 && drap_regno != CX_REG)
13737 regno = CX_REG;
13738 else if (ix86_save_reg (BX_REG, true, false))
13739 regno = BX_REG;
13740 /* esi is the static chain register. */
13741 else if (!(regparm == 3 && static_chain_p)
13742 && ix86_save_reg (SI_REG, true, false))
13743 regno = SI_REG;
13744 else if (ix86_save_reg (DI_REG, true, false))
13745 regno = DI_REG;
13746 else
13747 {
13748 regno = (drap_regno == AX_REG ? DX_REG : AX_REG);
13749 sr->saved = true;
13750 }
13751 }
13752
13753 sr->reg = gen_rtx_REG (Pmode, regno);
13754 if (sr->saved)
13755 {
13756 rtx_insn *insn = emit_insn (gen_push (sr->reg));
13757 RTX_FRAME_RELATED_P (insn) = 1;
13758 }
13759 }
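
/* Examples of the selection above (illustrative): in 64-bit mode the
   scratch is always %r11.  In 32-bit mode, a plain cdecl function with
   regparm 0 and no DRAP gets %eax; if none of the candidates is free,
   %eax (or %edx when %eax is the DRAP register) is pushed around its
   use and sr->saved is set so that release_scratch_register_on_entry
   pops it again.  */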
13760
13761 /* Release a scratch register obtained from the preceding function. */
13762
13763 static void
13764 release_scratch_register_on_entry (struct scratch_reg *sr)
13765 {
13766 if (sr->saved)
13767 {
13768 struct machine_function *m = cfun->machine;
13769 rtx x, insn = emit_insn (gen_pop (sr->reg));
13770
13771 /* The RTX_FRAME_RELATED_P mechanism doesn't know about pop. */
13772 RTX_FRAME_RELATED_P (insn) = 1;
13773 x = gen_rtx_PLUS (Pmode, stack_pointer_rtx, GEN_INT (UNITS_PER_WORD));
13774 x = gen_rtx_SET (stack_pointer_rtx, x);
13775 add_reg_note (insn, REG_FRAME_RELATED_EXPR, x);
13776 m->fs.sp_offset -= UNITS_PER_WORD;
13777 }
13778 }
13779
13780 #define PROBE_INTERVAL (1 << STACK_CHECK_PROBE_INTERVAL_EXP)
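
/* STACK_CHECK_PROBE_INTERVAL_EXP is typically 12 (see defaults.h), so
   PROBE_INTERVAL is usually 0x1000, i.e. one 4 KiB page; the examples
   in the comments below assume that value.  */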
13781
13782 /* Emit code to adjust the stack pointer by SIZE bytes while probing it. */
13783
13784 static void
13785 ix86_adjust_stack_and_probe (const HOST_WIDE_INT size)
13786 {
13787 /* We skip the probe for the first interval + a small dope of 4 words and
13788 probe that many bytes past the specified size to maintain a protection
13789 area at the bottom of the stack. */
13790 const int dope = 4 * UNITS_PER_WORD;
13791 rtx size_rtx = GEN_INT (size), last;
13792
13793 /* See if we have a constant small number of probes to generate. If so,
13794 that's the easy case. The run-time loop is made up of 9 insns in the
13795 generic case while the compile-time loop is made up of 3+2*(n-1) insns
13796 for n # of intervals. */
13797 if (size <= 4 * PROBE_INTERVAL)
13798 {
13799 HOST_WIDE_INT i, adjust;
13800 bool first_probe = true;
13801
13802 /* Adjust SP and probe at PROBE_INTERVAL + N * PROBE_INTERVAL for
13803 values of N from 1 until it exceeds SIZE. If only one probe is
13804 needed, this will not generate any code. Then adjust and probe
13805 to PROBE_INTERVAL + SIZE. */
13806 for (i = PROBE_INTERVAL; i < size; i += PROBE_INTERVAL)
13807 {
13808 if (first_probe)
13809 {
13810 adjust = 2 * PROBE_INTERVAL + dope;
13811 first_probe = false;
13812 }
13813 else
13814 adjust = PROBE_INTERVAL;
13815
13816 emit_insn (gen_rtx_SET (stack_pointer_rtx,
13817 plus_constant (Pmode, stack_pointer_rtx,
13818 -adjust)));
13819 emit_stack_probe (stack_pointer_rtx);
13820 }
13821
13822 if (first_probe)
13823 adjust = size + PROBE_INTERVAL + dope;
13824 else
13825 adjust = size + PROBE_INTERVAL - i;
13826
13827 emit_insn (gen_rtx_SET (stack_pointer_rtx,
13828 plus_constant (Pmode, stack_pointer_rtx,
13829 -adjust)));
13830 emit_stack_probe (stack_pointer_rtx);
13831
13832 /* Adjust back to account for the additional first interval. */
13833 last = emit_insn (gen_rtx_SET (stack_pointer_rtx,
13834 plus_constant (Pmode, stack_pointer_rtx,
13835 PROBE_INTERVAL + dope)));
13836 }
13837
13838 /* Otherwise, do the same as above, but in a loop. Note that we must be
13839 extra careful with variables wrapping around because we might be at
13840 the very top (or the very bottom) of the address space and we have
13841 to be able to handle this case properly; in particular, we use an
13842 equality test for the loop condition. */
13843 else
13844 {
13845 HOST_WIDE_INT rounded_size;
13846 struct scratch_reg sr;
13847
13848 get_scratch_register_on_entry (&sr);
13849
13850
13851 /* Step 1: round SIZE to the previous multiple of the interval. */
13852
13853 rounded_size = ROUND_DOWN (size, PROBE_INTERVAL);
13854
13855
13856 /* Step 2: compute initial and final value of the loop counter. */
13857
13858 /* SP = SP_0 + PROBE_INTERVAL. */
13859 emit_insn (gen_rtx_SET (stack_pointer_rtx,
13860 plus_constant (Pmode, stack_pointer_rtx,
13861 - (PROBE_INTERVAL + dope))));
13862
13863 /* LAST_ADDR = SP_0 + PROBE_INTERVAL + ROUNDED_SIZE. */
13864 if (rounded_size <= (HOST_WIDE_INT_1 << 31))
13865 emit_insn (gen_rtx_SET (sr.reg,
13866 plus_constant (Pmode, stack_pointer_rtx,
13867 -rounded_size)));
13868 else
13869 {
13870 emit_move_insn (sr.reg, GEN_INT (-rounded_size));
13871 emit_insn (gen_rtx_SET (sr.reg,
13872 gen_rtx_PLUS (Pmode, sr.reg,
13873 stack_pointer_rtx)));
13874 }
13875
13876
13877 /* Step 3: the loop
13878
13879 do
13880 {
13881 SP = SP + PROBE_INTERVAL
13882 probe at SP
13883 }
13884 while (SP != LAST_ADDR)
13885
13886 adjusts SP and probes to PROBE_INTERVAL + N * PROBE_INTERVAL for
13887 values of N from 1 until it is equal to ROUNDED_SIZE. */
13888
13889 emit_insn (ix86_gen_adjust_stack_and_probe (sr.reg, sr.reg, size_rtx));
13890
13891
13892 /* Step 4: adjust SP and probe at PROBE_INTERVAL + SIZE if we cannot
13893 assert at compile-time that SIZE is equal to ROUNDED_SIZE. */
13894
13895 if (size != rounded_size)
13896 {
13897 emit_insn (gen_rtx_SET (stack_pointer_rtx,
13898 plus_constant (Pmode, stack_pointer_rtx,
13899 rounded_size - size)));
13900 emit_stack_probe (stack_pointer_rtx);
13901 }
13902
13903 /* Adjust back to account for the additional first interval. */
13904 last = emit_insn (gen_rtx_SET (stack_pointer_rtx,
13905 plus_constant (Pmode, stack_pointer_rtx,
13906 PROBE_INTERVAL + dope)));
13907
13908 release_scratch_register_on_entry (&sr);
13909 }
13910
13911 /* Even if the stack pointer isn't the CFA register, we need to correctly
13912 describe the adjustments made to it, in particular differentiate the
13913 frame-related ones from the frame-unrelated ones. */
13914 if (size > 0)
13915 {
13916 rtx expr = gen_rtx_SEQUENCE (VOIDmode, rtvec_alloc (2));
13917 XVECEXP (expr, 0, 0)
13918 = gen_rtx_SET (stack_pointer_rtx,
13919 plus_constant (Pmode, stack_pointer_rtx, -size));
13920 XVECEXP (expr, 0, 1)
13921 = gen_rtx_SET (stack_pointer_rtx,
13922 plus_constant (Pmode, stack_pointer_rtx,
13923 PROBE_INTERVAL + dope + size));
13924 add_reg_note (last, REG_FRAME_RELATED_EXPR, expr);
13925 RTX_FRAME_RELATED_P (last) = 1;
13926
13927 cfun->machine->fs.sp_offset += size;
13928 }
13929
13930 /* Make sure nothing is scheduled before we are done. */
13931 emit_insn (gen_blockage ());
13932 }
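
/* A worked example of the constant case above, assuming 64-bit mode
   (dope = 32 bytes), PROBE_INTERVAL = 0x1000 and size = 0x3000; the
   emitted sequence is roughly:

     sub  $0x2020, %rsp     # 2*PROBE_INTERVAL + dope
     or   $0, (%rsp)        # probe
     sub  $0x1000, %rsp
     or   $0, (%rsp)
     sub  $0x1000, %rsp     # size + PROBE_INTERVAL - i
     or   $0, (%rsp)
     add  $0x1020, %rsp     # give back PROBE_INTERVAL + dope

   for a net adjustment of -0x3000.  The REG_FRAME_RELATED_EXPR note
   attached to the final add separates the frame-related -0x3000 from
   the temporary over-allocation.  */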
13933
13934 /* Adjust the stack pointer up to REG while probing it. */
13935
13936 const char *
13937 output_adjust_stack_and_probe (rtx reg)
13938 {
13939 static int labelno = 0;
13940 char loop_lab[32];
13941 rtx xops[2];
13942
13943 ASM_GENERATE_INTERNAL_LABEL (loop_lab, "LPSRL", labelno++);
13944
13945 /* Loop. */
13946 ASM_OUTPUT_INTERNAL_LABEL (asm_out_file, loop_lab);
13947
13948 /* SP = SP + PROBE_INTERVAL. */
13949 xops[0] = stack_pointer_rtx;
13950 xops[1] = GEN_INT (PROBE_INTERVAL);
13951 output_asm_insn ("sub%z0\t{%1, %0|%0, %1}", xops);
13952
13953 /* Probe at SP. */
13954 xops[1] = const0_rtx;
13955 output_asm_insn ("or%z0\t{%1, (%0)|DWORD PTR [%0], %1}", xops);
13956
13957 /* Test if SP == LAST_ADDR. */
13958 xops[0] = stack_pointer_rtx;
13959 xops[1] = reg;
13960 output_asm_insn ("cmp%z0\t{%1, %0|%0, %1}", xops);
13961
13962 /* Branch. */
13963 fputs ("\tjne\t", asm_out_file);
13964 assemble_name_raw (asm_out_file, loop_lab);
13965 fputc ('\n', asm_out_file);
13966
13967 return "";
13968 }
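
/* On 64-bit targets, with PROBE_INTERVAL = 0x1000 and the scratch
   register in %r11 (an assumed choice), the loop printed above looks
   roughly like:

   .LPSRL0:
        subq   $0x1000, %rsp
        orq    $0, (%rsp)
        cmpq   %r11, %rsp
        jne    .LPSRL0
*/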
13969
13970 /* Emit code to probe a range of stack addresses from FIRST to FIRST+SIZE,
13971 inclusive. These are offsets from the current stack pointer. */
13972
13973 static void
13974 ix86_emit_probe_stack_range (HOST_WIDE_INT first, HOST_WIDE_INT size)
13975 {
13976 /* See if we have a constant small number of probes to generate. If so,
13977 that's the easy case. The run-time loop is made up of 6 insns in the
13978 generic case while the compile-time loop is made up of n insns for n #
13979 of intervals. */
13980 if (size <= 6 * PROBE_INTERVAL)
13981 {
13982 HOST_WIDE_INT i;
13983
13984 /* Probe at FIRST + N * PROBE_INTERVAL for values of N from 1 until
13985 it exceeds SIZE. If only one probe is needed, this will not
13986 generate any code. Then probe at FIRST + SIZE. */
13987 for (i = PROBE_INTERVAL; i < size; i += PROBE_INTERVAL)
13988 emit_stack_probe (plus_constant (Pmode, stack_pointer_rtx,
13989 -(first + i)));
13990
13991 emit_stack_probe (plus_constant (Pmode, stack_pointer_rtx,
13992 -(first + size)));
13993 }
13994
13995 /* Otherwise, do the same as above, but in a loop. Note that we must be
13996 extra careful with variables wrapping around because we might be at
13997 the very top (or the very bottom) of the address space and we have
13998 to be able to handle this case properly; in particular, we use an
13999 equality test for the loop condition. */
14000 else
14001 {
14002 HOST_WIDE_INT rounded_size, last;
14003 struct scratch_reg sr;
14004
14005 get_scratch_register_on_entry (&sr);
14006
14007
14008 /* Step 1: round SIZE to the previous multiple of the interval. */
14009
14010 rounded_size = ROUND_DOWN (size, PROBE_INTERVAL);
14011
14012
14013 /* Step 2: compute initial and final value of the loop counter. */
14014
14015 /* TEST_OFFSET = FIRST. */
14016 emit_move_insn (sr.reg, GEN_INT (-first));
14017
14018 /* LAST_OFFSET = FIRST + ROUNDED_SIZE. */
14019 last = first + rounded_size;
14020
14021
14022 /* Step 3: the loop
14023
14024 do
14025 {
14026 TEST_ADDR = TEST_ADDR + PROBE_INTERVAL
14027 probe at TEST_ADDR
14028 }
14029 while (TEST_ADDR != LAST_ADDR)
14030
14031 probes at FIRST + N * PROBE_INTERVAL for values of N from 1
14032 until it is equal to ROUNDED_SIZE. */
14033
14034 emit_insn (ix86_gen_probe_stack_range (sr.reg, sr.reg, GEN_INT (-last)));
14035
14036
14037 /* Step 4: probe at FIRST + SIZE if we cannot assert at compile-time
14038 that SIZE is equal to ROUNDED_SIZE. */
14039
14040 if (size != rounded_size)
14041 emit_stack_probe (plus_constant (Pmode,
14042 gen_rtx_PLUS (Pmode,
14043 stack_pointer_rtx,
14044 sr.reg),
14045 rounded_size - size));
14046
14047 release_scratch_register_on_entry (&sr);
14048 }
14049
14050 /* Make sure nothing is scheduled before we are done. */
14051 emit_insn (gen_blockage ());
14052 }
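
/* A worked example of the constant case above, with PROBE_INTERVAL =
   0x1000, FIRST = 0x2000 and SIZE = 0x2800 (illustrative values): the
   probes land at sp - 0x3000, sp - 0x4000 and finally sp - 0x4800,
   i.e. FIRST + SIZE below the current stack pointer.  */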
14053
14054 /* Probe a range of stack addresses from REG to END, inclusive. These are
14055 offsets from the current stack pointer. */
14056
14057 const char *
14058 output_probe_stack_range (rtx reg, rtx end)
14059 {
14060 static int labelno = 0;
14061 char loop_lab[32];
14062 rtx xops[3];
14063
14064 ASM_GENERATE_INTERNAL_LABEL (loop_lab, "LPSRL", labelno++);
14065
14066 /* Loop. */
14067 ASM_OUTPUT_INTERNAL_LABEL (asm_out_file, loop_lab);
14068
14069 /* TEST_ADDR = TEST_ADDR + PROBE_INTERVAL. */
14070 xops[0] = reg;
14071 xops[1] = GEN_INT (PROBE_INTERVAL);
14072 output_asm_insn ("sub%z0\t{%1, %0|%0, %1}", xops);
14073
14074 /* Probe at TEST_ADDR. */
14075 xops[0] = stack_pointer_rtx;
14076 xops[1] = reg;
14077 xops[2] = const0_rtx;
14078 output_asm_insn ("or%z0\t{%2, (%0,%1)|DWORD PTR [%0+%1], %2}", xops);
14079
14080 /* Test if TEST_ADDR == LAST_ADDR. */
14081 xops[0] = reg;
14082 xops[1] = end;
14083 output_asm_insn ("cmp%z0\t{%1, %0|%0, %1}", xops);
14084
14085 /* Branch. */
14086 fputs ("\tjne\t", asm_out_file);
14087 assemble_name_raw (asm_out_file, loop_lab);
14088 fputc ('\n', asm_out_file);
14089
14090 return "";
14091 }
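
/* With PROBE_INTERVAL = 0x1000 and the probe offset kept in %r11 (an
   assumed choice), the loop printed above looks roughly like, in 64-bit
   AT&T syntax:

   .LPSRL1:
        subq   $0x1000, %r11
        orq    $0, (%rsp,%r11)
        cmpq   %rax, %r11
        jne    .LPSRL1

   where %rax stands for whatever operand END happens to be (it may also
   be an immediate).  */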
14092
14093 /* Finalize stack_realign_needed flag, which will guide prologue/epilogue
14094 to be generated in correct form. */
14095 static void
14096 ix86_finalize_stack_realign_flags (void)
14097 {
14098 /* Check if stack realignment is really needed after reload, and
14099 store the result in cfun. */
14100 unsigned int incoming_stack_boundary
14101 = (crtl->parm_stack_boundary > ix86_incoming_stack_boundary
14102 ? crtl->parm_stack_boundary : ix86_incoming_stack_boundary);
14103 unsigned int stack_realign
14104 = (incoming_stack_boundary
14105 < (crtl->is_leaf && !ix86_current_function_calls_tls_descriptor
14106 ? crtl->max_used_stack_slot_alignment
14107 : crtl->stack_alignment_needed));
14108 bool recompute_frame_layout_p = false;
14109
14110 if (crtl->stack_realign_finalized)
14111 {
14112 /* After stack_realign_needed is finalized, we can no longer
14113 change it. */
14114 gcc_assert (crtl->stack_realign_needed == stack_realign);
14115 return;
14116 }
14117
14118 /* If the only reason for frame_pointer_needed is that we conservatively
14119 assumed stack realignment might be needed, but in the end nothing that
14120 needed the stack alignment had been spilled, clear frame_pointer_needed
14121 and say we don't need stack realignment. */
14122 if (stack_realign
14123 && frame_pointer_needed
14124 && crtl->is_leaf
14125 && flag_omit_frame_pointer
14126 && crtl->sp_is_unchanging
14127 && !ix86_current_function_calls_tls_descriptor
14128 && !crtl->accesses_prior_frames
14129 && !cfun->calls_alloca
14130 && !crtl->calls_eh_return
14131 /* See ira_setup_eliminable_regset for the rationale. */
14132 && !(STACK_CHECK_MOVING_SP
14133 && flag_stack_check
14134 && flag_exceptions
14135 && cfun->can_throw_non_call_exceptions)
14136 && !ix86_frame_pointer_required ()
14137 && get_frame_size () == 0
14138 && ix86_nsaved_sseregs () == 0
14139 && ix86_varargs_gpr_size + ix86_varargs_fpr_size == 0)
14140 {
14141 HARD_REG_SET set_up_by_prologue, prologue_used;
14142 basic_block bb;
14143
14144 CLEAR_HARD_REG_SET (prologue_used);
14145 CLEAR_HARD_REG_SET (set_up_by_prologue);
14146 add_to_hard_reg_set (&set_up_by_prologue, Pmode, STACK_POINTER_REGNUM);
14147 add_to_hard_reg_set (&set_up_by_prologue, Pmode, ARG_POINTER_REGNUM);
14148 add_to_hard_reg_set (&set_up_by_prologue, Pmode,
14149 HARD_FRAME_POINTER_REGNUM);
14150 FOR_EACH_BB_FN (bb, cfun)
14151 {
14152 rtx_insn *insn;
14153 FOR_BB_INSNS (bb, insn)
14154 if (NONDEBUG_INSN_P (insn)
14155 && requires_stack_frame_p (insn, prologue_used,
14156 set_up_by_prologue))
14157 {
14158 if (crtl->stack_realign_needed != stack_realign)
14159 recompute_frame_layout_p = true;
14160 crtl->stack_realign_needed = stack_realign;
14161 crtl->stack_realign_finalized = true;
14162 if (recompute_frame_layout_p)
14163 ix86_compute_frame_layout ();
14164 return;
14165 }
14166 }
14167
14168 /* If drap has been set, but it actually isn't live at the start
14169 of the function, there is no reason to set it up. */
14170 if (crtl->drap_reg)
14171 {
14172 basic_block bb = ENTRY_BLOCK_PTR_FOR_FN (cfun)->next_bb;
14173 if (! REGNO_REG_SET_P (DF_LR_IN (bb), REGNO (crtl->drap_reg)))
14174 {
14175 crtl->drap_reg = NULL_RTX;
14176 crtl->need_drap = false;
14177 }
14178 }
14179 else
14180 cfun->machine->no_drap_save_restore = true;
14181
14182 frame_pointer_needed = false;
14183 stack_realign = false;
14184 crtl->max_used_stack_slot_alignment = incoming_stack_boundary;
14185 crtl->stack_alignment_needed = incoming_stack_boundary;
14186 crtl->stack_alignment_estimated = incoming_stack_boundary;
14187 if (crtl->preferred_stack_boundary > incoming_stack_boundary)
14188 crtl->preferred_stack_boundary = incoming_stack_boundary;
14189 df_finish_pass (true);
14190 df_scan_alloc (NULL);
14191 df_scan_blocks ();
14192 df_compute_regs_ever_live (true);
14193 df_analyze ();
14194 recompute_frame_layout_p = true;
14195 }
14196
14197 if (crtl->stack_realign_needed != stack_realign)
14198 recompute_frame_layout_p = true;
14199 crtl->stack_realign_needed = stack_realign;
14200 crtl->stack_realign_finalized = true;
14201 if (recompute_frame_layout_p)
14202 ix86_compute_frame_layout ();
14203 }
14204
14205 /* Delete SET_GOT right after entry block if it is allocated to reg. */
14206
14207 static void
14208 ix86_elim_entry_set_got (rtx reg)
14209 {
14210 basic_block bb = ENTRY_BLOCK_PTR_FOR_FN (cfun)->next_bb;
14211 rtx_insn *c_insn = BB_HEAD (bb);
14212 if (!NONDEBUG_INSN_P (c_insn))
14213 c_insn = next_nonnote_nondebug_insn (c_insn);
14214 if (c_insn && NONJUMP_INSN_P (c_insn))
14215 {
14216 rtx pat = PATTERN (c_insn);
14217 if (GET_CODE (pat) == PARALLEL)
14218 {
14219 rtx vec = XVECEXP (pat, 0, 0);
14220 if (GET_CODE (vec) == SET
14221 && XINT (XEXP (vec, 1), 1) == UNSPEC_SET_GOT
14222 && REGNO (XEXP (vec, 0)) == REGNO (reg))
14223 delete_insn (c_insn);
14224 }
14225 }
14226 }
14227
14228 static rtx
14229 gen_frame_set (rtx reg, rtx frame_reg, int offset, bool store)
14230 {
14231 rtx addr, mem;
14232
14233 if (offset)
14234 addr = gen_rtx_PLUS (Pmode, frame_reg, GEN_INT (offset));
14235 mem = gen_frame_mem (GET_MODE (reg), offset ? addr : frame_reg);
14236 return gen_rtx_SET (store ? mem : reg, store ? reg : mem);
14237 }
14238
14239 static inline rtx
14240 gen_frame_load (rtx reg, rtx frame_reg, int offset)
14241 {
14242 return gen_frame_set (reg, frame_reg, offset, false);
14243 }
14244
14245 static inline rtx
14246 gen_frame_store (rtx reg, rtx frame_reg, int offset)
14247 {
14248 return gen_frame_set (reg, frame_reg, offset, true);
14249 }
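
/* For illustration, in 64-bit mode gen_frame_store (reg, rax, -16) for a
   DImode REG builds:

     (set (mem:DI (plus:DI (reg:DI ax) (const_int -16))) (reg:DI ...))

   with the MEM marked as a frame access by gen_frame_mem, while
   gen_frame_load simply swaps the two operands.  */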
14250
14251 static void
14252 ix86_emit_outlined_ms2sysv_save (const struct ix86_frame &frame)
14253 {
14254 struct machine_function *m = cfun->machine;
14255 const unsigned ncregs = NUM_X86_64_MS_CLOBBERED_REGS
14256 + m->call_ms2sysv_extra_regs;
14257 rtvec v = rtvec_alloc (ncregs + 1);
14258 unsigned int align, i, vi = 0;
14259 rtx_insn *insn;
14260 rtx sym, addr;
14261 rtx rax = gen_rtx_REG (word_mode, AX_REG);
14262 const struct xlogue_layout &xlogue = xlogue_layout::get_instance ();
14263 HOST_WIDE_INT rax_offset = xlogue.get_stub_ptr_offset () + m->fs.sp_offset;
14264 HOST_WIDE_INT stack_alloc_size = frame.stack_pointer_offset - m->fs.sp_offset;
14265 HOST_WIDE_INT stack_align_off_in = xlogue.get_stack_align_off_in ();
14266
14267 /* Verify that the incoming stack 16-byte alignment offset matches the
14268 layout we're using. */
14269 gcc_assert (stack_align_off_in == (m->fs.sp_offset & UNITS_PER_WORD));
14270
14271 /* Get the stub symbol. */
14272 sym = xlogue.get_stub_rtx (frame_pointer_needed ? XLOGUE_STUB_SAVE_HFP
14273 : XLOGUE_STUB_SAVE);
14274 RTVEC_ELT (v, vi++) = gen_rtx_USE (VOIDmode, sym);
14275
14276 /* Set up RAX as the stub's base pointer. */
14277 align = GET_MODE_ALIGNMENT (V4SFmode);
14278 addr = choose_baseaddr (rax_offset, &align);
14279 gcc_assert (align >= GET_MODE_ALIGNMENT (V4SFmode));
14280 insn = emit_insn (gen_rtx_SET (rax, addr));
14281
14282 gcc_assert (stack_alloc_size >= xlogue.get_stack_space_used ());
14283 pro_epilogue_adjust_stack (stack_pointer_rtx, stack_pointer_rtx,
14284 GEN_INT (-stack_alloc_size), -1,
14285 m->fs.cfa_reg == stack_pointer_rtx);
14286 for (i = 0; i < ncregs; ++i)
14287 {
14288 const xlogue_layout::reginfo &r = xlogue.get_reginfo (i);
14289 rtx reg = gen_rtx_REG ((SSE_REGNO_P (r.regno) ? V4SFmode : word_mode),
14290 r.regno);
14291 RTVEC_ELT (v, vi++) = gen_frame_store (reg, rax, -r.offset);
14292 }
14293
14294 gcc_assert (vi == (unsigned)GET_NUM_ELEM (v));
14295
14296 insn = emit_insn (gen_rtx_PARALLEL (VOIDmode, v));
14297 RTX_FRAME_RELATED_P (insn) = true;
14298 }
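
/* The net effect of the function above, sketched for illustration: %rax
   is pointed at the stub's base address inside the register save area,
   the stack is allocated in one adjustment, and a single insn of the
   form

     (parallel [(use (symbol_ref <save stub>))
                (set (mem:V4SF (plus (reg:DI ax) (const_int -N)))
                     (reg:V4SF xmmK))
                ...])

   is emitted, one frame store per clobbered MS-ABI register, so that
   the unwinder sees every save performed by the out-of-line stub.  */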
14299
14300 /* Expand the prologue into a bunch of separate insns. */
14301
14302 void
14303 ix86_expand_prologue (void)
14304 {
14305 struct machine_function *m = cfun->machine;
14306 rtx insn, t;
14307 struct ix86_frame frame;
14308 HOST_WIDE_INT allocate;
14309 bool int_registers_saved;
14310 bool sse_registers_saved;
14311 rtx static_chain = NULL_RTX;
14312
14313 ix86_finalize_stack_realign_flags ();
14314
14315 /* DRAP should not coexist with stack_realign_fp */
14316 gcc_assert (!(crtl->drap_reg && stack_realign_fp));
14317
14318 memset (&m->fs, 0, sizeof (m->fs));
14319
14320 /* Initialize CFA state for before the prologue. */
14321 m->fs.cfa_reg = stack_pointer_rtx;
14322 m->fs.cfa_offset = INCOMING_FRAME_SP_OFFSET;
14323
14324 /* Track SP offset to the CFA. We continue tracking this after we've
14325 swapped the CFA register away from SP. In the case of re-alignment
14326 this is fudged; we're interested in offsets within the local frame. */
14327 m->fs.sp_offset = INCOMING_FRAME_SP_OFFSET;
14328 m->fs.sp_valid = true;
14329 m->fs.sp_realigned = false;
14330
14331 frame = m->frame;
14332
14333 if (!TARGET_64BIT && ix86_function_ms_hook_prologue (current_function_decl))
14334 {
14335 /* We should have already generated an error for any use of
14336 ms_hook on a nested function. */
14337 gcc_checking_assert (!ix86_static_chain_on_stack);
14338
14339 /* Check if profiling is active and we shall use the profiling-before-
14340 prologue variant. If so, sorry. */
14341 if (crtl->profile && flag_fentry != 0)
14342 sorry ("ms_hook_prologue attribute isn%'t compatible "
14343 "with -mfentry for 32-bit");
14344
14345 /* In ix86_asm_output_function_label we emitted:
14346 8b ff movl.s %edi,%edi
14347 55 push %ebp
14348 8b ec movl.s %esp,%ebp
14349
14350 This matches the hookable function prologue in Win32 API
14351 functions in Microsoft Windows XP Service Pack 2 and newer.
14352 Wine uses this to enable Windows apps to hook the Win32 API
14353 functions provided by Wine.
14354
14355 What that means is that we've already set up the frame pointer. */
14356
14357 if (frame_pointer_needed
14358 && !(crtl->drap_reg && crtl->stack_realign_needed))
14359 {
14360 rtx push, mov;
14361
14362 /* We've decided to use the frame pointer already set up.
14363 Describe this to the unwinder by pretending that both
14364 push and mov insns happen right here.
14365
14366 Putting the unwind info here at the end of the ms_hook
14367 is done so that we can make absolutely certain we get
14368 the required byte sequence at the start of the function,
14369 rather than relying on an assembler that can produce
14370 the exact encoding required.
14371
14372 However it does mean (in the unpatched case) that we have
14373 a 1 insn window where the asynchronous unwind info is
14374 incorrect. However, if we placed the unwind info at
14375 its correct location we would have incorrect unwind info
14376 in the patched case. Which is probably all moot since
14377 I don't expect Wine generates dwarf2 unwind info for the
14378 system libraries that use this feature. */
14379
14380 insn = emit_insn (gen_blockage ());
14381
14382 push = gen_push (hard_frame_pointer_rtx);
14383 mov = gen_rtx_SET (hard_frame_pointer_rtx,
14384 stack_pointer_rtx);
14385 RTX_FRAME_RELATED_P (push) = 1;
14386 RTX_FRAME_RELATED_P (mov) = 1;
14387
14388 RTX_FRAME_RELATED_P (insn) = 1;
14389 add_reg_note (insn, REG_FRAME_RELATED_EXPR,
14390 gen_rtx_PARALLEL (VOIDmode, gen_rtvec (2, push, mov)));
14391
14392 /* Note that gen_push incremented m->fs.cfa_offset, even
14393 though we didn't emit the push insn here. */
14394 m->fs.cfa_reg = hard_frame_pointer_rtx;
14395 m->fs.fp_offset = m->fs.cfa_offset;
14396 m->fs.fp_valid = true;
14397 }
14398 else
14399 {
14400 /* The frame pointer is not needed so pop %ebp again.
14401 This leaves us with a pristine state. */
14402 emit_insn (gen_pop (hard_frame_pointer_rtx));
14403 }
14404 }
14405
14406 /* The first insn of a function that accepts its static chain on the
14407 stack is to push the register that would be filled in by a direct
14408 call. This insn will be skipped by the trampoline. */
14409 else if (ix86_static_chain_on_stack)
14410 {
14411 static_chain = ix86_static_chain (cfun->decl, false);
14412 insn = emit_insn (gen_push (static_chain));
14413 emit_insn (gen_blockage ());
14414
14415 /* We don't want to interpret this push insn as a register save,
14416 only as a stack adjustment. The real copy of the register as
14417 a save will be done later, if needed. */
14418 t = plus_constant (Pmode, stack_pointer_rtx, -UNITS_PER_WORD);
14419 t = gen_rtx_SET (stack_pointer_rtx, t);
14420 add_reg_note (insn, REG_CFA_ADJUST_CFA, t);
14421 RTX_FRAME_RELATED_P (insn) = 1;
14422 }
14423
14424 /* Emit prologue code to adjust stack alignment and set up DRAP, in case
14425 DRAP is needed and stack realignment is really needed after reload. */
14426 if (stack_realign_drap)
14427 {
14428 int align_bytes = crtl->stack_alignment_needed / BITS_PER_UNIT;
14429
14430 /* Can't use DRAP in interrupt function. */
14431 if (cfun->machine->func_type != TYPE_NORMAL)
14432 sorry ("Dynamic Realign Argument Pointer (DRAP) not supported "
14433 "in interrupt service routine. This may be worked "
14434 "around by avoiding functions with aggregate return.");
14435
14436 /* Only need to push parameter pointer reg if it is caller saved. */
14437 if (!call_used_regs[REGNO (crtl->drap_reg)])
14438 {
14439 /* Push arg pointer reg */
14440 insn = emit_insn (gen_push (crtl->drap_reg));
14441 RTX_FRAME_RELATED_P (insn) = 1;
14442 }
14443
14444 /* Grab the argument pointer. */
14445 t = plus_constant (Pmode, stack_pointer_rtx, m->fs.sp_offset);
14446 insn = emit_insn (gen_rtx_SET (crtl->drap_reg, t));
14447 RTX_FRAME_RELATED_P (insn) = 1;
14448 m->fs.cfa_reg = crtl->drap_reg;
14449 m->fs.cfa_offset = 0;
14450
14451 /* Align the stack. */
14452 insn = emit_insn (ix86_gen_andsp (stack_pointer_rtx,
14453 stack_pointer_rtx,
14454 GEN_INT (-align_bytes)));
14455 RTX_FRAME_RELATED_P (insn) = 1;
14456
14457 /* Replicate the return address on the stack so that return
14458 address can be reached via (argp - 1) slot. This is needed
14459 to implement macro RETURN_ADDR_RTX and intrinsic function
14460 expand_builtin_return_addr etc. */
14461 t = plus_constant (Pmode, crtl->drap_reg, -UNITS_PER_WORD);
14462 t = gen_frame_mem (word_mode, t);
14463 insn = emit_insn (gen_push (t));
14464 RTX_FRAME_RELATED_P (insn) = 1;
14465
14466 /* For the purposes of frame and register save area addressing,
14467 we've started over with a new frame. */
14468 m->fs.sp_offset = INCOMING_FRAME_SP_OFFSET;
14469 m->fs.realigned = true;
14470
14471 if (static_chain)
14472 {
14473 /* Replicate static chain on the stack so that static chain
14474 can be reached via (argp - 2) slot. This is needed for
14475 nested function with stack realignment. */
14476 insn = emit_insn (gen_push (static_chain));
14477 RTX_FRAME_RELATED_P (insn) = 1;
14478 }
14479 }
14480
14481 int_registers_saved = (frame.nregs == 0);
14482 sse_registers_saved = (frame.nsseregs == 0);
14483
14484 if (frame_pointer_needed && !m->fs.fp_valid)
14485 {
14486 /* Note: AT&T enter does NOT have reversed args. Enter is probably
14487 slower on all targets. Also sdb doesn't like it. */
14488 insn = emit_insn (gen_push (hard_frame_pointer_rtx));
14489 RTX_FRAME_RELATED_P (insn) = 1;
14490
14491 /* Push registers now, before setting the frame pointer
14492 on SEH target. */
14493 if (!int_registers_saved
14494 && TARGET_SEH
14495 && !frame.save_regs_using_mov)
14496 {
14497 ix86_emit_save_regs ();
14498 int_registers_saved = true;
14499 gcc_assert (m->fs.sp_offset == frame.reg_save_offset);
14500 }
14501
14502 if (m->fs.sp_offset == frame.hard_frame_pointer_offset)
14503 {
14504 insn = emit_move_insn (hard_frame_pointer_rtx, stack_pointer_rtx);
14505 RTX_FRAME_RELATED_P (insn) = 1;
14506
14507 if (m->fs.cfa_reg == stack_pointer_rtx)
14508 m->fs.cfa_reg = hard_frame_pointer_rtx;
14509 m->fs.fp_offset = m->fs.sp_offset;
14510 m->fs.fp_valid = true;
14511 }
14512 }
14513
14514 if (!int_registers_saved)
14515 {
14516 /* If saving registers via PUSH, do so now. */
14517 if (!frame.save_regs_using_mov)
14518 {
14519 ix86_emit_save_regs ();
14520 int_registers_saved = true;
14521 gcc_assert (m->fs.sp_offset == frame.reg_save_offset);
14522 }
14523
14524 /* When using red zone we may start register saving before allocating
14525 the stack frame saving one cycle of the prologue. However, avoid
14526 doing this if we have to probe the stack; at least on x86_64 the
14527 stack probe can turn into a call that clobbers a red zone location. */
14528 else if (ix86_using_red_zone ()
14529 && (! TARGET_STACK_PROBE
14530 || frame.stack_pointer_offset < CHECK_STACK_LIMIT))
14531 {
14532 ix86_emit_save_regs_using_mov (frame.reg_save_offset);
14533 int_registers_saved = true;
14534 }
14535 }
14536
14537 if (stack_realign_fp)
14538 {
14539 int align_bytes = crtl->stack_alignment_needed / BITS_PER_UNIT;
14540 gcc_assert (align_bytes > MIN_STACK_BOUNDARY / BITS_PER_UNIT);
14541
14542 /* The computation of the size of the re-aligned stack frame means
14543 that we must allocate the size of the register save area before
14544 performing the actual alignment. Otherwise we cannot guarantee
14545 that there's enough storage above the realignment point. */
14546 allocate = frame.stack_realign_allocate_offset - m->fs.sp_offset;
14547 if (allocate && !m->call_ms2sysv)
14548 pro_epilogue_adjust_stack (stack_pointer_rtx, stack_pointer_rtx,
14549 GEN_INT (-allocate), -1, false);
14550
14551 /* Align the stack. */
14552 insn = emit_insn (ix86_gen_andsp (stack_pointer_rtx,
14553 stack_pointer_rtx,
14554 GEN_INT (-align_bytes)));
14555 /* For the purposes of register save area addressing, the stack
14556 pointer can no longer be used to access anything in the frame
14557 below m->fs.sp_realigned_offset and the frame pointer cannot be
14558 used for anything at or above. */
14559 m->fs.sp_offset = ROUND_UP (m->fs.sp_offset, align_bytes);
14560 m->fs.sp_realigned = true;
14561 m->fs.sp_realigned_offset = m->fs.sp_offset - frame.nsseregs * 16;
14562 gcc_assert (m->fs.sp_realigned_offset == frame.stack_realign_offset);
14563 /* SEH unwind emit doesn't currently support REG_CFA_EXPRESSION, which
14564 is needed to describe where a register is saved using a realigned
14565 stack pointer, so we need to invalidate the stack pointer for that
14566 target. */
14567 if (TARGET_SEH)
14568 m->fs.sp_valid = false;
14569 }
14570
14571 if (m->call_ms2sysv)
14572 ix86_emit_outlined_ms2sysv_save (frame);
14573
14574 allocate = frame.stack_pointer_offset - m->fs.sp_offset;
14575
14576 if (flag_stack_usage_info)
14577 {
14578 /* We start to count from ARG_POINTER. */
14579 HOST_WIDE_INT stack_size = frame.stack_pointer_offset;
14580
14581 /* If it was realigned, take into account the fake frame. */
14582 if (stack_realign_drap)
14583 {
14584 if (ix86_static_chain_on_stack)
14585 stack_size += UNITS_PER_WORD;
14586
14587 if (!call_used_regs[REGNO (crtl->drap_reg)])
14588 stack_size += UNITS_PER_WORD;
14589
14590 /* This over-estimates by 1 minimal-stack-alignment-unit but
14591 mitigates that by counting in the new return address slot. */
14592 current_function_dynamic_stack_size
14593 += crtl->stack_alignment_needed / BITS_PER_UNIT;
14594 }
14595
14596 current_function_static_stack_size = stack_size;
14597 }
14598
14599 /* On SEH target with very large frame size, allocate an area to save
14600 SSE registers (as the very large allocation won't be described). */
14601 if (TARGET_SEH
14602 && frame.stack_pointer_offset > SEH_MAX_FRAME_SIZE
14603 && !sse_registers_saved)
14604 {
14605 HOST_WIDE_INT sse_size =
14606 frame.sse_reg_save_offset - frame.reg_save_offset;
14607
14608 gcc_assert (int_registers_saved);
14609
14610 /* No need to do stack checking as the area will be immediately
14611 written. */
14612 pro_epilogue_adjust_stack (stack_pointer_rtx, stack_pointer_rtx,
14613 GEN_INT (-sse_size), -1,
14614 m->fs.cfa_reg == stack_pointer_rtx);
14615 allocate -= sse_size;
14616 ix86_emit_save_sse_regs_using_mov (frame.sse_reg_save_offset);
14617 sse_registers_saved = true;
14618 }
14619
14620 /* The stack has already been decremented by the instruction calling us
14621 so probe if the size is non-negative to preserve the protection area. */
14622 if (allocate >= 0 && flag_stack_check == STATIC_BUILTIN_STACK_CHECK)
14623 {
14624 /* We expect the registers to be saved when probes are used. */
14625 gcc_assert (int_registers_saved);
14626
14627 if (STACK_CHECK_MOVING_SP)
14628 {
14629 if (!(crtl->is_leaf && !cfun->calls_alloca
14630 && allocate <= PROBE_INTERVAL))
14631 {
14632 ix86_adjust_stack_and_probe (allocate);
14633 allocate = 0;
14634 }
14635 }
14636 else
14637 {
14638 HOST_WIDE_INT size = allocate;
14639
14640 if (TARGET_64BIT && size >= HOST_WIDE_INT_C (0x80000000))
14641 size = 0x80000000 - STACK_CHECK_PROTECT - 1;
14642
14643 if (TARGET_STACK_PROBE)
14644 {
14645 if (crtl->is_leaf && !cfun->calls_alloca)
14646 {
14647 if (size > PROBE_INTERVAL)
14648 ix86_emit_probe_stack_range (0, size);
14649 }
14650 else
14651 ix86_emit_probe_stack_range (0, size + STACK_CHECK_PROTECT);
14652 }
14653 else
14654 {
14655 if (crtl->is_leaf && !cfun->calls_alloca)
14656 {
14657 if (size > PROBE_INTERVAL && size > STACK_CHECK_PROTECT)
14658 ix86_emit_probe_stack_range (STACK_CHECK_PROTECT,
14659 size - STACK_CHECK_PROTECT);
14660 }
14661 else
14662 ix86_emit_probe_stack_range (STACK_CHECK_PROTECT, size);
14663 }
14664 }
14665 }
14666
14667 if (allocate == 0)
14668 ;
14669 else if (!ix86_target_stack_probe ()
14670 || frame.stack_pointer_offset < CHECK_STACK_LIMIT)
14671 {
14672 pro_epilogue_adjust_stack (stack_pointer_rtx, stack_pointer_rtx,
14673 GEN_INT (-allocate), -1,
14674 m->fs.cfa_reg == stack_pointer_rtx);
14675 }
14676 else
14677 {
14678 rtx eax = gen_rtx_REG (Pmode, AX_REG);
14679 rtx r10 = NULL;
14680 rtx (*adjust_stack_insn)(rtx, rtx, rtx);
14681 const bool sp_is_cfa_reg = (m->fs.cfa_reg == stack_pointer_rtx);
14682 bool eax_live = ix86_eax_live_at_start_p ();
14683 bool r10_live = false;
14684
14685 if (TARGET_64BIT)
14686 r10_live = (DECL_STATIC_CHAIN (current_function_decl) != 0);
14687
14688 if (eax_live)
14689 {
14690 insn = emit_insn (gen_push (eax));
14691 allocate -= UNITS_PER_WORD;
14692 /* Note that SEH directives need to continue tracking the stack
14693 pointer even after the frame pointer has been set up. */
14694 if (sp_is_cfa_reg || TARGET_SEH)
14695 {
14696 if (sp_is_cfa_reg)
14697 m->fs.cfa_offset += UNITS_PER_WORD;
14698 RTX_FRAME_RELATED_P (insn) = 1;
14699 add_reg_note (insn, REG_FRAME_RELATED_EXPR,
14700 gen_rtx_SET (stack_pointer_rtx,
14701 plus_constant (Pmode, stack_pointer_rtx,
14702 -UNITS_PER_WORD)));
14703 }
14704 }
14705
14706 if (r10_live)
14707 {
14708 r10 = gen_rtx_REG (Pmode, R10_REG);
14709 insn = emit_insn (gen_push (r10));
14710 allocate -= UNITS_PER_WORD;
14711 if (sp_is_cfa_reg || TARGET_SEH)
14712 {
14713 if (sp_is_cfa_reg)
14714 m->fs.cfa_offset += UNITS_PER_WORD;
14715 RTX_FRAME_RELATED_P (insn) = 1;
14716 add_reg_note (insn, REG_FRAME_RELATED_EXPR,
14717 gen_rtx_SET (stack_pointer_rtx,
14718 plus_constant (Pmode, stack_pointer_rtx,
14719 -UNITS_PER_WORD)));
14720 }
14721 }
14722
14723 emit_move_insn (eax, GEN_INT (allocate));
14724 emit_insn (ix86_gen_allocate_stack_worker (eax, eax));
14725
14726 /* Use the fact that AX still contains ALLOCATE. */
14727 adjust_stack_insn = (Pmode == DImode
14728 ? gen_pro_epilogue_adjust_stack_di_sub
14729 : gen_pro_epilogue_adjust_stack_si_sub);
14730
14731 insn = emit_insn (adjust_stack_insn (stack_pointer_rtx,
14732 stack_pointer_rtx, eax));
14733
14734 if (sp_is_cfa_reg || TARGET_SEH)
14735 {
14736 if (sp_is_cfa_reg)
14737 m->fs.cfa_offset += allocate;
14738 RTX_FRAME_RELATED_P (insn) = 1;
14739 add_reg_note (insn, REG_FRAME_RELATED_EXPR,
14740 gen_rtx_SET (stack_pointer_rtx,
14741 plus_constant (Pmode, stack_pointer_rtx,
14742 -allocate)));
14743 }
14744 m->fs.sp_offset += allocate;
14745
14746 /* Use stack_pointer_rtx for relative addressing so that code
14747 works for realigned stack, too. */
14748 if (r10_live && eax_live)
14749 {
14750 t = gen_rtx_PLUS (Pmode, stack_pointer_rtx, eax);
14751 emit_move_insn (gen_rtx_REG (word_mode, R10_REG),
14752 gen_frame_mem (word_mode, t));
14753 t = plus_constant (Pmode, t, UNITS_PER_WORD);
14754 emit_move_insn (gen_rtx_REG (word_mode, AX_REG),
14755 gen_frame_mem (word_mode, t));
14756 }
14757 else if (eax_live || r10_live)
14758 {
14759 t = gen_rtx_PLUS (Pmode, stack_pointer_rtx, eax);
14760 emit_move_insn (gen_rtx_REG (word_mode,
14761 (eax_live ? AX_REG : R10_REG)),
14762 gen_frame_mem (word_mode, t));
14763 }
14764 }
14765 gcc_assert (m->fs.sp_offset == frame.stack_pointer_offset);
14766
14767 /* If we haven't already set up the frame pointer, do so now. */
14768 if (frame_pointer_needed && !m->fs.fp_valid)
14769 {
14770 insn = ix86_gen_add3 (hard_frame_pointer_rtx, stack_pointer_rtx,
14771 GEN_INT (frame.stack_pointer_offset
14772 - frame.hard_frame_pointer_offset));
14773 insn = emit_insn (insn);
14774 RTX_FRAME_RELATED_P (insn) = 1;
14775 add_reg_note (insn, REG_CFA_ADJUST_CFA, NULL);
14776
14777 if (m->fs.cfa_reg == stack_pointer_rtx)
14778 m->fs.cfa_reg = hard_frame_pointer_rtx;
14779 m->fs.fp_offset = frame.hard_frame_pointer_offset;
14780 m->fs.fp_valid = true;
14781 }
14782
14783 if (!int_registers_saved)
14784 ix86_emit_save_regs_using_mov (frame.reg_save_offset);
14785 if (!sse_registers_saved)
14786 ix86_emit_save_sse_regs_using_mov (frame.sse_reg_save_offset);
14787
14788 /* For the mcount profiling on 32 bit PIC mode we need to emit SET_GOT
14789 in PROLOGUE. */
14790 if (!TARGET_64BIT && pic_offset_table_rtx && crtl->profile && !flag_fentry)
14791 {
14792 rtx pic = gen_rtx_REG (Pmode, REAL_PIC_OFFSET_TABLE_REGNUM);
14793 insn = emit_insn (gen_set_got (pic));
14794 RTX_FRAME_RELATED_P (insn) = 1;
14795 add_reg_note (insn, REG_CFA_FLUSH_QUEUE, NULL_RTX);
14796 emit_insn (gen_prologue_use (pic));
14797 /* Delete an already emitted SET_GOT if it exists and is allocated to
14798 REAL_PIC_OFFSET_TABLE_REGNUM. */
14799 ix86_elim_entry_set_got (pic);
14800 }
14801
14802 if (crtl->drap_reg && !crtl->stack_realign_needed)
14803 {
14804 /* vDRAP is set up, but after reload it turns out that stack
14805 realignment isn't necessary; here we emit the prologue to set up
14806 DRAP without the stack realignment adjustment. */
14807 t = choose_baseaddr (0, NULL);
14808 emit_insn (gen_rtx_SET (crtl->drap_reg, t));
14809 }
14810
14811 /* Prevent instructions from being scheduled into register save push
14812 sequence when access to the redzone area is done through frame pointer.
14813 The offset between the frame pointer and the stack pointer is calculated
14814 relative to the value of the stack pointer at the end of the function
14815 prologue, and moving instructions that access redzone area via frame
14816 pointer inside push sequence violates this assumption. */
14817 if (frame_pointer_needed && frame.red_zone_size)
14818 emit_insn (gen_memory_blockage ());
14819
14820 /* SEH requires that the prologue end within 256 bytes of the start of
14821 the function. Prevent instruction schedules that would extend that.
14822 Further, prevent alloca modifications to the stack pointer from being
14823 combined with prologue modifications. */
14824 if (TARGET_SEH)
14825 emit_insn (gen_prologue_use (stack_pointer_rtx));
14826 }
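
/* For the common 64-bit case with a frame pointer but no stack
   realignment, no DRAP and no stack probing, the code above reduces to
   the familiar sequence (sizes illustrative):

     push  %rbp
     mov   %rsp, %rbp
     push  %rbx              # any callee-saved GPRs that need saving
     sub   $N, %rsp          # remaining frame allocation

   with SSE saves, SET_GOT and the profiling code appearing only in the
   configurations discussed above.  */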
14827
14828 /* Emit code to restore REG using a POP insn. */
14829
14830 static void
14831 ix86_emit_restore_reg_using_pop (rtx reg)
14832 {
14833 struct machine_function *m = cfun->machine;
14834 rtx_insn *insn = emit_insn (gen_pop (reg));
14835
14836 ix86_add_cfa_restore_note (insn, reg, m->fs.sp_offset);
14837 m->fs.sp_offset -= UNITS_PER_WORD;
14838
14839 if (m->fs.cfa_reg == crtl->drap_reg
14840 && REGNO (reg) == REGNO (crtl->drap_reg))
14841 {
14842 /* Previously we'd represented the CFA as an expression
14843 like *(%ebp - 8). We've just popped that value from
14844 the stack, which means we need to reset the CFA to
14845 the drap register. This will remain until we restore
14846 the stack pointer. */
14847 add_reg_note (insn, REG_CFA_DEF_CFA, reg);
14848 RTX_FRAME_RELATED_P (insn) = 1;
14849
14850 /* This means that the DRAP register is valid for addressing too. */
14851 m->fs.drap_valid = true;
14852 return;
14853 }
14854
14855 if (m->fs.cfa_reg == stack_pointer_rtx)
14856 {
14857 rtx x = plus_constant (Pmode, stack_pointer_rtx, UNITS_PER_WORD);
14858 x = gen_rtx_SET (stack_pointer_rtx, x);
14859 add_reg_note (insn, REG_CFA_ADJUST_CFA, x);
14860 RTX_FRAME_RELATED_P (insn) = 1;
14861
14862 m->fs.cfa_offset -= UNITS_PER_WORD;
14863 }
14864
14865 /* When the frame pointer is the CFA, and we pop it, we are
14866 swapping back to the stack pointer as the CFA. This happens
14867 for stack frames that don't allocate other data, so we assume
14868 the stack pointer is now pointing at the return address, i.e.
14869 the function entry state, which makes the offset be 1 word. */
14870 if (reg == hard_frame_pointer_rtx)
14871 {
14872 m->fs.fp_valid = false;
14873 if (m->fs.cfa_reg == hard_frame_pointer_rtx)
14874 {
14875 m->fs.cfa_reg = stack_pointer_rtx;
14876 m->fs.cfa_offset -= UNITS_PER_WORD;
14877
14878 add_reg_note (insn, REG_CFA_DEF_CFA,
14879 gen_rtx_PLUS (Pmode, stack_pointer_rtx,
14880 GEN_INT (m->fs.cfa_offset)));
14881 RTX_FRAME_RELATED_P (insn) = 1;
14882 }
14883 }
14884 }
14885
14886 /* Emit code to restore saved registers using POP insns. */
14887
14888 static void
14889 ix86_emit_restore_regs_using_pop (void)
14890 {
14891 unsigned int regno;
14892
14893 for (regno = 0; regno < FIRST_PSEUDO_REGISTER; regno++)
14894 if (GENERAL_REGNO_P (regno) && ix86_save_reg (regno, false, true))
14895 ix86_emit_restore_reg_using_pop (gen_rtx_REG (word_mode, regno));
14896 }
14897
14898 /* Emit code and notes for the LEAVE instruction. If insn is non-null,
14899 the emit is skipped and only the notes are attached. */
14900
14901 static void
14902 ix86_emit_leave (rtx_insn *insn)
14903 {
14904 struct machine_function *m = cfun->machine;
14905 if (!insn)
14906 insn = emit_insn (ix86_gen_leave ());
14907
14908 ix86_add_queued_cfa_restore_notes (insn);
14909
14910 gcc_assert (m->fs.fp_valid);
14911 m->fs.sp_valid = true;
14912 m->fs.sp_realigned = false;
14913 m->fs.sp_offset = m->fs.fp_offset - UNITS_PER_WORD;
14914 m->fs.fp_valid = false;
14915
14916 if (m->fs.cfa_reg == hard_frame_pointer_rtx)
14917 {
14918 m->fs.cfa_reg = stack_pointer_rtx;
14919 m->fs.cfa_offset = m->fs.sp_offset;
14920
14921 add_reg_note (insn, REG_CFA_DEF_CFA,
14922 plus_constant (Pmode, stack_pointer_rtx,
14923 m->fs.sp_offset));
14924 RTX_FRAME_RELATED_P (insn) = 1;
14925 }
14926 ix86_add_cfa_restore_note (insn, hard_frame_pointer_rtx,
14927 m->fs.fp_offset);
14928 }
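
/* leave is architecturally equivalent to "mov %rbp, %rsp; pop %rbp",
   which is why the bookkeeping above revalidates the stack pointer at
   fp_offset - UNITS_PER_WORD, invalidates the frame pointer, and moves
   the CFA back to the stack pointer when the frame pointer had been the
   CFA register.  */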
14929
14930 /* Emit code to restore saved registers using MOV insns.
14931 First register is restored from CFA - CFA_OFFSET. */
14932 static void
14933 ix86_emit_restore_regs_using_mov (HOST_WIDE_INT cfa_offset,
14934 bool maybe_eh_return)
14935 {
14936 struct machine_function *m = cfun->machine;
14937 unsigned int regno;
14938
14939 for (regno = 0; regno < FIRST_PSEUDO_REGISTER; regno++)
14940 if (GENERAL_REGNO_P (regno) && ix86_save_reg (regno, maybe_eh_return, true))
14941 {
14942 rtx reg = gen_rtx_REG (word_mode, regno);
14943 rtx mem;
14944 rtx_insn *insn;
14945
14946 mem = choose_baseaddr (cfa_offset, NULL);
14947 mem = gen_frame_mem (word_mode, mem);
14948 insn = emit_move_insn (reg, mem);
14949
14950 if (m->fs.cfa_reg == crtl->drap_reg && regno == REGNO (crtl->drap_reg))
14951 {
14952 /* Previously we'd represented the CFA as an expression
14953 like *(%ebp - 8). We've just popped that value from
14954 the stack, which means we need to reset the CFA to
14955 the drap register. This will remain until we restore
14956 the stack pointer. */
14957 add_reg_note (insn, REG_CFA_DEF_CFA, reg);
14958 RTX_FRAME_RELATED_P (insn) = 1;
14959
14960 /* This means that the DRAP register is valid for addressing. */
14961 m->fs.drap_valid = true;
14962 }
14963 else
14964 ix86_add_cfa_restore_note (NULL, reg, cfa_offset);
14965
14966 cfa_offset -= UNITS_PER_WORD;
14967 }
14968 }
14969
14970 /* Emit code to restore saved registers using MOV insns.
14971 First register is restored from CFA - CFA_OFFSET. */
14972 static void
14973 ix86_emit_restore_sse_regs_using_mov (HOST_WIDE_INT cfa_offset,
14974 bool maybe_eh_return)
14975 {
14976 unsigned int regno;
14977
14978 for (regno = 0; regno < FIRST_PSEUDO_REGISTER; regno++)
14979 if (SSE_REGNO_P (regno) && ix86_save_reg (regno, maybe_eh_return, true))
14980 {
14981 rtx reg = gen_rtx_REG (V4SFmode, regno);
14982 rtx mem;
14983 unsigned int align = GET_MODE_ALIGNMENT (V4SFmode);
14984
14985 mem = choose_baseaddr (cfa_offset, &align);
14986 mem = gen_rtx_MEM (V4SFmode, mem);
14987
14988 /* The location alignment depends upon the base register. */
14989 align = MIN (GET_MODE_ALIGNMENT (V4SFmode), align);
14990 gcc_assert (! (cfa_offset & (align / BITS_PER_UNIT - 1)));
14991 set_mem_align (mem, align);
14992 emit_insn (gen_rtx_SET (reg, mem));
14993
14994 ix86_add_cfa_restore_note (NULL, reg, cfa_offset);
14995
14996 cfa_offset -= GET_MODE_SIZE (V4SFmode);
14997 }
14998 }
14999
15000 static void
15001 ix86_emit_outlined_ms2sysv_restore (const struct ix86_frame &frame,
15002 bool use_call, int style)
15003 {
15004 struct machine_function *m = cfun->machine;
15005 const unsigned ncregs = NUM_X86_64_MS_CLOBBERED_REGS
15006 + m->call_ms2sysv_extra_regs;
15007 rtvec v;
15008 unsigned int elems_needed, align, i, vi = 0;
15009 rtx_insn *insn;
15010 rtx sym, tmp;
15011 rtx rsi = gen_rtx_REG (word_mode, SI_REG);
15012 rtx r10 = NULL_RTX;
15013 const struct xlogue_layout &xlogue = xlogue_layout::get_instance ();
15014 HOST_WIDE_INT stub_ptr_offset = xlogue.get_stub_ptr_offset ();
15015 HOST_WIDE_INT rsi_offset = frame.stack_realign_offset + stub_ptr_offset;
15016 rtx rsi_frame_load = NULL_RTX;
15017 HOST_WIDE_INT rsi_restore_offset = (HOST_WIDE_INT)-1;
15018 enum xlogue_stub stub;
15019
15020 gcc_assert (!m->fs.fp_valid || frame_pointer_needed);
15021
15022 /* If using a realigned stack, we should never start with padding. */
15023 gcc_assert (!stack_realign_fp || !xlogue.get_stack_align_off_in ());
15024
15025 /* Set up RSI as the stub's base pointer. */
15026 align = GET_MODE_ALIGNMENT (V4SFmode);
15027 tmp = choose_baseaddr (rsi_offset, &align);
15028 gcc_assert (align >= GET_MODE_ALIGNMENT (V4SFmode));
15029 emit_insn (gen_rtx_SET (rsi, tmp));
15030
15031 /* Get a symbol for the stub. */
15032 if (frame_pointer_needed)
15033 stub = use_call ? XLOGUE_STUB_RESTORE_HFP
15034 : XLOGUE_STUB_RESTORE_HFP_TAIL;
15035 else
15036 stub = use_call ? XLOGUE_STUB_RESTORE
15037 : XLOGUE_STUB_RESTORE_TAIL;
15038 sym = xlogue.get_stub_rtx (stub);
15039
15040 elems_needed = ncregs;
15041 if (use_call)
15042 elems_needed += 1;
15043 else
15044 elems_needed += frame_pointer_needed ? 5 : 3;
15045 v = rtvec_alloc (elems_needed);
15046
15047 /* We call the epilogue stub when we need to pop incoming args or we are
15048 doing a sibling call as the tail. Otherwise, we emit a jmp to the
15049 epilogue stub and that jmp itself is the tail call. */
15050 if (use_call)
15051 RTVEC_ELT (v, vi++) = gen_rtx_USE (VOIDmode, sym);
15052 else
15053 {
15054 RTVEC_ELT (v, vi++) = ret_rtx;
15055 RTVEC_ELT (v, vi++) = gen_rtx_USE (VOIDmode, sym);
15056 if (frame_pointer_needed)
15057 {
15058 rtx rbp = gen_rtx_REG (DImode, BP_REG);
15059 gcc_assert (m->fs.fp_valid);
15060 gcc_assert (m->fs.cfa_reg == hard_frame_pointer_rtx);
15061
15062 tmp = gen_rtx_PLUS (DImode, rbp, GEN_INT (8));
15063 RTVEC_ELT (v, vi++) = gen_rtx_SET (stack_pointer_rtx, tmp);
15064 RTVEC_ELT (v, vi++) = gen_rtx_SET (rbp, gen_rtx_MEM (DImode, rbp));
15065 tmp = gen_rtx_MEM (BLKmode, gen_rtx_SCRATCH (VOIDmode));
15066 RTVEC_ELT (v, vi++) = gen_rtx_CLOBBER (VOIDmode, tmp);
15067 }
15068 else
15069 {
15070 /* If no hard frame pointer, we set R10 to the SP restore value. */
15071 gcc_assert (!m->fs.fp_valid);
15072 gcc_assert (m->fs.cfa_reg == stack_pointer_rtx);
15073 gcc_assert (m->fs.sp_valid);
15074
15075 r10 = gen_rtx_REG (DImode, R10_REG);
15076 tmp = gen_rtx_PLUS (Pmode, rsi, GEN_INT (stub_ptr_offset));
15077 emit_insn (gen_rtx_SET (r10, tmp));
15078
15079 RTVEC_ELT (v, vi++) = gen_rtx_SET (stack_pointer_rtx, r10);
15080 }
15081 }
15082
15083 /* Generate frame load insns and restore notes. */
15084 for (i = 0; i < ncregs; ++i)
15085 {
15086 const xlogue_layout::reginfo &r = xlogue.get_reginfo (i);
15087 enum machine_mode mode = SSE_REGNO_P (r.regno) ? V4SFmode : word_mode;
15088 rtx reg, frame_load;
15089
15090 reg = gen_rtx_REG (mode, r.regno);
15091 frame_load = gen_frame_load (reg, rsi, r.offset);
15092
15093 /* Save RSI frame load insn & note to add last. */
15094 if (r.regno == SI_REG)
15095 {
15096 gcc_assert (!rsi_frame_load);
15097 rsi_frame_load = frame_load;
15098 rsi_restore_offset = r.offset;
15099 }
15100 else
15101 {
15102 RTVEC_ELT (v, vi++) = frame_load;
15103 ix86_add_cfa_restore_note (NULL, reg, r.offset);
15104 }
15105 }
15106
15107 /* Add RSI frame load & restore note at the end. */
15108 gcc_assert (rsi_frame_load);
15109 gcc_assert (rsi_restore_offset != (HOST_WIDE_INT)-1);
15110 RTVEC_ELT (v, vi++) = rsi_frame_load;
15111 ix86_add_cfa_restore_note (NULL, gen_rtx_REG (DImode, SI_REG),
15112 rsi_restore_offset);
15113
15114 /* Finally, for tail-call w/o a hard frame pointer, set SP to R10. */
15115 if (!use_call && !frame_pointer_needed)
15116 {
15117 gcc_assert (m->fs.sp_valid);
15118 gcc_assert (!m->fs.sp_realigned);
15119
15120 /* At this point, R10 should point to frame.stack_realign_offset. */
15121 if (m->fs.cfa_reg == stack_pointer_rtx)
15122 m->fs.cfa_offset += m->fs.sp_offset - frame.stack_realign_offset;
15123 m->fs.sp_offset = frame.stack_realign_offset;
15124 }
15125
15126 gcc_assert (vi == (unsigned int)GET_NUM_ELEM (v));
15127 tmp = gen_rtx_PARALLEL (VOIDmode, v);
15128 if (use_call)
15129 insn = emit_insn (tmp);
15130 else
15131 {
15132 insn = emit_jump_insn (tmp);
15133 JUMP_LABEL (insn) = ret_rtx;
15134
15135 if (frame_pointer_needed)
15136 ix86_emit_leave (insn);
15137 else
15138 {
15139 /* Need CFA adjust note. */
15140 tmp = gen_rtx_SET (stack_pointer_rtx, r10);
15141 add_reg_note (insn, REG_CFA_ADJUST_CFA, tmp);
15142 }
15143 }
15144
15145 RTX_FRAME_RELATED_P (insn) = true;
15146 ix86_add_queued_cfa_restore_notes (insn);
15147
15148 /* If we're not doing a tail-call, we need to adjust the stack. */
15149 if (use_call && m->fs.sp_valid)
15150 {
15151 HOST_WIDE_INT dealloc = m->fs.sp_offset - frame.stack_realign_offset;
15152 pro_epilogue_adjust_stack (stack_pointer_rtx, stack_pointer_rtx,
15153 GEN_INT (dealloc), style,
15154 m->fs.cfa_reg == stack_pointer_rtx);
15155 }
15156 }
15157
15158 /* Restore function stack, frame, and registers. */
15159
15160 void
15161 ix86_expand_epilogue (int style)
15162 {
15163 struct machine_function *m = cfun->machine;
15164 struct machine_frame_state frame_state_save = m->fs;
15165 struct ix86_frame frame;
15166 bool restore_regs_via_mov;
15167 bool using_drap;
15168 bool restore_stub_is_tail = false;
15169
15170 ix86_finalize_stack_realign_flags ();
15171 frame = m->frame;
15172
15173 m->fs.sp_realigned = stack_realign_fp;
15174 m->fs.sp_valid = stack_realign_fp
15175 || !frame_pointer_needed
15176 || crtl->sp_is_unchanging;
15177 gcc_assert (!m->fs.sp_valid
15178 || m->fs.sp_offset == frame.stack_pointer_offset);
15179
15180 /* The FP must be valid if the frame pointer is present. */
15181 gcc_assert (frame_pointer_needed == m->fs.fp_valid);
15182 gcc_assert (!m->fs.fp_valid
15183 || m->fs.fp_offset == frame.hard_frame_pointer_offset);
15184
15185 /* We must have *some* valid pointer to the stack frame. */
15186 gcc_assert (m->fs.sp_valid || m->fs.fp_valid);
15187
15188 /* The DRAP is never valid at this point. */
15189 gcc_assert (!m->fs.drap_valid);
15190
15191 /* See the comment about red zone and frame
15192 pointer usage in ix86_expand_prologue. */
15193 if (frame_pointer_needed && frame.red_zone_size)
15194 emit_insn (gen_memory_blockage ());
15195
15196 using_drap = crtl->drap_reg && crtl->stack_realign_needed;
15197 gcc_assert (!using_drap || m->fs.cfa_reg == crtl->drap_reg);
15198
15199 /* Determine the CFA offset of the end of the red-zone. */
15200 m->fs.red_zone_offset = 0;
15201 if (ix86_using_red_zone () && crtl->args.pops_args < 65536)
15202 {
15203 /* The red-zone begins below the return address. */
15204 m->fs.red_zone_offset = RED_ZONE_SIZE + UNITS_PER_WORD;
15205
15206 /* When the register save area is in the aligned portion of
15207 the stack, determine the maximum runtime displacement that
15208 matches up with the aligned frame. */
15209 if (stack_realign_drap)
15210 m->fs.red_zone_offset -= (crtl->stack_alignment_needed / BITS_PER_UNIT
15211 + UNITS_PER_WORD);
15212 }
15213
15214 /* Special care must be taken for the normal return case of a function
15215 using eh_return: the eax and edx registers are marked as saved, but
15216 not restored along this path. Adjust the save location to match. */
15217 if (crtl->calls_eh_return && style != 2)
15218 frame.reg_save_offset -= 2 * UNITS_PER_WORD;
15219
15220 /* EH_RETURN requires the use of moves to function properly. */
15221 if (crtl->calls_eh_return)
15222 restore_regs_via_mov = true;
15223 /* SEH requires the use of pops to identify the epilogue. */
15224 else if (TARGET_SEH)
15225 restore_regs_via_mov = false;
15226 /* If we're only restoring one register and sp cannot be used, then
15227 use a move instruction to restore the register, since it's
15228 less work than reloading sp and popping the register. */
15229 else if (!sp_valid_at (frame.hfp_save_offset) && frame.nregs <= 1)
15230 restore_regs_via_mov = true;
15231 else if (TARGET_EPILOGUE_USING_MOVE
15232 && cfun->machine->use_fast_prologue_epilogue
15233 && (frame.nregs > 1
15234 || m->fs.sp_offset != frame.reg_save_offset))
15235 restore_regs_via_mov = true;
15236 else if (frame_pointer_needed
15237 && !frame.nregs
15238 && m->fs.sp_offset != frame.reg_save_offset)
15239 restore_regs_via_mov = true;
15240 else if (frame_pointer_needed
15241 && TARGET_USE_LEAVE
15242 && cfun->machine->use_fast_prologue_epilogue
15243 && frame.nregs == 1)
15244 restore_regs_via_mov = true;
15245 else
15246 restore_regs_via_mov = false;
15247
15248 if (restore_regs_via_mov || frame.nsseregs)
15249 {
15250 /* Ensure that the entire register save area is addressable via
15251 the stack pointer, if we will restore via sp. */
15252 if (TARGET_64BIT
15253 && m->fs.sp_offset > 0x7fffffff
15254 && !(fp_valid_at (frame.stack_realign_offset) || m->fs.drap_valid)
15255 && (frame.nsseregs + frame.nregs) != 0)
15256 {
15257 pro_epilogue_adjust_stack (stack_pointer_rtx, stack_pointer_rtx,
15258 GEN_INT (m->fs.sp_offset
15259 - frame.sse_reg_save_offset),
15260 style,
15261 m->fs.cfa_reg == stack_pointer_rtx);
15262 }
15263 }
15264
15265 /* If there are any SSE registers to restore, then we have to do it
15266 via moves, since there's obviously no pop for SSE regs. */
15267 if (frame.nsseregs)
15268 ix86_emit_restore_sse_regs_using_mov (frame.sse_reg_save_offset,
15269 style == 2);
15270
15271 if (m->call_ms2sysv)
15272 {
15273 int pop_incoming_args = crtl->args.pops_args && crtl->args.size;
15274
15275 /* We cannot use a tail-call for the stub if:
15276 1. We have to pop incoming args,
15277 2. We have additional int regs to restore,
15278 3. A sibling call will be the tail-call, or
15279 4. We are emitting an eh_return_internal epilogue.
15280
15281 TODO: Item 4 has not yet been tested!
15282
15283 If any of the above are true, we will call the stub rather than
15284 jump to it. */
15285 restore_stub_is_tail = !(pop_incoming_args || frame.nregs || style != 1);
15286 ix86_emit_outlined_ms2sysv_restore (frame, !restore_stub_is_tail, style);
15287 }
15288
15289 /* If using an out-of-line stub that is a tail-call, then... */
15290 if (m->call_ms2sysv && restore_stub_is_tail)
15291 {
15292 /* TODO: paranoid tests. (remove eventually) */
15293 gcc_assert (m->fs.sp_valid);
15294 gcc_assert (!m->fs.sp_realigned);
15295 gcc_assert (!m->fs.fp_valid);
15296 gcc_assert (!m->fs.realigned);
15297 gcc_assert (m->fs.sp_offset == UNITS_PER_WORD);
15298 gcc_assert (!crtl->drap_reg);
15299 gcc_assert (!frame.nregs);
15300 }
15301 else if (restore_regs_via_mov)
15302 {
15303 rtx t;
15304
15305 if (frame.nregs)
15306 ix86_emit_restore_regs_using_mov (frame.reg_save_offset, style == 2);
15307
15308 /* eh_return epilogues need %ecx added to the stack pointer. */
15309 if (style == 2)
15310 {
15311 rtx sa = EH_RETURN_STACKADJ_RTX;
15312 rtx_insn *insn;
15313
15314 /* %ecx can't be used for both DRAP register and eh_return. */
15315 if (crtl->drap_reg)
15316 gcc_assert (REGNO (crtl->drap_reg) != CX_REG);
15317
15318 /* regparm nested functions don't work with eh_return. */
15319 gcc_assert (!ix86_static_chain_on_stack);
15320
15321 if (frame_pointer_needed)
15322 {
15323 t = gen_rtx_PLUS (Pmode, hard_frame_pointer_rtx, sa);
15324 t = plus_constant (Pmode, t, m->fs.fp_offset - UNITS_PER_WORD);
15325 emit_insn (gen_rtx_SET (sa, t));
15326
15327 t = gen_frame_mem (Pmode, hard_frame_pointer_rtx);
15328 insn = emit_move_insn (hard_frame_pointer_rtx, t);
15329
15330 /* Note that we use SA as a temporary CFA, as the return
15331 address is at the proper place relative to it. We
15332 pretend this happens at the FP restore insn because
15333 prior to this insn the FP would be stored at the wrong
15334 offset relative to SA, and after this insn we have no
15335 other reasonable register to use for the CFA. We don't
15336 bother resetting the CFA to the SP for the duration of
15337 the return insn. */
15338 add_reg_note (insn, REG_CFA_DEF_CFA,
15339 plus_constant (Pmode, sa, UNITS_PER_WORD));
15340 ix86_add_queued_cfa_restore_notes (insn);
15341 add_reg_note (insn, REG_CFA_RESTORE, hard_frame_pointer_rtx);
15342 RTX_FRAME_RELATED_P (insn) = 1;
15343
15344 m->fs.cfa_reg = sa;
15345 m->fs.cfa_offset = UNITS_PER_WORD;
15346 m->fs.fp_valid = false;
15347
15348 pro_epilogue_adjust_stack (stack_pointer_rtx, sa,
15349 const0_rtx, style, false);
15350 }
15351 else
15352 {
15353 t = gen_rtx_PLUS (Pmode, stack_pointer_rtx, sa);
15354 t = plus_constant (Pmode, t, m->fs.sp_offset - UNITS_PER_WORD);
15355 insn = emit_insn (gen_rtx_SET (stack_pointer_rtx, t));
15356 ix86_add_queued_cfa_restore_notes (insn);
15357
15358 gcc_assert (m->fs.cfa_reg == stack_pointer_rtx);
15359 if (m->fs.cfa_offset != UNITS_PER_WORD)
15360 {
15361 m->fs.cfa_offset = UNITS_PER_WORD;
15362 add_reg_note (insn, REG_CFA_DEF_CFA,
15363 plus_constant (Pmode, stack_pointer_rtx,
15364 UNITS_PER_WORD));
15365 RTX_FRAME_RELATED_P (insn) = 1;
15366 }
15367 }
15368 m->fs.sp_offset = UNITS_PER_WORD;
15369 m->fs.sp_valid = true;
15370 m->fs.sp_realigned = false;
15371 }
15372 }
15373 else
15374 {
15375 /* SEH requires that the function end with (1) a stack adjustment
15376 if necessary, (2) a sequence of pops, and (3) a return or
15377 jump instruction. Prevent insns from the function body from
15378 being scheduled into this sequence. */
15379 if (TARGET_SEH)
15380 {
15381 /* Prevent a catch region from being adjacent to the standard
15382 epilogue sequence. Unfortunately, neither crtl->uses_eh_lsda nor
15383 several other flags that would be interesting to test are
15384 set up yet. */
15385 if (flag_non_call_exceptions)
15386 emit_insn (gen_nops (const1_rtx));
15387 else
15388 emit_insn (gen_blockage ());
15389 }
15390
15391 /* The first step is to deallocate the stack frame so that we can
15392 pop the registers. If the stack pointer was realigned, it needs
15393 to be restored now. Also do it on SEH targets for very large
15394 frames, as the emitted instructions aren't allowed by the ABI
15395 in epilogues. */
15396 if (!m->fs.sp_valid || m->fs.sp_realigned
15397 || (TARGET_SEH
15398 && (m->fs.sp_offset - frame.reg_save_offset
15399 >= SEH_MAX_FRAME_SIZE)))
15400 {
15401 pro_epilogue_adjust_stack (stack_pointer_rtx, hard_frame_pointer_rtx,
15402 GEN_INT (m->fs.fp_offset
15403 - frame.reg_save_offset),
15404 style, false);
15405 }
15406 else if (m->fs.sp_offset != frame.reg_save_offset)
15407 {
15408 pro_epilogue_adjust_stack (stack_pointer_rtx, stack_pointer_rtx,
15409 GEN_INT (m->fs.sp_offset
15410 - frame.reg_save_offset),
15411 style,
15412 m->fs.cfa_reg == stack_pointer_rtx);
15413 }
15414
15415 ix86_emit_restore_regs_using_pop ();
15416 }
15417
15418 /* If we used a frame pointer and haven't already got rid of it,
15419 then do so now. */
15420 if (m->fs.fp_valid)
15421 {
15422 /* If the stack pointer is valid and pointing at the frame
15423 pointer store address, then we only need a pop. */
15424 if (sp_valid_at (frame.hfp_save_offset)
15425 && m->fs.sp_offset == frame.hfp_save_offset)
15426 ix86_emit_restore_reg_using_pop (hard_frame_pointer_rtx);
15427 /* The "leave" insn results in shorter dependency chains on CPUs that
15428 are able to grok it fast. */
15429 else if (TARGET_USE_LEAVE
15430 || optimize_bb_for_size_p (EXIT_BLOCK_PTR_FOR_FN (cfun))
15431 || !cfun->machine->use_fast_prologue_epilogue)
15432 ix86_emit_leave (NULL);
15433 else
15434 {
15435 pro_epilogue_adjust_stack (stack_pointer_rtx,
15436 hard_frame_pointer_rtx,
15437 const0_rtx, style, !using_drap);
15438 ix86_emit_restore_reg_using_pop (hard_frame_pointer_rtx);
15439 }
15440 }
15441
15442 if (using_drap)
15443 {
15444 int param_ptr_offset = UNITS_PER_WORD;
15445 rtx_insn *insn;
15446
15447 gcc_assert (stack_realign_drap);
15448
15449 if (ix86_static_chain_on_stack)
15450 param_ptr_offset += UNITS_PER_WORD;
15451 if (!call_used_regs[REGNO (crtl->drap_reg)])
15452 param_ptr_offset += UNITS_PER_WORD;
15453
15454 insn = emit_insn (gen_rtx_SET
15455 (stack_pointer_rtx,
15456 gen_rtx_PLUS (Pmode,
15457 crtl->drap_reg,
15458 GEN_INT (-param_ptr_offset))));
15459 m->fs.cfa_reg = stack_pointer_rtx;
15460 m->fs.cfa_offset = param_ptr_offset;
15461 m->fs.sp_offset = param_ptr_offset;
15462 m->fs.realigned = false;
15463
15464 add_reg_note (insn, REG_CFA_DEF_CFA,
15465 gen_rtx_PLUS (Pmode, stack_pointer_rtx,
15466 GEN_INT (param_ptr_offset)));
15467 RTX_FRAME_RELATED_P (insn) = 1;
15468
15469 if (!call_used_regs[REGNO (crtl->drap_reg)])
15470 ix86_emit_restore_reg_using_pop (crtl->drap_reg);
15471 }
15472
15473 /* At this point the stack pointer must be valid, and we must have
15474 restored all of the registers. We may not have deallocated the
15475 entire stack frame. We've delayed this until now because it may
15476 be possible to merge the local stack deallocation with the
15477 deallocation forced by ix86_static_chain_on_stack. */
15478 gcc_assert (m->fs.sp_valid);
15479 gcc_assert (!m->fs.sp_realigned);
15480 gcc_assert (!m->fs.fp_valid);
15481 gcc_assert (!m->fs.realigned);
15482 if (m->fs.sp_offset != UNITS_PER_WORD)
15483 {
15484 pro_epilogue_adjust_stack (stack_pointer_rtx, stack_pointer_rtx,
15485 GEN_INT (m->fs.sp_offset - UNITS_PER_WORD),
15486 style, true);
15487 }
15488 else
15489 ix86_add_queued_cfa_restore_notes (get_last_insn ());
15490
15491 /* Sibcall epilogues don't want a return instruction. */
15492 if (style == 0)
15493 {
15494 m->fs = frame_state_save;
15495 return;
15496 }
15497
15498 if (cfun->machine->func_type != TYPE_NORMAL)
15499 {
15500 /* Return with the "IRET" instruction from an interrupt handler.
15501 Pop the 'ERROR_CODE' off the stack before the 'IRET'
15502 instruction in an exception handler. */
15503 if (cfun->machine->func_type == TYPE_EXCEPTION)
15504 {
15505 rtx r = plus_constant (Pmode, stack_pointer_rtx,
15506 UNITS_PER_WORD);
15507 emit_insn (gen_rtx_SET (stack_pointer_rtx, r));
15508 }
15509 emit_jump_insn (gen_interrupt_return ());
15510 }
15511 else if (crtl->args.pops_args && crtl->args.size)
15512 {
15513 rtx popc = GEN_INT (crtl->args.pops_args);
15514
15515 /* i386 can only pop 64K bytes. If asked to pop more, pop the return
15516 address, do an explicit add, and jump indirectly to the caller. */
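/* As a rough sketch, the >= 64K path below corresponds to:
     pop  %ecx       ; fetch the return address
     add  $N, %esp   ; N == crtl->args.pops_args
     jmp  *%ecx      ; return to the caller
   (the assembly spelling is only an illustration of the RTL emitted
   here).  */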
15517
15518 if (crtl->args.pops_args >= 65536)
15519 {
15520 rtx ecx = gen_rtx_REG (SImode, CX_REG);
15521 rtx_insn *insn;
15522
15523 /* There is no "pascal" calling convention in any 64bit ABI. */
15524 gcc_assert (!TARGET_64BIT);
15525
15526 insn = emit_insn (gen_pop (ecx));
15527 m->fs.cfa_offset -= UNITS_PER_WORD;
15528 m->fs.sp_offset -= UNITS_PER_WORD;
15529
15530 rtx x = plus_constant (Pmode, stack_pointer_rtx, UNITS_PER_WORD);
15531 x = gen_rtx_SET (stack_pointer_rtx, x);
15532 add_reg_note (insn, REG_CFA_ADJUST_CFA, x);
15533 add_reg_note (insn, REG_CFA_REGISTER, gen_rtx_SET (ecx, pc_rtx));
15534 RTX_FRAME_RELATED_P (insn) = 1;
15535
15536 pro_epilogue_adjust_stack (stack_pointer_rtx, stack_pointer_rtx,
15537 popc, -1, true);
15538 emit_jump_insn (gen_simple_return_indirect_internal (ecx));
15539 }
15540 else
15541 emit_jump_insn (gen_simple_return_pop_internal (popc));
15542 }
15543 else if (!m->call_ms2sysv || !restore_stub_is_tail)
15544 emit_jump_insn (gen_simple_return_internal ());
15545
15546 /* Restore the state back to the state from the prologue,
15547 so that it's correct for the next epilogue. */
15548 m->fs = frame_state_save;
15549 }
15550
15551 /* Reset from the function's potential modifications. */
15552
15553 static void
15554 ix86_output_function_epilogue (FILE *file ATTRIBUTE_UNUSED, HOST_WIDE_INT)
15555 {
15556 if (pic_offset_table_rtx
15557 && !ix86_use_pseudo_pic_reg ())
15558 SET_REGNO (pic_offset_table_rtx, REAL_PIC_OFFSET_TABLE_REGNUM);
15559
15560 if (TARGET_MACHO)
15561 {
15562 rtx_insn *insn = get_last_insn ();
15563 rtx_insn *deleted_debug_label = NULL;
15564
15565 /* Mach-O doesn't support labels at the end of objects, so if
15566 it looks like we might want one, take special action.
15567 First, collect any sequence of deleted debug labels. */
15568 while (insn
15569 && NOTE_P (insn)
15570 && NOTE_KIND (insn) != NOTE_INSN_DELETED_LABEL)
15571 {
15572 /* For NOTE_INSN_DELETED_DEBUG_LABEL notes only, don't insert
15573 a nop; instead set their CODE_LABEL_NUMBER to -1, otherwise
15574 there would be code generation differences
15575 between -g and -g0. */
15576 if (NOTE_P (insn) && NOTE_KIND (insn)
15577 == NOTE_INSN_DELETED_DEBUG_LABEL)
15578 deleted_debug_label = insn;
15579 insn = PREV_INSN (insn);
15580 }
15581
15582 /* If we have:
15583 label:
15584 barrier
15585 then this needs to be detected, so skip past the barrier. */
15586
15587 if (insn && BARRIER_P (insn))
15588 insn = PREV_INSN (insn);
15589
15590 /* Up to now we've only seen notes or barriers. */
15591 if (insn)
15592 {
15593 if (LABEL_P (insn)
15594 || (NOTE_P (insn)
15595 && NOTE_KIND (insn) == NOTE_INSN_DELETED_LABEL))
15596 /* Trailing label. */
15597 fputs ("\tnop\n", file);
15598 else if (cfun && ! cfun->is_thunk)
15599 {
15600 /* See if we have a completely empty function body, skipping
15601 the special case of the picbase thunk emitted as asm. */
15602 while (insn && ! INSN_P (insn))
15603 insn = PREV_INSN (insn);
15604 /* If we don't find any insns, we've got an empty function body,
15605 i.e. completely empty - without a return or branch. This is
15606 taken as the case where a function body has been removed
15607 because it contains an inline __builtin_unreachable(). GCC
15608 declares that reaching __builtin_unreachable() means UB so
15609 we're not obliged to do anything special; however, we want
15610 non-zero-sized function bodies. To meet this, and help the
15611 user out, let's trap the case. */
15612 if (insn == NULL)
15613 fputs ("\tud2\n", file);
15614 }
15615 }
15616 else if (deleted_debug_label)
15617 for (insn = deleted_debug_label; insn; insn = NEXT_INSN (insn))
15618 if (NOTE_KIND (insn) == NOTE_INSN_DELETED_DEBUG_LABEL)
15619 CODE_LABEL_NUMBER (insn) = -1;
15620 }
15621 }
15622
15623 /* Return a scratch register to use in the split stack prologue. The
15624 split stack prologue is used for -fsplit-stack. It is the first
15625 instructions in the function, even before the regular prologue.
15626 The scratch register can be any caller-saved register which is not
15627 used for parameters or for the static chain. */
15628
15629 static unsigned int
15630 split_stack_prologue_scratch_regno (void)
15631 {
15632 if (TARGET_64BIT)
15633 return R11_REG;
15634 else
15635 {
15636 bool is_fastcall, is_thiscall;
15637 int regparm;
15638
15639 is_fastcall = (lookup_attribute ("fastcall",
15640 TYPE_ATTRIBUTES (TREE_TYPE (cfun->decl)))
15641 != NULL);
15642 is_thiscall = (lookup_attribute ("thiscall",
15643 TYPE_ATTRIBUTES (TREE_TYPE (cfun->decl)))
15644 != NULL);
15645 regparm = ix86_function_regparm (TREE_TYPE (cfun->decl), cfun->decl);
15646
15647 if (is_fastcall)
15648 {
15649 if (DECL_STATIC_CHAIN (cfun->decl))
15650 {
15651 sorry ("-fsplit-stack does not support fastcall with "
15652 "nested function");
15653 return INVALID_REGNUM;
15654 }
15655 return AX_REG;
15656 }
15657 else if (is_thiscall)
15658 {
15659 if (!DECL_STATIC_CHAIN (cfun->decl))
15660 return DX_REG;
15661 return AX_REG;
15662 }
15663 else if (regparm < 3)
15664 {
15665 if (!DECL_STATIC_CHAIN (cfun->decl))
15666 return CX_REG;
15667 else
15668 {
15669 if (regparm >= 2)
15670 {
15671 sorry ("-fsplit-stack does not support 2 register "
15672 "parameters for a nested function");
15673 return INVALID_REGNUM;
15674 }
15675 return DX_REG;
15676 }
15677 }
15678 else
15679 {
15680 /* FIXME: We could make this work by pushing a register
15681 around the addition and comparison. */
15682 sorry ("-fsplit-stack does not support 3 register parameters");
15683 return INVALID_REGNUM;
15684 }
15685 }
15686 }
15687
15688 /* A SYMBOL_REF for the function which allocates new stack space for
15689 -fsplit-stack. */
15690
15691 static GTY(()) rtx split_stack_fn;
15692
15693 /* A SYMBOL_REF for the more stack function when using the large
15694 model. */
15695
15696 static GTY(()) rtx split_stack_fn_large;
15697
15698 /* Handle -fsplit-stack. These are the first instructions in the
15699 function, even before the regular prologue. */
15700
15701 void
15702 ix86_expand_split_stack_prologue (void)
15703 {
15704 struct ix86_frame frame;
15705 HOST_WIDE_INT allocate;
15706 unsigned HOST_WIDE_INT args_size;
15707 rtx_code_label *label;
15708 rtx limit, current, allocate_rtx, call_insn, call_fusage;
15709 rtx scratch_reg = NULL_RTX;
15710 rtx_code_label *varargs_label = NULL;
15711 rtx fn;
15712
15713 gcc_assert (flag_split_stack && reload_completed);
15714
15715 ix86_finalize_stack_realign_flags ();
15716 frame = cfun->machine->frame;
15717 allocate = frame.stack_pointer_offset - INCOMING_FRAME_SP_OFFSET;
15718
15719 /* This is the label we will branch to if we have enough stack
15720 space. We expect the basic block reordering pass to reverse this
15721 branch if optimizing, so that we branch in the unlikely case. */
15722 label = gen_label_rtx ();
15723
15724 /* We need to compare the stack pointer minus the frame size with
15725 the stack boundary in the TCB. The stack boundary always gives
15726 us SPLIT_STACK_AVAILABLE bytes, so if we need less than that we
15727 can compare directly. Otherwise we need to do an addition. */
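/* In other words, compare the stack pointer (or the scratch register
   computed below) against the stack guard kept in the TCB; on typical
   glibc targets this becomes a %fs- or %gs-relative memory operand,
   the exact segment and offset being target-dependent.  */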
15728
15729 limit = gen_rtx_UNSPEC (Pmode, gen_rtvec (1, const0_rtx),
15730 UNSPEC_STACK_CHECK);
15731 limit = gen_rtx_CONST (Pmode, limit);
15732 limit = gen_rtx_MEM (Pmode, limit);
15733 if (allocate < SPLIT_STACK_AVAILABLE)
15734 current = stack_pointer_rtx;
15735 else
15736 {
15737 unsigned int scratch_regno;
15738 rtx offset;
15739
15740 /* We need a scratch register to hold the stack pointer minus
15741 the required frame size. Since this is the very start of the
15742 function, the scratch register can be any caller-saved
15743 register which is not used for parameters. */
15744 offset = GEN_INT (- allocate);
15745 scratch_regno = split_stack_prologue_scratch_regno ();
15746 if (scratch_regno == INVALID_REGNUM)
15747 return;
15748 scratch_reg = gen_rtx_REG (Pmode, scratch_regno);
15749 if (!TARGET_64BIT || x86_64_immediate_operand (offset, Pmode))
15750 {
15751 /* We don't use ix86_gen_add3 in this case because it will
15752 want to split to lea, but when not optimizing the insn
15753 will not be split after this point. */
15754 emit_insn (gen_rtx_SET (scratch_reg,
15755 gen_rtx_PLUS (Pmode, stack_pointer_rtx,
15756 offset)));
15757 }
15758 else
15759 {
15760 emit_move_insn (scratch_reg, offset);
15761 emit_insn (ix86_gen_add3 (scratch_reg, scratch_reg,
15762 stack_pointer_rtx));
15763 }
15764 current = scratch_reg;
15765 }
15766
15767 ix86_expand_branch (GEU, current, limit, label);
15768 rtx_insn *jump_insn = get_last_insn ();
15769 JUMP_LABEL (jump_insn) = label;
15770
15771 /* Mark the jump as very likely to be taken. */
15772 add_int_reg_note (jump_insn, REG_BR_PROB,
15773 REG_BR_PROB_BASE - REG_BR_PROB_BASE / 100);
15774
15775 if (split_stack_fn == NULL_RTX)
15776 {
15777 split_stack_fn = gen_rtx_SYMBOL_REF (Pmode, "__morestack");
15778 SYMBOL_REF_FLAGS (split_stack_fn) |= SYMBOL_FLAG_LOCAL;
15779 }
15780 fn = split_stack_fn;
15781
15782 /* Get more stack space. We pass in the desired stack space and the
15783 size of the arguments to copy to the new stack. In 32-bit mode
15784 we push the parameters; __morestack will return on a new stack
15785 anyhow. In 64-bit mode we pass the parameters in r10 and
15786 r11. */
15787 allocate_rtx = GEN_INT (allocate);
15788 args_size = crtl->args.size >= 0 ? crtl->args.size : 0;
15789 call_fusage = NULL_RTX;
15790 rtx pop = NULL_RTX;
15791 if (TARGET_64BIT)
15792 {
15793 rtx reg10, reg11;
15794
15795 reg10 = gen_rtx_REG (Pmode, R10_REG);
15796 reg11 = gen_rtx_REG (Pmode, R11_REG);
15797
15798 /* If this function uses a static chain, it will be in %r10.
15799 Preserve it across the call to __morestack. */
15800 if (DECL_STATIC_CHAIN (cfun->decl))
15801 {
15802 rtx rax;
15803
15804 rax = gen_rtx_REG (word_mode, AX_REG);
15805 emit_move_insn (rax, gen_rtx_REG (word_mode, R10_REG));
15806 use_reg (&call_fusage, rax);
15807 }
15808
15809 if ((ix86_cmodel == CM_LARGE || ix86_cmodel == CM_LARGE_PIC)
15810 && !TARGET_PECOFF)
15811 {
15812 HOST_WIDE_INT argval;
15813
15814 gcc_assert (Pmode == DImode);
15815 /* When using the large model we need to load the address
15816 into a register, and we've run out of registers. So we
15817 switch to a different calling convention, and we call a
15818 different function: __morestack_large_model. We pass the
15819 argument size in the upper 32 bits of r10 and pass the
15820 frame size in the lower 32 bits. */
15821 gcc_assert ((allocate & HOST_WIDE_INT_C (0xffffffff)) == allocate);
15822 gcc_assert ((args_size & 0xffffffff) == args_size);
15823
15824 if (split_stack_fn_large == NULL_RTX)
15825 {
15826 split_stack_fn_large =
15827 gen_rtx_SYMBOL_REF (Pmode, "__morestack_large_model");
15828 SYMBOL_REF_FLAGS (split_stack_fn_large) |= SYMBOL_FLAG_LOCAL;
15829 }
15830 if (ix86_cmodel == CM_LARGE_PIC)
15831 {
15832 rtx_code_label *label;
15833 rtx x;
15834
15835 label = gen_label_rtx ();
15836 emit_label (label);
15837 LABEL_PRESERVE_P (label) = 1;
15838 emit_insn (gen_set_rip_rex64 (reg10, label));
15839 emit_insn (gen_set_got_offset_rex64 (reg11, label));
15840 emit_insn (ix86_gen_add3 (reg10, reg10, reg11));
15841 x = gen_rtx_UNSPEC (Pmode, gen_rtvec (1, split_stack_fn_large),
15842 UNSPEC_GOT);
15843 x = gen_rtx_CONST (Pmode, x);
15844 emit_move_insn (reg11, x);
15845 x = gen_rtx_PLUS (Pmode, reg10, reg11);
15846 x = gen_const_mem (Pmode, x);
15847 emit_move_insn (reg11, x);
15848 }
15849 else
15850 emit_move_insn (reg11, split_stack_fn_large);
15851
15852 fn = reg11;
15853
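	  /* Pack the argument size into the upper 32 bits and the frame
	     size into the lower 32 bits of the value passed in r10; the
	     shift is split in two so that no single shift count exceeds 16.  */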
15854 argval = ((args_size << 16) << 16) + allocate;
15855 emit_move_insn (reg10, GEN_INT (argval));
15856 }
15857 else
15858 {
15859 emit_move_insn (reg10, allocate_rtx);
15860 emit_move_insn (reg11, GEN_INT (args_size));
15861 use_reg (&call_fusage, reg11);
15862 }
15863
15864 use_reg (&call_fusage, reg10);
15865 }
15866 else
15867 {
15868 rtx_insn *insn = emit_insn (gen_push (GEN_INT (args_size)));
15869 add_reg_note (insn, REG_ARGS_SIZE, GEN_INT (UNITS_PER_WORD));
15870 insn = emit_insn (gen_push (allocate_rtx));
15871 add_reg_note (insn, REG_ARGS_SIZE, GEN_INT (2 * UNITS_PER_WORD));
15872 pop = GEN_INT (2 * UNITS_PER_WORD);
15873 }
15874 call_insn = ix86_expand_call (NULL_RTX, gen_rtx_MEM (QImode, fn),
15875 GEN_INT (UNITS_PER_WORD), constm1_rtx,
15876 pop, false);
15877 add_function_usage_to (call_insn, call_fusage);
15878 if (!TARGET_64BIT)
15879 add_reg_note (call_insn, REG_ARGS_SIZE, GEN_INT (0));
15880 /* Indicate that this function can't jump to non-local gotos. */
15881 make_reg_eh_region_note_nothrow_nononlocal (as_a <rtx_insn *> (call_insn));
15882
15883 /* In order to make call/return prediction work right, we now need
15884 to execute a return instruction. See
15885 libgcc/config/i386/morestack.S for the details on how this works.
15886
15887 For flow purposes gcc must not see this as a return
15888 instruction--we need control flow to continue at the subsequent
15889 label. Therefore, we use an unspec. */
15890 gcc_assert (crtl->args.pops_args < 65536);
15891 emit_insn (gen_split_stack_return (GEN_INT (crtl->args.pops_args)));
15892
15893 /* If we are in 64-bit mode and this function uses a static chain,
15894 we saved %r10 in %rax before calling __morestack. */
15895 if (TARGET_64BIT && DECL_STATIC_CHAIN (cfun->decl))
15896 emit_move_insn (gen_rtx_REG (word_mode, R10_REG),
15897 gen_rtx_REG (word_mode, AX_REG));
15898
15899 /* If this function calls va_start, we need to store a pointer to
15900 the arguments on the old stack, because they may not have been
15901 all copied to the new stack. At this point the old stack can be
15902 found at the frame pointer value used by __morestack, because
15903 __morestack has set that up before calling back to us. Here we
15904 store that pointer in a scratch register, and in
15905 ix86_expand_prologue we store the scratch register in a stack
15906 slot. */
15907 if (cfun->machine->split_stack_varargs_pointer != NULL_RTX)
15908 {
15909 unsigned int scratch_regno;
15910 rtx frame_reg;
15911 int words;
15912
15913 scratch_regno = split_stack_prologue_scratch_regno ();
15914 scratch_reg = gen_rtx_REG (Pmode, scratch_regno);
15915 frame_reg = gen_rtx_REG (Pmode, BP_REG);
15916
15917 /* 64-bit:
15918 fp -> old fp value
15919 return address within this function
15920 return address of caller of this function
15921 stack arguments
15922 So we add three words to get to the stack arguments.
15923
15924 32-bit:
15925 fp -> old fp value
15926 return address within this function
15927 first argument to __morestack
15928 second argument to __morestack
15929 return address of caller of this function
15930 stack arguments
15931 So we add five words to get to the stack arguments.
15932 */
15933 words = TARGET_64BIT ? 3 : 5;
15934 emit_insn (gen_rtx_SET (scratch_reg,
15935 gen_rtx_PLUS (Pmode, frame_reg,
15936 GEN_INT (words * UNITS_PER_WORD))));
15937
15938 varargs_label = gen_label_rtx ();
15939 emit_jump_insn (gen_jump (varargs_label));
15940 JUMP_LABEL (get_last_insn ()) = varargs_label;
15941
15942 emit_barrier ();
15943 }
15944
15945 emit_label (label);
15946 LABEL_NUSES (label) = 1;
15947
15948 /* If this function calls va_start, we now have to set the scratch
15949 register for the case where we do not call __morestack. In this
15950 case we need to set it based on the stack pointer. */
15951 if (cfun->machine->split_stack_varargs_pointer != NULL_RTX)
15952 {
15953 emit_insn (gen_rtx_SET (scratch_reg,
15954 gen_rtx_PLUS (Pmode, stack_pointer_rtx,
15955 GEN_INT (UNITS_PER_WORD))));
15956
15957 emit_label (varargs_label);
15958 LABEL_NUSES (varargs_label) = 1;
15959 }
15960 }
15961
15962 /* We may have to tell the dataflow pass that the split stack prologue
15963 is initializing a scratch register. */
15964
15965 static void
15966 ix86_live_on_entry (bitmap regs)
15967 {
15968 if (cfun->machine->split_stack_varargs_pointer != NULL_RTX)
15969 {
15970 gcc_assert (flag_split_stack);
15971 bitmap_set_bit (regs, split_stack_prologue_scratch_regno ());
15972 }
15973 }
15974 \f
15975 /* Extract the parts of an RTL expression that is a valid memory address
15976 for an instruction. Return 0 if the structure of the address is
15977 grossly off. Return -1 if the address contains ASHIFT, so it is not
15978 strictly valid, but still used for computing length of lea instruction. */
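/* For example, a canonical address such as
     (plus:DI (plus:DI (mult:DI (reg:DI %rax) (const_int 4))
                       (reg:DI %rbx))
              (const_int 16))
   decomposes into base = %rbx, index = %rax, scale = 4 and disp = 16
   (the register choices are purely illustrative).  */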
15979
15980 int
15981 ix86_decompose_address (rtx addr, struct ix86_address *out)
15982 {
15983 rtx base = NULL_RTX, index = NULL_RTX, disp = NULL_RTX;
15984 rtx base_reg, index_reg;
15985 HOST_WIDE_INT scale = 1;
15986 rtx scale_rtx = NULL_RTX;
15987 rtx tmp;
15988 int retval = 1;
15989 addr_space_t seg = ADDR_SPACE_GENERIC;
15990
15991 /* Allow zero-extended SImode addresses,
15992 they will be emitted with addr32 prefix. */
15993 if (TARGET_64BIT && GET_MODE (addr) == DImode)
15994 {
15995 if (GET_CODE (addr) == ZERO_EXTEND
15996 && GET_MODE (XEXP (addr, 0)) == SImode)
15997 {
15998 addr = XEXP (addr, 0);
15999 if (CONST_INT_P (addr))
16000 return 0;
16001 }
16002 else if (GET_CODE (addr) == AND
16003 && const_32bit_mask (XEXP (addr, 1), DImode))
16004 {
16005 addr = lowpart_subreg (SImode, XEXP (addr, 0), DImode);
16006 if (addr == NULL_RTX)
16007 return 0;
16008
16009 if (CONST_INT_P (addr))
16010 return 0;
16011 }
16012 }
16013
16014 /* Allow SImode subregs of DImode addresses,
16015 they will be emitted with addr32 prefix. */
16016 if (TARGET_64BIT && GET_MODE (addr) == SImode)
16017 {
16018 if (SUBREG_P (addr)
16019 && GET_MODE (SUBREG_REG (addr)) == DImode)
16020 {
16021 addr = SUBREG_REG (addr);
16022 if (CONST_INT_P (addr))
16023 return 0;
16024 }
16025 }
16026
16027 if (REG_P (addr))
16028 base = addr;
16029 else if (SUBREG_P (addr))
16030 {
16031 if (REG_P (SUBREG_REG (addr)))
16032 base = addr;
16033 else
16034 return 0;
16035 }
16036 else if (GET_CODE (addr) == PLUS)
16037 {
16038 rtx addends[4], op;
16039 int n = 0, i;
16040
16041 op = addr;
16042 do
16043 {
16044 if (n >= 4)
16045 return 0;
16046 addends[n++] = XEXP (op, 1);
16047 op = XEXP (op, 0);
16048 }
16049 while (GET_CODE (op) == PLUS);
16050 if (n >= 4)
16051 return 0;
16052 addends[n] = op;
16053
16054 for (i = n; i >= 0; --i)
16055 {
16056 op = addends[i];
16057 switch (GET_CODE (op))
16058 {
16059 case MULT:
16060 if (index)
16061 return 0;
16062 index = XEXP (op, 0);
16063 scale_rtx = XEXP (op, 1);
16064 break;
16065
16066 case ASHIFT:
16067 if (index)
16068 return 0;
16069 index = XEXP (op, 0);
16070 tmp = XEXP (op, 1);
16071 if (!CONST_INT_P (tmp))
16072 return 0;
16073 scale = INTVAL (tmp);
16074 if ((unsigned HOST_WIDE_INT) scale > 3)
16075 return 0;
16076 scale = 1 << scale;
16077 break;
16078
16079 case ZERO_EXTEND:
16080 op = XEXP (op, 0);
16081 if (GET_CODE (op) != UNSPEC)
16082 return 0;
16083 /* FALLTHRU */
16084
16085 case UNSPEC:
16086 if (XINT (op, 1) == UNSPEC_TP
16087 && TARGET_TLS_DIRECT_SEG_REFS
16088 && seg == ADDR_SPACE_GENERIC)
16089 seg = DEFAULT_TLS_SEG_REG;
16090 else
16091 return 0;
16092 break;
16093
16094 case SUBREG:
16095 if (!REG_P (SUBREG_REG (op)))
16096 return 0;
16097 /* FALLTHRU */
16098
16099 case REG:
16100 if (!base)
16101 base = op;
16102 else if (!index)
16103 index = op;
16104 else
16105 return 0;
16106 break;
16107
16108 case CONST:
16109 case CONST_INT:
16110 case SYMBOL_REF:
16111 case LABEL_REF:
16112 if (disp)
16113 return 0;
16114 disp = op;
16115 break;
16116
16117 default:
16118 return 0;
16119 }
16120 }
16121 }
16122 else if (GET_CODE (addr) == MULT)
16123 {
16124 index = XEXP (addr, 0); /* index*scale */
16125 scale_rtx = XEXP (addr, 1);
16126 }
16127 else if (GET_CODE (addr) == ASHIFT)
16128 {
16129 /* We're called for lea too, which implements ashift on occasion. */
16130 index = XEXP (addr, 0);
16131 tmp = XEXP (addr, 1);
16132 if (!CONST_INT_P (tmp))
16133 return 0;
16134 scale = INTVAL (tmp);
16135 if ((unsigned HOST_WIDE_INT) scale > 3)
16136 return 0;
16137 scale = 1 << scale;
16138 retval = -1;
16139 }
16140 else
16141 disp = addr; /* displacement */
16142
16143 if (index)
16144 {
16145 if (REG_P (index))
16146 ;
16147 else if (SUBREG_P (index)
16148 && REG_P (SUBREG_REG (index)))
16149 ;
16150 else
16151 return 0;
16152 }
16153
16154 /* Extract the integral value of scale. */
16155 if (scale_rtx)
16156 {
16157 if (!CONST_INT_P (scale_rtx))
16158 return 0;
16159 scale = INTVAL (scale_rtx);
16160 }
16161
16162 base_reg = base && SUBREG_P (base) ? SUBREG_REG (base) : base;
16163 index_reg = index && SUBREG_P (index) ? SUBREG_REG (index) : index;
16164
16165 /* Avoid useless 0 displacement. */
16166 if (disp == const0_rtx && (base || index))
16167 disp = NULL_RTX;
16168
16169 /* Allow arg pointer and stack pointer as index if there is no scaling. */
16170 if (base_reg && index_reg && scale == 1
16171 && (index_reg == arg_pointer_rtx
16172 || index_reg == frame_pointer_rtx
16173 || (REG_P (index_reg) && REGNO (index_reg) == STACK_POINTER_REGNUM)))
16174 {
16175 std::swap (base, index);
16176 std::swap (base_reg, index_reg);
16177 }
16178
16179 /* Special case: %ebp cannot be encoded as a base without a displacement.
16180 Similarly %r13. */
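/* (In the ModR/M encoding the pattern that would mean [%ebp] or [%r13]
   with no displacement is reserved for disp32 and RIP-relative forms,
   so a zero displacement has to be emitted to keep these registers as
   the base.)  */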
16181 if (!disp
16182 && base_reg
16183 && (base_reg == hard_frame_pointer_rtx
16184 || base_reg == frame_pointer_rtx
16185 || base_reg == arg_pointer_rtx
16186 || (REG_P (base_reg)
16187 && (REGNO (base_reg) == HARD_FRAME_POINTER_REGNUM
16188 || REGNO (base_reg) == R13_REG))))
16189 disp = const0_rtx;
16190
16191 /* Special case: on K6, [%esi] causes the instruction to be vector decoded.
16192 Avoid this by transforming it to [%esi+0].
16193 Reload calls address legitimization without cfun defined, so we need
16194 to test cfun for being non-NULL. */
16195 if (TARGET_K6 && cfun && optimize_function_for_speed_p (cfun)
16196 && base_reg && !index_reg && !disp
16197 && REG_P (base_reg) && REGNO (base_reg) == SI_REG)
16198 disp = const0_rtx;
16199
16200 /* Special case: encode reg+reg instead of reg*2. */
16201 if (!base && index && scale == 2)
16202 base = index, base_reg = index_reg, scale = 1;
16203
16204 /* Special case: scaling cannot be encoded without base or displacement. */
16205 if (!base && !disp && index && scale != 1)
16206 disp = const0_rtx;
16207
16208 out->base = base;
16209 out->index = index;
16210 out->disp = disp;
16211 out->scale = scale;
16212 out->seg = seg;
16213
16214 return retval;
16215 }
16216 \f
16217 /* Return the cost of the memory address x.
16218 For i386, it is better to use a complex address than let gcc copy
16219 the address into a reg and make a new pseudo. But not if the address
16220 requires two regs - that would mean more pseudos with longer
16221 lifetimes. */
16222 static int
16223 ix86_address_cost (rtx x, machine_mode, addr_space_t, bool)
16224 {
16225 struct ix86_address parts;
16226 int cost = 1;
16227 int ok = ix86_decompose_address (x, &parts);
16228
16229 gcc_assert (ok);
16230
16231 if (parts.base && SUBREG_P (parts.base))
16232 parts.base = SUBREG_REG (parts.base);
16233 if (parts.index && SUBREG_P (parts.index))
16234 parts.index = SUBREG_REG (parts.index);
16235
16236 /* Attempt to minimize the number of registers in the address by increasing
16237 the address cost for each used register. We don't increase the address cost
16238 for "pic_offset_table_rtx". When a memop with "pic_offset_table_rtx"
16239 is not invariant itself, it most likely means that the base or index is not
16240 invariant. Therefore only "pic_offset_table_rtx" could be hoisted out,
16241 which is not profitable for x86. */
16242 if (parts.base
16243 && (!REG_P (parts.base) || REGNO (parts.base) >= FIRST_PSEUDO_REGISTER)
16244 && (current_pass->type == GIMPLE_PASS
16245 || !pic_offset_table_rtx
16246 || !REG_P (parts.base)
16247 || REGNO (pic_offset_table_rtx) != REGNO (parts.base)))
16248 cost++;
16249
16250 if (parts.index
16251 && (!REG_P (parts.index) || REGNO (parts.index) >= FIRST_PSEUDO_REGISTER)
16252 && (current_pass->type == GIMPLE_PASS
16253 || !pic_offset_table_rtx
16254 || !REG_P (parts.index)
16255 || REGNO (pic_offset_table_rtx) != REGNO (parts.index)))
16256 cost++;
16257
16258 /* The AMD K6 doesn't like addresses with ModR/M set to 00_xxx_100b,
16259 since its predecode logic can't detect the length of instructions
16260 and they degenerate to vector decoding. Increase the cost of such
16261 addresses here. The penalty is minimally 2 cycles. It may be worthwhile
16262 to split such addresses or even refuse such addresses at all.
16263
16264 Following addressing modes are affected:
16265 [base+scale*index]
16266 [scale*index+disp]
16267 [base+index]
16268
16269 The first and last cases may be avoidable by explicitly coding the zero in
16270 the memory address, but I don't have an AMD K6 machine handy to check this
16271 theory. */
16272
16273 if (TARGET_K6
16274 && ((!parts.disp && parts.base && parts.index && parts.scale != 1)
16275 || (parts.disp && !parts.base && parts.index && parts.scale != 1)
16276 || (!parts.disp && parts.base && parts.index && parts.scale == 1)))
16277 cost += 10;
16278
16279 return cost;
16280 }
16281 \f
16282 /* Allow {LABEL | SYMBOL}_REF - SYMBOL_REF-FOR-PICBASE for Mach-O, as
16283 this is used to form addresses to local data when -fPIC is in
16284 use. */
16285
16286 static bool
16287 darwin_local_data_pic (rtx disp)
16288 {
16289 return (GET_CODE (disp) == UNSPEC
16290 && XINT (disp, 1) == UNSPEC_MACHOPIC_OFFSET);
16291 }
16292
16293 /* True if operand X should be loaded from GOT. */
16294
16295 bool
16296 ix86_force_load_from_GOT_p (rtx x)
16297 {
16298 return ((TARGET_64BIT || HAVE_AS_IX86_GOT32X)
16299 && !TARGET_PECOFF && !TARGET_MACHO
16300 && !flag_plt && !flag_pic
16301 && ix86_cmodel != CM_LARGE
16302 && GET_CODE (x) == SYMBOL_REF
16303 && SYMBOL_REF_FUNCTION_P (x)
16304 && !SYMBOL_REF_LOCAL_P (x));
16305 }
16306
16307 /* Determine if a given RTX is a valid constant. We already know this
16308 satisfies CONSTANT_P. */
16309
16310 static bool
16311 ix86_legitimate_constant_p (machine_mode mode, rtx x)
16312 {
16313 /* Pointer bounds constants are not valid. */
16314 if (POINTER_BOUNDS_MODE_P (GET_MODE (x)))
16315 return false;
16316
16317 switch (GET_CODE (x))
16318 {
16319 case CONST:
16320 x = XEXP (x, 0);
16321
16322 if (GET_CODE (x) == PLUS)
16323 {
16324 if (!CONST_INT_P (XEXP (x, 1)))
16325 return false;
16326 x = XEXP (x, 0);
16327 }
16328
16329 if (TARGET_MACHO && darwin_local_data_pic (x))
16330 return true;
16331
16332 /* Only some unspecs are valid as "constants". */
16333 if (GET_CODE (x) == UNSPEC)
16334 switch (XINT (x, 1))
16335 {
16336 case UNSPEC_GOT:
16337 case UNSPEC_GOTOFF:
16338 case UNSPEC_PLTOFF:
16339 return TARGET_64BIT;
16340 case UNSPEC_TPOFF:
16341 case UNSPEC_NTPOFF:
16342 x = XVECEXP (x, 0, 0);
16343 return (GET_CODE (x) == SYMBOL_REF
16344 && SYMBOL_REF_TLS_MODEL (x) == TLS_MODEL_LOCAL_EXEC);
16345 case UNSPEC_DTPOFF:
16346 x = XVECEXP (x, 0, 0);
16347 return (GET_CODE (x) == SYMBOL_REF
16348 && SYMBOL_REF_TLS_MODEL (x) == TLS_MODEL_LOCAL_DYNAMIC);
16349 default:
16350 return false;
16351 }
16352
16353 /* We must have drilled down to a symbol. */
16354 if (GET_CODE (x) == LABEL_REF)
16355 return true;
16356 if (GET_CODE (x) != SYMBOL_REF)
16357 return false;
16358 /* FALLTHRU */
16359
16360 case SYMBOL_REF:
16361 /* TLS symbols are never valid. */
16362 if (SYMBOL_REF_TLS_MODEL (x))
16363 return false;
16364
16365 /* DLLIMPORT symbols are never valid. */
16366 if (TARGET_DLLIMPORT_DECL_ATTRIBUTES
16367 && SYMBOL_REF_DLLIMPORT_P (x))
16368 return false;
16369
16370 #if TARGET_MACHO
16371 /* mdynamic-no-pic */
16372 if (MACHO_DYNAMIC_NO_PIC_P)
16373 return machopic_symbol_defined_p (x);
16374 #endif
16375
16376 /* An external function address should be loaded
16377 via the GOT slot to avoid the PLT. */
16378 if (ix86_force_load_from_GOT_p (x))
16379 return false;
16380
16381 break;
16382
16383 CASE_CONST_SCALAR_INT:
16384 switch (mode)
16385 {
16386 case TImode:
16387 if (TARGET_64BIT)
16388 return true;
16389 /* FALLTHRU */
16390 case OImode:
16391 case XImode:
16392 if (!standard_sse_constant_p (x, mode))
16393 return false;
16394 default:
16395 break;
16396 }
16397 break;
16398
16399 case CONST_VECTOR:
16400 if (!standard_sse_constant_p (x, mode))
16401 return false;
16402
16403 default:
16404 break;
16405 }
16406
16407 /* Otherwise we handle everything else in the move patterns. */
16408 return true;
16409 }
16410
16411 /* Determine if it's legal to put X into the constant pool. This
16412 is not possible for the address of thread-local symbols, which
16413 is checked above. */
16414
16415 static bool
16416 ix86_cannot_force_const_mem (machine_mode mode, rtx x)
16417 {
16418 /* We can put any immediate constant in memory. */
16419 switch (GET_CODE (x))
16420 {
16421 CASE_CONST_ANY:
16422 return false;
16423
16424 default:
16425 break;
16426 }
16427
16428 return !ix86_legitimate_constant_p (mode, x);
16429 }
16430
16431 /* Return true if the symbol is marked as dllimport or as a stub variable,
16432 otherwise false. */
16433
16434 static bool
16435 is_imported_p (rtx x)
16436 {
16437 if (!TARGET_DLLIMPORT_DECL_ATTRIBUTES
16438 || GET_CODE (x) != SYMBOL_REF)
16439 return false;
16440
16441 return SYMBOL_REF_DLLIMPORT_P (x) || SYMBOL_REF_STUBVAR_P (x);
16442 }
16443
16444
16445 /* Nonzero if the constant value X is a legitimate general operand
16446 when generating PIC code. It is given that flag_pic is on and
16447 that X satisfies CONSTANT_P. */
16448
16449 bool
16450 legitimate_pic_operand_p (rtx x)
16451 {
16452 rtx inner;
16453
16454 switch (GET_CODE (x))
16455 {
16456 case CONST:
16457 inner = XEXP (x, 0);
16458 if (GET_CODE (inner) == PLUS
16459 && CONST_INT_P (XEXP (inner, 1)))
16460 inner = XEXP (inner, 0);
16461
16462 /* Only some unspecs are valid as "constants". */
16463 if (GET_CODE (inner) == UNSPEC)
16464 switch (XINT (inner, 1))
16465 {
16466 case UNSPEC_GOT:
16467 case UNSPEC_GOTOFF:
16468 case UNSPEC_PLTOFF:
16469 return TARGET_64BIT;
16470 case UNSPEC_TPOFF:
16471 x = XVECEXP (inner, 0, 0);
16472 return (GET_CODE (x) == SYMBOL_REF
16473 && SYMBOL_REF_TLS_MODEL (x) == TLS_MODEL_LOCAL_EXEC);
16474 case UNSPEC_MACHOPIC_OFFSET:
16475 return legitimate_pic_address_disp_p (x);
16476 default:
16477 return false;
16478 }
16479 /* FALLTHRU */
16480
16481 case SYMBOL_REF:
16482 case LABEL_REF:
16483 return legitimate_pic_address_disp_p (x);
16484
16485 default:
16486 return true;
16487 }
16488 }
16489
16490 /* Determine if a given CONST RTX is a valid memory displacement
16491 in PIC mode. */
16492
16493 bool
16494 legitimate_pic_address_disp_p (rtx disp)
16495 {
16496 bool saw_plus;
16497
16498 /* In 64bit mode we can allow direct addresses of symbols and labels
16499 when they are not dynamic symbols. */
16500 if (TARGET_64BIT)
16501 {
16502 rtx op0 = disp, op1;
16503
16504 switch (GET_CODE (disp))
16505 {
16506 case LABEL_REF:
16507 return true;
16508
16509 case CONST:
16510 if (GET_CODE (XEXP (disp, 0)) != PLUS)
16511 break;
16512 op0 = XEXP (XEXP (disp, 0), 0);
16513 op1 = XEXP (XEXP (disp, 0), 1);
16514 if (!CONST_INT_P (op1)
16515 || INTVAL (op1) >= 16*1024*1024
16516 || INTVAL (op1) < -16*1024*1024)
16517 break;
16518 if (GET_CODE (op0) == LABEL_REF)
16519 return true;
16520 if (GET_CODE (op0) == CONST
16521 && GET_CODE (XEXP (op0, 0)) == UNSPEC
16522 && XINT (XEXP (op0, 0), 1) == UNSPEC_PCREL)
16523 return true;
16524 if (GET_CODE (op0) == UNSPEC
16525 && XINT (op0, 1) == UNSPEC_PCREL)
16526 return true;
16527 if (GET_CODE (op0) != SYMBOL_REF)
16528 break;
16529 /* FALLTHRU */
16530
16531 case SYMBOL_REF:
16532 /* TLS references should always be enclosed in UNSPEC.
16533 A dllimported symbol always needs to be resolved. */
16534 if (SYMBOL_REF_TLS_MODEL (op0)
16535 || (TARGET_DLLIMPORT_DECL_ATTRIBUTES && SYMBOL_REF_DLLIMPORT_P (op0)))
16536 return false;
16537
16538 if (TARGET_PECOFF)
16539 {
16540 if (is_imported_p (op0))
16541 return true;
16542
16543 if (SYMBOL_REF_FAR_ADDR_P (op0)
16544 || !SYMBOL_REF_LOCAL_P (op0))
16545 break;
16546
16547 /* Function symbols need to be resolved only for
16548 the large model.
16549 For the small model we don't need to resolve anything
16550 here. */
16551 if ((ix86_cmodel != CM_LARGE_PIC
16552 && SYMBOL_REF_FUNCTION_P (op0))
16553 || ix86_cmodel == CM_SMALL_PIC)
16554 return true;
16555 /* Non-external symbols don't need to be resolved for
16556 the large and medium models. */
16557 if ((ix86_cmodel == CM_LARGE_PIC
16558 || ix86_cmodel == CM_MEDIUM_PIC)
16559 && !SYMBOL_REF_EXTERNAL_P (op0))
16560 return true;
16561 }
16562 else if (!SYMBOL_REF_FAR_ADDR_P (op0)
16563 && (SYMBOL_REF_LOCAL_P (op0)
16564 || (HAVE_LD_PIE_COPYRELOC
16565 && flag_pie
16566 && !SYMBOL_REF_WEAK (op0)
16567 && !SYMBOL_REF_FUNCTION_P (op0)))
16568 && ix86_cmodel != CM_LARGE_PIC)
16569 return true;
16570 break;
16571
16572 default:
16573 break;
16574 }
16575 }
16576 if (GET_CODE (disp) != CONST)
16577 return false;
16578 disp = XEXP (disp, 0);
16579
16580 if (TARGET_64BIT)
16581 {
16582 /* It is unsafe for us to allow PLUS expressions; this limits the allowed
16583 distance of GOT tables. We should not need these anyway. */
16584 if (GET_CODE (disp) != UNSPEC
16585 || (XINT (disp, 1) != UNSPEC_GOTPCREL
16586 && XINT (disp, 1) != UNSPEC_GOTOFF
16587 && XINT (disp, 1) != UNSPEC_PCREL
16588 && XINT (disp, 1) != UNSPEC_PLTOFF))
16589 return false;
16590
16591 if (GET_CODE (XVECEXP (disp, 0, 0)) != SYMBOL_REF
16592 && GET_CODE (XVECEXP (disp, 0, 0)) != LABEL_REF)
16593 return false;
16594 return true;
16595 }
16596
16597 saw_plus = false;
16598 if (GET_CODE (disp) == PLUS)
16599 {
16600 if (!CONST_INT_P (XEXP (disp, 1)))
16601 return false;
16602 disp = XEXP (disp, 0);
16603 saw_plus = true;
16604 }
16605
16606 if (TARGET_MACHO && darwin_local_data_pic (disp))
16607 return true;
16608
16609 if (GET_CODE (disp) != UNSPEC)
16610 return false;
16611
16612 switch (XINT (disp, 1))
16613 {
16614 case UNSPEC_GOT:
16615 if (saw_plus)
16616 return false;
16617 /* We need to check for both symbols and labels because VxWorks loads
16618 text labels with @GOT rather than @GOTOFF. See gotoff_operand for
16619 details. */
16620 return (GET_CODE (XVECEXP (disp, 0, 0)) == SYMBOL_REF
16621 || GET_CODE (XVECEXP (disp, 0, 0)) == LABEL_REF);
16622 case UNSPEC_GOTOFF:
16623 /* Refuse GOTOFF in 64bit mode since it is always 64bit when used.
16624 While the ABI also specifies a 32bit relocation, we don't produce it in
16625 the small PIC model at all. */
16626 if ((GET_CODE (XVECEXP (disp, 0, 0)) == SYMBOL_REF
16627 || GET_CODE (XVECEXP (disp, 0, 0)) == LABEL_REF)
16628 && !TARGET_64BIT)
16629 return !TARGET_PECOFF && gotoff_operand (XVECEXP (disp, 0, 0), Pmode);
16630 return false;
16631 case UNSPEC_GOTTPOFF:
16632 case UNSPEC_GOTNTPOFF:
16633 case UNSPEC_INDNTPOFF:
16634 if (saw_plus)
16635 return false;
16636 disp = XVECEXP (disp, 0, 0);
16637 return (GET_CODE (disp) == SYMBOL_REF
16638 && SYMBOL_REF_TLS_MODEL (disp) == TLS_MODEL_INITIAL_EXEC);
16639 case UNSPEC_NTPOFF:
16640 disp = XVECEXP (disp, 0, 0);
16641 return (GET_CODE (disp) == SYMBOL_REF
16642 && SYMBOL_REF_TLS_MODEL (disp) == TLS_MODEL_LOCAL_EXEC);
16643 case UNSPEC_DTPOFF:
16644 disp = XVECEXP (disp, 0, 0);
16645 return (GET_CODE (disp) == SYMBOL_REF
16646 && SYMBOL_REF_TLS_MODEL (disp) == TLS_MODEL_LOCAL_DYNAMIC);
16647 }
16648
16649 return false;
16650 }
16651
16652 /* Determine if op is a suitable RTX for an address register.
16653 Return the naked register if a register or a register subreg is
16654 found, otherwise return NULL_RTX. */
16655
16656 static rtx
16657 ix86_validate_address_register (rtx op)
16658 {
16659 machine_mode mode = GET_MODE (op);
16660
16661 /* Only SImode or DImode registers can form the address. */
16662 if (mode != SImode && mode != DImode)
16663 return NULL_RTX;
16664
16665 if (REG_P (op))
16666 return op;
16667 else if (SUBREG_P (op))
16668 {
16669 rtx reg = SUBREG_REG (op);
16670
16671 if (!REG_P (reg))
16672 return NULL_RTX;
16673
16674 mode = GET_MODE (reg);
16675
16676 /* Don't allow SUBREGs that span more than a word. It can
16677 lead to spill failures when the register is one word out
16678 of a two word structure. */
16679 if (GET_MODE_SIZE (mode) > UNITS_PER_WORD)
16680 return NULL_RTX;
16681
16682 /* Allow only SUBREGs of non-eliminable hard registers. */
16683 if (register_no_elim_operand (reg, mode))
16684 return reg;
16685 }
16686
16687 /* Op is not a register. */
16688 return NULL_RTX;
16689 }
16690
16691 /* Recognizes RTL expressions that are valid memory addresses for an
16692 instruction. The MODE argument is the machine mode for the MEM
16693 expression that wants to use this address.
16694
16695 It only recognizes addresses in canonical form. LEGITIMIZE_ADDRESS should
16696 convert common non-canonical forms to canonical form so that they will
16697 be recognized. */
16698
16699 static bool
16700 ix86_legitimate_address_p (machine_mode, rtx addr, bool strict)
16701 {
16702 struct ix86_address parts;
16703 rtx base, index, disp;
16704 HOST_WIDE_INT scale;
16705 addr_space_t seg;
16706
16707 if (ix86_decompose_address (addr, &parts) <= 0)
16708 /* Decomposition failed. */
16709 return false;
16710
16711 base = parts.base;
16712 index = parts.index;
16713 disp = parts.disp;
16714 scale = parts.scale;
16715 seg = parts.seg;
16716
16717 /* Validate base register. */
16718 if (base)
16719 {
16720 rtx reg = ix86_validate_address_register (base);
16721
16722 if (reg == NULL_RTX)
16723 return false;
16724
16725 if ((strict && ! REG_OK_FOR_BASE_STRICT_P (reg))
16726 || (! strict && ! REG_OK_FOR_BASE_NONSTRICT_P (reg)))
16727 /* Base is not valid. */
16728 return false;
16729 }
16730
16731 /* Validate index register. */
16732 if (index)
16733 {
16734 rtx reg = ix86_validate_address_register (index);
16735
16736 if (reg == NULL_RTX)
16737 return false;
16738
16739 if ((strict && ! REG_OK_FOR_INDEX_STRICT_P (reg))
16740 || (! strict && ! REG_OK_FOR_INDEX_NONSTRICT_P (reg)))
16741 /* Index is not valid. */
16742 return false;
16743 }
16744
16745 /* Index and base should have the same mode. */
16746 if (base && index
16747 && GET_MODE (base) != GET_MODE (index))
16748 return false;
16749
16750 /* Address override works only on the (%reg) part of %fs:(%reg). */
16751 if (seg != ADDR_SPACE_GENERIC
16752 && ((base && GET_MODE (base) != word_mode)
16753 || (index && GET_MODE (index) != word_mode)))
16754 return false;
16755
16756 /* Validate scale factor. */
16757 if (scale != 1)
16758 {
16759 if (!index)
16760 /* Scale without index. */
16761 return false;
16762
16763 if (scale != 2 && scale != 4 && scale != 8)
16764 /* Scale is not a valid multiplier. */
16765 return false;
16766 }
16767
16768 /* Validate displacement. */
16769 if (disp)
16770 {
16771 if (GET_CODE (disp) == CONST
16772 && GET_CODE (XEXP (disp, 0)) == UNSPEC
16773 && XINT (XEXP (disp, 0), 1) != UNSPEC_MACHOPIC_OFFSET)
16774 switch (XINT (XEXP (disp, 0), 1))
16775 {
16776 /* Refuse GOTOFF and GOT in 64bit mode since they are always 64bit
16777 when used. While the ABI also specifies 32bit relocations, we
16778 don't produce them at all and use IP-relative addressing instead.
16779 Allow GOT in 32bit mode for both PIC and non-PIC if the symbol
16780 should be loaded via the GOT. */
16781 case UNSPEC_GOT:
16782 if (!TARGET_64BIT
16783 && ix86_force_load_from_GOT_p (XVECEXP (XEXP (disp, 0), 0, 0)))
16784 goto is_legitimate_pic;
16785 /* FALLTHRU */
16786 case UNSPEC_GOTOFF:
16787 gcc_assert (flag_pic);
16788 if (!TARGET_64BIT)
16789 goto is_legitimate_pic;
16790
16791 /* 64bit address unspec. */
16792 return false;
16793
16794 case UNSPEC_GOTPCREL:
16795 if (ix86_force_load_from_GOT_p (XVECEXP (XEXP (disp, 0), 0, 0)))
16796 goto is_legitimate_pic;
16797 /* FALLTHRU */
16798 case UNSPEC_PCREL:
16799 gcc_assert (flag_pic);
16800 goto is_legitimate_pic;
16801
16802 case UNSPEC_GOTTPOFF:
16803 case UNSPEC_GOTNTPOFF:
16804 case UNSPEC_INDNTPOFF:
16805 case UNSPEC_NTPOFF:
16806 case UNSPEC_DTPOFF:
16807 break;
16808
16809 case UNSPEC_STACK_CHECK:
16810 gcc_assert (flag_split_stack);
16811 break;
16812
16813 default:
16814 /* Invalid address unspec. */
16815 return false;
16816 }
16817
16818 else if (SYMBOLIC_CONST (disp)
16819 && (flag_pic
16820 || (TARGET_MACHO
16821 #if TARGET_MACHO
16822 && MACHOPIC_INDIRECT
16823 && !machopic_operand_p (disp)
16824 #endif
16825 )))
16826 {
16827
16828 is_legitimate_pic:
16829 if (TARGET_64BIT && (index || base))
16830 {
16831 /* foo@dtpoff(%rX) is ok. */
16832 if (GET_CODE (disp) != CONST
16833 || GET_CODE (XEXP (disp, 0)) != PLUS
16834 || GET_CODE (XEXP (XEXP (disp, 0), 0)) != UNSPEC
16835 || !CONST_INT_P (XEXP (XEXP (disp, 0), 1))
16836 || (XINT (XEXP (XEXP (disp, 0), 0), 1) != UNSPEC_DTPOFF
16837 && XINT (XEXP (XEXP (disp, 0), 0), 1) != UNSPEC_NTPOFF))
16838 /* Non-constant pic memory reference. */
16839 return false;
16840 }
16841 else if ((!TARGET_MACHO || flag_pic)
16842 && ! legitimate_pic_address_disp_p (disp))
16843 /* Displacement is an invalid pic construct. */
16844 return false;
16845 #if TARGET_MACHO
16846 else if (MACHO_DYNAMIC_NO_PIC_P
16847 && !ix86_legitimate_constant_p (Pmode, disp))
16848 /* The displacement must be referenced via non_lazy_pointer. */
16849 return false;
16850 #endif
16851
16852 /* This code used to verify that a symbolic pic displacement
16853 includes the pic_offset_table_rtx register.
16854
16855 While this is a good idea, unfortunately these constructs may
16856 be created by the "adds using lea" optimization for incorrect
16857 code like:
16858
16859 int a;
16860 int foo(int i)
16861 {
16862 return *(&a+i);
16863 }
16864
16865 This code is nonsensical, but results in addressing the
16866 GOT table with a pic_offset_table_rtx base. We can't
16867 just refuse it easily, since it gets matched by the
16868 "addsi3" pattern, which later gets split to lea when the
16869 output register differs from the input. While this
16870 could be handled by a separate addsi pattern for this case
16871 that never results in lea, disabling this test seems to be
16872 the easier and correct fix for the crash. */
16873 }
16874 else if (GET_CODE (disp) != LABEL_REF
16875 && !CONST_INT_P (disp)
16876 && (GET_CODE (disp) != CONST
16877 || !ix86_legitimate_constant_p (Pmode, disp))
16878 && (GET_CODE (disp) != SYMBOL_REF
16879 || !ix86_legitimate_constant_p (Pmode, disp)))
16880 /* Displacement is not constant. */
16881 return false;
16882 else if (TARGET_64BIT
16883 && !x86_64_immediate_operand (disp, VOIDmode))
16884 /* Displacement is out of range. */
16885 return false;
16886 /* In x32 mode, constant addresses are sign extended to 64bit, so
16887 we have to prevent addresses from 0x80000000 to 0xffffffff. */
16888 else if (TARGET_X32 && !(index || base)
16889 && CONST_INT_P (disp)
16890 && val_signbit_known_set_p (SImode, INTVAL (disp)))
16891 return false;
16892 }
16893
16894 /* Everything looks valid. */
16895 return true;
16896 }
16897
16898 /* Determine if a given RTX is a valid constant address. */
16899
16900 bool
16901 constant_address_p (rtx x)
16902 {
16903 return CONSTANT_P (x) && ix86_legitimate_address_p (Pmode, x, 1);
16904 }
16905 \f
16906 /* Return a unique alias set for the GOT. */
16907
16908 static alias_set_type
16909 ix86_GOT_alias_set (void)
16910 {
16911 static alias_set_type set = -1;
16912 if (set == -1)
16913 set = new_alias_set ();
16914 return set;
16915 }
16916
16917 /* Return a legitimate reference for ORIG (an address) using the
16918 register REG. If REG is 0, a new pseudo is generated.
16919
16920 There are two types of references that must be handled:
16921
16922 1. Global data references must load the address from the GOT, via
16923 the PIC reg. An insn is emitted to do this load, and the reg is
16924 returned.
16925
16926 2. Static data references, constant pool addresses, and code labels
16927 compute the address as an offset from the GOT, whose base is in
16928 the PIC reg. Static data objects have SYMBOL_FLAG_LOCAL set to
16929 differentiate them from global data objects. The returned
16930 address is the PIC reg + an unspec constant.
16931
16932 TARGET_LEGITIMATE_ADDRESS_P rejects symbolic references unless the PIC
16933 reg also appears in the address. */
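/* As a sketch, for 32-bit PIC the two main cases below produce roughly
     (mem (plus pic_reg (const (unspec [sym] UNSPEC_GOT))))      ; global data
   and
     (plus pic_reg (const (unspec [sym] UNSPEC_GOTOFF)))         ; local data
   respectively.  */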
16934
16935 static rtx
16936 legitimize_pic_address (rtx orig, rtx reg)
16937 {
16938 rtx addr = orig;
16939 rtx new_rtx = orig;
16940
16941 #if TARGET_MACHO
16942 if (TARGET_MACHO && !TARGET_64BIT)
16943 {
16944 if (reg == 0)
16945 reg = gen_reg_rtx (Pmode);
16946 /* Use the generic Mach-O PIC machinery. */
16947 return machopic_legitimize_pic_address (orig, GET_MODE (orig), reg);
16948 }
16949 #endif
16950
16951 if (TARGET_64BIT && TARGET_DLLIMPORT_DECL_ATTRIBUTES)
16952 {
16953 rtx tmp = legitimize_pe_coff_symbol (addr, true);
16954 if (tmp)
16955 return tmp;
16956 }
16957
16958 if (TARGET_64BIT && legitimate_pic_address_disp_p (addr))
16959 new_rtx = addr;
16960 else if ((!TARGET_64BIT
16961 || /* TARGET_64BIT && */ ix86_cmodel != CM_SMALL_PIC)
16962 && !TARGET_PECOFF
16963 && gotoff_operand (addr, Pmode))
16964 {
16965 /* This symbol may be referenced via a displacement
16966 from the PIC base address (@GOTOFF). */
16967 if (GET_CODE (addr) == CONST)
16968 addr = XEXP (addr, 0);
16969
16970 if (GET_CODE (addr) == PLUS)
16971 {
16972 new_rtx = gen_rtx_UNSPEC (Pmode, gen_rtvec (1, XEXP (addr, 0)),
16973 UNSPEC_GOTOFF);
16974 new_rtx = gen_rtx_PLUS (Pmode, new_rtx, XEXP (addr, 1));
16975 }
16976 else
16977 new_rtx = gen_rtx_UNSPEC (Pmode, gen_rtvec (1, addr), UNSPEC_GOTOFF);
16978
16979 new_rtx = gen_rtx_CONST (Pmode, new_rtx);
16980
16981 if (TARGET_64BIT)
16982 new_rtx = copy_to_suggested_reg (new_rtx, reg, Pmode);
16983
16984 if (reg != 0)
16985 {
16986 gcc_assert (REG_P (reg));
16987 new_rtx = expand_simple_binop (Pmode, PLUS, pic_offset_table_rtx,
16988 new_rtx, reg, 1, OPTAB_DIRECT);
16989 }
16990 else
16991 new_rtx = gen_rtx_PLUS (Pmode, pic_offset_table_rtx, new_rtx);
16992 }
16993 else if ((GET_CODE (addr) == SYMBOL_REF && SYMBOL_REF_TLS_MODEL (addr) == 0)
16994 /* We can't use @GOTOFF for text labels
16995 on VxWorks, see gotoff_operand. */
16996 || (TARGET_VXWORKS_RTP && GET_CODE (addr) == LABEL_REF))
16997 {
16998 rtx tmp = legitimize_pe_coff_symbol (addr, true);
16999 if (tmp)
17000 return tmp;
17001
17002 /* For x64 PE-COFF there is no GOT table,
17003 so we use the address directly. */
17004 if (TARGET_64BIT && TARGET_PECOFF)
17005 {
17006 new_rtx = gen_rtx_UNSPEC (Pmode, gen_rtvec (1, addr), UNSPEC_PCREL);
17007 new_rtx = gen_rtx_CONST (Pmode, new_rtx);
17008 }
17009 else if (TARGET_64BIT && ix86_cmodel != CM_LARGE_PIC)
17010 {
17011 new_rtx = gen_rtx_UNSPEC (Pmode, gen_rtvec (1, addr),
17012 UNSPEC_GOTPCREL);
17013 new_rtx = gen_rtx_CONST (Pmode, new_rtx);
17014 new_rtx = gen_const_mem (Pmode, new_rtx);
17015 set_mem_alias_set (new_rtx, ix86_GOT_alias_set ());
17016 }
17017 else
17018 {
17019 /* This symbol must be referenced via a load
17020 from the Global Offset Table (@GOT). */
17021 new_rtx = gen_rtx_UNSPEC (Pmode, gen_rtvec (1, addr), UNSPEC_GOT);
17022 new_rtx = gen_rtx_CONST (Pmode, new_rtx);
17023 if (TARGET_64BIT)
17024 new_rtx = force_reg (Pmode, new_rtx);
17025 new_rtx = gen_rtx_PLUS (Pmode, pic_offset_table_rtx, new_rtx);
17026 new_rtx = gen_const_mem (Pmode, new_rtx);
17027 set_mem_alias_set (new_rtx, ix86_GOT_alias_set ());
17028 }
17029
17030 new_rtx = copy_to_suggested_reg (new_rtx, reg, Pmode);
17031 }
17032 else
17033 {
17034 if (CONST_INT_P (addr)
17035 && !x86_64_immediate_operand (addr, VOIDmode))
17036 new_rtx = copy_to_suggested_reg (addr, reg, Pmode);
17037 else if (GET_CODE (addr) == CONST)
17038 {
17039 addr = XEXP (addr, 0);
17040
17044 /* We must match what we generated earlier. Assume the only
17045 unspecs that can get here are ours; not that we could do
17046 anything with them anyway. */
17044 if (GET_CODE (addr) == UNSPEC
17045 || (GET_CODE (addr) == PLUS
17046 && GET_CODE (XEXP (addr, 0)) == UNSPEC))
17047 return orig;
17048 gcc_assert (GET_CODE (addr) == PLUS);
17049 }
17050
17051 if (GET_CODE (addr) == PLUS)
17052 {
17053 rtx op0 = XEXP (addr, 0), op1 = XEXP (addr, 1);
17054
17055 /* Check first to see if this is a constant
17056 offset from a @GOTOFF symbol reference. */
17057 if (!TARGET_PECOFF
17058 && gotoff_operand (op0, Pmode)
17059 && CONST_INT_P (op1))
17060 {
17061 if (!TARGET_64BIT)
17062 {
17063 new_rtx = gen_rtx_UNSPEC (Pmode, gen_rtvec (1, op0),
17064 UNSPEC_GOTOFF);
17065 new_rtx = gen_rtx_PLUS (Pmode, new_rtx, op1);
17066 new_rtx = gen_rtx_CONST (Pmode, new_rtx);
17067
17068 if (reg != 0)
17069 {
17070 gcc_assert (REG_P (reg));
17071 new_rtx = expand_simple_binop (Pmode, PLUS,
17072 pic_offset_table_rtx,
17073 new_rtx, reg, 1,
17074 OPTAB_DIRECT);
17075 }
17076 else
17077 new_rtx
17078 = gen_rtx_PLUS (Pmode, pic_offset_table_rtx, new_rtx);
17079 }
17080 else
17081 {
17082 if (INTVAL (op1) < -16*1024*1024
17083 || INTVAL (op1) >= 16*1024*1024)
17084 {
17085 if (!x86_64_immediate_operand (op1, Pmode))
17086 op1 = force_reg (Pmode, op1);
17087
17088 new_rtx
17089 = gen_rtx_PLUS (Pmode, force_reg (Pmode, op0), op1);
17090 }
17091 }
17092 }
17093 else
17094 {
17095 rtx base = legitimize_pic_address (op0, reg);
17096 machine_mode mode = GET_MODE (base);
17097 new_rtx
17098 = legitimize_pic_address (op1, base == reg ? NULL_RTX : reg);
17099
17100 if (CONST_INT_P (new_rtx))
17101 {
17102 if (INTVAL (new_rtx) < -16*1024*1024
17103 || INTVAL (new_rtx) >= 16*1024*1024)
17104 {
17105 if (!x86_64_immediate_operand (new_rtx, mode))
17106 new_rtx = force_reg (mode, new_rtx);
17107
17108 new_rtx
17109 = gen_rtx_PLUS (mode, force_reg (mode, base), new_rtx);
17110 }
17111 else
17112 new_rtx = plus_constant (mode, base, INTVAL (new_rtx));
17113 }
17114 else
17115 {
17119 /* For %rip addressing, we have to use
17120 just disp32, neither base nor index. */
17118 if (TARGET_64BIT
17119 && (GET_CODE (base) == SYMBOL_REF
17120 || GET_CODE (base) == LABEL_REF))
17121 base = force_reg (mode, base);
17122 if (GET_CODE (new_rtx) == PLUS
17123 && CONSTANT_P (XEXP (new_rtx, 1)))
17124 {
17125 base = gen_rtx_PLUS (mode, base, XEXP (new_rtx, 0));
17126 new_rtx = XEXP (new_rtx, 1);
17127 }
17128 new_rtx = gen_rtx_PLUS (mode, base, new_rtx);
17129 }
17130 }
17131 }
17132 }
17133 return new_rtx;
17134 }
17135 \f
17136 /* Load the thread pointer. If TO_REG is true, force it into a register. */
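/* On GNU/Linux-style targets the thread pointer lives at offset 0 of the
   thread control block addressed through the TLS segment register, so the
   UNSPEC_TP pattern typically ends up as something like "movq %fs:0, %rax"
   (64-bit) or "movl %gs:0, %eax" (ia32); a sketch, not a guarantee for
   every configuration.  */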
17137
17138 static rtx
17139 get_thread_pointer (machine_mode tp_mode, bool to_reg)
17140 {
17141 rtx tp = gen_rtx_UNSPEC (ptr_mode, gen_rtvec (1, const0_rtx), UNSPEC_TP);
17142
17143 if (GET_MODE (tp) != tp_mode)
17144 {
17145 gcc_assert (GET_MODE (tp) == SImode);
17146 gcc_assert (tp_mode == DImode);
17147
17148 tp = gen_rtx_ZERO_EXTEND (tp_mode, tp);
17149 }
17150
17151 if (to_reg)
17152 tp = copy_to_mode_reg (tp_mode, tp);
17153
17154 return tp;
17155 }
17156
17157 /* Construct the SYMBOL_REF for the tls_get_addr function. */
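/* ELF TLS ABI aside (an assumption about the usual GNU toolchain, not
   something enforced here): the ia32 GNU-dialect helper is spelled
   ___tls_get_addr and takes its argument in %eax, whereas the generic
   __tls_get_addr takes it on the stack (ia32) or in %rdi (x86-64).  */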
17158
17159 static GTY(()) rtx ix86_tls_symbol;
17160
17161 static rtx
17162 ix86_tls_get_addr (void)
17163 {
17164 if (!ix86_tls_symbol)
17165 {
17166 const char *sym
17167 = ((TARGET_ANY_GNU_TLS && !TARGET_64BIT)
17168 ? "___tls_get_addr" : "__tls_get_addr");
17169
17170 ix86_tls_symbol = gen_rtx_SYMBOL_REF (Pmode, sym);
17171 }
17172
17173 if (ix86_cmodel == CM_LARGE_PIC && !TARGET_PECOFF)
17174 {
17175 rtx unspec = gen_rtx_UNSPEC (Pmode, gen_rtvec (1, ix86_tls_symbol),
17176 UNSPEC_PLTOFF);
17177 return gen_rtx_PLUS (Pmode, pic_offset_table_rtx,
17178 gen_rtx_CONST (Pmode, unspec));
17179 }
17180
17181 return ix86_tls_symbol;
17182 }
17183
17184 /* Construct the SYMBOL_REF for the _TLS_MODULE_BASE_ symbol. */
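/* _TLS_MODULE_BASE_ is, by linker convention we merely rely on here,
   resolved to the base of the current module's TLS block, so GNU2/TLSDESC
   local-dynamic accesses can add an x@dtpoff offset to it.  */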
17185
17186 static GTY(()) rtx ix86_tls_module_base_symbol;
17187
17188 rtx
17189 ix86_tls_module_base (void)
17190 {
17191 if (!ix86_tls_module_base_symbol)
17192 {
17193 ix86_tls_module_base_symbol
17194 = gen_rtx_SYMBOL_REF (Pmode, "_TLS_MODULE_BASE_");
17195
17196 SYMBOL_REF_FLAGS (ix86_tls_module_base_symbol)
17197 |= TLS_MODEL_GLOBAL_DYNAMIC << SYMBOL_FLAG_TLS_SHIFT;
17198 }
17199
17200 return ix86_tls_module_base_symbol;
17201 }
17202
17203 /* A subroutine of ix86_legitimize_address and ix86_expand_move. FOR_MOV is
17204 false if we expect this to be used for a memory address and true if
17205 we expect to load the address into a register. */
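/* Very roughly, and only as a sketch of what the patterns below expand to
   on GNU/Linux (padding prefixes and exact relocation spellings vary):

     global dynamic (ia32):    leal x@tlsgd(,%ebx,1), %eax
                               call ___tls_get_addr@PLT
     global dynamic (x86-64):  leaq x@tlsgd(%rip), %rdi
                               call __tls_get_addr@PLT
     local dynamic:            one __tls_get_addr call for the module base,
                               then x@dtpoff displacements from the result
     initial exec (x86-64):    movq x@gottpoff(%rip), %rax
                               movq %fs:(%rax), ...
     local exec (x86-64):      movq %fs:x@tpoff, %rax  */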
17206
17207 static rtx
17208 legitimize_tls_address (rtx x, enum tls_model model, bool for_mov)
17209 {
17210 rtx dest, base, off;
17211 rtx pic = NULL_RTX, tp = NULL_RTX;
17212 machine_mode tp_mode = Pmode;
17213 int type;
17214
17218 /* Fall back to the global dynamic model if the toolchain cannot
17219 support local dynamic. */
17217 if (TARGET_SUN_TLS && !TARGET_64BIT
17218 && !HAVE_AS_IX86_TLSLDMPLT && !HAVE_AS_IX86_TLSLDM
17219 && model == TLS_MODEL_LOCAL_DYNAMIC)
17220 model = TLS_MODEL_GLOBAL_DYNAMIC;
17221
17222 switch (model)
17223 {
17224 case TLS_MODEL_GLOBAL_DYNAMIC:
17225 dest = gen_reg_rtx (Pmode);
17226
17227 if (!TARGET_64BIT)
17228 {
17229 if (flag_pic && !TARGET_PECOFF)
17230 pic = pic_offset_table_rtx;
17231 else
17232 {
17233 pic = gen_reg_rtx (Pmode);
17234 emit_insn (gen_set_got (pic));
17235 }
17236 }
17237
17238 if (TARGET_GNU2_TLS)
17239 {
17240 if (TARGET_64BIT)
17241 emit_insn (gen_tls_dynamic_gnu2_64 (dest, x));
17242 else
17243 emit_insn (gen_tls_dynamic_gnu2_32 (dest, x, pic));
17244
17245 tp = get_thread_pointer (Pmode, true);
17246 dest = force_reg (Pmode, gen_rtx_PLUS (Pmode, tp, dest));
17247
17248 if (GET_MODE (x) != Pmode)
17249 x = gen_rtx_ZERO_EXTEND (Pmode, x);
17250
17251 set_unique_reg_note (get_last_insn (), REG_EQUAL, x);
17252 }
17253 else
17254 {
17255 rtx caddr = ix86_tls_get_addr ();
17256
17257 if (TARGET_64BIT)
17258 {
17259 rtx rax = gen_rtx_REG (Pmode, AX_REG);
17260 rtx_insn *insns;
17261
17262 start_sequence ();
17263 emit_call_insn
17264 (ix86_gen_tls_global_dynamic_64 (rax, x, caddr));
17265 insns = get_insns ();
17266 end_sequence ();
17267
17268 if (GET_MODE (x) != Pmode)
17269 x = gen_rtx_ZERO_EXTEND (Pmode, x);
17270
17271 RTL_CONST_CALL_P (insns) = 1;
17272 emit_libcall_block (insns, dest, rax, x);
17273 }
17274 else
17275 emit_insn (gen_tls_global_dynamic_32 (dest, x, pic, caddr));
17276 }
17277 break;
17278
17279 case TLS_MODEL_LOCAL_DYNAMIC:
17280 base = gen_reg_rtx (Pmode);
17281
17282 if (!TARGET_64BIT)
17283 {
17284 if (flag_pic)
17285 pic = pic_offset_table_rtx;
17286 else
17287 {
17288 pic = gen_reg_rtx (Pmode);
17289 emit_insn (gen_set_got (pic));
17290 }
17291 }
17292
17293 if (TARGET_GNU2_TLS)
17294 {
17295 rtx tmp = ix86_tls_module_base ();
17296
17297 if (TARGET_64BIT)
17298 emit_insn (gen_tls_dynamic_gnu2_64 (base, tmp));
17299 else
17300 emit_insn (gen_tls_dynamic_gnu2_32 (base, tmp, pic));
17301
17302 tp = get_thread_pointer (Pmode, true);
17303 set_unique_reg_note (get_last_insn (), REG_EQUAL,
17304 gen_rtx_MINUS (Pmode, tmp, tp));
17305 }
17306 else
17307 {
17308 rtx caddr = ix86_tls_get_addr ();
17309
17310 if (TARGET_64BIT)
17311 {
17312 rtx rax = gen_rtx_REG (Pmode, AX_REG);
17313 rtx_insn *insns;
17314 rtx eqv;
17315
17316 start_sequence ();
17317 emit_call_insn
17318 (ix86_gen_tls_local_dynamic_base_64 (rax, caddr));
17319 insns = get_insns ();
17320 end_sequence ();
17321
17322 /* Attach a unique REG_EQUAL, to allow the RTL optimizers to
17323 share the LD_BASE result with other LD model accesses. */
17324 eqv = gen_rtx_UNSPEC (Pmode, gen_rtvec (1, const0_rtx),
17325 UNSPEC_TLS_LD_BASE);
17326
17327 RTL_CONST_CALL_P (insns) = 1;
17328 emit_libcall_block (insns, base, rax, eqv);
17329 }
17330 else
17331 emit_insn (gen_tls_local_dynamic_base_32 (base, pic, caddr));
17332 }
17333
17334 off = gen_rtx_UNSPEC (Pmode, gen_rtvec (1, x), UNSPEC_DTPOFF);
17335 off = gen_rtx_CONST (Pmode, off);
17336
17337 dest = force_reg (Pmode, gen_rtx_PLUS (Pmode, base, off));
17338
17339 if (TARGET_GNU2_TLS)
17340 {
17341 dest = force_reg (Pmode, gen_rtx_PLUS (Pmode, dest, tp));
17342
17343 if (GET_MODE (x) != Pmode)
17344 x = gen_rtx_ZERO_EXTEND (Pmode, x);
17345
17346 set_unique_reg_note (get_last_insn (), REG_EQUAL, x);
17347 }
17348 break;
17349
17350 case TLS_MODEL_INITIAL_EXEC:
17351 if (TARGET_64BIT)
17352 {
17353 if (TARGET_SUN_TLS && !TARGET_X32)
17354 {
17358 /* The Sun linker took the AMD64 TLS spec literally
17359 and can only handle %rax as the destination of the
17360 initial-exec code sequence. */
17358
17359 dest = gen_reg_rtx (DImode);
17360 emit_insn (gen_tls_initial_exec_64_sun (dest, x));
17361 return dest;
17362 }
17363
17367 /* Generate DImode references to avoid %fs:(%reg32)
17368 problems and the linker IE->LE relaxation bug. */
17366 tp_mode = DImode;
17367 pic = NULL;
17368 type = UNSPEC_GOTNTPOFF;
17369 }
17370 else if (flag_pic)
17371 {
17372 pic = pic_offset_table_rtx;
17373 type = TARGET_ANY_GNU_TLS ? UNSPEC_GOTNTPOFF : UNSPEC_GOTTPOFF;
17374 }
17375 else if (!TARGET_ANY_GNU_TLS)
17376 {
17377 pic = gen_reg_rtx (Pmode);
17378 emit_insn (gen_set_got (pic));
17379 type = UNSPEC_GOTTPOFF;
17380 }
17381 else
17382 {
17383 pic = NULL;
17384 type = UNSPEC_INDNTPOFF;
17385 }
17386
17387 off = gen_rtx_UNSPEC (tp_mode, gen_rtvec (1, x), type);
17388 off = gen_rtx_CONST (tp_mode, off);
17389 if (pic)
17390 off = gen_rtx_PLUS (tp_mode, pic, off);
17391 off = gen_const_mem (tp_mode, off);
17392 set_mem_alias_set (off, ix86_GOT_alias_set ());
17393
17394 if (TARGET_64BIT || TARGET_ANY_GNU_TLS)
17395 {
17396 base = get_thread_pointer (tp_mode,
17397 for_mov || !TARGET_TLS_DIRECT_SEG_REFS);
17398 off = force_reg (tp_mode, off);
17399 dest = gen_rtx_PLUS (tp_mode, base, off);
17400 if (tp_mode != Pmode)
17401 dest = convert_to_mode (Pmode, dest, 1);
17402 }
17403 else
17404 {
17405 base = get_thread_pointer (Pmode, true);
17406 dest = gen_reg_rtx (Pmode);
17407 emit_insn (ix86_gen_sub3 (dest, base, off));
17408 }
17409 break;
17410
17411 case TLS_MODEL_LOCAL_EXEC:
17412 off = gen_rtx_UNSPEC (Pmode, gen_rtvec (1, x),
17413 (TARGET_64BIT || TARGET_ANY_GNU_TLS)
17414 ? UNSPEC_NTPOFF : UNSPEC_TPOFF);
17415 off = gen_rtx_CONST (Pmode, off);
17416
17417 if (TARGET_64BIT || TARGET_ANY_GNU_TLS)
17418 {
17419 base = get_thread_pointer (Pmode,
17420 for_mov || !TARGET_TLS_DIRECT_SEG_REFS);
17421 return gen_rtx_PLUS (Pmode, base, off);
17422 }
17423 else
17424 {
17425 base = get_thread_pointer (Pmode, true);
17426 dest = gen_reg_rtx (Pmode);
17427 emit_insn (ix86_gen_sub3 (dest, base, off));
17428 }
17429 break;
17430
17431 default:
17432 gcc_unreachable ();
17433 }
17434
17435 return dest;
17436 }
17437
17438 /* Create or return the unique __imp_DECL dllimport symbol corresponding
17439 to symbol DECL if BEIMPORT is true. Otherwise create or return the
17440 unique refptr-DECL symbol corresponding to symbol DECL. */
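/* Background (PE-COFF convention, not specific to this function): for a
   dllimport'ed FOO the import library provides a pointer named __imp_FOO
   that the loader fills in, so references become indirect accesses through
   that slot, e.g. "call *__imp_foo(%rip)" on x64; the refptr-FOO variant
   plays a similar role for far data under the medium/large code models.  */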
17441
17442 struct dllimport_hasher : ggc_cache_ptr_hash<tree_map>
17443 {
17444 static inline hashval_t hash (tree_map *m) { return m->hash; }
17445 static inline bool
17446 equal (tree_map *a, tree_map *b)
17447 {
17448 return a->base.from == b->base.from;
17449 }
17450
17451 static int
17452 keep_cache_entry (tree_map *&m)
17453 {
17454 return ggc_marked_p (m->base.from);
17455 }
17456 };
17457
17458 static GTY((cache)) hash_table<dllimport_hasher> *dllimport_map;
17459
17460 static tree
17461 get_dllimport_decl (tree decl, bool beimport)
17462 {
17463 struct tree_map *h, in;
17464 const char *name;
17465 const char *prefix;
17466 size_t namelen, prefixlen;
17467 char *imp_name;
17468 tree to;
17469 rtx rtl;
17470
17471 if (!dllimport_map)
17472 dllimport_map = hash_table<dllimport_hasher>::create_ggc (512);
17473
17474 in.hash = htab_hash_pointer (decl);
17475 in.base.from = decl;
17476 tree_map **loc = dllimport_map->find_slot_with_hash (&in, in.hash, INSERT);
17477 h = *loc;
17478 if (h)
17479 return h->to;
17480
17481 *loc = h = ggc_alloc<tree_map> ();
17482 h->hash = in.hash;
17483 h->base.from = decl;
17484 h->to = to = build_decl (DECL_SOURCE_LOCATION (decl),
17485 VAR_DECL, NULL, ptr_type_node);
17486 DECL_ARTIFICIAL (to) = 1;
17487 DECL_IGNORED_P (to) = 1;
17488 DECL_EXTERNAL (to) = 1;
17489 TREE_READONLY (to) = 1;
17490
17491 name = IDENTIFIER_POINTER (DECL_ASSEMBLER_NAME (decl));
17492 name = targetm.strip_name_encoding (name);
17493 if (beimport)
17494 prefix = name[0] == FASTCALL_PREFIX || user_label_prefix[0] == 0
17495 ? "*__imp_" : "*__imp__";
17496 else
17497 prefix = user_label_prefix[0] == 0 ? "*.refptr." : "*refptr.";
17498 namelen = strlen (name);
17499 prefixlen = strlen (prefix);
17500 imp_name = (char *) alloca (namelen + prefixlen + 1);
17501 memcpy (imp_name, prefix, prefixlen);
17502 memcpy (imp_name + prefixlen, name, namelen + 1);
17503
17504 name = ggc_alloc_string (imp_name, namelen + prefixlen);
17505 rtl = gen_rtx_SYMBOL_REF (Pmode, name);
17506 SET_SYMBOL_REF_DECL (rtl, to);
17507 SYMBOL_REF_FLAGS (rtl) = SYMBOL_FLAG_LOCAL | SYMBOL_FLAG_STUBVAR;
17508 if (!beimport)
17509 {
17510 SYMBOL_REF_FLAGS (rtl) |= SYMBOL_FLAG_EXTERNAL;
17511 #ifdef SUB_TARGET_RECORD_STUB
17512 SUB_TARGET_RECORD_STUB (name);
17513 #endif
17514 }
17515
17516 rtl = gen_const_mem (Pmode, rtl);
17517 set_mem_alias_set (rtl, ix86_GOT_alias_set ());
17518
17519 SET_DECL_RTL (to, rtl);
17520 SET_DECL_ASSEMBLER_NAME (to, get_identifier (name));
17521
17522 return to;
17523 }
17524
17525 /* Expand SYMBOL into its corresponding far-address symbol.
17526 WANT_REG is true if we require the result be a register. */
17527
17528 static rtx
17529 legitimize_pe_coff_extern_decl (rtx symbol, bool want_reg)
17530 {
17531 tree imp_decl;
17532 rtx x;
17533
17534 gcc_assert (SYMBOL_REF_DECL (symbol));
17535 imp_decl = get_dllimport_decl (SYMBOL_REF_DECL (symbol), false);
17536
17537 x = DECL_RTL (imp_decl);
17538 if (want_reg)
17539 x = force_reg (Pmode, x);
17540 return x;
17541 }
17542
17543 /* Expand SYMBOL into its corresponding dllimport symbol. WANT_REG is
17544 true if we require the result be a register. */
17545
17546 static rtx
17547 legitimize_dllimport_symbol (rtx symbol, bool want_reg)
17548 {
17549 tree imp_decl;
17550 rtx x;
17551
17552 gcc_assert (SYMBOL_REF_DECL (symbol));
17553 imp_decl = get_dllimport_decl (SYMBOL_REF_DECL (symbol), true);
17554
17555 x = DECL_RTL (imp_decl);
17556 if (want_reg)
17557 x = force_reg (Pmode, x);
17558 return x;
17559 }
17560
17564 /* Expand ADDR into its corresponding dllimport or refptr symbol. INREG
17565 is true if we require the result to be in a register. */
17563
17564 static rtx
17565 legitimize_pe_coff_symbol (rtx addr, bool inreg)
17566 {
17567 if (!TARGET_PECOFF)
17568 return NULL_RTX;
17569
17570 if (TARGET_DLLIMPORT_DECL_ATTRIBUTES)
17571 {
17572 if (GET_CODE (addr) == SYMBOL_REF && SYMBOL_REF_DLLIMPORT_P (addr))
17573 return legitimize_dllimport_symbol (addr, inreg);
17574 if (GET_CODE (addr) == CONST
17575 && GET_CODE (XEXP (addr, 0)) == PLUS
17576 && GET_CODE (XEXP (XEXP (addr, 0), 0)) == SYMBOL_REF
17577 && SYMBOL_REF_DLLIMPORT_P (XEXP (XEXP (addr, 0), 0)))
17578 {
17579 rtx t = legitimize_dllimport_symbol (XEXP (XEXP (addr, 0), 0), inreg);
17580 return gen_rtx_PLUS (Pmode, t, XEXP (XEXP (addr, 0), 1));
17581 }
17582 }
17583
17584 if (ix86_cmodel != CM_LARGE_PIC && ix86_cmodel != CM_MEDIUM_PIC)
17585 return NULL_RTX;
17586 if (GET_CODE (addr) == SYMBOL_REF
17587 && !is_imported_p (addr)
17588 && SYMBOL_REF_EXTERNAL_P (addr)
17589 && SYMBOL_REF_DECL (addr))
17590 return legitimize_pe_coff_extern_decl (addr, inreg);
17591
17592 if (GET_CODE (addr) == CONST
17593 && GET_CODE (XEXP (addr, 0)) == PLUS
17594 && GET_CODE (XEXP (XEXP (addr, 0), 0)) == SYMBOL_REF
17595 && !is_imported_p (XEXP (XEXP (addr, 0), 0))
17596 && SYMBOL_REF_EXTERNAL_P (XEXP (XEXP (addr, 0), 0))
17597 && SYMBOL_REF_DECL (XEXP (XEXP (addr, 0), 0)))
17598 {
17599 rtx t = legitimize_pe_coff_extern_decl (XEXP (XEXP (addr, 0), 0), inreg);
17600 return gen_rtx_PLUS (Pmode, t, XEXP (XEXP (addr, 0), 1));
17601 }
17602 return NULL_RTX;
17603 }
17604
17605 /* Try machine-dependent ways of modifying an illegitimate address
17606 to be legitimate. If we find one, return the new, valid address.
17610 This function is used in only one place: `memory_address' in explow.c.
17608
17609 OLDX is the address as it was before break_out_memory_refs was called.
17610 In some cases it is useful to look at this to decide what needs to be done.
17611
17615 It is always safe for this function to do nothing. It exists to recognize
17613 opportunities to optimize the output.
17614
17615 For the 80386, we handle X+REG by loading X into a register R and
17616 using R+REG. R will go in a general reg and indexing will be used.
17617 However, if REG is a broken-out memory address or multiplication,
17618 nothing needs to be done because REG can certainly go in a general reg.
17619
17620 When -fpic is used, special handling is needed for symbolic references.
17621 See comments by legitimize_pic_address in i386.c for details. */
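/* Example of the canonicalizations below (illustrative only): an address
   such as (plus (reg X) (ashift (reg Y) (const_int 3))) is rewritten to
   (plus (reg X) (mult (reg Y) (const_int 8))), which ix86_decompose_address
   accepts as the scaled-index form and which assembles to the likes of
   "(%rax,%rbx,8)".  */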
17622
17623 static rtx
17624 ix86_legitimize_address (rtx x, rtx, machine_mode mode)
17625 {
17626 bool changed = false;
17627 unsigned log;
17628
17629 log = GET_CODE (x) == SYMBOL_REF ? SYMBOL_REF_TLS_MODEL (x) : 0;
17630 if (log)
17631 return legitimize_tls_address (x, (enum tls_model) log, false);
17632 if (GET_CODE (x) == CONST
17633 && GET_CODE (XEXP (x, 0)) == PLUS
17634 && GET_CODE (XEXP (XEXP (x, 0), 0)) == SYMBOL_REF
17635 && (log = SYMBOL_REF_TLS_MODEL (XEXP (XEXP (x, 0), 0))))
17636 {
17637 rtx t = legitimize_tls_address (XEXP (XEXP (x, 0), 0),
17638 (enum tls_model) log, false);
17639 return gen_rtx_PLUS (Pmode, t, XEXP (XEXP (x, 0), 1));
17640 }
17641
17642 if (TARGET_DLLIMPORT_DECL_ATTRIBUTES)
17643 {
17644 rtx tmp = legitimize_pe_coff_symbol (x, true);
17645 if (tmp)
17646 return tmp;
17647 }
17648
17649 if (flag_pic && SYMBOLIC_CONST (x))
17650 return legitimize_pic_address (x, 0);
17651
17652 #if TARGET_MACHO
17653 if (MACHO_DYNAMIC_NO_PIC_P && SYMBOLIC_CONST (x))
17654 return machopic_indirect_data_reference (x, 0);
17655 #endif
17656
17660 /* Canonicalize shifts by 0, 1, 2, 3 into multiply. */
17658 if (GET_CODE (x) == ASHIFT
17659 && CONST_INT_P (XEXP (x, 1))
17660 && (unsigned HOST_WIDE_INT) INTVAL (XEXP (x, 1)) < 4)
17661 {
17662 changed = true;
17663 log = INTVAL (XEXP (x, 1));
17664 x = gen_rtx_MULT (Pmode, force_reg (Pmode, XEXP (x, 0)),
17665 GEN_INT (1 << log));
17666 }
17667
17668 if (GET_CODE (x) == PLUS)
17669 {
17670 /* Canonicalize shifts by 0, 1, 2, 3 into multiply. */
17671
17672 if (GET_CODE (XEXP (x, 0)) == ASHIFT
17673 && CONST_INT_P (XEXP (XEXP (x, 0), 1))
17674 && (unsigned HOST_WIDE_INT) INTVAL (XEXP (XEXP (x, 0), 1)) < 4)
17675 {
17676 changed = true;
17677 log = INTVAL (XEXP (XEXP (x, 0), 1));
17678 XEXP (x, 0) = gen_rtx_MULT (Pmode,
17679 force_reg (Pmode, XEXP (XEXP (x, 0), 0)),
17680 GEN_INT (1 << log));
17681 }
17682
17683 if (GET_CODE (XEXP (x, 1)) == ASHIFT
17684 && CONST_INT_P (XEXP (XEXP (x, 1), 1))
17685 && (unsigned HOST_WIDE_INT) INTVAL (XEXP (XEXP (x, 1), 1)) < 4)
17686 {
17687 changed = true;
17688 log = INTVAL (XEXP (XEXP (x, 1), 1));
17689 XEXP (x, 1) = gen_rtx_MULT (Pmode,
17690 force_reg (Pmode, XEXP (XEXP (x, 1), 0)),
17691 GEN_INT (1 << log));
17692 }
17693
17694 /* Put multiply first if it isn't already. */
17695 if (GET_CODE (XEXP (x, 1)) == MULT)
17696 {
17697 std::swap (XEXP (x, 0), XEXP (x, 1));
17698 changed = true;
17699 }
17700
17701 /* Canonicalize (plus (mult (reg) (const)) (plus (reg) (const)))
17702 into (plus (plus (mult (reg) (const)) (reg)) (const)). This can be
17703 created by virtual register instantiation, register elimination, and
17704 similar optimizations. */
17705 if (GET_CODE (XEXP (x, 0)) == MULT && GET_CODE (XEXP (x, 1)) == PLUS)
17706 {
17707 changed = true;
17708 x = gen_rtx_PLUS (Pmode,
17709 gen_rtx_PLUS (Pmode, XEXP (x, 0),
17710 XEXP (XEXP (x, 1), 0)),
17711 XEXP (XEXP (x, 1), 1));
17712 }
17713
17714 /* Canonicalize
17715 (plus (plus (mult (reg) (const)) (plus (reg) (const))) const)
17716 into (plus (plus (mult (reg) (const)) (reg)) (const)). */
17717 else if (GET_CODE (x) == PLUS && GET_CODE (XEXP (x, 0)) == PLUS
17718 && GET_CODE (XEXP (XEXP (x, 0), 0)) == MULT
17719 && GET_CODE (XEXP (XEXP (x, 0), 1)) == PLUS
17720 && CONSTANT_P (XEXP (x, 1)))
17721 {
17722 rtx constant;
17723 rtx other = NULL_RTX;
17724
17725 if (CONST_INT_P (XEXP (x, 1)))
17726 {
17727 constant = XEXP (x, 1);
17728 other = XEXP (XEXP (XEXP (x, 0), 1), 1);
17729 }
17730 else if (CONST_INT_P (XEXP (XEXP (XEXP (x, 0), 1), 1)))
17731 {
17732 constant = XEXP (XEXP (XEXP (x, 0), 1), 1);
17733 other = XEXP (x, 1);
17734 }
17735 else
17736 constant = 0;
17737
17738 if (constant)
17739 {
17740 changed = true;
17741 x = gen_rtx_PLUS (Pmode,
17742 gen_rtx_PLUS (Pmode, XEXP (XEXP (x, 0), 0),
17743 XEXP (XEXP (XEXP (x, 0), 1), 0)),
17744 plus_constant (Pmode, other,
17745 INTVAL (constant)));
17746 }
17747 }
17748
17749 if (changed && ix86_legitimate_address_p (mode, x, false))
17750 return x;
17751
17752 if (GET_CODE (XEXP (x, 0)) == MULT)
17753 {
17754 changed = true;
17755 XEXP (x, 0) = copy_addr_to_reg (XEXP (x, 0));
17756 }
17757
17758 if (GET_CODE (XEXP (x, 1)) == MULT)
17759 {
17760 changed = true;
17761 XEXP (x, 1) = copy_addr_to_reg (XEXP (x, 1));
17762 }
17763
17764 if (changed
17765 && REG_P (XEXP (x, 1))
17766 && REG_P (XEXP (x, 0)))
17767 return x;
17768
17769 if (flag_pic && SYMBOLIC_CONST (XEXP (x, 1)))
17770 {
17771 changed = true;
17772 x = legitimize_pic_address (x, 0);
17773 }
17774
17775 if (changed && ix86_legitimate_address_p (mode, x, false))
17776 return x;
17777
17778 if (REG_P (XEXP (x, 0)))
17779 {
17780 rtx temp = gen_reg_rtx (Pmode);
17781 rtx val = force_operand (XEXP (x, 1), temp);
17782 if (val != temp)
17783 {
17784 val = convert_to_mode (Pmode, val, 1);
17785 emit_move_insn (temp, val);
17786 }
17787
17788 XEXP (x, 1) = temp;
17789 return x;
17790 }
17791
17792 else if (REG_P (XEXP (x, 1)))
17793 {
17794 rtx temp = gen_reg_rtx (Pmode);
17795 rtx val = force_operand (XEXP (x, 0), temp);
17796 if (val != temp)
17797 {
17798 val = convert_to_mode (Pmode, val, 1);
17799 emit_move_insn (temp, val);
17800 }
17801
17802 XEXP (x, 0) = temp;
17803 return x;
17804 }
17805 }
17806
17807 return x;
17808 }
17809 \f
17810 /* Print an integer constant expression in assembler syntax. Addition
17811 and subtraction are the only arithmetic that may appear in these
17812 expressions. FILE is the stdio stream to write to, X is the rtx, and
17813 CODE is the operand print code from the output string. */
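/* For example (AT&T syntax), (unspec [foo] UNSPEC_GOTOFF) wrapped in a
   CONST prints as "foo@GOTOFF", UNSPEC_GOT prints as "foo@GOT", and
   UNSPEC_GOTPCREL as "foo@GOTPCREL(%rip)"; a plain SYMBOL_REF may get an
   "@PLT" suffix under the 'P' operand code.  */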
17814
17815 static void
17816 output_pic_addr_const (FILE *file, rtx x, int code)
17817 {
17818 char buf[256];
17819
17820 switch (GET_CODE (x))
17821 {
17822 case PC:
17823 gcc_assert (flag_pic);
17824 putc ('.', file);
17825 break;
17826
17827 case SYMBOL_REF:
17828 if (TARGET_64BIT || ! TARGET_MACHO_BRANCH_ISLANDS)
17829 output_addr_const (file, x);
17830 else
17831 {
17832 const char *name = XSTR (x, 0);
17833
17834 /* Mark the decl as referenced so that cgraph will
17835 output the function. */
17836 if (SYMBOL_REF_DECL (x))
17837 mark_decl_referenced (SYMBOL_REF_DECL (x));
17838
17839 #if TARGET_MACHO
17840 if (MACHOPIC_INDIRECT
17841 && machopic_classify_symbol (x) == MACHOPIC_UNDEFINED_FUNCTION)
17842 name = machopic_indirection_name (x, /*stub_p=*/true);
17843 #endif
17844 assemble_name (file, name);
17845 }
17846 if (!TARGET_MACHO && !(TARGET_64BIT && TARGET_PECOFF)
17847 && code == 'P' && ! SYMBOL_REF_LOCAL_P (x))
17848 fputs ("@PLT", file);
17849 break;
17850
17851 case LABEL_REF:
17852 x = XEXP (x, 0);
17853 /* FALLTHRU */
17854 case CODE_LABEL:
17855 ASM_GENERATE_INTERNAL_LABEL (buf, "L", CODE_LABEL_NUMBER (x));
17856 assemble_name (asm_out_file, buf);
17857 break;
17858
17859 case CONST_INT:
17860 fprintf (file, HOST_WIDE_INT_PRINT_DEC, INTVAL (x));
17861 break;
17862
17863 case CONST:
17864 /* This used to output parentheses around the expression,
17865 but that does not work on the 386 (either ATT or BSD assembler). */
17866 output_pic_addr_const (file, XEXP (x, 0), code);
17867 break;
17868
17869 case CONST_DOUBLE:
17870 /* We can't handle floating point constants;
17871 TARGET_PRINT_OPERAND must handle them. */
17872 output_operand_lossage ("floating constant misused");
17873 break;
17874
17875 case PLUS:
17876 /* Some assemblers need integer constants to appear first. */
17877 if (CONST_INT_P (XEXP (x, 0)))
17878 {
17879 output_pic_addr_const (file, XEXP (x, 0), code);
17880 putc ('+', file);
17881 output_pic_addr_const (file, XEXP (x, 1), code);
17882 }
17883 else
17884 {
17885 gcc_assert (CONST_INT_P (XEXP (x, 1)));
17886 output_pic_addr_const (file, XEXP (x, 1), code);
17887 putc ('+', file);
17888 output_pic_addr_const (file, XEXP (x, 0), code);
17889 }
17890 break;
17891
17892 case MINUS:
17893 if (!TARGET_MACHO)
17894 putc (ASSEMBLER_DIALECT == ASM_INTEL ? '(' : '[', file);
17895 output_pic_addr_const (file, XEXP (x, 0), code);
17896 putc ('-', file);
17897 output_pic_addr_const (file, XEXP (x, 1), code);
17898 if (!TARGET_MACHO)
17899 putc (ASSEMBLER_DIALECT == ASM_INTEL ? ')' : ']', file);
17900 break;
17901
17902 case UNSPEC:
17903 if (XINT (x, 1) == UNSPEC_STACK_CHECK)
17904 {
17905 bool f = i386_asm_output_addr_const_extra (file, x);
17906 gcc_assert (f);
17907 break;
17908 }
17909
17910 gcc_assert (XVECLEN (x, 0) == 1);
17911 output_pic_addr_const (file, XVECEXP (x, 0, 0), code);
17912 switch (XINT (x, 1))
17913 {
17914 case UNSPEC_GOT:
17915 fputs ("@GOT", file);
17916 break;
17917 case UNSPEC_GOTOFF:
17918 fputs ("@GOTOFF", file);
17919 break;
17920 case UNSPEC_PLTOFF:
17921 fputs ("@PLTOFF", file);
17922 break;
17923 case UNSPEC_PCREL:
17924 fputs (ASSEMBLER_DIALECT == ASM_ATT ?
17925 "(%rip)" : "[rip]", file);
17926 break;
17927 case UNSPEC_GOTPCREL:
17928 fputs (ASSEMBLER_DIALECT == ASM_ATT ?
17929 "@GOTPCREL(%rip)" : "@GOTPCREL[rip]", file);
17930 break;
17931 case UNSPEC_GOTTPOFF:
17932 /* FIXME: This might be @TPOFF in Sun ld too. */
17933 fputs ("@gottpoff", file);
17934 break;
17935 case UNSPEC_TPOFF:
17936 fputs ("@tpoff", file);
17937 break;
17938 case UNSPEC_NTPOFF:
17939 if (TARGET_64BIT)
17940 fputs ("@tpoff", file);
17941 else
17942 fputs ("@ntpoff", file);
17943 break;
17944 case UNSPEC_DTPOFF:
17945 fputs ("@dtpoff", file);
17946 break;
17947 case UNSPEC_GOTNTPOFF:
17948 if (TARGET_64BIT)
17949 fputs (ASSEMBLER_DIALECT == ASM_ATT ?
17950 "@gottpoff(%rip)": "@gottpoff[rip]", file);
17951 else
17952 fputs ("@gotntpoff", file);
17953 break;
17954 case UNSPEC_INDNTPOFF:
17955 fputs ("@indntpoff", file);
17956 break;
17957 #if TARGET_MACHO
17958 case UNSPEC_MACHOPIC_OFFSET:
17959 putc ('-', file);
17960 machopic_output_function_base_name (file);
17961 break;
17962 #endif
17963 default:
17964 output_operand_lossage ("invalid UNSPEC as operand");
17965 break;
17966 }
17967 break;
17968
17969 default:
17970 output_operand_lossage ("invalid expression as operand");
17971 }
17972 }
17973
17974 /* This is called from dwarf2out.c via TARGET_ASM_OUTPUT_DWARF_DTPREL.
17975 We need to emit DTP-relative relocations. */
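/* E.g. for SIZE == 4 this emits "<ASM_LONG> x@dtpoff" (typically
   ".long x@dtpoff"); the SIZE == 8 case appends ", 0" to pad the upper
   half, as the switch below shows.  */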
17976
17977 static void ATTRIBUTE_UNUSED
17978 i386_output_dwarf_dtprel (FILE *file, int size, rtx x)
17979 {
17980 fputs (ASM_LONG, file);
17981 output_addr_const (file, x);
17982 fputs ("@dtpoff", file);
17983 switch (size)
17984 {
17985 case 4:
17986 break;
17987 case 8:
17988 fputs (", 0", file);
17989 break;
17990 default:
17991 gcc_unreachable ();
17992 }
17993 }
17994
17995 /* Return true if X is a representation of the PIC register. This copes
17996 with calls from ix86_find_base_term, where the register might have
17997 been replaced by a cselib value. */
17998
17999 static bool
18000 ix86_pic_register_p (rtx x)
18001 {
18002 if (GET_CODE (x) == VALUE && CSELIB_VAL_PTR (x))
18003 return (pic_offset_table_rtx
18004 && rtx_equal_for_cselib_p (x, pic_offset_table_rtx));
18005 else if (!REG_P (x))
18006 return false;
18007 else if (pic_offset_table_rtx)
18008 {
18009 if (REGNO (x) == REGNO (pic_offset_table_rtx))
18010 return true;
18011 if (HARD_REGISTER_P (x)
18012 && !HARD_REGISTER_P (pic_offset_table_rtx)
18013 && ORIGINAL_REGNO (x) == REGNO (pic_offset_table_rtx))
18014 return true;
18015 return false;
18016 }
18017 else
18018 return REGNO (x) == PIC_OFFSET_TABLE_REGNUM;
18019 }
18020
18021 /* Helper function for ix86_delegitimize_address.
18022 Attempt to delegitimize TLS local-exec accesses. */
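/* E.g. an access of the form %fs:x@ntpoff, i.e. an address like
   (plus (reg) (const (unspec [(symbol_ref "x")] UNSPEC_NTPOFF))) using the
   TLS segment, is rewritten back into a reference built around "x" itself,
   so debug output names the variable rather than the TP-relative plumbing.  */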
18023
18024 static rtx
18025 ix86_delegitimize_tls_address (rtx orig_x)
18026 {
18027 rtx x = orig_x, unspec;
18028 struct ix86_address addr;
18029
18030 if (!TARGET_TLS_DIRECT_SEG_REFS)
18031 return orig_x;
18032 if (MEM_P (x))
18033 x = XEXP (x, 0);
18034 if (GET_CODE (x) != PLUS || GET_MODE (x) != Pmode)
18035 return orig_x;
18036 if (ix86_decompose_address (x, &addr) == 0
18037 || addr.seg != DEFAULT_TLS_SEG_REG
18038 || addr.disp == NULL_RTX
18039 || GET_CODE (addr.disp) != CONST)
18040 return orig_x;
18041 unspec = XEXP (addr.disp, 0);
18042 if (GET_CODE (unspec) == PLUS && CONST_INT_P (XEXP (unspec, 1)))
18043 unspec = XEXP (unspec, 0);
18044 if (GET_CODE (unspec) != UNSPEC || XINT (unspec, 1) != UNSPEC_NTPOFF)
18045 return orig_x;
18046 x = XVECEXP (unspec, 0, 0);
18047 gcc_assert (GET_CODE (x) == SYMBOL_REF);
18048 if (unspec != XEXP (addr.disp, 0))
18049 x = gen_rtx_PLUS (Pmode, x, XEXP (XEXP (addr.disp, 0), 1));
18050 if (addr.index)
18051 {
18052 rtx idx = addr.index;
18053 if (addr.scale != 1)
18054 idx = gen_rtx_MULT (Pmode, idx, GEN_INT (addr.scale));
18055 x = gen_rtx_PLUS (Pmode, idx, x);
18056 }
18057 if (addr.base)
18058 x = gen_rtx_PLUS (Pmode, addr.base, x);
18059 if (MEM_P (orig_x))
18060 x = replace_equiv_address_nv (orig_x, x);
18061 return x;
18062 }
18063
18064 /* In the name of slightly smaller debug output, and to cater to
18065 general assembler lossage, recognize PIC+GOTOFF and turn it back
18066 into a direct symbol reference.
18067
18068 On Darwin, this is necessary to avoid a crash, because Darwin
18069 has a different PIC label for each routine but the DWARF debugging
18070 information is not associated with any particular routine, so it's
18071 necessary to remove references to the PIC label from RTL stored by
18072 the DWARF output code.
18073
18074 This helper is used in the normal ix86_delegitimize_address
18075 entrypoint (e.g. used in the target delegitimization hook) and
18076 in ix86_find_base_term. As compile time memory optimization, we
18077 avoid allocating rtxes that will not change anything on the outcome
18078 of the callers (find_base_value and find_base_term). */
18079
18080 static inline rtx
18081 ix86_delegitimize_address_1 (rtx x, bool base_term_p)
18082 {
18083 rtx orig_x = delegitimize_mem_from_attrs (x);
18084 /* addend is NULL or some rtx if x is something+GOTOFF where
18085 something doesn't include the PIC register. */
18086 rtx addend = NULL_RTX;
18087 /* reg_addend is NULL or a multiple of some register. */
18088 rtx reg_addend = NULL_RTX;
18089 /* const_addend is NULL or a const_int. */
18090 rtx const_addend = NULL_RTX;
18091 /* This is the result, or NULL. */
18092 rtx result = NULL_RTX;
18093
18094 x = orig_x;
18095
18096 if (MEM_P (x))
18097 x = XEXP (x, 0);
18098
18099 if (TARGET_64BIT)
18100 {
18101 if (GET_CODE (x) == CONST
18102 && GET_CODE (XEXP (x, 0)) == PLUS
18103 && GET_MODE (XEXP (x, 0)) == Pmode
18104 && CONST_INT_P (XEXP (XEXP (x, 0), 1))
18105 && GET_CODE (XEXP (XEXP (x, 0), 0)) == UNSPEC
18106 && XINT (XEXP (XEXP (x, 0), 0), 1) == UNSPEC_PCREL)
18107 {
18108 /* find_base_{value,term} only care about MEMs with arg_pointer_rtx
18109 base. A CONST can't be arg_pointer_rtx based. */
18110 if (base_term_p && MEM_P (orig_x))
18111 return orig_x;
18112 rtx x2 = XVECEXP (XEXP (XEXP (x, 0), 0), 0, 0);
18113 x = gen_rtx_PLUS (Pmode, XEXP (XEXP (x, 0), 1), x2);
18114 if (MEM_P (orig_x))
18115 x = replace_equiv_address_nv (orig_x, x);
18116 return x;
18117 }
18118
18119 if (GET_CODE (x) == CONST
18120 && GET_CODE (XEXP (x, 0)) == UNSPEC
18121 && (XINT (XEXP (x, 0), 1) == UNSPEC_GOTPCREL
18122 || XINT (XEXP (x, 0), 1) == UNSPEC_PCREL)
18123 && (MEM_P (orig_x) || XINT (XEXP (x, 0), 1) == UNSPEC_PCREL))
18124 {
18125 x = XVECEXP (XEXP (x, 0), 0, 0);
18126 if (GET_MODE (orig_x) != GET_MODE (x) && MEM_P (orig_x))
18127 {
18128 x = lowpart_subreg (GET_MODE (orig_x), x, GET_MODE (x));
18129 if (x == NULL_RTX)
18130 return orig_x;
18131 }
18132 return x;
18133 }
18134
18135 if (ix86_cmodel != CM_MEDIUM_PIC && ix86_cmodel != CM_LARGE_PIC)
18136 return ix86_delegitimize_tls_address (orig_x);
18137
18138 /* Fall thru into the code shared with -m32 for -mcmodel=large -fpic
18139 and -mcmodel=medium -fpic. */
18140 }
18141
18142 if (GET_CODE (x) != PLUS
18143 || GET_CODE (XEXP (x, 1)) != CONST)
18144 return ix86_delegitimize_tls_address (orig_x);
18145
18146 if (ix86_pic_register_p (XEXP (x, 0)))
18147 /* %ebx + GOT/GOTOFF */
18148 ;
18149 else if (GET_CODE (XEXP (x, 0)) == PLUS)
18150 {
18151 /* %ebx + %reg * scale + GOT/GOTOFF */
18152 reg_addend = XEXP (x, 0);
18153 if (ix86_pic_register_p (XEXP (reg_addend, 0)))
18154 reg_addend = XEXP (reg_addend, 1);
18155 else if (ix86_pic_register_p (XEXP (reg_addend, 1)))
18156 reg_addend = XEXP (reg_addend, 0);
18157 else
18158 {
18159 reg_addend = NULL_RTX;
18160 addend = XEXP (x, 0);
18161 }
18162 }
18163 else
18164 addend = XEXP (x, 0);
18165
18166 x = XEXP (XEXP (x, 1), 0);
18167 if (GET_CODE (x) == PLUS
18168 && CONST_INT_P (XEXP (x, 1)))
18169 {
18170 const_addend = XEXP (x, 1);
18171 x = XEXP (x, 0);
18172 }
18173
18174 if (GET_CODE (x) == UNSPEC
18175 && ((XINT (x, 1) == UNSPEC_GOT && MEM_P (orig_x) && !addend)
18176 || (XINT (x, 1) == UNSPEC_GOTOFF && !MEM_P (orig_x))
18177 || (XINT (x, 1) == UNSPEC_PLTOFF && ix86_cmodel == CM_LARGE_PIC
18178 && !MEM_P (orig_x) && !addend)))
18179 result = XVECEXP (x, 0, 0);
18180
18181 if (!TARGET_64BIT && TARGET_MACHO && darwin_local_data_pic (x)
18182 && !MEM_P (orig_x))
18183 result = XVECEXP (x, 0, 0);
18184
18185 if (! result)
18186 return ix86_delegitimize_tls_address (orig_x);
18187
18188 /* For (PLUS something CONST_INT) both find_base_{value,term} just
18189 recurse on the first operand. */
18190 if (const_addend && !base_term_p)
18191 result = gen_rtx_CONST (Pmode, gen_rtx_PLUS (Pmode, result, const_addend));
18192 if (reg_addend)
18193 result = gen_rtx_PLUS (Pmode, reg_addend, result);
18194 if (addend)
18195 {
18196 /* If the rest of original X doesn't involve the PIC register, add
18197 addend and subtract pic_offset_table_rtx. This can happen e.g.
18198 for code like:
18199 leal (%ebx, %ecx, 4), %ecx
18200 ...
18201 movl foo@GOTOFF(%ecx), %edx
18202 in which case we return (%ecx - %ebx) + foo
18203 or (%ecx - _GLOBAL_OFFSET_TABLE_) + foo if pseudo_pic_reg
18204 and reload has completed. */
18205 if (pic_offset_table_rtx
18206 && (!reload_completed || !ix86_use_pseudo_pic_reg ()))
18207 result = gen_rtx_PLUS (Pmode, gen_rtx_MINUS (Pmode, copy_rtx (addend),
18208 pic_offset_table_rtx),
18209 result);
18210 else if (pic_offset_table_rtx && !TARGET_MACHO && !TARGET_VXWORKS_RTP)
18211 {
18212 rtx tmp = gen_rtx_SYMBOL_REF (Pmode, GOT_SYMBOL_NAME);
18213 tmp = gen_rtx_MINUS (Pmode, copy_rtx (addend), tmp);
18214 result = gen_rtx_PLUS (Pmode, tmp, result);
18215 }
18216 else
18217 return orig_x;
18218 }
18219 if (GET_MODE (orig_x) != Pmode && MEM_P (orig_x))
18220 {
18221 result = lowpart_subreg (GET_MODE (orig_x), result, Pmode);
18222 if (result == NULL_RTX)
18223 return orig_x;
18224 }
18225 return result;
18226 }
18227
18228 /* The normal instantiation of the above template. */
18229
18230 static rtx
18231 ix86_delegitimize_address (rtx x)
18232 {
18233 return ix86_delegitimize_address_1 (x, false);
18234 }
18235
18236 /* If X is a machine specific address (i.e. a symbol or label being
18237 referenced as a displacement from the GOT implemented using an
18238 UNSPEC), then return the base term. Otherwise return X. */
18239
18240 rtx
18241 ix86_find_base_term (rtx x)
18242 {
18243 rtx term;
18244
18245 if (TARGET_64BIT)
18246 {
18247 if (GET_CODE (x) != CONST)
18248 return x;
18249 term = XEXP (x, 0);
18250 if (GET_CODE (term) == PLUS
18251 && CONST_INT_P (XEXP (term, 1)))
18252 term = XEXP (term, 0);
18253 if (GET_CODE (term) != UNSPEC
18254 || (XINT (term, 1) != UNSPEC_GOTPCREL
18255 && XINT (term, 1) != UNSPEC_PCREL))
18256 return x;
18257
18258 return XVECEXP (term, 0, 0);
18259 }
18260
18261 return ix86_delegitimize_address_1 (x, true);
18262 }
18263 \f
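/* Write to FILE the condition-code suffix for comparison CODE in MODE,
   e.g. "e" for EQ or "g" for signed GT, so that templates like "set%C0"
   or "j%C0" expand to sete / jg.  If REVERSE, emit the suffix of the
   reversed condition instead.  FP chooses the spelling preferred when the
   flags come from a floating-point compare (e.g. "nbe" rather than "a",
   see the GTU case); the switch below is authoritative.  */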
18264 static void
18265 put_condition_code (enum rtx_code code, machine_mode mode, bool reverse,
18266 bool fp, FILE *file)
18267 {
18268 const char *suffix;
18269
18270 if (mode == CCFPmode || mode == CCFPUmode)
18271 {
18272 code = ix86_fp_compare_code_to_integer (code);
18273 mode = CCmode;
18274 }
18275 if (reverse)
18276 code = reverse_condition (code);
18277
18278 switch (code)
18279 {
18280 case EQ:
18281 switch (mode)
18282 {
18283 case CCAmode:
18284 suffix = "a";
18285 break;
18286 case CCCmode:
18287 suffix = "c";
18288 break;
18289 case CCOmode:
18290 suffix = "o";
18291 break;
18292 case CCPmode:
18293 suffix = "p";
18294 break;
18295 case CCSmode:
18296 suffix = "s";
18297 break;
18298 default:
18299 suffix = "e";
18300 break;
18301 }
18302 break;
18303 case NE:
18304 switch (mode)
18305 {
18306 case CCAmode:
18307 suffix = "na";
18308 break;
18309 case CCCmode:
18310 suffix = "nc";
18311 break;
18312 case CCOmode:
18313 suffix = "no";
18314 break;
18315 case CCPmode:
18316 suffix = "np";
18317 break;
18318 case CCSmode:
18319 suffix = "ns";
18320 break;
18321 default:
18322 suffix = "ne";
18323 break;
18324 }
18325 break;
18326 case GT:
18327 gcc_assert (mode == CCmode || mode == CCNOmode || mode == CCGCmode);
18328 suffix = "g";
18329 break;
18330 case GTU:
18331 /* ??? Use "nbe" instead of "a" for fcmov lossage on some assemblers.
18332 Those same assemblers have the same but opposite lossage on cmov. */
18333 if (mode == CCmode)
18334 suffix = fp ? "nbe" : "a";
18335 else
18336 gcc_unreachable ();
18337 break;
18338 case LT:
18339 switch (mode)
18340 {
18341 case CCNOmode:
18342 case CCGOCmode:
18343 suffix = "s";
18344 break;
18345
18346 case CCmode:
18347 case CCGCmode:
18348 suffix = "l";
18349 break;
18350
18351 default:
18352 gcc_unreachable ();
18353 }
18354 break;
18355 case LTU:
18356 if (mode == CCmode)
18357 suffix = "b";
18358 else if (mode == CCCmode)
18359 suffix = fp ? "b" : "c";
18360 else
18361 gcc_unreachable ();
18362 break;
18363 case GE:
18364 switch (mode)
18365 {
18366 case CCNOmode:
18367 case CCGOCmode:
18368 suffix = "ns";
18369 break;
18370
18371 case CCmode:
18372 case CCGCmode:
18373 suffix = "ge";
18374 break;
18375
18376 default:
18377 gcc_unreachable ();
18378 }
18379 break;
18380 case GEU:
18381 if (mode == CCmode)
18382 suffix = "nb";
18383 else if (mode == CCCmode)
18384 suffix = fp ? "nb" : "nc";
18385 else
18386 gcc_unreachable ();
18387 break;
18388 case LE:
18389 gcc_assert (mode == CCmode || mode == CCGCmode || mode == CCNOmode);
18390 suffix = "le";
18391 break;
18392 case LEU:
18393 if (mode == CCmode)
18394 suffix = "be";
18395 else
18396 gcc_unreachable ();
18397 break;
18398 case UNORDERED:
18399 suffix = fp ? "u" : "p";
18400 break;
18401 case ORDERED:
18402 suffix = fp ? "nu" : "np";
18403 break;
18404 default:
18405 gcc_unreachable ();
18406 }
18407 fputs (suffix, file);
18408 }
18409
18410 /* Print the name of register X to FILE based on its machine mode and number.
18411 If CODE is 'w', pretend the mode is HImode.
18412 If CODE is 'b', pretend the mode is QImode.
18413 If CODE is 'k', pretend the mode is SImode.
18414 If CODE is 'q', pretend the mode is DImode.
18415 If CODE is 'x', pretend the mode is V4SFmode.
18416 If CODE is 't', pretend the mode is V8SFmode.
18417 If CODE is 'g', pretend the mode is V16SFmode.
18418 If CODE is 'h', pretend the reg is the 'high' byte register.
18422 If CODE is 'y', print "st(0)" instead of "st" if the reg is a stack op.
18423 If CODE is 'd', duplicate the operand for an AVX instruction.
18421 */
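/* For example, for the AX hard register: SImode (or code 'k') prints
   "%eax" in AT&T syntax ("eax" for Intel), code 'b' prints "%al",
   code 'w' "%ax", code 'q' "%rax", and code 'h' the high byte "%ah".  */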
18422
18423 void
18424 print_reg (rtx x, int code, FILE *file)
18425 {
18426 const char *reg;
18427 int msize;
18428 unsigned int regno;
18429 bool duplicated;
18430
18431 if (ASSEMBLER_DIALECT == ASM_ATT)
18432 putc ('%', file);
18433
18434 if (x == pc_rtx)
18435 {
18436 gcc_assert (TARGET_64BIT);
18437 fputs ("rip", file);
18438 return;
18439 }
18440
18441 if (code == 'y' && STACK_TOP_P (x))
18442 {
18443 fputs ("st(0)", file);
18444 return;
18445 }
18446
18447 if (code == 'w')
18448 msize = 2;
18449 else if (code == 'b')
18450 msize = 1;
18451 else if (code == 'k')
18452 msize = 4;
18453 else if (code == 'q')
18454 msize = 8;
18455 else if (code == 'h')
18456 msize = 0;
18457 else if (code == 'x')
18458 msize = 16;
18459 else if (code == 't')
18460 msize = 32;
18461 else if (code == 'g')
18462 msize = 64;
18463 else
18464 msize = GET_MODE_SIZE (GET_MODE (x));
18465
18466 regno = REGNO (x);
18467
18468 if (regno == ARG_POINTER_REGNUM
18469 || regno == FRAME_POINTER_REGNUM
18470 || regno == FPSR_REG
18471 || regno == FPCR_REG)
18472 {
18473 output_operand_lossage
18474 ("invalid use of register '%s'", reg_names[regno]);
18475 return;
18476 }
18477 else if (regno == FLAGS_REG)
18478 {
18479 output_operand_lossage ("invalid use of asm flag output");
18480 return;
18481 }
18482
18483 duplicated = code == 'd' && TARGET_AVX;
18484
18485 switch (msize)
18486 {
18487 case 16:
18488 case 12:
18489 case 8:
18490 if (GENERAL_REGNO_P (regno) && msize > GET_MODE_SIZE (word_mode))
18491 warning (0, "unsupported size for integer register");
18492 /* FALLTHRU */
18493 case 4:
18494 if (LEGACY_INT_REGNO_P (regno))
18495 putc (msize > 4 && TARGET_64BIT ? 'r' : 'e', file);
18496 /* FALLTHRU */
18497 case 2:
18498 normal:
18499 reg = hi_reg_name[regno];
18500 break;
18501 case 1:
18502 if (regno >= ARRAY_SIZE (qi_reg_name))
18503 goto normal;
18504 if (!ANY_QI_REGNO_P (regno))
18505 error ("unsupported size for integer register");
18506 reg = qi_reg_name[regno];
18507 break;
18508 case 0:
18509 if (regno >= ARRAY_SIZE (qi_high_reg_name))
18510 goto normal;
18511 reg = qi_high_reg_name[regno];
18512 break;
18513 case 32:
18514 case 64:
18515 if (SSE_REGNO_P (regno))
18516 {
18517 gcc_assert (!duplicated);
18518 putc (msize == 32 ? 'y' : 'z', file);
18519 reg = hi_reg_name[regno] + 1;
18520 break;
18521 }
18522 goto normal;
18523 default:
18524 gcc_unreachable ();
18525 }
18526
18527 fputs (reg, file);
18528
18532 /* Irritatingly, AMD extended registers use a
18533 different naming convention: "r%d[bwd]". */
18531 if (REX_INT_REGNO_P (regno))
18532 {
18533 gcc_assert (TARGET_64BIT);
18534 switch (msize)
18535 {
18536 case 0:
18537 error ("extended registers have no high halves");
18538 break;
18539 case 1:
18540 putc ('b', file);
18541 break;
18542 case 2:
18543 putc ('w', file);
18544 break;
18545 case 4:
18546 putc ('d', file);
18547 break;
18548 case 8:
18549 /* no suffix */
18550 break;
18551 default:
18552 error ("unsupported operand size for extended register");
18553 break;
18554 }
18555 return;
18556 }
18557
18558 if (duplicated)
18559 {
18560 if (ASSEMBLER_DIALECT == ASM_ATT)
18561 fprintf (file, ", %%%s", reg);
18562 else
18563 fprintf (file, ", %s", reg);
18564 }
18565 }
18566
18567 /* Meaning of CODE:
18568 L,W,B,Q,S,T -- print the opcode suffix for specified size of operand.
18569 C -- print opcode suffix for set/cmov insn.
18570 c -- like C, but print reversed condition
18571 F,f -- likewise, but for floating-point.
18572 O -- if HAVE_AS_IX86_CMOV_SUN_SYNTAX, expand to "w.", "l." or "q.",
18573 otherwise nothing
18577 R -- print embedded rounding and sae.
18575 r -- print only sae.
18576 z -- print the opcode suffix for the size of the current operand.
18577 Z -- likewise, with special suffixes for x87 instructions.
18578 * -- print a star (in certain assembler syntax)
18579 A -- print an absolute memory reference.
18580 E -- print address with DImode register names if TARGET_64BIT.
18581 w -- print the operand as if it's a "word" (HImode) even if it isn't.
18585 s -- print a shift double count, followed by the assembler's argument
18583 delimiter.
18584 b -- print the QImode name of the register for the indicated operand.
18585 %b0 would print %al if operands[0] is reg 0.
18586 w -- likewise, print the HImode name of the register.
18587 k -- likewise, print the SImode name of the register.
18588 q -- likewise, print the DImode name of the register.
18589 x -- likewise, print the V4SFmode name of the register.
18590 t -- likewise, print the V8SFmode name of the register.
18591 g -- likewise, print the V16SFmode name of the register.
18592 h -- print the QImode name for a "high" register, either ah, bh, ch or dh.
18593 y -- print "st(0)" instead of "st" as a register.
18594 d -- print duplicated register operand for AVX instruction.
18595 D -- print condition for SSE cmp instruction.
18596 P -- if PIC, print an @PLT suffix.
18597 p -- print raw symbol name.
18598 X -- don't print any sort of PIC '@' suffix for a symbol.
18599 & -- print some in-use local-dynamic symbol name.
18600 H -- print a memory address offset by 8; used for sse high-parts
18601 Y -- print condition for XOP pcom* instruction.
18602 + -- print a branch hint as 'cs' or 'ds' prefix
18606 ; -- print a semicolon (after prefixes, due to a bug in older gas).
18607 ~ -- print "i" if TARGET_AVX2, "f" otherwise.
18608 @ -- print the segment register of a thread base pointer load
18606 ^ -- print addr32 prefix if TARGET_64BIT and Pmode != word_mode
18607 ! -- print MPX prefix for jxx/call/ret instructions if required.
18608 */
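/* A couple of worked examples of the codes above: "%b0" prints the QImode
   name of operand 0 (e.g. "%al" for hard reg 0); "%z0" appends the size
   suffix matching operand 0's mode (e.g. 'l' for SImode); "%C1" prints the
   condition suffix of a comparison operand, so "set%C1" may come out as
   "sete".  */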
18609
18610 void
18611 ix86_print_operand (FILE *file, rtx x, int code)
18612 {
18613 if (code)
18614 {
18615 switch (code)
18616 {
18617 case 'A':
18618 switch (ASSEMBLER_DIALECT)
18619 {
18620 case ASM_ATT:
18621 putc ('*', file);
18622 break;
18623
18624 case ASM_INTEL:
18628 /* Intel syntax. For absolute addresses, registers should not
18629 be surrounded by brackets. */
18627 if (!REG_P (x))
18628 {
18629 putc ('[', file);
18630 ix86_print_operand (file, x, 0);
18631 putc (']', file);
18632 return;
18633 }
18634 break;
18635
18636 default:
18637 gcc_unreachable ();
18638 }
18639
18640 ix86_print_operand (file, x, 0);
18641 return;
18642
18643 case 'E':
18644 /* Wrap address in an UNSPEC to declare special handling. */
18645 if (TARGET_64BIT)
18646 x = gen_rtx_UNSPEC (DImode, gen_rtvec (1, x), UNSPEC_LEA_ADDR);
18647
18648 output_address (VOIDmode, x);
18649 return;
18650
18651 case 'L':
18652 if (ASSEMBLER_DIALECT == ASM_ATT)
18653 putc ('l', file);
18654 return;
18655
18656 case 'W':
18657 if (ASSEMBLER_DIALECT == ASM_ATT)
18658 putc ('w', file);
18659 return;
18660
18661 case 'B':
18662 if (ASSEMBLER_DIALECT == ASM_ATT)
18663 putc ('b', file);
18664 return;
18665
18666 case 'Q':
18667 if (ASSEMBLER_DIALECT == ASM_ATT)
18668 putc ('l', file);
18669 return;
18670
18671 case 'S':
18672 if (ASSEMBLER_DIALECT == ASM_ATT)
18673 putc ('s', file);
18674 return;
18675
18676 case 'T':
18677 if (ASSEMBLER_DIALECT == ASM_ATT)
18678 putc ('t', file);
18679 return;
18680
18681 case 'O':
18682 #ifdef HAVE_AS_IX86_CMOV_SUN_SYNTAX
18683 if (ASSEMBLER_DIALECT != ASM_ATT)
18684 return;
18685
18686 switch (GET_MODE_SIZE (GET_MODE (x)))
18687 {
18688 case 2:
18689 putc ('w', file);
18690 break;
18691
18692 case 4:
18693 putc ('l', file);
18694 break;
18695
18696 case 8:
18697 putc ('q', file);
18698 break;
18699
18700 default:
18701 output_operand_lossage ("invalid operand size for operand "
18702 "code 'O'");
18703 return;
18704 }
18705
18706 putc ('.', file);
18707 #endif
18708 return;
18709
18710 case 'z':
18711 if (GET_MODE_CLASS (GET_MODE (x)) == MODE_INT)
18712 {
18713 /* Opcodes don't get size suffixes if using Intel opcodes. */
18714 if (ASSEMBLER_DIALECT == ASM_INTEL)
18715 return;
18716
18717 switch (GET_MODE_SIZE (GET_MODE (x)))
18718 {
18719 case 1:
18720 putc ('b', file);
18721 return;
18722
18723 case 2:
18724 putc ('w', file);
18725 return;
18726
18727 case 4:
18728 putc ('l', file);
18729 return;
18730
18731 case 8:
18732 putc ('q', file);
18733 return;
18734
18735 default:
18736 output_operand_lossage ("invalid operand size for operand "
18737 "code 'z'");
18738 return;
18739 }
18740 }
18741
18742 if (GET_MODE_CLASS (GET_MODE (x)) == MODE_FLOAT)
18743 warning (0, "non-integer operand used with operand code 'z'");
18744 /* FALLTHRU */
18745
18746 case 'Z':
18747 /* 387 opcodes don't get size suffixes if using Intel opcodes. */
18748 if (ASSEMBLER_DIALECT == ASM_INTEL)
18749 return;
18750
18751 if (GET_MODE_CLASS (GET_MODE (x)) == MODE_INT)
18752 {
18753 switch (GET_MODE_SIZE (GET_MODE (x)))
18754 {
18755 case 2:
18756 #ifdef HAVE_AS_IX86_FILDS
18757 putc ('s', file);
18758 #endif
18759 return;
18760
18761 case 4:
18762 putc ('l', file);
18763 return;
18764
18765 case 8:
18766 #ifdef HAVE_AS_IX86_FILDQ
18767 putc ('q', file);
18768 #else
18769 fputs ("ll", file);
18770 #endif
18771 return;
18772
18773 default:
18774 break;
18775 }
18776 }
18777 else if (GET_MODE_CLASS (GET_MODE (x)) == MODE_FLOAT)
18778 {
18779 /* 387 opcodes don't get size suffixes
18780 if the operands are registers. */
18781 if (STACK_REG_P (x))
18782 return;
18783
18784 switch (GET_MODE_SIZE (GET_MODE (x)))
18785 {
18786 case 4:
18787 putc ('s', file);
18788 return;
18789
18790 case 8:
18791 putc ('l', file);
18792 return;
18793
18794 case 12:
18795 case 16:
18796 putc ('t', file);
18797 return;
18798
18799 default:
18800 break;
18801 }
18802 }
18803 else
18804 {
18805 output_operand_lossage ("invalid operand type used with "
18806 "operand code 'Z'");
18807 return;
18808 }
18809
18810 output_operand_lossage ("invalid operand size for operand code 'Z'");
18811 return;
18812
18813 case 'd':
18814 case 'b':
18815 case 'w':
18816 case 'k':
18817 case 'q':
18818 case 'h':
18819 case 't':
18820 case 'g':
18821 case 'y':
18822 case 'x':
18823 case 'X':
18824 case 'P':
18825 case 'p':
18826 break;
18827
18828 case 's':
18829 if (CONST_INT_P (x) || ! SHIFT_DOUBLE_OMITS_COUNT)
18830 {
18831 ix86_print_operand (file, x, 0);
18832 fputs (", ", file);
18833 }
18834 return;
18835
18836 case 'Y':
18837 switch (GET_CODE (x))
18838 {
18839 case NE:
18840 fputs ("neq", file);
18841 break;
18842 case EQ:
18843 fputs ("eq", file);
18844 break;
18845 case GE:
18846 case GEU:
18847 fputs (INTEGRAL_MODE_P (GET_MODE (x)) ? "ge" : "unlt", file);
18848 break;
18849 case GT:
18850 case GTU:
18851 fputs (INTEGRAL_MODE_P (GET_MODE (x)) ? "gt" : "unle", file);
18852 break;
18853 case LE:
18854 case LEU:
18855 fputs ("le", file);
18856 break;
18857 case LT:
18858 case LTU:
18859 fputs ("lt", file);
18860 break;
18861 case UNORDERED:
18862 fputs ("unord", file);
18863 break;
18864 case ORDERED:
18865 fputs ("ord", file);
18866 break;
18867 case UNEQ:
18868 fputs ("ueq", file);
18869 break;
18870 case UNGE:
18871 fputs ("nlt", file);
18872 break;
18873 case UNGT:
18874 fputs ("nle", file);
18875 break;
18876 case UNLE:
18877 fputs ("ule", file);
18878 break;
18879 case UNLT:
18880 fputs ("ult", file);
18881 break;
18882 case LTGT:
18883 fputs ("une", file);
18884 break;
18885 default:
18886 output_operand_lossage ("operand is not a condition code, "
18887 "invalid operand code 'Y'");
18888 return;
18889 }
18890 return;
18891
18892 case 'D':
18896 /* A little bit of braindamage here. The SSE compare instructions
18897 use completely different names for the comparisons than the
18898 fp conditional moves do. */
18896 switch (GET_CODE (x))
18897 {
18898 case UNEQ:
18899 if (TARGET_AVX)
18900 {
18901 fputs ("eq_us", file);
18902 break;
18903 }
18904 /* FALLTHRU */
18905 case EQ:
18906 fputs ("eq", file);
18907 break;
18908 case UNLT:
18909 if (TARGET_AVX)
18910 {
18911 fputs ("nge", file);
18912 break;
18913 }
18914 /* FALLTHRU */
18915 case LT:
18916 fputs ("lt", file);
18917 break;
18918 case UNLE:
18919 if (TARGET_AVX)
18920 {
18921 fputs ("ngt", file);
18922 break;
18923 }
18924 /* FALLTHRU */
18925 case LE:
18926 fputs ("le", file);
18927 break;
18928 case UNORDERED:
18929 fputs ("unord", file);
18930 break;
18931 case LTGT:
18932 if (TARGET_AVX)
18933 {
18934 fputs ("neq_oq", file);
18935 break;
18936 }
18937 /* FALLTHRU */
18938 case NE:
18939 fputs ("neq", file);
18940 break;
18941 case GE:
18942 if (TARGET_AVX)
18943 {
18944 fputs ("ge", file);
18945 break;
18946 }
18947 /* FALLTHRU */
18948 case UNGE:
18949 fputs ("nlt", file);
18950 break;
18951 case GT:
18952 if (TARGET_AVX)
18953 {
18954 fputs ("gt", file);
18955 break;
18956 }
18957 /* FALLTHRU */
18958 case UNGT:
18959 fputs ("nle", file);
18960 break;
18961 case ORDERED:
18962 fputs ("ord", file);
18963 break;
18964 default:
18965 output_operand_lossage ("operand is not a condition code, "
18966 "invalid operand code 'D'");
18967 return;
18968 }
18969 return;
18970
18971 case 'F':
18972 case 'f':
18973 #ifdef HAVE_AS_IX86_CMOV_SUN_SYNTAX
18974 if (ASSEMBLER_DIALECT == ASM_ATT)
18975 putc ('.', file);
18976 gcc_fallthrough ();
18977 #endif
18978
18979 case 'C':
18980 case 'c':
18981 if (!COMPARISON_P (x))
18982 {
18983 output_operand_lossage ("operand is not a condition code, "
18984 "invalid operand code '%c'", code);
18985 return;
18986 }
18987 put_condition_code (GET_CODE (x), GET_MODE (XEXP (x, 0)),
18988 code == 'c' || code == 'f',
18989 code == 'F' || code == 'f',
18990 file);
18991 return;
18992
18993 case 'H':
18994 if (!offsettable_memref_p (x))
18995 {
18996 output_operand_lossage ("operand is not an offsettable memory "
18997 "reference, invalid operand code 'H'");
18998 return;
18999 }
19000 /* It doesn't actually matter what mode we use here, as we're
19001 only going to use this for printing. */
19002 x = adjust_address_nv (x, DImode, 8);
19003 /* Output 'qword ptr' for intel assembler dialect. */
19004 if (ASSEMBLER_DIALECT == ASM_INTEL)
19005 code = 'q';
19006 break;
19007
19008 case 'K':
19009 if (!CONST_INT_P (x))
19010 {
19011 output_operand_lossage ("operand is not an integer, invalid "
19012 "operand code 'K'");
19013 return;
19014 }
19015
19016 if (INTVAL (x) & IX86_HLE_ACQUIRE)
19017 #ifdef HAVE_AS_IX86_HLE
19018 fputs ("xacquire ", file);
19019 #else
19020 fputs ("\n" ASM_BYTE "0xf2\n\t", file);
19021 #endif
19022 else if (INTVAL (x) & IX86_HLE_RELEASE)
19023 #ifdef HAVE_AS_IX86_HLE
19024 fputs ("xrelease ", file);
19025 #else
19026 fputs ("\n" ASM_BYTE "0xf3\n\t", file);
19027 #endif
19031 /* We do not want to print the value of the operand. */
19029 return;
19030
19031 case 'N':
19032 if (x == const0_rtx || x == CONST0_RTX (GET_MODE (x)))
19033 fputs ("{z}", file);
19034 return;
19035
19036 case 'r':
19037 if (!CONST_INT_P (x) || INTVAL (x) != ROUND_SAE)
19038 {
19039 output_operand_lossage ("operand is not a specific integer, "
19040 "invalid operand code 'r'");
19041 return;
19042 }
19043
19044 if (ASSEMBLER_DIALECT == ASM_INTEL)
19045 fputs (", ", file);
19046
19047 fputs ("{sae}", file);
19048
19049 if (ASSEMBLER_DIALECT == ASM_ATT)
19050 fputs (", ", file);
19051
19052 return;
19053
19054 case 'R':
19055 if (!CONST_INT_P (x))
19056 {
19057 output_operand_lossage ("operand is not an integer, invalid "
19058 "operand code 'R'");
19059 return;
19060 }
19061
19062 if (ASSEMBLER_DIALECT == ASM_INTEL)
19063 fputs (", ", file);
19064
19065 switch (INTVAL (x))
19066 {
19067 case ROUND_NEAREST_INT | ROUND_SAE:
19068 fputs ("{rn-sae}", file);
19069 break;
19070 case ROUND_NEG_INF | ROUND_SAE:
19071 fputs ("{rd-sae}", file);
19072 break;
19073 case ROUND_POS_INF | ROUND_SAE:
19074 fputs ("{ru-sae}", file);
19075 break;
19076 case ROUND_ZERO | ROUND_SAE:
19077 fputs ("{rz-sae}", file);
19078 break;
19079 default:
19080 output_operand_lossage ("operand is not a specific integer, "
19081 "invalid operand code 'R'");
19082 }
19083
19084 if (ASSEMBLER_DIALECT == ASM_ATT)
19085 fputs (", ", file);
19086
19087 return;
19088
19089 case '*':
19090 if (ASSEMBLER_DIALECT == ASM_ATT)
19091 putc ('*', file);
19092 return;
19093
19094 case '&':
19095 {
19096 const char *name = get_some_local_dynamic_name ();
19097 if (name == NULL)
19098 output_operand_lossage ("'%%&' used without any "
19099 "local dynamic TLS references");
19100 else
19101 assemble_name (file, name);
19102 return;
19103 }
19104
19105 case '+':
19106 {
19107 rtx x;
19108
19109 if (!optimize
19110 || optimize_function_for_size_p (cfun)
19111 || !TARGET_BRANCH_PREDICTION_HINTS)
19112 return;
19113
19114 x = find_reg_note (current_output_insn, REG_BR_PROB, 0);
19115 if (x)
19116 {
19117 int pred_val = XINT (x, 0);
19118
19119 if (pred_val < REG_BR_PROB_BASE * 45 / 100
19120 || pred_val > REG_BR_PROB_BASE * 55 / 100)
19121 {
19122 bool taken = pred_val > REG_BR_PROB_BASE / 2;
19123 bool cputaken
19124 = final_forward_branch_p (current_output_insn) == 0;
19125
19126 /* Emit hints only in the case where the default branch prediction
19127 heuristics would fail. */
19128 if (taken != cputaken)
19129 {
19130 /* We use 3e (DS) prefix for taken branches and
19131 2e (CS) prefix for not taken branches. */
19132 if (taken)
19133 fputs ("ds ; ", file);
19134 else
19135 fputs ("cs ; ", file);
19136 }
19137 }
19138 }
19139 return;
19140 }
19141
19142 case ';':
19143 #ifndef HAVE_AS_IX86_REP_LOCK_PREFIX
19144 putc (';', file);
19145 #endif
19146 return;
19147
19148 case '@':
19149 if (ASSEMBLER_DIALECT == ASM_ATT)
19150 putc ('%', file);
19151
19152 /* The kernel uses a different segment register for performance
19153 reasons; a system call would not have to trash the userspace
19154 segment register, which would be expensive. */
19155 if (TARGET_64BIT && ix86_cmodel != CM_KERNEL)
19156 fputs ("fs", file);
19157 else
19158 fputs ("gs", file);
19159 return;
19160
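/* Print 'i' when AVX2 is available and 'f' otherwise, typically used to
   select between the integer and float forms of an AVX mnemonic
   (e.g. vextracti128 vs. vextractf128).  */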
19161 case '~':
19162 putc (TARGET_AVX2 ? 'i' : 'f', file);
19163 return;
19164
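/* Emit the addr32 prefix when 64-bit code uses 32-bit pointers (the x32
   ABI), i.e. when Pmode is narrower than word_mode.  */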
19165 case '^':
19166 if (TARGET_64BIT && Pmode != word_mode)
19167 fputs ("addr32 ", file);
19168 return;
19169
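/* Print the MPX "bnd" prefix when the current insn carries bounds
   information.  */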
19170 case '!':
19171 if (ix86_bnd_prefixed_insn_p (current_output_insn))
19172 fputs ("bnd ", file);
19173 return;
19174
19175 default:
19176 output_operand_lossage ("invalid operand code '%c'", code);
19177 }
19178 }
19179
19180 if (REG_P (x))
19181 print_reg (x, code, file);
19182
19183 else if (MEM_P (x))
19184 {
19185 rtx addr = XEXP (x, 0);
19186
19187 /* No `byte ptr' prefix for call instructions ... */
19188 if (ASSEMBLER_DIALECT == ASM_INTEL && code != 'X' && code != 'P')
19189 {
19190 machine_mode mode = GET_MODE (x);
19191 const char *size;
19192
19193 /* Check for explicit size override codes. */
19194 if (code == 'b')
19195 size = "BYTE";
19196 else if (code == 'w')
19197 size = "WORD";
19198 else if (code == 'k')
19199 size = "DWORD";
19200 else if (code == 'q')
19201 size = "QWORD";
19202 else if (code == 'x')
19203 size = "XMMWORD";
19204 else if (code == 't')
19205 size = "YMMWORD";
19206 else if (code == 'g')
19207 size = "ZMMWORD";
19208 else if (mode == BLKmode)
19209 /* ... or BLKmode operands, when not overridden. */
19210 size = NULL;
19211 else
19212 switch (GET_MODE_SIZE (mode))
19213 {
19214 case 1: size = "BYTE"; break;
19215 case 2: size = "WORD"; break;
19216 case 4: size = "DWORD"; break;
19217 case 8: size = "QWORD"; break;
19218 case 12: size = "TBYTE"; break;
19219 case 16:
19220 if (mode == XFmode)
19221 size = "TBYTE";
19222 else
19223 size = "XMMWORD";
19224 break;
19225 case 32: size = "YMMWORD"; break;
19226 case 64: size = "ZMMWORD"; break;
19227 default:
19228 gcc_unreachable ();
19229 }
19230 if (size)
19231 {
19232 fputs (size, file);
19233 fputs (" PTR ", file);
19234 }
19235 }
19236
19237 if (this_is_asm_operands && ! address_operand (addr, VOIDmode))
19238 output_operand_lossage ("invalid constraints for operand");
19239 else
19240 ix86_print_operand_address_as
19241 (file, addr, MEM_ADDR_SPACE (x), code == 'p' || code == 'P');
19242 }
19243
19244 else if (CONST_DOUBLE_P (x) && GET_MODE (x) == SFmode)
19245 {
19246 long l;
19247
19248 REAL_VALUE_TO_TARGET_SINGLE (*CONST_DOUBLE_REAL_VALUE (x), l);
19249
19250 if (ASSEMBLER_DIALECT == ASM_ATT)
19251 putc ('$', file);
19252 /* Sign extend 32bit SFmode immediate to 8 bytes. */
19253 if (code == 'q')
19254 fprintf (file, "0x%08" HOST_LONG_LONG_FORMAT "x",
19255 (unsigned long long) (int) l);
19256 else
19257 fprintf (file, "0x%08x", (unsigned int) l);
19258 }
19259
19260 else if (CONST_DOUBLE_P (x) && GET_MODE (x) == DFmode)
19261 {
19262 long l[2];
19263
19264 REAL_VALUE_TO_TARGET_DOUBLE (*CONST_DOUBLE_REAL_VALUE (x), l);
19265
19266 if (ASSEMBLER_DIALECT == ASM_ATT)
19267 putc ('$', file);
19268 fprintf (file, "0x%lx%08lx", l[1] & 0xffffffff, l[0] & 0xffffffff);
19269 }
19270
19271 /* These float cases don't actually occur as immediate operands. */
19272 else if (CONST_DOUBLE_P (x) && GET_MODE (x) == XFmode)
19273 {
19274 char dstr[30];
19275
19276 real_to_decimal (dstr, CONST_DOUBLE_REAL_VALUE (x), sizeof (dstr), 0, 1);
19277 fputs (dstr, file);
19278 }
19279
19280 else
19281 {
19282 /* We have patterns that allow zero sets of memory, for instance.
19283 In 64-bit mode, we should probably support all 8-byte vectors,
19284 since we can in fact encode that into an immediate. */
19285 if (GET_CODE (x) == CONST_VECTOR)
19286 {
19287 gcc_assert (x == CONST0_RTX (GET_MODE (x)));
19288 x = const0_rtx;
19289 }
19290
19291 if (code != 'P' && code != 'p')
19292 {
19293 if (CONST_INT_P (x))
19294 {
19295 if (ASSEMBLER_DIALECT == ASM_ATT)
19296 putc ('$', file);
19297 }
19298 else if (GET_CODE (x) == CONST || GET_CODE (x) == SYMBOL_REF
19299 || GET_CODE (x) == LABEL_REF)
19300 {
19301 if (ASSEMBLER_DIALECT == ASM_ATT)
19302 putc ('$', file);
19303 else
19304 fputs ("OFFSET FLAT:", file);
19305 }
19306 }
19307 if (CONST_INT_P (x))
19308 fprintf (file, HOST_WIDE_INT_PRINT_DEC, INTVAL (x));
19309 else if (flag_pic || MACHOPIC_INDIRECT)
19310 output_pic_addr_const (file, x, code);
19311 else
19312 output_addr_const (file, x);
19313 }
19314 }
19315
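/* Implement TARGET_PRINT_OPERAND_PUNCT_VALID_P: the punctuation characters
   handled as operand codes above are valid without an operand.  */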
19316 static bool
19317 ix86_print_operand_punct_valid_p (unsigned char code)
19318 {
19319 return (code == '@' || code == '*' || code == '+' || code == '&'
19320 || code == ';' || code == '~' || code == '^' || code == '!');
19321 }
19322 \f
19323 /* Print a memory operand whose address is ADDR. */
19324
19325 static void
19326 ix86_print_operand_address_as (FILE *file, rtx addr,
19327 addr_space_t as, bool no_rip)
19328 {
19329 struct ix86_address parts;
19330 rtx base, index, disp;
19331 int scale;
19332 int ok;
19333 bool vsib = false;
19334 int code = 0;
19335
19336 if (GET_CODE (addr) == UNSPEC && XINT (addr, 1) == UNSPEC_VSIBADDR)
19337 {
19338 ok = ix86_decompose_address (XVECEXP (addr, 0, 0), &parts);
19339 gcc_assert (parts.index == NULL_RTX);
19340 parts.index = XVECEXP (addr, 0, 1);
19341 parts.scale = INTVAL (XVECEXP (addr, 0, 2));
19342 addr = XVECEXP (addr, 0, 0);
19343 vsib = true;
19344 }
19345 else if (GET_CODE (addr) == UNSPEC && XINT (addr, 1) == UNSPEC_LEA_ADDR)
19346 {
19347 gcc_assert (TARGET_64BIT);
19348 ok = ix86_decompose_address (XVECEXP (addr, 0, 0), &parts);
19349 code = 'q';
19350 }
19351 else if (GET_CODE (addr) == UNSPEC && XINT (addr, 1) == UNSPEC_BNDMK_ADDR)
19352 {
19353 ok = ix86_decompose_address (XVECEXP (addr, 0, 1), &parts);
19354 gcc_assert (parts.base == NULL_RTX || parts.index == NULL_RTX);
19355 if (parts.base != NULL_RTX)
19356 {
19357 parts.index = parts.base;
19358 parts.scale = 1;
19359 }
19360 parts.base = XVECEXP (addr, 0, 0);
19361 addr = XVECEXP (addr, 0, 0);
19362 }
19363 else if (GET_CODE (addr) == UNSPEC && XINT (addr, 1) == UNSPEC_BNDLDX_ADDR)
19364 {
19365 ok = ix86_decompose_address (XVECEXP (addr, 0, 0), &parts);
19366 gcc_assert (parts.index == NULL_RTX);
19367 parts.index = XVECEXP (addr, 0, 1);
19368 addr = XVECEXP (addr, 0, 0);
19369 }
19370 else
19371 ok = ix86_decompose_address (addr, &parts);
19372
19373 gcc_assert (ok);
19374
19375 base = parts.base;
19376 index = parts.index;
19377 disp = parts.disp;
19378 scale = parts.scale;
19379
19380 if (ADDR_SPACE_GENERIC_P (as))
19381 as = parts.seg;
19382 else
19383 gcc_assert (ADDR_SPACE_GENERIC_P (parts.seg));
19384
19385 if (!ADDR_SPACE_GENERIC_P (as))
19386 {
19387 const char *string;
19388
19389 if (as == ADDR_SPACE_SEG_FS)
19390 string = (ASSEMBLER_DIALECT == ASM_ATT ? "%fs:" : "fs:");
19391 else if (as == ADDR_SPACE_SEG_GS)
19392 string = (ASSEMBLER_DIALECT == ASM_ATT ? "%gs:" : "gs:");
19393 else
19394 gcc_unreachable ();
19395 fputs (string, file);
19396 }
19397
19398 /* Use the one byte shorter RIP-relative addressing in 64bit mode. */
19399 if (TARGET_64BIT && !base && !index && !no_rip)
19400 {
19401 rtx symbol = disp;
19402
19403 if (GET_CODE (disp) == CONST
19404 && GET_CODE (XEXP (disp, 0)) == PLUS
19405 && CONST_INT_P (XEXP (XEXP (disp, 0), 1)))
19406 symbol = XEXP (XEXP (disp, 0), 0);
19407
19408 if (GET_CODE (symbol) == LABEL_REF
19409 || (GET_CODE (symbol) == SYMBOL_REF
19410 && SYMBOL_REF_TLS_MODEL (symbol) == 0))
19411 base = pc_rtx;
19412 }
19413
19414 if (!base && !index)
19415 {
19416 /* A displacement-only address requires special attention. */
19417 if (CONST_INT_P (disp))
19418 {
19419 if (ASSEMBLER_DIALECT == ASM_INTEL && parts.seg == ADDR_SPACE_GENERIC)
19420 fputs ("ds:", file);
19421 fprintf (file, HOST_WIDE_INT_PRINT_DEC, INTVAL (disp));
19422 }
19423 /* Load the external function address via the GOT slot to avoid PLT. */
19424 else if (GET_CODE (disp) == CONST
19425 && GET_CODE (XEXP (disp, 0)) == UNSPEC
19426 && (XINT (XEXP (disp, 0), 1) == UNSPEC_GOTPCREL
19427 || XINT (XEXP (disp, 0), 1) == UNSPEC_GOT)
19428 && ix86_force_load_from_GOT_p (XVECEXP (XEXP (disp, 0), 0, 0)))
19429 output_pic_addr_const (file, disp, 0);
19430 else if (flag_pic)
19431 output_pic_addr_const (file, disp, 0);
19432 else
19433 output_addr_const (file, disp);
19434 }
19435 else
19436 {
19437 /* Print SImode register names to force addr32 prefix. */
19438 if (SImode_address_operand (addr, VOIDmode))
19439 {
19440 if (flag_checking)
19441 {
19442 gcc_assert (TARGET_64BIT);
19443 switch (GET_CODE (addr))
19444 {
19445 case SUBREG:
19446 gcc_assert (GET_MODE (addr) == SImode);
19447 gcc_assert (GET_MODE (SUBREG_REG (addr)) == DImode);
19448 break;
19449 case ZERO_EXTEND:
19450 case AND:
19451 gcc_assert (GET_MODE (addr) == DImode);
19452 break;
19453 default:
19454 gcc_unreachable ();
19455 }
19456 }
19457 gcc_assert (!code);
19458 code = 'k';
19459 }
19460 else if (code == 0
19461 && TARGET_X32
19462 && disp
19463 && CONST_INT_P (disp)
19464 && INTVAL (disp) < -16*1024*1024)
19465 {
19466 /* X32 runs in 64-bit mode, where displacement, DISP, in
19467 address DISP(%r64), is encoded as 32-bit immediate sign-
19468 extended from 32-bit to 64-bit. For -0x40000300(%r64),
19469 address is %r64 + 0xffffffffbffffd00. When %r64 <
19470 0x40000300, like 0x37ffe064, address is 0xfffffffff7ffdd64,
19471 which is invalid for x32. The correct address is %r64
19472 - 0x40000300 == 0xf7ffdd64. To properly encode
19473 -0x40000300(%r64) for x32, we zero-extend negative
19474 displacement by forcing addr32 prefix which truncates
19475 0xfffffffff7ffdd64 to 0xf7ffdd64. In theory, we should
19476 zero-extend all negative displacements, including -1(%rsp).
19477 However, for small negative displacements, sign-extension
19478 won't cause overflow. We only zero-extend negative
19479 displacements if they are < -16*1024*1024, which is also used
19480 to check legitimate address displacements for PIC. */
19481 code = 'k';
19482 }
19483
19484 if (ASSEMBLER_DIALECT == ASM_ATT)
19485 {
19486 if (disp)
19487 {
19488 if (flag_pic)
19489 output_pic_addr_const (file, disp, 0);
19490 else if (GET_CODE (disp) == LABEL_REF)
19491 output_asm_label (disp);
19492 else
19493 output_addr_const (file, disp);
19494 }
19495
19496 putc ('(', file);
19497 if (base)
19498 print_reg (base, code, file);
19499 if (index)
19500 {
19501 putc (',', file);
19502 print_reg (index, vsib ? 0 : code, file);
19503 if (scale != 1 || vsib)
19504 fprintf (file, ",%d", scale);
19505 }
19506 putc (')', file);
19507 }
19508 else
19509 {
19510 rtx offset = NULL_RTX;
19511
19512 if (disp)
19513 {
19514 /* Pull out the offset of a symbol; print any symbol itself. */
19515 if (GET_CODE (disp) == CONST
19516 && GET_CODE (XEXP (disp, 0)) == PLUS
19517 && CONST_INT_P (XEXP (XEXP (disp, 0), 1)))
19518 {
19519 offset = XEXP (XEXP (disp, 0), 1);
19520 disp = gen_rtx_CONST (VOIDmode,
19521 XEXP (XEXP (disp, 0), 0));
19522 }
19523
19524 if (flag_pic)
19525 output_pic_addr_const (file, disp, 0);
19526 else if (GET_CODE (disp) == LABEL_REF)
19527 output_asm_label (disp);
19528 else if (CONST_INT_P (disp))
19529 offset = disp;
19530 else
19531 output_addr_const (file, disp);
19532 }
19533
19534 putc ('[', file);
19535 if (base)
19536 {
19537 print_reg (base, code, file);
19538 if (offset)
19539 {
19540 if (INTVAL (offset) >= 0)
19541 putc ('+', file);
19542 fprintf (file, HOST_WIDE_INT_PRINT_DEC, INTVAL (offset));
19543 }
19544 }
19545 else if (offset)
19546 fprintf (file, HOST_WIDE_INT_PRINT_DEC, INTVAL (offset));
19547 else
19548 putc ('0', file);
19549
19550 if (index)
19551 {
19552 putc ('+', file);
19553 print_reg (index, vsib ? 0 : code, file);
19554 if (scale != 1 || vsib)
19555 fprintf (file, "*%d", scale);
19556 }
19557 putc (']', file);
19558 }
19559 }
19560 }
19561
19562 static void
19563 ix86_print_operand_address (FILE *file, machine_mode /*mode*/, rtx addr)
19564 {
19565 ix86_print_operand_address_as (file, addr, ADDR_SPACE_GENERIC, false);
19566 }
19567
19568 /* Implementation of TARGET_ASM_OUTPUT_ADDR_CONST_EXTRA. */
19569
19570 static bool
19571 i386_asm_output_addr_const_extra (FILE *file, rtx x)
19572 {
19573 rtx op;
19574
19575 if (GET_CODE (x) != UNSPEC)
19576 return false;
19577
19578 op = XVECEXP (x, 0, 0);
19579 switch (XINT (x, 1))
19580 {
19581 case UNSPEC_GOTTPOFF:
19582 output_addr_const (file, op);
19583 /* FIXME: This might be @TPOFF in Sun ld. */
19584 fputs ("@gottpoff", file);
19585 break;
19586 case UNSPEC_TPOFF:
19587 output_addr_const (file, op);
19588 fputs ("@tpoff", file);
19589 break;
19590 case UNSPEC_NTPOFF:
19591 output_addr_const (file, op);
19592 if (TARGET_64BIT)
19593 fputs ("@tpoff", file);
19594 else
19595 fputs ("@ntpoff", file);
19596 break;
19597 case UNSPEC_DTPOFF:
19598 output_addr_const (file, op);
19599 fputs ("@dtpoff", file);
19600 break;
19601 case UNSPEC_GOTNTPOFF:
19602 output_addr_const (file, op);
19603 if (TARGET_64BIT)
19604 fputs (ASSEMBLER_DIALECT == ASM_ATT ?
19605 "@gottpoff(%rip)" : "@gottpoff[rip]", file);
19606 else
19607 fputs ("@gotntpoff", file);
19608 break;
19609 case UNSPEC_INDNTPOFF:
19610 output_addr_const (file, op);
19611 fputs ("@indntpoff", file);
19612 break;
19613 #if TARGET_MACHO
19614 case UNSPEC_MACHOPIC_OFFSET:
19615 output_addr_const (file, op);
19616 putc ('-', file);
19617 machopic_output_function_base_name (file);
19618 break;
19619 #endif
19620
19621 case UNSPEC_STACK_CHECK:
19622 {
19623 int offset;
19624
19625 gcc_assert (flag_split_stack);
19626
19627 #ifdef TARGET_THREAD_SPLIT_STACK_OFFSET
19628 offset = TARGET_THREAD_SPLIT_STACK_OFFSET;
19629 #else
19630 gcc_unreachable ();
19631 #endif
19632
19633 fprintf (file, "%s:%d", TARGET_64BIT ? "%fs" : "%gs", offset);
19634 }
19635 break;
19636
19637 default:
19638 return false;
19639 }
19640
19641 return true;
19642 }
19643 \f
19644 /* Split one or more double-mode RTL references into pairs of half-mode
19645 references. The RTL can be REG, offsettable MEM, integer constant, or
19646 CONST_DOUBLE. "operands" is a pointer to an array of double-mode RTLs to
19647 split and "num" is its length. lo_half and hi_half are output arrays
19648 that parallel "operands". */
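/* For example, a DImode operand is split into two SImode halves at byte
   offsets 0 and 4, and a TImode operand into DImode halves at offsets
   0 and 8.  */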
19649
19650 void
19651 split_double_mode (machine_mode mode, rtx operands[],
19652 int num, rtx lo_half[], rtx hi_half[])
19653 {
19654 machine_mode half_mode;
19655 unsigned int byte;
19656
19657 switch (mode)
19658 {
19659 case TImode:
19660 half_mode = DImode;
19661 break;
19662 case DImode:
19663 half_mode = SImode;
19664 break;
19665 default:
19666 gcc_unreachable ();
19667 }
19668
19669 byte = GET_MODE_SIZE (half_mode);
19670
19671 while (num--)
19672 {
19673 rtx op = operands[num];
19674
19675 /* simplify_subreg refuses to split volatile memory addresses,
19676 but we still have to handle them. */
19677 if (MEM_P (op))
19678 {
19679 lo_half[num] = adjust_address (op, half_mode, 0);
19680 hi_half[num] = adjust_address (op, half_mode, byte);
19681 }
19682 else
19683 {
19684 lo_half[num] = simplify_gen_subreg (half_mode, op,
19685 GET_MODE (op) == VOIDmode
19686 ? mode : GET_MODE (op), 0);
19687 hi_half[num] = simplify_gen_subreg (half_mode, op,
19688 GET_MODE (op) == VOIDmode
19689 ? mode : GET_MODE (op), byte);
19690 }
19691 }
19692 }
19693 \f
19694 /* Output code to perform a 387 binary operation in INSN, one of PLUS,
19695 MINUS, MULT or DIV. OPERANDS are the insn operands, where operands[3]
19696 is the expression of the binary operation. The output may either be
19697 emitted here, or returned to the caller, like all output_* functions.
19698
19699 There is no guarantee that the operands are the same mode, as they
19700 might be within FLOAT or FLOAT_EXTEND expressions. */
19701
19702 #ifndef SYSV386_COMPAT
19703 /* Set to 1 for compatibility with brain-damaged assemblers. No-one
19704 wants to fix the assemblers because that causes incompatibility
19705 with gcc. No-one wants to fix gcc because that causes
19706 incompatibility with assemblers... You can use the option of
19707 -DSYSV386_COMPAT=0 if you recompile both gcc and gas this way. */
19708 #define SYSV386_COMPAT 1
19709 #endif
19710
19711 const char *
19712 output_387_binary_op (rtx_insn *insn, rtx *operands)
19713 {
19714 static char buf[40];
19715 const char *p;
19716 const char *ssep;
19717 int is_sse = SSE_REG_P (operands[0]) || SSE_REG_P (operands[1]) || SSE_REG_P (operands[2]);
19718
19719 /* Even if we do not want to check the inputs, this documents the input
19720 constraints, which helps in understanding the following code. */
19721 if (flag_checking)
19722 {
19723 if (STACK_REG_P (operands[0])
19724 && ((REG_P (operands[1])
19725 && REGNO (operands[0]) == REGNO (operands[1])
19726 && (STACK_REG_P (operands[2]) || MEM_P (operands[2])))
19727 || (REG_P (operands[2])
19728 && REGNO (operands[0]) == REGNO (operands[2])
19729 && (STACK_REG_P (operands[1]) || MEM_P (operands[1]))))
19730 && (STACK_TOP_P (operands[1]) || STACK_TOP_P (operands[2])))
19731 ; /* ok */
19732 else
19733 gcc_assert (is_sse);
19734 }
19735
19736 switch (GET_CODE (operands[3]))
19737 {
19738 case PLUS:
19739 if (GET_MODE_CLASS (GET_MODE (operands[1])) == MODE_INT
19740 || GET_MODE_CLASS (GET_MODE (operands[2])) == MODE_INT)
19741 p = "fiadd";
19742 else
19743 p = "fadd";
19744 ssep = "vadd";
19745 break;
19746
19747 case MINUS:
19748 if (GET_MODE_CLASS (GET_MODE (operands[1])) == MODE_INT
19749 || GET_MODE_CLASS (GET_MODE (operands[2])) == MODE_INT)
19750 p = "fisub";
19751 else
19752 p = "fsub";
19753 ssep = "vsub";
19754 break;
19755
19756 case MULT:
19757 if (GET_MODE_CLASS (GET_MODE (operands[1])) == MODE_INT
19758 || GET_MODE_CLASS (GET_MODE (operands[2])) == MODE_INT)
19759 p = "fimul";
19760 else
19761 p = "fmul";
19762 ssep = "vmul";
19763 break;
19764
19765 case DIV:
19766 if (GET_MODE_CLASS (GET_MODE (operands[1])) == MODE_INT
19767 || GET_MODE_CLASS (GET_MODE (operands[2])) == MODE_INT)
19768 p = "fidiv";
19769 else
19770 p = "fdiv";
19771 ssep = "vdiv";
19772 break;
19773
19774 default:
19775 gcc_unreachable ();
19776 }
19777
19778 if (is_sse)
19779 {
19780 if (TARGET_AVX)
19781 {
19782 strcpy (buf, ssep);
19783 if (GET_MODE (operands[0]) == SFmode)
19784 strcat (buf, "ss\t{%2, %1, %0|%0, %1, %2}");
19785 else
19786 strcat (buf, "sd\t{%2, %1, %0|%0, %1, %2}");
19787 }
19788 else
19789 {
19790 strcpy (buf, ssep + 1);
19791 if (GET_MODE (operands[0]) == SFmode)
19792 strcat (buf, "ss\t{%2, %0|%0, %2}");
19793 else
19794 strcat (buf, "sd\t{%2, %0|%0, %2}");
19795 }
19796 return buf;
19797 }
19798 strcpy (buf, p);
19799
19800 switch (GET_CODE (operands[3]))
19801 {
19802 case MULT:
19803 case PLUS:
19804 if (REG_P (operands[2]) && REGNO (operands[0]) == REGNO (operands[2]))
19805 std::swap (operands[1], operands[2]);
19806
19807 /* We know operands[0] == operands[1]. */
19808
19809 if (MEM_P (operands[2]))
19810 {
19811 p = "%Z2\t%2";
19812 break;
19813 }
19814
19815 if (find_regno_note (insn, REG_DEAD, REGNO (operands[2])))
19816 {
19817 if (STACK_TOP_P (operands[0]))
19818 /* How is it that we are storing to a dead operand[2]?
19819 Well, presumably operands[1] is dead too. We can't
19820 store the result to st(0) as st(0) gets popped on this
19821 instruction. Instead store to operands[2] (which I
19822 think has to be st(1)). st(1) will be popped later.
19823 gcc <= 2.8.1 didn't have this check and generated
19824 assembly code that the Unixware assembler rejected. */
19825 p = "p\t{%0, %2|%2, %0}"; /* st(1) = st(0) op st(1); pop */
19826 else
19827 p = "p\t{%2, %0|%0, %2}"; /* st(r1) = st(r1) op st(0); pop */
19828 break;
19829 }
19830
19831 if (STACK_TOP_P (operands[0]))
19832 p = "\t{%y2, %0|%0, %y2}"; /* st(0) = st(0) op st(r2) */
19833 else
19834 p = "\t{%2, %0|%0, %2}"; /* st(r1) = st(r1) op st(0) */
19835 break;
19836
19837 case MINUS:
19838 case DIV:
19839 if (MEM_P (operands[1]))
19840 {
19841 p = "r%Z1\t%1";
19842 break;
19843 }
19844
19845 if (MEM_P (operands[2]))
19846 {
19847 p = "%Z2\t%2";
19848 break;
19849 }
19850
19851 if (find_regno_note (insn, REG_DEAD, REGNO (operands[2])))
19852 {
19853 #if SYSV386_COMPAT
19854 /* The SystemV/386 SVR3.2 assembler, and probably all AT&T
19855 derived assemblers, confusingly reverse the direction of
19856 the operation for fsub{r} and fdiv{r} when the
19857 destination register is not st(0). The Intel assembler
19858 doesn't have this brain damage. Read !SYSV386_COMPAT to
19859 figure out what the hardware really does. */
19860 if (STACK_TOP_P (operands[0]))
19861 p = "{p\t%0, %2|rp\t%2, %0}";
19862 else
19863 p = "{rp\t%2, %0|p\t%0, %2}";
19864 #else
19865 if (STACK_TOP_P (operands[0]))
19866 /* As above for fmul/fadd, we can't store to st(0). */
19867 p = "rp\t{%0, %2|%2, %0}"; /* st(1) = st(0) op st(1); pop */
19868 else
19869 p = "p\t{%2, %0|%0, %2}"; /* st(r1) = st(r1) op st(0); pop */
19870 #endif
19871 break;
19872 }
19873
19874 if (find_regno_note (insn, REG_DEAD, REGNO (operands[1])))
19875 {
19876 #if SYSV386_COMPAT
19877 if (STACK_TOP_P (operands[0]))
19878 p = "{rp\t%0, %1|p\t%1, %0}";
19879 else
19880 p = "{p\t%1, %0|rp\t%0, %1}";
19881 #else
19882 if (STACK_TOP_P (operands[0]))
19883 p = "p\t{%0, %1|%1, %0}"; /* st(1) = st(1) op st(0); pop */
19884 else
19885 p = "rp\t{%1, %0|%0, %1}"; /* st(r2) = st(0) op st(r2); pop */
19886 #endif
19887 break;
19888 }
19889
19890 if (STACK_TOP_P (operands[0]))
19891 {
19892 if (STACK_TOP_P (operands[1]))
19893 p = "\t{%y2, %0|%0, %y2}"; /* st(0) = st(0) op st(r2) */
19894 else
19895 p = "r\t{%y1, %0|%0, %y1}"; /* st(0) = st(r1) op st(0) */
19896 break;
19897 }
19898 else if (STACK_TOP_P (operands[1]))
19899 {
19900 #if SYSV386_COMPAT
19901 p = "{\t%1, %0|r\t%0, %1}";
19902 #else
19903 p = "r\t{%1, %0|%0, %1}"; /* st(r2) = st(0) op st(r2) */
19904 #endif
19905 }
19906 else
19907 {
19908 #if SYSV386_COMPAT
19909 p = "{r\t%2, %0|\t%0, %2}";
19910 #else
19911 p = "\t{%2, %0|%0, %2}"; /* st(r1) = st(r1) op st(0) */
19912 #endif
19913 }
19914 break;
19915
19916 default:
19917 gcc_unreachable ();
19918 }
19919
19920 strcat (buf, p);
19921 return buf;
19922 }
19923
19924 /* Return needed mode for entity in optimize_mode_switching pass. */
19925
19926 static int
19927 ix86_dirflag_mode_needed (rtx_insn *insn)
19928 {
19929 if (CALL_P (insn))
19930 {
19931 if (cfun->machine->func_type == TYPE_NORMAL)
19932 return X86_DIRFLAG_ANY;
19933 else
19934 /* No need to emit CLD in interrupt handler for TARGET_CLD. */
19935 return TARGET_CLD ? X86_DIRFLAG_ANY : X86_DIRFLAG_RESET;
19936 }
19937
19938 if (recog_memoized (insn) < 0)
19939 return X86_DIRFLAG_ANY;
19940
19941 if (get_attr_type (insn) == TYPE_STR)
19942 {
19943 /* Emit cld instruction if stringops are used in the function. */
19944 if (cfun->machine->func_type == TYPE_NORMAL)
19945 return TARGET_CLD ? X86_DIRFLAG_RESET : X86_DIRFLAG_ANY;
19946 else
19947 return X86_DIRFLAG_RESET;
19948 }
19949
19950 return X86_DIRFLAG_ANY;
19951 }
19952
19953 /* Check if a 256bit AVX register is referenced inside of EXP. */
19954
19955 static bool
19956 ix86_check_avx256_register (const_rtx exp)
19957 {
19958 if (SUBREG_P (exp))
19959 exp = SUBREG_REG (exp);
19960
19961 return (REG_P (exp)
19962 && VALID_AVX256_REG_OR_OI_MODE (GET_MODE (exp)));
19963 }
19964
19965 /* Return needed mode for entity in optimize_mode_switching pass. */
19966
19967 static int
19968 ix86_avx_u128_mode_needed (rtx_insn *insn)
19969 {
19970 if (CALL_P (insn))
19971 {
19972 rtx link;
19973
19974 /* Needed mode is set to AVX_U128_CLEAN if there are
19975 no 256bit modes used in function arguments. */
19976 for (link = CALL_INSN_FUNCTION_USAGE (insn);
19977 link;
19978 link = XEXP (link, 1))
19979 {
19980 if (GET_CODE (XEXP (link, 0)) == USE)
19981 {
19982 rtx arg = XEXP (XEXP (link, 0), 0);
19983
19984 if (ix86_check_avx256_register (arg))
19985 return AVX_U128_DIRTY;
19986 }
19987 }
19988
19989 return AVX_U128_CLEAN;
19990 }
19991
19992 /* Require DIRTY mode if a 256bit AVX register is referenced. The hardware
19993 changes state only when a 256bit register is written to, but we need
19994 to prevent the compiler from moving the optimal insertion point above
19995 an eventual read from a 256bit register. */
19996 subrtx_iterator::array_type array;
19997 FOR_EACH_SUBRTX (iter, array, PATTERN (insn), NONCONST)
19998 if (ix86_check_avx256_register (*iter))
19999 return AVX_U128_DIRTY;
20000
20001 return AVX_U128_ANY;
20002 }
20003
20004 /* Return mode that i387 must be switched into
20005 prior to the execution of insn. */
20006
20007 static int
20008 ix86_i387_mode_needed (int entity, rtx_insn *insn)
20009 {
20010 enum attr_i387_cw mode;
20011
20012 /* The mode UNINITIALIZED is used to store the control word after a
20013 function call or ASM pattern. The mode ANY specifies that the function
20014 has no requirements on the control word and makes no changes in the
20015 bits we are interested in. */
20016
20017 if (CALL_P (insn)
20018 || (NONJUMP_INSN_P (insn)
20019 && (asm_noperands (PATTERN (insn)) >= 0
20020 || GET_CODE (PATTERN (insn)) == ASM_INPUT)))
20021 return I387_CW_UNINITIALIZED;
20022
20023 if (recog_memoized (insn) < 0)
20024 return I387_CW_ANY;
20025
20026 mode = get_attr_i387_cw (insn);
20027
20028 switch (entity)
20029 {
20030 case I387_TRUNC:
20031 if (mode == I387_CW_TRUNC)
20032 return mode;
20033 break;
20034
20035 case I387_FLOOR:
20036 if (mode == I387_CW_FLOOR)
20037 return mode;
20038 break;
20039
20040 case I387_CEIL:
20041 if (mode == I387_CW_CEIL)
20042 return mode;
20043 break;
20044
20045 case I387_MASK_PM:
20046 if (mode == I387_CW_MASK_PM)
20047 return mode;
20048 break;
20049
20050 default:
20051 gcc_unreachable ();
20052 }
20053
20054 return I387_CW_ANY;
20055 }
20056
20057 /* Return mode that entity must be switched into
20058 prior to the execution of insn. */
20059
20060 static int
20061 ix86_mode_needed (int entity, rtx_insn *insn)
20062 {
20063 switch (entity)
20064 {
20065 case X86_DIRFLAG:
20066 return ix86_dirflag_mode_needed (insn);
20067 case AVX_U128:
20068 return ix86_avx_u128_mode_needed (insn);
20069 case I387_TRUNC:
20070 case I387_FLOOR:
20071 case I387_CEIL:
20072 case I387_MASK_PM:
20073 return ix86_i387_mode_needed (entity, insn);
20074 default:
20075 gcc_unreachable ();
20076 }
20077 return 0;
20078 }
20079
20080 /* Check if a 256bit AVX register is referenced in stores. */
20081
20082 static void
20083 ix86_check_avx256_stores (rtx dest, const_rtx, void *data)
20084 {
20085 if (ix86_check_avx256_register (dest))
20086 {
20087 bool *used = (bool *) data;
20088 *used = true;
20089 }
20090 }
20091
20092 /* Calculate mode of upper 128bit AVX registers after the insn. */
20093
20094 static int
20095 ix86_avx_u128_mode_after (int mode, rtx_insn *insn)
20096 {
20097 rtx pat = PATTERN (insn);
20098
20099 if (vzeroupper_operation (pat, VOIDmode)
20100 || vzeroall_operation (pat, VOIDmode))
20101 return AVX_U128_CLEAN;
20102
20103 /* We know that the state is clean after a CALL insn if no 256bit
20104 register is used in the function return register. */
20105 if (CALL_P (insn))
20106 {
20107 bool avx_reg256_found = false;
20108 note_stores (pat, ix86_check_avx256_stores, &avx_reg256_found);
20109
20110 return avx_reg256_found ? AVX_U128_DIRTY : AVX_U128_CLEAN;
20111 }
20112
20113 /* Otherwise, return the current mode. Remember that if the insn
20114 references AVX 256bit registers, the mode was already changed
20115 to DIRTY by MODE_NEEDED. */
20116 return mode;
20117 }
20118
20119 /* Return the mode that an insn results in. */
20120
20121 static int
20122 ix86_mode_after (int entity, int mode, rtx_insn *insn)
20123 {
20124 switch (entity)
20125 {
20126 case X86_DIRFLAG:
20127 return mode;
20128 case AVX_U128:
20129 return ix86_avx_u128_mode_after (mode, insn);
20130 case I387_TRUNC:
20131 case I387_FLOOR:
20132 case I387_CEIL:
20133 case I387_MASK_PM:
20134 return mode;
20135 default:
20136 gcc_unreachable ();
20137 }
20138 }
20139
20140 static int
20141 ix86_dirflag_mode_entry (void)
20142 {
20143 /* For TARGET_CLD or in an interrupt handler we can't assume the
20144 direction flag state at function entry. */
20145 if (TARGET_CLD
20146 || cfun->machine->func_type != TYPE_NORMAL)
20147 return X86_DIRFLAG_ANY;
20148
20149 return X86_DIRFLAG_RESET;
20150 }
20151
20152 static int
20153 ix86_avx_u128_mode_entry (void)
20154 {
20155 tree arg;
20156
20157 /* Entry mode is set to AVX_U128_DIRTY if there are
20158 256bit modes used in function arguments. */
20159 for (arg = DECL_ARGUMENTS (current_function_decl); arg;
20160 arg = TREE_CHAIN (arg))
20161 {
20162 rtx incoming = DECL_INCOMING_RTL (arg);
20163
20164 if (incoming && ix86_check_avx256_register (incoming))
20165 return AVX_U128_DIRTY;
20166 }
20167
20168 return AVX_U128_CLEAN;
20169 }
20170
20171 /* Return a mode that ENTITY is assumed to be
20172 switched to at function entry. */
20173
20174 static int
20175 ix86_mode_entry (int entity)
20176 {
20177 switch (entity)
20178 {
20179 case X86_DIRFLAG:
20180 return ix86_dirflag_mode_entry ();
20181 case AVX_U128:
20182 return ix86_avx_u128_mode_entry ();
20183 case I387_TRUNC:
20184 case I387_FLOOR:
20185 case I387_CEIL:
20186 case I387_MASK_PM:
20187 return I387_CW_ANY;
20188 default:
20189 gcc_unreachable ();
20190 }
20191 }
20192
20193 static int
20194 ix86_avx_u128_mode_exit (void)
20195 {
20196 rtx reg = crtl->return_rtx;
20197
20198 /* Exit mode is set to AVX_U128_DIRTY if there are
20199 256bit modes used in the function return register. */
20200 if (reg && ix86_check_avx256_register (reg))
20201 return AVX_U128_DIRTY;
20202
20203 return AVX_U128_CLEAN;
20204 }
20205
20206 /* Return a mode that ENTITY is assumed to be
20207 switched to at function exit. */
20208
20209 static int
20210 ix86_mode_exit (int entity)
20211 {
20212 switch (entity)
20213 {
20214 case X86_DIRFLAG:
20215 return X86_DIRFLAG_ANY;
20216 case AVX_U128:
20217 return ix86_avx_u128_mode_exit ();
20218 case I387_TRUNC:
20219 case I387_FLOOR:
20220 case I387_CEIL:
20221 case I387_MASK_PM:
20222 return I387_CW_ANY;
20223 default:
20224 gcc_unreachable ();
20225 }
20226 }
20227
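/* Implement TARGET_MODE_PRIORITY: mode-switching candidates for an entity
   are simply tried in their natural order.  */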
20228 static int
20229 ix86_mode_priority (int, int n)
20230 {
20231 return n;
20232 }
20233
20234 /* Output code to initialize control word copies used by trunc?f?i and
20235 rounding patterns. The current control word is saved in one stack
20236 slot, and the copy modified for MODE is stored in another. */
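/* Bits 10-11 of the x87 control word select the rounding mode (00 = nearest,
   01 = down, 10 = up, 11 = toward zero) and bit 5 masks the precision
   exception; the 0x0c00/0x0400/0x0800/0x0020 constants below set those
   fields.  */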
20237
20238 static void
20239 emit_i387_cw_initialization (int mode)
20240 {
20241 rtx stored_mode = assign_386_stack_local (HImode, SLOT_CW_STORED);
20242 rtx new_mode;
20243
20244 enum ix86_stack_slot slot;
20245
20246 rtx reg = gen_reg_rtx (HImode);
20247
20248 emit_insn (gen_x86_fnstcw_1 (stored_mode));
20249 emit_move_insn (reg, copy_rtx (stored_mode));
20250
20251 if (TARGET_64BIT || TARGET_PARTIAL_REG_STALL
20252 || optimize_insn_for_size_p ())
20253 {
20254 switch (mode)
20255 {
20256 case I387_CW_TRUNC:
20257 /* round toward zero (truncate) */
20258 emit_insn (gen_iorhi3 (reg, reg, GEN_INT (0x0c00)));
20259 slot = SLOT_CW_TRUNC;
20260 break;
20261
20262 case I387_CW_FLOOR:
20263 /* round down toward -oo */
20264 emit_insn (gen_andhi3 (reg, reg, GEN_INT (~0x0c00)));
20265 emit_insn (gen_iorhi3 (reg, reg, GEN_INT (0x0400)));
20266 slot = SLOT_CW_FLOOR;
20267 break;
20268
20269 case I387_CW_CEIL:
20270 /* round up toward +oo */
20271 emit_insn (gen_andhi3 (reg, reg, GEN_INT (~0x0c00)));
20272 emit_insn (gen_iorhi3 (reg, reg, GEN_INT (0x0800)));
20273 slot = SLOT_CW_CEIL;
20274 break;
20275
20276 case I387_CW_MASK_PM:
20277 /* mask precision exception for nearbyint() */
20278 emit_insn (gen_iorhi3 (reg, reg, GEN_INT (0x0020)));
20279 slot = SLOT_CW_MASK_PM;
20280 break;
20281
20282 default:
20283 gcc_unreachable ();
20284 }
20285 }
20286 else
20287 {
20288 switch (mode)
20289 {
20290 case I387_CW_TRUNC:
20291 /* round toward zero (truncate) */
20292 emit_insn (gen_insvsi_1 (reg, GEN_INT (0xc)));
20293 slot = SLOT_CW_TRUNC;
20294 break;
20295
20296 case I387_CW_FLOOR:
20297 /* round down toward -oo */
20298 emit_insn (gen_insvsi_1 (reg, GEN_INT (0x4)));
20299 slot = SLOT_CW_FLOOR;
20300 break;
20301
20302 case I387_CW_CEIL:
20303 /* round up toward +oo */
20304 emit_insn (gen_insvsi_1 (reg, GEN_INT (0x8)));
20305 slot = SLOT_CW_CEIL;
20306 break;
20307
20308 case I387_CW_MASK_PM:
20309 /* mask precision exception for nearbyint() */
20310 emit_insn (gen_iorhi3 (reg, reg, GEN_INT (0x0020)));
20311 slot = SLOT_CW_MASK_PM;
20312 break;
20313
20314 default:
20315 gcc_unreachable ();
20316 }
20317 }
20318
20319 gcc_assert (slot < MAX_386_STACK_LOCALS);
20320
20321 new_mode = assign_386_stack_local (HImode, slot);
20322 emit_move_insn (new_mode, reg);
20323 }
20324
20325 /* Emit vzeroupper. */
20326
20327 void
20328 ix86_avx_emit_vzeroupper (HARD_REG_SET regs_live)
20329 {
20330 int i;
20331
20332 /* Cancel automatic vzeroupper insertion if there are
20333 live call-saved SSE registers at the insertion point. */
20334
20335 for (i = FIRST_SSE_REG; i <= LAST_SSE_REG; i++)
20336 if (TEST_HARD_REG_BIT (regs_live, i) && !call_used_regs[i])
20337 return;
20338
20339 if (TARGET_64BIT)
20340 for (i = FIRST_REX_SSE_REG; i <= LAST_REX_SSE_REG; i++)
20341 if (TEST_HARD_REG_BIT (regs_live, i) && !call_used_regs[i])
20342 return;
20343
20344 emit_insn (gen_avx_vzeroupper ());
20345 }
20346
20349 /* Generate one or more insns to set ENTITY to MODE. HARD_REG_LIVE
20350 is the set of hard registers live at the point where the insn(s)
20351 are to be inserted. */
20352
20353 static void
20354 ix86_emit_mode_set (int entity, int mode, int prev_mode ATTRIBUTE_UNUSED,
20355 HARD_REG_SET regs_live)
20356 {
20357 switch (entity)
20358 {
20359 case X86_DIRFLAG:
20360 if (mode == X86_DIRFLAG_RESET)
20361 emit_insn (gen_cld ());
20362 break;
20363 case AVX_U128:
20364 if (mode == AVX_U128_CLEAN)
20365 ix86_avx_emit_vzeroupper (regs_live);
20366 break;
20367 case I387_TRUNC:
20368 case I387_FLOOR:
20369 case I387_CEIL:
20370 case I387_MASK_PM:
20371 if (mode != I387_CW_ANY
20372 && mode != I387_CW_UNINITIALIZED)
20373 emit_i387_cw_initialization (mode);
20374 break;
20375 default:
20376 gcc_unreachable ();
20377 }
20378 }
20379
20380 /* Output code for INSN to convert a float to a signed int. OPERANDS
20381 are the insn operands. The output may be [HSD]Imode and the input
20382 operand may be [SDX]Fmode. */
20383
20384 const char *
20385 output_fix_trunc (rtx_insn *insn, rtx *operands, bool fisttp)
20386 {
20387 int stack_top_dies = find_regno_note (insn, REG_DEAD, FIRST_STACK_REG) != 0;
20388 int dimode_p = GET_MODE (operands[0]) == DImode;
20389 int round_mode = get_attr_i387_cw (insn);
20390
20391 /* Jump through a hoop or two for DImode, since the hardware has no
20392 non-popping instruction. We used to do this a different way, but
20393 that was somewhat fragile and broke with post-reload splitters. */
20394 if ((dimode_p || fisttp) && !stack_top_dies)
20395 output_asm_insn ("fld\t%y1", operands);
20396
20397 gcc_assert (STACK_TOP_P (operands[1]));
20398 gcc_assert (MEM_P (operands[0]));
20399 gcc_assert (GET_MODE (operands[1]) != TFmode);
20400
20401 if (fisttp)
20402 output_asm_insn ("fisttp%Z0\t%0", operands);
20403 else
20404 {
20405 if (round_mode != I387_CW_ANY)
20406 output_asm_insn ("fldcw\t%3", operands);
20407 if (stack_top_dies || dimode_p)
20408 output_asm_insn ("fistp%Z0\t%0", operands);
20409 else
20410 output_asm_insn ("fist%Z0\t%0", operands);
20411 if (round_mode != I387_CW_ANY)
20412 output_asm_insn ("fldcw\t%2", operands);
20413 }
20414
20415 return "";
20416 }
20417
20418 /* Output code for x87 ffreep insn. The OPNO argument, which may only
20419 have the values zero or one, indicates the ffreep insn's operand
20420 from the OPERANDS array. */
20421
20422 static const char *
20423 output_387_ffreep (rtx *operands ATTRIBUTE_UNUSED, int opno)
20424 {
20425 if (TARGET_USE_FFREEP)
20426 #ifdef HAVE_AS_IX86_FFREEP
20427 return opno ? "ffreep\t%y1" : "ffreep\t%y0";
20428 #else
20429 {
20430 static char retval[32];
20431 int regno = REGNO (operands[opno]);
20432
20433 gcc_assert (STACK_REGNO_P (regno));
20434
20435 regno -= FIRST_STACK_REG;
20436
20437 snprintf (retval, sizeof (retval), ASM_SHORT "0xc%ddf", regno);
20438 return retval;
20439 }
20440 #endif
20441
20442 return opno ? "fstp\t%y1" : "fstp\t%y0";
20443 }
20444
20445
20446 /* Output code for INSN to compare OPERANDS. EFLAGS_P is 1 when fcomi
20447 should be used. UNORDERED_P is true when fucom should be used. */
20448
20449 const char *
20450 output_fp_compare (rtx_insn *insn, rtx *operands, bool eflags_p, bool unordered_p)
20451 {
20452 int stack_top_dies;
20453 rtx cmp_op0, cmp_op1;
20454 int is_sse = SSE_REG_P (operands[0]) || SSE_REG_P (operands[1]);
20455
20456 if (eflags_p)
20457 {
20458 cmp_op0 = operands[0];
20459 cmp_op1 = operands[1];
20460 }
20461 else
20462 {
20463 cmp_op0 = operands[1];
20464 cmp_op1 = operands[2];
20465 }
20466
20467 if (is_sse)
20468 {
20469 if (GET_MODE (operands[0]) == SFmode)
20470 if (unordered_p)
20471 return "%vucomiss\t{%1, %0|%0, %1}";
20472 else
20473 return "%vcomiss\t{%1, %0|%0, %1}";
20474 else
20475 if (unordered_p)
20476 return "%vucomisd\t{%1, %0|%0, %1}";
20477 else
20478 return "%vcomisd\t{%1, %0|%0, %1}";
20479 }
20480
20481 gcc_assert (STACK_TOP_P (cmp_op0));
20482
20483 stack_top_dies = find_regno_note (insn, REG_DEAD, FIRST_STACK_REG) != 0;
20484
20485 if (cmp_op1 == CONST0_RTX (GET_MODE (cmp_op1)))
20486 {
20487 if (stack_top_dies)
20488 {
20489 output_asm_insn ("ftst\n\tfnstsw\t%0", operands);
20490 return output_387_ffreep (operands, 1);
20491 }
20492 else
20493 return "ftst\n\tfnstsw\t%0";
20494 }
20495
20496 if (STACK_REG_P (cmp_op1)
20497 && stack_top_dies
20498 && find_regno_note (insn, REG_DEAD, REGNO (cmp_op1))
20499 && REGNO (cmp_op1) != FIRST_STACK_REG)
20500 {
20501 /* If the top of the 387 stack dies, and the other operand is also
20502 a stack register that dies, then this must be a `fcompp' float
20503 compare. */
20504
20505 if (eflags_p)
20506 {
20507 /* There is no double popping fcomi variant. Fortunately,
20508 eflags is immune from the fstp's cc clobbering. */
20509 if (unordered_p)
20510 output_asm_insn ("fucomip\t{%y1, %0|%0, %y1}", operands);
20511 else
20512 output_asm_insn ("fcomip\t{%y1, %0|%0, %y1}", operands);
20513 return output_387_ffreep (operands, 0);
20514 }
20515 else
20516 {
20517 if (unordered_p)
20518 return "fucompp\n\tfnstsw\t%0";
20519 else
20520 return "fcompp\n\tfnstsw\t%0";
20521 }
20522 }
20523 else
20524 {
20525 /* Encoded here as eflags_p | intmode | unordered_p | stack_top_dies. */
20526
20527 static const char * const alt[16] =
20528 {
20529 "fcom%Z2\t%y2\n\tfnstsw\t%0",
20530 "fcomp%Z2\t%y2\n\tfnstsw\t%0",
20531 "fucom%Z2\t%y2\n\tfnstsw\t%0",
20532 "fucomp%Z2\t%y2\n\tfnstsw\t%0",
20533
20534 "ficom%Z2\t%y2\n\tfnstsw\t%0",
20535 "ficomp%Z2\t%y2\n\tfnstsw\t%0",
20536 NULL,
20537 NULL,
20538
20539 "fcomi\t{%y1, %0|%0, %y1}",
20540 "fcomip\t{%y1, %0|%0, %y1}",
20541 "fucomi\t{%y1, %0|%0, %y1}",
20542 "fucomip\t{%y1, %0|%0, %y1}",
20543
20544 NULL,
20545 NULL,
20546 NULL,
20547 NULL
20548 };
20549
20550 int mask;
20551 const char *ret;
20552
20553 mask = eflags_p << 3;
20554 mask |= (GET_MODE_CLASS (GET_MODE (cmp_op1)) == MODE_INT) << 2;
20555 mask |= unordered_p << 1;
20556 mask |= stack_top_dies;
20557
20558 gcc_assert (mask < 16);
20559 ret = alt[mask];
20560 gcc_assert (ret);
20561
20562 return ret;
20563 }
20564 }
20565
20566 void
20567 ix86_output_addr_vec_elt (FILE *file, int value)
20568 {
20569 const char *directive = ASM_LONG;
20570
20571 #ifdef ASM_QUAD
20572 if (TARGET_LP64)
20573 directive = ASM_QUAD;
20574 #else
20575 gcc_assert (!TARGET_64BIT);
20576 #endif
20577
20578 fprintf (file, "%s%s%d\n", directive, LPREFIX, value);
20579 }
20580
20581 void
20582 ix86_output_addr_diff_elt (FILE *file, int value, int rel)
20583 {
20584 const char *directive = ASM_LONG;
20585
20586 #ifdef ASM_QUAD
20587 if (TARGET_64BIT && CASE_VECTOR_MODE == DImode)
20588 directive = ASM_QUAD;
20589 #else
20590 gcc_assert (!TARGET_64BIT);
20591 #endif
20592 /* We can't use @GOTOFF for text labels on VxWorks; see gotoff_operand. */
20593 if (TARGET_64BIT || TARGET_VXWORKS_RTP)
20594 fprintf (file, "%s%s%d-%s%d\n",
20595 directive, LPREFIX, value, LPREFIX, rel);
20596 else if (HAVE_AS_GOTOFF_IN_DATA)
20597 fprintf (file, ASM_LONG "%s%d@GOTOFF\n", LPREFIX, value);
20598 #if TARGET_MACHO
20599 else if (TARGET_MACHO)
20600 {
20601 fprintf (file, ASM_LONG "%s%d-", LPREFIX, value);
20602 machopic_output_function_base_name (file);
20603 putc ('\n', file);
20604 }
20605 #endif
20606 else
20607 asm_fprintf (file, ASM_LONG "%U%s+[.-%s%d]\n",
20608 GOT_SYMBOL_NAME, LPREFIX, value);
20609 }
20610 \f
20611 /* Generate either "mov $0, reg" or "xor reg, reg", as appropriate
20612 for the target. */
20613
20614 void
20615 ix86_expand_clear (rtx dest)
20616 {
20617 rtx tmp;
20618
20619 /* We play register width games, which are only valid after reload. */
20620 gcc_assert (reload_completed);
20621
20622 /* Avoid HImode and its attendant prefix byte. */
20623 if (GET_MODE_SIZE (GET_MODE (dest)) < 4)
20624 dest = gen_rtx_REG (SImode, REGNO (dest));
20625 tmp = gen_rtx_SET (dest, const0_rtx);
20626
20627 if (!TARGET_USE_MOV0 || optimize_insn_for_size_p ())
20628 {
20629 rtx clob = gen_rtx_CLOBBER (VOIDmode, gen_rtx_REG (CCmode, FLAGS_REG));
20630 tmp = gen_rtx_PARALLEL (VOIDmode, gen_rtvec (2, tmp, clob));
20631 }
20632
20633 emit_insn (tmp);
20634 }
20635
20636 /* X is an unchanging MEM. If it is a constant pool reference, return
20637 the constant pool rtx, else NULL. */
20638
20639 rtx
20640 maybe_get_pool_constant (rtx x)
20641 {
20642 x = ix86_delegitimize_address (XEXP (x, 0));
20643
20644 if (GET_CODE (x) == SYMBOL_REF && CONSTANT_POOL_ADDRESS_P (x))
20645 return get_pool_constant (x);
20646
20647 return NULL_RTX;
20648 }
20649
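/* Expand a move of mode MODE from operands[1] to operands[0], legitimizing
   TLS, GOT and PIC symbol references and forcing awkward constants into
   registers or memory as needed.  */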
20650 void
20651 ix86_expand_move (machine_mode mode, rtx operands[])
20652 {
20653 rtx op0, op1;
20654 rtx tmp, addend = NULL_RTX;
20655 enum tls_model model;
20656
20657 op0 = operands[0];
20658 op1 = operands[1];
20659
20660 switch (GET_CODE (op1))
20661 {
20662 case CONST:
20663 tmp = XEXP (op1, 0);
20664
20665 if (GET_CODE (tmp) != PLUS
20666 || GET_CODE (XEXP (tmp, 0)) != SYMBOL_REF)
20667 break;
20668
20669 op1 = XEXP (tmp, 0);
20670 addend = XEXP (tmp, 1);
20671 /* FALLTHRU */
20672
20673 case SYMBOL_REF:
20674 model = SYMBOL_REF_TLS_MODEL (op1);
20675
20676 if (model)
20677 op1 = legitimize_tls_address (op1, model, true);
20678 else if (ix86_force_load_from_GOT_p (op1))
20679 {
20680 /* Load the external function address via the GOT slot to avoid the PLT. */
20681 op1 = gen_rtx_UNSPEC (Pmode, gen_rtvec (1, op1),
20682 (TARGET_64BIT
20683 ? UNSPEC_GOTPCREL
20684 : UNSPEC_GOT));
20685 op1 = gen_rtx_CONST (Pmode, op1);
20686 op1 = gen_const_mem (Pmode, op1);
20687 set_mem_alias_set (op1, ix86_GOT_alias_set ());
20688 }
20689 else
20690 {
20691 tmp = legitimize_pe_coff_symbol (op1, addend != NULL_RTX);
20692 if (tmp)
20693 {
20694 op1 = tmp;
20695 if (!addend)
20696 break;
20697 }
20698 else
20699 {
20700 op1 = operands[1];
20701 break;
20702 }
20703 }
20704
20705 if (addend)
20706 {
20707 op1 = force_operand (op1, NULL_RTX);
20708 op1 = expand_simple_binop (Pmode, PLUS, op1, addend,
20709 op0, 1, OPTAB_DIRECT);
20710 }
20711 else
20712 op1 = force_operand (op1, op0);
20713
20714 if (op1 == op0)
20715 return;
20716
20717 op1 = convert_to_mode (mode, op1, 1);
20718
20719 default:
20720 break;
20721 }
20722
20723 if ((flag_pic || MACHOPIC_INDIRECT)
20724 && symbolic_operand (op1, mode))
20725 {
20726 if (TARGET_MACHO && !TARGET_64BIT)
20727 {
20728 #if TARGET_MACHO
20729 /* dynamic-no-pic */
20730 if (MACHOPIC_INDIRECT)
20731 {
20732 rtx temp = (op0 && REG_P (op0) && mode == Pmode)
20733 ? op0 : gen_reg_rtx (Pmode);
20734 op1 = machopic_indirect_data_reference (op1, temp);
20735 if (MACHOPIC_PURE)
20736 op1 = machopic_legitimize_pic_address (op1, mode,
20737 temp == op1 ? 0 : temp);
20738 }
20739 if (op0 != op1 && GET_CODE (op0) != MEM)
20740 {
20741 rtx insn = gen_rtx_SET (op0, op1);
20742 emit_insn (insn);
20743 return;
20744 }
20745 if (GET_CODE (op0) == MEM)
20746 op1 = force_reg (Pmode, op1);
20747 else
20748 {
20749 rtx temp = op0;
20750 if (GET_CODE (temp) != REG)
20751 temp = gen_reg_rtx (Pmode);
20752 temp = legitimize_pic_address (op1, temp);
20753 if (temp == op0)
20754 return;
20755 op1 = temp;
20756 }
20757 /* dynamic-no-pic */
20758 #endif
20759 }
20760 else
20761 {
20762 if (MEM_P (op0))
20763 op1 = force_reg (mode, op1);
20764 else if (!(TARGET_64BIT && x86_64_movabs_operand (op1, DImode)))
20765 {
20766 rtx reg = can_create_pseudo_p () ? NULL_RTX : op0;
20767 op1 = legitimize_pic_address (op1, reg);
20768 if (op0 == op1)
20769 return;
20770 op1 = convert_to_mode (mode, op1, 1);
20771 }
20772 }
20773 }
20774 else
20775 {
20776 if (MEM_P (op0)
20777 && (PUSH_ROUNDING (GET_MODE_SIZE (mode)) != GET_MODE_SIZE (mode)
20778 || !push_operand (op0, mode))
20779 && MEM_P (op1))
20780 op1 = force_reg (mode, op1);
20781
20782 if (push_operand (op0, mode)
20783 && ! general_no_elim_operand (op1, mode))
20784 op1 = copy_to_mode_reg (mode, op1);
20785
20786 /* Force large constants in 64bit compilation into a register
20787 to get them CSEed. */
20788 if (can_create_pseudo_p ()
20789 && (mode == DImode) && TARGET_64BIT
20790 && immediate_operand (op1, mode)
20791 && !x86_64_zext_immediate_operand (op1, VOIDmode)
20792 && !register_operand (op0, mode)
20793 && optimize)
20794 op1 = copy_to_mode_reg (mode, op1);
20795
20796 if (can_create_pseudo_p ()
20797 && CONST_DOUBLE_P (op1))
20798 {
20799 /* If we are loading a floating point constant to a register,
20800 force the value to memory now, since we'll get better code
20801 out of the back end. */
20802
20803 op1 = validize_mem (force_const_mem (mode, op1));
20804 if (!register_operand (op0, mode))
20805 {
20806 rtx temp = gen_reg_rtx (mode);
20807 emit_insn (gen_rtx_SET (temp, op1));
20808 emit_move_insn (op0, temp);
20809 return;
20810 }
20811 }
20812 }
20813
20814 emit_insn (gen_rtx_SET (op0, op1));
20815 }
20816
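/* Expand a vector (SSE/AVX) move of mode MODE from operands[1] to
   operands[0], forcing non-trivial constants to memory and handling
   insufficiently aligned memory operands.  */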
20817 void
20818 ix86_expand_vector_move (machine_mode mode, rtx operands[])
20819 {
20820 rtx op0 = operands[0], op1 = operands[1];
20821 /* Use GET_MODE_BITSIZE instead of GET_MODE_ALIGNMENT for the IA MCU
20822 psABI, since the biggest alignment is 4 bytes for the IA MCU psABI. */
20823 unsigned int align = (TARGET_IAMCU
20824 ? GET_MODE_BITSIZE (mode)
20825 : GET_MODE_ALIGNMENT (mode));
20826
20827 if (push_operand (op0, VOIDmode))
20828 op0 = emit_move_resolve_push (mode, op0);
20829
20830 /* Force constants other than zero into memory. We do not know how
20831 the instructions used to build constants modify the upper 64 bits
20832 of the register; once we have that information we may be able
20833 to handle some of them more efficiently. */
20834 if (can_create_pseudo_p ()
20835 && (CONSTANT_P (op1)
20836 || (SUBREG_P (op1)
20837 && CONSTANT_P (SUBREG_REG (op1))))
20838 && ((register_operand (op0, mode)
20839 && !standard_sse_constant_p (op1, mode))
20840 /* ix86_expand_vector_move_misalign() does not like constants. */
20841 || (SSE_REG_MODE_P (mode)
20842 && MEM_P (op0)
20843 && MEM_ALIGN (op0) < align)))
20844 {
20845 if (SUBREG_P (op1))
20846 {
20847 machine_mode imode = GET_MODE (SUBREG_REG (op1));
20848 rtx r = force_const_mem (imode, SUBREG_REG (op1));
20849 if (r)
20850 r = validize_mem (r);
20851 else
20852 r = force_reg (imode, SUBREG_REG (op1));
20853 op1 = simplify_gen_subreg (mode, r, imode, SUBREG_BYTE (op1));
20854 }
20855 else
20856 op1 = validize_mem (force_const_mem (mode, op1));
20857 }
20858
20859 /* We need to check memory alignment for SSE modes since attributes
20860 can make operands unaligned. */
20861 if (can_create_pseudo_p ()
20862 && SSE_REG_MODE_P (mode)
20863 && ((MEM_P (op0) && (MEM_ALIGN (op0) < align))
20864 || (MEM_P (op1) && (MEM_ALIGN (op1) < align))))
20865 {
20866 rtx tmp[2];
20867
20868 /* ix86_expand_vector_move_misalign() does not like both
20869 arguments in memory. */
20870 if (!register_operand (op0, mode)
20871 && !register_operand (op1, mode))
20872 op1 = force_reg (mode, op1);
20873
20874 tmp[0] = op0; tmp[1] = op1;
20875 ix86_expand_vector_move_misalign (mode, tmp);
20876 return;
20877 }
20878
20879 /* Make operand1 a register if it isn't already. */
20880 if (can_create_pseudo_p ()
20881 && !register_operand (op0, mode)
20882 && !register_operand (op1, mode))
20883 {
20884 emit_move_insn (op0, force_reg (GET_MODE (op0), op1));
20885 return;
20886 }
20887
20888 emit_insn (gen_rtx_SET (op0, op1));
20889 }
20890
20891 /* Split 32-byte AVX unaligned load and store if needed. */
20892
20893 static void
20894 ix86_avx256_split_vector_move_misalign (rtx op0, rtx op1)
20895 {
20896 rtx m;
20897 rtx (*extract) (rtx, rtx, rtx);
20898 machine_mode mode;
20899
20900 if ((MEM_P (op1) && !TARGET_AVX256_SPLIT_UNALIGNED_LOAD)
20901 || (MEM_P (op0) && !TARGET_AVX256_SPLIT_UNALIGNED_STORE))
20902 {
20903 emit_insn (gen_rtx_SET (op0, op1));
20904 return;
20905 }
20906
20907 rtx orig_op0 = NULL_RTX;
20908 mode = GET_MODE (op0);
20909 switch (GET_MODE_CLASS (mode))
20910 {
20911 case MODE_VECTOR_INT:
20912 case MODE_INT:
20913 if (mode != V32QImode)
20914 {
20915 if (!MEM_P (op0))
20916 {
20917 orig_op0 = op0;
20918 op0 = gen_reg_rtx (V32QImode);
20919 }
20920 else
20921 op0 = gen_lowpart (V32QImode, op0);
20922 op1 = gen_lowpart (V32QImode, op1);
20923 mode = V32QImode;
20924 }
20925 break;
20926 case MODE_VECTOR_FLOAT:
20927 break;
20928 default:
20929 gcc_unreachable ();
20930 }
20931
20932 switch (mode)
20933 {
20934 default:
20935 gcc_unreachable ();
20936 case V32QImode:
20937 extract = gen_avx_vextractf128v32qi;
20938 mode = V16QImode;
20939 break;
20940 case V8SFmode:
20941 extract = gen_avx_vextractf128v8sf;
20942 mode = V4SFmode;
20943 break;
20944 case V4DFmode:
20945 extract = gen_avx_vextractf128v4df;
20946 mode = V2DFmode;
20947 break;
20948 }
20949
20950 if (MEM_P (op1))
20951 {
20952 rtx r = gen_reg_rtx (mode);
20953 m = adjust_address (op1, mode, 0);
20954 emit_move_insn (r, m);
20955 m = adjust_address (op1, mode, 16);
20956 r = gen_rtx_VEC_CONCAT (GET_MODE (op0), r, m);
20957 emit_move_insn (op0, r);
20958 }
20959 else if (MEM_P (op0))
20960 {
20961 m = adjust_address (op0, mode, 0);
20962 emit_insn (extract (m, op1, const0_rtx));
20963 m = adjust_address (op0, mode, 16);
20964 emit_insn (extract (m, copy_rtx (op1), const1_rtx));
20965 }
20966 else
20967 gcc_unreachable ();
20968
20969 if (orig_op0)
20970 emit_move_insn (orig_op0, gen_lowpart (GET_MODE (orig_op0), op0));
20971 }
20972
20973 /* Implement the movmisalign patterns for SSE. Non-SSE modes go
20974 straight to ix86_expand_vector_move. */
20975 /* Code generation for scalar reg-reg moves of single and double precision data:
20976 if (x86_sse_partial_reg_dependency == true | x86_sse_split_regs == true)
20977 movaps reg, reg
20978 else
20979 movss reg, reg
20980 if (x86_sse_partial_reg_dependency == true)
20981 movapd reg, reg
20982 else
20983 movsd reg, reg
20984
20985 Code generation for scalar loads of double precision data:
20986 if (x86_sse_split_regs == true)
20987 movlpd mem, reg (gas syntax)
20988 else
20989 movsd mem, reg
20990
20991 Code generation for unaligned packed loads of single precision data
20992 (x86_sse_unaligned_move_optimal overrides x86_sse_partial_reg_dependency):
20993 if (x86_sse_unaligned_move_optimal)
20994 movups mem, reg
20995
20996 if (x86_sse_partial_reg_dependency == true)
20997 {
20998 xorps reg, reg
20999 movlps mem, reg
21000 movhps mem+8, reg
21001 }
21002 else
21003 {
21004 movlps mem, reg
21005 movhps mem+8, reg
21006 }
21007
21008 Code generation for unaligned packed loads of double precision data
21009 (x86_sse_unaligned_move_optimal overrides x86_sse_split_regs):
21010 if (x86_sse_unaligned_move_optimal)
21011 movupd mem, reg
21012
21013 if (x86_sse_split_regs == true)
21014 {
21015 movlpd mem, reg
21016 movhpd mem+8, reg
21017 }
21018 else
21019 {
21020 movsd mem, reg
21021 movhpd mem+8, reg
21022 }
21023 */
21024
21025 void
21026 ix86_expand_vector_move_misalign (machine_mode mode, rtx operands[])
21027 {
21028 rtx op0, op1, m;
21029
21030 op0 = operands[0];
21031 op1 = operands[1];
21032
21033 /* Use unaligned load/store for AVX512 or when optimizing for size. */
21034 if (GET_MODE_SIZE (mode) == 64 || optimize_insn_for_size_p ())
21035 {
21036 emit_insn (gen_rtx_SET (op0, op1));
21037 return;
21038 }
21039
21040 if (TARGET_AVX)
21041 {
21042 if (GET_MODE_SIZE (mode) == 32)
21043 ix86_avx256_split_vector_move_misalign (op0, op1);
21044 else
21045 /* Always use 128-bit mov<mode>_internal pattern for AVX. */
21046 emit_insn (gen_rtx_SET (op0, op1));
21047 return;
21048 }
21049
21050 if (TARGET_SSE_UNALIGNED_LOAD_OPTIMAL
21051 || TARGET_SSE_PACKED_SINGLE_INSN_OPTIMAL)
21052 {
21053 emit_insn (gen_rtx_SET (op0, op1));
21054 return;
21055 }
21056
21057 /* ??? If we have typed data, then it would appear that using
21058 movdqu is the only way to get unaligned data loaded with
21059 integer type. */
21060 if (TARGET_SSE2 && GET_MODE_CLASS (mode) == MODE_VECTOR_INT)
21061 {
21062 emit_insn (gen_rtx_SET (op0, op1));
21063 return;
21064 }
21065
21066 if (MEM_P (op1))
21067 {
21068 if (TARGET_SSE2 && mode == V2DFmode)
21069 {
21070 rtx zero;
21071
21072 /* When SSE registers are split into halves, we can avoid
21073 writing to the top half twice. */
21074 if (TARGET_SSE_SPLIT_REGS)
21075 {
21076 emit_clobber (op0);
21077 zero = op0;
21078 }
21079 else
21080 {
21081 /* ??? Not sure about the best option for the Intel chips.
21082 The following would seem to satisfy; the register is
21083 entirely cleared, breaking the dependency chain. We
21084 then store to the upper half, with a dependency depth
21085 of one. A rumor has it that Intel recommends two movsd
21086 followed by an unpacklpd, but this is unconfirmed. And
21087 given that the dependency depth of the unpacklpd would
21088 still be one, I'm not sure why this would be better. */
21089 zero = CONST0_RTX (V2DFmode);
21090 }
21091
21092 m = adjust_address (op1, DFmode, 0);
21093 emit_insn (gen_sse2_loadlpd (op0, zero, m));
21094 m = adjust_address (op1, DFmode, 8);
21095 emit_insn (gen_sse2_loadhpd (op0, op0, m));
21096 }
21097 else
21098 {
21099 rtx t;
21100
21101 if (mode != V4SFmode)
21102 t = gen_reg_rtx (V4SFmode);
21103 else
21104 t = op0;
21105
21106 if (TARGET_SSE_PARTIAL_REG_DEPENDENCY)
21107 emit_move_insn (t, CONST0_RTX (V4SFmode));
21108 else
21109 emit_clobber (t);
21110
21111 m = adjust_address (op1, V2SFmode, 0);
21112 emit_insn (gen_sse_loadlps (t, t, m));
21113 m = adjust_address (op1, V2SFmode, 8);
21114 emit_insn (gen_sse_loadhps (t, t, m));
21115 if (mode != V4SFmode)
21116 emit_move_insn (op0, gen_lowpart (mode, t));
21117 }
21118 }
21119 else if (MEM_P (op0))
21120 {
21121 if (TARGET_SSE2 && mode == V2DFmode)
21122 {
21123 m = adjust_address (op0, DFmode, 0);
21124 emit_insn (gen_sse2_storelpd (m, op1));
21125 m = adjust_address (op0, DFmode, 8);
21126 emit_insn (gen_sse2_storehpd (m, op1));
21127 }
21128 else
21129 {
21130 if (mode != V4SFmode)
21131 op1 = gen_lowpart (V4SFmode, op1);
21132
21133 m = adjust_address (op0, V2SFmode, 0);
21134 emit_insn (gen_sse_storelps (m, op1));
21135 m = adjust_address (op0, V2SFmode, 8);
21136 emit_insn (gen_sse_storehps (m, copy_rtx (op1)));
21137 }
21138 }
21139 else
21140 gcc_unreachable ();
21141 }
21142
21143 /* Helper function of ix86_fixup_binary_operands to canonicalize
21144 operand order. Returns true if the operands should be swapped. */
21145
21146 static bool
21147 ix86_swap_binary_operands_p (enum rtx_code code, machine_mode mode,
21148 rtx operands[])
21149 {
21150 rtx dst = operands[0];
21151 rtx src1 = operands[1];
21152 rtx src2 = operands[2];
21153
21154 /* If the operation is not commutative, we can't do anything. */
21155 if (GET_RTX_CLASS (code) != RTX_COMM_ARITH)
21156 return false;
21157
21158 /* Highest priority is that src1 should match dst. */
21159 if (rtx_equal_p (dst, src1))
21160 return false;
21161 if (rtx_equal_p (dst, src2))
21162 return true;
21163
21164 /* Next highest priority is that immediate constants come second. */
21165 if (immediate_operand (src2, mode))
21166 return false;
21167 if (immediate_operand (src1, mode))
21168 return true;
21169
21170 /* Lowest priority is that memory references should come second. */
21171 if (MEM_P (src2))
21172 return false;
21173 if (MEM_P (src1))
21174 return true;
21175
21176 return false;
21177 }
21178
21179
21180 /* Fix up OPERANDS to satisfy ix86_binary_operator_ok. Return the
21181 destination to use for the operation. If different from the true
21182 destination in operands[0], a copy operation will be required. */
21183
21184 rtx
21185 ix86_fixup_binary_operands (enum rtx_code code, machine_mode mode,
21186 rtx operands[])
21187 {
21188 rtx dst = operands[0];
21189 rtx src1 = operands[1];
21190 rtx src2 = operands[2];
21191
21192 /* Canonicalize operand order. */
21193 if (ix86_swap_binary_operands_p (code, mode, operands))
21194 {
21195 /* It is invalid to swap operands of different modes. */
21196 gcc_assert (GET_MODE (src1) == GET_MODE (src2));
21197
21198 std::swap (src1, src2);
21199 }
21200
21201 /* Both source operands cannot be in memory. */
21202 if (MEM_P (src1) && MEM_P (src2))
21203 {
21204 /* Optimization: Only read from memory once. */
21205 if (rtx_equal_p (src1, src2))
21206 {
21207 src2 = force_reg (mode, src2);
21208 src1 = src2;
21209 }
21210 else if (rtx_equal_p (dst, src1))
21211 src2 = force_reg (mode, src2);
21212 else
21213 src1 = force_reg (mode, src1);
21214 }
21215
21216 /* If the destination is memory, and we do not have matching source
21217 operands, do things in registers. */
21218 if (MEM_P (dst) && !rtx_equal_p (dst, src1))
21219 dst = gen_reg_rtx (mode);
21220
21221 /* Source 1 cannot be a constant. */
21222 if (CONSTANT_P (src1))
21223 src1 = force_reg (mode, src1);
21224
21225 /* Source 1 cannot be a non-matching memory. */
21226 if (MEM_P (src1) && !rtx_equal_p (dst, src1))
21227 src1 = force_reg (mode, src1);
21228
21229 /* Improve address combine. */
21230 if (code == PLUS
21231 && GET_MODE_CLASS (mode) == MODE_INT
21232 && MEM_P (src2))
21233 src2 = force_reg (mode, src2);
21234
21235 operands[1] = src1;
21236 operands[2] = src2;
21237 return dst;
21238 }
21239
21240 /* Similarly, but assume that the destination has already been
21241 set up properly. */
21242
21243 void
21244 ix86_fixup_binary_operands_no_copy (enum rtx_code code,
21245 machine_mode mode, rtx operands[])
21246 {
21247 rtx dst = ix86_fixup_binary_operands (code, mode, operands);
21248 gcc_assert (dst == operands[0]);
21249 }
21250
21251 /* Attempt to expand a binary operator. Make the expansion closer to the
21252 actual machine, than just general_operand, which will allow 3 separate
21253 memory references (one output, two input) in a single insn. */
21254
21255 void
21256 ix86_expand_binary_operator (enum rtx_code code, machine_mode mode,
21257 rtx operands[])
21258 {
21259 rtx src1, src2, dst, op, clob;
21260
21261 dst = ix86_fixup_binary_operands (code, mode, operands);
21262 src1 = operands[1];
21263 src2 = operands[2];
21264
21265 /* Emit the instruction. */
21266
21267 op = gen_rtx_SET (dst, gen_rtx_fmt_ee (code, mode, src1, src2));
21268
21269 if (reload_completed
21270 && code == PLUS
21271 && !rtx_equal_p (dst, src1))
21272 {
21273 /* This is going to be an LEA; avoid splitting it later. */
21274 emit_insn (op);
21275 }
21276 else
21277 {
21278 clob = gen_rtx_CLOBBER (VOIDmode, gen_rtx_REG (CCmode, FLAGS_REG));
21279 emit_insn (gen_rtx_PARALLEL (VOIDmode, gen_rtvec (2, op, clob)));
21280 }
21281
21282 /* Fix up the destination if needed. */
21283 if (dst != operands[0])
21284 emit_move_insn (operands[0], dst);
21285 }
21286
21287 /* Expand vector logical operation CODE (AND, IOR, XOR) in MODE with
21288 the given OPERANDS. */
21289
21290 void
21291 ix86_expand_vector_logical_operator (enum rtx_code code, machine_mode mode,
21292 rtx operands[])
21293 {
21294 rtx op1 = NULL_RTX, op2 = NULL_RTX;
21295 if (SUBREG_P (operands[1]))
21296 {
21297 op1 = operands[1];
21298 op2 = operands[2];
21299 }
21300 else if (SUBREG_P (operands[2]))
21301 {
21302 op1 = operands[2];
21303 op2 = operands[1];
21304 }
21305 /* Optimize (__m128i) d | (__m128i) e and similar code
21306 when d and e are float vectors into a float vector logical
21307 insn. In C/C++, without using intrinsics, there is no other way
21308 to express a vector logical operation on float vectors than
21309 to cast them temporarily to integer vectors. */
21310 if (op1
21311 && !TARGET_SSE_PACKED_SINGLE_INSN_OPTIMAL
21312 && (SUBREG_P (op2) || GET_CODE (op2) == CONST_VECTOR)
21313 && GET_MODE_CLASS (GET_MODE (SUBREG_REG (op1))) == MODE_VECTOR_FLOAT
21314 && GET_MODE_SIZE (GET_MODE (SUBREG_REG (op1))) == GET_MODE_SIZE (mode)
21315 && SUBREG_BYTE (op1) == 0
21316 && (GET_CODE (op2) == CONST_VECTOR
21317 || (GET_MODE (SUBREG_REG (op1)) == GET_MODE (SUBREG_REG (op2))
21318 && SUBREG_BYTE (op2) == 0))
21319 && can_create_pseudo_p ())
21320 {
21321 rtx dst;
21322 switch (GET_MODE (SUBREG_REG (op1)))
21323 {
21324 case V4SFmode:
21325 case V8SFmode:
21326 case V16SFmode:
21327 case V2DFmode:
21328 case V4DFmode:
21329 case V8DFmode:
21330 dst = gen_reg_rtx (GET_MODE (SUBREG_REG (op1)));
21331 if (GET_CODE (op2) == CONST_VECTOR)
21332 {
21333 op2 = gen_lowpart (GET_MODE (dst), op2);
21334 op2 = force_reg (GET_MODE (dst), op2);
21335 }
21336 else
21337 {
21338 op1 = operands[1];
21339 op2 = SUBREG_REG (operands[2]);
21340 if (!vector_operand (op2, GET_MODE (dst)))
21341 op2 = force_reg (GET_MODE (dst), op2);
21342 }
21343 op1 = SUBREG_REG (op1);
21344 if (!vector_operand (op1, GET_MODE (dst)))
21345 op1 = force_reg (GET_MODE (dst), op1);
21346 emit_insn (gen_rtx_SET (dst,
21347 gen_rtx_fmt_ee (code, GET_MODE (dst),
21348 op1, op2)));
21349 emit_move_insn (operands[0], gen_lowpart (mode, dst));
21350 return;
21351 default:
21352 break;
21353 }
21354 }
21355 if (!vector_operand (operands[1], mode))
21356 operands[1] = force_reg (mode, operands[1]);
21357 if (!vector_operand (operands[2], mode))
21358 operands[2] = force_reg (mode, operands[2]);
21359 ix86_fixup_binary_operands_no_copy (code, mode, operands);
21360 emit_insn (gen_rtx_SET (operands[0],
21361 gen_rtx_fmt_ee (code, mode, operands[1],
21362 operands[2])));
21363 }
21364
21365 /* Return TRUE or FALSE depending on whether the binary operator meets the
21366 appropriate constraints. */
21367
21368 bool
21369 ix86_binary_operator_ok (enum rtx_code code, machine_mode mode,
21370 rtx operands[3])
21371 {
21372 rtx dst = operands[0];
21373 rtx src1 = operands[1];
21374 rtx src2 = operands[2];
21375
21376 /* Both source operands cannot be in memory. */
21377 if (MEM_P (src1) && MEM_P (src2))
21378 return false;
21379
21380 /* Canonicalize operand order for commutative operators. */
21381 if (ix86_swap_binary_operands_p (code, mode, operands))
21382 std::swap (src1, src2);
21383
21384 /* If the destination is memory, we must have a matching source operand. */
21385 if (MEM_P (dst) && !rtx_equal_p (dst, src1))
21386 return false;
21387
21388 /* Source 1 cannot be a constant. */
21389 if (CONSTANT_P (src1))
21390 return false;
21391
21392 /* Source 1 cannot be a non-matching memory. */
21393 if (MEM_P (src1) && !rtx_equal_p (dst, src1))
21394 /* Support "andhi/andsi/anddi" as a zero-extending move. */
21395 return (code == AND
21396 && (mode == HImode
21397 || mode == SImode
21398 || (TARGET_64BIT && mode == DImode))
21399 && satisfies_constraint_L (src2));
21400
21401 return true;
21402 }
21403
21404 /* Attempt to expand a unary operator. Make the expansion closer to the
21405 actual machine, than just general_operand, which will allow 2 separate
21406 memory references (one output, one input) in a single insn. */
21407
21408 void
21409 ix86_expand_unary_operator (enum rtx_code code, machine_mode mode,
21410 rtx operands[])
21411 {
21412 bool matching_memory = false;
21413 rtx src, dst, op, clob;
21414
21415 dst = operands[0];
21416 src = operands[1];
21417
21418 /* If the destination is memory, and we do not have matching source
21419 operands, do things in registers. */
21420 if (MEM_P (dst))
21421 {
21422 if (rtx_equal_p (dst, src))
21423 matching_memory = true;
21424 else
21425 dst = gen_reg_rtx (mode);
21426 }
21427
21428 /* When source operand is memory, destination must match. */
21429 if (MEM_P (src) && !matching_memory)
21430 src = force_reg (mode, src);
21431
21432 /* Emit the instruction. */
21433
21434 op = gen_rtx_SET (dst, gen_rtx_fmt_e (code, mode, src));
21435
21436 if (code == NOT)
21437 emit_insn (op);
21438 else
21439 {
21440 clob = gen_rtx_CLOBBER (VOIDmode, gen_rtx_REG (CCmode, FLAGS_REG));
21441 emit_insn (gen_rtx_PARALLEL (VOIDmode, gen_rtvec (2, op, clob)));
21442 }
21443
21444 /* Fix up the destination if needed. */
21445 if (dst != operands[0])
21446 emit_move_insn (operands[0], dst);
21447 }
21448
21449 /* Split a 32-bit/64-bit divmod using an 8-bit unsigned divmod when the
21450 dividend and divisor are both within the range [0, 255]. */
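/* The idea: OR the dividend and divisor together and test the result
 against ~0xFF; if no bit above bit 7 is set, both values fit in 8 bits
 and a single 8-bit DIV suffices (quotient in AL, remainder in AH),
 otherwise the full-width divide is used. A rough sketch of the
 unsigned SImode case (register choice is illustrative only):

        mov     %eax, %ebx
        or      %ecx, %ebx
        test    $-256, %ebx
        je      .L8bit
        xor     %edx, %edx
        div     %ecx            # EAX = quotient, EDX = remainder
        jmp     .Ldone
   .L8bit:
        div     %cl             # AX / CL: AL = quotient, AH = remainder
        movzbl  %ah, %edx
        movzbl  %al, %eax
   .Ldone:                                                             */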
21451
21452 void
21453 ix86_split_idivmod (machine_mode mode, rtx operands[],
21454 bool signed_p)
21455 {
21456 rtx_code_label *end_label, *qimode_label;
21457 rtx div, mod;
21458 rtx_insn *insn;
21459 rtx scratch, tmp0, tmp1, tmp2;
21460 rtx (*gen_divmod4_1) (rtx, rtx, rtx, rtx);
21461 rtx (*gen_zero_extend) (rtx, rtx);
21462 rtx (*gen_test_ccno_1) (rtx, rtx);
21463
21464 switch (mode)
21465 {
21466 case SImode:
21467 gen_divmod4_1 = signed_p ? gen_divmodsi4_1 : gen_udivmodsi4_1;
21468 gen_test_ccno_1 = gen_testsi_ccno_1;
21469 gen_zero_extend = gen_zero_extendqisi2;
21470 break;
21471 case DImode:
21472 gen_divmod4_1 = signed_p ? gen_divmoddi4_1 : gen_udivmoddi4_1;
21473 gen_test_ccno_1 = gen_testdi_ccno_1;
21474 gen_zero_extend = gen_zero_extendqidi2;
21475 break;
21476 default:
21477 gcc_unreachable ();
21478 }
21479
21480 end_label = gen_label_rtx ();
21481 qimode_label = gen_label_rtx ();
21482
21483 scratch = gen_reg_rtx (mode);
21484
21485 /* Use 8bit unsigned divmod if dividend and divisor are within
21486 the range [0-255]. */
21487 emit_move_insn (scratch, operands[2]);
21488 scratch = expand_simple_binop (mode, IOR, scratch, operands[3],
21489 scratch, 1, OPTAB_DIRECT);
21490 emit_insn (gen_test_ccno_1 (scratch, GEN_INT (-0x100)));
21491 tmp0 = gen_rtx_REG (CCNOmode, FLAGS_REG);
21492 tmp0 = gen_rtx_EQ (VOIDmode, tmp0, const0_rtx);
21493 tmp0 = gen_rtx_IF_THEN_ELSE (VOIDmode, tmp0,
21494 gen_rtx_LABEL_REF (VOIDmode, qimode_label),
21495 pc_rtx);
21496 insn = emit_jump_insn (gen_rtx_SET (pc_rtx, tmp0));
21497 predict_jump (REG_BR_PROB_BASE * 50 / 100);
21498 JUMP_LABEL (insn) = qimode_label;
21499
21500 /* Generate original signed/unsigned divmod. */
21501 div = gen_divmod4_1 (operands[0], operands[1],
21502 operands[2], operands[3]);
21503 emit_insn (div);
21504
21505 /* Branch to the end. */
21506 emit_jump_insn (gen_jump (end_label));
21507 emit_barrier ();
21508
21509 /* Generate 8bit unsigned divide. */
21510 emit_label (qimode_label);
21511 /* Don't use operands[0] for result of 8bit divide since not all
21512 registers support QImode ZERO_EXTRACT. */
21513 tmp0 = lowpart_subreg (HImode, scratch, mode);
21514 tmp1 = lowpart_subreg (HImode, operands[2], mode);
21515 tmp2 = lowpart_subreg (QImode, operands[3], mode);
21516 emit_insn (gen_udivmodhiqi3 (tmp0, tmp1, tmp2));
21517
21518 if (signed_p)
21519 {
21520 div = gen_rtx_DIV (SImode, operands[2], operands[3]);
21521 mod = gen_rtx_MOD (SImode, operands[2], operands[3]);
21522 }
21523 else
21524 {
21525 div = gen_rtx_UDIV (SImode, operands[2], operands[3]);
21526 mod = gen_rtx_UMOD (SImode, operands[2], operands[3]);
21527 }
21528
21529 /* Extract remainder from AH. */
21530 tmp1 = gen_rtx_ZERO_EXTRACT (mode, tmp0, GEN_INT (8), GEN_INT (8));
21531 if (REG_P (operands[1]))
21532 insn = emit_move_insn (operands[1], tmp1);
21533 else
21534 {
21535 /* Need a new scratch register since the old one has result
21536 of 8bit divide. */
21537 scratch = gen_reg_rtx (mode);
21538 emit_move_insn (scratch, tmp1);
21539 insn = emit_move_insn (operands[1], scratch);
21540 }
21541 set_unique_reg_note (insn, REG_EQUAL, mod);
21542
21543 /* Zero extend quotient from AL. */
21544 tmp1 = gen_lowpart (QImode, tmp0);
21545 insn = emit_insn (gen_zero_extend (operands[0], tmp1));
21546 set_unique_reg_note (insn, REG_EQUAL, div);
21547
21548 emit_label (end_label);
21549 }
21550
21551 #define LEA_MAX_STALL (3)
21552 #define LEA_SEARCH_THRESHOLD (LEA_MAX_STALL << 1)
21553
21554 /* Increase given DISTANCE in half-cycles according to
21555 dependencies between PREV and NEXT instructions.
21556 Add 1 half-cycle if there is no dependency and
21557 go to the next cycle if there is some dependency. */
21558
21559 static unsigned int
21560 increase_distance (rtx_insn *prev, rtx_insn *next, unsigned int distance)
21561 {
21562 df_ref def, use;
21563
21564 if (!prev || !next)
21565 return distance + (distance & 1) + 2;
21566
21567 if (!DF_INSN_USES (next) || !DF_INSN_DEFS (prev))
21568 return distance + 1;
21569
21570 FOR_EACH_INSN_USE (use, next)
21571 FOR_EACH_INSN_DEF (def, prev)
21572 if (!DF_REF_IS_ARTIFICIAL (def)
21573 && DF_REF_REGNO (use) == DF_REF_REGNO (def))
21574 return distance + (distance & 1) + 2;
21575
21576 return distance + 1;
21577 }
21578
21579 /* Return true if instruction INSN defines register number
21580 REGNO1 or REGNO2. */
21581
21582 static bool
21583 insn_defines_reg (unsigned int regno1, unsigned int regno2,
21584 rtx_insn *insn)
21585 {
21586 df_ref def;
21587
21588 FOR_EACH_INSN_DEF (def, insn)
21589 if (DF_REF_REG_DEF_P (def)
21590 && !DF_REF_IS_ARTIFICIAL (def)
21591 && (regno1 == DF_REF_REGNO (def)
21592 || regno2 == DF_REF_REGNO (def)))
21593 return true;
21594
21595 return false;
21596 }
21597
21598 /* Return true if instruction INSN uses register number
21599 REGNO as part of an address expression. */
21600
21601 static bool
21602 insn_uses_reg_mem (unsigned int regno, rtx insn)
21603 {
21604 df_ref use;
21605
21606 FOR_EACH_INSN_USE (use, insn)
21607 if (DF_REF_REG_MEM_P (use) && regno == DF_REF_REGNO (use))
21608 return true;
21609
21610 return false;
21611 }
21612
21613 /* Search backward for non-agu definition of register number REGNO1
21614 or register number REGNO2 in basic block starting from instruction
21615 START up to head of basic block or instruction INSN.
21616
21617 The function stores true in *FOUND if a definition was found
21618 and false otherwise.
21619
21620 Distance in half-cycles between START and found instruction or head
21621 of BB is added to DISTANCE and returned. */
21622
21623 static int
21624 distance_non_agu_define_in_bb (unsigned int regno1, unsigned int regno2,
21625 rtx_insn *insn, int distance,
21626 rtx_insn *start, bool *found)
21627 {
21628 basic_block bb = start ? BLOCK_FOR_INSN (start) : NULL;
21629 rtx_insn *prev = start;
21630 rtx_insn *next = NULL;
21631
21632 *found = false;
21633
21634 while (prev
21635 && prev != insn
21636 && distance < LEA_SEARCH_THRESHOLD)
21637 {
21638 if (NONDEBUG_INSN_P (prev) && NONJUMP_INSN_P (prev))
21639 {
21640 distance = increase_distance (prev, next, distance);
21641 if (insn_defines_reg (regno1, regno2, prev))
21642 {
21643 if (recog_memoized (prev) < 0
21644 || get_attr_type (prev) != TYPE_LEA)
21645 {
21646 *found = true;
21647 return distance;
21648 }
21649 }
21650
21651 next = prev;
21652 }
21653 if (prev == BB_HEAD (bb))
21654 break;
21655
21656 prev = PREV_INSN (prev);
21657 }
21658
21659 return distance;
21660 }
21661
21662 /* Search backward for non-agu definition of register number REGNO1
21663 or register number REGNO2 in INSN's basic block until
21664 1. Pass LEA_SEARCH_THRESHOLD instructions, or
21665 2. Reach neighbor BBs boundary, or
21666 3. Reach agu definition.
21667 Returns the distance between the non-agu definition point and INSN.
21668 If no definition point, returns -1. */
21669
21670 static int
21671 distance_non_agu_define (unsigned int regno1, unsigned int regno2,
21672 rtx_insn *insn)
21673 {
21674 basic_block bb = BLOCK_FOR_INSN (insn);
21675 int distance = 0;
21676 bool found = false;
21677
21678 if (insn != BB_HEAD (bb))
21679 distance = distance_non_agu_define_in_bb (regno1, regno2, insn,
21680 distance, PREV_INSN (insn),
21681 &found);
21682
21683 if (!found && distance < LEA_SEARCH_THRESHOLD)
21684 {
21685 edge e;
21686 edge_iterator ei;
21687 bool simple_loop = false;
21688
21689 FOR_EACH_EDGE (e, ei, bb->preds)
21690 if (e->src == bb)
21691 {
21692 simple_loop = true;
21693 break;
21694 }
21695
21696 if (simple_loop)
21697 distance = distance_non_agu_define_in_bb (regno1, regno2,
21698 insn, distance,
21699 BB_END (bb), &found);
21700 else
21701 {
21702 int shortest_dist = -1;
21703 bool found_in_bb = false;
21704
21705 FOR_EACH_EDGE (e, ei, bb->preds)
21706 {
21707 int bb_dist
21708 = distance_non_agu_define_in_bb (regno1, regno2,
21709 insn, distance,
21710 BB_END (e->src),
21711 &found_in_bb);
21712 if (found_in_bb)
21713 {
21714 if (shortest_dist < 0)
21715 shortest_dist = bb_dist;
21716 else if (bb_dist > 0)
21717 shortest_dist = MIN (bb_dist, shortest_dist);
21718
21719 found = true;
21720 }
21721 }
21722
21723 distance = shortest_dist;
21724 }
21725 }
21726
21727 /* get_attr_type may modify recog data. We want to make sure
21728 that recog data is valid for instruction INSN, on which
21729 distance_non_agu_define is called. INSN is unchanged here. */
21730 extract_insn_cached (insn);
21731
21732 if (!found)
21733 return -1;
21734
21735 return distance >> 1;
21736 }
21737
21738 /* Return the distance in half-cycles between INSN and the next
21739 insn that uses register number REGNO in a memory address, added
21740 to DISTANCE. Return -1 if REGNO is set.
21741
21742 Put true value into *FOUND if register usage was found and
21743 false otherwise.
21744 Put true value into *REDEFINED if register redefinition was
21745 found and false otherwise. */
21746
21747 static int
21748 distance_agu_use_in_bb (unsigned int regno,
21749 rtx_insn *insn, int distance, rtx_insn *start,
21750 bool *found, bool *redefined)
21751 {
21752 basic_block bb = NULL;
21753 rtx_insn *next = start;
21754 rtx_insn *prev = NULL;
21755
21756 *found = false;
21757 *redefined = false;
21758
21759 if (start != NULL_RTX)
21760 {
21761 bb = BLOCK_FOR_INSN (start);
21762 if (start != BB_HEAD (bb))
21763 /* If insn and start belong to the same bb, set prev to insn,
21764 so the call to increase_distance will increase the distance
21765 between insns by 1. */
21766 prev = insn;
21767 }
21768
21769 while (next
21770 && next != insn
21771 && distance < LEA_SEARCH_THRESHOLD)
21772 {
21773 if (NONDEBUG_INSN_P (next) && NONJUMP_INSN_P (next))
21774 {
21775 distance = increase_distance (prev, next, distance);
21776 if (insn_uses_reg_mem (regno, next))
21777 {
21778 /* Return DISTANCE if OP0 is used in memory
21779 address in NEXT. */
21780 *found = true;
21781 return distance;
21782 }
21783
21784 if (insn_defines_reg (regno, INVALID_REGNUM, next))
21785 {
21786 /* Return -1 if OP0 is set in NEXT. */
21787 *redefined = true;
21788 return -1;
21789 }
21790
21791 prev = next;
21792 }
21793
21794 if (next == BB_END (bb))
21795 break;
21796
21797 next = NEXT_INSN (next);
21798 }
21799
21800 return distance;
21801 }
21802
21803 /* Return the distance between INSN and the next insn that uses
21804 register number REGNO0 in memory address. Return -1 if no such
21805 use is found within LEA_SEARCH_THRESHOLD or REGNO0 is set. */
21806
21807 static int
21808 distance_agu_use (unsigned int regno0, rtx_insn *insn)
21809 {
21810 basic_block bb = BLOCK_FOR_INSN (insn);
21811 int distance = 0;
21812 bool found = false;
21813 bool redefined = false;
21814
21815 if (insn != BB_END (bb))
21816 distance = distance_agu_use_in_bb (regno0, insn, distance,
21817 NEXT_INSN (insn),
21818 &found, &redefined);
21819
21820 if (!found && !redefined && distance < LEA_SEARCH_THRESHOLD)
21821 {
21822 edge e;
21823 edge_iterator ei;
21824 bool simple_loop = false;
21825
21826 FOR_EACH_EDGE (e, ei, bb->succs)
21827 if (e->dest == bb)
21828 {
21829 simple_loop = true;
21830 break;
21831 }
21832
21833 if (simple_loop)
21834 distance = distance_agu_use_in_bb (regno0, insn,
21835 distance, BB_HEAD (bb),
21836 &found, &redefined);
21837 else
21838 {
21839 int shortest_dist = -1;
21840 bool found_in_bb = false;
21841 bool redefined_in_bb = false;
21842
21843 FOR_EACH_EDGE (e, ei, bb->succs)
21844 {
21845 int bb_dist
21846 = distance_agu_use_in_bb (regno0, insn,
21847 distance, BB_HEAD (e->dest),
21848 &found_in_bb, &redefined_in_bb);
21849 if (found_in_bb)
21850 {
21851 if (shortest_dist < 0)
21852 shortest_dist = bb_dist;
21853 else if (bb_dist > 0)
21854 shortest_dist = MIN (bb_dist, shortest_dist);
21855
21856 found = true;
21857 }
21858 }
21859
21860 distance = shortest_dist;
21861 }
21862 }
21863
21864 if (!found || redefined)
21865 return -1;
21866
21867 return distance >> 1;
21868 }
21869
21870 /* Define this macro to tune LEA priority vs ADD; it takes effect when
21871 there is a dilemma of choosing LEA or ADD.
21872 Negative value: ADD is preferred over LEA
21873 Zero: Neutral
21874 Positive value: LEA is preferred over ADD. */
21875 #define IX86_LEA_PRIORITY 0
21876
21877 /* Return true if using lea INSN has a performance advantage
21878 over a sequence of instructions. The instruction sequence has
21879 SPLIT_COST cycles higher latency than the lea. */
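/* A worked example (numbers illustrative): if the address registers were
 produced by a non-LEA insn one cycle earlier (dist_define = 1) and the
 result feeds a memory address two cycles later (dist_use = 2), then
 with SPLIT_COST = 1 the adjusted definition distance becomes
 1 + 1 + IX86_LEA_PRIORITY = 2 >= dist_use, so the lea is kept. */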
21880
21881 static bool
21882 ix86_lea_outperforms (rtx_insn *insn, unsigned int regno0, unsigned int regno1,
21883 unsigned int regno2, int split_cost, bool has_scale)
21884 {
21885 int dist_define, dist_use;
21886
21887 /* For Silvermont, if a 2-source or 3-source LEA is used for a
21888 non-destructive destination, or because the ability to use
21889 SCALE is wanted, the use of LEA is justified. */
21890 if (TARGET_SILVERMONT || TARGET_INTEL)
21891 {
21892 if (has_scale)
21893 return true;
21894 if (split_cost < 1)
21895 return false;
21896 if (regno0 == regno1 || regno0 == regno2)
21897 return false;
21898 return true;
21899 }
21900
21901 dist_define = distance_non_agu_define (regno1, regno2, insn);
21902 dist_use = distance_agu_use (regno0, insn);
21903
21904 if (dist_define < 0 || dist_define >= LEA_MAX_STALL)
21905 {
21906 /* If there is no non-AGU operand definition, no AGU
21907 operand usage and the split cost is 0, then both the lea
21908 and non-lea variants have the same priority. Currently
21909 we prefer lea for 64-bit code and non-lea for 32-bit
21910 code. */
21911 if (dist_use < 0 && split_cost == 0)
21912 return TARGET_64BIT || IX86_LEA_PRIORITY;
21913 else
21914 return true;
21915 }
21916
21917 /* The longer the definition distance, the more preferable lea is.
21918 Here we adjust it to take the splitting cost and
21919 lea priority into account. */
21920 dist_define += split_cost + IX86_LEA_PRIORITY;
21921
21922 /* If there is no use in a memory address then we just check
21923 that split cost exceeds AGU stall. */
21924 if (dist_use < 0)
21925 return dist_define > LEA_MAX_STALL;
21926
21927 /* If this insn has both backward non-agu dependence and forward
21928 agu dependence, the one with the shorter distance takes effect. */
21929 return dist_define >= dist_use;
21930 }
21931
21932 /* Return true if it is legal to clobber flags by INSN and
21933 false otherwise. */
21934
21935 static bool
21936 ix86_ok_to_clobber_flags (rtx_insn *insn)
21937 {
21938 basic_block bb = BLOCK_FOR_INSN (insn);
21939 df_ref use;
21940 bitmap live;
21941
21942 while (insn)
21943 {
21944 if (NONDEBUG_INSN_P (insn))
21945 {
21946 FOR_EACH_INSN_USE (use, insn)
21947 if (DF_REF_REG_USE_P (use) && DF_REF_REGNO (use) == FLAGS_REG)
21948 return false;
21949
21950 if (insn_defines_reg (FLAGS_REG, INVALID_REGNUM, insn))
21951 return true;
21952 }
21953
21954 if (insn == BB_END (bb))
21955 break;
21956
21957 insn = NEXT_INSN (insn);
21958 }
21959
21960 live = df_get_live_out (bb);
21961 return !REGNO_REG_SET_P (live, FLAGS_REG);
21962 }
21963
21964 /* Return true if we need to split op0 = op1 + op2 into a sequence of
21965 move and add to avoid AGU stalls. */
21966
21967 bool
21968 ix86_avoid_lea_for_add (rtx_insn *insn, rtx operands[])
21969 {
21970 unsigned int regno0, regno1, regno2;
21971
21972 /* Check if we need to optimize. */
21973 if (!TARGET_OPT_AGU || optimize_function_for_size_p (cfun))
21974 return false;
21975
21976 /* Check it is correct to split here. */
21977 if (!ix86_ok_to_clobber_flags (insn))
21978 return false;
21979
21980 regno0 = true_regnum (operands[0]);
21981 regno1 = true_regnum (operands[1]);
21982 regno2 = true_regnum (operands[2]);
21983
21984 /* We only need to split adds with a non-destructive
21985 destination operand. */
21986 if (regno0 == regno1 || regno0 == regno2)
21987 return false;
21988 else
21989 return !ix86_lea_outperforms (insn, regno0, regno1, regno2, 1, false);
21990 }
21991
21992 /* Return true if we should emit lea instruction instead of mov
21993 instruction. */
21994
21995 bool
21996 ix86_use_lea_for_mov (rtx_insn *insn, rtx operands[])
21997 {
21998 unsigned int regno0, regno1;
21999
22000 /* Check if we need to optimize. */
22001 if (!TARGET_OPT_AGU || optimize_function_for_size_p (cfun))
22002 return false;
22003
22004 /* Use lea for reg to reg moves only. */
22005 if (!REG_P (operands[0]) || !REG_P (operands[1]))
22006 return false;
22007
22008 regno0 = true_regnum (operands[0]);
22009 regno1 = true_regnum (operands[1]);
22010
22011 return ix86_lea_outperforms (insn, regno0, regno1, INVALID_REGNUM, 0, false);
22012 }
22013
22014 /* Return true if we need to split lea into a sequence of
22015 instructions to avoid AGU stalls. */
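/* A worked cost example (illustrative): splitting
        lea     4(%rbx,%rcx,4), %rax
 needs a mov of the index into the destination (+1), an add of the base
 (+1), a shift for the scale (+1) and an add of the displacement (+1);
 subtracting 1 for the lea itself gives a split cost of 3. */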
22016
22017 bool
22018 ix86_avoid_lea_for_addr (rtx_insn *insn, rtx operands[])
22019 {
22020 unsigned int regno0, regno1, regno2;
22021 int split_cost;
22022 struct ix86_address parts;
22023 int ok;
22024
22025 /* Check if we need to optimize. */
22026 if (!TARGET_AVOID_LEA_FOR_ADDR || optimize_function_for_size_p (cfun))
22027 return false;
22028
22029 /* The "at least two components" test below might not catch simple
22030 move or zero extension insns if parts.base is non-NULL and parts.disp
22031 is const0_rtx as the only components in the address, e.g. if the
22032 register is %rbp or %r13. As this test is much cheaper and moves or
22033 zero extensions are the common case, do this check first. */
22034 if (REG_P (operands[1])
22035 || (SImode_address_operand (operands[1], VOIDmode)
22036 && REG_P (XEXP (operands[1], 0))))
22037 return false;
22038
22039 /* Check if it is OK to split here. */
22040 if (!ix86_ok_to_clobber_flags (insn))
22041 return false;
22042
22043 ok = ix86_decompose_address (operands[1], &parts);
22044 gcc_assert (ok);
22045
22046 /* There should be at least two components in the address. */
22047 if ((parts.base != NULL_RTX) + (parts.index != NULL_RTX)
22048 + (parts.disp != NULL_RTX) + (parts.scale > 1) < 2)
22049 return false;
22050
22051 /* We should not split into an add sequence if a non-legitimate PIC
22052 operand is used as the displacement. */
22053 if (parts.disp && flag_pic && !LEGITIMATE_PIC_OPERAND_P (parts.disp))
22054 return false;
22055
22056 regno0 = true_regnum (operands[0]);
22057 regno1 = INVALID_REGNUM;
22058 regno2 = INVALID_REGNUM;
22059
22060 if (parts.base)
22061 regno1 = true_regnum (parts.base);
22062 if (parts.index)
22063 regno2 = true_regnum (parts.index);
22064
22065 split_cost = 0;
22066
22067 /* Compute how many cycles we will add to the execution time
22068 if we split the lea into a sequence of instructions. */
22069 if (parts.base || parts.index)
22070 {
22071 /* Have to use a mov instruction if the non-destructive
22072 destination form is used. */
22073 if (regno1 != regno0 && regno2 != regno0)
22074 split_cost += 1;
22075
22076 /* Have to add index to base if both exist. */
22077 if (parts.base && parts.index)
22078 split_cost += 1;
22079
22080 /* Have to use shift and adds if scale is 2 or greater. */
22081 if (parts.scale > 1)
22082 {
22083 if (regno0 != regno1)
22084 split_cost += 1;
22085 else if (regno2 == regno0)
22086 split_cost += 4;
22087 else
22088 split_cost += parts.scale;
22089 }
22090
22091 /* Have to use add instruction with immediate if
22092 disp is non zero. */
22093 if (parts.disp && parts.disp != const0_rtx)
22094 split_cost += 1;
22095
22096 /* Subtract the price of lea. */
22097 split_cost -= 1;
22098 }
22099
22100 return !ix86_lea_outperforms (insn, regno0, regno1, regno2, split_cost,
22101 parts.scale > 1);
22102 }
22103
22104 /* Emit the x86 binary operator CODE in mode MODE, where the first operand
22105 matches the destination. The emitted RTX includes a clobber of FLAGS_REG. */
22106
22107 static void
22108 ix86_emit_binop (enum rtx_code code, machine_mode mode,
22109 rtx dst, rtx src)
22110 {
22111 rtx op, clob;
22112
22113 op = gen_rtx_SET (dst, gen_rtx_fmt_ee (code, mode, dst, src));
22114 clob = gen_rtx_CLOBBER (VOIDmode, gen_rtx_REG (CCmode, FLAGS_REG));
22115
22116 emit_insn (gen_rtx_PARALLEL (VOIDmode, gen_rtvec (2, op, clob)));
22117 }
22118
22119 /* Return true if REGNO1's definition is closer to INSN than REGNO2's. */
22120
22121 static bool
22122 find_nearest_reg_def (rtx_insn *insn, int regno1, int regno2)
22123 {
22124 rtx_insn *prev = insn;
22125 rtx_insn *start = BB_HEAD (BLOCK_FOR_INSN (insn));
22126
22127 if (insn == start)
22128 return false;
22129 while (prev && prev != start)
22130 {
22131 if (!INSN_P (prev) || !NONDEBUG_INSN_P (prev))
22132 {
22133 prev = PREV_INSN (prev);
22134 continue;
22135 }
22136 if (insn_defines_reg (regno1, INVALID_REGNUM, prev))
22137 return true;
22138 else if (insn_defines_reg (regno2, INVALID_REGNUM, prev))
22139 return false;
22140 prev = PREV_INSN (prev);
22141 }
22142
22143 /* None of the regs is defined in the bb. */
22144 return false;
22145 }
22146
22147 /* Split lea instructions into a sequence of instructions
22148 which are executed on the ALU to avoid AGU stalls.
22149 It is assumed that the flags register may be clobbered
22150 at the position of the lea. */
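/* For example (register names illustrative), an insn like
        lea     4(%rbx,%rcx,4), %rax
 may be split into
        mov     %rcx, %rax
        sal     $2, %rax
        add     %rbx, %rax
        add     $4, %rax
 so that the computation executes on the ALU instead of the AGU. */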
22151
22152 void
22153 ix86_split_lea_for_addr (rtx_insn *insn, rtx operands[], machine_mode mode)
22154 {
22155 unsigned int regno0, regno1, regno2;
22156 struct ix86_address parts;
22157 rtx target, tmp;
22158 int ok, adds;
22159
22160 ok = ix86_decompose_address (operands[1], &parts);
22161 gcc_assert (ok);
22162
22163 target = gen_lowpart (mode, operands[0]);
22164
22165 regno0 = true_regnum (target);
22166 regno1 = INVALID_REGNUM;
22167 regno2 = INVALID_REGNUM;
22168
22169 if (parts.base)
22170 {
22171 parts.base = gen_lowpart (mode, parts.base);
22172 regno1 = true_regnum (parts.base);
22173 }
22174
22175 if (parts.index)
22176 {
22177 parts.index = gen_lowpart (mode, parts.index);
22178 regno2 = true_regnum (parts.index);
22179 }
22180
22181 if (parts.disp)
22182 parts.disp = gen_lowpart (mode, parts.disp);
22183
22184 if (parts.scale > 1)
22185 {
22186 /* Case r1 = r1 + ... */
22187 if (regno1 == regno0)
22188 {
22189 /* If we have the case r1 = r1 + C * r2 then we
22190 would have to use multiplication, which is very
22191 expensive. Assume the cost model is wrong if we
22192 reach such a case here. */
22193 gcc_assert (regno2 != regno0);
22194
22195 for (adds = parts.scale; adds > 0; adds--)
22196 ix86_emit_binop (PLUS, mode, target, parts.index);
22197 }
22198 else
22199 {
22200 /* r1 = r2 + r3 * C case. Need to move r3 into r1. */
22201 if (regno0 != regno2)
22202 emit_insn (gen_rtx_SET (target, parts.index));
22203
22204 /* Use shift for scaling. */
22205 ix86_emit_binop (ASHIFT, mode, target,
22206 GEN_INT (exact_log2 (parts.scale)));
22207
22208 if (parts.base)
22209 ix86_emit_binop (PLUS, mode, target, parts.base);
22210
22211 if (parts.disp && parts.disp != const0_rtx)
22212 ix86_emit_binop (PLUS, mode, target, parts.disp);
22213 }
22214 }
22215 else if (!parts.base && !parts.index)
22216 {
22217 gcc_assert (parts.disp);
22218 emit_insn (gen_rtx_SET (target, parts.disp));
22219 }
22220 else
22221 {
22222 if (!parts.base)
22223 {
22224 if (regno0 != regno2)
22225 emit_insn (gen_rtx_SET (target, parts.index));
22226 }
22227 else if (!parts.index)
22228 {
22229 if (regno0 != regno1)
22230 emit_insn (gen_rtx_SET (target, parts.base));
22231 }
22232 else
22233 {
22234 if (regno0 == regno1)
22235 tmp = parts.index;
22236 else if (regno0 == regno2)
22237 tmp = parts.base;
22238 else
22239 {
22240 rtx tmp1;
22241
22242 /* Find the better operand for the SET instruction, depending
22243 on which definition is farther from the insn. */
22244 if (find_nearest_reg_def (insn, regno1, regno2))
22245 tmp = parts.index, tmp1 = parts.base;
22246 else
22247 tmp = parts.base, tmp1 = parts.index;
22248
22249 emit_insn (gen_rtx_SET (target, tmp));
22250
22251 if (parts.disp && parts.disp != const0_rtx)
22252 ix86_emit_binop (PLUS, mode, target, parts.disp);
22253
22254 ix86_emit_binop (PLUS, mode, target, tmp1);
22255 return;
22256 }
22257
22258 ix86_emit_binop (PLUS, mode, target, tmp);
22259 }
22260
22261 if (parts.disp && parts.disp != const0_rtx)
22262 ix86_emit_binop (PLUS, mode, target, parts.disp);
22263 }
22264 }
22265
22266 /* Return true if it is ok to optimize an ADD operation to LEA
22267 operation to avoid flag register consumption. For most processors,
22268 ADD is faster than LEA. For processors like BONNELL, if the
22269 destination register of LEA holds an actual address which will be
22270 used soon, LEA is better and otherwise ADD is better. */
22271
22272 bool
22273 ix86_lea_for_add_ok (rtx_insn *insn, rtx operands[])
22274 {
22275 unsigned int regno0 = true_regnum (operands[0]);
22276 unsigned int regno1 = true_regnum (operands[1]);
22277 unsigned int regno2 = true_regnum (operands[2]);
22278
22279 /* If a = b + c, (a!=b && a!=c), must use lea form. */
22280 if (regno0 != regno1 && regno0 != regno2)
22281 return true;
22282
22283 if (!TARGET_OPT_AGU || optimize_function_for_size_p (cfun))
22284 return false;
22285
22286 return ix86_lea_outperforms (insn, regno0, regno1, regno2, 0, false);
22287 }
22288
22289 /* Return true if destination reg of SET_BODY is shift count of
22290 USE_BODY. */
22291
22292 static bool
22293 ix86_dep_by_shift_count_body (const_rtx set_body, const_rtx use_body)
22294 {
22295 rtx set_dest;
22296 rtx shift_rtx;
22297 int i;
22298
22299 /* Retrieve destination of SET_BODY. */
22300 switch (GET_CODE (set_body))
22301 {
22302 case SET:
22303 set_dest = SET_DEST (set_body);
22304 if (!set_dest || !REG_P (set_dest))
22305 return false;
22306 break;
22307 case PARALLEL:
22308 for (i = XVECLEN (set_body, 0) - 1; i >= 0; i--)
22309 if (ix86_dep_by_shift_count_body (XVECEXP (set_body, 0, i),
22310 use_body))
22311 return true;
22312 /* FALLTHROUGH */
22313 default:
22314 return false;
22315 }
22316
22317 /* Retrieve shift count of USE_BODY. */
22318 switch (GET_CODE (use_body))
22319 {
22320 case SET:
22321 shift_rtx = XEXP (use_body, 1);
22322 break;
22323 case PARALLEL:
22324 for (i = XVECLEN (use_body, 0) - 1; i >= 0; i--)
22325 if (ix86_dep_by_shift_count_body (set_body,
22326 XVECEXP (use_body, 0, i)))
22327 return true;
22328 /* FALLTHROUGH */
22329 default:
22330 return false;
22331 }
22332
22333 if (shift_rtx
22334 && (GET_CODE (shift_rtx) == ASHIFT
22335 || GET_CODE (shift_rtx) == LSHIFTRT
22336 || GET_CODE (shift_rtx) == ASHIFTRT
22337 || GET_CODE (shift_rtx) == ROTATE
22338 || GET_CODE (shift_rtx) == ROTATERT))
22339 {
22340 rtx shift_count = XEXP (shift_rtx, 1);
22341
22342 /* Return true if shift count is dest of SET_BODY. */
22343 if (REG_P (shift_count))
22344 {
22345 /* Add this check since the function can be invoked before register
22346 allocation by the pre-reload scheduler. */
22347 if (reload_completed
22348 && true_regnum (set_dest) == true_regnum (shift_count))
22349 return true;
22350 else if (REGNO (set_dest) == REGNO (shift_count))
22351 return true;
22352 }
22353 }
22354
22355 return false;
22356 }
22357
22358 /* Return true if destination reg of SET_INSN is shift count of
22359 USE_INSN. */
22360
22361 bool
22362 ix86_dep_by_shift_count (const_rtx set_insn, const_rtx use_insn)
22363 {
22364 return ix86_dep_by_shift_count_body (PATTERN (set_insn),
22365 PATTERN (use_insn));
22366 }
22367
22368 /* Return TRUE or FALSE depending on whether the unary operator meets the
22369 appropriate constraints. */
22370
22371 bool
22372 ix86_unary_operator_ok (enum rtx_code,
22373 machine_mode,
22374 rtx operands[2])
22375 {
22376 /* If one of the operands is memory, the source and destination must match. */
22377 if ((MEM_P (operands[0])
22378 || MEM_P (operands[1]))
22379 && ! rtx_equal_p (operands[0], operands[1]))
22380 return false;
22381 return true;
22382 }
22383
22384 /* Return TRUE if the operands to a vec_interleave_{high,low}v2df
22385 are ok, keeping in mind the possible movddup alternative. */
22386
22387 bool
22388 ix86_vec_interleave_v2df_operator_ok (rtx operands[3], bool high)
22389 {
22390 if (MEM_P (operands[0]))
22391 return rtx_equal_p (operands[0], operands[1 + high]);
22392 if (MEM_P (operands[1]) && MEM_P (operands[2]))
22393 return TARGET_SSE3 && rtx_equal_p (operands[1], operands[2]);
22394 return true;
22395 }
22396
22397 /* Post-reload splitter for converting an SF or DFmode value in an
22398 SSE register into an unsigned SImode. */
22399
22400 void
22401 ix86_split_convert_uns_si_sse (rtx operands[])
22402 {
22403 machine_mode vecmode;
22404 rtx value, large, zero_or_two31, input, two31, x;
22405
22406 large = operands[1];
22407 zero_or_two31 = operands[2];
22408 input = operands[3];
22409 two31 = operands[4];
22410 vecmode = GET_MODE (large);
22411 value = gen_rtx_REG (vecmode, REGNO (operands[0]));
22412
22413 /* Load up the value into the low element. We must ensure that the other
22414 elements are valid floats -- zero is the easiest such value. */
22415 if (MEM_P (input))
22416 {
22417 if (vecmode == V4SFmode)
22418 emit_insn (gen_vec_setv4sf_0 (value, CONST0_RTX (V4SFmode), input));
22419 else
22420 emit_insn (gen_sse2_loadlpd (value, CONST0_RTX (V2DFmode), input));
22421 }
22422 else
22423 {
22424 input = gen_rtx_REG (vecmode, REGNO (input));
22425 emit_move_insn (value, CONST0_RTX (vecmode));
22426 if (vecmode == V4SFmode)
22427 emit_insn (gen_sse_movss (value, value, input));
22428 else
22429 emit_insn (gen_sse2_movsd (value, value, input));
22430 }
22431
22432 emit_move_insn (large, two31);
22433 emit_move_insn (zero_or_two31, MEM_P (two31) ? large : two31);
22434
22435 x = gen_rtx_fmt_ee (LE, vecmode, large, value);
22436 emit_insn (gen_rtx_SET (large, x));
22437
22438 x = gen_rtx_AND (vecmode, zero_or_two31, large);
22439 emit_insn (gen_rtx_SET (zero_or_two31, x));
22440
22441 x = gen_rtx_MINUS (vecmode, value, zero_or_two31);
22442 emit_insn (gen_rtx_SET (value, x));
22443
22444 large = gen_rtx_REG (V4SImode, REGNO (large));
22445 emit_insn (gen_ashlv4si3 (large, large, GEN_INT (31)));
22446
22447 x = gen_rtx_REG (V4SImode, REGNO (value));
22448 if (vecmode == V4SFmode)
22449 emit_insn (gen_fix_truncv4sfv4si2 (x, value));
22450 else
22451 emit_insn (gen_sse2_cvttpd2dq (x, value));
22452 value = x;
22453
22454 emit_insn (gen_xorv4si3 (value, value, large));
22455 }
22456
22457 /* Convert an unsigned DImode value into a DFmode, using only SSE.
22458 Expects the 64-bit DImode to be supplied in a pair of integral
22459 registers. Requires SSE2; will use SSE3 if available. For x86_32,
22460 -mfpmath=sse, !optimize_size only. */
22461
22462 void
22463 ix86_expand_convert_uns_didf_sse (rtx target, rtx input)
22464 {
22465 REAL_VALUE_TYPE bias_lo_rvt, bias_hi_rvt;
22466 rtx int_xmm, fp_xmm;
22467 rtx biases, exponents;
22468 rtx x;
22469
22470 int_xmm = gen_reg_rtx (V4SImode);
22471 if (TARGET_INTER_UNIT_MOVES_TO_VEC)
22472 emit_insn (gen_movdi_to_sse (int_xmm, input));
22473 else if (TARGET_SSE_SPLIT_REGS)
22474 {
22475 emit_clobber (int_xmm);
22476 emit_move_insn (gen_lowpart (DImode, int_xmm), input);
22477 }
22478 else
22479 {
22480 x = gen_reg_rtx (V2DImode);
22481 ix86_expand_vector_init_one_nonzero (false, V2DImode, x, input, 0);
22482 emit_move_insn (int_xmm, gen_lowpart (V4SImode, x));
22483 }
22484
22485 x = gen_rtx_CONST_VECTOR (V4SImode,
22486 gen_rtvec (4, GEN_INT (0x43300000UL),
22487 GEN_INT (0x45300000UL),
22488 const0_rtx, const0_rtx));
22489 exponents = validize_mem (force_const_mem (V4SImode, x));
22490
22491 /* int_xmm = {0x45300000UL, fp_xmm/hi, 0x43300000, fp_xmm/lo } */
22492 emit_insn (gen_vec_interleave_lowv4si (int_xmm, int_xmm, exponents));
22493
22494 /* Concatenating (juxtaposing) (0x43300000UL ## fp_value_low_xmm)
22495 yields a valid DF value equal to (0x1.0p52 + double(fp_value_lo_xmm)).
22496 Similarly (0x45300000UL ## fp_value_hi_xmm) yields
22497 (0x1.0p84 + double(fp_value_hi_xmm)).
22498 Note these exponents differ by 32. */
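/* Worked example (illustrative): for the input 2^32 + 3 the low word 3
 packs with 0x43300000 into the double 0x1.0p52 + 3.0 and the high
 word 1 packs with 0x45300000 into a double equal to 0x1.0p84 + 0x1.0p32;
 once the biases below are subtracted the two lanes hold 3.0 and
 0x1.0p32, and their sum is the desired 2^32 + 3. */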
22499
22500 fp_xmm = copy_to_mode_reg (V2DFmode, gen_lowpart (V2DFmode, int_xmm));
22501
22502 /* Subtract off those 0x1.0p52 and 0x1.0p84 biases, to produce values
22503 in [0,2**32-1] and [0]+[2**32,2**64-1] respectively. */
22504 real_ldexp (&bias_lo_rvt, &dconst1, 52);
22505 real_ldexp (&bias_hi_rvt, &dconst1, 84);
22506 biases = const_double_from_real_value (bias_lo_rvt, DFmode);
22507 x = const_double_from_real_value (bias_hi_rvt, DFmode);
22508 biases = gen_rtx_CONST_VECTOR (V2DFmode, gen_rtvec (2, biases, x));
22509 biases = validize_mem (force_const_mem (V2DFmode, biases));
22510 emit_insn (gen_subv2df3 (fp_xmm, fp_xmm, biases));
22511
22512 /* Add the upper and lower DFmode values together. */
22513 if (TARGET_SSE3)
22514 emit_insn (gen_sse3_haddv2df3 (fp_xmm, fp_xmm, fp_xmm));
22515 else
22516 {
22517 x = copy_to_mode_reg (V2DFmode, fp_xmm);
22518 emit_insn (gen_vec_interleave_highv2df (fp_xmm, fp_xmm, fp_xmm));
22519 emit_insn (gen_addv2df3 (fp_xmm, fp_xmm, x));
22520 }
22521
22522 ix86_expand_vector_extract (false, target, fp_xmm, 0);
22523 }
22524
22525 /* Not used, but eases macroization of patterns. */
22526 void
22527 ix86_expand_convert_uns_sixf_sse (rtx, rtx)
22528 {
22529 gcc_unreachable ();
22530 }
22531
22532 /* Convert an unsigned SImode value into a DFmode. Only currently used
22533 for SSE, but applicable anywhere. */
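/* The expansion adds -2^31 to the input (wrapping it into the signed
 range), converts the result with the ordinary signed SImode->DFmode
 insn, and adds 2^31.0 back. Worked example (illustrative): the input
 5 becomes the signed value -2147483643, which converts to
 -2147483643.0, and adding 2147483648.0 recovers 5.0 exactly. */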
22534
22535 void
22536 ix86_expand_convert_uns_sidf_sse (rtx target, rtx input)
22537 {
22538 REAL_VALUE_TYPE TWO31r;
22539 rtx x, fp;
22540
22541 x = expand_simple_binop (SImode, PLUS, input, GEN_INT (-2147483647 - 1),
22542 NULL, 1, OPTAB_DIRECT);
22543
22544 fp = gen_reg_rtx (DFmode);
22545 emit_insn (gen_floatsidf2 (fp, x));
22546
22547 real_ldexp (&TWO31r, &dconst1, 31);
22548 x = const_double_from_real_value (TWO31r, DFmode);
22549
22550 x = expand_simple_binop (DFmode, PLUS, fp, x, target, 0, OPTAB_DIRECT);
22551 if (x != target)
22552 emit_move_insn (target, x);
22553 }
22554
22555 /* Convert a signed DImode value into a DFmode. Only used for SSE in
22556 32-bit mode; otherwise we have a direct convert instruction. */
22557
22558 void
22559 ix86_expand_convert_sign_didf_sse (rtx target, rtx input)
22560 {
22561 REAL_VALUE_TYPE TWO32r;
22562 rtx fp_lo, fp_hi, x;
22563
22564 fp_lo = gen_reg_rtx (DFmode);
22565 fp_hi = gen_reg_rtx (DFmode);
22566
22567 emit_insn (gen_floatsidf2 (fp_hi, gen_highpart (SImode, input)));
22568
22569 real_ldexp (&TWO32r, &dconst1, 32);
22570 x = const_double_from_real_value (TWO32r, DFmode);
22571 fp_hi = expand_simple_binop (DFmode, MULT, fp_hi, x, fp_hi, 0, OPTAB_DIRECT);
22572
22573 ix86_expand_convert_uns_sidf_sse (fp_lo, gen_lowpart (SImode, input));
22574
22575 x = expand_simple_binop (DFmode, PLUS, fp_hi, fp_lo, target,
22576 0, OPTAB_DIRECT);
22577 if (x != target)
22578 emit_move_insn (target, x);
22579 }
22580
22581 /* Convert an unsigned SImode value into a SFmode, using only SSE.
22582 For x86_32, -mfpmath=sse, !optimize_size only. */
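/* The input is split into 16-bit halves, each half is converted exactly
 to SFmode, and the result is recombined as hi * 0x1.0p16 + lo. For
 example (illustrative), 0x12345678 splits into hi = 0x1234 and
 lo = 0x5678, giving 4660.0f * 65536.0f + 22136.0f, which rounds to
 the SFmode value nearest to 305419896. */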
22583 void
22584 ix86_expand_convert_uns_sisf_sse (rtx target, rtx input)
22585 {
22586 REAL_VALUE_TYPE ONE16r;
22587 rtx fp_hi, fp_lo, int_hi, int_lo, x;
22588
22589 real_ldexp (&ONE16r, &dconst1, 16);
22590 x = const_double_from_real_value (ONE16r, SFmode);
22591 int_lo = expand_simple_binop (SImode, AND, input, GEN_INT(0xffff),
22592 NULL, 0, OPTAB_DIRECT);
22593 int_hi = expand_simple_binop (SImode, LSHIFTRT, input, GEN_INT(16),
22594 NULL, 0, OPTAB_DIRECT);
22595 fp_hi = gen_reg_rtx (SFmode);
22596 fp_lo = gen_reg_rtx (SFmode);
22597 emit_insn (gen_floatsisf2 (fp_hi, int_hi));
22598 emit_insn (gen_floatsisf2 (fp_lo, int_lo));
22599 fp_hi = expand_simple_binop (SFmode, MULT, fp_hi, x, fp_hi,
22600 0, OPTAB_DIRECT);
22601 fp_hi = expand_simple_binop (SFmode, PLUS, fp_hi, fp_lo, target,
22602 0, OPTAB_DIRECT);
22603 if (!rtx_equal_p (target, fp_hi))
22604 emit_move_insn (target, fp_hi);
22605 }
22606
22607 /* floatunsv{4,8}siv{4,8}sf2 expander. Expand code to convert
22608 a vector of unsigned ints VAL to vector of floats TARGET. */
22609
22610 void
22611 ix86_expand_vector_convert_uns_vsivsf (rtx target, rtx val)
22612 {
22613 rtx tmp[8];
22614 REAL_VALUE_TYPE TWO16r;
22615 machine_mode intmode = GET_MODE (val);
22616 machine_mode fltmode = GET_MODE (target);
22617 rtx (*cvt) (rtx, rtx);
22618
22619 if (intmode == V4SImode)
22620 cvt = gen_floatv4siv4sf2;
22621 else
22622 cvt = gen_floatv8siv8sf2;
22623 tmp[0] = ix86_build_const_vector (intmode, 1, GEN_INT (0xffff));
22624 tmp[0] = force_reg (intmode, tmp[0]);
22625 tmp[1] = expand_simple_binop (intmode, AND, val, tmp[0], NULL_RTX, 1,
22626 OPTAB_DIRECT);
22627 tmp[2] = expand_simple_binop (intmode, LSHIFTRT, val, GEN_INT (16),
22628 NULL_RTX, 1, OPTAB_DIRECT);
22629 tmp[3] = gen_reg_rtx (fltmode);
22630 emit_insn (cvt (tmp[3], tmp[1]));
22631 tmp[4] = gen_reg_rtx (fltmode);
22632 emit_insn (cvt (tmp[4], tmp[2]));
22633 real_ldexp (&TWO16r, &dconst1, 16);
22634 tmp[5] = const_double_from_real_value (TWO16r, SFmode);
22635 tmp[5] = force_reg (fltmode, ix86_build_const_vector (fltmode, 1, tmp[5]));
22636 tmp[6] = expand_simple_binop (fltmode, MULT, tmp[4], tmp[5], NULL_RTX, 1,
22637 OPTAB_DIRECT);
22638 tmp[7] = expand_simple_binop (fltmode, PLUS, tmp[3], tmp[6], target, 1,
22639 OPTAB_DIRECT);
22640 if (tmp[7] != target)
22641 emit_move_insn (target, tmp[7]);
22642 }
22643
22644 /* Adjust a V*SFmode/V*DFmode value VAL so that *sfix_trunc* resp. fix_trunc*
22645 pattern can be used on it instead of *ufix_trunc* resp. fixuns_trunc*.
22646 This is done by doing just signed conversion if < 0x1p31, and otherwise by
22647 subtracting 0x1p31 first and xoring in 0x80000000 from *XORP afterwards. */
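/* Worked example (illustrative): a lane holding 3e9 compares >= 0x1p31,
 so 0x1p31 is subtracted leaving 852516352.0; the caller's signed
 truncation then yields 852516352, and xoring in the 0x80000000 bit
 taken from *XORP restores the unsigned result 3000000000. Lanes
 below 0x1p31 are left unchanged and xored with 0. */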
22648
22649 rtx
22650 ix86_expand_adjust_ufix_to_sfix_si (rtx val, rtx *xorp)
22651 {
22652 REAL_VALUE_TYPE TWO31r;
22653 rtx two31r, tmp[4];
22654 machine_mode mode = GET_MODE (val);
22655 machine_mode scalarmode = GET_MODE_INNER (mode);
22656 machine_mode intmode = GET_MODE_SIZE (mode) == 32 ? V8SImode : V4SImode;
22657 rtx (*cmp) (rtx, rtx, rtx, rtx);
22658 int i;
22659
22660 for (i = 0; i < 3; i++)
22661 tmp[i] = gen_reg_rtx (mode);
22662 real_ldexp (&TWO31r, &dconst1, 31);
22663 two31r = const_double_from_real_value (TWO31r, scalarmode);
22664 two31r = ix86_build_const_vector (mode, 1, two31r);
22665 two31r = force_reg (mode, two31r);
22666 switch (mode)
22667 {
22668 case V8SFmode: cmp = gen_avx_maskcmpv8sf3; break;
22669 case V4SFmode: cmp = gen_sse_maskcmpv4sf3; break;
22670 case V4DFmode: cmp = gen_avx_maskcmpv4df3; break;
22671 case V2DFmode: cmp = gen_sse2_maskcmpv2df3; break;
22672 default: gcc_unreachable ();
22673 }
22674 tmp[3] = gen_rtx_LE (mode, two31r, val);
22675 emit_insn (cmp (tmp[0], two31r, val, tmp[3]));
22676 tmp[1] = expand_simple_binop (mode, AND, tmp[0], two31r, tmp[1],
22677 0, OPTAB_DIRECT);
22678 if (intmode == V4SImode || TARGET_AVX2)
22679 *xorp = expand_simple_binop (intmode, ASHIFT,
22680 gen_lowpart (intmode, tmp[0]),
22681 GEN_INT (31), NULL_RTX, 0,
22682 OPTAB_DIRECT);
22683 else
22684 {
22685 rtx two31 = GEN_INT (HOST_WIDE_INT_1U << 31);
22686 two31 = ix86_build_const_vector (intmode, 1, two31);
22687 *xorp = expand_simple_binop (intmode, AND,
22688 gen_lowpart (intmode, tmp[0]),
22689 two31, NULL_RTX, 0,
22690 OPTAB_DIRECT);
22691 }
22692 return expand_simple_binop (mode, MINUS, val, tmp[1], tmp[2],
22693 0, OPTAB_DIRECT);
22694 }
22695
22696 /* A subroutine of ix86_build_signbit_mask. If VECT is true,
22697 then replicate the value for all elements of the vector
22698 register. */
22699
22700 rtx
22701 ix86_build_const_vector (machine_mode mode, bool vect, rtx value)
22702 {
22703 int i, n_elt;
22704 rtvec v;
22705 machine_mode scalar_mode;
22706
22707 switch (mode)
22708 {
22709 case V64QImode:
22710 case V32QImode:
22711 case V16QImode:
22712 case V32HImode:
22713 case V16HImode:
22714 case V8HImode:
22715 case V16SImode:
22716 case V8SImode:
22717 case V4SImode:
22718 case V8DImode:
22719 case V4DImode:
22720 case V2DImode:
22721 gcc_assert (vect);
22722 /* FALLTHRU */
22723 case V16SFmode:
22724 case V8SFmode:
22725 case V4SFmode:
22726 case V8DFmode:
22727 case V4DFmode:
22728 case V2DFmode:
22729 n_elt = GET_MODE_NUNITS (mode);
22730 v = rtvec_alloc (n_elt);
22731 scalar_mode = GET_MODE_INNER (mode);
22732
22733 RTVEC_ELT (v, 0) = value;
22734
22735 for (i = 1; i < n_elt; ++i)
22736 RTVEC_ELT (v, i) = vect ? value : CONST0_RTX (scalar_mode);
22737
22738 return gen_rtx_CONST_VECTOR (mode, v);
22739
22740 default:
22741 gcc_unreachable ();
22742 }
22743 }
22744
22745 /* A subroutine of ix86_expand_fp_absneg_operator, copysign expanders
22746 and ix86_expand_int_vcond. Create a mask for the sign bit in MODE
22747 for an SSE register. If VECT is true, then replicate the mask for
22748 all elements of the vector register. If INVERT is true, then create
22749 a mask excluding the sign bit. */
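/* For example (illustrative), for V4SFmode with VECT true the result is
 a vector whose four SImode images are 0x80000000 (only the sign bit
 set, the bit pattern of -0.0f); with INVERT true each element is
 instead 0x7fffffff, masking everything but the sign bit. */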
22750
22751 rtx
22752 ix86_build_signbit_mask (machine_mode mode, bool vect, bool invert)
22753 {
22754 machine_mode vec_mode, imode;
22755 wide_int w;
22756 rtx mask, v;
22757
22758 switch (mode)
22759 {
22760 case V16SImode:
22761 case V16SFmode:
22762 case V8SImode:
22763 case V4SImode:
22764 case V8SFmode:
22765 case V4SFmode:
22766 vec_mode = mode;
22767 imode = SImode;
22768 break;
22769
22770 case V8DImode:
22771 case V4DImode:
22772 case V2DImode:
22773 case V8DFmode:
22774 case V4DFmode:
22775 case V2DFmode:
22776 vec_mode = mode;
22777 imode = DImode;
22778 break;
22779
22780 case TImode:
22781 case TFmode:
22782 vec_mode = VOIDmode;
22783 imode = TImode;
22784 break;
22785
22786 default:
22787 gcc_unreachable ();
22788 }
22789
22790 machine_mode inner_mode = GET_MODE_INNER (mode);
22791 w = wi::set_bit_in_zero (GET_MODE_BITSIZE (inner_mode) - 1,
22792 GET_MODE_BITSIZE (inner_mode));
22793 if (invert)
22794 w = wi::bit_not (w);
22795
22796 /* Force this value into the low part of a fp vector constant. */
22797 mask = immed_wide_int_const (w, imode);
22798 mask = gen_lowpart (inner_mode, mask);
22799
22800 if (vec_mode == VOIDmode)
22801 return force_reg (inner_mode, mask);
22802
22803 v = ix86_build_const_vector (vec_mode, vect, mask);
22804 return force_reg (vec_mode, v);
22805 }
22806
22807 /* Generate code for floating point ABS or NEG. */
22808
22809 void
22810 ix86_expand_fp_absneg_operator (enum rtx_code code, machine_mode mode,
22811 rtx operands[])
22812 {
22813 rtx mask, set, dst, src;
22814 bool use_sse = false;
22815 bool vector_mode = VECTOR_MODE_P (mode);
22816 machine_mode vmode = mode;
22817
22818 if (vector_mode)
22819 use_sse = true;
22820 else if (mode == TFmode)
22821 use_sse = true;
22822 else if (TARGET_SSE_MATH)
22823 {
22824 use_sse = SSE_FLOAT_MODE_P (mode);
22825 if (mode == SFmode)
22826 vmode = V4SFmode;
22827 else if (mode == DFmode)
22828 vmode = V2DFmode;
22829 }
22830
22831 /* NEG and ABS performed with SSE use bitwise mask operations.
22832 Create the appropriate mask now. */
22833 if (use_sse)
22834 mask = ix86_build_signbit_mask (vmode, vector_mode, code == ABS);
22835 else
22836 mask = NULL_RTX;
22837
22838 dst = operands[0];
22839 src = operands[1];
22840
22841 set = gen_rtx_fmt_e (code, mode, src);
22842 set = gen_rtx_SET (dst, set);
22843
22844 if (mask)
22845 {
22846 rtx use, clob;
22847 rtvec par;
22848
22849 use = gen_rtx_USE (VOIDmode, mask);
22850 if (vector_mode)
22851 par = gen_rtvec (2, set, use);
22852 else
22853 {
22854 clob = gen_rtx_CLOBBER (VOIDmode, gen_rtx_REG (CCmode, FLAGS_REG));
22855 par = gen_rtvec (3, set, use, clob);
22856 }
22857 emit_insn (gen_rtx_PARALLEL (VOIDmode, par));
22858 }
22859 else
22860 emit_insn (set);
22861 }
22862
22863 /* Expand a copysign operation. Special case operand 0 being a constant. */
22864
22865 void
22866 ix86_expand_copysign (rtx operands[])
22867 {
22868 machine_mode mode, vmode;
22869 rtx dest, op0, op1, mask, nmask;
22870
22871 dest = operands[0];
22872 op0 = operands[1];
22873 op1 = operands[2];
22874
22875 mode = GET_MODE (dest);
22876
22877 if (mode == SFmode)
22878 vmode = V4SFmode;
22879 else if (mode == DFmode)
22880 vmode = V2DFmode;
22881 else
22882 vmode = mode;
22883
22884 if (CONST_DOUBLE_P (op0))
22885 {
22886 rtx (*copysign_insn)(rtx, rtx, rtx, rtx);
22887
22888 if (real_isneg (CONST_DOUBLE_REAL_VALUE (op0)))
22889 op0 = simplify_unary_operation (ABS, mode, op0, mode);
22890
22891 if (mode == SFmode || mode == DFmode)
22892 {
22893 if (op0 == CONST0_RTX (mode))
22894 op0 = CONST0_RTX (vmode);
22895 else
22896 {
22897 rtx v = ix86_build_const_vector (vmode, false, op0);
22898
22899 op0 = force_reg (vmode, v);
22900 }
22901 }
22902 else if (op0 != CONST0_RTX (mode))
22903 op0 = force_reg (mode, op0);
22904
22905 mask = ix86_build_signbit_mask (vmode, 0, 0);
22906
22907 if (mode == SFmode)
22908 copysign_insn = gen_copysignsf3_const;
22909 else if (mode == DFmode)
22910 copysign_insn = gen_copysigndf3_const;
22911 else
22912 copysign_insn = gen_copysigntf3_const;
22913
22914 emit_insn (copysign_insn (dest, op0, op1, mask));
22915 }
22916 else
22917 {
22918 rtx (*copysign_insn)(rtx, rtx, rtx, rtx, rtx, rtx);
22919
22920 nmask = ix86_build_signbit_mask (vmode, 0, 1);
22921 mask = ix86_build_signbit_mask (vmode, 0, 0);
22922
22923 if (mode == SFmode)
22924 copysign_insn = gen_copysignsf3_var;
22925 else if (mode == DFmode)
22926 copysign_insn = gen_copysigndf3_var;
22927 else
22928 copysign_insn = gen_copysigntf3_var;
22929
22930 emit_insn (copysign_insn (dest, NULL_RTX, op0, op1, nmask, mask));
22931 }
22932 }
22933
22934 /* Deconstruct a copysign operation into bit masks. Operand 0 is known to
22935 be a constant, and so has already been expanded into a vector constant. */
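/* Assuming the sign-source operand already occupies the destination
 register, as the defining insn pattern is expected to arrange, the
 split below amounts to DEST = (DEST & sign_mask) | |constant|: the
 AND keeps only the sign bit of the variable operand and the IOR
 merges in the constant magnitude. */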
22936
22937 void
22938 ix86_split_copysign_const (rtx operands[])
22939 {
22940 machine_mode mode, vmode;
22941 rtx dest, op0, mask, x;
22942
22943 dest = operands[0];
22944 op0 = operands[1];
22945 mask = operands[3];
22946
22947 mode = GET_MODE (dest);
22948 vmode = GET_MODE (mask);
22949
22950 dest = lowpart_subreg (vmode, dest, mode);
22951 x = gen_rtx_AND (vmode, dest, mask);
22952 emit_insn (gen_rtx_SET (dest, x));
22953
22954 if (op0 != CONST0_RTX (vmode))
22955 {
22956 x = gen_rtx_IOR (vmode, dest, op0);
22957 emit_insn (gen_rtx_SET (dest, x));
22958 }
22959 }
22960
22961 /* Deconstruct a copysign operation into bit masks. Operand 0 is variable,
22962 so we have to do two masks. */
22963
22964 void
22965 ix86_split_copysign_var (rtx operands[])
22966 {
22967 machine_mode mode, vmode;
22968 rtx dest, scratch, op0, op1, mask, nmask, x;
22969
22970 dest = operands[0];
22971 scratch = operands[1];
22972 op0 = operands[2];
22973 op1 = operands[3];
22974 nmask = operands[4];
22975 mask = operands[5];
22976
22977 mode = GET_MODE (dest);
22978 vmode = GET_MODE (mask);
22979
22980 if (rtx_equal_p (op0, op1))
22981 {
22982 /* Shouldn't happen often (it's useless, obviously), but when it does
22983 we'd generate incorrect code if we continue below. */
22984 emit_move_insn (dest, op0);
22985 return;
22986 }
22987
22988 if (REG_P (mask) && REGNO (dest) == REGNO (mask)) /* alternative 0 */
22989 {
22990 gcc_assert (REGNO (op1) == REGNO (scratch));
22991
22992 x = gen_rtx_AND (vmode, scratch, mask);
22993 emit_insn (gen_rtx_SET (scratch, x));
22994
22995 dest = mask;
22996 op0 = lowpart_subreg (vmode, op0, mode);
22997 x = gen_rtx_NOT (vmode, dest);
22998 x = gen_rtx_AND (vmode, x, op0);
22999 emit_insn (gen_rtx_SET (dest, x));
23000 }
23001 else
23002 {
23003 if (REGNO (op1) == REGNO (scratch)) /* alternative 1,3 */
23004 {
23005 x = gen_rtx_AND (vmode, scratch, mask);
23006 }
23007 else /* alternative 2,4 */
23008 {
23009 gcc_assert (REGNO (mask) == REGNO (scratch));
23010 op1 = lowpart_subreg (vmode, op1, mode);
23011 x = gen_rtx_AND (vmode, scratch, op1);
23012 }
23013 emit_insn (gen_rtx_SET (scratch, x));
23014
23015 if (REGNO (op0) == REGNO (dest)) /* alternative 1,2 */
23016 {
23017 dest = lowpart_subreg (vmode, op0, mode);
23018 x = gen_rtx_AND (vmode, dest, nmask);
23019 }
23020 else /* alternative 3,4 */
23021 {
23022 gcc_assert (REGNO (nmask) == REGNO (dest));
23023 dest = nmask;
23024 op0 = lowpart_subreg (vmode, op0, mode);
23025 x = gen_rtx_AND (vmode, dest, op0);
23026 }
23027 emit_insn (gen_rtx_SET (dest, x));
23028 }
23029
23030 x = gen_rtx_IOR (vmode, dest, scratch);
23031 emit_insn (gen_rtx_SET (dest, x));
23032 }
23033
23034 /* Return TRUE or FALSE depending on whether the first SET in INSN
23035 has a source and destination with matching CC modes and whether the
23036 CC mode is at least as constrained as REQ_MODE. */
23037
23038 bool
23039 ix86_match_ccmode (rtx insn, machine_mode req_mode)
23040 {
23041 rtx set;
23042 machine_mode set_mode;
23043
23044 set = PATTERN (insn);
23045 if (GET_CODE (set) == PARALLEL)
23046 set = XVECEXP (set, 0, 0);
23047 gcc_assert (GET_CODE (set) == SET);
23048 gcc_assert (GET_CODE (SET_SRC (set)) == COMPARE);
23049
23050 set_mode = GET_MODE (SET_DEST (set));
23051 switch (set_mode)
23052 {
23053 case CCNOmode:
23054 if (req_mode != CCNOmode
23055 && (req_mode != CCmode
23056 || XEXP (SET_SRC (set), 1) != const0_rtx))
23057 return false;
23058 break;
23059 case CCmode:
23060 if (req_mode == CCGCmode)
23061 return false;
23062 /* FALLTHRU */
23063 case CCGCmode:
23064 if (req_mode == CCGOCmode || req_mode == CCNOmode)
23065 return false;
23066 /* FALLTHRU */
23067 case CCGOCmode:
23068 if (req_mode == CCZmode)
23069 return false;
23070 /* FALLTHRU */
23071 case CCZmode:
23072 break;
23073
23074 case CCAmode:
23075 case CCCmode:
23076 case CCOmode:
23077 case CCPmode:
23078 case CCSmode:
23079 if (set_mode != req_mode)
23080 return false;
23081 break;
23082
23083 default:
23084 gcc_unreachable ();
23085 }
23086
23087 return GET_MODE (SET_SRC (set)) == set_mode;
23088 }
23089
23090 /* Generate insn patterns to do an integer compare of OPERANDS. */
23091
23092 static rtx
23093 ix86_expand_int_compare (enum rtx_code code, rtx op0, rtx op1)
23094 {
23095 machine_mode cmpmode;
23096 rtx tmp, flags;
23097
23098 cmpmode = SELECT_CC_MODE (code, op0, op1);
23099 flags = gen_rtx_REG (cmpmode, FLAGS_REG);
23100
23101 /* This is very simple, but making the interface the same as in the
23102 FP case makes the rest of the code easier. */
23103 tmp = gen_rtx_COMPARE (cmpmode, op0, op1);
23104 emit_insn (gen_rtx_SET (flags, tmp));
23105
23106 /* Return the test that should be put into the flags user, i.e.
23107 the bcc, scc, or cmov instruction. */
23108 return gen_rtx_fmt_ee (code, VOIDmode, flags, const0_rtx);
23109 }
23110
23111 /* Figure out whether to use ordered or unordered fp comparisons.
23112 Return the appropriate mode to use. */
23113
23114 machine_mode
23115 ix86_fp_compare_mode (enum rtx_code)
23116 {
23117 /* ??? In order to make all comparisons reversible, we do all comparisons
23118 non-trapping when compiling for IEEE. Once gcc is able to distinguish
23119 all forms of trapping and non-trapping comparisons, we can make inequality
23120 comparisons trapping again, since it results in better code when using
23121 FCOM based compares. */
23122 return TARGET_IEEE_FP ? CCFPUmode : CCFPmode;
23123 }
23124
23125 machine_mode
23126 ix86_cc_mode (enum rtx_code code, rtx op0, rtx op1)
23127 {
23128 machine_mode mode = GET_MODE (op0);
23129
23130 if (SCALAR_FLOAT_MODE_P (mode))
23131 {
23132 gcc_assert (!DECIMAL_FLOAT_MODE_P (mode));
23133 return ix86_fp_compare_mode (code);
23134 }
23135
23136 switch (code)
23137 {
23138 /* Only zero flag is needed. */
23139 case EQ: /* ZF=0 */
23140 case NE: /* ZF!=0 */
23141 return CCZmode;
23142 /* Codes needing carry flag. */
23143 case GEU: /* CF=0 */
23144 case LTU: /* CF=1 */
23145 /* Detect overflow checks. They need just the carry flag. */
23146 if (GET_CODE (op0) == PLUS
23147 && (rtx_equal_p (op1, XEXP (op0, 0))
23148 || rtx_equal_p (op1, XEXP (op0, 1))))
23149 return CCCmode;
23150 else
23151 return CCmode;
23152 case GTU: /* CF=0 & ZF=0 */
23153 case LEU: /* CF=1 | ZF=1 */
23154 return CCmode;
23155 /* Codes possibly doable only with sign flag when
23156 comparing against zero. */
23157 case GE: /* SF=OF or SF=0 */
23158 case LT: /* SF<>OF or SF=1 */
23159 if (op1 == const0_rtx)
23160 return CCGOCmode;
23161 else
23162 /* For other cases the carry flag is not required. */
23163 return CCGCmode;
23164 /* Codes doable only with the sign flag when comparing
23165 against zero, but for which we lack a jump instruction,
23166 so we need to use relational tests against overflow,
23167 which thus needs to be zero. */
23168 case GT: /* ZF=0 & SF=OF */
23169 case LE: /* ZF=1 | SF<>OF */
23170 if (op1 == const0_rtx)
23171 return CCNOmode;
23172 else
23173 return CCGCmode;
23174 /* The strcmp pattern does a (use flags), and combine may ask us for
23175 the proper mode. */
23176 case USE:
23177 return CCmode;
23178 default:
23179 gcc_unreachable ();
23180 }
23181 }
23182
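/* For instance, for "if (a + b < b)" the LTU comparison above is recognized
   as an overflow check and only the carry flag needs to be valid, so CCCmode
   is chosen; a plain "a == b" needs only the zero flag and gets CCZmode.  */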
23183 /* Return the fixed registers used for condition codes. */
23184
23185 static bool
23186 ix86_fixed_condition_code_regs (unsigned int *p1, unsigned int *p2)
23187 {
23188 *p1 = FLAGS_REG;
23189 *p2 = FPSR_REG;
23190 return true;
23191 }
23192
23193 /* If two condition code modes are compatible, return a condition code
23194 mode which is compatible with both. Otherwise, return
23195 VOIDmode. */
23196
23197 static machine_mode
23198 ix86_cc_modes_compatible (machine_mode m1, machine_mode m2)
23199 {
23200 if (m1 == m2)
23201 return m1;
23202
23203 if (GET_MODE_CLASS (m1) != MODE_CC || GET_MODE_CLASS (m2) != MODE_CC)
23204 return VOIDmode;
23205
23206 if ((m1 == CCGCmode && m2 == CCGOCmode)
23207 || (m1 == CCGOCmode && m2 == CCGCmode))
23208 return CCGCmode;
23209
23210 if ((m1 == CCNOmode && m2 == CCGOCmode)
23211 || (m1 == CCGOCmode && m2 == CCNOmode))
23212 return CCNOmode;
23213
23214 if (m1 == CCZmode
23215 && (m2 == CCGCmode || m2 == CCGOCmode || m2 == CCNOmode))
23216 return m2;
23217 else if (m2 == CCZmode
23218 && (m1 == CCGCmode || m1 == CCGOCmode || m1 == CCNOmode))
23219 return m1;
23220
23221 switch (m1)
23222 {
23223 default:
23224 gcc_unreachable ();
23225
23226 case CCmode:
23227 case CCGCmode:
23228 case CCGOCmode:
23229 case CCNOmode:
23230 case CCAmode:
23231 case CCCmode:
23232 case CCOmode:
23233 case CCPmode:
23234 case CCSmode:
23235 case CCZmode:
23236 switch (m2)
23237 {
23238 default:
23239 return VOIDmode;
23240
23241 case CCmode:
23242 case CCGCmode:
23243 case CCGOCmode:
23244 case CCNOmode:
23245 case CCAmode:
23246 case CCCmode:
23247 case CCOmode:
23248 case CCPmode:
23249 case CCSmode:
23250 case CCZmode:
23251 return CCmode;
23252 }
23253
23254 case CCFPmode:
23255 case CCFPUmode:
23256 /* These are only compatible with themselves, which we already
23257 checked above. */
23258 return VOIDmode;
23259 }
23260 }
23261
23262
23263 /* Return a comparison we can do that is equivalent to
23264 swap_condition (code), apart possibly from orderedness.
23265 But never change orderedness if TARGET_IEEE_FP, returning
23266 UNKNOWN in that case if necessary. */
23267
23268 static enum rtx_code
23269 ix86_fp_swap_condition (enum rtx_code code)
23270 {
23271 switch (code)
23272 {
23273 case GT: /* GTU - CF=0 & ZF=0 */
23274 return TARGET_IEEE_FP ? UNKNOWN : UNLT;
23275 case GE: /* GEU - CF=0 */
23276 return TARGET_IEEE_FP ? UNKNOWN : UNLE;
23277 case UNLT: /* LTU - CF=1 */
23278 return TARGET_IEEE_FP ? UNKNOWN : GT;
23279 case UNLE: /* LEU - CF=1 | ZF=1 */
23280 return TARGET_IEEE_FP ? UNKNOWN : GE;
23281 default:
23282 return swap_condition (code);
23283 }
23284 }
23285
23286 /* Return cost of comparison CODE using the best strategy for performance.
23287 All of the following functions use the number of instructions as the cost
23288 metric. In the future this should be tweaked to compute bytes for
23289 optimize_size and to take into account instruction timings on various CPUs. */
23290
23291 static int
23292 ix86_fp_comparison_cost (enum rtx_code code)
23293 {
23294 int arith_cost;
23295
23296 /* The cost of code using bit-twiddling on %ah. */
23297 switch (code)
23298 {
23299 case UNLE:
23300 case UNLT:
23301 case LTGT:
23302 case GT:
23303 case GE:
23304 case UNORDERED:
23305 case ORDERED:
23306 case UNEQ:
23307 arith_cost = 4;
23308 break;
23309 case LT:
23310 case NE:
23311 case EQ:
23312 case UNGE:
23313 arith_cost = TARGET_IEEE_FP ? 5 : 4;
23314 break;
23315 case LE:
23316 case UNGT:
23317 arith_cost = TARGET_IEEE_FP ? 6 : 4;
23318 break;
23319 default:
23320 gcc_unreachable ();
23321 }
23322
23323 switch (ix86_fp_comparison_strategy (code))
23324 {
23325 case IX86_FPCMP_COMI:
23326 return arith_cost > 4 ? 3 : 2;
23327 case IX86_FPCMP_SAHF:
23328 return arith_cost > 4 ? 4 : 3;
23329 default:
23330 return arith_cost;
23331 }
23332 }
23333
23334 /* Return the strategy to use for a floating-point comparison. We assume
23335 that fcomi is always preferable where available, since that also holds for size
23336 (2 bytes, vs. 3 for fnstsw+sahf and at least 5 for fnstsw+test). */
23337
23338 enum ix86_fpcmp_strategy
23339 ix86_fp_comparison_strategy (enum rtx_code)
23340 {
23341 /* Do fcomi/sahf based test when profitable. */
23342
23343 if (TARGET_CMOVE)
23344 return IX86_FPCMP_COMI;
23345
23346 if (TARGET_SAHF && (TARGET_USE_SAHF || optimize_insn_for_size_p ()))
23347 return IX86_FPCMP_SAHF;
23348
23349 return IX86_FPCMP_ARITH;
23350 }
23351
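/* In other words: with fcomi/comi available (TARGET_CMOVE) the FP compare
   writes EFLAGS directly; with sahf available the sequence is fnstsw + sahf;
   otherwise the status word is fetched with fnstsw and tested with the
   bit-twiddling on %ah that ix86_fp_comparison_cost above charges for.  */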
23352 /* Swap, force into registers, or otherwise massage the two operands
23353 to an FP comparison. The operands are updated in place; the new
23354 comparison code is returned. */
23355
23356 static enum rtx_code
23357 ix86_prepare_fp_compare_args (enum rtx_code code, rtx *pop0, rtx *pop1)
23358 {
23359 machine_mode fpcmp_mode = ix86_fp_compare_mode (code);
23360 rtx op0 = *pop0, op1 = *pop1;
23361 machine_mode op_mode = GET_MODE (op0);
23362 int is_sse = TARGET_SSE_MATH && SSE_FLOAT_MODE_P (op_mode);
23363
23364 /* All of the unordered compare instructions only work on registers.
23365 The same is true of the fcomi compare instructions. The XFmode
23366 compare instructions require registers except when comparing
23367 against zero or when converting operand 1 from fixed point to
23368 floating point. */
23369
23370 if (!is_sse
23371 && (fpcmp_mode == CCFPUmode
23372 || (op_mode == XFmode
23373 && ! (standard_80387_constant_p (op0) == 1
23374 || standard_80387_constant_p (op1) == 1)
23375 && GET_CODE (op1) != FLOAT)
23376 || ix86_fp_comparison_strategy (code) == IX86_FPCMP_COMI))
23377 {
23378 op0 = force_reg (op_mode, op0);
23379 op1 = force_reg (op_mode, op1);
23380 }
23381 else
23382 {
23383 /* %%% We only allow op1 in memory; op0 must be st(0). So swap
23384 things around if they appear profitable, otherwise force op0
23385 into a register. */
23386
23387 if (standard_80387_constant_p (op0) == 0
23388 || (MEM_P (op0)
23389 && ! (standard_80387_constant_p (op1) == 0
23390 || MEM_P (op1))))
23391 {
23392 enum rtx_code new_code = ix86_fp_swap_condition (code);
23393 if (new_code != UNKNOWN)
23394 {
23395 std::swap (op0, op1);
23396 code = new_code;
23397 }
23398 }
23399
23400 if (!REG_P (op0))
23401 op0 = force_reg (op_mode, op0);
23402
23403 if (CONSTANT_P (op1))
23404 {
23405 int tmp = standard_80387_constant_p (op1);
23406 if (tmp == 0)
23407 op1 = validize_mem (force_const_mem (op_mode, op1));
23408 else if (tmp == 1)
23409 {
23410 if (TARGET_CMOVE)
23411 op1 = force_reg (op_mode, op1);
23412 }
23413 else
23414 op1 = force_reg (op_mode, op1);
23415 }
23416 }
23417
23418 /* Try to rearrange the comparison to make it cheaper. */
23419 if (ix86_fp_comparison_cost (code)
23420 > ix86_fp_comparison_cost (swap_condition (code))
23421 && (REG_P (op1) || can_create_pseudo_p ()))
23422 {
23423 std::swap (op0, op1);
23424 code = swap_condition (code);
23425 if (!REG_P (op0))
23426 op0 = force_reg (op_mode, op0);
23427 }
23428
23429 *pop0 = op0;
23430 *pop1 = op1;
23431 return code;
23432 }
23433
23434 /* Convert the comparison codes we use to represent an FP comparison to the
23435 integer code that will result in a proper branch. Return UNKNOWN if no
23436 such code is available. */
23437
23438 enum rtx_code
23439 ix86_fp_compare_code_to_integer (enum rtx_code code)
23440 {
23441 switch (code)
23442 {
23443 case GT:
23444 return GTU;
23445 case GE:
23446 return GEU;
23447 case ORDERED:
23448 case UNORDERED:
23449 return code;
23450 case UNEQ:
23451 return EQ;
23452 case UNLT:
23453 return LTU;
23454 case UNLE:
23455 return LEU;
23456 case LTGT:
23457 return NE;
23458 default:
23459 return UNKNOWN;
23460 }
23461 }
23462
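/* The mapping above works because comi/ucomi (and fnstsw + sahf) leave
   EFLAGS looking like the result of an unsigned compare: CF is "less than",
   ZF is "equal" and PF is "unordered".  Hence GT turns into GTU (ja),
   UNLT into LTU (jb), UNLE into LEU (jbe) and so on.  */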
23463 /* Generate insn patterns to do a floating point compare of OPERANDS. */
23464
23465 static rtx
23466 ix86_expand_fp_compare (enum rtx_code code, rtx op0, rtx op1, rtx scratch)
23467 {
23468 machine_mode fpcmp_mode, intcmp_mode;
23469 rtx tmp, tmp2;
23470
23471 fpcmp_mode = ix86_fp_compare_mode (code);
23472 code = ix86_prepare_fp_compare_args (code, &op0, &op1);
23473
23474 /* Do fcomi/sahf based test when profitable. */
23475 switch (ix86_fp_comparison_strategy (code))
23476 {
23477 case IX86_FPCMP_COMI:
23478 intcmp_mode = fpcmp_mode;
23479 tmp = gen_rtx_COMPARE (fpcmp_mode, op0, op1);
23480 tmp = gen_rtx_SET (gen_rtx_REG (fpcmp_mode, FLAGS_REG), tmp);
23481 emit_insn (tmp);
23482 break;
23483
23484 case IX86_FPCMP_SAHF:
23485 intcmp_mode = fpcmp_mode;
23486 tmp = gen_rtx_COMPARE (fpcmp_mode, op0, op1);
23487 tmp = gen_rtx_SET (gen_rtx_REG (fpcmp_mode, FLAGS_REG), tmp);
23488
23489 if (!scratch)
23490 scratch = gen_reg_rtx (HImode);
23491 tmp2 = gen_rtx_CLOBBER (VOIDmode, scratch);
23492 emit_insn (gen_rtx_PARALLEL (VOIDmode, gen_rtvec (2, tmp, tmp2)));
23493 break;
23494
23495 case IX86_FPCMP_ARITH:
23496 /* Sadness wrt reg-stack pops killing fpsr -- gotta get fnstsw first. */
23497 tmp = gen_rtx_COMPARE (fpcmp_mode, op0, op1);
23498 tmp2 = gen_rtx_UNSPEC (HImode, gen_rtvec (1, tmp), UNSPEC_FNSTSW);
23499 if (!scratch)
23500 scratch = gen_reg_rtx (HImode);
23501 emit_insn (gen_rtx_SET (scratch, tmp2));
23502
23503 /* In the unordered case, we have to check C2 for NaNs, which
23504 doesn't happen to work out to anything nice combination-wise.
23505 So do some bit twiddling on the value we've got in AH to come
23506 up with an appropriate set of condition codes. */
23507
23508 intcmp_mode = CCNOmode;
23509 switch (code)
23510 {
23511 case GT:
23512 case UNGT:
23513 if (code == GT || !TARGET_IEEE_FP)
23514 {
23515 emit_insn (gen_testqi_ext_1_ccno (scratch, GEN_INT (0x45)));
23516 code = EQ;
23517 }
23518 else
23519 {
23520 emit_insn (gen_andqi_ext_1 (scratch, scratch, GEN_INT (0x45)));
23521 emit_insn (gen_addqi_ext_1 (scratch, scratch, constm1_rtx));
23522 emit_insn (gen_cmpqi_ext_3 (scratch, GEN_INT (0x44)));
23523 intcmp_mode = CCmode;
23524 code = GEU;
23525 }
23526 break;
23527 case LT:
23528 case UNLT:
23529 if (code == LT && TARGET_IEEE_FP)
23530 {
23531 emit_insn (gen_andqi_ext_1 (scratch, scratch, GEN_INT (0x45)));
23532 emit_insn (gen_cmpqi_ext_3 (scratch, const1_rtx));
23533 intcmp_mode = CCmode;
23534 code = EQ;
23535 }
23536 else
23537 {
23538 emit_insn (gen_testqi_ext_1_ccno (scratch, const1_rtx));
23539 code = NE;
23540 }
23541 break;
23542 case GE:
23543 case UNGE:
23544 if (code == GE || !TARGET_IEEE_FP)
23545 {
23546 emit_insn (gen_testqi_ext_1_ccno (scratch, GEN_INT (0x05)));
23547 code = EQ;
23548 }
23549 else
23550 {
23551 emit_insn (gen_andqi_ext_1 (scratch, scratch, GEN_INT (0x45)));
23552 emit_insn (gen_xorqi_ext_1_cc (scratch, scratch, const1_rtx));
23553 code = NE;
23554 }
23555 break;
23556 case LE:
23557 case UNLE:
23558 if (code == LE && TARGET_IEEE_FP)
23559 {
23560 emit_insn (gen_andqi_ext_1 (scratch, scratch, GEN_INT (0x45)));
23561 emit_insn (gen_addqi_ext_1 (scratch, scratch, constm1_rtx));
23562 emit_insn (gen_cmpqi_ext_3 (scratch, GEN_INT (0x40)));
23563 intcmp_mode = CCmode;
23564 code = LTU;
23565 }
23566 else
23567 {
23568 emit_insn (gen_testqi_ext_1_ccno (scratch, GEN_INT (0x45)));
23569 code = NE;
23570 }
23571 break;
23572 case EQ:
23573 case UNEQ:
23574 if (code == EQ && TARGET_IEEE_FP)
23575 {
23576 emit_insn (gen_andqi_ext_1 (scratch, scratch, GEN_INT (0x45)));
23577 emit_insn (gen_cmpqi_ext_3 (scratch, GEN_INT (0x40)));
23578 intcmp_mode = CCmode;
23579 code = EQ;
23580 }
23581 else
23582 {
23583 emit_insn (gen_testqi_ext_1_ccno (scratch, GEN_INT (0x40)));
23584 code = NE;
23585 }
23586 break;
23587 case NE:
23588 case LTGT:
23589 if (code == NE && TARGET_IEEE_FP)
23590 {
23591 emit_insn (gen_andqi_ext_1 (scratch, scratch, GEN_INT (0x45)));
23592 emit_insn (gen_xorqi_ext_1_cc (scratch, scratch,
23593 GEN_INT (0x40)));
23594 code = NE;
23595 }
23596 else
23597 {
23598 emit_insn (gen_testqi_ext_1_ccno (scratch, GEN_INT (0x40)));
23599 code = EQ;
23600 }
23601 break;
23602
23603 case UNORDERED:
23604 emit_insn (gen_testqi_ext_1_ccno (scratch, GEN_INT (0x04)));
23605 code = NE;
23606 break;
23607 case ORDERED:
23608 emit_insn (gen_testqi_ext_1_ccno (scratch, GEN_INT (0x04)));
23609 code = EQ;
23610 break;
23611
23612 default:
23613 gcc_unreachable ();
23614 }
23615 break;
23616
23617 default:
23618 gcc_unreachable();
23619 }
23620
23621 /* Return the test that should be put into the flags user, i.e.
23622 the bcc, scc, or cmov instruction. */
23623 return gen_rtx_fmt_ee (code, VOIDmode,
23624 gen_rtx_REG (intcmp_mode, FLAGS_REG),
23625 const0_rtx);
23626 }
23627
23628 static rtx
23629 ix86_expand_compare (enum rtx_code code, rtx op0, rtx op1)
23630 {
23631 rtx ret;
23632
23633 if (GET_MODE_CLASS (GET_MODE (op0)) == MODE_CC)
23634 ret = gen_rtx_fmt_ee (code, VOIDmode, op0, op1);
23635
23636 else if (SCALAR_FLOAT_MODE_P (GET_MODE (op0)))
23637 {
23638 gcc_assert (!DECIMAL_FLOAT_MODE_P (GET_MODE (op0)));
23639 ret = ix86_expand_fp_compare (code, op0, op1, NULL_RTX);
23640 }
23641 else
23642 ret = ix86_expand_int_compare (code, op0, op1);
23643
23644 return ret;
23645 }
23646
23647 void
23648 ix86_expand_branch (enum rtx_code code, rtx op0, rtx op1, rtx label)
23649 {
23650 machine_mode mode = GET_MODE (op0);
23651 rtx tmp;
23652
23653 /* Handle the special case of a vector comparison with a boolean result;
23654 transform it using the ptest instruction. */
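/* ptest sets ZF when the AND of its two operands is all zeroes, so
   xor-ing the operands and ptest-ing the result against itself yields
   a single flag that is set exactly when the vectors are equal.  */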
23655 if (GET_MODE_CLASS (mode) == MODE_VECTOR_INT)
23656 {
23657 rtx flag = gen_rtx_REG (CCZmode, FLAGS_REG);
23658 machine_mode p_mode = GET_MODE_SIZE (mode) == 32 ? V4DImode : V2DImode;
23659
23660 gcc_assert (code == EQ || code == NE);
23661 /* Generate XOR since we can't check that one operand is zero vector. */
23662 tmp = gen_reg_rtx (mode);
23663 emit_insn (gen_rtx_SET (tmp, gen_rtx_XOR (mode, op0, op1)));
23664 tmp = gen_lowpart (p_mode, tmp);
23665 emit_insn (gen_rtx_SET (gen_rtx_REG (CCmode, FLAGS_REG),
23666 gen_rtx_UNSPEC (CCmode,
23667 gen_rtvec (2, tmp, tmp),
23668 UNSPEC_PTEST)));
23669 tmp = gen_rtx_fmt_ee (code, VOIDmode, flag, const0_rtx);
23670 tmp = gen_rtx_IF_THEN_ELSE (VOIDmode, tmp,
23671 gen_rtx_LABEL_REF (VOIDmode, label),
23672 pc_rtx);
23673 emit_jump_insn (gen_rtx_SET (pc_rtx, tmp));
23674 return;
23675 }
23676
23677 switch (mode)
23678 {
23679 case SFmode:
23680 case DFmode:
23681 case XFmode:
23682 case QImode:
23683 case HImode:
23684 case SImode:
23685 simple:
23686 tmp = ix86_expand_compare (code, op0, op1);
23687 tmp = gen_rtx_IF_THEN_ELSE (VOIDmode, tmp,
23688 gen_rtx_LABEL_REF (VOIDmode, label),
23689 pc_rtx);
23690 emit_jump_insn (gen_rtx_SET (pc_rtx, tmp));
23691 return;
23692
23693 case DImode:
23694 if (TARGET_64BIT)
23695 goto simple;
23696 /* For a 32-bit target, a DI comparison may be performed in
23697 SSE registers. To allow this we should avoid the split
23698 to SI mode, which is achieved by doing the xor in DI mode
23699 and then comparing with zero (which is recognized by the
23700 STV pass). We don't compare using xor when optimizing
23701 for size. */
23702 if (!optimize_insn_for_size_p ()
23703 && TARGET_STV
23704 && (code == EQ || code == NE))
23705 {
23706 op0 = force_reg (mode, gen_rtx_XOR (mode, op0, op1));
23707 op1 = const0_rtx;
23708 }
23709 /* FALLTHRU */
23710 case TImode:
23711 /* Expand a double-word (DImode or TImode) branch into multiple compare+branch. */
23712 {
23713 rtx lo[2], hi[2];
23714 rtx_code_label *label2;
23715 enum rtx_code code1, code2, code3;
23716 machine_mode submode;
23717
23718 if (CONSTANT_P (op0) && !CONSTANT_P (op1))
23719 {
23720 std::swap (op0, op1);
23721 code = swap_condition (code);
23722 }
23723
23724 split_double_mode (mode, &op0, 1, lo+0, hi+0);
23725 split_double_mode (mode, &op1, 1, lo+1, hi+1);
23726
23727 submode = mode == DImode ? SImode : DImode;
23728
23729 /* When comparing for equality, we can use (hi0^hi1)|(lo0^lo1) to
23730 avoid two branches. This costs one extra insn, so disable when
23731 optimizing for size. */
23732
23733 if ((code == EQ || code == NE)
23734 && (!optimize_insn_for_size_p ()
23735 || hi[1] == const0_rtx || lo[1] == const0_rtx))
23736 {
23737 rtx xor0, xor1;
23738
23739 xor1 = hi[0];
23740 if (hi[1] != const0_rtx)
23741 xor1 = expand_binop (submode, xor_optab, xor1, hi[1],
23742 NULL_RTX, 0, OPTAB_WIDEN);
23743
23744 xor0 = lo[0];
23745 if (lo[1] != const0_rtx)
23746 xor0 = expand_binop (submode, xor_optab, xor0, lo[1],
23747 NULL_RTX, 0, OPTAB_WIDEN);
23748
23749 tmp = expand_binop (submode, ior_optab, xor1, xor0,
23750 NULL_RTX, 0, OPTAB_WIDEN);
23751
23752 ix86_expand_branch (code, tmp, const0_rtx, label);
23753 return;
23754 }
23755
23756 /* Otherwise, if we are doing a less-than or greater-or-equal-than
23757 comparison, op1 is a constant and its low word is zero, then we can
23758 just examine the high word. Similarly for a low word of -1 and
23759 less-or-equal-than or greater-than. */
23760
23761 if (CONST_INT_P (hi[1]))
23762 switch (code)
23763 {
23764 case LT: case LTU: case GE: case GEU:
23765 if (lo[1] == const0_rtx)
23766 {
23767 ix86_expand_branch (code, hi[0], hi[1], label);
23768 return;
23769 }
23770 break;
23771 case LE: case LEU: case GT: case GTU:
23772 if (lo[1] == constm1_rtx)
23773 {
23774 ix86_expand_branch (code, hi[0], hi[1], label);
23775 return;
23776 }
23777 break;
23778 default:
23779 break;
23780 }
23781
23782 /* Otherwise, we need two or three jumps. */
23783
23784 label2 = gen_label_rtx ();
23785
23786 code1 = code;
23787 code2 = swap_condition (code);
23788 code3 = unsigned_condition (code);
23789
23790 switch (code)
23791 {
23792 case LT: case GT: case LTU: case GTU:
23793 break;
23794
23795 case LE: code1 = LT; code2 = GT; break;
23796 case GE: code1 = GT; code2 = LT; break;
23797 case LEU: code1 = LTU; code2 = GTU; break;
23798 case GEU: code1 = GTU; code2 = LTU; break;
23799
23800 case EQ: code1 = UNKNOWN; code2 = NE; break;
23801 case NE: code2 = UNKNOWN; break;
23802
23803 default:
23804 gcc_unreachable ();
23805 }
23806
23807 /*
23808 * a < b =>
23809 * if (hi(a) < hi(b)) goto true;
23810 * if (hi(a) > hi(b)) goto false;
23811 * if (lo(a) < lo(b)) goto true;
23812 * false:
23813 */
23814
23815 if (code1 != UNKNOWN)
23816 ix86_expand_branch (code1, hi[0], hi[1], label);
23817 if (code2 != UNKNOWN)
23818 ix86_expand_branch (code2, hi[0], hi[1], label2);
23819
23820 ix86_expand_branch (code3, lo[0], lo[1], label);
23821
23822 if (code2 != UNKNOWN)
23823 emit_label (label2);
23824 return;
23825 }
23826
23827 default:
23828 gcc_assert (GET_MODE_CLASS (GET_MODE (op0)) == MODE_CC);
23829 goto simple;
23830 }
23831 }
23832
23833 /* Split branch based on floating point condition. */
23834 void
23835 ix86_split_fp_branch (enum rtx_code code, rtx op1, rtx op2,
23836 rtx target1, rtx target2, rtx tmp)
23837 {
23838 rtx condition;
23839 rtx_insn *i;
23840
23841 if (target2 != pc_rtx)
23842 {
23843 std::swap (target1, target2);
23844 code = reverse_condition_maybe_unordered (code);
23845 }
23846
23847 condition = ix86_expand_fp_compare (code, op1, op2,
23848 tmp);
23849
23850 i = emit_jump_insn (gen_rtx_SET
23851 (pc_rtx,
23852 gen_rtx_IF_THEN_ELSE (VOIDmode,
23853 condition, target1, target2)));
23854 if (split_branch_probability >= 0)
23855 add_int_reg_note (i, REG_BR_PROB, split_branch_probability);
23856 }
23857
23858 void
23859 ix86_expand_setcc (rtx dest, enum rtx_code code, rtx op0, rtx op1)
23860 {
23861 rtx ret;
23862
23863 gcc_assert (GET_MODE (dest) == QImode);
23864
23865 ret = ix86_expand_compare (code, op0, op1);
23866 PUT_MODE (ret, QImode);
23867 emit_insn (gen_rtx_SET (dest, ret));
23868 }
23869
23870 /* Expand comparison setting or clearing carry flag. Return true when
23871 successful and set pop for the operation. */
23872 static bool
23873 ix86_expand_carry_flag_compare (enum rtx_code code, rtx op0, rtx op1, rtx *pop)
23874 {
23875 machine_mode mode =
23876 GET_MODE (op0) != VOIDmode ? GET_MODE (op0) : GET_MODE (op1);
23877
23878 /* Do not handle double-mode compares that go through the special path. */
23879 if (mode == (TARGET_64BIT ? TImode : DImode))
23880 return false;
23881
23882 if (SCALAR_FLOAT_MODE_P (mode))
23883 {
23884 rtx compare_op;
23885 rtx_insn *compare_seq;
23886
23887 gcc_assert (!DECIMAL_FLOAT_MODE_P (mode));
23888
23889 /* Shortcut: the following common codes never translate
23890 into carry-flag compares. */
23891 if (code == EQ || code == NE || code == UNEQ || code == LTGT
23892 || code == ORDERED || code == UNORDERED)
23893 return false;
23894
23895 /* These comparisons require the zero flag; swap the operands so they won't. */
23896 if ((code == GT || code == UNLE || code == LE || code == UNGT)
23897 && !TARGET_IEEE_FP)
23898 {
23899 std::swap (op0, op1);
23900 code = swap_condition (code);
23901 }
23902
23903 /* Try to expand the comparison and verify that we end up with
23904 a carry-flag-based comparison. This fails to be true only when
23905 we decide to expand the comparison using arithmetic, which is
23906 not a very common scenario. */
23907 start_sequence ();
23908 compare_op = ix86_expand_fp_compare (code, op0, op1, NULL_RTX);
23909 compare_seq = get_insns ();
23910 end_sequence ();
23911
23912 if (GET_MODE (XEXP (compare_op, 0)) == CCFPmode
23913 || GET_MODE (XEXP (compare_op, 0)) == CCFPUmode)
23914 code = ix86_fp_compare_code_to_integer (GET_CODE (compare_op));
23915 else
23916 code = GET_CODE (compare_op);
23917
23918 if (code != LTU && code != GEU)
23919 return false;
23920
23921 emit_insn (compare_seq);
23922 *pop = compare_op;
23923 return true;
23924 }
23925
23926 if (!INTEGRAL_MODE_P (mode))
23927 return false;
23928
23929 switch (code)
23930 {
23931 case LTU:
23932 case GEU:
23933 break;
23934
23935 /* Convert a==0 into (unsigned)a<1. */
23936 case EQ:
23937 case NE:
23938 if (op1 != const0_rtx)
23939 return false;
23940 op1 = const1_rtx;
23941 code = (code == EQ ? LTU : GEU);
23942 break;
23943
23944 /* Convert a>b into b<a or a>=b-1. */
23945 case GTU:
23946 case LEU:
23947 if (CONST_INT_P (op1))
23948 {
23949 op1 = gen_int_mode (INTVAL (op1) + 1, GET_MODE (op0));
23950 /* Bail out on overflow. We could still swap the operands, but that
23951 would force loading the constant into a register. */
23952 if (op1 == const0_rtx
23953 || !x86_64_immediate_operand (op1, GET_MODE (op1)))
23954 return false;
23955 code = (code == GTU ? GEU : LTU);
23956 }
23957 else
23958 {
23959 std::swap (op0, op1);
23960 code = (code == GTU ? LTU : GEU);
23961 }
23962 break;
23963
23964 /* Convert a>=0 into (unsigned)a<0x80000000. */
23965 case LT:
23966 case GE:
23967 if (mode == DImode || op1 != const0_rtx)
23968 return false;
23969 op1 = gen_int_mode (1 << (GET_MODE_BITSIZE (mode) - 1), mode);
23970 code = (code == LT ? GEU : LTU);
23971 break;
23972 case LE:
23973 case GT:
23974 if (mode == DImode || op1 != constm1_rtx)
23975 return false;
23976 op1 = gen_int_mode (1 << (GET_MODE_BITSIZE (mode) - 1), mode);
23977 code = (code == LE ? GEU : LTU);
23978 break;
23979
23980 default:
23981 return false;
23982 }
23983 /* Swapping operands may cause a constant to appear as the first operand. */
23984 if (!nonimmediate_operand (op0, VOIDmode))
23985 {
23986 if (!can_create_pseudo_p ())
23987 return false;
23988 op0 = force_reg (mode, op0);
23989 }
23990 *pop = ix86_expand_compare (code, op0, op1);
23991 gcc_assert (GET_CODE (*pop) == LTU || GET_CODE (*pop) == GEU);
23992 return true;
23993 }
23994
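/* Example of the conversions above: "a == 0" becomes "(unsigned) a < 1",
   so a single "cmp $1, a" leaves the carry flag set exactly when a is
   zero, and the result can be consumed directly by sbb/adc without a
   setcc.  */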
23995 bool
23996 ix86_expand_int_movcc (rtx operands[])
23997 {
23998 enum rtx_code code = GET_CODE (operands[1]), compare_code;
23999 rtx_insn *compare_seq;
24000 rtx compare_op;
24001 machine_mode mode = GET_MODE (operands[0]);
24002 bool sign_bit_compare_p = false;
24003 rtx op0 = XEXP (operands[1], 0);
24004 rtx op1 = XEXP (operands[1], 1);
24005
24006 if (GET_MODE (op0) == TImode
24007 || (GET_MODE (op0) == DImode
24008 && !TARGET_64BIT))
24009 return false;
24010
24011 start_sequence ();
24012 compare_op = ix86_expand_compare (code, op0, op1);
24013 compare_seq = get_insns ();
24014 end_sequence ();
24015
24016 compare_code = GET_CODE (compare_op);
24017
24018 if ((op1 == const0_rtx && (code == GE || code == LT))
24019 || (op1 == constm1_rtx && (code == GT || code == LE)))
24020 sign_bit_compare_p = true;
24021
24022 /* Don't attempt mode expansion here -- if we had to expand 5 or 6
24023 HImode insns, we'd be swallowed in word prefix ops. */
24024
24025 if ((mode != HImode || TARGET_FAST_PREFIX)
24026 && (mode != (TARGET_64BIT ? TImode : DImode))
24027 && CONST_INT_P (operands[2])
24028 && CONST_INT_P (operands[3]))
24029 {
24030 rtx out = operands[0];
24031 HOST_WIDE_INT ct = INTVAL (operands[2]);
24032 HOST_WIDE_INT cf = INTVAL (operands[3]);
24033 HOST_WIDE_INT diff;
24034
24035 diff = ct - cf;
24036 /* Sign-bit compares are better done using shifts than by using
24037 sbb. */
24038 if (sign_bit_compare_p
24039 || ix86_expand_carry_flag_compare (code, op0, op1, &compare_op))
24040 {
24041 /* Detect overlap between destination and compare sources. */
24042 rtx tmp = out;
24043
24044 if (!sign_bit_compare_p)
24045 {
24046 rtx flags;
24047 bool fpcmp = false;
24048
24049 compare_code = GET_CODE (compare_op);
24050
24051 flags = XEXP (compare_op, 0);
24052
24053 if (GET_MODE (flags) == CCFPmode
24054 || GET_MODE (flags) == CCFPUmode)
24055 {
24056 fpcmp = true;
24057 compare_code
24058 = ix86_fp_compare_code_to_integer (compare_code);
24059 }
24060
24061 /* To simplify the rest of the code, restrict to the GEU case. */
24062 if (compare_code == LTU)
24063 {
24064 std::swap (ct, cf);
24065 compare_code = reverse_condition (compare_code);
24066 code = reverse_condition (code);
24067 }
24068 else
24069 {
24070 if (fpcmp)
24071 PUT_CODE (compare_op,
24072 reverse_condition_maybe_unordered
24073 (GET_CODE (compare_op)));
24074 else
24075 PUT_CODE (compare_op,
24076 reverse_condition (GET_CODE (compare_op)));
24077 }
24078 diff = ct - cf;
24079
24080 if (reg_overlap_mentioned_p (out, op0)
24081 || reg_overlap_mentioned_p (out, op1))
24082 tmp = gen_reg_rtx (mode);
24083
24084 if (mode == DImode)
24085 emit_insn (gen_x86_movdicc_0_m1 (tmp, flags, compare_op));
24086 else
24087 emit_insn (gen_x86_movsicc_0_m1 (gen_lowpart (SImode, tmp),
24088 flags, compare_op));
24089 }
24090 else
24091 {
24092 if (code == GT || code == GE)
24093 code = reverse_condition (code);
24094 else
24095 {
24096 std::swap (ct, cf);
24097 diff = ct - cf;
24098 }
24099 tmp = emit_store_flag (tmp, code, op0, op1, VOIDmode, 0, -1);
24100 }
24101
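/* At this point TMP holds -1 or 0 depending on the comparison outcome;
   the arithmetic below maps that onto the requested constants, with -1
   ending up as CF and 0 as CT (note DIFF = CT - CF).  */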
24102 if (diff == 1)
24103 {
24104 /*
24105 * cmpl op0,op1
24106 * sbbl dest,dest
24107 * [addl dest, ct]
24108 *
24109 * Size 5 - 8.
24110 */
24111 if (ct)
24112 tmp = expand_simple_binop (mode, PLUS,
24113 tmp, GEN_INT (ct),
24114 copy_rtx (tmp), 1, OPTAB_DIRECT);
24115 }
24116 else if (cf == -1)
24117 {
24118 /*
24119 * cmpl op0,op1
24120 * sbbl dest,dest
24121 * orl $ct, dest
24122 *
24123 * Size 8.
24124 */
24125 tmp = expand_simple_binop (mode, IOR,
24126 tmp, GEN_INT (ct),
24127 copy_rtx (tmp), 1, OPTAB_DIRECT);
24128 }
24129 else if (diff == -1 && ct)
24130 {
24131 /*
24132 * cmpl op0,op1
24133 * sbbl dest,dest
24134 * notl dest
24135 * [addl dest, cf]
24136 *
24137 * Size 8 - 11.
24138 */
24139 tmp = expand_simple_unop (mode, NOT, tmp, copy_rtx (tmp), 1);
24140 if (cf)
24141 tmp = expand_simple_binop (mode, PLUS,
24142 copy_rtx (tmp), GEN_INT (cf),
24143 copy_rtx (tmp), 1, OPTAB_DIRECT);
24144 }
24145 else
24146 {
24147 /*
24148 * cmpl op0,op1
24149 * sbbl dest,dest
24150 * [notl dest]
24151 * andl cf - ct, dest
24152 * [addl dest, ct]
24153 *
24154 * Size 8 - 11.
24155 */
24156
24157 if (cf == 0)
24158 {
24159 cf = ct;
24160 ct = 0;
24161 tmp = expand_simple_unop (mode, NOT, tmp, copy_rtx (tmp), 1);
24162 }
24163
24164 tmp = expand_simple_binop (mode, AND,
24165 copy_rtx (tmp),
24166 gen_int_mode (cf - ct, mode),
24167 copy_rtx (tmp), 1, OPTAB_DIRECT);
24168 if (ct)
24169 tmp = expand_simple_binop (mode, PLUS,
24170 copy_rtx (tmp), GEN_INT (ct),
24171 copy_rtx (tmp), 1, OPTAB_DIRECT);
24172 }
24173
24174 if (!rtx_equal_p (tmp, out))
24175 emit_move_insn (copy_rtx (out), copy_rtx (tmp));
24176
24177 return true;
24178 }
24179
24180 if (diff < 0)
24181 {
24182 machine_mode cmp_mode = GET_MODE (op0);
24183 enum rtx_code new_code;
24184
24185 if (SCALAR_FLOAT_MODE_P (cmp_mode))
24186 {
24187 gcc_assert (!DECIMAL_FLOAT_MODE_P (cmp_mode));
24188
24189 /* We may be reversing an unordered compare to a normal compare, which
24190 is not valid in general (we may convert a non-trapping condition
24191 to a trapping one); however, on i386 we currently emit all
24192 comparisons unordered. */
24193 new_code = reverse_condition_maybe_unordered (code);
24194 }
24195 else
24196 new_code = ix86_reverse_condition (code, cmp_mode);
24197 if (new_code != UNKNOWN)
24198 {
24199 std::swap (ct, cf);
24200 diff = -diff;
24201 code = new_code;
24202 }
24203 }
24204
24205 compare_code = UNKNOWN;
24206 if (GET_MODE_CLASS (GET_MODE (op0)) == MODE_INT
24207 && CONST_INT_P (op1))
24208 {
24209 if (op1 == const0_rtx
24210 && (code == LT || code == GE))
24211 compare_code = code;
24212 else if (op1 == constm1_rtx)
24213 {
24214 if (code == LE)
24215 compare_code = LT;
24216 else if (code == GT)
24217 compare_code = GE;
24218 }
24219 }
24220
24221 /* Optimize dest = (op0 < 0) ? -1 : cf. */
24222 if (compare_code != UNKNOWN
24223 && GET_MODE (op0) == GET_MODE (out)
24224 && (cf == -1 || ct == -1))
24225 {
24226 /* If the lea code below could be used, only optimize
24227 if it results in a 2-insn sequence. */
24228
24229 if (! (diff == 1 || diff == 2 || diff == 4 || diff == 8
24230 || diff == 3 || diff == 5 || diff == 9)
24231 || (compare_code == LT && ct == -1)
24232 || (compare_code == GE && cf == -1))
24233 {
24234 /*
24235 * notl op1 (if necessary)
24236 * sarl $31, op1
24237 * orl cf, op1
24238 */
24239 if (ct != -1)
24240 {
24241 cf = ct;
24242 ct = -1;
24243 code = reverse_condition (code);
24244 }
24245
24246 out = emit_store_flag (out, code, op0, op1, VOIDmode, 0, -1);
24247
24248 out = expand_simple_binop (mode, IOR,
24249 out, GEN_INT (cf),
24250 out, 1, OPTAB_DIRECT);
24251 if (out != operands[0])
24252 emit_move_insn (operands[0], out);
24253
24254 return true;
24255 }
24256 }
24257
24258
24259 if ((diff == 1 || diff == 2 || diff == 4 || diff == 8
24260 || diff == 3 || diff == 5 || diff == 9)
24261 && ((mode != QImode && mode != HImode) || !TARGET_PARTIAL_REG_STALL)
24262 && (mode != DImode
24263 || x86_64_immediate_operand (GEN_INT (cf), VOIDmode)))
24264 {
24265 /*
24266 * xorl dest,dest
24267 * cmpl op1,op2
24268 * setcc dest
24269 * lea cf(dest*(ct-cf)),dest
24270 *
24271 * Size 14.
24272 *
24273 * This also catches the degenerate setcc-only case.
24274 */
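/* For example, for dest = (cond ? 7 : 3) we get ct = 7, cf = 3 and
   diff = 4, so after the setcc the 0/1 value is mapped to 3/7 by a
   single "lea 3(,%reg,4), %reg".  */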
24275
24276 rtx tmp;
24277 int nops;
24278
24279 out = emit_store_flag (out, code, op0, op1, VOIDmode, 0, 1);
24280
24281 nops = 0;
24282 /* On x86_64 the lea instruction operates on Pmode, so we need
24283 to get the arithmetic done in the proper mode to match. */
24284 if (diff == 1)
24285 tmp = copy_rtx (out);
24286 else
24287 {
24288 rtx out1;
24289 out1 = copy_rtx (out);
24290 tmp = gen_rtx_MULT (mode, out1, GEN_INT (diff & ~1));
24291 nops++;
24292 if (diff & 1)
24293 {
24294 tmp = gen_rtx_PLUS (mode, tmp, out1);
24295 nops++;
24296 }
24297 }
24298 if (cf != 0)
24299 {
24300 tmp = gen_rtx_PLUS (mode, tmp, GEN_INT (cf));
24301 nops++;
24302 }
24303 if (!rtx_equal_p (tmp, out))
24304 {
24305 if (nops == 1)
24306 out = force_operand (tmp, copy_rtx (out));
24307 else
24308 emit_insn (gen_rtx_SET (copy_rtx (out), copy_rtx (tmp)));
24309 }
24310 if (!rtx_equal_p (out, operands[0]))
24311 emit_move_insn (operands[0], copy_rtx (out));
24312
24313 return true;
24314 }
24315
24316 /*
24317 * General case: Jumpful:
24318 * xorl dest,dest cmpl op1, op2
24319 * cmpl op1, op2 movl ct, dest
24320 * setcc dest jcc 1f
24321 * decl dest movl cf, dest
24322 * andl (cf-ct),dest 1:
24323 * addl ct,dest
24324 *
24325 * Size 20. Size 14.
24326 *
24327 * This is reasonably steep, but branch mispredict costs are
24328 * high on modern cpus, so consider failing only if optimizing
24329 * for space.
24330 */
24331
24332 if ((!TARGET_CMOVE || (mode == QImode && TARGET_PARTIAL_REG_STALL))
24333 && BRANCH_COST (optimize_insn_for_speed_p (),
24334 false) >= 2)
24335 {
24336 if (cf == 0)
24337 {
24338 machine_mode cmp_mode = GET_MODE (op0);
24339 enum rtx_code new_code;
24340
24341 if (SCALAR_FLOAT_MODE_P (cmp_mode))
24342 {
24343 gcc_assert (!DECIMAL_FLOAT_MODE_P (cmp_mode));
24344
24345 /* We may be reversing an unordered compare to a normal compare,
24346 which is not valid in general (we may convert a non-trapping
24347 condition to a trapping one); however, on i386 we currently
24348 emit all comparisons unordered. */
24349 new_code = reverse_condition_maybe_unordered (code);
24350 }
24351 else
24352 {
24353 new_code = ix86_reverse_condition (code, cmp_mode);
24354 if (compare_code != UNKNOWN && new_code != UNKNOWN)
24355 compare_code = reverse_condition (compare_code);
24356 }
24357
24358 if (new_code != UNKNOWN)
24359 {
24360 cf = ct;
24361 ct = 0;
24362 code = new_code;
24363 }
24364 }
24365
24366 if (compare_code != UNKNOWN)
24367 {
24368 /* notl op1 (if needed)
24369 sarl $31, op1
24370 andl (cf-ct), op1
24371 addl ct, op1
24372
24373 For x < 0 (resp. x <= -1) there will be no notl,
24374 so if possible swap the constants to get rid of the
24375 complement.
24376 True/false will be -1/0, while the code below (store flag
24377 followed by decrement) yields 0/-1, so the constants need
24378 to be exchanged once more. */
24379
24380 if (compare_code == GE || !cf)
24381 {
24382 code = reverse_condition (code);
24383 compare_code = LT;
24384 }
24385 else
24386 std::swap (ct, cf);
24387
24388 out = emit_store_flag (out, code, op0, op1, VOIDmode, 0, -1);
24389 }
24390 else
24391 {
24392 out = emit_store_flag (out, code, op0, op1, VOIDmode, 0, 1);
24393
24394 out = expand_simple_binop (mode, PLUS, copy_rtx (out),
24395 constm1_rtx,
24396 copy_rtx (out), 1, OPTAB_DIRECT);
24397 }
24398
24399 out = expand_simple_binop (mode, AND, copy_rtx (out),
24400 gen_int_mode (cf - ct, mode),
24401 copy_rtx (out), 1, OPTAB_DIRECT);
24402 if (ct)
24403 out = expand_simple_binop (mode, PLUS, copy_rtx (out), GEN_INT (ct),
24404 copy_rtx (out), 1, OPTAB_DIRECT);
24405 if (!rtx_equal_p (out, operands[0]))
24406 emit_move_insn (operands[0], copy_rtx (out));
24407
24408 return true;
24409 }
24410 }
24411
24412 if (!TARGET_CMOVE || (mode == QImode && TARGET_PARTIAL_REG_STALL))
24413 {
24414 /* Try a few things more with specific constants and a variable. */
24415
24416 optab op;
24417 rtx var, orig_out, out, tmp;
24418
24419 if (BRANCH_COST (optimize_insn_for_speed_p (), false) <= 2)
24420 return false;
24421
24422 /* If one of the two operands is an interesting constant (0 or -1), load
24423 a 0/-1 mask with the code above and mask the variable in with a logical operation. */
24424
24425 if (CONST_INT_P (operands[2]))
24426 {
24427 var = operands[3];
24428 if (INTVAL (operands[2]) == 0 && operands[3] != constm1_rtx)
24429 operands[3] = constm1_rtx, op = and_optab;
24430 else if (INTVAL (operands[2]) == -1 && operands[3] != const0_rtx)
24431 operands[3] = const0_rtx, op = ior_optab;
24432 else
24433 return false;
24434 }
24435 else if (CONST_INT_P (operands[3]))
24436 {
24437 var = operands[2];
24438 if (INTVAL (operands[3]) == 0 && operands[2] != constm1_rtx)
24439 operands[2] = constm1_rtx, op = and_optab;
24440 else if (INTVAL (operands[3]) == -1 && operands[2] != const0_rtx)
24441 operands[2] = const0_rtx, op = ior_optab;
24442 else
24443 return false;
24444 }
24445 else
24446 return false;
24447
24448 orig_out = operands[0];
24449 tmp = gen_reg_rtx (mode);
24450 operands[0] = tmp;
24451
24452 /* Recurse to get the constant loaded. */
24453 if (!ix86_expand_int_movcc (operands))
24454 return false;
24455
24456 /* Mask in the interesting variable. */
24457 out = expand_binop (mode, op, var, tmp, orig_out, 0,
24458 OPTAB_WIDEN);
24459 if (!rtx_equal_p (out, orig_out))
24460 emit_move_insn (copy_rtx (orig_out), copy_rtx (out));
24461
24462 return true;
24463 }
24464
24465 /*
24466 * For comparison with above,
24467 *
24468 * movl cf,dest
24469 * movl ct,tmp
24470 * cmpl op1,op2
24471 * cmovcc tmp,dest
24472 *
24473 * Size 15.
24474 */
24475
24476 if (! nonimmediate_operand (operands[2], mode))
24477 operands[2] = force_reg (mode, operands[2]);
24478 if (! nonimmediate_operand (operands[3], mode))
24479 operands[3] = force_reg (mode, operands[3]);
24480
24481 if (! register_operand (operands[2], VOIDmode)
24482 && (mode == QImode
24483 || ! register_operand (operands[3], VOIDmode)))
24484 operands[2] = force_reg (mode, operands[2]);
24485
24486 if (mode == QImode
24487 && ! register_operand (operands[3], VOIDmode))
24488 operands[3] = force_reg (mode, operands[3]);
24489
24490 emit_insn (compare_seq);
24491 emit_insn (gen_rtx_SET (operands[0],
24492 gen_rtx_IF_THEN_ELSE (mode,
24493 compare_op, operands[2],
24494 operands[3])));
24495 return true;
24496 }
24497
24498 /* Swap, force into registers, or otherwise massage the two operands
24499 to an SSE comparison with a mask result. Thus we differ a bit from
24500 ix86_prepare_fp_compare_args which expects to produce a flags result.
24501
24502 The DEST operand exists to help determine whether to commute commutative
24503 operators. The POP0/POP1 operands are updated in place. The new
24504 comparison code is returned, or UNKNOWN if not implementable. */
24505
24506 static enum rtx_code
24507 ix86_prepare_sse_fp_compare_args (rtx dest, enum rtx_code code,
24508 rtx *pop0, rtx *pop1)
24509 {
24510 switch (code)
24511 {
24512 case LTGT:
24513 case UNEQ:
24514 /* AVX supports all the needed comparisons. */
24515 if (TARGET_AVX)
24516 break;
24517 /* We have no LTGT as an operator. We could implement it with
24518 NE & ORDERED, but this requires an extra temporary. It's
24519 not clear that it's worth it. */
24520 return UNKNOWN;
24521
24522 case LT:
24523 case LE:
24524 case UNGT:
24525 case UNGE:
24526 /* These are supported directly. */
24527 break;
24528
24529 case EQ:
24530 case NE:
24531 case UNORDERED:
24532 case ORDERED:
24533 /* AVX has 3 operand comparisons, no need to swap anything. */
24534 if (TARGET_AVX)
24535 break;
24536 /* For commutative operators, try to canonicalize the destination
24537 operand to be first in the comparison - this helps reload to
24538 avoid extra moves. */
24539 if (!dest || !rtx_equal_p (dest, *pop1))
24540 break;
24541 /* FALLTHRU */
24542
24543 case GE:
24544 case GT:
24545 case UNLE:
24546 case UNLT:
24547 /* These are not supported directly before AVX, and furthermore
24548 ix86_expand_sse_fp_minmax only optimizes LT/UNGE. Swap the
24549 comparison operands to transform into something that is
24550 supported. */
24551 std::swap (*pop0, *pop1);
24552 code = swap_condition (code);
24553 break;
24554
24555 default:
24556 gcc_unreachable ();
24557 }
24558
24559 return code;
24560 }
24561
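/* The swaps above reflect the SSE cmpps/cmppd predicate set: before AVX
   only EQ, LT, LE, UNORD, NEQ, NLT (i.e. UNGE), NLE (i.e. UNGT) and ORD
   are encodable, so GE, GT, UNLE and UNLT have to be obtained by swapping
   the operands of the mirrored predicate.  */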
24562 /* Detect conditional moves that exactly match min/max operational
24563 semantics. Note that this is IEEE safe, as long as we don't
24564 interchange the operands.
24565
24566 Returns FALSE if this conditional move doesn't match a MIN/MAX,
24567 and TRUE if the operation is successful and instructions are emitted. */
24568
24569 static bool
24570 ix86_expand_sse_fp_minmax (rtx dest, enum rtx_code code, rtx cmp_op0,
24571 rtx cmp_op1, rtx if_true, rtx if_false)
24572 {
24573 machine_mode mode;
24574 bool is_min;
24575 rtx tmp;
24576
24577 if (code == LT)
24578 ;
24579 else if (code == UNGE)
24580 std::swap (if_true, if_false);
24581 else
24582 return false;
24583
24584 if (rtx_equal_p (cmp_op0, if_true) && rtx_equal_p (cmp_op1, if_false))
24585 is_min = true;
24586 else if (rtx_equal_p (cmp_op1, if_true) && rtx_equal_p (cmp_op0, if_false))
24587 is_min = false;
24588 else
24589 return false;
24590
24591 mode = GET_MODE (dest);
24592
24593 /* We want to check HONOR_NANS and HONOR_SIGNED_ZEROS here,
24594 but MODE may be a vector mode and thus not appropriate. */
24595 if (!flag_finite_math_only || flag_signed_zeros)
24596 {
24597 int u = is_min ? UNSPEC_IEEE_MIN : UNSPEC_IEEE_MAX;
24598 rtvec v;
24599
24600 if_true = force_reg (mode, if_true);
24601 v = gen_rtvec (2, if_true, if_false);
24602 tmp = gen_rtx_UNSPEC (mode, v, u);
24603 }
24604 else
24605 {
24606 code = is_min ? SMIN : SMAX;
24607 tmp = gen_rtx_fmt_ee (code, mode, if_true, if_false);
24608 }
24609
24610 emit_insn (gen_rtx_SET (dest, tmp));
24611 return true;
24612 }
24613
24614 /* Expand an SSE vector comparison. Return the register with the result. */
24615
24616 static rtx
24617 ix86_expand_sse_cmp (rtx dest, enum rtx_code code, rtx cmp_op0, rtx cmp_op1,
24618 rtx op_true, rtx op_false)
24619 {
24620 machine_mode mode = GET_MODE (dest);
24621 machine_mode cmp_ops_mode = GET_MODE (cmp_op0);
24622
24623 /* In the general case the result of the comparison can differ from the operands' type. */
24624 machine_mode cmp_mode;
24625
24626 /* In AVX512F the result of comparison is an integer mask. */
24627 bool maskcmp = false;
24628 rtx x;
24629
24630 if (GET_MODE_SIZE (cmp_ops_mode) == 64)
24631 {
24632 cmp_mode = mode_for_size (GET_MODE_NUNITS (cmp_ops_mode), MODE_INT, 0);
24633 gcc_assert (cmp_mode != BLKmode);
24634
24635 maskcmp = true;
24636 }
24637 else
24638 cmp_mode = cmp_ops_mode;
24639
24640
24641 cmp_op0 = force_reg (cmp_ops_mode, cmp_op0);
24642 if (!nonimmediate_operand (cmp_op1, cmp_ops_mode))
24643 cmp_op1 = force_reg (cmp_ops_mode, cmp_op1);
24644
24645 if (optimize
24646 || (maskcmp && cmp_mode != mode)
24647 || (op_true && reg_overlap_mentioned_p (dest, op_true))
24648 || (op_false && reg_overlap_mentioned_p (dest, op_false)))
24649 dest = gen_reg_rtx (maskcmp ? cmp_mode : mode);
24650
24651 /* Compare patterns for int modes are unspec in AVX512F only. */
24652 if (maskcmp && (code == GT || code == EQ))
24653 {
24654 rtx (*gen)(rtx, rtx, rtx);
24655
24656 switch (cmp_ops_mode)
24657 {
24658 case V64QImode:
24659 gcc_assert (TARGET_AVX512BW);
24660 gen = code == GT ? gen_avx512bw_gtv64qi3 : gen_avx512bw_eqv64qi3_1;
24661 break;
24662 case V32HImode:
24663 gcc_assert (TARGET_AVX512BW);
24664 gen = code == GT ? gen_avx512bw_gtv32hi3 : gen_avx512bw_eqv32hi3_1;
24665 break;
24666 case V16SImode:
24667 gen = code == GT ? gen_avx512f_gtv16si3 : gen_avx512f_eqv16si3_1;
24668 break;
24669 case V8DImode:
24670 gen = code == GT ? gen_avx512f_gtv8di3 : gen_avx512f_eqv8di3_1;
24671 break;
24672 default:
24673 gen = NULL;
24674 }
24675
24676 if (gen)
24677 {
24678 emit_insn (gen (dest, cmp_op0, cmp_op1));
24679 return dest;
24680 }
24681 }
24682 x = gen_rtx_fmt_ee (code, cmp_mode, cmp_op0, cmp_op1);
24683
24684 if (cmp_mode != mode && !maskcmp)
24685 {
24686 x = force_reg (cmp_ops_mode, x);
24687 convert_move (dest, x, false);
24688 }
24689 else
24690 emit_insn (gen_rtx_SET (dest, x));
24691
24692 return dest;
24693 }
24694
24695 /* Expand DEST = CMP ? OP_TRUE : OP_FALSE into a sequence of logical
24696 operations. This is used for both scalar and vector conditional moves. */
24697
24698 void
24699 ix86_expand_sse_movcc (rtx dest, rtx cmp, rtx op_true, rtx op_false)
24700 {
24701 machine_mode mode = GET_MODE (dest);
24702 machine_mode cmpmode = GET_MODE (cmp);
24703
24704 /* In AVX512F the result of comparison is an integer mask. */
24705 bool maskcmp = (mode != cmpmode && TARGET_AVX512F);
24706
24707 rtx t2, t3, x;
24708
24709 /* If we have an integer mask and an FP value then we need
24710 to cast the mask to FP mode. */
24711 if (mode != cmpmode && VECTOR_MODE_P (cmpmode))
24712 {
24713 cmp = force_reg (cmpmode, cmp);
24714 cmp = gen_rtx_SUBREG (mode, cmp, 0);
24715 }
24716
24717 if (vector_all_ones_operand (op_true, mode)
24718 && rtx_equal_p (op_false, CONST0_RTX (mode))
24719 && !maskcmp)
24720 {
24721 emit_insn (gen_rtx_SET (dest, cmp));
24722 }
24723 else if (op_false == CONST0_RTX (mode)
24724 && !maskcmp)
24725 {
24726 op_true = force_reg (mode, op_true);
24727 x = gen_rtx_AND (mode, cmp, op_true);
24728 emit_insn (gen_rtx_SET (dest, x));
24729 }
24730 else if (op_true == CONST0_RTX (mode)
24731 && !maskcmp)
24732 {
24733 op_false = force_reg (mode, op_false);
24734 x = gen_rtx_NOT (mode, cmp);
24735 x = gen_rtx_AND (mode, x, op_false);
24736 emit_insn (gen_rtx_SET (dest, x));
24737 }
24738 else if (INTEGRAL_MODE_P (mode) && op_true == CONSTM1_RTX (mode)
24739 && !maskcmp)
24740 {
24741 op_false = force_reg (mode, op_false);
24742 x = gen_rtx_IOR (mode, cmp, op_false);
24743 emit_insn (gen_rtx_SET (dest, x));
24744 }
24745 else if (TARGET_XOP
24746 && !maskcmp)
24747 {
24748 op_true = force_reg (mode, op_true);
24749
24750 if (!nonimmediate_operand (op_false, mode))
24751 op_false = force_reg (mode, op_false);
24752
24753 emit_insn (gen_rtx_SET (dest, gen_rtx_IF_THEN_ELSE (mode, cmp,
24754 op_true,
24755 op_false)));
24756 }
24757 else
24758 {
24759 rtx (*gen) (rtx, rtx, rtx, rtx) = NULL;
24760 rtx d = dest;
24761
24762 if (!nonimmediate_operand (op_true, mode))
24763 op_true = force_reg (mode, op_true);
24764
24765 op_false = force_reg (mode, op_false);
24766
24767 switch (mode)
24768 {
24769 case V4SFmode:
24770 if (TARGET_SSE4_1)
24771 gen = gen_sse4_1_blendvps;
24772 break;
24773 case V2DFmode:
24774 if (TARGET_SSE4_1)
24775 gen = gen_sse4_1_blendvpd;
24776 break;
24777 case V16QImode:
24778 case V8HImode:
24779 case V4SImode:
24780 case V2DImode:
24781 if (TARGET_SSE4_1)
24782 {
24783 gen = gen_sse4_1_pblendvb;
24784 if (mode != V16QImode)
24785 d = gen_reg_rtx (V16QImode);
24786 op_false = gen_lowpart (V16QImode, op_false);
24787 op_true = gen_lowpart (V16QImode, op_true);
24788 cmp = gen_lowpart (V16QImode, cmp);
24789 }
24790 break;
24791 case V8SFmode:
24792 if (TARGET_AVX)
24793 gen = gen_avx_blendvps256;
24794 break;
24795 case V4DFmode:
24796 if (TARGET_AVX)
24797 gen = gen_avx_blendvpd256;
24798 break;
24799 case V32QImode:
24800 case V16HImode:
24801 case V8SImode:
24802 case V4DImode:
24803 if (TARGET_AVX2)
24804 {
24805 gen = gen_avx2_pblendvb;
24806 if (mode != V32QImode)
24807 d = gen_reg_rtx (V32QImode);
24808 op_false = gen_lowpart (V32QImode, op_false);
24809 op_true = gen_lowpart (V32QImode, op_true);
24810 cmp = gen_lowpart (V32QImode, cmp);
24811 }
24812 break;
24813
24814 case V64QImode:
24815 gen = gen_avx512bw_blendmv64qi;
24816 break;
24817 case V32HImode:
24818 gen = gen_avx512bw_blendmv32hi;
24819 break;
24820 case V16SImode:
24821 gen = gen_avx512f_blendmv16si;
24822 break;
24823 case V8DImode:
24824 gen = gen_avx512f_blendmv8di;
24825 break;
24826 case V8DFmode:
24827 gen = gen_avx512f_blendmv8df;
24828 break;
24829 case V16SFmode:
24830 gen = gen_avx512f_blendmv16sf;
24831 break;
24832
24833 default:
24834 break;
24835 }
24836
24837 if (gen != NULL)
24838 {
24839 emit_insn (gen (d, op_false, op_true, cmp));
24840 if (d != dest)
24841 emit_move_insn (dest, gen_lowpart (GET_MODE (dest), d));
24842 }
24843 else
24844 {
24845 op_true = force_reg (mode, op_true);
24846
24847 t2 = gen_reg_rtx (mode);
24848 if (optimize)
24849 t3 = gen_reg_rtx (mode);
24850 else
24851 t3 = dest;
24852
24853 x = gen_rtx_AND (mode, op_true, cmp);
24854 emit_insn (gen_rtx_SET (t2, x));
24855
24856 x = gen_rtx_NOT (mode, cmp);
24857 x = gen_rtx_AND (mode, x, op_false);
24858 emit_insn (gen_rtx_SET (t3, x));
24859
24860 x = gen_rtx_IOR (mode, t3, t2);
24861 emit_insn (gen_rtx_SET (dest, x));
24862 }
24863 }
24864 }
24865
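/* When no blend instruction applies, the final else branch above falls
   back to the classic mask-select idiom

       dest = (cmp & op_true) | (~cmp & op_false)

   which is what the AND/ANDNOT/IOR sequence at the end implements.  */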
24866 /* Expand a floating-point conditional move. Return true if successful. */
24867
24868 bool
24869 ix86_expand_fp_movcc (rtx operands[])
24870 {
24871 machine_mode mode = GET_MODE (operands[0]);
24872 enum rtx_code code = GET_CODE (operands[1]);
24873 rtx tmp, compare_op;
24874 rtx op0 = XEXP (operands[1], 0);
24875 rtx op1 = XEXP (operands[1], 1);
24876
24877 if (TARGET_SSE_MATH && SSE_FLOAT_MODE_P (mode))
24878 {
24879 machine_mode cmode;
24880
24881 /* Since we have no cmove for SSE registers, don't force bad register
24882 allocation just to gain access to it. Deny the movcc when the
24883 comparison mode doesn't match the move mode. */
24884 cmode = GET_MODE (op0);
24885 if (cmode == VOIDmode)
24886 cmode = GET_MODE (op1);
24887 if (cmode != mode)
24888 return false;
24889
24890 code = ix86_prepare_sse_fp_compare_args (operands[0], code, &op0, &op1);
24891 if (code == UNKNOWN)
24892 return false;
24893
24894 if (ix86_expand_sse_fp_minmax (operands[0], code, op0, op1,
24895 operands[2], operands[3]))
24896 return true;
24897
24898 tmp = ix86_expand_sse_cmp (operands[0], code, op0, op1,
24899 operands[2], operands[3]);
24900 ix86_expand_sse_movcc (operands[0], tmp, operands[2], operands[3]);
24901 return true;
24902 }
24903
24904 if (GET_MODE (op0) == TImode
24905 || (GET_MODE (op0) == DImode
24906 && !TARGET_64BIT))
24907 return false;
24908
24909 /* The floating point conditional move instructions don't directly
24910 support conditions resulting from a signed integer comparison. */
24911
24912 compare_op = ix86_expand_compare (code, op0, op1);
24913 if (!fcmov_comparison_operator (compare_op, VOIDmode))
24914 {
24915 tmp = gen_reg_rtx (QImode);
24916 ix86_expand_setcc (tmp, code, op0, op1);
24917
24918 compare_op = ix86_expand_compare (NE, tmp, const0_rtx);
24919 }
24920
24921 emit_insn (gen_rtx_SET (operands[0],
24922 gen_rtx_IF_THEN_ELSE (mode, compare_op,
24923 operands[2], operands[3])));
24924
24925 return true;
24926 }
24927
24928 /* Helper for ix86_cmp_code_to_pcmp_immediate for int modes. */
24929
24930 static int
24931 ix86_int_cmp_code_to_pcmp_immediate (enum rtx_code code)
24932 {
24933 switch (code)
24934 {
24935 case EQ:
24936 return 0;
24937 case LT:
24938 case LTU:
24939 return 1;
24940 case LE:
24941 case LEU:
24942 return 2;
24943 case NE:
24944 return 4;
24945 case GE:
24946 case GEU:
24947 return 5;
24948 case GT:
24949 case GTU:
24950 return 6;
24951 default:
24952 gcc_unreachable ();
24953 }
24954 }
24955
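/* These values follow the AVX-512 vpcmp/vpcmpu immediate encoding:
   0 = eq, 1 = lt, 2 = le, 4 = neq, 5 = nlt (ge), 6 = nle (gt).  */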
24956 /* Helper for ix86_cmp_code_to_pcmp_immediate for fp modes. */
24957
24958 static int
24959 ix86_fp_cmp_code_to_pcmp_immediate (enum rtx_code code)
24960 {
24961 switch (code)
24962 {
24963 case EQ:
24964 return 0x00;
24965 case NE:
24966 return 0x04;
24967 case GT:
24968 return 0x0e;
24969 case LE:
24970 return 0x02;
24971 case GE:
24972 return 0x0d;
24973 case LT:
24974 return 0x01;
24975 case UNLE:
24976 return 0x0a;
24977 case UNLT:
24978 return 0x09;
24979 case UNGE:
24980 return 0x05;
24981 case UNGT:
24982 return 0x06;
24983 case UNEQ:
24984 return 0x18;
24985 case LTGT:
24986 return 0x0c;
24987 case ORDERED:
24988 return 0x07;
24989 case UNORDERED:
24990 return 0x03;
24991 default:
24992 gcc_unreachable ();
24993 }
24994 }
24995
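/* These are the vcmpps/vcmppd predicate immediates; for example 0x0e is
   GT_OS, 0x0d is GE_OS, 0x0c is NEQ_OQ (the LTGT case) and 0x18 is EQ_US
   (the UNEQ case).  */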
24996 /* Return immediate value to be used in UNSPEC_PCMP
24997 for comparison CODE in MODE. */
24998
24999 static int
25000 ix86_cmp_code_to_pcmp_immediate (enum rtx_code code, machine_mode mode)
25001 {
25002 if (FLOAT_MODE_P (mode))
25003 return ix86_fp_cmp_code_to_pcmp_immediate (code);
25004 return ix86_int_cmp_code_to_pcmp_immediate (code);
25005 }
25006
25007 /* Expand AVX-512 vector comparison. */
25008
25009 bool
25010 ix86_expand_mask_vec_cmp (rtx operands[])
25011 {
25012 machine_mode mask_mode = GET_MODE (operands[0]);
25013 machine_mode cmp_mode = GET_MODE (operands[2]);
25014 enum rtx_code code = GET_CODE (operands[1]);
25015 rtx imm = GEN_INT (ix86_cmp_code_to_pcmp_immediate (code, cmp_mode));
25016 int unspec_code;
25017 rtx unspec;
25018
25019 switch (code)
25020 {
25021 case LEU:
25022 case GTU:
25023 case GEU:
25024 case LTU:
25025 unspec_code = UNSPEC_UNSIGNED_PCMP;
25026 break;
25027
25028 default:
25029 unspec_code = UNSPEC_PCMP;
25030 }
25031
25032 unspec = gen_rtx_UNSPEC (mask_mode, gen_rtvec (3, operands[2],
25033 operands[3], imm),
25034 unspec_code);
25035 emit_insn (gen_rtx_SET (operands[0], unspec));
25036
25037 return true;
25038 }
25039
25040 /* Expand fp vector comparison. */
25041
25042 bool
25043 ix86_expand_fp_vec_cmp (rtx operands[])
25044 {
25045 enum rtx_code code = GET_CODE (operands[1]);
25046 rtx cmp;
25047
25048 code = ix86_prepare_sse_fp_compare_args (operands[0], code,
25049 &operands[2], &operands[3]);
25050 if (code == UNKNOWN)
25051 {
25052 rtx temp;
25053 switch (GET_CODE (operands[1]))
25054 {
25055 case LTGT:
25056 temp = ix86_expand_sse_cmp (operands[0], ORDERED, operands[2],
25057 operands[3], NULL, NULL);
25058 cmp = ix86_expand_sse_cmp (operands[0], NE, operands[2],
25059 operands[3], NULL, NULL);
25060 code = AND;
25061 break;
25062 case UNEQ:
25063 temp = ix86_expand_sse_cmp (operands[0], UNORDERED, operands[2],
25064 operands[3], NULL, NULL);
25065 cmp = ix86_expand_sse_cmp (operands[0], EQ, operands[2],
25066 operands[3], NULL, NULL);
25067 code = IOR;
25068 break;
25069 default:
25070 gcc_unreachable ();
25071 }
25072 cmp = expand_simple_binop (GET_MODE (cmp), code, temp, cmp, cmp, 1,
25073 OPTAB_DIRECT);
25074 }
25075 else
25076 cmp = ix86_expand_sse_cmp (operands[0], code, operands[2], operands[3],
25077 operands[1], operands[2]);
25078
25079 if (operands[0] != cmp)
25080 emit_move_insn (operands[0], cmp);
25081
25082 return true;
25083 }
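/* A brief note on the UNKNOWN fall-back above: it relies on two boolean
   identities.  LTGT (a, b) is true iff neither operand is a NaN and
   a != b, hence ORDERED (a, b) & NE (a, b); UNEQ (a, b) is true iff the
   operands are unordered or equal, hence UNORDERED (a, b) | EQ (a, b).
   Each half is a single cmpps/cmppd mask and the AND/IOR merges them.  */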
25084
25085 static rtx
25086 ix86_expand_int_sse_cmp (rtx dest, enum rtx_code code, rtx cop0, rtx cop1,
25087 rtx op_true, rtx op_false, bool *negate)
25088 {
25089 machine_mode data_mode = GET_MODE (dest);
25090 machine_mode mode = GET_MODE (cop0);
25091 rtx x;
25092
25093 *negate = false;
25094
25095 /* XOP supports all of the comparisons on all 128-bit vector int types. */
25096 if (TARGET_XOP
25097 && (mode == V16QImode || mode == V8HImode
25098 || mode == V4SImode || mode == V2DImode))
25099 ;
25100 else
25101 {
25102 /* Canonicalize the comparison to EQ, GT, GTU. */
25103 switch (code)
25104 {
25105 case EQ:
25106 case GT:
25107 case GTU:
25108 break;
25109
25110 case NE:
25111 case LE:
25112 case LEU:
25113 code = reverse_condition (code);
25114 *negate = true;
25115 break;
25116
25117 case GE:
25118 case GEU:
25119 code = reverse_condition (code);
25120 *negate = true;
25121 /* FALLTHRU */
25122
25123 case LT:
25124 case LTU:
25125 std::swap (cop0, cop1);
25126 code = swap_condition (code);
25127 break;
25128
25129 default:
25130 gcc_unreachable ();
25131 }
25132
25133 /* Only SSE4.1/SSE4.2 supports V2DImode. */
25134 if (mode == V2DImode)
25135 {
25136 switch (code)
25137 {
25138 case EQ:
25139 /* SSE4.1 supports EQ. */
25140 if (!TARGET_SSE4_1)
25141 return NULL;
25142 break;
25143
25144 case GT:
25145 case GTU:
25146 /* SSE4.2 supports GT/GTU. */
25147 if (!TARGET_SSE4_2)
25148 return NULL;
25149 break;
25150
25151 default:
25152 gcc_unreachable ();
25153 }
25154 }
25155
25156 /* Unsigned parallel compare is not supported by the hardware.
25157 Play some tricks to turn this into a signed comparison
25158 against 0. */
25159 if (code == GTU)
25160 {
25161 cop0 = force_reg (mode, cop0);
25162
25163 switch (mode)
25164 {
25165 case V16SImode:
25166 case V8DImode:
25167 case V8SImode:
25168 case V4DImode:
25169 case V4SImode:
25170 case V2DImode:
25171 {
25172 rtx t1, t2, mask;
25173 rtx (*gen_sub3) (rtx, rtx, rtx);
25174
25175 switch (mode)
25176 {
25177 case V16SImode: gen_sub3 = gen_subv16si3; break;
25178 case V8DImode: gen_sub3 = gen_subv8di3; break;
25179 case V8SImode: gen_sub3 = gen_subv8si3; break;
25180 case V4DImode: gen_sub3 = gen_subv4di3; break;
25181 case V4SImode: gen_sub3 = gen_subv4si3; break;
25182 case V2DImode: gen_sub3 = gen_subv2di3; break;
25183 default:
25184 gcc_unreachable ();
25185 }
25186 /* Subtract (-(INT MAX) - 1) from both operands to make
25187 them signed. */
25188 mask = ix86_build_signbit_mask (mode, true, false);
25189 t1 = gen_reg_rtx (mode);
25190 emit_insn (gen_sub3 (t1, cop0, mask));
25191
25192 t2 = gen_reg_rtx (mode);
25193 emit_insn (gen_sub3 (t2, cop1, mask));
25194
25195 cop0 = t1;
25196 cop1 = t2;
25197 code = GT;
25198 }
25199 break;
25200
25201 case V64QImode:
25202 case V32HImode:
25203 case V32QImode:
25204 case V16HImode:
25205 case V16QImode:
25206 case V8HImode:
25207 /* Perform a parallel unsigned saturating subtraction. */
25208 x = gen_reg_rtx (mode);
25209 emit_insn (gen_rtx_SET (x, gen_rtx_US_MINUS (mode, cop0,
25210 cop1)));
25211
25212 cop0 = x;
25213 cop1 = CONST0_RTX (mode);
25214 code = EQ;
25215 *negate = !*negate;
25216 break;
25217
25218 default:
25219 gcc_unreachable ();
25220 }
25221 }
25222 }
25223
25224 if (*negate)
25225 std::swap (op_true, op_false);
25226
25227 /* Allow the comparison to be done in one mode, but the movcc to
25228 happen in another mode. */
25229 if (data_mode == mode)
25230 {
25231 x = ix86_expand_sse_cmp (dest, code, cop0, cop1,
25232 op_true, op_false);
25233 }
25234 else
25235 {
25236 gcc_assert (GET_MODE_SIZE (data_mode) == GET_MODE_SIZE (mode));
25237 x = ix86_expand_sse_cmp (gen_reg_rtx (mode), code, cop0, cop1,
25238 op_true, op_false);
25239 if (GET_MODE (x) == mode)
25240 x = gen_lowpart (data_mode, x);
25241 }
25242
25243 return x;
25244 }
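/* Worked example of the GTU tricks above (illustration only).  Biasing:
   with 8-bit elements, 0xff >u 0x01 holds; after subtracting 0x80 from
   both operands we compare 0x7f (+127) >s 0x81 (-127), which also
   holds, so a signed pcmpgt gives the unsigned result.  Saturating
   subtraction: for QI/HI elements, a >u b  <=>  (a -us b) != 0;
   e.g. 5 -us 9 = 0, so 5 >u 9 is false, while 9 -us 5 = 4 != 0, so
   9 >u 5 is true.  The EQ-against-zero compare combined with the
   *negate flag implements the "!= 0" part.  */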
25245
25246 /* Expand integer vector comparison. */
25247
25248 bool
25249 ix86_expand_int_vec_cmp (rtx operands[])
25250 {
25251 rtx_code code = GET_CODE (operands[1]);
25252 bool negate = false;
25253 rtx cmp = ix86_expand_int_sse_cmp (operands[0], code, operands[2],
25254 operands[3], NULL, NULL, &negate);
25255
25256 if (!cmp)
25257 return false;
25258
25259 if (negate)
25260 cmp = ix86_expand_int_sse_cmp (operands[0], EQ, cmp,
25261 CONST0_RTX (GET_MODE (cmp)),
25262 NULL, NULL, &negate);
25263
25264 gcc_assert (!negate);
25265
25266 if (operands[0] != cmp)
25267 emit_move_insn (operands[0], cmp);
25268
25269 return true;
25270 }
25271
25272 /* Expand a floating-point vector conditional move; a vcond operation
25273 rather than a movcc operation. */
25274
25275 bool
25276 ix86_expand_fp_vcond (rtx operands[])
25277 {
25278 enum rtx_code code = GET_CODE (operands[3]);
25279 rtx cmp;
25280
25281 code = ix86_prepare_sse_fp_compare_args (operands[0], code,
25282 &operands[4], &operands[5]);
25283 if (code == UNKNOWN)
25284 {
25285 rtx temp;
25286 switch (GET_CODE (operands[3]))
25287 {
25288 case LTGT:
25289 temp = ix86_expand_sse_cmp (operands[0], ORDERED, operands[4],
25290 operands[5], operands[0], operands[0]);
25291 cmp = ix86_expand_sse_cmp (operands[0], NE, operands[4],
25292 operands[5], operands[1], operands[2]);
25293 code = AND;
25294 break;
25295 case UNEQ:
25296 temp = ix86_expand_sse_cmp (operands[0], UNORDERED, operands[4],
25297 operands[5], operands[0], operands[0]);
25298 cmp = ix86_expand_sse_cmp (operands[0], EQ, operands[4],
25299 operands[5], operands[1], operands[2]);
25300 code = IOR;
25301 break;
25302 default:
25303 gcc_unreachable ();
25304 }
25305 cmp = expand_simple_binop (GET_MODE (cmp), code, temp, cmp, cmp, 1,
25306 OPTAB_DIRECT);
25307 ix86_expand_sse_movcc (operands[0], cmp, operands[1], operands[2]);
25308 return true;
25309 }
25310
25311 if (ix86_expand_sse_fp_minmax (operands[0], code, operands[4],
25312 operands[5], operands[1], operands[2]))
25313 return true;
25314
25315 cmp = ix86_expand_sse_cmp (operands[0], code, operands[4], operands[5],
25316 operands[1], operands[2]);
25317 ix86_expand_sse_movcc (operands[0], cmp, operands[1], operands[2]);
25318 return true;
25319 }
25320
25321 /* Expand a signed/unsigned integral vector conditional move. */
25322
25323 bool
25324 ix86_expand_int_vcond (rtx operands[])
25325 {
25326 machine_mode data_mode = GET_MODE (operands[0]);
25327 machine_mode mode = GET_MODE (operands[4]);
25328 enum rtx_code code = GET_CODE (operands[3]);
25329 bool negate = false;
25330 rtx x, cop0, cop1;
25331
25332 cop0 = operands[4];
25333 cop1 = operands[5];
25334
25335 /* Try to optimize x < 0 ? -1 : 0 into (signed) x >> 31
25336 and x < 0 ? 1 : 0 into (unsigned) x >> 31. */
25337 if ((code == LT || code == GE)
25338 && data_mode == mode
25339 && cop1 == CONST0_RTX (mode)
25340 && operands[1 + (code == LT)] == CONST0_RTX (data_mode)
25341 && GET_MODE_UNIT_SIZE (data_mode) > 1
25342 && GET_MODE_UNIT_SIZE (data_mode) <= 8
25343 && (GET_MODE_SIZE (data_mode) == 16
25344 || (TARGET_AVX2 && GET_MODE_SIZE (data_mode) == 32)))
25345 {
25346 rtx negop = operands[2 - (code == LT)];
25347 int shift = GET_MODE_UNIT_BITSIZE (data_mode) - 1;
25348 if (negop == CONST1_RTX (data_mode))
25349 {
25350 rtx res = expand_simple_binop (mode, LSHIFTRT, cop0, GEN_INT (shift),
25351 operands[0], 1, OPTAB_DIRECT);
25352 if (res != operands[0])
25353 emit_move_insn (operands[0], res);
25354 return true;
25355 }
25356 else if (GET_MODE_INNER (data_mode) != DImode
25357 && vector_all_ones_operand (negop, data_mode))
25358 {
25359 rtx res = expand_simple_binop (mode, ASHIFTRT, cop0, GEN_INT (shift),
25360 operands[0], 0, OPTAB_DIRECT);
25361 if (res != operands[0])
25362 emit_move_insn (operands[0], res);
25363 return true;
25364 }
25365 }
25366
25367 if (!nonimmediate_operand (cop1, mode))
25368 cop1 = force_reg (mode, cop1);
25369 if (!general_operand (operands[1], data_mode))
25370 operands[1] = force_reg (data_mode, operands[1]);
25371 if (!general_operand (operands[2], data_mode))
25372 operands[2] = force_reg (data_mode, operands[2]);
25373
25374 x = ix86_expand_int_sse_cmp (operands[0], code, cop0, cop1,
25375 operands[1], operands[2], &negate);
25376
25377 if (!x)
25378 return false;
25379
25380 ix86_expand_sse_movcc (operands[0], x, operands[1+negate],
25381 operands[2-negate]);
25382 return true;
25383 }
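/* Worked example of the shift special case above (illustration only):
   for V4SImode, x < 0 ? -1 : 0 is just an arithmetic shift, x >> 31,
   since the sign bit is replicated into every bit position, while
   x < 0 ? 1 : 0 is a logical shift, (unsigned) x >> 31.  For the
   element -5 (0xfffffffb) the arithmetic shift yields 0xffffffff (-1)
   and the logical shift yields 1; for the element 7 both yield 0.  */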
25384
25385 /* AVX512F supports 64-byte integer vector operations,
25386 thus the longest vector we are faced with is V64QImode. */
25387 #define MAX_VECT_LEN 64
25388
25389 struct expand_vec_perm_d
25390 {
25391 rtx target, op0, op1;
25392 unsigned char perm[MAX_VECT_LEN];
25393 machine_mode vmode;
25394 unsigned char nelt;
25395 bool one_operand_p;
25396 bool testing_p;
25397 };
25398
25399 static bool
25400 ix86_expand_vec_perm_vpermi2 (rtx target, rtx op0, rtx mask, rtx op1,
25401 struct expand_vec_perm_d *d)
25402 {
25403 /* ix86_expand_vec_perm_vpermi2 is called from both const and non-const
25404 expander, so args are either in d, or in op0, op1 etc. */
25405 machine_mode mode = GET_MODE (d ? d->op0 : op0);
25406 machine_mode maskmode = mode;
25407 rtx (*gen) (rtx, rtx, rtx, rtx) = NULL;
25408
25409 switch (mode)
25410 {
25411 case V8HImode:
25412 if (TARGET_AVX512VL && TARGET_AVX512BW)
25413 gen = gen_avx512vl_vpermi2varv8hi3;
25414 break;
25415 case V16HImode:
25416 if (TARGET_AVX512VL && TARGET_AVX512BW)
25417 gen = gen_avx512vl_vpermi2varv16hi3;
25418 break;
25419 case V64QImode:
25420 if (TARGET_AVX512VBMI)
25421 gen = gen_avx512bw_vpermi2varv64qi3;
25422 break;
25423 case V32HImode:
25424 if (TARGET_AVX512BW)
25425 gen = gen_avx512bw_vpermi2varv32hi3;
25426 break;
25427 case V4SImode:
25428 if (TARGET_AVX512VL)
25429 gen = gen_avx512vl_vpermi2varv4si3;
25430 break;
25431 case V8SImode:
25432 if (TARGET_AVX512VL)
25433 gen = gen_avx512vl_vpermi2varv8si3;
25434 break;
25435 case V16SImode:
25436 if (TARGET_AVX512F)
25437 gen = gen_avx512f_vpermi2varv16si3;
25438 break;
25439 case V4SFmode:
25440 if (TARGET_AVX512VL)
25441 {
25442 gen = gen_avx512vl_vpermi2varv4sf3;
25443 maskmode = V4SImode;
25444 }
25445 break;
25446 case V8SFmode:
25447 if (TARGET_AVX512VL)
25448 {
25449 gen = gen_avx512vl_vpermi2varv8sf3;
25450 maskmode = V8SImode;
25451 }
25452 break;
25453 case V16SFmode:
25454 if (TARGET_AVX512F)
25455 {
25456 gen = gen_avx512f_vpermi2varv16sf3;
25457 maskmode = V16SImode;
25458 }
25459 break;
25460 case V2DImode:
25461 if (TARGET_AVX512VL)
25462 gen = gen_avx512vl_vpermi2varv2di3;
25463 break;
25464 case V4DImode:
25465 if (TARGET_AVX512VL)
25466 gen = gen_avx512vl_vpermi2varv4di3;
25467 break;
25468 case V8DImode:
25469 if (TARGET_AVX512F)
25470 gen = gen_avx512f_vpermi2varv8di3;
25471 break;
25472 case V2DFmode:
25473 if (TARGET_AVX512VL)
25474 {
25475 gen = gen_avx512vl_vpermi2varv2df3;
25476 maskmode = V2DImode;
25477 }
25478 break;
25479 case V4DFmode:
25480 if (TARGET_AVX512VL)
25481 {
25482 gen = gen_avx512vl_vpermi2varv4df3;
25483 maskmode = V4DImode;
25484 }
25485 break;
25486 case V8DFmode:
25487 if (TARGET_AVX512F)
25488 {
25489 gen = gen_avx512f_vpermi2varv8df3;
25490 maskmode = V8DImode;
25491 }
25492 break;
25493 default:
25494 break;
25495 }
25496
25497 if (gen == NULL)
25498 return false;
25499
25500 /* ix86_expand_vec_perm_vpermi2 is called from both const and non-const
25501 expander, so args are either in d, or in op0, op1 etc. */
25502 if (d)
25503 {
25504 rtx vec[64];
25505 target = d->target;
25506 op0 = d->op0;
25507 op1 = d->op1;
25508 for (int i = 0; i < d->nelt; ++i)
25509 vec[i] = GEN_INT (d->perm[i]);
25510 mask = gen_rtx_CONST_VECTOR (maskmode, gen_rtvec_v (d->nelt, vec));
25511 }
25512
25513 emit_insn (gen (target, op0, force_reg (maskmode, mask), op1));
25514 return true;
25515 }
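/* Rough semantic sketch of the vpermi2var patterns selected above (not
   a definitive description of every variant): each result element i is
   taken from the concatenation of the two source vectors, indexed by
   the low bits of mask element i, with the next index bit selecting
   between op0 and op1.  E.g. for V16SImode, index bits [3:0] pick the
   element and bit 4 picks the source register, so indices 0..31 address
   all 32 input dwords.  */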
25516
25517 /* Expand a variable vector permutation. */
25518
25519 void
25520 ix86_expand_vec_perm (rtx operands[])
25521 {
25522 rtx target = operands[0];
25523 rtx op0 = operands[1];
25524 rtx op1 = operands[2];
25525 rtx mask = operands[3];
25526 rtx t1, t2, t3, t4, t5, t6, t7, t8, vt, vt2, vec[32];
25527 machine_mode mode = GET_MODE (op0);
25528 machine_mode maskmode = GET_MODE (mask);
25529 int w, e, i;
25530 bool one_operand_shuffle = rtx_equal_p (op0, op1);
25531
25532 /* Number of elements in the vector. */
25533 w = GET_MODE_NUNITS (mode);
25534 e = GET_MODE_UNIT_SIZE (mode);
25535 gcc_assert (w <= 64);
25536
25537 if (TARGET_AVX512F && one_operand_shuffle)
25538 {
25539 rtx (*gen) (rtx, rtx, rtx) = NULL;
25540 switch (mode)
25541 {
25542 case V16SImode:
25543 gen = gen_avx512f_permvarv16si;
25544 break;
25545 case V16SFmode:
25546 gen = gen_avx512f_permvarv16sf;
25547 break;
25548 case V8DImode:
25549 gen = gen_avx512f_permvarv8di;
25550 break;
25551 case V8DFmode:
25552 gen = gen_avx512f_permvarv8df;
25553 break;
25554 default:
25555 break;
25556 }
25557 if (gen != NULL)
25558 {
25559 emit_insn (gen (target, op0, mask));
25560 return;
25561 }
25562 }
25563
25564 if (ix86_expand_vec_perm_vpermi2 (target, op0, mask, op1, NULL))
25565 return;
25566
25567 if (TARGET_AVX2)
25568 {
25569 if (mode == V4DImode || mode == V4DFmode || mode == V16HImode)
25570 {
25571 /* Unfortunately, the VPERMQ and VPERMPD instructions only support
25572 a constant shuffle operand. With a tiny bit of effort we can
25573 use VPERMD instead. A re-interpretation stall for V4DFmode is
25574 unfortunate but there's no avoiding it.
25575 Similarly for V16HImode we don't have instructions for variable
25576 shuffling, while for V32QImode, after preparing suitable masks,
25577 we can use vpshufb; vpshufb; vpermq; vpor. */
25578
25579 if (mode == V16HImode)
25580 {
25581 maskmode = mode = V32QImode;
25582 w = 32;
25583 e = 1;
25584 }
25585 else
25586 {
25587 maskmode = mode = V8SImode;
25588 w = 8;
25589 e = 4;
25590 }
25591 t1 = gen_reg_rtx (maskmode);
25592
25593 /* Replicate the low bits of the V4DImode mask into V8SImode:
25594 mask = { A B C D }
25595 t1 = { A A B B C C D D }. */
25596 for (i = 0; i < w / 2; ++i)
25597 vec[i*2 + 1] = vec[i*2] = GEN_INT (i * 2);
25598 vt = gen_rtx_CONST_VECTOR (maskmode, gen_rtvec_v (w, vec));
25599 vt = force_reg (maskmode, vt);
25600 mask = gen_lowpart (maskmode, mask);
25601 if (maskmode == V8SImode)
25602 emit_insn (gen_avx2_permvarv8si (t1, mask, vt));
25603 else
25604 emit_insn (gen_avx2_pshufbv32qi3 (t1, mask, vt));
25605
25606 /* Multiply the shuffle indices by two. */
25607 t1 = expand_simple_binop (maskmode, PLUS, t1, t1, t1, 1,
25608 OPTAB_DIRECT);
25609
25610 /* Add one to the odd shuffle indices:
25611 t1 = { A*2, A*2+1, B*2, B*2+1, ... }. */
25612 for (i = 0; i < w / 2; ++i)
25613 {
25614 vec[i * 2] = const0_rtx;
25615 vec[i * 2 + 1] = const1_rtx;
25616 }
25617 vt = gen_rtx_CONST_VECTOR (maskmode, gen_rtvec_v (w, vec));
25618 vt = validize_mem (force_const_mem (maskmode, vt));
25619 t1 = expand_simple_binop (maskmode, PLUS, t1, vt, t1, 1,
25620 OPTAB_DIRECT);
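/* Concrete example of the index widening (illustration only): a V4DI
   mask { 3 0 2 1 } selecting 64-bit elements becomes, after the
   replicate / double / add-one steps above, the V8SI mask
   { 6 7 0 1 4 5 2 3 }, since 64-bit element i occupies 32-bit
   elements 2*i and 2*i+1.  */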
25621
25622 /* Continue as if V8SImode (resp. V32QImode) was used initially. */
25623 operands[3] = mask = t1;
25624 target = gen_reg_rtx (mode);
25625 op0 = gen_lowpart (mode, op0);
25626 op1 = gen_lowpart (mode, op1);
25627 }
25628
25629 switch (mode)
25630 {
25631 case V8SImode:
25632 /* The VPERMD and VPERMPS instructions already properly ignore
25633 the high bits of the shuffle elements. No need for us to
25634 perform an AND ourselves. */
25635 if (one_operand_shuffle)
25636 {
25637 emit_insn (gen_avx2_permvarv8si (target, op0, mask));
25638 if (target != operands[0])
25639 emit_move_insn (operands[0],
25640 gen_lowpart (GET_MODE (operands[0]), target));
25641 }
25642 else
25643 {
25644 t1 = gen_reg_rtx (V8SImode);
25645 t2 = gen_reg_rtx (V8SImode);
25646 emit_insn (gen_avx2_permvarv8si (t1, op0, mask));
25647 emit_insn (gen_avx2_permvarv8si (t2, op1, mask));
25648 goto merge_two;
25649 }
25650 return;
25651
25652 case V8SFmode:
25653 mask = gen_lowpart (V8SImode, mask);
25654 if (one_operand_shuffle)
25655 emit_insn (gen_avx2_permvarv8sf (target, op0, mask));
25656 else
25657 {
25658 t1 = gen_reg_rtx (V8SFmode);
25659 t2 = gen_reg_rtx (V8SFmode);
25660 emit_insn (gen_avx2_permvarv8sf (t1, op0, mask));
25661 emit_insn (gen_avx2_permvarv8sf (t2, op1, mask));
25662 goto merge_two;
25663 }
25664 return;
25665
25666 case V4SImode:
25667 /* By combining the two 128-bit input vectors into one 256-bit
25668 input vector, we can use VPERMD and VPERMPS for the full
25669 two-operand shuffle. */
25670 t1 = gen_reg_rtx (V8SImode);
25671 t2 = gen_reg_rtx (V8SImode);
25672 emit_insn (gen_avx_vec_concatv8si (t1, op0, op1));
25673 emit_insn (gen_avx_vec_concatv8si (t2, mask, mask));
25674 emit_insn (gen_avx2_permvarv8si (t1, t1, t2));
25675 emit_insn (gen_avx_vextractf128v8si (target, t1, const0_rtx));
25676 return;
25677
25678 case V4SFmode:
25679 t1 = gen_reg_rtx (V8SFmode);
25680 t2 = gen_reg_rtx (V8SImode);
25681 mask = gen_lowpart (V4SImode, mask);
25682 emit_insn (gen_avx_vec_concatv8sf (t1, op0, op1));
25683 emit_insn (gen_avx_vec_concatv8si (t2, mask, mask));
25684 emit_insn (gen_avx2_permvarv8sf (t1, t1, t2));
25685 emit_insn (gen_avx_vextractf128v8sf (target, t1, const0_rtx));
25686 return;
25687
25688 case V32QImode:
25689 t1 = gen_reg_rtx (V32QImode);
25690 t2 = gen_reg_rtx (V32QImode);
25691 t3 = gen_reg_rtx (V32QImode);
25692 vt2 = GEN_INT (-128);
25693 for (i = 0; i < 32; i++)
25694 vec[i] = vt2;
25695 vt = gen_rtx_CONST_VECTOR (V32QImode, gen_rtvec_v (32, vec));
25696 vt = force_reg (V32QImode, vt);
25697 for (i = 0; i < 32; i++)
25698 vec[i] = i < 16 ? vt2 : const0_rtx;
25699 vt2 = gen_rtx_CONST_VECTOR (V32QImode, gen_rtvec_v (32, vec));
25700 vt2 = force_reg (V32QImode, vt2);
25701 /* From mask create two adjusted masks, which contain the same
25702 bits as mask in the low 7 bits of each vector element.
25703 The first mask will have the most significant bit clear
25704 if it requests element from the same 128-bit lane
25705 and MSB set if it requests element from the other 128-bit lane.
25706 The second mask will have the opposite values of the MSB,
25707 and additionally will have its 128-bit lanes swapped.
25708 E.g. { 07 12 1e 09 ... | 17 19 05 1f ... } mask vector will have
25709 t1 { 07 92 9e 09 ... | 17 19 85 1f ... } and
25710 t3 { 97 99 05 9f ... | 87 12 1e 89 ... } where each ...
25711 stands for other 12 bytes. */
25712 /* The bit whether element is from the same lane or the other
25713 lane is bit 4, so shift it up by 3 to the MSB position. */
25714 t5 = gen_reg_rtx (V4DImode);
25715 emit_insn (gen_ashlv4di3 (t5, gen_lowpart (V4DImode, mask),
25716 GEN_INT (3)));
25717 /* Clear MSB bits from the mask just in case it had them set. */
25718 emit_insn (gen_avx2_andnotv32qi3 (t2, vt, mask));
25719 /* After this t1 will have MSB set for elements from other lane. */
25720 emit_insn (gen_xorv32qi3 (t1, gen_lowpart (V32QImode, t5), vt2));
25721 /* Clear bits other than MSB. */
25722 emit_insn (gen_andv32qi3 (t1, t1, vt));
25723 /* Or in the lower bits from mask into t3. */
25724 emit_insn (gen_iorv32qi3 (t3, t1, t2));
25725 /* And invert MSB bits in t1, so MSB is set for elements from the same
25726 lane. */
25727 emit_insn (gen_xorv32qi3 (t1, t1, vt));
25728 /* Swap 128-bit lanes in t3. */
25729 t6 = gen_reg_rtx (V4DImode);
25730 emit_insn (gen_avx2_permv4di_1 (t6, gen_lowpart (V4DImode, t3),
25731 const2_rtx, GEN_INT (3),
25732 const0_rtx, const1_rtx));
25733 /* And or in the lower bits from mask into t1. */
25734 emit_insn (gen_iorv32qi3 (t1, t1, t2));
25735 if (one_operand_shuffle)
25736 {
25737 /* Each of these shuffles will put 0s in places where
25738 element from the other 128-bit lane is needed, otherwise
25739 will shuffle in the requested value. */
25740 emit_insn (gen_avx2_pshufbv32qi3 (t3, op0,
25741 gen_lowpart (V32QImode, t6)));
25742 emit_insn (gen_avx2_pshufbv32qi3 (t1, op0, t1));
25743 /* For t3 the 128-bit lanes are swapped again. */
25744 t7 = gen_reg_rtx (V4DImode);
25745 emit_insn (gen_avx2_permv4di_1 (t7, gen_lowpart (V4DImode, t3),
25746 const2_rtx, GEN_INT (3),
25747 const0_rtx, const1_rtx));
25748 /* And oring both together leads to the result. */
25749 emit_insn (gen_iorv32qi3 (target, t1,
25750 gen_lowpart (V32QImode, t7)));
25751 if (target != operands[0])
25752 emit_move_insn (operands[0],
25753 gen_lowpart (GET_MODE (operands[0]), target));
25754 return;
25755 }
25756
25757 t4 = gen_reg_rtx (V32QImode);
25758 /* Similar to the one_operand_shuffle code above, just repeated
25759 twice, once for each operand. The merge_two: code below will
25760 merge the two results together. */
25761 emit_insn (gen_avx2_pshufbv32qi3 (t4, op0,
25762 gen_lowpart (V32QImode, t6)));
25763 emit_insn (gen_avx2_pshufbv32qi3 (t3, op1,
25764 gen_lowpart (V32QImode, t6)));
25765 emit_insn (gen_avx2_pshufbv32qi3 (t2, op0, t1));
25766 emit_insn (gen_avx2_pshufbv32qi3 (t1, op1, t1));
25767 t7 = gen_reg_rtx (V4DImode);
25768 emit_insn (gen_avx2_permv4di_1 (t7, gen_lowpart (V4DImode, t4),
25769 const2_rtx, GEN_INT (3),
25770 const0_rtx, const1_rtx));
25771 t8 = gen_reg_rtx (V4DImode);
25772 emit_insn (gen_avx2_permv4di_1 (t8, gen_lowpart (V4DImode, t3),
25773 const2_rtx, GEN_INT (3),
25774 const0_rtx, const1_rtx));
25775 emit_insn (gen_iorv32qi3 (t4, t2, gen_lowpart (V32QImode, t7)));
25776 emit_insn (gen_iorv32qi3 (t3, t1, gen_lowpart (V32QImode, t8)));
25777 t1 = t4;
25778 t2 = t3;
25779 goto merge_two;
25780
25781 default:
25782 gcc_assert (GET_MODE_SIZE (mode) <= 16);
25783 break;
25784 }
25785 }
25786
25787 if (TARGET_XOP)
25788 {
25789 /* The XOP VPPERM insn supports three inputs. By ignoring the
25790 one_operand_shuffle special case, we avoid creating another
25791 set of constant vectors in memory. */
25792 one_operand_shuffle = false;
25793
25794 /* mask = mask & {2*w-1, ...} */
25795 vt = GEN_INT (2*w - 1);
25796 }
25797 else
25798 {
25799 /* mask = mask & {w-1, ...} */
25800 vt = GEN_INT (w - 1);
25801 }
25802
25803 for (i = 0; i < w; i++)
25804 vec[i] = vt;
25805 vt = gen_rtx_CONST_VECTOR (maskmode, gen_rtvec_v (w, vec));
25806 mask = expand_simple_binop (maskmode, AND, mask, vt,
25807 NULL_RTX, 0, OPTAB_DIRECT);
25808
25809 /* For non-QImode operations, convert the word permutation control
25810 into a byte permutation control. */
25811 if (mode != V16QImode)
25812 {
25813 mask = expand_simple_binop (maskmode, ASHIFT, mask,
25814 GEN_INT (exact_log2 (e)),
25815 NULL_RTX, 0, OPTAB_DIRECT);
25816
25817 /* Convert mask to vector of chars. */
25818 mask = force_reg (V16QImode, gen_lowpart (V16QImode, mask));
25819
25820 /* Replicate each of the input bytes into byte positions:
25821 (v2di) --> {0,0,0,0,0,0,0,0, 8,8,8,8,8,8,8,8}
25822 (v4si) --> {0,0,0,0, 4,4,4,4, 8,8,8,8, 12,12,12,12}
25823 (v8hi) --> {0,0, 2,2, 4,4, 6,6, ...}. */
25824 for (i = 0; i < 16; ++i)
25825 vec[i] = GEN_INT (i/e * e);
25826 vt = gen_rtx_CONST_VECTOR (V16QImode, gen_rtvec_v (16, vec));
25827 vt = validize_mem (force_const_mem (V16QImode, vt));
25828 if (TARGET_XOP)
25829 emit_insn (gen_xop_pperm (mask, mask, mask, vt));
25830 else
25831 emit_insn (gen_ssse3_pshufbv16qi3 (mask, mask, vt));
25832
25833 /* Convert it into the byte positions by doing
25834 mask = mask + {0,1,..,16/w, 0,1,..,16/w, ...} */
25835 for (i = 0; i < 16; ++i)
25836 vec[i] = GEN_INT (i % e);
25837 vt = gen_rtx_CONST_VECTOR (V16QImode, gen_rtvec_v (16, vec));
25838 vt = validize_mem (force_const_mem (V16QImode, vt));
25839 emit_insn (gen_addv16qi3 (mask, mask, vt));
25840 }
25841
25842 /* The actual shuffle operations all operate on V16QImode. */
25843 op0 = gen_lowpart (V16QImode, op0);
25844 op1 = gen_lowpart (V16QImode, op1);
25845
25846 if (TARGET_XOP)
25847 {
25848 if (GET_MODE (target) != V16QImode)
25849 target = gen_reg_rtx (V16QImode);
25850 emit_insn (gen_xop_pperm (target, op0, op1, mask));
25851 if (target != operands[0])
25852 emit_move_insn (operands[0],
25853 gen_lowpart (GET_MODE (operands[0]), target));
25854 }
25855 else if (one_operand_shuffle)
25856 {
25857 if (GET_MODE (target) != V16QImode)
25858 target = gen_reg_rtx (V16QImode);
25859 emit_insn (gen_ssse3_pshufbv16qi3 (target, op0, mask));
25860 if (target != operands[0])
25861 emit_move_insn (operands[0],
25862 gen_lowpart (GET_MODE (operands[0]), target));
25863 }
25864 else
25865 {
25866 rtx xops[6];
25867 bool ok;
25868
25869 /* Shuffle the two input vectors independently. */
25870 t1 = gen_reg_rtx (V16QImode);
25871 t2 = gen_reg_rtx (V16QImode);
25872 emit_insn (gen_ssse3_pshufbv16qi3 (t1, op0, mask));
25873 emit_insn (gen_ssse3_pshufbv16qi3 (t2, op1, mask));
25874
25875 merge_two:
25876 /* Then merge them together. The key is whether any given control
25877 element contained a bit set that indicates the second word. */
25878 mask = operands[3];
25879 vt = GEN_INT (w);
25880 if (maskmode == V2DImode && !TARGET_SSE4_1)
25881 {
25882 /* Without SSE4.1, we don't have V2DImode EQ. Perform one
25883 more shuffle to convert the V2DI input mask into a V4SI
25884 input mask. At which point the masking that expand_int_vcond
25885 will work as desired. */
25886 rtx t3 = gen_reg_rtx (V4SImode);
25887 emit_insn (gen_sse2_pshufd_1 (t3, gen_lowpart (V4SImode, mask),
25888 const0_rtx, const0_rtx,
25889 const2_rtx, const2_rtx));
25890 mask = t3;
25891 maskmode = V4SImode;
25892 e = w = 4;
25893 }
25894
25895 for (i = 0; i < w; i++)
25896 vec[i] = vt;
25897 vt = gen_rtx_CONST_VECTOR (maskmode, gen_rtvec_v (w, vec));
25898 vt = force_reg (maskmode, vt);
25899 mask = expand_simple_binop (maskmode, AND, mask, vt,
25900 NULL_RTX, 0, OPTAB_DIRECT);
25901
25902 if (GET_MODE (target) != mode)
25903 target = gen_reg_rtx (mode);
25904 xops[0] = target;
25905 xops[1] = gen_lowpart (mode, t2);
25906 xops[2] = gen_lowpart (mode, t1);
25907 xops[3] = gen_rtx_EQ (maskmode, mask, vt);
25908 xops[4] = mask;
25909 xops[5] = vt;
25910 ok = ix86_expand_int_vcond (xops);
25911 gcc_assert (ok);
25912 if (target != operands[0])
25913 emit_move_insn (operands[0],
25914 gen_lowpart (GET_MODE (operands[0]), target));
25915 }
25916 }
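/* Usage sketch of the merge_two: step above (illustrative): after the
   two independent shuffles, element i of the result is t2[i] (the
   shuffle of op1) when mask element i had the "second operand" bit of
   value w set, and t1[i] (the shuffle of op0) otherwise; the vcond
   built from the (mask & w) == w comparison performs that per-element
   select.  */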
25917
25918 /* Unpack OP[1] into the next wider integer vector type. UNSIGNED_P is
25919 true if we should do zero extension, else sign extension. HIGH_P is
25920 true if we want the N/2 high elements, else the low elements. */
25921
25922 void
25923 ix86_expand_sse_unpack (rtx dest, rtx src, bool unsigned_p, bool high_p)
25924 {
25925 machine_mode imode = GET_MODE (src);
25926 rtx tmp;
25927
25928 if (TARGET_SSE4_1)
25929 {
25930 rtx (*unpack)(rtx, rtx);
25931 rtx (*extract)(rtx, rtx) = NULL;
25932 machine_mode halfmode = BLKmode;
25933
25934 switch (imode)
25935 {
25936 case V64QImode:
25937 if (unsigned_p)
25938 unpack = gen_avx512bw_zero_extendv32qiv32hi2;
25939 else
25940 unpack = gen_avx512bw_sign_extendv32qiv32hi2;
25941 halfmode = V32QImode;
25942 extract
25943 = high_p ? gen_vec_extract_hi_v64qi : gen_vec_extract_lo_v64qi;
25944 break;
25945 case V32QImode:
25946 if (unsigned_p)
25947 unpack = gen_avx2_zero_extendv16qiv16hi2;
25948 else
25949 unpack = gen_avx2_sign_extendv16qiv16hi2;
25950 halfmode = V16QImode;
25951 extract
25952 = high_p ? gen_vec_extract_hi_v32qi : gen_vec_extract_lo_v32qi;
25953 break;
25954 case V32HImode:
25955 if (unsigned_p)
25956 unpack = gen_avx512f_zero_extendv16hiv16si2;
25957 else
25958 unpack = gen_avx512f_sign_extendv16hiv16si2;
25959 halfmode = V16HImode;
25960 extract
25961 = high_p ? gen_vec_extract_hi_v32hi : gen_vec_extract_lo_v32hi;
25962 break;
25963 case V16HImode:
25964 if (unsigned_p)
25965 unpack = gen_avx2_zero_extendv8hiv8si2;
25966 else
25967 unpack = gen_avx2_sign_extendv8hiv8si2;
25968 halfmode = V8HImode;
25969 extract
25970 = high_p ? gen_vec_extract_hi_v16hi : gen_vec_extract_lo_v16hi;
25971 break;
25972 case V16SImode:
25973 if (unsigned_p)
25974 unpack = gen_avx512f_zero_extendv8siv8di2;
25975 else
25976 unpack = gen_avx512f_sign_extendv8siv8di2;
25977 halfmode = V8SImode;
25978 extract
25979 = high_p ? gen_vec_extract_hi_v16si : gen_vec_extract_lo_v16si;
25980 break;
25981 case V8SImode:
25982 if (unsigned_p)
25983 unpack = gen_avx2_zero_extendv4siv4di2;
25984 else
25985 unpack = gen_avx2_sign_extendv4siv4di2;
25986 halfmode = V4SImode;
25987 extract
25988 = high_p ? gen_vec_extract_hi_v8si : gen_vec_extract_lo_v8si;
25989 break;
25990 case V16QImode:
25991 if (unsigned_p)
25992 unpack = gen_sse4_1_zero_extendv8qiv8hi2;
25993 else
25994 unpack = gen_sse4_1_sign_extendv8qiv8hi2;
25995 break;
25996 case V8HImode:
25997 if (unsigned_p)
25998 unpack = gen_sse4_1_zero_extendv4hiv4si2;
25999 else
26000 unpack = gen_sse4_1_sign_extendv4hiv4si2;
26001 break;
26002 case V4SImode:
26003 if (unsigned_p)
26004 unpack = gen_sse4_1_zero_extendv2siv2di2;
26005 else
26006 unpack = gen_sse4_1_sign_extendv2siv2di2;
26007 break;
26008 default:
26009 gcc_unreachable ();
26010 }
26011
26012 if (GET_MODE_SIZE (imode) >= 32)
26013 {
26014 tmp = gen_reg_rtx (halfmode);
26015 emit_insn (extract (tmp, src));
26016 }
26017 else if (high_p)
26018 {
26019 /* Shift higher 8 bytes to lower 8 bytes. */
26020 tmp = gen_reg_rtx (V1TImode);
26021 emit_insn (gen_sse2_lshrv1ti3 (tmp, gen_lowpart (V1TImode, src),
26022 GEN_INT (64)));
26023 tmp = gen_lowpart (imode, tmp);
26024 }
26025 else
26026 tmp = src;
26027
26028 emit_insn (unpack (dest, tmp));
26029 }
26030 else
26031 {
26032 rtx (*unpack)(rtx, rtx, rtx);
26033
26034 switch (imode)
26035 {
26036 case V16QImode:
26037 if (high_p)
26038 unpack = gen_vec_interleave_highv16qi;
26039 else
26040 unpack = gen_vec_interleave_lowv16qi;
26041 break;
26042 case V8HImode:
26043 if (high_p)
26044 unpack = gen_vec_interleave_highv8hi;
26045 else
26046 unpack = gen_vec_interleave_lowv8hi;
26047 break;
26048 case V4SImode:
26049 if (high_p)
26050 unpack = gen_vec_interleave_highv4si;
26051 else
26052 unpack = gen_vec_interleave_lowv4si;
26053 break;
26054 default:
26055 gcc_unreachable ();
26056 }
26057
26058 if (unsigned_p)
26059 tmp = force_reg (imode, CONST0_RTX (imode));
26060 else
26061 tmp = ix86_expand_sse_cmp (gen_reg_rtx (imode), GT, CONST0_RTX (imode),
26062 src, pc_rtx, pc_rtx);
26063
26064 rtx tmp2 = gen_reg_rtx (imode);
26065 emit_insn (unpack (tmp2, src, tmp));
26066 emit_move_insn (dest, gen_lowpart (GET_MODE (dest), tmp2));
26067 }
26068 }
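/* Worked example of the pre-SSE4.1 path above (illustration only):
   signed unpack of V16QImode uses tmp = (0 > src), a pcmpgt that
   yields 0xff exactly in the lanes where src is negative, and then
   interleaves src with tmp.  For the byte 0x85 (-123) the interleaved
   16-bit lane is 0xff85, i.e. -123 sign-extended; for 0x23 (+35) it is
   0x0023.  Unsigned unpack interleaves with a zero vector instead.  */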
26069
26070 /* Expand conditional increment or decrement using adc/sbb instructions.
26071 The default case using setcc followed by the conditional move can be
26072 done by generic code. */
26073 bool
26074 ix86_expand_int_addcc (rtx operands[])
26075 {
26076 enum rtx_code code = GET_CODE (operands[1]);
26077 rtx flags;
26078 rtx (*insn)(rtx, rtx, rtx, rtx, rtx);
26079 rtx compare_op;
26080 rtx val = const0_rtx;
26081 bool fpcmp = false;
26082 machine_mode mode;
26083 rtx op0 = XEXP (operands[1], 0);
26084 rtx op1 = XEXP (operands[1], 1);
26085
26086 if (operands[3] != const1_rtx
26087 && operands[3] != constm1_rtx)
26088 return false;
26089 if (!ix86_expand_carry_flag_compare (code, op0, op1, &compare_op))
26090 return false;
26091 code = GET_CODE (compare_op);
26092
26093 flags = XEXP (compare_op, 0);
26094
26095 if (GET_MODE (flags) == CCFPmode
26096 || GET_MODE (flags) == CCFPUmode)
26097 {
26098 fpcmp = true;
26099 code = ix86_fp_compare_code_to_integer (code);
26100 }
26101
26102 if (code != LTU)
26103 {
26104 val = constm1_rtx;
26105 if (fpcmp)
26106 PUT_CODE (compare_op,
26107 reverse_condition_maybe_unordered
26108 (GET_CODE (compare_op)));
26109 else
26110 PUT_CODE (compare_op, reverse_condition (GET_CODE (compare_op)));
26111 }
26112
26113 mode = GET_MODE (operands[0]);
26114
26115 /* Construct either adc or sbb insn. */
26116 if ((code == LTU) == (operands[3] == constm1_rtx))
26117 {
26118 switch (mode)
26119 {
26120 case QImode:
26121 insn = gen_subqi3_carry;
26122 break;
26123 case HImode:
26124 insn = gen_subhi3_carry;
26125 break;
26126 case SImode:
26127 insn = gen_subsi3_carry;
26128 break;
26129 case DImode:
26130 insn = gen_subdi3_carry;
26131 break;
26132 default:
26133 gcc_unreachable ();
26134 }
26135 }
26136 else
26137 {
26138 switch (mode)
26139 {
26140 case QImode:
26141 insn = gen_addqi3_carry;
26142 break;
26143 case HImode:
26144 insn = gen_addhi3_carry;
26145 break;
26146 case SImode:
26147 insn = gen_addsi3_carry;
26148 break;
26149 case DImode:
26150 insn = gen_adddi3_carry;
26151 break;
26152 default:
26153 gcc_unreachable ();
26154 }
26155 }
26156 emit_insn (insn (operands[0], operands[2], val, flags, compare_op));
26157
26158 return true;
26159 }
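/* A minimal sketch of the transformation above (illustrative pseudo
   assembly, Intel operand order; register choice is up to the
   allocator): for unsigned operands,
       if (a < b) x++;
   becomes
       cmp a, b    ; carry flag = (a < b)
       adc x, 0    ; x = x + 0 + carry
   and the decrement variant uses sbb to subtract the carry instead.  */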
26160
26161
26162 /* Split operands 0 and 1 into half-mode parts. Similar to split_double_mode,
26163 but works for floating-point parameters and non-offsettable memories.
26164 For pushes, it returns just stack offsets; the values will be saved
26165 in the right order. Up to four parts are generated. */
26166
26167 static int
26168 ix86_split_to_parts (rtx operand, rtx *parts, machine_mode mode)
26169 {
26170 int size;
26171
26172 if (!TARGET_64BIT)
26173 size = mode==XFmode ? 3 : GET_MODE_SIZE (mode) / 4;
26174 else
26175 size = (GET_MODE_SIZE (mode) + 4) / 8;
26176
26177 gcc_assert (!REG_P (operand) || !MMX_REGNO_P (REGNO (operand)));
26178 gcc_assert (size >= 2 && size <= 4);
26179
26180 /* Optimize constant pool reference to immediates. This is used by fp
26181 moves, that force all constants to memory to allow combining. */
26182 if (MEM_P (operand) && MEM_READONLY_P (operand))
26183 {
26184 rtx tmp = maybe_get_pool_constant (operand);
26185 if (tmp)
26186 operand = tmp;
26187 }
26188
26189 if (MEM_P (operand) && !offsettable_memref_p (operand))
26190 {
26191 /* The only non-offsettable memories we handle are pushes. */
26192 int ok = push_operand (operand, VOIDmode);
26193
26194 gcc_assert (ok);
26195
26196 operand = copy_rtx (operand);
26197 PUT_MODE (operand, word_mode);
26198 parts[0] = parts[1] = parts[2] = parts[3] = operand;
26199 return size;
26200 }
26201
26202 if (GET_CODE (operand) == CONST_VECTOR)
26203 {
26204 machine_mode imode = int_mode_for_mode (mode);
26205 /* Caution: if we looked through a constant pool memory above,
26206 the operand may actually have a different mode now. That's
26207 ok, since we want to pun this all the way back to an integer. */
26208 operand = simplify_subreg (imode, operand, GET_MODE (operand), 0);
26209 gcc_assert (operand != NULL);
26210 mode = imode;
26211 }
26212
26213 if (!TARGET_64BIT)
26214 {
26215 if (mode == DImode)
26216 split_double_mode (mode, &operand, 1, &parts[0], &parts[1]);
26217 else
26218 {
26219 int i;
26220
26221 if (REG_P (operand))
26222 {
26223 gcc_assert (reload_completed);
26224 for (i = 0; i < size; i++)
26225 parts[i] = gen_rtx_REG (SImode, REGNO (operand) + i);
26226 }
26227 else if (offsettable_memref_p (operand))
26228 {
26229 operand = adjust_address (operand, SImode, 0);
26230 parts[0] = operand;
26231 for (i = 1; i < size; i++)
26232 parts[i] = adjust_address (operand, SImode, 4 * i);
26233 }
26234 else if (CONST_DOUBLE_P (operand))
26235 {
26236 const REAL_VALUE_TYPE *r;
26237 long l[4];
26238
26239 r = CONST_DOUBLE_REAL_VALUE (operand);
26240 switch (mode)
26241 {
26242 case TFmode:
26243 real_to_target (l, r, mode);
26244 parts[3] = gen_int_mode (l[3], SImode);
26245 parts[2] = gen_int_mode (l[2], SImode);
26246 break;
26247 case XFmode:
26248 /* We can't use REAL_VALUE_TO_TARGET_LONG_DOUBLE since
26249 long double may not be 80-bit. */
26250 real_to_target (l, r, mode);
26251 parts[2] = gen_int_mode (l[2], SImode);
26252 break;
26253 case DFmode:
26254 REAL_VALUE_TO_TARGET_DOUBLE (*r, l);
26255 break;
26256 default:
26257 gcc_unreachable ();
26258 }
26259 parts[1] = gen_int_mode (l[1], SImode);
26260 parts[0] = gen_int_mode (l[0], SImode);
26261 }
26262 else
26263 gcc_unreachable ();
26264 }
26265 }
26266 else
26267 {
26268 if (mode == TImode)
26269 split_double_mode (mode, &operand, 1, &parts[0], &parts[1]);
26270 if (mode == XFmode || mode == TFmode)
26271 {
26272 machine_mode upper_mode = mode==XFmode ? SImode : DImode;
26273 if (REG_P (operand))
26274 {
26275 gcc_assert (reload_completed);
26276 parts[0] = gen_rtx_REG (DImode, REGNO (operand) + 0);
26277 parts[1] = gen_rtx_REG (upper_mode, REGNO (operand) + 1);
26278 }
26279 else if (offsettable_memref_p (operand))
26280 {
26281 operand = adjust_address (operand, DImode, 0);
26282 parts[0] = operand;
26283 parts[1] = adjust_address (operand, upper_mode, 8);
26284 }
26285 else if (CONST_DOUBLE_P (operand))
26286 {
26287 long l[4];
26288
26289 real_to_target (l, CONST_DOUBLE_REAL_VALUE (operand), mode);
26290
26291 /* real_to_target puts 32-bit pieces in each long. */
26292 parts[0] = gen_int_mode ((l[0] & HOST_WIDE_INT_C (0xffffffff))
26293 | ((l[1] & HOST_WIDE_INT_C (0xffffffff))
26294 << 32), DImode);
26295
26296 if (upper_mode == SImode)
26297 parts[1] = gen_int_mode (l[2], SImode);
26298 else
26299 parts[1]
26300 = gen_int_mode ((l[2] & HOST_WIDE_INT_C (0xffffffff))
26301 | ((l[3] & HOST_WIDE_INT_C (0xffffffff))
26302 << 32), DImode);
26303 }
26304 else
26305 gcc_unreachable ();
26306 }
26307 }
26308
26309 return size;
26310 }
26311
26312 /* Emit insns to perform a move or push of DI, DF, XF, and TF values.
26313 All required insns are emitted directly. Operands 2-5 receive the
26314 destination parts in the correct order; operands 6-9 receive the
26315 corresponding source parts. */
26316
26317 void
26318 ix86_split_long_move (rtx operands[])
26319 {
26320 rtx part[2][4];
26321 int nparts, i, j;
26322 int push = 0;
26323 int collisions = 0;
26324 machine_mode mode = GET_MODE (operands[0]);
26325 bool collisionparts[4];
26326
26327 /* The DFmode expanders may ask us to move a double.
26328 For a 64-bit target this is a single move. By hiding that fact
26329 here we simplify the i386.md splitters. */
26330 if (TARGET_64BIT && GET_MODE_SIZE (GET_MODE (operands[0])) == 8)
26331 {
26332 /* Optimize constant pool reference to immediates. This is used by
26333 fp moves, that force all constants to memory to allow combining. */
26334
26335 if (MEM_P (operands[1])
26336 && GET_CODE (XEXP (operands[1], 0)) == SYMBOL_REF
26337 && CONSTANT_POOL_ADDRESS_P (XEXP (operands[1], 0)))
26338 operands[1] = get_pool_constant (XEXP (operands[1], 0));
26339 if (push_operand (operands[0], VOIDmode))
26340 {
26341 operands[0] = copy_rtx (operands[0]);
26342 PUT_MODE (operands[0], word_mode);
26343 }
26344 else
26345 operands[0] = gen_lowpart (DImode, operands[0]);
26346 operands[1] = gen_lowpart (DImode, operands[1]);
26347 emit_move_insn (operands[0], operands[1]);
26348 return;
26349 }
26350
26351 /* The only non-offsettable memory we handle is push. */
26352 if (push_operand (operands[0], VOIDmode))
26353 push = 1;
26354 else
26355 gcc_assert (!MEM_P (operands[0])
26356 || offsettable_memref_p (operands[0]));
26357
26358 nparts = ix86_split_to_parts (operands[1], part[1], GET_MODE (operands[0]));
26359 ix86_split_to_parts (operands[0], part[0], GET_MODE (operands[0]));
26360
26361 /* When emitting push, take care for source operands on the stack. */
26362 if (push && MEM_P (operands[1])
26363 && reg_overlap_mentioned_p (stack_pointer_rtx, operands[1]))
26364 {
26365 rtx src_base = XEXP (part[1][nparts - 1], 0);
26366
26367 /* Compensate for the stack decrement by 4. */
26368 if (!TARGET_64BIT && nparts == 3
26369 && mode == XFmode && TARGET_128BIT_LONG_DOUBLE)
26370 src_base = plus_constant (Pmode, src_base, 4);
26371
26372 /* src_base refers to the stack pointer and is
26373 automatically decreased by emitted push. */
26374 for (i = 0; i < nparts; i++)
26375 part[1][i] = change_address (part[1][i],
26376 GET_MODE (part[1][i]), src_base);
26377 }
26378
26379 /* We need to do copy in the right order in case an address register
26380 of the source overlaps the destination. */
26381 if (REG_P (part[0][0]) && MEM_P (part[1][0]))
26382 {
26383 rtx tmp;
26384
26385 for (i = 0; i < nparts; i++)
26386 {
26387 collisionparts[i]
26388 = reg_overlap_mentioned_p (part[0][i], XEXP (part[1][0], 0));
26389 if (collisionparts[i])
26390 collisions++;
26391 }
26392
26393 /* Collision in the middle part can be handled by reordering. */
26394 if (collisions == 1 && nparts == 3 && collisionparts [1])
26395 {
26396 std::swap (part[0][1], part[0][2]);
26397 std::swap (part[1][1], part[1][2]);
26398 }
26399 else if (collisions == 1
26400 && nparts == 4
26401 && (collisionparts [1] || collisionparts [2]))
26402 {
26403 if (collisionparts [1])
26404 {
26405 std::swap (part[0][1], part[0][2]);
26406 std::swap (part[1][1], part[1][2]);
26407 }
26408 else
26409 {
26410 std::swap (part[0][2], part[0][3]);
26411 std::swap (part[1][2], part[1][3]);
26412 }
26413 }
26414
26415 /* If there are more collisions, we can't handle it by reordering.
26416 Do an lea to the last part and use only one colliding move. */
26417 else if (collisions > 1)
26418 {
26419 rtx base, addr, tls_base = NULL_RTX;
26420
26421 collisions = 1;
26422
26423 base = part[0][nparts - 1];
26424
26425 /* Handle the case when the last part isn't valid for lea.
26426 Happens in 64-bit mode storing the 12-byte XFmode. */
26427 if (GET_MODE (base) != Pmode)
26428 base = gen_rtx_REG (Pmode, REGNO (base));
26429
26430 addr = XEXP (part[1][0], 0);
26431 if (TARGET_TLS_DIRECT_SEG_REFS)
26432 {
26433 struct ix86_address parts;
26434 int ok = ix86_decompose_address (addr, &parts);
26435 gcc_assert (ok);
26436 if (parts.seg == DEFAULT_TLS_SEG_REG)
26437 {
26438 /* It is not valid to use %gs: or %fs: in
26439 lea though, so we need to remove it from the
26440 address used for lea and add it to each individual
26441 memory load instead. */
26442 addr = copy_rtx (addr);
26443 rtx *x = &addr;
26444 while (GET_CODE (*x) == PLUS)
26445 {
26446 for (i = 0; i < 2; i++)
26447 {
26448 rtx u = XEXP (*x, i);
26449 if (GET_CODE (u) == ZERO_EXTEND)
26450 u = XEXP (u, 0);
26451 if (GET_CODE (u) == UNSPEC
26452 && XINT (u, 1) == UNSPEC_TP)
26453 {
26454 tls_base = XEXP (*x, i);
26455 *x = XEXP (*x, 1 - i);
26456 break;
26457 }
26458 }
26459 if (tls_base)
26460 break;
26461 x = &XEXP (*x, 0);
26462 }
26463 gcc_assert (tls_base);
26464 }
26465 }
26466 emit_insn (gen_rtx_SET (base, addr));
26467 if (tls_base)
26468 base = gen_rtx_PLUS (GET_MODE (base), base, tls_base);
26469 part[1][0] = replace_equiv_address (part[1][0], base);
26470 for (i = 1; i < nparts; i++)
26471 {
26472 if (tls_base)
26473 base = copy_rtx (base);
26474 tmp = plus_constant (Pmode, base, UNITS_PER_WORD * i);
26475 part[1][i] = replace_equiv_address (part[1][i], tmp);
26476 }
26477 }
26478 }
26479
26480 if (push)
26481 {
26482 if (!TARGET_64BIT)
26483 {
26484 if (nparts == 3)
26485 {
26486 if (TARGET_128BIT_LONG_DOUBLE && mode == XFmode)
26487 emit_insn (ix86_gen_add3 (stack_pointer_rtx,
26488 stack_pointer_rtx, GEN_INT (-4)));
26489 emit_move_insn (part[0][2], part[1][2]);
26490 }
26491 else if (nparts == 4)
26492 {
26493 emit_move_insn (part[0][3], part[1][3]);
26494 emit_move_insn (part[0][2], part[1][2]);
26495 }
26496 }
26497 else
26498 {
26499 /* In 64-bit mode we don't have a 32-bit push available. If this is
26500 a register, that is fine - we just use the larger counterpart. We
26501 also retype memory - this comes from an attempt to avoid a REX
26502 prefix when moving the second half of a TFmode value. */
26503 if (GET_MODE (part[1][1]) == SImode)
26504 {
26505 switch (GET_CODE (part[1][1]))
26506 {
26507 case MEM:
26508 part[1][1] = adjust_address (part[1][1], DImode, 0);
26509 break;
26510
26511 case REG:
26512 part[1][1] = gen_rtx_REG (DImode, REGNO (part[1][1]));
26513 break;
26514
26515 default:
26516 gcc_unreachable ();
26517 }
26518
26519 if (GET_MODE (part[1][0]) == SImode)
26520 part[1][0] = part[1][1];
26521 }
26522 }
26523 emit_move_insn (part[0][1], part[1][1]);
26524 emit_move_insn (part[0][0], part[1][0]);
26525 return;
26526 }
26527
26528 /* Choose correct order to not overwrite the source before it is copied. */
26529 if ((REG_P (part[0][0])
26530 && REG_P (part[1][1])
26531 && (REGNO (part[0][0]) == REGNO (part[1][1])
26532 || (nparts == 3
26533 && REGNO (part[0][0]) == REGNO (part[1][2]))
26534 || (nparts == 4
26535 && REGNO (part[0][0]) == REGNO (part[1][3]))))
26536 || (collisions > 0
26537 && reg_overlap_mentioned_p (part[0][0], XEXP (part[1][0], 0))))
26538 {
26539 for (i = 0, j = nparts - 1; i < nparts; i++, j--)
26540 {
26541 operands[2 + i] = part[0][j];
26542 operands[6 + i] = part[1][j];
26543 }
26544 }
26545 else
26546 {
26547 for (i = 0; i < nparts; i++)
26548 {
26549 operands[2 + i] = part[0][i];
26550 operands[6 + i] = part[1][i];
26551 }
26552 }
26553
26554 /* If optimizing for size, attempt to locally unCSE nonzero constants. */
26555 if (optimize_insn_for_size_p ())
26556 {
26557 for (j = 0; j < nparts - 1; j++)
26558 if (CONST_INT_P (operands[6 + j])
26559 && operands[6 + j] != const0_rtx
26560 && REG_P (operands[2 + j]))
26561 for (i = j; i < nparts - 1; i++)
26562 if (CONST_INT_P (operands[7 + i])
26563 && INTVAL (operands[7 + i]) == INTVAL (operands[6 + j]))
26564 operands[7 + i] = operands[2 + j];
26565 }
26566
26567 for (i = 0; i < nparts; i++)
26568 emit_move_insn (operands[2 + i], operands[6 + i]);
26569
26570 return;
26571 }
26572
26573 /* Helper function of ix86_split_ashl used to generate an SImode/DImode
26574 left shift by a constant, either using a single shift or
26575 a sequence of add instructions. */
26576
26577 static void
26578 ix86_expand_ashl_const (rtx operand, int count, machine_mode mode)
26579 {
26580 rtx (*insn)(rtx, rtx, rtx);
26581
26582 if (count == 1
26583 || (count * ix86_cost->add <= ix86_cost->shift_const
26584 && !optimize_insn_for_size_p ()))
26585 {
26586 insn = mode == DImode ? gen_addsi3 : gen_adddi3;
26587 while (count-- > 0)
26588 emit_insn (insn (operand, operand, operand));
26589 }
26590 else
26591 {
26592 insn = mode == DImode ? gen_ashlsi3 : gen_ashldi3;
26593 emit_insn (insn (operand, operand, GEN_INT (count)));
26594 }
26595 }
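/* Brief cost note for the helper above (illustration only): x + x is
   the same as x << 1, so a left shift by COUNT can be replaced by
   COUNT self-additions, e.g. a shift by 2 becomes two "add reg, reg"
   instructions.  This is done for COUNT == 1 unconditionally, and for
   larger counts only when COUNT times the add cost does not exceed the
   constant-shift cost and we are not optimizing for size.  */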
26596
26597 void
26598 ix86_split_ashl (rtx *operands, rtx scratch, machine_mode mode)
26599 {
26600 rtx (*gen_ashl3)(rtx, rtx, rtx);
26601 rtx (*gen_shld)(rtx, rtx, rtx);
26602 int half_width = GET_MODE_BITSIZE (mode) >> 1;
26603
26604 rtx low[2], high[2];
26605 int count;
26606
26607 if (CONST_INT_P (operands[2]))
26608 {
26609 split_double_mode (mode, operands, 2, low, high);
26610 count = INTVAL (operands[2]) & (GET_MODE_BITSIZE (mode) - 1);
26611
26612 if (count >= half_width)
26613 {
26614 emit_move_insn (high[0], low[1]);
26615 emit_move_insn (low[0], const0_rtx);
26616
26617 if (count > half_width)
26618 ix86_expand_ashl_const (high[0], count - half_width, mode);
26619 }
26620 else
26621 {
26622 gen_shld = mode == DImode ? gen_x86_shld : gen_x86_64_shld;
26623
26624 if (!rtx_equal_p (operands[0], operands[1]))
26625 emit_move_insn (operands[0], operands[1]);
26626
26627 emit_insn (gen_shld (high[0], low[0], GEN_INT (count)));
26628 ix86_expand_ashl_const (low[0], count, mode);
26629 }
26630 return;
26631 }
26632
26633 split_double_mode (mode, operands, 1, low, high);
26634
26635 gen_ashl3 = mode == DImode ? gen_ashlsi3 : gen_ashldi3;
26636
26637 if (operands[1] == const1_rtx)
26638 {
26639 /* Assuming we've chosen QImode-capable registers, 1 << N
26640 can be done with two 32/64-bit shifts, no branches, no cmoves. */
26641 if (ANY_QI_REG_P (low[0]) && ANY_QI_REG_P (high[0]))
26642 {
26643 rtx s, d, flags = gen_rtx_REG (CCZmode, FLAGS_REG);
26644
26645 ix86_expand_clear (low[0]);
26646 ix86_expand_clear (high[0]);
26647 emit_insn (gen_testqi_ccz_1 (operands[2], GEN_INT (half_width)));
26648
26649 d = gen_lowpart (QImode, low[0]);
26650 d = gen_rtx_STRICT_LOW_PART (VOIDmode, d);
26651 s = gen_rtx_EQ (QImode, flags, const0_rtx);
26652 emit_insn (gen_rtx_SET (d, s));
26653
26654 d = gen_lowpart (QImode, high[0]);
26655 d = gen_rtx_STRICT_LOW_PART (VOIDmode, d);
26656 s = gen_rtx_NE (QImode, flags, const0_rtx);
26657 emit_insn (gen_rtx_SET (d, s));
26658 }
26659
26660 /* Otherwise, we can get the same results by manually performing
26661 a bit extract operation on bit 5/6, and then performing the two
26662 shifts. The two methods of getting 0/1 into low/high are exactly
26663 the same size. Avoiding the shift in the bit extract case helps
26664 pentium4 a bit; no one else seems to care much either way. */
26665 else
26666 {
26667 machine_mode half_mode;
26668 rtx (*gen_lshr3)(rtx, rtx, rtx);
26669 rtx (*gen_and3)(rtx, rtx, rtx);
26670 rtx (*gen_xor3)(rtx, rtx, rtx);
26671 HOST_WIDE_INT bits;
26672 rtx x;
26673
26674 if (mode == DImode)
26675 {
26676 half_mode = SImode;
26677 gen_lshr3 = gen_lshrsi3;
26678 gen_and3 = gen_andsi3;
26679 gen_xor3 = gen_xorsi3;
26680 bits = 5;
26681 }
26682 else
26683 {
26684 half_mode = DImode;
26685 gen_lshr3 = gen_lshrdi3;
26686 gen_and3 = gen_anddi3;
26687 gen_xor3 = gen_xordi3;
26688 bits = 6;
26689 }
26690
26691 if (TARGET_PARTIAL_REG_STALL && !optimize_insn_for_size_p ())
26692 x = gen_rtx_ZERO_EXTEND (half_mode, operands[2]);
26693 else
26694 x = gen_lowpart (half_mode, operands[2]);
26695 emit_insn (gen_rtx_SET (high[0], x));
26696
26697 emit_insn (gen_lshr3 (high[0], high[0], GEN_INT (bits)));
26698 emit_insn (gen_and3 (high[0], high[0], const1_rtx));
26699 emit_move_insn (low[0], high[0]);
26700 emit_insn (gen_xor3 (low[0], low[0], const1_rtx));
26701 }
26702
26703 emit_insn (gen_ashl3 (low[0], low[0], operands[2]));
26704 emit_insn (gen_ashl3 (high[0], high[0], operands[2]));
26705 return;
26706 }
26707
26708 if (operands[1] == constm1_rtx)
26709 {
26710 /* For -1 << N, we can avoid the shld instruction, because we
26711 know that we're shifting 0...31/63 ones into a -1. */
26712 emit_move_insn (low[0], constm1_rtx);
26713 if (optimize_insn_for_size_p ())
26714 emit_move_insn (high[0], low[0]);
26715 else
26716 emit_move_insn (high[0], constm1_rtx);
26717 }
26718 else
26719 {
26720 gen_shld = mode == DImode ? gen_x86_shld : gen_x86_64_shld;
26721
26722 if (!rtx_equal_p (operands[0], operands[1]))
26723 emit_move_insn (operands[0], operands[1]);
26724
26725 split_double_mode (mode, operands, 1, low, high);
26726 emit_insn (gen_shld (high[0], low[0], operands[2]));
26727 }
26728
26729 emit_insn (gen_ashl3 (low[0], low[0], operands[2]));
26730
26731 if (TARGET_CMOVE && scratch)
26732 {
26733 rtx (*gen_x86_shift_adj_1)(rtx, rtx, rtx, rtx)
26734 = mode == DImode ? gen_x86_shiftsi_adj_1 : gen_x86_shiftdi_adj_1;
26735
26736 ix86_expand_clear (scratch);
26737 emit_insn (gen_x86_shift_adj_1 (high[0], low[0], operands[2], scratch));
26738 }
26739 else
26740 {
26741 rtx (*gen_x86_shift_adj_2)(rtx, rtx, rtx)
26742 = mode == DImode ? gen_x86_shiftsi_adj_2 : gen_x86_shiftdi_adj_2;
26743
26744 emit_insn (gen_x86_shift_adj_2 (high[0], low[0], operands[2]));
26745 }
26746 }
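/* Worked example of the 1 << N special case above (illustration only),
   for DImode on a 32-bit target: both halves are cleared, bit 5 of the
   count is tested (e.g. test $32, %cl), sete writes the low byte of
   the low half and setne the low byte of the high half, and each half
   is then shifted left by the hardware-masked count.  For N = 40 the
   low half starts as 0 and stays 0, the high half starts as 1 and is
   shifted by 40 & 31 = 8, giving the pair { 0, 1 << 8 }, i.e. 1 << 40
   overall.  */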
26747
26748 void
26749 ix86_split_ashr (rtx *operands, rtx scratch, machine_mode mode)
26750 {
26751 rtx (*gen_ashr3)(rtx, rtx, rtx)
26752 = mode == DImode ? gen_ashrsi3 : gen_ashrdi3;
26753 rtx (*gen_shrd)(rtx, rtx, rtx);
26754 int half_width = GET_MODE_BITSIZE (mode) >> 1;
26755
26756 rtx low[2], high[2];
26757 int count;
26758
26759 if (CONST_INT_P (operands[2]))
26760 {
26761 split_double_mode (mode, operands, 2, low, high);
26762 count = INTVAL (operands[2]) & (GET_MODE_BITSIZE (mode) - 1);
26763
26764 if (count == GET_MODE_BITSIZE (mode) - 1)
26765 {
26766 emit_move_insn (high[0], high[1]);
26767 emit_insn (gen_ashr3 (high[0], high[0],
26768 GEN_INT (half_width - 1)));
26769 emit_move_insn (low[0], high[0]);
26770
26771 }
26772 else if (count >= half_width)
26773 {
26774 emit_move_insn (low[0], high[1]);
26775 emit_move_insn (high[0], low[0]);
26776 emit_insn (gen_ashr3 (high[0], high[0],
26777 GEN_INT (half_width - 1)));
26778
26779 if (count > half_width)
26780 emit_insn (gen_ashr3 (low[0], low[0],
26781 GEN_INT (count - half_width)));
26782 }
26783 else
26784 {
26785 gen_shrd = mode == DImode ? gen_x86_shrd : gen_x86_64_shrd;
26786
26787 if (!rtx_equal_p (operands[0], operands[1]))
26788 emit_move_insn (operands[0], operands[1]);
26789
26790 emit_insn (gen_shrd (low[0], high[0], GEN_INT (count)));
26791 emit_insn (gen_ashr3 (high[0], high[0], GEN_INT (count)));
26792 }
26793 }
26794 else
26795 {
26796 gen_shrd = mode == DImode ? gen_x86_shrd : gen_x86_64_shrd;
26797
26798 if (!rtx_equal_p (operands[0], operands[1]))
26799 emit_move_insn (operands[0], operands[1]);
26800
26801 split_double_mode (mode, operands, 1, low, high);
26802
26803 emit_insn (gen_shrd (low[0], high[0], operands[2]));
26804 emit_insn (gen_ashr3 (high[0], high[0], operands[2]));
26805
26806 if (TARGET_CMOVE && scratch)
26807 {
26808 rtx (*gen_x86_shift_adj_1)(rtx, rtx, rtx, rtx)
26809 = mode == DImode ? gen_x86_shiftsi_adj_1 : gen_x86_shiftdi_adj_1;
26810
26811 emit_move_insn (scratch, high[0]);
26812 emit_insn (gen_ashr3 (scratch, scratch,
26813 GEN_INT (half_width - 1)));
26814 emit_insn (gen_x86_shift_adj_1 (low[0], high[0], operands[2],
26815 scratch));
26816 }
26817 else
26818 {
26819 rtx (*gen_x86_shift_adj_3)(rtx, rtx, rtx)
26820 = mode == DImode ? gen_x86_shiftsi_adj_3 : gen_x86_shiftdi_adj_3;
26821
26822 emit_insn (gen_x86_shift_adj_3 (low[0], high[0], operands[2]));
26823 }
26824 }
26825 }
26826
26827 void
26828 ix86_split_lshr (rtx *operands, rtx scratch, machine_mode mode)
26829 {
26830 rtx (*gen_lshr3)(rtx, rtx, rtx)
26831 = mode == DImode ? gen_lshrsi3 : gen_lshrdi3;
26832 rtx (*gen_shrd)(rtx, rtx, rtx);
26833 int half_width = GET_MODE_BITSIZE (mode) >> 1;
26834
26835 rtx low[2], high[2];
26836 int count;
26837
26838 if (CONST_INT_P (operands[2]))
26839 {
26840 split_double_mode (mode, operands, 2, low, high);
26841 count = INTVAL (operands[2]) & (GET_MODE_BITSIZE (mode) - 1);
26842
26843 if (count >= half_width)
26844 {
26845 emit_move_insn (low[0], high[1]);
26846 ix86_expand_clear (high[0]);
26847
26848 if (count > half_width)
26849 emit_insn (gen_lshr3 (low[0], low[0],
26850 GEN_INT (count - half_width)));
26851 }
26852 else
26853 {
26854 gen_shrd = mode == DImode ? gen_x86_shrd : gen_x86_64_shrd;
26855
26856 if (!rtx_equal_p (operands[0], operands[1]))
26857 emit_move_insn (operands[0], operands[1]);
26858
26859 emit_insn (gen_shrd (low[0], high[0], GEN_INT (count)));
26860 emit_insn (gen_lshr3 (high[0], high[0], GEN_INT (count)));
26861 }
26862 }
26863 else
26864 {
26865 gen_shrd = mode == DImode ? gen_x86_shrd : gen_x86_64_shrd;
26866
26867 if (!rtx_equal_p (operands[0], operands[1]))
26868 emit_move_insn (operands[0], operands[1]);
26869
26870 split_double_mode (mode, operands, 1, low, high);
26871
26872 emit_insn (gen_shrd (low[0], high[0], operands[2]));
26873 emit_insn (gen_lshr3 (high[0], high[0], operands[2]));
26874
26875 if (TARGET_CMOVE && scratch)
26876 {
26877 rtx (*gen_x86_shift_adj_1)(rtx, rtx, rtx, rtx)
26878 = mode == DImode ? gen_x86_shiftsi_adj_1 : gen_x86_shiftdi_adj_1;
26879
26880 ix86_expand_clear (scratch);
26881 emit_insn (gen_x86_shift_adj_1 (low[0], high[0], operands[2],
26882 scratch));
26883 }
26884 else
26885 {
26886 rtx (*gen_x86_shift_adj_2)(rtx, rtx, rtx)
26887 = mode == DImode ? gen_x86_shiftsi_adj_2 : gen_x86_shiftdi_adj_2;
26888
26889 emit_insn (gen_x86_shift_adj_2 (low[0], high[0], operands[2]));
26890 }
26891 }
26892 }
26893
26894 /* Predict just emitted jump instruction to be taken with probability PROB. */
26895 static void
26896 predict_jump (int prob)
26897 {
26898 rtx_insn *insn = get_last_insn ();
26899 gcc_assert (JUMP_P (insn));
26900 add_int_reg_note (insn, REG_BR_PROB, prob);
26901 }
26902
26903 /* Helper function for the string operations below. Test whether VARIABLE
26904 is aligned to VALUE bytes; if so, jump to the label. */
26905 static rtx_code_label *
26906 ix86_expand_aligntest (rtx variable, int value, bool epilogue)
26907 {
26908 rtx_code_label *label = gen_label_rtx ();
26909 rtx tmpcount = gen_reg_rtx (GET_MODE (variable));
26910 if (GET_MODE (variable) == DImode)
26911 emit_insn (gen_anddi3 (tmpcount, variable, GEN_INT (value)));
26912 else
26913 emit_insn (gen_andsi3 (tmpcount, variable, GEN_INT (value)));
26914 emit_cmp_and_jump_insns (tmpcount, const0_rtx, EQ, 0, GET_MODE (variable),
26915 1, label);
26916 if (epilogue)
26917 predict_jump (REG_BR_PROB_BASE * 50 / 100);
26918 else
26919 predict_jump (REG_BR_PROB_BASE * 90 / 100);
26920 return label;
26921 }
26922
26923 /* Decrease COUNTREG by VALUE. */
26924 static void
26925 ix86_adjust_counter (rtx countreg, HOST_WIDE_INT value)
26926 {
26927 rtx (*gen_add)(rtx, rtx, rtx)
26928 = GET_MODE (countreg) == DImode ? gen_adddi3 : gen_addsi3;
26929
26930 emit_insn (gen_add (countreg, countreg, GEN_INT (-value)));
26931 }
26932
26933 /* Zero-extend EXP, which may be in SImode, to a Pmode register. */
26934 rtx
26935 ix86_zero_extend_to_Pmode (rtx exp)
26936 {
26937 return force_reg (Pmode, convert_to_mode (Pmode, exp, 1));
26938 }
26939
26940 /* Divide COUNTREG by SCALE. */
26941 static rtx
26942 scale_counter (rtx countreg, int scale)
26943 {
26944 rtx sc;
26945
26946 if (scale == 1)
26947 return countreg;
26948 if (CONST_INT_P (countreg))
26949 return GEN_INT (INTVAL (countreg) / scale);
26950 gcc_assert (REG_P (countreg));
26951
26952 sc = expand_simple_binop (GET_MODE (countreg), LSHIFTRT, countreg,
26953 GEN_INT (exact_log2 (scale)),
26954 NULL, 1, OPTAB_DIRECT);
26955 return sc;
26956 }
26957
26958 /* Return mode for the memcpy/memset loop counter. Prefer SImode over
26959 DImode for constant loop counts. */
26960
26961 static machine_mode
26962 counter_mode (rtx count_exp)
26963 {
26964 if (GET_MODE (count_exp) != VOIDmode)
26965 return GET_MODE (count_exp);
26966 if (!CONST_INT_P (count_exp))
26967 return Pmode;
26968 if (TARGET_64BIT && (INTVAL (count_exp) & ~0xffffffff))
26969 return DImode;
26970 return SImode;
26971 }
26972
26973 /* Copy the address to a Pmode register. This is used for x32 to
26974 truncate a DImode TLS address to a SImode register. */
26975
26976 static rtx
26977 ix86_copy_addr_to_reg (rtx addr)
26978 {
26979 rtx reg;
26980 if (GET_MODE (addr) == Pmode || GET_MODE (addr) == VOIDmode)
26981 {
26982 reg = copy_addr_to_reg (addr);
26983 REG_POINTER (reg) = 1;
26984 return reg;
26985 }
26986 else
26987 {
26988 gcc_assert (GET_MODE (addr) == DImode && Pmode == SImode);
26989 reg = copy_to_mode_reg (DImode, addr);
26990 REG_POINTER (reg) = 1;
26991 return gen_rtx_SUBREG (SImode, reg, 0);
26992 }
26993 }
26994
26995 /* When ISSETMEM is FALSE, output a simple loop moving the memory pointed to
26996 by SRCPTR to DESTPTR in chunks of MODE, unrolled UNROLL times; the overall
26997 size is COUNT bytes. When ISSETMEM is TRUE, output the equivalent loop
26998 setting memory to VALUE (assumed to be in MODE).
26999
27000 The size is rounded down to a whole number of chunks moved at once.
27001 SRCMEM and DESTMEM provide the MEM rtx used for proper aliasing info. */
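/* As an informal sketch under simplifying assumptions (memcpy case,
   MODE == SImode, UNROLL == 1), the emitted sequence corresponds roughly to

       size = count & ~3;
       iter = 0;
     top:
       copy 4 bytes from src + iter to dest + iter;
       iter += 4;
       if (iter < size) goto top;
       dest += iter;  src += iter;
     out:

   the actual RTL uses offset_address/adjust_address on DESTMEM/SRCMEM rather
   than pointer arithmetic.  */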
27002
27003
27004 static void
27005 expand_set_or_movmem_via_loop (rtx destmem, rtx srcmem,
27006 rtx destptr, rtx srcptr, rtx value,
27007 rtx count, machine_mode mode, int unroll,
27008 int expected_size, bool issetmem)
27009 {
27010 rtx_code_label *out_label, *top_label;
27011 rtx iter, tmp;
27012 machine_mode iter_mode = counter_mode (count);
27013 int piece_size_n = GET_MODE_SIZE (mode) * unroll;
27014 rtx piece_size = GEN_INT (piece_size_n);
27015 rtx piece_size_mask = GEN_INT (~((GET_MODE_SIZE (mode) * unroll) - 1));
27016 rtx size;
27017 int i;
27018
27019 top_label = gen_label_rtx ();
27020 out_label = gen_label_rtx ();
27021 iter = gen_reg_rtx (iter_mode);
27022
27023 size = expand_simple_binop (iter_mode, AND, count, piece_size_mask,
27024 NULL, 1, OPTAB_DIRECT);
27025 /* Those two should combine. */
27026 if (piece_size == const1_rtx)
27027 {
27028 emit_cmp_and_jump_insns (size, const0_rtx, EQ, NULL_RTX, iter_mode,
27029 true, out_label);
27030 predict_jump (REG_BR_PROB_BASE * 10 / 100);
27031 }
27032 emit_move_insn (iter, const0_rtx);
27033
27034 emit_label (top_label);
27035
27036 tmp = convert_modes (Pmode, iter_mode, iter, true);
27037
27038 /* This assert could be relaxed - in that case we'd need to compute
27039 the smallest power of two containing PIECE_SIZE_N and pass it to
27040 offset_address. */
27041 gcc_assert ((piece_size_n & (piece_size_n - 1)) == 0);
27042 destmem = offset_address (destmem, tmp, piece_size_n);
27043 destmem = adjust_address (destmem, mode, 0);
27044
27045 if (!issetmem)
27046 {
27047 srcmem = offset_address (srcmem, copy_rtx (tmp), piece_size_n);
27048 srcmem = adjust_address (srcmem, mode, 0);
27049
27050 /* When unrolling for chips that reorder memory reads and writes,
27051 we can save registers by using a single temporary.
27052 Using 4 temporaries is also overkill in 32-bit mode. */
27053 if (!TARGET_64BIT && 0)
27054 {
27055 for (i = 0; i < unroll; i++)
27056 {
27057 if (i)
27058 {
27059 destmem =
27060 adjust_address (copy_rtx (destmem), mode, GET_MODE_SIZE (mode));
27061 srcmem =
27062 adjust_address (copy_rtx (srcmem), mode, GET_MODE_SIZE (mode));
27063 }
27064 emit_move_insn (destmem, srcmem);
27065 }
27066 }
27067 else
27068 {
27069 rtx tmpreg[4];
27070 gcc_assert (unroll <= 4);
27071 for (i = 0; i < unroll; i++)
27072 {
27073 tmpreg[i] = gen_reg_rtx (mode);
27074 if (i)
27075 {
27076 srcmem =
27077 adjust_address (copy_rtx (srcmem), mode, GET_MODE_SIZE (mode));
27078 }
27079 emit_move_insn (tmpreg[i], srcmem);
27080 }
27081 for (i = 0; i < unroll; i++)
27082 {
27083 if (i)
27084 {
27085 destmem =
27086 adjust_address (copy_rtx (destmem), mode, GET_MODE_SIZE (mode));
27087 }
27088 emit_move_insn (destmem, tmpreg[i]);
27089 }
27090 }
27091 }
27092 else
27093 for (i = 0; i < unroll; i++)
27094 {
27095 if (i)
27096 destmem =
27097 adjust_address (copy_rtx (destmem), mode, GET_MODE_SIZE (mode));
27098 emit_move_insn (destmem, value);
27099 }
27100
27101 tmp = expand_simple_binop (iter_mode, PLUS, iter, piece_size, iter,
27102 true, OPTAB_LIB_WIDEN);
27103 if (tmp != iter)
27104 emit_move_insn (iter, tmp);
27105
27106 emit_cmp_and_jump_insns (iter, size, LT, NULL_RTX, iter_mode,
27107 true, top_label);
27108 if (expected_size != -1)
27109 {
27110 expected_size /= GET_MODE_SIZE (mode) * unroll;
27111 if (expected_size == 0)
27112 predict_jump (0);
27113 else if (expected_size > REG_BR_PROB_BASE)
27114 predict_jump (REG_BR_PROB_BASE - 1);
27115 else
27116 predict_jump (REG_BR_PROB_BASE - (REG_BR_PROB_BASE + expected_size / 2) / expected_size);
27117 }
27118 else
27119 predict_jump (REG_BR_PROB_BASE * 80 / 100);
27120 iter = ix86_zero_extend_to_Pmode (iter);
27121 tmp = expand_simple_binop (Pmode, PLUS, destptr, iter, destptr,
27122 true, OPTAB_LIB_WIDEN);
27123 if (tmp != destptr)
27124 emit_move_insn (destptr, tmp);
27125 if (!issetmem)
27126 {
27127 tmp = expand_simple_binop (Pmode, PLUS, srcptr, iter, srcptr,
27128 true, OPTAB_LIB_WIDEN);
27129 if (tmp != srcptr)
27130 emit_move_insn (srcptr, tmp);
27131 }
27132 emit_label (out_label);
27133 }
27134
27135 /* Output "rep; mov" or "rep; stos" instruction depending on ISSETMEM argument.
27136 When ISSETMEM is true, arguments SRCMEM and SRCPTR are ignored.
27137 When ISSETMEM is false, arguments VALUE and ORIG_VALUE are ignored.
27138 For the setmem case, VALUE is ORIG_VALUE promoted to a wider size.
27139 ORIG_VALUE is the original value passed to memset to fill the memory with.
27140 Other arguments have the same meaning as for the previous function. */
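/* Informal example (not emitted verbatim): for a memcpy with MODE == SImode
   and a register count, the expansion corresponds to

       mov  ecx, count_scaled      ; count / 4, from scale_counter
       rep  movsd                  ; esi/edi advance by 4 * ecx

   and for a memset of zero, "rep stosd" with the promoted value in eax.
   DESTEXP/SRCEXP describe the final pointer values for the rep patterns.  */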
27141
27142 static void
27143 expand_set_or_movmem_via_rep (rtx destmem, rtx srcmem,
27144 rtx destptr, rtx srcptr, rtx value, rtx orig_value,
27145 rtx count,
27146 machine_mode mode, bool issetmem)
27147 {
27148 rtx destexp;
27149 rtx srcexp;
27150 rtx countreg;
27151 HOST_WIDE_INT rounded_count;
27152
27153 /* If possible, it is shorter to use rep movs.
27154 TODO: Maybe it is better to move this logic to decide_alg. */
27155 if (mode == QImode && CONST_INT_P (count) && !(INTVAL (count) & 3)
27156 && (!issetmem || orig_value == const0_rtx))
27157 mode = SImode;
27158
27159 if (destptr != XEXP (destmem, 0) || GET_MODE (destmem) != BLKmode)
27160 destmem = adjust_automodify_address_nv (destmem, BLKmode, destptr, 0);
27161
27162 countreg = ix86_zero_extend_to_Pmode (scale_counter (count,
27163 GET_MODE_SIZE (mode)));
27164 if (mode != QImode)
27165 {
27166 destexp = gen_rtx_ASHIFT (Pmode, countreg,
27167 GEN_INT (exact_log2 (GET_MODE_SIZE (mode))));
27168 destexp = gen_rtx_PLUS (Pmode, destexp, destptr);
27169 }
27170 else
27171 destexp = gen_rtx_PLUS (Pmode, destptr, countreg);
27172 if ((!issetmem || orig_value == const0_rtx) && CONST_INT_P (count))
27173 {
27174 rounded_count
27175 = ROUND_DOWN (INTVAL (count), (HOST_WIDE_INT) GET_MODE_SIZE (mode));
27176 destmem = shallow_copy_rtx (destmem);
27177 set_mem_size (destmem, rounded_count);
27178 }
27179 else if (MEM_SIZE_KNOWN_P (destmem))
27180 clear_mem_size (destmem);
27181
27182 if (issetmem)
27183 {
27184 value = force_reg (mode, gen_lowpart (mode, value));
27185 emit_insn (gen_rep_stos (destptr, countreg, destmem, value, destexp));
27186 }
27187 else
27188 {
27189 if (srcptr != XEXP (srcmem, 0) || GET_MODE (srcmem) != BLKmode)
27190 srcmem = adjust_automodify_address_nv (srcmem, BLKmode, srcptr, 0);
27191 if (mode != QImode)
27192 {
27193 srcexp = gen_rtx_ASHIFT (Pmode, countreg,
27194 GEN_INT (exact_log2 (GET_MODE_SIZE (mode))));
27195 srcexp = gen_rtx_PLUS (Pmode, srcexp, srcptr);
27196 }
27197 else
27198 srcexp = gen_rtx_PLUS (Pmode, srcptr, countreg);
27199 if (CONST_INT_P (count))
27200 {
27201 rounded_count
27202 = ROUND_DOWN (INTVAL (count), (HOST_WIDE_INT) GET_MODE_SIZE (mode));
27203 srcmem = shallow_copy_rtx (srcmem);
27204 set_mem_size (srcmem, rounded_count);
27205 }
27206 else
27207 {
27208 if (MEM_SIZE_KNOWN_P (srcmem))
27209 clear_mem_size (srcmem);
27210 }
27211 emit_insn (gen_rep_mov (destptr, destmem, srcptr, srcmem, countreg,
27212 destexp, srcexp));
27213 }
27214 }
27215
27216 /* This function emits moves to copy SIZE_TO_MOVE bytes from SRCMEM to
27217 DESTMEM.
27218 SRCMEM is passed by pointer so it can be updated on return.
27219 The return value is the updated DESTMEM. */
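/* Hypothetical example: with SIZE_TO_MOVE == 16 on a 64-bit SSE target this
   typically emits a single vector-mode load into a temporary register
   followed by a store; without vector support it falls back to word_mode and
   emits several smaller register round trips, bumping DESTPTR and SRCPTR by
   PIECE_SIZE after each pair.  */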
27220 static rtx
27221 emit_memmov (rtx destmem, rtx *srcmem, rtx destptr, rtx srcptr,
27222 HOST_WIDE_INT size_to_move)
27223 {
27224 rtx dst = destmem, src = *srcmem, adjust, tempreg;
27225 enum insn_code code;
27226 machine_mode move_mode;
27227 int piece_size, i;
27228
27229 /* Find the widest mode in which we could perform moves.
27230 Start with the biggest power of 2 not greater than SIZE_TO_MOVE and
27231 halve it until a move of that size is supported. */
27232 piece_size = 1 << floor_log2 (size_to_move);
27233 move_mode = mode_for_size (piece_size * BITS_PER_UNIT, MODE_INT, 0);
27234 code = optab_handler (mov_optab, move_mode);
27235 while (code == CODE_FOR_nothing && piece_size > 1)
27236 {
27237 piece_size >>= 1;
27238 move_mode = mode_for_size (piece_size * BITS_PER_UNIT, MODE_INT, 0);
27239 code = optab_handler (mov_optab, move_mode);
27240 }
27241
27242 /* Find the corresponding vector mode with the same size as MOVE_MODE.
27243 MOVE_MODE is an integer mode at the moment (SI, DI, TI, etc.). */
27244 if (GET_MODE_SIZE (move_mode) > GET_MODE_SIZE (word_mode))
27245 {
27246 int nunits = GET_MODE_SIZE (move_mode) / GET_MODE_SIZE (word_mode);
27247 move_mode = mode_for_vector (word_mode, nunits);
27248 code = optab_handler (mov_optab, move_mode);
27249 if (code == CODE_FOR_nothing)
27250 {
27251 move_mode = word_mode;
27252 piece_size = GET_MODE_SIZE (move_mode);
27253 code = optab_handler (mov_optab, move_mode);
27254 }
27255 }
27256 gcc_assert (code != CODE_FOR_nothing);
27257
27258 dst = adjust_automodify_address_nv (dst, move_mode, destptr, 0);
27259 src = adjust_automodify_address_nv (src, move_mode, srcptr, 0);
27260
27261 /* Emit moves. We'll need SIZE_TO_MOVE / PIECE_SIZE moves. */
27262 gcc_assert (size_to_move % piece_size == 0);
27263 adjust = GEN_INT (piece_size);
27264 for (i = 0; i < size_to_move; i += piece_size)
27265 {
27266 /* We move from memory to memory, so we'll need to do it via
27267 a temporary register. */
27268 tempreg = gen_reg_rtx (move_mode);
27269 emit_insn (GEN_FCN (code) (tempreg, src));
27270 emit_insn (GEN_FCN (code) (dst, tempreg));
27271
27272 emit_move_insn (destptr,
27273 gen_rtx_PLUS (Pmode, copy_rtx (destptr), adjust));
27274 emit_move_insn (srcptr,
27275 gen_rtx_PLUS (Pmode, copy_rtx (srcptr), adjust));
27276
27277 dst = adjust_automodify_address_nv (dst, move_mode, destptr,
27278 piece_size);
27279 src = adjust_automodify_address_nv (src, move_mode, srcptr,
27280 piece_size);
27281 }
27282
27283 /* Update DST and SRC rtx. */
27284 *srcmem = src;
27285 return dst;
27286 }
27287
27288 /* Output code to copy at most count & (max_size - 1) bytes from SRC to DEST. */
27289 static void
27290 expand_movmem_epilogue (rtx destmem, rtx srcmem,
27291 rtx destptr, rtx srcptr, rtx count, int max_size)
27292 {
27293 rtx src, dest;
27294 if (CONST_INT_P (count))
27295 {
27296 HOST_WIDE_INT countval = INTVAL (count);
27297 HOST_WIDE_INT epilogue_size = countval % max_size;
27298 int i;
27299
27300 /* For now MAX_SIZE should be a power of 2. This assert could be
27301 relaxed, but it'll require a bit more complicated epilogue
27302 expanding. */
27303 gcc_assert ((max_size & (max_size - 1)) == 0);
27304 for (i = max_size; i >= 1; i >>= 1)
27305 {
27306 if (epilogue_size & i)
27307 destmem = emit_memmov (destmem, &srcmem, destptr, srcptr, i);
27308 }
27309 return;
27310 }
27311 if (max_size > 8)
27312 {
27313 count = expand_simple_binop (GET_MODE (count), AND, count, GEN_INT (max_size - 1),
27314 count, 1, OPTAB_DIRECT);
27315 expand_set_or_movmem_via_loop (destmem, srcmem, destptr, srcptr, NULL,
27316 count, QImode, 1, 4, false);
27317 return;
27318 }
27319
27320 /* When single string operations are available, we can cheaply advance the
27321 dest and src pointers. Otherwise we save code size by maintaining an
27322 offset (zero is readily available from the preceding rep operation) and
27323 using x86 addressing modes. */
27324 if (TARGET_SINGLE_STRINGOP)
27325 {
27326 if (max_size > 4)
27327 {
27328 rtx_code_label *label = ix86_expand_aligntest (count, 4, true);
27329 src = change_address (srcmem, SImode, srcptr);
27330 dest = change_address (destmem, SImode, destptr);
27331 emit_insn (gen_strmov (destptr, dest, srcptr, src));
27332 emit_label (label);
27333 LABEL_NUSES (label) = 1;
27334 }
27335 if (max_size > 2)
27336 {
27337 rtx_code_label *label = ix86_expand_aligntest (count, 2, true);
27338 src = change_address (srcmem, HImode, srcptr);
27339 dest = change_address (destmem, HImode, destptr);
27340 emit_insn (gen_strmov (destptr, dest, srcptr, src));
27341 emit_label (label);
27342 LABEL_NUSES (label) = 1;
27343 }
27344 if (max_size > 1)
27345 {
27346 rtx_code_label *label = ix86_expand_aligntest (count, 1, true);
27347 src = change_address (srcmem, QImode, srcptr);
27348 dest = change_address (destmem, QImode, destptr);
27349 emit_insn (gen_strmov (destptr, dest, srcptr, src));
27350 emit_label (label);
27351 LABEL_NUSES (label) = 1;
27352 }
27353 }
27354 else
27355 {
27356 rtx offset = force_reg (Pmode, const0_rtx);
27357 rtx tmp;
27358
27359 if (max_size > 4)
27360 {
27361 rtx_code_label *label = ix86_expand_aligntest (count, 4, true);
27362 src = change_address (srcmem, SImode, srcptr);
27363 dest = change_address (destmem, SImode, destptr);
27364 emit_move_insn (dest, src);
27365 tmp = expand_simple_binop (Pmode, PLUS, offset, GEN_INT (4), NULL,
27366 true, OPTAB_LIB_WIDEN);
27367 if (tmp != offset)
27368 emit_move_insn (offset, tmp);
27369 emit_label (label);
27370 LABEL_NUSES (label) = 1;
27371 }
27372 if (max_size > 2)
27373 {
27374 rtx_code_label *label = ix86_expand_aligntest (count, 2, true);
27375 tmp = gen_rtx_PLUS (Pmode, srcptr, offset);
27376 src = change_address (srcmem, HImode, tmp);
27377 tmp = gen_rtx_PLUS (Pmode, destptr, offset);
27378 dest = change_address (destmem, HImode, tmp);
27379 emit_move_insn (dest, src);
27380 tmp = expand_simple_binop (Pmode, PLUS, offset, GEN_INT (2), tmp,
27381 true, OPTAB_LIB_WIDEN);
27382 if (tmp != offset)
27383 emit_move_insn (offset, tmp);
27384 emit_label (label);
27385 LABEL_NUSES (label) = 1;
27386 }
27387 if (max_size > 1)
27388 {
27389 rtx_code_label *label = ix86_expand_aligntest (count, 1, true);
27390 tmp = gen_rtx_PLUS (Pmode, srcptr, offset);
27391 src = change_address (srcmem, QImode, tmp);
27392 tmp = gen_rtx_PLUS (Pmode, destptr, offset);
27393 dest = change_address (destmem, QImode, tmp);
27394 emit_move_insn (dest, src);
27395 emit_label (label);
27396 LABEL_NUSES (label) = 1;
27397 }
27398 }
27399 }
27400
27401 /* This function emits moves to fill SIZE_TO_MOVE bytes starting at DESTMEM
27402 with the value PROMOTED_VAL. Unlike emit_memmov, there is no source
27403 pointer to update.
27404 The return value is the updated DESTMEM. */
27405 static rtx
27406 emit_memset (rtx destmem, rtx destptr, rtx promoted_val,
27407 HOST_WIDE_INT size_to_move)
27408 {
27409 rtx dst = destmem, adjust;
27410 enum insn_code code;
27411 machine_mode move_mode;
27412 int piece_size, i;
27413
27414 /* Pick the mode of PROMOTED_VAL for the stores, narrowing it when
27415 SIZE_TO_MOVE is smaller than the width of PROMOTED_VAL so that a
27416 store of that size is supported. */
27417 move_mode = GET_MODE (promoted_val);
27418 if (move_mode == VOIDmode)
27419 move_mode = QImode;
27420 if (size_to_move < GET_MODE_SIZE (move_mode))
27421 {
27422 move_mode = mode_for_size (size_to_move * BITS_PER_UNIT, MODE_INT, 0);
27423 promoted_val = gen_lowpart (move_mode, promoted_val);
27424 }
27425 piece_size = GET_MODE_SIZE (move_mode);
27426 code = optab_handler (mov_optab, move_mode);
27427 gcc_assert (code != CODE_FOR_nothing && promoted_val != NULL_RTX);
27428
27429 dst = adjust_automodify_address_nv (dst, move_mode, destptr, 0);
27430
27431 /* Emit moves. We'll need SIZE_TO_MOVE / PIECE_SIZE moves. */
27432 gcc_assert (size_to_move % piece_size == 0);
27433 adjust = GEN_INT (piece_size);
27434 for (i = 0; i < size_to_move; i += piece_size)
27435 {
27436 if (piece_size <= GET_MODE_SIZE (word_mode))
27437 {
27438 emit_insn (gen_strset (destptr, dst, promoted_val));
27439 dst = adjust_automodify_address_nv (dst, move_mode, destptr,
27440 piece_size);
27441 continue;
27442 }
27443
27444 emit_insn (GEN_FCN (code) (dst, promoted_val));
27445
27446 emit_move_insn (destptr,
27447 gen_rtx_PLUS (Pmode, copy_rtx (destptr), adjust));
27448
27449 dst = adjust_automodify_address_nv (dst, move_mode, destptr,
27450 piece_size);
27451 }
27452
27453 /* Update DST rtx. */
27454 return dst;
27455 }
27456 /* Output code to set at most count & (max_size - 1) bytes starting at DEST. */
27457 static void
27458 expand_setmem_epilogue_via_loop (rtx destmem, rtx destptr, rtx value,
27459 rtx count, int max_size)
27460 {
27461 count =
27462 expand_simple_binop (counter_mode (count), AND, count,
27463 GEN_INT (max_size - 1), count, 1, OPTAB_DIRECT);
27464 expand_set_or_movmem_via_loop (destmem, NULL, destptr, NULL,
27465 gen_lowpart (QImode, value), count, QImode,
27466 1, max_size / 2, true);
27467 }
27468
27469 /* Output code to set at most count & (max_size - 1) bytes starting at DEST. */
27470 static void
27471 expand_setmem_epilogue (rtx destmem, rtx destptr, rtx value, rtx vec_value,
27472 rtx count, int max_size)
27473 {
27474 rtx dest;
27475
27476 if (CONST_INT_P (count))
27477 {
27478 HOST_WIDE_INT countval = INTVAL (count);
27479 HOST_WIDE_INT epilogue_size = countval % max_size;
27480 int i;
27481
27482 /* For now MAX_SIZE should be a power of 2. This assert could be
27483 relaxed, but it'll require a bit more complicated epilogue
27484 expanding. */
27485 gcc_assert ((max_size & (max_size - 1)) == 0);
27486 for (i = max_size; i >= 1; i >>= 1)
27487 {
27488 if (epilogue_size & i)
27489 {
27490 if (vec_value && i > GET_MODE_SIZE (GET_MODE (value)))
27491 destmem = emit_memset (destmem, destptr, vec_value, i);
27492 else
27493 destmem = emit_memset (destmem, destptr, value, i);
27494 }
27495 }
27496 return;
27497 }
27498 if (max_size > 32)
27499 {
27500 expand_setmem_epilogue_via_loop (destmem, destptr, value, count, max_size);
27501 return;
27502 }
27503 if (max_size > 16)
27504 {
27505 rtx_code_label *label = ix86_expand_aligntest (count, 16, true);
27506 if (TARGET_64BIT)
27507 {
27508 dest = change_address (destmem, DImode, destptr);
27509 emit_insn (gen_strset (destptr, dest, value));
27510 dest = adjust_automodify_address_nv (dest, DImode, destptr, 8);
27511 emit_insn (gen_strset (destptr, dest, value));
27512 }
27513 else
27514 {
27515 dest = change_address (destmem, SImode, destptr);
27516 emit_insn (gen_strset (destptr, dest, value));
27517 dest = adjust_automodify_address_nv (dest, SImode, destptr, 4);
27518 emit_insn (gen_strset (destptr, dest, value));
27519 dest = adjust_automodify_address_nv (dest, SImode, destptr, 8);
27520 emit_insn (gen_strset (destptr, dest, value));
27521 dest = adjust_automodify_address_nv (dest, SImode, destptr, 12);
27522 emit_insn (gen_strset (destptr, dest, value));
27523 }
27524 emit_label (label);
27525 LABEL_NUSES (label) = 1;
27526 }
27527 if (max_size > 8)
27528 {
27529 rtx_code_label *label = ix86_expand_aligntest (count, 8, true);
27530 if (TARGET_64BIT)
27531 {
27532 dest = change_address (destmem, DImode, destptr);
27533 emit_insn (gen_strset (destptr, dest, value));
27534 }
27535 else
27536 {
27537 dest = change_address (destmem, SImode, destptr);
27538 emit_insn (gen_strset (destptr, dest, value));
27539 dest = adjust_automodify_address_nv (dest, SImode, destptr, 4);
27540 emit_insn (gen_strset (destptr, dest, value));
27541 }
27542 emit_label (label);
27543 LABEL_NUSES (label) = 1;
27544 }
27545 if (max_size > 4)
27546 {
27547 rtx_code_label *label = ix86_expand_aligntest (count, 4, true);
27548 dest = change_address (destmem, SImode, destptr);
27549 emit_insn (gen_strset (destptr, dest, gen_lowpart (SImode, value)));
27550 emit_label (label);
27551 LABEL_NUSES (label) = 1;
27552 }
27553 if (max_size > 2)
27554 {
27555 rtx_code_label *label = ix86_expand_aligntest (count, 2, true);
27556 dest = change_address (destmem, HImode, destptr);
27557 emit_insn (gen_strset (destptr, dest, gen_lowpart (HImode, value)));
27558 emit_label (label);
27559 LABEL_NUSES (label) = 1;
27560 }
27561 if (max_size > 1)
27562 {
27563 rtx_code_label *label = ix86_expand_aligntest (count, 1, true);
27564 dest = change_address (destmem, QImode, destptr);
27565 emit_insn (gen_strset (destptr, dest, gen_lowpart (QImode, value)));
27566 emit_label (label);
27567 LABEL_NUSES (label) = 1;
27568 }
27569 }
27570
27571 /* Depending on ISSETMEM, copy enough bytes from SRCMEM to DESTMEM, or store
27572 enough into DESTMEM, to align it to DESIRED_ALIGNMENT. The original
27573 alignment is ALIGN. Depending on ISSETMEM, either the arguments
27574 SRCMEM/SRCPTR or VALUE/VEC_VALUE are ignored.
27575 The return value is the updated DESTMEM. */
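/* Illustrative sketch, assuming a memcpy with ALIGN == 1 and
   DESIRED_ALIGNMENT == 8: the emitted prologue corresponds roughly to

       if (destptr & 1) { copy 1 byte;  count -= 1; }
       if (destptr & 2) { copy 2 bytes; count -= 2; }
       if (destptr & 4) { copy 4 bytes; count -= 4; }

   where each test comes from ix86_expand_aligntest and the recorded
   alignment of DESTMEM is increased after every step.  */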
27576 static rtx
27577 expand_set_or_movmem_prologue (rtx destmem, rtx srcmem,
27578 rtx destptr, rtx srcptr, rtx value,
27579 rtx vec_value, rtx count, int align,
27580 int desired_alignment, bool issetmem)
27581 {
27582 int i;
27583 for (i = 1; i < desired_alignment; i <<= 1)
27584 {
27585 if (align <= i)
27586 {
27587 rtx_code_label *label = ix86_expand_aligntest (destptr, i, false);
27588 if (issetmem)
27589 {
27590 if (vec_value && i > GET_MODE_SIZE (GET_MODE (value)))
27591 destmem = emit_memset (destmem, destptr, vec_value, i);
27592 else
27593 destmem = emit_memset (destmem, destptr, value, i);
27594 }
27595 else
27596 destmem = emit_memmov (destmem, &srcmem, destptr, srcptr, i);
27597 ix86_adjust_counter (count, i);
27598 emit_label (label);
27599 LABEL_NUSES (label) = 1;
27600 set_mem_align (destmem, i * 2 * BITS_PER_UNIT);
27601 }
27602 }
27603 return destmem;
27604 }
27605
27606 /* Test if COUNT & SIZE is nonzero and if so, expand a movmem
27607 or setmem sequence that is valid for SIZE..2*SIZE-1 bytes
27608 and jump to DONE_LABEL. */
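/* Hypothetical illustration for SIZE == 4 in the memcpy case: when
   COUNT & 4 is nonzero and the larger sizes have already been handled,
   COUNT is effectively in 4..7, so we copy bytes 0..3 and bytes
   COUNT-4..COUNT-1 (the two chunks may overlap) and jump to DONE_LABEL;
   otherwise control falls through at the label returned by
   ix86_expand_aligntest.  */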
27609 static void
27610 expand_small_movmem_or_setmem (rtx destmem, rtx srcmem,
27611 rtx destptr, rtx srcptr,
27612 rtx value, rtx vec_value,
27613 rtx count, int size,
27614 rtx done_label, bool issetmem)
27615 {
27616 rtx_code_label *label = ix86_expand_aligntest (count, size, false);
27617 machine_mode mode = mode_for_size (size * BITS_PER_UNIT, MODE_INT, 1);
27618 rtx modesize;
27619 int n;
27620
27621 /* If we do not have a vector value to copy, we must reduce the size. */
27622 if (issetmem)
27623 {
27624 if (!vec_value)
27625 {
27626 if (GET_MODE (value) == VOIDmode && size > 8)
27627 mode = Pmode;
27628 else if (GET_MODE_SIZE (mode) > GET_MODE_SIZE (GET_MODE (value)))
27629 mode = GET_MODE (value);
27630 }
27631 else
27632 mode = GET_MODE (vec_value), value = vec_value;
27633 }
27634 else
27635 {
27636 /* Choose appropriate vector mode. */
27637 if (size >= 32)
27638 mode = TARGET_AVX ? V32QImode : TARGET_SSE ? V16QImode : DImode;
27639 else if (size >= 16)
27640 mode = TARGET_SSE ? V16QImode : DImode;
27641 srcmem = change_address (srcmem, mode, srcptr);
27642 }
27643 destmem = change_address (destmem, mode, destptr);
27644 modesize = GEN_INT (GET_MODE_SIZE (mode));
27645 gcc_assert (GET_MODE_SIZE (mode) <= size);
27646 for (n = 0; n * GET_MODE_SIZE (mode) < size; n++)
27647 {
27648 if (issetmem)
27649 emit_move_insn (destmem, gen_lowpart (mode, value));
27650 else
27651 {
27652 emit_move_insn (destmem, srcmem);
27653 srcmem = offset_address (srcmem, modesize, GET_MODE_SIZE (mode));
27654 }
27655 destmem = offset_address (destmem, modesize, GET_MODE_SIZE (mode));
27656 }
27657
27658 destmem = offset_address (destmem, count, 1);
27659 destmem = offset_address (destmem, GEN_INT (-2 * size),
27660 GET_MODE_SIZE (mode));
27661 if (!issetmem)
27662 {
27663 srcmem = offset_address (srcmem, count, 1);
27664 srcmem = offset_address (srcmem, GEN_INT (-2 * size),
27665 GET_MODE_SIZE (mode));
27666 }
27667 for (n = 0; n * GET_MODE_SIZE (mode) < size; n++)
27668 {
27669 if (issetmem)
27670 emit_move_insn (destmem, gen_lowpart (mode, value));
27671 else
27672 {
27673 emit_move_insn (destmem, srcmem);
27674 srcmem = offset_address (srcmem, modesize, GET_MODE_SIZE (mode));
27675 }
27676 destmem = offset_address (destmem, modesize, GET_MODE_SIZE (mode));
27677 }
27678 emit_jump_insn (gen_jump (done_label));
27679 emit_barrier ();
27680
27681 emit_label (label);
27682 LABEL_NUSES (label) = 1;
27683 }
27684
27685 /* Handle a small memcpy (up to SIZE bytes, where SIZE is supposed to be a small
27686 power of 2) and get ready for the main memcpy loop by copying the initial
27687 DESIRED_ALIGN-ALIGN bytes and the last SIZE bytes, adjusting DESTPTR/SRCPTR/COUNT
27688 so we can proceed with a loop copying SIZE bytes at once. Do moves in MODE.
27689 DONE_LABEL is a label after the whole copying sequence. The label is created
27690 on demand if *DONE_LABEL is NULL.
27691 MIN_SIZE is the minimal size of the block copied. This value gets adjusted for new
27692 bounds after the initial copies.
27693
27694 DESTMEM/SRCMEM are memory expressions pointing to the copied block,
27695 DESTPTR/SRCPTR are pointers to the block. DYNAMIC_CHECK indicates whether
27696 we will dispatch to a library call for large blocks.
27697
27698 In pseudocode we do:
27699
27700 if (COUNT < SIZE)
27701 {
27702 Assume that SIZE is 4. Bigger sizes are handled analogously
27703 if (COUNT & 4)
27704 {
27705 copy 4 bytes from SRCPTR to DESTPTR
27706 copy 4 bytes from SRCPTR + COUNT - 4 to DESTPTR + COUNT - 4
27707 goto done_label
27708 }
27709 if (!COUNT)
27710 goto done_label;
27711 copy 1 byte from SRCPTR to DESTPTR
27712 if (COUNT & 2)
27713 {
27714 copy 2 bytes from SRCPTR to DESTPTR
27715 copy 2 bytes from SRCPTR + COUNT - 2 to DESTPTR + COUNT - 2
27716 }
27717 }
27718 else
27719 {
27720 copy at least DESIRED_ALIGN-ALIGN bytes from SRCPTR to DESTPTR
27721 copy SIZE bytes from SRCPTR + COUNT - SIZE to DESTPTR + COUNT -SIZE
27722
27723 OLD_DESTPTR = DESTPTR;
27724 Align DESTPTR up to DESIRED_ALIGN
27725 SRCPTR += DESTPTR - OLD_DESTPTR
27726 COUNT -= DESTPTR - OLD_DESTPTR
27727 if (DYNAMIC_CHECK)
27728 Round COUNT down to multiple of SIZE
27729 << optional caller supplied zero size guard is here >>
27730 << optional caller supplied dynamic check is here >>
27731 << caller supplied main copy loop is here >>
27732 }
27733 done_label:
27734 */
27735 static void
27736 expand_set_or_movmem_prologue_epilogue_by_misaligned_moves (rtx destmem, rtx srcmem,
27737 rtx *destptr, rtx *srcptr,
27738 machine_mode mode,
27739 rtx value, rtx vec_value,
27740 rtx *count,
27741 rtx_code_label **done_label,
27742 int size,
27743 int desired_align,
27744 int align,
27745 unsigned HOST_WIDE_INT *min_size,
27746 bool dynamic_check,
27747 bool issetmem)
27748 {
27749 rtx_code_label *loop_label = NULL, *label;
27750 int n;
27751 rtx modesize;
27752 int prolog_size = 0;
27753 rtx mode_value;
27754
27755 /* Choose the proper value to copy. */
27756 if (issetmem && VECTOR_MODE_P (mode))
27757 mode_value = vec_value;
27758 else
27759 mode_value = value;
27760 gcc_assert (GET_MODE_SIZE (mode) <= size);
27761
27762 /* See if block is big or small, handle small blocks. */
27763 if (!CONST_INT_P (*count) && *min_size < (unsigned HOST_WIDE_INT)size)
27764 {
27765 int size2 = size;
27766 loop_label = gen_label_rtx ();
27767
27768 if (!*done_label)
27769 *done_label = gen_label_rtx ();
27770
27771 emit_cmp_and_jump_insns (*count, GEN_INT (size2), GE, 0, GET_MODE (*count),
27772 1, loop_label);
27773 size2 >>= 1;
27774
27775 /* Handle sizes > 3. */
27776 for (;size2 > 2; size2 >>= 1)
27777 expand_small_movmem_or_setmem (destmem, srcmem,
27778 *destptr, *srcptr,
27779 value, vec_value,
27780 *count,
27781 size2, *done_label, issetmem);
27782 /* Nothing to copy? Jump to DONE_LABEL if so. */
27783 emit_cmp_and_jump_insns (*count, const0_rtx, EQ, 0, GET_MODE (*count),
27784 1, *done_label);
27785
27786 /* Do a byte copy. */
27787 destmem = change_address (destmem, QImode, *destptr);
27788 if (issetmem)
27789 emit_move_insn (destmem, gen_lowpart (QImode, value));
27790 else
27791 {
27792 srcmem = change_address (srcmem, QImode, *srcptr);
27793 emit_move_insn (destmem, srcmem);
27794 }
27795
27796 /* Handle sizes 2 and 3. */
27797 label = ix86_expand_aligntest (*count, 2, false);
27798 destmem = change_address (destmem, HImode, *destptr);
27799 destmem = offset_address (destmem, *count, 1);
27800 destmem = offset_address (destmem, GEN_INT (-2), 2);
27801 if (issetmem)
27802 emit_move_insn (destmem, gen_lowpart (HImode, value));
27803 else
27804 {
27805 srcmem = change_address (srcmem, HImode, *srcptr);
27806 srcmem = offset_address (srcmem, *count, 1);
27807 srcmem = offset_address (srcmem, GEN_INT (-2), 2);
27808 emit_move_insn (destmem, srcmem);
27809 }
27810
27811 emit_label (label);
27812 LABEL_NUSES (label) = 1;
27813 emit_jump_insn (gen_jump (*done_label));
27814 emit_barrier ();
27815 }
27816 else
27817 gcc_assert (*min_size >= (unsigned HOST_WIDE_INT)size
27818 || UINTVAL (*count) >= (unsigned HOST_WIDE_INT)size);
27819
27820 /* Start memcpy for COUNT >= SIZE. */
27821 if (loop_label)
27822 {
27823 emit_label (loop_label);
27824 LABEL_NUSES (loop_label) = 1;
27825 }
27826
27827 /* Copy first desired_align bytes. */
27828 if (!issetmem)
27829 srcmem = change_address (srcmem, mode, *srcptr);
27830 destmem = change_address (destmem, mode, *destptr);
27831 modesize = GEN_INT (GET_MODE_SIZE (mode));
27832 for (n = 0; prolog_size < desired_align - align; n++)
27833 {
27834 if (issetmem)
27835 emit_move_insn (destmem, mode_value);
27836 else
27837 {
27838 emit_move_insn (destmem, srcmem);
27839 srcmem = offset_address (srcmem, modesize, GET_MODE_SIZE (mode));
27840 }
27841 destmem = offset_address (destmem, modesize, GET_MODE_SIZE (mode));
27842 prolog_size += GET_MODE_SIZE (mode);
27843 }
27844
27845
27846 /* Copy last SIZE bytes. */
27847 destmem = offset_address (destmem, *count, 1);
27848 destmem = offset_address (destmem,
27849 GEN_INT (-size - prolog_size),
27850 1);
27851 if (issetmem)
27852 emit_move_insn (destmem, mode_value);
27853 else
27854 {
27855 srcmem = offset_address (srcmem, *count, 1);
27856 srcmem = offset_address (srcmem,
27857 GEN_INT (-size - prolog_size),
27858 1);
27859 emit_move_insn (destmem, srcmem);
27860 }
27861 for (n = 1; n * GET_MODE_SIZE (mode) < size; n++)
27862 {
27863 destmem = offset_address (destmem, modesize, 1);
27864 if (issetmem)
27865 emit_move_insn (destmem, mode_value);
27866 else
27867 {
27868 srcmem = offset_address (srcmem, modesize, 1);
27869 emit_move_insn (destmem, srcmem);
27870 }
27871 }
27872
27873 /* Align destination. */
27874 if (desired_align > 1 && desired_align > align)
27875 {
27876 rtx saveddest = *destptr;
27877
27878 gcc_assert (desired_align <= size);
27879 /* Align destptr up, placing the result in a new register. */
27880 *destptr = expand_simple_binop (GET_MODE (*destptr), PLUS, *destptr,
27881 GEN_INT (prolog_size),
27882 NULL_RTX, 1, OPTAB_DIRECT);
27883 if (REG_P (*destptr) && REG_P (saveddest) && REG_POINTER (saveddest))
27884 REG_POINTER (*destptr) = 1;
27885 *destptr = expand_simple_binop (GET_MODE (*destptr), AND, *destptr,
27886 GEN_INT (-desired_align),
27887 *destptr, 1, OPTAB_DIRECT);
27888 /* See how many bytes we skipped. */
27889 saveddest = expand_simple_binop (GET_MODE (*destptr), MINUS, saveddest,
27890 *destptr,
27891 saveddest, 1, OPTAB_DIRECT);
27892 /* Adjust srcptr and count. */
27893 if (!issetmem)
27894 *srcptr = expand_simple_binop (GET_MODE (*srcptr), MINUS, *srcptr,
27895 saveddest, *srcptr, 1, OPTAB_DIRECT);
27896 *count = expand_simple_binop (GET_MODE (*count), PLUS, *count,
27897 saveddest, *count, 1, OPTAB_DIRECT);
27898 /* We copied at most size + prolog_size. */
27899 if (*min_size > (unsigned HOST_WIDE_INT)(size + prolog_size))
27900 *min_size
27901 = ROUND_DOWN (*min_size - size, (unsigned HOST_WIDE_INT)size);
27902 else
27903 *min_size = 0;
27904
27905 /* Our loops always round down the block size, but for dispatch to the
27906 library we need the precise value. */
27907 if (dynamic_check)
27908 *count = expand_simple_binop (GET_MODE (*count), AND, *count,
27909 GEN_INT (-size), *count, 1, OPTAB_DIRECT);
27910 }
27911 else
27912 {
27913 gcc_assert (prolog_size == 0);
27914 /* Decrease count, so we won't end up copying last word twice. */
27915 if (!CONST_INT_P (*count))
27916 *count = expand_simple_binop (GET_MODE (*count), PLUS, *count,
27917 constm1_rtx, *count, 1, OPTAB_DIRECT);
27918 else
27919 *count = GEN_INT (ROUND_DOWN (UINTVAL (*count) - 1,
27920 (unsigned HOST_WIDE_INT)size));
27921 if (*min_size)
27922 *min_size = ROUND_DOWN (*min_size - 1, (unsigned HOST_WIDE_INT)size);
27923 }
27924 }
27925
27926
27927 /* This function is like the previous one, except here we know how many bytes
27928 need to be copied. That allows us to update alignment not only of DST, which
27929 is returned, but also of SRC, which is passed as a pointer for that
27930 reason. */
27931 static rtx
27932 expand_set_or_movmem_constant_prologue (rtx dst, rtx *srcp, rtx destreg,
27933 rtx srcreg, rtx value, rtx vec_value,
27934 int desired_align, int align_bytes,
27935 bool issetmem)
27936 {
27937 rtx src = NULL;
27938 rtx orig_dst = dst;
27939 rtx orig_src = NULL;
27940 int piece_size = 1;
27941 int copied_bytes = 0;
27942
27943 if (!issetmem)
27944 {
27945 gcc_assert (srcp != NULL);
27946 src = *srcp;
27947 orig_src = src;
27948 }
27949
27950 for (piece_size = 1;
27951 piece_size <= desired_align && copied_bytes < align_bytes;
27952 piece_size <<= 1)
27953 {
27954 if (align_bytes & piece_size)
27955 {
27956 if (issetmem)
27957 {
27958 if (vec_value && piece_size > GET_MODE_SIZE (GET_MODE (value)))
27959 dst = emit_memset (dst, destreg, vec_value, piece_size);
27960 else
27961 dst = emit_memset (dst, destreg, value, piece_size);
27962 }
27963 else
27964 dst = emit_memmov (dst, &src, destreg, srcreg, piece_size);
27965 copied_bytes += piece_size;
27966 }
27967 }
27968 if (MEM_ALIGN (dst) < (unsigned int) desired_align * BITS_PER_UNIT)
27969 set_mem_align (dst, desired_align * BITS_PER_UNIT);
27970 if (MEM_SIZE_KNOWN_P (orig_dst))
27971 set_mem_size (dst, MEM_SIZE (orig_dst) - align_bytes);
27972
27973 if (!issetmem)
27974 {
27975 int src_align_bytes = get_mem_align_offset (src, desired_align
27976 * BITS_PER_UNIT);
27977 if (src_align_bytes >= 0)
27978 src_align_bytes = desired_align - src_align_bytes;
27979 if (src_align_bytes >= 0)
27980 {
27981 unsigned int src_align;
27982 for (src_align = desired_align; src_align >= 2; src_align >>= 1)
27983 {
27984 if ((src_align_bytes & (src_align - 1))
27985 == (align_bytes & (src_align - 1)))
27986 break;
27987 }
27988 if (src_align > (unsigned int) desired_align)
27989 src_align = desired_align;
27990 if (MEM_ALIGN (src) < src_align * BITS_PER_UNIT)
27991 set_mem_align (src, src_align * BITS_PER_UNIT);
27992 }
27993 if (MEM_SIZE_KNOWN_P (orig_src))
27994 set_mem_size (src, MEM_SIZE (orig_src) - align_bytes);
27995 *srcp = src;
27996 }
27997
27998 return dst;
27999 }
28000
28001 /* Return true if ALG can be used in current context.
28002 Assume we expand memset if MEMSET is true. */
28003 static bool
28004 alg_usable_p (enum stringop_alg alg, bool memset, bool have_as)
28005 {
28006 if (alg == no_stringop)
28007 return false;
28008 if (alg == vector_loop)
28009 return TARGET_SSE || TARGET_AVX;
28010 /* Algorithms using the rep prefix want at least edi and ecx;
28011 additionally, memset wants eax and memcpy wants esi. Don't
28012 consider such algorithms if the user has appropriated those
28013 registers for their own purposes, or if we have a non-default
28014 address space, since some string insns cannot override the segment. */
28015 if (alg == rep_prefix_1_byte
28016 || alg == rep_prefix_4_byte
28017 || alg == rep_prefix_8_byte)
28018 {
28019 if (have_as)
28020 return false;
28021 if (fixed_regs[CX_REG]
28022 || fixed_regs[DI_REG]
28023 || (memset ? fixed_regs[AX_REG] : fixed_regs[SI_REG]))
28024 return false;
28025 }
28026 return true;
28027 }
28028
28029 /* Given COUNT and EXPECTED_SIZE, decide on codegen of string operation. */
28030 static enum stringop_alg
28031 decide_alg (HOST_WIDE_INT count, HOST_WIDE_INT expected_size,
28032 unsigned HOST_WIDE_INT min_size, unsigned HOST_WIDE_INT max_size,
28033 bool memset, bool zero_memset, bool have_as,
28034 int *dynamic_check, bool *noalign, bool recur)
28035 {
28036 const struct stringop_algs *algs;
28037 bool optimize_for_speed;
28038 int max = 0;
28039 const struct processor_costs *cost;
28040 int i;
28041 bool any_alg_usable_p = false;
28042
28043 *noalign = false;
28044 *dynamic_check = -1;
28045
28046 /* Even if the string operation call is cold, we still might spend a lot
28047 of time processing large blocks. */
28048 if (optimize_function_for_size_p (cfun)
28049 || (optimize_insn_for_size_p ()
28050 && (max_size < 256
28051 || (expected_size != -1 && expected_size < 256))))
28052 optimize_for_speed = false;
28053 else
28054 optimize_for_speed = true;
28055
28056 cost = optimize_for_speed ? ix86_cost : &ix86_size_cost;
28057 if (memset)
28058 algs = &cost->memset[TARGET_64BIT != 0];
28059 else
28060 algs = &cost->memcpy[TARGET_64BIT != 0];
28061
28062 /* Find the maximal size for which a user-defined algorithm is specified. */
28063 for (i = 0; i < MAX_STRINGOP_ALGS; i++)
28064 {
28065 enum stringop_alg candidate = algs->size[i].alg;
28066 bool usable = alg_usable_p (candidate, memset, have_as);
28067 any_alg_usable_p |= usable;
28068
28069 if (candidate != libcall && candidate && usable)
28070 max = algs->size[i].max;
28071 }
28072
28073 /* If the expected size is not known but the max size is small enough
28074 that the inline version is a win, set the expected size to the
28075 middle of the range. */
28076 if (((max > 1 && (unsigned HOST_WIDE_INT) max >= max_size) || max == -1)
28077 && expected_size == -1)
28078 expected_size = min_size / 2 + max_size / 2;
28079
28080 /* If user specified the algorithm, honor it if possible. */
28081 if (ix86_stringop_alg != no_stringop
28082 && alg_usable_p (ix86_stringop_alg, memset, have_as))
28083 return ix86_stringop_alg;
28084 /* rep; movq or rep; movl is the smallest variant. */
28085 else if (!optimize_for_speed)
28086 {
28087 *noalign = true;
28088 if (!count || (count & 3) || (memset && !zero_memset))
28089 return alg_usable_p (rep_prefix_1_byte, memset, have_as)
28090 ? rep_prefix_1_byte : loop_1_byte;
28091 else
28092 return alg_usable_p (rep_prefix_4_byte, memset, have_as)
28093 ? rep_prefix_4_byte : loop;
28094 }
28095 /* Very tiny blocks are best handled via the loop; REP is expensive to
28096 set up. */
28097 else if (expected_size != -1 && expected_size < 4)
28098 return loop_1_byte;
28099 else if (expected_size != -1)
28100 {
28101 enum stringop_alg alg = libcall;
28102 bool alg_noalign = false;
28103 for (i = 0; i < MAX_STRINGOP_ALGS; i++)
28104 {
28105 /* We get here if the algorithms that were not libcall-based
28106 were rep-prefix based and we are unable to use rep prefixes
28107 based on global register usage. Break out of the loop and
28108 use the heuristic below. */
28109 if (algs->size[i].max == 0)
28110 break;
28111 if (algs->size[i].max >= expected_size || algs->size[i].max == -1)
28112 {
28113 enum stringop_alg candidate = algs->size[i].alg;
28114
28115 if (candidate != libcall
28116 && alg_usable_p (candidate, memset, have_as))
28117 {
28118 alg = candidate;
28119 alg_noalign = algs->size[i].noalign;
28120 }
28121 /* Honor TARGET_INLINE_ALL_STRINGOPS by picking
28122 last non-libcall inline algorithm. */
28123 if (TARGET_INLINE_ALL_STRINGOPS)
28124 {
28125 /* When the current size is best to be copied by a libcall,
28126 but we are still forced to inline, run the heuristic below
28127 that will pick code for medium sized blocks. */
28128 if (alg != libcall)
28129 {
28130 *noalign = alg_noalign;
28131 return alg;
28132 }
28133 else if (!any_alg_usable_p)
28134 break;
28135 }
28136 else if (alg_usable_p (candidate, memset, have_as))
28137 {
28138 *noalign = algs->size[i].noalign;
28139 return candidate;
28140 }
28141 }
28142 }
28143 }
28144 /* When asked to inline the call anyway, try to pick a meaningful choice.
28145 We look for the maximal size of block that is faster to copy by hand and
28146 take blocks of at most that size, guessing that the average size will
28147 be roughly half of the block.
28148
28149 If this turns out to be bad, we might simply specify the preferred
28150 choice in ix86_costs. */
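/* Illustrative example (the numbers are hypothetical, not taken from any
   cost table): if the largest non-libcall entry covers blocks of up to
   max == 4096 bytes, we re-run decide_alg with
   new_expected_size == 4096 / 2 == 2048, i.e. we assume the average block
   is about half of the largest size we are willing to copy inline.  */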
28151 if ((TARGET_INLINE_ALL_STRINGOPS || TARGET_INLINE_STRINGOPS_DYNAMICALLY)
28152 && (algs->unknown_size == libcall
28153 || !alg_usable_p (algs->unknown_size, memset, have_as)))
28154 {
28155 enum stringop_alg alg;
28156 HOST_WIDE_INT new_expected_size = (max > 0 ? max : 4096) / 2;
28157
28158 /* If there aren't any usable algorithms or if recursing already,
28159 then recursing on smaller sizes or same size isn't going to
28160 find anything. Just return the simple byte-at-a-time copy loop. */
28161 if (!any_alg_usable_p || recur)
28162 {
28163 /* Pick something reasonable. */
28164 if (TARGET_INLINE_STRINGOPS_DYNAMICALLY && !recur)
28165 *dynamic_check = 128;
28166 return loop_1_byte;
28167 }
28168 alg = decide_alg (count, new_expected_size, min_size, max_size, memset,
28169 zero_memset, have_as, dynamic_check, noalign, true);
28170 gcc_assert (*dynamic_check == -1);
28171 if (TARGET_INLINE_STRINGOPS_DYNAMICALLY)
28172 *dynamic_check = max;
28173 else
28174 gcc_assert (alg != libcall);
28175 return alg;
28176 }
28177 return (alg_usable_p (algs->unknown_size, memset, have_as)
28178 ? algs->unknown_size : libcall);
28179 }
28180
28181 /* Decide on alignment. We know that the operand is already aligned to ALIGN
28182 (ALIGN can be based on profile feedback and thus it is not 100% guaranteed). */
28183 static int
28184 decide_alignment (int align,
28185 enum stringop_alg alg,
28186 int expected_size,
28187 machine_mode move_mode)
28188 {
28189 int desired_align = 0;
28190
28191 gcc_assert (alg != no_stringop);
28192
28193 if (alg == libcall)
28194 return 0;
28195 if (move_mode == VOIDmode)
28196 return 0;
28197
28198 desired_align = GET_MODE_SIZE (move_mode);
28199 /* PentiumPro has special logic triggering for 8-byte-aligned blocks,
28200 copying a whole cache line at once. */
28201 if (TARGET_PENTIUMPRO
28202 && (alg == rep_prefix_4_byte || alg == rep_prefix_1_byte))
28203 desired_align = 8;
28204
28205 if (optimize_size)
28206 desired_align = 1;
28207 if (desired_align < align)
28208 desired_align = align;
28209 if (expected_size != -1 && expected_size < 4)
28210 desired_align = align;
28211
28212 return desired_align;
28213 }
28214
28215
28216 /* Helper function for memset. For the QImode value 0xXY produce
28217 0xXYXYXYXY of the width specified by MODE. This is essentially
28218 a * 0x10101010, but we can do slightly better than
28219 synth_mult by unwinding the sequence by hand on CPUs with
28220 slow multiply. */
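/* For instance (illustrative only): promoting the QImode constant 0xab
   yields the SImode immediate 0xabababab (0xabababababababab in DImode).
   For a non-constant QImode value the non-multiply path computes roughly

       v |= v << 8;   v |= v << 16;   (and v |= v << 32 for DImode)

   via shifts and IORs, or via the insv patterns when partial register
   stalls are not a concern; a multiply by the replicated constant
   0x01010101 is used instead when the cost tables say it is cheaper.  */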
28221 static rtx
28222 promote_duplicated_reg (machine_mode mode, rtx val)
28223 {
28224 machine_mode valmode = GET_MODE (val);
28225 rtx tmp;
28226 int nops = mode == DImode ? 3 : 2;
28227
28228 gcc_assert (mode == SImode || mode == DImode || val == const0_rtx);
28229 if (val == const0_rtx)
28230 return copy_to_mode_reg (mode, CONST0_RTX (mode));
28231 if (CONST_INT_P (val))
28232 {
28233 HOST_WIDE_INT v = INTVAL (val) & 255;
28234
28235 v |= v << 8;
28236 v |= v << 16;
28237 if (mode == DImode)
28238 v |= (v << 16) << 16;
28239 return copy_to_mode_reg (mode, gen_int_mode (v, mode));
28240 }
28241
28242 if (valmode == VOIDmode)
28243 valmode = QImode;
28244 if (valmode != QImode)
28245 val = gen_lowpart (QImode, val);
28246 if (mode == QImode)
28247 return val;
28248 if (!TARGET_PARTIAL_REG_STALL)
28249 nops--;
28250 if (ix86_cost->mult_init[mode == DImode ? 3 : 2]
28251 + ix86_cost->mult_bit * (mode == DImode ? 8 : 4)
28252 <= (ix86_cost->shift_const + ix86_cost->add) * nops
28253 + (COSTS_N_INSNS (TARGET_PARTIAL_REG_STALL == 0)))
28254 {
28255 rtx reg = convert_modes (mode, QImode, val, true);
28256 tmp = promote_duplicated_reg (mode, const1_rtx);
28257 return expand_simple_binop (mode, MULT, reg, tmp, NULL, 1,
28258 OPTAB_DIRECT);
28259 }
28260 else
28261 {
28262 rtx reg = convert_modes (mode, QImode, val, true);
28263
28264 if (!TARGET_PARTIAL_REG_STALL)
28265 if (mode == SImode)
28266 emit_insn (gen_insvsi_1 (reg, reg));
28267 else
28268 emit_insn (gen_insvdi_1 (reg, reg));
28269 else
28270 {
28271 tmp = expand_simple_binop (mode, ASHIFT, reg, GEN_INT (8),
28272 NULL, 1, OPTAB_DIRECT);
28273 reg =
28274 expand_simple_binop (mode, IOR, reg, tmp, reg, 1, OPTAB_DIRECT);
28275 }
28276 tmp = expand_simple_binop (mode, ASHIFT, reg, GEN_INT (16),
28277 NULL, 1, OPTAB_DIRECT);
28278 reg = expand_simple_binop (mode, IOR, reg, tmp, reg, 1, OPTAB_DIRECT);
28279 if (mode == SImode)
28280 return reg;
28281 tmp = expand_simple_binop (mode, ASHIFT, reg, GEN_INT (32),
28282 NULL, 1, OPTAB_DIRECT);
28283 reg = expand_simple_binop (mode, IOR, reg, tmp, reg, 1, OPTAB_DIRECT);
28284 return reg;
28285 }
28286 }
28287
28288 /* Duplicate the value VAL using promote_duplicated_reg into the maximal size
28289 that will be needed by the main loop copying SIZE_NEEDED chunks and by the
28290 prologue getting the alignment from ALIGN to DESIRED_ALIGN. */
28291 static rtx
28292 promote_duplicated_reg_to_size (rtx val, int size_needed, int desired_align,
28293 int align)
28294 {
28295 rtx promoted_val;
28296
28297 if (TARGET_64BIT
28298 && (size_needed > 4 || (desired_align > align && desired_align > 4)))
28299 promoted_val = promote_duplicated_reg (DImode, val);
28300 else if (size_needed > 2 || (desired_align > align && desired_align > 2))
28301 promoted_val = promote_duplicated_reg (SImode, val);
28302 else if (size_needed > 1 || (desired_align > align && desired_align > 1))
28303 promoted_val = promote_duplicated_reg (HImode, val);
28304 else
28305 promoted_val = val;
28306
28307 return promoted_val;
28308 }
28309
28310 /* Expand a string move (memcpy) or store (memset) operation. Use i386 string
28311 operations when profitable. The code depends upon architecture, block size
28312 and alignment, but always has one of the following overall structures:
28313
28314 Aligned move sequence:
28315
28316 1) Prologue guard: Conditional that jumps up to epilogues for small
28317 blocks that can be handled by epilogue alone. This is faster
28318 but also needed for correctness, since the prologue assumes the block
28319 is larger than the desired alignment.
28320
28321 Optional dynamic check for size and libcall for large
28322 blocks is emitted here too, with -minline-stringops-dynamically.
28323
28324 2) Prologue: copy first few bytes in order to get destination
28325 aligned to DESIRED_ALIGN. It is emitted only when ALIGN is less
28326 than DESIRED_ALIGN and up to DESIRED_ALIGN - ALIGN bytes can be
28327 copied. We emit either a jump tree on power of two sized
28328 blocks, or a byte loop.
28329
28330 3) Main body: the copying loop itself, copying in SIZE_NEEDED chunks
28331 with specified algorithm.
28332
28333 4) Epilogue: code copying tail of the block that is too small to be
28334 handled by main body (or up to size guarded by prologue guard).
28335
28336 Misaligned move sequence
28337
28338 1) misaligned move prologue/epilogue containing:
28339 a) Prologue handling small memory blocks and jumping to done_label
28340 (skipped if blocks are known to be large enough)
28341 b) Single possibly misaligned move copying the first
28342 DESIRED_ALIGN-ALIGN bytes
28343 (skipped if alignment is not needed)
28344 c) Copy of last SIZE_NEEDED bytes by possibly misaligned moves
28345
28346 2) Zero size guard dispatching to done_label, if needed
28347
28348 3) dispatch to library call, if needed,
28349
28350 4) Main body: the copying loop itself, copying in SIZE_NEEDED chunks
28351 with specified algorithm. */
28352 bool
28353 ix86_expand_set_or_movmem (rtx dst, rtx src, rtx count_exp, rtx val_exp,
28354 rtx align_exp, rtx expected_align_exp,
28355 rtx expected_size_exp, rtx min_size_exp,
28356 rtx max_size_exp, rtx probable_max_size_exp,
28357 bool issetmem)
28358 {
28359 rtx destreg;
28360 rtx srcreg = NULL;
28361 rtx_code_label *label = NULL;
28362 rtx tmp;
28363 rtx_code_label *jump_around_label = NULL;
28364 HOST_WIDE_INT align = 1;
28365 unsigned HOST_WIDE_INT count = 0;
28366 HOST_WIDE_INT expected_size = -1;
28367 int size_needed = 0, epilogue_size_needed;
28368 int desired_align = 0, align_bytes = 0;
28369 enum stringop_alg alg;
28370 rtx promoted_val = NULL;
28371 rtx vec_promoted_val = NULL;
28372 bool force_loopy_epilogue = false;
28373 int dynamic_check;
28374 bool need_zero_guard = false;
28375 bool noalign;
28376 machine_mode move_mode = VOIDmode;
28377 int unroll_factor = 1;
28378 /* TODO: Once value ranges are available, fill in proper data. */
28379 unsigned HOST_WIDE_INT min_size = 0;
28380 unsigned HOST_WIDE_INT max_size = -1;
28381 unsigned HOST_WIDE_INT probable_max_size = -1;
28382 bool misaligned_prologue_used = false;
28383 bool have_as;
28384
28385 if (CONST_INT_P (align_exp))
28386 align = INTVAL (align_exp);
28387 /* i386 can do misaligned access at reasonably increased cost. */
28388 if (CONST_INT_P (expected_align_exp)
28389 && INTVAL (expected_align_exp) > align)
28390 align = INTVAL (expected_align_exp);
28391 /* ALIGN is the minimum of destination and source alignment, but we care here
28392 just about destination alignment. */
28393 else if (!issetmem
28394 && MEM_ALIGN (dst) > (unsigned HOST_WIDE_INT) align * BITS_PER_UNIT)
28395 align = MEM_ALIGN (dst) / BITS_PER_UNIT;
28396
28397 if (CONST_INT_P (count_exp))
28398 {
28399 min_size = max_size = probable_max_size = count = expected_size
28400 = INTVAL (count_exp);
28401 /* When COUNT is 0, there is nothing to do. */
28402 if (!count)
28403 return true;
28404 }
28405 else
28406 {
28407 if (min_size_exp)
28408 min_size = INTVAL (min_size_exp);
28409 if (max_size_exp)
28410 max_size = INTVAL (max_size_exp);
28411 if (probable_max_size_exp)
28412 probable_max_size = INTVAL (probable_max_size_exp);
28413 if (CONST_INT_P (expected_size_exp))
28414 expected_size = INTVAL (expected_size_exp);
28415 }
28416
28417 /* Make sure we don't need to care about overflow later on. */
28418 if (count > (HOST_WIDE_INT_1U << 30))
28419 return false;
28420
28421 have_as = !ADDR_SPACE_GENERIC_P (MEM_ADDR_SPACE (dst));
28422 if (!issetmem)
28423 have_as |= !ADDR_SPACE_GENERIC_P (MEM_ADDR_SPACE (src));
28424
28425 /* Step 0: Decide on preferred algorithm, desired alignment and
28426 size of chunks to be copied by main loop. */
28427 alg = decide_alg (count, expected_size, min_size, probable_max_size,
28428 issetmem,
28429 issetmem && val_exp == const0_rtx, have_as,
28430 &dynamic_check, &noalign, false);
28431 if (alg == libcall)
28432 return false;
28433 gcc_assert (alg != no_stringop);
28434
28435 /* For now the vector version of memset is generated only for memory zeroing,
28436 as creating the promoted vector value is very cheap in this case. */
28437 if (issetmem && alg == vector_loop && val_exp != const0_rtx)
28438 alg = unrolled_loop;
28439
28440 if (!count)
28441 count_exp = copy_to_mode_reg (GET_MODE (count_exp), count_exp);
28442 destreg = ix86_copy_addr_to_reg (XEXP (dst, 0));
28443 if (!issetmem)
28444 srcreg = ix86_copy_addr_to_reg (XEXP (src, 0));
28445
28446 unroll_factor = 1;
28447 move_mode = word_mode;
28448 switch (alg)
28449 {
28450 case libcall:
28451 case no_stringop:
28452 case last_alg:
28453 gcc_unreachable ();
28454 case loop_1_byte:
28455 need_zero_guard = true;
28456 move_mode = QImode;
28457 break;
28458 case loop:
28459 need_zero_guard = true;
28460 break;
28461 case unrolled_loop:
28462 need_zero_guard = true;
28463 unroll_factor = (TARGET_64BIT ? 4 : 2);
28464 break;
28465 case vector_loop:
28466 need_zero_guard = true;
28467 unroll_factor = 4;
28468 /* Find the widest supported mode. */
28469 move_mode = word_mode;
28470 while (optab_handler (mov_optab, GET_MODE_WIDER_MODE (move_mode))
28471 != CODE_FOR_nothing)
28472 move_mode = GET_MODE_WIDER_MODE (move_mode);
28473
28474 /* Find the corresponding vector mode with the same size as MOVE_MODE.
28475 MOVE_MODE is an integer mode at the moment (SI, DI, TI, etc.). */
28476 if (GET_MODE_SIZE (move_mode) > GET_MODE_SIZE (word_mode))
28477 {
28478 int nunits = GET_MODE_SIZE (move_mode) / GET_MODE_SIZE (word_mode);
28479 move_mode = mode_for_vector (word_mode, nunits);
28480 if (optab_handler (mov_optab, move_mode) == CODE_FOR_nothing)
28481 move_mode = word_mode;
28482 }
28483 gcc_assert (optab_handler (mov_optab, move_mode) != CODE_FOR_nothing);
28484 break;
28485 case rep_prefix_8_byte:
28486 move_mode = DImode;
28487 break;
28488 case rep_prefix_4_byte:
28489 move_mode = SImode;
28490 break;
28491 case rep_prefix_1_byte:
28492 move_mode = QImode;
28493 break;
28494 }
28495 size_needed = GET_MODE_SIZE (move_mode) * unroll_factor;
28496 epilogue_size_needed = size_needed;
28497
28498 /* If we are going to emit any library calls conditionally, make sure any
28499 pending stack adjustment happens before the first conditional branch,
28500 otherwise it will be emitted before the library call only and won't
28501 happen on the other branches. */
28502 if (dynamic_check != -1)
28503 do_pending_stack_adjust ();
28504
28505 desired_align = decide_alignment (align, alg, expected_size, move_mode);
28506 if (!TARGET_ALIGN_STRINGOPS || noalign)
28507 align = desired_align;
28508
28509 /* Step 1: Prologue guard. */
28510
28511 /* Alignment code needs count to be in register. */
28512 if (CONST_INT_P (count_exp) && desired_align > align)
28513 {
28514 if (INTVAL (count_exp) > desired_align
28515 && INTVAL (count_exp) > size_needed)
28516 {
28517 align_bytes
28518 = get_mem_align_offset (dst, desired_align * BITS_PER_UNIT);
28519 if (align_bytes <= 0)
28520 align_bytes = 0;
28521 else
28522 align_bytes = desired_align - align_bytes;
28523 }
28524 if (align_bytes == 0)
28525 count_exp = force_reg (counter_mode (count_exp), count_exp);
28526 }
28527 gcc_assert (desired_align >= 1 && align >= 1);
28528
28529 /* Misaligned move sequences handle both prologue and epilogue at once.
28530 Default code generation results in smaller code for large alignments
28531 and also avoids redundant work when sizes are known precisely. */
28532 misaligned_prologue_used
28533 = (TARGET_MISALIGNED_MOVE_STRING_PRO_EPILOGUES
28534 && MAX (desired_align, epilogue_size_needed) <= 32
28535 && desired_align <= epilogue_size_needed
28536 && ((desired_align > align && !align_bytes)
28537 || (!count && epilogue_size_needed > 1)));
28538
28539 /* Do the cheap promotion to allow better CSE across the
28540 main loop and epilogue (i.e. one load of the big constant in
28541 front of all the code).
28542 For now the misaligned move sequences do not have a fast path
28543 without broadcasting. */
28544 if (issetmem && ((CONST_INT_P (val_exp) || misaligned_prologue_used)))
28545 {
28546 if (alg == vector_loop)
28547 {
28548 gcc_assert (val_exp == const0_rtx);
28549 vec_promoted_val = promote_duplicated_reg (move_mode, val_exp);
28550 promoted_val = promote_duplicated_reg_to_size (val_exp,
28551 GET_MODE_SIZE (word_mode),
28552 desired_align, align);
28553 }
28554 else
28555 {
28556 promoted_val = promote_duplicated_reg_to_size (val_exp, size_needed,
28557 desired_align, align);
28558 }
28559 }
28560 /* Misaligned move sequences handle both prologues and epilogues at once.
28561 Default code generation results in smaller code for large alignments and
28562 also avoids redundant work when sizes are known precisely. */
28563 if (misaligned_prologue_used)
28564 {
28565 /* The misaligned move prologue handles small blocks by itself. */
28566 expand_set_or_movmem_prologue_epilogue_by_misaligned_moves
28567 (dst, src, &destreg, &srcreg,
28568 move_mode, promoted_val, vec_promoted_val,
28569 &count_exp,
28570 &jump_around_label,
28571 desired_align < align
28572 ? MAX (desired_align, epilogue_size_needed) : epilogue_size_needed,
28573 desired_align, align, &min_size, dynamic_check, issetmem);
28574 if (!issetmem)
28575 src = change_address (src, BLKmode, srcreg);
28576 dst = change_address (dst, BLKmode, destreg);
28577 set_mem_align (dst, desired_align * BITS_PER_UNIT);
28578 epilogue_size_needed = 0;
28579 if (need_zero_guard
28580 && min_size < (unsigned HOST_WIDE_INT) size_needed)
28581 {
28582 /* It is possible that we copied enough so the main loop will not
28583 execute. */
28584 gcc_assert (size_needed > 1);
28585 if (jump_around_label == NULL_RTX)
28586 jump_around_label = gen_label_rtx ();
28587 emit_cmp_and_jump_insns (count_exp,
28588 GEN_INT (size_needed),
28589 LTU, 0, counter_mode (count_exp), 1, jump_around_label);
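/* The probability argument of predict_jump is scaled by REG_BR_PROB_BASE,
   so the guard is predicted taken with roughly 20% or 60% probability
   depending on the expected size.  */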
28590 if (expected_size == -1
28591 || expected_size < (desired_align - align) / 2 + size_needed)
28592 predict_jump (REG_BR_PROB_BASE * 20 / 100);
28593 else
28594 predict_jump (REG_BR_PROB_BASE * 60 / 100);
28595 }
28596 }
28597 /* Ensure that alignment prologue won't copy past end of block. */
28598 else if (size_needed > 1 || (desired_align > 1 && desired_align > align))
28599 {
28600 epilogue_size_needed = MAX (size_needed - 1, desired_align - align);
28601 /* Epilogue always copies COUNT_EXP & EPILOGUE_SIZE_NEEDED bytes.
28602 Make sure it is power of 2. */
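/* E.g. an EPILOGUE_SIZE_NEEDED of 15 (floor_log2 == 3) is rounded up to 16
   by the statement below.  */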
28603 epilogue_size_needed = 1 << (floor_log2 (epilogue_size_needed) + 1);
28604
28605 /* To improve performance of small blocks, we jump around the VAL
28606 promoting code. This means that if the promoted VAL is not constant,
28607 we might not use it in the epilogue and have to use the byte
28608 loop variant. */
28609 if (issetmem && epilogue_size_needed > 2 && !promoted_val)
28610 force_loopy_epilogue = true;
28611 if ((count && count < (unsigned HOST_WIDE_INT) epilogue_size_needed)
28612 || max_size < (unsigned HOST_WIDE_INT) epilogue_size_needed)
28613 {
28614 /* If main algorithm works on QImode, no epilogue is needed.
28615 For small sizes just don't align anything. */
28616 if (size_needed == 1)
28617 desired_align = align;
28618 else
28619 goto epilogue;
28620 }
28621 else if (!count
28622 && min_size < (unsigned HOST_WIDE_INT) epilogue_size_needed)
28623 {
28624 label = gen_label_rtx ();
28625 emit_cmp_and_jump_insns (count_exp,
28626 GEN_INT (epilogue_size_needed),
28627 LTU, 0, counter_mode (count_exp), 1, label);
28628 if (expected_size == -1 || expected_size < epilogue_size_needed)
28629 predict_jump (REG_BR_PROB_BASE * 60 / 100);
28630 else
28631 predict_jump (REG_BR_PROB_BASE * 20 / 100);
28632 }
28633 }
28634
28635 /* Emit code to decide at runtime whether a library call or inline code
28636 should be used. */
28637 if (dynamic_check != -1)
28638 {
28639 if (!issetmem && CONST_INT_P (count_exp))
28640 {
28641 if (UINTVAL (count_exp) >= (unsigned HOST_WIDE_INT)dynamic_check)
28642 {
28643 emit_block_copy_via_libcall (dst, src, count_exp);
28644 count_exp = const0_rtx;
28645 goto epilogue;
28646 }
28647 }
28648 else
28649 {
28650 rtx_code_label *hot_label = gen_label_rtx ();
28651 if (jump_around_label == NULL_RTX)
28652 jump_around_label = gen_label_rtx ();
28653 emit_cmp_and_jump_insns (count_exp, GEN_INT (dynamic_check - 1),
28654 LEU, 0, counter_mode (count_exp),
28655 1, hot_label);
28656 predict_jump (REG_BR_PROB_BASE * 90 / 100);
28657 if (issetmem)
28658 set_storage_via_libcall (dst, count_exp, val_exp);
28659 else
28660 emit_block_copy_via_libcall (dst, src, count_exp);
28661 emit_jump (jump_around_label);
28662 emit_label (hot_label);
28663 }
28664 }
28665
28666 /* Step 2: Alignment prologue. */
28667 /* Do the expensive promotion once we have branched off the small blocks. */
28668 if (issetmem && !promoted_val)
28669 promoted_val = promote_duplicated_reg_to_size (val_exp, size_needed,
28670 desired_align, align);
28671
28672 if (desired_align > align && !misaligned_prologue_used)
28673 {
28674 if (align_bytes == 0)
28675 {
28676 /* Except for the first move in the prologue, we no longer know
28677 the constant offset in the aliasing info. It doesn't seem worth
28678 the pain to maintain it for the first move, so throw away
28679 the info early. */
28680 dst = change_address (dst, BLKmode, destreg);
28681 if (!issetmem)
28682 src = change_address (src, BLKmode, srcreg);
28683 dst = expand_set_or_movmem_prologue (dst, src, destreg, srcreg,
28684 promoted_val, vec_promoted_val,
28685 count_exp, align, desired_align,
28686 issetmem);
28687 /* At most desired_align - align bytes are copied. */
28688 if (min_size < (unsigned)(desired_align - align))
28689 min_size = 0;
28690 else
28691 min_size -= desired_align - align;
28692 }
28693 else
28694 {
28695 /* If we know how many bytes need to be stored before dst is
28696 sufficiently aligned, maintain aliasing info accurately. */
28697 dst = expand_set_or_movmem_constant_prologue (dst, &src, destreg,
28698 srcreg,
28699 promoted_val,
28700 vec_promoted_val,
28701 desired_align,
28702 align_bytes,
28703 issetmem);
28704
28705 count_exp = plus_constant (counter_mode (count_exp),
28706 count_exp, -align_bytes);
28707 count -= align_bytes;
28708 min_size -= align_bytes;
28709 max_size -= align_bytes;
28710 }
28711 if (need_zero_guard
28712 && min_size < (unsigned HOST_WIDE_INT) size_needed
28713 && (count < (unsigned HOST_WIDE_INT) size_needed
28714 || (align_bytes == 0
28715 && count < ((unsigned HOST_WIDE_INT) size_needed
28716 + desired_align - align))))
28717 {
28718 /* It is possible that we copied enough so the main loop will not
28719 execute. */
28720 gcc_assert (size_needed > 1);
28721 if (label == NULL_RTX)
28722 label = gen_label_rtx ();
28723 emit_cmp_and_jump_insns (count_exp,
28724 GEN_INT (size_needed),
28725 LTU, 0, counter_mode (count_exp), 1, label);
28726 if (expected_size == -1
28727 || expected_size < (desired_align - align) / 2 + size_needed)
28728 predict_jump (REG_BR_PROB_BASE * 20 / 100);
28729 else
28730 predict_jump (REG_BR_PROB_BASE * 60 / 100);
28731 }
28732 }
28733 if (label && size_needed == 1)
28734 {
28735 emit_label (label);
28736 LABEL_NUSES (label) = 1;
28737 label = NULL;
28738 epilogue_size_needed = 1;
28739 if (issetmem)
28740 promoted_val = val_exp;
28741 }
28742 else if (label == NULL_RTX && !misaligned_prologue_used)
28743 epilogue_size_needed = size_needed;
28744
28745 /* Step 3: Main loop. */
28746
28747 switch (alg)
28748 {
28749 case libcall:
28750 case no_stringop:
28751 case last_alg:
28752 gcc_unreachable ();
28753 case loop_1_byte:
28754 case loop:
28755 case unrolled_loop:
28756 expand_set_or_movmem_via_loop (dst, src, destreg, srcreg, promoted_val,
28757 count_exp, move_mode, unroll_factor,
28758 expected_size, issetmem);
28759 break;
28760 case vector_loop:
28761 expand_set_or_movmem_via_loop (dst, src, destreg, srcreg,
28762 vec_promoted_val, count_exp, move_mode,
28763 unroll_factor, expected_size, issetmem);
28764 break;
28765 case rep_prefix_8_byte:
28766 case rep_prefix_4_byte:
28767 case rep_prefix_1_byte:
28768 expand_set_or_movmem_via_rep (dst, src, destreg, srcreg, promoted_val,
28769 val_exp, count_exp, move_mode, issetmem);
28770 break;
28771 }
28772 /* Properly adjust the offset of the src and dest memory for aliasing. */
28773 if (CONST_INT_P (count_exp))
28774 {
28775 if (!issetmem)
28776 src = adjust_automodify_address_nv (src, BLKmode, srcreg,
28777 (count / size_needed) * size_needed);
28778 dst = adjust_automodify_address_nv (dst, BLKmode, destreg,
28779 (count / size_needed) * size_needed);
28780 }
28781 else
28782 {
28783 if (!issetmem)
28784 src = change_address (src, BLKmode, srcreg);
28785 dst = change_address (dst, BLKmode, destreg);
28786 }
28787
28788 /* Step 4: Epilogue to copy the remaining bytes. */
28789 epilogue:
28790 if (label)
28791 {
28792 /* When the main loop is done, COUNT_EXP might hold original count,
28793 while we want to copy only COUNT_EXP & SIZE_NEEDED bytes.
28794 Epilogue code will actually copy COUNT_EXP & EPILOGUE_SIZE_NEEDED
28795 bytes. Compensate if needed. */
28796
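/* SIZE_NEEDED is a mode size times a power-of-two unroll factor, so it
   should itself be a power of two; the AND below therefore reduces
   COUNT_EXP modulo SIZE_NEEDED.  */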
28797 if (size_needed < epilogue_size_needed)
28798 {
28799 tmp =
28800 expand_simple_binop (counter_mode (count_exp), AND, count_exp,
28801 GEN_INT (size_needed - 1), count_exp, 1,
28802 OPTAB_DIRECT);
28803 if (tmp != count_exp)
28804 emit_move_insn (count_exp, tmp);
28805 }
28806 emit_label (label);
28807 LABEL_NUSES (label) = 1;
28808 }
28809
28810 if (count_exp != const0_rtx && epilogue_size_needed > 1)
28811 {
28812 if (force_loopy_epilogue)
28813 expand_setmem_epilogue_via_loop (dst, destreg, val_exp, count_exp,
28814 epilogue_size_needed);
28815 else
28816 {
28817 if (issetmem)
28818 expand_setmem_epilogue (dst, destreg, promoted_val,
28819 vec_promoted_val, count_exp,
28820 epilogue_size_needed);
28821 else
28822 expand_movmem_epilogue (dst, src, destreg, srcreg, count_exp,
28823 epilogue_size_needed);
28824 }
28825 }
28826 if (jump_around_label)
28827 emit_label (jump_around_label);
28828 return true;
28829 }
28830
28831
28832 /* Expand the appropriate insns for doing strlen if not just doing
28833 repnz; scasb
28834
28835 out = result, initialized with the start address
28836 align_rtx = alignment of the address.
28837 scratch = scratch register, initialized with the start address when
28838 not aligned, otherwise undefined
28839
28840 This is just the body. It needs the initializations mentioned above and
28841 some address computation at the end. These things are done in i386.md. */
28842
28843 static void
28844 ix86_expand_strlensi_unroll_1 (rtx out, rtx src, rtx align_rtx)
28845 {
28846 int align;
28847 rtx tmp;
28848 rtx_code_label *align_2_label = NULL;
28849 rtx_code_label *align_3_label = NULL;
28850 rtx_code_label *align_4_label = gen_label_rtx ();
28851 rtx_code_label *end_0_label = gen_label_rtx ();
28852 rtx mem;
28853 rtx tmpreg = gen_reg_rtx (SImode);
28854 rtx scratch = gen_reg_rtx (SImode);
28855 rtx cmp;
28856
28857 align = 0;
28858 if (CONST_INT_P (align_rtx))
28859 align = INTVAL (align_rtx);
28860
28861 /* Loop to check 1..3 bytes for null to get an aligned pointer. */
28862
28863 /* Is there a known alignment and is it less than 4? */
28864 if (align < 4)
28865 {
28866 rtx scratch1 = gen_reg_rtx (Pmode);
28867 emit_move_insn (scratch1, out);
28868 /* Is there a known alignment and is it not 2? */
28869 if (align != 2)
28870 {
28871 align_3_label = gen_label_rtx (); /* Label when aligned to 3-byte */
28872 align_2_label = gen_label_rtx (); /* Label when aligned to 2-byte */
28873
28874 /* Leave just the 3 lower bits. */
28875 align_rtx = expand_binop (Pmode, and_optab, scratch1, GEN_INT (3),
28876 NULL_RTX, 0, OPTAB_WIDEN);
28877
28878 emit_cmp_and_jump_insns (align_rtx, const0_rtx, EQ, NULL,
28879 Pmode, 1, align_4_label);
28880 emit_cmp_and_jump_insns (align_rtx, const2_rtx, EQ, NULL,
28881 Pmode, 1, align_2_label);
28882 emit_cmp_and_jump_insns (align_rtx, const2_rtx, GTU, NULL,
28883 Pmode, 1, align_3_label);
28884 }
28885 else
28886 {
28887 /* Since the alignment is 2, we have to check 2 or 0 bytes;
28888 check whether it is aligned to a 4-byte boundary. */
28889
28890 align_rtx = expand_binop (Pmode, and_optab, scratch1, const2_rtx,
28891 NULL_RTX, 0, OPTAB_WIDEN);
28892
28893 emit_cmp_and_jump_insns (align_rtx, const0_rtx, EQ, NULL,
28894 Pmode, 1, align_4_label);
28895 }
28896
28897 mem = change_address (src, QImode, out);
28898
28899 /* Now compare the bytes. */
28900
28901 /* Compare the first n unaligned bytes on a byte-by-byte basis. */
28902 emit_cmp_and_jump_insns (mem, const0_rtx, EQ, NULL,
28903 QImode, 1, end_0_label);
28904
28905 /* Increment the address. */
28906 emit_insn (ix86_gen_add3 (out, out, const1_rtx));
28907
28908 /* Not needed with an alignment of 2 */
28909 if (align != 2)
28910 {
28911 emit_label (align_2_label);
28912
28913 emit_cmp_and_jump_insns (mem, const0_rtx, EQ, NULL, QImode, 1,
28914 end_0_label);
28915
28916 emit_insn (ix86_gen_add3 (out, out, const1_rtx));
28917
28918 emit_label (align_3_label);
28919 }
28920
28921 emit_cmp_and_jump_insns (mem, const0_rtx, EQ, NULL, QImode, 1,
28922 end_0_label);
28923
28924 emit_insn (ix86_gen_add3 (out, out, const1_rtx));
28925 }
28926
28927 /* Generate a loop to check 4 bytes at a time. It is not a good idea to
28928 align this loop; it only makes the program larger and does not help
28929 to speed it up. */
28930 emit_label (align_4_label);
28931
28932 mem = change_address (src, SImode, out);
28933 emit_move_insn (scratch, mem);
28934 emit_insn (ix86_gen_add3 (out, out, GEN_INT (4)));
28935
28936 /* This formula yields a nonzero result iff one of the bytes is zero.
28937 This saves three branches inside the loop and many cycles. */
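/* Concretely, the four insns below compute (X - 0x01010101) & ~X & 0x80808080
   for the loaded word X: the result is nonzero iff some byte of X is zero,
   and its least significant set bit marks the first zero byte, which the
   fixup code after the loop relies on.  */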
28938
28939 emit_insn (gen_addsi3 (tmpreg, scratch, GEN_INT (-0x01010101)));
28940 emit_insn (gen_one_cmplsi2 (scratch, scratch));
28941 emit_insn (gen_andsi3 (tmpreg, tmpreg, scratch));
28942 emit_insn (gen_andsi3 (tmpreg, tmpreg,
28943 gen_int_mode (0x80808080, SImode)));
28944 emit_cmp_and_jump_insns (tmpreg, const0_rtx, EQ, 0, SImode, 1,
28945 align_4_label);
28946
28947 if (TARGET_CMOVE)
28948 {
28949 rtx reg = gen_reg_rtx (SImode);
28950 rtx reg2 = gen_reg_rtx (Pmode);
28951 emit_move_insn (reg, tmpreg);
28952 emit_insn (gen_lshrsi3 (reg, reg, GEN_INT (16)));
28953
28954 /* If zero is not in the first two bytes, move two bytes forward. */
28955 emit_insn (gen_testsi_ccno_1 (tmpreg, GEN_INT (0x8080)));
28956 tmp = gen_rtx_REG (CCNOmode, FLAGS_REG);
28957 tmp = gen_rtx_EQ (VOIDmode, tmp, const0_rtx);
28958 emit_insn (gen_rtx_SET (tmpreg,
28959 gen_rtx_IF_THEN_ELSE (SImode, tmp,
28960 reg,
28961 tmpreg)));
28962 /* Emit lea manually to avoid clobbering of flags. */
28963 emit_insn (gen_rtx_SET (reg2, gen_rtx_PLUS (Pmode, out, const2_rtx)));
28964
28965 tmp = gen_rtx_REG (CCNOmode, FLAGS_REG);
28966 tmp = gen_rtx_EQ (VOIDmode, tmp, const0_rtx);
28967 emit_insn (gen_rtx_SET (out,
28968 gen_rtx_IF_THEN_ELSE (Pmode, tmp,
28969 reg2,
28970 out)));
28971 }
28972 else
28973 {
28974 rtx_code_label *end_2_label = gen_label_rtx ();
28975 /* Is zero in the first two bytes? */
28976
28977 emit_insn (gen_testsi_ccno_1 (tmpreg, GEN_INT (0x8080)));
28978 tmp = gen_rtx_REG (CCNOmode, FLAGS_REG);
28979 tmp = gen_rtx_NE (VOIDmode, tmp, const0_rtx);
28980 tmp = gen_rtx_IF_THEN_ELSE (VOIDmode, tmp,
28981 gen_rtx_LABEL_REF (VOIDmode, end_2_label),
28982 pc_rtx);
28983 tmp = emit_jump_insn (gen_rtx_SET (pc_rtx, tmp));
28984 JUMP_LABEL (tmp) = end_2_label;
28985
28986 /* Not in the first two. Move two bytes forward. */
28987 emit_insn (gen_lshrsi3 (tmpreg, tmpreg, GEN_INT (16)));
28988 emit_insn (ix86_gen_add3 (out, out, const2_rtx));
28989
28990 emit_label (end_2_label);
28991
28992 }
28993
28994 /* Avoid branch in fixing the byte. */
28995 tmpreg = gen_lowpart (QImode, tmpreg);
28996 emit_insn (gen_addqi3_cconly_overflow (tmpreg, tmpreg));
28997 tmp = gen_rtx_REG (CCmode, FLAGS_REG);
28998 cmp = gen_rtx_LTU (VOIDmode, tmp, const0_rtx);
28999 emit_insn (ix86_gen_sub3_carry (out, out, GEN_INT (3), tmp, cmp));
29000
29001 emit_label (end_0_label);
29002 }
29003
29004 /* Expand strlen. */
29005
29006 bool
29007 ix86_expand_strlen (rtx out, rtx src, rtx eoschar, rtx align)
29008 {
29009 rtx addr, scratch1, scratch2, scratch3, scratch4;
29010
29011 /* The generic case of the strlen expander is long. Avoid
29012 expanding it unless TARGET_INLINE_ALL_STRINGOPS. */
29013
29014 if (TARGET_UNROLL_STRLEN && eoschar == const0_rtx && optimize > 1
29015 && !TARGET_INLINE_ALL_STRINGOPS
29016 && !optimize_insn_for_size_p ()
29017 && (!CONST_INT_P (align) || INTVAL (align) < 4))
29018 return false;
29019
29020 addr = force_reg (Pmode, XEXP (src, 0));
29021 scratch1 = gen_reg_rtx (Pmode);
29022
29023 if (TARGET_UNROLL_STRLEN && eoschar == const0_rtx && optimize > 1
29024 && !optimize_insn_for_size_p ())
29025 {
29026 /* Well it seems that some optimizer does not combine a call like
29027 foo(strlen(bar), strlen(bar));
29028 when the move and the subtraction are done here. It does calculate
29029 the length just once when these instructions are done inside of
29030 output_strlen_unroll(). But I think since &bar[strlen(bar)] is
29031 often used and I use one fewer register for the lifetime of
29032 output_strlen_unroll() this is better. */
29033
29034 emit_move_insn (out, addr);
29035
29036 ix86_expand_strlensi_unroll_1 (out, src, align);
29037
29038 /* strlensi_unroll_1 returns the address of the zero at the end of
29039 the string, like memchr(), so compute the length by subtracting
29040 the start address. */
29041 emit_insn (ix86_gen_sub3 (out, out, addr));
29042 }
29043 else
29044 {
29045 rtx unspec;
29046
29047 /* Can't use this if the user has appropriated eax, ecx, or edi. */
29048 if (fixed_regs[AX_REG] || fixed_regs[CX_REG] || fixed_regs[DI_REG])
29049 return false;
29050 /* Can't use this for non-default address spaces. */
29051 if (!ADDR_SPACE_GENERIC_P (MEM_ADDR_SPACE (src)))
29052 return false;
29053
29054 scratch2 = gen_reg_rtx (Pmode);
29055 scratch3 = gen_reg_rtx (Pmode);
29056 scratch4 = force_reg (Pmode, constm1_rtx);
29057
29058 emit_move_insn (scratch3, addr);
29059 eoschar = force_reg (QImode, eoschar);
29060
29061 src = replace_equiv_address_nv (src, scratch3);
29062
29063 /* If .md starts supporting :P, this can be done in .md. */
29064 unspec = gen_rtx_UNSPEC (Pmode, gen_rtvec (4, src, eoschar, align,
29065 scratch4), UNSPEC_SCAS);
29066 emit_insn (gen_strlenqi_1 (scratch1, scratch3, unspec));
29067 emit_insn (ix86_gen_one_cmpl2 (scratch2, scratch1));
29068 emit_insn (ix86_gen_add3 (out, scratch2, constm1_rtx));
29069 }
29070 return true;
29071 }
29072
29073 /* For a given symbol (function), construct code to compute the address of
29074 its PLT entry in the large x86-64 PIC model. */
29075 static rtx
29076 construct_plt_address (rtx symbol)
29077 {
29078 rtx tmp, unspec;
29079
29080 gcc_assert (GET_CODE (symbol) == SYMBOL_REF);
29081 gcc_assert (ix86_cmodel == CM_LARGE_PIC && !TARGET_PECOFF);
29082 gcc_assert (Pmode == DImode);
29083
29084 tmp = gen_reg_rtx (Pmode);
29085 unspec = gen_rtx_UNSPEC (Pmode, gen_rtvec (1, symbol), UNSPEC_PLTOFF);
29086
29087 emit_move_insn (tmp, gen_rtx_CONST (Pmode, unspec));
29088 emit_insn (ix86_gen_add3 (tmp, tmp, pic_offset_table_rtx));
29089 return tmp;
29090 }
29091
29092 rtx
29093 ix86_expand_call (rtx retval, rtx fnaddr, rtx callarg1,
29094 rtx callarg2,
29095 rtx pop, bool sibcall)
29096 {
29097 rtx vec[3];
29098 rtx use = NULL, call;
29099 unsigned int vec_len = 0;
29100 tree fndecl;
29101
29102 if (GET_CODE (XEXP (fnaddr, 0)) == SYMBOL_REF)
29103 {
29104 fndecl = SYMBOL_REF_DECL (XEXP (fnaddr, 0));
29105 if (fndecl
29106 && (lookup_attribute ("interrupt",
29107 TYPE_ATTRIBUTES (TREE_TYPE (fndecl)))))
29108 error ("interrupt service routine can't be called directly");
29109 }
29110 else
29111 fndecl = NULL_TREE;
29112
29113 if (pop == const0_rtx)
29114 pop = NULL;
29115 gcc_assert (!TARGET_64BIT || !pop);
29116
29117 if (TARGET_MACHO && !TARGET_64BIT)
29118 {
29119 #if TARGET_MACHO
29120 if (flag_pic && GET_CODE (XEXP (fnaddr, 0)) == SYMBOL_REF)
29121 fnaddr = machopic_indirect_call_target (fnaddr);
29122 #endif
29123 }
29124 else
29125 {
29126 /* Static functions and indirect calls don't need the pic register. Also,
29127 check if PLT was explicitly avoided via no-plt or "noplt" attribute, making
29128 it an indirect call. */
29129 rtx addr = XEXP (fnaddr, 0);
29130 if (flag_pic
29131 && GET_CODE (addr) == SYMBOL_REF
29132 && !SYMBOL_REF_LOCAL_P (addr))
29133 {
29134 if (flag_plt
29135 && (SYMBOL_REF_DECL (addr) == NULL_TREE
29136 || !lookup_attribute ("noplt",
29137 DECL_ATTRIBUTES (SYMBOL_REF_DECL (addr)))))
29138 {
29139 if (!TARGET_64BIT
29140 || (ix86_cmodel == CM_LARGE_PIC
29141 && DEFAULT_ABI != MS_ABI))
29142 {
29143 use_reg (&use, gen_rtx_REG (Pmode,
29144 REAL_PIC_OFFSET_TABLE_REGNUM));
29145 if (ix86_use_pseudo_pic_reg ())
29146 emit_move_insn (gen_rtx_REG (Pmode,
29147 REAL_PIC_OFFSET_TABLE_REGNUM),
29148 pic_offset_table_rtx);
29149 }
29150 }
29151 else if (!TARGET_PECOFF && !TARGET_MACHO)
29152 {
29153 if (TARGET_64BIT)
29154 {
29155 fnaddr = gen_rtx_UNSPEC (Pmode,
29156 gen_rtvec (1, addr),
29157 UNSPEC_GOTPCREL);
29158 fnaddr = gen_rtx_CONST (Pmode, fnaddr);
29159 }
29160 else
29161 {
29162 fnaddr = gen_rtx_UNSPEC (Pmode, gen_rtvec (1, addr),
29163 UNSPEC_GOT);
29164 fnaddr = gen_rtx_CONST (Pmode, fnaddr);
29165 fnaddr = gen_rtx_PLUS (Pmode, pic_offset_table_rtx,
29166 fnaddr);
29167 }
29168 fnaddr = gen_const_mem (Pmode, fnaddr);
29169 /* Pmode may not be the same as word_mode for x32, which
29170 doesn't support indirect branch via 32-bit memory slot.
29171 Since x32 GOT slot is 64 bit with zero upper 32 bits,
29172 indirect branch via x32 GOT slot is OK. */
29173 if (GET_MODE (fnaddr) != word_mode)
29174 fnaddr = gen_rtx_ZERO_EXTEND (word_mode, fnaddr);
29175 fnaddr = gen_rtx_MEM (QImode, fnaddr);
29176 }
29177 }
29178 }
29179
29180 /* Skip setting up RAX register for -mskip-rax-setup when there are no
29181 parameters passed in vector registers. */
29182 if (TARGET_64BIT
29183 && (INTVAL (callarg2) > 0
29184 || (INTVAL (callarg2) == 0
29185 && (TARGET_SSE || !flag_skip_rax_setup))))
29186 {
29187 rtx al = gen_rtx_REG (QImode, AX_REG);
29188 emit_move_insn (al, callarg2);
29189 use_reg (&use, al);
29190 }
29191
29192 if (ix86_cmodel == CM_LARGE_PIC
29193 && !TARGET_PECOFF
29194 && MEM_P (fnaddr)
29195 && GET_CODE (XEXP (fnaddr, 0)) == SYMBOL_REF
29196 && !local_symbolic_operand (XEXP (fnaddr, 0), VOIDmode))
29197 fnaddr = gen_rtx_MEM (QImode, construct_plt_address (XEXP (fnaddr, 0)));
29198 /* Since x32 GOT slot is 64 bit with zero upper 32 bits, indirect
29199 branch via x32 GOT slot is OK. */
29200 else if (!(TARGET_X32
29201 && MEM_P (fnaddr)
29202 && GET_CODE (XEXP (fnaddr, 0)) == ZERO_EXTEND
29203 && GOT_memory_operand (XEXP (XEXP (fnaddr, 0), 0), Pmode))
29204 && (sibcall
29205 ? !sibcall_insn_operand (XEXP (fnaddr, 0), word_mode)
29206 : !call_insn_operand (XEXP (fnaddr, 0), word_mode)))
29207 {
29208 fnaddr = convert_to_mode (word_mode, XEXP (fnaddr, 0), 1);
29209 fnaddr = gen_rtx_MEM (QImode, copy_to_mode_reg (word_mode, fnaddr));
29210 }
29211
29212 call = gen_rtx_CALL (VOIDmode, fnaddr, callarg1);
29213
29214 if (retval)
29215 {
29216 /* We should add the bound registers as destinations in case
29217 a pointer with bounds may be returned. */
29218 if (TARGET_MPX && SCALAR_INT_MODE_P (GET_MODE (retval)))
29219 {
29220 rtx b0 = gen_rtx_REG (BND64mode, FIRST_BND_REG);
29221 rtx b1 = gen_rtx_REG (BND64mode, FIRST_BND_REG + 1);
29222 if (GET_CODE (retval) == PARALLEL)
29223 {
29224 b0 = gen_rtx_EXPR_LIST (VOIDmode, b0, const0_rtx);
29225 b1 = gen_rtx_EXPR_LIST (VOIDmode, b1, const0_rtx);
29226 rtx par = gen_rtx_PARALLEL (VOIDmode, gen_rtvec (2, b0, b1));
29227 retval = chkp_join_splitted_slot (retval, par);
29228 }
29229 else
29230 {
29231 retval = gen_rtx_PARALLEL (VOIDmode,
29232 gen_rtvec (3, retval, b0, b1));
29233 chkp_put_regs_to_expr_list (retval);
29234 }
29235 }
29236
29237 call = gen_rtx_SET (retval, call);
29238 }
29239 vec[vec_len++] = call;
29240
29241 if (pop)
29242 {
29243 pop = gen_rtx_PLUS (Pmode, stack_pointer_rtx, pop);
29244 pop = gen_rtx_SET (stack_pointer_rtx, pop);
29245 vec[vec_len++] = pop;
29246 }
29247
29248 if (cfun->machine->no_caller_saved_registers
29249 && (!fndecl
29250 || (!TREE_THIS_VOLATILE (fndecl)
29251 && !lookup_attribute ("no_caller_saved_registers",
29252 TYPE_ATTRIBUTES (TREE_TYPE (fndecl))))))
29253 {
29254 static const char ix86_call_used_regs[] = CALL_USED_REGISTERS;
29255 bool is_64bit_ms_abi = (TARGET_64BIT
29256 && ix86_function_abi (fndecl) == MS_ABI);
29257 char c_mask = CALL_USED_REGISTERS_MASK (is_64bit_ms_abi);
29258
29259 /* If there are no caller-saved registers, add all registers
29260 that are clobbered by the call which returns. */
29261 for (int i = 0; i < FIRST_PSEUDO_REGISTER; i++)
29262 if (!fixed_regs[i]
29263 && (ix86_call_used_regs[i] == 1
29264 || (ix86_call_used_regs[i] & c_mask))
29265 && !STACK_REGNO_P (i)
29266 && !MMX_REGNO_P (i))
29267 clobber_reg (&use,
29268 gen_rtx_REG (GET_MODE (regno_reg_rtx[i]), i));
29269 }
29270 else if (TARGET_64BIT_MS_ABI
29271 && (!callarg2 || INTVAL (callarg2) != -2))
29272 {
29273 unsigned i;
29274
29275 for (i = 0; i < NUM_X86_64_MS_CLOBBERED_REGS; i++)
29276 {
29277 int regno = x86_64_ms_sysv_extra_clobbered_registers[i];
29278 machine_mode mode = SSE_REGNO_P (regno) ? TImode : DImode;
29279
29280 clobber_reg (&use, gen_rtx_REG (mode, regno));
29281 }
29282
29283 /* Set here, but it may get cleared later. */
29284 if (TARGET_CALL_MS2SYSV_XLOGUES)
29285 {
29286 if (!TARGET_SSE)
29287 ;
29288
29289 /* Don't break hot-patched functions. */
29290 else if (ix86_function_ms_hook_prologue (current_function_decl))
29291 ;
29292
29293 /* TODO: Cases not yet examined. */
29294 else if (flag_split_stack)
29295 warn_once_call_ms2sysv_xlogues ("-fsplit-stack");
29296
29297 else
29298 {
29299 gcc_assert (!reload_completed);
29300 cfun->machine->call_ms2sysv = true;
29301 }
29302 }
29303 }
29304
29305 if (vec_len > 1)
29306 call = gen_rtx_PARALLEL (VOIDmode, gen_rtvec_v (vec_len, vec));
29307 call = emit_call_insn (call);
29308 if (use)
29309 CALL_INSN_FUNCTION_USAGE (call) = use;
29310
29311 return call;
29312 }
29313
29314 /* Return true if the function being called was marked with attribute
29315 "noplt" or using -fno-plt and we are compiling for non-PIC. We need
29316 to handle the non-PIC case in the backend because there is no easy
29317 interface for the front-end to force non-PLT calls to use the GOT.
29318 This is currently used only with 64-bit or 32-bit GOT32X ELF targets
29319 to call the function marked "noplt" indirectly. */
29320
29321 static bool
29322 ix86_nopic_noplt_attribute_p (rtx call_op)
29323 {
29324 if (flag_pic || ix86_cmodel == CM_LARGE
29325 || !(TARGET_64BIT || HAVE_AS_IX86_GOT32X)
29326 || TARGET_MACHO || TARGET_SEH || TARGET_PECOFF
29327 || SYMBOL_REF_LOCAL_P (call_op))
29328 return false;
29329
29330 tree symbol_decl = SYMBOL_REF_DECL (call_op);
29331
29332 if (!flag_plt
29333 || (symbol_decl != NULL_TREE
29334 && lookup_attribute ("noplt", DECL_ATTRIBUTES (symbol_decl))))
29335 return true;
29336
29337 return false;
29338 }
29339
29340 /* Output the assembly for a call instruction. */
29341
29342 const char *
29343 ix86_output_call_insn (rtx_insn *insn, rtx call_op)
29344 {
29345 bool direct_p = constant_call_address_operand (call_op, VOIDmode);
29346 bool seh_nop_p = false;
29347 const char *xasm;
29348
29349 if (SIBLING_CALL_P (insn))
29350 {
29351 if (direct_p)
29352 {
29353 if (ix86_nopic_noplt_attribute_p (call_op))
29354 {
29355 if (TARGET_64BIT)
29356 xasm = "%!jmp\t{*%p0@GOTPCREL(%%rip)|[QWORD PTR %p0@GOTPCREL[rip]]}";
29357 else
29358 xasm = "%!jmp\t{*%p0@GOT|[DWORD PTR %p0@GOT]}";
29359 }
29360 else
29361 xasm = "%!jmp\t%P0";
29362 }
29363 /* SEH epilogue detection requires the indirect branch case
29364 to include REX.W. */
29365 else if (TARGET_SEH)
29366 xasm = "%!rex.W jmp\t%A0";
29367 else
29368 xasm = "%!jmp\t%A0";
29369
29370 output_asm_insn (xasm, &call_op);
29371 return "";
29372 }
29373
29374 /* SEH unwinding can require an extra nop to be emitted in several
29375 circumstances. Determine if we have one of those. */
29376 if (TARGET_SEH)
29377 {
29378 rtx_insn *i;
29379
29380 for (i = NEXT_INSN (insn); i ; i = NEXT_INSN (i))
29381 {
29382 /* If we get to another real insn, we don't need the nop. */
29383 if (INSN_P (i))
29384 break;
29385
29386 /* If we get to the epilogue note, prevent a catch region from
29387 being adjacent to the standard epilogue sequence. If non-
29388 call-exceptions, we'll have done this during epilogue emission. */
29389 if (NOTE_P (i) && NOTE_KIND (i) == NOTE_INSN_EPILOGUE_BEG
29390 && !flag_non_call_exceptions
29391 && !can_throw_internal (insn))
29392 {
29393 seh_nop_p = true;
29394 break;
29395 }
29396 }
29397
29398 /* If we didn't find a real insn following the call, prevent the
29399 unwinder from looking into the next function. */
29400 if (i == NULL)
29401 seh_nop_p = true;
29402 }
29403
29404 if (direct_p)
29405 {
29406 if (ix86_nopic_noplt_attribute_p (call_op))
29407 {
29408 if (TARGET_64BIT)
29409 xasm = "%!call\t{*%p0@GOTPCREL(%%rip)|[QWORD PTR %p0@GOTPCREL[rip]]}";
29410 else
29411 xasm = "%!call\t{*%p0@GOT|[DWORD PTR %p0@GOT]}";
29412 }
29413 else
29414 xasm = "%!call\t%P0";
29415 }
29416 else
29417 xasm = "%!call\t%A0";
29418
29419 output_asm_insn (xasm, &call_op);
29420
29421 if (seh_nop_p)
29422 return "nop";
29423
29424 return "";
29425 }
29426 \f
29427 /* Clear stack slot assignments remembered from previous functions.
29428 This is called from INIT_EXPANDERS once before RTL is emitted for each
29429 function. */
29430
29431 static struct machine_function *
29432 ix86_init_machine_status (void)
29433 {
29434 struct machine_function *f;
29435
29436 f = ggc_cleared_alloc<machine_function> ();
29437 f->call_abi = ix86_abi;
29438
29439 return f;
29440 }
29441
29442 /* Return a MEM corresponding to a stack slot with mode MODE.
29443 Allocate a new slot if necessary.
29444
29445 The RTL for a function can have several slots available: N is
29446 which slot to use. */
29447
29448 rtx
29449 assign_386_stack_local (machine_mode mode, enum ix86_stack_slot n)
29450 {
29451 struct stack_local_entry *s;
29452
29453 gcc_assert (n < MAX_386_STACK_LOCALS);
29454
29455 for (s = ix86_stack_locals; s; s = s->next)
29456 if (s->mode == mode && s->n == n)
29457 return validize_mem (copy_rtx (s->rtl));
29458
29459 s = ggc_alloc<stack_local_entry> ();
29460 s->n = n;
29461 s->mode = mode;
29462 s->rtl = assign_stack_local (mode, GET_MODE_SIZE (mode), 0);
29463
29464 s->next = ix86_stack_locals;
29465 ix86_stack_locals = s;
29466 return validize_mem (copy_rtx (s->rtl));
29467 }
29468
29469 static void
29470 ix86_instantiate_decls (void)
29471 {
29472 struct stack_local_entry *s;
29473
29474 for (s = ix86_stack_locals; s; s = s->next)
29475 if (s->rtl != NULL_RTX)
29476 instantiate_decl_rtl (s->rtl);
29477 }
29478 \f
29479 /* Return the number used for encoding REG, in the range 0..7. */
29480
29481 static int
29482 reg_encoded_number (rtx reg)
29483 {
29484 unsigned regno = REGNO (reg);
29485 switch (regno)
29486 {
29487 case AX_REG:
29488 return 0;
29489 case CX_REG:
29490 return 1;
29491 case DX_REG:
29492 return 2;
29493 case BX_REG:
29494 return 3;
29495 case SP_REG:
29496 return 4;
29497 case BP_REG:
29498 return 5;
29499 case SI_REG:
29500 return 6;
29501 case DI_REG:
29502 return 7;
29503 default:
29504 break;
29505 }
29506 if (IN_RANGE (regno, FIRST_STACK_REG, LAST_STACK_REG))
29507 return regno - FIRST_STACK_REG;
29508 if (IN_RANGE (regno, FIRST_SSE_REG, LAST_SSE_REG))
29509 return regno - FIRST_SSE_REG;
29510 if (IN_RANGE (regno, FIRST_MMX_REG, LAST_MMX_REG))
29511 return regno - FIRST_MMX_REG;
29512 if (IN_RANGE (regno, FIRST_REX_SSE_REG, LAST_REX_SSE_REG))
29513 return regno - FIRST_REX_SSE_REG;
29514 if (IN_RANGE (regno, FIRST_REX_INT_REG, LAST_REX_INT_REG))
29515 return regno - FIRST_REX_INT_REG;
29516 if (IN_RANGE (regno, FIRST_MASK_REG, LAST_MASK_REG))
29517 return regno - FIRST_MASK_REG;
29518 if (IN_RANGE (regno, FIRST_BND_REG, LAST_BND_REG))
29519 return regno - FIRST_BND_REG;
29520 return -1;
29521 }
29522
29523 /* Given an insn INSN with NOPERANDS OPERANDS, return the modr/m byte used
29524 in its encoding if it could be relevant for ROP mitigation, otherwise
29525 return -1. If POPNO0 and POPNO1 are nonnull, store the operand numbers
29526 used for calculating it into them. */
29527
29528 static int
29529 ix86_get_modrm_for_rop (rtx_insn *insn, rtx *operands, int noperands,
29530 int *popno0 = 0, int *popno1 = 0)
29531 {
29532 if (asm_noperands (PATTERN (insn)) >= 0)
29533 return -1;
29534 int has_modrm = get_attr_modrm (insn);
29535 if (!has_modrm)
29536 return -1;
29537 enum attr_modrm_class cls = get_attr_modrm_class (insn);
29538 rtx op0, op1;
29539 switch (cls)
29540 {
29541 case MODRM_CLASS_OP02:
29542 gcc_assert (noperands >= 3);
29543 if (popno0)
29544 {
29545 *popno0 = 0;
29546 *popno1 = 2;
29547 }
29548 op0 = operands[0];
29549 op1 = operands[2];
29550 break;
29551 case MODRM_CLASS_OP01:
29552 gcc_assert (noperands >= 2);
29553 if (popno0)
29554 {
29555 *popno0 = 0;
29556 *popno1 = 1;
29557 }
29558 op0 = operands[0];
29559 op1 = operands[1];
29560 break;
29561 default:
29562 return -1;
29563 }
29564 if (REG_P (op0) && REG_P (op1))
29565 {
29566 int enc0 = reg_encoded_number (op0);
29567 int enc1 = reg_encoded_number (op1);
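/* Register-direct modr/m byte: mod field = 3 (0xc0), reg field = OP1's
   encoding, r/m field = OP0's encoding.  */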
29568 return 0xc0 + (enc1 << 3) + enc0;
29569 }
29570 return -1;
29571 }
29572
29573 /* Check whether x86 address PARTS is a pc-relative address. */
29574
29575 static bool
29576 rip_relative_addr_p (struct ix86_address *parts)
29577 {
29578 rtx base, index, disp;
29579
29580 base = parts->base;
29581 index = parts->index;
29582 disp = parts->disp;
29583
29584 if (disp && !base && !index)
29585 {
29586 if (TARGET_64BIT)
29587 {
29588 rtx symbol = disp;
29589
29590 if (GET_CODE (disp) == CONST)
29591 symbol = XEXP (disp, 0);
29592 if (GET_CODE (symbol) == PLUS
29593 && CONST_INT_P (XEXP (symbol, 1)))
29594 symbol = XEXP (symbol, 0);
29595
29596 if (GET_CODE (symbol) == LABEL_REF
29597 || (GET_CODE (symbol) == SYMBOL_REF
29598 && SYMBOL_REF_TLS_MODEL (symbol) == 0)
29599 || (GET_CODE (symbol) == UNSPEC
29600 && (XINT (symbol, 1) == UNSPEC_GOTPCREL
29601 || XINT (symbol, 1) == UNSPEC_PCREL
29602 || XINT (symbol, 1) == UNSPEC_GOTNTPOFF)))
29603 return true;
29604 }
29605 }
29606 return false;
29607 }
29608
29609 /* Calculate the length of the memory address in the instruction encoding.
29610 Includes addr32 prefix, does not include the one-byte modrm, opcode,
29611 or other prefixes. We never generate addr32 prefix for LEA insn. */
29612
29613 int
29614 memory_address_length (rtx addr, bool lea)
29615 {
29616 struct ix86_address parts;
29617 rtx base, index, disp;
29618 int len;
29619 int ok;
29620
29621 if (GET_CODE (addr) == PRE_DEC
29622 || GET_CODE (addr) == POST_INC
29623 || GET_CODE (addr) == PRE_MODIFY
29624 || GET_CODE (addr) == POST_MODIFY)
29625 return 0;
29626
29627 ok = ix86_decompose_address (addr, &parts);
29628 gcc_assert (ok);
29629
29630 len = (parts.seg == ADDR_SPACE_GENERIC) ? 0 : 1;
29631
29632 /* If this is not LEA instruction, add the length of addr32 prefix. */
29633 if (TARGET_64BIT && !lea
29634 && (SImode_address_operand (addr, VOIDmode)
29635 || (parts.base && GET_MODE (parts.base) == SImode)
29636 || (parts.index && GET_MODE (parts.index) == SImode)))
29637 len++;
29638
29639 base = parts.base;
29640 index = parts.index;
29641 disp = parts.disp;
29642
29643 if (base && SUBREG_P (base))
29644 base = SUBREG_REG (base);
29645 if (index && SUBREG_P (index))
29646 index = SUBREG_REG (index);
29647
29648 gcc_assert (base == NULL_RTX || REG_P (base));
29649 gcc_assert (index == NULL_RTX || REG_P (index));
29650
29651 /* Rule of thumb:
29652 - esp as the base always wants an index,
29653 - ebp as the base always wants a displacement,
29654 - r12 as the base always wants an index,
29655 - r13 as the base always wants a displacement. */
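/* In encoding terms: (%esp) and (%r12) as a base need a SIB byte, while
   (%ebp) and (%r13) need an explicit (possibly zero) displacement byte, so
   each of them costs one byte more than the plain one-byte modr/m form.  */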
29656
29657 /* Register Indirect. */
29658 if (base && !index && !disp)
29659 {
29660 /* esp (for its index) and ebp (for its displacement) need
29661 the two-byte modrm form. Similarly for r12 and r13 in 64-bit
29662 code. */
29663 if (base == arg_pointer_rtx
29664 || base == frame_pointer_rtx
29665 || REGNO (base) == SP_REG
29666 || REGNO (base) == BP_REG
29667 || REGNO (base) == R12_REG
29668 || REGNO (base) == R13_REG)
29669 len++;
29670 }
29671
29672 /* Direct Addressing. In 64-bit mode mod 00 r/m 5
29673 is not disp32, but disp32(%rip), so for disp32
29674 SIB byte is needed, unless print_operand_address
29675 optimizes it into disp32(%rip) or (%rip) is implied
29676 by UNSPEC. */
29677 else if (disp && !base && !index)
29678 {
29679 len += 4;
29680 if (!rip_relative_addr_p (&parts))
29681 len++;
29682 }
29683 else
29684 {
29685 /* Find the length of the displacement constant. */
29686 if (disp)
29687 {
29688 if (base && satisfies_constraint_K (disp))
29689 len += 1;
29690 else
29691 len += 4;
29692 }
29693 /* ebp always wants a displacement. Similarly r13. */
29694 else if (base && (REGNO (base) == BP_REG || REGNO (base) == R13_REG))
29695 len++;
29696
29697 /* An index requires the two-byte modrm form.... */
29698 if (index
29699 /* ...like esp (or r12), which always wants an index. */
29700 || base == arg_pointer_rtx
29701 || base == frame_pointer_rtx
29702 || (base && (REGNO (base) == SP_REG || REGNO (base) == R12_REG)))
29703 len++;
29704 }
29705
29706 return len;
29707 }
29708
29709 /* Compute default value for "length_immediate" attribute. When SHORTFORM
29710 is set, expect that the insn has an 8-bit immediate alternative. */
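/* For example, an insn with an imm8 alternative and a constant in
   [-128, 127] is counted as a single immediate byte; otherwise the full
   1-, 2- or 4-byte immediate for the insn's mode is counted.  */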
29711 int
29712 ix86_attr_length_immediate_default (rtx_insn *insn, bool shortform)
29713 {
29714 int len = 0;
29715 int i;
29716 extract_insn_cached (insn);
29717 for (i = recog_data.n_operands - 1; i >= 0; --i)
29718 if (CONSTANT_P (recog_data.operand[i]))
29719 {
29720 enum attr_mode mode = get_attr_mode (insn);
29721
29722 gcc_assert (!len);
29723 if (shortform && CONST_INT_P (recog_data.operand[i]))
29724 {
29725 HOST_WIDE_INT ival = INTVAL (recog_data.operand[i]);
29726 switch (mode)
29727 {
29728 case MODE_QI:
29729 len = 1;
29730 continue;
29731 case MODE_HI:
29732 ival = trunc_int_for_mode (ival, HImode);
29733 break;
29734 case MODE_SI:
29735 ival = trunc_int_for_mode (ival, SImode);
29736 break;
29737 default:
29738 break;
29739 }
29740 if (IN_RANGE (ival, -128, 127))
29741 {
29742 len = 1;
29743 continue;
29744 }
29745 }
29746 switch (mode)
29747 {
29748 case MODE_QI:
29749 len = 1;
29750 break;
29751 case MODE_HI:
29752 len = 2;
29753 break;
29754 case MODE_SI:
29755 len = 4;
29756 break;
29757 /* Immediates for DImode instructions are encoded
29758 as 32bit sign extended values. */
29759 case MODE_DI:
29760 len = 4;
29761 break;
29762 default:
29763 fatal_insn ("unknown insn mode", insn);
29764 }
29765 }
29766 return len;
29767 }
29768
29769 /* Compute default value for "length_address" attribute. */
29770 int
29771 ix86_attr_length_address_default (rtx_insn *insn)
29772 {
29773 int i;
29774
29775 if (get_attr_type (insn) == TYPE_LEA)
29776 {
29777 rtx set = PATTERN (insn), addr;
29778
29779 if (GET_CODE (set) == PARALLEL)
29780 set = XVECEXP (set, 0, 0);
29781
29782 gcc_assert (GET_CODE (set) == SET);
29783
29784 addr = SET_SRC (set);
29785
29786 return memory_address_length (addr, true);
29787 }
29788
29789 extract_insn_cached (insn);
29790 for (i = recog_data.n_operands - 1; i >= 0; --i)
29791 {
29792 rtx op = recog_data.operand[i];
29793 if (MEM_P (op))
29794 {
29795 constrain_operands_cached (insn, reload_completed);
29796 if (which_alternative != -1)
29797 {
29798 const char *constraints = recog_data.constraints[i];
29799 int alt = which_alternative;
29800
29801 while (*constraints == '=' || *constraints == '+')
29802 constraints++;
29803 while (alt-- > 0)
29804 while (*constraints++ != ',')
29805 ;
29806 /* Skip ignored operands. */
29807 if (*constraints == 'X')
29808 continue;
29809 }
29810
29811 int len = memory_address_length (XEXP (op, 0), false);
29812
29813 /* Account for segment prefix for non-default addr spaces. */
29814 if (!ADDR_SPACE_GENERIC_P (MEM_ADDR_SPACE (op)))
29815 len++;
29816
29817 return len;
29818 }
29819 }
29820 return 0;
29821 }
29822
29823 /* Compute default value for "length_vex" attribute. It includes
29824 2 or 3 byte VEX prefix and 1 opcode byte. */
29825
29826 int
29827 ix86_attr_length_vex_default (rtx_insn *insn, bool has_0f_opcode,
29828 bool has_vex_w)
29829 {
29830 int i;
29831
29832 /* Only the 0f opcode map can use the 2-byte VEX prefix, and the VEX.W
29833 bit requires the 3-byte VEX prefix. */
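/* The 2-byte VEX form (0xc5) implies the 0f opcode map and cannot encode
   VEX.W, VEX.X or VEX.B, so any of those forces the 3-byte form (0xc4).  */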
29834 if (!has_0f_opcode || has_vex_w)
29835 return 3 + 1;
29836
29837 /* We can always use the 2-byte VEX prefix in 32-bit mode. */
29838 if (!TARGET_64BIT)
29839 return 2 + 1;
29840
29841 extract_insn_cached (insn);
29842
29843 for (i = recog_data.n_operands - 1; i >= 0; --i)
29844 if (REG_P (recog_data.operand[i]))
29845 {
29846 /* REX.W bit uses 3 byte VEX prefix. */
29847 if (GET_MODE (recog_data.operand[i]) == DImode
29848 && GENERAL_REG_P (recog_data.operand[i]))
29849 return 3 + 1;
29850 }
29851 else
29852 {
29853 /* REX.X or REX.B bits use 3 byte VEX prefix. */
29854 if (MEM_P (recog_data.operand[i])
29855 && x86_extended_reg_mentioned_p (recog_data.operand[i]))
29856 return 3 + 1;
29857 }
29858
29859 return 2 + 1;
29860 }
29861 \f
29862 /* Return the maximum number of instructions a cpu can issue. */
29863
29864 static int
29865 ix86_issue_rate (void)
29866 {
29867 switch (ix86_tune)
29868 {
29869 case PROCESSOR_PENTIUM:
29870 case PROCESSOR_LAKEMONT:
29871 case PROCESSOR_BONNELL:
29872 case PROCESSOR_SILVERMONT:
29873 case PROCESSOR_KNL:
29874 case PROCESSOR_INTEL:
29875 case PROCESSOR_K6:
29876 case PROCESSOR_BTVER2:
29877 case PROCESSOR_PENTIUM4:
29878 case PROCESSOR_NOCONA:
29879 return 2;
29880
29881 case PROCESSOR_PENTIUMPRO:
29882 case PROCESSOR_ATHLON:
29883 case PROCESSOR_K8:
29884 case PROCESSOR_AMDFAM10:
29885 case PROCESSOR_GENERIC:
29886 case PROCESSOR_BTVER1:
29887 return 3;
29888
29889 case PROCESSOR_BDVER1:
29890 case PROCESSOR_BDVER2:
29891 case PROCESSOR_BDVER3:
29892 case PROCESSOR_BDVER4:
29893 case PROCESSOR_ZNVER1:
29894 case PROCESSOR_CORE2:
29895 case PROCESSOR_NEHALEM:
29896 case PROCESSOR_SANDYBRIDGE:
29897 case PROCESSOR_HASWELL:
29898 return 4;
29899
29900 default:
29901 return 1;
29902 }
29903 }
29904
29905 /* A subroutine of ix86_adjust_cost -- return TRUE iff INSN reads the flags
29906 set by DEP_INSN and nothing else set by DEP_INSN. */
29907
29908 static bool
29909 ix86_flags_dependent (rtx_insn *insn, rtx_insn *dep_insn, enum attr_type insn_type)
29910 {
29911 rtx set, set2;
29912
29913 /* Simplify the test for uninteresting insns. */
29914 if (insn_type != TYPE_SETCC
29915 && insn_type != TYPE_ICMOV
29916 && insn_type != TYPE_FCMOV
29917 && insn_type != TYPE_IBR)
29918 return false;
29919
29920 if ((set = single_set (dep_insn)) != 0)
29921 {
29922 set = SET_DEST (set);
29923 set2 = NULL_RTX;
29924 }
29925 else if (GET_CODE (PATTERN (dep_insn)) == PARALLEL
29926 && XVECLEN (PATTERN (dep_insn), 0) == 2
29927 && GET_CODE (XVECEXP (PATTERN (dep_insn), 0, 0)) == SET
29928 && GET_CODE (XVECEXP (PATTERN (dep_insn), 0, 1)) == SET)
29929 {
29930 set = SET_DEST (XVECEXP (PATTERN (dep_insn), 0, 0));
29931 set2 = SET_DEST (XVECEXP (PATTERN (dep_insn), 0, 1));
29932 }
29933 else
29934 return false;
29935
29936 if (!REG_P (set) || REGNO (set) != FLAGS_REG)
29937 return false;
29938
29939 /* This test is true if the dependent insn reads the flags but
29940 not any other potentially set register. */
29941 if (!reg_overlap_mentioned_p (set, PATTERN (insn)))
29942 return false;
29943
29944 if (set2 && reg_overlap_mentioned_p (set2, PATTERN (insn)))
29945 return false;
29946
29947 return true;
29948 }
29949
29950 /* Return true iff USE_INSN has a memory address with operands set by
29951 SET_INSN. */
29952
29953 bool
29954 ix86_agi_dependent (rtx_insn *set_insn, rtx_insn *use_insn)
29955 {
29956 int i;
29957 extract_insn_cached (use_insn);
29958 for (i = recog_data.n_operands - 1; i >= 0; --i)
29959 if (MEM_P (recog_data.operand[i]))
29960 {
29961 rtx addr = XEXP (recog_data.operand[i], 0);
29962 if (modified_in_p (addr, set_insn) != 0)
29963 {
29964 /* No AGI stall if SET_INSN is a push or pop and USE_INSN
29965 has SP based memory (unless index reg is modified in a pop). */
29966 rtx set = single_set (set_insn);
29967 if (set
29968 && (push_operand (SET_DEST (set), GET_MODE (SET_DEST (set)))
29969 || pop_operand (SET_SRC (set), GET_MODE (SET_SRC (set)))))
29970 {
29971 struct ix86_address parts;
29972 if (ix86_decompose_address (addr, &parts)
29973 && parts.base == stack_pointer_rtx
29974 && (parts.index == NULL_RTX
29975 || MEM_P (SET_DEST (set))
29976 || !modified_in_p (parts.index, set_insn)))
29977 return false;
29978 }
29979 return true;
29980 }
29981 return false;
29982 }
29983 return false;
29984 }
29985
29986 /* Helper function for exact_store_load_dependency.
29987 Return true if addr is found in insn. */
29988 static bool
29989 exact_dependency_1 (rtx addr, rtx insn)
29990 {
29991 enum rtx_code code;
29992 const char *format_ptr;
29993 int i, j;
29994
29995 code = GET_CODE (insn);
29996 switch (code)
29997 {
29998 case MEM:
29999 if (rtx_equal_p (addr, insn))
30000 return true;
30001 break;
30002 case REG:
30003 CASE_CONST_ANY:
30004 case SYMBOL_REF:
30005 case CODE_LABEL:
30006 case PC:
30007 case CC0:
30008 case EXPR_LIST:
30009 return false;
30010 default:
30011 break;
30012 }
30013
30014 format_ptr = GET_RTX_FORMAT (code);
30015 for (i = 0; i < GET_RTX_LENGTH (code); i++)
30016 {
30017 switch (*format_ptr++)
30018 {
30019 case 'e':
30020 if (exact_dependency_1 (addr, XEXP (insn, i)))
30021 return true;
30022 break;
30023 case 'E':
30024 for (j = 0; j < XVECLEN (insn, i); j++)
30025 if (exact_dependency_1 (addr, XVECEXP (insn, i, j)))
30026 return true;
30027 break;
30028 }
30029 }
30030 return false;
30031 }
30032
30033 /* Return true if there exists an exact dependency between a store and a load,
30034 i.e. the same memory address is used in both. */
30035 static bool
30036 exact_store_load_dependency (rtx_insn *store, rtx_insn *load)
30037 {
30038 rtx set1, set2;
30039
30040 set1 = single_set (store);
30041 if (!set1)
30042 return false;
30043 if (!MEM_P (SET_DEST (set1)))
30044 return false;
30045 set2 = single_set (load);
30046 if (!set2)
30047 return false;
30048 if (exact_dependency_1 (SET_DEST (set1), SET_SRC (set2)))
30049 return true;
30050 return false;
30051 }
30052
30053 static int
30054 ix86_adjust_cost (rtx_insn *insn, int dep_type, rtx_insn *dep_insn, int cost,
30055 unsigned int)
30056 {
30057 enum attr_type insn_type, dep_insn_type;
30058 enum attr_memory memory;
30059 rtx set, set2;
30060 int dep_insn_code_number;
30061
30062 /* Anti and output dependencies have zero cost on all CPUs. */
30063 if (dep_type != 0)
30064 return 0;
30065
30066 dep_insn_code_number = recog_memoized (dep_insn);
30067
30068 /* If we can't recognize the insns, we can't really do anything. */
30069 if (dep_insn_code_number < 0 || recog_memoized (insn) < 0)
30070 return cost;
30071
30072 insn_type = get_attr_type (insn);
30073 dep_insn_type = get_attr_type (dep_insn);
30074
30075 switch (ix86_tune)
30076 {
30077 case PROCESSOR_PENTIUM:
30078 case PROCESSOR_LAKEMONT:
30079 /* Address Generation Interlock adds a cycle of latency. */
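/* That is, on these in-order chips an instruction that computes a register
   used in the address of a following memory access delays that access by
   one cycle.  */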
30080 if (insn_type == TYPE_LEA)
30081 {
30082 rtx addr = PATTERN (insn);
30083
30084 if (GET_CODE (addr) == PARALLEL)
30085 addr = XVECEXP (addr, 0, 0);
30086
30087 gcc_assert (GET_CODE (addr) == SET);
30088
30089 addr = SET_SRC (addr);
30090 if (modified_in_p (addr, dep_insn))
30091 cost += 1;
30092 }
30093 else if (ix86_agi_dependent (dep_insn, insn))
30094 cost += 1;
30095
30096 /* ??? Compares pair with jump/setcc. */
30097 if (ix86_flags_dependent (insn, dep_insn, insn_type))
30098 cost = 0;
30099
30100 /* Floating point stores require value to be ready one cycle earlier. */
30101 if (insn_type == TYPE_FMOV
30102 && get_attr_memory (insn) == MEMORY_STORE
30103 && !ix86_agi_dependent (dep_insn, insn))
30104 cost += 1;
30105 break;
30106
30107 case PROCESSOR_PENTIUMPRO:
30108 /* INT->FP conversion is expensive. */
30109 if (get_attr_fp_int_src (dep_insn))
30110 cost += 5;
30111
30112 /* There is one cycle extra latency between an FP op and a store. */
30113 if (insn_type == TYPE_FMOV
30114 && (set = single_set (dep_insn)) != NULL_RTX
30115 && (set2 = single_set (insn)) != NULL_RTX
30116 && rtx_equal_p (SET_DEST (set), SET_SRC (set2))
30117 && MEM_P (SET_DEST (set2)))
30118 cost += 1;
30119
30120 memory = get_attr_memory (insn);
30121
30122 /* Show the ability of the reorder buffer to hide the latency of a load
30123 by executing it in parallel with the previous instruction when the
30124 previous instruction is not needed to compute the address. */
30125 if ((memory == MEMORY_LOAD || memory == MEMORY_BOTH)
30126 && !ix86_agi_dependent (dep_insn, insn))
30127 {
30128 /* Claim that moves take one cycle, as the core can issue one load
30129 at a time and the next load can start a cycle later. */
30130 if (dep_insn_type == TYPE_IMOV
30131 || dep_insn_type == TYPE_FMOV)
30132 cost = 1;
30133 else if (cost > 1)
30134 cost--;
30135 }
30136 break;
30137
30138 case PROCESSOR_K6:
30139 /* The esp dependency is resolved before
30140 the instruction is really finished. */
30141 if ((insn_type == TYPE_PUSH || insn_type == TYPE_POP)
30142 && (dep_insn_type == TYPE_PUSH || dep_insn_type == TYPE_POP))
30143 return 1;
30144
30145 /* INT->FP conversion is expensive. */
30146 if (get_attr_fp_int_src (dep_insn))
30147 cost += 5;
30148
30149 memory = get_attr_memory (insn);
30150
30151 /* Show the ability of the reorder buffer to hide the latency of a load
30152 by executing it in parallel with the previous instruction when the
30153 previous instruction is not needed to compute the address. */
30154 if ((memory == MEMORY_LOAD || memory == MEMORY_BOTH)
30155 && !ix86_agi_dependent (dep_insn, insn))
30156 {
30157 /* Claim that moves take one cycle, as the core can issue one load
30158 at a time and the next load can start a cycle later. */
30159 if (dep_insn_type == TYPE_IMOV
30160 || dep_insn_type == TYPE_FMOV)
30161 cost = 1;
30162 else if (cost > 2)
30163 cost -= 2;
30164 else
30165 cost = 1;
30166 }
30167 break;
30168
30169 case PROCESSOR_AMDFAM10:
30170 case PROCESSOR_BDVER1:
30171 case PROCESSOR_BDVER2:
30172 case PROCESSOR_BDVER3:
30173 case PROCESSOR_BDVER4:
30174 case PROCESSOR_ZNVER1:
30175 case PROCESSOR_BTVER1:
30176 case PROCESSOR_BTVER2:
30177 case PROCESSOR_GENERIC:
30178 /* The stack engine allows push and pop instructions to execute in parallel. */
30179 if ((insn_type == TYPE_PUSH || insn_type == TYPE_POP)
30180 && (dep_insn_type == TYPE_PUSH || dep_insn_type == TYPE_POP))
30181 return 0;
30182 /* FALLTHRU */
30183
30184 case PROCESSOR_ATHLON:
30185 case PROCESSOR_K8:
30186 memory = get_attr_memory (insn);
30187
30188 /* Show the ability of the reorder buffer to hide the latency of a load
30189 by executing it in parallel with the previous instruction when the
30190 previous instruction is not needed to compute the address. */
30191 if ((memory == MEMORY_LOAD || memory == MEMORY_BOTH)
30192 && !ix86_agi_dependent (dep_insn, insn))
30193 {
30194 enum attr_unit unit = get_attr_unit (insn);
30195 int loadcost = 3;
30196
30197 /* Because of the difference between the length of integer and
30198 floating unit pipeline preparation stages, the memory operands
30199 for floating point are cheaper.
30200
30201 ??? For Athlon the difference is most probably 2. */
30202 if (unit == UNIT_INTEGER || unit == UNIT_UNKNOWN)
30203 loadcost = 3;
30204 else
30205 loadcost = TARGET_ATHLON ? 2 : 0;
30206
30207 if (cost >= loadcost)
30208 cost -= loadcost;
30209 else
30210 cost = 0;
30211 }
30212 break;
30213
30214 case PROCESSOR_CORE2:
30215 case PROCESSOR_NEHALEM:
30216 case PROCESSOR_SANDYBRIDGE:
30217 case PROCESSOR_HASWELL:
30218 /* The stack engine allows push and pop instructions to execute in parallel. */
30219 if ((insn_type == TYPE_PUSH || insn_type == TYPE_POP)
30220 && (dep_insn_type == TYPE_PUSH || dep_insn_type == TYPE_POP))
30221 return 0;
30222
30223 memory = get_attr_memory (insn);
30224
30225 /* Show the ability of the reorder buffer to hide the latency of a load
30226 by executing it in parallel with the previous instruction when the
30227 previous instruction is not needed to compute the address. */
30228 if ((memory == MEMORY_LOAD || memory == MEMORY_BOTH)
30229 && !ix86_agi_dependent (dep_insn, insn))
30230 {
30231 if (cost >= 4)
30232 cost -= 4;
30233 else
30234 cost = 0;
30235 }
30236 break;
30237
30238 case PROCESSOR_SILVERMONT:
30239 case PROCESSOR_KNL:
30240 case PROCESSOR_INTEL:
30241 if (!reload_completed)
30242 return cost;
30243
30244 /* Increase cost of integer loads. */
30245 memory = get_attr_memory (dep_insn);
30246 if (memory == MEMORY_LOAD || memory == MEMORY_BOTH)
30247 {
30248 enum attr_unit unit = get_attr_unit (dep_insn);
30249 if (unit == UNIT_INTEGER && cost == 1)
30250 {
30251 if (memory == MEMORY_LOAD)
30252 cost = 3;
30253 else
30254 {
30255 /* Increase cost of ld/st for short int types only
30256 because of store forwarding issue. */
30257 rtx set = single_set (dep_insn);
30258 if (set && (GET_MODE (SET_DEST (set)) == QImode
30259 || GET_MODE (SET_DEST (set)) == HImode))
30260 {
30261 /* Increase cost of store/load insn if exact
30262 dependence exists and it is load insn. */
30263 enum attr_memory insn_memory = get_attr_memory (insn);
30264 if (insn_memory == MEMORY_LOAD
30265 && exact_store_load_dependency (dep_insn, insn))
30266 cost = 3;
30267 }
30268 }
30269 }
30270 }
30271
30272 default:
30273 break;
30274 }
30275
30276 return cost;
30277 }
30278
30279 /* How many alternative schedules to try. This should be as wide as the
30280 scheduling freedom in the DFA, but no wider. Making this value too
30281 large results in extra work for the scheduler. */
30282
30283 static int
30284 ia32_multipass_dfa_lookahead (void)
30285 {
30286 switch (ix86_tune)
30287 {
30288 case PROCESSOR_PENTIUM:
30289 case PROCESSOR_LAKEMONT:
30290 return 2;
30291
30292 case PROCESSOR_PENTIUMPRO:
30293 case PROCESSOR_K6:
30294 return 1;
30295
30296 case PROCESSOR_BDVER1:
30297 case PROCESSOR_BDVER2:
30298 case PROCESSOR_BDVER3:
30299 case PROCESSOR_BDVER4:
30300 /* We use lookahead value 4 for BD both before and after reload
30301 schedules. Plan is to have value 8 included for O3. */
30302 return 4;
30303
30304 case PROCESSOR_CORE2:
30305 case PROCESSOR_NEHALEM:
30306 case PROCESSOR_SANDYBRIDGE:
30307 case PROCESSOR_HASWELL:
30308 case PROCESSOR_BONNELL:
30309 case PROCESSOR_SILVERMONT:
30310 case PROCESSOR_KNL:
30311 case PROCESSOR_INTEL:
30312 /* Generally, we want haifa-sched:max_issue() to look ahead as far
30313 as many instructions can be executed on a cycle, i.e.,
30314 issue_rate. I wonder why tuning for many CPUs does not do this. */
30315 if (reload_completed)
30316 return ix86_issue_rate ();
30317 /* Don't use lookahead for pre-reload schedule to save compile time. */
30318 return 0;
30319
30320 default:
30321 return 0;
30322 }
30323 }
30324
30325 /* Return true if target platform supports macro-fusion. */
30326
30327 static bool
30328 ix86_macro_fusion_p ()
30329 {
30330 return TARGET_FUSE_CMP_AND_BRANCH;
30331 }
30332
30333 /* Check whether the current microarchitecture supports macro fusion
30334 for insn pair "CONDGEN + CONDJMP". Refer to
30335 "Intel Architectures Optimization Reference Manual". */
30336
30337 static bool
30338 ix86_macro_fusion_pair_p (rtx_insn *condgen, rtx_insn *condjmp)
30339 {
30340 rtx src, dest;
30341 enum rtx_code ccode;
30342 rtx compare_set = NULL_RTX, test_if, cond;
30343 rtx alu_set = NULL_RTX, addr = NULL_RTX;
30344
30345 if (!any_condjump_p (condjmp))
30346 return false;
30347
30348 if (get_attr_type (condgen) != TYPE_TEST
30349 && get_attr_type (condgen) != TYPE_ICMP
30350 && get_attr_type (condgen) != TYPE_INCDEC
30351 && get_attr_type (condgen) != TYPE_ALU)
30352 return false;
30353
30354 compare_set = single_set (condgen);
30355 if (compare_set == NULL_RTX
30356 && !TARGET_FUSE_ALU_AND_BRANCH)
30357 return false;
30358
30359 if (compare_set == NULL_RTX)
30360 {
30361 int i;
30362 rtx pat = PATTERN (condgen);
30363 for (i = 0; i < XVECLEN (pat, 0); i++)
30364 if (GET_CODE (XVECEXP (pat, 0, i)) == SET)
30365 {
30366 rtx set_src = SET_SRC (XVECEXP (pat, 0, i));
30367 if (GET_CODE (set_src) == COMPARE)
30368 compare_set = XVECEXP (pat, 0, i);
30369 else
30370 alu_set = XVECEXP (pat, 0, i);
30371 }
30372 }
30373 if (compare_set == NULL_RTX)
30374 return false;
30375 src = SET_SRC (compare_set);
30376 if (GET_CODE (src) != COMPARE)
30377 return false;
30378
30379 /* Macro-fusion for cmp/test MEM-IMM + conditional jmp is not
30380 supported. */
30381 if ((MEM_P (XEXP (src, 0))
30382 && CONST_INT_P (XEXP (src, 1)))
30383 || (MEM_P (XEXP (src, 1))
30384 && CONST_INT_P (XEXP (src, 0))))
30385 return false;
30386
30387 /* No fusion for RIP-relative address. */
30388 if (MEM_P (XEXP (src, 0)))
30389 addr = XEXP (XEXP (src, 0), 0);
30390 else if (MEM_P (XEXP (src, 1)))
30391 addr = XEXP (XEXP (src, 1), 0);
30392
30393 if (addr) {
30394 ix86_address parts;
30395 int ok = ix86_decompose_address (addr, &parts);
30396 gcc_assert (ok);
30397
30398 if (rip_relative_addr_p (&parts))
30399 return false;
30400 }
30401
30402 test_if = SET_SRC (pc_set (condjmp));
30403 cond = XEXP (test_if, 0);
30404 ccode = GET_CODE (cond);
30405 /* Check whether the conditional jump uses the Sign or Overflow flags. */
30406 if (!TARGET_FUSE_CMP_AND_BRANCH_SOFLAGS
30407 && (ccode == GE
30408 || ccode == GT
30409 || ccode == LE
30410 || ccode == LT))
30411 return false;
30412
30413 /* Return true for TYPE_TEST and TYPE_ICMP. */
30414 if (get_attr_type (condgen) == TYPE_TEST
30415 || get_attr_type (condgen) == TYPE_ICMP)
30416 return true;
30417
30418 /* The following handles the macro-fusion case for alu + jmp. */
30419 if (!TARGET_FUSE_ALU_AND_BRANCH || !alu_set)
30420 return false;
30421
30422 /* No fusion for alu op with memory destination operand. */
30423 dest = SET_DEST (alu_set);
30424 if (MEM_P (dest))
30425 return false;
30426
30427 /* Macro-fusion for inc/dec + unsigned conditional jump is not
30428 supported. */
30429 if (get_attr_type (condgen) == TYPE_INCDEC
30430 && (ccode == GEU
30431 || ccode == GTU
30432 || ccode == LEU
30433 || ccode == LTU))
30434 return false;
30435
30436 return true;
30437 }
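/* As a rough illustration of the checks above (AT&T syntax; the labels and
   operands are made up, and actual fusion depends on the tuned-for
   microarchitecture):

       cmpl %esi, %edi          # TYPE_ICMP with register operands
       jne  .L3                 # fusible pair

       cmpl $1, (%rdi)          # MEM-IMM compare
       je   .L3                 # rejected above

       incl %eax                # TYPE_INCDEC
       jae  .L3                 # unsigned condition, rejected above  */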
30438
30439 /* Try to reorder the ready list to take advantage of Atom pipelined IMUL
30440 execution. It is applied if
30441 (1) an IMUL instruction is at the top of the list;
30442 (2) there is exactly one producer of an independent IMUL instruction in
30443 the ready list.
30444 Return the index of the IMUL producer if it was found and -1 otherwise. */
30445 static int
30446 do_reorder_for_imul (rtx_insn **ready, int n_ready)
30447 {
30448 rtx_insn *insn;
30449 rtx set, insn1, insn2;
30450 sd_iterator_def sd_it;
30451 dep_t dep;
30452 int index = -1;
30453 int i;
30454
30455 if (!TARGET_BONNELL)
30456 return index;
30457
30458 /* Check that an IMUL instruction is at the top of the ready list. */
30459 insn = ready[n_ready - 1];
30460 set = single_set (insn);
30461 if (!set)
30462 return index;
30463 if (!(GET_CODE (SET_SRC (set)) == MULT
30464 && GET_MODE (SET_SRC (set)) == SImode))
30465 return index;
30466
30467 /* Search for producer of independent IMUL instruction. */
30468 for (i = n_ready - 2; i >= 0; i--)
30469 {
30470 insn = ready[i];
30471 if (!NONDEBUG_INSN_P (insn))
30472 continue;
30473 /* Skip IMUL instruction. */
30474 insn2 = PATTERN (insn);
30475 if (GET_CODE (insn2) == PARALLEL)
30476 insn2 = XVECEXP (insn2, 0, 0);
30477 if (GET_CODE (insn2) == SET
30478 && GET_CODE (SET_SRC (insn2)) == MULT
30479 && GET_MODE (SET_SRC (insn2)) == SImode)
30480 continue;
30481
30482 FOR_EACH_DEP (insn, SD_LIST_FORW, sd_it, dep)
30483 {
30484 rtx con;
30485 con = DEP_CON (dep);
30486 if (!NONDEBUG_INSN_P (con))
30487 continue;
30488 insn1 = PATTERN (con);
30489 if (GET_CODE (insn1) == PARALLEL)
30490 insn1 = XVECEXP (insn1, 0, 0);
30491
30492 if (GET_CODE (insn1) == SET
30493 && GET_CODE (SET_SRC (insn1)) == MULT
30494 && GET_MODE (SET_SRC (insn1)) == SImode)
30495 {
30496 sd_iterator_def sd_it1;
30497 dep_t dep1;
30498 /* Check that the IMUL has no producer other than INSN. */
30499 index = i;
30500 FOR_EACH_DEP (con, SD_LIST_BACK, sd_it1, dep1)
30501 {
30502 rtx pro;
30503 pro = DEP_PRO (dep1);
30504 if (!NONDEBUG_INSN_P (pro))
30505 continue;
30506 if (pro != insn)
30507 index = -1;
30508 }
30509 if (index >= 0)
30510 break;
30511 }
30512 }
30513 if (index >= 0)
30514 break;
30515 }
30516 return index;
30517 }
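/* A sketch of the intended effect on a Bonnell post-reload ready list
   (the top of the list is the last element; the instruction names are
   purely illustrative):

       ready = { ..., addl_A, imull_C }

   where addl_A is the sole producer of another, independent imull_B that is
   not yet ready. The index of addl_A is returned so that the caller can
   move it to the top; issuing it together with imull_C lets imull_B follow
   right behind in Atom's pipelined IMUL unit.  */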
30518
30519 /* Try to find the best candidate for the top of the ready list if two insns
30520 have the same priority - a candidate is best if its dependees were
30521 scheduled earlier. Applied to Silvermont only.
30522 Return true if the top 2 insns must be interchanged. */
30523 static bool
30524 swap_top_of_ready_list (rtx_insn **ready, int n_ready)
30525 {
30526 rtx_insn *top = ready[n_ready - 1];
30527 rtx_insn *next = ready[n_ready - 2];
30528 rtx set;
30529 sd_iterator_def sd_it;
30530 dep_t dep;
30531 int clock1 = -1;
30532 int clock2 = -1;
30533 #define INSN_TICK(INSN) (HID (INSN)->tick)
30534
30535 if (!TARGET_SILVERMONT && !TARGET_INTEL)
30536 return false;
30537
30538 if (!NONDEBUG_INSN_P (top))
30539 return false;
30540 if (!NONJUMP_INSN_P (top))
30541 return false;
30542 if (!NONDEBUG_INSN_P (next))
30543 return false;
30544 if (!NONJUMP_INSN_P (next))
30545 return false;
30546 set = single_set (top);
30547 if (!set)
30548 return false;
30549 set = single_set (next);
30550 if (!set)
30551 return false;
30552
30553 if (INSN_PRIORITY_KNOWN (top) && INSN_PRIORITY_KNOWN (next))
30554 {
30555 if (INSN_PRIORITY (top) != INSN_PRIORITY (next))
30556 return false;
30557 /* Determine the winner more precisely. */
30558 FOR_EACH_DEP (top, SD_LIST_RES_BACK, sd_it, dep)
30559 {
30560 rtx pro;
30561 pro = DEP_PRO (dep);
30562 if (!NONDEBUG_INSN_P (pro))
30563 continue;
30564 if (INSN_TICK (pro) > clock1)
30565 clock1 = INSN_TICK (pro);
30566 }
30567 FOR_EACH_DEP (next, SD_LIST_RES_BACK, sd_it, dep)
30568 {
30569 rtx pro;
30570 pro = DEP_PRO (dep);
30571 if (!NONDEBUG_INSN_P (pro))
30572 continue;
30573 if (INSN_TICK (pro) > clock2)
30574 clock2 = INSN_TICK (pro);
30575 }
30576
30577 if (clock1 == clock2)
30578 {
30579 /* Determine winner - load must win. */
30580 enum attr_memory memory1, memory2;
30581 memory1 = get_attr_memory (top);
30582 memory2 = get_attr_memory (next);
30583 if (memory2 == MEMORY_LOAD && memory1 != MEMORY_LOAD)
30584 return true;
30585 }
30586 return (bool) (clock2 < clock1);
30587 }
30588 return false;
30589 #undef INSN_TICK
30590 }
30591
30592 /* Perform possible reordering of the ready list for Atom/Silvermont only.
30593 Return the issue rate. */
30594 static int
30595 ix86_sched_reorder (FILE *dump, int sched_verbose, rtx_insn **ready,
30596 int *pn_ready, int clock_var)
30597 {
30598 int issue_rate = -1;
30599 int n_ready = *pn_ready;
30600 int i;
30601 rtx_insn *insn;
30602 int index = -1;
30603
30604 /* Set up issue rate. */
30605 issue_rate = ix86_issue_rate ();
30606
30607 /* Do reordering for BONNELL/SILVERMONT only. */
30608 if (!TARGET_BONNELL && !TARGET_SILVERMONT && !TARGET_INTEL)
30609 return issue_rate;
30610
30611 /* Nothing to do if ready list contains only 1 instruction. */
30612 if (n_ready <= 1)
30613 return issue_rate;
30614
30615 /* Do reordering for the post-reload scheduler only. */
30616 if (!reload_completed)
30617 return issue_rate;
30618
30619 if ((index = do_reorder_for_imul (ready, n_ready)) >= 0)
30620 {
30621 if (sched_verbose > 1)
30622 fprintf (dump, ";;\tatom sched_reorder: put %d insn on top\n",
30623 INSN_UID (ready[index]));
30624
30625 /* Put IMUL producer (ready[index]) at the top of ready list. */
30626 insn = ready[index];
30627 for (i = index; i < n_ready - 1; i++)
30628 ready[i] = ready[i + 1];
30629 ready[n_ready - 1] = insn;
30630 return issue_rate;
30631 }
30632
30633 /* Skip selective scheduling since HID is not populated in it. */
30634 if (clock_var != 0
30635 && !sel_sched_p ()
30636 && swap_top_of_ready_list (ready, n_ready))
30637 {
30638 if (sched_verbose > 1)
30639 fprintf (dump, ";;\tslm sched_reorder: swap %d and %d insns\n",
30640 INSN_UID (ready[n_ready - 1]), INSN_UID (ready[n_ready - 2]));
30641 /* Swap 2 top elements of ready list. */
30642 insn = ready[n_ready - 1];
30643 ready[n_ready - 1] = ready[n_ready - 2];
30644 ready[n_ready - 2] = insn;
30645 }
30646 return issue_rate;
30647 }
30648
30649 static bool
30650 ix86_class_likely_spilled_p (reg_class_t);
30651
30652 /* Return true if the lhs of INSN is a HW function argument register, and set
30653 *IS_SPILLED to true if it is a likely-spilled HW register. */
30654 static bool
30655 insn_is_function_arg (rtx insn, bool* is_spilled)
30656 {
30657 rtx dst;
30658
30659 if (!NONDEBUG_INSN_P (insn))
30660 return false;
30661 /* Call instructions are not movable; ignore them. */
30662 if (CALL_P (insn))
30663 return false;
30664 insn = PATTERN (insn);
30665 if (GET_CODE (insn) == PARALLEL)
30666 insn = XVECEXP (insn, 0, 0);
30667 if (GET_CODE (insn) != SET)
30668 return false;
30669 dst = SET_DEST (insn);
30670 if (REG_P (dst) && HARD_REGISTER_P (dst)
30671 && ix86_function_arg_regno_p (REGNO (dst)))
30672 {
30673 /* Is it a likely-spilled HW register? */
30674 if (!TEST_HARD_REG_BIT (fixed_reg_set, REGNO (dst))
30675 && ix86_class_likely_spilled_p (REGNO_REG_CLASS (REGNO (dst))))
30676 *is_spilled = true;
30677 return true;
30678 }
30679 return false;
30680 }
30681
30682 /* Add output dependencies for a chain of adjacent function arguments, but
30683 only if there is a move to a likely-spilled HW register. Return the first
30684 argument if at least one dependence was added, or NULL otherwise. */
30685 static rtx_insn *
30686 add_parameter_dependencies (rtx_insn *call, rtx_insn *head)
30687 {
30688 rtx_insn *insn;
30689 rtx_insn *last = call;
30690 rtx_insn *first_arg = NULL;
30691 bool is_spilled = false;
30692
30693 head = PREV_INSN (head);
30694
30695 /* Find the argument-passing instruction nearest to the call. */
30696 while (true)
30697 {
30698 last = PREV_INSN (last);
30699 if (last == head)
30700 return NULL;
30701 if (!NONDEBUG_INSN_P (last))
30702 continue;
30703 if (insn_is_function_arg (last, &is_spilled))
30704 break;
30705 return NULL;
30706 }
30707
30708 first_arg = last;
30709 while (true)
30710 {
30711 insn = PREV_INSN (last);
30712 if (!INSN_P (insn))
30713 break;
30714 if (insn == head)
30715 break;
30716 if (!NONDEBUG_INSN_P (insn))
30717 {
30718 last = insn;
30719 continue;
30720 }
30721 if (insn_is_function_arg (insn, &is_spilled))
30722 {
30723 /* Add an output dependence between two function arguments if the chain
30724 of output arguments contains likely-spilled HW registers. */
30725 if (is_spilled)
30726 add_dependence (first_arg, insn, REG_DEP_OUTPUT);
30727 first_arg = last = insn;
30728 }
30729 else
30730 break;
30731 }
30732 if (!is_spilled)
30733 return NULL;
30734 return first_arg;
30735 }
30736
30737 /* Add output or anti dependency from insn to first_arg to restrict its code
30738 motion. */
30739 static void
30740 avoid_func_arg_motion (rtx_insn *first_arg, rtx_insn *insn)
30741 {
30742 rtx set;
30743 rtx tmp;
30744
30745 /* Add anti dependencies for bounds stores. */
30746 if (INSN_P (insn)
30747 && GET_CODE (PATTERN (insn)) == PARALLEL
30748 && GET_CODE (XVECEXP (PATTERN (insn), 0, 0)) == UNSPEC
30749 && XINT (XVECEXP (PATTERN (insn), 0, 0), 1) == UNSPEC_BNDSTX)
30750 {
30751 add_dependence (first_arg, insn, REG_DEP_ANTI);
30752 return;
30753 }
30754
30755 set = single_set (insn);
30756 if (!set)
30757 return;
30758 tmp = SET_DEST (set);
30759 if (REG_P (tmp))
30760 {
30761 /* Add output dependency to the first function argument. */
30762 add_dependence (first_arg, insn, REG_DEP_OUTPUT);
30763 return;
30764 }
30765 /* Add anti dependency. */
30766 add_dependence (first_arg, insn, REG_DEP_ANTI);
30767 }
30768
30769 /* Avoid cross-block motion of a function argument by adding a dependency
30770 from the first non-jump instruction in BB. */
30771 static void
30772 add_dependee_for_func_arg (rtx_insn *arg, basic_block bb)
30773 {
30774 rtx_insn *insn = BB_END (bb);
30775
30776 while (insn)
30777 {
30778 if (NONDEBUG_INSN_P (insn) && NONJUMP_INSN_P (insn))
30779 {
30780 rtx set = single_set (insn);
30781 if (set)
30782 {
30783 avoid_func_arg_motion (arg, insn);
30784 return;
30785 }
30786 }
30787 if (insn == BB_HEAD (bb))
30788 return;
30789 insn = PREV_INSN (insn);
30790 }
30791 }
30792
30793 /* Hook for the pre-reload scheduler - avoid motion of function arguments
30794 passed in likely-spilled HW registers. */
30795 static void
30796 ix86_dependencies_evaluation_hook (rtx_insn *head, rtx_insn *tail)
30797 {
30798 rtx_insn *insn;
30799 rtx_insn *first_arg = NULL;
30800 if (reload_completed)
30801 return;
30802 while (head != tail && DEBUG_INSN_P (head))
30803 head = NEXT_INSN (head);
30804 for (insn = tail; insn != head; insn = PREV_INSN (insn))
30805 if (INSN_P (insn) && CALL_P (insn))
30806 {
30807 first_arg = add_parameter_dependencies (insn, head);
30808 if (first_arg)
30809 {
30810 /* Add a dependee for the first argument to predecessors, but only
30811 if the region contains more than one block. */
30812 basic_block bb = BLOCK_FOR_INSN (insn);
30813 int rgn = CONTAINING_RGN (bb->index);
30814 int nr_blks = RGN_NR_BLOCKS (rgn);
30815 /* Skip trivial regions and region head blocks that can have
30816 predecessors outside of region. */
30817 if (nr_blks > 1 && BLOCK_TO_BB (bb->index) != 0)
30818 {
30819 edge e;
30820 edge_iterator ei;
30821
30822 /* Regions are SCCs with the exception of selective
30823 scheduling with pipelining of outer blocks enabled.
30824 So also check that immediate predecessors of a non-head
30825 block are in the same region. */
30826 FOR_EACH_EDGE (e, ei, bb->preds)
30827 {
30828 /* Avoid creating loop-carried dependencies by
30829 using the topological ordering in the region. */
30830 if (rgn == CONTAINING_RGN (e->src->index)
30831 && BLOCK_TO_BB (bb->index) > BLOCK_TO_BB (e->src->index))
30832 add_dependee_for_func_arg (first_arg, e->src);
30833 }
30834 }
30835 insn = first_arg;
30836 if (insn == head)
30837 break;
30838 }
30839 }
30840 else if (first_arg)
30841 avoid_func_arg_motion (first_arg, insn);
30842 }
30843
30844 /* Hook for the pre-reload scheduler - set the priority of moves from
30845 likely-spilled HW registers to the maximum, to schedule them as soon as
30846 possible. These are moves from function argument registers at the top of
30847 the function entry and moves from function return value registers after a call. */
30848 static int
30849 ix86_adjust_priority (rtx_insn *insn, int priority)
30850 {
30851 rtx set;
30852
30853 if (reload_completed)
30854 return priority;
30855
30856 if (!NONDEBUG_INSN_P (insn))
30857 return priority;
30858
30859 set = single_set (insn);
30860 if (set)
30861 {
30862 rtx tmp = SET_SRC (set);
30863 if (REG_P (tmp)
30864 && HARD_REGISTER_P (tmp)
30865 && !TEST_HARD_REG_BIT (fixed_reg_set, REGNO (tmp))
30866 && ix86_class_likely_spilled_p (REGNO_REG_CLASS (REGNO (tmp))))
30867 return current_sched_info->sched_max_insns_priority;
30868 }
30869
30870 return priority;
30871 }
30872
30873 /* Model the decoder of Core 2/i7.
30874 The hooks below for multipass scheduling (see haifa-sched.c:max_issue)
30875 track the instruction fetch block boundaries and make sure that long
30876 (9+ byte) instructions are assigned to decoder D0. */
30877
30878 /* Maximum length of an insn that can be handled by
30879 a secondary decoder unit. '8' for Core 2/i7. */
30880 static int core2i7_secondary_decoder_max_insn_size;
30881
30882 /* Ifetch block size, i.e., number of bytes decoder reads per cycle.
30883 '16' for Core 2/i7. */
30884 static int core2i7_ifetch_block_size;
30885
30886 /* Maximum number of instructions decoder can handle per cycle.
30887 '6' for Core 2/i7. */
30888 static int core2i7_ifetch_block_max_insns;
30889
30890 typedef struct ix86_first_cycle_multipass_data_ *
30891 ix86_first_cycle_multipass_data_t;
30892 typedef const struct ix86_first_cycle_multipass_data_ *
30893 const_ix86_first_cycle_multipass_data_t;
30894
30895 /* A variable to store target state across calls to max_issue within
30896 one cycle. */
30897 static struct ix86_first_cycle_multipass_data_ _ix86_first_cycle_multipass_data,
30898 *ix86_first_cycle_multipass_data = &_ix86_first_cycle_multipass_data;
30899
30900 /* Initialize DATA. */
30901 static void
30902 core2i7_first_cycle_multipass_init (void *_data)
30903 {
30904 ix86_first_cycle_multipass_data_t data
30905 = (ix86_first_cycle_multipass_data_t) _data;
30906
30907 data->ifetch_block_len = 0;
30908 data->ifetch_block_n_insns = 0;
30909 data->ready_try_change = NULL;
30910 data->ready_try_change_size = 0;
30911 }
30912
30913 /* Advancing the cycle; reset ifetch block counts. */
30914 static void
30915 core2i7_dfa_post_advance_cycle (void)
30916 {
30917 ix86_first_cycle_multipass_data_t data = ix86_first_cycle_multipass_data;
30918
30919 gcc_assert (data->ifetch_block_n_insns <= core2i7_ifetch_block_max_insns);
30920
30921 data->ifetch_block_len = 0;
30922 data->ifetch_block_n_insns = 0;
30923 }
30924
30925 static int min_insn_size (rtx_insn *);
30926
30927 /* Filter out insns from ready_try that the core will not be able to issue
30928 on the current cycle due to decoder restrictions. */
30929 static void
30930 core2i7_first_cycle_multipass_filter_ready_try
30931 (const_ix86_first_cycle_multipass_data_t data,
30932 signed char *ready_try, int n_ready, bool first_cycle_insn_p)
30933 {
30934 while (n_ready--)
30935 {
30936 rtx_insn *insn;
30937 int insn_size;
30938
30939 if (ready_try[n_ready])
30940 continue;
30941
30942 insn = get_ready_element (n_ready);
30943 insn_size = min_insn_size (insn);
30944
30945 if (/* If this is too long an insn for a secondary decoder ... */
30946 (!first_cycle_insn_p
30947 && insn_size > core2i7_secondary_decoder_max_insn_size)
30948 /* ... or it would not fit into the ifetch block ... */
30949 || data->ifetch_block_len + insn_size > core2i7_ifetch_block_size
30950 /* ... or the decoder is full already ... */
30951 || data->ifetch_block_n_insns + 1 > core2i7_ifetch_block_max_insns)
30952 /* ... mask the insn out. */
30953 {
30954 ready_try[n_ready] = 1;
30955
30956 if (data->ready_try_change)
30957 bitmap_set_bit (data->ready_try_change, n_ready);
30958 }
30959 }
30960 }
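/* A worked example using the Core 2/i7 parameters set up later in
   ix86_sched_init_global (16-byte ifetch block, at most 6 insns per block,
   8-byte limit for the secondary decoders): an insn longer than 8 bytes is
   masked out unless it would be the first insn of the cycle, and once the
   current block already holds 14 bytes (or 6 insns), even a 4-byte
   candidate is rejected because it would overflow the block.  */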
30961
30962 /* Prepare for a new round of multipass lookahead scheduling. */
30963 static void
30964 core2i7_first_cycle_multipass_begin (void *_data,
30965 signed char *ready_try, int n_ready,
30966 bool first_cycle_insn_p)
30967 {
30968 ix86_first_cycle_multipass_data_t data
30969 = (ix86_first_cycle_multipass_data_t) _data;
30970 const_ix86_first_cycle_multipass_data_t prev_data
30971 = ix86_first_cycle_multipass_data;
30972
30973 /* Restore the state from the end of the previous round. */
30974 data->ifetch_block_len = prev_data->ifetch_block_len;
30975 data->ifetch_block_n_insns = prev_data->ifetch_block_n_insns;
30976
30977 /* Filter instructions that cannot be issued on current cycle due to
30978 decoder restrictions. */
30979 core2i7_first_cycle_multipass_filter_ready_try (data, ready_try, n_ready,
30980 first_cycle_insn_p);
30981 }
30982
30983 /* INSN is being issued in current solution. Account for its impact on
30984 the decoder model. */
30985 static void
30986 core2i7_first_cycle_multipass_issue (void *_data,
30987 signed char *ready_try, int n_ready,
30988 rtx_insn *insn, const void *_prev_data)
30989 {
30990 ix86_first_cycle_multipass_data_t data
30991 = (ix86_first_cycle_multipass_data_t) _data;
30992 const_ix86_first_cycle_multipass_data_t prev_data
30993 = (const_ix86_first_cycle_multipass_data_t) _prev_data;
30994
30995 int insn_size = min_insn_size (insn);
30996
30997 data->ifetch_block_len = prev_data->ifetch_block_len + insn_size;
30998 data->ifetch_block_n_insns = prev_data->ifetch_block_n_insns + 1;
30999 gcc_assert (data->ifetch_block_len <= core2i7_ifetch_block_size
31000 && data->ifetch_block_n_insns <= core2i7_ifetch_block_max_insns);
31001
31002 /* Allocate or resize the bitmap for storing INSN's effect on ready_try. */
31003 if (!data->ready_try_change)
31004 {
31005 data->ready_try_change = sbitmap_alloc (n_ready);
31006 data->ready_try_change_size = n_ready;
31007 }
31008 else if (data->ready_try_change_size < n_ready)
31009 {
31010 data->ready_try_change = sbitmap_resize (data->ready_try_change,
31011 n_ready, 0);
31012 data->ready_try_change_size = n_ready;
31013 }
31014 bitmap_clear (data->ready_try_change);
31015
31016 /* Filter out insns from ready_try that the core will not be able to issue
31017 on the current cycle due to decoder restrictions. */
31018 core2i7_first_cycle_multipass_filter_ready_try (data, ready_try, n_ready,
31019 false);
31020 }
31021
31022 /* Revert the effect on ready_try. */
31023 static void
31024 core2i7_first_cycle_multipass_backtrack (const void *_data,
31025 signed char *ready_try,
31026 int n_ready ATTRIBUTE_UNUSED)
31027 {
31028 const_ix86_first_cycle_multipass_data_t data
31029 = (const_ix86_first_cycle_multipass_data_t) _data;
31030 unsigned int i = 0;
31031 sbitmap_iterator sbi;
31032
31033 gcc_assert (bitmap_last_set_bit (data->ready_try_change) < n_ready);
31034 EXECUTE_IF_SET_IN_BITMAP (data->ready_try_change, 0, i, sbi)
31035 {
31036 ready_try[i] = 0;
31037 }
31038 }
31039
31040 /* Save the result of multipass lookahead scheduling for the next round. */
31041 static void
31042 core2i7_first_cycle_multipass_end (const void *_data)
31043 {
31044 const_ix86_first_cycle_multipass_data_t data
31045 = (const_ix86_first_cycle_multipass_data_t) _data;
31046 ix86_first_cycle_multipass_data_t next_data
31047 = ix86_first_cycle_multipass_data;
31048
31049 if (data != NULL)
31050 {
31051 next_data->ifetch_block_len = data->ifetch_block_len;
31052 next_data->ifetch_block_n_insns = data->ifetch_block_n_insns;
31053 }
31054 }
31055
31056 /* Deallocate target data. */
31057 static void
31058 core2i7_first_cycle_multipass_fini (void *_data)
31059 {
31060 ix86_first_cycle_multipass_data_t data
31061 = (ix86_first_cycle_multipass_data_t) _data;
31062
31063 if (data->ready_try_change)
31064 {
31065 sbitmap_free (data->ready_try_change);
31066 data->ready_try_change = NULL;
31067 data->ready_try_change_size = 0;
31068 }
31069 }
31070
31071 /* Prepare for scheduling pass. */
31072 static void
31073 ix86_sched_init_global (FILE *, int, int)
31074 {
31075 /* Install scheduling hooks for current CPU. Some of these hooks are used
31076 in time-critical parts of the scheduler, so we only set them up when
31077 they are actually used. */
31078 switch (ix86_tune)
31079 {
31080 case PROCESSOR_CORE2:
31081 case PROCESSOR_NEHALEM:
31082 case PROCESSOR_SANDYBRIDGE:
31083 case PROCESSOR_HASWELL:
31084 /* Do not perform multipass scheduling for pre-reload schedule
31085 to save compile time. */
31086 if (reload_completed)
31087 {
31088 targetm.sched.dfa_post_advance_cycle
31089 = core2i7_dfa_post_advance_cycle;
31090 targetm.sched.first_cycle_multipass_init
31091 = core2i7_first_cycle_multipass_init;
31092 targetm.sched.first_cycle_multipass_begin
31093 = core2i7_first_cycle_multipass_begin;
31094 targetm.sched.first_cycle_multipass_issue
31095 = core2i7_first_cycle_multipass_issue;
31096 targetm.sched.first_cycle_multipass_backtrack
31097 = core2i7_first_cycle_multipass_backtrack;
31098 targetm.sched.first_cycle_multipass_end
31099 = core2i7_first_cycle_multipass_end;
31100 targetm.sched.first_cycle_multipass_fini
31101 = core2i7_first_cycle_multipass_fini;
31102
31103 /* Set decoder parameters. */
31104 core2i7_secondary_decoder_max_insn_size = 8;
31105 core2i7_ifetch_block_size = 16;
31106 core2i7_ifetch_block_max_insns = 6;
31107 break;
31108 }
31109 /* Fall through. */
31110 default:
31111 targetm.sched.dfa_post_advance_cycle = NULL;
31112 targetm.sched.first_cycle_multipass_init = NULL;
31113 targetm.sched.first_cycle_multipass_begin = NULL;
31114 targetm.sched.first_cycle_multipass_issue = NULL;
31115 targetm.sched.first_cycle_multipass_backtrack = NULL;
31116 targetm.sched.first_cycle_multipass_end = NULL;
31117 targetm.sched.first_cycle_multipass_fini = NULL;
31118 break;
31119 }
31120 }
31121
31122 \f
31123 /* Compute the alignment given to a constant that is being placed in memory.
31124 EXP is the constant and ALIGN is the alignment that the object would
31125 ordinarily have.
31126 The value of this function is used instead of that alignment to align
31127 the object. */
31128
31129 int
31130 ix86_constant_alignment (tree exp, int align)
31131 {
31132 if (TREE_CODE (exp) == REAL_CST || TREE_CODE (exp) == VECTOR_CST
31133 || TREE_CODE (exp) == INTEGER_CST)
31134 {
31135 if (TYPE_MODE (TREE_TYPE (exp)) == DFmode && align < 64)
31136 return 64;
31137 else if (ALIGN_MODE_128 (TYPE_MODE (TREE_TYPE (exp))) && align < 128)
31138 return 128;
31139 }
31140 else if (!optimize_size && TREE_CODE (exp) == STRING_CST
31141 && TREE_STRING_LENGTH (exp) >= 31 && align < BITS_PER_WORD)
31142 return BITS_PER_WORD;
31143
31144 return align;
31145 }
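/* For example, under the rules above a DFmode constant placed in the
   constant pool is given 64-bit alignment, and a string constant with a
   TREE_STRING_LENGTH of at least 31 is aligned to BITS_PER_WORD unless we
   are optimizing for size.  */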
31146
31147 /* Compute the alignment for a variable for Intel MCU psABI. TYPE is
31148 the data type, and ALIGN is the alignment that the object would
31149 ordinarily have. */
31150
31151 static int
31152 iamcu_alignment (tree type, int align)
31153 {
31154 enum machine_mode mode;
31155
31156 if (align < 32 || TYPE_USER_ALIGN (type))
31157 return align;
31158
31159 /* The Intel MCU psABI specifies that scalar types larger than 4 bytes are
31160 aligned to 4 bytes. */
31161 mode = TYPE_MODE (strip_array_types (type));
31162 switch (GET_MODE_CLASS (mode))
31163 {
31164 case MODE_INT:
31165 case MODE_COMPLEX_INT:
31166 case MODE_COMPLEX_FLOAT:
31167 case MODE_FLOAT:
31168 case MODE_DECIMAL_FLOAT:
31169 return 32;
31170 default:
31171 return align;
31172 }
31173 }
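/* For instance, with the IAMCU handling above a "long long" or "double"
   whose incoming alignment request is 64 bits is capped at 32 bits, while
   requests already below 32 bits and user-specified alignments are passed
   through unchanged.  */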
31174
31175 /* Compute the alignment for a static variable.
31176 TYPE is the data type, and ALIGN is the alignment that
31177 the object would ordinarily have. The value of this function is used
31178 instead of that alignment to align the object. */
31179
31180 int
31181 ix86_data_alignment (tree type, int align, bool opt)
31182 {
31183 /* GCC 4.8 and earlier used to incorrectly assume this alignment even
31184 for symbols from other compilation units or symbols that don't need
31185 to bind locally. In order to preserve some ABI compatibility with
31186 those compilers, ensure we don't decrease alignment from what we
31187 used to assume. */
31188
31189 int max_align_compat = MIN (256, MAX_OFILE_ALIGNMENT);
31190
31191 /* A data structure equal to or greater than the size of a cache line
31192 (64 bytes in the Pentium 4 and other recent Intel processors, including
31193 processors based on the Intel Core microarchitecture) should be aligned
31194 so that its base address is a multiple of the cache line size. */
31195
31196 int max_align
31197 = MIN ((unsigned) ix86_tune_cost->prefetch_block * 8, MAX_OFILE_ALIGNMENT);
31198
31199 if (max_align < BITS_PER_WORD)
31200 max_align = BITS_PER_WORD;
31201
31202 switch (ix86_align_data_type)
31203 {
31204 case ix86_align_data_type_abi: opt = false; break;
31205 case ix86_align_data_type_compat: max_align = BITS_PER_WORD; break;
31206 case ix86_align_data_type_cacheline: break;
31207 }
31208
31209 if (TARGET_IAMCU)
31210 align = iamcu_alignment (type, align);
31211
31212 if (opt
31213 && AGGREGATE_TYPE_P (type)
31214 && TYPE_SIZE (type)
31215 && TREE_CODE (TYPE_SIZE (type)) == INTEGER_CST)
31216 {
31217 if (wi::geu_p (TYPE_SIZE (type), max_align_compat)
31218 && align < max_align_compat)
31219 align = max_align_compat;
31220 if (wi::geu_p (TYPE_SIZE (type), max_align)
31221 && align < max_align)
31222 align = max_align;
31223 }
31224
31225 /* The x86-64 ABI requires arrays greater than 16 bytes to be aligned
31226 to a 16-byte boundary. */
31227 if (TARGET_64BIT)
31228 {
31229 if ((opt ? AGGREGATE_TYPE_P (type) : TREE_CODE (type) == ARRAY_TYPE)
31230 && TYPE_SIZE (type)
31231 && TREE_CODE (TYPE_SIZE (type)) == INTEGER_CST
31232 && wi::geu_p (TYPE_SIZE (type), 128)
31233 && align < 128)
31234 return 128;
31235 }
31236
31237 if (!opt)
31238 return align;
31239
31240 if (TREE_CODE (type) == ARRAY_TYPE)
31241 {
31242 if (TYPE_MODE (TREE_TYPE (type)) == DFmode && align < 64)
31243 return 64;
31244 if (ALIGN_MODE_128 (TYPE_MODE (TREE_TYPE (type))) && align < 128)
31245 return 128;
31246 }
31247 else if (TREE_CODE (type) == COMPLEX_TYPE)
31248 {
31249
31250 if (TYPE_MODE (type) == DCmode && align < 64)
31251 return 64;
31252 if ((TYPE_MODE (type) == XCmode
31253 || TYPE_MODE (type) == TCmode) && align < 128)
31254 return 128;
31255 }
31256 else if ((TREE_CODE (type) == RECORD_TYPE
31257 || TREE_CODE (type) == UNION_TYPE
31258 || TREE_CODE (type) == QUAL_UNION_TYPE)
31259 && TYPE_FIELDS (type))
31260 {
31261 if (DECL_MODE (TYPE_FIELDS (type)) == DFmode && align < 64)
31262 return 64;
31263 if (ALIGN_MODE_128 (DECL_MODE (TYPE_FIELDS (type))) && align < 128)
31264 return 128;
31265 }
31266 else if (TREE_CODE (type) == REAL_TYPE || TREE_CODE (type) == VECTOR_TYPE
31267 || TREE_CODE (type) == INTEGER_TYPE)
31268 {
31269 if (TYPE_MODE (type) == DFmode && align < 64)
31270 return 64;
31271 if (ALIGN_MODE_128 (TYPE_MODE (type)) && align < 128)
31272 return 128;
31273 }
31274
31275 return align;
31276 }
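/* Two illustrative cases for the logic above (x86-64, default tuning
   assumed): a 256-byte global array of "double" is raised to at least
   128-bit alignment by the ABI clause, and when OPT is true large
   aggregates may further be raised up to ix86_tune_cost->prefetch_block * 8
   bits, i.e. cache-line alignment.  */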
31277
31278 /* Compute the alignment for a local variable or a stack slot. EXP is
31279 the data type or decl itself, MODE is the widest mode available and
31280 ALIGN is the alignment that the object would ordinarily have. The
31281 value of this macro is used instead of that alignment to align the
31282 object. */
31283
31284 unsigned int
31285 ix86_local_alignment (tree exp, machine_mode mode,
31286 unsigned int align)
31287 {
31288 tree type, decl;
31289
31290 if (exp && DECL_P (exp))
31291 {
31292 type = TREE_TYPE (exp);
31293 decl = exp;
31294 }
31295 else
31296 {
31297 type = exp;
31298 decl = NULL;
31299 }
31300
31301 /* Don't do dynamic stack realignment for long long objects with
31302 -mpreferred-stack-boundary=2. */
31303 if (!TARGET_64BIT
31304 && align == 64
31305 && ix86_preferred_stack_boundary < 64
31306 && (mode == DImode || (type && TYPE_MODE (type) == DImode))
31307 && (!type || !TYPE_USER_ALIGN (type))
31308 && (!decl || !DECL_USER_ALIGN (decl)))
31309 align = 32;
31310
31311 /* If TYPE is NULL, we are allocating a stack slot for a caller-save
31312 register in MODE. Return the larger of the XFmode and DFmode
31313 alignments. */
31314 if (!type)
31315 {
31316 if (mode == XFmode && align < GET_MODE_ALIGNMENT (DFmode))
31317 align = GET_MODE_ALIGNMENT (DFmode);
31318 return align;
31319 }
31320
31321 /* Don't increase alignment for Intel MCU psABI. */
31322 if (TARGET_IAMCU)
31323 return align;
31324
31325 /* The x86-64 ABI requires arrays greater than 16 bytes to be aligned
31326 to a 16-byte boundary. The exact wording is:
31327 
31328 An array uses the same alignment as its elements, except that a local or
31329 global array variable of length at least 16 bytes or
31330 a C99 variable-length array variable always has alignment of at least 16 bytes.
31331 
31332 This was added to allow use of aligned SSE instructions on arrays. The
31333 rule is meant for static storage (where the compiler cannot do the analysis
31334 by itself). We follow it for automatic variables only when convenient.
31335 We fully control everything in the function being compiled, and functions
31336 from other units cannot rely on the alignment.
31337 
31338 Exclude the va_list type. It is the common case of a local array where
31339 we cannot benefit from the alignment.
31340 
31341 TODO: Probably one should optimize for size only when the variable does not escape. */
31342 if (TARGET_64BIT && optimize_function_for_speed_p (cfun)
31343 && TARGET_SSE)
31344 {
31345 if (AGGREGATE_TYPE_P (type)
31346 && (va_list_type_node == NULL_TREE
31347 || (TYPE_MAIN_VARIANT (type)
31348 != TYPE_MAIN_VARIANT (va_list_type_node)))
31349 && TYPE_SIZE (type)
31350 && TREE_CODE (TYPE_SIZE (type)) == INTEGER_CST
31351 && wi::geu_p (TYPE_SIZE (type), 128)
31352 && align < 128)
31353 return 128;
31354 }
31355 if (TREE_CODE (type) == ARRAY_TYPE)
31356 {
31357 if (TYPE_MODE (TREE_TYPE (type)) == DFmode && align < 64)
31358 return 64;
31359 if (ALIGN_MODE_128 (TYPE_MODE (TREE_TYPE (type))) && align < 128)
31360 return 128;
31361 }
31362 else if (TREE_CODE (type) == COMPLEX_TYPE)
31363 {
31364 if (TYPE_MODE (type) == DCmode && align < 64)
31365 return 64;
31366 if ((TYPE_MODE (type) == XCmode
31367 || TYPE_MODE (type) == TCmode) && align < 128)
31368 return 128;
31369 }
31370 else if ((TREE_CODE (type) == RECORD_TYPE
31371 || TREE_CODE (type) == UNION_TYPE
31372 || TREE_CODE (type) == QUAL_UNION_TYPE)
31373 && TYPE_FIELDS (type))
31374 {
31375 if (DECL_MODE (TYPE_FIELDS (type)) == DFmode && align < 64)
31376 return 64;
31377 if (ALIGN_MODE_128 (DECL_MODE (TYPE_FIELDS (type))) && align < 128)
31378 return 128;
31379 }
31380 else if (TREE_CODE (type) == REAL_TYPE || TREE_CODE (type) == VECTOR_TYPE
31381 || TREE_CODE (type) == INTEGER_TYPE)
31382 {
31383
31384 if (TYPE_MODE (type) == DFmode && align < 64)
31385 return 64;
31386 if (ALIGN_MODE_128 (TYPE_MODE (type)) && align < 128)
31387 return 128;
31388 }
31389 return align;
31390 }
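/* For example, with the rules above a local "char buf[32]" in a function
   compiled for speed with TARGET_SSE on x86-64 receives 128-bit alignment,
   a local va_list is explicitly excluded from that bump, and in 32-bit code
   a "long long" under -mpreferred-stack-boundary=2 is dropped back to
   32-bit alignment.  */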
31391
31392 /* Compute the minimum required alignment for dynamic stack realignment
31393 purposes for a local variable, parameter or a stack slot. EXP is
31394 the data type or decl itself, MODE is its mode and ALIGN is the
31395 alignment that the object would ordinarily have. */
31396
31397 unsigned int
31398 ix86_minimum_alignment (tree exp, machine_mode mode,
31399 unsigned int align)
31400 {
31401 tree type, decl;
31402
31403 if (exp && DECL_P (exp))
31404 {
31405 type = TREE_TYPE (exp);
31406 decl = exp;
31407 }
31408 else
31409 {
31410 type = exp;
31411 decl = NULL;
31412 }
31413
31414 if (TARGET_64BIT || align != 64 || ix86_preferred_stack_boundary >= 64)
31415 return align;
31416
31417 /* Don't do dynamic stack realignment for long long objects with
31418 -mpreferred-stack-boundary=2. */
31419 if ((mode == DImode || (type && TYPE_MODE (type) == DImode))
31420 && (!type || !TYPE_USER_ALIGN (type))
31421 && (!decl || !DECL_USER_ALIGN (decl)))
31422 {
31423 gcc_checking_assert (!TARGET_STV);
31424 return 32;
31425 }
31426
31427 return align;
31428 }
31429 \f
31430 /* Find a location for the static chain incoming to a nested function.
31431 This is a register, unless all free registers are used by arguments. */
31432
31433 static rtx
31434 ix86_static_chain (const_tree fndecl_or_type, bool incoming_p)
31435 {
31436 unsigned regno;
31437
31438 /* While this function won't be called by the middle-end when a static
31439 chain isn't needed, it's also used throughout the backend so it's
31440 easiest to keep this check centralized. */
31441 if (DECL_P (fndecl_or_type) && !DECL_STATIC_CHAIN (fndecl_or_type))
31442 return NULL;
31443
31444 if (TARGET_64BIT)
31445 {
31446 /* We always use R10 in 64-bit mode. */
31447 regno = R10_REG;
31448 }
31449 else
31450 {
31451 const_tree fntype, fndecl;
31452 unsigned int ccvt;
31453
31454 /* By default in 32-bit mode we use ECX to pass the static chain. */
31455 regno = CX_REG;
31456
31457 if (TREE_CODE (fndecl_or_type) == FUNCTION_DECL)
31458 {
31459 fntype = TREE_TYPE (fndecl_or_type);
31460 fndecl = fndecl_or_type;
31461 }
31462 else
31463 {
31464 fntype = fndecl_or_type;
31465 fndecl = NULL;
31466 }
31467
31468 ccvt = ix86_get_callcvt (fntype);
31469 if ((ccvt & IX86_CALLCVT_FASTCALL) != 0)
31470 {
31471 /* Fastcall functions use ecx/edx for arguments, which leaves
31472 us with EAX for the static chain.
31473 Thiscall functions use ecx for arguments, which also
31474 leaves us with EAX for the static chain. */
31475 regno = AX_REG;
31476 }
31477 else if ((ccvt & IX86_CALLCVT_THISCALL) != 0)
31478 {
31479 /* Thiscall functions use ecx for arguments, which leaves
31480 us with EAX and EDX for the static chain.
31481 We use EAX for ABI compatibility. */
31482 regno = AX_REG;
31483 }
31484 else if (ix86_function_regparm (fntype, fndecl) == 3)
31485 {
31486 /* For regparm 3, we have no free call-clobbered registers in
31487 which to store the static chain. In order to implement this,
31488 we have the trampoline push the static chain to the stack.
31489 However, we can't push a value below the return address when
31490 we call the nested function directly, so we have to use an
31491 alternate entry point. For this we use ESI, and have the
31492 alternate entry point push ESI, so that things appear the
31493 same once we're executing the nested function. */
31494 if (incoming_p)
31495 {
31496 if (fndecl == current_function_decl
31497 && !ix86_static_chain_on_stack)
31498 {
31499 gcc_assert (!reload_completed);
31500 ix86_static_chain_on_stack = true;
31501 }
31502 return gen_frame_mem (SImode,
31503 plus_constant (Pmode,
31504 arg_pointer_rtx, -8));
31505 }
31506 regno = SI_REG;
31507 }
31508 }
31509
31510 return gen_rtx_REG (Pmode, regno);
31511 }
31512
31513 /* Emit RTL insns to initialize the variable parts of a trampoline.
31514 FNDECL is the decl of the target address; M_TRAMP is a MEM for
31515 the trampoline, and CHAIN_VALUE is an RTX for the static chain
31516 to be passed to the target function. */
31517
31518 static void
31519 ix86_trampoline_init (rtx m_tramp, tree fndecl, rtx chain_value)
31520 {
31521 rtx mem, fnaddr;
31522 int opcode;
31523 int offset = 0;
31524
31525 fnaddr = XEXP (DECL_RTL (fndecl), 0);
31526
31527 if (TARGET_64BIT)
31528 {
31529 int size;
31530
31531 /* Load the function address into r11. Try to load the address using
31532 the shorter movl instead of movabs. We may want to support
31533 movq for kernel mode, but the kernel does not use trampolines at
31534 the moment. FNADDR is a 32-bit address and may not be in
31535 DImode when ptr_mode == SImode. Always use movl in this
31536 case. */
31537 if (ptr_mode == SImode
31538 || x86_64_zext_immediate_operand (fnaddr, VOIDmode))
31539 {
31540 fnaddr = copy_addr_to_reg (fnaddr);
31541
31542 mem = adjust_address (m_tramp, HImode, offset);
31543 emit_move_insn (mem, gen_int_mode (0xbb41, HImode));
31544
31545 mem = adjust_address (m_tramp, SImode, offset + 2);
31546 emit_move_insn (mem, gen_lowpart (SImode, fnaddr));
31547 offset += 6;
31548 }
31549 else
31550 {
31551 mem = adjust_address (m_tramp, HImode, offset);
31552 emit_move_insn (mem, gen_int_mode (0xbb49, HImode));
31553
31554 mem = adjust_address (m_tramp, DImode, offset + 2);
31555 emit_move_insn (mem, fnaddr);
31556 offset += 10;
31557 }
31558
31559 /* Load the static chain into r10 using movabs. Use the shorter movl
31560 instead of movabs when ptr_mode == SImode. */
31561 if (ptr_mode == SImode)
31562 {
31563 opcode = 0xba41;
31564 size = 6;
31565 }
31566 else
31567 {
31568 opcode = 0xba49;
31569 size = 10;
31570 }
31571
31572 mem = adjust_address (m_tramp, HImode, offset);
31573 emit_move_insn (mem, gen_int_mode (opcode, HImode));
31574
31575 mem = adjust_address (m_tramp, ptr_mode, offset + 2);
31576 emit_move_insn (mem, chain_value);
31577 offset += size;
31578
31579 /* Jump to r11; the last (unused) byte is a nop, only there to
31580 pad the write out to a single 32-bit store. */
31581 mem = adjust_address (m_tramp, SImode, offset);
31582 emit_move_insn (mem, gen_int_mode (0x90e3ff49, SImode));
31583 offset += 4;
31584 }
31585 else
31586 {
31587 rtx disp, chain;
31588
31589 /* Depending on the static chain location, either load a register
31590 with a constant, or push the constant to the stack. All of the
31591 instructions are the same size. */
31592 chain = ix86_static_chain (fndecl, true);
31593 if (REG_P (chain))
31594 {
31595 switch (REGNO (chain))
31596 {
31597 case AX_REG:
31598 opcode = 0xb8; break;
31599 case CX_REG:
31600 opcode = 0xb9; break;
31601 default:
31602 gcc_unreachable ();
31603 }
31604 }
31605 else
31606 opcode = 0x68;
31607
31608 mem = adjust_address (m_tramp, QImode, offset);
31609 emit_move_insn (mem, gen_int_mode (opcode, QImode));
31610
31611 mem = adjust_address (m_tramp, SImode, offset + 1);
31612 emit_move_insn (mem, chain_value);
31613 offset += 5;
31614
31615 mem = adjust_address (m_tramp, QImode, offset);
31616 emit_move_insn (mem, gen_int_mode (0xe9, QImode));
31617
31618 mem = adjust_address (m_tramp, SImode, offset + 1);
31619
31620 /* Compute offset from the end of the jmp to the target function.
31621 In the case in which the trampoline stores the static chain on
31622 the stack, we need to skip the first insn which pushes the
31623 (call-saved) register static chain; this push is 1 byte. */
31624 offset += 5;
31625 disp = expand_binop (SImode, sub_optab, fnaddr,
31626 plus_constant (Pmode, XEXP (m_tramp, 0),
31627 offset - (MEM_P (chain) ? 1 : 0)),
31628 NULL_RTX, 1, OPTAB_DIRECT);
31629 emit_move_insn (mem, disp);
31630 }
31631
31632 gcc_assert (offset <= TRAMPOLINE_SIZE);
31633
31634 #ifdef HAVE_ENABLE_EXECUTE_STACK
31635 #ifdef CHECK_EXECUTE_STACK_ENABLED
31636 if (CHECK_EXECUTE_STACK_ENABLED)
31637 #endif
31638 emit_library_call (gen_rtx_SYMBOL_REF (Pmode, "__enable_execute_stack"),
31639 LCT_NORMAL, VOIDmode, 1, XEXP (m_tramp, 0), Pmode);
31640 #endif
31641 }
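/* For reference, the 64-bit trampoline emitted above decodes to the
   following byte sequence when ptr_mode == DImode (a sketch; the SImode
   variants use the shorter movl encodings instead):

       49 bb <imm64>   movabs $fnaddr, %r11
       49 ba <imm64>   movabs $chain,  %r10
       49 ff e3        jmp    *%r11
       90              nop (pads the write to a full 32-bit store)  */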
31642 \f
31643 /* The following file contains several enumerations and data structures
31644 built from the definitions in i386-builtin-types.def. */
31645
31646 #include "i386-builtin-types.inc"
31647
31648 /* Table for the ix86 builtin non-function types. */
31649 static GTY(()) tree ix86_builtin_type_tab[(int) IX86_BT_LAST_CPTR + 1];
31650
31651 /* Retrieve an element from the above table, building some of
31652 the types lazily. */
31653
31654 static tree
31655 ix86_get_builtin_type (enum ix86_builtin_type tcode)
31656 {
31657 unsigned int index;
31658 tree type, itype;
31659
31660 gcc_assert ((unsigned)tcode < ARRAY_SIZE(ix86_builtin_type_tab));
31661
31662 type = ix86_builtin_type_tab[(int) tcode];
31663 if (type != NULL)
31664 return type;
31665
31666 gcc_assert (tcode > IX86_BT_LAST_PRIM);
31667 if (tcode <= IX86_BT_LAST_VECT)
31668 {
31669 machine_mode mode;
31670
31671 index = tcode - IX86_BT_LAST_PRIM - 1;
31672 itype = ix86_get_builtin_type (ix86_builtin_type_vect_base[index]);
31673 mode = ix86_builtin_type_vect_mode[index];
31674
31675 type = build_vector_type_for_mode (itype, mode);
31676 }
31677 else
31678 {
31679 int quals;
31680
31681 index = tcode - IX86_BT_LAST_VECT - 1;
31682 if (tcode <= IX86_BT_LAST_PTR)
31683 quals = TYPE_UNQUALIFIED;
31684 else
31685 quals = TYPE_QUAL_CONST;
31686
31687 itype = ix86_get_builtin_type (ix86_builtin_type_ptr_base[index]);
31688 if (quals != TYPE_UNQUALIFIED)
31689 itype = build_qualified_type (itype, quals);
31690
31691 type = build_pointer_type (itype);
31692 }
31693
31694 ix86_builtin_type_tab[(int) tcode] = type;
31695 return type;
31696 }
31697
31698 /* Table for the ix86 builtin function types. */
31699 static GTY(()) tree ix86_builtin_func_type_tab[(int) IX86_BT_LAST_ALIAS + 1];
31700
31701 /* Retrieve an element from the above table, building some of
31702 the types lazily. */
31703
31704 static tree
31705 ix86_get_builtin_func_type (enum ix86_builtin_func_type tcode)
31706 {
31707 tree type;
31708
31709 gcc_assert ((unsigned)tcode < ARRAY_SIZE (ix86_builtin_func_type_tab));
31710
31711 type = ix86_builtin_func_type_tab[(int) tcode];
31712 if (type != NULL)
31713 return type;
31714
31715 if (tcode <= IX86_BT_LAST_FUNC)
31716 {
31717 unsigned start = ix86_builtin_func_start[(int) tcode];
31718 unsigned after = ix86_builtin_func_start[(int) tcode + 1];
31719 tree rtype, atype, args = void_list_node;
31720 unsigned i;
31721
31722 rtype = ix86_get_builtin_type (ix86_builtin_func_args[start]);
31723 for (i = after - 1; i > start; --i)
31724 {
31725 atype = ix86_get_builtin_type (ix86_builtin_func_args[i]);
31726 args = tree_cons (NULL, atype, args);
31727 }
31728
31729 type = build_function_type (rtype, args);
31730 }
31731 else
31732 {
31733 unsigned index = tcode - IX86_BT_LAST_FUNC - 1;
31734 enum ix86_builtin_func_type icode;
31735
31736 icode = ix86_builtin_func_alias_base[index];
31737 type = ix86_get_builtin_func_type (icode);
31738 }
31739
31740 ix86_builtin_func_type_tab[(int) tcode] = type;
31741 return type;
31742 }
31743
31744
31745 /* Codes for all the SSE/MMX builtins. Builtins not mentioned in any
31746 bdesc_* arrays below should come first, then builtins for each bdesc_*
31747 array in ascending order, so that we can use direct array accesses. */
31748 enum ix86_builtins
31749 {
31750 IX86_BUILTIN_MASKMOVQ,
31751 IX86_BUILTIN_LDMXCSR,
31752 IX86_BUILTIN_STMXCSR,
31753 IX86_BUILTIN_MASKMOVDQU,
31754 IX86_BUILTIN_PSLLDQ128,
31755 IX86_BUILTIN_CLFLUSH,
31756 IX86_BUILTIN_MONITOR,
31757 IX86_BUILTIN_MWAIT,
31758 IX86_BUILTIN_CLZERO,
31759 IX86_BUILTIN_VEC_INIT_V2SI,
31760 IX86_BUILTIN_VEC_INIT_V4HI,
31761 IX86_BUILTIN_VEC_INIT_V8QI,
31762 IX86_BUILTIN_VEC_EXT_V2DF,
31763 IX86_BUILTIN_VEC_EXT_V2DI,
31764 IX86_BUILTIN_VEC_EXT_V4SF,
31765 IX86_BUILTIN_VEC_EXT_V4SI,
31766 IX86_BUILTIN_VEC_EXT_V8HI,
31767 IX86_BUILTIN_VEC_EXT_V2SI,
31768 IX86_BUILTIN_VEC_EXT_V4HI,
31769 IX86_BUILTIN_VEC_EXT_V16QI,
31770 IX86_BUILTIN_VEC_SET_V2DI,
31771 IX86_BUILTIN_VEC_SET_V4SF,
31772 IX86_BUILTIN_VEC_SET_V4SI,
31773 IX86_BUILTIN_VEC_SET_V8HI,
31774 IX86_BUILTIN_VEC_SET_V4HI,
31775 IX86_BUILTIN_VEC_SET_V16QI,
31776 IX86_BUILTIN_GATHERSIV2DF,
31777 IX86_BUILTIN_GATHERSIV4DF,
31778 IX86_BUILTIN_GATHERDIV2DF,
31779 IX86_BUILTIN_GATHERDIV4DF,
31780 IX86_BUILTIN_GATHERSIV4SF,
31781 IX86_BUILTIN_GATHERSIV8SF,
31782 IX86_BUILTIN_GATHERDIV4SF,
31783 IX86_BUILTIN_GATHERDIV8SF,
31784 IX86_BUILTIN_GATHERSIV2DI,
31785 IX86_BUILTIN_GATHERSIV4DI,
31786 IX86_BUILTIN_GATHERDIV2DI,
31787 IX86_BUILTIN_GATHERDIV4DI,
31788 IX86_BUILTIN_GATHERSIV4SI,
31789 IX86_BUILTIN_GATHERSIV8SI,
31790 IX86_BUILTIN_GATHERDIV4SI,
31791 IX86_BUILTIN_GATHERDIV8SI,
31792 IX86_BUILTIN_VFMSUBSD3_MASK3,
31793 IX86_BUILTIN_VFMSUBSS3_MASK3,
31794 IX86_BUILTIN_GATHER3SIV8SF,
31795 IX86_BUILTIN_GATHER3SIV4SF,
31796 IX86_BUILTIN_GATHER3SIV4DF,
31797 IX86_BUILTIN_GATHER3SIV2DF,
31798 IX86_BUILTIN_GATHER3DIV8SF,
31799 IX86_BUILTIN_GATHER3DIV4SF,
31800 IX86_BUILTIN_GATHER3DIV4DF,
31801 IX86_BUILTIN_GATHER3DIV2DF,
31802 IX86_BUILTIN_GATHER3SIV8SI,
31803 IX86_BUILTIN_GATHER3SIV4SI,
31804 IX86_BUILTIN_GATHER3SIV4DI,
31805 IX86_BUILTIN_GATHER3SIV2DI,
31806 IX86_BUILTIN_GATHER3DIV8SI,
31807 IX86_BUILTIN_GATHER3DIV4SI,
31808 IX86_BUILTIN_GATHER3DIV4DI,
31809 IX86_BUILTIN_GATHER3DIV2DI,
31810 IX86_BUILTIN_SCATTERSIV8SF,
31811 IX86_BUILTIN_SCATTERSIV4SF,
31812 IX86_BUILTIN_SCATTERSIV4DF,
31813 IX86_BUILTIN_SCATTERSIV2DF,
31814 IX86_BUILTIN_SCATTERDIV8SF,
31815 IX86_BUILTIN_SCATTERDIV4SF,
31816 IX86_BUILTIN_SCATTERDIV4DF,
31817 IX86_BUILTIN_SCATTERDIV2DF,
31818 IX86_BUILTIN_SCATTERSIV8SI,
31819 IX86_BUILTIN_SCATTERSIV4SI,
31820 IX86_BUILTIN_SCATTERSIV4DI,
31821 IX86_BUILTIN_SCATTERSIV2DI,
31822 IX86_BUILTIN_SCATTERDIV8SI,
31823 IX86_BUILTIN_SCATTERDIV4SI,
31824 IX86_BUILTIN_SCATTERDIV4DI,
31825 IX86_BUILTIN_SCATTERDIV2DI,
31826 /* Alternate 4 and 8 element gather/scatter for the vectorizer
31827 where all operands are 32-byte or 64-byte wide respectively. */
31828 IX86_BUILTIN_GATHERALTSIV4DF,
31829 IX86_BUILTIN_GATHERALTDIV8SF,
31830 IX86_BUILTIN_GATHERALTSIV4DI,
31831 IX86_BUILTIN_GATHERALTDIV8SI,
31832 IX86_BUILTIN_GATHER3ALTDIV16SF,
31833 IX86_BUILTIN_GATHER3ALTDIV16SI,
31834 IX86_BUILTIN_GATHER3ALTSIV4DF,
31835 IX86_BUILTIN_GATHER3ALTDIV8SF,
31836 IX86_BUILTIN_GATHER3ALTSIV4DI,
31837 IX86_BUILTIN_GATHER3ALTDIV8SI,
31838 IX86_BUILTIN_GATHER3ALTSIV8DF,
31839 IX86_BUILTIN_GATHER3ALTSIV8DI,
31840 IX86_BUILTIN_GATHER3DIV16SF,
31841 IX86_BUILTIN_GATHER3DIV16SI,
31842 IX86_BUILTIN_GATHER3DIV8DF,
31843 IX86_BUILTIN_GATHER3DIV8DI,
31844 IX86_BUILTIN_GATHER3SIV16SF,
31845 IX86_BUILTIN_GATHER3SIV16SI,
31846 IX86_BUILTIN_GATHER3SIV8DF,
31847 IX86_BUILTIN_GATHER3SIV8DI,
31848 IX86_BUILTIN_SCATTERALTSIV8DF,
31849 IX86_BUILTIN_SCATTERALTDIV16SF,
31850 IX86_BUILTIN_SCATTERALTSIV8DI,
31851 IX86_BUILTIN_SCATTERALTDIV16SI,
31852 IX86_BUILTIN_SCATTERDIV16SF,
31853 IX86_BUILTIN_SCATTERDIV16SI,
31854 IX86_BUILTIN_SCATTERDIV8DF,
31855 IX86_BUILTIN_SCATTERDIV8DI,
31856 IX86_BUILTIN_SCATTERSIV16SF,
31857 IX86_BUILTIN_SCATTERSIV16SI,
31858 IX86_BUILTIN_SCATTERSIV8DF,
31859 IX86_BUILTIN_SCATTERSIV8DI,
31860 IX86_BUILTIN_GATHERPFQPD,
31861 IX86_BUILTIN_GATHERPFDPS,
31862 IX86_BUILTIN_GATHERPFDPD,
31863 IX86_BUILTIN_GATHERPFQPS,
31864 IX86_BUILTIN_SCATTERPFDPD,
31865 IX86_BUILTIN_SCATTERPFDPS,
31866 IX86_BUILTIN_SCATTERPFQPD,
31867 IX86_BUILTIN_SCATTERPFQPS,
31868 IX86_BUILTIN_CLWB,
31869 IX86_BUILTIN_CLFLUSHOPT,
31870 IX86_BUILTIN_INFQ,
31871 IX86_BUILTIN_HUGE_VALQ,
31872 IX86_BUILTIN_NANQ,
31873 IX86_BUILTIN_NANSQ,
31874 IX86_BUILTIN_XABORT,
31875 IX86_BUILTIN_ADDCARRYX32,
31876 IX86_BUILTIN_ADDCARRYX64,
31877 IX86_BUILTIN_SBB32,
31878 IX86_BUILTIN_SBB64,
31879 IX86_BUILTIN_RDRAND16_STEP,
31880 IX86_BUILTIN_RDRAND32_STEP,
31881 IX86_BUILTIN_RDRAND64_STEP,
31882 IX86_BUILTIN_RDSEED16_STEP,
31883 IX86_BUILTIN_RDSEED32_STEP,
31884 IX86_BUILTIN_RDSEED64_STEP,
31885 IX86_BUILTIN_MONITORX,
31886 IX86_BUILTIN_MWAITX,
31887 IX86_BUILTIN_CFSTRING,
31888 IX86_BUILTIN_CPU_INIT,
31889 IX86_BUILTIN_CPU_IS,
31890 IX86_BUILTIN_CPU_SUPPORTS,
31891 IX86_BUILTIN_READ_FLAGS,
31892 IX86_BUILTIN_WRITE_FLAGS,
31893
31894 /* All the remaining builtins are tracked in bdesc_* arrays in
31895 i386-builtin.def. Don't add any IX86_BUILTIN_* enumerators after
31896 this point. */
31897 #define BDESC(mask, icode, name, code, comparison, flag) \
31898 code,
31899 #define BDESC_FIRST(kind, kindu, mask, icode, name, code, comparison, flag) \
31900 code, \
31901 IX86_BUILTIN__BDESC_##kindu##_FIRST = code,
31902 #define BDESC_END(kind, next_kind)
31903
31904 #include "i386-builtin.def"
31905
31906 #undef BDESC
31907 #undef BDESC_FIRST
31908 #undef BDESC_END
31909
31910 IX86_BUILTIN_MAX,
31911
31912 IX86_BUILTIN__BDESC_MAX_FIRST = IX86_BUILTIN_MAX,
31913
31914 /* Now just the aliases for bdesc_* start/end. */
31915 #define BDESC(mask, icode, name, code, comparison, flag)
31916 #define BDESC_FIRST(kind, kindu, mask, icode, name, code, comparison, flag)
31917 #define BDESC_END(kind, next_kind) \
31918 IX86_BUILTIN__BDESC_##kind##_LAST \
31919 = IX86_BUILTIN__BDESC_##next_kind##_FIRST - 1,
31920
31921 #include "i386-builtin.def"
31922
31923 #undef BDESC
31924 #undef BDESC_FIRST
31925 #undef BDESC_END
31926
31927 /* Just to make sure there is no comma after the last enumerator. */
31928 IX86_BUILTIN__BDESC_MAX_LAST = IX86_BUILTIN__BDESC_MAX_FIRST
31929 };
31930
31931 /* Table for the ix86 builtin decls. */
31932 static GTY(()) tree ix86_builtins[(int) IX86_BUILTIN_MAX];
31933
31934 /* Table of all of the builtin functions that are possible with different ISAs
31935 but are waiting to be built until a function is declared to use that
31936 ISA. */
31937 struct builtin_isa {
31938 const char *name; /* function name */
31939 enum ix86_builtin_func_type tcode; /* type to use in the declaration */
31940 HOST_WIDE_INT isa; /* isa_flags this builtin is defined for */
31941 HOST_WIDE_INT isa2; /* additional isa_flags this builtin is defined for */
31942 bool const_p; /* true if the declaration is constant */
31943 bool leaf_p; /* true if the declaration has leaf attribute */
31944 bool nothrow_p; /* true if the declaration has nothrow attribute */
31945 bool set_and_not_built_p;
31946 };
31947
31948 static struct builtin_isa ix86_builtins_isa[(int) IX86_BUILTIN_MAX];
31949
31950 /* Bits that can still enable any inclusion of a builtin. */
31951 static HOST_WIDE_INT deferred_isa_values = 0;
31952 static HOST_WIDE_INT deferred_isa_values2 = 0;
31953
31954 /* Add an ix86 target builtin function with CODE, NAME and TYPE. Save the MASK
31955 of which isa_flags to use in the ix86_builtins_isa array. Store the
31956 function decl in the ix86_builtins array. Return the function decl or
31957 NULL_TREE, if the builtin was not added.
31958 
31959 If the front end has a special hook for builtin functions, delay adding
31960 builtin functions that aren't in the current ISA until the ISA is changed
31961 with function specific optimization. Doing so can save about 300K for the
31962 default compiler. When the builtin is expanded, check at that time whether
31963 it is valid.
31964 
31965 If the front end doesn't have a special hook, record all builtins, even if
31966 they aren't in the current ISA, in case the user uses function specific
31967 options for a different ISA, so that we don't get scope errors if a
31968 builtin is added in the middle of a function scope. */
31969
31970 static inline tree
31971 def_builtin (HOST_WIDE_INT mask, const char *name,
31972 enum ix86_builtin_func_type tcode,
31973 enum ix86_builtins code)
31974 {
31975 tree decl = NULL_TREE;
31976
31977 if (!(mask & OPTION_MASK_ISA_64BIT) || TARGET_64BIT)
31978 {
31979 ix86_builtins_isa[(int) code].isa = mask;
31980
31981 /* OPTION_MASK_ISA_AVX512VL has a special meaning. Unlike the generic case,
31982 where any set bit means that the built-in is enabled, this bit must be *and-ed*
31983 with another one. E.g.: OPTION_MASK_ISA_AVX512DQ | OPTION_MASK_ISA_AVX512VL
31984 means that *both* cpuid bits must be set for the built-in to be available.
31985 Handle this here. */
31986 if (mask & ix86_isa_flags & OPTION_MASK_ISA_AVX512VL)
31987 mask &= ~OPTION_MASK_ISA_AVX512VL;
31988
31989 mask &= ~OPTION_MASK_ISA_64BIT;
31990 if (mask == 0
31991 || (mask & ix86_isa_flags) != 0
31992 || (lang_hooks.builtin_function
31993 == lang_hooks.builtin_function_ext_scope))
31994
31995 {
31996 tree type = ix86_get_builtin_func_type (tcode);
31997 decl = add_builtin_function (name, type, code, BUILT_IN_MD,
31998 NULL, NULL_TREE);
31999 ix86_builtins[(int) code] = decl;
32000 ix86_builtins_isa[(int) code].set_and_not_built_p = false;
32001 }
32002 else
32003 {
32004 /* Just a MASK where set_and_not_built_p == true can potentially
32005 include a builtin. */
32006 deferred_isa_values |= mask;
32007 ix86_builtins[(int) code] = NULL_TREE;
32008 ix86_builtins_isa[(int) code].tcode = tcode;
32009 ix86_builtins_isa[(int) code].name = name;
32010 ix86_builtins_isa[(int) code].leaf_p = false;
32011 ix86_builtins_isa[(int) code].nothrow_p = false;
32012 ix86_builtins_isa[(int) code].const_p = false;
32013 ix86_builtins_isa[(int) code].set_and_not_built_p = true;
32014 }
32015 }
32016
32017 return decl;
32018 }
32019
32020 /* Like def_builtin, but also marks the function decl "const". */
32021
32022 static inline tree
32023 def_builtin_const (HOST_WIDE_INT mask, const char *name,
32024 enum ix86_builtin_func_type tcode, enum ix86_builtins code)
32025 {
32026 tree decl = def_builtin (mask, name, tcode, code);
32027 if (decl)
32028 TREE_READONLY (decl) = 1;
32029 else
32030 ix86_builtins_isa[(int) code].const_p = true;
32031
32032 return decl;
32033 }
32034
32035 /* Like def_builtin, but for additional isa2 flags. */
32036
32037 static inline tree
32038 def_builtin2 (HOST_WIDE_INT mask, const char *name,
32039 enum ix86_builtin_func_type tcode,
32040 enum ix86_builtins code)
32041 {
32042 tree decl = NULL_TREE;
32043
32044 ix86_builtins_isa[(int) code].isa2 = mask;
32045
32046 if (mask == 0
32047 || (mask & ix86_isa_flags2) != 0
32048 || (lang_hooks.builtin_function
32049 == lang_hooks.builtin_function_ext_scope))
32050
32051 {
32052 tree type = ix86_get_builtin_func_type (tcode);
32053 decl = add_builtin_function (name, type, code, BUILT_IN_MD,
32054 NULL, NULL_TREE);
32055 ix86_builtins[(int) code] = decl;
32056 ix86_builtins_isa[(int) code].set_and_not_built_p = false;
32057 }
32058 else
32059 {
32060 /* Just a MASK where set_and_not_built_p == true can potentially
32061 include a builtin. */
32062 deferred_isa_values2 |= mask;
32063 ix86_builtins[(int) code] = NULL_TREE;
32064 ix86_builtins_isa[(int) code].tcode = tcode;
32065 ix86_builtins_isa[(int) code].name = name;
32066 ix86_builtins_isa[(int) code].leaf_p = false;
32067 ix86_builtins_isa[(int) code].nothrow_p = false;
32068 ix86_builtins_isa[(int) code].const_p = false;
32069 ix86_builtins_isa[(int) code].set_and_not_built_p = true;
32070 }
32071
32072 return decl;
32073 }
32074
32075 /* Like def_builtin, but also marks the function decl "const". */
32076
32077 static inline tree
32078 def_builtin_const2 (HOST_WIDE_INT mask, const char *name,
32079 enum ix86_builtin_func_type tcode, enum ix86_builtins code)
32080 {
32081 tree decl = def_builtin2 (mask, name, tcode, code);
32082 if (decl)
32083 TREE_READONLY (decl) = 1;
32084 else
32085 ix86_builtins_isa[(int) code].const_p = true;
32086
32087 return decl;
32088 }
32089
32090 /* Add any new builtin functions for a given ISA that may not have been
32091 declared. This saves a bit of space compared to adding all of the
32092 declarations to the tree, even if we didn't use them. */
32093
32094 static void
32095 ix86_add_new_builtins (HOST_WIDE_INT isa, HOST_WIDE_INT isa2)
32096 {
32097 if ((isa & deferred_isa_values) == 0
32098 && (isa2 & deferred_isa_values2) == 0)
32099 return;
32100
32101 /* Bits in ISA value can be removed from potential isa values. */
32102 deferred_isa_values &= ~isa;
32103 deferred_isa_values2 &= ~isa2;
32104
32105 int i;
32106 tree saved_current_target_pragma = current_target_pragma;
32107 current_target_pragma = NULL_TREE;
32108
32109 for (i = 0; i < (int)IX86_BUILTIN_MAX; i++)
32110 {
32111 if (((ix86_builtins_isa[i].isa & isa) != 0
32112 || (ix86_builtins_isa[i].isa2 & isa2) != 0)
32113 && ix86_builtins_isa[i].set_and_not_built_p)
32114 {
32115 tree decl, type;
32116
32117 /* Don't define the builtin again. */
32118 ix86_builtins_isa[i].set_and_not_built_p = false;
32119
32120 type = ix86_get_builtin_func_type (ix86_builtins_isa[i].tcode);
32121 decl = add_builtin_function_ext_scope (ix86_builtins_isa[i].name,
32122 type, i, BUILT_IN_MD, NULL,
32123 NULL_TREE);
32124
32125 ix86_builtins[i] = decl;
32126 if (ix86_builtins_isa[i].const_p)
32127 TREE_READONLY (decl) = 1;
32128 if (ix86_builtins_isa[i].leaf_p)
32129 DECL_ATTRIBUTES (decl) = build_tree_list (get_identifier ("leaf"),
32130 NULL_TREE);
32131 if (ix86_builtins_isa[i].nothrow_p)
32132 TREE_NOTHROW (decl) = 1;
32133 }
32134 }
32135
32136 current_target_pragma = saved_current_target_pragma;
32137 }
32138
32139 /* Bits for builtin_description.flag. */
32140
32141 /* Set when we don't support the comparison natively, and should
32142 swap_comparison in order to support it. */
32143 #define BUILTIN_DESC_SWAP_OPERANDS 1
32144
32145 struct builtin_description
32146 {
32147 const HOST_WIDE_INT mask;
32148 const enum insn_code icode;
32149 const char *const name;
32150 const enum ix86_builtins code;
32151 const enum rtx_code comparison;
32152 const int flag;
32153 };
32154
32155 #define MULTI_ARG_4_DF2_DI_I V2DF_FTYPE_V2DF_V2DF_V2DI_INT
32156 #define MULTI_ARG_4_DF2_DI_I1 V4DF_FTYPE_V4DF_V4DF_V4DI_INT
32157 #define MULTI_ARG_4_SF2_SI_I V4SF_FTYPE_V4SF_V4SF_V4SI_INT
32158 #define MULTI_ARG_4_SF2_SI_I1 V8SF_FTYPE_V8SF_V8SF_V8SI_INT
32159 #define MULTI_ARG_3_SF V4SF_FTYPE_V4SF_V4SF_V4SF
32160 #define MULTI_ARG_3_DF V2DF_FTYPE_V2DF_V2DF_V2DF
32161 #define MULTI_ARG_3_SF2 V8SF_FTYPE_V8SF_V8SF_V8SF
32162 #define MULTI_ARG_3_DF2 V4DF_FTYPE_V4DF_V4DF_V4DF
32163 #define MULTI_ARG_3_DI V2DI_FTYPE_V2DI_V2DI_V2DI
32164 #define MULTI_ARG_3_SI V4SI_FTYPE_V4SI_V4SI_V4SI
32165 #define MULTI_ARG_3_SI_DI V4SI_FTYPE_V4SI_V4SI_V2DI
32166 #define MULTI_ARG_3_HI V8HI_FTYPE_V8HI_V8HI_V8HI
32167 #define MULTI_ARG_3_HI_SI V8HI_FTYPE_V8HI_V8HI_V4SI
32168 #define MULTI_ARG_3_QI V16QI_FTYPE_V16QI_V16QI_V16QI
32169 #define MULTI_ARG_3_DI2 V4DI_FTYPE_V4DI_V4DI_V4DI
32170 #define MULTI_ARG_3_SI2 V8SI_FTYPE_V8SI_V8SI_V8SI
32171 #define MULTI_ARG_3_HI2 V16HI_FTYPE_V16HI_V16HI_V16HI
32172 #define MULTI_ARG_3_QI2 V32QI_FTYPE_V32QI_V32QI_V32QI
32173 #define MULTI_ARG_2_SF V4SF_FTYPE_V4SF_V4SF
32174 #define MULTI_ARG_2_DF V2DF_FTYPE_V2DF_V2DF
32175 #define MULTI_ARG_2_DI V2DI_FTYPE_V2DI_V2DI
32176 #define MULTI_ARG_2_SI V4SI_FTYPE_V4SI_V4SI
32177 #define MULTI_ARG_2_HI V8HI_FTYPE_V8HI_V8HI
32178 #define MULTI_ARG_2_QI V16QI_FTYPE_V16QI_V16QI
32179 #define MULTI_ARG_2_DI_IMM V2DI_FTYPE_V2DI_SI
32180 #define MULTI_ARG_2_SI_IMM V4SI_FTYPE_V4SI_SI
32181 #define MULTI_ARG_2_HI_IMM V8HI_FTYPE_V8HI_SI
32182 #define MULTI_ARG_2_QI_IMM V16QI_FTYPE_V16QI_SI
32183 #define MULTI_ARG_2_DI_CMP V2DI_FTYPE_V2DI_V2DI_CMP
32184 #define MULTI_ARG_2_SI_CMP V4SI_FTYPE_V4SI_V4SI_CMP
32185 #define MULTI_ARG_2_HI_CMP V8HI_FTYPE_V8HI_V8HI_CMP
32186 #define MULTI_ARG_2_QI_CMP V16QI_FTYPE_V16QI_V16QI_CMP
32187 #define MULTI_ARG_2_SF_TF V4SF_FTYPE_V4SF_V4SF_TF
32188 #define MULTI_ARG_2_DF_TF V2DF_FTYPE_V2DF_V2DF_TF
32189 #define MULTI_ARG_2_DI_TF V2DI_FTYPE_V2DI_V2DI_TF
32190 #define MULTI_ARG_2_SI_TF V4SI_FTYPE_V4SI_V4SI_TF
32191 #define MULTI_ARG_2_HI_TF V8HI_FTYPE_V8HI_V8HI_TF
32192 #define MULTI_ARG_2_QI_TF V16QI_FTYPE_V16QI_V16QI_TF
32193 #define MULTI_ARG_1_SF V4SF_FTYPE_V4SF
32194 #define MULTI_ARG_1_DF V2DF_FTYPE_V2DF
32195 #define MULTI_ARG_1_SF2 V8SF_FTYPE_V8SF
32196 #define MULTI_ARG_1_DF2 V4DF_FTYPE_V4DF
32197 #define MULTI_ARG_1_DI V2DI_FTYPE_V2DI
32198 #define MULTI_ARG_1_SI V4SI_FTYPE_V4SI
32199 #define MULTI_ARG_1_HI V8HI_FTYPE_V8HI
32200 #define MULTI_ARG_1_QI V16QI_FTYPE_V16QI
32201 #define MULTI_ARG_1_SI_DI V2DI_FTYPE_V4SI
32202 #define MULTI_ARG_1_HI_DI V2DI_FTYPE_V8HI
32203 #define MULTI_ARG_1_HI_SI V4SI_FTYPE_V8HI
32204 #define MULTI_ARG_1_QI_DI V2DI_FTYPE_V16QI
32205 #define MULTI_ARG_1_QI_SI V4SI_FTYPE_V16QI
32206 #define MULTI_ARG_1_QI_HI V8HI_FTYPE_V16QI
32207
32208 #define BDESC(mask, icode, name, code, comparison, flag) \
32209 { mask, icode, name, code, comparison, flag },
32210 #define BDESC_FIRST(kind, kindu, mask, icode, name, code, comparison, flag) \
32211 static const struct builtin_description bdesc_##kind[] = \
32212 { \
32213 BDESC (mask, icode, name, code, comparison, flag)
32214 #define BDESC_END(kind, next_kind) \
32215 };
32216
32217 #include "i386-builtin.def"
32218
32219 #undef BDESC
32220 #undef BDESC_FIRST
32221 #undef BDESC_END
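/* To make the expansion above concrete: a hypothetical fragment of
   i386-builtin.def (the specific names here are only illustrative) such as

     BDESC_FIRST (comi, COMI,
                  OPTION_MASK_ISA_SSE, CODE_FOR_sse_comi,
                  "__builtin_ia32_comieq", IX86_BUILTIN_COMIEQSS, UNKNOWN, 0)
     BDESC (OPTION_MASK_ISA_SSE, CODE_FOR_sse_comi,
            "__builtin_ia32_comilt", IX86_BUILTIN_COMILTSS, UNKNOWN, 0)
     BDESC_END (COMI, PCMPESTR)

   expands, via the macros above, to roughly

     static const struct builtin_description bdesc_comi[] =
     {
       { OPTION_MASK_ISA_SSE, CODE_FOR_sse_comi,
         "__builtin_ia32_comieq", IX86_BUILTIN_COMIEQSS, UNKNOWN, 0 },
       { OPTION_MASK_ISA_SSE, CODE_FOR_sse_comi,
         "__builtin_ia32_comilt", IX86_BUILTIN_COMILTSS, UNKNOWN, 0 },
     };
*/
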
32222 \f
32223 /* TM vector builtins. */
32224
32225 /* Reuse the existing x86-specific `struct builtin_description' because
32226 it is convenient. Add casts to make the fields fit. */
32227 static const struct builtin_description bdesc_tm[] =
32228 {
32229 { OPTION_MASK_ISA_MMX, CODE_FOR_nothing, "__builtin__ITM_WM64", (enum ix86_builtins) BUILT_IN_TM_STORE_M64, UNKNOWN, VOID_FTYPE_PV2SI_V2SI },
32230 { OPTION_MASK_ISA_MMX, CODE_FOR_nothing, "__builtin__ITM_WaRM64", (enum ix86_builtins) BUILT_IN_TM_STORE_WAR_M64, UNKNOWN, VOID_FTYPE_PV2SI_V2SI },
32231 { OPTION_MASK_ISA_MMX, CODE_FOR_nothing, "__builtin__ITM_WaWM64", (enum ix86_builtins) BUILT_IN_TM_STORE_WAW_M64, UNKNOWN, VOID_FTYPE_PV2SI_V2SI },
32232 { OPTION_MASK_ISA_MMX, CODE_FOR_nothing, "__builtin__ITM_RM64", (enum ix86_builtins) BUILT_IN_TM_LOAD_M64, UNKNOWN, V2SI_FTYPE_PCV2SI },
32233 { OPTION_MASK_ISA_MMX, CODE_FOR_nothing, "__builtin__ITM_RaRM64", (enum ix86_builtins) BUILT_IN_TM_LOAD_RAR_M64, UNKNOWN, V2SI_FTYPE_PCV2SI },
32234 { OPTION_MASK_ISA_MMX, CODE_FOR_nothing, "__builtin__ITM_RaWM64", (enum ix86_builtins) BUILT_IN_TM_LOAD_RAW_M64, UNKNOWN, V2SI_FTYPE_PCV2SI },
32235 { OPTION_MASK_ISA_MMX, CODE_FOR_nothing, "__builtin__ITM_RfWM64", (enum ix86_builtins) BUILT_IN_TM_LOAD_RFW_M64, UNKNOWN, V2SI_FTYPE_PCV2SI },
32236
32237 { OPTION_MASK_ISA_SSE, CODE_FOR_nothing, "__builtin__ITM_WM128", (enum ix86_builtins) BUILT_IN_TM_STORE_M128, UNKNOWN, VOID_FTYPE_PV4SF_V4SF },
32238 { OPTION_MASK_ISA_SSE, CODE_FOR_nothing, "__builtin__ITM_WaRM128", (enum ix86_builtins) BUILT_IN_TM_STORE_WAR_M128, UNKNOWN, VOID_FTYPE_PV4SF_V4SF },
32239 { OPTION_MASK_ISA_SSE, CODE_FOR_nothing, "__builtin__ITM_WaWM128", (enum ix86_builtins) BUILT_IN_TM_STORE_WAW_M128, UNKNOWN, VOID_FTYPE_PV4SF_V4SF },
32240 { OPTION_MASK_ISA_SSE, CODE_FOR_nothing, "__builtin__ITM_RM128", (enum ix86_builtins) BUILT_IN_TM_LOAD_M128, UNKNOWN, V4SF_FTYPE_PCV4SF },
32241 { OPTION_MASK_ISA_SSE, CODE_FOR_nothing, "__builtin__ITM_RaRM128", (enum ix86_builtins) BUILT_IN_TM_LOAD_RAR_M128, UNKNOWN, V4SF_FTYPE_PCV4SF },
32242 { OPTION_MASK_ISA_SSE, CODE_FOR_nothing, "__builtin__ITM_RaWM128", (enum ix86_builtins) BUILT_IN_TM_LOAD_RAW_M128, UNKNOWN, V4SF_FTYPE_PCV4SF },
32243 { OPTION_MASK_ISA_SSE, CODE_FOR_nothing, "__builtin__ITM_RfWM128", (enum ix86_builtins) BUILT_IN_TM_LOAD_RFW_M128, UNKNOWN, V4SF_FTYPE_PCV4SF },
32244
32245 { OPTION_MASK_ISA_AVX, CODE_FOR_nothing, "__builtin__ITM_WM256", (enum ix86_builtins) BUILT_IN_TM_STORE_M256, UNKNOWN, VOID_FTYPE_PV8SF_V8SF },
32246 { OPTION_MASK_ISA_AVX, CODE_FOR_nothing, "__builtin__ITM_WaRM256", (enum ix86_builtins) BUILT_IN_TM_STORE_WAR_M256, UNKNOWN, VOID_FTYPE_PV8SF_V8SF },
32247 { OPTION_MASK_ISA_AVX, CODE_FOR_nothing, "__builtin__ITM_WaWM256", (enum ix86_builtins) BUILT_IN_TM_STORE_WAW_M256, UNKNOWN, VOID_FTYPE_PV8SF_V8SF },
32248 { OPTION_MASK_ISA_AVX, CODE_FOR_nothing, "__builtin__ITM_RM256", (enum ix86_builtins) BUILT_IN_TM_LOAD_M256, UNKNOWN, V8SF_FTYPE_PCV8SF },
32249 { OPTION_MASK_ISA_AVX, CODE_FOR_nothing, "__builtin__ITM_RaRM256", (enum ix86_builtins) BUILT_IN_TM_LOAD_RAR_M256, UNKNOWN, V8SF_FTYPE_PCV8SF },
32250 { OPTION_MASK_ISA_AVX, CODE_FOR_nothing, "__builtin__ITM_RaWM256", (enum ix86_builtins) BUILT_IN_TM_LOAD_RAW_M256, UNKNOWN, V8SF_FTYPE_PCV8SF },
32251 { OPTION_MASK_ISA_AVX, CODE_FOR_nothing, "__builtin__ITM_RfWM256", (enum ix86_builtins) BUILT_IN_TM_LOAD_RFW_M256, UNKNOWN, V8SF_FTYPE_PCV8SF },
32252
32253 { OPTION_MASK_ISA_MMX, CODE_FOR_nothing, "__builtin__ITM_LM64", (enum ix86_builtins) BUILT_IN_TM_LOG_M64, UNKNOWN, VOID_FTYPE_PCVOID },
32254 { OPTION_MASK_ISA_SSE, CODE_FOR_nothing, "__builtin__ITM_LM128", (enum ix86_builtins) BUILT_IN_TM_LOG_M128, UNKNOWN, VOID_FTYPE_PCVOID },
32255 { OPTION_MASK_ISA_AVX, CODE_FOR_nothing, "__builtin__ITM_LM256", (enum ix86_builtins) BUILT_IN_TM_LOG_M256, UNKNOWN, VOID_FTYPE_PCVOID },
32256 };
32257
32258 /* Initialize the transactional memory vector load/store builtins. */
32259
32260 static void
32261 ix86_init_tm_builtins (void)
32262 {
32263 enum ix86_builtin_func_type ftype;
32264 const struct builtin_description *d;
32265 size_t i;
32266 tree decl;
32267 tree attrs_load, attrs_type_load, attrs_store, attrs_type_store;
32268 tree attrs_log, attrs_type_log;
32269
32270 if (!flag_tm)
32271 return;
32272
32273 /* If there are no builtins defined, we must be compiling in a
32274 language without trans-mem support. */
32275 if (!builtin_decl_explicit_p (BUILT_IN_TM_LOAD_1))
32276 return;
32277
32278 /* Use whatever attributes a normal TM load has. */
32279 decl = builtin_decl_explicit (BUILT_IN_TM_LOAD_1);
32280 attrs_load = DECL_ATTRIBUTES (decl);
32281 attrs_type_load = TYPE_ATTRIBUTES (TREE_TYPE (decl));
32282 /* Use whatever attributes a normal TM store has. */
32283 decl = builtin_decl_explicit (BUILT_IN_TM_STORE_1);
32284 attrs_store = DECL_ATTRIBUTES (decl);
32285 attrs_type_store = TYPE_ATTRIBUTES (TREE_TYPE (decl));
32286 /* Use whatever attributes a normal TM log has. */
32287 decl = builtin_decl_explicit (BUILT_IN_TM_LOG);
32288 attrs_log = DECL_ATTRIBUTES (decl);
32289 attrs_type_log = TYPE_ATTRIBUTES (TREE_TYPE (decl));
32290
32291 for (i = 0, d = bdesc_tm;
32292 i < ARRAY_SIZE (bdesc_tm);
32293 i++, d++)
32294 {
32295 if ((d->mask & ix86_isa_flags) != 0
32296 || (lang_hooks.builtin_function
32297 == lang_hooks.builtin_function_ext_scope))
32298 {
32299 tree type, attrs, attrs_type;
32300 enum built_in_function code = (enum built_in_function) d->code;
32301
32302 ftype = (enum ix86_builtin_func_type) d->flag;
32303 type = ix86_get_builtin_func_type (ftype);
32304
32305 if (BUILTIN_TM_LOAD_P (code))
32306 {
32307 attrs = attrs_load;
32308 attrs_type = attrs_type_load;
32309 }
32310 else if (BUILTIN_TM_STORE_P (code))
32311 {
32312 attrs = attrs_store;
32313 attrs_type = attrs_type_store;
32314 }
32315 else
32316 {
32317 attrs = attrs_log;
32318 attrs_type = attrs_type_log;
32319 }
32320 decl = add_builtin_function (d->name, type, code, BUILT_IN_NORMAL,
32321 /* The builtin without the prefix for
32322 calling it directly. */
32323 d->name + strlen ("__builtin_"),
32324 attrs);
32325 /* add_builtin_function() will set the DECL_ATTRIBUTES, now
32326 set the TYPE_ATTRIBUTES. */
32327 decl_attributes (&TREE_TYPE (decl), attrs_type, ATTR_FLAG_BUILT_IN);
32328
32329 set_builtin_decl (code, decl, false);
32330 }
32331 }
32332 }
32333
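/* For illustration, a hedged user-level sketch of what these builtins back:
   with -fgnu-tm, vector accesses inside a transaction are instrumented with
   the _ITM_* entry points declared above (this shows only the shape of such
   code, not a guaranteed lowering).

     #include <emmintrin.h>

     __m128d shared;

     void
     store_transactionally (__m128d v)
     {
       __transaction_atomic
       {
         shared = v;   // may be lowered to _ITM_WM128-style instrumentation
       }
     }
*/
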
32334 /* Macros for verification of enum ix86_builtins order. */
32335 #define BDESC_VERIFY(x, y, z) \
32336 gcc_checking_assert ((x) == (enum ix86_builtins) ((y) + (z)))
32337 #define BDESC_VERIFYS(x, y, z) \
32338 STATIC_ASSERT ((x) == (enum ix86_builtins) ((y) + (z)))
32339
32340 BDESC_VERIFYS (IX86_BUILTIN__BDESC_PCMPESTR_FIRST,
32341 IX86_BUILTIN__BDESC_COMI_LAST, 1);
32342 BDESC_VERIFYS (IX86_BUILTIN__BDESC_PCMPISTR_FIRST,
32343 IX86_BUILTIN__BDESC_PCMPESTR_LAST, 1);
32344 BDESC_VERIFYS (IX86_BUILTIN__BDESC_SPECIAL_ARGS_FIRST,
32345 IX86_BUILTIN__BDESC_PCMPISTR_LAST, 1);
32346 BDESC_VERIFYS (IX86_BUILTIN__BDESC_ARGS_FIRST,
32347 IX86_BUILTIN__BDESC_SPECIAL_ARGS_LAST, 1);
32348 BDESC_VERIFYS (IX86_BUILTIN__BDESC_ROUND_ARGS_FIRST,
32349 IX86_BUILTIN__BDESC_ARGS_LAST, 1);
32350 BDESC_VERIFYS (IX86_BUILTIN__BDESC_ARGS2_FIRST,
32351 IX86_BUILTIN__BDESC_ROUND_ARGS_LAST, 1);
32352 BDESC_VERIFYS (IX86_BUILTIN__BDESC_MPX_FIRST,
32353 IX86_BUILTIN__BDESC_ARGS2_LAST, 1);
32354 BDESC_VERIFYS (IX86_BUILTIN__BDESC_MPX_CONST_FIRST,
32355 IX86_BUILTIN__BDESC_MPX_LAST, 1);
32356 BDESC_VERIFYS (IX86_BUILTIN__BDESC_MULTI_ARG_FIRST,
32357 IX86_BUILTIN__BDESC_MPX_CONST_LAST, 1);
32358 BDESC_VERIFYS (IX86_BUILTIN_MAX,
32359 IX86_BUILTIN__BDESC_MULTI_ARG_LAST, 1);
32360
32361 /* Set up all the MMX/SSE builtins, even builtins for instructions that are
32362 not in the current target ISA, so that the user can compile particular
32363 modules with target-specific options that differ from the command-line
32364 options. */
32365 static void
32366 ix86_init_mmx_sse_builtins (void)
32367 {
32368 const struct builtin_description * d;
32369 enum ix86_builtin_func_type ftype;
32370 size_t i;
32371
32372 /* Add all special builtins with variable number of operands. */
32373 for (i = 0, d = bdesc_special_args;
32374 i < ARRAY_SIZE (bdesc_special_args);
32375 i++, d++)
32376 {
32377 BDESC_VERIFY (d->code, IX86_BUILTIN__BDESC_SPECIAL_ARGS_FIRST, i);
32378 if (d->name == 0)
32379 continue;
32380
32381 ftype = (enum ix86_builtin_func_type) d->flag;
32382 def_builtin (d->mask, d->name, ftype, d->code);
32383 }
32384 BDESC_VERIFYS (IX86_BUILTIN__BDESC_SPECIAL_ARGS_LAST,
32385 IX86_BUILTIN__BDESC_SPECIAL_ARGS_FIRST,
32386 ARRAY_SIZE (bdesc_special_args) - 1);
32387
32388 /* Add all builtins with variable number of operands. */
32389 for (i = 0, d = bdesc_args;
32390 i < ARRAY_SIZE (bdesc_args);
32391 i++, d++)
32392 {
32393 BDESC_VERIFY (d->code, IX86_BUILTIN__BDESC_ARGS_FIRST, i);
32394 if (d->name == 0)
32395 continue;
32396
32397 ftype = (enum ix86_builtin_func_type) d->flag;
32398 def_builtin_const (d->mask, d->name, ftype, d->code);
32399 }
32400 BDESC_VERIFYS (IX86_BUILTIN__BDESC_ARGS_LAST,
32401 IX86_BUILTIN__BDESC_ARGS_FIRST,
32402 ARRAY_SIZE (bdesc_args) - 1);
32403
32404 /* Add all builtins with variable number of operands whose ISA is given by the second set of masks. */
32405 for (i = 0, d = bdesc_args2;
32406 i < ARRAY_SIZE (bdesc_args2);
32407 i++, d++)
32408 {
32409 if (d->name == 0)
32410 continue;
32411
32412 ftype = (enum ix86_builtin_func_type) d->flag;
32413 def_builtin_const2 (d->mask, d->name, ftype, d->code);
32414 }
32415
32416 /* Add all builtins with rounding. */
32417 for (i = 0, d = bdesc_round_args;
32418 i < ARRAY_SIZE (bdesc_round_args);
32419 i++, d++)
32420 {
32421 BDESC_VERIFY (d->code, IX86_BUILTIN__BDESC_ROUND_ARGS_FIRST, i);
32422 if (d->name == 0)
32423 continue;
32424
32425 ftype = (enum ix86_builtin_func_type) d->flag;
32426 def_builtin_const (d->mask, d->name, ftype, d->code);
32427 }
32428 BDESC_VERIFYS (IX86_BUILTIN__BDESC_ROUND_ARGS_LAST,
32429 IX86_BUILTIN__BDESC_ROUND_ARGS_FIRST,
32430 ARRAY_SIZE (bdesc_round_args) - 1);
32431
32432 /* pcmpestr[im] insns. */
32433 for (i = 0, d = bdesc_pcmpestr;
32434 i < ARRAY_SIZE (bdesc_pcmpestr);
32435 i++, d++)
32436 {
32437 BDESC_VERIFY (d->code, IX86_BUILTIN__BDESC_PCMPESTR_FIRST, i);
32438 if (d->code == IX86_BUILTIN_PCMPESTRM128)
32439 ftype = V16QI_FTYPE_V16QI_INT_V16QI_INT_INT;
32440 else
32441 ftype = INT_FTYPE_V16QI_INT_V16QI_INT_INT;
32442 def_builtin_const (d->mask, d->name, ftype, d->code);
32443 }
32444 BDESC_VERIFYS (IX86_BUILTIN__BDESC_PCMPESTR_LAST,
32445 IX86_BUILTIN__BDESC_PCMPESTR_FIRST,
32446 ARRAY_SIZE (bdesc_pcmpestr) - 1);
32447
32448 /* pcmpistr[im] insns. */
32449 for (i = 0, d = bdesc_pcmpistr;
32450 i < ARRAY_SIZE (bdesc_pcmpistr);
32451 i++, d++)
32452 {
32453 BDESC_VERIFY (d->code, IX86_BUILTIN__BDESC_PCMPISTR_FIRST, i);
32454 if (d->code == IX86_BUILTIN_PCMPISTRM128)
32455 ftype = V16QI_FTYPE_V16QI_V16QI_INT;
32456 else
32457 ftype = INT_FTYPE_V16QI_V16QI_INT;
32458 def_builtin_const (d->mask, d->name, ftype, d->code);
32459 }
32460 BDESC_VERIFYS (IX86_BUILTIN__BDESC_PCMPISTR_LAST,
32461 IX86_BUILTIN__BDESC_PCMPISTR_FIRST,
32462 ARRAY_SIZE (bdesc_pcmpistr) - 1);
32463
32464 /* comi/ucomi insns. */
32465 for (i = 0, d = bdesc_comi; i < ARRAY_SIZE (bdesc_comi); i++, d++)
32466 {
32467 BDESC_VERIFY (d->code, IX86_BUILTIN__BDESC_COMI_FIRST, i);
32468 if (d->mask == OPTION_MASK_ISA_SSE2)
32469 ftype = INT_FTYPE_V2DF_V2DF;
32470 else
32471 ftype = INT_FTYPE_V4SF_V4SF;
32472 def_builtin_const (d->mask, d->name, ftype, d->code);
32473 }
32474 BDESC_VERIFYS (IX86_BUILTIN__BDESC_COMI_LAST,
32475 IX86_BUILTIN__BDESC_COMI_FIRST,
32476 ARRAY_SIZE (bdesc_comi) - 1);
32477
32478 /* SSE */
32479 def_builtin (OPTION_MASK_ISA_SSE, "__builtin_ia32_ldmxcsr",
32480 VOID_FTYPE_UNSIGNED, IX86_BUILTIN_LDMXCSR);
32481 def_builtin (OPTION_MASK_ISA_SSE, "__builtin_ia32_stmxcsr",
32482 UNSIGNED_FTYPE_VOID, IX86_BUILTIN_STMXCSR);
32483
32484 /* SSE or 3DNow!A */
32485 def_builtin (OPTION_MASK_ISA_SSE | OPTION_MASK_ISA_3DNOW_A,
32486 "__builtin_ia32_maskmovq", VOID_FTYPE_V8QI_V8QI_PCHAR,
32487 IX86_BUILTIN_MASKMOVQ);
32488
32489 /* SSE2 */
32490 def_builtin (OPTION_MASK_ISA_SSE2, "__builtin_ia32_maskmovdqu",
32491 VOID_FTYPE_V16QI_V16QI_PCHAR, IX86_BUILTIN_MASKMOVDQU);
32492
32493 def_builtin (OPTION_MASK_ISA_SSE2, "__builtin_ia32_clflush",
32494 VOID_FTYPE_PCVOID, IX86_BUILTIN_CLFLUSH);
32495 x86_mfence = def_builtin (OPTION_MASK_ISA_SSE2, "__builtin_ia32_mfence",
32496 VOID_FTYPE_VOID, IX86_BUILTIN_MFENCE);
32497
32498 /* SSE3. */
32499 def_builtin (OPTION_MASK_ISA_SSE3, "__builtin_ia32_monitor",
32500 VOID_FTYPE_PCVOID_UNSIGNED_UNSIGNED, IX86_BUILTIN_MONITOR);
32501 def_builtin (OPTION_MASK_ISA_SSE3, "__builtin_ia32_mwait",
32502 VOID_FTYPE_UNSIGNED_UNSIGNED, IX86_BUILTIN_MWAIT);
32503
32504 /* AES */
32505 def_builtin_const (OPTION_MASK_ISA_AES, "__builtin_ia32_aesenc128",
32506 V2DI_FTYPE_V2DI_V2DI, IX86_BUILTIN_AESENC128);
32507 def_builtin_const (OPTION_MASK_ISA_AES, "__builtin_ia32_aesenclast128",
32508 V2DI_FTYPE_V2DI_V2DI, IX86_BUILTIN_AESENCLAST128);
32509 def_builtin_const (OPTION_MASK_ISA_AES, "__builtin_ia32_aesdec128",
32510 V2DI_FTYPE_V2DI_V2DI, IX86_BUILTIN_AESDEC128);
32511 def_builtin_const (OPTION_MASK_ISA_AES, "__builtin_ia32_aesdeclast128",
32512 V2DI_FTYPE_V2DI_V2DI, IX86_BUILTIN_AESDECLAST128);
32513 def_builtin_const (OPTION_MASK_ISA_AES, "__builtin_ia32_aesimc128",
32514 V2DI_FTYPE_V2DI, IX86_BUILTIN_AESIMC128);
32515 def_builtin_const (OPTION_MASK_ISA_AES, "__builtin_ia32_aeskeygenassist128",
32516 V2DI_FTYPE_V2DI_INT, IX86_BUILTIN_AESKEYGENASSIST128);
32517
32518 /* PCLMUL */
32519 def_builtin_const (OPTION_MASK_ISA_PCLMUL, "__builtin_ia32_pclmulqdq128",
32520 V2DI_FTYPE_V2DI_V2DI_INT, IX86_BUILTIN_PCLMULQDQ128);
32521
32522 /* RDRND */
32523 def_builtin (OPTION_MASK_ISA_RDRND, "__builtin_ia32_rdrand16_step",
32524 INT_FTYPE_PUSHORT, IX86_BUILTIN_RDRAND16_STEP);
32525 def_builtin (OPTION_MASK_ISA_RDRND, "__builtin_ia32_rdrand32_step",
32526 INT_FTYPE_PUNSIGNED, IX86_BUILTIN_RDRAND32_STEP);
32527 def_builtin (OPTION_MASK_ISA_RDRND | OPTION_MASK_ISA_64BIT,
32528 "__builtin_ia32_rdrand64_step", INT_FTYPE_PULONGLONG,
32529 IX86_BUILTIN_RDRAND64_STEP);
32530
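/* For illustration, a minimal sketch of the user-visible side of the RDRND
   step builtins above, through the documented <immintrin.h> wrapper (compile
   with -mrdrnd); assumes the usual _rdrand32_step interface.

     #include <immintrin.h>

     int
     get_random (unsigned int *out)
     {
       // Returns 1 on success, 0 if the hardware had no entropy ready.
       return _rdrand32_step (out);
     }
*/
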
32531 /* AVX2 */
32532 def_builtin (OPTION_MASK_ISA_AVX2, "__builtin_ia32_gathersiv2df",
32533 V2DF_FTYPE_V2DF_PCDOUBLE_V4SI_V2DF_INT,
32534 IX86_BUILTIN_GATHERSIV2DF);
32535
32536 def_builtin (OPTION_MASK_ISA_AVX2, "__builtin_ia32_gathersiv4df",
32537 V4DF_FTYPE_V4DF_PCDOUBLE_V4SI_V4DF_INT,
32538 IX86_BUILTIN_GATHERSIV4DF);
32539
32540 def_builtin (OPTION_MASK_ISA_AVX2, "__builtin_ia32_gatherdiv2df",
32541 V2DF_FTYPE_V2DF_PCDOUBLE_V2DI_V2DF_INT,
32542 IX86_BUILTIN_GATHERDIV2DF);
32543
32544 def_builtin (OPTION_MASK_ISA_AVX2, "__builtin_ia32_gatherdiv4df",
32545 V4DF_FTYPE_V4DF_PCDOUBLE_V4DI_V4DF_INT,
32546 IX86_BUILTIN_GATHERDIV4DF);
32547
32548 def_builtin (OPTION_MASK_ISA_AVX2, "__builtin_ia32_gathersiv4sf",
32549 V4SF_FTYPE_V4SF_PCFLOAT_V4SI_V4SF_INT,
32550 IX86_BUILTIN_GATHERSIV4SF);
32551
32552 def_builtin (OPTION_MASK_ISA_AVX2, "__builtin_ia32_gathersiv8sf",
32553 V8SF_FTYPE_V8SF_PCFLOAT_V8SI_V8SF_INT,
32554 IX86_BUILTIN_GATHERSIV8SF);
32555
32556 def_builtin (OPTION_MASK_ISA_AVX2, "__builtin_ia32_gatherdiv4sf",
32557 V4SF_FTYPE_V4SF_PCFLOAT_V2DI_V4SF_INT,
32558 IX86_BUILTIN_GATHERDIV4SF);
32559
32560 def_builtin (OPTION_MASK_ISA_AVX2, "__builtin_ia32_gatherdiv4sf256",
32561 V4SF_FTYPE_V4SF_PCFLOAT_V4DI_V4SF_INT,
32562 IX86_BUILTIN_GATHERDIV8SF);
32563
32564 def_builtin (OPTION_MASK_ISA_AVX2, "__builtin_ia32_gathersiv2di",
32565 V2DI_FTYPE_V2DI_PCINT64_V4SI_V2DI_INT,
32566 IX86_BUILTIN_GATHERSIV2DI);
32567
32568 def_builtin (OPTION_MASK_ISA_AVX2, "__builtin_ia32_gathersiv4di",
32569 V4DI_FTYPE_V4DI_PCINT64_V4SI_V4DI_INT,
32570 IX86_BUILTIN_GATHERSIV4DI);
32571
32572 def_builtin (OPTION_MASK_ISA_AVX2, "__builtin_ia32_gatherdiv2di",
32573 V2DI_FTYPE_V2DI_PCINT64_V2DI_V2DI_INT,
32574 IX86_BUILTIN_GATHERDIV2DI);
32575
32576 def_builtin (OPTION_MASK_ISA_AVX2, "__builtin_ia32_gatherdiv4di",
32577 V4DI_FTYPE_V4DI_PCINT64_V4DI_V4DI_INT,
32578 IX86_BUILTIN_GATHERDIV4DI);
32579
32580 def_builtin (OPTION_MASK_ISA_AVX2, "__builtin_ia32_gathersiv4si",
32581 V4SI_FTYPE_V4SI_PCINT_V4SI_V4SI_INT,
32582 IX86_BUILTIN_GATHERSIV4SI);
32583
32584 def_builtin (OPTION_MASK_ISA_AVX2, "__builtin_ia32_gathersiv8si",
32585 V8SI_FTYPE_V8SI_PCINT_V8SI_V8SI_INT,
32586 IX86_BUILTIN_GATHERSIV8SI);
32587
32588 def_builtin (OPTION_MASK_ISA_AVX2, "__builtin_ia32_gatherdiv4si",
32589 V4SI_FTYPE_V4SI_PCINT_V2DI_V4SI_INT,
32590 IX86_BUILTIN_GATHERDIV4SI);
32591
32592 def_builtin (OPTION_MASK_ISA_AVX2, "__builtin_ia32_gatherdiv4si256",
32593 V4SI_FTYPE_V4SI_PCINT_V4DI_V4SI_INT,
32594 IX86_BUILTIN_GATHERDIV8SI);
32595
32596 def_builtin (OPTION_MASK_ISA_AVX2, "__builtin_ia32_gatheraltsiv4df ",
32597 V4DF_FTYPE_V4DF_PCDOUBLE_V8SI_V4DF_INT,
32598 IX86_BUILTIN_GATHERALTSIV4DF);
32599
32600 def_builtin (OPTION_MASK_ISA_AVX2, "__builtin_ia32_gatheraltdiv4sf256 ",
32601 V8SF_FTYPE_V8SF_PCFLOAT_V4DI_V8SF_INT,
32602 IX86_BUILTIN_GATHERALTDIV8SF);
32603
32604 def_builtin (OPTION_MASK_ISA_AVX2, "__builtin_ia32_gatheraltsiv4di ",
32605 V4DI_FTYPE_V4DI_PCINT64_V8SI_V4DI_INT,
32606 IX86_BUILTIN_GATHERALTSIV4DI);
32607
32608 def_builtin (OPTION_MASK_ISA_AVX2, "__builtin_ia32_gatheraltdiv4si256 ",
32609 V8SI_FTYPE_V8SI_PCINT_V4DI_V8SI_INT,
32610 IX86_BUILTIN_GATHERALTDIV8SI);
32611
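/* For illustration, one of the gathers above as seen from user code: the
   <immintrin.h> wrapper _mm256_i32gather_pd sits on top of
   __builtin_ia32_gathersiv4df (a sketch assuming the documented intrinsic
   signature; compile with -mavx2).

     #include <immintrin.h>

     __m256d
     gather4 (const double *base, __m128i idx)
     {
       // Load base[idx[0..3]], scaling each 32-bit index by sizeof (double).
       return _mm256_i32gather_pd (base, idx, 8);
     }
*/
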
32612 /* AVX512F */
32613 def_builtin (OPTION_MASK_ISA_AVX512F, "__builtin_ia32_gathersiv16sf",
32614 V16SF_FTYPE_V16SF_PCVOID_V16SI_HI_INT,
32615 IX86_BUILTIN_GATHER3SIV16SF);
32616
32617 def_builtin (OPTION_MASK_ISA_AVX512F, "__builtin_ia32_gathersiv8df",
32618 V8DF_FTYPE_V8DF_PCVOID_V8SI_QI_INT,
32619 IX86_BUILTIN_GATHER3SIV8DF);
32620
32621 def_builtin (OPTION_MASK_ISA_AVX512F, "__builtin_ia32_gatherdiv16sf",
32622 V8SF_FTYPE_V8SF_PCVOID_V8DI_QI_INT,
32623 IX86_BUILTIN_GATHER3DIV16SF);
32624
32625 def_builtin (OPTION_MASK_ISA_AVX512F, "__builtin_ia32_gatherdiv8df",
32626 V8DF_FTYPE_V8DF_PCVOID_V8DI_QI_INT,
32627 IX86_BUILTIN_GATHER3DIV8DF);
32628
32629 def_builtin (OPTION_MASK_ISA_AVX512F, "__builtin_ia32_gathersiv16si",
32630 V16SI_FTYPE_V16SI_PCVOID_V16SI_HI_INT,
32631 IX86_BUILTIN_GATHER3SIV16SI);
32632
32633 def_builtin (OPTION_MASK_ISA_AVX512F, "__builtin_ia32_gathersiv8di",
32634 V8DI_FTYPE_V8DI_PCVOID_V8SI_QI_INT,
32635 IX86_BUILTIN_GATHER3SIV8DI);
32636
32637 def_builtin (OPTION_MASK_ISA_AVX512F, "__builtin_ia32_gatherdiv16si",
32638 V8SI_FTYPE_V8SI_PCVOID_V8DI_QI_INT,
32639 IX86_BUILTIN_GATHER3DIV16SI);
32640
32641 def_builtin (OPTION_MASK_ISA_AVX512F, "__builtin_ia32_gatherdiv8di",
32642 V8DI_FTYPE_V8DI_PCVOID_V8DI_QI_INT,
32643 IX86_BUILTIN_GATHER3DIV8DI);
32644
32645 def_builtin (OPTION_MASK_ISA_AVX512F, "__builtin_ia32_gatheraltsiv8df ",
32646 V8DF_FTYPE_V8DF_PCDOUBLE_V16SI_QI_INT,
32647 IX86_BUILTIN_GATHER3ALTSIV8DF);
32648
32649 def_builtin (OPTION_MASK_ISA_AVX512F, "__builtin_ia32_gatheraltdiv8sf ",
32650 V16SF_FTYPE_V16SF_PCFLOAT_V8DI_HI_INT,
32651 IX86_BUILTIN_GATHER3ALTDIV16SF);
32652
32653 def_builtin (OPTION_MASK_ISA_AVX512F, "__builtin_ia32_gatheraltsiv8di ",
32654 V8DI_FTYPE_V8DI_PCINT64_V16SI_QI_INT,
32655 IX86_BUILTIN_GATHER3ALTSIV8DI);
32656
32657 def_builtin (OPTION_MASK_ISA_AVX512F, "__builtin_ia32_gatheraltdiv8si ",
32658 V16SI_FTYPE_V16SI_PCINT_V8DI_HI_INT,
32659 IX86_BUILTIN_GATHER3ALTDIV16SI);
32660
32661 def_builtin (OPTION_MASK_ISA_AVX512F, "__builtin_ia32_scattersiv16sf",
32662 VOID_FTYPE_PVOID_HI_V16SI_V16SF_INT,
32663 IX86_BUILTIN_SCATTERSIV16SF);
32664
32665 def_builtin (OPTION_MASK_ISA_AVX512F, "__builtin_ia32_scattersiv8df",
32666 VOID_FTYPE_PVOID_QI_V8SI_V8DF_INT,
32667 IX86_BUILTIN_SCATTERSIV8DF);
32668
32669 def_builtin (OPTION_MASK_ISA_AVX512F, "__builtin_ia32_scatterdiv16sf",
32670 VOID_FTYPE_PVOID_QI_V8DI_V8SF_INT,
32671 IX86_BUILTIN_SCATTERDIV16SF);
32672
32673 def_builtin (OPTION_MASK_ISA_AVX512F, "__builtin_ia32_scatterdiv8df",
32674 VOID_FTYPE_PVOID_QI_V8DI_V8DF_INT,
32675 IX86_BUILTIN_SCATTERDIV8DF);
32676
32677 def_builtin (OPTION_MASK_ISA_AVX512F, "__builtin_ia32_scattersiv16si",
32678 VOID_FTYPE_PVOID_HI_V16SI_V16SI_INT,
32679 IX86_BUILTIN_SCATTERSIV16SI);
32680
32681 def_builtin (OPTION_MASK_ISA_AVX512F, "__builtin_ia32_scattersiv8di",
32682 VOID_FTYPE_PVOID_QI_V8SI_V8DI_INT,
32683 IX86_BUILTIN_SCATTERSIV8DI);
32684
32685 def_builtin (OPTION_MASK_ISA_AVX512F, "__builtin_ia32_scatterdiv16si",
32686 VOID_FTYPE_PVOID_QI_V8DI_V8SI_INT,
32687 IX86_BUILTIN_SCATTERDIV16SI);
32688
32689 def_builtin (OPTION_MASK_ISA_AVX512F, "__builtin_ia32_scatterdiv8di",
32690 VOID_FTYPE_PVOID_QI_V8DI_V8DI_INT,
32691 IX86_BUILTIN_SCATTERDIV8DI);
32692
32693 /* AVX512VL */
32694 def_builtin (OPTION_MASK_ISA_AVX512VL, "__builtin_ia32_gather3siv2df",
32695 V2DF_FTYPE_V2DF_PCVOID_V4SI_QI_INT,
32696 IX86_BUILTIN_GATHER3SIV2DF);
32697
32698 def_builtin (OPTION_MASK_ISA_AVX512VL, "__builtin_ia32_gather3siv4df",
32699 V4DF_FTYPE_V4DF_PCVOID_V4SI_QI_INT,
32700 IX86_BUILTIN_GATHER3SIV4DF);
32701
32702 def_builtin (OPTION_MASK_ISA_AVX512VL, "__builtin_ia32_gather3div2df",
32703 V2DF_FTYPE_V2DF_PCVOID_V2DI_QI_INT,
32704 IX86_BUILTIN_GATHER3DIV2DF);
32705
32706 def_builtin (OPTION_MASK_ISA_AVX512VL, "__builtin_ia32_gather3div4df",
32707 V4DF_FTYPE_V4DF_PCVOID_V4DI_QI_INT,
32708 IX86_BUILTIN_GATHER3DIV4DF);
32709
32710 def_builtin (OPTION_MASK_ISA_AVX512VL, "__builtin_ia32_gather3siv4sf",
32711 V4SF_FTYPE_V4SF_PCVOID_V4SI_QI_INT,
32712 IX86_BUILTIN_GATHER3SIV4SF);
32713
32714 def_builtin (OPTION_MASK_ISA_AVX512VL, "__builtin_ia32_gather3siv8sf",
32715 V8SF_FTYPE_V8SF_PCVOID_V8SI_QI_INT,
32716 IX86_BUILTIN_GATHER3SIV8SF);
32717
32718 def_builtin (OPTION_MASK_ISA_AVX512VL, "__builtin_ia32_gather3div4sf",
32719 V4SF_FTYPE_V4SF_PCVOID_V2DI_QI_INT,
32720 IX86_BUILTIN_GATHER3DIV4SF);
32721
32722 def_builtin (OPTION_MASK_ISA_AVX512VL, "__builtin_ia32_gather3div8sf",
32723 V4SF_FTYPE_V4SF_PCVOID_V4DI_QI_INT,
32724 IX86_BUILTIN_GATHER3DIV8SF);
32725
32726 def_builtin (OPTION_MASK_ISA_AVX512VL, "__builtin_ia32_gather3siv2di",
32727 V2DI_FTYPE_V2DI_PCVOID_V4SI_QI_INT,
32728 IX86_BUILTIN_GATHER3SIV2DI);
32729
32730 def_builtin (OPTION_MASK_ISA_AVX512VL, "__builtin_ia32_gather3siv4di",
32731 V4DI_FTYPE_V4DI_PCVOID_V4SI_QI_INT,
32732 IX86_BUILTIN_GATHER3SIV4DI);
32733
32734 def_builtin (OPTION_MASK_ISA_AVX512VL, "__builtin_ia32_gather3div2di",
32735 V2DI_FTYPE_V2DI_PCVOID_V2DI_QI_INT,
32736 IX86_BUILTIN_GATHER3DIV2DI);
32737
32738 def_builtin (OPTION_MASK_ISA_AVX512VL, "__builtin_ia32_gather3div4di",
32739 V4DI_FTYPE_V4DI_PCVOID_V4DI_QI_INT,
32740 IX86_BUILTIN_GATHER3DIV4DI);
32741
32742 def_builtin (OPTION_MASK_ISA_AVX512VL, "__builtin_ia32_gather3siv4si",
32743 V4SI_FTYPE_V4SI_PCVOID_V4SI_QI_INT,
32744 IX86_BUILTIN_GATHER3SIV4SI);
32745
32746 def_builtin (OPTION_MASK_ISA_AVX512VL, "__builtin_ia32_gather3siv8si",
32747 V8SI_FTYPE_V8SI_PCVOID_V8SI_QI_INT,
32748 IX86_BUILTIN_GATHER3SIV8SI);
32749
32750 def_builtin (OPTION_MASK_ISA_AVX512VL, "__builtin_ia32_gather3div4si",
32751 V4SI_FTYPE_V4SI_PCVOID_V2DI_QI_INT,
32752 IX86_BUILTIN_GATHER3DIV4SI);
32753
32754 def_builtin (OPTION_MASK_ISA_AVX512VL, "__builtin_ia32_gather3div8si",
32755 V4SI_FTYPE_V4SI_PCVOID_V4DI_QI_INT,
32756 IX86_BUILTIN_GATHER3DIV8SI);
32757
32758 def_builtin (OPTION_MASK_ISA_AVX512VL, "__builtin_ia32_gather3altsiv4df ",
32759 V4DF_FTYPE_V4DF_PCDOUBLE_V8SI_QI_INT,
32760 IX86_BUILTIN_GATHER3ALTSIV4DF);
32761
32762 def_builtin (OPTION_MASK_ISA_AVX512VL, "__builtin_ia32_gather3altdiv8sf ",
32763 V8SF_FTYPE_V8SF_PCFLOAT_V4DI_QI_INT,
32764 IX86_BUILTIN_GATHER3ALTDIV8SF);
32765
32766 def_builtin (OPTION_MASK_ISA_AVX512VL, "__builtin_ia32_gather3altsiv4di ",
32767 V4DI_FTYPE_V4DI_PCINT64_V8SI_QI_INT,
32768 IX86_BUILTIN_GATHER3ALTSIV4DI);
32769
32770 def_builtin (OPTION_MASK_ISA_AVX512VL, "__builtin_ia32_gather3altdiv8si ",
32771 V8SI_FTYPE_V8SI_PCINT_V4DI_QI_INT,
32772 IX86_BUILTIN_GATHER3ALTDIV8SI);
32773
32774 def_builtin (OPTION_MASK_ISA_AVX512VL, "__builtin_ia32_scattersiv8sf",
32775 VOID_FTYPE_PVOID_QI_V8SI_V8SF_INT,
32776 IX86_BUILTIN_SCATTERSIV8SF);
32777
32778 def_builtin (OPTION_MASK_ISA_AVX512VL, "__builtin_ia32_scattersiv4sf",
32779 VOID_FTYPE_PVOID_QI_V4SI_V4SF_INT,
32780 IX86_BUILTIN_SCATTERSIV4SF);
32781
32782 def_builtin (OPTION_MASK_ISA_AVX512VL, "__builtin_ia32_scattersiv4df",
32783 VOID_FTYPE_PVOID_QI_V4SI_V4DF_INT,
32784 IX86_BUILTIN_SCATTERSIV4DF);
32785
32786 def_builtin (OPTION_MASK_ISA_AVX512VL, "__builtin_ia32_scattersiv2df",
32787 VOID_FTYPE_PVOID_QI_V4SI_V2DF_INT,
32788 IX86_BUILTIN_SCATTERSIV2DF);
32789
32790 def_builtin (OPTION_MASK_ISA_AVX512VL, "__builtin_ia32_scatterdiv8sf",
32791 VOID_FTYPE_PVOID_QI_V4DI_V4SF_INT,
32792 IX86_BUILTIN_SCATTERDIV8SF);
32793
32794 def_builtin (OPTION_MASK_ISA_AVX512VL, "__builtin_ia32_scatterdiv4sf",
32795 VOID_FTYPE_PVOID_QI_V2DI_V4SF_INT,
32796 IX86_BUILTIN_SCATTERDIV4SF);
32797
32798 def_builtin (OPTION_MASK_ISA_AVX512VL, "__builtin_ia32_scatterdiv4df",
32799 VOID_FTYPE_PVOID_QI_V4DI_V4DF_INT,
32800 IX86_BUILTIN_SCATTERDIV4DF);
32801
32802 def_builtin (OPTION_MASK_ISA_AVX512VL, "__builtin_ia32_scatterdiv2df",
32803 VOID_FTYPE_PVOID_QI_V2DI_V2DF_INT,
32804 IX86_BUILTIN_SCATTERDIV2DF);
32805
32806 def_builtin (OPTION_MASK_ISA_AVX512VL, "__builtin_ia32_scattersiv8si",
32807 VOID_FTYPE_PVOID_QI_V8SI_V8SI_INT,
32808 IX86_BUILTIN_SCATTERSIV8SI);
32809
32810 def_builtin (OPTION_MASK_ISA_AVX512VL, "__builtin_ia32_scattersiv4si",
32811 VOID_FTYPE_PVOID_QI_V4SI_V4SI_INT,
32812 IX86_BUILTIN_SCATTERSIV4SI);
32813
32814 def_builtin (OPTION_MASK_ISA_AVX512VL, "__builtin_ia32_scattersiv4di",
32815 VOID_FTYPE_PVOID_QI_V4SI_V4DI_INT,
32816 IX86_BUILTIN_SCATTERSIV4DI);
32817
32818 def_builtin (OPTION_MASK_ISA_AVX512VL, "__builtin_ia32_scattersiv2di",
32819 VOID_FTYPE_PVOID_QI_V4SI_V2DI_INT,
32820 IX86_BUILTIN_SCATTERSIV2DI);
32821
32822 def_builtin (OPTION_MASK_ISA_AVX512VL, "__builtin_ia32_scatterdiv8si",
32823 VOID_FTYPE_PVOID_QI_V4DI_V4SI_INT,
32824 IX86_BUILTIN_SCATTERDIV8SI);
32825
32826 def_builtin (OPTION_MASK_ISA_AVX512VL, "__builtin_ia32_scatterdiv4si",
32827 VOID_FTYPE_PVOID_QI_V2DI_V4SI_INT,
32828 IX86_BUILTIN_SCATTERDIV4SI);
32829
32830 def_builtin (OPTION_MASK_ISA_AVX512VL, "__builtin_ia32_scatterdiv4di",
32831 VOID_FTYPE_PVOID_QI_V4DI_V4DI_INT,
32832 IX86_BUILTIN_SCATTERDIV4DI);
32833
32834 def_builtin (OPTION_MASK_ISA_AVX512VL, "__builtin_ia32_scatterdiv2di",
32835 VOID_FTYPE_PVOID_QI_V2DI_V2DI_INT,
32836 IX86_BUILTIN_SCATTERDIV2DI);
32837 def_builtin (OPTION_MASK_ISA_AVX512F, "__builtin_ia32_scatteraltsiv8df ",
32838 VOID_FTYPE_PDOUBLE_QI_V16SI_V8DF_INT,
32839 IX86_BUILTIN_SCATTERALTSIV8DF);
32840
32841 def_builtin (OPTION_MASK_ISA_AVX512F, "__builtin_ia32_scatteraltdiv8sf ",
32842 VOID_FTYPE_PFLOAT_HI_V8DI_V16SF_INT,
32843 IX86_BUILTIN_SCATTERALTDIV16SF);
32844
32845 def_builtin (OPTION_MASK_ISA_AVX512F, "__builtin_ia32_scatteraltsiv8di ",
32846 VOID_FTYPE_PLONGLONG_QI_V16SI_V8DI_INT,
32847 IX86_BUILTIN_SCATTERALTSIV8DI);
32848
32849 def_builtin (OPTION_MASK_ISA_AVX512F, "__builtin_ia32_scatteraltdiv8si ",
32850 VOID_FTYPE_PINT_HI_V8DI_V16SI_INT,
32851 IX86_BUILTIN_SCATTERALTDIV16SI);
32852
32853 /* AVX512PF */
32854 def_builtin (OPTION_MASK_ISA_AVX512PF, "__builtin_ia32_gatherpfdpd",
32855 VOID_FTYPE_QI_V8SI_PCVOID_INT_INT,
32856 IX86_BUILTIN_GATHERPFDPD);
32857 def_builtin (OPTION_MASK_ISA_AVX512PF, "__builtin_ia32_gatherpfdps",
32858 VOID_FTYPE_HI_V16SI_PCVOID_INT_INT,
32859 IX86_BUILTIN_GATHERPFDPS);
32860 def_builtin (OPTION_MASK_ISA_AVX512PF, "__builtin_ia32_gatherpfqpd",
32861 VOID_FTYPE_QI_V8DI_PCVOID_INT_INT,
32862 IX86_BUILTIN_GATHERPFQPD);
32863 def_builtin (OPTION_MASK_ISA_AVX512PF, "__builtin_ia32_gatherpfqps",
32864 VOID_FTYPE_QI_V8DI_PCVOID_INT_INT,
32865 IX86_BUILTIN_GATHERPFQPS);
32866 def_builtin (OPTION_MASK_ISA_AVX512PF, "__builtin_ia32_scatterpfdpd",
32867 VOID_FTYPE_QI_V8SI_PCVOID_INT_INT,
32868 IX86_BUILTIN_SCATTERPFDPD);
32869 def_builtin (OPTION_MASK_ISA_AVX512PF, "__builtin_ia32_scatterpfdps",
32870 VOID_FTYPE_HI_V16SI_PCVOID_INT_INT,
32871 IX86_BUILTIN_SCATTERPFDPS);
32872 def_builtin (OPTION_MASK_ISA_AVX512PF, "__builtin_ia32_scatterpfqpd",
32873 VOID_FTYPE_QI_V8DI_PCVOID_INT_INT,
32874 IX86_BUILTIN_SCATTERPFQPD);
32875 def_builtin (OPTION_MASK_ISA_AVX512PF, "__builtin_ia32_scatterpfqps",
32876 VOID_FTYPE_QI_V8DI_PCVOID_INT_INT,
32877 IX86_BUILTIN_SCATTERPFQPS);
32878
32879 /* SHA */
32880 def_builtin_const (OPTION_MASK_ISA_SHA, "__builtin_ia32_sha1msg1",
32881 V4SI_FTYPE_V4SI_V4SI, IX86_BUILTIN_SHA1MSG1);
32882 def_builtin_const (OPTION_MASK_ISA_SHA, "__builtin_ia32_sha1msg2",
32883 V4SI_FTYPE_V4SI_V4SI, IX86_BUILTIN_SHA1MSG2);
32884 def_builtin_const (OPTION_MASK_ISA_SHA, "__builtin_ia32_sha1nexte",
32885 V4SI_FTYPE_V4SI_V4SI, IX86_BUILTIN_SHA1NEXTE);
32886 def_builtin_const (OPTION_MASK_ISA_SHA, "__builtin_ia32_sha1rnds4",
32887 V4SI_FTYPE_V4SI_V4SI_INT, IX86_BUILTIN_SHA1RNDS4);
32888 def_builtin_const (OPTION_MASK_ISA_SHA, "__builtin_ia32_sha256msg1",
32889 V4SI_FTYPE_V4SI_V4SI, IX86_BUILTIN_SHA256MSG1);
32890 def_builtin_const (OPTION_MASK_ISA_SHA, "__builtin_ia32_sha256msg2",
32891 V4SI_FTYPE_V4SI_V4SI, IX86_BUILTIN_SHA256MSG2);
32892 def_builtin_const (OPTION_MASK_ISA_SHA, "__builtin_ia32_sha256rnds2",
32893 V4SI_FTYPE_V4SI_V4SI_V4SI, IX86_BUILTIN_SHA256RNDS2);
32894
32895 /* RTM. */
32896 def_builtin (OPTION_MASK_ISA_RTM, "__builtin_ia32_xabort",
32897 VOID_FTYPE_UNSIGNED, IX86_BUILTIN_XABORT);
32898
32899 /* MMX access to the vec_init patterns. */
32900 def_builtin_const (OPTION_MASK_ISA_MMX, "__builtin_ia32_vec_init_v2si",
32901 V2SI_FTYPE_INT_INT, IX86_BUILTIN_VEC_INIT_V2SI);
32902
32903 def_builtin_const (OPTION_MASK_ISA_MMX, "__builtin_ia32_vec_init_v4hi",
32904 V4HI_FTYPE_HI_HI_HI_HI,
32905 IX86_BUILTIN_VEC_INIT_V4HI);
32906
32907 def_builtin_const (OPTION_MASK_ISA_MMX, "__builtin_ia32_vec_init_v8qi",
32908 V8QI_FTYPE_QI_QI_QI_QI_QI_QI_QI_QI,
32909 IX86_BUILTIN_VEC_INIT_V8QI);
32910
32911 /* Access to the vec_extract patterns. */
32912 def_builtin_const (OPTION_MASK_ISA_SSE2, "__builtin_ia32_vec_ext_v2df",
32913 DOUBLE_FTYPE_V2DF_INT, IX86_BUILTIN_VEC_EXT_V2DF);
32914 def_builtin_const (OPTION_MASK_ISA_SSE2, "__builtin_ia32_vec_ext_v2di",
32915 DI_FTYPE_V2DI_INT, IX86_BUILTIN_VEC_EXT_V2DI);
32916 def_builtin_const (OPTION_MASK_ISA_SSE, "__builtin_ia32_vec_ext_v4sf",
32917 FLOAT_FTYPE_V4SF_INT, IX86_BUILTIN_VEC_EXT_V4SF);
32918 def_builtin_const (OPTION_MASK_ISA_SSE2, "__builtin_ia32_vec_ext_v4si",
32919 SI_FTYPE_V4SI_INT, IX86_BUILTIN_VEC_EXT_V4SI);
32920 def_builtin_const (OPTION_MASK_ISA_SSE2, "__builtin_ia32_vec_ext_v8hi",
32921 HI_FTYPE_V8HI_INT, IX86_BUILTIN_VEC_EXT_V8HI);
32922
32923 def_builtin_const (OPTION_MASK_ISA_SSE | OPTION_MASK_ISA_3DNOW_A,
32924 "__builtin_ia32_vec_ext_v4hi",
32925 HI_FTYPE_V4HI_INT, IX86_BUILTIN_VEC_EXT_V4HI);
32926
32927 def_builtin_const (OPTION_MASK_ISA_MMX, "__builtin_ia32_vec_ext_v2si",
32928 SI_FTYPE_V2SI_INT, IX86_BUILTIN_VEC_EXT_V2SI);
32929
32930 def_builtin_const (OPTION_MASK_ISA_SSE2, "__builtin_ia32_vec_ext_v16qi",
32931 QI_FTYPE_V16QI_INT, IX86_BUILTIN_VEC_EXT_V16QI);
32932
32933 /* Access to the vec_set patterns. */
32934 def_builtin_const (OPTION_MASK_ISA_SSE4_1 | OPTION_MASK_ISA_64BIT,
32935 "__builtin_ia32_vec_set_v2di",
32936 V2DI_FTYPE_V2DI_DI_INT, IX86_BUILTIN_VEC_SET_V2DI);
32937
32938 def_builtin_const (OPTION_MASK_ISA_SSE4_1, "__builtin_ia32_vec_set_v4sf",
32939 V4SF_FTYPE_V4SF_FLOAT_INT, IX86_BUILTIN_VEC_SET_V4SF);
32940
32941 def_builtin_const (OPTION_MASK_ISA_SSE4_1, "__builtin_ia32_vec_set_v4si",
32942 V4SI_FTYPE_V4SI_SI_INT, IX86_BUILTIN_VEC_SET_V4SI);
32943
32944 def_builtin_const (OPTION_MASK_ISA_SSE2, "__builtin_ia32_vec_set_v8hi",
32945 V8HI_FTYPE_V8HI_HI_INT, IX86_BUILTIN_VEC_SET_V8HI);
32946
32947 def_builtin_const (OPTION_MASK_ISA_SSE | OPTION_MASK_ISA_3DNOW_A,
32948 "__builtin_ia32_vec_set_v4hi",
32949 V4HI_FTYPE_V4HI_HI_INT, IX86_BUILTIN_VEC_SET_V4HI);
32950
32951 def_builtin_const (OPTION_MASK_ISA_SSE4_1, "__builtin_ia32_vec_set_v16qi",
32952 V16QI_FTYPE_V16QI_QI_INT, IX86_BUILTIN_VEC_SET_V16QI);
32953
32954 /* RDSEED */
32955 def_builtin (OPTION_MASK_ISA_RDSEED, "__builtin_ia32_rdseed_hi_step",
32956 INT_FTYPE_PUSHORT, IX86_BUILTIN_RDSEED16_STEP);
32957 def_builtin (OPTION_MASK_ISA_RDSEED, "__builtin_ia32_rdseed_si_step",
32958 INT_FTYPE_PUNSIGNED, IX86_BUILTIN_RDSEED32_STEP);
32959 def_builtin (OPTION_MASK_ISA_RDSEED | OPTION_MASK_ISA_64BIT,
32960 "__builtin_ia32_rdseed_di_step",
32961 INT_FTYPE_PULONGLONG, IX86_BUILTIN_RDSEED64_STEP);
32962
32963 /* ADCX */
32964 def_builtin (0, "__builtin_ia32_addcarryx_u32",
32965 UCHAR_FTYPE_UCHAR_UINT_UINT_PUNSIGNED, IX86_BUILTIN_ADDCARRYX32);
32966 def_builtin (OPTION_MASK_ISA_64BIT,
32967 "__builtin_ia32_addcarryx_u64",
32968 UCHAR_FTYPE_UCHAR_ULONGLONG_ULONGLONG_PULONGLONG,
32969 IX86_BUILTIN_ADDCARRYX64);
32970
32971 /* SBB */
32972 def_builtin (0, "__builtin_ia32_sbb_u32",
32973 UCHAR_FTYPE_UCHAR_UINT_UINT_PUNSIGNED, IX86_BUILTIN_SBB32);
32974 def_builtin (OPTION_MASK_ISA_64BIT,
32975 "__builtin_ia32_sbb_u64",
32976 UCHAR_FTYPE_UCHAR_ULONGLONG_ULONGLONG_PULONGLONG,
32977 IX86_BUILTIN_SBB64);
32978
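/* For illustration, a sketch of multi-word addition built on the
   add-with-carry builtins above, through the documented <immintrin.h>
   wrapper (assuming the usual _addcarryx_u32 interface; -madx enables the
   ADCX/ADOX forms).

     #include <immintrin.h>

     unsigned char
     add_u64_pair (unsigned int a_lo, unsigned int a_hi,
                   unsigned int b_lo, unsigned int b_hi,
                   unsigned int *lo, unsigned int *hi)
     {
       unsigned char c = _addcarryx_u32 (0, a_lo, b_lo, lo);
       return _addcarryx_u32 (c, a_hi, b_hi, hi);   // carry out
     }
*/
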
32979 /* Read/write FLAGS. */
32980 def_builtin (0, "__builtin_ia32_readeflags_u32",
32981 UNSIGNED_FTYPE_VOID, IX86_BUILTIN_READ_FLAGS);
32982 def_builtin (OPTION_MASK_ISA_64BIT, "__builtin_ia32_readeflags_u64",
32983 UINT64_FTYPE_VOID, IX86_BUILTIN_READ_FLAGS);
32984 def_builtin (0, "__builtin_ia32_writeeflags_u32",
32985 VOID_FTYPE_UNSIGNED, IX86_BUILTIN_WRITE_FLAGS);
32986 def_builtin (OPTION_MASK_ISA_64BIT, "__builtin_ia32_writeeflags_u64",
32987 VOID_FTYPE_UINT64, IX86_BUILTIN_WRITE_FLAGS);
32988
32989 /* CLFLUSHOPT. */
32990 def_builtin (OPTION_MASK_ISA_CLFLUSHOPT, "__builtin_ia32_clflushopt",
32991 VOID_FTYPE_PCVOID, IX86_BUILTIN_CLFLUSHOPT);
32992
32993 /* CLWB. */
32994 def_builtin (OPTION_MASK_ISA_CLWB, "__builtin_ia32_clwb",
32995 VOID_FTYPE_PCVOID, IX86_BUILTIN_CLWB);
32996
32997 /* MONITORX and MWAITX. */
32998 def_builtin (OPTION_MASK_ISA_MWAITX, "__builtin_ia32_monitorx",
32999 VOID_FTYPE_PCVOID_UNSIGNED_UNSIGNED, IX86_BUILTIN_MONITORX);
33000 def_builtin (OPTION_MASK_ISA_MWAITX, "__builtin_ia32_mwaitx",
33001 VOID_FTYPE_UNSIGNED_UNSIGNED_UNSIGNED, IX86_BUILTIN_MWAITX);
33002
33003 /* CLZERO. */
33004 def_builtin (OPTION_MASK_ISA_CLZERO, "__builtin_ia32_clzero",
33005 VOID_FTYPE_PCVOID, IX86_BUILTIN_CLZERO);
33006
33007 /* Add FMA4 multi-arg argument instructions */
33008 for (i = 0, d = bdesc_multi_arg; i < ARRAY_SIZE (bdesc_multi_arg); i++, d++)
33009 {
33010 BDESC_VERIFY (d->code, IX86_BUILTIN__BDESC_MULTI_ARG_FIRST, i);
33011 if (d->name == 0)
33012 continue;
33013
33014 ftype = (enum ix86_builtin_func_type) d->flag;
33015 def_builtin_const (d->mask, d->name, ftype, d->code);
33016 }
33017 BDESC_VERIFYS (IX86_BUILTIN__BDESC_MULTI_ARG_LAST,
33018 IX86_BUILTIN__BDESC_MULTI_ARG_FIRST,
33019 ARRAY_SIZE (bdesc_multi_arg) - 1);
33020 }
33021
33022 static void
33023 ix86_init_mpx_builtins ()
33024 {
33025 const struct builtin_description * d;
33026 enum ix86_builtin_func_type ftype;
33027 tree decl;
33028 size_t i;
33029
33030 for (i = 0, d = bdesc_mpx;
33031 i < ARRAY_SIZE (bdesc_mpx);
33032 i++, d++)
33033 {
33034 BDESC_VERIFY (d->code, IX86_BUILTIN__BDESC_MPX_FIRST, i);
33035 if (d->name == 0)
33036 continue;
33037
33038 ftype = (enum ix86_builtin_func_type) d->flag;
33039 decl = def_builtin (d->mask, d->name, ftype, d->code);
33040
33041 /* Without leaf and nothrow flags for MPX builtins,
33042 abnormal edges may follow their calls when setjmp
33043 is present in the function. Since we may have many
33044 MPX builtin calls, this creates lots of useless
33045 edges and enormous PHI nodes. To avoid this, mark
33046 MPX builtins as leaf and nothrow. */
33047 if (decl)
33048 {
33049 DECL_ATTRIBUTES (decl) = build_tree_list (get_identifier ("leaf"),
33050 NULL_TREE);
33051 TREE_NOTHROW (decl) = 1;
33052 }
33053 else
33054 {
33055 ix86_builtins_isa[(int)d->code].leaf_p = true;
33056 ix86_builtins_isa[(int)d->code].nothrow_p = true;
33057 }
33058 }
33059 BDESC_VERIFYS (IX86_BUILTIN__BDESC_MPX_LAST,
33060 IX86_BUILTIN__BDESC_MPX_FIRST,
33061 ARRAY_SIZE (bdesc_mpx) - 1);
33062
33063 for (i = 0, d = bdesc_mpx_const;
33064 i < ARRAY_SIZE (bdesc_mpx_const);
33065 i++, d++)
33066 {
33067 BDESC_VERIFY (d->code, IX86_BUILTIN__BDESC_MPX_CONST_FIRST, i);
33068 if (d->name == 0)
33069 continue;
33070
33071 ftype = (enum ix86_builtin_func_type) d->flag;
33072 decl = def_builtin_const (d->mask, d->name, ftype, d->code);
33073
33074 if (decl)
33075 {
33076 DECL_ATTRIBUTES (decl) = build_tree_list (get_identifier ("leaf"),
33077 NULL_TREE);
33078 TREE_NOTHROW (decl) = 1;
33079 }
33080 else
33081 {
33082 ix86_builtins_isa[(int)d->code].leaf_p = true;
33083 ix86_builtins_isa[(int)d->code].nothrow_p = true;
33084 }
33085 }
33086 BDESC_VERIFYS (IX86_BUILTIN__BDESC_MPX_CONST_LAST,
33087 IX86_BUILTIN__BDESC_MPX_CONST_FIRST,
33088 ARRAY_SIZE (bdesc_mpx_const) - 1);
33089 }
33090 #undef BDESC_VERIFY
33091 #undef BDESC_VERIFYS
33092
33093 /* This adds a condition to the basic_block NEW_BB in function FUNCTION_DECL
33094 to return a pointer to VERSION_DECL if the outcome of the expression
33095 formed by PREDICATE_CHAIN is true. This function will be called during
33096 version dispatch to decide which function version to execute. It returns
33097 the basic block at the end, to which more conditions can be added. */
33098
33099 static basic_block
33100 add_condition_to_bb (tree function_decl, tree version_decl,
33101 tree predicate_chain, basic_block new_bb)
33102 {
33103 gimple *return_stmt;
33104 tree convert_expr, result_var;
33105 gimple *convert_stmt;
33106 gimple *call_cond_stmt;
33107 gimple *if_else_stmt;
33108
33109 basic_block bb1, bb2, bb3;
33110 edge e12, e23;
33111
33112 tree cond_var, and_expr_var = NULL_TREE;
33113 gimple_seq gseq;
33114
33115 tree predicate_decl, predicate_arg;
33116
33117 push_cfun (DECL_STRUCT_FUNCTION (function_decl));
33118
33119 gcc_assert (new_bb != NULL);
33120 gseq = bb_seq (new_bb);
33121
33122
33123 convert_expr = build1 (CONVERT_EXPR, ptr_type_node,
33124 build_fold_addr_expr (version_decl));
33125 result_var = create_tmp_var (ptr_type_node);
33126 convert_stmt = gimple_build_assign (result_var, convert_expr);
33127 return_stmt = gimple_build_return (result_var);
33128
33129 if (predicate_chain == NULL_TREE)
33130 {
33131 gimple_seq_add_stmt (&gseq, convert_stmt);
33132 gimple_seq_add_stmt (&gseq, return_stmt);
33133 set_bb_seq (new_bb, gseq);
33134 gimple_set_bb (convert_stmt, new_bb);
33135 gimple_set_bb (return_stmt, new_bb);
33136 pop_cfun ();
33137 return new_bb;
33138 }
33139
33140 while (predicate_chain != NULL)
33141 {
33142 cond_var = create_tmp_var (integer_type_node);
33143 predicate_decl = TREE_PURPOSE (predicate_chain);
33144 predicate_arg = TREE_VALUE (predicate_chain);
33145 call_cond_stmt = gimple_build_call (predicate_decl, 1, predicate_arg);
33146 gimple_call_set_lhs (call_cond_stmt, cond_var);
33147
33148 gimple_set_block (call_cond_stmt, DECL_INITIAL (function_decl));
33149 gimple_set_bb (call_cond_stmt, new_bb);
33150 gimple_seq_add_stmt (&gseq, call_cond_stmt);
33151
33152 predicate_chain = TREE_CHAIN (predicate_chain);
33153
33154 if (and_expr_var == NULL)
33155 and_expr_var = cond_var;
33156 else
33157 {
33158 gimple *assign_stmt;
33159 /* Use MIN_EXPR to check whether any of the conditions evaluated to zero:
33160 and_expr_var = MIN_EXPR <cond_var, and_expr_var>. */
33161 assign_stmt = gimple_build_assign (and_expr_var,
33162 build2 (MIN_EXPR, integer_type_node,
33163 cond_var, and_expr_var));
33164
33165 gimple_set_block (assign_stmt, DECL_INITIAL (function_decl));
33166 gimple_set_bb (assign_stmt, new_bb);
33167 gimple_seq_add_stmt (&gseq, assign_stmt);
33168 }
33169 }
33170
33171 if_else_stmt = gimple_build_cond (GT_EXPR, and_expr_var,
33172 integer_zero_node,
33173 NULL_TREE, NULL_TREE);
33174 gimple_set_block (if_else_stmt, DECL_INITIAL (function_decl));
33175 gimple_set_bb (if_else_stmt, new_bb);
33176 gimple_seq_add_stmt (&gseq, if_else_stmt);
33177
33178 gimple_seq_add_stmt (&gseq, convert_stmt);
33179 gimple_seq_add_stmt (&gseq, return_stmt);
33180 set_bb_seq (new_bb, gseq);
33181
33182 bb1 = new_bb;
33183 e12 = split_block (bb1, if_else_stmt);
33184 bb2 = e12->dest;
33185 e12->flags &= ~EDGE_FALLTHRU;
33186 e12->flags |= EDGE_TRUE_VALUE;
33187
33188 e23 = split_block (bb2, return_stmt);
33189
33190 gimple_set_bb (convert_stmt, bb2);
33191 gimple_set_bb (return_stmt, bb2);
33192
33193 bb3 = e23->dest;
33194 make_edge (bb1, bb3, EDGE_FALSE_VALUE);
33195
33196 remove_edge (e23);
33197 make_edge (bb2, EXIT_BLOCK_PTR_FOR_FN (cfun), 0);
33198
33199 pop_cfun ();
33200
33201 return bb3;
33202 }
33203
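/* To make the control flow built above concrete, the resolver body that
   add_condition_to_bb grows for each version looks roughly like this C
   rendering (a sketch, not literal GIMPLE; the version and feature names
   are only illustrative):

     cond_1 = __builtin_cpu_is ("haswell");
     cond_2 = __builtin_cpu_supports ("avx2");
     cond   = MIN (cond_1, cond_2);        // zero if any predicate failed
     if (cond > 0)
       return (void *) &foo.versioned;
     // fall through to the next version's condition, added by a later call
*/
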
33204 /* This parses the attribute arguments to target in DECL and determines
33205 the right builtin to use to match the platform specification.
33206 It returns the priority value for this version decl. If PREDICATE_LIST
33207 is not NULL, it stores the list of cpu features that need to be checked
33208 before dispatching this function. */
33209
33210 static unsigned int
33211 get_builtin_code_for_version (tree decl, tree *predicate_list)
33212 {
33213 tree attrs;
33214 struct cl_target_option cur_target;
33215 tree target_node;
33216 struct cl_target_option *new_target;
33217 const char *arg_str = NULL;
33218 const char *attrs_str = NULL;
33219 char *tok_str = NULL;
33220 char *token;
33221
33222 /* Priority of i386 features, greater value is higher priority. This is
33223 used to decide the order in which function dispatch must happen. For
33224 instance, a version specialized for SSE4.2 should be checked for dispatch
33225 before a version for SSE3, as SSE4.2 implies SSE3. */
33226 enum feature_priority
33227 {
33228 P_ZERO = 0,
33229 P_MMX,
33230 P_SSE,
33231 P_SSE2,
33232 P_SSE3,
33233 P_SSSE3,
33234 P_PROC_SSSE3,
33235 P_SSE4_A,
33236 P_PROC_SSE4_A,
33237 P_SSE4_1,
33238 P_SSE4_2,
33239 P_PROC_SSE4_2,
33240 P_POPCNT,
33241 P_AES,
33242 P_PCLMUL,
33243 P_AVX,
33244 P_PROC_AVX,
33245 P_BMI,
33246 P_PROC_BMI,
33247 P_FMA4,
33248 P_XOP,
33249 P_PROC_XOP,
33250 P_FMA,
33251 P_PROC_FMA,
33252 P_BMI2,
33253 P_AVX2,
33254 P_PROC_AVX2,
33255 P_AVX512F,
33256 P_PROC_AVX512F
33257 };
33258
33259 enum feature_priority priority = P_ZERO;
33260
33261 /* These are the target attribute strings for which a dispatcher is
33262 available, from fold_builtin_cpu. */
33263
33264 static struct _feature_list
33265 {
33266 const char *const name;
33267 const enum feature_priority priority;
33268 }
33269 const feature_list[] =
33270 {
33271 {"mmx", P_MMX},
33272 {"sse", P_SSE},
33273 {"sse2", P_SSE2},
33274 {"sse3", P_SSE3},
33275 {"sse4a", P_SSE4_A},
33276 {"ssse3", P_SSSE3},
33277 {"sse4.1", P_SSE4_1},
33278 {"sse4.2", P_SSE4_2},
33279 {"popcnt", P_POPCNT},
33280 {"aes", P_AES},
33281 {"pclmul", P_PCLMUL},
33282 {"avx", P_AVX},
33283 {"bmi", P_BMI},
33284 {"fma4", P_FMA4},
33285 {"xop", P_XOP},
33286 {"fma", P_FMA},
33287 {"bmi2", P_BMI2},
33288 {"avx2", P_AVX2},
33289 {"avx512f", P_AVX512F}
33290 };
33291
33292
33293 static unsigned int NUM_FEATURES
33294 = sizeof (feature_list) / sizeof (struct _feature_list);
33295
33296 unsigned int i;
33297
33298 tree predicate_chain = NULL_TREE;
33299 tree predicate_decl, predicate_arg;
33300
33301 attrs = lookup_attribute ("target", DECL_ATTRIBUTES (decl));
33302 gcc_assert (attrs != NULL);
33303
33304 attrs = TREE_VALUE (TREE_VALUE (attrs));
33305
33306 gcc_assert (TREE_CODE (attrs) == STRING_CST);
33307 attrs_str = TREE_STRING_POINTER (attrs);
33308
33309 /* Return priority zero for default function. */
33310 if (strcmp (attrs_str, "default") == 0)
33311 return 0;
33312
33313 /* Handle arch= if specified. For priority, set it to be 1 more than
33314 the best instruction set the processor can handle. For instance, if
33315 there is a version for atom and a version for ssse3 (the highest ISA
33316 priority for atom), the atom version must be checked for dispatch
33317 before the ssse3 version. */
33318 if (strstr (attrs_str, "arch=") != NULL)
33319 {
33320 cl_target_option_save (&cur_target, &global_options);
33321 target_node = ix86_valid_target_attribute_tree (attrs, &global_options,
33322 &global_options_set);
33323
33324 gcc_assert (target_node);
33325 new_target = TREE_TARGET_OPTION (target_node);
33326 gcc_assert (new_target);
33327
33328 if (new_target->arch_specified && new_target->arch > 0)
33329 {
33330 switch (new_target->arch)
33331 {
33332 case PROCESSOR_CORE2:
33333 arg_str = "core2";
33334 priority = P_PROC_SSSE3;
33335 break;
33336 case PROCESSOR_NEHALEM:
33337 if (new_target->x_ix86_isa_flags & OPTION_MASK_ISA_AES)
33338 arg_str = "westmere";
33339 else
33340 /* We translate "arch=corei7" and "arch=nehalem" to
33341 "corei7" so that it is mapped to the M_INTEL_COREI7
33342 cpu type, covering all of the M_INTEL_COREI7_XXXs. */
33343 arg_str = "corei7";
33344 priority = P_PROC_SSE4_2;
33345 break;
33346 case PROCESSOR_SANDYBRIDGE:
33347 if (new_target->x_ix86_isa_flags & OPTION_MASK_ISA_F16C)
33348 arg_str = "ivybridge";
33349 else
33350 arg_str = "sandybridge";
33351 priority = P_PROC_AVX;
33352 break;
33353 case PROCESSOR_HASWELL:
33354 if (new_target->x_ix86_isa_flags & OPTION_MASK_ISA_AVX512VL)
33355 arg_str = "skylake-avx512";
33356 else if (new_target->x_ix86_isa_flags & OPTION_MASK_ISA_XSAVES)
33357 arg_str = "skylake";
33358 else if (new_target->x_ix86_isa_flags & OPTION_MASK_ISA_ADX)
33359 arg_str = "broadwell";
33360 else
33361 arg_str = "haswell";
33362 priority = P_PROC_AVX2;
33363 break;
33364 case PROCESSOR_BONNELL:
33365 arg_str = "bonnell";
33366 priority = P_PROC_SSSE3;
33367 break;
33368 case PROCESSOR_KNL:
33369 arg_str = "knl";
33370 priority = P_PROC_AVX512F;
33371 break;
33372 case PROCESSOR_SILVERMONT:
33373 arg_str = "silvermont";
33374 priority = P_PROC_SSE4_2;
33375 break;
33376 case PROCESSOR_AMDFAM10:
33377 arg_str = "amdfam10h";
33378 priority = P_PROC_SSE4_A;
33379 break;
33380 case PROCESSOR_BTVER1:
33381 arg_str = "btver1";
33382 priority = P_PROC_SSE4_A;
33383 break;
33384 case PROCESSOR_BTVER2:
33385 arg_str = "btver2";
33386 priority = P_PROC_BMI;
33387 break;
33388 case PROCESSOR_BDVER1:
33389 arg_str = "bdver1";
33390 priority = P_PROC_XOP;
33391 break;
33392 case PROCESSOR_BDVER2:
33393 arg_str = "bdver2";
33394 priority = P_PROC_FMA;
33395 break;
33396 case PROCESSOR_BDVER3:
33397 arg_str = "bdver3";
33398 priority = P_PROC_FMA;
33399 break;
33400 case PROCESSOR_BDVER4:
33401 arg_str = "bdver4";
33402 priority = P_PROC_AVX2;
33403 break;
33404 case PROCESSOR_ZNVER1:
33405 arg_str = "znver1";
33406 priority = P_PROC_AVX2;
33407 break;
33408 }
33409 }
33410
33411 cl_target_option_restore (&global_options, &cur_target);
33412
33413 if (predicate_list && arg_str == NULL)
33414 {
33415 error_at (DECL_SOURCE_LOCATION (decl),
33416 "No dispatcher found for the versioning attributes");
33417 return 0;
33418 }
33419
33420 if (predicate_list)
33421 {
33422 predicate_decl = ix86_builtins [(int) IX86_BUILTIN_CPU_IS];
33423 /* For a C string literal the length includes the trailing NULL. */
33424 predicate_arg = build_string_literal (strlen (arg_str) + 1, arg_str);
33425 predicate_chain = tree_cons (predicate_decl, predicate_arg,
33426 predicate_chain);
33427 }
33428 }
33429
33430 /* Process feature name. */
33431 tok_str = (char *) xmalloc (strlen (attrs_str) + 1);
33432 strcpy (tok_str, attrs_str);
33433 token = strtok (tok_str, ",");
33434 predicate_decl = ix86_builtins [(int) IX86_BUILTIN_CPU_SUPPORTS];
33435
33436 while (token != NULL)
33437 {
33438 /* Do not process "arch=" */
33439 if (strncmp (token, "arch=", 5) == 0)
33440 {
33441 token = strtok (NULL, ",");
33442 continue;
33443 }
33444 for (i = 0; i < NUM_FEATURES; ++i)
33445 {
33446 if (strcmp (token, feature_list[i].name) == 0)
33447 {
33448 if (predicate_list)
33449 {
33450 predicate_arg = build_string_literal (
33451 strlen (feature_list[i].name) + 1,
33452 feature_list[i].name);
33453 predicate_chain = tree_cons (predicate_decl, predicate_arg,
33454 predicate_chain);
33455 }
33456 /* Find the maximum priority feature. */
33457 if (feature_list[i].priority > priority)
33458 priority = feature_list[i].priority;
33459
33460 break;
33461 }
33462 }
33463 if (predicate_list && i == NUM_FEATURES)
33464 {
33465 error_at (DECL_SOURCE_LOCATION (decl),
33466 "No dispatcher found for %s", token);
33467 return 0;
33468 }
33469 token = strtok (NULL, ",");
33470 }
33471 free (tok_str);
33472
33473 if (predicate_list && predicate_chain == NULL_TREE)
33474 {
33475 error_at (DECL_SOURCE_LOCATION (decl),
33476 "No dispatcher found for the versioning attributes : %s",
33477 attrs_str);
33478 return 0;
33479 }
33480 else if (predicate_list)
33481 {
33482 predicate_chain = nreverse (predicate_chain);
33483 *predicate_list = predicate_chain;
33484 }
33485
33486 return priority;
33487 }
33488
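/* For illustration, the C++ source pattern this parsing serves (a minimal
   sketch of target-based function multiversioning):

     __attribute__ ((target ("default")))
     int dispatch_me () { return 0; }

     __attribute__ ((target ("sse4.2")))
     int dispatch_me () { return 42; }

     __attribute__ ((target ("arch=haswell")))
     int dispatch_me () { return 37; }

   Each non-default version is parsed here: "arch=haswell" yields an
   __builtin_cpu_is predicate plus a P_PROC_* priority, while "sse4.2"
   yields an __builtin_cpu_supports predicate and priority P_SSE4_2.  */
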
33489 /* This compares the priority of target features in function DECL1
33490 and DECL2. It returns positive value if DECL1 is higher priority,
33491 negative value if DECL2 is higher priority and 0 if they are the
33492 same. */
33493
33494 static int
33495 ix86_compare_version_priority (tree decl1, tree decl2)
33496 {
33497 unsigned int priority1 = get_builtin_code_for_version (decl1, NULL);
33498 unsigned int priority2 = get_builtin_code_for_version (decl2, NULL);
33499
33500 return (int)priority1 - (int)priority2;
33501 }
33502
33503 /* V1 and V2 point to function versions with different priorities
33504 based on the target ISA. This function compares their priorities. */
33505
33506 static int
33507 feature_compare (const void *v1, const void *v2)
33508 {
33509 typedef struct _function_version_info
33510 {
33511 tree version_decl;
33512 tree predicate_chain;
33513 unsigned int dispatch_priority;
33514 } function_version_info;
33515
33516 const function_version_info c1 = *(const function_version_info *)v1;
33517 const function_version_info c2 = *(const function_version_info *)v2;
33518 return (c2.dispatch_priority - c1.dispatch_priority);
33519 }
33520
33521 /* This function generates the dispatch function for
33522 multi-versioned functions. DISPATCH_DECL is the function which will
33523 contain the dispatch logic. FNDECLS is a vector of the function
33524 versions to dispatch among. EMPTY_BB is the basic block pointer
33525 in DISPATCH_DECL in which the dispatch code is generated. */
33526
33527 static int
33528 dispatch_function_versions (tree dispatch_decl,
33529 void *fndecls_p,
33530 basic_block *empty_bb)
33531 {
33532 tree default_decl;
33533 gimple *ifunc_cpu_init_stmt;
33534 gimple_seq gseq;
33535 int ix;
33536 tree ele;
33537 vec<tree> *fndecls;
33538 unsigned int num_versions = 0;
33539 unsigned int actual_versions = 0;
33540 unsigned int i;
33541
33542 struct _function_version_info
33543 {
33544 tree version_decl;
33545 tree predicate_chain;
33546 unsigned int dispatch_priority;
33547 }*function_version_info;
33548
33549 gcc_assert (dispatch_decl != NULL
33550 && fndecls_p != NULL
33551 && empty_bb != NULL);
33552
33553 /* fndecls_p is actually a vector. */
33554 fndecls = static_cast<vec<tree> *> (fndecls_p);
33555
33556 /* At least one more version other than the default. */
33557 num_versions = fndecls->length ();
33558 gcc_assert (num_versions >= 2);
33559
33560 function_version_info = (struct _function_version_info *)
33561 XNEWVEC (struct _function_version_info, (num_versions - 1));
33562
33563 /* The first version in the vector is the default decl. */
33564 default_decl = (*fndecls)[0];
33565
33566 push_cfun (DECL_STRUCT_FUNCTION (dispatch_decl));
33567
33568 gseq = bb_seq (*empty_bb);
33569 /* Function version dispatch is via IFUNC. IFUNC resolvers fire before
33570 constructors, so explicitly call __builtin_cpu_init here. */
33571 ifunc_cpu_init_stmt = gimple_build_call_vec (
33572 ix86_builtins [(int) IX86_BUILTIN_CPU_INIT], vNULL);
33573 gimple_seq_add_stmt (&gseq, ifunc_cpu_init_stmt);
33574 gimple_set_bb (ifunc_cpu_init_stmt, *empty_bb);
33575 set_bb_seq (*empty_bb, gseq);
33576
33577 pop_cfun ();
33578
33579
33580 for (ix = 1; fndecls->iterate (ix, &ele); ++ix)
33581 {
33582 tree version_decl = ele;
33583 tree predicate_chain = NULL_TREE;
33584 unsigned int priority;
33585 /* Get attribute string, parse it and find the right predicate decl.
33586 The predicate function could be a lengthy combination of many
33587 features, like arch-type and various isa-variants. */
33588 priority = get_builtin_code_for_version (version_decl,
33589 &predicate_chain);
33590
33591 if (predicate_chain == NULL_TREE)
33592 continue;
33593
33594 function_version_info [actual_versions].version_decl = version_decl;
33595 function_version_info [actual_versions].predicate_chain
33596 = predicate_chain;
33597 function_version_info [actual_versions].dispatch_priority = priority;
33598 actual_versions++;
33599 }
33600
33601 /* Sort the versions in descending order of dispatch priority. The
33602 priority is based on the ISA. This is not a perfect solution; there
33603 could still be ambiguity: if more than one function version is suitable
33604 to execute, which one should be dispatched? In the future, allow the
33605 user to specify a dispatch priority next to the version. */
33606 qsort (function_version_info, actual_versions,
33607 sizeof (struct _function_version_info), feature_compare);
33608
33609 for (i = 0; i < actual_versions; ++i)
33610 *empty_bb = add_condition_to_bb (dispatch_decl,
33611 function_version_info[i].version_decl,
33612 function_version_info[i].predicate_chain,
33613 *empty_bb);
33614
33615 /* Dispatch the default version at the end. */
33616 *empty_bb = add_condition_to_bb (dispatch_decl, default_decl,
33617 NULL, *empty_bb);
33618
33619 free (function_version_info);
33620 return 0;
33621 }
33622
33623 /* This function changes the assembler name for functions that are
33624 versions. If DECL is a function version and has a "target"
33625 attribute, it appends the attribute string to its assembler name. */
33626
33627 static tree
33628 ix86_mangle_function_version_assembler_name (tree decl, tree id)
33629 {
33630 tree version_attr;
33631 const char *orig_name, *version_string;
33632 char *attr_str, *assembler_name;
33633
33634 if (DECL_DECLARED_INLINE_P (decl)
33635 && lookup_attribute ("gnu_inline",
33636 DECL_ATTRIBUTES (decl)))
33637 error_at (DECL_SOURCE_LOCATION (decl),
33638 "Function versions cannot be marked as gnu_inline,"
33639 " bodies have to be generated");
33640
33641 if (DECL_VIRTUAL_P (decl)
33642 || DECL_VINDEX (decl))
33643 sorry ("Virtual function multiversioning not supported");
33644
33645 version_attr = lookup_attribute ("target", DECL_ATTRIBUTES (decl));
33646
33647 /* The target attribute string cannot be NULL. */
33648 gcc_assert (version_attr != NULL_TREE);
33649
33650 orig_name = IDENTIFIER_POINTER (id);
33651 version_string
33652 = TREE_STRING_POINTER (TREE_VALUE (TREE_VALUE (version_attr)));
33653
33654 if (strcmp (version_string, "default") == 0)
33655 return id;
33656
33657 attr_str = sorted_attr_string (TREE_VALUE (version_attr));
33658 assembler_name = XNEWVEC (char, strlen (orig_name) + strlen (attr_str) + 2);
33659
33660 sprintf (assembler_name, "%s.%s", orig_name, attr_str);
33661
33662 /* Allow assembler name to be modified if already set. */
33663 if (DECL_ASSEMBLER_NAME_SET_P (decl))
33664 SET_DECL_RTL (decl, NULL);
33665
33666 tree ret = get_identifier (assembler_name);
33667 XDELETEVEC (attr_str);
33668 XDELETEVEC (assembler_name);
33669 return ret;
33670 }
33671
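/* Illustrative example of the mangling above: a non-default version of
   "foo" declared with __attribute__ ((target ("avx2"))) gets the
   assembler name "foo.avx2" (the original name, a dot, then the sorted
   attribute string), while the "default" version keeps the plain name
   "foo".  */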
33672
33673 static tree
33674 ix86_mangle_decl_assembler_name (tree decl, tree id)
33675 {
33676 /* For function version, add the target suffix to the assembler name. */
33677 if (TREE_CODE (decl) == FUNCTION_DECL
33678 && DECL_FUNCTION_VERSIONED (decl))
33679 id = ix86_mangle_function_version_assembler_name (decl, id);
33680 #ifdef SUBTARGET_MANGLE_DECL_ASSEMBLER_NAME
33681 id = SUBTARGET_MANGLE_DECL_ASSEMBLER_NAME (decl, id);
33682 #endif
33683
33684 return id;
33685 }
33686
33687 /* Make a dispatcher declaration for the multi-versioned function DECL.
33688 Calls to DECL will be replaced with calls to the dispatcher
33689 by the front-end. Returns the decl of the dispatcher function. */
33690
33691 static tree
33692 ix86_get_function_versions_dispatcher (void *decl)
33693 {
33694 tree fn = (tree) decl;
33695 struct cgraph_node *node = NULL;
33696 struct cgraph_node *default_node = NULL;
33697 struct cgraph_function_version_info *node_v = NULL;
33698 struct cgraph_function_version_info *first_v = NULL;
33699
33700 tree dispatch_decl = NULL;
33701
33702 struct cgraph_function_version_info *default_version_info = NULL;
33703
33704 gcc_assert (fn != NULL && DECL_FUNCTION_VERSIONED (fn));
33705
33706 node = cgraph_node::get (fn);
33707 gcc_assert (node != NULL);
33708
33709 node_v = node->function_version ();
33710 gcc_assert (node_v != NULL);
33711
33712 if (node_v->dispatcher_resolver != NULL)
33713 return node_v->dispatcher_resolver;
33714
33715 /* Find the default version and make it the first node. */
33716 first_v = node_v;
33717 /* Go to the beginning of the chain. */
33718 while (first_v->prev != NULL)
33719 first_v = first_v->prev;
33720 default_version_info = first_v;
33721 while (default_version_info != NULL)
33722 {
33723 if (is_function_default_version
33724 (default_version_info->this_node->decl))
33725 break;
33726 default_version_info = default_version_info->next;
33727 }
33728
33729 /* If there is no default node, just return NULL. */
33730 if (default_version_info == NULL)
33731 return NULL;
33732
33733 /* Make default info the first node. */
33734 if (first_v != default_version_info)
33735 {
33736 default_version_info->prev->next = default_version_info->next;
33737 if (default_version_info->next)
33738 default_version_info->next->prev = default_version_info->prev;
33739 first_v->prev = default_version_info;
33740 default_version_info->next = first_v;
33741 default_version_info->prev = NULL;
33742 }
33743
33744 default_node = default_version_info->this_node;
33745
33746 #if defined (ASM_OUTPUT_TYPE_DIRECTIVE)
33747 if (targetm.has_ifunc_p ())
33748 {
33749 struct cgraph_function_version_info *it_v = NULL;
33750 struct cgraph_node *dispatcher_node = NULL;
33751 struct cgraph_function_version_info *dispatcher_version_info = NULL;
33752
33753 /* Right now, the dispatching is done via ifunc. */
33754 dispatch_decl = make_dispatcher_decl (default_node->decl);
33755
33756 dispatcher_node = cgraph_node::get_create (dispatch_decl);
33757 gcc_assert (dispatcher_node != NULL);
33758 dispatcher_node->dispatcher_function = 1;
33759 dispatcher_version_info
33760 = dispatcher_node->insert_new_function_version ();
33761 dispatcher_version_info->next = default_version_info;
33762 dispatcher_node->definition = 1;
33763
33764 /* Set the dispatcher for all the versions. */
33765 it_v = default_version_info;
33766 while (it_v != NULL)
33767 {
33768 it_v->dispatcher_resolver = dispatch_decl;
33769 it_v = it_v->next;
33770 }
33771 }
33772 else
33773 #endif
33774 {
33775 error_at (DECL_SOURCE_LOCATION (default_node->decl),
33776 "multiversioning needs ifunc which is not supported "
33777 "on this target");
33778 }
33779
33780 return dispatch_decl;
33781 }
33782
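/* Illustrative source-level view of what this hook supports (a sketch,
   not part of this file): several definitions of the same function,
   each with its own "target" attribute and exactly one marked
   "default", e.g.

     __attribute__ ((target ("default"))) int foo (void) { return 0; }
     __attribute__ ((target ("sse4.2")))  int foo (void) { return 1; }
     __attribute__ ((target ("avx2")))    int foo (void) { return 2; }

   The front end replaces calls to foo with calls to the dispatcher
   returned here, and the dispatcher is resolved through an IFUNC at
   load time.  */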
33783 /* Make the resolver function decl to dispatch the versions of
33784 a multi-versioned function, DEFAULT_DECL. Create an
33785 empty basic block in the resolver and store the pointer in
33786 EMPTY_BB. Return the decl of the resolver function. */
33787
33788 static tree
33789 make_resolver_func (const tree default_decl,
33790 const tree dispatch_decl,
33791 basic_block *empty_bb)
33792 {
33793 char *resolver_name;
33794 tree decl, type, decl_name, t;
33795 bool is_uniq = false;
33796
33797 /* IFUNCs have to be globally visible. So, if the default_decl is
33798 not, then the name of the IFUNC should be made unique. */
33799 if (TREE_PUBLIC (default_decl) == 0)
33800 is_uniq = true;
33801
33802 /* Append the filename to the resolver function if the versions are
33803 not externally visible. This is because the resolver function has
33804 to be externally visible for the loader to find it. So, appending
33805 the filename will prevent conflicts with a resolver function from
33806 another module which is based on the same version name. */
33807 resolver_name = make_unique_name (default_decl, "resolver", is_uniq);
33808
33809 /* The resolver function should return a (void *). */
33810 type = build_function_type_list (ptr_type_node, NULL_TREE);
33811
33812 decl = build_fn_decl (resolver_name, type);
33813 decl_name = get_identifier (resolver_name);
33814 SET_DECL_ASSEMBLER_NAME (decl, decl_name);
33815
33816 DECL_NAME (decl) = decl_name;
33817 TREE_USED (decl) = 1;
33818 DECL_ARTIFICIAL (decl) = 1;
33819 DECL_IGNORED_P (decl) = 0;
33820 /* IFUNC resolvers have to be externally visible. */
33821 TREE_PUBLIC (decl) = 1;
33822 DECL_UNINLINABLE (decl) = 1;
33823
33824 /* The resolver is not external; its body is generated. */
33825 DECL_EXTERNAL (decl) = 0;
33826 DECL_EXTERNAL (dispatch_decl) = 0;
33827
33828 DECL_CONTEXT (decl) = NULL_TREE;
33829 DECL_INITIAL (decl) = make_node (BLOCK);
33830 DECL_STATIC_CONSTRUCTOR (decl) = 0;
33831
33832 if (DECL_COMDAT_GROUP (default_decl)
33833 || TREE_PUBLIC (default_decl))
33834 {
33835 /* In this case, each translation unit with a call to this
33836 versioned function will put out a resolver. Ensure it
33837 is comdat to keep just one copy. */
33838 DECL_COMDAT (decl) = 1;
33839 make_decl_one_only (decl, DECL_ASSEMBLER_NAME (decl));
33840 }
33841 /* Build result decl and add to function_decl. */
33842 t = build_decl (UNKNOWN_LOCATION, RESULT_DECL, NULL_TREE, ptr_type_node);
33843 DECL_ARTIFICIAL (t) = 1;
33844 DECL_IGNORED_P (t) = 1;
33845 DECL_RESULT (decl) = t;
33846
33847 gimplify_function_tree (decl);
33848 push_cfun (DECL_STRUCT_FUNCTION (decl));
33849 *empty_bb = init_lowered_empty_function (decl, false, 0);
33850
33851 cgraph_node::add_new_function (decl, true);
33852 symtab->call_cgraph_insertion_hooks (cgraph_node::get_create (decl));
33853
33854 pop_cfun ();
33855
33856 gcc_assert (dispatch_decl != NULL);
33857 /* Mark dispatch_decl as "ifunc" with resolver as resolver_name. */
33858 DECL_ATTRIBUTES (dispatch_decl)
33859 = make_attribute ("ifunc", resolver_name, DECL_ATTRIBUTES (dispatch_decl));
33860
33861 /* Create the alias for dispatch to resolver here. */
33862 /*cgraph_create_function_alias (dispatch_decl, decl);*/
33863 cgraph_node::create_same_body_alias (dispatch_decl, decl);
33864 XDELETEVEC (resolver_name);
33865 return decl;
33866 }
33867
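/* Net effect of make_resolver_func (a rough sketch): for a versioned
   function foo, the dispatcher symbol is marked with an ifunc attribute
   naming the freshly built resolver, conceptually

     int foo (void) __attribute__ ((ifunc ("foo.resolver")));

   so the dynamic loader runs the resolver once and binds foo to the
   version address it returns.  The exact resolver name comes from
   make_unique_name and may have the file name appended when the default
   version is not public.  */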
33868 /* Generate the dispatching code body to dispatch multi-versioned function
33869 DECL. The target hook is called to process the "target" attributes and
33870 provide the code to dispatch the right function at run-time. NODE points
33871 to the dispatcher decl whose body will be created. */
33872
33873 static tree
33874 ix86_generate_version_dispatcher_body (void *node_p)
33875 {
33876 tree resolver_decl;
33877 basic_block empty_bb;
33878 tree default_ver_decl;
33879 struct cgraph_node *versn;
33880 struct cgraph_node *node;
33881
33882 struct cgraph_function_version_info *node_version_info = NULL;
33883 struct cgraph_function_version_info *versn_info = NULL;
33884
33885 node = (cgraph_node *)node_p;
33886
33887 node_version_info = node->function_version ();
33888 gcc_assert (node->dispatcher_function
33889 && node_version_info != NULL);
33890
33891 if (node_version_info->dispatcher_resolver)
33892 return node_version_info->dispatcher_resolver;
33893
33894 /* The first version in the chain corresponds to the default version. */
33895 default_ver_decl = node_version_info->next->this_node->decl;
33896
33897 /* node is going to be an alias, so remove the finalized bit. */
33898 node->definition = false;
33899
33900 resolver_decl = make_resolver_func (default_ver_decl,
33901 node->decl, &empty_bb);
33902
33903 node_version_info->dispatcher_resolver = resolver_decl;
33904
33905 push_cfun (DECL_STRUCT_FUNCTION (resolver_decl));
33906
33907 auto_vec<tree, 2> fn_ver_vec;
33908
33909 for (versn_info = node_version_info->next; versn_info;
33910 versn_info = versn_info->next)
33911 {
33912 versn = versn_info->this_node;
33913 /* Check for virtual functions here again, as by this time it should
33914 have been determined if this function needs a vtable index or
33915 not. This happens for methods in derived classes that override
33916 virtual methods in base classes but are not explicitly marked as
33917 virtual. */
33918 if (DECL_VINDEX (versn->decl))
33919 sorry ("Virtual function multiversioning not supported");
33920
33921 fn_ver_vec.safe_push (versn->decl);
33922 }
33923
33924 dispatch_function_versions (resolver_decl, &fn_ver_vec, &empty_bb);
33925 cgraph_edge::rebuild_edges ();
33926 pop_cfun ();
33927 return resolver_decl;
33928 }
33929 /* This builds the __processor_model struct type defined in
33930 libgcc/config/i386/cpuinfo.c. */
33931
33932 static tree
33933 build_processor_model_struct (void)
33934 {
33935 const char *field_name[] = {"__cpu_vendor", "__cpu_type", "__cpu_subtype",
33936 "__cpu_features"};
33937 tree field = NULL_TREE, field_chain = NULL_TREE;
33938 int i;
33939 tree type = make_node (RECORD_TYPE);
33940
33941 /* The first 3 fields are unsigned int. */
33942 for (i = 0; i < 3; ++i)
33943 {
33944 field = build_decl (UNKNOWN_LOCATION, FIELD_DECL,
33945 get_identifier (field_name[i]), unsigned_type_node);
33946 if (field_chain != NULL_TREE)
33947 DECL_CHAIN (field) = field_chain;
33948 field_chain = field;
33949 }
33950
33951 /* The last field is an array of unsigned integers of size one. */
33952 field = build_decl (UNKNOWN_LOCATION, FIELD_DECL,
33953 get_identifier (field_name[3]),
33954 build_array_type (unsigned_type_node,
33955 build_index_type (size_one_node)));
33956 if (field_chain != NULL_TREE)
33957 DECL_CHAIN (field) = field_chain;
33958 field_chain = field;
33959
33960 finish_builtin_struct (type, "__processor_model", field_chain, NULL_TREE);
33961 return type;
33962 }
33963
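/* For reference, the struct built above mirrors the declaration in
   libgcc/config/i386/cpuinfo.c, roughly:

     struct __processor_model
     {
       unsigned int __cpu_vendor;
       unsigned int __cpu_type;
       unsigned int __cpu_subtype;
       unsigned int __cpu_features[1];
     } __cpu_model;

   The layout built here has to stay in sync with that definition.  */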
33964 /* Returns an extern, comdat VAR_DECL of type TYPE and name NAME. */
33965
33966 static tree
33967 make_var_decl (tree type, const char *name)
33968 {
33969 tree new_decl;
33970
33971 new_decl = build_decl (UNKNOWN_LOCATION,
33972 VAR_DECL,
33973 get_identifier(name),
33974 type);
33975
33976 DECL_EXTERNAL (new_decl) = 1;
33977 TREE_STATIC (new_decl) = 1;
33978 TREE_PUBLIC (new_decl) = 1;
33979 DECL_INITIAL (new_decl) = 0;
33980 DECL_ARTIFICIAL (new_decl) = 0;
33981 DECL_PRESERVE_P (new_decl) = 1;
33982
33983 make_decl_one_only (new_decl, DECL_ASSEMBLER_NAME (new_decl));
33984 assemble_variable (new_decl, 0, 0, 0);
33985
33986 return new_decl;
33987 }
33988
33989 /* FNDECL is a __builtin_cpu_is or a __builtin_cpu_supports call that is folded
33990 into an integer defined in libgcc/config/i386/cpuinfo.c */
33991
33992 static tree
33993 fold_builtin_cpu (tree fndecl, tree *args)
33994 {
33995 unsigned int i;
33996 enum ix86_builtins fn_code = (enum ix86_builtins)
33997 DECL_FUNCTION_CODE (fndecl);
33998 tree param_string_cst = NULL;
33999
34000 /* This is the order of bit-fields in __processor_features in cpuinfo.c. */
34001 enum processor_features
34002 {
34003 F_CMOV = 0,
34004 F_MMX,
34005 F_POPCNT,
34006 F_SSE,
34007 F_SSE2,
34008 F_SSE3,
34009 F_SSSE3,
34010 F_SSE4_1,
34011 F_SSE4_2,
34012 F_AVX,
34013 F_AVX2,
34014 F_SSE4_A,
34015 F_FMA4,
34016 F_XOP,
34017 F_FMA,
34018 F_AVX512F,
34019 F_BMI,
34020 F_BMI2,
34021 F_AES,
34022 F_PCLMUL,
34023 F_AVX512VL,
34024 F_AVX512BW,
34025 F_AVX512DQ,
34026 F_AVX512CD,
34027 F_AVX512ER,
34028 F_AVX512PF,
34029 F_AVX512VBMI,
34030 F_AVX512IFMA,
34031 F_AVX5124VNNIW,
34032 F_AVX5124FMAPS,
34033 F_AVX512VPOPCNTDQ,
34034 F_MAX
34035 };
34036
34037 /* These are the values for vendor types and cpu types and subtypes
34038 in cpuinfo.c. Cpu types and subtypes are offset by the corresponding
34039 start value, which has to be subtracted before comparing against the field. */
34040 enum processor_model
34041 {
34042 M_INTEL = 1,
34043 M_AMD,
34044 M_CPU_TYPE_START,
34045 M_INTEL_BONNELL,
34046 M_INTEL_CORE2,
34047 M_INTEL_COREI7,
34048 M_AMDFAM10H,
34049 M_AMDFAM15H,
34050 M_INTEL_SILVERMONT,
34051 M_INTEL_KNL,
34052 M_AMD_BTVER1,
34053 M_AMD_BTVER2,
34054 M_CPU_SUBTYPE_START,
34055 M_INTEL_COREI7_NEHALEM,
34056 M_INTEL_COREI7_WESTMERE,
34057 M_INTEL_COREI7_SANDYBRIDGE,
34058 M_AMDFAM10H_BARCELONA,
34059 M_AMDFAM10H_SHANGHAI,
34060 M_AMDFAM10H_ISTANBUL,
34061 M_AMDFAM15H_BDVER1,
34062 M_AMDFAM15H_BDVER2,
34063 M_AMDFAM15H_BDVER3,
34064 M_AMDFAM15H_BDVER4,
34065 M_AMDFAM17H_ZNVER1,
34066 M_INTEL_COREI7_IVYBRIDGE,
34067 M_INTEL_COREI7_HASWELL,
34068 M_INTEL_COREI7_BROADWELL,
34069 M_INTEL_COREI7_SKYLAKE,
34070 M_INTEL_COREI7_SKYLAKE_AVX512
34071 };
34072
34073 static struct _arch_names_table
34074 {
34075 const char *const name;
34076 const enum processor_model model;
34077 }
34078 const arch_names_table[] =
34079 {
34080 {"amd", M_AMD},
34081 {"intel", M_INTEL},
34082 {"atom", M_INTEL_BONNELL},
34083 {"slm", M_INTEL_SILVERMONT},
34084 {"core2", M_INTEL_CORE2},
34085 {"corei7", M_INTEL_COREI7},
34086 {"nehalem", M_INTEL_COREI7_NEHALEM},
34087 {"westmere", M_INTEL_COREI7_WESTMERE},
34088 {"sandybridge", M_INTEL_COREI7_SANDYBRIDGE},
34089 {"ivybridge", M_INTEL_COREI7_IVYBRIDGE},
34090 {"haswell", M_INTEL_COREI7_HASWELL},
34091 {"broadwell", M_INTEL_COREI7_BROADWELL},
34092 {"skylake", M_INTEL_COREI7_SKYLAKE},
34093 {"skylake-avx512", M_INTEL_COREI7_SKYLAKE_AVX512},
34094 {"bonnell", M_INTEL_BONNELL},
34095 {"silvermont", M_INTEL_SILVERMONT},
34096 {"knl", M_INTEL_KNL},
34097 {"amdfam10h", M_AMDFAM10H},
34098 {"barcelona", M_AMDFAM10H_BARCELONA},
34099 {"shanghai", M_AMDFAM10H_SHANGHAI},
34100 {"istanbul", M_AMDFAM10H_ISTANBUL},
34101 {"btver1", M_AMD_BTVER1},
34102 {"amdfam15h", M_AMDFAM15H},
34103 {"bdver1", M_AMDFAM15H_BDVER1},
34104 {"bdver2", M_AMDFAM15H_BDVER2},
34105 {"bdver3", M_AMDFAM15H_BDVER3},
34106 {"bdver4", M_AMDFAM15H_BDVER4},
34107 {"btver2", M_AMD_BTVER2},
34108 {"znver1", M_AMDFAM17H_ZNVER1},
34109 };
34110
34111 static struct _isa_names_table
34112 {
34113 const char *const name;
34114 const enum processor_features feature;
34115 }
34116 const isa_names_table[] =
34117 {
34118 {"cmov", F_CMOV},
34119 {"mmx", F_MMX},
34120 {"popcnt", F_POPCNT},
34121 {"sse", F_SSE},
34122 {"sse2", F_SSE2},
34123 {"sse3", F_SSE3},
34124 {"ssse3", F_SSSE3},
34125 {"sse4a", F_SSE4_A},
34126 {"sse4.1", F_SSE4_1},
34127 {"sse4.2", F_SSE4_2},
34128 {"avx", F_AVX},
34129 {"fma4", F_FMA4},
34130 {"xop", F_XOP},
34131 {"fma", F_FMA},
34132 {"avx2", F_AVX2},
34133 {"avx512f", F_AVX512F},
34134 {"bmi", F_BMI},
34135 {"bmi2", F_BMI2},
34136 {"aes", F_AES},
34137 {"pclmul", F_PCLMUL},
34138 {"avx512vl",F_AVX512VL},
34139 {"avx512bw",F_AVX512BW},
34140 {"avx512dq",F_AVX512DQ},
34141 {"avx512cd",F_AVX512CD},
34142 {"avx512er",F_AVX512ER},
34143 {"avx512pf",F_AVX512PF},
34144 {"avx512vbmi",F_AVX512VBMI},
34145 {"avx512ifma",F_AVX512IFMA},
34146 {"avx5124vnniw",F_AVX5124VNNIW},
34147 {"avx5124fmaps",F_AVX5124FMAPS},
34148 {"avx512vpopcntdq",F_AVX512VPOPCNTDQ}
34149 };
34150
34151 tree __processor_model_type = build_processor_model_struct ();
34152 tree __cpu_model_var = make_var_decl (__processor_model_type,
34153 "__cpu_model");
34154
34155
34156 varpool_node::add (__cpu_model_var);
34157
34158 gcc_assert ((args != NULL) && (*args != NULL));
34159
34160 param_string_cst = *args;
34161 while (param_string_cst
34162 && TREE_CODE (param_string_cst) != STRING_CST)
34163 {
34164 /* *args must be an expr that can contain other EXPRs leading to a
34165 STRING_CST. */
34166 if (!EXPR_P (param_string_cst))
34167 {
34168 error ("Parameter to builtin must be a string constant or literal");
34169 return integer_zero_node;
34170 }
34171 param_string_cst = TREE_OPERAND (EXPR_CHECK (param_string_cst), 0);
34172 }
34173
34174 gcc_assert (param_string_cst);
34175
34176 if (fn_code == IX86_BUILTIN_CPU_IS)
34177 {
34178 tree ref;
34179 tree field;
34180 tree final;
34181
34182 unsigned int field_val = 0;
34183 unsigned int NUM_ARCH_NAMES
34184 = sizeof (arch_names_table) / sizeof (struct _arch_names_table);
34185
34186 for (i = 0; i < NUM_ARCH_NAMES; i++)
34187 if (strcmp (arch_names_table[i].name,
34188 TREE_STRING_POINTER (param_string_cst)) == 0)
34189 break;
34190
34191 if (i == NUM_ARCH_NAMES)
34192 {
34193 error ("Parameter to builtin not valid: %s",
34194 TREE_STRING_POINTER (param_string_cst));
34195 return integer_zero_node;
34196 }
34197
34198 field = TYPE_FIELDS (__processor_model_type);
34199 field_val = arch_names_table[i].model;
34200
34201 /* CPU types are stored in the next field. */
34202 if (field_val > M_CPU_TYPE_START
34203 && field_val < M_CPU_SUBTYPE_START)
34204 {
34205 field = DECL_CHAIN (field);
34206 field_val -= M_CPU_TYPE_START;
34207 }
34208
34209 /* CPU subtypes are stored in the next field. */
34210 if (field_val > M_CPU_SUBTYPE_START)
34211 {
34212 field = DECL_CHAIN (DECL_CHAIN (field));
34213 field_val -= M_CPU_SUBTYPE_START;
34214 }
34215
34216 /* Get the appropriate field in __cpu_model. */
34217 ref = build3 (COMPONENT_REF, TREE_TYPE (field), __cpu_model_var,
34218 field, NULL_TREE);
34219
34220 /* Check the value. */
34221 final = build2 (EQ_EXPR, unsigned_type_node, ref,
34222 build_int_cstu (unsigned_type_node, field_val));
34223 return build1 (CONVERT_EXPR, integer_type_node, final);
34224 }
34225 else if (fn_code == IX86_BUILTIN_CPU_SUPPORTS)
34226 {
34227 tree ref;
34228 tree array_elt;
34229 tree field;
34230 tree final;
34231
34232 unsigned int field_val = 0;
34233 unsigned int NUM_ISA_NAMES
34234 = sizeof (isa_names_table) / sizeof (struct _isa_names_table);
34235
34236 for (i = 0; i < NUM_ISA_NAMES; i++)
34237 if (strcmp (isa_names_table[i].name,
34238 TREE_STRING_POINTER (param_string_cst)) == 0)
34239 break;
34240
34241 if (i == NUM_ISA_NAMES)
34242 {
34243 error ("Parameter to builtin not valid: %s",
34244 TREE_STRING_POINTER (param_string_cst));
34245 return integer_zero_node;
34246 }
34247
34248 field = TYPE_FIELDS (__processor_model_type);
34249 /* Get the last field, which is __cpu_features. */
34250 while (DECL_CHAIN (field))
34251 field = DECL_CHAIN (field);
34252
34253 /* Get the appropriate field: __cpu_model.__cpu_features */
34254 ref = build3 (COMPONENT_REF, TREE_TYPE (field), __cpu_model_var,
34255 field, NULL_TREE);
34256
34257 /* Access the 0th element of __cpu_features array. */
34258 array_elt = build4 (ARRAY_REF, unsigned_type_node, ref,
34259 integer_zero_node, NULL_TREE, NULL_TREE);
34260
34261 field_val = (1 << isa_names_table[i].feature);
34262 /* Return __cpu_model.__cpu_features[0] & field_val */
34263 final = build2 (BIT_AND_EXPR, unsigned_type_node, array_elt,
34264 build_int_cstu (unsigned_type_node, field_val));
34265 return build1 (CONVERT_EXPR, integer_type_node, final);
34266 }
34267 gcc_unreachable ();
34268 }
34269
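/* Worked example of the folding above (illustrative): with the layout
   of __cpu_model described earlier,

     __builtin_cpu_is ("intel")        folds to
         (int) (__cpu_model.__cpu_vendor == M_INTEL)

     __builtin_cpu_supports ("avx2")   folds to
         (int) (__cpu_model.__cpu_features[0] & (1 << F_AVX2))

   so both builtins reduce to a load plus a compare or mask against the
   data that __builtin_cpu_init fills in.  */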
34270 static tree
34271 ix86_fold_builtin (tree fndecl, int n_args,
34272 tree *args, bool ignore ATTRIBUTE_UNUSED)
34273 {
34274 if (DECL_BUILT_IN_CLASS (fndecl) == BUILT_IN_MD)
34275 {
34276 enum ix86_builtins fn_code = (enum ix86_builtins)
34277 DECL_FUNCTION_CODE (fndecl);
34278 switch (fn_code)
34279 {
34280 case IX86_BUILTIN_CPU_IS:
34281 case IX86_BUILTIN_CPU_SUPPORTS:
34282 gcc_assert (n_args == 1);
34283 return fold_builtin_cpu (fndecl, args);
34284
34285 case IX86_BUILTIN_NANQ:
34286 case IX86_BUILTIN_NANSQ:
34287 {
34288 tree type = TREE_TYPE (TREE_TYPE (fndecl));
34289 const char *str = c_getstr (*args);
34290 int quiet = fn_code == IX86_BUILTIN_NANQ;
34291 REAL_VALUE_TYPE real;
34292
34293 if (str && real_nan (&real, str, quiet, TYPE_MODE (type)))
34294 return build_real (type, real);
34295 return NULL_TREE;
34296 }
34297
34298 case IX86_BUILTIN_INFQ:
34299 case IX86_BUILTIN_HUGE_VALQ:
34300 {
34301 tree type = TREE_TYPE (TREE_TYPE (fndecl));
34302 REAL_VALUE_TYPE inf;
34303 real_inf (&inf);
34304 return build_real (type, inf);
34305 }
34306
34307 case IX86_BUILTIN_TZCNT16:
34308 case IX86_BUILTIN_CTZS:
34309 case IX86_BUILTIN_TZCNT32:
34310 case IX86_BUILTIN_TZCNT64:
34311 gcc_assert (n_args == 1);
34312 if (TREE_CODE (args[0]) == INTEGER_CST)
34313 {
34314 tree type = TREE_TYPE (TREE_TYPE (fndecl));
34315 tree arg = args[0];
34316 if (fn_code == IX86_BUILTIN_TZCNT16
34317 || fn_code == IX86_BUILTIN_CTZS)
34318 arg = fold_convert (short_unsigned_type_node, arg);
34319 if (integer_zerop (arg))
34320 return build_int_cst (type, TYPE_PRECISION (TREE_TYPE (arg)));
34321 else
34322 return fold_const_call (CFN_CTZ, type, arg);
34323 }
34324 break;
34325
34326 case IX86_BUILTIN_LZCNT16:
34327 case IX86_BUILTIN_CLZS:
34328 case IX86_BUILTIN_LZCNT32:
34329 case IX86_BUILTIN_LZCNT64:
34330 gcc_assert (n_args == 1);
34331 if (TREE_CODE (args[0]) == INTEGER_CST)
34332 {
34333 tree type = TREE_TYPE (TREE_TYPE (fndecl));
34334 tree arg = args[0];
34335 if (fn_code == IX86_BUILTIN_LZCNT16
34336 || fn_code == IX86_BUILTIN_CLZS)
34337 arg = fold_convert (short_unsigned_type_node, arg);
34338 if (integer_zerop (arg))
34339 return build_int_cst (type, TYPE_PRECISION (TREE_TYPE (arg)));
34340 else
34341 return fold_const_call (CFN_CLZ, type, arg);
34342 }
34343 break;
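/* Worked examples of the two folds above (illustrative):
   __builtin_ia32_tzcnt_u32 (8) folds to 3 and
   __builtin_ia32_lzcnt_u32 (8) folds to 28, while a zero argument folds
   to the precision of the operand (16, 32 or 64) instead of being
   undefined as for plain bsf/bsr.  */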
34344
34345 case IX86_BUILTIN_BEXTR32:
34346 case IX86_BUILTIN_BEXTR64:
34347 case IX86_BUILTIN_BEXTRI32:
34348 case IX86_BUILTIN_BEXTRI64:
34349 gcc_assert (n_args == 2);
34350 if (tree_fits_uhwi_p (args[1]))
34351 {
34352 unsigned HOST_WIDE_INT res = 0;
34353 unsigned int prec = TYPE_PRECISION (TREE_TYPE (args[0]));
34354 unsigned int start = tree_to_uhwi (args[1]);
34355 unsigned int len = (start & 0xff00) >> 8;
34356 start &= 0xff;
34357 if (start >= prec || len == 0)
34358 res = 0;
34359 else if (!tree_fits_uhwi_p (args[0]))
34360 break;
34361 else
34362 res = tree_to_uhwi (args[0]) >> start;
34363 if (len > prec)
34364 len = prec;
34365 if (len < HOST_BITS_PER_WIDE_INT)
34366 res &= (HOST_WIDE_INT_1U << len) - 1;
34367 return build_int_cstu (TREE_TYPE (TREE_TYPE (fndecl)), res);
34368 }
34369 break;
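/* Worked example (illustrative): for
   __builtin_ia32_bextr_u32 (0x12345678, 0x0804) the packed second
   operand gives start 4 and length 8, so the call folds to
   (0x12345678 >> 4) & 0xff, i.e. the constant 0x67.  */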
34370
34371 case IX86_BUILTIN_BZHI32:
34372 case IX86_BUILTIN_BZHI64:
34373 gcc_assert (n_args == 2);
34374 if (tree_fits_uhwi_p (args[1]))
34375 {
34376 unsigned int idx = tree_to_uhwi (args[1]) & 0xff;
34377 if (idx >= TYPE_PRECISION (TREE_TYPE (args[0])))
34378 return args[0];
34379 if (!tree_fits_uhwi_p (args[0]))
34380 break;
34381 unsigned HOST_WIDE_INT res = tree_to_uhwi (args[0]);
34382 res &= ~(HOST_WIDE_INT_M1U << idx);
34383 return build_int_cstu (TREE_TYPE (TREE_TYPE (fndecl)), res);
34384 }
34385 break;
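/* E.g. __builtin_ia32_bzhi_si (0xabcdef12, 8) clears every bit from
   position 8 upwards and folds to 0x12, while an index of 32 or more
   leaves the first operand unchanged.  */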
34386
34387 case IX86_BUILTIN_PDEP32:
34388 case IX86_BUILTIN_PDEP64:
34389 gcc_assert (n_args == 2);
34390 if (tree_fits_uhwi_p (args[0]) && tree_fits_uhwi_p (args[1]))
34391 {
34392 unsigned HOST_WIDE_INT src = tree_to_uhwi (args[0]);
34393 unsigned HOST_WIDE_INT mask = tree_to_uhwi (args[1]);
34394 unsigned HOST_WIDE_INT res = 0;
34395 unsigned HOST_WIDE_INT m, k = 1;
34396 for (m = 1; m; m <<= 1)
34397 if ((mask & m) != 0)
34398 {
34399 if ((src & k) != 0)
34400 res |= m;
34401 k <<= 1;
34402 }
34403 return build_int_cstu (TREE_TYPE (TREE_TYPE (fndecl)), res);
34404 }
34405 break;
34406
34407 case IX86_BUILTIN_PEXT32:
34408 case IX86_BUILTIN_PEXT64:
34409 gcc_assert (n_args == 2);
34410 if (tree_fits_uhwi_p (args[0]) && tree_fits_uhwi_p (args[1]))
34411 {
34412 unsigned HOST_WIDE_INT src = tree_to_uhwi (args[0]);
34413 unsigned HOST_WIDE_INT mask = tree_to_uhwi (args[1]);
34414 unsigned HOST_WIDE_INT res = 0;
34415 unsigned HOST_WIDE_INT m, k = 1;
34416 for (m = 1; m; m <<= 1)
34417 if ((mask & m) != 0)
34418 {
34419 if ((src & m) != 0)
34420 res |= k;
34421 k <<= 1;
34422 }
34423 return build_int_cstu (TREE_TYPE (TREE_TYPE (fndecl)), res);
34424 }
34425 break;
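/* Worked PDEP/PEXT example (illustrative): with mask 0x6a
   (bits 1, 3, 5 and 6 set),
   __builtin_ia32_pdep_si (0x0b, 0x6a) scatters the low bits of 0x0b
   into those positions and folds to 0x4a, and
   __builtin_ia32_pext_si (0x4a, 0x6a) gathers them back and folds to
   0x0b.  */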
34426
34427 default:
34428 break;
34429 }
34430 }
34431
34432 #ifdef SUBTARGET_FOLD_BUILTIN
34433 return SUBTARGET_FOLD_BUILTIN (fndecl, n_args, args, ignore);
34434 #endif
34435
34436 return NULL_TREE;
34437 }
34438
34439 /* Fold an MD builtin (use ix86_fold_builtin for folding into a
34440 constant) in GIMPLE. */
34441
34442 bool
34443 ix86_gimple_fold_builtin (gimple_stmt_iterator *gsi)
34444 {
34445 gimple *stmt = gsi_stmt (*gsi);
34446 tree fndecl = gimple_call_fndecl (stmt);
34447 gcc_checking_assert (fndecl && DECL_BUILT_IN_CLASS (fndecl) == BUILT_IN_MD);
34448 int n_args = gimple_call_num_args (stmt);
34449 enum ix86_builtins fn_code = (enum ix86_builtins) DECL_FUNCTION_CODE (fndecl);
34450 tree decl = NULL_TREE;
34451 tree arg0, arg1;
34452
34453 switch (fn_code)
34454 {
34455 case IX86_BUILTIN_TZCNT32:
34456 decl = builtin_decl_implicit (BUILT_IN_CTZ);
34457 goto fold_tzcnt_lzcnt;
34458
34459 case IX86_BUILTIN_TZCNT64:
34460 decl = builtin_decl_implicit (BUILT_IN_CTZLL);
34461 goto fold_tzcnt_lzcnt;
34462
34463 case IX86_BUILTIN_LZCNT32:
34464 decl = builtin_decl_implicit (BUILT_IN_CLZ);
34465 goto fold_tzcnt_lzcnt;
34466
34467 case IX86_BUILTIN_LZCNT64:
34468 decl = builtin_decl_implicit (BUILT_IN_CLZLL);
34469 goto fold_tzcnt_lzcnt;
34470
34471 fold_tzcnt_lzcnt:
34472 gcc_assert (n_args == 1);
34473 arg0 = gimple_call_arg (stmt, 0);
34474 if (TREE_CODE (arg0) == SSA_NAME && decl && gimple_call_lhs (stmt))
34475 {
34476 int prec = TYPE_PRECISION (TREE_TYPE (arg0));
34477 /* If arg0 is provably non-zero, optimize into the generic
34478 __builtin_c[tl]z{,ll} functions, which the middle-end handles
34479 better. */
34480 if (!expr_not_equal_to (arg0, wi::zero (prec)))
34481 return false;
34482
34483 location_t loc = gimple_location (stmt);
34484 gimple *g = gimple_build_call (decl, 1, arg0);
34485 gimple_set_location (g, loc);
34486 tree lhs = make_ssa_name (integer_type_node);
34487 gimple_call_set_lhs (g, lhs);
34488 gsi_insert_before (gsi, g, GSI_SAME_STMT);
34489 g = gimple_build_assign (gimple_call_lhs (stmt), NOP_EXPR, lhs);
34490 gimple_set_location (g, loc);
34491 gsi_replace (gsi, g, false);
34492 return true;
34493 }
34494 break;
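/* Illustrative GIMPLE-level effect of the transform above: when range
   information proves x_2 != 0,
     _1 = __builtin_ia32_tzcnt_u32 (x_2);
   is rewritten as
     _3 = __builtin_ctz (x_2);
     _1 = (unsigned int) _3;
   so the middle-end can apply its generic ctz/clz optimizations.  */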
34495
34496 case IX86_BUILTIN_BZHI32:
34497 case IX86_BUILTIN_BZHI64:
34498 gcc_assert (n_args == 2);
34499 arg1 = gimple_call_arg (stmt, 1);
34500 if (tree_fits_uhwi_p (arg1) && gimple_call_lhs (stmt))
34501 {
34502 unsigned int idx = tree_to_uhwi (arg1) & 0xff;
34503 arg0 = gimple_call_arg (stmt, 0);
34504 if (idx < TYPE_PRECISION (TREE_TYPE (arg0)))
34505 break;
34506 location_t loc = gimple_location (stmt);
34507 gimple *g = gimple_build_assign (gimple_call_lhs (stmt), arg0);
34508 gimple_set_location (g, loc);
34509 gsi_replace (gsi, g, false);
34510 return true;
34511 }
34512 break;
34513
34514 case IX86_BUILTIN_PDEP32:
34515 case IX86_BUILTIN_PDEP64:
34516 case IX86_BUILTIN_PEXT32:
34517 case IX86_BUILTIN_PEXT64:
34518 gcc_assert (n_args == 2);
34519 arg1 = gimple_call_arg (stmt, 1);
34520 if (integer_all_onesp (arg1) && gimple_call_lhs (stmt))
34521 {
34522 location_t loc = gimple_location (stmt);
34523 arg0 = gimple_call_arg (stmt, 0);
34524 gimple *g = gimple_build_assign (gimple_call_lhs (stmt), arg0);
34525 gimple_set_location (g, loc);
34526 gsi_replace (gsi, g, false);
34527 return true;
34528 }
34529 break;
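/* E.g. __builtin_ia32_pdep_si (x, -1) and __builtin_ia32_pext_si (x, -1)
   are the identity on x, so the call is replaced by a plain copy of the
   first argument.  */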
34530
34531 default:
34532 break;
34533 }
34534
34535 return false;
34536 }
34537
34538 /* Make builtins to detect cpu type and features supported. NAME is
34539 the builtin name, CODE is the builtin code, and FTYPE is the function
34540 type of the builtin. */
34541
34542 static void
34543 make_cpu_type_builtin (const char* name, int code,
34544 enum ix86_builtin_func_type ftype, bool is_const)
34545 {
34546 tree decl;
34547 tree type;
34548
34549 type = ix86_get_builtin_func_type (ftype);
34550 decl = add_builtin_function (name, type, code, BUILT_IN_MD,
34551 NULL, NULL_TREE);
34552 gcc_assert (decl != NULL_TREE);
34553 ix86_builtins[(int) code] = decl;
34554 TREE_READONLY (decl) = is_const;
34555 }
34556
34557 /* Make builtins to get CPU type and features supported. The created
34558 builtins are:
34559
34560 __builtin_cpu_init (), to detect cpu type and features,
34561 __builtin_cpu_is ("<CPUNAME>"), to check if cpu is of type <CPUNAME>,
34562 __builtin_cpu_supports ("<FEATURE>"), to check if cpu supports <FEATURE>
34563 */
34564
34565 static void
34566 ix86_init_platform_type_builtins (void)
34567 {
34568 make_cpu_type_builtin ("__builtin_cpu_init", IX86_BUILTIN_CPU_INIT,
34569 INT_FTYPE_VOID, false);
34570 make_cpu_type_builtin ("__builtin_cpu_is", IX86_BUILTIN_CPU_IS,
34571 INT_FTYPE_PCCHAR, true);
34572 make_cpu_type_builtin ("__builtin_cpu_supports", IX86_BUILTIN_CPU_SUPPORTS,
34573 INT_FTYPE_PCCHAR, true);
34574 }
34575
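/* Typical user-level usage of the builtins registered above (an
   illustrative sketch, not part of this file):

     int
     pick_impl (void)
     {
       __builtin_cpu_init ();
       if (__builtin_cpu_supports ("avx2"))
         return 2;
       if (__builtin_cpu_is ("amdfam15h"))
         return 1;
       return 0;
     }

   When the argument is a string constant, fold_builtin_cpu above turns
   these calls into direct tests of __cpu_model.  */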
34576 /* Internal method for ix86_init_builtins. */
34577
34578 static void
34579 ix86_init_builtins_va_builtins_abi (void)
34580 {
34581 tree ms_va_ref, sysv_va_ref;
34582 tree fnvoid_va_end_ms, fnvoid_va_end_sysv;
34583 tree fnvoid_va_start_ms, fnvoid_va_start_sysv;
34584 tree fnvoid_va_copy_ms, fnvoid_va_copy_sysv;
34585 tree fnattr_ms = NULL_TREE, fnattr_sysv = NULL_TREE;
34586
34587 if (!TARGET_64BIT)
34588 return;
34589 fnattr_ms = build_tree_list (get_identifier ("ms_abi"), NULL_TREE);
34590 fnattr_sysv = build_tree_list (get_identifier ("sysv_abi"), NULL_TREE);
34591 ms_va_ref = build_reference_type (ms_va_list_type_node);
34592 sysv_va_ref =
34593 build_pointer_type (TREE_TYPE (sysv_va_list_type_node));
34594
34595 fnvoid_va_end_ms =
34596 build_function_type_list (void_type_node, ms_va_ref, NULL_TREE);
34597 fnvoid_va_start_ms =
34598 build_varargs_function_type_list (void_type_node, ms_va_ref, NULL_TREE);
34599 fnvoid_va_end_sysv =
34600 build_function_type_list (void_type_node, sysv_va_ref, NULL_TREE);
34601 fnvoid_va_start_sysv =
34602 build_varargs_function_type_list (void_type_node, sysv_va_ref,
34603 NULL_TREE);
34604 fnvoid_va_copy_ms =
34605 build_function_type_list (void_type_node, ms_va_ref, ms_va_list_type_node,
34606 NULL_TREE);
34607 fnvoid_va_copy_sysv =
34608 build_function_type_list (void_type_node, sysv_va_ref,
34609 sysv_va_ref, NULL_TREE);
34610
34611 add_builtin_function ("__builtin_ms_va_start", fnvoid_va_start_ms,
34612 BUILT_IN_VA_START, BUILT_IN_NORMAL, NULL, fnattr_ms);
34613 add_builtin_function ("__builtin_ms_va_end", fnvoid_va_end_ms,
34614 BUILT_IN_VA_END, BUILT_IN_NORMAL, NULL, fnattr_ms);
34615 add_builtin_function ("__builtin_ms_va_copy", fnvoid_va_copy_ms,
34616 BUILT_IN_VA_COPY, BUILT_IN_NORMAL, NULL, fnattr_ms);
34617 add_builtin_function ("__builtin_sysv_va_start", fnvoid_va_start_sysv,
34618 BUILT_IN_VA_START, BUILT_IN_NORMAL, NULL, fnattr_sysv);
34619 add_builtin_function ("__builtin_sysv_va_end", fnvoid_va_end_sysv,
34620 BUILT_IN_VA_END, BUILT_IN_NORMAL, NULL, fnattr_sysv);
34621 add_builtin_function ("__builtin_sysv_va_copy", fnvoid_va_copy_sysv,
34622 BUILT_IN_VA_COPY, BUILT_IN_NORMAL, NULL, fnattr_sysv);
34623 }
34624
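/* Illustrative use of the builtins registered above (assumes a 64-bit
   target): a function using the ms_abi calling convention works with the
   __builtin_ms_va_* family instead of the SysV one, e.g.

     __attribute__ ((ms_abi)) int
     first_vararg (int n, ...)
     {
       __builtin_ms_va_list ap;
       int v;
       __builtin_ms_va_start (ap, n);
       v = __builtin_va_arg (ap, int);
       __builtin_ms_va_end (ap);
       return v;
     }
*/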
34625 static void
34626 ix86_init_builtin_types (void)
34627 {
34628 tree float80_type_node, const_string_type_node;
34629
34630 /* The __float80 type. */
34631 float80_type_node = long_double_type_node;
34632 if (TYPE_MODE (float80_type_node) != XFmode)
34633 {
34634 if (float64x_type_node != NULL_TREE
34635 && TYPE_MODE (float64x_type_node) == XFmode)
34636 float80_type_node = float64x_type_node;
34637 else
34638 {
34639 /* The __float80 type. */
34640 float80_type_node = make_node (REAL_TYPE);
34641
34642 TYPE_PRECISION (float80_type_node) = 80;
34643 layout_type (float80_type_node);
34644 }
34645 }
34646 lang_hooks.types.register_builtin_type (float80_type_node, "__float80");
34647
34648 /* The __float128 type. The node has already been created as
34649 _Float128, so we only need to register the __float128 name for
34650 it. */
34651 lang_hooks.types.register_builtin_type (float128_type_node, "__float128");
34652
34653 const_string_type_node
34654 = build_pointer_type (build_qualified_type
34655 (char_type_node, TYPE_QUAL_CONST));
34656
34657 /* This macro is built by i386-builtin-types.awk. */
34658 DEFINE_BUILTIN_PRIMITIVE_TYPES;
34659 }
34660
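/* Illustrative effect of the registrations above: x86 user code can
   declare objects of these types directly, e.g.

     __float80  x;   // the 80-bit x87 extended type (XFmode)
     __float128 y;   // same node as _Float128 (TFmode)

   independently of what "long double" happens to be for the target.  */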
34661 static void
34662 ix86_init_builtins (void)
34663 {
34664 tree ftype, decl;
34665
34666 ix86_init_builtin_types ();
34667
34668 /* Builtins to get CPU type and features. */
34669 ix86_init_platform_type_builtins ();
34670
34671 /* TFmode support builtins. */
34672 def_builtin_const (0, "__builtin_infq",
34673 FLOAT128_FTYPE_VOID, IX86_BUILTIN_INFQ);
34674 def_builtin_const (0, "__builtin_huge_valq",
34675 FLOAT128_FTYPE_VOID, IX86_BUILTIN_HUGE_VALQ);
34676
34677 ftype = ix86_get_builtin_func_type (FLOAT128_FTYPE_CONST_STRING);
34678 decl = add_builtin_function ("__builtin_nanq", ftype, IX86_BUILTIN_NANQ,
34679 BUILT_IN_MD, "nanq", NULL_TREE);
34680 TREE_READONLY (decl) = 1;
34681 ix86_builtins[(int) IX86_BUILTIN_NANQ] = decl;
34682
34683 decl = add_builtin_function ("__builtin_nansq", ftype, IX86_BUILTIN_NANSQ,
34684 BUILT_IN_MD, "nansq", NULL_TREE);
34685 TREE_READONLY (decl) = 1;
34686 ix86_builtins[(int) IX86_BUILTIN_NANSQ] = decl;
34687
34688 /* We will expand them to a normal call if SSE isn't available since
34689 they are used by libgcc. */
34690 ftype = ix86_get_builtin_func_type (FLOAT128_FTYPE_FLOAT128);
34691 decl = add_builtin_function ("__builtin_fabsq", ftype, IX86_BUILTIN_FABSQ,
34692 BUILT_IN_MD, "__fabstf2", NULL_TREE);
34693 TREE_READONLY (decl) = 1;
34694 ix86_builtins[(int) IX86_BUILTIN_FABSQ] = decl;
34695
34696 ftype = ix86_get_builtin_func_type (FLOAT128_FTYPE_FLOAT128_FLOAT128);
34697 decl = add_builtin_function ("__builtin_copysignq", ftype,
34698 IX86_BUILTIN_COPYSIGNQ, BUILT_IN_MD,
34699 "__copysigntf3", NULL_TREE);
34700 TREE_READONLY (decl) = 1;
34701 ix86_builtins[(int) IX86_BUILTIN_COPYSIGNQ] = decl;
34702
34703 ix86_init_tm_builtins ();
34704 ix86_init_mmx_sse_builtins ();
34705 ix86_init_mpx_builtins ();
34706
34707 if (TARGET_LP64)
34708 ix86_init_builtins_va_builtins_abi ();
34709
34710 #ifdef SUBTARGET_INIT_BUILTINS
34711 SUBTARGET_INIT_BUILTINS;
34712 #endif
34713 }
34714
34715 /* Return the ix86 builtin for CODE. */
34716
34717 static tree
34718 ix86_builtin_decl (unsigned code, bool)
34719 {
34720 if (code >= IX86_BUILTIN_MAX)
34721 return error_mark_node;
34722
34723 return ix86_builtins[code];
34724 }
34725
34726 /* Errors in the source file can cause expand_expr to return const0_rtx
34727 where we expect a vector. To avoid crashing, use one of the vector
34728 clear instructions. */
34729 static rtx
34730 safe_vector_operand (rtx x, machine_mode mode)
34731 {
34732 if (x == const0_rtx)
34733 x = CONST0_RTX (mode);
34734 return x;
34735 }
34736
34737 /* Fix up modeless constants to fit the required mode. */
34738 static rtx
34739 fixup_modeless_constant (rtx x, machine_mode mode)
34740 {
34741 if (GET_MODE (x) == VOIDmode)
34742 x = convert_to_mode (mode, x, 1);
34743 return x;
34744 }
34745
34746 /* Subroutine of ix86_expand_builtin to take care of binop insns. */
34747
34748 static rtx
34749 ix86_expand_binop_builtin (enum insn_code icode, tree exp, rtx target)
34750 {
34751 rtx pat;
34752 tree arg0 = CALL_EXPR_ARG (exp, 0);
34753 tree arg1 = CALL_EXPR_ARG (exp, 1);
34754 rtx op0 = expand_normal (arg0);
34755 rtx op1 = expand_normal (arg1);
34756 machine_mode tmode = insn_data[icode].operand[0].mode;
34757 machine_mode mode0 = insn_data[icode].operand[1].mode;
34758 machine_mode mode1 = insn_data[icode].operand[2].mode;
34759
34760 if (VECTOR_MODE_P (mode0))
34761 op0 = safe_vector_operand (op0, mode0);
34762 if (VECTOR_MODE_P (mode1))
34763 op1 = safe_vector_operand (op1, mode1);
34764
34765 if (optimize || !target
34766 || GET_MODE (target) != tmode
34767 || !insn_data[icode].operand[0].predicate (target, tmode))
34768 target = gen_reg_rtx (tmode);
34769
34770 if (GET_MODE (op1) == SImode && mode1 == TImode)
34771 {
34772 rtx x = gen_reg_rtx (V4SImode);
34773 emit_insn (gen_sse2_loadd (x, op1));
34774 op1 = gen_lowpart (TImode, x);
34775 }
34776
34777 if (!insn_data[icode].operand[1].predicate (op0, mode0))
34778 op0 = copy_to_mode_reg (mode0, op0);
34779 if (!insn_data[icode].operand[2].predicate (op1, mode1))
34780 op1 = copy_to_mode_reg (mode1, op1);
34781
34782 pat = GEN_FCN (icode) (target, op0, op1);
34783 if (! pat)
34784 return 0;
34785
34786 emit_insn (pat);
34787
34788 return target;
34789 }
34790
34791 /* Subroutine of ix86_expand_builtin to take care of 2-4 argument insns. */
34792
34793 static rtx
34794 ix86_expand_multi_arg_builtin (enum insn_code icode, tree exp, rtx target,
34795 enum ix86_builtin_func_type m_type,
34796 enum rtx_code sub_code)
34797 {
34798 rtx pat;
34799 int i;
34800 int nargs;
34801 bool comparison_p = false;
34802 bool tf_p = false;
34803 bool last_arg_constant = false;
34804 int num_memory = 0;
34805 struct {
34806 rtx op;
34807 machine_mode mode;
34808 } args[4];
34809
34810 machine_mode tmode = insn_data[icode].operand[0].mode;
34811
34812 switch (m_type)
34813 {
34814 case MULTI_ARG_4_DF2_DI_I:
34815 case MULTI_ARG_4_DF2_DI_I1:
34816 case MULTI_ARG_4_SF2_SI_I:
34817 case MULTI_ARG_4_SF2_SI_I1:
34818 nargs = 4;
34819 last_arg_constant = true;
34820 break;
34821
34822 case MULTI_ARG_3_SF:
34823 case MULTI_ARG_3_DF:
34824 case MULTI_ARG_3_SF2:
34825 case MULTI_ARG_3_DF2:
34826 case MULTI_ARG_3_DI:
34827 case MULTI_ARG_3_SI:
34828 case MULTI_ARG_3_SI_DI:
34829 case MULTI_ARG_3_HI:
34830 case MULTI_ARG_3_HI_SI:
34831 case MULTI_ARG_3_QI:
34832 case MULTI_ARG_3_DI2:
34833 case MULTI_ARG_3_SI2:
34834 case MULTI_ARG_3_HI2:
34835 case MULTI_ARG_3_QI2:
34836 nargs = 3;
34837 break;
34838
34839 case MULTI_ARG_2_SF:
34840 case MULTI_ARG_2_DF:
34841 case MULTI_ARG_2_DI:
34842 case MULTI_ARG_2_SI:
34843 case MULTI_ARG_2_HI:
34844 case MULTI_ARG_2_QI:
34845 nargs = 2;
34846 break;
34847
34848 case MULTI_ARG_2_DI_IMM:
34849 case MULTI_ARG_2_SI_IMM:
34850 case MULTI_ARG_2_HI_IMM:
34851 case MULTI_ARG_2_QI_IMM:
34852 nargs = 2;
34853 last_arg_constant = true;
34854 break;
34855
34856 case MULTI_ARG_1_SF:
34857 case MULTI_ARG_1_DF:
34858 case MULTI_ARG_1_SF2:
34859 case MULTI_ARG_1_DF2:
34860 case MULTI_ARG_1_DI:
34861 case MULTI_ARG_1_SI:
34862 case MULTI_ARG_1_HI:
34863 case MULTI_ARG_1_QI:
34864 case MULTI_ARG_1_SI_DI:
34865 case MULTI_ARG_1_HI_DI:
34866 case MULTI_ARG_1_HI_SI:
34867 case MULTI_ARG_1_QI_DI:
34868 case MULTI_ARG_1_QI_SI:
34869 case MULTI_ARG_1_QI_HI:
34870 nargs = 1;
34871 break;
34872
34873 case MULTI_ARG_2_DI_CMP:
34874 case MULTI_ARG_2_SI_CMP:
34875 case MULTI_ARG_2_HI_CMP:
34876 case MULTI_ARG_2_QI_CMP:
34877 nargs = 2;
34878 comparison_p = true;
34879 break;
34880
34881 case MULTI_ARG_2_SF_TF:
34882 case MULTI_ARG_2_DF_TF:
34883 case MULTI_ARG_2_DI_TF:
34884 case MULTI_ARG_2_SI_TF:
34885 case MULTI_ARG_2_HI_TF:
34886 case MULTI_ARG_2_QI_TF:
34887 nargs = 2;
34888 tf_p = true;
34889 break;
34890
34891 default:
34892 gcc_unreachable ();
34893 }
34894
34895 if (optimize || !target
34896 || GET_MODE (target) != tmode
34897 || !insn_data[icode].operand[0].predicate (target, tmode))
34898 target = gen_reg_rtx (tmode);
34899 else if (memory_operand (target, tmode))
34900 num_memory++;
34901
34902 gcc_assert (nargs <= 4);
34903
34904 for (i = 0; i < nargs; i++)
34905 {
34906 tree arg = CALL_EXPR_ARG (exp, i);
34907 rtx op = expand_normal (arg);
34908 int adjust = (comparison_p) ? 1 : 0;
34909 machine_mode mode = insn_data[icode].operand[i+adjust+1].mode;
34910
34911 if (last_arg_constant && i == nargs - 1)
34912 {
34913 if (!insn_data[icode].operand[i + 1].predicate (op, mode))
34914 {
34915 enum insn_code new_icode = icode;
34916 switch (icode)
34917 {
34918 case CODE_FOR_xop_vpermil2v2df3:
34919 case CODE_FOR_xop_vpermil2v4sf3:
34920 case CODE_FOR_xop_vpermil2v4df3:
34921 case CODE_FOR_xop_vpermil2v8sf3:
34922 error ("the last argument must be a 2-bit immediate");
34923 return gen_reg_rtx (tmode);
34924 case CODE_FOR_xop_rotlv2di3:
34925 new_icode = CODE_FOR_rotlv2di3;
34926 goto xop_rotl;
34927 case CODE_FOR_xop_rotlv4si3:
34928 new_icode = CODE_FOR_rotlv4si3;
34929 goto xop_rotl;
34930 case CODE_FOR_xop_rotlv8hi3:
34931 new_icode = CODE_FOR_rotlv8hi3;
34932 goto xop_rotl;
34933 case CODE_FOR_xop_rotlv16qi3:
34934 new_icode = CODE_FOR_rotlv16qi3;
34935 xop_rotl:
34936 if (CONST_INT_P (op))
34937 {
34938 int mask = GET_MODE_UNIT_BITSIZE (tmode) - 1;
34939 op = GEN_INT (INTVAL (op) & mask);
34940 gcc_checking_assert
34941 (insn_data[icode].operand[i + 1].predicate (op, mode));
34942 }
34943 else
34944 {
34945 gcc_checking_assert
34946 (nargs == 2
34947 && insn_data[new_icode].operand[0].mode == tmode
34948 && insn_data[new_icode].operand[1].mode == tmode
34949 && insn_data[new_icode].operand[2].mode == mode
34950 && insn_data[new_icode].operand[0].predicate
34951 == insn_data[icode].operand[0].predicate
34952 && insn_data[new_icode].operand[1].predicate
34953 == insn_data[icode].operand[1].predicate);
34954 icode = new_icode;
34955 goto non_constant;
34956 }
34957 break;
34958 default:
34959 gcc_unreachable ();
34960 }
34961 }
34962 }
34963 else
34964 {
34965 non_constant:
34966 if (VECTOR_MODE_P (mode))
34967 op = safe_vector_operand (op, mode);
34968
34969 /* If we aren't optimizing, only allow one memory operand to be
34970 generated. */
34971 if (memory_operand (op, mode))
34972 num_memory++;
34973
34974 gcc_assert (GET_MODE (op) == mode || GET_MODE (op) == VOIDmode);
34975
34976 if (optimize
34977 || !insn_data[icode].operand[i+adjust+1].predicate (op, mode)
34978 || num_memory > 1)
34979 op = force_reg (mode, op);
34980 }
34981
34982 args[i].op = op;
34983 args[i].mode = mode;
34984 }
34985
34986 switch (nargs)
34987 {
34988 case 1:
34989 pat = GEN_FCN (icode) (target, args[0].op);
34990 break;
34991
34992 case 2:
34993 if (tf_p)
34994 pat = GEN_FCN (icode) (target, args[0].op, args[1].op,
34995 GEN_INT ((int)sub_code));
34996 else if (! comparison_p)
34997 pat = GEN_FCN (icode) (target, args[0].op, args[1].op);
34998 else
34999 {
35000 rtx cmp_op = gen_rtx_fmt_ee (sub_code, GET_MODE (target),
35001 args[0].op,
35002 args[1].op);
35003
35004 pat = GEN_FCN (icode) (target, cmp_op, args[0].op, args[1].op);
35005 }
35006 break;
35007
35008 case 3:
35009 pat = GEN_FCN (icode) (target, args[0].op, args[1].op, args[2].op);
35010 break;
35011
35012 case 4:
35013 pat = GEN_FCN (icode) (target, args[0].op, args[1].op, args[2].op, args[3].op);
35014 break;
35015
35016 default:
35017 gcc_unreachable ();
35018 }
35019
35020 if (! pat)
35021 return 0;
35022
35023 emit_insn (pat);
35024 return target;
35025 }
35026
35027 /* Subroutine of ix86_expand_args_builtin to take care of scalar unop
35028 insns with vec_merge. */
35029
35030 static rtx
35031 ix86_expand_unop_vec_merge_builtin (enum insn_code icode, tree exp,
35032 rtx target)
35033 {
35034 rtx pat;
35035 tree arg0 = CALL_EXPR_ARG (exp, 0);
35036 rtx op1, op0 = expand_normal (arg0);
35037 machine_mode tmode = insn_data[icode].operand[0].mode;
35038 machine_mode mode0 = insn_data[icode].operand[1].mode;
35039
35040 if (optimize || !target
35041 || GET_MODE (target) != tmode
35042 || !insn_data[icode].operand[0].predicate (target, tmode))
35043 target = gen_reg_rtx (tmode);
35044
35045 if (VECTOR_MODE_P (mode0))
35046 op0 = safe_vector_operand (op0, mode0);
35047
35048 if ((optimize && !register_operand (op0, mode0))
35049 || !insn_data[icode].operand[1].predicate (op0, mode0))
35050 op0 = copy_to_mode_reg (mode0, op0);
35051
35052 op1 = op0;
35053 if (!insn_data[icode].operand[2].predicate (op1, mode0))
35054 op1 = copy_to_mode_reg (mode0, op1);
35055
35056 pat = GEN_FCN (icode) (target, op0, op1);
35057 if (! pat)
35058 return 0;
35059 emit_insn (pat);
35060 return target;
35061 }
35062
35063 /* Subroutine of ix86_expand_builtin to take care of comparison insns. */
35064
35065 static rtx
35066 ix86_expand_sse_compare (const struct builtin_description *d,
35067 tree exp, rtx target, bool swap)
35068 {
35069 rtx pat;
35070 tree arg0 = CALL_EXPR_ARG (exp, 0);
35071 tree arg1 = CALL_EXPR_ARG (exp, 1);
35072 rtx op0 = expand_normal (arg0);
35073 rtx op1 = expand_normal (arg1);
35074 rtx op2;
35075 machine_mode tmode = insn_data[d->icode].operand[0].mode;
35076 machine_mode mode0 = insn_data[d->icode].operand[1].mode;
35077 machine_mode mode1 = insn_data[d->icode].operand[2].mode;
35078 enum rtx_code comparison = d->comparison;
35079
35080 if (VECTOR_MODE_P (mode0))
35081 op0 = safe_vector_operand (op0, mode0);
35082 if (VECTOR_MODE_P (mode1))
35083 op1 = safe_vector_operand (op1, mode1);
35084
35085 /* Swap operands if we have a comparison that isn't available in
35086 hardware. */
35087 if (swap)
35088 std::swap (op0, op1);
35089
35090 if (optimize || !target
35091 || GET_MODE (target) != tmode
35092 || !insn_data[d->icode].operand[0].predicate (target, tmode))
35093 target = gen_reg_rtx (tmode);
35094
35095 if ((optimize && !register_operand (op0, mode0))
35096 || !insn_data[d->icode].operand[1].predicate (op0, mode0))
35097 op0 = copy_to_mode_reg (mode0, op0);
35098 if ((optimize && !register_operand (op1, mode1))
35099 || !insn_data[d->icode].operand[2].predicate (op1, mode1))
35100 op1 = copy_to_mode_reg (mode1, op1);
35101
35102 op2 = gen_rtx_fmt_ee (comparison, mode0, op0, op1);
35103 pat = GEN_FCN (d->icode) (target, op0, op1, op2);
35104 if (! pat)
35105 return 0;
35106 emit_insn (pat);
35107 return target;
35108 }
35109
35110 /* Subroutine of ix86_expand_builtin to take care of comi insns. */
35111
35112 static rtx
35113 ix86_expand_sse_comi (const struct builtin_description *d, tree exp,
35114 rtx target)
35115 {
35116 rtx pat;
35117 tree arg0 = CALL_EXPR_ARG (exp, 0);
35118 tree arg1 = CALL_EXPR_ARG (exp, 1);
35119 rtx op0 = expand_normal (arg0);
35120 rtx op1 = expand_normal (arg1);
35121 machine_mode mode0 = insn_data[d->icode].operand[0].mode;
35122 machine_mode mode1 = insn_data[d->icode].operand[1].mode;
35123 enum rtx_code comparison = d->comparison;
35124
35125 if (VECTOR_MODE_P (mode0))
35126 op0 = safe_vector_operand (op0, mode0);
35127 if (VECTOR_MODE_P (mode1))
35128 op1 = safe_vector_operand (op1, mode1);
35129
35130 /* Swap operands if we have a comparison that isn't available in
35131 hardware. */
35132 if (d->flag & BUILTIN_DESC_SWAP_OPERANDS)
35133 std::swap (op0, op1);
35134
35135 target = gen_reg_rtx (SImode);
35136 emit_move_insn (target, const0_rtx);
35137 target = gen_rtx_SUBREG (QImode, target, 0);
35138
35139 if ((optimize && !register_operand (op0, mode0))
35140 || !insn_data[d->icode].operand[0].predicate (op0, mode0))
35141 op0 = copy_to_mode_reg (mode0, op0);
35142 if ((optimize && !register_operand (op1, mode1))
35143 || !insn_data[d->icode].operand[1].predicate (op1, mode1))
35144 op1 = copy_to_mode_reg (mode1, op1);
35145
35146 pat = GEN_FCN (d->icode) (op0, op1);
35147 if (! pat)
35148 return 0;
35149 emit_insn (pat);
35150 emit_insn (gen_rtx_SET (gen_rtx_STRICT_LOW_PART (VOIDmode, target),
35151 gen_rtx_fmt_ee (comparison, QImode,
35152 SET_DEST (pat),
35153 const0_rtx)));
35154
35155 return SUBREG_REG (target);
35156 }
35157
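/* A rough sketch of the RTL sequence built above (illustrative):

     (set (reg:SI tmp) (const_int 0))
     <comi pattern setting the flags register>
     (set (strict_low_part (subreg:QI (reg:SI tmp) 0))
          (comparison (reg flags) (const_int 0)))

   so the comparison result is returned as a 0/1 value in the low byte
   of an SImode pseudo.  */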
35158 /* Subroutines of ix86_expand_args_builtin to take care of round insns. */
35159
35160 static rtx
35161 ix86_expand_sse_round (const struct builtin_description *d, tree exp,
35162 rtx target)
35163 {
35164 rtx pat;
35165 tree arg0 = CALL_EXPR_ARG (exp, 0);
35166 rtx op1, op0 = expand_normal (arg0);
35167 machine_mode tmode = insn_data[d->icode].operand[0].mode;
35168 machine_mode mode0 = insn_data[d->icode].operand[1].mode;
35169
35170 if (optimize || target == 0
35171 || GET_MODE (target) != tmode
35172 || !insn_data[d->icode].operand[0].predicate (target, tmode))
35173 target = gen_reg_rtx (tmode);
35174
35175 if (VECTOR_MODE_P (mode0))
35176 op0 = safe_vector_operand (op0, mode0);
35177
35178 if ((optimize && !register_operand (op0, mode0))
35179 || !insn_data[d->icode].operand[0].predicate (op0, mode0))
35180 op0 = copy_to_mode_reg (mode0, op0);
35181
35182 op1 = GEN_INT (d->comparison);
35183
35184 pat = GEN_FCN (d->icode) (target, op0, op1);
35185 if (! pat)
35186 return 0;
35187 emit_insn (pat);
35188 return target;
35189 }
35190
35191 static rtx
35192 ix86_expand_sse_round_vec_pack_sfix (const struct builtin_description *d,
35193 tree exp, rtx target)
35194 {
35195 rtx pat;
35196 tree arg0 = CALL_EXPR_ARG (exp, 0);
35197 tree arg1 = CALL_EXPR_ARG (exp, 1);
35198 rtx op0 = expand_normal (arg0);
35199 rtx op1 = expand_normal (arg1);
35200 rtx op2;
35201 machine_mode tmode = insn_data[d->icode].operand[0].mode;
35202 machine_mode mode0 = insn_data[d->icode].operand[1].mode;
35203 machine_mode mode1 = insn_data[d->icode].operand[2].mode;
35204
35205 if (optimize || target == 0
35206 || GET_MODE (target) != tmode
35207 || !insn_data[d->icode].operand[0].predicate (target, tmode))
35208 target = gen_reg_rtx (tmode);
35209
35210 op0 = safe_vector_operand (op0, mode0);
35211 op1 = safe_vector_operand (op1, mode1);
35212
35213 if ((optimize && !register_operand (op0, mode0))
35214 || !insn_data[d->icode].operand[0].predicate (op0, mode0))
35215 op0 = copy_to_mode_reg (mode0, op0);
35216 if ((optimize && !register_operand (op1, mode1))
35217 || !insn_data[d->icode].operand[1].predicate (op1, mode1))
35218 op1 = copy_to_mode_reg (mode1, op1);
35219
35220 op2 = GEN_INT (d->comparison);
35221
35222 pat = GEN_FCN (d->icode) (target, op0, op1, op2);
35223 if (! pat)
35224 return 0;
35225 emit_insn (pat);
35226 return target;
35227 }
35228
35229 /* Subroutine of ix86_expand_builtin to take care of ptest insns. */
35230
35231 static rtx
35232 ix86_expand_sse_ptest (const struct builtin_description *d, tree exp,
35233 rtx target)
35234 {
35235 rtx pat;
35236 tree arg0 = CALL_EXPR_ARG (exp, 0);
35237 tree arg1 = CALL_EXPR_ARG (exp, 1);
35238 rtx op0 = expand_normal (arg0);
35239 rtx op1 = expand_normal (arg1);
35240 machine_mode mode0 = insn_data[d->icode].operand[0].mode;
35241 machine_mode mode1 = insn_data[d->icode].operand[1].mode;
35242 enum rtx_code comparison = d->comparison;
35243
35244 if (VECTOR_MODE_P (mode0))
35245 op0 = safe_vector_operand (op0, mode0);
35246 if (VECTOR_MODE_P (mode1))
35247 op1 = safe_vector_operand (op1, mode1);
35248
35249 target = gen_reg_rtx (SImode);
35250 emit_move_insn (target, const0_rtx);
35251 target = gen_rtx_SUBREG (QImode, target, 0);
35252
35253 if ((optimize && !register_operand (op0, mode0))
35254 || !insn_data[d->icode].operand[0].predicate (op0, mode0))
35255 op0 = copy_to_mode_reg (mode0, op0);
35256 if ((optimize && !register_operand (op1, mode1))
35257 || !insn_data[d->icode].operand[1].predicate (op1, mode1))
35258 op1 = copy_to_mode_reg (mode1, op1);
35259
35260 pat = GEN_FCN (d->icode) (op0, op1);
35261 if (! pat)
35262 return 0;
35263 emit_insn (pat);
35264 emit_insn (gen_rtx_SET (gen_rtx_STRICT_LOW_PART (VOIDmode, target),
35265 gen_rtx_fmt_ee (comparison, QImode,
35266 SET_DEST (pat),
35267 const0_rtx)));
35268
35269 return SUBREG_REG (target);
35270 }
35271
35272 /* Subroutine of ix86_expand_builtin to take care of pcmpestr[im] insns. */
35273
35274 static rtx
35275 ix86_expand_sse_pcmpestr (const struct builtin_description *d,
35276 tree exp, rtx target)
35277 {
35278 rtx pat;
35279 tree arg0 = CALL_EXPR_ARG (exp, 0);
35280 tree arg1 = CALL_EXPR_ARG (exp, 1);
35281 tree arg2 = CALL_EXPR_ARG (exp, 2);
35282 tree arg3 = CALL_EXPR_ARG (exp, 3);
35283 tree arg4 = CALL_EXPR_ARG (exp, 4);
35284 rtx scratch0, scratch1;
35285 rtx op0 = expand_normal (arg0);
35286 rtx op1 = expand_normal (arg1);
35287 rtx op2 = expand_normal (arg2);
35288 rtx op3 = expand_normal (arg3);
35289 rtx op4 = expand_normal (arg4);
35290 machine_mode tmode0, tmode1, modev2, modei3, modev4, modei5, modeimm;
35291
35292 tmode0 = insn_data[d->icode].operand[0].mode;
35293 tmode1 = insn_data[d->icode].operand[1].mode;
35294 modev2 = insn_data[d->icode].operand[2].mode;
35295 modei3 = insn_data[d->icode].operand[3].mode;
35296 modev4 = insn_data[d->icode].operand[4].mode;
35297 modei5 = insn_data[d->icode].operand[5].mode;
35298 modeimm = insn_data[d->icode].operand[6].mode;
35299
35300 if (VECTOR_MODE_P (modev2))
35301 op0 = safe_vector_operand (op0, modev2);
35302 if (VECTOR_MODE_P (modev4))
35303 op2 = safe_vector_operand (op2, modev4);
35304
35305 if (!insn_data[d->icode].operand[2].predicate (op0, modev2))
35306 op0 = copy_to_mode_reg (modev2, op0);
35307 if (!insn_data[d->icode].operand[3].predicate (op1, modei3))
35308 op1 = copy_to_mode_reg (modei3, op1);
35309 if ((optimize && !register_operand (op2, modev4))
35310 || !insn_data[d->icode].operand[4].predicate (op2, modev4))
35311 op2 = copy_to_mode_reg (modev4, op2);
35312 if (!insn_data[d->icode].operand[5].predicate (op3, modei5))
35313 op3 = copy_to_mode_reg (modei5, op3);
35314
35315 if (!insn_data[d->icode].operand[6].predicate (op4, modeimm))
35316 {
35317 error ("the fifth argument must be an 8-bit immediate");
35318 return const0_rtx;
35319 }
35320
35321 if (d->code == IX86_BUILTIN_PCMPESTRI128)
35322 {
35323 if (optimize || !target
35324 || GET_MODE (target) != tmode0
35325 || !insn_data[d->icode].operand[0].predicate (target, tmode0))
35326 target = gen_reg_rtx (tmode0);
35327
35328 scratch1 = gen_reg_rtx (tmode1);
35329
35330 pat = GEN_FCN (d->icode) (target, scratch1, op0, op1, op2, op3, op4);
35331 }
35332 else if (d->code == IX86_BUILTIN_PCMPESTRM128)
35333 {
35334 if (optimize || !target
35335 || GET_MODE (target) != tmode1
35336 || !insn_data[d->icode].operand[1].predicate (target, tmode1))
35337 target = gen_reg_rtx (tmode1);
35338
35339 scratch0 = gen_reg_rtx (tmode0);
35340
35341 pat = GEN_FCN (d->icode) (scratch0, target, op0, op1, op2, op3, op4);
35342 }
35343 else
35344 {
35345 gcc_assert (d->flag);
35346
35347 scratch0 = gen_reg_rtx (tmode0);
35348 scratch1 = gen_reg_rtx (tmode1);
35349
35350 pat = GEN_FCN (d->icode) (scratch0, scratch1, op0, op1, op2, op3, op4);
35351 }
35352
35353 if (! pat)
35354 return 0;
35355
35356 emit_insn (pat);
35357
35358 if (d->flag)
35359 {
35360 target = gen_reg_rtx (SImode);
35361 emit_move_insn (target, const0_rtx);
35362 target = gen_rtx_SUBREG (QImode, target, 0);
35363
35364 emit_insn
35365 (gen_rtx_SET (gen_rtx_STRICT_LOW_PART (VOIDmode, target),
35366 gen_rtx_fmt_ee (EQ, QImode,
35367 gen_rtx_REG ((machine_mode) d->flag,
35368 FLAGS_REG),
35369 const0_rtx)));
35370 return SUBREG_REG (target);
35371 }
35372 else
35373 return target;
35374 }
35375
35376
35377 /* Subroutine of ix86_expand_builtin to take care of pcmpistr[im] insns. */
35378
35379 static rtx
35380 ix86_expand_sse_pcmpistr (const struct builtin_description *d,
35381 tree exp, rtx target)
35382 {
35383 rtx pat;
35384 tree arg0 = CALL_EXPR_ARG (exp, 0);
35385 tree arg1 = CALL_EXPR_ARG (exp, 1);
35386 tree arg2 = CALL_EXPR_ARG (exp, 2);
35387 rtx scratch0, scratch1;
35388 rtx op0 = expand_normal (arg0);
35389 rtx op1 = expand_normal (arg1);
35390 rtx op2 = expand_normal (arg2);
35391 machine_mode tmode0, tmode1, modev2, modev3, modeimm;
35392
35393 tmode0 = insn_data[d->icode].operand[0].mode;
35394 tmode1 = insn_data[d->icode].operand[1].mode;
35395 modev2 = insn_data[d->icode].operand[2].mode;
35396 modev3 = insn_data[d->icode].operand[3].mode;
35397 modeimm = insn_data[d->icode].operand[4].mode;
35398
35399 if (VECTOR_MODE_P (modev2))
35400 op0 = safe_vector_operand (op0, modev2);
35401 if (VECTOR_MODE_P (modev3))
35402 op1 = safe_vector_operand (op1, modev3);
35403
35404 if (!insn_data[d->icode].operand[2].predicate (op0, modev2))
35405 op0 = copy_to_mode_reg (modev2, op0);
35406 if ((optimize && !register_operand (op1, modev3))
35407 || !insn_data[d->icode].operand[3].predicate (op1, modev3))
35408 op1 = copy_to_mode_reg (modev3, op1);
35409
35410 if (!insn_data[d->icode].operand[4].predicate (op2, modeimm))
35411 {
35412 error ("the third argument must be an 8-bit immediate");
35413 return const0_rtx;
35414 }
35415
35416 if (d->code == IX86_BUILTIN_PCMPISTRI128)
35417 {
35418 if (optimize || !target
35419 || GET_MODE (target) != tmode0
35420 || !insn_data[d->icode].operand[0].predicate (target, tmode0))
35421 target = gen_reg_rtx (tmode0);
35422
35423 scratch1 = gen_reg_rtx (tmode1);
35424
35425 pat = GEN_FCN (d->icode) (target, scratch1, op0, op1, op2);
35426 }
35427 else if (d->code == IX86_BUILTIN_PCMPISTRM128)
35428 {
35429 if (optimize || !target
35430 || GET_MODE (target) != tmode1
35431 || !insn_data[d->icode].operand[1].predicate (target, tmode1))
35432 target = gen_reg_rtx (tmode1);
35433
35434 scratch0 = gen_reg_rtx (tmode0);
35435
35436 pat = GEN_FCN (d->icode) (scratch0, target, op0, op1, op2);
35437 }
35438 else
35439 {
35440 gcc_assert (d->flag);
35441
35442 scratch0 = gen_reg_rtx (tmode0);
35443 scratch1 = gen_reg_rtx (tmode1);
35444
35445 pat = GEN_FCN (d->icode) (scratch0, scratch1, op0, op1, op2);
35446 }
35447
35448 if (! pat)
35449 return 0;
35450
35451 emit_insn (pat);
35452
35453 if (d->flag)
35454 {
35455 target = gen_reg_rtx (SImode);
35456 emit_move_insn (target, const0_rtx);
35457 target = gen_rtx_SUBREG (QImode, target, 0);
35458
35459 emit_insn
35460 (gen_rtx_SET (gen_rtx_STRICT_LOW_PART (VOIDmode, target),
35461 gen_rtx_fmt_ee (EQ, QImode,
35462 gen_rtx_REG ((machine_mode) d->flag,
35463 FLAGS_REG),
35464 const0_rtx)));
35465 return SUBREG_REG (target);
35466 }
35467 else
35468 return target;
35469 }
35470
35471 /* Subroutine of ix86_expand_builtin to take care of insns with
35472 variable number of operands. */
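/* For instance (illustrative example): __builtin_ia32_paddd256_mask
   from avx512vlintrin.h has the signature
   V8SI_FTYPE_V8SI_V8SI_V8SI_UQI handled below, i.e. a four-operand
   insn whose last two arguments are the merge source and the write
   mask.  */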
35473
35474 static rtx
35475 ix86_expand_args_builtin (const struct builtin_description *d,
35476 tree exp, rtx target)
35477 {
35478 rtx pat, real_target;
35479 unsigned int i, nargs;
35480 unsigned int nargs_constant = 0;
35481 unsigned int mask_pos = 0;
35482 int num_memory = 0;
35483 struct
35484 {
35485 rtx op;
35486 machine_mode mode;
35487 } args[6];
35488 bool second_arg_count = false;
35489 enum insn_code icode = d->icode;
35490 const struct insn_data_d *insn_p = &insn_data[icode];
35491 machine_mode tmode = insn_p->operand[0].mode;
35492 machine_mode rmode = VOIDmode;
35493 bool swap = false;
35494 enum rtx_code comparison = d->comparison;
35495
35496 switch ((enum ix86_builtin_func_type) d->flag)
35497 {
35498 case V2DF_FTYPE_V2DF_ROUND:
35499 case V4DF_FTYPE_V4DF_ROUND:
35500 case V8DF_FTYPE_V8DF_ROUND:
35501 case V4SF_FTYPE_V4SF_ROUND:
35502 case V8SF_FTYPE_V8SF_ROUND:
35503 case V16SF_FTYPE_V16SF_ROUND:
35504 case V4SI_FTYPE_V4SF_ROUND:
35505 case V8SI_FTYPE_V8SF_ROUND:
35506 case V16SI_FTYPE_V16SF_ROUND:
35507 return ix86_expand_sse_round (d, exp, target);
35508 case V4SI_FTYPE_V2DF_V2DF_ROUND:
35509 case V8SI_FTYPE_V4DF_V4DF_ROUND:
35510 case V16SI_FTYPE_V8DF_V8DF_ROUND:
35511 return ix86_expand_sse_round_vec_pack_sfix (d, exp, target);
35512 case INT_FTYPE_V8SF_V8SF_PTEST:
35513 case INT_FTYPE_V4DI_V4DI_PTEST:
35514 case INT_FTYPE_V4DF_V4DF_PTEST:
35515 case INT_FTYPE_V4SF_V4SF_PTEST:
35516 case INT_FTYPE_V2DI_V2DI_PTEST:
35517 case INT_FTYPE_V2DF_V2DF_PTEST:
35518 return ix86_expand_sse_ptest (d, exp, target);
35519 case FLOAT128_FTYPE_FLOAT128:
35520 case FLOAT_FTYPE_FLOAT:
35521 case INT_FTYPE_INT:
35522 case UINT_FTYPE_UINT:
35523 case UINT16_FTYPE_UINT16:
35524 case UINT64_FTYPE_INT:
35525 case UINT64_FTYPE_UINT64:
35526 case INT64_FTYPE_INT64:
35527 case INT64_FTYPE_V4SF:
35528 case INT64_FTYPE_V2DF:
35529 case INT_FTYPE_V16QI:
35530 case INT_FTYPE_V8QI:
35531 case INT_FTYPE_V8SF:
35532 case INT_FTYPE_V4DF:
35533 case INT_FTYPE_V4SF:
35534 case INT_FTYPE_V2DF:
35535 case INT_FTYPE_V32QI:
35536 case V16QI_FTYPE_V16QI:
35537 case V8SI_FTYPE_V8SF:
35538 case V8SI_FTYPE_V4SI:
35539 case V8HI_FTYPE_V8HI:
35540 case V8HI_FTYPE_V16QI:
35541 case V8QI_FTYPE_V8QI:
35542 case V8SF_FTYPE_V8SF:
35543 case V8SF_FTYPE_V8SI:
35544 case V8SF_FTYPE_V4SF:
35545 case V8SF_FTYPE_V8HI:
35546 case V4SI_FTYPE_V4SI:
35547 case V4SI_FTYPE_V16QI:
35548 case V4SI_FTYPE_V4SF:
35549 case V4SI_FTYPE_V8SI:
35550 case V4SI_FTYPE_V8HI:
35551 case V4SI_FTYPE_V4DF:
35552 case V4SI_FTYPE_V2DF:
35553 case V4HI_FTYPE_V4HI:
35554 case V4DF_FTYPE_V4DF:
35555 case V4DF_FTYPE_V4SI:
35556 case V4DF_FTYPE_V4SF:
35557 case V4DF_FTYPE_V2DF:
35558 case V4SF_FTYPE_V4SF:
35559 case V4SF_FTYPE_V4SI:
35560 case V4SF_FTYPE_V8SF:
35561 case V4SF_FTYPE_V4DF:
35562 case V4SF_FTYPE_V8HI:
35563 case V4SF_FTYPE_V2DF:
35564 case V2DI_FTYPE_V2DI:
35565 case V2DI_FTYPE_V16QI:
35566 case V2DI_FTYPE_V8HI:
35567 case V2DI_FTYPE_V4SI:
35568 case V2DF_FTYPE_V2DF:
35569 case V2DF_FTYPE_V4SI:
35570 case V2DF_FTYPE_V4DF:
35571 case V2DF_FTYPE_V4SF:
35572 case V2DF_FTYPE_V2SI:
35573 case V2SI_FTYPE_V2SI:
35574 case V2SI_FTYPE_V4SF:
35575 case V2SI_FTYPE_V2SF:
35576 case V2SI_FTYPE_V2DF:
35577 case V2SF_FTYPE_V2SF:
35578 case V2SF_FTYPE_V2SI:
35579 case V32QI_FTYPE_V32QI:
35580 case V32QI_FTYPE_V16QI:
35581 case V16HI_FTYPE_V16HI:
35582 case V16HI_FTYPE_V8HI:
35583 case V8SI_FTYPE_V8SI:
35584 case V16HI_FTYPE_V16QI:
35585 case V8SI_FTYPE_V16QI:
35586 case V4DI_FTYPE_V16QI:
35587 case V8SI_FTYPE_V8HI:
35588 case V4DI_FTYPE_V8HI:
35589 case V4DI_FTYPE_V4SI:
35590 case V4DI_FTYPE_V2DI:
35591 case UQI_FTYPE_UQI:
35592 case UHI_FTYPE_UHI:
35593 case USI_FTYPE_USI:
35594 case USI_FTYPE_UQI:
35595 case USI_FTYPE_UHI:
35596 case UDI_FTYPE_UDI:
35597 case UHI_FTYPE_V16QI:
35598 case USI_FTYPE_V32QI:
35599 case UDI_FTYPE_V64QI:
35600 case V16QI_FTYPE_UHI:
35601 case V32QI_FTYPE_USI:
35602 case V64QI_FTYPE_UDI:
35603 case V8HI_FTYPE_UQI:
35604 case V16HI_FTYPE_UHI:
35605 case V32HI_FTYPE_USI:
35606 case V4SI_FTYPE_UQI:
35607 case V8SI_FTYPE_UQI:
35608 case V4SI_FTYPE_UHI:
35609 case V8SI_FTYPE_UHI:
35610 case UQI_FTYPE_V8HI:
35611 case UHI_FTYPE_V16HI:
35612 case USI_FTYPE_V32HI:
35613 case UQI_FTYPE_V4SI:
35614 case UQI_FTYPE_V8SI:
35615 case UHI_FTYPE_V16SI:
35616 case UQI_FTYPE_V2DI:
35617 case UQI_FTYPE_V4DI:
35618 case UQI_FTYPE_V8DI:
35619 case V16SI_FTYPE_UHI:
35620 case V2DI_FTYPE_UQI:
35621 case V4DI_FTYPE_UQI:
35622 case V16SI_FTYPE_INT:
35623 case V16SF_FTYPE_V8SF:
35624 case V16SI_FTYPE_V8SI:
35625 case V16SF_FTYPE_V4SF:
35626 case V16SI_FTYPE_V4SI:
35627 case V16SI_FTYPE_V16SF:
35628 case V16SI_FTYPE_V16SI:
35629 case V16SF_FTYPE_V16SF:
35630 case V8DI_FTYPE_UQI:
35631 case V8DI_FTYPE_V8DI:
35632 case V8DF_FTYPE_V4DF:
35633 case V8DF_FTYPE_V2DF:
35634 case V8DF_FTYPE_V8DF:
35635 nargs = 1;
35636 break;
35637 case V4SF_FTYPE_V4SF_VEC_MERGE:
35638 case V2DF_FTYPE_V2DF_VEC_MERGE:
35639 return ix86_expand_unop_vec_merge_builtin (icode, exp, target);
35640 case FLOAT128_FTYPE_FLOAT128_FLOAT128:
35641 case V16QI_FTYPE_V16QI_V16QI:
35642 case V16QI_FTYPE_V8HI_V8HI:
35643 case V16SF_FTYPE_V16SF_V16SF:
35644 case V8QI_FTYPE_V8QI_V8QI:
35645 case V8QI_FTYPE_V4HI_V4HI:
35646 case V8HI_FTYPE_V8HI_V8HI:
35647 case V8HI_FTYPE_V16QI_V16QI:
35648 case V8HI_FTYPE_V4SI_V4SI:
35649 case V8SF_FTYPE_V8SF_V8SF:
35650 case V8SF_FTYPE_V8SF_V8SI:
35651 case V8DF_FTYPE_V8DF_V8DF:
35652 case V4SI_FTYPE_V4SI_V4SI:
35653 case V4SI_FTYPE_V8HI_V8HI:
35654 case V4SI_FTYPE_V2DF_V2DF:
35655 case V4HI_FTYPE_V4HI_V4HI:
35656 case V4HI_FTYPE_V8QI_V8QI:
35657 case V4HI_FTYPE_V2SI_V2SI:
35658 case V4DF_FTYPE_V4DF_V4DF:
35659 case V4DF_FTYPE_V4DF_V4DI:
35660 case V4SF_FTYPE_V4SF_V4SF:
35661 case V4SF_FTYPE_V4SF_V4SI:
35662 case V4SF_FTYPE_V4SF_V2SI:
35663 case V4SF_FTYPE_V4SF_V2DF:
35664 case V4SF_FTYPE_V4SF_UINT:
35665 case V4SF_FTYPE_V4SF_DI:
35666 case V4SF_FTYPE_V4SF_SI:
35667 case V2DI_FTYPE_V2DI_V2DI:
35668 case V2DI_FTYPE_V16QI_V16QI:
35669 case V2DI_FTYPE_V4SI_V4SI:
35670 case V2DI_FTYPE_V2DI_V16QI:
35671 case V2SI_FTYPE_V2SI_V2SI:
35672 case V2SI_FTYPE_V4HI_V4HI:
35673 case V2SI_FTYPE_V2SF_V2SF:
35674 case V2DF_FTYPE_V2DF_V2DF:
35675 case V2DF_FTYPE_V2DF_V4SF:
35676 case V2DF_FTYPE_V2DF_V2DI:
35677 case V2DF_FTYPE_V2DF_DI:
35678 case V2DF_FTYPE_V2DF_SI:
35679 case V2DF_FTYPE_V2DF_UINT:
35680 case V2SF_FTYPE_V2SF_V2SF:
35681 case V1DI_FTYPE_V1DI_V1DI:
35682 case V1DI_FTYPE_V8QI_V8QI:
35683 case V1DI_FTYPE_V2SI_V2SI:
35684 case V32QI_FTYPE_V16HI_V16HI:
35685 case V16HI_FTYPE_V8SI_V8SI:
35686 case V32QI_FTYPE_V32QI_V32QI:
35687 case V16HI_FTYPE_V32QI_V32QI:
35688 case V16HI_FTYPE_V16HI_V16HI:
35689 case V8SI_FTYPE_V4DF_V4DF:
35690 case V8SI_FTYPE_V8SI_V8SI:
35691 case V8SI_FTYPE_V16HI_V16HI:
35692 case V4DI_FTYPE_V4DI_V4DI:
35693 case V4DI_FTYPE_V8SI_V8SI:
35694 case V8DI_FTYPE_V64QI_V64QI:
35695 if (comparison == UNKNOWN)
35696 return ix86_expand_binop_builtin (icode, exp, target);
35697 nargs = 2;
35698 break;
35699 case V4SF_FTYPE_V4SF_V4SF_SWAP:
35700 case V2DF_FTYPE_V2DF_V2DF_SWAP:
35701 gcc_assert (comparison != UNKNOWN);
35702 nargs = 2;
35703 swap = true;
35704 break;
35705 case V16HI_FTYPE_V16HI_V8HI_COUNT:
35706 case V16HI_FTYPE_V16HI_SI_COUNT:
35707 case V8SI_FTYPE_V8SI_V4SI_COUNT:
35708 case V8SI_FTYPE_V8SI_SI_COUNT:
35709 case V4DI_FTYPE_V4DI_V2DI_COUNT:
35710 case V4DI_FTYPE_V4DI_INT_COUNT:
35711 case V8HI_FTYPE_V8HI_V8HI_COUNT:
35712 case V8HI_FTYPE_V8HI_SI_COUNT:
35713 case V4SI_FTYPE_V4SI_V4SI_COUNT:
35714 case V4SI_FTYPE_V4SI_SI_COUNT:
35715 case V4HI_FTYPE_V4HI_V4HI_COUNT:
35716 case V4HI_FTYPE_V4HI_SI_COUNT:
35717 case V2DI_FTYPE_V2DI_V2DI_COUNT:
35718 case V2DI_FTYPE_V2DI_SI_COUNT:
35719 case V2SI_FTYPE_V2SI_V2SI_COUNT:
35720 case V2SI_FTYPE_V2SI_SI_COUNT:
35721 case V1DI_FTYPE_V1DI_V1DI_COUNT:
35722 case V1DI_FTYPE_V1DI_SI_COUNT:
35723 nargs = 2;
35724 second_arg_count = true;
35725 break;
35726 case V16HI_FTYPE_V16HI_INT_V16HI_UHI_COUNT:
35727 case V16HI_FTYPE_V16HI_V8HI_V16HI_UHI_COUNT:
35728 case V16SI_FTYPE_V16SI_INT_V16SI_UHI_COUNT:
35729 case V16SI_FTYPE_V16SI_V4SI_V16SI_UHI_COUNT:
35730 case V2DI_FTYPE_V2DI_INT_V2DI_UQI_COUNT:
35731 case V2DI_FTYPE_V2DI_V2DI_V2DI_UQI_COUNT:
35732 case V32HI_FTYPE_V32HI_INT_V32HI_USI_COUNT:
35733 case V32HI_FTYPE_V32HI_V8HI_V32HI_USI_COUNT:
35734 case V4DI_FTYPE_V4DI_INT_V4DI_UQI_COUNT:
35735 case V4DI_FTYPE_V4DI_V2DI_V4DI_UQI_COUNT:
35736 case V4SI_FTYPE_V4SI_INT_V4SI_UQI_COUNT:
35737 case V4SI_FTYPE_V4SI_V4SI_V4SI_UQI_COUNT:
35738 case V8DI_FTYPE_V8DI_INT_V8DI_UQI_COUNT:
35739 case V8DI_FTYPE_V8DI_V2DI_V8DI_UQI_COUNT:
35740 case V8HI_FTYPE_V8HI_INT_V8HI_UQI_COUNT:
35741 case V8HI_FTYPE_V8HI_V8HI_V8HI_UQI_COUNT:
35742 case V8SI_FTYPE_V8SI_INT_V8SI_UQI_COUNT:
35743 case V8SI_FTYPE_V8SI_V4SI_V8SI_UQI_COUNT:
35744 nargs = 4;
35745 second_arg_count = true;
35746 break;
35747 case UINT64_FTYPE_UINT64_UINT64:
35748 case UINT_FTYPE_UINT_UINT:
35749 case UINT_FTYPE_UINT_USHORT:
35750 case UINT_FTYPE_UINT_UCHAR:
35751 case UINT16_FTYPE_UINT16_INT:
35752 case UINT8_FTYPE_UINT8_INT:
35753 case UQI_FTYPE_UQI_UQI:
35754 case UHI_FTYPE_UHI_UHI:
35755 case USI_FTYPE_USI_USI:
35756 case UDI_FTYPE_UDI_UDI:
35757 case V16SI_FTYPE_V8DF_V8DF:
35758 nargs = 2;
35759 break;
35760 case V2DI_FTYPE_V2DI_INT_CONVERT:
35761 nargs = 2;
35762 rmode = V1TImode;
35763 nargs_constant = 1;
35764 break;
35765 case V4DI_FTYPE_V4DI_INT_CONVERT:
35766 nargs = 2;
35767 rmode = V2TImode;
35768 nargs_constant = 1;
35769 break;
35770 case V8DI_FTYPE_V8DI_INT_CONVERT:
35771 nargs = 2;
35772 rmode = V4TImode;
35773 nargs_constant = 1;
35774 break;
35775 case V8HI_FTYPE_V8HI_INT:
35776 case V8HI_FTYPE_V8SF_INT:
35777 case V16HI_FTYPE_V16SF_INT:
35778 case V8HI_FTYPE_V4SF_INT:
35779 case V8SF_FTYPE_V8SF_INT:
35780 case V4SF_FTYPE_V16SF_INT:
35781 case V16SF_FTYPE_V16SF_INT:
35782 case V4SI_FTYPE_V4SI_INT:
35783 case V4SI_FTYPE_V8SI_INT:
35784 case V4HI_FTYPE_V4HI_INT:
35785 case V4DF_FTYPE_V4DF_INT:
35786 case V4DF_FTYPE_V8DF_INT:
35787 case V4SF_FTYPE_V4SF_INT:
35788 case V4SF_FTYPE_V8SF_INT:
35789 case V2DI_FTYPE_V2DI_INT:
35790 case V2DF_FTYPE_V2DF_INT:
35791 case V2DF_FTYPE_V4DF_INT:
35792 case V16HI_FTYPE_V16HI_INT:
35793 case V8SI_FTYPE_V8SI_INT:
35794 case V16SI_FTYPE_V16SI_INT:
35795 case V4SI_FTYPE_V16SI_INT:
35796 case V4DI_FTYPE_V4DI_INT:
35797 case V2DI_FTYPE_V4DI_INT:
35798 case V4DI_FTYPE_V8DI_INT:
35799 case QI_FTYPE_V4SF_INT:
35800 case QI_FTYPE_V2DF_INT:
35801 case UQI_FTYPE_UQI_UQI_CONST:
35802 case UHI_FTYPE_UHI_UQI:
35803 case USI_FTYPE_USI_UQI:
35804 case UDI_FTYPE_UDI_UQI:
35805 nargs = 2;
35806 nargs_constant = 1;
35807 break;
35808 case V16QI_FTYPE_V16QI_V16QI_V16QI:
35809 case V8SF_FTYPE_V8SF_V8SF_V8SF:
35810 case V4DF_FTYPE_V4DF_V4DF_V4DF:
35811 case V4SF_FTYPE_V4SF_V4SF_V4SF:
35812 case V2DF_FTYPE_V2DF_V2DF_V2DF:
35813 case V32QI_FTYPE_V32QI_V32QI_V32QI:
35814 case UHI_FTYPE_V16SI_V16SI_UHI:
35815 case UQI_FTYPE_V8DI_V8DI_UQI:
35816 case V16HI_FTYPE_V16SI_V16HI_UHI:
35817 case V16QI_FTYPE_V16SI_V16QI_UHI:
35818 case V16QI_FTYPE_V8DI_V16QI_UQI:
35819 case V16SF_FTYPE_V16SF_V16SF_UHI:
35820 case V16SF_FTYPE_V4SF_V16SF_UHI:
35821 case V16SI_FTYPE_SI_V16SI_UHI:
35822 case V16SI_FTYPE_V16HI_V16SI_UHI:
35823 case V16SI_FTYPE_V16QI_V16SI_UHI:
35824 case V8SF_FTYPE_V4SF_V8SF_UQI:
35825 case V4DF_FTYPE_V2DF_V4DF_UQI:
35826 case V8SI_FTYPE_V4SI_V8SI_UQI:
35827 case V8SI_FTYPE_SI_V8SI_UQI:
35828 case V4SI_FTYPE_V4SI_V4SI_UQI:
35829 case V4SI_FTYPE_SI_V4SI_UQI:
35830 case V4DI_FTYPE_V2DI_V4DI_UQI:
35831 case V4DI_FTYPE_DI_V4DI_UQI:
35832 case V2DI_FTYPE_V2DI_V2DI_UQI:
35833 case V2DI_FTYPE_DI_V2DI_UQI:
35834 case V64QI_FTYPE_V64QI_V64QI_UDI:
35835 case V64QI_FTYPE_V16QI_V64QI_UDI:
35836 case V64QI_FTYPE_QI_V64QI_UDI:
35837 case V32QI_FTYPE_V32QI_V32QI_USI:
35838 case V32QI_FTYPE_V16QI_V32QI_USI:
35839 case V32QI_FTYPE_QI_V32QI_USI:
35840 case V16QI_FTYPE_V16QI_V16QI_UHI:
35841 case V16QI_FTYPE_QI_V16QI_UHI:
35842 case V32HI_FTYPE_V8HI_V32HI_USI:
35843 case V32HI_FTYPE_HI_V32HI_USI:
35844 case V16HI_FTYPE_V8HI_V16HI_UHI:
35845 case V16HI_FTYPE_HI_V16HI_UHI:
35846 case V8HI_FTYPE_V8HI_V8HI_UQI:
35847 case V8HI_FTYPE_HI_V8HI_UQI:
35848 case V8SF_FTYPE_V8HI_V8SF_UQI:
35849 case V4SF_FTYPE_V8HI_V4SF_UQI:
35850 case V8SI_FTYPE_V8SF_V8SI_UQI:
35851 case V4SI_FTYPE_V4SF_V4SI_UQI:
35852 case V4DI_FTYPE_V4SF_V4DI_UQI:
35853 case V2DI_FTYPE_V4SF_V2DI_UQI:
35854 case V4SF_FTYPE_V4DI_V4SF_UQI:
35855 case V4SF_FTYPE_V2DI_V4SF_UQI:
35856 case V4DF_FTYPE_V4DI_V4DF_UQI:
35857 case V2DF_FTYPE_V2DI_V2DF_UQI:
35858 case V16QI_FTYPE_V8HI_V16QI_UQI:
35859 case V16QI_FTYPE_V16HI_V16QI_UHI:
35860 case V16QI_FTYPE_V4SI_V16QI_UQI:
35861 case V16QI_FTYPE_V8SI_V16QI_UQI:
35862 case V8HI_FTYPE_V4SI_V8HI_UQI:
35863 case V8HI_FTYPE_V8SI_V8HI_UQI:
35864 case V16QI_FTYPE_V2DI_V16QI_UQI:
35865 case V16QI_FTYPE_V4DI_V16QI_UQI:
35866 case V8HI_FTYPE_V2DI_V8HI_UQI:
35867 case V8HI_FTYPE_V4DI_V8HI_UQI:
35868 case V4SI_FTYPE_V2DI_V4SI_UQI:
35869 case V4SI_FTYPE_V4DI_V4SI_UQI:
35870 case V32QI_FTYPE_V32HI_V32QI_USI:
35871 case UHI_FTYPE_V16QI_V16QI_UHI:
35872 case USI_FTYPE_V32QI_V32QI_USI:
35873 case UDI_FTYPE_V64QI_V64QI_UDI:
35874 case UQI_FTYPE_V8HI_V8HI_UQI:
35875 case UHI_FTYPE_V16HI_V16HI_UHI:
35876 case USI_FTYPE_V32HI_V32HI_USI:
35877 case UQI_FTYPE_V4SI_V4SI_UQI:
35878 case UQI_FTYPE_V8SI_V8SI_UQI:
35879 case UQI_FTYPE_V2DI_V2DI_UQI:
35880 case UQI_FTYPE_V4DI_V4DI_UQI:
35881 case V4SF_FTYPE_V2DF_V4SF_UQI:
35882 case V4SF_FTYPE_V4DF_V4SF_UQI:
35883 case V16SI_FTYPE_V16SI_V16SI_UHI:
35884 case V16SI_FTYPE_V4SI_V16SI_UHI:
35885 case V2DI_FTYPE_V4SI_V2DI_UQI:
35886 case V2DI_FTYPE_V8HI_V2DI_UQI:
35887 case V2DI_FTYPE_V16QI_V2DI_UQI:
35888 case V4DI_FTYPE_V4DI_V4DI_UQI:
35889 case V4DI_FTYPE_V4SI_V4DI_UQI:
35890 case V4DI_FTYPE_V8HI_V4DI_UQI:
35891 case V4DI_FTYPE_V16QI_V4DI_UQI:
35892 case V4DI_FTYPE_V4DF_V4DI_UQI:
35893 case V2DI_FTYPE_V2DF_V2DI_UQI:
35894 case V4SI_FTYPE_V4DF_V4SI_UQI:
35895 case V4SI_FTYPE_V2DF_V4SI_UQI:
35896 case V4SI_FTYPE_V8HI_V4SI_UQI:
35897 case V4SI_FTYPE_V16QI_V4SI_UQI:
35898 case V4DI_FTYPE_V4DI_V4DI_V4DI:
35899 case V8DF_FTYPE_V2DF_V8DF_UQI:
35900 case V8DF_FTYPE_V4DF_V8DF_UQI:
35901 case V8DF_FTYPE_V8DF_V8DF_UQI:
35902 case V8SF_FTYPE_V8SF_V8SF_UQI:
35903 case V8SF_FTYPE_V8SI_V8SF_UQI:
35904 case V4DF_FTYPE_V4DF_V4DF_UQI:
35905 case V4SF_FTYPE_V4SF_V4SF_UQI:
35906 case V2DF_FTYPE_V2DF_V2DF_UQI:
35907 case V2DF_FTYPE_V4SF_V2DF_UQI:
35908 case V2DF_FTYPE_V4SI_V2DF_UQI:
35909 case V4SF_FTYPE_V4SI_V4SF_UQI:
35910 case V4DF_FTYPE_V4SF_V4DF_UQI:
35911 case V4DF_FTYPE_V4SI_V4DF_UQI:
35912 case V8SI_FTYPE_V8SI_V8SI_UQI:
35913 case V8SI_FTYPE_V8HI_V8SI_UQI:
35914 case V8SI_FTYPE_V16QI_V8SI_UQI:
35915 case V8DF_FTYPE_V8SI_V8DF_UQI:
35916 case V8DI_FTYPE_DI_V8DI_UQI:
35917 case V16SF_FTYPE_V8SF_V16SF_UHI:
35918 case V16SI_FTYPE_V8SI_V16SI_UHI:
35919 case V16HI_FTYPE_V16HI_V16HI_UHI:
35920 case V8HI_FTYPE_V16QI_V8HI_UQI:
35921 case V16HI_FTYPE_V16QI_V16HI_UHI:
35922 case V32HI_FTYPE_V32HI_V32HI_USI:
35923 case V32HI_FTYPE_V32QI_V32HI_USI:
35924 case V8DI_FTYPE_V16QI_V8DI_UQI:
35925 case V8DI_FTYPE_V2DI_V8DI_UQI:
35926 case V8DI_FTYPE_V4DI_V8DI_UQI:
35927 case V8DI_FTYPE_V8DI_V8DI_UQI:
35928 case V8DI_FTYPE_V8HI_V8DI_UQI:
35929 case V8DI_FTYPE_V8SI_V8DI_UQI:
35930 case V8HI_FTYPE_V8DI_V8HI_UQI:
35931 case V8SI_FTYPE_V8DI_V8SI_UQI:
35932 case V4SI_FTYPE_V4SI_V4SI_V4SI:
35933 nargs = 3;
35934 break;
35935 case V32QI_FTYPE_V32QI_V32QI_INT:
35936 case V16HI_FTYPE_V16HI_V16HI_INT:
35937 case V16QI_FTYPE_V16QI_V16QI_INT:
35938 case V4DI_FTYPE_V4DI_V4DI_INT:
35939 case V8HI_FTYPE_V8HI_V8HI_INT:
35940 case V8SI_FTYPE_V8SI_V8SI_INT:
35941 case V8SI_FTYPE_V8SI_V4SI_INT:
35942 case V8SF_FTYPE_V8SF_V8SF_INT:
35943 case V8SF_FTYPE_V8SF_V4SF_INT:
35944 case V4SI_FTYPE_V4SI_V4SI_INT:
35945 case V4DF_FTYPE_V4DF_V4DF_INT:
35946 case V16SF_FTYPE_V16SF_V16SF_INT:
35947 case V16SF_FTYPE_V16SF_V4SF_INT:
35948 case V16SI_FTYPE_V16SI_V4SI_INT:
35949 case V4DF_FTYPE_V4DF_V2DF_INT:
35950 case V4SF_FTYPE_V4SF_V4SF_INT:
35951 case V2DI_FTYPE_V2DI_V2DI_INT:
35952 case V4DI_FTYPE_V4DI_V2DI_INT:
35953 case V2DF_FTYPE_V2DF_V2DF_INT:
35954 case UQI_FTYPE_V8DI_V8UDI_INT:
35955 case UQI_FTYPE_V8DF_V8DF_INT:
35956 case UQI_FTYPE_V2DF_V2DF_INT:
35957 case UQI_FTYPE_V4SF_V4SF_INT:
35958 case UHI_FTYPE_V16SI_V16SI_INT:
35959 case UHI_FTYPE_V16SF_V16SF_INT:
35960 nargs = 3;
35961 nargs_constant = 1;
35962 break;
35963 case V4DI_FTYPE_V4DI_V4DI_INT_CONVERT:
35964 nargs = 3;
35965 rmode = V4DImode;
35966 nargs_constant = 1;
35967 break;
35968 case V2DI_FTYPE_V2DI_V2DI_INT_CONVERT:
35969 nargs = 3;
35970 rmode = V2DImode;
35971 nargs_constant = 1;
35972 break;
35973 case V1DI_FTYPE_V1DI_V1DI_INT_CONVERT:
35974 nargs = 3;
35975 rmode = DImode;
35976 nargs_constant = 1;
35977 break;
35978 case V2DI_FTYPE_V2DI_UINT_UINT:
35979 nargs = 3;
35980 nargs_constant = 2;
35981 break;
35982 case V8DI_FTYPE_V8DI_V8DI_INT_CONVERT:
35983 nargs = 3;
35984 rmode = V8DImode;
35985 nargs_constant = 1;
35986 break;
35987 case V8DI_FTYPE_V8DI_V8DI_INT_V8DI_UDI_CONVERT:
35988 nargs = 5;
35989 rmode = V8DImode;
35990 mask_pos = 2;
35991 nargs_constant = 1;
35992 break;
35993 case QI_FTYPE_V8DF_INT_UQI:
35994 case QI_FTYPE_V4DF_INT_UQI:
35995 case QI_FTYPE_V2DF_INT_UQI:
35996 case HI_FTYPE_V16SF_INT_UHI:
35997 case QI_FTYPE_V8SF_INT_UQI:
35998 case QI_FTYPE_V4SF_INT_UQI:
35999 nargs = 3;
36000 mask_pos = 1;
36001 nargs_constant = 1;
36002 break;
36003 case V4DI_FTYPE_V4DI_V4DI_INT_V4DI_USI_CONVERT:
36004 nargs = 5;
36005 rmode = V4DImode;
36006 mask_pos = 2;
36007 nargs_constant = 1;
36008 break;
36009 case V2DI_FTYPE_V2DI_V2DI_INT_V2DI_UHI_CONVERT:
36010 nargs = 5;
36011 rmode = V2DImode;
36012 mask_pos = 2;
36013 nargs_constant = 1;
36014 break;
36015 case V32QI_FTYPE_V32QI_V32QI_V32QI_USI:
36016 case V32HI_FTYPE_V32HI_V32HI_V32HI_USI:
36017 case V32HI_FTYPE_V64QI_V64QI_V32HI_USI:
36018 case V16SI_FTYPE_V32HI_V32HI_V16SI_UHI:
36019 case V64QI_FTYPE_V64QI_V64QI_V64QI_UDI:
36020 case V32HI_FTYPE_V32HI_V8HI_V32HI_USI:
36021 case V16HI_FTYPE_V16HI_V8HI_V16HI_UHI:
36022 case V8SI_FTYPE_V8SI_V4SI_V8SI_UQI:
36023 case V4DI_FTYPE_V4DI_V2DI_V4DI_UQI:
36024 case V64QI_FTYPE_V32HI_V32HI_V64QI_UDI:
36025 case V32QI_FTYPE_V16HI_V16HI_V32QI_USI:
36026 case V16QI_FTYPE_V8HI_V8HI_V16QI_UHI:
36027 case V32HI_FTYPE_V16SI_V16SI_V32HI_USI:
36028 case V16HI_FTYPE_V8SI_V8SI_V16HI_UHI:
36029 case V8HI_FTYPE_V4SI_V4SI_V8HI_UQI:
36030 case V4DF_FTYPE_V4DF_V4DI_V4DF_UQI:
36031 case V8SF_FTYPE_V8SF_V8SI_V8SF_UQI:
36032 case V4SF_FTYPE_V4SF_V4SI_V4SF_UQI:
36033 case V2DF_FTYPE_V2DF_V2DI_V2DF_UQI:
36034 case V2DI_FTYPE_V4SI_V4SI_V2DI_UQI:
36035 case V4DI_FTYPE_V8SI_V8SI_V4DI_UQI:
36036 case V4DF_FTYPE_V4DI_V4DF_V4DF_UQI:
36037 case V8SF_FTYPE_V8SI_V8SF_V8SF_UQI:
36038 case V2DF_FTYPE_V2DI_V2DF_V2DF_UQI:
36039 case V4SF_FTYPE_V4SI_V4SF_V4SF_UQI:
36040 case V16SF_FTYPE_V16SF_V16SF_V16SF_UHI:
36041 case V16SF_FTYPE_V16SF_V16SI_V16SF_UHI:
36042 case V16SF_FTYPE_V16SI_V16SF_V16SF_UHI:
36043 case V16SI_FTYPE_V16SI_V16SI_V16SI_UHI:
36044 case V16SI_FTYPE_V16SI_V4SI_V16SI_UHI:
36045 case V8HI_FTYPE_V8HI_V8HI_V8HI_UQI:
36046 case V8SI_FTYPE_V8SI_V8SI_V8SI_UQI:
36047 case V4SI_FTYPE_V4SI_V4SI_V4SI_UQI:
36048 case V8SF_FTYPE_V8SF_V8SF_V8SF_UQI:
36049 case V16QI_FTYPE_V16QI_V16QI_V16QI_UHI:
36050 case V16HI_FTYPE_V16HI_V16HI_V16HI_UHI:
36051 case V2DI_FTYPE_V2DI_V2DI_V2DI_UQI:
36052 case V2DF_FTYPE_V2DF_V2DF_V2DF_UQI:
36053 case V4DI_FTYPE_V4DI_V4DI_V4DI_UQI:
36054 case V4DF_FTYPE_V4DF_V4DF_V4DF_UQI:
36055 case V4SF_FTYPE_V4SF_V4SF_V4SF_UQI:
36056 case V8DF_FTYPE_V8DF_V8DF_V8DF_UQI:
36057 case V8DF_FTYPE_V8DF_V8DI_V8DF_UQI:
36058 case V8DF_FTYPE_V8DI_V8DF_V8DF_UQI:
36059 case V8DI_FTYPE_V16SI_V16SI_V8DI_UQI:
36060 case V8DI_FTYPE_V8DI_V2DI_V8DI_UQI:
36061 case V8DI_FTYPE_V8DI_V8DI_V8DI_UQI:
36062 case V8HI_FTYPE_V16QI_V16QI_V8HI_UQI:
36063 case V16HI_FTYPE_V32QI_V32QI_V16HI_UHI:
36064 case V8SI_FTYPE_V16HI_V16HI_V8SI_UQI:
36065 case V4SI_FTYPE_V8HI_V8HI_V4SI_UQI:
36066 nargs = 4;
36067 break;
36068 case V2DF_FTYPE_V2DF_V2DF_V2DI_INT:
36069 case V4DF_FTYPE_V4DF_V4DF_V4DI_INT:
36070 case V4SF_FTYPE_V4SF_V4SF_V4SI_INT:
36071 case V8SF_FTYPE_V8SF_V8SF_V8SI_INT:
36072 case V16SF_FTYPE_V16SF_V16SF_V16SI_INT:
36073 nargs = 4;
36074 nargs_constant = 1;
36075 break;
36076 case UQI_FTYPE_V4DI_V4DI_INT_UQI:
36077 case UQI_FTYPE_V8SI_V8SI_INT_UQI:
36078 case QI_FTYPE_V4DF_V4DF_INT_UQI:
36079 case QI_FTYPE_V8SF_V8SF_INT_UQI:
36080 case UQI_FTYPE_V2DI_V2DI_INT_UQI:
36081 case UQI_FTYPE_V4SI_V4SI_INT_UQI:
36082 case UQI_FTYPE_V2DF_V2DF_INT_UQI:
36083 case UQI_FTYPE_V4SF_V4SF_INT_UQI:
36084 case UDI_FTYPE_V64QI_V64QI_INT_UDI:
36085 case USI_FTYPE_V32QI_V32QI_INT_USI:
36086 case UHI_FTYPE_V16QI_V16QI_INT_UHI:
36087 case USI_FTYPE_V32HI_V32HI_INT_USI:
36088 case UHI_FTYPE_V16HI_V16HI_INT_UHI:
36089 case UQI_FTYPE_V8HI_V8HI_INT_UQI:
36090 nargs = 4;
36091 mask_pos = 1;
36092 nargs_constant = 1;
36093 break;
36094 case V2DI_FTYPE_V2DI_V2DI_UINT_UINT:
36095 nargs = 4;
36096 nargs_constant = 2;
36097 break;
36098 case UCHAR_FTYPE_UCHAR_UINT_UINT_PUNSIGNED:
36099 case UCHAR_FTYPE_UCHAR_ULONGLONG_ULONGLONG_PULONGLONG:
36100 nargs = 4;
36101 break;
36102 case UQI_FTYPE_V8DI_V8DI_INT_UQI:
36103 case UHI_FTYPE_V16SI_V16SI_INT_UHI:
36104 mask_pos = 1;
36105 nargs = 4;
36106 nargs_constant = 1;
36107 break;
36108 case V8SF_FTYPE_V8SF_INT_V8SF_UQI:
36109 case V4SF_FTYPE_V4SF_INT_V4SF_UQI:
36110 case V2DF_FTYPE_V4DF_INT_V2DF_UQI:
36111 case V2DI_FTYPE_V4DI_INT_V2DI_UQI:
36112 case V8SF_FTYPE_V16SF_INT_V8SF_UQI:
36113 case V8SI_FTYPE_V16SI_INT_V8SI_UQI:
36114 case V2DF_FTYPE_V8DF_INT_V2DF_UQI:
36115 case V2DI_FTYPE_V8DI_INT_V2DI_UQI:
36116 case V4SF_FTYPE_V8SF_INT_V4SF_UQI:
36117 case V4SI_FTYPE_V8SI_INT_V4SI_UQI:
36118 case V8HI_FTYPE_V8SF_INT_V8HI_UQI:
36119 case V8HI_FTYPE_V4SF_INT_V8HI_UQI:
36120 case V32HI_FTYPE_V32HI_INT_V32HI_USI:
36121 case V16HI_FTYPE_V16HI_INT_V16HI_UHI:
36122 case V8HI_FTYPE_V8HI_INT_V8HI_UQI:
36123 case V4DI_FTYPE_V4DI_INT_V4DI_UQI:
36124 case V2DI_FTYPE_V2DI_INT_V2DI_UQI:
36125 case V8SI_FTYPE_V8SI_INT_V8SI_UQI:
36126 case V4SI_FTYPE_V4SI_INT_V4SI_UQI:
36127 case V4DF_FTYPE_V4DF_INT_V4DF_UQI:
36128 case V2DF_FTYPE_V2DF_INT_V2DF_UQI:
36129 case V8DF_FTYPE_V8DF_INT_V8DF_UQI:
36130 case V16SF_FTYPE_V16SF_INT_V16SF_UHI:
36131 case V16HI_FTYPE_V16SF_INT_V16HI_UHI:
36132 case V16SI_FTYPE_V16SI_INT_V16SI_UHI:
36133 case V4SI_FTYPE_V16SI_INT_V4SI_UQI:
36134 case V4DI_FTYPE_V8DI_INT_V4DI_UQI:
36135 case V4DF_FTYPE_V8DF_INT_V4DF_UQI:
36136 case V4SF_FTYPE_V16SF_INT_V4SF_UQI:
36137 case V8DI_FTYPE_V8DI_INT_V8DI_UQI:
36138 nargs = 4;
36139 mask_pos = 2;
36140 nargs_constant = 1;
36141 break;
36142 case V16SF_FTYPE_V16SF_V4SF_INT_V16SF_UHI:
36143 case V16SI_FTYPE_V16SI_V4SI_INT_V16SI_UHI:
36144 case V8DF_FTYPE_V8DF_V8DF_INT_V8DF_UQI:
36145 case V8DI_FTYPE_V8DI_V8DI_INT_V8DI_UQI:
36146 case V16SF_FTYPE_V16SF_V16SF_INT_V16SF_UHI:
36147 case V16SI_FTYPE_V16SI_V16SI_INT_V16SI_UHI:
36148 case V4SF_FTYPE_V4SF_V4SF_INT_V4SF_UQI:
36149 case V2DF_FTYPE_V2DF_V2DF_INT_V2DF_UQI:
36150 case V8DF_FTYPE_V8DF_V4DF_INT_V8DF_UQI:
36151 case V8DI_FTYPE_V8DI_V4DI_INT_V8DI_UQI:
36152 case V4DF_FTYPE_V4DF_V4DF_INT_V4DF_UQI:
36153 case V8SF_FTYPE_V8SF_V8SF_INT_V8SF_UQI:
36154 case V8DF_FTYPE_V8DF_V2DF_INT_V8DF_UQI:
36155 case V8DI_FTYPE_V8DI_V2DI_INT_V8DI_UQI:
36156 case V8SI_FTYPE_V8SI_V8SI_INT_V8SI_UQI:
36157 case V4DI_FTYPE_V4DI_V4DI_INT_V4DI_UQI:
36158 case V4SI_FTYPE_V4SI_V4SI_INT_V4SI_UQI:
36159 case V2DI_FTYPE_V2DI_V2DI_INT_V2DI_UQI:
36160 case V32HI_FTYPE_V64QI_V64QI_INT_V32HI_USI:
36161 case V16HI_FTYPE_V32QI_V32QI_INT_V16HI_UHI:
36162 case V8HI_FTYPE_V16QI_V16QI_INT_V8HI_UQI:
36163 case V16SF_FTYPE_V16SF_V8SF_INT_V16SF_UHI:
36164 case V16SI_FTYPE_V16SI_V8SI_INT_V16SI_UHI:
36165 case V8SF_FTYPE_V8SF_V4SF_INT_V8SF_UQI:
36166 case V8SI_FTYPE_V8SI_V4SI_INT_V8SI_UQI:
36167 case V4DI_FTYPE_V4DI_V2DI_INT_V4DI_UQI:
36168 case V4DF_FTYPE_V4DF_V2DF_INT_V4DF_UQI:
36169 nargs = 5;
36170 mask_pos = 2;
36171 nargs_constant = 1;
36172 break;
36173 case V8DI_FTYPE_V8DI_V8DI_V8DI_INT_UQI:
36174 case V16SI_FTYPE_V16SI_V16SI_V16SI_INT_UHI:
36175 case V2DF_FTYPE_V2DF_V2DF_V2DI_INT_UQI:
36176 case V4SF_FTYPE_V4SF_V4SF_V4SI_INT_UQI:
36177 case V8SF_FTYPE_V8SF_V8SF_V8SI_INT_UQI:
36178 case V8SI_FTYPE_V8SI_V8SI_V8SI_INT_UQI:
36179 case V4DF_FTYPE_V4DF_V4DF_V4DI_INT_UQI:
36180 case V4DI_FTYPE_V4DI_V4DI_V4DI_INT_UQI:
36181 case V4SI_FTYPE_V4SI_V4SI_V4SI_INT_UQI:
36182 case V2DI_FTYPE_V2DI_V2DI_V2DI_INT_UQI:
36183 nargs = 5;
36184 mask_pos = 1;
36185 nargs_constant = 1;
36186 break;
36187
36188 default:
36189 gcc_unreachable ();
36190 }
36191
36192 gcc_assert (nargs <= ARRAY_SIZE (args));
36193
36194 if (comparison != UNKNOWN)
36195 {
36196 gcc_assert (nargs == 2);
36197 return ix86_expand_sse_compare (d, exp, target, swap);
36198 }
36199
36200 if (rmode == VOIDmode || rmode == tmode)
36201 {
36202 if (optimize
36203 || target == 0
36204 || GET_MODE (target) != tmode
36205 || !insn_p->operand[0].predicate (target, tmode))
36206 target = gen_reg_rtx (tmode);
36207 else if (memory_operand (target, tmode))
36208 num_memory++;
36209 real_target = target;
36210 }
36211 else
36212 {
36213 real_target = gen_reg_rtx (tmode);
36214 target = lowpart_subreg (rmode, real_target, tmode);
36215 }
36216
36217 for (i = 0; i < nargs; i++)
36218 {
36219 tree arg = CALL_EXPR_ARG (exp, i);
36220 rtx op = expand_normal (arg);
36221 machine_mode mode = insn_p->operand[i + 1].mode;
36222 bool match = insn_p->operand[i + 1].predicate (op, mode);
36223
36224 if (second_arg_count && i == 1)
36225 {
36226 /* SIMD shift insns take either an 8-bit immediate or a
36227 register as the count, but the builtin functions take an
36228 int as the count.  If the count doesn't match, put it in a
36229 register.  The instructions use a 64-bit count; if op is
36230 only 32-bit, zero-extend it, since negative shift counts
36231 are undefined behavior and zero-extension is more
36232 efficient.  */
36233 if (!match)
36234 {
36235 if (SCALAR_INT_MODE_P (GET_MODE (op)))
36236 op = convert_modes (mode, GET_MODE (op), op, 1);
36237 else
36238 op = lowpart_subreg (mode, op, GET_MODE (op));
36239 if (!insn_p->operand[i + 1].predicate (op, mode))
36240 op = copy_to_reg (op);
36241 }
36242 }
36243 else if ((mask_pos && (nargs - i - mask_pos) == nargs_constant)
36244 || (!mask_pos && (nargs - i) <= nargs_constant))
36245 {
36246 if (!match)
36247 switch (icode)
36248 {
36249 case CODE_FOR_avx_vinsertf128v4di:
36250 case CODE_FOR_avx_vextractf128v4di:
36251 error ("the last argument must be a 1-bit immediate");
36252 return const0_rtx;
36253
36254 case CODE_FOR_avx512f_cmpv8di3_mask:
36255 case CODE_FOR_avx512f_cmpv16si3_mask:
36256 case CODE_FOR_avx512f_ucmpv8di3_mask:
36257 case CODE_FOR_avx512f_ucmpv16si3_mask:
36258 case CODE_FOR_avx512vl_cmpv4di3_mask:
36259 case CODE_FOR_avx512vl_cmpv8si3_mask:
36260 case CODE_FOR_avx512vl_ucmpv4di3_mask:
36261 case CODE_FOR_avx512vl_ucmpv8si3_mask:
36262 case CODE_FOR_avx512vl_cmpv2di3_mask:
36263 case CODE_FOR_avx512vl_cmpv4si3_mask:
36264 case CODE_FOR_avx512vl_ucmpv2di3_mask:
36265 case CODE_FOR_avx512vl_ucmpv4si3_mask:
36266 error ("the last argument must be a 3-bit immediate");
36267 return const0_rtx;
36268
36269 case CODE_FOR_sse4_1_roundsd:
36270 case CODE_FOR_sse4_1_roundss:
36271
36272 case CODE_FOR_sse4_1_roundpd:
36273 case CODE_FOR_sse4_1_roundps:
36274 case CODE_FOR_avx_roundpd256:
36275 case CODE_FOR_avx_roundps256:
36276
36277 case CODE_FOR_sse4_1_roundpd_vec_pack_sfix:
36278 case CODE_FOR_sse4_1_roundps_sfix:
36279 case CODE_FOR_avx_roundpd_vec_pack_sfix256:
36280 case CODE_FOR_avx_roundps_sfix256:
36281
36282 case CODE_FOR_sse4_1_blendps:
36283 case CODE_FOR_avx_blendpd256:
36284 case CODE_FOR_avx_vpermilv4df:
36285 case CODE_FOR_avx_vpermilv4df_mask:
36286 case CODE_FOR_avx512f_getmantv8df_mask:
36287 case CODE_FOR_avx512f_getmantv16sf_mask:
36288 case CODE_FOR_avx512vl_getmantv8sf_mask:
36289 case CODE_FOR_avx512vl_getmantv4df_mask:
36290 case CODE_FOR_avx512vl_getmantv4sf_mask:
36291 case CODE_FOR_avx512vl_getmantv2df_mask:
36292 case CODE_FOR_avx512dq_rangepv8df_mask_round:
36293 case CODE_FOR_avx512dq_rangepv16sf_mask_round:
36294 case CODE_FOR_avx512dq_rangepv4df_mask:
36295 case CODE_FOR_avx512dq_rangepv8sf_mask:
36296 case CODE_FOR_avx512dq_rangepv2df_mask:
36297 case CODE_FOR_avx512dq_rangepv4sf_mask:
36298 case CODE_FOR_avx_shufpd256_mask:
36299 error ("the last argument must be a 4-bit immediate");
36300 return const0_rtx;
36301
36302 case CODE_FOR_sha1rnds4:
36303 case CODE_FOR_sse4_1_blendpd:
36304 case CODE_FOR_avx_vpermilv2df:
36305 case CODE_FOR_avx_vpermilv2df_mask:
36306 case CODE_FOR_xop_vpermil2v2df3:
36307 case CODE_FOR_xop_vpermil2v4sf3:
36308 case CODE_FOR_xop_vpermil2v4df3:
36309 case CODE_FOR_xop_vpermil2v8sf3:
36310 case CODE_FOR_avx512f_vinsertf32x4_mask:
36311 case CODE_FOR_avx512f_vinserti32x4_mask:
36312 case CODE_FOR_avx512f_vextractf32x4_mask:
36313 case CODE_FOR_avx512f_vextracti32x4_mask:
36314 case CODE_FOR_sse2_shufpd:
36315 case CODE_FOR_sse2_shufpd_mask:
36316 case CODE_FOR_avx512dq_shuf_f64x2_mask:
36317 case CODE_FOR_avx512dq_shuf_i64x2_mask:
36318 case CODE_FOR_avx512vl_shuf_i32x4_mask:
36319 case CODE_FOR_avx512vl_shuf_f32x4_mask:
36320 error ("the last argument must be a 2-bit immediate");
36321 return const0_rtx;
36322
36323 case CODE_FOR_avx_vextractf128v4df:
36324 case CODE_FOR_avx_vextractf128v8sf:
36325 case CODE_FOR_avx_vextractf128v8si:
36326 case CODE_FOR_avx_vinsertf128v4df:
36327 case CODE_FOR_avx_vinsertf128v8sf:
36328 case CODE_FOR_avx_vinsertf128v8si:
36329 case CODE_FOR_avx512f_vinsertf64x4_mask:
36330 case CODE_FOR_avx512f_vinserti64x4_mask:
36331 case CODE_FOR_avx512f_vextractf64x4_mask:
36332 case CODE_FOR_avx512f_vextracti64x4_mask:
36333 case CODE_FOR_avx512dq_vinsertf32x8_mask:
36334 case CODE_FOR_avx512dq_vinserti32x8_mask:
36335 case CODE_FOR_avx512vl_vinsertv4df:
36336 case CODE_FOR_avx512vl_vinsertv4di:
36337 case CODE_FOR_avx512vl_vinsertv8sf:
36338 case CODE_FOR_avx512vl_vinsertv8si:
36339 error ("the last argument must be a 1-bit immediate");
36340 return const0_rtx;
36341
36342 case CODE_FOR_avx_vmcmpv2df3:
36343 case CODE_FOR_avx_vmcmpv4sf3:
36344 case CODE_FOR_avx_cmpv2df3:
36345 case CODE_FOR_avx_cmpv4sf3:
36346 case CODE_FOR_avx_cmpv4df3:
36347 case CODE_FOR_avx_cmpv8sf3:
36348 case CODE_FOR_avx512f_cmpv8df3_mask:
36349 case CODE_FOR_avx512f_cmpv16sf3_mask:
36350 case CODE_FOR_avx512f_vmcmpv2df3_mask:
36351 case CODE_FOR_avx512f_vmcmpv4sf3_mask:
36352 error ("the last argument must be a 5-bit immediate");
36353 return const0_rtx;
36354
36355 default:
36356 switch (nargs_constant)
36357 {
36358 case 2:
36359 if ((mask_pos && (nargs - i - mask_pos) == nargs_constant)
36360 || (!mask_pos && (nargs - i) == nargs_constant))
36361 {
36362 error ("the next to last argument must be an 8-bit immediate");
36363 break;
36364 }
36365 /* FALLTHRU */
36366 case 1:
36367 error ("the last argument must be an 8-bit immediate");
36368 break;
36369 default:
36370 gcc_unreachable ();
36371 }
36372 return const0_rtx;
36373 }
36374 }
36375 else
36376 {
36377 if (VECTOR_MODE_P (mode))
36378 op = safe_vector_operand (op, mode);
36379
36380 /* If we aren't optimizing, only allow one memory operand to
36381 be generated. */
36382 if (memory_operand (op, mode))
36383 num_memory++;
36384
36385 op = fixup_modeless_constant (op, mode);
36386
36387 if (GET_MODE (op) == mode || GET_MODE (op) == VOIDmode)
36388 {
36389 if (optimize || !match || num_memory > 1)
36390 op = copy_to_mode_reg (mode, op);
36391 }
36392 else
36393 {
36394 op = copy_to_reg (op);
36395 op = lowpart_subreg (mode, op, GET_MODE (op));
36396 }
36397 }
36398
36399 args[i].op = op;
36400 args[i].mode = mode;
36401 }
36402
36403 switch (nargs)
36404 {
36405 case 1:
36406 pat = GEN_FCN (icode) (real_target, args[0].op);
36407 break;
36408 case 2:
36409 pat = GEN_FCN (icode) (real_target, args[0].op, args[1].op);
36410 break;
36411 case 3:
36412 pat = GEN_FCN (icode) (real_target, args[0].op, args[1].op,
36413 args[2].op);
36414 break;
36415 case 4:
36416 pat = GEN_FCN (icode) (real_target, args[0].op, args[1].op,
36417 args[2].op, args[3].op);
36418 break;
36419 case 5:
36420 pat = GEN_FCN (icode) (real_target, args[0].op, args[1].op,
36421 args[2].op, args[3].op, args[4].op);
36422 break;
36423 case 6:
36424 pat = GEN_FCN (icode) (real_target, args[0].op, args[1].op,
36425 args[2].op, args[3].op, args[4].op,
36426 args[5].op);
36427 break;
36428 default:
36429 gcc_unreachable ();
36430 }
36431
36432 if (! pat)
36433 return 0;
36434
36435 emit_insn (pat);
36436 return target;
36437 }
36438
36439 /* Transform a pattern of the following layout:
36440 (parallel [
36441 (set (A B))
36442 (unspec [C] UNSPEC_EMBEDDED_ROUNDING)
36443 ])
36444 into:
36445 (set (A B))
36446
36447 Or:
36448 (parallel [ A B
36449 ...
36450 (unspec [C] UNSPEC_EMBEDDED_ROUNDING)
36451 ...
36452 ])
36453 into:
36454 (parallel [ A B ... ]) */
36455
36456 static rtx
36457 ix86_erase_embedded_rounding (rtx pat)
36458 {
36459 if (GET_CODE (pat) == INSN)
36460 pat = PATTERN (pat);
36461
36462 gcc_assert (GET_CODE (pat) == PARALLEL);
36463
36464 if (XVECLEN (pat, 0) == 2)
36465 {
36466 rtx p0 = XVECEXP (pat, 0, 0);
36467 rtx p1 = XVECEXP (pat, 0, 1);
36468
36469 gcc_assert (GET_CODE (p0) == SET
36470 && GET_CODE (p1) == UNSPEC
36471 && XINT (p1, 1) == UNSPEC_EMBEDDED_ROUNDING);
36472
36473 return p0;
36474 }
36475 else
36476 {
36477 rtx *res = XALLOCAVEC (rtx, XVECLEN (pat, 0));
36478 int i = 0;
36479 int j = 0;
36480
36481 for (; i < XVECLEN (pat, 0); ++i)
36482 {
36483 rtx elem = XVECEXP (pat, 0, i);
36484 if (GET_CODE (elem) != UNSPEC
36485 || XINT (elem, 1) != UNSPEC_EMBEDDED_ROUNDING)
36486 res[j++] = elem;
36487 }
36488
36489 /* No more than 1 occurrence was removed.  */
36490 gcc_assert (j >= XVECLEN (pat, 0) - 1);
36491
36492 return gen_rtx_PARALLEL (GET_MODE (pat), gen_rtvec_v (j, res));
36493 }
36494 }
36495
36496 /* Subroutine of ix86_expand_round_builtin to take care of comi insns
36497 with rounding. */
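/* For instance (illustrative example): _mm_comi_round_ss from
   avx512fintrin.h arrives here as
       __builtin_ia32_vcomiss (a, b, predicate, rounding)
   where PREDICATE indexes the comparison tables below and ROUNDING
   must be _MM_FROUND_CUR_DIRECTION or _MM_FROUND_NO_EXC.  */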
36498 static rtx
36499 ix86_expand_sse_comi_round (const struct builtin_description *d,
36500 tree exp, rtx target)
36501 {
36502 rtx pat, set_dst;
36503 tree arg0 = CALL_EXPR_ARG (exp, 0);
36504 tree arg1 = CALL_EXPR_ARG (exp, 1);
36505 tree arg2 = CALL_EXPR_ARG (exp, 2);
36506 tree arg3 = CALL_EXPR_ARG (exp, 3);
36507 rtx op0 = expand_normal (arg0);
36508 rtx op1 = expand_normal (arg1);
36509 rtx op2 = expand_normal (arg2);
36510 rtx op3 = expand_normal (arg3);
36511 enum insn_code icode = d->icode;
36512 const struct insn_data_d *insn_p = &insn_data[icode];
36513 machine_mode mode0 = insn_p->operand[0].mode;
36514 machine_mode mode1 = insn_p->operand[1].mode;
36515 enum rtx_code comparison = UNEQ;
36516 bool need_ucomi = false;
36517
36518 /* See avxintrin.h for values. */
36519 enum rtx_code comi_comparisons[32] =
36520 {
36521 UNEQ, GT, GE, UNORDERED, LTGT, UNLE, UNLT, ORDERED, UNEQ, UNLT,
36522 UNLE, LT, LTGT, GE, GT, LT, UNEQ, GT, GE, UNORDERED, LTGT, UNLE,
36523 UNLT, ORDERED, UNEQ, UNLT, UNLE, LT, LTGT, GE, GT, LT
36524 };
36525 bool need_ucomi_values[32] =
36526 {
36527 true, false, false, true, true, false, false, true,
36528 true, false, false, true, true, false, false, true,
36529 false, true, true, false, false, true, true, false,
36530 false, true, true, false, false, true, true, false
36531 };
36532
36533 if (!CONST_INT_P (op2))
36534 {
36535 error ("the third argument must be a comparison constant");
36536 return const0_rtx;
36537 }
36538 if (INTVAL (op2) < 0 || INTVAL (op2) >= 32)
36539 {
36540 error ("incorrect comparison mode");
36541 return const0_rtx;
36542 }
36543
36544 if (!insn_p->operand[2].predicate (op3, SImode))
36545 {
36546 error ("incorrect rounding operand");
36547 return const0_rtx;
36548 }
36549
36550 comparison = comi_comparisons[INTVAL (op2)];
36551 need_ucomi = need_ucomi_values[INTVAL (op2)];
36552
36553 if (VECTOR_MODE_P (mode0))
36554 op0 = safe_vector_operand (op0, mode0);
36555 if (VECTOR_MODE_P (mode1))
36556 op1 = safe_vector_operand (op1, mode1);
36557
36558 target = gen_reg_rtx (SImode);
36559 emit_move_insn (target, const0_rtx);
36560 target = gen_rtx_SUBREG (QImode, target, 0);
36561
36562 if ((optimize && !register_operand (op0, mode0))
36563 || !insn_p->operand[0].predicate (op0, mode0))
36564 op0 = copy_to_mode_reg (mode0, op0);
36565 if ((optimize && !register_operand (op1, mode1))
36566 || !insn_p->operand[1].predicate (op1, mode1))
36567 op1 = copy_to_mode_reg (mode1, op1);
36568
36569 if (need_ucomi)
36570 icode = icode == CODE_FOR_sse_comi_round
36571 ? CODE_FOR_sse_ucomi_round
36572 : CODE_FOR_sse2_ucomi_round;
36573
36574 pat = GEN_FCN (icode) (op0, op1, op3);
36575 if (! pat)
36576 return 0;
36577
36578 /* Rounding operand can be either NO_ROUND or ROUND_SAE at this point. */
36579 if (INTVAL (op3) == NO_ROUND)
36580 {
36581 pat = ix86_erase_embedded_rounding (pat);
36582 if (! pat)
36583 return 0;
36584
36585 set_dst = SET_DEST (pat);
36586 }
36587 else
36588 {
36589 gcc_assert (GET_CODE (XVECEXP (pat, 0, 0)) == SET);
36590 set_dst = SET_DEST (XVECEXP (pat, 0, 0));
36591 }
36592
36593 emit_insn (pat);
36594 emit_insn (gen_rtx_SET (gen_rtx_STRICT_LOW_PART (VOIDmode, target),
36595 gen_rtx_fmt_ee (comparison, QImode,
36596 set_dst,
36597 const0_rtx)));
36598
36599 return SUBREG_REG (target);
36600 }
36601
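/* Subroutine of ix86_expand_builtin to take care of insns with an
   embedded rounding operand passed as the last argument.  */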
36602 static rtx
36603 ix86_expand_round_builtin (const struct builtin_description *d,
36604 tree exp, rtx target)
36605 {
36606 rtx pat;
36607 unsigned int i, nargs;
36608 struct
36609 {
36610 rtx op;
36611 machine_mode mode;
36612 } args[6];
36613 enum insn_code icode = d->icode;
36614 const struct insn_data_d *insn_p = &insn_data[icode];
36615 machine_mode tmode = insn_p->operand[0].mode;
36616 unsigned int nargs_constant = 0;
36617 unsigned int redundant_embed_rnd = 0;
36618
36619 switch ((enum ix86_builtin_func_type) d->flag)
36620 {
36621 case UINT64_FTYPE_V2DF_INT:
36622 case UINT64_FTYPE_V4SF_INT:
36623 case UINT_FTYPE_V2DF_INT:
36624 case UINT_FTYPE_V4SF_INT:
36625 case INT64_FTYPE_V2DF_INT:
36626 case INT64_FTYPE_V4SF_INT:
36627 case INT_FTYPE_V2DF_INT:
36628 case INT_FTYPE_V4SF_INT:
36629 nargs = 2;
36630 break;
36631 case V4SF_FTYPE_V4SF_UINT_INT:
36632 case V4SF_FTYPE_V4SF_UINT64_INT:
36633 case V2DF_FTYPE_V2DF_UINT64_INT:
36634 case V4SF_FTYPE_V4SF_INT_INT:
36635 case V4SF_FTYPE_V4SF_INT64_INT:
36636 case V2DF_FTYPE_V2DF_INT64_INT:
36637 case V4SF_FTYPE_V4SF_V4SF_INT:
36638 case V2DF_FTYPE_V2DF_V2DF_INT:
36639 case V4SF_FTYPE_V4SF_V2DF_INT:
36640 case V2DF_FTYPE_V2DF_V4SF_INT:
36641 nargs = 3;
36642 break;
36643 case V8SF_FTYPE_V8DF_V8SF_QI_INT:
36644 case V8DF_FTYPE_V8DF_V8DF_QI_INT:
36645 case V8SI_FTYPE_V8DF_V8SI_QI_INT:
36646 case V8DI_FTYPE_V8DF_V8DI_QI_INT:
36647 case V8SF_FTYPE_V8DI_V8SF_QI_INT:
36648 case V8DF_FTYPE_V8DI_V8DF_QI_INT:
36649 case V16SF_FTYPE_V16SF_V16SF_HI_INT:
36650 case V8DI_FTYPE_V8SF_V8DI_QI_INT:
36651 case V16SF_FTYPE_V16SI_V16SF_HI_INT:
36652 case V16SI_FTYPE_V16SF_V16SI_HI_INT:
36653 case V8DF_FTYPE_V8SF_V8DF_QI_INT:
36654 case V16SF_FTYPE_V16HI_V16SF_HI_INT:
36655 case V2DF_FTYPE_V2DF_V2DF_V2DF_INT:
36656 case V4SF_FTYPE_V4SF_V4SF_V4SF_INT:
36657 nargs = 4;
36658 break;
36659 case V4SF_FTYPE_V4SF_V4SF_INT_INT:
36660 case V2DF_FTYPE_V2DF_V2DF_INT_INT:
36661 nargs_constant = 2;
36662 nargs = 4;
36663 break;
36664 case INT_FTYPE_V4SF_V4SF_INT_INT:
36665 case INT_FTYPE_V2DF_V2DF_INT_INT:
36666 return ix86_expand_sse_comi_round (d, exp, target);
36667 case V8DF_FTYPE_V8DF_V8DF_V8DF_UQI_INT:
36668 case V2DF_FTYPE_V2DF_V2DF_V2DF_UQI_INT:
36669 case V4SF_FTYPE_V4SF_V4SF_V4SF_UQI_INT:
36670 case V16SF_FTYPE_V16SF_V16SF_V16SF_HI_INT:
36671 case V2DF_FTYPE_V2DF_V2DF_V2DF_QI_INT:
36672 case V2DF_FTYPE_V2DF_V4SF_V2DF_QI_INT:
36673 case V4SF_FTYPE_V4SF_V4SF_V4SF_QI_INT:
36674 case V4SF_FTYPE_V4SF_V2DF_V4SF_QI_INT:
36675 nargs = 5;
36676 break;
36677 case V16SF_FTYPE_V16SF_INT_V16SF_HI_INT:
36678 case V8DF_FTYPE_V8DF_INT_V8DF_QI_INT:
36679 nargs_constant = 4;
36680 nargs = 5;
36681 break;
36682 case UQI_FTYPE_V8DF_V8DF_INT_UQI_INT:
36683 case UQI_FTYPE_V2DF_V2DF_INT_UQI_INT:
36684 case UHI_FTYPE_V16SF_V16SF_INT_UHI_INT:
36685 case UQI_FTYPE_V4SF_V4SF_INT_UQI_INT:
36686 nargs_constant = 3;
36687 nargs = 5;
36688 break;
36689 case V16SF_FTYPE_V16SF_V16SF_INT_V16SF_HI_INT:
36690 case V8DF_FTYPE_V8DF_V8DF_INT_V8DF_QI_INT:
36691 case V4SF_FTYPE_V4SF_V4SF_INT_V4SF_QI_INT:
36692 case V2DF_FTYPE_V2DF_V2DF_INT_V2DF_QI_INT:
36693 nargs = 6;
36694 nargs_constant = 4;
36695 break;
36696 case V8DF_FTYPE_V8DF_V8DF_V8DI_INT_QI_INT:
36697 case V16SF_FTYPE_V16SF_V16SF_V16SI_INT_HI_INT:
36698 case V2DF_FTYPE_V2DF_V2DF_V2DI_INT_QI_INT:
36699 case V4SF_FTYPE_V4SF_V4SF_V4SI_INT_QI_INT:
36700 nargs = 6;
36701 nargs_constant = 3;
36702 break;
36703 default:
36704 gcc_unreachable ();
36705 }
36706 gcc_assert (nargs <= ARRAY_SIZE (args));
36707
36708 if (optimize
36709 || target == 0
36710 || GET_MODE (target) != tmode
36711 || !insn_p->operand[0].predicate (target, tmode))
36712 target = gen_reg_rtx (tmode);
36713
36714 for (i = 0; i < nargs; i++)
36715 {
36716 tree arg = CALL_EXPR_ARG (exp, i);
36717 rtx op = expand_normal (arg);
36718 machine_mode mode = insn_p->operand[i + 1].mode;
36719 bool match = insn_p->operand[i + 1].predicate (op, mode);
36720
36721 if (i == nargs - nargs_constant)
36722 {
36723 if (!match)
36724 {
36725 switch (icode)
36726 {
36727 case CODE_FOR_avx512f_getmantv8df_mask_round:
36728 case CODE_FOR_avx512f_getmantv16sf_mask_round:
36729 case CODE_FOR_avx512f_vgetmantv2df_round:
36730 case CODE_FOR_avx512f_vgetmantv4sf_round:
36731 error ("the immediate argument must be a 4-bit immediate");
36732 return const0_rtx;
36733 case CODE_FOR_avx512f_cmpv8df3_mask_round:
36734 case CODE_FOR_avx512f_cmpv16sf3_mask_round:
36735 case CODE_FOR_avx512f_vmcmpv2df3_mask_round:
36736 case CODE_FOR_avx512f_vmcmpv4sf3_mask_round:
36737 error ("the immediate argument must be a 5-bit immediate");
36738 return const0_rtx;
36739 default:
36740 error ("the immediate argument must be an 8-bit immediate");
36741 return const0_rtx;
36742 }
36743 }
36744 }
36745 else if (i == nargs - 1)
36746 {
36747 if (!insn_p->operand[nargs].predicate (op, SImode))
36748 {
36749 error ("incorrect rounding operand");
36750 return const0_rtx;
36751 }
36752
36753 /* If there is no rounding, use the normal version of the pattern.  */
36754 if (INTVAL (op) == NO_ROUND)
36755 redundant_embed_rnd = 1;
36756 }
36757 else
36758 {
36759 if (VECTOR_MODE_P (mode))
36760 op = safe_vector_operand (op, mode);
36761
36762 op = fixup_modeless_constant (op, mode);
36763
36764 if (GET_MODE (op) == mode || GET_MODE (op) == VOIDmode)
36765 {
36766 if (optimize || !match)
36767 op = copy_to_mode_reg (mode, op);
36768 }
36769 else
36770 {
36771 op = copy_to_reg (op);
36772 op = lowpart_subreg (mode, op, GET_MODE (op));
36773 }
36774 }
36775
36776 args[i].op = op;
36777 args[i].mode = mode;
36778 }
36779
36780 switch (nargs)
36781 {
36782 case 1:
36783 pat = GEN_FCN (icode) (target, args[0].op);
36784 break;
36785 case 2:
36786 pat = GEN_FCN (icode) (target, args[0].op, args[1].op);
36787 break;
36788 case 3:
36789 pat = GEN_FCN (icode) (target, args[0].op, args[1].op,
36790 args[2].op);
36791 break;
36792 case 4:
36793 pat = GEN_FCN (icode) (target, args[0].op, args[1].op,
36794 args[2].op, args[3].op);
36795 break;
36796 case 5:
36797 pat = GEN_FCN (icode) (target, args[0].op, args[1].op,
36798 args[2].op, args[3].op, args[4].op);
36799 break;
36800 case 6:
36801 pat = GEN_FCN (icode) (target, args[0].op, args[1].op,
36802 args[2].op, args[3].op, args[4].op,
36803 args[5].op);
36804 break;
36805 default:
36806 gcc_unreachable ();
36807 }
36808
36809 if (!pat)
36810 return 0;
36811
36812 if (redundant_embed_rnd)
36813 pat = ix86_erase_embedded_rounding (pat);
36814
36815 emit_insn (pat);
36816 return target;
36817 }
36818
36819 /* Subroutine of ix86_expand_builtin to take care of special insns
36820 with variable number of operands. */
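/* For instance (illustrative example): __builtin_ia32_movntps from
   xmmintrin.h is VOID_FTYPE_PFLOAT_V4SF below, a "store" case whose
   CODE_FOR_sse_movntv4sf entry also forces the strict memory
   alignment required by MOVNTPS.  */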
36821
36822 static rtx
36823 ix86_expand_special_args_builtin (const struct builtin_description *d,
36824 tree exp, rtx target)
36825 {
36826 tree arg;
36827 rtx pat, op;
36828 unsigned int i, nargs, arg_adjust, memory;
36829 bool aligned_mem = false;
36830 struct
36831 {
36832 rtx op;
36833 machine_mode mode;
36834 } args[3];
36835 enum insn_code icode = d->icode;
36836 bool last_arg_constant = false;
36837 const struct insn_data_d *insn_p = &insn_data[icode];
36838 machine_mode tmode = insn_p->operand[0].mode;
36839 enum { load, store } klass;
36840
36841 switch ((enum ix86_builtin_func_type) d->flag)
36842 {
36843 case VOID_FTYPE_VOID:
36844 emit_insn (GEN_FCN (icode) (target));
36845 return 0;
36846 case VOID_FTYPE_UINT64:
36847 case VOID_FTYPE_UNSIGNED:
36848 nargs = 0;
36849 klass = store;
36850 memory = 0;
36851 break;
36852
36853 case INT_FTYPE_VOID:
36854 case USHORT_FTYPE_VOID:
36855 case UINT64_FTYPE_VOID:
36856 case UNSIGNED_FTYPE_VOID:
36857 nargs = 0;
36858 klass = load;
36859 memory = 0;
36860 break;
36861 case UINT64_FTYPE_PUNSIGNED:
36862 case V2DI_FTYPE_PV2DI:
36863 case V4DI_FTYPE_PV4DI:
36864 case V32QI_FTYPE_PCCHAR:
36865 case V16QI_FTYPE_PCCHAR:
36866 case V8SF_FTYPE_PCV4SF:
36867 case V8SF_FTYPE_PCFLOAT:
36868 case V4SF_FTYPE_PCFLOAT:
36869 case V4DF_FTYPE_PCV2DF:
36870 case V4DF_FTYPE_PCDOUBLE:
36871 case V2DF_FTYPE_PCDOUBLE:
36872 case VOID_FTYPE_PVOID:
36873 case V8DI_FTYPE_PV8DI:
36874 nargs = 1;
36875 klass = load;
36876 memory = 0;
36877 switch (icode)
36878 {
36879 case CODE_FOR_sse4_1_movntdqa:
36880 case CODE_FOR_avx2_movntdqa:
36881 case CODE_FOR_avx512f_movntdqa:
36882 aligned_mem = true;
36883 break;
36884 default:
36885 break;
36886 }
36887 break;
36888 case VOID_FTYPE_PV2SF_V4SF:
36889 case VOID_FTYPE_PV8DI_V8DI:
36890 case VOID_FTYPE_PV4DI_V4DI:
36891 case VOID_FTYPE_PV2DI_V2DI:
36892 case VOID_FTYPE_PCHAR_V32QI:
36893 case VOID_FTYPE_PCHAR_V16QI:
36894 case VOID_FTYPE_PFLOAT_V16SF:
36895 case VOID_FTYPE_PFLOAT_V8SF:
36896 case VOID_FTYPE_PFLOAT_V4SF:
36897 case VOID_FTYPE_PDOUBLE_V8DF:
36898 case VOID_FTYPE_PDOUBLE_V4DF:
36899 case VOID_FTYPE_PDOUBLE_V2DF:
36900 case VOID_FTYPE_PLONGLONG_LONGLONG:
36901 case VOID_FTYPE_PULONGLONG_ULONGLONG:
36902 case VOID_FTYPE_PINT_INT:
36903 nargs = 1;
36904 klass = store;
36905 /* Reserve memory operand for target. */
36906 memory = ARRAY_SIZE (args);
36907 switch (icode)
36908 {
36909 /* These builtins and instructions require the memory
36910 to be properly aligned. */
36911 case CODE_FOR_avx_movntv4di:
36912 case CODE_FOR_sse2_movntv2di:
36913 case CODE_FOR_avx_movntv8sf:
36914 case CODE_FOR_sse_movntv4sf:
36915 case CODE_FOR_sse4a_vmmovntv4sf:
36916 case CODE_FOR_avx_movntv4df:
36917 case CODE_FOR_sse2_movntv2df:
36918 case CODE_FOR_sse4a_vmmovntv2df:
36919 case CODE_FOR_sse2_movntidi:
36920 case CODE_FOR_sse_movntq:
36921 case CODE_FOR_sse2_movntisi:
36922 case CODE_FOR_avx512f_movntv16sf:
36923 case CODE_FOR_avx512f_movntv8df:
36924 case CODE_FOR_avx512f_movntv8di:
36925 aligned_mem = true;
36926 break;
36927 default:
36928 break;
36929 }
36930 break;
36931 case V4SF_FTYPE_V4SF_PCV2SF:
36932 case V2DF_FTYPE_V2DF_PCDOUBLE:
36933 nargs = 2;
36934 klass = load;
36935 memory = 1;
36936 break;
36937 case V8SF_FTYPE_PCV8SF_V8SI:
36938 case V4DF_FTYPE_PCV4DF_V4DI:
36939 case V4SF_FTYPE_PCV4SF_V4SI:
36940 case V2DF_FTYPE_PCV2DF_V2DI:
36941 case V8SI_FTYPE_PCV8SI_V8SI:
36942 case V4DI_FTYPE_PCV4DI_V4DI:
36943 case V4SI_FTYPE_PCV4SI_V4SI:
36944 case V2DI_FTYPE_PCV2DI_V2DI:
36945 case VOID_FTYPE_INT_INT64:
36946 nargs = 2;
36947 klass = load;
36948 memory = 0;
36949 break;
36950 case VOID_FTYPE_PV8DF_V8DF_UQI:
36951 case VOID_FTYPE_PV4DF_V4DF_UQI:
36952 case VOID_FTYPE_PV2DF_V2DF_UQI:
36953 case VOID_FTYPE_PV16SF_V16SF_UHI:
36954 case VOID_FTYPE_PV8SF_V8SF_UQI:
36955 case VOID_FTYPE_PV4SF_V4SF_UQI:
36956 case VOID_FTYPE_PV8DI_V8DI_UQI:
36957 case VOID_FTYPE_PV4DI_V4DI_UQI:
36958 case VOID_FTYPE_PV2DI_V2DI_UQI:
36959 case VOID_FTYPE_PV16SI_V16SI_UHI:
36960 case VOID_FTYPE_PV8SI_V8SI_UQI:
36961 case VOID_FTYPE_PV4SI_V4SI_UQI:
36962 switch (icode)
36963 {
36964 /* These builtins and instructions require the memory
36965 to be properly aligned. */
36966 case CODE_FOR_avx512f_storev16sf_mask:
36967 case CODE_FOR_avx512f_storev16si_mask:
36968 case CODE_FOR_avx512f_storev8df_mask:
36969 case CODE_FOR_avx512f_storev8di_mask:
36970 case CODE_FOR_avx512vl_storev8sf_mask:
36971 case CODE_FOR_avx512vl_storev8si_mask:
36972 case CODE_FOR_avx512vl_storev4df_mask:
36973 case CODE_FOR_avx512vl_storev4di_mask:
36974 case CODE_FOR_avx512vl_storev4sf_mask:
36975 case CODE_FOR_avx512vl_storev4si_mask:
36976 case CODE_FOR_avx512vl_storev2df_mask:
36977 case CODE_FOR_avx512vl_storev2di_mask:
36978 aligned_mem = true;
36979 break;
36980 default:
36981 break;
36982 }
36983 /* FALLTHRU */
36984 case VOID_FTYPE_PV8SF_V8SI_V8SF:
36985 case VOID_FTYPE_PV4DF_V4DI_V4DF:
36986 case VOID_FTYPE_PV4SF_V4SI_V4SF:
36987 case VOID_FTYPE_PV2DF_V2DI_V2DF:
36988 case VOID_FTYPE_PV8SI_V8SI_V8SI:
36989 case VOID_FTYPE_PV4DI_V4DI_V4DI:
36990 case VOID_FTYPE_PV4SI_V4SI_V4SI:
36991 case VOID_FTYPE_PV2DI_V2DI_V2DI:
36992 case VOID_FTYPE_PV8SI_V8DI_UQI:
36993 case VOID_FTYPE_PV8HI_V8DI_UQI:
36994 case VOID_FTYPE_PV16HI_V16SI_UHI:
36995 case VOID_FTYPE_PV16QI_V8DI_UQI:
36996 case VOID_FTYPE_PV16QI_V16SI_UHI:
36997 case VOID_FTYPE_PV4SI_V4DI_UQI:
36998 case VOID_FTYPE_PV4SI_V2DI_UQI:
36999 case VOID_FTYPE_PV8HI_V4DI_UQI:
37000 case VOID_FTYPE_PV8HI_V2DI_UQI:
37001 case VOID_FTYPE_PV8HI_V8SI_UQI:
37002 case VOID_FTYPE_PV8HI_V4SI_UQI:
37003 case VOID_FTYPE_PV16QI_V4DI_UQI:
37004 case VOID_FTYPE_PV16QI_V2DI_UQI:
37005 case VOID_FTYPE_PV16QI_V8SI_UQI:
37006 case VOID_FTYPE_PV16QI_V4SI_UQI:
37007 case VOID_FTYPE_PCHAR_V64QI_UDI:
37008 case VOID_FTYPE_PCHAR_V32QI_USI:
37009 case VOID_FTYPE_PCHAR_V16QI_UHI:
37010 case VOID_FTYPE_PSHORT_V32HI_USI:
37011 case VOID_FTYPE_PSHORT_V16HI_UHI:
37012 case VOID_FTYPE_PSHORT_V8HI_UQI:
37013 case VOID_FTYPE_PINT_V16SI_UHI:
37014 case VOID_FTYPE_PINT_V8SI_UQI:
37015 case VOID_FTYPE_PINT_V4SI_UQI:
37016 case VOID_FTYPE_PINT64_V8DI_UQI:
37017 case VOID_FTYPE_PINT64_V4DI_UQI:
37018 case VOID_FTYPE_PINT64_V2DI_UQI:
37019 case VOID_FTYPE_PDOUBLE_V8DF_UQI:
37020 case VOID_FTYPE_PDOUBLE_V4DF_UQI:
37021 case VOID_FTYPE_PDOUBLE_V2DF_UQI:
37022 case VOID_FTYPE_PFLOAT_V16SF_UHI:
37023 case VOID_FTYPE_PFLOAT_V8SF_UQI:
37024 case VOID_FTYPE_PFLOAT_V4SF_UQI:
37025 nargs = 2;
37026 klass = store;
37027 /* Reserve memory operand for target. */
37028 memory = ARRAY_SIZE (args);
37029 break;
37030 case V4SF_FTYPE_PCV4SF_V4SF_UQI:
37031 case V8SF_FTYPE_PCV8SF_V8SF_UQI:
37032 case V16SF_FTYPE_PCV16SF_V16SF_UHI:
37033 case V4SI_FTYPE_PCV4SI_V4SI_UQI:
37034 case V8SI_FTYPE_PCV8SI_V8SI_UQI:
37035 case V16SI_FTYPE_PCV16SI_V16SI_UHI:
37036 case V2DF_FTYPE_PCV2DF_V2DF_UQI:
37037 case V4DF_FTYPE_PCV4DF_V4DF_UQI:
37038 case V8DF_FTYPE_PCV8DF_V8DF_UQI:
37039 case V2DI_FTYPE_PCV2DI_V2DI_UQI:
37040 case V4DI_FTYPE_PCV4DI_V4DI_UQI:
37041 case V8DI_FTYPE_PCV8DI_V8DI_UQI:
37042 switch (icode)
37043 {
37044 /* These builtins and instructions require the memory
37045 to be properly aligned. */
37046 case CODE_FOR_avx512f_loadv16sf_mask:
37047 case CODE_FOR_avx512f_loadv16si_mask:
37048 case CODE_FOR_avx512f_loadv8df_mask:
37049 case CODE_FOR_avx512f_loadv8di_mask:
37050 case CODE_FOR_avx512vl_loadv8sf_mask:
37051 case CODE_FOR_avx512vl_loadv8si_mask:
37052 case CODE_FOR_avx512vl_loadv4df_mask:
37053 case CODE_FOR_avx512vl_loadv4di_mask:
37054 case CODE_FOR_avx512vl_loadv4sf_mask:
37055 case CODE_FOR_avx512vl_loadv4si_mask:
37056 case CODE_FOR_avx512vl_loadv2df_mask:
37057 case CODE_FOR_avx512vl_loadv2di_mask:
37058 case CODE_FOR_avx512bw_loadv64qi_mask:
37059 case CODE_FOR_avx512vl_loadv32qi_mask:
37060 case CODE_FOR_avx512vl_loadv16qi_mask:
37061 case CODE_FOR_avx512bw_loadv32hi_mask:
37062 case CODE_FOR_avx512vl_loadv16hi_mask:
37063 case CODE_FOR_avx512vl_loadv8hi_mask:
37064 aligned_mem = true;
37065 break;
37066 default:
37067 break;
37068 }
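/* FALLTHRU */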
37069 case V64QI_FTYPE_PCCHAR_V64QI_UDI:
37070 case V32QI_FTYPE_PCCHAR_V32QI_USI:
37071 case V16QI_FTYPE_PCCHAR_V16QI_UHI:
37072 case V32HI_FTYPE_PCSHORT_V32HI_USI:
37073 case V16HI_FTYPE_PCSHORT_V16HI_UHI:
37074 case V8HI_FTYPE_PCSHORT_V8HI_UQI:
37075 case V16SI_FTYPE_PCINT_V16SI_UHI:
37076 case V8SI_FTYPE_PCINT_V8SI_UQI:
37077 case V4SI_FTYPE_PCINT_V4SI_UQI:
37078 case V8DI_FTYPE_PCINT64_V8DI_UQI:
37079 case V4DI_FTYPE_PCINT64_V4DI_UQI:
37080 case V2DI_FTYPE_PCINT64_V2DI_UQI:
37081 case V8DF_FTYPE_PCDOUBLE_V8DF_UQI:
37082 case V4DF_FTYPE_PCDOUBLE_V4DF_UQI:
37083 case V2DF_FTYPE_PCDOUBLE_V2DF_UQI:
37084 case V16SF_FTYPE_PCFLOAT_V16SF_UHI:
37085 case V8SF_FTYPE_PCFLOAT_V8SF_UQI:
37086 case V4SF_FTYPE_PCFLOAT_V4SF_UQI:
37087 nargs = 3;
37088 klass = load;
37089 memory = 0;
37090 break;
37091 case VOID_FTYPE_UINT_UINT_UINT:
37092 case VOID_FTYPE_UINT64_UINT_UINT:
37093 case UCHAR_FTYPE_UINT_UINT_UINT:
37094 case UCHAR_FTYPE_UINT64_UINT_UINT:
37095 nargs = 3;
37096 klass = load;
37097 memory = ARRAY_SIZE (args);
37098 last_arg_constant = true;
37099 break;
37100 default:
37101 gcc_unreachable ();
37102 }
37103
37104 gcc_assert (nargs <= ARRAY_SIZE (args));
37105
37106 if (klass == store)
37107 {
37108 arg = CALL_EXPR_ARG (exp, 0);
37109 op = expand_normal (arg);
37110 gcc_assert (target == 0);
37111 if (memory)
37112 {
37113 op = ix86_zero_extend_to_Pmode (op);
37114 target = gen_rtx_MEM (tmode, op);
37115 /* target at this point has just BITS_PER_UNIT MEM_ALIGN
37116 on it. Try to improve it using get_pointer_alignment,
37117 and if the special builtin is one that requires strict
37118 mode alignment, also from its GET_MODE_ALIGNMENT.
37119 Failure to do so could lead to ix86_legitimate_combined_insn
37120 rejecting all changes to such insns. */
37121 unsigned int align = get_pointer_alignment (arg);
37122 if (aligned_mem && align < GET_MODE_ALIGNMENT (tmode))
37123 align = GET_MODE_ALIGNMENT (tmode);
37124 if (MEM_ALIGN (target) < align)
37125 set_mem_align (target, align);
37126 }
37127 else
37128 target = force_reg (tmode, op);
37129 arg_adjust = 1;
37130 }
37131 else
37132 {
37133 arg_adjust = 0;
37134 if (optimize
37135 || target == 0
37136 || !register_operand (target, tmode)
37137 || GET_MODE (target) != tmode)
37138 target = gen_reg_rtx (tmode);
37139 }
37140
37141 for (i = 0; i < nargs; i++)
37142 {
37143 machine_mode mode = insn_p->operand[i + 1].mode;
37144 bool match;
37145
37146 arg = CALL_EXPR_ARG (exp, i + arg_adjust);
37147 op = expand_normal (arg);
37148 match = insn_p->operand[i + 1].predicate (op, mode);
37149
37150 if (last_arg_constant && (i + 1) == nargs)
37151 {
37152 if (!match)
37153 {
37154 if (icode == CODE_FOR_lwp_lwpvalsi3
37155 || icode == CODE_FOR_lwp_lwpinssi3
37156 || icode == CODE_FOR_lwp_lwpvaldi3
37157 || icode == CODE_FOR_lwp_lwpinsdi3)
37158 error ("the last argument must be a 32-bit immediate");
37159 else
37160 error ("the last argument must be an 8-bit immediate");
37161 return const0_rtx;
37162 }
37163 }
37164 else
37165 {
37166 if (i == memory)
37167 {
37168 /* This must be the memory operand. */
37169 op = ix86_zero_extend_to_Pmode (op);
37170 op = gen_rtx_MEM (mode, op);
37171 /* op at this point has just BITS_PER_UNIT MEM_ALIGN
37172 on it. Try to improve it using get_pointer_alignment,
37173 and if the special builtin is one that requires strict
37174 mode alignment, also from its GET_MODE_ALIGNMENT.
37175 Failure to do so could lead to ix86_legitimate_combined_insn
37176 rejecting all changes to such insns. */
37177 unsigned int align = get_pointer_alignment (arg);
37178 if (aligned_mem && align < GET_MODE_ALIGNMENT (mode))
37179 align = GET_MODE_ALIGNMENT (mode);
37180 if (MEM_ALIGN (op) < align)
37181 set_mem_align (op, align);
37182 }
37183 else
37184 {
37185 /* This must be a register. */
37186 if (VECTOR_MODE_P (mode))
37187 op = safe_vector_operand (op, mode);
37188
37189 op = fixup_modeless_constant (op, mode);
37190
37191 if (GET_MODE (op) == mode || GET_MODE (op) == VOIDmode)
37192 op = copy_to_mode_reg (mode, op);
37193 else
37194 {
37195 op = copy_to_reg (op);
37196 op = lowpart_subreg (mode, op, GET_MODE (op));
37197 }
37198 }
37199 }
37200
37201 args[i].op = op;
37202 args[i].mode = mode;
37203 }
37204
37205 switch (nargs)
37206 {
37207 case 0:
37208 pat = GEN_FCN (icode) (target);
37209 break;
37210 case 1:
37211 pat = GEN_FCN (icode) (target, args[0].op);
37212 break;
37213 case 2:
37214 pat = GEN_FCN (icode) (target, args[0].op, args[1].op);
37215 break;
37216 case 3:
37217 pat = GEN_FCN (icode) (target, args[0].op, args[1].op, args[2].op);
37218 break;
37219 default:
37220 gcc_unreachable ();
37221 }
37222
37223 if (! pat)
37224 return 0;
37225 emit_insn (pat);
37226 return klass == store ? 0 : target;
37227 }
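/* A sketch of the effect of the aligned_mem handling above: for a
   strict-alignment masked load such as
     __m512 v = _mm512_mask_load_ps (src, m, p);
   (wrapper name as it typically appears in avx512fintrin.h), the MEM
   built around P has its MEM_ALIGN raised from BITS_PER_UNIT to
   GET_MODE_ALIGNMENT (V16SFmode), so later passes keep the operand
   acceptable to the aligned masked-load pattern.  */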
37228
37229 /* Return the integer constant in ARG. Constrain it to be in the range
37230 of the subparts of VEC_TYPE; issue an error if not. */
37231
37232 static int
37233 get_element_number (tree vec_type, tree arg)
37234 {
37235 unsigned HOST_WIDE_INT elt, max = TYPE_VECTOR_SUBPARTS (vec_type) - 1;
37236
37237 if (!tree_fits_uhwi_p (arg)
37238 || (elt = tree_to_uhwi (arg), elt > max))
37239 {
37240 error ("selector must be an integer constant in the range 0..%wi", max);
37241 return 0;
37242 }
37243
37244 return elt;
37245 }
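/* For example, for a V8HImode vector TYPE_VECTOR_SUBPARTS is 8, so the
   valid selectors are 0..7; a call along the lines of
     __builtin_ia32_vec_ext_v8hi (v, 8)
   is rejected with "selector must be an integer constant in the range
   0..7" and 0 is returned instead (a sketch; the exact builtin spelling
   comes from the wrappers in the *intrin.h headers).  */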
37246
37247 /* A subroutine of ix86_expand_builtin. These builtins are a wrapper around
37248 ix86_expand_vector_init. We DO have language-level syntax for this, in
37249 the form of (type){ init-list }. Except that since we can't place emms
37250 instructions from inside the compiler, we can't allow the use of MMX
37251 registers unless the user explicitly asks for it. So we do *not* define
37252 vec_set/vec_extract/vec_init patterns for MMX modes in mmx.md. Instead
37253 we have builtins invoked by mmintrin.h that give us license to emit
37254 these sorts of instructions. */
37255
37256 static rtx
37257 ix86_expand_vec_init_builtin (tree type, tree exp, rtx target)
37258 {
37259 machine_mode tmode = TYPE_MODE (type);
37260 machine_mode inner_mode = GET_MODE_INNER (tmode);
37261 int i, n_elt = GET_MODE_NUNITS (tmode);
37262 rtvec v = rtvec_alloc (n_elt);
37263
37264 gcc_assert (VECTOR_MODE_P (tmode));
37265 gcc_assert (call_expr_nargs (exp) == n_elt);
37266
37267 for (i = 0; i < n_elt; ++i)
37268 {
37269 rtx x = expand_normal (CALL_EXPR_ARG (exp, i));
37270 RTVEC_ELT (v, i) = gen_lowpart (inner_mode, x);
37271 }
37272
37273 if (!target || !register_operand (target, tmode))
37274 target = gen_reg_rtx (tmode);
37275
37276 ix86_expand_vector_init (true, target, gen_rtx_PARALLEL (tmode, v));
37277 return target;
37278 }
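/* A typical user-level entry point is an MMX set intrinsic; as the
   wrapper usually appears in mmintrin.h (shown as a sketch):
     _mm_set_pi32 (int __i1, int __i0)
       => (__m64) __builtin_ia32_vec_init_v2si (__i1, __i0);
   which reaches this function with two SImode arguments and a V2SImode
   target.  */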
37279
37280 /* A subroutine of ix86_expand_builtin. These builtins are a wrapper around
37281 ix86_expand_vector_extract. They would be redundant (for non-MMX) if we
37282 had a language-level syntax for referencing vector elements. */
37283
37284 static rtx
37285 ix86_expand_vec_ext_builtin (tree exp, rtx target)
37286 {
37287 machine_mode tmode, mode0;
37288 tree arg0, arg1;
37289 int elt;
37290 rtx op0;
37291
37292 arg0 = CALL_EXPR_ARG (exp, 0);
37293 arg1 = CALL_EXPR_ARG (exp, 1);
37294
37295 op0 = expand_normal (arg0);
37296 elt = get_element_number (TREE_TYPE (arg0), arg1);
37297
37298 tmode = TYPE_MODE (TREE_TYPE (TREE_TYPE (arg0)));
37299 mode0 = TYPE_MODE (TREE_TYPE (arg0));
37300 gcc_assert (VECTOR_MODE_P (mode0));
37301
37302 op0 = force_reg (mode0, op0);
37303
37304 if (optimize || !target || !register_operand (target, tmode))
37305 target = gen_reg_rtx (tmode);
37306
37307 ix86_expand_vector_extract (true, target, op0, elt);
37308
37309 return target;
37310 }
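/* Sketch of a caller, as the wrapper typically appears in emmintrin.h:
     _mm_extract_epi16 (__m128i __A, int __N)
       => (unsigned short) __builtin_ia32_vec_ext_v8hi ((__v8hi) __A, __N);
   __N must be a constant; get_element_number above checks it against
   the eight HImode subparts.  */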
37311
37312 /* A subroutine of ix86_expand_builtin. These builtins are a wrapper around
37313 ix86_expand_vector_set. They would be redundant (for non-MMX) if we had
37314 a language-level syntax for referencing vector elements. */
37315
37316 static rtx
37317 ix86_expand_vec_set_builtin (tree exp)
37318 {
37319 machine_mode tmode, mode1;
37320 tree arg0, arg1, arg2;
37321 int elt;
37322 rtx op0, op1, target;
37323
37324 arg0 = CALL_EXPR_ARG (exp, 0);
37325 arg1 = CALL_EXPR_ARG (exp, 1);
37326 arg2 = CALL_EXPR_ARG (exp, 2);
37327
37328 tmode = TYPE_MODE (TREE_TYPE (arg0));
37329 mode1 = TYPE_MODE (TREE_TYPE (TREE_TYPE (arg0)));
37330 gcc_assert (VECTOR_MODE_P (tmode));
37331
37332 op0 = expand_expr (arg0, NULL_RTX, tmode, EXPAND_NORMAL);
37333 op1 = expand_expr (arg1, NULL_RTX, mode1, EXPAND_NORMAL);
37334 elt = get_element_number (TREE_TYPE (arg0), arg2);
37335
37336 if (GET_MODE (op1) != mode1 && GET_MODE (op1) != VOIDmode)
37337 op1 = convert_modes (mode1, GET_MODE (op1), op1, true);
37338
37339 op0 = force_reg (tmode, op0);
37340 op1 = force_reg (mode1, op1);
37341
37342 /* OP0 is the source of these builtin functions and shouldn't be
37343 modified. Create a copy, use it and return it as target. */
37344 target = gen_reg_rtx (tmode);
37345 emit_move_insn (target, op0);
37346 ix86_expand_vector_set (true, target, op1, elt);
37347
37348 return target;
37349 }
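/* Sketch of a caller, as the wrapper typically appears in emmintrin.h:
     _mm_insert_epi16 (__m128i __A, int __D, int __N)
       => (__m128i) __builtin_ia32_vec_set_v8hi ((__v8hi) __A, __D, __N);
   The copy made above keeps __A itself unmodified.  */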
37350
37351 /* Emit conditional move of SRC to DST with condition
37352 OP1 CODE OP2. */
37353 static void
37354 ix86_emit_cmove (rtx dst, rtx src, enum rtx_code code, rtx op1, rtx op2)
37355 {
37356 rtx t;
37357
37358 if (TARGET_CMOVE)
37359 {
37360 t = ix86_expand_compare (code, op1, op2);
37361 emit_insn (gen_rtx_SET (dst, gen_rtx_IF_THEN_ELSE (GET_MODE (dst), t,
37362 src, dst)));
37363 }
37364 else
37365 {
37366 rtx_code_label *nomove = gen_label_rtx ();
37367 emit_cmp_and_jump_insns (op1, op2, reverse_condition (code),
37368 const0_rtx, GET_MODE (op1), 1, nomove);
37369 emit_move_insn (dst, src);
37370 emit_label (nomove);
37371 }
37372 }
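/* In C terms this is roughly
     if (op1 CODE op2) dst = src;
   emitted either as a conditional move (TARGET_CMOVE) or as a compare,
   a jump over the move on the reversed condition, and the move.  */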
37373
37374 /* Choose the max of DST and SRC and put it into DST. */
37375 static void
37376 ix86_emit_move_max (rtx dst, rtx src)
37377 {
37378 ix86_emit_cmove (dst, src, LTU, dst, src);
37379 }
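/* I.e. dst = MAX (dst, src) on unsigned values: the conditional move
   replaces DST by SRC exactly when DST <u SRC.  The MPX code below
   relies on this for both bound halves, since the upper bound is kept
   in one's complement form.  */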
37380
37381 /* Expand an expression EXP that calls a built-in function,
37382 with result going to TARGET if that's convenient
37383 (and in mode MODE if that's convenient).
37384 SUBTARGET may be used as the target for computing one of EXP's operands.
37385 IGNORE is nonzero if the value is to be ignored. */
37386
37387 static rtx
37388 ix86_expand_builtin (tree exp, rtx target, rtx subtarget,
37389 machine_mode mode, int ignore)
37390 {
37391 size_t i;
37392 enum insn_code icode;
37393 tree fndecl = TREE_OPERAND (CALL_EXPR_FN (exp), 0);
37394 tree arg0, arg1, arg2, arg3, arg4;
37395 rtx op0, op1, op2, op3, op4, pat, insn;
37396 machine_mode mode0, mode1, mode2, mode3, mode4;
37397 unsigned int fcode = DECL_FUNCTION_CODE (fndecl);
37398
37399 /* For CPU builtins that can be folded, fold first and expand the fold. */
37400 switch (fcode)
37401 {
37402 case IX86_BUILTIN_CPU_INIT:
37403 {
37404 /* Make it call __cpu_indicator_init in libgcc. */
37405 tree call_expr, fndecl, type;
37406 type = build_function_type_list (integer_type_node, NULL_TREE);
37407 fndecl = build_fn_decl ("__cpu_indicator_init", type);
37408 call_expr = build_call_expr (fndecl, 0);
37409 return expand_expr (call_expr, target, mode, EXPAND_NORMAL);
37410 }
37411 case IX86_BUILTIN_CPU_IS:
37412 case IX86_BUILTIN_CPU_SUPPORTS:
37413 {
37414 tree arg0 = CALL_EXPR_ARG (exp, 0);
37415 tree fold_expr = fold_builtin_cpu (fndecl, &arg0);
37416 gcc_assert (fold_expr != NULL_TREE);
37417 return expand_expr (fold_expr, target, mode, EXPAND_NORMAL);
37418 }
37419 }
37420
37421 /* Determine whether the builtin function is available under the current ISA.
37422 Originally the builtin was not created if it wasn't applicable to the
37423 current ISA based on the command line switches. With function specific
37424 options, we need to check in the context of the function making the call
37425 whether it is supported. Treat AVX512VL specially. For other flags,
37426 if isa includes more than one ISA bit, treat those as requiring any
37427 of them. For AVX512VL, require both AVX512VL and the non-AVX512VL
37428 ISAs. Similarly for 64BIT, but we shouldn't be building such builtins
37429 at all, -m64 is a whole TU option. */
37430 if (((ix86_builtins_isa[fcode].isa
37431 & ~(OPTION_MASK_ISA_AVX512VL | OPTION_MASK_ISA_64BIT))
37432 && !(ix86_builtins_isa[fcode].isa
37433 & ~(OPTION_MASK_ISA_AVX512VL | OPTION_MASK_ISA_64BIT)
37434 & ix86_isa_flags))
37435 || ((ix86_builtins_isa[fcode].isa & OPTION_MASK_ISA_AVX512VL)
37436 && !(ix86_isa_flags & OPTION_MASK_ISA_AVX512VL))
37437 || (ix86_builtins_isa[fcode].isa2
37438 && !(ix86_builtins_isa[fcode].isa2 & ix86_isa_flags2)))
37439 {
37440 char *opts = ix86_target_string (ix86_builtins_isa[fcode].isa,
37441 ix86_builtins_isa[fcode].isa2, 0, 0,
37442 NULL, NULL, (enum fpmath_unit) 0,
37443 false);
37444 if (!opts)
37445 error ("%qE needs unknown isa option", fndecl);
37446 else
37447 {
37448 gcc_assert (opts != NULL);
37449 error ("%qE needs isa option %s", fndecl, opts);
37450 free (opts);
37451 }
37452 return expand_call (exp, target, ignore);
37453 }
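/* For example, a function compiled without AVX2 enabled (and without a
   target("avx2") attribute) that calls an AVX2-only builtin is
   diagnosed here with the "needs isa option" error, and the call is
   then expanded as an ordinary external call so that compilation can
   continue (a sketch of the behaviour).  */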
37454
37455 switch (fcode)
37456 {
37457 case IX86_BUILTIN_BNDMK:
37458 if (!target
37459 || GET_MODE (target) != BNDmode
37460 || !register_operand (target, BNDmode))
37461 target = gen_reg_rtx (BNDmode);
37462
37463 arg0 = CALL_EXPR_ARG (exp, 0);
37464 arg1 = CALL_EXPR_ARG (exp, 1);
37465
37466 op0 = expand_normal (arg0);
37467 op1 = expand_normal (arg1);
37468
37469 if (!register_operand (op0, Pmode))
37470 op0 = ix86_zero_extend_to_Pmode (op0);
37471 if (!register_operand (op1, Pmode))
37472 op1 = ix86_zero_extend_to_Pmode (op1);
37473
37474 /* Builtin arg1 is the size of the block, but instruction op1 should
37475 be (size - 1). */
37476 op1 = expand_simple_binop (Pmode, PLUS, op1, constm1_rtx,
37477 NULL_RTX, 1, OPTAB_DIRECT);
37478
37479 emit_insn (BNDmode == BND64mode
37480 ? gen_bnd64_mk (target, op0, op1)
37481 : gen_bnd32_mk (target, op0, op1));
37482 return target;
37483
37484 case IX86_BUILTIN_BNDSTX:
37485 arg0 = CALL_EXPR_ARG (exp, 0);
37486 arg1 = CALL_EXPR_ARG (exp, 1);
37487 arg2 = CALL_EXPR_ARG (exp, 2);
37488
37489 op0 = expand_normal (arg0);
37490 op1 = expand_normal (arg1);
37491 op2 = expand_normal (arg2);
37492
37493 if (!register_operand (op0, Pmode))
37494 op0 = ix86_zero_extend_to_Pmode (op0);
37495 if (!register_operand (op1, BNDmode))
37496 op1 = copy_to_mode_reg (BNDmode, op1);
37497 if (!register_operand (op2, Pmode))
37498 op2 = ix86_zero_extend_to_Pmode (op2);
37499
37500 emit_insn (BNDmode == BND64mode
37501 ? gen_bnd64_stx (op2, op0, op1)
37502 : gen_bnd32_stx (op2, op0, op1));
37503 return 0;
37504
37505 case IX86_BUILTIN_BNDLDX:
37506 if (!target
37507 || GET_MODE (target) != BNDmode
37508 || !register_operand (target, BNDmode))
37509 target = gen_reg_rtx (BNDmode);
37510
37511 arg0 = CALL_EXPR_ARG (exp, 0);
37512 arg1 = CALL_EXPR_ARG (exp, 1);
37513
37514 op0 = expand_normal (arg0);
37515 op1 = expand_normal (arg1);
37516
37517 if (!register_operand (op0, Pmode))
37518 op0 = ix86_zero_extend_to_Pmode (op0);
37519 if (!register_operand (op1, Pmode))
37520 op1 = ix86_zero_extend_to_Pmode (op1);
37521
37522 emit_insn (BNDmode == BND64mode
37523 ? gen_bnd64_ldx (target, op0, op1)
37524 : gen_bnd32_ldx (target, op0, op1));
37525 return target;
37526
37527 case IX86_BUILTIN_BNDCL:
37528 arg0 = CALL_EXPR_ARG (exp, 0);
37529 arg1 = CALL_EXPR_ARG (exp, 1);
37530
37531 op0 = expand_normal (arg0);
37532 op1 = expand_normal (arg1);
37533
37534 if (!register_operand (op0, Pmode))
37535 op0 = ix86_zero_extend_to_Pmode (op0);
37536 if (!register_operand (op1, BNDmode))
37537 op1 = copy_to_mode_reg (BNDmode, op1);
37538
37539 emit_insn (BNDmode == BND64mode
37540 ? gen_bnd64_cl (op1, op0)
37541 : gen_bnd32_cl (op1, op0));
37542 return 0;
37543
37544 case IX86_BUILTIN_BNDCU:
37545 arg0 = CALL_EXPR_ARG (exp, 0);
37546 arg1 = CALL_EXPR_ARG (exp, 1);
37547
37548 op0 = expand_normal (arg0);
37549 op1 = expand_normal (arg1);
37550
37551 if (!register_operand (op0, Pmode))
37552 op0 = ix86_zero_extend_to_Pmode (op0);
37553 if (!register_operand (op1, BNDmode))
37554 op1 = copy_to_mode_reg (BNDmode, op1);
37555
37556 emit_insn (BNDmode == BND64mode
37557 ? gen_bnd64_cu (op1, op0)
37558 : gen_bnd32_cu (op1, op0));
37559 return 0;
37560
37561 case IX86_BUILTIN_BNDRET:
37562 arg0 = CALL_EXPR_ARG (exp, 0);
37563 target = chkp_get_rtl_bounds (arg0);
37564
37565 /* If no bounds were specified for the returned value,
37566 then use INIT bounds. This usually happens when
37567 some built-in function is expanded. */
37568 if (!target)
37569 {
37570 rtx t1 = gen_reg_rtx (Pmode);
37571 rtx t2 = gen_reg_rtx (Pmode);
37572 target = gen_reg_rtx (BNDmode);
37573 emit_move_insn (t1, const0_rtx);
37574 emit_move_insn (t2, constm1_rtx);
37575 emit_insn (BNDmode == BND64mode
37576 ? gen_bnd64_mk (target, t1, t2)
37577 : gen_bnd32_mk (target, t1, t2));
37578 }
37579
37580 gcc_assert (target && REG_P (target));
37581 return target;
37582
37583 case IX86_BUILTIN_BNDNARROW:
37584 {
37585 rtx m1, m1h1, m1h2, lb, ub, t1;
37586
37587 /* Return value and lb. */
37588 arg0 = CALL_EXPR_ARG (exp, 0);
37589 /* Bounds. */
37590 arg1 = CALL_EXPR_ARG (exp, 1);
37591 /* Size. */
37592 arg2 = CALL_EXPR_ARG (exp, 2);
37593
37594 lb = expand_normal (arg0);
37595 op1 = expand_normal (arg1);
37596 op2 = expand_normal (arg2);
37597
37598 /* Size was passed but we need to use (size - 1) as for bndmk. */
37599 op2 = expand_simple_binop (Pmode, PLUS, op2, constm1_rtx,
37600 NULL_RTX, 1, OPTAB_DIRECT);
37601
37602 /* Add LB to size and invert to get UB. */
37603 op2 = expand_simple_binop (Pmode, PLUS, op2, lb,
37604 op2, 1, OPTAB_DIRECT);
37605 ub = expand_simple_unop (Pmode, NOT, op2, op2, 1);
37606
37607 if (!register_operand (lb, Pmode))
37608 lb = ix86_zero_extend_to_Pmode (lb);
37609 if (!register_operand (ub, Pmode))
37610 ub = ix86_zero_extend_to_Pmode (ub);
37611
37612 /* We need to move bounds to memory before any computations. */
37613 if (MEM_P (op1))
37614 m1 = op1;
37615 else
37616 {
37617 m1 = assign_386_stack_local (BNDmode, SLOT_TEMP);
37618 emit_move_insn (m1, op1);
37619 }
37620
37621 /* Generate mem expression to be used for access to LB and UB. */
37622 m1h1 = adjust_address (m1, Pmode, 0);
37623 m1h2 = adjust_address (m1, Pmode, GET_MODE_SIZE (Pmode));
37624
37625 t1 = gen_reg_rtx (Pmode);
37626
37627 /* Compute LB. */
37628 emit_move_insn (t1, m1h1);
37629 ix86_emit_move_max (t1, lb);
37630 emit_move_insn (m1h1, t1);
37631
37632 /* Compute UB. UB is stored in 1's complement form. Therefore
37633 we also use max here. */
37634 emit_move_insn (t1, m1h2);
37635 ix86_emit_move_max (t1, ub);
37636 emit_move_insn (m1h2, t1);
37637
37638 op2 = gen_reg_rtx (BNDmode);
37639 emit_move_insn (op2, m1);
37640
37641 return chkp_join_splitted_slot (lb, op2);
37642 }
37643
37644 case IX86_BUILTIN_BNDINT:
37645 {
37646 rtx res, rh1, rh2, lb1, lb2, ub1, ub2;
37647
37648 if (!target
37649 || GET_MODE (target) != BNDmode
37650 || !register_operand (target, BNDmode))
37651 target = gen_reg_rtx (BNDmode);
37652
37653 arg0 = CALL_EXPR_ARG (exp, 0);
37654 arg1 = CALL_EXPR_ARG (exp, 1);
37655
37656 op0 = expand_normal (arg0);
37657 op1 = expand_normal (arg1);
37658
37659 res = assign_386_stack_local (BNDmode, SLOT_TEMP);
37660 rh1 = adjust_address (res, Pmode, 0);
37661 rh2 = adjust_address (res, Pmode, GET_MODE_SIZE (Pmode));
37662
37663 /* Put the first bounds into temporaries. */
37664 lb1 = gen_reg_rtx (Pmode);
37665 ub1 = gen_reg_rtx (Pmode);
37666 if (MEM_P (op0))
37667 {
37668 emit_move_insn (lb1, adjust_address (op0, Pmode, 0));
37669 emit_move_insn (ub1, adjust_address (op0, Pmode,
37670 GET_MODE_SIZE (Pmode)));
37671 }
37672 else
37673 {
37674 emit_move_insn (res, op0);
37675 emit_move_insn (lb1, rh1);
37676 emit_move_insn (ub1, rh2);
37677 }
37678
37679 /* Put the second bounds into temporaries. */
37680 lb2 = gen_reg_rtx (Pmode);
37681 ub2 = gen_reg_rtx (Pmode);
37682 if (MEM_P (op1))
37683 {
37684 emit_move_insn (lb2, adjust_address (op1, Pmode, 0));
37685 emit_move_insn (ub2, adjust_address (op1, Pmode,
37686 GET_MODE_SIZE (Pmode)));
37687 }
37688 else
37689 {
37690 emit_move_insn (res, op1);
37691 emit_move_insn (lb2, rh1);
37692 emit_move_insn (ub2, rh2);
37693 }
37694
37695 /* Compute LB. */
37696 ix86_emit_move_max (lb1, lb2);
37697 emit_move_insn (rh1, lb1);
37698
37699 /* Compute UB. UB is stored in 1's complement form. Therefore
37700 we also use max here. */
37701 ix86_emit_move_max (ub1, ub2);
37702 emit_move_insn (rh2, ub1);
37703
37704 emit_move_insn (target, res);
37705
37706 return target;
37707 }
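/* Reasoning sketch for using max on both halves above: the intersection
   of [lb1, ub1] and [lb2, ub2] is [MAX (lb1, lb2), MIN (ub1, ub2)], and
   since the upper bounds are stored as ~ub, MIN (ub1, ub2) corresponds
   to MAX (~ub1, ~ub2), so ix86_emit_move_max handles the lower and the
   upper half alike.  */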
37708
37709 case IX86_BUILTIN_SIZEOF:
37710 {
37711 tree name;
37712 rtx symbol;
37713
37714 if (!target
37715 || GET_MODE (target) != Pmode
37716 || !register_operand (target, Pmode))
37717 target = gen_reg_rtx (Pmode);
37718
37719 arg0 = CALL_EXPR_ARG (exp, 0);
37720 gcc_assert (VAR_P (arg0));
37721
37722 name = DECL_ASSEMBLER_NAME (arg0);
37723 symbol = gen_rtx_SYMBOL_REF (Pmode, IDENTIFIER_POINTER (name));
37724
37725 emit_insn (Pmode == SImode
37726 ? gen_move_size_reloc_si (target, symbol)
37727 : gen_move_size_reloc_di (target, symbol));
37728
37729 return target;
37730 }
37731
37732 case IX86_BUILTIN_BNDLOWER:
37733 {
37734 rtx mem, hmem;
37735
37736 if (!target
37737 || GET_MODE (target) != Pmode
37738 || !register_operand (target, Pmode))
37739 target = gen_reg_rtx (Pmode);
37740
37741 arg0 = CALL_EXPR_ARG (exp, 0);
37742 op0 = expand_normal (arg0);
37743
37744 /* We need to move bounds to memory first. */
37745 if (MEM_P (op0))
37746 mem = op0;
37747 else
37748 {
37749 mem = assign_386_stack_local (BNDmode, SLOT_TEMP);
37750 emit_move_insn (mem, op0);
37751 }
37752
37753 /* Generate mem expression to access LB and load it. */
37754 hmem = adjust_address (mem, Pmode, 0);
37755 emit_move_insn (target, hmem);
37756
37757 return target;
37758 }
37759
37760 case IX86_BUILTIN_BNDUPPER:
37761 {
37762 rtx mem, hmem, res;
37763
37764 if (!target
37765 || GET_MODE (target) != Pmode
37766 || !register_operand (target, Pmode))
37767 target = gen_reg_rtx (Pmode);
37768
37769 arg0 = CALL_EXPR_ARG (exp, 0);
37770 op0 = expand_normal (arg0);
37771
37772 /* We need to move bounds to memory first. */
37773 if (MEM_P (op0))
37774 mem = op0;
37775 else
37776 {
37777 mem = assign_386_stack_local (BNDmode, SLOT_TEMP);
37778 emit_move_insn (mem, op0);
37779 }
37780
37781 /* Generate mem expression to access UB. */
37782 hmem = adjust_address (mem, Pmode, GET_MODE_SIZE (Pmode));
37783
37784 /* We need to invert all bits of UB. */
37785 res = expand_simple_unop (Pmode, NOT, hmem, target, 1);
37786
37787 if (res != target)
37788 emit_move_insn (target, res);
37789
37790 return target;
37791 }
37792
37793 case IX86_BUILTIN_MASKMOVQ:
37794 case IX86_BUILTIN_MASKMOVDQU:
37795 icode = (fcode == IX86_BUILTIN_MASKMOVQ
37796 ? CODE_FOR_mmx_maskmovq
37797 : CODE_FOR_sse2_maskmovdqu);
37798 /* Note the arg order is different from the operand order. */
37799 arg1 = CALL_EXPR_ARG (exp, 0);
37800 arg2 = CALL_EXPR_ARG (exp, 1);
37801 arg0 = CALL_EXPR_ARG (exp, 2);
37802 op0 = expand_normal (arg0);
37803 op1 = expand_normal (arg1);
37804 op2 = expand_normal (arg2);
37805 mode0 = insn_data[icode].operand[0].mode;
37806 mode1 = insn_data[icode].operand[1].mode;
37807 mode2 = insn_data[icode].operand[2].mode;
37808
37809 op0 = ix86_zero_extend_to_Pmode (op0);
37810 op0 = gen_rtx_MEM (mode1, op0);
37811
37812 if (!insn_data[icode].operand[0].predicate (op0, mode0))
37813 op0 = copy_to_mode_reg (mode0, op0);
37814 if (!insn_data[icode].operand[1].predicate (op1, mode1))
37815 op1 = copy_to_mode_reg (mode1, op1);
37816 if (!insn_data[icode].operand[2].predicate (op2, mode2))
37817 op2 = copy_to_mode_reg (mode2, op2);
37818 pat = GEN_FCN (icode) (op0, op1, op2);
37819 if (! pat)
37820 return 0;
37821 emit_insn (pat);
37822 return 0;
37823
37824 case IX86_BUILTIN_LDMXCSR:
37825 op0 = expand_normal (CALL_EXPR_ARG (exp, 0));
37826 target = assign_386_stack_local (SImode, SLOT_TEMP);
37827 emit_move_insn (target, op0);
37828 emit_insn (gen_sse_ldmxcsr (target));
37829 return 0;
37830
37831 case IX86_BUILTIN_STMXCSR:
37832 target = assign_386_stack_local (SImode, SLOT_TEMP);
37833 emit_insn (gen_sse_stmxcsr (target));
37834 return copy_to_mode_reg (SImode, target);
37835
37836 case IX86_BUILTIN_CLFLUSH:
37837 arg0 = CALL_EXPR_ARG (exp, 0);
37838 op0 = expand_normal (arg0);
37839 icode = CODE_FOR_sse2_clflush;
37840 if (!insn_data[icode].operand[0].predicate (op0, Pmode))
37841 op0 = ix86_zero_extend_to_Pmode (op0);
37842
37843 emit_insn (gen_sse2_clflush (op0));
37844 return 0;
37845
37846 case IX86_BUILTIN_CLWB:
37847 arg0 = CALL_EXPR_ARG (exp, 0);
37848 op0 = expand_normal (arg0);
37849 icode = CODE_FOR_clwb;
37850 if (!insn_data[icode].operand[0].predicate (op0, Pmode))
37851 op0 = ix86_zero_extend_to_Pmode (op0);
37852
37853 emit_insn (gen_clwb (op0));
37854 return 0;
37855
37856 case IX86_BUILTIN_CLFLUSHOPT:
37857 arg0 = CALL_EXPR_ARG (exp, 0);
37858 op0 = expand_normal (arg0);
37859 icode = CODE_FOR_clflushopt;
37860 if (!insn_data[icode].operand[0].predicate (op0, Pmode))
37861 op0 = ix86_zero_extend_to_Pmode (op0);
37862
37863 emit_insn (gen_clflushopt (op0));
37864 return 0;
37865
37866 case IX86_BUILTIN_MONITOR:
37867 case IX86_BUILTIN_MONITORX:
37868 arg0 = CALL_EXPR_ARG (exp, 0);
37869 arg1 = CALL_EXPR_ARG (exp, 1);
37870 arg2 = CALL_EXPR_ARG (exp, 2);
37871 op0 = expand_normal (arg0);
37872 op1 = expand_normal (arg1);
37873 op2 = expand_normal (arg2);
37874 if (!REG_P (op0))
37875 op0 = ix86_zero_extend_to_Pmode (op0);
37876 if (!REG_P (op1))
37877 op1 = copy_to_mode_reg (SImode, op1);
37878 if (!REG_P (op2))
37879 op2 = copy_to_mode_reg (SImode, op2);
37880
37881 emit_insn (fcode == IX86_BUILTIN_MONITOR
37882 ? ix86_gen_monitor (op0, op1, op2)
37883 : ix86_gen_monitorx (op0, op1, op2));
37884 return 0;
37885
37886 case IX86_BUILTIN_MWAIT:
37887 arg0 = CALL_EXPR_ARG (exp, 0);
37888 arg1 = CALL_EXPR_ARG (exp, 1);
37889 op0 = expand_normal (arg0);
37890 op1 = expand_normal (arg1);
37891 if (!REG_P (op0))
37892 op0 = copy_to_mode_reg (SImode, op0);
37893 if (!REG_P (op1))
37894 op1 = copy_to_mode_reg (SImode, op1);
37895 emit_insn (gen_sse3_mwait (op0, op1));
37896 return 0;
37897
37898 case IX86_BUILTIN_MWAITX:
37899 arg0 = CALL_EXPR_ARG (exp, 0);
37900 arg1 = CALL_EXPR_ARG (exp, 1);
37901 arg2 = CALL_EXPR_ARG (exp, 2);
37902 op0 = expand_normal (arg0);
37903 op1 = expand_normal (arg1);
37904 op2 = expand_normal (arg2);
37905 if (!REG_P (op0))
37906 op0 = copy_to_mode_reg (SImode, op0);
37907 if (!REG_P (op1))
37908 op1 = copy_to_mode_reg (SImode, op1);
37909 if (!REG_P (op2))
37910 op2 = copy_to_mode_reg (SImode, op2);
37911 emit_insn (gen_mwaitx (op0, op1, op2));
37912 return 0;
37913
37914 case IX86_BUILTIN_CLZERO:
37915 arg0 = CALL_EXPR_ARG (exp, 0);
37916 op0 = expand_normal (arg0);
37917 if (!REG_P (op0))
37918 op0 = ix86_zero_extend_to_Pmode (op0);
37919 emit_insn (ix86_gen_clzero (op0));
37920 return 0;
37921
37922 case IX86_BUILTIN_VEC_INIT_V2SI:
37923 case IX86_BUILTIN_VEC_INIT_V4HI:
37924 case IX86_BUILTIN_VEC_INIT_V8QI:
37925 return ix86_expand_vec_init_builtin (TREE_TYPE (exp), exp, target);
37926
37927 case IX86_BUILTIN_VEC_EXT_V2DF:
37928 case IX86_BUILTIN_VEC_EXT_V2DI:
37929 case IX86_BUILTIN_VEC_EXT_V4SF:
37930 case IX86_BUILTIN_VEC_EXT_V4SI:
37931 case IX86_BUILTIN_VEC_EXT_V8HI:
37932 case IX86_BUILTIN_VEC_EXT_V2SI:
37933 case IX86_BUILTIN_VEC_EXT_V4HI:
37934 case IX86_BUILTIN_VEC_EXT_V16QI:
37935 return ix86_expand_vec_ext_builtin (exp, target);
37936
37937 case IX86_BUILTIN_VEC_SET_V2DI:
37938 case IX86_BUILTIN_VEC_SET_V4SF:
37939 case IX86_BUILTIN_VEC_SET_V4SI:
37940 case IX86_BUILTIN_VEC_SET_V8HI:
37941 case IX86_BUILTIN_VEC_SET_V4HI:
37942 case IX86_BUILTIN_VEC_SET_V16QI:
37943 return ix86_expand_vec_set_builtin (exp);
37944
37945 case IX86_BUILTIN_NANQ:
37946 case IX86_BUILTIN_NANSQ:
37947 return expand_call (exp, target, ignore);
37948
37949 case IX86_BUILTIN_RDPMC:
37950 case IX86_BUILTIN_RDTSC:
37951 case IX86_BUILTIN_RDTSCP:
37952 case IX86_BUILTIN_XGETBV:
37953
37954 op0 = gen_reg_rtx (DImode);
37955 op1 = gen_reg_rtx (DImode);
37956
37957 if (fcode == IX86_BUILTIN_RDPMC)
37958 {
37959 arg0 = CALL_EXPR_ARG (exp, 0);
37960 op2 = expand_normal (arg0);
37961 if (!register_operand (op2, SImode))
37962 op2 = copy_to_mode_reg (SImode, op2);
37963
37964 insn = (TARGET_64BIT
37965 ? gen_rdpmc_rex64 (op0, op1, op2)
37966 : gen_rdpmc (op0, op2));
37967 emit_insn (insn);
37968 }
37969 else if (fcode == IX86_BUILTIN_XGETBV)
37970 {
37971 arg0 = CALL_EXPR_ARG (exp, 0);
37972 op2 = expand_normal (arg0);
37973 if (!register_operand (op2, SImode))
37974 op2 = copy_to_mode_reg (SImode, op2);
37975
37976 insn = (TARGET_64BIT
37977 ? gen_xgetbv_rex64 (op0, op1, op2)
37978 : gen_xgetbv (op0, op2));
37979 emit_insn (insn);
37980 }
37981 else if (fcode == IX86_BUILTIN_RDTSC)
37982 {
37983 insn = (TARGET_64BIT
37984 ? gen_rdtsc_rex64 (op0, op1)
37985 : gen_rdtsc (op0));
37986 emit_insn (insn);
37987 }
37988 else
37989 {
37990 op2 = gen_reg_rtx (SImode);
37991
37992 insn = (TARGET_64BIT
37993 ? gen_rdtscp_rex64 (op0, op1, op2)
37994 : gen_rdtscp (op0, op2));
37995 emit_insn (insn);
37996
37997 arg0 = CALL_EXPR_ARG (exp, 0);
37998 op4 = expand_normal (arg0);
37999 if (!address_operand (op4, VOIDmode))
38000 {
38001 op4 = convert_memory_address (Pmode, op4);
38002 op4 = copy_addr_to_reg (op4);
38003 }
38004 emit_move_insn (gen_rtx_MEM (SImode, op4), op2);
38005 }
38006
38007 if (target == 0)
38008 {
38009 /* mode is VOIDmode if __builtin_rd* has been called
38010 without an lhs. */
38011 if (mode == VOIDmode)
38012 return target;
38013 target = gen_reg_rtx (mode);
38014 }
38015
38016 if (TARGET_64BIT)
38017 {
38018 op1 = expand_simple_binop (DImode, ASHIFT, op1, GEN_INT (32),
38019 op1, 1, OPTAB_DIRECT);
38020 op0 = expand_simple_binop (DImode, IOR, op0, op1,
38021 op0, 1, OPTAB_DIRECT);
38022 }
38023
38024 emit_move_insn (target, op0);
38025 return target;
38026
38027 case IX86_BUILTIN_FXSAVE:
38028 case IX86_BUILTIN_FXRSTOR:
38029 case IX86_BUILTIN_FXSAVE64:
38030 case IX86_BUILTIN_FXRSTOR64:
38031 case IX86_BUILTIN_FNSTENV:
38032 case IX86_BUILTIN_FLDENV:
38033 mode0 = BLKmode;
38034 switch (fcode)
38035 {
38036 case IX86_BUILTIN_FXSAVE:
38037 icode = CODE_FOR_fxsave;
38038 break;
38039 case IX86_BUILTIN_FXRSTOR:
38040 icode = CODE_FOR_fxrstor;
38041 break;
38042 case IX86_BUILTIN_FXSAVE64:
38043 icode = CODE_FOR_fxsave64;
38044 break;
38045 case IX86_BUILTIN_FXRSTOR64:
38046 icode = CODE_FOR_fxrstor64;
38047 break;
38048 case IX86_BUILTIN_FNSTENV:
38049 icode = CODE_FOR_fnstenv;
38050 break;
38051 case IX86_BUILTIN_FLDENV:
38052 icode = CODE_FOR_fldenv;
38053 break;
38054 default:
38055 gcc_unreachable ();
38056 }
38057
38058 arg0 = CALL_EXPR_ARG (exp, 0);
38059 op0 = expand_normal (arg0);
38060
38061 if (!address_operand (op0, VOIDmode))
38062 {
38063 op0 = convert_memory_address (Pmode, op0);
38064 op0 = copy_addr_to_reg (op0);
38065 }
38066 op0 = gen_rtx_MEM (mode0, op0);
38067
38068 pat = GEN_FCN (icode) (op0);
38069 if (pat)
38070 emit_insn (pat);
38071 return 0;
38072
38073 case IX86_BUILTIN_XSETBV:
38074 arg0 = CALL_EXPR_ARG (exp, 0);
38075 arg1 = CALL_EXPR_ARG (exp, 1);
38076 op0 = expand_normal (arg0);
38077 op1 = expand_normal (arg1);
38078
38079 if (!REG_P (op0))
38080 op0 = copy_to_mode_reg (SImode, op0);
38081
38082 if (TARGET_64BIT)
38083 {
38084 op2 = expand_simple_binop (DImode, LSHIFTRT, op1, GEN_INT (32),
38085 NULL, 1, OPTAB_DIRECT);
38086
38087 op2 = gen_lowpart (SImode, op2);
38088 op1 = gen_lowpart (SImode, op1);
38089 if (!REG_P (op1))
38090 op1 = copy_to_mode_reg (SImode, op1);
38091 if (!REG_P (op2))
38092 op2 = copy_to_mode_reg (SImode, op2);
38093 icode = CODE_FOR_xsetbv_rex64;
38094 pat = GEN_FCN (icode) (op0, op1, op2);
38095 }
38096 else
38097 {
38098 if (!REG_P (op1))
38099 op1 = copy_to_mode_reg (DImode, op1);
38100 icode = CODE_FOR_xsetbv;
38101 pat = GEN_FCN (icode) (op0, op1);
38102 }
38103 if (pat)
38104 emit_insn (pat);
38105 return 0;
38106
38107 case IX86_BUILTIN_XSAVE:
38108 case IX86_BUILTIN_XRSTOR:
38109 case IX86_BUILTIN_XSAVE64:
38110 case IX86_BUILTIN_XRSTOR64:
38111 case IX86_BUILTIN_XSAVEOPT:
38112 case IX86_BUILTIN_XSAVEOPT64:
38113 case IX86_BUILTIN_XSAVES:
38114 case IX86_BUILTIN_XRSTORS:
38115 case IX86_BUILTIN_XSAVES64:
38116 case IX86_BUILTIN_XRSTORS64:
38117 case IX86_BUILTIN_XSAVEC:
38118 case IX86_BUILTIN_XSAVEC64:
38119 arg0 = CALL_EXPR_ARG (exp, 0);
38120 arg1 = CALL_EXPR_ARG (exp, 1);
38121 op0 = expand_normal (arg0);
38122 op1 = expand_normal (arg1);
38123
38124 if (!address_operand (op0, VOIDmode))
38125 {
38126 op0 = convert_memory_address (Pmode, op0);
38127 op0 = copy_addr_to_reg (op0);
38128 }
38129 op0 = gen_rtx_MEM (BLKmode, op0);
38130
38131 op1 = force_reg (DImode, op1);
38132
38133 if (TARGET_64BIT)
38134 {
38135 op2 = expand_simple_binop (DImode, LSHIFTRT, op1, GEN_INT (32),
38136 NULL, 1, OPTAB_DIRECT);
38137 switch (fcode)
38138 {
38139 case IX86_BUILTIN_XSAVE:
38140 icode = CODE_FOR_xsave_rex64;
38141 break;
38142 case IX86_BUILTIN_XRSTOR:
38143 icode = CODE_FOR_xrstor_rex64;
38144 break;
38145 case IX86_BUILTIN_XSAVE64:
38146 icode = CODE_FOR_xsave64;
38147 break;
38148 case IX86_BUILTIN_XRSTOR64:
38149 icode = CODE_FOR_xrstor64;
38150 break;
38151 case IX86_BUILTIN_XSAVEOPT:
38152 icode = CODE_FOR_xsaveopt_rex64;
38153 break;
38154 case IX86_BUILTIN_XSAVEOPT64:
38155 icode = CODE_FOR_xsaveopt64;
38156 break;
38157 case IX86_BUILTIN_XSAVES:
38158 icode = CODE_FOR_xsaves_rex64;
38159 break;
38160 case IX86_BUILTIN_XRSTORS:
38161 icode = CODE_FOR_xrstors_rex64;
38162 break;
38163 case IX86_BUILTIN_XSAVES64:
38164 icode = CODE_FOR_xsaves64;
38165 break;
38166 case IX86_BUILTIN_XRSTORS64:
38167 icode = CODE_FOR_xrstors64;
38168 break;
38169 case IX86_BUILTIN_XSAVEC:
38170 icode = CODE_FOR_xsavec_rex64;
38171 break;
38172 case IX86_BUILTIN_XSAVEC64:
38173 icode = CODE_FOR_xsavec64;
38174 break;
38175 default:
38176 gcc_unreachable ();
38177 }
38178
38179 op2 = gen_lowpart (SImode, op2);
38180 op1 = gen_lowpart (SImode, op1);
38181 pat = GEN_FCN (icode) (op0, op1, op2);
38182 }
38183 else
38184 {
38185 switch (fcode)
38186 {
38187 case IX86_BUILTIN_XSAVE:
38188 icode = CODE_FOR_xsave;
38189 break;
38190 case IX86_BUILTIN_XRSTOR:
38191 icode = CODE_FOR_xrstor;
38192 break;
38193 case IX86_BUILTIN_XSAVEOPT:
38194 icode = CODE_FOR_xsaveopt;
38195 break;
38196 case IX86_BUILTIN_XSAVES:
38197 icode = CODE_FOR_xsaves;
38198 break;
38199 case IX86_BUILTIN_XRSTORS:
38200 icode = CODE_FOR_xrstors;
38201 break;
38202 case IX86_BUILTIN_XSAVEC:
38203 icode = CODE_FOR_xsavec;
38204 break;
38205 default:
38206 gcc_unreachable ();
38207 }
38208 pat = GEN_FCN (icode) (op0, op1);
38209 }
38210
38211 if (pat)
38212 emit_insn (pat);
38213 return 0;
38214
38215 case IX86_BUILTIN_LLWPCB:
38216 arg0 = CALL_EXPR_ARG (exp, 0);
38217 op0 = expand_normal (arg0);
38218 icode = CODE_FOR_lwp_llwpcb;
38219 if (!insn_data[icode].operand[0].predicate (op0, Pmode))
38220 op0 = ix86_zero_extend_to_Pmode (op0);
38221 emit_insn (gen_lwp_llwpcb (op0));
38222 return 0;
38223
38224 case IX86_BUILTIN_SLWPCB:
38225 icode = CODE_FOR_lwp_slwpcb;
38226 if (!target
38227 || !insn_data[icode].operand[0].predicate (target, Pmode))
38228 target = gen_reg_rtx (Pmode);
38229 emit_insn (gen_lwp_slwpcb (target));
38230 return target;
38231
38232 case IX86_BUILTIN_BEXTRI32:
38233 case IX86_BUILTIN_BEXTRI64:
38234 arg0 = CALL_EXPR_ARG (exp, 0);
38235 arg1 = CALL_EXPR_ARG (exp, 1);
38236 op0 = expand_normal (arg0);
38237 op1 = expand_normal (arg1);
38238 icode = (fcode == IX86_BUILTIN_BEXTRI32
38239 ? CODE_FOR_tbm_bextri_si
38240 : CODE_FOR_tbm_bextri_di);
38241 if (!CONST_INT_P (op1))
38242 {
38243 error ("last argument must be an immediate");
38244 return const0_rtx;
38245 }
38246 else
38247 {
38248 unsigned char length = (INTVAL (op1) >> 8) & 0xFF;
38249 unsigned char lsb_index = INTVAL (op1) & 0xFF;
38250 op1 = GEN_INT (length);
38251 op2 = GEN_INT (lsb_index);
38252 pat = GEN_FCN (icode) (target, op0, op1, op2);
38253 if (pat)
38254 emit_insn (pat);
38255 return target;
38256 }
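/* The immediate packs the starting bit in bits 7:0 and the field length
   in bits 15:8, so e.g. (assuming the usual __bextri_u32 wrapper from
   tbmintrin.h)
     __builtin_ia32_bextri_u32 (x, (8 << 8) | 4)
   extracts an 8-bit field starting at bit 4.  */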
38257
38258 case IX86_BUILTIN_RDRAND16_STEP:
38259 icode = CODE_FOR_rdrandhi_1;
38260 mode0 = HImode;
38261 goto rdrand_step;
38262
38263 case IX86_BUILTIN_RDRAND32_STEP:
38264 icode = CODE_FOR_rdrandsi_1;
38265 mode0 = SImode;
38266 goto rdrand_step;
38267
38268 case IX86_BUILTIN_RDRAND64_STEP:
38269 icode = CODE_FOR_rdranddi_1;
38270 mode0 = DImode;
38271
38272 rdrand_step:
38273 arg0 = CALL_EXPR_ARG (exp, 0);
38274 op1 = expand_normal (arg0);
38275 if (!address_operand (op1, VOIDmode))
38276 {
38277 op1 = convert_memory_address (Pmode, op1);
38278 op1 = copy_addr_to_reg (op1);
38279 }
38280
38281 op0 = gen_reg_rtx (mode0);
38282 emit_insn (GEN_FCN (icode) (op0));
38283
38284 emit_move_insn (gen_rtx_MEM (mode0, op1), op0);
38285
38286 op1 = gen_reg_rtx (SImode);
38287 emit_move_insn (op1, CONST1_RTX (SImode));
38288
38289 /* Emit SImode conditional move. */
38290 if (mode0 == HImode)
38291 {
38292 if (TARGET_ZERO_EXTEND_WITH_AND
38293 && optimize_function_for_speed_p (cfun))
38294 {
38295 op2 = force_reg (SImode, const0_rtx);
38296
38297 emit_insn (gen_movstricthi
38298 (gen_lowpart (HImode, op2), op0));
38299 }
38300 else
38301 {
38302 op2 = gen_reg_rtx (SImode);
38303
38304 emit_insn (gen_zero_extendhisi2 (op2, op0));
38305 }
38306 }
38307 else if (mode0 == SImode)
38308 op2 = op0;
38309 else
38310 op2 = gen_rtx_SUBREG (SImode, op0, 0);
38311
38312 if (target == 0
38313 || !register_operand (target, SImode))
38314 target = gen_reg_rtx (SImode);
38315
38316 pat = gen_rtx_GEU (VOIDmode, gen_rtx_REG (CCCmode, FLAGS_REG),
38317 const0_rtx);
38318 emit_insn (gen_rtx_SET (target,
38319 gen_rtx_IF_THEN_ELSE (SImode, pat, op2, op1)));
38320 return target;
38321
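/* Usage sketch, matching the _rdrand32_step style wrappers in
   immintrin.h:
     unsigned int v;
     int ok = __builtin_ia32_rdrand32_step (&v);
   The random value is stored through the pointer and the return value
   is 1 (CF set) when the hardware delivered a value, 0 otherwise; the
   conditional move above materializes that flag as the SImode result.  */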
38322 case IX86_BUILTIN_RDSEED16_STEP:
38323 icode = CODE_FOR_rdseedhi_1;
38324 mode0 = HImode;
38325 goto rdseed_step;
38326
38327 case IX86_BUILTIN_RDSEED32_STEP:
38328 icode = CODE_FOR_rdseedsi_1;
38329 mode0 = SImode;
38330 goto rdseed_step;
38331
38332 case IX86_BUILTIN_RDSEED64_STEP:
38333 icode = CODE_FOR_rdseeddi_1;
38334 mode0 = DImode;
38335
38336 rdseed_step:
38337 arg0 = CALL_EXPR_ARG (exp, 0);
38338 op1 = expand_normal (arg0);
38339 if (!address_operand (op1, VOIDmode))
38340 {
38341 op1 = convert_memory_address (Pmode, op1);
38342 op1 = copy_addr_to_reg (op1);
38343 }
38344
38345 op0 = gen_reg_rtx (mode0);
38346 emit_insn (GEN_FCN (icode) (op0));
38347
38348 emit_move_insn (gen_rtx_MEM (mode0, op1), op0);
38349
38350 op2 = gen_reg_rtx (QImode);
38351
38352 pat = gen_rtx_LTU (QImode, gen_rtx_REG (CCCmode, FLAGS_REG),
38353 const0_rtx);
38354 emit_insn (gen_rtx_SET (op2, pat));
38355
38356 if (target == 0
38357 || !register_operand (target, SImode))
38358 target = gen_reg_rtx (SImode);
38359
38360 emit_insn (gen_zero_extendqisi2 (target, op2));
38361 return target;
38362
38363 case IX86_BUILTIN_SBB32:
38364 icode = CODE_FOR_subborrowsi;
38365 mode0 = SImode;
38366 goto handlecarry;
38367
38368 case IX86_BUILTIN_SBB64:
38369 icode = CODE_FOR_subborrowdi;
38370 mode0 = DImode;
38371 goto handlecarry;
38372
38373 case IX86_BUILTIN_ADDCARRYX32:
38374 icode = CODE_FOR_addcarrysi;
38375 mode0 = SImode;
38376 goto handlecarry;
38377
38378 case IX86_BUILTIN_ADDCARRYX64:
38379 icode = CODE_FOR_addcarrydi;
38380 mode0 = DImode;
38381
38382 handlecarry:
38383 arg0 = CALL_EXPR_ARG (exp, 0); /* unsigned char c_in. */
38384 arg1 = CALL_EXPR_ARG (exp, 1); /* unsigned int src1. */
38385 arg2 = CALL_EXPR_ARG (exp, 2); /* unsigned int src2. */
38386 arg3 = CALL_EXPR_ARG (exp, 3); /* unsigned int *sum_out. */
38387
38388 op1 = expand_normal (arg0);
38389 op1 = copy_to_mode_reg (QImode, convert_to_mode (QImode, op1, 1));
38390
38391 op2 = expand_normal (arg1);
38392 if (!register_operand (op2, mode0))
38393 op2 = copy_to_mode_reg (mode0, op2);
38394
38395 op3 = expand_normal (arg2);
38396 if (!register_operand (op3, mode0))
38397 op3 = copy_to_mode_reg (mode0, op3);
38398
38399 op4 = expand_normal (arg3);
38400 if (!address_operand (op4, VOIDmode))
38401 {
38402 op4 = convert_memory_address (Pmode, op4);
38403 op4 = copy_addr_to_reg (op4);
38404 }
38405
38406 /* Generate CF from input operand. */
38407 emit_insn (gen_addqi3_cconly_overflow (op1, constm1_rtx));
38408
38409 /* Generate instruction that consumes CF. */
38410 op0 = gen_reg_rtx (mode0);
38411
38412 op1 = gen_rtx_REG (CCCmode, FLAGS_REG);
38413 pat = gen_rtx_LTU (mode0, op1, const0_rtx);
38414 emit_insn (GEN_FCN (icode) (op0, op2, op3, op1, pat));
38415
38416 /* Return current CF value. */
38417 if (target == 0)
38418 target = gen_reg_rtx (QImode);
38419
38420 PUT_MODE (pat, QImode);
38421 emit_insn (gen_rtx_SET (target, pat));
38422
38423 /* Store the result. */
38424 emit_move_insn (gen_rtx_MEM (mode0, op4), op0);
38425
38426 return target;
38427
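/* Usage sketch, following the _addcarryx_u32 style wrappers in
   adxintrin.h:
     unsigned int sum;
     unsigned char c_out = __builtin_ia32_addcarryx_u32 (c_in, a, b, &sum);
   The incoming carry is regenerated by addqi3_cconly_overflow above, the
   add/sub insn consumes it, the sum is stored through the pointer and
   the outgoing carry flag is returned.  */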
38428 case IX86_BUILTIN_READ_FLAGS:
38429 emit_insn (gen_push (gen_rtx_REG (word_mode, FLAGS_REG)));
38430
38431 if (optimize
38432 || target == NULL_RTX
38433 || !nonimmediate_operand (target, word_mode)
38434 || GET_MODE (target) != word_mode)
38435 target = gen_reg_rtx (word_mode);
38436
38437 emit_insn (gen_pop (target));
38438 return target;
38439
38440 case IX86_BUILTIN_WRITE_FLAGS:
38441
38442 arg0 = CALL_EXPR_ARG (exp, 0);
38443 op0 = expand_normal (arg0);
38444 if (!general_no_elim_operand (op0, word_mode))
38445 op0 = copy_to_mode_reg (word_mode, op0);
38446
38447 emit_insn (gen_push (op0));
38448 emit_insn (gen_pop (gen_rtx_REG (word_mode, FLAGS_REG)));
38449 return 0;
38450
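/* These back the __readeflags/__writeeflags style wrappers: the flags
   register cannot be copied directly, so reading it is done by pushing
   it and popping into a general register, and writing by the reverse
   sequence, e.g. (a sketch)
     unsigned long long f = __builtin_ia32_readeflags_u64 ();
     __builtin_ia32_writeeflags_u64 (f);  */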
38451 case IX86_BUILTIN_KTESTC8:
38452 icode = CODE_FOR_ktestqi;
38453 mode3 = CCCmode;
38454 goto kortest;
38455
38456 case IX86_BUILTIN_KTESTZ8:
38457 icode = CODE_FOR_ktestqi;
38458 mode3 = CCZmode;
38459 goto kortest;
38460
38461 case IX86_BUILTIN_KTESTC16:
38462 icode = CODE_FOR_ktesthi;
38463 mode3 = CCCmode;
38464 goto kortest;
38465
38466 case IX86_BUILTIN_KTESTZ16:
38467 icode = CODE_FOR_ktesthi;
38468 mode3 = CCZmode;
38469 goto kortest;
38470
38471 case IX86_BUILTIN_KTESTC32:
38472 icode = CODE_FOR_ktestsi;
38473 mode3 = CCCmode;
38474 goto kortest;
38475
38476 case IX86_BUILTIN_KTESTZ32:
38477 icode = CODE_FOR_ktestsi;
38478 mode3 = CCZmode;
38479 goto kortest;
38480
38481 case IX86_BUILTIN_KTESTC64:
38482 icode = CODE_FOR_ktestdi;
38483 mode3 = CCCmode;
38484 goto kortest;
38485
38486 case IX86_BUILTIN_KTESTZ64:
38487 icode = CODE_FOR_ktestdi;
38488 mode3 = CCZmode;
38489 goto kortest;
38490
38491 case IX86_BUILTIN_KORTESTC8:
38492 icode = CODE_FOR_kortestqi;
38493 mode3 = CCCmode;
38494 goto kortest;
38495
38496 case IX86_BUILTIN_KORTESTZ8:
38497 icode = CODE_FOR_kortestqi;
38498 mode3 = CCZmode;
38499 goto kortest;
38500
38501 case IX86_BUILTIN_KORTESTC16:
38502 icode = CODE_FOR_kortesthi;
38503 mode3 = CCCmode;
38504 goto kortest;
38505
38506 case IX86_BUILTIN_KORTESTZ16:
38507 icode = CODE_FOR_kortesthi;
38508 mode3 = CCZmode;
38509 goto kortest;
38510
38511 case IX86_BUILTIN_KORTESTC32:
38512 icode = CODE_FOR_kortestsi;
38513 mode3 = CCCmode;
38514 goto kortest;
38515
38516 case IX86_BUILTIN_KORTESTZ32:
38517 icode = CODE_FOR_kortestsi;
38518 mode3 = CCZmode;
38519 goto kortest;
38520
38521 case IX86_BUILTIN_KORTESTC64:
38522 icode = CODE_FOR_kortestdi;
38523 mode3 = CCCmode;
38524 goto kortest;
38525
38526 case IX86_BUILTIN_KORTESTZ64:
38527 icode = CODE_FOR_kortestdi;
38528 mode3 = CCZmode;
38529
38530 kortest:
38531 arg0 = CALL_EXPR_ARG (exp, 0); /* Mask reg src1. */
38532 arg1 = CALL_EXPR_ARG (exp, 1); /* Mask reg src2. */
38533 op0 = expand_normal (arg0);
38534 op1 = expand_normal (arg1);
38535
38536 mode0 = insn_data[icode].operand[0].mode;
38537 mode1 = insn_data[icode].operand[1].mode;
38538
38539 if (GET_MODE (op0) != VOIDmode)
38540 op0 = force_reg (GET_MODE (op0), op0);
38541
38542 op0 = gen_lowpart (mode0, op0);
38543
38544 if (!insn_data[icode].operand[0].predicate (op0, mode0))
38545 op0 = copy_to_mode_reg (mode0, op0);
38546
38547 if (GET_MODE (op1) != VOIDmode)
38548 op1 = force_reg (GET_MODE (op1), op1);
38549
38550 op1 = gen_lowpart (mode1, op1);
38551
38552 if (!insn_data[icode].operand[1].predicate (op1, mode1))
38553 op1 = copy_to_mode_reg (mode1, op1);
38554
38555 target = gen_reg_rtx (QImode);
38556
38557 /* Emit kortest. */
38558 emit_insn (GEN_FCN (icode) (op0, op1));
38559 /* And use setcc to return result from flags. */
38560 ix86_expand_setcc (target, EQ,
38561 gen_rtx_REG (mode3, FLAGS_REG), const0_rtx);
38562 return target;
38563
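/* Sketch of the mapping: kortest sets ZF when the OR of the two masks
   is zero and CF when it is all ones.  The Z builtins return the
   ZF-based result (CCZmode) and the C builtins the CF-based one
   (CCCmode); e.g. _kortestz_mask16_u8 (a, b) yields 1 exactly when
   (a | b) == 0.  Both are read back with a single setcc on the chosen
   flag.  */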
38564 case IX86_BUILTIN_GATHERSIV2DF:
38565 icode = CODE_FOR_avx2_gathersiv2df;
38566 goto gather_gen;
38567 case IX86_BUILTIN_GATHERSIV4DF:
38568 icode = CODE_FOR_avx2_gathersiv4df;
38569 goto gather_gen;
38570 case IX86_BUILTIN_GATHERDIV2DF:
38571 icode = CODE_FOR_avx2_gatherdiv2df;
38572 goto gather_gen;
38573 case IX86_BUILTIN_GATHERDIV4DF:
38574 icode = CODE_FOR_avx2_gatherdiv4df;
38575 goto gather_gen;
38576 case IX86_BUILTIN_GATHERSIV4SF:
38577 icode = CODE_FOR_avx2_gathersiv4sf;
38578 goto gather_gen;
38579 case IX86_BUILTIN_GATHERSIV8SF:
38580 icode = CODE_FOR_avx2_gathersiv8sf;
38581 goto gather_gen;
38582 case IX86_BUILTIN_GATHERDIV4SF:
38583 icode = CODE_FOR_avx2_gatherdiv4sf;
38584 goto gather_gen;
38585 case IX86_BUILTIN_GATHERDIV8SF:
38586 icode = CODE_FOR_avx2_gatherdiv8sf;
38587 goto gather_gen;
38588 case IX86_BUILTIN_GATHERSIV2DI:
38589 icode = CODE_FOR_avx2_gathersiv2di;
38590 goto gather_gen;
38591 case IX86_BUILTIN_GATHERSIV4DI:
38592 icode = CODE_FOR_avx2_gathersiv4di;
38593 goto gather_gen;
38594 case IX86_BUILTIN_GATHERDIV2DI:
38595 icode = CODE_FOR_avx2_gatherdiv2di;
38596 goto gather_gen;
38597 case IX86_BUILTIN_GATHERDIV4DI:
38598 icode = CODE_FOR_avx2_gatherdiv4di;
38599 goto gather_gen;
38600 case IX86_BUILTIN_GATHERSIV4SI:
38601 icode = CODE_FOR_avx2_gathersiv4si;
38602 goto gather_gen;
38603 case IX86_BUILTIN_GATHERSIV8SI:
38604 icode = CODE_FOR_avx2_gathersiv8si;
38605 goto gather_gen;
38606 case IX86_BUILTIN_GATHERDIV4SI:
38607 icode = CODE_FOR_avx2_gatherdiv4si;
38608 goto gather_gen;
38609 case IX86_BUILTIN_GATHERDIV8SI:
38610 icode = CODE_FOR_avx2_gatherdiv8si;
38611 goto gather_gen;
38612 case IX86_BUILTIN_GATHERALTSIV4DF:
38613 icode = CODE_FOR_avx2_gathersiv4df;
38614 goto gather_gen;
38615 case IX86_BUILTIN_GATHERALTDIV8SF:
38616 icode = CODE_FOR_avx2_gatherdiv8sf;
38617 goto gather_gen;
38618 case IX86_BUILTIN_GATHERALTSIV4DI:
38619 icode = CODE_FOR_avx2_gathersiv4di;
38620 goto gather_gen;
38621 case IX86_BUILTIN_GATHERALTDIV8SI:
38622 icode = CODE_FOR_avx2_gatherdiv8si;
38623 goto gather_gen;
38624 case IX86_BUILTIN_GATHER3SIV16SF:
38625 icode = CODE_FOR_avx512f_gathersiv16sf;
38626 goto gather_gen;
38627 case IX86_BUILTIN_GATHER3SIV8DF:
38628 icode = CODE_FOR_avx512f_gathersiv8df;
38629 goto gather_gen;
38630 case IX86_BUILTIN_GATHER3DIV16SF:
38631 icode = CODE_FOR_avx512f_gatherdiv16sf;
38632 goto gather_gen;
38633 case IX86_BUILTIN_GATHER3DIV8DF:
38634 icode = CODE_FOR_avx512f_gatherdiv8df;
38635 goto gather_gen;
38636 case IX86_BUILTIN_GATHER3SIV16SI:
38637 icode = CODE_FOR_avx512f_gathersiv16si;
38638 goto gather_gen;
38639 case IX86_BUILTIN_GATHER3SIV8DI:
38640 icode = CODE_FOR_avx512f_gathersiv8di;
38641 goto gather_gen;
38642 case IX86_BUILTIN_GATHER3DIV16SI:
38643 icode = CODE_FOR_avx512f_gatherdiv16si;
38644 goto gather_gen;
38645 case IX86_BUILTIN_GATHER3DIV8DI:
38646 icode = CODE_FOR_avx512f_gatherdiv8di;
38647 goto gather_gen;
38648 case IX86_BUILTIN_GATHER3ALTSIV8DF:
38649 icode = CODE_FOR_avx512f_gathersiv8df;
38650 goto gather_gen;
38651 case IX86_BUILTIN_GATHER3ALTDIV16SF:
38652 icode = CODE_FOR_avx512f_gatherdiv16sf;
38653 goto gather_gen;
38654 case IX86_BUILTIN_GATHER3ALTSIV8DI:
38655 icode = CODE_FOR_avx512f_gathersiv8di;
38656 goto gather_gen;
38657 case IX86_BUILTIN_GATHER3ALTDIV16SI:
38658 icode = CODE_FOR_avx512f_gatherdiv16si;
38659 goto gather_gen;
38660 case IX86_BUILTIN_GATHER3SIV2DF:
38661 icode = CODE_FOR_avx512vl_gathersiv2df;
38662 goto gather_gen;
38663 case IX86_BUILTIN_GATHER3SIV4DF:
38664 icode = CODE_FOR_avx512vl_gathersiv4df;
38665 goto gather_gen;
38666 case IX86_BUILTIN_GATHER3DIV2DF:
38667 icode = CODE_FOR_avx512vl_gatherdiv2df;
38668 goto gather_gen;
38669 case IX86_BUILTIN_GATHER3DIV4DF:
38670 icode = CODE_FOR_avx512vl_gatherdiv4df;
38671 goto gather_gen;
38672 case IX86_BUILTIN_GATHER3SIV4SF:
38673 icode = CODE_FOR_avx512vl_gathersiv4sf;
38674 goto gather_gen;
38675 case IX86_BUILTIN_GATHER3SIV8SF:
38676 icode = CODE_FOR_avx512vl_gathersiv8sf;
38677 goto gather_gen;
38678 case IX86_BUILTIN_GATHER3DIV4SF:
38679 icode = CODE_FOR_avx512vl_gatherdiv4sf;
38680 goto gather_gen;
38681 case IX86_BUILTIN_GATHER3DIV8SF:
38682 icode = CODE_FOR_avx512vl_gatherdiv8sf;
38683 goto gather_gen;
38684 case IX86_BUILTIN_GATHER3SIV2DI:
38685 icode = CODE_FOR_avx512vl_gathersiv2di;
38686 goto gather_gen;
38687 case IX86_BUILTIN_GATHER3SIV4DI:
38688 icode = CODE_FOR_avx512vl_gathersiv4di;
38689 goto gather_gen;
38690 case IX86_BUILTIN_GATHER3DIV2DI:
38691 icode = CODE_FOR_avx512vl_gatherdiv2di;
38692 goto gather_gen;
38693 case IX86_BUILTIN_GATHER3DIV4DI:
38694 icode = CODE_FOR_avx512vl_gatherdiv4di;
38695 goto gather_gen;
38696 case IX86_BUILTIN_GATHER3SIV4SI:
38697 icode = CODE_FOR_avx512vl_gathersiv4si;
38698 goto gather_gen;
38699 case IX86_BUILTIN_GATHER3SIV8SI:
38700 icode = CODE_FOR_avx512vl_gathersiv8si;
38701 goto gather_gen;
38702 case IX86_BUILTIN_GATHER3DIV4SI:
38703 icode = CODE_FOR_avx512vl_gatherdiv4si;
38704 goto gather_gen;
38705 case IX86_BUILTIN_GATHER3DIV8SI:
38706 icode = CODE_FOR_avx512vl_gatherdiv8si;
38707 goto gather_gen;
38708 case IX86_BUILTIN_GATHER3ALTSIV4DF:
38709 icode = CODE_FOR_avx512vl_gathersiv4df;
38710 goto gather_gen;
38711 case IX86_BUILTIN_GATHER3ALTDIV8SF:
38712 icode = CODE_FOR_avx512vl_gatherdiv8sf;
38713 goto gather_gen;
38714 case IX86_BUILTIN_GATHER3ALTSIV4DI:
38715 icode = CODE_FOR_avx512vl_gathersiv4di;
38716 goto gather_gen;
38717 case IX86_BUILTIN_GATHER3ALTDIV8SI:
38718 icode = CODE_FOR_avx512vl_gatherdiv8si;
38719 goto gather_gen;
38720 case IX86_BUILTIN_SCATTERSIV16SF:
38721 icode = CODE_FOR_avx512f_scattersiv16sf;
38722 goto scatter_gen;
38723 case IX86_BUILTIN_SCATTERSIV8DF:
38724 icode = CODE_FOR_avx512f_scattersiv8df;
38725 goto scatter_gen;
38726 case IX86_BUILTIN_SCATTERDIV16SF:
38727 icode = CODE_FOR_avx512f_scatterdiv16sf;
38728 goto scatter_gen;
38729 case IX86_BUILTIN_SCATTERDIV8DF:
38730 icode = CODE_FOR_avx512f_scatterdiv8df;
38731 goto scatter_gen;
38732 case IX86_BUILTIN_SCATTERSIV16SI:
38733 icode = CODE_FOR_avx512f_scattersiv16si;
38734 goto scatter_gen;
38735 case IX86_BUILTIN_SCATTERSIV8DI:
38736 icode = CODE_FOR_avx512f_scattersiv8di;
38737 goto scatter_gen;
38738 case IX86_BUILTIN_SCATTERDIV16SI:
38739 icode = CODE_FOR_avx512f_scatterdiv16si;
38740 goto scatter_gen;
38741 case IX86_BUILTIN_SCATTERDIV8DI:
38742 icode = CODE_FOR_avx512f_scatterdiv8di;
38743 goto scatter_gen;
38744 case IX86_BUILTIN_SCATTERSIV8SF:
38745 icode = CODE_FOR_avx512vl_scattersiv8sf;
38746 goto scatter_gen;
38747 case IX86_BUILTIN_SCATTERSIV4SF:
38748 icode = CODE_FOR_avx512vl_scattersiv4sf;
38749 goto scatter_gen;
38750 case IX86_BUILTIN_SCATTERSIV4DF:
38751 icode = CODE_FOR_avx512vl_scattersiv4df;
38752 goto scatter_gen;
38753 case IX86_BUILTIN_SCATTERSIV2DF:
38754 icode = CODE_FOR_avx512vl_scattersiv2df;
38755 goto scatter_gen;
38756 case IX86_BUILTIN_SCATTERDIV8SF:
38757 icode = CODE_FOR_avx512vl_scatterdiv8sf;
38758 goto scatter_gen;
38759 case IX86_BUILTIN_SCATTERDIV4SF:
38760 icode = CODE_FOR_avx512vl_scatterdiv4sf;
38761 goto scatter_gen;
38762 case IX86_BUILTIN_SCATTERDIV4DF:
38763 icode = CODE_FOR_avx512vl_scatterdiv4df;
38764 goto scatter_gen;
38765 case IX86_BUILTIN_SCATTERDIV2DF:
38766 icode = CODE_FOR_avx512vl_scatterdiv2df;
38767 goto scatter_gen;
38768 case IX86_BUILTIN_SCATTERSIV8SI:
38769 icode = CODE_FOR_avx512vl_scattersiv8si;
38770 goto scatter_gen;
38771 case IX86_BUILTIN_SCATTERSIV4SI:
38772 icode = CODE_FOR_avx512vl_scattersiv4si;
38773 goto scatter_gen;
38774 case IX86_BUILTIN_SCATTERSIV4DI:
38775 icode = CODE_FOR_avx512vl_scattersiv4di;
38776 goto scatter_gen;
38777 case IX86_BUILTIN_SCATTERSIV2DI:
38778 icode = CODE_FOR_avx512vl_scattersiv2di;
38779 goto scatter_gen;
38780 case IX86_BUILTIN_SCATTERDIV8SI:
38781 icode = CODE_FOR_avx512vl_scatterdiv8si;
38782 goto scatter_gen;
38783 case IX86_BUILTIN_SCATTERDIV4SI:
38784 icode = CODE_FOR_avx512vl_scatterdiv4si;
38785 goto scatter_gen;
38786 case IX86_BUILTIN_SCATTERDIV4DI:
38787 icode = CODE_FOR_avx512vl_scatterdiv4di;
38788 goto scatter_gen;
38789 case IX86_BUILTIN_SCATTERDIV2DI:
38790 icode = CODE_FOR_avx512vl_scatterdiv2di;
38791 goto scatter_gen;
38792 case IX86_BUILTIN_GATHERPFDPD:
38793 icode = CODE_FOR_avx512pf_gatherpfv8sidf;
38794 goto vec_prefetch_gen;
38795 case IX86_BUILTIN_SCATTERALTSIV8DF:
38796 icode = CODE_FOR_avx512f_scattersiv8df;
38797 goto scatter_gen;
38798 case IX86_BUILTIN_SCATTERALTDIV16SF:
38799 icode = CODE_FOR_avx512f_scatterdiv16sf;
38800 goto scatter_gen;
38801 case IX86_BUILTIN_SCATTERALTSIV8DI:
38802 icode = CODE_FOR_avx512f_scattersiv8di;
38803 goto scatter_gen;
38804 case IX86_BUILTIN_SCATTERALTDIV16SI:
38805 icode = CODE_FOR_avx512f_scatterdiv16si;
38806 goto scatter_gen;
38807 case IX86_BUILTIN_GATHERPFDPS:
38808 icode = CODE_FOR_avx512pf_gatherpfv16sisf;
38809 goto vec_prefetch_gen;
38810 case IX86_BUILTIN_GATHERPFQPD:
38811 icode = CODE_FOR_avx512pf_gatherpfv8didf;
38812 goto vec_prefetch_gen;
38813 case IX86_BUILTIN_GATHERPFQPS:
38814 icode = CODE_FOR_avx512pf_gatherpfv8disf;
38815 goto vec_prefetch_gen;
38816 case IX86_BUILTIN_SCATTERPFDPD:
38817 icode = CODE_FOR_avx512pf_scatterpfv8sidf;
38818 goto vec_prefetch_gen;
38819 case IX86_BUILTIN_SCATTERPFDPS:
38820 icode = CODE_FOR_avx512pf_scatterpfv16sisf;
38821 goto vec_prefetch_gen;
38822 case IX86_BUILTIN_SCATTERPFQPD:
38823 icode = CODE_FOR_avx512pf_scatterpfv8didf;
38824 goto vec_prefetch_gen;
38825 case IX86_BUILTIN_SCATTERPFQPS:
38826 icode = CODE_FOR_avx512pf_scatterpfv8disf;
38827 goto vec_prefetch_gen;
38828
38829 gather_gen:
38830 rtx half;
38831 rtx (*gen) (rtx, rtx);
38832
38833 arg0 = CALL_EXPR_ARG (exp, 0);
38834 arg1 = CALL_EXPR_ARG (exp, 1);
38835 arg2 = CALL_EXPR_ARG (exp, 2);
38836 arg3 = CALL_EXPR_ARG (exp, 3);
38837 arg4 = CALL_EXPR_ARG (exp, 4);
38838 op0 = expand_normal (arg0);
38839 op1 = expand_normal (arg1);
38840 op2 = expand_normal (arg2);
38841 op3 = expand_normal (arg3);
38842 op4 = expand_normal (arg4);
38843 /* Note the arg order is different from the operand order. */
38844 mode0 = insn_data[icode].operand[1].mode;
38845 mode2 = insn_data[icode].operand[3].mode;
38846 mode3 = insn_data[icode].operand[4].mode;
38847 mode4 = insn_data[icode].operand[5].mode;
38848
38849 if (target == NULL_RTX
38850 || GET_MODE (target) != insn_data[icode].operand[0].mode
38851 || !insn_data[icode].operand[0].predicate (target,
38852 GET_MODE (target)))
38853 subtarget = gen_reg_rtx (insn_data[icode].operand[0].mode);
38854 else
38855 subtarget = target;
38856
38857 switch (fcode)
38858 {
38859 case IX86_BUILTIN_GATHER3ALTSIV8DF:
38860 case IX86_BUILTIN_GATHER3ALTSIV8DI:
38861 half = gen_reg_rtx (V8SImode);
38862 if (!nonimmediate_operand (op2, V16SImode))
38863 op2 = copy_to_mode_reg (V16SImode, op2);
38864 emit_insn (gen_vec_extract_lo_v16si (half, op2));
38865 op2 = half;
38866 break;
38867 case IX86_BUILTIN_GATHER3ALTSIV4DF:
38868 case IX86_BUILTIN_GATHER3ALTSIV4DI:
38869 case IX86_BUILTIN_GATHERALTSIV4DF:
38870 case IX86_BUILTIN_GATHERALTSIV4DI:
38871 half = gen_reg_rtx (V4SImode);
38872 if (!nonimmediate_operand (op2, V8SImode))
38873 op2 = copy_to_mode_reg (V8SImode, op2);
38874 emit_insn (gen_vec_extract_lo_v8si (half, op2));
38875 op2 = half;
38876 break;
38877 case IX86_BUILTIN_GATHER3ALTDIV16SF:
38878 case IX86_BUILTIN_GATHER3ALTDIV16SI:
38879 half = gen_reg_rtx (mode0);
38880 if (mode0 == V8SFmode)
38881 gen = gen_vec_extract_lo_v16sf;
38882 else
38883 gen = gen_vec_extract_lo_v16si;
38884 if (!nonimmediate_operand (op0, GET_MODE (op0)))
38885 op0 = copy_to_mode_reg (GET_MODE (op0), op0);
38886 emit_insn (gen (half, op0));
38887 op0 = half;
38888 if (GET_MODE (op3) != VOIDmode)
38889 {
38890 if (!nonimmediate_operand (op3, GET_MODE (op3)))
38891 op3 = copy_to_mode_reg (GET_MODE (op3), op3);
38892 emit_insn (gen (half, op3));
38893 op3 = half;
38894 }
38895 break;
38896 case IX86_BUILTIN_GATHER3ALTDIV8SF:
38897 case IX86_BUILTIN_GATHER3ALTDIV8SI:
38898 case IX86_BUILTIN_GATHERALTDIV8SF:
38899 case IX86_BUILTIN_GATHERALTDIV8SI:
38900 half = gen_reg_rtx (mode0);
38901 if (mode0 == V4SFmode)
38902 gen = gen_vec_extract_lo_v8sf;
38903 else
38904 gen = gen_vec_extract_lo_v8si;
38905 if (!nonimmediate_operand (op0, GET_MODE (op0)))
38906 op0 = copy_to_mode_reg (GET_MODE (op0), op0);
38907 emit_insn (gen (half, op0));
38908 op0 = half;
38909 if (GET_MODE (op3) != VOIDmode)
38910 {
38911 if (!nonimmediate_operand (op3, GET_MODE (op3)))
38912 op3 = copy_to_mode_reg (GET_MODE (op3), op3);
38913 emit_insn (gen (half, op3));
38914 op3 = half;
38915 }
38916 break;
38917 default:
38918 break;
38919 }
38920
38921 /* Force the memory operand to be addressed through a base register
38922 here. We don't want to do this for memory operands of other
38923 builtin functions. */
38924 op1 = ix86_zero_extend_to_Pmode (op1);
38925
38926 if (!insn_data[icode].operand[1].predicate (op0, mode0))
38927 op0 = copy_to_mode_reg (mode0, op0);
38928 if (!insn_data[icode].operand[2].predicate (op1, Pmode))
38929 op1 = copy_to_mode_reg (Pmode, op1);
38930 if (!insn_data[icode].operand[3].predicate (op2, mode2))
38931 op2 = copy_to_mode_reg (mode2, op2);
38932
38933 op3 = fixup_modeless_constant (op3, mode3);
38934
38935 if (GET_MODE (op3) == mode3 || GET_MODE (op3) == VOIDmode)
38936 {
38937 if (!insn_data[icode].operand[4].predicate (op3, mode3))
38938 op3 = copy_to_mode_reg (mode3, op3);
38939 }
38940 else
38941 {
38942 op3 = copy_to_reg (op3);
38943 op3 = lowpart_subreg (mode3, op3, GET_MODE (op3));
38944 }
38945 if (!insn_data[icode].operand[5].predicate (op4, mode4))
38946 {
38947 error ("the last argument must be scale 1, 2, 4, 8");
38948 return const0_rtx;
38949 }
38950
38951 /* Optimize. If mask is known to have all high bits set,
38952 replace op0 with pc_rtx to signal that the instruction
38953 overwrites the whole destination and doesn't use its
38954 previous contents. */
38955 if (optimize)
38956 {
38957 if (TREE_CODE (arg3) == INTEGER_CST)
38958 {
38959 if (integer_all_onesp (arg3))
38960 op0 = pc_rtx;
38961 }
38962 else if (TREE_CODE (arg3) == VECTOR_CST)
38963 {
38964 unsigned int negative = 0;
38965 for (i = 0; i < VECTOR_CST_NELTS (arg3); ++i)
38966 {
38967 tree cst = VECTOR_CST_ELT (arg3, i);
38968 if (TREE_CODE (cst) == INTEGER_CST
38969 && tree_int_cst_sign_bit (cst))
38970 negative++;
38971 else if (TREE_CODE (cst) == REAL_CST
38972 && REAL_VALUE_NEGATIVE (TREE_REAL_CST (cst)))
38973 negative++;
38974 }
38975 if (negative == TYPE_VECTOR_SUBPARTS (TREE_TYPE (arg3)))
38976 op0 = pc_rtx;
38977 }
38978 else if (TREE_CODE (arg3) == SSA_NAME
38979 && TREE_CODE (TREE_TYPE (arg3)) == VECTOR_TYPE)
38980 {
38981 /* Recognize also when mask is like:
38982 __v2df src = _mm_setzero_pd ();
38983 __v2df mask = _mm_cmpeq_pd (src, src);
38984 or
38985 __v8sf src = _mm256_setzero_ps ();
38986 __v8sf mask = _mm256_cmp_ps (src, src, _CMP_EQ_OQ);
38987 as that is a cheaper way to load all ones into
38988 a register than having to load a constant from
38989 memory. */
38990 gimple *def_stmt = SSA_NAME_DEF_STMT (arg3);
38991 if (is_gimple_call (def_stmt))
38992 {
38993 tree fndecl = gimple_call_fndecl (def_stmt);
38994 if (fndecl
38995 && DECL_BUILT_IN_CLASS (fndecl) == BUILT_IN_MD)
38996 switch ((unsigned int) DECL_FUNCTION_CODE (fndecl))
38997 {
38998 case IX86_BUILTIN_CMPPD:
38999 case IX86_BUILTIN_CMPPS:
39000 case IX86_BUILTIN_CMPPD256:
39001 case IX86_BUILTIN_CMPPS256:
39002 if (!integer_zerop (gimple_call_arg (def_stmt, 2)))
39003 break;
39004 /* FALLTHRU */
39005 case IX86_BUILTIN_CMPEQPD:
39006 case IX86_BUILTIN_CMPEQPS:
39007 if (initializer_zerop (gimple_call_arg (def_stmt, 0))
39008 && initializer_zerop (gimple_call_arg (def_stmt,
39009 1)))
39010 op0 = pc_rtx;
39011 break;
39012 default:
39013 break;
39014 }
39015 }
39016 }
39017 }
39018
39019 pat = GEN_FCN (icode) (subtarget, op0, op1, op2, op3, op4);
39020 if (! pat)
39021 return const0_rtx;
39022 emit_insn (pat);
39023
39024 switch (fcode)
39025 {
39026 case IX86_BUILTIN_GATHER3DIV16SF:
39027 if (target == NULL_RTX)
39028 target = gen_reg_rtx (V8SFmode);
39029 emit_insn (gen_vec_extract_lo_v16sf (target, subtarget));
39030 break;
39031 case IX86_BUILTIN_GATHER3DIV16SI:
39032 if (target == NULL_RTX)
39033 target = gen_reg_rtx (V8SImode);
39034 emit_insn (gen_vec_extract_lo_v16si (target, subtarget));
39035 break;
39036 case IX86_BUILTIN_GATHER3DIV8SF:
39037 case IX86_BUILTIN_GATHERDIV8SF:
39038 if (target == NULL_RTX)
39039 target = gen_reg_rtx (V4SFmode);
39040 emit_insn (gen_vec_extract_lo_v8sf (target, subtarget));
39041 break;
39042 case IX86_BUILTIN_GATHER3DIV8SI:
39043 case IX86_BUILTIN_GATHERDIV8SI:
39044 if (target == NULL_RTX)
39045 target = gen_reg_rtx (V4SImode);
39046 emit_insn (gen_vec_extract_lo_v8si (target, subtarget));
39047 break;
39048 default:
39049 target = subtarget;
39050 break;
39051 }
39052 return target;
39053
39054 scatter_gen:
39055 arg0 = CALL_EXPR_ARG (exp, 0);
39056 arg1 = CALL_EXPR_ARG (exp, 1);
39057 arg2 = CALL_EXPR_ARG (exp, 2);
39058 arg3 = CALL_EXPR_ARG (exp, 3);
39059 arg4 = CALL_EXPR_ARG (exp, 4);
39060 op0 = expand_normal (arg0);
39061 op1 = expand_normal (arg1);
39062 op2 = expand_normal (arg2);
39063 op3 = expand_normal (arg3);
39064 op4 = expand_normal (arg4);
39065 mode1 = insn_data[icode].operand[1].mode;
39066 mode2 = insn_data[icode].operand[2].mode;
39067 mode3 = insn_data[icode].operand[3].mode;
39068 mode4 = insn_data[icode].operand[4].mode;
39069
39070 /* The scatter instruction stores operand op3 to memory with
39071 indices from op2 and scale from op4 under writemask op1.
39072 If index operand op2 has more elements than source operand
39073 op3, only its low half needs to be used, and vice versa. */
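/* For illustration (based on the cases handled just below): the
   IX86_BUILTIN_SCATTERALTSIV8DF variant stores eight DFmode elements but
   receives a V16SImode index operand, so only the low V8SImode half of
   the index is extracted and used; the SCATTERALTDIV16SF/SCATTERALTDIV16SI
   variants are the opposite case, where the source arrives as V16SF/V16SI
   and only its low half is stored.  */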
39074 switch (fcode)
39075 {
39076 case IX86_BUILTIN_SCATTERALTSIV8DF:
39077 case IX86_BUILTIN_SCATTERALTSIV8DI:
39078 half = gen_reg_rtx (V8SImode);
39079 if (!nonimmediate_operand (op2, V16SImode))
39080 op2 = copy_to_mode_reg (V16SImode, op2);
39081 emit_insn (gen_vec_extract_lo_v16si (half, op2));
39082 op2 = half;
39083 break;
39084 case IX86_BUILTIN_SCATTERALTDIV16SF:
39085 case IX86_BUILTIN_SCATTERALTDIV16SI:
39086 half = gen_reg_rtx (mode3);
39087 if (mode3 == V8SFmode)
39088 gen = gen_vec_extract_lo_v16sf;
39089 else
39090 gen = gen_vec_extract_lo_v16si;
39091 if (!nonimmediate_operand (op3, GET_MODE (op3)))
39092 op3 = copy_to_mode_reg (GET_MODE (op3), op3);
39093 emit_insn (gen (half, op3));
39094 op3 = half;
39095 break;
39096 default:
39097 break;
39098 }
39099
39100 /* Force memory operand only with base register here. But we
39101 don't want to do it on memory operand for other builtin
39102 functions. */
39103 op0 = force_reg (Pmode, convert_to_mode (Pmode, op0, 1));
39104
39105 if (!insn_data[icode].operand[0].predicate (op0, Pmode))
39106 op0 = copy_to_mode_reg (Pmode, op0);
39107
39108 op1 = fixup_modeless_constant (op1, mode1);
39109
39110 if (GET_MODE (op1) == mode1 || GET_MODE (op1) == VOIDmode)
39111 {
39112 if (!insn_data[icode].operand[1].predicate (op1, mode1))
39113 op1 = copy_to_mode_reg (mode1, op1);
39114 }
39115 else
39116 {
39117 op1 = copy_to_reg (op1);
39118 op1 = lowpart_subreg (mode1, op1, GET_MODE (op1));
39119 }
39120
39121 if (!insn_data[icode].operand[2].predicate (op2, mode2))
39122 op2 = copy_to_mode_reg (mode2, op2);
39123
39124 if (!insn_data[icode].operand[3].predicate (op3, mode3))
39125 op3 = copy_to_mode_reg (mode3, op3);
39126
39127 if (!insn_data[icode].operand[4].predicate (op4, mode4))
39128 {
39129 error ("the last argument must be scale 1, 2, 4, 8");
39130 return const0_rtx;
39131 }
39132
39133 pat = GEN_FCN (icode) (op0, op1, op2, op3, op4);
39134 if (! pat)
39135 return const0_rtx;
39136
39137 emit_insn (pat);
39138 return 0;
39139
39140 vec_prefetch_gen:
39141 arg0 = CALL_EXPR_ARG (exp, 0);
39142 arg1 = CALL_EXPR_ARG (exp, 1);
39143 arg2 = CALL_EXPR_ARG (exp, 2);
39144 arg3 = CALL_EXPR_ARG (exp, 3);
39145 arg4 = CALL_EXPR_ARG (exp, 4);
39146 op0 = expand_normal (arg0);
39147 op1 = expand_normal (arg1);
39148 op2 = expand_normal (arg2);
39149 op3 = expand_normal (arg3);
39150 op4 = expand_normal (arg4);
39151 mode0 = insn_data[icode].operand[0].mode;
39152 mode1 = insn_data[icode].operand[1].mode;
39153 mode3 = insn_data[icode].operand[3].mode;
39154 mode4 = insn_data[icode].operand[4].mode;
39155
39156 op0 = fixup_modeless_constant (op0, mode0);
39157
39158 if (GET_MODE (op0) == mode0 || GET_MODE (op0) == VOIDmode)
39159 {
39160 if (!insn_data[icode].operand[0].predicate (op0, mode0))
39161 op0 = copy_to_mode_reg (mode0, op0);
39162 }
39163 else
39164 {
39165 op0 = copy_to_reg (op0);
39166 op0 = lowpart_subreg (mode0, op0, GET_MODE (op0));
39167 }
39168
39169 if (!insn_data[icode].operand[1].predicate (op1, mode1))
39170 op1 = copy_to_mode_reg (mode1, op1);
39171
39172 /* Force memory operand only with base register here. But we
39173 don't want to do it on memory operand for other builtin
39174 functions. */
39175 op2 = force_reg (Pmode, convert_to_mode (Pmode, op2, 1));
39176
39177 if (!insn_data[icode].operand[2].predicate (op2, Pmode))
39178 op2 = copy_to_mode_reg (Pmode, op2);
39179
39180 if (!insn_data[icode].operand[3].predicate (op3, mode3))
39181 {
39182 error ("the fourth argument must be scale 1, 2, 4, 8");
39183 return const0_rtx;
39184 }
39185
39186 if (!insn_data[icode].operand[4].predicate (op4, mode4))
39187 {
39188 error ("incorrect hint operand");
39189 return const0_rtx;
39190 }
39191
39192 pat = GEN_FCN (icode) (op0, op1, op2, op3, op4);
39193 if (! pat)
39194 return const0_rtx;
39195
39196 emit_insn (pat);
39197
39198 return 0;
39199
39200 case IX86_BUILTIN_XABORT:
39201 icode = CODE_FOR_xabort;
39202 arg0 = CALL_EXPR_ARG (exp, 0);
39203 op0 = expand_normal (arg0);
39204 mode0 = insn_data[icode].operand[0].mode;
39205 if (!insn_data[icode].operand[0].predicate (op0, mode0))
39206 {
39207 error ("the xabort's argument must be an 8-bit immediate");
39208 return const0_rtx;
39209 }
39210 emit_insn (gen_xabort (op0));
39211 return 0;
39212
39213 default:
39214 break;
39215 }
39216
39217 if (fcode >= IX86_BUILTIN__BDESC_SPECIAL_ARGS_FIRST
39218 && fcode <= IX86_BUILTIN__BDESC_SPECIAL_ARGS_LAST)
39219 {
39220 i = fcode - IX86_BUILTIN__BDESC_SPECIAL_ARGS_FIRST;
39221 return ix86_expand_special_args_builtin (bdesc_special_args + i, exp,
39222 target);
39223 }
39224
39225 if (fcode >= IX86_BUILTIN__BDESC_ARGS_FIRST
39226 && fcode <= IX86_BUILTIN__BDESC_ARGS_LAST)
39227 {
39228 i = fcode - IX86_BUILTIN__BDESC_ARGS_FIRST;
39229 switch (fcode)
39230 {
39231 case IX86_BUILTIN_FABSQ:
39232 case IX86_BUILTIN_COPYSIGNQ:
39233 if (!TARGET_SSE)
39234 /* Emit a normal call if SSE isn't available. */
39235 return expand_call (exp, target, ignore);
39236 /* FALLTHRU */
39237 default:
39238 return ix86_expand_args_builtin (bdesc_args + i, exp, target);
39239 }
39240 }
39241
39242 if (fcode >= IX86_BUILTIN__BDESC_ARGS2_FIRST
39243 && fcode <= IX86_BUILTIN__BDESC_ARGS2_LAST)
39244 {
39245 i = fcode - IX86_BUILTIN__BDESC_ARGS2_FIRST;
39246 rtx (*fcn) (rtx, rtx, rtx, rtx) = NULL;
39247 rtx (*fcn_mask) (rtx, rtx, rtx, rtx, rtx);
39248 rtx (*fcn_maskz) (rtx, rtx, rtx, rtx, rtx, rtx);
39249 int masked = 1;
39250 machine_mode mode, wide_mode, nar_mode;
39251
39252 nar_mode = V4SFmode;
39253 mode = V16SFmode;
39254 wide_mode = V64SFmode;
39255 fcn_mask = gen_avx5124fmaddps_4fmaddps_mask;
39256 fcn_maskz = gen_avx5124fmaddps_4fmaddps_maskz;
39257
39258 switch (fcode)
39259 {
39260 case IX86_BUILTIN_4FMAPS:
39261 fcn = gen_avx5124fmaddps_4fmaddps;
39262 masked = 0;
39263 goto v4fma_expand;
39264
39265 case IX86_BUILTIN_4DPWSSD:
39266 nar_mode = V4SImode;
39267 mode = V16SImode;
39268 wide_mode = V64SImode;
39269 fcn = gen_avx5124vnniw_vp4dpwssd;
39270 masked = 0;
39271 goto v4fma_expand;
39272
39273 case IX86_BUILTIN_4DPWSSDS:
39274 nar_mode = V4SImode;
39275 mode = V16SImode;
39276 wide_mode = V64SImode;
39277 fcn = gen_avx5124vnniw_vp4dpwssds;
39278 masked = 0;
39279 goto v4fma_expand;
39280
39281 case IX86_BUILTIN_4FNMAPS:
39282 fcn = gen_avx5124fmaddps_4fnmaddps;
39283 masked = 0;
39284 goto v4fma_expand;
39285
39286 case IX86_BUILTIN_4FNMAPS_MASK:
39287 fcn_mask = gen_avx5124fmaddps_4fnmaddps_mask;
39288 fcn_maskz = gen_avx5124fmaddps_4fnmaddps_maskz;
39289 goto v4fma_expand;
39290
39291 case IX86_BUILTIN_4DPWSSD_MASK:
39292 nar_mode = V4SImode;
39293 mode = V16SImode;
39294 wide_mode = V64SImode;
39295 fcn_mask = gen_avx5124vnniw_vp4dpwssd_mask;
39296 fcn_maskz = gen_avx5124vnniw_vp4dpwssd_maskz;
39297 goto v4fma_expand;
39298
39299 case IX86_BUILTIN_4DPWSSDS_MASK:
39300 nar_mode = V4SImode;
39301 mode = V16SImode;
39302 wide_mode = V64SImode;
39303 fcn_mask = gen_avx5124vnniw_vp4dpwssds_mask;
39304 fcn_maskz = gen_avx5124vnniw_vp4dpwssds_maskz;
39305 goto v4fma_expand;
39306
39307 case IX86_BUILTIN_4FMAPS_MASK:
39308 {
39309 tree args[4];
39310 rtx ops[4];
39311 rtx wide_reg;
39312 rtx accum;
39313 rtx addr;
39314 rtx mem;
39315
39316 v4fma_expand:
39317 wide_reg = gen_reg_rtx (wide_mode);
39318 for (i = 0; i < 4; i++)
39319 {
39320 args[i] = CALL_EXPR_ARG (exp, i);
39321 ops[i] = expand_normal (args[i]);
39322
39323 emit_move_insn (gen_rtx_SUBREG (mode, wide_reg, i * 64),
39324 ops[i]);
39325 }
39326
39327 accum = expand_normal (CALL_EXPR_ARG (exp, 4));
39328 accum = force_reg (mode, accum);
39329
39330 addr = expand_normal (CALL_EXPR_ARG (exp, 5));
39331 addr = force_reg (Pmode, addr);
39332
39333 mem = gen_rtx_MEM (nar_mode, addr);
39334
39335 target = gen_reg_rtx (mode);
39336
39337 emit_move_insn (target, accum);
39338
39339 if (! masked)
39340 emit_insn (fcn (target, accum, wide_reg, mem));
39341 else
39342 {
39343 rtx merge, mask;
39344 merge = expand_normal (CALL_EXPR_ARG (exp, 6));
39345
39346 mask = expand_normal (CALL_EXPR_ARG (exp, 7));
39347
39348 if (CONST_INT_P (mask))
39349 mask = fixup_modeless_constant (mask, HImode);
39350
39351 mask = force_reg (HImode, mask);
39352
39353 if (GET_MODE (mask) != HImode)
39354 mask = gen_rtx_SUBREG (HImode, mask, 0);
39355
39356 /* If merge is 0 then we're about to emit z-masked variant. */
39357 if (const0_operand (merge, mode))
39358 emit_insn (fcn_maskz (target, accum, wide_reg, mem, merge, mask));
39359 /* If merge is the same as accum then emit merge-masked variant. */
39360 else if (CALL_EXPR_ARG (exp, 6) == CALL_EXPR_ARG (exp, 4))
39361 {
39362 merge = force_reg (mode, merge);
39363 emit_insn (fcn_mask (target, wide_reg, mem, merge, mask));
39364 }
39365 /* Merge with something unknown might happen if we z-mask w/ -O0. */
39366 else
39367 {
39368 target = gen_reg_rtx (mode);
39369 emit_move_insn (target, merge);
39370 emit_insn (fcn_mask (target, wide_reg, mem, target, mask));
39371 }
39372 }
39373 return target;
39374 }
39375
39376 case IX86_BUILTIN_4FNMASS:
39377 fcn = gen_avx5124fmaddps_4fnmaddss;
39378 masked = 0;
39379 goto s4fma_expand;
39380
39381 case IX86_BUILTIN_4FMASS:
39382 fcn = gen_avx5124fmaddps_4fmaddss;
39383 masked = 0;
39384 goto s4fma_expand;
39385
39386 case IX86_BUILTIN_4FNMASS_MASK:
39387 fcn_mask = gen_avx5124fmaddps_4fnmaddss_mask;
39388 fcn_maskz = gen_avx5124fmaddps_4fnmaddss_maskz;
39389 goto s4fma_expand;
39390
39391 case IX86_BUILTIN_4FMASS_MASK:
39392 {
39393 tree args[4];
39394 rtx ops[4];
39395 rtx wide_reg;
39396 rtx accum;
39397 rtx addr;
39398 rtx mem;
39399
39400 fcn_mask = gen_avx5124fmaddps_4fmaddss_mask;
39401 fcn_maskz = gen_avx5124fmaddps_4fmaddss_maskz;
39402
39403 s4fma_expand:
39404 mode = V4SFmode;
39405 wide_reg = gen_reg_rtx (V64SFmode);
39406 for (i = 0; i < 4; i++)
39407 {
39408 rtx tmp;
39409 args[i] = CALL_EXPR_ARG (exp, i);
39410 ops[i] = expand_normal (args[i]);
39411
39412 tmp = gen_reg_rtx (SFmode);
39413 emit_move_insn (tmp, gen_rtx_SUBREG (SFmode, ops[i], 0));
39414
39415 emit_move_insn (gen_rtx_SUBREG (V16SFmode, wide_reg, i * 64),
39416 gen_rtx_SUBREG (V16SFmode, tmp, 0));
39417 }
39418
39419 accum = expand_normal (CALL_EXPR_ARG (exp, 4));
39420 accum = force_reg (V4SFmode, accum);
39421
39422 addr = expand_normal (CALL_EXPR_ARG (exp, 5));
39423 addr = force_reg (Pmode, addr);
39424
39425 mem = gen_rtx_MEM (V4SFmode, addr);
39426
39427 target = gen_reg_rtx (V4SFmode);
39428
39429 emit_move_insn (target, accum);
39430
39431 if (! masked)
39432 emit_insn (fcn (target, accum, wide_reg, mem));
39433 else
39434 {
39435 rtx merge, mask;
39436 merge = expand_normal (CALL_EXPR_ARG (exp, 6));
39437
39438 mask = expand_normal (CALL_EXPR_ARG (exp, 7));
39439
39440 if (CONST_INT_P (mask))
39441 mask = fixup_modeless_constant (mask, QImode);
39442
39443 mask = force_reg (QImode, mask);
39444
39445 if (GET_MODE (mask) != QImode)
39446 mask = gen_rtx_SUBREG (QImode, mask, 0);
39447
39448 /* If merge is 0 then we're about to emit z-masked variant. */
39449 if (const0_operand (merge, mode))
39450 emit_insn (fcn_maskz (target, accum, wide_reg, mem, merge, mask));
39451 /* If merge is the same as accum then emit merge-masked
39452 variant. */
39453 else if (CALL_EXPR_ARG (exp, 6) == CALL_EXPR_ARG (exp, 4))
39454 {
39455 merge = force_reg (mode, merge);
39456 emit_insn (fcn_mask (target, wide_reg, mem, merge, mask));
39457 }
39458 /* Merge with something unknown might happen if we z-mask
39459 w/ -O0. */
39460 else
39461 {
39462 target = gen_reg_rtx (mode);
39463 emit_move_insn (target, merge);
39464 emit_insn (fcn_mask (target, wide_reg, mem, target, mask));
39465 }
39466 }
39467 return target;
39468 }
39469 case IX86_BUILTIN_RDPID:
39470 return ix86_expand_special_args_builtin (bdesc_args2 + i, exp,
39471 target);
39472 default:
39473 return ix86_expand_args_builtin (bdesc_args2 + i, exp, target);
39474 }
39475 }
39476
39477 if (fcode >= IX86_BUILTIN__BDESC_COMI_FIRST
39478 && fcode <= IX86_BUILTIN__BDESC_COMI_LAST)
39479 {
39480 i = fcode - IX86_BUILTIN__BDESC_COMI_FIRST;
39481 return ix86_expand_sse_comi (bdesc_comi + i, exp, target);
39482 }
39483
39484 if (fcode >= IX86_BUILTIN__BDESC_ROUND_ARGS_FIRST
39485 && fcode <= IX86_BUILTIN__BDESC_ROUND_ARGS_LAST)
39486 {
39487 i = fcode - IX86_BUILTIN__BDESC_ROUND_ARGS_FIRST;
39488 return ix86_expand_round_builtin (bdesc_round_args + i, exp, target);
39489 }
39490
39491 if (fcode >= IX86_BUILTIN__BDESC_PCMPESTR_FIRST
39492 && fcode <= IX86_BUILTIN__BDESC_PCMPESTR_LAST)
39493 {
39494 i = fcode - IX86_BUILTIN__BDESC_PCMPESTR_FIRST;
39495 return ix86_expand_sse_pcmpestr (bdesc_pcmpestr + i, exp, target);
39496 }
39497
39498 if (fcode >= IX86_BUILTIN__BDESC_PCMPISTR_FIRST
39499 && fcode <= IX86_BUILTIN__BDESC_PCMPISTR_LAST)
39500 {
39501 i = fcode - IX86_BUILTIN__BDESC_PCMPISTR_FIRST;
39502 return ix86_expand_sse_pcmpistr (bdesc_pcmpistr + i, exp, target);
39503 }
39504
39505 if (fcode >= IX86_BUILTIN__BDESC_MULTI_ARG_FIRST
39506 && fcode <= IX86_BUILTIN__BDESC_MULTI_ARG_LAST)
39507 {
39508 i = fcode - IX86_BUILTIN__BDESC_MULTI_ARG_FIRST;
39509 const struct builtin_description *d = bdesc_multi_arg + i;
39510 return ix86_expand_multi_arg_builtin (d->icode, exp, target,
39511 (enum ix86_builtin_func_type)
39512 d->flag, d->comparison);
39513 }
39514
39515 gcc_unreachable ();
39516 }
39517
39518 /* This returns the target-specific builtin with code CODE if
39519 current_function_decl has visibility on this builtin, which is checked
39520 using isa flags. Returns NULL_TREE otherwise. */
39521
39522 static tree ix86_get_builtin (enum ix86_builtins code)
39523 {
39524 struct cl_target_option *opts;
39525 tree target_tree = NULL_TREE;
39526
39527 /* Determine the isa flags of current_function_decl. */
39528
39529 if (current_function_decl)
39530 target_tree = DECL_FUNCTION_SPECIFIC_TARGET (current_function_decl);
39531
39532 if (target_tree == NULL)
39533 target_tree = target_option_default_node;
39534
39535 opts = TREE_TARGET_OPTION (target_tree);
39536
39537 if ((ix86_builtins_isa[(int) code].isa & opts->x_ix86_isa_flags)
39538 || (ix86_builtins_isa[(int) code].isa2 & opts->x_ix86_isa_flags2))
39539 return ix86_builtin_decl (code, true);
39540 else
39541 return NULL_TREE;
39542 }
39543
39544 /* Return the function decl for the target-specific builtin
39545 corresponding to the MPX builtin passed in FCODE. */
39546 static tree
39547 ix86_builtin_mpx_function (unsigned fcode)
39548 {
39549 switch (fcode)
39550 {
39551 case BUILT_IN_CHKP_BNDMK:
39552 return ix86_builtins[IX86_BUILTIN_BNDMK];
39553
39554 case BUILT_IN_CHKP_BNDSTX:
39555 return ix86_builtins[IX86_BUILTIN_BNDSTX];
39556
39557 case BUILT_IN_CHKP_BNDLDX:
39558 return ix86_builtins[IX86_BUILTIN_BNDLDX];
39559
39560 case BUILT_IN_CHKP_BNDCL:
39561 return ix86_builtins[IX86_BUILTIN_BNDCL];
39562
39563 case BUILT_IN_CHKP_BNDCU:
39564 return ix86_builtins[IX86_BUILTIN_BNDCU];
39565
39566 case BUILT_IN_CHKP_BNDRET:
39567 return ix86_builtins[IX86_BUILTIN_BNDRET];
39568
39569 case BUILT_IN_CHKP_INTERSECT:
39570 return ix86_builtins[IX86_BUILTIN_BNDINT];
39571
39572 case BUILT_IN_CHKP_NARROW:
39573 return ix86_builtins[IX86_BUILTIN_BNDNARROW];
39574
39575 case BUILT_IN_CHKP_SIZEOF:
39576 return ix86_builtins[IX86_BUILTIN_SIZEOF];
39577
39578 case BUILT_IN_CHKP_EXTRACT_LOWER:
39579 return ix86_builtins[IX86_BUILTIN_BNDLOWER];
39580
39581 case BUILT_IN_CHKP_EXTRACT_UPPER:
39582 return ix86_builtins[IX86_BUILTIN_BNDUPPER];
39583
39584 default:
39585 return NULL_TREE;
39586 }
39587
39588 gcc_unreachable ();
39589 }
39590
39591 /* Helper function for ix86_load_bounds and ix86_store_bounds.
39592
39593 Return an address to be used to load/store bounds for pointer
39594 passed in SLOT.
39595
39596 SLOT_NO is an integer constant holding number of a target
39597 dependent special slot to be used in case SLOT is not a memory.
39598
39599 SPECIAL_BASE is a pointer to be used as a base of fake address
39600 to access special slots in Bounds Table. SPECIAL_BASE[-1],
39601 SPECIAL_BASE[-2] etc. will be used as fake pointer locations. */
39602
39603 static rtx
39604 ix86_get_arg_address_for_bt (rtx slot, rtx slot_no, rtx special_base)
39605 {
39606 rtx addr = NULL;
39607
39608 /* A NULL slot means we pass bounds for a pointer that is not passed
39609 to the function at all. A register slot means we pass the pointer
39610 in a register. In both of these cases bounds are passed via the
39611 Bounds Table. Since we do not have an actual pointer stored in
39612 memory, we have to use fake addresses to access the Bounds Table.
39613 We start with (special_base - sizeof (void*)) and decrease this
39614 address by the pointer size to get addresses for other slots. */
39615 if (!slot || REG_P (slot))
39616 {
39617 gcc_assert (CONST_INT_P (slot_no));
39618 addr = plus_constant (Pmode, special_base,
39619 -(INTVAL (slot_no) + 1) * GET_MODE_SIZE (Pmode));
39620 }
39621 /* If the pointer is passed in memory then its address is used to
39622 access the Bounds Table. */
39623 else if (MEM_P (slot))
39624 {
39625 addr = XEXP (slot, 0);
39626 if (!register_operand (addr, Pmode))
39627 addr = copy_addr_to_reg (addr);
39628 }
39629 else
39630 gcc_unreachable ();
39631
39632 return addr;
39633 }
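/* Illustrative example of the addressing scheme above (assuming 64-bit
   pointers, i.e. GET_MODE_SIZE (Pmode) == 8): special slot 0 maps to
   SPECIAL_BASE - 8, slot 1 to SPECIAL_BASE - 16, and so on, while a
   pointer passed in memory simply reuses its own address.  */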
39634
39635 /* Expand pass uses this hook to load bounds for function parameter
39636 PTR passed in SLOT in case its bounds are not passed in a register.
39637
39638 If SLOT is a memory, then bounds are loaded as for regular pointer
39639 loaded from memory. PTR may be NULL in case SLOT is a memory.
39640 In such case value of PTR (if required) may be loaded from SLOT.
39641
39642 If SLOT is NULL or a register then SLOT_NO is an integer constant
39643 holding number of the target dependent special slot which should be
39644 used to obtain bounds.
39645
39646 Return loaded bounds. */
39647
39648 static rtx
39649 ix86_load_bounds (rtx slot, rtx ptr, rtx slot_no)
39650 {
39651 rtx reg = gen_reg_rtx (BNDmode);
39652 rtx addr;
39653
39654 /* Get address to be used to access Bounds Table. Special slots start
39655 at the location of return address of the current function. */
39656 addr = ix86_get_arg_address_for_bt (slot, slot_no, arg_pointer_rtx);
39657
39658 /* Load the pointer value from memory if we don't have it. */
39659 if (!ptr)
39660 {
39661 gcc_assert (MEM_P (slot));
39662 ptr = copy_addr_to_reg (slot);
39663 }
39664
39665 if (!register_operand (ptr, Pmode))
39666 ptr = ix86_zero_extend_to_Pmode (ptr);
39667
39668 emit_insn (BNDmode == BND64mode
39669 ? gen_bnd64_ldx (reg, addr, ptr)
39670 : gen_bnd32_ldx (reg, addr, ptr));
39671
39672 return reg;
39673 }
39674
39675 /* Expand pass uses this hook to store BOUNDS for call argument PTR
39676 passed in SLOT in case BOUNDS are not passed in a register.
39677
39678 If SLOT is a memory, then BOUNDS are stored as for regular pointer
39679 stored in memory. PTR may be NULL in case SLOT is a memory.
39680 In such case value of PTR (if required) may be loaded from SLOT.
39681
39682 If SLOT is NULL or a register then SLOT_NO is an integer constant
39683 holding number of the target dependent special slot which should be
39684 used to store BOUNDS. */
39685
39686 static void
39687 ix86_store_bounds (rtx ptr, rtx slot, rtx bounds, rtx slot_no)
39688 {
39689 rtx addr;
39690
39691 /* Get address to be used to access Bounds Table. Special slots start
39692 at the location of return address of a called function. */
39693 addr = ix86_get_arg_address_for_bt (slot, slot_no, stack_pointer_rtx);
39694
39695 /* Load the pointer value from memory if we don't have it. */
39696 if (!ptr)
39697 {
39698 gcc_assert (MEM_P (slot));
39699 ptr = copy_addr_to_reg (slot);
39700 }
39701
39702 if (!register_operand (ptr, Pmode))
39703 ptr = ix86_zero_extend_to_Pmode (ptr);
39704
39705 gcc_assert (POINTER_BOUNDS_MODE_P (GET_MODE (bounds)));
39706 if (!register_operand (bounds, BNDmode))
39707 bounds = copy_to_mode_reg (BNDmode, bounds);
39708
39709 emit_insn (BNDmode == BND64mode
39710 ? gen_bnd64_stx (addr, ptr, bounds)
39711 : gen_bnd32_stx (addr, ptr, bounds));
39712 }
39713
39714 /* Load and return bounds returned by function in SLOT. */
39715
39716 static rtx
39717 ix86_load_returned_bounds (rtx slot)
39718 {
39719 rtx res;
39720
39721 gcc_assert (REG_P (slot));
39722 res = gen_reg_rtx (BNDmode);
39723 emit_move_insn (res, slot);
39724
39725 return res;
39726 }
39727
39728 /* Store BOUNDS returned by function into SLOT. */
39729
39730 static void
39731 ix86_store_returned_bounds (rtx slot, rtx bounds)
39732 {
39733 gcc_assert (REG_P (slot));
39734 emit_move_insn (slot, bounds);
39735 }
39736
39737 /* Returns a function decl for a vectorized version of the combined function
39738 with combined_fn code FN, result vector type TYPE_OUT and argument
39739 vector type TYPE_IN, or NULL_TREE if it is not available. */
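/* A concrete example of the mapping implemented below: vectorizing floor
   over V4DFmode (both the result and argument types are 4 x double)
   selects IX86_BUILTIN_FLOORPD256, but only when flag_trapping_math is
   off and TARGET_ROUND is set; the returned decl is further gated by
   ix86_get_builtin on the current ISA flags.  */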
39740
39741 static tree
39742 ix86_builtin_vectorized_function (unsigned int fn, tree type_out,
39743 tree type_in)
39744 {
39745 machine_mode in_mode, out_mode;
39746 int in_n, out_n;
39747
39748 if (TREE_CODE (type_out) != VECTOR_TYPE
39749 || TREE_CODE (type_in) != VECTOR_TYPE)
39750 return NULL_TREE;
39751
39752 out_mode = TYPE_MODE (TREE_TYPE (type_out));
39753 out_n = TYPE_VECTOR_SUBPARTS (type_out);
39754 in_mode = TYPE_MODE (TREE_TYPE (type_in));
39755 in_n = TYPE_VECTOR_SUBPARTS (type_in);
39756
39757 switch (fn)
39758 {
39759 CASE_CFN_EXP2:
39760 if (out_mode == SFmode && in_mode == SFmode)
39761 {
39762 if (out_n == 16 && in_n == 16)
39763 return ix86_get_builtin (IX86_BUILTIN_EXP2PS);
39764 }
39765 break;
39766
39767 CASE_CFN_IFLOOR:
39768 CASE_CFN_LFLOOR:
39769 CASE_CFN_LLFLOOR:
39770 /* The round insn does not trap on denormals. */
39771 if (flag_trapping_math || !TARGET_ROUND)
39772 break;
39773
39774 if (out_mode == SImode && in_mode == DFmode)
39775 {
39776 if (out_n == 4 && in_n == 2)
39777 return ix86_get_builtin (IX86_BUILTIN_FLOORPD_VEC_PACK_SFIX);
39778 else if (out_n == 8 && in_n == 4)
39779 return ix86_get_builtin (IX86_BUILTIN_FLOORPD_VEC_PACK_SFIX256);
39780 else if (out_n == 16 && in_n == 8)
39781 return ix86_get_builtin (IX86_BUILTIN_FLOORPD_VEC_PACK_SFIX512);
39782 }
39783 if (out_mode == SImode && in_mode == SFmode)
39784 {
39785 if (out_n == 4 && in_n == 4)
39786 return ix86_get_builtin (IX86_BUILTIN_FLOORPS_SFIX);
39787 else if (out_n == 8 && in_n == 8)
39788 return ix86_get_builtin (IX86_BUILTIN_FLOORPS_SFIX256);
39789 else if (out_n == 16 && in_n == 16)
39790 return ix86_get_builtin (IX86_BUILTIN_FLOORPS_SFIX512);
39791 }
39792 break;
39793
39794 CASE_CFN_ICEIL:
39795 CASE_CFN_LCEIL:
39796 CASE_CFN_LLCEIL:
39797 /* The round insn does not trap on denormals. */
39798 if (flag_trapping_math || !TARGET_ROUND)
39799 break;
39800
39801 if (out_mode == SImode && in_mode == DFmode)
39802 {
39803 if (out_n == 4 && in_n == 2)
39804 return ix86_get_builtin (IX86_BUILTIN_CEILPD_VEC_PACK_SFIX);
39805 else if (out_n == 8 && in_n == 4)
39806 return ix86_get_builtin (IX86_BUILTIN_CEILPD_VEC_PACK_SFIX256);
39807 else if (out_n == 16 && in_n == 8)
39808 return ix86_get_builtin (IX86_BUILTIN_CEILPD_VEC_PACK_SFIX512);
39809 }
39810 if (out_mode == SImode && in_mode == SFmode)
39811 {
39812 if (out_n == 4 && in_n == 4)
39813 return ix86_get_builtin (IX86_BUILTIN_CEILPS_SFIX);
39814 else if (out_n == 8 && in_n == 8)
39815 return ix86_get_builtin (IX86_BUILTIN_CEILPS_SFIX256);
39816 else if (out_n == 16 && in_n == 16)
39817 return ix86_get_builtin (IX86_BUILTIN_CEILPS_SFIX512);
39818 }
39819 break;
39820
39821 CASE_CFN_IRINT:
39822 CASE_CFN_LRINT:
39823 CASE_CFN_LLRINT:
39824 if (out_mode == SImode && in_mode == DFmode)
39825 {
39826 if (out_n == 4 && in_n == 2)
39827 return ix86_get_builtin (IX86_BUILTIN_VEC_PACK_SFIX);
39828 else if (out_n == 8 && in_n == 4)
39829 return ix86_get_builtin (IX86_BUILTIN_VEC_PACK_SFIX256);
39830 else if (out_n == 16 && in_n == 8)
39831 return ix86_get_builtin (IX86_BUILTIN_VEC_PACK_SFIX512);
39832 }
39833 if (out_mode == SImode && in_mode == SFmode)
39834 {
39835 if (out_n == 4 && in_n == 4)
39836 return ix86_get_builtin (IX86_BUILTIN_CVTPS2DQ);
39837 else if (out_n == 8 && in_n == 8)
39838 return ix86_get_builtin (IX86_BUILTIN_CVTPS2DQ256);
39839 else if (out_n == 16 && in_n == 16)
39840 return ix86_get_builtin (IX86_BUILTIN_CVTPS2DQ512);
39841 }
39842 break;
39843
39844 CASE_CFN_IROUND:
39845 CASE_CFN_LROUND:
39846 CASE_CFN_LLROUND:
39847 /* The round insn does not trap on denormals. */
39848 if (flag_trapping_math || !TARGET_ROUND)
39849 break;
39850
39851 if (out_mode == SImode && in_mode == DFmode)
39852 {
39853 if (out_n == 4 && in_n == 2)
39854 return ix86_get_builtin (IX86_BUILTIN_ROUNDPD_AZ_VEC_PACK_SFIX);
39855 else if (out_n == 8 && in_n == 4)
39856 return ix86_get_builtin (IX86_BUILTIN_ROUNDPD_AZ_VEC_PACK_SFIX256);
39857 else if (out_n == 16 && in_n == 8)
39858 return ix86_get_builtin (IX86_BUILTIN_ROUNDPD_AZ_VEC_PACK_SFIX512);
39859 }
39860 if (out_mode == SImode && in_mode == SFmode)
39861 {
39862 if (out_n == 4 && in_n == 4)
39863 return ix86_get_builtin (IX86_BUILTIN_ROUNDPS_AZ_SFIX);
39864 else if (out_n == 8 && in_n == 8)
39865 return ix86_get_builtin (IX86_BUILTIN_ROUNDPS_AZ_SFIX256);
39866 else if (out_n == 16 && in_n == 16)
39867 return ix86_get_builtin (IX86_BUILTIN_ROUNDPS_AZ_SFIX512);
39868 }
39869 break;
39870
39871 CASE_CFN_FLOOR:
39872 /* The round insn does not trap on denormals. */
39873 if (flag_trapping_math || !TARGET_ROUND)
39874 break;
39875
39876 if (out_mode == DFmode && in_mode == DFmode)
39877 {
39878 if (out_n == 2 && in_n == 2)
39879 return ix86_get_builtin (IX86_BUILTIN_FLOORPD);
39880 else if (out_n == 4 && in_n == 4)
39881 return ix86_get_builtin (IX86_BUILTIN_FLOORPD256);
39882 else if (out_n == 8 && in_n == 8)
39883 return ix86_get_builtin (IX86_BUILTIN_FLOORPD512);
39884 }
39885 if (out_mode == SFmode && in_mode == SFmode)
39886 {
39887 if (out_n == 4 && in_n == 4)
39888 return ix86_get_builtin (IX86_BUILTIN_FLOORPS);
39889 else if (out_n == 8 && in_n == 8)
39890 return ix86_get_builtin (IX86_BUILTIN_FLOORPS256);
39891 else if (out_n == 16 && in_n == 16)
39892 return ix86_get_builtin (IX86_BUILTIN_FLOORPS512);
39893 }
39894 break;
39895
39896 CASE_CFN_CEIL:
39897 /* The round insn does not trap on denormals. */
39898 if (flag_trapping_math || !TARGET_ROUND)
39899 break;
39900
39901 if (out_mode == DFmode && in_mode == DFmode)
39902 {
39903 if (out_n == 2 && in_n == 2)
39904 return ix86_get_builtin (IX86_BUILTIN_CEILPD);
39905 else if (out_n == 4 && in_n == 4)
39906 return ix86_get_builtin (IX86_BUILTIN_CEILPD256);
39907 else if (out_n == 8 && in_n == 8)
39908 return ix86_get_builtin (IX86_BUILTIN_CEILPD512);
39909 }
39910 if (out_mode == SFmode && in_mode == SFmode)
39911 {
39912 if (out_n == 4 && in_n == 4)
39913 return ix86_get_builtin (IX86_BUILTIN_CEILPS);
39914 else if (out_n == 8 && in_n == 8)
39915 return ix86_get_builtin (IX86_BUILTIN_CEILPS256);
39916 else if (out_n == 16 && in_n == 16)
39917 return ix86_get_builtin (IX86_BUILTIN_CEILPS512);
39918 }
39919 break;
39920
39921 CASE_CFN_TRUNC:
39922 /* The round insn does not trap on denormals. */
39923 if (flag_trapping_math || !TARGET_ROUND)
39924 break;
39925
39926 if (out_mode == DFmode && in_mode == DFmode)
39927 {
39928 if (out_n == 2 && in_n == 2)
39929 return ix86_get_builtin (IX86_BUILTIN_TRUNCPD);
39930 else if (out_n == 4 && in_n == 4)
39931 return ix86_get_builtin (IX86_BUILTIN_TRUNCPD256);
39932 else if (out_n == 8 && in_n == 8)
39933 return ix86_get_builtin (IX86_BUILTIN_TRUNCPD512);
39934 }
39935 if (out_mode == SFmode && in_mode == SFmode)
39936 {
39937 if (out_n == 4 && in_n == 4)
39938 return ix86_get_builtin (IX86_BUILTIN_TRUNCPS);
39939 else if (out_n == 8 && in_n == 8)
39940 return ix86_get_builtin (IX86_BUILTIN_TRUNCPS256);
39941 else if (out_n == 16 && in_n == 16)
39942 return ix86_get_builtin (IX86_BUILTIN_TRUNCPS512);
39943 }
39944 break;
39945
39946 CASE_CFN_RINT:
39947 /* The round insn does not trap on denormals. */
39948 if (flag_trapping_math || !TARGET_ROUND)
39949 break;
39950
39951 if (out_mode == DFmode && in_mode == DFmode)
39952 {
39953 if (out_n == 2 && in_n == 2)
39954 return ix86_get_builtin (IX86_BUILTIN_RINTPD);
39955 else if (out_n == 4 && in_n == 4)
39956 return ix86_get_builtin (IX86_BUILTIN_RINTPD256);
39957 }
39958 if (out_mode == SFmode && in_mode == SFmode)
39959 {
39960 if (out_n == 4 && in_n == 4)
39961 return ix86_get_builtin (IX86_BUILTIN_RINTPS);
39962 else if (out_n == 8 && in_n == 8)
39963 return ix86_get_builtin (IX86_BUILTIN_RINTPS256);
39964 }
39965 break;
39966
39967 CASE_CFN_FMA:
39968 if (out_mode == DFmode && in_mode == DFmode)
39969 {
39970 if (out_n == 2 && in_n == 2)
39971 return ix86_get_builtin (IX86_BUILTIN_VFMADDPD);
39972 if (out_n == 4 && in_n == 4)
39973 return ix86_get_builtin (IX86_BUILTIN_VFMADDPD256);
39974 }
39975 if (out_mode == SFmode && in_mode == SFmode)
39976 {
39977 if (out_n == 4 && in_n == 4)
39978 return ix86_get_builtin (IX86_BUILTIN_VFMADDPS);
39979 if (out_n == 8 && in_n == 8)
39980 return ix86_get_builtin (IX86_BUILTIN_VFMADDPS256);
39981 }
39982 break;
39983
39984 default:
39985 break;
39986 }
39987
39988 /* Dispatch to a handler for a vectorization library. */
39989 if (ix86_veclib_handler)
39990 return ix86_veclib_handler (combined_fn (fn), type_out, type_in);
39991
39992 return NULL_TREE;
39993 }
39994
39995 /* Handler for an SVML-style interface to
39996 a library with vectorized intrinsics. */
39997
39998 static tree
39999 ix86_veclibabi_svml (combined_fn fn, tree type_out, tree type_in)
40000 {
40001 char name[20];
40002 tree fntype, new_fndecl, args;
40003 unsigned arity;
40004 const char *bname;
40005 machine_mode el_mode, in_mode;
40006 int n, in_n;
40007
40008 /* The SVML is suitable for unsafe math only. */
40009 if (!flag_unsafe_math_optimizations)
40010 return NULL_TREE;
40011
40012 el_mode = TYPE_MODE (TREE_TYPE (type_out));
40013 n = TYPE_VECTOR_SUBPARTS (type_out);
40014 in_mode = TYPE_MODE (TREE_TYPE (type_in));
40015 in_n = TYPE_VECTOR_SUBPARTS (type_in);
40016 if (el_mode != in_mode
40017 || n != in_n)
40018 return NULL_TREE;
40019
40020 switch (fn)
40021 {
40022 CASE_CFN_EXP:
40023 CASE_CFN_LOG:
40024 CASE_CFN_LOG10:
40025 CASE_CFN_POW:
40026 CASE_CFN_TANH:
40027 CASE_CFN_TAN:
40028 CASE_CFN_ATAN:
40029 CASE_CFN_ATAN2:
40030 CASE_CFN_ATANH:
40031 CASE_CFN_CBRT:
40032 CASE_CFN_SINH:
40033 CASE_CFN_SIN:
40034 CASE_CFN_ASINH:
40035 CASE_CFN_ASIN:
40036 CASE_CFN_COSH:
40037 CASE_CFN_COS:
40038 CASE_CFN_ACOSH:
40039 CASE_CFN_ACOS:
40040 if ((el_mode != DFmode || n != 2)
40041 && (el_mode != SFmode || n != 4))
40042 return NULL_TREE;
40043 break;
40044
40045 default:
40046 return NULL_TREE;
40047 }
40048
40049 tree fndecl = mathfn_built_in (TREE_TYPE (type_in), fn);
40050 bname = IDENTIFIER_POINTER (DECL_NAME (fndecl));
40051
40052 if (DECL_FUNCTION_CODE (fndecl) == BUILT_IN_LOGF)
40053 strcpy (name, "vmlsLn4");
40054 else if (DECL_FUNCTION_CODE (fndecl) == BUILT_IN_LOG)
40055 strcpy (name, "vmldLn2");
40056 else if (n == 4)
40057 {
40058 sprintf (name, "vmls%s", bname+10);
40059 name[strlen (name)-1] = '4';
40060 }
40061 else
40062 sprintf (name, "vmld%s2", bname+10);
40063
40064 /* Convert to uppercase. */
40065 name[4] &= ~0x20;
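/* Worked example of the mangling above (illustrative): for sinf with
   4-element vectors the name becomes "vmlsSin4", and for sin with
   2-element vectors it becomes "vmldSin2"; logf and log are special-cased
   to "vmlsLn4" and "vmldLn2" respectively.  */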
40066
40067 arity = 0;
40068 for (args = DECL_ARGUMENTS (fndecl); args; args = TREE_CHAIN (args))
40069 arity++;
40070
40071 if (arity == 1)
40072 fntype = build_function_type_list (type_out, type_in, NULL);
40073 else
40074 fntype = build_function_type_list (type_out, type_in, type_in, NULL);
40075
40076 /* Build a function declaration for the vectorized function. */
40077 new_fndecl = build_decl (BUILTINS_LOCATION,
40078 FUNCTION_DECL, get_identifier (name), fntype);
40079 TREE_PUBLIC (new_fndecl) = 1;
40080 DECL_EXTERNAL (new_fndecl) = 1;
40081 DECL_IS_NOVOPS (new_fndecl) = 1;
40082 TREE_READONLY (new_fndecl) = 1;
40083
40084 return new_fndecl;
40085 }
40086
40087 /* Handler for an ACML-style interface to
40088 a library with vectorized intrinsics. */
40089
40090 static tree
40091 ix86_veclibabi_acml (combined_fn fn, tree type_out, tree type_in)
40092 {
40093 char name[20] = "__vr.._";
40094 tree fntype, new_fndecl, args;
40095 unsigned arity;
40096 const char *bname;
40097 machine_mode el_mode, in_mode;
40098 int n, in_n;
40099
40100 /* The ACML is 64-bit only and suitable for unsafe math only, as
40101 it does not correctly support parts of IEEE with the required
40102 precision such as denormals. */
40103 if (!TARGET_64BIT
40104 || !flag_unsafe_math_optimizations)
40105 return NULL_TREE;
40106
40107 el_mode = TYPE_MODE (TREE_TYPE (type_out));
40108 n = TYPE_VECTOR_SUBPARTS (type_out);
40109 in_mode = TYPE_MODE (TREE_TYPE (type_in));
40110 in_n = TYPE_VECTOR_SUBPARTS (type_in);
40111 if (el_mode != in_mode
40112 || n != in_n)
40113 return NULL_TREE;
40114
40115 switch (fn)
40116 {
40117 CASE_CFN_SIN:
40118 CASE_CFN_COS:
40119 CASE_CFN_EXP:
40120 CASE_CFN_LOG:
40121 CASE_CFN_LOG2:
40122 CASE_CFN_LOG10:
40123 if (el_mode == DFmode && n == 2)
40124 {
40125 name[4] = 'd';
40126 name[5] = '2';
40127 }
40128 else if (el_mode == SFmode && n == 4)
40129 {
40130 name[4] = 's';
40131 name[5] = '4';
40132 }
40133 else
40134 return NULL_TREE;
40135 break;
40136
40137 default:
40138 return NULL_TREE;
40139 }
40140
40141 tree fndecl = mathfn_built_in (TREE_TYPE (type_in), fn);
40142 bname = IDENTIFIER_POINTER (DECL_NAME (fndecl));
40143 sprintf (name + 7, "%s", bname+10);
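/* Worked example (illustrative): for sin over 2 x double the resulting
   name is "__vrd2_sin", and for sinf over 4 x float it is "__vrs4_sinf".  */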
40144
40145 arity = 0;
40146 for (args = DECL_ARGUMENTS (fndecl); args; args = TREE_CHAIN (args))
40147 arity++;
40148
40149 if (arity == 1)
40150 fntype = build_function_type_list (type_out, type_in, NULL);
40151 else
40152 fntype = build_function_type_list (type_out, type_in, type_in, NULL);
40153
40154 /* Build a function declaration for the vectorized function. */
40155 new_fndecl = build_decl (BUILTINS_LOCATION,
40156 FUNCTION_DECL, get_identifier (name), fntype);
40157 TREE_PUBLIC (new_fndecl) = 1;
40158 DECL_EXTERNAL (new_fndecl) = 1;
40159 DECL_IS_NOVOPS (new_fndecl) = 1;
40160 TREE_READONLY (new_fndecl) = 1;
40161
40162 return new_fndecl;
40163 }
40164
40165 /* Returns a decl of a function that implements gather load with
40166 memory type MEM_VECTYPE, index type INDEX_TYPE and scale SCALE.
40167 Return NULL_TREE if it is not available. */
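/* For example (illustrative): gathering V8SFmode data with V8SImode
   indices and scale 4 selects IX86_BUILTIN_GATHERSIV8SF on an AVX2-only
   target, or IX86_BUILTIN_GATHER3SIV8SF when TARGET_AVX512VL is set.  */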
40168
40169 static tree
40170 ix86_vectorize_builtin_gather (const_tree mem_vectype,
40171 const_tree index_type, int scale)
40172 {
40173 bool si;
40174 enum ix86_builtins code;
40175
40176 if (! TARGET_AVX2)
40177 return NULL_TREE;
40178
40179 if ((TREE_CODE (index_type) != INTEGER_TYPE
40180 && !POINTER_TYPE_P (index_type))
40181 || (TYPE_MODE (index_type) != SImode
40182 && TYPE_MODE (index_type) != DImode))
40183 return NULL_TREE;
40184
40185 if (TYPE_PRECISION (index_type) > POINTER_SIZE)
40186 return NULL_TREE;
40187
40188 /* v*gather* insn sign extends index to pointer mode. */
40189 if (TYPE_PRECISION (index_type) < POINTER_SIZE
40190 && TYPE_UNSIGNED (index_type))
40191 return NULL_TREE;
40192
40193 if (scale <= 0
40194 || scale > 8
40195 || (scale & (scale - 1)) != 0)
40196 return NULL_TREE;
40197
40198 si = TYPE_MODE (index_type) == SImode;
40199 switch (TYPE_MODE (mem_vectype))
40200 {
40201 case V2DFmode:
40202 if (TARGET_AVX512VL)
40203 code = si ? IX86_BUILTIN_GATHER3SIV2DF : IX86_BUILTIN_GATHER3DIV2DF;
40204 else
40205 code = si ? IX86_BUILTIN_GATHERSIV2DF : IX86_BUILTIN_GATHERDIV2DF;
40206 break;
40207 case V4DFmode:
40208 if (TARGET_AVX512VL)
40209 code = si ? IX86_BUILTIN_GATHER3ALTSIV4DF : IX86_BUILTIN_GATHER3DIV4DF;
40210 else
40211 code = si ? IX86_BUILTIN_GATHERALTSIV4DF : IX86_BUILTIN_GATHERDIV4DF;
40212 break;
40213 case V2DImode:
40214 if (TARGET_AVX512VL)
40215 code = si ? IX86_BUILTIN_GATHER3SIV2DI : IX86_BUILTIN_GATHER3DIV2DI;
40216 else
40217 code = si ? IX86_BUILTIN_GATHERSIV2DI : IX86_BUILTIN_GATHERDIV2DI;
40218 break;
40219 case V4DImode:
40220 if (TARGET_AVX512VL)
40221 code = si ? IX86_BUILTIN_GATHER3ALTSIV4DI : IX86_BUILTIN_GATHER3DIV4DI;
40222 else
40223 code = si ? IX86_BUILTIN_GATHERALTSIV4DI : IX86_BUILTIN_GATHERDIV4DI;
40224 break;
40225 case V4SFmode:
40226 if (TARGET_AVX512VL)
40227 code = si ? IX86_BUILTIN_GATHER3SIV4SF : IX86_BUILTIN_GATHER3DIV4SF;
40228 else
40229 code = si ? IX86_BUILTIN_GATHERSIV4SF : IX86_BUILTIN_GATHERDIV4SF;
40230 break;
40231 case V8SFmode:
40232 if (TARGET_AVX512VL)
40233 code = si ? IX86_BUILTIN_GATHER3SIV8SF : IX86_BUILTIN_GATHER3ALTDIV8SF;
40234 else
40235 code = si ? IX86_BUILTIN_GATHERSIV8SF : IX86_BUILTIN_GATHERALTDIV8SF;
40236 break;
40237 case V4SImode:
40238 if (TARGET_AVX512VL)
40239 code = si ? IX86_BUILTIN_GATHER3SIV4SI : IX86_BUILTIN_GATHER3DIV4SI;
40240 else
40241 code = si ? IX86_BUILTIN_GATHERSIV4SI : IX86_BUILTIN_GATHERDIV4SI;
40242 break;
40243 case V8SImode:
40244 if (TARGET_AVX512VL)
40245 code = si ? IX86_BUILTIN_GATHER3SIV8SI : IX86_BUILTIN_GATHER3ALTDIV8SI;
40246 else
40247 code = si ? IX86_BUILTIN_GATHERSIV8SI : IX86_BUILTIN_GATHERALTDIV8SI;
40248 break;
40249 case V8DFmode:
40250 if (TARGET_AVX512F)
40251 code = si ? IX86_BUILTIN_GATHER3ALTSIV8DF : IX86_BUILTIN_GATHER3DIV8DF;
40252 else
40253 return NULL_TREE;
40254 break;
40255 case V8DImode:
40256 if (TARGET_AVX512F)
40257 code = si ? IX86_BUILTIN_GATHER3ALTSIV8DI : IX86_BUILTIN_GATHER3DIV8DI;
40258 else
40259 return NULL_TREE;
40260 break;
40261 case V16SFmode:
40262 if (TARGET_AVX512F)
40263 code = si ? IX86_BUILTIN_GATHER3SIV16SF : IX86_BUILTIN_GATHER3ALTDIV16SF;
40264 else
40265 return NULL_TREE;
40266 break;
40267 case V16SImode:
40268 if (TARGET_AVX512F)
40269 code = si ? IX86_BUILTIN_GATHER3SIV16SI : IX86_BUILTIN_GATHER3ALTDIV16SI;
40270 else
40271 return NULL_TREE;
40272 break;
40273 default:
40274 return NULL_TREE;
40275 }
40276
40277 return ix86_get_builtin (code);
40278 }
40279
40280 /* Returns a decl of a function that implements scatter store with
40281 register type VECTYPE and index type INDEX_TYPE and SCALE.
40282 Return NULL_TREE if it is not available. */
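/* For example (illustrative): scattering V16SFmode data with V16SImode
   indices selects IX86_BUILTIN_SCATTERSIV16SF; with DImode indices the
   IX86_BUILTIN_SCATTERALTDIV16SF variant is used instead.  */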
40283
40284 static tree
40285 ix86_vectorize_builtin_scatter (const_tree vectype,
40286 const_tree index_type, int scale)
40287 {
40288 bool si;
40289 enum ix86_builtins code;
40290
40291 if (!TARGET_AVX512F)
40292 return NULL_TREE;
40293
40294 if ((TREE_CODE (index_type) != INTEGER_TYPE
40295 && !POINTER_TYPE_P (index_type))
40296 || (TYPE_MODE (index_type) != SImode
40297 && TYPE_MODE (index_type) != DImode))
40298 return NULL_TREE;
40299
40300 if (TYPE_PRECISION (index_type) > POINTER_SIZE)
40301 return NULL_TREE;
40302
40303 /* v*scatter* insn sign extends index to pointer mode. */
40304 if (TYPE_PRECISION (index_type) < POINTER_SIZE
40305 && TYPE_UNSIGNED (index_type))
40306 return NULL_TREE;
40307
40308 /* Scale can be 1, 2, 4 or 8. */
40309 if (scale <= 0
40310 || scale > 8
40311 || (scale & (scale - 1)) != 0)
40312 return NULL_TREE;
40313
40314 si = TYPE_MODE (index_type) == SImode;
40315 switch (TYPE_MODE (vectype))
40316 {
40317 case V8DFmode:
40318 code = si ? IX86_BUILTIN_SCATTERALTSIV8DF : IX86_BUILTIN_SCATTERDIV8DF;
40319 break;
40320 case V8DImode:
40321 code = si ? IX86_BUILTIN_SCATTERALTSIV8DI : IX86_BUILTIN_SCATTERDIV8DI;
40322 break;
40323 case V16SFmode:
40324 code = si ? IX86_BUILTIN_SCATTERSIV16SF : IX86_BUILTIN_SCATTERALTDIV16SF;
40325 break;
40326 case V16SImode:
40327 code = si ? IX86_BUILTIN_SCATTERSIV16SI : IX86_BUILTIN_SCATTERALTDIV16SI;
40328 break;
40329 default:
40330 return NULL_TREE;
40331 }
40332
40333 return ix86_builtins[code];
40334 }
40335
40336 /* Return true if it is safe to use the rsqrt optabs to optimize
40337 1.0/sqrt. */
40338
40339 static bool
40340 use_rsqrt_p ()
40341 {
40342 return (TARGET_SSE_MATH
40343 && flag_finite_math_only
40344 && !flag_trapping_math
40345 && flag_unsafe_math_optimizations);
40346 }
40347
40348 /* Returns a decl for a target-specific builtin that implements the
40349 reciprocal of the function FNDECL, or NULL_TREE if not available. */
40350
40351 static tree
40352 ix86_builtin_reciprocal (tree fndecl)
40353 {
40354 switch (DECL_FUNCTION_CODE (fndecl))
40355 {
40356 /* Vectorized version of sqrt to rsqrt conversion. */
40357 case IX86_BUILTIN_SQRTPS_NR:
40358 return ix86_get_builtin (IX86_BUILTIN_RSQRTPS_NR);
40359
40360 case IX86_BUILTIN_SQRTPS_NR256:
40361 return ix86_get_builtin (IX86_BUILTIN_RSQRTPS_NR256);
40362
40363 default:
40364 return NULL_TREE;
40365 }
40366 }
40367 \f
40368 /* Helper for avx_vpermilps256_operand et al. This is also used by
40369 the expansion functions to turn the parallel back into a mask.
40370 The return value is 0 for no match and the imm8+1 for a match. */
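/* Worked example (illustrative): for V4SFmode with the parallel
   (0 3 2 1), each element stays within the single 128-bit lane and the
   mask is built as 0 | (3 << 2) | (2 << 4) | (1 << 6) = 0x6c, so the
   function returns 0x6d.  */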
40371
40372 int
40373 avx_vpermilp_parallel (rtx par, machine_mode mode)
40374 {
40375 unsigned i, nelt = GET_MODE_NUNITS (mode);
40376 unsigned mask = 0;
40377 unsigned char ipar[16] = {}; /* Silence -Wuninitialized warning. */
40378
40379 if (XVECLEN (par, 0) != (int) nelt)
40380 return 0;
40381
40382 /* Validate that all of the elements are constants, and not totally
40383 out of range. Copy the data into an integral array to make the
40384 subsequent checks easier. */
40385 for (i = 0; i < nelt; ++i)
40386 {
40387 rtx er = XVECEXP (par, 0, i);
40388 unsigned HOST_WIDE_INT ei;
40389
40390 if (!CONST_INT_P (er))
40391 return 0;
40392 ei = INTVAL (er);
40393 if (ei >= nelt)
40394 return 0;
40395 ipar[i] = ei;
40396 }
40397
40398 switch (mode)
40399 {
40400 case V8DFmode:
40401 /* In the 512-bit DFmode case, we can only move elements within
40402 a 128-bit lane. First fill the second part of the mask,
40403 then fallthru. */
40404 for (i = 4; i < 6; ++i)
40405 {
40406 if (ipar[i] < 4 || ipar[i] >= 6)
40407 return 0;
40408 mask |= (ipar[i] - 4) << i;
40409 }
40410 for (i = 6; i < 8; ++i)
40411 {
40412 if (ipar[i] < 6)
40413 return 0;
40414 mask |= (ipar[i] - 6) << i;
40415 }
40416 /* FALLTHRU */
40417
40418 case V4DFmode:
40419 /* In the 256-bit DFmode case, we can only move elements within
40420 a 128-bit lane. */
40421 for (i = 0; i < 2; ++i)
40422 {
40423 if (ipar[i] >= 2)
40424 return 0;
40425 mask |= ipar[i] << i;
40426 }
40427 for (i = 2; i < 4; ++i)
40428 {
40429 if (ipar[i] < 2)
40430 return 0;
40431 mask |= (ipar[i] - 2) << i;
40432 }
40433 break;
40434
40435 case V16SFmode:
40436 /* In the 512-bit SFmode case, the permutation in the upper 256 bits
40437 must mirror the permutation in the lower 256 bits. */
40438 for (i = 0; i < 8; ++i)
40439 if (ipar[i] + 8 != ipar[i + 8])
40440 return 0;
40441 /* FALLTHRU */
40442
40443 case V8SFmode:
40444 /* In the 256-bit SFmode case, we have full freedom of
40445 movement within the low 128-bit lane, but the high 128-bit
40446 lane must mirror the exact same pattern. */
40447 for (i = 0; i < 4; ++i)
40448 if (ipar[i] + 4 != ipar[i + 4])
40449 return 0;
40450 nelt = 4;
40451 /* FALLTHRU */
40452
40453 case V2DFmode:
40454 case V4SFmode:
40455 /* In the 128-bit case, we have full freedom in the placement of
40456 the elements from the source operand. */
40457 for (i = 0; i < nelt; ++i)
40458 mask |= ipar[i] << (i * (nelt / 2));
40459 break;
40460
40461 default:
40462 gcc_unreachable ();
40463 }
40464
40465 /* Make sure success has a non-zero value by adding one. */
40466 return mask + 1;
40467 }
40468
40469 /* Helper for avx_vperm2f128_v4df_operand et al. This is also used by
40470 the expansion functions to turn the parallel back into a mask.
40471 The return value is 0 for no match and the imm8+1 for a match. */
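/* Worked example (illustrative): for V8SFmode with the parallel
   (8 9 10 11 4 5 6 7), the low half selects lane 2 of the concatenated
   input (elements 8..11) and the high half selects lane 1 (elements
   4..7), giving mask 0x12 and a return value of 0x13.  */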
40472
40473 int
40474 avx_vperm2f128_parallel (rtx par, machine_mode mode)
40475 {
40476 unsigned i, nelt = GET_MODE_NUNITS (mode), nelt2 = nelt / 2;
40477 unsigned mask = 0;
40478 unsigned char ipar[8] = {}; /* Silence -Wuninitialized warning. */
40479
40480 if (XVECLEN (par, 0) != (int) nelt)
40481 return 0;
40482
40483 /* Validate that all of the elements are constants, and not totally
40484 out of range. Copy the data into an integral array to make the
40485 subsequent checks easier. */
40486 for (i = 0; i < nelt; ++i)
40487 {
40488 rtx er = XVECEXP (par, 0, i);
40489 unsigned HOST_WIDE_INT ei;
40490
40491 if (!CONST_INT_P (er))
40492 return 0;
40493 ei = INTVAL (er);
40494 if (ei >= 2 * nelt)
40495 return 0;
40496 ipar[i] = ei;
40497 }
40498
40499 /* Validate that each half of the permute selects a contiguous, ascending run of elements. */
40500 for (i = 0; i < nelt2 - 1; ++i)
40501 if (ipar[i] + 1 != ipar[i + 1])
40502 return 0;
40503 for (i = nelt2; i < nelt - 1; ++i)
40504 if (ipar[i] + 1 != ipar[i + 1])
40505 return 0;
40506
40507 /* Reconstruct the mask. */
40508 for (i = 0; i < 2; ++i)
40509 {
40510 unsigned e = ipar[i * nelt2];
40511 if (e % nelt2)
40512 return 0;
40513 e /= nelt2;
40514 mask |= e << (i * 4);
40515 }
40516
40517 /* Make sure success has a non-zero value by adding one. */
40518 return mask + 1;
40519 }
40520 \f
40521 /* Return a register priority for hard reg REGNO. */
40522 static int
40523 ix86_register_priority (int hard_regno)
40524 {
40525 /* ebp and r13 as the base always want a displacement, and r12 as the
40526 base always wants an index. So discourage their usage in an
40527 address. */
40528 if (hard_regno == R12_REG || hard_regno == R13_REG)
40529 return 0;
40530 if (hard_regno == BP_REG)
40531 return 1;
40532 /* New x86-64 int registers result in bigger code size. Discourage
40533 them. */
40534 if (FIRST_REX_INT_REG <= hard_regno && hard_regno <= LAST_REX_INT_REG)
40535 return 2;
40536 /* New x86-64 SSE registers result in bigger code size. Discourage
40537 them. */
40538 if (FIRST_REX_SSE_REG <= hard_regno && hard_regno <= LAST_REX_SSE_REG)
40539 return 2;
40540 /* Usage of AX register results in smaller code. Prefer it. */
40541 if (hard_regno == AX_REG)
40542 return 4;
40543 return 3;
40544 }
40545
40546 /* Implement TARGET_PREFERRED_RELOAD_CLASS.
40547
40548 Put float CONST_DOUBLE in the constant pool instead of fp regs.
40549 QImode must go into class Q_REGS.
40550 Narrow ALL_REGS to GENERAL_REGS. This supports allowing movsf and
40551 movdf to do mem-to-mem moves through integer regs. */
40552
40553 static reg_class_t
40554 ix86_preferred_reload_class (rtx x, reg_class_t regclass)
40555 {
40556 machine_mode mode = GET_MODE (x);
40557
40558 /* We're only allowed to return a subclass of CLASS. Many of the
40559 following checks fail for NO_REGS, so eliminate that early. */
40560 if (regclass == NO_REGS)
40561 return NO_REGS;
40562
40563 /* All classes can load zeros. */
40564 if (x == CONST0_RTX (mode))
40565 return regclass;
40566
40567 /* Force constants into memory if we are loading a (nonzero) constant into
40568 an MMX, SSE or MASK register. This is because there are no MMX/SSE/MASK
40569 instructions to load from a constant. */
40570 if (CONSTANT_P (x)
40571 && (MAYBE_MMX_CLASS_P (regclass)
40572 || MAYBE_SSE_CLASS_P (regclass)
40573 || MAYBE_MASK_CLASS_P (regclass)))
40574 return NO_REGS;
40575
40576 /* Floating-point constants need more complex checks. */
40577 if (CONST_DOUBLE_P (x))
40578 {
40579 /* General regs can load everything. */
40580 if (INTEGER_CLASS_P (regclass))
40581 return regclass;
40582
40583 /* Floats can load 0 and 1 plus some others. Note that we eliminated
40584 zero above. We only want to wind up preferring 80387 registers if
40585 we plan on doing computation with them. */
40586 if (IS_STACK_MODE (mode)
40587 && standard_80387_constant_p (x) > 0)
40588 {
40589 /* Limit class to FP regs. */
40590 if (FLOAT_CLASS_P (regclass))
40591 return FLOAT_REGS;
40592 else if (regclass == FP_TOP_SSE_REGS)
40593 return FP_TOP_REG;
40594 else if (regclass == FP_SECOND_SSE_REGS)
40595 return FP_SECOND_REG;
40596 }
40597
40598 return NO_REGS;
40599 }
40600
40601 /* Prefer SSE regs only, if we can use them for math. */
40602 if (SSE_FLOAT_MODE_P (mode) && TARGET_SSE_MATH)
40603 return SSE_CLASS_P (regclass) ? regclass : NO_REGS;
40604
40605 /* Generally when we see PLUS here, it's the function invariant
40606 (plus soft-fp const_int). Which can only be computed into general
40607 regs. */
40608 if (GET_CODE (x) == PLUS)
40609 return INTEGER_CLASS_P (regclass) ? regclass : NO_REGS;
40610
40611 /* QImode constants are easy to load, but non-constant QImode data
40612 must go into Q_REGS. */
40613 if (GET_MODE (x) == QImode && !CONSTANT_P (x))
40614 {
40615 if (Q_CLASS_P (regclass))
40616 return regclass;
40617 else if (reg_class_subset_p (Q_REGS, regclass))
40618 return Q_REGS;
40619 else
40620 return NO_REGS;
40621 }
40622
40623 return regclass;
40624 }
40625
40626 /* Discourage putting floating-point values in SSE registers unless
40627 SSE math is being used, and likewise for the 387 registers. */
40628 static reg_class_t
40629 ix86_preferred_output_reload_class (rtx x, reg_class_t regclass)
40630 {
40631 machine_mode mode = GET_MODE (x);
40632
40633 /* Restrict the output reload class to the register bank that we are doing
40634 math on. If we would like not to return a subset of CLASS, reject this
40635 alternative: if reload cannot do this, it will still use its choice. */
40636 mode = GET_MODE (x);
40637 if (SSE_FLOAT_MODE_P (mode) && TARGET_SSE_MATH)
40638 return MAYBE_SSE_CLASS_P (regclass) ? ALL_SSE_REGS : NO_REGS;
40639
40640 if (IS_STACK_MODE (mode))
40641 {
40642 if (regclass == FP_TOP_SSE_REGS)
40643 return FP_TOP_REG;
40644 else if (regclass == FP_SECOND_SSE_REGS)
40645 return FP_SECOND_REG;
40646 else
40647 return FLOAT_CLASS_P (regclass) ? regclass : NO_REGS;
40648 }
40649
40650 return regclass;
40651 }
40652
40653 static reg_class_t
40654 ix86_secondary_reload (bool in_p, rtx x, reg_class_t rclass,
40655 machine_mode mode, secondary_reload_info *sri)
40656 {
40657 /* Double-word spills from general registers to non-offsettable memory
40658 references (zero-extended addresses) require special handling. */
40659 if (TARGET_64BIT
40660 && MEM_P (x)
40661 && GET_MODE_SIZE (mode) > UNITS_PER_WORD
40662 && INTEGER_CLASS_P (rclass)
40663 && !offsettable_memref_p (x))
40664 {
40665 sri->icode = (in_p
40666 ? CODE_FOR_reload_noff_load
40667 : CODE_FOR_reload_noff_store);
40668 /* Add the cost of moving address to a temporary. */
40669 sri->extra_cost = 1;
40670
40671 return NO_REGS;
40672 }
40673
40674 /* QImode spills from non-QI registers require
40675 intermediate register on 32bit targets. */
40676 if (mode == QImode
40677 && ((!TARGET_64BIT && !in_p
40678 && INTEGER_CLASS_P (rclass)
40679 && MAYBE_NON_Q_CLASS_P (rclass))
40680 || (!TARGET_AVX512DQ
40681 && MAYBE_MASK_CLASS_P (rclass))))
40682 {
40683 int regno = true_regnum (x);
40684
40685 /* Return Q_REGS if the operand is in memory. */
40686 if (regno == -1)
40687 return Q_REGS;
40688
40689 return NO_REGS;
40690 }
40691
40692 /* This condition handles corner case where an expression involving
40693 pointers gets vectorized. We're trying to use the address of a
40694 stack slot as a vector initializer.
40695
40696 (set (reg:V2DI 74 [ vect_cst_.2 ])
40697 (vec_duplicate:V2DI (reg/f:DI 20 frame)))
40698
40699 Eventually frame gets turned into sp+offset like this:
40700
40701 (set (reg:V2DI 21 xmm0 [orig:74 vect_cst_.2 ] [74])
40702 (vec_duplicate:V2DI (plus:DI (reg/f:DI 7 sp)
40703 (const_int 392 [0x188]))))
40704
40705 That later gets turned into:
40706
40707 (set (reg:V2DI 21 xmm0 [orig:74 vect_cst_.2 ] [74])
40708 (vec_duplicate:V2DI (plus:DI (reg/f:DI 7 sp)
40709 (mem/u/c/i:DI (symbol_ref/u:DI ("*.LC0") [flags 0x2]) [0 S8 A64]))))
40710
40711 We'll have the following reload recorded:
40712
40713 Reload 0: reload_in (DI) =
40714 (plus:DI (reg/f:DI 7 sp)
40715 (mem/u/c/i:DI (symbol_ref/u:DI ("*.LC0") [flags 0x2]) [0 S8 A64]))
40716 reload_out (V2DI) = (reg:V2DI 21 xmm0 [orig:74 vect_cst_.2 ] [74])
40717 SSE_REGS, RELOAD_OTHER (opnum = 0), can't combine
40718 reload_in_reg: (plus:DI (reg/f:DI 7 sp) (const_int 392 [0x188]))
40719 reload_out_reg: (reg:V2DI 21 xmm0 [orig:74 vect_cst_.2 ] [74])
40720 reload_reg_rtx: (reg:V2DI 22 xmm1)
40721
40722 Which isn't going to work since SSE instructions can't handle scalar
40723 additions. Returning GENERAL_REGS forces the addition into integer
40724 register and reload can handle subsequent reloads without problems. */
40725
40726 if (in_p && GET_CODE (x) == PLUS
40727 && SSE_CLASS_P (rclass)
40728 && SCALAR_INT_MODE_P (mode))
40729 return GENERAL_REGS;
40730
40731 return NO_REGS;
40732 }
40733
40734 /* Implement TARGET_CLASS_LIKELY_SPILLED_P. */
40735
40736 static bool
40737 ix86_class_likely_spilled_p (reg_class_t rclass)
40738 {
40739 switch (rclass)
40740 {
40741 case AREG:
40742 case DREG:
40743 case CREG:
40744 case BREG:
40745 case AD_REGS:
40746 case SIREG:
40747 case DIREG:
40748 case SSE_FIRST_REG:
40749 case FP_TOP_REG:
40750 case FP_SECOND_REG:
40751 case BND_REGS:
40752 return true;
40753
40754 default:
40755 break;
40756 }
40757
40758 return false;
40759 }
40760
40761 /* If we are copying between registers from different register sets
40762 (e.g. FP and integer), we may need a memory location.
40763
40764 The function can't work reliably when one of the CLASSES is a class
40765 containing registers from multiple sets. We avoid this by never combining
40766 different sets in a single alternative in the machine description.
40767 Ensure that this constraint holds to avoid unexpected surprises.
40768
40769 When STRICT is false, we are being called from REGISTER_MOVE_COST,
40770 so do not enforce these sanity checks.
40771
40772 To optimize register_move_cost performance, define inline variant. */
40773
40774 static inline bool
40775 inline_secondary_memory_needed (enum reg_class class1, enum reg_class class2,
40776 machine_mode mode, int strict)
40777 {
40778 if (lra_in_progress && (class1 == NO_REGS || class2 == NO_REGS))
40779 return false;
40780
40781 if (MAYBE_FLOAT_CLASS_P (class1) != FLOAT_CLASS_P (class1)
40782 || MAYBE_FLOAT_CLASS_P (class2) != FLOAT_CLASS_P (class2)
40783 || MAYBE_SSE_CLASS_P (class1) != SSE_CLASS_P (class1)
40784 || MAYBE_SSE_CLASS_P (class2) != SSE_CLASS_P (class2)
40785 || MAYBE_MMX_CLASS_P (class1) != MMX_CLASS_P (class1)
40786 || MAYBE_MMX_CLASS_P (class2) != MMX_CLASS_P (class2)
40787 || MAYBE_MASK_CLASS_P (class1) != MASK_CLASS_P (class1)
40788 || MAYBE_MASK_CLASS_P (class2) != MASK_CLASS_P (class2))
40789 {
40790 gcc_assert (!strict || lra_in_progress);
40791 return true;
40792 }
40793
40794 if (FLOAT_CLASS_P (class1) != FLOAT_CLASS_P (class2))
40795 return true;
40796
40797 /* Between mask and general, we have moves no larger than word size. */
40798 if ((MASK_CLASS_P (class1) != MASK_CLASS_P (class2))
40799 && (GET_MODE_SIZE (mode) > UNITS_PER_WORD))
40800 return true;
40801
40802 /* ??? This is a lie. We do have moves between mmx/general, and for
40803 mmx/sse2. But by saying we need secondary memory we discourage the
40804 register allocator from using the mmx registers unless needed. */
40805 if (MMX_CLASS_P (class1) != MMX_CLASS_P (class2))
40806 return true;
40807
40808 if (SSE_CLASS_P (class1) != SSE_CLASS_P (class2))
40809 {
40810 /* SSE1 doesn't have any direct moves from other classes. */
40811 if (!TARGET_SSE2)
40812 return true;
40813
40814 /* If the target says that inter-unit moves are more expensive
40815 than moving through memory, then don't generate them. */
40816 if ((SSE_CLASS_P (class1) && !TARGET_INTER_UNIT_MOVES_FROM_VEC)
40817 || (SSE_CLASS_P (class2) && !TARGET_INTER_UNIT_MOVES_TO_VEC))
40818 return true;
40819
40820 /* Between SSE and general, we have moves no larger than word size. */
40821 if (GET_MODE_SIZE (mode) > UNITS_PER_WORD)
40822 return true;
40823 }
40824
40825 return false;
40826 }
40827
40828 bool
40829 ix86_secondary_memory_needed (enum reg_class class1, enum reg_class class2,
40830 machine_mode mode, int strict)
40831 {
40832 return inline_secondary_memory_needed (class1, class2, mode, strict);
40833 }
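
/* Illustrative example (not part of the original sources): with the checks
   above, a DImode copy between SSE_REGS and GENERAL_REGS on a 32-bit target
   (UNITS_PER_WORD == 4) reports that secondary memory is needed, because
   GET_MODE_SIZE (DImode) == 8 exceeds the word size; an SImode copy between
   the same classes does not, provided SSE2 and inter-unit moves are enabled.
   Any copy that crosses the MMX/non-MMX boundary reports true, per the
   comment in inline_secondary_memory_needed above.  */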
40834
40835 /* Implement the TARGET_CLASS_MAX_NREGS hook.
40836
40837 On the 80386, this is the size of MODE in words,
40838 except in the FP regs, where a single reg is always enough. */
40839
40840 static unsigned char
40841 ix86_class_max_nregs (reg_class_t rclass, machine_mode mode)
40842 {
40843 if (MAYBE_INTEGER_CLASS_P (rclass))
40844 {
40845 if (mode == XFmode)
40846 return (TARGET_64BIT ? 2 : 3);
40847 else if (mode == XCmode)
40848 return (TARGET_64BIT ? 4 : 6);
40849 else
40850 return CEIL (GET_MODE_SIZE (mode), UNITS_PER_WORD);
40851 }
40852 else
40853 {
40854 if (COMPLEX_MODE_P (mode))
40855 return 2;
40856 else
40857 return 1;
40858 }
40859 }
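
/* Worked examples of the hook above (illustrative only): for an integer
   class on ia32 (UNITS_PER_WORD == 4), DImode needs CEIL (8, 4) == 2
   registers, XFmode needs 3 and XCmode needs 6; on x86_64 the XFmode and
   XCmode cases shrink to 2 and 4.  For FP/SSE/MMX classes a single register
   always suffices, except for complex modes, which take 2.  */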
40860
40861 /* Return true if the registers in CLASS cannot represent the change from
40862 mode FROM to mode TO. */
40863
40864 bool
40865 ix86_cannot_change_mode_class (machine_mode from, machine_mode to,
40866 enum reg_class regclass)
40867 {
40868 if (from == to)
40869 return false;
40870
40871 /* x87 registers can't do subreg at all, as all values are reformatted
40872 to extended precision. */
40873 if (MAYBE_FLOAT_CLASS_P (regclass))
40874 return true;
40875
40876 if (MAYBE_SSE_CLASS_P (regclass) || MAYBE_MMX_CLASS_P (regclass))
40877 {
40878 /* Vector registers do not support QI or HImode loads. If we don't
40879 disallow a change to these modes, reload will assume it's ok to
40880 drop the subreg from (subreg:SI (reg:HI 100) 0). This affects
40881 the vec_dupv4hi pattern. */
40882 if (GET_MODE_SIZE (from) < 4)
40883 return true;
40884 }
40885
40886 return false;
40887 }
40888
40889 /* Return the cost of moving data of mode M between a
40890 register and memory. A value of 2 is the default; this cost is
40891 relative to those in `REGISTER_MOVE_COST'.
40892
40893 This function is used extensively by register_move_cost, which is used
40894 to build tables at startup, so make it inline here.
40895 When IN is 2, return the maximum of the in and out move costs.
40896
40897 If moving between registers and memory is more expensive than
40898 between two registers, you should define this macro to express the
40899 relative cost.
40900
40901 Also model the increased cost of moving QImode registers in
40902 non-Q_REGS classes.
40903 */
40904 static inline int
40905 inline_memory_move_cost (machine_mode mode, enum reg_class regclass,
40906 int in)
40907 {
40908 int cost;
40909 if (FLOAT_CLASS_P (regclass))
40910 {
40911 int index;
40912 switch (mode)
40913 {
40914 case SFmode:
40915 index = 0;
40916 break;
40917 case DFmode:
40918 index = 1;
40919 break;
40920 case XFmode:
40921 index = 2;
40922 break;
40923 default:
40924 return 100;
40925 }
40926 if (in == 2)
40927 return MAX (ix86_cost->fp_load [index], ix86_cost->fp_store [index]);
40928 return in ? ix86_cost->fp_load [index] : ix86_cost->fp_store [index];
40929 }
40930 if (SSE_CLASS_P (regclass))
40931 {
40932 int index;
40933 switch (GET_MODE_SIZE (mode))
40934 {
40935 case 4:
40936 index = 0;
40937 break;
40938 case 8:
40939 index = 1;
40940 break;
40941 case 16:
40942 index = 2;
40943 break;
40944 default:
40945 return 100;
40946 }
40947 if (in == 2)
40948 return MAX (ix86_cost->sse_load [index], ix86_cost->sse_store [index]);
40949 return in ? ix86_cost->sse_load [index] : ix86_cost->sse_store [index];
40950 }
40951 if (MMX_CLASS_P (regclass))
40952 {
40953 int index;
40954 switch (GET_MODE_SIZE (mode))
40955 {
40956 case 4:
40957 index = 0;
40958 break;
40959 case 8:
40960 index = 1;
40961 break;
40962 default:
40963 return 100;
40964 }
40965 if (in == 2)
40966 return MAX (ix86_cost->mmx_load [index], ix86_cost->mmx_store [index]);
40967 return in ? ix86_cost->mmx_load [index] : ix86_cost->mmx_store [index];
40968 }
40969 switch (GET_MODE_SIZE (mode))
40970 {
40971 case 1:
40972 if (Q_CLASS_P (regclass) || TARGET_64BIT)
40973 {
40974 if (!in)
40975 return ix86_cost->int_store[0];
40976 if (TARGET_PARTIAL_REG_DEPENDENCY
40977 && optimize_function_for_speed_p (cfun))
40978 cost = ix86_cost->movzbl_load;
40979 else
40980 cost = ix86_cost->int_load[0];
40981 if (in == 2)
40982 return MAX (cost, ix86_cost->int_store[0]);
40983 return cost;
40984 }
40985 else
40986 {
40987 if (in == 2)
40988 return MAX (ix86_cost->movzbl_load, ix86_cost->int_store[0] + 4);
40989 if (in)
40990 return ix86_cost->movzbl_load;
40991 else
40992 return ix86_cost->int_store[0] + 4;
40993 }
40994 break;
40995 case 2:
40996 if (in == 2)
40997 return MAX (ix86_cost->int_load[1], ix86_cost->int_store[1]);
40998 return in ? ix86_cost->int_load[1] : ix86_cost->int_store[1];
40999 default:
41000 /* Compute the number of 32-bit moves needed. TFmode is moved as XFmode. */
41001 if (mode == TFmode)
41002 mode = XFmode;
41003 if (in == 2)
41004 cost = MAX (ix86_cost->int_load[2] , ix86_cost->int_store[2]);
41005 else if (in)
41006 cost = ix86_cost->int_load[2];
41007 else
41008 cost = ix86_cost->int_store[2];
41009 return cost * CEIL ((int) GET_MODE_SIZE (mode), UNITS_PER_WORD);
41010 }
41011 }
41012
41013 static int
41014 ix86_memory_move_cost (machine_mode mode, reg_class_t regclass,
41015 bool in)
41016 {
41017 return inline_memory_move_cost (mode, (enum reg_class) regclass, in ? 1 : 0);
41018 }
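
/* Illustrative example (not from the original sources): for an SSE class and
   a 16-byte mode, the switch in inline_memory_move_cost selects index 2, so
   the result is ix86_cost->sse_load[2] when IN is nonzero,
   ix86_cost->sse_store[2] when IN is zero, and the maximum of the two when
   IN == 2 (the "both directions" query used by register_move_cost below).  */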
41019
41020
41021 /* Return the cost of moving data from a register in class CLASS1 to
41022 one in class CLASS2.
41023
41024 It is not required that the cost always equal 2 when FROM is the same as TO;
41025 on some machines it is expensive to move between registers if they are not
41026 general registers. */
41027
41028 static int
41029 ix86_register_move_cost (machine_mode mode, reg_class_t class1_i,
41030 reg_class_t class2_i)
41031 {
41032 enum reg_class class1 = (enum reg_class) class1_i;
41033 enum reg_class class2 = (enum reg_class) class2_i;
41034
41035 /* In case we require secondary memory, compute the cost of the store
41036 followed by the load. In order to avoid bad register allocation choices,
41037 we need this to be *at least* as high as the symmetric MEMORY_MOVE_COST. */
41038
41039 if (inline_secondary_memory_needed (class1, class2, mode, 0))
41040 {
41041 int cost = 1;
41042
41043 cost += inline_memory_move_cost (mode, class1, 2);
41044 cost += inline_memory_move_cost (mode, class2, 2);
41045
41046 /* When copying from a general purpose register we may emit multiple
41047 stores followed by a single load, causing a memory size mismatch stall.
41048 Count this as an arbitrarily high cost of 20. */
41049 if (targetm.class_max_nregs (class1, mode)
41050 > targetm.class_max_nregs (class2, mode))
41051 cost += 20;
41052
41053 /* In the case of FP/MMX moves, the registers actually overlap, and we
41054 have to switch modes in order to treat them differently. */
41055 if ((MMX_CLASS_P (class1) && MAYBE_FLOAT_CLASS_P (class2))
41056 || (MMX_CLASS_P (class2) && MAYBE_FLOAT_CLASS_P (class1)))
41057 cost += 20;
41058
41059 return cost;
41060 }
41061
41062 /* Moves between SSE/MMX and integer unit are expensive. */
41063 if (MMX_CLASS_P (class1) != MMX_CLASS_P (class2)
41064 || SSE_CLASS_P (class1) != SSE_CLASS_P (class2))
41065
41066 /* ??? By keeping the returned value relatively high, we limit the number
41067 of moves between integer and MMX/SSE registers for all targets.
41068 Additionally, a high value avoids a problem with ix86_modes_tieable_p (),
41069 where integer modes in MMX/SSE registers are not tieable
41070 because of missing QImode and HImode moves to, from or between
41071 MMX/SSE registers. */
41072 return MAX (8, ix86_cost->mmxsse_to_integer);
41073
41074 if (MAYBE_FLOAT_CLASS_P (class1))
41075 return ix86_cost->fp_move;
41076 if (MAYBE_SSE_CLASS_P (class1))
41077 return ix86_cost->sse_move;
41078 if (MAYBE_MMX_CLASS_P (class1))
41079 return ix86_cost->mmx_move;
41080 return 2;
41081 }
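
/* Worked example of the cost above (illustrative only): when secondary
   memory is needed, say for a DImode move between GENERAL_REGS and SSE_REGS
   on ia32, the cost is
     1 + inline_memory_move_cost (DImode, GENERAL_REGS, 2)
       + inline_memory_move_cost (DImode, SSE_REGS, 2),
   plus 20 if the source class needs more hard registers than the destination
   (several narrow stores followed by one wide load), plus another 20 for the
   FP/MMX register overlap case.  */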
41082
41083 /* Return TRUE if hard register REGNO can hold a value of machine-mode
41084 MODE. */
41085
41086 bool
41087 ix86_hard_regno_mode_ok (int regno, machine_mode mode)
41088 {
41089 /* Flags, and only flags, can hold CCmode values. */
41090 if (CC_REGNO_P (regno))
41091 return GET_MODE_CLASS (mode) == MODE_CC;
41092 if (GET_MODE_CLASS (mode) == MODE_CC
41093 || GET_MODE_CLASS (mode) == MODE_RANDOM
41094 || GET_MODE_CLASS (mode) == MODE_PARTIAL_INT)
41095 return false;
41096 if (STACK_REGNO_P (regno))
41097 return VALID_FP_MODE_P (mode);
41098 if (MASK_REGNO_P (regno))
41099 return (VALID_MASK_REG_MODE (mode)
41100 || (TARGET_AVX512BW
41101 && VALID_MASK_AVX512BW_MODE (mode)));
41102 if (BND_REGNO_P (regno))
41103 return VALID_BND_REG_MODE (mode);
41104 if (SSE_REGNO_P (regno))
41105 {
41106 /* We implement the move patterns for all vector modes into and
41107 out of SSE registers, even when no operation instructions
41108 are available. */
41109
41110 /* For AVX-512 we allow, regardless of regno:
41111 - XI mode
41112 - any 512-bit wide vector mode
41113 - any scalar mode. */
41114 if (TARGET_AVX512F
41115 && (mode == XImode
41116 || VALID_AVX512F_REG_MODE (mode)
41117 || VALID_AVX512F_SCALAR_MODE (mode)))
41118 return true;
41119
41120 /* For AVX-5124FMAPS allow V64SFmode for special regnos. */
41121 if ((TARGET_AVX5124FMAPS || TARGET_AVX5124VNNIW)
41122 && MOD4_SSE_REGNO_P (regno)
41123 && mode == V64SFmode)
41124 return true;
41125
41126 /* For AVX-5124VNNIW allow V64SImode for special regnos. */
41127 if ((TARGET_AVX5124FMAPS || TARGET_AVX5124VNNIW)
41128 && MOD4_SSE_REGNO_P (regno)
41129 && mode == V64SImode)
41130 return true;
41131
41132 /* TODO check for QI/HI scalars. */
41133 /* AVX512VL allows SSE regs 16+ for 128/256-bit modes. */
41134 if (TARGET_AVX512VL
41135 && (mode == OImode
41136 || mode == TImode
41137 || VALID_AVX256_REG_MODE (mode)
41138 || VALID_AVX512VL_128_REG_MODE (mode)))
41139 return true;
41140
41141 /* xmm16-xmm31 are only available for AVX-512. */
41142 if (EXT_REX_SSE_REGNO_P (regno))
41143 return false;
41144
41145 /* OImode and AVX modes are available only when AVX is enabled. */
41146 return ((TARGET_AVX
41147 && VALID_AVX256_REG_OR_OI_MODE (mode))
41148 || VALID_SSE_REG_MODE (mode)
41149 || VALID_SSE2_REG_MODE (mode)
41150 || VALID_MMX_REG_MODE (mode)
41151 || VALID_MMX_REG_MODE_3DNOW (mode));
41152 }
41153 if (MMX_REGNO_P (regno))
41154 {
41155 /* We implement the move patterns for 3DNOW modes even in MMX mode,
41156 so if the register is available at all, then we can move data of
41157 the given mode into or out of it. */
41158 return (VALID_MMX_REG_MODE (mode)
41159 || VALID_MMX_REG_MODE_3DNOW (mode));
41160 }
41161
41162 if (mode == QImode)
41163 {
41164 /* Take care with QImode values: they can live in non-QI regs,
41165 but then they do cause partial register stalls. */
41166 if (ANY_QI_REGNO_P (regno))
41167 return true;
41168 if (!TARGET_PARTIAL_REG_STALL)
41169 return true;
41170 /* LRA checks if the hard register is OK for the given mode.
41171 QImode values can live in non-QI regs, so we allow all
41172 registers here. */
41173 if (lra_in_progress)
41174 return true;
41175 return !can_create_pseudo_p ();
41176 }
41177 /* We handle both integers and floats in the general purpose registers. */
41178 else if (VALID_INT_MODE_P (mode))
41179 return true;
41180 else if (VALID_FP_MODE_P (mode))
41181 return true;
41182 else if (VALID_DFP_MODE_P (mode))
41183 return true;
41184 /* Lots of MMX code casts 8 byte vector modes to DImode. If we then go
41185 on to use that value in smaller contexts, this can easily force a
41186 pseudo to be allocated to GENERAL_REGS. Since this is no worse than
41187 supporting DImode, allow it. */
41188 else if (VALID_MMX_REG_MODE_3DNOW (mode) || VALID_MMX_REG_MODE (mode))
41189 return true;
41190
41191 return false;
41192 }
41193
41194 /* A subroutine of ix86_modes_tieable_p. Return true if MODE is a
41195 tieable integer mode. */
41196
41197 static bool
41198 ix86_tieable_integer_mode_p (machine_mode mode)
41199 {
41200 switch (mode)
41201 {
41202 case HImode:
41203 case SImode:
41204 return true;
41205
41206 case QImode:
41207 return TARGET_64BIT || !TARGET_PARTIAL_REG_STALL;
41208
41209 case DImode:
41210 return TARGET_64BIT;
41211
41212 default:
41213 return false;
41214 }
41215 }
41216
41217 /* Return true if MODE1 is accessible in a register that can hold MODE2
41218 without copying. That is, all register classes that can hold MODE2
41219 can also hold MODE1. */
41220
41221 bool
41222 ix86_modes_tieable_p (machine_mode mode1, machine_mode mode2)
41223 {
41224 if (mode1 == mode2)
41225 return true;
41226
41227 if (ix86_tieable_integer_mode_p (mode1)
41228 && ix86_tieable_integer_mode_p (mode2))
41229 return true;
41230
41231 /* MODE2 being XFmode implies fp stack or general regs, which means we
41232 can tie any smaller floating point modes to it. Note that we do not
41233 tie this with TFmode. */
41234 if (mode2 == XFmode)
41235 return mode1 == SFmode || mode1 == DFmode;
41236
41237 /* MODE2 being DFmode implies fp stack, general or sse regs, which means
41238 that we can tie it with SFmode. */
41239 if (mode2 == DFmode)
41240 return mode1 == SFmode;
41241
41242 /* If MODE2 is only appropriate for an SSE register, then tie with
41243 any other mode acceptable to SSE registers. */
41244 if (GET_MODE_SIZE (mode2) == 32
41245 && ix86_hard_regno_mode_ok (FIRST_SSE_REG, mode2))
41246 return (GET_MODE_SIZE (mode1) == 32
41247 && ix86_hard_regno_mode_ok (FIRST_SSE_REG, mode1));
41248 if (GET_MODE_SIZE (mode2) == 16
41249 && ix86_hard_regno_mode_ok (FIRST_SSE_REG, mode2))
41250 return (GET_MODE_SIZE (mode1) == 16
41251 && ix86_hard_regno_mode_ok (FIRST_SSE_REG, mode1));
41252
41253 /* If MODE2 is appropriate for an MMX register, then tie
41254 with any other mode acceptable to MMX registers. */
41255 if (GET_MODE_SIZE (mode2) == 8
41256 && ix86_hard_regno_mode_ok (FIRST_MMX_REG, mode2))
41257 return (GET_MODE_SIZE (mode1) == 8
41258 && ix86_hard_regno_mode_ok (FIRST_MMX_REG, mode1));
41259
41260 return false;
41261 }
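
/* Illustrative examples of the tying rules above (not part of the original
   sources): SImode and HImode tie with each other on any target; DFmode ties
   with SFmode; XFmode ties with SFmode and DFmode but not with TFmode; two
   16-byte vector modes such as V4SFmode and V2DImode tie whenever both are
   valid in SSE registers; and two 8-byte modes tie when both are valid in
   MMX registers.  */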
41262
41263 /* Return the cost of moving between two registers of mode MODE. */
41264
41265 static int
41266 ix86_set_reg_reg_cost (machine_mode mode)
41267 {
41268 unsigned int units = UNITS_PER_WORD;
41269
41270 switch (GET_MODE_CLASS (mode))
41271 {
41272 default:
41273 break;
41274
41275 case MODE_CC:
41276 units = GET_MODE_SIZE (CCmode);
41277 break;
41278
41279 case MODE_FLOAT:
41280 if ((TARGET_SSE && mode == TFmode)
41281 || (TARGET_80387 && mode == XFmode)
41282 || ((TARGET_80387 || TARGET_SSE2) && mode == DFmode)
41283 || ((TARGET_80387 || TARGET_SSE) && mode == SFmode))
41284 units = GET_MODE_SIZE (mode);
41285 break;
41286
41287 case MODE_COMPLEX_FLOAT:
41288 if ((TARGET_SSE && mode == TCmode)
41289 || (TARGET_80387 && mode == XCmode)
41290 || ((TARGET_80387 || TARGET_SSE2) && mode == DCmode)
41291 || ((TARGET_80387 || TARGET_SSE) && mode == SCmode))
41292 units = GET_MODE_SIZE (mode);
41293 break;
41294
41295 case MODE_VECTOR_INT:
41296 case MODE_VECTOR_FLOAT:
41297 if ((TARGET_AVX512F && VALID_AVX512F_REG_MODE (mode))
41298 || (TARGET_AVX && VALID_AVX256_REG_MODE (mode))
41299 || (TARGET_SSE2 && VALID_SSE2_REG_MODE (mode))
41300 || (TARGET_SSE && VALID_SSE_REG_MODE (mode))
41301 || (TARGET_MMX && VALID_MMX_REG_MODE (mode)))
41302 units = GET_MODE_SIZE (mode);
41303 }
41304
41305 /* Return the cost of moving between two registers of mode MODE,
41306 assuming that the move will be in pieces of at most UNITS bytes. */
41307 return COSTS_N_INSNS (CEIL (GET_MODE_SIZE (mode), units));
41308 }
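
/* Worked example (illustrative only): a DImode register-to-register copy on
   ia32 is moved in UNITS_PER_WORD == 4 byte pieces, so the cost is
   COSTS_N_INSNS (CEIL (8, 4)) == COSTS_N_INSNS (2); a V4SFmode copy with SSE
   enabled uses UNITS == 16, giving COSTS_N_INSNS (1).  */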
41309
41310 /* Compute a (partial) cost for rtx X. Return true if the complete
41311 cost has been computed, and false if subexpressions should be
41312 scanned. In either case, *TOTAL contains the cost result. */
41313
41314 static bool
41315 ix86_rtx_costs (rtx x, machine_mode mode, int outer_code_i, int opno,
41316 int *total, bool speed)
41317 {
41318 rtx mask;
41319 enum rtx_code code = GET_CODE (x);
41320 enum rtx_code outer_code = (enum rtx_code) outer_code_i;
41321 const struct processor_costs *cost = speed ? ix86_cost : &ix86_size_cost;
41322 int src_cost;
41323
41324 switch (code)
41325 {
41326 case SET:
41327 if (register_operand (SET_DEST (x), VOIDmode)
41328 && reg_or_0_operand (SET_SRC (x), VOIDmode))
41329 {
41330 *total = ix86_set_reg_reg_cost (GET_MODE (SET_DEST (x)));
41331 return true;
41332 }
41333
41334 if (register_operand (SET_SRC (x), VOIDmode))
41335 /* Avoid potentially incorrect high cost from rtx_costs
41336 for non-tieable SUBREGs. */
41337 src_cost = 0;
41338 else
41339 {
41340 src_cost = rtx_cost (SET_SRC (x), mode, SET, 1, speed);
41341
41342 if (CONSTANT_P (SET_SRC (x)))
41343 /* Constant costs assume a base value of COSTS_N_INSNS (1) and add
41344 a small value, possibly zero for cheap constants. */
41345 src_cost += COSTS_N_INSNS (1);
41346 }
41347
41348 *total = src_cost + rtx_cost (SET_DEST (x), mode, SET, 0, speed);
41349 return true;
41350
41351 case CONST_INT:
41352 case CONST:
41353 case LABEL_REF:
41354 case SYMBOL_REF:
41355 if (TARGET_64BIT && !x86_64_immediate_operand (x, VOIDmode))
41356 *total = 3;
41357 else if (TARGET_64BIT && !x86_64_zext_immediate_operand (x, VOIDmode))
41358 *total = 2;
41359 else if (flag_pic && SYMBOLIC_CONST (x)
41360 && !(TARGET_64BIT
41361 && (GET_CODE (x) == LABEL_REF
41362 || (GET_CODE (x) == SYMBOL_REF
41363 && SYMBOL_REF_LOCAL_P (x))))
41364 /* Use 0 cost for CONST to improve its propagation. */
41365 && (TARGET_64BIT || GET_CODE (x) != CONST))
41366 *total = 1;
41367 else
41368 *total = 0;
41369 return true;
41370
41371 case CONST_DOUBLE:
41372 if (IS_STACK_MODE (mode))
41373 switch (standard_80387_constant_p (x))
41374 {
41375 case -1:
41376 case 0:
41377 break;
41378 case 1: /* 0.0 */
41379 *total = 1;
41380 return true;
41381 default: /* Other constants */
41382 *total = 2;
41383 return true;
41384 }
41385 /* FALLTHRU */
41386
41387 case CONST_VECTOR:
41388 switch (standard_sse_constant_p (x, mode))
41389 {
41390 case 0:
41391 break;
41392 case 1: /* 0: xor eliminates false dependency */
41393 *total = 0;
41394 return true;
41395 default: /* -1: cmp contains false dependency */
41396 *total = 1;
41397 return true;
41398 }
41399 /* FALLTHRU */
41400
41401 case CONST_WIDE_INT:
41402 /* Fall back to (MEM (SYMBOL_REF)), since that's where
41403 it'll probably end up. Add a penalty for size. */
41404 *total = (COSTS_N_INSNS (1)
41405 + (!TARGET_64BIT && flag_pic)
41406 + (GET_MODE_SIZE (mode) <= 4
41407 ? 0 : GET_MODE_SIZE (mode) <= 8 ? 1 : 2));
41408 return true;
41409
41410 case ZERO_EXTEND:
41411 /* Zero extension is often completely free on x86_64, so make
41412 it as cheap as possible. */
41413 if (TARGET_64BIT && mode == DImode
41414 && GET_MODE (XEXP (x, 0)) == SImode)
41415 *total = 1;
41416 else if (TARGET_ZERO_EXTEND_WITH_AND)
41417 *total = cost->add;
41418 else
41419 *total = cost->movzx;
41420 return false;
41421
41422 case SIGN_EXTEND:
41423 *total = cost->movsx;
41424 return false;
41425
41426 case ASHIFT:
41427 if (SCALAR_INT_MODE_P (mode)
41428 && GET_MODE_SIZE (mode) < UNITS_PER_WORD
41429 && CONST_INT_P (XEXP (x, 1)))
41430 {
41431 HOST_WIDE_INT value = INTVAL (XEXP (x, 1));
41432 if (value == 1)
41433 {
41434 *total = cost->add;
41435 return false;
41436 }
41437 if ((value == 2 || value == 3)
41438 && cost->lea <= cost->shift_const)
41439 {
41440 *total = cost->lea;
41441 return false;
41442 }
41443 }
41444 /* FALLTHRU */
41445
41446 case ROTATE:
41447 case ASHIFTRT:
41448 case LSHIFTRT:
41449 case ROTATERT:
41450 if (GET_MODE_CLASS (mode) == MODE_VECTOR_INT)
41451 {
41452 /* ??? Should be SSE vector operation cost. */
41453 /* At least for published AMD latencies, this really is the same
41454 as the latency for a simple fpu operation like fabs. */
41455 /* V*QImode is emulated with 1-11 insns. */
41456 if (mode == V16QImode || mode == V32QImode)
41457 {
41458 int count = 11;
41459 if (TARGET_XOP && mode == V16QImode)
41460 {
41461 /* For XOP we use vpshab, which requires a broadcast of the
41462 value to the variable shift insn. For constants this
41463 means a V16Q const in mem; even when we can perform the
41464 shift with one insn, set the cost to prefer paddb. */
41465 if (CONSTANT_P (XEXP (x, 1)))
41466 {
41467 *total = (cost->fabs
41468 + rtx_cost (XEXP (x, 0), mode, code, 0, speed)
41469 + (speed ? 2 : COSTS_N_BYTES (16)));
41470 return true;
41471 }
41472 count = 3;
41473 }
41474 else if (TARGET_SSSE3)
41475 count = 7;
41476 *total = cost->fabs * count;
41477 }
41478 else
41479 *total = cost->fabs;
41480 }
41481 else if (GET_MODE_SIZE (mode) > UNITS_PER_WORD)
41482 {
41483 if (CONST_INT_P (XEXP (x, 1)))
41484 {
41485 if (INTVAL (XEXP (x, 1)) > 32)
41486 *total = cost->shift_const + COSTS_N_INSNS (2);
41487 else
41488 *total = cost->shift_const * 2;
41489 }
41490 else
41491 {
41492 if (GET_CODE (XEXP (x, 1)) == AND)
41493 *total = cost->shift_var * 2;
41494 else
41495 *total = cost->shift_var * 6 + COSTS_N_INSNS (2);
41496 }
41497 }
41498 else
41499 {
41500 if (CONST_INT_P (XEXP (x, 1)))
41501 *total = cost->shift_const;
41502 else if (SUBREG_P (XEXP (x, 1))
41503 && GET_CODE (XEXP (XEXP (x, 1), 0)) == AND)
41504 {
41505 /* Return the cost after shift-and truncation. */
41506 *total = cost->shift_var;
41507 return true;
41508 }
41509 else
41510 *total = cost->shift_var;
41511 }
41512 return false;
41513
41514 case FMA:
41515 {
41516 rtx sub;
41517
41518 gcc_assert (FLOAT_MODE_P (mode));
41519 gcc_assert (TARGET_FMA || TARGET_FMA4 || TARGET_AVX512F);
41520
41521 /* ??? SSE scalar/vector cost should be used here. */
41522 /* ??? Bald assumption that fma has the same cost as fmul. */
41523 *total = cost->fmul;
41524 *total += rtx_cost (XEXP (x, 1), mode, FMA, 1, speed);
41525
41526 /* Negate in op0 or op2 is free: FMS, FNMA, FNMS. */
41527 sub = XEXP (x, 0);
41528 if (GET_CODE (sub) == NEG)
41529 sub = XEXP (sub, 0);
41530 *total += rtx_cost (sub, mode, FMA, 0, speed);
41531
41532 sub = XEXP (x, 2);
41533 if (GET_CODE (sub) == NEG)
41534 sub = XEXP (sub, 0);
41535 *total += rtx_cost (sub, mode, FMA, 2, speed);
41536 return true;
41537 }
41538
41539 case MULT:
41540 if (SSE_FLOAT_MODE_P (mode) && TARGET_SSE_MATH)
41541 {
41542 /* ??? SSE scalar cost should be used here. */
41543 *total = cost->fmul;
41544 return false;
41545 }
41546 else if (X87_FLOAT_MODE_P (mode))
41547 {
41548 *total = cost->fmul;
41549 return false;
41550 }
41551 else if (FLOAT_MODE_P (mode))
41552 {
41553 /* ??? SSE vector cost should be used here. */
41554 *total = cost->fmul;
41555 return false;
41556 }
41557 else if (GET_MODE_CLASS (mode) == MODE_VECTOR_INT)
41558 {
41559 /* V*QImode is emulated with 7-13 insns. */
41560 if (mode == V16QImode || mode == V32QImode)
41561 {
41562 int extra = 11;
41563 if (TARGET_XOP && mode == V16QImode)
41564 extra = 5;
41565 else if (TARGET_SSSE3)
41566 extra = 6;
41567 *total = cost->fmul * 2 + cost->fabs * extra;
41568 }
41569 /* V*DImode is emulated with 5-8 insns. */
41570 else if (mode == V2DImode || mode == V4DImode)
41571 {
41572 if (TARGET_XOP && mode == V2DImode)
41573 *total = cost->fmul * 2 + cost->fabs * 3;
41574 else
41575 *total = cost->fmul * 3 + cost->fabs * 5;
41576 }
41577 /* Without sse4.1, we don't have PMULLD; it's emulated with 7
41578 insns, including two PMULUDQ. */
41579 else if (mode == V4SImode && !(TARGET_SSE4_1 || TARGET_AVX))
41580 *total = cost->fmul * 2 + cost->fabs * 5;
41581 else
41582 *total = cost->fmul;
41583 return false;
41584 }
41585 else
41586 {
41587 rtx op0 = XEXP (x, 0);
41588 rtx op1 = XEXP (x, 1);
41589 int nbits;
41590 if (CONST_INT_P (XEXP (x, 1)))
41591 {
41592 unsigned HOST_WIDE_INT value = INTVAL (XEXP (x, 1));
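/* The loop below is the classic clear-lowest-set-bit trick: each
   "value &= value - 1" removes one set bit, so NBITS ends up as the
   population count of the constant multiplier.  For example, a multiplier
   of 0x28 (two bits set) gives nbits == 2.  (Explanatory comment only.)  */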
41593 for (nbits = 0; value != 0; value &= value - 1)
41594 nbits++;
41595 }
41596 else
41597 /* This is arbitrary. */
41598 nbits = 7;
41599
41600 /* Compute costs correctly for widening multiplication. */
41601 if ((GET_CODE (op0) == SIGN_EXTEND || GET_CODE (op0) == ZERO_EXTEND)
41602 && GET_MODE_SIZE (GET_MODE (XEXP (op0, 0))) * 2
41603 == GET_MODE_SIZE (mode))
41604 {
41605 int is_mulwiden = 0;
41606 machine_mode inner_mode = GET_MODE (op0);
41607
41608 if (GET_CODE (op0) == GET_CODE (op1))
41609 is_mulwiden = 1, op1 = XEXP (op1, 0);
41610 else if (CONST_INT_P (op1))
41611 {
41612 if (GET_CODE (op0) == SIGN_EXTEND)
41613 is_mulwiden = trunc_int_for_mode (INTVAL (op1), inner_mode)
41614 == INTVAL (op1);
41615 else
41616 is_mulwiden = !(INTVAL (op1) & ~GET_MODE_MASK (inner_mode));
41617 }
41618
41619 if (is_mulwiden)
41620 op0 = XEXP (op0, 0), mode = GET_MODE (op0);
41621 }
41622
41623 *total = (cost->mult_init[MODE_INDEX (mode)]
41624 + nbits * cost->mult_bit
41625 + rtx_cost (op0, mode, outer_code, opno, speed)
41626 + rtx_cost (op1, mode, outer_code, opno, speed));
41627
41628 return true;
41629 }
41630
41631 case DIV:
41632 case UDIV:
41633 case MOD:
41634 case UMOD:
41635 if (SSE_FLOAT_MODE_P (mode) && TARGET_SSE_MATH)
41636 /* ??? SSE cost should be used here. */
41637 *total = cost->fdiv;
41638 else if (X87_FLOAT_MODE_P (mode))
41639 *total = cost->fdiv;
41640 else if (FLOAT_MODE_P (mode))
41641 /* ??? SSE vector cost should be used here. */
41642 *total = cost->fdiv;
41643 else
41644 *total = cost->divide[MODE_INDEX (mode)];
41645 return false;
41646
41647 case PLUS:
41648 if (GET_MODE_CLASS (mode) == MODE_INT
41649 && GET_MODE_SIZE (mode) <= UNITS_PER_WORD)
41650 {
41651 if (GET_CODE (XEXP (x, 0)) == PLUS
41652 && GET_CODE (XEXP (XEXP (x, 0), 0)) == MULT
41653 && CONST_INT_P (XEXP (XEXP (XEXP (x, 0), 0), 1))
41654 && CONSTANT_P (XEXP (x, 1)))
41655 {
41656 HOST_WIDE_INT val = INTVAL (XEXP (XEXP (XEXP (x, 0), 0), 1));
41657 if (val == 2 || val == 4 || val == 8)
41658 {
41659 *total = cost->lea;
41660 *total += rtx_cost (XEXP (XEXP (x, 0), 1), mode,
41661 outer_code, opno, speed);
41662 *total += rtx_cost (XEXP (XEXP (XEXP (x, 0), 0), 0), mode,
41663 outer_code, opno, speed);
41664 *total += rtx_cost (XEXP (x, 1), mode,
41665 outer_code, opno, speed);
41666 return true;
41667 }
41668 }
41669 else if (GET_CODE (XEXP (x, 0)) == MULT
41670 && CONST_INT_P (XEXP (XEXP (x, 0), 1)))
41671 {
41672 HOST_WIDE_INT val = INTVAL (XEXP (XEXP (x, 0), 1));
41673 if (val == 2 || val == 4 || val == 8)
41674 {
41675 *total = cost->lea;
41676 *total += rtx_cost (XEXP (XEXP (x, 0), 0), mode,
41677 outer_code, opno, speed);
41678 *total += rtx_cost (XEXP (x, 1), mode,
41679 outer_code, opno, speed);
41680 return true;
41681 }
41682 }
41683 else if (GET_CODE (XEXP (x, 0)) == PLUS)
41684 {
41685 /* Add with carry, ignore the cost of adding a carry flag. */
41686 if (ix86_carry_flag_operator (XEXP (XEXP (x, 0), 0), mode))
41687 *total = cost->add;
41688 else
41689 {
41690 *total = cost->lea;
41691 *total += rtx_cost (XEXP (XEXP (x, 0), 0), mode,
41692 outer_code, opno, speed);
41693 }
41694
41695 *total += rtx_cost (XEXP (XEXP (x, 0), 1), mode,
41696 outer_code, opno, speed);
41697 *total += rtx_cost (XEXP (x, 1), mode,
41698 outer_code, opno, speed);
41699 return true;
41700 }
41701 }
41702 /* FALLTHRU */
41703
41704 case MINUS:
41705 /* Subtract with borrow, ignore the cost of subtracting a carry flag. */
41706 if (GET_MODE_CLASS (mode) == MODE_INT
41707 && GET_MODE_SIZE (mode) <= UNITS_PER_WORD
41708 && GET_CODE (XEXP (x, 0)) == MINUS
41709 && ix86_carry_flag_operator (XEXP (XEXP (x, 0), 1), mode))
41710 {
41711 *total = cost->add;
41712 *total += rtx_cost (XEXP (XEXP (x, 0), 0), mode,
41713 outer_code, opno, speed);
41714 *total += rtx_cost (XEXP (x, 1), mode,
41715 outer_code, opno, speed);
41716 return true;
41717 }
41718
41719 if (SSE_FLOAT_MODE_P (mode) && TARGET_SSE_MATH)
41720 {
41721 /* ??? SSE cost should be used here. */
41722 *total = cost->fadd;
41723 return false;
41724 }
41725 else if (X87_FLOAT_MODE_P (mode))
41726 {
41727 *total = cost->fadd;
41728 return false;
41729 }
41730 else if (FLOAT_MODE_P (mode))
41731 {
41732 /* ??? SSE vector cost should be used here. */
41733 *total = cost->fadd;
41734 return false;
41735 }
41736 /* FALLTHRU */
41737
41738 case AND:
41739 case IOR:
41740 case XOR:
41741 if (GET_MODE_CLASS (mode) == MODE_INT
41742 && GET_MODE_SIZE (mode) > UNITS_PER_WORD)
41743 {
41744 *total = (cost->add * 2
41745 + (rtx_cost (XEXP (x, 0), mode, outer_code, opno, speed)
41746 << (GET_MODE (XEXP (x, 0)) != DImode))
41747 + (rtx_cost (XEXP (x, 1), mode, outer_code, opno, speed)
41748 << (GET_MODE (XEXP (x, 1)) != DImode)));
41749 return true;
41750 }
41751 /* FALLTHRU */
41752
41753 case NEG:
41754 if (SSE_FLOAT_MODE_P (mode) && TARGET_SSE_MATH)
41755 {
41756 /* ??? SSE cost should be used here. */
41757 *total = cost->fchs;
41758 return false;
41759 }
41760 else if (X87_FLOAT_MODE_P (mode))
41761 {
41762 *total = cost->fchs;
41763 return false;
41764 }
41765 else if (FLOAT_MODE_P (mode))
41766 {
41767 /* ??? SSE vector cost should be used here. */
41768 *total = cost->fchs;
41769 return false;
41770 }
41771 /* FALLTHRU */
41772
41773 case NOT:
41774 if (GET_MODE_CLASS (mode) == MODE_VECTOR_INT)
41775 {
41776 /* ??? Should be SSE vector operation cost. */
41777 /* At least for published AMD latencies, this really is the same
41778 as the latency for a simple fpu operation like fabs. */
41779 *total = cost->fabs;
41780 }
41781 else if (GET_MODE_SIZE (mode) > UNITS_PER_WORD)
41782 *total = cost->add * 2;
41783 else
41784 *total = cost->add;
41785 return false;
41786
41787 case COMPARE:
41788 if (GET_CODE (XEXP (x, 0)) == ZERO_EXTRACT
41789 && XEXP (XEXP (x, 0), 1) == const1_rtx
41790 && CONST_INT_P (XEXP (XEXP (x, 0), 2))
41791 && XEXP (x, 1) == const0_rtx)
41792 {
41793 /* This kind of construct is implemented using test[bwl].
41794 Treat it as if we had an AND. */
41795 mode = GET_MODE (XEXP (XEXP (x, 0), 0));
41796 *total = (cost->add
41797 + rtx_cost (XEXP (XEXP (x, 0), 0), mode, outer_code,
41798 opno, speed)
41799 + rtx_cost (const1_rtx, mode, outer_code, opno, speed));
41800 return true;
41801 }
41802
41803 /* The embedded comparison operand is completely free. */
41804 if (!general_operand (XEXP (x, 0), GET_MODE (XEXP (x, 0)))
41805 && XEXP (x, 1) == const0_rtx)
41806 *total = 0;
41807
41808 return false;
41809
41810 case FLOAT_EXTEND:
41811 if (!(SSE_FLOAT_MODE_P (mode) && TARGET_SSE_MATH))
41812 *total = 0;
41813 return false;
41814
41815 case ABS:
41816 if (SSE_FLOAT_MODE_P (mode) && TARGET_SSE_MATH)
41817 /* ??? SSE cost should be used here. */
41818 *total = cost->fabs;
41819 else if (X87_FLOAT_MODE_P (mode))
41820 *total = cost->fabs;
41821 else if (FLOAT_MODE_P (mode))
41822 /* ??? SSE vector cost should be used here. */
41823 *total = cost->fabs;
41824 return false;
41825
41826 case SQRT:
41827 if (SSE_FLOAT_MODE_P (mode) && TARGET_SSE_MATH)
41828 /* ??? SSE cost should be used here. */
41829 *total = cost->fsqrt;
41830 else if (X87_FLOAT_MODE_P (mode))
41831 *total = cost->fsqrt;
41832 else if (FLOAT_MODE_P (mode))
41833 /* ??? SSE vector cost should be used here. */
41834 *total = cost->fsqrt;
41835 return false;
41836
41837 case UNSPEC:
41838 if (XINT (x, 1) == UNSPEC_TP)
41839 *total = 0;
41840 return false;
41841
41842 case VEC_SELECT:
41843 case VEC_CONCAT:
41844 case VEC_DUPLICATE:
41845 /* ??? Assume all of these vector manipulation patterns are
41846 recognizable, in which case they all pretty much have the
41847 same cost. */
41848 *total = cost->fabs;
41849 return true;
41850 case VEC_MERGE:
41851 mask = XEXP (x, 2);
41852 /* This is a masked instruction; assume the same cost
41853 as the non-masked variant. */
41854 if (TARGET_AVX512F && register_operand (mask, GET_MODE (mask)))
41855 *total = rtx_cost (XEXP (x, 0), mode, outer_code, opno, speed);
41856 else
41857 *total = cost->fabs;
41858 return true;
41859
41860 default:
41861 return false;
41862 }
41863 }
41864
41865 #if TARGET_MACHO
41866
41867 static int current_machopic_label_num;
41868
41869 /* Given a symbol name and its associated stub, write out the
41870 definition of the stub. */
41871
41872 void
41873 machopic_output_stub (FILE *file, const char *symb, const char *stub)
41874 {
41875 unsigned int length;
41876 char *binder_name, *symbol_name, lazy_ptr_name[32];
41877 int label = ++current_machopic_label_num;
41878
41879 /* For 64-bit we shouldn't get here. */
41880 gcc_assert (!TARGET_64BIT);
41881
41882 /* Lose our funky encoding stuff so it doesn't contaminate the stub. */
41883 symb = targetm.strip_name_encoding (symb);
41884
41885 length = strlen (stub);
41886 binder_name = XALLOCAVEC (char, length + 32);
41887 GEN_BINDER_NAME_FOR_STUB (binder_name, stub, length);
41888
41889 length = strlen (symb);
41890 symbol_name = XALLOCAVEC (char, length + 32);
41891 GEN_SYMBOL_NAME_FOR_SYMBOL (symbol_name, symb, length);
41892
41893 sprintf (lazy_ptr_name, "L%d$lz", label);
41894
41895 if (MACHOPIC_ATT_STUB)
41896 switch_to_section (darwin_sections[machopic_picsymbol_stub3_section]);
41897 else if (MACHOPIC_PURE)
41898 switch_to_section (darwin_sections[machopic_picsymbol_stub2_section]);
41899 else
41900 switch_to_section (darwin_sections[machopic_symbol_stub_section]);
41901
41902 fprintf (file, "%s:\n", stub);
41903 fprintf (file, "\t.indirect_symbol %s\n", symbol_name);
41904
41905 if (MACHOPIC_ATT_STUB)
41906 {
41907 fprintf (file, "\thlt ; hlt ; hlt ; hlt ; hlt\n");
41908 }
41909 else if (MACHOPIC_PURE)
41910 {
41911 /* PIC stub. */
41912 /* 25-byte PIC stub using "CALL get_pc_thunk". */
41913 rtx tmp = gen_rtx_REG (SImode, 2 /* ECX */);
41914 output_set_got (tmp, NULL_RTX); /* "CALL ___<cpu>.get_pc_thunk.cx". */
41915 fprintf (file, "LPC$%d:\tmovl\t%s-LPC$%d(%%ecx),%%ecx\n",
41916 label, lazy_ptr_name, label);
41917 fprintf (file, "\tjmp\t*%%ecx\n");
41918 }
41919 else
41920 fprintf (file, "\tjmp\t*%s\n", lazy_ptr_name);
41921
41922 /* The AT&T-style ("self-modifying") stub is not lazily bound, thus
41923 it needs no stub-binding-helper. */
41924 if (MACHOPIC_ATT_STUB)
41925 return;
41926
41927 fprintf (file, "%s:\n", binder_name);
41928
41929 if (MACHOPIC_PURE)
41930 {
41931 fprintf (file, "\tlea\t%s-%s(%%ecx),%%ecx\n", lazy_ptr_name, binder_name);
41932 fprintf (file, "\tpushl\t%%ecx\n");
41933 }
41934 else
41935 fprintf (file, "\tpushl\t$%s\n", lazy_ptr_name);
41936
41937 fputs ("\tjmp\tdyld_stub_binding_helper\n", file);
41938
41939 /* N.B. Keep the correspondence of these
41940 'symbol_ptr/symbol_ptr2/symbol_ptr3' sections consistent with the
41941 old-pic/new-pic/non-pic stubs; altering this will break
41942 compatibility with existing dylibs. */
41943 if (MACHOPIC_PURE)
41944 {
41945 /* 25-byte PIC stub using "CALL get_pc_thunk". */
41946 switch_to_section (darwin_sections[machopic_lazy_symbol_ptr2_section]);
41947 }
41948 else
41949 /* 16-byte -mdynamic-no-pic stub. */
41950 switch_to_section(darwin_sections[machopic_lazy_symbol_ptr3_section]);
41951
41952 fprintf (file, "%s:\n", lazy_ptr_name);
41953 fprintf (file, "\t.indirect_symbol %s\n", symbol_name);
41954 fprintf (file, ASM_LONG "%s\n", binder_name);
41955 }
41956 #endif /* TARGET_MACHO */
41957
41958 /* Order the registers for the register allocator. */
41959
41960 void
41961 x86_order_regs_for_local_alloc (void)
41962 {
41963 int pos = 0;
41964 int i;
41965
41966 /* First allocate the local general purpose registers. */
41967 for (i = 0; i < FIRST_PSEUDO_REGISTER; i++)
41968 if (GENERAL_REGNO_P (i) && call_used_regs[i])
41969 reg_alloc_order [pos++] = i;
41970
41971 /* Global general purpose registers. */
41972 for (i = 0; i < FIRST_PSEUDO_REGISTER; i++)
41973 if (GENERAL_REGNO_P (i) && !call_used_regs[i])
41974 reg_alloc_order [pos++] = i;
41975
41976 /* x87 registers come first in case we are doing FP math
41977 using them. */
41978 if (!TARGET_SSE_MATH)
41979 for (i = FIRST_STACK_REG; i <= LAST_STACK_REG; i++)
41980 reg_alloc_order [pos++] = i;
41981
41982 /* SSE registers. */
41983 for (i = FIRST_SSE_REG; i <= LAST_SSE_REG; i++)
41984 reg_alloc_order [pos++] = i;
41985 for (i = FIRST_REX_SSE_REG; i <= LAST_REX_SSE_REG; i++)
41986 reg_alloc_order [pos++] = i;
41987
41988 /* Extended REX SSE registers. */
41989 for (i = FIRST_EXT_REX_SSE_REG; i <= LAST_EXT_REX_SSE_REG; i++)
41990 reg_alloc_order [pos++] = i;
41991
41992 /* Mask registers. */
41993 for (i = FIRST_MASK_REG; i <= LAST_MASK_REG; i++)
41994 reg_alloc_order [pos++] = i;
41995
41996 /* MPX bound registers. */
41997 for (i = FIRST_BND_REG; i <= LAST_BND_REG; i++)
41998 reg_alloc_order [pos++] = i;
41999
42000 /* x87 registers. */
42001 if (TARGET_SSE_MATH)
42002 for (i = FIRST_STACK_REG; i <= LAST_STACK_REG; i++)
42003 reg_alloc_order [pos++] = i;
42004
42005 for (i = FIRST_MMX_REG; i <= LAST_MMX_REG; i++)
42006 reg_alloc_order [pos++] = i;
42007
42008 /* Initialize the rest of the array, as we do not allocate some registers
42009 at all. */
42010 while (pos < FIRST_PSEUDO_REGISTER)
42011 reg_alloc_order [pos++] = 0;
42012 }
42013
42014 /* Handle a "callee_pop_aggregate_return" attribute; arguments as
42015 in struct attribute_spec handler. */
42016 static tree
42017 ix86_handle_callee_pop_aggregate_return (tree *node, tree name,
42018 tree args,
42019 int,
42020 bool *no_add_attrs)
42021 {
42022 if (TREE_CODE (*node) != FUNCTION_TYPE
42023 && TREE_CODE (*node) != METHOD_TYPE
42024 && TREE_CODE (*node) != FIELD_DECL
42025 && TREE_CODE (*node) != TYPE_DECL)
42026 {
42027 warning (OPT_Wattributes, "%qE attribute only applies to functions",
42028 name);
42029 *no_add_attrs = true;
42030 return NULL_TREE;
42031 }
42032 if (TARGET_64BIT)
42033 {
42034 warning (OPT_Wattributes, "%qE attribute only available for 32-bit",
42035 name);
42036 *no_add_attrs = true;
42037 return NULL_TREE;
42038 }
42039 if (is_attribute_p ("callee_pop_aggregate_return", name))
42040 {
42041 tree cst;
42042
42043 cst = TREE_VALUE (args);
42044 if (TREE_CODE (cst) != INTEGER_CST)
42045 {
42046 warning (OPT_Wattributes,
42047 "%qE attribute requires an integer constant argument",
42048 name);
42049 *no_add_attrs = true;
42050 }
42051 else if (compare_tree_int (cst, 0) != 0
42052 && compare_tree_int (cst, 1) != 0)
42053 {
42054 warning (OPT_Wattributes,
42055 "argument to %qE attribute is neither zero, nor one",
42056 name);
42057 *no_add_attrs = true;
42058 }
42059
42060 return NULL_TREE;
42061 }
42062
42063 return NULL_TREE;
42064 }
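
/* Example usage accepted by the handler above (illustrative only):

     struct big { int v[4]; };
     __attribute__ ((callee_pop_aggregate_return (1)))
     struct big get_big (void);

   The argument must be the integer constant 0 (caller pops the hidden
   return-slot pointer) or 1 (callee pops it), and the attribute is only
   available for 32-bit targets.  */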
42065
42066 /* Handle a "ms_abi" or "sysv" attribute; arguments as in
42067 struct attribute_spec.handler. */
42068 static tree
42069 ix86_handle_abi_attribute (tree *node, tree name, tree, int,
42070 bool *no_add_attrs)
42071 {
42072 if (TREE_CODE (*node) != FUNCTION_TYPE
42073 && TREE_CODE (*node) != METHOD_TYPE
42074 && TREE_CODE (*node) != FIELD_DECL
42075 && TREE_CODE (*node) != TYPE_DECL)
42076 {
42077 warning (OPT_Wattributes, "%qE attribute only applies to functions",
42078 name);
42079 *no_add_attrs = true;
42080 return NULL_TREE;
42081 }
42082
42083 /* Can combine regparm with all attributes but fastcall. */
42084 if (is_attribute_p ("ms_abi", name))
42085 {
42086 if (lookup_attribute ("sysv_abi", TYPE_ATTRIBUTES (*node)))
42087 {
42088 error ("ms_abi and sysv_abi attributes are not compatible");
42089 }
42090
42091 return NULL_TREE;
42092 }
42093 else if (is_attribute_p ("sysv_abi", name))
42094 {
42095 if (lookup_attribute ("ms_abi", TYPE_ATTRIBUTES (*node)))
42096 {
42097 error ("ms_abi and sysv_abi attributes are not compatible");
42098 }
42099
42100 return NULL_TREE;
42101 }
42102
42103 return NULL_TREE;
42104 }
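
/* Example usage accepted by the handler above (illustrative only):

     int __attribute__ ((ms_abi)) call_me_like_windows (int, int);
     int __attribute__ ((sysv_abi)) call_me_like_sysv (int, int);

   Applying both ms_abi and sysv_abi to the same function type is rejected
   by the lookup_attribute checks above.  */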
42105
42106 /* Handle a "ms_struct" or "gcc_struct" attribute; arguments as in
42107 struct attribute_spec.handler. */
42108 static tree
42109 ix86_handle_struct_attribute (tree *node, tree name, tree, int,
42110 bool *no_add_attrs)
42111 {
42112 tree *type = NULL;
42113 if (DECL_P (*node))
42114 {
42115 if (TREE_CODE (*node) == TYPE_DECL)
42116 type = &TREE_TYPE (*node);
42117 }
42118 else
42119 type = node;
42120
42121 if (!(type && RECORD_OR_UNION_TYPE_P (*type)))
42122 {
42123 warning (OPT_Wattributes, "%qE attribute ignored",
42124 name);
42125 *no_add_attrs = true;
42126 }
42127
42128 else if ((is_attribute_p ("ms_struct", name)
42129 && lookup_attribute ("gcc_struct", TYPE_ATTRIBUTES (*type)))
42130 || ((is_attribute_p ("gcc_struct", name)
42131 && lookup_attribute ("ms_struct", TYPE_ATTRIBUTES (*type)))))
42132 {
42133 warning (OPT_Wattributes, "%qE incompatible attribute ignored",
42134 name);
42135 *no_add_attrs = true;
42136 }
42137
42138 return NULL_TREE;
42139 }
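
/* Example usage accepted by the handler above (illustrative only):

     struct __attribute__ ((ms_struct)) s { char c; int i : 5; };

   The attribute only applies to record and union types; combining
   ms_struct with gcc_struct on the same type is diagnosed above.  */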
42140
42141 static tree
42142 ix86_handle_fndecl_attribute (tree *node, tree name, tree, int,
42143 bool *no_add_attrs)
42144 {
42145 if (TREE_CODE (*node) != FUNCTION_DECL)
42146 {
42147 warning (OPT_Wattributes, "%qE attribute only applies to functions",
42148 name);
42149 *no_add_attrs = true;
42150 }
42151 return NULL_TREE;
42152 }
42153
42154 static tree
42155 ix86_handle_no_caller_saved_registers_attribute (tree *, tree, tree,
42156 int, bool *)
42157 {
42158 return NULL_TREE;
42159 }
42160
42161 static tree
42162 ix86_handle_interrupt_attribute (tree *node, tree, tree, int, bool *)
42163 {
42164 /* DECL_RESULT and DECL_ARGUMENTS do not exist there yet,
42165 but the function type contains args and return type data. */
42166 tree func_type = *node;
42167 tree return_type = TREE_TYPE (func_type);
42168
42169 int nargs = 0;
42170 tree current_arg_type = TYPE_ARG_TYPES (func_type);
42171 while (current_arg_type
42172 && ! VOID_TYPE_P (TREE_VALUE (current_arg_type)))
42173 {
42174 if (nargs == 0)
42175 {
42176 if (! POINTER_TYPE_P (TREE_VALUE (current_arg_type)))
42177 error ("interrupt service routine should have a pointer "
42178 "as the first argument");
42179 }
42180 else if (nargs == 1)
42181 {
42182 if (TREE_CODE (TREE_VALUE (current_arg_type)) != INTEGER_TYPE
42183 || TYPE_MODE (TREE_VALUE (current_arg_type)) != word_mode)
42184 error ("interrupt service routine should have unsigned %s"
42185 "int as the second argument",
42186 TARGET_64BIT
42187 ? (TARGET_X32 ? "long long " : "long ")
42188 : "");
42189 }
42190 nargs++;
42191 current_arg_type = TREE_CHAIN (current_arg_type);
42192 }
42193 if (!nargs || nargs > 2)
42194 error ("interrupt service routine can only have a pointer argument "
42195 "and an optional integer argument");
42196 if (! VOID_TYPE_P (return_type))
42197 error ("interrupt service routine can't have non-void return value");
42198
42199 return NULL_TREE;
42200 }
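
/* Example of an interrupt handler signature that passes the checks above
   (illustrative only; uword_t here is a stand-in for a word-sized unsigned
   integer type):

     struct interrupt_frame;
     typedef unsigned int uword_t __attribute__ ((mode (__word__)));

     void __attribute__ ((interrupt))
     isr (struct interrupt_frame *frame);

     void __attribute__ ((interrupt))
     isr_with_error_code (struct interrupt_frame *frame, uword_t error_code);
   */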
42201
42202 static bool
42203 ix86_ms_bitfield_layout_p (const_tree record_type)
42204 {
42205 return ((TARGET_MS_BITFIELD_LAYOUT
42206 && !lookup_attribute ("gcc_struct", TYPE_ATTRIBUTES (record_type)))
42207 || lookup_attribute ("ms_struct", TYPE_ATTRIBUTES (record_type)));
42208 }
42209
42210 /* Returns an expression indicating where the this parameter is
42211 located on entry to the FUNCTION. */
42212
42213 static rtx
42214 x86_this_parameter (tree function)
42215 {
42216 tree type = TREE_TYPE (function);
42217 bool aggr = aggregate_value_p (TREE_TYPE (type), type) != 0;
42218 int nregs;
42219
42220 if (TARGET_64BIT)
42221 {
42222 const int *parm_regs;
42223
42224 if (ix86_function_type_abi (type) == MS_ABI)
42225 parm_regs = x86_64_ms_abi_int_parameter_registers;
42226 else
42227 parm_regs = x86_64_int_parameter_registers;
42228 return gen_rtx_REG (Pmode, parm_regs[aggr]);
42229 }
42230
42231 nregs = ix86_function_regparm (type, function);
42232
42233 if (nregs > 0 && !stdarg_p (type))
42234 {
42235 int regno;
42236 unsigned int ccvt = ix86_get_callcvt (type);
42237
42238 if ((ccvt & IX86_CALLCVT_FASTCALL) != 0)
42239 regno = aggr ? DX_REG : CX_REG;
42240 else if ((ccvt & IX86_CALLCVT_THISCALL) != 0)
42241 {
42242 regno = CX_REG;
42243 if (aggr)
42244 return gen_rtx_MEM (SImode,
42245 plus_constant (Pmode, stack_pointer_rtx, 4));
42246 }
42247 else
42248 {
42249 regno = AX_REG;
42250 if (aggr)
42251 {
42252 regno = DX_REG;
42253 if (nregs == 1)
42254 return gen_rtx_MEM (SImode,
42255 plus_constant (Pmode,
42256 stack_pointer_rtx, 4));
42257 }
42258 }
42259 return gen_rtx_REG (SImode, regno);
42260 }
42261
42262 return gen_rtx_MEM (SImode, plus_constant (Pmode, stack_pointer_rtx,
42263 aggr ? 8 : 4));
42264 }
42265
42266 /* Determine whether x86_output_mi_thunk can succeed. */
42267
42268 static bool
42269 x86_can_output_mi_thunk (const_tree, HOST_WIDE_INT, HOST_WIDE_INT vcall_offset,
42270 const_tree function)
42271 {
42272 /* 64-bit can handle anything. */
42273 if (TARGET_64BIT)
42274 return true;
42275
42276 /* For 32-bit, everything's fine if we have one free register. */
42277 if (ix86_function_regparm (TREE_TYPE (function), function) < 3)
42278 return true;
42279
42280 /* Need a free register for vcall_offset. */
42281 if (vcall_offset)
42282 return false;
42283
42284 /* Need a free register for GOT references. */
42285 if (flag_pic && !targetm.binds_local_p (function))
42286 return false;
42287
42288 /* Otherwise ok. */
42289 return true;
42290 }
42291
42292 /* Output the assembler code for a thunk function. THUNK_DECL is the
42293 declaration for the thunk function itself, FUNCTION is the decl for
42294 the target function. DELTA is an immediate constant offset to be
42295 added to THIS. If VCALL_OFFSET is nonzero, the word at
42296 *(*this + vcall_offset) should be added to THIS. */
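
/* Sketched in C, the emitted thunk behaves roughly like this (illustrative
   only, ignoring register and ABI details):

     this += DELTA;
     if (VCALL_OFFSET)
       this += *(ptrdiff_t *) (*(char **) this + VCALL_OFFSET);
     tail call FUNCTION (this, ...);
   */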
42297
42298 static void
42299 x86_output_mi_thunk (FILE *file, tree, HOST_WIDE_INT delta,
42300 HOST_WIDE_INT vcall_offset, tree function)
42301 {
42302 rtx this_param = x86_this_parameter (function);
42303 rtx this_reg, tmp, fnaddr;
42304 unsigned int tmp_regno;
42305 rtx_insn *insn;
42306
42307 if (TARGET_64BIT)
42308 tmp_regno = R10_REG;
42309 else
42310 {
42311 unsigned int ccvt = ix86_get_callcvt (TREE_TYPE (function));
42312 if ((ccvt & IX86_CALLCVT_FASTCALL) != 0)
42313 tmp_regno = AX_REG;
42314 else if ((ccvt & IX86_CALLCVT_THISCALL) != 0)
42315 tmp_regno = DX_REG;
42316 else
42317 tmp_regno = CX_REG;
42318 }
42319
42320 emit_note (NOTE_INSN_PROLOGUE_END);
42321
42322 /* If VCALL_OFFSET, we'll need THIS in a register. Might as well
42323 pull it in now and let DELTA benefit. */
42324 if (REG_P (this_param))
42325 this_reg = this_param;
42326 else if (vcall_offset)
42327 {
42328 /* Put the this parameter into %eax. */
42329 this_reg = gen_rtx_REG (Pmode, AX_REG);
42330 emit_move_insn (this_reg, this_param);
42331 }
42332 else
42333 this_reg = NULL_RTX;
42334
42335 /* Adjust the this parameter by a fixed constant. */
42336 if (delta)
42337 {
42338 rtx delta_rtx = GEN_INT (delta);
42339 rtx delta_dst = this_reg ? this_reg : this_param;
42340
42341 if (TARGET_64BIT)
42342 {
42343 if (!x86_64_general_operand (delta_rtx, Pmode))
42344 {
42345 tmp = gen_rtx_REG (Pmode, tmp_regno);
42346 emit_move_insn (tmp, delta_rtx);
42347 delta_rtx = tmp;
42348 }
42349 }
42350
42351 ix86_emit_binop (PLUS, Pmode, delta_dst, delta_rtx);
42352 }
42353
42354 /* Adjust the this parameter by a value stored in the vtable. */
42355 if (vcall_offset)
42356 {
42357 rtx vcall_addr, vcall_mem, this_mem;
42358
42359 tmp = gen_rtx_REG (Pmode, tmp_regno);
42360
42361 this_mem = gen_rtx_MEM (ptr_mode, this_reg);
42362 if (Pmode != ptr_mode)
42363 this_mem = gen_rtx_ZERO_EXTEND (Pmode, this_mem);
42364 emit_move_insn (tmp, this_mem);
42365
42366 /* Adjust the this parameter. */
42367 vcall_addr = plus_constant (Pmode, tmp, vcall_offset);
42368 if (TARGET_64BIT
42369 && !ix86_legitimate_address_p (ptr_mode, vcall_addr, true))
42370 {
42371 rtx tmp2 = gen_rtx_REG (Pmode, R11_REG);
42372 emit_move_insn (tmp2, GEN_INT (vcall_offset));
42373 vcall_addr = gen_rtx_PLUS (Pmode, tmp, tmp2);
42374 }
42375
42376 vcall_mem = gen_rtx_MEM (ptr_mode, vcall_addr);
42377 if (Pmode != ptr_mode)
42378 emit_insn (gen_addsi_1_zext (this_reg,
42379 gen_rtx_REG (ptr_mode,
42380 REGNO (this_reg)),
42381 vcall_mem));
42382 else
42383 ix86_emit_binop (PLUS, Pmode, this_reg, vcall_mem);
42384 }
42385
42386 /* If necessary, drop THIS back to its stack slot. */
42387 if (this_reg && this_reg != this_param)
42388 emit_move_insn (this_param, this_reg);
42389
42390 fnaddr = XEXP (DECL_RTL (function), 0);
42391 if (TARGET_64BIT)
42392 {
42393 if (!flag_pic || targetm.binds_local_p (function)
42394 || TARGET_PECOFF)
42395 ;
42396 else
42397 {
42398 tmp = gen_rtx_UNSPEC (Pmode, gen_rtvec (1, fnaddr), UNSPEC_GOTPCREL);
42399 tmp = gen_rtx_CONST (Pmode, tmp);
42400 fnaddr = gen_const_mem (Pmode, tmp);
42401 }
42402 }
42403 else
42404 {
42405 if (!flag_pic || targetm.binds_local_p (function))
42406 ;
42407 #if TARGET_MACHO
42408 else if (TARGET_MACHO)
42409 {
42410 fnaddr = machopic_indirect_call_target (DECL_RTL (function));
42411 fnaddr = XEXP (fnaddr, 0);
42412 }
42413 #endif /* TARGET_MACHO */
42414 else
42415 {
42416 tmp = gen_rtx_REG (Pmode, CX_REG);
42417 output_set_got (tmp, NULL_RTX);
42418
42419 fnaddr = gen_rtx_UNSPEC (Pmode, gen_rtvec (1, fnaddr), UNSPEC_GOT);
42420 fnaddr = gen_rtx_CONST (Pmode, fnaddr);
42421 fnaddr = gen_rtx_PLUS (Pmode, tmp, fnaddr);
42422 fnaddr = gen_const_mem (Pmode, fnaddr);
42423 }
42424 }
42425
42426 /* Our sibling call patterns do not allow memories, because we have no
42427 predicate that can distinguish between frame and non-frame memory.
42428 For our purposes here, we can get away with (ab)using a jump pattern,
42429 because we're going to do no optimization. */
42430 if (MEM_P (fnaddr))
42431 {
42432 if (sibcall_insn_operand (fnaddr, word_mode))
42433 {
42434 fnaddr = XEXP (DECL_RTL (function), 0);
42435 tmp = gen_rtx_MEM (QImode, fnaddr);
42436 tmp = gen_rtx_CALL (VOIDmode, tmp, const0_rtx);
42437 tmp = emit_call_insn (tmp);
42438 SIBLING_CALL_P (tmp) = 1;
42439 }
42440 else
42441 emit_jump_insn (gen_indirect_jump (fnaddr));
42442 }
42443 else
42444 {
42445 if (ix86_cmodel == CM_LARGE_PIC && SYMBOLIC_CONST (fnaddr))
42446 {
42447 // CM_LARGE_PIC always uses a pseudo PIC register, which is
42448 // uninitialized. Since FUNCTION is local and calling it
42449 // doesn't go through the PLT, we use scratch register %r11 as
42450 // the PIC register and initialize it here.
42451 pic_offset_table_rtx = gen_rtx_REG (Pmode, R11_REG);
42452 ix86_init_large_pic_reg (tmp_regno);
42453 fnaddr = legitimize_pic_address (fnaddr,
42454 gen_rtx_REG (Pmode, tmp_regno));
42455 }
42456
42457 if (!sibcall_insn_operand (fnaddr, word_mode))
42458 {
42459 tmp = gen_rtx_REG (word_mode, tmp_regno);
42460 if (GET_MODE (fnaddr) != word_mode)
42461 fnaddr = gen_rtx_ZERO_EXTEND (word_mode, fnaddr);
42462 emit_move_insn (tmp, fnaddr);
42463 fnaddr = tmp;
42464 }
42465
42466 tmp = gen_rtx_MEM (QImode, fnaddr);
42467 tmp = gen_rtx_CALL (VOIDmode, tmp, const0_rtx);
42468 tmp = emit_call_insn (tmp);
42469 SIBLING_CALL_P (tmp) = 1;
42470 }
42471 emit_barrier ();
42472
42473 /* Emit just enough of rest_of_compilation to get the insns emitted.
42474 Note that use_thunk calls assemble_start_function et al. */
42475 insn = get_insns ();
42476 shorten_branches (insn);
42477 final_start_function (insn, file, 1);
42478 final (insn, file, 1);
42479 final_end_function ();
42480 }
42481
42482 static void
42483 x86_file_start (void)
42484 {
42485 default_file_start ();
42486 if (TARGET_16BIT)
42487 fputs ("\t.code16gcc\n", asm_out_file);
42488 #if TARGET_MACHO
42489 darwin_file_start ();
42490 #endif
42491 if (X86_FILE_START_VERSION_DIRECTIVE)
42492 fputs ("\t.version\t\"01.01\"\n", asm_out_file);
42493 if (X86_FILE_START_FLTUSED)
42494 fputs ("\t.global\t__fltused\n", asm_out_file);
42495 if (ix86_asm_dialect == ASM_INTEL)
42496 fputs ("\t.intel_syntax noprefix\n", asm_out_file);
42497 }
42498
42499 int
42500 x86_field_alignment (tree type, int computed)
42501 {
42502 machine_mode mode;
42503
42504 if (TARGET_64BIT || TARGET_ALIGN_DOUBLE)
42505 return computed;
42506 if (TARGET_IAMCU)
42507 return iamcu_alignment (type, computed);
42508 mode = TYPE_MODE (strip_array_types (type));
42509 if (mode == DFmode || mode == DCmode
42510 || GET_MODE_CLASS (mode) == MODE_INT
42511 || GET_MODE_CLASS (mode) == MODE_COMPLEX_INT)
42512 return MIN (32, computed);
42513 return computed;
42514 }
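
/* Worked example (illustrative only): on ia32 without -malign-double, the
   cap above limits a double field to 32-bit alignment, so in
   "struct { char c; double d; }" the member d lands at offset 4 rather
   than 8; on x86_64 or with TARGET_ALIGN_DOUBLE the computed alignment is
   returned unchanged.  */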
42515
42516 /* Print call to TARGET to FILE. */
42517
42518 static void
42519 x86_print_call_or_nop (FILE *file, const char *target)
42520 {
42521 if (flag_nop_mcount)
42522 fprintf (file, "1:\tnopl 0x00(%%eax,%%eax,1)\n"); /* 5 byte nop. */
42523 else
42524 fprintf (file, "1:\tcall\t%s\n", target);
42525 }
42526
42527 /* Output assembler code to FILE to increment profiler label # LABELNO
42528 for profiling a function entry. */
42529 void
42530 x86_function_profiler (FILE *file, int labelno ATTRIBUTE_UNUSED)
42531 {
42532 const char *mcount_name = (flag_fentry ? MCOUNT_NAME_BEFORE_PROLOGUE
42533 : MCOUNT_NAME);
42534 if (TARGET_64BIT)
42535 {
42536 #ifndef NO_PROFILE_COUNTERS
42537 fprintf (file, "\tleaq\t%sP%d(%%rip),%%r11\n", LPREFIX, labelno);
42538 #endif
42539
42540 if (!TARGET_PECOFF && flag_pic)
42541 fprintf (file, "1:\tcall\t*%s@GOTPCREL(%%rip)\n", mcount_name);
42542 else
42543 x86_print_call_or_nop (file, mcount_name);
42544 }
42545 else if (flag_pic)
42546 {
42547 #ifndef NO_PROFILE_COUNTERS
42548 fprintf (file, "\tleal\t%sP%d@GOTOFF(%%ebx),%%" PROFILE_COUNT_REGISTER "\n",
42549 LPREFIX, labelno);
42550 #endif
42551 fprintf (file, "1:\tcall\t*%s@GOT(%%ebx)\n", mcount_name);
42552 }
42553 else
42554 {
42555 #ifndef NO_PROFILE_COUNTERS
42556 fprintf (file, "\tmovl\t$%sP%d,%%" PROFILE_COUNT_REGISTER "\n",
42557 LPREFIX, labelno);
42558 #endif
42559 x86_print_call_or_nop (file, mcount_name);
42560 }
42561
42562 if (flag_record_mcount)
42563 {
42564 fprintf (file, "\t.section __mcount_loc, \"a\",@progbits\n");
42565 fprintf (file, "\t.%s 1b\n", TARGET_64BIT ? "quad" : "long");
42566 fprintf (file, "\t.previous\n");
42567 }
42568 }
42569
42570 /* We don't have exact information about the insn sizes, but we may assume
42571 quite safely that we are informed about all 1-byte insns and memory
42572 address sizes. This is enough to eliminate unnecessary padding in
42573 99% of cases. */
42574
42575 static int
42576 min_insn_size (rtx_insn *insn)
42577 {
42578 int l = 0, len;
42579
42580 if (!INSN_P (insn) || !active_insn_p (insn))
42581 return 0;
42582
42583 /* Discard alignments we've emitted and jump instructions. */
42584 if (GET_CODE (PATTERN (insn)) == UNSPEC_VOLATILE
42585 && XINT (PATTERN (insn), 1) == UNSPECV_ALIGN)
42586 return 0;
42587
42588 /* Important case: calls are always 5 bytes.
42589 It is common to have many calls in a row. */
42590 if (CALL_P (insn)
42591 && symbolic_reference_mentioned_p (PATTERN (insn))
42592 && !SIBLING_CALL_P (insn))
42593 return 5;
42594 len = get_attr_length (insn);
42595 if (len <= 1)
42596 return 1;
42597
42598 /* For normal instructions we rely on get_attr_length being exact,
42599 with a few exceptions. */
42600 if (!JUMP_P (insn))
42601 {
42602 enum attr_type type = get_attr_type (insn);
42603
42604 switch (type)
42605 {
42606 case TYPE_MULTI:
42607 if (GET_CODE (PATTERN (insn)) == ASM_INPUT
42608 || asm_noperands (PATTERN (insn)) >= 0)
42609 return 0;
42610 break;
42611 case TYPE_OTHER:
42612 case TYPE_FCMP:
42613 break;
42614 default:
42615 /* Otherwise trust get_attr_length. */
42616 return len;
42617 }
42618
42619 l = get_attr_length_address (insn);
42620 if (l < 4 && symbolic_reference_mentioned_p (PATTERN (insn)))
42621 l = 4;
42622 }
42623 if (l)
42624 return 1+l;
42625 else
42626 return 2;
42627 }
42628
42629 #ifdef ASM_OUTPUT_MAX_SKIP_PAD
42630
42631 /* AMD K8 core mispredicts jumps when there are more than 3 jumps in 16 byte
42632 window. */
42633
42634 static void
42635 ix86_avoid_jump_mispredicts (void)
42636 {
42637 rtx_insn *insn, *start = get_insns ();
42638 int nbytes = 0, njumps = 0;
42639 bool isjump = false;
42640
42641 /* Look for all minimal intervals of instructions containing 4 jumps.
42642 The intervals are bounded by START and INSN. NBYTES is the total
42643 size of the instructions in the interval, including INSN and not
42644 including START. When NBYTES is smaller than 16 bytes, it is possible
42645 that the end of START and the end of INSN land in the same 16-byte page.
42646
42647 The smallest offset in the page at which INSN can start is the case where
42648 START ends at offset 0. The offset of INSN is then NBYTES - sizeof (INSN).
42649 We add a p2align to the 16-byte window with max skip 15 - NBYTES + sizeof (INSN).
42650
42651 Don't consider an asm goto as a jump; while it can contain a jump, it
42652 doesn't have to, control transfer to its label(s) can be performed through
42653 other means, and we also estimate the minimum length of all asm stmts as 0. */
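
/* Worked example of the formula above (illustrative only): if, at the point
   where padding is emitted, NBYTES == 14 and sizeof (INSN) == 2, then the
   pad is 15 - 14 + 2 == 3 bytes, which pushes INSN past the 16-byte window
   that holds the preceding jumps.  */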
42654 for (insn = start; insn; insn = NEXT_INSN (insn))
42655 {
42656 int min_size;
42657
42658 if (LABEL_P (insn))
42659 {
42660 int align = label_to_alignment (insn);
42661 int max_skip = label_to_max_skip (insn);
42662
42663 if (max_skip > 15)
42664 max_skip = 15;
42665 /* If align > 3, only up to 16 - max_skip - 1 bytes can be
42666 already in the current 16 byte page, because otherwise
42667 ASM_OUTPUT_MAX_SKIP_ALIGN could skip max_skip or fewer
42668 bytes to reach 16 byte boundary. */
42669 if (align <= 0
42670 || (align <= 3 && max_skip != (1 << align) - 1))
42671 max_skip = 0;
42672 if (dump_file)
42673 fprintf (dump_file, "Label %i with max_skip %i\n",
42674 INSN_UID (insn), max_skip);
42675 if (max_skip)
42676 {
42677 while (nbytes + max_skip >= 16)
42678 {
42679 start = NEXT_INSN (start);
42680 if ((JUMP_P (start) && asm_noperands (PATTERN (start)) < 0)
42681 || CALL_P (start))
42682 njumps--, isjump = true;
42683 else
42684 isjump = false;
42685 nbytes -= min_insn_size (start);
42686 }
42687 }
42688 continue;
42689 }
42690
42691 min_size = min_insn_size (insn);
42692 nbytes += min_size;
42693 if (dump_file)
42694 fprintf (dump_file, "Insn %i estimated to %i bytes\n",
42695 INSN_UID (insn), min_size);
42696 if ((JUMP_P (insn) && asm_noperands (PATTERN (insn)) < 0)
42697 || CALL_P (insn))
42698 njumps++;
42699 else
42700 continue;
42701
42702 while (njumps > 3)
42703 {
42704 start = NEXT_INSN (start);
42705 if ((JUMP_P (start) && asm_noperands (PATTERN (start)) < 0)
42706 || CALL_P (start))
42707 njumps--, isjump = true;
42708 else
42709 isjump = false;
42710 nbytes -= min_insn_size (start);
42711 }
42712 gcc_assert (njumps >= 0);
42713 if (dump_file)
42714 fprintf (dump_file, "Interval %i to %i has %i bytes\n",
42715 INSN_UID (start), INSN_UID (insn), nbytes);
42716
42717 if (njumps == 3 && isjump && nbytes < 16)
42718 {
42719 int padsize = 15 - nbytes + min_insn_size (insn);
42720
42721 if (dump_file)
42722 fprintf (dump_file, "Padding insn %i by %i bytes!\n",
42723 INSN_UID (insn), padsize);
42724 emit_insn_before (gen_pad (GEN_INT (padsize)), insn);
42725 }
42726 }
42727 }
42728 #endif
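
/* A worked example of the padding computed above (the byte counts are
   hypothetical estimates from min_insn_size, not real insn lengths):
   suppose the interval ending in the fourth branch has NBYTES = 12 and the
   branch itself is estimated at 2 bytes, so at most 10 bytes can precede it
   in the window.  Then padsize = 15 - 12 + 2 = 5, and a pad insn with a max
   skip of 5 bytes is emitted before the branch so that, when needed, the
   assembler can push it out of the 16-byte window that already holds the
   three earlier branches.  */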
42729
42730 /* AMD Athlon works faster
42731 when RET is not the destination of a conditional jump or directly preceded
42732 by another jump instruction. We avoid the penalty by inserting a NOP just
42733 before the RET instruction in such cases. */
42734 static void
42735 ix86_pad_returns (void)
42736 {
42737 edge e;
42738 edge_iterator ei;
42739
42740 FOR_EACH_EDGE (e, ei, EXIT_BLOCK_PTR_FOR_FN (cfun)->preds)
42741 {
42742 basic_block bb = e->src;
42743 rtx_insn *ret = BB_END (bb);
42744 rtx_insn *prev;
42745 bool replace = false;
42746
42747 if (!JUMP_P (ret) || !ANY_RETURN_P (PATTERN (ret))
42748 || optimize_bb_for_size_p (bb))
42749 continue;
42750 for (prev = PREV_INSN (ret); prev; prev = PREV_INSN (prev))
42751 if (active_insn_p (prev) || LABEL_P (prev))
42752 break;
42753 if (prev && LABEL_P (prev))
42754 {
42755 edge e;
42756 edge_iterator ei;
42757
42758 FOR_EACH_EDGE (e, ei, bb->preds)
42759 if (EDGE_FREQUENCY (e) && e->src->index >= 0
42760 && !(e->flags & EDGE_FALLTHRU))
42761 {
42762 replace = true;
42763 break;
42764 }
42765 }
42766 if (!replace)
42767 {
42768 prev = prev_active_insn (ret);
42769 if (prev
42770 && ((JUMP_P (prev) && any_condjump_p (prev))
42771 || CALL_P (prev)))
42772 replace = true;
42773 /* Empty functions get a branch mispredict even when
42774 the jump destination is not visible to us. */
42775 if (!prev && !optimize_function_for_size_p (cfun))
42776 replace = true;
42777 }
42778 if (replace)
42779 {
42780 emit_jump_insn_before (gen_simple_return_internal_long (), ret);
42781 delete_insn (ret);
42782 }
42783 }
42784 }
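
/* For illustration, assuming gen_simple_return_internal_long expands to the
   common two-byte "rep ret" idiom (the surrounding code is a hypothetical
   epilogue, not real compiler output), the replacement above turns

	jne	.L2
	ret

   into

	jne	.L2
	rep ret

   so the return is no longer a one-byte instruction immediately preceded by
   a jump.  */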
42785
42786 /* Count the minimum number of instructions in BB. Return 4 if the
42787 number of instructions >= 4. */
42788
42789 static int
42790 ix86_count_insn_bb (basic_block bb)
42791 {
42792 rtx_insn *insn;
42793 int insn_count = 0;
42794
42795 /* Count number of instructions in this block. Return 4 if the number
42796 of instructions >= 4. */
42797 FOR_BB_INSNS (bb, insn)
42798 {
42799 /* This only happens in exit blocks. */
42800 if (JUMP_P (insn)
42801 && ANY_RETURN_P (PATTERN (insn)))
42802 break;
42803
42804 if (NONDEBUG_INSN_P (insn)
42805 && GET_CODE (PATTERN (insn)) != USE
42806 && GET_CODE (PATTERN (insn)) != CLOBBER)
42807 {
42808 insn_count++;
42809 if (insn_count >= 4)
42810 return insn_count;
42811 }
42812 }
42813
42814 return insn_count;
42815 }
42816
42817
42818 /* Count the minimum number of instructions in a code path through BB.
42819 Return 4 if the number of instructions >= 4. */
42820
42821 static int
42822 ix86_count_insn (basic_block bb)
42823 {
42824 edge e;
42825 edge_iterator ei;
42826 int min_prev_count;
42827
42828 /* Only bother counting instructions along paths with no
42829 more than 2 basic blocks between entry and exit. Given
42830 that BB has an edge to exit, determine if a predecessor
42831 of BB has an edge from entry. If so, compute the number
42832 of instructions in the predecessor block. If there
42833 happen to be multiple such blocks, compute the minimum. */
42834 min_prev_count = 4;
42835 FOR_EACH_EDGE (e, ei, bb->preds)
42836 {
42837 edge prev_e;
42838 edge_iterator prev_ei;
42839
42840 if (e->src == ENTRY_BLOCK_PTR_FOR_FN (cfun))
42841 {
42842 min_prev_count = 0;
42843 break;
42844 }
42845 FOR_EACH_EDGE (prev_e, prev_ei, e->src->preds)
42846 {
42847 if (prev_e->src == ENTRY_BLOCK_PTR_FOR_FN (cfun))
42848 {
42849 int count = ix86_count_insn_bb (e->src);
42850 if (count < min_prev_count)
42851 min_prev_count = count;
42852 break;
42853 }
42854 }
42855 }
42856
42857 if (min_prev_count < 4)
42858 min_prev_count += ix86_count_insn_bb (bb);
42859
42860 return min_prev_count;
42861 }
42862
42863 /* Pad short function to 4 instructions. */
42864
42865 static void
42866 ix86_pad_short_function (void)
42867 {
42868 edge e;
42869 edge_iterator ei;
42870
42871 FOR_EACH_EDGE (e, ei, EXIT_BLOCK_PTR_FOR_FN (cfun)->preds)
42872 {
42873 rtx_insn *ret = BB_END (e->src);
42874 if (JUMP_P (ret) && ANY_RETURN_P (PATTERN (ret)))
42875 {
42876 int insn_count = ix86_count_insn (e->src);
42877
42878 /* Pad short function. */
42879 if (insn_count < 4)
42880 {
42881 rtx_insn *insn = ret;
42882
42883 /* Find epilogue. */
42884 while (insn
42885 && (!NOTE_P (insn)
42886 || NOTE_KIND (insn) != NOTE_INSN_EPILOGUE_BEG))
42887 insn = PREV_INSN (insn);
42888
42889 if (!insn)
42890 insn = ret;
42891
42892 /* Two NOPs count as one instruction. */
42893 insn_count = 2 * (4 - insn_count);
42894 emit_insn_before (gen_nops (GEN_INT (insn_count)), insn);
42895 }
42896 }
42897 }
42898 }
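
/* A small worked example of the padding above (the function is
   hypothetical): if the only path from entry to this return contains two
   counted instructions, insn_count is 2, so 2 * (4 - 2) = 4 nops are emitted
   via gen_nops just before the NOTE_INSN_EPILOGUE_BEG note, i.e. the
   function is padded up to the equivalent of four instructions.  */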
42899
42900 /* Fix up a Windows system unwinder issue. If an EH region falls through into
42901 the epilogue, the Windows system unwinder will apply epilogue logic and
42902 produce incorrect offsets. This can be avoided by adding a nop between
42903 the last insn that can throw and the first insn of the epilogue. */
42904
42905 static void
42906 ix86_seh_fixup_eh_fallthru (void)
42907 {
42908 edge e;
42909 edge_iterator ei;
42910
42911 FOR_EACH_EDGE (e, ei, EXIT_BLOCK_PTR_FOR_FN (cfun)->preds)
42912 {
42913 rtx_insn *insn, *next;
42914
42915 /* Find the beginning of the epilogue. */
42916 for (insn = BB_END (e->src); insn != NULL; insn = PREV_INSN (insn))
42917 if (NOTE_P (insn) && NOTE_KIND (insn) == NOTE_INSN_EPILOGUE_BEG)
42918 break;
42919 if (insn == NULL)
42920 continue;
42921
42922 /* We only care about preceding insns that can throw. */
42923 insn = prev_active_insn (insn);
42924 if (insn == NULL || !can_throw_internal (insn))
42925 continue;
42926
42927 /* Do not separate calls from their debug information. */
42928 for (next = NEXT_INSN (insn); next != NULL; next = NEXT_INSN (next))
42929 if (NOTE_P (next)
42930 && (NOTE_KIND (next) == NOTE_INSN_VAR_LOCATION
42931 || NOTE_KIND (next) == NOTE_INSN_CALL_ARG_LOCATION))
42932 insn = next;
42933 else
42934 break;
42935
42936 emit_insn_after (gen_nops (const1_rtx), insn);
42937 }
42938 }
42939
42940 /* Given a register number BASE, the lowest of a group of registers, update
42941 regsets IN and OUT with the registers that should be avoided in input
42942 and output operands respectively when trying to avoid generating a modr/m
42943 byte for -fmitigate-rop. */
42944
42945 static void
42946 set_rop_modrm_reg_bits (int base, HARD_REG_SET &in, HARD_REG_SET &out)
42947 {
42948 SET_HARD_REG_BIT (out, base);
42949 SET_HARD_REG_BIT (out, base + 1);
42950 SET_HARD_REG_BIT (in, base + 2);
42951 SET_HARD_REG_BIT (in, base + 3);
42952 }
42953
42954 /* Called if -fmitigate-rop is in effect. Try to rewrite instructions so
42955 that certain encodings of modr/m bytes do not occur. */
42956 static void
42957 ix86_mitigate_rop (void)
42958 {
42959 HARD_REG_SET input_risky;
42960 HARD_REG_SET output_risky;
42961 HARD_REG_SET inout_risky;
42962
42963 CLEAR_HARD_REG_SET (output_risky);
42964 CLEAR_HARD_REG_SET (input_risky);
42965 SET_HARD_REG_BIT (output_risky, AX_REG);
42966 SET_HARD_REG_BIT (output_risky, CX_REG);
42967 SET_HARD_REG_BIT (input_risky, BX_REG);
42968 SET_HARD_REG_BIT (input_risky, DX_REG);
42969 set_rop_modrm_reg_bits (FIRST_SSE_REG, input_risky, output_risky);
42970 set_rop_modrm_reg_bits (FIRST_REX_INT_REG, input_risky, output_risky);
42971 set_rop_modrm_reg_bits (FIRST_REX_SSE_REG, input_risky, output_risky);
42972 set_rop_modrm_reg_bits (FIRST_EXT_REX_SSE_REG, input_risky, output_risky);
42973 set_rop_modrm_reg_bits (FIRST_MASK_REG, input_risky, output_risky);
42974 set_rop_modrm_reg_bits (FIRST_BND_REG, input_risky, output_risky);
42975 COPY_HARD_REG_SET (inout_risky, input_risky);
42976 IOR_HARD_REG_SET (inout_risky, output_risky);
42977
42978 df_note_add_problem ();
42979 /* Fix up what stack-regs did. */
42980 df_insn_rescan_all ();
42981 df_analyze ();
42982
42983 regrename_init (true);
42984 regrename_analyze (NULL);
42985
42986 auto_vec<du_head_p> cands;
42987
42988 for (rtx_insn *insn = get_insns (); insn; insn = NEXT_INSN (insn))
42989 {
42990 if (!NONDEBUG_INSN_P (insn))
42991 continue;
42992
42993 if (GET_CODE (PATTERN (insn)) == USE
42994 || GET_CODE (PATTERN (insn)) == CLOBBER)
42995 continue;
42996
42997 extract_insn (insn);
42998
42999 int opno0, opno1;
43000 int modrm = ix86_get_modrm_for_rop (insn, recog_data.operand,
43001 recog_data.n_operands, &opno0,
43002 &opno1);
43003
43004 if (!ix86_rop_should_change_byte_p (modrm))
43005 continue;
43006
43007 insn_rr_info *info = &insn_rr[INSN_UID (insn)];
43008
43009 /* This happens when regrename has to fail a block. */
43010 if (!info->op_info)
43011 continue;
43012
43013 if (info->op_info[opno0].n_chains != 0)
43014 {
43015 gcc_assert (info->op_info[opno0].n_chains == 1);
43016 du_head_p op0c;
43017 op0c = regrename_chain_from_id (info->op_info[opno0].heads[0]->id);
43018 if (op0c->target_data_1 + op0c->target_data_2 == 0
43019 && !op0c->cannot_rename)
43020 cands.safe_push (op0c);
43021
43022 op0c->target_data_1++;
43023 }
43024 if (info->op_info[opno1].n_chains != 0)
43025 {
43026 gcc_assert (info->op_info[opno1].n_chains == 1);
43027 du_head_p op1c;
43028 op1c = regrename_chain_from_id (info->op_info[opno1].heads[0]->id);
43029 if (op1c->target_data_1 + op1c->target_data_2 == 0
43030 && !op1c->cannot_rename)
43031 cands.safe_push (op1c);
43032
43033 op1c->target_data_2++;
43034 }
43035 }
43036
43037 int i;
43038 du_head_p head;
43039 FOR_EACH_VEC_ELT (cands, i, head)
43040 {
43041 int old_reg, best_reg;
43042 HARD_REG_SET unavailable;
43043
43044 CLEAR_HARD_REG_SET (unavailable);
43045 if (head->target_data_1)
43046 IOR_HARD_REG_SET (unavailable, output_risky);
43047 if (head->target_data_2)
43048 IOR_HARD_REG_SET (unavailable, input_risky);
43049
43050 int n_uses;
43051 reg_class superclass = regrename_find_superclass (head, &n_uses,
43052 &unavailable);
43053 old_reg = head->regno;
43054 best_reg = find_rename_reg (head, superclass, &unavailable,
43055 old_reg, false);
43056 bool ok = regrename_do_replace (head, best_reg);
43057 gcc_assert (ok);
43058 if (dump_file)
43059 fprintf (dump_file, "Chain %d renamed as %s in %s\n", head->id,
43060 reg_names[best_reg], reg_class_names[superclass]);
43061
43062 }
43063
43064 regrename_finish ();
43065
43066 df_analyze ();
43067
43068 basic_block bb;
43069 regset_head live;
43070
43071 INIT_REG_SET (&live);
43072
43073 FOR_EACH_BB_FN (bb, cfun)
43074 {
43075 rtx_insn *insn;
43076
43077 COPY_REG_SET (&live, DF_LR_OUT (bb));
43078 df_simulate_initialize_backwards (bb, &live);
43079
43080 FOR_BB_INSNS_REVERSE (bb, insn)
43081 {
43082 if (!NONDEBUG_INSN_P (insn))
43083 continue;
43084
43085 df_simulate_one_insn_backwards (bb, insn, &live);
43086
43087 if (GET_CODE (PATTERN (insn)) == USE
43088 || GET_CODE (PATTERN (insn)) == CLOBBER)
43089 continue;
43090
43091 extract_insn (insn);
43092 constrain_operands_cached (insn, reload_completed);
43093 int opno0, opno1;
43094 int modrm = ix86_get_modrm_for_rop (insn, recog_data.operand,
43095 recog_data.n_operands, &opno0,
43096 &opno1);
43097 if (modrm < 0
43098 || !ix86_rop_should_change_byte_p (modrm)
43099 || opno0 == opno1)
43100 continue;
43101
43102 rtx oldreg = recog_data.operand[opno1];
43103 preprocess_constraints (insn);
43104 const operand_alternative *alt = which_op_alt ();
43105
43106 int i;
43107 for (i = 0; i < recog_data.n_operands; i++)
43108 if (i != opno1
43109 && alt[i].earlyclobber
43110 && reg_overlap_mentioned_p (recog_data.operand[i],
43111 oldreg))
43112 break;
43113
43114 if (i < recog_data.n_operands)
43115 continue;
43116
43117 if (dump_file)
43118 fprintf (dump_file,
43119 "attempting to fix modrm byte in insn %d:"
43120 " reg %d class %s", INSN_UID (insn), REGNO (oldreg),
43121 reg_class_names[alt[opno1].cl]);
43122
43123 HARD_REG_SET unavailable;
43124 REG_SET_TO_HARD_REG_SET (unavailable, &live);
43125 SET_HARD_REG_BIT (unavailable, REGNO (oldreg));
43126 IOR_COMPL_HARD_REG_SET (unavailable, call_used_reg_set);
43127 IOR_HARD_REG_SET (unavailable, fixed_reg_set);
43128 IOR_HARD_REG_SET (unavailable, output_risky);
43129 IOR_COMPL_HARD_REG_SET (unavailable,
43130 reg_class_contents[alt[opno1].cl]);
43131
43132 for (i = 0; i < FIRST_PSEUDO_REGISTER; i++)
43133 if (!TEST_HARD_REG_BIT (unavailable, i))
43134 break;
43135 if (i == FIRST_PSEUDO_REGISTER)
43136 {
43137 if (dump_file)
43138 fprintf (dump_file, ", none available\n");
43139 continue;
43140 }
43141 if (dump_file)
43142 fprintf (dump_file, " -> %d\n", i);
43143 rtx newreg = gen_rtx_REG (recog_data.operand_mode[opno1], i);
43144 validate_change (insn, recog_data.operand_loc[opno1], newreg, false);
43145 insn = emit_insn_before (gen_move_insn (newreg, oldreg), insn);
43146 }
43147 }
43148 }
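
/* An illustrative case of the encodings this pass tries to avoid (the byte
   values are standard x86 encodings, the renaming choice is hypothetical):
   "movl %eax, %ebx" encodes as 89 c3, and the modr/m byte c3 is also the
   single-byte RET opcode, which makes it an attractive ROP gadget tail.
   Judging by the register sets built above, renaming the chain so that the
   insn becomes, say, "movl %eax, %esi" (89 c6) removes the risky byte.  */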
43149
43150 /* Implement machine specific optimizations. We implement padding of returns
43151 for K8 CPUs and a pass to avoid 4 jumps in a single 16-byte window. */
43152 static void
43153 ix86_reorg (void)
43154 {
43155 /* We are freeing block_for_insn in the toplev to keep compatibility
43156 with old MDEP_REORGS that are not CFG based. Recompute it now. */
43157 compute_bb_for_insn ();
43158
43159 if (flag_mitigate_rop)
43160 ix86_mitigate_rop ();
43161
43162 if (TARGET_SEH && current_function_has_exception_handlers ())
43163 ix86_seh_fixup_eh_fallthru ();
43164
43165 if (optimize && optimize_function_for_speed_p (cfun))
43166 {
43167 if (TARGET_PAD_SHORT_FUNCTION)
43168 ix86_pad_short_function ();
43169 else if (TARGET_PAD_RETURNS)
43170 ix86_pad_returns ();
43171 #ifdef ASM_OUTPUT_MAX_SKIP_PAD
43172 if (TARGET_FOUR_JUMP_LIMIT)
43173 ix86_avoid_jump_mispredicts ();
43174 #endif
43175 }
43176 }
43177
43178 /* Return nonzero when a QImode register that must be represented via a REX
43179 prefix is used. */
43180 bool
43181 x86_extended_QIreg_mentioned_p (rtx_insn *insn)
43182 {
43183 int i;
43184 extract_insn_cached (insn);
43185 for (i = 0; i < recog_data.n_operands; i++)
43186 if (GENERAL_REG_P (recog_data.operand[i])
43187 && !QI_REGNO_P (REGNO (recog_data.operand[i])))
43188 return true;
43189 return false;
43190 }
43191
43192 /* Return true when INSN mentions a register that must be encoded using a
43193 REX prefix. */
43194 bool
43195 x86_extended_reg_mentioned_p (rtx insn)
43196 {
43197 subrtx_iterator::array_type array;
43198 FOR_EACH_SUBRTX (iter, array, INSN_P (insn) ? PATTERN (insn) : insn, NONCONST)
43199 {
43200 const_rtx x = *iter;
43201 if (REG_P (x)
43202 && (REX_INT_REGNO_P (REGNO (x)) || REX_SSE_REGNO_P (REGNO (x))))
43203 return true;
43204 }
43205 return false;
43206 }
43207
43208 /* If profitable, negate (without causing overflow) integer constant
43209 of mode MODE at location LOC. Return true in this case. */
43210 bool
43211 x86_maybe_negate_const_int (rtx *loc, machine_mode mode)
43212 {
43213 HOST_WIDE_INT val;
43214
43215 if (!CONST_INT_P (*loc))
43216 return false;
43217
43218 switch (mode)
43219 {
43220 case DImode:
43221 /* DImode x86_64 constants must fit in 32 bits. */
43222 gcc_assert (x86_64_immediate_operand (*loc, mode));
43223
43224 mode = SImode;
43225 break;
43226
43227 case SImode:
43228 case HImode:
43229 case QImode:
43230 break;
43231
43232 default:
43233 gcc_unreachable ();
43234 }
43235
43236 /* Avoid overflows. */
43237 if (mode_signbit_p (mode, *loc))
43238 return false;
43239
43240 val = INTVAL (*loc);
43241
43242 /* Make things pretty: use `subl $4,%eax' rather than `addl $-4,%eax'.
43243 Exception: -128 encodes smaller than 128, so keep -128 and negate 128 instead. */
43244 if ((val < 0 && val != -128)
43245 || val == 128)
43246 {
43247 *loc = GEN_INT (-val);
43248 return true;
43249 }
43250
43251 return false;
43252 }
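
/* Illustrative effect of the above when a pattern pairs it with the opposite
   operation (the insns shown are hypothetical): "addl $-4, %eax" can become
   "subl $4, %eax"; "addl $128, %eax" can become "subl $-128, %eax", because
   -128 still fits in a sign-extended imm8 while +128 does not; and
   "addl $-128, %eax" is left untouched for the same reason.  */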
43253
43254 /* Generate an unsigned DImode/SImode to FP conversion. This is the same code
43255 optabs would emit if we didn't have TFmode patterns. */
43256
43257 void
43258 x86_emit_floatuns (rtx operands[2])
43259 {
43260 rtx_code_label *neglab, *donelab;
43261 rtx i0, i1, f0, in, out;
43262 machine_mode mode, inmode;
43263
43264 inmode = GET_MODE (operands[1]);
43265 gcc_assert (inmode == SImode || inmode == DImode);
43266
43267 out = operands[0];
43268 in = force_reg (inmode, operands[1]);
43269 mode = GET_MODE (out);
43270 neglab = gen_label_rtx ();
43271 donelab = gen_label_rtx ();
43272 f0 = gen_reg_rtx (mode);
43273
43274 emit_cmp_and_jump_insns (in, const0_rtx, LT, const0_rtx, inmode, 0, neglab);
43275
43276 expand_float (out, in, 0);
43277
43278 emit_jump_insn (gen_jump (donelab));
43279 emit_barrier ();
43280
43281 emit_label (neglab);
43282
43283 i0 = expand_simple_binop (inmode, LSHIFTRT, in, const1_rtx, NULL,
43284 1, OPTAB_DIRECT);
43285 i1 = expand_simple_binop (inmode, AND, in, const1_rtx, NULL,
43286 1, OPTAB_DIRECT);
43287 i0 = expand_simple_binop (inmode, IOR, i0, i1, i0, 1, OPTAB_DIRECT);
43288
43289 expand_float (f0, i0, 0);
43290
43291 emit_insn (gen_rtx_SET (out, gen_rtx_PLUS (mode, f0, f0)));
43292
43293 emit_label (donelab);
43294 }
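
/* The sequence emitted above is the usual unsigned-to-float trick; in C it
   is roughly equivalent to the sketch below (illustrative only, shown for a
   DImode input and a double result):

	double u64_to_double (unsigned long long x)
	{
	  if ((long long) x >= 0)
	    return (double) (long long) x;
	  unsigned long long half = (x >> 1) | (x & 1);
	  double d = (double) (long long) half;
	  return d + d;
	}

   The low bit is ORed into the halved value so that the final doubling does
   not introduce a double-rounding error.  */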
43295 \f
43296 static bool canonicalize_perm (struct expand_vec_perm_d *d);
43297 static bool expand_vec_perm_1 (struct expand_vec_perm_d *d);
43298 static bool expand_vec_perm_broadcast_1 (struct expand_vec_perm_d *d);
43299 static bool expand_vec_perm_palignr (struct expand_vec_perm_d *d, bool);
43300
43301 /* Get a vector mode of the same size as the original but with elements
43302 twice as wide. This is only guaranteed to apply to integral vectors. */
43303
43304 static inline machine_mode
43305 get_mode_wider_vector (machine_mode o)
43306 {
43307 /* ??? Rely on the ordering that genmodes.c gives to vectors. */
43308 machine_mode n = GET_MODE_WIDER_MODE (o);
43309 gcc_assert (GET_MODE_NUNITS (o) == GET_MODE_NUNITS (n) * 2);
43310 gcc_assert (GET_MODE_SIZE (o) == GET_MODE_SIZE (n));
43311 return n;
43312 }
43313
43314 /* A subroutine of ix86_expand_vector_init_duplicate. Tries to
43315 fill target with val via vec_duplicate. */
43316
43317 static bool
43318 ix86_vector_duplicate_value (machine_mode mode, rtx target, rtx val)
43319 {
43320 bool ok;
43321 rtx_insn *insn;
43322 rtx dup;
43323
43324 /* First attempt to recognize VAL as-is. */
43325 dup = gen_rtx_VEC_DUPLICATE (mode, val);
43326 insn = emit_insn (gen_rtx_SET (target, dup));
43327 if (recog_memoized (insn) < 0)
43328 {
43329 rtx_insn *seq;
43330 machine_mode innermode = GET_MODE_INNER (mode);
43331 rtx reg;
43332
43333 /* If that fails, force VAL into a register. */
43334
43335 start_sequence ();
43336 reg = force_reg (innermode, val);
43337 if (GET_MODE (reg) != innermode)
43338 reg = gen_lowpart (innermode, reg);
43339 XEXP (dup, 0) = reg;
43340 seq = get_insns ();
43341 end_sequence ();
43342 if (seq)
43343 emit_insn_before (seq, insn);
43344
43345 ok = recog_memoized (insn) >= 0;
43346 gcc_assert (ok);
43347 }
43348 return true;
43349 }
43350
43351 /* A subroutine of ix86_expand_vector_init. Store into TARGET a vector
43352 with all elements equal to VAR. Return true if successful. */
43353
43354 static bool
43355 ix86_expand_vector_init_duplicate (bool mmx_ok, machine_mode mode,
43356 rtx target, rtx val)
43357 {
43358 bool ok;
43359
43360 switch (mode)
43361 {
43362 case V2SImode:
43363 case V2SFmode:
43364 if (!mmx_ok)
43365 return false;
43366 /* FALLTHRU */
43367
43368 case V4DFmode:
43369 case V4DImode:
43370 case V8SFmode:
43371 case V8SImode:
43372 case V2DFmode:
43373 case V2DImode:
43374 case V4SFmode:
43375 case V4SImode:
43376 case V16SImode:
43377 case V8DImode:
43378 case V16SFmode:
43379 case V8DFmode:
43380 return ix86_vector_duplicate_value (mode, target, val);
43381
43382 case V4HImode:
43383 if (!mmx_ok)
43384 return false;
43385 if (TARGET_SSE || TARGET_3DNOW_A)
43386 {
43387 rtx x;
43388
43389 val = gen_lowpart (SImode, val);
43390 x = gen_rtx_TRUNCATE (HImode, val);
43391 x = gen_rtx_VEC_DUPLICATE (mode, x);
43392 emit_insn (gen_rtx_SET (target, x));
43393 return true;
43394 }
43395 goto widen;
43396
43397 case V8QImode:
43398 if (!mmx_ok)
43399 return false;
43400 goto widen;
43401
43402 case V8HImode:
43403 if (TARGET_AVX2)
43404 return ix86_vector_duplicate_value (mode, target, val);
43405
43406 if (TARGET_SSE2)
43407 {
43408 struct expand_vec_perm_d dperm;
43409 rtx tmp1, tmp2;
43410
43411 permute:
43412 memset (&dperm, 0, sizeof (dperm));
43413 dperm.target = target;
43414 dperm.vmode = mode;
43415 dperm.nelt = GET_MODE_NUNITS (mode);
43416 dperm.op0 = dperm.op1 = gen_reg_rtx (mode);
43417 dperm.one_operand_p = true;
43418
43419 /* Extend to SImode using a paradoxical SUBREG. */
43420 tmp1 = gen_reg_rtx (SImode);
43421 emit_move_insn (tmp1, gen_lowpart (SImode, val));
43422
43423 /* Insert the SImode value as low element of a V4SImode vector. */
43424 tmp2 = gen_reg_rtx (V4SImode);
43425 emit_insn (gen_vec_setv4si_0 (tmp2, CONST0_RTX (V4SImode), tmp1));
43426 emit_move_insn (dperm.op0, gen_lowpart (mode, tmp2));
43427
43428 ok = (expand_vec_perm_1 (&dperm)
43429 || expand_vec_perm_broadcast_1 (&dperm));
43430 gcc_assert (ok);
43431 return ok;
43432 }
43433 goto widen;
43434
43435 case V16QImode:
43436 if (TARGET_AVX2)
43437 return ix86_vector_duplicate_value (mode, target, val);
43438
43439 if (TARGET_SSE2)
43440 goto permute;
43441 goto widen;
43442
43443 widen:
43444 /* Replicate the value once into the next wider mode and recurse. */
43445 {
43446 machine_mode smode, wsmode, wvmode;
43447 rtx x;
43448
43449 smode = GET_MODE_INNER (mode);
43450 wvmode = get_mode_wider_vector (mode);
43451 wsmode = GET_MODE_INNER (wvmode);
43452
43453 val = convert_modes (wsmode, smode, val, true);
43454 x = expand_simple_binop (wsmode, ASHIFT, val,
43455 GEN_INT (GET_MODE_BITSIZE (smode)),
43456 NULL_RTX, 1, OPTAB_LIB_WIDEN);
43457 val = expand_simple_binop (wsmode, IOR, val, x, x, 1, OPTAB_LIB_WIDEN);
43458
43459 x = gen_reg_rtx (wvmode);
43460 ok = ix86_expand_vector_init_duplicate (mmx_ok, wvmode, x, val);
43461 gcc_assert (ok);
43462 emit_move_insn (target, gen_lowpart (GET_MODE (target), x));
43463 return ok;
43464 }
43465
43466 case V16HImode:
43467 case V32QImode:
43468 if (TARGET_AVX2)
43469 return ix86_vector_duplicate_value (mode, target, val);
43470 else
43471 {
43472 machine_mode hvmode = (mode == V16HImode ? V8HImode : V16QImode);
43473 rtx x = gen_reg_rtx (hvmode);
43474
43475 ok = ix86_expand_vector_init_duplicate (false, hvmode, x, val);
43476 gcc_assert (ok);
43477
43478 x = gen_rtx_VEC_CONCAT (mode, x, x);
43479 emit_insn (gen_rtx_SET (target, x));
43480 }
43481 return true;
43482
43483 case V64QImode:
43484 case V32HImode:
43485 if (TARGET_AVX512BW)
43486 return ix86_vector_duplicate_value (mode, target, val);
43487 else
43488 {
43489 machine_mode hvmode = (mode == V32HImode ? V16HImode : V32QImode);
43490 rtx x = gen_reg_rtx (hvmode);
43491
43492 ok = ix86_expand_vector_init_duplicate (false, hvmode, x, val);
43493 gcc_assert (ok);
43494
43495 x = gen_rtx_VEC_CONCAT (mode, x, x);
43496 emit_insn (gen_rtx_SET (target, x));
43497 }
43498 return true;
43499
43500 default:
43501 return false;
43502 }
43503 }
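
/* A small worked example of the "widen" path above (the value is
   hypothetical): broadcasting the QImode value 0xab into V8QImode without a
   direct duplicate pattern first builds the HImode value 0xabab via
   (0xab << 8) | 0xab, recurses to broadcast that into a V4HImode register,
   and finally moves the V8QImode lowpart of the result into the target.  */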
43504
43505 /* A subroutine of ix86_expand_vector_init. Store into TARGET a vector
43506 whose ONE_VAR element is VAR, and other elements are zero. Return true
43507 if successful. */
43508
43509 static bool
43510 ix86_expand_vector_init_one_nonzero (bool mmx_ok, machine_mode mode,
43511 rtx target, rtx var, int one_var)
43512 {
43513 machine_mode vsimode;
43514 rtx new_target;
43515 rtx x, tmp;
43516 bool use_vector_set = false;
43517
43518 switch (mode)
43519 {
43520 case V2DImode:
43521 /* For SSE4.1, we normally use vector set. But if the second
43522 element is zero and inter-unit moves are OK, we use movq
43523 instead. */
43524 use_vector_set = (TARGET_64BIT && TARGET_SSE4_1
43525 && !(TARGET_INTER_UNIT_MOVES_TO_VEC
43526 && one_var == 0));
43527 break;
43528 case V16QImode:
43529 case V4SImode:
43530 case V4SFmode:
43531 use_vector_set = TARGET_SSE4_1;
43532 break;
43533 case V8HImode:
43534 use_vector_set = TARGET_SSE2;
43535 break;
43536 case V4HImode:
43537 use_vector_set = TARGET_SSE || TARGET_3DNOW_A;
43538 break;
43539 case V32QImode:
43540 case V16HImode:
43541 case V8SImode:
43542 case V8SFmode:
43543 case V4DFmode:
43544 use_vector_set = TARGET_AVX;
43545 break;
43546 case V4DImode:
43547 /* Use ix86_expand_vector_set in 64bit mode only. */
43548 use_vector_set = TARGET_AVX && TARGET_64BIT;
43549 break;
43550 default:
43551 break;
43552 }
43553
43554 if (use_vector_set)
43555 {
43556 emit_insn (gen_rtx_SET (target, CONST0_RTX (mode)));
43557 var = force_reg (GET_MODE_INNER (mode), var);
43558 ix86_expand_vector_set (mmx_ok, target, var, one_var);
43559 return true;
43560 }
43561
43562 switch (mode)
43563 {
43564 case V2SFmode:
43565 case V2SImode:
43566 if (!mmx_ok)
43567 return false;
43568 /* FALLTHRU */
43569
43570 case V2DFmode:
43571 case V2DImode:
43572 if (one_var != 0)
43573 return false;
43574 var = force_reg (GET_MODE_INNER (mode), var);
43575 x = gen_rtx_VEC_CONCAT (mode, var, CONST0_RTX (GET_MODE_INNER (mode)));
43576 emit_insn (gen_rtx_SET (target, x));
43577 return true;
43578
43579 case V4SFmode:
43580 case V4SImode:
43581 if (!REG_P (target) || REGNO (target) < FIRST_PSEUDO_REGISTER)
43582 new_target = gen_reg_rtx (mode);
43583 else
43584 new_target = target;
43585 var = force_reg (GET_MODE_INNER (mode), var);
43586 x = gen_rtx_VEC_DUPLICATE (mode, var);
43587 x = gen_rtx_VEC_MERGE (mode, x, CONST0_RTX (mode), const1_rtx);
43588 emit_insn (gen_rtx_SET (new_target, x));
43589 if (one_var != 0)
43590 {
43591 /* We need to shuffle the value to the correct position, so
43592 create a new pseudo to store the intermediate result. */
43593
43594 /* With SSE2, we can use the integer shuffle insns. */
43595 if (mode != V4SFmode && TARGET_SSE2)
43596 {
43597 emit_insn (gen_sse2_pshufd_1 (new_target, new_target,
43598 const1_rtx,
43599 GEN_INT (one_var == 1 ? 0 : 1),
43600 GEN_INT (one_var == 2 ? 0 : 1),
43601 GEN_INT (one_var == 3 ? 0 : 1)));
43602 if (target != new_target)
43603 emit_move_insn (target, new_target);
43604 return true;
43605 }
43606
43607 /* Otherwise convert the intermediate result to V4SFmode and
43608 use the SSE1 shuffle instructions. */
43609 if (mode != V4SFmode)
43610 {
43611 tmp = gen_reg_rtx (V4SFmode);
43612 emit_move_insn (tmp, gen_lowpart (V4SFmode, new_target));
43613 }
43614 else
43615 tmp = new_target;
43616
43617 emit_insn (gen_sse_shufps_v4sf (tmp, tmp, tmp,
43618 const1_rtx,
43619 GEN_INT (one_var == 1 ? 0 : 1),
43620 GEN_INT (one_var == 2 ? 0+4 : 1+4),
43621 GEN_INT (one_var == 3 ? 0+4 : 1+4)));
43622
43623 if (mode != V4SFmode)
43624 emit_move_insn (target, gen_lowpart (V4SImode, tmp));
43625 else if (tmp != target)
43626 emit_move_insn (target, tmp);
43627 }
43628 else if (target != new_target)
43629 emit_move_insn (target, new_target);
43630 return true;
43631
43632 case V8HImode:
43633 case V16QImode:
43634 vsimode = V4SImode;
43635 goto widen;
43636 case V4HImode:
43637 case V8QImode:
43638 if (!mmx_ok)
43639 return false;
43640 vsimode = V2SImode;
43641 goto widen;
43642 widen:
43643 if (one_var != 0)
43644 return false;
43645
43646 /* Zero extend the variable element to SImode and recurse. */
43647 var = convert_modes (SImode, GET_MODE_INNER (mode), var, true);
43648
43649 x = gen_reg_rtx (vsimode);
43650 if (!ix86_expand_vector_init_one_nonzero (mmx_ok, vsimode, x,
43651 var, one_var))
43652 gcc_unreachable ();
43653
43654 emit_move_insn (target, gen_lowpart (mode, x));
43655 return true;
43656
43657 default:
43658 return false;
43659 }
43660 }
43661
43662 /* A subroutine of ix86_expand_vector_init. Store into TARGET a vector
43663 consisting of the values in VALS. It is known that all elements
43664 except ONE_VAR are constants. Return true if successful. */
43665
43666 static bool
43667 ix86_expand_vector_init_one_var (bool mmx_ok, machine_mode mode,
43668 rtx target, rtx vals, int one_var)
43669 {
43670 rtx var = XVECEXP (vals, 0, one_var);
43671 machine_mode wmode;
43672 rtx const_vec, x;
43673
43674 const_vec = copy_rtx (vals);
43675 XVECEXP (const_vec, 0, one_var) = CONST0_RTX (GET_MODE_INNER (mode));
43676 const_vec = gen_rtx_CONST_VECTOR (mode, XVEC (const_vec, 0));
43677
43678 switch (mode)
43679 {
43680 case V2DFmode:
43681 case V2DImode:
43682 case V2SFmode:
43683 case V2SImode:
43684 /* For the two element vectors, it's just as easy to use
43685 the general case. */
43686 return false;
43687
43688 case V4DImode:
43689 /* Use ix86_expand_vector_set in 64bit mode only. */
43690 if (!TARGET_64BIT)
43691 return false;
43692 /* FALLTHRU */
43693 case V4DFmode:
43694 case V8SFmode:
43695 case V8SImode:
43696 case V16HImode:
43697 case V32QImode:
43698 case V4SFmode:
43699 case V4SImode:
43700 case V8HImode:
43701 case V4HImode:
43702 break;
43703
43704 case V16QImode:
43705 if (TARGET_SSE4_1)
43706 break;
43707 wmode = V8HImode;
43708 goto widen;
43709 case V8QImode:
43710 wmode = V4HImode;
43711 goto widen;
43712 widen:
43713 /* There's no way to set one QImode entry easily. Combine
43714 the variable value with its adjacent constant value, and
43715 promote to an HImode set. */
43716 x = XVECEXP (vals, 0, one_var ^ 1);
43717 if (one_var & 1)
43718 {
43719 var = convert_modes (HImode, QImode, var, true);
43720 var = expand_simple_binop (HImode, ASHIFT, var, GEN_INT (8),
43721 NULL_RTX, 1, OPTAB_LIB_WIDEN);
43722 x = GEN_INT (INTVAL (x) & 0xff);
43723 }
43724 else
43725 {
43726 var = convert_modes (HImode, QImode, var, true);
43727 x = gen_int_mode (INTVAL (x) << 8, HImode);
43728 }
43729 if (x != const0_rtx)
43730 var = expand_simple_binop (HImode, IOR, var, x, var,
43731 1, OPTAB_LIB_WIDEN);
43732
43733 x = gen_reg_rtx (wmode);
43734 emit_move_insn (x, gen_lowpart (wmode, const_vec));
43735 ix86_expand_vector_set (mmx_ok, x, var, one_var >> 1);
43736
43737 emit_move_insn (target, gen_lowpart (mode, x));
43738 return true;
43739
43740 default:
43741 return false;
43742 }
43743
43744 emit_move_insn (target, const_vec);
43745 ix86_expand_vector_set (mmx_ok, target, var, one_var);
43746 return true;
43747 }
43748
43749 /* A subroutine of ix86_expand_vector_init_general. Use vector
43750 concatenate to handle the most general case: all values variable,
43751 and none identical. */
43752
43753 static void
43754 ix86_expand_vector_init_concat (machine_mode mode,
43755 rtx target, rtx *ops, int n)
43756 {
43757 machine_mode cmode, hmode = VOIDmode, gmode = VOIDmode;
43758 rtx first[16], second[8], third[4];
43759 rtvec v;
43760 int i, j;
43761
43762 switch (n)
43763 {
43764 case 2:
43765 switch (mode)
43766 {
43767 case V16SImode:
43768 cmode = V8SImode;
43769 break;
43770 case V16SFmode:
43771 cmode = V8SFmode;
43772 break;
43773 case V8DImode:
43774 cmode = V4DImode;
43775 break;
43776 case V8DFmode:
43777 cmode = V4DFmode;
43778 break;
43779 case V8SImode:
43780 cmode = V4SImode;
43781 break;
43782 case V8SFmode:
43783 cmode = V4SFmode;
43784 break;
43785 case V4DImode:
43786 cmode = V2DImode;
43787 break;
43788 case V4DFmode:
43789 cmode = V2DFmode;
43790 break;
43791 case V4SImode:
43792 cmode = V2SImode;
43793 break;
43794 case V4SFmode:
43795 cmode = V2SFmode;
43796 break;
43797 case V2DImode:
43798 cmode = DImode;
43799 break;
43800 case V2SImode:
43801 cmode = SImode;
43802 break;
43803 case V2DFmode:
43804 cmode = DFmode;
43805 break;
43806 case V2SFmode:
43807 cmode = SFmode;
43808 break;
43809 default:
43810 gcc_unreachable ();
43811 }
43812
43813 if (!register_operand (ops[1], cmode))
43814 ops[1] = force_reg (cmode, ops[1]);
43815 if (!register_operand (ops[0], cmode))
43816 ops[0] = force_reg (cmode, ops[0]);
43817 emit_insn (gen_rtx_SET (target, gen_rtx_VEC_CONCAT (mode, ops[0],
43818 ops[1])));
43819 break;
43820
43821 case 4:
43822 switch (mode)
43823 {
43824 case V4DImode:
43825 cmode = V2DImode;
43826 break;
43827 case V4DFmode:
43828 cmode = V2DFmode;
43829 break;
43830 case V4SImode:
43831 cmode = V2SImode;
43832 break;
43833 case V4SFmode:
43834 cmode = V2SFmode;
43835 break;
43836 default:
43837 gcc_unreachable ();
43838 }
43839 goto half;
43840
43841 case 8:
43842 switch (mode)
43843 {
43844 case V8DImode:
43845 cmode = V2DImode;
43846 hmode = V4DImode;
43847 break;
43848 case V8DFmode:
43849 cmode = V2DFmode;
43850 hmode = V4DFmode;
43851 break;
43852 case V8SImode:
43853 cmode = V2SImode;
43854 hmode = V4SImode;
43855 break;
43856 case V8SFmode:
43857 cmode = V2SFmode;
43858 hmode = V4SFmode;
43859 break;
43860 default:
43861 gcc_unreachable ();
43862 }
43863 goto half;
43864
43865 case 16:
43866 switch (mode)
43867 {
43868 case V16SImode:
43869 cmode = V2SImode;
43870 hmode = V4SImode;
43871 gmode = V8SImode;
43872 break;
43873 case V16SFmode:
43874 cmode = V2SFmode;
43875 hmode = V4SFmode;
43876 gmode = V8SFmode;
43877 break;
43878 default:
43879 gcc_unreachable ();
43880 }
43881 goto half;
43882
43883 half:
43884 /* FIXME: We process inputs backward to help RA. PR 36222. */
43885 i = n - 1;
43886 j = (n >> 1) - 1;
43887 for (; i > 0; i -= 2, j--)
43888 {
43889 first[j] = gen_reg_rtx (cmode);
43890 v = gen_rtvec (2, ops[i - 1], ops[i]);
43891 ix86_expand_vector_init (false, first[j],
43892 gen_rtx_PARALLEL (cmode, v));
43893 }
43894
43895 n >>= 1;
43896 if (n > 4)
43897 {
43898 gcc_assert (hmode != VOIDmode);
43899 gcc_assert (gmode != VOIDmode);
43900 for (i = j = 0; i < n; i += 2, j++)
43901 {
43902 second[j] = gen_reg_rtx (hmode);
43903 ix86_expand_vector_init_concat (hmode, second [j],
43904 &first [i], 2);
43905 }
43906 n >>= 1;
43907 for (i = j = 0; i < n; i += 2, j++)
43908 {
43909 third[j] = gen_reg_rtx (gmode);
43910 ix86_expand_vector_init_concat (gmode, third[j],
43911 &second[i], 2);
43912 }
43913 n >>= 1;
43914 ix86_expand_vector_init_concat (mode, target, third, n);
43915 }
43916 else if (n > 2)
43917 {
43918 gcc_assert (hmode != VOIDmode);
43919 for (i = j = 0; i < n; i += 2, j++)
43920 {
43921 second[j] = gen_reg_rtx (hmode);
43922 ix86_expand_vector_init_concat (hmode, second [j],
43923 &first [i], 2);
43924 }
43925 n >>= 1;
43926 ix86_expand_vector_init_concat (mode, target, second, n);
43927 }
43928 else
43929 ix86_expand_vector_init_concat (mode, target, first, n);
43930 break;
43931
43932 default:
43933 gcc_unreachable ();
43934 }
43935 }
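
/* For example (an illustrative V8SFmode build from eight variable scalars):
   the n == 8 case pairs the scalars into four V2SFmode registers via
   recursive ix86_expand_vector_init calls, the n > 2 branch then
   concatenates those into two V4SFmode halves, and a final recursive call
   emits the VEC_CONCAT that produces the V8SFmode result.  */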
43936
43937 /* A subroutine of ix86_expand_vector_init_general. Use vector
43938 interleave to handle the most general case: all values variable,
43939 and none identical. */
43940
43941 static void
43942 ix86_expand_vector_init_interleave (machine_mode mode,
43943 rtx target, rtx *ops, int n)
43944 {
43945 machine_mode first_imode, second_imode, third_imode, inner_mode;
43946 int i, j;
43947 rtx op0, op1;
43948 rtx (*gen_load_even) (rtx, rtx, rtx);
43949 rtx (*gen_interleave_first_low) (rtx, rtx, rtx);
43950 rtx (*gen_interleave_second_low) (rtx, rtx, rtx);
43951
43952 switch (mode)
43953 {
43954 case V8HImode:
43955 gen_load_even = gen_vec_setv8hi;
43956 gen_interleave_first_low = gen_vec_interleave_lowv4si;
43957 gen_interleave_second_low = gen_vec_interleave_lowv2di;
43958 inner_mode = HImode;
43959 first_imode = V4SImode;
43960 second_imode = V2DImode;
43961 third_imode = VOIDmode;
43962 break;
43963 case V16QImode:
43964 gen_load_even = gen_vec_setv16qi;
43965 gen_interleave_first_low = gen_vec_interleave_lowv8hi;
43966 gen_interleave_second_low = gen_vec_interleave_lowv4si;
43967 inner_mode = QImode;
43968 first_imode = V8HImode;
43969 second_imode = V4SImode;
43970 third_imode = V2DImode;
43971 break;
43972 default:
43973 gcc_unreachable ();
43974 }
43975
43976 for (i = 0; i < n; i++)
43977 {
43978 /* Extend the odd element to SImode using a paradoxical SUBREG. */
43979 op0 = gen_reg_rtx (SImode);
43980 emit_move_insn (op0, gen_lowpart (SImode, ops [i + i]));
43981
43982 /* Insert the SImode value as low element of V4SImode vector. */
43983 op1 = gen_reg_rtx (V4SImode);
43984 op0 = gen_rtx_VEC_MERGE (V4SImode,
43985 gen_rtx_VEC_DUPLICATE (V4SImode,
43986 op0),
43987 CONST0_RTX (V4SImode),
43988 const1_rtx);
43989 emit_insn (gen_rtx_SET (op1, op0));
43990
43991 /* Cast the V4SImode vector back to a vector in the original mode. */
43992 op0 = gen_reg_rtx (mode);
43993 emit_move_insn (op0, gen_lowpart (mode, op1));
43994
43995 /* Load even elements into the second position. */
43996 emit_insn (gen_load_even (op0,
43997 force_reg (inner_mode,
43998 ops [i + i + 1]),
43999 const1_rtx));
44000
44001 /* Cast vector to FIRST_IMODE vector. */
44002 ops[i] = gen_reg_rtx (first_imode);
44003 emit_move_insn (ops[i], gen_lowpart (first_imode, op0));
44004 }
44005
44006 /* Interleave low FIRST_IMODE vectors. */
44007 for (i = j = 0; i < n; i += 2, j++)
44008 {
44009 op0 = gen_reg_rtx (first_imode);
44010 emit_insn (gen_interleave_first_low (op0, ops[i], ops[i + 1]));
44011
44012 /* Cast FIRST_IMODE vector to SECOND_IMODE vector. */
44013 ops[j] = gen_reg_rtx (second_imode);
44014 emit_move_insn (ops[j], gen_lowpart (second_imode, op0));
44015 }
44016
44017 /* Interleave low SECOND_IMODE vectors. */
44018 switch (second_imode)
44019 {
44020 case V4SImode:
44021 for (i = j = 0; i < n / 2; i += 2, j++)
44022 {
44023 op0 = gen_reg_rtx (second_imode);
44024 emit_insn (gen_interleave_second_low (op0, ops[i],
44025 ops[i + 1]));
44026
44027 /* Cast the SECOND_IMODE vector to the THIRD_IMODE
44028 vector. */
44029 ops[j] = gen_reg_rtx (third_imode);
44030 emit_move_insn (ops[j], gen_lowpart (third_imode, op0));
44031 }
44032 second_imode = V2DImode;
44033 gen_interleave_second_low = gen_vec_interleave_lowv2di;
44034 /* FALLTHRU */
44035
44036 case V2DImode:
44037 op0 = gen_reg_rtx (second_imode);
44038 emit_insn (gen_interleave_second_low (op0, ops[0],
44039 ops[1]));
44040
44041 /* Cast the SECOND_IMODE vector back to a vector in the original
44042 mode. */
44043 emit_insn (gen_rtx_SET (target, gen_lowpart (mode, op0)));
44044 break;
44045
44046 default:
44047 gcc_unreachable ();
44048 }
44049 }
44050
44051 /* A subroutine of ix86_expand_vector_init. Handle the most general case:
44052 all values variable, and none identical. */
44053
44054 static void
44055 ix86_expand_vector_init_general (bool mmx_ok, machine_mode mode,
44056 rtx target, rtx vals)
44057 {
44058 rtx ops[64], op0, op1, op2, op3, op4, op5;
44059 machine_mode half_mode = VOIDmode;
44060 machine_mode quarter_mode = VOIDmode;
44061 int n, i;
44062
44063 switch (mode)
44064 {
44065 case V2SFmode:
44066 case V2SImode:
44067 if (!mmx_ok && !TARGET_SSE)
44068 break;
44069 /* FALLTHRU */
44070
44071 case V16SImode:
44072 case V16SFmode:
44073 case V8DFmode:
44074 case V8DImode:
44075 case V8SFmode:
44076 case V8SImode:
44077 case V4DFmode:
44078 case V4DImode:
44079 case V4SFmode:
44080 case V4SImode:
44081 case V2DFmode:
44082 case V2DImode:
44083 n = GET_MODE_NUNITS (mode);
44084 for (i = 0; i < n; i++)
44085 ops[i] = XVECEXP (vals, 0, i);
44086 ix86_expand_vector_init_concat (mode, target, ops, n);
44087 return;
44088
44089 case V32QImode:
44090 half_mode = V16QImode;
44091 goto half;
44092
44093 case V16HImode:
44094 half_mode = V8HImode;
44095 goto half;
44096
44097 half:
44098 n = GET_MODE_NUNITS (mode);
44099 for (i = 0; i < n; i++)
44100 ops[i] = XVECEXP (vals, 0, i);
44101 op0 = gen_reg_rtx (half_mode);
44102 op1 = gen_reg_rtx (half_mode);
44103 ix86_expand_vector_init_interleave (half_mode, op0, ops,
44104 n >> 2);
44105 ix86_expand_vector_init_interleave (half_mode, op1,
44106 &ops [n >> 1], n >> 2);
44107 emit_insn (gen_rtx_SET (target, gen_rtx_VEC_CONCAT (mode, op0, op1)));
44108 return;
44109
44110 case V64QImode:
44111 quarter_mode = V16QImode;
44112 half_mode = V32QImode;
44113 goto quarter;
44114
44115 case V32HImode:
44116 quarter_mode = V8HImode;
44117 half_mode = V16HImode;
44118 goto quarter;
44119
44120 quarter:
44121 n = GET_MODE_NUNITS (mode);
44122 for (i = 0; i < n; i++)
44123 ops[i] = XVECEXP (vals, 0, i);
44124 op0 = gen_reg_rtx (quarter_mode);
44125 op1 = gen_reg_rtx (quarter_mode);
44126 op2 = gen_reg_rtx (quarter_mode);
44127 op3 = gen_reg_rtx (quarter_mode);
44128 op4 = gen_reg_rtx (half_mode);
44129 op5 = gen_reg_rtx (half_mode);
44130 ix86_expand_vector_init_interleave (quarter_mode, op0, ops,
44131 n >> 3);
44132 ix86_expand_vector_init_interleave (quarter_mode, op1,
44133 &ops [n >> 2], n >> 3);
44134 ix86_expand_vector_init_interleave (quarter_mode, op2,
44135 &ops [n >> 1], n >> 3);
44136 ix86_expand_vector_init_interleave (quarter_mode, op3,
44137 &ops [(n >> 1) | (n >> 2)], n >> 3);
44138 emit_insn (gen_rtx_SET (op4, gen_rtx_VEC_CONCAT (half_mode, op0, op1)));
44139 emit_insn (gen_rtx_SET (op5, gen_rtx_VEC_CONCAT (half_mode, op2, op3)));
44140 emit_insn (gen_rtx_SET (target, gen_rtx_VEC_CONCAT (mode, op4, op5)));
44141 return;
44142
44143 case V16QImode:
44144 if (!TARGET_SSE4_1)
44145 break;
44146 /* FALLTHRU */
44147
44148 case V8HImode:
44149 if (!TARGET_SSE2)
44150 break;
44151
44152 /* Don't use ix86_expand_vector_init_interleave if we can't
44153 move from GPR to SSE register directly. */
44154 if (!TARGET_INTER_UNIT_MOVES_TO_VEC)
44155 break;
44156
44157 n = GET_MODE_NUNITS (mode);
44158 for (i = 0; i < n; i++)
44159 ops[i] = XVECEXP (vals, 0, i);
44160 ix86_expand_vector_init_interleave (mode, target, ops, n >> 1);
44161 return;
44162
44163 case V4HImode:
44164 case V8QImode:
44165 break;
44166
44167 default:
44168 gcc_unreachable ();
44169 }
44170
44171 {
44172 int i, j, n_elts, n_words, n_elt_per_word;
44173 machine_mode inner_mode;
44174 rtx words[4], shift;
44175
44176 inner_mode = GET_MODE_INNER (mode);
44177 n_elts = GET_MODE_NUNITS (mode);
44178 n_words = GET_MODE_SIZE (mode) / UNITS_PER_WORD;
44179 n_elt_per_word = n_elts / n_words;
44180 shift = GEN_INT (GET_MODE_BITSIZE (inner_mode));
44181
44182 for (i = 0; i < n_words; ++i)
44183 {
44184 rtx word = NULL_RTX;
44185
44186 for (j = 0; j < n_elt_per_word; ++j)
44187 {
44188 rtx elt = XVECEXP (vals, 0, (i+1)*n_elt_per_word - j - 1);
44189 elt = convert_modes (word_mode, inner_mode, elt, true);
44190
44191 if (j == 0)
44192 word = elt;
44193 else
44194 {
44195 word = expand_simple_binop (word_mode, ASHIFT, word, shift,
44196 word, 1, OPTAB_LIB_WIDEN);
44197 word = expand_simple_binop (word_mode, IOR, word, elt,
44198 word, 1, OPTAB_LIB_WIDEN);
44199 }
44200 }
44201
44202 words[i] = word;
44203 }
44204
44205 if (n_words == 1)
44206 emit_move_insn (target, gen_lowpart (mode, words[0]));
44207 else if (n_words == 2)
44208 {
44209 rtx tmp = gen_reg_rtx (mode);
44210 emit_clobber (tmp);
44211 emit_move_insn (gen_lowpart (word_mode, tmp), words[0]);
44212 emit_move_insn (gen_highpart (word_mode, tmp), words[1]);
44213 emit_move_insn (target, tmp);
44214 }
44215 else if (n_words == 4)
44216 {
44217 rtx tmp = gen_reg_rtx (V4SImode);
44218 gcc_assert (word_mode == SImode);
44219 vals = gen_rtx_PARALLEL (V4SImode, gen_rtvec_v (4, words));
44220 ix86_expand_vector_init_general (false, V4SImode, tmp, vals);
44221 emit_move_insn (target, gen_lowpart (mode, tmp));
44222 }
44223 else
44224 gcc_unreachable ();
44225 }
44226 }
44227
44228 /* Initialize vector TARGET via VALS. Suppress the use of MMX
44229 instructions unless MMX_OK is true. */
44230
44231 void
44232 ix86_expand_vector_init (bool mmx_ok, rtx target, rtx vals)
44233 {
44234 machine_mode mode = GET_MODE (target);
44235 machine_mode inner_mode = GET_MODE_INNER (mode);
44236 int n_elts = GET_MODE_NUNITS (mode);
44237 int n_var = 0, one_var = -1;
44238 bool all_same = true, all_const_zero = true;
44239 int i;
44240 rtx x;
44241
44242 for (i = 0; i < n_elts; ++i)
44243 {
44244 x = XVECEXP (vals, 0, i);
44245 if (!(CONST_SCALAR_INT_P (x)
44246 || CONST_DOUBLE_P (x)
44247 || CONST_FIXED_P (x)))
44248 n_var++, one_var = i;
44249 else if (x != CONST0_RTX (inner_mode))
44250 all_const_zero = false;
44251 if (i > 0 && !rtx_equal_p (x, XVECEXP (vals, 0, 0)))
44252 all_same = false;
44253 }
44254
44255 /* Constants are best loaded from the constant pool. */
44256 if (n_var == 0)
44257 {
44258 emit_move_insn (target, gen_rtx_CONST_VECTOR (mode, XVEC (vals, 0)));
44259 return;
44260 }
44261
44262 /* If all values are identical, broadcast the value. */
44263 if (all_same
44264 && ix86_expand_vector_init_duplicate (mmx_ok, mode, target,
44265 XVECEXP (vals, 0, 0)))
44266 return;
44267
44268 /* Values where only one field is non-constant are best loaded from
44269 the pool and overwritten via move later. */
44270 if (n_var == 1)
44271 {
44272 if (all_const_zero
44273 && ix86_expand_vector_init_one_nonzero (mmx_ok, mode, target,
44274 XVECEXP (vals, 0, one_var),
44275 one_var))
44276 return;
44277
44278 if (ix86_expand_vector_init_one_var (mmx_ok, mode, target, vals, one_var))
44279 return;
44280 }
44281
44282 ix86_expand_vector_init_general (mmx_ok, mode, target, vals);
44283 }
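
/* To illustrate the dispatch above, consider a hypothetical V4SImode build
   of { x, 0, 0, 0 } with x variable: n_var is 1 and every other element is
   zero, so ix86_expand_vector_init_one_nonzero is tried first (with SSE4.1
   it reduces to a single insert into a zeroed vector); only if that and
   ix86_expand_vector_init_one_var fail does the fully general
   concat/interleave path run.  */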
44284
44285 void
44286 ix86_expand_vector_set (bool mmx_ok, rtx target, rtx val, int elt)
44287 {
44288 machine_mode mode = GET_MODE (target);
44289 machine_mode inner_mode = GET_MODE_INNER (mode);
44290 machine_mode half_mode;
44291 bool use_vec_merge = false;
44292 rtx tmp;
44293 static rtx (*gen_extract[6][2]) (rtx, rtx)
44294 = {
44295 { gen_vec_extract_lo_v32qi, gen_vec_extract_hi_v32qi },
44296 { gen_vec_extract_lo_v16hi, gen_vec_extract_hi_v16hi },
44297 { gen_vec_extract_lo_v8si, gen_vec_extract_hi_v8si },
44298 { gen_vec_extract_lo_v4di, gen_vec_extract_hi_v4di },
44299 { gen_vec_extract_lo_v8sf, gen_vec_extract_hi_v8sf },
44300 { gen_vec_extract_lo_v4df, gen_vec_extract_hi_v4df }
44301 };
44302 static rtx (*gen_insert[6][2]) (rtx, rtx, rtx)
44303 = {
44304 { gen_vec_set_lo_v32qi, gen_vec_set_hi_v32qi },
44305 { gen_vec_set_lo_v16hi, gen_vec_set_hi_v16hi },
44306 { gen_vec_set_lo_v8si, gen_vec_set_hi_v8si },
44307 { gen_vec_set_lo_v4di, gen_vec_set_hi_v4di },
44308 { gen_vec_set_lo_v8sf, gen_vec_set_hi_v8sf },
44309 { gen_vec_set_lo_v4df, gen_vec_set_hi_v4df }
44310 };
44311 int i, j, n;
44312 machine_mode mmode = VOIDmode;
44313 rtx (*gen_blendm) (rtx, rtx, rtx, rtx);
44314
44315 switch (mode)
44316 {
44317 case V2SFmode:
44318 case V2SImode:
44319 if (mmx_ok)
44320 {
44321 tmp = gen_reg_rtx (GET_MODE_INNER (mode));
44322 ix86_expand_vector_extract (true, tmp, target, 1 - elt);
44323 if (elt == 0)
44324 tmp = gen_rtx_VEC_CONCAT (mode, val, tmp);
44325 else
44326 tmp = gen_rtx_VEC_CONCAT (mode, tmp, val);
44327 emit_insn (gen_rtx_SET (target, tmp));
44328 return;
44329 }
44330 break;
44331
44332 case V2DImode:
44333 use_vec_merge = TARGET_SSE4_1 && TARGET_64BIT;
44334 if (use_vec_merge)
44335 break;
44336
44337 tmp = gen_reg_rtx (GET_MODE_INNER (mode));
44338 ix86_expand_vector_extract (false, tmp, target, 1 - elt);
44339 if (elt == 0)
44340 tmp = gen_rtx_VEC_CONCAT (mode, val, tmp);
44341 else
44342 tmp = gen_rtx_VEC_CONCAT (mode, tmp, val);
44343 emit_insn (gen_rtx_SET (target, tmp));
44344 return;
44345
44346 case V2DFmode:
44347 {
44348 rtx op0, op1;
44349
44350 /* For the two element vectors, we implement a VEC_CONCAT with
44351 the extraction of the other element. */
44352
44353 tmp = gen_rtx_PARALLEL (VOIDmode, gen_rtvec (1, GEN_INT (1 - elt)));
44354 tmp = gen_rtx_VEC_SELECT (inner_mode, target, tmp);
44355
44356 if (elt == 0)
44357 op0 = val, op1 = tmp;
44358 else
44359 op0 = tmp, op1 = val;
44360
44361 tmp = gen_rtx_VEC_CONCAT (mode, op0, op1);
44362 emit_insn (gen_rtx_SET (target, tmp));
44363 }
44364 return;
44365
44366 case V4SFmode:
44367 use_vec_merge = TARGET_SSE4_1;
44368 if (use_vec_merge)
44369 break;
44370
44371 switch (elt)
44372 {
44373 case 0:
44374 use_vec_merge = true;
44375 break;
44376
44377 case 1:
44378 /* tmp = target = A B C D */
44379 tmp = copy_to_reg (target);
44380 /* target = A A B B */
44381 emit_insn (gen_vec_interleave_lowv4sf (target, target, target));
44382 /* target = X A B B */
44383 ix86_expand_vector_set (false, target, val, 0);
44384 /* target = A X C D */
44385 emit_insn (gen_sse_shufps_v4sf (target, target, tmp,
44386 const1_rtx, const0_rtx,
44387 GEN_INT (2+4), GEN_INT (3+4)));
44388 return;
44389
44390 case 2:
44391 /* tmp = target = A B C D */
44392 tmp = copy_to_reg (target);
44393 /* tmp = X B C D */
44394 ix86_expand_vector_set (false, tmp, val, 0);
44395 /* target = A B X D */
44396 emit_insn (gen_sse_shufps_v4sf (target, target, tmp,
44397 const0_rtx, const1_rtx,
44398 GEN_INT (0+4), GEN_INT (3+4)));
44399 return;
44400
44401 case 3:
44402 /* tmp = target = A B C D */
44403 tmp = copy_to_reg (target);
44404 /* tmp = X B C D */
44405 ix86_expand_vector_set (false, tmp, val, 0);
44406 /* target = A B C X */
44407 emit_insn (gen_sse_shufps_v4sf (target, target, tmp,
44408 const0_rtx, const1_rtx,
44409 GEN_INT (2+4), GEN_INT (0+4)));
44410 return;
44411
44412 default:
44413 gcc_unreachable ();
44414 }
44415 break;
44416
44417 case V4SImode:
44418 use_vec_merge = TARGET_SSE4_1;
44419 if (use_vec_merge)
44420 break;
44421
44422 /* Element 0 handled by vec_merge below. */
44423 if (elt == 0)
44424 {
44425 use_vec_merge = true;
44426 break;
44427 }
44428
44429 if (TARGET_SSE2)
44430 {
44431 /* With SSE2, use integer shuffles to swap element 0 and ELT,
44432 store into element 0, then shuffle them back. */
44433
44434 rtx order[4];
44435
44436 order[0] = GEN_INT (elt);
44437 order[1] = const1_rtx;
44438 order[2] = const2_rtx;
44439 order[3] = GEN_INT (3);
44440 order[elt] = const0_rtx;
44441
44442 emit_insn (gen_sse2_pshufd_1 (target, target, order[0],
44443 order[1], order[2], order[3]));
44444
44445 ix86_expand_vector_set (false, target, val, 0);
44446
44447 emit_insn (gen_sse2_pshufd_1 (target, target, order[0],
44448 order[1], order[2], order[3]));
44449 }
44450 else
44451 {
44452 /* For SSE1, we have to reuse the V4SF code. */
44453 rtx t = gen_reg_rtx (V4SFmode);
44454 emit_move_insn (t, gen_lowpart (V4SFmode, target));
44455 ix86_expand_vector_set (false, t, gen_lowpart (SFmode, val), elt);
44456 emit_move_insn (target, gen_lowpart (mode, t));
44457 }
44458 return;
44459
44460 case V8HImode:
44461 use_vec_merge = TARGET_SSE2;
44462 break;
44463 case V4HImode:
44464 use_vec_merge = mmx_ok && (TARGET_SSE || TARGET_3DNOW_A);
44465 break;
44466
44467 case V16QImode:
44468 use_vec_merge = TARGET_SSE4_1;
44469 break;
44470
44471 case V8QImode:
44472 break;
44473
44474 case V32QImode:
44475 half_mode = V16QImode;
44476 j = 0;
44477 n = 16;
44478 goto half;
44479
44480 case V16HImode:
44481 half_mode = V8HImode;
44482 j = 1;
44483 n = 8;
44484 goto half;
44485
44486 case V8SImode:
44487 half_mode = V4SImode;
44488 j = 2;
44489 n = 4;
44490 goto half;
44491
44492 case V4DImode:
44493 half_mode = V2DImode;
44494 j = 3;
44495 n = 2;
44496 goto half;
44497
44498 case V8SFmode:
44499 half_mode = V4SFmode;
44500 j = 4;
44501 n = 4;
44502 goto half;
44503
44504 case V4DFmode:
44505 half_mode = V2DFmode;
44506 j = 5;
44507 n = 2;
44508 goto half;
44509
44510 half:
44511 /* Compute offset. */
44512 i = elt / n;
44513 elt %= n;
44514
44515 gcc_assert (i <= 1);
44516
44517 /* Extract the half. */
44518 tmp = gen_reg_rtx (half_mode);
44519 emit_insn (gen_extract[j][i] (tmp, target));
44520
44521 /* Put val in tmp at elt. */
44522 ix86_expand_vector_set (false, tmp, val, elt);
44523
44524 /* Put it back. */
44525 emit_insn (gen_insert[j][i] (target, target, tmp));
44526 return;
44527
44528 case V8DFmode:
44529 if (TARGET_AVX512F)
44530 {
44531 mmode = QImode;
44532 gen_blendm = gen_avx512f_blendmv8df;
44533 }
44534 break;
44535
44536 case V8DImode:
44537 if (TARGET_AVX512F)
44538 {
44539 mmode = QImode;
44540 gen_blendm = gen_avx512f_blendmv8di;
44541 }
44542 break;
44543
44544 case V16SFmode:
44545 if (TARGET_AVX512F)
44546 {
44547 mmode = HImode;
44548 gen_blendm = gen_avx512f_blendmv16sf;
44549 }
44550 break;
44551
44552 case V16SImode:
44553 if (TARGET_AVX512F)
44554 {
44555 mmode = HImode;
44556 gen_blendm = gen_avx512f_blendmv16si;
44557 }
44558 break;
44559
44560 case V32HImode:
44561 if (TARGET_AVX512F && TARGET_AVX512BW)
44562 {
44563 mmode = SImode;
44564 gen_blendm = gen_avx512bw_blendmv32hi;
44565 }
44566 break;
44567
44568 case V64QImode:
44569 if (TARGET_AVX512F && TARGET_AVX512BW)
44570 {
44571 mmode = DImode;
44572 gen_blendm = gen_avx512bw_blendmv64qi;
44573 }
44574 break;
44575
44576 default:
44577 break;
44578 }
44579
44580 if (mmode != VOIDmode)
44581 {
44582 tmp = gen_reg_rtx (mode);
44583 emit_insn (gen_rtx_SET (tmp, gen_rtx_VEC_DUPLICATE (mode, val)));
44584 /* The avx512*_blendm<mode> expanders have a different operand order
44585 from VEC_MERGE. In VEC_MERGE, the first input operand is used for
44586 elements where the mask is set and the second input operand otherwise;
44587 in {sse,avx}*_*blend* the first input operand is used for elements
44588 where the mask is clear and the second input operand otherwise. */
44589 emit_insn (gen_blendm (target, target, tmp,
44590 force_reg (mmode,
44591 gen_int_mode (1 << elt, mmode))));
44592 }
44593 else if (use_vec_merge)
44594 {
44595 tmp = gen_rtx_VEC_DUPLICATE (mode, val);
44596 tmp = gen_rtx_VEC_MERGE (mode, tmp, target, GEN_INT (1 << elt));
44597 emit_insn (gen_rtx_SET (target, tmp));
44598 }
44599 else
44600 {
44601 rtx mem = assign_stack_temp (mode, GET_MODE_SIZE (mode));
44602
44603 emit_move_insn (mem, target);
44604
44605 tmp = adjust_address (mem, inner_mode, elt*GET_MODE_SIZE (inner_mode));
44606 emit_move_insn (tmp, val);
44607
44608 emit_move_insn (target, mem);
44609 }
44610 }
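
/* A worked example of the SSE2 fallback above for V4SImode (the element
   index is chosen for illustration): setting element 2 first applies pshufd
   with the order { 2, 1, 0, 3 }, which swaps elements 0 and 2, then stores
   the new value into element 0, and finally applies the same pshufd again to
   swap the two elements back, leaving the rest of the vector untouched.  */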
44611
44612 void
44613 ix86_expand_vector_extract (bool mmx_ok, rtx target, rtx vec, int elt)
44614 {
44615 machine_mode mode = GET_MODE (vec);
44616 machine_mode inner_mode = GET_MODE_INNER (mode);
44617 bool use_vec_extr = false;
44618 rtx tmp;
44619
44620 switch (mode)
44621 {
44622 case V2SImode:
44623 case V2SFmode:
44624 if (!mmx_ok)
44625 break;
44626 /* FALLTHRU */
44627
44628 case V2DFmode:
44629 case V2DImode:
44630 use_vec_extr = true;
44631 break;
44632
44633 case V4SFmode:
44634 use_vec_extr = TARGET_SSE4_1;
44635 if (use_vec_extr)
44636 break;
44637
44638 switch (elt)
44639 {
44640 case 0:
44641 tmp = vec;
44642 break;
44643
44644 case 1:
44645 case 3:
44646 tmp = gen_reg_rtx (mode);
44647 emit_insn (gen_sse_shufps_v4sf (tmp, vec, vec,
44648 GEN_INT (elt), GEN_INT (elt),
44649 GEN_INT (elt+4), GEN_INT (elt+4)));
44650 break;
44651
44652 case 2:
44653 tmp = gen_reg_rtx (mode);
44654 emit_insn (gen_vec_interleave_highv4sf (tmp, vec, vec));
44655 break;
44656
44657 default:
44658 gcc_unreachable ();
44659 }
44660 vec = tmp;
44661 use_vec_extr = true;
44662 elt = 0;
44663 break;
44664
44665 case V4SImode:
44666 use_vec_extr = TARGET_SSE4_1;
44667 if (use_vec_extr)
44668 break;
44669
44670 if (TARGET_SSE2)
44671 {
44672 switch (elt)
44673 {
44674 case 0:
44675 tmp = vec;
44676 break;
44677
44678 case 1:
44679 case 3:
44680 tmp = gen_reg_rtx (mode);
44681 emit_insn (gen_sse2_pshufd_1 (tmp, vec,
44682 GEN_INT (elt), GEN_INT (elt),
44683 GEN_INT (elt), GEN_INT (elt)));
44684 break;
44685
44686 case 2:
44687 tmp = gen_reg_rtx (mode);
44688 emit_insn (gen_vec_interleave_highv4si (tmp, vec, vec));
44689 break;
44690
44691 default:
44692 gcc_unreachable ();
44693 }
44694 vec = tmp;
44695 use_vec_extr = true;
44696 elt = 0;
44697 }
44698 else
44699 {
44700 /* For SSE1, we have to reuse the V4SF code. */
44701 ix86_expand_vector_extract (false, gen_lowpart (SFmode, target),
44702 gen_lowpart (V4SFmode, vec), elt);
44703 return;
44704 }
44705 break;
44706
44707 case V8HImode:
44708 use_vec_extr = TARGET_SSE2;
44709 break;
44710 case V4HImode:
44711 use_vec_extr = mmx_ok && (TARGET_SSE || TARGET_3DNOW_A);
44712 break;
44713
44714 case V16QImode:
44715 use_vec_extr = TARGET_SSE4_1;
44716 break;
44717
44718 case V8SFmode:
44719 if (TARGET_AVX)
44720 {
44721 tmp = gen_reg_rtx (V4SFmode);
44722 if (elt < 4)
44723 emit_insn (gen_vec_extract_lo_v8sf (tmp, vec));
44724 else
44725 emit_insn (gen_vec_extract_hi_v8sf (tmp, vec));
44726 ix86_expand_vector_extract (false, target, tmp, elt & 3);
44727 return;
44728 }
44729 break;
44730
44731 case V4DFmode:
44732 if (TARGET_AVX)
44733 {
44734 tmp = gen_reg_rtx (V2DFmode);
44735 if (elt < 2)
44736 emit_insn (gen_vec_extract_lo_v4df (tmp, vec));
44737 else
44738 emit_insn (gen_vec_extract_hi_v4df (tmp, vec));
44739 ix86_expand_vector_extract (false, target, tmp, elt & 1);
44740 return;
44741 }
44742 break;
44743
44744 case V32QImode:
44745 if (TARGET_AVX)
44746 {
44747 tmp = gen_reg_rtx (V16QImode);
44748 if (elt < 16)
44749 emit_insn (gen_vec_extract_lo_v32qi (tmp, vec));
44750 else
44751 emit_insn (gen_vec_extract_hi_v32qi (tmp, vec));
44752 ix86_expand_vector_extract (false, target, tmp, elt & 15);
44753 return;
44754 }
44755 break;
44756
44757 case V16HImode:
44758 if (TARGET_AVX)
44759 {
44760 tmp = gen_reg_rtx (V8HImode);
44761 if (elt < 8)
44762 emit_insn (gen_vec_extract_lo_v16hi (tmp, vec));
44763 else
44764 emit_insn (gen_vec_extract_hi_v16hi (tmp, vec));
44765 ix86_expand_vector_extract (false, target, tmp, elt & 7);
44766 return;
44767 }
44768 break;
44769
44770 case V8SImode:
44771 if (TARGET_AVX)
44772 {
44773 tmp = gen_reg_rtx (V4SImode);
44774 if (elt < 4)
44775 emit_insn (gen_vec_extract_lo_v8si (tmp, vec));
44776 else
44777 emit_insn (gen_vec_extract_hi_v8si (tmp, vec));
44778 ix86_expand_vector_extract (false, target, tmp, elt & 3);
44779 return;
44780 }
44781 break;
44782
44783 case V4DImode:
44784 if (TARGET_AVX)
44785 {
44786 tmp = gen_reg_rtx (V2DImode);
44787 if (elt < 2)
44788 emit_insn (gen_vec_extract_lo_v4di (tmp, vec));
44789 else
44790 emit_insn (gen_vec_extract_hi_v4di (tmp, vec));
44791 ix86_expand_vector_extract (false, target, tmp, elt & 1);
44792 return;
44793 }
44794 break;
44795
44796 case V32HImode:
44797 if (TARGET_AVX512BW)
44798 {
44799 tmp = gen_reg_rtx (V16HImode);
44800 if (elt < 16)
44801 emit_insn (gen_vec_extract_lo_v32hi (tmp, vec));
44802 else
44803 emit_insn (gen_vec_extract_hi_v32hi (tmp, vec));
44804 ix86_expand_vector_extract (false, target, tmp, elt & 15);
44805 return;
44806 }
44807 break;
44808
44809 case V64QImode:
44810 if (TARGET_AVX512BW)
44811 {
44812 tmp = gen_reg_rtx (V32QImode);
44813 if (elt < 32)
44814 emit_insn (gen_vec_extract_lo_v64qi (tmp, vec));
44815 else
44816 emit_insn (gen_vec_extract_hi_v64qi (tmp, vec));
44817 ix86_expand_vector_extract (false, target, tmp, elt & 31);
44818 return;
44819 }
44820 break;
44821
44822 case V16SFmode:
44823 tmp = gen_reg_rtx (V8SFmode);
44824 if (elt < 8)
44825 emit_insn (gen_vec_extract_lo_v16sf (tmp, vec));
44826 else
44827 emit_insn (gen_vec_extract_hi_v16sf (tmp, vec));
44828 ix86_expand_vector_extract (false, target, tmp, elt & 7);
44829 return;
44830
44831 case V8DFmode:
44832 tmp = gen_reg_rtx (V4DFmode);
44833 if (elt < 4)
44834 emit_insn (gen_vec_extract_lo_v8df (tmp, vec));
44835 else
44836 emit_insn (gen_vec_extract_hi_v8df (tmp, vec));
44837 ix86_expand_vector_extract (false, target, tmp, elt & 3);
44838 return;
44839
44840 case V16SImode:
44841 tmp = gen_reg_rtx (V8SImode);
44842 if (elt < 8)
44843 emit_insn (gen_vec_extract_lo_v16si (tmp, vec));
44844 else
44845 emit_insn (gen_vec_extract_hi_v16si (tmp, vec));
44846 ix86_expand_vector_extract (false, target, tmp, elt & 7);
44847 return;
44848
44849 case V8DImode:
44850 tmp = gen_reg_rtx (V4DImode);
44851 if (elt < 4)
44852 emit_insn (gen_vec_extract_lo_v8di (tmp, vec));
44853 else
44854 emit_insn (gen_vec_extract_hi_v8di (tmp, vec));
44855 ix86_expand_vector_extract (false, target, tmp, elt & 3);
44856 return;
44857
44858 case V8QImode:
44859 /* ??? Could extract the appropriate HImode element and shift. */
44860 default:
44861 break;
44862 }
44863
44864 if (use_vec_extr)
44865 {
44866 tmp = gen_rtx_PARALLEL (VOIDmode, gen_rtvec (1, GEN_INT (elt)));
44867 tmp = gen_rtx_VEC_SELECT (inner_mode, vec, tmp);
44868
44869 /* Let the rtl optimizers know about the zero extension performed. */
44870 if (inner_mode == QImode || inner_mode == HImode)
44871 {
44872 tmp = gen_rtx_ZERO_EXTEND (SImode, tmp);
44873 target = gen_lowpart (SImode, target);
44874 }
44875
44876 emit_insn (gen_rtx_SET (target, tmp));
44877 }
44878 else
44879 {
44880 rtx mem = assign_stack_temp (mode, GET_MODE_SIZE (mode));
44881
44882 emit_move_insn (mem, vec);
44883
44884 tmp = adjust_address (mem, inner_mode, elt*GET_MODE_SIZE (inner_mode));
44885 emit_move_insn (target, tmp);
44886 }
44887 }
44888
44889 /* Generate code to copy vector bits i / 2 ... i - 1 from vector SRC
44890 to bits 0 ... i / 2 - 1 of vector DEST, which has the same mode.
44891 The upper bits of DEST are undefined, though they shouldn't cause
44892 exceptions (some bits from src or all zeros are ok). */
44893
44894 static void
44895 emit_reduc_half (rtx dest, rtx src, int i)
44896 {
44897 rtx tem, d = dest;
44898 switch (GET_MODE (src))
44899 {
44900 case V4SFmode:
44901 if (i == 128)
44902 tem = gen_sse_movhlps (dest, src, src);
44903 else
44904 tem = gen_sse_shufps_v4sf (dest, src, src, const1_rtx, const1_rtx,
44905 GEN_INT (1 + 4), GEN_INT (1 + 4));
44906 break;
44907 case V2DFmode:
44908 tem = gen_vec_interleave_highv2df (dest, src, src);
44909 break;
44910 case V16QImode:
44911 case V8HImode:
44912 case V4SImode:
44913 case V2DImode:
44914 d = gen_reg_rtx (V1TImode);
44915 tem = gen_sse2_lshrv1ti3 (d, gen_lowpart (V1TImode, src),
44916 GEN_INT (i / 2));
44917 break;
44918 case V8SFmode:
44919 if (i == 256)
44920 tem = gen_avx_vperm2f128v8sf3 (dest, src, src, const1_rtx);
44921 else
44922 tem = gen_avx_shufps256 (dest, src, src,
44923 GEN_INT (i == 128 ? 2 + (3 << 2) : 1));
44924 break;
44925 case V4DFmode:
44926 if (i == 256)
44927 tem = gen_avx_vperm2f128v4df3 (dest, src, src, const1_rtx);
44928 else
44929 tem = gen_avx_shufpd256 (dest, src, src, const1_rtx);
44930 break;
44931 case V32QImode:
44932 case V16HImode:
44933 case V8SImode:
44934 case V4DImode:
44935 if (i == 256)
44936 {
44937 if (GET_MODE (dest) != V4DImode)
44938 d = gen_reg_rtx (V4DImode);
44939 tem = gen_avx2_permv2ti (d, gen_lowpart (V4DImode, src),
44940 gen_lowpart (V4DImode, src),
44941 const1_rtx);
44942 }
44943 else
44944 {
44945 d = gen_reg_rtx (V2TImode);
44946 tem = gen_avx2_lshrv2ti3 (d, gen_lowpart (V2TImode, src),
44947 GEN_INT (i / 2));
44948 }
44949 break;
44950 case V64QImode:
44951 case V32HImode:
44952 case V16SImode:
44953 case V16SFmode:
44954 case V8DImode:
44955 case V8DFmode:
44956 if (i > 128)
44957 tem = gen_avx512f_shuf_i32x4_1 (gen_lowpart (V16SImode, dest),
44958 gen_lowpart (V16SImode, src),
44959 gen_lowpart (V16SImode, src),
44960 GEN_INT (0x4 + (i == 512 ? 4 : 0)),
44961 GEN_INT (0x5 + (i == 512 ? 4 : 0)),
44962 GEN_INT (0x6 + (i == 512 ? 4 : 0)),
44963 GEN_INT (0x7 + (i == 512 ? 4 : 0)),
44964 GEN_INT (0xC), GEN_INT (0xD),
44965 GEN_INT (0xE), GEN_INT (0xF),
44966 GEN_INT (0x10), GEN_INT (0x11),
44967 GEN_INT (0x12), GEN_INT (0x13),
44968 GEN_INT (0x14), GEN_INT (0x15),
44969 GEN_INT (0x16), GEN_INT (0x17));
44970 else
44971 tem = gen_avx512f_pshufd_1 (gen_lowpart (V16SImode, dest),
44972 gen_lowpart (V16SImode, src),
44973 GEN_INT (i == 128 ? 0x2 : 0x1),
44974 GEN_INT (0x3),
44975 GEN_INT (0x3),
44976 GEN_INT (0x3),
44977 GEN_INT (i == 128 ? 0x6 : 0x5),
44978 GEN_INT (0x7),
44979 GEN_INT (0x7),
44980 GEN_INT (0x7),
44981 GEN_INT (i == 128 ? 0xA : 0x9),
44982 GEN_INT (0xB),
44983 GEN_INT (0xB),
44984 GEN_INT (0xB),
44985 GEN_INT (i == 128 ? 0xE : 0xD),
44986 GEN_INT (0xF),
44987 GEN_INT (0xF),
44988 GEN_INT (0xF));
44989 break;
44990 default:
44991 gcc_unreachable ();
44992 }
44993 emit_insn (tem);
44994 if (d != dest)
44995 emit_move_insn (dest, gen_lowpart (GET_MODE (dest), d));
44996 }
44997
44998 /* Expand a vector reduction. FN is the binary pattern to reduce;
44999 DEST is the destination; IN is the input vector. */
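   As an illustrative sketch only (not the generated RTL itself), a
   V4SFmode reduction with FN == max proceeds roughly as:

     step 1 (i == 128): half = { in[2], in[3], x, x };  t    = max (half, in)
     step 2 (i ==  64): half = { t[1],  x,     x, x };  dest = max (half, t)

   so the combined value ends up in element 0 of DEST while the remaining
   elements are unspecified.  */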
45000
45001 void
45002 ix86_expand_reduc (rtx (*fn) (rtx, rtx, rtx), rtx dest, rtx in)
45003 {
45004 rtx half, dst, vec = in;
45005 machine_mode mode = GET_MODE (in);
45006 int i;
45007
45008 /* SSE4 has a special instruction for V8HImode UMIN reduction. */
45009 if (TARGET_SSE4_1
45010 && mode == V8HImode
45011 && fn == gen_uminv8hi3)
45012 {
45013 emit_insn (gen_sse4_1_phminposuw (dest, in));
45014 return;
45015 }
45016
45017 for (i = GET_MODE_BITSIZE (mode);
45018 i > GET_MODE_UNIT_BITSIZE (mode);
45019 i >>= 1)
45020 {
45021 half = gen_reg_rtx (mode);
45022 emit_reduc_half (half, vec, i);
45023 if (i == GET_MODE_UNIT_BITSIZE (mode) * 2)
45024 dst = dest;
45025 else
45026 dst = gen_reg_rtx (mode);
45027 emit_insn (fn (dst, half, vec));
45028 vec = dst;
45029 }
45030 }
45031 \f
45032 /* Target hook for scalar_mode_supported_p. */
45033 static bool
45034 ix86_scalar_mode_supported_p (machine_mode mode)
45035 {
45036 if (DECIMAL_FLOAT_MODE_P (mode))
45037 return default_decimal_float_supported_p ();
45038 else if (mode == TFmode)
45039 return true;
45040 else
45041 return default_scalar_mode_supported_p (mode);
45042 }
45043
45044 /* Implements target hook vector_mode_supported_p. */
45045 static bool
45046 ix86_vector_mode_supported_p (machine_mode mode)
45047 {
45048 if (TARGET_SSE && VALID_SSE_REG_MODE (mode))
45049 return true;
45050 if (TARGET_SSE2 && VALID_SSE2_REG_MODE (mode))
45051 return true;
45052 if (TARGET_AVX && VALID_AVX256_REG_MODE (mode))
45053 return true;
45054 if (TARGET_AVX512F && VALID_AVX512F_REG_MODE (mode))
45055 return true;
45056 if (TARGET_MMX && VALID_MMX_REG_MODE (mode))
45057 return true;
45058 if (TARGET_3DNOW && VALID_MMX_REG_MODE_3DNOW (mode))
45059 return true;
45060 return false;
45061 }
45062
45063 /* Target hook for c_mode_for_suffix. */
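/* On x86 the 'q'/'Q' literal suffix selects __float128 (TFmode) and the
   'w'/'W' suffix selects __float80 (XFmode), which is what the mapping
   below implements.  */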
45064 static machine_mode
45065 ix86_c_mode_for_suffix (char suffix)
45066 {
45067 if (suffix == 'q')
45068 return TFmode;
45069 if (suffix == 'w')
45070 return XFmode;
45071
45072 return VOIDmode;
45073 }
45074
45075 /* Worker function for TARGET_MD_ASM_ADJUST.
45076
45077 We implement asm flag outputs, and maintain source compatibility
45078 with the old cc0-based compiler. */
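/* For illustration only (hypothetical user code, not part of this file):

     int eq;
     asm ("cmpl %1, %2" : "=@ccz" (eq) : "r" (a), "r" (b));

   stores 1 in EQ when the comparison sets the zero flag and 0 otherwise;
   the loop below rewrites each such "=@cc<cond>" output into a use of the
   flags register plus the extraction code emitted here.  */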
45079
45080 static rtx_insn *
45081 ix86_md_asm_adjust (vec<rtx> &outputs, vec<rtx> &/*inputs*/,
45082 vec<const char *> &constraints,
45083 vec<rtx> &clobbers, HARD_REG_SET &clobbered_regs)
45084 {
45085 clobbers.safe_push (gen_rtx_REG (CCFPmode, FPSR_REG));
45086 SET_HARD_REG_BIT (clobbered_regs, FPSR_REG);
45087
45088 bool saw_asm_flag = false;
45089
45090 start_sequence ();
45091 for (unsigned i = 0, n = outputs.length (); i < n; ++i)
45092 {
45093 const char *con = constraints[i];
45094 if (strncmp (con, "=@cc", 4) != 0)
45095 continue;
45096 con += 4;
45097 if (strchr (con, ',') != NULL)
45098 {
45099 error ("alternatives not allowed in asm flag output");
45100 continue;
45101 }
45102
45103 bool invert = false;
45104 if (con[0] == 'n')
45105 invert = true, con++;
45106
45107 machine_mode mode = CCmode;
45108 rtx_code code = UNKNOWN;
45109
45110 switch (con[0])
45111 {
45112 case 'a':
45113 if (con[1] == 0)
45114 mode = CCAmode, code = EQ;
45115 else if (con[1] == 'e' && con[2] == 0)
45116 mode = CCCmode, code = NE;
45117 break;
45118 case 'b':
45119 if (con[1] == 0)
45120 mode = CCCmode, code = EQ;
45121 else if (con[1] == 'e' && con[2] == 0)
45122 mode = CCAmode, code = NE;
45123 break;
45124 case 'c':
45125 if (con[1] == 0)
45126 mode = CCCmode, code = EQ;
45127 break;
45128 case 'e':
45129 if (con[1] == 0)
45130 mode = CCZmode, code = EQ;
45131 break;
45132 case 'g':
45133 if (con[1] == 0)
45134 mode = CCGCmode, code = GT;
45135 else if (con[1] == 'e' && con[2] == 0)
45136 mode = CCGCmode, code = GE;
45137 break;
45138 case 'l':
45139 if (con[1] == 0)
45140 mode = CCGCmode, code = LT;
45141 else if (con[1] == 'e' && con[2] == 0)
45142 mode = CCGCmode, code = LE;
45143 break;
45144 case 'o':
45145 if (con[1] == 0)
45146 mode = CCOmode, code = EQ;
45147 break;
45148 case 'p':
45149 if (con[1] == 0)
45150 mode = CCPmode, code = EQ;
45151 break;
45152 case 's':
45153 if (con[1] == 0)
45154 mode = CCSmode, code = EQ;
45155 break;
45156 case 'z':
45157 if (con[1] == 0)
45158 mode = CCZmode, code = EQ;
45159 break;
45160 }
45161 if (code == UNKNOWN)
45162 {
45163 error ("unknown asm flag output %qs", constraints[i]);
45164 continue;
45165 }
45166 if (invert)
45167 code = reverse_condition (code);
45168
45169 rtx dest = outputs[i];
45170 if (!saw_asm_flag)
45171 {
45172 /* This is the first asm flag output. Here we put the flags
45173 register in as the real output and adjust the condition to
45174 allow it. */
45175 constraints[i] = "=Bf";
45176 outputs[i] = gen_rtx_REG (CCmode, FLAGS_REG);
45177 saw_asm_flag = true;
45178 }
45179 else
45180 {
45181 /* We don't need the flags register as output twice. */
45182 constraints[i] = "=X";
45183 outputs[i] = gen_rtx_SCRATCH (SImode);
45184 }
45185
45186 rtx x = gen_rtx_REG (mode, FLAGS_REG);
45187 x = gen_rtx_fmt_ee (code, QImode, x, const0_rtx);
45188
45189 machine_mode dest_mode = GET_MODE (dest);
45190 if (!SCALAR_INT_MODE_P (dest_mode))
45191 {
45192 error ("invalid type for asm flag output");
45193 continue;
45194 }
45195
45196 if (dest_mode == DImode && !TARGET_64BIT)
45197 dest_mode = SImode;
45198
45199 if (dest_mode != QImode)
45200 {
45201 rtx destqi = gen_reg_rtx (QImode);
45202 emit_insn (gen_rtx_SET (destqi, x));
45203
45204 if (TARGET_ZERO_EXTEND_WITH_AND
45205 && optimize_function_for_speed_p (cfun))
45206 {
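	      /* On targets where zero extension is normally done with AND
		 (TARGET_ZERO_EXTEND_WITH_AND) and we are optimizing for
		 speed, materialize a zero in a full-width register and
		 insert the flag byte with a strict low-part move rather
		 than emitting a zero_extend.  */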
45207 x = force_reg (dest_mode, const0_rtx);
45208
45209 emit_insn (gen_movstrictqi
45210 (gen_lowpart (QImode, x), destqi));
45211 }
45212 else
45213 x = gen_rtx_ZERO_EXTEND (dest_mode, destqi);
45214 }
45215
45216 if (dest_mode != GET_MODE (dest))
45217 {
45218 rtx tmp = gen_reg_rtx (SImode);
45219
45220 emit_insn (gen_rtx_SET (tmp, x));
45221 emit_insn (gen_zero_extendsidi2 (dest, tmp));
45222 }
45223 else
45224 emit_insn (gen_rtx_SET (dest, x));
45225 }
45226 rtx_insn *seq = get_insns ();
45227 end_sequence ();
45228
45229 if (saw_asm_flag)
45230 return seq;
45231 else
45232 {
45233 /* If we had no asm flag outputs, clobber the flags. */
45234 clobbers.safe_push (gen_rtx_REG (CCmode, FLAGS_REG));
45235 SET_HARD_REG_BIT (clobbered_regs, FLAGS_REG);
45236 return NULL;
45237 }
45238 }
45239
45240 /* Implement the target hook TARGET_ENCODE_SECTION_INFO. */
45241
45242 static void ATTRIBUTE_UNUSED
45243 ix86_encode_section_info (tree decl, rtx rtl, int first)
45244 {
45245 default_encode_section_info (decl, rtl, first);
45246
45247 if (ix86_in_large_data_p (decl))
45248 SYMBOL_REF_FLAGS (XEXP (rtl, 0)) |= SYMBOL_FLAG_FAR_ADDR;
45249 }
45250
45251 /* Worker function for REVERSE_CONDITION. */
45252
45253 enum rtx_code
45254 ix86_reverse_condition (enum rtx_code code, machine_mode mode)
45255 {
45256 return (mode != CCFPmode && mode != CCFPUmode
45257 ? reverse_condition (code)
45258 : reverse_condition_maybe_unordered (code));
45259 }
45260
45261 /* Output code to perform an x87 FP register move, from OPERANDS[1]
45262 to OPERANDS[0]. */
45263
45264 const char *
45265 output_387_reg_move (rtx_insn *insn, rtx *operands)
45266 {
45267 if (REG_P (operands[0]))
45268 {
45269 if (REG_P (operands[1])
45270 && find_regno_note (insn, REG_DEAD, REGNO (operands[1])))
45271 {
45272 if (REGNO (operands[0]) == FIRST_STACK_REG)
45273 return output_387_ffreep (operands, 0);
45274 return "fstp\t%y0";
45275 }
45276 if (STACK_TOP_P (operands[0]))
45277 return "fld%Z1\t%y1";
45278 return "fst\t%y0";
45279 }
45280 else if (MEM_P (operands[0]))
45281 {
45282 gcc_assert (REG_P (operands[1]));
45283 if (find_regno_note (insn, REG_DEAD, REGNO (operands[1])))
45284 return "fstp%Z0\t%y0";
45285 else
45286 {
45287 /* There is no non-popping store to memory for XFmode.
45288 So if we need one, follow the store with a load. */
45289 if (GET_MODE (operands[0]) == XFmode)
45290 return "fstp%Z0\t%y0\n\tfld%Z0\t%y0";
45291 else
45292 return "fst%Z0\t%y0";
45293 }
45294 }
45295 else
45296 gcc_unreachable();
45297 }
45298
45299 /* Output code to perform a conditional jump to LABEL if the C2 flag
45300    in the FP status register is set. */
45301
45302 void
45303 ix86_emit_fp_unordered_jump (rtx label)
45304 {
45305 rtx reg = gen_reg_rtx (HImode);
45306 rtx temp;
45307
45308 emit_insn (gen_x86_fnstsw_1 (reg));
45309
45310 if (TARGET_SAHF && (TARGET_USE_SAHF || optimize_insn_for_size_p ()))
45311 {
45312 emit_insn (gen_x86_sahf_1 (reg));
45313
45314 temp = gen_rtx_REG (CCmode, FLAGS_REG);
45315 temp = gen_rtx_UNORDERED (VOIDmode, temp, const0_rtx);
45316 }
45317 else
45318 {
45319 emit_insn (gen_testqi_ext_1_ccno (reg, GEN_INT (0x04)));
45320
45321 temp = gen_rtx_REG (CCNOmode, FLAGS_REG);
45322 temp = gen_rtx_NE (VOIDmode, temp, const0_rtx);
45323 }
45324
45325 temp = gen_rtx_IF_THEN_ELSE (VOIDmode, temp,
45326 gen_rtx_LABEL_REF (VOIDmode, label),
45327 pc_rtx);
45328 temp = gen_rtx_SET (pc_rtx, temp);
45329
45330 emit_jump_insn (temp);
45331 predict_jump (REG_BR_PROB_BASE * 10 / 100);
45332 }
45333
45334 /* Output code to perform a log1p XFmode calculation. */
45335
45336 void ix86_emit_i387_log1p (rtx op0, rtx op1)
45337 {
45338 rtx_code_label *label1 = gen_label_rtx ();
45339 rtx_code_label *label2 = gen_label_rtx ();
45340
45341 rtx tmp = gen_reg_rtx (XFmode);
45342 rtx tmp2 = gen_reg_rtx (XFmode);
45343 rtx test;
45344
45345 emit_insn (gen_absxf2 (tmp, op1));
45346 test = gen_rtx_GE (VOIDmode, tmp,
45347 const_double_from_real_value (
45348 REAL_VALUE_ATOF ("0.29289321881345247561810596348408353", XFmode),
45349 XFmode));
45350 emit_jump_insn (gen_cbranchxf4 (test, XEXP (test, 0), XEXP (test, 1), label1));
45351
45352 emit_move_insn (tmp2, standard_80387_constant_rtx (4)); /* fldln2 */
45353 emit_insn (gen_fyl2xp1xf3_i387 (op0, op1, tmp2));
45354 emit_jump (label2);
45355
45356 emit_label (label1);
45357 emit_move_insn (tmp, CONST1_RTX (XFmode));
45358 emit_insn (gen_addxf3 (tmp, op1, tmp));
45359 emit_move_insn (tmp2, standard_80387_constant_rtx (4)); /* fldln2 */
45360 emit_insn (gen_fyl2xxf3_i387 (op0, tmp, tmp2));
45361
45362 emit_label (label2);
45363 }
45364
45365 /* Emit i387 code to compute round (OP1) into OP0, rounding halfway cases away from zero. */
45366 void ix86_emit_i387_round (rtx op0, rtx op1)
45367 {
45368 machine_mode inmode = GET_MODE (op1);
45369 machine_mode outmode = GET_MODE (op0);
45370 rtx e1, e2, res, tmp, tmp1, half;
45371 rtx scratch = gen_reg_rtx (HImode);
45372 rtx flags = gen_rtx_REG (CCNOmode, FLAGS_REG);
45373 rtx_code_label *jump_label = gen_label_rtx ();
45374 rtx insn;
45375 rtx (*gen_abs) (rtx, rtx);
45376 rtx (*gen_neg) (rtx, rtx);
45377
45378 switch (inmode)
45379 {
45380 case SFmode:
45381 gen_abs = gen_abssf2;
45382 break;
45383 case DFmode:
45384 gen_abs = gen_absdf2;
45385 break;
45386 case XFmode:
45387 gen_abs = gen_absxf2;
45388 break;
45389 default:
45390 gcc_unreachable ();
45391 }
45392
45393 switch (outmode)
45394 {
45395 case SFmode:
45396 gen_neg = gen_negsf2;
45397 break;
45398 case DFmode:
45399 gen_neg = gen_negdf2;
45400 break;
45401 case XFmode:
45402 gen_neg = gen_negxf2;
45403 break;
45404 case HImode:
45405 gen_neg = gen_neghi2;
45406 break;
45407 case SImode:
45408 gen_neg = gen_negsi2;
45409 break;
45410 case DImode:
45411 gen_neg = gen_negdi2;
45412 break;
45413 default:
45414 gcc_unreachable ();
45415 }
45416
45417 e1 = gen_reg_rtx (inmode);
45418 e2 = gen_reg_rtx (inmode);
45419 res = gen_reg_rtx (outmode);
45420
45421 half = const_double_from_real_value (dconsthalf, inmode);
45422
45423 /* round(a) = sgn(a) * floor(fabs(a) + 0.5) */
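  /* For example, round (-2.5) = -floor (fabs (-2.5) + 0.5) = -floor (3.0)
     = -3, i.e. halfway cases are rounded away from zero.  */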
45424
45425 /* scratch = fxam(op1) */
45426 emit_insn (gen_rtx_SET (scratch,
45427 gen_rtx_UNSPEC (HImode, gen_rtvec (1, op1),
45428 UNSPEC_FXAM)));
45429 /* e1 = fabs(op1) */
45430 emit_insn (gen_abs (e1, op1));
45431
45432 /* e2 = e1 + 0.5 */
45433 half = force_reg (inmode, half);
45434 emit_insn (gen_rtx_SET (e2, gen_rtx_PLUS (inmode, e1, half)));
45435
45436 /* res = floor(e2) */
45437 if (inmode != XFmode)
45438 {
45439 tmp1 = gen_reg_rtx (XFmode);
45440
45441 emit_insn (gen_rtx_SET (tmp1, gen_rtx_FLOAT_EXTEND (XFmode, e2)));
45442 }
45443 else
45444 tmp1 = e2;
45445
45446 switch (outmode)
45447 {
45448 case SFmode:
45449 case DFmode:
45450 {
45451 rtx tmp0 = gen_reg_rtx (XFmode);
45452
45453 emit_insn (gen_frndintxf2_floor (tmp0, tmp1));
45454
45455 emit_insn (gen_rtx_SET (res,
45456 gen_rtx_UNSPEC (outmode, gen_rtvec (1, tmp0),
45457 UNSPEC_TRUNC_NOOP)));
45458 }
45459 break;
45460 case XFmode:
45461 emit_insn (gen_frndintxf2_floor (res, tmp1));
45462 break;
45463 case HImode:
45464 emit_insn (gen_lfloorxfhi2 (res, tmp1));
45465 break;
45466 case SImode:
45467 emit_insn (gen_lfloorxfsi2 (res, tmp1));
45468 break;
45469 case DImode:
45470 emit_insn (gen_lfloorxfdi2 (res, tmp1));
45471 break;
45472 default:
45473 gcc_unreachable ();
45474 }
45475
45476 /* flags = signbit(a) */
45477 emit_insn (gen_testqi_ext_1_ccno (scratch, GEN_INT (0x02)));
45478
45479 /* if (flags) then res = -res */
45480 tmp = gen_rtx_IF_THEN_ELSE (VOIDmode,
45481 gen_rtx_EQ (VOIDmode, flags, const0_rtx),
45482 gen_rtx_LABEL_REF (VOIDmode, jump_label),
45483 pc_rtx);
45484 insn = emit_jump_insn (gen_rtx_SET (pc_rtx, tmp));
45485 predict_jump (REG_BR_PROB_BASE * 50 / 100);
45486 JUMP_LABEL (insn) = jump_label;
45487
45488 emit_insn (gen_neg (res, res));
45489
45490 emit_label (jump_label);
45491 LABEL_NUSES (jump_label) = 1;
45492
45493 emit_move_insn (op0, res);
45494 }
45495
45496 /* Output code to perform a Newton-Raphson approximation of a single precision
45497 floating point divide [http://en.wikipedia.org/wiki/N-th_root_algorithm]. */
45498
45499 void ix86_emit_swdivsf (rtx res, rtx a, rtx b, machine_mode mode)
45500 {
45501 rtx x0, x1, e0, e1;
45502
45503 x0 = gen_reg_rtx (mode);
45504 e0 = gen_reg_rtx (mode);
45505 e1 = gen_reg_rtx (mode);
45506 x1 = gen_reg_rtx (mode);
45507
45508 /* a / b = a * ((rcp(b) + rcp(b)) - (b * rcp(b) * rcp (b))) */
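  /* This is one Newton-Raphson step for 1/b: starting from the estimate
     x0 ~= rcp(b), the refined value is x1 = x0 * (2 - b * x0)
     = (x0 + x0) - (b * x0 * x0), which is the expression above.  */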
45509
45510 b = force_reg (mode, b);
45511
45512 /* x0 = rcp(b) estimate */
45513 if (mode == V16SFmode || mode == V8DFmode)
45514 {
45515 if (TARGET_AVX512ER)
45516 {
45517 emit_insn (gen_rtx_SET (x0, gen_rtx_UNSPEC (mode, gen_rtvec (1, b),
45518 UNSPEC_RCP28)));
45519 /* res = a * x0 */
45520 emit_insn (gen_rtx_SET (res, gen_rtx_MULT (mode, a, x0)));
45521 return;
45522 }
45523 else
45524 emit_insn (gen_rtx_SET (x0, gen_rtx_UNSPEC (mode, gen_rtvec (1, b),
45525 UNSPEC_RCP14)));
45526 }
45527 else
45528 emit_insn (gen_rtx_SET (x0, gen_rtx_UNSPEC (mode, gen_rtvec (1, b),
45529 UNSPEC_RCP)));
45530
45531 /* e0 = x0 * b */
45532 emit_insn (gen_rtx_SET (e0, gen_rtx_MULT (mode, x0, b)));
45533
45534 /* e0 = x0 * e0 */
45535 emit_insn (gen_rtx_SET (e0, gen_rtx_MULT (mode, x0, e0)));
45536
45537 /* e1 = x0 + x0 */
45538 emit_insn (gen_rtx_SET (e1, gen_rtx_PLUS (mode, x0, x0)));
45539
45540 /* x1 = e1 - e0 */
45541 emit_insn (gen_rtx_SET (x1, gen_rtx_MINUS (mode, e1, e0)));
45542
45543 /* res = a * x1 */
45544 emit_insn (gen_rtx_SET (res, gen_rtx_MULT (mode, a, x1)));
45545 }
45546
45547 /* Output code to perform a Newton-Raphson approximation of a
45548 single precision floating point [reciprocal] square root. */
45549
45550 void ix86_emit_swsqrtsf (rtx res, rtx a, machine_mode mode, bool recip)
45551 {
45552 rtx x0, e0, e1, e2, e3, mthree, mhalf;
45553 REAL_VALUE_TYPE r;
45554 int unspec;
45555
45556 x0 = gen_reg_rtx (mode);
45557 e0 = gen_reg_rtx (mode);
45558 e1 = gen_reg_rtx (mode);
45559 e2 = gen_reg_rtx (mode);
45560 e3 = gen_reg_rtx (mode);
45561
45562 if (TARGET_AVX512ER && mode == V16SFmode)
45563 {
45564 if (recip)
45565 /* res = rsqrt28(a) estimate */
45566 emit_insn (gen_rtx_SET (res, gen_rtx_UNSPEC (mode, gen_rtvec (1, a),
45567 UNSPEC_RSQRT28)));
45568 else
45569 {
45570 /* x0 = rsqrt28(a) estimate */
45571 emit_insn (gen_rtx_SET (x0, gen_rtx_UNSPEC (mode, gen_rtvec (1, a),
45572 UNSPEC_RSQRT28)));
45573 /* res = rcp28(x0) estimate */
45574 emit_insn (gen_rtx_SET (res, gen_rtx_UNSPEC (mode, gen_rtvec (1, x0),
45575 UNSPEC_RCP28)));
45576 }
45577 return;
45578 }
45579
45580 real_from_integer (&r, VOIDmode, -3, SIGNED);
45581 mthree = const_double_from_real_value (r, SFmode);
45582
45583 real_arithmetic (&r, NEGATE_EXPR, &dconsthalf, NULL);
45584 mhalf = const_double_from_real_value (r, SFmode);
45585 unspec = UNSPEC_RSQRT;
45586
45587 if (VECTOR_MODE_P (mode))
45588 {
45589 mthree = ix86_build_const_vector (mode, true, mthree);
45590 mhalf = ix86_build_const_vector (mode, true, mhalf);
45591 /* There is no 512-bit rsqrt, but there is rsqrt14. */
45592 if (GET_MODE_SIZE (mode) == 64)
45593 unspec = UNSPEC_RSQRT14;
45594 }
45595
45596 /* sqrt(a) = -0.5 * a * rsqrtss(a) * (a * rsqrtss(a) * rsqrtss(a) - 3.0)
45597 rsqrt(a) = -0.5 * rsqrtss(a) * (a * rsqrtss(a) * rsqrtss(a) - 3.0) */
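  /* These follow from one Newton-Raphson step for 1/sqrt(a):
     x1 = 0.5 * x0 * (3 - a * x0 * x0) = -0.5 * x0 * (a * x0 * x0 - 3),
     and sqrt(a) = a * rsqrt(a), which folds the extra factor of a into
     the e0 term below.  */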
45598
45599 a = force_reg (mode, a);
45600
45601 /* x0 = rsqrt(a) estimate */
45602 emit_insn (gen_rtx_SET (x0, gen_rtx_UNSPEC (mode, gen_rtvec (1, a),
45603 unspec)));
45604
45605 /* If a == 0.0, filter out the infinite rsqrt estimate so that sqrt (0.0) does not produce a NaN. */
45606 if (!recip)
45607 {
45608 rtx zero = force_reg (mode, CONST0_RTX(mode));
45609 rtx mask;
45610
45611 /* Handle masked compare. */
45612 if (VECTOR_MODE_P (mode) && GET_MODE_SIZE (mode) == 64)
45613 {
45614 mask = gen_reg_rtx (HImode);
45615 /* Imm value 0x4 corresponds to not-equal comparison. */
45616 emit_insn (gen_avx512f_cmpv16sf3 (mask, zero, a, GEN_INT (0x4)));
45617 emit_insn (gen_avx512f_blendmv16sf (x0, zero, x0, mask));
45618 }
45619 else
45620 {
45621 mask = gen_reg_rtx (mode);
45622 emit_insn (gen_rtx_SET (mask, gen_rtx_NE (mode, zero, a)));
45623 emit_insn (gen_rtx_SET (x0, gen_rtx_AND (mode, x0, mask)));
45624 }
45625 }
45626
45627 /* e0 = x0 * a */
45628 emit_insn (gen_rtx_SET (e0, gen_rtx_MULT (mode, x0, a)));
45629 /* e1 = e0 * x0 */
45630 emit_insn (gen_rtx_SET (e1, gen_rtx_MULT (mode, e0, x0)));
45631
45632 /* e2 = e1 - 3. */
45633 mthree = force_reg (mode, mthree);
45634 emit_insn (gen_rtx_SET (e2, gen_rtx_PLUS (mode, e1, mthree)));
45635
45636 mhalf = force_reg (mode, mhalf);
45637 if (recip)
45638 /* e3 = -.5 * x0 */
45639 emit_insn (gen_rtx_SET (e3, gen_rtx_MULT (mode, x0, mhalf)));
45640 else
45641 /* e3 = -.5 * e0 */
45642 emit_insn (gen_rtx_SET (e3, gen_rtx_MULT (mode, e0, mhalf)));
45643 /* ret = e2 * e3 */
45644 emit_insn (gen_rtx_SET (res, gen_rtx_MULT (mode, e2, e3)));
45645 }
45646
45647 #ifdef TARGET_SOLARIS
45648 /* Solaris implementation of TARGET_ASM_NAMED_SECTION. */
45649
45650 static void
45651 i386_solaris_elf_named_section (const char *name, unsigned int flags,
45652 tree decl)
45653 {
45654 /* With Binutils 2.15, the "@unwind" marker must be specified on
45655 every occurrence of the ".eh_frame" section, not just the first
45656 one. */
45657 if (TARGET_64BIT
45658 && strcmp (name, ".eh_frame") == 0)
45659 {
45660 fprintf (asm_out_file, "\t.section\t%s,\"%s\",@unwind\n", name,
45661 flags & SECTION_WRITE ? "aw" : "a");
45662 return;
45663 }
45664
45665 #ifndef USE_GAS
45666 if (HAVE_COMDAT_GROUP && flags & SECTION_LINKONCE)
45667 {
45668 solaris_elf_asm_comdat_section (name, flags, decl);
45669 return;
45670 }
45671 #endif
45672
45673 default_elf_asm_named_section (name, flags, decl);
45674 }
45675 #endif /* TARGET_SOLARIS */
45676
45677 /* Return the mangling of TYPE if it is an extended fundamental type. */
45678
45679 static const char *
45680 ix86_mangle_type (const_tree type)
45681 {
45682 type = TYPE_MAIN_VARIANT (type);
45683
45684 if (TREE_CODE (type) != VOID_TYPE && TREE_CODE (type) != BOOLEAN_TYPE
45685 && TREE_CODE (type) != INTEGER_TYPE && TREE_CODE (type) != REAL_TYPE)
45686 return NULL;
45687
45688 switch (TYPE_MODE (type))
45689 {
45690 case TFmode:
45691 /* __float128 is "g". */
45692 return "g";
45693 case XFmode:
45694 /* "long double" or __float80 is "e". */
45695 return "e";
45696 default:
45697 return NULL;
45698 }
45699 }
45700
45701 #ifdef TARGET_THREAD_SSP_OFFSET
45702 /* If using TLS guards, don't waste time creating and expanding
45703 __stack_chk_guard decl and MEM as we are going to ignore it. */
45704 static tree
45705 ix86_stack_protect_guard (void)
45706 {
45707 if (TARGET_SSP_TLS_GUARD)
45708 return NULL_TREE;
45709 return default_stack_protect_guard ();
45710 }
45711 #endif
45712
45713 /* For 32-bit code we can save PIC register setup by using
45714 __stack_chk_fail_local hidden function instead of calling
45715 __stack_chk_fail directly. 64-bit code doesn't need to set up any PIC
45716 register, so it is better to call __stack_chk_fail directly. */
45717
45718 static tree ATTRIBUTE_UNUSED
45719 ix86_stack_protect_fail (void)
45720 {
45721 return TARGET_64BIT
45722 ? default_external_stack_protect_fail ()
45723 : default_hidden_stack_protect_fail ();
45724 }
45725
45726 /* Select a format to encode pointers in exception handling data. CODE
45727 is 0 for data, 1 for code labels, 2 for function pointers. GLOBAL is
45728 true if the symbol may be affected by dynamic relocations.
45729
45730 ??? All x86 object file formats are capable of representing this.
45731 After all, the relocation needed is the same as for the call insn.
45732 Whether or not a particular assembler allows us to enter such, I
45733 guess we'll have to see. */
45734 int
45735 asm_preferred_eh_data_format (int code, int global)
45736 {
45737 if (flag_pic)
45738 {
45739 int type = DW_EH_PE_sdata8;
45740 if (!TARGET_64BIT
45741 || ix86_cmodel == CM_SMALL_PIC
45742 || (ix86_cmodel == CM_MEDIUM_PIC && (global || code)))
45743 type = DW_EH_PE_sdata4;
45744 return (global ? DW_EH_PE_indirect : 0) | DW_EH_PE_pcrel | type;
45745 }
45746 if (ix86_cmodel == CM_SMALL
45747 || (ix86_cmodel == CM_MEDIUM && code))
45748 return DW_EH_PE_udata4;
45749 return DW_EH_PE_absptr;
45750 }
45751 \f
45752 /* Expand a copysign operation: copy the sign of SIGN onto the positive
45753    value ABS_VALUE, storing the result in RESULT.  If MASK is non-null,
45754    it is a mask that selects the sign bit. */
45755 static void
45756 ix86_sse_copysign_to_positive (rtx result, rtx abs_value, rtx sign, rtx mask)
45757 {
45758 machine_mode mode = GET_MODE (sign);
45759 rtx sgn = gen_reg_rtx (mode);
45760 if (mask == NULL_RTX)
45761 {
45762 machine_mode vmode;
45763
45764 if (mode == SFmode)
45765 vmode = V4SFmode;
45766 else if (mode == DFmode)
45767 vmode = V2DFmode;
45768 else
45769 vmode = mode;
45770
45771 mask = ix86_build_signbit_mask (vmode, VECTOR_MODE_P (mode), false);
45772 if (!VECTOR_MODE_P (mode))
45773 {
45774 /* We need to generate a scalar mode mask in this case. */
45775 rtx tmp = gen_rtx_PARALLEL (VOIDmode, gen_rtvec (1, const0_rtx));
45776 tmp = gen_rtx_VEC_SELECT (mode, mask, tmp);
45777 mask = gen_reg_rtx (mode);
45778 emit_insn (gen_rtx_SET (mask, tmp));
45779 }
45780 }
45781 else
45782 mask = gen_rtx_NOT (mode, mask);
45783 emit_insn (gen_rtx_SET (sgn, gen_rtx_AND (mode, mask, sign)));
45784 emit_insn (gen_rtx_SET (result, gen_rtx_IOR (mode, abs_value, sgn)));
45785 }
45786
45787 /* Expand fabs (OP0) and return a new rtx that holds the result. The
45788 mask for masking out the sign-bit is stored in *SMASK, if that is
45789 non-null. */
45790 static rtx
45791 ix86_expand_sse_fabs (rtx op0, rtx *smask)
45792 {
45793 machine_mode vmode, mode = GET_MODE (op0);
45794 rtx xa, mask;
45795
45796 xa = gen_reg_rtx (mode);
45797 if (mode == SFmode)
45798 vmode = V4SFmode;
45799 else if (mode == DFmode)
45800 vmode = V2DFmode;
45801 else
45802 vmode = mode;
45803 mask = ix86_build_signbit_mask (vmode, VECTOR_MODE_P (mode), true);
45804 if (!VECTOR_MODE_P (mode))
45805 {
45806 /* We need to generate a scalar mode mask in this case. */
45807 rtx tmp = gen_rtx_PARALLEL (VOIDmode, gen_rtvec (1, const0_rtx));
45808 tmp = gen_rtx_VEC_SELECT (mode, mask, tmp);
45809 mask = gen_reg_rtx (mode);
45810 emit_insn (gen_rtx_SET (mask, tmp));
45811 }
45812 emit_insn (gen_rtx_SET (xa, gen_rtx_AND (mode, op0, mask)));
45813
45814 if (smask)
45815 *smask = mask;
45816
45817 return xa;
45818 }
45819
45820 /* Expands a comparison of OP0 with OP1 using comparison code CODE,
45821 swapping the operands if SWAP_OPERANDS is true. The expanded
45822 code is a forward jump to a newly created label in case the
45823 comparison is true. The generated label rtx is returned. */
45824 static rtx_code_label *
45825 ix86_expand_sse_compare_and_jump (enum rtx_code code, rtx op0, rtx op1,
45826 bool swap_operands)
45827 {
45828 machine_mode fpcmp_mode = ix86_fp_compare_mode (code);
45829 rtx_code_label *label;
45830 rtx tmp;
45831
45832 if (swap_operands)
45833 std::swap (op0, op1);
45834
45835 label = gen_label_rtx ();
45836 tmp = gen_rtx_REG (fpcmp_mode, FLAGS_REG);
45837 emit_insn (gen_rtx_SET (tmp, gen_rtx_COMPARE (fpcmp_mode, op0, op1)));
45838 tmp = gen_rtx_fmt_ee (code, VOIDmode, tmp, const0_rtx);
45839 tmp = gen_rtx_IF_THEN_ELSE (VOIDmode, tmp,
45840 gen_rtx_LABEL_REF (VOIDmode, label), pc_rtx);
45841 tmp = emit_jump_insn (gen_rtx_SET (pc_rtx, tmp));
45842 JUMP_LABEL (tmp) = label;
45843
45844 return label;
45845 }
45846
45847 /* Expand a mask generating SSE comparison instruction comparing OP0 with OP1
45848 using comparison code CODE. Operands are swapped for the comparison if
45849 SWAP_OPERANDS is true. Returns a rtx for the generated mask. */
45850 static rtx
45851 ix86_expand_sse_compare_mask (enum rtx_code code, rtx op0, rtx op1,
45852 bool swap_operands)
45853 {
45854 rtx (*insn)(rtx, rtx, rtx, rtx);
45855 machine_mode mode = GET_MODE (op0);
45856 rtx mask = gen_reg_rtx (mode);
45857
45858 if (swap_operands)
45859 std::swap (op0, op1);
45860
45861 insn = mode == DFmode ? gen_setcc_df_sse : gen_setcc_sf_sse;
45862
45863 emit_insn (insn (mask, op0, op1,
45864 gen_rtx_fmt_ee (code, mode, op0, op1)));
45865 return mask;
45866 }
45867
45868 /* Generate and return a rtx of mode MODE for 2**n where n is the number
45869 of bits of the mantissa of MODE, which must be one of DFmode or SFmode. */
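/* For a nonnegative value X < 2**n, computing X + 2**n forces the
   fractional bits out of the mantissa, so (X + 2**n) - 2**n yields X
   rounded to an integer under the current (round-to-nearest) rounding
   mode.  The expanders below rely on this.  */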
45870 static rtx
45871 ix86_gen_TWO52 (machine_mode mode)
45872 {
45873 REAL_VALUE_TYPE TWO52r;
45874 rtx TWO52;
45875
45876 real_ldexp (&TWO52r, &dconst1, mode == DFmode ? 52 : 23);
45877 TWO52 = const_double_from_real_value (TWO52r, mode);
45878 TWO52 = force_reg (mode, TWO52);
45879
45880 return TWO52;
45881 }
45882
45883 /* Expand SSE sequence for computing lround from OP1 storing
45884 into OP0. */
45885 void
45886 ix86_expand_lround (rtx op0, rtx op1)
45887 {
45888 /* C code for the stuff we're doing below:
45889 tmp = op1 + copysign (nextafter (0.5, 0.0), op1)
45890 return (long)tmp;
45891 */
45892 machine_mode mode = GET_MODE (op1);
45893 const struct real_format *fmt;
45894 REAL_VALUE_TYPE pred_half, half_minus_pred_half;
45895 rtx adj;
45896
45897 /* load nextafter (0.5, 0.0) */
45898 fmt = REAL_MODE_FORMAT (mode);
45899 real_2expN (&half_minus_pred_half, -(fmt->p) - 1, mode);
45900 real_arithmetic (&pred_half, MINUS_EXPR, &dconsthalf, &half_minus_pred_half);
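  /* PRED_HALF is the largest value representable in MODE that is strictly
     below 0.5.  Using it instead of 0.5 keeps arguments just below 0.5
     (e.g. the immediate predecessor of 0.5) from being rounded up to 1 by
     the addition below.  */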
45901
45902 /* adj = copysign (0.5, op1) */
45903 adj = force_reg (mode, const_double_from_real_value (pred_half, mode));
45904 ix86_sse_copysign_to_positive (adj, adj, force_reg (mode, op1), NULL_RTX);
45905
45906 /* adj = op1 + adj */
45907 adj = expand_simple_binop (mode, PLUS, adj, op1, NULL_RTX, 0, OPTAB_DIRECT);
45908
45909 /* op0 = (imode)adj */
45910 expand_fix (op0, adj, 0);
45911 }
45912
45913 /* Expand SSE2 sequence for computing lfloor or lceil (depending on
45914    DO_FLOOR) from OP1, storing the result into OP0. */
45915 void
45916 ix86_expand_lfloorceil (rtx op0, rtx op1, bool do_floor)
45917 {
45918 /* C code for the stuff we're doing below (for do_floor):
45919 xi = (long)op1;
45920 xi -= (double)xi > op1 ? 1 : 0;
45921 return xi;
45922 */
45923 machine_mode fmode = GET_MODE (op1);
45924 machine_mode imode = GET_MODE (op0);
45925 rtx ireg, freg, tmp;
45926 rtx_code_label *label;
45927
45928 /* reg = (long)op1 */
45929 ireg = gen_reg_rtx (imode);
45930 expand_fix (ireg, op1, 0);
45931
45932 /* freg = (double)reg */
45933 freg = gen_reg_rtx (fmode);
45934 expand_float (freg, ireg, 0);
45935
45936 /* ireg = (freg > op1) ? ireg - 1 : ireg */
45937 label = ix86_expand_sse_compare_and_jump (UNLE,
45938 freg, op1, !do_floor);
45939 tmp = expand_simple_binop (imode, do_floor ? MINUS : PLUS,
45940 ireg, const1_rtx, NULL_RTX, 0, OPTAB_DIRECT);
45941 emit_move_insn (ireg, tmp);
45942
45943 emit_label (label);
45944 LABEL_NUSES (label) = 1;
45945
45946 emit_move_insn (op0, ireg);
45947 }
45948
45949 /* Expand rint (IEEE round to nearest) rounding OPERAND1 and storing the
45950 result in OPERAND0. */
45951 void
45952 ix86_expand_rint (rtx operand0, rtx operand1)
45953 {
45954 /* C code for the stuff we're doing below:
45955 xa = fabs (operand1);
45956 if (!isless (xa, 2**52))
45957 return operand1;
45958 xa = xa + 2**52 - 2**52;
45959 return copysign (xa, operand1);
45960 */
45961 machine_mode mode = GET_MODE (operand0);
45962 rtx res, xa, TWO52, mask;
45963 rtx_code_label *label;
45964
45965 res = gen_reg_rtx (mode);
45966 emit_move_insn (res, operand1);
45967
45968 /* xa = abs (operand1) */
45969 xa = ix86_expand_sse_fabs (res, &mask);
45970
45971 /* if (!isless (xa, TWO52)) goto label; */
45972 TWO52 = ix86_gen_TWO52 (mode);
45973 label = ix86_expand_sse_compare_and_jump (UNLE, TWO52, xa, false);
45974
45975 xa = expand_simple_binop (mode, PLUS, xa, TWO52, NULL_RTX, 0, OPTAB_DIRECT);
45976 xa = expand_simple_binop (mode, MINUS, xa, TWO52, xa, 0, OPTAB_DIRECT);
45977
45978 ix86_sse_copysign_to_positive (res, xa, res, mask);
45979
45980 emit_label (label);
45981 LABEL_NUSES (label) = 1;
45982
45983 emit_move_insn (operand0, res);
45984 }
45985
45986 /* Expand SSE2 sequence for computing floor or ceil from OPERAND1 storing
45987 into OPERAND0. */
45988 void
45989 ix86_expand_floorceildf_32 (rtx operand0, rtx operand1, bool do_floor)
45990 {
45991 /* C code for the stuff we expand below.
45992 double xa = fabs (x), x2;
45993 if (!isless (xa, TWO52))
45994 return x;
45995 xa = xa + TWO52 - TWO52;
45996 x2 = copysign (xa, x);
45997 Compensate. Floor:
45998 if (x2 > x)
45999 x2 -= 1;
46000 Compensate. Ceil:
46001 if (x2 < x)
46002 x2 -= -1;
46003 return x2;
46004 */
46005 machine_mode mode = GET_MODE (operand0);
46006 rtx xa, TWO52, tmp, one, res, mask;
46007 rtx_code_label *label;
46008
46009 TWO52 = ix86_gen_TWO52 (mode);
46010
46011 /* Temporary for holding the result, initialized to the input
46012 operand to ease control flow. */
46013 res = gen_reg_rtx (mode);
46014 emit_move_insn (res, operand1);
46015
46016 /* xa = abs (operand1) */
46017 xa = ix86_expand_sse_fabs (res, &mask);
46018
46019 /* if (!isless (xa, TWO52)) goto label; */
46020 label = ix86_expand_sse_compare_and_jump (UNLE, TWO52, xa, false);
46021
46022 /* xa = xa + TWO52 - TWO52; */
46023 xa = expand_simple_binop (mode, PLUS, xa, TWO52, NULL_RTX, 0, OPTAB_DIRECT);
46024 xa = expand_simple_binop (mode, MINUS, xa, TWO52, xa, 0, OPTAB_DIRECT);
46025
46026 /* xa = copysign (xa, operand1) */
46027 ix86_sse_copysign_to_positive (xa, xa, res, mask);
46028
46029 /* generate 1.0 or -1.0 */
46030 one = force_reg (mode,
46031 const_double_from_real_value (do_floor
46032 ? dconst1 : dconstm1, mode));
46033
46034 /* Compensate: xa = xa - (xa > operand1 ? 1 : 0) */
46035 tmp = ix86_expand_sse_compare_mask (UNGT, xa, res, !do_floor);
46036 emit_insn (gen_rtx_SET (tmp, gen_rtx_AND (mode, one, tmp)));
46037 /* We always need to subtract here to preserve signed zero. */
46038 tmp = expand_simple_binop (mode, MINUS,
46039 xa, tmp, NULL_RTX, 0, OPTAB_DIRECT);
46040 emit_move_insn (res, tmp);
46041
46042 emit_label (label);
46043 LABEL_NUSES (label) = 1;
46044
46045 emit_move_insn (operand0, res);
46046 }
46047
46048 /* Expand SSE2 sequence for computing floor or ceil from OPERAND1 storing
46049 into OPERAND0. */
46050 void
46051 ix86_expand_floorceil (rtx operand0, rtx operand1, bool do_floor)
46052 {
46053 /* C code for the stuff we expand below.
46054 double xa = fabs (x), x2;
46055 if (!isless (xa, TWO52))
46056 return x;
46057 x2 = (double)(long)x;
46058 Compensate. Floor:
46059 if (x2 > x)
46060 x2 -= 1;
46061 Compensate. Ceil:
46062 if (x2 < x)
46063 x2 += 1;
46064 if (HONOR_SIGNED_ZEROS (mode))
46065 return copysign (x2, x);
46066 return x2;
46067 */
46068 machine_mode mode = GET_MODE (operand0);
46069 rtx xa, xi, TWO52, tmp, one, res, mask;
46070 rtx_code_label *label;
46071
46072 TWO52 = ix86_gen_TWO52 (mode);
46073
46074 /* Temporary for holding the result, initialized to the input
46075 operand to ease control flow. */
46076 res = gen_reg_rtx (mode);
46077 emit_move_insn (res, operand1);
46078
46079 /* xa = abs (operand1) */
46080 xa = ix86_expand_sse_fabs (res, &mask);
46081
46082 /* if (!isless (xa, TWO52)) goto label; */
46083 label = ix86_expand_sse_compare_and_jump (UNLE, TWO52, xa, false);
46084
46085 /* xa = (double)(long)x */
46086 xi = gen_reg_rtx (mode == DFmode ? DImode : SImode);
46087 expand_fix (xi, res, 0);
46088 expand_float (xa, xi, 0);
46089
46090 /* generate 1.0 */
46091 one = force_reg (mode, const_double_from_real_value (dconst1, mode));
46092
46093 /* Compensate: xa = xa - (xa > operand1 ? 1 : 0) */
46094 tmp = ix86_expand_sse_compare_mask (UNGT, xa, res, !do_floor);
46095 emit_insn (gen_rtx_SET (tmp, gen_rtx_AND (mode, one, tmp)));
46096 tmp = expand_simple_binop (mode, do_floor ? MINUS : PLUS,
46097 xa, tmp, NULL_RTX, 0, OPTAB_DIRECT);
46098 emit_move_insn (res, tmp);
46099
46100 if (HONOR_SIGNED_ZEROS (mode))
46101 ix86_sse_copysign_to_positive (res, res, force_reg (mode, operand1), mask);
46102
46103 emit_label (label);
46104 LABEL_NUSES (label) = 1;
46105
46106 emit_move_insn (operand0, res);
46107 }
46108
46109 /* Expand SSE sequence for computing round from OPERAND1 storing
46110    into OPERAND0.  This sequence works without relying on DImode truncation
46111    via cvttsd2siq, which is only available on 64-bit targets. */
46112 void
46113 ix86_expand_rounddf_32 (rtx operand0, rtx operand1)
46114 {
46115 /* C code for the stuff we expand below.
46116 double xa = fabs (x), xa2, x2;
46117 if (!isless (xa, TWO52))
46118 return x;
46119 Using the absolute value and copying back sign makes
46120 -0.0 -> -0.0 correct.
46121 xa2 = xa + TWO52 - TWO52;
46122 Compensate.
46123 dxa = xa2 - xa;
46124 if (dxa <= -0.5)
46125 xa2 += 1;
46126 else if (dxa > 0.5)
46127 xa2 -= 1;
46128 x2 = copysign (xa2, x);
46129 return x2;
46130 */
46131 machine_mode mode = GET_MODE (operand0);
46132 rtx xa, xa2, dxa, TWO52, tmp, half, mhalf, one, res, mask;
46133 rtx_code_label *label;
46134
46135 TWO52 = ix86_gen_TWO52 (mode);
46136
46137 /* Temporary for holding the result, initialized to the input
46138 operand to ease control flow. */
46139 res = gen_reg_rtx (mode);
46140 emit_move_insn (res, operand1);
46141
46142 /* xa = abs (operand1) */
46143 xa = ix86_expand_sse_fabs (res, &mask);
46144
46145 /* if (!isless (xa, TWO52)) goto label; */
46146 label = ix86_expand_sse_compare_and_jump (UNLE, TWO52, xa, false);
46147
46148 /* xa2 = xa + TWO52 - TWO52; */
46149 xa2 = expand_simple_binop (mode, PLUS, xa, TWO52, NULL_RTX, 0, OPTAB_DIRECT);
46150 xa2 = expand_simple_binop (mode, MINUS, xa2, TWO52, xa2, 0, OPTAB_DIRECT);
46151
46152 /* dxa = xa2 - xa; */
46153 dxa = expand_simple_binop (mode, MINUS, xa2, xa, NULL_RTX, 0, OPTAB_DIRECT);
46154
46155 /* generate 0.5, 1.0 and -0.5 */
46156 half = force_reg (mode, const_double_from_real_value (dconsthalf, mode));
46157 one = expand_simple_binop (mode, PLUS, half, half, NULL_RTX, 0, OPTAB_DIRECT);
46158 mhalf = expand_simple_binop (mode, MINUS, half, one, NULL_RTX,
46159 0, OPTAB_DIRECT);
46160
46161 /* Compensate. */
46162 tmp = gen_reg_rtx (mode);
46163 /* xa2 = xa2 - (dxa > 0.5 ? 1 : 0) */
46164 tmp = ix86_expand_sse_compare_mask (UNGT, dxa, half, false);
46165 emit_insn (gen_rtx_SET (tmp, gen_rtx_AND (mode, one, tmp)));
46166 xa2 = expand_simple_binop (mode, MINUS, xa2, tmp, NULL_RTX, 0, OPTAB_DIRECT);
46167 /* xa2 = xa2 + (dxa <= -0.5 ? 1 : 0) */
46168 tmp = ix86_expand_sse_compare_mask (UNGE, mhalf, dxa, false);
46169 emit_insn (gen_rtx_SET (tmp, gen_rtx_AND (mode, one, tmp)));
46170 xa2 = expand_simple_binop (mode, PLUS, xa2, tmp, NULL_RTX, 0, OPTAB_DIRECT);
46171
46172 /* res = copysign (xa2, operand1) */
46173 ix86_sse_copysign_to_positive (res, xa2, force_reg (mode, operand1), mask);
46174
46175 emit_label (label);
46176 LABEL_NUSES (label) = 1;
46177
46178 emit_move_insn (operand0, res);
46179 }
46180
46181 /* Expand SSE sequence for computing trunc from OPERAND1 storing
46182 into OPERAND0. */
46183 void
46184 ix86_expand_trunc (rtx operand0, rtx operand1)
46185 {
46186 /* C code for SSE variant we expand below.
46187 double xa = fabs (x), x2;
46188 if (!isless (xa, TWO52))
46189 return x;
46190 x2 = (double)(long)x;
46191 if (HONOR_SIGNED_ZEROS (mode))
46192 return copysign (x2, x);
46193 return x2;
46194 */
46195 machine_mode mode = GET_MODE (operand0);
46196 rtx xa, xi, TWO52, res, mask;
46197 rtx_code_label *label;
46198
46199 TWO52 = ix86_gen_TWO52 (mode);
46200
46201 /* Temporary for holding the result, initialized to the input
46202 operand to ease control flow. */
46203 res = gen_reg_rtx (mode);
46204 emit_move_insn (res, operand1);
46205
46206 /* xa = abs (operand1) */
46207 xa = ix86_expand_sse_fabs (res, &mask);
46208
46209 /* if (!isless (xa, TWO52)) goto label; */
46210 label = ix86_expand_sse_compare_and_jump (UNLE, TWO52, xa, false);
46211
46212 /* x = (double)(long)x */
46213 xi = gen_reg_rtx (mode == DFmode ? DImode : SImode);
46214 expand_fix (xi, res, 0);
46215 expand_float (res, xi, 0);
46216
46217 if (HONOR_SIGNED_ZEROS (mode))
46218 ix86_sse_copysign_to_positive (res, res, force_reg (mode, operand1), mask);
46219
46220 emit_label (label);
46221 LABEL_NUSES (label) = 1;
46222
46223 emit_move_insn (operand0, res);
46224 }
46225
46226 /* Expand SSE sequence for computing trunc from OPERAND1 storing
46227 into OPERAND0. */
46228 void
46229 ix86_expand_truncdf_32 (rtx operand0, rtx operand1)
46230 {
46231 machine_mode mode = GET_MODE (operand0);
46232 rtx xa, mask, TWO52, one, res, smask, tmp;
46233 rtx_code_label *label;
46234
46235 /* C code for SSE variant we expand below.
46236 double xa = fabs (x), x2;
46237 if (!isless (xa, TWO52))
46238 return x;
46239 xa2 = xa + TWO52 - TWO52;
46240 Compensate:
46241 if (xa2 > xa)
46242 xa2 -= 1.0;
46243 x2 = copysign (xa2, x);
46244 return x2;
46245 */
46246
46247 TWO52 = ix86_gen_TWO52 (mode);
46248
46249 /* Temporary for holding the result, initialized to the input
46250 operand to ease control flow. */
46251 res = gen_reg_rtx (mode);
46252 emit_move_insn (res, operand1);
46253
46254 /* xa = abs (operand1) */
46255 xa = ix86_expand_sse_fabs (res, &smask);
46256
46257 /* if (!isless (xa, TWO52)) goto label; */
46258 label = ix86_expand_sse_compare_and_jump (UNLE, TWO52, xa, false);
46259
46260 /* res = xa + TWO52 - TWO52; */
46261 tmp = expand_simple_binop (mode, PLUS, xa, TWO52, NULL_RTX, 0, OPTAB_DIRECT);
46262 tmp = expand_simple_binop (mode, MINUS, tmp, TWO52, tmp, 0, OPTAB_DIRECT);
46263 emit_move_insn (res, tmp);
46264
46265 /* generate 1.0 */
46266 one = force_reg (mode, const_double_from_real_value (dconst1, mode));
46267
46268 /* Compensate: res = xa2 - (res > xa ? 1 : 0) */
46269 mask = ix86_expand_sse_compare_mask (UNGT, res, xa, false);
46270 emit_insn (gen_rtx_SET (mask, gen_rtx_AND (mode, mask, one)));
46271 tmp = expand_simple_binop (mode, MINUS,
46272 res, mask, NULL_RTX, 0, OPTAB_DIRECT);
46273 emit_move_insn (res, tmp);
46274
46275 /* res = copysign (res, operand1) */
46276 ix86_sse_copysign_to_positive (res, res, force_reg (mode, operand1), smask);
46277
46278 emit_label (label);
46279 LABEL_NUSES (label) = 1;
46280
46281 emit_move_insn (operand0, res);
46282 }
46283
46284 /* Expand SSE sequence for computing round from OPERAND1 storing
46285 into OPERAND0. */
46286 void
46287 ix86_expand_round (rtx operand0, rtx operand1)
46288 {
46289 /* C code for the stuff we're doing below:
46290 double xa = fabs (x);
46291 if (!isless (xa, TWO52))
46292 return x;
46293 xa = (double)(long)(xa + nextafter (0.5, 0.0));
46294 return copysign (xa, x);
46295 */
46296 machine_mode mode = GET_MODE (operand0);
46297 rtx res, TWO52, xa, xi, half, mask;
46298 rtx_code_label *label;
46299 const struct real_format *fmt;
46300 REAL_VALUE_TYPE pred_half, half_minus_pred_half;
46301
46302 /* Temporary for holding the result, initialized to the input
46303 operand to ease control flow. */
46304 res = gen_reg_rtx (mode);
46305 emit_move_insn (res, operand1);
46306
46307 TWO52 = ix86_gen_TWO52 (mode);
46308 xa = ix86_expand_sse_fabs (res, &mask);
46309 label = ix86_expand_sse_compare_and_jump (UNLE, TWO52, xa, false);
46310
46311 /* load nextafter (0.5, 0.0) */
46312 fmt = REAL_MODE_FORMAT (mode);
46313 real_2expN (&half_minus_pred_half, -(fmt->p) - 1, mode);
46314 real_arithmetic (&pred_half, MINUS_EXPR, &dconsthalf, &half_minus_pred_half);
46315
46316 /* xa = xa + 0.5 */
46317 half = force_reg (mode, const_double_from_real_value (pred_half, mode));
46318 xa = expand_simple_binop (mode, PLUS, xa, half, NULL_RTX, 0, OPTAB_DIRECT);
46319
46320 /* xa = (double)(int64_t)xa */
46321 xi = gen_reg_rtx (mode == DFmode ? DImode : SImode);
46322 expand_fix (xi, xa, 0);
46323 expand_float (xa, xi, 0);
46324
46325 /* res = copysign (xa, operand1) */
46326 ix86_sse_copysign_to_positive (res, xa, force_reg (mode, operand1), mask);
46327
46328 emit_label (label);
46329 LABEL_NUSES (label) = 1;
46330
46331 emit_move_insn (operand0, res);
46332 }
46333
46334 /* Expand SSE sequence for computing round
46335    from OP1 storing into OP0 using the SSE4.1 round insn. */
46336 void
46337 ix86_expand_round_sse4 (rtx op0, rtx op1)
46338 {
46339 machine_mode mode = GET_MODE (op0);
46340 rtx e1, e2, res, half;
46341 const struct real_format *fmt;
46342 REAL_VALUE_TYPE pred_half, half_minus_pred_half;
46343 rtx (*gen_copysign) (rtx, rtx, rtx);
46344 rtx (*gen_round) (rtx, rtx, rtx);
46345
46346 switch (mode)
46347 {
46348 case SFmode:
46349 gen_copysign = gen_copysignsf3;
46350 gen_round = gen_sse4_1_roundsf2;
46351 break;
46352 case DFmode:
46353 gen_copysign = gen_copysigndf3;
46354 gen_round = gen_sse4_1_rounddf2;
46355 break;
46356 default:
46357 gcc_unreachable ();
46358 }
46359
46360 /* round (a) = trunc (a + copysign (0.5, a)) */
46361
46362 /* load nextafter (0.5, 0.0) */
46363 fmt = REAL_MODE_FORMAT (mode);
46364 real_2expN (&half_minus_pred_half, -(fmt->p) - 1, mode);
46365 real_arithmetic (&pred_half, MINUS_EXPR, &dconsthalf, &half_minus_pred_half);
46366 half = const_double_from_real_value (pred_half, mode);
46367
46368 /* e1 = copysign (0.5, op1) */
46369 e1 = gen_reg_rtx (mode);
46370 emit_insn (gen_copysign (e1, half, op1));
46371
46372 /* e2 = op1 + e1 */
46373 e2 = expand_simple_binop (mode, PLUS, op1, e1, NULL_RTX, 0, OPTAB_DIRECT);
46374
46375 /* res = trunc (e2) */
46376 res = gen_reg_rtx (mode);
46377 emit_insn (gen_round (res, e2, GEN_INT (ROUND_TRUNC)));
46378
46379 emit_move_insn (op0, res);
46380 }
46381 \f
46382
46383 /* Table of valid machine attributes. */
46384 static const struct attribute_spec ix86_attribute_table[] =
46385 {
46386 /* { name, min_len, max_len, decl_req, type_req, fn_type_req, handler,
46387 affects_type_identity } */
46388 /* Stdcall attribute says callee is responsible for popping arguments
46389 if they are not variable. */
46390 { "stdcall", 0, 0, false, true, true, ix86_handle_cconv_attribute,
46391 true },
46392 /* Fastcall attribute says callee is responsible for popping arguments
46393 if they are not variable. */
46394 { "fastcall", 0, 0, false, true, true, ix86_handle_cconv_attribute,
46395 true },
46396 /* Thiscall attribute says callee is responsible for popping arguments
46397 if they are not variable. */
46398 { "thiscall", 0, 0, false, true, true, ix86_handle_cconv_attribute,
46399 true },
46400 /* Cdecl attribute says the callee is a normal C declaration */
46401 { "cdecl", 0, 0, false, true, true, ix86_handle_cconv_attribute,
46402 true },
46403 /* Regparm attribute specifies how many integer arguments are to be
46404 passed in registers. */
46405 { "regparm", 1, 1, false, true, true, ix86_handle_cconv_attribute,
46406 true },
46407 /* Sseregparm attribute says we are using x86_64 calling conventions
46408 for FP arguments. */
46409 { "sseregparm", 0, 0, false, true, true, ix86_handle_cconv_attribute,
46410 true },
46411 /* The transactional memory builtins are implicitly regparm or fastcall
46412 depending on the ABI. Override the generic do-nothing attribute that
46413 these builtins were declared with. */
46414 { "*tm regparm", 0, 0, false, true, true, ix86_handle_tm_regparm_attribute,
46415 true },
46416 /* force_align_arg_pointer says this function realigns the stack at entry. */
46417 { (const char *)&ix86_force_align_arg_pointer_string, 0, 0,
46418 false, true, true, ix86_handle_force_align_arg_pointer_attribute, false },
46419 #if TARGET_DLLIMPORT_DECL_ATTRIBUTES
46420 { "dllimport", 0, 0, false, false, false, handle_dll_attribute, false },
46421 { "dllexport", 0, 0, false, false, false, handle_dll_attribute, false },
46422 { "shared", 0, 0, true, false, false, ix86_handle_shared_attribute,
46423 false },
46424 #endif
46425 { "ms_struct", 0, 0, false, false, false, ix86_handle_struct_attribute,
46426 false },
46427 { "gcc_struct", 0, 0, false, false, false, ix86_handle_struct_attribute,
46428 false },
46429 #ifdef SUBTARGET_ATTRIBUTE_TABLE
46430 SUBTARGET_ATTRIBUTE_TABLE,
46431 #endif
46432 /* ms_abi and sysv_abi calling convention function attributes. */
46433 { "ms_abi", 0, 0, false, true, true, ix86_handle_abi_attribute, true },
46434 { "sysv_abi", 0, 0, false, true, true, ix86_handle_abi_attribute, true },
46435 { "ms_abi va_list", 0, 0, false, false, false, NULL, false },
46436 { "sysv_abi va_list", 0, 0, false, false, false, NULL, false },
46437 { "ms_hook_prologue", 0, 0, true, false, false, ix86_handle_fndecl_attribute,
46438 false },
46439 { "callee_pop_aggregate_return", 1, 1, false, true, true,
46440 ix86_handle_callee_pop_aggregate_return, true },
46441 { "interrupt", 0, 0, false, true, true,
46442 ix86_handle_interrupt_attribute, false },
46443 { "no_caller_saved_registers", 0, 0, false, true, true,
46444 ix86_handle_no_caller_saved_registers_attribute, false },
46445
46446 /* End element. */
46447 { NULL, 0, 0, false, false, false, NULL, false }
46448 };
46449
46450 /* Implement targetm.vectorize.builtin_vectorization_cost. */
46451 static int
46452 ix86_builtin_vectorization_cost (enum vect_cost_for_stmt type_of_cost,
46453 tree vectype, int)
46454 {
46455 switch (type_of_cost)
46456 {
46457 case scalar_stmt:
46458 return ix86_cost->scalar_stmt_cost;
46459
46460 case scalar_load:
46461 return ix86_cost->scalar_load_cost;
46462
46463 case scalar_store:
46464 return ix86_cost->scalar_store_cost;
46465
46466 case vector_stmt:
46467 return ix86_cost->vec_stmt_cost;
46468
46469 case vector_load:
46470 return ix86_cost->vec_align_load_cost;
46471
46472 case vector_store:
46473 return ix86_cost->vec_store_cost;
46474
46475 case vec_to_scalar:
46476 return ix86_cost->vec_to_scalar_cost;
46477
46478 case scalar_to_vec:
46479 return ix86_cost->scalar_to_vec_cost;
46480
46481 case unaligned_load:
46482 case unaligned_store:
46483 return ix86_cost->vec_unalign_load_cost;
46484
46485 case cond_branch_taken:
46486 return ix86_cost->cond_taken_branch_cost;
46487
46488 case cond_branch_not_taken:
46489 return ix86_cost->cond_not_taken_branch_cost;
46490
46491 case vec_perm:
46492 case vec_promote_demote:
46493 return ix86_cost->vec_stmt_cost;
46494
46495 case vec_construct:
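      /* Building an N-element vector from scalars costs roughly N - 1
	 insert/concatenate operations, hence the (subparts - 1) factor.  */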
46496 return ix86_cost->vec_stmt_cost * (TYPE_VECTOR_SUBPARTS (vectype) - 1);
46497
46498 default:
46499 gcc_unreachable ();
46500 }
46501 }
46502
46503 /* A cached (set (nil) (vselect (vconcat (nil) (nil)) (parallel [])))
46504 insn, so that expand_vselect{,_vconcat} doesn't have to create a fresh
46505 insn every time. */
46506
46507 static GTY(()) rtx_insn *vselect_insn;
46508
46509 /* Initialize vselect_insn. */
46510
46511 static void
46512 init_vselect_insn (void)
46513 {
46514 unsigned i;
46515 rtx x;
46516
46517 x = gen_rtx_PARALLEL (VOIDmode, rtvec_alloc (MAX_VECT_LEN));
46518 for (i = 0; i < MAX_VECT_LEN; ++i)
46519 XVECEXP (x, 0, i) = const0_rtx;
46520 x = gen_rtx_VEC_SELECT (V2DFmode, gen_rtx_VEC_CONCAT (V4DFmode, const0_rtx,
46521 const0_rtx), x);
46522 x = gen_rtx_SET (const0_rtx, x);
46523 start_sequence ();
46524 vselect_insn = emit_insn (x);
46525 end_sequence ();
46526 }
46527
46528 /* Construct (set target (vec_select op0 (parallel perm))) and
46529 return true if that's a valid instruction in the active ISA. */
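/* For instance, with a V2DFmode OP0 and perm = { 1, 0 }, this forms

     (set target (vec_select:V2DF op0 (parallel [(const_int 1) (const_int 0)])))

   i.e. a swap of the two elements, and asks recog whether the active ISA
   has a pattern (typically a shufpd-style shuffle) that matches it.  */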
46530
46531 static bool
46532 expand_vselect (rtx target, rtx op0, const unsigned char *perm,
46533 unsigned nelt, bool testing_p)
46534 {
46535 unsigned int i;
46536 rtx x, save_vconcat;
46537 int icode;
46538
46539 if (vselect_insn == NULL_RTX)
46540 init_vselect_insn ();
46541
46542 x = XEXP (SET_SRC (PATTERN (vselect_insn)), 1);
46543 PUT_NUM_ELEM (XVEC (x, 0), nelt);
46544 for (i = 0; i < nelt; ++i)
46545 XVECEXP (x, 0, i) = GEN_INT (perm[i]);
46546 save_vconcat = XEXP (SET_SRC (PATTERN (vselect_insn)), 0);
46547 XEXP (SET_SRC (PATTERN (vselect_insn)), 0) = op0;
46548 PUT_MODE (SET_SRC (PATTERN (vselect_insn)), GET_MODE (target));
46549 SET_DEST (PATTERN (vselect_insn)) = target;
46550 icode = recog_memoized (vselect_insn);
46551
46552 if (icode >= 0 && !testing_p)
46553 emit_insn (copy_rtx (PATTERN (vselect_insn)));
46554
46555 SET_DEST (PATTERN (vselect_insn)) = const0_rtx;
46556 XEXP (SET_SRC (PATTERN (vselect_insn)), 0) = save_vconcat;
46557 INSN_CODE (vselect_insn) = -1;
46558
46559 return icode >= 0;
46560 }
46561
46562 /* Similar, but generate a vec_concat from op0 and op1 as well. */
46563
46564 static bool
46565 expand_vselect_vconcat (rtx target, rtx op0, rtx op1,
46566 const unsigned char *perm, unsigned nelt,
46567 bool testing_p)
46568 {
46569 machine_mode v2mode;
46570 rtx x;
46571 bool ok;
46572
46573 if (vselect_insn == NULL_RTX)
46574 init_vselect_insn ();
46575
46576 v2mode = GET_MODE_2XWIDER_MODE (GET_MODE (op0));
46577 x = XEXP (SET_SRC (PATTERN (vselect_insn)), 0);
46578 PUT_MODE (x, v2mode);
46579 XEXP (x, 0) = op0;
46580 XEXP (x, 1) = op1;
46581 ok = expand_vselect (target, x, perm, nelt, testing_p);
46582 XEXP (x, 0) = const0_rtx;
46583 XEXP (x, 1) = const0_rtx;
46584 return ok;
46585 }
46586
46587 /* A subroutine of ix86_expand_vec_perm_builtin_1. Try to implement D
46588 in terms of blendp[sd] / pblendw / pblendvb / vpblendd. */
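/* Illustrative example: a blend never moves an element to a different
   position, it only chooses which operand supplies each position.  On
   V4SFmode the permutation {0, 5, 2, 7} takes elements 0 and 2 from op0
   and elements 1 and 3 from op1, which the code below encodes as the
   immediate mask 0b1010.  */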
46589
46590 static bool
46591 expand_vec_perm_blend (struct expand_vec_perm_d *d)
46592 {
46593 machine_mode mmode, vmode = d->vmode;
46594 unsigned i, mask, nelt = d->nelt;
46595 rtx target, op0, op1, maskop, x;
46596 rtx rperm[32], vperm;
46597
46598 if (d->one_operand_p)
46599 return false;
46600 if (TARGET_AVX512F && GET_MODE_SIZE (vmode) == 64
46601 && (TARGET_AVX512BW
46602 || GET_MODE_UNIT_SIZE (vmode) >= 4))
46603 ;
46604 else if (TARGET_AVX2 && GET_MODE_SIZE (vmode) == 32)
46605 ;
46606 else if (TARGET_AVX && (vmode == V4DFmode || vmode == V8SFmode))
46607 ;
46608 else if (TARGET_SSE4_1 && GET_MODE_SIZE (vmode) == 16)
46609 ;
46610 else
46611 return false;
46612
46613 /* This is a blend, not a permute. Elements must stay in their
46614 respective lanes. */
46615 for (i = 0; i < nelt; ++i)
46616 {
46617 unsigned e = d->perm[i];
46618 if (!(e == i || e == i + nelt))
46619 return false;
46620 }
46621
46622 if (d->testing_p)
46623 return true;
46624
46625 /* ??? Without SSE4.1, we could implement this with and/andn/or. This
46626 decision should be extracted elsewhere, so that we only try that
46627 sequence once all budget==3 options have been tried. */
46628 target = d->target;
46629 op0 = d->op0;
46630 op1 = d->op1;
46631 mask = 0;
46632
46633 switch (vmode)
46634 {
46635 case V8DFmode:
46636 case V16SFmode:
46637 case V4DFmode:
46638 case V8SFmode:
46639 case V2DFmode:
46640 case V4SFmode:
46641 case V8HImode:
46642 case V8SImode:
46643 case V32HImode:
46644 case V64QImode:
46645 case V16SImode:
46646 case V8DImode:
46647 for (i = 0; i < nelt; ++i)
46648 mask |= (d->perm[i] >= nelt) << i;
46649 break;
46650
46651 case V2DImode:
46652 for (i = 0; i < 2; ++i)
46653 mask |= (d->perm[i] >= 2 ? 15 : 0) << (i * 4);
46654 vmode = V8HImode;
46655 goto do_subreg;
46656
46657 case V4SImode:
46658 for (i = 0; i < 4; ++i)
46659 mask |= (d->perm[i] >= 4 ? 3 : 0) << (i * 2);
46660 vmode = V8HImode;
46661 goto do_subreg;
46662
46663 case V16QImode:
46664 /* See if bytes move in pairs so we can use pblendw with
46665 an immediate argument, rather than pblendvb with a vector
46666 argument. */
46667 for (i = 0; i < 16; i += 2)
46668 if (d->perm[i] + 1 != d->perm[i + 1])
46669 {
46670 use_pblendvb:
46671 for (i = 0; i < nelt; ++i)
46672 rperm[i] = (d->perm[i] < nelt ? const0_rtx : constm1_rtx);
46673
46674 finish_pblendvb:
46675 vperm = gen_rtx_CONST_VECTOR (vmode, gen_rtvec_v (nelt, rperm));
46676 vperm = force_reg (vmode, vperm);
46677
46678 if (GET_MODE_SIZE (vmode) == 16)
46679 emit_insn (gen_sse4_1_pblendvb (target, op0, op1, vperm));
46680 else
46681 emit_insn (gen_avx2_pblendvb (target, op0, op1, vperm));
46682 if (target != d->target)
46683 emit_move_insn (d->target, gen_lowpart (d->vmode, target));
46684 return true;
46685 }
46686
46687 for (i = 0; i < 8; ++i)
46688 mask |= (d->perm[i * 2] >= 16) << i;
46689 vmode = V8HImode;
46690 /* FALLTHRU */
46691
46692 do_subreg:
46693 target = gen_reg_rtx (vmode);
46694 op0 = gen_lowpart (vmode, op0);
46695 op1 = gen_lowpart (vmode, op1);
46696 break;
46697
46698 case V32QImode:
46699 /* See if bytes move in pairs. If not, vpblendvb must be used. */
46700 for (i = 0; i < 32; i += 2)
46701 if (d->perm[i] + 1 != d->perm[i + 1])
46702 goto use_pblendvb;
46703 /* See if bytes move in quadruplets. If yes, vpblendd
46704 with immediate can be used. */
46705 for (i = 0; i < 32; i += 4)
46706 if (d->perm[i] + 2 != d->perm[i + 2])
46707 break;
46708 if (i < 32)
46709 {
46710 /* See if bytes move the same in both lanes. If yes,
46711 vpblendw with immediate can be used. */
46712 for (i = 0; i < 16; i += 2)
46713 if (d->perm[i] + 16 != d->perm[i + 16])
46714 goto use_pblendvb;
46715
46716 /* Use vpblendw. */
46717 for (i = 0; i < 16; ++i)
46718 mask |= (d->perm[i * 2] >= 32) << i;
46719 vmode = V16HImode;
46720 goto do_subreg;
46721 }
46722
46723 /* Use vpblendd. */
46724 for (i = 0; i < 8; ++i)
46725 mask |= (d->perm[i * 4] >= 32) << i;
46726 vmode = V8SImode;
46727 goto do_subreg;
46728
46729 case V16HImode:
46730 /* See if words move in pairs. If yes, vpblendd can be used. */
46731 for (i = 0; i < 16; i += 2)
46732 if (d->perm[i] + 1 != d->perm[i + 1])
46733 break;
46734 if (i < 16)
46735 {
46736 /* See if words move the same in both lanes. If not,
46737 vpblendvb must be used. */
46738 for (i = 0; i < 8; i++)
46739 if (d->perm[i] + 8 != d->perm[i + 8])
46740 {
46741 /* Use vpblendvb. */
46742 for (i = 0; i < 32; ++i)
46743 rperm[i] = (d->perm[i / 2] < 16 ? const0_rtx : constm1_rtx);
46744
46745 vmode = V32QImode;
46746 nelt = 32;
46747 target = gen_reg_rtx (vmode);
46748 op0 = gen_lowpart (vmode, op0);
46749 op1 = gen_lowpart (vmode, op1);
46750 goto finish_pblendvb;
46751 }
46752
46753 /* Use vpblendw. */
46754 for (i = 0; i < 16; ++i)
46755 mask |= (d->perm[i] >= 16) << i;
46756 break;
46757 }
46758
46759 /* Use vpblendd. */
46760 for (i = 0; i < 8; ++i)
46761 mask |= (d->perm[i * 2] >= 16) << i;
46762 vmode = V8SImode;
46763 goto do_subreg;
46764
46765 case V4DImode:
46766 /* Use vpblendd. */
46767 for (i = 0; i < 4; ++i)
46768 mask |= (d->perm[i] >= 4 ? 3 : 0) << (i * 2);
46769 vmode = V8SImode;
46770 goto do_subreg;
46771
46772 default:
46773 gcc_unreachable ();
46774 }
46775
46776 switch (vmode)
46777 {
46778 case V8DFmode:
46779 case V8DImode:
46780 mmode = QImode;
46781 break;
46782 case V16SFmode:
46783 case V16SImode:
46784 mmode = HImode;
46785 break;
46786 case V32HImode:
46787 mmode = SImode;
46788 break;
46789 case V64QImode:
46790 mmode = DImode;
46791 break;
46792 default:
46793 mmode = VOIDmode;
46794 }
46795
46796 if (mmode != VOIDmode)
46797 maskop = force_reg (mmode, gen_int_mode (mask, mmode));
46798 else
46799 maskop = GEN_INT (mask);
46800
46801 /* This matches five different patterns with the different modes. */
46802 x = gen_rtx_VEC_MERGE (vmode, op1, op0, maskop);
46803 x = gen_rtx_SET (target, x);
46804 emit_insn (x);
46805 if (target != d->target)
46806 emit_move_insn (d->target, gen_lowpart (d->vmode, target));
46807
46808 return true;
46809 }
46810
46811 /* A subroutine of ix86_expand_vec_perm_builtin_1. Try to implement D
46812 in terms of the variable form of vpermilps.
46813
46814 Note that we will have already failed the immediate input vpermilps,
46815 which requires that the high and low part shuffle be identical; the
46816 variable form doesn't require that. */
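/* For instance, the V8SFmode permutation {3, 2, 1, 0, 7, 6, 5, 4} reverses
   each 128-bit lane and is accepted here, whereas {4, 5, 6, 7, 0, 1, 2, 3}
   would move elements across lanes and is rejected by the check below.  */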
46817
46818 static bool
46819 expand_vec_perm_vpermil (struct expand_vec_perm_d *d)
46820 {
46821 rtx rperm[8], vperm;
46822 unsigned i;
46823
46824 if (!TARGET_AVX || d->vmode != V8SFmode || !d->one_operand_p)
46825 return false;
46826
46827 /* We can only permute within the 128-bit lane. */
46828 for (i = 0; i < 8; ++i)
46829 {
46830 unsigned e = d->perm[i];
46831 if (i < 4 ? e >= 4 : e < 4)
46832 return false;
46833 }
46834
46835 if (d->testing_p)
46836 return true;
46837
46838 for (i = 0; i < 8; ++i)
46839 {
46840 unsigned e = d->perm[i];
46841
46842 /* Within each 128-bit lane, the elements of op0 are numbered
46843 from 0 and the elements of op1 are numbered from 4. */
46844 if (e >= 8 + 4)
46845 e -= 8;
46846 else if (e >= 4)
46847 e -= 4;
46848
46849 rperm[i] = GEN_INT (e);
46850 }
46851
46852 vperm = gen_rtx_CONST_VECTOR (V8SImode, gen_rtvec_v (8, rperm));
46853 vperm = force_reg (V8SImode, vperm);
46854 emit_insn (gen_avx_vpermilvarv8sf3 (d->target, d->op0, vperm));
46855
46856 return true;
46857 }
46858
46859 /* Return true if permutation D can be performed as VMODE permutation
46860 instead. */
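/* For example, a V16QImode permutation whose bytes move in aligned pairs,
   such as {2, 3, 6, 7, 10, 11, 14, 15, 2, 3, 6, 7, 10, 11, 14, 15}, can
   equally be performed as the V8HImode permutation {1, 3, 5, 7, 1, 3, 5, 7},
   so this predicate would return true for that wider mode.  */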
46861
46862 static bool
46863 valid_perm_using_mode_p (machine_mode vmode, struct expand_vec_perm_d *d)
46864 {
46865 unsigned int i, j, chunk;
46866
46867 if (GET_MODE_CLASS (vmode) != MODE_VECTOR_INT
46868 || GET_MODE_CLASS (d->vmode) != MODE_VECTOR_INT
46869 || GET_MODE_SIZE (vmode) != GET_MODE_SIZE (d->vmode))
46870 return false;
46871
46872 if (GET_MODE_NUNITS (vmode) >= d->nelt)
46873 return true;
46874
46875 chunk = d->nelt / GET_MODE_NUNITS (vmode);
46876 for (i = 0; i < d->nelt; i += chunk)
46877 if (d->perm[i] & (chunk - 1))
46878 return false;
46879 else
46880 for (j = 1; j < chunk; ++j)
46881 if (d->perm[i] + j != d->perm[i + j])
46882 return false;
46883
46884 return true;
46885 }
46886
46887 /* A subroutine of ix86_expand_vec_perm_builtin_1. Try to implement D
46888 in terms of pshufb, vpperm, vpermq, vpermd, vpermps or vperm2i128. */
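/* The byte-level control vector is built by expanding each element index
   into GET_MODE_UNIT_SIZE consecutive byte indexes.  E.g. selecting element
   3 of a V8HImode vector becomes the byte pair {6, 7} in the V16QImode
   pshufb mask constructed below.  */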
46889
46890 static bool
46891 expand_vec_perm_pshufb (struct expand_vec_perm_d *d)
46892 {
46893 unsigned i, nelt, eltsz, mask;
46894 unsigned char perm[64];
46895 machine_mode vmode = V16QImode;
46896 rtx rperm[64], vperm, target, op0, op1;
46897
46898 nelt = d->nelt;
46899
46900 if (!d->one_operand_p)
46901 {
46902 if (!TARGET_XOP || GET_MODE_SIZE (d->vmode) != 16)
46903 {
46904 if (TARGET_AVX2
46905 && valid_perm_using_mode_p (V2TImode, d))
46906 {
46907 if (d->testing_p)
46908 return true;
46909
46910 /* Use vperm2i128 insn. The pattern uses
46911 V4DImode instead of V2TImode. */
46912 target = d->target;
46913 if (d->vmode != V4DImode)
46914 target = gen_reg_rtx (V4DImode);
46915 op0 = gen_lowpart (V4DImode, d->op0);
46916 op1 = gen_lowpart (V4DImode, d->op1);
46917 rperm[0]
46918 = GEN_INT ((d->perm[0] / (nelt / 2))
46919 | ((d->perm[nelt / 2] / (nelt / 2)) * 16));
46920 emit_insn (gen_avx2_permv2ti (target, op0, op1, rperm[0]));
46921 if (target != d->target)
46922 emit_move_insn (d->target, gen_lowpart (d->vmode, target));
46923 return true;
46924 }
46925 return false;
46926 }
46927 }
46928 else
46929 {
46930 if (GET_MODE_SIZE (d->vmode) == 16)
46931 {
46932 if (!TARGET_SSSE3)
46933 return false;
46934 }
46935 else if (GET_MODE_SIZE (d->vmode) == 32)
46936 {
46937 if (!TARGET_AVX2)
46938 return false;
46939
46940 /* V4DImode should already be handled through
46941 expand_vselect by the vpermq instruction. */
46942 gcc_assert (d->vmode != V4DImode);
46943
46944 vmode = V32QImode;
46945 if (d->vmode == V8SImode
46946 || d->vmode == V16HImode
46947 || d->vmode == V32QImode)
46948 {
46949 /* First see if vpermq can be used for
46950 V8SImode/V16HImode/V32QImode. */
46951 if (valid_perm_using_mode_p (V4DImode, d))
46952 {
46953 for (i = 0; i < 4; i++)
46954 perm[i] = (d->perm[i * nelt / 4] * 4 / nelt) & 3;
46955 if (d->testing_p)
46956 return true;
46957 target = gen_reg_rtx (V4DImode);
46958 if (expand_vselect (target, gen_lowpart (V4DImode, d->op0),
46959 perm, 4, false))
46960 {
46961 emit_move_insn (d->target,
46962 gen_lowpart (d->vmode, target));
46963 return true;
46964 }
46965 return false;
46966 }
46967
46968 /* Next see if vpermd can be used. */
46969 if (valid_perm_using_mode_p (V8SImode, d))
46970 vmode = V8SImode;
46971 }
46972 /* Or if vpermps can be used. */
46973 else if (d->vmode == V8SFmode)
46974 vmode = V8SImode;
46975
46976 if (vmode == V32QImode)
46977 {
46978 /* vpshufb only works within 128-bit lanes; it is not
46979 possible to shuffle bytes across lane boundaries. */
46980 for (i = 0; i < nelt; ++i)
46981 if ((d->perm[i] ^ i) & (nelt / 2))
46982 return false;
46983 }
46984 }
46985 else if (GET_MODE_SIZE (d->vmode) == 64)
46986 {
46987 if (!TARGET_AVX512BW)
46988 return false;
46989
46990 /* If vpermq didn't work, vpshufb won't work either. */
46991 if (d->vmode == V8DFmode || d->vmode == V8DImode)
46992 return false;
46993
46994 vmode = V64QImode;
46995 if (d->vmode == V16SImode
46996 || d->vmode == V32HImode
46997 || d->vmode == V64QImode)
46998 {
46999 /* First see if vpermq can be used for
47000 V16SImode/V32HImode/V64QImode. */
47001 if (valid_perm_using_mode_p (V8DImode, d))
47002 {
47003 for (i = 0; i < 8; i++)
47004 perm[i] = (d->perm[i * nelt / 8] * 8 / nelt) & 7;
47005 if (d->testing_p)
47006 return true;
47007 target = gen_reg_rtx (V8DImode);
47008 if (expand_vselect (target, gen_lowpart (V8DImode, d->op0),
47009 perm, 8, false))
47010 {
47011 emit_move_insn (d->target,
47012 gen_lowpart (d->vmode, target));
47013 return true;
47014 }
47015 return false;
47016 }
47017
47018 /* Next see if vpermd can be used. */
47019 if (valid_perm_using_mode_p (V16SImode, d))
47020 vmode = V16SImode;
47021 }
47022 /* Or if vpermps can be used. */
47023 else if (d->vmode == V16SFmode)
47024 vmode = V16SImode;
47025 if (vmode == V64QImode)
47026 {
47027 /* vpshufb only works within 128-bit lanes; it is not
47028 possible to shuffle bytes across lane boundaries. */
47029 for (i = 0; i < nelt; ++i)
47030 if ((d->perm[i] ^ i) & (nelt / 4))
47031 return false;
47032 }
47033 }
47034 else
47035 return false;
47036 }
47037
47038 if (d->testing_p)
47039 return true;
47040
47041 if (vmode == V8SImode)
47042 for (i = 0; i < 8; ++i)
47043 rperm[i] = GEN_INT ((d->perm[i * nelt / 8] * 8 / nelt) & 7);
47044 else if (vmode == V16SImode)
47045 for (i = 0; i < 16; ++i)
47046 rperm[i] = GEN_INT ((d->perm[i * nelt / 16] * 16 / nelt) & 15);
47047 else
47048 {
47049 eltsz = GET_MODE_UNIT_SIZE (d->vmode);
47050 if (!d->one_operand_p)
47051 mask = 2 * nelt - 1;
47052 else if (vmode == V16QImode)
47053 mask = nelt - 1;
47054 else if (vmode == V64QImode)
47055 mask = nelt / 4 - 1;
47056 else
47057 mask = nelt / 2 - 1;
47058
47059 for (i = 0; i < nelt; ++i)
47060 {
47061 unsigned j, e = d->perm[i] & mask;
47062 for (j = 0; j < eltsz; ++j)
47063 rperm[i * eltsz + j] = GEN_INT (e * eltsz + j);
47064 }
47065 }
47066
47067 vperm = gen_rtx_CONST_VECTOR (vmode,
47068 gen_rtvec_v (GET_MODE_NUNITS (vmode), rperm));
47069 vperm = force_reg (vmode, vperm);
47070
47071 target = d->target;
47072 if (d->vmode != vmode)
47073 target = gen_reg_rtx (vmode);
47074 op0 = gen_lowpart (vmode, d->op0);
47075 if (d->one_operand_p)
47076 {
47077 if (vmode == V16QImode)
47078 emit_insn (gen_ssse3_pshufbv16qi3 (target, op0, vperm));
47079 else if (vmode == V32QImode)
47080 emit_insn (gen_avx2_pshufbv32qi3 (target, op0, vperm));
47081 else if (vmode == V64QImode)
47082 emit_insn (gen_avx512bw_pshufbv64qi3 (target, op0, vperm));
47083 else if (vmode == V8SFmode)
47084 emit_insn (gen_avx2_permvarv8sf (target, op0, vperm));
47085 else if (vmode == V8SImode)
47086 emit_insn (gen_avx2_permvarv8si (target, op0, vperm));
47087 else if (vmode == V16SFmode)
47088 emit_insn (gen_avx512f_permvarv16sf (target, op0, vperm));
47089 else if (vmode == V16SImode)
47090 emit_insn (gen_avx512f_permvarv16si (target, op0, vperm));
47091 else
47092 gcc_unreachable ();
47093 }
47094 else
47095 {
47096 op1 = gen_lowpart (vmode, d->op1);
47097 emit_insn (gen_xop_pperm (target, op0, op1, vperm));
47098 }
47099 if (target != d->target)
47100 emit_move_insn (d->target, gen_lowpart (d->vmode, target));
47101
47102 return true;
47103 }
47104
47105 /* For V*[QHS]Imode permutations, check whether the same permutation
47106 can be performed in a 2x, 4x or 8x wider inner mode. */
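/* For example, the V8HImode permutation {2, 3, 0, 1, 6, 7, 4, 5} moves
   words in aligned pairs, so it is rewritten as the V4SImode permutation
   {1, 0, 3, 2}; the recursion then stops because those dwords no longer
   pair up.  */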
47107
47108 static bool
47109 canonicalize_vector_int_perm (const struct expand_vec_perm_d *d,
47110 struct expand_vec_perm_d *nd)
47111 {
47112 int i;
47113 enum machine_mode mode = VOIDmode;
47114
47115 switch (d->vmode)
47116 {
47117 case V16QImode: mode = V8HImode; break;
47118 case V32QImode: mode = V16HImode; break;
47119 case V64QImode: mode = V32HImode; break;
47120 case V8HImode: mode = V4SImode; break;
47121 case V16HImode: mode = V8SImode; break;
47122 case V32HImode: mode = V16SImode; break;
47123 case V4SImode: mode = V2DImode; break;
47124 case V8SImode: mode = V4DImode; break;
47125 case V16SImode: mode = V8DImode; break;
47126 default: return false;
47127 }
47128 for (i = 0; i < d->nelt; i += 2)
47129 if ((d->perm[i] & 1) || d->perm[i + 1] != d->perm[i] + 1)
47130 return false;
47131 nd->vmode = mode;
47132 nd->nelt = d->nelt / 2;
47133 for (i = 0; i < nd->nelt; i++)
47134 nd->perm[i] = d->perm[2 * i] / 2;
47135 if (GET_MODE_INNER (mode) != DImode)
47136 canonicalize_vector_int_perm (nd, nd);
47137 if (nd != d)
47138 {
47139 nd->one_operand_p = d->one_operand_p;
47140 nd->testing_p = d->testing_p;
47141 if (d->op0 == d->op1)
47142 nd->op0 = nd->op1 = gen_lowpart (nd->vmode, d->op0);
47143 else
47144 {
47145 nd->op0 = gen_lowpart (nd->vmode, d->op0);
47146 nd->op1 = gen_lowpart (nd->vmode, d->op1);
47147 }
47148 if (d->testing_p)
47149 nd->target = gen_raw_REG (nd->vmode, LAST_VIRTUAL_REGISTER + 1);
47150 else
47151 nd->target = gen_reg_rtx (nd->vmode);
47152 }
47153 return true;
47154 }
47155
47156 /* Try to expand one-operand permutation with constant mask. */
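/* Note: the permvar patterns used here take their index vector from a
   register even though the permutation is a compile-time constant, which is
   why the constant mask is materialized with force_reg below.  */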
47157
47158 static bool
47159 ix86_expand_vec_one_operand_perm_avx512 (struct expand_vec_perm_d *d)
47160 {
47161 machine_mode mode = GET_MODE (d->op0);
47162 machine_mode maskmode = mode;
47163 rtx (*gen) (rtx, rtx, rtx) = NULL;
47164 rtx target, op0, mask;
47165 rtx vec[64];
47166
47167 if (!rtx_equal_p (d->op0, d->op1))
47168 return false;
47169
47170 if (!TARGET_AVX512F)
47171 return false;
47172
47173 switch (mode)
47174 {
47175 case V16SImode:
47176 gen = gen_avx512f_permvarv16si;
47177 break;
47178 case V16SFmode:
47179 gen = gen_avx512f_permvarv16sf;
47180 maskmode = V16SImode;
47181 break;
47182 case V8DImode:
47183 gen = gen_avx512f_permvarv8di;
47184 break;
47185 case V8DFmode:
47186 gen = gen_avx512f_permvarv8df;
47187 maskmode = V8DImode;
47188 break;
47189 default:
47190 return false;
47191 }
47192
47193 target = d->target;
47194 op0 = d->op0;
47195 for (int i = 0; i < d->nelt; ++i)
47196 vec[i] = GEN_INT (d->perm[i]);
47197 mask = gen_rtx_CONST_VECTOR (maskmode, gen_rtvec_v (d->nelt, vec));
47198 emit_insn (gen (target, op0, force_reg (maskmode, mask)));
47199 return true;
47200 }
47201
47202 /* A subroutine of ix86_expand_vec_perm_builtin_1. Try to instantiate D
47203 in a single instruction. */
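/* The strategies below are tried roughly from cheapest to most general:
   identity and broadcast first, then a plain vec_select, interleave and
   shufps-style selects on a doubled operand, the general two-operand
   vec_select + vec_concat form, and finally blend, vpermil, pshufb/vpperm,
   palignr and the AVX-512 variable permutes.  */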
47204
47205 static bool
47206 expand_vec_perm_1 (struct expand_vec_perm_d *d)
47207 {
47208 unsigned i, nelt = d->nelt;
47209 struct expand_vec_perm_d nd;
47210
47211 /* Check plain VEC_SELECT first, because AVX has instructions that could
47212 match both SEL and SEL+CONCAT, but the plain SEL will allow a memory
47213 input where SEL+CONCAT may not. */
47214 if (d->one_operand_p)
47215 {
47216 int mask = nelt - 1;
47217 bool identity_perm = true;
47218 bool broadcast_perm = true;
47219
47220 for (i = 0; i < nelt; i++)
47221 {
47222 nd.perm[i] = d->perm[i] & mask;
47223 if (nd.perm[i] != i)
47224 identity_perm = false;
47225 if (nd.perm[i])
47226 broadcast_perm = false;
47227 }
47228
47229 if (identity_perm)
47230 {
47231 if (!d->testing_p)
47232 emit_move_insn (d->target, d->op0);
47233 return true;
47234 }
47235 else if (broadcast_perm && TARGET_AVX2)
47236 {
47237 /* Use vpbroadcast{b,w,d}. */
47238 rtx (*gen) (rtx, rtx) = NULL;
47239 switch (d->vmode)
47240 {
47241 case V64QImode:
47242 if (TARGET_AVX512BW)
47243 gen = gen_avx512bw_vec_dupv64qi_1;
47244 break;
47245 case V32QImode:
47246 gen = gen_avx2_pbroadcastv32qi_1;
47247 break;
47248 case V32HImode:
47249 if (TARGET_AVX512BW)
47250 gen = gen_avx512bw_vec_dupv32hi_1;
47251 break;
47252 case V16HImode:
47253 gen = gen_avx2_pbroadcastv16hi_1;
47254 break;
47255 case V16SImode:
47256 if (TARGET_AVX512F)
47257 gen = gen_avx512f_vec_dupv16si_1;
47258 break;
47259 case V8SImode:
47260 gen = gen_avx2_pbroadcastv8si_1;
47261 break;
47262 case V16QImode:
47263 gen = gen_avx2_pbroadcastv16qi;
47264 break;
47265 case V8HImode:
47266 gen = gen_avx2_pbroadcastv8hi;
47267 break;
47268 case V16SFmode:
47269 if (TARGET_AVX512F)
47270 gen = gen_avx512f_vec_dupv16sf_1;
47271 break;
47272 case V8SFmode:
47273 gen = gen_avx2_vec_dupv8sf_1;
47274 break;
47275 case V8DFmode:
47276 if (TARGET_AVX512F)
47277 gen = gen_avx512f_vec_dupv8df_1;
47278 break;
47279 case V8DImode:
47280 if (TARGET_AVX512F)
47281 gen = gen_avx512f_vec_dupv8di_1;
47282 break;
47283 /* For other modes prefer other shuffles this function creates. */
47284 default: break;
47285 }
47286 if (gen != NULL)
47287 {
47288 if (!d->testing_p)
47289 emit_insn (gen (d->target, d->op0));
47290 return true;
47291 }
47292 }
47293
47294 if (expand_vselect (d->target, d->op0, nd.perm, nelt, d->testing_p))
47295 return true;
47296
47297 /* There are plenty of patterns in sse.md that are written for
47298 SEL+CONCAT and are not replicated for a single op. Perhaps
47299 that should be changed, to avoid the nastiness here. */
47300
47301 /* Recognize interleave style patterns, which means incrementing
47302 every other permutation operand. */
47303 for (i = 0; i < nelt; i += 2)
47304 {
47305 nd.perm[i] = d->perm[i] & mask;
47306 nd.perm[i + 1] = (d->perm[i + 1] & mask) + nelt;
47307 }
47308 if (expand_vselect_vconcat (d->target, d->op0, d->op0, nd.perm, nelt,
47309 d->testing_p))
47310 return true;
47311
47312 /* Recognize shufps, which means adding {0, 0, nelt, nelt}. */
47313 if (nelt >= 4)
47314 {
47315 for (i = 0; i < nelt; i += 4)
47316 {
47317 nd.perm[i + 0] = d->perm[i + 0] & mask;
47318 nd.perm[i + 1] = d->perm[i + 1] & mask;
47319 nd.perm[i + 2] = (d->perm[i + 2] & mask) + nelt;
47320 nd.perm[i + 3] = (d->perm[i + 3] & mask) + nelt;
47321 }
47322
47323 if (expand_vselect_vconcat (d->target, d->op0, d->op0, nd.perm, nelt,
47324 d->testing_p))
47325 return true;
47326 }
47327 }
47328
47329 /* Finally, try the fully general two operand permute. */
47330 if (expand_vselect_vconcat (d->target, d->op0, d->op1, d->perm, nelt,
47331 d->testing_p))
47332 return true;
47333
47334 /* Recognize interleave style patterns with reversed operands. */
47335 if (!d->one_operand_p)
47336 {
47337 for (i = 0; i < nelt; ++i)
47338 {
47339 unsigned e = d->perm[i];
47340 if (e >= nelt)
47341 e -= nelt;
47342 else
47343 e += nelt;
47344 nd.perm[i] = e;
47345 }
47346
47347 if (expand_vselect_vconcat (d->target, d->op1, d->op0, nd.perm, nelt,
47348 d->testing_p))
47349 return true;
47350 }
47351
47352 /* Try the SSE4.1 blend variable merge instructions. */
47353 if (expand_vec_perm_blend (d))
47354 return true;
47355
47356 /* Try one of the AVX vpermil variable permutations. */
47357 if (expand_vec_perm_vpermil (d))
47358 return true;
47359
47360 /* Try the SSSE3 pshufb or XOP vpperm or AVX2 vperm2i128,
47361 vpshufb, vpermd, vpermps or vpermq variable permutation. */
47362 if (expand_vec_perm_pshufb (d))
47363 return true;
47364
47365 /* Try the AVX2 vpalignr instruction. */
47366 if (expand_vec_perm_palignr (d, true))
47367 return true;
47368
47369 /* Try the AVX512F vperm{s,d} instructions. */
47370 if (ix86_expand_vec_one_operand_perm_avx512 (d))
47371 return true;
47372
47373 /* Try the AVX512F vpermi2 instructions. */
47374 if (ix86_expand_vec_perm_vpermi2 (NULL_RTX, NULL_RTX, NULL_RTX, NULL_RTX, d))
47375 return true;
47376
47377 /* See if we can get the same permutation in different vector integer
47378 mode. */
47379 if (canonicalize_vector_int_perm (d, &nd) && expand_vec_perm_1 (&nd))
47380 {
47381 if (!d->testing_p)
47382 emit_move_insn (d->target, gen_lowpart (d->vmode, nd.target));
47383 return true;
47384 }
47385 return false;
47386 }
47387
47388 /* A subroutine of ix86_expand_vec_perm_builtin_1. Try to implement D
47389 in terms of a pair of pshuflw + pshufhw instructions. */
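/* For instance, the V8HImode permutation {3, 1, 2, 0, 5, 7, 6, 4} keeps
   indexes 0-3 in the low quadword and 4-7 in the high quadword, so it is
   emitted as pshuflw with {3, 1, 2, 0} followed by pshufhw with
   {5, 7, 6, 4}.  */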
47390
47391 static bool
47392 expand_vec_perm_pshuflw_pshufhw (struct expand_vec_perm_d *d)
47393 {
47394 unsigned char perm2[MAX_VECT_LEN];
47395 unsigned i;
47396 bool ok;
47397
47398 if (d->vmode != V8HImode || !d->one_operand_p)
47399 return false;
47400
47401 /* The two permutations only operate in 64-bit lanes. */
47402 for (i = 0; i < 4; ++i)
47403 if (d->perm[i] >= 4)
47404 return false;
47405 for (i = 4; i < 8; ++i)
47406 if (d->perm[i] < 4)
47407 return false;
47408
47409 if (d->testing_p)
47410 return true;
47411
47412 /* Emit the pshuflw. */
47413 memcpy (perm2, d->perm, 4);
47414 for (i = 4; i < 8; ++i)
47415 perm2[i] = i;
47416 ok = expand_vselect (d->target, d->op0, perm2, 8, d->testing_p);
47417 gcc_assert (ok);
47418
47419 /* Emit the pshufhw. */
47420 memcpy (perm2 + 4, d->perm + 4, 4);
47421 for (i = 0; i < 4; ++i)
47422 perm2[i] = i;
47423 ok = expand_vselect (d->target, d->target, perm2, 8, d->testing_p);
47424 gcc_assert (ok);
47425
47426 return true;
47427 }
47428
47429 /* A subroutine of ix86_expand_vec_perm_builtin_1. Try to simplify
47430 the permutation using the SSSE3 palignr instruction. This succeeds
47431 when all of the elements in PERM fit within one vector and we merely
47432 need to shift them down so that a single vector permutation has a
47433 chance to succeed. If SINGLE_INSN_ONLY_P, succeed only if
47434 the vpalignr instruction by itself can perform the requested permutation. */
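/* For instance, a V16QImode permutation selecting bytes {3, 4, ..., 18} of
   the two-operand concatenation fits in a 16-byte window starting at byte 3;
   palignr shifts that window into one register, after which the remaining
   single-operand permutation is the identity.  */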
47435
47436 static bool
47437 expand_vec_perm_palignr (struct expand_vec_perm_d *d, bool single_insn_only_p)
47438 {
47439 unsigned i, nelt = d->nelt;
47440 unsigned min, max, minswap, maxswap;
47441 bool in_order, ok, swap = false;
47442 rtx shift, target;
47443 struct expand_vec_perm_d dcopy;
47444
47445 /* Even with AVX, palignr only operates on 128-bit vectors;
47446 with AVX2, palignr operates on both 128-bit lanes independently. */
47447 if ((!TARGET_SSSE3 || GET_MODE_SIZE (d->vmode) != 16)
47448 && (!TARGET_AVX2 || GET_MODE_SIZE (d->vmode) != 32))
47449 return false;
47450
47451 min = 2 * nelt;
47452 max = 0;
47453 minswap = 2 * nelt;
47454 maxswap = 0;
47455 for (i = 0; i < nelt; ++i)
47456 {
47457 unsigned e = d->perm[i];
47458 unsigned eswap = d->perm[i] ^ nelt;
47459 if (GET_MODE_SIZE (d->vmode) == 32)
47460 {
47461 e = (e & ((nelt / 2) - 1)) | ((e & nelt) >> 1);
47462 eswap = e ^ (nelt / 2);
47463 }
47464 if (e < min)
47465 min = e;
47466 if (e > max)
47467 max = e;
47468 if (eswap < minswap)
47469 minswap = eswap;
47470 if (eswap > maxswap)
47471 maxswap = eswap;
47472 }
47473 if (min == 0
47474 || max - min >= (GET_MODE_SIZE (d->vmode) == 32 ? nelt / 2 : nelt))
47475 {
47476 if (d->one_operand_p
47477 || minswap == 0
47478 || maxswap - minswap >= (GET_MODE_SIZE (d->vmode) == 32
47479 ? nelt / 2 : nelt))
47480 return false;
47481 swap = true;
47482 min = minswap;
47483 max = maxswap;
47484 }
47485
47486 /* Given that we have SSSE3, we know we'll be able to implement the
47487 single operand permutation after the palignr with pshufb for
47488 128-bit vectors. If SINGLE_INSN_ONLY_P, in_order has to be computed
47489 first. */
47490 if (d->testing_p && GET_MODE_SIZE (d->vmode) == 16 && !single_insn_only_p)
47491 return true;
47492
47493 dcopy = *d;
47494 if (swap)
47495 {
47496 dcopy.op0 = d->op1;
47497 dcopy.op1 = d->op0;
47498 for (i = 0; i < nelt; ++i)
47499 dcopy.perm[i] ^= nelt;
47500 }
47501
47502 in_order = true;
47503 for (i = 0; i < nelt; ++i)
47504 {
47505 unsigned e = dcopy.perm[i];
47506 if (GET_MODE_SIZE (d->vmode) == 32
47507 && e >= nelt
47508 && (e & (nelt / 2 - 1)) < min)
47509 e = e - min - (nelt / 2);
47510 else
47511 e = e - min;
47512 if (e != i)
47513 in_order = false;
47514 dcopy.perm[i] = e;
47515 }
47516 dcopy.one_operand_p = true;
47517
47518 if (single_insn_only_p && !in_order)
47519 return false;
47520
47521 /* For AVX2, test whether we can permute the result in one instruction. */
47522 if (d->testing_p)
47523 {
47524 if (in_order)
47525 return true;
47526 dcopy.op1 = dcopy.op0;
47527 return expand_vec_perm_1 (&dcopy);
47528 }
47529
47530 shift = GEN_INT (min * GET_MODE_UNIT_BITSIZE (d->vmode));
47531 if (GET_MODE_SIZE (d->vmode) == 16)
47532 {
47533 target = gen_reg_rtx (TImode);
47534 emit_insn (gen_ssse3_palignrti (target, gen_lowpart (TImode, dcopy.op1),
47535 gen_lowpart (TImode, dcopy.op0), shift));
47536 }
47537 else
47538 {
47539 target = gen_reg_rtx (V2TImode);
47540 emit_insn (gen_avx2_palignrv2ti (target,
47541 gen_lowpart (V2TImode, dcopy.op1),
47542 gen_lowpart (V2TImode, dcopy.op0),
47543 shift));
47544 }
47545
47546 dcopy.op0 = dcopy.op1 = gen_lowpart (d->vmode, target);
47547
47548 /* Test for the degenerate case where the alignment by itself
47549 produces the desired permutation. */
47550 if (in_order)
47551 {
47552 emit_move_insn (d->target, dcopy.op0);
47553 return true;
47554 }
47555
47556 ok = expand_vec_perm_1 (&dcopy);
47557 gcc_assert (ok || GET_MODE_SIZE (d->vmode) == 32);
47558
47559 return ok;
47560 }
47561
47562 /* A subroutine of ix86_expand_vec_perm_const_1. Try to simplify
47563 the permutation using the SSE4_1 pblendv instruction. Potentially
47564 reduces the permutation from 2 pshufb insns and an ior to 1 pshufb and a pblendv. */
47565
47566 static bool
47567 expand_vec_perm_pblendv (struct expand_vec_perm_d *d)
47568 {
47569 unsigned i, which, nelt = d->nelt;
47570 struct expand_vec_perm_d dcopy, dcopy1;
47571 machine_mode vmode = d->vmode;
47572 bool ok;
47573
47574 /* Use the same checks as in expand_vec_perm_blend. */
47575 if (d->one_operand_p)
47576 return false;
47577 if (TARGET_AVX2 && GET_MODE_SIZE (vmode) == 32)
47578 ;
47579 else if (TARGET_AVX && (vmode == V4DFmode || vmode == V8SFmode))
47580 ;
47581 else if (TARGET_SSE4_1 && GET_MODE_SIZE (vmode) == 16)
47582 ;
47583 else
47584 return false;
47585
47586 /* Figure out which permutation elements do not stay in their
47587 respective lanes. */
47588 for (i = 0, which = 0; i < nelt; ++i)
47589 {
47590 unsigned e = d->perm[i];
47591 if (e != i)
47592 which |= (e < nelt ? 1 : 2);
47593 }
47594 /* We can pblend the part where elements do not stay in their
47595 respective lanes only when these elements all come from one
47596 half of the permutation.
47597 {0 1 8 3 4 5 9 7} is ok: 8 and 9 are not in their respective
47598 lanes, but both 8 and 9 are >= 8.
47599 {0 1 8 3 4 5 2 7} is not ok: 2 and 8 are not in their
47600 respective lanes, and 8 >= 8 but 2 is not. */
47601 if (which != 1 && which != 2)
47602 return false;
47603 if (d->testing_p && GET_MODE_SIZE (vmode) == 16)
47604 return true;
47605
47606 /* First we apply a one-operand permutation to the part whose
47607 elements do not stay in their respective lanes. */
47608 dcopy = *d;
47609 if (which == 2)
47610 dcopy.op0 = dcopy.op1 = d->op1;
47611 else
47612 dcopy.op0 = dcopy.op1 = d->op0;
47613 if (!d->testing_p)
47614 dcopy.target = gen_reg_rtx (vmode);
47615 dcopy.one_operand_p = true;
47616
47617 for (i = 0; i < nelt; ++i)
47618 dcopy.perm[i] = d->perm[i] & (nelt - 1);
47619
47620 ok = expand_vec_perm_1 (&dcopy);
47621 if (GET_MODE_SIZE (vmode) != 16 && !ok)
47622 return false;
47623 else
47624 gcc_assert (ok);
47625 if (d->testing_p)
47626 return true;
47627
47628 /* Next we put permuted elements into their positions. */
47629 dcopy1 = *d;
47630 if (which == 2)
47631 dcopy1.op1 = dcopy.target;
47632 else
47633 dcopy1.op0 = dcopy.target;
47634
47635 for (i = 0; i < nelt; ++i)
47636 dcopy1.perm[i] = ((d->perm[i] >= nelt) ? (nelt + i) : i);
47637
47638 ok = expand_vec_perm_blend (&dcopy1);
47639 gcc_assert (ok);
47640
47641 return true;
47642 }
47643
47644 static bool expand_vec_perm_interleave3 (struct expand_vec_perm_d *d);
47645
47646 /* A subroutine of ix86_expand_vec_perm_builtin_1. Try to simplify
47647 a two vector permutation into a single vector permutation by using
47648 an interleave operation to merge the vectors. */
47649
47650 static bool
47651 expand_vec_perm_interleave2 (struct expand_vec_perm_d *d)
47652 {
47653 struct expand_vec_perm_d dremap, dfinal;
47654 unsigned i, nelt = d->nelt, nelt2 = nelt / 2;
47655 unsigned HOST_WIDE_INT contents;
47656 unsigned char remap[2 * MAX_VECT_LEN];
47657 rtx_insn *seq;
47658 bool ok, same_halves = false;
47659
47660 if (GET_MODE_SIZE (d->vmode) == 16)
47661 {
47662 if (d->one_operand_p)
47663 return false;
47664 }
47665 else if (GET_MODE_SIZE (d->vmode) == 32)
47666 {
47667 if (!TARGET_AVX)
47668 return false;
47669 /* For 32-byte modes allow even d->one_operand_p.
47670 The lack of cross-lane shuffling in some instructions
47671 might prevent a single insn shuffle. */
47672 dfinal = *d;
47673 dfinal.testing_p = true;
47674 /* If expand_vec_perm_interleave3 can expand this into
47675 a 3 insn sequence, give up and let it be expanded as
47676 a 3 insn sequence. While that is one insn longer,
47677 it doesn't need a memory operand, and in the common
47678 case where the interleave low and high permutations
47679 of the same operands are adjacent, it needs only 4 insns
47680 for both after CSE. */
47681 if (expand_vec_perm_interleave3 (&dfinal))
47682 return false;
47683 }
47684 else
47685 return false;
47686
47687 /* Examine from whence the elements come. */
47688 contents = 0;
47689 for (i = 0; i < nelt; ++i)
47690 contents |= HOST_WIDE_INT_1U << d->perm[i];
47691
47692 memset (remap, 0xff, sizeof (remap));
47693 dremap = *d;
47694
47695 if (GET_MODE_SIZE (d->vmode) == 16)
47696 {
47697 unsigned HOST_WIDE_INT h1, h2, h3, h4;
47698
47699 /* Split the two input vectors into 4 halves. */
47700 h1 = (HOST_WIDE_INT_1U << nelt2) - 1;
47701 h2 = h1 << nelt2;
47702 h3 = h2 << nelt2;
47703 h4 = h3 << nelt2;
47704
47705 /* If all the elements come from the low halves, use interleave low,
47706 and similarly interleave high for the high halves. If the elements are
47707 from mis-matched halves, we can use shufps for V4SF/V4SI or do a DImode shuffle. */
47708 if ((contents & (h1 | h3)) == contents)
47709 {
47710 /* punpckl* */
47711 for (i = 0; i < nelt2; ++i)
47712 {
47713 remap[i] = i * 2;
47714 remap[i + nelt] = i * 2 + 1;
47715 dremap.perm[i * 2] = i;
47716 dremap.perm[i * 2 + 1] = i + nelt;
47717 }
47718 if (!TARGET_SSE2 && d->vmode == V4SImode)
47719 dremap.vmode = V4SFmode;
47720 }
47721 else if ((contents & (h2 | h4)) == contents)
47722 {
47723 /* punpckh* */
47724 for (i = 0; i < nelt2; ++i)
47725 {
47726 remap[i + nelt2] = i * 2;
47727 remap[i + nelt + nelt2] = i * 2 + 1;
47728 dremap.perm[i * 2] = i + nelt2;
47729 dremap.perm[i * 2 + 1] = i + nelt + nelt2;
47730 }
47731 if (!TARGET_SSE2 && d->vmode == V4SImode)
47732 dremap.vmode = V4SFmode;
47733 }
47734 else if ((contents & (h1 | h4)) == contents)
47735 {
47736 /* shufps */
47737 for (i = 0; i < nelt2; ++i)
47738 {
47739 remap[i] = i;
47740 remap[i + nelt + nelt2] = i + nelt2;
47741 dremap.perm[i] = i;
47742 dremap.perm[i + nelt2] = i + nelt + nelt2;
47743 }
47744 if (nelt != 4)
47745 {
47746 /* shufpd */
47747 dremap.vmode = V2DImode;
47748 dremap.nelt = 2;
47749 dremap.perm[0] = 0;
47750 dremap.perm[1] = 3;
47751 }
47752 }
47753 else if ((contents & (h2 | h3)) == contents)
47754 {
47755 /* shufps */
47756 for (i = 0; i < nelt2; ++i)
47757 {
47758 remap[i + nelt2] = i;
47759 remap[i + nelt] = i + nelt2;
47760 dremap.perm[i] = i + nelt2;
47761 dremap.perm[i + nelt2] = i + nelt;
47762 }
47763 if (nelt != 4)
47764 {
47765 /* shufpd */
47766 dremap.vmode = V2DImode;
47767 dremap.nelt = 2;
47768 dremap.perm[0] = 1;
47769 dremap.perm[1] = 2;
47770 }
47771 }
47772 else
47773 return false;
47774 }
47775 else
47776 {
47777 unsigned int nelt4 = nelt / 4, nzcnt = 0;
47778 unsigned HOST_WIDE_INT q[8];
47779 unsigned int nonzero_halves[4];
47780
47781 /* Split the two input vectors into 8 quarters. */
47782 q[0] = (HOST_WIDE_INT_1U << nelt4) - 1;
47783 for (i = 1; i < 8; ++i)
47784 q[i] = q[0] << (nelt4 * i);
47785 for (i = 0; i < 4; ++i)
47786 if (((q[2 * i] | q[2 * i + 1]) & contents) != 0)
47787 {
47788 nonzero_halves[nzcnt] = i;
47789 ++nzcnt;
47790 }
47791
47792 if (nzcnt == 1)
47793 {
47794 gcc_assert (d->one_operand_p);
47795 nonzero_halves[1] = nonzero_halves[0];
47796 same_halves = true;
47797 }
47798 else if (d->one_operand_p)
47799 {
47800 gcc_assert (nonzero_halves[0] == 0);
47801 gcc_assert (nonzero_halves[1] == 1);
47802 }
47803
47804 if (nzcnt <= 2)
47805 {
47806 if (d->perm[0] / nelt2 == nonzero_halves[1])
47807 {
47808 /* Attempt to increase the likelihood that dfinal
47809 shuffle will be intra-lane. */
47810 std::swap (nonzero_halves[0], nonzero_halves[1]);
47811 }
47812
47813 /* vperm2f128 or vperm2i128. */
47814 for (i = 0; i < nelt2; ++i)
47815 {
47816 remap[i + nonzero_halves[1] * nelt2] = i + nelt2;
47817 remap[i + nonzero_halves[0] * nelt2] = i;
47818 dremap.perm[i + nelt2] = i + nonzero_halves[1] * nelt2;
47819 dremap.perm[i] = i + nonzero_halves[0] * nelt2;
47820 }
47821
47822 if (d->vmode != V8SFmode
47823 && d->vmode != V4DFmode
47824 && d->vmode != V8SImode)
47825 {
47826 dremap.vmode = V8SImode;
47827 dremap.nelt = 8;
47828 for (i = 0; i < 4; ++i)
47829 {
47830 dremap.perm[i] = i + nonzero_halves[0] * 4;
47831 dremap.perm[i + 4] = i + nonzero_halves[1] * 4;
47832 }
47833 }
47834 }
47835 else if (d->one_operand_p)
47836 return false;
47837 else if (TARGET_AVX2
47838 && (contents & (q[0] | q[2] | q[4] | q[6])) == contents)
47839 {
47840 /* vpunpckl* */
47841 for (i = 0; i < nelt4; ++i)
47842 {
47843 remap[i] = i * 2;
47844 remap[i + nelt] = i * 2 + 1;
47845 remap[i + nelt2] = i * 2 + nelt2;
47846 remap[i + nelt + nelt2] = i * 2 + nelt2 + 1;
47847 dremap.perm[i * 2] = i;
47848 dremap.perm[i * 2 + 1] = i + nelt;
47849 dremap.perm[i * 2 + nelt2] = i + nelt2;
47850 dremap.perm[i * 2 + nelt2 + 1] = i + nelt + nelt2;
47851 }
47852 }
47853 else if (TARGET_AVX2
47854 && (contents & (q[1] | q[3] | q[5] | q[7])) == contents)
47855 {
47856 /* vpunpckh* */
47857 for (i = 0; i < nelt4; ++i)
47858 {
47859 remap[i + nelt4] = i * 2;
47860 remap[i + nelt + nelt4] = i * 2 + 1;
47861 remap[i + nelt2 + nelt4] = i * 2 + nelt2;
47862 remap[i + nelt + nelt2 + nelt4] = i * 2 + nelt2 + 1;
47863 dremap.perm[i * 2] = i + nelt4;
47864 dremap.perm[i * 2 + 1] = i + nelt + nelt4;
47865 dremap.perm[i * 2 + nelt2] = i + nelt2 + nelt4;
47866 dremap.perm[i * 2 + nelt2 + 1] = i + nelt + nelt2 + nelt4;
47867 }
47868 }
47869 else
47870 return false;
47871 }
47872
47873 /* Use the remapping array set up above to move the elements from their
47874 swizzled locations into their final destinations. */
47875 dfinal = *d;
47876 for (i = 0; i < nelt; ++i)
47877 {
47878 unsigned e = remap[d->perm[i]];
47879 gcc_assert (e < nelt);
47880 /* If same_halves is true, both halves of the remapped vector are the
47881 same. Avoid cross-lane accesses if possible. */
47882 if (same_halves && i >= nelt2)
47883 {
47884 gcc_assert (e < nelt2);
47885 dfinal.perm[i] = e + nelt2;
47886 }
47887 else
47888 dfinal.perm[i] = e;
47889 }
47890 if (!d->testing_p)
47891 {
47892 dremap.target = gen_reg_rtx (dremap.vmode);
47893 dfinal.op0 = gen_lowpart (dfinal.vmode, dremap.target);
47894 }
47895 dfinal.op1 = dfinal.op0;
47896 dfinal.one_operand_p = true;
47897
47898 /* Test if the final remap can be done with a single insn. For V4SFmode or
47899 V4SImode this *will* succeed. For V8HImode or V16QImode it may not. */
47900 start_sequence ();
47901 ok = expand_vec_perm_1 (&dfinal);
47902 seq = get_insns ();
47903 end_sequence ();
47904
47905 if (!ok)
47906 return false;
47907
47908 if (d->testing_p)
47909 return true;
47910
47911 if (dremap.vmode != dfinal.vmode)
47912 {
47913 dremap.op0 = gen_lowpart (dremap.vmode, dremap.op0);
47914 dremap.op1 = gen_lowpart (dremap.vmode, dremap.op1);
47915 }
47916
47917 ok = expand_vec_perm_1 (&dremap);
47918 gcc_assert (ok);
47919
47920 emit_insn (seq);
47921 return true;
47922 }
47923
47924 /* A subroutine of ix86_expand_vec_perm_builtin_1. Try to simplify
47925 a single vector cross-lane permutation into vpermq followed
47926 by any of the single insn permutations. */
47927
47928 static bool
47929 expand_vec_perm_vpermq_perm_1 (struct expand_vec_perm_d *d)
47930 {
47931 struct expand_vec_perm_d dremap, dfinal;
47932 unsigned i, j, nelt = d->nelt, nelt2 = nelt / 2, nelt4 = nelt / 4;
47933 unsigned contents[2];
47934 bool ok;
47935
47936 if (!(TARGET_AVX2
47937 && (d->vmode == V32QImode || d->vmode == V16HImode)
47938 && d->one_operand_p))
47939 return false;
47940
47941 contents[0] = 0;
47942 contents[1] = 0;
47943 for (i = 0; i < nelt2; ++i)
47944 {
47945 contents[0] |= 1u << (d->perm[i] / nelt4);
47946 contents[1] |= 1u << (d->perm[i + nelt2] / nelt4);
47947 }
47948
47949 for (i = 0; i < 2; ++i)
47950 {
47951 unsigned int cnt = 0;
47952 for (j = 0; j < 4; ++j)
47953 if ((contents[i] & (1u << j)) != 0 && ++cnt > 2)
47954 return false;
47955 }
47956
47957 if (d->testing_p)
47958 return true;
47959
47960 dremap = *d;
47961 dremap.vmode = V4DImode;
47962 dremap.nelt = 4;
47963 dremap.target = gen_reg_rtx (V4DImode);
47964 dremap.op0 = gen_lowpart (V4DImode, d->op0);
47965 dremap.op1 = dremap.op0;
47966 dremap.one_operand_p = true;
47967 for (i = 0; i < 2; ++i)
47968 {
47969 unsigned int cnt = 0;
47970 for (j = 0; j < 4; ++j)
47971 if ((contents[i] & (1u << j)) != 0)
47972 dremap.perm[2 * i + cnt++] = j;
47973 for (; cnt < 2; ++cnt)
47974 dremap.perm[2 * i + cnt] = 0;
47975 }
47976
47977 dfinal = *d;
47978 dfinal.op0 = gen_lowpart (dfinal.vmode, dremap.target);
47979 dfinal.op1 = dfinal.op0;
47980 dfinal.one_operand_p = true;
47981 for (i = 0, j = 0; i < nelt; ++i)
47982 {
47983 if (i == nelt2)
47984 j = 2;
47985 dfinal.perm[i] = (d->perm[i] & (nelt4 - 1)) | (j ? nelt2 : 0);
47986 if ((d->perm[i] / nelt4) == dremap.perm[j])
47987 ;
47988 else if ((d->perm[i] / nelt4) == dremap.perm[j + 1])
47989 dfinal.perm[i] |= nelt4;
47990 else
47991 gcc_unreachable ();
47992 }
47993
47994 ok = expand_vec_perm_1 (&dremap);
47995 gcc_assert (ok);
47996
47997 ok = expand_vec_perm_1 (&dfinal);
47998 gcc_assert (ok);
47999
48000 return true;
48001 }
48002
48003 /* A subroutine of ix86_expand_vec_perm_builtin_1. Try to expand
48004 a vector permutation using two instructions, vperm2f128 resp.
48005 vperm2i128 followed by any single in-lane permutation. */
48006
48007 static bool
48008 expand_vec_perm_vperm2f128 (struct expand_vec_perm_d *d)
48009 {
48010 struct expand_vec_perm_d dfirst, dsecond;
48011 unsigned i, j, nelt = d->nelt, nelt2 = nelt / 2, perm;
48012 bool ok;
48013
48014 if (!TARGET_AVX
48015 || GET_MODE_SIZE (d->vmode) != 32
48016 || (d->vmode != V8SFmode && d->vmode != V4DFmode && !TARGET_AVX2))
48017 return false;
48018
48019 dsecond = *d;
48020 dsecond.one_operand_p = false;
48021 dsecond.testing_p = true;
48022
48023 /* ((perm << 2)|perm) & 0x33 is the vperm2[fi]128
48024 immediate. For perm < 16 the second permutation uses
48025 d->op0 as first operand, for perm >= 16 it uses d->op1
48026 as first operand. The second operand is the result of
48027 vperm2[fi]128. */
48028 for (perm = 0; perm < 32; perm++)
48029 {
48030 /* Ignore permutations which do not move anything cross-lane. */
48031 if (perm < 16)
48032 {
48033 /* The second shuffle for e.g. V4DFmode has
48034 0123 and ABCD operands.
48035 Ignore AB23, as 23 is already in the second lane
48036 of the first operand. */
48037 if ((perm & 0xc) == (1 << 2)) continue;
48038 /* And 01CD, as 01 is in the first lane of the first
48039 operand. */
48040 if ((perm & 3) == 0) continue;
48041 /* And 4567, as then the vperm2[fi]128 doesn't change
48042 anything on the original 4567 second operand. */
48043 if ((perm & 0xf) == ((3 << 2) | 2)) continue;
48044 }
48045 else
48046 {
48047 /* The second shuffle for e.g. V4DFmode has
48048 4567 and ABCD operands.
48049 Ignore AB67, as 67 is already in the second lane
48050 of the first operand. */
48051 if ((perm & 0xc) == (3 << 2)) continue;
48052 /* And 45CD, as 45 is in the first lane of the first
48053 operand. */
48054 if ((perm & 3) == 2) continue;
48055 /* And 0123, as then the vperm2[fi]128 doesn't change
48056 anything on the original 0123 first operand. */
48057 if ((perm & 0xf) == (1 << 2)) continue;
48058 }
48059
48060 for (i = 0; i < nelt; i++)
48061 {
48062 j = d->perm[i] / nelt2;
48063 if (j == ((perm >> (2 * (i >= nelt2))) & 3))
48064 dsecond.perm[i] = nelt + (i & nelt2) + (d->perm[i] & (nelt2 - 1));
48065 else if (j == (unsigned) (i >= nelt2) + 2 * (perm >= 16))
48066 dsecond.perm[i] = d->perm[i] & (nelt - 1);
48067 else
48068 break;
48069 }
48070
48071 if (i == nelt)
48072 {
48073 start_sequence ();
48074 ok = expand_vec_perm_1 (&dsecond);
48075 end_sequence ();
48076 }
48077 else
48078 ok = false;
48079
48080 if (ok)
48081 {
48082 if (d->testing_p)
48083 return true;
48084
48085 /* Found a usable second shuffle. dfirst will be
48086 vperm2f128 on d->op0 and d->op1. */
48087 dsecond.testing_p = false;
48088 dfirst = *d;
48089 dfirst.target = gen_reg_rtx (d->vmode);
48090 for (i = 0; i < nelt; i++)
48091 dfirst.perm[i] = (i & (nelt2 - 1))
48092 + ((perm >> (2 * (i >= nelt2))) & 3) * nelt2;
48093
48094 canonicalize_perm (&dfirst);
48095 ok = expand_vec_perm_1 (&dfirst);
48096 gcc_assert (ok);
48097
48098 /* And dsecond is some single insn shuffle, taking
48099 d->op0 and result of vperm2f128 (if perm < 16) or
48100 d->op1 and result of vperm2f128 (otherwise). */
48101 if (perm >= 16)
48102 dsecond.op0 = dsecond.op1;
48103 dsecond.op1 = dfirst.target;
48104
48105 ok = expand_vec_perm_1 (&dsecond);
48106 gcc_assert (ok);
48107
48108 return true;
48109 }
48110
48111 /* For one operand, the only useful vperm2f128 permutation is 0x01
48112 aka lanes swap. */
48113 if (d->one_operand_p)
48114 return false;
48115 }
48116
48117 return false;
48118 }
48119
48120 /* A subroutine of ix86_expand_vec_perm_builtin_1. Try to simplify
48121 a two vector permutation using 2 intra-lane interleave insns
48122 and cross-lane shuffle for 32-byte vectors. */
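/* This only matches full-width interleave permutations, i.e.
   {k, k+nelt, k+1, k+1+nelt, ...} with k == 0 (interleave low) or
   k == nelt/2 (interleave high), as enforced by the checks below.  */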
48123
48124 static bool
48125 expand_vec_perm_interleave3 (struct expand_vec_perm_d *d)
48126 {
48127 unsigned i, nelt;
48128 rtx (*gen) (rtx, rtx, rtx);
48129
48130 if (d->one_operand_p)
48131 return false;
48132 if (TARGET_AVX2 && GET_MODE_SIZE (d->vmode) == 32)
48133 ;
48134 else if (TARGET_AVX && (d->vmode == V8SFmode || d->vmode == V4DFmode))
48135 ;
48136 else
48137 return false;
48138
48139 nelt = d->nelt;
48140 if (d->perm[0] != 0 && d->perm[0] != nelt / 2)
48141 return false;
48142 for (i = 0; i < nelt; i += 2)
48143 if (d->perm[i] != d->perm[0] + i / 2
48144 || d->perm[i + 1] != d->perm[0] + i / 2 + nelt)
48145 return false;
48146
48147 if (d->testing_p)
48148 return true;
48149
48150 switch (d->vmode)
48151 {
48152 case V32QImode:
48153 if (d->perm[0])
48154 gen = gen_vec_interleave_highv32qi;
48155 else
48156 gen = gen_vec_interleave_lowv32qi;
48157 break;
48158 case V16HImode:
48159 if (d->perm[0])
48160 gen = gen_vec_interleave_highv16hi;
48161 else
48162 gen = gen_vec_interleave_lowv16hi;
48163 break;
48164 case V8SImode:
48165 if (d->perm[0])
48166 gen = gen_vec_interleave_highv8si;
48167 else
48168 gen = gen_vec_interleave_lowv8si;
48169 break;
48170 case V4DImode:
48171 if (d->perm[0])
48172 gen = gen_vec_interleave_highv4di;
48173 else
48174 gen = gen_vec_interleave_lowv4di;
48175 break;
48176 case V8SFmode:
48177 if (d->perm[0])
48178 gen = gen_vec_interleave_highv8sf;
48179 else
48180 gen = gen_vec_interleave_lowv8sf;
48181 break;
48182 case V4DFmode:
48183 if (d->perm[0])
48184 gen = gen_vec_interleave_highv4df;
48185 else
48186 gen = gen_vec_interleave_lowv4df;
48187 break;
48188 default:
48189 gcc_unreachable ();
48190 }
48191
48192 emit_insn (gen (d->target, d->op0, d->op1));
48193 return true;
48194 }
48195
48196 /* A subroutine of ix86_expand_vec_perm_builtin_1. Try to implement
48197 a single vector permutation using a single intra-lane vector
48198 permutation, vperm2f128 swapping the lanes and vblend* insn blending
48199 the non-swapped and swapped vectors together. */
48200
48201 static bool
48202 expand_vec_perm_vperm2f128_vblend (struct expand_vec_perm_d *d)
48203 {
48204 struct expand_vec_perm_d dfirst, dsecond;
48205 unsigned i, j, msk, nelt = d->nelt, nelt2 = nelt / 2;
48206 rtx_insn *seq;
48207 bool ok;
48208 rtx (*blend) (rtx, rtx, rtx, rtx) = NULL;
48209
48210 if (!TARGET_AVX
48211 || TARGET_AVX2
48212 || (d->vmode != V8SFmode && d->vmode != V4DFmode)
48213 || !d->one_operand_p)
48214 return false;
48215
48216 dfirst = *d;
48217 for (i = 0; i < nelt; i++)
48218 dfirst.perm[i] = 0xff;
48219 for (i = 0, msk = 0; i < nelt; i++)
48220 {
48221 j = (d->perm[i] & nelt2) ? i | nelt2 : i & ~nelt2;
48222 if (dfirst.perm[j] != 0xff && dfirst.perm[j] != d->perm[i])
48223 return false;
48224 dfirst.perm[j] = d->perm[i];
48225 if (j != i)
48226 msk |= (1 << i);
48227 }
48228 for (i = 0; i < nelt; i++)
48229 if (dfirst.perm[i] == 0xff)
48230 dfirst.perm[i] = i;
48231
48232 if (!d->testing_p)
48233 dfirst.target = gen_reg_rtx (dfirst.vmode);
48234
48235 start_sequence ();
48236 ok = expand_vec_perm_1 (&dfirst);
48237 seq = get_insns ();
48238 end_sequence ();
48239
48240 if (!ok)
48241 return false;
48242
48243 if (d->testing_p)
48244 return true;
48245
48246 emit_insn (seq);
48247
48248 dsecond = *d;
48249 dsecond.op0 = dfirst.target;
48250 dsecond.op1 = dfirst.target;
48251 dsecond.one_operand_p = true;
48252 dsecond.target = gen_reg_rtx (dsecond.vmode);
48253 for (i = 0; i < nelt; i++)
48254 dsecond.perm[i] = i ^ nelt2;
48255
48256 ok = expand_vec_perm_1 (&dsecond);
48257 gcc_assert (ok);
48258
48259 blend = d->vmode == V8SFmode ? gen_avx_blendps256 : gen_avx_blendpd256;
48260 emit_insn (blend (d->target, dfirst.target, dsecond.target, GEN_INT (msk)));
48261 return true;
48262 }
48263
48264 /* A subroutine of ix86_expand_vec_perm_builtin_1. Implement a V4DF
48265 permutation using two vperm2f128, followed by a vshufpd insn blending
48266 the two vectors together. */
48267
48268 static bool
48269 expand_vec_perm_2vperm2f128_vshuf (struct expand_vec_perm_d *d)
48270 {
48271 struct expand_vec_perm_d dfirst, dsecond, dthird;
48272 bool ok;
48273
48274 if (!TARGET_AVX || (d->vmode != V4DFmode))
48275 return false;
48276
48277 if (d->testing_p)
48278 return true;
48279
48280 dfirst = *d;
48281 dsecond = *d;
48282 dthird = *d;
48283
48284 dfirst.perm[0] = (d->perm[0] & ~1);
48285 dfirst.perm[1] = (d->perm[0] & ~1) + 1;
48286 dfirst.perm[2] = (d->perm[2] & ~1);
48287 dfirst.perm[3] = (d->perm[2] & ~1) + 1;
48288 dsecond.perm[0] = (d->perm[1] & ~1);
48289 dsecond.perm[1] = (d->perm[1] & ~1) + 1;
48290 dsecond.perm[2] = (d->perm[3] & ~1);
48291 dsecond.perm[3] = (d->perm[3] & ~1) + 1;
48292 dthird.perm[0] = (d->perm[0] % 2);
48293 dthird.perm[1] = (d->perm[1] % 2) + 4;
48294 dthird.perm[2] = (d->perm[2] % 2) + 2;
48295 dthird.perm[3] = (d->perm[3] % 2) + 6;
48296
48297 dfirst.target = gen_reg_rtx (dfirst.vmode);
48298 dsecond.target = gen_reg_rtx (dsecond.vmode);
48299 dthird.op0 = dfirst.target;
48300 dthird.op1 = dsecond.target;
48301 dthird.one_operand_p = false;
48302
48303 canonicalize_perm (&dfirst);
48304 canonicalize_perm (&dsecond);
48305
48306 ok = expand_vec_perm_1 (&dfirst)
48307 && expand_vec_perm_1 (&dsecond)
48308 && expand_vec_perm_1 (&dthird);
48309
48310 gcc_assert (ok);
48311
48312 return true;
48313 }
48314
48315 /* A subroutine of expand_vec_perm_even_odd_1. Implement the double-word
48316 permutation with two pshufb insns and an ior. We should have already
48317 failed all two instruction sequences. */
48318
48319 static bool
48320 expand_vec_perm_pshufb2 (struct expand_vec_perm_d *d)
48321 {
48322 rtx rperm[2][16], vperm, l, h, op, m128;
48323 unsigned int i, nelt, eltsz;
48324
48325 if (!TARGET_SSSE3 || GET_MODE_SIZE (d->vmode) != 16)
48326 return false;
48327 gcc_assert (!d->one_operand_p);
48328
48329 if (d->testing_p)
48330 return true;
48331
48332 nelt = d->nelt;
48333 eltsz = GET_MODE_UNIT_SIZE (d->vmode);
48334
48335 /* Generate two permutation masks. If the required element is within
48336 the given vector it is shuffled into the proper lane. If the required
48337 element is in the other vector, force a zero into the lane by setting
48338 bit 7 in the permutation mask. */
48339 m128 = GEN_INT (-128);
48340 for (i = 0; i < nelt; ++i)
48341 {
48342 unsigned j, e = d->perm[i];
48343 unsigned which = (e >= nelt);
48344 if (e >= nelt)
48345 e -= nelt;
48346
48347 for (j = 0; j < eltsz; ++j)
48348 {
48349 rperm[which][i*eltsz + j] = GEN_INT (e*eltsz + j);
48350 rperm[1-which][i*eltsz + j] = m128;
48351 }
48352 }
48353
48354 vperm = gen_rtx_CONST_VECTOR (V16QImode, gen_rtvec_v (16, rperm[0]));
48355 vperm = force_reg (V16QImode, vperm);
48356
48357 l = gen_reg_rtx (V16QImode);
48358 op = gen_lowpart (V16QImode, d->op0);
48359 emit_insn (gen_ssse3_pshufbv16qi3 (l, op, vperm));
48360
48361 vperm = gen_rtx_CONST_VECTOR (V16QImode, gen_rtvec_v (16, rperm[1]));
48362 vperm = force_reg (V16QImode, vperm);
48363
48364 h = gen_reg_rtx (V16QImode);
48365 op = gen_lowpart (V16QImode, d->op1);
48366 emit_insn (gen_ssse3_pshufbv16qi3 (h, op, vperm));
48367
48368 op = d->target;
48369 if (d->vmode != V16QImode)
48370 op = gen_reg_rtx (V16QImode);
48371 emit_insn (gen_iorv16qi3 (op, l, h));
48372 if (op != d->target)
48373 emit_move_insn (d->target, gen_lowpart (d->vmode, op));
48374
48375 return true;
48376 }
48377
48378 /* Implement arbitrary permutation of one V32QImode or V16HImode operand
48379 with two vpshufb insns, vpermq and vpor. We should have already failed
48380 all two or three instruction sequences. */
48381
48382 static bool
48383 expand_vec_perm_vpshufb2_vpermq (struct expand_vec_perm_d *d)
48384 {
48385 rtx rperm[2][32], vperm, l, h, hp, op, m128;
48386 unsigned int i, nelt, eltsz;
48387
48388 if (!TARGET_AVX2
48389 || !d->one_operand_p
48390 || (d->vmode != V32QImode && d->vmode != V16HImode))
48391 return false;
48392
48393 if (d->testing_p)
48394 return true;
48395
48396 nelt = d->nelt;
48397 eltsz = GET_MODE_UNIT_SIZE (d->vmode);
48398
48399 /* Generate two permutation masks. If the required element is within
48400 the same lane, it is shuffled in. If the required element is from the
48401 other lane, force a zero by setting bit 7 in the permutation mask.
48402 In the other mask, an element is non-negative if it
48403 is requested from the other lane, but it is also moved to the other lane,
48404 so that the result of vpshufb can have the two V2TImode halves
48405 swapped. */
48406 m128 = GEN_INT (-128);
48407 for (i = 0; i < nelt; ++i)
48408 {
48409 unsigned j, e = d->perm[i] & (nelt / 2 - 1);
48410 unsigned which = ((d->perm[i] ^ i) & (nelt / 2)) * eltsz;
48411
48412 for (j = 0; j < eltsz; ++j)
48413 {
48414 rperm[!!which][(i * eltsz + j) ^ which] = GEN_INT (e * eltsz + j);
48415 rperm[!which][(i * eltsz + j) ^ (which ^ 16)] = m128;
48416 }
48417 }
48418
48419 vperm = gen_rtx_CONST_VECTOR (V32QImode, gen_rtvec_v (32, rperm[1]));
48420 vperm = force_reg (V32QImode, vperm);
48421
48422 h = gen_reg_rtx (V32QImode);
48423 op = gen_lowpart (V32QImode, d->op0);
48424 emit_insn (gen_avx2_pshufbv32qi3 (h, op, vperm));
48425
48426 /* Swap the 128-bit lanes of h into hp. */
48427 hp = gen_reg_rtx (V4DImode);
48428 op = gen_lowpart (V4DImode, h);
48429 emit_insn (gen_avx2_permv4di_1 (hp, op, const2_rtx, GEN_INT (3), const0_rtx,
48430 const1_rtx));
48431
48432 vperm = gen_rtx_CONST_VECTOR (V32QImode, gen_rtvec_v (32, rperm[0]));
48433 vperm = force_reg (V32QImode, vperm);
48434
48435 l = gen_reg_rtx (V32QImode);
48436 op = gen_lowpart (V32QImode, d->op0);
48437 emit_insn (gen_avx2_pshufbv32qi3 (l, op, vperm));
48438
48439 op = d->target;
48440 if (d->vmode != V32QImode)
48441 op = gen_reg_rtx (V32QImode);
48442 emit_insn (gen_iorv32qi3 (op, l, gen_lowpart (V32QImode, hp)));
48443 if (op != d->target)
48444 emit_move_insn (d->target, gen_lowpart (d->vmode, op));
48445
48446 return true;
48447 }
48448
48449 /* A subroutine of expand_vec_perm_even_odd_1. Implement extract-even
48450 and extract-odd permutations of two V32QImode or V16HImode operands
48451 with two vpshufb insns, vpor and vpermq. We should have already
48452 failed all two or three instruction sequences. */
48453
48454 static bool
48455 expand_vec_perm_vpshufb2_vpermq_even_odd (struct expand_vec_perm_d *d)
48456 {
48457 rtx rperm[2][32], vperm, l, h, ior, op, m128;
48458 unsigned int i, nelt, eltsz;
48459
48460 if (!TARGET_AVX2
48461 || d->one_operand_p
48462 || (d->vmode != V32QImode && d->vmode != V16HImode))
48463 return false;
48464
48465 for (i = 0; i < d->nelt; ++i)
48466 if ((d->perm[i] ^ (i * 2)) & (3 * d->nelt / 2))
48467 return false;
48468
48469 if (d->testing_p)
48470 return true;
48471
48472 nelt = d->nelt;
48473 eltsz = GET_MODE_UNIT_SIZE (d->vmode);
48474
48475 /* Generate two permutation masks. In the first permutation mask
48476 the first quarter will contain indexes for the first half
48477 of the op0, the second quarter will contain bit 7 set, third quarter
48478 will contain indexes for the second half of the op0 and the
48479 last quarter bit 7 set. In the second permutation mask
48480 the first quarter will contain bit 7 set, the second quarter
48481 indexes for the first half of the op1, the third quarter bit 7 set
48482 and last quarter indexes for the second half of the op1.
48483 I.e. the first mask e.g. for V32QImode extract even will be:
48484 0, 2, ..., 0xe, -128, ..., -128, 0, 2, ..., 0xe, -128, ..., -128
48485 (all values masked with 0xf except for -128) and second mask
48486 for extract even will be
48487 -128, ..., -128, 0, 2, ..., 0xe, -128, ..., -128, 0, 2, ..., 0xe. */
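/* The xorv of 24 used below additionally exchanges the second and third
   byte quarters of the masks (positions 8..15 and 16..23 in the V32QImode
   case), so the vpshufb/vpor result comes out with those quarters swapped;
   the final vpermq with selector { 0, 2, 1, 3 } swaps them back.  */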
48488 m128 = GEN_INT (-128);
48489 for (i = 0; i < nelt; ++i)
48490 {
48491 unsigned j, e = d->perm[i] & (nelt / 2 - 1);
48492 unsigned which = d->perm[i] >= nelt;
48493 unsigned xorv = (i >= nelt / 4 && i < 3 * nelt / 4) ? 24 : 0;
48494
48495 for (j = 0; j < eltsz; ++j)
48496 {
48497 rperm[which][(i * eltsz + j) ^ xorv] = GEN_INT (e * eltsz + j);
48498 rperm[1 - which][(i * eltsz + j) ^ xorv] = m128;
48499 }
48500 }
48501
48502 vperm = gen_rtx_CONST_VECTOR (V32QImode, gen_rtvec_v (32, rperm[0]));
48503 vperm = force_reg (V32QImode, vperm);
48504
48505 l = gen_reg_rtx (V32QImode);
48506 op = gen_lowpart (V32QImode, d->op0);
48507 emit_insn (gen_avx2_pshufbv32qi3 (l, op, vperm));
48508
48509 vperm = gen_rtx_CONST_VECTOR (V32QImode, gen_rtvec_v (32, rperm[1]));
48510 vperm = force_reg (V32QImode, vperm);
48511
48512 h = gen_reg_rtx (V32QImode);
48513 op = gen_lowpart (V32QImode, d->op1);
48514 emit_insn (gen_avx2_pshufbv32qi3 (h, op, vperm));
48515
48516 ior = gen_reg_rtx (V32QImode);
48517 emit_insn (gen_iorv32qi3 (ior, l, h));
48518
48519 /* Permute the V4DImode quarters using { 0, 2, 1, 3 } permutation. */
48520 op = gen_reg_rtx (V4DImode);
48521 ior = gen_lowpart (V4DImode, ior);
48522 emit_insn (gen_avx2_permv4di_1 (op, ior, const0_rtx, const2_rtx,
48523 const1_rtx, GEN_INT (3)));
48524 emit_move_insn (d->target, gen_lowpart (d->vmode, op));
48525
48526 return true;
48527 }
48528
48529 /* A subroutine of expand_vec_perm_even_odd_1. Implement extract-even
48530 and extract-odd permutations of two V16QI, V8HI, V16HI or V32QI operands
48531 with two "and" and "pack" or two "shift" and "pack" insns. We should
48532 have already failed all two instruction sequences. */
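/* As an illustration of the idea for V16QImode: view each operand as
   V8HImode.  For extract-even, AND every 16-bit word with 0x00ff so only
   the even byte survives; for extract-odd, shift every word right by 8 so
   the odd byte becomes the low byte.  A packuswb of the two intermediate
   vectors then yields the 16 requested bytes.  */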
48533
48534 static bool
48535 expand_vec_perm_even_odd_pack (struct expand_vec_perm_d *d)
48536 {
48537 rtx op, dop0, dop1, t, rperm[16];
48538 unsigned i, odd, c, s, nelt = d->nelt;
48539 bool end_perm = false;
48540 machine_mode half_mode;
48541 rtx (*gen_and) (rtx, rtx, rtx);
48542 rtx (*gen_pack) (rtx, rtx, rtx);
48543 rtx (*gen_shift) (rtx, rtx, rtx);
48544
48545 if (d->one_operand_p)
48546 return false;
48547
48548 switch (d->vmode)
48549 {
48550 case V8HImode:
48551 /* Required for "pack". */
48552 if (!TARGET_SSE4_1)
48553 return false;
48554 c = 0xffff;
48555 s = 16;
48556 half_mode = V4SImode;
48557 gen_and = gen_andv4si3;
48558 gen_pack = gen_sse4_1_packusdw;
48559 gen_shift = gen_lshrv4si3;
48560 break;
48561 case V16QImode:
48562 /* No check as all instructions are SSE2. */
48563 c = 0xff;
48564 s = 8;
48565 half_mode = V8HImode;
48566 gen_and = gen_andv8hi3;
48567 gen_pack = gen_sse2_packuswb;
48568 gen_shift = gen_lshrv8hi3;
48569 break;
48570 case V16HImode:
48571 if (!TARGET_AVX2)
48572 return false;
48573 c = 0xffff;
48574 s = 16;
48575 half_mode = V8SImode;
48576 gen_and = gen_andv8si3;
48577 gen_pack = gen_avx2_packusdw;
48578 gen_shift = gen_lshrv8si3;
48579 end_perm = true;
48580 break;
48581 case V32QImode:
48582 if (!TARGET_AVX2)
48583 return false;
48584 c = 0xff;
48585 s = 8;
48586 half_mode = V16HImode;
48587 gen_and = gen_andv16hi3;
48588 gen_pack = gen_avx2_packuswb;
48589 gen_shift = gen_lshrv16hi3;
48590 end_perm = true;
48591 break;
48592 default:
48593 /* Only V8HI, V16QI, V16HI and V32QI modes are more profitable than
48594 general shuffles. */
48595 return false;
48596 }
48597
48598 /* Check that permutation is even or odd. */
48599 odd = d->perm[0];
48600 if (odd > 1)
48601 return false;
48602
48603 for (i = 1; i < nelt; ++i)
48604 if (d->perm[i] != 2 * i + odd)
48605 return false;
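/* E.g. for V16QImode the only selectors handled here are
   { 0, 2, 4, ..., 30 } (extract-even, odd == 0) and
   { 1, 3, 5, ..., 31 } (extract-odd, odd == 1), indexing the 32 bytes
   of the two concatenated operands.  */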
48606
48607 if (d->testing_p)
48608 return true;
48609
48610 dop0 = gen_reg_rtx (half_mode);
48611 dop1 = gen_reg_rtx (half_mode);
48612 if (odd == 0)
48613 {
48614 for (i = 0; i < nelt / 2; i++)
48615 rperm[i] = GEN_INT (c);
48616 t = gen_rtx_CONST_VECTOR (half_mode, gen_rtvec_v (nelt / 2, rperm));
48617 t = force_reg (half_mode, t);
48618 emit_insn (gen_and (dop0, t, gen_lowpart (half_mode, d->op0)));
48619 emit_insn (gen_and (dop1, t, gen_lowpart (half_mode, d->op1)));
48620 }
48621 else
48622 {
48623 emit_insn (gen_shift (dop0,
48624 gen_lowpart (half_mode, d->op0),
48625 GEN_INT (s)));
48626 emit_insn (gen_shift (dop1,
48627 gen_lowpart (half_mode, d->op1),
48628 GEN_INT (s)));
48629 }
48630 /* In the AVX2 256-bit case we need to permute the pack result. */
48631 if (TARGET_AVX2 && end_perm)
48632 {
48633 op = gen_reg_rtx (d->vmode);
48634 t = gen_reg_rtx (V4DImode);
48635 emit_insn (gen_pack (op, dop0, dop1));
48636 emit_insn (gen_avx2_permv4di_1 (t,
48637 gen_lowpart (V4DImode, op),
48638 const0_rtx,
48639 const2_rtx,
48640 const1_rtx,
48641 GEN_INT (3)));
48642 emit_move_insn (d->target, gen_lowpart (d->vmode, t));
48643 }
48644 else
48645 emit_insn (gen_pack (d->target, dop0, dop1));
48646
48647 return true;
48648 }
48649
48650 /* A subroutine of expand_vec_perm_even_odd_1. Implement extract-even
48651 and extract-odd permutations of two V64QI operands
48652 with two "shifts", two "truncs" and one "concat" insns for "odd"
48653 and two "truncs" and one concat insn for "even."
48654 We should have already failed all two instruction sequences. */
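/* Concretely: for extract-even the inputs are reinterpreted as V32HImode
   and truncated back to V32QImode, keeping the low (even) byte of every
   word; for extract-odd each word is first shifted right by 8 so the odd
   byte becomes the low byte before the truncation.  The two V32QImode
   halves are then concatenated into the V64QImode result.  */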
48655
48656 static bool
48657 expand_vec_perm_even_odd_trunc (struct expand_vec_perm_d *d)
48658 {
48659 rtx t1, t2, t3, t4;
48660 unsigned i, odd, nelt = d->nelt;
48661
48662 if (!TARGET_AVX512BW
48663 || d->one_operand_p
48664 || d->vmode != V64QImode)
48665 return false;
48666
48667 /* Check that permutation is even or odd. */
48668 odd = d->perm[0];
48669 if (odd > 1)
48670 return false;
48671
48672 for (i = 1; i < nelt; ++i)
48673 if (d->perm[i] != 2 * i + odd)
48674 return false;
48675
48676 if (d->testing_p)
48677 return true;
48678
48679
48680 if (odd)
48681 {
48682 t1 = gen_reg_rtx (V32HImode);
48683 t2 = gen_reg_rtx (V32HImode);
48684 emit_insn (gen_lshrv32hi3 (t1,
48685 gen_lowpart (V32HImode, d->op0),
48686 GEN_INT (8)));
48687 emit_insn (gen_lshrv32hi3 (t2,
48688 gen_lowpart (V32HImode, d->op1),
48689 GEN_INT (8)));
48690 }
48691 else
48692 {
48693 t1 = gen_lowpart (V32HImode, d->op0);
48694 t2 = gen_lowpart (V32HImode, d->op1);
48695 }
48696
48697 t3 = gen_reg_rtx (V32QImode);
48698 t4 = gen_reg_rtx (V32QImode);
48699 emit_insn (gen_avx512bw_truncatev32hiv32qi2 (t3, t1));
48700 emit_insn (gen_avx512bw_truncatev32hiv32qi2 (t4, t2));
48701 emit_insn (gen_avx_vec_concatv64qi (d->target, t3, t4));
48702
48703 return true;
48704 }
48705
48706 /* A subroutine of ix86_expand_vec_perm_builtin_1. Implement extract-even
48707 and extract-odd permutations. */
48708
48709 static bool
48710 expand_vec_perm_even_odd_1 (struct expand_vec_perm_d *d, unsigned odd)
48711 {
48712 rtx t1, t2, t3, t4, t5;
48713
48714 switch (d->vmode)
48715 {
48716 case V4DFmode:
48717 if (d->testing_p)
48718 break;
48719 t1 = gen_reg_rtx (V4DFmode);
48720 t2 = gen_reg_rtx (V4DFmode);
48721
48722 /* Shuffle the lanes around into { 0 1 4 5 } and { 2 3 6 7 }. */
48723 emit_insn (gen_avx_vperm2f128v4df3 (t1, d->op0, d->op1, GEN_INT (0x20)));
48724 emit_insn (gen_avx_vperm2f128v4df3 (t2, d->op0, d->op1, GEN_INT (0x31)));
48725
48726 /* Now an unpck[lh]pd will produce the result required. */
48727 if (odd)
48728 t3 = gen_avx_unpckhpd256 (d->target, t1, t2);
48729 else
48730 t3 = gen_avx_unpcklpd256 (d->target, t1, t2);
48731 emit_insn (t3);
48732 break;
48733
48734 case V8SFmode:
48735 {
48736 int mask = odd ? 0xdd : 0x88;
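/* Each 2-bit field of the shufps immediate selects one element per
   128-bit lane: 0x88 (0b10001000) picks elements { 0, 2 } from each
   source, 0xdd (0b11011101) picks elements { 1, 3 }.  */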
48737
48738 if (d->testing_p)
48739 break;
48740 t1 = gen_reg_rtx (V8SFmode);
48741 t2 = gen_reg_rtx (V8SFmode);
48742 t3 = gen_reg_rtx (V8SFmode);
48743
48744 /* Shuffle within the 128-bit lanes to produce:
48745 { 0 2 8 a 4 6 c e } | { 1 3 9 b 5 7 d f }. */
48746 emit_insn (gen_avx_shufps256 (t1, d->op0, d->op1,
48747 GEN_INT (mask)));
48748
48749 /* Shuffle the lanes around to produce:
48750 { 4 6 c e 0 2 8 a } and { 5 7 d f 1 3 9 b }. */
48751 emit_insn (gen_avx_vperm2f128v8sf3 (t2, t1, t1,
48752 GEN_INT (0x3)));
48753
48754 /* Shuffle within the 128-bit lanes to produce:
48755 { 0 2 4 6 4 6 0 2 } | { 1 3 5 7 5 7 1 3 }. */
48756 emit_insn (gen_avx_shufps256 (t3, t1, t2, GEN_INT (0x44)));
48757
48758 /* Shuffle within the 128-bit lanes to produce:
48759 { 8 a c e c e 8 a } | { 9 b d f d f 9 b }. */
48760 emit_insn (gen_avx_shufps256 (t2, t1, t2, GEN_INT (0xee)));
48761
48762 /* Shuffle the lanes around to produce:
48763 { 0 2 4 6 8 a c e } | { 1 3 5 7 9 b d f }. */
48764 emit_insn (gen_avx_vperm2f128v8sf3 (d->target, t3, t2,
48765 GEN_INT (0x20)));
48766 }
48767 break;
48768
48769 case V2DFmode:
48770 case V4SFmode:
48771 case V2DImode:
48772 case V4SImode:
48773 /* These are always directly implementable by expand_vec_perm_1. */
48774 gcc_unreachable ();
48775
48776 case V8HImode:
48777 if (TARGET_SSE4_1)
48778 return expand_vec_perm_even_odd_pack (d);
48779 else if (TARGET_SSSE3 && !TARGET_SLOW_PSHUFB)
48780 return expand_vec_perm_pshufb2 (d);
48781 else
48782 {
48783 if (d->testing_p)
48784 break;
48785 /* We need 2*log2(N)-1 operations to achieve odd/even
48786 with interleave. */
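/* For V8HImode (N == 8) these are the five interleaves emitted below.  */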
48787 t1 = gen_reg_rtx (V8HImode);
48788 t2 = gen_reg_rtx (V8HImode);
48789 emit_insn (gen_vec_interleave_highv8hi (t1, d->op0, d->op1));
48790 emit_insn (gen_vec_interleave_lowv8hi (d->target, d->op0, d->op1));
48791 emit_insn (gen_vec_interleave_highv8hi (t2, d->target, t1));
48792 emit_insn (gen_vec_interleave_lowv8hi (d->target, d->target, t1));
48793 if (odd)
48794 t3 = gen_vec_interleave_highv8hi (d->target, d->target, t2);
48795 else
48796 t3 = gen_vec_interleave_lowv8hi (d->target, d->target, t2);
48797 emit_insn (t3);
48798 }
48799 break;
48800
48801 case V16QImode:
48802 return expand_vec_perm_even_odd_pack (d);
48803
48804 case V16HImode:
48805 case V32QImode:
48806 return expand_vec_perm_even_odd_pack (d);
48807
48808 case V64QImode:
48809 return expand_vec_perm_even_odd_trunc (d);
48810
48811 case V4DImode:
48812 if (!TARGET_AVX2)
48813 {
48814 struct expand_vec_perm_d d_copy = *d;
48815 d_copy.vmode = V4DFmode;
48816 if (d->testing_p)
48817 d_copy.target = gen_raw_REG (V4DFmode, LAST_VIRTUAL_REGISTER + 1);
48818 else
48819 d_copy.target = gen_reg_rtx (V4DFmode);
48820 d_copy.op0 = gen_lowpart (V4DFmode, d->op0);
48821 d_copy.op1 = gen_lowpart (V4DFmode, d->op1);
48822 if (expand_vec_perm_even_odd_1 (&d_copy, odd))
48823 {
48824 if (!d->testing_p)
48825 emit_move_insn (d->target,
48826 gen_lowpart (V4DImode, d_copy.target));
48827 return true;
48828 }
48829 return false;
48830 }
48831
48832 if (d->testing_p)
48833 break;
48834
48835 t1 = gen_reg_rtx (V4DImode);
48836 t2 = gen_reg_rtx (V4DImode);
48837
48838 /* Shuffle the lanes around into { 0 1 4 5 } and { 2 3 6 7 }. */
48839 emit_insn (gen_avx2_permv2ti (t1, d->op0, d->op1, GEN_INT (0x20)));
48840 emit_insn (gen_avx2_permv2ti (t2, d->op0, d->op1, GEN_INT (0x31)));
48841
48842 /* Now a vpunpck[lh]qdq will produce the result required. */
48843 if (odd)
48844 t3 = gen_avx2_interleave_highv4di (d->target, t1, t2);
48845 else
48846 t3 = gen_avx2_interleave_lowv4di (d->target, t1, t2);
48847 emit_insn (t3);
48848 break;
48849
48850 case V8SImode:
48851 if (!TARGET_AVX2)
48852 {
48853 struct expand_vec_perm_d d_copy = *d;
48854 d_copy.vmode = V8SFmode;
48855 if (d->testing_p)
48856 d_copy.target = gen_raw_REG (V8SFmode, LAST_VIRTUAL_REGISTER + 1);
48857 else
48858 d_copy.target = gen_reg_rtx (V8SFmode);
48859 d_copy.op0 = gen_lowpart (V8SFmode, d->op0);
48860 d_copy.op1 = gen_lowpart (V8SFmode, d->op1);
48861 if (expand_vec_perm_even_odd_1 (&d_copy, odd))
48862 {
48863 if (!d->testing_p)
48864 emit_move_insn (d->target,
48865 gen_lowpart (V8SImode, d_copy.target));
48866 return true;
48867 }
48868 return false;
48869 }
48870
48871 if (d->testing_p)
48872 break;
48873
48874 t1 = gen_reg_rtx (V8SImode);
48875 t2 = gen_reg_rtx (V8SImode);
48876 t3 = gen_reg_rtx (V4DImode);
48877 t4 = gen_reg_rtx (V4DImode);
48878 t5 = gen_reg_rtx (V4DImode);
48879
48880 /* Shuffle the lanes around into
48881 { 0 1 2 3 8 9 a b } and { 4 5 6 7 c d e f }. */
48882 emit_insn (gen_avx2_permv2ti (t3, gen_lowpart (V4DImode, d->op0),
48883 gen_lowpart (V4DImode, d->op1),
48884 GEN_INT (0x20)));
48885 emit_insn (gen_avx2_permv2ti (t4, gen_lowpart (V4DImode, d->op0),
48886 gen_lowpart (V4DImode, d->op1),
48887 GEN_INT (0x31)));
48888
48889 /* Swap the 2nd and 3rd position in each lane into
48890 { 0 2 1 3 8 a 9 b } and { 4 6 5 7 c e d f }. */
48891 emit_insn (gen_avx2_pshufdv3 (t1, gen_lowpart (V8SImode, t3),
48892 GEN_INT (2 * 4 + 1 * 16 + 3 * 64)));
48893 emit_insn (gen_avx2_pshufdv3 (t2, gen_lowpart (V8SImode, t4),
48894 GEN_INT (2 * 4 + 1 * 16 + 3 * 64)));
48895
48896 /* Now a vpunpck[lh]qdq will produce
48897 { 0 2 4 6 8 a c e } resp. { 1 3 5 7 9 b d f }. */
48898 if (odd)
48899 t3 = gen_avx2_interleave_highv4di (t5, gen_lowpart (V4DImode, t1),
48900 gen_lowpart (V4DImode, t2));
48901 else
48902 t3 = gen_avx2_interleave_lowv4di (t5, gen_lowpart (V4DImode, t1),
48903 gen_lowpart (V4DImode, t2));
48904 emit_insn (t3);
48905 emit_move_insn (d->target, gen_lowpart (V8SImode, t5));
48906 break;
48907
48908 default:
48909 gcc_unreachable ();
48910 }
48911
48912 return true;
48913 }
48914
48915 /* A subroutine of ix86_expand_vec_perm_builtin_1. Pattern match
48916 extract-even and extract-odd permutations. */
48917
48918 static bool
48919 expand_vec_perm_even_odd (struct expand_vec_perm_d *d)
48920 {
48921 unsigned i, odd, nelt = d->nelt;
48922
48923 odd = d->perm[0];
48924 if (odd != 0 && odd != 1)
48925 return false;
48926
48927 for (i = 1; i < nelt; ++i)
48928 if (d->perm[i] != 2 * i + odd)
48929 return false;
48930
48931 return expand_vec_perm_even_odd_1 (d, odd);
48932 }
48933
48934 /* A subroutine of ix86_expand_vec_perm_builtin_1. Implement broadcast
48935 permutations. We assume that expand_vec_perm_1 has already failed. */
48936
48937 static bool
48938 expand_vec_perm_broadcast_1 (struct expand_vec_perm_d *d)
48939 {
48940 unsigned elt = d->perm[0], nelt2 = d->nelt / 2;
48941 machine_mode vmode = d->vmode;
48942 unsigned char perm2[4];
48943 rtx op0 = d->op0, dest;
48944 bool ok;
48945
48946 switch (vmode)
48947 {
48948 case V4DFmode:
48949 case V8SFmode:
48950 /* These are special-cased in sse.md so that we can optionally
48951 use the vbroadcast instruction. They expand to two insns
48952 if the input happens to be in a register. */
48953 gcc_unreachable ();
48954
48955 case V2DFmode:
48956 case V2DImode:
48957 case V4SFmode:
48958 case V4SImode:
48959 /* These are always implementable using standard shuffle patterns. */
48960 gcc_unreachable ();
48961
48962 case V8HImode:
48963 case V16QImode:
48964 /* These can be implemented via interleave. We save one insn by
48965 stopping once we have promoted to V4SImode and then use pshufd. */
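/* E.g. broadcasting element 5 of a V16QImode vector: the low byte
   interleave duplicates it into halfword 5, the high halfword interleave
   of the V8HImode view moves it into word 1, and the final pshufd with
   { 1, 1, 1, 1 } replicates word 1 across the vector.  */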
48966 if (d->testing_p)
48967 return true;
48968 do
48969 {
48970 rtx dest;
48971 rtx (*gen) (rtx, rtx, rtx)
48972 = vmode == V16QImode ? gen_vec_interleave_lowv16qi
48973 : gen_vec_interleave_lowv8hi;
48974
48975 if (elt >= nelt2)
48976 {
48977 gen = vmode == V16QImode ? gen_vec_interleave_highv16qi
48978 : gen_vec_interleave_highv8hi;
48979 elt -= nelt2;
48980 }
48981 nelt2 /= 2;
48982
48983 dest = gen_reg_rtx (vmode);
48984 emit_insn (gen (dest, op0, op0));
48985 vmode = get_mode_wider_vector (vmode);
48986 op0 = gen_lowpart (vmode, dest);
48987 }
48988 while (vmode != V4SImode);
48989
48990 memset (perm2, elt, 4);
48991 dest = gen_reg_rtx (V4SImode);
48992 ok = expand_vselect (dest, op0, perm2, 4, d->testing_p);
48993 gcc_assert (ok);
48994 if (!d->testing_p)
48995 emit_move_insn (d->target, gen_lowpart (d->vmode, dest));
48996 return true;
48997
48998 case V64QImode:
48999 case V32QImode:
49000 case V16HImode:
49001 case V8SImode:
49002 case V4DImode:
49003 /* For AVX2 broadcasts of the first element vpbroadcast* or
49004 vpermq should be used by expand_vec_perm_1. */
49005 gcc_assert (!TARGET_AVX2 || d->perm[0]);
49006 return false;
49007
49008 default:
49009 gcc_unreachable ();
49010 }
49011 }
49012
49013 /* A subroutine of ix86_expand_vec_perm_builtin_1. Pattern match
49014 broadcast permutations. */
49015
49016 static bool
49017 expand_vec_perm_broadcast (struct expand_vec_perm_d *d)
49018 {
49019 unsigned i, elt, nelt = d->nelt;
49020
49021 if (!d->one_operand_p)
49022 return false;
49023
49024 elt = d->perm[0];
49025 for (i = 1; i < nelt; ++i)
49026 if (d->perm[i] != elt)
49027 return false;
49028
49029 return expand_vec_perm_broadcast_1 (d);
49030 }
49031
49032 /* Implement arbitrary permutations of two V64QImode operands
49033 with 2 vpermi2w, 2 vpshufb and one vpor instruction. */
49034 static bool
49035 expand_vec_perm_vpermi2_vpshub2 (struct expand_vec_perm_d *d)
49036 {
49037 if (!TARGET_AVX512BW || !(d->vmode == V64QImode))
49038 return false;
49039
49040 if (d->testing_p)
49041 return true;
49042
49043 struct expand_vec_perm_d ds[2];
49044 rtx rperm[128], vperm, target0, target1;
49045 unsigned int i, nelt;
49046 machine_mode vmode;
49047
49048 nelt = d->nelt;
49049 vmode = V64QImode;
49050
49051 for (i = 0; i < 2; i++)
49052 {
49053 ds[i] = *d;
49054 ds[i].vmode = V32HImode;
49055 ds[i].nelt = 32;
49056 ds[i].target = gen_reg_rtx (V32HImode);
49057 ds[i].op0 = gen_lowpart (V32HImode, d->op0);
49058 ds[i].op1 = gen_lowpart (V32HImode, d->op1);
49059 }
49060
49061 /* Prepare permutations such that the first one takes care of
49062 putting the even bytes into the right positions or one higher
49063 positions (ds[0]) and the second one takes care of
49064 putting the odd bytes into the right positions or one below
49065 (ds[1]). */
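/* In other words, ds[0] (resp. ds[1]) is a V32HImode permutation moving,
   for every even (resp. odd) destination byte i, the 16-bit word that
   contains source byte d->perm[i] into destination word i / 2.  The two
   vpshufb masks built in rperm[] then select the low or high byte of that
   word (d->perm[i] & 1), zero the bytes of the other parity, and the final
   vpor merges the two results.  */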
49066
49067 for (i = 0; i < nelt; i++)
49068 {
49069 ds[i & 1].perm[i / 2] = d->perm[i] / 2;
49070 if (i & 1)
49071 {
49072 rperm[i] = constm1_rtx;
49073 rperm[i + 64] = GEN_INT ((i & 14) + (d->perm[i] & 1));
49074 }
49075 else
49076 {
49077 rperm[i] = GEN_INT ((i & 14) + (d->perm[i] & 1));
49078 rperm[i + 64] = constm1_rtx;
49079 }
49080 }
49081
49082 bool ok = expand_vec_perm_1 (&ds[0]);
49083 gcc_assert (ok);
49084 ds[0].target = gen_lowpart (V64QImode, ds[0].target);
49085
49086 ok = expand_vec_perm_1 (&ds[1]);
49087 gcc_assert (ok);
49088 ds[1].target = gen_lowpart (V64QImode, ds[1].target);
49089
49090 vperm = gen_rtx_CONST_VECTOR (V64QImode, gen_rtvec_v (64, rperm));
49091 vperm = force_reg (vmode, vperm);
49092 target0 = gen_reg_rtx (V64QImode);
49093 emit_insn (gen_avx512bw_pshufbv64qi3 (target0, ds[0].target, vperm));
49094
49095 vperm = gen_rtx_CONST_VECTOR (V64QImode, gen_rtvec_v (64, rperm + 64));
49096 vperm = force_reg (vmode, vperm);
49097 target1 = gen_reg_rtx (V64QImode);
49098 emit_insn (gen_avx512bw_pshufbv64qi3 (target1, ds[1].target, vperm));
49099
49100 emit_insn (gen_iorv64qi3 (d->target, target0, target1));
49101 return true;
49102 }
49103
49104 /* Implement arbitrary permutation of two V32QImode and V16QImode operands
49105 with 4 vpshufb insns, 2 vpermq and 3 vpor. We should have already failed
49106 all the shorter instruction sequences. */
49107
49108 static bool
49109 expand_vec_perm_vpshufb4_vpermq2 (struct expand_vec_perm_d *d)
49110 {
49111 rtx rperm[4][32], vperm, l[2], h[2], op, m128;
49112 unsigned int i, nelt, eltsz;
49113 bool used[4];
49114
49115 if (!TARGET_AVX2
49116 || d->one_operand_p
49117 || (d->vmode != V32QImode && d->vmode != V16HImode))
49118 return false;
49119
49120 if (d->testing_p)
49121 return true;
49122
49123 nelt = d->nelt;
49124 eltsz = GET_MODE_UNIT_SIZE (d->vmode);
49125
49126 /* Generate 4 permutation masks. If the required element is within
49127 the same lane, it is shuffled in. If the required element is from the
49128 other lane, force a zero by setting bit 7 in the permutation mask.
49129 In the other mask the elements are non-negative where the element
49130 is requested from the other lane, but they are also moved to the other lane,
49131 so that the result of vpshufb can have the two V2TImode halves
49132 swapped. */
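/* The mask index computed below encodes the source operand in bit 1
   (d->perm[i] >= nelt selects d->op1) and a lane crossing in bit 0, so
   rperm[0]/rperm[1] are the in-lane/cross-lane masks for op0 and
   rperm[2]/rperm[3] the same for op1; only the cross-lane results need
   the vpermq lane swap afterwards.  */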
49133 m128 = GEN_INT (-128);
49134 for (i = 0; i < 32; ++i)
49135 {
49136 rperm[0][i] = m128;
49137 rperm[1][i] = m128;
49138 rperm[2][i] = m128;
49139 rperm[3][i] = m128;
49140 }
49141 used[0] = false;
49142 used[1] = false;
49143 used[2] = false;
49144 used[3] = false;
49145 for (i = 0; i < nelt; ++i)
49146 {
49147 unsigned j, e = d->perm[i] & (nelt / 2 - 1);
49148 unsigned xlane = ((d->perm[i] ^ i) & (nelt / 2)) * eltsz;
49149 unsigned int which = ((d->perm[i] & nelt) ? 2 : 0) + (xlane ? 1 : 0);
49150
49151 for (j = 0; j < eltsz; ++j)
49152 rperm[which][(i * eltsz + j) ^ xlane] = GEN_INT (e * eltsz + j);
49153 used[which] = true;
49154 }
49155
49156 for (i = 0; i < 2; ++i)
49157 {
49158 if (!used[2 * i + 1])
49159 {
49160 h[i] = NULL_RTX;
49161 continue;
49162 }
49163 vperm = gen_rtx_CONST_VECTOR (V32QImode,
49164 gen_rtvec_v (32, rperm[2 * i + 1]));
49165 vperm = force_reg (V32QImode, vperm);
49166 h[i] = gen_reg_rtx (V32QImode);
49167 op = gen_lowpart (V32QImode, i ? d->op1 : d->op0);
49168 emit_insn (gen_avx2_pshufbv32qi3 (h[i], op, vperm));
49169 }
49170
49171 /* Swap the 128-bit lanes of h[X]. */
49172 for (i = 0; i < 2; ++i)
49173 {
49174 if (h[i] == NULL_RTX)
49175 continue;
49176 op = gen_reg_rtx (V4DImode);
49177 emit_insn (gen_avx2_permv4di_1 (op, gen_lowpart (V4DImode, h[i]),
49178 const2_rtx, GEN_INT (3), const0_rtx,
49179 const1_rtx));
49180 h[i] = gen_lowpart (V32QImode, op);
49181 }
49182
49183 for (i = 0; i < 2; ++i)
49184 {
49185 if (!used[2 * i])
49186 {
49187 l[i] = NULL_RTX;
49188 continue;
49189 }
49190 vperm = gen_rtx_CONST_VECTOR (V32QImode, gen_rtvec_v (32, rperm[2 * i]));
49191 vperm = force_reg (V32QImode, vperm);
49192 l[i] = gen_reg_rtx (V32QImode);
49193 op = gen_lowpart (V32QImode, i ? d->op1 : d->op0);
49194 emit_insn (gen_avx2_pshufbv32qi3 (l[i], op, vperm));
49195 }
49196
49197 for (i = 0; i < 2; ++i)
49198 {
49199 if (h[i] && l[i])
49200 {
49201 op = gen_reg_rtx (V32QImode);
49202 emit_insn (gen_iorv32qi3 (op, l[i], h[i]));
49203 l[i] = op;
49204 }
49205 else if (h[i])
49206 l[i] = h[i];
49207 }
49208
49209 gcc_assert (l[0] && l[1]);
49210 op = d->target;
49211 if (d->vmode != V32QImode)
49212 op = gen_reg_rtx (V32QImode);
49213 emit_insn (gen_iorv32qi3 (op, l[0], l[1]));
49214 if (op != d->target)
49215 emit_move_insn (d->target, gen_lowpart (d->vmode, op));
49216 return true;
49217 }
49218
49219 /* The guts of ix86_expand_vec_perm_const, also used by the ok hook.
49220 With all of the interface bits taken care of, perform the expansion
49221 in D and return true on success. */
49222
49223 static bool
49224 ix86_expand_vec_perm_const_1 (struct expand_vec_perm_d *d)
49225 {
49226 /* Try a single instruction expansion. */
49227 if (expand_vec_perm_1 (d))
49228 return true;
49229
49230 /* Try sequences of two instructions. */
49231
49232 if (expand_vec_perm_pshuflw_pshufhw (d))
49233 return true;
49234
49235 if (expand_vec_perm_palignr (d, false))
49236 return true;
49237
49238 if (expand_vec_perm_interleave2 (d))
49239 return true;
49240
49241 if (expand_vec_perm_broadcast (d))
49242 return true;
49243
49244 if (expand_vec_perm_vpermq_perm_1 (d))
49245 return true;
49246
49247 if (expand_vec_perm_vperm2f128 (d))
49248 return true;
49249
49250 if (expand_vec_perm_pblendv (d))
49251 return true;
49252
49253 /* Try sequences of three instructions. */
49254
49255 if (expand_vec_perm_even_odd_pack (d))
49256 return true;
49257
49258 if (expand_vec_perm_2vperm2f128_vshuf (d))
49259 return true;
49260
49261 if (expand_vec_perm_pshufb2 (d))
49262 return true;
49263
49264 if (expand_vec_perm_interleave3 (d))
49265 return true;
49266
49267 if (expand_vec_perm_vperm2f128_vblend (d))
49268 return true;
49269
49270 /* Try sequences of four instructions. */
49271
49272 if (expand_vec_perm_even_odd_trunc (d))
49273 return true;
49274 if (expand_vec_perm_vpshufb2_vpermq (d))
49275 return true;
49276
49277 if (expand_vec_perm_vpshufb2_vpermq_even_odd (d))
49278 return true;
49279
49280 if (expand_vec_perm_vpermi2_vpshub2 (d))
49281 return true;
49282
49283 /* ??? Look for narrow permutations whose element orderings would
49284 allow the promotion to a wider mode. */
49285
49286 /* ??? Look for sequences of interleave or a wider permute that place
49287 the data into the correct lanes for a half-vector shuffle like
49288 pshuf[lh]w or vpermilps. */
49289
49290 /* ??? Look for sequences of interleave that produce the desired results.
49291 The combinatorics of punpck[lh] get pretty ugly... */
49292
49293 if (expand_vec_perm_even_odd (d))
49294 return true;
49295
49296 /* Even longer sequences. */
49297 if (expand_vec_perm_vpshufb4_vpermq2 (d))
49298 return true;
49299
49300 /* See if we can get the same permutation in different vector integer
49301 mode. */
49302 struct expand_vec_perm_d nd;
49303 if (canonicalize_vector_int_perm (d, &nd) && expand_vec_perm_1 (&nd))
49304 {
49305 if (!d->testing_p)
49306 emit_move_insn (d->target, gen_lowpart (d->vmode, nd.target));
49307 return true;
49308 }
49309
49310 return false;
49311 }
49312
49313 /* If a permutation only uses one operand, make it clear. Returns true
49314 if the permutation references both operands. */
49315
49316 static bool
49317 canonicalize_perm (struct expand_vec_perm_d *d)
49318 {
49319 int i, which, nelt = d->nelt;
49320
49321 for (i = which = 0; i < nelt; ++i)
49322 which |= (d->perm[i] < nelt ? 1 : 2);
49323
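/* E.g. with nelt == 4, perm == { 4, 5, 6, 7 } references only the second
   operand (which == 2); case 2 below masks the indices down to
   { 0, 1, 2, 3 } and folds op1 into op0.  */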
49324 d->one_operand_p = true;
49325 switch (which)
49326 {
49327 default:
49328 gcc_unreachable ();
49329
49330 case 3:
49331 if (!rtx_equal_p (d->op0, d->op1))
49332 {
49333 d->one_operand_p = false;
49334 break;
49335 }
49336 /* The elements of PERM do not suggest that only the first operand
49337 is used, but both operands are identical. Allow easier matching
49338 of the permutation by folding the permutation into the single
49339 input vector. */
49340 /* FALLTHRU */
49341
49342 case 2:
49343 for (i = 0; i < nelt; ++i)
49344 d->perm[i] &= nelt - 1;
49345 d->op0 = d->op1;
49346 break;
49347
49348 case 1:
49349 d->op1 = d->op0;
49350 break;
49351 }
49352
49353 return (which == 3);
49354 }
49355
49356 bool
49357 ix86_expand_vec_perm_const (rtx operands[4])
49358 {
49359 struct expand_vec_perm_d d;
49360 unsigned char perm[MAX_VECT_LEN];
49361 int i, nelt;
49362 bool two_args;
49363 rtx sel;
49364
49365 d.target = operands[0];
49366 d.op0 = operands[1];
49367 d.op1 = operands[2];
49368 sel = operands[3];
49369
49370 d.vmode = GET_MODE (d.target);
49371 gcc_assert (VECTOR_MODE_P (d.vmode));
49372 d.nelt = nelt = GET_MODE_NUNITS (d.vmode);
49373 d.testing_p = false;
49374
49375 gcc_assert (GET_CODE (sel) == CONST_VECTOR);
49376 gcc_assert (XVECLEN (sel, 0) == nelt);
49377 gcc_checking_assert (sizeof (d.perm) == sizeof (perm));
49378
49379 for (i = 0; i < nelt; ++i)
49380 {
49381 rtx e = XVECEXP (sel, 0, i);
49382 int ei = INTVAL (e) & (2 * nelt - 1);
49383 d.perm[i] = ei;
49384 perm[i] = ei;
49385 }
49386
49387 two_args = canonicalize_perm (&d);
49388
49389 if (ix86_expand_vec_perm_const_1 (&d))
49390 return true;
49391
49392 /* If the selector says both arguments are needed, but the operands are the
49393 same, the above tried to expand with one_operand_p and flattened selector.
49394 If that didn't work, retry without one_operand_p; we succeeded with that
49395 during testing. */
49396 if (two_args && d.one_operand_p)
49397 {
49398 d.one_operand_p = false;
49399 memcpy (d.perm, perm, sizeof (perm));
49400 return ix86_expand_vec_perm_const_1 (&d);
49401 }
49402
49403 return false;
49404 }
49405
49406 /* Implement targetm.vectorize.vec_perm_const_ok. */
49407
49408 static bool
49409 ix86_vectorize_vec_perm_const_ok (machine_mode vmode,
49410 const unsigned char *sel)
49411 {
49412 struct expand_vec_perm_d d;
49413 unsigned int i, nelt, which;
49414 bool ret;
49415
49416 d.vmode = vmode;
49417 d.nelt = nelt = GET_MODE_NUNITS (d.vmode);
49418 d.testing_p = true;
49419
49420 /* Given sufficient ISA support we can just return true here
49421 for selected vector modes. */
49422 switch (d.vmode)
49423 {
49424 case V16SFmode:
49425 case V16SImode:
49426 case V8DImode:
49427 case V8DFmode:
49428 if (TARGET_AVX512F)
49429 /* All implementable with a single vpermi2 insn. */
49430 return true;
49431 break;
49432 case V32HImode:
49433 if (TARGET_AVX512BW)
49434 /* All implementable with a single vpermi2 insn. */
49435 return true;
49436 break;
49437 case V64QImode:
49438 if (TARGET_AVX512BW)
49439 /* Implementable with 2 vpermi2, 2 vpshufb and 1 or insn. */
49440 return true;
49441 break;
49442 case V8SImode:
49443 case V8SFmode:
49444 case V4DFmode:
49445 case V4DImode:
49446 if (TARGET_AVX512VL)
49447 /* All implementable with a single vpermi2 insn. */
49448 return true;
49449 break;
49450 case V16HImode:
49451 if (TARGET_AVX2)
49452 /* Implementable with 4 vpshufb insns, 2 vpermq and 3 vpor insns. */
49453 return true;
49454 break;
49455 case V32QImode:
49456 if (TARGET_AVX2)
49457 /* Implementable with 4 vpshufb insns, 2 vpermq and 3 vpor insns. */
49458 return true;
49459 break;
49460 case V4SImode:
49461 case V4SFmode:
49462 case V8HImode:
49463 case V16QImode:
49464 /* All implementable with a single vpperm insn. */
49465 if (TARGET_XOP)
49466 return true;
49467 /* All implementable with 2 pshufb + 1 ior. */
49468 if (TARGET_SSSE3)
49469 return true;
49470 break;
49471 case V2DImode:
49472 case V2DFmode:
49473 /* All implementable with shufpd or unpck[lh]pd. */
49474 return true;
49475 default:
49476 return false;
49477 }
49478
49479 /* Extract the values from the vector CST into the permutation
49480 array in D. */
49481 memcpy (d.perm, sel, nelt);
49482 for (i = which = 0; i < nelt; ++i)
49483 {
49484 unsigned char e = d.perm[i];
49485 gcc_assert (e < 2 * nelt);
49486 which |= (e < nelt ? 1 : 2);
49487 }
49488
49489 /* For all elements from the second vector, fold the elements to the first. */
49490 if (which == 2)
49491 for (i = 0; i < nelt; ++i)
49492 d.perm[i] -= nelt;
49493
49494 /* Check whether the mask can be applied to the vector type. */
49495 d.one_operand_p = (which != 3);
49496
49497 /* Implementable with shufps or pshufd. */
49498 if (d.one_operand_p && (d.vmode == V4SFmode || d.vmode == V4SImode))
49499 return true;
49500
49501 /* Otherwise we have to go through the motions and see if we can
49502 figure out how to generate the requested permutation. */
49503 d.target = gen_raw_REG (d.vmode, LAST_VIRTUAL_REGISTER + 1);
49504 d.op1 = d.op0 = gen_raw_REG (d.vmode, LAST_VIRTUAL_REGISTER + 2);
49505 if (!d.one_operand_p)
49506 d.op1 = gen_raw_REG (d.vmode, LAST_VIRTUAL_REGISTER + 3);
49507
49508 start_sequence ();
49509 ret = ix86_expand_vec_perm_const_1 (&d);
49510 end_sequence ();
49511
49512 return ret;
49513 }
49514
49515 void
49516 ix86_expand_vec_extract_even_odd (rtx targ, rtx op0, rtx op1, unsigned odd)
49517 {
49518 struct expand_vec_perm_d d;
49519 unsigned i, nelt;
49520
49521 d.target = targ;
49522 d.op0 = op0;
49523 d.op1 = op1;
49524 d.vmode = GET_MODE (targ);
49525 d.nelt = nelt = GET_MODE_NUNITS (d.vmode);
49526 d.one_operand_p = false;
49527 d.testing_p = false;
49528
49529 for (i = 0; i < nelt; ++i)
49530 d.perm[i] = i * 2 + odd;
49531
49532 /* We'll either be able to implement the permutation directly... */
49533 if (expand_vec_perm_1 (&d))
49534 return;
49535
49536 /* ... or we use the special-case patterns. */
49537 expand_vec_perm_even_odd_1 (&d, odd);
49538 }
49539
49540 static void
49541 ix86_expand_vec_interleave (rtx targ, rtx op0, rtx op1, bool high_p)
49542 {
49543 struct expand_vec_perm_d d;
49544 unsigned i, nelt, base;
49545 bool ok;
49546
49547 d.target = targ;
49548 d.op0 = op0;
49549 d.op1 = op1;
49550 d.vmode = GET_MODE (targ);
49551 d.nelt = nelt = GET_MODE_NUNITS (d.vmode);
49552 d.one_operand_p = false;
49553 d.testing_p = false;
49554
49555 base = high_p ? nelt / 2 : 0;
49556 for (i = 0; i < nelt / 2; ++i)
49557 {
49558 d.perm[i * 2] = i + base;
49559 d.perm[i * 2 + 1] = i + base + nelt;
49560 }
49561
49562 /* Note that for AVX this isn't one instruction. */
49563 ok = ix86_expand_vec_perm_const_1 (&d);
49564 gcc_assert (ok);
49565 }
49566
49567
49568 /* Expand a vector operation CODE for a V*QImode in terms of the
49569 same operation on V*HImode. */
49570
49571 void
49572 ix86_expand_vecop_qihi (enum rtx_code code, rtx dest, rtx op1, rtx op2)
49573 {
49574 machine_mode qimode = GET_MODE (dest);
49575 machine_mode himode;
49576 rtx (*gen_il) (rtx, rtx, rtx);
49577 rtx (*gen_ih) (rtx, rtx, rtx);
49578 rtx op1_l, op1_h, op2_l, op2_h, res_l, res_h;
49579 struct expand_vec_perm_d d;
49580 bool ok, full_interleave;
49581 bool uns_p = false;
49582 int i;
49583
49584 switch (qimode)
49585 {
49586 case V16QImode:
49587 himode = V8HImode;
49588 gen_il = gen_vec_interleave_lowv16qi;
49589 gen_ih = gen_vec_interleave_highv16qi;
49590 break;
49591 case V32QImode:
49592 himode = V16HImode;
49593 gen_il = gen_avx2_interleave_lowv32qi;
49594 gen_ih = gen_avx2_interleave_highv32qi;
49595 break;
49596 case V64QImode:
49597 himode = V32HImode;
49598 gen_il = gen_avx512bw_interleave_lowv64qi;
49599 gen_ih = gen_avx512bw_interleave_highv64qi;
49600 break;
49601 default:
49602 gcc_unreachable ();
49603 }
49604
49605 op2_l = op2_h = op2;
49606 switch (code)
49607 {
49608 case MULT:
49609 /* Unpack data such that we've got a source byte in each low byte of
49610 each word. We don't care what goes into the high byte of each word.
49611 Rather than trying to get zero in there, most convenient is to let
49612 it be a copy of the low byte. */
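/* Copying is safe because only the low byte of each 16-bit product is kept
   when the halves are merged back below, and that byte depends only on the
   low bytes of the factors.  */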
49613 op2_l = gen_reg_rtx (qimode);
49614 op2_h = gen_reg_rtx (qimode);
49615 emit_insn (gen_il (op2_l, op2, op2));
49616 emit_insn (gen_ih (op2_h, op2, op2));
49617 /* FALLTHRU */
49618
49619 op1_l = gen_reg_rtx (qimode);
49620 op1_h = gen_reg_rtx (qimode);
49621 emit_insn (gen_il (op1_l, op1, op1));
49622 emit_insn (gen_ih (op1_h, op1, op1));
49623 full_interleave = qimode == V16QImode;
49624 break;
49625
49626 case ASHIFT:
49627 case LSHIFTRT:
49628 uns_p = true;
49629 /* FALLTHRU */
49630 case ASHIFTRT:
49631 op1_l = gen_reg_rtx (himode);
49632 op1_h = gen_reg_rtx (himode);
49633 ix86_expand_sse_unpack (op1_l, op1, uns_p, false);
49634 ix86_expand_sse_unpack (op1_h, op1, uns_p, true);
49635 full_interleave = true;
49636 break;
49637 default:
49638 gcc_unreachable ();
49639 }
49640
49641 /* Perform the operation. */
49642 res_l = expand_simple_binop (himode, code, op1_l, op2_l, NULL_RTX,
49643 1, OPTAB_DIRECT);
49644 res_h = expand_simple_binop (himode, code, op1_h, op2_h, NULL_RTX,
49645 1, OPTAB_DIRECT);
49646 gcc_assert (res_l && res_h);
49647
49648 /* Merge the data back into the right place. */
49649 d.target = dest;
49650 d.op0 = gen_lowpart (qimode, res_l);
49651 d.op1 = gen_lowpart (qimode, res_h);
49652 d.vmode = qimode;
49653 d.nelt = GET_MODE_NUNITS (qimode);
49654 d.one_operand_p = false;
49655 d.testing_p = false;
49656
49657 if (full_interleave)
49658 {
49659 /* For SSE2, we used a full interleave, so the desired
49660 results are in the even elements. */
49661 for (i = 0; i < d.nelt; ++i)
49662 d.perm[i] = i * 2;
49663 }
49664 else
49665 {
49666 /* For AVX, the interleave used above was not cross-lane. So the
49667 extraction is evens but with the second and third quarters swapped.
49668 Happily, that is even one insn shorter than even extraction.
49669 For AVX512BW we have 4 lanes. We extract evens from within a lane,
49670 always first from the first and then from the second source operand,
49671 the index bits above the low 4 bits remain the same.
49672 Thus, for d.nelt == 32 we want permutation
49673 0,2,4,..14, 32,34,36,..46, 16,18,20,..30, 48,50,52,..62
49674 and for d.nelt == 64 we want permutation
49675 0,2,4,..14, 64,66,68,..78, 16,18,20,..30, 80,82,84,..94,
49676 32,34,36,..46, 96,98,100,..110, 48,50,52,..62, 112,114,116,..126. */
49677 for (i = 0; i < d.nelt; ++i)
49678 d.perm[i] = ((i * 2) & 14) + ((i & 8) ? d.nelt : 0) + (i & ~15);
49679 }
49680
49681 ok = ix86_expand_vec_perm_const_1 (&d);
49682 gcc_assert (ok);
49683
49684 set_unique_reg_note (get_last_insn (), REG_EQUAL,
49685 gen_rtx_fmt_ee (code, qimode, op1, op2));
49686 }
49687
49688 /* Helper function of ix86_expand_mul_widen_evenodd. Return true
49689 if op is CONST_VECTOR with all odd elements equal to their
49690 preceding element. */
49691
49692 static bool
49693 const_vector_equal_evenodd_p (rtx op)
49694 {
49695 machine_mode mode = GET_MODE (op);
49696 int i, nunits = GET_MODE_NUNITS (mode);
49697 if (GET_CODE (op) != CONST_VECTOR
49698 || nunits != CONST_VECTOR_NUNITS (op))
49699 return false;
49700 for (i = 0; i < nunits; i += 2)
49701 if (CONST_VECTOR_ELT (op, i) != CONST_VECTOR_ELT (op, i + 1))
49702 return false;
49703 return true;
49704 }
49705
49706 void
49707 ix86_expand_mul_widen_evenodd (rtx dest, rtx op1, rtx op2,
49708 bool uns_p, bool odd_p)
49709 {
49710 machine_mode mode = GET_MODE (op1);
49711 machine_mode wmode = GET_MODE (dest);
49712 rtx x;
49713 rtx orig_op1 = op1, orig_op2 = op2;
49714
49715 if (!nonimmediate_operand (op1, mode))
49716 op1 = force_reg (mode, op1);
49717 if (!nonimmediate_operand (op2, mode))
49718 op2 = force_reg (mode, op2);
49719
49720 /* We only play even/odd games with vectors of SImode. */
49721 gcc_assert (mode == V4SImode || mode == V8SImode || mode == V16SImode);
49722
49723 /* If we're looking for the odd results, shift those members down to
49724 the even slots. For some cpus this is faster than a PSHUFD. */
49725 if (odd_p)
49726 {
49727 /* For XOP use vpmacsdqh, but only for smult, as it is only
49728 signed. */
49729 if (TARGET_XOP && mode == V4SImode && !uns_p)
49730 {
49731 x = force_reg (wmode, CONST0_RTX (wmode));
49732 emit_insn (gen_xop_pmacsdqh (dest, op1, op2, x));
49733 return;
49734 }
49735
49736 x = GEN_INT (GET_MODE_UNIT_BITSIZE (mode));
49737 if (!const_vector_equal_evenodd_p (orig_op1))
49738 op1 = expand_binop (wmode, lshr_optab, gen_lowpart (wmode, op1),
49739 x, NULL, 1, OPTAB_DIRECT);
49740 if (!const_vector_equal_evenodd_p (orig_op2))
49741 op2 = expand_binop (wmode, lshr_optab, gen_lowpart (wmode, op2),
49742 x, NULL, 1, OPTAB_DIRECT);
49743 op1 = gen_lowpart (mode, op1);
49744 op2 = gen_lowpart (mode, op2);
49745 }
49746
49747 if (mode == V16SImode)
49748 {
49749 if (uns_p)
49750 x = gen_vec_widen_umult_even_v16si (dest, op1, op2);
49751 else
49752 x = gen_vec_widen_smult_even_v16si (dest, op1, op2);
49753 }
49754 else if (mode == V8SImode)
49755 {
49756 if (uns_p)
49757 x = gen_vec_widen_umult_even_v8si (dest, op1, op2);
49758 else
49759 x = gen_vec_widen_smult_even_v8si (dest, op1, op2);
49760 }
49761 else if (uns_p)
49762 x = gen_vec_widen_umult_even_v4si (dest, op1, op2);
49763 else if (TARGET_SSE4_1)
49764 x = gen_sse4_1_mulv2siv2di3 (dest, op1, op2);
49765 else
49766 {
49767 rtx s1, s2, t0, t1, t2;
49768
49769 /* The easiest way to implement this without PMULDQ is to go through
49770 the motions as if we are performing a full 64-bit multiply, with
49771 the exception that we need to do less shuffling of the elements. */
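/* A sketch of the identity used (mod 2^64):
     sext(a) * sext(b) == zext(a) * zext(b) - 2^32 * (s_a * b + s_b * a)
   where s_x is 1 iff x is negative.  The compares below produce all-ones
   (2^32 - 1) masks, and ((2^32 - 1) * b) << 32 is congruent to -2^32 * b,
   so adding the shifted mask/operand products supplies both corrections.  */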
49772
49773 /* Compute the sign-extension, aka highparts, of the two operands. */
49774 s1 = ix86_expand_sse_cmp (gen_reg_rtx (mode), GT, CONST0_RTX (mode),
49775 op1, pc_rtx, pc_rtx);
49776 s2 = ix86_expand_sse_cmp (gen_reg_rtx (mode), GT, CONST0_RTX (mode),
49777 op2, pc_rtx, pc_rtx);
49778
49779 /* Multiply LO(A) * HI(B), and vice-versa. */
49780 t1 = gen_reg_rtx (wmode);
49781 t2 = gen_reg_rtx (wmode);
49782 emit_insn (gen_vec_widen_umult_even_v4si (t1, s1, op2));
49783 emit_insn (gen_vec_widen_umult_even_v4si (t2, s2, op1));
49784
49785 /* Multiply LO(A) * LO(B). */
49786 t0 = gen_reg_rtx (wmode);
49787 emit_insn (gen_vec_widen_umult_even_v4si (t0, op1, op2));
49788
49789 /* Combine and shift the highparts into place. */
49790 t1 = expand_binop (wmode, add_optab, t1, t2, t1, 1, OPTAB_DIRECT);
49791 t1 = expand_binop (wmode, ashl_optab, t1, GEN_INT (32), t1,
49792 1, OPTAB_DIRECT);
49793
49794 /* Combine high and low parts. */
49795 force_expand_binop (wmode, add_optab, t0, t1, dest, 1, OPTAB_DIRECT);
49796 return;
49797 }
49798 emit_insn (x);
49799 }
49800
49801 void
49802 ix86_expand_mul_widen_hilo (rtx dest, rtx op1, rtx op2,
49803 bool uns_p, bool high_p)
49804 {
49805 machine_mode wmode = GET_MODE (dest);
49806 machine_mode mode = GET_MODE (op1);
49807 rtx t1, t2, t3, t4, mask;
49808
49809 switch (mode)
49810 {
49811 case V4SImode:
49812 t1 = gen_reg_rtx (mode);
49813 t2 = gen_reg_rtx (mode);
49814 if (TARGET_XOP && !uns_p)
49815 {
49816 /* With XOP, we have pmacsdqh, aka mul_widen_odd. In this case,
49817 shuffle the elements once so that all elements are in the right
49818 place for immediate use: { A C B D }. */
49819 emit_insn (gen_sse2_pshufd_1 (t1, op1, const0_rtx, const2_rtx,
49820 const1_rtx, GEN_INT (3)));
49821 emit_insn (gen_sse2_pshufd_1 (t2, op2, const0_rtx, const2_rtx,
49822 const1_rtx, GEN_INT (3)));
49823 }
49824 else
49825 {
49826 /* Put the elements into place for the multiply. */
49827 ix86_expand_vec_interleave (t1, op1, op1, high_p);
49828 ix86_expand_vec_interleave (t2, op2, op2, high_p);
49829 high_p = false;
49830 }
49831 ix86_expand_mul_widen_evenodd (dest, t1, t2, uns_p, high_p);
49832 break;
49833
49834 case V8SImode:
49835 /* Shuffle the elements between the lanes. After this we
49836 have { A B E F | C D G H } for each operand. */
49837 t1 = gen_reg_rtx (V4DImode);
49838 t2 = gen_reg_rtx (V4DImode);
49839 emit_insn (gen_avx2_permv4di_1 (t1, gen_lowpart (V4DImode, op1),
49840 const0_rtx, const2_rtx,
49841 const1_rtx, GEN_INT (3)));
49842 emit_insn (gen_avx2_permv4di_1 (t2, gen_lowpart (V4DImode, op2),
49843 const0_rtx, const2_rtx,
49844 const1_rtx, GEN_INT (3)));
49845
49846 /* Shuffle the elements within the lanes. After this we
49847 have { A A B B | C C D D } or { E E F F | G G H H }. */
49848 t3 = gen_reg_rtx (V8SImode);
49849 t4 = gen_reg_rtx (V8SImode);
49850 mask = GEN_INT (high_p
49851 ? 2 + (2 << 2) + (3 << 4) + (3 << 6)
49852 : 0 + (0 << 2) + (1 << 4) + (1 << 6));
49853 emit_insn (gen_avx2_pshufdv3 (t3, gen_lowpart (V8SImode, t1), mask));
49854 emit_insn (gen_avx2_pshufdv3 (t4, gen_lowpart (V8SImode, t2), mask));
49855
49856 ix86_expand_mul_widen_evenodd (dest, t3, t4, uns_p, false);
49857 break;
49858
49859 case V8HImode:
49860 case V16HImode:
49861 t1 = expand_binop (mode, smul_optab, op1, op2, NULL_RTX,
49862 uns_p, OPTAB_DIRECT);
49863 t2 = expand_binop (mode,
49864 uns_p ? umul_highpart_optab : smul_highpart_optab,
49865 op1, op2, NULL_RTX, uns_p, OPTAB_DIRECT);
49866 gcc_assert (t1 && t2);
49867
49868 t3 = gen_reg_rtx (mode);
49869 ix86_expand_vec_interleave (t3, t1, t2, high_p);
49870 emit_move_insn (dest, gen_lowpart (wmode, t3));
49871 break;
49872
49873 case V16QImode:
49874 case V32QImode:
49875 case V32HImode:
49876 case V16SImode:
49877 case V64QImode:
49878 t1 = gen_reg_rtx (wmode);
49879 t2 = gen_reg_rtx (wmode);
49880 ix86_expand_sse_unpack (t1, op1, uns_p, high_p);
49881 ix86_expand_sse_unpack (t2, op2, uns_p, high_p);
49882
49883 emit_insn (gen_rtx_SET (dest, gen_rtx_MULT (wmode, t1, t2)));
49884 break;
49885
49886 default:
49887 gcc_unreachable ();
49888 }
49889 }
49890
49891 void
49892 ix86_expand_sse2_mulv4si3 (rtx op0, rtx op1, rtx op2)
49893 {
49894 rtx res_1, res_2, res_3, res_4;
49895
49896 res_1 = gen_reg_rtx (V4SImode);
49897 res_2 = gen_reg_rtx (V4SImode);
49898 res_3 = gen_reg_rtx (V2DImode);
49899 res_4 = gen_reg_rtx (V2DImode);
49900 ix86_expand_mul_widen_evenodd (res_3, op1, op2, true, false);
49901 ix86_expand_mul_widen_evenodd (res_4, op1, op2, true, true);
49902
49903 /* Move the results in element 2 down to element 1; we don't care
49904 what goes in elements 2 and 3. Then we can merge the parts
49905 back together with an interleave.
49906
49907 Note that two other sequences were tried:
49908 (1) Use interleaves at the start instead of psrldq, which allows
49909 us to use a single shufps to merge things back at the end.
49910 (2) Use shufps here to combine the two vectors, then pshufd to
49911 put the elements in the correct order.
49912 In both cases the cost of the reformatting stall was too high
49913 and the overall sequence slower. */
49914
49915 emit_insn (gen_sse2_pshufd_1 (res_1, gen_lowpart (V4SImode, res_3),
49916 const0_rtx, const2_rtx,
49917 const0_rtx, const0_rtx));
49918 emit_insn (gen_sse2_pshufd_1 (res_2, gen_lowpart (V4SImode, res_4),
49919 const0_rtx, const2_rtx,
49920 const0_rtx, const0_rtx));
49921 res_1 = emit_insn (gen_vec_interleave_lowv4si (op0, res_1, res_2));
49922
49923 set_unique_reg_note (res_1, REG_EQUAL, gen_rtx_MULT (V4SImode, op1, op2));
49924 }
49925
49926 void
49927 ix86_expand_sse2_mulvxdi3 (rtx op0, rtx op1, rtx op2)
49928 {
49929 machine_mode mode = GET_MODE (op0);
49930 rtx t1, t2, t3, t4, t5, t6;
49931
49932 if (TARGET_AVX512DQ && mode == V8DImode)
49933 emit_insn (gen_avx512dq_mulv8di3 (op0, op1, op2));
49934 else if (TARGET_AVX512DQ && TARGET_AVX512VL && mode == V4DImode)
49935 emit_insn (gen_avx512dq_mulv4di3 (op0, op1, op2));
49936 else if (TARGET_AVX512DQ && TARGET_AVX512VL && mode == V2DImode)
49937 emit_insn (gen_avx512dq_mulv2di3 (op0, op1, op2));
49938 else if (TARGET_XOP && mode == V2DImode)
49939 {
49940 /* op1: A,B,C,D, op2: E,F,G,H */
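/* With A, E the low and B, F the high 32-bit halves of the first 64-bit
   element (and C, G / D, H likewise for the second), the steps below use
   (B*2^32 + A) * (F*2^32 + E) == A*E + ((A*F + B*E) << 32)  (mod 2^64).  */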
49941 op1 = gen_lowpart (V4SImode, op1);
49942 op2 = gen_lowpart (V4SImode, op2);
49943
49944 t1 = gen_reg_rtx (V4SImode);
49945 t2 = gen_reg_rtx (V4SImode);
49946 t3 = gen_reg_rtx (V2DImode);
49947 t4 = gen_reg_rtx (V2DImode);
49948
49949 /* t1: B,A,D,C */
49950 emit_insn (gen_sse2_pshufd_1 (t1, op1,
49951 GEN_INT (1),
49952 GEN_INT (0),
49953 GEN_INT (3),
49954 GEN_INT (2)));
49955
49956 /* t2: (B*E),(A*F),(D*G),(C*H) */
49957 emit_insn (gen_mulv4si3 (t2, t1, op2));
49958
49959 /* t3: (B*E)+(A*F), (D*G)+(C*H) */
49960 emit_insn (gen_xop_phadddq (t3, t2));
49961
49962 /* t4: ((B*E)+(A*F))<<32, ((D*G)+(C*H))<<32 */
49963 emit_insn (gen_ashlv2di3 (t4, t3, GEN_INT (32)));
49964
49965 /* Multiply lower parts and add all. */
49966 t5 = gen_reg_rtx (V2DImode);
49967 emit_insn (gen_vec_widen_umult_even_v4si (t5,
49968 gen_lowpart (V4SImode, op1),
49969 gen_lowpart (V4SImode, op2)));
49970 op0 = expand_binop (mode, add_optab, t5, t4, op0, 1, OPTAB_DIRECT);
49971
49972 }
49973 else
49974 {
49975 machine_mode nmode;
49976 rtx (*umul) (rtx, rtx, rtx);
49977
49978 if (mode == V2DImode)
49979 {
49980 umul = gen_vec_widen_umult_even_v4si;
49981 nmode = V4SImode;
49982 }
49983 else if (mode == V4DImode)
49984 {
49985 umul = gen_vec_widen_umult_even_v8si;
49986 nmode = V8SImode;
49987 }
49988 else if (mode == V8DImode)
49989 {
49990 umul = gen_vec_widen_umult_even_v16si;
49991 nmode = V16SImode;
49992 }
49993 else
49994 gcc_unreachable ();
49995
49996
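/* Per 64-bit element the sequence below computes
   (h1*2^32 + l1) * (h2*2^32 + l2) == l1*l2 + ((h1*l2 + h2*l1) << 32)
   (mod 2^64), with l and h the low and high 32-bit halves.  */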
49997 /* Multiply low parts. */
49998 t1 = gen_reg_rtx (mode);
49999 emit_insn (umul (t1, gen_lowpart (nmode, op1), gen_lowpart (nmode, op2)));
50000
50001 /* Shift input vectors right 32 bits so we can multiply high parts. */
50002 t6 = GEN_INT (32);
50003 t2 = expand_binop (mode, lshr_optab, op1, t6, NULL, 1, OPTAB_DIRECT);
50004 t3 = expand_binop (mode, lshr_optab, op2, t6, NULL, 1, OPTAB_DIRECT);
50005
50006 /* Multiply high parts by low parts. */
50007 t4 = gen_reg_rtx (mode);
50008 t5 = gen_reg_rtx (mode);
50009 emit_insn (umul (t4, gen_lowpart (nmode, t2), gen_lowpart (nmode, op2)));
50010 emit_insn (umul (t5, gen_lowpart (nmode, t3), gen_lowpart (nmode, op1)));
50011
50012 /* Combine and shift the highparts back. */
50013 t4 = expand_binop (mode, add_optab, t4, t5, t4, 1, OPTAB_DIRECT);
50014 t4 = expand_binop (mode, ashl_optab, t4, t6, t4, 1, OPTAB_DIRECT);
50015
50016 /* Combine high and low parts. */
50017 force_expand_binop (mode, add_optab, t1, t4, op0, 1, OPTAB_DIRECT);
50018 }
50019
50020 set_unique_reg_note (get_last_insn (), REG_EQUAL,
50021 gen_rtx_MULT (mode, op1, op2));
50022 }
50023
50024 /* Return 1 if control transfer instruction INSN
50025 should be encoded with bnd prefix.
50026 If insn is NULL then return 1 when control
50027 transfer instructions should be prefixed with
50028 bnd by default for current function. */
50029
50030 bool
50031 ix86_bnd_prefixed_insn_p (rtx insn)
50032 {
50033 /* For call insns check special flag. */
50034 if (insn && CALL_P (insn))
50035 {
50036 rtx call = get_call_rtx_from (insn);
50037 if (call)
50038 return CALL_EXPR_WITH_BOUNDS_P (call);
50039 }
50040
50041 /* All other insns are prefixed only if the function is instrumented. */
50042 return chkp_function_instrumented_p (current_function_decl);
50043 }
50044
50045 /* Calculate integer abs() using only SSE2 instructions. */
50046
50047 void
50048 ix86_expand_sse2_abs (rtx target, rtx input)
50049 {
50050 machine_mode mode = GET_MODE (target);
50051 rtx tmp0, tmp1, x;
50052
50053 switch (mode)
50054 {
50055 /* For 32-bit signed integer X, the best way to calculate the absolute
50056 value of X is (((signed) X >> (W-1)) ^ X) - ((signed) X >> (W-1)). */
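/* E.g. for X = -5 and W = 32: X >> 31 == -1, (-1 ^ -5) == 4,
   and 4 - (-1) == 5.  */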
50057 case V4SImode:
50058 tmp0 = expand_simple_binop (mode, ASHIFTRT, input,
50059 GEN_INT (GET_MODE_UNIT_BITSIZE (mode) - 1),
50060 NULL, 0, OPTAB_DIRECT);
50061 tmp1 = expand_simple_binop (mode, XOR, tmp0, input,
50062 NULL, 0, OPTAB_DIRECT);
50063 x = expand_simple_binop (mode, MINUS, tmp1, tmp0,
50064 target, 0, OPTAB_DIRECT);
50065 break;
50066
50067 /* For 16-bit signed integer X, the best way to calculate the absolute
50068 value of X is max (X, -X), as SSE2 provides the PMAXSW insn. */
50069 case V8HImode:
50070 tmp0 = expand_unop (mode, neg_optab, input, NULL_RTX, 0);
50071
50072 x = expand_simple_binop (mode, SMAX, tmp0, input,
50073 target, 0, OPTAB_DIRECT);
50074 break;
50075
50076 /* For 8-bit signed integer X, the best way to calculate the absolute
50077 value of X is min ((unsigned char) X, (unsigned char) (-X)),
50078 as SSE2 provides the PMINUB insn. */
50079 case V16QImode:
50080 tmp0 = expand_unop (mode, neg_optab, input, NULL_RTX, 0);
50081
50082 x = expand_simple_binop (V16QImode, UMIN, tmp0, input,
50083 target, 0, OPTAB_DIRECT);
50084 break;
50085
50086 default:
50087 gcc_unreachable ();
50088 }
50089
50090 if (x != target)
50091 emit_move_insn (target, x);
50092 }
50093
50094 /* Expand an extract from a vector register through pextr insn.
50095 Return true if successful. */
50096
50097 bool
50098 ix86_expand_pextr (rtx *operands)
50099 {
50100 rtx dst = operands[0];
50101 rtx src = operands[1];
50102
50103 unsigned int size = INTVAL (operands[2]);
50104 unsigned int pos = INTVAL (operands[3]);
50105
50106 if (SUBREG_P (dst))
50107 {
50108 /* Reject non-lowpart subregs. */
50109 if (SUBREG_BYTE (dst) > 0)
50110 return false;
50111 dst = SUBREG_REG (dst);
50112 }
50113
50114 if (SUBREG_P (src))
50115 {
50116 pos += SUBREG_BYTE (src) * BITS_PER_UNIT;
50117 src = SUBREG_REG (src);
50118 }
50119
50120 switch (GET_MODE (src))
50121 {
50122 case V16QImode:
50123 case V8HImode:
50124 case V4SImode:
50125 case V2DImode:
50126 case V1TImode:
50127 case TImode:
50128 {
50129 machine_mode srcmode, dstmode;
50130 rtx d, pat;
50131
50132 dstmode = mode_for_size (size, MODE_INT, 0);
50133
50134 switch (dstmode)
50135 {
50136 case QImode:
50137 if (!TARGET_SSE4_1)
50138 return false;
50139 srcmode = V16QImode;
50140 break;
50141
50142 case HImode:
50143 if (!TARGET_SSE2)
50144 return false;
50145 srcmode = V8HImode;
50146 break;
50147
50148 case SImode:
50149 if (!TARGET_SSE4_1)
50150 return false;
50151 srcmode = V4SImode;
50152 break;
50153
50154 case DImode:
50155 gcc_assert (TARGET_64BIT);
50156 if (!TARGET_SSE4_1)
50157 return false;
50158 srcmode = V2DImode;
50159 break;
50160
50161 default:
50162 return false;
50163 }
50164
50165 /* Reject extractions from misaligned positions. */
50166 if (pos & (size-1))
50167 return false;
50168
50169 if (GET_MODE (dst) == dstmode)
50170 d = dst;
50171 else
50172 d = gen_reg_rtx (dstmode);
50173
50174 /* Construct insn pattern. */
50175 pat = gen_rtx_PARALLEL (VOIDmode, gen_rtvec (1, GEN_INT (pos / size)));
50176 pat = gen_rtx_VEC_SELECT (dstmode, gen_lowpart (srcmode, src), pat);
50177
50178 /* Let the rtl optimizers know about the zero extension performed. */
50179 if (dstmode == QImode || dstmode == HImode)
50180 {
50181 pat = gen_rtx_ZERO_EXTEND (SImode, pat);
50182 d = gen_lowpart (SImode, d);
50183 }
50184
50185 emit_insn (gen_rtx_SET (d, pat));
50186
50187 if (d != dst)
50188 emit_move_insn (dst, gen_lowpart (GET_MODE (dst), d));
50189 return true;
50190 }
50191
50192 default:
50193 return false;
50194 }
50195 }
50196
50197 /* Expand an insert into a vector register through pinsr insn.
50198 Return true if successful. */
50199
50200 bool
50201 ix86_expand_pinsr (rtx *operands)
50202 {
50203 rtx dst = operands[0];
50204 rtx src = operands[3];
50205
50206 unsigned int size = INTVAL (operands[1]);
50207 unsigned int pos = INTVAL (operands[2]);
50208
50209 if (SUBREG_P (dst))
50210 {
50211 pos += SUBREG_BYTE (dst) * BITS_PER_UNIT;
50212 dst = SUBREG_REG (dst);
50213 }
50214
50215 switch (GET_MODE (dst))
50216 {
50217 case V16QImode:
50218 case V8HImode:
50219 case V4SImode:
50220 case V2DImode:
50221 case V1TImode:
50222 case TImode:
50223 {
50224 machine_mode srcmode, dstmode;
50225 rtx (*pinsr)(rtx, rtx, rtx, rtx);
50226 rtx d;
50227
50228 srcmode = mode_for_size (size, MODE_INT, 0);
50229
50230 switch (srcmode)
50231 {
50232 case QImode:
50233 if (!TARGET_SSE4_1)
50234 return false;
50235 dstmode = V16QImode;
50236 pinsr = gen_sse4_1_pinsrb;
50237 break;
50238
50239 case HImode:
50240 if (!TARGET_SSE2)
50241 return false;
50242 dstmode = V8HImode;
50243 pinsr = gen_sse2_pinsrw;
50244 break;
50245
50246 case SImode:
50247 if (!TARGET_SSE4_1)
50248 return false;
50249 dstmode = V4SImode;
50250 pinsr = gen_sse4_1_pinsrd;
50251 break;
50252
50253 case DImode:
50254 gcc_assert (TARGET_64BIT);
50255 if (!TARGET_SSE4_1)
50256 return false;
50257 dstmode = V2DImode;
50258 pinsr = gen_sse4_1_pinsrq;
50259 break;
50260
50261 default:
50262 return false;
50263 }
50264
50265 /* Reject insertions to misaligned positions. */
50266 if (pos & (size-1))
50267 return false;
50268
50269 if (SUBREG_P (src))
50270 {
50271 unsigned int srcpos = SUBREG_BYTE (src);
50272
50273 if (srcpos > 0)
50274 {
50275 rtx extr_ops[4];
50276
50277 extr_ops[0] = gen_reg_rtx (srcmode);
50278 extr_ops[1] = gen_lowpart (srcmode, SUBREG_REG (src));
50279 extr_ops[2] = GEN_INT (size);
50280 extr_ops[3] = GEN_INT (srcpos * BITS_PER_UNIT);
50281
50282 if (!ix86_expand_pextr (extr_ops))
50283 return false;
50284
50285 src = extr_ops[0];
50286 }
50287 else
50288 src = gen_lowpart (srcmode, SUBREG_REG (src));
50289 }
50290
50291 if (GET_MODE (dst) == dstmode)
50292 d = dst;
50293 else
50294 d = gen_reg_rtx (dstmode);
50295
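/* The pinsr expanders take the destination element as a vec_merge style
   bit mask, hence the one-hot 1 << (pos / size) rather than the plain
   element index.  */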
50296 emit_insn (pinsr (d, gen_lowpart (dstmode, dst),
50297 gen_lowpart (srcmode, src),
50298 GEN_INT (1 << (pos / size))));
50299 if (d != dst)
50300 emit_move_insn (dst, gen_lowpart (GET_MODE (dst), d));
50301 return true;
50302 }
50303
50304 default:
50305 return false;
50306 }
50307 }
50308 \f
50309 /* This function returns the calling ABI specific va_list type node.
50310 It returns the FNDECL specific va_list type. */
50311
50312 static tree
50313 ix86_fn_abi_va_list (tree fndecl)
50314 {
50315 if (!TARGET_64BIT)
50316 return va_list_type_node;
50317 gcc_assert (fndecl != NULL_TREE);
50318
50319 if (ix86_function_abi ((const_tree) fndecl) == MS_ABI)
50320 return ms_va_list_type_node;
50321 else
50322 return sysv_va_list_type_node;
50323 }
50324
50325 /* Returns the canonical va_list type specified by TYPE. If there
50326 is no valid TYPE provided, it returns NULL_TREE. */
50327
50328 static tree
50329 ix86_canonical_va_list_type (tree type)
50330 {
50331 if (TARGET_64BIT)
50332 {
50333 if (lookup_attribute ("ms_abi va_list", TYPE_ATTRIBUTES (type)))
50334 return ms_va_list_type_node;
50335
50336 if ((TREE_CODE (type) == ARRAY_TYPE
50337 && integer_zerop (array_type_nelts (type)))
50338 || POINTER_TYPE_P (type))
50339 {
50340 tree elem_type = TREE_TYPE (type);
50341 if (TREE_CODE (elem_type) == RECORD_TYPE
50342 && lookup_attribute ("sysv_abi va_list",
50343 TYPE_ATTRIBUTES (elem_type)))
50344 return sysv_va_list_type_node;
50345 }
50346
50347 return NULL_TREE;
50348 }
50349
50350 return std_canonical_va_list_type (type);
50351 }
50352
50353 /* Iterate through the target-specific builtin types for va_list.
50354 IDX denotes the iterator, *PTREE is set to the result type of
50355 the va_list builtin, and *PNAME to its internal name.
50356 Returns zero if there is no element for this index, otherwise
50357 IDX should be increased upon the next call.
50358 Note, do not iterate a base builtin's name like __builtin_va_list.
50359 Used from c_common_nodes_and_builtins. */
50360
50361 static int
50362 ix86_enum_va_list (int idx, const char **pname, tree *ptree)
50363 {
50364 if (TARGET_64BIT)
50365 {
50366 switch (idx)
50367 {
50368 default:
50369 break;
50370
50371 case 0:
50372 *ptree = ms_va_list_type_node;
50373 *pname = "__builtin_ms_va_list";
50374 return 1;
50375
50376 case 1:
50377 *ptree = sysv_va_list_type_node;
50378 *pname = "__builtin_sysv_va_list";
50379 return 1;
50380 }
50381 }
50382
50383 return 0;
50384 }
50385
50386 #undef TARGET_SCHED_DISPATCH
50387 #define TARGET_SCHED_DISPATCH has_dispatch
50388 #undef TARGET_SCHED_DISPATCH_DO
50389 #define TARGET_SCHED_DISPATCH_DO do_dispatch
50390 #undef TARGET_SCHED_REASSOCIATION_WIDTH
50391 #define TARGET_SCHED_REASSOCIATION_WIDTH ix86_reassociation_width
50392 #undef TARGET_SCHED_REORDER
50393 #define TARGET_SCHED_REORDER ix86_sched_reorder
50394 #undef TARGET_SCHED_ADJUST_PRIORITY
50395 #define TARGET_SCHED_ADJUST_PRIORITY ix86_adjust_priority
50396 #undef TARGET_SCHED_DEPENDENCIES_EVALUATION_HOOK
50397 #define TARGET_SCHED_DEPENDENCIES_EVALUATION_HOOK \
50398 ix86_dependencies_evaluation_hook
50399
50400 /* The size of the dispatch window is the total number of bytes of
50401 object code allowed in a window. */
50402 #define DISPATCH_WINDOW_SIZE 16
50403
50404 /* Number of dispatch windows considered for scheduling. */
50405 #define MAX_DISPATCH_WINDOWS 3
50406
50407 /* Maximum number of instructions in a window. */
50408 #define MAX_INSN 4
50409
50410 /* Maximum number of immediate operands in a window. */
50411 #define MAX_IMM 4
50412
50413 /* Maximum number of immediate bits allowed in a window. */
50414 #define MAX_IMM_SIZE 128
50415
50416 /* Maximum number of 32 bit immediates allowed in a window. */
50417 #define MAX_IMM_32 4
50418
50419 /* Maximum number of 64 bit immediates allowed in a window. */
50420 #define MAX_IMM_64 2
50421
50422 /* Maximum total of loads or prefetches allowed in a window. */
50423 #define MAX_LOAD 2
50424
50425 /* Maximum total of stores allowed in a window. */
50426 #define MAX_STORE 1
50427
50428 #undef BIG
50429 #define BIG 100
50430
50431
50432 /* Dispatch groups. Instructions that affect the mix in a dispatch window. */
50433 enum dispatch_group {
50434 disp_no_group = 0,
50435 disp_load,
50436 disp_store,
50437 disp_load_store,
50438 disp_prefetch,
50439 disp_imm,
50440 disp_imm_32,
50441 disp_imm_64,
50442 disp_branch,
50443 disp_cmp,
50444 disp_jcc,
50445 disp_last
50446 };
50447
50448 /* Number of allowable groups in a dispatch window. It is an array
50449 indexed by the dispatch_group enum. 100 is used as a big number
50450 because the number of these kinds of operations does not have any
50451 effect in a dispatch window, but we need them for other reasons in
50452 the table. */
50453 static unsigned int num_allowable_groups[disp_last] = {
50454 0, 2, 1, 1, 2, 4, 4, 2, 1, BIG, BIG
50455 };
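
/* Read together with the dispatch_group enum, the table above allows
   (illustrative summary): at most 2 loads, 1 store, 1 load+store,
   2 prefetches, 4 immediates of which at most 4 may be 32 bit and 2 may
   be 64 bit, and 1 branch per window; disp_cmp and disp_jcc get BIG
   because they never limit a window by themselves.  */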
50456
50457 char group_name[disp_last + 1][16] = {
50458 "disp_no_group", "disp_load", "disp_store", "disp_load_store",
50459 "disp_prefetch", "disp_imm", "disp_imm_32", "disp_imm_64",
50460 "disp_branch", "disp_cmp", "disp_jcc", "disp_last"
50461 };
50462
50463 /* Instruction path. */
50464 enum insn_path {
50465 no_path = 0,
50466 path_single, /* Single micro op. */
50467 path_double, /* Double micro op. */
50468 path_multi, /* Instructions with more than 2 micro ops. */
50469 last_path
50470 };
50471
50472 /* sched_insn_info defines a window to the instructions scheduled in
50473 the basic block. It contains a pointer to the insn_info table and
50474 the instruction scheduled.
50475
50476 Windows are allocated for each basic block and are linked
50477 together. */
50478 typedef struct sched_insn_info_s {
50479 rtx insn;
50480 enum dispatch_group group;
50481 enum insn_path path;
50482 int byte_len;
50483 int imm_bytes;
50484 } sched_insn_info;
50485
50486 /* Linked list of dispatch windows. This is a two-way list of
50487 dispatch windows of a basic block. It contains information about
50488 the number of uops in the window and the total number of
50489 instructions and of bytes in the object code for this dispatch
50490 window. */
50491 typedef struct dispatch_windows_s {
50492 int num_insn; /* Number of insn in the window. */
50493 int num_uops; /* Number of uops in the window. */
50494 int window_size; /* Number of bytes in the window. */
50495 int window_num; /* Window number, either 0 or 1. */
50496 int num_imm; /* Number of immediates in the window. */
50497 int num_imm_32; /* Number of 32 bit immediates in the window. */
50498 int num_imm_64; /* Number of 64 bit immediates in the window. */
50499 int imm_size; /* Total size in bytes of immediates in the window. */
50500 int num_loads; /* Total memory loads in the window. */
50501 int num_stores; /* Total memory stores in the window. */
50502 int violation; /* Violation exists in window. */
50503 sched_insn_info *window; /* Pointer to the window. */
50504 struct dispatch_windows_s *next;
50505 struct dispatch_windows_s *prev;
50506 } dispatch_windows;
50507
50508 /* Immediate values used in an insn. */
50509 typedef struct imm_info_s
50510 {
50511 int imm;
50512 int imm32;
50513 int imm64;
50514 } imm_info;
50515
50516 static dispatch_windows *dispatch_window_list;
50517 static dispatch_windows *dispatch_window_list1;
50518
50519 /* Get dispatch group of insn. */
50520
50521 static enum dispatch_group
50522 get_mem_group (rtx_insn *insn)
50523 {
50524 enum attr_memory memory;
50525
50526 if (INSN_CODE (insn) < 0)
50527 return disp_no_group;
50528 memory = get_attr_memory (insn);
50529 if (memory == MEMORY_STORE)
50530 return disp_store;
50531
50532 if (memory == MEMORY_LOAD)
50533 return disp_load;
50534
50535 if (memory == MEMORY_BOTH)
50536 return disp_load_store;
50537
50538 return disp_no_group;
50539 }
50540
50541 /* Return true if insn is a compare instruction. */
50542
50543 static bool
50544 is_cmp (rtx_insn *insn)
50545 {
50546 enum attr_type type;
50547
50548 type = get_attr_type (insn);
50549 return (type == TYPE_TEST
50550 || type == TYPE_ICMP
50551 || type == TYPE_FCMP
50552 || GET_CODE (PATTERN (insn)) == COMPARE);
50553 }
50554
50555 /* Return true if a dispatch violation was encountered. */
50556
50557 static bool
50558 dispatch_violation (void)
50559 {
50560 if (dispatch_window_list->next)
50561 return dispatch_window_list->next->violation;
50562 return dispatch_window_list->violation;
50563 }
50564
50565 /* Return true if insn is a branch instruction. */
50566
50567 static bool
50568 is_branch (rtx_insn *insn)
50569 {
50570 return (CALL_P (insn) || JUMP_P (insn));
50571 }
50572
50573 /* Return true if insn is a prefetch instruction. */
50574
50575 static bool
50576 is_prefetch (rtx_insn *insn)
50577 {
50578 return NONJUMP_INSN_P (insn) && GET_CODE (PATTERN (insn)) == PREFETCH;
50579 }
50580
50581 /* This function initializes a dispatch window and the list container holding a
50582 pointer to the window. */
50583
50584 static void
50585 init_window (int window_num)
50586 {
50587 int i;
50588 dispatch_windows *new_list;
50589
50590 if (window_num == 0)
50591 new_list = dispatch_window_list;
50592 else
50593 new_list = dispatch_window_list1;
50594
50595 new_list->num_insn = 0;
50596 new_list->num_uops = 0;
50597 new_list->window_size = 0;
50598 new_list->next = NULL;
50599 new_list->prev = NULL;
50600 new_list->window_num = window_num;
50601 new_list->num_imm = 0;
50602 new_list->num_imm_32 = 0;
50603 new_list->num_imm_64 = 0;
50604 new_list->imm_size = 0;
50605 new_list->num_loads = 0;
50606 new_list->num_stores = 0;
50607 new_list->violation = false;
50608
50609 for (i = 0; i < MAX_INSN; i++)
50610 {
50611 new_list->window[i].insn = NULL;
50612 new_list->window[i].group = disp_no_group;
50613 new_list->window[i].path = no_path;
50614 new_list->window[i].byte_len = 0;
50615 new_list->window[i].imm_bytes = 0;
50616 }
50617 return;
50618 }
50619
50620 /* This function allocates and initializes a dispatch window and the
50621 list container holding a pointer to the window. */
50622
50623 static dispatch_windows *
50624 allocate_window (void)
50625 {
50626 dispatch_windows *new_list = XNEW (struct dispatch_windows_s);
50627 new_list->window = XNEWVEC (struct sched_insn_info_s, MAX_INSN + 1);
50628
50629 return new_list;
50630 }
50631
50632 /* This routine initializes the dispatch scheduling information. It
50633 initiates building dispatch scheduler tables and constructs the
50634 first dispatch window. */
50635
50636 static void
50637 init_dispatch_sched (void)
50638 {
50639 /* Allocate a dispatch list and a window. */
50640 dispatch_window_list = allocate_window ();
50641 dispatch_window_list1 = allocate_window ();
50642 init_window (0);
50643 init_window (1);
50644 }
50645
50646 /* This function returns true if a branch is detected. The end of a basic block
50647 does not have to be a branch, but here we assume only branches end a
50648 window. */
50649
50650 static bool
50651 is_end_basic_block (enum dispatch_group group)
50652 {
50653 return group == disp_branch;
50654 }
50655
50656 /* This function is called when the end of a window processing is reached. */
50657
50658 static void
50659 process_end_window (void)
50660 {
50661 gcc_assert (dispatch_window_list->num_insn <= MAX_INSN);
50662 if (dispatch_window_list->next)
50663 {
50664 gcc_assert (dispatch_window_list1->num_insn <= MAX_INSN);
50665 gcc_assert (dispatch_window_list->window_size
50666 + dispatch_window_list1->window_size <= 48);
50667 init_window (1);
50668 }
50669 init_window (0);
50670 }
50671
50672 /* Allocates a new dispatch window and adds it to WINDOW_LIST.
50673 WINDOW_NUM is either 0 or 1. A maximum of two windows are generated
50674 for 48 bytes of instructions. Note that these windows are not dispatch
50675 windows whose sizes are DISPATCH_WINDOW_SIZE. */
50676
50677 static dispatch_windows *
50678 allocate_next_window (int window_num)
50679 {
50680 if (window_num == 0)
50681 {
50682 if (dispatch_window_list->next)
50683 init_window (1);
50684 init_window (0);
50685 return dispatch_window_list;
50686 }
50687
50688 dispatch_window_list->next = dispatch_window_list1;
50689 dispatch_window_list1->prev = dispatch_window_list;
50690
50691 return dispatch_window_list1;
50692 }
50693
50694 /* Compute number of immediate operands of an instruction. */
50695
50696 static void
50697 find_constant (rtx in_rtx, imm_info *imm_values)
50698 {
50699 if (INSN_P (in_rtx))
50700 in_rtx = PATTERN (in_rtx);
50701 subrtx_iterator::array_type array;
50702 FOR_EACH_SUBRTX (iter, array, in_rtx, ALL)
50703 if (const_rtx x = *iter)
50704 switch (GET_CODE (x))
50705 {
50706 case CONST:
50707 case SYMBOL_REF:
50708 case CONST_INT:
50709 (imm_values->imm)++;
50710 if (x86_64_immediate_operand (CONST_CAST_RTX (x), SImode))
50711 (imm_values->imm32)++;
50712 else
50713 (imm_values->imm64)++;
50714 break;
50715
50716 case CONST_DOUBLE:
50717 case CONST_WIDE_INT:
50718 (imm_values->imm)++;
50719 (imm_values->imm64)++;
50720 break;
50721
50722 case CODE_LABEL:
50723 if (LABEL_KIND (x) == LABEL_NORMAL)
50724 {
50725 (imm_values->imm)++;
50726 (imm_values->imm32)++;
50727 }
50728 break;
50729
50730 default:
50731 break;
50732 }
50733 }
50734
50735 /* Return total size of immediate operands of an instruction along with number
50736 of corresponding immediate-operands. It initializes its parameters to zero
50737 before calling FIND_CONSTANT.
50738 INSN is the input instruction. IMM is the total of immediates.
50739 IMM32 is the number of 32 bit immediates. IMM64 is the number of 64
50740 bit immediates. */
50741
50742 static int
50743 get_num_immediates (rtx_insn *insn, int *imm, int *imm32, int *imm64)
50744 {
50745 imm_info imm_values = {0, 0, 0};
50746
50747 find_constant (insn, &imm_values);
50748 *imm = imm_values.imm;
50749 *imm32 = imm_values.imm32;
50750 *imm64 = imm_values.imm64;
50751 return imm_values.imm32 * 4 + imm_values.imm64 * 8;
50752 }
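
/* Worked example (illustrative): an insn whose pattern contains one
   constant that satisfies x86_64_immediate_operand in SImode and one
   that does not yields *IMM == 2, *IMM32 == 1, *IMM64 == 1 and a
   return value of 1 * 4 + 1 * 8 == 12 bytes.  */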
50753
50754 /* This function indicates whether any operand of an instruction is an
50755 immediate. */
50756
50757 static bool
50758 has_immediate (rtx_insn *insn)
50759 {
50760 int num_imm_operand;
50761 int num_imm32_operand;
50762 int num_imm64_operand;
50763
50764 if (insn)
50765 return get_num_immediates (insn, &num_imm_operand, &num_imm32_operand,
50766 &num_imm64_operand);
50767 return false;
50768 }
50769
50770 /* Return single or double path for instructions. */
50771
50772 static enum insn_path
50773 get_insn_path (rtx_insn *insn)
50774 {
50775 enum attr_amdfam10_decode path = get_attr_amdfam10_decode (insn);
50776
50777 if ((int)path == 0)
50778 return path_single;
50779
50780 if ((int)path == 1)
50781 return path_double;
50782
50783 return path_multi;
50784 }
50785
50786 /* Return insn dispatch group. */
50787
50788 static enum dispatch_group
50789 get_insn_group (rtx_insn *insn)
50790 {
50791 enum dispatch_group group = get_mem_group (insn);
50792 if (group)
50793 return group;
50794
50795 if (is_branch (insn))
50796 return disp_branch;
50797
50798 if (is_cmp (insn))
50799 return disp_cmp;
50800
50801 if (has_immediate (insn))
50802 return disp_imm;
50803
50804 if (is_prefetch (insn))
50805 return disp_prefetch;
50806
50807 return disp_no_group;
50808 }
50809
50810 /* Count number of GROUP restricted instructions in a dispatch
50811 window WINDOW_LIST. */
50812
50813 static int
50814 count_num_restricted (rtx_insn *insn, dispatch_windows *window_list)
50815 {
50816 enum dispatch_group group = get_insn_group (insn);
50817 int imm_size;
50818 int num_imm_operand;
50819 int num_imm32_operand;
50820 int num_imm64_operand;
50821
50822 if (group == disp_no_group)
50823 return 0;
50824
50825 if (group == disp_imm)
50826 {
50827 imm_size = get_num_immediates (insn, &num_imm_operand, &num_imm32_operand,
50828 &num_imm64_operand);
50829 if (window_list->imm_size + imm_size > MAX_IMM_SIZE
50830 || num_imm_operand + window_list->num_imm > MAX_IMM
50831 || (num_imm32_operand > 0
50832 && (window_list->num_imm_32 + num_imm32_operand > MAX_IMM_32
50833 || window_list->num_imm_64 * 2 + num_imm32_operand > MAX_IMM_32))
50834 || (num_imm64_operand > 0
50835 && (window_list->num_imm_64 + num_imm64_operand > MAX_IMM_64
50836 || window_list->num_imm_32 + num_imm64_operand * 2 > MAX_IMM_32))
50837 || (window_list->imm_size + imm_size == MAX_IMM_SIZE
50838 && num_imm64_operand > 0
50839 && ((window_list->num_imm_64 > 0
50840 && window_list->num_insn >= 2)
50841 || window_list->num_insn >= 3)))
50842 return BIG;
50843
50844 return 1;
50845 }
50846
50847 if ((group == disp_load_store
50848 && (window_list->num_loads >= MAX_LOAD
50849 || window_list->num_stores >= MAX_STORE))
50850 || ((group == disp_load
50851 || group == disp_prefetch)
50852 && window_list->num_loads >= MAX_LOAD)
50853 || (group == disp_store
50854 && window_list->num_stores >= MAX_STORE))
50855 return BIG;
50856
50857 return 1;
50858 }
50859
50860 /* This function returns true if insn satisfies dispatch rules on the
50861 last window scheduled. */
50862
50863 static bool
50864 fits_dispatch_window (rtx_insn *insn)
50865 {
50866 dispatch_windows *window_list = dispatch_window_list;
50867 dispatch_windows *window_list_next = dispatch_window_list->next;
50868 unsigned int num_restrict;
50869 enum dispatch_group group = get_insn_group (insn);
50870 enum insn_path path = get_insn_path (insn);
50871 int sum;
50872
50873 /* Make disp_cmp and disp_jcc get scheduled at the latest. These
50874 instructions should be given the lowest priority in the
50875 scheduling process in Haifa scheduler to make sure they will be
50876 scheduled in the same dispatch window as the reference to them. */
50877 if (group == disp_jcc || group == disp_cmp)
50878 return false;
50879
50880 /* Check nonrestricted. */
50881 if (group == disp_no_group || group == disp_branch)
50882 return true;
50883
50884 /* Get last dispatch window. */
50885 if (window_list_next)
50886 window_list = window_list_next;
50887
50888 if (window_list->window_num == 1)
50889 {
50890 sum = window_list->prev->window_size + window_list->window_size;
50891
50892 if (sum == 32
50893 || (min_insn_size (insn) + sum) >= 48)
50894 /* Window 1 is full. Go for next window. */
50895 return true;
50896 }
50897
50898 num_restrict = count_num_restricted (insn, window_list);
50899
50900 if (num_restrict > num_allowable_groups[group])
50901 return false;
50902
50903 /* See if it fits in the first window. */
50904 if (window_list->window_num == 0)
50905 {
50906 /* The first window should have only single and double path
50907 uops. */
50908 if (path == path_double
50909 && (window_list->num_uops + 2) > MAX_INSN)
50910 return false;
50911 else if (path != path_single)
50912 return false;
50913 }
50914 return true;
50915 }
50916
50917 /* Add an instruction INSN with NUM_UOPS micro-operations to the
50918 dispatch window WINDOW_LIST. */
50919
50920 static void
50921 add_insn_window (rtx_insn *insn, dispatch_windows *window_list, int num_uops)
50922 {
50923 int byte_len = min_insn_size (insn);
50924 int num_insn = window_list->num_insn;
50925 int imm_size;
50926 sched_insn_info *window = window_list->window;
50927 enum dispatch_group group = get_insn_group (insn);
50928 enum insn_path path = get_insn_path (insn);
50929 int num_imm_operand;
50930 int num_imm32_operand;
50931 int num_imm64_operand;
50932
50933 if (!window_list->violation && group != disp_cmp
50934 && !fits_dispatch_window (insn))
50935 window_list->violation = true;
50936
50937 imm_size = get_num_immediates (insn, &num_imm_operand, &num_imm32_operand,
50938 &num_imm64_operand);
50939
50940 /* Initialize window with new instruction. */
50941 window[num_insn].insn = insn;
50942 window[num_insn].byte_len = byte_len;
50943 window[num_insn].group = group;
50944 window[num_insn].path = path;
50945 window[num_insn].imm_bytes = imm_size;
50946
50947 window_list->window_size += byte_len;
50948 window_list->num_insn = num_insn + 1;
50949 window_list->num_uops = window_list->num_uops + num_uops;
50950 window_list->imm_size += imm_size;
50951 window_list->num_imm += num_imm_operand;
50952 window_list->num_imm_32 += num_imm32_operand;
50953 window_list->num_imm_64 += num_imm64_operand;
50954
50955 if (group == disp_store)
50956 window_list->num_stores += 1;
50957 else if (group == disp_load
50958 || group == disp_prefetch)
50959 window_list->num_loads += 1;
50960 else if (group == disp_load_store)
50961 {
50962 window_list->num_stores += 1;
50963 window_list->num_loads += 1;
50964 }
50965 }
50966
50967 /* Adds a scheduled instruction, INSN, to the current dispatch window.
50968 If the total bytes of instructions or the number of instructions in
50969 the window exceed the allowable limits, it allocates a new window. */
50970
50971 static void
50972 add_to_dispatch_window (rtx_insn *insn)
50973 {
50974 int byte_len;
50975 dispatch_windows *window_list;
50976 dispatch_windows *next_list;
50977 dispatch_windows *window0_list;
50978 enum insn_path path;
50979 enum dispatch_group insn_group;
50980 bool insn_fits;
50981 int num_insn;
50982 int num_uops;
50983 int window_num;
50984 int insn_num_uops;
50985 int sum;
50986
50987 if (INSN_CODE (insn) < 0)
50988 return;
50989
50990 byte_len = min_insn_size (insn);
50991 window_list = dispatch_window_list;
50992 next_list = window_list->next;
50993 path = get_insn_path (insn);
50994 insn_group = get_insn_group (insn);
50995
50996 /* Get the last dispatch window. */
50997 if (next_list)
50998 window_list = dispatch_window_list->next;
50999
51000 if (path == path_single)
51001 insn_num_uops = 1;
51002 else if (path == path_double)
51003 insn_num_uops = 2;
51004 else
51005 insn_num_uops = (int) path;
51006
51007 /* If the current window is full, get a new window.
51008 Window number zero is full if MAX_INSN uops are scheduled in it.
51009 Window number one is full if window zero's bytes plus window
51010 one's bytes total 32, or if adding the bytes of the new instruction
51011 makes the total greater than 48, or if it already has MAX_INSN
51012 instructions in it. */
51013 num_insn = window_list->num_insn;
51014 num_uops = window_list->num_uops;
51015 window_num = window_list->window_num;
51016 insn_fits = fits_dispatch_window (insn);
51017
51018 if (num_insn >= MAX_INSN
51019 || num_uops + insn_num_uops > MAX_INSN
51020 || !(insn_fits))
51021 {
51022 window_num = ~window_num & 1;
51023 window_list = allocate_next_window (window_num);
51024 }
51025
51026 if (window_num == 0)
51027 {
51028 add_insn_window (insn, window_list, insn_num_uops);
51029 if (window_list->num_insn >= MAX_INSN
51030 && insn_group == disp_branch)
51031 {
51032 process_end_window ();
51033 return;
51034 }
51035 }
51036 else if (window_num == 1)
51037 {
51038 window0_list = window_list->prev;
51039 sum = window0_list->window_size + window_list->window_size;
51040 if (sum == 32
51041 || (byte_len + sum) >= 48)
51042 {
51043 process_end_window ();
51044 window_list = dispatch_window_list;
51045 }
51046
51047 add_insn_window (insn, window_list, insn_num_uops);
51048 }
51049 else
51050 gcc_unreachable ();
51051
51052 if (is_end_basic_block (insn_group))
51053 {
51054 /* End of basic block is reached; do end-of-basic-block processing. */
51055 process_end_window ();
51056 return;
51057 }
51058 }
51059
51060 /* Print the dispatch window, WINDOW_NUM, to FILE. */
51061
51062 DEBUG_FUNCTION static void
51063 debug_dispatch_window_file (FILE *file, int window_num)
51064 {
51065 dispatch_windows *list;
51066 int i;
51067
51068 if (window_num == 0)
51069 list = dispatch_window_list;
51070 else
51071 list = dispatch_window_list1;
51072
51073 fprintf (file, "Window #%d:\n", list->window_num);
51074 fprintf (file, " num_insn = %d, num_uops = %d, window_size = %d\n",
51075 list->num_insn, list->num_uops, list->window_size);
51076 fprintf (file, " num_imm = %d, num_imm_32 = %d, num_imm_64 = %d, imm_size = %d\n",
51077 list->num_imm, list->num_imm_32, list->num_imm_64, list->imm_size);
51078
51079 fprintf (file, " num_loads = %d, num_stores = %d\n", list->num_loads,
51080 list->num_stores);
51081 fprintf (file, " insn info:\n");
51082
51083 for (i = 0; i < MAX_INSN; i++)
51084 {
51085 if (!list->window[i].insn)
51086 break;
51087 fprintf (file, " group[%d] = %s, insn[%d] = %p, path[%d] = %d byte_len[%d] = %d, imm_bytes[%d] = %d\n",
51088 i, group_name[list->window[i].group],
51089 i, (void *)list->window[i].insn,
51090 i, list->window[i].path,
51091 i, list->window[i].byte_len,
51092 i, list->window[i].imm_bytes);
51093 }
51094 }
51095
51096 /* Print to stdout a dispatch window. */
51097
51098 DEBUG_FUNCTION void
51099 debug_dispatch_window (int window_num)
51100 {
51101 debug_dispatch_window_file (stdout, window_num);
51102 }
51103
51104 /* Print INSN dispatch information to FILE. */
51105
51106 DEBUG_FUNCTION static void
51107 debug_insn_dispatch_info_file (FILE *file, rtx_insn *insn)
51108 {
51109 int byte_len;
51110 enum insn_path path;
51111 enum dispatch_group group;
51112 int imm_size;
51113 int num_imm_operand;
51114 int num_imm32_operand;
51115 int num_imm64_operand;
51116
51117 if (INSN_CODE (insn) < 0)
51118 return;
51119
51120 byte_len = min_insn_size (insn);
51121 path = get_insn_path (insn);
51122 group = get_insn_group (insn);
51123 imm_size = get_num_immediates (insn, &num_imm_operand, &num_imm32_operand,
51124 &num_imm64_operand);
51125
51126 fprintf (file, " insn info:\n");
51127 fprintf (file, " group = %s, path = %d, byte_len = %d\n",
51128 group_name[group], path, byte_len);
51129 fprintf (file, " num_imm = %d, num_imm_32 = %d, num_imm_64 = %d, imm_size = %d\n",
51130 num_imm_operand, num_imm32_operand, num_imm64_operand, imm_size);
51131 }
51132
51133 /* Print to STDOUT the status of the ready list with respect to
51134 dispatch windows. */
51135
51136 DEBUG_FUNCTION void
51137 debug_ready_dispatch (void)
51138 {
51139 int i;
51140 int no_ready = number_in_ready ();
51141
51142 fprintf (stdout, "Number of ready: %d\n", no_ready);
51143
51144 for (i = 0; i < no_ready; i++)
51145 debug_insn_dispatch_info_file (stdout, get_ready_element (i));
51146 }
51147
51148 /* This routine is the driver of the dispatch scheduler. */
51149
51150 static void
51151 do_dispatch (rtx_insn *insn, int mode)
51152 {
51153 if (mode == DISPATCH_INIT)
51154 init_dispatch_sched ();
51155 else if (mode == ADD_TO_DISPATCH_WINDOW)
51156 add_to_dispatch_window (insn);
51157 }
51158
51159 /* Return TRUE if Dispatch Scheduling is supported. */
51160
51161 static bool
51162 has_dispatch (rtx_insn *insn, int action)
51163 {
51164 if ((TARGET_BDVER1 || TARGET_BDVER2 || TARGET_BDVER3
51165 || TARGET_BDVER4 || TARGET_ZNVER1) && flag_dispatch_scheduler)
51166 switch (action)
51167 {
51168 default:
51169 return false;
51170
51171 case IS_DISPATCH_ON:
51172 return true;
51173
51174 case IS_CMP:
51175 return is_cmp (insn);
51176
51177 case DISPATCH_VIOLATION:
51178 return dispatch_violation ();
51179
51180 case FITS_DISPATCH_WINDOW:
51181 return fits_dispatch_window (insn);
51182 }
51183
51184 return false;
51185 }
51186
51187 /* Implementation of reassociation_width target hook used by
51188 reassoc phase to identify parallelism level in reassociated
51189 tree. Statements tree_code is passed in OPC. Arguments type
51190 is passed in MODE.
51191
51192 Currently parallel reassociation is enabled for Atom
51193 processors only and we set reassociation width to be 2
51194 because Atom may issue up to 2 instructions per cycle.
51195
51196 Return value should be fixed if parallel reassociation is
51197 enabled for other processors. */
51198
51199 static int
51200 ix86_reassociation_width (unsigned int, machine_mode mode)
51201 {
51202 /* Vector part. */
51203 if (VECTOR_MODE_P (mode))
51204 {
51205 if (TARGET_VECTOR_PARALLEL_EXECUTION)
51206 return 2;
51207 else
51208 return 1;
51209 }
51210
51211 /* Scalar part. */
51212 if (INTEGRAL_MODE_P (mode) && TARGET_REASSOC_INT_TO_PARALLEL)
51213 return 2;
51214 else if (FLOAT_MODE_P (mode) && TARGET_REASSOC_FP_TO_PARALLEL)
51215 return ((TARGET_64BIT && ix86_tune == PROCESSOR_HASWELL)? 4 : 2);
51216 else
51217 return 1;
51218 }
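
/* Illustrative results of the hook above: a vector statement on a target
   with TARGET_VECTOR_PARALLEL_EXECUTION gets width 2; a DFmode statement
   with TARGET_REASSOC_FP_TO_PARALLEL gets width 4 when compiling 64-bit
   code tuned for Haswell and width 2 otherwise; everything else gets
   width 1, i.e. no parallel reassociation.  */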
51219
51220 /* ??? No autovectorization into MMX or 3DNOW until we can reliably
51221 place emms and femms instructions. */
51222
51223 static machine_mode
51224 ix86_preferred_simd_mode (machine_mode mode)
51225 {
51226 if (!TARGET_SSE)
51227 return word_mode;
51228
51229 switch (mode)
51230 {
51231 case QImode:
51232 return TARGET_AVX512BW ? V64QImode :
51233 (TARGET_AVX && !TARGET_PREFER_AVX128) ? V32QImode : V16QImode;
51234 case HImode:
51235 return TARGET_AVX512BW ? V32HImode :
51236 (TARGET_AVX && !TARGET_PREFER_AVX128) ? V16HImode : V8HImode;
51237 case SImode:
51238 return TARGET_AVX512F ? V16SImode :
51239 (TARGET_AVX && !TARGET_PREFER_AVX128) ? V8SImode : V4SImode;
51240 case DImode:
51241 return TARGET_AVX512F ? V8DImode :
51242 (TARGET_AVX && !TARGET_PREFER_AVX128) ? V4DImode : V2DImode;
51243
51244 case SFmode:
51245 if (TARGET_AVX512F)
51246 return V16SFmode;
51247 else if (TARGET_AVX && !TARGET_PREFER_AVX128)
51248 return V8SFmode;
51249 else
51250 return V4SFmode;
51251
51252 case DFmode:
51253 if (TARGET_AVX512F)
51254 return V8DFmode;
51255 else if (TARGET_AVX && !TARGET_PREFER_AVX128)
51256 return V4DFmode;
51257 else if (TARGET_SSE2)
51258 return V2DFmode;
51259 /* FALLTHRU */
51260
51261 default:
51262 return word_mode;
51263 }
51264 }
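
/* Illustrative mapping of the switch above: with -mavx2 (and without
   -mprefer-avx128) SImode vectorizes as V8SImode, with plain -msse2
   DFmode vectorizes as V2DFmode, and without SSE the hook returns
   word_mode, i.e. no vectorization.  */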
51265
51266 /* If AVX is enabled then try vectorizing with both 256bit and 128bit
51267 vectors. If AVX512F is enabled then try vectorizing with 512bit,
51268 256bit and 128bit vectors. */
51269
51270 static unsigned int
51271 ix86_autovectorize_vector_sizes (void)
51272 {
51273 return TARGET_AVX512F ? 64 | 32 | 16 :
51274 (TARGET_AVX && !TARGET_PREFER_AVX128) ? 32 | 16 : 0;
51275 }
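
/* For example, with -mavx512f the returned mask is 64 | 32 | 16 == 0x70,
   so the vectorizer may retry with 512-, 256- and 128-bit vectors;
   without AVX the hook returns 0 and only the preferred SIMD mode is
   tried.  */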
51276
51277 /* Implementation of targetm.vectorize.get_mask_mode. */
51278
51279 static machine_mode
51280 ix86_get_mask_mode (unsigned nunits, unsigned vector_size)
51281 {
51282 unsigned elem_size = vector_size / nunits;
51283
51284 /* Scalar mask case. */
51285 if ((TARGET_AVX512F && vector_size == 64)
51286 || (TARGET_AVX512VL && (vector_size == 32 || vector_size == 16)))
51287 {
51288 if (elem_size == 4 || elem_size == 8 || TARGET_AVX512BW)
51289 return smallest_mode_for_size (nunits, MODE_INT);
51290 }
51291
51292 machine_mode elem_mode
51293 = smallest_mode_for_size (elem_size * BITS_PER_UNIT, MODE_INT);
51294
51295 gcc_assert (elem_size * nunits == vector_size);
51296
51297 return mode_for_vector (elem_mode, nunits);
51298 }
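
/* Illustrative results: for a 64-byte V16SF vector under AVX512F
   (NUNITS == 16, ELEM_SIZE == 4) the scalar mask mode HImode is
   returned; for a 16-byte V4SF vector without AVX512VL the fallback
   path builds a vector mask from SImode elements, i.e. V4SImode.  */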
51299
51300 \f
51301
51302 /* Return class of registers which could be used for pseudo of MODE
51303 and of class RCLASS for spilling instead of memory. Return NO_REGS
51304 if it is not possible or not profitable. */
51305
51306 /* Disabled due to PRs 70902, 71453, 71555, 71596 and 71657. */
51307
51308 static reg_class_t
51309 ix86_spill_class (reg_class_t rclass, machine_mode mode)
51310 {
51311 if (0 && TARGET_GENERAL_REGS_SSE_SPILL
51312 && TARGET_SSE2
51313 && TARGET_INTER_UNIT_MOVES_TO_VEC
51314 && TARGET_INTER_UNIT_MOVES_FROM_VEC
51315 && (mode == SImode || (TARGET_64BIT && mode == DImode))
51316 && INTEGER_CLASS_P (rclass))
51317 return ALL_SSE_REGS;
51318 return NO_REGS;
51319 }
51320
51321 /* Implement TARGET_MAX_NOCE_IFCVT_SEQ_COST. Like the default implementation,
51322 but returns a lower bound. */
51323
51324 static unsigned int
51325 ix86_max_noce_ifcvt_seq_cost (edge e)
51326 {
51327 bool predictable_p = predictable_edge_p (e);
51328
51329 enum compiler_param param
51330 = (predictable_p
51331 ? PARAM_MAX_RTL_IF_CONVERSION_PREDICTABLE_COST
51332 : PARAM_MAX_RTL_IF_CONVERSION_UNPREDICTABLE_COST);
51333
51334 /* If we have a parameter set, use that, otherwise take a guess using
51335 BRANCH_COST. */
51336 if (global_options_set.x_param_values[param])
51337 return PARAM_VALUE (param);
51338 else
51339 return BRANCH_COST (true, predictable_p) * COSTS_N_INSNS (2);
51340 }
51341
51342 /* Return true if SEQ is a good candidate as a replacement for the
51343 if-convertible sequence described in IF_INFO. */
51344
51345 static bool
51346 ix86_noce_conversion_profitable_p (rtx_insn *seq, struct noce_if_info *if_info)
51347 {
51348 if (TARGET_ONE_IF_CONV_INSN && if_info->speed_p)
51349 {
51350 int cmov_cnt = 0;
51351 /* Punt if SEQ contains more than one CMOV or FCMOV instruction.
51352 Maybe we should allow even more conditional moves as long as they
51353 are used far enough not to stall the CPU, or also consider
51354 IF_INFO->TEST_BB succ edge probabilities. */
51355 for (rtx_insn *insn = seq; insn; insn = NEXT_INSN (insn))
51356 {
51357 rtx set = single_set (insn);
51358 if (!set)
51359 continue;
51360 if (GET_CODE (SET_SRC (set)) != IF_THEN_ELSE)
51361 continue;
51362 rtx src = SET_SRC (set);
51363 enum machine_mode mode = GET_MODE (src);
51364 if (GET_MODE_CLASS (mode) != MODE_INT
51365 && GET_MODE_CLASS (mode) != MODE_FLOAT)
51366 continue;
51367 if ((!REG_P (XEXP (src, 1)) && !MEM_P (XEXP (src, 1)))
51368 || (!REG_P (XEXP (src, 2)) && !MEM_P (XEXP (src, 2))))
51369 continue;
51370 /* insn is CMOV or FCMOV. */
51371 if (++cmov_cnt > 1)
51372 return false;
51373 }
51374 }
51375 return default_noce_conversion_profitable_p (seq, if_info);
51376 }
51377
51378 /* Implement targetm.vectorize.init_cost. */
51379
51380 static void *
51381 ix86_init_cost (struct loop *)
51382 {
51383 unsigned *cost = XNEWVEC (unsigned, 3);
51384 cost[vect_prologue] = cost[vect_body] = cost[vect_epilogue] = 0;
51385 return cost;
51386 }
51387
51388 /* Implement targetm.vectorize.add_stmt_cost. */
51389
51390 static unsigned
51391 ix86_add_stmt_cost (void *data, int count, enum vect_cost_for_stmt kind,
51392 struct _stmt_vec_info *stmt_info, int misalign,
51393 enum vect_cost_model_location where)
51394 {
51395 unsigned *cost = (unsigned *) data;
51396 unsigned retval = 0;
51397
51398 tree vectype = stmt_info ? stmt_vectype (stmt_info) : NULL_TREE;
51399 int stmt_cost = ix86_builtin_vectorization_cost (kind, vectype, misalign);
51400
51401 /* Penalize DFmode vector operations for Bonnell. */
51402 if (TARGET_BONNELL && kind == vector_stmt
51403 && vectype && GET_MODE_INNER (TYPE_MODE (vectype)) == DFmode)
51404 stmt_cost *= 5; /* FIXME: The value here is arbitrary. */
51405
51406 /* Statements in an inner loop relative to the loop being
51407 vectorized are weighted more heavily. The value here is
51408 arbitrary and could potentially be improved with analysis. */
51409 if (where == vect_body && stmt_info && stmt_in_inner_loop_p (stmt_info))
51410 count *= 50; /* FIXME. */
51411
51412 retval = (unsigned) (count * stmt_cost);
51413
51414 /* We need to multiply all vector stmt costs by 1.7 (estimated cost)
51415 for Silvermont, as it has an out-of-order integer pipeline and can execute
51416 2 scalar instructions per tick, but has an in-order SIMD pipeline. */
51417 if ((TARGET_SILVERMONT || TARGET_INTEL)
51418 && stmt_info && stmt_info->stmt)
51419 {
51420 tree lhs_op = gimple_get_lhs (stmt_info->stmt);
51421 if (lhs_op && TREE_CODE (TREE_TYPE (lhs_op)) == INTEGER_TYPE)
51422 retval = (retval * 17) / 10;
51423 }
51424
51425 cost[where] += retval;
51426
51427 return retval;
51428 }
51429
51430 /* Implement targetm.vectorize.finish_cost. */
51431
51432 static void
51433 ix86_finish_cost (void *data, unsigned *prologue_cost,
51434 unsigned *body_cost, unsigned *epilogue_cost)
51435 {
51436 unsigned *cost = (unsigned *) data;
51437 *prologue_cost = cost[vect_prologue];
51438 *body_cost = cost[vect_body];
51439 *epilogue_cost = cost[vect_epilogue];
51440 }
51441
51442 /* Implement targetm.vectorize.destroy_cost_data. */
51443
51444 static void
51445 ix86_destroy_cost_data (void *data)
51446 {
51447 free (data);
51448 }
51449
51450 /* Validate target specific memory model bits in VAL. */
51451
51452 static unsigned HOST_WIDE_INT
51453 ix86_memmodel_check (unsigned HOST_WIDE_INT val)
51454 {
51455 enum memmodel model = memmodel_from_int (val);
51456 bool strong;
51457
51458 if (val & ~(unsigned HOST_WIDE_INT)(IX86_HLE_ACQUIRE|IX86_HLE_RELEASE
51459 |MEMMODEL_MASK)
51460 || ((val & IX86_HLE_ACQUIRE) && (val & IX86_HLE_RELEASE)))
51461 {
51462 warning (OPT_Winvalid_memory_model,
51463 "Unknown architecture specific memory model");
51464 return MEMMODEL_SEQ_CST;
51465 }
51466 strong = (is_mm_acq_rel (model) || is_mm_seq_cst (model));
51467 if (val & IX86_HLE_ACQUIRE && !(is_mm_acquire (model) || strong))
51468 {
51469 warning (OPT_Winvalid_memory_model,
51470 "HLE_ACQUIRE not used with ACQUIRE or stronger memory model");
51471 return MEMMODEL_SEQ_CST | IX86_HLE_ACQUIRE;
51472 }
51473 if (val & IX86_HLE_RELEASE && !(is_mm_release (model) || strong))
51474 {
51475 warning (OPT_Winvalid_memory_model,
51476 "HLE_RELEASE not used with RELEASE or stronger memory model");
51477 return MEMMODEL_SEQ_CST | IX86_HLE_RELEASE;
51478 }
51479 return val;
51480 }
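
/* Worked example (illustrative): a value combining an acquire model with
   IX86_HLE_ACQUIRE is returned unchanged, while a relaxed model combined
   with IX86_HLE_ACQUIRE triggers the second warning above and is
   tightened to MEMMODEL_SEQ_CST | IX86_HLE_ACQUIRE.  */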
51481
51482 /* Set CLONEI->vecsize_mangle, CLONEI->mask_mode, CLONEI->vecsize_int,
51483 CLONEI->vecsize_float and if CLONEI->simdlen is 0, also
51484 CLONEI->simdlen. Return 0 if SIMD clones shouldn't be emitted,
51485 or number of vecsize_mangle variants that should be emitted. */
51486
51487 static int
51488 ix86_simd_clone_compute_vecsize_and_simdlen (struct cgraph_node *node,
51489 struct cgraph_simd_clone *clonei,
51490 tree base_type, int num)
51491 {
51492 int ret = 1;
51493
51494 if (clonei->simdlen
51495 && (clonei->simdlen < 2
51496 || clonei->simdlen > 1024
51497 || (clonei->simdlen & (clonei->simdlen - 1)) != 0))
51498 {
51499 warning_at (DECL_SOURCE_LOCATION (node->decl), 0,
51500 "unsupported simdlen %d", clonei->simdlen);
51501 return 0;
51502 }
51503
51504 tree ret_type = TREE_TYPE (TREE_TYPE (node->decl));
51505 if (TREE_CODE (ret_type) != VOID_TYPE)
51506 switch (TYPE_MODE (ret_type))
51507 {
51508 case QImode:
51509 case HImode:
51510 case SImode:
51511 case DImode:
51512 case SFmode:
51513 case DFmode:
51514 /* case SCmode: */
51515 /* case DCmode: */
51516 break;
51517 default:
51518 warning_at (DECL_SOURCE_LOCATION (node->decl), 0,
51519 "unsupported return type %qT for simd\n", ret_type);
51520 return 0;
51521 }
51522
51523 tree t;
51524 int i;
51525
51526 for (t = DECL_ARGUMENTS (node->decl), i = 0; t; t = DECL_CHAIN (t), i++)
51527 /* FIXME: Shouldn't we allow such arguments if they are uniform? */
51528 switch (TYPE_MODE (TREE_TYPE (t)))
51529 {
51530 case QImode:
51531 case HImode:
51532 case SImode:
51533 case DImode:
51534 case SFmode:
51535 case DFmode:
51536 /* case SCmode: */
51537 /* case DCmode: */
51538 break;
51539 default:
51540 warning_at (DECL_SOURCE_LOCATION (node->decl), 0,
51541 "unsupported argument type %qT for simd\n", TREE_TYPE (t));
51542 return 0;
51543 }
51544
51545 if (clonei->cilk_elemental)
51546 {
51547 /* Parse the processor clause here. If not present, default to 'b'. */
51548 clonei->vecsize_mangle = 'b';
51549 }
51550 else if (!TREE_PUBLIC (node->decl))
51551 {
51552 /* If the function isn't exported, we can pick up just one ISA
51553 for the clones. */
51554 if (TARGET_AVX512F)
51555 clonei->vecsize_mangle = 'e';
51556 else if (TARGET_AVX2)
51557 clonei->vecsize_mangle = 'd';
51558 else if (TARGET_AVX)
51559 clonei->vecsize_mangle = 'c';
51560 else
51561 clonei->vecsize_mangle = 'b';
51562 ret = 1;
51563 }
51564 else
51565 {
51566 clonei->vecsize_mangle = "bcde"[num];
51567 ret = 4;
51568 }
51569 clonei->mask_mode = VOIDmode;
51570 switch (clonei->vecsize_mangle)
51571 {
51572 case 'b':
51573 clonei->vecsize_int = 128;
51574 clonei->vecsize_float = 128;
51575 break;
51576 case 'c':
51577 clonei->vecsize_int = 128;
51578 clonei->vecsize_float = 256;
51579 break;
51580 case 'd':
51581 clonei->vecsize_int = 256;
51582 clonei->vecsize_float = 256;
51583 break;
51584 case 'e':
51585 clonei->vecsize_int = 512;
51586 clonei->vecsize_float = 512;
51587 if (TYPE_MODE (base_type) == QImode)
51588 clonei->mask_mode = DImode;
51589 else
51590 clonei->mask_mode = SImode;
51591 break;
51592 }
51593 if (clonei->simdlen == 0)
51594 {
51595 if (SCALAR_INT_MODE_P (TYPE_MODE (base_type)))
51596 clonei->simdlen = clonei->vecsize_int;
51597 else
51598 clonei->simdlen = clonei->vecsize_float;
51599 clonei->simdlen /= GET_MODE_BITSIZE (TYPE_MODE (base_type));
51600 }
51601 else if (clonei->simdlen > 16)
51602 {
51603 /* For compatibility with ICC, use the same upper bounds
51604 for simdlen. In particular, for CTYPE below, use the return type,
51605 unless the function returns void, in which case use the characteristic
51606 type. If it is possible for given SIMDLEN to pass CTYPE value
51607 in registers (8 [XYZ]MM* regs for 32-bit code, 16 [XYZ]MM* regs
51608 for 64-bit code), accept that SIMDLEN, otherwise warn and don't
51609 emit corresponding clone. */
51610 tree ctype = ret_type;
51611 if (TREE_CODE (ret_type) == VOID_TYPE)
51612 ctype = base_type;
51613 int cnt = GET_MODE_BITSIZE (TYPE_MODE (ctype)) * clonei->simdlen;
51614 if (SCALAR_INT_MODE_P (TYPE_MODE (ctype)))
51615 cnt /= clonei->vecsize_int;
51616 else
51617 cnt /= clonei->vecsize_float;
51618 if (cnt > (TARGET_64BIT ? 16 : 8))
51619 {
51620 warning_at (DECL_SOURCE_LOCATION (node->decl), 0,
51621 "unsupported simdlen %d", clonei->simdlen);
51622 return 0;
51623 }
51624 }
51625 return ret;
51626 }
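
/* Worked example (illustrative): an exported (TREE_PUBLIC) simd function
   with a DFmode characteristic type and no explicit simdlen gets the four
   clones 'b', 'c', 'd' and 'e'; DFmode is not a scalar integer mode, so
   each simdlen is vecsize_float / 64, i.e. 2, 4, 4 and 8 respectively.  */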
51627
51628 /* Add target attribute to SIMD clone NODE if needed. */
51629
51630 static void
51631 ix86_simd_clone_adjust (struct cgraph_node *node)
51632 {
51633 const char *str = NULL;
51634 gcc_assert (node->decl == cfun->decl);
51635 switch (node->simdclone->vecsize_mangle)
51636 {
51637 case 'b':
51638 if (!TARGET_SSE2)
51639 str = "sse2";
51640 break;
51641 case 'c':
51642 if (!TARGET_AVX)
51643 str = "avx";
51644 break;
51645 case 'd':
51646 if (!TARGET_AVX2)
51647 str = "avx2";
51648 break;
51649 case 'e':
51650 if (!TARGET_AVX512F)
51651 str = "avx512f";
51652 break;
51653 default:
51654 gcc_unreachable ();
51655 }
51656 if (str == NULL)
51657 return;
51658 push_cfun (NULL);
51659 tree args = build_tree_list (NULL_TREE, build_string (strlen (str), str));
51660 bool ok = ix86_valid_target_attribute_p (node->decl, NULL, args, 0);
51661 gcc_assert (ok);
51662 pop_cfun ();
51663 ix86_reset_previous_fndecl ();
51664 ix86_set_current_function (node->decl);
51665 }
51666
51667 /* If SIMD clone NODE can't be used in a vectorized loop
51668 in current function, return -1, otherwise return a badness of using it
51669 (0 if it is most desirable from vecsize_mangle point of view, 1
51670 slightly less desirable, etc.). */
51671
51672 static int
51673 ix86_simd_clone_usable (struct cgraph_node *node)
51674 {
51675 switch (node->simdclone->vecsize_mangle)
51676 {
51677 case 'b':
51678 if (!TARGET_SSE2)
51679 return -1;
51680 if (!TARGET_AVX)
51681 return 0;
51682 return TARGET_AVX2 ? 2 : 1;
51683 case 'c':
51684 if (!TARGET_AVX)
51685 return -1;
51686 return TARGET_AVX2 ? 1 : 0;
51687 case 'd':
51688 if (!TARGET_AVX2)
51689 return -1;
51690 return 0;
51691 case 'e':
51692 if (!TARGET_AVX512F)
51693 return -1;
51694 return 0;
51695 default:
51696 gcc_unreachable ();
51697 }
51698 }
51699
51700 /* This function adjusts the unroll factor based on
51701 the hardware capabilities. For example, bdver3 has
51702 a loop buffer which makes unrolling of smaller
51703 loops less important. This function decides the
51704 unroll factor using the number of memory references
51705 (the value 32 is used) as a heuristic. */
51706
51707 static unsigned
51708 ix86_loop_unroll_adjust (unsigned nunroll, struct loop *loop)
51709 {
51710 basic_block *bbs;
51711 rtx_insn *insn;
51712 unsigned i;
51713 unsigned mem_count = 0;
51714
51715 if (!TARGET_ADJUST_UNROLL)
51716 return nunroll;
51717
51718 /* Count the number of memory references within the loop body.
51719 This value determines the unrolling factor for bdver3 and bdver4
51720 architectures. */
51721 subrtx_iterator::array_type array;
51722 bbs = get_loop_body (loop);
51723 for (i = 0; i < loop->num_nodes; i++)
51724 FOR_BB_INSNS (bbs[i], insn)
51725 if (NONDEBUG_INSN_P (insn))
51726 FOR_EACH_SUBRTX (iter, array, PATTERN (insn), NONCONST)
51727 if (const_rtx x = *iter)
51728 if (MEM_P (x))
51729 {
51730 machine_mode mode = GET_MODE (x);
51731 unsigned int n_words = GET_MODE_SIZE (mode) / UNITS_PER_WORD;
51732 if (n_words > 4)
51733 mem_count += 2;
51734 else
51735 mem_count += 1;
51736 }
51737 free (bbs);
51738
51739 if (mem_count && mem_count <= 32)
51740 return 32 / mem_count;
51741
51742 return nunroll;
51743 }
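
/* Worked example (illustrative): with TARGET_ADJUST_UNROLL, a loop body
   containing 8 counted memory references is unrolled 32 / 8 == 4 times,
   while a body with more than 32 references keeps the NUNROLL value
   computed by the generic code.  */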
51744
51745
51746 /* Implement TARGET_FLOAT_EXCEPTIONS_ROUNDING_SUPPORTED_P. */
51747
51748 static bool
51749 ix86_float_exceptions_rounding_supported_p (void)
51750 {
51751 /* For x87 floating point with standard excess precision handling,
51752 there is no adddf3 pattern (since x87 floating point only has
51753 XFmode operations) so the default hook implementation gets this
51754 wrong. */
51755 return TARGET_80387 || TARGET_SSE_MATH;
51756 }
51757
51758 /* Implement TARGET_ATOMIC_ASSIGN_EXPAND_FENV. */
51759
51760 static void
51761 ix86_atomic_assign_expand_fenv (tree *hold, tree *clear, tree *update)
51762 {
51763 if (!TARGET_80387 && !TARGET_SSE_MATH)
51764 return;
51765 tree exceptions_var = create_tmp_var_raw (integer_type_node);
51766 if (TARGET_80387)
51767 {
51768 tree fenv_index_type = build_index_type (size_int (6));
51769 tree fenv_type = build_array_type (unsigned_type_node, fenv_index_type);
51770 tree fenv_var = create_tmp_var_raw (fenv_type);
51771 TREE_ADDRESSABLE (fenv_var) = 1;
51772 tree fenv_ptr = build_pointer_type (fenv_type);
51773 tree fenv_addr = build1 (ADDR_EXPR, fenv_ptr, fenv_var);
51774 fenv_addr = fold_convert (ptr_type_node, fenv_addr);
51775 tree fnstenv = ix86_builtins[IX86_BUILTIN_FNSTENV];
51776 tree fldenv = ix86_builtins[IX86_BUILTIN_FLDENV];
51777 tree fnstsw = ix86_builtins[IX86_BUILTIN_FNSTSW];
51778 tree fnclex = ix86_builtins[IX86_BUILTIN_FNCLEX];
51779 tree hold_fnstenv = build_call_expr (fnstenv, 1, fenv_addr);
51780 tree hold_fnclex = build_call_expr (fnclex, 0);
51781 fenv_var = build4 (TARGET_EXPR, fenv_type, fenv_var, hold_fnstenv,
51782 NULL_TREE, NULL_TREE);
51783 *hold = build2 (COMPOUND_EXPR, void_type_node, fenv_var,
51784 hold_fnclex);
51785 *clear = build_call_expr (fnclex, 0);
51786 tree sw_var = create_tmp_var_raw (short_unsigned_type_node);
51787 tree fnstsw_call = build_call_expr (fnstsw, 0);
51788 tree sw_mod = build2 (MODIFY_EXPR, short_unsigned_type_node,
51789 sw_var, fnstsw_call);
51790 tree exceptions_x87 = fold_convert (integer_type_node, sw_var);
51791 tree update_mod = build2 (MODIFY_EXPR, integer_type_node,
51792 exceptions_var, exceptions_x87);
51793 *update = build2 (COMPOUND_EXPR, integer_type_node,
51794 sw_mod, update_mod);
51795 tree update_fldenv = build_call_expr (fldenv, 1, fenv_addr);
51796 *update = build2 (COMPOUND_EXPR, void_type_node, *update, update_fldenv);
51797 }
51798 if (TARGET_SSE_MATH)
51799 {
51800 tree mxcsr_orig_var = create_tmp_var_raw (unsigned_type_node);
51801 tree mxcsr_mod_var = create_tmp_var_raw (unsigned_type_node);
51802 tree stmxcsr = ix86_builtins[IX86_BUILTIN_STMXCSR];
51803 tree ldmxcsr = ix86_builtins[IX86_BUILTIN_LDMXCSR];
51804 tree stmxcsr_hold_call = build_call_expr (stmxcsr, 0);
51805 tree hold_assign_orig = build2 (MODIFY_EXPR, unsigned_type_node,
51806 mxcsr_orig_var, stmxcsr_hold_call);
51807 tree hold_mod_val = build2 (BIT_IOR_EXPR, unsigned_type_node,
51808 mxcsr_orig_var,
51809 build_int_cst (unsigned_type_node, 0x1f80));
51810 hold_mod_val = build2 (BIT_AND_EXPR, unsigned_type_node, hold_mod_val,
51811 build_int_cst (unsigned_type_node, 0xffffffc0));
51812 tree hold_assign_mod = build2 (MODIFY_EXPR, unsigned_type_node,
51813 mxcsr_mod_var, hold_mod_val);
51814 tree ldmxcsr_hold_call = build_call_expr (ldmxcsr, 1, mxcsr_mod_var);
51815 tree hold_all = build2 (COMPOUND_EXPR, unsigned_type_node,
51816 hold_assign_orig, hold_assign_mod);
51817 hold_all = build2 (COMPOUND_EXPR, void_type_node, hold_all,
51818 ldmxcsr_hold_call);
51819 if (*hold)
51820 *hold = build2 (COMPOUND_EXPR, void_type_node, *hold, hold_all);
51821 else
51822 *hold = hold_all;
51823 tree ldmxcsr_clear_call = build_call_expr (ldmxcsr, 1, mxcsr_mod_var);
51824 if (*clear)
51825 *clear = build2 (COMPOUND_EXPR, void_type_node, *clear,
51826 ldmxcsr_clear_call);
51827 else
51828 *clear = ldmxcsr_clear_call;
51829 tree stxmcsr_update_call = build_call_expr (stmxcsr, 0);
51830 tree exceptions_sse = fold_convert (integer_type_node,
51831 stxmcsr_update_call);
51832 if (*update)
51833 {
51834 tree exceptions_mod = build2 (BIT_IOR_EXPR, integer_type_node,
51835 exceptions_var, exceptions_sse);
51836 tree exceptions_assign = build2 (MODIFY_EXPR, integer_type_node,
51837 exceptions_var, exceptions_mod);
51838 *update = build2 (COMPOUND_EXPR, integer_type_node, *update,
51839 exceptions_assign);
51840 }
51841 else
51842 *update = build2 (MODIFY_EXPR, integer_type_node,
51843 exceptions_var, exceptions_sse);
51844 tree ldmxcsr_update_call = build_call_expr (ldmxcsr, 1, mxcsr_orig_var);
51845 *update = build2 (COMPOUND_EXPR, void_type_node, *update,
51846 ldmxcsr_update_call);
51847 }
51848 tree atomic_feraiseexcept
51849 = builtin_decl_implicit (BUILT_IN_ATOMIC_FERAISEEXCEPT);
51850 tree atomic_feraiseexcept_call = build_call_expr (atomic_feraiseexcept,
51851 1, exceptions_var);
51852 *update = build2 (COMPOUND_EXPR, void_type_node, *update,
51853 atomic_feraiseexcept_call);
51854 }
51855
51856 /* Return mode to be used for bounds or VOIDmode
51857 if bounds are not supported. */
51858
51859 static enum machine_mode
51860 ix86_mpx_bound_mode ()
51861 {
51862 /* Do not support pointer checker if MPX
51863 is not enabled. */
51864 if (!TARGET_MPX)
51865 {
51866 if (flag_check_pointer_bounds)
51867 warning (0, "Pointer Checker requires MPX support on this target."
51868 " Use -mmpx options to enable MPX.");
51869 return VOIDmode;
51870 }
51871
51872 return BNDmode;
51873 }
51874
51875 /* Return constant used to statically initialize constant bounds.
51876
51877 This function is used to create special bound values. For now
51878 only INIT bounds and NONE bounds are expected. More special
51879 values may be added later. */
51880
51881 static tree
51882 ix86_make_bounds_constant (HOST_WIDE_INT lb, HOST_WIDE_INT ub)
51883 {
51884 tree low = lb ? build_minus_one_cst (pointer_sized_int_node)
51885 : build_zero_cst (pointer_sized_int_node);
51886 tree high = ub ? build_zero_cst (pointer_sized_int_node)
51887 : build_minus_one_cst (pointer_sized_int_node);
51888
51889 /* This function is supposed to be used to create INIT and
51890 NONE bounds only. */
51891 gcc_assert ((lb == 0 && ub == -1)
51892 || (lb == -1 && ub == 0));
51893
51894 return build_complex (NULL, low, high);
51895 }
51896
51897 /* Generate a list of statements STMTS to initialize pointer bounds
51898 variable VAR with bounds LB and UB. Return the number of generated
51899 statements. */
51900
51901 static int
51902 ix86_initialize_bounds (tree var, tree lb, tree ub, tree *stmts)
51903 {
51904 tree bnd_ptr = build_pointer_type (pointer_sized_int_node);
51905 tree lhs, modify, var_p;
51906
51907 ub = build1 (BIT_NOT_EXPR, pointer_sized_int_node, ub);
51908 var_p = fold_convert (bnd_ptr, build_fold_addr_expr (var));
51909
51910 lhs = build1 (INDIRECT_REF, pointer_sized_int_node, var_p);
51911 modify = build2 (MODIFY_EXPR, TREE_TYPE (lhs), lhs, lb);
51912 append_to_statement_list (modify, stmts);
51913
51914 lhs = build1 (INDIRECT_REF, pointer_sized_int_node,
51915 build2 (POINTER_PLUS_EXPR, bnd_ptr, var_p,
51916 TYPE_SIZE_UNIT (pointer_sized_int_node)));
51917 modify = build2 (MODIFY_EXPR, TREE_TYPE (lhs), lhs, ub);
51918 append_to_statement_list (modify, stmts);
51919
51920 return 2;
51921 }
51922
51923 #if !TARGET_MACHO && !TARGET_DLLIMPORT_DECL_ATTRIBUTES
51924 /* For i386, a common symbol is local only for non-PIE binaries. For
51925 x86-64, a common symbol is local only for non-PIE binaries or if the
51926 linker supports copy relocation in PIE binaries. */
51927
51928 static bool
51929 ix86_binds_local_p (const_tree exp)
51930 {
51931 return default_binds_local_p_3 (exp, flag_shlib != 0, true, true,
51932 (!flag_pic
51933 || (TARGET_64BIT
51934 && HAVE_LD_PIE_COPYRELOC != 0)));
51935 }
51936 #endif
51937
51938 /* If MEM is in the form of [base+offset], extract the two parts
51939 of address and set to BASE and OFFSET, otherwise return false. */
51940
51941 static bool
51942 extract_base_offset_in_addr (rtx mem, rtx *base, rtx *offset)
51943 {
51944 rtx addr;
51945
51946 gcc_assert (MEM_P (mem));
51947
51948 addr = XEXP (mem, 0);
51949
51950 if (GET_CODE (addr) == CONST)
51951 addr = XEXP (addr, 0);
51952
51953 if (REG_P (addr) || GET_CODE (addr) == SYMBOL_REF)
51954 {
51955 *base = addr;
51956 *offset = const0_rtx;
51957 return true;
51958 }
51959
51960 if (GET_CODE (addr) == PLUS
51961 && (REG_P (XEXP (addr, 0))
51962 || GET_CODE (XEXP (addr, 0)) == SYMBOL_REF)
51963 && CONST_INT_P (XEXP (addr, 1)))
51964 {
51965 *base = XEXP (addr, 0);
51966 *offset = XEXP (addr, 1);
51967 return true;
51968 }
51969
51970 return false;
51971 }
51972
51973 /* Given OPERANDS of consecutive load/store, check if we can merge
51974 them into move multiple. LOAD is true if they are load instructions.
51975 MODE is the mode of memory operands. */
51976
51977 bool
51978 ix86_operands_ok_for_move_multiple (rtx *operands, bool load,
51979 enum machine_mode mode)
51980 {
51981 HOST_WIDE_INT offval_1, offval_2, msize;
51982 rtx mem_1, mem_2, reg_1, reg_2, base_1, base_2, offset_1, offset_2;
51983
51984 if (load)
51985 {
51986 mem_1 = operands[1];
51987 mem_2 = operands[3];
51988 reg_1 = operands[0];
51989 reg_2 = operands[2];
51990 }
51991 else
51992 {
51993 mem_1 = operands[0];
51994 mem_2 = operands[2];
51995 reg_1 = operands[1];
51996 reg_2 = operands[3];
51997 }
51998
51999 gcc_assert (REG_P (reg_1) && REG_P (reg_2));
52000
52001 if (REGNO (reg_1) != REGNO (reg_2))
52002 return false;
52003
52004 /* Check if the addresses are in the form of [base+offset]. */
52005 if (!extract_base_offset_in_addr (mem_1, &base_1, &offset_1))
52006 return false;
52007 if (!extract_base_offset_in_addr (mem_2, &base_2, &offset_2))
52008 return false;
52009
52010 /* Check if the bases are the same. */
52011 if (!rtx_equal_p (base_1, base_2))
52012 return false;
52013
52014 offval_1 = INTVAL (offset_1);
52015 offval_2 = INTVAL (offset_2);
52016 msize = GET_MODE_SIZE (mode);
52017 /* Check if mem_1 is adjacent to mem_2 and mem_1 has the lower address. */
52018 if (offval_1 + msize != offval_2)
52019 return false;
52020
52021 return true;
52022 }
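
/* Illustrative check: for MODE == DImode, a pair of accesses at
   [base + 0] and [base + 8] that target the same register number passes
   (offval_1 + 8 == offval_2), whereas offsets 0 and 16, or two different
   base registers, make the function return false.  */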
52023
52024 /* Implement the TARGET_OPTAB_SUPPORTED_P hook. */
52025
52026 static bool
52027 ix86_optab_supported_p (int op, machine_mode mode1, machine_mode,
52028 optimization_type opt_type)
52029 {
52030 switch (op)
52031 {
52032 case asin_optab:
52033 case acos_optab:
52034 case log1p_optab:
52035 case exp_optab:
52036 case exp10_optab:
52037 case exp2_optab:
52038 case expm1_optab:
52039 case ldexp_optab:
52040 case scalb_optab:
52041 case round_optab:
52042 return opt_type == OPTIMIZE_FOR_SPEED;
52043
52044 case rint_optab:
52045 if (SSE_FLOAT_MODE_P (mode1)
52046 && TARGET_SSE_MATH
52047 && !flag_trapping_math
52048 && !TARGET_ROUND)
52049 return opt_type == OPTIMIZE_FOR_SPEED;
52050 return true;
52051
52052 case floor_optab:
52053 case ceil_optab:
52054 case btrunc_optab:
52055 if (SSE_FLOAT_MODE_P (mode1)
52056 && TARGET_SSE_MATH
52057 && !flag_trapping_math
52058 && TARGET_ROUND)
52059 return true;
52060 return opt_type == OPTIMIZE_FOR_SPEED;
52061
52062 case rsqrt_optab:
52063 return opt_type == OPTIMIZE_FOR_SPEED && use_rsqrt_p ();
52064
52065 default:
52066 return true;
52067 }
52068 }
52069
52070 /* Address space support.
52071
52072 This is not "far pointers" in the 16-bit sense, but an easy way
52073 to use %fs and %gs segment prefixes. Therefore:
52074
52075 (a) All address spaces have the same modes,
52076 (b) All address spaces have the same address forms,
52077 (c) While %fs and %gs are technically subsets of the generic
52078 address space, they are probably not subsets of each other.
52079 (d) Since we have no access to the segment base register values
52080 without resorting to a system call, we cannot convert a
52081 non-default address space to a default address space.
52082 Therefore we do not claim %fs or %gs are subsets of generic.
52083
52084 Therefore we can (mostly) use the default hooks. */
52085
52086 /* All use of segmentation is assumed to make address 0 valid. */
52087
52088 static bool
52089 ix86_addr_space_zero_address_valid (addr_space_t as)
52090 {
52091 return as != ADDR_SPACE_GENERIC;
52092 }
52093
52094 static void
52095 ix86_init_libfuncs (void)
52096 {
52097 if (TARGET_64BIT)
52098 {
52099 set_optab_libfunc (sdivmod_optab, TImode, "__divmodti4");
52100 set_optab_libfunc (udivmod_optab, TImode, "__udivmodti4");
52101 }
52102 else
52103 {
52104 set_optab_libfunc (sdivmod_optab, DImode, "__divmoddi4");
52105 set_optab_libfunc (udivmod_optab, DImode, "__udivmoddi4");
52106 }
52107
52108 #if TARGET_MACHO
52109 darwin_rename_builtins ();
52110 #endif
52111 }
52112
52113 /* Generate call to __divmoddi4. */
52114
52115 static void
52116 ix86_expand_divmod_libfunc (rtx libfunc, machine_mode mode,
52117 rtx op0, rtx op1,
52118 rtx *quot_p, rtx *rem_p)
52119 {
52120 rtx rem = assign_386_stack_local (mode, SLOT_TEMP);
52121
52122 rtx quot = emit_library_call_value (libfunc, NULL_RTX, LCT_NORMAL,
52123 mode, 3,
52124 op0, GET_MODE (op0),
52125 op1, GET_MODE (op1),
52126 XEXP (rem, 0), Pmode);
52127 *quot_p = quot;
52128 *rem_p = rem;
52129 }
52130
52131 /* Set the value of FLT_EVAL_METHOD in float.h. When using only the
52132 FPU, assume that the fpcw is set to extended precision; when using
52133 only SSE, rounding is correct; when using both SSE and the FPU,
52134 the rounding precision is indeterminate, since either may be chosen
52135 apparently at random. */
52136
52137 static enum flt_eval_method
52138 ix86_excess_precision (enum excess_precision_type type)
52139 {
52140 switch (type)
52141 {
52142 case EXCESS_PRECISION_TYPE_FAST:
52143 /* The fastest type to promote to will always be the native type,
52144 whether that occurs with implicit excess precision or
52145 otherwise. */
52146 return FLT_EVAL_METHOD_PROMOTE_TO_FLOAT;
52147 case EXCESS_PRECISION_TYPE_STANDARD:
52148 case EXCESS_PRECISION_TYPE_IMPLICIT:
52149 /* Otherwise, the excess precision we want when we are
52150 in a standards-compliant mode and the implicit precision we
52151 provide would be identical, were it not for the unpredictable
52152 cases. */
52153 if (!TARGET_80387)
52154 return FLT_EVAL_METHOD_PROMOTE_TO_FLOAT;
52155 else if (!TARGET_MIX_SSE_I387)
52156 {
52157 if (!TARGET_SSE_MATH)
52158 return FLT_EVAL_METHOD_PROMOTE_TO_LONG_DOUBLE;
52159 else if (TARGET_SSE2)
52160 return FLT_EVAL_METHOD_PROMOTE_TO_FLOAT;
52161 }
52162
52163 /* If we are in standards-compliant mode, but we know we will
52164 calculate in unpredictable precision, return
52165 FLT_EVAL_METHOD_FLOAT. There is no reason to introduce explicit
52166 excess precision if the target can't guarantee it will honor
52167 it. */
52168 return (type == EXCESS_PRECISION_TYPE_STANDARD
52169 ? FLT_EVAL_METHOD_PROMOTE_TO_FLOAT
52170 : FLT_EVAL_METHOD_UNPREDICTABLE);
52171 default:
52172 gcc_unreachable ();
52173 }
52174
52175 return FLT_EVAL_METHOD_UNPREDICTABLE;
52176 }
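/* Illustrative sketch, not part of i386.c: how the choice made above
   surfaces to user code.  FLT_EVAL_METHOD in <float.h> reflects the
   evaluation method, and <math.h> derives float_t/double_t from it: with
   x87-only math (method 2) both are long double, while with SSE math
   (method 0) they are plain float and double.  Compiles standalone.  */
#include <float.h>
#include <math.h>
#include <stdio.h>

int
main (void)
{
  printf ("FLT_EVAL_METHOD = %d, sizeof (float_t) = %zu\n",
	  (int) FLT_EVAL_METHOD, sizeof (float_t));
  return 0;
}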
52177
52178 /* Target-specific selftests. */
52179
52180 #if CHECKING_P
52181
52182 namespace selftest {
52183
52184 /* Verify that hard regs are dumped as expected (in compact mode). */
52185
52186 static void
52187 ix86_test_dumping_hard_regs ()
52188 {
52189 ASSERT_RTL_DUMP_EQ ("(reg:SI ax)", gen_raw_REG (SImode, 0));
52190 ASSERT_RTL_DUMP_EQ ("(reg:SI dx)", gen_raw_REG (SImode, 1));
52191 }
52192
52193 /* Test dumping an insn with repeated references to the same SCRATCH,
52194 to verify the rtx_reuse code. */
52195
52196 static void
52197 ix86_test_dumping_memory_blockage ()
52198 {
52199 set_new_first_and_last_insn (NULL, NULL);
52200
52201 rtx pat = gen_memory_blockage ();
52202 rtx_reuse_manager r;
52203 r.preprocess (pat);
52204
52205 /* Verify that the repeated references to the SCRATCH use
52206 reuse IDs. The first should be prefixed with a reuse ID,
52207 and the second should be dumped as a "reuse_rtx" of that ID.
52208 The expected string assumes Pmode == DImode. */
52209 if (Pmode == DImode)
52210 ASSERT_RTL_DUMP_EQ_WITH_REUSE
52211 ("(cinsn 1 (set (mem/v:BLK (0|scratch:DI) [0 A8])\n"
52212 " (unspec:BLK [\n"
52213 " (mem/v:BLK (reuse_rtx 0) [0 A8])\n"
52214 " ] UNSPEC_MEMORY_BLOCKAGE)))\n", pat, &r);
52215 }
52216
52217 /* Verify loading an RTL dump; specifically a dump of copying
52218 a param on x86_64 from a hard reg into the frame.
52219 This test is target-specific since the dump contains target-specific
52220 hard reg names. */
52221
52222 static void
52223 ix86_test_loading_dump_fragment_1 ()
52224 {
52225 rtl_dump_test t (SELFTEST_LOCATION,
52226 locate_file ("x86_64/copy-hard-reg-into-frame.rtl"));
52227
52228 rtx_insn *insn = get_insn_by_uid (1);
52229
52230 /* The block structure and indentation here is purely for
52231 readability; it mirrors the structure of the rtx. */
52232 tree mem_expr;
52233 {
52234 rtx pat = PATTERN (insn);
52235 ASSERT_EQ (SET, GET_CODE (pat));
52236 {
52237 rtx dest = SET_DEST (pat);
52238 ASSERT_EQ (MEM, GET_CODE (dest));
52239 /* Verify the "/c" was parsed. */
52240 ASSERT_TRUE (RTX_FLAG (dest, call));
52241 ASSERT_EQ (SImode, GET_MODE (dest));
52242 {
52243 rtx addr = XEXP (dest, 0);
52244 ASSERT_EQ (PLUS, GET_CODE (addr));
52245 ASSERT_EQ (DImode, GET_MODE (addr));
52246 {
52247 rtx lhs = XEXP (addr, 0);
52248 /* Verify that the "frame" REG was consolidated. */
52249 ASSERT_RTX_PTR_EQ (frame_pointer_rtx, lhs);
52250 }
52251 {
52252 rtx rhs = XEXP (addr, 1);
52253 ASSERT_EQ (CONST_INT, GET_CODE (rhs));
52254 ASSERT_EQ (-4, INTVAL (rhs));
52255 }
52256 }
52257 /* Verify the "[1 i+0 S4 A32]" was parsed. */
52258 ASSERT_EQ (1, MEM_ALIAS_SET (dest));
52259 /* "i" should have been handled by synthesizing a global int
52260 variable named "i". */
52261 mem_expr = MEM_EXPR (dest);
52262 ASSERT_NE (mem_expr, NULL);
52263 ASSERT_EQ (VAR_DECL, TREE_CODE (mem_expr));
52264 ASSERT_EQ (integer_type_node, TREE_TYPE (mem_expr));
52265 ASSERT_EQ (IDENTIFIER_NODE, TREE_CODE (DECL_NAME (mem_expr)));
52266 ASSERT_STREQ ("i", IDENTIFIER_POINTER (DECL_NAME (mem_expr)));
52267 /* "+0". */
52268 ASSERT_TRUE (MEM_OFFSET_KNOWN_P (dest));
52269 ASSERT_EQ (0, MEM_OFFSET (dest));
52270 /* "S4". */
52271 ASSERT_EQ (4, MEM_SIZE (dest));
52272 /* "A32. */
52273 ASSERT_EQ (32, MEM_ALIGN (dest));
52274 }
52275 {
52276 rtx src = SET_SRC (pat);
52277 ASSERT_EQ (REG, GET_CODE (src));
52278 ASSERT_EQ (SImode, GET_MODE (src));
52279 ASSERT_EQ (5, REGNO (src));
52280 tree reg_expr = REG_EXPR (src);
52281 /* "i" here should point to the same var as for the MEM_EXPR. */
52282 ASSERT_EQ (reg_expr, mem_expr);
52283 }
52284 }
52285 }
52286
52287 /* Verify that the RTL loader copes with a call_insn dump.
52288 This test is target-specific since the dump contains a target-specific
52289 hard reg name. */
52290
52291 static void
52292 ix86_test_loading_call_insn ()
52293 {
52294 /* The test dump includes register "xmm0", which requires TARGET_SSE
52295 to exist. */
52296 if (!TARGET_SSE)
52297 return;
52298
52299 rtl_dump_test t (SELFTEST_LOCATION, locate_file ("x86_64/call-insn.rtl"));
52300
52301 rtx_insn *insn = get_insns ();
52302 ASSERT_EQ (CALL_INSN, GET_CODE (insn));
52303
52304 /* "/j". */
52305 ASSERT_TRUE (RTX_FLAG (insn, jump));
52306
52307 rtx pat = PATTERN (insn);
52308 ASSERT_EQ (CALL, GET_CODE (SET_SRC (pat)));
52309
52310 /* Verify REG_NOTES. */
52311 {
52312 /* "(expr_list:REG_CALL_DECL". */
52313 ASSERT_EQ (EXPR_LIST, GET_CODE (REG_NOTES (insn)));
52314 rtx_expr_list *note0 = as_a <rtx_expr_list *> (REG_NOTES (insn));
52315 ASSERT_EQ (REG_CALL_DECL, REG_NOTE_KIND (note0));
52316
52317 /* "(expr_list:REG_EH_REGION (const_int 0 [0])". */
52318 rtx_expr_list *note1 = note0->next ();
52319 ASSERT_EQ (REG_EH_REGION, REG_NOTE_KIND (note1));
52320
52321 ASSERT_EQ (NULL, note1->next ());
52322 }
52323
52324 /* Verify CALL_INSN_FUNCTION_USAGE. */
52325 {
52326 /* "(expr_list:DF (use (reg:DF 21 xmm0))". */
52327 rtx_expr_list *usage
52328 = as_a <rtx_expr_list *> (CALL_INSN_FUNCTION_USAGE (insn));
52329 ASSERT_EQ (EXPR_LIST, GET_CODE (usage));
52330 ASSERT_EQ (DFmode, GET_MODE (usage));
52331 ASSERT_EQ (USE, GET_CODE (usage->element ()));
52332 ASSERT_EQ (NULL, usage->next ());
52333 }
52334 }
52335
52336 /* Verify that the RTL loader copes with a dump from print_rtx_function.
52337 This test is target-specific since the dump contains target-specific
52338 hard reg names. */
52339
52340 static void
52341 ix86_test_loading_full_dump ()
52342 {
52343 rtl_dump_test t (SELFTEST_LOCATION, locate_file ("x86_64/times-two.rtl"));
52344
52345 ASSERT_STREQ ("times_two", IDENTIFIER_POINTER (DECL_NAME (cfun->decl)));
52346
52347 rtx_insn *insn_1 = get_insn_by_uid (1);
52348 ASSERT_EQ (NOTE, GET_CODE (insn_1));
52349
52350 rtx_insn *insn_7 = get_insn_by_uid (7);
52351 ASSERT_EQ (INSN, GET_CODE (insn_7));
52352 ASSERT_EQ (PARALLEL, GET_CODE (PATTERN (insn_7)));
52353
52354 rtx_insn *insn_15 = get_insn_by_uid (15);
52355 ASSERT_EQ (INSN, GET_CODE (insn_15));
52356 ASSERT_EQ (USE, GET_CODE (PATTERN (insn_15)));
52357
52358 /* Verify crtl->return_rtx. */
52359 ASSERT_EQ (REG, GET_CODE (crtl->return_rtx));
52360 ASSERT_EQ (0, REGNO (crtl->return_rtx));
52361 ASSERT_EQ (SImode, GET_MODE (crtl->return_rtx));
52362 }
52363
52364 /* Verify that the RTL loader copes with UNSPEC and UNSPEC_VOLATILE insns.
52365 In particular, verify that it correctly loads the 2nd operand.
52366 This test is target-specific since these are machine-specific
52367 operands (and enums). */
52368
52369 static void
52370 ix86_test_loading_unspec ()
52371 {
52372 rtl_dump_test t (SELFTEST_LOCATION, locate_file ("x86_64/unspec.rtl"));
52373
52374 ASSERT_STREQ ("test_unspec", IDENTIFIER_POINTER (DECL_NAME (cfun->decl)));
52375
52376 ASSERT_TRUE (cfun);
52377
52378 /* Test of an UNSPEC. */
52379 rtx_insn *insn = get_insns ();
52380 ASSERT_EQ (INSN, GET_CODE (insn));
52381 rtx set = single_set (insn);
52382 ASSERT_NE (NULL, set);
52383 rtx dst = SET_DEST (set);
52384 ASSERT_EQ (MEM, GET_CODE (dst));
52385 rtx src = SET_SRC (set);
52386 ASSERT_EQ (UNSPEC, GET_CODE (src));
52387 ASSERT_EQ (BLKmode, GET_MODE (src));
52388 ASSERT_EQ (UNSPEC_MEMORY_BLOCKAGE, XINT (src, 1));
52389
52390 rtx v0 = XVECEXP (src, 0, 0);
52391
52392 /* Verify that the two uses of the first SCRATCH have pointer
52393 equality. */
52394 rtx scratch_a = XEXP (dst, 0);
52395 ASSERT_EQ (SCRATCH, GET_CODE (scratch_a));
52396
52397 rtx scratch_b = XEXP (v0, 0);
52398 ASSERT_EQ (SCRATCH, GET_CODE (scratch_b));
52399
52400 ASSERT_EQ (scratch_a, scratch_b);
52401
52402 /* Verify that the two mems are thus treated as equal. */
52403 ASSERT_TRUE (rtx_equal_p (dst, v0));
52404
52405 /* Verify that the insn is recognized. */
52406 ASSERT_NE (-1, recog_memoized (insn));
52407
52408 /* Test of an UNSPEC_VOLATILE, which has its own enum values. */
52409 insn = NEXT_INSN (insn);
52410 ASSERT_EQ (INSN, GET_CODE (insn));
52411
52412 set = single_set (insn);
52413 ASSERT_NE (NULL, set);
52414
52415 src = SET_SRC (set);
52416 ASSERT_EQ (UNSPEC_VOLATILE, GET_CODE (src));
52417 ASSERT_EQ (UNSPECV_RDTSCP, XINT (src, 1));
52418 }
52419
52420 /* Run all target-specific selftests. */
52421
52422 static void
52423 ix86_run_selftests (void)
52424 {
52425 ix86_test_dumping_hard_regs ();
52426 ix86_test_dumping_memory_blockage ();
52427
52428 /* Various tests of loading RTL dumps, here because they contain
52429 ix86-isms (e.g. names of hard regs). */
52430 ix86_test_loading_dump_fragment_1 ();
52431 ix86_test_loading_call_insn ();
52432 ix86_test_loading_full_dump ();
52433 ix86_test_loading_unspec ();
52434 }
52435
52436 } // namespace selftest
52437
52438 #endif /* CHECKING_P */
52439
52440 /* Initialize the GCC target structure. */
52441 #undef TARGET_RETURN_IN_MEMORY
52442 #define TARGET_RETURN_IN_MEMORY ix86_return_in_memory
52443
52444 #undef TARGET_LEGITIMIZE_ADDRESS
52445 #define TARGET_LEGITIMIZE_ADDRESS ix86_legitimize_address
52446
52447 #undef TARGET_ATTRIBUTE_TABLE
52448 #define TARGET_ATTRIBUTE_TABLE ix86_attribute_table
52449 #undef TARGET_FUNCTION_ATTRIBUTE_INLINABLE_P
52450 #define TARGET_FUNCTION_ATTRIBUTE_INLINABLE_P hook_bool_const_tree_true
52451 #if TARGET_DLLIMPORT_DECL_ATTRIBUTES
52452 # undef TARGET_MERGE_DECL_ATTRIBUTES
52453 # define TARGET_MERGE_DECL_ATTRIBUTES merge_dllimport_decl_attributes
52454 #endif
52455
52456 #undef TARGET_COMP_TYPE_ATTRIBUTES
52457 #define TARGET_COMP_TYPE_ATTRIBUTES ix86_comp_type_attributes
52458
52459 #undef TARGET_INIT_BUILTINS
52460 #define TARGET_INIT_BUILTINS ix86_init_builtins
52461 #undef TARGET_BUILTIN_DECL
52462 #define TARGET_BUILTIN_DECL ix86_builtin_decl
52463 #undef TARGET_EXPAND_BUILTIN
52464 #define TARGET_EXPAND_BUILTIN ix86_expand_builtin
52465
52466 #undef TARGET_VECTORIZE_BUILTIN_VECTORIZED_FUNCTION
52467 #define TARGET_VECTORIZE_BUILTIN_VECTORIZED_FUNCTION \
52468 ix86_builtin_vectorized_function
52469
52470 #undef TARGET_VECTORIZE_BUILTIN_GATHER
52471 #define TARGET_VECTORIZE_BUILTIN_GATHER ix86_vectorize_builtin_gather
52472
52473 #undef TARGET_VECTORIZE_BUILTIN_SCATTER
52474 #define TARGET_VECTORIZE_BUILTIN_SCATTER ix86_vectorize_builtin_scatter
52475
52476 #undef TARGET_BUILTIN_RECIPROCAL
52477 #define TARGET_BUILTIN_RECIPROCAL ix86_builtin_reciprocal
52478
52479 #undef TARGET_ASM_FUNCTION_EPILOGUE
52480 #define TARGET_ASM_FUNCTION_EPILOGUE ix86_output_function_epilogue
52481
52482 #undef TARGET_ENCODE_SECTION_INFO
52483 #ifndef SUBTARGET_ENCODE_SECTION_INFO
52484 #define TARGET_ENCODE_SECTION_INFO ix86_encode_section_info
52485 #else
52486 #define TARGET_ENCODE_SECTION_INFO SUBTARGET_ENCODE_SECTION_INFO
52487 #endif
52488
52489 #undef TARGET_ASM_OPEN_PAREN
52490 #define TARGET_ASM_OPEN_PAREN ""
52491 #undef TARGET_ASM_CLOSE_PAREN
52492 #define TARGET_ASM_CLOSE_PAREN ""
52493
52494 #undef TARGET_ASM_BYTE_OP
52495 #define TARGET_ASM_BYTE_OP ASM_BYTE
52496
52497 #undef TARGET_ASM_ALIGNED_HI_OP
52498 #define TARGET_ASM_ALIGNED_HI_OP ASM_SHORT
52499 #undef TARGET_ASM_ALIGNED_SI_OP
52500 #define TARGET_ASM_ALIGNED_SI_OP ASM_LONG
52501 #ifdef ASM_QUAD
52502 #undef TARGET_ASM_ALIGNED_DI_OP
52503 #define TARGET_ASM_ALIGNED_DI_OP ASM_QUAD
52504 #endif
52505
52506 #undef TARGET_PROFILE_BEFORE_PROLOGUE
52507 #define TARGET_PROFILE_BEFORE_PROLOGUE ix86_profile_before_prologue
52508
52509 #undef TARGET_MANGLE_DECL_ASSEMBLER_NAME
52510 #define TARGET_MANGLE_DECL_ASSEMBLER_NAME ix86_mangle_decl_assembler_name
52511
52512 #undef TARGET_ASM_UNALIGNED_HI_OP
52513 #define TARGET_ASM_UNALIGNED_HI_OP TARGET_ASM_ALIGNED_HI_OP
52514 #undef TARGET_ASM_UNALIGNED_SI_OP
52515 #define TARGET_ASM_UNALIGNED_SI_OP TARGET_ASM_ALIGNED_SI_OP
52516 #undef TARGET_ASM_UNALIGNED_DI_OP
52517 #define TARGET_ASM_UNALIGNED_DI_OP TARGET_ASM_ALIGNED_DI_OP
52518
52519 #undef TARGET_PRINT_OPERAND
52520 #define TARGET_PRINT_OPERAND ix86_print_operand
52521 #undef TARGET_PRINT_OPERAND_ADDRESS
52522 #define TARGET_PRINT_OPERAND_ADDRESS ix86_print_operand_address
52523 #undef TARGET_PRINT_OPERAND_PUNCT_VALID_P
52524 #define TARGET_PRINT_OPERAND_PUNCT_VALID_P ix86_print_operand_punct_valid_p
52525 #undef TARGET_ASM_OUTPUT_ADDR_CONST_EXTRA
52526 #define TARGET_ASM_OUTPUT_ADDR_CONST_EXTRA i386_asm_output_addr_const_extra
52527
52528 #undef TARGET_SCHED_INIT_GLOBAL
52529 #define TARGET_SCHED_INIT_GLOBAL ix86_sched_init_global
52530 #undef TARGET_SCHED_ADJUST_COST
52531 #define TARGET_SCHED_ADJUST_COST ix86_adjust_cost
52532 #undef TARGET_SCHED_ISSUE_RATE
52533 #define TARGET_SCHED_ISSUE_RATE ix86_issue_rate
52534 #undef TARGET_SCHED_FIRST_CYCLE_MULTIPASS_DFA_LOOKAHEAD
52535 #define TARGET_SCHED_FIRST_CYCLE_MULTIPASS_DFA_LOOKAHEAD \
52536 ia32_multipass_dfa_lookahead
52537 #undef TARGET_SCHED_MACRO_FUSION_P
52538 #define TARGET_SCHED_MACRO_FUSION_P ix86_macro_fusion_p
52539 #undef TARGET_SCHED_MACRO_FUSION_PAIR_P
52540 #define TARGET_SCHED_MACRO_FUSION_PAIR_P ix86_macro_fusion_pair_p
52541
52542 #undef TARGET_FUNCTION_OK_FOR_SIBCALL
52543 #define TARGET_FUNCTION_OK_FOR_SIBCALL ix86_function_ok_for_sibcall
52544
52545 #undef TARGET_MEMMODEL_CHECK
52546 #define TARGET_MEMMODEL_CHECK ix86_memmodel_check
52547
52548 #undef TARGET_ATOMIC_ASSIGN_EXPAND_FENV
52549 #define TARGET_ATOMIC_ASSIGN_EXPAND_FENV ix86_atomic_assign_expand_fenv
52550
52551 #ifdef HAVE_AS_TLS
52552 #undef TARGET_HAVE_TLS
52553 #define TARGET_HAVE_TLS true
52554 #endif
52555 #undef TARGET_CANNOT_FORCE_CONST_MEM
52556 #define TARGET_CANNOT_FORCE_CONST_MEM ix86_cannot_force_const_mem
52557 #undef TARGET_USE_BLOCKS_FOR_CONSTANT_P
52558 #define TARGET_USE_BLOCKS_FOR_CONSTANT_P hook_bool_mode_const_rtx_true
52559
52560 #undef TARGET_DELEGITIMIZE_ADDRESS
52561 #define TARGET_DELEGITIMIZE_ADDRESS ix86_delegitimize_address
52562
52563 #undef TARGET_MS_BITFIELD_LAYOUT_P
52564 #define TARGET_MS_BITFIELD_LAYOUT_P ix86_ms_bitfield_layout_p
52565
52566 #if TARGET_MACHO
52567 #undef TARGET_BINDS_LOCAL_P
52568 #define TARGET_BINDS_LOCAL_P darwin_binds_local_p
52569 #else
52570 #undef TARGET_BINDS_LOCAL_P
52571 #define TARGET_BINDS_LOCAL_P ix86_binds_local_p
52572 #endif
52573 #if TARGET_DLLIMPORT_DECL_ATTRIBUTES
52574 #undef TARGET_BINDS_LOCAL_P
52575 #define TARGET_BINDS_LOCAL_P i386_pe_binds_local_p
52576 #endif
52577
52578 #undef TARGET_ASM_OUTPUT_MI_THUNK
52579 #define TARGET_ASM_OUTPUT_MI_THUNK x86_output_mi_thunk
52580 #undef TARGET_ASM_CAN_OUTPUT_MI_THUNK
52581 #define TARGET_ASM_CAN_OUTPUT_MI_THUNK x86_can_output_mi_thunk
52582
52583 #undef TARGET_ASM_FILE_START
52584 #define TARGET_ASM_FILE_START x86_file_start
52585
52586 #undef TARGET_OPTION_OVERRIDE
52587 #define TARGET_OPTION_OVERRIDE ix86_option_override
52588
52589 #undef TARGET_REGISTER_MOVE_COST
52590 #define TARGET_REGISTER_MOVE_COST ix86_register_move_cost
52591 #undef TARGET_MEMORY_MOVE_COST
52592 #define TARGET_MEMORY_MOVE_COST ix86_memory_move_cost
52593 #undef TARGET_RTX_COSTS
52594 #define TARGET_RTX_COSTS ix86_rtx_costs
52595 #undef TARGET_ADDRESS_COST
52596 #define TARGET_ADDRESS_COST ix86_address_cost
52597
52598 #undef TARGET_FLAGS_REGNUM
52599 #define TARGET_FLAGS_REGNUM FLAGS_REG
52600 #undef TARGET_FIXED_CONDITION_CODE_REGS
52601 #define TARGET_FIXED_CONDITION_CODE_REGS ix86_fixed_condition_code_regs
52602 #undef TARGET_CC_MODES_COMPATIBLE
52603 #define TARGET_CC_MODES_COMPATIBLE ix86_cc_modes_compatible
52604
52605 #undef TARGET_MACHINE_DEPENDENT_REORG
52606 #define TARGET_MACHINE_DEPENDENT_REORG ix86_reorg
52607
52608 #undef TARGET_BUILTIN_SETJMP_FRAME_VALUE
52609 #define TARGET_BUILTIN_SETJMP_FRAME_VALUE ix86_builtin_setjmp_frame_value
52610
52611 #undef TARGET_BUILD_BUILTIN_VA_LIST
52612 #define TARGET_BUILD_BUILTIN_VA_LIST ix86_build_builtin_va_list
52613
52614 #undef TARGET_FOLD_BUILTIN
52615 #define TARGET_FOLD_BUILTIN ix86_fold_builtin
52616
52617 #undef TARGET_GIMPLE_FOLD_BUILTIN
52618 #define TARGET_GIMPLE_FOLD_BUILTIN ix86_gimple_fold_builtin
52619
52620 #undef TARGET_COMPARE_VERSION_PRIORITY
52621 #define TARGET_COMPARE_VERSION_PRIORITY ix86_compare_version_priority
52622
52623 #undef TARGET_GENERATE_VERSION_DISPATCHER_BODY
52624 #define TARGET_GENERATE_VERSION_DISPATCHER_BODY \
52625 ix86_generate_version_dispatcher_body
52626
52627 #undef TARGET_GET_FUNCTION_VERSIONS_DISPATCHER
52628 #define TARGET_GET_FUNCTION_VERSIONS_DISPATCHER \
52629 ix86_get_function_versions_dispatcher
52630
52631 #undef TARGET_ENUM_VA_LIST_P
52632 #define TARGET_ENUM_VA_LIST_P ix86_enum_va_list
52633
52634 #undef TARGET_FN_ABI_VA_LIST
52635 #define TARGET_FN_ABI_VA_LIST ix86_fn_abi_va_list
52636
52637 #undef TARGET_CANONICAL_VA_LIST_TYPE
52638 #define TARGET_CANONICAL_VA_LIST_TYPE ix86_canonical_va_list_type
52639
52640 #undef TARGET_EXPAND_BUILTIN_VA_START
52641 #define TARGET_EXPAND_BUILTIN_VA_START ix86_va_start
52642
52643 #undef TARGET_MD_ASM_ADJUST
52644 #define TARGET_MD_ASM_ADJUST ix86_md_asm_adjust
52645
52646 #undef TARGET_C_EXCESS_PRECISION
52647 #define TARGET_C_EXCESS_PRECISION ix86_excess_precision
52648 #undef TARGET_PROMOTE_PROTOTYPES
52649 #define TARGET_PROMOTE_PROTOTYPES hook_bool_const_tree_true
52650 #undef TARGET_SETUP_INCOMING_VARARGS
52651 #define TARGET_SETUP_INCOMING_VARARGS ix86_setup_incoming_varargs
52652 #undef TARGET_MUST_PASS_IN_STACK
52653 #define TARGET_MUST_PASS_IN_STACK ix86_must_pass_in_stack
52654 #undef TARGET_FUNCTION_ARG_ADVANCE
52655 #define TARGET_FUNCTION_ARG_ADVANCE ix86_function_arg_advance
52656 #undef TARGET_FUNCTION_ARG
52657 #define TARGET_FUNCTION_ARG ix86_function_arg
52658 #undef TARGET_INIT_PIC_REG
52659 #define TARGET_INIT_PIC_REG ix86_init_pic_reg
52660 #undef TARGET_USE_PSEUDO_PIC_REG
52661 #define TARGET_USE_PSEUDO_PIC_REG ix86_use_pseudo_pic_reg
52662 #undef TARGET_FUNCTION_ARG_BOUNDARY
52663 #define TARGET_FUNCTION_ARG_BOUNDARY ix86_function_arg_boundary
52664 #undef TARGET_PASS_BY_REFERENCE
52665 #define TARGET_PASS_BY_REFERENCE ix86_pass_by_reference
52666 #undef TARGET_INTERNAL_ARG_POINTER
52667 #define TARGET_INTERNAL_ARG_POINTER ix86_internal_arg_pointer
52668 #undef TARGET_UPDATE_STACK_BOUNDARY
52669 #define TARGET_UPDATE_STACK_BOUNDARY ix86_update_stack_boundary
52670 #undef TARGET_GET_DRAP_RTX
52671 #define TARGET_GET_DRAP_RTX ix86_get_drap_rtx
52672 #undef TARGET_STRICT_ARGUMENT_NAMING
52673 #define TARGET_STRICT_ARGUMENT_NAMING hook_bool_CUMULATIVE_ARGS_true
52674 #undef TARGET_STATIC_CHAIN
52675 #define TARGET_STATIC_CHAIN ix86_static_chain
52676 #undef TARGET_TRAMPOLINE_INIT
52677 #define TARGET_TRAMPOLINE_INIT ix86_trampoline_init
52678 #undef TARGET_RETURN_POPS_ARGS
52679 #define TARGET_RETURN_POPS_ARGS ix86_return_pops_args
52680
52681 #undef TARGET_LEGITIMATE_COMBINED_INSN
52682 #define TARGET_LEGITIMATE_COMBINED_INSN ix86_legitimate_combined_insn
52683
52684 #undef TARGET_ASAN_SHADOW_OFFSET
52685 #define TARGET_ASAN_SHADOW_OFFSET ix86_asan_shadow_offset
52686
52687 #undef TARGET_GIMPLIFY_VA_ARG_EXPR
52688 #define TARGET_GIMPLIFY_VA_ARG_EXPR ix86_gimplify_va_arg
52689
52690 #undef TARGET_SCALAR_MODE_SUPPORTED_P
52691 #define TARGET_SCALAR_MODE_SUPPORTED_P ix86_scalar_mode_supported_p
52692
52693 #undef TARGET_VECTOR_MODE_SUPPORTED_P
52694 #define TARGET_VECTOR_MODE_SUPPORTED_P ix86_vector_mode_supported_p
52695
52696 #undef TARGET_C_MODE_FOR_SUFFIX
52697 #define TARGET_C_MODE_FOR_SUFFIX ix86_c_mode_for_suffix
52698
52699 #ifdef HAVE_AS_TLS
52700 #undef TARGET_ASM_OUTPUT_DWARF_DTPREL
52701 #define TARGET_ASM_OUTPUT_DWARF_DTPREL i386_output_dwarf_dtprel
52702 #endif
52703
52704 #ifdef SUBTARGET_INSERT_ATTRIBUTES
52705 #undef TARGET_INSERT_ATTRIBUTES
52706 #define TARGET_INSERT_ATTRIBUTES SUBTARGET_INSERT_ATTRIBUTES
52707 #endif
52708
52709 #undef TARGET_MANGLE_TYPE
52710 #define TARGET_MANGLE_TYPE ix86_mangle_type
52711
52712 #ifdef TARGET_THREAD_SSP_OFFSET
52713 #undef TARGET_STACK_PROTECT_GUARD
52714 #define TARGET_STACK_PROTECT_GUARD ix86_stack_protect_guard
52715 #endif
52716
52717 #if !TARGET_MACHO
52718 #undef TARGET_STACK_PROTECT_FAIL
52719 #define TARGET_STACK_PROTECT_FAIL ix86_stack_protect_fail
52720 #endif
52721
52722 #undef TARGET_FUNCTION_VALUE
52723 #define TARGET_FUNCTION_VALUE ix86_function_value
52724
52725 #undef TARGET_FUNCTION_VALUE_REGNO_P
52726 #define TARGET_FUNCTION_VALUE_REGNO_P ix86_function_value_regno_p
52727
52728 #undef TARGET_PROMOTE_FUNCTION_MODE
52729 #define TARGET_PROMOTE_FUNCTION_MODE ix86_promote_function_mode
52730
52731 #undef TARGET_OVERRIDE_OPTIONS_AFTER_CHANGE
52732 #define TARGET_OVERRIDE_OPTIONS_AFTER_CHANGE ix86_override_options_after_change
52733
52734 #undef TARGET_MEMBER_TYPE_FORCES_BLK
52735 #define TARGET_MEMBER_TYPE_FORCES_BLK ix86_member_type_forces_blk
52736
52737 #undef TARGET_INSTANTIATE_DECLS
52738 #define TARGET_INSTANTIATE_DECLS ix86_instantiate_decls
52739
52740 #undef TARGET_SECONDARY_RELOAD
52741 #define TARGET_SECONDARY_RELOAD ix86_secondary_reload
52742
52743 #undef TARGET_CLASS_MAX_NREGS
52744 #define TARGET_CLASS_MAX_NREGS ix86_class_max_nregs
52745
52746 #undef TARGET_PREFERRED_RELOAD_CLASS
52747 #define TARGET_PREFERRED_RELOAD_CLASS ix86_preferred_reload_class
52748 #undef TARGET_PREFERRED_OUTPUT_RELOAD_CLASS
52749 #define TARGET_PREFERRED_OUTPUT_RELOAD_CLASS ix86_preferred_output_reload_class
52750 #undef TARGET_CLASS_LIKELY_SPILLED_P
52751 #define TARGET_CLASS_LIKELY_SPILLED_P ix86_class_likely_spilled_p
52752
52753 #undef TARGET_VECTORIZE_BUILTIN_VECTORIZATION_COST
52754 #define TARGET_VECTORIZE_BUILTIN_VECTORIZATION_COST \
52755 ix86_builtin_vectorization_cost
52756 #undef TARGET_VECTORIZE_VEC_PERM_CONST_OK
52757 #define TARGET_VECTORIZE_VEC_PERM_CONST_OK \
52758 ix86_vectorize_vec_perm_const_ok
52759 #undef TARGET_VECTORIZE_PREFERRED_SIMD_MODE
52760 #define TARGET_VECTORIZE_PREFERRED_SIMD_MODE \
52761 ix86_preferred_simd_mode
52762 #undef TARGET_VECTORIZE_AUTOVECTORIZE_VECTOR_SIZES
52763 #define TARGET_VECTORIZE_AUTOVECTORIZE_VECTOR_SIZES \
52764 ix86_autovectorize_vector_sizes
52765 #undef TARGET_VECTORIZE_GET_MASK_MODE
52766 #define TARGET_VECTORIZE_GET_MASK_MODE ix86_get_mask_mode
52767 #undef TARGET_VECTORIZE_INIT_COST
52768 #define TARGET_VECTORIZE_INIT_COST ix86_init_cost
52769 #undef TARGET_VECTORIZE_ADD_STMT_COST
52770 #define TARGET_VECTORIZE_ADD_STMT_COST ix86_add_stmt_cost
52771 #undef TARGET_VECTORIZE_FINISH_COST
52772 #define TARGET_VECTORIZE_FINISH_COST ix86_finish_cost
52773 #undef TARGET_VECTORIZE_DESTROY_COST_DATA
52774 #define TARGET_VECTORIZE_DESTROY_COST_DATA ix86_destroy_cost_data
52775
52776 #undef TARGET_SET_CURRENT_FUNCTION
52777 #define TARGET_SET_CURRENT_FUNCTION ix86_set_current_function
52778
52779 #undef TARGET_OPTION_VALID_ATTRIBUTE_P
52780 #define TARGET_OPTION_VALID_ATTRIBUTE_P ix86_valid_target_attribute_p
52781
52782 #undef TARGET_OPTION_SAVE
52783 #define TARGET_OPTION_SAVE ix86_function_specific_save
52784
52785 #undef TARGET_OPTION_RESTORE
52786 #define TARGET_OPTION_RESTORE ix86_function_specific_restore
52787
52788 #undef TARGET_OPTION_POST_STREAM_IN
52789 #define TARGET_OPTION_POST_STREAM_IN ix86_function_specific_post_stream_in
52790
52791 #undef TARGET_OPTION_PRINT
52792 #define TARGET_OPTION_PRINT ix86_function_specific_print
52793
52794 #undef TARGET_OPTION_FUNCTION_VERSIONS
52795 #define TARGET_OPTION_FUNCTION_VERSIONS common_function_versions
52796
52797 #undef TARGET_CAN_INLINE_P
52798 #define TARGET_CAN_INLINE_P ix86_can_inline_p
52799
52800 #undef TARGET_LEGITIMATE_ADDRESS_P
52801 #define TARGET_LEGITIMATE_ADDRESS_P ix86_legitimate_address_p
52802
52803 #undef TARGET_REGISTER_PRIORITY
52804 #define TARGET_REGISTER_PRIORITY ix86_register_priority
52805
52806 #undef TARGET_REGISTER_USAGE_LEVELING_P
52807 #define TARGET_REGISTER_USAGE_LEVELING_P hook_bool_void_true
52808
52809 #undef TARGET_LEGITIMATE_CONSTANT_P
52810 #define TARGET_LEGITIMATE_CONSTANT_P ix86_legitimate_constant_p
52811
52812 #undef TARGET_COMPUTE_FRAME_LAYOUT
52813 #define TARGET_COMPUTE_FRAME_LAYOUT ix86_compute_frame_layout
52814
52815 #undef TARGET_FRAME_POINTER_REQUIRED
52816 #define TARGET_FRAME_POINTER_REQUIRED ix86_frame_pointer_required
52817
52818 #undef TARGET_CAN_ELIMINATE
52819 #define TARGET_CAN_ELIMINATE ix86_can_eliminate
52820
52821 #undef TARGET_EXTRA_LIVE_ON_ENTRY
52822 #define TARGET_EXTRA_LIVE_ON_ENTRY ix86_live_on_entry
52823
52824 #undef TARGET_ASM_CODE_END
52825 #define TARGET_ASM_CODE_END ix86_code_end
52826
52827 #undef TARGET_CONDITIONAL_REGISTER_USAGE
52828 #define TARGET_CONDITIONAL_REGISTER_USAGE ix86_conditional_register_usage
52829
52830 #undef TARGET_LOOP_UNROLL_ADJUST
52831 #define TARGET_LOOP_UNROLL_ADJUST ix86_loop_unroll_adjust
52832
52833 /* Disabled due to PRs 70902, 71453, 71555, 71596 and 71657. */
52834 #undef TARGET_SPILL_CLASS
52835 #define TARGET_SPILL_CLASS ix86_spill_class
52836
52837 #undef TARGET_SIMD_CLONE_COMPUTE_VECSIZE_AND_SIMDLEN
52838 #define TARGET_SIMD_CLONE_COMPUTE_VECSIZE_AND_SIMDLEN \
52839 ix86_simd_clone_compute_vecsize_and_simdlen
52840
52841 #undef TARGET_SIMD_CLONE_ADJUST
52842 #define TARGET_SIMD_CLONE_ADJUST \
52843 ix86_simd_clone_adjust
52844
52845 #undef TARGET_SIMD_CLONE_USABLE
52846 #define TARGET_SIMD_CLONE_USABLE \
52847 ix86_simd_clone_usable
52848
52849 #undef TARGET_FLOAT_EXCEPTIONS_ROUNDING_SUPPORTED_P
52850 #define TARGET_FLOAT_EXCEPTIONS_ROUNDING_SUPPORTED_P \
52851 ix86_float_exceptions_rounding_supported_p
52852
52853 #undef TARGET_MODE_EMIT
52854 #define TARGET_MODE_EMIT ix86_emit_mode_set
52855
52856 #undef TARGET_MODE_NEEDED
52857 #define TARGET_MODE_NEEDED ix86_mode_needed
52858
52859 #undef TARGET_MODE_AFTER
52860 #define TARGET_MODE_AFTER ix86_mode_after
52861
52862 #undef TARGET_MODE_ENTRY
52863 #define TARGET_MODE_ENTRY ix86_mode_entry
52864
52865 #undef TARGET_MODE_EXIT
52866 #define TARGET_MODE_EXIT ix86_mode_exit
52867
52868 #undef TARGET_MODE_PRIORITY
52869 #define TARGET_MODE_PRIORITY ix86_mode_priority
52870
52871 #undef TARGET_CALL_FUSAGE_CONTAINS_NON_CALLEE_CLOBBERS
52872 #define TARGET_CALL_FUSAGE_CONTAINS_NON_CALLEE_CLOBBERS true
52873
52874 #undef TARGET_LOAD_BOUNDS_FOR_ARG
52875 #define TARGET_LOAD_BOUNDS_FOR_ARG ix86_load_bounds
52876
52877 #undef TARGET_STORE_BOUNDS_FOR_ARG
52878 #define TARGET_STORE_BOUNDS_FOR_ARG ix86_store_bounds
52879
52880 #undef TARGET_LOAD_RETURNED_BOUNDS
52881 #define TARGET_LOAD_RETURNED_BOUNDS ix86_load_returned_bounds
52882
52883 #undef TARGET_STORE_RETURNED_BOUNDS
52884 #define TARGET_STORE_RETURNED_BOUNDS ix86_store_returned_bounds
52885
52886 #undef TARGET_CHKP_BOUND_MODE
52887 #define TARGET_CHKP_BOUND_MODE ix86_mpx_bound_mode
52888
52889 #undef TARGET_BUILTIN_CHKP_FUNCTION
52890 #define TARGET_BUILTIN_CHKP_FUNCTION ix86_builtin_mpx_function
52891
52892 #undef TARGET_CHKP_FUNCTION_VALUE_BOUNDS
52893 #define TARGET_CHKP_FUNCTION_VALUE_BOUNDS ix86_function_value_bounds
52894
52895 #undef TARGET_CHKP_MAKE_BOUNDS_CONSTANT
52896 #define TARGET_CHKP_MAKE_BOUNDS_CONSTANT ix86_make_bounds_constant
52897
52898 #undef TARGET_CHKP_INITIALIZE_BOUNDS
52899 #define TARGET_CHKP_INITIALIZE_BOUNDS ix86_initialize_bounds
52900
52901 #undef TARGET_SETUP_INCOMING_VARARG_BOUNDS
52902 #define TARGET_SETUP_INCOMING_VARARG_BOUNDS ix86_setup_incoming_vararg_bounds
52903
52904 #undef TARGET_OFFLOAD_OPTIONS
52905 #define TARGET_OFFLOAD_OPTIONS \
52906 ix86_offload_options
52907
52908 #undef TARGET_ABSOLUTE_BIGGEST_ALIGNMENT
52909 #define TARGET_ABSOLUTE_BIGGEST_ALIGNMENT 512
52910
52911 #undef TARGET_OPTAB_SUPPORTED_P
52912 #define TARGET_OPTAB_SUPPORTED_P ix86_optab_supported_p
52913
52914 #undef TARGET_HARD_REGNO_SCRATCH_OK
52915 #define TARGET_HARD_REGNO_SCRATCH_OK ix86_hard_regno_scratch_ok
52916
52917 #undef TARGET_CUSTOM_FUNCTION_DESCRIPTORS
52918 #define TARGET_CUSTOM_FUNCTION_DESCRIPTORS 1
52919
52920 #undef TARGET_ADDITIONAL_ALLOCNO_CLASS_P
52921 #define TARGET_ADDITIONAL_ALLOCNO_CLASS_P ix86_additional_allocno_class_p
52922
52923 #undef TARGET_ADDR_SPACE_ZERO_ADDRESS_VALID
52924 #define TARGET_ADDR_SPACE_ZERO_ADDRESS_VALID ix86_addr_space_zero_address_valid
52925
52926 #undef TARGET_INIT_LIBFUNCS
52927 #define TARGET_INIT_LIBFUNCS ix86_init_libfuncs
52928
52929 #undef TARGET_EXPAND_DIVMOD_LIBFUNC
52930 #define TARGET_EXPAND_DIVMOD_LIBFUNC ix86_expand_divmod_libfunc
52931
52932 #undef TARGET_MAX_NOCE_IFCVT_SEQ_COST
52933 #define TARGET_MAX_NOCE_IFCVT_SEQ_COST ix86_max_noce_ifcvt_seq_cost
52934
52935 #undef TARGET_NOCE_CONVERSION_PROFITABLE_P
52936 #define TARGET_NOCE_CONVERSION_PROFITABLE_P ix86_noce_conversion_profitable_p
52937
52938 #if CHECKING_P
52939 #undef TARGET_RUN_TARGET_SELFTESTS
52940 #define TARGET_RUN_TARGET_SELFTESTS selftest::ix86_run_selftests
52941 #endif /* #if CHECKING_P */
52942
52943 struct gcc_target targetm = TARGET_INITIALIZER;
52944 \f
52945 #include "gt-i386.h"